From 7d68656ab085bfcb213eebf1b54a3af3d939aa86 Mon Sep 17 00:00:00 2001
From: Tom Stesco
Date: Wed, 4 Dec 2024 02:48:43 +0000
Subject: [PATCH 01/76] add print_prompts cli arg

---
 utils/prompt_generation.py | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/utils/prompt_generation.py b/utils/prompt_generation.py
index ccb8b2cc..f96e0c6d 100644
--- a/utils/prompt_generation.py
+++ b/utils/prompt_generation.py
@@ -286,7 +286,7 @@ def generate_prompts(args):
     # Add 1 to prompt lengths to account for the extra token added by vLLM
     prompt_lengths = [pl + 1 for pl in prompt_lengths]
 
-    print_prompts = (not args.save_path) and (args.num_prompts < 5)
+    print_prompts = (args.num_prompts < 5) and args.print_prompts
     # Save prompts to a JSONL file if a save path is provided
     if args.save_path:
        file_path = Path(args.save_path).resolve()
@@ -350,6 +350,12 @@ def add_prompt_gen_args(parser):
         default=None,
         help="Path to save the generated prompts in JSONL format.",
     )
+    parser.add_argument(
+        "--print_prompts",
+        action="store_true",
+        default=False,
+        help="Print generated prompts if there are fewer than 5.",
+    )
     return parser
 
 
From 8d78d64e62b9604b89552d70820413ea8036b19c Mon Sep 17 00:00:00 2001
From: Tom Stesco
Date: Wed, 4 Dec 2024 02:50:29 +0000
Subject: [PATCH 02/76] remove redundant stop token from vLLM example API calls

---
 utils/prompt_client_cli.py                      | 1 -
 .../src/example_openai_client_alpaca_eval.py    | 1 -
 .../src/example_requests_client_alpaca_eval.py  | 1 -
 3 files changed, 3 deletions(-)

diff --git a/utils/prompt_client_cli.py b/utils/prompt_client_cli.py
index d3707418..f97786e5 100644
--- a/utils/prompt_client_cli.py
+++ b/utils/prompt_client_cli.py
@@ -81,7 +81,6 @@ def call_inference_api(
         "top_p": 0.9,
         "max_tokens": max_tokens,
         "stream": stream,
-        "stop": ["<|eot_id|>"],
     }
     req_time = time.time()
     # using requests stream=True, make sure to set a timeout
diff --git a/vllm-tt-metal-llama3-70b/src/example_openai_client_alpaca_eval.py b/vllm-tt-metal-llama3-70b/src/example_openai_client_alpaca_eval.py
index 032c9343..1a3c781e 100644
--- a/vllm-tt-metal-llama3-70b/src/example_openai_client_alpaca_eval.py
+++ b/vllm-tt-metal-llama3-70b/src/example_openai_client_alpaca_eval.py
@@ -40,7 +40,6 @@ def call_inference_api(prompt, response_idx, stream=True, headers=None, client=N
         temperature=1,
         max_tokens=2048,
         top_p=0.9,
-        stop=["<|eot_id|>"],
         stream=stream,
     )
     if stream:
diff --git a/vllm-tt-metal-llama3-70b/src/example_requests_client_alpaca_eval.py b/vllm-tt-metal-llama3-70b/src/example_requests_client_alpaca_eval.py
index ed761905..ca201900 100644
--- a/vllm-tt-metal-llama3-70b/src/example_requests_client_alpaca_eval.py
+++ b/vllm-tt-metal-llama3-70b/src/example_requests_client_alpaca_eval.py
@@ -108,7 +108,6 @@ def call_inference_api(
         "top_p": 0.9,
         "max_tokens": max_tokens,
         "stream": stream,
-        "stop": ["<|eot_id|>"],
     }
     req_time = time.time()
     # using requests stream=True, make sure to set a timeout

From 3108bc0a713e86f79a7fe887929a75f026034996 Mon Sep 17 00:00:00 2001
From: Tom Stesco
Date: Wed, 4 Dec 2024 02:51:39 +0000
Subject: [PATCH 03/76] add capture_traces.py util to pre-prompt vLLM server to
 capture all trace input sizes

---
 utils/capture_traces.py | 120 ++++++++++++++++++++++++++++++++++++++++
 1 file changed, 120 insertions(+)
 create mode 100644 utils/capture_traces.py

diff --git a/utils/capture_traces.py b/utils/capture_traces.py
new file mode 100644
index 00000000..828c7e1d
--- /dev/null
+++ b/utils/capture_traces.py
@@ -0,0 +1,120 @@
+# SPDX-License-Identifier: Apache-2.0
+#
+# SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC + +import os +import time +import logging +import requests +import argparse +from utils.prompt_generation import generate_prompts +from utils.prompt_client_cli import ( + call_inference_api, + get_api_base_url, + get_authorization, +) + +logging.basicConfig( + level=logging.INFO, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s" +) +logger = logging.getLogger(__name__) +logger.setLevel(logging.INFO) + + +def get_api_health_url(): + DEPLOY_URL = os.getenv("DEPLOY_URL", "http://127.0.0.1") + health_url = f"{DEPLOY_URL}:{os.getenv('SERVICE_PORT', '8000')}/health" + return health_url + + +def check_health(base_url: str, timeout: int = 300, interval: int = 10) -> bool: + """ + Check the health endpoint until the service is ready. + """ + health_url = get_api_health_url() + start_time = time.time() + headers = {"Authorization": f"Bearer {get_authorization()}"} + + while time.time() - start_time < timeout: + try: + response = requests.get(health_url, headers=headers) + if response.status_code == 200: + logger.info("vLLM service is healthy and ready") + return True + except requests.exceptions.RequestException as e: + logger.warning(f"Health check failed: {e}") + + logger.info(f"Service not ready, waiting {interval} seconds...") + time.sleep(interval) + + logger.error(f"Service did not become healthy within {timeout} seconds") + return False + + +def capture_input_sizes(): + """ + Capture different input size graphs with the TT model on vLLM. + get_padded_prefill_len() defines the different input sizes for prefill: + https://github.com/tenstorrent/tt-metal/blob/main/models/demos/t3000/llama2_70b/tt/llama_generation.py#L341 + """ + input_sizes = [sz - 8 for sz in [32, 64, 128, 256, 512, 1024, 2048, 3072, 4096]] + prompts_per_size = 1 + output_seq_len = 1 + + base_url = get_api_base_url() + if not check_health(base_url): + raise RuntimeError("vLLM did not start correctly!") + + api_url = f"{base_url}/completions" + headers = {"Authorization": f"Bearer {get_authorization()}"} + vllm_model = os.environ.get("VLLM_MODEL", "meta-llama/Llama-3.1-70B-Instruct") + + for size in input_sizes: + logger.info(f"Capture input size: {size}") + + args = argparse.Namespace( + tokenizer_model=vllm_model, + dataset="random", + max_prompt_length=size, + input_seq_len=size, + distribution="fixed", + template=None, + save_path=None, + print_prompts=False, + num_prompts=prompts_per_size, + ) + + prompts, prompt_lengths = generate_prompts(args) + + for i, (prompt, prompt_len) in enumerate(zip(prompts, prompt_lengths)): + try: + response_data = call_inference_api( + prompt=prompt, + response_idx=i, + prompt_len=prompt_len, + stream=True, + headers=headers, + api_url=api_url, + max_tokens=output_seq_len, + vll_model=vllm_model, + tokenizer=None, + ) + + logger.info( + f"Input size: {size}, input_seq_len: {prompt_len}, TTFT: {response_data['ttft']:.3f}s" + ) + + except Exception as e: + logger.error(f"Error processing prompt: {e}") + + +def main(): + try: + capture_input_sizes() + except Exception as e: + logger.error(f"Capturing input sizes failed: {e}") + raise + + +if __name__ == "__main__": + main() From ea3d75dbd9caaa5029c7dd4af8fcfce35a6da03f Mon Sep 17 00:00:00 2001 From: Tom Stesco Date: Wed, 4 Dec 2024 03:13:38 +0000 Subject: [PATCH 04/76] adding utils/startup_utils.py to refine handling of startup in automation --- utils/capture_traces.py | 35 ++------------------------- utils/startup_utils.py | 53 +++++++++++++++++++++++++++++++++++++++++ 2 files 
changed, 55 insertions(+), 33 deletions(-) create mode 100644 utils/startup_utils.py diff --git a/utils/capture_traces.py b/utils/capture_traces.py index 828c7e1d..ecc1d95d 100644 --- a/utils/capture_traces.py +++ b/utils/capture_traces.py @@ -3,9 +3,7 @@ # SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC import os -import time import logging -import requests import argparse from utils.prompt_generation import generate_prompts from utils.prompt_client_cli import ( @@ -13,6 +11,7 @@ get_api_base_url, get_authorization, ) +from utils.startup_utils import wait_for_healthy logging.basicConfig( level=logging.INFO, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s" @@ -21,36 +20,6 @@ logger.setLevel(logging.INFO) -def get_api_health_url(): - DEPLOY_URL = os.getenv("DEPLOY_URL", "http://127.0.0.1") - health_url = f"{DEPLOY_URL}:{os.getenv('SERVICE_PORT', '8000')}/health" - return health_url - - -def check_health(base_url: str, timeout: int = 300, interval: int = 10) -> bool: - """ - Check the health endpoint until the service is ready. - """ - health_url = get_api_health_url() - start_time = time.time() - headers = {"Authorization": f"Bearer {get_authorization()}"} - - while time.time() - start_time < timeout: - try: - response = requests.get(health_url, headers=headers) - if response.status_code == 200: - logger.info("vLLM service is healthy and ready") - return True - except requests.exceptions.RequestException as e: - logger.warning(f"Health check failed: {e}") - - logger.info(f"Service not ready, waiting {interval} seconds...") - time.sleep(interval) - - logger.error(f"Service did not become healthy within {timeout} seconds") - return False - - def capture_input_sizes(): """ Capture different input size graphs with the TT model on vLLM. @@ -62,7 +31,7 @@ def capture_input_sizes(): output_seq_len = 1 base_url = get_api_base_url() - if not check_health(base_url): + if not wait_for_healthy(base_url): raise RuntimeError("vLLM did not start correctly!") api_url = f"{base_url}/completions" diff --git a/utils/startup_utils.py b/utils/startup_utils.py new file mode 100644 index 00000000..33ef7f86 --- /dev/null +++ b/utils/startup_utils.py @@ -0,0 +1,53 @@ +# SPDX-License-Identifier: Apache-2.0 +# +# SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC + +import os +import time +import logging +import requests +from utils.prompt_client_cli import ( + get_authorization, +) + +logging.basicConfig( + level=logging.INFO, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s" +) +logger = logging.getLogger(__name__) +logger.setLevel(logging.INFO) + + +def get_api_health_url(): + DEPLOY_URL = os.getenv("DEPLOY_URL", "http://127.0.0.1") + health_url = f"{DEPLOY_URL}:{os.getenv('SERVICE_PORT', '8000')}/health" + return health_url + + +def wait_for_healthy(base_url: str, timeout: int = 300, interval: int = 10) -> bool: + """ + Check the health endpoint until the service is ready. + """ + health_url = get_api_health_url() + start_time = time.time() + headers = {"Authorization": f"Bearer {get_authorization()}"} + total_time_waited = 0 + while time.time() - start_time < timeout: + try: + response = requests.get(health_url, headers=headers, timeout=interval) + if response.status_code == 200: + startup_time = time.time() - start_time + logger.info( + f"vLLM service is healthy. 
startup_time:= {startup_time} seconds"
+                )
+                return True
+        except requests.exceptions.RequestException as e:
+            logger.warning(f"Health check failed: {e}")
+
+        total_time_waited += interval
+        logger.info(
+            f"Service not ready after {total_time_waited} seconds, waiting {interval} seconds before polling ..."
+        )
+        time.sleep(0.05)
+
+    logger.error(f"Service did not become healthy within {timeout} seconds")
+    return False
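With the health-check logic factored into utils/startup_utils.py, automation can block on server readiness before generating prompts or capturing traces. Below is a minimal sketch of that pattern, assuming DEPLOY_URL and SERVICE_PORT point at a running vLLM deployment and using the wait_for_healthy(base_url, timeout, interval) signature introduced in this patch; the driver script itself is hypothetical and not part of the series.

```python
# Hypothetical driver script, shown only to illustrate how
# utils/startup_utils.py is intended to be used from automation.
from utils.prompt_client_cli import get_api_base_url
from utils.startup_utils import wait_for_healthy


def main():
    base_url = get_api_base_url()
    # Poll the /health endpoint for up to 30 minutes before doing any work.
    if not wait_for_healthy(base_url, timeout=1800, interval=10):
        raise RuntimeError("vLLM server did not become healthy in time")
    # ... safe to generate prompts, capture traces, or run benchmarks here ...


if __name__ == "__main__":
    main()
```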

From cc1d17a6dd06ae59cd9934df7c22e54e71a7f138 Mon Sep 17 00:00:00 2001
From: Tom Stesco
Date: Wed, 4 Dec 2024 04:47:07 +0000
Subject: [PATCH 05/76] adding force_max_tokens as option to
 call_inference_api(), add input_seq_lengths and output_seq_lengths directly
 as args to test_api_call_threaded_full_queue() to allow for varied isl and
 osl within batch

---
 utils/prompt_client_cli.py | 62 +++++++++++++++++++++++++-------------
 1 file changed, 41 insertions(+), 21 deletions(-)

diff --git a/utils/prompt_client_cli.py b/utils/prompt_client_cli.py
index f97786e5..58524cfd 100644
--- a/utils/prompt_client_cli.py
+++ b/utils/prompt_client_cli.py
@@ -71,6 +71,7 @@ def call_inference_api(
     max_tokens,
     vll_model,
     tokenizer,
+    force_max_tokens=True,
 ):
     # set API prompt and optional parameters
     json_data = {
@@ -82,6 +83,9 @@ def call_inference_api(
         "max_tokens": max_tokens,
         "stream": stream,
     }
+    if force_max_tokens:
+        # use a reserved special token to avoid the model stopping before osl is reached
+        json_data["stop"] = "<|reserved_special_token_249|>"
     req_time = time.time()
     # using requests stream=True, make sure to set a timeout
     response = requests.post(
@@ -102,11 +106,10 @@ def call_inference_api(
                     if num_completion_tokens == 0:
                         first_token_time = time.time()
                         ttft = first_token_time - req_time
-                    num_completion_tokens += 1
                     data_str = line[len("data: ") :].strip()
                     if data_str == "[DONE]":
-                        num_completion_tokens -= 1
                         break
+                    num_completion_tokens += 1
                     try:
                         # Parse the JSON data
                         data = json.loads(data_str)
@@ -117,10 +120,7 @@ def call_inference_api(
                         print(f"Failed to decode JSON: {e}")
                         continue
         else:
-            # If not chunked, you can access the entire response body at once
-            data = response.json()["usage"]
             raise ValueError("Response is not chunked")
-
     else:
         data = response.json()
         full_text = data["choices"][0]["text"]
@@ -128,10 +128,15 @@ def call_inference_api(
         # conservatively set the first token time to the request time
         first_token_time = req_time
         logger.info(f"usage: {data['usage']}")
-    # TODO: verify the number of tokens
-    # num_completion_tokens = len(tokenizer.encode(full_text, add_special_tokens=False))
-    num_completion_tokens = max(num_completion_tokens, 2)
+    # verify the number of completion tokens
+    checksum_num_completion_tokens = len(
+        tokenizer.encode(full_text, add_special_tokens=False)
+    )
+    token_diff = checksum_num_completion_tokens - num_completion_tokens
+    if token_diff != 0:
+        logger.warning(f"response_idx=:{response_idx}, token_diff =: {token_diff}")
+
     throughput_time = max(time.time() - first_token_time, 0.0001)
     response_data = {
         "response_idx": response_idx,
@@ -139,7 +144,7 @@ def call_inference_api(
         "response": full_text,
         "prompt_length": prompt_len,
         "num_completion_tokens": num_completion_tokens,
-        "tps": (num_completion_tokens - 1) / throughput_time,
+        "tps": (max(num_completion_tokens, 1)) / throughput_time,
         "ttft": ttft,
     }
     with responses_lock:
@@ -198,7 +203,8 @@ def calculate_batch_sizes(num_prompts, max_batch_size, vary_batch_size):
 
 def test_api_call_threaded_full_queue(
     prompts,
-    prompt_lengths,
+    input_seq_lengths,
+    output_seq_lengths,
     batch_size,
     num_full_iterations,
     vary_batch_size,
     inter_batch_delay,
     call_func,
     call_func_kwargs,
 ):
@@ -228,13 +234,16 @@ def test_api_call_threaded_full_queue(
     if batch_size == 1:
         logger.info("Running with single thread")
         for iter_num in range(num_full_iterations):
-            for i, (prompt, prompt_len) in enumerate(zip(prompts, prompt_lengths)):
+            for i, (prompt, isl, osl) in enumerate(
+                zip(prompts, input_seq_lengths, output_seq_lengths)
+            ):
                 handle_delay(inter_batch_delay)
                 response_idx = iter_num * num_prompts + i
                 response_data = call_func(
                     prompt=prompt,
                     response_idx=response_idx,
-                    prompt_len=prompt_len,
+                    prompt_len=isl,
+                    max_tokens=osl,
                     **call_func_kwargs,
                 )
                 # Write the response data to the JSONL file
@@ -264,22 +273,28 @@ def test_api_call_threaded_full_queue(
             for bsz in batch_sizes:
                 batch_end = min(batch_start + bsz, num_prompts)
                 batch_prompts = prompts[batch_start:batch_end]
-                batch_prompt_lengths = prompt_lengths[batch_start:batch_end]
+                batch_input_seq_lengths = input_seq_lengths[batch_start:batch_end]
+                batch_output_seq_lengths = output_seq_lengths[batch_start:batch_end]
                 handle_delay(inter_batch_delay)
                 # Submit all prompts in the current batch
                 logger.info(f"Sending batch requests: {bsz}")
                 with ThreadPoolExecutor(max_workers=bsz) as executor:
                     futures = []
 
                     for i, (prompt, isl, osl) in enumerate(
+                        zip(
+                            batch_prompts,
+                            batch_input_seq_lengths,
+                            batch_output_seq_lengths,
+                        )
                     ):
                         response_idx = iter_num * num_prompts + i
                         future = executor.submit(
                             call_func,
                             prompt=prompt,
                             response_idx=response_idx,
-                            prompt_len=prompt_len,
+                            prompt_len=isl,
+                            max_tokens=osl,
                             **call_func_kwargs,
                         )
                         futures.append(future)
@@ -308,13 +323,16 @@ def test_api_call_threaded_full_queue(
 
             # Submit all prompts across all iterations
             for iter_num in range(num_full_iterations):
-                for i, (prompt, prompt_len) in enumerate(zip(prompts, prompt_lengths)):
+                for i, (prompt, isl, osl) in enumerate(
+                    zip(prompts, input_seq_lengths, output_seq_lengths)
+                ):
                     response_idx = iter_num * num_prompts + i
                     future = executor.submit(
                         call_func,
                         prompt=prompt,
                         response_idx=response_idx,
-                        prompt_len=prompt_len,
+                        prompt_len=isl,
+                        max_tokens=osl,
                         **call_func_kwargs,
                     )
                     futures.append(future)
@@ -348,14 +366,16 @@ def main():
 
     # generate prompts
     tokenizer = AutoTokenizer.from_pretrained(args.tokenizer_model)
-    prompts, prompt_lengths = generate_prompts(args)
+    prompts, input_seq_lengths = generate_prompts(args)
+    output_seq_lengths = [args.output_seq_len] * len(prompts)
 
     headers = {"Authorization": f"Bearer {get_authorization()}"}
     api_url = get_api_url()
     logging.info(f"API_URL: {api_url}")
     test_api_call_threaded_full_queue(
         prompts=prompts,
-        prompt_lengths=prompt_lengths,
+        input_seq_lengths=input_seq_lengths,
+        output_seq_lengths=output_seq_lengths,
         batch_size=args.batch_size,
         num_full_iterations=args.num_full_iterations,
         vary_batch_size=args.vary_batch_size,
@@ -365,9 +385,9 @@ def main():
             "stream": not args.no_stream,
             "headers": headers,
             "api_url": api_url,
-            "max_tokens": args.output_seq_len,
             "vll_model": args.vllm_model,
             "tokenizer": tokenizer,
+            "force_max_tokens": True,
         },
     )
 

From 059d5135f5e0da6b086e60494eaa9faaa9c6d393 Mon Sep 17 00:00:00 2001
From: Tom Stesco
Date: Wed, 4 Dec 2024 21:43:09 +0000
Subject: [PATCH 06/76] faster mock model prefill

---
 tests/mock_vllm_model.py | 63 ++++++++++++++++++++++------------------
 1 file changed, 34 insertions(+), 29 deletions(-)

diff --git a/tests/mock_vllm_model.py b/tests/mock_vllm_model.py
index f3ff4503..ef807a76 100644
--- a/tests/mock_vllm_model.py
+++ 
b/tests/mock_vllm_model.py @@ -269,36 +269,41 @@ def prefill_forward( """ batch, batch_seq_len = tokens.shape - output_logits = torch.zeros(batch, 1, self.params.vocab_size) - prompt_lens = ( - prompt_lens - if prompt_lens is not None - else torch.tensor([batch_seq_len] * batch) - ) - for user_id in range(batch): - seq_len = prompt_lens[user_id] - prefill_seq_len = get_padded_prefill_len(seq_len) - prefill_ids = torch.cat( - [ - tokens[user_id : user_id + 1, :seq_len], - torch.zeros(1, prefill_seq_len - seq_len).long(), - ], - dim=-1, - ) - logger.info(f"Filling kv cache for user {user_id + 1}") - last_token_idx = seq_len - 1 - logits = self.prefill_forward_single_user( - prefill_ids, - start_pos, - user_id, - last_token_idx=last_token_idx, - page_table=page_table, - kv_cache=kv_cache, + # faster prefill that does not mimic the actual prefill process + fast_prefill = True + if fast_prefill: + output_logits = torch.randn((batch, 1, self.params.vocab_size)) + else: + output_logits = torch.zeros(batch, 1, self.params.vocab_size) + prompt_lens = ( + prompt_lens + if prompt_lens is not None + else torch.tensor([batch_seq_len] * batch) ) - # Since we give unpadded_seq_len, only the tile containing the last token is returned - output_logits[user_id] = logits[ - :, last_token_idx % 32 : last_token_idx % 32 + 1, : - ] + for user_id in range(batch): + seq_len = prompt_lens[user_id] + prefill_seq_len = get_padded_prefill_len(seq_len) + prefill_ids = torch.cat( + [ + tokens[user_id : user_id + 1, :seq_len], + torch.zeros(1, prefill_seq_len - seq_len).long(), + ], + dim=-1, + ) + logger.info(f"Filling kv cache for user {user_id + 1}") + last_token_idx = seq_len - 1 + logits = self.prefill_forward_single_user( + prefill_ids, + start_pos, + user_id, + last_token_idx=last_token_idx, + page_table=page_table, + kv_cache=kv_cache, + ) + # Since we give unpadded_seq_len, only the tile containing the last token is returned + output_logits[user_id] = logits[ + :, last_token_idx % 32 : last_token_idx % 32 + 1, : + ] return output_logits From 48d17deb89107977792d19fb3f023151f6bd3efe Mon Sep 17 00:00:00 2001 From: Tom Stesco Date: Thu, 5 Dec 2024 01:48:44 +0000 Subject: [PATCH 07/76] make it not send stop tokens by default and speed up mock model decode and prefill --- tests/mock_vllm_model.py | 63 +++++++++++++++++++++++----------------- 1 file changed, 36 insertions(+), 27 deletions(-) diff --git a/tests/mock_vllm_model.py b/tests/mock_vllm_model.py index ef807a76..8fa6a510 100644 --- a/tests/mock_vllm_model.py +++ b/tests/mock_vllm_model.py @@ -14,12 +14,12 @@ import torch from huggingface_hub import hf_hub_download -from vllm.engine.metrics import logger - # mock out ttnn fully so we can import ttnn without using it sys.modules["ttnn"] = MagicMock() sys.modules["ttnn.device"] = MagicMock() +from vllm.engine.metrics import logger + from models.demos.t3000.llama2_70b.tt.llama_common import ( setup_llama_env, ) @@ -31,6 +31,8 @@ get_model_config, ) +torch.manual_seed(9387) + def setup_mock_model_weights(cache_root: str, weights_dir: str, hf_token: str): if not hf_token: @@ -269,10 +271,11 @@ def prefill_forward( """ batch, batch_seq_len = tokens.shape - # faster prefill that does not mimic the actual prefill process fast_prefill = True if fast_prefill: - output_logits = torch.randn((batch, 1, self.params.vocab_size)) + # faster prefill that does not mimic the actual prefill process + logger.info("Filling kv cache via fast_prefill in mock model") + output_logits = self.decode_forward(tokens=tokens, 
start_pos=start_pos) else: output_logits = torch.zeros(batch, 1, self.params.vocab_size) prompt_lens = ( @@ -304,29 +307,27 @@ def prefill_forward( output_logits[user_id] = logits[ :, last_token_idx % 32 : last_token_idx % 32 + 1, : ] - return output_logits - def decode_mock_send_token(self, logits, start_pos, batch, send_eot=False): + def decode_send_stop_token(self, logits, start_pos, batch): # tooling for sending EOT token or other specific token at specific output position EOT_ID = 128009 send_index = 200 send_token = EOT_ID - if send_eot: - if start_pos is not None: - if isinstance(start_pos, int): - # if start pos is same across batch, ie. now in prefill - cache_idxs = torch.tensor( - [start_pos for _ in range(batch)], dtype=torch.int64 - ) - else: # if start_pos is a tensor ie. is different across batch, now in decode mode - # if start position is greater than index to send EOT - cache_idxs = start_pos.to(dtype=torch.int64) - send_token_mask = cache_idxs > send_index - # find positions where start pos passes send_index (ie. done decoding) + make 1D - batch_indices = torch.nonzero(send_token_mask).squeeze() - # assign a high logit at at the send _token index so model will select it and generate the EOT so that generation stops - logits[batch_indices, 0, send_token] = 100.0 + if start_pos is not None: + if isinstance(start_pos, int): + # if start pos is same across batch, ie. now in prefill + cache_idxs = torch.tensor( + [start_pos for _ in range(batch)], dtype=torch.int64 + ) + else: # if start_pos is a tensor ie. is different across batch, now in decode mode + # if start position is greater than index to send EOT + cache_idxs = start_pos.to(dtype=torch.int64) + send_token_mask = cache_idxs > send_index + # find positions where start pos passes send_index (ie. 
done decoding) + make 1D
+            batch_indices = torch.nonzero(send_token_mask).squeeze()
+            # assign a high logit at the send_token index so the model will select it and generate the EOT so that generation stops
+            logits[batch_indices, 0, send_token] = 100.0
         return logits
 
     def decode_forward(
@@ -342,15 +343,23 @@ def decode_forward(
         assert len(tokens.shape) == 2
         batch, seqlen = tokens.shape
         forward_start = time.time()
-        simulated_tps = 10000.0
+        simulated_tps = 100000.0
         simulated_duration = 1.0 / simulated_tps
-        # update the new tokens generated to the input id
-        # vocab_size = tokenizer.nwords
+        low_value = -100.0
+        high_value = 100.0
+        vocab_size = 128256
+        unreserved_vocab_size = 128000
         # logits: [batch, seqlen, vocab_size]
-        logits = torch.randn((batch, seqlen, 128256))
-        logits = self.decode_mock_send_token(logits, start_pos, batch, send_eot=True)
-        actual_duration = time.time() - forward_start
+        logits = torch.full((batch, seqlen, vocab_size), low_value)
+        # set randomly selected tokens to high value
+        gen_token_ids = torch.randint(0, unreserved_vocab_size, (batch,))
+        logits[:, :, gen_token_ids] = high_value
+        send_eot = False
+        if send_eot:
+            # optionally send EOT token with some logic
+            logits = self.decode_send_stop_token(logits, start_pos, batch)
         # simulate forward latency
+        actual_duration = time.time() - forward_start
         time.sleep(max(simulated_duration - actual_duration, 0))
         return logits
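The mock model in this patch avoids real inference by shaping the logits directly: fill them with a large negative value, then raise only the token ids it wants sampled (or an EOT id once a row should stop). The self-contained sketch below illustrates that trick; it is not taken from the patch itself, and it forces one id per row, whereas the mock broadcasts one set of random ids across the whole batch.

```python
# Illustrative sketch only: force greedy sampling to return chosen token ids by
# shaping logits, the same idea decode_forward/decode_send_stop_token rely on.
import torch

BATCH, VOCAB = 4, 128256
EOT_ID = 128009              # stop token id used by the mock model
LOW, HIGH = -100.0, 100.0

logits = torch.full((BATCH, 1, VOCAB), LOW)
chosen = torch.randint(0, 128000, (BATCH,))      # stay inside the unreserved vocab range
logits[torch.arange(BATCH), 0, chosen] = HIGH    # one forced token per row

# Rows that have decoded past the stop position get an EOT instead.
finished = torch.tensor([False, False, True, True])
logits[finished, 0, :] = LOW
logits[finished, 0, EOT_ID] = HIGH

next_tokens = logits[:, -1, :].argmax(dim=-1)
assert torch.equal(next_tokens[~finished], chosen[~finished])
assert (next_tokens[finished] == EOT_ID).all()
```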
usage_dict["prompt_tokens"] - prompt_len + if isl_diff != 0: + logger.warning( + f"response_idx=:{response_idx}, isl_diff(actual - expected) =: {isl_diff}" + ) + + # verify the number of output tokens + usage_completion_tokens = usage_dict["completion_tokens"] + if num_completion_tokens > 0: + osl_diff = usage_completion_tokens - num_completion_tokens + if osl_diff != 0: + logger.warning( + f"response_idx=:{response_idx}, osl_diff(actual - expected) =: {osl_diff}" + ) + if max_tokens != usage_completion_tokens or max_tokens != num_completion_tokens: + logger.warning( + f"response_idx=:{response_idx}, max_tokens=:{max_tokens}, num_completion_tokens=:{num_completion_tokens}, usage_completion_tokens:={usage_completion_tokens}" + ) throughput_time = max(time.time() - first_token_time, 0.0001) response_data = { From 5a80551a9fd79022fcc9a82d8d417eec20cfd173 Mon Sep 17 00:00:00 2001 From: Tom Stesco Date: Thu, 5 Dec 2024 04:38:47 +0000 Subject: [PATCH 09/76] add max-log-len to limit logging of prompts to avoid clutter in logs --- tests/mock_vllm_api_server.py | 1 + vllm-tt-metal-llama3-70b/src/run_vllm_api_server.py | 1 + 2 files changed, 2 insertions(+) diff --git a/tests/mock_vllm_api_server.py b/tests/mock_vllm_api_server.py index bb357277..063d55b1 100644 --- a/tests/mock_vllm_api_server.py +++ b/tests/mock_vllm_api_server.py @@ -86,6 +86,7 @@ def main(): "num_scheduler_steps": "10", "port": os.getenv("SERVICE_PORT", "7000"), "seed": "4862", + "max-log-len": "32", "download-dir": os.getenv("CACHE_DIR", None), "api-key": get_encoded_api_key(os.getenv("JWT_SECRET", None)), } diff --git a/vllm-tt-metal-llama3-70b/src/run_vllm_api_server.py b/vllm-tt-metal-llama3-70b/src/run_vllm_api_server.py index a5b51126..992874b1 100644 --- a/vllm-tt-metal-llama3-70b/src/run_vllm_api_server.py +++ b/vllm-tt-metal-llama3-70b/src/run_vllm_api_server.py @@ -48,6 +48,7 @@ def main(): "max_model_len": "131072", "max_num_batched_tokens": "131072", "num_scheduler_steps": "10", + "max-log-len": "32", "port": os.getenv("SERVICE_PORT", "7000"), "download-dir": os.getenv("CACHE_DIR", None), "api-key": get_encoded_api_key(os.getenv("JWT_SECRET", None)), From d845f08d21ab505145fad0f71537057e8bc7d344 Mon Sep 17 00:00:00 2001 From: Tom Stesco Date: Thu, 5 Dec 2024 04:40:09 +0000 Subject: [PATCH 10/76] add InferenceServerContext to startup_utils.py, improve wait_for_healthy --- utils/startup_utils.py | 77 +++++++++++++++++++++++++++++++++++++++--- 1 file changed, 72 insertions(+), 5 deletions(-) diff --git a/utils/startup_utils.py b/utils/startup_utils.py index 33ef7f86..0da62715 100644 --- a/utils/startup_utils.py +++ b/utils/startup_utils.py @@ -5,7 +5,12 @@ import os import time import logging +import subprocess +import psutil +import signal + import requests + from utils.prompt_client_cli import ( get_authorization, ) @@ -19,11 +24,11 @@ def get_api_health_url(): DEPLOY_URL = os.getenv("DEPLOY_URL", "http://127.0.0.1") - health_url = f"{DEPLOY_URL}:{os.getenv('SERVICE_PORT', '8000')}/health" + health_url = f"{DEPLOY_URL}:{os.getenv('SERVICE_PORT', '7000')}/health" return health_url -def wait_for_healthy(base_url: str, timeout: int = 300, interval: int = 10) -> bool: +def wait_for_healthy(timeout: int = 300, interval: int = 10) -> bool: """ Check the health endpoint until the service is ready. 
""" @@ -32,6 +37,7 @@ def wait_for_healthy(base_url: str, timeout: int = 300, interval: int = 10) -> b headers = {"Authorization": f"Bearer {get_authorization()}"} total_time_waited = 0 while time.time() - start_time < timeout: + req_time = time.time() try: response = requests.get(health_url, headers=headers, timeout=interval) if response.status_code == 200: @@ -43,11 +49,72 @@ def wait_for_healthy(base_url: str, timeout: int = 300, interval: int = 10) -> b except requests.exceptions.RequestException as e: logger.warning(f"Health check failed: {e}") - total_time_waited += interval + total_time_waited = time.time() - start_time + sleep_interval = max(2 - (time.time() - req_time), 0) logger.info( - f"Service not ready after {total_time_waited} seconds, waiting {interval} seconds before polling ..." + f"Service not ready after {total_time_waited:.2f} seconds, waiting {sleep_interval:.2f} seconds before polling ..." ) - time.sleep(0.05) + time.sleep(sleep_interval) logger.error(f"Service did not become healthy within {timeout} seconds") return False + + +class InferenceServerContext: + def __init__(self, startup_script_path): + self.startup_script_path = startup_script_path + + def __enter__(self): + self.process = subprocess.Popen( + ["python", self.startup_script_path], + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + text=True, + preexec_fn=os.setsid, + ) + return self + + def __exit__(self, exc_type, exc_val, exc_tb): + if not self.process: + return + + # Log initial state + try: + parent = psutil.Process(self.process.pid) + children = parent.children(recursive=True) + logger.info(f"Found {len(children)} child processes before termination") + for child in children: + logger.info(f"Child PID: {child.pid}, Name: {child.name()}") + except psutil.NoSuchProcess: + logger.warning("Main process already terminated") + return + + # Send SIGTERM to process group + try: + os.killpg(self.process.pid, signal.SIGTERM) + logger.info(f"Sent SIGTERM to process group {self.process.pid}") + except ProcessLookupError: + logger.warning("Process group already terminated") + return + + # Wait for graceful shutdown + try: + self.process.wait(timeout=5) + logger.info("Process terminated gracefully") + except subprocess.TimeoutExpired: + logger.warning("Timeout expired, force killing process group") + try: + os.killpg(self.process.pid, signal.SIGKILL) + except ProcessLookupError: + pass + + # Final verification + try: + parent = psutil.Process(self.process.pid) + remaining = parent.children(recursive=True) + if remaining: + logger.error(f"{len(remaining)} child processes still exist") + for proc in remaining: + logger.error(f"Remaining PID: {proc.pid}, Name: {proc.name()}") + except psutil.NoSuchProcess: + logger.info("All inference server processes terminated") From 632ac83af91dd970f6b42759b2c7b4eca4d75ba3 Mon Sep 17 00:00:00 2001 From: Tom Stesco Date: Thu, 5 Dec 2024 22:57:08 +0000 Subject: [PATCH 11/76] add all_responses to utils/prompt_client_cli.py not using globals --- utils/prompt_client_cli.py | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) diff --git a/utils/prompt_client_cli.py b/utils/prompt_client_cli.py index c73372c3..611a96d1 100644 --- a/utils/prompt_client_cli.py +++ b/utils/prompt_client_cli.py @@ -58,7 +58,6 @@ def get_api_url(): # Thread-safe data collection responses_lock = threading.Lock() -responses = [] def call_inference_api( @@ -161,13 +160,13 @@ def call_inference_api( "response_idx": response_idx, "prompt": prompt, "response": full_text, - "prompt_length": 
prompt_len, - "num_completion_tokens": num_completion_tokens, + "input_seq_len": prompt_len, + "output_seq_len": num_completion_tokens, "tps": (max(num_completion_tokens, 1)) / throughput_time, "ttft": ttft, } - with responses_lock: - responses.append(response_data) + # with responses_lock: + # responses.append(response_data) return response_data @@ -250,6 +249,7 @@ def test_api_call_threaded_full_queue( f"Running {total_prompts} prompts in full queue with batch size {batch_size}." ) num_prompts = len(prompts) + all_responses = [] if batch_size == 1: logger.info("Running with single thread") for iter_num in range(num_full_iterations): @@ -267,13 +267,14 @@ def test_api_call_threaded_full_queue( ) # Write the response data to the JSONL file with responses_lock: + all_responses.append(response_data) with open(json_fpath, "a") as f: if response_counter > 0: f.write(",") json.dump(response_data, f, indent=4) response_counter += 1 logger.info( - f"Processed {response_counter}/{total_prompts} responses. Avg. TPS: {response_data['tps']:.2f}, TTFT: {response_data['ttft']:.2f}, Completion Tokens: {response_data['num_completion_tokens']}, Prompt Length: {response_data['prompt_length']}" + f"Processed {response_counter}/{total_prompts} responses. Avg. TPS: {response_data['tps']:.2f}, TTFT: {response_data['ttft']:.2f}, input_seq_len: {response_data['input_seq_len']},output_seq_len: {response_data['output_seq_len']}" ) elif batch_size > 1 and vary_batch_size: logger.info( @@ -322,13 +323,14 @@ def test_api_call_threaded_full_queue( try: response_data = future.result() with responses_lock: + all_responses.append(response_data) with open(json_fpath, "a") as f: if response_counter > 0: f.write(",") json.dump(response_data, f, indent=4) response_counter += 1 logger.info( - f"Processed {response_counter}/{total_prompts} responses. Avg. TPS: {response_data['tps']:.2f}, TTFT: {response_data['ttft']:.2f}, Completion Tokens: {response_data['num_completion_tokens']}, Prompt Length: {response_data['prompt_length']}" + f"Processed {response_counter}/{total_prompts} responses. Avg. TPS: {response_data['tps']:.2f}, TTFT: {response_data['ttft']:.2f}, input_seq_len: {response_data['input_seq_len']},output_seq_len: {response_data['output_seq_len']}" ) except Exception as e: logger.error(f"Error processing response: {e}") @@ -361,13 +363,14 @@ def test_api_call_threaded_full_queue( try: response_data = future.result() with responses_lock: + all_responses.append(response_data) with open(json_fpath, "a") as f: if response_counter > 0: f.write(",") json.dump(response_data, f, indent=4) response_counter += 1 logger.info( - f"Processed {response_counter}/{total_prompts} responses. Avg. TPS: {response_data['tps']:.2f}, TTFT: {response_data['ttft']:.2f}, Completion Tokens: {response_data['num_completion_tokens']}, Prompt Length: {response_data['prompt_length']}" + f"Processed {response_counter}/{total_prompts} responses. Avg. 
TPS: {response_data['tps']:.2f}, TTFT: {response_data['ttft']:.2f}, input_seq_len: {response_data['input_seq_len']}, output_seq_len: {response_data['output_seq_len']}" ) except Exception as e: logger.error(f"Error processing response: {e}") From f563e32b2987ecede98fbd7994358f547bc8735a Mon Sep 17 00:00:00 2001 From: Tom Stesco Date: Thu, 5 Dec 2024 23:06:20 +0000 Subject: [PATCH 12/76] adding new utils/prompt_client_cli.py using utils/prompt_client.py and utils/batch_processor.py with configs in utils/prompt_configs.py and utils/prompt_generation.py for prompt generation --- utils/batch_processor.py | 274 ++++++++++++++++++++ utils/prompt_client.py | 273 ++++++++++++++++++++ utils/prompt_client_cli.py | 507 ++++++++----------------------------- utils/prompt_configs.py | 40 +++ utils/prompt_generation.py | 117 +++------ utils/startup_utils.py | 44 ---- 6 files changed, 728 insertions(+), 527 deletions(-) create mode 100644 utils/batch_processor.py create mode 100644 utils/prompt_client.py create mode 100644 utils/prompt_configs.py diff --git a/utils/batch_processor.py b/utils/batch_processor.py new file mode 100644 index 00000000..c8f9ea90 --- /dev/null +++ b/utils/batch_processor.py @@ -0,0 +1,274 @@ +# SPDX-License-Identifier: Apache-2.0 +# +# SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC + +import threading +import logging +import json +import time +from datetime import datetime +from pathlib import Path +from typing import List +from concurrent.futures import ThreadPoolExecutor, as_completed + +import numpy as np +from transformers import AutoTokenizer + +from prompt_configs import BatchConfig +from prompt_client import PromptClient + +logging.basicConfig( + level=logging.INFO, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s" +) +logger = logging.getLogger(__name__) +logger.setLevel(logging.INFO) + + +class BatchProcessor: + def __init__(self, prompt_client: PromptClient, batch_config: BatchConfig): + self.prompt_client = prompt_client + self.batch_config = batch_config + self.responses_lock = threading.Lock() + + def _calculate_batch_sizes(self, num_prompts: int) -> List[int]: + if self.batch_config.vary_batch_size: + mean_workers = self.batch_config.batch_size / 2 + std_dev = self.batch_config.batch_size / 4 + + batch_sizes = [] + remaining = num_prompts + + while remaining > 0: + size = int( + np.clip( + np.random.normal(mean_workers, std_dev), + 1, + self.batch_config.batch_size, + ) + ) + if size > remaining: + size = remaining + batch_sizes.append(size) + remaining -= size + + return batch_sizes + + return [self.batch_config.batch_size] * ( + num_prompts // self.batch_config.batch_size + ) + + def process_batch( + self, + prompts: List[str], + input_seq_lengths: List[int], + tokenizer: AutoTokenizer, + ) -> List[dict]: + timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S") + json_fpath = ( + Path(self.prompt_client.env_config.cache_root) + / f"alpaca_eval_responses_{timestamp}.json" + ) + + total_prompts = len(prompts) * self.batch_config.num_full_iterations + response_counter = 0 + all_responses = [] + + with open(json_fpath, "a") as f: + f.write("[\n") + + if self.batch_config.batch_size == 1: + all_responses = self._process_single_thread( + prompts, + input_seq_lengths, + tokenizer, + json_fpath, + total_prompts, + response_counter, + ) + else: + all_responses = self._process_multi_thread( + prompts, + input_seq_lengths, + tokenizer, + json_fpath, + total_prompts, + response_counter, + ) + + with open(json_fpath, "a") as f: + f.write("\n]") + + return 
all_responses + + def _process_single_thread( + self, + prompts: List[str], + input_seq_lengths: List[int], + tokenizer: AutoTokenizer, + json_fpath: Path, + total_prompts: int, + response_counter: int, + ) -> List[dict]: + all_responses = [] + + for iter_num in range(self.batch_config.num_full_iterations): + for i, (prompt, isl) in enumerate(zip(prompts, input_seq_lengths)): + if self.batch_config.inter_batch_delay > 0: + time.sleep(self.batch_config.inter_batch_delay) + + response_idx = iter_num * len(prompts) + i + response_data = self.prompt_client.call_inference( + prompt=prompt, + response_idx=response_idx, + prompt_len=isl, + max_tokens=self.batch_config.output_seq_lens[i], + stream=self.batch_config.stream, + vll_model=self.batch_config.vllm_model, + tokenizer=tokenizer, + ) + + self._save_response( + response_data, all_responses, json_fpath, response_counter + ) + response_counter += 1 + self._log_progress(response_counter, total_prompts, response_data) + + return all_responses + + def _process_multi_thread( + self, + prompts: List[str], + input_seq_lengths: List[int], + tokenizer: AutoTokenizer, + json_fpath: Path, + total_prompts: int, + response_counter: int, + ) -> List[dict]: + all_responses = [] + + if self.batch_config.vary_batch_size: + batch_sizes = self._calculate_batch_sizes(len(prompts)) + + for iter_num in range(self.batch_config.num_full_iterations): + batch_start = 0 + + for bsz in batch_sizes: + batch_end = min(batch_start + bsz, len(prompts)) + self._process_batch_chunk( + prompts[batch_start:batch_end], + input_seq_lengths[batch_start:batch_end], + iter_num, + bsz, + tokenizer, + all_responses, + json_fpath, + total_prompts, + response_counter, + ) + batch_start = batch_end + else: + with ThreadPoolExecutor( + max_workers=self.batch_config.batch_size + ) as executor: + futures = [] + + for iter_num in range(self.batch_config.num_full_iterations): + for i, (prompt, isl) in enumerate(zip(prompts, input_seq_lengths)): + response_idx = iter_num * len(prompts) + i + future = executor.submit( + self.prompt_client.call_inference, + prompt=prompt, + response_idx=response_idx, + prompt_len=isl, + max_tokens=self.batch_config.output_seq_lens[i], + stream=self.batch_config.stream, + vll_model=self.batch_config.vllm_model, + tokenizer=tokenizer, + ) + futures.append(future) + + for future in as_completed(futures): + try: + response_data = future.result() + self._save_response( + response_data, all_responses, json_fpath, response_counter + ) + response_counter += 1 + self._log_progress( + response_counter, total_prompts, response_data + ) + except Exception as e: + logger.error(f"Error processing response: {e}") + + return all_responses + + def _process_batch_chunk( + self, + batch_prompts: List[str], + batch_input_seq_lengths: List[int], + iter_num: int, + batch_size: int, + tokenizer: AutoTokenizer, + all_responses: List[dict], + json_fpath: Path, + total_prompts: int, + response_counter: int, + ): + if self.batch_config.inter_batch_delay > 0: + time.sleep(self.batch_config.inter_batch_delay) + + with ThreadPoolExecutor(max_workers=batch_size) as executor: + futures = [] + + for i, (prompt, isl) in enumerate( + zip(batch_prompts, batch_input_seq_lengths) + ): + response_idx = iter_num * len(batch_prompts) + i + future = executor.submit( + self.prompt_client.call_inference, + prompt=prompt, + response_idx=response_idx, + prompt_len=isl, + max_tokens=self.batch_config.output_seq_lens[i], + stream=self.batch_config.stream, + vll_model=self.batch_config.vllm_model, + 
tokenizer=tokenizer, + ) + futures.append(future) + + for future in as_completed(futures): + try: + response_data = future.result() + self._save_response( + response_data, all_responses, json_fpath, response_counter + ) + response_counter += 1 + self._log_progress(response_counter, total_prompts, response_data) + except Exception as e: + logger.error(f"Error processing response: {e}") + + def _save_response( + self, + response_data: dict, + all_responses: List[dict], + json_fpath: Path, + response_counter: int, + ): + with self.responses_lock: + all_responses.append(response_data) + with open(json_fpath, "a") as f: + if response_counter > 0: + f.write(",") + json.dump(response_data, f, indent=4) + + def _log_progress( + self, response_counter: int, total_prompts: int, response_data: dict + ): + logger.info( + f"Processed {response_counter}/{total_prompts} responses. " + f"decode_tps: {response_data['decode_tps']:.2f}, " + f"total_tps: {response_data['total_tps']:.2f}, " + f"ttft: {response_data['ttft']:.2f}, " + f"input_seq_len: {response_data['input_seq_len']}, " + f"output_seq_len: {response_data['output_seq_len']}" + ) diff --git a/utils/prompt_client.py b/utils/prompt_client.py new file mode 100644 index 00000000..00473045 --- /dev/null +++ b/utils/prompt_client.py @@ -0,0 +1,273 @@ +# SPDX-License-Identifier: Apache-2.0 +# +# SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC + +import logging +import json +import time +from typing import List + +import requests +import jwt +from transformers import AutoTokenizer + +from prompt_generation import generate_prompts +from prompt_configs import PromptConfig, EnvironmentConfig + +logging.basicConfig( + level=logging.INFO, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s" +) +logger = logging.getLogger(__name__) +logger.setLevel(logging.INFO) + + +class PromptClient: + def __init__(self, env_config: EnvironmentConfig): + self.env_config = env_config + self.headers = {"Authorization": f"Bearer {self._get_authorization()}"} + self.completions_url = self._get_api_completions_url() + self.health_url = self._get_api_health_url() + self.server_ready = False + + def _get_authorization(self) -> str: + if self.env_config.authorization: + return self.env_config.authorization + + if self.env_config.jwt_secret: + json_payload = json.loads( + '{"team_id": "tenstorrent", "token_id":"debug-test"}' + ) + encoded_jwt = jwt.encode( + json_payload, self.env_config.jwt_secret, algorithm="HS256" + ) + return encoded_jwt + + raise ValueError( + "Neither AUTHORIZATION or JWT_SECRET environment variables are set." + ) + + def _get_api_base_url(self) -> str: + return f"{self.env_config.deploy_url}:{self.env_config.service_port}/v1" + + def _get_api_completions_url(self) -> str: + return f"{self._get_api_base_url()}/completions" + + def _get_api_health_url(self) -> str: + return f"{self._get_api_base_url()}/health" + + def get_health(self) -> requests.Response: + return requests.get(self.health_url, headers=self.headers) + + def wait_for_healthy(self, timeout: int = 300, interval: int = 10) -> bool: + if self.server_ready: + return True + + start_time = time.time() + total_time_waited = 0 + + while time.time() - start_time < timeout: + req_time = time.time() + try: + response = requests.get( + self.health_url, headers=self.headers, timeout=interval + ) + if response.status_code == 200: + startup_time = time.time() - start_time + logger.info( + f"vLLM service is healthy. 
startup_time:= {startup_time} seconds" + ) + self.server_ready = True + return True + + except requests.exceptions.RequestException as e: + logger.warning(f"Health check failed: {e}") + + total_time_waited = time.time() - start_time + sleep_interval = max(2 - (time.time() - req_time), 0) + logger.info( + f"Service not ready after {total_time_waited:.2f} seconds, " + f"waiting {sleep_interval:.2f} seconds before polling ..." + ) + time.sleep(sleep_interval) + + logger.error(f"Service did not become healthy within {timeout} seconds") + return False + + def capture_traces( + self, + input_sizes: List[int] = None, + prompts_per_size: int = 1, + output_seq_len: int = 1, + ) -> None: + logger.info("Capturing input sizes ...") + + # Default input sizes based on get_padded_prefill_len() + if input_sizes is None: + input_sizes = [32, 64, 128, 256, 512, 1024, 2048, 3072, 4096] + + # Check service health before starting + if not self.wait_for_healthy(): + raise RuntimeError("vLLM did not start correctly!") + + for size in input_sizes: + logger.info(f"Capture input size: {size}") + + # Create prompt config for current size + prompt_config = PromptConfig( + input_seq_len=size, + max_prompt_length=size, + num_prompts=prompts_per_size, + distribution="fixed", + dataset="random", + tokenizer_model=self.env_config.vllm_model, + template=None, + save_path=None, + print_prompts=False, + ) + + # Generate prompts for current size + prompts, prompt_lengths = generate_prompts(prompt_config) + + # Process each prompt + for i, (prompt, prompt_len) in enumerate(zip(prompts, prompt_lengths)): + try: + logger.info(f"Starting capture for input_seq_len: {prompt_len}") + response_data = self.call_inference( + prompt=prompt, + response_idx=i, + prompt_len=prompt_len, + max_tokens=output_seq_len, + stream=True, + vll_model=self.env_config.vllm_model, + tokenizer=None, + force_max_tokens=True, + ) + logger.info( + f"Input size: {size}, " + f"input_seq_len: {prompt_len}, " + f"TTFT: {response_data['ttft']:.3f}s" + ) + except Exception as e: + logger.error(f"Error processing prompt: {e}") + + def call_inference( + self, + prompt: str, + response_idx: int, + prompt_len: int, + max_tokens: int, + stream: bool, + vll_model: str, + tokenizer: AutoTokenizer, + force_max_tokens: bool = True, + include_usage: bool = True, + ) -> dict: + json_data = { + "model": vll_model, + "prompt": prompt, + "temperature": 1, + "top_k": 20, + "top_p": 0.9, + "max_tokens": max_tokens, + "stream": stream, + "stream_options": {"include_usage": include_usage}, + } + + if force_max_tokens: + json_data["stop"] = "<|reserved_special_token_249|>" + + req_time = time.perf_counter() + response = requests.post( + self.completions_url, + json=json_data, + headers=self.headers, + stream=stream, + timeout=600, + ) + + return self._process_response( + response, req_time, response_idx, prompt, prompt_len, max_tokens, stream + ) + + def _process_response( + self, + response: requests.Response, + req_time: float, + response_idx: int, + prompt: str, + prompt_len: int, + max_tokens: int, + stream: bool, + ) -> dict: + full_text = "" + num_completion_tokens = 0 + first_token_time = 0 + ttft = 0 + usage_dict = {} + + if stream: + assert ( + response.headers.get("transfer-encoding") == "chunked" + ), "Response is not chunked" + for line in response.iter_lines(decode_unicode=True): + if line and line.startswith("data: "): + if num_completion_tokens == 0: + first_token_time = time.perf_counter() + ttft = first_token_time - req_time + + data_str = line[len("data: ") 
:].strip() + if data_str == "[DONE]": + break + + try: + data = json.loads(data_str) + if data["choices"]: + full_text += data["choices"][0].get("text", "") + num_completion_tokens += 1 + else: + usage_dict = data.get("usage", {}) + except json.JSONDecodeError as e: + logger.error(f"Failed to decode JSON: {e}") + continue + else: + data = response.json() + full_text = data["choices"][0]["text"] + usage_dict = data["usage"] + first_token_time = req_time + + decode_time = max(time.perf_counter() - first_token_time, 0.0001) + total_time = max(time.perf_counter() - req_time, 0.0001) + + # verify the number of input tokens + isl_diff = usage_dict["prompt_tokens"] - prompt_len + if isl_diff != 0: + logger.warning( + f"response_idx=:{response_idx}, isl_diff(actual - expected) =: {isl_diff}" + ) + + # verify the number of output tokens + usage_completion_tokens = usage_dict["completion_tokens"] + if num_completion_tokens > 0: + osl_diff = usage_completion_tokens - num_completion_tokens + if osl_diff != 0: + logger.warning( + f"response_idx=:{response_idx}, osl_diff(actual - expected) =: {osl_diff}" + ) + if ( + max_tokens != usage_completion_tokens + or max_tokens != num_completion_tokens + ): + logger.warning( + f"response_idx=:{response_idx}, max_tokens=:{max_tokens}, num_completion_tokens=:{num_completion_tokens}, usage_completion_tokens:={usage_completion_tokens}" + ) + + return { + "response_idx": response_idx, + "prompt": prompt, + "response": full_text, + "input_seq_len": prompt_len, + "output_seq_len": num_completion_tokens, + "decode_tps": (max(num_completion_tokens, 1)) / decode_time, + "total_tps": (max(num_completion_tokens, 1)) / total_time, + "ttft": ttft, + } diff --git a/utils/prompt_client_cli.py b/utils/prompt_client_cli.py index 611a96d1..c671ca33 100644 --- a/utils/prompt_client_cli.py +++ b/utils/prompt_client_cli.py @@ -3,22 +3,16 @@ # SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC import os -import getpass -import threading import logging -import json import argparse -import time -from datetime import datetime -import requests -from pathlib import Path -from concurrent.futures import ThreadPoolExecutor, as_completed -import jwt import numpy as np from transformers import AutoTokenizer -from utils.prompt_generation import add_prompt_gen_args, generate_prompts +from prompt_configs import PromptConfig, BatchConfig, EnvironmentConfig +from prompt_client import PromptClient +from batch_processor import BatchProcessor +from prompt_generation import generate_prompts logging.basicConfig( level=logging.INFO, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s" @@ -26,393 +20,6 @@ logger = logging.getLogger(__name__) logger.setLevel(logging.INFO) -# set numpy seed for reproducibility -np.random.seed(42) - - -def get_authorization(): - authorization = os.getenv("AUTHORIZATION", None) - if authorization is None: - jwt_secret = os.getenv("JWT_SECRET", None) - if jwt_secret is None: - raise ValueError( - "Neither AUTHORIZATION or JWT_SECRET environment variables are set." 
- ) - json_payload = json.loads('{"team_id": "tenstorrent", "token_id":"debug-test"}') - encoded_jwt = jwt.encode(json_payload, jwt_secret, algorithm="HS256") - authorization = f"{encoded_jwt}" - return authorization - - -def get_api_base_url(): - DEPLOY_URL = os.getenv("DEPLOY_URL", "http://127.0.0.1") - base_url = f"{DEPLOY_URL}:{os.getenv('SERVICE_PORT', '8000')}/v1" - return base_url - - -def get_api_url(): - base_url = get_api_base_url() - api_url = f"{base_url}/completions" - return api_url - - -# Thread-safe data collection -responses_lock = threading.Lock() - - -def call_inference_api( - prompt, - response_idx, - prompt_len, - stream, - headers, - api_url, - max_tokens, - vll_model, - tokenizer, - force_max_tokens=True, -): - # set API prompt and optional parameters - json_data = { - "model": vll_model, - "prompt": prompt, - "temperature": 1, - "top_k": 20, - "top_p": 0.9, - "max_tokens": max_tokens, - "stream": stream, - "stream_options": {"include_usage": True}, - } - if force_max_tokens: - # use a reserved special token avoid the model to stopping before osl reached - json_data["stop"] = "<|reserved_special_token_249|>" - req_time = time.time() - # using requests stream=True, make sure to set a timeout - response = requests.post( - api_url, json=json_data, headers=headers, stream=stream, timeout=600 - ) - # Handle chunked response - full_text = "" - num_completion_tokens = 0 - first_token_time = 0 - ttft = 0 - if stream: - if response.headers.get("transfer-encoding") == "chunked": - for line in response.iter_lines(decode_unicode=True): - # Process each line of data as it's received - if line: - # Remove the 'data: ' prefix - if line.startswith("data: "): - if num_completion_tokens == 0: - first_token_time = time.time() - ttft = first_token_time - req_time - data_str = line[len("data: ") :].strip() - if data_str == "[DONE]": - break - try: - # Parse the JSON data - data = json.loads(data_str) - # Extract text from the 'choices' field - if data["choices"]: - num_completion_tokens += 1 - content = data["choices"][0].get("text", "") - full_text += content - else: - # final response has complete usage - usage_dict = data.get("usage", {}) - - except json.JSONDecodeError as e: - print(f"Failed to decode JSON: {e}") - continue - else: - raise ValueError("Response is not chunked") - else: - data = response.json() - full_text = data["choices"][0]["text"] - usage_dict = data["usage"] - usage_completion_tokens = usage_dict["completion_tokens"] - # conservatively set the first token time to the request time - first_token_time = req_time - logger.info(f"usage: {data['usage']}") - - # verify the number of input tokens - isl_diff = usage_dict["prompt_tokens"] - prompt_len - if isl_diff != 0: - logger.warning( - f"response_idx=:{response_idx}, isl_diff(actual - expected) =: {isl_diff}" - ) - - # verify the number of output tokens - usage_completion_tokens = usage_dict["completion_tokens"] - if num_completion_tokens > 0: - osl_diff = usage_completion_tokens - num_completion_tokens - if osl_diff != 0: - logger.warning( - f"response_idx=:{response_idx}, osl_diff(actual - expected) =: {osl_diff}" - ) - if max_tokens != usage_completion_tokens or max_tokens != num_completion_tokens: - logger.warning( - f"response_idx=:{response_idx}, max_tokens=:{max_tokens}, num_completion_tokens=:{num_completion_tokens}, usage_completion_tokens:={usage_completion_tokens}" - ) - - throughput_time = max(time.time() - first_token_time, 0.0001) - response_data = { - "response_idx": response_idx, - "prompt": prompt, - 
"response": full_text, - "input_seq_len": prompt_len, - "output_seq_len": num_completion_tokens, - "tps": (max(num_completion_tokens, 1)) / throughput_time, - "ttft": ttft, - } - # with responses_lock: - # responses.append(response_data) - return response_data - - -def check_json_fpath(json_fpath): - directory = os.path.dirname(json_fpath) - user = getpass.getuser() - if os.access(directory, os.W_OK): - try: - with open(json_fpath, "w") as f: - f.write("") # Attempt to write an empty string to the file - logger.info(f"The file '{json_fpath}' can be created and is writable.") - return True, "" - except IOError as err: - err_msg = f"Cannot write to the file '{json_fpath}'. Reason: {err}" - else: - err_msg = ( - f"User:={user} cannot write to file:={json_fpath} in directory:={directory}" - ) - logger.error(err_msg) - return False, err_msg - - -def handle_delay(delay): - if delay > 0: - logger.info(f"Sleeping for {delay} seconds...") - time.sleep(delay) - - -def calculate_batch_sizes(num_prompts, max_batch_size, vary_batch_size): - """Calculate normally distributed batch sizes that sum to total_items""" - if vary_batch_size: - mean_workers = max_batch_size / 2 - std_dev = max_batch_size / 4 - - batch_sizes = [] - remaining = num_prompts - - while remaining > 0: - size = int( - np.clip(np.random.normal(mean_workers, std_dev), 1, max_batch_size) - ) - if size > remaining: - size = remaining - batch_sizes.append(size) - remaining -= size - - else: - batch_sizes = [max_batch_size] * (num_prompts // max_batch_size) - - return batch_sizes - - -def test_api_call_threaded_full_queue( - prompts, - input_seq_lengths, - output_seq_lengths, - batch_size, - num_full_iterations, - vary_batch_size, - inter_batch_delay, - call_func, - call_func_kwargs, -): - timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S") - cache_root = Path(os.getenv("CACHE_ROOT", ".")) - json_fpath = cache_root / f"alpaca_eval_responses_{timestamp}.json" - logger.info(f"Will write output to: {json_fpath}") - can_write, err_msg = check_json_fpath(json_fpath) - if not can_write: - err_msg += ( - f"\nNote: CACHE_ROOT:={cache_root}, consider setting in this shell to $PWD" - ) - assert can_write, err_msg - with open(json_fpath, "a") as f: - f.write("[\n") - - total_prompts = len(prompts) * num_full_iterations - response_counter = 0 - logger.info( - f"Running {total_prompts} prompts in full queue with batch size {batch_size}." - ) - num_prompts = len(prompts) - all_responses = [] - if batch_size == 1: - logger.info("Running with single thread") - for iter_num in range(num_full_iterations): - for i, (prompt, isl, osl) in enumerate( - zip(prompts, input_seq_lengths, output_seq_lengths) - ): - handle_delay(inter_batch_delay) - response_idx = iter_num * num_prompts + i - response_data = call_func( - prompt=prompt, - response_idx=response_idx, - prompt_len=isl, - max_tokens=osl, - **call_func_kwargs, - ) - # Write the response data to the JSONL file - with responses_lock: - all_responses.append(response_data) - with open(json_fpath, "a") as f: - if response_counter > 0: - f.write(",") - json.dump(response_data, f, indent=4) - response_counter += 1 - logger.info( - f"Processed {response_counter}/{total_prompts} responses. Avg. 
TPS: {response_data['tps']:.2f}, TTFT: {response_data['ttft']:.2f}, input_seq_len: {response_data['input_seq_len']},output_seq_len: {response_data['output_seq_len']}" - ) - elif batch_size > 1 and vary_batch_size: - logger.info( - f"Running with ThreadPoolExecutor: batch_size={batch_size}, vary_batch_size={vary_batch_size}" - ) - batch_sizes = calculate_batch_sizes( - num_prompts=num_prompts, - max_batch_size=batch_size, - vary_batch_size=True, - ) - - # Process prompts in batches with varying sizes - for iter_num in range(num_full_iterations): - batch_start = 0 - - for bsz in batch_sizes: - batch_end = min(batch_start + bsz, num_prompts) - batch_prompts = prompts[batch_start:batch_end] - batch_input_seq_lengths = input_seq_lengths[batch_start:batch_end] - batch_output_seq_lengths = output_seq_lengths[batch_start:batch_end] - handle_delay(inter_batch_delay) - # Submit all prompts in the current batch - logger.info(f"Sending batch requests: {bsz}") - with ThreadPoolExecutor(max_workers=bsz) as executor: - futures = [] - - for i, (prompt, isl, osl) in enumerate( - zip( - batch_prompts, - batch_input_seq_lengths, - batch_output_seq_lengths, - ) - ): - response_idx = iter_num * num_prompts + i - future = executor.submit( - call_func, - prompt=prompt, - response_idx=response_idx, - prompt_len=isl, - max_tokens=osl, - **call_func_kwargs, - ) - futures.append(future) - # Wait for all futures in this batch to complete - for future in as_completed(futures): - try: - response_data = future.result() - with responses_lock: - all_responses.append(response_data) - with open(json_fpath, "a") as f: - if response_counter > 0: - f.write(",") - json.dump(response_data, f, indent=4) - response_counter += 1 - logger.info( - f"Processed {response_counter}/{total_prompts} responses. Avg. TPS: {response_data['tps']:.2f}, TTFT: {response_data['ttft']:.2f}, input_seq_len: {response_data['input_seq_len']},output_seq_len: {response_data['output_seq_len']}" - ) - except Exception as e: - logger.error(f"Error processing response: {e}") - elif batch_size > 1 and not vary_batch_size: - logger.info( - f"Running with ThreadPoolExecutor: batch_size={batch_size}, vary_batch_size={vary_batch_size}" - ) - # Process all prompts concurrently up to batch_size limit - with ThreadPoolExecutor(max_workers=batch_size) as executor: - futures = [] - - # Submit all prompts across all iterations - for iter_num in range(num_full_iterations): - for i, (prompt, isl, osl) in enumerate( - zip(prompts, input_seq_lengths, output_seq_lengths) - ): - response_idx = iter_num * num_prompts + i - future = executor.submit( - call_func, - prompt=prompt, - response_idx=response_idx, - prompt_len=isl, - max_tokens=osl, - **call_func_kwargs, - ) - futures.append(future) - - # Process completed futures as they finish - for future in as_completed(futures): - try: - response_data = future.result() - with responses_lock: - all_responses.append(response_data) - with open(json_fpath, "a") as f: - if response_counter > 0: - f.write(",") - json.dump(response_data, f, indent=4) - response_counter += 1 - logger.info( - f"Processed {response_counter}/{total_prompts} responses. Avg. 
TPS: {response_data['tps']:.2f}, TTFT: {response_data['ttft']:.2f}, input_seq_len: {response_data['input_seq_len']}, output_seq_len: {response_data['output_seq_len']}" - ) - except Exception as e: - logger.error(f"Error processing response: {e}") - - logger.info(f"Finished all requests, total responses: {response_counter}") - with open(json_fpath, "a") as f: - f.write("\n]") - - -def main(): - parser = argparse.ArgumentParser(description="Run Alpaca Evaluation Inference.") - parser = add_client_args(parser) - parser = add_prompt_gen_args(parser) - args = parser.parse_args() - - # generate prompts - tokenizer = AutoTokenizer.from_pretrained(args.tokenizer_model) - prompts, input_seq_lengths = generate_prompts(args) - output_seq_lengths = [args.output_seq_len] * len(prompts) - - headers = {"Authorization": f"Bearer {get_authorization()}"} - api_url = get_api_url() - logging.info(f"API_URL: {api_url}") - test_api_call_threaded_full_queue( - prompts=prompts, - input_seq_lengths=input_seq_lengths, - output_seq_lengths=output_seq_lengths, - batch_size=args.batch_size, - num_full_iterations=args.num_full_iterations, - vary_batch_size=args.vary_batch_size, - inter_batch_delay=args.inter_batch_delay, - call_func=call_inference_api, - call_func_kwargs={ - "stream": not args.no_stream, - "headers": headers, - "api_url": api_url, - "vll_model": args.vllm_model, - "tokenizer": tokenizer, - "force_max_tokens": True, - }, - ) - def add_client_args(parser): parser.add_argument( @@ -445,7 +52,7 @@ def add_client_args(parser): "--input_seq_len", type=int, default=-1, - help="Length parameter of the input sequence when using random prompts (not given dataset).", + help="Length parameter of the input sequence when using random prompts.", ) parser.add_argument( "--output_seq_len", @@ -464,8 +71,112 @@ def add_client_args(parser): action="store_true", help="Randomize normally the batch size for each batch of prompts.", ) + parser.add_argument( + "--max_prompt_length", + type=int, + required=True, + help="Maximum length of generated prompts.", + ) + parser.add_argument( + "--distribution", + type=str, + default="fixed", + choices=["fixed", "uniform", "normal"], + help="Distribution method for selecting random prompt lengths.", + ) + parser.add_argument( + "--dataset", + type=str, + default="random", + help="The name of the dataset to generate prompts from, or 'random' for random generation.", + ) + parser.add_argument( + "--tokenizer_model", + type=str, + default=None, + help="The model tokenizer to use for vocabulary, truncation, and templating.", + ) + parser.add_argument( + "--template", + type=str, + default=None, + help="Provided jinja2 template to apply to the generated prompts.", + ) + parser.add_argument( + "--save_path", + type=str, + default=None, + help="Path to save the generated prompts in JSONL format.", + ) + parser.add_argument( + "--print_prompts", + action="store_true", + default=False, + help="Print generated prompts.", + ) return parser +def main(): + # set numpy seed for reproducibility + np.random.seed(42) + + parser = argparse.ArgumentParser() + parser = add_client_args(parser) + args = parser.parse_args() + + # Create configs from arguments + prompt_config = PromptConfig( + input_seq_len=args.input_seq_len, + max_prompt_length=args.max_prompt_length, + num_prompts=args.num_prompts, + distribution=args.distribution, + dataset=args.dataset, + tokenizer_model=args.tokenizer_model or args.vllm_model, + template=args.template, + save_path=args.save_path, + print_prompts=args.print_prompts, + ) 
+ + output_seq_lens = [args.output_seq_len] * args.num_prompts + + batch_config = BatchConfig( + batch_size=args.batch_size, + output_seq_lens=output_seq_lens, + num_full_iterations=args.num_full_iterations, + vary_batch_size=args.vary_batch_size, + inter_batch_delay=args.inter_batch_delay, + vllm_model=args.vllm_model, + stream=not args.no_stream, + ) + + env_config = EnvironmentConfig() + + # Initialize components + tokenizer = AutoTokenizer.from_pretrained(prompt_config.tokenizer_model) + prompt_client = PromptClient(env_config) + batch_processor = BatchProcessor(prompt_client, batch_config) + + # Generate prompts + prompts, input_seq_lengths = generate_prompts(prompt_config) + + # Process batches + logger.info(f"Starting batch processing with batch_size={batch_config.batch_size}") + responses = batch_processor.process_batch( + prompts=prompts, input_seq_lengths=input_seq_lengths, tokenizer=tokenizer + ) + + logger.info(f"Completed processing {len(responses)} responses") + + # Calculate and log summary statistics + if responses: + mean_decode_tps = np.mean([r["decode_tps"] for r in responses]) + mean_total_tps = np.mean([r["total_tps"] for r in responses]) + mean_ttft = np.mean([r["ttft"] for r in responses]) + logger.info(f"Mean Decode TPS: {mean_decode_tps:.2f}") + logger.info(f"Mean Total TPS: {mean_total_tps:.2f}") + logger.info(f"Mean TTFT: {mean_ttft:.2f}") + + if __name__ == "__main__": main() diff --git a/utils/prompt_configs.py b/utils/prompt_configs.py new file mode 100644 index 00000000..04d0fd67 --- /dev/null +++ b/utils/prompt_configs.py @@ -0,0 +1,40 @@ +# SPDX-License-Identifier: Apache-2.0 +# +# SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC + +from dataclasses import dataclass +from typing import List, Optional +import os + + +@dataclass +class PromptConfig: + input_seq_len: int + max_prompt_length: int + num_prompts: int + distribution: str = "fixed" + dataset: str = "random" + tokenizer_model: str = "meta-llama/Llama-3.1-70B-Instruct" + template: Optional[str] = None + save_path: Optional[str] = None + print_prompts: bool = False + + +@dataclass +class BatchConfig: + batch_size: int + output_seq_lens: List[int] + num_full_iterations: int = 1 + vary_batch_size: bool = False + inter_batch_delay: int = 0 + vllm_model: str = "meta-llama/Llama-3.1-70B-Instruct" + stream: bool = True + + +@dataclass +class EnvironmentConfig: + authorization: Optional[str] = os.environ.get("AUTHORIZATION") + jwt_secret: Optional[str] = os.environ.get("JWT_SECRET") + deploy_url: str = os.environ.get("DEPLOY_URL", "http://127.0.0.1") + service_port: str = os.environ.get("SERVICE_PORT", "8000") + cache_root: str = os.environ.get("CACHE_ROOT", ".") diff --git a/utils/prompt_generation.py b/utils/prompt_generation.py index f96e0c6d..07269531 100644 --- a/utils/prompt_generation.py +++ b/utils/prompt_generation.py @@ -5,7 +5,6 @@ import os from pathlib import Path import logging -import argparse import json from datetime import date @@ -14,6 +13,8 @@ from datasets import load_dataset from transformers import AutoTokenizer +from prompt_configs import PromptConfig + logging.basicConfig( level=logging.INFO, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s" @@ -247,49 +248,56 @@ def process_prompts(prompts, max_length, template, tokenizer_model): # Main function to handle prompt generation and templating -def generate_prompts(args): - logging.info(f"generate_prompts args={args}") +def generate_prompts(prompt_config: PromptConfig): + logging.info(f"generate_prompts args={prompt_config}") 
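Beyond the CLI entry point above, the split into `PromptConfig`, `BatchConfig`, `EnvironmentConfig`, `PromptClient`, and `BatchProcessor` makes it possible to drive a run from a few lines of Python. A hedged sketch under the `utils.`-prefixed import paths the series settles on later; the parameter values are arbitrary examples:

```
from transformers import AutoTokenizer

from utils.prompt_configs import PromptConfig, BatchConfig, EnvironmentConfig
from utils.prompt_client import PromptClient
from utils.batch_processor import BatchProcessor
from utils.prompt_generation import generate_prompts

prompt_config = PromptConfig(input_seq_len=128, max_prompt_length=128, num_prompts=4)
prompts, input_seq_lengths = generate_prompts(prompt_config)

batch_config = BatchConfig(batch_size=4, output_seq_lens=[128] * len(prompts))
client = PromptClient(EnvironmentConfig())
tokenizer = AutoTokenizer.from_pretrained(prompt_config.tokenizer_model)

responses = BatchProcessor(client, batch_config).process_batch(
    prompts=prompts, input_seq_lengths=input_seq_lengths, tokenizer=tokenizer
)
```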
# vLLM appears to add extra token on receipt of prompt # TODO: verify if this is bos token or something else - args.max_prompt_length = args.max_prompt_length - 1 - if args.input_seq_len == -1: - args.input_seq_len = args.max_prompt_length + prompt_config.max_prompt_length = prompt_config.max_prompt_length - 1 + if prompt_config.input_seq_len == -1: + prompt_config.input_seq_len = prompt_config.max_prompt_length else: - args.input_seq_len = args.input_seq_len - 1 + prompt_config.input_seq_len = prompt_config.input_seq_len - 1 - if args.dataset.lower() == "random": + if prompt_config.dataset.lower() == "random": # default case logger.info("Generating random prompts...") # -1 is for the extra token added by vLLM - assert args.input_seq_len > -1, "input_seq_len must be set for random prompts." - assert args.max_prompt_length > -1, "max_length must be set for random prompts." + assert ( + prompt_config.input_seq_len > -1 + ), "input_seq_len must be set for random prompts." + assert ( + prompt_config.max_prompt_length > -1 + ), "max_length must be set for random prompts." prompts = generate_random_prompts( - args.num_prompts, - args.max_prompt_length, - args.input_seq_len, - args.distribution, - args.tokenizer_model, + prompt_config.num_prompts, + prompt_config.max_prompt_length, + prompt_config.input_seq_len, + prompt_config.distribution, + prompt_config.tokenizer_model, ) - elif args.dataset is not None: + elif prompt_config.dataset is not None: assert ( - args.max_prompt_length > -1 + prompt_config.max_prompt_length > -1 ), "max_length must be set for datasets prompts." - logger.info(f"Generating prompts from the '{args.dataset}' dataset...") - if args.dataset == "alpaca_eval": - prompts = load_alpaca_eval_dataset_samples(args.num_prompts) + logger.info(f"Generating prompts from the '{prompt_config.dataset}' dataset...") + if prompt_config.dataset == "alpaca_eval": + prompts = load_alpaca_eval_dataset_samples(prompt_config.num_prompts) else: raise ValueError("Dataset must be provided.") prompts, prompt_lengths = process_prompts( - prompts, args.max_prompt_length, args.template, args.tokenizer_model + prompts, + prompt_config.max_prompt_length, + prompt_config.template, + prompt_config.tokenizer_model, ) # Add 1 to prompt lengths to account for the extra token added by vLLM prompt_lengths = [pl + 1 for pl in prompt_lengths] - print_prompts = (args.num_prompts < 5) and args.print_prompts + print_prompts = (prompt_config.num_prompts < 5) and prompt_config.print_prompts # Save prompts to a JSONL file if a save path is provided - if args.save_path: - file_path = Path(args.save_path).resolve() + if prompt_config.save_path: + file_path = Path(prompt_config.save_path).resolve() try: with open(file_path, "w") as f: for prompt in prompts: @@ -306,64 +314,3 @@ def generate_prompts(args): print(f"prompt {idx}:\n{prompt}") return prompts, prompt_lengths - - -def add_prompt_gen_args(parser): - parser.add_argument( - "--tokenizer_model", - type=str, - default=None, - help="The model tokenizer to use for vocabulary, truncation, and templating.", - ) - parser.add_argument( - "--dataset", - type=str, - default="random", - help="The name of the dataset to generate prompts from, or 'random' for random token generation.", - ) - parser.add_argument( - "--max_prompt_length", - type=int, - required=True, - help="Maximum length of generated prompts.", - ) - parser.add_argument( - "--distribution", - type=str, - default="fixed", - choices=[ - "fixed", - "uniform", - "normal", - ], - help="Distribution method for 
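The length bookkeeping above is easy to misread: requested lengths are trimmed by one before generation to leave room for the extra token vLLM prepends, and reported lengths add that one back. A tiny worked illustration (the numbers are arbitrary):

```
# Illustration of the extra-token accounting in generate_prompts() above:
requested_isl = 128                  # length the caller asks for
generated_len = requested_isl - 1    # prompts are built one token short ...
reported_isl = generated_len + 1     # ... and reported including the token vLLM adds
assert reported_isl == requested_isl
```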
selecting random prompt lengths ('fixed', 'uniform', 'normal').", - ) - parser.add_argument( - "--template", - type=str, - default=None, - help="Provided jinja2 template to apply to the generated prompts.", - ) - parser.add_argument( - "--save_path", - type=str, - default=None, - help="Path to save the generated prompts in JSONL format.", - ) - parser.add_argument( - "--print_prompts", - action="store_true", - default=False, - help="Print generated prompts if there arent more than 5.", - ) - return parser - - -if __name__ == "__main__": - parser = argparse.ArgumentParser(description="Generate prompts.") - parser = add_prompt_gen_args(parser) - args = parser.parse_args() - try: - generate_prompts(args) - except ValueError as e: - print(e) diff --git a/utils/startup_utils.py b/utils/startup_utils.py index 0da62715..05cb616f 100644 --- a/utils/startup_utils.py +++ b/utils/startup_utils.py @@ -3,17 +3,11 @@ # SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC import os -import time import logging import subprocess import psutil import signal -import requests - -from utils.prompt_client_cli import ( - get_authorization, -) logging.basicConfig( level=logging.INFO, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s" @@ -22,44 +16,6 @@ logger.setLevel(logging.INFO) -def get_api_health_url(): - DEPLOY_URL = os.getenv("DEPLOY_URL", "http://127.0.0.1") - health_url = f"{DEPLOY_URL}:{os.getenv('SERVICE_PORT', '7000')}/health" - return health_url - - -def wait_for_healthy(timeout: int = 300, interval: int = 10) -> bool: - """ - Check the health endpoint until the service is ready. - """ - health_url = get_api_health_url() - start_time = time.time() - headers = {"Authorization": f"Bearer {get_authorization()}"} - total_time_waited = 0 - while time.time() - start_time < timeout: - req_time = time.time() - try: - response = requests.get(health_url, headers=headers, timeout=interval) - if response.status_code == 200: - startup_time = time.time() - start_time - logger.info( - f"vLLM service is healthy. startup_time:= {startup_time} seconds" - ) - return True - except requests.exceptions.RequestException as e: - logger.warning(f"Health check failed: {e}") - - total_time_waited = time.time() - start_time - sleep_interval = max(2 - (time.time() - req_time), 0) - logger.info( - f"Service not ready after {total_time_waited:.2f} seconds, waiting {sleep_interval:.2f} seconds before polling ..." 
- ) - time.sleep(sleep_interval) - - logger.error(f"Service did not become healthy within {timeout} seconds") - return False - - class InferenceServerContext: def __init__(self, startup_script_path): self.startup_script_path = startup_script_path From 2467c742950bcbd70720c73482c75f9e04ec31e7 Mon Sep 17 00:00:00 2001 From: Tom Stesco Date: Thu, 5 Dec 2024 23:23:43 +0000 Subject: [PATCH 13/76] fix health endpoint --- utils/prompt_client.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/utils/prompt_client.py b/utils/prompt_client.py index 00473045..16f83936 100644 --- a/utils/prompt_client.py +++ b/utils/prompt_client.py @@ -53,7 +53,7 @@ def _get_api_completions_url(self) -> str: return f"{self._get_api_base_url()}/completions" def _get_api_health_url(self) -> str: - return f"{self._get_api_base_url()}/health" + return f"{self.env_config.deploy_url}:{self.env_config.service_port}/health" def get_health(self) -> requests.Response: return requests.get(self.health_url, headers=self.headers) @@ -78,6 +78,8 @@ def wait_for_healthy(self, timeout: int = 300, interval: int = 10) -> bool: ) self.server_ready = True return True + else: + logger.warning(f"Health check failed: {response.status_code}") except requests.exceptions.RequestException as e: logger.warning(f"Health check failed: {e}") From af5e8dc9e2f2e80936cffd64e618a88a472aceb6 Mon Sep 17 00:00:00 2001 From: Tom Stesco Date: Thu, 5 Dec 2024 23:24:33 +0000 Subject: [PATCH 14/76] add vllm_model to EnvironmentConfig instead of BatchConfig --- utils/batch_processor.py | 6 +++--- utils/prompt_configs.py | 8 +++++--- 2 files changed, 8 insertions(+), 6 deletions(-) diff --git a/utils/batch_processor.py b/utils/batch_processor.py index c8f9ea90..6805545e 100644 --- a/utils/batch_processor.py +++ b/utils/batch_processor.py @@ -123,7 +123,7 @@ def _process_single_thread( prompt_len=isl, max_tokens=self.batch_config.output_seq_lens[i], stream=self.batch_config.stream, - vll_model=self.batch_config.vllm_model, + vll_model=self.prompt_client.env_config.vllm_model, tokenizer=tokenizer, ) @@ -182,7 +182,7 @@ def _process_multi_thread( prompt_len=isl, max_tokens=self.batch_config.output_seq_lens[i], stream=self.batch_config.stream, - vll_model=self.batch_config.vllm_model, + vll_model=self.prompt_client.env_config.vllm_model, tokenizer=tokenizer, ) futures.append(future) @@ -231,7 +231,7 @@ def _process_batch_chunk( prompt_len=isl, max_tokens=self.batch_config.output_seq_lens[i], stream=self.batch_config.stream, - vll_model=self.batch_config.vllm_model, + vll_model=self.prompt_client.env_config.vllm_model, tokenizer=tokenizer, ) futures.append(future) diff --git a/utils/prompt_configs.py b/utils/prompt_configs.py index 04d0fd67..eea13670 100644 --- a/utils/prompt_configs.py +++ b/utils/prompt_configs.py @@ -14,7 +14,9 @@ class PromptConfig: num_prompts: int distribution: str = "fixed" dataset: str = "random" - tokenizer_model: str = "meta-llama/Llama-3.1-70B-Instruct" + tokenizer_model: str = os.environ.get( + "VLLM_MODEL", "meta-llama/Llama-3.1-70B-Instruct" + ) template: Optional[str] = None save_path: Optional[str] = None print_prompts: bool = False @@ -27,14 +29,14 @@ class BatchConfig: num_full_iterations: int = 1 vary_batch_size: bool = False inter_batch_delay: int = 0 - vllm_model: str = "meta-llama/Llama-3.1-70B-Instruct" stream: bool = True @dataclass class EnvironmentConfig: + vllm_model: str = os.environ.get("VLLM_MODEL", "meta-llama/Llama-3.1-70B-Instruct") authorization: Optional[str] = os.environ.get("AUTHORIZATION") 
jwt_secret: Optional[str] = os.environ.get("JWT_SECRET") deploy_url: str = os.environ.get("DEPLOY_URL", "http://127.0.0.1") - service_port: str = os.environ.get("SERVICE_PORT", "8000") + service_port: str = os.environ.get("SERVICE_PORT", "7000") cache_root: str = os.environ.get("CACHE_ROOT", ".") From 60c7ab28674aa167f30cf18f8329c69627878b0b Mon Sep 17 00:00:00 2001 From: Tom Stesco Date: Thu, 5 Dec 2024 23:25:10 +0000 Subject: [PATCH 15/76] refactor utils/capture_traces.py with new prompt_client --- utils/capture_traces.py | 79 ++++------------------------------------- 1 file changed, 7 insertions(+), 72 deletions(-) diff --git a/utils/capture_traces.py b/utils/capture_traces.py index ecc1d95d..687458c7 100644 --- a/utils/capture_traces.py +++ b/utils/capture_traces.py @@ -2,16 +2,10 @@ # # SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC -import os import logging -import argparse -from utils.prompt_generation import generate_prompts -from utils.prompt_client_cli import ( - call_inference_api, - get_api_base_url, - get_authorization, -) -from utils.startup_utils import wait_for_healthy + +from prompt_configs import EnvironmentConfig +from prompt_client import PromptClient logging.basicConfig( level=logging.INFO, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s" @@ -21,69 +15,10 @@ def capture_input_sizes(): - """ - Capture different input size graphs with the TT model on vLLM. - get_padded_prefill_len() defines the different input sizes for prefill: - https://github.com/tenstorrent/tt-metal/blob/main/models/demos/t3000/llama2_70b/tt/llama_generation.py#L341 - """ - input_sizes = [sz - 8 for sz in [32, 64, 128, 256, 512, 1024, 2048, 3072, 4096]] - prompts_per_size = 1 - output_seq_len = 1 - - base_url = get_api_base_url() - if not wait_for_healthy(base_url): - raise RuntimeError("vLLM did not start correctly!") - - api_url = f"{base_url}/completions" - headers = {"Authorization": f"Bearer {get_authorization()}"} - vllm_model = os.environ.get("VLLM_MODEL", "meta-llama/Llama-3.1-70B-Instruct") - - for size in input_sizes: - logger.info(f"Capture input size: {size}") - - args = argparse.Namespace( - tokenizer_model=vllm_model, - dataset="random", - max_prompt_length=size, - input_seq_len=size, - distribution="fixed", - template=None, - save_path=None, - print_prompts=False, - num_prompts=prompts_per_size, - ) - - prompts, prompt_lengths = generate_prompts(args) - - for i, (prompt, prompt_len) in enumerate(zip(prompts, prompt_lengths)): - try: - response_data = call_inference_api( - prompt=prompt, - response_idx=i, - prompt_len=prompt_len, - stream=True, - headers=headers, - api_url=api_url, - max_tokens=output_seq_len, - vll_model=vllm_model, - tokenizer=None, - ) - - logger.info( - f"Input size: {size}, input_seq_len: {prompt_len}, TTFT: {response_data['ttft']:.3f}s" - ) - - except Exception as e: - logger.error(f"Error processing prompt: {e}") - - -def main(): - try: - capture_input_sizes() - except Exception as e: - logger.error(f"Capturing input sizes failed: {e}") - raise + env_config = EnvironmentConfig() + prompt_client = PromptClient(env_config) + prompt_client.capture_traces() if __name__ == "__main__": - main() + capture_input_sizes() From 10993a2a98667aab00903042d66976070dd21367 Mon Sep 17 00:00:00 2001 From: Tom Stesco Date: Thu, 5 Dec 2024 23:57:26 +0000 Subject: [PATCH 16/76] fix utils imports --- utils/batch_processor.py | 4 ++-- utils/capture_traces.py | 4 ++-- utils/prompt_client.py | 4 ++-- utils/prompt_client_cli.py | 8 ++++---- utils/prompt_generation.py | 2 
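With the fixes above, all connection settings come from environment variables via `EnvironmentConfig`, and the health probe targets the server root while completions stay under the OpenAI-style `/v1` prefix. A hedged sketch of the resulting URL layout, using the defaults visible in the dataclass:

```
import os

deploy_url = os.environ.get("DEPLOY_URL", "http://127.0.0.1")
service_port = os.environ.get("SERVICE_PORT", "7000")

health_url = f"{deploy_url}:{service_port}/health"               # no /v1 prefix
completions_url = f"{deploy_url}:{service_port}/v1/completions"  # API routes live under /v1
```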
+- 5 files changed, 11 insertions(+), 11 deletions(-) diff --git a/utils/batch_processor.py b/utils/batch_processor.py index 6805545e..35ab6652 100644 --- a/utils/batch_processor.py +++ b/utils/batch_processor.py @@ -14,8 +14,8 @@ import numpy as np from transformers import AutoTokenizer -from prompt_configs import BatchConfig -from prompt_client import PromptClient +from utils.prompt_configs import BatchConfig +from utils.prompt_client import PromptClient logging.basicConfig( level=logging.INFO, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s" diff --git a/utils/capture_traces.py b/utils/capture_traces.py index 687458c7..f3703b1f 100644 --- a/utils/capture_traces.py +++ b/utils/capture_traces.py @@ -4,8 +4,8 @@ import logging -from prompt_configs import EnvironmentConfig -from prompt_client import PromptClient +from utils.prompt_configs import EnvironmentConfig +from utils.prompt_client import PromptClient logging.basicConfig( level=logging.INFO, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s" diff --git a/utils/prompt_client.py b/utils/prompt_client.py index 16f83936..455921d3 100644 --- a/utils/prompt_client.py +++ b/utils/prompt_client.py @@ -11,8 +11,8 @@ import jwt from transformers import AutoTokenizer -from prompt_generation import generate_prompts -from prompt_configs import PromptConfig, EnvironmentConfig +from utils.prompt_generation import generate_prompts +from utils.prompt_configs import PromptConfig, EnvironmentConfig logging.basicConfig( level=logging.INFO, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s" diff --git a/utils/prompt_client_cli.py b/utils/prompt_client_cli.py index c671ca33..8ebc7124 100644 --- a/utils/prompt_client_cli.py +++ b/utils/prompt_client_cli.py @@ -9,10 +9,10 @@ import numpy as np from transformers import AutoTokenizer -from prompt_configs import PromptConfig, BatchConfig, EnvironmentConfig -from prompt_client import PromptClient -from batch_processor import BatchProcessor -from prompt_generation import generate_prompts +from utils.prompt_configs import PromptConfig, BatchConfig, EnvironmentConfig +from utils.prompt_client import PromptClient +from utils.batch_processor import BatchProcessor +from utils.prompt_generation import generate_prompts logging.basicConfig( level=logging.INFO, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s" diff --git a/utils/prompt_generation.py b/utils/prompt_generation.py index 07269531..a351eded 100644 --- a/utils/prompt_generation.py +++ b/utils/prompt_generation.py @@ -13,7 +13,7 @@ from datasets import load_dataset from transformers import AutoTokenizer -from prompt_configs import PromptConfig +from utils.prompt_configs import PromptConfig logging.basicConfig( From 20ccdf4855dd5accc7a91d939d3b12091f8e642c Mon Sep 17 00:00:00 2001 From: Tom Stesco Date: Fri, 6 Dec 2024 03:51:38 +0000 Subject: [PATCH 17/76] fix BatchConfig usage --- utils/prompt_client_cli.py | 1 - 1 file changed, 1 deletion(-) diff --git a/utils/prompt_client_cli.py b/utils/prompt_client_cli.py index 8ebc7124..3d74f8f5 100644 --- a/utils/prompt_client_cli.py +++ b/utils/prompt_client_cli.py @@ -146,7 +146,6 @@ def main(): num_full_iterations=args.num_full_iterations, vary_batch_size=args.vary_batch_size, inter_batch_delay=args.inter_batch_delay, - vllm_model=args.vllm_model, stream=not args.no_stream, ) From eab7e7682e69d66c2754e10334a6e9ebee7352fa Mon Sep 17 00:00:00 2001 From: Tom Stesco Date: Fri, 6 Dec 2024 03:58:07 +0000 Subject: [PATCH 18/76] add benchmarking/online_benchmark_prompt_client.py 
using prompt_client.py --- .../online_benchmark_prompt_client.py | 208 ++++++++++++++++++ 1 file changed, 208 insertions(+) create mode 100644 benchmarking/online_benchmark_prompt_client.py diff --git a/benchmarking/online_benchmark_prompt_client.py b/benchmarking/online_benchmark_prompt_client.py new file mode 100644 index 00000000..7f65c7fd --- /dev/null +++ b/benchmarking/online_benchmark_prompt_client.py @@ -0,0 +1,208 @@ +#!/usr/bin/env python3 + +# SPDX-License-Identifier: Apache-2.0 +# +# SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC + +import logging +import numpy as np +from typing import List, Dict, Tuple +import json +from datetime import datetime +from pathlib import Path + +from utils.prompt_configs import PromptConfig, BatchConfig, EnvironmentConfig +from utils.prompt_client import PromptClient +from utils.batch_processor import BatchProcessor +from utils.prompt_generation import generate_prompts +from transformers import AutoTokenizer + +logging.basicConfig( + level=logging.INFO, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s" +) +logger = logging.getLogger(__name__) +logger.setLevel(logging.INFO) + + +def get_test_combinations( + context_lens: List[Tuple[int, int]], +) -> List[Dict[str, int]]: + combinations = [] + for input_len, output_len in context_lens: + # Skip invalid combinations where output_len > input_len + context = input_len + output_len + if context <= 4096: + bsz = 32 + elif context <= 8192: + bsz = 16 + else: + bsz = 1 + + num_prompts = bsz * 4 + combinations.append( + { + "input_len": input_len, + "output_len": output_len, + "batch_size": bsz, + "num_prompts": num_prompts, + } + ) + + # Log total number of combinations + logger.info(f"Generated {len(combinations)} valid test combinations") + for i, combo in enumerate(combinations, 1): + logger.info( + f"Combination {i}: input_len={combo['input_len']}, " + f"output_len={combo['output_len']}, batch_size={combo['batch_size']}" + ) + + return combinations + + +def run_sequence_length_test( + combinations: List[Dict[str, int]], + save_dir: str, + file_prefix: str, + num_iterations: int = 1, + model: str = "meta-llama/Llama-3.1-70B-Instruct", +) -> List[dict]: + # Create save directory + save_path = Path(save_dir) + save_path.mkdir(parents=True, exist_ok=True) + + # Initialize configurations + env_config = EnvironmentConfig(vllm_model=model) + prompt_client = PromptClient(env_config) + + # Initialize results storage + all_results = [] + timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S") + results_file = save_path / f"{file_prefix}_{timestamp}.json" + + # Test all combinations + total_combinations = len(combinations) + for idx, params in enumerate(combinations, 1): + input_len = params["input_len"] + output_len = params["output_len"] + batch_size = params["batch_size"] + num_prompts = params["num_prompts"] + + logger.info( + f"\nTesting combination {idx}/{total_combinations}:\n" + f"input_len={input_len}, output_len={output_len}, " + f"batch_size={batch_size}, num_prompts={num_prompts}" + ) + + # Configure prompt generation + prompt_config = PromptConfig( + input_seq_len=input_len, + max_prompt_length=input_len, + num_prompts=num_prompts, + distribution="fixed", + dataset="random", + tokenizer_model=model, + template=None, + save_path=None, + print_prompts=False, + ) + + # Generate prompts + prompts, input_seq_lengths = generate_prompts(prompt_config) + + # Configure batch processing + output_seq_lens = [output_len] * num_prompts + batch_config = BatchConfig( + batch_size=batch_size, + 
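The combination builder above sizes batches purely from total context length. A small restatement of that rule, useful when adding new (ISL, OSL) pairs; the helper name here is made up for illustration:

```
def pick_batch_size(input_len, output_len):
    # Restates the heuristic in get_test_combinations() above.
    context = input_len + output_len
    if context <= 4096:
        return 32
    elif context <= 8192:
        return 16
    return 1

assert pick_batch_size(2048, 2048) == 32   # 4096 tokens total -> full batch
assert pick_batch_size(5000, 500) == 16    # 5500 tokens total -> reduced batch
assert pick_batch_size(20000, 2000) == 1   # long context -> single stream
```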
output_seq_lens=output_seq_lens, + num_full_iterations=num_iterations, + vary_batch_size=False, + inter_batch_delay=0, + stream=True, + ) + + # Initialize processor and tokenizer + batch_processor = BatchProcessor(prompt_client, batch_config) + tokenizer = AutoTokenizer.from_pretrained(model) + + # Process batches + try: + responses = batch_processor.process_batch( + prompts=prompts, + input_seq_lengths=input_seq_lengths, + tokenizer=tokenizer, + ) + + # Calculate statistics + stats = { + "input_seq_len": input_len, + "output_seq_len": output_len, + "batch_size": batch_size, + "mean_decode_tps": np.mean([r["decode_tps"] for r in responses]), + "mean_total_tps": np.mean([r["total_tps"] for r in responses]), + "mean_ttft": np.mean([r["ttft"] for r in responses]), + "std_decode_tps": np.std([r["decode_tps"] for r in responses]), + "std_total_tps": np.std([r["total_tps"] for r in responses]), + "std_ttft": np.std([r["ttft"] for r in responses]), + "num_prompts": num_prompts, + "num_iterations": num_iterations, + "timestamp": timestamp, + "combination_index": idx, + } + + all_results.append(stats) + + # Log results + logger.info( + f"Results for combination {idx}/{total_combinations}:\n" + f"Mean Decode TPS: {stats['mean_decode_tps']:.2f} ± " + f"{stats['std_decode_tps']:.2f}\n" + f"Mean Total TPS: {stats['mean_total_tps']:.2f} ± " + f"{stats['std_total_tps']:.2f}\n" + f"Mean TTFT: {stats['mean_ttft']:.2f} ± {stats['std_ttft']:.2f}" + ) + + # Save results after each combination + with open(results_file, "w") as f: + json.dump(all_results, f, indent=4) + + except Exception as e: + logger.error(f"Error processing combination {idx}: {e}") + continue + + return all_results + + +if __name__ == "__main__": + # Define parameter ranges + typical_context_lens = [ + (128, 128), + (128, 2048), + (128, 4096), + (2048, 128), + (2048, 2048), + (1000, 1000), + (500, 2000), + (5000, 500), + (20000, 2000), + ] + extra_context_lengths = [ + (128, 2), + (256, 2), + (512, 32), + (1000, 24), + (2000, 32), + (4000, 32), + (8100, 32), + (130000, 1024), + ] + # Generate all valid combinations upfront + combinations = get_test_combinations( + context_lens=typical_context_lens + extra_context_lengths, + ) + + # Run tests + results = run_sequence_length_test( + combinations=combinations, + save_dir="online_benchmarking", + file_prefix="online_benchmark_results", + ) From 90acdf6c46c0855abc8cd1d9ce556bac12bc55cf Mon Sep 17 00:00:00 2001 From: Tom Stesco Date: Fri, 6 Dec 2024 05:19:25 +0000 Subject: [PATCH 19/76] add benchmarking/online_benchmark_prompt_client.py using prompt_client.py --- benchmarking/README.md | 52 ++++++++ benchmarking/benchmark_serving.patch | 26 ++++ .../online_benchmark_prompt_client.py | 9 +- benchmarking/vllm_online_benchmark.py | 124 ++++++++++++++++++ 4 files changed, 207 insertions(+), 4 deletions(-) create mode 100644 benchmarking/benchmark_serving.patch create mode 100644 benchmarking/vllm_online_benchmark.py diff --git a/benchmarking/README.md b/benchmarking/README.md index 38ee6b60..56fab404 100644 --- a/benchmarking/README.md +++ b/benchmarking/README.md @@ -36,3 +36,55 @@ python examples/offline_inference_tt.py --measure_perf --max_seqs_in_batch 32 -- - `--max_seqs_in_batch` (default: `32`): - **Maximum batch size** for inference, determining the number of prompts processed in parallel. 
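Each combination in the benchmark above is appended to a timestamped JSON file as soon as it finishes, so partial runs remain usable. A hedged snippet for loading and skimming those results afterwards (the file name is a hypothetical example; the keys mirror the stats dict above):

```
import json

with open("online_benchmarking/online_benchmark_results_2024-12-06_00-00-00.json") as f:
    results = json.load(f)

for r in results:
    print(
        f"isl={r['input_seq_len']} osl={r['output_seq_len']} bsz={r['batch_size']} "
        f"ttft={r['mean_ttft']:.2f}s decode_tps={r['mean_decode_tps']:.2f}"
    )
```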
+### Online Benchmarking + +#### using vllm/benchmarking/benchmark_serving.py + +use the benchmark_serving.patch file: +``` +cd ~/vllm +git apply benchmark_serving.patch +``` +This simply stops the benchmarking script from sending the `best_of` arg which is not supported and causes issues. + +To run the benchmarks: +``` +cd ~/app +export PYTHONPATH=$PYTHONPATH:$PWD +python benchmarking/vllm_online_benchmark.py +``` + +The output will be available for each input/output sequence length defined and time stamped. + +Results are also printed to stdout, for example with mock data results: +``` +================================================== + Benchmark Result +================================================== +Successful requests: 32 +Benchmark duration (s): 0.39 +Total input tokens: 4096 +Total generated tokens: 64 +Request throughput (req/s): 83.04 +Output token throughput (tok/s): 166.07 +Total Token throughput (tok/s): 10794.77 +-------------------------------------------------- + Time to First Token +-------------------------------------------------- +Mean TTFT (ms): 358.26 +Median TTFT (ms): 358.45 +P99 TTFT (ms): 361.67 +-------------------------------------------------- + Time per Output Token (excl. 1st token) +-------------------------------------------------- +Mean TPOT (ms): 14.03 +Median TPOT (ms): 14.13 +P99 TPOT (ms): 14.30 +-------------------------------------------------- + Inter-token Latency +-------------------------------------------------- +Mean ITL (ms): 7.86 +Median ITL (ms): 7.83 +P99 ITL (ms): 8.05 +================================================== +``` diff --git a/benchmarking/benchmark_serving.patch b/benchmarking/benchmark_serving.patch new file mode 100644 index 00000000..bb90b431 --- /dev/null +++ b/benchmarking/benchmark_serving.patch @@ -0,0 +1,26 @@ +diff --git a/benchmarks/benchmark_serving.py b/benchmarks/benchmark_serving.py +index c1a396c8..74f75a15 100644 +--- a/benchmarks/benchmark_serving.py ++++ b/benchmarks/benchmark_serving.py +@@ -22,6 +22,12 @@ On the client side, run: + --endpoint /generate_stream + to the end of the command above. 
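For reading the sample report above: TTFT covers prefill up to the first streamed token, TPOT spreads the remaining latency over the remaining tokens, and ITL measures gaps between consecutive tokens. A hedged sketch of those definitions (general formulas, not necessarily vLLM's exact implementation):

```
def per_request_latency_metrics(latency_s, ttft_s, output_tokens, token_arrival_times):
    # TPOT: time per output token, excluding the first token.
    tpot_s = (latency_s - ttft_s) / max(output_tokens - 1, 1)
    # ITL: inter-token latency between consecutive streamed tokens.
    itls = [b - a for a, b in zip(token_arrival_times, token_arrival_times[1:])]
    mean_itl_s = sum(itls) / max(len(itls), 1)
    return {"ttft_ms": ttft_s * 1e3, "tpot_ms": tpot_s * 1e3, "mean_itl_ms": mean_itl_s * 1e3}
```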
+ """ ++import sys ++from unittest.mock import MagicMock ++# mock out ttnn fully so we can import ttnn without using it ++sys.modules["ttnn"] = MagicMock() ++sys.modules["ttnn.device"] = MagicMock() ++ + import argparse + import asyncio + import base64 +@@ -417,7 +423,7 @@ async def benchmark( + prompt_len=test_prompt_len, + output_len=test_output_len, + logprobs=logprobs, +- best_of=best_of, ++ best_of=None, + multi_modal_content=test_mm_content, + ignore_eos=ignore_eos, + ) \ No newline at end of file diff --git a/benchmarking/online_benchmark_prompt_client.py b/benchmarking/online_benchmark_prompt_client.py index 7f65c7fd..22812bbd 100644 --- a/benchmarking/online_benchmark_prompt_client.py +++ b/benchmarking/online_benchmark_prompt_client.py @@ -70,15 +70,16 @@ def run_sequence_length_test( save_path = Path(save_dir) save_path.mkdir(parents=True, exist_ok=True) - # Initialize configurations - env_config = EnvironmentConfig(vllm_model=model) - prompt_client = PromptClient(env_config) - # Initialize results storage all_results = [] timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S") results_file = save_path / f"{file_prefix}_{timestamp}.json" + # Initialize configurations + env_config = EnvironmentConfig(vllm_model=model) + prompt_client = PromptClient(env_config) + prompt_client.capture_traces() + # Test all combinations total_combinations = len(combinations) for idx, params in enumerate(combinations, 1): diff --git a/benchmarking/vllm_online_benchmark.py b/benchmarking/vllm_online_benchmark.py new file mode 100644 index 00000000..0315129f --- /dev/null +++ b/benchmarking/vllm_online_benchmark.py @@ -0,0 +1,124 @@ +# SPDX-License-Identifier: Apache-2.0 +# +# SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC + +import os +import subprocess +import time +import logging +from typing import Dict +from pathlib import Path + +from benchmarking.online_benchmark_prompt_client import get_test_combinations +from utils.prompt_configs import EnvironmentConfig +from utils.prompt_client import PromptClient + +# Configure logging +logging.basicConfig( + level=logging.INFO, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s" +) +logger = logging.getLogger(__name__) + + +def run_benchmark( + params: Dict[str, int], + model: str, + port: int, + benchmark_script: str, + result_dir: Path, +) -> None: + """Run a single benchmark with the given parameters.""" + # fmt: off + cmd = [ + "python", benchmark_script, + "--backend", "vllm", + "--model", model, + "--port", str(port), + "--dataset-name", "random", + "--num-prompts", str(params["batch_size"]), + "--random-input-len", str(params["input_len"]), + "--random-output-len", str(params["output_len"]), + "--save-result", + "--result-dir", str(result_dir) + ] + # fmt: on + + logger.info(f"Running benchmark with parameters: {params}") + logger.info(f"Command: {' '.join(cmd)}") + + try: + subprocess.run(cmd, check=True) + logger.info("Benchmark completed successfully") + except subprocess.CalledProcessError as e: + logger.error(f"Benchmark failed with error: {e}") + except Exception as e: + logger.error(f"Unexpected error during benchmark: {e}") + + # Add a small delay between runs to ensure system stability + time.sleep(2) + + +def main(): + # Configuration + env_config = EnvironmentConfig() + + # Create output directory + result_dir = Path("vllm_online_benchmark_results") + result_dir.mkdir(parents=True, exist_ok=True) + + prompt_client = PromptClient(env_config) + # note: there isnt a better way to pass an api key to the vllm benchmarking script 
+ os.environ["OPENAI_API_KEY"] = prompt_client._get_authorization() + + # Define benchmarking parameters + typical_context_lens = [ + (128, 128), + (128, 2048), + (128, 4096), + (2048, 128), + (2048, 2048), + (1000, 1000), + (500, 2000), + (5000, 500), + (20000, 2000), + ] + extra_context_lengths = [ + (128, 2), + (256, 2), + (512, 32), + (1000, 24), + (2000, 32), + (4000, 32), + (8100, 32), + (130000, 1024), + ] + + # Get all benchmark combinations using the original function + combinations = get_test_combinations( + context_lens=typical_context_lens + extra_context_lengths, + ) + + # Log benchmark plan + logger.info(f"Starting benchmark suite with {len(combinations)} combinations") + for i, combo in enumerate(combinations, 1): + logger.info(f"Combination {i}: {combo}") + + # ensure vllm server is ready + prompt_client.capture_traces() + + # Run benchmarks + for i, params in enumerate(combinations, 1): + logger.info(f"\nRunning benchmark {i}/{len(combinations)}") + run_benchmark( + benchmark_script="/home/user/vllm/benchmarks/benchmark_serving.py", + params=params, + model=env_config.vllm_model, + port=env_config.service_port, + result_dir=result_dir, + ) + + logger.info("Benchmark suite completed") + + +if __name__ == "__main__": + main() From ec486ad5595fc807b2fe3e4e469844e766b9cdbb Mon Sep 17 00:00:00 2001 From: Tom Stesco Date: Fri, 6 Dec 2024 05:22:48 +0000 Subject: [PATCH 20/76] add benchmarking, evals, and tests dirs to Dockerfile --- vllm-tt-metal-llama3-70b/vllm.llama3.src.Dockerfile | 3 +++ 1 file changed, 3 insertions(+) diff --git a/vllm-tt-metal-llama3-70b/vllm.llama3.src.Dockerfile b/vllm-tt-metal-llama3-70b/vllm.llama3.src.Dockerfile index 465004e7..2184d356 100644 --- a/vllm-tt-metal-llama3-70b/vllm.llama3.src.Dockerfile +++ b/vllm-tt-metal-llama3-70b/vllm.llama3.src.Dockerfile @@ -99,6 +99,9 @@ ENV PYTHONPATH=${PYTHONPATH}:${APP_DIR} COPY --chown=user:user "vllm-tt-metal-llama3-70b/src" "${APP_DIR}/src" COPY --chown=user:user "vllm-tt-metal-llama3-70b/requirements.txt" "${APP_DIR}/requirements.txt" COPY --chown=user:user "utils" "${APP_DIR}/utils" +COPY --chown=user:user "benchmarking" "${APP_DIR}/benchmarking" +COPY --chown=user:user "evals" "${APP_DIR}/evals" +COPY --chown=user:user "tests" "${APP_DIR}/tests" RUN /bin/bash -c "source ${PYTHON_ENV_DIR}/bin/activate \ && pip install --default-timeout=240 --no-cache-dir -r requirements.txt" From c58d7b365ee1c5728a0fc8bfd8f093668ad5dddd Mon Sep 17 00:00:00 2001 From: Tom Stesco Date: Fri, 6 Dec 2024 05:34:50 +0000 Subject: [PATCH 21/76] update patchfile and benchmarking README.md with commands --- benchmarking/README.md | 2 +- benchmarking/benchmark_serving.patch | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/benchmarking/README.md b/benchmarking/README.md index 56fab404..fe27a798 100644 --- a/benchmarking/README.md +++ b/benchmarking/README.md @@ -43,7 +43,7 @@ python examples/offline_inference_tt.py --measure_perf --max_seqs_in_batch 32 -- use the benchmark_serving.patch file: ``` cd ~/vllm -git apply benchmark_serving.patch +git apply ~/app/benchmarking/benchmark_serving.patch ``` This simply stops the benchmarking script from sending the `best_of` arg which is not supported and causes issues. 
diff --git a/benchmarking/benchmark_serving.patch b/benchmarking/benchmark_serving.patch index bb90b431..c6cd2994 100644 --- a/benchmarking/benchmark_serving.patch +++ b/benchmarking/benchmark_serving.patch @@ -23,4 +23,4 @@ index c1a396c8..74f75a15 100644 + best_of=None, multi_modal_content=test_mm_content, ignore_eos=ignore_eos, - ) \ No newline at end of file + ) From fe4f96d302de31b524c1850148585bab7be98db9 Mon Sep 17 00:00:00 2001 From: Tom Stesco Date: Fri, 6 Dec 2024 05:34:58 +0000 Subject: [PATCH 22/76] update Docker IMAGE_VERSION to v0.0.3 --- vllm-tt-metal-llama3-70b/docs/development.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm-tt-metal-llama3-70b/docs/development.md b/vllm-tt-metal-llama3-70b/docs/development.md index 2b21a730..55d8b1d3 100644 --- a/vllm-tt-metal-llama3-70b/docs/development.md +++ b/vllm-tt-metal-llama3-70b/docs/development.md @@ -18,7 +18,7 @@ export TT_METAL_COMMIT_SHA_OR_TAG=385904186f81fed15d5c87c162221d4f34387164 export TT_METAL_COMMIT_DOCKER_TAG=${TT_METAL_COMMIT_SHA_OR_TAG:0:12} export TT_VLLM_COMMIT_SHA_OR_TAG=384f1790c3be16e1d1b10de07252be2e66d00935 export TT_VLLM_COMMIT_DOCKER_TAG=${TT_VLLM_COMMIT_SHA_OR_TAG:0:12} -export IMAGE_VERSION=v0.0.2 +export IMAGE_VERSION=v0.0.3 docker build \ -t ghcr.io/tenstorrent/tt-inference-server/tt-metal-llama3-70b-src-base-vllm:${IMAGE_VERSION}-tt-metal-${TT_METAL_COMMIT_DOCKER_TAG}-${TT_VLLM_COMMIT_DOCKER_TAG} \ --build-arg TT_METAL_DOCKERFILE_VERSION=${TT_METAL_DOCKERFILE_VERSION} \ From f3d815ad52f14a40822b96a90c942d618cb33346 Mon Sep 17 00:00:00 2001 From: Tom Stesco Date: Fri, 6 Dec 2024 05:39:55 +0000 Subject: [PATCH 23/76] improve doc --- benchmarking/README.md | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/benchmarking/README.md b/benchmarking/README.md index fe27a798..ea06674d 100644 --- a/benchmarking/README.md +++ b/benchmarking/README.md @@ -39,15 +39,16 @@ python examples/offline_inference_tt.py --measure_perf --max_seqs_in_batch 32 -- ### Online Benchmarking #### using vllm/benchmarking/benchmark_serving.py - -use the benchmark_serving.patch file: +Within the Docker container, use the benchmark_serving.patch file: ``` cd ~/vllm git apply ~/app/benchmarking/benchmark_serving.patch +cd /home/user/app/src +python run_vllm_api_server.py ``` This simply stops the benchmarking script from sending the `best_of` arg which is not supported and causes issues. -To run the benchmarks: +To run the benchmarks, in another shell into the Docker container: ``` cd ~/app export PYTHONPATH=$PYTHONPATH:$PWD From 8246a72abb1a82125910a2eaff4807323115cb1e Mon Sep 17 00:00:00 2001 From: Tom Stesco Date: Fri, 6 Dec 2024 05:53:45 +0000 Subject: [PATCH 24/76] update benchmark_serving.patch --- benchmarking/README.md | 6 +++--- benchmarking/benchmark_serving.patch | 11 ++++++++++- 2 files changed, 13 insertions(+), 4 deletions(-) diff --git a/benchmarking/README.md b/benchmarking/README.md index ea06674d..641436d4 100644 --- a/benchmarking/README.md +++ b/benchmarking/README.md @@ -41,15 +41,15 @@ python examples/offline_inference_tt.py --measure_perf --max_seqs_in_batch 32 -- #### using vllm/benchmarking/benchmark_serving.py Within the Docker container, use the benchmark_serving.patch file: ``` -cd ~/vllm -git apply ~/app/benchmarking/benchmark_serving.patch -cd /home/user/app/src +cd ~/app/src python run_vllm_api_server.py ``` This simply stops the benchmarking script from sending the `best_of` arg which is not supported and causes issues. 
To run the benchmarks, in another shell into the Docker container: ``` +cd ~/vllm +git apply ~/app/benchmarking/benchmark_serving.patch cd ~/app export PYTHONPATH=$PYTHONPATH:$PWD python benchmarking/vllm_online_benchmark.py diff --git a/benchmarking/benchmark_serving.patch b/benchmarking/benchmark_serving.patch index c6cd2994..88a4b94d 100644 --- a/benchmarking/benchmark_serving.patch +++ b/benchmarking/benchmark_serving.patch @@ -1,5 +1,5 @@ diff --git a/benchmarks/benchmark_serving.py b/benchmarks/benchmark_serving.py -index c1a396c8..74f75a15 100644 +index c1a396c8..463e0e93 100644 --- a/benchmarks/benchmark_serving.py +++ b/benchmarks/benchmark_serving.py @@ -22,6 +22,12 @@ On the client side, run: @@ -24,3 +24,12 @@ index c1a396c8..74f75a15 100644 multi_modal_content=test_mm_content, ignore_eos=ignore_eos, ) +@@ -458,7 +464,7 @@ async def benchmark( + prompt_len=prompt_len, + output_len=output_len, + logprobs=logprobs, +- best_of=best_of, ++ best_of=None, + multi_modal_content=mm_content, + ignore_eos=ignore_eos) + tasks.append( From 765c4be6a15ca661e198944110b3227c76dc2696 Mon Sep 17 00:00:00 2001 From: Tom Stesco Date: Fri, 6 Dec 2024 06:08:05 +0000 Subject: [PATCH 25/76] add tt_model_runner.py patch for best_of --- benchmarking/benchmark_serving.patch | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/benchmarking/benchmark_serving.patch b/benchmarking/benchmark_serving.patch index 88a4b94d..fb5cb7f7 100644 --- a/benchmarking/benchmark_serving.patch +++ b/benchmarking/benchmark_serving.patch @@ -33,3 +33,27 @@ index c1a396c8..463e0e93 100644 multi_modal_content=mm_content, ignore_eos=ignore_eos) tasks.append( +diff --git a/vllm/worker/tt_model_runner.py b/vllm/worker/tt_model_runner.py +index 1c586dd3..505e4b84 100644 +--- a/vllm/worker/tt_model_runner.py ++++ b/vllm/worker/tt_model_runner.py +@@ -425,10 +425,15 @@ class TTModelRunner(ModelRunnerBase[TTModelInput]): + ) + + def _validate_sampling_params(self, sampling_params): +- assert sampling_params.n == 1, "Currently only supporting n=1" +- assert sampling_params.best_of is None, "Currently not supporting best_of" +- assert sampling_params.logprobs is None, "Currently not supporting logprobs" +- assert sampling_params.prompt_logprobs is None, "Currently not supporting prompt_logprobs" ++ # if sampling_params.n != 1: ++ # raise ValueError("Currently only supporting n=1") ++ # if sampling_params.best_of is not None: ++ # raise ValueError("Currently not supporting best_of") ++ # if sampling_params.logprobs is not None: ++ # raise ValueError("Currently not supporting logprobs") ++ # if sampling_params.prompt_logprobs is not None: ++ # raise ValueError("Currently not supporting prompt_logprobs") ++ return + + ## Destructor (used to delete ttnn trace if using trace mode) + \ No newline at end of file From b93370d4fbf430d83e88c128e56d9f758f4a1a17 Mon Sep 17 00:00:00 2001 From: Tom Stesco Date: Fri, 6 Dec 2024 06:23:06 +0000 Subject: [PATCH 26/76] update benchmarking/benchmark_serving.patch --- benchmarking/benchmark_serving.patch | 20 +++++++------------- 1 file changed, 7 insertions(+), 13 deletions(-) diff --git a/benchmarking/benchmark_serving.patch b/benchmarking/benchmark_serving.patch index fb5cb7f7..f393b6bc 100644 --- a/benchmarking/benchmark_serving.patch +++ b/benchmarking/benchmark_serving.patch @@ -34,10 +34,10 @@ index c1a396c8..463e0e93 100644 ignore_eos=ignore_eos) tasks.append( diff --git a/vllm/worker/tt_model_runner.py b/vllm/worker/tt_model_runner.py -index 1c586dd3..505e4b84 100644 +index 
1c586dd3..2e77bf72 100644 --- a/vllm/worker/tt_model_runner.py +++ b/vllm/worker/tt_model_runner.py -@@ -425,10 +425,15 @@ class TTModelRunner(ModelRunnerBase[TTModelInput]): +@@ -425,12 +425,7 @@ class TTModelRunner(ModelRunnerBase[TTModelInput]): ) def _validate_sampling_params(self, sampling_params): @@ -45,15 +45,9 @@ index 1c586dd3..505e4b84 100644 - assert sampling_params.best_of is None, "Currently not supporting best_of" - assert sampling_params.logprobs is None, "Currently not supporting logprobs" - assert sampling_params.prompt_logprobs is None, "Currently not supporting prompt_logprobs" -+ # if sampling_params.n != 1: -+ # raise ValueError("Currently only supporting n=1") -+ # if sampling_params.best_of is not None: -+ # raise ValueError("Currently not supporting best_of") -+ # if sampling_params.logprobs is not None: -+ # raise ValueError("Currently not supporting logprobs") -+ # if sampling_params.prompt_logprobs is not None: -+ # raise ValueError("Currently not supporting prompt_logprobs") +- +- ## Destructor (used to delete ttnn trace if using trace mode) + return - - ## Destructor (used to delete ttnn trace if using trace mode) - \ No newline at end of file + + def __del__(self): + if self.trace_mode and self.execute_trace_kwargs is not None: From 5e07baac76394f0ac04b3deb5fb052e9bee35bdf Mon Sep 17 00:00:00 2001 From: Tom Stesco Date: Fri, 6 Dec 2024 06:34:43 +0000 Subject: [PATCH 27/76] use CACHE_ROOT for vllm_online_benchmark_results dir --- benchmarking/vllm_online_benchmark.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/benchmarking/vllm_online_benchmark.py b/benchmarking/vllm_online_benchmark.py index 0315129f..97291699 100644 --- a/benchmarking/vllm_online_benchmark.py +++ b/benchmarking/vllm_online_benchmark.py @@ -63,7 +63,8 @@ def main(): env_config = EnvironmentConfig() # Create output directory - result_dir = Path("vllm_online_benchmark_results") + cache_dir = Path(os.environ.get("CACHE_ROOT", "")) + result_dir = cache_dir / "vllm_online_benchmark_results" result_dir.mkdir(parents=True, exist_ok=True) prompt_client = PromptClient(env_config) From d0e0b0fac21f631f45b7f65c92fe9c3acd1032f5 Mon Sep 17 00:00:00 2001 From: Tom Stesco Date: Mon, 9 Dec 2024 15:55:33 +0000 Subject: [PATCH 28/76] adding timestamped online benchmark run result directory, rps=1 for vllm online benchmark script --- .../online_benchmark_prompt_client.py | 12 ++++++--- benchmarking/vllm_online_benchmark.py | 27 ++++++++++++------- 2 files changed, 25 insertions(+), 14 deletions(-) diff --git a/benchmarking/online_benchmark_prompt_client.py b/benchmarking/online_benchmark_prompt_client.py index 22812bbd..ad81230b 100644 --- a/benchmarking/online_benchmark_prompt_client.py +++ b/benchmarking/online_benchmark_prompt_client.py @@ -38,7 +38,7 @@ def get_test_combinations( else: bsz = 1 - num_prompts = bsz * 4 + num_prompts = max(bsz * 4, 4) combinations.append( { "input_len": input_len, @@ -67,13 +67,12 @@ def run_sequence_length_test( model: str = "meta-llama/Llama-3.1-70B-Instruct", ) -> List[dict]: # Create save directory - save_path = Path(save_dir) + timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S") + save_path = Path(save_dir) / f"results_{timestamp}" save_path.mkdir(parents=True, exist_ok=True) # Initialize results storage all_results = [] - timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S") - results_file = save_path / f"{file_prefix}_{timestamp}.json" # Initialize configurations env_config = EnvironmentConfig(vllm_model=model) @@ -87,6 +86,11 @@ def 
run_sequence_length_test( output_len = params["output_len"] batch_size = params["batch_size"] num_prompts = params["num_prompts"] + run_timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S") + results_file = ( + save_path + / f"{file_prefix}_isl-{input_len}_osl-{output_len}_bsz-{batch_size}_n-{num_prompts}_{run_timestamp}.json" + ) logger.info( f"\nTesting combination {idx}/{total_combinations}:\n" diff --git a/benchmarking/vllm_online_benchmark.py b/benchmarking/vllm_online_benchmark.py index 97291699..c69ce064 100644 --- a/benchmarking/vllm_online_benchmark.py +++ b/benchmarking/vllm_online_benchmark.py @@ -6,6 +6,7 @@ import subprocess import time import logging +from datetime import datetime from typing import Dict from pathlib import Path @@ -25,7 +26,7 @@ def run_benchmark( model: str, port: int, benchmark_script: str, - result_dir: Path, + result_filename: Path, ) -> None: """Run a single benchmark with the given parameters.""" # fmt: off @@ -34,12 +35,13 @@ def run_benchmark( "--backend", "vllm", "--model", model, "--port", str(port), + "--request-rate", "1", "--dataset-name", "random", "--num-prompts", str(params["batch_size"]), "--random-input-len", str(params["input_len"]), "--random-output-len", str(params["output_len"]), "--save-result", - "--result-dir", str(result_dir) + "--result-filename", str(result_filename) ] # fmt: on @@ -64,7 +66,8 @@ def main(): # Create output directory cache_dir = Path(os.environ.get("CACHE_ROOT", "")) - result_dir = cache_dir / "vllm_online_benchmark_results" + timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S") + result_dir = cache_dir / "vllm_online_benchmark_results" / f"results_{timestamp}" result_dir.mkdir(parents=True, exist_ok=True) prompt_client = PromptClient(env_config) @@ -91,7 +94,7 @@ def main(): (2000, 32), (4000, 32), (8100, 32), - (130000, 1024), + # (32000, 1024) ] # Get all benchmark combinations using the original function @@ -99,23 +102,27 @@ def main(): context_lens=typical_context_lens + extra_context_lengths, ) - # Log benchmark plan - logger.info(f"Starting benchmark suite with {len(combinations)} combinations") - for i, combo in enumerate(combinations, 1): - logger.info(f"Combination {i}: {combo}") - # ensure vllm server is ready prompt_client.capture_traces() # Run benchmarks for i, params in enumerate(combinations, 1): + run_timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S") + isl = params["input_len"] + osl = params["output_len"] + bsz = params["batch_size"] + num_prompts = params["num_prompts"] + result_filename = ( + result_dir + / f"vllm_online_benchmark_isl-{isl}_osl-{osl}_bsz-{bsz}_n-{num_prompts}_{run_timestamp}.json" + ) logger.info(f"\nRunning benchmark {i}/{len(combinations)}") run_benchmark( benchmark_script="/home/user/vllm/benchmarks/benchmark_serving.py", params=params, model=env_config.vllm_model, port=env_config.service_port, - result_dir=result_dir, + result_filename=result_filename, ) logger.info("Benchmark suite completed") From 5db2523cec328186309eef54dcb7c2e424e69f51 Mon Sep 17 00:00:00 2001 From: Tom Stesco Date: Mon, 9 Dec 2024 16:16:53 +0000 Subject: [PATCH 29/76] update benchmark output file naming convention --- benchmarking/online_benchmark_prompt_client.py | 2 +- benchmarking/vllm_online_benchmark.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/benchmarking/online_benchmark_prompt_client.py b/benchmarking/online_benchmark_prompt_client.py index ad81230b..db490362 100644 --- a/benchmarking/online_benchmark_prompt_client.py +++ 
b/benchmarking/online_benchmark_prompt_client.py @@ -89,7 +89,7 @@ def run_sequence_length_test( run_timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S") results_file = ( save_path - / f"{file_prefix}_isl-{input_len}_osl-{output_len}_bsz-{batch_size}_n-{num_prompts}_{run_timestamp}.json" + / f"{file_prefix}_{run_timestamp}_isl-{input_len}_osl-{output_len}_bsz-{batch_size}_n-{num_prompts}.json" ) logger.info( diff --git a/benchmarking/vllm_online_benchmark.py b/benchmarking/vllm_online_benchmark.py index c69ce064..5e90291e 100644 --- a/benchmarking/vllm_online_benchmark.py +++ b/benchmarking/vllm_online_benchmark.py @@ -114,7 +114,7 @@ def main(): num_prompts = params["num_prompts"] result_filename = ( result_dir - / f"vllm_online_benchmark_isl-{isl}_osl-{osl}_bsz-{bsz}_n-{num_prompts}_{run_timestamp}.json" + / f"vllm_online_benchmark_{run_timestamp}_isl-{isl}_osl-{osl}_bsz-{bsz}_n-{num_prompts}.json" ) logger.info(f"\nRunning benchmark {i}/{len(combinations)}") run_benchmark( From 5ab742c9cf2ae4a72749ee104a0be6a7541c622f Mon Sep 17 00:00:00 2001 From: Tom Stesco Date: Mon, 9 Dec 2024 16:30:06 +0000 Subject: [PATCH 30/76] rename benchmarking/online_benchmark_prompt_client.py to benchmarking/prompt_client_online_benchmark.py --- ...t.py => prompt_client_online_benchmark.py} | 34 +++++++++---------- benchmarking/vllm_online_benchmark.py | 2 +- 2 files changed, 18 insertions(+), 18 deletions(-) rename benchmarking/{online_benchmark_prompt_client.py => prompt_client_online_benchmark.py} (94%) diff --git a/benchmarking/online_benchmark_prompt_client.py b/benchmarking/prompt_client_online_benchmark.py similarity index 94% rename from benchmarking/online_benchmark_prompt_client.py rename to benchmarking/prompt_client_online_benchmark.py index db490362..4fe6c943 100644 --- a/benchmarking/online_benchmark_prompt_client.py +++ b/benchmarking/prompt_client_online_benchmark.py @@ -180,25 +180,25 @@ def run_sequence_length_test( if __name__ == "__main__": # Define parameter ranges typical_context_lens = [ - (128, 128), - (128, 2048), - (128, 4096), - (2048, 128), - (2048, 2048), - (1000, 1000), - (500, 2000), - (5000, 500), - (20000, 2000), + # (128, 128), + # (128, 2048), + # (128, 4096), + # (2048, 128), + # (2048, 2048), + # (1000, 1000), + # (500, 2000), + # (5000, 500), + # (20000, 2000), ] extra_context_lengths = [ - (128, 2), - (256, 2), - (512, 32), - (1000, 24), - (2000, 32), - (4000, 32), - (8100, 32), - (130000, 1024), + # (128, 2), + # (256, 2), + # (512, 32), + # (1000, 24), + # (2000, 32), + # (4000, 32), + # (8100, 32), + (32760, 1024), ] # Generate all valid combinations upfront combinations = get_test_combinations( diff --git a/benchmarking/vllm_online_benchmark.py b/benchmarking/vllm_online_benchmark.py index 5e90291e..32f43203 100644 --- a/benchmarking/vllm_online_benchmark.py +++ b/benchmarking/vllm_online_benchmark.py @@ -10,7 +10,7 @@ from typing import Dict from pathlib import Path -from benchmarking.online_benchmark_prompt_client import get_test_combinations +from benchmarking.prompt_client_online_benchmark import get_test_combinations from utils.prompt_configs import EnvironmentConfig from utils.prompt_client import PromptClient From 06420bd989c0639a0f1c9d5df6d6695a3b316fc5 Mon Sep 17 00:00:00 2001 From: Tom Stesco Date: Mon, 9 Dec 2024 19:36:02 +0000 Subject: [PATCH 31/76] increase num_prompts default, default to 128/128 online test --- .../prompt_client_online_benchmark.py | 8 ++--- benchmarking/vllm_online_benchmark.py | 35 +++++++++---------- 2 files changed, 21 
insertions(+), 22 deletions(-) diff --git a/benchmarking/prompt_client_online_benchmark.py b/benchmarking/prompt_client_online_benchmark.py index 4fe6c943..5764acdd 100644 --- a/benchmarking/prompt_client_online_benchmark.py +++ b/benchmarking/prompt_client_online_benchmark.py @@ -38,7 +38,7 @@ def get_test_combinations( else: bsz = 1 - num_prompts = max(bsz * 4, 4) + num_prompts = max(bsz * 32, 32) combinations.append( { "input_len": input_len, @@ -53,7 +53,8 @@ def get_test_combinations( for i, combo in enumerate(combinations, 1): logger.info( f"Combination {i}: input_len={combo['input_len']}, " - f"output_len={combo['output_len']}, batch_size={combo['batch_size']}" + f"output_len={combo['output_len']}, batch_size={combo['batch_size']}, " + f"num_prompts={combo['num_prompts']}" ) return combinations @@ -180,7 +181,7 @@ def run_sequence_length_test( if __name__ == "__main__": # Define parameter ranges typical_context_lens = [ - # (128, 128), + (128, 128), # (128, 2048), # (128, 4096), # (2048, 128), @@ -198,7 +199,6 @@ def run_sequence_length_test( # (2000, 32), # (4000, 32), # (8100, 32), - (32760, 1024), ] # Generate all valid combinations upfront combinations = get_test_combinations( diff --git a/benchmarking/vllm_online_benchmark.py b/benchmarking/vllm_online_benchmark.py index 32f43203..3dd38f07 100644 --- a/benchmarking/vllm_online_benchmark.py +++ b/benchmarking/vllm_online_benchmark.py @@ -35,9 +35,9 @@ def run_benchmark( "--backend", "vllm", "--model", model, "--port", str(port), - "--request-rate", "1", + # "--request-rate", "3", "--dataset-name", "random", - "--num-prompts", str(params["batch_size"]), + "--num-prompts", str(params["num_prompts"]), "--random-input-len", str(params["input_len"]), "--random-output-len", str(params["output_len"]), "--save-result", @@ -77,24 +77,23 @@ def main(): # Define benchmarking parameters typical_context_lens = [ (128, 128), - (128, 2048), - (128, 4096), - (2048, 128), - (2048, 2048), - (1000, 1000), - (500, 2000), - (5000, 500), - (20000, 2000), + # (128, 2048), + # (128, 4096), + # (2048, 128), + # (2048, 2048), + # (1000, 1000), + # (500, 2000), + # (5000, 500), + # (20000, 2000), ] extra_context_lengths = [ - (128, 2), - (256, 2), - (512, 32), - (1000, 24), - (2000, 32), - (4000, 32), - (8100, 32), - # (32000, 1024) + # (128, 2), + # (256, 2), + # (512, 32), + # (1000, 24), + # (2000, 32), + # (4000, 32), + # (8100, 32), ] # Get all benchmark combinations using the original function From b7e4cfc7ffab2403d49b62673a122f7d99c302cf Mon Sep 17 00:00:00 2001 From: Tom Stesco Date: Mon, 9 Dec 2024 21:01:14 +0000 Subject: [PATCH 32/76] use min_tokens and ignore_eos=True to force output seq len --- utils/prompt_client.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/utils/prompt_client.py b/utils/prompt_client.py index 455921d3..4eb98ddf 100644 --- a/utils/prompt_client.py +++ b/utils/prompt_client.py @@ -176,7 +176,8 @@ def call_inference( } if force_max_tokens: - json_data["stop"] = "<|reserved_special_token_249|>" + json_data["min_tokens"] = max_tokens + json_data["ignore_eos"] = True req_time = time.perf_counter() response = requests.post( From dda29a9300c6c716c5a0eca2fcf554a693a83421 Mon Sep 17 00:00:00 2001 From: Tom Stesco Date: Mon, 9 Dec 2024 21:42:25 +0000 Subject: [PATCH 33/76] adding min_tokens to locust requests --- locust/locustfile.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/locust/locustfile.py b/locust/locustfile.py index 7942394f..19dd59ee 100644 --- a/locust/locustfile.py +++ 
b/locust/locustfile.py @@ -23,6 +23,7 @@ # Global variable to store data iterator data_iter = None + def get_authorization(): authorization = os.getenv("AUTHORIZATION", None) if authorization is None: @@ -50,12 +51,13 @@ class ServeUser(FastHttpUser): connection_timeout = CONNECTION_TIMEOUT headers = {"Authorization": f"Bearer {get_authorization()}"} - def post_request(self, prompt: str, max_tokens: int): + def post_request(self, prompt: str, max_tokens: int, min_tokens: int): """Helper method to send a POST request to the API with the given prompt and token limit.""" json_data = { "prompt": prompt, **DEFAULT_PARAMS, # Merge default parameters "max_tokens": max_tokens, + "min_tokens": min_tokens, } response = self.client.post(API_ENDPOINT, json=json_data, headers=self.headers) return response @@ -64,4 +66,4 @@ def post_request(self, prompt: str, max_tokens: int): def dataset_test(self): """Test using generated prompts from a data iterator.""" prompt = next(data_iter) - self.post_request(prompt, max_tokens=128) + self.post_request(prompt, max_tokens=128, min_tokens=128) From f8b3033fa22f2a81856cf5b2e90196c30dfe55e2 Mon Sep 17 00:00:00 2001 From: Tom Stesco Date: Tue, 10 Dec 2024 20:20:32 +0000 Subject: [PATCH 34/76] add --ignore-eos to vllm_online_benchmark.py to force the output seq len to be as configured --- benchmarking/vllm_online_benchmark.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/benchmarking/vllm_online_benchmark.py b/benchmarking/vllm_online_benchmark.py index 3dd38f07..1385f108 100644 --- a/benchmarking/vllm_online_benchmark.py +++ b/benchmarking/vllm_online_benchmark.py @@ -40,7 +40,8 @@ def run_benchmark( "--num-prompts", str(params["num_prompts"]), "--random-input-len", str(params["input_len"]), "--random-output-len", str(params["output_len"]), - "--save-result", + "--ignore-eos", # Ignore EOS tokens to force max output length as set + "--save-result", "--result-filename", str(result_filename) ] # fmt: on From 12c38fcb1586344069d65fe1c5d02097ea0077b8 Mon Sep 17 00:00:00 2001 From: Tom Stesco Date: Tue, 10 Dec 2024 20:23:14 +0000 Subject: [PATCH 35/76] add context_lens (isl, osl) pairs to capture_traces() to capture correct traces for performance testing --- .../prompt_client_online_benchmark.py | 13 +++---- benchmarking/vllm_online_benchmark.py | 14 +++---- utils/prompt_client.py | 37 ++++++++++++------- 3 files changed, 34 insertions(+), 30 deletions(-) diff --git a/benchmarking/prompt_client_online_benchmark.py b/benchmarking/prompt_client_online_benchmark.py index 5764acdd..a7da95ae 100644 --- a/benchmarking/prompt_client_online_benchmark.py +++ b/benchmarking/prompt_client_online_benchmark.py @@ -78,7 +78,6 @@ def run_sequence_length_test( # Initialize configurations env_config = EnvironmentConfig(vllm_model=model) prompt_client = PromptClient(env_config) - prompt_client.capture_traces() # Test all combinations total_combinations = len(combinations) @@ -130,6 +129,8 @@ def run_sequence_length_test( batch_processor = BatchProcessor(prompt_client, batch_config) tokenizer = AutoTokenizer.from_pretrained(model) + # pre-capture traces so benchmark does not include 1st run trace capture time + prompt_client.capture_traces(context_lens=[(input_len, output_len)]) # Process batches try: responses = batch_processor.process_batch( @@ -179,8 +180,8 @@ def run_sequence_length_test( if __name__ == "__main__": - # Define parameter ranges - typical_context_lens = [ + # Define benchmarking context length (isl, osl) pairs + context_lens = [ (128, 128), # (128, 
2048), # (128, 4096), @@ -190,8 +191,6 @@ def run_sequence_length_test( # (500, 2000), # (5000, 500), # (20000, 2000), - ] - extra_context_lengths = [ # (128, 2), # (256, 2), # (512, 32), @@ -201,9 +200,7 @@ def run_sequence_length_test( # (8100, 32), ] # Generate all valid combinations upfront - combinations = get_test_combinations( - context_lens=typical_context_lens + extra_context_lengths, - ) + combinations = get_test_combinations(context_lens=context_lens) # Run tests results = run_sequence_length_test( diff --git a/benchmarking/vllm_online_benchmark.py b/benchmarking/vllm_online_benchmark.py index 1385f108..159f8da0 100644 --- a/benchmarking/vllm_online_benchmark.py +++ b/benchmarking/vllm_online_benchmark.py @@ -75,8 +75,8 @@ def main(): # note: there isnt a better way to pass an api key to the vllm benchmarking script os.environ["OPENAI_API_KEY"] = prompt_client._get_authorization() - # Define benchmarking parameters - typical_context_lens = [ + # Define benchmarking context length (isl, osl) pairs + context_lens = [ (128, 128), # (128, 2048), # (128, 4096), @@ -86,8 +86,6 @@ def main(): # (500, 2000), # (5000, 500), # (20000, 2000), - ] - extra_context_lengths = [ # (128, 2), # (256, 2), # (512, 32), @@ -98,12 +96,10 @@ def main(): ] # Get all benchmark combinations using the original function - combinations = get_test_combinations( - context_lens=typical_context_lens + extra_context_lengths, - ) + combinations = get_test_combinations(context_lens=context_lens) - # ensure vllm server is ready - prompt_client.capture_traces() + # pre-capture traces required for benchmarking + prompt_client.capture_traces(context_lens=context_lens) # Run benchmarks for i, params in enumerate(combinations, 1): diff --git a/utils/prompt_client.py b/utils/prompt_client.py index 4eb98ddf..27e01ebd 100644 --- a/utils/prompt_client.py +++ b/utils/prompt_client.py @@ -5,7 +5,7 @@ import logging import json import time -from typing import List +from typing import List, Tuple import requests import jwt @@ -97,27 +97,37 @@ def wait_for_healthy(self, timeout: int = 300, interval: int = 10) -> bool: def capture_traces( self, - input_sizes: List[int] = None, + context_lens: List[Tuple[int, int]] = None, prompts_per_size: int = 1, - output_seq_len: int = 1, ) -> None: logger.info("Capturing input sizes ...") # Default input sizes based on get_padded_prefill_len() - if input_sizes is None: - input_sizes = [32, 64, 128, 256, 512, 1024, 2048, 3072, 4096] + if context_lens is None: + # generate 4 osl tokens by default for each isl + context_lens = [ + (32, 4), + (64, 4), + (128, 4), + (256, 4), + (512, 4), + (1024, 4), + (2048, 4), + (3072, 4), + (4096, 4), + ] # Check service health before starting if not self.wait_for_healthy(): raise RuntimeError("vLLM did not start correctly!") - for size in input_sizes: - logger.info(f"Capture input size: {size}") + for isl, osl in context_lens: + logger.info(f"Capture trace: isl={isl}, osl={osl}") # Create prompt config for current size prompt_config = PromptConfig( - input_seq_len=size, - max_prompt_length=size, + input_seq_len=isl, + max_prompt_length=isl, num_prompts=prompts_per_size, distribution="fixed", dataset="random", @@ -133,20 +143,21 @@ def capture_traces( # Process each prompt for i, (prompt, prompt_len) in enumerate(zip(prompts, prompt_lengths)): try: - logger.info(f"Starting capture for input_seq_len: {prompt_len}") + logger.info( + f"Starting capture for input_seq_len: {prompt_len}, output_seq_len: {osl}" + ) response_data = self.call_inference( prompt=prompt, 
response_idx=i, prompt_len=prompt_len, - max_tokens=output_seq_len, + max_tokens=osl, stream=True, vll_model=self.env_config.vllm_model, tokenizer=None, force_max_tokens=True, ) logger.info( - f"Input size: {size}, " - f"input_seq_len: {prompt_len}, " + f"tokens generated: {response_data['output_seq_len']}, " f"TTFT: {response_data['ttft']:.3f}s" ) except Exception as e: From 1cabdc98bfd8836b7aa869a10474ce4929331323 Mon Sep 17 00:00:00 2001 From: Tom Stesco Date: Tue, 10 Dec 2024 20:33:26 +0000 Subject: [PATCH 36/76] add trace pre-capture to prompt_client_cli.py with option to disable --- benchmarking/README.md | 22 ++++++++++++++++++++++ utils/prompt_client_cli.py | 12 ++++++++++++ 2 files changed, 34 insertions(+) diff --git a/benchmarking/README.md b/benchmarking/README.md index 641436d4..d3360c85 100644 --- a/benchmarking/README.md +++ b/benchmarking/README.md @@ -38,6 +38,20 @@ python examples/offline_inference_tt.py --measure_perf --max_seqs_in_batch 32 -- ### Online Benchmarking +#### single user + +```bash +python utils/prompt_client_cli.py \ + --num_prompts 32 \ + --batch_size 1 \ + --tokenizer_model meta-llama/Llama-3.1-70B-Instruct \ + --max_prompt_length 128 \ + --input_seq_len 128 \ + --output_seq_len 128 \ + --template chat_template \ + --dataset random +``` + #### using vllm/benchmarking/benchmark_serving.py Within the Docker container, use the benchmark_serving.patch file: ``` @@ -89,3 +103,11 @@ Median ITL (ms): 7.83 P99 ITL (ms): 8.05 ================================================== ``` + +#### using tt-inference-server/benchmarking/prompt_client_online_benchmark.py + +```bash +export PYTHONPATH=$PYTHONPATH:$PWD +python benchmarking/prompt_client_online_benchmark.py +``` + diff --git a/utils/prompt_client_cli.py b/utils/prompt_client_cli.py index 3d74f8f5..943b5844 100644 --- a/utils/prompt_client_cli.py +++ b/utils/prompt_client_cli.py @@ -114,6 +114,12 @@ def add_client_args(parser): default=False, help="Print generated prompts.", ) + parser.add_argument( + "--skip_trace_precapture", + action="store_true", + default=False, + help="Print generated prompts.", + ) return parser @@ -159,6 +165,12 @@ def main(): # Generate prompts prompts, input_seq_lengths = generate_prompts(prompt_config) + if not args.skip_trace_precapture: + # pre-capture traces so benchmark does not include 1st run trace capture time + prompt_client.capture_traces( + context_lens=[(args.input_seq_len, args.output_seq_len)] + ) + # Process batches logger.info(f"Starting batch processing with batch_size={batch_config.batch_size}") responses = batch_processor.process_batch( From 68f08d05c4fc74bd85c990f2388f1afc652eed77 Mon Sep 17 00:00:00 2001 From: Tom Stesco Date: Tue, 10 Dec 2024 20:36:13 +0000 Subject: [PATCH 37/76] better comment and logs for trace capture --- utils/prompt_client.py | 2 +- utils/prompt_client_cli.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/utils/prompt_client.py b/utils/prompt_client.py index 27e01ebd..6b86330e 100644 --- a/utils/prompt_client.py +++ b/utils/prompt_client.py @@ -144,7 +144,7 @@ def capture_traces( for i, (prompt, prompt_len) in enumerate(zip(prompts, prompt_lengths)): try: logger.info( - f"Starting capture for input_seq_len: {prompt_len}, output_seq_len: {osl}" + f"Starting trace capture for: input_seq_len:={prompt_len}, output_seq_len:={osl}" ) response_data = self.call_inference( prompt=prompt, diff --git a/utils/prompt_client_cli.py b/utils/prompt_client_cli.py index 943b5844..31d97e6d 100644 --- a/utils/prompt_client_cli.py +++ 
b/utils/prompt_client_cli.py @@ -166,7 +166,7 @@ def main(): prompts, input_seq_lengths = generate_prompts(prompt_config) if not args.skip_trace_precapture: - # pre-capture traces so benchmark does not include 1st run trace capture time + # pre-capture traces to not include 1st run trace capture time prompt_client.capture_traces( context_lens=[(args.input_seq_len, args.output_seq_len)] ) From 962c5077993678b3a2ed5362e19671dd9d8cfec9 Mon Sep 17 00:00:00 2001 From: Tom Stesco Date: Thu, 12 Dec 2024 05:05:53 +0000 Subject: [PATCH 38/76] use TPOT and TPS in benchmarking/prompt_client_online_benchmark.py, add support in client for ITL and TPOT --- .../prompt_client_online_benchmark.py | 25 ++++++++++++------- utils/batch_processor.py | 5 ++-- utils/prompt_client.py | 25 ++++++++++++++----- 3 files changed, 37 insertions(+), 18 deletions(-) diff --git a/benchmarking/prompt_client_online_benchmark.py b/benchmarking/prompt_client_online_benchmark.py index a7da95ae..f4d9ca53 100644 --- a/benchmarking/prompt_client_online_benchmark.py +++ b/benchmarking/prompt_client_online_benchmark.py @@ -140,15 +140,22 @@ def run_sequence_length_test( ) # Calculate statistics + mean_tpot = np.mean([r["time_per_output_token"] for r in responses]) + mean_tpot = max(mean_tpot, 1e-6) # Avoid division by zero + mean_tps = 1.0 / mean_tpot + std_tpot = np.std([r["time_per_output_token"] for r in responses]) + std_tpot = max(std_tpot, 1e-6) # Avoid division by zero + std_tps = mean_tps - 1.0 / (mean_tpot + std_tpot) stats = { "input_seq_len": input_len, "output_seq_len": output_len, "batch_size": batch_size, - "mean_decode_tps": np.mean([r["decode_tps"] for r in responses]), - "mean_total_tps": np.mean([r["total_tps"] for r in responses]), + "total_output_tokens": sum([r["output_seq_len"] for r in responses]), + "mean_tpot": mean_tpot, + "mean_tps": mean_tps, "mean_ttft": np.mean([r["ttft"] for r in responses]), - "std_decode_tps": np.std([r["decode_tps"] for r in responses]), - "std_total_tps": np.std([r["total_tps"] for r in responses]), + "std_tpot": std_tpot, + "std_tps": std_tps, "std_ttft": np.std([r["ttft"] for r in responses]), "num_prompts": num_prompts, "num_iterations": num_iterations, @@ -161,11 +168,11 @@ def run_sequence_length_test( # Log results logger.info( f"Results for combination {idx}/{total_combinations}:\n" - f"Mean Decode TPS: {stats['mean_decode_tps']:.2f} ± " - f"{stats['std_decode_tps']:.2f}\n" - f"Mean Total TPS: {stats['mean_total_tps']:.2f} ± " - f"{stats['std_total_tps']:.2f}\n" - f"Mean TTFT: {stats['mean_ttft']:.2f} ± {stats['std_ttft']:.2f}" + f"Mean TPOT: {stats['mean_tpot']:.4f} ± " + f"{stats['std_tpot']:.4f}\n" + f"Mean user TPS: {stats['mean_tps']:.4f} ± " + f"{stats['std_tps']:.4f}\n" + f"Mean TTFT: {stats['mean_ttft']:.4f} ± {stats['std_ttft']:.4f}" ) # Save results after each combination diff --git a/utils/batch_processor.py b/utils/batch_processor.py index 35ab6652..2c7a68fc 100644 --- a/utils/batch_processor.py +++ b/utils/batch_processor.py @@ -266,9 +266,8 @@ def _log_progress( ): logger.info( f"Processed {response_counter}/{total_prompts} responses. 
" - f"decode_tps: {response_data['decode_tps']:.2f}, " - f"total_tps: {response_data['total_tps']:.2f}, " - f"ttft: {response_data['ttft']:.2f}, " + f"TPOT: {response_data['time_per_output_token']:.4f}, " + f"TTFT: {response_data['ttft']:.4f}, " f"input_seq_len: {response_data['input_seq_len']}, " f"output_seq_len: {response_data['output_seq_len']}" ) diff --git a/utils/prompt_client.py b/utils/prompt_client.py index 6b86330e..1322f130 100644 --- a/utils/prompt_client.py +++ b/utils/prompt_client.py @@ -158,7 +158,8 @@ def capture_traces( ) logger.info( f"tokens generated: {response_data['output_seq_len']}, " - f"TTFT: {response_data['ttft']:.3f}s" + f"TTFT: {response_data['ttft']:.3f}s, " + f"TPOT: {response_data['time_per_output_token']:.3f}s" ) except Exception as e: logger.error(f"Error processing prompt: {e}") @@ -218,6 +219,7 @@ def _process_response( first_token_time = 0 ttft = 0 usage_dict = {} + token_timestamps = [] if stream: assert ( @@ -225,8 +227,9 @@ def _process_response( ), "Response is not chunked" for line in response.iter_lines(decode_unicode=True): if line and line.startswith("data: "): + current_time = time.perf_counter() if num_completion_tokens == 0: - first_token_time = time.perf_counter() + first_token_time = current_time ttft = first_token_time - req_time data_str = line[len("data: ") :].strip() @@ -237,6 +240,7 @@ def _process_response( data = json.loads(data_str) if data["choices"]: full_text += data["choices"][0].get("text", "") + token_timestamps.append(current_time) num_completion_tokens += 1 else: usage_dict = data.get("usage", {}) @@ -249,8 +253,17 @@ def _process_response( usage_dict = data["usage"] first_token_time = req_time - decode_time = max(time.perf_counter() - first_token_time, 0.0001) - total_time = max(time.perf_counter() - req_time, 0.0001) + # Calculate inter-token latencies + inter_token_latencies = [] + if len(token_timestamps) > 1: + inter_token_latencies = [ + token_timestamps[i] - token_timestamps[i - 1] + for i in range(1, len(token_timestamps)) + ] + + gen_time = max(time.perf_counter() - first_token_time, 0.0001) + # discount the TTFT and 1st token time from the generation time + time_per_output_token = gen_time / max(num_completion_tokens - 1, 1) # verify the number of input tokens isl_diff = usage_dict["prompt_tokens"] - prompt_len @@ -281,7 +294,7 @@ def _process_response( "response": full_text, "input_seq_len": prompt_len, "output_seq_len": num_completion_tokens, - "decode_tps": (max(num_completion_tokens, 1)) / decode_time, - "total_tps": (max(num_completion_tokens, 1)) / total_time, + "inter_token_latencies": inter_token_latencies, + "time_per_output_token": time_per_output_token, "ttft": ttft, } From 62bf42764d981edf4f8d873d8022293f852c77c8 Mon Sep 17 00:00:00 2001 From: Tom Stesco Date: Thu, 12 Dec 2024 05:08:48 +0000 Subject: [PATCH 39/76] update utils/prompt_client_cli.py and docs --- utils/README.md | 100 ++++++++++++++++++++++++++++--------- utils/prompt_client_cli.py | 24 ++++++--- 2 files changed, 94 insertions(+), 30 deletions(-) diff --git a/utils/README.md b/utils/README.md index 389e7cea..984fda73 100644 --- a/utils/README.md +++ b/utils/README.md @@ -23,20 +23,73 @@ The prompt client CLI tool allows you to send prompts to a vLLM API server with - `CACHE_ROOT`: Directory for saving response files (default: current directory) - `VLLM_MODEL`: Model name (default: meta-llama/Llama-3.1-70B-Instruct) -#### Key Arguments - -- `--num_prompts`: Number of prompts to generate -- `--batch_size`: Number of concurrent requests -- 
`--max_prompt_length`: Maximum length for generated prompts -- `--output_seq_len`: Maximum length for completions -- `--num_full_iterations`: Number of times to repeat the full prompt set -- `--vary-batch-size`: Randomize batch sizes using normal distribution -- `--input_seq_len`: Fixed length for input sequences (-1 for variable) -- `--inter_batch_delay`: Delay between batches in seconds -- `--no-stream`: Disable streaming responses -- `--dataset`: Source dataset (random, alpaca_eval) -- `--distribution`: Prompt length distribution (fixed, uniform, normal) -- `--template`: Path to Jinja2 template or "chat_template" for model tokenizer default +#### Command Line Arguments + +##### Core Parameters + +- `--num_prompts` (default: 1) + Number of unique prompts to generate for testing. + +- `--batch_size` (default: 32) + Number of concurrent requests to send to the API server. Controls parallelization level. + +- `--num_full_iterations` (default: 1) + Number of complete iterations over the entire prompt set. Useful for extended testing cycles. + +##### Model Configuration + +- `--vllm_model` (default: "meta-llama/Llama-3.1-70B-Instruct") + Model identifier for the vLLM API server. Can be overridden by VLLM_MODEL environment variable. + +- `--tokenizer_model` (default: None) + Specific tokenizer model to use for vocabulary, truncation, and templating operations. + +##### Sequence Length Controls + +- `--input_seq_len` (default: -1) + Length parameter for input sequences when using random prompts. -1 allows variable lengths. + +- `--output_seq_len` (default: 2048) + Forces all completions to a fixed maximum length for consistent testing. + +- `--max_prompt_length` (default: -1) + Maximum allowed length for generated prompts. -1 indicates no length restriction. + +##### Batch Processing Options + +- `--vary_batch_size` (default: False) + When enabled, randomizes the batch size for each prompt batch using normal distribution. + +- `--inter_batch_delay` (default: 0) + Seconds to wait between processing each batch. Useful for rate limiting. + +- `--no-stream` (default: False) + Disables streaming responses. By default, streaming is enabled. + +##### Prompt Generation Settings + +- `--distribution` (default: "fixed") + Method for determining random prompt lengths: + - "fixed": Constant length + - "uniform": Uniform distribution + - "normal": Normal distribution + +- `--dataset` (default: "random") + Source dataset for prompt generation. Use "random" for synthetic prompts. + +- `--template` (default: None) + Jinja2 template for formatting prompts. Can be a file path or template string. + +##### Output Controls + +- `--save_path` (default: None) + File path to save generated prompts in JSONL format. + +- `--print_prompts` (default: False) + Enable printing of generated prompts to stdout. + +- `--skip_trace_precapture` (default: False) + Skips trace precapture phase, use to speed up execution if trace captures have already completed. 
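The options above can be combined freely. Below is a minimal sketch of a rate-limited batch run using the flags documented in this section; all values are illustrative, and it assumes the vLLM server is already up with traces captured (so the precapture phase is skipped):

```bash
# illustrative only: adjust lengths, batch size, and delays for your deployment
python prompt_client_cli.py \
  --num_prompts 64 \
  --batch_size 16 \
  --vary_batch_size \
  --inter_batch_delay 2 \
  --input_seq_len 256 \
  --output_seq_len 512 \
  --dataset random \
  --distribution normal \
  --skip_trace_precapture
```
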
#### Example Usage @@ -54,7 +107,7 @@ python prompt_client_cli.py \ --num_prompts 10 \ --batch_size 4 \ --tokenizer_model meta-llama/Llama-3.1-70B-Instruct \ - --max_prompt_length 512 \ + --input_seq_len 512 \ --output_seq_len 2048 # send prompts from alpaca_eval using chat template from tokenizer @@ -103,13 +156,14 @@ The client saves responses in JSON format with the following structure: ```json { - "response_idx": 0, - "prompt": "example prompt", - "response": "model response", - "prompt_length": 128, - "num_completion_tokens": 256, - "tps": 45.6, - "ttft": 0.15 + "response_idx": number, // Response index in batch + "prompt": string, // Input prompt + "response": string, // Generated completion text + "input_seq_len": number, // Prompt length in tokens + "output_seq_len": number, // Completion length in tokens + "inter_token_latencies": number[], // Per-token generation times in seconds + "time_per_output_token": number, // Average seconds per token + "ttft": number // Time to first token in seconds } ``` @@ -139,7 +193,7 @@ args = SimpleNamespace( input_seq_len=-1, num_prompts=5, distribution="normal", - template="templates/chat.j2", + template="prompt_templates/llama_instruct_example.jinja", save_path="generated_prompts.jsonl", lm_eval_task=None ) diff --git a/utils/prompt_client_cli.py b/utils/prompt_client_cli.py index 31d97e6d..7ab7ed6f 100644 --- a/utils/prompt_client_cli.py +++ b/utils/prompt_client_cli.py @@ -74,7 +74,7 @@ def add_client_args(parser): parser.add_argument( "--max_prompt_length", type=int, - required=True, + default=-1, help="Maximum length of generated prompts.", ) parser.add_argument( @@ -118,7 +118,7 @@ def add_client_args(parser): "--skip_trace_precapture", action="store_true", default=False, - help="Print generated prompts.", + help="Skips trace precapture phase, use to speed up execution if trace captures have already completed.", ) return parser @@ -131,6 +131,16 @@ def main(): parser = add_client_args(parser) args = parser.parse_args() + assert ( + args.max_prompt_length != -1 or args.input_seq_len != -1 + ), "Either --max_prompt_length or --input_seq_len must be provided." 
+ if args.max_prompt_length == -1: + assert args.input_seq_len > 0 + args.max_prompt_length = args.input_seq_len + elif args.input_seq_len == -1: + assert args.max_prompt_length > 0 + args.input_seq_len = args.max_prompt_length + # Create configs from arguments prompt_config = PromptConfig( input_seq_len=args.input_seq_len, @@ -181,12 +191,12 @@ def main(): # Calculate and log summary statistics if responses: - mean_decode_tps = np.mean([r["decode_tps"] for r in responses]) - mean_total_tps = np.mean([r["total_tps"] for r in responses]) + mean_tpot = np.mean([r["time_per_output_token"] for r in responses]) mean_ttft = np.mean([r["ttft"] for r in responses]) - logger.info(f"Mean Decode TPS: {mean_decode_tps:.2f}") - logger.info(f"Mean Total TPS: {mean_total_tps:.2f}") - logger.info(f"Mean TTFT: {mean_ttft:.2f}") + logger.info(f"Mean TTFT: {mean_ttft:.4f}") + logger.info(f"Mean TPOT: {mean_tpot:.4f}") + mean_tps = 1.0 / max(mean_tpot, 1e-6) + logger.info(f"Mean User TPS: {mean_tps:.4f}") if __name__ == "__main__": From d9e163cea98a95e1ebc04fbeabb5e81abe1382e2 Mon Sep 17 00:00:00 2001 From: Tom Stesco Date: Thu, 12 Dec 2024 05:22:25 +0000 Subject: [PATCH 40/76] remove WIP utils/startup_utils.py from this branch --- utils/startup_utils.py | 76 ------------------------------------------ 1 file changed, 76 deletions(-) delete mode 100644 utils/startup_utils.py diff --git a/utils/startup_utils.py b/utils/startup_utils.py deleted file mode 100644 index 05cb616f..00000000 --- a/utils/startup_utils.py +++ /dev/null @@ -1,76 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# -# SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC - -import os -import logging -import subprocess -import psutil -import signal - - -logging.basicConfig( - level=logging.INFO, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s" -) -logger = logging.getLogger(__name__) -logger.setLevel(logging.INFO) - - -class InferenceServerContext: - def __init__(self, startup_script_path): - self.startup_script_path = startup_script_path - - def __enter__(self): - self.process = subprocess.Popen( - ["python", self.startup_script_path], - stdout=subprocess.PIPE, - stderr=subprocess.PIPE, - text=True, - preexec_fn=os.setsid, - ) - return self - - def __exit__(self, exc_type, exc_val, exc_tb): - if not self.process: - return - - # Log initial state - try: - parent = psutil.Process(self.process.pid) - children = parent.children(recursive=True) - logger.info(f"Found {len(children)} child processes before termination") - for child in children: - logger.info(f"Child PID: {child.pid}, Name: {child.name()}") - except psutil.NoSuchProcess: - logger.warning("Main process already terminated") - return - - # Send SIGTERM to process group - try: - os.killpg(self.process.pid, signal.SIGTERM) - logger.info(f"Sent SIGTERM to process group {self.process.pid}") - except ProcessLookupError: - logger.warning("Process group already terminated") - return - - # Wait for graceful shutdown - try: - self.process.wait(timeout=5) - logger.info("Process terminated gracefully") - except subprocess.TimeoutExpired: - logger.warning("Timeout expired, force killing process group") - try: - os.killpg(self.process.pid, signal.SIGKILL) - except ProcessLookupError: - pass - - # Final verification - try: - parent = psutil.Process(self.process.pid) - remaining = parent.children(recursive=True) - if remaining: - logger.error(f"{len(remaining)} child processes still exist") - for proc in remaining: - logger.error(f"Remaining PID: {proc.pid}, Name: {proc.name()}") - except 
psutil.NoSuchProcess: - logger.info("All inference server processes terminated") From cd29085e84416b8182893f3b52227ea6a3f36242 Mon Sep 17 00:00:00 2001 From: Tom Stesco Date: Tue, 31 Dec 2024 19:59:55 +0000 Subject: [PATCH 41/76] adding doc string to BatchProcessor --- utils/batch_processor.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/utils/batch_processor.py b/utils/batch_processor.py index 2c7a68fc..ca81baf5 100644 --- a/utils/batch_processor.py +++ b/utils/batch_processor.py @@ -25,6 +25,17 @@ class BatchProcessor: + """ + BatchProcessor runs multiple concurrent requests to the backend inference + server (vLLM in this case). This adds some functionality for sending requests + with a specific max number of requests allowed that is independent with the + backend batch_size. Mostly this is for testing continous batching and seq lens, + but can be used as an alternative method for benchmarking as in + benchmarking/prompt_client_online_benchmark.py measuring TTFT as experienced + by users by not exceeding the backend concurrent user capacity and having + requests queued on the backend server before processing starts by the model. + """ + def __init__(self, prompt_client: PromptClient, batch_config: BatchConfig): self.prompt_client = prompt_client self.batch_config = batch_config From 376403d28db3b9b2e6c4002a9cbba1c3de01af4c Mon Sep 17 00:00:00 2001 From: Tom Stesco Date: Tue, 31 Dec 2024 20:31:04 +0000 Subject: [PATCH 42/76] add output_path arg to batch_processor.py::BatchProcessor to optionally provide incremental output saveing for debugging, default to not saving output for benchmarking --- utils/batch_processor.py | 49 +++++++++++++++++++--------------------- 1 file changed, 23 insertions(+), 26 deletions(-) diff --git a/utils/batch_processor.py b/utils/batch_processor.py index ca81baf5..cbbfdf3f 100644 --- a/utils/batch_processor.py +++ b/utils/batch_processor.py @@ -6,9 +6,8 @@ import logging import json import time -from datetime import datetime from pathlib import Path -from typing import List +from typing import List, Union from concurrent.futures import ThreadPoolExecutor, as_completed import numpy as np @@ -73,26 +72,22 @@ def process_batch( prompts: List[str], input_seq_lengths: List[int], tokenizer: AutoTokenizer, + output_path: Union[Path, str] = None, ) -> List[dict]: - timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S") - json_fpath = ( - Path(self.prompt_client.env_config.cache_root) - / f"alpaca_eval_responses_{timestamp}.json" - ) - total_prompts = len(prompts) * self.batch_config.num_full_iterations response_counter = 0 all_responses = [] - with open(json_fpath, "a") as f: - f.write("[\n") + if output_path: + with open(output_path, "a") as f: + f.write("[\n") if self.batch_config.batch_size == 1: all_responses = self._process_single_thread( prompts, input_seq_lengths, tokenizer, - json_fpath, + output_path, total_prompts, response_counter, ) @@ -101,13 +96,14 @@ def process_batch( prompts, input_seq_lengths, tokenizer, - json_fpath, + output_path, total_prompts, response_counter, ) - with open(json_fpath, "a") as f: - f.write("\n]") + if output_path: + with open(output_path, "a") as f: + f.write("\n]") return all_responses @@ -116,7 +112,7 @@ def _process_single_thread( prompts: List[str], input_seq_lengths: List[int], tokenizer: AutoTokenizer, - json_fpath: Path, + output_path: Union[Path, str], total_prompts: int, response_counter: int, ) -> List[dict]: @@ -139,7 +135,7 @@ def _process_single_thread( ) self._save_response( - response_data, 
all_responses, json_fpath, response_counter + response_data, all_responses, output_path, response_counter ) response_counter += 1 self._log_progress(response_counter, total_prompts, response_data) @@ -151,7 +147,7 @@ def _process_multi_thread( prompts: List[str], input_seq_lengths: List[int], tokenizer: AutoTokenizer, - json_fpath: Path, + output_path: Union[Path, str], total_prompts: int, response_counter: int, ) -> List[dict]: @@ -172,7 +168,7 @@ def _process_multi_thread( bsz, tokenizer, all_responses, - json_fpath, + output_path, total_prompts, response_counter, ) @@ -202,7 +198,7 @@ def _process_multi_thread( try: response_data = future.result() self._save_response( - response_data, all_responses, json_fpath, response_counter + response_data, all_responses, output_path, response_counter ) response_counter += 1 self._log_progress( @@ -221,7 +217,7 @@ def _process_batch_chunk( batch_size: int, tokenizer: AutoTokenizer, all_responses: List[dict], - json_fpath: Path, + output_path: Union[Path, str], total_prompts: int, response_counter: int, ): @@ -251,7 +247,7 @@ def _process_batch_chunk( try: response_data = future.result() self._save_response( - response_data, all_responses, json_fpath, response_counter + response_data, all_responses, output_path, response_counter ) response_counter += 1 self._log_progress(response_counter, total_prompts, response_data) @@ -262,15 +258,16 @@ def _save_response( self, response_data: dict, all_responses: List[dict], - json_fpath: Path, + output_path: Union[Path, str], response_counter: int, ): with self.responses_lock: all_responses.append(response_data) - with open(json_fpath, "a") as f: - if response_counter > 0: - f.write(",") - json.dump(response_data, f, indent=4) + if output_path: + with open(output_path, "a") as f: + if response_counter > 0: + f.write(",") + json.dump(response_data, f, indent=4) def _log_progress( self, response_counter: int, total_prompts: int, response_data: dict From daf062552976c3d218923c6b4e095c65a857b240 Mon Sep 17 00:00:00 2001 From: Tom Stesco Date: Thu, 12 Dec 2024 15:22:38 +0000 Subject: [PATCH 43/76] adding tests/test_vllm_seq_lens.py to test vllm sequence lengths and batching capacity --- tests/test_vllm_seq_lens.py | 66 +++++++++++++++++++++++++++++++++++++ 1 file changed, 66 insertions(+) create mode 100644 tests/test_vllm_seq_lens.py diff --git a/tests/test_vllm_seq_lens.py b/tests/test_vllm_seq_lens.py new file mode 100644 index 00000000..def803c8 --- /dev/null +++ b/tests/test_vllm_seq_lens.py @@ -0,0 +1,66 @@ +# SPDX-License-Identifier: Apache-2.0 +# +# SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC + +import logging +from typing import Dict + + +import pytest + +from benchmarking.prompt_client_online_benchmark import run_sequence_length_test + +logging.basicConfig( + level=logging.INFO, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s" +) +logger = logging.getLogger(__name__) +logger.setLevel(logging.INFO) + +# Test params +# see: https://github.com/tenstorrent/tt-metal/tree/main/models/demos/t3000/llama3_70b#details + +TEST_paramS = [ + {"input_len": 128, "output_len": 128, "batch_size": 32, "num_prompts": 32}, + {"input_len": 2048, "output_len": 2048, "batch_size": 32, "num_prompts": 32}, + {"input_len": 4000, "output_len": 96, "batch_size": 32, "num_prompts": 32}, + {"input_len": 4096, "output_len": 256, "batch_size": 32, "num_prompts": 32}, + {"input_len": 8000, "output_len": 192, "batch_size": 16, "num_prompts": 16}, + {"input_len": 8192, "output_len": 256, "batch_size": 16, "num_prompts": 
16}, + {"input_len": 32768, "output_len": 32, "batch_size": 1, "num_prompts": 1}, + {"input_len": 32768, "output_len": 98304, "batch_size": 1, "num_prompts": 1}, +] + + +@pytest.mark.parametrize("param", TEST_paramS) +def test_sequence_length(param: Dict[str, int]): + # Run the sequence length test + results = run_sequence_length_test( + combinations=[param], # Pass as single-item list for compatibility + save_dir="vllm_test_seq_lens", + file_prefix="vllm_test_seq_lens", + model="meta-llama/Llama-3.1-70B-Instruct", + ) + + # Add assertions to verify the results + assert results is not None, "Test results should not be None" + + # Verify the results contain expected data + logger.info(f"Results: {results}") + assert isinstance(results, list) + stats = results[0] + assert "input_seq_len" in stats + assert "output_seq_len" in stats + + # Verify the specific param parameters were used + assert stats["input_seq_len"] == param["input_len"] + assert stats["output_seq_len"] == param["output_len"] + assert stats["batch_size"] == param["batch_size"] + assert stats["num_prompts"] == param["num_prompts"] + + # Add specific assertions for the test parameters + assert stats["total_output_tokens"] > 0 + assert stats["mean_tpot"] > 0 + + +if __name__ == "__main__": + pytest.main([__file__, "-v", "--log-cli-level=INFO"]) From f3e34d10b7b624b83997539b64619fdfd7cbbfe2 Mon Sep 17 00:00:00 2001 From: Tom Stesco Date: Fri, 13 Dec 2024 02:54:22 +0000 Subject: [PATCH 44/76] fix TEST_PARAMS --- tests/test_vllm_seq_lens.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/tests/test_vllm_seq_lens.py b/tests/test_vllm_seq_lens.py index def803c8..6d35043d 100644 --- a/tests/test_vllm_seq_lens.py +++ b/tests/test_vllm_seq_lens.py @@ -19,7 +19,8 @@ # Test params # see: https://github.com/tenstorrent/tt-metal/tree/main/models/demos/t3000/llama3_70b#details -TEST_paramS = [ +TEST_PARAMS = [ + # test sequence lengths {"input_len": 128, "output_len": 128, "batch_size": 32, "num_prompts": 32}, {"input_len": 2048, "output_len": 2048, "batch_size": 32, "num_prompts": 32}, {"input_len": 4000, "output_len": 96, "batch_size": 32, "num_prompts": 32}, @@ -28,10 +29,12 @@ {"input_len": 8192, "output_len": 256, "batch_size": 16, "num_prompts": 16}, {"input_len": 32768, "output_len": 32, "batch_size": 1, "num_prompts": 1}, {"input_len": 32768, "output_len": 98304, "batch_size": 1, "num_prompts": 1}, + # test continuous batching + {"input_len": 8190, "output_len": 1024, "batch_size": 32, "num_prompts": 64}, ] -@pytest.mark.parametrize("param", TEST_paramS) +@pytest.mark.parametrize("param", TEST_PARAMS) def test_sequence_length(param: Dict[str, int]): # Run the sequence length test results = run_sequence_length_test( From 4d360eb681d5c884bd4c8d2cf82e01798525f4ae Mon Sep 17 00:00:00 2001 From: Tom Stesco Date: Tue, 17 Dec 2024 02:12:51 +0000 Subject: [PATCH 45/76] adding fixed_batch_size to prompt_client_online_benchmark.py for better single user control --- .../prompt_client_online_benchmark.py | 20 +++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/benchmarking/prompt_client_online_benchmark.py b/benchmarking/prompt_client_online_benchmark.py index f4d9ca53..1e25df8d 100644 --- a/benchmarking/prompt_client_online_benchmark.py +++ b/benchmarking/prompt_client_online_benchmark.py @@ -6,7 +6,7 @@ import logging import numpy as np -from typing import List, Dict, Tuple +from typing import List, Dict, Tuple, Optional import json from datetime import datetime from pathlib import Path @@ 
-26,19 +26,23 @@ def get_test_combinations( context_lens: List[Tuple[int, int]], + fixed_batch_size: Optional[int] = None, ) -> List[Dict[str, int]]: combinations = [] for input_len, output_len in context_lens: # Skip invalid combinations where output_len > input_len context = input_len + output_len - if context <= 4096: - bsz = 32 - elif context <= 8192: - bsz = 16 + if not fixed_batch_size: + if context <= 4096: + bsz = 32 + elif context <= 8192: + bsz = 16 + else: + bsz = 1 else: - bsz = 1 + bsz = fixed_batch_size - num_prompts = max(bsz * 32, 32) + num_prompts = max(bsz * 8, 32) combinations.append( { "input_len": input_len, @@ -207,7 +211,7 @@ def run_sequence_length_test( # (8100, 32), ] # Generate all valid combinations upfront - combinations = get_test_combinations(context_lens=context_lens) + combinations = get_test_combinations(context_lens=context_lens, fixed_batch_size=1) # Run tests results = run_sequence_length_test( From 41dcc22376dfe3228351920d800a328d4a3c7f5d Mon Sep 17 00:00:00 2001 From: Tom Stesco Date: Tue, 17 Dec 2024 23:17:44 +0000 Subject: [PATCH 46/76] use standard output values in ms --- .../prompt_client_online_benchmark.py | 44 ++++++++----------- utils/README.md | 8 ++-- utils/batch_processor.py | 4 +- utils/prompt_client.py | 17 ++++--- utils/prompt_client_cli.py | 2 +- 5 files changed, 36 insertions(+), 39 deletions(-) diff --git a/benchmarking/prompt_client_online_benchmark.py b/benchmarking/prompt_client_online_benchmark.py index 1e25df8d..02f2cb63 100644 --- a/benchmarking/prompt_client_online_benchmark.py +++ b/benchmarking/prompt_client_online_benchmark.py @@ -142,29 +142,25 @@ def run_sequence_length_test( input_seq_lengths=input_seq_lengths, tokenizer=tokenizer, ) - - # Calculate statistics - mean_tpot = np.mean([r["time_per_output_token"] for r in responses]) - mean_tpot = max(mean_tpot, 1e-6) # Avoid division by zero - mean_tps = 1.0 / mean_tpot - std_tpot = np.std([r["time_per_output_token"] for r in responses]) - std_tpot = max(std_tpot, 1e-6) # Avoid division by zero - std_tps = mean_tps - 1.0 / (mean_tpot + std_tpot) + e2e_latency = np.max([r["duration"] for r in responses]) + num_requests = num_prompts * num_iterations stats = { - "input_seq_len": input_len, - "output_seq_len": output_len, + "model_id": model, + "backend": "vllm", + "timestamp": timestamp, + "input_sequence_length": input_len, + "output_sequence_length": output_len, "batch_size": batch_size, + "num_requests": num_requests, + "mean_tpot_ms": np.mean([r["tpot_ms"] for r in responses]), + "std_tpot_ms": np.std([r["tpot_ms"] for r in responses]), + "mean_ttft_ms": np.mean([r["ttft_ms"] for r in responses]), + "std_ttft_ms": np.std([r["ttft_ms"] for r in responses]), + "total_input_tokens": sum([r["input_seq_len"] for r in responses]), "total_output_tokens": sum([r["output_seq_len"] for r in responses]), - "mean_tpot": mean_tpot, - "mean_tps": mean_tps, - "mean_ttft": np.mean([r["ttft"] for r in responses]), - "std_tpot": std_tpot, - "std_tps": std_tps, - "std_ttft": np.std([r["ttft"] for r in responses]), - "num_prompts": num_prompts, + "duration": e2e_latency, "num_iterations": num_iterations, - "timestamp": timestamp, - "combination_index": idx, + "request_throughput": num_requests / e2e_latency, } all_results.append(stats) @@ -172,16 +168,14 @@ def run_sequence_length_test( # Log results logger.info( f"Results for combination {idx}/{total_combinations}:\n" - f"Mean TPOT: {stats['mean_tpot']:.4f} ± " - f"{stats['std_tpot']:.4f}\n" - f"Mean user TPS: {stats['mean_tps']:.4f} ± " - 
f"{stats['std_tps']:.4f}\n" - f"Mean TTFT: {stats['mean_ttft']:.4f} ± {stats['std_ttft']:.4f}" + f"Mean TTFT: {stats['mean_ttft_ms']:.4f} ± {stats['std_ttft_ms']:.4f}" + f"Mean TPOT: {stats['mean_tpot_ms']:.4f} ± " + f"{stats['std_tpot_ms']:.4f}\n" ) # Save results after each combination with open(results_file, "w") as f: - json.dump(all_results, f, indent=4) + json.dump(stats, f, indent=4) except Exception as e: logger.error(f"Error processing combination {idx}: {e}") diff --git a/utils/README.md b/utils/README.md index 984fda73..2afa03e2 100644 --- a/utils/README.md +++ b/utils/README.md @@ -156,14 +156,14 @@ The client saves responses in JSON format with the following structure: ```json { - "response_idx": number, // Response index in batch + "response_idx": number, // Response index in batch "prompt": string, // Input prompt "response": string, // Generated completion text "input_seq_len": number, // Prompt length in tokens "output_seq_len": number, // Completion length in tokens - "inter_token_latencies": number[], // Per-token generation times in seconds - "time_per_output_token": number, // Average seconds per token - "ttft": number // Time to first token in seconds + "itl_ms": number[], // Inter Token Latency (ITL) ms + "tpot_ms": number, // Time Per Output Token (TPOT) average, ms + "ttft_ms": number // Time To First Token (TTFT) ms } ``` diff --git a/utils/batch_processor.py b/utils/batch_processor.py index cbbfdf3f..adedb984 100644 --- a/utils/batch_processor.py +++ b/utils/batch_processor.py @@ -274,8 +274,8 @@ def _log_progress( ): logger.info( f"Processed {response_counter}/{total_prompts} responses. " - f"TPOT: {response_data['time_per_output_token']:.4f}, " - f"TTFT: {response_data['ttft']:.4f}, " + f"TPOT: {response_data['tpot_ms']:.4f}, " + f"TTFT: {response_data['ttft_ms']:.4f}, " f"input_seq_len: {response_data['input_seq_len']}, " f"output_seq_len: {response_data['output_seq_len']}" ) diff --git a/utils/prompt_client.py b/utils/prompt_client.py index 1322f130..7fee601d 100644 --- a/utils/prompt_client.py +++ b/utils/prompt_client.py @@ -158,8 +158,8 @@ def capture_traces( ) logger.info( f"tokens generated: {response_data['output_seq_len']}, " - f"TTFT: {response_data['ttft']:.3f}s, " - f"TPOT: {response_data['time_per_output_token']:.3f}s" + f"TTFT: {response_data['ttft_ms']:.3f} ms, " + f"TPOT: {response_data['tpot_ms']:.3f} ms" ) except Exception as e: logger.error(f"Error processing prompt: {e}") @@ -252,12 +252,14 @@ def _process_response( full_text = data["choices"][0]["text"] usage_dict = data["usage"] first_token_time = req_time + + duration = time.perf_counter() - req_time - # Calculate inter-token latencies + # Calculate inter-token latencies (ms) inter_token_latencies = [] if len(token_timestamps) > 1: inter_token_latencies = [ - token_timestamps[i] - token_timestamps[i - 1] + (token_timestamps[i] - token_timestamps[i - 1]) * 1000.0 for i in range(1, len(token_timestamps)) ] @@ -294,7 +296,8 @@ def _process_response( "response": full_text, "input_seq_len": prompt_len, "output_seq_len": num_completion_tokens, - "inter_token_latencies": inter_token_latencies, - "time_per_output_token": time_per_output_token, - "ttft": ttft, + "itl_ms": inter_token_latencies, + "tpot_ms": time_per_output_token * 1000.0, + "ttft_ms": ttft * 1000.0, + "duration": duration, } diff --git a/utils/prompt_client_cli.py b/utils/prompt_client_cli.py index 7ab7ed6f..d4bf8a59 100644 --- a/utils/prompt_client_cli.py +++ b/utils/prompt_client_cli.py @@ -191,7 +191,7 @@ def main(): # 
Calculate and log summary statistics if responses: - mean_tpot = np.mean([r["time_per_output_token"] for r in responses]) + mean_tpot = np.mean([r["tpot_ms"] for r in responses]) mean_ttft = np.mean([r["ttft"] for r in responses]) logger.info(f"Mean TTFT: {mean_ttft:.4f}") logger.info(f"Mean TPOT: {mean_tpot:.4f}") From 308eeaff19384ba9ec3a602df0556eddacec3da8 Mon Sep 17 00:00:00 2001 From: Tom Stesco Date: Tue, 17 Dec 2024 23:21:18 +0000 Subject: [PATCH 47/76] fix output filepath for prompt_client_online_benchmark.py, remove get_test_combinations in favor of directly specifying them --- .../prompt_client_online_benchmark.py | 83 +++++-------------- benchmarking/vllm_online_benchmark.py | 48 ++++++----- 2 files changed, 45 insertions(+), 86 deletions(-) diff --git a/benchmarking/prompt_client_online_benchmark.py b/benchmarking/prompt_client_online_benchmark.py index 02f2cb63..8bb3f335 100644 --- a/benchmarking/prompt_client_online_benchmark.py +++ b/benchmarking/prompt_client_online_benchmark.py @@ -4,6 +4,7 @@ # # SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC +import os import logging import numpy as np from typing import List, Dict, Tuple, Optional @@ -24,56 +25,16 @@ logger.setLevel(logging.INFO) -def get_test_combinations( - context_lens: List[Tuple[int, int]], - fixed_batch_size: Optional[int] = None, -) -> List[Dict[str, int]]: - combinations = [] - for input_len, output_len in context_lens: - # Skip invalid combinations where output_len > input_len - context = input_len + output_len - if not fixed_batch_size: - if context <= 4096: - bsz = 32 - elif context <= 8192: - bsz = 16 - else: - bsz = 1 - else: - bsz = fixed_batch_size - - num_prompts = max(bsz * 8, 32) - combinations.append( - { - "input_len": input_len, - "output_len": output_len, - "batch_size": bsz, - "num_prompts": num_prompts, - } - ) - - # Log total number of combinations - logger.info(f"Generated {len(combinations)} valid test combinations") - for i, combo in enumerate(combinations, 1): - logger.info( - f"Combination {i}: input_len={combo['input_len']}, " - f"output_len={combo['output_len']}, batch_size={combo['batch_size']}, " - f"num_prompts={combo['num_prompts']}" - ) - - return combinations - - def run_sequence_length_test( combinations: List[Dict[str, int]], - save_dir: str, + result_dir: str, file_prefix: str, num_iterations: int = 1, model: str = "meta-llama/Llama-3.1-70B-Instruct", ) -> List[dict]: # Create save directory timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S") - save_path = Path(save_dir) / f"results_{timestamp}" + save_path = Path(result_dir) / f"results_{timestamp}" save_path.mkdir(parents=True, exist_ok=True) # Initialize results storage @@ -185,31 +146,25 @@ def run_sequence_length_test( if __name__ == "__main__": - # Define benchmarking context length (isl, osl) pairs - context_lens = [ - (128, 128), - # (128, 2048), - # (128, 4096), - # (2048, 128), - # (2048, 2048), - # (1000, 1000), - # (500, 2000), - # (5000, 500), - # (20000, 2000), - # (128, 2), - # (256, 2), - # (512, 32), - # (1000, 24), - # (2000, 32), - # (4000, 32), - # (8100, 32), + + combinations = [ + {"input_len": 128, "output_len": 128, "batch_size": 1, "num_prompts": 32}, + {"input_len": 128, "output_len": 1024, "batch_size": 1, "num_prompts": 32}, + {"input_len": 2048, "output_len": 128, "batch_size": 1, "num_prompts": 32}, + {"input_len": 128, "output_len": 4096, "batch_size": 1, "num_prompts": 32}, + {"input_len": 2048, "output_len": 2048, "batch_size": 1, "num_prompts": 32}, ] - # Generate all valid combinations 
upfront - combinations = get_test_combinations(context_lens=context_lens, fixed_batch_size=1) + + # Create output directory + cache_dir = Path(os.environ.get("CACHE_ROOT", "")) + timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S") + result_dir = cache_dir / "online_benchmark_results" + result_dir.mkdir(parents=True, exist_ok=True) # Run tests results = run_sequence_length_test( combinations=combinations, - save_dir="online_benchmarking", - file_prefix="online_benchmark_results", + result_dir=result_dir, + file_prefix="online_benchmark", + model="meta-llama/Llama-3.1-70B-Instruct", ) diff --git a/benchmarking/vllm_online_benchmark.py b/benchmarking/vllm_online_benchmark.py index 159f8da0..adfe6563 100644 --- a/benchmarking/vllm_online_benchmark.py +++ b/benchmarking/vllm_online_benchmark.py @@ -10,7 +10,6 @@ from typing import Dict from pathlib import Path -from benchmarking.prompt_client_online_benchmark import get_test_combinations from utils.prompt_configs import EnvironmentConfig from utils.prompt_client import PromptClient @@ -35,7 +34,6 @@ def run_benchmark( "--backend", "vllm", "--model", model, "--port", str(port), - # "--request-rate", "3", "--dataset-name", "random", "--num-prompts", str(params["num_prompts"]), "--random-input-len", str(params["input_len"]), @@ -75,28 +73,34 @@ def main(): # note: there isnt a better way to pass an api key to the vllm benchmarking script os.environ["OPENAI_API_KEY"] = prompt_client._get_authorization() - # Define benchmarking context length (isl, osl) pairs - context_lens = [ - (128, 128), - # (128, 2048), - # (128, 4096), - # (2048, 128), - # (2048, 2048), - # (1000, 1000), - # (500, 2000), - # (5000, 500), - # (20000, 2000), - # (128, 2), - # (256, 2), - # (512, 32), - # (1000, 24), - # (2000, 32), - # (4000, 32), - # (8100, 32), + # Get all benchmark combinations using the original function + combinations = [ + {"input_len": 128, "output_len": 128, "batch_size": 32, "num_prompts": 32*8}, + # {"input_len": 128, "output_len": 1024, "batch_size": 32, "num_prompts": 32}, + # {"input_len": 2048, "output_len": 128, "batch_size": 32, "num_prompts": 32}, + # {"input_len": 128, "output_len": 4096, "batch_size": 32, "num_prompts": 32}, + # {"input_len": 2048, "output_len": 2048, "batch_size": 32, "num_prompts": 32}, + # {"input_len": 128, "output_len": 128, "batch_size": 32, "num_prompts": 32}, + # {"input_len": 128, "output_len": 2048, "batch_size": 32, "num_prompts": 32}, + # {"input_len": 128, "output_len": 4096, "batch_size": 32, "num_prompts": 32}, + # {"input_len": 2048, "output_len": 128, "batch_size": 32, "num_prompts": 32}, + # {"input_len": 2048, "output_len": 2048, "batch_size": 32, "num_prompts": 32}, + # {"input_len": 1000, "output_len": 1000, "batch_size": 32, "num_prompts": 32}, + # {"input_len": 500, "output_len": 2000, "batch_size": 32, "num_prompts": 32}, + # {"input_len": 5000, "output_len": 500, "batch_size": 32, "num_prompts": 32}, + # {"input_len": 20000, "output_len": 2000, "batch_size": 32, "num_prompts": 32}, + # {"input_len": 128, "output_len": 2, "batch_size": 32, "num_prompts": 32}, + # {"input_len": 256, "output_len": 2, "batch_size": 32, "num_prompts": 32}, + # {"input_len": 512, "output_len": 32, "batch_size": 32, "num_prompts": 32}, + # {"input_len": 1000, "output_len": 24, "batch_size": 32, "num_prompts": 32}, + # {"input_len": 2000, "output_len": 32, "batch_size": 32, "num_prompts": 32}, + # {"input_len": 4000, "output_len": 32, "batch_size": 32, "num_prompts": 32}, + # {"input_len": 8100, "output_len": 32, 
"batch_size": 32, "num_prompts": 32} ] - # Get all benchmark combinations using the original function - combinations = get_test_combinations(context_lens=context_lens) + context_lens = [(it["input_len"], it["output_len"]) for it in combinations] + # de-dupe + context_lens = list(set(context_lens)) # pre-capture traces required for benchmarking prompt_client.capture_traces(context_lens=context_lens) From e6fc8c4032fc8e8c0a54955d701f6d3c0ae17b84 Mon Sep 17 00:00:00 2001 From: Tom Stesco Date: Tue, 17 Dec 2024 23:23:35 +0000 Subject: [PATCH 48/76] add benchmark output file reader script --- benchmarking/benchmark_output_processor.py | 227 +++++++++++++++++++++ 1 file changed, 227 insertions(+) create mode 100644 benchmarking/benchmark_output_processor.py diff --git a/benchmarking/benchmark_output_processor.py b/benchmarking/benchmark_output_processor.py new file mode 100644 index 00000000..ebc73cee --- /dev/null +++ b/benchmarking/benchmark_output_processor.py @@ -0,0 +1,227 @@ +# SPDX-License-Identifier: Apache-2.0 +# +# SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC + +import json +import glob +import os +from datetime import datetime +import re +from typing import Dict, List, Any +from operator import itemgetter +import argparse +from pathlib import Path + + +DATE_STR_FORMAT = "%Y-%m-%d_%H-%M-%S" + +def parse_args(): + """Parse command line arguments.""" + parser = argparse.ArgumentParser( + description='Process vLLM benchmark results from multiple directories.' + ) + parser.add_argument( + 'directories', + nargs='+', + type=str, + help='One or more directories containing benchmark files' + ) + parser.add_argument( + '--pattern', + type=str, + default='*_benchmark_*.json', + help='File pattern to match (default: vllm_online_benchmark_*.json)' + ) + parser.add_argument( + '--output', + type=str, + default=f"benchmark_results_{datetime.now().strftime(DATE_STR_FORMAT)}.csv", + help='Output CSV file name' + ) + return parser.parse_args() + +def extract_params_from_filename(filename: str) -> Dict[str, Any]: + """ + Extract all parameters from benchmark filename using regex. 
+ Example: vllm_online_benchmark_2024-12-17_13-24-17_isl-128_osl-128_bsz-32_n-32.json + + Returns: + Dictionary containing timestamp and numeric parameters + """ + pattern = r""" + benchmark_ + (?P\d{4}-\d{2}-\d{2}_\d{2}-\d{2}-\d{2}) # Timestamp + _isl-(?P\d+) # Input sequence length + _osl-(?P\d+) # Output sequence length + _bsz-(?P\d+) # Batch size + _n-(?P\d+) # Number of requests + """ + + match = re.search(pattern, filename, re.VERBOSE) + if not match: + raise ValueError(f"Could not extract parameters from filename: {filename}") + + # Convert timestamp string to datetime + timestamp_str = match.group('timestamp') + timestamp = datetime.strptime(timestamp_str, '%Y-%m-%d_%H-%M-%S') + + # Extract and convert numeric parameters + params = { + 'timestamp': timestamp, + 'input_sequence_length': int(match.group('isl')), + 'output_sequence_length': int(match.group('osl')), + 'batch_size': int(match.group('bsz')), + 'num_requests': int(match.group('n')) + } + + return params + +def process_benchmark_file(filepath: str) -> Dict[str, Any]: + """Process a single benchmark file and extract relevant metrics.""" + with open(filepath, 'r') as f: + data = json.load(f) + + filename = os.path.basename(filepath) + + params = extract_params_from_filename(filename) + timestamp = params.pop('timestamp') # Remove timestamp from params dict + + metrics = { + 'filepath': filepath, + 'filename': filename, + 'timestamp': timestamp, + 'model_id': data.get('model_id', ''), + 'backend': data.get('backend', ''), + 'num_prompts': data.get('num_prompts', ''), + 'mean_tpot_ms': data.get('mean_tpot_ms', "n/a"), + 'std_tpot_ms': data.get('std_tpot_ms', "n/a"), + 'mean_ttft_ms': data.get('mean_ttft_ms', "n/a"), + 'std_ttft_ms': data.get('std_ttft_ms', "n/a"), + 'total_input_tokens': data.get('total_input_tokens', "n/a"), + 'total_output_tokens': data.get('total_output_tokens', "n/a"), + 'duration': data.get('duration', "n/a"), + 'request_throughput': data.get('request_throughput', "n/a"), + **params # Unpack the extracted parameters + } + + # Calculate statistics + mean_tpot = max(metrics["mean_tpot_ms"], 1e-6) # Avoid division by zero + mean_tps = 1.0 / mean_tpot + std_tps = mean_tps - (1.0 / (mean_tpot + metrics["std_tpot_ms"])) + metrics["mean_tps"] = mean_tps + metrics["std_tps"] = std_tps + return metrics + +def process_benchmark_files(directories: List[str], pattern: str) -> List[Dict[str, Any]]: + """Process benchmark files from multiple directories matching the given pattern.""" + results = [] + + for directory in directories: + dir_path = Path(directory) + if not dir_path.exists(): + print(f"Warning: Directory not found: {directory}") + continue + + file_pattern = str(dir_path / pattern) + files = glob.glob(file_pattern) + + if not files: + print(f"Warning: No files found matching pattern '{pattern}' in {directory}") + continue + + print(f"Processing {len(files)} files from {directory}") + + for filepath in files: + print(f"Processing: {filepath} ...") + try: + metrics = process_benchmark_file(filepath) + results.append(metrics) + except Exception as e: + print(f"Error processing file {filepath}: {str(e)}") + + if not results: + raise ValueError("No benchmark files were successfully processed") + + # Sort by timestamp + return sorted(results, key=lambda x: x['timestamp']) + +def save_to_csv(results: List[Dict[str, Any]], filename: str) -> None: + """Save results to a CSV file.""" + if not results: + return + + # Get all unique keys from all dictionaries + headers = list(results[0].keys()) + + with open(filename, 
'w') as f: + # Write headers + f.write(','.join(headers) + '\n') + + # Write data + for result in results: + row = [str(result.get(header, '')) for header in headers] + f.write(','.join(row) + '\n') + +def format_markdown_table(results: List[Dict[str, Any]]) -> str: + """Format results as a Markdown table.""" + if not results: + return "" + + # Define columns to display and their headers + display_cols = [ + ('model_id', 'Model ID'), + ('backend', 'Backend'), + ('input_sequence_length', 'ISL'), + ('output_sequence_length', 'OSL'), + ('batch_size', 'Batch Size'), + ('num_requests', 'Requests'), + ('mean_tpot_ms', 'TPOT (ms)'), + ('mean_ttft_ms', 'TTFT (ms)'), + ('request_throughput', 'Throughput (RPS)'), + ] + + # Create header row + header = " | ".join(header for _, header in display_cols) + separator = "|".join(['---'] * len(display_cols)) + + # Create data rows + rows = [] + for result in results: + row_values = [] + for col, _ in display_cols: + value = result.get(col, '') + # Format floats to 2 decimal places + if isinstance(value, float): + value = f"{value:.2f}" + row_values.append(str(value)) + rows.append(" | ".join(row_values)) + + # Combine all parts + markdown_table = f"| {header} |\n| {separator} |\n" + markdown_table += "\n".join(f"| {row} |" for row in rows) + + return markdown_table + + +def main(): + args = parse_args() + + results = process_benchmark_files(args.directories, args.pattern) + + # Display basic statistics + print("\nBenchmark Summary:") + print(f"Total files processed: {len(results)}") + + + # Save to CSV + save_to_csv(results, args.output) + print(f"\nResults saved to: {args.output}") + + # Generate and print Markdown table + print("\nMarkdown Table:\n") + print(format_markdown_table(results)) + print("\n") + + +if __name__ == "__main__": + main() \ No newline at end of file From 6295693cda42417b0d4223ee5936510a81c6c411 Mon Sep 17 00:00:00 2001 From: Tom Stesco Date: Tue, 17 Dec 2024 23:34:37 +0000 Subject: [PATCH 49/76] ruff formatting, rename benchmarking/benchmark_output_processor.py -> benchmarking/benchmark_summary.py --- ...tput_processor.py => benchmark_summary.py} | 169 +++++++++--------- .../prompt_client_online_benchmark.py | 3 +- benchmarking/vllm_online_benchmark.py | 2 +- 3 files changed, 91 insertions(+), 83 deletions(-) rename benchmarking/{benchmark_output_processor.py => benchmark_summary.py} (63%) diff --git a/benchmarking/benchmark_output_processor.py b/benchmarking/benchmark_summary.py similarity index 63% rename from benchmarking/benchmark_output_processor.py rename to benchmarking/benchmark_summary.py index ebc73cee..ba734365 100644 --- a/benchmarking/benchmark_output_processor.py +++ b/benchmarking/benchmark_summary.py @@ -8,43 +8,44 @@ from datetime import datetime import re from typing import Dict, List, Any -from operator import itemgetter import argparse from pathlib import Path DATE_STR_FORMAT = "%Y-%m-%d_%H-%M-%S" + def parse_args(): """Parse command line arguments.""" parser = argparse.ArgumentParser( - description='Process vLLM benchmark results from multiple directories.' + description="Process vLLM benchmark results from multiple directories." 
) parser.add_argument( - 'directories', - nargs='+', + "directories", + nargs="+", type=str, - help='One or more directories containing benchmark files' + help="One or more directories containing benchmark files", ) parser.add_argument( - '--pattern', + "--pattern", type=str, - default='*_benchmark_*.json', - help='File pattern to match (default: vllm_online_benchmark_*.json)' + default="*_benchmark_*.json", + help="File pattern to match (default: vllm_online_benchmark_*.json)", ) parser.add_argument( - '--output', + "--output", type=str, default=f"benchmark_results_{datetime.now().strftime(DATE_STR_FORMAT)}.csv", - help='Output CSV file name' + help="Output CSV file name", ) return parser.parse_args() + def extract_params_from_filename(filename: str) -> Dict[str, Any]: """ Extract all parameters from benchmark filename using regex. Example: vllm_online_benchmark_2024-12-17_13-24-17_isl-128_osl-128_bsz-32_n-32.json - + Returns: Dictionary containing timestamp and numeric parameters """ @@ -56,81 +57,87 @@ def extract_params_from_filename(filename: str) -> Dict[str, Any]: _bsz-(?P\d+) # Batch size _n-(?P\d+) # Number of requests """ - + match = re.search(pattern, filename, re.VERBOSE) if not match: raise ValueError(f"Could not extract parameters from filename: {filename}") - + # Convert timestamp string to datetime - timestamp_str = match.group('timestamp') - timestamp = datetime.strptime(timestamp_str, '%Y-%m-%d_%H-%M-%S') - + timestamp_str = match.group("timestamp") + timestamp = datetime.strptime(timestamp_str, "%Y-%m-%d_%H-%M-%S") + # Extract and convert numeric parameters params = { - 'timestamp': timestamp, - 'input_sequence_length': int(match.group('isl')), - 'output_sequence_length': int(match.group('osl')), - 'batch_size': int(match.group('bsz')), - 'num_requests': int(match.group('n')) + "timestamp": timestamp, + "input_sequence_length": int(match.group("isl")), + "output_sequence_length": int(match.group("osl")), + "batch_size": int(match.group("bsz")), + "num_requests": int(match.group("n")), } - + return params + def process_benchmark_file(filepath: str) -> Dict[str, Any]: """Process a single benchmark file and extract relevant metrics.""" - with open(filepath, 'r') as f: + with open(filepath, "r") as f: data = json.load(f) - + filename = os.path.basename(filepath) params = extract_params_from_filename(filename) - timestamp = params.pop('timestamp') # Remove timestamp from params dict + timestamp = params.pop("timestamp") # Remove timestamp from params dict metrics = { - 'filepath': filepath, - 'filename': filename, - 'timestamp': timestamp, - 'model_id': data.get('model_id', ''), - 'backend': data.get('backend', ''), - 'num_prompts': data.get('num_prompts', ''), - 'mean_tpot_ms': data.get('mean_tpot_ms', "n/a"), - 'std_tpot_ms': data.get('std_tpot_ms', "n/a"), - 'mean_ttft_ms': data.get('mean_ttft_ms', "n/a"), - 'std_ttft_ms': data.get('std_ttft_ms', "n/a"), - 'total_input_tokens': data.get('total_input_tokens', "n/a"), - 'total_output_tokens': data.get('total_output_tokens', "n/a"), - 'duration': data.get('duration', "n/a"), - 'request_throughput': data.get('request_throughput', "n/a"), - **params # Unpack the extracted parameters + "filepath": filepath, + "filename": filename, + "timestamp": timestamp, + "model_id": data.get("model_id", ""), + "backend": data.get("backend", ""), + "num_prompts": data.get("num_prompts", ""), + "mean_tpot_ms": data.get("mean_tpot_ms", "n/a"), + "std_tpot_ms": data.get("std_tpot_ms", "n/a"), + "mean_ttft_ms": data.get("mean_ttft_ms", "n/a"), + 
"std_ttft_ms": data.get("std_ttft_ms", "n/a"), + "total_input_tokens": data.get("total_input_tokens", "n/a"), + "total_output_tokens": data.get("total_output_tokens", "n/a"), + "duration": data.get("duration", "n/a"), + "request_throughput": data.get("request_throughput", "n/a"), + **params, # Unpack the extracted parameters } # Calculate statistics mean_tpot = max(metrics["mean_tpot_ms"], 1e-6) # Avoid division by zero - mean_tps = 1.0 / mean_tpot - std_tps = mean_tps - (1.0 / (mean_tpot + metrics["std_tpot_ms"])) + mean_tps = 1000.0 / mean_tpot + std_tps = mean_tps - (1000.0 / (mean_tpot + metrics["std_tpot_ms"])) metrics["mean_tps"] = mean_tps metrics["std_tps"] = std_tps return metrics -def process_benchmark_files(directories: List[str], pattern: str) -> List[Dict[str, Any]]: + +def process_benchmark_files( + directories: List[str], pattern: str +) -> List[Dict[str, Any]]: """Process benchmark files from multiple directories matching the given pattern.""" results = [] - + for directory in directories: dir_path = Path(directory) if not dir_path.exists(): print(f"Warning: Directory not found: {directory}") continue - + file_pattern = str(dir_path / pattern) files = glob.glob(file_pattern) - + if not files: - print(f"Warning: No files found matching pattern '{pattern}' in {directory}") + print( + f"Warning: No files found matching pattern '{pattern}' in {directory}" + ) continue - + print(f"Processing {len(files)} files from {directory}") - + for filepath in files: print(f"Processing: {filepath} ...") try: @@ -138,12 +145,13 @@ def process_benchmark_files(directories: List[str], pattern: str) -> List[Dict[s results.append(metrics) except Exception as e: print(f"Error processing file {filepath}: {str(e)}") - + if not results: raise ValueError("No benchmark files were successfully processed") - + # Sort by timestamp - return sorted(results, key=lambda x: x['timestamp']) + return sorted(results, key=lambda x: x["timestamp"]) + def save_to_csv(results: List[Dict[str, Any]], filename: str) -> None: """Save results to a CSV file.""" @@ -152,76 +160,77 @@ def save_to_csv(results: List[Dict[str, Any]], filename: str) -> None: # Get all unique keys from all dictionaries headers = list(results[0].keys()) - - with open(filename, 'w') as f: + + with open(filename, "w") as f: # Write headers - f.write(','.join(headers) + '\n') - + f.write(",".join(headers) + "\n") + # Write data for result in results: - row = [str(result.get(header, '')) for header in headers] - f.write(','.join(row) + '\n') + row = [str(result.get(header, "")) for header in headers] + f.write(",".join(row) + "\n") + def format_markdown_table(results: List[Dict[str, Any]]) -> str: """Format results as a Markdown table.""" if not results: return "" - + # Define columns to display and their headers display_cols = [ - ('model_id', 'Model ID'), - ('backend', 'Backend'), - ('input_sequence_length', 'ISL'), - ('output_sequence_length', 'OSL'), - ('batch_size', 'Batch Size'), - ('num_requests', 'Requests'), - ('mean_tpot_ms', 'TPOT (ms)'), - ('mean_ttft_ms', 'TTFT (ms)'), - ('request_throughput', 'Throughput (RPS)'), + ("model_id", "Model ID"), + ("backend", "Backend"), + ("input_sequence_length", "ISL"), + ("output_sequence_length", "OSL"), + ("batch_size", "Batch Size"), + ("num_requests", "Requests"), + ("mean_ttft_ms", "TTFT (ms)"), + ("mean_tpot_ms", "TPOT (ms)"), + ("mean_tps", "TPS (user)"), + ("request_throughput", "Request Throughput (RPS)"), ] - + # Create header row header = " | ".join(header for _, header in display_cols) - 
separator = "|".join(['---'] * len(display_cols)) - + separator = "|".join(["---"] * len(display_cols)) + # Create data rows rows = [] for result in results: row_values = [] for col, _ in display_cols: - value = result.get(col, '') + value = result.get(col, "") # Format floats to 2 decimal places if isinstance(value, float): value = f"{value:.2f}" row_values.append(str(value)) rows.append(" | ".join(row_values)) - + # Combine all parts markdown_table = f"| {header} |\n| {separator} |\n" markdown_table += "\n".join(f"| {row} |" for row in rows) - + return markdown_table def main(): args = parse_args() - + results = process_benchmark_files(args.directories, args.pattern) - + # Display basic statistics print("\nBenchmark Summary:") print(f"Total files processed: {len(results)}") - # Save to CSV save_to_csv(results, args.output) print(f"\nResults saved to: {args.output}") - + # Generate and print Markdown table print("\nMarkdown Table:\n") print(format_markdown_table(results)) - print("\n") + print("Note: all metrics are means across benchmark run unless otherwise stated.\n") if __name__ == "__main__": - main() \ No newline at end of file + main() diff --git a/benchmarking/prompt_client_online_benchmark.py b/benchmarking/prompt_client_online_benchmark.py index 8bb3f335..7f8bc71d 100644 --- a/benchmarking/prompt_client_online_benchmark.py +++ b/benchmarking/prompt_client_online_benchmark.py @@ -7,7 +7,7 @@ import os import logging import numpy as np -from typing import List, Dict, Tuple, Optional +from typing import List, Dict import json from datetime import datetime from pathlib import Path @@ -146,7 +146,6 @@ def run_sequence_length_test( if __name__ == "__main__": - combinations = [ {"input_len": 128, "output_len": 128, "batch_size": 1, "num_prompts": 32}, {"input_len": 128, "output_len": 1024, "batch_size": 1, "num_prompts": 32}, diff --git a/benchmarking/vllm_online_benchmark.py b/benchmarking/vllm_online_benchmark.py index adfe6563..e7aa0bfa 100644 --- a/benchmarking/vllm_online_benchmark.py +++ b/benchmarking/vllm_online_benchmark.py @@ -75,7 +75,7 @@ def main(): # Get all benchmark combinations using the original function combinations = [ - {"input_len": 128, "output_len": 128, "batch_size": 32, "num_prompts": 32*8}, + {"input_len": 128, "output_len": 128, "batch_size": 32, "num_prompts": 32 * 8}, # {"input_len": 128, "output_len": 1024, "batch_size": 32, "num_prompts": 32}, # {"input_len": 2048, "output_len": 128, "batch_size": 32, "num_prompts": 32}, # {"input_len": 128, "output_len": 4096, "batch_size": 32, "num_prompts": 32}, From 8963a12d3043a1243f122c02e8ed9c6a3bfdd640 Mon Sep 17 00:00:00 2001 From: Tom Stesco Date: Wed, 18 Dec 2024 02:11:24 +0000 Subject: [PATCH 50/76] add percentile-metrics to add e2els stats --- benchmarking/vllm_online_benchmark.py | 1 + 1 file changed, 1 insertion(+) diff --git a/benchmarking/vllm_online_benchmark.py b/benchmarking/vllm_online_benchmark.py index e7aa0bfa..c3305680 100644 --- a/benchmarking/vllm_online_benchmark.py +++ b/benchmarking/vllm_online_benchmark.py @@ -39,6 +39,7 @@ def run_benchmark( "--random-input-len", str(params["input_len"]), "--random-output-len", str(params["output_len"]), "--ignore-eos", # Ignore EOS tokens to force max output length as set + "--percentile-metrics", "ttft,tpot,itl,e2els", # must add e2els in order for it to be logged "--save-result", "--result-filename", str(result_filename) ] From fc8eb06291ef4eb8056d70f9add076efa32bc52c Mon Sep 17 00:00:00 2001 From: Tom Stesco Date: Wed, 18 Dec 2024 02:12:34 +0000 
Subject: [PATCH 51/76] add latency to benchmarking/prompt_client_online_benchmark.py and summary support --- benchmarking/benchmark_summary.py | 5 +++-- benchmarking/prompt_client_online_benchmark.py | 14 +++++++------- utils/prompt_client.py | 6 +++--- 3 files changed, 13 insertions(+), 12 deletions(-) diff --git a/benchmarking/benchmark_summary.py b/benchmarking/benchmark_summary.py index ba734365..d5781b96 100644 --- a/benchmarking/benchmark_summary.py +++ b/benchmarking/benchmark_summary.py @@ -101,7 +101,7 @@ def process_benchmark_file(filepath: str) -> Dict[str, Any]: "std_ttft_ms": data.get("std_ttft_ms", "n/a"), "total_input_tokens": data.get("total_input_tokens", "n/a"), "total_output_tokens": data.get("total_output_tokens", "n/a"), - "duration": data.get("duration", "n/a"), + "mean_e2el_ms": data.get("mean_e2el_ms", "n/a"), "request_throughput": data.get("request_throughput", "n/a"), **params, # Unpack the extracted parameters } @@ -183,10 +183,11 @@ def format_markdown_table(results: List[Dict[str, Any]]) -> str: ("input_sequence_length", "ISL"), ("output_sequence_length", "OSL"), ("batch_size", "Batch Size"), - ("num_requests", "Requests"), + ("num_requests", "Num Requests"), ("mean_ttft_ms", "TTFT (ms)"), ("mean_tpot_ms", "TPOT (ms)"), ("mean_tps", "TPS (user)"), + ("mean_e2el_ms", "Request latency"), ("request_throughput", "Request Throughput (RPS)"), ] diff --git a/benchmarking/prompt_client_online_benchmark.py b/benchmarking/prompt_client_online_benchmark.py index 7f8bc71d..385bff19 100644 --- a/benchmarking/prompt_client_online_benchmark.py +++ b/benchmarking/prompt_client_online_benchmark.py @@ -103,7 +103,7 @@ def run_sequence_length_test( input_seq_lengths=input_seq_lengths, tokenizer=tokenizer, ) - e2e_latency = np.max([r["duration"] for r in responses]) + mean_e2el_ms = np.mean([r["latency"] for r in responses]) * 1000.0 num_requests = num_prompts * num_iterations stats = { "model_id": model, @@ -119,9 +119,9 @@ def run_sequence_length_test( "std_ttft_ms": np.std([r["ttft_ms"] for r in responses]), "total_input_tokens": sum([r["input_seq_len"] for r in responses]), "total_output_tokens": sum([r["output_seq_len"] for r in responses]), - "duration": e2e_latency, + "mean_e2el_ms": mean_e2el_ms, "num_iterations": num_iterations, - "request_throughput": num_requests / e2e_latency, + "request_throughput": num_requests / mean_e2el_ms, } all_results.append(stats) @@ -147,11 +147,11 @@ def run_sequence_length_test( if __name__ == "__main__": combinations = [ - {"input_len": 128, "output_len": 128, "batch_size": 1, "num_prompts": 32}, - {"input_len": 128, "output_len": 1024, "batch_size": 1, "num_prompts": 32}, + # {"input_len": 128, "output_len": 128, "batch_size": 1, "num_prompts": 32}, + {"input_len": 128, "output_len": 1024, "batch_size": 1, "num_prompts": 16}, {"input_len": 2048, "output_len": 128, "batch_size": 1, "num_prompts": 32}, - {"input_len": 128, "output_len": 4096, "batch_size": 1, "num_prompts": 32}, - {"input_len": 2048, "output_len": 2048, "batch_size": 1, "num_prompts": 32}, + {"input_len": 128, "output_len": 4096, "batch_size": 1, "num_prompts": 8}, + {"input_len": 2048, "output_len": 2048, "batch_size": 1, "num_prompts": 8}, ] # Create output directory diff --git a/utils/prompt_client.py b/utils/prompt_client.py index 7fee601d..72c3431a 100644 --- a/utils/prompt_client.py +++ b/utils/prompt_client.py @@ -252,8 +252,8 @@ def _process_response( full_text = data["choices"][0]["text"] usage_dict = data["usage"] first_token_time = req_time - - duration = 
time.perf_counter() - req_time + + latency = time.perf_counter() - req_time # Calculate inter-token latencies (ms) inter_token_latencies = [] @@ -299,5 +299,5 @@ def _process_response( "itl_ms": inter_token_latencies, "tpot_ms": time_per_output_token * 1000.0, "ttft_ms": ttft * 1000.0, - "duration": duration, + "latency": latency, } From 6c4d0925e0d850809454266ff4100e4edd1473d1 Mon Sep 17 00:00:00 2001 From: Tom Stesco Date: Wed, 18 Dec 2024 05:07:20 +0000 Subject: [PATCH 52/76] support latency measurement with mean_e2el_ms --- benchmarking/benchmark_summary.py | 2 +- benchmarking/vllm_online_benchmark.py | 12 ++++++------ 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/benchmarking/benchmark_summary.py b/benchmarking/benchmark_summary.py index d5781b96..a9225e18 100644 --- a/benchmarking/benchmark_summary.py +++ b/benchmarking/benchmark_summary.py @@ -187,7 +187,7 @@ def format_markdown_table(results: List[Dict[str, Any]]) -> str: ("mean_ttft_ms", "TTFT (ms)"), ("mean_tpot_ms", "TPOT (ms)"), ("mean_tps", "TPS (user)"), - ("mean_e2el_ms", "Request latency"), + ("mean_e2el_ms", "Request latency (ms)"), ("request_throughput", "Request Throughput (RPS)"), ] diff --git a/benchmarking/vllm_online_benchmark.py b/benchmarking/vllm_online_benchmark.py index c3305680..35aed1e6 100644 --- a/benchmarking/vllm_online_benchmark.py +++ b/benchmarking/vllm_online_benchmark.py @@ -39,7 +39,7 @@ def run_benchmark( "--random-input-len", str(params["input_len"]), "--random-output-len", str(params["output_len"]), "--ignore-eos", # Ignore EOS tokens to force max output length as set - "--percentile-metrics", "ttft,tpot,itl,e2els", # must add e2els in order for it to be logged + "--percentile-metrics", "ttft,tpot,itl,e2el", # must add e2el in order for it to be logged "--save-result", "--result-filename", str(result_filename) ] @@ -76,11 +76,11 @@ def main(): # Get all benchmark combinations using the original function combinations = [ - {"input_len": 128, "output_len": 128, "batch_size": 32, "num_prompts": 32 * 8}, - # {"input_len": 128, "output_len": 1024, "batch_size": 32, "num_prompts": 32}, - # {"input_len": 2048, "output_len": 128, "batch_size": 32, "num_prompts": 32}, - # {"input_len": 128, "output_len": 4096, "batch_size": 32, "num_prompts": 32}, - # {"input_len": 2048, "output_len": 2048, "batch_size": 32, "num_prompts": 32}, + {"input_len": 128, "output_len": 128, "batch_size": 32, "num_prompts": 32}, + {"input_len": 128, "output_len": 1024, "batch_size": 32, "num_prompts": 32}, + {"input_len": 2048, "output_len": 128, "batch_size": 32, "num_prompts": 32}, + {"input_len": 128, "output_len": 4096, "batch_size": 32, "num_prompts": 32}, + {"input_len": 2048, "output_len": 2048, "batch_size": 32, "num_prompts": 32}, # {"input_len": 128, "output_len": 128, "batch_size": 32, "num_prompts": 32}, # {"input_len": 128, "output_len": 2048, "batch_size": 32, "num_prompts": 32}, # {"input_len": 128, "output_len": 4096, "batch_size": 32, "num_prompts": 32}, From d8ec6828c308e1a42749d52f8a4292b48d822685 Mon Sep 17 00:00:00 2001 From: Tom Stesco Date: Wed, 18 Dec 2024 05:13:08 +0000 Subject: [PATCH 53/76] update benchmark sweeps --- benchmarking/prompt_client_online_benchmark.py | 17 ++++++++++++----- benchmarking/vllm_online_benchmark.py | 7 ------- 2 files changed, 12 insertions(+), 12 deletions(-) diff --git a/benchmarking/prompt_client_online_benchmark.py b/benchmarking/prompt_client_online_benchmark.py index 385bff19..9842d782 100644 --- a/benchmarking/prompt_client_online_benchmark.py +++ 
b/benchmarking/prompt_client_online_benchmark.py @@ -147,11 +147,18 @@ def run_sequence_length_test( if __name__ == "__main__": combinations = [ - # {"input_len": 128, "output_len": 128, "batch_size": 1, "num_prompts": 32}, - {"input_len": 128, "output_len": 1024, "batch_size": 1, "num_prompts": 16}, - {"input_len": 2048, "output_len": 128, "batch_size": 1, "num_prompts": 32}, - {"input_len": 128, "output_len": 4096, "batch_size": 1, "num_prompts": 8}, - {"input_len": 2048, "output_len": 2048, "batch_size": 1, "num_prompts": 8}, + # sweeps for batch-1 + {"input_len": 128, "output_len": 128, "batch_size": 1, "num_prompts": 16}, + {"input_len": 128, "output_len": 1024, "batch_size": 1, "num_prompts": 8}, + {"input_len": 2048, "output_len": 128, "batch_size": 1, "num_prompts": 8}, + {"input_len": 128, "output_len": 4096, "batch_size": 1, "num_prompts": 4}, + {"input_len": 2048, "output_len": 2048, "batch_size": 1, "num_prompts": 4}, + # sweeps for batch-32 + {"input_len": 128, "output_len": 128, "batch_size": 32, "num_prompts": 32}, + {"input_len": 128, "output_len": 1024, "batch_size": 32, "num_prompts": 32}, + {"input_len": 2048, "output_len": 128, "batch_size": 32, "num_prompts": 32}, + {"input_len": 128, "output_len": 4096, "batch_size": 32, "num_prompts": 32}, + {"input_len": 2048, "output_len": 2048, "batch_size": 32, "num_prompts": 32}, ] # Create output directory diff --git a/benchmarking/vllm_online_benchmark.py b/benchmarking/vllm_online_benchmark.py index 35aed1e6..faca9e67 100644 --- a/benchmarking/vllm_online_benchmark.py +++ b/benchmarking/vllm_online_benchmark.py @@ -90,13 +90,6 @@ def main(): # {"input_len": 500, "output_len": 2000, "batch_size": 32, "num_prompts": 32}, # {"input_len": 5000, "output_len": 500, "batch_size": 32, "num_prompts": 32}, # {"input_len": 20000, "output_len": 2000, "batch_size": 32, "num_prompts": 32}, - # {"input_len": 128, "output_len": 2, "batch_size": 32, "num_prompts": 32}, - # {"input_len": 256, "output_len": 2, "batch_size": 32, "num_prompts": 32}, - # {"input_len": 512, "output_len": 32, "batch_size": 32, "num_prompts": 32}, - # {"input_len": 1000, "output_len": 24, "batch_size": 32, "num_prompts": 32}, - # {"input_len": 2000, "output_len": 32, "batch_size": 32, "num_prompts": 32}, - # {"input_len": 4000, "output_len": 32, "batch_size": 32, "num_prompts": 32}, - # {"input_len": 8100, "output_len": 32, "batch_size": 32, "num_prompts": 32} ] context_lens = [(it["input_len"], it["output_len"]) for it in combinations] From ffaabd6ed7c54b8474f946ea27569bf81587f288 Mon Sep 17 00:00:00 2001 From: Tom Stesco Date: Wed, 18 Dec 2024 06:26:44 +0000 Subject: [PATCH 54/76] update sweeps context lengths --- benchmarking/prompt_client_online_benchmark.py | 2 ++ benchmarking/vllm_online_benchmark.py | 1 + 2 files changed, 3 insertions(+) diff --git a/benchmarking/prompt_client_online_benchmark.py b/benchmarking/prompt_client_online_benchmark.py index 9842d782..b2ee0b7c 100644 --- a/benchmarking/prompt_client_online_benchmark.py +++ b/benchmarking/prompt_client_online_benchmark.py @@ -151,12 +151,14 @@ def run_sequence_length_test( {"input_len": 128, "output_len": 128, "batch_size": 1, "num_prompts": 16}, {"input_len": 128, "output_len": 1024, "batch_size": 1, "num_prompts": 8}, {"input_len": 2048, "output_len": 128, "batch_size": 1, "num_prompts": 8}, + {"input_len": 128, "output_len": 2048, "batch_size": 1, "num_prompts": 8}, {"input_len": 128, "output_len": 4096, "batch_size": 1, "num_prompts": 4}, {"input_len": 2048, "output_len": 2048, "batch_size": 1, 
"num_prompts": 4}, # sweeps for batch-32 {"input_len": 128, "output_len": 128, "batch_size": 32, "num_prompts": 32}, {"input_len": 128, "output_len": 1024, "batch_size": 32, "num_prompts": 32}, {"input_len": 2048, "output_len": 128, "batch_size": 32, "num_prompts": 32}, + {"input_len": 128, "output_len": 2048, "batch_size": 32, "num_prompts": 32}, {"input_len": 128, "output_len": 4096, "batch_size": 32, "num_prompts": 32}, {"input_len": 2048, "output_len": 2048, "batch_size": 32, "num_prompts": 32}, ] diff --git a/benchmarking/vllm_online_benchmark.py b/benchmarking/vllm_online_benchmark.py index faca9e67..3be639e0 100644 --- a/benchmarking/vllm_online_benchmark.py +++ b/benchmarking/vllm_online_benchmark.py @@ -79,6 +79,7 @@ def main(): {"input_len": 128, "output_len": 128, "batch_size": 32, "num_prompts": 32}, {"input_len": 128, "output_len": 1024, "batch_size": 32, "num_prompts": 32}, {"input_len": 2048, "output_len": 128, "batch_size": 32, "num_prompts": 32}, + {"input_len": 128, "output_len": 2048, "batch_size": 32, "num_prompts": 32}, {"input_len": 128, "output_len": 4096, "batch_size": 32, "num_prompts": 32}, {"input_len": 2048, "output_len": 2048, "batch_size": 32, "num_prompts": 32}, # {"input_len": 128, "output_len": 128, "batch_size": 32, "num_prompts": 32}, From 4602ff3d96e38eb5a95c1882f82c5a72362acc04 Mon Sep 17 00:00:00 2001 From: Tom Stesco Date: Wed, 18 Dec 2024 06:42:08 +0000 Subject: [PATCH 55/76] model id as header not in table --- benchmarking/benchmark_summary.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/benchmarking/benchmark_summary.py b/benchmarking/benchmark_summary.py index a9225e18..c2d6c7ea 100644 --- a/benchmarking/benchmark_summary.py +++ b/benchmarking/benchmark_summary.py @@ -178,8 +178,6 @@ def format_markdown_table(results: List[Dict[str, Any]]) -> str: # Define columns to display and their headers display_cols = [ - ("model_id", "Model ID"), - ("backend", "Backend"), ("input_sequence_length", "ISL"), ("output_sequence_length", "OSL"), ("batch_size", "Batch Size"), @@ -229,6 +227,9 @@ def main(): # Generate and print Markdown table print("\nMarkdown Table:\n") + + print(f"Model ID: {results[0].get('model_id')}") + print(f"Backend: {results[0].get('backend')}") print(format_markdown_table(results)) print("Note: all metrics are means across benchmark run unless otherwise stated.\n") From 594b9a1e7b07aca50e65d92a8a6abf706dd05183 Mon Sep 17 00:00:00 2001 From: Tom Stesco Date: Wed, 18 Dec 2024 18:13:00 +0000 Subject: [PATCH 56/76] add better formatting in benchmark_summary.py, update iso/osl sweeps --- benchmarking/benchmark_summary.py | 112 +++++++++++++----- .../prompt_client_online_benchmark.py | 9 +- benchmarking/vllm_online_benchmark.py | 15 ++- 3 files changed, 96 insertions(+), 40 deletions(-) diff --git a/benchmarking/benchmark_summary.py b/benchmarking/benchmark_summary.py index c2d6c7ea..30d8894b 100644 --- a/benchmarking/benchmark_summary.py +++ b/benchmarking/benchmark_summary.py @@ -33,22 +33,15 @@ def parse_args(): help="File pattern to match (default: vllm_online_benchmark_*.json)", ) parser.add_argument( - "--output", + "--output-dir", type=str, - default=f"benchmark_results_{datetime.now().strftime(DATE_STR_FORMAT)}.csv", + default="", help="Output CSV file name", ) return parser.parse_args() def extract_params_from_filename(filename: str) -> Dict[str, Any]: - """ - Extract all parameters from benchmark filename using regex. 
- Example: vllm_online_benchmark_2024-12-17_13-24-17_isl-128_osl-128_bsz-32_n-32.json - - Returns: - Dictionary containing timestamp and numeric parameters - """ pattern = r""" benchmark_ (?P\d{4}-\d{2}-\d{2}_\d{2}-\d{2}-\d{2}) # Timestamp @@ -78,6 +71,23 @@ def extract_params_from_filename(filename: str) -> Dict[str, Any]: return params +def format_metrics(metrics): + NOT_MEASURED_STR = "n/a" + formatted_metrics = {} + + for key, value in metrics.items(): + # Skip None values and NOT_MEASURED_STR + if value is None or value == NOT_MEASURED_STR: + formatted_metrics[key] = NOT_MEASURED_STR + elif isinstance(value, float): + # Format numeric values to 2 decimal places + formatted_metrics[key] = round(float(value), 2) + else: + formatted_metrics[key] = value + + return formatted_metrics + + def process_benchmark_file(filepath: str) -> Dict[str, Any]: """Process a single benchmark file and extract relevant metrics.""" with open(filepath, "r") as f: @@ -86,32 +96,44 @@ def process_benchmark_file(filepath: str) -> Dict[str, Any]: filename = os.path.basename(filepath) params = extract_params_from_filename(filename) - timestamp = params.pop("timestamp") # Remove timestamp from params dict + + # Calculate statistics + + mean_tpot_ms = data.get("mean_tpot_ms") + if data.get("mean_tpot_ms"): + mean_tpot = max(data.get("mean_tpot_ms"), 1e-6) # Avoid division by zero + mean_tps = 1000.0 / mean_tpot + if data.get("std_tpot_ms"): + std_tps = mean_tps - (1000.0 / (mean_tpot + data.get("std_tpot_ms"))) + else: + std_tps = None + else: + mean_tps = None + std_tps = None metrics = { - "filepath": filepath, - "filename": filename, - "timestamp": timestamp, + "timestamp": params["timestamp"], "model_id": data.get("model_id", ""), "backend": data.get("backend", ""), + "input_sequence_length": params["input_sequence_length"], + "output_sequence_length": params["output_sequence_length"], + "batch_size": params["batch_size"], + "mean_ttft_ms": data.get("mean_ttft_ms"), + "std_ttft_ms": data.get("std_ttft_ms"), + "mean_tpot_ms": mean_tpot_ms, + "std_tpot_ms": data.get("std_tpot_ms"), + "mean_tps": mean_tps, + "std_tps": std_tps, + "mean_e2el_ms": data.get("mean_e2el_ms"), + "request_throughput": data.get("request_throughput"), + "total_input_tokens": data.get("total_input_tokens"), + "total_output_tokens": data.get("total_output_tokens"), "num_prompts": data.get("num_prompts", ""), - "mean_tpot_ms": data.get("mean_tpot_ms", "n/a"), - "std_tpot_ms": data.get("std_tpot_ms", "n/a"), - "mean_ttft_ms": data.get("mean_ttft_ms", "n/a"), - "std_ttft_ms": data.get("std_ttft_ms", "n/a"), - "total_input_tokens": data.get("total_input_tokens", "n/a"), - "total_output_tokens": data.get("total_output_tokens", "n/a"), - "mean_e2el_ms": data.get("mean_e2el_ms", "n/a"), - "request_throughput": data.get("request_throughput", "n/a"), - **params, # Unpack the extracted parameters + "num_requests": params["num_requests"], + "filename": filename, } + metrics = format_metrics(metrics) - # Calculate statistics - mean_tpot = max(metrics["mean_tpot_ms"], 1e-6) # Avoid division by zero - mean_tps = 1000.0 / mean_tpot - std_tps = mean_tps - (1000.0 / (mean_tpot + metrics["std_tpot_ms"])) - metrics["mean_tps"] = mean_tps - metrics["std_tps"] = std_tps return metrics @@ -153,15 +175,19 @@ def process_benchmark_files( return sorted(results, key=lambda x: x["timestamp"]) -def save_to_csv(results: List[Dict[str, Any]], filename: str) -> None: +def save_to_csv( + results: List[Dict[str, Any]], output_dir: str, timestamp_str: str +) -> None: """Save 
results to a CSV file.""" if not results: return + file_path = Path(output_dir) / f"benchmark_results_{timestamp_str}.csv" + # Get all unique keys from all dictionaries headers = list(results[0].keys()) - with open(filename, "w") as f: + with open(file_path, "w") as f: # Write headers f.write(",".join(headers) + "\n") @@ -169,6 +195,7 @@ def save_to_csv(results: List[Dict[str, Any]], filename: str) -> None: for result in results: row = [str(result.get(header, "")) for header in headers] f.write(",".join(row) + "\n") + print(f"\nResults saved to: {file_path}") def format_markdown_table(results: List[Dict[str, Any]]) -> str: @@ -212,18 +239,39 @@ def format_markdown_table(results: List[Dict[str, Any]]) -> str: return markdown_table +def extract_timestamp(directories): + pattern = r""" + results_ + (?P\d{4}-\d{2}-\d{2}_\d{2}-\d{2}-\d{2}) # Timestamp + """ + first_dir = directories[0] + match = re.search(pattern, first_dir, re.VERBOSE) + if not match: + raise ValueError(f"Could not extract parameters from: {first_dir}") + + # Convert timestamp string to datetime + timestamp_str = match.group("timestamp") + + return timestamp_str + + def main(): args = parse_args() results = process_benchmark_files(args.directories, args.pattern) + timestamp_str = extract_timestamp(args.directories) # Display basic statistics print("\nBenchmark Summary:") print(f"Total files processed: {len(results)}") # Save to CSV - save_to_csv(results, args.output) - print(f"\nResults saved to: {args.output}") + output_dir = args.output_dir + if not output_dir: + output_dir = Path(os.environ.get("CACHE_ROOT", ""), "benchmark_results") + os.makedirs(output_dir, exist_ok=True) + + save_to_csv(results, output_dir, timestamp_str) # Generate and print Markdown table print("\nMarkdown Table:\n") diff --git a/benchmarking/prompt_client_online_benchmark.py b/benchmarking/prompt_client_online_benchmark.py index b2ee0b7c..3ed70ff8 100644 --- a/benchmarking/prompt_client_online_benchmark.py +++ b/benchmarking/prompt_client_online_benchmark.py @@ -146,22 +146,27 @@ def run_sequence_length_test( if __name__ == "__main__": + # fmt: off combinations = [ # sweeps for batch-1 + {"input_len": 128, "output_len": 10, "batch_size": 1, "num_prompts": 16}, {"input_len": 128, "output_len": 128, "batch_size": 1, "num_prompts": 16}, {"input_len": 128, "output_len": 1024, "batch_size": 1, "num_prompts": 8}, - {"input_len": 2048, "output_len": 128, "batch_size": 1, "num_prompts": 8}, {"input_len": 128, "output_len": 2048, "batch_size": 1, "num_prompts": 8}, {"input_len": 128, "output_len": 4096, "batch_size": 1, "num_prompts": 4}, + {"input_len": 2048, "output_len": 128, "batch_size": 1, "num_prompts": 8}, {"input_len": 2048, "output_len": 2048, "batch_size": 1, "num_prompts": 4}, # sweeps for batch-32 {"input_len": 128, "output_len": 128, "batch_size": 32, "num_prompts": 32}, + {"input_len": 128, "output_len": 128, "batch_size": 32, "num_prompts": 32}, + {"input_len": 128, "output_len": 128, "batch_size": 32, "num_prompts": 32}, {"input_len": 128, "output_len": 1024, "batch_size": 32, "num_prompts": 32}, - {"input_len": 2048, "output_len": 128, "batch_size": 32, "num_prompts": 32}, {"input_len": 128, "output_len": 2048, "batch_size": 32, "num_prompts": 32}, {"input_len": 128, "output_len": 4096, "batch_size": 32, "num_prompts": 32}, + {"input_len": 2048, "output_len": 128, "batch_size": 32, "num_prompts": 32}, {"input_len": 2048, "output_len": 2048, "batch_size": 32, "num_prompts": 32}, ] + # fmt: on # Create output directory cache_dir = 
Path(os.environ.get("CACHE_ROOT", "")) diff --git a/benchmarking/vllm_online_benchmark.py b/benchmarking/vllm_online_benchmark.py index 3be639e0..59d6de74 100644 --- a/benchmarking/vllm_online_benchmark.py +++ b/benchmarking/vllm_online_benchmark.py @@ -75,13 +75,15 @@ def main(): os.environ["OPENAI_API_KEY"] = prompt_client._get_authorization() # Get all benchmark combinations using the original function + # fmt: off combinations = [ - {"input_len": 128, "output_len": 128, "batch_size": 32, "num_prompts": 32}, - {"input_len": 128, "output_len": 1024, "batch_size": 32, "num_prompts": 32}, - {"input_len": 2048, "output_len": 128, "batch_size": 32, "num_prompts": 32}, - {"input_len": 128, "output_len": 2048, "batch_size": 32, "num_prompts": 32}, - {"input_len": 128, "output_len": 4096, "batch_size": 32, "num_prompts": 32}, - {"input_len": 2048, "output_len": 2048, "batch_size": 32, "num_prompts": 32}, + {"input_len": 128, "output_len": 10, "batch_size": 32, "num_prompts": 32 * 16}, + {"input_len": 128, "output_len": 128, "batch_size": 32, "num_prompts": 32 * 32}, + {"input_len": 128, "output_len": 1024, "batch_size": 32, "num_prompts": 32 * 16}, + {"input_len": 128, "output_len": 2048, "batch_size": 32, "num_prompts": 32 * 8}, + {"input_len": 128, "output_len": 4096, "batch_size": 32, "num_prompts": 32 * 4}, + {"input_len": 2048, "output_len": 128, "batch_size": 32, "num_prompts": 32 * 16}, + {"input_len": 2048, "output_len": 2048, "batch_size": 32, "num_prompts": 32 * 4}, # {"input_len": 128, "output_len": 128, "batch_size": 32, "num_prompts": 32}, # {"input_len": 128, "output_len": 2048, "batch_size": 32, "num_prompts": 32}, # {"input_len": 128, "output_len": 4096, "batch_size": 32, "num_prompts": 32}, @@ -92,6 +94,7 @@ def main(): # {"input_len": 5000, "output_len": 500, "batch_size": 32, "num_prompts": 32}, # {"input_len": 20000, "output_len": 2000, "batch_size": 32, "num_prompts": 32}, ] + # fmt: on context_lens = [(it["input_len"], it["output_len"]) for it in combinations] # de-dupe From 2ce6fe71506a5de4c8cbaba39bd27b691715ca8e Mon Sep 17 00:00:00 2001 From: Tom Stesco Date: Wed, 18 Dec 2024 20:39:23 +0000 Subject: [PATCH 57/76] add better markdown formatting, add saving display .csv --- benchmarking/benchmark_summary.py | 195 +++++++++++++++++++++--------- 1 file changed, 138 insertions(+), 57 deletions(-) diff --git a/benchmarking/benchmark_summary.py b/benchmarking/benchmark_summary.py index 30d8894b..cea4d7c8 100644 --- a/benchmarking/benchmark_summary.py +++ b/benchmarking/benchmark_summary.py @@ -5,14 +5,16 @@ import json import glob import os +import csv from datetime import datetime import re -from typing import Dict, List, Any +from typing import Dict, List, Any, Union, Tuple import argparse from pathlib import Path DATE_STR_FORMAT = "%Y-%m-%d_%H-%M-%S" +NOT_MEASURED_STR = "n/a" def parse_args(): @@ -71,8 +73,23 @@ def extract_params_from_filename(filename: str) -> Dict[str, Any]: return params +def extract_timestamp(directories): + pattern = r""" + results_ + (?P\d{4}-\d{2}-\d{2}_\d{2}-\d{2}-\d{2}) # Timestamp + """ + first_dir = directories[0] + match = re.search(pattern, first_dir, re.VERBOSE) + if not match: + raise ValueError(f"Could not extract parameters from: {first_dir}") + + # Convert timestamp string to datetime + timestamp_str = match.group("timestamp") + + return timestamp_str + + def format_metrics(metrics): - NOT_MEASURED_STR = "n/a" formatted_metrics = {} for key, value in metrics.items(): @@ -175,36 +192,32 @@ def process_benchmark_files( return 
sorted(results, key=lambda x: x["timestamp"]) -def save_to_csv( - results: List[Dict[str, Any]], output_dir: str, timestamp_str: str -) -> None: - """Save results to a CSV file.""" +def save_to_csv(results: List[Dict[str, Any]], file_path: Union[Path, str]) -> None: if not results: return - file_path = Path(output_dir) / f"benchmark_results_{timestamp_str}.csv" - - # Get all unique keys from all dictionaries + # Get headers from first result (assuming all results have same structure) headers = list(results[0].keys()) - with open(file_path, "w") as f: - # Write headers - f.write(",".join(headers) + "\n") + try: + with open(file_path, "w", newline="") as f: + writer = csv.writer(f) + # Write headers + writer.writerow(headers) + # Write data rows + for result in results: + row = [str(result.get(header, NOT_MEASURED_STR)) for header in headers] + writer.writerow(row) - # Write data - for result in results: - row = [str(result.get(header, "")) for header in headers] - f.write(",".join(row) + "\n") - print(f"\nResults saved to: {file_path}") + print(f"\nResults saved to: {file_path}") + except Exception as e: + print(f"Error saving CSV file: {e}") -def format_markdown_table(results: List[Dict[str, Any]]) -> str: - """Format results as a Markdown table.""" - if not results: - return "" - # Define columns to display and their headers - display_cols = [ +def create_display_dict(result: Dict[str, Any]) -> Dict[str, str]: + # Define display columns mapping + display_cols: List[Tuple[str, str]] = [ ("input_sequence_length", "ISL"), ("output_sequence_length", "OSL"), ("batch_size", "Batch Size"), @@ -216,43 +229,105 @@ def format_markdown_table(results: List[Dict[str, Any]]) -> str: ("request_throughput", "Request Throughput (RPS)"), ] - # Create header row - header = " | ".join(header for _, header in display_cols) - separator = "|".join(["---"] * len(display_cols)) + display_dict = {} + for col_name, display_header in display_cols: + value = result.get(col_name, NOT_MEASURED_STR) + display_dict[display_header] = str(value) - # Create data rows - rows = [] - for result in results: - row_values = [] - for col, _ in display_cols: - value = result.get(col, "") - # Format floats to 2 decimal places - if isinstance(value, float): - value = f"{value:.2f}" - row_values.append(str(value)) - rows.append(" | ".join(row_values)) + return display_dict - # Combine all parts - markdown_table = f"| {header} |\n| {separator} |\n" - markdown_table += "\n".join(f"| {row} |" for row in rows) - return markdown_table +def get_markdown_table(display_dicts: List[Dict[str, str]]) -> str: + if not display_dicts: + return "" + def sanitize_cell(text: str) -> str: + """Sanitize cell content for Markdown compatibility""" + # Replace problematic characters + text = str(text) + text = text.replace("|", "\\|") # Escape pipe characters + text = text.replace("\n", " ") # Replace newlines with spaces + text = re.sub(r"[^\x00-\x7F]+", "", text) # Remove non-ASCII characters + return text.strip() + + # Get headers from first dictionary + headers = list(display_dicts[0].keys()) + + # Calculate column widths based on all values including headers + col_widths = {} + for header in headers: + # Include header length in width calculation + width = len(header) + # Check all values for this column + for d in display_dicts: + width = max(width, len(str(d.get(header, "")))) + # Add minimum width of 3 + col_widths[header] = max(width, 3) + + # Create header row with proper padding + header_row = ( + "| " + + " | ".join( + 
sanitize_cell(header).ljust(col_widths[header]) for header in headers + ) + + " |" + ) -def extract_timestamp(directories): - pattern = r""" - results_ - (?P\d{4}-\d{2}-\d{2}_\d{2}-\d{2}-\d{2}) # Timestamp - """ - first_dir = directories[0] - match = re.search(pattern, first_dir, re.VERBOSE) - if not match: - raise ValueError(f"Could not extract parameters from: {first_dir}") + # Create separator row with proper alignment indicators + separator_row = ( + "|" + + "|".join(":" + "-" * (col_widths[header]) + ":" for header in headers) + + "|" + ) - # Convert timestamp string to datetime - timestamp_str = match.group("timestamp") + # Create value rows with proper padding + value_rows = [] + for d in display_dicts: + row = ( + "| " + + " | ".join( + sanitize_cell(str(d.get(header, ""))).ljust(col_widths[header]) + for header in headers + ) + + " |" + ) + value_rows.append(row) + + # add notes + notes = ( + "\nNote: all metrics are means across benchmark run unless otherwise stated.\n" + ) + # Combine all rows + md_str = f"{header_row}\n{separator_row}\n" + "\n".join(value_rows) + notes + return md_str - return timestamp_str + +def save_markdown_table( + markdown_str: str, filepath: str, add_title: str = None, add_notes: List[str] = None +) -> None: + # Convert string path to Path object and ensure .md extension + path = Path(filepath) + if path.suffix.lower() != ".md": + path = path.with_suffix(".md") + + # Create directory if it doesn't exist + path.parent.mkdir(parents=True, exist_ok=True) + + # Prepare content + content = [] + if add_title: + # Add title with markdown h1 formatting and blank line + content.extend([f"# {add_title}", ""]) + content.append(markdown_str) + if add_notes: + content.extend(add_notes) + + # Write to file with UTF-8 encoding + try: + path.write_text("\n".join(content), encoding="utf-8") + print(f"Successfully saved markdown table to: {path}") + except Exception as e: + print(f"Error saving markdown table: {str(e)}") def main(): @@ -271,15 +346,21 @@ def main(): output_dir = Path(os.environ.get("CACHE_ROOT", ""), "benchmark_results") os.makedirs(output_dir, exist_ok=True) - save_to_csv(results, output_dir, timestamp_str) + # save stats + stats_file_path = Path(output_dir) / f"benchmark_stats_{timestamp_str}.csv" + save_to_csv(results, stats_file_path) + display_results = [create_display_dict(res) for res in results] + disp_file_path = Path(output_dir) / f"benchmark_display_{timestamp_str}.csv" + save_to_csv(display_results, disp_file_path) # Generate and print Markdown table print("\nMarkdown Table:\n") - print(f"Model ID: {results[0].get('model_id')}") print(f"Backend: {results[0].get('backend')}") - print(format_markdown_table(results)) - print("Note: all metrics are means across benchmark run unless otherwise stated.\n") + display_md_str = get_markdown_table(display_results) + print(display_md_str) + disp_md_path = Path(output_dir) / f"benchmark_display_{timestamp_str}.md" + save_markdown_table(display_md_str, disp_md_path) if __name__ == "__main__": From 6be324fc93a521a607e48b3b7bea8857a00acc18 Mon Sep 17 00:00:00 2001 From: Tom Stesco Date: Wed, 18 Dec 2024 20:39:40 +0000 Subject: [PATCH 58/76] update sweep isl/osl --- benchmarking/vllm_online_benchmark.py | 1 - 1 file changed, 1 deletion(-) diff --git a/benchmarking/vllm_online_benchmark.py b/benchmarking/vllm_online_benchmark.py index 59d6de74..8f35b5c7 100644 --- a/benchmarking/vllm_online_benchmark.py +++ b/benchmarking/vllm_online_benchmark.py @@ -77,7 +77,6 @@ def main(): # Get all benchmark combinations 
using the original function # fmt: off combinations = [ - {"input_len": 128, "output_len": 10, "batch_size": 32, "num_prompts": 32 * 16}, {"input_len": 128, "output_len": 128, "batch_size": 32, "num_prompts": 32 * 32}, {"input_len": 128, "output_len": 1024, "batch_size": 32, "num_prompts": 32 * 16}, {"input_len": 128, "output_len": 2048, "batch_size": 32, "num_prompts": 32 * 8}, From f558876ec0725cd0f51062a95b0677139fc3f9cf Mon Sep 17 00:00:00 2001 From: Tom Stesco Date: Wed, 18 Dec 2024 21:04:37 +0000 Subject: [PATCH 59/76] update sweep isl/osl --- .../prompt_client_online_benchmark.py | 30 +++++++++---------- 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/benchmarking/prompt_client_online_benchmark.py b/benchmarking/prompt_client_online_benchmark.py index 3ed70ff8..5b1203e3 100644 --- a/benchmarking/prompt_client_online_benchmark.py +++ b/benchmarking/prompt_client_online_benchmark.py @@ -149,22 +149,22 @@ def run_sequence_length_test( # fmt: off combinations = [ # sweeps for batch-1 - {"input_len": 128, "output_len": 10, "batch_size": 1, "num_prompts": 16}, - {"input_len": 128, "output_len": 128, "batch_size": 1, "num_prompts": 16}, - {"input_len": 128, "output_len": 1024, "batch_size": 1, "num_prompts": 8}, - {"input_len": 128, "output_len": 2048, "batch_size": 1, "num_prompts": 8}, - {"input_len": 128, "output_len": 4096, "batch_size": 1, "num_prompts": 4}, - {"input_len": 2048, "output_len": 128, "batch_size": 1, "num_prompts": 8}, - {"input_len": 2048, "output_len": 2048, "batch_size": 1, "num_prompts": 4}, + {"input_len": 128, "output_len": 10, "batch_size": 1, "num_prompts": 64}, + {"input_len": 128, "output_len": 128, "batch_size": 1, "num_prompts": 64}, + {"input_len": 128, "output_len": 1024, "batch_size": 1, "num_prompts": 32}, + {"input_len": 128, "output_len": 2048, "batch_size": 1, "num_prompts": 32}, + {"input_len": 128, "output_len": 4096, "batch_size": 1, "num_prompts": 16}, + {"input_len": 2048, "output_len": 128, "batch_size": 1, "num_prompts": 32}, + {"input_len": 2048, "output_len": 2048, "batch_size": 1, "num_prompts": 16}, # sweeps for batch-32 - {"input_len": 128, "output_len": 128, "batch_size": 32, "num_prompts": 32}, - {"input_len": 128, "output_len": 128, "batch_size": 32, "num_prompts": 32}, - {"input_len": 128, "output_len": 128, "batch_size": 32, "num_prompts": 32}, - {"input_len": 128, "output_len": 1024, "batch_size": 32, "num_prompts": 32}, - {"input_len": 128, "output_len": 2048, "batch_size": 32, "num_prompts": 32}, - {"input_len": 128, "output_len": 4096, "batch_size": 32, "num_prompts": 32}, - {"input_len": 2048, "output_len": 128, "batch_size": 32, "num_prompts": 32}, - {"input_len": 2048, "output_len": 2048, "batch_size": 32, "num_prompts": 32}, + {"input_len": 128, "output_len": 128, "batch_size": 32, "num_prompts": 32 * 8}, + {"input_len": 128, "output_len": 128, "batch_size": 32, "num_prompts": 32 * 8}, + {"input_len": 128, "output_len": 128, "batch_size": 32, "num_prompts": 32 * 8}, + {"input_len": 128, "output_len": 1024, "batch_size": 32, "num_prompts": 32 * 4}, + {"input_len": 128, "output_len": 2048, "batch_size": 32, "num_prompts": 32 * 4}, + {"input_len": 128, "output_len": 4096, "batch_size": 32, "num_prompts": 32 * 2}, + {"input_len": 2048, "output_len": 128, "batch_size": 32, "num_prompts": 32 * 8}, + {"input_len": 2048, "output_len": 2048, "batch_size": 32, "num_prompts": 32 * 2}, ] # fmt: on From b4260d35edcbed3f5f9325c22f989b8711c2056e Mon Sep 17 00:00:00 2001 From: Tom Stesco Date: Wed, 18 Dec 2024 21:05:05 +0000 
Subject: [PATCH 60/76] add metadata to markdown summary --- benchmarking/benchmark_summary.py | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) diff --git a/benchmarking/benchmark_summary.py b/benchmarking/benchmark_summary.py index cea4d7c8..0cdaf77d 100644 --- a/benchmarking/benchmark_summary.py +++ b/benchmarking/benchmark_summary.py @@ -237,7 +237,7 @@ def create_display_dict(result: Dict[str, Any]) -> Dict[str, str]: return display_dict -def get_markdown_table(display_dicts: List[Dict[str, str]]) -> str: +def get_markdown_table(display_dicts: List[Dict[str, str]], metadata: str = "") -> str: if not display_dicts: return "" @@ -294,11 +294,16 @@ def sanitize_cell(text: str) -> str: value_rows.append(row) # add notes - notes = ( + end_notes = ( "\nNote: all metrics are means across benchmark run unless otherwise stated.\n" ) # Combine all rows - md_str = f"{header_row}\n{separator_row}\n" + "\n".join(value_rows) + notes + md_str = ( + metadata + + f"\n{header_row}\n{separator_row}\n" + + "\n".join(value_rows) + + end_notes + ) return md_str @@ -355,9 +360,11 @@ def main(): save_to_csv(display_results, disp_file_path) # Generate and print Markdown table print("\nMarkdown Table:\n") - print(f"Model ID: {results[0].get('model_id')}") - print(f"Backend: {results[0].get('backend')}") - display_md_str = get_markdown_table(display_results) + metadata = ( + f"Model ID: {results[0].get('model_id')}\n" + f"Backend: {results[0].get('backend')}\n" + ) + display_md_str = get_markdown_table(display_results, metadata=metadata) print(display_md_str) disp_md_path = Path(output_dir) / f"benchmark_display_{timestamp_str}.md" save_markdown_table(display_md_str, disp_md_path) From 89958d96e972327607b8af6c39e9097601637efe Mon Sep 17 00:00:00 2001 From: Tom Stesco Date: Wed, 18 Dec 2024 21:32:51 +0000 Subject: [PATCH 61/76] add ignore_eos=True to locust requests to use min/max tokens, increase locust default test length to 10 minutes --- locust/locust_config.conf | 4 ++-- locust/locustfile.py | 1 + 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/locust/locust_config.conf b/locust/locust_config.conf index cdd4157b..2431cf0e 100644 --- a/locust/locust_config.conf +++ b/locust/locust_config.conf @@ -2,5 +2,5 @@ locustfile = locustfile.py headless = true host = http://localhost:7000 users = 32 -spawn-rate = 1 -run-time = 3m +spawn-rate = 6 +run-time = 10m diff --git a/locust/locustfile.py b/locust/locustfile.py index 19dd59ee..94db1a59 100644 --- a/locust/locustfile.py +++ b/locust/locustfile.py @@ -18,6 +18,7 @@ "temperature": 1.0, "top_k": 10, "top_p": 0.9, + "ignore_eos": True, } # Global variable to store data iterator From 126c5886ac6d7fd022b1ac78c40bae1ea5a47595 Mon Sep 17 00:00:00 2001 From: Tom Stesco Date: Thu, 19 Dec 2024 06:00:34 +0000 Subject: [PATCH 62/76] update for llama 3.1 70B v0 testing --- .../prompt_client_online_benchmark.py | 19 +++++++++---------- vllm-tt-metal-llama3-70b/README.md | 2 +- vllm-tt-metal-llama3-70b/docs/development.md | 8 ++++---- .../src/run_vllm_api_server.py | 4 ++-- .../vllm.llama3.src.Dockerfile | 1 + 5 files changed, 17 insertions(+), 17 deletions(-) diff --git a/benchmarking/prompt_client_online_benchmark.py b/benchmarking/prompt_client_online_benchmark.py index 5b1203e3..79a847d1 100644 --- a/benchmarking/prompt_client_online_benchmark.py +++ b/benchmarking/prompt_client_online_benchmark.py @@ -151,20 +151,19 @@ def run_sequence_length_test( # sweeps for batch-1 {"input_len": 128, "output_len": 10, "batch_size": 1, "num_prompts": 
64}, {"input_len": 128, "output_len": 128, "batch_size": 1, "num_prompts": 64}, - {"input_len": 128, "output_len": 1024, "batch_size": 1, "num_prompts": 32}, - {"input_len": 128, "output_len": 2048, "batch_size": 1, "num_prompts": 32}, - {"input_len": 128, "output_len": 4096, "batch_size": 1, "num_prompts": 16}, + {"input_len": 128, "output_len": 1024, "batch_size": 1, "num_prompts": 16}, + {"input_len": 128, "output_len": 2048, "batch_size": 1, "num_prompts": 8}, + {"input_len": 128, "output_len": 4096, "batch_size": 1, "num_prompts": 8}, {"input_len": 2048, "output_len": 128, "batch_size": 1, "num_prompts": 32}, - {"input_len": 2048, "output_len": 2048, "batch_size": 1, "num_prompts": 16}, + {"input_len": 2048, "output_len": 2048, "batch_size": 1, "num_prompts": 8}, # sweeps for batch-32 - {"input_len": 128, "output_len": 128, "batch_size": 32, "num_prompts": 32 * 8}, - {"input_len": 128, "output_len": 128, "batch_size": 32, "num_prompts": 32 * 8}, - {"input_len": 128, "output_len": 128, "batch_size": 32, "num_prompts": 32 * 8}, - {"input_len": 128, "output_len": 1024, "batch_size": 32, "num_prompts": 32 * 4}, + {"input_len": 128, "output_len": 10, "batch_size": 32, "num_prompts": 32 * 16}, + {"input_len": 128, "output_len": 128, "batch_size": 32, "num_prompts": 32 * 16}, + {"input_len": 128, "output_len": 1024, "batch_size": 32, "num_prompts": 32 * 8}, {"input_len": 128, "output_len": 2048, "batch_size": 32, "num_prompts": 32 * 4}, - {"input_len": 128, "output_len": 4096, "batch_size": 32, "num_prompts": 32 * 2}, + {"input_len": 128, "output_len": 4096, "batch_size": 32, "num_prompts": 32 * 4}, {"input_len": 2048, "output_len": 128, "batch_size": 32, "num_prompts": 32 * 8}, - {"input_len": 2048, "output_len": 2048, "batch_size": 32, "num_prompts": 32 * 2}, + {"input_len": 2048, "output_len": 2048, "batch_size": 32, "num_prompts": 32 * 4}, ] # fmt: on diff --git a/vllm-tt-metal-llama3-70b/README.md b/vllm-tt-metal-llama3-70b/README.md index 3fa10b39..31a4232b 100644 --- a/vllm-tt-metal-llama3-70b/README.md +++ b/vllm-tt-metal-llama3-70b/README.md @@ -36,7 +36,7 @@ docker run \ --volume ${PERSISTENT_VOLUME?ERROR env var PERSISTENT_VOLUME must be set}:/home/user/cache_root:rw \ --shm-size 32G \ --publish 7000:7000 \ - ghcr.io/tenstorrent/tt-inference-server/tt-metal-llama3-70b-src-base-vllm:v0.0.2-tt-metal-385904186f81-384f1790c3be + ghcr.io/tenstorrent/tt-inference-server/tt-metal-llama3-70b-src-base-vllm:v0.0.1-tt-metal-3ef683762eaa-953161188c50 ``` By default the Docker container will start running the entrypoint command wrapped in `src/run_vllm_api_server.py`. 
diff --git a/vllm-tt-metal-llama3-70b/docs/development.md b/vllm-tt-metal-llama3-70b/docs/development.md index 55d8b1d3..232fd9aa 100644 --- a/vllm-tt-metal-llama3-70b/docs/development.md +++ b/vllm-tt-metal-llama3-70b/docs/development.md @@ -13,12 +13,12 @@ When building, update the commit SHA and get correct SHA from model developers o # set build context to repo root cd tt-inference-server # build image -export TT_METAL_DOCKERFILE_VERSION=v0.53.0-rc34 -export TT_METAL_COMMIT_SHA_OR_TAG=385904186f81fed15d5c87c162221d4f34387164 +export TT_METAL_DOCKERFILE_VERSION=v0.53.0 +export TT_METAL_COMMIT_SHA_OR_TAG=3ef683762eaa4bd602ec6f3f33aec875775265c5 export TT_METAL_COMMIT_DOCKER_TAG=${TT_METAL_COMMIT_SHA_OR_TAG:0:12} -export TT_VLLM_COMMIT_SHA_OR_TAG=384f1790c3be16e1d1b10de07252be2e66d00935 +export TT_VLLM_COMMIT_SHA_OR_TAG=953161188c50f10da95a88ab305e23977ebd3750 export TT_VLLM_COMMIT_DOCKER_TAG=${TT_VLLM_COMMIT_SHA_OR_TAG:0:12} -export IMAGE_VERSION=v0.0.3 +export IMAGE_VERSION=v0.0.1 docker build \ -t ghcr.io/tenstorrent/tt-inference-server/tt-metal-llama3-70b-src-base-vllm:${IMAGE_VERSION}-tt-metal-${TT_METAL_COMMIT_DOCKER_TAG}-${TT_VLLM_COMMIT_DOCKER_TAG} \ --build-arg TT_METAL_DOCKERFILE_VERSION=${TT_METAL_DOCKERFILE_VERSION} \ diff --git a/vllm-tt-metal-llama3-70b/src/run_vllm_api_server.py b/vllm-tt-metal-llama3-70b/src/run_vllm_api_server.py index 992874b1..595f8444 100644 --- a/vllm-tt-metal-llama3-70b/src/run_vllm_api_server.py +++ b/vllm-tt-metal-llama3-70b/src/run_vllm_api_server.py @@ -13,10 +13,10 @@ from utils.logging_utils import set_vllm_logging_config # importing from tt-metal install path -from models.demos.t3000.llama2_70b.tt.llama_generation import TtLlamaModelForGeneration +from models.demos.t3000.llama2_70b.tt.generator_vllm import TtLlamaForCausalLM # register the model -ModelRegistry.register_model("TTLlamaForCausalLM", TtLlamaModelForGeneration) +ModelRegistry.register_model("TTLlamaForCausalLM", TtLlamaForCausalLM) def get_encoded_api_key(jwt_secret): diff --git a/vllm-tt-metal-llama3-70b/vllm.llama3.src.Dockerfile b/vllm-tt-metal-llama3-70b/vllm.llama3.src.Dockerfile index 2184d356..c57fa85d 100644 --- a/vllm-tt-metal-llama3-70b/vllm.llama3.src.Dockerfile +++ b/vllm-tt-metal-llama3-70b/vllm.llama3.src.Dockerfile @@ -102,6 +102,7 @@ COPY --chown=user:user "utils" "${APP_DIR}/utils" COPY --chown=user:user "benchmarking" "${APP_DIR}/benchmarking" COPY --chown=user:user "evals" "${APP_DIR}/evals" COPY --chown=user:user "tests" "${APP_DIR}/tests" +COPY --chown=user:user "locust" "${APP_DIR}/locust" RUN /bin/bash -c "source ${PYTHON_ENV_DIR}/bin/activate \ && pip install --default-timeout=240 --no-cache-dir -r requirements.txt" From aef6a94d56f70b372ba9f93392b890245cdf3bcd Mon Sep 17 00:00:00 2001 From: Tom Stesco Date: Thu, 19 Dec 2024 17:56:45 +0000 Subject: [PATCH 63/76] adding evals changes from tstesco/llama-evals --- evals/README.md | 138 ++++++--------------------------------------- evals/run_evals.sh | 32 ++++++++++- 2 files changed, 47 insertions(+), 123 deletions(-) diff --git a/evals/README.md b/evals/README.md index 7795d48a..ca3add07 100644 --- a/evals/README.md +++ b/evals/README.md @@ -21,156 +21,52 @@ export PERSISTENT_VOLUME=$PWD/persistent_volume/volume_id_tt-metal-llama-3.1-70b docker run \ --rm \ -it \ - --env-file tt-metal-llama3-70b/.env \ + --env-file vllm-tt-metal-llama3-70b/.env \ --cap-add ALL \ --device /dev/tenstorrent:/dev/tenstorrent \ --volume /dev/hugepages-1G:/dev/hugepages-1G:rw \ --volume ${PERSISTENT_VOLUME?ERROR env var 
PERSISTENT_VOLUME must be set}:/home/user/cache_root:rw \ --shm-size 32G \ - ghcr.io/tenstorrent/tt-inference-server/tt-metal-llama3-70b-src-base-vllm:v0.0.1-tt-metal-${TT_METAL_COMMIT_DOCKER_TAG}-${TT_VLLM_COMMIT_DOCKER_TAG} bash + ghcr.io/tenstorrent/tt-inference-server/tt-metal-llama3-70b-src-base-vllm:${IMAGE_VERSION}-tt-metal-${TT_METAL_COMMIT_DOCKER_TAG}-${TT_VLLM_COMMIT_DOCKER_TAG} ``` -## Step 3: Inside container setup and run vLLM - -#### Install vLLM - Option 1: use default installation in docker image - -already built into Docker image - -#### Install vLLM - option 2: install vLLM from github - -```bash -# option 2: install from github -cd /home/user/vllm -git fetch -git checkout -git pull -pip install -e . -echo "done vllm install." -``` -#### Install vLLM - option 3: install edittable (for development) from mounted volume - -```bash -# option 3: install edittable (for development) - mount from outside container -cd /home/user/vllm -pip install -e . -echo "done vllm install." -``` - -#### Run vllm serving openai compatible API server - -```bash -# run vllm serving -python run_vllm_api_server.py -``` - -## Step 4: Inside container setup LM evalulation harness - -Enter new bash shell in running container (this does so with newest running container): -```bash -docker exec -it $(docker ps -q | head -n1) bash -``` - -Now inside container: -```bash -# option 1: install from github: https://github.com/tstescoTT/lm-evaluation-harness -pip install git+https://github.com/tstescoTT/lm-evaluation-harness.git#egg=lm-eval[ifeval] -# option 2: install edittable (for development) - mounted to container -cd ~/lm-evaluation-harness -pip install -e .[ifeval] -``` - -## Step 5: Inside container set up llama-recipes LM evalulation harness templates +The default Docker image command will start the vLLM server. +## Step 3: Inside container set up llama-recipes LM evalulation harness templates Using Meta’s LM eval reproduce documentation: https://github.com/meta-llama/llama-recipes/tree/main/tools/benchmarks/llm_eval_harness/meta_eval To access Meta Llama 3.1 evals, you must: -1. Log in to the Hugging Face website (https://huggingface.co/collections/meta-llama/llama-31-evals-66a2c5a14c2093e58298ac7f ) and click the 3.1 evals dataset pages and agree to the terms. +1. Log in to the Hugging Face website (https://huggingface.co/collections/meta-llama/llama-31-evals-66a2c5a14c2093e58298ac7f) and click the 3.1 evals dataset pages and agree to the terms. 2. Follow the [Hugging Face authentication instructions](https://huggingface.co/docs/huggingface_hub/en/quick-start#authentication) to gain read access for your machine. 
#### Hugging Face authentication - option 1: HF_TOKEN (if not already passed into Docker container) ```bash -# set up HF Token, needed for IFEval dataset -# echo "hf_" > ${HF_HOME}/token -export PYTHONPATH=${PYTHONPATH}:$PWD +# set up HF Token if not already set up in .env, needed for datasets +echo "HF_TOKEN=hf_" >> vllm-tt-metal-llama3-70b/.env ``` #### Hugging Face authentication - option 2: huggingface_hub login +Note: do this inside the container shell: ```python from huggingface_hub import login login() ``` -Finally, build llama-recipe lm-evaluation-harness templates: -```bash -git clone https://github.com/tstescoTT/llama-recipes.git -cd llama-recipes/tools/benchmarks/llm_eval_harness/meta_eval -python prepare_meta_eval.py --config_path ./eval_config.yaml -mkdir -p ~/lm-evaluation-harness -cp -rf work_dir/ ~/lm-evaluation-harness/ -``` - -## Step 6: Inside container run LM evals - -`run_evals.sh` can be run from where lm_eval CLI is available: -```bash -cd ~/lm-evaluation-harness -export OPENAI_API_KEY=$(python -c 'import os; import json; import jwt; json_payload = json.loads("{\"team_id\": \"tenstorrent\", \"token_id\": \"debug-test\"}"); encoded_jwt = jwt.encode(json_payload, os.environ["JWT_SECRET"], algorithm="HS256"); print(encoded_jwt)') -run_evals.sh -``` - -For example, running GPQA manually: +## Step 4: Inside container setup and run vLLM via script -The model args (`Meta-Llama-3.1-70B` below) need only correspond to the model defined by running the server, not the actual weights. +Enter new bash shell in running container, oneliner below enters newest running container: ```bash -lm_eval \ ---model local-completions \ ---model_args model=meta-llama/Llama-3.1-70B-Instruct,base_url=http://127.0.0.1:7000/v1/completions,num_concurrent=32,max_retries=4,tokenized_requests=False,add_bos_token=True \ ---gen_kwargs model=meta-llama/Llama-3.1-70B-Instruct,stop="<|eot_id|>",stream=False \ ---tasks meta_ifeval \ ---batch_size auto \ ---output_path /home/user/cache_root/eval_output \ ---include_path ./work_dir \ ---seed 42 \ ---log_samples +docker exec -it $(docker ps -q | head -n1) bash ``` -## Notes: - -### Chat templating - -As mentioned in: https://github.com/meta-llama/llama-recipes/tree/main/tools/benchmarks/llm_eval_harness/meta_eval#run-eval-tasks +Running the `run_evals.sh` script will: +1. set up lm_eval and evals datasets +2. pre-capture the tt-metal execution traces so that evals do not trigger 1st run trace capture unexpectedly +3. run evals via lm_eval as configured -“As for add_bos_token=True, since our prompts in the evals dataset has already included all the special tokens required by instruct model, such as <|start_header_id|>user<|end_header_id|>, we will not use --apply_chat_template argument for instruct models anymore. However, we need to use add_bos_token=True flag to add the BOS_token back during VLLM inference, as the BOS_token is removed by default in this PR.” - -Though it is recommended to use the pre-templated prompts following the build instructions for llama-recipes, the chat template can be manually added via the `lm_eval` runtime argument: ```bash ---apply_chat_template utils/prompt_templates/llama_instruct_example.jinja +cd ~/app/evals +. 
run_evals.sh ``` - -llama_instruct_example.jinja: text file jinja template for llama 3.1 instruct: -``` -{{- bos_token }} - -{#- System message #} -{{- "<|start_header_id|>system<|end_header_id|>\n\n" }} -{{- "Cutting Knowledge Date: December 2023\n" }} -{{- "Today Date: " + date_string + "\n\n" }} -{{- system_message }} -{{- "<|eot_id|>" }} - -{#- Messages #} -{%- for message in messages %} - {%- if message.role in ['user', 'assistant'] %} - {{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' }} - {%- endif %} -{%- endfor %} - -{%- if add_generation_prompt %} - {{- '<|start_header_id|>assistant<|end_header_id|>\n\n' }} -{%- endif %} -``` - -The instruct chat template could also be applied on the vLLM server side, but this implementation gives more flexibility to the caller of vLLM. - diff --git a/evals/run_evals.sh b/evals/run_evals.sh index 12308b47..2db83369 100644 --- a/evals/run_evals.sh +++ b/evals/run_evals.sh @@ -3,10 +3,38 @@ # # SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC +# set up lm_eval and evals datasets +cd $HOME +if python -c "import lm_eval" 2>/dev/null; then + echo "lm_eval is installed." +else + echo "Installing lm_eval ..." + pip install git+https://github.com/tstescoTT/lm-evaluation-harness.git#egg=lm-eval[ifeval] +fi + +if [ -d "$HOME/llama-recipes" ]; then + echo "The directory $HOME/llama-recipes exists." +else + echo "The directory ~/llama-recipes does not exist." + git clone https://github.com/tstescoTT/llama-recipes.git $HOME/llama-recipes + cd $HOME/llama-recipes/tools/benchmarks/llm_eval_harness/meta_eval + python prepare_meta_eval.py --config_path ./eval_config.yaml + mkdir -p $HOME/lm-evaluation-harness + cp -rf work_dir/ $HOME/lm-evaluation-harness/ +fi + +# trace capture so that evals do not trigger 1st run trace capture unexpectedly +cd $HOME/app +python utils/capture_traces.py + +# run evals +export OPENAI_API_KEY=$(python -c 'import os; import json; import jwt; json_payload = json.loads("{\"team_id\": \"tenstorrent\", \"token_id\": \"debug-test\"}"); encoded_jwt = jwt.encode(json_payload, os.environ["JWT_SECRET"], algorithm="HS256"); print(encoded_jwt)') +cd $HOME/lm-evaluation-harness/ + # GPQA lm_eval \ --model local-completions \ ---model_args model=meta-llama/Llama-3.1-70B-Instruct,base_url=http://127.0.0.1:7000/v1/completions,num_concurrent=1,max_retries=4,tokenized_requests=False,add_bos_token=True \ +--model_args model=meta-llama/Llama-3.1-70B-Instruct,base_url=http://127.0.0.1:7000/v1/completions,num_concurrent=32,max_retries=4,tokenized_requests=False,add_bos_token=True,timeout=None \ --gen_kwargs model=meta-llama/Llama-3.1-70B-Instruct,stop="<|eot_id|>",stream=False \ --tasks meta_gpqa \ --batch_size auto \ @@ -18,7 +46,7 @@ lm_eval \ # IFEval lm_eval \ --model local-completions \ ---model_args model=meta-llama/Llama-3.1-70B-Instruct,base_url=http://127.0.0.1:7000/v1/completions,num_concurrent=32,max_retries=4,tokenized_requests=False,add_bos_token=True \ +--model_args model=meta-llama/Llama-3.1-70B-Instruct,base_url=http://127.0.0.1:7000/v1/completions,num_concurrent=32,max_retries=4,tokenized_requests=False,add_bos_token=True,timeout=None \ --gen_kwargs model=meta-llama/Llama-3.1-70B-Instruct,stop="<|eot_id|>",stream=False \ --tasks meta_ifeval \ --batch_size auto \ From 5ab18166deb23c7028e950a503588e45cda5a1f6 Mon Sep 17 00:00:00 2001 From: Tom Stesco Date: Thu, 19 Dec 2024 23:19:45 +0000 Subject: [PATCH 64/76] adding TT_METAL_COMMIT_SHA_OR_TAG=v0.54.0-rc2 --- 
vllm-tt-metal-llama3-70b/docs/development.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm-tt-metal-llama3-70b/docs/development.md b/vllm-tt-metal-llama3-70b/docs/development.md index 232fd9aa..efa3c075 100644 --- a/vllm-tt-metal-llama3-70b/docs/development.md +++ b/vllm-tt-metal-llama3-70b/docs/development.md @@ -14,7 +14,7 @@ When building, update the commit SHA and get correct SHA from model developers o cd tt-inference-server # build image export TT_METAL_DOCKERFILE_VERSION=v0.53.0 -export TT_METAL_COMMIT_SHA_OR_TAG=3ef683762eaa4bd602ec6f3f33aec875775265c5 +export TT_METAL_COMMIT_SHA_OR_TAG=v0.54.0-rc2 export TT_METAL_COMMIT_DOCKER_TAG=${TT_METAL_COMMIT_SHA_OR_TAG:0:12} export TT_VLLM_COMMIT_SHA_OR_TAG=953161188c50f10da95a88ab305e23977ebd3750 export TT_VLLM_COMMIT_DOCKER_TAG=${TT_VLLM_COMMIT_SHA_OR_TAG:0:12} From 0e5b67a122ae4f3268876ff4ad3ea110bab8495f Mon Sep 17 00:00:00 2001 From: Tom Stesco Date: Thu, 19 Dec 2024 23:55:41 +0000 Subject: [PATCH 65/76] update README commit tags --- vllm-tt-metal-llama3-70b/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm-tt-metal-llama3-70b/README.md b/vllm-tt-metal-llama3-70b/README.md index 31a4232b..38ef1a9a 100644 --- a/vllm-tt-metal-llama3-70b/README.md +++ b/vllm-tt-metal-llama3-70b/README.md @@ -36,7 +36,7 @@ docker run \ --volume ${PERSISTENT_VOLUME?ERROR env var PERSISTENT_VOLUME must be set}:/home/user/cache_root:rw \ --shm-size 32G \ --publish 7000:7000 \ - ghcr.io/tenstorrent/tt-inference-server/tt-metal-llama3-70b-src-base-vllm:v0.0.1-tt-metal-3ef683762eaa-953161188c50 + ghcr.io/tenstorrent/tt-inference-server/tt-metal-llama3-70b-src-base-vllm:v0.0.1-tt-metal-v0.54.0-rc2-953161188c50 ``` By default the Docker container will start running the entrypoint command wrapped in `src/run_vllm_api_server.py`. 
From 0c48a9f61798b8481d92363a48e219c322c73367 Mon Sep 17 00:00:00 2001 From: Tom Stesco Date: Fri, 20 Dec 2024 02:56:27 +0000 Subject: [PATCH 66/76] adding vllm benchmarking patch to stop sending unsupported params best_of logprobs --- benchmarking/benchmark_serving.patch | 43 ++++++++++------------------ 1 file changed, 15 insertions(+), 28 deletions(-) diff --git a/benchmarking/benchmark_serving.patch b/benchmarking/benchmark_serving.patch index f393b6bc..818d92f7 100644 --- a/benchmarking/benchmark_serving.patch +++ b/benchmarking/benchmark_serving.patch @@ -1,5 +1,19 @@ +diff --git a/benchmarks/backend_request_func.py b/benchmarks/backend_request_func.py +index 4813fde2..0cb3e72e 100644 +--- a/benchmarks/backend_request_func.py ++++ b/benchmarks/backend_request_func.py +@@ -235,9 +235,7 @@ async def async_request_openai_completions( + "model": request_func_input.model, + "prompt": request_func_input.prompt, + "temperature": 0.0, +- "best_of": request_func_input.best_of, + "max_tokens": request_func_input.output_len, +- "logprobs": request_func_input.logprobs, + "stream": True, + "ignore_eos": request_func_input.ignore_eos, + } diff --git a/benchmarks/benchmark_serving.py b/benchmarks/benchmark_serving.py -index c1a396c8..463e0e93 100644 +index c1a396c8..74f75a15 100644 --- a/benchmarks/benchmark_serving.py +++ b/benchmarks/benchmark_serving.py @@ -22,6 +22,12 @@ On the client side, run: @@ -24,30 +38,3 @@ index c1a396c8..463e0e93 100644 multi_modal_content=test_mm_content, ignore_eos=ignore_eos, ) -@@ -458,7 +464,7 @@ async def benchmark( - prompt_len=prompt_len, - output_len=output_len, - logprobs=logprobs, -- best_of=best_of, -+ best_of=None, - multi_modal_content=mm_content, - ignore_eos=ignore_eos) - tasks.append( -diff --git a/vllm/worker/tt_model_runner.py b/vllm/worker/tt_model_runner.py -index 1c586dd3..2e77bf72 100644 ---- a/vllm/worker/tt_model_runner.py -+++ b/vllm/worker/tt_model_runner.py -@@ -425,12 +425,7 @@ class TTModelRunner(ModelRunnerBase[TTModelInput]): - ) - - def _validate_sampling_params(self, sampling_params): -- assert sampling_params.n == 1, "Currently only supporting n=1" -- assert sampling_params.best_of is None, "Currently not supporting best_of" -- assert sampling_params.logprobs is None, "Currently not supporting logprobs" -- assert sampling_params.prompt_logprobs is None, "Currently not supporting prompt_logprobs" -- -- ## Destructor (used to delete ttnn trace if using trace mode) -+ return - - def __del__(self): - if self.trace_mode and self.execute_trace_kwargs is not None: From 471c90b9f3ddb5bcdcefd74912366244062714b3 Mon Sep 17 00:00:00 2001 From: Tom Stesco Date: Wed, 18 Dec 2024 22:02:57 +0000 Subject: [PATCH 67/76] move vllm-tt-metal-llama3-70b/setup.sh -> setup.sh, add support for Hugging Face authorization and model download, add llama 3.3 70B instruct --- vllm-tt-metal-llama3-70b/setup.sh => setup.sh | 372 ++++++++++++------ 1 file changed, 255 insertions(+), 117 deletions(-) rename vllm-tt-metal-llama3-70b/setup.sh => setup.sh (51%) diff --git a/vllm-tt-metal-llama3-70b/setup.sh b/setup.sh similarity index 51% rename from vllm-tt-metal-llama3-70b/setup.sh rename to setup.sh index ee102dff..f1115e8c 100755 --- a/vllm-tt-metal-llama3-70b/setup.sh +++ b/setup.sh @@ -9,6 +9,7 @@ set -euo pipefail # Exit on error, print commands, unset variables treated as e usage() { echo "Usage: $0 " echo "Available model types:" + echo " llama-3.3-70b-instruct" echo " llama-3.1-70b-instruct" echo " llama-3.1-70b" echo " llama-3.1-8b-instruct" @@ -73,72 +74,116 
@@ check_and_prompt_env_file() { fi } +get_hf_env_vars() { + # get HF_TOKEN + if [ -z "${HF_TOKEN:-}" ]; then + echo "HF_TOKEN environment variable is not set. Please set it before running the script." + read -r -s -p "Enter your HF_TOKEN: " input_hf_token + echo + if [ -z "${input_hf_token:-}" ]; then + echo "⛔ HF_TOKEN cannot be empty. Please try again." + exit 1 + elif [[ ! "$input_hf_token" == hf_* ]]; then + echo "⛔ HF_TOKEN must start with 'hf_'. Please try again." + exit 1 + fi + HF_TOKEN=${input_hf_token} + echo "✅ HF_TOKEN set." + fi + # get HF_HOME + if [ -z "${HF_HOME:-}" ]; then + echo "HF_HOME environment variable is not set. Please set it before running the script." + read -r -p "Enter your HF_HOME [default: $HOME/.cache/huggingface]:" input_hf_home + echo + input_hf_home=${input_hf_home:-"$HOME/.cache/huggingface"} + if [ ! -d "$input_hf_home" ] || [ ! -w "$input_hf_home" ]; then + echo "⛔ HF_HOME must be a valid directory and writable by the user. Please try again." + exit 1 + fi + HF_HOME=${input_hf_home} + echo "✅ HF_HOME set." + fi +} # Function to set environment variables based on the model selection and write them to .env setup_model_environment() { - # Set default values for environment variables - DEFAULT_PERSISTENT_VOLUME_ROOT=${REPO_ROOT}/persistent_volume - DEFAULT_LLAMA_REPO=~/llama-models # Set environment variables based on the model selection case "$1" in - "llama-3.1-70b-instruct") - MODEL_NAME="llama-3.1-70b-instruct" - META_MODEL_NAME="Meta-Llama-3.1-70B-Instruct" - META_DIR_FILTER="llama3_1" - REPACKED=1 - ;; - "llama-3.1-70b") - MODEL_NAME="llama-3.1-70b" - META_MODEL_NAME="Meta-Llama-3.1-70B" - META_DIR_FILTER="llama3_1" - REPACKED=1 - ;; - "llama-3.1-8b-instruct") - MODEL_NAME="llama-3.1-8b-instruct" - META_MODEL_NAME="Meta-Llama-3.1-8B-Instruct" - META_DIR_FILTER="llama3_1" - REPACKED=0 - ;; - "llama-3.1-8b") - MODEL_NAME="llama-3.1-8b" - META_MODEL_NAME="Meta-Llama-3.1-8B" - META_DIR_FILTER="llama3_1" - REPACKED=0 - ;; - "llama-3-70b-instruct") - MODEL_NAME="llama-3-70b-instruct" - META_MODEL_NAME="Meta-Llama-3-70B-Instruct" - META_DIR_FILTER="llama3" - REPACKED=1 - ;; - "llama-3-70b") - MODEL_NAME="llama-3-70b" - META_MODEL_NAME="Meta-Llama-3-70B" - META_DIR_FILTER="llama3" - REPACKED=1 - ;; - "llama-3-8b-instruct") - MODEL_NAME="llama-3-8b-instruct" - META_MODEL_NAME="Meta-Llama-3-8B-Instruct" - META_DIR_FILTER="llama3" - REPACKED=0 - ;; - "llama-3-8b") - MODEL_NAME="llama-3-8b" - META_MODEL_NAME="Meta-Llama-3-8B" - META_DIR_FILTER="llama3" - REPACKED=0 - ;; - *) - echo "⛔ Invalid model choice." 
- usage - exit 1 - ;; + "llama-3.3-70b-instruct") + MODEL_NAME="llama-3.3-70b-instruct" + HF_MODEL_REPO_ID="meta-llama/Llama-3.3-70B-Instruct" + META_MODEL_NAME="Meta-Llama-3.3-70B-Instruct" + META_DIR_FILTER="llama3_1" + REPACKED=1 + ;; + "llama-3.1-70b-instruct") + MODEL_NAME="llama-3.1-70b-instruct" + HF_MODEL_REPO_ID="meta-llama/Llama-3.1-70B-Instruct" + META_MODEL_NAME="Meta-Llama-3.1-70B-Instruct" + META_DIR_FILTER="llama3_1" + REPACKED=1 + ;; + "llama-3.1-70b") + MODEL_NAME="llama-3.1-70b" + HF_MODEL_REPO_ID="meta-llama/Llama-3.1-70B" + META_MODEL_NAME="Meta-Llama-3.1-70B" + META_DIR_FILTER="llama3_1" + REPACKED=1 + ;; + "llama-3.1-8b-instruct") + MODEL_NAME="llama-3.1-8b-instruct" + HF_MODEL_REPO_ID="meta-llama/Llama-3.1-8B-Instruct" + META_MODEL_NAME="Meta-Llama-3.1-8B-Instruct" + META_DIR_FILTER="llama3_1" + REPACKED=0 + ;; + "llama-3.1-8b") + MODEL_NAME="llama-3.1-8b" + HF_MODEL_REPO_ID="meta-llama/Llama-3.1-8B" + META_MODEL_NAME="Meta-Llama-3.1-8B" + META_DIR_FILTER="llama3_1" + REPACKED=0 + ;; + "llama-3-70b-instruct") + MODEL_NAME="llama-3-70b-instruct" + HF_MODEL_REPO_ID="meta-llama/Llama-3-70B-Instruct" + META_MODEL_NAME="Meta-Llama-3-70B-Instruct" + META_DIR_FILTER="llama3" + REPACKED=1 + ;; + "llama-3-70b") + MODEL_NAME="llama-3-70b" + HF_MODEL_REPO_ID="meta-llama/Llama-3-70B" + META_MODEL_NAME="Meta-Llama-3-70B" + META_DIR_FILTER="llama3" + REPACKED=1 + ;; + "llama-3-8b-instruct") + MODEL_NAME="llama-3-8b-instruct" + HF_MODEL_REPO_ID="meta-llama/Llama-3-8B-Instruct" + META_MODEL_NAME="Meta-Llama-3-8B-Instruct" + META_DIR_FILTER="llama3" + REPACKED=0 + ;; + "llama-3-8b") + MODEL_NAME="llama-3-8b" + HF_MODEL_REPO_ID="meta-llama/Llama-3-8B" + META_MODEL_NAME="Meta-Llama-3-8B" + META_DIR_FILTER="llama3" + REPACKED=0 + ;; + *) + echo "⛔ Invalid model choice." + usage + exit 1 + ;; esac + # Set default values for environment variables + DEFAULT_PERSISTENT_VOLUME_ROOT=${REPO_ROOT}/persistent_volume + # Initialize OVERWRITE_ENV OVERWRITE_ENV=false - check_and_prompt_env_file if [ "$OVERWRITE_ENV" = false ]; then @@ -146,29 +191,47 @@ setup_model_environment() { return 0 fi + read -p "Use 🤗 Hugging Face authorization token for downloading models? Alternative is direct authorization from Meta. (y/n) [default: y]: " input_use_hf_token + choice_use_hf_token=${input_use_hf_token:-"y"} + echo # move to a new line after input + # Handle user's choice + case "$choice_use_hf_token" in + y|Y ) + echo "Using 🤗 Hugging Face Token." + get_hf_env_vars + # default location for HF e.g. ~/.cache/huggingface/models/meta-llama/Llama-3.3-70B-Instruct + LLAMA_WEIGHTS_DIR=${HF_HOME}/${HF_MODEL_REPO_ID}/original + ;; + n|N ) + echo "Using direct authorization from Meta. You will need their URL Authorization token, typically from their website or email." + # Prompt user for LLAMA_REPO if not already set or use default + read -r -p "Enter the path where you want to clone the Llama model repository [default: ${LLAMA_REPO}]: " INPUT_LLAMA_REPO + LLAMA_REPO=${INPUT_LLAMA_REPO:-$LLAMA_REPO} + LLAMA_DIR=${LLAMA_DIR:-${LLAMA_REPO}/models/${META_DIR_FILTER}} + LLAMA_WEIGHTS_DIR=${LLAMA_WEIGHTS_DIR:-${LLAMA_DIR}/${META_MODEL_NAME}} + echo # move to a new line after input + ;; + * ) + echo "⛔ Invalid option. Exiting." 
+ exit 1 + ;; + esac + # Safely handle potentially unset environment variables using default values PERSISTENT_VOLUME_ROOT=${PERSISTENT_VOLUME_ROOT:-$DEFAULT_PERSISTENT_VOLUME_ROOT} - LLAMA_REPO=${LLAMA_REPO:-$DEFAULT_LLAMA_REPO} # Prompt user for PERSISTENT_VOLUME_ROOT if not already set or use default - read -p "Enter your PERSISTENT_VOLUME_ROOT [default: ${PERSISTENT_VOLUME_ROOT}]: " INPUT_PERSISTENT_VOLUME_ROOT + read -r -p "Enter your PERSISTENT_VOLUME_ROOT [default: ${PERSISTENT_VOLUME_ROOT}]: " INPUT_PERSISTENT_VOLUME_ROOT PERSISTENT_VOLUME_ROOT=${INPUT_PERSISTENT_VOLUME_ROOT:-$PERSISTENT_VOLUME_ROOT} - echo - # Prompt user for LLAMA_REPO if not already set or use default - read -p "Enter the path where you want to clone the Llama model repository [default: ${LLAMA_REPO}]: " INPUT_LLAMA_REPO - LLAMA_REPO=${INPUT_LLAMA_REPO:-$LLAMA_REPO} - echo # move to a new line after input + echo # move to a new line after input # Set environment variables with defaults if not already set - LLAMA_DIR=${LLAMA_DIR:-${LLAMA_REPO}/models/${META_DIR_FILTER}} - LLAMA_WEIGHTS_DIR=${LLAMA_WEIGHTS_DIR:-${LLAMA_DIR}/${META_MODEL_NAME}} - PERSISTENT_VOLUME=${PERSISTENT_VOLUME:-${PERSISTENT_VOLUME_ROOT}/volume_id_tt-metal-${MODEL_NAME}v0.0.1} - + PERSISTENT_VOLUME=${PERSISTENT_VOLUME_ROOT}/volume_id_tt-metal-${MODEL_NAME}v0.0.1 # Prompt user for JWT_SECRET securely read -sp "Enter your JWT_SECRET: " JWT_SECRET echo # move to a new line after input # Verify the JWT_SECRET is not empty - if [ -z "$JWT_SECRET" ]; then + if [ -z "${JWT_SECRET:-}" ]; then echo "⛔ JWT_SECRET cannot be empty. Please try again." exit 1 fi @@ -184,11 +247,15 @@ setup_model_environment() { # Write environment variables to .env file echo "Writing environment variables to ${ENV_FILE} ..." cat > ${ENV_FILE} <=3.9' - pip install --upgrade setuptools wheel pip==21.2.4 tqdm - # repack script dependency - # pip does not support +cpu build variant qualifier, need to specify cpu index url - pip install --index-url https://download.pytorch.org/whl/cpu torch==2.2.1 - curl -O https://raw.githubusercontent.com/tenstorrent/tt-metal/refs/heads/main/models/demos/t3000/llama2_70b/scripts/repack_weights.py - echo "repacking weights..." - python repack_weights.py "${LLAMA_WEIGHTS_DIR}" "${WEIGHTS_DIR}" 5 - deactivate - rm -rf ${VENV_NAME} repack_weights.py + repack_weights "${LLAMA_WEIGHTS_DIR}" "${WEIGHTS_DIR}" else WEIGHTS_DIR="${PERSISTENT_VOLUME}/model_weights/${MODEL_NAME}" cp -rf "${LLAMA_WEIGHTS_DIR}" "${WEIGHTS_DIR}" - fi echo "using weights directory: ${PERSISTENT_VOLUME}/model_weights/${REPACKED_STR}${MODEL_NAME}" + echo "✅ setup_weights_meta completed!" +} + +setup_weights_huggingface() { + # Step 1: Verify HF_TOKEN and HF_HOME are set + if [ -z "${HF_TOKEN:-}" ] || [ -z "${HOST_HF_HOME:-}" ]; then + echo "⛔ HF_TOKEN or HF_HOME not set. Please ensure both environment variables are set." + exit 1 + fi - # create a tmp python venv with dependencies to run repack script - echo "✅ setup_weights completed!" 
+ # Step 2: Set up persistent volume root + echo "Setting up persistent volume root: ${PERSISTENT_VOLUME}" + mkdir -p "${PERSISTENT_VOLUME}/model_weights/" + + # Step 3: Create python virtual environment for huggingface downloads + VENV_NAME=".venv_hf_setup" + echo "Setting up python venv for Hugging Face downloads: ${VENV_NAME}" + python3 -m venv ${VENV_NAME} + source ${VENV_NAME}/bin/activate + + # Step 4: Install required packages + pip install --upgrade pip setuptools wheel + pip install "huggingface_hub[cli]" + + # Step 5: Download model using huggingface-cli + echo "Downloading model from Hugging Face Hub..." + # stop timeout issue: https://huggingface.co/docs/huggingface_hub/en/guides/cli#download-timeout + export HF_HUB_DOWNLOAD_TIMEOUT=60 + # using default HF naming convention for model weights + huggingface-cli download "${HF_MODEL_REPO_ID}" \ + original/params.json \ + original/tokenizer.model \ + original/consolidated.* \ + --cache-dir="${HOST_HF_HOME}" \ + --token="${HF_TOKEN}" + + # Step 6: Process and copy weights + if [ "${REPACKED}" -eq 1 ]; then + WEIGHTS_DIR="${PERSISTENT_VOLUME}/model_weights/${REPACKED_STR}${MODEL_NAME}" + repack_weights "${LLAMA_WEIGHTS_DIR}" "${WEIGHTS_DIR}" + else + WEIGHTS_DIR="${PERSISTENT_VOLUME}/model_weights/${MODEL_NAME}" + cp -rf "${LLAMA_WEIGHTS_DIR}" "${WEIGHTS_DIR}" + fi + + # Step 7: Cleanup + deactivate + rm -rf ${VENV_NAME} + + echo "using weights directory: ${PERSISTENT_VOLUME}/model_weights/${REPACKED_STR}${MODEL_NAME}" + echo "✅ setup_weights_huggingface completed!" } setup_tt_metal_cache() { @@ -360,6 +471,34 @@ setup_tt_metal_cache() { echo "✅ setup_tt_metal_cache completed!" } +setup_weights() { + # Step 1: Load environment variables from .env file + load_env + + # check if model weights already exist + if [ -d "${PERSISTENT_VOLUME}/model_weights/${REPACKED_STR}${MODEL_NAME}" ]; then + echo "Model weights already exist at: ${PERSISTENT_VOLUME}/model_weights/${REPACKED_STR}${MODEL_NAME}." + echo "contents:" + echo + echo "$(ls -lh ${PERSISTENT_VOLUME}/model_weights/${REPACKED_STR}${MODEL_NAME})" + echo + echo "If directory does not have correct weights, to re-download or copy the model weights delete the directory." + echo "🔔 check if directory contents are correct." + exit 1 + fi + + # Determine which setup method to use based on HF_TOKEN presence + if [ "${USE_HF_DOWNLOAD}" == "y" ]; then + setup_weights_huggingface + else + setup_weights_meta + fi +} + +# ============================================================================== +# Main script logic +# ============================================================================== + # Ensure script is being executed, not sourced if [[ "${BASH_SOURCE[0]}" != "${0}" ]]; then echo "⛔ Error: This script is being sourced. 
Please make execute it:" @@ -368,7 +507,6 @@ if [[ "${BASH_SOURCE[0]}" != "${0}" ]]; then return 1; # 'return' works when sourced; 'exit' would terminate the shell fi -# Main script logic if [ $# -lt 1 ]; then usage fi From ec43450577fac371d26a60b1badd8e6b90dbad9b Mon Sep 17 00:00:00 2001 From: Tom Stesco Date: Wed, 18 Dec 2024 22:11:21 +0000 Subject: [PATCH 68/76] add llama 3.2 refs --- setup.sh | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/setup.sh b/setup.sh index f1115e8c..a024244c 100755 --- a/setup.sh +++ b/setup.sh @@ -10,6 +10,7 @@ usage() { echo "Usage: $0 " echo "Available model types:" echo " llama-3.3-70b-instruct" + echo " llama-3.2-11b-vision-instruct" echo " llama-3.1-70b-instruct" echo " llama-3.1-70b" echo " llama-3.1-8b-instruct" @@ -113,9 +114,16 @@ setup_model_environment() { MODEL_NAME="llama-3.3-70b-instruct" HF_MODEL_REPO_ID="meta-llama/Llama-3.3-70B-Instruct" META_MODEL_NAME="Meta-Llama-3.3-70B-Instruct" - META_DIR_FILTER="llama3_1" + META_DIR_FILTER="llama3_3" REPACKED=1 ;; + "llama-3.2-11b-instruct") + MODEL_NAME="llama-3.2-11b-vision-instruct" + HF_MODEL_REPO_ID="meta-llama/Llama-3.2-11B-Vision-Instruct" + META_MODEL_NAME="Meta-Llama-3.2-11B-Vision-Instruct" + META_DIR_FILTER="llama3_2" + REPACKED=0 + ;; "llama-3.1-70b-instruct") MODEL_NAME="llama-3.1-70b-instruct" HF_MODEL_REPO_ID="meta-llama/Llama-3.1-70B-Instruct" @@ -493,6 +501,9 @@ setup_weights() { else setup_weights_meta fi + + echo "create tt-metal cache dir: ${LLAMA3_CACHE_PATH}" + mkdir -p ${LLAMA3_CACHE_PATH} } # ============================================================================== From d17e46e4595f5eb6c783d4eec1f82b061e5f959d Mon Sep 17 00:00:00 2001 From: Tom Stesco Date: Wed, 18 Dec 2024 22:42:45 +0000 Subject: [PATCH 69/76] WIP make setup.sh run from repo root, add fixed model impl dir, env file dir in persistent dir --- setup.sh | 20 +++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) diff --git a/setup.sh b/setup.sh index a024244c..f20dde77 100755 --- a/setup.sh +++ b/setup.sh @@ -26,11 +26,9 @@ usage() { } # globals -readonly MODEL_PATH=$(dirname "$(realpath "$0")") -readonly REPO_ROOT=$(dirname "${MODEL_PATH}") +readonly REPO_ROOT=$(dirname "$(realpath "$0")") readonly ENV_FILE="${MODEL_PATH}/.env" echo "REPO_ROOT: ${REPO_ROOT}" -echo "MODEL_PATH: ${MODEL_PATH}" echo "ENV_FILE: ${ENV_FILE}" check_and_prompt_env_file() { @@ -113,20 +111,23 @@ setup_model_environment() { "llama-3.3-70b-instruct") MODEL_NAME="llama-3.3-70b-instruct" HF_MODEL_REPO_ID="meta-llama/Llama-3.3-70B-Instruct" - META_MODEL_NAME="Meta-Llama-3.3-70B-Instruct" - META_DIR_FILTER="llama3_3" + MODEL_IMPL_ROOT_DIR="vllm-tt-metal-llama3-70b" + META_MODEL_NAME="" + META_DIR_FILTER="" REPACKED=1 ;; "llama-3.2-11b-instruct") MODEL_NAME="llama-3.2-11b-vision-instruct" HF_MODEL_REPO_ID="meta-llama/Llama-3.2-11B-Vision-Instruct" - META_MODEL_NAME="Meta-Llama-3.2-11B-Vision-Instruct" - META_DIR_FILTER="llama3_2" + MODEL_IMPL_ROOT_DIR="vllm-tt-metal-llama32-11b-vision" + META_MODEL_NAME="" + META_DIR_FILTER="" REPACKED=0 ;; "llama-3.1-70b-instruct") MODEL_NAME="llama-3.1-70b-instruct" HF_MODEL_REPO_ID="meta-llama/Llama-3.1-70B-Instruct" + MODEL_IMPL_ROOT_DIR="vllm-tt-metal-llama3-70b" META_MODEL_NAME="Meta-Llama-3.1-70B-Instruct" META_DIR_FILTER="llama3_1" REPACKED=1 @@ -189,6 +190,8 @@ setup_model_environment() { # Set default values for environment variables DEFAULT_PERSISTENT_VOLUME_ROOT=${REPO_ROOT}/persistent_volume + 
DEFAULT_ENV_DIR="${DEFAULT_PERSISTENT_VOLUME_ROOT}/model_envs" + mkdir -p ${DEFAULT_ENV_DIR} # Initialize OVERWRITE_ENV OVERWRITE_ENV=false @@ -211,6 +214,9 @@ setup_model_environment() { LLAMA_WEIGHTS_DIR=${HF_HOME}/${HF_MODEL_REPO_ID}/original ;; n|N ) + if [ -z "${META_DIR_FILTER:-}" ]; then + echo "⛔ MODEL_NAME=${MODEL_NAME} does not support using direct Meta authorization model download. Please use Hugging Face method." + fi echo "Using direct authorization from Meta. You will need their URL Authorization token, typically from their website or email." # Prompt user for LLAMA_REPO if not already set or use default read -r -p "Enter the path where you want to clone the Llama model repository [default: ${LLAMA_REPO}]: " INPUT_LLAMA_REPO From 895ff8d03d1a8d8616499840573bd6f1cf017871 Mon Sep 17 00:00:00 2001 From: Tom Stesco Date: Thu, 19 Dec 2024 21:16:14 +0000 Subject: [PATCH 70/76] adding setup.sh support for multiple models, adding support for llama 3.2 11b vision instruct, llama 3.3 70b instruct --- .gitignore | 2 +- setup.sh | 108 +++++++++++++++++++++++++++-------------------------- 2 files changed, 57 insertions(+), 53 deletions(-) diff --git a/.gitignore b/.gitignore index 24914ed3..f5ae8ff9 100644 --- a/.gitignore +++ b/.gitignore @@ -12,7 +12,7 @@ __pycache__ env .testvenv python_env -.venv +.venv* # persistent storage volume persistent_volume diff --git a/setup.sh b/setup.sh index f20dde77..04fe83d6 100755 --- a/setup.sh +++ b/setup.sh @@ -20,26 +20,20 @@ usage() { echo " llama-3-8b-instruct" echo " llama-3-8b" echo - echo "Options:" - echo " setup_permissions Run the script to set file permissions after first run setup (requires sudo)." exit 1 } # globals readonly REPO_ROOT=$(dirname "$(realpath "$0")") -readonly ENV_FILE="${MODEL_PATH}/.env" -echo "REPO_ROOT: ${REPO_ROOT}" -echo "ENV_FILE: ${ENV_FILE}" check_and_prompt_env_file() { local MODEL_NAME_KEY="MODEL_NAME" local MODEL_NAME="" - # Check if .env file exists - if [[ -f "$ENV_FILE" ]]; then + if [[ -f "${ENV_FILE}" ]]; then # Extract the MODEL_NAME value from .env - FOUND_MODEL_NAME=$(grep "^$MODEL_NAME_KEY=" "$ENV_FILE" | cut -d '=' -f2) - + echo "found ENV_FILE: ${ENV_FILE}" + FOUND_MODEL_NAME=$(grep "^$MODEL_NAME_KEY=" "$ENV_FILE" | cut -d '=' -f2) || FOUND_MODEL_NAME="" # If MODEL_NAME is found, display it if [[ -n "$FOUND_MODEL_NAME" ]]; then echo "The existing file ${ENV_FILE} contains MODEL_NAME: $FOUND_MODEL_NAME" @@ -66,7 +60,6 @@ check_and_prompt_env_file() { echo "MODEL_NAME not found in ${ENV_FILE}. Overwritting." OVERWRITE_ENV=true fi - else echo "${ENV_FILE} does not exist. Proceeding to create a new one." 
OVERWRITE_ENV=true @@ -116,7 +109,7 @@ setup_model_environment() { META_DIR_FILTER="" REPACKED=1 ;; - "llama-3.2-11b-instruct") + "llama-3.2-11b-vision-instruct") MODEL_NAME="llama-3.2-11b-vision-instruct" HF_MODEL_REPO_ID="meta-llama/Llama-3.2-11B-Vision-Instruct" MODEL_IMPL_ROOT_DIR="vllm-tt-metal-llama32-11b-vision" @@ -187,20 +180,32 @@ setup_model_environment() { exit 1 ;; esac + # Initialize OVERWRITE_ENV + OVERWRITE_ENV=false # Set default values for environment variables DEFAULT_PERSISTENT_VOLUME_ROOT=${REPO_ROOT}/persistent_volume - DEFAULT_ENV_DIR="${DEFAULT_PERSISTENT_VOLUME_ROOT}/model_envs" - mkdir -p ${DEFAULT_ENV_DIR} - - # Initialize OVERWRITE_ENV - OVERWRITE_ENV=false + MODEL_ENV_DIR="${DEFAULT_PERSISTENT_VOLUME_ROOT}/model_envs" + + mkdir -p ${MODEL_ENV_DIR} + ENV_FILE="${MODEL_ENV_DIR}/${MODEL_NAME}.env" + export ENV_FILE check_and_prompt_env_file + if [ "$OVERWRITE_ENV" = false ]; then echo "✅ using existing .env file: ${ENV_FILE}." return 0 fi + # Safely handle potentially unset environment variables using default values + PERSISTENT_VOLUME_ROOT=${PERSISTENT_VOLUME_ROOT:-$DEFAULT_PERSISTENT_VOLUME_ROOT} + # Prompt user for PERSISTENT_VOLUME_ROOT if not already set or use default + read -r -p "Enter your PERSISTENT_VOLUME_ROOT [default: ${DEFAULT_PERSISTENT_VOLUME_ROOT}]: " INPUT_PERSISTENT_VOLUME_ROOT + PERSISTENT_VOLUME_ROOT=${INPUT_PERSISTENT_VOLUME_ROOT:-$PERSISTENT_VOLUME_ROOT} + echo # move to a new line after input + # Set environment variables with defaults if not already set + PERSISTENT_VOLUME=${PERSISTENT_VOLUME_ROOT}/volume_id_tt-metal-${MODEL_NAME}v0.0.1 + read -p "Use 🤗 Hugging Face authorization token for downloading models? Alternative is direct authorization from Meta. (y/n) [default: y]: " input_use_hf_token choice_use_hf_token=${input_use_hf_token:-"y"} @@ -211,7 +216,8 @@ setup_model_environment() { echo "Using 🤗 Hugging Face Token." get_hf_env_vars # default location for HF e.g. 
~/.cache/huggingface/models/meta-llama/Llama-3.3-70B-Instruct - LLAMA_WEIGHTS_DIR=${HF_HOME}/${HF_MODEL_REPO_ID}/original + # LLAMA_WEIGHTS_DIR=${HF_HOME}/local_dir/${HF_MODEL_REPO_ID} + WEIGHTS_DIR=${PERSISTENT_VOLUME}/model_weights/${MODEL_NAME} ;; n|N ) if [ -z "${META_DIR_FILTER:-}" ]; then @@ -231,16 +237,6 @@ setup_model_environment() { ;; esac - # Safely handle potentially unset environment variables using default values - PERSISTENT_VOLUME_ROOT=${PERSISTENT_VOLUME_ROOT:-$DEFAULT_PERSISTENT_VOLUME_ROOT} - - # Prompt user for PERSISTENT_VOLUME_ROOT if not already set or use default - read -r -p "Enter your PERSISTENT_VOLUME_ROOT [default: ${PERSISTENT_VOLUME_ROOT}]: " INPUT_PERSISTENT_VOLUME_ROOT - PERSISTENT_VOLUME_ROOT=${INPUT_PERSISTENT_VOLUME_ROOT:-$PERSISTENT_VOLUME_ROOT} - echo # move to a new line after input - - # Set environment variables with defaults if not already set - PERSISTENT_VOLUME=${PERSISTENT_VOLUME_ROOT}/volume_id_tt-metal-${MODEL_NAME}v0.0.1 # Prompt user for JWT_SECRET securely read -sp "Enter your JWT_SECRET: " JWT_SECRET echo # move to a new line after input @@ -270,9 +266,10 @@ HOST_HF_HOME=${HF_HOME:-""} # host paths LLAMA_REPO=${LLAMA_REPO:-""} LLAMA_DIR=${LLAMA_DIR:-""} -LLAMA_WEIGHTS_DIR=$LLAMA_WEIGHTS_DIR +LLAMA_WEIGHTS_DIR=${LLAMA_WEIGHTS_DIR:-""} PERSISTENT_VOLUME_ROOT=$PERSISTENT_VOLUME_ROOT PERSISTENT_VOLUME=$PERSISTENT_VOLUME +WEIGHTS_DIR=${WEIGHTS_DIR:-""} # container paths REPACKED=${REPACKED} REPACKED_STR=${REPACKED_STR} @@ -326,12 +323,13 @@ setup_permissions() { sudo usermod -aG dockermount "$USER" # Get container user with UID 1000 and add to group - CONTAINER_USER=$(getent passwd 1000 | cut -d: -f1) + CONTAINER_UID=1000 + CONTAINER_USER=$(getent passwd ${CONTAINER_UID} | cut -d: -f1) if [ -n "$CONTAINER_USER" ]; then - echo "Adding container user: '$CONTAINER_USER' (UID 1000) to 'dockermount' group ..." + echo "Adding container user: '$CONTAINER_USER' (UID ${CONTAINER_UID}) to 'dockermount' group ..." sudo usermod -aG dockermount "$CONTAINER_USER" else - echo "No user found with UID 1000." + echo "No user found with UID ${CONTAINER_UID}." fi # Set file ownership and permissions @@ -340,7 +338,7 @@ setup_permissions() { # if the user point the PERSISTENT_VOLUME sudo mkdir -p "${PERSISTENT_VOLUME}" fi - sudo chown -R ${CONTAINER_USER}:dockermount "${PERSISTENT_VOLUME}" + sudo chown -R ${CONTAINER_UID}:dockermount "${PERSISTENT_VOLUME}" sudo chmod -R 775 "${PERSISTENT_VOLUME}" echo "✅ setup_permissions completed!" 
@@ -453,15 +451,27 @@ setup_weights_huggingface() { original/tokenizer.model \ original/consolidated.* \ --cache-dir="${HOST_HF_HOME}" \ - --token="${HF_TOKEN}" + --token="${HF_TOKEN}" + + # symlinks are broken for huggingface-cli download with --local-dir option + # see: https://github.com/huggingface/huggingface_hub/pull/2223 + # to use symlinks, find most recent snapshot and create symlink to that + mkdir -p "${WEIGHTS_DIR}" + LOCAL_REPO_NAME=$(echo "${HF_MODEL_REPO_ID}" | sed 's|/|--|g') + SNAPSHOT_DIR="${HOST_HF_HOME}/models--${LOCAL_REPO_NAME}/snapshots" + # note: ls -td will sort by modification date descending, potential edge case + # if desired snapshot is not most recent modified or ls sorts differently + MOST_RECENT_SNAPSHOT=$(ls -td -- ${SNAPSHOT_DIR}/* | head -n 1) + echo "create symlink: ${MOST_RECENT_SNAPSHOT}/original/ -> ${WEIGHTS_DIR}" + for item in ${MOST_RECENT_SNAPSHOT}/original/*; do + ln -s "$item" "${WEIGHTS_DIR}" + done # Step 6: Process and copy weights if [ "${REPACKED}" -eq 1 ]; then - WEIGHTS_DIR="${PERSISTENT_VOLUME}/model_weights/${REPACKED_STR}${MODEL_NAME}" - repack_weights "${LLAMA_WEIGHTS_DIR}" "${WEIGHTS_DIR}" - else - WEIGHTS_DIR="${PERSISTENT_VOLUME}/model_weights/${MODEL_NAME}" - cp -rf "${LLAMA_WEIGHTS_DIR}" "${WEIGHTS_DIR}" + REPACKED_WEIGHTS_DIR="${PERSISTENT_VOLUME}/model_weights/${REPACKED_STR}${MODEL_NAME}" + mkdir -p "${REPACKED_WEIGHTS_DIR}" + repack_weights "${WEIGHTS_DIR}" "${REPACKED_WEIGHTS_DIR}" fi # Step 7: Cleanup @@ -491,25 +501,24 @@ setup_weights() { # check if model weights already exist if [ -d "${PERSISTENT_VOLUME}/model_weights/${REPACKED_STR}${MODEL_NAME}" ]; then - echo "Model weights already exist at: ${PERSISTENT_VOLUME}/model_weights/${REPACKED_STR}${MODEL_NAME}." + echo "Model weights already exist at: ${PERSISTENT_VOLUME}/model_weights/${REPACKED_STR}${MODEL_NAME}" echo "contents:" echo echo "$(ls -lh ${PERSISTENT_VOLUME}/model_weights/${REPACKED_STR}${MODEL_NAME})" echo echo "If directory does not have correct weights, to re-download or copy the model weights delete the directory." echo "🔔 check if directory contents are correct." 
- exit 1 - fi - - # Determine which setup method to use based on HF_TOKEN presence - if [ "${USE_HF_DOWNLOAD}" == "y" ]; then - setup_weights_huggingface else - setup_weights_meta + # Determine which setup method to use based on HF_TOKEN presence + if [ "${USE_HF_DOWNLOAD}" == "y" ]; then + setup_weights_huggingface + else + setup_weights_meta + fi fi echo "create tt-metal cache dir: ${LLAMA3_CACHE_PATH}" - mkdir -p ${LLAMA3_CACHE_PATH} + mkdir -p "${PERSISTENT_VOLUME}/tt_metal_cache/cache_${REPACKED_STR}${MODEL_NAME}" } # ============================================================================== @@ -528,11 +537,6 @@ if [ $# -lt 1 ]; then usage fi -if [ "$1" == "setup_permissions" ]; then - setup_permissions - exit 0 -fi - # Set up environment variables for the chosen model MODEL_TYPE=$1 setup_model_environment "$MODEL_TYPE" From 49ee14f43562999b1a548aad49c68fad57b885e1 Mon Sep 17 00:00:00 2001 From: Tom Stesco Date: Thu, 19 Dec 2024 21:22:32 +0000 Subject: [PATCH 71/76] update .env file location in documentation --- evals/README.md | 2 +- vllm-tt-metal-llama3-70b/README.md | 2 +- vllm-tt-metal-llama3-70b/docs/development.md | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/evals/README.md b/evals/README.md index ca3add07..5dfc42a5 100644 --- a/evals/README.md +++ b/evals/README.md @@ -21,7 +21,7 @@ export PERSISTENT_VOLUME=$PWD/persistent_volume/volume_id_tt-metal-llama-3.1-70b docker run \ --rm \ -it \ - --env-file vllm-tt-metal-llama3-70b/.env \ + --env-file persistent_volume/model_envs/llama-3.1-70b-instruct.env \ --cap-add ALL \ --device /dev/tenstorrent:/dev/tenstorrent \ --volume /dev/hugepages-1G:/dev/hugepages-1G:rw \ diff --git a/vllm-tt-metal-llama3-70b/README.md b/vllm-tt-metal-llama3-70b/README.md index 38ef1a9a..3d9a9a5d 100644 --- a/vllm-tt-metal-llama3-70b/README.md +++ b/vllm-tt-metal-llama3-70b/README.md @@ -29,7 +29,7 @@ export PERSISTENT_VOLUME=$PWD/persistent_volume/volume_id_tt-metal-llama-3.1-70b docker run \ --rm \ -it \ - --env-file vllm-tt-metal-llama3-70b/.env \ + --env-file persistent_volume/model_envs/llama-3.1-70b-instruct.env \ --cap-add ALL \ --device /dev/tenstorrent:/dev/tenstorrent \ --volume /dev/hugepages-1G:/dev/hugepages-1G:rw \ diff --git a/vllm-tt-metal-llama3-70b/docs/development.md b/vllm-tt-metal-llama3-70b/docs/development.md index efa3c075..d4f950b3 100644 --- a/vllm-tt-metal-llama3-70b/docs/development.md +++ b/vllm-tt-metal-llama3-70b/docs/development.md @@ -42,7 +42,7 @@ export PERSISTENT_VOLUME=$PWD/persistent_volume/volume_id_tt-metal-llama-3.1-70b docker run \ --rm \ -it \ - --env-file tt-metal-llama3-70b/.env \ + --env-file persistent_volume/model_envs/llama-3.1-70b-instruct.env \ --cap-add ALL \ --device /dev/tenstorrent:/dev/tenstorrent \ --volume /dev/hugepages-1G:/dev/hugepages-1G:rw \ From 1675f54c3e85cfc72bf0f1d1b52d147380ddc7ab Mon Sep 17 00:00:00 2001 From: Tom Stesco Date: Thu, 19 Dec 2024 23:12:20 +0000 Subject: [PATCH 72/76] remove MODEL_IMPL_ROOT_DIR and add note about MODEL_NAME --- setup.sh | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/setup.sh b/setup.sh index 04fe83d6..52388dfa 100755 --- a/setup.sh +++ b/setup.sh @@ -100,11 +100,11 @@ get_hf_env_vars() { # Function to set environment variables based on the model selection and write them to .env setup_model_environment() { # Set environment variables based on the model selection + # note: MODEL_NAME is the lower cased basename of the HF repo ID case "$1" in "llama-3.3-70b-instruct") MODEL_NAME="llama-3.3-70b-instruct" 
HF_MODEL_REPO_ID="meta-llama/Llama-3.3-70B-Instruct" - MODEL_IMPL_ROOT_DIR="vllm-tt-metal-llama3-70b" META_MODEL_NAME="" META_DIR_FILTER="" REPACKED=1 @@ -112,7 +112,6 @@ setup_model_environment() { "llama-3.2-11b-vision-instruct") MODEL_NAME="llama-3.2-11b-vision-instruct" HF_MODEL_REPO_ID="meta-llama/Llama-3.2-11B-Vision-Instruct" - MODEL_IMPL_ROOT_DIR="vllm-tt-metal-llama32-11b-vision" META_MODEL_NAME="" META_DIR_FILTER="" REPACKED=0 @@ -120,7 +119,6 @@ setup_model_environment() { "llama-3.1-70b-instruct") MODEL_NAME="llama-3.1-70b-instruct" HF_MODEL_REPO_ID="meta-llama/Llama-3.1-70B-Instruct" - MODEL_IMPL_ROOT_DIR="vllm-tt-metal-llama3-70b" META_MODEL_NAME="Meta-Llama-3.1-70B-Instruct" META_DIR_FILTER="llama3_1" REPACKED=1 From d1bffe04a0368941480f6c02ad2ff59ad816d405 Mon Sep 17 00:00:00 2001 From: Tom Stesco Date: Fri, 20 Dec 2024 03:19:08 +0000 Subject: [PATCH 73/76] move setup_tt_metal_cache into setup_weights to use load_env scope --- setup.sh | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/setup.sh b/setup.sh index 52388dfa..86dd9d58 100755 --- a/setup.sh +++ b/setup.sh @@ -515,8 +515,7 @@ setup_weights() { fi fi - echo "create tt-metal cache dir: ${LLAMA3_CACHE_PATH}" - mkdir -p "${PERSISTENT_VOLUME}/tt_metal_cache/cache_${REPACKED_STR}${MODEL_NAME}" + setup_tt_metal_cache } # ============================================================================== @@ -539,7 +538,6 @@ fi MODEL_TYPE=$1 setup_model_environment "$MODEL_TYPE" setup_weights -setup_tt_metal_cache # Call the script again with sudo to execute the sudo-required commands echo "Switching to sudo portion to set file permissions and complete setup." setup_permissions From 97c90cfd4b52fee4b9eb609cfee8e4f421063b61 Mon Sep 17 00:00:00 2001 From: Tom Stesco Date: Fri, 20 Dec 2024 03:23:13 +0000 Subject: [PATCH 74/76] better logging and handling of {PERSISTENT_VOLUME}/model_weights dir setup --- setup.sh | 16 ++++------------ 1 file changed, 4 insertions(+), 12 deletions(-) diff --git a/setup.sh b/setup.sh index 86dd9d58..eac1a20d 100755 --- a/setup.sh +++ b/setup.sh @@ -395,17 +395,12 @@ setup_weights_meta() { echo "Weights already downloaded at ${LLAMA_WEIGHTS_DIR}" echo "Skipping download." else - # Step 4: Run the download script and select models echo "Running the download script to download models at ${LLAMA_DIR}/download.sh ..." cd "$LLAMA_DIR" ./download.sh cd - fi - # Step 5: Copy weights to persistent volume - echo "Setting up persistent volume root: ${PERSISTENT_VOLUME}" - mkdir -p "${PERSISTENT_VOLUME}/model_weights/" - if [ "${REPACKED}" -eq 1 ]; then WEIGHTS_DIR="${PERSISTENT_VOLUME}/model_weights/${REPACKED_STR}${MODEL_NAME}" repack_weights "${LLAMA_WEIGHTS_DIR}" "${WEIGHTS_DIR}" @@ -425,10 +420,6 @@ setup_weights_huggingface() { exit 1 fi - # Step 2: Set up persistent volume root - echo "Setting up persistent volume root: ${PERSISTENT_VOLUME}" - mkdir -p "${PERSISTENT_VOLUME}/model_weights/" - # Step 3: Create python virtual environment for huggingface downloads VENV_NAME=".venv_hf_setup" echo "Setting up python venv for Hugging Face downloads: ${VENV_NAME}" @@ -494,19 +485,20 @@ setup_tt_metal_cache() { } setup_weights() { - # Step 1: Load environment variables from .env file load_env # check if model weights already exist if [ -d "${PERSISTENT_VOLUME}/model_weights/${REPACKED_STR}${MODEL_NAME}" ]; then echo "Model weights already exist at: ${PERSISTENT_VOLUME}/model_weights/${REPACKED_STR}${MODEL_NAME}" + echo "🔔 check if directory contents are correct." 
echo "contents:" - echo + echo "ls -lh ${PERSISTENT_VOLUME}/model_weights/${REPACKED_STR}${MODEL_NAME}" echo "$(ls -lh ${PERSISTENT_VOLUME}/model_weights/${REPACKED_STR}${MODEL_NAME})" echo echo "If directory does not have correct weights, to re-download or copy the model weights delete the directory." - echo "🔔 check if directory contents are correct." else + echo "Setting up persistent volume root: ${PERSISTENT_VOLUME}" + mkdir -p "${PERSISTENT_VOLUME}/model_weights/" # Determine which setup method to use based on HF_TOKEN presence if [ "${USE_HF_DOWNLOAD}" == "y" ]; then setup_weights_huggingface From 6df6c7cd2dcec2b1c9f999db9e565a116d181d41 Mon Sep 17 00:00:00 2001 From: Tom Stesco Date: Fri, 20 Dec 2024 04:07:20 +0000 Subject: [PATCH 75/76] adding error message when huggingface-cli download fails with common issues for troubleshooting, add support for llama 3.2 1B / 3B --- setup.sh | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) diff --git a/setup.sh b/setup.sh index eac1a20d..3b062ada 100755 --- a/setup.sh +++ b/setup.sh @@ -11,6 +11,8 @@ usage() { echo "Available model types:" echo " llama-3.3-70b-instruct" echo " llama-3.2-11b-vision-instruct" + echo " llama-3.2-3b-instruct" + echo " llama-3.2-1b-instruct" echo " llama-3.1-70b-instruct" echo " llama-3.1-70b" echo " llama-3.1-8b-instruct" @@ -116,6 +118,20 @@ setup_model_environment() { META_DIR_FILTER="" REPACKED=0 ;; + "llama-3.2-3b-instruct") + MODEL_NAME="llama-3.2-3b-instruct" + HF_MODEL_REPO_ID="meta-llama/Llama-3.2-3B-Instruct" + META_MODEL_NAME="" + META_DIR_FILTER="" + REPACKED=0 + ;; + "llama-3.2-1b-instruct") + MODEL_NAME="llama-3.2-1b-instruct" + HF_MODEL_REPO_ID="meta-llama/Llama-3.2-1B-Instruct" + META_MODEL_NAME="" + META_DIR_FILTER="" + REPACKED=0 + ;; "llama-3.1-70b-instruct") MODEL_NAME="llama-3.1-70b-instruct" HF_MODEL_REPO_ID="meta-llama/Llama-3.1-70B-Instruct" @@ -442,6 +458,17 @@ setup_weights_huggingface() { --cache-dir="${HOST_HF_HOME}" \ --token="${HF_TOKEN}" + if [ $? -ne 0 ]; then + echo "⛔ Error occured during: huggingface-cli download ${HF_MODEL_REPO_ID}" + echo "🔔 check for common issues:" + echo " 1. 401 Unauthorized error occurred." + echo " For example:" + echo " huggingface_hub.errors.GatedRepoError: 401 Client Error. Cannot access gated repo" + echo " ❗ In this case, go to the repo URL in your web browser and click through the access request form." + echo " 2. 
check correct HF_TOKEN is set in the .env file: ${ENV_FILE}" + exit 1 + fi + # symlinks are broken for huggingface-cli download with --local-dir option # see: https://github.com/huggingface/huggingface_hub/pull/2223 # to use symlinks, find most recent snapshot and create symlink to that From 6beaa66e5350304cf513f150e6d9845cc79337bb Mon Sep 17 00:00:00 2001 From: Tom Stesco Date: Sat, 21 Dec 2024 02:19:00 +0000 Subject: [PATCH 76/76] update README for llama 3.1 70B v0 drop commits --- vllm-tt-metal-llama3-70b/README.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/vllm-tt-metal-llama3-70b/README.md b/vllm-tt-metal-llama3-70b/README.md index 3d9a9a5d..e6b3448c 100644 --- a/vllm-tt-metal-llama3-70b/README.md +++ b/vllm-tt-metal-llama3-70b/README.md @@ -106,7 +106,7 @@ Either download the Docker image from GitHub Container Registry (recommended for ```bash # pull image from GHCR -docker pull ghcr.io/tenstorrent/tt-inference-server/tt-metal-llama3-70b-src-base-vllm:v0.0.2-tt-metal-385904186f81-384f1790c3be +docker pull ghcr.io/tenstorrent/tt-inference-server/tt-metal-llama3-70b-src-base-vllm:v0.0.1-tt-metal-v0.54.0-rc2-953161188c50 ``` #### Option B: Build Docker Image @@ -115,7 +115,7 @@ For instructions on building the Docker imagem locally see: [vllm-tt-metal-llama ### 5. Automated Setup: environment variables and weights files -The script `vllm-tt-metal-llama3-70b/setup.sh` automates: +The script `setup.sh` automates: 1. interactively creating the .env file, 2. downloading the Llama model weights, @@ -123,7 +123,7 @@ The script `vllm-tt-metal-llama3-70b/setup.sh` automates: 4. creating the default persistent storage directory structure and permissions. ```bash -cd tt-inference-server/vllm-tt-metal-llama3-70b +cd tt-inference-server chmod +x setup.sh ./setup.sh llama-3.1-70b-instruct ```