From 7d68656ab085bfcb213eebf1b54a3af3d939aa86 Mon Sep 17 00:00:00 2001
From: Tom Stesco
Date: Wed, 4 Dec 2024 02:48:43 +0000
Subject: [PATCH 01/76] add print_prompts cli arg

---
 utils/prompt_generation.py | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/utils/prompt_generation.py b/utils/prompt_generation.py
index ccb8b2cc..f96e0c6d 100644
--- a/utils/prompt_generation.py
+++ b/utils/prompt_generation.py
@@ -286,7 +286,7 @@ def generate_prompts(args):
     # Add 1 to prompt lengths to account for the extra token added by vLLM
     prompt_lengths = [pl + 1 for pl in prompt_lengths]
 
-    print_prompts = (not args.save_path) and (args.num_prompts < 5)
+    print_prompts = (args.num_prompts < 5) and args.print_prompts
     # Save prompts to a JSONL file if a save path is provided
     if args.save_path:
        file_path = Path(args.save_path).resolve()
@@ -350,6 +350,12 @@ def add_prompt_gen_args(parser):
         default=None,
         help="Path to save the generated prompts in JSONL format.",
     )
+    parser.add_argument(
+        "--print_prompts",
+        action="store_true",
+        default=False,
+        help="Print generated prompts if there are fewer than 5.",
+    )
     return parser
 
 
From 8d78d64e62b9604b89552d70820413ea8036b19c Mon Sep 17 00:00:00 2001
From: Tom Stesco
Date: Wed, 4 Dec 2024 02:50:29 +0000
Subject: [PATCH 02/76] remove redundant stop token from vLLM example API calls

---
 utils/prompt_client_cli.py                      | 1 -
 .../src/example_openai_client_alpaca_eval.py    | 1 -
 .../src/example_requests_client_alpaca_eval.py  | 1 -
 3 files changed, 3 deletions(-)

diff --git a/utils/prompt_client_cli.py b/utils/prompt_client_cli.py
index d3707418..f97786e5 100644
--- a/utils/prompt_client_cli.py
+++ b/utils/prompt_client_cli.py
@@ -81,7 +81,6 @@ def call_inference_api(
         "top_p": 0.9,
         "max_tokens": max_tokens,
         "stream": stream,
-        "stop": ["<|eot_id|>"],
     }
     req_time = time.time()
     # using requests stream=True, make sure to set a timeout
diff --git a/vllm-tt-metal-llama3-70b/src/example_openai_client_alpaca_eval.py b/vllm-tt-metal-llama3-70b/src/example_openai_client_alpaca_eval.py
index 032c9343..1a3c781e 100644
--- a/vllm-tt-metal-llama3-70b/src/example_openai_client_alpaca_eval.py
+++ b/vllm-tt-metal-llama3-70b/src/example_openai_client_alpaca_eval.py
@@ -40,7 +40,6 @@ def call_inference_api(prompt, response_idx, stream=True, headers=None, client=N
         temperature=1,
         max_tokens=2048,
         top_p=0.9,
-        stop=["<|eot_id|>"],
         stream=stream,
     )
     if stream:
diff --git a/vllm-tt-metal-llama3-70b/src/example_requests_client_alpaca_eval.py b/vllm-tt-metal-llama3-70b/src/example_requests_client_alpaca_eval.py
index ed761905..ca201900 100644
--- a/vllm-tt-metal-llama3-70b/src/example_requests_client_alpaca_eval.py
+++ b/vllm-tt-metal-llama3-70b/src/example_requests_client_alpaca_eval.py
@@ -108,7 +108,6 @@ def call_inference_api(
         "top_p": 0.9,
         "max_tokens": max_tokens,
         "stream": stream,
-        "stop": ["<|eot_id|>"],
     }
     req_time = time.time()
     # using requests stream=True, make sure to set a timeout

From 3108bc0a713e86f79a7fe887929a75f026034996 Mon Sep 17 00:00:00 2001
From: Tom Stesco
Date: Wed, 4 Dec 2024 02:51:39 +0000
Subject: [PATCH 03/76] add capture_traces.py util to pre-prompt vLLM server to
 capture all trace input sizes

---
 utils/capture_traces.py | 120 ++++++++++++++++++++++++++++++++++++++++
 1 file changed, 120 insertions(+)
 create mode 100644 utils/capture_traces.py

diff --git a/utils/capture_traces.py b/utils/capture_traces.py
new file mode 100644
index 00000000..828c7e1d
--- /dev/null
+++ b/utils/capture_traces.py
@@ -0,0 +1,120 @@
+# SPDX-License-Identifier: Apache-2.0
+#
+# SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC + +import os +import time +import logging +import requests +import argparse +from utils.prompt_generation import generate_prompts +from utils.prompt_client_cli import ( + call_inference_api, + get_api_base_url, + get_authorization, +) + +logging.basicConfig( + level=logging.INFO, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s" +) +logger = logging.getLogger(__name__) +logger.setLevel(logging.INFO) + + +def get_api_health_url(): + DEPLOY_URL = os.getenv("DEPLOY_URL", "http://127.0.0.1") + health_url = f"{DEPLOY_URL}:{os.getenv('SERVICE_PORT', '8000')}/health" + return health_url + + +def check_health(base_url: str, timeout: int = 300, interval: int = 10) -> bool: + """ + Check the health endpoint until the service is ready. + """ + health_url = get_api_health_url() + start_time = time.time() + headers = {"Authorization": f"Bearer {get_authorization()}"} + + while time.time() - start_time < timeout: + try: + response = requests.get(health_url, headers=headers) + if response.status_code == 200: + logger.info("vLLM service is healthy and ready") + return True + except requests.exceptions.RequestException as e: + logger.warning(f"Health check failed: {e}") + + logger.info(f"Service not ready, waiting {interval} seconds...") + time.sleep(interval) + + logger.error(f"Service did not become healthy within {timeout} seconds") + return False + + +def capture_input_sizes(): + """ + Capture different input size graphs with the TT model on vLLM. + get_padded_prefill_len() defines the different input sizes for prefill: + https://github.com/tenstorrent/tt-metal/blob/main/models/demos/t3000/llama2_70b/tt/llama_generation.py#L341 + """ + input_sizes = [sz - 8 for sz in [32, 64, 128, 256, 512, 1024, 2048, 3072, 4096]] + prompts_per_size = 1 + output_seq_len = 1 + + base_url = get_api_base_url() + if not check_health(base_url): + raise RuntimeError("vLLM did not start correctly!") + + api_url = f"{base_url}/completions" + headers = {"Authorization": f"Bearer {get_authorization()}"} + vllm_model = os.environ.get("VLLM_MODEL", "meta-llama/Llama-3.1-70B-Instruct") + + for size in input_sizes: + logger.info(f"Capture input size: {size}") + + args = argparse.Namespace( + tokenizer_model=vllm_model, + dataset="random", + max_prompt_length=size, + input_seq_len=size, + distribution="fixed", + template=None, + save_path=None, + print_prompts=False, + num_prompts=prompts_per_size, + ) + + prompts, prompt_lengths = generate_prompts(args) + + for i, (prompt, prompt_len) in enumerate(zip(prompts, prompt_lengths)): + try: + response_data = call_inference_api( + prompt=prompt, + response_idx=i, + prompt_len=prompt_len, + stream=True, + headers=headers, + api_url=api_url, + max_tokens=output_seq_len, + vll_model=vllm_model, + tokenizer=None, + ) + + logger.info( + f"Input size: {size}, input_seq_len: {prompt_len}, TTFT: {response_data['ttft']:.3f}s" + ) + + except Exception as e: + logger.error(f"Error processing prompt: {e}") + + +def main(): + try: + capture_input_sizes() + except Exception as e: + logger.error(f"Capturing input sizes failed: {e}") + raise + + +if __name__ == "__main__": + main() From ea3d75dbd9caaa5029c7dd4af8fcfce35a6da03f Mon Sep 17 00:00:00 2001 From: Tom Stesco Date: Wed, 4 Dec 2024 03:13:38 +0000 Subject: [PATCH 04/76] adding utils/startup_utils.py to refine handling of startup in automation --- utils/capture_traces.py | 35 ++------------------------- utils/startup_utils.py | 53 +++++++++++++++++++++++++++++++++++++++++ 2 files 
changed, 55 insertions(+), 33 deletions(-) create mode 100644 utils/startup_utils.py diff --git a/utils/capture_traces.py b/utils/capture_traces.py index 828c7e1d..ecc1d95d 100644 --- a/utils/capture_traces.py +++ b/utils/capture_traces.py @@ -3,9 +3,7 @@ # SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC import os -import time import logging -import requests import argparse from utils.prompt_generation import generate_prompts from utils.prompt_client_cli import ( @@ -13,6 +11,7 @@ get_api_base_url, get_authorization, ) +from utils.startup_utils import wait_for_healthy logging.basicConfig( level=logging.INFO, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s" @@ -21,36 +20,6 @@ logger.setLevel(logging.INFO) -def get_api_health_url(): - DEPLOY_URL = os.getenv("DEPLOY_URL", "http://127.0.0.1") - health_url = f"{DEPLOY_URL}:{os.getenv('SERVICE_PORT', '8000')}/health" - return health_url - - -def check_health(base_url: str, timeout: int = 300, interval: int = 10) -> bool: - """ - Check the health endpoint until the service is ready. - """ - health_url = get_api_health_url() - start_time = time.time() - headers = {"Authorization": f"Bearer {get_authorization()}"} - - while time.time() - start_time < timeout: - try: - response = requests.get(health_url, headers=headers) - if response.status_code == 200: - logger.info("vLLM service is healthy and ready") - return True - except requests.exceptions.RequestException as e: - logger.warning(f"Health check failed: {e}") - - logger.info(f"Service not ready, waiting {interval} seconds...") - time.sleep(interval) - - logger.error(f"Service did not become healthy within {timeout} seconds") - return False - - def capture_input_sizes(): """ Capture different input size graphs with the TT model on vLLM. @@ -62,7 +31,7 @@ def capture_input_sizes(): output_seq_len = 1 base_url = get_api_base_url() - if not check_health(base_url): + if not wait_for_healthy(base_url): raise RuntimeError("vLLM did not start correctly!") api_url = f"{base_url}/completions" diff --git a/utils/startup_utils.py b/utils/startup_utils.py new file mode 100644 index 00000000..33ef7f86 --- /dev/null +++ b/utils/startup_utils.py @@ -0,0 +1,53 @@ +# SPDX-License-Identifier: Apache-2.0 +# +# SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC + +import os +import time +import logging +import requests +from utils.prompt_client_cli import ( + get_authorization, +) + +logging.basicConfig( + level=logging.INFO, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s" +) +logger = logging.getLogger(__name__) +logger.setLevel(logging.INFO) + + +def get_api_health_url(): + DEPLOY_URL = os.getenv("DEPLOY_URL", "http://127.0.0.1") + health_url = f"{DEPLOY_URL}:{os.getenv('SERVICE_PORT', '8000')}/health" + return health_url + + +def wait_for_healthy(base_url: str, timeout: int = 300, interval: int = 10) -> bool: + """ + Check the health endpoint until the service is ready. + """ + health_url = get_api_health_url() + start_time = time.time() + headers = {"Authorization": f"Bearer {get_authorization()}"} + total_time_waited = 0 + while time.time() - start_time < timeout: + try: + response = requests.get(health_url, headers=headers, timeout=interval) + if response.status_code == 200: + startup_time = time.time() - start_time + logger.info( + f"vLLM service is healthy. 
startup_time:= {startup_time} seconds"
+                )
+                return True
+        except requests.exceptions.RequestException as e:
+            logger.warning(f"Health check failed: {e}")
+
+        total_time_waited += interval
+        logger.info(
+            f"Service not ready after {total_time_waited} seconds, waiting {interval} seconds before polling ..."
+        )
+        time.sleep(0.05)
+
+    logger.error(f"Service did not become healthy within {timeout} seconds")
+    return False
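With the health-check logic factored into utils/startup_utils.py, automation can block on server readiness before generating prompts or capturing traces. Below is a minimal sketch of that pattern, assuming DEPLOY_URL and SERVICE_PORT point at a running vLLM deployment and using the wait_for_healthy(base_url, timeout, interval) signature introduced in this patch; the driver script itself is hypothetical and not part of the series.

```python
# Hypothetical driver script, shown only to illustrate how
# utils/startup_utils.py is intended to be used from automation.
from utils.prompt_client_cli import get_api_base_url
from utils.startup_utils import wait_for_healthy


def main():
    base_url = get_api_base_url()
    # Poll the /health endpoint for up to 30 minutes before doing any work.
    if not wait_for_healthy(base_url, timeout=1800, interval=10):
        raise RuntimeError("vLLM server did not become healthy in time")
    # ... safe to generate prompts, capture traces, or run benchmarks here ...


if __name__ == "__main__":
    main()
```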

From cc1d17a6dd06ae59cd9934df7c22e54e71a7f138 Mon Sep 17 00:00:00 2001
From: Tom Stesco
Date: Wed, 4 Dec 2024 04:47:07 +0000
Subject: [PATCH 05/76] adding force_max_tokens as option to
 call_inference_api(), add input_seq_lengths and output_seq_lengths directly
 as args to test_api_call_threaded_full_queue() to allow for varied isl and
 osl within batch

---
 utils/prompt_client_cli.py | 62 +++++++++++++++++++++++++-------------
 1 file changed, 41 insertions(+), 21 deletions(-)

diff --git a/utils/prompt_client_cli.py b/utils/prompt_client_cli.py
index f97786e5..58524cfd 100644
--- a/utils/prompt_client_cli.py
+++ b/utils/prompt_client_cli.py
@@ -71,6 +71,7 @@ def call_inference_api(
     max_tokens,
     vll_model,
     tokenizer,
+    force_max_tokens=True,
 ):
     # set API prompt and optional parameters
     json_data = {
@@ -82,6 +83,9 @@ def call_inference_api(
         "max_tokens": max_tokens,
         "stream": stream,
     }
+    if force_max_tokens:
+        # use a reserved special token to avoid the model stopping before osl is reached
+        json_data["stop"] = "<|reserved_special_token_249|>"
     req_time = time.time()
     # using requests stream=True, make sure to set a timeout
     response = requests.post(
@@ -102,11 +106,10 @@ def call_inference_api(
                     if num_completion_tokens == 0:
                         first_token_time = time.time()
                         ttft = first_token_time - req_time
-                    num_completion_tokens += 1
                     data_str = line[len("data: ") :].strip()
                     if data_str == "[DONE]":
-                        num_completion_tokens -= 1
                         break
+                    num_completion_tokens += 1
                     try:
                         # Parse the JSON data
                         data = json.loads(data_str)
@@ -117,10 +120,7 @@ def call_inference_api(
                         print(f"Failed to decode JSON: {e}")
                         continue
         else:
-            # If not chunked, you can access the entire response body at once
-            data = response.json()["usage"]
             raise ValueError("Response is not chunked")
-
     else:
         data = response.json()
         full_text = data["choices"][0]["text"]
@@ -128,10 +128,15 @@ def call_inference_api(
         # conservatively set the first token time to the request time
         first_token_time = req_time
         logger.info(f"usage: {data['usage']}")
-    # TODO: verify the number of tokens
-    # num_completion_tokens = len(tokenizer.encode(full_text, add_special_tokens=False))
-    num_completion_tokens = max(num_completion_tokens, 2)
+    # verify the number of completion tokens
+    checksum_num_completion_tokens = len(
+        tokenizer.encode(full_text, add_special_tokens=False)
+    )
+    token_diff = checksum_num_completion_tokens - num_completion_tokens
+    if token_diff != 0:
+        logger.warning(f"response_idx=:{response_idx}, token_diff =: {token_diff}")
+
     throughput_time = max(time.time() - first_token_time, 0.0001)
     response_data = {
         "response_idx": response_idx,
@@ -139,7 +144,7 @@ def call_inference_api(
         "response": full_text,
         "prompt_length": prompt_len,
         "num_completion_tokens": num_completion_tokens,
-        "tps": (num_completion_tokens - 1) / throughput_time,
+        "tps": (max(num_completion_tokens, 1)) / throughput_time,
         "ttft": ttft,
     }
     with responses_lock:
@@ -198,7 +203,8 @@ def calculate_batch_sizes(num_prompts, max_batch_size, vary_batch_size):
 
 def test_api_call_threaded_full_queue(
     prompts,
-    prompt_lengths,
+    input_seq_lengths,
+    output_seq_lengths,
     batch_size,
     num_full_iterations,
     vary_batch_size,
     inter_batch_delay,
     call_func,
     call_func_kwargs,
 ):
@@ -228,13 +234,16 @@ def test_api_call_threaded_full_queue(
     if batch_size == 1:
         logger.info("Running with single thread")
         for iter_num in range(num_full_iterations):
-            for i, (prompt, prompt_len) in enumerate(zip(prompts, prompt_lengths)):
+            for i, (prompt, isl, osl) in enumerate(
+                zip(prompts, input_seq_lengths, output_seq_lengths)
+            ):
                 handle_delay(inter_batch_delay)
                 response_idx = iter_num * num_prompts + i
                 response_data = call_func(
                     prompt=prompt,
                     response_idx=response_idx,
-                    prompt_len=prompt_len,
+                    prompt_len=isl,
+                    max_tokens=osl,
                     **call_func_kwargs,
                 )
                 # Write the response data to the JSONL file
@@ -264,22 +273,28 @@ def test_api_call_threaded_full_queue(
             for bsz in batch_sizes:
                 batch_end = min(batch_start + bsz, num_prompts)
                 batch_prompts = prompts[batch_start:batch_end]
-                batch_prompt_lengths = prompt_lengths[batch_start:batch_end]
+                batch_input_seq_lengths = input_seq_lengths[batch_start:batch_end]
+                batch_output_seq_lengths = output_seq_lengths[batch_start:batch_end]
                 handle_delay(inter_batch_delay)
                 # Submit all prompts in the current batch
                 logger.info(f"Sending batch requests: {bsz}")
                 with ThreadPoolExecutor(max_workers=bsz) as executor:
                     futures = []
 
                     for i, (prompt, isl, osl) in enumerate(
+                        zip(
+                            batch_prompts,
+                            batch_input_seq_lengths,
+                            batch_output_seq_lengths,
+                        )
                     ):
                         response_idx = iter_num * num_prompts + i
                         future = executor.submit(
                             call_func,
                             prompt=prompt,
                             response_idx=response_idx,
-                            prompt_len=prompt_len,
+                            prompt_len=isl,
+                            max_tokens=osl,
                             **call_func_kwargs,
                         )
                         futures.append(future)
@@ -308,13 +323,16 @@ def test_api_call_threaded_full_queue(
 
             # Submit all prompts across all iterations
             for iter_num in range(num_full_iterations):
-                for i, (prompt, prompt_len) in enumerate(zip(prompts, prompt_lengths)):
+                for i, (prompt, isl, osl) in enumerate(
+                    zip(prompts, input_seq_lengths, output_seq_lengths)
+                ):
                     response_idx = iter_num * num_prompts + i
                     future = executor.submit(
                         call_func,
                         prompt=prompt,
                         response_idx=response_idx,
-                        prompt_len=prompt_len,
+                        prompt_len=isl,
+                        max_tokens=osl,
                         **call_func_kwargs,
                     )
                     futures.append(future)
@@ -348,14 +366,16 @@ def main():
 
     # generate prompts
     tokenizer = AutoTokenizer.from_pretrained(args.tokenizer_model)
-    prompts, prompt_lengths = generate_prompts(args)
+    prompts, input_seq_lengths = generate_prompts(args)
+    output_seq_lengths = [args.output_seq_len] * len(prompts)
 
     headers = {"Authorization": f"Bearer {get_authorization()}"}
     api_url = get_api_url()
     logging.info(f"API_URL: {api_url}")
     test_api_call_threaded_full_queue(
         prompts=prompts,
-        prompt_lengths=prompt_lengths,
+        input_seq_lengths=input_seq_lengths,
+        output_seq_lengths=output_seq_lengths,
         batch_size=args.batch_size,
         num_full_iterations=args.num_full_iterations,
         vary_batch_size=args.vary_batch_size,
@@ -365,9 +385,9 @@ def main():
             "stream": not args.no_stream,
             "headers": headers,
             "api_url": api_url,
-            "max_tokens": args.output_seq_len,
             "vll_model": args.vllm_model,
             "tokenizer": tokenizer,
+            "force_max_tokens": True,
         },
     )
 

From 059d5135f5e0da6b086e60494eaa9faaa9c6d393 Mon Sep 17 00:00:00 2001
From: Tom Stesco
Date: Wed, 4 Dec 2024 21:43:09 +0000
Subject: [PATCH 06/76] faster mock model prefill

---
 tests/mock_vllm_model.py | 63 ++++++++++++++++++++++------------------
 1 file changed, 34 insertions(+), 29 deletions(-)

diff --git a/tests/mock_vllm_model.py b/tests/mock_vllm_model.py
index f3ff4503..ef807a76 100644
--- a/tests/mock_vllm_model.py
+++ 
b/tests/mock_vllm_model.py @@ -269,36 +269,41 @@ def prefill_forward( """ batch, batch_seq_len = tokens.shape - output_logits = torch.zeros(batch, 1, self.params.vocab_size) - prompt_lens = ( - prompt_lens - if prompt_lens is not None - else torch.tensor([batch_seq_len] * batch) - ) - for user_id in range(batch): - seq_len = prompt_lens[user_id] - prefill_seq_len = get_padded_prefill_len(seq_len) - prefill_ids = torch.cat( - [ - tokens[user_id : user_id + 1, :seq_len], - torch.zeros(1, prefill_seq_len - seq_len).long(), - ], - dim=-1, - ) - logger.info(f"Filling kv cache for user {user_id + 1}") - last_token_idx = seq_len - 1 - logits = self.prefill_forward_single_user( - prefill_ids, - start_pos, - user_id, - last_token_idx=last_token_idx, - page_table=page_table, - kv_cache=kv_cache, + # faster prefill that does not mimic the actual prefill process + fast_prefill = True + if fast_prefill: + output_logits = torch.randn((batch, 1, self.params.vocab_size)) + else: + output_logits = torch.zeros(batch, 1, self.params.vocab_size) + prompt_lens = ( + prompt_lens + if prompt_lens is not None + else torch.tensor([batch_seq_len] * batch) ) - # Since we give unpadded_seq_len, only the tile containing the last token is returned - output_logits[user_id] = logits[ - :, last_token_idx % 32 : last_token_idx % 32 + 1, : - ] + for user_id in range(batch): + seq_len = prompt_lens[user_id] + prefill_seq_len = get_padded_prefill_len(seq_len) + prefill_ids = torch.cat( + [ + tokens[user_id : user_id + 1, :seq_len], + torch.zeros(1, prefill_seq_len - seq_len).long(), + ], + dim=-1, + ) + logger.info(f"Filling kv cache for user {user_id + 1}") + last_token_idx = seq_len - 1 + logits = self.prefill_forward_single_user( + prefill_ids, + start_pos, + user_id, + last_token_idx=last_token_idx, + page_table=page_table, + kv_cache=kv_cache, + ) + # Since we give unpadded_seq_len, only the tile containing the last token is returned + output_logits[user_id] = logits[ + :, last_token_idx % 32 : last_token_idx % 32 + 1, : + ] return output_logits From 48d17deb89107977792d19fb3f023151f6bd3efe Mon Sep 17 00:00:00 2001 From: Tom Stesco Date: Thu, 5 Dec 2024 01:48:44 +0000 Subject: [PATCH 07/76] make it not send stop tokens by default and speed up mock model decode and prefill --- tests/mock_vllm_model.py | 63 +++++++++++++++++++++++----------------- 1 file changed, 36 insertions(+), 27 deletions(-) diff --git a/tests/mock_vllm_model.py b/tests/mock_vllm_model.py index ef807a76..8fa6a510 100644 --- a/tests/mock_vllm_model.py +++ b/tests/mock_vllm_model.py @@ -14,12 +14,12 @@ import torch from huggingface_hub import hf_hub_download -from vllm.engine.metrics import logger - # mock out ttnn fully so we can import ttnn without using it sys.modules["ttnn"] = MagicMock() sys.modules["ttnn.device"] = MagicMock() +from vllm.engine.metrics import logger + from models.demos.t3000.llama2_70b.tt.llama_common import ( setup_llama_env, ) @@ -31,6 +31,8 @@ get_model_config, ) +torch.manual_seed(9387) + def setup_mock_model_weights(cache_root: str, weights_dir: str, hf_token: str): if not hf_token: @@ -269,10 +271,11 @@ def prefill_forward( """ batch, batch_seq_len = tokens.shape - # faster prefill that does not mimic the actual prefill process fast_prefill = True if fast_prefill: - output_logits = torch.randn((batch, 1, self.params.vocab_size)) + # faster prefill that does not mimic the actual prefill process + logger.info("Filling kv cache via fast_prefill in mock model") + output_logits = self.decode_forward(tokens=tokens, 
start_pos=start_pos) else: output_logits = torch.zeros(batch, 1, self.params.vocab_size) prompt_lens = ( @@ -304,29 +307,27 @@ def prefill_forward( output_logits[user_id] = logits[ :, last_token_idx % 32 : last_token_idx % 32 + 1, : ] - return output_logits - def decode_mock_send_token(self, logits, start_pos, batch, send_eot=False): + def decode_send_stop_token(self, logits, start_pos, batch): # tooling for sending EOT token or other specific token at specific output position EOT_ID = 128009 send_index = 200 send_token = EOT_ID - if send_eot: - if start_pos is not None: - if isinstance(start_pos, int): - # if start pos is same across batch, ie. now in prefill - cache_idxs = torch.tensor( - [start_pos for _ in range(batch)], dtype=torch.int64 - ) - else: # if start_pos is a tensor ie. is different across batch, now in decode mode - # if start position is greater than index to send EOT - cache_idxs = start_pos.to(dtype=torch.int64) - send_token_mask = cache_idxs > send_index - # find positions where start pos passes send_index (ie. done decoding) + make 1D - batch_indices = torch.nonzero(send_token_mask).squeeze() - # assign a high logit at at the send _token index so model will select it and generate the EOT so that generation stops - logits[batch_indices, 0, send_token] = 100.0 + if start_pos is not None: + if isinstance(start_pos, int): + # if start pos is same across batch, ie. now in prefill + cache_idxs = torch.tensor( + [start_pos for _ in range(batch)], dtype=torch.int64 + ) + else: # if start_pos is a tensor ie. is different across batch, now in decode mode + # if start position is greater than index to send EOT + cache_idxs = start_pos.to(dtype=torch.int64) + send_token_mask = cache_idxs > send_index + # find positions where start pos passes send_index (ie. 
done decoding) + make 1D
+            batch_indices = torch.nonzero(send_token_mask).squeeze()
+            # assign a high logit at the send_token index so the model will select it and generate the EOT so that generation stops
+            logits[batch_indices, 0, send_token] = 100.0
         return logits
 
     def decode_forward(
@@ -342,15 +343,23 @@ def decode_forward(
         assert len(tokens.shape) == 2
         batch, seqlen = tokens.shape
         forward_start = time.time()
-        simulated_tps = 10000.0
+        simulated_tps = 100000.0
         simulated_duration = 1.0 / simulated_tps
-        # update the new tokens generated to the input id
-        # vocab_size = tokenizer.nwords
+        low_value = -100.0
+        high_value = 100.0
+        vocab_size = 128256
+        unreserved_vocab_size = 128000
         # logits: [batch, seqlen, vocab_size]
-        logits = torch.randn((batch, seqlen, 128256))
-        logits = self.decode_mock_send_token(logits, start_pos, batch, send_eot=True)
-        actual_duration = time.time() - forward_start
+        logits = torch.full((batch, seqlen, vocab_size), low_value)
+        # set randomly selected tokens to high value
+        gen_token_ids = torch.randint(0, unreserved_vocab_size, (batch,))
+        logits[:, :, gen_token_ids] = high_value
+        send_eot = False
+        if send_eot:
+            # optionally send EOT token with some logic
+            logits = self.decode_send_stop_token(logits, start_pos, batch)
         # simulate forward latency
+        actual_duration = time.time() - forward_start
         time.sleep(max(simulated_duration - actual_duration, 0))
         return logits
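The mock model in this patch avoids real inference by shaping the logits directly: fill them with a large negative value, then raise only the token ids it wants sampled (or an EOT id once a row should stop). The self-contained sketch below illustrates that trick; it is not taken from the patch itself, and it forces one id per row, whereas the mock broadcasts one set of random ids across the whole batch.

```python
# Illustrative sketch only: force greedy sampling to return chosen token ids by
# shaping logits, the same idea decode_forward/decode_send_stop_token rely on.
import torch

BATCH, VOCAB = 4, 128256
EOT_ID = 128009              # stop token id used by the mock model
LOW, HIGH = -100.0, 100.0

logits = torch.full((BATCH, 1, VOCAB), LOW)
chosen = torch.randint(0, 128000, (BATCH,))      # stay inside the unreserved vocab range
logits[torch.arange(BATCH), 0, chosen] = HIGH    # one forced token per row

# Rows that have decoded past the stop position get an EOT instead.
finished = torch.tensor([False, False, True, True])
logits[finished, 0, :] = LOW
logits[finished, 0, EOT_ID] = HIGH

next_tokens = logits[:, -1, :].argmax(dim=-1)
assert torch.equal(next_tokens[~finished], chosen[~finished])
assert (next_tokens[finished] == EOT_ID).all()
```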
usage_dict["prompt_tokens"] - prompt_len + if isl_diff != 0: + logger.warning( + f"response_idx=:{response_idx}, isl_diff(actual - expected) =: {isl_diff}" + ) + + # verify the number of output tokens + usage_completion_tokens = usage_dict["completion_tokens"] + if num_completion_tokens > 0: + osl_diff = usage_completion_tokens - num_completion_tokens + if osl_diff != 0: + logger.warning( + f"response_idx=:{response_idx}, osl_diff(actual - expected) =: {osl_diff}" + ) + if max_tokens != usage_completion_tokens or max_tokens != num_completion_tokens: + logger.warning( + f"response_idx=:{response_idx}, max_tokens=:{max_tokens}, num_completion_tokens=:{num_completion_tokens}, usage_completion_tokens:={usage_completion_tokens}" + ) throughput_time = max(time.time() - first_token_time, 0.0001) response_data = { From 5a80551a9fd79022fcc9a82d8d417eec20cfd173 Mon Sep 17 00:00:00 2001 From: Tom Stesco Date: Thu, 5 Dec 2024 04:38:47 +0000 Subject: [PATCH 09/76] add max-log-len to limit logging of prompts to avoid clutter in logs --- tests/mock_vllm_api_server.py | 1 + vllm-tt-metal-llama3-70b/src/run_vllm_api_server.py | 1 + 2 files changed, 2 insertions(+) diff --git a/tests/mock_vllm_api_server.py b/tests/mock_vllm_api_server.py index bb357277..063d55b1 100644 --- a/tests/mock_vllm_api_server.py +++ b/tests/mock_vllm_api_server.py @@ -86,6 +86,7 @@ def main(): "num_scheduler_steps": "10", "port": os.getenv("SERVICE_PORT", "7000"), "seed": "4862", + "max-log-len": "32", "download-dir": os.getenv("CACHE_DIR", None), "api-key": get_encoded_api_key(os.getenv("JWT_SECRET", None)), } diff --git a/vllm-tt-metal-llama3-70b/src/run_vllm_api_server.py b/vllm-tt-metal-llama3-70b/src/run_vllm_api_server.py index a5b51126..992874b1 100644 --- a/vllm-tt-metal-llama3-70b/src/run_vllm_api_server.py +++ b/vllm-tt-metal-llama3-70b/src/run_vllm_api_server.py @@ -48,6 +48,7 @@ def main(): "max_model_len": "131072", "max_num_batched_tokens": "131072", "num_scheduler_steps": "10", + "max-log-len": "32", "port": os.getenv("SERVICE_PORT", "7000"), "download-dir": os.getenv("CACHE_DIR", None), "api-key": get_encoded_api_key(os.getenv("JWT_SECRET", None)), From d845f08d21ab505145fad0f71537057e8bc7d344 Mon Sep 17 00:00:00 2001 From: Tom Stesco Date: Thu, 5 Dec 2024 04:40:09 +0000 Subject: [PATCH 10/76] add InferenceServerContext to startup_utils.py, improve wait_for_healthy --- utils/startup_utils.py | 77 +++++++++++++++++++++++++++++++++++++++--- 1 file changed, 72 insertions(+), 5 deletions(-) diff --git a/utils/startup_utils.py b/utils/startup_utils.py index 33ef7f86..0da62715 100644 --- a/utils/startup_utils.py +++ b/utils/startup_utils.py @@ -5,7 +5,12 @@ import os import time import logging +import subprocess +import psutil +import signal + import requests + from utils.prompt_client_cli import ( get_authorization, ) @@ -19,11 +24,11 @@ def get_api_health_url(): DEPLOY_URL = os.getenv("DEPLOY_URL", "http://127.0.0.1") - health_url = f"{DEPLOY_URL}:{os.getenv('SERVICE_PORT', '8000')}/health" + health_url = f"{DEPLOY_URL}:{os.getenv('SERVICE_PORT', '7000')}/health" return health_url -def wait_for_healthy(base_url: str, timeout: int = 300, interval: int = 10) -> bool: +def wait_for_healthy(timeout: int = 300, interval: int = 10) -> bool: """ Check the health endpoint until the service is ready. 
""" @@ -32,6 +37,7 @@ def wait_for_healthy(base_url: str, timeout: int = 300, interval: int = 10) -> b headers = {"Authorization": f"Bearer {get_authorization()}"} total_time_waited = 0 while time.time() - start_time < timeout: + req_time = time.time() try: response = requests.get(health_url, headers=headers, timeout=interval) if response.status_code == 200: @@ -43,11 +49,72 @@ def wait_for_healthy(base_url: str, timeout: int = 300, interval: int = 10) -> b except requests.exceptions.RequestException as e: logger.warning(f"Health check failed: {e}") - total_time_waited += interval + total_time_waited = time.time() - start_time + sleep_interval = max(2 - (time.time() - req_time), 0) logger.info( - f"Service not ready after {total_time_waited} seconds, waiting {interval} seconds before polling ..." + f"Service not ready after {total_time_waited:.2f} seconds, waiting {sleep_interval:.2f} seconds before polling ..." ) - time.sleep(0.05) + time.sleep(sleep_interval) logger.error(f"Service did not become healthy within {timeout} seconds") return False + + +class InferenceServerContext: + def __init__(self, startup_script_path): + self.startup_script_path = startup_script_path + + def __enter__(self): + self.process = subprocess.Popen( + ["python", self.startup_script_path], + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + text=True, + preexec_fn=os.setsid, + ) + return self + + def __exit__(self, exc_type, exc_val, exc_tb): + if not self.process: + return + + # Log initial state + try: + parent = psutil.Process(self.process.pid) + children = parent.children(recursive=True) + logger.info(f"Found {len(children)} child processes before termination") + for child in children: + logger.info(f"Child PID: {child.pid}, Name: {child.name()}") + except psutil.NoSuchProcess: + logger.warning("Main process already terminated") + return + + # Send SIGTERM to process group + try: + os.killpg(self.process.pid, signal.SIGTERM) + logger.info(f"Sent SIGTERM to process group {self.process.pid}") + except ProcessLookupError: + logger.warning("Process group already terminated") + return + + # Wait for graceful shutdown + try: + self.process.wait(timeout=5) + logger.info("Process terminated gracefully") + except subprocess.TimeoutExpired: + logger.warning("Timeout expired, force killing process group") + try: + os.killpg(self.process.pid, signal.SIGKILL) + except ProcessLookupError: + pass + + # Final verification + try: + parent = psutil.Process(self.process.pid) + remaining = parent.children(recursive=True) + if remaining: + logger.error(f"{len(remaining)} child processes still exist") + for proc in remaining: + logger.error(f"Remaining PID: {proc.pid}, Name: {proc.name()}") + except psutil.NoSuchProcess: + logger.info("All inference server processes terminated") From 632ac83af91dd970f6b42759b2c7b4eca4d75ba3 Mon Sep 17 00:00:00 2001 From: Tom Stesco Date: Thu, 5 Dec 2024 22:57:08 +0000 Subject: [PATCH 11/76] add all_responses to utils/prompt_client_cli.py not using globals --- utils/prompt_client_cli.py | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) diff --git a/utils/prompt_client_cli.py b/utils/prompt_client_cli.py index c73372c3..611a96d1 100644 --- a/utils/prompt_client_cli.py +++ b/utils/prompt_client_cli.py @@ -58,7 +58,6 @@ def get_api_url(): # Thread-safe data collection responses_lock = threading.Lock() -responses = [] def call_inference_api( @@ -161,13 +160,13 @@ def call_inference_api( "response_idx": response_idx, "prompt": prompt, "response": full_text, - "prompt_length": 
prompt_len, - "num_completion_tokens": num_completion_tokens, + "input_seq_len": prompt_len, + "output_seq_len": num_completion_tokens, "tps": (max(num_completion_tokens, 1)) / throughput_time, "ttft": ttft, } - with responses_lock: - responses.append(response_data) + # with responses_lock: + # responses.append(response_data) return response_data @@ -250,6 +249,7 @@ def test_api_call_threaded_full_queue( f"Running {total_prompts} prompts in full queue with batch size {batch_size}." ) num_prompts = len(prompts) + all_responses = [] if batch_size == 1: logger.info("Running with single thread") for iter_num in range(num_full_iterations): @@ -267,13 +267,14 @@ def test_api_call_threaded_full_queue( ) # Write the response data to the JSONL file with responses_lock: + all_responses.append(response_data) with open(json_fpath, "a") as f: if response_counter > 0: f.write(",") json.dump(response_data, f, indent=4) response_counter += 1 logger.info( - f"Processed {response_counter}/{total_prompts} responses. Avg. TPS: {response_data['tps']:.2f}, TTFT: {response_data['ttft']:.2f}, Completion Tokens: {response_data['num_completion_tokens']}, Prompt Length: {response_data['prompt_length']}" + f"Processed {response_counter}/{total_prompts} responses. Avg. TPS: {response_data['tps']:.2f}, TTFT: {response_data['ttft']:.2f}, input_seq_len: {response_data['input_seq_len']},output_seq_len: {response_data['output_seq_len']}" ) elif batch_size > 1 and vary_batch_size: logger.info( @@ -322,13 +323,14 @@ def test_api_call_threaded_full_queue( try: response_data = future.result() with responses_lock: + all_responses.append(response_data) with open(json_fpath, "a") as f: if response_counter > 0: f.write(",") json.dump(response_data, f, indent=4) response_counter += 1 logger.info( - f"Processed {response_counter}/{total_prompts} responses. Avg. TPS: {response_data['tps']:.2f}, TTFT: {response_data['ttft']:.2f}, Completion Tokens: {response_data['num_completion_tokens']}, Prompt Length: {response_data['prompt_length']}" + f"Processed {response_counter}/{total_prompts} responses. Avg. TPS: {response_data['tps']:.2f}, TTFT: {response_data['ttft']:.2f}, input_seq_len: {response_data['input_seq_len']},output_seq_len: {response_data['output_seq_len']}" ) except Exception as e: logger.error(f"Error processing response: {e}") @@ -361,13 +363,14 @@ def test_api_call_threaded_full_queue( try: response_data = future.result() with responses_lock: + all_responses.append(response_data) with open(json_fpath, "a") as f: if response_counter > 0: f.write(",") json.dump(response_data, f, indent=4) response_counter += 1 logger.info( - f"Processed {response_counter}/{total_prompts} responses. Avg. TPS: {response_data['tps']:.2f}, TTFT: {response_data['ttft']:.2f}, Completion Tokens: {response_data['num_completion_tokens']}, Prompt Length: {response_data['prompt_length']}" + f"Processed {response_counter}/{total_prompts} responses. Avg. 
TPS: {response_data['tps']:.2f}, TTFT: {response_data['ttft']:.2f}, input_seq_len: {response_data['input_seq_len']}, output_seq_len: {response_data['output_seq_len']}" ) except Exception as e: logger.error(f"Error processing response: {e}") From f563e32b2987ecede98fbd7994358f547bc8735a Mon Sep 17 00:00:00 2001 From: Tom Stesco Date: Thu, 5 Dec 2024 23:06:20 +0000 Subject: [PATCH 12/76] adding new utils/prompt_client_cli.py using utils/prompt_client.py and utils/batch_processor.py with configs in utils/prompt_configs.py and utils/prompt_generation.py for prompt generation --- utils/batch_processor.py | 274 ++++++++++++++++++++ utils/prompt_client.py | 273 ++++++++++++++++++++ utils/prompt_client_cli.py | 507 ++++++++----------------------------- utils/prompt_configs.py | 40 +++ utils/prompt_generation.py | 117 +++------ utils/startup_utils.py | 44 ---- 6 files changed, 728 insertions(+), 527 deletions(-) create mode 100644 utils/batch_processor.py create mode 100644 utils/prompt_client.py create mode 100644 utils/prompt_configs.py diff --git a/utils/batch_processor.py b/utils/batch_processor.py new file mode 100644 index 00000000..c8f9ea90 --- /dev/null +++ b/utils/batch_processor.py @@ -0,0 +1,274 @@ +# SPDX-License-Identifier: Apache-2.0 +# +# SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC + +import threading +import logging +import json +import time +from datetime import datetime +from pathlib import Path +from typing import List +from concurrent.futures import ThreadPoolExecutor, as_completed + +import numpy as np +from transformers import AutoTokenizer + +from prompt_configs import BatchConfig +from prompt_client import PromptClient + +logging.basicConfig( + level=logging.INFO, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s" +) +logger = logging.getLogger(__name__) +logger.setLevel(logging.INFO) + + +class BatchProcessor: + def __init__(self, prompt_client: PromptClient, batch_config: BatchConfig): + self.prompt_client = prompt_client + self.batch_config = batch_config + self.responses_lock = threading.Lock() + + def _calculate_batch_sizes(self, num_prompts: int) -> List[int]: + if self.batch_config.vary_batch_size: + mean_workers = self.batch_config.batch_size / 2 + std_dev = self.batch_config.batch_size / 4 + + batch_sizes = [] + remaining = num_prompts + + while remaining > 0: + size = int( + np.clip( + np.random.normal(mean_workers, std_dev), + 1, + self.batch_config.batch_size, + ) + ) + if size > remaining: + size = remaining + batch_sizes.append(size) + remaining -= size + + return batch_sizes + + return [self.batch_config.batch_size] * ( + num_prompts // self.batch_config.batch_size + ) + + def process_batch( + self, + prompts: List[str], + input_seq_lengths: List[int], + tokenizer: AutoTokenizer, + ) -> List[dict]: + timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S") + json_fpath = ( + Path(self.prompt_client.env_config.cache_root) + / f"alpaca_eval_responses_{timestamp}.json" + ) + + total_prompts = len(prompts) * self.batch_config.num_full_iterations + response_counter = 0 + all_responses = [] + + with open(json_fpath, "a") as f: + f.write("[\n") + + if self.batch_config.batch_size == 1: + all_responses = self._process_single_thread( + prompts, + input_seq_lengths, + tokenizer, + json_fpath, + total_prompts, + response_counter, + ) + else: + all_responses = self._process_multi_thread( + prompts, + input_seq_lengths, + tokenizer, + json_fpath, + total_prompts, + response_counter, + ) + + with open(json_fpath, "a") as f: + f.write("\n]") + + return 
all_responses + + def _process_single_thread( + self, + prompts: List[str], + input_seq_lengths: List[int], + tokenizer: AutoTokenizer, + json_fpath: Path, + total_prompts: int, + response_counter: int, + ) -> List[dict]: + all_responses = [] + + for iter_num in range(self.batch_config.num_full_iterations): + for i, (prompt, isl) in enumerate(zip(prompts, input_seq_lengths)): + if self.batch_config.inter_batch_delay > 0: + time.sleep(self.batch_config.inter_batch_delay) + + response_idx = iter_num * len(prompts) + i + response_data = self.prompt_client.call_inference( + prompt=prompt, + response_idx=response_idx, + prompt_len=isl, + max_tokens=self.batch_config.output_seq_lens[i], + stream=self.batch_config.stream, + vll_model=self.batch_config.vllm_model, + tokenizer=tokenizer, + ) + + self._save_response( + response_data, all_responses, json_fpath, response_counter + ) + response_counter += 1 + self._log_progress(response_counter, total_prompts, response_data) + + return all_responses + + def _process_multi_thread( + self, + prompts: List[str], + input_seq_lengths: List[int], + tokenizer: AutoTokenizer, + json_fpath: Path, + total_prompts: int, + response_counter: int, + ) -> List[dict]: + all_responses = [] + + if self.batch_config.vary_batch_size: + batch_sizes = self._calculate_batch_sizes(len(prompts)) + + for iter_num in range(self.batch_config.num_full_iterations): + batch_start = 0 + + for bsz in batch_sizes: + batch_end = min(batch_start + bsz, len(prompts)) + self._process_batch_chunk( + prompts[batch_start:batch_end], + input_seq_lengths[batch_start:batch_end], + iter_num, + bsz, + tokenizer, + all_responses, + json_fpath, + total_prompts, + response_counter, + ) + batch_start = batch_end + else: + with ThreadPoolExecutor( + max_workers=self.batch_config.batch_size + ) as executor: + futures = [] + + for iter_num in range(self.batch_config.num_full_iterations): + for i, (prompt, isl) in enumerate(zip(prompts, input_seq_lengths)): + response_idx = iter_num * len(prompts) + i + future = executor.submit( + self.prompt_client.call_inference, + prompt=prompt, + response_idx=response_idx, + prompt_len=isl, + max_tokens=self.batch_config.output_seq_lens[i], + stream=self.batch_config.stream, + vll_model=self.batch_config.vllm_model, + tokenizer=tokenizer, + ) + futures.append(future) + + for future in as_completed(futures): + try: + response_data = future.result() + self._save_response( + response_data, all_responses, json_fpath, response_counter + ) + response_counter += 1 + self._log_progress( + response_counter, total_prompts, response_data + ) + except Exception as e: + logger.error(f"Error processing response: {e}") + + return all_responses + + def _process_batch_chunk( + self, + batch_prompts: List[str], + batch_input_seq_lengths: List[int], + iter_num: int, + batch_size: int, + tokenizer: AutoTokenizer, + all_responses: List[dict], + json_fpath: Path, + total_prompts: int, + response_counter: int, + ): + if self.batch_config.inter_batch_delay > 0: + time.sleep(self.batch_config.inter_batch_delay) + + with ThreadPoolExecutor(max_workers=batch_size) as executor: + futures = [] + + for i, (prompt, isl) in enumerate( + zip(batch_prompts, batch_input_seq_lengths) + ): + response_idx = iter_num * len(batch_prompts) + i + future = executor.submit( + self.prompt_client.call_inference, + prompt=prompt, + response_idx=response_idx, + prompt_len=isl, + max_tokens=self.batch_config.output_seq_lens[i], + stream=self.batch_config.stream, + vll_model=self.batch_config.vllm_model, + 
tokenizer=tokenizer, + ) + futures.append(future) + + for future in as_completed(futures): + try: + response_data = future.result() + self._save_response( + response_data, all_responses, json_fpath, response_counter + ) + response_counter += 1 + self._log_progress(response_counter, total_prompts, response_data) + except Exception as e: + logger.error(f"Error processing response: {e}") + + def _save_response( + self, + response_data: dict, + all_responses: List[dict], + json_fpath: Path, + response_counter: int, + ): + with self.responses_lock: + all_responses.append(response_data) + with open(json_fpath, "a") as f: + if response_counter > 0: + f.write(",") + json.dump(response_data, f, indent=4) + + def _log_progress( + self, response_counter: int, total_prompts: int, response_data: dict + ): + logger.info( + f"Processed {response_counter}/{total_prompts} responses. " + f"decode_tps: {response_data['decode_tps']:.2f}, " + f"total_tps: {response_data['total_tps']:.2f}, " + f"ttft: {response_data['ttft']:.2f}, " + f"input_seq_len: {response_data['input_seq_len']}, " + f"output_seq_len: {response_data['output_seq_len']}" + ) diff --git a/utils/prompt_client.py b/utils/prompt_client.py new file mode 100644 index 00000000..00473045 --- /dev/null +++ b/utils/prompt_client.py @@ -0,0 +1,273 @@ +# SPDX-License-Identifier: Apache-2.0 +# +# SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC + +import logging +import json +import time +from typing import List + +import requests +import jwt +from transformers import AutoTokenizer + +from prompt_generation import generate_prompts +from prompt_configs import PromptConfig, EnvironmentConfig + +logging.basicConfig( + level=logging.INFO, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s" +) +logger = logging.getLogger(__name__) +logger.setLevel(logging.INFO) + + +class PromptClient: + def __init__(self, env_config: EnvironmentConfig): + self.env_config = env_config + self.headers = {"Authorization": f"Bearer {self._get_authorization()}"} + self.completions_url = self._get_api_completions_url() + self.health_url = self._get_api_health_url() + self.server_ready = False + + def _get_authorization(self) -> str: + if self.env_config.authorization: + return self.env_config.authorization + + if self.env_config.jwt_secret: + json_payload = json.loads( + '{"team_id": "tenstorrent", "token_id":"debug-test"}' + ) + encoded_jwt = jwt.encode( + json_payload, self.env_config.jwt_secret, algorithm="HS256" + ) + return encoded_jwt + + raise ValueError( + "Neither AUTHORIZATION or JWT_SECRET environment variables are set." + ) + + def _get_api_base_url(self) -> str: + return f"{self.env_config.deploy_url}:{self.env_config.service_port}/v1" + + def _get_api_completions_url(self) -> str: + return f"{self._get_api_base_url()}/completions" + + def _get_api_health_url(self) -> str: + return f"{self._get_api_base_url()}/health" + + def get_health(self) -> requests.Response: + return requests.get(self.health_url, headers=self.headers) + + def wait_for_healthy(self, timeout: int = 300, interval: int = 10) -> bool: + if self.server_ready: + return True + + start_time = time.time() + total_time_waited = 0 + + while time.time() - start_time < timeout: + req_time = time.time() + try: + response = requests.get( + self.health_url, headers=self.headers, timeout=interval + ) + if response.status_code == 200: + startup_time = time.time() - start_time + logger.info( + f"vLLM service is healthy. 
startup_time:= {startup_time} seconds" + ) + self.server_ready = True + return True + + except requests.exceptions.RequestException as e: + logger.warning(f"Health check failed: {e}") + + total_time_waited = time.time() - start_time + sleep_interval = max(2 - (time.time() - req_time), 0) + logger.info( + f"Service not ready after {total_time_waited:.2f} seconds, " + f"waiting {sleep_interval:.2f} seconds before polling ..." + ) + time.sleep(sleep_interval) + + logger.error(f"Service did not become healthy within {timeout} seconds") + return False + + def capture_traces( + self, + input_sizes: List[int] = None, + prompts_per_size: int = 1, + output_seq_len: int = 1, + ) -> None: + logger.info("Capturing input sizes ...") + + # Default input sizes based on get_padded_prefill_len() + if input_sizes is None: + input_sizes = [32, 64, 128, 256, 512, 1024, 2048, 3072, 4096] + + # Check service health before starting + if not self.wait_for_healthy(): + raise RuntimeError("vLLM did not start correctly!") + + for size in input_sizes: + logger.info(f"Capture input size: {size}") + + # Create prompt config for current size + prompt_config = PromptConfig( + input_seq_len=size, + max_prompt_length=size, + num_prompts=prompts_per_size, + distribution="fixed", + dataset="random", + tokenizer_model=self.env_config.vllm_model, + template=None, + save_path=None, + print_prompts=False, + ) + + # Generate prompts for current size + prompts, prompt_lengths = generate_prompts(prompt_config) + + # Process each prompt + for i, (prompt, prompt_len) in enumerate(zip(prompts, prompt_lengths)): + try: + logger.info(f"Starting capture for input_seq_len: {prompt_len}") + response_data = self.call_inference( + prompt=prompt, + response_idx=i, + prompt_len=prompt_len, + max_tokens=output_seq_len, + stream=True, + vll_model=self.env_config.vllm_model, + tokenizer=None, + force_max_tokens=True, + ) + logger.info( + f"Input size: {size}, " + f"input_seq_len: {prompt_len}, " + f"TTFT: {response_data['ttft']:.3f}s" + ) + except Exception as e: + logger.error(f"Error processing prompt: {e}") + + def call_inference( + self, + prompt: str, + response_idx: int, + prompt_len: int, + max_tokens: int, + stream: bool, + vll_model: str, + tokenizer: AutoTokenizer, + force_max_tokens: bool = True, + include_usage: bool = True, + ) -> dict: + json_data = { + "model": vll_model, + "prompt": prompt, + "temperature": 1, + "top_k": 20, + "top_p": 0.9, + "max_tokens": max_tokens, + "stream": stream, + "stream_options": {"include_usage": include_usage}, + } + + if force_max_tokens: + json_data["stop"] = "<|reserved_special_token_249|>" + + req_time = time.perf_counter() + response = requests.post( + self.completions_url, + json=json_data, + headers=self.headers, + stream=stream, + timeout=600, + ) + + return self._process_response( + response, req_time, response_idx, prompt, prompt_len, max_tokens, stream + ) + + def _process_response( + self, + response: requests.Response, + req_time: float, + response_idx: int, + prompt: str, + prompt_len: int, + max_tokens: int, + stream: bool, + ) -> dict: + full_text = "" + num_completion_tokens = 0 + first_token_time = 0 + ttft = 0 + usage_dict = {} + + if stream: + assert ( + response.headers.get("transfer-encoding") == "chunked" + ), "Response is not chunked" + for line in response.iter_lines(decode_unicode=True): + if line and line.startswith("data: "): + if num_completion_tokens == 0: + first_token_time = time.perf_counter() + ttft = first_token_time - req_time + + data_str = line[len("data: ") 
:].strip() + if data_str == "[DONE]": + break + + try: + data = json.loads(data_str) + if data["choices"]: + full_text += data["choices"][0].get("text", "") + num_completion_tokens += 1 + else: + usage_dict = data.get("usage", {}) + except json.JSONDecodeError as e: + logger.error(f"Failed to decode JSON: {e}") + continue + else: + data = response.json() + full_text = data["choices"][0]["text"] + usage_dict = data["usage"] + first_token_time = req_time + + decode_time = max(time.perf_counter() - first_token_time, 0.0001) + total_time = max(time.perf_counter() - req_time, 0.0001) + + # verify the number of input tokens + isl_diff = usage_dict["prompt_tokens"] - prompt_len + if isl_diff != 0: + logger.warning( + f"response_idx=:{response_idx}, isl_diff(actual - expected) =: {isl_diff}" + ) + + # verify the number of output tokens + usage_completion_tokens = usage_dict["completion_tokens"] + if num_completion_tokens > 0: + osl_diff = usage_completion_tokens - num_completion_tokens + if osl_diff != 0: + logger.warning( + f"response_idx=:{response_idx}, osl_diff(actual - expected) =: {osl_diff}" + ) + if ( + max_tokens != usage_completion_tokens + or max_tokens != num_completion_tokens + ): + logger.warning( + f"response_idx=:{response_idx}, max_tokens=:{max_tokens}, num_completion_tokens=:{num_completion_tokens}, usage_completion_tokens:={usage_completion_tokens}" + ) + + return { + "response_idx": response_idx, + "prompt": prompt, + "response": full_text, + "input_seq_len": prompt_len, + "output_seq_len": num_completion_tokens, + "decode_tps": (max(num_completion_tokens, 1)) / decode_time, + "total_tps": (max(num_completion_tokens, 1)) / total_time, + "ttft": ttft, + } diff --git a/utils/prompt_client_cli.py b/utils/prompt_client_cli.py index 611a96d1..c671ca33 100644 --- a/utils/prompt_client_cli.py +++ b/utils/prompt_client_cli.py @@ -3,22 +3,16 @@ # SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC import os -import getpass -import threading import logging -import json import argparse -import time -from datetime import datetime -import requests -from pathlib import Path -from concurrent.futures import ThreadPoolExecutor, as_completed -import jwt import numpy as np from transformers import AutoTokenizer -from utils.prompt_generation import add_prompt_gen_args, generate_prompts +from prompt_configs import PromptConfig, BatchConfig, EnvironmentConfig +from prompt_client import PromptClient +from batch_processor import BatchProcessor +from prompt_generation import generate_prompts logging.basicConfig( level=logging.INFO, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s" @@ -26,393 +20,6 @@ logger = logging.getLogger(__name__) logger.setLevel(logging.INFO) -# set numpy seed for reproducibility -np.random.seed(42) - - -def get_authorization(): - authorization = os.getenv("AUTHORIZATION", None) - if authorization is None: - jwt_secret = os.getenv("JWT_SECRET", None) - if jwt_secret is None: - raise ValueError( - "Neither AUTHORIZATION or JWT_SECRET environment variables are set." 
- ) - json_payload = json.loads('{"team_id": "tenstorrent", "token_id":"debug-test"}') - encoded_jwt = jwt.encode(json_payload, jwt_secret, algorithm="HS256") - authorization = f"{encoded_jwt}" - return authorization - - -def get_api_base_url(): - DEPLOY_URL = os.getenv("DEPLOY_URL", "http://127.0.0.1") - base_url = f"{DEPLOY_URL}:{os.getenv('SERVICE_PORT', '8000')}/v1" - return base_url - - -def get_api_url(): - base_url = get_api_base_url() - api_url = f"{base_url}/completions" - return api_url - - -# Thread-safe data collection -responses_lock = threading.Lock() - - -def call_inference_api( - prompt, - response_idx, - prompt_len, - stream, - headers, - api_url, - max_tokens, - vll_model, - tokenizer, - force_max_tokens=True, -): - # set API prompt and optional parameters - json_data = { - "model": vll_model, - "prompt": prompt, - "temperature": 1, - "top_k": 20, - "top_p": 0.9, - "max_tokens": max_tokens, - "stream": stream, - "stream_options": {"include_usage": True}, - } - if force_max_tokens: - # use a reserved special token avoid the model to stopping before osl reached - json_data["stop"] = "<|reserved_special_token_249|>" - req_time = time.time() - # using requests stream=True, make sure to set a timeout - response = requests.post( - api_url, json=json_data, headers=headers, stream=stream, timeout=600 - ) - # Handle chunked response - full_text = "" - num_completion_tokens = 0 - first_token_time = 0 - ttft = 0 - if stream: - if response.headers.get("transfer-encoding") == "chunked": - for line in response.iter_lines(decode_unicode=True): - # Process each line of data as it's received - if line: - # Remove the 'data: ' prefix - if line.startswith("data: "): - if num_completion_tokens == 0: - first_token_time = time.time() - ttft = first_token_time - req_time - data_str = line[len("data: ") :].strip() - if data_str == "[DONE]": - break - try: - # Parse the JSON data - data = json.loads(data_str) - # Extract text from the 'choices' field - if data["choices"]: - num_completion_tokens += 1 - content = data["choices"][0].get("text", "") - full_text += content - else: - # final response has complete usage - usage_dict = data.get("usage", {}) - - except json.JSONDecodeError as e: - print(f"Failed to decode JSON: {e}") - continue - else: - raise ValueError("Response is not chunked") - else: - data = response.json() - full_text = data["choices"][0]["text"] - usage_dict = data["usage"] - usage_completion_tokens = usage_dict["completion_tokens"] - # conservatively set the first token time to the request time - first_token_time = req_time - logger.info(f"usage: {data['usage']}") - - # verify the number of input tokens - isl_diff = usage_dict["prompt_tokens"] - prompt_len - if isl_diff != 0: - logger.warning( - f"response_idx=:{response_idx}, isl_diff(actual - expected) =: {isl_diff}" - ) - - # verify the number of output tokens - usage_completion_tokens = usage_dict["completion_tokens"] - if num_completion_tokens > 0: - osl_diff = usage_completion_tokens - num_completion_tokens - if osl_diff != 0: - logger.warning( - f"response_idx=:{response_idx}, osl_diff(actual - expected) =: {osl_diff}" - ) - if max_tokens != usage_completion_tokens or max_tokens != num_completion_tokens: - logger.warning( - f"response_idx=:{response_idx}, max_tokens=:{max_tokens}, num_completion_tokens=:{num_completion_tokens}, usage_completion_tokens:={usage_completion_tokens}" - ) - - throughput_time = max(time.time() - first_token_time, 0.0001) - response_data = { - "response_idx": response_idx, - "prompt": prompt, - 
"response": full_text, - "input_seq_len": prompt_len, - "output_seq_len": num_completion_tokens, - "tps": (max(num_completion_tokens, 1)) / throughput_time, - "ttft": ttft, - } - # with responses_lock: - # responses.append(response_data) - return response_data - - -def check_json_fpath(json_fpath): - directory = os.path.dirname(json_fpath) - user = getpass.getuser() - if os.access(directory, os.W_OK): - try: - with open(json_fpath, "w") as f: - f.write("") # Attempt to write an empty string to the file - logger.info(f"The file '{json_fpath}' can be created and is writable.") - return True, "" - except IOError as err: - err_msg = f"Cannot write to the file '{json_fpath}'. Reason: {err}" - else: - err_msg = ( - f"User:={user} cannot write to file:={json_fpath} in directory:={directory}" - ) - logger.error(err_msg) - return False, err_msg - - -def handle_delay(delay): - if delay > 0: - logger.info(f"Sleeping for {delay} seconds...") - time.sleep(delay) - - -def calculate_batch_sizes(num_prompts, max_batch_size, vary_batch_size): - """Calculate normally distributed batch sizes that sum to total_items""" - if vary_batch_size: - mean_workers = max_batch_size / 2 - std_dev = max_batch_size / 4 - - batch_sizes = [] - remaining = num_prompts - - while remaining > 0: - size = int( - np.clip(np.random.normal(mean_workers, std_dev), 1, max_batch_size) - ) - if size > remaining: - size = remaining - batch_sizes.append(size) - remaining -= size - - else: - batch_sizes = [max_batch_size] * (num_prompts // max_batch_size) - - return batch_sizes - - -def test_api_call_threaded_full_queue( - prompts, - input_seq_lengths, - output_seq_lengths, - batch_size, - num_full_iterations, - vary_batch_size, - inter_batch_delay, - call_func, - call_func_kwargs, -): - timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S") - cache_root = Path(os.getenv("CACHE_ROOT", ".")) - json_fpath = cache_root / f"alpaca_eval_responses_{timestamp}.json" - logger.info(f"Will write output to: {json_fpath}") - can_write, err_msg = check_json_fpath(json_fpath) - if not can_write: - err_msg += ( - f"\nNote: CACHE_ROOT:={cache_root}, consider setting in this shell to $PWD" - ) - assert can_write, err_msg - with open(json_fpath, "a") as f: - f.write("[\n") - - total_prompts = len(prompts) * num_full_iterations - response_counter = 0 - logger.info( - f"Running {total_prompts} prompts in full queue with batch size {batch_size}." - ) - num_prompts = len(prompts) - all_responses = [] - if batch_size == 1: - logger.info("Running with single thread") - for iter_num in range(num_full_iterations): - for i, (prompt, isl, osl) in enumerate( - zip(prompts, input_seq_lengths, output_seq_lengths) - ): - handle_delay(inter_batch_delay) - response_idx = iter_num * num_prompts + i - response_data = call_func( - prompt=prompt, - response_idx=response_idx, - prompt_len=isl, - max_tokens=osl, - **call_func_kwargs, - ) - # Write the response data to the JSONL file - with responses_lock: - all_responses.append(response_data) - with open(json_fpath, "a") as f: - if response_counter > 0: - f.write(",") - json.dump(response_data, f, indent=4) - response_counter += 1 - logger.info( - f"Processed {response_counter}/{total_prompts} responses. Avg. 
TPS: {response_data['tps']:.2f}, TTFT: {response_data['ttft']:.2f}, input_seq_len: {response_data['input_seq_len']},output_seq_len: {response_data['output_seq_len']}" - ) - elif batch_size > 1 and vary_batch_size: - logger.info( - f"Running with ThreadPoolExecutor: batch_size={batch_size}, vary_batch_size={vary_batch_size}" - ) - batch_sizes = calculate_batch_sizes( - num_prompts=num_prompts, - max_batch_size=batch_size, - vary_batch_size=True, - ) - - # Process prompts in batches with varying sizes - for iter_num in range(num_full_iterations): - batch_start = 0 - - for bsz in batch_sizes: - batch_end = min(batch_start + bsz, num_prompts) - batch_prompts = prompts[batch_start:batch_end] - batch_input_seq_lengths = input_seq_lengths[batch_start:batch_end] - batch_output_seq_lengths = output_seq_lengths[batch_start:batch_end] - handle_delay(inter_batch_delay) - # Submit all prompts in the current batch - logger.info(f"Sending batch requests: {bsz}") - with ThreadPoolExecutor(max_workers=bsz) as executor: - futures = [] - - for i, (prompt, isl, osl) in enumerate( - zip( - batch_prompts, - batch_input_seq_lengths, - batch_output_seq_lengths, - ) - ): - response_idx = iter_num * num_prompts + i - future = executor.submit( - call_func, - prompt=prompt, - response_idx=response_idx, - prompt_len=isl, - max_tokens=osl, - **call_func_kwargs, - ) - futures.append(future) - # Wait for all futures in this batch to complete - for future in as_completed(futures): - try: - response_data = future.result() - with responses_lock: - all_responses.append(response_data) - with open(json_fpath, "a") as f: - if response_counter > 0: - f.write(",") - json.dump(response_data, f, indent=4) - response_counter += 1 - logger.info( - f"Processed {response_counter}/{total_prompts} responses. Avg. TPS: {response_data['tps']:.2f}, TTFT: {response_data['ttft']:.2f}, input_seq_len: {response_data['input_seq_len']},output_seq_len: {response_data['output_seq_len']}" - ) - except Exception as e: - logger.error(f"Error processing response: {e}") - elif batch_size > 1 and not vary_batch_size: - logger.info( - f"Running with ThreadPoolExecutor: batch_size={batch_size}, vary_batch_size={vary_batch_size}" - ) - # Process all prompts concurrently up to batch_size limit - with ThreadPoolExecutor(max_workers=batch_size) as executor: - futures = [] - - # Submit all prompts across all iterations - for iter_num in range(num_full_iterations): - for i, (prompt, isl, osl) in enumerate( - zip(prompts, input_seq_lengths, output_seq_lengths) - ): - response_idx = iter_num * num_prompts + i - future = executor.submit( - call_func, - prompt=prompt, - response_idx=response_idx, - prompt_len=isl, - max_tokens=osl, - **call_func_kwargs, - ) - futures.append(future) - - # Process completed futures as they finish - for future in as_completed(futures): - try: - response_data = future.result() - with responses_lock: - all_responses.append(response_data) - with open(json_fpath, "a") as f: - if response_counter > 0: - f.write(",") - json.dump(response_data, f, indent=4) - response_counter += 1 - logger.info( - f"Processed {response_counter}/{total_prompts} responses. Avg. 
TPS: {response_data['tps']:.2f}, TTFT: {response_data['ttft']:.2f}, input_seq_len: {response_data['input_seq_len']}, output_seq_len: {response_data['output_seq_len']}" - ) - except Exception as e: - logger.error(f"Error processing response: {e}") - - logger.info(f"Finished all requests, total responses: {response_counter}") - with open(json_fpath, "a") as f: - f.write("\n]") - - -def main(): - parser = argparse.ArgumentParser(description="Run Alpaca Evaluation Inference.") - parser = add_client_args(parser) - parser = add_prompt_gen_args(parser) - args = parser.parse_args() - - # generate prompts - tokenizer = AutoTokenizer.from_pretrained(args.tokenizer_model) - prompts, input_seq_lengths = generate_prompts(args) - output_seq_lengths = [args.output_seq_len] * len(prompts) - - headers = {"Authorization": f"Bearer {get_authorization()}"} - api_url = get_api_url() - logging.info(f"API_URL: {api_url}") - test_api_call_threaded_full_queue( - prompts=prompts, - input_seq_lengths=input_seq_lengths, - output_seq_lengths=output_seq_lengths, - batch_size=args.batch_size, - num_full_iterations=args.num_full_iterations, - vary_batch_size=args.vary_batch_size, - inter_batch_delay=args.inter_batch_delay, - call_func=call_inference_api, - call_func_kwargs={ - "stream": not args.no_stream, - "headers": headers, - "api_url": api_url, - "vll_model": args.vllm_model, - "tokenizer": tokenizer, - "force_max_tokens": True, - }, - ) - def add_client_args(parser): parser.add_argument( @@ -445,7 +52,7 @@ def add_client_args(parser): "--input_seq_len", type=int, default=-1, - help="Length parameter of the input sequence when using random prompts (not given dataset).", + help="Length parameter of the input sequence when using random prompts.", ) parser.add_argument( "--output_seq_len", @@ -464,8 +71,112 @@ def add_client_args(parser): action="store_true", help="Randomize normally the batch size for each batch of prompts.", ) + parser.add_argument( + "--max_prompt_length", + type=int, + required=True, + help="Maximum length of generated prompts.", + ) + parser.add_argument( + "--distribution", + type=str, + default="fixed", + choices=["fixed", "uniform", "normal"], + help="Distribution method for selecting random prompt lengths.", + ) + parser.add_argument( + "--dataset", + type=str, + default="random", + help="The name of the dataset to generate prompts from, or 'random' for random generation.", + ) + parser.add_argument( + "--tokenizer_model", + type=str, + default=None, + help="The model tokenizer to use for vocabulary, truncation, and templating.", + ) + parser.add_argument( + "--template", + type=str, + default=None, + help="Provided jinja2 template to apply to the generated prompts.", + ) + parser.add_argument( + "--save_path", + type=str, + default=None, + help="Path to save the generated prompts in JSONL format.", + ) + parser.add_argument( + "--print_prompts", + action="store_true", + default=False, + help="Print generated prompts.", + ) return parser +def main(): + # set numpy seed for reproducibility + np.random.seed(42) + + parser = argparse.ArgumentParser() + parser = add_client_args(parser) + args = parser.parse_args() + + # Create configs from arguments + prompt_config = PromptConfig( + input_seq_len=args.input_seq_len, + max_prompt_length=args.max_prompt_length, + num_prompts=args.num_prompts, + distribution=args.distribution, + dataset=args.dataset, + tokenizer_model=args.tokenizer_model or args.vllm_model, + template=args.template, + save_path=args.save_path, + print_prompts=args.print_prompts, + ) 
+ + output_seq_lens = [args.output_seq_len] * args.num_prompts + + batch_config = BatchConfig( + batch_size=args.batch_size, + output_seq_lens=output_seq_lens, + num_full_iterations=args.num_full_iterations, + vary_batch_size=args.vary_batch_size, + inter_batch_delay=args.inter_batch_delay, + vllm_model=args.vllm_model, + stream=not args.no_stream, + ) + + env_config = EnvironmentConfig() + + # Initialize components + tokenizer = AutoTokenizer.from_pretrained(prompt_config.tokenizer_model) + prompt_client = PromptClient(env_config) + batch_processor = BatchProcessor(prompt_client, batch_config) + + # Generate prompts + prompts, input_seq_lengths = generate_prompts(prompt_config) + + # Process batches + logger.info(f"Starting batch processing with batch_size={batch_config.batch_size}") + responses = batch_processor.process_batch( + prompts=prompts, input_seq_lengths=input_seq_lengths, tokenizer=tokenizer + ) + + logger.info(f"Completed processing {len(responses)} responses") + + # Calculate and log summary statistics + if responses: + mean_decode_tps = np.mean([r["decode_tps"] for r in responses]) + mean_total_tps = np.mean([r["total_tps"] for r in responses]) + mean_ttft = np.mean([r["ttft"] for r in responses]) + logger.info(f"Mean Decode TPS: {mean_decode_tps:.2f}") + logger.info(f"Mean Total TPS: {mean_total_tps:.2f}") + logger.info(f"Mean TTFT: {mean_ttft:.2f}") + + if __name__ == "__main__": main() diff --git a/utils/prompt_configs.py b/utils/prompt_configs.py new file mode 100644 index 00000000..04d0fd67 --- /dev/null +++ b/utils/prompt_configs.py @@ -0,0 +1,40 @@ +# SPDX-License-Identifier: Apache-2.0 +# +# SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC + +from dataclasses import dataclass +from typing import List, Optional +import os + + +@dataclass +class PromptConfig: + input_seq_len: int + max_prompt_length: int + num_prompts: int + distribution: str = "fixed" + dataset: str = "random" + tokenizer_model: str = "meta-llama/Llama-3.1-70B-Instruct" + template: Optional[str] = None + save_path: Optional[str] = None + print_prompts: bool = False + + +@dataclass +class BatchConfig: + batch_size: int + output_seq_lens: List[int] + num_full_iterations: int = 1 + vary_batch_size: bool = False + inter_batch_delay: int = 0 + vllm_model: str = "meta-llama/Llama-3.1-70B-Instruct" + stream: bool = True + + +@dataclass +class EnvironmentConfig: + authorization: Optional[str] = os.environ.get("AUTHORIZATION") + jwt_secret: Optional[str] = os.environ.get("JWT_SECRET") + deploy_url: str = os.environ.get("DEPLOY_URL", "http://127.0.0.1") + service_port: str = os.environ.get("SERVICE_PORT", "8000") + cache_root: str = os.environ.get("CACHE_ROOT", ".") diff --git a/utils/prompt_generation.py b/utils/prompt_generation.py index f96e0c6d..07269531 100644 --- a/utils/prompt_generation.py +++ b/utils/prompt_generation.py @@ -5,7 +5,6 @@ import os from pathlib import Path import logging -import argparse import json from datetime import date @@ -14,6 +13,8 @@ from datasets import load_dataset from transformers import AutoTokenizer +from prompt_configs import PromptConfig + logging.basicConfig( level=logging.INFO, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s" @@ -247,49 +248,56 @@ def process_prompts(prompts, max_length, template, tokenizer_model): # Main function to handle prompt generation and templating -def generate_prompts(args): - logging.info(f"generate_prompts args={args}") +def generate_prompts(prompt_config: PromptConfig): + logging.info(f"generate_prompts args={prompt_config}") 
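Beyond the CLI entry point above, the split into `PromptConfig`, `BatchConfig`, `EnvironmentConfig`, `PromptClient`, and `BatchProcessor` makes it possible to drive a run from a few lines of Python. A hedged sketch under the `utils.`-prefixed import paths the series settles on later; the parameter values are arbitrary examples:

```
from transformers import AutoTokenizer

from utils.prompt_configs import PromptConfig, BatchConfig, EnvironmentConfig
from utils.prompt_client import PromptClient
from utils.batch_processor import BatchProcessor
from utils.prompt_generation import generate_prompts

prompt_config = PromptConfig(input_seq_len=128, max_prompt_length=128, num_prompts=4)
prompts, input_seq_lengths = generate_prompts(prompt_config)

batch_config = BatchConfig(batch_size=4, output_seq_lens=[128] * len(prompts))
client = PromptClient(EnvironmentConfig())
tokenizer = AutoTokenizer.from_pretrained(prompt_config.tokenizer_model)

responses = BatchProcessor(client, batch_config).process_batch(
    prompts=prompts, input_seq_lengths=input_seq_lengths, tokenizer=tokenizer
)
```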
# vLLM appears to add extra token on receipt of prompt # TODO: verify if this is bos token or something else - args.max_prompt_length = args.max_prompt_length - 1 - if args.input_seq_len == -1: - args.input_seq_len = args.max_prompt_length + prompt_config.max_prompt_length = prompt_config.max_prompt_length - 1 + if prompt_config.input_seq_len == -1: + prompt_config.input_seq_len = prompt_config.max_prompt_length else: - args.input_seq_len = args.input_seq_len - 1 + prompt_config.input_seq_len = prompt_config.input_seq_len - 1 - if args.dataset.lower() == "random": + if prompt_config.dataset.lower() == "random": # default case logger.info("Generating random prompts...") # -1 is for the extra token added by vLLM - assert args.input_seq_len > -1, "input_seq_len must be set for random prompts." - assert args.max_prompt_length > -1, "max_length must be set for random prompts." + assert ( + prompt_config.input_seq_len > -1 + ), "input_seq_len must be set for random prompts." + assert ( + prompt_config.max_prompt_length > -1 + ), "max_length must be set for random prompts." prompts = generate_random_prompts( - args.num_prompts, - args.max_prompt_length, - args.input_seq_len, - args.distribution, - args.tokenizer_model, + prompt_config.num_prompts, + prompt_config.max_prompt_length, + prompt_config.input_seq_len, + prompt_config.distribution, + prompt_config.tokenizer_model, ) - elif args.dataset is not None: + elif prompt_config.dataset is not None: assert ( - args.max_prompt_length > -1 + prompt_config.max_prompt_length > -1 ), "max_length must be set for datasets prompts." - logger.info(f"Generating prompts from the '{args.dataset}' dataset...") - if args.dataset == "alpaca_eval": - prompts = load_alpaca_eval_dataset_samples(args.num_prompts) + logger.info(f"Generating prompts from the '{prompt_config.dataset}' dataset...") + if prompt_config.dataset == "alpaca_eval": + prompts = load_alpaca_eval_dataset_samples(prompt_config.num_prompts) else: raise ValueError("Dataset must be provided.") prompts, prompt_lengths = process_prompts( - prompts, args.max_prompt_length, args.template, args.tokenizer_model + prompts, + prompt_config.max_prompt_length, + prompt_config.template, + prompt_config.tokenizer_model, ) # Add 1 to prompt lengths to account for the extra token added by vLLM prompt_lengths = [pl + 1 for pl in prompt_lengths] - print_prompts = (args.num_prompts < 5) and args.print_prompts + print_prompts = (prompt_config.num_prompts < 5) and prompt_config.print_prompts # Save prompts to a JSONL file if a save path is provided - if args.save_path: - file_path = Path(args.save_path).resolve() + if prompt_config.save_path: + file_path = Path(prompt_config.save_path).resolve() try: with open(file_path, "w") as f: for prompt in prompts: @@ -306,64 +314,3 @@ def generate_prompts(args): print(f"prompt {idx}:\n{prompt}") return prompts, prompt_lengths - - -def add_prompt_gen_args(parser): - parser.add_argument( - "--tokenizer_model", - type=str, - default=None, - help="The model tokenizer to use for vocabulary, truncation, and templating.", - ) - parser.add_argument( - "--dataset", - type=str, - default="random", - help="The name of the dataset to generate prompts from, or 'random' for random token generation.", - ) - parser.add_argument( - "--max_prompt_length", - type=int, - required=True, - help="Maximum length of generated prompts.", - ) - parser.add_argument( - "--distribution", - type=str, - default="fixed", - choices=[ - "fixed", - "uniform", - "normal", - ], - help="Distribution method for 
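The length bookkeeping above is easy to misread: requested lengths are trimmed by one before generation to leave room for the extra token vLLM prepends, and reported lengths add that one back. A tiny worked illustration (the numbers are arbitrary):

```
# Illustration of the extra-token accounting in generate_prompts() above:
requested_isl = 128                  # length the caller asks for
generated_len = requested_isl - 1    # prompts are built one token short ...
reported_isl = generated_len + 1     # ... and reported including the token vLLM adds
assert reported_isl == requested_isl
```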
selecting random prompt lengths ('fixed', 'uniform', 'normal').", - ) - parser.add_argument( - "--template", - type=str, - default=None, - help="Provided jinja2 template to apply to the generated prompts.", - ) - parser.add_argument( - "--save_path", - type=str, - default=None, - help="Path to save the generated prompts in JSONL format.", - ) - parser.add_argument( - "--print_prompts", - action="store_true", - default=False, - help="Print generated prompts if there arent more than 5.", - ) - return parser - - -if __name__ == "__main__": - parser = argparse.ArgumentParser(description="Generate prompts.") - parser = add_prompt_gen_args(parser) - args = parser.parse_args() - try: - generate_prompts(args) - except ValueError as e: - print(e) diff --git a/utils/startup_utils.py b/utils/startup_utils.py index 0da62715..05cb616f 100644 --- a/utils/startup_utils.py +++ b/utils/startup_utils.py @@ -3,17 +3,11 @@ # SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC import os -import time import logging import subprocess import psutil import signal -import requests - -from utils.prompt_client_cli import ( - get_authorization, -) logging.basicConfig( level=logging.INFO, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s" @@ -22,44 +16,6 @@ logger.setLevel(logging.INFO) -def get_api_health_url(): - DEPLOY_URL = os.getenv("DEPLOY_URL", "http://127.0.0.1") - health_url = f"{DEPLOY_URL}:{os.getenv('SERVICE_PORT', '7000')}/health" - return health_url - - -def wait_for_healthy(timeout: int = 300, interval: int = 10) -> bool: - """ - Check the health endpoint until the service is ready. - """ - health_url = get_api_health_url() - start_time = time.time() - headers = {"Authorization": f"Bearer {get_authorization()}"} - total_time_waited = 0 - while time.time() - start_time < timeout: - req_time = time.time() - try: - response = requests.get(health_url, headers=headers, timeout=interval) - if response.status_code == 200: - startup_time = time.time() - start_time - logger.info( - f"vLLM service is healthy. startup_time:= {startup_time} seconds" - ) - return True - except requests.exceptions.RequestException as e: - logger.warning(f"Health check failed: {e}") - - total_time_waited = time.time() - start_time - sleep_interval = max(2 - (time.time() - req_time), 0) - logger.info( - f"Service not ready after {total_time_waited:.2f} seconds, waiting {sleep_interval:.2f} seconds before polling ..." 
- ) - time.sleep(sleep_interval) - - logger.error(f"Service did not become healthy within {timeout} seconds") - return False - - class InferenceServerContext: def __init__(self, startup_script_path): self.startup_script_path = startup_script_path From 2467c742950bcbd70720c73482c75f9e04ec31e7 Mon Sep 17 00:00:00 2001 From: Tom Stesco Date: Thu, 5 Dec 2024 23:23:43 +0000 Subject: [PATCH 13/76] fix health endpoint --- utils/prompt_client.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/utils/prompt_client.py b/utils/prompt_client.py index 00473045..16f83936 100644 --- a/utils/prompt_client.py +++ b/utils/prompt_client.py @@ -53,7 +53,7 @@ def _get_api_completions_url(self) -> str: return f"{self._get_api_base_url()}/completions" def _get_api_health_url(self) -> str: - return f"{self._get_api_base_url()}/health" + return f"{self.env_config.deploy_url}:{self.env_config.service_port}/health" def get_health(self) -> requests.Response: return requests.get(self.health_url, headers=self.headers) @@ -78,6 +78,8 @@ def wait_for_healthy(self, timeout: int = 300, interval: int = 10) -> bool: ) self.server_ready = True return True + else: + logger.warning(f"Health check failed: {response.status_code}") except requests.exceptions.RequestException as e: logger.warning(f"Health check failed: {e}") From af5e8dc9e2f2e80936cffd64e618a88a472aceb6 Mon Sep 17 00:00:00 2001 From: Tom Stesco Date: Thu, 5 Dec 2024 23:24:33 +0000 Subject: [PATCH 14/76] add vllm_model to EnvironmentConfig instead of BatchConfig --- utils/batch_processor.py | 6 +++--- utils/prompt_configs.py | 8 +++++--- 2 files changed, 8 insertions(+), 6 deletions(-) diff --git a/utils/batch_processor.py b/utils/batch_processor.py index c8f9ea90..6805545e 100644 --- a/utils/batch_processor.py +++ b/utils/batch_processor.py @@ -123,7 +123,7 @@ def _process_single_thread( prompt_len=isl, max_tokens=self.batch_config.output_seq_lens[i], stream=self.batch_config.stream, - vll_model=self.batch_config.vllm_model, + vll_model=self.prompt_client.env_config.vllm_model, tokenizer=tokenizer, ) @@ -182,7 +182,7 @@ def _process_multi_thread( prompt_len=isl, max_tokens=self.batch_config.output_seq_lens[i], stream=self.batch_config.stream, - vll_model=self.batch_config.vllm_model, + vll_model=self.prompt_client.env_config.vllm_model, tokenizer=tokenizer, ) futures.append(future) @@ -231,7 +231,7 @@ def _process_batch_chunk( prompt_len=isl, max_tokens=self.batch_config.output_seq_lens[i], stream=self.batch_config.stream, - vll_model=self.batch_config.vllm_model, + vll_model=self.prompt_client.env_config.vllm_model, tokenizer=tokenizer, ) futures.append(future) diff --git a/utils/prompt_configs.py b/utils/prompt_configs.py index 04d0fd67..eea13670 100644 --- a/utils/prompt_configs.py +++ b/utils/prompt_configs.py @@ -14,7 +14,9 @@ class PromptConfig: num_prompts: int distribution: str = "fixed" dataset: str = "random" - tokenizer_model: str = "meta-llama/Llama-3.1-70B-Instruct" + tokenizer_model: str = os.environ.get( + "VLLM_MODEL", "meta-llama/Llama-3.1-70B-Instruct" + ) template: Optional[str] = None save_path: Optional[str] = None print_prompts: bool = False @@ -27,14 +29,14 @@ class BatchConfig: num_full_iterations: int = 1 vary_batch_size: bool = False inter_batch_delay: int = 0 - vllm_model: str = "meta-llama/Llama-3.1-70B-Instruct" stream: bool = True @dataclass class EnvironmentConfig: + vllm_model: str = os.environ.get("VLLM_MODEL", "meta-llama/Llama-3.1-70B-Instruct") authorization: Optional[str] = os.environ.get("AUTHORIZATION") 
jwt_secret: Optional[str] = os.environ.get("JWT_SECRET") deploy_url: str = os.environ.get("DEPLOY_URL", "http://127.0.0.1") - service_port: str = os.environ.get("SERVICE_PORT", "8000") + service_port: str = os.environ.get("SERVICE_PORT", "7000") cache_root: str = os.environ.get("CACHE_ROOT", ".") From 60c7ab28674aa167f30cf18f8329c69627878b0b Mon Sep 17 00:00:00 2001 From: Tom Stesco Date: Thu, 5 Dec 2024 23:25:10 +0000 Subject: [PATCH 15/76] refactor utils/capture_traces.py with new prompt_client --- utils/capture_traces.py | 79 ++++------------------------------------- 1 file changed, 7 insertions(+), 72 deletions(-) diff --git a/utils/capture_traces.py b/utils/capture_traces.py index ecc1d95d..687458c7 100644 --- a/utils/capture_traces.py +++ b/utils/capture_traces.py @@ -2,16 +2,10 @@ # # SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC -import os import logging -import argparse -from utils.prompt_generation import generate_prompts -from utils.prompt_client_cli import ( - call_inference_api, - get_api_base_url, - get_authorization, -) -from utils.startup_utils import wait_for_healthy + +from prompt_configs import EnvironmentConfig +from prompt_client import PromptClient logging.basicConfig( level=logging.INFO, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s" @@ -21,69 +15,10 @@ def capture_input_sizes(): - """ - Capture different input size graphs with the TT model on vLLM. - get_padded_prefill_len() defines the different input sizes for prefill: - https://github.com/tenstorrent/tt-metal/blob/main/models/demos/t3000/llama2_70b/tt/llama_generation.py#L341 - """ - input_sizes = [sz - 8 for sz in [32, 64, 128, 256, 512, 1024, 2048, 3072, 4096]] - prompts_per_size = 1 - output_seq_len = 1 - - base_url = get_api_base_url() - if not wait_for_healthy(base_url): - raise RuntimeError("vLLM did not start correctly!") - - api_url = f"{base_url}/completions" - headers = {"Authorization": f"Bearer {get_authorization()}"} - vllm_model = os.environ.get("VLLM_MODEL", "meta-llama/Llama-3.1-70B-Instruct") - - for size in input_sizes: - logger.info(f"Capture input size: {size}") - - args = argparse.Namespace( - tokenizer_model=vllm_model, - dataset="random", - max_prompt_length=size, - input_seq_len=size, - distribution="fixed", - template=None, - save_path=None, - print_prompts=False, - num_prompts=prompts_per_size, - ) - - prompts, prompt_lengths = generate_prompts(args) - - for i, (prompt, prompt_len) in enumerate(zip(prompts, prompt_lengths)): - try: - response_data = call_inference_api( - prompt=prompt, - response_idx=i, - prompt_len=prompt_len, - stream=True, - headers=headers, - api_url=api_url, - max_tokens=output_seq_len, - vll_model=vllm_model, - tokenizer=None, - ) - - logger.info( - f"Input size: {size}, input_seq_len: {prompt_len}, TTFT: {response_data['ttft']:.3f}s" - ) - - except Exception as e: - logger.error(f"Error processing prompt: {e}") - - -def main(): - try: - capture_input_sizes() - except Exception as e: - logger.error(f"Capturing input sizes failed: {e}") - raise + env_config = EnvironmentConfig() + prompt_client = PromptClient(env_config) + prompt_client.capture_traces() if __name__ == "__main__": - main() + capture_input_sizes() From 10993a2a98667aab00903042d66976070dd21367 Mon Sep 17 00:00:00 2001 From: Tom Stesco Date: Thu, 5 Dec 2024 23:57:26 +0000 Subject: [PATCH 16/76] fix utils imports --- utils/batch_processor.py | 4 ++-- utils/capture_traces.py | 4 ++-- utils/prompt_client.py | 4 ++-- utils/prompt_client_cli.py | 8 ++++---- utils/prompt_generation.py | 2 
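With the fixes above, all connection settings come from environment variables via `EnvironmentConfig`, and the health probe targets the server root while completions stay under the OpenAI-style `/v1` prefix. A hedged sketch of the resulting URL layout, using the defaults visible in the dataclass:

```
import os

deploy_url = os.environ.get("DEPLOY_URL", "http://127.0.0.1")
service_port = os.environ.get("SERVICE_PORT", "7000")

health_url = f"{deploy_url}:{service_port}/health"               # no /v1 prefix
completions_url = f"{deploy_url}:{service_port}/v1/completions"  # API routes live under /v1
```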
+- 5 files changed, 11 insertions(+), 11 deletions(-) diff --git a/utils/batch_processor.py b/utils/batch_processor.py index 6805545e..35ab6652 100644 --- a/utils/batch_processor.py +++ b/utils/batch_processor.py @@ -14,8 +14,8 @@ import numpy as np from transformers import AutoTokenizer -from prompt_configs import BatchConfig -from prompt_client import PromptClient +from utils.prompt_configs import BatchConfig +from utils.prompt_client import PromptClient logging.basicConfig( level=logging.INFO, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s" diff --git a/utils/capture_traces.py b/utils/capture_traces.py index 687458c7..f3703b1f 100644 --- a/utils/capture_traces.py +++ b/utils/capture_traces.py @@ -4,8 +4,8 @@ import logging -from prompt_configs import EnvironmentConfig -from prompt_client import PromptClient +from utils.prompt_configs import EnvironmentConfig +from utils.prompt_client import PromptClient logging.basicConfig( level=logging.INFO, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s" diff --git a/utils/prompt_client.py b/utils/prompt_client.py index 16f83936..455921d3 100644 --- a/utils/prompt_client.py +++ b/utils/prompt_client.py @@ -11,8 +11,8 @@ import jwt from transformers import AutoTokenizer -from prompt_generation import generate_prompts -from prompt_configs import PromptConfig, EnvironmentConfig +from utils.prompt_generation import generate_prompts +from utils.prompt_configs import PromptConfig, EnvironmentConfig logging.basicConfig( level=logging.INFO, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s" diff --git a/utils/prompt_client_cli.py b/utils/prompt_client_cli.py index c671ca33..8ebc7124 100644 --- a/utils/prompt_client_cli.py +++ b/utils/prompt_client_cli.py @@ -9,10 +9,10 @@ import numpy as np from transformers import AutoTokenizer -from prompt_configs import PromptConfig, BatchConfig, EnvironmentConfig -from prompt_client import PromptClient -from batch_processor import BatchProcessor -from prompt_generation import generate_prompts +from utils.prompt_configs import PromptConfig, BatchConfig, EnvironmentConfig +from utils.prompt_client import PromptClient +from utils.batch_processor import BatchProcessor +from utils.prompt_generation import generate_prompts logging.basicConfig( level=logging.INFO, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s" diff --git a/utils/prompt_generation.py b/utils/prompt_generation.py index 07269531..a351eded 100644 --- a/utils/prompt_generation.py +++ b/utils/prompt_generation.py @@ -13,7 +13,7 @@ from datasets import load_dataset from transformers import AutoTokenizer -from prompt_configs import PromptConfig +from utils.prompt_configs import PromptConfig logging.basicConfig( From 20ccdf4855dd5accc7a91d939d3b12091f8e642c Mon Sep 17 00:00:00 2001 From: Tom Stesco Date: Fri, 6 Dec 2024 03:51:38 +0000 Subject: [PATCH 17/76] fix BatchConfig usage --- utils/prompt_client_cli.py | 1 - 1 file changed, 1 deletion(-) diff --git a/utils/prompt_client_cli.py b/utils/prompt_client_cli.py index 8ebc7124..3d74f8f5 100644 --- a/utils/prompt_client_cli.py +++ b/utils/prompt_client_cli.py @@ -146,7 +146,6 @@ def main(): num_full_iterations=args.num_full_iterations, vary_batch_size=args.vary_batch_size, inter_batch_delay=args.inter_batch_delay, - vllm_model=args.vllm_model, stream=not args.no_stream, ) From eab7e7682e69d66c2754e10334a6e9ebee7352fa Mon Sep 17 00:00:00 2001 From: Tom Stesco Date: Fri, 6 Dec 2024 03:58:07 +0000 Subject: [PATCH 18/76] add benchmarking/online_benchmark_prompt_client.py 
using prompt_client.py --- .../online_benchmark_prompt_client.py | 208 ++++++++++++++++++ 1 file changed, 208 insertions(+) create mode 100644 benchmarking/online_benchmark_prompt_client.py diff --git a/benchmarking/online_benchmark_prompt_client.py b/benchmarking/online_benchmark_prompt_client.py new file mode 100644 index 00000000..7f65c7fd --- /dev/null +++ b/benchmarking/online_benchmark_prompt_client.py @@ -0,0 +1,208 @@ +#!/usr/bin/env python3 + +# SPDX-License-Identifier: Apache-2.0 +# +# SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC + +import logging +import numpy as np +from typing import List, Dict, Tuple +import json +from datetime import datetime +from pathlib import Path + +from utils.prompt_configs import PromptConfig, BatchConfig, EnvironmentConfig +from utils.prompt_client import PromptClient +from utils.batch_processor import BatchProcessor +from utils.prompt_generation import generate_prompts +from transformers import AutoTokenizer + +logging.basicConfig( + level=logging.INFO, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s" +) +logger = logging.getLogger(__name__) +logger.setLevel(logging.INFO) + + +def get_test_combinations( + context_lens: List[Tuple[int, int]], +) -> List[Dict[str, int]]: + combinations = [] + for input_len, output_len in context_lens: + # Skip invalid combinations where output_len > input_len + context = input_len + output_len + if context <= 4096: + bsz = 32 + elif context <= 8192: + bsz = 16 + else: + bsz = 1 + + num_prompts = bsz * 4 + combinations.append( + { + "input_len": input_len, + "output_len": output_len, + "batch_size": bsz, + "num_prompts": num_prompts, + } + ) + + # Log total number of combinations + logger.info(f"Generated {len(combinations)} valid test combinations") + for i, combo in enumerate(combinations, 1): + logger.info( + f"Combination {i}: input_len={combo['input_len']}, " + f"output_len={combo['output_len']}, batch_size={combo['batch_size']}" + ) + + return combinations + + +def run_sequence_length_test( + combinations: List[Dict[str, int]], + save_dir: str, + file_prefix: str, + num_iterations: int = 1, + model: str = "meta-llama/Llama-3.1-70B-Instruct", +) -> List[dict]: + # Create save directory + save_path = Path(save_dir) + save_path.mkdir(parents=True, exist_ok=True) + + # Initialize configurations + env_config = EnvironmentConfig(vllm_model=model) + prompt_client = PromptClient(env_config) + + # Initialize results storage + all_results = [] + timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S") + results_file = save_path / f"{file_prefix}_{timestamp}.json" + + # Test all combinations + total_combinations = len(combinations) + for idx, params in enumerate(combinations, 1): + input_len = params["input_len"] + output_len = params["output_len"] + batch_size = params["batch_size"] + num_prompts = params["num_prompts"] + + logger.info( + f"\nTesting combination {idx}/{total_combinations}:\n" + f"input_len={input_len}, output_len={output_len}, " + f"batch_size={batch_size}, num_prompts={num_prompts}" + ) + + # Configure prompt generation + prompt_config = PromptConfig( + input_seq_len=input_len, + max_prompt_length=input_len, + num_prompts=num_prompts, + distribution="fixed", + dataset="random", + tokenizer_model=model, + template=None, + save_path=None, + print_prompts=False, + ) + + # Generate prompts + prompts, input_seq_lengths = generate_prompts(prompt_config) + + # Configure batch processing + output_seq_lens = [output_len] * num_prompts + batch_config = BatchConfig( + batch_size=batch_size, + 
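The combination builder above sizes batches purely from total context length. A small restatement of that rule, useful when adding new (ISL, OSL) pairs; the helper name here is made up for illustration:

```
def pick_batch_size(input_len, output_len):
    # Restates the heuristic in get_test_combinations() above.
    context = input_len + output_len
    if context <= 4096:
        return 32
    elif context <= 8192:
        return 16
    return 1

assert pick_batch_size(2048, 2048) == 32   # 4096 tokens total -> full batch
assert pick_batch_size(5000, 500) == 16    # 5500 tokens total -> reduced batch
assert pick_batch_size(20000, 2000) == 1   # long context -> single stream
```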
output_seq_lens=output_seq_lens, + num_full_iterations=num_iterations, + vary_batch_size=False, + inter_batch_delay=0, + stream=True, + ) + + # Initialize processor and tokenizer + batch_processor = BatchProcessor(prompt_client, batch_config) + tokenizer = AutoTokenizer.from_pretrained(model) + + # Process batches + try: + responses = batch_processor.process_batch( + prompts=prompts, + input_seq_lengths=input_seq_lengths, + tokenizer=tokenizer, + ) + + # Calculate statistics + stats = { + "input_seq_len": input_len, + "output_seq_len": output_len, + "batch_size": batch_size, + "mean_decode_tps": np.mean([r["decode_tps"] for r in responses]), + "mean_total_tps": np.mean([r["total_tps"] for r in responses]), + "mean_ttft": np.mean([r["ttft"] for r in responses]), + "std_decode_tps": np.std([r["decode_tps"] for r in responses]), + "std_total_tps": np.std([r["total_tps"] for r in responses]), + "std_ttft": np.std([r["ttft"] for r in responses]), + "num_prompts": num_prompts, + "num_iterations": num_iterations, + "timestamp": timestamp, + "combination_index": idx, + } + + all_results.append(stats) + + # Log results + logger.info( + f"Results for combination {idx}/{total_combinations}:\n" + f"Mean Decode TPS: {stats['mean_decode_tps']:.2f} ± " + f"{stats['std_decode_tps']:.2f}\n" + f"Mean Total TPS: {stats['mean_total_tps']:.2f} ± " + f"{stats['std_total_tps']:.2f}\n" + f"Mean TTFT: {stats['mean_ttft']:.2f} ± {stats['std_ttft']:.2f}" + ) + + # Save results after each combination + with open(results_file, "w") as f: + json.dump(all_results, f, indent=4) + + except Exception as e: + logger.error(f"Error processing combination {idx}: {e}") + continue + + return all_results + + +if __name__ == "__main__": + # Define parameter ranges + typical_context_lens = [ + (128, 128), + (128, 2048), + (128, 4096), + (2048, 128), + (2048, 2048), + (1000, 1000), + (500, 2000), + (5000, 500), + (20000, 2000), + ] + extra_context_lengths = [ + (128, 2), + (256, 2), + (512, 32), + (1000, 24), + (2000, 32), + (4000, 32), + (8100, 32), + (130000, 1024), + ] + # Generate all valid combinations upfront + combinations = get_test_combinations( + context_lens=typical_context_lens + extra_context_lengths, + ) + + # Run tests + results = run_sequence_length_test( + combinations=combinations, + save_dir="online_benchmarking", + file_prefix="online_benchmark_results", + ) From 90acdf6c46c0855abc8cd1d9ce556bac12bc55cf Mon Sep 17 00:00:00 2001 From: Tom Stesco Date: Fri, 6 Dec 2024 05:19:25 +0000 Subject: [PATCH 19/76] add benchmarking/online_benchmark_prompt_client.py using prompt_client.py --- benchmarking/README.md | 52 ++++++++ benchmarking/benchmark_serving.patch | 26 ++++ .../online_benchmark_prompt_client.py | 9 +- benchmarking/vllm_online_benchmark.py | 124 ++++++++++++++++++ 4 files changed, 207 insertions(+), 4 deletions(-) create mode 100644 benchmarking/benchmark_serving.patch create mode 100644 benchmarking/vllm_online_benchmark.py diff --git a/benchmarking/README.md b/benchmarking/README.md index 38ee6b60..56fab404 100644 --- a/benchmarking/README.md +++ b/benchmarking/README.md @@ -36,3 +36,55 @@ python examples/offline_inference_tt.py --measure_perf --max_seqs_in_batch 32 -- - `--max_seqs_in_batch` (default: `32`): - **Maximum batch size** for inference, determining the number of prompts processed in parallel. 
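Each combination in the benchmark above is appended to a timestamped JSON file as soon as it finishes, so partial runs remain usable. A hedged snippet for loading and skimming those results afterwards (the file name is a hypothetical example; the keys mirror the stats dict above):

```
import json

with open("online_benchmarking/online_benchmark_results_2024-12-06_00-00-00.json") as f:
    results = json.load(f)

for r in results:
    print(
        f"isl={r['input_seq_len']} osl={r['output_seq_len']} bsz={r['batch_size']} "
        f"ttft={r['mean_ttft']:.2f}s decode_tps={r['mean_decode_tps']:.2f}"
    )
```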
+### Online Benchmarking + +#### using vllm/benchmarking/benchmark_serving.py + +use the benchmark_serving.patch file: +``` +cd ~/vllm +git apply benchmark_serving.patch +``` +This simply stops the benchmarking script from sending the `best_of` arg which is not supported and causes issues. + +To run the benchmarks: +``` +cd ~/app +export PYTHONPATH=$PYTHONPATH:$PWD +python benchmarking/vllm_online_benchmark.py +``` + +The output will be available for each input/output sequence length defined and time stamped. + +Results are also printed to stdout, for example with mock data results: +``` +================================================== + Benchmark Result +================================================== +Successful requests: 32 +Benchmark duration (s): 0.39 +Total input tokens: 4096 +Total generated tokens: 64 +Request throughput (req/s): 83.04 +Output token throughput (tok/s): 166.07 +Total Token throughput (tok/s): 10794.77 +-------------------------------------------------- + Time to First Token +-------------------------------------------------- +Mean TTFT (ms): 358.26 +Median TTFT (ms): 358.45 +P99 TTFT (ms): 361.67 +-------------------------------------------------- + Time per Output Token (excl. 1st token) +-------------------------------------------------- +Mean TPOT (ms): 14.03 +Median TPOT (ms): 14.13 +P99 TPOT (ms): 14.30 +-------------------------------------------------- + Inter-token Latency +-------------------------------------------------- +Mean ITL (ms): 7.86 +Median ITL (ms): 7.83 +P99 ITL (ms): 8.05 +================================================== +``` diff --git a/benchmarking/benchmark_serving.patch b/benchmarking/benchmark_serving.patch new file mode 100644 index 00000000..bb90b431 --- /dev/null +++ b/benchmarking/benchmark_serving.patch @@ -0,0 +1,26 @@ +diff --git a/benchmarks/benchmark_serving.py b/benchmarks/benchmark_serving.py +index c1a396c8..74f75a15 100644 +--- a/benchmarks/benchmark_serving.py ++++ b/benchmarks/benchmark_serving.py +@@ -22,6 +22,12 @@ On the client side, run: + --endpoint /generate_stream + to the end of the command above. 
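For reading the sample report above: TTFT covers prefill up to the first streamed token, TPOT spreads the remaining latency over the remaining tokens, and ITL measures gaps between consecutive tokens. A hedged sketch of those definitions (general formulas, not necessarily vLLM's exact implementation):

```
def per_request_latency_metrics(latency_s, ttft_s, output_tokens, token_arrival_times):
    # TPOT: time per output token, excluding the first token.
    tpot_s = (latency_s - ttft_s) / max(output_tokens - 1, 1)
    # ITL: inter-token latency between consecutive streamed tokens.
    itls = [b - a for a, b in zip(token_arrival_times, token_arrival_times[1:])]
    mean_itl_s = sum(itls) / max(len(itls), 1)
    return {"ttft_ms": ttft_s * 1e3, "tpot_ms": tpot_s * 1e3, "mean_itl_ms": mean_itl_s * 1e3}
```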
+ """ ++import sys ++from unittest.mock import MagicMock ++# mock out ttnn fully so we can import ttnn without using it ++sys.modules["ttnn"] = MagicMock() ++sys.modules["ttnn.device"] = MagicMock() ++ + import argparse + import asyncio + import base64 +@@ -417,7 +423,7 @@ async def benchmark( + prompt_len=test_prompt_len, + output_len=test_output_len, + logprobs=logprobs, +- best_of=best_of, ++ best_of=None, + multi_modal_content=test_mm_content, + ignore_eos=ignore_eos, + ) \ No newline at end of file diff --git a/benchmarking/online_benchmark_prompt_client.py b/benchmarking/online_benchmark_prompt_client.py index 7f65c7fd..22812bbd 100644 --- a/benchmarking/online_benchmark_prompt_client.py +++ b/benchmarking/online_benchmark_prompt_client.py @@ -70,15 +70,16 @@ def run_sequence_length_test( save_path = Path(save_dir) save_path.mkdir(parents=True, exist_ok=True) - # Initialize configurations - env_config = EnvironmentConfig(vllm_model=model) - prompt_client = PromptClient(env_config) - # Initialize results storage all_results = [] timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S") results_file = save_path / f"{file_prefix}_{timestamp}.json" + # Initialize configurations + env_config = EnvironmentConfig(vllm_model=model) + prompt_client = PromptClient(env_config) + prompt_client.capture_traces() + # Test all combinations total_combinations = len(combinations) for idx, params in enumerate(combinations, 1): diff --git a/benchmarking/vllm_online_benchmark.py b/benchmarking/vllm_online_benchmark.py new file mode 100644 index 00000000..0315129f --- /dev/null +++ b/benchmarking/vllm_online_benchmark.py @@ -0,0 +1,124 @@ +# SPDX-License-Identifier: Apache-2.0 +# +# SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC + +import os +import subprocess +import time +import logging +from typing import Dict +from pathlib import Path + +from benchmarking.online_benchmark_prompt_client import get_test_combinations +from utils.prompt_configs import EnvironmentConfig +from utils.prompt_client import PromptClient + +# Configure logging +logging.basicConfig( + level=logging.INFO, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s" +) +logger = logging.getLogger(__name__) + + +def run_benchmark( + params: Dict[str, int], + model: str, + port: int, + benchmark_script: str, + result_dir: Path, +) -> None: + """Run a single benchmark with the given parameters.""" + # fmt: off + cmd = [ + "python", benchmark_script, + "--backend", "vllm", + "--model", model, + "--port", str(port), + "--dataset-name", "random", + "--num-prompts", str(params["batch_size"]), + "--random-input-len", str(params["input_len"]), + "--random-output-len", str(params["output_len"]), + "--save-result", + "--result-dir", str(result_dir) + ] + # fmt: on + + logger.info(f"Running benchmark with parameters: {params}") + logger.info(f"Command: {' '.join(cmd)}") + + try: + subprocess.run(cmd, check=True) + logger.info("Benchmark completed successfully") + except subprocess.CalledProcessError as e: + logger.error(f"Benchmark failed with error: {e}") + except Exception as e: + logger.error(f"Unexpected error during benchmark: {e}") + + # Add a small delay between runs to ensure system stability + time.sleep(2) + + +def main(): + # Configuration + env_config = EnvironmentConfig() + + # Create output directory + result_dir = Path("vllm_online_benchmark_results") + result_dir.mkdir(parents=True, exist_ok=True) + + prompt_client = PromptClient(env_config) + # note: there isnt a better way to pass an api key to the vllm benchmarking script 
+ os.environ["OPENAI_API_KEY"] = prompt_client._get_authorization() + + # Define benchmarking parameters + typical_context_lens = [ + (128, 128), + (128, 2048), + (128, 4096), + (2048, 128), + (2048, 2048), + (1000, 1000), + (500, 2000), + (5000, 500), + (20000, 2000), + ] + extra_context_lengths = [ + (128, 2), + (256, 2), + (512, 32), + (1000, 24), + (2000, 32), + (4000, 32), + (8100, 32), + (130000, 1024), + ] + + # Get all benchmark combinations using the original function + combinations = get_test_combinations( + context_lens=typical_context_lens + extra_context_lengths, + ) + + # Log benchmark plan + logger.info(f"Starting benchmark suite with {len(combinations)} combinations") + for i, combo in enumerate(combinations, 1): + logger.info(f"Combination {i}: {combo}") + + # ensure vllm server is ready + prompt_client.capture_traces() + + # Run benchmarks + for i, params in enumerate(combinations, 1): + logger.info(f"\nRunning benchmark {i}/{len(combinations)}") + run_benchmark( + benchmark_script="/home/user/vllm/benchmarks/benchmark_serving.py", + params=params, + model=env_config.vllm_model, + port=env_config.service_port, + result_dir=result_dir, + ) + + logger.info("Benchmark suite completed") + + +if __name__ == "__main__": + main() From ec486ad5595fc807b2fe3e4e469844e766b9cdbb Mon Sep 17 00:00:00 2001 From: Tom Stesco Date: Fri, 6 Dec 2024 05:22:48 +0000 Subject: [PATCH 20/76] add benchmarking, evals, and tests dirs to Dockerfile --- vllm-tt-metal-llama3-70b/vllm.llama3.src.Dockerfile | 3 +++ 1 file changed, 3 insertions(+) diff --git a/vllm-tt-metal-llama3-70b/vllm.llama3.src.Dockerfile b/vllm-tt-metal-llama3-70b/vllm.llama3.src.Dockerfile index 465004e7..2184d356 100644 --- a/vllm-tt-metal-llama3-70b/vllm.llama3.src.Dockerfile +++ b/vllm-tt-metal-llama3-70b/vllm.llama3.src.Dockerfile @@ -99,6 +99,9 @@ ENV PYTHONPATH=${PYTHONPATH}:${APP_DIR} COPY --chown=user:user "vllm-tt-metal-llama3-70b/src" "${APP_DIR}/src" COPY --chown=user:user "vllm-tt-metal-llama3-70b/requirements.txt" "${APP_DIR}/requirements.txt" COPY --chown=user:user "utils" "${APP_DIR}/utils" +COPY --chown=user:user "benchmarking" "${APP_DIR}/benchmarking" +COPY --chown=user:user "evals" "${APP_DIR}/evals" +COPY --chown=user:user "tests" "${APP_DIR}/tests" RUN /bin/bash -c "source ${PYTHON_ENV_DIR}/bin/activate \ && pip install --default-timeout=240 --no-cache-dir -r requirements.txt" From c58d7b365ee1c5728a0fc8bfd8f093668ad5dddd Mon Sep 17 00:00:00 2001 From: Tom Stesco Date: Fri, 6 Dec 2024 05:34:50 +0000 Subject: [PATCH 21/76] update patchfile and benchmarking README.md with commands --- benchmarking/README.md | 2 +- benchmarking/benchmark_serving.patch | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/benchmarking/README.md b/benchmarking/README.md index 56fab404..fe27a798 100644 --- a/benchmarking/README.md +++ b/benchmarking/README.md @@ -43,7 +43,7 @@ python examples/offline_inference_tt.py --measure_perf --max_seqs_in_batch 32 -- use the benchmark_serving.patch file: ``` cd ~/vllm -git apply benchmark_serving.patch +git apply ~/app/benchmarking/benchmark_serving.patch ``` This simply stops the benchmarking script from sending the `best_of` arg which is not supported and causes issues. 
diff --git a/benchmarking/benchmark_serving.patch b/benchmarking/benchmark_serving.patch index bb90b431..c6cd2994 100644 --- a/benchmarking/benchmark_serving.patch +++ b/benchmarking/benchmark_serving.patch @@ -23,4 +23,4 @@ index c1a396c8..74f75a15 100644 + best_of=None, multi_modal_content=test_mm_content, ignore_eos=ignore_eos, - ) \ No newline at end of file + ) From fe4f96d302de31b524c1850148585bab7be98db9 Mon Sep 17 00:00:00 2001 From: Tom Stesco Date: Fri, 6 Dec 2024 05:34:58 +0000 Subject: [PATCH 22/76] update Docker IMAGE_VERSION to v0.0.3 --- vllm-tt-metal-llama3-70b/docs/development.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm-tt-metal-llama3-70b/docs/development.md b/vllm-tt-metal-llama3-70b/docs/development.md index 2b21a730..55d8b1d3 100644 --- a/vllm-tt-metal-llama3-70b/docs/development.md +++ b/vllm-tt-metal-llama3-70b/docs/development.md @@ -18,7 +18,7 @@ export TT_METAL_COMMIT_SHA_OR_TAG=385904186f81fed15d5c87c162221d4f34387164 export TT_METAL_COMMIT_DOCKER_TAG=${TT_METAL_COMMIT_SHA_OR_TAG:0:12} export TT_VLLM_COMMIT_SHA_OR_TAG=384f1790c3be16e1d1b10de07252be2e66d00935 export TT_VLLM_COMMIT_DOCKER_TAG=${TT_VLLM_COMMIT_SHA_OR_TAG:0:12} -export IMAGE_VERSION=v0.0.2 +export IMAGE_VERSION=v0.0.3 docker build \ -t ghcr.io/tenstorrent/tt-inference-server/tt-metal-llama3-70b-src-base-vllm:${IMAGE_VERSION}-tt-metal-${TT_METAL_COMMIT_DOCKER_TAG}-${TT_VLLM_COMMIT_DOCKER_TAG} \ --build-arg TT_METAL_DOCKERFILE_VERSION=${TT_METAL_DOCKERFILE_VERSION} \ From f3d815ad52f14a40822b96a90c942d618cb33346 Mon Sep 17 00:00:00 2001 From: Tom Stesco Date: Fri, 6 Dec 2024 05:39:55 +0000 Subject: [PATCH 23/76] improve doc --- benchmarking/README.md | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/benchmarking/README.md b/benchmarking/README.md index fe27a798..ea06674d 100644 --- a/benchmarking/README.md +++ b/benchmarking/README.md @@ -39,15 +39,16 @@ python examples/offline_inference_tt.py --measure_perf --max_seqs_in_batch 32 -- ### Online Benchmarking #### using vllm/benchmarking/benchmark_serving.py - -use the benchmark_serving.patch file: +Within the Docker container, use the benchmark_serving.patch file: ``` cd ~/vllm git apply ~/app/benchmarking/benchmark_serving.patch +cd /home/user/app/src +python run_vllm_api_server.py ``` This simply stops the benchmarking script from sending the `best_of` arg which is not supported and causes issues. -To run the benchmarks: +To run the benchmarks, in another shell into the Docker container: ``` cd ~/app export PYTHONPATH=$PYTHONPATH:$PWD From 8246a72abb1a82125910a2eaff4807323115cb1e Mon Sep 17 00:00:00 2001 From: Tom Stesco Date: Fri, 6 Dec 2024 05:53:45 +0000 Subject: [PATCH 24/76] update benchmark_serving.patch --- benchmarking/README.md | 6 +++--- benchmarking/benchmark_serving.patch | 11 ++++++++++- 2 files changed, 13 insertions(+), 4 deletions(-) diff --git a/benchmarking/README.md b/benchmarking/README.md index ea06674d..641436d4 100644 --- a/benchmarking/README.md +++ b/benchmarking/README.md @@ -41,15 +41,15 @@ python examples/offline_inference_tt.py --measure_perf --max_seqs_in_batch 32 -- #### using vllm/benchmarking/benchmark_serving.py Within the Docker container, use the benchmark_serving.patch file: ``` -cd ~/vllm -git apply ~/app/benchmarking/benchmark_serving.patch -cd /home/user/app/src +cd ~/app/src python run_vllm_api_server.py ``` This simply stops the benchmarking script from sending the `best_of` arg which is not supported and causes issues. 
To run the benchmarks, in another shell into the Docker container: ``` +cd ~/vllm +git apply ~/app/benchmarking/benchmark_serving.patch cd ~/app export PYTHONPATH=$PYTHONPATH:$PWD python benchmarking/vllm_online_benchmark.py diff --git a/benchmarking/benchmark_serving.patch b/benchmarking/benchmark_serving.patch index c6cd2994..88a4b94d 100644 --- a/benchmarking/benchmark_serving.patch +++ b/benchmarking/benchmark_serving.patch @@ -1,5 +1,5 @@ diff --git a/benchmarks/benchmark_serving.py b/benchmarks/benchmark_serving.py -index c1a396c8..74f75a15 100644 +index c1a396c8..463e0e93 100644 --- a/benchmarks/benchmark_serving.py +++ b/benchmarks/benchmark_serving.py @@ -22,6 +22,12 @@ On the client side, run: @@ -24,3 +24,12 @@ index c1a396c8..74f75a15 100644 multi_modal_content=test_mm_content, ignore_eos=ignore_eos, ) +@@ -458,7 +464,7 @@ async def benchmark( + prompt_len=prompt_len, + output_len=output_len, + logprobs=logprobs, +- best_of=best_of, ++ best_of=None, + multi_modal_content=mm_content, + ignore_eos=ignore_eos) + tasks.append( From 765c4be6a15ca661e198944110b3227c76dc2696 Mon Sep 17 00:00:00 2001 From: Tom Stesco Date: Fri, 6 Dec 2024 06:08:05 +0000 Subject: [PATCH 25/76] add tt_model_runner.py patch for best_of --- benchmarking/benchmark_serving.patch | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/benchmarking/benchmark_serving.patch b/benchmarking/benchmark_serving.patch index 88a4b94d..fb5cb7f7 100644 --- a/benchmarking/benchmark_serving.patch +++ b/benchmarking/benchmark_serving.patch @@ -33,3 +33,27 @@ index c1a396c8..463e0e93 100644 multi_modal_content=mm_content, ignore_eos=ignore_eos) tasks.append( +diff --git a/vllm/worker/tt_model_runner.py b/vllm/worker/tt_model_runner.py +index 1c586dd3..505e4b84 100644 +--- a/vllm/worker/tt_model_runner.py ++++ b/vllm/worker/tt_model_runner.py +@@ -425,10 +425,15 @@ class TTModelRunner(ModelRunnerBase[TTModelInput]): + ) + + def _validate_sampling_params(self, sampling_params): +- assert sampling_params.n == 1, "Currently only supporting n=1" +- assert sampling_params.best_of is None, "Currently not supporting best_of" +- assert sampling_params.logprobs is None, "Currently not supporting logprobs" +- assert sampling_params.prompt_logprobs is None, "Currently not supporting prompt_logprobs" ++ # if sampling_params.n != 1: ++ # raise ValueError("Currently only supporting n=1") ++ # if sampling_params.best_of is not None: ++ # raise ValueError("Currently not supporting best_of") ++ # if sampling_params.logprobs is not None: ++ # raise ValueError("Currently not supporting logprobs") ++ # if sampling_params.prompt_logprobs is not None: ++ # raise ValueError("Currently not supporting prompt_logprobs") ++ return + + ## Destructor (used to delete ttnn trace if using trace mode) + \ No newline at end of file From b93370d4fbf430d83e88c128e56d9f758f4a1a17 Mon Sep 17 00:00:00 2001 From: Tom Stesco Date: Fri, 6 Dec 2024 06:23:06 +0000 Subject: [PATCH 26/76] update benchmarking/benchmark_serving.patch --- benchmarking/benchmark_serving.patch | 20 +++++++------------- 1 file changed, 7 insertions(+), 13 deletions(-) diff --git a/benchmarking/benchmark_serving.patch b/benchmarking/benchmark_serving.patch index fb5cb7f7..f393b6bc 100644 --- a/benchmarking/benchmark_serving.patch +++ b/benchmarking/benchmark_serving.patch @@ -34,10 +34,10 @@ index c1a396c8..463e0e93 100644 ignore_eos=ignore_eos) tasks.append( diff --git a/vllm/worker/tt_model_runner.py b/vllm/worker/tt_model_runner.py -index 1c586dd3..505e4b84 100644 +index 
1c586dd3..2e77bf72 100644 --- a/vllm/worker/tt_model_runner.py +++ b/vllm/worker/tt_model_runner.py -@@ -425,10 +425,15 @@ class TTModelRunner(ModelRunnerBase[TTModelInput]): +@@ -425,12 +425,7 @@ class TTModelRunner(ModelRunnerBase[TTModelInput]): ) def _validate_sampling_params(self, sampling_params): @@ -45,15 +45,9 @@ index 1c586dd3..505e4b84 100644 - assert sampling_params.best_of is None, "Currently not supporting best_of" - assert sampling_params.logprobs is None, "Currently not supporting logprobs" - assert sampling_params.prompt_logprobs is None, "Currently not supporting prompt_logprobs" -+ # if sampling_params.n != 1: -+ # raise ValueError("Currently only supporting n=1") -+ # if sampling_params.best_of is not None: -+ # raise ValueError("Currently not supporting best_of") -+ # if sampling_params.logprobs is not None: -+ # raise ValueError("Currently not supporting logprobs") -+ # if sampling_params.prompt_logprobs is not None: -+ # raise ValueError("Currently not supporting prompt_logprobs") +- +- ## Destructor (used to delete ttnn trace if using trace mode) + return - - ## Destructor (used to delete ttnn trace if using trace mode) - \ No newline at end of file + + def __del__(self): + if self.trace_mode and self.execute_trace_kwargs is not None: From 5e07baac76394f0ac04b3deb5fb052e9bee35bdf Mon Sep 17 00:00:00 2001 From: Tom Stesco Date: Fri, 6 Dec 2024 06:34:43 +0000 Subject: [PATCH 27/76] use CACHE_ROOT for vllm_online_benchmark_results dir --- benchmarking/vllm_online_benchmark.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/benchmarking/vllm_online_benchmark.py b/benchmarking/vllm_online_benchmark.py index 0315129f..97291699 100644 --- a/benchmarking/vllm_online_benchmark.py +++ b/benchmarking/vllm_online_benchmark.py @@ -63,7 +63,8 @@ def main(): env_config = EnvironmentConfig() # Create output directory - result_dir = Path("vllm_online_benchmark_results") + cache_dir = Path(os.environ.get("CACHE_ROOT", "")) + result_dir = cache_dir / "vllm_online_benchmark_results" result_dir.mkdir(parents=True, exist_ok=True) prompt_client = PromptClient(env_config) From d0e0b0fac21f631f45b7f65c92fe9c3acd1032f5 Mon Sep 17 00:00:00 2001 From: Tom Stesco Date: Mon, 9 Dec 2024 15:55:33 +0000 Subject: [PATCH 28/76] adding timestamped online benchmark run result directory, rps=1 for vllm online benchmark script --- .../online_benchmark_prompt_client.py | 12 ++++++--- benchmarking/vllm_online_benchmark.py | 27 ++++++++++++------- 2 files changed, 25 insertions(+), 14 deletions(-) diff --git a/benchmarking/online_benchmark_prompt_client.py b/benchmarking/online_benchmark_prompt_client.py index 22812bbd..ad81230b 100644 --- a/benchmarking/online_benchmark_prompt_client.py +++ b/benchmarking/online_benchmark_prompt_client.py @@ -38,7 +38,7 @@ def get_test_combinations( else: bsz = 1 - num_prompts = bsz * 4 + num_prompts = max(bsz * 4, 4) combinations.append( { "input_len": input_len, @@ -67,13 +67,12 @@ def run_sequence_length_test( model: str = "meta-llama/Llama-3.1-70B-Instruct", ) -> List[dict]: # Create save directory - save_path = Path(save_dir) + timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S") + save_path = Path(save_dir) / f"results_{timestamp}" save_path.mkdir(parents=True, exist_ok=True) # Initialize results storage all_results = [] - timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S") - results_file = save_path / f"{file_prefix}_{timestamp}.json" # Initialize configurations env_config = EnvironmentConfig(vllm_model=model) @@ -87,6 +86,11 @@ def 
run_sequence_length_test( output_len = params["output_len"] batch_size = params["batch_size"] num_prompts = params["num_prompts"] + run_timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S") + results_file = ( + save_path + / f"{file_prefix}_isl-{input_len}_osl-{output_len}_bsz-{batch_size}_n-{num_prompts}_{run_timestamp}.json" + ) logger.info( f"\nTesting combination {idx}/{total_combinations}:\n" diff --git a/benchmarking/vllm_online_benchmark.py b/benchmarking/vllm_online_benchmark.py index 97291699..c69ce064 100644 --- a/benchmarking/vllm_online_benchmark.py +++ b/benchmarking/vllm_online_benchmark.py @@ -6,6 +6,7 @@ import subprocess import time import logging +from datetime import datetime from typing import Dict from pathlib import Path @@ -25,7 +26,7 @@ def run_benchmark( model: str, port: int, benchmark_script: str, - result_dir: Path, + result_filename: Path, ) -> None: """Run a single benchmark with the given parameters.""" # fmt: off @@ -34,12 +35,13 @@ def run_benchmark( "--backend", "vllm", "--model", model, "--port", str(port), + "--request-rate", "1", "--dataset-name", "random", "--num-prompts", str(params["batch_size"]), "--random-input-len", str(params["input_len"]), "--random-output-len", str(params["output_len"]), "--save-result", - "--result-dir", str(result_dir) + "--result-filename", str(result_filename) ] # fmt: on @@ -64,7 +66,8 @@ def main(): # Create output directory cache_dir = Path(os.environ.get("CACHE_ROOT", "")) - result_dir = cache_dir / "vllm_online_benchmark_results" + timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S") + result_dir = cache_dir / "vllm_online_benchmark_results" / f"results_{timestamp}" result_dir.mkdir(parents=True, exist_ok=True) prompt_client = PromptClient(env_config) @@ -91,7 +94,7 @@ def main(): (2000, 32), (4000, 32), (8100, 32), - (130000, 1024), + # (32000, 1024) ] # Get all benchmark combinations using the original function @@ -99,23 +102,27 @@ def main(): context_lens=typical_context_lens + extra_context_lengths, ) - # Log benchmark plan - logger.info(f"Starting benchmark suite with {len(combinations)} combinations") - for i, combo in enumerate(combinations, 1): - logger.info(f"Combination {i}: {combo}") - # ensure vllm server is ready prompt_client.capture_traces() # Run benchmarks for i, params in enumerate(combinations, 1): + run_timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S") + isl = params["input_len"] + osl = params["output_len"] + bsz = params["batch_size"] + num_prompts = params["num_prompts"] + result_filename = ( + result_dir + / f"vllm_online_benchmark_isl-{isl}_osl-{osl}_bsz-{bsz}_n-{num_prompts}_{run_timestamp}.json" + ) logger.info(f"\nRunning benchmark {i}/{len(combinations)}") run_benchmark( benchmark_script="/home/user/vllm/benchmarks/benchmark_serving.py", params=params, model=env_config.vllm_model, port=env_config.service_port, - result_dir=result_dir, + result_filename=result_filename, ) logger.info("Benchmark suite completed") From 5db2523cec328186309eef54dcb7c2e424e69f51 Mon Sep 17 00:00:00 2001 From: Tom Stesco Date: Mon, 9 Dec 2024 16:16:53 +0000 Subject: [PATCH 29/76] update benchmark output file naming convention --- benchmarking/online_benchmark_prompt_client.py | 2 +- benchmarking/vllm_online_benchmark.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/benchmarking/online_benchmark_prompt_client.py b/benchmarking/online_benchmark_prompt_client.py index ad81230b..db490362 100644 --- a/benchmarking/online_benchmark_prompt_client.py +++ 
b/benchmarking/online_benchmark_prompt_client.py @@ -89,7 +89,7 @@ def run_sequence_length_test( run_timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S") results_file = ( save_path - / f"{file_prefix}_isl-{input_len}_osl-{output_len}_bsz-{batch_size}_n-{num_prompts}_{run_timestamp}.json" + / f"{file_prefix}_{run_timestamp}_isl-{input_len}_osl-{output_len}_bsz-{batch_size}_n-{num_prompts}.json" ) logger.info( diff --git a/benchmarking/vllm_online_benchmark.py b/benchmarking/vllm_online_benchmark.py index c69ce064..5e90291e 100644 --- a/benchmarking/vllm_online_benchmark.py +++ b/benchmarking/vllm_online_benchmark.py @@ -114,7 +114,7 @@ def main(): num_prompts = params["num_prompts"] result_filename = ( result_dir - / f"vllm_online_benchmark_isl-{isl}_osl-{osl}_bsz-{bsz}_n-{num_prompts}_{run_timestamp}.json" + / f"vllm_online_benchmark_{run_timestamp}_isl-{isl}_osl-{osl}_bsz-{bsz}_n-{num_prompts}.json" ) logger.info(f"\nRunning benchmark {i}/{len(combinations)}") run_benchmark( From 5ab742c9cf2ae4a72749ee104a0be6a7541c622f Mon Sep 17 00:00:00 2001 From: Tom Stesco Date: Mon, 9 Dec 2024 16:30:06 +0000 Subject: [PATCH 30/76] rename benchmarking/online_benchmark_prompt_client.py to benchmarking/prompt_client_online_benchmark.py --- ...t.py => prompt_client_online_benchmark.py} | 34 +++++++++---------- benchmarking/vllm_online_benchmark.py | 2 +- 2 files changed, 18 insertions(+), 18 deletions(-) rename benchmarking/{online_benchmark_prompt_client.py => prompt_client_online_benchmark.py} (94%) diff --git a/benchmarking/online_benchmark_prompt_client.py b/benchmarking/prompt_client_online_benchmark.py similarity index 94% rename from benchmarking/online_benchmark_prompt_client.py rename to benchmarking/prompt_client_online_benchmark.py index db490362..4fe6c943 100644 --- a/benchmarking/online_benchmark_prompt_client.py +++ b/benchmarking/prompt_client_online_benchmark.py @@ -180,25 +180,25 @@ def run_sequence_length_test( if __name__ == "__main__": # Define parameter ranges typical_context_lens = [ - (128, 128), - (128, 2048), - (128, 4096), - (2048, 128), - (2048, 2048), - (1000, 1000), - (500, 2000), - (5000, 500), - (20000, 2000), + # (128, 128), + # (128, 2048), + # (128, 4096), + # (2048, 128), + # (2048, 2048), + # (1000, 1000), + # (500, 2000), + # (5000, 500), + # (20000, 2000), ] extra_context_lengths = [ - (128, 2), - (256, 2), - (512, 32), - (1000, 24), - (2000, 32), - (4000, 32), - (8100, 32), - (130000, 1024), + # (128, 2), + # (256, 2), + # (512, 32), + # (1000, 24), + # (2000, 32), + # (4000, 32), + # (8100, 32), + (32760, 1024), ] # Generate all valid combinations upfront combinations = get_test_combinations( diff --git a/benchmarking/vllm_online_benchmark.py b/benchmarking/vllm_online_benchmark.py index 5e90291e..32f43203 100644 --- a/benchmarking/vllm_online_benchmark.py +++ b/benchmarking/vllm_online_benchmark.py @@ -10,7 +10,7 @@ from typing import Dict from pathlib import Path -from benchmarking.online_benchmark_prompt_client import get_test_combinations +from benchmarking.prompt_client_online_benchmark import get_test_combinations from utils.prompt_configs import EnvironmentConfig from utils.prompt_client import PromptClient From 06420bd989c0639a0f1c9d5df6d6695a3b316fc5 Mon Sep 17 00:00:00 2001 From: Tom Stesco Date: Mon, 9 Dec 2024 19:36:02 +0000 Subject: [PATCH 31/76] increase num_prompts default, default to 128/128 online test --- .../prompt_client_online_benchmark.py | 8 ++--- benchmarking/vllm_online_benchmark.py | 35 +++++++++---------- 2 files changed, 21 
insertions(+), 22 deletions(-) diff --git a/benchmarking/prompt_client_online_benchmark.py b/benchmarking/prompt_client_online_benchmark.py index 4fe6c943..5764acdd 100644 --- a/benchmarking/prompt_client_online_benchmark.py +++ b/benchmarking/prompt_client_online_benchmark.py @@ -38,7 +38,7 @@ def get_test_combinations( else: bsz = 1 - num_prompts = max(bsz * 4, 4) + num_prompts = max(bsz * 32, 32) combinations.append( { "input_len": input_len, @@ -53,7 +53,8 @@ def get_test_combinations( for i, combo in enumerate(combinations, 1): logger.info( f"Combination {i}: input_len={combo['input_len']}, " - f"output_len={combo['output_len']}, batch_size={combo['batch_size']}" + f"output_len={combo['output_len']}, batch_size={combo['batch_size']}, " + f"num_prompts={combo['num_prompts']}" ) return combinations @@ -180,7 +181,7 @@ def run_sequence_length_test( if __name__ == "__main__": # Define parameter ranges typical_context_lens = [ - # (128, 128), + (128, 128), # (128, 2048), # (128, 4096), # (2048, 128), @@ -198,7 +199,6 @@ def run_sequence_length_test( # (2000, 32), # (4000, 32), # (8100, 32), - (32760, 1024), ] # Generate all valid combinations upfront combinations = get_test_combinations( diff --git a/benchmarking/vllm_online_benchmark.py b/benchmarking/vllm_online_benchmark.py index 32f43203..3dd38f07 100644 --- a/benchmarking/vllm_online_benchmark.py +++ b/benchmarking/vllm_online_benchmark.py @@ -35,9 +35,9 @@ def run_benchmark( "--backend", "vllm", "--model", model, "--port", str(port), - "--request-rate", "1", + # "--request-rate", "3", "--dataset-name", "random", - "--num-prompts", str(params["batch_size"]), + "--num-prompts", str(params["num_prompts"]), "--random-input-len", str(params["input_len"]), "--random-output-len", str(params["output_len"]), "--save-result", @@ -77,24 +77,23 @@ def main(): # Define benchmarking parameters typical_context_lens = [ (128, 128), - (128, 2048), - (128, 4096), - (2048, 128), - (2048, 2048), - (1000, 1000), - (500, 2000), - (5000, 500), - (20000, 2000), + # (128, 2048), + # (128, 4096), + # (2048, 128), + # (2048, 2048), + # (1000, 1000), + # (500, 2000), + # (5000, 500), + # (20000, 2000), ] extra_context_lengths = [ - (128, 2), - (256, 2), - (512, 32), - (1000, 24), - (2000, 32), - (4000, 32), - (8100, 32), - # (32000, 1024) + # (128, 2), + # (256, 2), + # (512, 32), + # (1000, 24), + # (2000, 32), + # (4000, 32), + # (8100, 32), ] # Get all benchmark combinations using the original function From b7e4cfc7ffab2403d49b62673a122f7d99c302cf Mon Sep 17 00:00:00 2001 From: Tom Stesco Date: Mon, 9 Dec 2024 21:01:14 +0000 Subject: [PATCH 32/76] use min_tokens and ignore_eos=True to force output seq len --- utils/prompt_client.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/utils/prompt_client.py b/utils/prompt_client.py index 455921d3..4eb98ddf 100644 --- a/utils/prompt_client.py +++ b/utils/prompt_client.py @@ -176,7 +176,8 @@ def call_inference( } if force_max_tokens: - json_data["stop"] = "<|reserved_special_token_249|>" + json_data["min_tokens"] = max_tokens + json_data["ignore_eos"] = True req_time = time.perf_counter() response = requests.post( From dda29a9300c6c716c5a0eca2fcf554a693a83421 Mon Sep 17 00:00:00 2001 From: Tom Stesco Date: Mon, 9 Dec 2024 21:42:25 +0000 Subject: [PATCH 33/76] adding min_tokens to locust requests --- locust/locustfile.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/locust/locustfile.py b/locust/locustfile.py index 7942394f..19dd59ee 100644 --- a/locust/locustfile.py +++ 
b/locust/locustfile.py @@ -23,6 +23,7 @@ # Global variable to store data iterator data_iter = None + def get_authorization(): authorization = os.getenv("AUTHORIZATION", None) if authorization is None: @@ -50,12 +51,13 @@ class ServeUser(FastHttpUser): connection_timeout = CONNECTION_TIMEOUT headers = {"Authorization": f"Bearer {get_authorization()}"} - def post_request(self, prompt: str, max_tokens: int): + def post_request(self, prompt: str, max_tokens: int, min_tokens: int): """Helper method to send a POST request to the API with the given prompt and token limit.""" json_data = { "prompt": prompt, **DEFAULT_PARAMS, # Merge default parameters "max_tokens": max_tokens, + "min_tokens": min_tokens, } response = self.client.post(API_ENDPOINT, json=json_data, headers=self.headers) return response @@ -64,4 +66,4 @@ def post_request(self, prompt: str, max_tokens: int): def dataset_test(self): """Test using generated prompts from a data iterator.""" prompt = next(data_iter) - self.post_request(prompt, max_tokens=128) + self.post_request(prompt, max_tokens=128, min_tokens=128) From f8b3033fa22f2a81856cf5b2e90196c30dfe55e2 Mon Sep 17 00:00:00 2001 From: Tom Stesco Date: Tue, 10 Dec 2024 20:20:32 +0000 Subject: [PATCH 34/76] add --ignore-eos to vllm_online_benchmark.py to force the output seq len to be as configured --- benchmarking/vllm_online_benchmark.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/benchmarking/vllm_online_benchmark.py b/benchmarking/vllm_online_benchmark.py index 3dd38f07..1385f108 100644 --- a/benchmarking/vllm_online_benchmark.py +++ b/benchmarking/vllm_online_benchmark.py @@ -40,7 +40,8 @@ def run_benchmark( "--num-prompts", str(params["num_prompts"]), "--random-input-len", str(params["input_len"]), "--random-output-len", str(params["output_len"]), - "--save-result", + "--ignore-eos", # Ignore EOS tokens to force max output length as set + "--save-result", "--result-filename", str(result_filename) ] # fmt: on From 12c38fcb1586344069d65fe1c5d02097ea0077b8 Mon Sep 17 00:00:00 2001 From: Tom Stesco Date: Tue, 10 Dec 2024 20:23:14 +0000 Subject: [PATCH 35/76] add context_lens (isl, osl) pairs to capture_traces() to capture correct traces for performance testing --- .../prompt_client_online_benchmark.py | 13 +++---- benchmarking/vllm_online_benchmark.py | 14 +++---- utils/prompt_client.py | 37 ++++++++++++------- 3 files changed, 34 insertions(+), 30 deletions(-) diff --git a/benchmarking/prompt_client_online_benchmark.py b/benchmarking/prompt_client_online_benchmark.py index 5764acdd..a7da95ae 100644 --- a/benchmarking/prompt_client_online_benchmark.py +++ b/benchmarking/prompt_client_online_benchmark.py @@ -78,7 +78,6 @@ def run_sequence_length_test( # Initialize configurations env_config = EnvironmentConfig(vllm_model=model) prompt_client = PromptClient(env_config) - prompt_client.capture_traces() # Test all combinations total_combinations = len(combinations) @@ -130,6 +129,8 @@ def run_sequence_length_test( batch_processor = BatchProcessor(prompt_client, batch_config) tokenizer = AutoTokenizer.from_pretrained(model) + # pre-capture traces so benchmark does not include 1st run trace capture time + prompt_client.capture_traces(context_lens=[(input_len, output_len)]) # Process batches try: responses = batch_processor.process_batch( @@ -179,8 +180,8 @@ def run_sequence_length_test( if __name__ == "__main__": - # Define parameter ranges - typical_context_lens = [ + # Define benchmarking context length (isl, osl) pairs + context_lens = [ (128, 128), # (128, 
2048), # (128, 4096), @@ -190,8 +191,6 @@ def run_sequence_length_test( # (500, 2000), # (5000, 500), # (20000, 2000), - ] - extra_context_lengths = [ # (128, 2), # (256, 2), # (512, 32), @@ -201,9 +200,7 @@ def run_sequence_length_test( # (8100, 32), ] # Generate all valid combinations upfront - combinations = get_test_combinations( - context_lens=typical_context_lens + extra_context_lengths, - ) + combinations = get_test_combinations(context_lens=context_lens) # Run tests results = run_sequence_length_test( diff --git a/benchmarking/vllm_online_benchmark.py b/benchmarking/vllm_online_benchmark.py index 1385f108..159f8da0 100644 --- a/benchmarking/vllm_online_benchmark.py +++ b/benchmarking/vllm_online_benchmark.py @@ -75,8 +75,8 @@ def main(): # note: there isnt a better way to pass an api key to the vllm benchmarking script os.environ["OPENAI_API_KEY"] = prompt_client._get_authorization() - # Define benchmarking parameters - typical_context_lens = [ + # Define benchmarking context length (isl, osl) pairs + context_lens = [ (128, 128), # (128, 2048), # (128, 4096), @@ -86,8 +86,6 @@ def main(): # (500, 2000), # (5000, 500), # (20000, 2000), - ] - extra_context_lengths = [ # (128, 2), # (256, 2), # (512, 32), @@ -98,12 +96,10 @@ def main(): ] # Get all benchmark combinations using the original function - combinations = get_test_combinations( - context_lens=typical_context_lens + extra_context_lengths, - ) + combinations = get_test_combinations(context_lens=context_lens) - # ensure vllm server is ready - prompt_client.capture_traces() + # pre-capture traces required for benchmarking + prompt_client.capture_traces(context_lens=context_lens) # Run benchmarks for i, params in enumerate(combinations, 1): diff --git a/utils/prompt_client.py b/utils/prompt_client.py index 4eb98ddf..27e01ebd 100644 --- a/utils/prompt_client.py +++ b/utils/prompt_client.py @@ -5,7 +5,7 @@ import logging import json import time -from typing import List +from typing import List, Tuple import requests import jwt @@ -97,27 +97,37 @@ def wait_for_healthy(self, timeout: int = 300, interval: int = 10) -> bool: def capture_traces( self, - input_sizes: List[int] = None, + context_lens: List[Tuple[int, int]] = None, prompts_per_size: int = 1, - output_seq_len: int = 1, ) -> None: logger.info("Capturing input sizes ...") # Default input sizes based on get_padded_prefill_len() - if input_sizes is None: - input_sizes = [32, 64, 128, 256, 512, 1024, 2048, 3072, 4096] + if context_lens is None: + # generate 4 osl tokens by default for each isl + context_lens = [ + (32, 4), + (64, 4), + (128, 4), + (256, 4), + (512, 4), + (1024, 4), + (2048, 4), + (3072, 4), + (4096, 4), + ] # Check service health before starting if not self.wait_for_healthy(): raise RuntimeError("vLLM did not start correctly!") - for size in input_sizes: - logger.info(f"Capture input size: {size}") + for isl, osl in context_lens: + logger.info(f"Capture trace: isl={isl}, osl={osl}") # Create prompt config for current size prompt_config = PromptConfig( - input_seq_len=size, - max_prompt_length=size, + input_seq_len=isl, + max_prompt_length=isl, num_prompts=prompts_per_size, distribution="fixed", dataset="random", @@ -133,20 +143,21 @@ def capture_traces( # Process each prompt for i, (prompt, prompt_len) in enumerate(zip(prompts, prompt_lengths)): try: - logger.info(f"Starting capture for input_seq_len: {prompt_len}") + logger.info( + f"Starting capture for input_seq_len: {prompt_len}, output_seq_len: {osl}" + ) response_data = self.call_inference( prompt=prompt, 
response_idx=i, prompt_len=prompt_len, - max_tokens=output_seq_len, + max_tokens=osl, stream=True, vll_model=self.env_config.vllm_model, tokenizer=None, force_max_tokens=True, ) logger.info( - f"Input size: {size}, " - f"input_seq_len: {prompt_len}, " + f"tokens generated: {response_data['output_seq_len']}, " f"TTFT: {response_data['ttft']:.3f}s" ) except Exception as e: From 1cabdc98bfd8836b7aa869a10474ce4929331323 Mon Sep 17 00:00:00 2001 From: Tom Stesco Date: Tue, 10 Dec 2024 20:33:26 +0000 Subject: [PATCH 36/76] add trace pre-capture to prompt_client_cli.py with option to disable --- benchmarking/README.md | 22 ++++++++++++++++++++++ utils/prompt_client_cli.py | 12 ++++++++++++ 2 files changed, 34 insertions(+) diff --git a/benchmarking/README.md b/benchmarking/README.md index 641436d4..d3360c85 100644 --- a/benchmarking/README.md +++ b/benchmarking/README.md @@ -38,6 +38,20 @@ python examples/offline_inference_tt.py --measure_perf --max_seqs_in_batch 32 -- ### Online Benchmarking +#### single user + +```bash +python utils/prompt_client_cli.py \ + --num_prompts 32 \ + --batch_size 1 \ + --tokenizer_model meta-llama/Llama-3.1-70B-Instruct \ + --max_prompt_length 128 \ + --input_seq_len 128 \ + --output_seq_len 128 \ + --template chat_template \ + --dataset random +``` + #### using vllm/benchmarking/benchmark_serving.py Within the Docker container, use the benchmark_serving.patch file: ``` @@ -89,3 +103,11 @@ Median ITL (ms): 7.83 P99 ITL (ms): 8.05 ================================================== ``` + +#### using tt-inference-server/benchmarking/prompt_client_online_benchmark.py + +```bash +export PYTHONPATH=$PYTHONPATH:$PWD +python benchmarking/prompt_client_online_benchmark.py +``` + diff --git a/utils/prompt_client_cli.py b/utils/prompt_client_cli.py index 3d74f8f5..943b5844 100644 --- a/utils/prompt_client_cli.py +++ b/utils/prompt_client_cli.py @@ -114,6 +114,12 @@ def add_client_args(parser): default=False, help="Print generated prompts.", ) + parser.add_argument( + "--skip_trace_precapture", + action="store_true", + default=False, + help="Print generated prompts.", + ) return parser @@ -159,6 +165,12 @@ def main(): # Generate prompts prompts, input_seq_lengths = generate_prompts(prompt_config) + if not args.skip_trace_precapture: + # pre-capture traces so benchmark does not include 1st run trace capture time + prompt_client.capture_traces( + context_lens=[(args.input_seq_len, args.output_seq_len)] + ) + # Process batches logger.info(f"Starting batch processing with batch_size={batch_config.batch_size}") responses = batch_processor.process_batch( From 68f08d05c4fc74bd85c990f2388f1afc652eed77 Mon Sep 17 00:00:00 2001 From: Tom Stesco Date: Tue, 10 Dec 2024 20:36:13 +0000 Subject: [PATCH 37/76] better comment and logs for trace capture --- utils/prompt_client.py | 2 +- utils/prompt_client_cli.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/utils/prompt_client.py b/utils/prompt_client.py index 27e01ebd..6b86330e 100644 --- a/utils/prompt_client.py +++ b/utils/prompt_client.py @@ -144,7 +144,7 @@ def capture_traces( for i, (prompt, prompt_len) in enumerate(zip(prompts, prompt_lengths)): try: logger.info( - f"Starting capture for input_seq_len: {prompt_len}, output_seq_len: {osl}" + f"Starting trace capture for: input_seq_len:={prompt_len}, output_seq_len:={osl}" ) response_data = self.call_inference( prompt=prompt, diff --git a/utils/prompt_client_cli.py b/utils/prompt_client_cli.py index 943b5844..31d97e6d 100644 --- a/utils/prompt_client_cli.py +++ 
b/utils/prompt_client_cli.py @@ -166,7 +166,7 @@ def main(): prompts, input_seq_lengths = generate_prompts(prompt_config) if not args.skip_trace_precapture: - # pre-capture traces so benchmark does not include 1st run trace capture time + # pre-capture traces to not include 1st run trace capture time prompt_client.capture_traces( context_lens=[(args.input_seq_len, args.output_seq_len)] ) From 962c5077993678b3a2ed5362e19671dd9d8cfec9 Mon Sep 17 00:00:00 2001 From: Tom Stesco Date: Thu, 12 Dec 2024 05:05:53 +0000 Subject: [PATCH 38/76] use TPOT and TPS in benchmarking/prompt_client_online_benchmark.py, add support in client for ITL and TPOT --- .../prompt_client_online_benchmark.py | 25 ++++++++++++------- utils/batch_processor.py | 5 ++-- utils/prompt_client.py | 25 ++++++++++++++----- 3 files changed, 37 insertions(+), 18 deletions(-) diff --git a/benchmarking/prompt_client_online_benchmark.py b/benchmarking/prompt_client_online_benchmark.py index a7da95ae..f4d9ca53 100644 --- a/benchmarking/prompt_client_online_benchmark.py +++ b/benchmarking/prompt_client_online_benchmark.py @@ -140,15 +140,22 @@ def run_sequence_length_test( ) # Calculate statistics + mean_tpot = np.mean([r["time_per_output_token"] for r in responses]) + mean_tpot = max(mean_tpot, 1e-6) # Avoid division by zero + mean_tps = 1.0 / mean_tpot + std_tpot = np.std([r["time_per_output_token"] for r in responses]) + std_tpot = max(std_tpot, 1e-6) # Avoid division by zero + std_tps = mean_tps - 1.0 / (mean_tpot + std_tpot) stats = { "input_seq_len": input_len, "output_seq_len": output_len, "batch_size": batch_size, - "mean_decode_tps": np.mean([r["decode_tps"] for r in responses]), - "mean_total_tps": np.mean([r["total_tps"] for r in responses]), + "total_output_tokens": sum([r["output_seq_len"] for r in responses]), + "mean_tpot": mean_tpot, + "mean_tps": mean_tps, "mean_ttft": np.mean([r["ttft"] for r in responses]), - "std_decode_tps": np.std([r["decode_tps"] for r in responses]), - "std_total_tps": np.std([r["total_tps"] for r in responses]), + "std_tpot": std_tpot, + "std_tps": std_tps, "std_ttft": np.std([r["ttft"] for r in responses]), "num_prompts": num_prompts, "num_iterations": num_iterations, @@ -161,11 +168,11 @@ def run_sequence_length_test( # Log results logger.info( f"Results for combination {idx}/{total_combinations}:\n" - f"Mean Decode TPS: {stats['mean_decode_tps']:.2f} ± " - f"{stats['std_decode_tps']:.2f}\n" - f"Mean Total TPS: {stats['mean_total_tps']:.2f} ± " - f"{stats['std_total_tps']:.2f}\n" - f"Mean TTFT: {stats['mean_ttft']:.2f} ± {stats['std_ttft']:.2f}" + f"Mean TPOT: {stats['mean_tpot']:.4f} ± " + f"{stats['std_tpot']:.4f}\n" + f"Mean user TPS: {stats['mean_tps']:.4f} ± " + f"{stats['std_tps']:.4f}\n" + f"Mean TTFT: {stats['mean_ttft']:.4f} ± {stats['std_ttft']:.4f}" ) # Save results after each combination diff --git a/utils/batch_processor.py b/utils/batch_processor.py index 35ab6652..2c7a68fc 100644 --- a/utils/batch_processor.py +++ b/utils/batch_processor.py @@ -266,9 +266,8 @@ def _log_progress( ): logger.info( f"Processed {response_counter}/{total_prompts} responses. 
" - f"decode_tps: {response_data['decode_tps']:.2f}, " - f"total_tps: {response_data['total_tps']:.2f}, " - f"ttft: {response_data['ttft']:.2f}, " + f"TPOT: {response_data['time_per_output_token']:.4f}, " + f"TTFT: {response_data['ttft']:.4f}, " f"input_seq_len: {response_data['input_seq_len']}, " f"output_seq_len: {response_data['output_seq_len']}" ) diff --git a/utils/prompt_client.py b/utils/prompt_client.py index 6b86330e..1322f130 100644 --- a/utils/prompt_client.py +++ b/utils/prompt_client.py @@ -158,7 +158,8 @@ def capture_traces( ) logger.info( f"tokens generated: {response_data['output_seq_len']}, " - f"TTFT: {response_data['ttft']:.3f}s" + f"TTFT: {response_data['ttft']:.3f}s, " + f"TPOT: {response_data['time_per_output_token']:.3f}s" ) except Exception as e: logger.error(f"Error processing prompt: {e}") @@ -218,6 +219,7 @@ def _process_response( first_token_time = 0 ttft = 0 usage_dict = {} + token_timestamps = [] if stream: assert ( @@ -225,8 +227,9 @@ def _process_response( ), "Response is not chunked" for line in response.iter_lines(decode_unicode=True): if line and line.startswith("data: "): + current_time = time.perf_counter() if num_completion_tokens == 0: - first_token_time = time.perf_counter() + first_token_time = current_time ttft = first_token_time - req_time data_str = line[len("data: ") :].strip() @@ -237,6 +240,7 @@ def _process_response( data = json.loads(data_str) if data["choices"]: full_text += data["choices"][0].get("text", "") + token_timestamps.append(current_time) num_completion_tokens += 1 else: usage_dict = data.get("usage", {}) @@ -249,8 +253,17 @@ def _process_response( usage_dict = data["usage"] first_token_time = req_time - decode_time = max(time.perf_counter() - first_token_time, 0.0001) - total_time = max(time.perf_counter() - req_time, 0.0001) + # Calculate inter-token latencies + inter_token_latencies = [] + if len(token_timestamps) > 1: + inter_token_latencies = [ + token_timestamps[i] - token_timestamps[i - 1] + for i in range(1, len(token_timestamps)) + ] + + gen_time = max(time.perf_counter() - first_token_time, 0.0001) + # discount the TTFT and 1st token time from the generation time + time_per_output_token = gen_time / max(num_completion_tokens - 1, 1) # verify the number of input tokens isl_diff = usage_dict["prompt_tokens"] - prompt_len @@ -281,7 +294,7 @@ def _process_response( "response": full_text, "input_seq_len": prompt_len, "output_seq_len": num_completion_tokens, - "decode_tps": (max(num_completion_tokens, 1)) / decode_time, - "total_tps": (max(num_completion_tokens, 1)) / total_time, + "inter_token_latencies": inter_token_latencies, + "time_per_output_token": time_per_output_token, "ttft": ttft, } From 62bf42764d981edf4f8d873d8022293f852c77c8 Mon Sep 17 00:00:00 2001 From: Tom Stesco Date: Thu, 12 Dec 2024 05:08:48 +0000 Subject: [PATCH 39/76] update utils/prompt_client_cli.py and docs --- utils/README.md | 100 ++++++++++++++++++++++++++++--------- utils/prompt_client_cli.py | 24 ++++++--- 2 files changed, 94 insertions(+), 30 deletions(-) diff --git a/utils/README.md b/utils/README.md index 389e7cea..984fda73 100644 --- a/utils/README.md +++ b/utils/README.md @@ -23,20 +23,73 @@ The prompt client CLI tool allows you to send prompts to a vLLM API server with - `CACHE_ROOT`: Directory for saving response files (default: current directory) - `VLLM_MODEL`: Model name (default: meta-llama/Llama-3.1-70B-Instruct) -#### Key Arguments - -- `--num_prompts`: Number of prompts to generate -- `--batch_size`: Number of concurrent requests -- 
`--max_prompt_length`: Maximum length for generated prompts -- `--output_seq_len`: Maximum length for completions -- `--num_full_iterations`: Number of times to repeat the full prompt set -- `--vary-batch-size`: Randomize batch sizes using normal distribution -- `--input_seq_len`: Fixed length for input sequences (-1 for variable) -- `--inter_batch_delay`: Delay between batches in seconds -- `--no-stream`: Disable streaming responses -- `--dataset`: Source dataset (random, alpaca_eval) -- `--distribution`: Prompt length distribution (fixed, uniform, normal) -- `--template`: Path to Jinja2 template or "chat_template" for model tokenizer default +#### Command Line Arguments + +##### Core Parameters + +- `--num_prompts` (default: 1) + Number of unique prompts to generate for testing. + +- `--batch_size` (default: 32) + Number of concurrent requests to send to the API server. Controls parallelization level. + +- `--num_full_iterations` (default: 1) + Number of complete iterations over the entire prompt set. Useful for extended testing cycles. + +##### Model Configuration + +- `--vllm_model` (default: "meta-llama/Llama-3.1-70B-Instruct") + Model identifier for the vLLM API server. Can be overridden by VLLM_MODEL environment variable. + +- `--tokenizer_model` (default: None) + Specific tokenizer model to use for vocabulary, truncation, and templating operations. + +##### Sequence Length Controls + +- `--input_seq_len` (default: -1) + Length parameter for input sequences when using random prompts. -1 allows variable lengths. + +- `--output_seq_len` (default: 2048) + Forces all completions to a fixed maximum length for consistent testing. + +- `--max_prompt_length` (default: -1) + Maximum allowed length for generated prompts. -1 indicates no length restriction. + +##### Batch Processing Options + +- `--vary_batch_size` (default: False) + When enabled, randomizes the batch size for each prompt batch using normal distribution. + +- `--inter_batch_delay` (default: 0) + Seconds to wait between processing each batch. Useful for rate limiting. + +- `--no-stream` (default: False) + Disables streaming responses. By default, streaming is enabled. + +##### Prompt Generation Settings + +- `--distribution` (default: "fixed") + Method for determining random prompt lengths: + - "fixed": Constant length + - "uniform": Uniform distribution + - "normal": Normal distribution + +- `--dataset` (default: "random") + Source dataset for prompt generation. Use "random" for synthetic prompts. + +- `--template` (default: None) + Jinja2 template for formatting prompts. Can be a file path or template string. + +##### Output Controls + +- `--save_path` (default: None) + File path to save generated prompts in JSONL format. + +- `--print_prompts` (default: False) + Enable printing of generated prompts to stdout. + +- `--skip_trace_precapture` (default: False) + Skips trace precapture phase, use to speed up execution if trace captures have already completed. 
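The options above can be combined freely. Below is a minimal sketch of a rate-limited batch run using the flags documented in this section; all values are illustrative, and it assumes the vLLM server is already up with traces captured (so the precapture phase is skipped):

```bash
# illustrative only: adjust lengths, batch size, and delays for your deployment
python prompt_client_cli.py \
  --num_prompts 64 \
  --batch_size 16 \
  --vary_batch_size \
  --inter_batch_delay 2 \
  --input_seq_len 256 \
  --output_seq_len 512 \
  --dataset random \
  --distribution normal \
  --skip_trace_precapture
```
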
#### Example Usage @@ -54,7 +107,7 @@ python prompt_client_cli.py \ --num_prompts 10 \ --batch_size 4 \ --tokenizer_model meta-llama/Llama-3.1-70B-Instruct \ - --max_prompt_length 512 \ + --input_seq_len 512 \ --output_seq_len 2048 # send prompts from alpaca_eval using chat template from tokenizer @@ -103,13 +156,14 @@ The client saves responses in JSON format with the following structure: ```json { - "response_idx": 0, - "prompt": "example prompt", - "response": "model response", - "prompt_length": 128, - "num_completion_tokens": 256, - "tps": 45.6, - "ttft": 0.15 + "response_idx": number, // Response index in batch + "prompt": string, // Input prompt + "response": string, // Generated completion text + "input_seq_len": number, // Prompt length in tokens + "output_seq_len": number, // Completion length in tokens + "inter_token_latencies": number[], // Per-token generation times in seconds + "time_per_output_token": number, // Average seconds per token + "ttft": number // Time to first token in seconds } ``` @@ -139,7 +193,7 @@ args = SimpleNamespace( input_seq_len=-1, num_prompts=5, distribution="normal", - template="templates/chat.j2", + template="prompt_templates/llama_instruct_example.jinja", save_path="generated_prompts.jsonl", lm_eval_task=None ) diff --git a/utils/prompt_client_cli.py b/utils/prompt_client_cli.py index 31d97e6d..7ab7ed6f 100644 --- a/utils/prompt_client_cli.py +++ b/utils/prompt_client_cli.py @@ -74,7 +74,7 @@ def add_client_args(parser): parser.add_argument( "--max_prompt_length", type=int, - required=True, + default=-1, help="Maximum length of generated prompts.", ) parser.add_argument( @@ -118,7 +118,7 @@ def add_client_args(parser): "--skip_trace_precapture", action="store_true", default=False, - help="Print generated prompts.", + help="Skips trace precapture phase, use to speed up execution if trace captures have already completed.", ) return parser @@ -131,6 +131,16 @@ def main(): parser = add_client_args(parser) args = parser.parse_args() + assert ( + args.max_prompt_length != -1 or args.input_seq_len != -1 + ), "Either --max_prompt_length or --input_seq_len must be provided." 
+ if args.max_prompt_length == -1: + assert args.input_seq_len > 0 + args.max_prompt_length = args.input_seq_len + elif args.input_seq_len == -1: + assert args.max_prompt_length > 0 + args.input_seq_len = args.max_prompt_length + # Create configs from arguments prompt_config = PromptConfig( input_seq_len=args.input_seq_len, @@ -181,12 +191,12 @@ def main(): # Calculate and log summary statistics if responses: - mean_decode_tps = np.mean([r["decode_tps"] for r in responses]) - mean_total_tps = np.mean([r["total_tps"] for r in responses]) + mean_tpot = np.mean([r["time_per_output_token"] for r in responses]) mean_ttft = np.mean([r["ttft"] for r in responses]) - logger.info(f"Mean Decode TPS: {mean_decode_tps:.2f}") - logger.info(f"Mean Total TPS: {mean_total_tps:.2f}") - logger.info(f"Mean TTFT: {mean_ttft:.2f}") + logger.info(f"Mean TTFT: {mean_ttft:.4f}") + logger.info(f"Mean TPOT: {mean_tpot:.4f}") + mean_tps = 1.0 / max(mean_tpot, 1e-6) + logger.info(f"Mean User TPS: {mean_tps:.4f}") if __name__ == "__main__": From d9e163cea98a95e1ebc04fbeabb5e81abe1382e2 Mon Sep 17 00:00:00 2001 From: Tom Stesco Date: Thu, 12 Dec 2024 05:22:25 +0000 Subject: [PATCH 40/76] remove WIP utils/startup_utils.py from this branch --- utils/startup_utils.py | 76 ------------------------------------------ 1 file changed, 76 deletions(-) delete mode 100644 utils/startup_utils.py diff --git a/utils/startup_utils.py b/utils/startup_utils.py deleted file mode 100644 index 05cb616f..00000000 --- a/utils/startup_utils.py +++ /dev/null @@ -1,76 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# -# SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC - -import os -import logging -import subprocess -import psutil -import signal - - -logging.basicConfig( - level=logging.INFO, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s" -) -logger = logging.getLogger(__name__) -logger.setLevel(logging.INFO) - - -class InferenceServerContext: - def __init__(self, startup_script_path): - self.startup_script_path = startup_script_path - - def __enter__(self): - self.process = subprocess.Popen( - ["python", self.startup_script_path], - stdout=subprocess.PIPE, - stderr=subprocess.PIPE, - text=True, - preexec_fn=os.setsid, - ) - return self - - def __exit__(self, exc_type, exc_val, exc_tb): - if not self.process: - return - - # Log initial state - try: - parent = psutil.Process(self.process.pid) - children = parent.children(recursive=True) - logger.info(f"Found {len(children)} child processes before termination") - for child in children: - logger.info(f"Child PID: {child.pid}, Name: {child.name()}") - except psutil.NoSuchProcess: - logger.warning("Main process already terminated") - return - - # Send SIGTERM to process group - try: - os.killpg(self.process.pid, signal.SIGTERM) - logger.info(f"Sent SIGTERM to process group {self.process.pid}") - except ProcessLookupError: - logger.warning("Process group already terminated") - return - - # Wait for graceful shutdown - try: - self.process.wait(timeout=5) - logger.info("Process terminated gracefully") - except subprocess.TimeoutExpired: - logger.warning("Timeout expired, force killing process group") - try: - os.killpg(self.process.pid, signal.SIGKILL) - except ProcessLookupError: - pass - - # Final verification - try: - parent = psutil.Process(self.process.pid) - remaining = parent.children(recursive=True) - if remaining: - logger.error(f"{len(remaining)} child processes still exist") - for proc in remaining: - logger.error(f"Remaining PID: {proc.pid}, Name: {proc.name()}") - except 
psutil.NoSuchProcess: - logger.info("All inference server processes terminated") From cd29085e84416b8182893f3b52227ea6a3f36242 Mon Sep 17 00:00:00 2001 From: Tom Stesco Date: Tue, 31 Dec 2024 19:59:55 +0000 Subject: [PATCH 41/76] adding doc string to BatchProcessor --- utils/batch_processor.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/utils/batch_processor.py b/utils/batch_processor.py index 2c7a68fc..ca81baf5 100644 --- a/utils/batch_processor.py +++ b/utils/batch_processor.py @@ -25,6 +25,17 @@ class BatchProcessor: + """ + BatchProcessor runs multiple concurrent requests to the backend inference + server (vLLM in this case). This adds some functionality for sending requests + with a specific max number of requests allowed that is independent with the + backend batch_size. Mostly this is for testing continous batching and seq lens, + but can be used as an alternative method for benchmarking as in + benchmarking/prompt_client_online_benchmark.py measuring TTFT as experienced + by users by not exceeding the backend concurrent user capacity and having + requests queued on the backend server before processing starts by the model. + """ + def __init__(self, prompt_client: PromptClient, batch_config: BatchConfig): self.prompt_client = prompt_client self.batch_config = batch_config From 376403d28db3b9b2e6c4002a9cbba1c3de01af4c Mon Sep 17 00:00:00 2001 From: Tom Stesco Date: Tue, 31 Dec 2024 20:31:04 +0000 Subject: [PATCH 42/76] add output_path arg to batch_processor.py::BatchProcessor to optionally provide incremental output saveing for debugging, default to not saving output for benchmarking --- utils/batch_processor.py | 49 +++++++++++++++++++--------------------- 1 file changed, 23 insertions(+), 26 deletions(-) diff --git a/utils/batch_processor.py b/utils/batch_processor.py index ca81baf5..cbbfdf3f 100644 --- a/utils/batch_processor.py +++ b/utils/batch_processor.py @@ -6,9 +6,8 @@ import logging import json import time -from datetime import datetime from pathlib import Path -from typing import List +from typing import List, Union from concurrent.futures import ThreadPoolExecutor, as_completed import numpy as np @@ -73,26 +72,22 @@ def process_batch( prompts: List[str], input_seq_lengths: List[int], tokenizer: AutoTokenizer, + output_path: Union[Path, str] = None, ) -> List[dict]: - timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S") - json_fpath = ( - Path(self.prompt_client.env_config.cache_root) - / f"alpaca_eval_responses_{timestamp}.json" - ) - total_prompts = len(prompts) * self.batch_config.num_full_iterations response_counter = 0 all_responses = [] - with open(json_fpath, "a") as f: - f.write("[\n") + if output_path: + with open(output_path, "a") as f: + f.write("[\n") if self.batch_config.batch_size == 1: all_responses = self._process_single_thread( prompts, input_seq_lengths, tokenizer, - json_fpath, + output_path, total_prompts, response_counter, ) @@ -101,13 +96,14 @@ def process_batch( prompts, input_seq_lengths, tokenizer, - json_fpath, + output_path, total_prompts, response_counter, ) - with open(json_fpath, "a") as f: - f.write("\n]") + if output_path: + with open(output_path, "a") as f: + f.write("\n]") return all_responses @@ -116,7 +112,7 @@ def _process_single_thread( prompts: List[str], input_seq_lengths: List[int], tokenizer: AutoTokenizer, - json_fpath: Path, + output_path: Union[Path, str], total_prompts: int, response_counter: int, ) -> List[dict]: @@ -139,7 +135,7 @@ def _process_single_thread( ) self._save_response( - response_data, 
all_responses, json_fpath, response_counter + response_data, all_responses, output_path, response_counter ) response_counter += 1 self._log_progress(response_counter, total_prompts, response_data) @@ -151,7 +147,7 @@ def _process_multi_thread( prompts: List[str], input_seq_lengths: List[int], tokenizer: AutoTokenizer, - json_fpath: Path, + output_path: Union[Path, str], total_prompts: int, response_counter: int, ) -> List[dict]: @@ -172,7 +168,7 @@ def _process_multi_thread( bsz, tokenizer, all_responses, - json_fpath, + output_path, total_prompts, response_counter, ) @@ -202,7 +198,7 @@ def _process_multi_thread( try: response_data = future.result() self._save_response( - response_data, all_responses, json_fpath, response_counter + response_data, all_responses, output_path, response_counter ) response_counter += 1 self._log_progress( @@ -221,7 +217,7 @@ def _process_batch_chunk( batch_size: int, tokenizer: AutoTokenizer, all_responses: List[dict], - json_fpath: Path, + output_path: Union[Path, str], total_prompts: int, response_counter: int, ): @@ -251,7 +247,7 @@ def _process_batch_chunk( try: response_data = future.result() self._save_response( - response_data, all_responses, json_fpath, response_counter + response_data, all_responses, output_path, response_counter ) response_counter += 1 self._log_progress(response_counter, total_prompts, response_data) @@ -262,15 +258,16 @@ def _save_response( self, response_data: dict, all_responses: List[dict], - json_fpath: Path, + output_path: Union[Path, str], response_counter: int, ): with self.responses_lock: all_responses.append(response_data) - with open(json_fpath, "a") as f: - if response_counter > 0: - f.write(",") - json.dump(response_data, f, indent=4) + if output_path: + with open(output_path, "a") as f: + if response_counter > 0: + f.write(",") + json.dump(response_data, f, indent=4) def _log_progress( self, response_counter: int, total_prompts: int, response_data: dict From daf062552976c3d218923c6b4e095c65a857b240 Mon Sep 17 00:00:00 2001 From: Tom Stesco Date: Thu, 12 Dec 2024 15:22:38 +0000 Subject: [PATCH 43/76] adding tests/test_vllm_seq_lens.py to test vllm sequence lengths and batching capacity --- tests/test_vllm_seq_lens.py | 66 +++++++++++++++++++++++++++++++++++++ 1 file changed, 66 insertions(+) create mode 100644 tests/test_vllm_seq_lens.py diff --git a/tests/test_vllm_seq_lens.py b/tests/test_vllm_seq_lens.py new file mode 100644 index 00000000..def803c8 --- /dev/null +++ b/tests/test_vllm_seq_lens.py @@ -0,0 +1,66 @@ +# SPDX-License-Identifier: Apache-2.0 +# +# SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC + +import logging +from typing import Dict + + +import pytest + +from benchmarking.prompt_client_online_benchmark import run_sequence_length_test + +logging.basicConfig( + level=logging.INFO, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s" +) +logger = logging.getLogger(__name__) +logger.setLevel(logging.INFO) + +# Test params +# see: https://github.com/tenstorrent/tt-metal/tree/main/models/demos/t3000/llama3_70b#details + +TEST_paramS = [ + {"input_len": 128, "output_len": 128, "batch_size": 32, "num_prompts": 32}, + {"input_len": 2048, "output_len": 2048, "batch_size": 32, "num_prompts": 32}, + {"input_len": 4000, "output_len": 96, "batch_size": 32, "num_prompts": 32}, + {"input_len": 4096, "output_len": 256, "batch_size": 32, "num_prompts": 32}, + {"input_len": 8000, "output_len": 192, "batch_size": 16, "num_prompts": 16}, + {"input_len": 8192, "output_len": 256, "batch_size": 16, "num_prompts": 
16}, + {"input_len": 32768, "output_len": 32, "batch_size": 1, "num_prompts": 1}, + {"input_len": 32768, "output_len": 98304, "batch_size": 1, "num_prompts": 1}, +] + + +@pytest.mark.parametrize("param", TEST_paramS) +def test_sequence_length(param: Dict[str, int]): + # Run the sequence length test + results = run_sequence_length_test( + combinations=[param], # Pass as single-item list for compatibility + save_dir="vllm_test_seq_lens", + file_prefix="vllm_test_seq_lens", + model="meta-llama/Llama-3.1-70B-Instruct", + ) + + # Add assertions to verify the results + assert results is not None, "Test results should not be None" + + # Verify the results contain expected data + logger.info(f"Results: {results}") + assert isinstance(results, list) + stats = results[0] + assert "input_seq_len" in stats + assert "output_seq_len" in stats + + # Verify the specific param parameters were used + assert stats["input_seq_len"] == param["input_len"] + assert stats["output_seq_len"] == param["output_len"] + assert stats["batch_size"] == param["batch_size"] + assert stats["num_prompts"] == param["num_prompts"] + + # Add specific assertions for the test parameters + assert stats["total_output_tokens"] > 0 + assert stats["mean_tpot"] > 0 + + +if __name__ == "__main__": + pytest.main([__file__, "-v", "--log-cli-level=INFO"]) From f3e34d10b7b624b83997539b64619fdfd7cbbfe2 Mon Sep 17 00:00:00 2001 From: Tom Stesco Date: Fri, 13 Dec 2024 02:54:22 +0000 Subject: [PATCH 44/76] fix TEST_PARAMS --- tests/test_vllm_seq_lens.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/tests/test_vllm_seq_lens.py b/tests/test_vllm_seq_lens.py index def803c8..6d35043d 100644 --- a/tests/test_vllm_seq_lens.py +++ b/tests/test_vllm_seq_lens.py @@ -19,7 +19,8 @@ # Test params # see: https://github.com/tenstorrent/tt-metal/tree/main/models/demos/t3000/llama3_70b#details -TEST_paramS = [ +TEST_PARAMS = [ + # test sequence lengths {"input_len": 128, "output_len": 128, "batch_size": 32, "num_prompts": 32}, {"input_len": 2048, "output_len": 2048, "batch_size": 32, "num_prompts": 32}, {"input_len": 4000, "output_len": 96, "batch_size": 32, "num_prompts": 32}, @@ -28,10 +29,12 @@ {"input_len": 8192, "output_len": 256, "batch_size": 16, "num_prompts": 16}, {"input_len": 32768, "output_len": 32, "batch_size": 1, "num_prompts": 1}, {"input_len": 32768, "output_len": 98304, "batch_size": 1, "num_prompts": 1}, + # test continuous batching + {"input_len": 8190, "output_len": 1024, "batch_size": 32, "num_prompts": 64}, ] -@pytest.mark.parametrize("param", TEST_paramS) +@pytest.mark.parametrize("param", TEST_PARAMS) def test_sequence_length(param: Dict[str, int]): # Run the sequence length test results = run_sequence_length_test( From 4d360eb681d5c884bd4c8d2cf82e01798525f4ae Mon Sep 17 00:00:00 2001 From: Tom Stesco Date: Tue, 17 Dec 2024 02:12:51 +0000 Subject: [PATCH 45/76] adding fixed_batch_size to prompt_client_online_benchmark.py for better single user control --- .../prompt_client_online_benchmark.py | 20 +++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/benchmarking/prompt_client_online_benchmark.py b/benchmarking/prompt_client_online_benchmark.py index f4d9ca53..1e25df8d 100644 --- a/benchmarking/prompt_client_online_benchmark.py +++ b/benchmarking/prompt_client_online_benchmark.py @@ -6,7 +6,7 @@ import logging import numpy as np -from typing import List, Dict, Tuple +from typing import List, Dict, Tuple, Optional import json from datetime import datetime from pathlib import Path @@ 
-26,19 +26,23 @@ def get_test_combinations( context_lens: List[Tuple[int, int]], + fixed_batch_size: Optional[int] = None, ) -> List[Dict[str, int]]: combinations = [] for input_len, output_len in context_lens: # Skip invalid combinations where output_len > input_len context = input_len + output_len - if context <= 4096: - bsz = 32 - elif context <= 8192: - bsz = 16 + if not fixed_batch_size: + if context <= 4096: + bsz = 32 + elif context <= 8192: + bsz = 16 + else: + bsz = 1 else: - bsz = 1 + bsz = fixed_batch_size - num_prompts = max(bsz * 32, 32) + num_prompts = max(bsz * 8, 32) combinations.append( { "input_len": input_len, @@ -207,7 +211,7 @@ def run_sequence_length_test( # (8100, 32), ] # Generate all valid combinations upfront - combinations = get_test_combinations(context_lens=context_lens) + combinations = get_test_combinations(context_lens=context_lens, fixed_batch_size=1) # Run tests results = run_sequence_length_test( From 41dcc22376dfe3228351920d800a328d4a3c7f5d Mon Sep 17 00:00:00 2001 From: Tom Stesco Date: Tue, 17 Dec 2024 23:17:44 +0000 Subject: [PATCH 46/76] use standard output values in ms --- .../prompt_client_online_benchmark.py | 44 ++++++++----------- utils/README.md | 8 ++-- utils/batch_processor.py | 4 +- utils/prompt_client.py | 17 ++++--- utils/prompt_client_cli.py | 2 +- 5 files changed, 36 insertions(+), 39 deletions(-) diff --git a/benchmarking/prompt_client_online_benchmark.py b/benchmarking/prompt_client_online_benchmark.py index 1e25df8d..02f2cb63 100644 --- a/benchmarking/prompt_client_online_benchmark.py +++ b/benchmarking/prompt_client_online_benchmark.py @@ -142,29 +142,25 @@ def run_sequence_length_test( input_seq_lengths=input_seq_lengths, tokenizer=tokenizer, ) - - # Calculate statistics - mean_tpot = np.mean([r["time_per_output_token"] for r in responses]) - mean_tpot = max(mean_tpot, 1e-6) # Avoid division by zero - mean_tps = 1.0 / mean_tpot - std_tpot = np.std([r["time_per_output_token"] for r in responses]) - std_tpot = max(std_tpot, 1e-6) # Avoid division by zero - std_tps = mean_tps - 1.0 / (mean_tpot + std_tpot) + e2e_latency = np.max([r["duration"] for r in responses]) + num_requests = num_prompts * num_iterations stats = { - "input_seq_len": input_len, - "output_seq_len": output_len, + "model_id": model, + "backend": "vllm", + "timestamp": timestamp, + "input_sequence_length": input_len, + "output_sequence_length": output_len, "batch_size": batch_size, + "num_requests": num_requests, + "mean_tpot_ms": np.mean([r["tpot_ms"] for r in responses]), + "std_tpot_ms": np.std([r["tpot_ms"] for r in responses]), + "mean_ttft_ms": np.mean([r["ttft_ms"] for r in responses]), + "std_ttft_ms": np.std([r["ttft_ms"] for r in responses]), + "total_input_tokens": sum([r["input_seq_len"] for r in responses]), "total_output_tokens": sum([r["output_seq_len"] for r in responses]), - "mean_tpot": mean_tpot, - "mean_tps": mean_tps, - "mean_ttft": np.mean([r["ttft"] for r in responses]), - "std_tpot": std_tpot, - "std_tps": std_tps, - "std_ttft": np.std([r["ttft"] for r in responses]), - "num_prompts": num_prompts, + "duration": e2e_latency, "num_iterations": num_iterations, - "timestamp": timestamp, - "combination_index": idx, + "request_throughput": num_requests / e2e_latency, } all_results.append(stats) @@ -172,16 +168,14 @@ def run_sequence_length_test( # Log results logger.info( f"Results for combination {idx}/{total_combinations}:\n" - f"Mean TPOT: {stats['mean_tpot']:.4f} ± " - f"{stats['std_tpot']:.4f}\n" - f"Mean user TPS: {stats['mean_tps']:.4f} ± " - 
f"{stats['std_tps']:.4f}\n" - f"Mean TTFT: {stats['mean_ttft']:.4f} ± {stats['std_ttft']:.4f}" + f"Mean TTFT: {stats['mean_ttft_ms']:.4f} ± {stats['std_ttft_ms']:.4f}" + f"Mean TPOT: {stats['mean_tpot_ms']:.4f} ± " + f"{stats['std_tpot_ms']:.4f}\n" ) # Save results after each combination with open(results_file, "w") as f: - json.dump(all_results, f, indent=4) + json.dump(stats, f, indent=4) except Exception as e: logger.error(f"Error processing combination {idx}: {e}") diff --git a/utils/README.md b/utils/README.md index 984fda73..2afa03e2 100644 --- a/utils/README.md +++ b/utils/README.md @@ -156,14 +156,14 @@ The client saves responses in JSON format with the following structure: ```json { - "response_idx": number, // Response index in batch + "response_idx": number, // Response index in batch "prompt": string, // Input prompt "response": string, // Generated completion text "input_seq_len": number, // Prompt length in tokens "output_seq_len": number, // Completion length in tokens - "inter_token_latencies": number[], // Per-token generation times in seconds - "time_per_output_token": number, // Average seconds per token - "ttft": number // Time to first token in seconds + "itl_ms": number[], // Inter Token Latency (ITL) ms + "tpot_ms": number, // Time Per Output Token (TPOT) average, ms + "ttft_ms": number // Time To First Token (TTFT) ms } ``` diff --git a/utils/batch_processor.py b/utils/batch_processor.py index cbbfdf3f..adedb984 100644 --- a/utils/batch_processor.py +++ b/utils/batch_processor.py @@ -274,8 +274,8 @@ def _log_progress( ): logger.info( f"Processed {response_counter}/{total_prompts} responses. " - f"TPOT: {response_data['time_per_output_token']:.4f}, " - f"TTFT: {response_data['ttft']:.4f}, " + f"TPOT: {response_data['tpot_ms']:.4f}, " + f"TTFT: {response_data['ttft_ms']:.4f}, " f"input_seq_len: {response_data['input_seq_len']}, " f"output_seq_len: {response_data['output_seq_len']}" ) diff --git a/utils/prompt_client.py b/utils/prompt_client.py index 1322f130..7fee601d 100644 --- a/utils/prompt_client.py +++ b/utils/prompt_client.py @@ -158,8 +158,8 @@ def capture_traces( ) logger.info( f"tokens generated: {response_data['output_seq_len']}, " - f"TTFT: {response_data['ttft']:.3f}s, " - f"TPOT: {response_data['time_per_output_token']:.3f}s" + f"TTFT: {response_data['ttft_ms']:.3f} ms, " + f"TPOT: {response_data['tpot_ms']:.3f} ms" ) except Exception as e: logger.error(f"Error processing prompt: {e}") @@ -252,12 +252,14 @@ def _process_response( full_text = data["choices"][0]["text"] usage_dict = data["usage"] first_token_time = req_time + + duration = time.perf_counter() - req_time - # Calculate inter-token latencies + # Calculate inter-token latencies (ms) inter_token_latencies = [] if len(token_timestamps) > 1: inter_token_latencies = [ - token_timestamps[i] - token_timestamps[i - 1] + (token_timestamps[i] - token_timestamps[i - 1]) * 1000.0 for i in range(1, len(token_timestamps)) ] @@ -294,7 +296,8 @@ def _process_response( "response": full_text, "input_seq_len": prompt_len, "output_seq_len": num_completion_tokens, - "inter_token_latencies": inter_token_latencies, - "time_per_output_token": time_per_output_token, - "ttft": ttft, + "itl_ms": inter_token_latencies, + "tpot_ms": time_per_output_token * 1000.0, + "ttft_ms": ttft * 1000.0, + "duration": duration, } diff --git a/utils/prompt_client_cli.py b/utils/prompt_client_cli.py index 7ab7ed6f..d4bf8a59 100644 --- a/utils/prompt_client_cli.py +++ b/utils/prompt_client_cli.py @@ -191,7 +191,7 @@ def main(): # 
Calculate and log summary statistics if responses: - mean_tpot = np.mean([r["time_per_output_token"] for r in responses]) + mean_tpot = np.mean([r["tpot_ms"] for r in responses]) mean_ttft = np.mean([r["ttft"] for r in responses]) logger.info(f"Mean TTFT: {mean_ttft:.4f}") logger.info(f"Mean TPOT: {mean_tpot:.4f}") From 308eeaff19384ba9ec3a602df0556eddacec3da8 Mon Sep 17 00:00:00 2001 From: Tom Stesco Date: Tue, 17 Dec 2024 23:21:18 +0000 Subject: [PATCH 47/76] fix output filepath for prompt_client_online_benchmark.py, remove get_test_combinations in favor of directly specifying them --- .../prompt_client_online_benchmark.py | 83 +++++-------------- benchmarking/vllm_online_benchmark.py | 48 ++++++----- 2 files changed, 45 insertions(+), 86 deletions(-) diff --git a/benchmarking/prompt_client_online_benchmark.py b/benchmarking/prompt_client_online_benchmark.py index 02f2cb63..8bb3f335 100644 --- a/benchmarking/prompt_client_online_benchmark.py +++ b/benchmarking/prompt_client_online_benchmark.py @@ -4,6 +4,7 @@ # # SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC +import os import logging import numpy as np from typing import List, Dict, Tuple, Optional @@ -24,56 +25,16 @@ logger.setLevel(logging.INFO) -def get_test_combinations( - context_lens: List[Tuple[int, int]], - fixed_batch_size: Optional[int] = None, -) -> List[Dict[str, int]]: - combinations = [] - for input_len, output_len in context_lens: - # Skip invalid combinations where output_len > input_len - context = input_len + output_len - if not fixed_batch_size: - if context <= 4096: - bsz = 32 - elif context <= 8192: - bsz = 16 - else: - bsz = 1 - else: - bsz = fixed_batch_size - - num_prompts = max(bsz * 8, 32) - combinations.append( - { - "input_len": input_len, - "output_len": output_len, - "batch_size": bsz, - "num_prompts": num_prompts, - } - ) - - # Log total number of combinations - logger.info(f"Generated {len(combinations)} valid test combinations") - for i, combo in enumerate(combinations, 1): - logger.info( - f"Combination {i}: input_len={combo['input_len']}, " - f"output_len={combo['output_len']}, batch_size={combo['batch_size']}, " - f"num_prompts={combo['num_prompts']}" - ) - - return combinations - - def run_sequence_length_test( combinations: List[Dict[str, int]], - save_dir: str, + result_dir: str, file_prefix: str, num_iterations: int = 1, model: str = "meta-llama/Llama-3.1-70B-Instruct", ) -> List[dict]: # Create save directory timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S") - save_path = Path(save_dir) / f"results_{timestamp}" + save_path = Path(result_dir) / f"results_{timestamp}" save_path.mkdir(parents=True, exist_ok=True) # Initialize results storage @@ -185,31 +146,25 @@ def run_sequence_length_test( if __name__ == "__main__": - # Define benchmarking context length (isl, osl) pairs - context_lens = [ - (128, 128), - # (128, 2048), - # (128, 4096), - # (2048, 128), - # (2048, 2048), - # (1000, 1000), - # (500, 2000), - # (5000, 500), - # (20000, 2000), - # (128, 2), - # (256, 2), - # (512, 32), - # (1000, 24), - # (2000, 32), - # (4000, 32), - # (8100, 32), + + combinations = [ + {"input_len": 128, "output_len": 128, "batch_size": 1, "num_prompts": 32}, + {"input_len": 128, "output_len": 1024, "batch_size": 1, "num_prompts": 32}, + {"input_len": 2048, "output_len": 128, "batch_size": 1, "num_prompts": 32}, + {"input_len": 128, "output_len": 4096, "batch_size": 1, "num_prompts": 32}, + {"input_len": 2048, "output_len": 2048, "batch_size": 1, "num_prompts": 32}, ] - # Generate all valid combinations 
upfront - combinations = get_test_combinations(context_lens=context_lens, fixed_batch_size=1) + + # Create output directory + cache_dir = Path(os.environ.get("CACHE_ROOT", "")) + timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S") + result_dir = cache_dir / "online_benchmark_results" + result_dir.mkdir(parents=True, exist_ok=True) # Run tests results = run_sequence_length_test( combinations=combinations, - save_dir="online_benchmarking", - file_prefix="online_benchmark_results", + result_dir=result_dir, + file_prefix="online_benchmark", + model="meta-llama/Llama-3.1-70B-Instruct", ) diff --git a/benchmarking/vllm_online_benchmark.py b/benchmarking/vllm_online_benchmark.py index 159f8da0..adfe6563 100644 --- a/benchmarking/vllm_online_benchmark.py +++ b/benchmarking/vllm_online_benchmark.py @@ -10,7 +10,6 @@ from typing import Dict from pathlib import Path -from benchmarking.prompt_client_online_benchmark import get_test_combinations from utils.prompt_configs import EnvironmentConfig from utils.prompt_client import PromptClient @@ -35,7 +34,6 @@ def run_benchmark( "--backend", "vllm", "--model", model, "--port", str(port), - # "--request-rate", "3", "--dataset-name", "random", "--num-prompts", str(params["num_prompts"]), "--random-input-len", str(params["input_len"]), @@ -75,28 +73,34 @@ def main(): # note: there isnt a better way to pass an api key to the vllm benchmarking script os.environ["OPENAI_API_KEY"] = prompt_client._get_authorization() - # Define benchmarking context length (isl, osl) pairs - context_lens = [ - (128, 128), - # (128, 2048), - # (128, 4096), - # (2048, 128), - # (2048, 2048), - # (1000, 1000), - # (500, 2000), - # (5000, 500), - # (20000, 2000), - # (128, 2), - # (256, 2), - # (512, 32), - # (1000, 24), - # (2000, 32), - # (4000, 32), - # (8100, 32), + # Get all benchmark combinations using the original function + combinations = [ + {"input_len": 128, "output_len": 128, "batch_size": 32, "num_prompts": 32*8}, + # {"input_len": 128, "output_len": 1024, "batch_size": 32, "num_prompts": 32}, + # {"input_len": 2048, "output_len": 128, "batch_size": 32, "num_prompts": 32}, + # {"input_len": 128, "output_len": 4096, "batch_size": 32, "num_prompts": 32}, + # {"input_len": 2048, "output_len": 2048, "batch_size": 32, "num_prompts": 32}, + # {"input_len": 128, "output_len": 128, "batch_size": 32, "num_prompts": 32}, + # {"input_len": 128, "output_len": 2048, "batch_size": 32, "num_prompts": 32}, + # {"input_len": 128, "output_len": 4096, "batch_size": 32, "num_prompts": 32}, + # {"input_len": 2048, "output_len": 128, "batch_size": 32, "num_prompts": 32}, + # {"input_len": 2048, "output_len": 2048, "batch_size": 32, "num_prompts": 32}, + # {"input_len": 1000, "output_len": 1000, "batch_size": 32, "num_prompts": 32}, + # {"input_len": 500, "output_len": 2000, "batch_size": 32, "num_prompts": 32}, + # {"input_len": 5000, "output_len": 500, "batch_size": 32, "num_prompts": 32}, + # {"input_len": 20000, "output_len": 2000, "batch_size": 32, "num_prompts": 32}, + # {"input_len": 128, "output_len": 2, "batch_size": 32, "num_prompts": 32}, + # {"input_len": 256, "output_len": 2, "batch_size": 32, "num_prompts": 32}, + # {"input_len": 512, "output_len": 32, "batch_size": 32, "num_prompts": 32}, + # {"input_len": 1000, "output_len": 24, "batch_size": 32, "num_prompts": 32}, + # {"input_len": 2000, "output_len": 32, "batch_size": 32, "num_prompts": 32}, + # {"input_len": 4000, "output_len": 32, "batch_size": 32, "num_prompts": 32}, + # {"input_len": 8100, "output_len": 32, 
"batch_size": 32, "num_prompts": 32} ] - # Get all benchmark combinations using the original function - combinations = get_test_combinations(context_lens=context_lens) + context_lens = [(it["input_len"], it["output_len"]) for it in combinations] + # de-dupe + context_lens = list(set(context_lens)) # pre-capture traces required for benchmarking prompt_client.capture_traces(context_lens=context_lens) From e6fc8c4032fc8e8c0a54955d701f6d3c0ae17b84 Mon Sep 17 00:00:00 2001 From: Tom Stesco Date: Tue, 17 Dec 2024 23:23:35 +0000 Subject: [PATCH 48/76] add benchmark output file reader script --- benchmarking/benchmark_output_processor.py | 227 +++++++++++++++++++++ 1 file changed, 227 insertions(+) create mode 100644 benchmarking/benchmark_output_processor.py diff --git a/benchmarking/benchmark_output_processor.py b/benchmarking/benchmark_output_processor.py new file mode 100644 index 00000000..ebc73cee --- /dev/null +++ b/benchmarking/benchmark_output_processor.py @@ -0,0 +1,227 @@ +# SPDX-License-Identifier: Apache-2.0 +# +# SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC + +import json +import glob +import os +from datetime import datetime +import re +from typing import Dict, List, Any +from operator import itemgetter +import argparse +from pathlib import Path + + +DATE_STR_FORMAT = "%Y-%m-%d_%H-%M-%S" + +def parse_args(): + """Parse command line arguments.""" + parser = argparse.ArgumentParser( + description='Process vLLM benchmark results from multiple directories.' + ) + parser.add_argument( + 'directories', + nargs='+', + type=str, + help='One or more directories containing benchmark files' + ) + parser.add_argument( + '--pattern', + type=str, + default='*_benchmark_*.json', + help='File pattern to match (default: vllm_online_benchmark_*.json)' + ) + parser.add_argument( + '--output', + type=str, + default=f"benchmark_results_{datetime.now().strftime(DATE_STR_FORMAT)}.csv", + help='Output CSV file name' + ) + return parser.parse_args() + +def extract_params_from_filename(filename: str) -> Dict[str, Any]: + """ + Extract all parameters from benchmark filename using regex. 
+ Example: vllm_online_benchmark_2024-12-17_13-24-17_isl-128_osl-128_bsz-32_n-32.json + + Returns: + Dictionary containing timestamp and numeric parameters + """ + pattern = r""" + benchmark_ + (?P\d{4}-\d{2}-\d{2}_\d{2}-\d{2}-\d{2}) # Timestamp + _isl-(?P\d+) # Input sequence length + _osl-(?P\d+) # Output sequence length + _bsz-(?P\d+) # Batch size + _n-(?P\d+) # Number of requests + """ + + match = re.search(pattern, filename, re.VERBOSE) + if not match: + raise ValueError(f"Could not extract parameters from filename: {filename}") + + # Convert timestamp string to datetime + timestamp_str = match.group('timestamp') + timestamp = datetime.strptime(timestamp_str, '%Y-%m-%d_%H-%M-%S') + + # Extract and convert numeric parameters + params = { + 'timestamp': timestamp, + 'input_sequence_length': int(match.group('isl')), + 'output_sequence_length': int(match.group('osl')), + 'batch_size': int(match.group('bsz')), + 'num_requests': int(match.group('n')) + } + + return params + +def process_benchmark_file(filepath: str) -> Dict[str, Any]: + """Process a single benchmark file and extract relevant metrics.""" + with open(filepath, 'r') as f: + data = json.load(f) + + filename = os.path.basename(filepath) + + params = extract_params_from_filename(filename) + timestamp = params.pop('timestamp') # Remove timestamp from params dict + + metrics = { + 'filepath': filepath, + 'filename': filename, + 'timestamp': timestamp, + 'model_id': data.get('model_id', ''), + 'backend': data.get('backend', ''), + 'num_prompts': data.get('num_prompts', ''), + 'mean_tpot_ms': data.get('mean_tpot_ms', "n/a"), + 'std_tpot_ms': data.get('std_tpot_ms', "n/a"), + 'mean_ttft_ms': data.get('mean_ttft_ms', "n/a"), + 'std_ttft_ms': data.get('std_ttft_ms', "n/a"), + 'total_input_tokens': data.get('total_input_tokens', "n/a"), + 'total_output_tokens': data.get('total_output_tokens', "n/a"), + 'duration': data.get('duration', "n/a"), + 'request_throughput': data.get('request_throughput', "n/a"), + **params # Unpack the extracted parameters + } + + # Calculate statistics + mean_tpot = max(metrics["mean_tpot_ms"], 1e-6) # Avoid division by zero + mean_tps = 1.0 / mean_tpot + std_tps = mean_tps - (1.0 / (mean_tpot + metrics["std_tpot_ms"])) + metrics["mean_tps"] = mean_tps + metrics["std_tps"] = std_tps + return metrics + +def process_benchmark_files(directories: List[str], pattern: str) -> List[Dict[str, Any]]: + """Process benchmark files from multiple directories matching the given pattern.""" + results = [] + + for directory in directories: + dir_path = Path(directory) + if not dir_path.exists(): + print(f"Warning: Directory not found: {directory}") + continue + + file_pattern = str(dir_path / pattern) + files = glob.glob(file_pattern) + + if not files: + print(f"Warning: No files found matching pattern '{pattern}' in {directory}") + continue + + print(f"Processing {len(files)} files from {directory}") + + for filepath in files: + print(f"Processing: {filepath} ...") + try: + metrics = process_benchmark_file(filepath) + results.append(metrics) + except Exception as e: + print(f"Error processing file {filepath}: {str(e)}") + + if not results: + raise ValueError("No benchmark files were successfully processed") + + # Sort by timestamp + return sorted(results, key=lambda x: x['timestamp']) + +def save_to_csv(results: List[Dict[str, Any]], filename: str) -> None: + """Save results to a CSV file.""" + if not results: + return + + # Get all unique keys from all dictionaries + headers = list(results[0].keys()) + + with open(filename, 
'w') as f: + # Write headers + f.write(','.join(headers) + '\n') + + # Write data + for result in results: + row = [str(result.get(header, '')) for header in headers] + f.write(','.join(row) + '\n') + +def format_markdown_table(results: List[Dict[str, Any]]) -> str: + """Format results as a Markdown table.""" + if not results: + return "" + + # Define columns to display and their headers + display_cols = [ + ('model_id', 'Model ID'), + ('backend', 'Backend'), + ('input_sequence_length', 'ISL'), + ('output_sequence_length', 'OSL'), + ('batch_size', 'Batch Size'), + ('num_requests', 'Requests'), + ('mean_tpot_ms', 'TPOT (ms)'), + ('mean_ttft_ms', 'TTFT (ms)'), + ('request_throughput', 'Throughput (RPS)'), + ] + + # Create header row + header = " | ".join(header for _, header in display_cols) + separator = "|".join(['---'] * len(display_cols)) + + # Create data rows + rows = [] + for result in results: + row_values = [] + for col, _ in display_cols: + value = result.get(col, '') + # Format floats to 2 decimal places + if isinstance(value, float): + value = f"{value:.2f}" + row_values.append(str(value)) + rows.append(" | ".join(row_values)) + + # Combine all parts + markdown_table = f"| {header} |\n| {separator} |\n" + markdown_table += "\n".join(f"| {row} |" for row in rows) + + return markdown_table + + +def main(): + args = parse_args() + + results = process_benchmark_files(args.directories, args.pattern) + + # Display basic statistics + print("\nBenchmark Summary:") + print(f"Total files processed: {len(results)}") + + + # Save to CSV + save_to_csv(results, args.output) + print(f"\nResults saved to: {args.output}") + + # Generate and print Markdown table + print("\nMarkdown Table:\n") + print(format_markdown_table(results)) + print("\n") + + +if __name__ == "__main__": + main() \ No newline at end of file From 6295693cda42417b0d4223ee5936510a81c6c411 Mon Sep 17 00:00:00 2001 From: Tom Stesco Date: Tue, 17 Dec 2024 23:34:37 +0000 Subject: [PATCH 49/76] ruff formatting, rename benchmarking/benchmark_output_processor.py -> benchmarking/benchmark_summary.py --- ...tput_processor.py => benchmark_summary.py} | 169 +++++++++--------- .../prompt_client_online_benchmark.py | 3 +- benchmarking/vllm_online_benchmark.py | 2 +- 3 files changed, 91 insertions(+), 83 deletions(-) rename benchmarking/{benchmark_output_processor.py => benchmark_summary.py} (63%) diff --git a/benchmarking/benchmark_output_processor.py b/benchmarking/benchmark_summary.py similarity index 63% rename from benchmarking/benchmark_output_processor.py rename to benchmarking/benchmark_summary.py index ebc73cee..ba734365 100644 --- a/benchmarking/benchmark_output_processor.py +++ b/benchmarking/benchmark_summary.py @@ -8,43 +8,44 @@ from datetime import datetime import re from typing import Dict, List, Any -from operator import itemgetter import argparse from pathlib import Path DATE_STR_FORMAT = "%Y-%m-%d_%H-%M-%S" + def parse_args(): """Parse command line arguments.""" parser = argparse.ArgumentParser( - description='Process vLLM benchmark results from multiple directories.' + description="Process vLLM benchmark results from multiple directories." 
) parser.add_argument( - 'directories', - nargs='+', + "directories", + nargs="+", type=str, - help='One or more directories containing benchmark files' + help="One or more directories containing benchmark files", ) parser.add_argument( - '--pattern', + "--pattern", type=str, - default='*_benchmark_*.json', - help='File pattern to match (default: vllm_online_benchmark_*.json)' + default="*_benchmark_*.json", + help="File pattern to match (default: vllm_online_benchmark_*.json)", ) parser.add_argument( - '--output', + "--output", type=str, default=f"benchmark_results_{datetime.now().strftime(DATE_STR_FORMAT)}.csv", - help='Output CSV file name' + help="Output CSV file name", ) return parser.parse_args() + def extract_params_from_filename(filename: str) -> Dict[str, Any]: """ Extract all parameters from benchmark filename using regex. Example: vllm_online_benchmark_2024-12-17_13-24-17_isl-128_osl-128_bsz-32_n-32.json - + Returns: Dictionary containing timestamp and numeric parameters """ @@ -56,81 +57,87 @@ def extract_params_from_filename(filename: str) -> Dict[str, Any]: _bsz-(?P\d+) # Batch size _n-(?P\d+) # Number of requests """ - + match = re.search(pattern, filename, re.VERBOSE) if not match: raise ValueError(f"Could not extract parameters from filename: {filename}") - + # Convert timestamp string to datetime - timestamp_str = match.group('timestamp') - timestamp = datetime.strptime(timestamp_str, '%Y-%m-%d_%H-%M-%S') - + timestamp_str = match.group("timestamp") + timestamp = datetime.strptime(timestamp_str, "%Y-%m-%d_%H-%M-%S") + # Extract and convert numeric parameters params = { - 'timestamp': timestamp, - 'input_sequence_length': int(match.group('isl')), - 'output_sequence_length': int(match.group('osl')), - 'batch_size': int(match.group('bsz')), - 'num_requests': int(match.group('n')) + "timestamp": timestamp, + "input_sequence_length": int(match.group("isl")), + "output_sequence_length": int(match.group("osl")), + "batch_size": int(match.group("bsz")), + "num_requests": int(match.group("n")), } - + return params + def process_benchmark_file(filepath: str) -> Dict[str, Any]: """Process a single benchmark file and extract relevant metrics.""" - with open(filepath, 'r') as f: + with open(filepath, "r") as f: data = json.load(f) - + filename = os.path.basename(filepath) params = extract_params_from_filename(filename) - timestamp = params.pop('timestamp') # Remove timestamp from params dict + timestamp = params.pop("timestamp") # Remove timestamp from params dict metrics = { - 'filepath': filepath, - 'filename': filename, - 'timestamp': timestamp, - 'model_id': data.get('model_id', ''), - 'backend': data.get('backend', ''), - 'num_prompts': data.get('num_prompts', ''), - 'mean_tpot_ms': data.get('mean_tpot_ms', "n/a"), - 'std_tpot_ms': data.get('std_tpot_ms', "n/a"), - 'mean_ttft_ms': data.get('mean_ttft_ms', "n/a"), - 'std_ttft_ms': data.get('std_ttft_ms', "n/a"), - 'total_input_tokens': data.get('total_input_tokens', "n/a"), - 'total_output_tokens': data.get('total_output_tokens', "n/a"), - 'duration': data.get('duration', "n/a"), - 'request_throughput': data.get('request_throughput', "n/a"), - **params # Unpack the extracted parameters + "filepath": filepath, + "filename": filename, + "timestamp": timestamp, + "model_id": data.get("model_id", ""), + "backend": data.get("backend", ""), + "num_prompts": data.get("num_prompts", ""), + "mean_tpot_ms": data.get("mean_tpot_ms", "n/a"), + "std_tpot_ms": data.get("std_tpot_ms", "n/a"), + "mean_ttft_ms": data.get("mean_ttft_ms", "n/a"), + 
"std_ttft_ms": data.get("std_ttft_ms", "n/a"), + "total_input_tokens": data.get("total_input_tokens", "n/a"), + "total_output_tokens": data.get("total_output_tokens", "n/a"), + "duration": data.get("duration", "n/a"), + "request_throughput": data.get("request_throughput", "n/a"), + **params, # Unpack the extracted parameters } # Calculate statistics mean_tpot = max(metrics["mean_tpot_ms"], 1e-6) # Avoid division by zero - mean_tps = 1.0 / mean_tpot - std_tps = mean_tps - (1.0 / (mean_tpot + metrics["std_tpot_ms"])) + mean_tps = 1000.0 / mean_tpot + std_tps = mean_tps - (1000.0 / (mean_tpot + metrics["std_tpot_ms"])) metrics["mean_tps"] = mean_tps metrics["std_tps"] = std_tps return metrics -def process_benchmark_files(directories: List[str], pattern: str) -> List[Dict[str, Any]]: + +def process_benchmark_files( + directories: List[str], pattern: str +) -> List[Dict[str, Any]]: """Process benchmark files from multiple directories matching the given pattern.""" results = [] - + for directory in directories: dir_path = Path(directory) if not dir_path.exists(): print(f"Warning: Directory not found: {directory}") continue - + file_pattern = str(dir_path / pattern) files = glob.glob(file_pattern) - + if not files: - print(f"Warning: No files found matching pattern '{pattern}' in {directory}") + print( + f"Warning: No files found matching pattern '{pattern}' in {directory}" + ) continue - + print(f"Processing {len(files)} files from {directory}") - + for filepath in files: print(f"Processing: {filepath} ...") try: @@ -138,12 +145,13 @@ def process_benchmark_files(directories: List[str], pattern: str) -> List[Dict[s results.append(metrics) except Exception as e: print(f"Error processing file {filepath}: {str(e)}") - + if not results: raise ValueError("No benchmark files were successfully processed") - + # Sort by timestamp - return sorted(results, key=lambda x: x['timestamp']) + return sorted(results, key=lambda x: x["timestamp"]) + def save_to_csv(results: List[Dict[str, Any]], filename: str) -> None: """Save results to a CSV file.""" @@ -152,76 +160,77 @@ def save_to_csv(results: List[Dict[str, Any]], filename: str) -> None: # Get all unique keys from all dictionaries headers = list(results[0].keys()) - - with open(filename, 'w') as f: + + with open(filename, "w") as f: # Write headers - f.write(','.join(headers) + '\n') - + f.write(",".join(headers) + "\n") + # Write data for result in results: - row = [str(result.get(header, '')) for header in headers] - f.write(','.join(row) + '\n') + row = [str(result.get(header, "")) for header in headers] + f.write(",".join(row) + "\n") + def format_markdown_table(results: List[Dict[str, Any]]) -> str: """Format results as a Markdown table.""" if not results: return "" - + # Define columns to display and their headers display_cols = [ - ('model_id', 'Model ID'), - ('backend', 'Backend'), - ('input_sequence_length', 'ISL'), - ('output_sequence_length', 'OSL'), - ('batch_size', 'Batch Size'), - ('num_requests', 'Requests'), - ('mean_tpot_ms', 'TPOT (ms)'), - ('mean_ttft_ms', 'TTFT (ms)'), - ('request_throughput', 'Throughput (RPS)'), + ("model_id", "Model ID"), + ("backend", "Backend"), + ("input_sequence_length", "ISL"), + ("output_sequence_length", "OSL"), + ("batch_size", "Batch Size"), + ("num_requests", "Requests"), + ("mean_ttft_ms", "TTFT (ms)"), + ("mean_tpot_ms", "TPOT (ms)"), + ("mean_tps", "TPS (user)"), + ("request_throughput", "Request Throughput (RPS)"), ] - + # Create header row header = " | ".join(header for _, header in display_cols) - 
separator = "|".join(['---'] * len(display_cols)) - + separator = "|".join(["---"] * len(display_cols)) + # Create data rows rows = [] for result in results: row_values = [] for col, _ in display_cols: - value = result.get(col, '') + value = result.get(col, "") # Format floats to 2 decimal places if isinstance(value, float): value = f"{value:.2f}" row_values.append(str(value)) rows.append(" | ".join(row_values)) - + # Combine all parts markdown_table = f"| {header} |\n| {separator} |\n" markdown_table += "\n".join(f"| {row} |" for row in rows) - + return markdown_table def main(): args = parse_args() - + results = process_benchmark_files(args.directories, args.pattern) - + # Display basic statistics print("\nBenchmark Summary:") print(f"Total files processed: {len(results)}") - # Save to CSV save_to_csv(results, args.output) print(f"\nResults saved to: {args.output}") - + # Generate and print Markdown table print("\nMarkdown Table:\n") print(format_markdown_table(results)) - print("\n") + print("Note: all metrics are means across benchmark run unless otherwise stated.\n") if __name__ == "__main__": - main() \ No newline at end of file + main() diff --git a/benchmarking/prompt_client_online_benchmark.py b/benchmarking/prompt_client_online_benchmark.py index 8bb3f335..7f8bc71d 100644 --- a/benchmarking/prompt_client_online_benchmark.py +++ b/benchmarking/prompt_client_online_benchmark.py @@ -7,7 +7,7 @@ import os import logging import numpy as np -from typing import List, Dict, Tuple, Optional +from typing import List, Dict import json from datetime import datetime from pathlib import Path @@ -146,7 +146,6 @@ def run_sequence_length_test( if __name__ == "__main__": - combinations = [ {"input_len": 128, "output_len": 128, "batch_size": 1, "num_prompts": 32}, {"input_len": 128, "output_len": 1024, "batch_size": 1, "num_prompts": 32}, diff --git a/benchmarking/vllm_online_benchmark.py b/benchmarking/vllm_online_benchmark.py index adfe6563..e7aa0bfa 100644 --- a/benchmarking/vllm_online_benchmark.py +++ b/benchmarking/vllm_online_benchmark.py @@ -75,7 +75,7 @@ def main(): # Get all benchmark combinations using the original function combinations = [ - {"input_len": 128, "output_len": 128, "batch_size": 32, "num_prompts": 32*8}, + {"input_len": 128, "output_len": 128, "batch_size": 32, "num_prompts": 32 * 8}, # {"input_len": 128, "output_len": 1024, "batch_size": 32, "num_prompts": 32}, # {"input_len": 2048, "output_len": 128, "batch_size": 32, "num_prompts": 32}, # {"input_len": 128, "output_len": 4096, "batch_size": 32, "num_prompts": 32}, From 8963a12d3043a1243f122c02e8ed9c6a3bfdd640 Mon Sep 17 00:00:00 2001 From: Tom Stesco Date: Wed, 18 Dec 2024 02:11:24 +0000 Subject: [PATCH 50/76] add percentile-metrics to add e2els stats --- benchmarking/vllm_online_benchmark.py | 1 + 1 file changed, 1 insertion(+) diff --git a/benchmarking/vllm_online_benchmark.py b/benchmarking/vllm_online_benchmark.py index e7aa0bfa..c3305680 100644 --- a/benchmarking/vllm_online_benchmark.py +++ b/benchmarking/vllm_online_benchmark.py @@ -39,6 +39,7 @@ def run_benchmark( "--random-input-len", str(params["input_len"]), "--random-output-len", str(params["output_len"]), "--ignore-eos", # Ignore EOS tokens to force max output length as set + "--percentile-metrics", "ttft,tpot,itl,e2els", # must add e2els in order for it to be logged "--save-result", "--result-filename", str(result_filename) ] From fc8eb06291ef4eb8056d70f9add076efa32bc52c Mon Sep 17 00:00:00 2001 From: Tom Stesco Date: Wed, 18 Dec 2024 02:12:34 +0000 
Subject: [PATCH 51/76] add latency to benchmarking/prompt_client_online_benchmark.py and summary support --- benchmarking/benchmark_summary.py | 5 +++-- benchmarking/prompt_client_online_benchmark.py | 14 +++++++------- utils/prompt_client.py | 6 +++--- 3 files changed, 13 insertions(+), 12 deletions(-) diff --git a/benchmarking/benchmark_summary.py b/benchmarking/benchmark_summary.py index ba734365..d5781b96 100644 --- a/benchmarking/benchmark_summary.py +++ b/benchmarking/benchmark_summary.py @@ -101,7 +101,7 @@ def process_benchmark_file(filepath: str) -> Dict[str, Any]: "std_ttft_ms": data.get("std_ttft_ms", "n/a"), "total_input_tokens": data.get("total_input_tokens", "n/a"), "total_output_tokens": data.get("total_output_tokens", "n/a"), - "duration": data.get("duration", "n/a"), + "mean_e2el_ms": data.get("mean_e2el_ms", "n/a"), "request_throughput": data.get("request_throughput", "n/a"), **params, # Unpack the extracted parameters } @@ -183,10 +183,11 @@ def format_markdown_table(results: List[Dict[str, Any]]) -> str: ("input_sequence_length", "ISL"), ("output_sequence_length", "OSL"), ("batch_size", "Batch Size"), - ("num_requests", "Requests"), + ("num_requests", "Num Requests"), ("mean_ttft_ms", "TTFT (ms)"), ("mean_tpot_ms", "TPOT (ms)"), ("mean_tps", "TPS (user)"), + ("mean_e2el_ms", "Request latency"), ("request_throughput", "Request Throughput (RPS)"), ] diff --git a/benchmarking/prompt_client_online_benchmark.py b/benchmarking/prompt_client_online_benchmark.py index 7f8bc71d..385bff19 100644 --- a/benchmarking/prompt_client_online_benchmark.py +++ b/benchmarking/prompt_client_online_benchmark.py @@ -103,7 +103,7 @@ def run_sequence_length_test( input_seq_lengths=input_seq_lengths, tokenizer=tokenizer, ) - e2e_latency = np.max([r["duration"] for r in responses]) + mean_e2el_ms = np.mean([r["latency"] for r in responses]) * 1000.0 num_requests = num_prompts * num_iterations stats = { "model_id": model, @@ -119,9 +119,9 @@ def run_sequence_length_test( "std_ttft_ms": np.std([r["ttft_ms"] for r in responses]), "total_input_tokens": sum([r["input_seq_len"] for r in responses]), "total_output_tokens": sum([r["output_seq_len"] for r in responses]), - "duration": e2e_latency, + "mean_e2el_ms": mean_e2el_ms, "num_iterations": num_iterations, - "request_throughput": num_requests / e2e_latency, + "request_throughput": num_requests / mean_e2el_ms, } all_results.append(stats) @@ -147,11 +147,11 @@ def run_sequence_length_test( if __name__ == "__main__": combinations = [ - {"input_len": 128, "output_len": 128, "batch_size": 1, "num_prompts": 32}, - {"input_len": 128, "output_len": 1024, "batch_size": 1, "num_prompts": 32}, + # {"input_len": 128, "output_len": 128, "batch_size": 1, "num_prompts": 32}, + {"input_len": 128, "output_len": 1024, "batch_size": 1, "num_prompts": 16}, {"input_len": 2048, "output_len": 128, "batch_size": 1, "num_prompts": 32}, - {"input_len": 128, "output_len": 4096, "batch_size": 1, "num_prompts": 32}, - {"input_len": 2048, "output_len": 2048, "batch_size": 1, "num_prompts": 32}, + {"input_len": 128, "output_len": 4096, "batch_size": 1, "num_prompts": 8}, + {"input_len": 2048, "output_len": 2048, "batch_size": 1, "num_prompts": 8}, ] # Create output directory diff --git a/utils/prompt_client.py b/utils/prompt_client.py index 7fee601d..72c3431a 100644 --- a/utils/prompt_client.py +++ b/utils/prompt_client.py @@ -252,8 +252,8 @@ def _process_response( full_text = data["choices"][0]["text"] usage_dict = data["usage"] first_token_time = req_time - - duration = 
time.perf_counter() - req_time + + latency = time.perf_counter() - req_time # Calculate inter-token latencies (ms) inter_token_latencies = [] @@ -299,5 +299,5 @@ def _process_response( "itl_ms": inter_token_latencies, "tpot_ms": time_per_output_token * 1000.0, "ttft_ms": ttft * 1000.0, - "duration": duration, + "latency": latency, } From 6c4d0925e0d850809454266ff4100e4edd1473d1 Mon Sep 17 00:00:00 2001 From: Tom Stesco Date: Wed, 18 Dec 2024 05:07:20 +0000 Subject: [PATCH 52/76] support latency measurement with mean_e2el_ms --- benchmarking/benchmark_summary.py | 2 +- benchmarking/vllm_online_benchmark.py | 12 ++++++------ 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/benchmarking/benchmark_summary.py b/benchmarking/benchmark_summary.py index d5781b96..a9225e18 100644 --- a/benchmarking/benchmark_summary.py +++ b/benchmarking/benchmark_summary.py @@ -187,7 +187,7 @@ def format_markdown_table(results: List[Dict[str, Any]]) -> str: ("mean_ttft_ms", "TTFT (ms)"), ("mean_tpot_ms", "TPOT (ms)"), ("mean_tps", "TPS (user)"), - ("mean_e2el_ms", "Request latency"), + ("mean_e2el_ms", "Request latency (ms)"), ("request_throughput", "Request Throughput (RPS)"), ] diff --git a/benchmarking/vllm_online_benchmark.py b/benchmarking/vllm_online_benchmark.py index c3305680..35aed1e6 100644 --- a/benchmarking/vllm_online_benchmark.py +++ b/benchmarking/vllm_online_benchmark.py @@ -39,7 +39,7 @@ def run_benchmark( "--random-input-len", str(params["input_len"]), "--random-output-len", str(params["output_len"]), "--ignore-eos", # Ignore EOS tokens to force max output length as set - "--percentile-metrics", "ttft,tpot,itl,e2els", # must add e2els in order for it to be logged + "--percentile-metrics", "ttft,tpot,itl,e2el", # must add e2el in order for it to be logged "--save-result", "--result-filename", str(result_filename) ] @@ -76,11 +76,11 @@ def main(): # Get all benchmark combinations using the original function combinations = [ - {"input_len": 128, "output_len": 128, "batch_size": 32, "num_prompts": 32 * 8}, - # {"input_len": 128, "output_len": 1024, "batch_size": 32, "num_prompts": 32}, - # {"input_len": 2048, "output_len": 128, "batch_size": 32, "num_prompts": 32}, - # {"input_len": 128, "output_len": 4096, "batch_size": 32, "num_prompts": 32}, - # {"input_len": 2048, "output_len": 2048, "batch_size": 32, "num_prompts": 32}, + {"input_len": 128, "output_len": 128, "batch_size": 32, "num_prompts": 32}, + {"input_len": 128, "output_len": 1024, "batch_size": 32, "num_prompts": 32}, + {"input_len": 2048, "output_len": 128, "batch_size": 32, "num_prompts": 32}, + {"input_len": 128, "output_len": 4096, "batch_size": 32, "num_prompts": 32}, + {"input_len": 2048, "output_len": 2048, "batch_size": 32, "num_prompts": 32}, # {"input_len": 128, "output_len": 128, "batch_size": 32, "num_prompts": 32}, # {"input_len": 128, "output_len": 2048, "batch_size": 32, "num_prompts": 32}, # {"input_len": 128, "output_len": 4096, "batch_size": 32, "num_prompts": 32}, From d8ec6828c308e1a42749d52f8a4292b48d822685 Mon Sep 17 00:00:00 2001 From: Tom Stesco Date: Wed, 18 Dec 2024 05:13:08 +0000 Subject: [PATCH 53/76] update benchmark sweeps --- benchmarking/prompt_client_online_benchmark.py | 17 ++++++++++++----- benchmarking/vllm_online_benchmark.py | 7 ------- 2 files changed, 12 insertions(+), 12 deletions(-) diff --git a/benchmarking/prompt_client_online_benchmark.py b/benchmarking/prompt_client_online_benchmark.py index 385bff19..9842d782 100644 --- a/benchmarking/prompt_client_online_benchmark.py +++ 
b/benchmarking/prompt_client_online_benchmark.py @@ -147,11 +147,18 @@ def run_sequence_length_test( if __name__ == "__main__": combinations = [ - # {"input_len": 128, "output_len": 128, "batch_size": 1, "num_prompts": 32}, - {"input_len": 128, "output_len": 1024, "batch_size": 1, "num_prompts": 16}, - {"input_len": 2048, "output_len": 128, "batch_size": 1, "num_prompts": 32}, - {"input_len": 128, "output_len": 4096, "batch_size": 1, "num_prompts": 8}, - {"input_len": 2048, "output_len": 2048, "batch_size": 1, "num_prompts": 8}, + # sweeps for batch-1 + {"input_len": 128, "output_len": 128, "batch_size": 1, "num_prompts": 16}, + {"input_len": 128, "output_len": 1024, "batch_size": 1, "num_prompts": 8}, + {"input_len": 2048, "output_len": 128, "batch_size": 1, "num_prompts": 8}, + {"input_len": 128, "output_len": 4096, "batch_size": 1, "num_prompts": 4}, + {"input_len": 2048, "output_len": 2048, "batch_size": 1, "num_prompts": 4}, + # sweeps for batch-32 + {"input_len": 128, "output_len": 128, "batch_size": 32, "num_prompts": 32}, + {"input_len": 128, "output_len": 1024, "batch_size": 32, "num_prompts": 32}, + {"input_len": 2048, "output_len": 128, "batch_size": 32, "num_prompts": 32}, + {"input_len": 128, "output_len": 4096, "batch_size": 32, "num_prompts": 32}, + {"input_len": 2048, "output_len": 2048, "batch_size": 32, "num_prompts": 32}, ] # Create output directory diff --git a/benchmarking/vllm_online_benchmark.py b/benchmarking/vllm_online_benchmark.py index 35aed1e6..faca9e67 100644 --- a/benchmarking/vllm_online_benchmark.py +++ b/benchmarking/vllm_online_benchmark.py @@ -90,13 +90,6 @@ def main(): # {"input_len": 500, "output_len": 2000, "batch_size": 32, "num_prompts": 32}, # {"input_len": 5000, "output_len": 500, "batch_size": 32, "num_prompts": 32}, # {"input_len": 20000, "output_len": 2000, "batch_size": 32, "num_prompts": 32}, - # {"input_len": 128, "output_len": 2, "batch_size": 32, "num_prompts": 32}, - # {"input_len": 256, "output_len": 2, "batch_size": 32, "num_prompts": 32}, - # {"input_len": 512, "output_len": 32, "batch_size": 32, "num_prompts": 32}, - # {"input_len": 1000, "output_len": 24, "batch_size": 32, "num_prompts": 32}, - # {"input_len": 2000, "output_len": 32, "batch_size": 32, "num_prompts": 32}, - # {"input_len": 4000, "output_len": 32, "batch_size": 32, "num_prompts": 32}, - # {"input_len": 8100, "output_len": 32, "batch_size": 32, "num_prompts": 32} ] context_lens = [(it["input_len"], it["output_len"]) for it in combinations] From ffaabd6ed7c54b8474f946ea27569bf81587f288 Mon Sep 17 00:00:00 2001 From: Tom Stesco Date: Wed, 18 Dec 2024 06:26:44 +0000 Subject: [PATCH 54/76] update sweeps context lengths --- benchmarking/prompt_client_online_benchmark.py | 2 ++ benchmarking/vllm_online_benchmark.py | 1 + 2 files changed, 3 insertions(+) diff --git a/benchmarking/prompt_client_online_benchmark.py b/benchmarking/prompt_client_online_benchmark.py index 9842d782..b2ee0b7c 100644 --- a/benchmarking/prompt_client_online_benchmark.py +++ b/benchmarking/prompt_client_online_benchmark.py @@ -151,12 +151,14 @@ def run_sequence_length_test( {"input_len": 128, "output_len": 128, "batch_size": 1, "num_prompts": 16}, {"input_len": 128, "output_len": 1024, "batch_size": 1, "num_prompts": 8}, {"input_len": 2048, "output_len": 128, "batch_size": 1, "num_prompts": 8}, + {"input_len": 128, "output_len": 2048, "batch_size": 1, "num_prompts": 8}, {"input_len": 128, "output_len": 4096, "batch_size": 1, "num_prompts": 4}, {"input_len": 2048, "output_len": 2048, "batch_size": 1, 
"num_prompts": 4}, # sweeps for batch-32 {"input_len": 128, "output_len": 128, "batch_size": 32, "num_prompts": 32}, {"input_len": 128, "output_len": 1024, "batch_size": 32, "num_prompts": 32}, {"input_len": 2048, "output_len": 128, "batch_size": 32, "num_prompts": 32}, + {"input_len": 128, "output_len": 2048, "batch_size": 32, "num_prompts": 32}, {"input_len": 128, "output_len": 4096, "batch_size": 32, "num_prompts": 32}, {"input_len": 2048, "output_len": 2048, "batch_size": 32, "num_prompts": 32}, ] diff --git a/benchmarking/vllm_online_benchmark.py b/benchmarking/vllm_online_benchmark.py index faca9e67..3be639e0 100644 --- a/benchmarking/vllm_online_benchmark.py +++ b/benchmarking/vllm_online_benchmark.py @@ -79,6 +79,7 @@ def main(): {"input_len": 128, "output_len": 128, "batch_size": 32, "num_prompts": 32}, {"input_len": 128, "output_len": 1024, "batch_size": 32, "num_prompts": 32}, {"input_len": 2048, "output_len": 128, "batch_size": 32, "num_prompts": 32}, + {"input_len": 128, "output_len": 2048, "batch_size": 32, "num_prompts": 32}, {"input_len": 128, "output_len": 4096, "batch_size": 32, "num_prompts": 32}, {"input_len": 2048, "output_len": 2048, "batch_size": 32, "num_prompts": 32}, # {"input_len": 128, "output_len": 128, "batch_size": 32, "num_prompts": 32}, From 4602ff3d96e38eb5a95c1882f82c5a72362acc04 Mon Sep 17 00:00:00 2001 From: Tom Stesco Date: Wed, 18 Dec 2024 06:42:08 +0000 Subject: [PATCH 55/76] model id as header not in table --- benchmarking/benchmark_summary.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/benchmarking/benchmark_summary.py b/benchmarking/benchmark_summary.py index a9225e18..c2d6c7ea 100644 --- a/benchmarking/benchmark_summary.py +++ b/benchmarking/benchmark_summary.py @@ -178,8 +178,6 @@ def format_markdown_table(results: List[Dict[str, Any]]) -> str: # Define columns to display and their headers display_cols = [ - ("model_id", "Model ID"), - ("backend", "Backend"), ("input_sequence_length", "ISL"), ("output_sequence_length", "OSL"), ("batch_size", "Batch Size"), @@ -229,6 +227,9 @@ def main(): # Generate and print Markdown table print("\nMarkdown Table:\n") + + print(f"Model ID: {results[0].get('model_id')}") + print(f"Backend: {results[0].get('backend')}") print(format_markdown_table(results)) print("Note: all metrics are means across benchmark run unless otherwise stated.\n") From 594b9a1e7b07aca50e65d92a8a6abf706dd05183 Mon Sep 17 00:00:00 2001 From: Tom Stesco Date: Wed, 18 Dec 2024 18:13:00 +0000 Subject: [PATCH 56/76] add better formatting in benchmark_summary.py, update iso/osl sweeps --- benchmarking/benchmark_summary.py | 112 +++++++++++++----- .../prompt_client_online_benchmark.py | 9 +- benchmarking/vllm_online_benchmark.py | 15 ++- 3 files changed, 96 insertions(+), 40 deletions(-) diff --git a/benchmarking/benchmark_summary.py b/benchmarking/benchmark_summary.py index c2d6c7ea..30d8894b 100644 --- a/benchmarking/benchmark_summary.py +++ b/benchmarking/benchmark_summary.py @@ -33,22 +33,15 @@ def parse_args(): help="File pattern to match (default: vllm_online_benchmark_*.json)", ) parser.add_argument( - "--output", + "--output-dir", type=str, - default=f"benchmark_results_{datetime.now().strftime(DATE_STR_FORMAT)}.csv", + default="", help="Output CSV file name", ) return parser.parse_args() def extract_params_from_filename(filename: str) -> Dict[str, Any]: - """ - Extract all parameters from benchmark filename using regex. 
- Example: vllm_online_benchmark_2024-12-17_13-24-17_isl-128_osl-128_bsz-32_n-32.json - - Returns: - Dictionary containing timestamp and numeric parameters - """ pattern = r""" benchmark_ (?P\d{4}-\d{2}-\d{2}_\d{2}-\d{2}-\d{2}) # Timestamp @@ -78,6 +71,23 @@ def extract_params_from_filename(filename: str) -> Dict[str, Any]: return params +def format_metrics(metrics): + NOT_MEASURED_STR = "n/a" + formatted_metrics = {} + + for key, value in metrics.items(): + # Skip None values and NOT_MEASURED_STR + if value is None or value == NOT_MEASURED_STR: + formatted_metrics[key] = NOT_MEASURED_STR + elif isinstance(value, float): + # Format numeric values to 2 decimal places + formatted_metrics[key] = round(float(value), 2) + else: + formatted_metrics[key] = value + + return formatted_metrics + + def process_benchmark_file(filepath: str) -> Dict[str, Any]: """Process a single benchmark file and extract relevant metrics.""" with open(filepath, "r") as f: @@ -86,32 +96,44 @@ def process_benchmark_file(filepath: str) -> Dict[str, Any]: filename = os.path.basename(filepath) params = extract_params_from_filename(filename) - timestamp = params.pop("timestamp") # Remove timestamp from params dict + + # Calculate statistics + + mean_tpot_ms = data.get("mean_tpot_ms") + if data.get("mean_tpot_ms"): + mean_tpot = max(data.get("mean_tpot_ms"), 1e-6) # Avoid division by zero + mean_tps = 1000.0 / mean_tpot + if data.get("std_tpot_ms"): + std_tps = mean_tps - (1000.0 / (mean_tpot + data.get("std_tpot_ms"))) + else: + std_tps = None + else: + mean_tps = None + std_tps = None metrics = { - "filepath": filepath, - "filename": filename, - "timestamp": timestamp, + "timestamp": params["timestamp"], "model_id": data.get("model_id", ""), "backend": data.get("backend", ""), + "input_sequence_length": params["input_sequence_length"], + "output_sequence_length": params["output_sequence_length"], + "batch_size": params["batch_size"], + "mean_ttft_ms": data.get("mean_ttft_ms"), + "std_ttft_ms": data.get("std_ttft_ms"), + "mean_tpot_ms": mean_tpot_ms, + "std_tpot_ms": data.get("std_tpot_ms"), + "mean_tps": mean_tps, + "std_tps": std_tps, + "mean_e2el_ms": data.get("mean_e2el_ms"), + "request_throughput": data.get("request_throughput"), + "total_input_tokens": data.get("total_input_tokens"), + "total_output_tokens": data.get("total_output_tokens"), "num_prompts": data.get("num_prompts", ""), - "mean_tpot_ms": data.get("mean_tpot_ms", "n/a"), - "std_tpot_ms": data.get("std_tpot_ms", "n/a"), - "mean_ttft_ms": data.get("mean_ttft_ms", "n/a"), - "std_ttft_ms": data.get("std_ttft_ms", "n/a"), - "total_input_tokens": data.get("total_input_tokens", "n/a"), - "total_output_tokens": data.get("total_output_tokens", "n/a"), - "mean_e2el_ms": data.get("mean_e2el_ms", "n/a"), - "request_throughput": data.get("request_throughput", "n/a"), - **params, # Unpack the extracted parameters + "num_requests": params["num_requests"], + "filename": filename, } + metrics = format_metrics(metrics) - # Calculate statistics - mean_tpot = max(metrics["mean_tpot_ms"], 1e-6) # Avoid division by zero - mean_tps = 1000.0 / mean_tpot - std_tps = mean_tps - (1000.0 / (mean_tpot + metrics["std_tpot_ms"])) - metrics["mean_tps"] = mean_tps - metrics["std_tps"] = std_tps return metrics @@ -153,15 +175,19 @@ def process_benchmark_files( return sorted(results, key=lambda x: x["timestamp"]) -def save_to_csv(results: List[Dict[str, Any]], filename: str) -> None: +def save_to_csv( + results: List[Dict[str, Any]], output_dir: str, timestamp_str: str +) -> None: """Save 
results to a CSV file.""" if not results: return + file_path = Path(output_dir) / f"benchmark_results_{timestamp_str}.csv" + # Get all unique keys from all dictionaries headers = list(results[0].keys()) - with open(filename, "w") as f: + with open(file_path, "w") as f: # Write headers f.write(",".join(headers) + "\n") @@ -169,6 +195,7 @@ def save_to_csv(results: List[Dict[str, Any]], filename: str) -> None: for result in results: row = [str(result.get(header, "")) for header in headers] f.write(",".join(row) + "\n") + print(f"\nResults saved to: {file_path}") def format_markdown_table(results: List[Dict[str, Any]]) -> str: @@ -212,18 +239,39 @@ def format_markdown_table(results: List[Dict[str, Any]]) -> str: return markdown_table +def extract_timestamp(directories): + pattern = r""" + results_ + (?P\d{4}-\d{2}-\d{2}_\d{2}-\d{2}-\d{2}) # Timestamp + """ + first_dir = directories[0] + match = re.search(pattern, first_dir, re.VERBOSE) + if not match: + raise ValueError(f"Could not extract parameters from: {first_dir}") + + # Convert timestamp string to datetime + timestamp_str = match.group("timestamp") + + return timestamp_str + + def main(): args = parse_args() results = process_benchmark_files(args.directories, args.pattern) + timestamp_str = extract_timestamp(args.directories) # Display basic statistics print("\nBenchmark Summary:") print(f"Total files processed: {len(results)}") # Save to CSV - save_to_csv(results, args.output) - print(f"\nResults saved to: {args.output}") + output_dir = args.output_dir + if not output_dir: + output_dir = Path(os.environ.get("CACHE_ROOT", ""), "benchmark_results") + os.makedirs(output_dir, exist_ok=True) + + save_to_csv(results, output_dir, timestamp_str) # Generate and print Markdown table print("\nMarkdown Table:\n") diff --git a/benchmarking/prompt_client_online_benchmark.py b/benchmarking/prompt_client_online_benchmark.py index b2ee0b7c..3ed70ff8 100644 --- a/benchmarking/prompt_client_online_benchmark.py +++ b/benchmarking/prompt_client_online_benchmark.py @@ -146,22 +146,27 @@ def run_sequence_length_test( if __name__ == "__main__": + # fmt: off combinations = [ # sweeps for batch-1 + {"input_len": 128, "output_len": 10, "batch_size": 1, "num_prompts": 16}, {"input_len": 128, "output_len": 128, "batch_size": 1, "num_prompts": 16}, {"input_len": 128, "output_len": 1024, "batch_size": 1, "num_prompts": 8}, - {"input_len": 2048, "output_len": 128, "batch_size": 1, "num_prompts": 8}, {"input_len": 128, "output_len": 2048, "batch_size": 1, "num_prompts": 8}, {"input_len": 128, "output_len": 4096, "batch_size": 1, "num_prompts": 4}, + {"input_len": 2048, "output_len": 128, "batch_size": 1, "num_prompts": 8}, {"input_len": 2048, "output_len": 2048, "batch_size": 1, "num_prompts": 4}, # sweeps for batch-32 {"input_len": 128, "output_len": 128, "batch_size": 32, "num_prompts": 32}, + {"input_len": 128, "output_len": 128, "batch_size": 32, "num_prompts": 32}, + {"input_len": 128, "output_len": 128, "batch_size": 32, "num_prompts": 32}, {"input_len": 128, "output_len": 1024, "batch_size": 32, "num_prompts": 32}, - {"input_len": 2048, "output_len": 128, "batch_size": 32, "num_prompts": 32}, {"input_len": 128, "output_len": 2048, "batch_size": 32, "num_prompts": 32}, {"input_len": 128, "output_len": 4096, "batch_size": 32, "num_prompts": 32}, + {"input_len": 2048, "output_len": 128, "batch_size": 32, "num_prompts": 32}, {"input_len": 2048, "output_len": 2048, "batch_size": 32, "num_prompts": 32}, ] + # fmt: on # Create output directory cache_dir = 
Path(os.environ.get("CACHE_ROOT", "")) diff --git a/benchmarking/vllm_online_benchmark.py b/benchmarking/vllm_online_benchmark.py index 3be639e0..59d6de74 100644 --- a/benchmarking/vllm_online_benchmark.py +++ b/benchmarking/vllm_online_benchmark.py @@ -75,13 +75,15 @@ def main(): os.environ["OPENAI_API_KEY"] = prompt_client._get_authorization() # Get all benchmark combinations using the original function + # fmt: off combinations = [ - {"input_len": 128, "output_len": 128, "batch_size": 32, "num_prompts": 32}, - {"input_len": 128, "output_len": 1024, "batch_size": 32, "num_prompts": 32}, - {"input_len": 2048, "output_len": 128, "batch_size": 32, "num_prompts": 32}, - {"input_len": 128, "output_len": 2048, "batch_size": 32, "num_prompts": 32}, - {"input_len": 128, "output_len": 4096, "batch_size": 32, "num_prompts": 32}, - {"input_len": 2048, "output_len": 2048, "batch_size": 32, "num_prompts": 32}, + {"input_len": 128, "output_len": 10, "batch_size": 32, "num_prompts": 32 * 16}, + {"input_len": 128, "output_len": 128, "batch_size": 32, "num_prompts": 32 * 32}, + {"input_len": 128, "output_len": 1024, "batch_size": 32, "num_prompts": 32 * 16}, + {"input_len": 128, "output_len": 2048, "batch_size": 32, "num_prompts": 32 * 8}, + {"input_len": 128, "output_len": 4096, "batch_size": 32, "num_prompts": 32 * 4}, + {"input_len": 2048, "output_len": 128, "batch_size": 32, "num_prompts": 32 * 16}, + {"input_len": 2048, "output_len": 2048, "batch_size": 32, "num_prompts": 32 * 4}, # {"input_len": 128, "output_len": 128, "batch_size": 32, "num_prompts": 32}, # {"input_len": 128, "output_len": 2048, "batch_size": 32, "num_prompts": 32}, # {"input_len": 128, "output_len": 4096, "batch_size": 32, "num_prompts": 32}, @@ -92,6 +94,7 @@ def main(): # {"input_len": 5000, "output_len": 500, "batch_size": 32, "num_prompts": 32}, # {"input_len": 20000, "output_len": 2000, "batch_size": 32, "num_prompts": 32}, ] + # fmt: on context_lens = [(it["input_len"], it["output_len"]) for it in combinations] # de-dupe From 2ce6fe71506a5de4c8cbaba39bd27b691715ca8e Mon Sep 17 00:00:00 2001 From: Tom Stesco Date: Wed, 18 Dec 2024 20:39:23 +0000 Subject: [PATCH 57/76] add better markdown formatting, add saving display .csv --- benchmarking/benchmark_summary.py | 195 +++++++++++++++++++++--------- 1 file changed, 138 insertions(+), 57 deletions(-) diff --git a/benchmarking/benchmark_summary.py b/benchmarking/benchmark_summary.py index 30d8894b..cea4d7c8 100644 --- a/benchmarking/benchmark_summary.py +++ b/benchmarking/benchmark_summary.py @@ -5,14 +5,16 @@ import json import glob import os +import csv from datetime import datetime import re -from typing import Dict, List, Any +from typing import Dict, List, Any, Union, Tuple import argparse from pathlib import Path DATE_STR_FORMAT = "%Y-%m-%d_%H-%M-%S" +NOT_MEASURED_STR = "n/a" def parse_args(): @@ -71,8 +73,23 @@ def extract_params_from_filename(filename: str) -> Dict[str, Any]: return params +def extract_timestamp(directories): + pattern = r""" + results_ + (?P\d{4}-\d{2}-\d{2}_\d{2}-\d{2}-\d{2}) # Timestamp + """ + first_dir = directories[0] + match = re.search(pattern, first_dir, re.VERBOSE) + if not match: + raise ValueError(f"Could not extract parameters from: {first_dir}") + + # Convert timestamp string to datetime + timestamp_str = match.group("timestamp") + + return timestamp_str + + def format_metrics(metrics): - NOT_MEASURED_STR = "n/a" formatted_metrics = {} for key, value in metrics.items(): @@ -175,36 +192,32 @@ def process_benchmark_files( return 
sorted(results, key=lambda x: x["timestamp"]) -def save_to_csv( - results: List[Dict[str, Any]], output_dir: str, timestamp_str: str -) -> None: - """Save results to a CSV file.""" +def save_to_csv(results: List[Dict[str, Any]], file_path: Union[Path, str]) -> None: if not results: return - file_path = Path(output_dir) / f"benchmark_results_{timestamp_str}.csv" - - # Get all unique keys from all dictionaries + # Get headers from first result (assuming all results have same structure) headers = list(results[0].keys()) - with open(file_path, "w") as f: - # Write headers - f.write(",".join(headers) + "\n") + try: + with open(file_path, "w", newline="") as f: + writer = csv.writer(f) + # Write headers + writer.writerow(headers) + # Write data rows + for result in results: + row = [str(result.get(header, NOT_MEASURED_STR)) for header in headers] + writer.writerow(row) - # Write data - for result in results: - row = [str(result.get(header, "")) for header in headers] - f.write(",".join(row) + "\n") - print(f"\nResults saved to: {file_path}") + print(f"\nResults saved to: {file_path}") + except Exception as e: + print(f"Error saving CSV file: {e}") -def format_markdown_table(results: List[Dict[str, Any]]) -> str: - """Format results as a Markdown table.""" - if not results: - return "" - # Define columns to display and their headers - display_cols = [ +def create_display_dict(result: Dict[str, Any]) -> Dict[str, str]: + # Define display columns mapping + display_cols: List[Tuple[str, str]] = [ ("input_sequence_length", "ISL"), ("output_sequence_length", "OSL"), ("batch_size", "Batch Size"), @@ -216,43 +229,105 @@ def format_markdown_table(results: List[Dict[str, Any]]) -> str: ("request_throughput", "Request Throughput (RPS)"), ] - # Create header row - header = " | ".join(header for _, header in display_cols) - separator = "|".join(["---"] * len(display_cols)) + display_dict = {} + for col_name, display_header in display_cols: + value = result.get(col_name, NOT_MEASURED_STR) + display_dict[display_header] = str(value) - # Create data rows - rows = [] - for result in results: - row_values = [] - for col, _ in display_cols: - value = result.get(col, "") - # Format floats to 2 decimal places - if isinstance(value, float): - value = f"{value:.2f}" - row_values.append(str(value)) - rows.append(" | ".join(row_values)) + return display_dict - # Combine all parts - markdown_table = f"| {header} |\n| {separator} |\n" - markdown_table += "\n".join(f"| {row} |" for row in rows) - return markdown_table +def get_markdown_table(display_dicts: List[Dict[str, str]]) -> str: + if not display_dicts: + return "" + def sanitize_cell(text: str) -> str: + """Sanitize cell content for Markdown compatibility""" + # Replace problematic characters + text = str(text) + text = text.replace("|", "\\|") # Escape pipe characters + text = text.replace("\n", " ") # Replace newlines with spaces + text = re.sub(r"[^\x00-\x7F]+", "", text) # Remove non-ASCII characters + return text.strip() + + # Get headers from first dictionary + headers = list(display_dicts[0].keys()) + + # Calculate column widths based on all values including headers + col_widths = {} + for header in headers: + # Include header length in width calculation + width = len(header) + # Check all values for this column + for d in display_dicts: + width = max(width, len(str(d.get(header, "")))) + # Add minimum width of 3 + col_widths[header] = max(width, 3) + + # Create header row with proper padding + header_row = ( + "| " + + " | ".join( + 
sanitize_cell(header).ljust(col_widths[header]) for header in headers + ) + + " |" + ) -def extract_timestamp(directories): - pattern = r""" - results_ - (?P\d{4}-\d{2}-\d{2}_\d{2}-\d{2}-\d{2}) # Timestamp - """ - first_dir = directories[0] - match = re.search(pattern, first_dir, re.VERBOSE) - if not match: - raise ValueError(f"Could not extract parameters from: {first_dir}") + # Create separator row with proper alignment indicators + separator_row = ( + "|" + + "|".join(":" + "-" * (col_widths[header]) + ":" for header in headers) + + "|" + ) - # Convert timestamp string to datetime - timestamp_str = match.group("timestamp") + # Create value rows with proper padding + value_rows = [] + for d in display_dicts: + row = ( + "| " + + " | ".join( + sanitize_cell(str(d.get(header, ""))).ljust(col_widths[header]) + for header in headers + ) + + " |" + ) + value_rows.append(row) + + # add notes + notes = ( + "\nNote: all metrics are means across benchmark run unless otherwise stated.\n" + ) + # Combine all rows + md_str = f"{header_row}\n{separator_row}\n" + "\n".join(value_rows) + notes + return md_str - return timestamp_str + +def save_markdown_table( + markdown_str: str, filepath: str, add_title: str = None, add_notes: List[str] = None +) -> None: + # Convert string path to Path object and ensure .md extension + path = Path(filepath) + if path.suffix.lower() != ".md": + path = path.with_suffix(".md") + + # Create directory if it doesn't exist + path.parent.mkdir(parents=True, exist_ok=True) + + # Prepare content + content = [] + if add_title: + # Add title with markdown h1 formatting and blank line + content.extend([f"# {add_title}", ""]) + content.append(markdown_str) + if add_notes: + content.extend(add_notes) + + # Write to file with UTF-8 encoding + try: + path.write_text("\n".join(content), encoding="utf-8") + print(f"Successfully saved markdown table to: {path}") + except Exception as e: + print(f"Error saving markdown table: {str(e)}") def main(): @@ -271,15 +346,21 @@ def main(): output_dir = Path(os.environ.get("CACHE_ROOT", ""), "benchmark_results") os.makedirs(output_dir, exist_ok=True) - save_to_csv(results, output_dir, timestamp_str) + # save stats + stats_file_path = Path(output_dir) / f"benchmark_stats_{timestamp_str}.csv" + save_to_csv(results, stats_file_path) + display_results = [create_display_dict(res) for res in results] + disp_file_path = Path(output_dir) / f"benchmark_display_{timestamp_str}.csv" + save_to_csv(display_results, disp_file_path) # Generate and print Markdown table print("\nMarkdown Table:\n") - print(f"Model ID: {results[0].get('model_id')}") print(f"Backend: {results[0].get('backend')}") - print(format_markdown_table(results)) - print("Note: all metrics are means across benchmark run unless otherwise stated.\n") + display_md_str = get_markdown_table(display_results) + print(display_md_str) + disp_md_path = Path(output_dir) / f"benchmark_display_{timestamp_str}.md" + save_markdown_table(display_md_str, disp_md_path) if __name__ == "__main__": From 6be324fc93a521a607e48b3b7bea8857a00acc18 Mon Sep 17 00:00:00 2001 From: Tom Stesco Date: Wed, 18 Dec 2024 20:39:40 +0000 Subject: [PATCH 58/76] update sweep isl/osl --- benchmarking/vllm_online_benchmark.py | 1 - 1 file changed, 1 deletion(-) diff --git a/benchmarking/vllm_online_benchmark.py b/benchmarking/vllm_online_benchmark.py index 59d6de74..8f35b5c7 100644 --- a/benchmarking/vllm_online_benchmark.py +++ b/benchmarking/vllm_online_benchmark.py @@ -77,7 +77,6 @@ def main(): # Get all benchmark combinations 
using the original function # fmt: off combinations = [ - {"input_len": 128, "output_len": 10, "batch_size": 32, "num_prompts": 32 * 16}, {"input_len": 128, "output_len": 128, "batch_size": 32, "num_prompts": 32 * 32}, {"input_len": 128, "output_len": 1024, "batch_size": 32, "num_prompts": 32 * 16}, {"input_len": 128, "output_len": 2048, "batch_size": 32, "num_prompts": 32 * 8}, From f558876ec0725cd0f51062a95b0677139fc3f9cf Mon Sep 17 00:00:00 2001 From: Tom Stesco Date: Wed, 18 Dec 2024 21:04:37 +0000 Subject: [PATCH 59/76] update sweep isl/osl --- .../prompt_client_online_benchmark.py | 30 +++++++++---------- 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/benchmarking/prompt_client_online_benchmark.py b/benchmarking/prompt_client_online_benchmark.py index 3ed70ff8..5b1203e3 100644 --- a/benchmarking/prompt_client_online_benchmark.py +++ b/benchmarking/prompt_client_online_benchmark.py @@ -149,22 +149,22 @@ def run_sequence_length_test( # fmt: off combinations = [ # sweeps for batch-1 - {"input_len": 128, "output_len": 10, "batch_size": 1, "num_prompts": 16}, - {"input_len": 128, "output_len": 128, "batch_size": 1, "num_prompts": 16}, - {"input_len": 128, "output_len": 1024, "batch_size": 1, "num_prompts": 8}, - {"input_len": 128, "output_len": 2048, "batch_size": 1, "num_prompts": 8}, - {"input_len": 128, "output_len": 4096, "batch_size": 1, "num_prompts": 4}, - {"input_len": 2048, "output_len": 128, "batch_size": 1, "num_prompts": 8}, - {"input_len": 2048, "output_len": 2048, "batch_size": 1, "num_prompts": 4}, + {"input_len": 128, "output_len": 10, "batch_size": 1, "num_prompts": 64}, + {"input_len": 128, "output_len": 128, "batch_size": 1, "num_prompts": 64}, + {"input_len": 128, "output_len": 1024, "batch_size": 1, "num_prompts": 32}, + {"input_len": 128, "output_len": 2048, "batch_size": 1, "num_prompts": 32}, + {"input_len": 128, "output_len": 4096, "batch_size": 1, "num_prompts": 16}, + {"input_len": 2048, "output_len": 128, "batch_size": 1, "num_prompts": 32}, + {"input_len": 2048, "output_len": 2048, "batch_size": 1, "num_prompts": 16}, # sweeps for batch-32 - {"input_len": 128, "output_len": 128, "batch_size": 32, "num_prompts": 32}, - {"input_len": 128, "output_len": 128, "batch_size": 32, "num_prompts": 32}, - {"input_len": 128, "output_len": 128, "batch_size": 32, "num_prompts": 32}, - {"input_len": 128, "output_len": 1024, "batch_size": 32, "num_prompts": 32}, - {"input_len": 128, "output_len": 2048, "batch_size": 32, "num_prompts": 32}, - {"input_len": 128, "output_len": 4096, "batch_size": 32, "num_prompts": 32}, - {"input_len": 2048, "output_len": 128, "batch_size": 32, "num_prompts": 32}, - {"input_len": 2048, "output_len": 2048, "batch_size": 32, "num_prompts": 32}, + {"input_len": 128, "output_len": 128, "batch_size": 32, "num_prompts": 32 * 8}, + {"input_len": 128, "output_len": 128, "batch_size": 32, "num_prompts": 32 * 8}, + {"input_len": 128, "output_len": 128, "batch_size": 32, "num_prompts": 32 * 8}, + {"input_len": 128, "output_len": 1024, "batch_size": 32, "num_prompts": 32 * 4}, + {"input_len": 128, "output_len": 2048, "batch_size": 32, "num_prompts": 32 * 4}, + {"input_len": 128, "output_len": 4096, "batch_size": 32, "num_prompts": 32 * 2}, + {"input_len": 2048, "output_len": 128, "batch_size": 32, "num_prompts": 32 * 8}, + {"input_len": 2048, "output_len": 2048, "batch_size": 32, "num_prompts": 32 * 2}, ] # fmt: on From b4260d35edcbed3f5f9325c22f989b8711c2056e Mon Sep 17 00:00:00 2001 From: Tom Stesco Date: Wed, 18 Dec 2024 21:05:05 +0000 
Subject: [PATCH 60/76] add metadata to markdown summary --- benchmarking/benchmark_summary.py | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) diff --git a/benchmarking/benchmark_summary.py b/benchmarking/benchmark_summary.py index cea4d7c8..0cdaf77d 100644 --- a/benchmarking/benchmark_summary.py +++ b/benchmarking/benchmark_summary.py @@ -237,7 +237,7 @@ def create_display_dict(result: Dict[str, Any]) -> Dict[str, str]: return display_dict -def get_markdown_table(display_dicts: List[Dict[str, str]]) -> str: +def get_markdown_table(display_dicts: List[Dict[str, str]], metadata: str = "") -> str: if not display_dicts: return "" @@ -294,11 +294,16 @@ def sanitize_cell(text: str) -> str: value_rows.append(row) # add notes - notes = ( + end_notes = ( "\nNote: all metrics are means across benchmark run unless otherwise stated.\n" ) # Combine all rows - md_str = f"{header_row}\n{separator_row}\n" + "\n".join(value_rows) + notes + md_str = ( + metadata + + f"\n{header_row}\n{separator_row}\n" + + "\n".join(value_rows) + + end_notes + ) return md_str @@ -355,9 +360,11 @@ def main(): save_to_csv(display_results, disp_file_path) # Generate and print Markdown table print("\nMarkdown Table:\n") - print(f"Model ID: {results[0].get('model_id')}") - print(f"Backend: {results[0].get('backend')}") - display_md_str = get_markdown_table(display_results) + metadata = ( + f"Model ID: {results[0].get('model_id')}\n" + f"Backend: {results[0].get('backend')}\n" + ) + display_md_str = get_markdown_table(display_results, metadata=metadata) print(display_md_str) disp_md_path = Path(output_dir) / f"benchmark_display_{timestamp_str}.md" save_markdown_table(display_md_str, disp_md_path) From 89958d96e972327607b8af6c39e9097601637efe Mon Sep 17 00:00:00 2001 From: Tom Stesco Date: Wed, 18 Dec 2024 21:32:51 +0000 Subject: [PATCH 61/76] add ignore_eos=True to locust requests to use min/max tokens, increase locust default test length to 10 minutes --- locust/locust_config.conf | 4 ++-- locust/locustfile.py | 1 + 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/locust/locust_config.conf b/locust/locust_config.conf index cdd4157b..2431cf0e 100644 --- a/locust/locust_config.conf +++ b/locust/locust_config.conf @@ -2,5 +2,5 @@ locustfile = locustfile.py headless = true host = http://localhost:7000 users = 32 -spawn-rate = 1 -run-time = 3m +spawn-rate = 6 +run-time = 10m diff --git a/locust/locustfile.py b/locust/locustfile.py index 19dd59ee..94db1a59 100644 --- a/locust/locustfile.py +++ b/locust/locustfile.py @@ -18,6 +18,7 @@ "temperature": 1.0, "top_k": 10, "top_p": 0.9, + "ignore_eos": True, } # Global variable to store data iterator From 126c5886ac6d7fd022b1ac78c40bae1ea5a47595 Mon Sep 17 00:00:00 2001 From: Tom Stesco Date: Thu, 19 Dec 2024 06:00:34 +0000 Subject: [PATCH 62/76] update for llama 3.1 70B v0 testing --- .../prompt_client_online_benchmark.py | 19 +++++++++---------- vllm-tt-metal-llama3-70b/README.md | 2 +- vllm-tt-metal-llama3-70b/docs/development.md | 8 ++++---- .../src/run_vllm_api_server.py | 4 ++-- .../vllm.llama3.src.Dockerfile | 1 + 5 files changed, 17 insertions(+), 17 deletions(-) diff --git a/benchmarking/prompt_client_online_benchmark.py b/benchmarking/prompt_client_online_benchmark.py index 5b1203e3..79a847d1 100644 --- a/benchmarking/prompt_client_online_benchmark.py +++ b/benchmarking/prompt_client_online_benchmark.py @@ -151,20 +151,19 @@ def run_sequence_length_test( # sweeps for batch-1 {"input_len": 128, "output_len": 10, "batch_size": 1, "num_prompts": 
64}, {"input_len": 128, "output_len": 128, "batch_size": 1, "num_prompts": 64}, - {"input_len": 128, "output_len": 1024, "batch_size": 1, "num_prompts": 32}, - {"input_len": 128, "output_len": 2048, "batch_size": 1, "num_prompts": 32}, - {"input_len": 128, "output_len": 4096, "batch_size": 1, "num_prompts": 16}, + {"input_len": 128, "output_len": 1024, "batch_size": 1, "num_prompts": 16}, + {"input_len": 128, "output_len": 2048, "batch_size": 1, "num_prompts": 8}, + {"input_len": 128, "output_len": 4096, "batch_size": 1, "num_prompts": 8}, {"input_len": 2048, "output_len": 128, "batch_size": 1, "num_prompts": 32}, - {"input_len": 2048, "output_len": 2048, "batch_size": 1, "num_prompts": 16}, + {"input_len": 2048, "output_len": 2048, "batch_size": 1, "num_prompts": 8}, # sweeps for batch-32 - {"input_len": 128, "output_len": 128, "batch_size": 32, "num_prompts": 32 * 8}, - {"input_len": 128, "output_len": 128, "batch_size": 32, "num_prompts": 32 * 8}, - {"input_len": 128, "output_len": 128, "batch_size": 32, "num_prompts": 32 * 8}, - {"input_len": 128, "output_len": 1024, "batch_size": 32, "num_prompts": 32 * 4}, + {"input_len": 128, "output_len": 10, "batch_size": 32, "num_prompts": 32 * 16}, + {"input_len": 128, "output_len": 128, "batch_size": 32, "num_prompts": 32 * 16}, + {"input_len": 128, "output_len": 1024, "batch_size": 32, "num_prompts": 32 * 8}, {"input_len": 128, "output_len": 2048, "batch_size": 32, "num_prompts": 32 * 4}, - {"input_len": 128, "output_len": 4096, "batch_size": 32, "num_prompts": 32 * 2}, + {"input_len": 128, "output_len": 4096, "batch_size": 32, "num_prompts": 32 * 4}, {"input_len": 2048, "output_len": 128, "batch_size": 32, "num_prompts": 32 * 8}, - {"input_len": 2048, "output_len": 2048, "batch_size": 32, "num_prompts": 32 * 2}, + {"input_len": 2048, "output_len": 2048, "batch_size": 32, "num_prompts": 32 * 4}, ] # fmt: on diff --git a/vllm-tt-metal-llama3-70b/README.md b/vllm-tt-metal-llama3-70b/README.md index 3fa10b39..31a4232b 100644 --- a/vllm-tt-metal-llama3-70b/README.md +++ b/vllm-tt-metal-llama3-70b/README.md @@ -36,7 +36,7 @@ docker run \ --volume ${PERSISTENT_VOLUME?ERROR env var PERSISTENT_VOLUME must be set}:/home/user/cache_root:rw \ --shm-size 32G \ --publish 7000:7000 \ - ghcr.io/tenstorrent/tt-inference-server/tt-metal-llama3-70b-src-base-vllm:v0.0.2-tt-metal-385904186f81-384f1790c3be + ghcr.io/tenstorrent/tt-inference-server/tt-metal-llama3-70b-src-base-vllm:v0.0.1-tt-metal-3ef683762eaa-953161188c50 ``` By default the Docker container will start running the entrypoint command wrapped in `src/run_vllm_api_server.py`. 
diff --git a/vllm-tt-metal-llama3-70b/docs/development.md b/vllm-tt-metal-llama3-70b/docs/development.md index 55d8b1d3..232fd9aa 100644 --- a/vllm-tt-metal-llama3-70b/docs/development.md +++ b/vllm-tt-metal-llama3-70b/docs/development.md @@ -13,12 +13,12 @@ When building, update the commit SHA and get correct SHA from model developers o # set build context to repo root cd tt-inference-server # build image -export TT_METAL_DOCKERFILE_VERSION=v0.53.0-rc34 -export TT_METAL_COMMIT_SHA_OR_TAG=385904186f81fed15d5c87c162221d4f34387164 +export TT_METAL_DOCKERFILE_VERSION=v0.53.0 +export TT_METAL_COMMIT_SHA_OR_TAG=3ef683762eaa4bd602ec6f3f33aec875775265c5 export TT_METAL_COMMIT_DOCKER_TAG=${TT_METAL_COMMIT_SHA_OR_TAG:0:12} -export TT_VLLM_COMMIT_SHA_OR_TAG=384f1790c3be16e1d1b10de07252be2e66d00935 +export TT_VLLM_COMMIT_SHA_OR_TAG=953161188c50f10da95a88ab305e23977ebd3750 export TT_VLLM_COMMIT_DOCKER_TAG=${TT_VLLM_COMMIT_SHA_OR_TAG:0:12} -export IMAGE_VERSION=v0.0.3 +export IMAGE_VERSION=v0.0.1 docker build \ -t ghcr.io/tenstorrent/tt-inference-server/tt-metal-llama3-70b-src-base-vllm:${IMAGE_VERSION}-tt-metal-${TT_METAL_COMMIT_DOCKER_TAG}-${TT_VLLM_COMMIT_DOCKER_TAG} \ --build-arg TT_METAL_DOCKERFILE_VERSION=${TT_METAL_DOCKERFILE_VERSION} \ diff --git a/vllm-tt-metal-llama3-70b/src/run_vllm_api_server.py b/vllm-tt-metal-llama3-70b/src/run_vllm_api_server.py index 992874b1..595f8444 100644 --- a/vllm-tt-metal-llama3-70b/src/run_vllm_api_server.py +++ b/vllm-tt-metal-llama3-70b/src/run_vllm_api_server.py @@ -13,10 +13,10 @@ from utils.logging_utils import set_vllm_logging_config # importing from tt-metal install path -from models.demos.t3000.llama2_70b.tt.llama_generation import TtLlamaModelForGeneration +from models.demos.t3000.llama2_70b.tt.generator_vllm import TtLlamaForCausalLM # register the model -ModelRegistry.register_model("TTLlamaForCausalLM", TtLlamaModelForGeneration) +ModelRegistry.register_model("TTLlamaForCausalLM", TtLlamaForCausalLM) def get_encoded_api_key(jwt_secret): diff --git a/vllm-tt-metal-llama3-70b/vllm.llama3.src.Dockerfile b/vllm-tt-metal-llama3-70b/vllm.llama3.src.Dockerfile index 2184d356..c57fa85d 100644 --- a/vllm-tt-metal-llama3-70b/vllm.llama3.src.Dockerfile +++ b/vllm-tt-metal-llama3-70b/vllm.llama3.src.Dockerfile @@ -102,6 +102,7 @@ COPY --chown=user:user "utils" "${APP_DIR}/utils" COPY --chown=user:user "benchmarking" "${APP_DIR}/benchmarking" COPY --chown=user:user "evals" "${APP_DIR}/evals" COPY --chown=user:user "tests" "${APP_DIR}/tests" +COPY --chown=user:user "locust" "${APP_DIR}/locust" RUN /bin/bash -c "source ${PYTHON_ENV_DIR}/bin/activate \ && pip install --default-timeout=240 --no-cache-dir -r requirements.txt" From aef6a94d56f70b372ba9f93392b890245cdf3bcd Mon Sep 17 00:00:00 2001 From: Tom Stesco Date: Thu, 19 Dec 2024 17:56:45 +0000 Subject: [PATCH 63/76] adding evals changes from tstesco/llama-evals --- evals/README.md | 138 ++++++--------------------------------------- evals/run_evals.sh | 32 ++++++++++- 2 files changed, 47 insertions(+), 123 deletions(-) diff --git a/evals/README.md b/evals/README.md index 7795d48a..ca3add07 100644 --- a/evals/README.md +++ b/evals/README.md @@ -21,156 +21,52 @@ export PERSISTENT_VOLUME=$PWD/persistent_volume/volume_id_tt-metal-llama-3.1-70b docker run \ --rm \ -it \ - --env-file tt-metal-llama3-70b/.env \ + --env-file vllm-tt-metal-llama3-70b/.env \ --cap-add ALL \ --device /dev/tenstorrent:/dev/tenstorrent \ --volume /dev/hugepages-1G:/dev/hugepages-1G:rw \ --volume ${PERSISTENT_VOLUME?ERROR env var 
PERSISTENT_VOLUME must be set}:/home/user/cache_root:rw \ --shm-size 32G \ - ghcr.io/tenstorrent/tt-inference-server/tt-metal-llama3-70b-src-base-vllm:v0.0.1-tt-metal-${TT_METAL_COMMIT_DOCKER_TAG}-${TT_VLLM_COMMIT_DOCKER_TAG} bash + ghcr.io/tenstorrent/tt-inference-server/tt-metal-llama3-70b-src-base-vllm:${IMAGE_VERSION}-tt-metal-${TT_METAL_COMMIT_DOCKER_TAG}-${TT_VLLM_COMMIT_DOCKER_TAG} ``` -## Step 3: Inside container setup and run vLLM - -#### Install vLLM - Option 1: use default installation in docker image - -already built into Docker image - -#### Install vLLM - option 2: install vLLM from github - -```bash -# option 2: install from github -cd /home/user/vllm -git fetch -git checkout -git pull -pip install -e . -echo "done vllm install." -``` -#### Install vLLM - option 3: install edittable (for development) from mounted volume - -```bash -# option 3: install edittable (for development) - mount from outside container -cd /home/user/vllm -pip install -e . -echo "done vllm install." -``` - -#### Run vllm serving openai compatible API server - -```bash -# run vllm serving -python run_vllm_api_server.py -``` - -## Step 4: Inside container setup LM evalulation harness - -Enter new bash shell in running container (this does so with newest running container): -```bash -docker exec -it $(docker ps -q | head -n1) bash -``` - -Now inside container: -```bash -# option 1: install from github: https://github.com/tstescoTT/lm-evaluation-harness -pip install git+https://github.com/tstescoTT/lm-evaluation-harness.git#egg=lm-eval[ifeval] -# option 2: install edittable (for development) - mounted to container -cd ~/lm-evaluation-harness -pip install -e .[ifeval] -``` - -## Step 5: Inside container set up llama-recipes LM evalulation harness templates +The default Docker image command will start the vLLM server. +## Step 3: Inside container set up llama-recipes LM evalulation harness templates Using Meta’s LM eval reproduce documentation: https://github.com/meta-llama/llama-recipes/tree/main/tools/benchmarks/llm_eval_harness/meta_eval To access Meta Llama 3.1 evals, you must: -1. Log in to the Hugging Face website (https://huggingface.co/collections/meta-llama/llama-31-evals-66a2c5a14c2093e58298ac7f ) and click the 3.1 evals dataset pages and agree to the terms. +1. Log in to the Hugging Face website (https://huggingface.co/collections/meta-llama/llama-31-evals-66a2c5a14c2093e58298ac7f) and click the 3.1 evals dataset pages and agree to the terms. 2. Follow the [Hugging Face authentication instructions](https://huggingface.co/docs/huggingface_hub/en/quick-start#authentication) to gain read access for your machine. 
#### Hugging Face authentication - option 1: HF_TOKEN (if not already passed into Docker container) ```bash -# set up HF Token, needed for IFEval dataset -# echo "hf_" > ${HF_HOME}/token -export PYTHONPATH=${PYTHONPATH}:$PWD +# set up HF Token if not already set up in .env, needed for datasets +echo "HF_TOKEN=hf_" >> vllm-tt-metal-llama3-70b/.env ``` #### Hugging Face authentication - option 2: huggingface_hub login +Note: do this inside the container shell: ```python from huggingface_hub import login login() ``` -Finally, build llama-recipe lm-evaluation-harness templates: -```bash -git clone https://github.com/tstescoTT/llama-recipes.git -cd llama-recipes/tools/benchmarks/llm_eval_harness/meta_eval -python prepare_meta_eval.py --config_path ./eval_config.yaml -mkdir -p ~/lm-evaluation-harness -cp -rf work_dir/ ~/lm-evaluation-harness/ -``` - -## Step 6: Inside container run LM evals - -`run_evals.sh` can be run from where lm_eval CLI is available: -```bash -cd ~/lm-evaluation-harness -export OPENAI_API_KEY=$(python -c 'import os; import json; import jwt; json_payload = json.loads("{\"team_id\": \"tenstorrent\", \"token_id\": \"debug-test\"}"); encoded_jwt = jwt.encode(json_payload, os.environ["JWT_SECRET"], algorithm="HS256"); print(encoded_jwt)') -run_evals.sh -``` - -For example, running GPQA manually: +## Step 4: Inside container setup and run vLLM via script -The model args (`Meta-Llama-3.1-70B` below) need only correspond to the model defined by running the server, not the actual weights. +Enter new bash shell in running container, oneliner below enters newest running container: ```bash -lm_eval \ ---model local-completions \ ---model_args model=meta-llama/Llama-3.1-70B-Instruct,base_url=http://127.0.0.1:7000/v1/completions,num_concurrent=32,max_retries=4,tokenized_requests=False,add_bos_token=True \ ---gen_kwargs model=meta-llama/Llama-3.1-70B-Instruct,stop="<|eot_id|>",stream=False \ ---tasks meta_ifeval \ ---batch_size auto \ ---output_path /home/user/cache_root/eval_output \ ---include_path ./work_dir \ ---seed 42 \ ---log_samples +docker exec -it $(docker ps -q | head -n1) bash ``` -## Notes: - -### Chat templating - -As mentioned in: https://github.com/meta-llama/llama-recipes/tree/main/tools/benchmarks/llm_eval_harness/meta_eval#run-eval-tasks +Running the `run_evals.sh` script will: +1. set up lm_eval and evals datasets +2. pre-capture the tt-metal execution traces so that evals do not trigger 1st run trace capture unexpectedly +3. run evals via lm_eval as configured -“As for add_bos_token=True, since our prompts in the evals dataset has already included all the special tokens required by instruct model, such as <|start_header_id|>user<|end_header_id|>, we will not use --apply_chat_template argument for instruct models anymore. However, we need to use add_bos_token=True flag to add the BOS_token back during VLLM inference, as the BOS_token is removed by default in this PR.” - -Though it is recommended to use the pre-templated prompts following the build instructions for llama-recipes, the chat template can be manually added via the `lm_eval` runtime argument: ```bash ---apply_chat_template utils/prompt_templates/llama_instruct_example.jinja +cd ~/app/evals +. 
run_evals.sh ``` - -llama_instruct_example.jinja: text file jinja template for llama 3.1 instruct: -``` -{{- bos_token }} - -{#- System message #} -{{- "<|start_header_id|>system<|end_header_id|>\n\n" }} -{{- "Cutting Knowledge Date: December 2023\n" }} -{{- "Today Date: " + date_string + "\n\n" }} -{{- system_message }} -{{- "<|eot_id|>" }} - -{#- Messages #} -{%- for message in messages %} - {%- if message.role in ['user', 'assistant'] %} - {{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' }} - {%- endif %} -{%- endfor %} - -{%- if add_generation_prompt %} - {{- '<|start_header_id|>assistant<|end_header_id|>\n\n' }} -{%- endif %} -``` - -The instruct chat template could also be applied on the vLLM server side, but this implementation gives more flexibility to the caller of vLLM. - diff --git a/evals/run_evals.sh b/evals/run_evals.sh index 12308b47..2db83369 100644 --- a/evals/run_evals.sh +++ b/evals/run_evals.sh @@ -3,10 +3,38 @@ # # SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC +# set up lm_eval and evals datasets +cd $HOME +if python -c "import lm_eval" 2>/dev/null; then + echo "lm_eval is installed." +else + echo "Installing lm_eval ..." + pip install git+https://github.com/tstescoTT/lm-evaluation-harness.git#egg=lm-eval[ifeval] +fi + +if [ -d "$HOME/llama-recipes" ]; then + echo "The directory $HOME/llama-recipes exists." +else + echo "The directory ~/llama-recipes does not exist." + git clone https://github.com/tstescoTT/llama-recipes.git $HOME/llama-recipes + cd $HOME/llama-recipes/tools/benchmarks/llm_eval_harness/meta_eval + python prepare_meta_eval.py --config_path ./eval_config.yaml + mkdir -p $HOME/lm-evaluation-harness + cp -rf work_dir/ $HOME/lm-evaluation-harness/ +fi + +# trace capture so that evals do not trigger 1st run trace capture unexpectedly +cd $HOME/app +python utils/capture_traces.py + +# run evals +export OPENAI_API_KEY=$(python -c 'import os; import json; import jwt; json_payload = json.loads("{\"team_id\": \"tenstorrent\", \"token_id\": \"debug-test\"}"); encoded_jwt = jwt.encode(json_payload, os.environ["JWT_SECRET"], algorithm="HS256"); print(encoded_jwt)') +cd $HOME/lm-evaluation-harness/ + # GPQA lm_eval \ --model local-completions \ ---model_args model=meta-llama/Llama-3.1-70B-Instruct,base_url=http://127.0.0.1:7000/v1/completions,num_concurrent=1,max_retries=4,tokenized_requests=False,add_bos_token=True \ +--model_args model=meta-llama/Llama-3.1-70B-Instruct,base_url=http://127.0.0.1:7000/v1/completions,num_concurrent=32,max_retries=4,tokenized_requests=False,add_bos_token=True,timeout=None \ --gen_kwargs model=meta-llama/Llama-3.1-70B-Instruct,stop="<|eot_id|>",stream=False \ --tasks meta_gpqa \ --batch_size auto \ @@ -18,7 +46,7 @@ lm_eval \ # IFEval lm_eval \ --model local-completions \ ---model_args model=meta-llama/Llama-3.1-70B-Instruct,base_url=http://127.0.0.1:7000/v1/completions,num_concurrent=32,max_retries=4,tokenized_requests=False,add_bos_token=True \ +--model_args model=meta-llama/Llama-3.1-70B-Instruct,base_url=http://127.0.0.1:7000/v1/completions,num_concurrent=32,max_retries=4,tokenized_requests=False,add_bos_token=True,timeout=None \ --gen_kwargs model=meta-llama/Llama-3.1-70B-Instruct,stop="<|eot_id|>",stream=False \ --tasks meta_ifeval \ --batch_size auto \ From 5ab18166deb23c7028e950a503588e45cda5a1f6 Mon Sep 17 00:00:00 2001 From: Tom Stesco Date: Thu, 19 Dec 2024 23:19:45 +0000 Subject: [PATCH 64/76] adding TT_METAL_COMMIT_SHA_OR_TAG=v0.54.0-rc2 --- 
vllm-tt-metal-llama3-70b/docs/development.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm-tt-metal-llama3-70b/docs/development.md b/vllm-tt-metal-llama3-70b/docs/development.md index 232fd9aa..efa3c075 100644 --- a/vllm-tt-metal-llama3-70b/docs/development.md +++ b/vllm-tt-metal-llama3-70b/docs/development.md @@ -14,7 +14,7 @@ When building, update the commit SHA and get correct SHA from model developers o cd tt-inference-server # build image export TT_METAL_DOCKERFILE_VERSION=v0.53.0 -export TT_METAL_COMMIT_SHA_OR_TAG=3ef683762eaa4bd602ec6f3f33aec875775265c5 +export TT_METAL_COMMIT_SHA_OR_TAG=v0.54.0-rc2 export TT_METAL_COMMIT_DOCKER_TAG=${TT_METAL_COMMIT_SHA_OR_TAG:0:12} export TT_VLLM_COMMIT_SHA_OR_TAG=953161188c50f10da95a88ab305e23977ebd3750 export TT_VLLM_COMMIT_DOCKER_TAG=${TT_VLLM_COMMIT_SHA_OR_TAG:0:12} From 0e5b67a122ae4f3268876ff4ad3ea110bab8495f Mon Sep 17 00:00:00 2001 From: Tom Stesco Date: Thu, 19 Dec 2024 23:55:41 +0000 Subject: [PATCH 65/76] update README commit tags --- vllm-tt-metal-llama3-70b/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm-tt-metal-llama3-70b/README.md b/vllm-tt-metal-llama3-70b/README.md index 31a4232b..38ef1a9a 100644 --- a/vllm-tt-metal-llama3-70b/README.md +++ b/vllm-tt-metal-llama3-70b/README.md @@ -36,7 +36,7 @@ docker run \ --volume ${PERSISTENT_VOLUME?ERROR env var PERSISTENT_VOLUME must be set}:/home/user/cache_root:rw \ --shm-size 32G \ --publish 7000:7000 \ - ghcr.io/tenstorrent/tt-inference-server/tt-metal-llama3-70b-src-base-vllm:v0.0.1-tt-metal-3ef683762eaa-953161188c50 + ghcr.io/tenstorrent/tt-inference-server/tt-metal-llama3-70b-src-base-vllm:v0.0.1-tt-metal-v0.54.0-rc2-953161188c50 ``` By default the Docker container will start running the entrypoint command wrapped in `src/run_vllm_api_server.py`. 
From 0c48a9f61798b8481d92363a48e219c322c73367 Mon Sep 17 00:00:00 2001 From: Tom Stesco Date: Fri, 20 Dec 2024 02:56:27 +0000 Subject: [PATCH 66/76] adding vllm benchmarking patch to stop sending unsupported params best_of logprobs --- benchmarking/benchmark_serving.patch | 43 ++++++++++------------------ 1 file changed, 15 insertions(+), 28 deletions(-) diff --git a/benchmarking/benchmark_serving.patch b/benchmarking/benchmark_serving.patch index f393b6bc..818d92f7 100644 --- a/benchmarking/benchmark_serving.patch +++ b/benchmarking/benchmark_serving.patch @@ -1,5 +1,19 @@ +diff --git a/benchmarks/backend_request_func.py b/benchmarks/backend_request_func.py +index 4813fde2..0cb3e72e 100644 +--- a/benchmarks/backend_request_func.py ++++ b/benchmarks/backend_request_func.py +@@ -235,9 +235,7 @@ async def async_request_openai_completions( + "model": request_func_input.model, + "prompt": request_func_input.prompt, + "temperature": 0.0, +- "best_of": request_func_input.best_of, + "max_tokens": request_func_input.output_len, +- "logprobs": request_func_input.logprobs, + "stream": True, + "ignore_eos": request_func_input.ignore_eos, + } diff --git a/benchmarks/benchmark_serving.py b/benchmarks/benchmark_serving.py -index c1a396c8..463e0e93 100644 +index c1a396c8..74f75a15 100644 --- a/benchmarks/benchmark_serving.py +++ b/benchmarks/benchmark_serving.py @@ -22,6 +22,12 @@ On the client side, run: @@ -24,30 +38,3 @@ index c1a396c8..463e0e93 100644 multi_modal_content=test_mm_content, ignore_eos=ignore_eos, ) -@@ -458,7 +464,7 @@ async def benchmark( - prompt_len=prompt_len, - output_len=output_len, - logprobs=logprobs, -- best_of=best_of, -+ best_of=None, - multi_modal_content=mm_content, - ignore_eos=ignore_eos) - tasks.append( -diff --git a/vllm/worker/tt_model_runner.py b/vllm/worker/tt_model_runner.py -index 1c586dd3..2e77bf72 100644 ---- a/vllm/worker/tt_model_runner.py -+++ b/vllm/worker/tt_model_runner.py -@@ -425,12 +425,7 @@ class TTModelRunner(ModelRunnerBase[TTModelInput]): - ) - - def _validate_sampling_params(self, sampling_params): -- assert sampling_params.n == 1, "Currently only supporting n=1" -- assert sampling_params.best_of is None, "Currently not supporting best_of" -- assert sampling_params.logprobs is None, "Currently not supporting logprobs" -- assert sampling_params.prompt_logprobs is None, "Currently not supporting prompt_logprobs" -- -- ## Destructor (used to delete ttnn trace if using trace mode) -+ return - - def __del__(self): - if self.trace_mode and self.execute_trace_kwargs is not None: From 471c90b9f3ddb5bcdcefd74912366244062714b3 Mon Sep 17 00:00:00 2001 From: Tom Stesco Date: Wed, 18 Dec 2024 22:02:57 +0000 Subject: [PATCH 67/76] move vllm-tt-metal-llama3-70b/setup.sh -> setup.sh, add support for Hugging Face authorization and model download, add llama 3.3 70B instruct --- vllm-tt-metal-llama3-70b/setup.sh => setup.sh | 372 ++++++++++++------ 1 file changed, 255 insertions(+), 117 deletions(-) rename vllm-tt-metal-llama3-70b/setup.sh => setup.sh (51%) diff --git a/vllm-tt-metal-llama3-70b/setup.sh b/setup.sh similarity index 51% rename from vllm-tt-metal-llama3-70b/setup.sh rename to setup.sh index ee102dff..f1115e8c 100755 --- a/vllm-tt-metal-llama3-70b/setup.sh +++ b/setup.sh @@ -9,6 +9,7 @@ set -euo pipefail # Exit on error, print commands, unset variables treated as e usage() { echo "Usage: $0 " echo "Available model types:" + echo " llama-3.3-70b-instruct" echo " llama-3.1-70b-instruct" echo " llama-3.1-70b" echo " llama-3.1-8b-instruct" @@ -73,72 +74,116 
@@ check_and_prompt_env_file() { fi } +get_hf_env_vars() { + # get HF_TOKEN + if [ -z "${HF_TOKEN:-}" ]; then + echo "HF_TOKEN environment variable is not set. Please set it before running the script." + read -r -s -p "Enter your HF_TOKEN: " input_hf_token + echo + if [ -z "${input_hf_token:-}" ]; then + echo "⛔ HF_TOKEN cannot be empty. Please try again." + exit 1 + elif [[ ! "$input_hf_token" == hf_* ]]; then + echo "⛔ HF_TOKEN must start with 'hf_'. Please try again." + exit 1 + fi + HF_TOKEN=${input_hf_token} + echo "✅ HF_TOKEN set." + fi + # get HF_HOME + if [ -z "${HF_HOME:-}" ]; then + echo "HF_HOME environment variable is not set. Please set it before running the script." + read -r -p "Enter your HF_HOME [default: $HOME/.cache/huggingface]:" input_hf_home + echo + input_hf_home=${input_hf_home:-"$HOME/.cache/huggingface"} + if [ ! -d "$input_hf_home" ] || [ ! -w "$input_hf_home" ]; then + echo "⛔ HF_HOME must be a valid directory and writable by the user. Please try again." + exit 1 + fi + HF_HOME=${input_hf_home} + echo "✅ HF_HOME set." + fi +} # Function to set environment variables based on the model selection and write them to .env setup_model_environment() { - # Set default values for environment variables - DEFAULT_PERSISTENT_VOLUME_ROOT=${REPO_ROOT}/persistent_volume - DEFAULT_LLAMA_REPO=~/llama-models # Set environment variables based on the model selection case "$1" in - "llama-3.1-70b-instruct") - MODEL_NAME="llama-3.1-70b-instruct" - META_MODEL_NAME="Meta-Llama-3.1-70B-Instruct" - META_DIR_FILTER="llama3_1" - REPACKED=1 - ;; - "llama-3.1-70b") - MODEL_NAME="llama-3.1-70b" - META_MODEL_NAME="Meta-Llama-3.1-70B" - META_DIR_FILTER="llama3_1" - REPACKED=1 - ;; - "llama-3.1-8b-instruct") - MODEL_NAME="llama-3.1-8b-instruct" - META_MODEL_NAME="Meta-Llama-3.1-8B-Instruct" - META_DIR_FILTER="llama3_1" - REPACKED=0 - ;; - "llama-3.1-8b") - MODEL_NAME="llama-3.1-8b" - META_MODEL_NAME="Meta-Llama-3.1-8B" - META_DIR_FILTER="llama3_1" - REPACKED=0 - ;; - "llama-3-70b-instruct") - MODEL_NAME="llama-3-70b-instruct" - META_MODEL_NAME="Meta-Llama-3-70B-Instruct" - META_DIR_FILTER="llama3" - REPACKED=1 - ;; - "llama-3-70b") - MODEL_NAME="llama-3-70b" - META_MODEL_NAME="Meta-Llama-3-70B" - META_DIR_FILTER="llama3" - REPACKED=1 - ;; - "llama-3-8b-instruct") - MODEL_NAME="llama-3-8b-instruct" - META_MODEL_NAME="Meta-Llama-3-8B-Instruct" - META_DIR_FILTER="llama3" - REPACKED=0 - ;; - "llama-3-8b") - MODEL_NAME="llama-3-8b" - META_MODEL_NAME="Meta-Llama-3-8B" - META_DIR_FILTER="llama3" - REPACKED=0 - ;; - *) - echo "⛔ Invalid model choice." 
- usage - exit 1 - ;; + "llama-3.3-70b-instruct") + MODEL_NAME="llama-3.3-70b-instruct" + HF_MODEL_REPO_ID="meta-llama/Llama-3.3-70B-Instruct" + META_MODEL_NAME="Meta-Llama-3.3-70B-Instruct" + META_DIR_FILTER="llama3_1" + REPACKED=1 + ;; + "llama-3.1-70b-instruct") + MODEL_NAME="llama-3.1-70b-instruct" + HF_MODEL_REPO_ID="meta-llama/Llama-3.1-70B-Instruct" + META_MODEL_NAME="Meta-Llama-3.1-70B-Instruct" + META_DIR_FILTER="llama3_1" + REPACKED=1 + ;; + "llama-3.1-70b") + MODEL_NAME="llama-3.1-70b" + HF_MODEL_REPO_ID="meta-llama/Llama-3.1-70B" + META_MODEL_NAME="Meta-Llama-3.1-70B" + META_DIR_FILTER="llama3_1" + REPACKED=1 + ;; + "llama-3.1-8b-instruct") + MODEL_NAME="llama-3.1-8b-instruct" + HF_MODEL_REPO_ID="meta-llama/Llama-3.1-8B-Instruct" + META_MODEL_NAME="Meta-Llama-3.1-8B-Instruct" + META_DIR_FILTER="llama3_1" + REPACKED=0 + ;; + "llama-3.1-8b") + MODEL_NAME="llama-3.1-8b" + HF_MODEL_REPO_ID="meta-llama/Llama-3.1-8B" + META_MODEL_NAME="Meta-Llama-3.1-8B" + META_DIR_FILTER="llama3_1" + REPACKED=0 + ;; + "llama-3-70b-instruct") + MODEL_NAME="llama-3-70b-instruct" + HF_MODEL_REPO_ID="meta-llama/Llama-3-70B-Instruct" + META_MODEL_NAME="Meta-Llama-3-70B-Instruct" + META_DIR_FILTER="llama3" + REPACKED=1 + ;; + "llama-3-70b") + MODEL_NAME="llama-3-70b" + HF_MODEL_REPO_ID="meta-llama/Llama-3-70B" + META_MODEL_NAME="Meta-Llama-3-70B" + META_DIR_FILTER="llama3" + REPACKED=1 + ;; + "llama-3-8b-instruct") + MODEL_NAME="llama-3-8b-instruct" + HF_MODEL_REPO_ID="meta-llama/Llama-3-8B-Instruct" + META_MODEL_NAME="Meta-Llama-3-8B-Instruct" + META_DIR_FILTER="llama3" + REPACKED=0 + ;; + "llama-3-8b") + MODEL_NAME="llama-3-8b" + HF_MODEL_REPO_ID="meta-llama/Llama-3-8B" + META_MODEL_NAME="Meta-Llama-3-8B" + META_DIR_FILTER="llama3" + REPACKED=0 + ;; + *) + echo "⛔ Invalid model choice." + usage + exit 1 + ;; esac + # Set default values for environment variables + DEFAULT_PERSISTENT_VOLUME_ROOT=${REPO_ROOT}/persistent_volume + # Initialize OVERWRITE_ENV OVERWRITE_ENV=false - check_and_prompt_env_file if [ "$OVERWRITE_ENV" = false ]; then @@ -146,29 +191,47 @@ setup_model_environment() { return 0 fi + read -p "Use 🤗 Hugging Face authorization token for downloading models? Alternative is direct authorization from Meta. (y/n) [default: y]: " input_use_hf_token + choice_use_hf_token=${input_use_hf_token:-"y"} + echo # move to a new line after input + # Handle user's choice + case "$choice_use_hf_token" in + y|Y ) + echo "Using 🤗 Hugging Face Token." + get_hf_env_vars + # default location for HF e.g. ~/.cache/huggingface/models/meta-llama/Llama-3.3-70B-Instruct + LLAMA_WEIGHTS_DIR=${HF_HOME}/${HF_MODEL_REPO_ID}/original + ;; + n|N ) + echo "Using direct authorization from Meta. You will need their URL Authorization token, typically from their website or email." + # Prompt user for LLAMA_REPO if not already set or use default + read -r -p "Enter the path where you want to clone the Llama model repository [default: ${LLAMA_REPO}]: " INPUT_LLAMA_REPO + LLAMA_REPO=${INPUT_LLAMA_REPO:-$LLAMA_REPO} + LLAMA_DIR=${LLAMA_DIR:-${LLAMA_REPO}/models/${META_DIR_FILTER}} + LLAMA_WEIGHTS_DIR=${LLAMA_WEIGHTS_DIR:-${LLAMA_DIR}/${META_MODEL_NAME}} + echo # move to a new line after input + ;; + * ) + echo "⛔ Invalid option. Exiting." 
+ exit 1 + ;; + esac + # Safely handle potentially unset environment variables using default values PERSISTENT_VOLUME_ROOT=${PERSISTENT_VOLUME_ROOT:-$DEFAULT_PERSISTENT_VOLUME_ROOT} - LLAMA_REPO=${LLAMA_REPO:-$DEFAULT_LLAMA_REPO} # Prompt user for PERSISTENT_VOLUME_ROOT if not already set or use default - read -p "Enter your PERSISTENT_VOLUME_ROOT [default: ${PERSISTENT_VOLUME_ROOT}]: " INPUT_PERSISTENT_VOLUME_ROOT + read -r -p "Enter your PERSISTENT_VOLUME_ROOT [default: ${PERSISTENT_VOLUME_ROOT}]: " INPUT_PERSISTENT_VOLUME_ROOT PERSISTENT_VOLUME_ROOT=${INPUT_PERSISTENT_VOLUME_ROOT:-$PERSISTENT_VOLUME_ROOT} - echo - # Prompt user for LLAMA_REPO if not already set or use default - read -p "Enter the path where you want to clone the Llama model repository [default: ${LLAMA_REPO}]: " INPUT_LLAMA_REPO - LLAMA_REPO=${INPUT_LLAMA_REPO:-$LLAMA_REPO} - echo # move to a new line after input + echo # move to a new line after input # Set environment variables with defaults if not already set - LLAMA_DIR=${LLAMA_DIR:-${LLAMA_REPO}/models/${META_DIR_FILTER}} - LLAMA_WEIGHTS_DIR=${LLAMA_WEIGHTS_DIR:-${LLAMA_DIR}/${META_MODEL_NAME}} - PERSISTENT_VOLUME=${PERSISTENT_VOLUME:-${PERSISTENT_VOLUME_ROOT}/volume_id_tt-metal-${MODEL_NAME}v0.0.1} - + PERSISTENT_VOLUME=${PERSISTENT_VOLUME_ROOT}/volume_id_tt-metal-${MODEL_NAME}v0.0.1 # Prompt user for JWT_SECRET securely read -sp "Enter your JWT_SECRET: " JWT_SECRET echo # move to a new line after input # Verify the JWT_SECRET is not empty - if [ -z "$JWT_SECRET" ]; then + if [ -z "${JWT_SECRET:-}" ]; then echo "⛔ JWT_SECRET cannot be empty. Please try again." exit 1 fi @@ -184,11 +247,15 @@ setup_model_environment() { # Write environment variables to .env file echo "Writing environment variables to ${ENV_FILE} ..." cat > ${ENV_FILE} <=3.9' - pip install --upgrade setuptools wheel pip==21.2.4 tqdm - # repack script dependency - # pip does not support +cpu build variant qualifier, need to specify cpu index url - pip install --index-url https://download.pytorch.org/whl/cpu torch==2.2.1 - curl -O https://raw.githubusercontent.com/tenstorrent/tt-metal/refs/heads/main/models/demos/t3000/llama2_70b/scripts/repack_weights.py - echo "repacking weights..." - python repack_weights.py "${LLAMA_WEIGHTS_DIR}" "${WEIGHTS_DIR}" 5 - deactivate - rm -rf ${VENV_NAME} repack_weights.py + repack_weights "${LLAMA_WEIGHTS_DIR}" "${WEIGHTS_DIR}" else WEIGHTS_DIR="${PERSISTENT_VOLUME}/model_weights/${MODEL_NAME}" cp -rf "${LLAMA_WEIGHTS_DIR}" "${WEIGHTS_DIR}" - fi echo "using weights directory: ${PERSISTENT_VOLUME}/model_weights/${REPACKED_STR}${MODEL_NAME}" + echo "✅ setup_weights_meta completed!" +} + +setup_weights_huggingface() { + # Step 1: Verify HF_TOKEN and HF_HOME are set + if [ -z "${HF_TOKEN:-}" ] || [ -z "${HOST_HF_HOME:-}" ]; then + echo "⛔ HF_TOKEN or HF_HOME not set. Please ensure both environment variables are set." + exit 1 + fi - # create a tmp python venv with dependencies to run repack script - echo "✅ setup_weights completed!" 
+ # Step 2: Set up persistent volume root + echo "Setting up persistent volume root: ${PERSISTENT_VOLUME}" + mkdir -p "${PERSISTENT_VOLUME}/model_weights/" + + # Step 3: Create python virtual environment for huggingface downloads + VENV_NAME=".venv_hf_setup" + echo "Setting up python venv for Hugging Face downloads: ${VENV_NAME}" + python3 -m venv ${VENV_NAME} + source ${VENV_NAME}/bin/activate + + # Step 4: Install required packages + pip install --upgrade pip setuptools wheel + pip install "huggingface_hub[cli]" + + # Step 5: Download model using huggingface-cli + echo "Downloading model from Hugging Face Hub..." + # stop timeout issue: https://huggingface.co/docs/huggingface_hub/en/guides/cli#download-timeout + export HF_HUB_DOWNLOAD_TIMEOUT=60 + # using default HF naming convention for model weights + huggingface-cli download "${HF_MODEL_REPO_ID}" \ + original/params.json \ + original/tokenizer.model \ + original/consolidated.* \ + --cache-dir="${HOST_HF_HOME}" \ + --token="${HF_TOKEN}" + + # Step 6: Process and copy weights + if [ "${REPACKED}" -eq 1 ]; then + WEIGHTS_DIR="${PERSISTENT_VOLUME}/model_weights/${REPACKED_STR}${MODEL_NAME}" + repack_weights "${LLAMA_WEIGHTS_DIR}" "${WEIGHTS_DIR}" + else + WEIGHTS_DIR="${PERSISTENT_VOLUME}/model_weights/${MODEL_NAME}" + cp -rf "${LLAMA_WEIGHTS_DIR}" "${WEIGHTS_DIR}" + fi + + # Step 7: Cleanup + deactivate + rm -rf ${VENV_NAME} + + echo "using weights directory: ${PERSISTENT_VOLUME}/model_weights/${REPACKED_STR}${MODEL_NAME}" + echo "✅ setup_weights_huggingface completed!" } setup_tt_metal_cache() { @@ -360,6 +471,34 @@ setup_tt_metal_cache() { echo "✅ setup_tt_metal_cache completed!" } +setup_weights() { + # Step 1: Load environment variables from .env file + load_env + + # check if model weights already exist + if [ -d "${PERSISTENT_VOLUME}/model_weights/${REPACKED_STR}${MODEL_NAME}" ]; then + echo "Model weights already exist at: ${PERSISTENT_VOLUME}/model_weights/${REPACKED_STR}${MODEL_NAME}." + echo "contents:" + echo + echo "$(ls -lh ${PERSISTENT_VOLUME}/model_weights/${REPACKED_STR}${MODEL_NAME})" + echo + echo "If directory does not have correct weights, to re-download or copy the model weights delete the directory." + echo "🔔 check if directory contents are correct." + exit 1 + fi + + # Determine which setup method to use based on HF_TOKEN presence + if [ "${USE_HF_DOWNLOAD}" == "y" ]; then + setup_weights_huggingface + else + setup_weights_meta + fi +} + +# ============================================================================== +# Main script logic +# ============================================================================== + # Ensure script is being executed, not sourced if [[ "${BASH_SOURCE[0]}" != "${0}" ]]; then echo "⛔ Error: This script is being sourced. 
Please make execute it:" @@ -368,7 +507,6 @@ if [[ "${BASH_SOURCE[0]}" != "${0}" ]]; then return 1; # 'return' works when sourced; 'exit' would terminate the shell fi -# Main script logic if [ $# -lt 1 ]; then usage fi From ec43450577fac371d26a60b1badd8e6b90dbad9b Mon Sep 17 00:00:00 2001 From: Tom Stesco Date: Wed, 18 Dec 2024 22:11:21 +0000 Subject: [PATCH 68/76] add llama 3.2 refs --- setup.sh | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/setup.sh b/setup.sh index f1115e8c..a024244c 100755 --- a/setup.sh +++ b/setup.sh @@ -10,6 +10,7 @@ usage() { echo "Usage: $0 " echo "Available model types:" echo " llama-3.3-70b-instruct" + echo " llama-3.2-11b-vision-instruct" echo " llama-3.1-70b-instruct" echo " llama-3.1-70b" echo " llama-3.1-8b-instruct" @@ -113,9 +114,16 @@ setup_model_environment() { MODEL_NAME="llama-3.3-70b-instruct" HF_MODEL_REPO_ID="meta-llama/Llama-3.3-70B-Instruct" META_MODEL_NAME="Meta-Llama-3.3-70B-Instruct" - META_DIR_FILTER="llama3_1" + META_DIR_FILTER="llama3_3" REPACKED=1 ;; + "llama-3.2-11b-instruct") + MODEL_NAME="llama-3.2-11b-vision-instruct" + HF_MODEL_REPO_ID="meta-llama/Llama-3.2-11B-Vision-Instruct" + META_MODEL_NAME="Meta-Llama-3.2-11B-Vision-Instruct" + META_DIR_FILTER="llama3_2" + REPACKED=0 + ;; "llama-3.1-70b-instruct") MODEL_NAME="llama-3.1-70b-instruct" HF_MODEL_REPO_ID="meta-llama/Llama-3.1-70B-Instruct" @@ -493,6 +501,9 @@ setup_weights() { else setup_weights_meta fi + + echo "create tt-metal cache dir: ${LLAMA3_CACHE_PATH}" + mkdir -p ${LLAMA3_CACHE_PATH} } # ============================================================================== From d17e46e4595f5eb6c783d4eec1f82b061e5f959d Mon Sep 17 00:00:00 2001 From: Tom Stesco Date: Wed, 18 Dec 2024 22:42:45 +0000 Subject: [PATCH 69/76] WIP make setup.sh run from repo root, add fixed model impl dir, env file dir in persistent dir --- setup.sh | 20 +++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) diff --git a/setup.sh b/setup.sh index a024244c..f20dde77 100755 --- a/setup.sh +++ b/setup.sh @@ -26,11 +26,9 @@ usage() { } # globals -readonly MODEL_PATH=$(dirname "$(realpath "$0")") -readonly REPO_ROOT=$(dirname "${MODEL_PATH}") +readonly REPO_ROOT=$(dirname "$(realpath "$0")") readonly ENV_FILE="${MODEL_PATH}/.env" echo "REPO_ROOT: ${REPO_ROOT}" -echo "MODEL_PATH: ${MODEL_PATH}" echo "ENV_FILE: ${ENV_FILE}" check_and_prompt_env_file() { @@ -113,20 +111,23 @@ setup_model_environment() { "llama-3.3-70b-instruct") MODEL_NAME="llama-3.3-70b-instruct" HF_MODEL_REPO_ID="meta-llama/Llama-3.3-70B-Instruct" - META_MODEL_NAME="Meta-Llama-3.3-70B-Instruct" - META_DIR_FILTER="llama3_3" + MODEL_IMPL_ROOT_DIR="vllm-tt-metal-llama3-70b" + META_MODEL_NAME="" + META_DIR_FILTER="" REPACKED=1 ;; "llama-3.2-11b-instruct") MODEL_NAME="llama-3.2-11b-vision-instruct" HF_MODEL_REPO_ID="meta-llama/Llama-3.2-11B-Vision-Instruct" - META_MODEL_NAME="Meta-Llama-3.2-11B-Vision-Instruct" - META_DIR_FILTER="llama3_2" + MODEL_IMPL_ROOT_DIR="vllm-tt-metal-llama32-11b-vision" + META_MODEL_NAME="" + META_DIR_FILTER="" REPACKED=0 ;; "llama-3.1-70b-instruct") MODEL_NAME="llama-3.1-70b-instruct" HF_MODEL_REPO_ID="meta-llama/Llama-3.1-70B-Instruct" + MODEL_IMPL_ROOT_DIR="vllm-tt-metal-llama3-70b" META_MODEL_NAME="Meta-Llama-3.1-70B-Instruct" META_DIR_FILTER="llama3_1" REPACKED=1 @@ -189,6 +190,8 @@ setup_model_environment() { # Set default values for environment variables DEFAULT_PERSISTENT_VOLUME_ROOT=${REPO_ROOT}/persistent_volume + 
DEFAULT_ENV_DIR="${DEFAULT_PERSISTENT_VOLUME_ROOT}/model_envs" + mkdir -p ${DEFAULT_ENV_DIR} # Initialize OVERWRITE_ENV OVERWRITE_ENV=false @@ -211,6 +214,9 @@ setup_model_environment() { LLAMA_WEIGHTS_DIR=${HF_HOME}/${HF_MODEL_REPO_ID}/original ;; n|N ) + if [ -z "${META_DIR_FILTER:-}" ]; then + echo "⛔ MODEL_NAME=${MODEL_NAME} does not support using direct Meta authorization model download. Please use Hugging Face method." + fi echo "Using direct authorization from Meta. You will need their URL Authorization token, typically from their website or email." # Prompt user for LLAMA_REPO if not already set or use default read -r -p "Enter the path where you want to clone the Llama model repository [default: ${LLAMA_REPO}]: " INPUT_LLAMA_REPO From 895ff8d03d1a8d8616499840573bd6f1cf017871 Mon Sep 17 00:00:00 2001 From: Tom Stesco Date: Thu, 19 Dec 2024 21:16:14 +0000 Subject: [PATCH 70/76] adding setup.sh support for multiple models, adding support for llama 3.2 11b vision instruct, llama 3.3 70b instruct --- .gitignore | 2 +- setup.sh | 108 +++++++++++++++++++++++++++-------------------------- 2 files changed, 57 insertions(+), 53 deletions(-) diff --git a/.gitignore b/.gitignore index 24914ed3..f5ae8ff9 100644 --- a/.gitignore +++ b/.gitignore @@ -12,7 +12,7 @@ __pycache__ env .testvenv python_env -.venv +.venv* # persistent storage volume persistent_volume diff --git a/setup.sh b/setup.sh index f20dde77..04fe83d6 100755 --- a/setup.sh +++ b/setup.sh @@ -20,26 +20,20 @@ usage() { echo " llama-3-8b-instruct" echo " llama-3-8b" echo - echo "Options:" - echo " setup_permissions Run the script to set file permissions after first run setup (requires sudo)." exit 1 } # globals readonly REPO_ROOT=$(dirname "$(realpath "$0")") -readonly ENV_FILE="${MODEL_PATH}/.env" -echo "REPO_ROOT: ${REPO_ROOT}" -echo "ENV_FILE: ${ENV_FILE}" check_and_prompt_env_file() { local MODEL_NAME_KEY="MODEL_NAME" local MODEL_NAME="" - # Check if .env file exists - if [[ -f "$ENV_FILE" ]]; then + if [[ -f "${ENV_FILE}" ]]; then # Extract the MODEL_NAME value from .env - FOUND_MODEL_NAME=$(grep "^$MODEL_NAME_KEY=" "$ENV_FILE" | cut -d '=' -f2) - + echo "found ENV_FILE: ${ENV_FILE}" + FOUND_MODEL_NAME=$(grep "^$MODEL_NAME_KEY=" "$ENV_FILE" | cut -d '=' -f2) || FOUND_MODEL_NAME="" # If MODEL_NAME is found, display it if [[ -n "$FOUND_MODEL_NAME" ]]; then echo "The existing file ${ENV_FILE} contains MODEL_NAME: $FOUND_MODEL_NAME" @@ -66,7 +60,6 @@ check_and_prompt_env_file() { echo "MODEL_NAME not found in ${ENV_FILE}. Overwritting." OVERWRITE_ENV=true fi - else echo "${ENV_FILE} does not exist. Proceeding to create a new one." 
OVERWRITE_ENV=true @@ -116,7 +109,7 @@ setup_model_environment() { META_DIR_FILTER="" REPACKED=1 ;; - "llama-3.2-11b-instruct") + "llama-3.2-11b-vision-instruct") MODEL_NAME="llama-3.2-11b-vision-instruct" HF_MODEL_REPO_ID="meta-llama/Llama-3.2-11B-Vision-Instruct" MODEL_IMPL_ROOT_DIR="vllm-tt-metal-llama32-11b-vision" @@ -187,20 +180,32 @@ setup_model_environment() { exit 1 ;; esac + # Initialize OVERWRITE_ENV + OVERWRITE_ENV=false # Set default values for environment variables DEFAULT_PERSISTENT_VOLUME_ROOT=${REPO_ROOT}/persistent_volume - DEFAULT_ENV_DIR="${DEFAULT_PERSISTENT_VOLUME_ROOT}/model_envs" - mkdir -p ${DEFAULT_ENV_DIR} - - # Initialize OVERWRITE_ENV - OVERWRITE_ENV=false + MODEL_ENV_DIR="${DEFAULT_PERSISTENT_VOLUME_ROOT}/model_envs" + + mkdir -p ${MODEL_ENV_DIR} + ENV_FILE="${MODEL_ENV_DIR}/${MODEL_NAME}.env" + export ENV_FILE check_and_prompt_env_file + if [ "$OVERWRITE_ENV" = false ]; then echo "✅ using existing .env file: ${ENV_FILE}." return 0 fi + # Safely handle potentially unset environment variables using default values + PERSISTENT_VOLUME_ROOT=${PERSISTENT_VOLUME_ROOT:-$DEFAULT_PERSISTENT_VOLUME_ROOT} + # Prompt user for PERSISTENT_VOLUME_ROOT if not already set or use default + read -r -p "Enter your PERSISTENT_VOLUME_ROOT [default: ${DEFAULT_PERSISTENT_VOLUME_ROOT}]: " INPUT_PERSISTENT_VOLUME_ROOT + PERSISTENT_VOLUME_ROOT=${INPUT_PERSISTENT_VOLUME_ROOT:-$PERSISTENT_VOLUME_ROOT} + echo # move to a new line after input + # Set environment variables with defaults if not already set + PERSISTENT_VOLUME=${PERSISTENT_VOLUME_ROOT}/volume_id_tt-metal-${MODEL_NAME}v0.0.1 + read -p "Use 🤗 Hugging Face authorization token for downloading models? Alternative is direct authorization from Meta. (y/n) [default: y]: " input_use_hf_token choice_use_hf_token=${input_use_hf_token:-"y"} @@ -211,7 +216,8 @@ setup_model_environment() { echo "Using 🤗 Hugging Face Token." get_hf_env_vars # default location for HF e.g. 
~/.cache/huggingface/models/meta-llama/Llama-3.3-70B-Instruct - LLAMA_WEIGHTS_DIR=${HF_HOME}/${HF_MODEL_REPO_ID}/original + # LLAMA_WEIGHTS_DIR=${HF_HOME}/local_dir/${HF_MODEL_REPO_ID} + WEIGHTS_DIR=${PERSISTENT_VOLUME}/model_weights/${MODEL_NAME} ;; n|N ) if [ -z "${META_DIR_FILTER:-}" ]; then @@ -231,16 +237,6 @@ setup_model_environment() { ;; esac - # Safely handle potentially unset environment variables using default values - PERSISTENT_VOLUME_ROOT=${PERSISTENT_VOLUME_ROOT:-$DEFAULT_PERSISTENT_VOLUME_ROOT} - - # Prompt user for PERSISTENT_VOLUME_ROOT if not already set or use default - read -r -p "Enter your PERSISTENT_VOLUME_ROOT [default: ${PERSISTENT_VOLUME_ROOT}]: " INPUT_PERSISTENT_VOLUME_ROOT - PERSISTENT_VOLUME_ROOT=${INPUT_PERSISTENT_VOLUME_ROOT:-$PERSISTENT_VOLUME_ROOT} - echo # move to a new line after input - - # Set environment variables with defaults if not already set - PERSISTENT_VOLUME=${PERSISTENT_VOLUME_ROOT}/volume_id_tt-metal-${MODEL_NAME}v0.0.1 # Prompt user for JWT_SECRET securely read -sp "Enter your JWT_SECRET: " JWT_SECRET echo # move to a new line after input @@ -270,9 +266,10 @@ HOST_HF_HOME=${HF_HOME:-""} # host paths LLAMA_REPO=${LLAMA_REPO:-""} LLAMA_DIR=${LLAMA_DIR:-""} -LLAMA_WEIGHTS_DIR=$LLAMA_WEIGHTS_DIR +LLAMA_WEIGHTS_DIR=${LLAMA_WEIGHTS_DIR:-""} PERSISTENT_VOLUME_ROOT=$PERSISTENT_VOLUME_ROOT PERSISTENT_VOLUME=$PERSISTENT_VOLUME +WEIGHTS_DIR=${WEIGHTS_DIR:-""} # container paths REPACKED=${REPACKED} REPACKED_STR=${REPACKED_STR} @@ -326,12 +323,13 @@ setup_permissions() { sudo usermod -aG dockermount "$USER" # Get container user with UID 1000 and add to group - CONTAINER_USER=$(getent passwd 1000 | cut -d: -f1) + CONTAINER_UID=1000 + CONTAINER_USER=$(getent passwd ${CONTAINER_UID} | cut -d: -f1) if [ -n "$CONTAINER_USER" ]; then - echo "Adding container user: '$CONTAINER_USER' (UID 1000) to 'dockermount' group ..." + echo "Adding container user: '$CONTAINER_USER' (UID ${CONTAINER_UID}) to 'dockermount' group ..." sudo usermod -aG dockermount "$CONTAINER_USER" else - echo "No user found with UID 1000." + echo "No user found with UID ${CONTAINER_UID}." fi # Set file ownership and permissions @@ -340,7 +338,7 @@ setup_permissions() { # if the user point the PERSISTENT_VOLUME sudo mkdir -p "${PERSISTENT_VOLUME}" fi - sudo chown -R ${CONTAINER_USER}:dockermount "${PERSISTENT_VOLUME}" + sudo chown -R ${CONTAINER_UID}:dockermount "${PERSISTENT_VOLUME}" sudo chmod -R 775 "${PERSISTENT_VOLUME}" echo "✅ setup_permissions completed!" 
@@ -453,15 +451,27 @@ setup_weights_huggingface() { original/tokenizer.model \ original/consolidated.* \ --cache-dir="${HOST_HF_HOME}" \ - --token="${HF_TOKEN}" + --token="${HF_TOKEN}" + + # symlinks are broken for huggingface-cli download with --local-dir option + # see: https://github.com/huggingface/huggingface_hub/pull/2223 + # to use symlinks, find most recent snapshot and create symlink to that + mkdir -p "${WEIGHTS_DIR}" + LOCAL_REPO_NAME=$(echo "${HF_MODEL_REPO_ID}" | sed 's|/|--|g') + SNAPSHOT_DIR="${HOST_HF_HOME}/models--${LOCAL_REPO_NAME}/snapshots" + # note: ls -td will sort by modification date descending, potential edge case + # if desired snapshot is not most recent modified or ls sorts differently + MOST_RECENT_SNAPSHOT=$(ls -td -- ${SNAPSHOT_DIR}/* | head -n 1) + echo "create symlink: ${MOST_RECENT_SNAPSHOT}/original/ -> ${WEIGHTS_DIR}" + for item in ${MOST_RECENT_SNAPSHOT}/original/*; do + ln -s "$item" "${WEIGHTS_DIR}" + done # Step 6: Process and copy weights if [ "${REPACKED}" -eq 1 ]; then - WEIGHTS_DIR="${PERSISTENT_VOLUME}/model_weights/${REPACKED_STR}${MODEL_NAME}" - repack_weights "${LLAMA_WEIGHTS_DIR}" "${WEIGHTS_DIR}" - else - WEIGHTS_DIR="${PERSISTENT_VOLUME}/model_weights/${MODEL_NAME}" - cp -rf "${LLAMA_WEIGHTS_DIR}" "${WEIGHTS_DIR}" + REPACKED_WEIGHTS_DIR="${PERSISTENT_VOLUME}/model_weights/${REPACKED_STR}${MODEL_NAME}" + mkdir -p "${REPACKED_WEIGHTS_DIR}" + repack_weights "${WEIGHTS_DIR}" "${REPACKED_WEIGHTS_DIR}" fi # Step 7: Cleanup @@ -491,25 +501,24 @@ setup_weights() { # check if model weights already exist if [ -d "${PERSISTENT_VOLUME}/model_weights/${REPACKED_STR}${MODEL_NAME}" ]; then - echo "Model weights already exist at: ${PERSISTENT_VOLUME}/model_weights/${REPACKED_STR}${MODEL_NAME}." + echo "Model weights already exist at: ${PERSISTENT_VOLUME}/model_weights/${REPACKED_STR}${MODEL_NAME}" echo "contents:" echo echo "$(ls -lh ${PERSISTENT_VOLUME}/model_weights/${REPACKED_STR}${MODEL_NAME})" echo echo "If directory does not have correct weights, to re-download or copy the model weights delete the directory." echo "🔔 check if directory contents are correct." 
- exit 1 - fi - - # Determine which setup method to use based on HF_TOKEN presence - if [ "${USE_HF_DOWNLOAD}" == "y" ]; then - setup_weights_huggingface else - setup_weights_meta + # Determine which setup method to use based on HF_TOKEN presence + if [ "${USE_HF_DOWNLOAD}" == "y" ]; then + setup_weights_huggingface + else + setup_weights_meta + fi fi echo "create tt-metal cache dir: ${LLAMA3_CACHE_PATH}" - mkdir -p ${LLAMA3_CACHE_PATH} + mkdir -p "${PERSISTENT_VOLUME}/tt_metal_cache/cache_${REPACKED_STR}${MODEL_NAME}" } # ============================================================================== @@ -528,11 +537,6 @@ if [ $# -lt 1 ]; then usage fi -if [ "$1" == "setup_permissions" ]; then - setup_permissions - exit 0 -fi - # Set up environment variables for the chosen model MODEL_TYPE=$1 setup_model_environment "$MODEL_TYPE" From 49ee14f43562999b1a548aad49c68fad57b885e1 Mon Sep 17 00:00:00 2001 From: Tom Stesco Date: Thu, 19 Dec 2024 21:22:32 +0000 Subject: [PATCH 71/76] update .env file location in documentation --- evals/README.md | 2 +- vllm-tt-metal-llama3-70b/README.md | 2 +- vllm-tt-metal-llama3-70b/docs/development.md | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/evals/README.md b/evals/README.md index ca3add07..5dfc42a5 100644 --- a/evals/README.md +++ b/evals/README.md @@ -21,7 +21,7 @@ export PERSISTENT_VOLUME=$PWD/persistent_volume/volume_id_tt-metal-llama-3.1-70b docker run \ --rm \ -it \ - --env-file vllm-tt-metal-llama3-70b/.env \ + --env-file persistent_volume/model_envs/llama-3.1-70b-instruct.env \ --cap-add ALL \ --device /dev/tenstorrent:/dev/tenstorrent \ --volume /dev/hugepages-1G:/dev/hugepages-1G:rw \ diff --git a/vllm-tt-metal-llama3-70b/README.md b/vllm-tt-metal-llama3-70b/README.md index 38ef1a9a..3d9a9a5d 100644 --- a/vllm-tt-metal-llama3-70b/README.md +++ b/vllm-tt-metal-llama3-70b/README.md @@ -29,7 +29,7 @@ export PERSISTENT_VOLUME=$PWD/persistent_volume/volume_id_tt-metal-llama-3.1-70b docker run \ --rm \ -it \ - --env-file vllm-tt-metal-llama3-70b/.env \ + --env-file persistent_volume/model_envs/llama-3.1-70b-instruct.env \ --cap-add ALL \ --device /dev/tenstorrent:/dev/tenstorrent \ --volume /dev/hugepages-1G:/dev/hugepages-1G:rw \ diff --git a/vllm-tt-metal-llama3-70b/docs/development.md b/vllm-tt-metal-llama3-70b/docs/development.md index efa3c075..d4f950b3 100644 --- a/vllm-tt-metal-llama3-70b/docs/development.md +++ b/vllm-tt-metal-llama3-70b/docs/development.md @@ -42,7 +42,7 @@ export PERSISTENT_VOLUME=$PWD/persistent_volume/volume_id_tt-metal-llama-3.1-70b docker run \ --rm \ -it \ - --env-file tt-metal-llama3-70b/.env \ + --env-file persistent_volume/model_envs/llama-3.1-70b-instruct.env \ --cap-add ALL \ --device /dev/tenstorrent:/dev/tenstorrent \ --volume /dev/hugepages-1G:/dev/hugepages-1G:rw \ From 1675f54c3e85cfc72bf0f1d1b52d147380ddc7ab Mon Sep 17 00:00:00 2001 From: Tom Stesco Date: Thu, 19 Dec 2024 23:12:20 +0000 Subject: [PATCH 72/76] remove MODEL_IMPL_ROOT_DIR and add note about MODEL_NAME --- setup.sh | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/setup.sh b/setup.sh index 04fe83d6..52388dfa 100755 --- a/setup.sh +++ b/setup.sh @@ -100,11 +100,11 @@ get_hf_env_vars() { # Function to set environment variables based on the model selection and write them to .env setup_model_environment() { # Set environment variables based on the model selection + # note: MODEL_NAME is the lower cased basename of the HF repo ID case "$1" in "llama-3.3-70b-instruct") MODEL_NAME="llama-3.3-70b-instruct" 
HF_MODEL_REPO_ID="meta-llama/Llama-3.3-70B-Instruct" - MODEL_IMPL_ROOT_DIR="vllm-tt-metal-llama3-70b" META_MODEL_NAME="" META_DIR_FILTER="" REPACKED=1 @@ -112,7 +112,6 @@ setup_model_environment() { "llama-3.2-11b-vision-instruct") MODEL_NAME="llama-3.2-11b-vision-instruct" HF_MODEL_REPO_ID="meta-llama/Llama-3.2-11B-Vision-Instruct" - MODEL_IMPL_ROOT_DIR="vllm-tt-metal-llama32-11b-vision" META_MODEL_NAME="" META_DIR_FILTER="" REPACKED=0 @@ -120,7 +119,6 @@ setup_model_environment() { "llama-3.1-70b-instruct") MODEL_NAME="llama-3.1-70b-instruct" HF_MODEL_REPO_ID="meta-llama/Llama-3.1-70B-Instruct" - MODEL_IMPL_ROOT_DIR="vllm-tt-metal-llama3-70b" META_MODEL_NAME="Meta-Llama-3.1-70B-Instruct" META_DIR_FILTER="llama3_1" REPACKED=1 From d1bffe04a0368941480f6c02ad2ff59ad816d405 Mon Sep 17 00:00:00 2001 From: Tom Stesco Date: Fri, 20 Dec 2024 03:19:08 +0000 Subject: [PATCH 73/76] move setup_tt_metal_cache into setup_weights to use load_env scope --- setup.sh | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/setup.sh b/setup.sh index 52388dfa..86dd9d58 100755 --- a/setup.sh +++ b/setup.sh @@ -515,8 +515,7 @@ setup_weights() { fi fi - echo "create tt-metal cache dir: ${LLAMA3_CACHE_PATH}" - mkdir -p "${PERSISTENT_VOLUME}/tt_metal_cache/cache_${REPACKED_STR}${MODEL_NAME}" + setup_tt_metal_cache } # ============================================================================== @@ -539,7 +538,6 @@ fi MODEL_TYPE=$1 setup_model_environment "$MODEL_TYPE" setup_weights -setup_tt_metal_cache # Call the script again with sudo to execute the sudo-required commands echo "Switching to sudo portion to set file permissions and complete setup." setup_permissions From 97c90cfd4b52fee4b9eb609cfee8e4f421063b61 Mon Sep 17 00:00:00 2001 From: Tom Stesco Date: Fri, 20 Dec 2024 03:23:13 +0000 Subject: [PATCH 74/76] better logging and handling of {PERSISTENT_VOLUME}/model_weights dir setup --- setup.sh | 16 ++++------------ 1 file changed, 4 insertions(+), 12 deletions(-) diff --git a/setup.sh b/setup.sh index 86dd9d58..eac1a20d 100755 --- a/setup.sh +++ b/setup.sh @@ -395,17 +395,12 @@ setup_weights_meta() { echo "Weights already downloaded at ${LLAMA_WEIGHTS_DIR}" echo "Skipping download." else - # Step 4: Run the download script and select models echo "Running the download script to download models at ${LLAMA_DIR}/download.sh ..." cd "$LLAMA_DIR" ./download.sh cd - fi - # Step 5: Copy weights to persistent volume - echo "Setting up persistent volume root: ${PERSISTENT_VOLUME}" - mkdir -p "${PERSISTENT_VOLUME}/model_weights/" - if [ "${REPACKED}" -eq 1 ]; then WEIGHTS_DIR="${PERSISTENT_VOLUME}/model_weights/${REPACKED_STR}${MODEL_NAME}" repack_weights "${LLAMA_WEIGHTS_DIR}" "${WEIGHTS_DIR}" @@ -425,10 +420,6 @@ setup_weights_huggingface() { exit 1 fi - # Step 2: Set up persistent volume root - echo "Setting up persistent volume root: ${PERSISTENT_VOLUME}" - mkdir -p "${PERSISTENT_VOLUME}/model_weights/" - # Step 3: Create python virtual environment for huggingface downloads VENV_NAME=".venv_hf_setup" echo "Setting up python venv for Hugging Face downloads: ${VENV_NAME}" @@ -494,19 +485,20 @@ setup_tt_metal_cache() { } setup_weights() { - # Step 1: Load environment variables from .env file load_env # check if model weights already exist if [ -d "${PERSISTENT_VOLUME}/model_weights/${REPACKED_STR}${MODEL_NAME}" ]; then echo "Model weights already exist at: ${PERSISTENT_VOLUME}/model_weights/${REPACKED_STR}${MODEL_NAME}" + echo "🔔 check if directory contents are correct." 
echo "contents:" - echo + echo "ls -lh ${PERSISTENT_VOLUME}/model_weights/${REPACKED_STR}${MODEL_NAME}" echo "$(ls -lh ${PERSISTENT_VOLUME}/model_weights/${REPACKED_STR}${MODEL_NAME})" echo echo "If directory does not have correct weights, to re-download or copy the model weights delete the directory." - echo "🔔 check if directory contents are correct." else + echo "Setting up persistent volume root: ${PERSISTENT_VOLUME}" + mkdir -p "${PERSISTENT_VOLUME}/model_weights/" # Determine which setup method to use based on HF_TOKEN presence if [ "${USE_HF_DOWNLOAD}" == "y" ]; then setup_weights_huggingface From 6df6c7cd2dcec2b1c9f999db9e565a116d181d41 Mon Sep 17 00:00:00 2001 From: Tom Stesco Date: Fri, 20 Dec 2024 04:07:20 +0000 Subject: [PATCH 75/76] adding error message when huggingface-cli download fails with common issues for troubleshooting, add support for llama 3.2 1B / 3B --- setup.sh | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) diff --git a/setup.sh b/setup.sh index eac1a20d..3b062ada 100755 --- a/setup.sh +++ b/setup.sh @@ -11,6 +11,8 @@ usage() { echo "Available model types:" echo " llama-3.3-70b-instruct" echo " llama-3.2-11b-vision-instruct" + echo " llama-3.2-3b-instruct" + echo " llama-3.2-1b-instruct" echo " llama-3.1-70b-instruct" echo " llama-3.1-70b" echo " llama-3.1-8b-instruct" @@ -116,6 +118,20 @@ setup_model_environment() { META_DIR_FILTER="" REPACKED=0 ;; + "llama-3.2-3b-instruct") + MODEL_NAME="llama-3.2-3b-instruct" + HF_MODEL_REPO_ID="meta-llama/Llama-3.2-3B-Instruct" + META_MODEL_NAME="" + META_DIR_FILTER="" + REPACKED=0 + ;; + "llama-3.2-1b-instruct") + MODEL_NAME="llama-3.2-1b-instruct" + HF_MODEL_REPO_ID="meta-llama/Llama-3.2-1B-Instruct" + META_MODEL_NAME="" + META_DIR_FILTER="" + REPACKED=0 + ;; "llama-3.1-70b-instruct") MODEL_NAME="llama-3.1-70b-instruct" HF_MODEL_REPO_ID="meta-llama/Llama-3.1-70B-Instruct" @@ -442,6 +458,17 @@ setup_weights_huggingface() { --cache-dir="${HOST_HF_HOME}" \ --token="${HF_TOKEN}" + if [ $? -ne 0 ]; then + echo "⛔ Error occured during: huggingface-cli download ${HF_MODEL_REPO_ID}" + echo "🔔 check for common issues:" + echo " 1. 401 Unauthorized error occurred." + echo " For example:" + echo " huggingface_hub.errors.GatedRepoError: 401 Client Error. Cannot access gated repo" + echo " ❗ In this case, go to the repo URL in your web browser and click through the access request form." + echo " 2. 
check correct HF_TOKEN is set in the .env file: ${ENV_FILE}" + exit 1 + fi + # symlinks are broken for huggingface-cli download with --local-dir option # see: https://github.com/huggingface/huggingface_hub/pull/2223 # to use symlinks, find most recent snapshot and create symlink to that From 6beaa66e5350304cf513f150e6d9845cc79337bb Mon Sep 17 00:00:00 2001 From: Tom Stesco Date: Sat, 21 Dec 2024 02:19:00 +0000 Subject: [PATCH 76/76] update README for llama 3.1 70B v0 drop commits --- vllm-tt-metal-llama3-70b/README.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/vllm-tt-metal-llama3-70b/README.md b/vllm-tt-metal-llama3-70b/README.md index 3d9a9a5d..e6b3448c 100644 --- a/vllm-tt-metal-llama3-70b/README.md +++ b/vllm-tt-metal-llama3-70b/README.md @@ -106,7 +106,7 @@ Either download the Docker image from GitHub Container Registry (recommended for ```bash # pull image from GHCR -docker pull ghcr.io/tenstorrent/tt-inference-server/tt-metal-llama3-70b-src-base-vllm:v0.0.2-tt-metal-385904186f81-384f1790c3be +docker pull ghcr.io/tenstorrent/tt-inference-server/tt-metal-llama3-70b-src-base-vllm:v0.0.1-tt-metal-v0.54.0-rc2-953161188c50 ``` #### Option B: Build Docker Image @@ -115,7 +115,7 @@ For instructions on building the Docker imagem locally see: [vllm-tt-metal-llama ### 5. Automated Setup: environment variables and weights files -The script `vllm-tt-metal-llama3-70b/setup.sh` automates: +The script `setup.sh` automates: 1. interactively creating the .env file, 2. downloading the Llama model weights, @@ -123,7 +123,7 @@ The script `vllm-tt-metal-llama3-70b/setup.sh` automates: 4. creating the default persistent storage directory structure and permissions. ```bash -cd tt-inference-server/vllm-tt-metal-llama3-70b +cd tt-inference-server chmod +x setup.sh ./setup.sh llama-3.1-70b-instruct ```