diff --git a/README.md b/README.md index 3bfe8d4d52d..104f596e28e 100644 --- a/README.md +++ b/README.md @@ -36,7 +36,7 @@ | [Llama 3.1 70B (TP=8)](./models/demos/t3000/llama3_70b) | 32 | [QuietBox](https://tenstorrent.com/hardware/tt-quietbox) | 190 | 15.1 | 20 | 483.2 | [v0.54.0-rc2](https://github.com/tenstorrent/tt-metal/tree/v0.54.0-rc2) | [9531611](https://github.com/tenstorrent/vllm/tree/953161188c50f10da95a88ab305e23977ebd3750) | | [Falcon 40B (TP=8)](./models/demos/t3000/falcon40b) | 32 | [QuietBox](https://tenstorrent.com/hardware/tt-quietbox) | | 5.3 | 36 | 169.6 | [v0.55.0-rc19](https://github.com/tenstorrent/tt-metal/tree/v0.55.0-rc19) | | | [Mixtral 8x7B (TP=8)](./models/demos/t3000/mixtral8x7b) | 32 | [QuietBox](https://tenstorrent.com/hardware/tt-quietbox) | 227 | 14.9 | 33 | 476.8 | [v0.55.0-rc19](https://github.com/tenstorrent/tt-metal/tree/v0.55.0-rc19) | | -| [DeepSeek R1 Distill Llama 3.3 70B (TP=8)](./models/demos/llama3) | 32 | [QuietBox](https://tenstorrent.com/hardware/tt-quietbox) | | | | | [main](https://github.com/tenstorrent/tt-metal/) | tbd | +| [DeepSeek R1 Distill Llama 3.3 70B (TP=8)](./models/demos/llama3) | 32 | [QuietBox](https://tenstorrent.com/hardware/tt-quietbox) | 1113 | 16.4 | 33 | | [main](https://github.com/tenstorrent/tt-metal/) | [2f33504](https://github.com/tenstorrent/vllm/tree/2f33504bad49a6202d3685155107a6126a5b5e6e) | | [Falcon 7B (DP=32)](./models/demos/tg/falcon7b) | 1024 | [Galaxy](https://tenstorrent.com/hardware/galaxy) | 223 | 4.8 | 26 | 4915.2 | [v0.55.0-rc18](https://github.com/tenstorrent/tt-metal/tree/v0.55.0-rc18) | | | [Llama 3.1 70B (DP=4, TP=8)](./models/demos/t3000/llama3_70b) | 128 | [Galaxy](https://tenstorrent.com/hardware/galaxy) | 190 | 14.3 | 20 | 1835.5 | [v0.52.0-rc31](https://github.com/tenstorrent/tt-metal/tree/v0.52.0-rc31) | | | [Llama 3.1 70B (TP=32)](./models/demos/llama3) | 32 | [Galaxy](https://tenstorrent.com/hardware/galaxy) | 763 | 13.5 | 80 | 432.0 | [v0.55.0-rc12](https://github.com/tenstorrent/tt-metal/tree/v0.55.0-rc12) | [2f33504](https://github.com/tenstorrent/vllm/tree/2f33504bad49a6202d3685155107a6126a5b5e6e) | diff --git a/models/demos/llama3/PERF.md b/models/demos/llama3/PERF.md index 8a2a32ba7ac..30a89d6bf56 100644 --- a/models/demos/llama3/PERF.md +++ b/models/demos/llama3/PERF.md @@ -13,18 +13,18 @@ This configuration uses bfp4 MLP FF1+FF3 for all models. | Llama3.2-1B | N150 | 89 | 98 | 86.9 | | Llama3.2-1B | N300 | 91 | 98 | 104.3 | | Llama3.2-1B | T3K | 91 | 98 | 118.5 | -| Llama3.2-1B | TG | | 99 | 53.3 | +| Llama3.2-1B | TG | | | 72.3 | | Llama3.2-3B | N150 | 92 | 96 | 53.3 | | Llama3.2-3B | N300 | 91 | 96 | 66.1 | | Llama3.2-3B | T3K | 91 | 96 | 66.9 | -| Llama3.2-3B | TG | | | | +| Llama3.2-3B | TG | | | 48.5 | | Llama3.1-8B | N150 | 87 | 99 | 27.9 | | Llama3.1-8B | N300 | 88 | 99 | 43.7 | | Llama3.1-8B | T3K | 91 | 100 | 64.2 | -| Llama3.1-8B | TG | | | | +| Llama3.1-8B | TG | | | 41.0 | | Llama3.2-11B | N300 | 89 | 99 | 43.5 | | Llama3.2-11B | T3K | 88 | 99 | 63.4 | -| Llama3.2-11B | TG | | | | +| Llama3.2-11B | TG | | | 40.9 | | Llama3.1-70B | T3K | 96 | 100 | 16.1 | | Llama3.1-70B | TG | | | | diff --git a/models/demos/llama3/tt/llama_mlp.py b/models/demos/llama3/tt/llama_mlp.py index eced782213c..bb5fb6026e2 100644 --- a/models/demos/llama3/tt/llama_mlp.py +++ b/models/demos/llama3/tt/llama_mlp.py @@ -34,7 +34,9 @@ def __init__( # TODO Clean up this code. With sharding, we load the normal weights and then shard them as_sharded_tensor = lambda name, type, dims: ttnn.as_tensor( - pad_hidden_dim(torch_weight(name[:2]), -1), # Grab only the wX part of the name + pad_hidden_dim( + torch_weight(name[:2]), dims[0] if args.is_galaxy else -1 + ), # Grab only the wX part of the name dtype=type, device=self.mesh_device, mesh_mapper=ttnn.ShardTensor2dMesh(self.mesh_device, dims=dims, mesh_shape=args.cluster_shape),