diff --git a/README.md b/README.md
index 3bfe8d4d52d..104f596e28e 100644
--- a/README.md
+++ b/README.md
@@ -36,7 +36,7 @@
 | [Llama 3.1 70B (TP=8)](./models/demos/t3000/llama3_70b)       | 32    | [QuietBox](https://tenstorrent.com/hardware/tt-quietbox) | 190       | 15.1  | 20              | 483.2  | [v0.54.0-rc2](https://github.com/tenstorrent/tt-metal/tree/v0.54.0-rc2) | [9531611](https://github.com/tenstorrent/vllm/tree/953161188c50f10da95a88ab305e23977ebd3750)      |
 | [Falcon 40B (TP=8)](./models/demos/t3000/falcon40b)           | 32    | [QuietBox](https://tenstorrent.com/hardware/tt-quietbox) |           | 5.3   | 36              | 169.6  | [v0.55.0-rc19](https://github.com/tenstorrent/tt-metal/tree/v0.55.0-rc19)  |                                                                                                   |
 | [Mixtral 8x7B (TP=8)](./models/demos/t3000/mixtral8x7b)       | 32    | [QuietBox](https://tenstorrent.com/hardware/tt-quietbox) | 227       | 14.9  | 33              | 476.8  | [v0.55.0-rc19](https://github.com/tenstorrent/tt-metal/tree/v0.55.0-rc19) |                                                                                                   |
-| [DeepSeek R1 Distill Llama 3.3 70B (TP=8)](./models/demos/llama3)       | 32    | [QuietBox](https://tenstorrent.com/hardware/tt-quietbox) |        |   |               |  | [main](https://github.com/tenstorrent/tt-metal/) | tbd      |
+| [DeepSeek R1 Distill Llama 3.3 70B (TP=8)](./models/demos/llama3)       | 32    | [QuietBox](https://tenstorrent.com/hardware/tt-quietbox) | 1113       | 16.4  | 33              | 524.8  | [main](https://github.com/tenstorrent/tt-metal/) | [2f33504](https://github.com/tenstorrent/vllm/tree/2f33504bad49a6202d3685155107a6126a5b5e6e)      |
 | [Falcon 7B (DP=32)](./models/demos/tg/falcon7b)               | 1024  | [Galaxy](https://tenstorrent.com/hardware/galaxy)        | 223       | 4.8   | 26              | 4915.2 | [v0.55.0-rc18](https://github.com/tenstorrent/tt-metal/tree/v0.55.0-rc18) |                                                                                                   |
 | [Llama 3.1 70B (DP=4, TP=8)](./models/demos/t3000/llama3_70b) | 128   | [Galaxy](https://tenstorrent.com/hardware/galaxy)        | 190       | 14.3  | 20              | 1835.5 | [v0.52.0-rc31](https://github.com/tenstorrent/tt-metal/tree/v0.52.0-rc31) |                                                                                                   |
 | [Llama 3.1 70B (TP=32)](./models/demos/llama3)                | 32   | [Galaxy](https://tenstorrent.com/hardware/galaxy)        | 763       | 13.5  | 80              | 432.0 | [v0.55.0-rc12](https://github.com/tenstorrent/tt-metal/tree/v0.55.0-rc12) | [2f33504](https://github.com/tenstorrent/vllm/tree/2f33504bad49a6202d3685155107a6126a5b5e6e) |
diff --git a/models/demos/llama3/PERF.md b/models/demos/llama3/PERF.md
index 8a2a32ba7ac..30a89d6bf56 100644
--- a/models/demos/llama3/PERF.md
+++ b/models/demos/llama3/PERF.md
@@ -13,18 +13,18 @@ This configuration uses bfp4 MLP FF1+FF3 for all models.
 | Llama3.2-1B    | N150   | 89        | 98        | 86.9          |
 | Llama3.2-1B    | N300   | 91        | 98        | 104.3         |
 | Llama3.2-1B    | T3K    | 91        | 98        | 118.5         |
-| Llama3.2-1B    | TG     |         | 99        | 53.3          |
+| Llama3.2-1B    | TG     |         |         | 72.3          |
 | Llama3.2-3B    | N150   | 92        | 96        | 53.3          |
 | Llama3.2-3B    | N300   | 91        | 96        | 66.1          |
 | Llama3.2-3B    | T3K    | 91        | 96        | 66.9          |
-| Llama3.2-3B    | TG     |         |         |           |
+| Llama3.2-3B    | TG     |         |         | 48.5          |
 | Llama3.1-8B    | N150   | 87        | 99        | 27.9          |
 | Llama3.1-8B    | N300   | 88        | 99        | 43.7          |
 | Llama3.1-8B    | T3K    | 91        | 100        | 64.2          |
-| Llama3.1-8B    | TG     |         |         |           |
+| Llama3.1-8B    | TG     |         |         | 41.0          |
 | Llama3.2-11B   | N300   | 89        | 99        | 43.5          |
 | Llama3.2-11B   | T3K    | 88        | 99        | 63.4          |
-| Llama3.2-11B   | TG     |         |         |           |
+| Llama3.2-11B   | TG     |         |         | 40.9          |
 | Llama3.1-70B   | T3K    | 96        | 100        | 16.1          |
 | Llama3.1-70B   | TG     |         |        |           |
 
diff --git a/models/demos/llama3/tt/llama_mlp.py b/models/demos/llama3/tt/llama_mlp.py
index eced782213c..bb5fb6026e2 100644
--- a/models/demos/llama3/tt/llama_mlp.py
+++ b/models/demos/llama3/tt/llama_mlp.py
@@ -34,7 +34,9 @@ def __init__(
 
         # TODO Clean up this code. With sharding, we load the normal weights and then shard them
         as_sharded_tensor = lambda name, type, dims: ttnn.as_tensor(
-            pad_hidden_dim(torch_weight(name[:2]), -1),  # Grab only the wX part of the name
+            pad_hidden_dim(
+                torch_weight(name[:2]), dims[0] if args.is_galaxy else -1
+            ),  # Grab only the wX part of the name
             dtype=type,
             device=self.mesh_device,
             mesh_mapper=ttnn.ShardTensor2dMesh(self.mesh_device, dims=dims, mesh_shape=args.cluster_shape),