diff --git a/tests/tt_metal/tt_metal/api/CMakeLists.txt b/tests/tt_metal/tt_metal/api/CMakeLists.txt
index 864e4b911e9..042e199476c 100644
--- a/tests/tt_metal/tt_metal/api/CMakeLists.txt
+++ b/tests/tt_metal/tt_metal/api/CMakeLists.txt
@@ -1,5 +1,6 @@
 set(UNIT_TESTS_API_SRC
     ${CMAKE_CURRENT_SOURCE_DIR}/allocator/test_free_list_allocator.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/allocator/test_free_list_opt_allocator.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/allocator/test_l1_banking_allocator.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/circular_buffer/test_CircularBuffer_allocation.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/circular_buffer/test_CircularBuffer_creation.cpp
diff --git a/tests/tt_metal/tt_metal/api/allocator/test_free_list_opt_allocator.cpp b/tests/tt_metal/tt_metal/api/allocator/test_free_list_opt_allocator.cpp
new file mode 100644
index 00000000000..6d4488afa54
--- /dev/null
+++ b/tests/tt_metal/tt_metal/api/allocator/test_free_list_opt_allocator.cpp
@@ -0,0 +1,342 @@
+// SPDX-FileCopyrightText: © 2024 Tenstorrent Inc.
+//
+// SPDX-License-Identifier: Apache-2.0
+
+#include <gtest/gtest.h>
+#include "allocator/allocator.hpp"
+#include "tt_metal/impl/allocator/algorithms/free_list_opt.hpp"
+
+// UDLs to convert integer literals to binary (IEC) size units
+constexpr size_t operator"" _KiB(unsigned long long x) { return x * 1024; }
+constexpr size_t operator"" _MiB(unsigned long long x) { return x * 1024 * 1024; }
+constexpr size_t operator"" _GiB(unsigned long long x) { return x * 1024 * 1024 * 1024; }
+
+TEST(FreeListOptTest, Allocation) {
+    auto allocator = tt::tt_metal::allocator::FreeListOpt(1_GiB, 0, 1_KiB, 1_KiB);
+    auto a = allocator.allocate(1_KiB);
+    ASSERT_TRUE(a.has_value());
+    ASSERT_EQ(a.value(), 0);
+
+    auto b = allocator.allocate(1_KiB);
+    ASSERT_TRUE(b.has_value());
+    ASSERT_EQ(b.value(), 1_KiB);
+}
+
+TEST(FreeListOptTest, Alignment) {
+    auto allocator = tt::tt_metal::allocator::FreeListOpt(1_GiB, 0, 1, 1_KiB);
+    auto a = allocator.allocate(64);
+    ASSERT_TRUE(a.has_value());
+    ASSERT_EQ(a.value(), 0);
+    auto b = allocator.allocate(64);
+    ASSERT_TRUE(b.has_value());
+    ASSERT_EQ(b.value(), 1_KiB);
+}
+
+TEST(FreeListOptTest, MinAllocationSize) {
+    auto allocator = tt::tt_metal::allocator::FreeListOpt(1_GiB, 0, 1_KiB, 1);
+    auto a = allocator.allocate(1);
+    ASSERT_TRUE(a.has_value());
+    ASSERT_EQ(a.value(), 0);
+    auto b = allocator.allocate(1);
+    ASSERT_TRUE(b.has_value());
+    ASSERT_EQ(b.value(), 1_KiB);
+}
+
+TEST(FreeListOptTest, Clear) {
+    auto allocator = tt::tt_metal::allocator::FreeListOpt(1_GiB, 0, 1_KiB, 1_KiB);
+    auto a = allocator.allocate(1_KiB);
+    auto b = allocator.allocate(1_KiB);
+    ASSERT_TRUE(a.has_value());
+    ASSERT_TRUE(b.has_value());
+    allocator.clear();
+    auto c = allocator.allocate(1_KiB);
+    ASSERT_TRUE(c.has_value());
+    ASSERT_EQ(c.value(), 0);
+}
+
+TEST(FreeListOptTest, AllocationAndDeallocation) {
+    auto allocator = tt::tt_metal::allocator::FreeListOpt(1_GiB, 0, 1_KiB, 1_KiB);
+    std::vector<std::optional<tt::tt_metal::DeviceAddr>> allocations(10);
+
+    // Allocate, then deallocate in reverse order
+    for (size_t i = 0; i < allocations.size(); i++) {
+        allocations[i] = allocator.allocate(1_KiB);
+        ASSERT_TRUE(allocations[i].has_value());
+    }
+
+    for (size_t i = allocations.size(); i > 0; i--) {
+        allocator.deallocate(allocations[i - 1].value());
+    }
+
+    // Allocate, then deallocate in order
+    for (size_t i = 0; i < allocations.size(); i++) {
+        allocations[i] = allocator.allocate(1_KiB);
+        ASSERT_TRUE(allocations[i].has_value());
+    }
+
+    for (size_t i = 0; i < allocations.size(); i++) {
+        allocator.deallocate(allocations[i].value());
+    }
+}
+
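+// allocate_at_address pins an allocation to a caller-chosen absolute address. A minimal usage
+// sketch (illustrative values, assuming the 1 GiB, 1 KiB-aligned allocator used throughout):
+//
+//   auto pinned = allocator.allocate_at_address(4_KiB, 1_KiB); // -> 4_KiB on success
+//   auto clash  = allocator.allocate_at_address(4_KiB, 1_KiB); // -> std::nullopt, range in use
+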
+TEST(FreeListOptTest, AllocateAtAddress) {
+    auto allocator = tt::tt_metal::allocator::FreeListOpt(1_GiB, 0, 1_KiB, 1_KiB);
+    auto a = allocator.allocate(1_KiB);
+    ASSERT_TRUE(a.has_value());
+    ASSERT_EQ(a.value(), 0);
+
+    auto b = allocator.allocate_at_address(1_KiB, 1_KiB);
+    ASSERT_TRUE(b.has_value());
+    ASSERT_EQ(b.value(), 1_KiB);
+
+    // Address is already allocated
+    auto c = allocator.allocate_at_address(1_KiB, 1_KiB);
+    ASSERT_FALSE(c.has_value());
+
+    auto d = allocator.allocate_at_address(2_KiB, 1_KiB);
+    ASSERT_TRUE(d.has_value());
+    ASSERT_EQ(d.value(), 2_KiB);
+
+    allocator.deallocate(a.value());
+    auto e = allocator.allocate_at_address(0, 1_KiB);
+    ASSERT_TRUE(e.has_value());
+    ASSERT_EQ(e.value(), 0);
+}
+
+TEST(FreeListOptTest, AllocateAtAddressInteractions) {
+    auto allocator = tt::tt_metal::allocator::FreeListOpt(1_GiB, 0, 1_KiB, 1_KiB);
+    auto wedge = allocator.allocate_at_address(32_KiB, 1_KiB);
+
+    auto a = allocator.allocate(1_KiB);
+    ASSERT_TRUE(a.has_value());
+    ASSERT_EQ(a.value(), 0);
+
+    auto z = allocator.allocate(1_KiB, false);
+    ASSERT_TRUE(z.has_value());
+    // Counterintuitive: best-fit picks the smaller 31 KiB gap below the wedge, and the top-down
+    // request is then placed at the end of that block
+    ASSERT_EQ(z.value(), 32_KiB - 1_KiB);
+
+    auto b = allocator.allocate(1_KiB);
+    ASSERT_TRUE(b.has_value());
+    ASSERT_EQ(b.value(), 1_KiB);
+}
+
+TEST(FreeListOptTest, ShrinkAndReset) {
+    auto allocator = tt::tt_metal::allocator::FreeListOpt(1_GiB, 0, 1_KiB, 1_KiB);
+    auto a = allocator.allocate(1_KiB);
+    auto b = allocator.allocate(1_KiB);
+    ASSERT_TRUE(a.has_value());
+    ASSERT_TRUE(b.has_value());
+    allocator.deallocate(a.value());
+
+    allocator.shrink_size(1_KiB);
+    auto c = allocator.allocate_at_address(0, 1_KiB);
+    ASSERT_FALSE(c.has_value());
+
+    auto d = allocator.allocate_at_address(1_KiB, 1_KiB);
+    ASSERT_FALSE(d.has_value());
+
+    allocator.reset_size();
+    allocator.deallocate(b.value());
+
+    auto e = allocator.allocate(2_KiB);
+    ASSERT_TRUE(e.has_value());
+}
+
+TEST(FreeListOptTest, Statistics) {
+    auto allocator = tt::tt_metal::allocator::FreeListOpt(1_GiB, 0, 1_KiB, 1_KiB);
+    auto a = allocator.allocate(1_KiB);
+    auto b = allocator.allocate(1_KiB);
+    ASSERT_TRUE(a.has_value());
+    ASSERT_TRUE(b.has_value());
+    allocator.deallocate(a.value());
+
+    auto stats = allocator.get_statistics();
+    ASSERT_EQ(stats.total_allocated_bytes, 1_KiB);
+}
+
+TEST(FreeListOptTest, AllocateFromTop) {
+    auto allocator = tt::tt_metal::allocator::FreeListOpt(1_GiB, 0, 1_KiB, 1_KiB);
+    auto a = allocator.allocate(1_KiB, false);
+    ASSERT_TRUE(a.has_value());
+    ASSERT_EQ(a.value(), 1_GiB - 1_KiB);
+
+    auto b = allocator.allocate(1_KiB, false);
+    ASSERT_TRUE(b.has_value());
+    ASSERT_EQ(b.value(), 1_GiB - 2_KiB);
+
+    auto c = allocator.allocate(1_KiB);
+    ASSERT_TRUE(c.has_value());
+    ASSERT_EQ(c.value(), 0);
+}
+
+TEST(FreeListOptTest, Coalescing) {
+    auto allocator = tt::tt_metal::allocator::FreeListOpt(1_GiB, 0, 1_KiB, 1_KiB);
+    auto a = allocator.allocate(1_KiB);
+    auto b = allocator.allocate(1_KiB);
+    auto c = allocator.allocate(1_KiB);
+    ASSERT_TRUE(a.has_value());
+    ASSERT_TRUE(b.has_value());
+    ASSERT_TRUE(c.has_value());
+    allocator.deallocate(b.value());
+    allocator.deallocate(a.value());
+
+    auto d = allocator.allocate(2_KiB);
+    ASSERT_TRUE(d.has_value());
+    ASSERT_EQ(d.value(), 0);
+}
+
+TEST(FreeListOptTest, CoalescingAfterResetShrink) {
+    auto allocator = tt::tt_metal::allocator::FreeListOpt(1_GiB, 0, 1_KiB, 1_KiB);
+    auto a = allocator.allocate(1_KiB);
+    auto b = allocator.allocate(1_KiB);
+    auto c = allocator.allocate(1_KiB);
+    ASSERT_TRUE(a.has_value());
+    ASSERT_TRUE(b.has_value());
+    ASSERT_TRUE(c.has_value());
+    allocator.deallocate(b.value());
+    allocator.deallocate(a.value());
+
+    allocator.shrink_size(1_KiB);
+    auto d = allocator.allocate(2_KiB);
+    allocator.reset_size();
+    auto e = allocator.allocate(2_KiB);
+    ASSERT_TRUE(e.has_value());
+    ASSERT_EQ(e.value(), 0);
+}
+
+TEST(FreeListOptTest, OutOfMemory) {
+    auto allocator = tt::tt_metal::allocator::FreeListOpt(1_GiB, 0, 1_KiB, 1_KiB);
+    auto a = allocator.allocate(1_GiB);
+    ASSERT_TRUE(a.has_value());
+    auto b = allocator.allocate(1_KiB);
+    ASSERT_FALSE(b.has_value());
+
+    allocator.clear();
+    auto c = allocator.allocate(1_GiB - 1_KiB);
+    ASSERT_TRUE(c.has_value());
+    auto d = allocator.allocate(2_KiB);
+    ASSERT_FALSE(d.has_value());
+}
+
+TEST(FreeListOptTest, AvailableAddresses) {
+    auto allocator = tt::tt_metal::allocator::FreeListOpt(1_GiB, 0, 1_KiB, 1_KiB);
+    auto a = allocator.allocate(1_KiB);
+    auto aval = allocator.available_addresses(1_KiB);
+    ASSERT_EQ(aval.size(), 1);
+    ASSERT_EQ(aval[0].first, 1_KiB);   // Start address
+    ASSERT_EQ(aval[0].second, 1_GiB);  // End address
+    allocator.clear();
+
+    a = allocator.allocate(1_KiB);
+    auto b = allocator.allocate(1_KiB);
+    auto c = allocator.allocate(1_KiB);
+    ASSERT_TRUE(a.has_value());
+    ASSERT_EQ(a.value(), 0);
+    ASSERT_TRUE(b.has_value());
+    ASSERT_EQ(b.value(), 1_KiB);
+    ASSERT_TRUE(c.has_value());
+    ASSERT_EQ(c.value(), 2_KiB);
+    allocator.deallocate(b.value());
+    aval = allocator.available_addresses(1_KiB);
+    ASSERT_EQ(aval.size(), 2);
+    ASSERT_EQ(aval[0].first, 1_KiB);   // Start address
+    ASSERT_EQ(aval[0].second, 2_KiB);  // End address
+    ASSERT_EQ(aval[1].first, 3_KiB);   // Start address
+    ASSERT_EQ(aval[1].second, 1_GiB);  // End address
+
+    allocator.clear();
+    a = allocator.allocate(1_KiB);
+    b = allocator.allocate(1_KiB);
+    c = allocator.allocate(1_KiB);
+    ASSERT_TRUE(a.has_value());
+    ASSERT_EQ(a.value(), 0);
+    ASSERT_TRUE(b.has_value());
+    ASSERT_EQ(b.value(), 1_KiB);
+    ASSERT_TRUE(c.has_value());
+    ASSERT_EQ(c.value(), 2_KiB);
+    allocator.deallocate(b.value());
+    aval = allocator.available_addresses(10_KiB);
+    ASSERT_EQ(aval.size(), 1);
+    ASSERT_EQ(aval[0].first, 3_KiB);   // Start address
+    ASSERT_EQ(aval[0].second, 1_GiB);  // End address
+}
+
+TEST(FreeListOptTest, LowestOccupiedAddress) {
+    auto allocator = tt::tt_metal::allocator::FreeListOpt(1_GiB, 0, 1_KiB, 1_KiB);
+    auto a = allocator.allocate(1_KiB);
+    auto b = allocator.allocate(1_KiB);
+    auto c = allocator.allocate(1_KiB);
+    ASSERT_TRUE(a.has_value());
+    ASSERT_EQ(a.value(), 0);
+    ASSERT_TRUE(b.has_value());
+    ASSERT_EQ(b.value(), 1_KiB);
+    ASSERT_TRUE(c.has_value());
+    ASSERT_EQ(c.value(), 2_KiB);
+    auto loa = allocator.lowest_occupied_address();
+    ASSERT_EQ(loa.value(), 0);
+    allocator.deallocate(a.value());
+    loa = allocator.lowest_occupied_address();
+    ASSERT_EQ(loa.value(), 1_KiB);
+    allocator.deallocate(b.value());
+    loa = allocator.lowest_occupied_address();
+    ASSERT_EQ(loa.value(), 2_KiB);
+    allocator.deallocate(c.value());
+    loa = allocator.lowest_occupied_address();
+    ASSERT_FALSE(loa.has_value());
+}
+
+TEST(FreeListOptTest, LowestOccupiedAddressWithAllocateAt) {
+    auto allocator = tt::tt_metal::allocator::FreeListOpt(1_GiB, 0, 1_KiB, 1_KiB);
+    auto a = allocator.allocate_at_address(1_KiB, 1_KiB);
+    ASSERT_TRUE(a.has_value());
+    ASSERT_EQ(a.value(), 1_KiB);
+    auto loa = allocator.lowest_occupied_address();
+    ASSERT_EQ(loa.value(), 1_KiB);
+    allocator.deallocate(a.value());
+    loa = allocator.lowest_occupied_address();
+    ASSERT_FALSE(loa.has_value());
+}
+
+TEST(FreeListOptTest, FirstFit) {
+    auto allocator = tt::tt_metal::allocator::FreeListOpt(
+        1_GiB, 0, 1_KiB, 1_KiB, tt::tt_metal::allocator::FreeListOpt::SearchPolicy::FIRST);
+    auto a = allocator.allocate(1_KiB);
+    auto b = allocator.allocate(3_KiB);
+    auto c = allocator.allocate(1_KiB);
+    auto d = allocator.allocate(1_KiB);
+    auto e = allocator.allocate(1_KiB);
+
+    ASSERT_TRUE(a.has_value());
+    ASSERT_EQ(a.value(), 0);
+    ASSERT_TRUE(b.has_value());
+    ASSERT_EQ(b.value(), 1_KiB);
+    ASSERT_TRUE(c.has_value());
+    ASSERT_EQ(c.value(), 4_KiB);
+    ASSERT_TRUE(d.has_value());
+    ASSERT_EQ(d.value(), 5_KiB);
+    ASSERT_TRUE(e.has_value());
+    ASSERT_EQ(e.value(), 6_KiB);
+
+    allocator.deallocate(b.value());
+    allocator.deallocate(d.value());
+
+    auto f = allocator.allocate(1_KiB);
+    ASSERT_TRUE(f.has_value());
+    ASSERT_EQ(f.value(), 1_KiB);
+}
+
+TEST(FreeListOptTest, FirstFitAllocateAtAddressInteractions) {
+    auto allocator = tt::tt_metal::allocator::FreeListOpt(
+        1_GiB, 0, 1_KiB, 1_KiB, tt::tt_metal::allocator::FreeListOpt::SearchPolicy::FIRST);
+    auto wedge = allocator.allocate_at_address(32_KiB, 1_KiB);
+
+    auto a = allocator.allocate(1_KiB);
+    ASSERT_TRUE(a.has_value());
+    ASSERT_EQ(a.value(), 0);
+
+    auto z = allocator.allocate(1_KiB, false);
+    ASSERT_TRUE(z.has_value());
+    ASSERT_EQ(z.value(), 1_GiB - 1_KiB);
+
+    auto b = allocator.allocate(1_KiB);
+    ASSERT_TRUE(b.has_value());
+    ASSERT_EQ(b.value(), 1_KiB);
+}
diff --git a/tt_metal/impl/CMakeLists.txt b/tt_metal/impl/CMakeLists.txt
index cad106642c9..9bd981b89d3 100644
--- a/tt_metal/impl/CMakeLists.txt
+++ b/tt_metal/impl/CMakeLists.txt
@@ -12,6 +12,7 @@ set(IMPL_SRC
     ${CMAKE_CURRENT_SOURCE_DIR}/buffers/semaphore.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/kernels/kernel.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/allocator/algorithms/free_list.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/allocator/algorithms/free_list_opt.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/allocator/allocator.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/allocator/basic_allocator.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/allocator/l1_banking_allocator.cpp
diff --git a/tt_metal/impl/allocator/algorithms/allocator_algorithm.hpp b/tt_metal/impl/allocator/algorithms/allocator_algorithm.hpp
index aa0bb516101..6eec0150db8 100644
--- a/tt_metal/impl/allocator/algorithms/allocator_algorithm.hpp
+++ b/tt_metal/impl/allocator/algorithms/allocator_algorithm.hpp
@@ -63,7 +63,7 @@ class Algorithm {
 
     virtual Statistics get_statistics() const = 0;
 
-    virtual void dump_blocks(std::ofstream& out) const = 0;
+    virtual void dump_blocks(std::ostream& out) const = 0;
 
     virtual void shrink_size(DeviceAddr shrink_size, bool bottom_up = true) = 0;
diff --git a/tt_metal/impl/allocator/algorithms/free_list.cpp b/tt_metal/impl/allocator/algorithms/free_list.cpp
index c320b205b90..adc75c102e0 100644
--- a/tt_metal/impl/allocator/algorithms/free_list.cpp
+++ b/tt_metal/impl/allocator/algorithms/free_list.cpp
@@ -431,12 +431,12 @@ Statistics FreeList::get_statistics() const {
     return stats;
 }
 
-void FreeList::dump_block(const boost::local_shared_ptr<FreeList::Block>& block, std::ofstream& out) const {
+void FreeList::dump_block(const boost::local_shared_ptr<FreeList::Block>& block, std::ostream& out) const {
     auto alloc_status = this->is_allocated(block) ? "Y" : "N";
"Y" : "N"; out << ",,," << (block->address + this->offset_bytes_) << "," << (block->size) << "," << alloc_status << "\n"; } -void FreeList::dump_blocks(std::ofstream& out) const { +void FreeList::dump_blocks(std::ostream& out) const { out << ",,Blocks:,Address (B),Size (B),Allocated (Y/N)\n"; boost::local_shared_ptr curr_block = this->block_head_; while (curr_block != nullptr) { diff --git a/tt_metal/impl/allocator/algorithms/free_list.hpp b/tt_metal/impl/allocator/algorithms/free_list.hpp index 6337e51f2ac..467fc8c3ece 100644 --- a/tt_metal/impl/allocator/algorithms/free_list.hpp +++ b/tt_metal/impl/allocator/algorithms/free_list.hpp @@ -36,7 +36,7 @@ class FreeList : public Algorithm { Statistics get_statistics() const; - void dump_blocks(std::ofstream& out) const; + void dump_blocks(std::ostream& out) const; void shrink_size(DeviceAddr shrink_size, bool bottom_up = true); @@ -66,7 +66,7 @@ class FreeList : public Algorithm { boost::local_shared_ptr next_free = nullptr; }; - void dump_block(const boost::local_shared_ptr& block, std::ofstream& out) const; + void dump_block(const boost::local_shared_ptr& block, std::ostream& out) const; bool is_allocated(const boost::local_shared_ptr& block) const; diff --git a/tt_metal/impl/allocator/algorithms/free_list_opt.cpp b/tt_metal/impl/allocator/algorithms/free_list_opt.cpp new file mode 100644 index 00000000000..bd24c0c871b --- /dev/null +++ b/tt_metal/impl/allocator/algorithms/free_list_opt.cpp @@ -0,0 +1,630 @@ +// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. +// +// SPDX-License-Identifier: Apache-2.0 + +#include "tt_metal/impl/allocator/algorithms/free_list_opt.hpp" +#include "common/assert.hpp" +#include "llrt/hal.hpp" +#include "tt_metal/impl/allocator/algorithms/allocator_algorithm.hpp" +#include +#include +#include +#include +#include +#include +#include + +inline size_t intlg2(size_t n) { + // std::log2() is slow + size_t count = 0; + while (n > 0) { + count++; + n >>= 1; + } + return count; +} + +inline size_t num_segerated_classes(size_t max_size_bytes, size_t size_segregated_base) { + size_t n = max_size_bytes / size_segregated_base; + ssize_t count = intlg2(n); + // 128MB as the last seggregated class size should be enough + // avoid having too many classes as iterating them is not free + ssize_t max_count = intlg2(128 * 1024 * 1024 / size_segregated_base); + return std::clamp(count, ssize_t{2}, max_count); +} + +namespace tt { + +namespace tt_metal { + +namespace allocator { + +FreeListOpt::FreeListOpt( + DeviceAddr max_size_bytes, + DeviceAddr offset_bytes, + DeviceAddr min_allocation_size, + DeviceAddr alignment, + SearchPolicy policy) : + policy_(policy), + size_segregated_count((num_segerated_classes(max_size_bytes, size_segregated_base))), + Algorithm(max_size_bytes, offset_bytes, min_allocation_size, alignment) { + // Reduce reallocations by reserving memory for free list components + constexpr size_t initial_block_count = 64; + block_address_.reserve(initial_block_count); + block_size_.reserve(initial_block_count); + block_prev_block_.reserve(initial_block_count); + block_next_block_.reserve(initial_block_count); + block_is_allocated_.reserve(initial_block_count); + free_meta_block_indices_.reserve(initial_block_count); + meta_block_is_allocated_.reserve(initial_block_count); + free_blocks_segregated_by_size_.resize(size_segregated_count); + for (auto& free_blocks : free_blocks_segregated_by_size_) { + free_blocks.reserve(initial_block_count); + } + allocated_block_table_.resize(n_alloc_table_buckets); + for (auto& bucket 
+
+namespace tt {
+
+namespace tt_metal {
+
+namespace allocator {
+
+FreeListOpt::FreeListOpt(
+    DeviceAddr max_size_bytes,
+    DeviceAddr offset_bytes,
+    DeviceAddr min_allocation_size,
+    DeviceAddr alignment,
+    SearchPolicy policy) :
+    Algorithm(max_size_bytes, offset_bytes, min_allocation_size, alignment),
+    size_segregated_count(num_segregated_classes(max_size_bytes, size_segregated_base)),
+    policy_(policy) {
+    // Reduce reallocations by reserving memory for free list components
+    constexpr size_t initial_block_count = 64;
+    block_address_.reserve(initial_block_count);
+    block_size_.reserve(initial_block_count);
+    block_prev_block_.reserve(initial_block_count);
+    block_next_block_.reserve(initial_block_count);
+    block_is_allocated_.reserve(initial_block_count);
+    free_meta_block_indices_.reserve(initial_block_count);
+    meta_block_is_allocated_.reserve(initial_block_count);
+    free_blocks_segregated_by_size_.resize(size_segregated_count);
+    for (auto& free_blocks : free_blocks_segregated_by_size_) {
+        free_blocks.reserve(initial_block_count);
+    }
+    allocated_block_table_.resize(n_alloc_table_buckets);
+    for (auto& bucket : allocated_block_table_) {
+        bucket.reserve(n_alloc_table_init_bucket_size);
+    }
+
+    init();
+}
+
+void FreeListOpt::init() {
+    max_size_bytes_ += shrink_size_;
+    shrink_size_ = 0;
+
+    block_address_.clear();
+    block_size_.clear();
+    block_prev_block_.clear();
+    block_next_block_.clear();
+    block_is_allocated_.clear();
+    free_meta_block_indices_.clear();
+    meta_block_is_allocated_.clear();
+    for (auto& bucket : allocated_block_table_) {
+        bucket.clear();
+    }
+    for (auto& free_blocks : free_blocks_segregated_by_size_) {
+        free_blocks.clear();
+    }
+
+    // Create a single block that spans the entire memory
+    block_address_.push_back(0);
+    block_size_.push_back(max_size_bytes_);
+    block_prev_block_.push_back(-1);
+    block_next_block_.push_back(-1);
+    block_is_allocated_.push_back(false);
+    meta_block_is_allocated_.push_back(true);
+    free_blocks_segregated_by_size_[get_size_segregated_index(max_size_bytes_)].push_back(0);
+}
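+
+// Search sketch (illustrative sizes): a 3000 B request is aligned up to 3 KiB and mapped to size
+// class lg2(3072 / 1024) = 1, so the scan starts at the 2-4 KiB class and never visits classes
+// that can only hold smaller blocks, which is the main win over a flat free-list walk.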
+
+std::optional<DeviceAddr> FreeListOpt::allocate(DeviceAddr size_bytes, bool bottom_up, DeviceAddr address_limit) {
+    DeviceAddr alloc_size = align(std::max(size_bytes, min_allocation_size_));
+
+    // Find the best free block by looking at the segregated free lists: if we find a block in its own
+    // size class we can be confident it is the best one to allocate from; otherwise, look at the next
+    // size class up. However, blocks within a size class are not sorted by size, so we may not always
+    // find the best block.
+
+    ssize_t target_block_index = -1;
+    size_t size_segregated_index = get_size_segregated_index(alloc_size);
+    TT_ASSERT(size_segregated_index < size_segregated_count, "Size segregated index out of bounds");
+    std::vector<size_t>* segregated_list = nullptr;
+    size_t segregated_item_index = 0;
+    DeviceAddr best_address = bottom_up ? ~(DeviceAddr)0 : 0;
+
+    for (size_t i = size_segregated_index; i < free_blocks_segregated_by_size_.size(); i++) {
+        auto& free_blocks = free_blocks_segregated_by_size_[i];
+        ssize_t increment = bottom_up ? 1 : -1;
+        for (ssize_t j = bottom_up ? 0 : free_blocks.size() - 1; j >= 0 && j < free_blocks.size(); j += increment) {
+            size_t block_index = free_blocks[j];
+            if (policy_ == SearchPolicy::BEST) {
+                if (block_size_[block_index] == alloc_size) {
+                    target_block_index = block_index;
+                    segregated_list = &free_blocks;
+                    segregated_item_index = j;
+                    break;
+                } else if (
+                    block_size_[block_index] >= alloc_size &&
+                    (target_block_index == -1 || block_size_[block_index] < block_size_[target_block_index])) {
+                    target_block_index = block_index;
+                    segregated_list = &free_blocks;
+                    segregated_item_index = j;
+                }
+                if (target_block_index != -1) {
+                    break;
+                }
+            } else {
+                if (block_size_[block_index] < alloc_size) {
+                    continue;
+                }
+
+                bool address_better = bottom_up ? block_address_[block_index] < best_address
+                                                : block_address_[block_index] > best_address;
+                if (target_block_index == -1 || address_better) {
+                    target_block_index = block_index;
+                    segregated_list = &free_blocks;
+                    segregated_item_index = j;
+                    best_address = block_address_[block_index];
+                    break;
+                }
+            }
+        }
+    }
+
+    if (target_block_index == -1) {
+        return std::nullopt;
+    }
+    TT_ASSERT(segregated_list != nullptr, "Segregated list is null");
+    TT_ASSERT(segregated_item_index < segregated_list->size(), "Segregated item index out of bounds");
+    TT_ASSERT(
+        block_is_allocated_[target_block_index] == false, "Block we are trying to allocate from is already allocated");
+    segregated_list->erase(segregated_list->begin() + segregated_item_index);
+
+    // Allocate the block
+    size_t offset = 0;
+    if (!bottom_up) {
+        offset = block_size_[target_block_index] - alloc_size;
+    }
+    size_t allocated_block_index = allocate_in_block(target_block_index, alloc_size, offset);
+    DeviceAddr start_address = block_address_[allocated_block_index];
+    if (start_address + offset_bytes_ < address_limit) {
+        TT_THROW(
+            "Out of Memory: Cannot allocate at an address below {}. Allocation at {}",
+            address_limit,
+            start_address + offset_bytes_);
+    }
+    update_lowest_occupied_address(start_address);
+    return start_address + offset_bytes_;
+}
+
+std::optional<DeviceAddr> FreeListOpt::allocate_at_address(DeviceAddr absolute_start_address, DeviceAddr size_bytes) {
+    // Nothing we can do but scan the block list
+    size_t alloc_size = align(std::max(size_bytes, min_allocation_size_));
+    ssize_t target_block_index = -1;
+    DeviceAddr start_address = absolute_start_address - offset_bytes_;
+    for (size_t i = 0; i < block_address_.size(); i++) {
+        size_t block_start = block_address_[i];
+        size_t block_end = block_start + block_size_[i];
+        if (start_address >= block_start && start_address + alloc_size <= block_end) {
+            target_block_index = i;
+            break;
+        }
+    }
+
+    if (target_block_index == -1 || block_is_allocated_[target_block_index]) {
+        return std::nullopt;
+    }
+
+    // Find the relevant size segregated list
+    size_t size_segregated_index = get_size_segregated_index(block_size_[target_block_index]);
+    std::vector<size_t>& segregated_list = free_blocks_segregated_by_size_[size_segregated_index];
+    auto it = std::find(segregated_list.begin(), segregated_list.end(), target_block_index);
+    TT_ASSERT(it != segregated_list.end(), "Block not found in size segregated list");
+    segregated_list.erase(it);
+
+    size_t offset = start_address - block_address_[target_block_index];
+    allocate_in_block(target_block_index, alloc_size, offset);
+    update_lowest_occupied_address(start_address);
+    return absolute_start_address;
+}
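+
+// Block-splitting sketch for allocate_in_block below (hypothetical 4 KiB free block, 1 KiB carved
+// out at offset 1 KiB):
+//
+//   before: [............... 4 KiB free ...............]
+//   after:  [1 KiB free][1 KiB allocated][2 KiB free]
+//
+// The original metadata slot becomes the allocated middle chunk; the left and right remainders
+// get fresh slots via alloc_meta_block and are returned to the segregated free lists.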
+
+size_t FreeListOpt::allocate_in_block(size_t block_index, DeviceAddr alloc_size, size_t offset) {
+    if (block_size_[block_index] == alloc_size && offset == 0) {
+        block_is_allocated_[block_index] = true;
+        insert_block_to_alloc_table(block_address_[block_index], block_index);
+        return block_index;
+    }
+
+    bool left_aligned = offset == 0;
+    bool right_aligned = offset + alloc_size == block_size_[block_index];
+
+    // Create free blocks for the unused remainders if not left/right aligned
+    if (!left_aligned) {
+        size_t free_block_size = offset;
+        DeviceAddr free_block_address = block_address_[block_index];
+        ssize_t prev_block = block_prev_block_[block_index];
+        block_size_[block_index] -= offset;
+        block_address_[block_index] += offset;
+        size_t new_block_index = alloc_meta_block(free_block_address, free_block_size, prev_block, block_index, false);
+        if (prev_block != -1) {
+            block_next_block_[prev_block] = new_block_index;
+        }
+        block_prev_block_[block_index] = new_block_index;
+
+        insert_block_to_segregated_list(new_block_index);
+    }
+
+    if (!right_aligned) {
+        size_t free_block_size = block_size_[block_index] - alloc_size;
+        DeviceAddr free_block_address = block_address_[block_index] + alloc_size;
+        ssize_t prev_block = block_index;
+        ssize_t next_block = block_next_block_[block_index];
+        block_size_[block_index] -= free_block_size;
+        size_t new_block_index = alloc_meta_block(free_block_address, free_block_size, prev_block, next_block, false);
+        if (next_block != -1) {
+            block_prev_block_[next_block] = new_block_index;
+        }
+        block_next_block_[block_index] = new_block_index;
+
+        insert_block_to_segregated_list(new_block_index);
+    }
+    block_is_allocated_[block_index] = true;
+    insert_block_to_alloc_table(block_address_[block_index], block_index);
+
+    return block_index;
+}
+
+void FreeListOpt::deallocate(DeviceAddr absolute_address) {
+    // The existing FreeList implementation does not check whether the address is actually
+    // allocated; it just returns if it is not. Do we want to keep this behavior?
+    DeviceAddr addr = absolute_address - offset_bytes_;
+    auto block_index_opt = get_and_remove_from_alloc_table(addr);
+    if (!block_index_opt.has_value()) {
+        return;
+    }
+    size_t block_index = *block_index_opt;
+    block_is_allocated_[block_index] = false;
+    ssize_t prev_block = block_prev_block_[block_index];
+    ssize_t next_block = block_next_block_[block_index];
+
+    // Merge with the previous block if it is free
+    if (prev_block != -1 && !block_is_allocated_[prev_block]) {
+        // Remove the previous block from its size segregated list
+        size_t size_segregated_index = get_size_segregated_index(block_size_[prev_block]);
+        std::vector<size_t>& segregated_list = free_blocks_segregated_by_size_[size_segregated_index];
+        auto it = std::find(segregated_list.begin(), segregated_list.end(), prev_block);
+        TT_ASSERT(
+            it != segregated_list.end(),
+            "Prev block {} not found in size segregated list during deallocation of block {}",
+            prev_block,
+            block_index);
+        segregated_list.erase(it);
+
+        block_size_[prev_block] += block_size_[block_index];
+        block_next_block_[prev_block] = next_block;
+        if (next_block != -1) {
+            block_prev_block_[next_block] = prev_block;
+        }
+        free_meta_block(block_index);
+        block_index = prev_block;
+    }
+
+    // Merge with the next block if it is free
+    if (next_block != -1 && !block_is_allocated_[next_block]) {
+        // Remove the next block from its size segregated list
+        size_t size_segregated_index = get_size_segregated_index(block_size_[next_block]);
+        std::vector<size_t>& segregated_list = free_blocks_segregated_by_size_[size_segregated_index];
+        auto it = std::find(segregated_list.begin(), segregated_list.end(), next_block);
+        TT_ASSERT(
+            it != segregated_list.end(),
+            "Next block {} not found in size segregated list during deallocation of block {}",
+            next_block,
+            block_index);
+        segregated_list.erase(it);
+
+        block_size_[block_index] += block_size_[next_block];
+        block_next_block_[block_index] = block_next_block_[next_block];
+        if (block_next_block_[next_block] != -1) {
+            block_prev_block_[block_next_block_[next_block]] = block_index;
+        }
+        free_meta_block(next_block);
+    }
+
+    TT_ASSERT(lowest_occupied_address_.has_value(), "Lowest occupied address should have a value");
+    if (addr <= *lowest_occupied_address_) {
+        lowest_occupied_address_ = std::nullopt;
+        ssize_t curr_block_index = block_next_block_[block_index];
+        while (curr_block_index != -1) {
+            if (block_is_allocated_[curr_block_index]) {
+                lowest_occupied_address_ = block_address_[curr_block_index];
+                break;
+            }
+            curr_block_index = block_next_block_[curr_block_index];
+        }
+    }
+    // Update the segregated list
+    insert_block_to_segregated_list(block_index);
+}
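+
+// Coalescing sketch: freeing B in [A][B][C] when A and C are already free collapses all three
+// into a single block. A's metadata slot survives as the merged block, while B's and C's slots
+// are recycled through free_meta_block for reuse by later alloc_meta_block calls.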
+
+std::vector<std::pair<DeviceAddr, DeviceAddr>> FreeListOpt::available_addresses(DeviceAddr size_bytes) const {
+    size_t alloc_size = align(std::max(size_bytes, min_allocation_size_));
+    size_t size_segregated_index = get_size_segregated_index(alloc_size);
+    std::vector<std::pair<DeviceAddr, DeviceAddr>> addresses;
+
+    for (size_t i = size_segregated_index; i < size_segregated_count; i++) {
+        for (size_t j = 0; j < free_blocks_segregated_by_size_[i].size(); j++) {
+            size_t block_index = free_blocks_segregated_by_size_[i][j];
+            if (block_size_[block_index] >= alloc_size) {
+                addresses.push_back(
+                    {block_address_[block_index], block_address_[block_index] + block_size_[block_index]});
+            }
+        }
+    }
+    return addresses;
+}
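+
+// Metadata slots live in the parallel SoA member vectors; alloc_meta_block below prefers to
+// recycle an index from free_meta_block_indices_ and only grows the vectors when no freed slot is
+// available, so steady-state allocate/deallocate churn needs no extra bookkeeping allocation.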
+
+size_t FreeListOpt::alloc_meta_block(
+    DeviceAddr address, DeviceAddr size, ssize_t prev_block, ssize_t next_block, bool is_allocated) {
+    size_t idx;
+    if (free_meta_block_indices_.empty()) {
+        idx = block_address_.size();
+        block_address_.push_back(address);
+        block_size_.push_back(size);
+        block_prev_block_.push_back(prev_block);
+        block_next_block_.push_back(next_block);
+        block_is_allocated_.push_back(is_allocated);
+        meta_block_is_allocated_.push_back(true);
+    } else {
+        idx = free_meta_block_indices_.back();
+        free_meta_block_indices_.pop_back();
+        block_address_[idx] = address;
+        block_size_[idx] = size;
+        block_prev_block_[idx] = prev_block;
+        block_next_block_[idx] = next_block;
+        block_is_allocated_[idx] = is_allocated;
+        meta_block_is_allocated_[idx] = true;
+    }
+    return idx;
+}
+
+void FreeListOpt::free_meta_block(size_t block_index) {
+    free_meta_block_indices_.push_back(block_index);
+    meta_block_is_allocated_[block_index] = false;
+}
+
+void FreeListOpt::clear() { init(); }
+
+Statistics FreeListOpt::get_statistics() const {
+    // TODO: Cache the statistics
+    size_t total_allocated_bytes = 0;
+    size_t total_free_bytes = 0;
+    size_t largest_free_block_bytes = 0;
+    std::vector<DeviceAddr> largest_free_block_addrs;
+
+    for (size_t i = 0; i < block_address_.size(); i++) {
+        // Skip recycled metadata slots; their size/allocated fields are stale
+        if (!meta_block_is_allocated_[i]) {
+            continue;
+        }
+        if (block_is_allocated_[i]) {
+            total_allocated_bytes += block_size_[i];
+        } else {
+            total_free_bytes += block_size_[i];
+            if (block_size_[i] >= largest_free_block_bytes) {
+                largest_free_block_bytes = block_size_[i];
+                // XXX: This is going to overflow
+                largest_free_block_addrs.push_back(block_address_[i] + offset_bytes_);
+            }
+        }
+    }
+
+    if (total_allocated_bytes == 0) {
+        total_free_bytes = max_size_bytes_;
+        largest_free_block_bytes = max_size_bytes_;
+    }
+
+    return Statistics{
+        .total_allocatable_size_bytes = max_size_bytes_,
+        .total_allocated_bytes = total_allocated_bytes,
+        .total_free_bytes = total_free_bytes,
+        .largest_free_block_bytes = largest_free_block_bytes,
+        // Why do we need largest_free_block_addrs? Without it the entire loop can be removed
+        // and the statistics can be tracked during allocation and deallocation
+        .largest_free_block_addrs = std::move(largest_free_block_addrs),
+    };
+}
+
+void FreeListOpt::dump_blocks(std::ostream& out) const {
+    out << "FreeListOpt allocator info:" << std::endl;
+    out << "Segregated free blocks by size:" << std::endl;
+    for (size_t i = 0; i < free_blocks_segregated_by_size_.size(); i++) {
+        if (i != free_blocks_segregated_by_size_.size() - 1) {
+            out << "  Size class " << i << ": (" << size_t(size_segregated_base * (size_t{1} << i)) << " - "
+                << size_t(size_segregated_base * (size_t{1} << (i + 1))) << ") blocks: ";
+        } else {
+            out << "  Size class " << i << ": (" << size_t(size_segregated_base * (size_t{1} << i))
+                << " - inf) blocks: ";
+        }
+        for (size_t j = 0; j < free_blocks_segregated_by_size_[i].size(); j++) {
+            out << free_blocks_segregated_by_size_[i][j] << " ";
+        }
+
+        out << std::endl;
+    }
+
+    out << "Free slots in block table: ";
+    for (size_t i = 0; i < free_meta_block_indices_.size(); i++) {
+        out << free_meta_block_indices_[i] << " ";
+    }
+    out << std::endl;
+
+    out << "Block table:" << std::endl;
+    auto leftpad = [](std::string str, size_t width) {
+        if (str.size() >= width) {
+            return str;
+        }
+        return std::string(width - str.size(), ' ') + str;
+    };
+    auto leftpad_num = [leftpad](auto num, size_t width) {
+        // HACK: -1 for us means none
+        if (num == -1) {
+            return leftpad("none", width);
+        }
+        return leftpad(std::to_string(num), width);
+    };
+    const size_t pad = 12;
+    std::array<const char*, 6> headers = {"Block", "Address", "Size", "PrevID", "NextID", "Allocated"};
+    for (auto& header : headers) {
+        out << leftpad(header, pad) << " ";
+    }
+    out << std::endl;
+    for (size_t i = 0; i < block_address_.size(); i++) {
+        if (!meta_block_is_allocated_[i]) {
+            continue;
+        }
+        out << leftpad_num(i, pad) << " " << leftpad_num(block_address_[i], pad) << " "
+            << leftpad_num(block_size_[i], pad) << " " << leftpad_num(block_prev_block_[i], pad) << " "
+            << leftpad_num(block_next_block_[i], pad) << " " << leftpad(block_is_allocated_[i] ? "yes" : "no", pad)
+            << std::endl;
+    }
+}
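+
+// Shrink/reset sketch (mirrors the ShrinkAndReset unit test): shrink_size(1_KiB) trims the bottom
+// 1 KiB out of the lowest free block, so addresses below 1 KiB become unallocatable until
+// reset_size() grows the region back and hands the reclaimed bottom chunk to the free lists,
+// where it can coalesce with a free neighbour.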
+
+void FreeListOpt::shrink_size(DeviceAddr shrink_size, bool bottom_up) {
+    if (shrink_size == 0) {
+        return;
+    }
+    TT_FATAL(bottom_up, "Shrinking from the top is currently not supported");
+    TT_FATAL(
+        shrink_size <= this->max_size_bytes_,
+        "Shrink size {} must not exceed max size {}",
+        shrink_size,
+        max_size_bytes_);
+
+    // Scan the block list to check whether the shrink cuts into any allocated block
+    size_t block_to_shrink = -1;
+    DeviceAddr shrunk_address = shrink_size_ + shrink_size;
+    // TODO: There must be a way to force the beginning of all blocks to be at index 0
+    for (size_t i = 0; i < block_address_.size(); i++) {
+        if (!meta_block_is_allocated_[i]) {
+            continue;
+        } else if (block_is_allocated_[i]) {
+            TT_FATAL(
+                block_address_[i] >= shrunk_address,
+                "Shrink address {} cuts into allocated block at address {}",
+                shrunk_address,
+                block_address_[i]);
+        } else if (block_address_[i] <= shrunk_address && block_address_[i] + block_size_[i] >= shrunk_address) {
+            block_to_shrink = i;
+            break;
+        }
+    }
+
+    TT_FATAL(block_to_shrink != -1, "Shrink address {} does not align with any block. This must be a bug", shrunk_address);
+
+    // Find the relevant size segregated list
+    size_t size_segregated_index = get_size_segregated_index(block_size_[block_to_shrink]);
+    std::vector<size_t>& segregated_list = free_blocks_segregated_by_size_[size_segregated_index];
+    for (size_t i = 0; i < segregated_list.size(); i++) {
+        if (segregated_list[i] == block_to_shrink) {
+            segregated_list.erase(segregated_list.begin() + i);
+            break;
+        }
+    }
+
+    // Shrink the block
+    block_size_[block_to_shrink] -= shrink_size;
+    max_size_bytes_ -= shrink_size;
+    shrink_size_ += shrink_size;
+    if (block_size_[block_to_shrink] == 0) {
+        if (block_next_block_[block_to_shrink] != -1) {
+            block_prev_block_[block_next_block_[block_to_shrink]] = block_prev_block_[block_to_shrink];
+        }
+        free_meta_block(block_to_shrink);
+    } else {
+        block_address_[block_to_shrink] += shrink_size;
+        insert_block_to_segregated_list(block_to_shrink);
+    }
+}
+
+void FreeListOpt::reset_size() {
+    if (shrink_size_ == 0) {
+        return;
+    }
+
+    // Find the block that currently starts at the shrunk boundary
+    ssize_t lowest_block_index = -1;
+    for (size_t i = 0; i < block_address_.size(); i++) {
+        if (!meta_block_is_allocated_[i]) {
+            continue;
+        }
+        if (block_address_[i] == shrink_size_) {
+            lowest_block_index = i;
+            break;
+        }
+    }
+    TT_ASSERT(lowest_block_index != -1, "Lowest block not found during reset size");
+
+    // There are two cases to consider:
+    // 1. The lowest block is free, which means we can just modify its attributes
+    // 2. The lowest block is allocated, which means we need to create a new free block below it
+    if (!block_is_allocated_[lowest_block_index]) {
+        auto* segregated_list =
+            &free_blocks_segregated_by_size_[get_size_segregated_index(block_size_[lowest_block_index])];
+        for (size_t i = 0; i < segregated_list->size(); i++) {
+            if ((*segregated_list)[i] == lowest_block_index) {
+                segregated_list->erase(segregated_list->begin() + i);
+                break;
+            }
+        }
+        block_size_[lowest_block_index] += shrink_size_;
+        block_address_[lowest_block_index] = 0;
+        insert_block_to_segregated_list(lowest_block_index);
+    } else {
+        size_t new_block_index = alloc_meta_block(0, shrink_size_, -1, lowest_block_index, false);
+        TT_ASSERT(block_prev_block_[lowest_block_index] == -1, "Lowest block should not have a previous block");
+        block_prev_block_[lowest_block_index] = new_block_index;
+        insert_block_to_segregated_list(new_block_index);
+    }
+
+    max_size_bytes_ += shrink_size_;
+    shrink_size_ = 0;
+}
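+
+// Ordering sketch: each size-class list is kept sorted by block address, so freeing blocks at
+// (hypothetical) addresses 8 KiB, 2 KiB, then 5 KiB leaves the class list as [2 KiB, 5 KiB,
+// 8 KiB], and a bottom-up search can stop at the first fitting entry it sees.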
+
+void FreeListOpt::insert_block_to_segregated_list(size_t block_index) {
+    const size_t size_segregated_index = get_size_segregated_index(block_size_[block_index]);
+    auto& free_blocks = free_blocks_segregated_by_size_[size_segregated_index];
+    // Pushing to the back would be faster than sorted insertion, but it increases fragmentation:
+    // free_blocks.push_back(block_index);
+    // In benchmarks the sorting overhead was not clearly worth it; real-world data is needed to
+    // confirm, but it certainly helps with fragmentation.
+    std::vector<size_t>::iterator it;
+    // From experience, lower_bound is only faster beyond a certain number of elements
+    if (free_blocks.size() < 30) {
+        for (it = free_blocks.begin(); it != free_blocks.end(); it++) {
+            if (block_address_[*it] > block_address_[block_index]) {
+                break;
+            }
+        }
+    } else {
+        it = std::lower_bound(free_blocks.begin(), free_blocks.end(), block_index, [this](size_t a, size_t b) {
+            return block_address_[a] < block_address_[b];
+        });
+    }
+    free_blocks.insert(it, block_index);
+}
+
+inline size_t FreeListOpt::hash_device_address(DeviceAddr address) {
+    // HACK: This hash is critical for performance; it was empirically found to work well for
+    // this specific use case
+    return ((address) ^ (address >> 12) * 3) % n_alloc_table_buckets;
+}
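+
+// Hash sketch: for address 0x4000 (16 KiB), (0x4000 ^ ((0x4000 >> 12) * 3)) % 512
+// = (0x4000 ^ 0xC) % 512 = 0x400C % 512 = 12, so the entry lands in bucket 12.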
+
+void FreeListOpt::insert_block_to_alloc_table(DeviceAddr address, size_t block_index) {
+    size_t bucket = hash_device_address(address);
+    allocated_block_table_[bucket].emplace_back(address, block_index);
+}
+
+bool FreeListOpt::is_address_in_alloc_table(DeviceAddr address) const {
+    size_t bucket = hash_device_address(address);
+    for (const auto& [addr, block_index] : allocated_block_table_[bucket]) {
+        if (addr == address) {
+            return true;
+        }
+    }
+    return false;
+}
+
+std::optional<size_t> FreeListOpt::get_and_remove_from_alloc_table(DeviceAddr address) {
+    size_t bucket = hash_device_address(address);
+    // It's common to deallocate the last allocated block, so search from the back
+    for (ssize_t i = allocated_block_table_[bucket].size() - 1; i >= 0; i--) {
+        if (allocated_block_table_[bucket][i].first == address) {
+            auto res = allocated_block_table_[bucket][i].second;
+            allocated_block_table_[bucket].erase(allocated_block_table_[bucket].begin() + i);
+            return res;
+        }
+    }
+    return std::nullopt;
+}
+
+void FreeListOpt::update_lowest_occupied_address(DeviceAddr address) {
+    if (!lowest_occupied_address_.has_value() || address < lowest_occupied_address_.value()) {
+        lowest_occupied_address_ = address;
+    }
+}
+
+}  // namespace allocator
+}  // namespace tt_metal
+}  // namespace tt
diff --git a/tt_metal/impl/allocator/algorithms/free_list_opt.hpp b/tt_metal/impl/allocator/algorithms/free_list_opt.hpp
new file mode 100644
index 00000000000..1a3a7f87405
--- /dev/null
+++ b/tt_metal/impl/allocator/algorithms/free_list_opt.hpp
@@ -0,0 +1,126 @@
+// SPDX-FileCopyrightText: © 2024 Tenstorrent Inc.
+//
+// SPDX-License-Identifier: Apache-2.0
+#pragma once
+
+#include <cstdint>
+#include <optional>
+#include <utility>
+#include <vector>
+
+#include "tt_metal/impl/allocator/algorithms/allocator_algorithm.hpp"
+
+namespace tt {
+namespace tt_metal {
+namespace allocator {
+// Essentially the same free list algorithm as FreeList with the BestFit policy, but with
+// (IMO absurdly) optimized code, including:
+// - SoA layout instead of a linked list for the free list
+// - Size-segregated lists to avoid pointless searches through smaller blocks
+// - A hash table of allocated blocks for faster lookup during deallocation
+// - Metadata locality to avoid cache misses
+// - Metadata reuse to avoid allocations
+class FreeListOpt : public Algorithm {
+public:
+    enum class SearchPolicy {
+        FIRST,
+        BEST,
+    };
+    FreeListOpt(
+        DeviceAddr max_size_bytes,
+        DeviceAddr offset_bytes,
+        DeviceAddr min_allocation_size,
+        DeviceAddr alignment,
+        SearchPolicy policy = SearchPolicy::BEST);
+    void init() override;
+
+    std::vector<std::pair<DeviceAddr, DeviceAddr>> available_addresses(DeviceAddr size_bytes) const override;
+
+    std::optional<DeviceAddr> allocate(
+        DeviceAddr size_bytes, bool bottom_up = true, DeviceAddr address_limit = 0) override;
+
+    std::optional<DeviceAddr> allocate_at_address(DeviceAddr absolute_start_address, DeviceAddr size_bytes) override;
+
+    void deallocate(DeviceAddr absolute_address) override;
+
+    void clear() override;
+
+    Statistics get_statistics() const override;
+
+    void dump_blocks(std::ostream& out) const override;
+
+    void shrink_size(DeviceAddr shrink_size, bool bottom_up = true) override;
+
+    void reset_size() override;
+
+private:
+    // SoA free list components
+    std::vector<DeviceAddr> block_address_;
+    std::vector<DeviceAddr> block_size_;
+    std::vector<ssize_t> block_prev_block_;
+    std::vector<ssize_t> block_next_block_;
+    std::vector<uint8_t> block_is_allocated_;       // not bool, to avoid std::vector<bool> bit-packing
+    std::vector<uint8_t> meta_block_is_allocated_;  // not bool, to avoid std::vector<bool> bit-packing
+
+    // Metadata block indices that are not currently used (to reuse slots instead of always allocating new ones)
+    std::vector<size_t> free_meta_block_indices_;
+
+    // Caches so most operations don't need to scan the entire free list. The allocated block table
+    // does not rehash, as I find the cost not to be worth it
+    inline static constexpr size_t n_alloc_table_buckets = 512;          // Number of buckets in the hash table
+    inline static constexpr size_t n_alloc_table_init_bucket_size = 10;  // Initial size of each bucket
+    std::vector<std::vector<std::pair<DeviceAddr, size_t>>> allocated_block_table_;
+
+    // Size segregated lists of free blocks. The idea comes from the TLSF paper, but instead of aiming
+    // for realtime behavior, the goal here is to avoid looking at small blocks when allocating large
+    // ones, which the naive free list algorithm cannot do. Configuring these two parameters properly
+    // needs real-world data; for now they are numbers pulled out of thin air. Too low and the search
+    // devolves into an array scan; too high and we pay for cache misses.
+    //
+    // The size class index is calculated by taking the log2 of the block size divided by the base size,
+    // e.g. size = 2048, base = 1024: log2(2048 / 1024) = 1, so the size class index is 1
+    inline static constexpr size_t size_segregated_base = 1024;  // in bytes
+    const size_t size_segregated_count;                          // Number of size classes
+    std::vector<std::vector<size_t>> free_blocks_segregated_by_size_;
+
+    // Internal functions
+    // Given a block index, mark a chunk (from block start + offset to block start + offset + alloc_size)
+    // as allocated. Unused space is split into new free blocks and returned to the free list and the
+    // segregated lists.
+    // NOTE: This function DOES NOT remove block_index from the segregated list. The caller must do that
+    size_t allocate_in_block(size_t block_index, DeviceAddr alloc_size, size_t offset);
+
+    inline size_t get_size_segregated_index(DeviceAddr size_bytes) const {
+        // std::log2 is SLOW, so we use a simple integer log2 instead. I assume GCC compiles this to
+        // a count-leading-zeros instruction and a subtraction.
+        size_t lg = 0;
+        size_t n = size_bytes / size_segregated_base;
+        while (n >>= 1) {
+            lg++;
+        }
+        return std::min(size_segregated_count - 1, lg);
+    }
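+    // Index sketch for the mapping above (assuming base = 1024 and an 18-class table):
+    // 1500 B -> n = 1, lg = 0 -> class 0; 8 KiB -> n = 8, lg = 3 -> class 3; a 1 GiB block
+    // clamps to the last class, 17.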
+
+    // Put the block at block_index into the size segregated list at the appropriate index (data taken
+    // from the SoA vectors)
+    void insert_block_to_segregated_list(size_t block_index);
+
+    // Allocate a new metadata block and return its index
+    size_t alloc_meta_block(
+        DeviceAddr address, DeviceAddr size, ssize_t prev_block, ssize_t next_block, bool is_allocated);
+    // Mark the metadata block at block_index as free so its slot can be reused
+    void free_meta_block(size_t block_index);
+
+    // Operations on the allocated block table
+    static size_t hash_device_address(DeviceAddr address);
+    void insert_block_to_alloc_table(DeviceAddr address, size_t block_index);
+    bool is_address_in_alloc_table(DeviceAddr address) const;
+    std::optional<size_t> get_and_remove_from_alloc_table(DeviceAddr address);
+
+    void update_lowest_occupied_address(DeviceAddr address);
+
+    size_t find_free_block(DeviceAddr size, bool bottom_up);
+
+    SearchPolicy policy_;
+};
+
+}  // namespace allocator
+}  // namespace tt_metal
+}  // namespace tt
diff --git a/tt_metal/impl/allocator/allocator.cpp b/tt_metal/impl/allocator/allocator.cpp
index 0e64623892c..059c209d7a0 100644
--- a/tt_metal/impl/allocator/allocator.cpp
+++ b/tt_metal/impl/allocator/allocator.cpp
@@ -7,7 +7,7 @@
 #include
 #include "tt_metal/common/math.hpp"
 #include "tt_metal/detail/util.hpp"
-#include "tt_metal/impl/allocator/algorithms/free_list.hpp"
+#include "tt_metal/impl/allocator/algorithms/free_list_opt.hpp"
 #include "tt_metal/impl/buffers/buffer.hpp"
@@ -27,8 +27,8 @@ static char const* get_memory_pool_name(BufferType buffer_type) {
 #endif
 
 void BankManager::init_allocator(DeviceAddr size_bytes, uint32_t alignment_bytes, DeviceAddr offset) {
-    this->allocator_ = std::make_unique<FreeList>(
-        size_bytes, offset, alignment_bytes, alignment_bytes, FreeList::SearchPolicy::FIRST);
+    this->allocator_ = std::make_unique<FreeListOpt>(
+        size_bytes, offset, alignment_bytes, alignment_bytes, FreeListOpt::SearchPolicy::FIRST);
 }
 
 void validate_num_banks(uint32_t num_banks, const BufferType& buffer_type, bool disable_interleaved) {