diff --git a/tests/tt_eager/integration_tests/test_bert.cpp b/tests/tt_eager/integration_tests/test_bert.cpp index 54aed669997..964938e42ac 100644 --- a/tests/tt_eager/integration_tests/test_bert.cpp +++ b/tests/tt_eager/integration_tests/test_bert.cpp @@ -17,18 +17,18 @@ #include "ttnn/operations/experimental/transformer/split_query_key_value_and_split_heads/split_query_key_value_and_split_heads.hpp" #include "ttnn/operations/experimental/transformer/concatenate_heads/concatenate_heads.hpp" -using Parameters = std::map; +using Parameters = std::map; using ttnn::operations::unary::UnaryOpType; using ttnn::operations::unary::UnaryWithParam; -MemoryConfig l1_memory_config = tt::tt_metal::MemoryConfig{ +ttnn::MemoryConfig l1_memory_config = tt::tt_metal::MemoryConfig{ .memory_layout = tt::tt_metal::TensorMemoryLayout::INTERLEAVED, .buffer_type = tt::tt_metal::BufferType::L1}; -MemoryConfig dram_memory_config = tt::tt_metal::MemoryConfig{ +ttnn::MemoryConfig dram_memory_config = tt::tt_metal::MemoryConfig{ .memory_layout = tt::tt_metal::TensorMemoryLayout::INTERLEAVED, .buffer_type = tt::tt_metal::BufferType::DRAM}; -Tensor encoder( - Tensor&& hidden_states, - const Tensor& attention_mask, +ttnn::Tensor encoder( + ttnn::Tensor&& hidden_states, + const ttnn::Tensor& attention_mask, const Parameters& parameters, std::size_t encoder_index, const std::uint32_t head_size) { @@ -192,7 +192,7 @@ Tensor encoder( return feedforward_layernorm_output; } -Tensor qa_head(Tensor&& hidden_states, const Parameters& parameters) { +ttnn::Tensor qa_head(ttnn::Tensor&& hidden_states, const Parameters& parameters) { auto output = ttnn::operations::matmul::matmul( hidden_states, parameters.at("qa_head_weight"), /*bias=*/std::nullopt, ttnn::operations::matmul::Matmul{}); hidden_states.deallocate(); diff --git a/tests/tt_eager/ops/test_bmm_op.cpp b/tests/tt_eager/ops/test_bmm_op.cpp index b8c2b10d05b..078d4f3aef7 100644 --- a/tests/tt_eager/ops/test_bmm_op.cpp +++ 
b/tests/tt_eager/ops/test_bmm_op.cpp @@ -51,7 +51,7 @@ int main(int argc, char** argv) { ttnn::operations::matmul::Matmul{ /*program_config=*/std::nullopt, /*bcast_batch=*/std::nullopt, - operation::DEFAULT_OUTPUT_MEMORY_CONFIG, + tt::tt_metal::operation::DEFAULT_OUTPUT_MEMORY_CONFIG, /*output_dtype=*/std::nullopt, /*compute_kernel_config=*/std::nullopt, /*untilize_out=*/false, diff --git a/tests/tt_eager/ops/test_eltwise_binary_op.cpp b/tests/tt_eager/ops/test_eltwise_binary_op.cpp index 32769454b8e..a4c69c37fc6 100644 --- a/tests/tt_eager/ops/test_eltwise_binary_op.cpp +++ b/tests/tt_eager/ops/test_eltwise_binary_op.cpp @@ -112,7 +112,7 @@ int main() { run_binary_ops(); // Allocate a tensor to show that the addresses aren't cached - auto input_tensor = ttnn::random::uniform(bfloat16(0.0f), bfloat16(0.0f), Shape({1, 1, 32, 32})) + auto input_tensor = ttnn::random::uniform(bfloat16(0.0f), bfloat16(0.0f), ttnn::Shape({1, 1, 32, 32})) .to_layout(Layout::TILE) .to_device(device); diff --git a/tests/tt_eager/ops/test_eltwise_unary_op.cpp b/tests/tt_eager/ops/test_eltwise_unary_op.cpp index 17839c2f228..e22e7ee4b0c 100644 --- a/tests/tt_eager/ops/test_eltwise_unary_op.cpp +++ b/tests/tt_eager/ops/test_eltwise_unary_op.cpp @@ -116,7 +116,7 @@ void test_operation_infrastructure() { ttnn::operations::unary::operation_attributes_t op_args{ {UnaryWithParam{UnaryOpType::SQRT}}, DataType::BFLOAT16, - MemoryConfig{.memory_layout = tt::tt_metal::TensorMemoryLayout::INTERLEAVED}, + tt::tt_metal::MemoryConfig{.memory_layout = tt::tt_metal::TensorMemoryLayout::INTERLEAVED}, false, false}; ttnn::operations::unary::tensor_args_t tensor_args{input_tensor}; @@ -159,7 +159,7 @@ namespace tt_metal { template struct exp_with_param { static Tensor fn(const tt::tt_metal::Tensor& t) { - return ttnn::exp(t, approx_value, operation::DEFAULT_OUTPUT_MEMORY_CONFIG); + return ttnn::exp(t, approx_value, tt::tt_metal::operation::DEFAULT_OUTPUT_MEMORY_CONFIG); } }; } // namespace tt_metal diff --git 
a/tests/tt_eager/ops/test_fold_op.cpp b/tests/tt_eager/ops/test_fold_op.cpp index fec37c1a120..3d45b3d1097 100644 --- a/tests/tt_eager/ops/test_fold_op.cpp +++ b/tests/tt_eager/ops/test_fold_op.cpp @@ -12,22 +12,21 @@ #include using namespace tt; -using namespace tt::tt_metal; using namespace constants; -void run_fold(IDevice* device, const ttnn::Shape& shape) { - Tensor input_tensor = ttnn::random::random(shape).to_layout(Layout::ROW_MAJOR).to_device(device); +void run_fold(tt::tt_metal::IDevice* device, const ttnn::Shape& shape) { + ttnn::Tensor input_tensor = ttnn::random::random(shape).to_layout(ttnn::Layout::ROW_MAJOR).to_device(device); uint32_t stride_h = 2; uint32_t stride_w = 2; - Tensor device_output_tensor = ttnn::fold(ttnn::DefaultQueueId, input_tensor, stride_h, stride_w); - Tensor output_tensor = device_output_tensor.cpu(); + ttnn::Tensor device_output_tensor = ttnn::fold(ttnn::DefaultQueueId, input_tensor, stride_h, stride_w); + ttnn::Tensor output_tensor = device_output_tensor.cpu(); } int main(int argc, char** argv) { int device_id = 0; tt_metal::IDevice* device = tt_metal::CreateDevice(device_id); - run_fold(device, Shape({1, 2, 2, 2})); + run_fold(device, ttnn::Shape({1, 2, 2, 2})); bool pass = CloseDevice(device); if (pass) { diff --git a/tests/tt_eager/ops/test_sliding_window_ops.cpp b/tests/tt_eager/ops/test_sliding_window_ops.cpp index b6b6e200d56..8cb303da5d2 100644 --- a/tests/tt_eager/ops/test_sliding_window_ops.cpp +++ b/tests/tt_eager/ops/test_sliding_window_ops.cpp @@ -20,7 +20,7 @@ using namespace ttnn::operations::sliding_window; // From owned_buffer of type bfloat16 of create float vector for convolution operation. 
vector create_filter_vec( - const owned_buffer::Buffer& filter_tensor_buf, uint32_t filter_h, uint32_t filter_w) { + const tt::tt_metal::owned_buffer::Buffer& filter_tensor_buf, uint32_t filter_h, uint32_t filter_w) { vector filter_vector; for (auto h = 0; h < filter_h; h++) { for (auto w = 0; w < filter_w; w++) { @@ -32,8 +32,8 @@ vector create_filter_vec( // Compare calculated convolution buffer with Golden convolution uint32_t compare_conv_out_with_golden( - const owned_buffer::Buffer& out_golden_tensor_buf, - const owned_buffer::Buffer& conv_tensor_buf) { + const tt::tt_metal::owned_buffer::Buffer& out_golden_tensor_buf, + const tt::tt_metal::owned_buffer::Buffer& conv_tensor_buf) { uint32_t diff = 0; if (out_golden_tensor_buf != conv_tensor_buf) { assert(out_golden_tensor_buf.size() == conv_tensor_buf.size()); @@ -125,13 +125,13 @@ uint32_t validate_generate_halo_kernel_config( uint32_t validate_generate_functions( tt::tt_metal::IDevice* device, const SlidingWindowConfig& config, - const owned_buffer::Buffer& input_padded_tensor_buf, + const tt::tt_metal::owned_buffer::Buffer& input_padded_tensor_buf, const vector& filter_vector, - const owned_buffer::Buffer& out_golden_tensor_buf, + const tt::tt_metal::owned_buffer::Buffer& out_golden_tensor_buf, uint32_t reshard_num_cores_nhw = 0, bool remote_read = false) { log_debug(tt::LogTest, "Validating generate functions for config = {}", config); - owned_buffer::Buffer conv_tensor_buf; + tt::tt_metal::owned_buffer::Buffer conv_tensor_buf; uint32_t diff; uint32_t failed_tests = 0; auto pad_metadata = generate_pad_metadata(config); @@ -381,14 +381,15 @@ int main() { ttnn::Shape filter_tensor_shape({config.window_hw.first, config.window_hw.second}); Tensor input_padded_tensor = - ttnn::random::random(input_tensor_shape, DataType::BFLOAT16).to_layout(Layout::ROW_MAJOR).cpu(); - Tensor filter_tensor = - ttnn::random::random(filter_tensor_shape, DataType::BFLOAT16).to_layout(Layout::ROW_MAJOR).cpu(); - auto 
input_padded_tensor_buf = owned_buffer::get_as(input_padded_tensor); - auto filter_tensor_buf = owned_buffer::get_as(filter_tensor); + ttnn::random::random(input_tensor_shape, ttnn::DataType::BFLOAT16).to_layout(ttnn::Layout::ROW_MAJOR).cpu(); + Tensor filter_tensor = ttnn::random::random(filter_tensor_shape, ttnn::DataType::BFLOAT16) + .to_layout(ttnn::Layout::ROW_MAJOR) + .cpu(); + auto input_padded_tensor_buf = tt::tt_metal::owned_buffer::get_as(input_padded_tensor); + auto filter_tensor_buf = tt::tt_metal::owned_buffer::get_as(filter_tensor); vector filter_vector = create_filter_vec(filter_tensor_buf, tc.filter_h, tc.filter_w); - owned_buffer::Buffer out_golden_tensor_buf = ref_conv_op( + tt::tt_metal::owned_buffer::Buffer out_golden_tensor_buf = ref_conv_op( input_padded_tensor, input_tensor_shape, tc.stride_h, diff --git a/tests/tt_eager/tensors/test_raw_host_memory_pointer.cpp b/tests/tt_eager/tensors/test_raw_host_memory_pointer.cpp index c097b2fc99a..a334821787a 100644 --- a/tests/tt_eager/tensors/test_raw_host_memory_pointer.cpp +++ b/tests/tt_eager/tensors/test_raw_host_memory_pointer.cpp @@ -8,6 +8,7 @@ #include #include +#include "ttnn/tensor/enum_types.hpp" #include "ttnn/tensor/host_buffer/functions.hpp" #include "ttnn/tensor/host_buffer/types.hpp" #include "ttnn/tensor/tensor.hpp" @@ -56,6 +57,7 @@ struct NDArray { void test_raw_host_memory_pointer() { using tt::tt_metal::BorrowedStorage; using tt::tt_metal::DataType; + using tt::tt_metal::Layout; using tt::tt_metal::OwnedStorage; using tt::tt_metal::Tensor; using namespace tt::tt_metal::borrowed_buffer; @@ -67,8 +69,11 @@ void test_raw_host_memory_pointer() { ttnn::Shape shape({1, 1, tt::constants::TILE_HEIGHT, tt::constants::TILE_WIDTH}); // Host tensor to print the output - Tensor tensor_for_printing = - Tensor(OwnedStorage{owned_buffer::create(shape.volume())}, shape, DataType::BFLOAT16, Layout::TILE); + Tensor tensor_for_printing = Tensor( + 
OwnedStorage{tt::tt_metal::owned_buffer::create(shape.volume())}, + shape, + DataType::BFLOAT16, + Layout::TILE); /* Borrow Data from Numpy Start */ // Create some @@ -78,7 +83,7 @@ void test_raw_host_memory_pointer() { auto on_destruction_callback = [] {}; Tensor a_cpu = Tensor( BorrowedStorage{ - borrowed_buffer::Buffer(static_cast(a_np_array_data), a_np_array.size()), + tt::tt_metal::borrowed_buffer::Buffer(static_cast(a_np_array_data), a_np_array.size()), on_creation_callback, on_destruction_callback}, shape, @@ -93,7 +98,7 @@ void test_raw_host_memory_pointer() { // Set every value of tt Tensor to the same non-zero number bfloat16 a_value = 4.0f; - for (auto& element : borrowed_buffer::get_as(a_cpu)) { + for (auto& element : tt::tt_metal::borrowed_buffer::get_as(a_cpu)) { element = a_value; } @@ -113,7 +118,7 @@ void test_raw_host_memory_pointer() { // Check that cpu tensor has correct data bfloat16 output_value = 1.99219f; // Not exactly 2.0f because of rounding errors - for (auto& element : owned_buffer::get_as(tensor_for_printing)) { + for (auto& element : tt::tt_metal::owned_buffer::get_as(tensor_for_printing)) { TT_ASSERT(element == output_value); } @@ -128,7 +133,8 @@ void test_raw_host_memory_pointer() { Tensor alternative_tensor_for_printing = Tensor( BorrowedStorage{ - borrowed_buffer::Buffer(static_cast(storage_of_alternative_tensor_for_printing), shape.volume()), + tt::tt_metal::borrowed_buffer::Buffer( + static_cast(storage_of_alternative_tensor_for_printing), shape.volume()), on_creation_callback, on_destruction_callback}, shape, @@ -136,7 +142,7 @@ void test_raw_host_memory_pointer() { Layout::TILE); alternative_tensor_for_printing.print(); - for (auto& element : borrowed_buffer::get_as(alternative_tensor_for_printing)) { + for (auto& element : tt::tt_metal::borrowed_buffer::get_as(alternative_tensor_for_printing)) { TT_ASSERT(element == output_value); } @@ -147,7 +153,7 @@ void test_raw_host_memory_pointer() { void* d_np_array_data = 
d_np_array.data; Tensor d_cpu = Tensor( BorrowedStorage{ - borrowed_buffer::Buffer(static_cast(d_np_array_data), d_np_array.size()), + tt::tt_metal::borrowed_buffer::Buffer(static_cast(d_np_array_data), d_np_array.size()), on_creation_callback, on_destruction_callback}, shape, @@ -155,7 +161,7 @@ void test_raw_host_memory_pointer() { Layout::TILE); bfloat16 d_value = 8.0f; - for (auto& element : borrowed_buffer::get_as(d_cpu)) { + for (auto& element : tt::tt_metal::borrowed_buffer::get_as(d_cpu)) { element = d_value; } @@ -166,7 +172,7 @@ void test_raw_host_memory_pointer() { tt::tt_metal::memcpy(tensor_for_printing, e_dev); - for (auto& element : owned_buffer::get_as(tensor_for_printing)) { + for (auto& element : tt::tt_metal::owned_buffer::get_as(tensor_for_printing)) { TT_ASSERT(element == bfloat16(10.0f)); } diff --git a/tests/tt_metal/distributed/test_mesh_sub_device.cpp b/tests/tt_metal/distributed/test_mesh_sub_device.cpp index 90ae82320d4..15db602579a 100644 --- a/tests/tt_metal/distributed/test_mesh_sub_device.cpp +++ b/tests/tt_metal/distributed/test_mesh_sub_device.cpp @@ -12,6 +12,7 @@ namespace tt::tt_metal::distributed::test { namespace { +using namespace tt::tt_metal; using MeshSubDeviceTestSuite = GenericMeshDeviceFixture; TEST_F(MeshSubDeviceTestSuite, SyncWorkloadsOnSubDevice) { diff --git a/tests/tt_metal/test_utils/test_common.hpp b/tests/tt_metal/test_utils/test_common.hpp index dbcf2c50e25..ddbb41c9923 100644 --- a/tests/tt_metal/test_utils/test_common.hpp +++ b/tests/tt_metal/test_utils/test_common.hpp @@ -17,6 +17,8 @@ #include #include "metal_soc_descriptor.h" +using namespace tt::tt_metal; // test only + namespace test_args { template diff --git a/tests/tt_metal/tt_fabric/common/fabric_fixture.hpp b/tests/tt_metal/tt_fabric/common/fabric_fixture.hpp index 24b67c7c68e..73b1022f037 100644 --- a/tests/tt_metal/tt_fabric/common/fabric_fixture.hpp +++ b/tests/tt_metal/tt_fabric/common/fabric_fixture.hpp @@ -34,8 +34,8 @@ class 
ControlPlaneFixture : public ::testing::Test { class FabricFixture : public ::testing::Test { protected: tt::ARCH arch_; - std::map devices_map_; - std::vector devices_; + std::map devices_map_; + std::vector devices_; bool slow_dispatch_; void SetUp() override { diff --git a/tests/tt_metal/tt_fabric/fabric_data_movement/test_basic_fabric_apis.cpp b/tests/tt_metal/tt_fabric/fabric_data_movement/test_basic_fabric_apis.cpp index a1dbc1f6d4c..ee097108597 100644 --- a/tests/tt_metal/tt_fabric/fabric_data_movement/test_basic_fabric_apis.cpp +++ b/tests/tt_metal/tt_fabric/fabric_data_movement/test_basic_fabric_apis.cpp @@ -13,6 +13,10 @@ namespace tt::tt_fabric { TEST_F(FabricFixture, TestAsyncWrite) { + using tt::tt_metal::ShardedBufferConfig; + using tt::tt_metal::ShardOrientation; + using tt::tt_metal::ShardSpecBuffer; + CoreCoord sender_logical_core = {0, 0}; CoreRangeSet sender_logical_crs = {sender_logical_core}; CoreCoord receiver_logical_core = {1, 0}; @@ -60,8 +64,8 @@ TEST_F(FabricFixture, TestAsyncWrite) { .device = receiver_device, .size = data_size, .page_size = data_size, - .buffer_type = BufferType::L1, - .buffer_layout = TensorMemoryLayout::HEIGHT_SHARDED, + .buffer_type = tt_metal::BufferType::L1, + .buffer_layout = tt_metal::TensorMemoryLayout::HEIGHT_SHARDED, .shard_parameters = std::move(receiver_shard_parameters), }; auto receiver_buffer = CreateBuffer(receiver_shard_config); @@ -78,8 +82,8 @@ TEST_F(FabricFixture, TestAsyncWrite) { .device = sender_device, .size = sender_packet_header_and_data_size, .page_size = sender_packet_header_and_data_size, - .buffer_type = BufferType::L1, - .buffer_layout = TensorMemoryLayout::HEIGHT_SHARDED, + .buffer_type = tt_metal::BufferType::L1, + .buffer_layout = tt_metal::TensorMemoryLayout::HEIGHT_SHARDED, .shard_parameters = std::move(sender_shard_parameters), }; auto sender_buffer = CreateBuffer(sender_shard_config); @@ -119,7 +123,7 @@ TEST_F(FabricFixture, TestAsyncWrite) { sender_logical_crs, 
tt_metal::DataMovementConfig{ .processor = tt_metal::DataMovementProcessor::RISCV_0, - .noc = NOC::RISCV_0_default, + .noc = tt_metal::NOC::RISCV_0_default, .compile_args = sender_compile_time_args}); auto& sender_virtual_router_coord = routers[0].second; @@ -143,7 +147,7 @@ TEST_F(FabricFixture, TestAsyncWrite) { "tests/tt_metal/tt_fabric/fabric_data_movement/kernels/fabric_receiver.cpp", {receiver_logical_core}, tt_metal::DataMovementConfig{ - .processor = tt_metal::DataMovementProcessor::RISCV_0, .noc = NOC::RISCV_0_default}); + .processor = tt_metal::DataMovementProcessor::RISCV_0, .noc = tt_metal::NOC::RISCV_0_default}); std::vector receiver_runtime_args = { receiver_buffer->address(), @@ -164,6 +168,10 @@ TEST_F(FabricFixture, TestAsyncWrite) { } TEST_F(FabricFixture, TestAtomicInc) { + using tt::tt_metal::ShardedBufferConfig; + using tt::tt_metal::ShardOrientation; + using tt::tt_metal::ShardSpecBuffer; + CoreCoord sender_logical_core = {0, 0}; CoreRangeSet sender_logical_crs = {sender_logical_core}; CoreCoord receiver_logical_core = {1, 0}; @@ -211,8 +219,8 @@ TEST_F(FabricFixture, TestAtomicInc) { .device = receiver_device, .size = data_size, .page_size = data_size, - .buffer_type = BufferType::L1, - .buffer_layout = TensorMemoryLayout::HEIGHT_SHARDED, + .buffer_type = tt_metal::BufferType::L1, + .buffer_layout = tt_metal::TensorMemoryLayout::HEIGHT_SHARDED, .shard_parameters = std::move(receiver_shard_parameters), }; auto receiver_buffer = CreateBuffer(receiver_shard_config); @@ -229,8 +237,8 @@ TEST_F(FabricFixture, TestAtomicInc) { .device = sender_device, .size = sender_packet_header_and_data_size, .page_size = sender_packet_header_and_data_size, - .buffer_type = BufferType::L1, - .buffer_layout = TensorMemoryLayout::HEIGHT_SHARDED, + .buffer_type = tt_metal::BufferType::L1, + .buffer_layout = tt_metal::TensorMemoryLayout::HEIGHT_SHARDED, .shard_parameters = std::move(sender_shard_parameters), }; auto sender_buffer = CreateBuffer(sender_shard_config); 
@@ -269,7 +277,7 @@ TEST_F(FabricFixture, TestAtomicInc) { sender_logical_crs, tt_metal::DataMovementConfig{ .processor = tt_metal::DataMovementProcessor::RISCV_0, - .noc = NOC::RISCV_0_default, + .noc = tt_metal::NOC::RISCV_0_default, .compile_args = sender_compile_time_args}); auto& sender_virtual_router_coord = routers[0].second; @@ -293,7 +301,7 @@ TEST_F(FabricFixture, TestAtomicInc) { "tests/tt_metal/tt_fabric/fabric_data_movement/kernels/fabric_receiver.cpp", {receiver_logical_core}, tt_metal::DataMovementConfig{ - .processor = tt_metal::DataMovementProcessor::RISCV_0, .noc = NOC::RISCV_0_default}); + .processor = tt_metal::DataMovementProcessor::RISCV_0, .noc = tt_metal::NOC::RISCV_0_default}); std::vector receiver_runtime_args = { receiver_buffer->address(), @@ -314,6 +322,10 @@ TEST_F(FabricFixture, TestAtomicInc) { } TEST_F(FabricFixture, TestAsyncWriteAtomicInc) { + using tt::tt_metal::ShardedBufferConfig; + using tt::tt_metal::ShardOrientation; + using tt::tt_metal::ShardSpecBuffer; + CoreCoord sender_logical_core = {0, 0}; CoreRangeSet sender_logical_crs = {sender_logical_core}; CoreCoord receiver_logical_core = {1, 0}; @@ -362,8 +374,8 @@ TEST_F(FabricFixture, TestAsyncWriteAtomicInc) { .device = receiver_device, .size = data_size, .page_size = data_size, - .buffer_type = BufferType::L1, - .buffer_layout = TensorMemoryLayout::HEIGHT_SHARDED, + .buffer_type = tt_metal::BufferType::L1, + .buffer_layout = tt_metal::TensorMemoryLayout::HEIGHT_SHARDED, .shard_parameters = receiver_shard_parameters, }; auto receiver_buffer = CreateBuffer(receiver_shard_config); @@ -371,8 +383,8 @@ TEST_F(FabricFixture, TestAsyncWriteAtomicInc) { .device = receiver_device, .size = atomic_inc_size, .page_size = atomic_inc_size, - .buffer_type = BufferType::L1, - .buffer_layout = TensorMemoryLayout::HEIGHT_SHARDED, + .buffer_type = tt_metal::BufferType::L1, + .buffer_layout = tt_metal::TensorMemoryLayout::HEIGHT_SHARDED, .shard_parameters = receiver_shard_parameters, }; auto 
receiver_atomic_buffer = CreateBuffer(receiver_atomic_shard_config); @@ -391,8 +403,8 @@ TEST_F(FabricFixture, TestAsyncWriteAtomicInc) { .device = sender_device, .size = sender_packet_header_and_data_size, .page_size = sender_packet_header_and_data_size, - .buffer_type = BufferType::L1, - .buffer_layout = TensorMemoryLayout::HEIGHT_SHARDED, + .buffer_type = tt_metal::BufferType::L1, + .buffer_layout = tt_metal::TensorMemoryLayout::HEIGHT_SHARDED, .shard_parameters = std::move(sender_shard_parameters), }; auto sender_buffer = CreateBuffer(sender_shard_config); @@ -434,7 +446,7 @@ TEST_F(FabricFixture, TestAsyncWriteAtomicInc) { sender_logical_crs, tt_metal::DataMovementConfig{ .processor = tt_metal::DataMovementProcessor::RISCV_0, - .noc = NOC::RISCV_0_default, + .noc = tt_metal::NOC::RISCV_0_default, .compile_args = sender_compile_time_args}); auto& sender_virtual_router_coord = routers[0].second; @@ -460,7 +472,7 @@ TEST_F(FabricFixture, TestAsyncWriteAtomicInc) { "tests/tt_metal/tt_fabric/fabric_data_movement/kernels/fabric_receiver.cpp", {receiver_logical_core}, tt_metal::DataMovementConfig{ - .processor = tt_metal::DataMovementProcessor::RISCV_0, .noc = NOC::RISCV_0_default}); + .processor = tt_metal::DataMovementProcessor::RISCV_0, .noc = tt_metal::NOC::RISCV_0_default}); std::vector receiver_runtime_args = { receiver_buffer->address(), @@ -484,6 +496,10 @@ TEST_F(FabricFixture, TestAsyncWriteAtomicInc) { } TEST_F(FabricFixture, TestAsyncWriteMulticast) { + using tt::tt_metal::ShardedBufferConfig; + using tt::tt_metal::ShardOrientation; + using tt::tt_metal::ShardSpecBuffer; + CoreCoord sender_logical_core = {0, 0}; CoreRangeSet sender_logical_crs = {sender_logical_core}; CoreCoord receiver_logical_core = {1, 0}; @@ -556,7 +572,7 @@ TEST_F(FabricFixture, TestAsyncWriteMulticast) { std::vector receiver_buffer_data(data_size / sizeof(uint32_t), 0); std::vector receiver_programs; - std::vector> receiver_buffers; + std::vector> receiver_buffers; for (auto& 
[routing_direction, physical_end_device_ids] : physical_end_device_ids_by_dir) { for (auto physical_end_device_id : physical_end_device_ids) { auto* receiver_device = DevicePool::instance().get_active_device(physical_end_device_id); @@ -564,8 +580,8 @@ TEST_F(FabricFixture, TestAsyncWriteMulticast) { .device = receiver_device, .size = data_size, .page_size = data_size, - .buffer_type = BufferType::L1, - .buffer_layout = TensorMemoryLayout::HEIGHT_SHARDED, + .buffer_type = tt_metal::BufferType::L1, + .buffer_layout = tt_metal::TensorMemoryLayout::HEIGHT_SHARDED, .shard_parameters = receiver_shard_parameters, }; auto receiver_buffer = CreateBuffer(receiver_shard_config); @@ -578,7 +594,7 @@ TEST_F(FabricFixture, TestAsyncWriteMulticast) { "tests/tt_metal/tt_fabric/fabric_data_movement/kernels/fabric_receiver.cpp", {receiver_logical_core}, tt_metal::DataMovementConfig{ - .processor = tt_metal::DataMovementProcessor::RISCV_0, .noc = NOC::RISCV_0_default}); + .processor = tt_metal::DataMovementProcessor::RISCV_0, .noc = tt_metal::NOC::RISCV_0_default}); std::vector receiver_runtime_args = { receiver_buffer->address(), @@ -608,8 +624,8 @@ TEST_F(FabricFixture, TestAsyncWriteMulticast) { .device = sender_device, .size = sender_packet_header_and_data_size, .page_size = sender_packet_header_and_data_size, - .buffer_type = BufferType::L1, - .buffer_layout = TensorMemoryLayout::HEIGHT_SHARDED, + .buffer_type = tt_metal::BufferType::L1, + .buffer_layout = tt_metal::TensorMemoryLayout::HEIGHT_SHARDED, .shard_parameters = std::move(sender_shard_parameters), }; auto sender_buffer = CreateBuffer(sender_shard_config); @@ -648,7 +664,7 @@ TEST_F(FabricFixture, TestAsyncWriteMulticast) { sender_logical_crs, tt_metal::DataMovementConfig{ .processor = tt_metal::DataMovementProcessor::RISCV_0, - .noc = NOC::RISCV_0_default, + .noc = tt_metal::NOC::RISCV_0_default, .compile_args = sender_compile_time_args}); std::unordered_map sender_router_noc_xys; @@ -696,6 +712,10 @@ 
TEST_F(FabricFixture, TestAsyncWriteMulticast) { } TEST_F(FabricFixture, TestAsyncWriteMulticastMultidirectional) { + using tt::tt_metal::ShardedBufferConfig; + using tt::tt_metal::ShardOrientation; + using tt::tt_metal::ShardSpecBuffer; + CoreCoord sender_logical_core = {0, 0}; CoreRangeSet sender_logical_crs = {sender_logical_core}; CoreCoord receiver_logical_core = {1, 0}; @@ -769,7 +789,7 @@ TEST_F(FabricFixture, TestAsyncWriteMulticastMultidirectional) { std::vector receiver_buffer_data(data_size / sizeof(uint32_t), 0); std::vector receiver_programs; - std::vector> receiver_buffers; + std::vector> receiver_buffers; for (auto& [routing_direction, physical_end_device_ids] : physical_end_device_ids_by_dir) { for (auto physical_end_device_id : physical_end_device_ids) { auto* receiver_device = DevicePool::instance().get_active_device(physical_end_device_id); @@ -777,8 +797,8 @@ TEST_F(FabricFixture, TestAsyncWriteMulticastMultidirectional) { .device = receiver_device, .size = data_size, .page_size = data_size, - .buffer_type = BufferType::L1, - .buffer_layout = TensorMemoryLayout::HEIGHT_SHARDED, + .buffer_type = tt_metal::BufferType::L1, + .buffer_layout = tt_metal::TensorMemoryLayout::HEIGHT_SHARDED, .shard_parameters = receiver_shard_parameters, }; auto receiver_buffer = CreateBuffer(receiver_shard_config); @@ -791,7 +811,7 @@ TEST_F(FabricFixture, TestAsyncWriteMulticastMultidirectional) { "tests/tt_metal/tt_fabric/fabric_data_movement/kernels/fabric_receiver.cpp", {receiver_logical_core}, tt_metal::DataMovementConfig{ - .processor = tt_metal::DataMovementProcessor::RISCV_0, .noc = NOC::RISCV_0_default}); + .processor = tt_metal::DataMovementProcessor::RISCV_0, .noc = tt_metal::NOC::RISCV_0_default}); std::vector receiver_runtime_args = { receiver_buffer->address(), @@ -821,8 +841,8 @@ TEST_F(FabricFixture, TestAsyncWriteMulticastMultidirectional) { .device = sender_device, .size = sender_packet_header_and_data_size, .page_size = 
sender_packet_header_and_data_size, - .buffer_type = BufferType::L1, - .buffer_layout = TensorMemoryLayout::HEIGHT_SHARDED, + .buffer_type = tt_metal::BufferType::L1, + .buffer_layout = tt_metal::TensorMemoryLayout::HEIGHT_SHARDED, .shard_parameters = std::move(sender_shard_parameters), }; auto sender_buffer = CreateBuffer(sender_shard_config); @@ -862,7 +882,7 @@ TEST_F(FabricFixture, TestAsyncWriteMulticastMultidirectional) { sender_logical_crs, tt_metal::DataMovementConfig{ .processor = tt_metal::DataMovementProcessor::RISCV_0, - .noc = NOC::RISCV_0_default, + .noc = tt_metal::NOC::RISCV_0_default, .compile_args = sender_compile_time_args}); std::unordered_map sender_router_noc_xys; diff --git a/tests/tt_metal/tt_metal/api/allocator/test_l1_banking_allocator.cpp b/tests/tt_metal/tt_metal/api/allocator/test_l1_banking_allocator.cpp index d0cea1bdf29..e0f67fde314 100644 --- a/tests/tt_metal/tt_metal/api/allocator/test_l1_banking_allocator.cpp +++ b/tests/tt_metal/tt_metal/api/allocator/test_l1_banking_allocator.cpp @@ -16,10 +16,10 @@ namespace unit_tests::test_l1_banking_allocator { uint64_t get_alloc_limit(const tt::tt_metal::IDevice* device) { const metal_SocDescriptor& soc_desc = tt::Cluster::instance().get_soc_desc(device->id()); uint32_t l1_unreserved_base = device->allocator()->get_base_allocator_addr(tt::tt_metal::HalMemType::L1); - auto dispatch_core_config = dispatch_core_manager::instance().get_dispatch_core_config(device->id()); + auto dispatch_core_config = tt::tt_metal::dispatch_core_manager::instance().get_dispatch_core_config(device->id()); auto storage_core_bank_size = tt::get_storage_core_bank_size(device->id(), device->num_hw_cqs(), dispatch_core_config); - const uint32_t allocator_alignment = device->allocator()->get_alignment(BufferType::L1); + const uint32_t allocator_alignment = device->allocator()->get_alignment(tt::tt_metal::BufferType::L1); const uint32_t interleaved_l1_bank_size = storage_core_bank_size.has_value() ? 
storage_core_bank_size.value() : (soc_desc.worker_l1_size - l1_unreserved_base); @@ -31,6 +31,8 @@ uint64_t get_alloc_limit(const tt::tt_metal::IDevice* device) { } // namespace unit_tests::test_l1_banking_allocator +namespace tt::tt_metal { + TEST_F(DeviceSingleCardBufferFixture, TestL1BuffersAllocatedTopDown) { std::vector alloc_sizes = {32 * 1024, 64 * 1024, 128 * 1024}; size_t total_size_bytes = 0; @@ -66,3 +68,5 @@ TEST_F(DeviceSingleCardBufferFixture, TestL1BuffersDoNotGrowBeyondBankSize) { EXPECT_ANY_THROW(auto buffer = tt::tt_metal::CreateBuffer(l1_config);); } + +} // namespace tt::tt_metal diff --git a/tests/tt_metal/tt_metal/api/compile_program_with_kernel_path_env_var_fixture.hpp b/tests/tt_metal/tt_metal/api/compile_program_with_kernel_path_env_var_fixture.hpp index 41072b94670..619de6ccaae 100644 --- a/tests/tt_metal/tt_metal/api/compile_program_with_kernel_path_env_var_fixture.hpp +++ b/tests/tt_metal/tt_metal/api/compile_program_with_kernel_path_env_var_fixture.hpp @@ -18,13 +18,13 @@ class CompileProgramWithKernelPathEnvVarFixture : public ::testing::Test { } const chip_id_t device_id = 0; - this->device_ = CreateDevice(device_id); - this->program_ = CreateProgram(); + this->device_ = tt::tt_metal::CreateDevice(device_id); + this->program_ = tt::tt_metal::CreateProgram(); } void TearDown() override { if (!IsSkipped()) { - CloseDevice(this->device_); + tt::tt_metal::CloseDevice(this->device_); } } @@ -34,7 +34,8 @@ class CompileProgramWithKernelPathEnvVarFixture : public ::testing::Test { this->program_, kernel_file, core, - tt_metal::DataMovementConfig{.processor = DataMovementProcessor::RISCV_1, .noc = NOC::RISCV_1_default}); + tt_metal::DataMovementConfig{ + .processor = tt::tt_metal::DataMovementProcessor::RISCV_1, .noc = tt::tt_metal::NOC::RISCV_1_default}); } void setup_kernel_dir(const string& orig_kernel_file, const string& new_kernel_file) { @@ -55,8 +56,8 @@ class CompileProgramWithKernelPathEnvVarFixture : public ::testing::Test { } } - 
IDevice* device_; - Program program_; + tt::tt_metal::IDevice* device_; + tt::tt_metal::Program program_; private: bool are_preconditions_satisfied() { return this->are_env_vars_set() && this->is_kernel_dir_valid(); } diff --git a/tests/tt_metal/tt_metal/api/test_buffer_region.cpp b/tests/tt_metal/tt_metal/api/test_buffer_region.cpp index 547895ebb44..077c2dd4fa3 100644 --- a/tests/tt_metal/tt_metal/api/test_buffer_region.cpp +++ b/tests/tt_metal/tt_metal/api/test_buffer_region.cpp @@ -9,6 +9,8 @@ #include "device_fixture.hpp" +namespace tt::tt_metal { + TEST_F(DeviceSingleCardBufferFixture, TestInvalidBufferRegion) { const InterleavedBufferConfig& buffer_config{ .device = this->device_, .size = 2048, .page_size = 32, .buffer_type = BufferType::DRAM}; @@ -65,3 +67,5 @@ TEST_F(DeviceSingleCardBufferFixture, TestFullBufferRegion) { const BufferRegion buffer_region(0, 2048); EXPECT_FALSE(buffer.get()->is_valid_partial_region(buffer_region)); } + +} // namespace tt::tt_metal diff --git a/tests/tt_metal/tt_metal/api/test_direct.cpp b/tests/tt_metal/tt_metal/api/test_direct.cpp index dd3999244a3..5eebfcd1901 100644 --- a/tests/tt_metal/tt_metal/api/test_direct.cpp +++ b/tests/tt_metal/tt_metal/api/test_direct.cpp @@ -333,6 +333,8 @@ bool reader_datacopy_writer(tt_metal::IDevice* device, const ReaderDatacopyWrite } } // namespace unit_tests::dram::direct +namespace tt::tt_metal { + TEST_F(DeviceFixture, TensixSingleCoreDirectDramReaderOnly) { for (unsigned int id = 0; id < num_devices_; id++) { uint32_t l1_unreserved_base = devices_.at(id)->allocator()->get_base_allocator_addr(HalMemType::L1); @@ -386,3 +388,5 @@ TEST_F(DeviceFixture, TensixSingleCoreDirectDramReaderDatacopyWriter) { ASSERT_TRUE(unit_tests::dram::direct::reader_datacopy_writer(devices_.at(id), test_config)); } } + +} // namespace tt::tt_metal diff --git a/tests/tt_metal/tt_metal/api/test_dram.cpp b/tests/tt_metal/tt_metal/api/test_dram.cpp index 07916a6adf7..f23a40c95b6 100644 --- 
a/tests/tt_metal/tt_metal/api/test_dram.cpp +++ b/tests/tt_metal/tt_metal/api/test_dram.cpp @@ -23,7 +23,7 @@ struct DRAMConfig { tt_metal::DataMovementConfig data_movement_cfg; }; -bool dram_single_core_db(DispatchFixture* fixture, tt_metal::IDevice* device) { +bool dram_single_core_db(tt::tt_metal::DispatchFixture* fixture, tt_metal::IDevice* device) { tt_metal::Program program = tt_metal::CreateProgram(); CoreCoord core = {0, 0}; @@ -85,9 +85,12 @@ bool dram_single_core_db(DispatchFixture* fixture, tt_metal::IDevice* device) { } bool dram_single_core( - DispatchFixture* fixture, tt_metal::IDevice* device, const DRAMConfig& cfg, std::vector src_vec) { + tt::tt_metal::DispatchFixture* fixture, + tt_metal::IDevice* device, + const DRAMConfig& cfg, + std::vector src_vec) { // Create a program - tt_metal::Program program = CreateProgram(); + tt_metal::Program program = tt::tt_metal::CreateProgram(); tt_metal::InterleavedBufferConfig dram_config{ .device = device, @@ -124,9 +127,12 @@ bool dram_single_core( } bool dram_single_core_pre_allocated( - DispatchFixture* fixture, tt_metal::IDevice* device, const DRAMConfig& cfg, std::vector src_vec) { + tt::tt_metal::DispatchFixture* fixture, + tt_metal::IDevice* device, + const DRAMConfig& cfg, + std::vector src_vec) { // Create a program - tt_metal::Program program = CreateProgram(); + tt_metal::Program program = tt::tt_metal::CreateProgram(); tt_metal::InterleavedBufferConfig dram_config{ .device = device, @@ -172,6 +178,8 @@ bool dram_single_core_pre_allocated( } } // namespace unit_tests_common::dram::test_dram +namespace tt::tt_metal { + TEST_F(DispatchFixture, TensixDRAMLoopbackSingleCore) { uint32_t buffer_size = 2 * 1024 * 25; std::vector src_vec = @@ -217,3 +225,5 @@ TEST_F(DispatchFixture, TensixDRAMLoopbackSingleCoreDB) { ASSERT_TRUE(unit_tests_common::dram::test_dram::dram_single_core_db(this, devices_.at(id))); } } + +} // namespace tt::tt_metal diff --git 
a/tests/tt_metal/tt_metal/api/test_dram_to_l1_multicast.cpp b/tests/tt_metal/tt_metal/api/test_dram_to_l1_multicast.cpp index 4fa2432ad93..5884f76c78b 100644 --- a/tests/tt_metal/tt_metal/api/test_dram_to_l1_multicast.cpp +++ b/tests/tt_metal/tt_metal/api/test_dram_to_l1_multicast.cpp @@ -23,7 +23,8 @@ struct DRAMtoL1MulticastConfig { CoreCoord exclude_direction; }; -bool dram_to_l1_multicast(DispatchFixture* fixture, tt_metal::IDevice* device, const DRAMtoL1MulticastConfig& cfg) { +bool dram_to_l1_multicast( + tt::tt_metal::DispatchFixture* fixture, tt_metal::IDevice* device, const DRAMtoL1MulticastConfig& cfg) { bool pass = true; tt_metal::Program program = tt_metal::CreateProgram(); @@ -127,6 +128,8 @@ bool dram_to_l1_multicast(DispatchFixture* fixture, tt_metal::IDevice* device, c } } // namespace unit_tests_common::dram::test_dram_to_l1_multicast +namespace tt::tt_metal { + TEST_F(DispatchFixture, TensixDRAMtoL1Multicast) { unit_tests_common::dram::test_dram_to_l1_multicast::DRAMtoL1MulticastConfig test_config = { .dest_buffer_addr = 200 * 1024, @@ -216,3 +219,5 @@ TEST_F(DispatchFixture, TensixDRAMtoL1MulticastExcludeRegionDownRight) { this, devices_.at(id), test_config)); } } + +} // namespace tt::tt_metal diff --git a/tests/tt_metal/tt_metal/api/test_global_circular_buffers.cpp b/tests/tt_metal/tt_metal/api/test_global_circular_buffers.cpp index 864e305cfba..334df62e369 100644 --- a/tests/tt_metal/tt_metal/api/test_global_circular_buffers.cpp +++ b/tests/tt_metal/tt_metal/api/test_global_circular_buffers.cpp @@ -14,6 +14,8 @@ #include #include "tt_metal/include/tt_metal/program.hpp" +namespace tt::tt_metal { + TEST_F(DispatchFixture, TensixCreateGlobalCircularBuffers) { CoreRangeSet cores(CoreRange({1, 1}, {1, 1})); CoreRangeSet cores2(CoreRange({1, 1}, {2, 2})); @@ -108,3 +110,5 @@ TEST_F(DispatchFixture, TensixProgramGlobalCircularBuffers) { EXPECT_THROW(program_dispatch::finalize_program_offsets(program, device), std::exception); } } + +} // namespace 
tt::tt_metal diff --git a/tests/tt_metal/tt_metal/api/test_global_semaphores.cpp b/tests/tt_metal/tt_metal/api/test_global_semaphores.cpp index f103c2cd150..6837893cc2d 100644 --- a/tests/tt_metal/tt_metal/api/test_global_semaphores.cpp +++ b/tests/tt_metal/tt_metal/api/test_global_semaphores.cpp @@ -12,6 +12,8 @@ #include #include +namespace tt::tt_metal { + TEST_F(DispatchFixture, InitializeGlobalSemaphores) { CoreRangeSet cores(CoreRange({0, 0}, {1, 1})); @@ -117,3 +119,5 @@ TEST_F(DispatchFixture, ResetGlobalSemaphores) { } } } + +} // namespace tt::tt_metal diff --git a/tests/tt_metal/tt_metal/api/test_kernel_creation.cpp b/tests/tt_metal/tt_metal/api/test_kernel_creation.cpp index d4ef1f01f0e..0f3291123b6 100644 --- a/tests/tt_metal/tt_metal/api/test_kernel_creation.cpp +++ b/tests/tt_metal/tt_metal/api/test_kernel_creation.cpp @@ -9,6 +9,8 @@ #include #include "compile_program_with_kernel_path_env_var_fixture.hpp" +namespace tt::tt_metal { + using namespace tt; // Ensures we can successfully create kernels on available compute grid @@ -110,3 +112,5 @@ TEST_F(CompileProgramWithKernelPathEnvVarFixture, TensixNonExistentKernel) { this->create_kernel(kernel_file); EXPECT_THROW(detail::CompileProgram(this->device_, this->program_), std::exception); } + +} // namespace tt::tt_metal diff --git a/tests/tt_metal/tt_metal/api/test_noc.cpp b/tests/tt_metal/tt_metal/api/test_noc.cpp index a70285d4291..fc4ec2bc181 100644 --- a/tests/tt_metal/tt_metal/api/test_noc.cpp +++ b/tests/tt_metal/tt_metal/api/test_noc.cpp @@ -19,7 +19,7 @@ namespace unit_tests::basic::test_noc { const uint32_t init_value = 0x1234B33F; -uint32_t read_reg(IDevice* device, CoreCoord logical_node, uint32_t reg_addr) { +uint32_t read_reg(tt::tt_metal::IDevice* device, CoreCoord logical_node, uint32_t reg_addr) { // Read and return reg value form reading uint32_t reg_data = unit_tests::basic::test_noc::init_value; tt_metal::detail::ReadRegFromDevice(device, logical_node, reg_addr, reg_data); @@ -27,7 
+27,10 @@ uint32_t read_reg(IDevice* device, CoreCoord logical_node, uint32_t reg_addr) { } void read_translation_table( - IDevice* device, CoreCoord logical_node, std::vector& x_remap, std::vector& y_remap) { + tt::tt_metal::IDevice* device, + CoreCoord logical_node, + std::vector& x_remap, + std::vector& y_remap) { #ifdef NOC_X_ID_TRANSLATE_TABLE_0 std::vector x_reg_addrs = { NOC_CFG(NOC_X_ID_TRANSLATE_TABLE_0), @@ -169,6 +172,8 @@ TEST(NOC, TensixVerifyNocIdentityTranslationTable) { ASSERT_TRUE(tt::tt_metal::CloseDevice(device)); } +namespace tt::tt_metal { + // Tests that kernel can write to and read from a stream register address // This is meant to exercise noc_inline_dw_write API TEST_F(DeviceFixture, TensixDirectedStreamRegWriteRead) { @@ -228,3 +233,5 @@ TEST_F(DeviceFixture, TensixDirectedStreamRegWriteRead) { } } } + +} // namespace tt::tt_metal diff --git a/tests/tt_metal/tt_metal/api/test_runtime_args.cpp b/tests/tt_metal/tt_metal/api/test_runtime_args.cpp index 201fa8fdc4a..867411fe29f 100644 --- a/tests/tt_metal/tt_metal/api/test_runtime_args.cpp +++ b/tests/tt_metal/tt_metal/api/test_runtime_args.cpp @@ -10,7 +10,6 @@ #include using namespace tt; -using namespace tt::tt_metal; namespace unit_tests::runtime_args { @@ -42,8 +41,9 @@ uint32_t get_runtime_arg_addr(uint32_t l1_unreserved_base, tt::RISCV processor, return result_base + offset; }; -Program initialize_program_data_movement(IDevice* device, const CoreRangeSet& core_range_set) { - Program program = tt_metal::CreateProgram(); +tt::tt_metal::Program initialize_program_data_movement( + tt::tt_metal::IDevice* device, const CoreRangeSet& core_range_set) { + tt::tt_metal::Program program = tt_metal::CreateProgram(); auto add_two_ints_kernel = tt_metal::CreateKernel( program, @@ -52,16 +52,19 @@ Program initialize_program_data_movement(IDevice* device, const CoreRangeSet& co tt_metal::DataMovementConfig{ .processor = tt_metal::DataMovementProcessor::RISCV_0, .noc = tt_metal::NOC::RISCV_0_default}); 
- detail::CompileProgram(device, program); + tt::tt_metal::detail::CompileProgram(device, program); return std::move(program); } -Program initialize_program_data_movement_rta( - IDevice* device, const CoreRangeSet& core_range_set, uint32_t num_unique_rt_args, bool common_rtas = false) { - Program program = tt_metal::CreateProgram(); +tt::tt_metal::Program initialize_program_data_movement_rta( + tt::tt_metal::IDevice* device, + const CoreRangeSet& core_range_set, + uint32_t num_unique_rt_args, + bool common_rtas = false) { + tt::tt_metal::Program program = tt_metal::CreateProgram(); uint32_t rta_base_dm = get_runtime_arg_addr( - device->allocator()->get_base_allocator_addr(HalMemType::L1), tt::RISCV::BRISC, common_rtas); + device->allocator()->get_base_allocator_addr(tt::tt_metal::HalMemType::L1), tt::RISCV::BRISC, common_rtas); std::map dm_defines = { {"DATA_MOVEMENT", "1"}, {"NUM_RUNTIME_ARGS", std::to_string(num_unique_rt_args)}, @@ -79,19 +82,22 @@ Program initialize_program_data_movement_rta( .noc = tt_metal::NOC::RISCV_0_default, .defines = dm_defines}); - detail::CompileProgram(device, program); + tt::tt_metal::detail::CompileProgram(device, program); return std::move(program); } -Program initialize_program_compute( - IDevice* device, const CoreRangeSet& core_range_set, uint32_t num_unique_rt_args, uint32_t num_common_rt_args) { - Program program = tt_metal::CreateProgram(); +tt::tt_metal::Program initialize_program_compute( + tt::tt_metal::IDevice* device, + const CoreRangeSet& core_range_set, + uint32_t num_unique_rt_args, + uint32_t num_common_rt_args) { + tt::tt_metal::Program program = tt_metal::CreateProgram(); // Tell kernel how many unique and common RT args to expect. Will increment each. 
- uint32_t rta_base_compute = - get_runtime_arg_addr(device->allocator()->get_base_allocator_addr(HalMemType::L1), tt::RISCV::COMPUTE, false); - uint32_t common_rta_base_compute = - get_runtime_arg_addr(device->allocator()->get_base_allocator_addr(HalMemType::L1), tt::RISCV::COMPUTE, true); + uint32_t rta_base_compute = get_runtime_arg_addr( + device->allocator()->get_base_allocator_addr(tt::tt_metal::HalMemType::L1), tt::RISCV::COMPUTE, false); + uint32_t common_rta_base_compute = get_runtime_arg_addr( + device->allocator()->get_base_allocator_addr(tt::tt_metal::HalMemType::L1), tt::RISCV::COMPUTE, true); std::vector compile_args = { num_unique_rt_args, num_common_rt_args, rta_base_compute, common_rta_base_compute}; bool fp32_dest_acc_en = false; @@ -113,7 +119,7 @@ Program initialize_program_compute( // Verify the runtime args for a single core (apply optional non-zero increment amounts to values written to match // compute kernel) bool verify_core_rt_args( - IDevice* device, + tt::tt_metal::IDevice* device, bool is_common, CoreCoord core, uint32_t base_addr, @@ -145,8 +151,8 @@ bool verify_core_rt_args( // Iterate over all cores unique and common runtime args, and verify they match expected values. 
bool verify_results( bool are_args_incremented, - IDevice* device, - const Program& program, + tt::tt_metal::IDevice* device, + const tt::tt_metal::Program& program, const std::map>& core_to_rt_args, const std::vector& common_rt_args = {}) { bool pass = true; @@ -159,7 +165,7 @@ bool verify_results( for (size_t kernel_id = 0; kernel_id < program.num_kernels(); kernel_id++) { const auto kernel = tt_metal::detail::GetKernel(program, kernel_id); auto rt_args_base_addr = get_runtime_arg_addr( - device->allocator()->get_base_allocator_addr(HalMemType::L1), kernel->processor(), false); + device->allocator()->get_base_allocator_addr(tt::tt_metal::HalMemType::L1), kernel->processor(), false); // Verify Unique RT Args (per core) for (const auto& logical_core : kernel->cores_with_runtime_args()) { @@ -175,7 +181,7 @@ bool verify_results( // Verify common RT Args (same for all cores) if they exist. if (common_rt_args.size() > 0) { auto common_rt_args_base_addr = get_runtime_arg_addr( - device->allocator()->get_base_allocator_addr(HalMemType::L1), kernel->processor(), true); + device->allocator()->get_base_allocator_addr(tt::tt_metal::HalMemType::L1), kernel->processor(), true); for (auto& core_range : kernel->logical_coreranges()) { for (auto x = core_range.start_coord.x; x <= core_range.end_coord.x; x++) { @@ -194,6 +200,10 @@ bool verify_results( return pass; } +} // namespace unit_tests::runtime_args + +namespace tt::tt_metal { + // Write unique and common runtime args to device and readback to verify written correctly. 
TEST_F(DeviceFixture, TensixLegallyModifyRTArgsDataMovement) { for (unsigned int id = 0; id < num_devices_; id++) { @@ -217,7 +227,7 @@ TEST_F(DeviceFixture, TensixLegallyModifyRTArgsDataMovement) { } } detail::WriteRuntimeArgsToDevice(this->devices_.at(id), program); - tt_metal::detail::LaunchProgram(this->devices_.at(id), program); + detail::LaunchProgram(this->devices_.at(id), program); EXPECT_TRUE(unit_tests::runtime_args::verify_results(false, this->devices_.at(id), program, core_to_rt_args)); std::vector second_runtime_args = {202, 505}; @@ -451,4 +461,4 @@ TEST_F(DeviceFixture, TensixIllegallyModifyRTArgs) { } } -} // namespace unit_tests::runtime_args +} // namespace tt::tt_metal diff --git a/tests/tt_metal/tt_metal/api/test_semaphores.cpp b/tests/tt_metal/tt_metal/api/test_semaphores.cpp index 9e6aa1e92dc..a9ab9d5210a 100644 --- a/tests/tt_metal/tt_metal/api/test_semaphores.cpp +++ b/tests/tt_metal/tt_metal/api/test_semaphores.cpp @@ -61,7 +61,7 @@ void initialize_program(tt_metal::IDevice* device, tt_metal::Program& program, c void create_and_read_max_num_semaphores( tt_metal::IDevice* device, tt_metal::Program& program, const CoreRange& core_range) { std::vector golden; - for (uint32_t i = 0; i < NUM_SEMAPHORES; i++) { + for (uint32_t i = 0; i < tt::tt_metal::NUM_SEMAPHORES; i++) { uint32_t initial_value = i; auto semaphore_id = tt_metal::CreateSemaphore(program, core_range, initial_value); golden.push_back(initial_value); @@ -69,7 +69,7 @@ void create_and_read_max_num_semaphores( } tt_metal::detail::CompileProgram(device, program); - program_dispatch::finalize_program_offsets(program, device); + tt::tt_metal::program_dispatch::finalize_program_offsets(program, device); ASSERT_TRUE(tt_metal::detail::ConfigureDeviceWithProgram(device, program)); @@ -77,10 +77,10 @@ void create_and_read_max_num_semaphores( for (auto y = core_range.start_coord.y; y <= core_range.end_coord.y; y++) { auto logical_core = CoreCoord{x, y}; std::vector res; - for (uint32_t i = 0; 
i < NUM_SEMAPHORES; i++) { + for (uint32_t i = 0; i < tt::tt_metal::NUM_SEMAPHORES; i++) { std::vector single_val; uint32_t semaphore_addr = program.get_sem_base_addr(device, logical_core, CoreType::WORKER) + - (hal.get_alignment(HalMemType::L1) * i); + (tt::tt_metal::hal.get_alignment(tt::tt_metal::HalMemType::L1) * i); uint32_t semaphore_size = sizeof(uint32_t); tt_metal::detail::ReadFromDeviceL1(device, logical_core, semaphore_addr, semaphore_size, single_val); ASSERT_TRUE(single_val.size() == 1); @@ -101,6 +101,8 @@ void try_creating_more_than_max_num_semaphores( } // namespace unit_tests::initialize_semaphores +namespace tt::tt_metal { + TEST_F(DeviceFixture, TensixInitializeLegalSemaphores) { for (unsigned int id = 0; id < num_devices_; id++) { tt_metal::Program program = tt_metal::CreateProgram(); @@ -150,3 +152,5 @@ TEST_F(DeviceFixture, TensixCreateMultipleSemaphoresOnSameCore) { EXPECT_EQ(sem5_id, 3); EXPECT_EQ(sem6_id, 0); } + +} // namespace tt::tt_metal diff --git a/tests/tt_metal/tt_metal/api/test_shape_base.cpp b/tests/tt_metal/tt_metal/api/test_shape_base.cpp index 455a7714c1d..2f2265b4878 100644 --- a/tests/tt_metal/tt_metal/api/test_shape_base.cpp +++ b/tests/tt_metal/tt_metal/api/test_shape_base.cpp @@ -7,6 +7,8 @@ #include +namespace tt::tt_metal { + TEST(TensorShapeBaseTests, General4D) { tt::tt_metal::ShapeBase vec({20, 30, 40, 50}); EXPECT_EQ(vec.view().size(), vec.view().size()); @@ -79,3 +81,5 @@ TEST(TensorShapeBaseTests, TwoElements) { EXPECT_EQ(vec[-4], 1); EXPECT_THROW(vec[-5], std::exception); } + +} // namespace tt::tt_metal diff --git a/tests/tt_metal/tt_metal/api/test_simple_dram_buffer.cpp b/tests/tt_metal/tt_metal/api/test_simple_dram_buffer.cpp index 68112a359b2..75d212c1e02 100644 --- a/tests/tt_metal/tt_metal/api/test_simple_dram_buffer.cpp +++ b/tests/tt_metal/tt_metal/api/test_simple_dram_buffer.cpp @@ -41,6 +41,8 @@ bool SimpleDramWriteOnly(IDevice* device, size_t local_address, size_t byte_size } } // namespace 
tt::test::buffer::detail +namespace tt::tt_metal { + TEST_F(DeviceFixture, TestSimpleDramBufferReadOnlyLo) { for (unsigned int id = 0; id < num_devices_; id++) { size_t lo_address = devices_.at(id)->allocator()->get_base_allocator_addr(HalMemType::DRAM); @@ -85,3 +87,5 @@ TEST_F(DeviceFixture, TestSimpleDramBufferWriteOnlyHi) { ASSERT_TRUE(SimpleDramWriteOnly(this->devices_.at(id), hi_address, 16 * 1024)); } } + +} // namespace tt::tt_metal diff --git a/tests/tt_metal/tt_metal/api/test_simple_l1_buffer.cpp b/tests/tt_metal/tt_metal/api/test_simple_l1_buffer.cpp index a7b49005d45..92916323837 100644 --- a/tests/tt_metal/tt_metal/api/test_simple_l1_buffer.cpp +++ b/tests/tt_metal/tt_metal/api/test_simple_l1_buffer.cpp @@ -63,7 +63,7 @@ bool SimpleTiledL1WriteCBRead( .set_page_size(cb_index, page_size); auto l1_cb = tt_metal::CreateCircularBuffer(program, core, l1_cb_config); std::map defines = {{"INTERFACE_WITH_L1", "1"}}; - uint32_t bank_id = device->allocator()->get_bank_ids_from_logical_core(BufferType::L1, core)[0]; + uint32_t bank_id = device->allocator()->get_bank_ids_from_logical_core(tt_metal::BufferType::L1, core)[0]; auto reader_kernel = tt_metal::CreateKernel( program, "tests/tt_metal/tt_metal/test_kernels/dataflow/direct_reader_unary.cpp", @@ -118,6 +118,8 @@ bool SimpleTiledL1WriteCBRead( } // namespace tt::test::buffer::detail +namespace tt::tt_metal { + TEST_F(DeviceFixture, TestSimpleL1BufferReadOnlyLo) { for (unsigned int id = 0; id < num_devices_; id++) { size_t lo_address = this->devices_.at(id)->l1_size_per_core() - @@ -237,3 +239,5 @@ TEST_F(DeviceFixture, TensixTestBufferL1ReadWriteTileHi) { this->devices_.at(id), {2, 2}, hi_address + 8 * 1024, hi_address + 16 * 1024, 6 * 1024)); } } + +} // namespace tt::tt_metal diff --git a/tests/tt_metal/tt_metal/api/test_soc_descriptor.cpp b/tests/tt_metal/tt_metal/api/test_soc_descriptor.cpp index abb9ec14ba4..3275bcc4efa 100644 --- a/tests/tt_metal/tt_metal/api/test_soc_descriptor.cpp +++ 
b/tests/tt_metal/tt_metal/api/test_soc_descriptor.cpp @@ -44,6 +44,8 @@ std::unordered_set get_harvested_rows(chip_id_t device_id) { } } // namespace unit_tests::basic::soc_desc +namespace tt::tt_metal { + // This test ensures that no logical core maps to a harvested row TEST(SOC, TensixValidateLogicalToPhysicalCoreCoordHostMapping) { size_t num_devices = tt_metal::GetNumAvailableDevices(); @@ -69,3 +71,5 @@ TEST(SOC, TensixValidateLogicalToPhysicalCoreCoordHostMapping) { tt_metal::CloseDevice(device); } } + +} // namespace tt::tt_metal diff --git a/tests/tt_metal/tt_metal/common/command_queue_fixture.hpp b/tests/tt_metal/tt_metal/common/command_queue_fixture.hpp index a847610dea6..107763433ab 100644 --- a/tests/tt_metal/tt_metal/common/command_queue_fixture.hpp +++ b/tests/tt_metal/tt_metal/common/command_queue_fixture.hpp @@ -17,6 +17,8 @@ #include #include "llrt.hpp" +namespace tt::tt_metal { + class CommandQueueFixture : public DispatchFixture { protected: tt::tt_metal::IDevice* device_; @@ -160,3 +162,5 @@ class CommandQueueMultiDeviceFixture : public DispatchFixture { }; class CommandQueueMultiDeviceProgramFixture : public CommandQueueMultiDeviceFixture {}; + +} // namespace tt::tt_metal diff --git a/tests/tt_metal/tt_metal/common/device_fixture.hpp b/tests/tt_metal/tt_metal/common/device_fixture.hpp index 1d7cdefe875..4df09733c06 100644 --- a/tests/tt_metal/tt_metal/common/device_fixture.hpp +++ b/tests/tt_metal/tt_metal/common/device_fixture.hpp @@ -13,6 +13,8 @@ #include #include "llrt.hpp" +namespace tt::tt_metal { + class DeviceFixture : public DispatchFixture { protected: void SetUp() override { @@ -113,3 +115,5 @@ class DeviceSingleCardFastSlowDispatchFixture : public DeviceSingleCardFixture { } } }; + +} // namespace tt::tt_metal diff --git a/tests/tt_metal/tt_metal/common/dispatch_fixture.hpp b/tests/tt_metal/tt_metal/common/dispatch_fixture.hpp index af1c0816d09..8eb6be84885 100644 --- a/tests/tt_metal/tt_metal/common/dispatch_fixture.hpp +++ 
b/tests/tt_metal/tt_metal/common/dispatch_fixture.hpp @@ -14,6 +14,8 @@ #include #include "llrt.hpp" +namespace tt::tt_metal { + // A dispatch-agnostic test fixture class DispatchFixture : public ::testing::Test { public: @@ -129,3 +131,5 @@ class DispatchFixture : public ::testing::Test { } } }; + +} // namespace tt::tt_metal diff --git a/tests/tt_metal/tt_metal/common/matmul_test_utils.hpp b/tests/tt_metal/tt_metal/common/matmul_test_utils.hpp index 0090f56cac9..93ff7b61149 100644 --- a/tests/tt_metal/tt_metal/common/matmul_test_utils.hpp +++ b/tests/tt_metal/tt_metal/common/matmul_test_utils.hpp @@ -12,7 +12,7 @@ #include #include "llrt.hpp" -using namespace tt; +namespace tt::tt_metal { inline std::vector select_columns(std::vector data, int M, int K, int N) { if (N == K) { @@ -154,3 +154,5 @@ inline bool move_tiles_to_dram( EnqueueWriteBuffer(cq, buffer, tiles, false); return pass; } + +} // namespace tt::tt_metal diff --git a/tests/tt_metal/tt_metal/common/multi_device_fixture.hpp b/tests/tt_metal/tt_metal/common/multi_device_fixture.hpp index 775f93b1861..16c3299023a 100644 --- a/tests/tt_metal/tt_metal/common/multi_device_fixture.hpp +++ b/tests/tt_metal/tt_metal/common/multi_device_fixture.hpp @@ -15,6 +15,8 @@ #include "umd/device/types/cluster_descriptor_types.h" #include "tt_metal/test_utils/env_vars.hpp" +namespace tt::tt_metal { + class MultiDeviceFixture : public DispatchFixture { protected: void SetUp() override { this->arch_ = tt::get_arch_from_string(tt::test_utils::get_umd_arch_name()); } @@ -126,7 +128,7 @@ class MeshDeviceFixtureBase : public ::testing::Test { magic_enum::enum_name(*config_.mesh_device_type)); } // Use ethernet dispatch for more than 1 CQ on T3K/N300 - DispatchCoreType core_type = (config_.num_cqs >= 2) ? DispatchCoreType::ETH : DispatchCoreType::WORKER; + auto core_type = (config_.num_cqs >= 2) ? 
DispatchCoreType::ETH : DispatchCoreType::WORKER; mesh_device_ = MeshDevice::create( MeshDeviceConfig{.mesh_shape = get_mesh_shape(*mesh_device_type)}, 0, @@ -208,3 +210,5 @@ class T3000MultiCQMeshDeviceFixture : public MeshDeviceFixtureBase { T3000MultiCQMeshDeviceFixture() : MeshDeviceFixtureBase(Config{.mesh_device_type = MeshDeviceType::T3000, .num_cqs = 2}) {} }; + +} // namespace tt::tt_metal diff --git a/tests/tt_metal/tt_metal/debug_tools/debug_tools_fixture.hpp b/tests/tt_metal/tt_metal/debug_tools/debug_tools_fixture.hpp index 67ff4e18018..ee0c19bd6c3 100644 --- a/tests/tt_metal/tt_metal/debug_tools/debug_tools_fixture.hpp +++ b/tests/tt_metal/tt_metal/debug_tools/debug_tools_fixture.hpp @@ -11,6 +11,8 @@ #include "dprint_server.hpp" +namespace tt::tt_metal { + class DebugToolsFixture : public DispatchFixture { protected: bool watcher_previous_enabled; @@ -222,3 +224,5 @@ class WatcherDelayFixture : public WatcherFixture { tt::llrt::RunTimeOptions::get_instance().set_feature_targets(tt::llrt::RunTimeDebugFeatureAtomicDebugDelay, saved_target_selection[tt::llrt::RunTimeDebugFeatureAtomicDebugDelay]); } }; + +} // namespace tt::tt_metal diff --git a/tests/tt_metal/tt_metal/device/galaxy_fixture.hpp b/tests/tt_metal/tt_metal/device/galaxy_fixture.hpp index ee367536a20..892632676d4 100644 --- a/tests/tt_metal/tt_metal/device/galaxy_fixture.hpp +++ b/tests/tt_metal/tt_metal/device/galaxy_fixture.hpp @@ -11,6 +11,8 @@ #include #include "multi_device_fixture.hpp" +namespace tt::tt_metal { + class GalaxyFixture : public MultiDeviceFixture { protected: void SkipTestSuiteIfNotGalaxyMotherboard() { @@ -84,3 +86,5 @@ class TGGFixture : public GalaxyFixture { this->InitializeDevices(); } }; + +} // namespace tt::tt_metal diff --git a/tests/tt_metal/tt_metal/device/test_device.cpp b/tests/tt_metal/tt_metal/device/test_device.cpp index c28a8133507..86de20c81ce 100644 --- a/tests/tt_metal/tt_metal/device/test_device.cpp +++ 
b/tests/tt_metal/tt_metal/device/test_device.cpp @@ -74,6 +74,8 @@ bool dram_ping( } } // namespace unit_tests::basic::device +namespace tt::tt_metal { + TEST_F(DeviceFixture, PingAllLegalDramChannels) { for (unsigned int id = 0; id < num_devices_; id++) { { @@ -274,3 +276,5 @@ TEST_F(DeviceFixture, TensixTestL1ToPCIeAt16BAlignedAddress) { EXPECT_EQ(src, result); } + +} // namespace tt::tt_metal diff --git a/tests/tt_metal/tt_metal/device/test_device_cluster_api.cpp b/tests/tt_metal/tt_metal/device/test_device_cluster_api.cpp index 3b4b0a43cbe..c0a6ad4526e 100644 --- a/tests/tt_metal/tt_metal/device/test_device_cluster_api.cpp +++ b/tests/tt_metal/tt_metal/device/test_device_cluster_api.cpp @@ -10,11 +10,13 @@ #include #include -using namespace tt; -using namespace tt::test_utils; +namespace tt::tt_metal { namespace unit_tests::multichip::cluster { +using namespace tt; +using namespace tt::test_utils; + // Run this on Nebula X2 only, validate etherent core apis are correct // Known connectivity: chip 0 (x=9, y=6) <--> chip 1 (x=9, y=0) // chip 0 (x=1, y=6) <--> chip 1 (x=1, y=0) @@ -154,3 +156,5 @@ TEST_F(N300DeviceFixture, ActiveEthValidateEthernetSockets) { EXPECT_ANY_THROW(device_0->get_ethernet_sockets(2)); } } // namespace unit_tests::multichip::cluster + +} // namespace tt::tt_metal diff --git a/tests/tt_metal/tt_metal/device/test_device_init_and_teardown.cpp b/tests/tt_metal/tt_metal/device/test_device_init_and_teardown.cpp index 7e7b4b10465..d300bacf276 100644 --- a/tests/tt_metal/tt_metal/device/test_device_init_and_teardown.cpp +++ b/tests/tt_metal/tt_metal/device/test_device_init_and_teardown.cpp @@ -26,7 +26,7 @@ void launch_program(tt_metal::IDevice* device, tt_metal::Program& program) { if (getenv("TT_METAL_SLOW_DISPATCH_MODE")) { tt_metal::detail::LaunchProgram(device, program); } else { - CommandQueue& cq = device->command_queue(); + tt::tt_metal::CommandQueue& cq = device->command_queue(); EnqueueProgram(cq, program, false); Finish(cq); } @@ -44,21 
+44,25 @@ bool load_all_blank_kernels(tt_metal::IDevice* device) { program, "tt_metal/kernels/dataflow/blank.cpp", all_cores, - DataMovementConfig{.processor = DataMovementProcessor::RISCV_1, .noc = NOC::RISCV_1_default}); + tt::tt_metal::DataMovementConfig{ + .processor = tt::tt_metal::DataMovementProcessor::RISCV_1, .noc = tt::tt_metal::NOC::RISCV_1_default}); CreateKernel( program, "tt_metal/kernels/dataflow/blank.cpp", all_cores, - DataMovementConfig{.processor = DataMovementProcessor::RISCV_0, .noc = NOC::RISCV_0_default}); + tt::tt_metal::DataMovementConfig{ + .processor = tt::tt_metal::DataMovementProcessor::RISCV_0, .noc = tt::tt_metal::NOC::RISCV_0_default}); - CreateKernel(program, "tt_metal/kernels/compute/blank.cpp", all_cores, ComputeConfig{}); + CreateKernel(program, "tt_metal/kernels/compute/blank.cpp", all_cores, tt::tt_metal::ComputeConfig{}); unit_tests_common::basic::test_device_init::launch_program(device, program); return pass; } } // namespace unit_tests_common::basic::test_device_init +namespace tt::tt_metal { + INSTANTIATE_TEST_SUITE_P(DeviceInit, DeviceParamFixture, ::testing::Values(1, tt::tt_metal::GetNumAvailableDevices())); TEST_P(DeviceParamFixture, DeviceInitializeAndTeardown) { @@ -101,3 +105,5 @@ TEST_P(DeviceParamFixture, TensixDeviceLoadBlankKernels) { ASSERT_TRUE(tt::tt_metal::CloseDevice(device)); } } + +} // namespace tt::tt_metal diff --git a/tests/tt_metal/tt_metal/device/test_device_pool.cpp b/tests/tt_metal/tt_metal/device/test_device_pool.cpp index c7d2eb8e3bf..e833ea69f91 100644 --- a/tests/tt_metal/tt_metal/device/test_device_pool.cpp +++ b/tests/tt_metal/tt_metal/device/test_device_pool.cpp @@ -9,6 +9,8 @@ using namespace tt; +namespace tt::tt_metal { + TEST(DevicePool, DevicePoolOpenClose) { std::vector device_ids{0}; int num_hw_cqs = 1; @@ -129,3 +131,5 @@ TEST(DevicePool, DevicePoolReduceDevices) { ASSERT_TRUE(dev->is_initialized()); DevicePool::instance().close_device(0); } + +} // namespace tt::tt_metal diff --git 
a/tests/tt_metal/tt_metal/device/test_galaxy_cluster_api.cpp b/tests/tt_metal/tt_metal/device/test_galaxy_cluster_api.cpp index f645182a350..5c9de159d57 100644 --- a/tests/tt_metal/tt_metal/device/test_galaxy_cluster_api.cpp +++ b/tests/tt_metal/tt_metal/device/test_galaxy_cluster_api.cpp @@ -39,6 +39,8 @@ std::unordered_set get_ethernet_connected_device_ids(const chip_id_t return connected_device_ids; } +namespace tt::tt_metal { + // Validate that every pair of adjacent galaxy chips has 4 links between them // The reason that this test is in TGFixture instead of GalaxyFixture is // because there are 2 links between adjacent Galaxy chips that are on different @@ -192,3 +194,5 @@ TEST_F(TGGFixture, ValidateChipBoardTypes) { ASSERT_TRUE(num_galaxy_chips == 64) << "Detected " << num_galaxy_chips << " Galaxy chips" << std::endl; ASSERT_TRUE(num_n150_chips == 8) << "Detected " << num_n150_chips << " N150 chips" << std::endl; } + +} // namespace tt::tt_metal diff --git a/tests/tt_metal/tt_metal/dispatch/dispatch_buffer/test_sub_device.cpp b/tests/tt_metal/tt_metal/dispatch/dispatch_buffer/test_sub_device.cpp index 12c3bdaa3cd..c55e6806254 100644 --- a/tests/tt_metal/tt_metal/dispatch/dispatch_buffer/test_sub_device.cpp +++ b/tests/tt_metal/tt_metal/dispatch/dispatch_buffer/test_sub_device.cpp @@ -17,6 +17,8 @@ #include "tt_metal/test_utils/stimulus.hpp" #include "command_queue_fixture.hpp" +namespace tt::tt_metal { + TEST_F(CommandQueueSingleCardFixture, TensixTestSubDeviceAllocations) { uint32_t local_l1_size = 3200; SubDevice sub_device_1(std::array{CoreRangeSet(CoreRange({0, 0}, {2, 2}))}); @@ -139,3 +141,5 @@ TEST_F(CommandQueueSingleCardFixture, TensixTestSubDeviceBankIds) { EXPECT_EQ(global_bank_id, sub_device_bank_id); } } + +} // namespace tt::tt_metal diff --git a/tests/tt_metal/tt_metal/dispatch/dispatch_program/program_with_kernel_created_from_string_fixture.hpp 
b/tests/tt_metal/tt_metal/dispatch/dispatch_program/program_with_kernel_created_from_string_fixture.hpp index c0d33a0a429..b8a1aa3ac22 100644 --- a/tests/tt_metal/tt_metal/dispatch/dispatch_program/program_with_kernel_created_from_string_fixture.hpp +++ b/tests/tt_metal/tt_metal/dispatch/dispatch_program/program_with_kernel_created_from_string_fixture.hpp @@ -7,6 +7,8 @@ #include #include "dispatch_fixture.hpp" +using namespace tt::tt_metal; + class ProgramWithKernelCreatedFromStringFixture : public DispatchFixture { protected: void SetUp() override { diff --git a/tests/tt_metal/tt_metal/dispatch/dispatch_program/test_dispatch.cpp b/tests/tt_metal/tt_metal/dispatch/dispatch_program/test_dispatch.cpp index e6b6f8c2829..acb6341d55d 100644 --- a/tests/tt_metal/tt_metal/dispatch/dispatch_program/test_dispatch.cpp +++ b/tests/tt_metal/tt_metal/dispatch/dispatch_program/test_dispatch.cpp @@ -13,19 +13,19 @@ using std::vector; // Test sync w/ semaphores betweeen eth/tensix cores // Test will hang in the kernel if the sync doesn't work properly static void test_sems_across_core_types( - DispatchFixture* fixture, vector& devices, bool active_eth) { + tt::tt_metal::DispatchFixture* fixture, vector& devices, bool active_eth) { // just something unique... constexpr uint32_t eth_sem_init_val = 33; constexpr uint32_t tensix_sem_init_val = 102; vector compile_args; if (active_eth) { - compile_args.push_back(static_cast(HalProgrammableCoreType::ACTIVE_ETH)); + compile_args.push_back(static_cast(tt::tt_metal::HalProgrammableCoreType::ACTIVE_ETH)); } else { - compile_args.push_back(static_cast(HalProgrammableCoreType::IDLE_ETH)); + compile_args.push_back(static_cast(tt::tt_metal::HalProgrammableCoreType::IDLE_ETH)); } - for (IDevice* device : devices) { + for (tt::tt_metal::IDevice* device : devices) { if (not device->is_mmio_capable()) { continue; } @@ -33,7 +33,7 @@ static void test_sems_across_core_types( const auto& eth_cores = active_eth ? 
device->get_active_ethernet_cores() : device->get_inactive_ethernet_cores(); if (eth_cores.size() > 0) { - Program program = CreateProgram(); + auto program = tt::tt_metal::CreateProgram(); CoreCoord eth_core = *eth_cores.begin(); CoreCoord phys_eth_core = device->virtual_core_from_logical_core(eth_core, CoreType::ETH); @@ -43,8 +43,8 @@ static void test_sems_across_core_types( "tests/tt_metal/tt_metal/test_kernels/dataflow/semaphore_across_core_types.cpp", eth_core, tt::tt_metal::EthernetConfig{ - .eth_mode = active_eth ? Eth::RECEIVER : Eth::IDLE, - .noc = NOC::NOC_0, + .eth_mode = active_eth ? tt::tt_metal::Eth::RECEIVER : tt::tt_metal::Eth::IDLE, + .noc = tt::tt_metal::NOC::NOC_0, .compile_args = compile_args, }); @@ -55,15 +55,15 @@ static void test_sems_across_core_types( program, "tests/tt_metal/tt_metal/test_kernels/dataflow/semaphore_across_core_types.cpp", tensix_core, - DataMovementConfig{ - .processor = DataMovementProcessor::RISCV_0, - .noc = NOC::RISCV_0_default, + tt::tt_metal::DataMovementConfig{ + .processor = tt::tt_metal::DataMovementProcessor::RISCV_0, + .noc = tt::tt_metal::NOC::RISCV_0_default, .compile_args = compile_args, }); // Set up args vector eth_rtas = { - hal.noc_xy_encoding(phys_tensix_core.x, phys_tensix_core.y), + tt::tt_metal::hal.noc_xy_encoding(phys_tensix_core.x, phys_tensix_core.y), eth_sem_id, tensix_sem_id, eth_sem_init_val, @@ -79,7 +79,7 @@ static void test_sems_across_core_types( SetRuntimeArgs(program, eth_kernel, eth_core, eth_rtas); vector tensix_rtas = { - hal.noc_xy_encoding(phys_eth_core.x, phys_eth_core.y), + tt::tt_metal::hal.noc_xy_encoding(phys_eth_core.x, phys_eth_core.y), tensix_sem_id, eth_sem_id, tensix_sem_init_val, @@ -91,6 +91,8 @@ static void test_sems_across_core_types( } } +namespace tt::tt_metal { + TEST_F(DispatchFixture, EthTestBlank) { IDevice* device = devices_[0]; Program program = CreateProgram(); @@ -263,3 +265,5 @@ TEST_F(DispatchFixture, TensixActiveEthTestCBsAcrossDifferentCoreTypes) { 
EXPECT_TRUE(pass_out); } } + +} // namespace tt::tt_metal diff --git a/tests/tt_metal/tt_metal/dispatch/dispatch_program/test_dispatch_program_with_kernel_created_from_string.cpp b/tests/tt_metal/tt_metal/dispatch/dispatch_program/test_dispatch_program_with_kernel_created_from_string.cpp index c06089e20b9..096c2a2fdb2 100644 --- a/tests/tt_metal/tt_metal/dispatch/dispatch_program/test_dispatch_program_with_kernel_created_from_string.cpp +++ b/tests/tt_metal/tt_metal/dispatch/dispatch_program/test_dispatch_program_with_kernel_created_from_string.cpp @@ -14,6 +14,10 @@ #include "umd/device/types/cluster_descriptor_types.h" #include "program_with_kernel_created_from_string_fixture.hpp" +using namespace tt; + +namespace tt::tt_metal { + TEST_F(ProgramWithKernelCreatedFromStringFixture, TensixDataMovementKernel) { const CoreRange cores({0, 0}, {1, 1}); const string& kernel_src_code = R"( @@ -30,11 +34,11 @@ TEST_F(ProgramWithKernelCreatedFromStringFixture, TensixDataMovementKernel) { for (IDevice* device : this->devices_) { Program program = CreateProgram(); - tt_metal::CreateKernelFromString( + CreateKernelFromString( program, kernel_src_code, cores, - tt_metal::DataMovementConfig{.processor = DataMovementProcessor::RISCV_1, .noc = NOC::RISCV_1_default}); + DataMovementConfig{.processor = DataMovementProcessor::RISCV_1, .noc = NOC::RISCV_1_default}); this->RunProgram(device, program); }; } @@ -58,11 +62,11 @@ TEST_F(ProgramWithKernelCreatedFromStringFixture, TensixComputeKernel) { for (IDevice* device : this->devices_) { Program program = CreateProgram(); - tt_metal::CreateKernelFromString( + CreateKernelFromString( program, kernel_src_code, cores, - tt_metal::ComputeConfig{ + ComputeConfig{ .math_fidelity = MathFidelity::HiFi4, .fp32_dest_acc_en = false, .math_approx_mode = false, @@ -99,3 +103,5 @@ TEST_F(ProgramWithKernelCreatedFromStringFixture, ActiveEthEthernetKernel) { this->RunProgram(device, program); }; } + +} // namespace tt::tt_metal diff --git 
a/tests/tt_metal/tt_metal/dispatch/dispatch_program/test_global_circular_buffers.cpp b/tests/tt_metal/tt_metal/dispatch/dispatch_program/test_global_circular_buffers.cpp index d1631530f53..cd1f7f13a9b 100644 --- a/tests/tt_metal/tt_metal/dispatch/dispatch_program/test_global_circular_buffers.cpp +++ b/tests/tt_metal/tt_metal/dispatch/dispatch_program/test_global_circular_buffers.cpp @@ -14,6 +14,8 @@ #include #include "tt_metal/include/tt_metal/program.hpp" +namespace tt::tt_metal { + TEST_F(DispatchFixture, TensixProgramGlobalCircularBuffers) { CoreCoord sender_core = CoreCoord(0, 0); CoreRangeSet sender_cores = CoreRangeSet(CoreRange(sender_core)); @@ -119,3 +121,5 @@ TEST_F(DispatchFixture, TensixProgramGlobalCircularBuffers) { } this->RunProgram(device, program); } + +} // namespace tt::tt_metal diff --git a/tests/tt_metal/tt_metal/dispatch/dispatch_program/test_sub_device.cpp b/tests/tt_metal/tt_metal/dispatch/dispatch_program/test_sub_device.cpp index a7867ad2402..bb778b172ce 100644 --- a/tests/tt_metal/tt_metal/dispatch/dispatch_program/test_sub_device.cpp +++ b/tests/tt_metal/tt_metal/dispatch/dispatch_program/test_sub_device.cpp @@ -19,6 +19,8 @@ #include "sub_device_test_utils.hpp" #include "dispatch_test_utils.hpp" +namespace tt::tt_metal { + TEST_F(CommandQueueSingleCardFixture, TensixTestSubDeviceSynchronization) { auto* device = devices_[0]; uint32_t local_l1_size = 3200; @@ -142,3 +144,5 @@ TEST_F(CommandQueueSingleCardFixture, TensixActiveEthTestSubDeviceBasicEthProgra Synchronize(device); detail::DumpDeviceProfileResults(device); } + +} // namespace tt::tt_metal diff --git a/tests/tt_metal/tt_metal/dispatch/dispatch_test_utils.hpp b/tests/tt_metal/tt_metal/dispatch/dispatch_test_utils.hpp index be77eb429a0..a506412309f 100644 --- a/tests/tt_metal/tt_metal/dispatch/dispatch_test_utils.hpp +++ b/tests/tt_metal/tt_metal/dispatch/dispatch_test_utils.hpp @@ -7,6 +7,8 @@ #include #include +namespace tt::tt_metal { + struct TestBufferConfig { uint32_t 
num_pages; uint32_t page_size; @@ -93,3 +95,5 @@ inline std::pair, std::vector> create_runtime_ar return create_runtime_args(num_rt_args_unique, num_rt_args_common, unique_base, common_base); } + +} // namespace tt::tt_metal diff --git a/tests/tt_metal/tt_metal/dispatch/dispatch_trace/test_sub_device.cpp b/tests/tt_metal/tt_metal/dispatch/dispatch_trace/test_sub_device.cpp index 4da16fbae0f..8cc81ceec88 100644 --- a/tests/tt_metal/tt_metal/dispatch/dispatch_trace/test_sub_device.cpp +++ b/tests/tt_metal/tt_metal/dispatch/dispatch_trace/test_sub_device.cpp @@ -18,6 +18,8 @@ #include "dispatch_test_utils.hpp" #include "sub_device_test_utils.hpp" +namespace tt::tt_metal { + TEST_F(CommandQueueSingleCardTraceFixture, TensixTestSubDeviceTraceBasicPrograms) { auto* device = devices_[0]; SubDevice sub_device_1(std::array{CoreRangeSet(CoreRange({0, 0}, {2, 2}))}); @@ -306,3 +308,5 @@ TEST_F(CommandQueueSingleCardTraceFixture, TensixTestSubDeviceIllegalOperations) device->remove_sub_device_manager(sub_device_manager_1); EXPECT_THROW(device->load_sub_device_manager(sub_device_manager_1), std::exception); } + +} // namespace tt::tt_metal diff --git a/tests/tt_metal/tt_metal/dispatch/dispatch_util/test_device_command.cpp b/tests/tt_metal/tt_metal/dispatch/dispatch_util/test_device_command.cpp index 8a5c67497ba..ac0fbbf1c40 100644 --- a/tests/tt_metal/tt_metal/dispatch/dispatch_util/test_device_command.cpp +++ b/tests/tt_metal/tt_metal/dispatch/dispatch_util/test_device_command.cpp @@ -8,6 +8,8 @@ #include #include "tt_metal/impl/dispatch/device_command_calculator.hpp" +namespace tt::tt_metal { + TEST(DeviceCommandTest, AddDispatchWait) { DeviceCommandCalculator calculator; calculator.add_dispatch_wait(); @@ -275,3 +277,5 @@ TYPED_TEST(WritePackedCommandTest, RandomAddDispatchWritePacked) { EXPECT_EQ(command.size_bytes(), command.write_offset_bytes()); } } + +} // namespace tt::tt_metal diff --git a/tests/tt_metal/tt_metal/dispatch/dispatch_util/test_dispatch_settings.cpp 
b/tests/tt_metal/tt_metal/dispatch/dispatch_util/test_dispatch_settings.cpp index 94889e85584..963ee5c9d94 100644 --- a/tests/tt_metal/tt_metal/dispatch/dispatch_util/test_dispatch_settings.cpp +++ b/tests/tt_metal/tt_metal/dispatch/dispatch_util/test_dispatch_settings.cpp @@ -11,7 +11,7 @@ #include #include "umd/device/tt_core_coordinates.h" -using namespace tt::tt_metal; +namespace tt::tt_metal { // Loop through test_func for WORKER, ETH X 1, 2 CQs void ForEachCoreTypeXHWCQs(const std::function& test_func) { @@ -160,3 +160,5 @@ TEST_F(CommandQueueSingleCardFixture, TestDispatchSettingsMutations) { DispatchSettings::initialize(original_settings); } + +} // namespace tt::tt_metal diff --git a/tests/tt_metal/tt_metal/dispatch/multi_command_queue_fixture.hpp b/tests/tt_metal/tt_metal/dispatch/multi_command_queue_fixture.hpp index fe398b8a90a..200bce2330b 100644 --- a/tests/tt_metal/tt_metal/dispatch/multi_command_queue_fixture.hpp +++ b/tests/tt_metal/tt_metal/dispatch/multi_command_queue_fixture.hpp @@ -17,6 +17,8 @@ #include #include +namespace tt::tt_metal { + class MultiCommandQueueSingleDeviceFixture : public DispatchFixture { protected: void SetUp() override { @@ -152,3 +154,5 @@ class MultiCommandQueueMultiDeviceFixture : public DispatchFixture { class MultiCommandQueueMultiDeviceBufferFixture : public MultiCommandQueueMultiDeviceFixture {}; class MultiCommandQueueMultiDeviceEventFixture : public MultiCommandQueueMultiDeviceFixture {}; + +} // namespace tt::tt_metal diff --git a/tests/tt_metal/tt_metal/dispatch/random_program_fixture.hpp b/tests/tt_metal/tt_metal/dispatch/random_program_fixture.hpp index ccef1c93422..12680b39c0b 100644 --- a/tests/tt_metal/tt_metal/dispatch/random_program_fixture.hpp +++ b/tests/tt_metal/tt_metal/dispatch/random_program_fixture.hpp @@ -14,6 +14,8 @@ #include #include "dispatch_test_utils.hpp" +namespace tt::tt_metal { + class RandomProgramFixture : virtual public CommandQueueSingleCardProgramFixture { protected: static const 
uint32_t MIN_KERNEL_SIZE_BYTES = 20; @@ -387,3 +389,5 @@ class RandomProgramTraceFixture : virtual public RandomProgramFixture, } } }; + +} // namespace tt::tt_metal diff --git a/tests/tt_metal/tt_metal/dispatch/sub_device_test_utils.hpp b/tests/tt_metal/tt_metal/dispatch/sub_device_test_utils.hpp index d5b27e598fd..0f3fbbe7b98 100644 --- a/tests/tt_metal/tt_metal/dispatch/sub_device_test_utils.hpp +++ b/tests/tt_metal/tt_metal/dispatch/sub_device_test_utils.hpp @@ -8,8 +8,10 @@ #include #include +namespace tt::tt_metal { + inline std::tuple create_single_sync_program( - IDevice* device, SubDevice sub_device) { + IDevice* device, const SubDevice& sub_device) { auto syncer_coord = sub_device.cores(HalProgrammableCoreType::TENSIX).ranges().at(0).start_coord; auto syncer_core = CoreRangeSet(CoreRange(syncer_coord, syncer_coord)); auto global_sem = CreateGlobalSemaphore(device, sub_device.cores(HalProgrammableCoreType::TENSIX), INVALID); @@ -128,3 +130,5 @@ inline std::tuple create_basic_eth_s return { std::move(waiter_program), std::move(syncer_program), std::move(incrementer_program), std::move(global_sem)}; } + +} // namespace tt::tt_metal diff --git a/tests/tt_metal/tt_metal/eth/test_basic_eth.cpp b/tests/tt_metal/tt_metal/eth/test_basic_eth.cpp index 55c0d2dec15..377a6568182 100644 --- a/tests/tt_metal/tt_metal/eth/test_basic_eth.cpp +++ b/tests/tt_metal/tt_metal/eth/test_basic_eth.cpp @@ -41,7 +41,7 @@ namespace unit_tests::erisc::kernels { */ bool reader_kernel_no_send( - DispatchFixture* fixture, + tt_metal::DispatchFixture* fixture, tt_metal::IDevice* device, const size_t& byte_size, const size_t& eth_l1_byte_address, @@ -107,7 +107,7 @@ bool reader_kernel_no_send( } bool writer_kernel_no_receive( - DispatchFixture* fixture, + tt_metal::DispatchFixture* fixture, tt_metal::IDevice* device, const size_t& byte_size, const size_t& eth_l1_byte_address, @@ -278,6 +278,8 @@ bool noc_reader_and_writer_kernels( } // namespace unit_tests::erisc::kernels +namespace 
tt::tt_metal { + TEST_F(CommandQueueSingleCardProgramFixture, ActiveEthKernelsNocReadNoSend) { using namespace CMAKE_UNIQUE_NAMESPACE; const size_t src_eth_l1_byte_address = eth_l1_mem::address_map::ERISC_L1_UNRESERVED_BASE; @@ -489,3 +491,5 @@ TEST_F(BlackholeSingleCardFixture, IdleEthKernelOnBothIdleEriscs) { erisc1_ethernet_config)); } } + +} // namespace tt::tt_metal diff --git a/tests/tt_metal/tt_metal/eth/test_buffer_movement_kernels.cpp b/tests/tt_metal/tt_metal/eth/test_buffer_movement_kernels.cpp index 46157f6524a..f40c4d70d59 100644 --- a/tests/tt_metal/tt_metal/eth/test_buffer_movement_kernels.cpp +++ b/tests/tt_metal/tt_metal/eth/test_buffer_movement_kernels.cpp @@ -40,7 +40,7 @@ struct BankedConfig { namespace unit_tests::erisc::kernels { bool chip_to_chip_dram_buffer_transfer( - DispatchFixture* fixture, + tt_metal::DispatchFixture* fixture, tt_metal::IDevice* sender_device, tt_metal::IDevice* receiver_device, const CoreCoord& eth_sender_core, @@ -171,7 +171,7 @@ bool chip_to_chip_dram_buffer_transfer( } bool chip_to_chip_interleaved_buffer_transfer( - DispatchFixture* fixture, + tt_metal::DispatchFixture* fixture, tt_metal::IDevice* sender_device, tt_metal::IDevice* receiver_device, const CoreCoord& eth_sender_core, @@ -210,7 +210,7 @@ bool chip_to_chip_interleaved_buffer_transfer( .page_size = cfg.page_size_bytes, .buffer_type = cfg.output_buffer_type}; auto input_buffer = CreateBuffer(sender_config); - bool input_is_dram = cfg.input_buffer_type == BufferType::DRAM; + bool input_is_dram = cfg.input_buffer_type == tt_metal::BufferType::DRAM; fixture->WriteBuffer(sender_device, input_buffer, input_packed); @@ -244,7 +244,7 @@ bool chip_to_chip_interleaved_buffer_transfer( tt_metal::Program receiver_program = tt_metal::Program(); auto output_buffer = CreateBuffer(receiver_config); - bool output_is_dram = cfg.output_buffer_type == BufferType::DRAM; + bool output_is_dram = cfg.output_buffer_type == tt_metal::BufferType::DRAM; std::vector 
all_zeros(cfg.size_bytes / sizeof(uint32_t), 0); tt_metal::detail::WriteToBuffer(output_buffer, all_zeros); @@ -300,6 +300,8 @@ bool chip_to_chip_interleaved_buffer_transfer( } // namespace unit_tests::erisc::kernels +namespace tt::tt_metal { + TEST_F(TwoDeviceFixture, ActiveEthKernelsSendDramBufferChip0ToChip1) { if (arch_ == ARCH::BLACKHOLE) { GTEST_SKIP() << "See GH Issue #18384"; @@ -659,3 +661,5 @@ TEST_F(CommandQueueMultiDeviceProgramFixture, ActiveEthKernelsSendInterleavedBuf } } } + +} // namespace tt::tt_metal diff --git a/tests/tt_metal/tt_metal/eth/test_erisc_app_direct_send.cpp b/tests/tt_metal/tt_metal/eth/test_erisc_app_direct_send.cpp index c3657b3e012..3537e8374e9 100644 --- a/tests/tt_metal/tt_metal/eth/test_erisc_app_direct_send.cpp +++ b/tests/tt_metal/tt_metal/eth/test_erisc_app_direct_send.cpp @@ -50,7 +50,7 @@ const size_t get_rand_32_byte_aligned_address(const size_t& base, const size_t& } bool eth_direct_sender_receiver_kernels( - DispatchFixture* fixture, + tt_metal::DispatchFixture* fixture, tt_metal::IDevice* sender_device, tt_metal::IDevice* receiver_device, const size_t& byte_size, @@ -227,11 +227,12 @@ bool send_over_eth( receiver_device->id(), receiver_core, args_1, eth_l1_mem::address_map::ERISC_APP_SYNC_INFO_BASE); // TODO: this should be updated to use kernel api - uint32_t active_eth_index = hal.get_programmable_core_type_index(HalProgrammableCoreType::ACTIVE_ETH); - auto sender_firmware_path = BuildEnvManager::get_instance() + uint32_t active_eth_index = + tt_metal::hal.get_programmable_core_type_index(tt_metal::HalProgrammableCoreType::ACTIVE_ETH); + auto sender_firmware_path = tt_metal::BuildEnvManager::get_instance() .get_firmware_build_state(sender_device->build_id(), active_eth_index, 0, 0) .get_target_out_path(""); - auto receiver_firmware_path = BuildEnvManager::get_instance() + auto receiver_firmware_path = tt_metal::BuildEnvManager::get_instance() .get_firmware_build_state(receiver_device->build_id(), active_eth_index, 
0, 0) .get_target_out_path(""); const ll_api::memory& binary_mem_send = llrt::get_risc_binary(sender_firmware_path); @@ -270,6 +271,8 @@ bool send_over_eth( } // namespace unit_tests::erisc::direct_send +namespace tt::tt_metal { + TEST_F(N300DeviceFixture, ActiveEthSingleCoreDirectSendChip0ToChip1) { using namespace CMAKE_UNIQUE_NAMESPACE; GTEST_SKIP(); @@ -875,3 +878,5 @@ TEST_F(CommandQueueMultiDeviceProgramFixture, ActiveEthKernelsDirectSendAllConne } } } + +} // namespace tt::tt_metal diff --git a/tests/tt_metal/tt_metal/eth/test_ring_gather_kernels.cpp b/tests/tt_metal/tt_metal/eth/test_ring_gather_kernels.cpp index 2d9d997c6c7..e8299446a58 100644 --- a/tests/tt_metal/tt_metal/eth/test_ring_gather_kernels.cpp +++ b/tests/tt_metal/tt_metal/eth/test_ring_gather_kernels.cpp @@ -35,8 +35,8 @@ struct BankedConfig { size_t num_pages = 1; size_t size_bytes = 1 * 2 * 32 * 32; size_t page_size_bytes = 2 * 32 * 32; - BufferType input_buffer_type = BufferType::L1; - BufferType output_buffer_type = BufferType::L1; + tt_metal::BufferType input_buffer_type = tt_metal::BufferType::L1; + tt_metal::BufferType output_buffer_type = tt_metal::BufferType::L1; tt::DataFormat l1_data_format = tt::DataFormat::Float16_b; }; @@ -81,7 +81,7 @@ std::vector get_hamiltonian_cycle(vector>& adj, int N, int s = return {}; } -std::vector get_device_ring(std::vector devices) { +std::vector get_device_ring(std::vector devices) { std::vector> adj(devices.size(), std::vector(devices.size(), 0)); for (uint32_t i = 0; i < devices.size(); ++i) { const auto& device = devices[i]; @@ -96,7 +96,7 @@ std::vector get_device_ring(std::vector devic } const auto& device_ring_idx = get_hamiltonian_cycle(adj, devices.size(), 0); - std::vector device_ring; + std::vector device_ring; device_ring.reserve(device_ring_idx.size()); for (const auto& device_idx : device_ring_idx) { device_ring.push_back(devices[device_idx]); @@ -104,9 +104,9 @@ std::vector get_device_ring(std::vector devic return device_ring; } 
-std::vector> get_sender_receiver_cores( - std::vector device_ring) { - std::vector> sender_receivers; +std::vector> get_sender_receiver_cores( + std::vector device_ring) { + std::vector> sender_receivers; sender_receivers.reserve(device_ring.size() - 1); // Special case for 2 devices to ensure core pairs are not the same for send and receive @@ -117,7 +117,7 @@ std::vector> get_sender_rec for (const auto& first_eth_core : first_device->get_active_ethernet_cores(true)) { auto [device_id, second_eth_core] = first_device->get_connected_ethernet_core(first_eth_core); if (second_device->id() == device_id) { - IDevice* sender_device, *receiver_device; + tt_metal::IDevice *sender_device, *receiver_device; CoreCoord sender_eth_core, receiver_eth_core; if (i == 0) { sender_device = first_device, receiver_device = second_device; @@ -337,7 +337,7 @@ bool eth_interleaved_ring_gather_sender_receiver_kernels( std::vector full_input; full_input.reserve(numel * sender_receivers.size()); - std::vector> output_buffers; + std::vector> output_buffers; output_buffers.reserve(sender_receivers.size()); for (uint32_t i = 0; i < sender_receivers.size(); ++i) { @@ -357,11 +357,11 @@ bool eth_interleaved_ring_gather_sender_receiver_kernels( auto& program = programs[device->id()]; - auto input_buffer = - CreateBuffer(InterleavedBufferConfig{device, cfg.size_bytes, cfg.page_size_bytes, cfg.input_buffer_type}); - bool input_is_dram = cfg.input_buffer_type == BufferType::DRAM; + auto input_buffer = CreateBuffer( + tt_metal::InterleavedBufferConfig{device, cfg.size_bytes, cfg.page_size_bytes, cfg.input_buffer_type}); + bool input_is_dram = cfg.input_buffer_type == tt_metal::BufferType::DRAM; tt_metal::detail::WriteToBuffer(input_buffer, inputs[i]); - output_buffers.emplace_back(CreateBuffer(InterleavedBufferConfig{ + output_buffers.emplace_back(CreateBuffer(tt_metal::InterleavedBufferConfig{ device, cfg.size_bytes * sender_receivers.size(), cfg.page_size_bytes, cfg.output_buffer_type})); 
tt_metal::detail::WriteToBuffer(output_buffers[i], all_zeros); @@ -376,8 +376,8 @@ bool eth_interleaved_ring_gather_sender_receiver_kernels( uint32_t(num_bytes_per_send >> 4), uint32_t(device->ethernet_core_from_logical_core(eth_receiver_core).x), uint32_t(device->ethernet_core_from_logical_core(eth_receiver_core).y), - uint32_t(input_buffer->buffer_type() == BufferType::DRAM), - uint32_t(output_buffers[i]->buffer_type() == BufferType::DRAM)}}); + uint32_t(input_buffer->buffer_type() == tt_metal::BufferType::DRAM), + uint32_t(output_buffers[i]->buffer_type() == tt_metal::BufferType::DRAM)}}); tt_metal::SetRuntimeArgs( program, @@ -415,7 +415,8 @@ bool eth_interleaved_ring_gather_sender_receiver_kernels( uint32_t(device->ethernet_core_from_logical_core(eth_sender_core).x), uint32_t(device->ethernet_core_from_logical_core(eth_sender_core).y), uint32_t( - output_buffers[i]->buffer_type() == BufferType::DRAM)}}); // probably want to use NOC_1 here + output_buffers[i]->buffer_type() == + tt_metal::BufferType::DRAM)}}); // probably want to use NOC_1 here tt_metal::SetRuntimeArgs( program, @@ -463,6 +464,8 @@ bool eth_interleaved_ring_gather_sender_receiver_kernels( } // namespace unit_tests::erisc::kernels +namespace tt::tt_metal { + TEST_F(DeviceFixture, ActiveEthKernelsDirectRingGatherAllChips) { const size_t src_eth_l1_byte_address = eth_l1_mem::address_map::ERISC_L1_UNRESERVED_BASE + 32; const size_t dst_eth_l1_byte_address = eth_l1_mem::address_map::ERISC_L1_UNRESERVED_BASE + 32; @@ -488,3 +491,5 @@ TEST_F(DeviceFixture, ActiveEthKernelsInterleavedRingGatherAllChips) { ASSERT_TRUE(unit_tests::erisc::kernels::eth_interleaved_ring_gather_sender_receiver_kernels( device_ring, test_config, src_eth_l1_byte_address, dst_eth_l1_byte_address, sem_l1_byte_address)); } + +} // namespace tt::tt_metal diff --git a/tests/tt_metal/tt_metal/integration/matmul/test_matmul_X_tile.cpp b/tests/tt_metal/tt_metal/integration/matmul/test_matmul_X_tile.cpp index a78af1fe2e3..e059eed2d05 
100644 --- a/tests/tt_metal/tt_metal/integration/matmul/test_matmul_X_tile.cpp +++ b/tests/tt_metal/tt_metal/integration/matmul/test_matmul_X_tile.cpp @@ -51,7 +51,7 @@ void create_test_stimuli(MatmulTileStimuli& stimuli, uint32_t M, uint32_t K, uin auto activations_tilized = test_utils::tilize(tensor.get_values(), M * 32, K * 32); auto activations_tile_layout = convert_to_tile_layout(activations_tilized); auto activations = pack_bfloat16_vec_into_uint32_vec(activations_tile_layout); - auto activations_tile_transposed = transpose_tiles(activations, M, K, 1); + auto activations_tile_transposed = tt::tt_metal::transpose_tiles(activations, M, K, 1); stimuli.a = activations_tile_transposed; auto identity = create_identity_matrix(K * 32, N * 32, std::min(K, N) * 32); @@ -82,7 +82,7 @@ void set_math_fid_masks(uint16_t& math_fid_mask, MathFidelity math_fidelity = Ma } void matmul_tile( - DispatchFixture* fixture, + tt_metal::DispatchFixture* fixture, tt_metal::IDevice* device, const MatmulTileConfig& cfg, vector activations, @@ -327,6 +327,8 @@ void matmul_tile( } } // namespace unit_tests_common::matmul::test_matmul_X_tile +namespace tt::tt_metal { + using namespace tt::test_utils; using namespace unit_tests_common::matmul::test_matmul_X_tile; @@ -521,3 +523,5 @@ TEST_F(DispatchFixture, TensixMatmulBlockInitShortWithDt) { } } } + +} // namespace tt::tt_metal diff --git a/tests/tt_metal/tt_metal/integration/matmul/test_matmul_large_block.cpp b/tests/tt_metal/tt_metal/integration/matmul/test_matmul_large_block.cpp index 882bb99b2e6..3a66e0c5674 100644 --- a/tests/tt_metal/tt_metal/integration/matmul/test_matmul_large_block.cpp +++ b/tests/tt_metal/tt_metal/integration/matmul/test_matmul_large_block.cpp @@ -171,7 +171,7 @@ void create_CBs_for_fused_matmul( } bool matmul_large_block( - DispatchFixture* fixture, + tt_metal::DispatchFixture* fixture, tt_metal::IDevice* device, bool activations_rm, bool output_rm, @@ -376,25 +376,27 @@ bool matmul_large_block( if (output_rm) { 
pass &= (golden == result_bfp16); if (not pass) { - print_faces(result_bfp16, "Result"); + tt_metal::print_faces(result_bfp16, "Result"); } } else { auto result_flat_layout = convert_to_flat_layout(result_bfp16); auto result_untilized = test_utils::untilize(result_flat_layout, M * 32, N * 32); pass &= (golden == result_untilized); if (not pass) { - print_faces(result_untilized, "Result"); + tt_metal::print_faces(result_untilized, "Result"); } } if (not pass) { - print_faces(tensor.get_values(), "Golden"); + tt_metal::print_faces(tensor.get_values(), "Golden"); } return pass; } } // namespace unit_tests_common::matmul::test_matmul_large_block +namespace tt::tt_metal { + TEST_F(DispatchFixture, TensixMatmulLargeBlock) { for (uint8_t i = uint8_t(MathFidelity::LoFi); i <= uint8_t(MathFidelity::HiFi4); i++) { if (i == 1) { @@ -417,3 +419,5 @@ TEST_F(DispatchFixture, TensixMatmulLargeBlock) { } } } + +} // namespace tt::tt_metal diff --git a/tests/tt_metal/tt_metal/integration/matmul/test_matmul_multi_core_X_dram.cpp b/tests/tt_metal/tt_metal/integration/matmul/test_matmul_multi_core_X_dram.cpp index 4ea1f70ff71..b57004d338b 100644 --- a/tests/tt_metal/tt_metal/integration/matmul/test_matmul_multi_core_X_dram.cpp +++ b/tests/tt_metal/tt_metal/integration/matmul/test_matmul_multi_core_X_dram.cpp @@ -158,7 +158,7 @@ bool matmul_multi_core_single_dram(tt_metal::IDevice* device) { int per_core_M = M / num_cores_r; int per_core_N = N / num_cores_c; uint32_t single_tile_size = 2 * 1024; - uint32_t dram_unreserved_base = device->allocator()->get_base_allocator_addr(HalMemType::DRAM); + uint32_t dram_unreserved_base = device->allocator()->get_base_allocator_addr(tt_metal::HalMemType::DRAM); log_info(LogTest, "M = {}, N = {}, K = {}", M, N, K); log_info(LogTest, "Activation = {}x{}", M * 32, K * 32); log_info(LogTest, "Weights = {}x{}", K * 32, N * 32); @@ -180,7 +180,7 @@ bool matmul_multi_core_single_dram(tt_metal::IDevice* device) { tt::deprecated::Tensor tensor = 
tt::deprecated::initialize_tensor( shape, tt::deprecated::Initialize::RANDOM, 0, 100, std::chrono::system_clock::now().time_since_epoch().count()); auto identity = create_identity_matrix(K * 32, N * 32, std::min(K, N) * 32); // bflaot16 identity - auto golden = select_columns(tensor.get_values(), M, K, N); + auto golden = tt_metal::select_columns(tensor.get_values(), M, K, N); MatmulConfig matmul_cfg = { .multi_dram = 0, @@ -205,9 +205,10 @@ bool matmul_multi_core_single_dram(tt_metal::IDevice* device) { //////////////////////////////////////////////////////////////////////////// log_debug(LogTest, "Slicing input tensors and copying them to dram along with sending runtime args to device"); for (int i = 0; i < num_cores_r; i++) { - std::vector activation_slice = get_row_slice(tensor.get_values(), num_cores_r, i, M * 32, K * 32); + std::vector activation_slice = + tt_metal::get_row_slice(tensor.get_values(), num_cores_r, i, M * 32, K * 32); for (int j = 0; j < num_cores_c; j++) { - std::vector weights_slice = get_col_slice(identity, num_cores_c, j, K * 32, N * 32); + std::vector weights_slice = tt_metal::get_col_slice(identity, num_cores_c, j, K * 32, N * 32); int core_index = i * num_cores_c + j; CoreCoord core = {(std::size_t)j, (std::size_t)i}; @@ -245,7 +246,7 @@ bool matmul_multi_core_single_dram(tt_metal::IDevice* device) { auto activations_tilized = test_utils::tilize(activation_slice, per_core_M * 32, K * 32); auto activations_tile_layout = convert_to_tile_layout(activations_tilized); auto activations = pack_bfloat16_vec_into_uint32_vec(activations_tile_layout); - auto activations_tile_transposed = transpose_tiles(activations, per_core_M, K, in0_block_w); + auto activations_tile_transposed = tt_metal::transpose_tiles(activations, per_core_M, K, in0_block_w); pass &= tt_metal::detail::WriteToDeviceDRAMChannel( device, dram_src0_channel_id, dram_buffer_src0_addr, activations_tile_transposed); @@ -292,9 +293,9 @@ bool 
matmul_multi_core_single_dram(tt_metal::IDevice* device) { log_debug(LogTest, "Matmul test done"); log_debug(LogTest, "Gathering data back from dram and checking against golden"); for (int i = 0; i < num_cores_r; i++) { - auto golden_row = get_row_slice(golden, num_cores_r, i, M * 32, N * 32); + auto golden_row = tt_metal::get_row_slice(golden, num_cores_r, i, M * 32, N * 32); for (int j = 0; j < num_cores_c; j++) { - auto per_core_golden = get_col_slice(golden_row, num_cores_c, j, per_core_M * 32, N * 32); + auto per_core_golden = tt_metal::get_col_slice(golden_row, num_cores_c, j, per_core_M * 32, N * 32); std::vector result_vec; int core_index = i * num_cores_c + j; uint32_t dram_buffer_dst_addr = @@ -410,7 +411,7 @@ bool assign_runtime_args_to_program( return pass; } -bool matmul_multi_core_multi_dram(DispatchFixture* fixture, tt_metal::IDevice* device) { +bool matmul_multi_core_multi_dram(tt_metal::DispatchFixture* fixture, tt_metal::IDevice* device) { bool pass = true; int num_cores_r = device->compute_with_storage_grid_size().y; int num_cores_c = device->compute_with_storage_grid_size().x; @@ -474,17 +475,20 @@ bool matmul_multi_core_multi_dram(DispatchFixture* fixture, tt_metal::IDevice* d auto activations_tile_layout = convert_to_tile_layout(activations_tilized); auto activations = pack_bfloat16_vec_into_uint32_vec(activations_tile_layout); - auto activation_buffer = Buffer::create(device, activations.size() * sizeof(uint32_t), 1024 * 2, BufferType::DRAM); + auto activation_buffer = + tt_metal::Buffer::create(device, activations.size() * sizeof(uint32_t), 1024 * 2, tt_metal::BufferType::DRAM); pass &= move_tiles_to_dram(device, activations, M, K, activation_buffer); auto identity_tilized = test_utils::tilize(identity, K * 32, N * 32); auto weights_tile_layout = convert_to_tile_layout(identity_tilized); auto weights = pack_bfloat16_vec_into_uint32_vec(weights_tile_layout); - auto weight_buffer = Buffer::create(device, weights.size() * sizeof(uint32_t), 1024 
* 2, BufferType::DRAM); + auto weight_buffer = + tt_metal::Buffer::create(device, weights.size() * sizeof(uint32_t), 1024 * 2, tt_metal::BufferType::DRAM); pass &= move_tiles_to_dram(device, weights, K, N, weight_buffer); log_debug(LogTest, "Copying inputs to dram complete"); - auto out_buffer = Buffer::create(device, M * N * sizeof(uint32_t) * 32 * 32, 1024 * 2, BufferType::DRAM); + auto out_buffer = + tt_metal::Buffer::create(device, M * N * sizeof(uint32_t) * 32 * 32, 1024 * 2, tt_metal::BufferType::DRAM); uint32_t out_dram_addr = out_buffer->address(); log_debug(LogTest, "Writing kernel runtime args to device"); @@ -516,15 +520,15 @@ bool matmul_multi_core_multi_dram(DispatchFixture* fixture, tt_metal::IDevice* d vector result; fixture->ReadBuffer(device, out_buffer, result); - auto golden = select_columns(tensor.get_values(), M, K, N); + auto golden = tt_metal::select_columns(tensor.get_values(), M, K, N); // Keeping this old code because took me too long to decipher. Matmul // owner can refactor at a later time auto result_iter = result.begin(); for (int i = 0; i < M; i++) { - auto row = get_row_slice(golden, M, i, M * 32, N * 32); + auto row = tt_metal::get_row_slice(golden, M, i, M * 32, N * 32); for (int j = 0; j < N; j++) { - auto golden_tile = get_col_slice(row, N, j, 32, N * 32); + auto golden_tile = tt_metal::get_col_slice(row, N, j, 32, N * 32); std::vector result_vec; result_vec.insert(result_vec.end(), result_iter, result_iter + 512); result_iter += 512; @@ -540,6 +544,8 @@ bool matmul_multi_core_multi_dram(DispatchFixture* fixture, tt_metal::IDevice* d } // namespace unit_tests_common::matmul::test_matmul_multi_core_X_dram +namespace tt::tt_metal { + TEST_F(DispatchFixture, TensixMatmulMultiCoreSingleDRAM) { if (!getenv("TT_METAL_SLOW_DISPATCH_MODE")) { log_info(LogTest, "This test is only supported in slow dispatch mode"); @@ -568,3 +574,5 @@ TEST_F(DispatchFixture, TensixMatmulMultiCoreMultiDRAM) { this, devices_.at(id))); } } + +} // namespace 
tt::tt_metal diff --git a/tests/tt_metal/tt_metal/integration/matmul/test_matmul_multi_core_multi_dram_in0_mcast_in1_mcast.cpp b/tests/tt_metal/tt_metal/integration/matmul/test_matmul_multi_core_multi_dram_in0_mcast_in1_mcast.cpp index 572d6243c66..90692c84446 100644 --- a/tests/tt_metal/tt_metal/integration/matmul/test_matmul_multi_core_multi_dram_in0_mcast_in1_mcast.cpp +++ b/tests/tt_metal/tt_metal/integration/matmul/test_matmul_multi_core_multi_dram_in0_mcast_in1_mcast.cpp @@ -380,7 +380,7 @@ bool matmul_multi_core_multi_dram_in0_mcast_in1_mcast(tt_metal::IDevice* device) int per_core_M = M / num_cores_r; int per_core_N = N / num_cores_c; uint32_t single_tile_size = 2 * 1024; - uint32_t in0_dram_addr = device->allocator()->get_base_allocator_addr(HalMemType::DRAM); + uint32_t in0_dram_addr = device->allocator()->get_base_allocator_addr(tt_metal::HalMemType::DRAM); uint32_t in1_dram_addr = 400 * 1024 * 1024; uint32_t out_dram_addr = 800 * 1024 * 1024; @@ -406,7 +406,7 @@ bool matmul_multi_core_multi_dram_in0_mcast_in1_mcast(tt_metal::IDevice* device) tt::deprecated::Tensor tensor = tt::deprecated::initialize_tensor( shape, tt::deprecated::Initialize::RANDOM, 0, 100, std::chrono::system_clock::now().time_since_epoch().count()); auto identity = create_identity_matrix(K * 32, N * 32, std::min(K, N) * 32); // bflaot16 identity - auto golden = select_columns(tensor.get_values(), M, K, N); + auto golden = tt_metal::select_columns(tensor.get_values(), M, K, N); auto [program, @@ -487,9 +487,9 @@ bool matmul_multi_core_multi_dram_in0_mcast_in1_mcast(tt_metal::IDevice* device) log_debug(LogTest, "Gathering data back from dram and checking against golden"); for (int i = 0; i < M; i++) { - auto row = get_row_slice(golden, M, i, M * 32, N * 32); + auto row = tt_metal::get_row_slice(golden, M, i, M * 32, N * 32); for (int j = 0; j < N; j++) { - auto golden_tile = get_col_slice(row, N, j, 32, N * 32); + auto golden_tile = tt_metal::get_col_slice(row, N, j, 32, N * 32); int 
tile_id = i * N + j; int dram_bank = tile_id % device->num_dram_channels(); uint32_t dram_address = ((tile_id / device->num_dram_channels()) * single_tile_size) + out_dram_addr; @@ -509,6 +509,8 @@ bool matmul_multi_core_multi_dram_in0_mcast_in1_mcast(tt_metal::IDevice* device) } // namespace unit_tests_common::matmul::test_matmul_multi_core_multi_dram_in0_mcast_in1_mcast +namespace tt::tt_metal { + TEST_F(DispatchFixture, TensixMatmulMultiCoreMultiDRAMIn0MCastIn1MCast) { if (!getenv("TT_METAL_SLOW_DISPATCH_MODE")) { tt::log_info(tt::LogTest, "This test is only supported in slow dispatch mode"); @@ -519,3 +521,5 @@ TEST_F(DispatchFixture, TensixMatmulMultiCoreMultiDRAMIn0MCastIn1MCast) { matmul_multi_core_multi_dram_in0_mcast_in1_mcast(devices_.at(id))); } } + +} // namespace tt::tt_metal diff --git a/tests/tt_metal/tt_metal/integration/matmul/test_matmul_multi_core_multi_dram_inX_mcast.cpp b/tests/tt_metal/tt_metal/integration/matmul/test_matmul_multi_core_multi_dram_inX_mcast.cpp index 7b08f76e7b2..53ccafa6965 100644 --- a/tests/tt_metal/tt_metal/integration/matmul/test_matmul_multi_core_multi_dram_inX_mcast.cpp +++ b/tests/tt_metal/tt_metal/integration/matmul/test_matmul_multi_core_multi_dram_inX_mcast.cpp @@ -305,7 +305,7 @@ bool matmul_multi_core_multi_dram_inX_mcast(tt_metal::IDevice* device, int in1_o int per_core_M = M / num_cores_r; int per_core_N = N / num_cores_c; uint32_t single_tile_size = 2 * 1024; - uint32_t in0_dram_addr = device->allocator()->get_base_allocator_addr(HalMemType::DRAM); + uint32_t in0_dram_addr = device->allocator()->get_base_allocator_addr(tt_metal::HalMemType::DRAM); uint32_t in1_dram_addr = 400 * 1024 * 1024; uint32_t out_dram_addr = 800 * 1024 * 1024; @@ -330,7 +330,7 @@ bool matmul_multi_core_multi_dram_inX_mcast(tt_metal::IDevice* device, int in1_o tt::deprecated::Tensor tensor = tt::deprecated::initialize_tensor( shape, tt::deprecated::Initialize::RANDOM, 0, 100, std::chrono::system_clock::now().time_since_epoch().count()); 
auto identity = create_identity_matrix(K * 32, N * 32, std::min(K, N) * 32); // bfloat16 identity - auto golden = select_columns(tensor.get_values(), M, K, N); + auto golden = tt_metal::select_columns(tensor.get_values(), M, K, N); auto [program, @@ -403,9 +403,9 @@ bool matmul_multi_core_multi_dram_inX_mcast(tt_metal::IDevice* device, int in1_o log_debug(LogTest, "Gathering data back from dram and checking against golden"); for (int i = 0; i < M; i++) { - auto row = get_row_slice(golden, M, i, M * 32, N * 32); + auto row = tt_metal::get_row_slice(golden, M, i, M * 32, N * 32); for (int j = 0; j < N; j++) { - auto golden_tile = get_col_slice(row, N, j, 32, N * 32); + auto golden_tile = tt_metal::get_col_slice(row, N, j, 32, N * 32); int tile_id = i * N + j; int dram_bank = tile_id % device->num_dram_channels(); uint32_t dram_address = ((tile_id / device->num_dram_channels()) * single_tile_size) + out_dram_addr; @@ -424,6 +424,8 @@ bool matmul_multi_core_multi_dram_inX_mcast(tt_metal::IDevice* device, int in1_o } } // namespace unit_tests_common::matmul::test_matmul_multi_core_multi_dram_inX_mcast +namespace tt::tt_metal { + TEST_F(DispatchFixture, TensixMatmulMultiCoreMultiDRAMIn0MCast) { if (!getenv("TT_METAL_SLOW_DISPATCH_MODE")) { tt::log_info(tt::LogTest, "This test is only supported in slow dispatch mode"); @@ -445,3 +447,5 @@ TEST_F(DispatchFixture, TensixMatmulMultiCoreMultiDRAMIn1MCast) { matmul_multi_core_multi_dram_inX_mcast(devices_.at(id), 1)); } } + +} // namespace tt::tt_metal diff --git a/tests/tt_metal/tt_metal/integration/matmul/test_matmul_single_core.cpp b/tests/tt_metal/tt_metal/integration/matmul/test_matmul_single_core.cpp index ce4eb8b6ebd..690be20274e 100644 --- a/tests/tt_metal/tt_metal/integration/matmul/test_matmul_single_core.cpp +++ b/tests/tt_metal/tt_metal/integration/matmul/test_matmul_single_core.cpp @@ -19,7 +19,13 @@ using namespace tt; namespace unit_tests_common::matmul::test_matmul_single_core { bool matmul_single_core( - 
DispatchFixture* fixture, tt_metal::IDevice* device, int M, int N, int K, int out_subblock_h, int out_subblock_w) { + tt_metal::DispatchFixture* fixture, + tt_metal::IDevice* device, + int M, + int N, + int K, + int out_subblock_h, + int out_subblock_w) { bool pass = true; tt_metal::Program program = tt_metal::CreateProgram(); @@ -190,7 +196,7 @@ bool matmul_single_core( auto activations_tilized = test_utils::tilize(tensor.get_values(), M * 32, K * 32); auto activations_tile_layout = convert_to_tile_layout(activations_tilized); auto activations = pack_bfloat16_vec_into_uint32_vec(activations_tile_layout); - auto activations_tile_transposed = transpose_tiles(activations, M, K, in0_block_w); + auto activations_tile_transposed = tt_metal::transpose_tiles(activations, M, K, in0_block_w); fixture->WriteBuffer(device, src0_dram_buffer, activations_tile_transposed); auto identity = create_identity_matrix(K * 32, N * 32, std::min(K, N) * 32); // bflaot16 32x32 identity @@ -213,7 +219,7 @@ bool matmul_single_core( auto result_bfp16 = unpack_uint32_vec_into_bfloat16_vec(result_vec); auto result_flat_layout = convert_to_flat_layout(result_bfp16); auto result_untilized = test_utils::untilize(result_flat_layout, M * 32, N * 32); - auto golden = select_columns(tensor.get_values(), M, K, std::min(K, N)); + auto golden = tt_metal::select_columns(tensor.get_values(), M, K, std::min(K, N)); pass &= test_utils::is_close_vectors(golden, result_untilized, [&](const bfloat16& a, const bfloat16& b) { return tt::test_utils::is_close(a, b, 0.015f); }); @@ -222,6 +228,8 @@ bool matmul_single_core( } } // namespace unit_tests_common::matmul::test_matmul_single_core +namespace tt::tt_metal { + TEST_F(DispatchFixture, TensixMatmulSingleCoreSmall) { uint32_t M = 4; uint32_t K = 4; @@ -249,3 +257,5 @@ TEST_F(DispatchFixture, TensixMatmulSingleCore) { this, devices_.at(id), M, N, K, out_subblock_h, out_subblock_w)); } } + +} // namespace tt::tt_metal diff --git 
a/tests/tt_metal/tt_metal/integration/test_autonomous_relay_streams.cpp b/tests/tt_metal/tt_metal/integration/test_autonomous_relay_streams.cpp index 73ab6d1a648..3ebafacc18b 100644 --- a/tests/tt_metal/tt_metal/integration/test_autonomous_relay_streams.cpp +++ b/tests/tt_metal/tt_metal/integration/test_autonomous_relay_streams.cpp @@ -638,10 +638,6 @@ void build_and_run_autonomous_stream_test( } } -} // namespace tt_metal - -} // namespace tt - TEST_F(CommandQueueProgramFixture, DISABLED_TensixTestAutonomousRelayStreams) { auto arch = tt::get_arch_from_string(tt::test_utils::get_umd_arch_name()); auto num_devices = tt::tt_metal::GetNumAvailableDevices(); @@ -962,3 +958,7 @@ TEST_F(CommandQueueProgramFixture, DISABLED_TensixTestAutonomousRelayStreamsSwee return; } + +} // namespace tt_metal + +} // namespace tt diff --git a/tests/tt_metal/tt_metal/integration/test_flatten.cpp b/tests/tt_metal/tt_metal/integration/test_flatten.cpp index 6330d4149d7..b54dc179a54 100644 --- a/tests/tt_metal/tt_metal/integration/test_flatten.cpp +++ b/tests/tt_metal/tt_metal/integration/test_flatten.cpp @@ -61,7 +61,8 @@ inline std::vector gold_standard_flatten(std::vector src_vec return expected_dst_vec; } -bool flatten(DispatchFixture* fixture, tt_metal::IDevice* device, uint32_t num_tiles_r = 5, uint32_t num_tiles_c = 5) { +bool flatten( + tt_metal::DispatchFixture* fixture, tt_metal::IDevice* device, uint32_t num_tiles_r = 5, uint32_t num_tiles_c = 5) { bool pass = true; tt_metal::Program program = tt_metal::CreateProgram(); @@ -173,11 +174,11 @@ bool flatten(DispatchFixture* fixture, tt_metal::IDevice* device, uint32_t num_t return pass; } -bool flatten_stress(IDevice* device, uint32_t num_tiles_r = 5, uint32_t num_tiles_c = 5) { +bool flatten_stress(tt_metal::IDevice* device, uint32_t num_tiles_r = 5, uint32_t num_tiles_c = 5) { // Test Simulating Program Caching with Async Command Queues bool pass = true; // Create a program used across all loops - Program program = 
CreateProgram(); + tt_metal::Program program = tt_metal::CreateProgram(); CoreCoord core = {0, 0}; @@ -189,19 +190,23 @@ bool flatten_stress(IDevice* device, uint32_t num_tiles_r = 5, uint32_t num_tile uint32_t dram_buffer_size = single_tile_size * num_tiles * 32; - InterleavedBufferConfig dram_config{ - .device = device, .size = dram_buffer_size, .page_size = dram_buffer_size, .buffer_type = BufferType::DRAM}; + tt_metal::InterleavedBufferConfig dram_config{ + .device = device, + .size = dram_buffer_size, + .page_size = dram_buffer_size, + .buffer_type = tt_metal::BufferType::DRAM}; uint32_t src0_cb_index = 0; uint32_t num_input_tiles = 8; - CircularBufferConfig cb_src0_config = - CircularBufferConfig(num_input_tiles * single_tile_size, {{src0_cb_index, tt::DataFormat::Float16_b}}) + tt_metal::CircularBufferConfig cb_src0_config = + tt_metal::CircularBufferConfig(num_input_tiles * single_tile_size, {{src0_cb_index, tt::DataFormat::Float16_b}}) .set_page_size(src0_cb_index, single_tile_size); auto cb_src0 = CreateCircularBuffer(program, core, cb_src0_config); uint32_t ouput_cb_index = 16; uint32_t num_output_tiles = 1; - CircularBufferConfig cb_output_config = - CircularBufferConfig(num_output_tiles * single_tile_size, {{ouput_cb_index, tt::DataFormat::Float16_b}}) + tt_metal::CircularBufferConfig cb_output_config = + tt_metal::CircularBufferConfig( + num_output_tiles * single_tile_size, {{ouput_cb_index, tt::DataFormat::Float16_b}}) .set_page_size(ouput_cb_index, single_tile_size); auto cb_output = CreateCircularBuffer(program, core, cb_output_config); @@ -209,13 +214,15 @@ bool flatten_stress(IDevice* device, uint32_t num_tiles_r = 5, uint32_t num_tile program, "tests/tt_metal/tt_metal/test_kernels/dataflow/flatten.cpp", core, - DataMovementConfig{.processor = DataMovementProcessor::RISCV_1, .noc = NOC::RISCV_1_default}); + tt_metal::DataMovementConfig{ + .processor = tt_metal::DataMovementProcessor::RISCV_1, .noc = tt_metal::NOC::RISCV_1_default}); auto 
unary_writer_kernel = CreateKernel( program, "tt_metal/kernels/dataflow/writer_unary.cpp", core, - DataMovementConfig{.processor = DataMovementProcessor::RISCV_0, .noc = NOC::RISCV_0_default}); + tt_metal::DataMovementConfig{ + .processor = tt_metal::DataMovementProcessor::RISCV_0, .noc = tt_metal::NOC::RISCV_0_default}); vector compute_kernel_args = {num_tiles * 32}; @@ -223,7 +230,7 @@ bool flatten_stress(IDevice* device, uint32_t num_tiles_r = 5, uint32_t num_tile program, "tests/tt_metal/tt_metal/test_kernels/compute/eltwise_copy.cpp", core, - ComputeConfig{.compile_args = compute_kernel_args}); + tt_metal::ComputeConfig{.compile_args = compute_kernel_args}); // Inside the loop, run async runtime functions for (int i = 0; i < 1000; i++) { @@ -238,15 +245,15 @@ bool flatten_stress(IDevice* device, uint32_t num_tiles_r = 5, uint32_t num_tile std::vector golden = gold_standard_flatten(*src_vec, {num_tiles_r * 32, num_tiles_c * 32}); // Set the runtime args asynchronously - std::shared_ptr writer_runtime_args = std::make_shared(); - std::shared_ptr compute_runtime_args = std::make_shared(); + std::shared_ptr writer_runtime_args = std::make_shared(); + std::shared_ptr compute_runtime_args = std::make_shared(); *compute_runtime_args = { src_dram_buffer.get(), (uint32_t)0, num_tiles_r, num_tiles_c, num_bytes_per_tensor_row}; *writer_runtime_args = {dst_dram_buffer.get(), (uint32_t)0, num_tiles * 32}; - SetRuntimeArgs(device, detail::GetKernel(program, flatten_kernel), core, compute_runtime_args); + SetRuntimeArgs(device, tt_metal::detail::GetKernel(program, flatten_kernel), core, compute_runtime_args); - SetRuntimeArgs(device, detail::GetKernel(program, unary_writer_kernel), core, writer_runtime_args); + SetRuntimeArgs(device, tt_metal::detail::GetKernel(program, unary_writer_kernel), core, writer_runtime_args); // Async write input EnqueueWriteBuffer(device->command_queue(), src_dram_buffer, src_vec, false); // Share ownership of buffer with program @@ -281,6 +288,8 
@@ bool flatten_stress(IDevice* device, uint32_t num_tiles_r = 5, uint32_t num_tile } // namespace test_flatten +namespace tt::tt_metal { + TEST_F(DispatchFixture, TensixFlatten) { // TODO: Re-enable when #7264 is fixed GTEST_SKIP(); @@ -299,3 +308,5 @@ TEST_F(DispatchFixture, TensixFlatten) { ASSERT_TRUE(test_flatten::flatten(this, this->devices_.at(id), num_tiles_r, num_tiles_c)); } } + +} // namespace tt::tt_metal diff --git a/tests/tt_metal/tt_metal/lightmetal/lightmetal_fixture.hpp b/tests/tt_metal/tt_metal/lightmetal/lightmetal_fixture.hpp index add02b77e4b..9e66e9a31b0 100644 --- a/tests/tt_metal/tt_metal/lightmetal/lightmetal_fixture.hpp +++ b/tests/tt_metal/tt_metal/lightmetal/lightmetal_fixture.hpp @@ -16,6 +16,8 @@ #include "command_queue_fixture.hpp" #include +namespace tt::tt_metal { + class SingleDeviceLightMetalFixture : public CommandQueueFixture { protected: bool replay_binary_; @@ -79,3 +81,5 @@ class SingleDeviceLightMetalFixture : public CommandQueueFixture { } } }; + +} // namespace tt::tt_metal diff --git a/tests/tt_metal/tt_metal/llk/test_copy_block_matmul_partials.cpp b/tests/tt_metal/tt_metal/llk/test_copy_block_matmul_partials.cpp index 38ef7828afc..1a0ffae6eb5 100644 --- a/tests/tt_metal/tt_metal/llk/test_copy_block_matmul_partials.cpp +++ b/tests/tt_metal/tt_metal/llk/test_copy_block_matmul_partials.cpp @@ -169,6 +169,8 @@ void run_single_core_copy_block_matmul_partials( // - matmul_pack_tile //////////////////////////////////////////////////////////////////////////// +namespace tt::tt_metal { + TEST_F(DeviceFixture, DISABLED_TensixComputeCopyBlockSingle) { for (bool fp32_dest_acc_en : {true, false}) { // FP32 dest acc not possible for GS @@ -225,3 +227,5 @@ TEST_F(DeviceFixture, TensixComputeCopyBlockComputeBottleneck) { } } } + +} // namespace tt::tt_metal diff --git a/tests/tt_metal/tt_metal/llk/test_reconfig.cpp b/tests/tt_metal/tt_metal/llk/test_reconfig.cpp index 89e188c2401..81eb096aa94 100644 --- 
a/tests/tt_metal/tt_metal/llk/test_reconfig.cpp +++ b/tests/tt_metal/tt_metal/llk/test_reconfig.cpp @@ -319,6 +319,8 @@ bool single_core_reconfig(tt_metal::IDevice* device, const ReconfigConfig& test_ // - pack_reconfig_l1_acc //////////////////////////////////////////////////////////////////////////// +namespace tt::tt_metal { + TEST_F(DeviceFixture, TensixTileCopyReconfigExplicitSplitDstAcc) { auto arch = this->arch_; if (arch == tt::ARCH::GRAYSKULL) { @@ -376,3 +378,5 @@ TEST_F(DeviceFixture, TensixTileCopyReconfigL1Acc) { } } } + +} // namespace tt::tt_metal diff --git a/tests/tt_metal/tt_metal/llk/test_single_core_binary_compute.cpp b/tests/tt_metal/tt_metal/llk/test_single_core_binary_compute.cpp index da7134d9ccf..a416874bb6b 100644 --- a/tests/tt_metal/tt_metal/llk/test_single_core_binary_compute.cpp +++ b/tests/tt_metal/tt_metal/llk/test_single_core_binary_compute.cpp @@ -271,6 +271,8 @@ bool single_core_binary(tt_metal::IDevice* device, const SingleCoreBinaryConfig& } } // namespace unit_tests::compute::binary +namespace tt::tt_metal { + TEST_F(DeviceFixture, TensixBinaryComputeSingleCoreSingleTileAdd) { for (uint8_t i = uint8_t(MathFidelity::LoFi); i <= uint8_t(MathFidelity::HiFi4); i++) { if (i == 1) { @@ -591,3 +593,5 @@ TEST_F(DeviceFixture, TensixBinaryComputeSingleCoreMultiTileMulDestAcc) { } } } + +} // namespace tt::tt_metal diff --git a/tests/tt_metal/tt_metal/llk/test_single_core_matmul_compute.cpp b/tests/tt_metal/tt_metal/llk/test_single_core_matmul_compute.cpp index 39ef0dfe84d..601aac9c89e 100644 --- a/tests/tt_metal/tt_metal/llk/test_single_core_matmul_compute.cpp +++ b/tests/tt_metal/tt_metal/llk/test_single_core_matmul_compute.cpp @@ -603,6 +603,8 @@ bool blocked_matmul(tt_metal::IDevice* device, uint32_t M, uint32_t K, uint32_t } } // namespace unit_tests::compute::matmul +namespace tt::tt_metal { + TEST_F(DeviceFixture, TensixTestSingleCoreSingleTileComputeMatmul) { for (unsigned int id = 0; id < num_devices_; id++) { 
ASSERT_TRUE(unit_tests::compute::matmul::single_tile_matmul(this->devices_.at(id))); @@ -623,3 +625,5 @@ TEST_F(DeviceFixture, TensixTestSingleCoreSingleBlockSingleTileNoAccumulationCom ASSERT_TRUE(unit_tests::compute::matmul::single_block_matmul(this->devices_.at(id), 2, 1, 2)); } } + +} // namespace tt::tt_metal diff --git a/tests/tt_metal/tt_metal/noc/test_dynamic_noc.cpp b/tests/tt_metal/tt_metal/noc/test_dynamic_noc.cpp index 99679f937b3..cace9d75067 100644 --- a/tests/tt_metal/tt_metal/noc/test_dynamic_noc.cpp +++ b/tests/tt_metal/tt_metal/noc/test_dynamic_noc.cpp @@ -25,6 +25,8 @@ using namespace tt; using namespace tt::test_utils; using namespace tt::test_utils::df; +namespace tt::tt_metal { + TEST_F(DeviceSingleCardFastSlowDispatchFixture, TestDynamicNoCAsyncWriteProgram) { uint32_t NUM_PROGRAMS = 3; uint32_t MAX_LOOP = 123456789; @@ -100,3 +102,5 @@ TEST_F(DeviceSingleCardFastSlowDispatchFixture, TestDynamicNoCAsyncWriteProgram) log_info(tt::LogTest, "Finish SD runs"); } } + +} // namespace tt::tt_metal diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/common/util.hpp b/tests/tt_metal/tt_metal/perf_microbenchmark/common/util.hpp index 1d2458ea167..1e40eb3ae6f 100644 --- a/tests/tt_metal/tt_metal/perf_microbenchmark/common/util.hpp +++ b/tests/tt_metal/tt_metal/perf_microbenchmark/common/util.hpp @@ -20,12 +20,13 @@ inline uint64_t get_t0_to_any_riscfw_end_cycle(tt::tt_metal::IDevice* device, co enum BufferIndex { BUFFER_END_INDEX, DROPPED_MARKER_COUNTER, MARKER_DATA_START }; enum TimerDataIndex { TIMER_ID, TIMER_VAL_L, TIMER_VAL_H, TIMER_DATA_UINT32_SIZE }; auto worker_cores_used_in_program = device->worker_cores_from_logical_cores( - program.logical_cores()[hal.get_programmable_core_type_index(HalProgrammableCoreType::TENSIX)]); + program.logical_cores()[tt::tt_metal::hal.get_programmable_core_type_index( + tt::tt_metal::HalProgrammableCoreType::TENSIX)]); auto device_id = device->id(); uint64_t min_cycle = -1; uint64_t max_cycle = 0; - 
dprint_buf_msg_t* dprint_msg = - hal.get_dev_addr(HalProgrammableCoreType::TENSIX, HalL1MemAddrType::DPRINT); + dprint_buf_msg_t* dprint_msg = tt::tt_metal::hal.get_dev_addr( + tt::tt_metal::HalProgrammableCoreType::TENSIX, tt::tt_metal::HalL1MemAddrType::DPRINT); // This works for tensix only, will need to be updated for eth std::vector print_buffer_addrs = { diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/dispatch/common.h b/tests/tt_metal/tt_metal/perf_microbenchmark/dispatch/common.h index 264d5007bf3..43c53aea9e7 100644 --- a/tests/tt_metal/tt_metal/perf_microbenchmark/dispatch/common.h +++ b/tests/tt_metal/tt_metal/perf_microbenchmark/dispatch/common.h @@ -17,6 +17,8 @@ #include "llrt.hpp" #include +using namespace tt::tt_metal; // test only + extern bool debug_g; extern bool use_coherent_data_g; extern uint32_t dispatch_buffer_page_size_g; diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/ethernet/test_ethernet_bidirectional_bandwidth_no_edm.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/ethernet/test_ethernet_bidirectional_bandwidth_no_edm.cpp index e964b39ccf0..6a1f7766cfc 100644 --- a/tests/tt_metal/tt_metal/perf_microbenchmark/ethernet/test_ethernet_bidirectional_bandwidth_no_edm.cpp +++ b/tests/tt_metal/tt_metal/perf_microbenchmark/ethernet/test_ethernet_bidirectional_bandwidth_no_edm.cpp @@ -63,7 +63,7 @@ class N300TestDevice { } } - std::map devices_; + std::map devices_; tt::ARCH arch_; size_t num_devices_; @@ -76,18 +76,18 @@ struct ChipSenderReceiverEthCore { CoreCoord receiver_core; }; -std::tuple build( - IDevice* device0, - IDevice* device1, +std::tuple build( + tt_metal::IDevice* device0, + tt_metal::IDevice* device1, CoreCoord eth_sender_core, CoreCoord eth_receiver_core, std::size_t num_samples, std::size_t sample_page_size, std::size_t max_channels_per_direction, - KernelHandle& local_kernel, - KernelHandle& remote_kernel) { - Program program0; - Program program1; + tt_metal::KernelHandle& local_kernel, + 
tt_metal::KernelHandle& remote_kernel) { + tt_metal::Program program0; + tt_metal::Program program1; std::vector const& ct_args = {}; constexpr std::size_t num_links = 0; @@ -124,16 +124,16 @@ std::tuple build( throw e; } - return std::tuple{std::move(program0), std::move(program1)}; + return std::tuple{std::move(program0), std::move(program1)}; } void run( - IDevice* device0, - IDevice* device1, - Program& program0, - Program& program1, - KernelHandle local_kernel, - KernelHandle remote_kernel, + tt_metal::IDevice* device0, + tt_metal::IDevice* device1, + tt_metal::Program& program0, + tt_metal::Program& program1, + tt_metal::KernelHandle local_kernel, + tt_metal::KernelHandle remote_kernel, CoreCoord eth_sender_core, CoreCoord eth_receiver_core, @@ -237,8 +237,8 @@ int main(int argc, char** argv) { num_samples, sample_page_size, max_channels_per_direction); - KernelHandle local_kernel; - KernelHandle remote_kernel; + tt_metal::KernelHandle local_kernel; + tt_metal::KernelHandle remote_kernel; try { auto [program0, program1] = build( device_0, diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/ethernet/test_ethernet_hop_latencies_no_edm.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/ethernet/test_ethernet_hop_latencies_no_edm.cpp index 3b9177b6596..a4f2f2ce2d6 100644 --- a/tests/tt_metal/tt_metal/perf_microbenchmark/ethernet/test_ethernet_hop_latencies_no_edm.cpp +++ b/tests/tt_metal/tt_metal/perf_microbenchmark/ethernet/test_ethernet_hop_latencies_no_edm.cpp @@ -345,8 +345,8 @@ void build_and_run_roundtrip_latency_test( auto is_device_pcie_connected(chip_id_t device_id) { return device_id < 4; } -std::vector build_eth_sockets_list(std::vector const& devices) { - std::vector sockets; +std::vector build_eth_sockets_list(const std::vector& devices) { + std::vector sockets; std::unordered_map n_edge_visits; for (std::size_t i = 0; i < devices.size(); i++) { IDevice* curr_device = devices.at(i); @@ -502,7 +502,7 @@ int main(int argc, char** argv) { constexpr 
std::size_t placeholder_arg_value = 1; for (auto n_hops : hop_counts) { auto devices = get_device_list(view, n_hops); - std::vector hop_eth_sockets = build_eth_sockets_list(devices); + std::vector hop_eth_sockets = build_eth_sockets_list(devices); for (auto max_concurrent_samples : max_concurrent_samples) { for (auto num_samples : sample_counts) { @@ -516,9 +516,9 @@ int main(int argc, char** argv) { sample_page_size, max_concurrent_samples, n_hops); - std::vector programs = {}; - std::vector receiver_kernel_ids; - std::vector sender_kernel_ids; + std::vector programs = {}; + std::vector receiver_kernel_ids; + std::vector sender_kernel_ids; tt::tt_metal::build_and_run_roundtrip_latency_test( devices, hop_eth_sockets, diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/ethernet/test_ethernet_link_ping_latency_no_edm.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/ethernet/test_ethernet_link_ping_latency_no_edm.cpp index 2fe2782bd80..eb91f7403a2 100644 --- a/tests/tt_metal/tt_metal/perf_microbenchmark/ethernet/test_ethernet_link_ping_latency_no_edm.cpp +++ b/tests/tt_metal/tt_metal/perf_microbenchmark/ethernet/test_ethernet_link_ping_latency_no_edm.cpp @@ -64,7 +64,7 @@ class N300TestDevice { } } - std::map devices_; + std::map devices_; tt::ARCH arch_; size_t num_devices_; @@ -77,18 +77,18 @@ struct ChipSenderReceiverEthCore { CoreCoord receiver_core; }; -std::tuple build( - IDevice* device0, - IDevice* device1, +std::tuple build( + tt_metal::IDevice* device0, + tt_metal::IDevice* device1, CoreCoord eth_sender_core, CoreCoord eth_receiver_core, std::size_t num_samples, std::size_t sample_page_size, std::size_t num_channels, - KernelHandle& local_kernel, - KernelHandle& remote_kernel) { - Program program0; - Program program1; + tt_metal::KernelHandle& local_kernel, + tt_metal::KernelHandle& remote_kernel) { + tt_metal::Program program0; + tt_metal::Program program1; std::vector const& ct_args = {num_channels}; @@ -124,16 +124,16 @@ std::tuple build( throw e; } 
- return std::tuple{std::move(program0), std::move(program1)}; + return std::tuple{std::move(program0), std::move(program1)}; } void run( - IDevice* device0, - IDevice* device1, - Program& program0, - Program& program1, - KernelHandle local_kernel, - KernelHandle remote_kernel, + tt_metal::IDevice* device0, + tt_metal::IDevice* device1, + tt_metal::Program& program0, + tt_metal::Program& program1, + tt_metal::KernelHandle local_kernel, + tt_metal::KernelHandle remote_kernel, CoreCoord eth_sender_core, CoreCoord eth_receiver_core, @@ -254,8 +254,8 @@ int main(int argc, char** argv) { num_samples, sample_page_size, max_channels_per_direction); - KernelHandle local_kernel; - KernelHandle remote_kernel; + tt_metal::KernelHandle local_kernel; + tt_metal::KernelHandle remote_kernel; try { auto [program0, program1] = build( device_0, diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/ethernet/test_ethernet_read_and_send_data.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/ethernet/test_ethernet_read_and_send_data.cpp index 4eac223e08e..31864c3f7b5 100644 --- a/tests/tt_metal/tt_metal/perf_microbenchmark/ethernet/test_ethernet_read_and_send_data.cpp +++ b/tests/tt_metal/tt_metal/perf_microbenchmark/ethernet/test_ethernet_read_and_send_data.cpp @@ -148,7 +148,7 @@ bool RunWriteBWTest( .input_buffer_type = source_is_dram ? tt_metal::BufferType::DRAM : tt_metal::BufferType::L1, .output_buffer_type = dest_is_dram ? 
tt_metal::BufferType::DRAM : tt_metal::BufferType::L1, .l1_data_format = tt::DataFormat::Float16_b}; - auto input_buffer = CreateBuffer(InterleavedBufferConfig{ + auto input_buffer = CreateBuffer(tt_metal::InterleavedBufferConfig{ sender_device, test_config.size_bytes, test_config.page_size_bytes, test_config.input_buffer_type}); bool input_is_dram = test_config.input_buffer_type == tt_metal::BufferType::DRAM; @@ -162,7 +162,7 @@ bool RunWriteBWTest( // Clear expected value at ethernet L1 address std::vector all_zeros(inputs.size(), 0); - auto output_buffer = CreateBuffer(InterleavedBufferConfig{ + auto output_buffer = CreateBuffer(tt_metal::InterleavedBufferConfig{ receiver_device, test_config.size_bytes, test_config.page_size_bytes, test_config.output_buffer_type}); bool output_is_dram = test_config.output_buffer_type == tt_metal::BufferType::DRAM; diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/ethernet/test_ethernet_send_data_looping.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/ethernet/test_ethernet_send_data_looping.cpp index ba6aff47157..9ea1f365214 100644 --- a/tests/tt_metal/tt_metal/perf_microbenchmark/ethernet/test_ethernet_send_data_looping.cpp +++ b/tests/tt_metal/tt_metal/perf_microbenchmark/ethernet/test_ethernet_send_data_looping.cpp @@ -13,6 +13,9 @@ #include #include #include +#include "llrt.hpp" +#include "tt_cluster.hpp" +#include "eth_l1_address_map.h" #include "tt_metal/test_utils/comparison.hpp" #include "tt_metal/test_utils/df/df.hpp" #include "tt_metal/test_utils/print_helpers.hpp" diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/ethernet/test_ethernet_write_worker_latency_no_edm.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/ethernet/test_ethernet_write_worker_latency_no_edm.cpp index b233aee0033..4ef0303467f 100644 --- a/tests/tt_metal/tt_metal/perf_microbenchmark/ethernet/test_ethernet_write_worker_latency_no_edm.cpp +++ 
b/tests/tt_metal/tt_metal/perf_microbenchmark/ethernet/test_ethernet_write_worker_latency_no_edm.cpp @@ -66,7 +66,7 @@ class N300TestDevice { } } - std::map devices_; + std::map devices_; tt::ARCH arch_; size_t num_devices_; @@ -87,9 +87,9 @@ void validation(const std::shared_ptr& worker_buffer_0) { TT_FATAL(pass, "validation failed"); } -std::vector build( - IDevice* device0, - IDevice* device1, +std::vector build( + tt_metal::IDevice* device0, + tt_metal::IDevice* device1, CoreCoord eth_sender_core, CoreCoord eth_receiver_core, CoreCoord worker_core, @@ -97,14 +97,14 @@ std::vector build( std::size_t sample_page_size, std::size_t num_buffer_slots, uint32_t benchmark_type, - KernelHandle& local_kernel, - KernelHandle& remote_kernel, - std::shared_ptr& worker_buffer_0, - std::shared_ptr& worker_buffer_1, + tt_metal::KernelHandle& local_kernel, + tt_metal::KernelHandle& remote_kernel, + std::shared_ptr& worker_buffer_0, + std::shared_ptr& worker_buffer_1, bool test_latency, bool disable_trid) { - Program program0; - Program program1; + tt_metal::Program program0; + tt_metal::Program program1; // worker core coords uint32_t worker_noc_x = device1->worker_core_from_logical_core(worker_core).x; @@ -136,7 +136,8 @@ std::vector build( // eth core rt args const std::vector& eth_sender_receiver_rt_args = { - tt_metal::hal.get_dev_addr(HalProgrammableCoreType::ACTIVE_ETH, HalL1MemAddrType::UNRESERVED), + tt_metal::hal.get_dev_addr( + tt_metal::HalProgrammableCoreType::ACTIVE_ETH, tt_metal::HalL1MemAddrType::UNRESERVED), static_cast(num_samples), static_cast(sample_page_size)}; @@ -165,20 +166,20 @@ std::vector build( throw e; } - std::vector programs; + std::vector programs; programs.push_back(std::move(program0)); programs.push_back(std::move(program1)); return programs; } void run( - IDevice* device0, - IDevice* device1, - Program& program0, - Program& program1, + tt_metal::IDevice* device0, + tt_metal::IDevice* device1, + tt_metal::Program& program0, + tt_metal::Program& 
program1, BenchmarkType benchmark_type, - std::shared_ptr& worker_buffer_0, - std::shared_ptr& worker_buffer_1) { + std::shared_ptr& worker_buffer_0, + std::shared_ptr& worker_buffer_1) { if (std::getenv("TT_METAL_SLOW_DISPATCH_MODE")) { std::thread th2 = std::thread([&] { tt_metal::detail::LaunchProgram(device0, program0); }); std::thread th1 = std::thread([&] { tt_metal::detail::LaunchProgram(device1, program1); }); @@ -271,26 +272,26 @@ int main(int argc, char** argv) { num_samples, sample_page_size, num_buffer_slots); - KernelHandle local_kernel; - KernelHandle remote_kernel; + tt_metal::KernelHandle local_kernel; + tt_metal::KernelHandle remote_kernel; try { - ShardSpecBuffer shard_spec = ShardSpecBuffer( + tt_metal::ShardSpecBuffer shard_spec = tt_metal::ShardSpecBuffer( CoreRangeSet(std::set({CoreRange(worker_core)})), {1, sample_page_size}, - ShardOrientation::ROW_MAJOR, + tt_metal::ShardOrientation::ROW_MAJOR, {1, sample_page_size}, {1, sample_page_size}); auto worker_buffer_0 = CreateBuffer(tt::tt_metal::ShardedBufferConfig{ .device = device_0, .size = sample_page_size, .page_size = sample_page_size, - .buffer_layout = TensorMemoryLayout::HEIGHT_SHARDED, + .buffer_layout = tt_metal::TensorMemoryLayout::HEIGHT_SHARDED, .shard_parameters = shard_spec}); auto worker_buffer_1 = CreateBuffer(tt::tt_metal::ShardedBufferConfig{ .device = device_1, .size = sample_page_size, .page_size = sample_page_size, - .buffer_layout = TensorMemoryLayout::HEIGHT_SHARDED, + .buffer_layout = tt_metal::TensorMemoryLayout::HEIGHT_SHARDED, .shard_parameters = shard_spec}); auto programs = build( diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/ethernet/test_workers_and_erisc_datamover_unidirectional.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/ethernet/test_workers_and_erisc_datamover_unidirectional.cpp index 2e7a24662d2..72222435736 100644 --- a/tests/tt_metal/tt_metal/perf_microbenchmark/ethernet/test_workers_and_erisc_datamover_unidirectional.cpp +++ 
b/tests/tt_metal/tt_metal/perf_microbenchmark/ethernet/test_workers_and_erisc_datamover_unidirectional.cpp @@ -8,6 +8,7 @@ #include #include +#include "buffer_constants.hpp" #include "umd/device/types/arch.h" #include "tt_backend_api_types.hpp" #include @@ -119,6 +120,11 @@ bool RunWriteBWTest( bool dest_is_dram ) { + using tt_metal::BufferType; + using tt_metal::CBHandle; + using tt_metal::DataMovementProcessor; + using tt_metal::InterleavedBufferConfig; + // number of bytes to send per eth send (given that eth l1 buf size not // guaranteed to be multiple of page size, we won't send the left over // bytes at the end diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/noc/test_noc_unicast_vs_multicast_to_single_core_latency.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/noc/test_noc_unicast_vs_multicast_to_single_core_latency.cpp index ef049ae2f0a..57c4ddc9f8d 100644 --- a/tests/tt_metal/tt_metal/perf_microbenchmark/noc/test_noc_unicast_vs_multicast_to_single_core_latency.cpp +++ b/tests/tt_metal/tt_metal/perf_microbenchmark/noc/test_noc_unicast_vs_multicast_to_single_core_latency.cpp @@ -12,7 +12,7 @@ #include "tt_cluster.hpp" using namespace tt; -// + void measure_latency(const string& kernel_name) { const int device_id = 0; tt_metal::IDevice* device = tt_metal::CreateDevice(device_id); @@ -47,7 +47,7 @@ void measure_latency(const string& kernel_name) { tt::tt_metal::detail::SetDeviceProfilerDir(kernel_name + "_microbenchmark"); tt::tt_metal::detail::FreshProfilerDeviceLog(); - detail::CompileProgram(device, program); + tt::tt_metal::detail::CompileProgram(device, program); tt_metal::detail::LaunchProgram(device, program); tt_metal::CloseDevice(device); } diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_mux_demux_2level.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_mux_demux_2level.cpp index 2834227a93e..6696d1e619c 100644 --- a/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_mux_demux_2level.cpp +++ 
b/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_mux_demux_2level.cpp @@ -14,7 +14,6 @@ using std::vector; using namespace tt; - int main(int argc, char **argv) { constexpr uint32_t default_prng_seed = 0x100; diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_tx_rx.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_tx_rx.cpp index d8a5c7263bd..c9f075b3eaa 100644 --- a/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_tx_rx.cpp +++ b/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_tx_rx.cpp @@ -15,7 +15,6 @@ using std::vector; using namespace tt; - int main(int argc, char **argv) { bool pass = true; diff --git a/tests/tt_metal/tt_metal/test_compile_args.cpp b/tests/tt_metal/tt_metal/test_compile_args.cpp index f52ea268b5a..14a09f37289 100644 --- a/tests/tt_metal/tt_metal/test_compile_args.cpp +++ b/tests/tt_metal/tt_metal/test_compile_args.cpp @@ -69,7 +69,9 @@ int main(int argc, char** argv) { // Remove old compiled kernels static const std::string kernel_name = "test_compile_args"; auto binary_path_str = - kernel->binaries(BuildEnvManager::get_instance().get_device_build_env(device->build_id()).build_env) + kernel + ->binaries( + tt::tt_metal::BuildEnvManager::get_instance().get_device_build_env(device->build_id()).build_env) .get_out_kernel_root_path() + kernel_name; std::filesystem::remove_all(binary_path_str); diff --git a/tests/tt_metal/tt_metal/test_compile_sets_kernel_binaries.cpp b/tests/tt_metal/tt_metal/test_compile_sets_kernel_binaries.cpp index e0cab094ff7..73bb74cfe74 100644 --- a/tests/tt_metal/tt_metal/test_compile_sets_kernel_binaries.cpp +++ b/tests/tt_metal/tt_metal/test_compile_sets_kernel_binaries.cpp @@ -23,7 +23,8 @@ using std::vector; using namespace tt; -std::string get_latest_kernel_binary_path(const string& kernel_root_path, const std::shared_ptr& kernel) { +std::string get_latest_kernel_binary_path( + const string& kernel_root_path, const std::shared_ptr& kernel) { TT_FATAL(kernel 
!= nullptr, "Error"); TT_FATAL(std::filesystem::exists(kernel_root_path + kernel->name()), "Error"); @@ -41,7 +42,7 @@ std::string get_latest_kernel_binary_path(const string& kernel_root_path, const return kernel->name() + "/" + latest_hash; } -void construct_program(Program& program, IDevice* device, CoreCoord& core) { +void construct_program(tt_metal::Program& program, tt_metal::IDevice* device, CoreCoord& core) { uint32_t single_tile_size = 2 * 1024; uint32_t num_tiles = 2048; uint32_t dram_buffer_size = @@ -81,13 +82,15 @@ void construct_program(Program& program, IDevice* device, CoreCoord& core) { program, "tests/tt_metal/tt_metal/test_kernels/dataflow/reader_unary_push_4.cpp", core, - DataMovementConfig{.processor = DataMovementProcessor::RISCV_1, .noc = NOC::RISCV_1_default}); + tt_metal::DataMovementConfig{ + .processor = tt_metal::DataMovementProcessor::RISCV_1, .noc = tt_metal::NOC::RISCV_1_default}); auto unary_writer_kernel = tt_metal::CreateKernel( program, "tt_metal/kernels/dataflow/writer_unary.cpp", core, - DataMovementConfig{.processor = DataMovementProcessor::RISCV_0, .noc = NOC::RISCV_0_default}); + tt_metal::DataMovementConfig{ + .processor = tt_metal::DataMovementProcessor::RISCV_0, .noc = tt_metal::NOC::RISCV_0_default}); vector compute_kernel_args = { uint(num_tiles) // per_core_tile_cnt @@ -113,9 +116,10 @@ int main(int argc, char** argv) { for (unsigned int id = 0; id < num_devices; id++) { ids.push_back(id); } - tt::DevicePool::initialize(ids, 1, DEFAULT_L1_SMALL_SIZE, DEFAULT_TRACE_REGION_SIZE, DispatchCoreConfig{}); + tt::DevicePool::initialize( + ids, 1, DEFAULT_L1_SMALL_SIZE, DEFAULT_TRACE_REGION_SIZE, tt_metal::DispatchCoreConfig{}); auto devices = tt::DevicePool::instance().get_all_active_devices(); - std::vector programs; + std::vector programs; // kernel->binaries() returns 32B aligned binaries std::map> compute_binaries; std::map> brisc_binaries; @@ -127,8 +131,8 @@ int main(int argc, char** argv) { 
//////////////////////////////////////////////////////////////////////////// // Application Setup //////////////////////////////////////////////////////////////////////////// - programs.push_back(Program()); - Program& program = programs.back(); + programs.push_back(tt_metal::Program()); + tt_metal::Program& program = programs.back(); construct_program(program, device, core); @@ -136,8 +140,9 @@ int main(int argc, char** argv) { // Compile Application //////////////////////////////////////////////////////////////////////////// // Check that binary memory objects in the kernel match the ones obtained from the persistent cache - uint32_t programmable_core_index = hal.get_programmable_core_type_index(HalProgrammableCoreType::TENSIX); - const KernelGroup* kernel_group = program.kernels_on_core(core, programmable_core_index); + uint32_t programmable_core_index = + tt_metal::hal.get_programmable_core_type_index(tt_metal::HalProgrammableCoreType::TENSIX); + const tt_metal::KernelGroup* kernel_group = program.kernels_on_core(core, programmable_core_index); TT_FATAL( kernel_group != nullptr && kernel_group->kernel_ids[DISPATCH_CLASS_TENSIX_COMPUTE].has_value() and kernel_group->kernel_ids[DISPATCH_CLASS_TENSIX_DM0].has_value() and @@ -151,7 +156,8 @@ int main(int argc, char** argv) { tt_metal::detail::GetKernel(program, kernel_group->kernel_ids[DISPATCH_CLASS_TENSIX_DM1].value()); // Run iteration to get golden - uint32_t mask = BuildEnvManager::get_instance().get_device_build_env(device->build_id()).build_key; + uint32_t mask = + tt_metal::BuildEnvManager::get_instance().get_device_build_env(device->build_id()).build_key; tt_metal::detail::CompileProgram(device, program); compute_binaries.insert({mask, compute_kernel->binaries(mask)}); TT_FATAL(compute_binaries.at(mask).size() == 3, "Expected 3 Compute binaries!"); @@ -167,36 +173,38 @@ int main(int argc, char** argv) { for (int i = 0; i < num_devices; i++) { for (const auto& kernel_name : kernel_names) { 
std::filesystem::remove_all( - BuildEnvManager::get_instance() + tt_metal::BuildEnvManager::get_instance() .get_device_build_env(devices[i]->id()) .build_env.get_out_kernel_root_path() + kernel_name); } } tt_metal::detail::ClearKernelCache(); - std::vector new_programs; + std::vector new_programs; for (int i = 0; i < num_devices; i++) { auto& device = devices[i]; - new_programs.push_back(Program()); - Program& program = new_programs.back(); + new_programs.push_back(tt_metal::Program()); + tt_metal::Program& program = new_programs.back(); construct_program(program, device, core); } std::vector ths; ths.reserve(num_devices); - uint32_t dm_class_idx = magic_enum::enum_integer(HalProcessorClassType::DM); - uint32_t compute_class_idx = magic_enum::enum_integer(HalProcessorClassType::COMPUTE); + uint32_t dm_class_idx = magic_enum::enum_integer(tt_metal::HalProcessorClassType::DM); + uint32_t compute_class_idx = magic_enum::enum_integer(tt_metal::HalProcessorClassType::COMPUTE); for (int i = 0; i < num_devices; i++) { auto& device = devices[i]; auto& program = new_programs[i]; ths.emplace_back([&] { for (int j = 0; j < num_compiles; j++) { - uint32_t mask = - BuildEnvManager::get_instance().get_device_build_env(device->build_id()).build_key; + uint32_t mask = tt_metal::BuildEnvManager::get_instance() + .get_device_build_env(device->build_id()) + .build_key; tt_metal::detail::CompileProgram(device, program); uint32_t programmable_core_index = - hal.get_programmable_core_type_index(HalProgrammableCoreType::TENSIX); - const KernelGroup* kernel_group = program.kernels_on_core(core, programmable_core_index); + tt_metal::hal.get_programmable_core_type_index(tt_metal::HalProgrammableCoreType::TENSIX); + const tt_metal::KernelGroup* kernel_group = + program.kernels_on_core(core, programmable_core_index); auto compute_kernel = tt_metal::detail::GetKernel( program, kernel_group->kernel_ids[DISPATCH_CLASS_TENSIX_COMPUTE].value()); auto riscv0_kernel = tt_metal::detail::GetKernel( @@ 
-208,12 +216,12 @@ int main(int argc, char** argv) { TT_FATAL(riscv1_kernel->binaries(mask) == ncrisc_binaries.at(mask), "Error"); std::string kernel_name = get_latest_kernel_binary_path( - BuildEnvManager::get_instance() + tt_metal::BuildEnvManager::get_instance() .get_device_build_env(device->build_id()) .build_env.get_out_kernel_root_path(), riscv0_kernel); std::string brisc_hex_path = - BuildEnvManager::get_instance() + tt_metal::BuildEnvManager::get_instance() .get_kernel_build_state(device->build_id(), programmable_core_index, dm_class_idx, 0) .get_target_out_path(kernel_name); ll_api::memory const& brisc_binary = @@ -222,12 +230,12 @@ int main(int argc, char** argv) { brisc_binary == *brisc_binaries.at(mask).at(0), "Expected saved BRISC binary to be the same as binary in persistent cache"); kernel_name = get_latest_kernel_binary_path( - BuildEnvManager::get_instance() + tt_metal::BuildEnvManager::get_instance() .get_device_build_env(device->build_id()) .build_env.get_out_kernel_root_path(), riscv1_kernel); std::string ncrisc_hex_path = - BuildEnvManager::get_instance() + tt_metal::BuildEnvManager::get_instance() .get_kernel_build_state(device->build_id(), programmable_core_index, dm_class_idx, 1) .get_target_out_path(kernel_name); auto load_type = @@ -240,13 +248,13 @@ int main(int argc, char** argv) { "Expected saved NCRISC binary to be the same as binary in persistent cache"); for (int trisc_id = 0; trisc_id <= 2; trisc_id++) { kernel_name = get_latest_kernel_binary_path( - BuildEnvManager::get_instance() + tt_metal::BuildEnvManager::get_instance() .get_device_build_env(device->build_id()) .build_env.get_out_kernel_root_path(), compute_kernel); std::string trisc_id_str = std::to_string(trisc_id); std::string trisc_hex_path = - BuildEnvManager::get_instance() + tt_metal::BuildEnvManager::get_instance() .get_kernel_build_state( device->build_id(), programmable_core_index, compute_class_idx, trisc_id) .get_target_out_path(kernel_name); diff --git 
a/tests/tt_metal/tt_metal/test_dram_loopback_multi_core.cpp b/tests/tt_metal/tt_metal/test_dram_loopback_multi_core.cpp index 7849718f370..8753fa46f86 100644 --- a/tests/tt_metal/tt_metal/test_dram_loopback_multi_core.cpp +++ b/tests/tt_metal/tt_metal/test_dram_loopback_multi_core.cpp @@ -9,6 +9,7 @@ #include #include +#include "tt_metal.hpp" #include "tt_metal/test_utils/deprecated/tensor.hpp" ////////////////////////////////////////////////////////////////////////////////////// @@ -32,7 +33,8 @@ int main(int argc, char** argv) { //////////////////////////////////////////////////////////////////////////// // Device Setup //////////////////////////////////////////////////////////////////////////// - int device_id tt_metal::IDevice* device = tt_metal::CreateDevice(device_id); + int device_id = 0; + tt_metal::IDevice* device = tt_metal::CreateDevice(device_id); //////////////////////////////////////////////////////////////////////////// // Input Data Setup @@ -40,8 +42,8 @@ int main(int argc, char** argv) { std::array shape = {1, 1, 32, 1024 * 32}; uint32_t seed_from_systime = std::chrono::system_clock::now().time_since_epoch().count(); - Tensor tensor = initialize_tensor( - shape, Initialize::RANDOM, 0, 100, seed_from_systime); // TODO: make randomized! + tt::deprecated::Tensor tensor = initialize_tensor( + shape, tt::deprecated::Initialize::RANDOM, 0, 100, seed_from_systime); // TODO: make randomized! 
auto golden = tensor.get_values(); auto src_vec = pack_bfloat16_vec_into_uint32_vec(golden); diff --git a/tests/tt_metal/tt_metal/test_dram_loopback_multi_core_db.cpp b/tests/tt_metal/tt_metal/test_dram_loopback_multi_core_db.cpp index e5833262e2b..f7aaf76bac6 100644 --- a/tests/tt_metal/tt_metal/test_dram_loopback_multi_core_db.cpp +++ b/tests/tt_metal/tt_metal/test_dram_loopback_multi_core_db.cpp @@ -9,6 +9,7 @@ #include #include +#include "tt_metal.hpp" #include "tt_metal/test_utils/deprecated/tensor.hpp" ////////////////////////////////////////////////////////////////////////////////////// @@ -42,8 +43,8 @@ int main(int argc, char** argv) { std::array shape = {1, 1, 32, 1024 * 32}; uint32_t seed_from_systime = std::chrono::system_clock::now().time_since_epoch().count(); - Tensor tensor = initialize_tensor( - shape, Initialize::RANDOM, 0, 100, seed_from_systime); // TODO: make randomized! + tt::deprecated::Tensor tensor = initialize_tensor( + shape, tt::deprecated::Initialize::RANDOM, 0, 100, seed_from_systime); // TODO: make randomized! 
auto golden = tensor.get_values(); auto src_vec = pack_bfloat16_vec_into_uint32_vec(golden); diff --git a/tests/ttnn/unit_tests/gtests/ccl/test_ccl_reduce_scatter_host_helpers.cpp b/tests/ttnn/unit_tests/gtests/ccl/test_ccl_reduce_scatter_host_helpers.cpp index 0e105e9a777..0a02f432c00 100644 --- a/tests/ttnn/unit_tests/gtests/ccl/test_ccl_reduce_scatter_host_helpers.cpp +++ b/tests/ttnn/unit_tests/gtests/ccl/test_ccl_reduce_scatter_host_helpers.cpp @@ -21,6 +21,7 @@ using ttnn::ccl::cmd::CclCommandArgCode; using ttnn::ccl::cmd::CclCommandCode; using ttnn::ccl::cmd::CclCommandHeader; using shape4d = ttnn::ccl::Shape4D; + TEST(LineReduceScatter, EmitCclSendSliceSequenceCommands_8Slices_1x1x32x2048Tensor_Dim3_Slice0to7) { const std::size_t num_slices = 8; const std::int64_t start_slice_index = 0; diff --git a/tests/ttnn/unit_tests/gtests/ccl/test_erisc_data_mover_with_workers.cpp b/tests/ttnn/unit_tests/gtests/ccl/test_erisc_data_mover_with_workers.cpp index bde1f18b703..144fb0db132 100644 --- a/tests/ttnn/unit_tests/gtests/ccl/test_erisc_data_mover_with_workers.cpp +++ b/tests/ttnn/unit_tests/gtests/ccl/test_erisc_data_mover_with_workers.cpp @@ -40,9 +40,9 @@ namespace ttnn { namespace ccl { void set_edm_runtime_args( tt_metal::Program& program, - KernelHandle edm_kernel_handle, - ccl::EriscDatamoverBuilder const& edm_builder, - CoreCoord const& eth_core) { + tt_metal::KernelHandle edm_kernel_handle, + const ccl::EriscDatamoverBuilder& edm_builder, + const CoreCoord& eth_core) { std::vector const& edm_clockwise_kernel_rt_args = edm_builder.get_runtime_args(); tt_metal::SetRuntimeArgs(program, edm_kernel_handle, eth_core, edm_clockwise_kernel_rt_args); @@ -87,7 +87,7 @@ class N300TestDevice { } } - std::map devices_; + std::map devices_; tt::ARCH arch_; size_t num_devices_; @@ -99,8 +99,8 @@ struct BankedConfig { size_t num_pages; size_t size_bytes; size_t page_size_bytes; - BufferType input_buffer_type; // = BufferType::L1; - BufferType output_buffer_type; // = 
BufferType::L1; + tt_metal::BufferType input_buffer_type; // = BufferType::L1; + tt_metal::BufferType output_buffer_type; // = BufferType::L1; tt::DataFormat l1_data_format; // = tt::DataFormat::Float16_b; }; @@ -112,11 +112,11 @@ struct KernelXY { }; void generate_receiver_worker_kernels( - Program& program, - IDevice* device, - CoreCoord const& worker_core, - CoreCoord const& edm_core, - ttnn::ccl::EriscDatamoverBuilder::ChannelBufferInterface const& edm_channel, + tt_metal::Program& program, + tt_metal::IDevice* device, + const CoreCoord& worker_core, + const CoreCoord& edm_core, + const ttnn::ccl::EriscDatamoverBuilder::ChannelBufferInterface& edm_channel, uint32_t page_size, uint32_t num_pages, std::size_t num_buffers_per_edm_channel, @@ -134,7 +134,7 @@ void generate_receiver_worker_kernels( tt_metal::CircularBufferConfig(2 * num_pages_per_edm_buffer * page_size, {{src0_cb_index, df}}) .set_page_size(src0_cb_index, page_size); - CBHandle receiver_workers_cb = CreateCircularBuffer(program, worker_core, cb_src0_config); + tt_metal::CBHandle receiver_workers_cb = CreateCircularBuffer(program, worker_core, cb_src0_config); std::vector receiver_worker_writer_compile_args{ dest_is_dram, // num_pages, // @@ -194,11 +194,11 @@ void generate_receiver_worker_kernels( } void generate_sender_worker_kernels( - Program& program, - IDevice* device, - CoreCoord const& worker_core, - CoreCoord const& edm_core, - ttnn::ccl::EriscDatamoverBuilder::ChannelBufferInterface const& edm_channel, + tt_metal::Program& program, + tt_metal::IDevice* device, + const CoreCoord& worker_core, + const CoreCoord& edm_core, + const ttnn::ccl::EriscDatamoverBuilder::ChannelBufferInterface& edm_channel, uint32_t page_size, uint32_t num_pages_total, std::size_t num_buffers_per_edm_channel, @@ -248,7 +248,7 @@ void generate_sender_worker_kernels( tt_metal::CircularBufferConfig cb_src0_config = tt_metal::CircularBufferConfig(2 * num_pages_per_edm_buffer * page_size, {{src0_cb_index, df}}) 
.set_page_size(src0_cb_index, page_size); - CBHandle sender_workers_cb = CreateCircularBuffer(program, worker_core, cb_src0_config); + tt_metal::CBHandle sender_workers_cb = CreateCircularBuffer(program, worker_core, cb_src0_config); auto sender_worker_reader_kernel = tt_metal::CreateKernel( program, "tests/ttnn/unit_tests/gtests/ccl/kernels/erisc_datamover_sender_worker_reader.cpp", @@ -335,15 +335,15 @@ bool RunWriteBWTest( .num_pages = num_pages_total, .size_bytes = tensor_size_bytes, .page_size_bytes = page_size, - .input_buffer_type = src_is_dram ? BufferType::DRAM : BufferType::L1, - .output_buffer_type = dest_is_dram ? BufferType::DRAM : BufferType::L1, + .input_buffer_type = src_is_dram ? tt_metal::BufferType::DRAM : tt_metal::BufferType::L1, + .output_buffer_type = dest_is_dram ? tt_metal::BufferType::DRAM : tt_metal::BufferType::L1, .l1_data_format = tt::DataFormat::Float16_b}; - auto local_input_buffer = CreateBuffer(InterleavedBufferConfig{ + auto local_input_buffer = CreateBuffer(tt_metal::InterleavedBufferConfig{ sender_device, test_config.size_bytes, test_config.page_size_bytes, test_config.input_buffer_type}); - auto remote_input_buffer = CreateBuffer(InterleavedBufferConfig{ + auto remote_input_buffer = CreateBuffer(tt_metal::InterleavedBufferConfig{ receiver_device, test_config.size_bytes, test_config.page_size_bytes, test_config.input_buffer_type}); - bool input_is_dram = test_config.input_buffer_type == BufferType::DRAM; + bool input_is_dram = test_config.input_buffer_type == tt_metal::BufferType::DRAM; tt_metal::detail::WriteToBuffer(local_input_buffer, inputs); tt_metal::detail::WriteToBuffer(remote_input_buffer, inputs); @@ -358,21 +358,21 @@ bool RunWriteBWTest( // Clear expected value at ethernet L1 address std::vector all_zeros(inputs.size(), 0); - std::vector> local_output_buffers; - std::vector> remote_output_buffers; + std::vector> local_output_buffers; + std::vector> remote_output_buffers; for (std::size_t i = 0; i < 
num_local_sender_channels; i++) { - auto output_buffer = CreateBuffer(InterleavedBufferConfig{ + auto output_buffer = CreateBuffer(tt_metal::InterleavedBufferConfig{ receiver_device, test_config.size_bytes, test_config.page_size_bytes, test_config.output_buffer_type}); remote_output_buffers.push_back(output_buffer); } for (std::size_t i = 0; i < num_remote_sender_channels; i++) { - auto output_buffer = CreateBuffer(InterleavedBufferConfig{ + auto output_buffer = CreateBuffer(tt_metal::InterleavedBufferConfig{ sender_device, test_config.size_bytes, test_config.page_size_bytes, test_config.output_buffer_type}); local_output_buffers.push_back(output_buffer); } - bool output_is_dram = test_config.output_buffer_type == BufferType::DRAM; + bool output_is_dram = test_config.output_buffer_type == tt_metal::BufferType::DRAM; for (const auto& buffer_id : local_output_buffers) { tt_metal::detail::WriteToBuffer(buffer_id, all_zeros); } @@ -537,11 +537,11 @@ bool RunWriteBWTest( // Build EDMs //////////////////////////////////////////////////////////////////////////// auto local_edm_kernel = ttnn::ccl::generate_edm_kernel( - sender_program, sender_device, local_chip_edm_builder, eth_sender_core, NOC::NOC_0); + sender_program, sender_device, local_chip_edm_builder, eth_sender_core, tt_metal::NOC::NOC_0); set_edm_runtime_args(sender_program, local_edm_kernel, local_chip_edm_builder, eth_sender_core); auto remote_edm_kernel = ttnn::ccl::generate_edm_kernel( - receiver_program, receiver_device, remote_chip_edm_builder, eth_receiver_core, NOC::NOC_0); + receiver_program, receiver_device, remote_chip_edm_builder, eth_receiver_core, tt_metal::NOC::NOC_0); set_edm_runtime_args(receiver_program, remote_edm_kernel, remote_chip_edm_builder, eth_receiver_core); //////////////////////////////////////////////////////////////////////////// @@ -576,7 +576,7 @@ bool RunWriteBWTest( // tt::tt_metal::detail::DumpDeviceProfileResults(sender_device); log_info(tt::LogTest, "Reading back outputs"); - 
auto is_output_correct = [&all_zeros, &inputs](const std::shared_ptr& output_buffer) { + auto is_output_correct = [&all_zeros, &inputs](const std::shared_ptr& output_buffer) { constexpr bool debug_mode = false; std::vector readback_data_vec; // init to 0 data for easier debug readback_data_vec.reserve(all_zeros.size()); diff --git a/tests/ttnn/unit_tests/gtests/ccl/test_fabric_edm_common.hpp b/tests/ttnn/unit_tests/gtests/ccl/test_fabric_edm_common.hpp index 1a9465f67b7..b61e6cf2972 100644 --- a/tests/ttnn/unit_tests/gtests/ccl/test_fabric_edm_common.hpp +++ b/tests/ttnn/unit_tests/gtests/ccl/test_fabric_edm_common.hpp @@ -44,6 +44,7 @@ #include "tests/ttnn/unit_tests/gtests/ccl/test_fabric_edm_common.hpp" using namespace tt; +using namespace tt::tt_metal; using namespace tt::test_utils; using namespace tt::test_utils::df; diff --git a/tests/ttnn/unit_tests/gtests/ccl/test_fabric_erisc_data_mover_loopback_with_workers.cpp b/tests/ttnn/unit_tests/gtests/ccl/test_fabric_erisc_data_mover_loopback_with_workers.cpp index 1031f80f496..08154d4a04c 100644 --- a/tests/ttnn/unit_tests/gtests/ccl/test_fabric_erisc_data_mover_loopback_with_workers.cpp +++ b/tests/ttnn/unit_tests/gtests/ccl/test_fabric_erisc_data_mover_loopback_with_workers.cpp @@ -881,7 +881,7 @@ TEST(CclAsyncOp, ReduceScatterSmall_PersistentFabric) { from_remote_multi_device_global_semaphore, to_remote_multi_device_global_semaphore, ttnn::operations::reduction::ReduceType::Sum, - operation::DEFAULT_OUTPUT_MEMORY_CONFIG, + tt::tt_metal::operation::DEFAULT_OUTPUT_MEMORY_CONFIG, ttnn::ccl::Topology::Linear, num_links, subdevice_managers->worker_subdevice_id.at(devices[0]->id()), diff --git a/tests/ttnn/unit_tests/gtests/tensor/common_tensor_test_utils.cpp b/tests/ttnn/unit_tests/gtests/tensor/common_tensor_test_utils.cpp index d338afe5125..2d819d20066 100644 --- a/tests/ttnn/unit_tests/gtests/tensor/common_tensor_test_utils.cpp +++ b/tests/ttnn/unit_tests/gtests/tensor/common_tensor_test_utils.cpp @@ -10,7 +10,8 
@@ namespace test_utils { -void test_tensor_on_device(const ttnn::Shape& input_shape, const TensorLayout& layout, tt::tt_metal::IDevice* device) { +void test_tensor_on_device( + const ttnn::Shape& input_shape, const tt::tt_metal::TensorLayout& layout, tt::tt_metal::IDevice* device) { using namespace tt::tt_metal; const ttnn::QueueId io_cq = ttnn::DefaultQueueId; diff --git a/tests/ttnn/unit_tests/gtests/tensor/test_create_tensor.cpp b/tests/ttnn/unit_tests/gtests/tensor/test_create_tensor.cpp index 297e9816605..d4ff587d335 100644 --- a/tests/ttnn/unit_tests/gtests/tensor/test_create_tensor.cpp +++ b/tests/ttnn/unit_tests/gtests/tensor/test_create_tensor.cpp @@ -21,7 +21,7 @@ namespace { void run_create_tensor_test(tt::tt_metal::IDevice* device, const ttnn::Shape& input_shape) { MemoryConfig mem_cfg = MemoryConfig{ .memory_layout = tt::tt_metal::TensorMemoryLayout::INTERLEAVED, - .buffer_type = BufferType::DRAM, + .buffer_type = tt::tt_metal::BufferType::DRAM, .shard_spec = std::nullopt}; const ttnn::QueueId io_cq = ttnn::DefaultQueueId; diff --git a/tests/ttnn/unit_tests/gtests/tensor/test_tensor_layout.cpp b/tests/ttnn/unit_tests/gtests/tensor/test_tensor_layout.cpp index 190d4e8096e..c93da2bf8e7 100644 --- a/tests/ttnn/unit_tests/gtests/tensor/test_tensor_layout.cpp +++ b/tests/ttnn/unit_tests/gtests/tensor/test_tensor_layout.cpp @@ -15,7 +15,6 @@ #include "common_tensor_test_utils.hpp" using namespace ttnn; -using namespace tt::tt_metal; namespace { const MemoryConfig DefaultMemoryConfig{TensorMemoryLayout::INTERLEAVED, BufferType::DRAM, std::nullopt}; @@ -27,9 +26,9 @@ struct Inputs { }; struct Expected { - Shape2D physical_size; - Alignment alignment; - Strides strides; + tt::tt_metal::Shape2D physical_size; + tt::tt_metal::Alignment alignment; + tt::tt_metal::Strides strides; bool tensor_creation_works = true; }; @@ -63,55 +62,64 @@ INSTANTIATE_TEST_SUITE_P( Inputs{.shape = ttnn::Shape{5, 4, 3, 2}, .data_type = DataType::BFLOAT16, .layout = Layout::TILE}, 
Expected{ .physical_size = {5 * 4 * 32, 32}, - .alignment = Alignment({32, 32}), - .strides = Strides({32 * 3 * 4, 32 * 3, 32, 1})}}, + .alignment = tt::tt_metal::Alignment({32, 32}), + .strides = tt::tt_metal::Strides({32 * 3 * 4, 32 * 3, 32, 1})}}, // Row Major, bfloat16, requires padding to 2 TensorLayoutTestParams{ Inputs{.shape = ttnn::Shape{6, 5, 4, 3}, .data_type = DataType::BFLOAT16, .layout = Layout::ROW_MAJOR}, Expected{ .physical_size = {6 * 5 * 4, 3}, - .alignment = Alignment({1}), - .strides = Strides({5 * 4 * 3, 4 * 3, 3, 1})}}, + .alignment = tt::tt_metal::Alignment({1}), + .strides = tt::tt_metal::Strides({5 * 4 * 3, 4 * 3, 3, 1})}}, // Row Major, uint32 TensorLayoutTestParams{ Inputs{.shape = ttnn::Shape{6, 5, 4, 3}, .data_type = DataType::UINT32, .layout = Layout::ROW_MAJOR}, Expected{ .physical_size = {6 * 5 * 4, 3}, - .alignment = Alignment({1}), - .strides = Strides({5 * 4 * 3, 4 * 3, 3, 1})}}, + .alignment = tt::tt_metal::Alignment({1}), + .strides = tt::tt_metal::Strides({5 * 4 * 3, 4 * 3, 3, 1})}}, // Row Major, bfloat16, requires padding to 2, aligned TensorLayoutTestParams{ Inputs{.shape = ttnn::Shape{6, 5, 4, 8}, .data_type = DataType::BFLOAT16, .layout = Layout::ROW_MAJOR}, Expected{ .physical_size = {6 * 5 * 4, 8}, - .alignment = Alignment({1}), - .strides = Strides({5 * 4 * 8, 4 * 8, 8, 1})}}, + .alignment = tt::tt_metal::Alignment({1}), + .strides = tt::tt_metal::Strides({5 * 4 * 8, 4 * 8, 8, 1})}}, // Tile, 1 element TensorLayoutTestParams{ Inputs{.shape = ttnn::Shape{1, 1, 1, 1}, .data_type = DataType::BFLOAT16, .layout = Layout::TILE}, - Expected{.physical_size = {32, 32}, .alignment = Alignment({32, 32}), .strides = Strides({32, 32, 32, 1})}}, + Expected{ + .physical_size = {32, 32}, + .alignment = tt::tt_metal::Alignment({32, 32}), + .strides = tt::tt_metal::Strides({32, 32, 32, 1})}}, // Row Major, 1 element TensorLayoutTestParams{ Inputs{.shape = ttnn::Shape{1, 1, 1, 1}, .data_type = DataType::BFLOAT16, .layout = 
Layout::ROW_MAJOR}, - Expected{.physical_size = {1, 1}, .alignment = Alignment({1}), .strides = Strides({1, 1, 1, 1})}}, + Expected{ + .physical_size = {1, 1}, + .alignment = tt::tt_metal::Alignment({1}), + .strides = tt::tt_metal::Strides({1, 1, 1, 1})}}, // Row Major, uint32_t 1 element TensorLayoutTestParams{ Inputs{.shape = ttnn::Shape{1, 1, 1, 1}, .data_type = DataType::UINT32, .layout = Layout::ROW_MAJOR}, - Expected{.physical_size = {1, 1}, .alignment = Alignment({1}), .strides = Strides({1, 1, 1, 1})}}, + Expected{ + .physical_size = {1, 1}, + .alignment = tt::tt_metal::Alignment({1}), + .strides = tt::tt_metal::Strides({1, 1, 1, 1})}}, // Rank 0, RM, in bfloat16 needs additional padding to 4 bytes TensorLayoutTestParams{ Inputs{.shape = ttnn::Shape{}, .data_type = DataType::BFLOAT16, .layout = Layout::ROW_MAJOR}, Expected{ .physical_size = {1, 1}, - .alignment = Alignment({1}), - .strides = Strides({}), + .alignment = tt::tt_metal::Alignment({1}), + .strides = tt::tt_metal::Strides({}), .tensor_creation_works = false}}, // Rank 0, RM, in uint32_t needs no additional padding @@ -119,8 +127,8 @@ INSTANTIATE_TEST_SUITE_P( Inputs{.shape = ttnn::Shape{}, .data_type = DataType::UINT32, .layout = Layout::ROW_MAJOR}, Expected{ .physical_size = {1, 1}, - .alignment = Alignment({1}), - .strides = Strides({}), + .alignment = tt::tt_metal::Alignment({1}), + .strides = tt::tt_metal::Strides({}), .tensor_creation_works = false}}, // Rank 0, Tile @@ -128,8 +136,8 @@ INSTANTIATE_TEST_SUITE_P( Inputs{.shape = ttnn::Shape{}, .data_type = DataType::BFLOAT16, .layout = Layout::TILE}, Expected{ .physical_size = {32, 32}, - .alignment = Alignment({32, 32}), - .strides = Strides({}), + .alignment = tt::tt_metal::Alignment({32, 32}), + .strides = tt::tt_metal::Strides({}), .tensor_creation_works = false}}, // Rank 1, RM, bfloat16 @@ -137,8 +145,8 @@ INSTANTIATE_TEST_SUITE_P( Inputs{.shape = ttnn::Shape{1}, .data_type = DataType::BFLOAT16, .layout = Layout::ROW_MAJOR}, Expected{ 
.physical_size = {1, 1}, - .alignment = Alignment({1}), - .strides = Strides({1}), + .alignment = tt::tt_metal::Alignment({1}), + .strides = tt::tt_metal::Strides({1}), .tensor_creation_works = false}}, // Rank 1, RM, uint32 @@ -146,14 +154,17 @@ INSTANTIATE_TEST_SUITE_P( Inputs{.shape = ttnn::Shape{1}, .data_type = DataType::UINT32, .layout = Layout::ROW_MAJOR}, Expected{ .physical_size = {1, 1}, - .alignment = Alignment({1}), - .strides = Strides({1}), + .alignment = tt::tt_metal::Alignment({1}), + .strides = tt::tt_metal::Strides({1}), .tensor_creation_works = false}}, // Rank 1, Tile TensorLayoutTestParams{ Inputs{.shape = ttnn::Shape{1}, .data_type = DataType::BFLOAT16, .layout = Layout::TILE}, - Expected{.physical_size = {32, 32}, .alignment = Alignment({32, 32}), .strides = Strides({1})}})); + Expected{ + .physical_size = {32, 32}, + .alignment = tt::tt_metal::Alignment({32, 32}), + .strides = tt::tt_metal::Strides({1})}})); struct LegacyPaddingRoundtripTestParams { Shape shape; diff --git a/tests/ttnn/unit_tests/gtests/tensor/test_xtensor_conversion.cpp b/tests/ttnn/unit_tests/gtests/tensor/test_xtensor_conversion.cpp index 473389c55cd..0e4df78553e 100644 --- a/tests/ttnn/unit_tests/gtests/tensor/test_xtensor_conversion.cpp +++ b/tests/ttnn/unit_tests/gtests/tensor/test_xtensor_conversion.cpp @@ -5,6 +5,7 @@ #include #include +#include "ttnn/operations/functions.hpp" #include "ttnn/tensor/tensor.hpp" #include "ttnn/tensor/types.hpp" #include "ttnn/tensor/xtensor/conversion_utils.hpp" @@ -22,7 +23,8 @@ using ::ttnn::experimental::xtensor::to_xtensor; using ::ttnn::experimental::xtensor::xtensor_to_span; TensorSpec get_tensor_spec(const ttnn::Shape& shape) { - return TensorSpec(shape, TensorLayout(DataType::FLOAT32, Layout::ROW_MAJOR, MemoryConfig{})); + return TensorSpec( + shape, TensorLayout(tt::tt_metal::DataType::FLOAT32, tt::tt_metal::Layout::ROW_MAJOR, MemoryConfig{})); } TEST(XtensorConversionTest, SpanToXtensor) { diff --git 
a/tests/ttnn/unit_tests/gtests/test_to_and_from_json.cpp b/tests/ttnn/unit_tests/gtests/test_to_and_from_json.cpp index deb4afbc12c..d6a99cb29cc 100644 --- a/tests/ttnn/unit_tests/gtests/test_to_and_from_json.cpp +++ b/tests/ttnn/unit_tests/gtests/test_to_and_from_json.cpp @@ -41,10 +41,10 @@ INSTANTIATE_TEST_SUITE_P( ttnn::MemoryConfig{ .memory_layout = ttnn::TensorMemoryLayout::WIDTH_SHARDED, .buffer_type = ttnn::BufferType::DRAM, - .shard_spec = ShardSpec( + .shard_spec = tt::tt_metal::ShardSpec( CoreRangeSet{std::set{CoreRange{CoreCoord{1, 2}, CoreCoord{7, 4}}}}, {32, 128}, - ShardOrientation::ROW_MAJOR + tt::tt_metal::ShardOrientation::ROW_MAJOR ) } }, @@ -53,11 +53,11 @@ INSTANTIATE_TEST_SUITE_P( ttnn::MemoryConfig{ .memory_layout = ttnn::TensorMemoryLayout::BLOCK_SHARDED, .buffer_type = ttnn::BufferType::DRAM, - .shard_spec = ShardSpec( + .shard_spec = tt::tt_metal::ShardSpec( CoreRangeSet{std::set{CoreRange{CoreCoord{0, 0}, CoreCoord{7, 4}}}}, {5, 6}, - ShardOrientation::ROW_MAJOR, - ShardMode::LOGICAL + tt::tt_metal::ShardOrientation::ROW_MAJOR, + tt::tt_metal::ShardMode::LOGICAL ) } }, @@ -66,11 +66,11 @@ INSTANTIATE_TEST_SUITE_P( ttnn::MemoryConfig{ .memory_layout = ttnn::TensorMemoryLayout::HEIGHT_SHARDED, .buffer_type = ttnn::BufferType::L1, - .shard_spec = ShardSpec( + .shard_spec = tt::tt_metal::ShardSpec( CoreRangeSet{std::set{CoreRange{CoreCoord{0, 0}, CoreCoord{7, 7}}}}, {3, 4}, {32, 32}, - ShardOrientation::COL_MAJOR + tt::tt_metal::ShardOrientation::COL_MAJOR ) } } diff --git a/tests/ttnn/unit_tests/gtests/ttnn_multi_command_queue_fixture.hpp b/tests/ttnn/unit_tests/gtests/ttnn_multi_command_queue_fixture.hpp index 98893a9ae22..f4911439f9b 100644 --- a/tests/ttnn/unit_tests/gtests/ttnn_multi_command_queue_fixture.hpp +++ b/tests/ttnn/unit_tests/gtests/ttnn_multi_command_queue_fixture.hpp @@ -11,6 +11,8 @@ #include #include +using namespace tt::tt_metal; // For test + namespace ttnn { class MultiCommandQueueSingleDeviceFixture : public 
::testing::Test { diff --git a/tests/ttnn/unit_tests/gtests/ttnn_test_fixtures.hpp b/tests/ttnn/unit_tests/gtests/ttnn_test_fixtures.hpp index c4ad28babc8..c889afb6b1a 100644 --- a/tests/ttnn/unit_tests/gtests/ttnn_test_fixtures.hpp +++ b/tests/ttnn/unit_tests/gtests/ttnn_test_fixtures.hpp @@ -18,6 +18,8 @@ #include "hostdevcommon/common_values.hpp" #include +using namespace tt::tt_metal; // For test + namespace ttnn { class TTNNFixture : public ::testing::Test { diff --git a/tt-train/sources/examples/nano_gpt/main.cpp b/tt-train/sources/examples/nano_gpt/main.cpp index e65491f282f..644f164fa19 100644 --- a/tt-train/sources/examples/nano_gpt/main.cpp +++ b/tt-train/sources/examples/nano_gpt/main.cpp @@ -268,11 +268,11 @@ void generate( prompt_tokens_padded[i - start_idx] = prompt_tokens[i]; } auto prompt_tokens_padded_size = static_cast(prompt_tokens_padded.size()); - auto prompt_tensor = ttml::autograd::create_tensor(ttml::core::from_vector( + auto prompt_tensor = ttml::autograd::create_tensor(ttml::core::from_vector( prompt_tokens_padded, ttml::core::create_shape({1, 1, 1, prompt_tokens_padded_size}), device, - Layout::ROW_MAJOR)); + ttnn::Layout::ROW_MAJOR)); // Forward pass // 'output' shape is presumably [batch=1, 1, seq_len, vocab_size] or something similar @@ -551,21 +551,26 @@ int main(int argc, char **argv) { auto data_xtensor = xt::adapt(data, {batch_size, 1U, 1U, sequence_length}); auto data_composer = ttml::core::ShardXTensorToMesh(device->shape(), 0); auto data_tensor = - ttml::autograd::create_tensor(ttml::core::from_xtensor( - data_xtensor, device, data_composer, Layout::ROW_MAJOR)); + ttml::autograd::create_tensor(ttml::core::from_xtensor( + data_xtensor, device, data_composer, ttnn::Layout::ROW_MAJOR)); auto targets_xtensor = xt::adapt(targets, {batch_size * sequence_length}); auto targets_composer = ttml::core::ShardXTensorToMesh(device->shape(), 0); - auto targets_tt_tensor = - ttml::core::from_xtensor(targets_xtensor, device, targets_composer); 
+ auto targets_tt_tensor = ttml::core::from_xtensor( + targets_xtensor, device, targets_composer); auto targets_tensor = ttml::autograd::create_tensor(targets_tt_tensor); return {data_tensor, targets_tensor}; } - auto data_tensor = ttml::autograd::create_tensor(ttml::core::from_vector( - data, ttml::core::create_shape({batch_size, 1, 1, sequence_length}), device, Layout::ROW_MAJOR)); - auto targets_tensor = ttml::autograd::create_tensor(ttml::core::from_vector( - targets, ttnn::Shape({batch_size * sequence_length}), device)); + auto data_tensor = + ttml::autograd::create_tensor(ttml::core::from_vector( + data, + ttml::core::create_shape({batch_size, 1, 1, sequence_length}), + device, + ttnn::Layout::ROW_MAJOR)); + auto targets_tensor = + ttml::autograd::create_tensor(ttml::core::from_vector( + targets, ttnn::Shape({batch_size * sequence_length}), device)); return {data_tensor, targets_tensor}; }; diff --git a/tt-train/sources/examples/sample_app/main.cpp b/tt-train/sources/examples/sample_app/main.cpp index 49f78f16a32..a3d3aa2b7ad 100644 --- a/tt-train/sources/examples/sample_app/main.cpp +++ b/tt-train/sources/examples/sample_app/main.cpp @@ -71,7 +71,7 @@ int main() { // Now we create a tensor with the buffer we just created auto x = tt::tt_metal::Tensor( // Let the tensor take ownership of the buffer - OwnedStorage{std::move(buffer)}, + tt::tt_metal::OwnedStorage{std::move(buffer)}, // IMPORTANT: SHAPE MUST BE 4D ELSE EVERYTHING WILL BREAK during the PAD operation ttnn::Shape({1, 1, tensor_width, tensor_height}), // The data type of the tensor diff --git a/tt-train/sources/ttml/autograd/autocast_tensor.cpp b/tt-train/sources/ttml/autograd/autocast_tensor.cpp index 263d718ad02..47ad15b0838 100644 --- a/tt-train/sources/ttml/autograd/autocast_tensor.cpp +++ b/tt-train/sources/ttml/autograd/autocast_tensor.cpp @@ -9,7 +9,7 @@ namespace { inline bool is_castable_tensor(const tt::tt_metal::Tensor &tensor) { - return tensor.get_dtype() == DataType::FLOAT32; + return 
tensor.get_dtype() == ttnn::DataType::FLOAT32; } } // namespace @@ -17,9 +17,9 @@ inline bool is_castable_tensor(const tt::tt_metal::Tensor &tensor) { namespace ttml::autograd { void AutocastTensor::set_tensor(const tt::tt_metal::Tensor &tensor) { - if (tensor.get_dtype() == DataType::FLOAT32) { + if (tensor.get_dtype() == ttnn::DataType::FLOAT32) { m_full_precision_tensor = tensor; - m_half_precision_tensor = ttnn::typecast(tensor, DataType::BFLOAT16); + m_half_precision_tensor = ttnn::typecast(tensor, ttnn::DataType::BFLOAT16); return; } diff --git a/tt-train/sources/ttml/core/mesh_device.cpp b/tt-train/sources/ttml/core/mesh_device.cpp index 079604b0d9b..eb48d64dd25 100644 --- a/tt-train/sources/ttml/core/mesh_device.cpp +++ b/tt-train/sources/ttml/core/mesh_device.cpp @@ -12,7 +12,7 @@ MeshDevice::MeshDevice(tt::tt_metal::distributed::MeshShape shape) : DEFAULT_L1_SMALL_SIZE, DEFAULT_TRACE_REGION_SIZE, /* num_command_queues*/ 1, - DispatchCoreConfig{})) { + tt::tt_metal::DispatchCoreConfig{})) { assert(m_mesh_device); } diff --git a/tt-train/sources/ttml/core/tt_tensor_utils.cpp b/tt-train/sources/ttml/core/tt_tensor_utils.cpp index 9f808e151f1..2b31f52faea 100644 --- a/tt-train/sources/ttml/core/tt_tensor_utils.cpp +++ b/tt-train/sources/ttml/core/tt_tensor_utils.cpp @@ -55,21 +55,21 @@ void print_tensor_stats_(const tt::tt_metal::Tensor& tensor, const std::string& // copypaste from deprecated tensor pybinds ttnn tt::tt_metal::OwnedBuffer create_owned_buffer_from_vector_of_floats( - const std::vector& data, DataType data_type) { + const std::vector& data, ttnn::DataType data_type) { switch (data_type) { - case DataType::BFLOAT8_B: { + case ttnn::DataType::BFLOAT8_B: { auto uint32_vector = pack_fp32_vec_as_bfp8_tiles(data, /*row_major_input=*/false, /*is_exp_a=*/false); return tt::tt_metal::owned_buffer::create(std::move(uint32_vector)); } - case DataType::BFLOAT4_B: { + case ttnn::DataType::BFLOAT4_B: { auto uint32_vector = pack_fp32_vec_as_bfp4_tiles(data, 
/*row_major_input=*/false, /*is_exp_a=*/false); return tt::tt_metal::owned_buffer::create(std::move(uint32_vector)); } - case DataType::FLOAT32: { + case ttnn::DataType::FLOAT32: { auto data_copy = data; return tt::tt_metal::owned_buffer::create(std::move(data_copy)); } - case DataType::BFLOAT16: { + case ttnn::DataType::BFLOAT16: { std::vector bfloat16_data(data.size()); std::transform(std::begin(data), std::end(data), std::begin(bfloat16_data), [](float value) { return bfloat16(value); @@ -86,7 +86,7 @@ template tt::tt_metal::Tensor ttml_create_owned_tensor( std::vector&& data, const ttnn::Shape& shape, tt::tt_metal::DataType data_type, tt::tt_metal::Layout layout) { auto buffer = tt::tt_metal::owned_buffer::create(std::move(data)); - auto storage = OwnedStorage{std::move(buffer)}; + auto storage = ttnn::OwnedStorage{std::move(buffer)}; return {std::move(storage), shape, data_type, layout}; } @@ -102,27 +102,27 @@ tt::tt_metal::Tensor ones_like(const tt::tt_metal::Tensor& tensor) { } tt::tt_metal::Tensor empty( - const ttnn::Shape& shape, ttnn::distributed::MeshDevice* device, const MemoryConfig& memory_config) { - return ttnn::empty(shape, DataType::BFLOAT16, Layout::TILE, device, memory_config); + const ttnn::Shape& shape, ttnn::distributed::MeshDevice* device, const ttnn::MemoryConfig& memory_config) { + return ttnn::empty(shape, ttnn::DataType::BFLOAT16, ttnn::Layout::TILE, device, memory_config); } tt::tt_metal::Tensor full( - const ttnn::Shape& shape, float value, ttnn::distributed::MeshDevice* device, DataType dtype) { - return ttnn::full(shape, value, dtype, Layout::TILE, std::ref(*device)); + const ttnn::Shape& shape, float value, ttnn::distributed::MeshDevice* device, ttnn::DataType dtype) { + return ttnn::full(shape, value, dtype, ttnn::Layout::TILE, std::ref(*device)); } -tt::tt_metal::Tensor zeros(const ttnn::Shape& shape, ttnn::distributed::MeshDevice* device, DataType dtype) { +tt::tt_metal::Tensor zeros(const ttnn::Shape& shape, 
ttnn::distributed::MeshDevice* device, ttnn::DataType dtype) { return core::full(shape, 0.F, device, dtype); } -tt::tt_metal::Tensor ones(const ttnn::Shape& shape, ttnn::distributed::MeshDevice* device, DataType dtype) { +tt::tt_metal::Tensor ones(const ttnn::Shape& shape, ttnn::distributed::MeshDevice* device, ttnn::DataType dtype) { return core::full(shape, 1.F, device, dtype); } -template +template [[nodiscard]] tt::tt_metal::Tensor from_xtensors_to_host( const std::vector>& buffers, const std::unordered_map& config) { - std::vector host_owned_buffers; + std::vector host_owned_buffers; std::vector host_owned_specs; host_owned_buffers.reserve(buffers.size()); host_owned_specs.reserve(buffers.size()); @@ -150,31 +150,34 @@ template host_owned_buffers.push_back(owned_buffer); } - host_owned_specs.push_back( - TensorSpec(shape, TensorLayout(TensorType, PageConfig(Layout::ROW_MAJOR), MemoryConfig{}))); + host_owned_specs.push_back(ttnn::TensorSpec( + shape, ttnn::TensorLayout(TensorType, ttnn::PageConfig(ttnn::Layout::ROW_MAJOR), ttnn::MemoryConfig{}))); } - auto distributed_tensor_config = get_distributed_tensor_config(config); + auto distributed_tensor_config = tt::tt_metal::get_distributed_tensor_config(config); auto storage = tt::tt_metal::MultiDeviceHostStorage( distributed_tensor_config, std::move(host_owned_buffers), host_owned_specs); // remove possible paddings from the shape (it conflicts with ROW MAJOR) - auto output = Tensor(std::move(storage), host_owned_specs[0]); + auto output = ttnn::Tensor(std::move(storage), host_owned_specs[0]); return output; } -template tt::tt_metal::Tensor from_xtensors_to_host( +template tt::tt_metal::Tensor from_xtensors_to_host( const std::vector>& buffers, const std::unordered_map& config); -template tt::tt_metal::Tensor from_xtensors_to_host( +template tt::tt_metal::Tensor from_xtensors_to_host( const std::vector>& buffers, const std::unordered_map& config); template tt::tt_metal::Tensor from_xtensors_to_host( const 
std::vector>& buffers, const std::unordered_map& config); template <> -tt::tt_metal::Tensor from_vector( - const std::vector& buffer, const ttnn::Shape& shape, ttnn::distributed::MeshDevice* device, Layout layout) { +tt::tt_metal::Tensor from_vector( + const std::vector& buffer, + const ttnn::Shape& shape, + ttnn::distributed::MeshDevice* device, + ttnn::Layout layout) { assert(device != nullptr); - const DataType data_type = DataType::BFLOAT16; - MemoryConfig output_mem_config{}; + const ttnn::DataType data_type = ttnn::DataType::BFLOAT16; + ttnn::MemoryConfig output_mem_config{}; size_t volume = shape.volume(); if (buffer.size() != volume) { throw std::logic_error( @@ -182,17 +185,17 @@ tt::tt_metal::Tensor from_vector( } auto owned_buffer = create_owned_buffer_from_vector_of_floats(buffer, data_type); // remove possible paddings from the shape (it conflicts with ROW MAJOR) - auto output = tt::tt_metal::Tensor(OwnedStorage{owned_buffer}, shape, data_type, Layout::ROW_MAJOR); + auto output = tt::tt_metal::Tensor(ttnn::OwnedStorage{owned_buffer}, shape, data_type, ttnn::Layout::ROW_MAJOR); const size_t MAX_TILE_DIMENSION = 16384; // Temporary workaround for the issue with tilize for large size // https://github.com/tenstorrent/tt-metal/issues/15950 - if (shape[-1] >= MAX_TILE_DIMENSION && layout == Layout::TILE) { - output = ttnn::to_layout(output, Layout::TILE, std::nullopt, output_mem_config, device); + if (shape[-1] >= MAX_TILE_DIMENSION && layout == ttnn::Layout::TILE) { + output = ttnn::to_layout(output, ttnn::Layout::TILE, std::nullopt, output_mem_config, device); output = ttnn::to_device(output, device, output_mem_config); } else { output = ttnn::to_device(output, device, output_mem_config); - if (layout == Layout::TILE) { + if (layout == ttnn::Layout::TILE) { output = ttnn::tilize_with_zero_padding(output, output_mem_config, std::nullopt, /* multicore */ true); } } @@ -203,22 +206,25 @@ tt::tt_metal::Tensor from_vector( // Workaround implementation due to 
issue with tilize for float32 // it is expected that tilize will be fixed in the after next tt-metal main update template <> -tt::tt_metal::Tensor from_vector( - const std::vector& buffer, const ttnn::Shape& shape, ttnn::distributed::MeshDevice* device, Layout layout) { - auto tensor = from_vector(buffer, shape, device, layout); - return ttnn::typecast(tensor, DataType::FLOAT32); +tt::tt_metal::Tensor from_vector( + const std::vector& buffer, + const ttnn::Shape& shape, + ttnn::distributed::MeshDevice* device, + ttnn::Layout layout) { + auto tensor = from_vector(buffer, shape, device, layout); + return ttnn::typecast(tensor, ttnn::DataType::FLOAT32); } /* From vector uint32 doesn't support tilize_with_zero_padding on device */ template <> -tt::tt_metal::Tensor from_vector( +tt::tt_metal::Tensor from_vector( const std::vector& buffer, const ttnn::Shape& shape, ttnn::distributed::MeshDevice* device, - Layout layout) { - MemoryConfig output_mem_config{}; + ttnn::Layout layout) { + ttnn::MemoryConfig output_mem_config{}; auto volume = shape.volume(); if (buffer.size() != volume) { throw std::logic_error( @@ -227,9 +233,10 @@ tt::tt_metal::Tensor from_vector( // remove possible paddings from the shape (it conflicts with ROW MAJOR) std::vector buffer_copy = buffer; - auto output = ttml_create_owned_tensor(std::move(buffer_copy), shape, DataType::UINT32, Layout::ROW_MAJOR); + auto output = + ttml_create_owned_tensor(std::move(buffer_copy), shape, ttnn::DataType::UINT32, ttnn::Layout::ROW_MAJOR); if (device != nullptr) { - if (layout != Layout::ROW_MAJOR) { + if (layout != ttnn::Layout::ROW_MAJOR) { output = ttnn::to_layout(output, layout, std::nullopt, output_mem_config, device); } output = ttnn::to_device(output, device, output_mem_config); @@ -242,12 +249,12 @@ tt::tt_metal::Tensor from_vector( From vector int32 doesn't support tilize_with_zero_padding on device */ template <> -tt::tt_metal::Tensor from_vector( +tt::tt_metal::Tensor from_vector( const std::vector& 
buffer, const ttnn::Shape& shape, ttnn::distributed::MeshDevice* device, - Layout layout) { - MemoryConfig output_mem_config{}; + ttnn::Layout layout) { + ttnn::MemoryConfig output_mem_config{}; auto volume = shape.volume(); if (buffer.size() != volume) { throw std::logic_error( @@ -256,9 +263,10 @@ tt::tt_metal::Tensor from_vector( // remove possible paddings from the shape (it conflicts with ROW MAJOR) std::vector buffer_copy = buffer; - auto output = ttml_create_owned_tensor(std::move(buffer_copy), shape, DataType::INT32, Layout::ROW_MAJOR); + auto output = + ttml_create_owned_tensor(std::move(buffer_copy), shape, ttnn::DataType::INT32, ttnn::Layout::ROW_MAJOR); if (device != nullptr) { - if (layout != Layout::ROW_MAJOR) { + if (layout != ttnn::Layout::ROW_MAJOR) { output = ttnn::to_layout(output, layout, std::nullopt, output_mem_config, device); } output = ttnn::to_device(output, device, output_mem_config); @@ -276,54 +284,54 @@ ttnn::Shape create_shape(const std::array& args) { } void print_tensor_stats(const tt::tt_metal::Tensor& tensor, const std::string& name) { - if (tensor.get_dtype() == DataType::BFLOAT16 || tensor.get_dtype() == DataType::FLOAT32) { + if (tensor.get_dtype() == ttnn::DataType::BFLOAT16 || tensor.get_dtype() == ttnn::DataType::FLOAT32) { print_tensor_stats_(tensor, name); } else { print_tensor_stats_(tensor, name); } } -template +template tt::tt_metal::Tensor from_xtensor( const xt::xarray& tensor, ttnn::distributed::MeshDevice* device, const XTensorToMeshVariant& composer, - Layout layout) { + ttnn::Layout layout) { auto sharded_tensors = std::visit([&tensor](auto&& arg) { return arg.map(tensor); }, composer); auto config = std::visit([](auto&& arg) { return arg.config(); }, composer); auto output = from_xtensors_to_host(sharded_tensors, config); - MemoryConfig output_mem_config{}; + ttnn::MemoryConfig output_mem_config{}; if constexpr (std::is_same_v || std::is_same_v) { - if (layout != Layout::ROW_MAJOR) { + if (layout != 
ttnn::Layout::ROW_MAJOR) { output = ttnn::to_layout(output, layout, std::nullopt, output_mem_config, device); } output = ttnn::to_device(output, device, output_mem_config); } else { output = ttnn::to_device(output, device, output_mem_config); - if (layout == Layout::TILE) { + if (layout == ttnn::Layout::TILE) { output = ttnn::tilize_with_zero_padding(output, output_mem_config, std::nullopt, /* multicore */ true); } } return output; } -template tt::tt_metal::Tensor from_xtensor( +template tt::tt_metal::Tensor from_xtensor( const xt::xarray& tensor, ttnn::distributed::MeshDevice* device, const XTensorToMeshVariant& composer, - Layout layout); + ttnn::Layout layout); -template tt::tt_metal::Tensor from_xtensor( +template tt::tt_metal::Tensor from_xtensor( const xt::xarray& tensor, ttnn::distributed::MeshDevice* device, const XTensorToMeshVariant& composer, - Layout layout); + ttnn::Layout layout); -template tt::tt_metal::Tensor from_xtensor( +template tt::tt_metal::Tensor from_xtensor( const xt::xarray& tensor, ttnn::distributed::MeshDevice* device, const XTensorToMeshVariant& composer, - Layout layout); + ttnn::Layout layout); } // namespace ttml::core diff --git a/tt-train/sources/ttml/core/tt_tensor_utils.hpp b/tt-train/sources/ttml/core/tt_tensor_utils.hpp index 1c4d02326f0..04c79144320 100644 --- a/tt-train/sources/ttml/core/tt_tensor_utils.hpp +++ b/tt-train/sources/ttml/core/tt_tensor_utils.hpp @@ -18,22 +18,25 @@ tt::tt_metal::Tensor zeros_like(const tt::tt_metal::Tensor& tensor); tt::tt_metal::Tensor ones_like(const tt::tt_metal::Tensor& tensor); tt::tt_metal::Tensor empty( - const ttnn::Shape& shape, ttnn::distributed::MeshDevice* device, const MemoryConfig& memory_config); + const ttnn::Shape& shape, ttnn::distributed::MeshDevice* device, const ttnn::MemoryConfig& memory_config); tt::tt_metal::Tensor full( - const ttnn::Shape& shape, float value, ttnn::distributed::MeshDevice* device, DataType dtype = DataType::BFLOAT16); + const ttnn::Shape& shape, + float 
value, + ttnn::distributed::MeshDevice* device, + ttnn::DataType dtype = ttnn::DataType::BFLOAT16); tt::tt_metal::Tensor zeros( - const ttnn::Shape& shape, ttnn::distributed::MeshDevice* device, DataType dtype = DataType::BFLOAT16); + const ttnn::Shape& shape, ttnn::distributed::MeshDevice* device, ttnn::DataType dtype = ttnn::DataType::BFLOAT16); tt::tt_metal::Tensor ones( - const ttnn::Shape& shape, ttnn::distributed::MeshDevice* device, DataType dtype = DataType::BFLOAT16); + const ttnn::Shape& shape, ttnn::distributed::MeshDevice* device, ttnn::DataType dtype = ttnn::DataType::BFLOAT16); -template +template [[nodiscard]] tt::tt_metal::Tensor from_vector( const std::vector& buffer, const ttnn::Shape& shape, ttnn::distributed::MeshDevice* device, - Layout layout = Layout::TILE); + ttnn::Layout layout = ttnn::Layout::TILE); -template +template [[nodiscard]] tt::tt_metal::Tensor from_xtensors_to_host( const std::vector>& buffers, const std::unordered_map& config); @@ -46,9 +49,9 @@ template [[nodiscard]] ttnn::Shape create_shape(const std::array& args); -template +template [[nodiscard]] tt::tt_metal::Tensor from_xtensor( - const xt::xarray& buffer, ttnn::distributed::MeshDevice* device, Layout layout = Layout::TILE) { + const xt::xarray& buffer, ttnn::distributed::MeshDevice* device, ttnn::Layout layout = ttnn::Layout::TILE) { auto shape = ttnn::experimental::xtensor::get_shape_from_xarray(buffer); auto buffer_view = xtensor_to_span(buffer); return from_vector(std::vector(buffer_view.begin(), buffer_view.end()), shape, device, layout); @@ -66,7 +69,7 @@ template template auto to_xtensor(const tt::tt_metal::Tensor& tensor, const MeshToXTensorVariant& composer) { auto cpu_tensor = tensor.cpu(); - cpu_tensor = cpu_tensor.to_layout(Layout::ROW_MAJOR); + cpu_tensor = cpu_tensor.to_layout(ttnn::Layout::ROW_MAJOR); auto cpu_tensors = ttnn::distributed::get_device_tensors(cpu_tensor); std::vector> res; res.reserve(cpu_tensors.size()); @@ -76,11 +79,11 @@ auto 
to_xtensor(const tt::tt_metal::Tensor& tensor, const MeshToXTensorVariant +template tt::tt_metal::Tensor from_xtensor( const xt::xarray& tensor, ttnn::distributed::MeshDevice* device, const XTensorToMeshVariant& composer, - Layout layout = Layout::TILE); + ttnn::Layout layout = ttnn::Layout::TILE); } // namespace ttml::core diff --git a/tt-train/sources/ttml/models/distributed/gpt2.cpp b/tt-train/sources/ttml/models/distributed/gpt2.cpp index 5207cf730d6..c5519e665da 100644 --- a/tt-train/sources/ttml/models/distributed/gpt2.cpp +++ b/tt-train/sources/ttml/models/distributed/gpt2.cpp @@ -78,7 +78,7 @@ void weights_initialization(DistributedTransformer& model) { core::XTensorToMeshVariant shard_composer = core::ShardXTensorToMesh(device->shape(), 0); auto weight_xtensor = init::normal_init(tensor_shape, {0.F, 0.02F}); tensor_ptr->set_value( - core::from_xtensor(weight_xtensor, device, shard_composer)); + core::from_xtensor(weight_xtensor, device, shard_composer)); } else if (name.find("bias") != std::string::npos) { init::constant_init(tensor_ptr, tensor.get_logical_shape(), 0.F); } diff --git a/tt-train/sources/ttml/modules/distributed/linear.cpp b/tt-train/sources/ttml/modules/distributed/linear.cpp index 0f3683ff9e5..dd1a678f909 100644 --- a/tt-train/sources/ttml/modules/distributed/linear.cpp +++ b/tt-train/sources/ttml/modules/distributed/linear.cpp @@ -60,8 +60,8 @@ void RowParallelLinear::initialize_tensors(uint32_t in_features, uint32_t out_fe ttml::core::XTensorToMeshVariant shard_composer = ttml::core::ShardXTensorToMesh(mesh_shape, rank - 1U); auto weight = init::uniform_init(weight_shape, init::UniformRange{-init_k, init_k}); - m_weight = - autograd::create_tensor(ttml::core::from_xtensor(weight, device, shard_composer)); + m_weight = autograd::create_tensor( + ttml::core::from_xtensor(weight, device, shard_composer)); if (has_bias) { auto bias_shape = core::create_shape({1, 1, 1, out_features}); @@ -110,16 +110,16 @@ void 
ColumnParallelLinear::initialize_tensors(uint32_t in_features, uint32_t out ttml::core::XTensorToMeshVariant shard_composer = ttml::core::ShardXTensorToMesh(mesh_shape, rank - 2U); auto weight = init::uniform_init(weight_shape, init::UniformRange{-init_k, init_k}); - m_weight = - autograd::create_tensor(ttml::core::from_xtensor(weight, device, shard_composer)); + m_weight = autograd::create_tensor( + ttml::core::from_xtensor(weight, device, shard_composer)); if (has_bias) { auto bias_shape = core::create_shape({1, 1, 1, out_features}); auto bias = init::uniform_init(bias_shape, init::UniformRange{-init_k, init_k}); ttml::core::XTensorToMeshVariant shard_composer = ttml::core::ShardXTensorToMesh(mesh_shape, rank - 1U); - m_bias = - autograd::create_tensor(ttml::core::from_xtensor(bias, device, shard_composer)); + m_bias = autograd::create_tensor( + ttml::core::from_xtensor(bias, device, shard_composer)); } } diff --git a/tt-train/sources/ttml/ops/embedding_op.cpp b/tt-train/sources/ttml/ops/embedding_op.cpp index ef6841bd979..dd60d860c44 100644 --- a/tt-train/sources/ttml/ops/embedding_op.cpp +++ b/tt-train/sources/ttml/ops/embedding_op.cpp @@ -17,7 +17,8 @@ autograd::TensorPtr embedding_op(const autograd::TensorPtr& tensor, const autogr auto weight_tensor = weight->get_value(); weight_tensor = ttnn::untilize(weight_tensor); - auto embeddings = ttnn::embedding(tensor->get_value(), weight_tensor, /* pad_token */ std::nullopt, Layout::TILE); + auto embeddings = + ttnn::embedding(tensor->get_value(), weight_tensor, /* pad_token */ std::nullopt, ttnn::Layout::TILE); auto embeddings_shape = embeddings.get_logical_shape(); auto batch_size = embeddings_shape[0]; auto sentence_size = embeddings_shape[1]; diff --git a/tt-train/sources/ttml/serialization/serialization.cpp b/tt-train/sources/ttml/serialization/serialization.cpp index acd8dd579e4..474a6c6bc29 100644 --- a/tt-train/sources/ttml/serialization/serialization.cpp +++ 
b/tt-train/sources/ttml/serialization/serialization.cpp @@ -121,8 +121,8 @@ void read_ttnn_tensor(MsgPackFile& file, std::string_view name, tt::tt_metal::Te } else if (data_type == tt::tt_metal::DataType::UINT32) { std::vector data; file.get(std::string(name) + "/data", data); - tensor = - core::from_vector(data, shape, &ttml::autograd::ctx().get_device(), layout); + tensor = core::from_vector( + data, shape, &ttml::autograd::ctx().get_device(), layout); } else { throw std::runtime_error(fmt::format("Unsupported data type: {}", magic_enum::enum_name(data_type))); } diff --git a/tt-train/tests/autograd/autograd_tensor.cpp b/tt-train/tests/autograd/autograd_tensor.cpp index 54851d7b085..182fb2d8dc1 100644 --- a/tt-train/tests/autograd/autograd_tensor.cpp +++ b/tt-train/tests/autograd/autograd_tensor.cpp @@ -25,31 +25,32 @@ class AutogradTensorTest : public ::testing::Test { }; TEST_F(AutogradTensorTest, AutogradTensorFLOAT32) { - auto tensor = autograd::create_tensor( - core::ones(core::create_shape({1, 1, 1, 32}), &autograd::ctx().get_device(), DataType::FLOAT32)); + auto tensor = autograd::create_tensor(ttml::core::ones( + ttml::core::create_shape({1, 1, 1, 32}), &autograd::ctx().get_device(), ttnn::DataType::FLOAT32)); const auto& half_precision_tensor = tensor->get_value(); const auto& full_precision_tensor = tensor->get_value(autograd::PreferredPrecision::FULL); - EXPECT_EQ(half_precision_tensor.dtype(), DataType::BFLOAT16); - EXPECT_EQ(full_precision_tensor.dtype(), DataType::FLOAT32); + EXPECT_EQ(half_precision_tensor.dtype(), ttnn::DataType::BFLOAT16); + EXPECT_EQ(full_precision_tensor.dtype(), ttnn::DataType::FLOAT32); } TEST_F(AutogradTensorTest, AutogradTensorBFLOAT16) { - auto tensor = autograd::create_tensor( - core::ones(core::create_shape({1, 1, 1, 32}), &autograd::ctx().get_device(), DataType::BFLOAT16)); + auto tensor = autograd::create_tensor(ttml::core::ones( + ttml::core::create_shape({1, 1, 1, 32}), &autograd::ctx().get_device(), 
ttnn::DataType::BFLOAT16)); const auto& half_precision_tensor = tensor->get_value(); const auto& full_precision_tensor = tensor->get_value(autograd::PreferredPrecision::FULL); - EXPECT_EQ(half_precision_tensor.dtype(), DataType::BFLOAT16); - EXPECT_EQ(full_precision_tensor.dtype(), DataType::BFLOAT16); + EXPECT_EQ(half_precision_tensor.dtype(), ttnn::DataType::BFLOAT16); + EXPECT_EQ(full_precision_tensor.dtype(), ttnn::DataType::BFLOAT16); } TEST_F(AutogradTensorTest, AutocastTensor) { - auto tt_tensor = core::ones(core::create_shape({1, 1, 1, 32}), &autograd::ctx().get_device(), DataType::FLOAT32); + auto tt_tensor = ttml::core::ones( + ttml::core::create_shape({1, 1, 1, 32}), &autograd::ctx().get_device(), ttnn::DataType::FLOAT32); auto autocast_tensor = autograd::AutocastTensor(tt_tensor); const auto& half_precision_tensor = autocast_tensor.get_tensor(); const auto& full_precision_tensor = autocast_tensor.get_tensor(autograd::PreferredPrecision::FULL); - EXPECT_EQ(half_precision_tensor.dtype(), DataType::BFLOAT16); - EXPECT_EQ(full_precision_tensor.dtype(), DataType::FLOAT32); + EXPECT_EQ(half_precision_tensor.dtype(), ttnn::DataType::BFLOAT16); + EXPECT_EQ(full_precision_tensor.dtype(), ttnn::DataType::FLOAT32); } diff --git a/tt-train/tests/core/n300_utils_test.cpp b/tt-train/tests/core/n300_utils_test.cpp index b2bfd8116e3..0bdfbcb49bc 100644 --- a/tt-train/tests/core/n300_utils_test.cpp +++ b/tt-train/tests/core/n300_utils_test.cpp @@ -13,6 +13,8 @@ #include "core/distributed_mapping.hpp" #include "core/tt_tensor_utils.hpp" +using namespace ttml; + auto check_board_is_n300() { return tt_ClusterDescriptor::create()->get_board_type(0) == BoardType::N300; } @@ -39,7 +41,7 @@ TEST_F(N300UtilsTest, TestXTensorReplicateInt32) { xt::xarray xtensor = test_data.reshape({1, 1, 1, 3}); ttml::core::XTensorToMeshVariant replicate_composer = ttml::core::ReplicateXTensorToMesh(mesh_shape); - auto tensor = ttml::core::from_xtensor(xtensor, device, replicate_composer); + 
auto tensor = ttml::core::from_xtensor(xtensor, device, replicate_composer); ttml::core::MeshToXTensorVariant identity_composer = ttml::core::VectorMeshToXTensor(mesh_shape); auto xtensors_back = ttml::core::to_xtensor(tensor, identity_composer); @@ -54,7 +56,7 @@ TEST_F(N300UtilsTest, TestXTensorReplicateUInt32) { xt::xarray xtensor = test_data.reshape({1, 1, 1, 3}); ttml::core::XTensorToMeshVariant replicate_composer = ttml::core::ReplicateXTensorToMesh(mesh_shape); - auto tensor = ttml::core::from_xtensor(xtensor, device, replicate_composer); + auto tensor = ttml::core::from_xtensor(xtensor, device, replicate_composer); ttml::core::MeshToXTensorVariant identity_composer = ttml::core::VectorMeshToXTensor(mesh_shape); auto xtensors_back = ttml::core::to_xtensor(tensor, identity_composer); diff --git a/tt-train/tests/core/tensor_utils_test.cpp b/tt-train/tests/core/tensor_utils_test.cpp index 8a05fbf6802..02f633ac0c4 100644 --- a/tt-train/tests/core/tensor_utils_test.cpp +++ b/tt-train/tests/core/tensor_utils_test.cpp @@ -74,7 +74,7 @@ TEST_F(TensorUtilsTest, TestUint32ToFromTensorEven) { std::vector test_data = {1, 5, 10, 15}; auto shape = ttml::core::create_shape({1, 1, 1, 4}); - auto tensor = ttml::core::from_vector(test_data, shape, device); + auto tensor = ttml::core::from_vector(test_data, shape, device); auto vec_back = ttml::core::to_vector(tensor); @@ -89,7 +89,7 @@ TEST_F(TensorUtilsTest, TestUint32ToFromTensorOdd) { std::vector test_data = {30, 20, 2}; auto shape = ttml::core::create_shape({1, 1, 1, 3}); - auto tensor = ttml::core::from_vector(test_data, shape, device); + auto tensor = ttml::core::from_vector(test_data, shape, device); auto vec_back = ttml::core::to_vector(tensor); @@ -109,7 +109,7 @@ TEST_F(TensorUtilsTest, TestUint32ToFromTensorLargeWithBatch) { } auto shape = ttml::core::create_shape({batch_size, 1, 1, vec_size / batch_size}); - auto tensor = ttml::core::from_vector(test_data, shape, device); + auto tensor = 
ttml::core::from_vector(test_data, shape, device); auto vec_back = ttml::core::to_vector(tensor); ASSERT_EQ(vec_back.size(), test_data.size()); for (size_t i = 0; i < test_data.size(); i++) { @@ -261,7 +261,7 @@ TEST_F(TensorUtilsTest, TestUint32XTensor) { auto shape = ttml::core::create_shape({1, 1, 1, 3}); xt::xarray xtensor = ttml::core::span_to_xtensor_view(std::span{test_data.data(), test_data.size()}, shape); - auto tensor = ttml::core::from_xtensor(xtensor, device); + auto tensor = ttml::core::from_xtensor(xtensor, device); auto xtensor_back = ttml::core::to_xtensor(tensor); diff --git a/tt-train/tests/model/nano_gpt_test.cpp b/tt-train/tests/model/nano_gpt_test.cpp index b0dcf77742c..3de1efa8a15 100644 --- a/tt-train/tests/model/nano_gpt_test.cpp +++ b/tt-train/tests/model/nano_gpt_test.cpp @@ -30,6 +30,7 @@ class NanoGPTTest : public ::testing::Test { }; using ttml::autograd::TensorPtr; +using namespace ttml; using DatasetSample = std::pair, std::span>; // tokens, targets, mask @@ -126,9 +127,9 @@ void train_test(bool use_moreh_adamw = false, bool memory_efficient = false) { auto end_timer = std::chrono::high_resolution_clock::now(); auto duration = std::chrono::duration_cast(end_timer - start_timer).count(); fmt::print("dataloader host only step time {} ms\n", (double)duration / 1000.); - auto data_tensor = ttml::autograd::create_tensor(ttml::core::from_vector( - data, ttml::core::create_shape({batch_size, 1, 1, sequence_length}), device, Layout::ROW_MAJOR)); - auto targets_tensor = ttml::autograd::create_tensor(ttml::core::from_vector( + auto data_tensor = ttml::autograd::create_tensor(ttml::core::from_vector( + data, ttml::core::create_shape({batch_size, 1, 1, sequence_length}), device, ttnn::Layout::ROW_MAJOR)); + auto targets_tensor = ttml::autograd::create_tensor(ttml::core::from_vector( targets, ttnn::Shape({batch_size * sequence_length}), device)); end_timer = std::chrono::high_resolution_clock::now(); duration = 
std::chrono::duration_cast(end_timer - start_timer).count(); diff --git a/tt-train/tests/modules/distributed/linear_test.cpp b/tt-train/tests/modules/distributed/linear_test.cpp index 240725f0035..ced35f62d45 100644 --- a/tt-train/tests/modules/distributed/linear_test.cpp +++ b/tt-train/tests/modules/distributed/linear_test.cpp @@ -65,7 +65,7 @@ TEST_F(N300TensorParallelLinearTest, RowParallelLinearHasBiasNotInputParallel) { xt::xarray test_data = xt::random::rand({in_features}, 0.F, 1.F).reshape({1U, 1U, 1U, in_features}); ttml::core::XTensorToMeshVariant replicate_composer = ttml::core::ReplicateXTensorToMesh(mesh_shape); - auto tt_tensor = ttml::core::from_xtensor(test_data, device, replicate_composer); + auto tt_tensor = ttml::core::from_xtensor(test_data, device, replicate_composer); auto tensor = ttml::autograd::create_tensor(tt_tensor); auto output = layer(tensor); @@ -107,7 +107,7 @@ TEST_F(N300TensorParallelLinearTest, RowParallelLinearNoBiasNotInputParallel) { xt::xarray test_data = xt::random::rand({in_features}, 0.F, 1.F).reshape({1U, 1U, 1U, in_features}); ttml::core::XTensorToMeshVariant replicate_composer = ttml::core::ReplicateXTensorToMesh(mesh_shape); - auto tt_tensor = ttml::core::from_xtensor(test_data, device, replicate_composer); + auto tt_tensor = ttml::core::from_xtensor(test_data, device, replicate_composer); auto tensor = ttml::autograd::create_tensor(tt_tensor); auto output = layer(tensor); @@ -145,7 +145,7 @@ TEST_F(N300TensorParallelLinearTest, RowParallelLinearHasBiasInputParallel) { xt::xarray test_data = xt::random::rand({in_features}, 0.F, 1.F).reshape({1U, 1U, 1U, in_features}); ttml::core::XTensorToMeshVariant shard_composer = ttml::core::ShardXTensorToMesh(mesh_shape, 3); - auto tt_tensor = ttml::core::from_xtensor(test_data, device, shard_composer); + auto tt_tensor = ttml::core::from_xtensor(test_data, device, shard_composer); auto tensor = ttml::autograd::create_tensor(tt_tensor); auto output = layer(tensor); @@ -183,7 +183,7 
@@ TEST_F(N300TensorParallelLinearTest, RowParallelLinearNoBiasInputParallel) { xt::xarray test_data = xt::random::rand({in_features}, 0.F, 1.F).reshape({1U, 1U, 1U, in_features}); ttml::core::XTensorToMeshVariant shard_composer = ttml::core::ShardXTensorToMesh(mesh_shape, 3); - auto tt_tensor = ttml::core::from_xtensor(test_data, device, shard_composer); + auto tt_tensor = ttml::core::from_xtensor(test_data, device, shard_composer); auto tensor = ttml::autograd::create_tensor(tt_tensor); auto output = layer(tensor); @@ -218,7 +218,7 @@ TEST_F(N300TensorParallelLinearTest, ColumnParallelLinearHasBiasAllGather) { xt::xarray test_data = xt::random::rand({in_features}, 0.F, 1.F).reshape({1U, 1U, 1U, in_features}); ttml::core::XTensorToMeshVariant replicate_composer = ttml::core::ReplicateXTensorToMesh(mesh_shape); - auto tt_tensor = ttml::core::from_xtensor(test_data, device, replicate_composer); + auto tt_tensor = ttml::core::from_xtensor(test_data, device, replicate_composer); auto tensor = ttml::autograd::create_tensor(tt_tensor); auto output = layer(tensor); @@ -258,7 +258,7 @@ TEST_F(N300TensorParallelLinearTest, ColumnParallelLinearNoBiasAllGather) { xt::xarray test_data = xt::random::rand({in_features}, 0.F, 1.F).reshape({1U, 1U, 1U, in_features}); ttml::core::XTensorToMeshVariant replicate_composer = ttml::core::ReplicateXTensorToMesh(mesh_shape); - auto tt_tensor = ttml::core::from_xtensor(test_data, device, replicate_composer); + auto tt_tensor = ttml::core::from_xtensor(test_data, device, replicate_composer); auto tensor = ttml::autograd::create_tensor(tt_tensor); auto output = layer(tensor); @@ -294,7 +294,7 @@ TEST_F(N300TensorParallelLinearTest, ColumnParallelLinearHasBiasNoAllGather) { xt::xarray test_data = xt::random::rand({in_features}, 0.F, 1.F).reshape({1U, 1U, 1U, in_features}); ttml::core::XTensorToMeshVariant replicate_composer = ttml::core::ReplicateXTensorToMesh(mesh_shape); - auto tt_tensor = ttml::core::from_xtensor(test_data, device, 
replicate_composer); + auto tt_tensor = ttml::core::from_xtensor(test_data, device, replicate_composer); auto tensor = ttml::autograd::create_tensor(tt_tensor); auto output = layer(tensor); @@ -342,7 +342,7 @@ TEST_F(N300TensorParallelLinearTest, ColumnParallelLinearNoBiasNoAllGather) { xt::xarray test_data = xt::random::rand({in_features}, 0.F, 1.F).reshape({1U, 1U, 1U, in_features}); ttml::core::XTensorToMeshVariant replicate_composer = ttml::core::ReplicateXTensorToMesh(mesh_shape); - auto tt_tensor = ttml::core::from_xtensor(test_data, device, replicate_composer); + auto tt_tensor = ttml::core::from_xtensor(test_data, device, replicate_composer); auto tensor = ttml::autograd::create_tensor(tt_tensor); auto output = layer(tensor); @@ -391,7 +391,7 @@ TEST_F(N300TensorParallelLinearTest, RowParallelLinearHasBiasNanoGPT) { xt::xarray test_data = xt::random::rand({in_features * batch_size * sequence_length}, -1.F, 1.F) .reshape({batch_size, 1U, sequence_length, in_features}); ttml::core::XTensorToMeshVariant replicate_composer = ttml::core::ReplicateXTensorToMesh(mesh_shape); - auto tt_tensor = ttml::core::from_xtensor(test_data, device, replicate_composer); + auto tt_tensor = ttml::core::from_xtensor(test_data, device, replicate_composer); auto tensor = ttml::autograd::create_tensor(tt_tensor); auto output = layer(tensor); output->backward(); @@ -458,7 +458,7 @@ TEST_F(N300TensorParallelLinearTest, ColumnParallelLinearHasBiasNanoGPT) { xt::xarray test_data = xt::random::rand({in_features * batch_size * sequence_length}, -1.F, 1.F) .reshape({batch_size, 1U, sequence_length, in_features}); ttml::core::XTensorToMeshVariant replicate_composer = ttml::core::ReplicateXTensorToMesh(mesh_shape); - auto tt_tensor = ttml::core::from_xtensor(test_data, device, replicate_composer); + auto tt_tensor = ttml::core::from_xtensor(test_data, device, replicate_composer); auto tensor = ttml::autograd::create_tensor(tt_tensor); auto output = layer(tensor); output->backward(); @@ 
-525,7 +525,7 @@ TEST_F(N300TensorParallelLinearTest, ColumnParallelLinearNoBiasNanoGPT) { xt::xarray test_data = xt::random::rand({in_features * batch_size * sequence_length}, -1.F, 1.F) .reshape({batch_size, 1U, sequence_length, in_features}); ttml::core::XTensorToMeshVariant replicate_composer = ttml::core::ReplicateXTensorToMesh(mesh_shape); - auto tt_tensor = ttml::core::from_xtensor(test_data, device, replicate_composer); + auto tt_tensor = ttml::core::from_xtensor(test_data, device, replicate_composer); auto tensor = ttml::autograd::create_tensor(tt_tensor); auto output = layer(tensor); output->backward(); diff --git a/tt-train/tests/ops/distributed/comm_ops_test.cpp b/tt-train/tests/ops/distributed/comm_ops_test.cpp index 317955507e4..fda50c10f89 100644 --- a/tt-train/tests/ops/distributed/comm_ops_test.cpp +++ b/tt-train/tests/ops/distributed/comm_ops_test.cpp @@ -48,7 +48,7 @@ TEST_F(N300CommOpsTest, TestAllReduceNotFullyTiled) { xt::xarray test_data = xt::adapt(test_data_vec); xt::xarray xtensor = test_data.reshape({1U, 1U, 1U, size}); ttml::core::XTensorToMeshVariant shard_composer = ttml::core::ShardXTensorToMesh(mesh_shape, 3); - auto tt_tensor = ttml::core::from_xtensor(xtensor, device, shard_composer); + auto tt_tensor = ttml::core::from_xtensor(xtensor, device, shard_composer); auto tensor = ttml::autograd::create_tensor(tt_tensor); auto all_reduce_tensor = ttml::ops::distributed::all_reduce(tensor); @@ -64,7 +64,8 @@ TEST_F(N300CommOpsTest, TestAllReduceNotFullyTiled) { xt::xarray grad_data = xt::random::rand(all_reduce_expected.shape(), 0.F, 1.F); ttml::core::XTensorToMeshVariant replicate_composer = ttml::core::ReplicateXTensorToMesh(mesh_shape); - auto tt_grad_tensor = ttml::core::from_xtensor(grad_data, device, replicate_composer); + auto tt_grad_tensor = + ttml::core::from_xtensor(grad_data, device, replicate_composer); all_reduce_tensor->set_grad(tt_grad_tensor); all_reduce_tensor->backward(); @@ -102,7 +103,7 @@ TEST_F(N300CommOpsTest, 
TestAllReduceNanoGPT) { xt::xarray test_data = xt::adapt(test_data_vec); xt::xarray xtensor = test_data.reshape({batch, 1U, height, size}); ttml::core::XTensorToMeshVariant shard_composer = ttml::core::ShardXTensorToMesh(mesh_shape, 3); - auto tt_tensor = ttml::core::from_xtensor(xtensor, device, shard_composer); + auto tt_tensor = ttml::core::from_xtensor(xtensor, device, shard_composer); auto tensor = ttml::autograd::create_tensor(tt_tensor); auto all_reduce_tensor = ttml::ops::distributed::all_reduce(tensor); @@ -118,7 +119,8 @@ TEST_F(N300CommOpsTest, TestAllReduceNanoGPT) { xt::xarray grad_data = xt::random::rand(all_reduce_expected.shape(), 0.F, 1.F); ttml::core::XTensorToMeshVariant replicate_composer = ttml::core::ReplicateXTensorToMesh(mesh_shape); - auto tt_grad_tensor = ttml::core::from_xtensor(grad_data, device, replicate_composer); + auto tt_grad_tensor = + ttml::core::from_xtensor(grad_data, device, replicate_composer); all_reduce_tensor->set_grad(tt_grad_tensor); all_reduce_tensor->backward(); @@ -150,7 +152,7 @@ TEST_F(N300CommOpsTest, TestAllReduceFullyTiled) { xt::xarray test_data = xt::adapt(test_data_vec); xt::xarray xtensor = test_data.reshape({1U, 1U, height, size}); ttml::core::XTensorToMeshVariant shard_composer = ttml::core::ShardXTensorToMesh(mesh_shape, 3); - auto tt_tensor = ttml::core::from_xtensor(xtensor, device, shard_composer); + auto tt_tensor = ttml::core::from_xtensor(xtensor, device, shard_composer); auto tensor = ttml::autograd::create_tensor(tt_tensor); auto all_reduce_tensor = ttml::ops::distributed::all_reduce(tensor); @@ -166,7 +168,8 @@ TEST_F(N300CommOpsTest, TestAllReduceFullyTiled) { xt::xarray grad_data = xt::random::rand(all_reduce_expected.shape(), 0.F, 1.F); ttml::core::XTensorToMeshVariant replicate_composer = ttml::core::ReplicateXTensorToMesh(mesh_shape); - auto tt_grad_tensor = ttml::core::from_xtensor(grad_data, device, replicate_composer); + auto tt_grad_tensor = + ttml::core::from_xtensor(grad_data, device, 
replicate_composer); all_reduce_tensor->set_grad(tt_grad_tensor); all_reduce_tensor->backward(); @@ -197,7 +200,7 @@ TEST_F(N300CommOpsTest, TestAllGatherNotFullyTiled) { xt::xarray test_data = xt::adapt(test_data_vec); xt::xarray xtensor = test_data.reshape({1U, 1U, 1U, size}); ttml::core::XTensorToMeshVariant shard_composer = ttml::core::ShardXTensorToMesh(mesh_shape, 3); - auto tt_tensor = ttml::core::from_xtensor(xtensor, device, shard_composer); + auto tt_tensor = ttml::core::from_xtensor(xtensor, device, shard_composer); auto tensor = ttml::autograd::create_tensor(tt_tensor); auto gathered_tensor = ttml::ops::distributed::all_gather(tensor, 3); @@ -208,7 +211,8 @@ TEST_F(N300CommOpsTest, TestAllGatherNotFullyTiled) { xt::xarray grad_data = xt::random::rand(xtensor.shape(), 0.F, 1.F); ttml::core::XTensorToMeshVariant replicate_composer = ttml::core::ReplicateXTensorToMesh(mesh_shape); - auto tt_grad_tensor = ttml::core::from_xtensor(grad_data, device, replicate_composer); + auto tt_grad_tensor = + ttml::core::from_xtensor(grad_data, device, replicate_composer); gathered_tensor->set_grad(tt_grad_tensor); gathered_tensor->backward(); @@ -241,7 +245,7 @@ TEST_F(N300CommOpsTest, TestAllGatherFullyTiled) { xt::xarray test_data = xt::adapt(test_data_vec); xt::xarray xtensor = test_data.reshape({batch, 1U, height, size}); ttml::core::XTensorToMeshVariant shard_composer = ttml::core::ShardXTensorToMesh(mesh_shape, 3); - auto tt_tensor = ttml::core::from_xtensor(xtensor, device, shard_composer); + auto tt_tensor = ttml::core::from_xtensor(xtensor, device, shard_composer); auto tensor = ttml::autograd::create_tensor(tt_tensor); auto gathered_tensor = ttml::ops::distributed::all_gather(tensor, 3); @@ -252,7 +256,8 @@ TEST_F(N300CommOpsTest, TestAllGatherFullyTiled) { xt::xarray grad_data = xt::random::rand(xtensor.shape(), 0.F, 1.F); ttml::core::XTensorToMeshVariant replicate_composer = ttml::core::ReplicateXTensorToMesh(mesh_shape); - auto tt_grad_tensor = 
ttml::core::from_xtensor(grad_data, device, replicate_composer); + auto tt_grad_tensor = + ttml::core::from_xtensor(grad_data, device, replicate_composer); gathered_tensor->set_grad(tt_grad_tensor); gathered_tensor->backward(); @@ -283,7 +288,7 @@ TEST_F(N300CommOpsTest, TestScatterNotFullyTiled) { xt::xarray test_data = xt::adapt(test_data_vec); xt::xarray xtensor = test_data.reshape({1U, 1U, 1U, size}); ttml::core::XTensorToMeshVariant replicate_composer = ttml::core::ReplicateXTensorToMesh(mesh_shape); - auto tt_tensor = ttml::core::from_xtensor(xtensor, device, replicate_composer); + auto tt_tensor = ttml::core::from_xtensor(xtensor, device, replicate_composer); auto tensor = ttml::autograd::create_tensor(tt_tensor); auto scattered_tensor = ttml::ops::distributed::scatter(tensor, 3); @@ -298,7 +303,7 @@ TEST_F(N300CommOpsTest, TestScatterNotFullyTiled) { // check backward xt::xarray grad_data = xt::random::rand(xtensor.shape(), 0.F, 1.F); ttml::core::XTensorToMeshVariant shard_composer = ttml::core::ShardXTensorToMesh(mesh_shape, 3); - auto tt_grad_tensor = ttml::core::from_xtensor(grad_data, device, shard_composer); + auto tt_grad_tensor = ttml::core::from_xtensor(grad_data, device, shard_composer); scattered_tensor->set_grad(tt_grad_tensor); scattered_tensor->backward(); @@ -328,7 +333,7 @@ TEST_F(N300CommOpsTest, TestScatterFullyTiled) { ttml::core::MeshToXTensorVariant identity_composer = ttml::core::VectorMeshToXTensor(mesh_shape); ttml::core::XTensorToMeshVariant replicate_composer = ttml::core::ReplicateXTensorToMesh(mesh_shape); - auto tt_tensor = ttml::core::from_xtensor(xtensor, device, replicate_composer); + auto tt_tensor = ttml::core::from_xtensor(xtensor, device, replicate_composer); auto xtensor_after_replication = ttml::core::to_xtensor(tt_tensor, identity_composer); EXPECT_TRUE(xt::allclose(xtensor, xtensor_after_replication[0], /* rtol */ 1e-3, /* atol */ 1e-2)); diff --git a/tt-train/tests/ops/embedding_op_test.cpp 
b/tt-train/tests/ops/embedding_op_test.cpp index 9394a44a3e3..98e42e3fe3d 100644 --- a/tt-train/tests/ops/embedding_op_test.cpp +++ b/tt-train/tests/ops/embedding_op_test.cpp @@ -30,15 +30,15 @@ TEST_F(EmbeddingOpTest, EmbeddingForwardBackward) { auto* device = &autograd::ctx().get_device(); uint32_t num_embeddings = 32; uint32_t embedding_dim = 32; - auto weight_tensor = core::zeros(core::create_shape({1, 1, num_embeddings, embedding_dim}), device); + auto weight_tensor = ttml::core::zeros(ttml::core::create_shape({1, 1, num_embeddings, embedding_dim}), device); autograd::TensorPtr weight = autograd::create_tensor(weight_tensor); uint32_t batch_size = 1; uint32_t sentence_size = 32; std::vector input_data((size_t)batch_size * sentence_size); std::iota(input_data.begin(), input_data.end(), 0U); - auto input_tensor = core::from_vector( - input_data, core::create_shape({batch_size, 1, 1, sentence_size}), device, Layout::ROW_MAJOR); + auto input_tensor = ttml::core::from_vector( + input_data, ttml::core::create_shape({batch_size, 1, 1, sentence_size}), device, ttnn::Layout::ROW_MAJOR); autograd::TensorPtr input = autograd::create_tensor(input_tensor); autograd::TensorPtr embeddings = ops::embedding_op(input, weight); @@ -49,13 +49,13 @@ TEST_F(EmbeddingOpTest, EmbeddingForwardBackward) { target_vector[embedding_dim * i + j] = static_cast(i); } } - auto target_tensor = autograd::create_tensor( - core::from_vector(target_vector, core::create_shape({batch_size, 1, sentence_size, embedding_dim}), device)); + auto target_tensor = autograd::create_tensor(ttml::core::from_vector( + target_vector, ttml::core::create_shape({batch_size, 1, sentence_size, embedding_dim}), device)); auto result = ttml::ops::mse_loss(embeddings, target_tensor); result->backward(); auto weight_grad_tensor = weight->get_grad(); - auto weight_grad_data = core::to_vector(weight_grad_tensor); + auto weight_grad_data = ttml::core::to_vector(weight_grad_tensor); for (uint32_t i = 0; i < num_embeddings; 
i++) { for (uint32_t j = 0; j < embedding_dim; j++) { EXPECT_NEAR( @@ -67,40 +67,42 @@ TEST_F(EmbeddingOpTest, EmbeddingForwardBackward) { } TEST_F(EmbeddingOpTest, EmbeddingNumEmbeddingsEmbeddingDimNotDivisibleBy32) { + using namespace ttnn; using namespace ttml; auto* device = &autograd::ctx().get_device(); uint32_t num_embeddings = 13; uint32_t embedding_dim = 26; - auto weight_tensor = core::zeros(core::create_shape({1, 1, num_embeddings, embedding_dim}), device); + auto weight_tensor = ttml::core::zeros(ttml::core::create_shape({1, 1, num_embeddings, embedding_dim}), device); autograd::TensorPtr weight = autograd::create_tensor(weight_tensor); uint32_t batch_size = 1; uint32_t sentence_size = 32; std::vector input_data((size_t)batch_size * sentence_size); std::iota(input_data.begin(), input_data.end(), 0U); - auto input_tensor = core::from_vector( - input_data, core::create_shape({batch_size, 1, 1, sentence_size}), device, Layout::ROW_MAJOR); + auto input_tensor = ttml::core::from_vector( + input_data, ttml::core::create_shape({batch_size, 1, 1, sentence_size}), device, Layout::ROW_MAJOR); autograd::TensorPtr input = autograd::create_tensor(input_tensor); EXPECT_NO_THROW(ops::embedding_op(input, weight)); } TEST_F(EmbeddingOpTest, EmbeddingSentenceDimNotDivisibleBy32) { + using namespace ttnn; using namespace ttml; auto* device = &autograd::ctx().get_device(); uint32_t num_embeddings = 32; uint32_t embedding_dim = 32; - auto weight_tensor = core::zeros(core::create_shape({1, 1, num_embeddings, embedding_dim}), device); + auto weight_tensor = ttml::core::zeros(ttml::core::create_shape({1, 1, num_embeddings, embedding_dim}), device); autograd::TensorPtr weight = autograd::create_tensor(weight_tensor); uint32_t batch_size = 1; uint32_t sentence_size = 13; std::vector input_data((size_t)batch_size * sentence_size); std::iota(input_data.begin(), input_data.end(), 0U); - auto input_tensor = core::from_vector( - input_data, core::create_shape({batch_size, 1, 1, 
sentence_size}), device, Layout::ROW_MAJOR); + auto input_tensor = ttml::core::from_vector( + input_data, ttml::core::create_shape({batch_size, 1, 1, sentence_size}), device, Layout::ROW_MAJOR); autograd::TensorPtr input = autograd::create_tensor(input_tensor); EXPECT_NO_THROW(ops::embedding_op(input, weight)); diff --git a/tt_metal/api/tt-metalium/command_queue_interface.hpp b/tt_metal/api/tt-metalium/command_queue_interface.hpp index 30de4f2e631..c9121012948 100644 --- a/tt_metal/api/tt-metalium/command_queue_interface.hpp +++ b/tt_metal/api/tt-metalium/command_queue_interface.hpp @@ -19,9 +19,6 @@ #include "buffer.hpp" #include "umd/device/tt_core_coordinates.h" -// FIXME: Don't do this in header files -using namespace tt::tt_metal; - namespace tt::tt_metal { enum class CommandQueueDeviceAddrType : uint8_t { diff --git a/tt_metal/api/tt-metalium/device_pool.hpp b/tt_metal/api/tt-metalium/device_pool.hpp index 9712ca527bd..3becae3e6f6 100644 --- a/tt_metal/api/tt-metalium/device_pool.hpp +++ b/tt_metal/api/tt-metalium/device_pool.hpp @@ -22,12 +22,12 @@ namespace tt { namespace tt_metal::detail { -void CloseDevices(const std::map& devices); +void CloseDevices(const std::map& devices); } // namespace tt_metal::detail class DevicePool { - friend void tt_metal::detail::CloseDevices(const std::map& devices); + friend void tt_metal::detail::CloseDevices(const std::map& devices); public: DevicePool& operator=(const DevicePool&) = delete; @@ -48,13 +48,13 @@ class DevicePool { const tt_metal::DispatchCoreConfig& dispatch_core_config, tt::stl::Span l1_bank_remap = {}) noexcept; - IDevice* get_active_device(chip_id_t device_id) const; - std::vector get_all_active_devices() const; + tt_metal::IDevice* get_active_device(chip_id_t device_id) const; + std::vector get_all_active_devices() const; bool close_device(chip_id_t device_id); - void close_devices(const std::vector& devices); + void close_devices(const std::vector& devices); bool is_device_active(chip_id_t id) const; - 
void register_worker_thread_for_device(IDevice* device, std::thread::id worker_thread_id); - void unregister_worker_thread_for_device(IDevice* device); + void register_worker_thread_for_device(tt_metal::IDevice* device, std::thread::id worker_thread_id); + void unregister_worker_thread_for_device(tt_metal::IDevice* device); const std::unordered_set& get_worker_thread_ids() const; tt::tt_fabric::ControlPlane* get_control_plane() const; @@ -69,11 +69,11 @@ class DevicePool { bool using_fast_dispatch; std::mutex lock; // TODO replace std::vector> with stl::SlotMap when removing v0 - std::vector> devices; + std::vector> devices; // Used to track worker thread handles (1 worker thread created per device) // when we need to check if a call is made from an application thread or a // worker thread - std::unordered_map device_to_worker_thread_id; + std::unordered_map device_to_worker_thread_id; std::unordered_set worker_thread_ids; std::thread::id device_pool_creation_thread_id; bool skip_remote_devices; @@ -87,10 +87,10 @@ class DevicePool { void init_firmware_on_active_devices() const; void init_profiler_devices() const; void activate_device(chip_id_t id); - void initialize_device(IDevice* dev) const; + void initialize_device(tt_metal::IDevice* dev) const; void add_devices_to_pool(const std::vector& device_ids); void wait_for_fabric_master_router_sync() const; - IDevice* get_device(chip_id_t id) const; + tt_metal::IDevice* get_device(chip_id_t id) const; // Fabric setup helper functions void initialize_control_plane(); diff --git a/tt_metal/impl/dispatch/kernel_config/dispatch.hpp b/tt_metal/impl/dispatch/kernel_config/dispatch.hpp index 00195dae6e8..31884b8159e 100644 --- a/tt_metal/impl/dispatch/kernel_config/dispatch.hpp +++ b/tt_metal/impl/dispatch/kernel_config/dispatch.hpp @@ -65,8 +65,9 @@ class DispatchKernel : public FDKernel { bool h_variant, bool d_variant) : FDKernel(node_id, device_id, servicing_device_id, cq_id, noc_selection) { + auto& core_manager = 
tt::tt_metal::dispatch_core_manager::instance(); // Not thread safe TT_FATAL( - noc_selection.downstream_noc == dispatch_downstream_noc, + noc_selection.downstream_noc == tt::tt_metal::dispatch_downstream_noc, "Invalid downstream NOC specified for Dispatcher kernel"); TT_FATAL( noc_selection.upstream_noc != noc_selection.downstream_noc, @@ -75,13 +76,12 @@ class DispatchKernel : public FDKernel { static_config_.is_d_variant = d_variant; uint16_t channel = tt::Cluster::instance().get_assigned_channel_for_device(device_id); if (h_variant && d_variant) { - this->logical_core_ = dispatch_core_manager::instance().dispatcher_core(device_id, channel, cq_id); + this->logical_core_ = core_manager.dispatcher_core(device_id, channel, cq_id); } else if (h_variant) { channel = tt::Cluster::instance().get_assigned_channel_for_device(servicing_device_id); - this->logical_core_ = - dispatch_core_manager::instance().dispatcher_core(servicing_device_id, channel, cq_id); + this->logical_core_ = core_manager.dispatcher_core(servicing_device_id, channel, cq_id); } else if (d_variant) { - this->logical_core_ = dispatch_core_manager::instance().dispatcher_d_core(device_id, channel, cq_id); + this->logical_core_ = core_manager.dispatcher_d_core(device_id, channel, cq_id); } } void CreateKernel() override; diff --git a/tt_metal/impl/dispatch/kernel_config/dispatch_s.hpp b/tt_metal/impl/dispatch/kernel_config/dispatch_s.hpp index ba1760f53f8..db3c525e2ed 100644 --- a/tt_metal/impl/dispatch/kernel_config/dispatch_s.hpp +++ b/tt_metal/impl/dispatch/kernel_config/dispatch_s.hpp @@ -31,7 +31,8 @@ class DispatchSKernel : public FDKernel { int node_id, chip_id_t device_id, chip_id_t servicing_device_id, uint8_t cq_id, noc_selection_t noc_selection) : FDKernel(node_id, device_id, servicing_device_id, cq_id, noc_selection) { uint16_t channel = tt::Cluster::instance().get_assigned_channel_for_device(device_id); - this->logical_core_ = dispatch_core_manager::instance().dispatcher_s_core(device_id, 
channel, cq_id_); + this->logical_core_ = + tt::tt_metal::dispatch_core_manager::instance().dispatcher_s_core(device_id, channel, cq_id_); } void CreateKernel() override; void GenerateStaticConfigs() override; diff --git a/tt_metal/impl/dispatch/kernel_config/fd_kernel.hpp b/tt_metal/impl/dispatch/kernel_config/fd_kernel.hpp index d60d15c991b..7b3c2ae6abe 100644 --- a/tt_metal/impl/dispatch/kernel_config/fd_kernel.hpp +++ b/tt_metal/impl/dispatch/kernel_config/fd_kernel.hpp @@ -12,9 +12,9 @@ #define UNUSED_SEM_ID 0 typedef struct { - NOC non_dispatch_noc; // For communicating with workers/DRAM/host - NOC upstream_noc; // For communicating with upstream dispatch modules - NOC downstream_noc; // For communicating with downstream dispatch modules + tt::tt_metal::NOC non_dispatch_noc; // For communicating with workers/DRAM/host + tt::tt_metal::NOC upstream_noc; // For communicating with upstream dispatch modules + tt::tt_metal::NOC downstream_noc; // For communicating with downstream dispatch modules } noc_selection_t; static std::vector dispatch_kernel_file_names = { diff --git a/tt_metal/impl/dispatch/kernel_config/prefetch.hpp b/tt_metal/impl/dispatch/kernel_config/prefetch.hpp index a029049928e..3ba0a426564 100644 --- a/tt_metal/impl/dispatch/kernel_config/prefetch.hpp +++ b/tt_metal/impl/dispatch/kernel_config/prefetch.hpp @@ -64,20 +64,20 @@ class PrefetchKernel : public FDKernel { bool h_variant, bool d_variant) : FDKernel(node_id, device_id, servicing_device_id, cq_id, noc_selection) { + auto& core_manager = tt::tt_metal::dispatch_core_manager::instance(); // Not thread safe static_config_.is_h_variant = h_variant; static_config_.is_d_variant = d_variant; uint16_t channel = tt::Cluster::instance().get_assigned_channel_for_device(device_id); TT_FATAL( - noc_selection.downstream_noc == dispatch_downstream_noc, + noc_selection.downstream_noc == tt::tt_metal::dispatch_downstream_noc, "Invalid downstream NOC specified for Prefetcher kernel"); if (h_variant && 
d_variant) { - this->logical_core_ = dispatch_core_manager::instance().prefetcher_core(device_id, channel, cq_id); + this->logical_core_ = core_manager.prefetcher_core(device_id, channel, cq_id); } else if (h_variant) { channel = tt::Cluster::instance().get_assigned_channel_for_device(servicing_device_id); - this->logical_core_ = - dispatch_core_manager::instance().prefetcher_core(servicing_device_id, channel, cq_id); + this->logical_core_ = core_manager.prefetcher_core(servicing_device_id, channel, cq_id); } else if (d_variant) { - this->logical_core_ = dispatch_core_manager::instance().prefetcher_d_core(device_id, channel, cq_id); + this->logical_core_ = core_manager.prefetcher_d_core(device_id, channel, cq_id); } } void CreateKernel() override; diff --git a/tt_metal/impl/lightmetal/lightmetal_replay.hpp b/tt_metal/impl/lightmetal/lightmetal_replay.hpp index 87e94bb2c86..88294287721 100644 --- a/tt_metal/impl/lightmetal/lightmetal_replay.hpp +++ b/tt_metal/impl/lightmetal/lightmetal_replay.hpp @@ -46,7 +46,7 @@ struct LightMetalBinary; using FlatbufferRuntimeArgVector = const flatbuffers::Vector>*; -using RuntimeArgs = std::vector>; +using RuntimeArgs = std::vector>; namespace tt::tt_metal { inline namespace v0 { diff --git a/tt_metal/programming_examples/distributed/1_distributed_program_dispatch/distributed_program_dispatch.cpp b/tt_metal/programming_examples/distributed/1_distributed_program_dispatch/distributed_program_dispatch.cpp index e13cbe73ef6..4f94d971210 100644 --- a/tt_metal/programming_examples/distributed/1_distributed_program_dispatch/distributed_program_dispatch.cpp +++ b/tt_metal/programming_examples/distributed/1_distributed_program_dispatch/distributed_program_dispatch.cpp @@ -8,6 +8,7 @@ // Stand-alone example demonstrating usage of native multi-device TT-Metalium APIs // for issuing a program dispatch across a mesh of devices. 
int main(int argc, char** argv) { + using namespace tt::tt_metal; using namespace tt::tt_metal::distributed; auto mesh_device = MeshDevice::create(MeshDeviceConfig{.mesh_shape = MeshShape(2, 4)}); diff --git a/tt_metal/programming_examples/distributed/2_distributed_buffer_rw/distributed_buffer_rw.cpp b/tt_metal/programming_examples/distributed/2_distributed_buffer_rw/distributed_buffer_rw.cpp index 7678985f273..b23832c4181 100644 --- a/tt_metal/programming_examples/distributed/2_distributed_buffer_rw/distributed_buffer_rw.cpp +++ b/tt_metal/programming_examples/distributed/2_distributed_buffer_rw/distributed_buffer_rw.cpp @@ -16,6 +16,7 @@ // 3. Enqueue a Read command to the MeshBuffer and read back the data to a local buffer // 4. Verify that the data read back matches the original data int main(int argc, char** argv) { + using namespace tt::tt_metal; using namespace tt::tt_metal::distributed; using tt::tt_metal::distributed::ShardedBufferConfig; diff --git a/tt_metal/programming_examples/hello_world_datamovement_kernel/hello_world_datamovement_kernel.cpp b/tt_metal/programming_examples/hello_world_datamovement_kernel/hello_world_datamovement_kernel.cpp index 82f82e1d623..de7abf946c1 100644 --- a/tt_metal/programming_examples/hello_world_datamovement_kernel/hello_world_datamovement_kernel.cpp +++ b/tt_metal/programming_examples/hello_world_datamovement_kernel/hello_world_datamovement_kernel.cpp @@ -5,10 +5,10 @@ #include #include -using namespace tt; -using namespace tt::tt_metal; - int main(int argc, char** argv) { + using namespace tt; + using namespace tt::tt_metal; + // Initialize Program and Device constexpr CoreCoord core = {0, 0}; diff --git a/tt_metal/tools/lightmetal_runner/lightmetal_runner.cpp b/tt_metal/tools/lightmetal_runner/lightmetal_runner.cpp index c65b3a912d6..81c8600a1ae 100644 --- a/tt_metal/tools/lightmetal_runner/lightmetal_runner.cpp +++ b/tt_metal/tools/lightmetal_runner/lightmetal_runner.cpp @@ -32,7 +32,7 @@ int main(int argc, char* 
argv[]) { std::string binary_filename = argv[1]; // Read the Light Metal Binary file into blob, transfer ownership and execute it. - LightMetalBinary binary = LightMetalBinary::load_from_file(binary_filename); + auto binary = tt::tt_metal::LightMetalBinary::load_from_file(binary_filename); tt::tt_metal::LightMetalReplay lm_replay(std::move(binary)); if (!lm_replay.execute_binary()) { diff --git a/ttnn/cpp/pybind11/core.hpp b/ttnn/cpp/pybind11/core.hpp index fe8c5b28a21..479076202a8 100644 --- a/ttnn/cpp/pybind11/core.hpp +++ b/ttnn/cpp/pybind11/core.hpp @@ -19,6 +19,9 @@ namespace core { void py_module_types(py::module& module) { py::class_(module, "Config"); } void py_module(py::module& module) { + using tt::tt_metal::LightMetalBeginCapture; + using tt::tt_metal::LightMetalBinary; + using tt::tt_metal::LightMetalEndCapture; auto py_config = static_cast>(module.attr("Config")); py_config.def(py::init()).def("__repr__", [](const ttnn::Config& config) { return fmt::format("{}", config); diff --git a/ttnn/cpp/ttnn/async_runtime.hpp b/ttnn/cpp/ttnn/async_runtime.hpp index f7647b28fcf..1811396b7aa 100644 --- a/ttnn/cpp/ttnn/async_runtime.hpp +++ b/ttnn/cpp/ttnn/async_runtime.hpp @@ -15,23 +15,23 @@ void write_buffer( QueueId cq_id, Tensor& dst, std::vector> src, - const std::optional& region = std::nullopt); + const std::optional& region = std::nullopt); void read_buffer( QueueId cq_id, Tensor& src, std::vector> dst, - const std::optional& region = std::nullopt, + const std::optional& region = std::nullopt, size_t src_offset = 0, bool blocking = true); -void queue_synchronize(CommandQueue& cq); +void queue_synchronize(tt::tt_metal::CommandQueue& cq); -void event_synchronize(const std::shared_ptr& event); +void event_synchronize(const std::shared_ptr& event); -bool event_query(const std::shared_ptr& event); +bool event_query(const std::shared_ptr& event); -void wait_for_event(CommandQueue& cq, const std::shared_ptr& event); +void 
wait_for_event(tt::tt_metal::CommandQueue& cq, const std::shared_ptr& event); -void record_event(CommandQueue& cq, const std::shared_ptr& event); +void record_event(tt::tt_metal::CommandQueue& cq, const std::shared_ptr& event); } // namespace ttnn diff --git a/ttnn/cpp/ttnn/device_operation.hpp b/ttnn/cpp/ttnn/device_operation.hpp index 3e67bc6e5cf..4f1676e0fb6 100644 --- a/ttnn/cpp/ttnn/device_operation.hpp +++ b/ttnn/cpp/ttnn/device_operation.hpp @@ -362,7 +362,7 @@ template typename device_operation_t::tensor_args_t get_shard_tensor_args(std::size_t index, auto device, const typename device_operation_t::tensor_args_t& tensor_args) { auto get_shard = [device](const auto& tensor) { auto& storage = std::get(tensor.get_storage()); - return Tensor{DeviceStorage{storage.get_buffer_for_device(device)}, storage.get_tensor_spec_for_device(device)}; + return Tensor{tt::tt_metal::DeviceStorage{storage.get_buffer_for_device(device)}, storage.get_tensor_spec_for_device(device)}; }; return tt::stl::reflection::transform_object_of_type(get_shard, tensor_args); } diff --git a/ttnn/cpp/ttnn/distributed/api.hpp b/ttnn/cpp/ttnn/distributed/api.hpp index 4ecf4807734..7ab064dae99 100644 --- a/ttnn/cpp/ttnn/distributed/api.hpp +++ b/ttnn/cpp/ttnn/distributed/api.hpp @@ -33,7 +33,7 @@ Tensor aggregate_as_tensor( std::vector get_t3k_physical_device_ids_ring(); // Maps a tensor to the set of devices in the device-mesh that the shards will be distributed across. -std::vector get_mapped_devices(const Tensor& tensor, MeshDevice& mesh_device); +std::vector get_mapped_devices(const Tensor& tensor, MeshDevice& mesh_device); // Get the distributed tensor config from a tensor. 
tt::tt_metal::DistributedTensorConfig get_distributed_tensor_config_from_tensor(const Tensor& tensor); diff --git a/ttnn/cpp/ttnn/distributed/distributed_tensor.cpp b/ttnn/cpp/ttnn/distributed/distributed_tensor.cpp index 18995b49ed0..af3cf6d1fbf 100644 --- a/ttnn/cpp/ttnn/distributed/distributed_tensor.cpp +++ b/ttnn/cpp/ttnn/distributed/distributed_tensor.cpp @@ -26,7 +26,7 @@ class ReplicateTensorToMesh : public TensorToMesh { } tt::tt_metal::DistributedTensorConfig config() const override { - return tt::tt_metal::DistributedTensorConfig{ReplicateTensor{num_devices_}}; + return tt::tt_metal::DistributedTensorConfig{tt::tt_metal::ReplicateTensor{num_devices_}}; } private: @@ -42,7 +42,7 @@ class ShardTensorToMesh : public TensorToMesh { } tt::tt_metal::DistributedTensorConfig config() const override { - return tt::tt_metal::DistributedTensorConfig{ShardTensor{shard_dim_}}; + return tt::tt_metal::DistributedTensorConfig{tt::tt_metal::ShardTensor{shard_dim_}}; } private: @@ -98,7 +98,8 @@ class ShardTensorTo2dMesh : public TensorToMesh { } tt::tt_metal::DistributedTensorConfig config() const override { - return DistributedTensorConfig{ShardTensor2D{ShardMesh{mesh_rows_, mesh_cols_}}}; + return tt::tt_metal::DistributedTensorConfig{ + tt::tt_metal::ShardTensor2D{tt::tt_metal::ShardMesh{mesh_rows_, mesh_cols_}}}; } private: diff --git a/ttnn/cpp/ttnn/events.cpp b/ttnn/cpp/ttnn/events.cpp index 525c353941b..4a327bf052d 100644 --- a/ttnn/cpp/ttnn/events.cpp +++ b/ttnn/cpp/ttnn/events.cpp @@ -19,21 +19,24 @@ using ::tt::tt_metal::EnqueueWaitForEvent; using ::tt::tt_metal::distributed::EnqueueRecordEventToHost; using ::tt::tt_metal::distributed::EnqueueWaitForEvent; -std::shared_ptr create_event(IDevice* device) { - std::shared_ptr event = std::make_shared(); +std::shared_ptr create_event(tt::tt_metal::IDevice* device) { + std::shared_ptr event = std::make_shared(); event->device = device; return event; } -void record_event(QueueId cq_id, const std::shared_ptr& event, 
const std::vector& sub_device_ids) { - IDevice* device = event->device; +void record_event( + QueueId cq_id, + const std::shared_ptr& event, + const std::vector& sub_device_ids) { + tt::tt_metal::IDevice* device = event->device; device->push_work([device, event, cq_id, sub_device_ids] { EnqueueRecordEvent(device->command_queue(*cq_id), event, sub_device_ids); }); } -void wait_for_event(QueueId cq_id, const std::shared_ptr& event) { - IDevice* device = event->device; +void wait_for_event(QueueId cq_id, const std::shared_ptr& event) { + tt::tt_metal::IDevice* device = event->device; device->push_work([device, event, cq_id] { EnqueueWaitForEvent(device->command_queue(*cq_id), event); }); } @@ -48,7 +51,9 @@ MultiDeviceEvent create_event(MeshDevice* mesh_device) { } void record_event( - QueueId cq_id, const MultiDeviceEvent& multi_device_event, const std::vector& sub_device_ids) { + QueueId cq_id, + const MultiDeviceEvent& multi_device_event, + const std::vector& sub_device_ids) { for (auto& event : multi_device_event.events) { record_event(cq_id, event, sub_device_ids); } @@ -63,7 +68,7 @@ void wait_for_event(QueueId cq_id, const MultiDeviceEvent& multi_device_event) { MeshEvent record_mesh_event( MeshDevice* mesh_device, QueueId cq_id, - const std::vector& sub_device_ids, + const std::vector& sub_device_ids, const std::optional& device_range) { return EnqueueRecordEventToHost(mesh_device->mesh_command_queue(*cq_id), sub_device_ids, device_range); } diff --git a/ttnn/cpp/ttnn/events.hpp b/ttnn/cpp/ttnn/events.hpp index cb20d24e78a..edf7d3d7a9b 100644 --- a/ttnn/cpp/ttnn/events.hpp +++ b/ttnn/cpp/ttnn/events.hpp @@ -20,16 +20,16 @@ using MeshEvent = tt::tt_metal::distributed::MeshEvent; namespace events { // Single Device APIs -std::shared_ptr create_event(IDevice* device); +std::shared_ptr create_event(tt::tt_metal::IDevice* device); void record_event( QueueId cq_id, - const std::shared_ptr& event, + const std::shared_ptr& event, const std::vector& sub_device_ids = 
{}); -void wait_for_event(QueueId cq_id, const std::shared_ptr& event); +void wait_for_event(QueueId cq_id, const std::shared_ptr& event); // Multi Device APIs struct MultiDeviceEvent { - std::vector> events; + std::vector> events; }; MultiDeviceEvent create_event(MeshDevice* mesh_device); void record_event( diff --git a/ttnn/cpp/ttnn/global_semaphore.cpp b/ttnn/cpp/ttnn/global_semaphore.cpp index 67471dec341..4b462b60646 100644 --- a/ttnn/cpp/ttnn/global_semaphore.cpp +++ b/ttnn/cpp/ttnn/global_semaphore.cpp @@ -4,7 +4,6 @@ #include "global_semaphore.hpp" -#include #include #include #include @@ -65,7 +64,7 @@ MultiDeviceGlobalSemaphore create_global_semaphore_with_same_address( if (!all_same) { tt::log_debug("chkpt 1, attempts: {}", attempts); - DeviceAddr target_addr = get_global_semaphore_address(global_semaphores.front()); + tt::tt_metal::DeviceAddr target_addr = get_global_semaphore_address(global_semaphores.front()); for (auto i = 1; i < global_semaphores.size(); i++) { tt::log_debug( "chkpt 1.1, i: {}, global_semaphores[i]->address(): {}", diff --git a/ttnn/cpp/ttnn/operations/bernoulli/device/bernoulli_device_operation.cpp b/ttnn/cpp/ttnn/operations/bernoulli/device/bernoulli_device_operation.cpp index 0326eaf9226..aa2e77ac27d 100644 --- a/ttnn/cpp/ttnn/operations/bernoulli/device/bernoulli_device_operation.cpp +++ b/ttnn/cpp/ttnn/operations/bernoulli/device/bernoulli_device_operation.cpp @@ -54,7 +54,7 @@ BernoulliDeviceOperation::spec_return_value_t BernoulliDeviceOperation::compute_ } auto output_shape = tensor_args.input.get_logical_shape(); - return TensorSpec(output_shape, TensorLayout(operation_attributes.dtype, PageConfig(Layout::TILE), operation_attributes.memory_config)); + return TensorSpec(output_shape, tt::tt_metal::TensorLayout(operation_attributes.dtype, tt::tt_metal::PageConfig(Layout::TILE), operation_attributes.memory_config)); } BernoulliDeviceOperation::tensor_return_value_t BernoulliDeviceOperation::create_output_tensors( diff --git 
a/ttnn/cpp/ttnn/operations/bernoulli/device/bernoulli_device_operation.hpp b/ttnn/cpp/ttnn/operations/bernoulli/device/bernoulli_device_operation.hpp index e4f6cbb83cf..8996b8b7688 100644 --- a/ttnn/cpp/ttnn/operations/bernoulli/device/bernoulli_device_operation.hpp +++ b/ttnn/cpp/ttnn/operations/bernoulli/device/bernoulli_device_operation.hpp @@ -27,9 +27,9 @@ struct BernoulliDeviceOperation { struct ProgramFactory { struct shared_variables_t { - KernelHandle reader_kernel_id; - KernelHandle compute_kernel_id; - KernelHandle writer_kernel_id; + tt::tt_metal::KernelHandle reader_kernel_id; + tt::tt_metal::KernelHandle compute_kernel_id; + tt::tt_metal::KernelHandle writer_kernel_id; std::vector cores; }; diff --git a/ttnn/cpp/ttnn/operations/ccl/all_gather/device/all_gather_op.cpp b/ttnn/cpp/ttnn/operations/ccl/all_gather/device/all_gather_op.cpp index ae1939e7ae7..14747bb5ef8 100644 --- a/ttnn/cpp/ttnn/operations/ccl/all_gather/device/all_gather_op.cpp +++ b/ttnn/cpp/ttnn/operations/ccl/all_gather/device/all_gather_op.cpp @@ -195,20 +195,22 @@ std::vector AllGather::compute_output_specs(const std::vector< const auto& input_tensor = input_tensors[0]; TensorSpec spec( output_shape, - TensorLayout(input_tensor.get_dtype(), input_tensor.get_tensor_spec().page_config(), output_mem_config)); + tt::tt_metal::TensorLayout( + input_tensor.get_dtype(), input_tensor.get_tensor_spec().page_config(), output_mem_config)); if (this->output_mem_config.is_sharded()) { return {TensorSpec( output_shape, - TensorLayout(input_tensor.get_dtype(), input_tensor.get_tensor_spec().page_config(), output_mem_config))}; + tt::tt_metal::TensorLayout( + input_tensor.get_dtype(), input_tensor.get_tensor_spec().page_config(), output_mem_config))}; } return std::vector(input_tensors.size(), spec); } std::vector AllGather::create_output_tensors(const std::vector& input_tensors) const { - return operation::default_create_output_tensors(*this, input_tensors, {}); + return 
tt::tt_metal::operation::default_create_output_tensors(*this, input_tensors, {}); } -operation::ProgramWithCallbacks AllGather::create_program( +tt::tt_metal::operation::ProgramWithCallbacks AllGather::create_program( const std::vector& input_tensors, std::vector& output_tensors) const { return all_gather_multi_core_with_workers( input_tensors[0], @@ -257,8 +259,8 @@ Tensor all_gather( rank - 1, dim); - std::vector output_tensors = {Tensor(operation::get_workers_for_op_output({input_tensor}))}; - operation::launch_op( + std::vector output_tensors = {Tensor(tt::tt_metal::operation::get_workers_for_op_output({input_tensor}))}; + tt::tt_metal::operation::launch_op( [gather_dim, num_links, dim, @@ -293,7 +295,7 @@ Tensor all_gather( } } - auto output_tensor = operation::run( + auto output_tensor = tt::tt_metal::operation::run( ttnn::ccl::all_gather_detail::create_all_gather_struct( input_tensor, gather_dim, @@ -344,9 +346,9 @@ Tensor all_gather( rank - 1, dim); - std::vector output_tensors = {Tensor(operation::get_workers_for_op_output({input_tensor}))}; + std::vector output_tensors = {Tensor(tt::tt_metal::operation::get_workers_for_op_output({input_tensor}))}; - operation::launch_op( + tt::tt_metal::operation::launch_op( [gather_dim, num_links, memory_config, @@ -387,7 +389,7 @@ Tensor all_gather( ? 
std::nullopt : get_chip_id(device_index + num_devices - 1); - return operation::run( + return tt::tt_metal::operation::run( ttnn::AllGather{ gather_dim, num_links, diff --git a/ttnn/cpp/ttnn/operations/ccl/all_gather/device/all_gather_op.hpp b/ttnn/cpp/ttnn/operations/ccl/all_gather/device/all_gather_op.hpp index afd36f9a1f3..c6eebe6cfe9 100644 --- a/ttnn/cpp/ttnn/operations/ccl/all_gather/device/all_gather_op.hpp +++ b/ttnn/cpp/ttnn/operations/ccl/all_gather/device/all_gather_op.hpp @@ -133,7 +133,7 @@ struct AllGather { void validate(const std::vector &input_tensors) const; std::vector compute_output_specs(const std::vector &input_tensors) const; std::vector create_output_tensors(const std::vector &input_tensors) const; - operation::ProgramWithCallbacks create_program(const std::vector& input_tensors, std::vector &output_tensors) const; + tt::tt_metal::operation::ProgramWithCallbacks create_program(const std::vector& input_tensors, std::vector &output_tensors) const; }; namespace ccl{ @@ -152,7 +152,7 @@ AllGather create_all_gather_struct( } // namespace ccl // All Gather Variants -operation::ProgramWithCallbacks all_gather_full_shard_grid( +tt::tt_metal::operation::ProgramWithCallbacks all_gather_full_shard_grid( const Tensor& input_tensor, Tensor& output_tensor, const uint32_t dim, @@ -164,7 +164,7 @@ operation::ProgramWithCallbacks all_gather_full_shard_grid( const std::optional receiver_device_id, const std::optional sender_device_id, ccl::Topology topology); -operation::ProgramWithCallbacks all_gather_multi_core_with_workers( +tt::tt_metal::operation::ProgramWithCallbacks all_gather_multi_core_with_workers( const Tensor& input_tensor, Tensor& output_tensor, const uint32_t dim, @@ -176,7 +176,7 @@ operation::ProgramWithCallbacks all_gather_multi_core_with_workers( ccl::Topology topology, const std::optional user_defined_num_workers, const std::optional user_defined_num_buffers_per_channel); -operation::ProgramWithCallbacks 
all_gather_multi_core_with_workers_helper( +tt::tt_metal::operation::ProgramWithCallbacks all_gather_multi_core_with_workers_helper( tt::tt_metal::Program& program, const Tensor& input_tensor, Tensor& output_tensor, diff --git a/ttnn/cpp/ttnn/operations/ccl/all_gather/device/multi_core/all_gather_op_multi_core.cpp b/ttnn/cpp/ttnn/operations/ccl/all_gather/device/multi_core/all_gather_op_multi_core.cpp index a31309388e3..953e0f62cee 100644 --- a/ttnn/cpp/ttnn/operations/ccl/all_gather/device/multi_core/all_gather_op_multi_core.cpp +++ b/ttnn/cpp/ttnn/operations/ccl/all_gather/device/multi_core/all_gather_op_multi_core.cpp @@ -240,7 +240,7 @@ static void log_sharded_tensor_kernel_args(const Tensor& tensor, const std::stri // For ring all-gather, we can send sub-sections of input tensor in opposite directions // For linear all-gather though, we must ensure we send full tensors in BOTH directions // (in other words, disable the "bidirectional" send flag) -operation::ProgramWithCallbacks all_gather_multi_core_with_workers( +tt::tt_metal::operation::ProgramWithCallbacks all_gather_multi_core_with_workers( const Tensor& input_tensor, Tensor& output_tensor, const uint32_t dim, @@ -252,7 +252,7 @@ operation::ProgramWithCallbacks all_gather_multi_core_with_workers( ccl::Topology topology, const std::optional user_defined_num_workers, const std::optional user_defined_num_buffers_per_channel) { - tt::tt_metal::Program program{}; + Program program{}; std::optional empty_fused_op_signaler; return all_gather_multi_core_with_workers_helper( program, @@ -270,8 +270,8 @@ operation::ProgramWithCallbacks all_gather_multi_core_with_workers( empty_fused_op_signaler); } -operation::ProgramWithCallbacks all_gather_multi_core_with_workers_helper( - tt::tt_metal::Program& program, +tt::tt_metal::operation::ProgramWithCallbacks all_gather_multi_core_with_workers_helper( + Program& program, const Tensor& input_tensor, Tensor& output_tensor, const uint32_t dim, @@ -461,11 +461,12 @@ 
operation::ProgramWithCallbacks all_gather_multi_core_with_workers_helper( const uint32_t cb_size_in_pages = cb_n_packets * max_pages_per_chunk; const uint32_t CB_buffer_size = cb_n_packets * max_buffer_per_chunk; log_trace(tt::LogOp, "max_pages_per_chunk: {}", max_pages_per_chunk); - CircularBufferConfig cb_src0_config = CircularBufferConfig(CB_buffer_size, {{src0_cb_index, df}}) - .set_page_size(src0_cb_index, input_page_size) - .set_tile_dims(src0_cb_index, input_tensor_config->get_tile()); - CBHandle cb_src0_sender_workers = CreateCircularBuffer(program, all_sender_workers, cb_src0_config); - CBHandle cb_src0_receiver_workers = CreateCircularBuffer(program, all_receiver_workers, cb_src0_config); + auto cb_src0_config = tt::tt_metal::CircularBufferConfig(CB_buffer_size, {{src0_cb_index, df}}) + .set_page_size(src0_cb_index, input_page_size) + .set_tile_dims(src0_cb_index, input_tensor_config->get_tile()); + tt::tt_metal::CBHandle cb_src0_sender_workers = CreateCircularBuffer(program, all_sender_workers, cb_src0_config); + tt::tt_metal::CBHandle cb_src0_receiver_workers = + CreateCircularBuffer(program, all_receiver_workers, cb_src0_config); // This semaphore is used by the receiver core to tell workers that data is available to read auto receiver_worker_semaphore_id = CreateSemaphore(program, all_receiver_workers, 0); @@ -517,7 +518,7 @@ operation::ProgramWithCallbacks all_gather_multi_core_with_workers_helper( std::string const& send_reader_kernel_path = "ttnn/cpp/ttnn/operations/ccl/all_gather/device/kernels/dataflow/" "worker_interleaved_ring_gather_send_reader.cpp"; - KernelHandle worker_sender_reader_kernel_id = tt::tt_metal::CreateKernel( + tt::tt_metal::KernelHandle worker_sender_reader_kernel_id = tt::tt_metal::CreateKernel( program, send_reader_kernel_path, all_sender_workers, @@ -559,7 +560,7 @@ operation::ProgramWithCallbacks all_gather_multi_core_with_workers_helper( std::string const& sender_writer_kernel_path = 
"ttnn/cpp/ttnn/operations/ccl/all_gather/device/kernels/dataflow/" "worker_interleaved_ring_gather_send_writer.cpp"; - KernelHandle worker_sender_writer_kernel_id = tt::tt_metal::CreateKernel( + tt::tt_metal::KernelHandle worker_sender_writer_kernel_id = tt::tt_metal::CreateKernel( program, sender_writer_kernel_path, all_sender_workers, @@ -584,7 +585,7 @@ operation::ProgramWithCallbacks all_gather_multi_core_with_workers_helper( std::string const& receiver_reader_kernel_path = "ttnn/cpp/ttnn/operations/ccl/all_gather/device/kernels/dataflow/" "worker_interleaved_ring_gather_receive_reader.cpp"; - KernelHandle worker_receiver_reader_kernel_id = tt::tt_metal::CreateKernel( + tt::tt_metal::KernelHandle worker_receiver_reader_kernel_id = tt::tt_metal::CreateKernel( program, receiver_reader_kernel_path, all_receiver_workers, @@ -626,7 +627,7 @@ operation::ProgramWithCallbacks all_gather_multi_core_with_workers_helper( std::string const& receiver_writer_kernel_path = "ttnn/cpp/ttnn/operations/ccl/all_gather/device/kernels/dataflow/" "worker_interleaved_ring_gather_receive_writer.cpp"; - KernelHandle worker_receiver_writer_kernel_id = tt::tt_metal::CreateKernel( + tt::tt_metal::KernelHandle worker_receiver_writer_kernel_id = tt::tt_metal::CreateKernel( program, receiver_writer_kernel_path, all_receiver_workers, diff --git a/ttnn/cpp/ttnn/operations/ccl/barrier/device/barrier_op.cpp b/ttnn/cpp/ttnn/operations/ccl/barrier/device/barrier_op.cpp index ecb8b0d7733..d7d41b6c9ea 100644 --- a/ttnn/cpp/ttnn/operations/ccl/barrier/device/barrier_op.cpp +++ b/ttnn/cpp/ttnn/operations/ccl/barrier/device/barrier_op.cpp @@ -28,7 +28,7 @@ std::vector Barrier::create_output_tensors(const std::vector& in return input_tensors; } -operation::ProgramWithCallbacks Barrier::create_program( +tt::tt_metal::operation::ProgramWithCallbacks Barrier::create_program( const std::vector& input_tensors, std::vector& output_tensors) const { return ccl::barrier::detail::barrier_with_workers( 
input_tensors.at(0), @@ -72,8 +72,8 @@ void Barrier::update_structure(const Tensor& input_tensor) { namespace operations::ccl { Tensor barrier_function(const Tensor& input_tensor, const ttnn::Barrier& barrier_struct) { - std::vector output_tensors = {Tensor(operation::get_workers_for_op_output({input_tensor}))}; - operation::launch_op( + std::vector output_tensors = {Tensor(tt::tt_metal::operation::get_workers_for_op_output({input_tensor}))}; + tt::tt_metal::operation::launch_op( [barrier_struct]( const std::vector& input_tensors, const std::vector>& optional_input_tensors, @@ -82,7 +82,7 @@ Tensor barrier_function(const Tensor& input_tensor, const ttnn::Barrier& barrier // need to copy and update barrier struct for this particular tensor ttnn::Barrier new_barrier_struct = barrier_struct; new_barrier_struct.update_structure(input_tensor); - return operation::run(new_barrier_struct, {input_tensor}); + return tt::tt_metal::operation::run(new_barrier_struct, {input_tensor}); }, {input_tensor}, output_tensors); diff --git a/ttnn/cpp/ttnn/operations/ccl/barrier/device/barrier_op.hpp b/ttnn/cpp/ttnn/operations/ccl/barrier/device/barrier_op.hpp index 57fb387d595..e50deb6c9f1 100644 --- a/ttnn/cpp/ttnn/operations/ccl/barrier/device/barrier_op.hpp +++ b/ttnn/cpp/ttnn/operations/ccl/barrier/device/barrier_op.hpp @@ -25,14 +25,14 @@ struct Barrier { void validate(const std::vector& input_tensors) const; std::vector compute_output_specs(const std::vector& input_tensors) const; std::vector create_output_tensors(const std::vector& input_tensors) const; - operation::ProgramWithCallbacks create_program( + tt::tt_metal::operation::ProgramWithCallbacks create_program( const std::vector& input_tensors, std::vector& output_tensors) const; }; namespace ccl::barrier::detail { // Template for the barrier_with_workers function // Found in device/host/barrier_full_worker_grid.cpp -operation::ProgramWithCallbacks barrier_with_workers( +tt::tt_metal::operation::ProgramWithCallbacks 
barrier_with_workers( const Tensor& input_tensors, const Tensor& output_tensors, const bool is_starting_core, diff --git a/ttnn/cpp/ttnn/operations/ccl/ccl_common.cpp b/ttnn/cpp/ttnn/operations/ccl/ccl_common.cpp index faa87870ab8..10c7446022b 100644 --- a/ttnn/cpp/ttnn/operations/ccl/ccl_common.cpp +++ b/ttnn/cpp/ttnn/operations/ccl/ccl_common.cpp @@ -182,14 +182,14 @@ CclOpTensorConfig::CclOpTensorConfig(Tensor const& tensor) : this->page_size = this->tile.get_tile_size(this->df); this->tile_size = this->tile.get_tile_hw(); } else { - this->tile = Tile({32, 32}); + this->tile = tt::tt_metal::Tile({32, 32}); this->page_size = tensor.buffer()->page_size(); this->tile_size = 1024; } } uint32_t CclOpTensorConfig::get_page_size() const { return this->page_size; } uint32_t CclOpTensorConfig::get_tile_size() const { return this->tile_size; } -Tile CclOpTensorConfig::get_tile() const { return this->tile; } +tt::tt_metal::Tile CclOpTensorConfig::get_tile() const { return this->tile; } uint32_t CclOpTensorConfig::get_buffer_start_address() const { return this->buffer_start_address; } @@ -199,7 +199,7 @@ CclOpInterleavedTensorConfig::CclOpInterleavedTensorConfig(Tensor const& input_t CclOpShardedTensorConfig::CclOpShardedTensorConfig(Tensor const& tensor) : CclOpTensorConfig(tensor), shard_spec(tensor.shard_spec().value()) {} -ShardSpec const& CclOpShardedTensorConfig::get_shard_spec() const { return this->shard_spec; } +const tt::tt_metal::ShardSpec& CclOpShardedTensorConfig::get_shard_spec() const { return this->shard_spec; } std::unique_ptr CclOpTensorConfig::build_all_gather_tensor_config(Tensor const& tensor) { if (tensor.is_sharded()) { @@ -210,11 +210,11 @@ std::unique_ptr CclOpTensorConfig::build_all_gather_tensor_co } void generate_edm_kernels_for_ring_or_linear_topology( - tt::tt_metal::Program& program, - IDevice const* device, - RingTopology const& topology_config, - std::vector const& clockwise_edm_builders, - std::vector const& counter_clockwise_edm_builders, 
+ Program& program, + const IDevice* device, + const RingTopology& topology_config, + const std::vector& clockwise_edm_builders, + const std::vector& counter_clockwise_edm_builders, std::optional receiver_device_id, std::optional sender_device_id) { auto sender_noc = tt::tt_metal::detail::GetPreferredNOCForDRAMRead(hal::get_arch()); @@ -263,13 +263,13 @@ void generate_edm_kernels_for_ring_or_linear_topology( } template -KernelHandle generate_edm_kernel_impl( - tt::tt_metal::Program& program, - IDevice const* device, - EDMBuilder const& edm_builder, - std::string const& kernel_path, - CoreCoord const& eth_core, - NOC noc_id, +tt::tt_metal::KernelHandle generate_edm_kernel_impl( + Program& program, + const IDevice* device, + const EDMBuilder& edm_builder, + const std::string& kernel_path, + const CoreCoord& eth_core, + tt::tt_metal::NOC noc_id, std::optional opt_level = std::nullopt) { edm_builder.dump_to_log(); @@ -304,12 +304,12 @@ KernelHandle generate_edm_kernel_impl( return eth_sender_kernel; } -KernelHandle generate_edm_kernel( - tt::tt_metal::Program& program, - IDevice const* device, - ccl::FabricEriscDatamoverBuilder const& edm_builder, - CoreCoord const& eth_core, - NOC noc_id) { +tt::tt_metal::KernelHandle generate_edm_kernel( + Program& program, + const IDevice* device, + const ccl::FabricEriscDatamoverBuilder& edm_builder, + const CoreCoord& eth_core, + tt::tt_metal::NOC noc_id) { return generate_edm_kernel_impl( program, device, @@ -320,12 +320,12 @@ KernelHandle generate_edm_kernel( tt::tt_metal::KernelBuildOptLevel::O3); } -KernelHandle generate_edm_kernel( - tt::tt_metal::Program& program, - IDevice const* device, - ccl::EriscDatamoverBuilder const& edm_builder, - CoreCoord const& eth_core, - NOC noc_id) { +tt::tt_metal::KernelHandle generate_edm_kernel( + Program& program, + const IDevice* device, + const ccl::EriscDatamoverBuilder& edm_builder, + const CoreCoord& eth_core, + tt::tt_metal::NOC noc_id) { return generate_edm_kernel_impl( program, 
device, edm_builder, "ttnn/cpp/ttnn/operations/ccl/kernels/edm/erisc_datamover.cpp", eth_core, noc_id); } diff --git a/ttnn/cpp/ttnn/operations/ccl/ccl_common.hpp b/ttnn/cpp/ttnn/operations/ccl/ccl_common.hpp index d523d336ecb..3998992908b 100644 --- a/ttnn/cpp/ttnn/operations/ccl/ccl_common.hpp +++ b/ttnn/cpp/ttnn/operations/ccl/ccl_common.hpp @@ -110,7 +110,7 @@ class CclOpTensorConfig { CclOpTensorConfig(Tensor const& tensor); uint32_t get_page_size() const; uint32_t get_tile_size() const; - Tile get_tile() const; + tt::tt_metal::Tile get_tile() const; uint32_t get_buffer_start_address() const; @@ -119,7 +119,7 @@ class CclOpTensorConfig { protected: uint32_t page_size; uint32_t tile_size; - Tile tile; + tt::tt_metal::Tile tile; uint32_t buffer_start_address; tt::DataFormat df; }; @@ -542,14 +542,14 @@ tt::tt_metal::KernelHandle generate_edm_kernel( tt::tt_metal::IDevice const* device, FabricEriscDatamoverBuilder const& edm_builder, CoreCoord const& eth_core, - NOC noc_id); + tt::tt_metal::NOC noc_id); tt::tt_metal::KernelHandle generate_edm_kernel( tt::tt_metal::Program& program, IDevice const* device, EriscDatamoverBuilder const& edm_builder, CoreCoord const& eth_core, - NOC noc_id); + tt::tt_metal:: NOC noc_id); void generate_edm_kernels_for_ring_or_linear_topology( tt::tt_metal::Program& program, diff --git a/ttnn/cpp/ttnn/operations/ccl/ccl_host_datastructures.hpp b/ttnn/cpp/ttnn/operations/ccl/ccl_host_datastructures.hpp index d098ea5ff3a..454246d28d7 100644 --- a/ttnn/cpp/ttnn/operations/ccl/ccl_host_datastructures.hpp +++ b/ttnn/cpp/ttnn/operations/ccl/ccl_host_datastructures.hpp @@ -49,7 +49,7 @@ struct CCLOpConfig { CCLOpConfig(std::vector& input_tensors, const std::vector& output_tensors, Topology topology); uint32_t get_page_size() const; - Tile get_tile() const; + tt::tt_metal::Tile get_tile() const; Topology get_topology() const; bool is_input_sharded() const; bool is_output_sharded() const; @@ -66,7 +66,7 @@ struct CCLOpConfig { bool 
output_sharded; bool is_row_major; tt::DataFormat df; - Tile tile; + tt::tt_metal::Tile tile; std::vector const* input_tensors; std::vector const* output_tensors; diff --git a/ttnn/cpp/ttnn/operations/ccl/common/host/ccl_worker_builder.cpp b/ttnn/cpp/ttnn/operations/ccl/common/host/ccl_worker_builder.cpp index 57eebb6f0d7..15cc0c4bb5a 100644 --- a/ttnn/cpp/ttnn/operations/ccl/common/host/ccl_worker_builder.cpp +++ b/ttnn/cpp/ttnn/operations/ccl/common/host/ccl_worker_builder.cpp @@ -798,12 +798,12 @@ void generate_ccl_cb_to_tensor_slice_sequence_commands( slices, ttnn::ccl::cmd::CclCommandCode::STREAM_CB_TO_TENSOR, args_out, dest_args); } -KernelHandle generate_multi_command_stream_kernel_ct_args( +tt::tt_metal::KernelHandle generate_multi_command_stream_kernel_ct_args( Program& program, std::vector const& cb_indices, // TODO: move to RT arg std::vector const& tensors, CoreRangeSet const& worker_core_range, - DataMovementConfig datamovement_kernel_config, + tt::tt_metal::DataMovementConfig datamovement_kernel_config, const size_t num_command_streams, std::optional my_chip_id) { TT_FATAL( @@ -843,7 +843,7 @@ KernelHandle generate_multi_command_stream_kernel_ct_args( // Set aside a buffer we can use for storing packet headers in (particularly for atomic incs) const auto reserved_packet_header_CB_index = - datamovement_kernel_config.processor == DataMovementProcessor::RISCV_0 ? tt::CB::c_in6 : tt::CB::c_in7; + datamovement_kernel_config.processor == tt::tt_metal::DataMovementProcessor::RISCV_0 ? 
tt::CB::c_in6 : tt::CB::c_in7; static constexpr auto num_packet_headers_storable = 8; static constexpr auto packet_header_size_bytes = sizeof(tt::fabric::PacketHeader); tt::tt_metal::CircularBufferConfig cb_config = @@ -1058,7 +1058,7 @@ std::vector generate_edm_connection_rt_args( void generate_multi_input_command_stream_kernel_rt_args( Program& program, - KernelHandle kernel_id, + tt::tt_metal::KernelHandle kernel_id, std::vector const& tensors, std::vector const& page_sizes, IDevice* device, @@ -1194,7 +1194,7 @@ void generate_multi_input_command_stream_kernel_rt_args( void generate_multi_command_stream_kernel_rt_args( Program& program, - KernelHandle kernel_id, + tt::tt_metal::KernelHandle kernel_id, std::vector const& cb_ids, std::vector const& tensors, IDevice* device, @@ -1304,7 +1304,7 @@ void generate_multi_command_stream_kernel_rt_args( } ttnn::ccl::cmd::CclHostLowLevelCommandSequence build_ccl_cmd_proc_teardown_commands( - tt::tt_metal::Program& program, + Program& program, IDevice* device, IDevice* forward_device, size_t line_size, @@ -1369,7 +1369,7 @@ ttnn::ccl::cmd::CclHostLowLevelCommandSequence build_ccl_cmd_proc_teardown_comma void build_sync_kernels( IDevice* device, - tt::tt_metal::Program& program, + Program& program, ccl::SyncModeSpec const& sync_details, bool terminate_fabric, ccl::EdmLineFabricOpInterface& fabric_interface) { diff --git a/ttnn/cpp/ttnn/operations/ccl/common/host/ccl_worker_builder.hpp b/ttnn/cpp/ttnn/operations/ccl/common/host/ccl_worker_builder.hpp index 23271b809b8..a1426370909 100644 --- a/ttnn/cpp/ttnn/operations/ccl/common/host/ccl_worker_builder.hpp +++ b/ttnn/cpp/ttnn/operations/ccl/common/host/ccl_worker_builder.hpp @@ -66,12 +66,12 @@ void generate_ccl_command_stream_to_kernel_args( * @return the runtime args */ std::vector generate_edm_connection_rt_args( - const ttnn::ccl::SenderWorkerAdapterSpec& connection_info, Program& program, CoreRangeSet worker_cores); + const ttnn::ccl::SenderWorkerAdapterSpec& 
connection_info, tt::tt_metal::Program& program, CoreRangeSet worker_cores); // TODO: eventually take a fabric handle void generate_multi_input_command_stream_kernel_rt_args( - Program& program, - KernelHandle kernel_id, + tt::tt_metal::Program& program, + tt::tt_metal::KernelHandle kernel_id, std::vector const& tensors, std::vector const& page_sizes, IDevice* device, @@ -88,8 +88,8 @@ void generate_multi_input_command_stream_kernel_rt_args( // TODO: Bundle into command bundle per command stream to cut down // on args and improve usability void generate_multi_command_stream_kernel_rt_args( - Program& program, - KernelHandle kernel_id, + tt::tt_metal::Program& program, + tt::tt_metal::KernelHandle kernel_id, std::vector const& cb_ids, std::vector const& tensors, IDevice* device, @@ -102,12 +102,12 @@ void generate_multi_command_stream_kernel_rt_args( std::optional const& backward_fabric_connections, std::optional> const& edm_termination_infos, std::vector const& dest_args); -KernelHandle generate_multi_command_stream_kernel_ct_args( - Program& program, +tt::tt_metal::KernelHandle generate_multi_command_stream_kernel_ct_args( + tt::tt_metal::Program& program, std::vector const& cb_indices, std::vector const& tensors, CoreRangeSet const& worker_core_range, - DataMovementConfig datamovement_kernel_config, + tt::tt_metal::DataMovementConfig datamovement_kernel_config, const size_t num_command_streams = 2, std::optional my_chip_id = std::nullopt); diff --git a/ttnn/cpp/ttnn/operations/ccl/erisc_datamover_builder.cpp b/ttnn/cpp/ttnn/operations/ccl/erisc_datamover_builder.cpp index 3c61c8c37ea..21b658f4081 100644 --- a/ttnn/cpp/ttnn/operations/ccl/erisc_datamover_builder.cpp +++ b/ttnn/cpp/ttnn/operations/ccl/erisc_datamover_builder.cpp @@ -29,7 +29,6 @@ using namespace tt::tt_metal::experimental; namespace ttnn::ccl { - // The channel structure is as follows: // &header-> |----------------| channel_base_address // | header | @@ -696,11 +695,7 @@ void 
EdmLineFabricOpInterface::build_kernels() const { device->ethernet_core_from_logical_core(edm_builder.my_eth_core_logical).y, device->ethernet_core_from_logical_core(edm_builder.my_eth_core_logical).x); auto local_edm_kernel = ttnn::ccl::generate_edm_kernel( - *program, - device, - edm_builder, - edm_builder.my_eth_core_logical, - NOC::NOC_0); + *program, device, edm_builder, edm_builder.my_eth_core_logical, tt::tt_metal::NOC::NOC_0); } } }; diff --git a/ttnn/cpp/ttnn/operations/ccl/erisc_datamover_builder.hpp b/ttnn/cpp/ttnn/operations/ccl/erisc_datamover_builder.hpp index ce0fac4e864..997308173ad 100644 --- a/ttnn/cpp/ttnn/operations/ccl/erisc_datamover_builder.hpp +++ b/ttnn/cpp/ttnn/operations/ccl/erisc_datamover_builder.hpp @@ -228,57 +228,60 @@ class FabricEriscDatamoverBuilder { // TODO } - void teardown_from_host(IDevice*d, tt::fabric::TerminationSignal termination_signal = tt::fabric::TerminationSignal::GRACEFULLY_TERMINATE) const; - - void set_firmware_context_switch_interval(size_t interval); - - // protected: - friend class EdmLineFabricOpInterface; - CoreCoord my_eth_core_logical; - size_t my_noc_x = 0; - size_t my_noc_y = 0; - - FabricEriscDatamoverConfig config; - - size_t my_chip_id = 0; - size_t peer_chip_id = 0; - size_t handshake_address = 0; - size_t channel_buffer_size = 0; - - size_t sender_0_num_buffers = 0; - size_t sender_1_num_buffers = 0; - size_t receiver_num_buffers = 0; - - size_t local_sender_channel_0_buffer_address = 0; - size_t local_sender_channel_0_connection_info_addr = 0; - size_t local_sender_channel_1_buffer_address = 0; - size_t local_sender_channel_1_connection_info_addr = 0; - size_t local_receiver_channel_buffer_address = 0; - - size_t termination_signal_ptr = 0; - - // Semaphore IDs - // this is the receiver channel's local sem for flow controlling with downstream fabric sender - std::optional receiver_channel_downstream_flow_control_semaphore_id; - std::optional receiver_channel_downstream_teardown_semaphore_id; - 
size_t sender_channel_0_flow_control_semaphore_id = 0; - size_t sender_channel_1_flow_control_semaphore_id = 0; - size_t sender_channel_0_connection_semaphore_id = 0; - size_t sender_channel_1_connection_semaphore_id = 0; - size_t sender_channel_0_buffer_index_semaphore_id = 0; - size_t sender_channel_1_buffer_index_semaphore_id = 0; - size_t receiver_channel_local_buffer_index_address = 0; - - std::optional downstream_edm_noc_x; - std::optional downstream_edm_noc_y; - std::optional downstream_edm_buffer_base_address; - std::optional downstream_edm_semaphore_address; - std::optional downstream_edm_worker_registration_address; - std::optional downstream_edm_worker_location_info_address; - std::optional downstream_sender_channel_buffer_index_semaphore_id; - bool enable_persistent_mode = false; - bool build_in_worker_connection_mode = false; - size_t firmware_context_switch_interval = default_firmware_context_switch_interval; + void teardown_from_host( + tt::tt_metal::IDevice* d, + tt::fabric::TerminationSignal termination_signal = + tt::fabric::TerminationSignal::GRACEFULLY_TERMINATE) const; + + void set_firmware_context_switch_interval(size_t interval); + + // protected: + friend class EdmLineFabricOpInterface; + CoreCoord my_eth_core_logical; + size_t my_noc_x = 0; + size_t my_noc_y = 0; + + FabricEriscDatamoverConfig config; + + size_t my_chip_id = 0; + size_t peer_chip_id = 0; + size_t handshake_address = 0; + size_t channel_buffer_size = 0; + + size_t sender_0_num_buffers = 0; + size_t sender_1_num_buffers = 0; + size_t receiver_num_buffers = 0; + + size_t local_sender_channel_0_buffer_address = 0; + size_t local_sender_channel_0_connection_info_addr = 0; + size_t local_sender_channel_1_buffer_address = 0; + size_t local_sender_channel_1_connection_info_addr = 0; + size_t local_receiver_channel_buffer_address = 0; + + size_t termination_signal_ptr = 0; + + // Semaphore IDs + // this is the receiver channel's local sem for flow controlling with downstream fabric 
sender + std::optional receiver_channel_downstream_flow_control_semaphore_id; + std::optional receiver_channel_downstream_teardown_semaphore_id; + size_t sender_channel_0_flow_control_semaphore_id = 0; + size_t sender_channel_1_flow_control_semaphore_id = 0; + size_t sender_channel_0_connection_semaphore_id = 0; + size_t sender_channel_1_connection_semaphore_id = 0; + size_t sender_channel_0_buffer_index_semaphore_id = 0; + size_t sender_channel_1_buffer_index_semaphore_id = 0; + size_t receiver_channel_local_buffer_index_address = 0; + + std::optional downstream_edm_noc_x; + std::optional downstream_edm_noc_y; + std::optional downstream_edm_buffer_base_address; + std::optional downstream_edm_semaphore_address; + std::optional downstream_edm_worker_registration_address; + std::optional downstream_edm_worker_location_info_address; + std::optional downstream_sender_channel_buffer_index_semaphore_id; + bool enable_persistent_mode = false; + bool build_in_worker_connection_mode = false; + size_t firmware_context_switch_interval = default_firmware_context_switch_interval; }; @@ -295,18 +298,34 @@ class EdmLineFabricOpInterface { // The constructor will assemble/connect the line across the specified device sequence, for all available links. 
- EdmLineFabricOpInterface (std::vector const& device_sequence, std::vector const& program_sequence, bool enable_persistent_mode, std::optional desired_num_links = std::nullopt, bool build_in_worker_connection_mode = false); + EdmLineFabricOpInterface( + const std::vector& device_sequence, + const std::vector& program_sequence, + bool enable_persistent_mode, + std::optional desired_num_links = std::nullopt, + bool build_in_worker_connection_mode = false); // Invocable per chip if we want to collectively build the fabric by building this separately per chip // (and implicitly building the fabric that way) - EdmLineFabricOpInterface (IDevice* local_device, std::optional forward_device, std::optional backward_device, Program* program, bool enable_persistent_mode, std::optional desired_num_links, bool build_in_worker_connection_mode = false); + EdmLineFabricOpInterface( + tt::tt_metal::IDevice* local_device, + std::optional forward_device, + std::optional backward_device, + tt::tt_metal::Program* program, + bool enable_persistent_mode, + std::optional desired_num_links, + bool build_in_worker_connection_mode = false); - static EdmLineFabricOpInterface build_program_builder_worker_connection_fabric(std::vector const& device_sequence, std::vector const& program_sequence, bool enable_persistent_mode, std::optional desired_num_links = std::nullopt); static EdmLineFabricOpInterface build_program_builder_worker_connection_fabric( - IDevice* local_device, - IDevice* forward_device, - IDevice* backward_device, - Program* program, + const std::vector& device_sequence, + const std::vector& program_sequence, + bool enable_persistent_mode, + std::optional desired_num_links = std::nullopt); + static EdmLineFabricOpInterface build_program_builder_worker_connection_fabric( + tt::tt_metal::IDevice* local_device, + tt::tt_metal::IDevice* forward_device, + tt::tt_metal::IDevice* backward_device, + tt::tt_metal::Program* program, bool enable_persistent_mode, std::optional 
desired_num_links = std::nullopt); @@ -328,14 +347,15 @@ class EdmLineFabricOpInterface { std::vector generate_ordered_termination_info_farthest_to_nearest() const; // Generates a list of termination infos for the local chip's EDMs - std::vector generate_local_chip_fabric_termination_infos(IDevice*device) const; + std::vector generate_local_chip_fabric_termination_infos( + tt::tt_metal::IDevice* device) const; // Accessors size_t get_num_links() const { return num_links; } size_t get_device_count() const { return device_sequence.size(); } - size_t get_index_of_device(IDevice*device) const { + size_t get_index_of_device(tt::tt_metal::IDevice* device) const { for (size_t i = 0; i < device_sequence.size(); i++) { if (device_sequence[i] == device) { return i; @@ -363,8 +383,8 @@ class EdmLineFabricOpInterface { std::unordered_map next_forward_direction_edm_available; std::unordered_map next_backward_direction_edm_available; - std::vector device_sequence; - std::vector programs; + std::vector device_sequence; + std::vector programs; size_t num_links; size_t buffer_size_bytes; diff --git a/ttnn/cpp/ttnn/operations/ccl/reduce_scatter/device/reduce_scatter_op.cpp b/ttnn/cpp/ttnn/operations/ccl/reduce_scatter/device/reduce_scatter_op.cpp index af614f48b80..7993ab70cd0 100644 --- a/ttnn/cpp/ttnn/operations/ccl/reduce_scatter/device/reduce_scatter_op.cpp +++ b/ttnn/cpp/ttnn/operations/ccl/reduce_scatter/device/reduce_scatter_op.cpp @@ -70,11 +70,13 @@ std::vector ReduceScatter::compute_output_specs(const std::vec this->ring_size); shape[this->scatter_dim] /= this->ring_size; TensorSpec spec( - shape, TensorLayout(input_tensor.get_dtype(), PageConfig(input_tensor.get_layout()), output_mem_config)); + shape, + tt::tt_metal::TensorLayout( + input_tensor.get_dtype(), tt::tt_metal::PageConfig(input_tensor.get_layout()), output_mem_config)); return std::vector(input_tensors.size(), spec); } -operation::ProgramWithCallbacks ReduceScatter::create_program( 
+tt::tt_metal::operation::ProgramWithCallbacks ReduceScatter::create_program( const std::vector& input_tensors, std::vector& output_tensors) const { return ccl::reduce_scatter_detail::reduce_scatter_with_workers( input_tensors.at(0), @@ -141,8 +143,8 @@ Tensor reduce_scatter( rank - 1, dim); - std::vector output_tensors = {Tensor(operation::get_workers_for_op_output({input_tensor}))}; - operation::launch_op( + std::vector output_tensors = {Tensor(tt::tt_metal::operation::get_workers_for_op_output({input_tensor}))}; + tt::tt_metal::operation::launch_op( [binary_op_type, scatter_dim, num_links, @@ -156,7 +158,7 @@ Tensor reduce_scatter( const std::vector>& optional_output_tensors) mutable -> std::vector { const auto& input_tensor = input_tensors.at(0); - return operation::run( + return tt::tt_metal::operation::run( ttnn::ccl::reduce_scatter_detail::create_reduce_scatter_struct( input_tensor, binary_op_type, @@ -205,9 +207,9 @@ Tensor reduce_scatter( rank - 1, dim); - std::vector output_tensors = {Tensor(operation::get_workers_for_op_output({input_tensor}))}; + std::vector output_tensors = {Tensor(tt::tt_metal::operation::get_workers_for_op_output({input_tensor}))}; - operation::launch_op( + tt::tt_metal::operation::launch_op( [scatter_dim, binary_op_type, num_links, @@ -249,7 +251,7 @@ Tensor reduce_scatter( ? 
std::nullopt : get_chip_id(device_index + num_devices - 1); - return operation::run( + return tt::tt_metal::operation::run( ttnn::ReduceScatter{ binary_op_type, scatter_dim, diff --git a/ttnn/cpp/ttnn/operations/ccl/reduce_scatter/device/reduce_scatter_op.hpp b/ttnn/cpp/ttnn/operations/ccl/reduce_scatter/device/reduce_scatter_op.hpp index b2cf5f19d1e..45a86849ff4 100644 --- a/ttnn/cpp/ttnn/operations/ccl/reduce_scatter/device/reduce_scatter_op.hpp +++ b/ttnn/cpp/ttnn/operations/ccl/reduce_scatter/device/reduce_scatter_op.hpp @@ -26,13 +26,13 @@ struct ReduceScatter { void validate(const std::vector& input_tensors) const; std::vector compute_output_specs(const std::vector& input_tensors) const; - operation::ProgramWithCallbacks create_program( + tt::tt_metal::operation::ProgramWithCallbacks create_program( const std::vector& input_tensors, std::vector& output_tensors) const; }; namespace ccl { namespace reduce_scatter_detail { -operation::ProgramWithCallbacks reduce_scatter_with_workers( +tt::tt_metal::operation::ProgramWithCallbacks reduce_scatter_with_workers( const Tensor& input_tensors, const Tensor& output_tensors, ttnn::operations::binary::BinaryOpType reduce_op, @@ -70,7 +70,7 @@ Tensor reduce_scatter( const int32_t dim, ttnn::operations::reduction::ReduceType reduce_op = ttnn::operations::reduction::ReduceType::Sum, const uint32_t num_links = 1, - const MemoryConfig& output_mem_config = operation::DEFAULT_OUTPUT_MEMORY_CONFIG, + const MemoryConfig& output_mem_config = tt::tt_metal::operation::DEFAULT_OUTPUT_MEMORY_CONFIG, ttnn::ccl::Topology topology = ttnn::ccl::Topology::Ring, const std::optional user_defined_num_workers = std::nullopt, const std::optional user_defined_num_buffers_per_channel = std::nullopt); @@ -82,7 +82,7 @@ Tensor reduce_scatter( const MeshDevice& mesh_device, ttnn::operations::reduction::ReduceType reduce_op = ttnn::operations::reduction::ReduceType::Sum, const uint32_t num_links = 1, - const std::optional& output_mem_config = 
operation::DEFAULT_OUTPUT_MEMORY_CONFIG, + const std::optional& output_mem_config = tt::tt_metal::operation::DEFAULT_OUTPUT_MEMORY_CONFIG, ttnn::ccl::Topology topology = ttnn::ccl::Topology::Ring, const std::optional user_defined_num_workers = std::nullopt, const std::optional user_defined_num_buffers_per_channel = std::nullopt); diff --git a/ttnn/cpp/ttnn/operations/ccl/reduce_scatter/reduce_scatter.hpp b/ttnn/cpp/ttnn/operations/ccl/reduce_scatter/reduce_scatter.hpp index 9bb00b77918..fc0c95bc57d 100644 --- a/ttnn/cpp/ttnn/operations/ccl/reduce_scatter/reduce_scatter.hpp +++ b/ttnn/cpp/ttnn/operations/ccl/reduce_scatter/reduce_scatter.hpp @@ -22,7 +22,8 @@ struct ExecuteReduceScatter { const MeshDevice& mesh_device, ttnn::operations::reduction::ReduceType reduce_op = ttnn::operations::reduction::ReduceType::Sum, const uint32_t num_links = 1, - const std::optional& output_mem_config = operation::DEFAULT_OUTPUT_MEMORY_CONFIG, + const std::optional& output_mem_config = + tt::tt_metal::operation::DEFAULT_OUTPUT_MEMORY_CONFIG, ttnn::ccl::Topology topology = ttnn::ccl::Topology::Ring, const std::optional user_defined_num_workers = std::nullopt, const std::optional user_defined_num_buffers_per_channel = std::nullopt); diff --git a/ttnn/cpp/ttnn/operations/ccl/sharding_addrgen_helper.hpp b/ttnn/cpp/ttnn/operations/ccl/sharding_addrgen_helper.hpp index ab12f4a733b..53584d4c5f1 100644 --- a/ttnn/cpp/ttnn/operations/ccl/sharding_addrgen_helper.hpp +++ b/ttnn/cpp/ttnn/operations/ccl/sharding_addrgen_helper.hpp @@ -7,6 +7,9 @@ #include "cpp/ttnn/operations/ccl/common/types/sharding_common.hpp" namespace shard_builder { + +using namespace tt::tt_metal; + void extend_sharding_compile_time_args(const tt::tt_metal::Tensor& t, std::vector& args); void extend_sharding_run_time_args(const tt::tt_metal::Tensor& t, std::vector& args); std::vector generate_run_time_args(const tt::tt_metal::Tensor& t); diff --git a/ttnn/cpp/ttnn/operations/conv/conv2d/conv2d.cpp 
b/ttnn/cpp/ttnn/operations/conv/conv2d/conv2d.cpp index a3928a36629..695ae07ce0e 100644 --- a/ttnn/cpp/ttnn/operations/conv/conv2d/conv2d.cpp +++ b/ttnn/cpp/ttnn/operations/conv/conv2d/conv2d.cpp @@ -21,9 +21,9 @@ #include "ttnn/operations/core/core.hpp" #include "ttnn/operations/data_movement/move/move.hpp" -using namespace tt; namespace ttnn { namespace operations::conv { +using namespace tt; using sliding_window::ParallelConfig; using sliding_window::SlidingWindowConfig; diff --git a/ttnn/cpp/ttnn/operations/conv/conv2d/conv2d_pybind.cpp b/ttnn/cpp/ttnn/operations/conv/conv2d/conv2d_pybind.cpp index 0591ed02d0c..d12ee00da8f 100644 --- a/ttnn/cpp/ttnn/operations/conv/conv2d/conv2d_pybind.cpp +++ b/ttnn/cpp/ttnn/operations/conv/conv2d/conv2d_pybind.cpp @@ -282,7 +282,7 @@ void py_bind_conv2d(py::module& module) { uint32_t output_width, uint32_t output_channels, const CoreCoord& compute_grid_size, - ShardOrientation block_shard_orientation, + tt::tt_metal::ShardOrientation block_shard_orientation, bool enable_channels_padding, bool is_out_tiled) -> ttnn::operations::sliding_window::ParallelConfig { return determine_parallel_config( diff --git a/ttnn/cpp/ttnn/operations/conv/conv2d/conv2d_utils.cpp b/ttnn/cpp/ttnn/operations/conv/conv2d/conv2d_utils.cpp index 6f67fb238a6..5a2b9749b58 100644 --- a/ttnn/cpp/ttnn/operations/conv/conv2d/conv2d_utils.cpp +++ b/ttnn/cpp/ttnn/operations/conv/conv2d/conv2d_utils.cpp @@ -110,7 +110,7 @@ ParallelConfig determine_parallel_config( if (shard_layout == TensorMemoryLayout::HEIGHT_SHARDED) { uint32_t num_cores_nhw = find_closest_largest_divisor_with_num_padding_and_mult( out_nhw_ntiles, max_num_cores, act_block_h_override_ntiles); - grid = num_cores_to_corerangeset(num_cores_nhw, compute_grid_size, true); + grid = tt::tt_metal::num_cores_to_corerangeset(num_cores_nhw, compute_grid_size, true); } else if (shard_layout == TensorMemoryLayout::BLOCK_SHARDED) { uint32_t start_divisor = block_shard_orientation == 
ShardOrientation::COL_MAJOR ? compute_grid_size.x : compute_grid_size.y; @@ -131,7 +131,7 @@ ParallelConfig determine_parallel_config( uint32_t num_cores_c = enable_channels_padding ? find_closest_largest_divisor_with_num_padding(input_channles_ntiles, max_num_cores) : find_closest_largest_divisor(input_channles_ntiles, max_num_cores); - grid = num_cores_to_corerangeset(num_cores_c, compute_grid_size, true); + grid = tt::tt_metal::num_cores_to_corerangeset(num_cores_c, compute_grid_size, true); } else { TT_THROW("Conv2d supports Height, Block or Width Sharded Layouts but got {}", shard_layout); } @@ -154,7 +154,7 @@ ParallelConfig determine_output_parallel_config( if (input_parallel_config.shard_scheme == ttnn::TensorMemoryLayout::WIDTH_SHARDED && !is_mm_conv) { uint32_t max_num_cores = compute_grid_size.x * compute_grid_size.y; output_parallel_config = { - .grid = num_cores_to_corerangeset( + .grid = tt::tt_metal::num_cores_to_corerangeset( find_closest_largest_divisor_with_num_padding( tt::div_up(out_channels, tt::constants::TILE_WIDTH), max_num_cores), compute_grid_size, @@ -238,7 +238,7 @@ MemoryConfig create_sharded_memory_config_from_parallel_config( uint32_t nhw_shard = nhw_padded / num_cores_nhw; TT_ASSERT(channels % num_cores_channels == 0, "Channels: {}, num core channels: {}", channels, num_cores_channels); uint32_t channel_shard = channels / num_cores_channels; - auto shard_spec = ShardSpec{parallel_config.grid, {nhw_shard, channel_shard}, shard_orientation}; + auto shard_spec = tt::tt_metal::ShardSpec{parallel_config.grid, {nhw_shard, channel_shard}, shard_orientation}; log_debug("Calculated Shard Spec = {}", shard_spec); return MemoryConfig{shard_scheme, BufferType::L1, shard_spec}; } @@ -597,7 +597,7 @@ std::tuple shard_or_reshard_tensor if (!input_tensor.is_sharded()) { // In case we need to run Interleaved2Sharded switch fron physical sharding // to logical sharding, in order to get smaller allocation size of sharded buffer. 
- input_tensor_sharded_memory_config_to_layout.shard_spec = ShardSpec( + input_tensor_sharded_memory_config_to_layout.shard_spec = tt::tt_metal::ShardSpec( input_tensor_sharded_memory_config.shard_spec.value().grid, input_tensor_sharded_memory_config.shard_spec.value().shape, input_tensor_sharded_memory_config.shard_spec.value().shape, @@ -913,7 +913,7 @@ conv_op_l1_usage conv2d::calculate_L1_usage( uint32_t output_tile_size = tt::tile_size(datatype_to_dataformat_converter(conv_config.dtype)); auto [math_fidelity, math_approx_mode, fp32_dest_acc_en, packer_l1_acc, dst_full_sync_en] = - get_compute_kernel_config_args(hal.get_arch(), compute_kernel_config); + get_compute_kernel_config_args(tt::tt_metal::hal.get_arch(), compute_kernel_config); uint32_t act_block_w_ntiles = block_config.act_block_w_ntiles; uint32_t act_block_h_ntiles = block_config.act_block_h_ntiles; @@ -1102,8 +1102,9 @@ conv_op_l1_usage conv2d::calculate_L1_usage( } else if (conv_config.dtype == DataType::FLOAT32) { per_core_out_width_aligned *= 4; } - output_size = round_up(per_core_out_width_aligned, hal.get_alignment(HalMemType::L1)) * - pconfig.per_core_out_matrix_height_ntile * tt::constants::TILE_HEIGHT; + output_size = + round_up(per_core_out_width_aligned, tt::tt_metal::hal.get_alignment(tt::tt_metal::HalMemType::L1)) * + pconfig.per_core_out_matrix_height_ntile * tt::constants::TILE_HEIGHT; } else { output_size = per_core_out_matrix_height_ntiles * per_core_out_matrix_width_ntiles * output_tile_size; } diff --git a/ttnn/cpp/ttnn/operations/conv/conv2d/conv2d_utils.hpp b/ttnn/cpp/ttnn/operations/conv/conv2d/conv2d_utils.hpp index 440521121d5..c28026849fc 100644 --- a/ttnn/cpp/ttnn/operations/conv/conv2d/conv2d_utils.hpp +++ b/ttnn/cpp/ttnn/operations/conv/conv2d/conv2d_utils.hpp @@ -45,7 +45,7 @@ sliding_window::ParallelConfig determine_parallel_config( uint32_t output_width, uint32_t output_channels, const CoreCoord& compute_grid_size, - ShardOrientation block_shard_orientation, + 
tt::tt_metal::ShardOrientation block_shard_orientation, bool enable_channels_padding, bool is_out_tiled = true, uint32_t act_block_h_override = 0); diff --git a/ttnn/cpp/ttnn/operations/conv/conv2d/device/conv2d_op.hpp b/ttnn/cpp/ttnn/operations/conv/conv2d/device/conv2d_op.hpp index 04557524b76..2cd7872564a 100644 --- a/ttnn/cpp/ttnn/operations/conv/conv2d/device/conv2d_op.hpp +++ b/ttnn/cpp/ttnn/operations/conv/conv2d/device/conv2d_op.hpp @@ -16,8 +16,8 @@ namespace conv2d { constexpr uint32_t l1_scratchpad_CB_size = 64; struct Conv2dConfig { - DataType dtype = DataType::BFLOAT16; - DataType weights_dtype = DataType::BFLOAT16; + tt::tt_metal::DataType dtype = tt::tt_metal::DataType::BFLOAT16; + tt::tt_metal::DataType weights_dtype = tt::tt_metal::DataType::BFLOAT16; // Either "relu" or "" string activation = ""; @@ -49,7 +49,7 @@ struct Conv2dConfig { // if override_sharding_config is true, reshard_if_not_optimal should not be set to true bool override_sharding_config = false; - std::optional shard_layout; + std::optional shard_layout; // used only if override_sharding_config is true std::optional core_grid = std::nullopt; @@ -59,7 +59,7 @@ struct Conv2dConfig { // Useful when output is BFLOAT16. // BFLOAT8 is always Tile layout. - Layout output_layout = Layout::TILE; + tt::tt_metal::Layout output_layout = tt::tt_metal::Layout::TILE; // Doubles the size of the CBs for activation. // Increased perf, but increased L1 usage. 
@@ -305,7 +305,7 @@ conv_op_l1_usage calculate_L1_usage( const ttnn::Shape& weights_shape, std::array kernel_size, const Conv2dConfig& conv_config, - const MemoryConfig& output_memory_config, + const tt::tt_metal::MemoryConfig& output_memory_config, bool enable_bias, bool is_1d_depthwise_conv); diff --git a/ttnn/cpp/ttnn/operations/conv/conv2d/device/conv2d_op_sharded_program_factory.cpp b/ttnn/cpp/ttnn/operations/conv/conv2d/device/conv2d_op_sharded_program_factory.cpp index 32fd24971e8..61d10cdaf6a 100644 --- a/ttnn/cpp/ttnn/operations/conv/conv2d/device/conv2d_op_sharded_program_factory.cpp +++ b/ttnn/cpp/ttnn/operations/conv/conv2d/device/conv2d_op_sharded_program_factory.cpp @@ -2,6 +2,8 @@ // // SPDX-License-Identifier: Apache-2.0 +#include "tt-metalium/circular_buffer.hpp" +#include "tt-metalium/circular_buffer_types.hpp" #include "ttnn/operations/conv/conv2d/conv2d_utils.hpp" #include "ttnn/operations/conv/conv2d/device/conv2d_op.hpp" #include "ttnn/operations/sliding_window/sliding_window.hpp" @@ -60,7 +62,7 @@ tt::tt_metal::operation::ProgramWithCallbacks multi_core_optimized_conv_width_sh bool enable_subblock_padding); // TODO: Add namespace for utilities? -std::tuple create_CBs_for_sharded_input_v2( +std::tuple create_CBs_for_sharded_input_v2( tt_metal::Program& program, const Tensor& input, CoreRange core, @@ -85,6 +87,9 @@ std::tuple create_CBs_for_sharded_input_v2( bool fp32_dest_acc_en, bool packer_l1_acc_en) { using namespace CMAKE_UNIQUE_NAMESPACE; + using tt::tt_metal::CBHandle; + using tt::tt_metal::CircularBuffer; + using tt::tt_metal::CircularBufferConfig; tt::DataFormat interm0_df = packer_l1_acc_en ? (fp32_dest_acc_en ? tt::DataFormat::Float32 : tt::DataFormat::Float16_b) : out_df; @@ -265,7 +270,7 @@ std::tuple create_CBs_for_sharded_input_v2( } // TODO: Add namespace for utilities? 
-std::tuple create_CBs_for_depthwise_sharded_input( +std::tuple create_CBs_for_depthwise_sharded_input( tt_metal::Program& program, const Tensor& input, CoreRange core, @@ -289,6 +294,10 @@ std::tuple create_CBs_for_depthwise_sharded_input( bool fp32_dest_acc_en, bool packer_l1_acc_en) { using namespace CMAKE_UNIQUE_NAMESPACE; + using tt::tt_metal::CBHandle; + using tt::tt_metal::CircularBuffer; + using tt::tt_metal::CircularBufferConfig; + tt::DataFormat interm0_df = packer_l1_acc_en ? (fp32_dest_acc_en ? tt::DataFormat::Float32 : tt::DataFormat::Float16_b) : out_df; @@ -374,7 +383,7 @@ std::tuple create_CBs_for_depthwise_sharded_input( return {cb_sharded_act, cb_output}; } -operation::ProgramWithCallbacks multi_core_optimized_conv_sharded_v2_impl( +tt::tt_metal::operation::ProgramWithCallbacks multi_core_optimized_conv_sharded_v2_impl( tt_metal::Program& program, const Tensor& a, const Tensor& b, @@ -398,6 +407,10 @@ operation::ProgramWithCallbacks multi_core_optimized_conv_sharded_v2_impl( bool enable_split_reader, bool enable_subblock_padding) { using namespace CMAKE_UNIQUE_NAMESPACE; + using tt::tt_metal::CBHandle; + using tt::tt_metal::CircularBuffer; + using tt::tt_metal::CircularBufferConfig; + bool pass = true; tt_metal::IDevice* device = a.device(); TT_FATAL(a.get_layout() == Layout::ROW_MAJOR, "Conv activation should be in row major layout"); @@ -1378,26 +1391,27 @@ operation::ProgramWithCallbacks multi_core_optimized_conv_sharded_v2_impl( bias_ntiles_per_core, out0_cb}; - auto writer_mcast_noc = NOC::NOC_0; - auto reader_noc = writer_mcast_noc == NOC::NOC_0 ? NOC::NOC_1 : NOC::NOC_0; + auto writer_mcast_noc = tt::tt_metal::NOC::NOC_0; + auto reader_noc = + writer_mcast_noc == tt::tt_metal::NOC::NOC_0 ? 
tt::tt_metal::NOC::NOC_1 : tt::tt_metal::NOC::NOC_0; auto writer_mcast_sender_id = CreateKernel( program, writer_mcast_sender_kernel, mcast_sender_cores, - DataMovementConfig{ - .processor = DataMovementProcessor::RISCV_0, + tt::tt_metal::DataMovementConfig{ + .processor = tt::tt_metal::DataMovementProcessor::RISCV_0, .noc = writer_mcast_noc, .compile_args = writer_compile_time_args, .defines = writer_mcast_sender_defines}); - KernelHandle writer_mcast_receiver_id = -1; + tt::tt_metal::KernelHandle writer_mcast_receiver_id = -1; if (total_num_cores > 1) { writer_mcast_receiver_id = CreateKernel( program, writer_mcast_receiver_kernel, mcast_receiver_cores, - DataMovementConfig{ - .processor = DataMovementProcessor::RISCV_0, + tt::tt_metal::DataMovementConfig{ + .processor = tt::tt_metal::DataMovementProcessor::RISCV_0, .noc = writer_mcast_noc, .compile_args = writer_compile_time_args, .defines = writer_defines}); @@ -1407,8 +1421,8 @@ operation::ProgramWithCallbacks multi_core_optimized_conv_sharded_v2_impl( program, reader_kernel, all_active_cores, - DataMovementConfig{ - .processor = DataMovementProcessor::RISCV_1, + tt::tt_metal::DataMovementConfig{ + .processor = tt::tt_metal::DataMovementProcessor::RISCV_1, .noc = reader_noc, .compile_args = reader_compile_time_args, .defines = reader_defines}); @@ -1419,7 +1433,7 @@ operation::ProgramWithCallbacks multi_core_optimized_conv_sharded_v2_impl( program, compute_kernel, all_active_cores, - ComputeConfig{ + tt::tt_metal::ComputeConfig{ .math_fidelity = math_fidelity, .fp32_dest_acc_en = fp32_dest_acc_en, .compile_args = compute_kernel_args, @@ -1454,7 +1468,7 @@ operation::ProgramWithCallbacks multi_core_optimized_conv_sharded_v2_impl( auto shard_shape = a.shard_spec().value().shape; uint32_t tilized_act_tile_size = tt_metal::detail::TileSize(tilized_act_df); - bool reader_is_noc_0 = reader_noc == NOC::NOC_0; + bool reader_is_noc_0 = reader_noc == tt::tt_metal::NOC::NOC_0; TT_FATAL(!reader_is_noc_0, "Error"); @@ 
-1563,7 +1577,7 @@ operation::ProgramWithCallbacks multi_core_optimized_conv_sharded_v2_impl( auto right_core_physical = device->worker_core_from_logical_core(right_core); if (core_x_i == 0) { // sender - if (writer_mcast_noc == NOC::NOC_0) { + if (writer_mcast_noc == tt::tt_metal::NOC::NOC_0) { writer_rt_args.push_back(top_left_core_plus_one_physical.x); // weights_mcast_dest_noc_start_x writer_rt_args.push_back(right_core_physical.y); // weights_mcast_dest_noc_start_y writer_rt_args.push_back(bottom_right_core_physical.x); // weights_mcast_dest_noc_end_x @@ -1595,10 +1609,10 @@ operation::ProgramWithCallbacks multi_core_optimized_conv_sharded_v2_impl( } else { CoreCoord top_core = {(std::size_t)core_x_i, 0}; auto top_core_physical = device->worker_core_from_logical_core(top_core); - TT_FATAL(writer_mcast_noc == NOC::NOC_0, "Error"); + TT_FATAL(writer_mcast_noc == tt::tt_metal::NOC::NOC_0, "Error"); if (core_y_i == 0) { // sender - if (writer_mcast_noc == NOC::NOC_0) { + if (writer_mcast_noc == tt::tt_metal::NOC::NOC_0) { writer_rt_args.push_back(top_core_physical.x); // weights_mcast_dest_noc_start_x writer_rt_args.push_back(top_left_core_plus_one_physical.y); // weights_mcast_dest_noc_start_y writer_rt_args.push_back(top_core_physical.x); // weights_mcast_dest_noc_end_x @@ -1632,7 +1646,7 @@ operation::ProgramWithCallbacks multi_core_optimized_conv_sharded_v2_impl( // 1D mcast if (core_x_i == 0 and core_y_i == 0) { // sender - if (writer_mcast_noc == NOC::NOC_0) { + if (writer_mcast_noc == tt::tt_metal::NOC::NOC_0) { writer_rt_args.push_back(top_left_core_physical.x); // weights_mcast_dest_noc_start_x writer_rt_args.push_back(top_left_core_physical.y); // weights_mcast_dest_noc_start_y writer_rt_args.push_back(bottom_right_core_physical.x); // weights_mcast_dest_noc_end_x @@ -1743,7 +1757,7 @@ operation::ProgramWithCallbacks multi_core_optimized_conv_sharded_v2_impl( return {.program = std::move(program), .override_runtime_arguments_callback = 
override_runtime_arguments_callback}; } -operation::ProgramWithCallbacks multi_core_optimized_conv_sharded_v2_new( +tt::tt_metal::operation::ProgramWithCallbacks multi_core_optimized_conv_sharded_v2_new( const Tensor& a, const Tensor& b, const std::optional& bias, diff --git a/ttnn/cpp/ttnn/operations/conv/conv2d/device/conv2d_op_width_sharded_program_factory.cpp b/ttnn/cpp/ttnn/operations/conv/conv2d/device/conv2d_op_width_sharded_program_factory.cpp index 84d7bc017aa..fde3721d120 100644 --- a/ttnn/cpp/ttnn/operations/conv/conv2d/device/conv2d_op_width_sharded_program_factory.cpp +++ b/ttnn/cpp/ttnn/operations/conv/conv2d/device/conv2d_op_width_sharded_program_factory.cpp @@ -3,6 +3,8 @@ // SPDX-License-Identifier: Apache-2.0 #include +#include "tt-metalium/circular_buffer.hpp" +#include "tt-metalium/circular_buffer_types.hpp" #include "ttnn/operations/conv/conv2d/device/conv2d_op.hpp" #include "ttnn/operations/sliding_window/sliding_window.hpp" #include @@ -41,6 +43,10 @@ tt::tt_metal::operation::ProgramWithCallbacks multi_core_optimized_conv_width_sh bool enable_act_double_buffer, bool enable_split_reader, bool enable_subblock_padding) { + using tt::tt_metal::CBHandle; + using tt::tt_metal::CircularBuffer; + using tt::tt_metal::CircularBufferConfig; + const uint32_t act_cb = CBIndex::c_0; const uint32_t weight_cb = CBIndex::c_1; const uint32_t bias_cb = CBIndex::c_2; @@ -57,7 +63,7 @@ tt::tt_metal::operation::ProgramWithCallbacks multi_core_optimized_conv_width_sh bool pass = true; enable_split_reader = false; tt_metal::IDevice* device = a.device(); - TT_FATAL(a.get_layout() == Layout::ROW_MAJOR, "Conv activation should be in row major layout"); + TT_FATAL(a.get_layout() == tt::tt_metal::Layout::ROW_MAJOR, "Conv activation should be in row major layout"); TT_FATAL(a.memory_config().is_sharded(), "Conv activation must be sharded."); TT_FATAL(output_channels <= b.get_padded_shape()[3], "Invalid weight shape. 
Incorrect weight tensor."); uint32_t act_block_h_ntiles = block_config.act_block_h_ntiles; @@ -139,7 +145,7 @@ tt::tt_metal::operation::ProgramWithCallbacks multi_core_optimized_conv_width_sh act_block_h_ntiles); // Tensor b has weights and it should be tiled layout after converting conv weights into weight matrix - TT_FATAL(b.get_layout() == Layout::TILE, "Conv weights should be in tiled layout"); + TT_FATAL(b.get_layout() == tt::tt_metal::Layout::TILE, "Conv weights should be in tiled layout"); TT_FATAL(b.get_padded_shape()[0] == 1, "Conv weight matrix shape is invalid"); TT_FATAL(b.get_padded_shape()[1] == 1, "Conv weight matrix shape is invalid"); uint32_t weight_matrix_height = b.get_padded_shape()[2]; @@ -254,13 +260,13 @@ tt::tt_metal::operation::ProgramWithCallbacks multi_core_optimized_conv_width_sh // Device compatibility checks TT_FATAL( - a.storage_type() == StorageType::DEVICE && b.storage_type() == StorageType::DEVICE, + a.storage_type() == tt::tt_metal::StorageType::DEVICE && b.storage_type() == tt::tt_metal::StorageType::DEVICE, "Operands to large matmul need to be on device!"); TT_FATAL(a.device() == b.device(), "Operands to conv need to be on the same device!"); TT_FATAL( a.buffer() != nullptr && b.buffer() != nullptr, "Operands to conv need to be allocated in buffers on device!"); if (has_bias) { - TT_FATAL(bias.value().storage_type() == StorageType::DEVICE, "Bias should be on device"); + TT_FATAL(bias.value().storage_type() == tt::tt_metal::StorageType::DEVICE, "Bias should be on device"); TT_FATAL(bias.value().device() == a.device(), "Bias should be on the same device as act tensor"); } @@ -437,7 +443,7 @@ tt::tt_metal::operation::ProgramWithCallbacks multi_core_optimized_conv_width_sh bias_buffer = bias.value().buffer(); bias_dram_addr = bias_buffer->address(); bias_ntiles = weight_block_w_ntiles; - bias_in_dram = bias_buffer->buffer_type() == BufferType::DRAM; + bias_in_dram = bias_buffer->buffer_type() == tt::tt_metal::BufferType::DRAM; } 
uint32_t num_weight_slices_width = weight_matrix_width_ntiles / p_config.per_core_out_matrix_width_ntile; @@ -800,18 +806,18 @@ tt::tt_metal::operation::ProgramWithCallbacks multi_core_optimized_conv_width_sh program, activation_kernel_path, all_cores, - DataMovementConfig{ - .processor = DataMovementProcessor::RISCV_0, - .noc = NOC::RISCV_0_default, + tt::tt_metal::DataMovementConfig{ + .processor = tt::tt_metal::DataMovementProcessor::RISCV_0, + .noc = tt::tt_metal::NOC::RISCV_0_default, .compile_args = activation_kernel_compile_args}); auto weights_kernel_id = CreateKernel( program, weights_kernel_path, all_cores, - DataMovementConfig{ - .processor = DataMovementProcessor::RISCV_1, - .noc = NOC::RISCV_1_default, + tt::tt_metal::DataMovementConfig{ + .processor = tt::tt_metal::DataMovementProcessor::RISCV_1, + .noc = tt::tt_metal::NOC::RISCV_1_default, .compile_args = weights_kernel_compile_args, .defines = writer_defines}); @@ -819,7 +825,7 @@ tt::tt_metal::operation::ProgramWithCallbacks multi_core_optimized_conv_width_sh program, compute_kernel_path, all_cores, - ComputeConfig{ + tt::tt_metal::ComputeConfig{ .math_fidelity = math_fidelity, .fp32_dest_acc_en = fp32_dest_acc_en, .compile_args = compute_kernel_args, @@ -871,7 +877,7 @@ tt::tt_metal::operation::ProgramWithCallbacks multi_core_optimized_conv_width_sh // Capture conv_reader_indices_buffer to cache this with the program auto empty_callback = [conv_reader_indices_buffer]( const void* operation, - Program& program, + tt::tt_metal::Program& program, const std::vector& input_tensors, const std::vector>& optional_input_tensors, const std::vector& output_tensors) {}; diff --git a/ttnn/cpp/ttnn/operations/conv/conv2d/prepare_conv2d_weights.cpp b/ttnn/cpp/ttnn/operations/conv/conv2d/prepare_conv2d_weights.cpp index 2f7b82a170e..2a6ce8a9281 100644 --- a/ttnn/cpp/ttnn/operations/conv/conv2d/prepare_conv2d_weights.cpp +++ b/ttnn/cpp/ttnn/operations/conv/conv2d/prepare_conv2d_weights.cpp @@ -9,9 +9,10 @@ 
#include "ttnn/operations/core/core.hpp" #include "ttnn/operations/data_movement/pad/pad.hpp" #include "ttnn/tensor/types.hpp" -using namespace tt; + namespace ttnn { namespace operations::conv { +using namespace tt; using sliding_window::ParallelConfig; using sliding_window::SlidingWindowConfig; @@ -23,10 +24,10 @@ Tensor convert_tensor(const Tensor& input_tensor, compute_& compute) { return std::visit( [&compute](auto&& storage) -> Tensor { using StorageType = std::decay_t; - if constexpr (std::is_same_v) { - return compute(owned_buffer::get_as(storage.buffer)); - } else if constexpr (std::is_same_v) { - return compute(borrowed_buffer::get_as(storage.buffer)); + if constexpr (std::is_same_v) { + return compute(tt::tt_metal::owned_buffer::get_as(storage.buffer)); + } else if constexpr (std::is_same_v) { + return compute(tt::tt_metal::borrowed_buffer::get_as(storage.buffer)); } else { TT_THROW("Unsupported storage type"); } @@ -64,27 +65,34 @@ Tensor convert_tensor_to_tiled_layout_common( template Tensor create_tensor_from_owned_buffer( - owned_buffer::Buffer& buf, DataType& output_dtype, ttnn::Shape& output_shape) { + tt::tt_metal::owned_buffer::Buffer& buf, DataType& output_dtype, ttnn::Shape& output_shape) { if constexpr (std::is_same::value) { if (output_dtype == DataType::BFLOAT8_B || output_dtype == DataType::BFLOAT4_B) { - auto tensor = - Tensor(std::move(OwnedStorage{std::move(buf)}), output_shape, DataType::FLOAT32, Layout::ROW_MAJOR) - .to_layout(Layout::TILE); - auto output_float_data = owned_buffer::get_as(tensor).get(); + auto tensor = Tensor( + std::move(tt::tt_metal::OwnedStorage{std::move(buf)}), + output_shape, + DataType::FLOAT32, + Layout::ROW_MAJOR) + .to_layout(Layout::TILE); + auto output_float_data = tt::tt_metal::owned_buffer::get_as(tensor).get(); auto output_packed_data = output_dtype == DataType::BFLOAT8_B ? 
pack_fp32_vec_as_bfp8_tiles(output_float_data, /*row_major_input=*/false, /*is_exp_a=*/false) : pack_fp32_vec_as_bfp4_tiles(output_float_data, /*row_major_input=*/false, /*is_exp_a=*/false); - auto output_uint32_buffer = owned_buffer::create(std::move(output_packed_data)); + auto output_uint32_buffer = tt::tt_metal::owned_buffer::create(std::move(output_packed_data)); return Tensor( - std::move(OwnedStorage{std::move(output_uint32_buffer)}), output_shape, output_dtype, Layout::TILE); + std::move(tt::tt_metal::OwnedStorage{std::move(output_uint32_buffer)}), + output_shape, + output_dtype, + Layout::TILE); } } else { TT_FATAL( (output_dtype != DataType::BFLOAT8_B) || (output_dtype != DataType::BFLOAT4_B), "Unsupported output datatype"); } - auto rm_tensor = Tensor(std::move(OwnedStorage{std::move(buf)}), output_shape, output_dtype, Layout::ROW_MAJOR); + auto rm_tensor = + Tensor(std::move(tt::tt_metal::OwnedStorage{std::move(buf)}), output_shape, output_dtype, Layout::ROW_MAJOR); return rm_tensor.to_layout(Layout::TILE); } @@ -106,7 +114,7 @@ Tensor to_weight_special_padding_tile_layout( uint32_t block_height_padding = in1_block_h_datums - (w_shape[1] * w_shape[3]); auto weight_matrix_rows = ((w_shape[1] * w_shape[3]) + block_height_padding) * w_shape[2]; ttnn::Shape output_shape{1, 1, weight_matrix_rows, weight_matrix_cols}; - auto output_buffer = owned_buffer::create(output_shape.volume()); + auto output_buffer = tt::tt_metal::owned_buffer::create(output_shape.volume()); for (auto r = 0; r < w_shape[2]; r++) { for (auto s = 0; s < w_shape[3]; s++) { for (auto c = 0; c < w_shape[1]; c++) { @@ -145,7 +153,7 @@ Tensor to_weight_tile_layout( (uint32_t)std::ceil((double)weight_matrix_rows / (double)in1_block_h_datums) * in1_block_h_datums; } ttnn::Shape output_shape{1, 1, weight_matrix_rows, weight_matrix_cols}; - auto output_buffer = owned_buffer::create(output_shape.volume()); + auto output_buffer = tt::tt_metal::owned_buffer::create(output_shape.volume()); for (auto 
r = 0; r < w_shape[2]; r++) { for (auto s = 0; s < w_shape[3]; s++) { for (auto c = 0; c < w_shape[1]; c++) { @@ -209,7 +217,7 @@ Tensor to_weight_tile_layout_block_sharded( weight_matrix_rows = weight_block_height_padded * num_channel_shards; } ttnn::Shape output_shape{1, 1, weight_matrix_rows, weight_matrix_cols}; - auto output_buffer = owned_buffer::create(output_shape.volume()); + auto output_buffer = tt::tt_metal::owned_buffer::create(output_shape.volume()); for (auto ic = 0; ic < num_channel_shards; ic++) { for (auto r = 0; r < w_shape[2]; r++) { for (auto s = 0; s < w_shape[3]; s++) { @@ -269,7 +277,7 @@ Tensor to_bias_tile_layout_block_sharded( auto bias_matrix_rows = 32; ttnn::Shape output_shape{1, 1, bias_matrix_rows, bias_matrix_cols}; - auto output_buffer = owned_buffer::create(output_shape.volume()); + auto output_buffer = tt::tt_metal::owned_buffer::create(output_shape.volume()); for (auto oc = 0; oc < num_channel_shards; oc++) { for (auto k_s = 0; k_s < conv_output_shard_width; k_s++) { auto matrix_idx = oc * conv_output_shard_width_padded + k_s; @@ -328,7 +336,8 @@ static Tensor conv_group_weight_zero_pad_helper( DataType output_dtype) { auto pad_weight = [&original_weight_shape, &output_weight_shape, &num_groups, &output_dtype]( const auto& conv_weight_tensor_buffer) { - owned_buffer::Buffer output_buffer = owned_buffer::create(output_weight_shape.volume()); + tt::tt_metal::owned_buffer::Buffer output_buffer = + tt::tt_metal::owned_buffer::create(output_weight_shape.volume()); for (int curr_batch_idx = 0; curr_batch_idx < original_weight_shape[0]; curr_batch_idx++) { int new_batch_idx = curr_batch_idx; @@ -343,13 +352,13 @@ static Tensor conv_group_weight_zero_pad_helper( for (int k = 0; k < original_weight_shape[2]; k++) { for (int m = 0; m < original_weight_shape[3]; m++) { // Get value from original weight tensor - auto value_flat_input_index = compute_flat_indices( + auto value_flat_input_index = tt::tt_metal::compute_flat_indices( 
ttnn::SmallVector{curr_batch_idx, j, k, m}, compute_strides(original_weight_shape)); auto value = conv_weight_tensor_buffer[value_flat_input_index]; // Copy value to output tensor at the adjusted position auto new_channel_idx = new_channel_start_idx + j; - auto output_flat_input_index = compute_flat_indices( + auto output_flat_input_index = tt::tt_metal::compute_flat_indices( ttnn::SmallVector{new_batch_idx, new_channel_idx, k, m}, compute_strides(output_weight_shape)); output_buffer[output_flat_input_index] = value; @@ -358,7 +367,10 @@ static Tensor conv_group_weight_zero_pad_helper( } } return Tensor( - std::move(OwnedStorage{std::move(output_buffer)}), output_weight_shape, output_dtype, Layout::ROW_MAJOR); + std::move(tt::tt_metal::OwnedStorage{std::move(output_buffer)}), + output_weight_shape, + output_dtype, + Layout::ROW_MAJOR); }; return convert_tensor(weight, pad_weight); @@ -373,26 +385,30 @@ static Tensor conv_depthwise_weight_bcast_helper( const ttnn::Shape& original_weight_shape, const ttnn::Shape& output_weight_shape, DataType output_dtype) { - owned_buffer::Buffer output_buffer = owned_buffer::create(output_weight_shape.volume()); - auto conv_weight_tensor_buffer = borrowed_buffer::get_as(conv_weight_tensor); + tt::tt_metal::owned_buffer::Buffer output_buffer = + tt::tt_metal::owned_buffer::create(output_weight_shape.volume()); + auto conv_weight_tensor_buffer = tt::tt_metal::borrowed_buffer::get_as(conv_weight_tensor); // Copy the original weight tensor to the output tensor for (int i = 0; i < output_weight_shape[0]; i++) { for (int j = 0; j < output_weight_shape[1]; j++) { for (int k = 0; k < output_weight_shape[2]; k++) { for (int l = 0; l < output_weight_shape[3]; l++) { - auto value_flat_input_index = compute_flat_indices( + auto value_flat_input_index = tt::tt_metal::compute_flat_indices( ttnn::SmallVector{i, 0, k, l}, compute_strides(original_weight_shape)); auto value = conv_weight_tensor_buffer[value_flat_input_index]; - auto 
output_flat_input_index = - compute_flat_indices(ttnn::SmallVector{i, j, k, l}, compute_strides(output_weight_shape)); + auto output_flat_input_index = tt::tt_metal::compute_flat_indices( + ttnn::SmallVector{i, j, k, l}, compute_strides(output_weight_shape)); output_buffer[output_flat_input_index] = value; } } } } - auto output_tensor = - Tensor(std::move(OwnedStorage{std::move(output_buffer)}), output_weight_shape, output_dtype, Layout::ROW_MAJOR); + auto output_tensor = Tensor( + std::move(tt::tt_metal::OwnedStorage{std::move(output_buffer)}), + output_weight_shape, + output_dtype, + Layout::ROW_MAJOR); return output_tensor; } @@ -589,7 +605,7 @@ static OptimizedConvBlockConfig get_opt_block_config( if (conv_config.shard_layout.value() == ttnn::TensorMemoryLayout::WIDTH_SHARDED && !mm_conv) { uint32_t max_num_cores = compute_grid_size.x * compute_grid_size.y; output_parallel_config = { - .grid = num_cores_to_corerangeset( + .grid = tt::tt_metal::num_cores_to_corerangeset( find_closest_largest_divisor(tt::div_up(out_channels, tt::constants::TILE_WIDTH), max_num_cores), compute_grid_size, true), diff --git a/ttnn/cpp/ttnn/operations/conv/conv_transpose2d/conv_transpose2d.cpp b/ttnn/cpp/ttnn/operations/conv/conv_transpose2d/conv_transpose2d.cpp index d9e4f831fb5..85b4c5874e1 100644 --- a/ttnn/cpp/ttnn/operations/conv/conv_transpose2d/conv_transpose2d.cpp +++ b/ttnn/cpp/ttnn/operations/conv/conv_transpose2d/conv_transpose2d.cpp @@ -13,9 +13,10 @@ #include "ttnn/operations/conv/conv2d/prepare_conv2d_weights.hpp" #include "ttnn/operations/sliding_window/halo/halo.hpp" -using namespace tt; namespace ttnn { namespace operations::conv { + +using namespace tt; using sliding_window::ParallelConfig; using sliding_window::SlidingWindowConfig; @@ -34,7 +35,7 @@ Tensor _transform_weights_for_conv_transpose2d(const Tensor& conv_weight_tensor, auto kernel_height = in_w_shape[2]; auto kernel_width = in_w_shape[3]; ttnn::Shape output_shape{out_channels, in_channels, kernel_height, 
kernel_width}; - auto output_buffer = owned_buffer::create(output_shape.volume()); + auto output_buffer = tt::tt_metal::owned_buffer::create(output_shape.volume()); for (auto out_channels_index = 0; out_channels_index < out_channels; out_channels_index++) { auto output_weight_out_channel_base_idx = out_channels_index * in_channels * kernel_height * kernel_width; @@ -64,16 +65,17 @@ Tensor _transform_weights_for_conv_transpose2d(const Tensor& conv_weight_tensor, } } } - return Tensor(std::move(OwnedStorage{std::move(output_buffer)}), output_shape, dtype, Layout::ROW_MAJOR); + return Tensor( + std::move(tt::tt_metal::OwnedStorage{std::move(output_buffer)}), output_shape, dtype, Layout::ROW_MAJOR); }; auto convert_tensor = [&compute](const auto& conv_weight_tensor) { return std::visit( [&compute](auto&& storage) -> Tensor { using StorageType = std::decay_t; - if constexpr (std::is_same_v) { - return compute(owned_buffer::get_as(storage.buffer)); - } else if constexpr (std::is_same_v) { - return compute(borrowed_buffer::get_as(storage.buffer)); + if constexpr (std::is_same_v) { + return compute(tt::tt_metal::owned_buffer::get_as(storage.buffer)); + } else if constexpr (std::is_same_v) { + return compute(tt::tt_metal::borrowed_buffer::get_as(storage.buffer)); } else { TT_THROW("Unsupported storage type"); } diff --git a/ttnn/cpp/ttnn/operations/copy.hpp b/ttnn/cpp/ttnn/operations/copy.hpp index 7554904252d..4ab929e6e2a 100644 --- a/ttnn/cpp/ttnn/operations/copy.hpp +++ b/ttnn/cpp/ttnn/operations/copy.hpp @@ -91,14 +91,14 @@ struct Typecast { // const Tensor& input_tensor, // uint32_t tt_input_dtype, // uint32_t tt_output_dtype, - // const MemoryConfig& output_mem_config = operation::DEFAULT_OUTPUT_MEMORY_CONFIG) + // const MemoryConfig& output_mem_config = tt::tt_metal::operation::DEFAULT_OUTPUT_MEMORY_CONFIG) static ttnn::Tensor invoke( const QueueId queue_id, const Tensor& input_tensor, const DataType& tt_input_dtype, const DataType& tt_output_dtype, - const 
std::optional& memory_config = operation::DEFAULT_OUTPUT_MEMORY_CONFIG, + const std::optional& memory_config = tt::tt_metal::operation::DEFAULT_OUTPUT_MEMORY_CONFIG, const std::optional& optional_output_tensor = std::nullopt) { TT_ASSERT( input_tensor.device()->arch() != tt::ARCH::GRAYSKULL, diff --git a/ttnn/cpp/ttnn/operations/core/core.cpp b/ttnn/cpp/ttnn/operations/core/core.cpp index ecce273d047..fc648c15c9f 100644 --- a/ttnn/cpp/ttnn/operations/core/core.cpp +++ b/ttnn/cpp/ttnn/operations/core/core.cpp @@ -76,7 +76,9 @@ ttnn::Tensor allocate_tensor_on_device( const std::optional& memory_config) { return allocate_tensor_on_device( TensorSpec( - shape, TensorLayout(data_type, PageConfig(layout), memory_config.value_or(ttnn::DRAM_MEMORY_CONFIG))), + shape, + tt::tt_metal::TensorLayout( + data_type, tt::tt_metal::PageConfig(layout), memory_config.value_or(ttnn::DRAM_MEMORY_CONFIG))), device); } @@ -88,7 +90,9 @@ ttnn::Tensor allocate_tensor_on_device( const std::optional& memory_config) { return allocate_tensor_on_device( TensorSpec( - shape, TensorLayout(data_type, PageConfig(layout), memory_config.value_or(ttnn::DRAM_MEMORY_CONFIG))), + shape, + tt::tt_metal::TensorLayout( + data_type, tt::tt_metal::PageConfig(layout), memory_config.value_or(ttnn::DRAM_MEMORY_CONFIG))), mesh_device); } diff --git a/ttnn/cpp/ttnn/operations/core/to_dtype/to_dtype_op.hpp b/ttnn/cpp/ttnn/operations/core/to_dtype/to_dtype_op.hpp index ded9501cc3d..ee1002f8249 100644 --- a/ttnn/cpp/ttnn/operations/core/to_dtype/to_dtype_op.hpp +++ b/ttnn/cpp/ttnn/operations/core/to_dtype/to_dtype_op.hpp @@ -39,14 +39,14 @@ inline Tensor convert_to_cpp_supported_dtype(const Tensor& input_tensor) { "Tensor MultiDeviceHostStorage cannot be converted to torch directly. Use composer(..) 
" "functionality."); } else { - raise_unsupported_storage(); + tt::tt_metal::raise_unsupported_storage(); } }, input_tensor.get_storage()); if (input_dtype == DataType::BFLOAT8_B) { TT_ASSERT( - std::holds_alternative(buffer), + std::holds_alternative(buffer), "Unexpected type {}", tt::stl::get_active_type_name_in_variant(buffer)); auto uint32_data = @@ -58,7 +58,7 @@ inline Tensor convert_to_cpp_supported_dtype(const Tensor& input_tensor) { input_dtype = DataType::FLOAT32; } else if (input_dtype == DataType::BFLOAT4_B) { TT_ASSERT( - std::holds_alternative(buffer), + std::holds_alternative(buffer), "Unexpected type {}", tt::stl::get_active_type_name_in_variant(buffer)); auto uint32_data = @@ -78,9 +78,9 @@ inline Tensor convert_to_cpp_supported_dtype(const Tensor& input_tensor) { tt::tt_metal::OwnedStorage{buffer}, TensorSpec( input_tensor.get_logical_shape(), - TensorLayout::fromPaddedShape( + tt::tt_metal::TensorLayout::fromPaddedShape( input_dtype, - PageConfig(input_tensor.get_layout()), + tt::tt_metal::PageConfig(input_tensor.get_layout()), MemoryConfig{}, input_tensor.get_logical_shape(), input_tensor.get_padded_shape()))); @@ -89,9 +89,9 @@ inline Tensor convert_to_cpp_supported_dtype(const Tensor& input_tensor) { tt::tt_metal::BorrowedStorage{buffer, []() {}, []() {}}, TensorSpec( input_tensor.get_logical_shape(), - TensorLayout::fromPaddedShape( + tt::tt_metal::TensorLayout::fromPaddedShape( input_dtype, - PageConfig(input_tensor.get_layout()), + tt::tt_metal::PageConfig(input_tensor.get_layout()), MemoryConfig{}, input_tensor.get_logical_shape(), input_tensor.get_padded_shape()))}; @@ -130,7 +130,8 @@ Tensor create_owned_tensor( std::move(storage), TensorSpec( logical_shape, - TensorLayout::fromPaddedShape(data_type, PageConfig(layout), MemoryConfig{}, logical_shape, padded_shape))); + tt::tt_metal::TensorLayout::fromPaddedShape( + data_type, tt::tt_metal::PageConfig(layout), MemoryConfig{}, logical_shape, padded_shape))); } template @@ -206,23 +207,23 
@@ inline Tensor convert_to_dtype(const Tensor& input_tensor, const Layout& input_l [&input_layout, &input_dtype, &dtype, &logical_shape, &padded_shape](const Tensor& input_tensor) { switch (input_dtype) { case DataType::UINT16: { - auto buffer = host_buffer::get_as(input_tensor); + auto buffer = tt::tt_metal::host_buffer::get_as(input_tensor); return create_tensor_from_buffer(buffer, logical_shape, padded_shape, input_layout, dtype); } case DataType::INT32: { - auto buffer = host_buffer::get_as(input_tensor); + auto buffer = tt::tt_metal::host_buffer::get_as(input_tensor); return create_tensor_from_buffer(buffer, logical_shape, padded_shape, input_layout, dtype); } case DataType::UINT32: { - auto buffer = host_buffer::get_as(input_tensor); + auto buffer = tt::tt_metal::host_buffer::get_as(input_tensor); return create_tensor_from_buffer(buffer, logical_shape, padded_shape, input_layout, dtype); } case DataType::FLOAT32: { - auto buffer = host_buffer::get_as(input_tensor); + auto buffer = tt::tt_metal::host_buffer::get_as(input_tensor); return create_tensor_from_buffer(buffer, logical_shape, padded_shape, input_layout, dtype); } case DataType::BFLOAT16: { - auto buffer = host_buffer::get_as<::bfloat16>(input_tensor); + auto buffer = tt::tt_metal::host_buffer::get_as<::bfloat16>(input_tensor); return create_tensor_from_buffer(buffer, logical_shape, padded_shape, input_layout, dtype); } default: TT_THROW("Unsupported DataType: {}", input_dtype); break; diff --git a/ttnn/cpp/ttnn/operations/core/to_layout/to_layout_op.cpp b/ttnn/cpp/ttnn/operations/core/to_layout/to_layout_op.cpp index 3c8c306b913..6429d55226b 100644 --- a/ttnn/cpp/ttnn/operations/core/to_layout/to_layout_op.cpp +++ b/ttnn/cpp/ttnn/operations/core/to_layout/to_layout_op.cpp @@ -33,7 +33,8 @@ bool requires_padding_change(const ttnn::Tensor& tensor, ttnn::Layout layout) { // It's okay for conversion to tile layout to preserve arbitrary padding as long as it satisfies the alignment TensorSpec padded_spec( 
tensor.get_padded_shape(), - TensorLayout(tensor.get_dtype(), PageConfig(layout, std::move(tile)), tensor.memory_config())); + tt::tt_metal::TensorLayout( + tensor.get_dtype(), tt::tt_metal::PageConfig(layout, std::move(tile)), tensor.memory_config())); return tensor.get_padded_shape() != padded_spec.padded_shape(); } @@ -79,7 +80,8 @@ Tensor to_layout_impl( TensorSpec tile_spec( tensor_arg.get_logical_shape(), - TensorLayout(tensor_arg.dtype(), PageConfig(Layout::TILE, tile), output_memory_config)); + tt::tt_metal::TensorLayout( + tensor_arg.dtype(), tt::tt_metal::PageConfig(Layout::TILE, tile), output_memory_config)); auto padded_output_shape = tile_spec.padded_shape(); auto original_rank = tensor_arg.get_logical_shape().rank(); auto original_shape = tensor_arg.get_logical_shape(); diff --git a/ttnn/cpp/ttnn/operations/core/to_memory_config/to_memory_config_op.hpp b/ttnn/cpp/ttnn/operations/core/to_memory_config/to_memory_config_op.hpp index 948105bfb68..db69c9b16f2 100644 --- a/ttnn/cpp/ttnn/operations/core/to_memory_config/to_memory_config_op.hpp +++ b/ttnn/cpp/ttnn/operations/core/to_memory_config/to_memory_config_op.hpp @@ -6,6 +6,7 @@ #include +#include "ttnn/core.hpp" #include "ttnn/tensor/tensor.hpp" #include "ttnn/run_operation.hpp" #include "cpp/ttnn/operations/data_movement/sharded/reshard/device/reshard_op.hpp" @@ -26,6 +27,7 @@ struct ToMemoryConfig { const ttnn::Tensor& tensor, const ttnn::MemoryConfig& memory_config, std::optional dtype = std::nullopt) { + using namespace tt::tt_metal; // Temporary until we see why buffer data not being populated const auto& original_shape = tensor.get_logical_shape(); const auto original_memory_config = ttnn::get_memory_config(tensor); @@ -46,7 +48,7 @@ struct ToMemoryConfig { throw std::runtime_error( "dtype cannot be specified when converting sharded tensor to sharded tensor"); } - return operation::run( + return tt::tt_metal::operation::run( data_movement::ReshardDeviceOperation{ .output_mem_config = 
memory_config, }, @@ -58,13 +60,13 @@ struct ToMemoryConfig { // for row-major tensors where shard-spec[1] is different for input shard and output shard TT_FATAL(memory_config.is_sharded(), "Error"); - Tensor temp = operation::run( + Tensor temp = tt::tt_metal::operation::run( data_movement::ShardedToInterleavedDeviceOperation{ .output_mem_config = ttnn::DRAM_MEMORY_CONFIG, .output_dtype = dtype.value_or(tensor.get_dtype())}, {tensor}) .at(0); - return operation::run( + return tt::tt_metal::operation::run( data_movement::InterleavedToShardedDeviceOperation{ .output_mem_config = memory_config, .output_dtype = dtype.value_or(temp.get_dtype())}, @@ -74,7 +76,7 @@ struct ToMemoryConfig { } else { auto bbox = memory_config.shard_spec.value().grid.bounding_box(); CoreCoord grid_size(bbox.end_coord.x + 1, bbox.end_coord.y + 1); - return operation::run( + return tt::tt_metal::operation::run( data_movement::InterleavedToShardedDeviceOperation{ .output_mem_config = memory_config, .output_dtype = dtype.value_or(tensor.get_dtype())}, {tensor}) @@ -83,14 +85,14 @@ struct ToMemoryConfig { } else { // to_interleaved path if (tensor.is_sharded()) { - return operation::run( + return tt::tt_metal::operation::run( data_movement::ShardedToInterleavedDeviceOperation{ .output_mem_config = memory_config, .output_dtype = dtype.value_or(tensor.get_dtype())}, {tensor}) .at(0); } else { // L1 to DRAM or DRAM to L1 - return operation::run( + return tt::tt_metal::operation::run( ttnn::operations::data_movement::CopyDeviceOperation{ memory_config, dtype.value_or(tensor.get_dtype())}, {tensor}) diff --git a/ttnn/cpp/ttnn/operations/data_movement/bcast/bcast.cpp b/ttnn/cpp/ttnn/operations/data_movement/bcast/bcast.cpp index 2f262f71639..63f40b39a84 100644 --- a/ttnn/cpp/ttnn/operations/data_movement/bcast/bcast.cpp +++ b/ttnn/cpp/ttnn/operations/data_movement/bcast/bcast.cpp @@ -20,9 +20,9 @@ Tensor BcastOperation::invoke( const std::optional& memory_config, const std::optional& output_tensor) { 
auto output_memory_config = memory_config.value_or(input_tensor_a.memory_config()); - std::vector output_tensors = {Tensor(operation::get_workers_for_op_output({input_tensor_a}))}; + std::vector output_tensors = {Tensor(tt::tt_metal::operation::get_workers_for_op_output({input_tensor_a}))}; - operation::launch_with_autoformat( + tt::tt_metal::operation::launch_with_autoformat( [bcast_op, bcast_dim, output_memory_config, output_tensor, queue_id]( const std::vector& input_tensors, const std::vector>& optional_input_tensors, @@ -69,7 +69,7 @@ Tensor BcastOperation::invoke( "Error"); } } - return operation::run_with_autoformat( + return tt::tt_metal::operation::run_with_autoformat( EltwiseBinaryBroadcast{bcast_op, bcast_dim, output_memory_config}, {input_tensor_a, input_tensor_b}, {}, diff --git a/ttnn/cpp/ttnn/operations/data_movement/bcast/device/multi_core_h/bcast_op_multi_core_h.hpp b/ttnn/cpp/ttnn/operations/data_movement/bcast/device/multi_core_h/bcast_op_multi_core_h.hpp index f3b8788ace5..a94d31af56d 100644 --- a/ttnn/cpp/ttnn/operations/data_movement/bcast/device/multi_core_h/bcast_op_multi_core_h.hpp +++ b/ttnn/cpp/ttnn/operations/data_movement/bcast/device/multi_core_h/bcast_op_multi_core_h.hpp @@ -6,7 +6,7 @@ namespace ttnn::operations::data_movement { -operation::ProgramWithCallbacks bcast_multi_core_h( +tt::tt_metal::operation::ProgramWithCallbacks bcast_multi_core_h( const Tensor& input_tensor_a, const Tensor& input_tensor_b, const Tensor& output_tensor, diff --git a/ttnn/cpp/ttnn/operations/data_movement/bcast/device/multi_core_h/bcast_op_sharded_h.hpp b/ttnn/cpp/ttnn/operations/data_movement/bcast/device/multi_core_h/bcast_op_sharded_h.hpp index 7eea8a0df91..488ca4dad77 100644 --- a/ttnn/cpp/ttnn/operations/data_movement/bcast/device/multi_core_h/bcast_op_sharded_h.hpp +++ b/ttnn/cpp/ttnn/operations/data_movement/bcast/device/multi_core_h/bcast_op_sharded_h.hpp @@ -6,7 +6,7 @@ namespace ttnn::operations::data_movement { -operation::ProgramWithCallbacks 
bcast_sharded_h( +tt::tt_metal::operation::ProgramWithCallbacks bcast_sharded_h( const Tensor& input_tensor_a, const Tensor& input_tensor_b, const Tensor& output_tensor, diff --git a/ttnn/cpp/ttnn/operations/data_movement/bcast/device/multi_core_h/bcast_op_sharded_h_optimised.hpp b/ttnn/cpp/ttnn/operations/data_movement/bcast/device/multi_core_h/bcast_op_sharded_h_optimised.hpp index 35c6555e18d..35336fe4b4d 100644 --- a/ttnn/cpp/ttnn/operations/data_movement/bcast/device/multi_core_h/bcast_op_sharded_h_optimised.hpp +++ b/ttnn/cpp/ttnn/operations/data_movement/bcast/device/multi_core_h/bcast_op_sharded_h_optimised.hpp @@ -6,7 +6,7 @@ namespace ttnn::operations::data_movement { -operation::ProgramWithCallbacks bcast_sharded_h_optimised( +tt::tt_metal::operation::ProgramWithCallbacks bcast_sharded_h_optimised( const Tensor& input_tensor_a, const Tensor& input_tensor_b, const Tensor& output_tensor, diff --git a/ttnn/cpp/ttnn/operations/data_movement/bcast/device/multi_core_hw/bcast_op_multi_core_hw.hpp b/ttnn/cpp/ttnn/operations/data_movement/bcast/device/multi_core_hw/bcast_op_multi_core_hw.hpp index 4895fe60930..3d4f2b76910 100644 --- a/ttnn/cpp/ttnn/operations/data_movement/bcast/device/multi_core_hw/bcast_op_multi_core_hw.hpp +++ b/ttnn/cpp/ttnn/operations/data_movement/bcast/device/multi_core_hw/bcast_op_multi_core_hw.hpp @@ -6,7 +6,7 @@ namespace ttnn::operations::data_movement { -operation::ProgramWithCallbacks bcast_multi_core_hw( +tt::tt_metal::operation::ProgramWithCallbacks bcast_multi_core_hw( const Tensor& input_tensor_a, const Tensor& input_tensor_b, const Tensor& output_tensor, diff --git a/ttnn/cpp/ttnn/operations/data_movement/bcast/device/multi_core_w/bcast_op_multi_core_w.hpp b/ttnn/cpp/ttnn/operations/data_movement/bcast/device/multi_core_w/bcast_op_multi_core_w.hpp index 62fac51aaa6..20b04a2e068 100644 --- a/ttnn/cpp/ttnn/operations/data_movement/bcast/device/multi_core_w/bcast_op_multi_core_w.hpp +++ 
b/ttnn/cpp/ttnn/operations/data_movement/bcast/device/multi_core_w/bcast_op_multi_core_w.hpp @@ -6,7 +6,7 @@ namespace ttnn::operations::data_movement { -operation::ProgramWithCallbacks bcast_multi_core_w( +tt::tt_metal::operation::ProgramWithCallbacks bcast_multi_core_w( const Tensor& input_tensor_a, const Tensor& input_tensor_b, const Tensor& output_tensor, diff --git a/ttnn/cpp/ttnn/operations/data_movement/clone/clone.cpp b/ttnn/cpp/ttnn/operations/data_movement/clone/clone.cpp index 489acb4fc48..83ae987dedc 100644 --- a/ttnn/cpp/ttnn/operations/data_movement/clone/clone.cpp +++ b/ttnn/cpp/ttnn/operations/data_movement/clone/clone.cpp @@ -7,6 +7,7 @@ #include "device/clone_device_operation.hpp" namespace ttnn::operations::data_movement::clone { + Tensor Clone::invoke( const Tensor& input, const std::optional& dtype, diff --git a/ttnn/cpp/ttnn/operations/data_movement/clone/device/clone_device_operation.cpp b/ttnn/cpp/ttnn/operations/data_movement/clone/device/clone_device_operation.cpp index b3b10b920d2..861103b506a 100644 --- a/ttnn/cpp/ttnn/operations/data_movement/clone/device/clone_device_operation.cpp +++ b/ttnn/cpp/ttnn/operations/data_movement/clone/device/clone_device_operation.cpp @@ -41,7 +41,10 @@ CloneOperation::spec_return_value_t CloneOperation::compute_output_specs( const auto& input = tensor_args.input; return TensorSpec( input.get_logical_shape(), - TensorLayout(operation_attributes.dtype, PageConfig(input.get_layout()), operation_attributes.memory_config)); + tt::tt_metal::TensorLayout( + operation_attributes.dtype, + tt::tt_metal::PageConfig(input.get_layout()), + operation_attributes.memory_config)); }; CloneOperation::tensor_return_value_t CloneOperation::create_output_tensors( diff --git a/ttnn/cpp/ttnn/operations/data_movement/clone/device/clone_device_operation.hpp b/ttnn/cpp/ttnn/operations/data_movement/clone/device/clone_device_operation.hpp index 620c46e910c..f800a7989ab 100644 --- 
a/ttnn/cpp/ttnn/operations/data_movement/clone/device/clone_device_operation.hpp +++ b/ttnn/cpp/ttnn/operations/data_movement/clone/device/clone_device_operation.hpp @@ -11,7 +11,7 @@ namespace ttnn::operations::data_movement::clone { struct CloneOperation { struct operation_attributes_t { - const DataType dtype; + const tt::tt_metal::DataType dtype; const MemoryConfig memory_config; const DeviceComputeKernelConfig compute_kernel_config; }; @@ -25,8 +25,8 @@ struct CloneOperation { struct ProgramFactory { struct shared_variables_t { - KernelHandle read_kernel_id; - KernelHandle write_kernel_id; + tt::tt_metal::KernelHandle read_kernel_id; + tt::tt_metal::KernelHandle write_kernel_id; std::vector cores; }; diff --git a/ttnn/cpp/ttnn/operations/data_movement/common/common.cpp b/ttnn/cpp/ttnn/operations/data_movement/common/common.cpp index 384236c6bcf..d4042fb2408 100644 --- a/ttnn/cpp/ttnn/operations/data_movement/common/common.cpp +++ b/ttnn/cpp/ttnn/operations/data_movement/common/common.cpp @@ -66,7 +66,7 @@ uint32_t get_max_l1_space(const Tensor& input_tensor_a) { auto device = input_tensor_a.device(); auto lowest_address = device->lowest_occupied_compute_l1_address(); uint32_t max_l1_space = lowest_address.has_value() ? 
lowest_address.value() : device->l1_size_per_core(); - max_l1_space = max_l1_space - device->allocator()->get_base_allocator_addr(HalMemType::L1); + max_l1_space = max_l1_space - device->allocator()->get_base_allocator_addr(tt::tt_metal::HalMemType::L1); return max_l1_space; } diff --git a/ttnn/cpp/ttnn/operations/data_movement/concat/concat.cpp b/ttnn/cpp/ttnn/operations/data_movement/concat/concat.cpp index fb9c6581982..d1cd8f75e14 100644 --- a/ttnn/cpp/ttnn/operations/data_movement/concat/concat.cpp +++ b/ttnn/cpp/ttnn/operations/data_movement/concat/concat.cpp @@ -35,6 +35,7 @@ inline void concat_db_print(bool condition, const std::string& msg) { namespace ttnn { namespace operations { namespace data_movement { + using OwnedConcatArgs = std::tuple, int, unsigned int>; using MassagedConcat = MassagedOperation&, int, unsigned int>; @@ -126,7 +127,7 @@ MassagedConcat build_untilize_rm_retilize_concat( // FIXME: change this to a legit slice call once // padding-oblivious entry point is uplifted to the slice // op. 
- untilized_tensor = operation::run( + untilized_tensor = tt::tt_metal::operation::run( SliceDeviceOperation{ ttnn::Shape(begins), ttnn::Shape(ends), ttnn::Shape(steps), output_memory_config}, {untilized_tensor}, diff --git a/ttnn/cpp/ttnn/operations/data_movement/fill_pad/device/fill_pad_op.cpp b/ttnn/cpp/ttnn/operations/data_movement/fill_pad/device/fill_pad_op.cpp index 3de81f581ff..d9d71fced5d 100644 --- a/ttnn/cpp/ttnn/operations/data_movement/fill_pad/device/fill_pad_op.cpp +++ b/ttnn/cpp/ttnn/operations/data_movement/fill_pad/device/fill_pad_op.cpp @@ -7,10 +7,11 @@ #include #include "ttnn/operations/data_movement/fill_pad/device/fill_pad_program_factory.hpp" -using namespace tt; - namespace ttnn::operations::data_movement { +using namespace tt; +using namespace tt::tt_metal; + void FillPad::validate(const std::vector& input_tensors) const { const auto& input_tensor_a = input_tensors.at(0); TT_FATAL(input_tensor_a.get_layout() == TILE_LAYOUT, "FillPad should only be used for tile layout"); diff --git a/ttnn/cpp/ttnn/operations/data_movement/fill_pad/device/fill_pad_program_factory.cpp b/ttnn/cpp/ttnn/operations/data_movement/fill_pad/device/fill_pad_program_factory.cpp index fa2895ea815..0ecad849026 100644 --- a/ttnn/cpp/ttnn/operations/data_movement/fill_pad/device/fill_pad_program_factory.cpp +++ b/ttnn/cpp/ttnn/operations/data_movement/fill_pad/device/fill_pad_program_factory.cpp @@ -15,16 +15,16 @@ bool is_power_of_two_at_least_32(uint32_t value) { return value >= 32 && (value using namespace tt; -std::map data_type_to_size = { - {DataType::BFLOAT16, 2}, - {DataType::FLOAT32, 4}, - {DataType::UINT32, 4}, - {DataType::UINT8, 1}, +std::map data_type_to_size = { + {ttnn::DataType::BFLOAT16, 2}, + {ttnn::DataType::FLOAT32, 4}, + {ttnn::DataType::UINT32, 4}, + {ttnn::DataType::UINT8, 1}, }; namespace ttnn::operations::data_movement::detail { -operation::ProgramWithCallbacks fill_pad_multi_core(const Tensor& input_tensor, float fill_value) { 
+tt::tt_metal::operation::ProgramWithCallbacks fill_pad_multi_core(const Tensor& input_tensor, float fill_value) { tt::tt_metal::IDevice* device = input_tensor.device(); tt::tt_metal::Program program = tt::tt_metal::CreateProgram(); diff --git a/ttnn/cpp/ttnn/operations/data_movement/fill_pad/device/fill_pad_program_factory.hpp b/ttnn/cpp/ttnn/operations/data_movement/fill_pad/device/fill_pad_program_factory.hpp index 241447317b4..3596df574d9 100644 --- a/ttnn/cpp/ttnn/operations/data_movement/fill_pad/device/fill_pad_program_factory.hpp +++ b/ttnn/cpp/ttnn/operations/data_movement/fill_pad/device/fill_pad_program_factory.hpp @@ -4,6 +4,6 @@ namespace ttnn::operations::data_movement::detail { -operation::ProgramWithCallbacks fill_pad_multi_core(const Tensor& input_tensor, float fill_value); +tt::tt_metal::operation::ProgramWithCallbacks fill_pad_multi_core(const Tensor& input_tensor, float fill_value); } // namespace ttnn::operations::data_movement::detail diff --git a/ttnn/cpp/ttnn/operations/data_movement/fill_pad/fill_pad.cpp b/ttnn/cpp/ttnn/operations/data_movement/fill_pad/fill_pad.cpp index 26074b26045..312f7165bb9 100644 --- a/ttnn/cpp/ttnn/operations/data_movement/fill_pad/fill_pad.cpp +++ b/ttnn/cpp/ttnn/operations/data_movement/fill_pad/fill_pad.cpp @@ -11,8 +11,6 @@ #include #include "cpp/ttnn/operations/copy.hpp" -using namespace tt::tt_metal; - namespace ttnn::operations::data_movement { ttnn::Tensor FillPadOperation::invoke( @@ -45,12 +43,12 @@ ttnn::Tensor FillPadOperation::invoke( ttnn::Shape new_shape = ttnn::Shape{std::array{third_dim, original_shape[-2], original_shape[-1]}}; auto reshaped_tensor = ttnn::reshape(mutable_input_tensor, new_shape); - reshaped_tensor = operation::run_without_autoformat( + reshaped_tensor = tt::tt_metal::operation::run_without_autoformat( FillPad{fill_value, output_memory_config}, {reshaped_tensor}, {}, {}, queue_id) .at(0); return ttnn::reshape(reshaped_tensor, original_shape); } - auto output_tensor = 
operation::run_without_autoformat( + auto output_tensor = tt::tt_metal::operation::run_without_autoformat( FillPad{fill_value, output_memory_config}, {mutable_input_tensor}, {}, {}, queue_id) .at(0); if (input_tensor.get_dtype() == DataType::BFLOAT8_B) { diff --git a/ttnn/cpp/ttnn/operations/data_movement/fold/device/fold_device_op.cpp b/ttnn/cpp/ttnn/operations/data_movement/fold/device/fold_device_op.cpp index cf1c007d9a6..3abaa9f00cd 100644 --- a/ttnn/cpp/ttnn/operations/data_movement/fold/device/fold_device_op.cpp +++ b/ttnn/cpp/ttnn/operations/data_movement/fold/device/fold_device_op.cpp @@ -63,12 +63,15 @@ Fold::spec_return_value_t Fold::compute_output_specs( mem_config.shard_spec->shape[1] *= op_attr.stride_h * op_attr.stride_w; return {TensorSpec( - output_shape, TensorLayout(input_tensor.get_dtype(), PageConfig(input_tensor.get_layout()), mem_config))}; + output_shape, + tt::tt_metal::TensorLayout( + input_tensor.get_dtype(), tt::tt_metal::PageConfig(input_tensor.get_layout()), mem_config))}; } return {TensorSpec( output_shape, - TensorLayout(input_tensor.get_dtype(), PageConfig(Layout::ROW_MAJOR), input_tensor.memory_config()))}; + tt::tt_metal::TensorLayout( + input_tensor.get_dtype(), tt::tt_metal::PageConfig(Layout::ROW_MAJOR), input_tensor.memory_config()))}; } Fold::tensor_return_value_t Fold::create_output_tensors( diff --git a/ttnn/cpp/ttnn/operations/data_movement/fold/fold.cpp b/ttnn/cpp/ttnn/operations/data_movement/fold/fold.cpp index ca3d56d8f77..633bcaeec58 100644 --- a/ttnn/cpp/ttnn/operations/data_movement/fold/fold.cpp +++ b/ttnn/cpp/ttnn/operations/data_movement/fold/fold.cpp @@ -136,7 +136,7 @@ ttnn::MemoryConfig create_sharded_memory_config( auto sharded_memory_config = ttnn::MemoryConfig{ .memory_layout = ttnn::TensorMemoryLayout::HEIGHT_SHARDED, .buffer_type = ttnn::BufferType::L1, - .shard_spec = ShardSpec{ + .shard_spec = tt::tt_metal::ShardSpec{ CoreRangeSet{std::set{CoreRange{CoreCoord{0, 0}, CoreCoord{grid_size.x - 1, grid_size.y 
- 1}}}}, {shard_height, shard_width}, orientation}}; diff --git a/ttnn/cpp/ttnn/operations/data_movement/pad/pad.cpp b/ttnn/cpp/ttnn/operations/data_movement/pad/pad.cpp index 9e4382f3d73..8231b0d7fb5 100644 --- a/ttnn/cpp/ttnn/operations/data_movement/pad/pad.cpp +++ b/ttnn/cpp/ttnn/operations/data_movement/pad/pad.cpp @@ -136,7 +136,7 @@ static ttnn::Tensor pad_impl( "output_w != output_memory_config.shard_spec().shape[1]"); ttnn::Shape output_shape{output_padded_shape}; - auto output_tensor = operation::run( + auto output_tensor = tt::tt_metal::operation::run( Pad{output_shape, output_shape, ttnn::Shape{input_tensor_start}, diff --git a/ttnn/cpp/ttnn/operations/data_movement/permute/device/permute_device_operation.cpp b/ttnn/cpp/ttnn/operations/data_movement/permute/device/permute_device_operation.cpp index 484235aa101..9b2596222a9 100644 --- a/ttnn/cpp/ttnn/operations/data_movement/permute/device/permute_device_operation.cpp +++ b/ttnn/cpp/ttnn/operations/data_movement/permute/device/permute_device_operation.cpp @@ -65,7 +65,8 @@ PermuteDeviceOperation::spec_return_value_t PermuteDeviceOperation::compute_outp return TensorSpec( Shape(std::move(shape)), - TensorLayout(input_tensor.dtype(), PageConfig(input_tensor.layout()), attributes.output_mem_config)); + tt::tt_metal::TensorLayout( + input_tensor.dtype(), tt::tt_metal::PageConfig(input_tensor.layout()), attributes.output_mem_config)); } PermuteDeviceOperation::tensor_return_value_t PermuteDeviceOperation::create_output_tensors( diff --git a/ttnn/cpp/ttnn/operations/data_movement/permute/permute.cpp b/ttnn/cpp/ttnn/operations/data_movement/permute/permute.cpp index 98ad66655c0..eddf711f8a3 100644 --- a/ttnn/cpp/ttnn/operations/data_movement/permute/permute.cpp +++ b/ttnn/cpp/ttnn/operations/data_movement/permute/permute.cpp @@ -102,13 +102,13 @@ ttnn::Tensor permute_launch( const ttnn::SmallVector& dims, const MemoryConfig& output_mem_config, const std::optional& pad_value) { - std::vector output_tensors = 
{ttnn::Tensor(operation::get_workers_for_op_output({a}))}; - operation::launch_with_autoformat( + std::vector output_tensors = {ttnn::Tensor(tt::tt_metal::operation::get_workers_for_op_output({a}))}; + tt::tt_metal::operation::launch_with_autoformat( [dims, output_mem_config, pad_value]( const std::vector& input_tensors, const std::vector>& optional_input_tensors, const std::vector>& optional_output_tensors) mutable - -> std::vector { + -> std::vector { auto& a = input_tensors.at(0); return {permute_impl(a, dims, output_mem_config, pad_value)}; }, diff --git a/ttnn/cpp/ttnn/operations/data_movement/repeat/device/repeat_device_operation.cpp b/ttnn/cpp/ttnn/operations/data_movement/repeat/device/repeat_device_operation.cpp index 621b42fd58d..cd67f133b07 100644 --- a/ttnn/cpp/ttnn/operations/data_movement/repeat/device/repeat_device_operation.cpp +++ b/ttnn/cpp/ttnn/operations/data_movement/repeat/device/repeat_device_operation.cpp @@ -12,12 +12,15 @@ namespace ttnn { void RepeatDeviceOperation::validate(const std::vector& input_tensors) const { // Validate the input tensor const Tensor& input_tensor_a = input_tensors.at(0); - TT_FATAL(input_tensor_a.storage_type() == StorageType::DEVICE, "Operands to reshape need to be on device!"); + TT_FATAL( + input_tensor_a.storage_type() == tt::tt_metal::StorageType::DEVICE, + "Operands to reshape need to be on device!"); TT_FATAL(input_tensor_a.buffer() != nullptr, "Operands need to be allocated in buffers on device!"); - TT_FATAL(input_tensor_a.get_layout() == Layout::ROW_MAJOR, "This function is for RM->RM"); + TT_FATAL(input_tensor_a.get_layout() == tt::tt_metal::Layout::ROW_MAJOR, "This function is for RM->RM"); TT_FATAL( - input_tensor_a.get_dtype() == DataType::BFLOAT16 or input_tensor_a.get_dtype() == DataType::UINT32 or - input_tensor_a.get_dtype() == DataType::FLOAT32, + input_tensor_a.get_dtype() == tt::tt_metal::DataType::BFLOAT16 or + input_tensor_a.get_dtype() == tt::tt_metal::DataType::UINT32 or + 
input_tensor_a.get_dtype() == tt::tt_metal::DataType::FLOAT32, "Can only work with bfloat16/float32 or uint32 tensors"); // is this relevant? TT_FATAL( @@ -37,10 +40,12 @@ std::vector RepeatDeviceOperation::compute_output_specs(const std::v mem_config.shard_spec = shard_spec; } return {TensorSpec( - output_shape, TensorLayout(input_tensor_a.get_dtype(), PageConfig(input_tensor_a.get_layout()), mem_config))}; + output_shape, + tt::tt_metal::TensorLayout( + input_tensor_a.get_dtype(), tt::tt_metal::PageConfig(input_tensor_a.get_layout()), mem_config))}; } -operation::ProgramWithCallbacks RepeatDeviceOperation::create_program( +tt::tt_metal::operation::ProgramWithCallbacks RepeatDeviceOperation::create_program( const std::vector& input_tensors, std::vector& output_tensors) const { return operations::data_movement::repeat::rm_repeat_program_factory( input_tensors.at(0), m_num_repeats, output_tensors.at(0), m_is_last_dim); diff --git a/ttnn/cpp/ttnn/operations/data_movement/repeat/device/repeat_device_operation.hpp b/ttnn/cpp/ttnn/operations/data_movement/repeat/device/repeat_device_operation.hpp index d8bec905880..7ba10a4d0bb 100644 --- a/ttnn/cpp/ttnn/operations/data_movement/repeat/device/repeat_device_operation.hpp +++ b/ttnn/cpp/ttnn/operations/data_movement/repeat/device/repeat_device_operation.hpp @@ -10,12 +10,12 @@ namespace ttnn { struct RepeatDeviceOperation { const uint32_t m_num_repeats; const bool m_is_last_dim; - MemoryConfig m_output_mem_config; + tt::tt_metal::MemoryConfig m_output_mem_config; // Required functions to all tensor op functions void validate(const std::vector& input_tensors) const; std::vector compute_output_specs(const std::vector& input_tensors) const; - operation::ProgramWithCallbacks create_program( + tt::tt_metal::operation::ProgramWithCallbacks create_program( const std::vector& input_tensors, std::vector& output_tensors) const; }; } // namespace ttnn diff --git a/ttnn/cpp/ttnn/operations/data_movement/repeat/repeat.cpp 
b/ttnn/cpp/ttnn/operations/data_movement/repeat/repeat.cpp index 0cf3f74f8ef..f8a18a1002a 100644 --- a/ttnn/cpp/ttnn/operations/data_movement/repeat/repeat.cpp +++ b/ttnn/cpp/ttnn/operations/data_movement/repeat/repeat.cpp @@ -62,7 +62,7 @@ ttnn::Tensor repeat_upper_dims_rm( constexpr bool is_final_dim = false; auto out_tensor = - operation::run( + tt::tt_metal::operation::run( RepeatDeviceOperation{repetitions, is_final_dim, output_mem_config}, {input_tensor}, {}, {}, queue_id) .at(0); auto expected_shape = input_shape; @@ -88,7 +88,7 @@ ttnn::Tensor repeat_last_dim_rm( constexpr bool is_final_dim = true; auto out_tensor = - operation::run( + tt::tt_metal::operation::run( RepeatDeviceOperation{repetitions, is_final_dim, output_mem_config}, {input_tensor}, {}, {}, queue_id) .at(0); @@ -176,7 +176,7 @@ ttnn::Tensor RepeatOperation::invoke( // tiled -> RM if (working_tensor.layout() == ttnn::TILE_LAYOUT) { working_tensor = - ttnn::to_layout(working_tensor, ttnn::ROW_MAJOR_LAYOUT, std::nullopt, std::nullopt, (Device*)nullptr); + ttnn::to_layout(working_tensor, ttnn::ROW_MAJOR_LAYOUT, std::nullopt, std::nullopt, (IDevice*)nullptr); } // loop over dims in repetition vector, backwards because repeat pages first is faster @@ -199,7 +199,7 @@ ttnn::Tensor RepeatOperation::invoke( // RM -> OG page layout if (tensor.layout() == ttnn::TILE_LAYOUT) { working_tensor = - ttnn::to_layout(working_tensor, ttnn::TILE_LAYOUT, tensor.get_dtype(), std::nullopt, (Device*)nullptr); + ttnn::to_layout(working_tensor, ttnn::TILE_LAYOUT, tensor.get_dtype(), std::nullopt, (IDevice*)nullptr); } // Interleaved to OG mem layout diff --git a/ttnn/cpp/ttnn/operations/data_movement/reshape_on_device/reshape.cpp b/ttnn/cpp/ttnn/operations/data_movement/reshape_on_device/reshape.cpp index e3d9ca247d9..e3085556bb3 100644 --- a/ttnn/cpp/ttnn/operations/data_movement/reshape_on_device/reshape.cpp +++ b/ttnn/cpp/ttnn/operations/data_movement/reshape_on_device/reshape.cpp @@ -40,7 +40,7 @@ static Tensor 
manual_insertion( } else { tt::tt_metal::tensor_impl::read_data_from_device_buffer(device_buffer, data_vec); } - auto owned_buffer = owned_buffer::create(std::move(data_vec)); + auto owned_buffer = tt::tt_metal::owned_buffer::create(std::move(data_vec)); auto output = Tensor( OwnedStorage{owned_buffer}, @@ -91,7 +91,7 @@ ttnn::Tensor ReshapeOperation::invoke( output_mem_config); } std::vector output_tensors = {Tensor(tt::tt_metal::operation::get_workers_for_op_output({input_tensor}))}; - return operation::run( + return tt::tt_metal::operation::run( ReshapeDeviceOperation{logical_output_shape, padded_output_shape, output_mem_config}, {input_tensor}) .at(0); } diff --git a/ttnn/cpp/ttnn/operations/data_movement/reshape_view/reshape.cpp b/ttnn/cpp/ttnn/operations/data_movement/reshape_view/reshape.cpp index 90b35c86243..5f559b84f78 100644 --- a/ttnn/cpp/ttnn/operations/data_movement/reshape_view/reshape.cpp +++ b/ttnn/cpp/ttnn/operations/data_movement/reshape_view/reshape.cpp @@ -25,7 +25,6 @@ namespace ttnn::operations::data_movement { - namespace detail { ttnn::Tensor convert_tile_to_rm( @@ -207,7 +206,7 @@ ttnn::Tensor perform_reshape_on_2D_RM( } //Guaranteed to be interleaved //We are guaranteed to be working 2D->2D in this function - auto temp_tensor2 = operation::run( + auto temp_tensor2 = tt::tt_metal::operation::run( RM_RESHAPE_STRUCT{logical_shape, padded_shape, intermediate_out_memory_config}, {temp_tensor}, {}, diff --git a/ttnn/cpp/ttnn/operations/data_movement/sharded/interleaved_to_sharded/interleaved_to_sharded.hpp b/ttnn/cpp/ttnn/operations/data_movement/sharded/interleaved_to_sharded/interleaved_to_sharded.hpp index 1dcf2072a62..0ac16dd416c 100644 --- a/ttnn/cpp/ttnn/operations/data_movement/sharded/interleaved_to_sharded/interleaved_to_sharded.hpp +++ b/ttnn/cpp/ttnn/operations/data_movement/sharded/interleaved_to_sharded/interleaved_to_sharded.hpp @@ -23,7 +23,7 @@ struct InterleavedToShardedOperation { const std::variant& grid, const std::array 
shard_shape, const TensorMemoryLayout shard_scheme, - const ShardOrientation shard_orientation, + const tt::tt_metal::ShardOrientation shard_orientation, const std::optional& data_type_arg, const std::optional& keep_l1_aligned = std::nullopt); }; diff --git a/ttnn/cpp/ttnn/operations/data_movement/sharded/reshard/device/reshard_program_factory.hpp b/ttnn/cpp/ttnn/operations/data_movement/sharded/reshard/device/reshard_program_factory.hpp index 0bfbfa41d18..a33ff13a660 100644 --- a/ttnn/cpp/ttnn/operations/data_movement/sharded/reshard/device/reshard_program_factory.hpp +++ b/ttnn/cpp/ttnn/operations/data_movement/sharded/reshard/device/reshard_program_factory.hpp @@ -38,6 +38,6 @@ struct CorePageStride { PageStride page_stride; }; -operation::ProgramWithCallbacks reshard_multi_core(const Tensor& input, Tensor& output); +tt::tt_metal::operation::ProgramWithCallbacks reshard_multi_core(const Tensor& input, Tensor& output); } // namespace ttnn::operations::data_movement::detail diff --git a/ttnn/cpp/ttnn/operations/data_movement/sharded/sharded_to_interleaved/sharded_to_interleaved.cpp b/ttnn/cpp/ttnn/operations/data_movement/sharded/sharded_to_interleaved/sharded_to_interleaved.cpp index 58d5bb7a599..4b28fea0e5e 100644 --- a/ttnn/cpp/ttnn/operations/data_movement/sharded/sharded_to_interleaved/sharded_to_interleaved.cpp +++ b/ttnn/cpp/ttnn/operations/data_movement/sharded/sharded_to_interleaved/sharded_to_interleaved.cpp @@ -7,8 +7,6 @@ #include "device/sharded_to_interleaved_op.hpp" #include "sharded_to_interleaved.hpp" -using namespace tt::tt_metal; - namespace ttnn::operations::data_movement { ttnn::Tensor ShardedToInterleavedOperation::invoke( @@ -21,9 +19,9 @@ ttnn::Tensor ShardedToInterleavedOperation::invoke( return input_tensor; } - std::vector output_tensors = {Tensor(operation::get_workers_for_op_output({input_tensor}))}; + std::vector output_tensors = {Tensor(tt::tt_metal::operation::get_workers_for_op_output({input_tensor}))}; auto shard_spec = 
input_tensor.shard_spec().value(); - return operation::run( + return tt::tt_metal::operation::run( ShardedToInterleavedDeviceOperation{ .output_mem_config = memory_config, .output_dtype = output_dtype.value_or(input_tensor.get_dtype()), diff --git a/ttnn/cpp/ttnn/operations/data_movement/sharded_partial/interleaved_to_sharded_partial/interleaved_to_sharded_partial.cpp b/ttnn/cpp/ttnn/operations/data_movement/sharded_partial/interleaved_to_sharded_partial/interleaved_to_sharded_partial.cpp index c386d335b88..0d8137e182f 100644 --- a/ttnn/cpp/ttnn/operations/data_movement/sharded_partial/interleaved_to_sharded_partial/interleaved_to_sharded_partial.cpp +++ b/ttnn/cpp/ttnn/operations/data_movement/sharded_partial/interleaved_to_sharded_partial/interleaved_to_sharded_partial.cpp @@ -8,6 +8,8 @@ #include "interleaved_to_sharded_partial.hpp" #include +using namespace tt::tt_metal; + namespace ttnn::operations::data_movement { ttnn::Tensor InterleavedToShardedPartialOperation::invoke( diff --git a/ttnn/cpp/ttnn/operations/data_movement/sharded_partial/sharded_to_interleaved_partial/sharded_to_interleaved_partial.cpp b/ttnn/cpp/ttnn/operations/data_movement/sharded_partial/sharded_to_interleaved_partial/sharded_to_interleaved_partial.cpp index aeb20e1b9f8..fb9a3ff9615 100644 --- a/ttnn/cpp/ttnn/operations/data_movement/sharded_partial/sharded_to_interleaved_partial/sharded_to_interleaved_partial.cpp +++ b/ttnn/cpp/ttnn/operations/data_movement/sharded_partial/sharded_to_interleaved_partial/sharded_to_interleaved_partial.cpp @@ -7,6 +7,8 @@ #include "device/sharded_to_interleaved_partial_op.hpp" #include "sharded_to_interleaved_partial.hpp" +using namespace tt::tt_metal; + namespace ttnn::operations::data_movement { ttnn::Tensor ShardedToInterleavedPartialOperation::invoke( diff --git a/ttnn/cpp/ttnn/operations/data_movement/slice/slice.cpp b/ttnn/cpp/ttnn/operations/data_movement/slice/slice.cpp index 6fcb9702889..55040782283 100644 --- 
a/ttnn/cpp/ttnn/operations/data_movement/slice/slice.cpp +++ b/ttnn/cpp/ttnn/operations/data_movement/slice/slice.cpp @@ -187,7 +187,7 @@ ttnn::Tensor SliceOperation::invoke( } auto res = - operation::run( + tt::tt_metal::operation::run( SliceDeviceOperation{ ttnn::Shape(modified_begins), ttnn::Shape(padded_ends), ttnn::Shape(modified_step), memory_config}, {input}, @@ -307,7 +307,7 @@ ttnn::Tensor SliceOperation::invoke( } } - input = operation::run( + input = tt::tt_metal::operation::run( SliceDeviceOperation{ttnn::Shape(begins), ttnn::Shape(padded_ends), ttnn::Shape(step), memory_config}, {input}, {}, diff --git a/ttnn/cpp/ttnn/operations/data_movement/split/split.cpp b/ttnn/cpp/ttnn/operations/data_movement/split/split.cpp index a64713f8fee..a55343fa1a1 100644 --- a/ttnn/cpp/ttnn/operations/data_movement/split/split.cpp +++ b/ttnn/cpp/ttnn/operations/data_movement/split/split.cpp @@ -92,7 +92,7 @@ std::vector impl_split_last_dim_two_chunks_tiled(const Tensor& input_ten auto padded_input_shape = ttnn::operations::experimental::auto_format::AutoFormat::pad_to_tile_shape(input_shape); ttnn::operations::experimental::auto_format::FormatParams input_format_params = { .pad_shape = padded_input_shape, .pad_value = 0.0, .target_layout = Layout::TILE}; - return operation::run_with_autoformat( + return tt::tt_metal::operation::run_with_autoformat( SplitDeviceOperation{2, 3, mem_config}, {input_tensor}, {input_format_params}, {Layout::TILE, Layout::TILE}); } diff --git a/ttnn/cpp/ttnn/operations/data_movement/transpose/transpose.cpp b/ttnn/cpp/ttnn/operations/data_movement/transpose/transpose.cpp index bf70fc59e17..5e31983e3ed 100644 --- a/ttnn/cpp/ttnn/operations/data_movement/transpose/transpose.cpp +++ b/ttnn/cpp/ttnn/operations/data_movement/transpose/transpose.cpp @@ -15,12 +15,14 @@ #include -using namespace tt::tt_metal::experimental; - namespace ttnn::operations::data_movement { namespace detail { +using namespace tt::tt_metal::experimental; +using namespace tt; 
+using namespace tt::tt_metal::operation; + inline Tensor transpose_( const Tensor& a, TransposeOpDim transpose_dim, @@ -58,7 +60,7 @@ inline Tensor transpose_( break; default: break; } - return operation::run(Transpose{transpose_dim, output_mem_config, pad_value}, {a}).at(0); + return tt::tt_metal::operation::run(Transpose{transpose_dim, output_mem_config, pad_value}, {a}).at(0); } ttnn::Tensor transpose_nd( @@ -108,8 +110,8 @@ ttnn::Tensor ExecuteTranspose::invoke( input_unsqueezed.get_dtype() == DataType::BFLOAT8_B and !bfloat8_supported and !input_unsqueezed.is_sharded(); Tensor input_typecasted = typecast ? ttnn::typecast(input_unsqueezed, DataType::BFLOAT16) : input_unsqueezed; - std::vector output_tensors = {Tensor(operation::get_workers_for_op_output({input_typecasted}))}; - operation::launch_with_autoformat( + std::vector output_tensors = {Tensor(detail::get_workers_for_op_output({input_typecasted}))}; + detail::launch_with_autoformat( [normalized_dim1, normalized_dim2, memory_config_arg, pad_value]( const std::vector& input_tensors, const std::vector>& optional_input_tensors, diff --git a/ttnn/cpp/ttnn/operations/data_movement/untilize_with_halo_v2/device/untilize_with_halo_v2_program_factory.hpp b/ttnn/cpp/ttnn/operations/data_movement/untilize_with_halo_v2/device/untilize_with_halo_v2_program_factory.hpp index 5a2d2e6d76f..9f8eea8358e 100644 --- a/ttnn/cpp/ttnn/operations/data_movement/untilize_with_halo_v2/device/untilize_with_halo_v2_program_factory.hpp +++ b/ttnn/cpp/ttnn/operations/data_movement/untilize_with_halo_v2/device/untilize_with_halo_v2_program_factory.hpp @@ -9,7 +9,7 @@ namespace ttnn::operations::data_movement::detail { tt::tt_metal::operation::ProgramWithCallbacks untilize_with_halo_multi_core_v2( - Program& program, + tt::tt_metal::Program& program, const Tensor& input_tensor, const uint32_t pad_val, const uint32_t ncores_nhw, diff --git 
a/ttnn/cpp/ttnn/operations/data_movement/untilize_with_unpadding/device/untilize_with_unpadding_op.hpp b/ttnn/cpp/ttnn/operations/data_movement/untilize_with_unpadding/device/untilize_with_unpadding_op.hpp index 0ca24f4985d..e61fb59e155 100644 --- a/ttnn/cpp/ttnn/operations/data_movement/untilize_with_unpadding/device/untilize_with_unpadding_op.hpp +++ b/ttnn/cpp/ttnn/operations/data_movement/untilize_with_unpadding/device/untilize_with_unpadding_op.hpp @@ -13,7 +13,7 @@ namespace ttnn::operations::data_movement { struct UntilizeWithUnpadding { const ttnn::Shape output_tensor_end; - const MemoryConfig output_mem_config; + const tt::tt_metal::MemoryConfig output_mem_config; const bool use_multicore; const bool use_pack_untilize; const bool fp32_dest_acc_en; diff --git a/ttnn/cpp/ttnn/operations/eltwise/binary_ng/binary_ng.cpp b/ttnn/cpp/ttnn/operations/eltwise/binary_ng/binary_ng.cpp index efa19f1962b..4bc828504d5 100644 --- a/ttnn/cpp/ttnn/operations/eltwise/binary_ng/binary_ng.cpp +++ b/ttnn/cpp/ttnn/operations/eltwise/binary_ng/binary_ng.cpp @@ -6,7 +6,7 @@ #include "binary_ng.hpp" #include "device/binary_ng_device_operation.hpp" -inline Tensor typecast_to(DataType dtype, const Tensor& input) { +inline ttnn::Tensor typecast_to(ttnn::DataType dtype, const ttnn::Tensor& input) { return input.get_dtype() == dtype ? 
input : ttnn::typecast(input, dtype); } diff --git a/ttnn/cpp/ttnn/operations/eltwise/unary/device/unary_device_operation_types.hpp b/ttnn/cpp/ttnn/operations/eltwise/unary/device/unary_device_operation_types.hpp index e29cd4d728a..7377136f17a 100644 --- a/ttnn/cpp/ttnn/operations/eltwise/unary/device/unary_device_operation_types.hpp +++ b/ttnn/cpp/ttnn/operations/eltwise/unary/device/unary_device_operation_types.hpp @@ -14,8 +14,8 @@ namespace ttnn::operations::unary { struct operation_attributes_t { const std::vector op_chain; - const DataType output_dtype = DataType::INVALID; - const MemoryConfig output_memory_config; + const tt::tt_metal::DataType output_dtype = tt::tt_metal::DataType::INVALID; + const tt::tt_metal::MemoryConfig output_memory_config; const bool fp32_dest_acc_en = false; const bool preserve_fp32_precision = false; const bool bfp8_pack_precise = false; diff --git a/ttnn/cpp/ttnn/operations/eltwise/unary/device/unary_program_factory.hpp b/ttnn/cpp/ttnn/operations/eltwise/unary/device/unary_program_factory.hpp index 780c5b99c2e..3d0f47fedfe 100644 --- a/ttnn/cpp/ttnn/operations/eltwise/unary/device/unary_program_factory.hpp +++ b/ttnn/cpp/ttnn/operations/eltwise/unary/device/unary_program_factory.hpp @@ -11,8 +11,8 @@ namespace ttnn::operations::unary::program { struct UnaryProgramFactory { struct shared_variables_t { - KernelHandle unary_reader_kernel_id; - KernelHandle unary_writer_kernel_id; + tt::tt_metal::KernelHandle unary_reader_kernel_id; + tt::tt_metal::KernelHandle unary_writer_kernel_id; uint32_t num_cores; uint32_t num_cores_y; }; diff --git a/ttnn/cpp/ttnn/operations/eltwise/unary/device/unary_sharded_program_factory.hpp b/ttnn/cpp/ttnn/operations/eltwise/unary/device/unary_sharded_program_factory.hpp index 8f08e19684d..c01665fdb86 100644 --- a/ttnn/cpp/ttnn/operations/eltwise/unary/device/unary_sharded_program_factory.hpp +++ b/ttnn/cpp/ttnn/operations/eltwise/unary/device/unary_sharded_program_factory.hpp @@ -11,8 +11,8 @@ namespace 
ttnn::operations::unary::program { struct UnaryShardedProgramFactory { struct shared_variables_t { - CBHandle cb_src0; - CBHandle out_cb; + tt::tt_metal::CBHandle cb_src0; + tt::tt_metal::CBHandle out_cb; }; using cached_program_t = ttnn::device_operation::CachedProgram; diff --git a/ttnn/cpp/ttnn/operations/embedding/device/embedding_device_operation.hpp b/ttnn/cpp/ttnn/operations/embedding/device/embedding_device_operation.hpp index 3fad9391e0f..9c194a63d0e 100644 --- a/ttnn/cpp/ttnn/operations/embedding/device/embedding_device_operation.hpp +++ b/ttnn/cpp/ttnn/operations/embedding/device/embedding_device_operation.hpp @@ -17,15 +17,15 @@ enum class EmbeddingsType { GENERIC, PADDED, BINARY }; enum class EmbeddingsIndexType { UINT32, BFP16 }; struct Embeddings { - const MemoryConfig output_mem_config; + const tt::tt_metal::MemoryConfig output_mem_config; const bool tilized; const EmbeddingsType embeddings_type; const std::optional pad_token; - const DataType output_dtype; + const tt::tt_metal::DataType output_dtype; void validate(const std::vector &input_tensors) const; std::vector compute_output_specs(const std::vector &input_tensors) const; - operation::ProgramWithCallbacks create_program( + tt::tt_metal::operation::ProgramWithCallbacks create_program( const std::vector &input_tensors, std::vector &output_tensors) const; }; } // namespace ttnn::operations::embedding diff --git a/ttnn/cpp/ttnn/operations/embedding/device/embedding_program_factory.hpp b/ttnn/cpp/ttnn/operations/embedding/device/embedding_program_factory.hpp index a85485c64e4..a2630c0b196 100644 --- a/ttnn/cpp/ttnn/operations/embedding/device/embedding_program_factory.hpp +++ b/ttnn/cpp/ttnn/operations/embedding/device/embedding_program_factory.hpp @@ -5,13 +5,17 @@ #pragma once #include "ttnn/operations/core/core.hpp" +#include "ttnn/operations/embedding/device/embedding_device_operation.hpp" +#include "ttnn/operations/math.hpp" #include #include #include #include #include +#include using 
namespace tt; +using namespace tt::tt_metal; struct CoreSplitResult { uint32_t required_cores = 0; @@ -70,7 +74,7 @@ CoreSplitResult split_work_to_cores_aligned( namespace ttnn::operations::embedding::detail { -operation::ProgramWithCallbacks embeddings_fused( +tt::tt_metal::operation::ProgramWithCallbacks embeddings_fused( const Tensor& a, const Tensor& weights, Tensor& output, @@ -368,7 +372,7 @@ operation::ProgramWithCallbacks embeddings_fused( return {.program = std::move(program), .override_runtime_arguments_callback = override_runtime_arguments_callback}; } -operation::ProgramWithCallbacks embeddings_rm( +tt::tt_metal::operation::ProgramWithCallbacks embeddings_rm( const Tensor& a, const Tensor& weights, Tensor& output, @@ -590,7 +594,7 @@ operation::ProgramWithCallbacks embeddings_rm( return {.program = std::move(program), .override_runtime_arguments_callback = override_runtime_arguments_callback}; } -operation::ProgramWithCallbacks embeddings_tilized_indices( +tt::tt_metal::operation::ProgramWithCallbacks embeddings_tilized_indices( const Tensor& a, const Tensor& weights, Tensor& output, @@ -834,7 +838,7 @@ operation::ProgramWithCallbacks embeddings_tilized_indices( return {.program = std::move(program), .override_runtime_arguments_callback = override_runtime_arguments_callback}; } -operation::ProgramWithCallbacks embeddings_( +tt::tt_metal::operation::ProgramWithCallbacks embeddings_( const Tensor& a, const Tensor& weights, Tensor& output, diff --git a/ttnn/cpp/ttnn/operations/embedding/embedding.cpp b/ttnn/cpp/ttnn/operations/embedding/embedding.cpp index ab546a0fa70..96e3f8017e6 100644 --- a/ttnn/cpp/ttnn/operations/embedding/embedding.cpp +++ b/ttnn/cpp/ttnn/operations/embedding/embedding.cpp @@ -58,7 +58,7 @@ ttnn::Tensor EmbeddingOperation::invoke( } } - auto embeddings = operation::run( + auto embeddings = tt::tt_metal::operation::run( Embeddings{ .output_mem_config = memory_config.value_or(input_tensor.memory_config()), .tilized = fused_tilized, 
diff --git a/ttnn/cpp/ttnn/operations/embedding_backward/device/embedding_backward_device_operation.hpp b/ttnn/cpp/ttnn/operations/embedding_backward/device/embedding_backward_device_operation.hpp index 27a18bc9b0b..3e51e22a1ce 100644 --- a/ttnn/cpp/ttnn/operations/embedding_backward/device/embedding_backward_device_operation.hpp +++ b/ttnn/cpp/ttnn/operations/embedding_backward/device/embedding_backward_device_operation.hpp @@ -14,18 +14,18 @@ using namespace tt::constants; namespace ttnn::operations::embedding_backward { namespace detail { -operation::ProgramWithCallbacks embedding_backward_multi_core( +tt::tt_metal::operation::ProgramWithCallbacks embedding_backward_multi_core( const Tensor &index_tensor, const Tensor &grad_tensor, Tensor &output, const uint32_t num_embeddings); } struct EmbeddingBackward { - MemoryConfig output_mem_config; - DataType output_dtype; + tt::tt_metal::MemoryConfig output_mem_config; + tt::tt_metal::DataType output_dtype; uint32_t num_embeddings; void validate(const std::vector &input_tensors) const; std::vector compute_output_specs(const std::vector &input_tensors) const; - operation::ProgramWithCallbacks create_program( + tt::tt_metal::operation::ProgramWithCallbacks create_program( const std::vector &input_tensors, std::vector &output_tensors) const; tt::stl::reflection::Attributes attributes() const; }; diff --git a/ttnn/cpp/ttnn/operations/embedding_backward/embedding_backward.cpp b/ttnn/cpp/ttnn/operations/embedding_backward/embedding_backward.cpp index 99825d65a61..a9c5daf5d71 100644 --- a/ttnn/cpp/ttnn/operations/embedding_backward/embedding_backward.cpp +++ b/ttnn/cpp/ttnn/operations/embedding_backward/embedding_backward.cpp @@ -29,7 +29,7 @@ Tensor EmbeddingBackwardOperation::invoke( auto input_tensor = ttnn::reshape(input_tensor_arg, ttnn::Shape({batch_size, 1, 1, sentence_size})); auto input_gradient = - operation::run( + tt::tt_metal::operation::run( EmbeddingBackward{ .output_mem_config = 
memory_config.value_or(output_gradient_tensor_arg.memory_config()), .output_dtype = dtype.value_or(output_gradient_tensor_arg.get_dtype()), diff --git a/ttnn/cpp/ttnn/operations/examples/example/device/example_device_operation.cpp b/ttnn/cpp/ttnn/operations/examples/example/device/example_device_operation.cpp index 3d4a3b95cdf..bead3e027f5 100644 --- a/ttnn/cpp/ttnn/operations/examples/example/device/example_device_operation.cpp +++ b/ttnn/cpp/ttnn/operations/examples/example/device/example_device_operation.cpp @@ -26,7 +26,8 @@ ExampleDeviceOperation::spec_return_value_t ExampleDeviceOperation::compute_outp const auto& input_tensor = tensor_args.input_tensor; return TensorSpec( input_tensor.get_logical_shape(), - TensorLayout(input_tensor.get_dtype(), PageConfig(input_tensor.get_layout()), MemoryConfig{})); + tt::tt_metal::TensorLayout( + input_tensor.get_dtype(), tt::tt_metal::PageConfig(input_tensor.get_layout()), MemoryConfig{})); } ExampleDeviceOperation::tensor_return_value_t ExampleDeviceOperation::create_output_tensors( diff --git a/ttnn/cpp/ttnn/operations/examples/example/device/example_device_operation.hpp b/ttnn/cpp/ttnn/operations/examples/example/device/example_device_operation.hpp index b1c46a57054..9536d83d0fa 100644 --- a/ttnn/cpp/ttnn/operations/examples/example/device/example_device_operation.hpp +++ b/ttnn/cpp/ttnn/operations/examples/example/device/example_device_operation.hpp @@ -66,8 +66,8 @@ struct ExampleDeviceOperation { struct SingleCore { // Shared variables are the variables that are shared between the create and override_runtime_arguments methods struct shared_variables_t { - KernelHandle unary_reader_kernel_id; - KernelHandle unary_writer_kernel_id; + tt::tt_metal::KernelHandle unary_reader_kernel_id; + tt::tt_metal::KernelHandle unary_writer_kernel_id; }; using cached_program_t = ttnn::device_operation::CachedProgram; @@ -86,8 +86,8 @@ struct ExampleDeviceOperation { struct MultiCore { // Shared variables are the variables that are 
shared between the create and override_runtime_arguments methods struct shared_variables_t { - KernelHandle unary_reader_kernel_id; - KernelHandle unary_writer_kernel_id; + tt::tt_metal::KernelHandle unary_reader_kernel_id; + tt::tt_metal::KernelHandle unary_writer_kernel_id; std::size_t num_cores; std::size_t num_cores_y; }; @@ -141,7 +141,7 @@ struct ExampleDeviceOperation { // In case the operation needs a custom create_op_performance_model, this method can be implemented /* - static operation::OpPerformanceModel create_op_performance_model( + static tt::tt_metal::operation::OpPerformanceModel create_op_performance_model( const operation_attributes_t&, const tensor_args_t&, tensor_return_value_t&); diff --git a/ttnn/cpp/ttnn/operations/examples/example_multiple_return/device/example_multiple_return_device_operation.cpp b/ttnn/cpp/ttnn/operations/examples/example_multiple_return/device/example_multiple_return_device_operation.cpp index 1045b9aa965..ae2ea337d09 100644 --- a/ttnn/cpp/ttnn/operations/examples/example_multiple_return/device/example_multiple_return_device_operation.cpp +++ b/ttnn/cpp/ttnn/operations/examples/example_multiple_return/device/example_multiple_return_device_operation.cpp @@ -30,7 +30,8 @@ ExampleMultipleReturnDeviceOperation::spec_return_value_t ExampleMultipleReturnD const auto& input_tensor = tensor_args.input_tensor; TensorSpec spec( input_tensor.get_logical_shape(), - TensorLayout(input_tensor.get_dtype(), PageConfig(input_tensor.get_layout()), MemoryConfig{})); + tt::tt_metal::TensorLayout( + input_tensor.get_dtype(), tt::tt_metal::PageConfig(input_tensor.get_layout()), MemoryConfig{})); spec_return_value_t result = {std::nullopt, std::nullopt}; if (operation_attributes.return_output1) { std::get<0>(result) = spec; diff --git a/ttnn/cpp/ttnn/operations/examples/example_multiple_return/device/example_multiple_return_device_operation.hpp
b/ttnn/cpp/ttnn/operations/examples/example_multiple_return/device/example_multiple_return_device_operation.hpp index 7ccd7c34310..23e1b57517d 100644 --- a/ttnn/cpp/ttnn/operations/examples/example_multiple_return/device/example_multiple_return_device_operation.hpp +++ b/ttnn/cpp/ttnn/operations/examples/example_multiple_return/device/example_multiple_return_device_operation.hpp @@ -68,8 +68,8 @@ struct ExampleMultipleReturnDeviceOperation { struct SingleCore { // Shared variables are the variables that are shared between the create and override_runtime_arguments methods struct shared_variables_t { - KernelHandle unary_reader_kernel_id; - KernelHandle unary_writer_kernel_id; + tt::tt_metal::KernelHandle unary_reader_kernel_id; + tt::tt_metal::KernelHandle unary_writer_kernel_id; }; using cached_program_t = ttnn::device_operation::CachedProgram; @@ -122,7 +122,7 @@ struct ExampleMultipleReturnDeviceOperation { // In case the operation needs a custom create_op_performance_model, this method can be implemented /* - static operation::OpPerformanceModel create_op_performance_model( + static tt::tt_metal::operation::OpPerformanceModel create_op_performance_model( const operation_attributes_t&, const tensor_args_t&, tensor_return_value_t&); diff --git a/ttnn/cpp/ttnn/operations/experimental/auto_format/auto_format.cpp b/ttnn/cpp/ttnn/operations/experimental/auto_format/auto_format.cpp index 0301fb8eef7..74a3f613a9c 100644 --- a/ttnn/cpp/ttnn/operations/experimental/auto_format/auto_format.cpp +++ b/ttnn/cpp/ttnn/operations/experimental/auto_format/auto_format.cpp @@ -16,8 +16,6 @@ #include "ttnn/operations/data_movement/untilize_with_unpadding/untilize_with_unpadding.hpp" #include "ttnn/tensor/tensor.hpp" -using namespace tt::tt_metal; - namespace ttnn::operations::experimental::auto_format { Tensor AutoFormat::move_tensor_to_device(const Tensor& input, IDevice* device, const MemoryConfig& mem_config) { @@ -69,7 +67,7 @@ Tensor
AutoFormat::format_input_tensor( return AutoFormat::move_tensor_to_device(input, device); } - MemoryConfig mem_config = operation::DEFAULT_OUTPUT_MEMORY_CONFIG; + MemoryConfig mem_config = tt::tt_metal::operation::DEFAULT_OUTPUT_MEMORY_CONFIG; if (target_mem_config.has_value()) { mem_config = target_mem_config.value(); } else if (input.storage_type() == StorageType::DEVICE) { @@ -157,7 +155,7 @@ Tensor AutoFormat::format_output_tensor( if (!unpad_output && !convert_layout) { return output; } - MemoryConfig mem_config = operation::DEFAULT_OUTPUT_MEMORY_CONFIG; + MemoryConfig mem_config = tt::tt_metal::operation::DEFAULT_OUTPUT_MEMORY_CONFIG; if (target_mem_config.has_value()) { mem_config = target_mem_config.value(); } else if (output.storage_type() == StorageType::DEVICE) { diff --git a/ttnn/cpp/ttnn/operations/experimental/ccl/all_gather_async/all_gather_async.cpp b/ttnn/cpp/ttnn/operations/experimental/ccl/all_gather_async/all_gather_async.cpp index 6b70ebe95bc..67dbaa9b9f5 100644 --- a/ttnn/cpp/ttnn/operations/experimental/ccl/all_gather_async/all_gather_async.cpp +++ b/ttnn/cpp/ttnn/operations/experimental/ccl/all_gather_async/all_gather_async.cpp @@ -17,7 +17,7 @@ ttnn::Tensor ExecuteAllGatherAsync::invoke( const uint32_t num_links, const std::optional& memory_config, const ttnn::ccl::Topology topology, - std::optional subdevice_id, + std::optional subdevice_id, bool enable_persistent_fabric_mode) { return ttnn::operations::experimental::ccl::all_gather_async( input_tensor, @@ -39,7 +39,7 @@ ttnn::Tensor ExecuteAllGatherAsync::invoke( const global_semaphore::MultiDeviceGlobalSemaphore& multi_device_global_semaphore, const std::optional& memory_config, const std::optional num_preferred_links, - std::optional subdevice_id, + std::optional subdevice_id, bool enable_persistent_fabric_mode) { return ttnn::operations::experimental::ccl::all_gather_async( input_tensor, diff --git a/ttnn/cpp/ttnn/operations/experimental/ccl/all_gather_async/all_gather_async.hpp 
b/ttnn/cpp/ttnn/operations/experimental/ccl/all_gather_async/all_gather_async.hpp index 350c5c60b17..b6da224de97 100644 --- a/ttnn/cpp/ttnn/operations/experimental/ccl/all_gather_async/all_gather_async.hpp +++ b/ttnn/cpp/ttnn/operations/experimental/ccl/all_gather_async/all_gather_async.hpp @@ -19,7 +19,7 @@ struct ExecuteAllGatherAsync { const uint32_t num_links = 1, const std::optional& memory_config = std::nullopt, const ttnn::ccl::Topology topology = ttnn::ccl::Topology::Ring, - std::optional subdevice_id = std::nullopt, + std::optional subdevice_id = std::nullopt, bool enable_persistent_fabric_mode = false); static ttnn::Tensor invoke( @@ -31,7 +31,7 @@ struct ExecuteAllGatherAsync { const global_semaphore::MultiDeviceGlobalSemaphore& multi_device_global_semaphore, const std::optional& memory_config = std::nullopt, const std::optional num_preferred_links = std::nullopt, - std::optional subdevice_id = std::nullopt, + std::optional subdevice_id = std::nullopt, bool enable_persistent_fabric_mode = false); }; diff --git a/ttnn/cpp/ttnn/operations/experimental/ccl/all_gather_async/all_gather_async_pybind.cpp b/ttnn/cpp/ttnn/operations/experimental/ccl/all_gather_async/all_gather_async_pybind.cpp index b6eb3fa8949..8e1ab8c48f7 100644 --- a/ttnn/cpp/ttnn/operations/experimental/ccl/all_gather_async/all_gather_async_pybind.cpp +++ b/ttnn/cpp/ttnn/operations/experimental/ccl/all_gather_async/all_gather_async_pybind.cpp @@ -33,7 +33,7 @@ void bind_all_gather_async(pybind11::module& module, const ccl_operation_t& oper const uint32_t num_links, const std::optional& memory_config, const ttnn::ccl::Topology topology, - std::optional subdevice_id, + std::optional subdevice_id, bool enable_persistent_fabric_mode) -> ttnn::Tensor { return self( input_tensor, @@ -65,7 +65,7 @@ void bind_all_gather_async(pybind11::module& module, const ccl_operation_t& oper const global_semaphore::MultiDeviceGlobalSemaphore& multi_device_global_semaphore, const std::optional num_preferred_links, 
const std::optional& memory_config, - std::optional subdevice_id, + std::optional subdevice_id, bool enable_persistent_fabric_mode) -> ttnn::Tensor { return self( input_tensor, diff --git a/ttnn/cpp/ttnn/operations/experimental/ccl/all_gather_async/device/all_gather_async_op.cpp b/ttnn/cpp/ttnn/operations/experimental/ccl/all_gather_async/device/all_gather_async_op.cpp index eea3800c374..cfd21f7150f 100644 --- a/ttnn/cpp/ttnn/operations/experimental/ccl/all_gather_async/device/all_gather_async_op.cpp +++ b/ttnn/cpp/ttnn/operations/experimental/ccl/all_gather_async/device/all_gather_async_op.cpp @@ -3,6 +3,7 @@ // SPDX-License-Identifier: Apache-2.0 #include "all_gather_async_op.hpp" +#include "ttnn/operations/functions.hpp" #include "ttnn/operations/math.hpp" #include "cpp/ttnn/global_semaphore.hpp" @@ -20,7 +21,7 @@ AllGatherAsync create_all_gather_async_struct( const std::vector& devices, const ttnn::ccl::Topology topology, const std::vector& semaphores, - std::optional sub_device_id, + std::optional sub_device_id, bool enable_persistent_fabric_mode) { uint32_t num_devices = devices.size(); @@ -178,7 +179,7 @@ AllGatherAsyncVersion AllGatherAsync::select_version(const Tensor& input_tensor) return AllGatherAsyncVersion::GENERIC; } -operation::ProgramWithCallbacks AllGatherAsync::create_program( +tt::tt_metal::operation::ProgramWithCallbacks AllGatherAsync::create_program( const std::vector& input_tensors, std::vector& output_tensors) const { tt::log_debug(tt::LogOp, "DEBUG: create_program is called"); @@ -243,7 +244,8 @@ operation::ProgramWithCallbacks AllGatherAsync::create_program( } } -const operation::Hash AllGatherAsync::compute_program_hash(const std::vector& input_tensors) const { +const tt::tt_metal::operation::Hash AllGatherAsync::compute_program_hash( + const std::vector& input_tensors) const { log_trace(tt::LogOp, "compute_program_hash is called"); AllGatherAsyncVersion version = select_version(input_tensors[0]); log_trace(tt::LogOp, "version: {}", 
static_cast(version)); @@ -254,7 +256,7 @@ const operation::Hash AllGatherAsync::compute_program_hash(const std::vectorsemaphore.address(); - return operation::hash_operation( + return tt::tt_metal::operation::hash_operation( this->dim, this->num_links, this->ring_size, @@ -267,7 +269,7 @@ const operation::Hash AllGatherAsync::compute_program_hash(const std::vector( + return tt::tt_metal::operation::hash_operation( this->dim, this->num_links, this->ring_size, @@ -291,7 +293,7 @@ Tensor all_gather_async( const uint32_t num_links, const std::optional& memory_config, const ttnn::ccl::Topology topology, - std::optional sub_device_id, + std::optional sub_device_id, bool enable_persistent_fabric_mode) { TT_FATAL( std::getenv("TT_METAL_SLOW_DISPATCH_MODE") == nullptr, @@ -304,7 +306,7 @@ Tensor all_gather_async( if (num_devices == 2) { ccl_topology = ttnn::ccl::Topology::Linear; } - std::vector output_tensors = {Tensor(operation::get_workers_for_op_output({input_tensor}))}; + std::vector output_tensors = {Tensor(tt::tt_metal::operation::get_workers_for_op_output({input_tensor}))}; tt::log_debug( tt::LogOp, "DEBUG: creating line_fabric with num devices: {}, num links: {}", devices.size(), num_links); @@ -316,7 +318,7 @@ Tensor all_gather_async( std::vector semaphores = multi_device_global_semaphore.global_semaphores; - operation::launch_op( + tt::tt_metal::operation::launch_op( [dim, num_links, num_devices, @@ -331,7 +333,7 @@ Tensor all_gather_async( const std::vector>& optional_output_tensors) mutable -> std::vector { const auto& input_tensor = input_tensors.at(0); - return operation::run( + return tt::tt_metal::operation::run( ttnn::ccl::all_gather_detail::create_all_gather_async_struct( input_tensor, dim, @@ -358,7 +360,7 @@ Tensor all_gather_async( const global_semaphore::MultiDeviceGlobalSemaphore& multi_device_global_semaphore, const std::optional& memory_config, const std::optional num_preferred_links, - std::optional sub_device_id, + std::optional sub_device_id, 
bool enable_persistent_fabric_mode) { TT_FATAL( topology == ttnn::ccl::Topology::Linear, @@ -378,12 +380,12 @@ Tensor all_gather_async( rank - 1, dim); - std::vector output_tensors = {Tensor(operation::get_workers_for_op_output({input_tensor}))}; + std::vector output_tensors = {Tensor(tt::tt_metal::operation::get_workers_for_op_output({input_tensor}))}; CoreCoord grid_size = devices[0]->compute_with_storage_grid_size(); auto core_grid = CoreRange({0, 0}, {grid_size.x - 1, grid_size.y - 1}); std::vector semaphores = multi_device_global_semaphore.global_semaphores; - operation::launch_op( + tt::tt_metal::operation::launch_op( [gather_dim, num_preferred_links, memory_config, @@ -408,7 +410,7 @@ Tensor all_gather_async( const auto& input_tensor = input_tensors.at(0); - return operation::run( + return tt::tt_metal::operation::run( ttnn::ccl::all_gather_detail::create_all_gather_async_struct( input_device_tensor, gather_dim, diff --git a/ttnn/cpp/ttnn/operations/experimental/ccl/all_gather_async/device/all_gather_async_op.hpp b/ttnn/cpp/ttnn/operations/experimental/ccl/all_gather_async/device/all_gather_async_op.hpp index d8193771f62..b947120f463 100644 --- a/ttnn/cpp/ttnn/operations/experimental/ccl/all_gather_async/device/all_gather_async_op.hpp +++ b/ttnn/cpp/ttnn/operations/experimental/ccl/all_gather_async/device/all_gather_async_op.hpp @@ -41,7 +41,7 @@ struct AllGatherAsync { const MemoryConfig output_mem_config; const ccl::Topology topology; const GlobalSemaphore semaphore; - std::optional sub_device_id; + std::optional sub_device_id; bool enable_persistent_fabric_mode; AllGatherAsync( @@ -54,7 +54,7 @@ struct AllGatherAsync { MemoryConfig output_mem_config, ccl::Topology topology, GlobalSemaphore semaphore, - std::optional& sub_device_id, + std::optional& sub_device_id, bool enable_persistent_fabric_mode) : forward_device(forward_device), backward_device(backward_device), @@ -86,9 +86,9 @@ struct AllGatherAsync { void validate(const std::vector& input_tensors) 
const; std::vector compute_output_specs(const std::vector& input_tensors) const; - operation::ProgramWithCallbacks create_program( + tt::tt_metal::operation::ProgramWithCallbacks create_program( const std::vector& input_tensors, std::vector& output_tensors) const; - const operation::Hash compute_program_hash(const std::vector& input_tensors) const; + const tt::tt_metal::operation::Hash compute_program_hash(const std::vector& input_tensors) const; AllGatherAsyncVersion select_version(const Tensor& input_tensor) const; }; @@ -103,15 +103,19 @@ AllGatherAsync create_all_gather_async_struct( const std::vector& devices, const ccl::Topology topology, const std::vector& semaphores, - std::optional sub_device_id, + std::optional sub_device_id, bool enable_persistent_fabric_mode); } // namespace all_gather_async_detail } // namespace ccl // All Gather Variants std::tuple> choose_worker_cores( - size_t num_links, size_t num_workers_per_link, bool persistent_fabric_mode, IDevice* device, const std::optional& sub_device_id); -operation::ProgramWithCallbacks all_gather_async_multi_core_with_workers( + size_t num_links, + size_t num_workers_per_link, + bool persistent_fabric_mode, + IDevice* device, + const std::optional& sub_device_id); +tt::tt_metal::operation::ProgramWithCallbacks all_gather_async_multi_core_with_workers( const Tensor& input_tensor, std::optional forward_device, std::optional backward_device, @@ -122,9 +126,9 @@ operation::ProgramWithCallbacks all_gather_async_multi_core_with_workers( const uint32_t ring_index, ccl::Topology topology, const GlobalSemaphore semaphore, - const std::optional& sub_device_id, + const std::optional& sub_device_id, bool enable_persistent_fabric_mode); -operation::ProgramWithCallbacks all_gather_async_minimal_interleaved_dim3_1_1_32_any( +tt::tt_metal::operation::ProgramWithCallbacks all_gather_async_minimal_interleaved_dim3_1_1_32_any( const Tensor& input_tensor, std::optional forward_device, std::optional backward_device, @@ -135,9 
+139,9 @@ operation::ProgramWithCallbacks all_gather_async_minimal_interleaved_dim3_1_1_32 const uint32_t ring_index, ccl::Topology topology, const GlobalSemaphore& semaphore, - const std::optional& sub_device_id, + const std::optional& sub_device_id, bool enable_persistent_fabric_mode); -operation::ProgramWithCallbacks all_gather_async_llama_post_binary_matmul( +tt::tt_metal::operation::ProgramWithCallbacks all_gather_async_llama_post_binary_matmul( const Tensor& input_tensor, std::optional forward_device, std::optional backward_device, @@ -148,7 +152,7 @@ operation::ProgramWithCallbacks all_gather_async_llama_post_binary_matmul( const uint32_t ring_index, ccl::Topology topology, const GlobalSemaphore& semaphore, - const std::optional& sub_device_id, + const std::optional& sub_device_id, bool enable_persistent_fabric_mode); namespace operations { @@ -162,7 +166,7 @@ Tensor all_gather_async( const uint32_t num_links = 1, const std::optional& memory_config = std::nullopt, const ttnn::ccl::Topology topology = ttnn::ccl::Topology::Ring, - std::optional sub_device_id = std::nullopt, + std::optional sub_device_id = std::nullopt, bool enable_persistent_fabric_mode = false); // TODO make reference Tensor all_gather_async( @@ -174,7 +178,7 @@ Tensor all_gather_async( const global_semaphore::MultiDeviceGlobalSemaphore& multi_device_global_semaphore, const std::optional& memory_config = std::nullopt, const std::optional num_preferred_links = std::nullopt, - std::optional sub_device_id = std::nullopt, + std::optional sub_device_id = std::nullopt, bool enable_persistent_fabric_mode = false); } // namespace ccl diff --git a/ttnn/cpp/ttnn/operations/experimental/ccl/all_gather_async/device/all_gather_async_program.cpp b/ttnn/cpp/ttnn/operations/experimental/ccl/all_gather_async/device/all_gather_async_program.cpp index dbcc0d5848d..2f368d5476f 100644 --- a/ttnn/cpp/ttnn/operations/experimental/ccl/all_gather_async/device/all_gather_async_program.cpp +++ 
b/ttnn/cpp/ttnn/operations/experimental/ccl/all_gather_async/device/all_gather_async_program.cpp @@ -27,6 +27,7 @@ #include #include #include + using namespace tt::constants; namespace ttnn { @@ -77,13 +78,13 @@ std::tuple> choose_worker_cores( size_t num_workers_per_link, bool persistent_fabric_mode, IDevice* device, - const std::optional& sub_device_id) { + const std::optional& sub_device_id) { std::tuple> result; CoreRangeSet sender_worker_core_range; if (persistent_fabric_mode) { const size_t num_workers_preferred = num_workers_per_link * num_links; const auto available_cores = device->worker_cores( - HalProgrammableCoreType::TENSIX, + tt::tt_metal::HalProgrammableCoreType::TENSIX, sub_device_id.has_value() ? *sub_device_id : device->get_sub_device_ids().at(0)); if (available_cores.num_cores() < num_workers_preferred) { log_warning( @@ -125,7 +126,7 @@ std::tuple> choose_worker_cores( // For ring all-gather, we can send sub-sections of input tensor in opposite directions // For linear all-gather though, we must ensure we send full tensors in BOTH directions // (in other words, disable the "bidirectional" send flag) -operation::ProgramWithCallbacks all_gather_async_multi_core_with_workers( +tt::tt_metal::operation::ProgramWithCallbacks all_gather_async_multi_core_with_workers( const Tensor& input_tensor, std::optional forward_device, std::optional backward_device, @@ -136,7 +137,7 @@ operation::ProgramWithCallbacks all_gather_async_multi_core_with_workers( const uint32_t ring_index, ccl::Topology topology, const GlobalSemaphore semaphore, - const std::optional& sub_device_id, + const std::optional& sub_device_id, bool enable_persistent_fabric_mode) { tt::tt_metal::Program program{}; const bool enable_async_output_tensor = false; @@ -202,7 +203,7 @@ operation::ProgramWithCallbacks all_gather_async_multi_core_with_workers( tt::tt_metal::CircularBufferConfig cb_src0_config = tt::tt_metal::CircularBufferConfig(cb_num_pages * l1_scratch_cb_page_size_bytes, 
{{src0_cb_index, df}}) .set_page_size(src0_cb_index, l1_scratch_cb_page_size_bytes); - CBHandle cb_src0_workers = CreateCircularBuffer(program, sender_worker_core_range, cb_src0_config); + tt::tt_metal::CBHandle cb_src0_workers = CreateCircularBuffer(program, sender_worker_core_range, cb_src0_config); // Create Tensor slicer // read the entire input tensor (partition size = 1, partition index = 0) @@ -224,7 +225,7 @@ operation::ProgramWithCallbacks all_gather_async_multi_core_with_workers( ); // KERNEL CREATION - KernelHandle worker_sender_reader_kernel_id = + tt::tt_metal::KernelHandle worker_sender_reader_kernel_id = ttnn::ccl::worker_detail::generate_multi_command_stream_kernel_ct_args( program, {src0_cb_index}, @@ -234,7 +235,7 @@ operation::ProgramWithCallbacks all_gather_async_multi_core_with_workers( 1, // num_command_streams device->id()); - KernelHandle worker_sender_writer_kernel_id = + tt::tt_metal::KernelHandle worker_sender_writer_kernel_id = ttnn::ccl::worker_detail::generate_multi_command_stream_kernel_ct_args( program, {src0_cb_index}, diff --git a/ttnn/cpp/ttnn/operations/experimental/ccl/all_gather_async/device/all_gather_async_program_minimal_variants.cpp b/ttnn/cpp/ttnn/operations/experimental/ccl/all_gather_async/device/all_gather_async_program_minimal_variants.cpp index ba8edc57bf6..e8191564e2a 100644 --- a/ttnn/cpp/ttnn/operations/experimental/ccl/all_gather_async/device/all_gather_async_program_minimal_variants.cpp +++ b/ttnn/cpp/ttnn/operations/experimental/ccl/all_gather_async/device/all_gather_async_program_minimal_variants.cpp @@ -27,6 +27,7 @@ #include #include #include + using namespace tt::constants; namespace ttnn { @@ -52,7 +53,7 @@ void append_fabric_connection_rt_args( } } -operation::ProgramWithCallbacks all_gather_async_minimal_interleaved_dim3_1_1_32_any( +tt::tt_metal::operation::ProgramWithCallbacks all_gather_async_minimal_interleaved_dim3_1_1_32_any( const Tensor& input_tensor, std::optional forward_device, std::optional 
backward_device, @@ -63,7 +64,7 @@ operation::ProgramWithCallbacks all_gather_async_minimal_interleaved_dim3_1_1_32 const uint32_t ring_index, ccl::Topology topology, const GlobalSemaphore& semaphore, - const std::optional& sub_device_id, + const std::optional& sub_device_id, bool enable_persistent_fabric_mode) { tt::tt_metal::Program program{}; const bool enable_async_output_tensor = false; @@ -115,7 +116,7 @@ operation::ProgramWithCallbacks all_gather_async_minimal_interleaved_dim3_1_1_32 tt::tt_metal::CircularBufferConfig cb_src0_config = tt::tt_metal::CircularBufferConfig(cb_num_pages * l1_scratch_cb_page_size_bytes, {{src0_cb_index, df}}) .set_page_size(src0_cb_index, l1_scratch_cb_page_size_bytes); - CBHandle cb_src0_workers = CreateCircularBuffer(program, sender_worker_core_range, cb_src0_config); + tt::tt_metal::CBHandle cb_src0_workers = CreateCircularBuffer(program, sender_worker_core_range, cb_src0_config); // Set aside a buffer we can use for storing packet headers in (particularly for atomic incs) const auto reserved_packet_header_CB_index = tt::CB::c_in1; static constexpr auto num_packet_headers_storable = 8; @@ -275,7 +276,7 @@ operation::ProgramWithCallbacks all_gather_async_minimal_interleaved_dim3_1_1_32 return {.program = std::move(program), .override_runtime_arguments_callback = override_runtime_arguments_callback}; } -operation::ProgramWithCallbacks all_gather_async_llama_post_binary_matmul( +tt::tt_metal::operation::ProgramWithCallbacks all_gather_async_llama_post_binary_matmul( const Tensor& input_tensor, std::optional forward_device, std::optional backward_device, @@ -286,7 +287,7 @@ operation::ProgramWithCallbacks all_gather_async_llama_post_binary_matmul( const uint32_t ring_index, ccl::Topology topology, const GlobalSemaphore& semaphore, - const std::optional& sub_device_id, + const std::optional& sub_device_id, bool enable_persistent_fabric_mode) { tt::tt_metal::Program program{}; const bool enable_async_output_tensor = false; @@ -357,7 
+358,7 @@ operation::ProgramWithCallbacks all_gather_async_llama_post_binary_matmul( tt::tt_metal::CircularBufferConfig cb_src0_config = tt::tt_metal::CircularBufferConfig(cb_num_pages * l1_scratch_cb_page_size_bytes, {{src0_cb_index, df}}) .set_page_size(src0_cb_index, l1_scratch_cb_page_size_bytes); - CBHandle cb_src0_workers = CreateCircularBuffer(program, sender_worker_core_range, cb_src0_config); + tt::tt_metal::CBHandle cb_src0_workers = CreateCircularBuffer(program, sender_worker_core_range, cb_src0_config); // Set aside a buffer we can use for storing packet headers in (particularly for atomic incs) const auto reserved_packet_header_CB_index = tt::CB::c_in1; static constexpr auto num_packet_headers_storable = 8; diff --git a/ttnn/cpp/ttnn/operations/experimental/ccl/all_gather_matmul/device/all_gather_matmul_op.cpp b/ttnn/cpp/ttnn/operations/experimental/ccl/all_gather_matmul/device/all_gather_matmul_op.cpp index 15bd0227fba..25ab8d59f6d 100644 --- a/ttnn/cpp/ttnn/operations/experimental/ccl/all_gather_matmul/device/all_gather_matmul_op.cpp +++ b/ttnn/cpp/ttnn/operations/experimental/ccl/all_gather_matmul/device/all_gather_matmul_op.cpp @@ -93,7 +93,7 @@ std::vector AllGatherMatmul::create_output_tensors(const std::vector& input_tensors, const std::vector>& optional_input_tensors, std::vector& output_tensors) const { @@ -153,12 +153,12 @@ std::vector all_gather_matmul( auto devices = input_tensor.get_workers(); std::vector output_tensors = { - ttnn::Tensor(operation::get_workers_for_op_output({input_tensor, weight_tensor})), - ttnn::Tensor(operation::get_workers_for_op_output({input_tensor, weight_tensor})), - ttnn::Tensor(operation::get_workers_for_op_output({input_tensor, weight_tensor}))}; + ttnn::Tensor(tt::tt_metal::operation::get_workers_for_op_output({input_tensor, weight_tensor})), + ttnn::Tensor(tt::tt_metal::operation::get_workers_for_op_output({input_tensor, weight_tensor})), + 
ttnn::Tensor(tt::tt_metal::operation::get_workers_for_op_output({input_tensor, weight_tensor}))}; std::vector> optional_input_tensors = {std::nullopt}; - operation::launch_op( + tt::tt_metal::operation::launch_op( [dim, all_gather_core_grid_offset, num_links, @@ -222,7 +222,7 @@ std::vector all_gather_matmul( /*output_tile=*/std::nullopt, /*global_cb=*/std::nullopt}); - return operation::run( + return tt::tt_metal::operation::run( ttnn::experimental::AllGatherMatmul{/* All Gather Params */ all_gather_struct, /* Matmul params */ diff --git a/ttnn/cpp/ttnn/operations/experimental/ccl/all_gather_matmul/device/all_gather_matmul_op.hpp b/ttnn/cpp/ttnn/operations/experimental/ccl/all_gather_matmul/device/all_gather_matmul_op.hpp index 0b0187669cd..43cec6c451b 100644 --- a/ttnn/cpp/ttnn/operations/experimental/ccl/all_gather_matmul/device/all_gather_matmul_op.hpp +++ b/ttnn/cpp/ttnn/operations/experimental/ccl/all_gather_matmul/device/all_gather_matmul_op.hpp @@ -44,7 +44,7 @@ struct AllGatherMatmul { const std::vector>& optional_output_tensors = {std::nullopt}) const; std::vector compute_output_specs(const std::vector& input_tensors) const; std::vector create_output_tensors(const std::vector& input_tensors) const; - operation::ProgramWithCallbacks create_program( + tt::tt_metal::operation::ProgramWithCallbacks create_program( const std::vector& input_tensors, const std::vector>& optional_input_tensors, std::vector& output_tensors) const; @@ -54,7 +54,7 @@ struct AllGatherMatmul { } }; -operation::ProgramWithCallbacks all_gather_matmul_multi_core_with_workers( +tt::tt_metal::operation::ProgramWithCallbacks all_gather_matmul_multi_core_with_workers( /* General Params */ const Tensor& input_tensor, diff --git a/ttnn/cpp/ttnn/operations/experimental/ccl/all_gather_matmul/device/multi_core/all_gather_matmul_op_multi_core.cpp b/ttnn/cpp/ttnn/operations/experimental/ccl/all_gather_matmul/device/multi_core/all_gather_matmul_op_multi_core.cpp index d87880486a4..f8e15151749 100644 
--- a/ttnn/cpp/ttnn/operations/experimental/ccl/all_gather_matmul/device/multi_core/all_gather_matmul_op_multi_core.cpp +++ b/ttnn/cpp/ttnn/operations/experimental/ccl/all_gather_matmul/device/multi_core/all_gather_matmul_op_multi_core.cpp @@ -24,6 +24,7 @@ #include "ttnn/operations/matmul/device/matmul_op.hpp" using namespace tt::constants; +using namespace tt::tt_metal; namespace ttnn { @@ -322,21 +323,22 @@ operation::ProgramWithCallbacks experimental::all_gather_matmul_multi_core_with_ } // All Gather - operation::ProgramWithCallbacks program_with_callbacks = ttnn::all_gather_multi_core_with_workers_helper( - matmul_program_with_callbacks->program, - input_tensor, - all_gather_output_tensor, - dim, - num_links, - ring_size, - ring_index, - receiver_device_id, - sender_device_id, - topology, - user_defined_num_workers, - user_defined_num_buffers_per_channel, - all_gather_fused_op_signaler, - core_grid_offset); + tt::tt_metal::operation::ProgramWithCallbacks program_with_callbacks = + ttnn::all_gather_multi_core_with_workers_helper( + matmul_program_with_callbacks->program, + input_tensor, + all_gather_output_tensor, + dim, + num_links, + ring_size, + ring_index, + receiver_device_id, + sender_device_id, + topology, + user_defined_num_workers, + user_defined_num_buffers_per_channel, + all_gather_fused_op_signaler, + core_grid_offset); const auto all_gather_override_runtime_arguments_callback = program_with_callbacks.override_runtime_arguments_callback; diff --git a/ttnn/cpp/ttnn/operations/experimental/ccl/all_reduce/device/all_reduce_op.cpp b/ttnn/cpp/ttnn/operations/experimental/ccl/all_reduce/device/all_reduce_op.cpp index 14ed4c2a122..204aeb0b314 100644 --- a/ttnn/cpp/ttnn/operations/experimental/ccl/all_reduce/device/all_reduce_op.cpp +++ b/ttnn/cpp/ttnn/operations/experimental/ccl/all_reduce/device/all_reduce_op.cpp @@ -22,11 +22,13 @@ std::vector AllReduce::compute_output_specs(const std::vector< const auto& input_tensor = input_tensors.at(0); auto shape = 
input_tensor.get_logical_shape(); TensorSpec spec( - shape, TensorLayout(input_tensor.get_dtype(), PageConfig(input_tensor.get_layout()), output_mem_config)); + shape, + tt::tt_metal::TensorLayout( + input_tensor.get_dtype(), tt::tt_metal::PageConfig(input_tensor.get_layout()), output_mem_config)); return std::vector(input_tensors.size(), spec); } -operation::ProgramWithCallbacks AllReduce::create_program( +tt::tt_metal::operation::ProgramWithCallbacks AllReduce::create_program( const std::vector& input_tensors, std::vector& output_tensors) const { return ccl::reduce_scatter_detail::reduce_scatter_with_workers( input_tensors.at(0), @@ -140,7 +142,7 @@ static Tensor all_gather_local_reduce( std::vector new_shape{1, merged_dim_size, shape[rank - 2], shape[rank - 1]}; auto reshaped_tensor = ttnn::reshape(input_tensor, new_shape); - const auto& gathered_tensor = operation::run( + const auto& gathered_tensor = tt::tt_metal::operation::run( ttnn::ccl::all_gather_detail::create_all_gather_struct( reshaped_tensor, 0, @@ -176,7 +178,7 @@ static Tensor reduce_scatter_all_gather( } } - const auto& reduced_tensor = operation::run( + const auto& reduced_tensor = tt::tt_metal::operation::run( ttnn::ccl::reduce_scatter_detail::create_reduce_scatter_struct( input_tensor, binary_op_type, @@ -189,7 +191,7 @@ static Tensor reduce_scatter_all_gather( topology), {input_tensor}); - const auto& gathered_tensor = operation::run( + const auto& gathered_tensor = tt::tt_metal::operation::run( ttnn::ccl::all_gather_detail::create_all_gather_struct( reduced_tensor.at(0), all_reduce_dim, @@ -264,8 +266,8 @@ Tensor all_reduce( uint32_t num_devices = devices.size(); TT_FATAL(num_devices > 1, "all_reduce op will only work for num_devices > 1, but has {}", num_devices); - std::vector output_tensors = {Tensor(operation::get_workers_for_op_output({input_tensor}))}; - operation::launch_op( + std::vector output_tensors = {Tensor(tt::tt_metal::operation::get_workers_for_op_output({input_tensor}))}; + 
tt::tt_metal::operation::launch_op( [binary_op_type, num_links, num_devices, diff --git a/ttnn/cpp/ttnn/operations/experimental/ccl/all_reduce/device/all_reduce_op.hpp b/ttnn/cpp/ttnn/operations/experimental/ccl/all_reduce/device/all_reduce_op.hpp index 42a7f5429c6..e6bfb50aa59 100644 --- a/ttnn/cpp/ttnn/operations/experimental/ccl/all_reduce/device/all_reduce_op.hpp +++ b/ttnn/cpp/ttnn/operations/experimental/ccl/all_reduce/device/all_reduce_op.hpp @@ -32,7 +32,7 @@ struct AllReduce { void validate(const std::vector& input_tensors) const; std::vector compute_output_specs(const std::vector& input_tensors) const; - operation::ProgramWithCallbacks create_program( + tt::tt_metal::operation::ProgramWithCallbacks create_program( const std::vector& input_tensors, std::vector& output_tensors) const; }; @@ -43,7 +43,7 @@ Tensor all_reduce( const Tensor& input_tensor, ttnn::operations::reduction::ReduceType reduce_op = ttnn::operations::reduction::ReduceType::Sum, const uint32_t num_links = 1, - const MemoryConfig& output_mem_config = operation::DEFAULT_OUTPUT_MEMORY_CONFIG, + const MemoryConfig& output_mem_config = tt::tt_metal::operation::DEFAULT_OUTPUT_MEMORY_CONFIG, ttnn::ccl::Topology topology = ttnn::ccl::Topology::Ring, const std::optional user_defined_num_workers = std::nullopt, const std::optional user_defined_num_buffers_per_channel = std::nullopt); diff --git a/ttnn/cpp/ttnn/operations/experimental/ccl/all_reduce_async/all_reduce_async.cpp b/ttnn/cpp/ttnn/operations/experimental/ccl/all_reduce_async/all_reduce_async.cpp index 1a94281724e..fabb376d55c 100644 --- a/ttnn/cpp/ttnn/operations/experimental/ccl/all_reduce_async/all_reduce_async.cpp +++ b/ttnn/cpp/ttnn/operations/experimental/ccl/all_reduce_async/all_reduce_async.cpp @@ -40,7 +40,7 @@ ttnn::Tensor ExecuteAllReduceAsync::invoke( const std::optional& memory_config, ttnn::ccl::Topology topology, const std::optional num_preferred_links, - std::optional worker_subdevice_id_opt) { + std::optional 
worker_subdevice_id_opt) { MemoryConfig out_memory_config = memory_config.value_or(input_tensor.memory_config()); uint32_t dim = find_scatter_dim(input_tensor.get_padded_shape(), input_tensor.get_workers().size()); ttnn::Tensor scattered_tensor = ttnn::operations::experimental::ccl::reduce_scatter( @@ -75,7 +75,7 @@ ttnn::Tensor ExecuteAllReduceAsync::invoke( const std::optional& memory_config, ttnn::ccl::Topology topology, const std::optional num_preferred_links, - std::optional worker_subdevice_id_opt) { + std::optional worker_subdevice_id_opt) { MemoryConfig out_memory_config = memory_config.value_or(input_tensor.memory_config()); const auto mesh_view = mesh_device.get_view(); std::vector devices = diff --git a/ttnn/cpp/ttnn/operations/experimental/ccl/all_reduce_async/all_reduce_async.hpp b/ttnn/cpp/ttnn/operations/experimental/ccl/all_reduce_async/all_reduce_async.hpp index b0b80451b8e..05efd2ba14e 100644 --- a/ttnn/cpp/ttnn/operations/experimental/ccl/all_reduce_async/all_reduce_async.hpp +++ b/ttnn/cpp/ttnn/operations/experimental/ccl/all_reduce_async/all_reduce_async.hpp @@ -27,7 +27,7 @@ struct ExecuteAllReduceAsync { const std::optional& memory_config = std::nullopt, ttnn::ccl::Topology topology = ttnn::ccl::Topology::Linear, const std::optional num_links = std::nullopt, - std::optional worker_subdevice_id_opt = std::nullopt); + std::optional worker_subdevice_id_opt = std::nullopt); static ttnn::Tensor invoke( const ttnn::Tensor& input_tensor, @@ -40,7 +40,7 @@ struct ExecuteAllReduceAsync { const std::optional& memory_config, ttnn::ccl::Topology topology, const std::optional num_preferred_links, - std::optional worker_subdevice_id_opt); + std::optional worker_subdevice_id_opt); }; } // namespace ccl diff --git a/ttnn/cpp/ttnn/operations/experimental/ccl/all_reduce_async/all_reduce_async_pybind.cpp b/ttnn/cpp/ttnn/operations/experimental/ccl/all_reduce_async/all_reduce_async_pybind.cpp index ec91d88a2be..2e6e0abd87a 100644 --- 
a/ttnn/cpp/ttnn/operations/experimental/ccl/all_reduce_async/all_reduce_async_pybind.cpp +++ b/ttnn/cpp/ttnn/operations/experimental/ccl/all_reduce_async/all_reduce_async_pybind.cpp @@ -34,7 +34,7 @@ void bind_all_reduce(pybind11::module& module, const ccl_operation_t& operation, const ttnn::MemoryConfig& memory_config, ttnn::ccl::Topology topology, const std::optional num_links, - std::optional worker_subdevice_id_opt) -> ttnn::Tensor { + std::optional worker_subdevice_id_opt) -> ttnn::Tensor { return self( input_tensor, from_remote_multi_device_global_semaphore, @@ -69,7 +69,7 @@ void bind_all_reduce(pybind11::module& module, const ccl_operation_t& operation, const ttnn::MemoryConfig& memory_config, ttnn::ccl::Topology topology, const std::optional num_links, - std::optional worker_subdevice_id_opt) -> ttnn::Tensor { + std::optional worker_subdevice_id_opt) -> ttnn::Tensor { return self( input_tensor, cluster_axis, diff --git a/ttnn/cpp/ttnn/operations/experimental/ccl/reduce_scatter_async/device/reduce_scatter_async_op.cpp b/ttnn/cpp/ttnn/operations/experimental/ccl/reduce_scatter_async/device/reduce_scatter_async_op.cpp index eeb67c0f502..b655c5e8504 100644 --- a/ttnn/cpp/ttnn/operations/experimental/ccl/reduce_scatter_async/device/reduce_scatter_async_op.cpp +++ b/ttnn/cpp/ttnn/operations/experimental/ccl/reduce_scatter_async/device/reduce_scatter_async_op.cpp @@ -11,6 +11,8 @@ #include #include +using namespace tt::tt_metal; + namespace ttnn { namespace ccl { namespace reduce_scatter_detail { diff --git a/ttnn/cpp/ttnn/operations/experimental/ccl/reduce_scatter_async/device/reduce_scatter_async_op.hpp b/ttnn/cpp/ttnn/operations/experimental/ccl/reduce_scatter_async/device/reduce_scatter_async_op.hpp index c6256e6d734..1911a6a1160 100644 --- a/ttnn/cpp/ttnn/operations/experimental/ccl/reduce_scatter_async/device/reduce_scatter_async_op.hpp +++ b/ttnn/cpp/ttnn/operations/experimental/ccl/reduce_scatter_async/device/reduce_scatter_async_op.hpp @@ -26,7 +26,7 @@ 
struct ReduceScatterAsync { std::optional num_links_preferred, const GlobalSemaphore& from_remote_sem, const GlobalSemaphore& to_remote_sem, - std::optional& sub_device_id, + std::optional& sub_device_id, std::optional& fabric_handle) : binary_op_type(binary_op_type), scatter_dim(scatter_dim), @@ -59,7 +59,7 @@ struct ReduceScatterAsync { const GlobalSemaphore from_remote_sem; const GlobalSemaphore to_remote_sem; std::optional& fabric_handle; - std::optional sub_device_id; + std::optional sub_device_id; auto attributes() const { using tt::stl::reflection::Attribute; @@ -80,14 +80,14 @@ struct ReduceScatterAsync { void validate(const std::vector& input_tensors) const; std::vector compute_output_specs(const std::vector& input_tensors) const; - operation::ProgramWithCallbacks create_program( + tt::tt_metal::operation::ProgramWithCallbacks create_program( const std::vector& input_tensors, std::vector& output_tensors) const; - operation::Hash compute_program_hash(const std::vector& input_tensors) const; + tt::tt_metal::operation::Hash compute_program_hash(const std::vector& input_tensors) const; }; namespace ccl { namespace reduce_scatter_detail { -operation::ProgramWithCallbacks build_reduce_scatter_async_program( +tt::tt_metal::operation::ProgramWithCallbacks build_reduce_scatter_async_program( const Tensor& input_tensor, Tensor& local_output_tensor, Tensor& input_tensor_from_remote_forward_direction, @@ -106,7 +106,7 @@ operation::ProgramWithCallbacks build_reduce_scatter_async_program( std::optional num_links_preferred, const GlobalSemaphore& from_remote_sem, const GlobalSemaphore& to_remote_sem, - const std::optional& sub_device_id, + const std::optional& sub_device_id, std::optional& fabric_handle); } }; // namespace ccl @@ -125,7 +125,7 @@ ReduceScatterAsync create_reduce_scatter_struct( std::optional num_links_preferred, const std::vector& from_remote_sems, const std::vector& to_remote_sems, - std::optional sub_device_id, + std::optional sub_device_id, 
std::optional& fabric_handle); } // namespace reduce_scatter_detail } // namespace ccl @@ -139,10 +139,10 @@ Tensor reduce_scatter( const global_semaphore::MultiDeviceGlobalSemaphore& from_remote_multi_device_global_semaphore, const global_semaphore::MultiDeviceGlobalSemaphore& to_remote_multi_device_global_semaphore, ttnn::operations::reduction::ReduceType reduce_op = ttnn::operations::reduction::ReduceType::Sum, - const MemoryConfig& output_mem_config = operation::DEFAULT_OUTPUT_MEMORY_CONFIG, + const MemoryConfig& output_mem_config = tt::tt_metal::operation::DEFAULT_OUTPUT_MEMORY_CONFIG, ttnn::ccl::Topology topology = ttnn::ccl::Topology::Linear, const std::optional num_preferred_links = std::nullopt, - std::optional worker_subdevice_id_opt = std::nullopt, // TODO make reference + std::optional worker_subdevice_id_opt = std::nullopt, // TODO make reference std::optional fabric_handle = std::nullopt); // TODO make reference Tensor reduce_scatter( const Tensor& input_tensor, @@ -152,10 +152,10 @@ Tensor reduce_scatter( const global_semaphore::MultiDeviceGlobalSemaphore& from_remote_multi_device_global_semaphore, const global_semaphore::MultiDeviceGlobalSemaphore& to_remote_multi_device_global_semaphore, ttnn::operations::reduction::ReduceType reduce_op = ttnn::operations::reduction::ReduceType::Sum, - const MemoryConfig& output_mem_config = operation::DEFAULT_OUTPUT_MEMORY_CONFIG, + const MemoryConfig& output_mem_config = tt::tt_metal::operation::DEFAULT_OUTPUT_MEMORY_CONFIG, ttnn::ccl::Topology topology = ttnn::ccl::Topology::Linear, const std::optional num_preferred_links = std::nullopt, - std::optional worker_subdevice_id_opt = std::nullopt, // TODO make reference + std::optional worker_subdevice_id_opt = std::nullopt, // TODO make reference std::optional fabric_handle = std::nullopt); // TODO make reference } // namespace ccl diff --git a/ttnn/cpp/ttnn/operations/experimental/ccl/reduce_scatter_async/device/reduce_scatter_async_program.cpp 
b/ttnn/cpp/ttnn/operations/experimental/ccl/reduce_scatter_async/device/reduce_scatter_async_program.cpp index 11447364c4f..563f868abfe 100644 --- a/ttnn/cpp/ttnn/operations/experimental/ccl/reduce_scatter_async/device/reduce_scatter_async_program.cpp +++ b/ttnn/cpp/ttnn/operations/experimental/ccl/reduce_scatter_async/device/reduce_scatter_async_program.cpp @@ -87,6 +87,8 @@ * */ +using namespace tt::tt_metal; + namespace ttnn::ccl::reduce_scatter_detail { using ttnn::ccl::Shape4D; diff --git a/ttnn/cpp/ttnn/operations/experimental/ccl/reduce_scatter_async/reduce_scatter.cpp b/ttnn/cpp/ttnn/operations/experimental/ccl/reduce_scatter_async/reduce_scatter.cpp index ac044afafd7..e9f66e1645b 100644 --- a/ttnn/cpp/ttnn/operations/experimental/ccl/reduce_scatter_async/reduce_scatter.cpp +++ b/ttnn/cpp/ttnn/operations/experimental/ccl/reduce_scatter_async/reduce_scatter.cpp @@ -18,7 +18,7 @@ ttnn::Tensor ExecuteReduceScatter::invoke( const std::optional& memory_config, ttnn::ccl::Topology topology, const std::optional num_preferred_links, - std::optional worker_subdevice_id_opt) { + std::optional worker_subdevice_id_opt) { MemoryConfig out_memory_config = memory_config.value_or(input_tensor.memory_config()); return ttnn::operations::experimental::ccl::reduce_scatter( input_tensor, @@ -43,7 +43,7 @@ ttnn::Tensor ExecuteReduceScatter::invoke( const std::optional& memory_config, ttnn::ccl::Topology topology, const std::optional num_preferred_links, - std::optional worker_subdevice_id_opt) { + std::optional worker_subdevice_id_opt) { MemoryConfig out_memory_config = memory_config.value_or(input_tensor.memory_config()); return ttnn::operations::experimental::ccl::reduce_scatter( input_tensor, diff --git a/ttnn/cpp/ttnn/operations/experimental/ccl/reduce_scatter_async/reduce_scatter.hpp b/ttnn/cpp/ttnn/operations/experimental/ccl/reduce_scatter_async/reduce_scatter.hpp index 70989cbebfc..42de61e83cc 100644 --- 
a/ttnn/cpp/ttnn/operations/experimental/ccl/reduce_scatter_async/reduce_scatter.hpp +++ b/ttnn/cpp/ttnn/operations/experimental/ccl/reduce_scatter_async/reduce_scatter.hpp @@ -27,7 +27,7 @@ struct ExecuteReduceScatter { const std::optional& memory_config = std::nullopt, ttnn::ccl::Topology topology = ttnn::ccl::Topology::Linear, const std::optional num_links = std::nullopt, - std::optional worker_subdevice_id_opt = std::nullopt); + std::optional worker_subdevice_id_opt = std::nullopt); static ttnn::Tensor invoke( const ttnn::Tensor& input_tensor, @@ -40,7 +40,7 @@ struct ExecuteReduceScatter { const std::optional& memory_config, ttnn::ccl::Topology topology, const std::optional num_preferred_links, - std::optional worker_subdevice_id_opt); + std::optional worker_subdevice_id_opt); }; } // namespace ccl diff --git a/ttnn/cpp/ttnn/operations/experimental/ccl/reduce_scatter_async/reduce_scatter_pybind.cpp b/ttnn/cpp/ttnn/operations/experimental/ccl/reduce_scatter_async/reduce_scatter_pybind.cpp index c3d88c4d4e3..8ea38fab03b 100644 --- a/ttnn/cpp/ttnn/operations/experimental/ccl/reduce_scatter_async/reduce_scatter_pybind.cpp +++ b/ttnn/cpp/ttnn/operations/experimental/ccl/reduce_scatter_async/reduce_scatter_pybind.cpp @@ -34,7 +34,7 @@ void bind_reduce_scatter(pybind11::module& module, const ccl_operation_t& operat const ttnn::MemoryConfig& memory_config, ttnn::ccl::Topology topology, const std::optional num_links, - std::optional worker_subdevice_id_opt) -> ttnn::Tensor { + std::optional worker_subdevice_id_opt) -> ttnn::Tensor { return self( input_tensor, dim, @@ -69,7 +69,7 @@ void bind_reduce_scatter(pybind11::module& module, const ccl_operation_t& operat const ttnn::MemoryConfig& memory_config, ttnn::ccl::Topology topology, const std::optional num_links, - std::optional worker_subdevice_id_opt) -> ttnn::Tensor { + std::optional worker_subdevice_id_opt) -> ttnn::Tensor { return self( input_tensor, dim, diff --git 
a/ttnn/cpp/ttnn/operations/experimental/cnn/convert_to_chw/convert_to_chw.cpp b/ttnn/cpp/ttnn/operations/experimental/cnn/convert_to_chw/convert_to_chw.cpp index df87c6d4368..6580a6ae1b6 100644 --- a/ttnn/cpp/ttnn/operations/experimental/cnn/convert_to_chw/convert_to_chw.cpp +++ b/ttnn/cpp/ttnn/operations/experimental/cnn/convert_to_chw/convert_to_chw.cpp @@ -15,7 +15,7 @@ ttnn::Tensor ExecuteConvertToCHW::invoke( const std::optional& memory_config, const std::optional& dtype) { auto program = ConvertToCHW{memory_config.value_or(a.memory_config()), dtype.value_or(a.dtype())}; - return operation::run(program, {a}, {}, {}, queue_id).at(0); + return tt::tt_metal::operation::run(program, {a}, {}, {}, queue_id).at(0); } } // namespace ttnn::operations::experimental::cnn diff --git a/ttnn/cpp/ttnn/operations/experimental/cnn/convert_to_chw/device/convert_to_chw_op.cpp b/ttnn/cpp/ttnn/operations/experimental/cnn/convert_to_chw/device/convert_to_chw_op.cpp index 06dd09cd294..465d1eb5eb5 100644 --- a/ttnn/cpp/ttnn/operations/experimental/cnn/convert_to_chw/device/convert_to_chw_op.cpp +++ b/ttnn/cpp/ttnn/operations/experimental/cnn/convert_to_chw/device/convert_to_chw_op.cpp @@ -30,7 +30,8 @@ void ConvertToCHW::validate(const std::vector& input_tensors) const { input_shard_spec.shape[0] % TILE_HEIGHT == 0, "Shard height must be divisible by tile size"); // input shards can be padded so HW may not match shard height TT_FATAL( - this->memory_config.is_sharded() && this->memory_config.memory_layout == TensorMemoryLayout::WIDTH_SHARDED, + this->memory_config.is_sharded() && + this->memory_config.memory_layout == tt::tt_metal::TensorMemoryLayout::WIDTH_SHARDED, "Output tensor must be width sharded"); } @@ -39,10 +40,12 @@ std::vector ConvertToCHW::compute_output_specs(const std::vect const auto B = shape[0]; const auto HW = shape[2]; const auto C = shape[3]; - return {TensorSpec(Shape({B, 1, C, HW}), TensorLayout(dtype, PageConfig(Layout::ROW_MAJOR), memory_config))}; + return 
{TensorSpec( + Shape({B, 1, C, HW}), + tt::tt_metal::TensorLayout(dtype, tt::tt_metal::PageConfig(tt::tt_metal::Layout::ROW_MAJOR), memory_config))}; } -operation::ProgramWithCallbacks ConvertToCHW::create_program( +tt::tt_metal::operation::ProgramWithCallbacks ConvertToCHW::create_program( const std::vector& input_tensors, std::vector& output_tensors) const { const auto& a = input_tensors.at(0); auto& output = output_tensors.at(0); diff --git a/ttnn/cpp/ttnn/operations/experimental/cnn/convert_to_chw/device/convert_to_chw_op.hpp b/ttnn/cpp/ttnn/operations/experimental/cnn/convert_to_chw/device/convert_to_chw_op.hpp index 6ae7164a5e6..a50f0b24b85 100644 --- a/ttnn/cpp/ttnn/operations/experimental/cnn/convert_to_chw/device/convert_to_chw_op.hpp +++ b/ttnn/cpp/ttnn/operations/experimental/cnn/convert_to_chw/device/convert_to_chw_op.hpp @@ -10,12 +10,12 @@ namespace ttnn::operations::experimental::cnn { struct ConvertToCHW { - MemoryConfig memory_config; - DataType dtype; + tt::tt_metal::MemoryConfig memory_config; + tt::tt_metal::DataType dtype; void validate(const std::vector& input_tensors) const; std::vector compute_output_specs(const std::vector& input_tensors) const; - operation::ProgramWithCallbacks create_program( + tt::tt_metal::operation::ProgramWithCallbacks create_program( const std::vector& input_tensors, std::vector& output_tensors) const; }; diff --git a/ttnn/cpp/ttnn/operations/experimental/cnn/convert_to_chw/device/convert_to_chw_program_factory.cpp b/ttnn/cpp/ttnn/operations/experimental/cnn/convert_to_chw/device/convert_to_chw_program_factory.cpp index 668c1c26d5f..5d5d4875c8c 100644 --- a/ttnn/cpp/ttnn/operations/experimental/cnn/convert_to_chw/device/convert_to_chw_program_factory.cpp +++ b/ttnn/cpp/ttnn/operations/experimental/cnn/convert_to_chw/device/convert_to_chw_program_factory.cpp @@ -8,7 +8,7 @@ namespace ttnn::operations::experimental::cnn::detail { using namespace tt::constants; -operation::ProgramWithCallbacks multi_core_convert_to_chw( 
+tt::tt_metal::operation::ProgramWithCallbacks multi_core_convert_to_chw( const Tensor& a, Tensor& output, CoreCoord compute_with_storage_grid_size) { tt::tt_metal::Program program = tt::tt_metal::CreateProgram(); @@ -38,14 +38,14 @@ operation::ProgramWithCallbacks multi_core_convert_to_chw( uint32_t total_size, uint32_t page_size, const tt::DataFormat& format, - Buffer* buffer = nullptr) -> tt::tt_metal::CBHandle { + tt::tt_metal::Buffer* buffer = nullptr) -> tt::tt_metal::CBHandle { tt::log_debug( tt::LogType::LogOp, "Creating CB at index {} with total size {} B and page size {} B", index, total_size, page_size); - auto config = CircularBufferConfig(total_size, {{index, format}}).set_page_size(index, page_size); + auto config = tt::tt_metal::CircularBufferConfig(total_size, {{index, format}}).set_page_size(index, page_size); if (buffer != nullptr) { config = config.set_globally_allocated_address(*buffer); } @@ -104,7 +104,7 @@ operation::ProgramWithCallbacks multi_core_convert_to_chw( auto set_runtime_args = [cb_in, cb_out, input_cores, total_tiles_per_core, reader_kernel_id, writer_kernel_id, compute_kernel_id]( - Program& program, const Tensor& a, const Tensor& output) { + tt::tt_metal::Program& program, const Tensor& a, const Tensor& output) { tt::tt_metal::Buffer* a_buffer = a.buffer(); tt::tt_metal::Buffer* output_buffer = output.buffer(); UpdateDynamicCircularBufferAddress(program, cb_in, *a_buffer); @@ -129,7 +129,7 @@ operation::ProgramWithCallbacks multi_core_convert_to_chw( auto override_runtime_arguments_callback = [set_runtime_args]( const void* operation, - Program& program, + tt::tt_metal::Program& program, const std::vector& input_tensors, const std::vector>&, const std::vector& output_tensors) { diff --git a/ttnn/cpp/ttnn/operations/experimental/cnn/convert_to_chw/device/convert_to_chw_program_factory.hpp b/ttnn/cpp/ttnn/operations/experimental/cnn/convert_to_chw/device/convert_to_chw_program_factory.hpp index d78d52e4c79..b2e3b9e2ec4 100644 --- 
a/ttnn/cpp/ttnn/operations/experimental/cnn/convert_to_chw/device/convert_to_chw_program_factory.hpp +++ b/ttnn/cpp/ttnn/operations/experimental/cnn/convert_to_chw/device/convert_to_chw_program_factory.hpp @@ -8,7 +8,7 @@ namespace ttnn::operations::experimental::cnn::detail { -operation::ProgramWithCallbacks multi_core_convert_to_chw( +tt::tt_metal::operation::ProgramWithCallbacks multi_core_convert_to_chw( const Tensor& a, Tensor& output, CoreCoord compute_with_storage_grid_size); } // namespace ttnn::operations::experimental::cnn::detail diff --git a/ttnn/cpp/ttnn/operations/experimental/copy/typecast/typecast.cpp b/ttnn/cpp/ttnn/operations/experimental/copy/typecast/typecast.cpp index 5dba4ac14f4..9e097512130 100644 --- a/ttnn/cpp/ttnn/operations/experimental/copy/typecast/typecast.cpp +++ b/ttnn/cpp/ttnn/operations/experimental/copy/typecast/typecast.cpp @@ -16,7 +16,7 @@ ttnn::Tensor TypecastOperation::invoke( const DataType& dtype, const std::optional& output_mem_config, const std::optional& optional_output_tensor) { - return operation::run( + return tt::tt_metal::operation::run( ttnn::operations::data_movement::CopyDeviceOperation{ output_mem_config.value_or(input_tensor.memory_config()), dtype}, {input_tensor}, diff --git a/ttnn/cpp/ttnn/operations/experimental/dropout/device/dropout_device_operation_types.hpp b/ttnn/cpp/ttnn/operations/experimental/dropout/device/dropout_device_operation_types.hpp index 0175a32ad34..ce44a4fd995 100644 --- a/ttnn/cpp/ttnn/operations/experimental/dropout/device/dropout_device_operation_types.hpp +++ b/ttnn/cpp/ttnn/operations/experimental/dropout/device/dropout_device_operation_types.hpp @@ -9,8 +9,8 @@ namespace ttnn::operations::experimental::dropout { struct operation_attributes_t { - const DataType output_dtype = DataType::INVALID; - const MemoryConfig output_memory_config; + const tt::tt_metal::DataType output_dtype = tt::tt_metal::DataType::INVALID; + const tt::tt_metal::MemoryConfig output_memory_config; uint32_t 
seed = 0; const float prob = 0.0f; const float scale = 1.0f; diff --git a/ttnn/cpp/ttnn/operations/experimental/dropout/device/dropout_program_factory.hpp b/ttnn/cpp/ttnn/operations/experimental/dropout/device/dropout_program_factory.hpp index 75a6009c986..bd79e3a774e 100644 --- a/ttnn/cpp/ttnn/operations/experimental/dropout/device/dropout_program_factory.hpp +++ b/ttnn/cpp/ttnn/operations/experimental/dropout/device/dropout_program_factory.hpp @@ -11,10 +11,10 @@ namespace ttnn::operations::experimental::dropout::program { struct DropoutProgramFactory { struct shared_variables_t { - KernelHandle dropout_reader_kernel_id; - KernelHandle dropout_writer_kernel_id; - KernelHandle dropout_kernel_group_1_id; - KernelHandle dropout_kernel_group_2_id; + tt::tt_metal::KernelHandle dropout_reader_kernel_id; + tt::tt_metal::KernelHandle dropout_writer_kernel_id; + tt::tt_metal::KernelHandle dropout_kernel_group_1_id; + tt::tt_metal::KernelHandle dropout_kernel_group_2_id; CoreRangeSet core_group_1; CoreRangeSet core_group_2; uint32_t num_cores; diff --git a/ttnn/cpp/ttnn/operations/experimental/matmul/attn_matmul/attn_matmul.cpp b/ttnn/cpp/ttnn/operations/experimental/matmul/attn_matmul/attn_matmul.cpp index 0788ebf8fdc..c85e7b297a2 100644 --- a/ttnn/cpp/ttnn/operations/experimental/matmul/attn_matmul/attn_matmul.cpp +++ b/ttnn/cpp/ttnn/operations/experimental/matmul/attn_matmul/attn_matmul.cpp @@ -24,7 +24,7 @@ ttnn::Tensor AttnMatmulOperation::invoke( ? input_tensor_a.device()->arch() : ttnn::operations::experimental::auto_format::AutoFormat::GetDefaultDevice()->arch(); auto kernel_config_val = init_device_compute_kernel_config(arch, compute_kernel_config); - return operation::run( + return tt::tt_metal::operation::run( AttnMatmulDeviceOperation{ std::nullopt, std::nullopt, @@ -79,7 +79,7 @@ ttnn::Tensor AttnMatmulFromCacheOperation::invoke( ? 
input_tensor_a.device()->arch() : ttnn::operations::experimental::auto_format::AutoFormat::GetDefaultDevice()->arch(); auto kernel_config_val = init_device_compute_kernel_config(arch, compute_kernel_config); - return operation::run( + return tt::tt_metal::operation::run( AttnMatmulDeviceOperation{ num_tokens_rounded_up_to_32, transpose_hw, diff --git a/ttnn/cpp/ttnn/operations/experimental/matmul/attn_matmul/device/attn_matmul_device_operation.hpp b/ttnn/cpp/ttnn/operations/experimental/matmul/attn_matmul/device/attn_matmul_device_operation.hpp index 2db317e53aa..c6967b6bf2f 100644 --- a/ttnn/cpp/ttnn/operations/experimental/matmul/attn_matmul/device/attn_matmul_device_operation.hpp +++ b/ttnn/cpp/ttnn/operations/experimental/matmul/attn_matmul/device/attn_matmul_device_operation.hpp @@ -13,7 +13,7 @@ namespace ttnn::operations::experimental::matmul { -operation::ProgramWithCallbacks multi_core_attn_matmul( +tt::tt_metal::operation::ProgramWithCallbacks multi_core_attn_matmul( const Tensor& a, const Tensor& b, Tensor& output, @@ -26,15 +26,15 @@ struct AttnMatmulDeviceOperation { std::optional num_tokens; std::optional transpose_hw; CoreCoord compute_with_storage_grid_size; - MemoryConfig output_mem_config; - DataType output_dtype; + tt::tt_metal::MemoryConfig output_mem_config; + tt::tt_metal::DataType output_dtype; const ttnn::DeviceComputeKernelConfig compute_kernel_config; void validate(const std::vector& input_tensors) const; std::vector compute_output_specs(const std::vector& input_tensors) const; - operation::ProgramWithCallbacks create_program( + tt::tt_metal::operation::ProgramWithCallbacks create_program( const std::vector& input_tensors, std::vector& output_tensors) const; - const operation::Hash compute_program_hash(const std::vector& input_tensors) const; + const tt::tt_metal::operation::Hash compute_program_hash(const std::vector& input_tensors) const; }; } // namespace ttnn::operations::experimental::matmul diff --git 
a/ttnn/cpp/ttnn/operations/experimental/matmul/group_attn_matmul/device/group_attn_matmul_device_operation.hpp b/ttnn/cpp/ttnn/operations/experimental/matmul/group_attn_matmul/device/group_attn_matmul_device_operation.hpp index 14531364344..1bdb9c062cf 100644 --- a/ttnn/cpp/ttnn/operations/experimental/matmul/group_attn_matmul/device/group_attn_matmul_device_operation.hpp +++ b/ttnn/cpp/ttnn/operations/experimental/matmul/group_attn_matmul/device/group_attn_matmul_device_operation.hpp @@ -15,7 +15,7 @@ namespace ttnn::operations::experimental::matmul { // TODO: Group attention matmul will support sharding, mcasting, and should be faster; we should make attn_matmul (ie. // KV heads = 1) a special case of group_attn_matmul and run the same op -operation::ProgramWithCallbacks multi_core_group_attn_matmul( +tt::tt_metal::operation::ProgramWithCallbacks multi_core_group_attn_matmul( const Tensor& a, const Tensor& b, Tensor& output, @@ -31,16 +31,16 @@ struct GroupAttnMatmulDeviceOperation { std::optional transpose_hw; const uint32_t out_subblock_w; CoreCoord compute_with_storage_grid_size; - MemoryConfig output_mem_config; - DataType output_dtype; + tt::tt_metal::MemoryConfig output_mem_config; + tt::tt_metal::DataType output_dtype; const bool row_major; // Specifies how work is distributed across cores const ttnn::DeviceComputeKernelConfig compute_kernel_config; void validate(const std::vector& input_tensors) const; std::vector compute_output_specs(const std::vector& input_tensors) const; - operation::ProgramWithCallbacks create_program( + tt::tt_metal::operation::ProgramWithCallbacks create_program( const std::vector& input_tensors, std::vector& output_tensors) const; - const operation::Hash compute_program_hash(const std::vector& input_tensors) const; + const tt::tt_metal::operation::Hash compute_program_hash(const std::vector& input_tensors) const; }; } // namespace ttnn::operations::experimental::matmul diff --git 
a/ttnn/cpp/ttnn/operations/experimental/matmul/group_attn_matmul/group_attn_matmul.cpp b/ttnn/cpp/ttnn/operations/experimental/matmul/group_attn_matmul/group_attn_matmul.cpp index a4b967fc04c..7c5eca30e34 100644 --- a/ttnn/cpp/ttnn/operations/experimental/matmul/group_attn_matmul/group_attn_matmul.cpp +++ b/ttnn/cpp/ttnn/operations/experimental/matmul/group_attn_matmul/group_attn_matmul.cpp @@ -57,7 +57,7 @@ ttnn::Tensor GroupAttnMatmulOperation::invoke( }, kernel_config_val); - return operation::run( + return tt::tt_metal::operation::run( GroupAttnMatmulDeviceOperation{ std::nullopt, std::nullopt, diff --git a/ttnn/cpp/ttnn/operations/experimental/matmul/group_attn_matmul/group_attn_matmul_pybind.cpp b/ttnn/cpp/ttnn/operations/experimental/matmul/group_attn_matmul/group_attn_matmul_pybind.cpp index ec5c3b375bb..6284aba7f89 100644 --- a/ttnn/cpp/ttnn/operations/experimental/matmul/group_attn_matmul/group_attn_matmul_pybind.cpp +++ b/ttnn/cpp/ttnn/operations/experimental/matmul/group_attn_matmul/group_attn_matmul_pybind.cpp @@ -7,8 +7,6 @@ #include "ttnn/operations/experimental/matmul/group_attn_matmul/group_attn_matmul_pybind.hpp" #include "ttnn/operations/experimental/matmul/group_attn_matmul/group_attn_matmul.hpp" -using namespace tt::tt_metal; - namespace ttnn::operations::experimental::matmul::detail { void bind_group_attn_matmul(pybind11::module& module) { @@ -43,7 +41,7 @@ void bind_group_attn_matmul(pybind11::module& module) { pybind11::arg().noconvert(), pybind11::kw_only(), pybind11::arg("compute_with_storage_grid_size").noconvert(), - pybind11::arg("memory_config").noconvert() = operation::DEFAULT_OUTPUT_MEMORY_CONFIG, + pybind11::arg("memory_config").noconvert() = tt::tt_metal::operation::DEFAULT_OUTPUT_MEMORY_CONFIG, pybind11::arg("dtype").noconvert() = std::nullopt, pybind11::arg("compute_kernel_config").noconvert() = std::nullopt, pybind11::arg("optional_output_tensor").noconvert() = std::nullopt, diff --git 
a/ttnn/cpp/ttnn/operations/experimental/paged_cache/device/paged_cache_operation.hpp b/ttnn/cpp/ttnn/operations/experimental/paged_cache/device/paged_cache_operation.hpp index 1a318562bc8..0fcebda32bb 100644 --- a/ttnn/cpp/ttnn/operations/experimental/paged_cache/device/paged_cache_operation.hpp +++ b/ttnn/cpp/ttnn/operations/experimental/paged_cache/device/paged_cache_operation.hpp @@ -32,7 +32,7 @@ struct PagedUpdateCacheDeviceOperation { const std::vector>& optional_input_tensors) const; const std::vector compute_output_specs(const std::vector& input_tensors) const; - operation::ProgramWithCallbacks create_program( + tt::tt_metal::operation::ProgramWithCallbacks create_program( const std::vector& input_tensors, const std::vector>& optional_input_tensors, std::vector& output_tensors) const; @@ -44,7 +44,7 @@ struct PagedUpdateCacheDeviceOperation { return std::forward_as_tuple(batch_idx, update_idxs, batch_offset, op_type, compute_kernel_config, share_cache); } - const operation::Hash compute_program_hash( + const tt::tt_metal::operation::Hash compute_program_hash( const std::vector& input_tensors, const std::vector>& optional_input_tensors) const; }; diff --git a/ttnn/cpp/ttnn/operations/experimental/paged_cache/device/paged_fill_cache_program_factory.hpp b/ttnn/cpp/ttnn/operations/experimental/paged_cache/device/paged_fill_cache_program_factory.hpp index 0708ecc46cb..5a2d809750b 100644 --- a/ttnn/cpp/ttnn/operations/experimental/paged_cache/device/paged_fill_cache_program_factory.hpp +++ b/ttnn/cpp/ttnn/operations/experimental/paged_cache/device/paged_fill_cache_program_factory.hpp @@ -10,6 +10,6 @@ namespace ttnn::operations::experimental::paged_cache::detail { -operation::ProgramWithCallbacks paged_fill_cache_multi_core( +tt::tt_metal::operation::ProgramWithCallbacks paged_fill_cache_multi_core( const Tensor& cache_tensor, const Tensor& input_tensor, const Tensor& page_table_tensor, const uint32_t batch_idx); } // namespace 
ttnn::operations::experimental::paged_cache::detail diff --git a/ttnn/cpp/ttnn/operations/experimental/paged_cache/device/paged_fused_update_cache_program_factory.hpp b/ttnn/cpp/ttnn/operations/experimental/paged_cache/device/paged_fused_update_cache_program_factory.hpp index 28dc28b077a..331f9a7de78 100644 --- a/ttnn/cpp/ttnn/operations/experimental/paged_cache/device/paged_fused_update_cache_program_factory.hpp +++ b/ttnn/cpp/ttnn/operations/experimental/paged_cache/device/paged_fused_update_cache_program_factory.hpp @@ -10,7 +10,7 @@ namespace ttnn::operations::experimental::paged_cache::detail { -operation::ProgramWithCallbacks paged_fused_update_cache_multi_core( +tt::tt_metal::operation::ProgramWithCallbacks paged_fused_update_cache_multi_core( const Tensor& cache_tensor1, const Tensor& input_tensor1, const Tensor& cache_tensor2, diff --git a/ttnn/cpp/ttnn/operations/experimental/paged_cache/device/paged_update_cache_program_factory.hpp b/ttnn/cpp/ttnn/operations/experimental/paged_cache/device/paged_update_cache_program_factory.hpp index 1ceec1ce7f5..262af1fea70 100644 --- a/ttnn/cpp/ttnn/operations/experimental/paged_cache/device/paged_update_cache_program_factory.hpp +++ b/ttnn/cpp/ttnn/operations/experimental/paged_cache/device/paged_update_cache_program_factory.hpp @@ -10,7 +10,7 @@ namespace ttnn::operations::experimental::paged_cache::detail { -operation::ProgramWithCallbacks paged_update_cache_multi_core( +tt::tt_metal::operation::ProgramWithCallbacks paged_update_cache_multi_core( const Tensor& cache_tensor, const Tensor& input_tensor, std::optional update_idxs_tensor, diff --git a/ttnn/cpp/ttnn/operations/experimental/paged_cache/paged_cache.cpp b/ttnn/cpp/ttnn/operations/experimental/paged_cache/paged_cache.cpp index 2bc48b2eb4f..30538d10eb2 100644 --- a/ttnn/cpp/ttnn/operations/experimental/paged_cache/paged_cache.cpp +++ b/ttnn/cpp/ttnn/operations/experimental/paged_cache/paged_cache.cpp @@ -23,7 +23,7 @@ ttnn::Tensor 
PagedUpdateCacheOperation::invoke( std::optional compute_kernel_config = std::nullopt) { auto kernel_config_val = init_device_compute_kernel_config(input_tensor.device()->arch(), compute_kernel_config); const bool share_cache_arg = share_cache.has_value() ? share_cache.value() : false; // Default share cache to false - operation::run( + tt::tt_metal::operation::run( PagedUpdateCacheDeviceOperation{ 0, update_idxs, batch_offset, PagedUpdateCacheOpType::UPDATE, kernel_config_val, share_cache_arg}, {cache_tensor, input_tensor}, @@ -45,7 +45,7 @@ std::tuple PagedFusedUpdateCacheOperation::invoke( std::optional compute_kernel_config = std::nullopt) { auto kernel_config_val = init_device_compute_kernel_config(input_tensor1.device()->arch(), compute_kernel_config); const bool share_cache_arg = share_cache.has_value() ? share_cache.value() : false; // Default share cache to false - operation::run( + tt::tt_metal::operation::run( PagedUpdateCacheDeviceOperation{ 0, update_idxs, batch_offset, PagedUpdateCacheOpType::FUSED_UPDATE, kernel_config_val, share_cache_arg}, {cache_tensor1, input_tensor1, cache_tensor2, input_tensor2}, @@ -61,7 +61,7 @@ ttnn::Tensor PagedFillCacheOperation::invoke( const uint32_t batch_idx, std::optional compute_kernel_config = std::nullopt) { auto kernel_config_val = init_device_compute_kernel_config(input_tensor.device()->arch(), compute_kernel_config); - operation::run( + tt::tt_metal::operation::run( PagedUpdateCacheDeviceOperation{batch_idx, {}, 0, PagedUpdateCacheOpType::FILL, kernel_config_val}, {cache_tensor, input_tensor, page_table}, {std::nullopt, std::nullopt}); diff --git a/ttnn/cpp/ttnn/operations/experimental/plusone/device/plusone_op.cpp b/ttnn/cpp/ttnn/operations/experimental/plusone/device/plusone_op.cpp index cf019cb297b..b8f3bedde1d 100644 --- a/ttnn/cpp/ttnn/operations/experimental/plusone/device/plusone_op.cpp +++ b/ttnn/cpp/ttnn/operations/experimental/plusone/device/plusone_op.cpp @@ -5,16 +5,16 @@ #include "plusone_op.hpp" 
#include "plusone_program_factory.hpp" -using namespace tt::tt_metal; - namespace ttnn::operations::experimental { void PlusOne::validate_with_output_tensors( const std::vector& input_tensors, const std::vector>& output_tensors) const { const auto& input_tensor_a = input_tensors.at(0); - TT_FATAL(input_tensor_a.get_dtype() == DataType::INT32, "Only INT32 is supported for inputs!"); - TT_FATAL(input_tensor_a.get_layout() == Layout::ROW_MAJOR, "Only ROW_MAJOR layout is supported for inputs!"); + TT_FATAL(input_tensor_a.get_dtype() == tt::tt_metal::DataType::INT32, "Only INT32 is supported for inputs!"); + TT_FATAL( + input_tensor_a.get_layout() == tt::tt_metal::Layout::ROW_MAJOR, + "Only ROW_MAJOR layout is supported for inputs!"); auto input_shape = input_tensor_a.get_padded_shape(); TT_FATAL(input_shape.size() == 1, "must have 1 dimension"); @@ -29,7 +29,7 @@ std::vector PlusOne::create_output_tensors( return {input_tensors.at(0)}; } -operation::ProgramWithCallbacks PlusOne::create_program( +tt::tt_metal::operation::ProgramWithCallbacks PlusOne::create_program( const std::vector& input_tensors, std::vector& output_tensors) const { const auto& input_tensor = input_tensors.at(0); return detail::plusone_single_core(input_tensor); diff --git a/ttnn/cpp/ttnn/operations/experimental/plusone/device/plusone_op.hpp b/ttnn/cpp/ttnn/operations/experimental/plusone/device/plusone_op.hpp index fe5649ae288..9c9cb0624c1 100644 --- a/ttnn/cpp/ttnn/operations/experimental/plusone/device/plusone_op.hpp +++ b/ttnn/cpp/ttnn/operations/experimental/plusone/device/plusone_op.hpp @@ -18,7 +18,7 @@ struct PlusOne { std::vector compute_output_specs(const std::vector& input_tensors) const; std::vector create_output_tensors( const std::vector& input_tensors, const std::vector>& output_tensors) const; - operation::ProgramWithCallbacks create_program( + tt::tt_metal::operation::ProgramWithCallbacks create_program( const std::vector& input_tensors, std::vector& output_tensors) const; }; diff 
--git a/ttnn/cpp/ttnn/operations/experimental/plusone/device/plusone_program_factory.cpp b/ttnn/cpp/ttnn/operations/experimental/plusone/device/plusone_program_factory.cpp index 487d425cd5a..eecc425c15c 100644 --- a/ttnn/cpp/ttnn/operations/experimental/plusone/device/plusone_program_factory.cpp +++ b/ttnn/cpp/ttnn/operations/experimental/plusone/device/plusone_program_factory.cpp @@ -13,9 +13,8 @@ namespace ttnn::operations::experimental::detail { using namespace tt::constants; -using namespace tt::tt_metal; -operation::ProgramWithCallbacks plusone_single_core(const Tensor& input) { +tt::tt_metal::operation::ProgramWithCallbacks plusone_single_core(const Tensor& input) { tt::tt_metal::Program program{}; tt::DataFormat input_cb_data_format = tt::tt_metal::datatype_to_dataformat_converter(input.get_dtype()); @@ -68,18 +67,19 @@ operation::ProgramWithCallbacks plusone_single_core(const Tensor& input) { tt::tt_metal::SetRuntimeArgs(program, reader_kernel_id, core, {src_buffer->address()}); } - auto override_runtime_args_callback = - [reader_kernel_id, cores]( - const Program& program, const std::vector& input_buffers, const std::vector&) { - auto src_buffer = input_buffers.at(0); + auto override_runtime_args_callback = [reader_kernel_id, cores]( + const tt::tt_metal::Program& program, + const std::vector& input_buffers, + const std::vector&) { + auto src_buffer = input_buffers.at(0); - for (const auto& core : cores) { - { - auto& runtime_args = GetRuntimeArgs(program, reader_kernel_id, core); - runtime_args[0] = src_buffer->address(); - } + for (const auto& core : cores) { + { + auto& runtime_args = GetRuntimeArgs(program, reader_kernel_id, core); + runtime_args[0] = src_buffer->address(); } - }; + } + }; return {std::move(program), override_runtime_args_callback}; } diff --git a/ttnn/cpp/ttnn/operations/experimental/plusone/device/plusone_program_factory.hpp b/ttnn/cpp/ttnn/operations/experimental/plusone/device/plusone_program_factory.hpp index 
30d96780f13..5970b6bed7b 100644 --- a/ttnn/cpp/ttnn/operations/experimental/plusone/device/plusone_program_factory.hpp +++ b/ttnn/cpp/ttnn/operations/experimental/plusone/device/plusone_program_factory.hpp @@ -7,6 +7,6 @@ namespace ttnn::operations::experimental::detail { using namespace tt::constants; -operation::ProgramWithCallbacks plusone_single_core(const Tensor& input); +tt::tt_metal::operation::ProgramWithCallbacks plusone_single_core(const Tensor& input); } // namespace ttnn::operations::experimental::detail diff --git a/ttnn/cpp/ttnn/operations/experimental/plusone/plusone.cpp b/ttnn/cpp/ttnn/operations/experimental/plusone/plusone.cpp index a090a3b241d..de31e9f6523 100644 --- a/ttnn/cpp/ttnn/operations/experimental/plusone/plusone.cpp +++ b/ttnn/cpp/ttnn/operations/experimental/plusone/plusone.cpp @@ -12,7 +12,7 @@ namespace ttnn::operations::experimental { ttnn::Tensor PlusOneOperation::invoke(QueueId queue_id, const Tensor& input_tensor) { - return operation::run(PlusOne{}, {input_tensor}, {}, {}, queue_id).at(0); + return tt::tt_metal::operation::run(PlusOne{}, {input_tensor}, {}, {}, queue_id).at(0); } } // namespace ttnn::operations::experimental diff --git a/ttnn/cpp/ttnn/operations/experimental/reduction/argmax/argmax.cpp b/ttnn/cpp/ttnn/operations/experimental/reduction/argmax/argmax.cpp index 4fcbe4e09ee..a4c7406edcd 100644 --- a/ttnn/cpp/ttnn/operations/experimental/reduction/argmax/argmax.cpp +++ b/ttnn/cpp/ttnn/operations/experimental/reduction/argmax/argmax.cpp @@ -29,8 +29,8 @@ Tensor create_mask(const Tensor& input_a, const std::optional& out Tensor ArgmaxOperation::invoke( const Tensor& input_t, int64_t _dim, bool all, const std::optional& output_mem_config) { auto output_memory_config = output_mem_config.value_or(input_t.memory_config()); - std::vector output_tensors = {Tensor(operation::get_workers_for_op_output({input_t}))}; - operation::launch_op( + std::vector output_tensors = 
{Tensor(tt::tt_metal::operation::get_workers_for_op_output({input_t}))}; + tt::tt_metal::operation::launch_op( [_dim, all, output_memory_config]( const std::vector& input_tensors, const std::vector>& optional_input_tensors, diff --git a/ttnn/cpp/ttnn/operations/experimental/reduction/fast_reduce_nc/device/fast_reduce_nc_device_operation.hpp b/ttnn/cpp/ttnn/operations/experimental/reduction/fast_reduce_nc/device/fast_reduce_nc_device_operation.hpp index 99d44575f9e..cb54dbd1e79 100644 --- a/ttnn/cpp/ttnn/operations/experimental/reduction/fast_reduce_nc/device/fast_reduce_nc_device_operation.hpp +++ b/ttnn/cpp/ttnn/operations/experimental/reduction/fast_reduce_nc/device/fast_reduce_nc_device_operation.hpp @@ -23,7 +23,7 @@ struct FastReduceNCDeviceOperation { const std::vector& input_tensors, const std::vector>& output_tensors) const; std::vector create_output_tensors( const std::vector& input_tensors, const std::vector>& output_tensors) const; - operation::ProgramWithCallbacks create_program( + tt::tt_metal::operation::ProgramWithCallbacks create_program( const std::vector& inputs, std::vector& outputs) const; }; @@ -32,7 +32,7 @@ Tensor fast_reduce_nc( const ttnn::Tensor& input, tt::stl::Span dims, const std::optional& output = std::nullopt, - const MemoryConfig& output_mem_config = operation::DEFAULT_OUTPUT_MEMORY_CONFIG, + const MemoryConfig& output_mem_config = tt::tt_metal::operation::DEFAULT_OUTPUT_MEMORY_CONFIG, std::optional compute_kernel_config = std::nullopt); } // namespace ttnn::operations::experimental::reduction::detail diff --git a/ttnn/cpp/ttnn/operations/experimental/reduction/fast_reduce_nc/device/fast_reduce_nc_program_factory.hpp b/ttnn/cpp/ttnn/operations/experimental/reduction/fast_reduce_nc/device/fast_reduce_nc_program_factory.hpp index c9c67e1e00d..f6699ec7677 100644 --- a/ttnn/cpp/ttnn/operations/experimental/reduction/fast_reduce_nc/device/fast_reduce_nc_program_factory.hpp +++ 
b/ttnn/cpp/ttnn/operations/experimental/reduction/fast_reduce_nc/device/fast_reduce_nc_program_factory.hpp @@ -10,7 +10,7 @@ namespace ttnn::operations::experimental::reduction::detail { -operation::ProgramWithCallbacks reduce_nc_factory( +tt::tt_metal::operation::ProgramWithCallbacks reduce_nc_factory( const ttnn::Tensor& input, const ttnn::Tensor& output, int64_t dim, diff --git a/ttnn/cpp/ttnn/operations/experimental/reduction/fast_reduce_nc/fast_reduce_nc_pybind.cpp b/ttnn/cpp/ttnn/operations/experimental/reduction/fast_reduce_nc/fast_reduce_nc_pybind.cpp index 20fdbd17ed0..b2fa272028d 100644 --- a/ttnn/cpp/ttnn/operations/experimental/reduction/fast_reduce_nc/fast_reduce_nc_pybind.cpp +++ b/ttnn/cpp/ttnn/operations/experimental/reduction/fast_reduce_nc/fast_reduce_nc_pybind.cpp @@ -29,7 +29,7 @@ void bind_fast_reduce_nc(pybind11::module& module) { pybind11::kw_only(), pybind11::arg("dims").noconvert() = ttnn::SmallVector(), pybind11::arg("output").noconvert() = std::nullopt, - pybind11::arg("memory_config").noconvert() = operation::DEFAULT_OUTPUT_MEMORY_CONFIG, + pybind11::arg("memory_config").noconvert() = tt::tt_metal::operation::DEFAULT_OUTPUT_MEMORY_CONFIG, pybind11::arg("compute_kernel_config").noconvert() = std::nullopt, pybind11::arg("queue_id") = DefaultQueueId}); } diff --git a/ttnn/cpp/ttnn/operations/experimental/reshape/view.cpp b/ttnn/cpp/ttnn/operations/experimental/reshape/view.cpp index 0753f8468dc..8d896cfd4db 100644 --- a/ttnn/cpp/ttnn/operations/experimental/reshape/view.cpp +++ b/ttnn/cpp/ttnn/operations/experimental/reshape/view.cpp @@ -27,7 +27,8 @@ static MemoryConfig infer_output_memory_config( Tensor tensor_reshape( const Tensor& input_tensor, const ttnn::Shape& new_logical_shape, const ttnn::Shape& new_padded_shape) { ZoneScoped; - GraphTracker::instance().track_function_start("Tensor::reshape", input_tensor, new_logical_shape, new_padded_shape); + tt::tt_metal::GraphTracker::instance().track_function_start( + "Tensor::reshape", 
input_tensor, new_logical_shape, new_padded_shape); const auto output_memory_config = infer_output_memory_config(input_tensor.memory_config(), new_padded_shape); auto new_spec = ttnn::TensorSpec( @@ -43,7 +44,7 @@ Tensor tensor_reshape( [&input_tensor, &new_spec, &new_logical_shape, &new_padded_shape](auto&& storage) -> Tensor { using T = std::decay_t; const auto& tensor = input_tensor; - if constexpr (std::is_same_v) { + if constexpr (std::is_same_v) { auto updated_storage = std::get(tensor.get_storage()); for (int i = 0; i < updated_storage.specs.size(); i++) { const auto& prev_spec = updated_storage.specs[i]; @@ -59,8 +60,8 @@ Tensor tensor_reshape( } return Tensor(updated_storage, new_spec); } - if constexpr (std::is_same_v) { - MultiDeviceStorage updated_storage = std::get(tensor.get_storage()); + if constexpr (std::is_same_v) { + tt::tt_metal::MultiDeviceStorage updated_storage = std::get(tensor.get_storage()); std::unordered_map new_specs; for (auto device_id : updated_storage.ordered_device_ids) { const auto& prev_spec = updated_storage.specs.at(device_id); @@ -77,10 +78,10 @@ Tensor tensor_reshape( updated_storage.specs = new_specs; return Tensor(updated_storage, new_spec); } - if constexpr (std::is_same_v) { + if constexpr (std::is_same_v) { if (input_tensor.get_layout() == Layout::ROW_MAJOR) { if (tensor.memory_config().memory_layout != TensorMemoryLayout::HEIGHT_SHARDED) { - DeviceStorage device_storage = std::get(tensor.get_storage()); + tt::tt_metal::DeviceStorage device_storage = std::get(tensor.get_storage()); auto device_buffer = device_storage.get_buffer(); const auto& tensor_spec = tensor.tensor_spec(); auto page_size_bytes = tensor_spec.compute_page_size_bytes(); @@ -88,9 +89,9 @@ Tensor tensor_reshape( device_storage.insert_buffer(device_buffer); return Tensor(device_storage, new_spec); } else { - DeviceStorage device_storage = std::get(tensor.get_storage()); + tt::tt_metal::DeviceStorage device_storage = std::get(tensor.get_storage()); auto 
device_buffer = device_storage.get_buffer(); - ShardSpecBuffer shard_spec_buffer = device_buffer->shard_spec(); + tt::tt_metal::ShardSpecBuffer shard_spec_buffer = device_buffer->shard_spec(); auto shard_spec = shard_spec_buffer.tensor_shard_spec; auto shard_shape = shard_spec.shape; @@ -137,7 +138,7 @@ Tensor tensor_reshape( }, input_tensor.get_storage()); output = tt::tt_metal::set_tensor_id(output); - GraphTracker::instance().track_function_end(output); + tt::tt_metal::GraphTracker::instance().track_function_end(output); return output; } diff --git a/ttnn/cpp/ttnn/operations/experimental/ssm/hc_sum_reduce/device/hc_sum_reduce_op.hpp b/ttnn/cpp/ttnn/operations/experimental/ssm/hc_sum_reduce/device/hc_sum_reduce_op.hpp index 70b3b2775cd..17ede2d6aaa 100644 --- a/ttnn/cpp/ttnn/operations/experimental/ssm/hc_sum_reduce/device/hc_sum_reduce_op.hpp +++ b/ttnn/cpp/ttnn/operations/experimental/ssm/hc_sum_reduce/device/hc_sum_reduce_op.hpp @@ -10,13 +10,13 @@ namespace ttnn::operations::experimental::ssm { struct HCSumReduce { - MemoryConfig memory_config; - DataType dtype; + tt::tt_metal::MemoryConfig memory_config; + tt::tt_metal::DataType dtype; MathFidelity math_fidelity; void validate(const std::vector& input_tensors) const; std::vector compute_output_specs(const std::vector& input_tensors) const; - operation::ProgramWithCallbacks create_program( + tt::tt_metal::operation::ProgramWithCallbacks create_program( const std::vector& input_tensors, std::vector& output_tensors) const; }; diff --git a/ttnn/cpp/ttnn/operations/experimental/ssm/hc_sum_reduce/device/hc_sum_reduce_program_factory.hpp b/ttnn/cpp/ttnn/operations/experimental/ssm/hc_sum_reduce/device/hc_sum_reduce_program_factory.hpp index f80fe388a45..5542e61fbf4 100644 --- a/ttnn/cpp/ttnn/operations/experimental/ssm/hc_sum_reduce/device/hc_sum_reduce_program_factory.hpp +++ b/ttnn/cpp/ttnn/operations/experimental/ssm/hc_sum_reduce/device/hc_sum_reduce_program_factory.hpp @@ -8,7 +8,7 @@ namespace 
ttnn::operations::experimental::ssm::detail { -operation::ProgramWithCallbacks multi_core_ssm_1d_sum_reduce( +tt::tt_metal::operation::ProgramWithCallbacks multi_core_ssm_1d_sum_reduce( const Tensor& a, Tensor& output, MathFidelity math_fidelity, CoreCoord compute_with_storage_grid_size); } // namespace ttnn::operations::experimental::ssm::detail diff --git a/ttnn/cpp/ttnn/operations/experimental/ssm/prefix_scan/device/prefix_scan_op.hpp b/ttnn/cpp/ttnn/operations/experimental/ssm/prefix_scan/device/prefix_scan_op.hpp index e632ebec73d..74019e10dda 100644 --- a/ttnn/cpp/ttnn/operations/experimental/ssm/prefix_scan/device/prefix_scan_op.hpp +++ b/ttnn/cpp/ttnn/operations/experimental/ssm/prefix_scan/device/prefix_scan_op.hpp @@ -10,13 +10,13 @@ namespace ttnn::operations::experimental::ssm { struct PrefixScan { - MemoryConfig memory_config; - DataType dtype; + tt::tt_metal::MemoryConfig memory_config; + tt::tt_metal::DataType dtype; MathFidelity math_fidelity; void validate(const std::vector& input_tensors) const; std::vector compute_output_specs(const std::vector& input_tensors) const; - operation::ProgramWithCallbacks create_program( + tt::tt_metal::operation::ProgramWithCallbacks create_program( const std::vector& input_tensors, std::vector& output_tensors) const; }; diff --git a/ttnn/cpp/ttnn/operations/experimental/ssm/prefix_scan/device/prefix_scan_program_factory.hpp b/ttnn/cpp/ttnn/operations/experimental/ssm/prefix_scan/device/prefix_scan_program_factory.hpp index 98ed15f6a1d..b25ac962122 100644 --- a/ttnn/cpp/ttnn/operations/experimental/ssm/prefix_scan/device/prefix_scan_program_factory.hpp +++ b/ttnn/cpp/ttnn/operations/experimental/ssm/prefix_scan/device/prefix_scan_program_factory.hpp @@ -8,7 +8,7 @@ namespace ttnn::operations::experimental::ssm::detail { -operation::ProgramWithCallbacks multi_core_ssm_prefix_scan( +tt::tt_metal::operation::ProgramWithCallbacks multi_core_ssm_prefix_scan( const Tensor& a, const Tensor& bx, const Tensor& h, diff --git 
a/ttnn/cpp/ttnn/operations/experimental/ssm/repeat_and_interleave_eltwise_mul/device/repeat_and_interleave_eltwise_mul_op.hpp b/ttnn/cpp/ttnn/operations/experimental/ssm/repeat_and_interleave_eltwise_mul/device/repeat_and_interleave_eltwise_mul_op.hpp index b028ec5dada..615a5bb3069 100644 --- a/ttnn/cpp/ttnn/operations/experimental/ssm/repeat_and_interleave_eltwise_mul/device/repeat_and_interleave_eltwise_mul_op.hpp +++ b/ttnn/cpp/ttnn/operations/experimental/ssm/repeat_and_interleave_eltwise_mul/device/repeat_and_interleave_eltwise_mul_op.hpp @@ -10,15 +10,15 @@ namespace ttnn::operations::experimental::ssm { struct RepeatAndInterleaveEltwiseMul { - MemoryConfig memory_config; - DataType dtype; + tt::tt_metal::MemoryConfig memory_config; + tt::tt_metal::DataType dtype; MathFidelity math_fidelity; const uint32_t HIDDEN_SIZE = 5120; void validate(const std::vector& input_tensors) const; std::vector compute_output_specs(const std::vector& input_tensors) const; - operation::ProgramWithCallbacks create_program( + tt::tt_metal::operation::ProgramWithCallbacks create_program( const std::vector& input_tensors, std::vector& output_tensors) const; }; diff --git a/ttnn/cpp/ttnn/operations/experimental/ssm/repeat_and_interleave_eltwise_mul/device/repeat_and_interleave_eltwise_mul_program_factory.hpp b/ttnn/cpp/ttnn/operations/experimental/ssm/repeat_and_interleave_eltwise_mul/device/repeat_and_interleave_eltwise_mul_program_factory.hpp index 15c26881ea0..f6e1e3f23a3 100644 --- a/ttnn/cpp/ttnn/operations/experimental/ssm/repeat_and_interleave_eltwise_mul/device/repeat_and_interleave_eltwise_mul_program_factory.hpp +++ b/ttnn/cpp/ttnn/operations/experimental/ssm/repeat_and_interleave_eltwise_mul/device/repeat_and_interleave_eltwise_mul_program_factory.hpp @@ -8,7 +8,7 @@ namespace ttnn::operations::experimental::ssm::detail { -operation::ProgramWithCallbacks multi_core_ssm_eltwise_mul( +tt::tt_metal::operation::ProgramWithCallbacks multi_core_ssm_eltwise_mul( const Tensor& a, 
const Tensor& b, Tensor& output, diff --git a/ttnn/cpp/ttnn/operations/experimental/transformer/concatenate_heads/concatenate_heads.hpp b/ttnn/cpp/ttnn/operations/experimental/transformer/concatenate_heads/concatenate_heads.hpp index 38b6905baec..1ef58ccb7cd 100644 --- a/ttnn/cpp/ttnn/operations/experimental/transformer/concatenate_heads/concatenate_heads.hpp +++ b/ttnn/cpp/ttnn/operations/experimental/transformer/concatenate_heads/concatenate_heads.hpp @@ -18,7 +18,7 @@ struct ConcatenateHeadsOperation { const CoreCoord& compute_with_storage_grid_size, const std::optional& memory_config = std::nullopt, std::optional optional_output_tensor = std::nullopt) { - return operation::run( + return tt::tt_metal::operation::run( ConcatenateHeadsDeviceOperation{ compute_with_storage_grid_size, memory_config.value_or(input_tensor.memory_config())}, {input_tensor}, diff --git a/ttnn/cpp/ttnn/operations/experimental/transformer/concatenate_heads/device/concatenate_heads_device_operation.cpp b/ttnn/cpp/ttnn/operations/experimental/transformer/concatenate_heads/device/concatenate_heads_device_operation.cpp index 6bc9468a21f..7035d6814e8 100644 --- a/ttnn/cpp/ttnn/operations/experimental/transformer/concatenate_heads/device/concatenate_heads_device_operation.cpp +++ b/ttnn/cpp/ttnn/operations/experimental/transformer/concatenate_heads/device/concatenate_heads_device_operation.cpp @@ -15,7 +15,7 @@ void ConcatenateHeadsDeviceOperation::validate_with_output_tensors( // TODO: See issue #1744 TT_FATAL(batch_size >= 7 && batch_size <= 9, "Input batch size must be between 7 to 9 for bert large TM ops!"); - TT_FATAL(input_tensor.storage_type() == StorageType::DEVICE, "Operands to TM need to be on device!"); + TT_FATAL(input_tensor.storage_type() == tt::tt_metal::StorageType::DEVICE, "Operands to TM need to be on device!"); TT_FATAL(input_tensor.buffer() != nullptr, "Operands to TM need to be allocated in buffers on device!"); TT_FATAL( input_tensor.get_dtype() == 
tt::tt_metal::DataType::BFLOAT16 || @@ -45,8 +45,10 @@ std::vector ConcatenateHeadsDeviceOperation::compute_output_sp const auto& input_tensor = input_tensors.at(0); const auto batch_size = input_tensor.get_padded_shape()[0]; ttnn::Shape output_shape({batch_size, 1, 384, 1024}); - return { - TensorSpec(output_shape, TensorLayout(input_tensor.get_dtype(), PageConfig(Layout::TILE), output_mem_config))}; + return {TensorSpec( + output_shape, + tt::tt_metal::TensorLayout( + input_tensor.get_dtype(), tt::tt_metal::PageConfig(tt::tt_metal::Layout::TILE), output_mem_config))}; } std::vector ConcatenateHeadsDeviceOperation::create_output_tensors( @@ -58,7 +60,7 @@ std::vector ConcatenateHeadsDeviceOperation::create_output_tensors( return {create_device_tensor(compute_output_specs(input_tensors, output_tensors)[0], input_tensors.at(0).device())}; } -operation::ProgramWithCallbacks ConcatenateHeadsDeviceOperation::create_program( +tt::tt_metal::operation::ProgramWithCallbacks ConcatenateHeadsDeviceOperation::create_program( const std::vector& input_tensors, std::vector& output_tensors) const { const auto& input_tensor = input_tensors.at(0); auto& output_tensor = output_tensors.at(0); diff --git a/ttnn/cpp/ttnn/operations/experimental/transformer/concatenate_heads/device/concatenate_heads_device_operation.hpp b/ttnn/cpp/ttnn/operations/experimental/transformer/concatenate_heads/device/concatenate_heads_device_operation.hpp index cc62e8f8e48..08bab4f42c9 100644 --- a/ttnn/cpp/ttnn/operations/experimental/transformer/concatenate_heads/device/concatenate_heads_device_operation.hpp +++ b/ttnn/cpp/ttnn/operations/experimental/transformer/concatenate_heads/device/concatenate_heads_device_operation.hpp @@ -14,7 +14,7 @@ namespace ttnn::operations::experimental::transformer { struct ConcatenateHeadsDeviceOperation { CoreCoord compute_with_storage_grid_size; - MemoryConfig output_mem_config; + tt::tt_metal::MemoryConfig output_mem_config; void validate_with_output_tensors( const 
std::vector& input_tensors, const std::vector>& output_tensors) const; @@ -22,7 +22,7 @@ struct ConcatenateHeadsDeviceOperation { const std::vector& input_tensors, const std::vector>& output_tensors) const; std::vector create_output_tensors( const std::vector& input_tensors, const std::vector>& output_tensors) const; - operation::ProgramWithCallbacks create_program( + tt::tt_metal::operation::ProgramWithCallbacks create_program( const std::vector& input_tensors, std::vector& output_tensors) const; }; diff --git a/ttnn/cpp/ttnn/operations/experimental/transformer/concatenate_heads/device/concatenate_heads_program_factory.hpp b/ttnn/cpp/ttnn/operations/experimental/transformer/concatenate_heads/device/concatenate_heads_program_factory.hpp index d90229779b3..219c302002e 100644 --- a/ttnn/cpp/ttnn/operations/experimental/transformer/concatenate_heads/device/concatenate_heads_program_factory.hpp +++ b/ttnn/cpp/ttnn/operations/experimental/transformer/concatenate_heads/device/concatenate_heads_program_factory.hpp @@ -9,9 +9,10 @@ namespace ttnn::operations::experimental::transformer::detail { using namespace tt::constants; +using namespace tt::tt_metal; using namespace tt; -operation::ProgramWithCallbacks concatenate_heads_multi_core( +tt::tt_metal::operation::ProgramWithCallbacks concatenate_heads_multi_core( const Tensor& a, Tensor& output, CoreCoord compute_with_storage_grid_size) { const auto& ashape = a.get_padded_shape(); diff --git a/ttnn/cpp/ttnn/operations/experimental/transformer/create_qkv_heads/create_qkv_heads.cpp b/ttnn/cpp/ttnn/operations/experimental/transformer/create_qkv_heads/create_qkv_heads.cpp index 43d3a084faf..ead33147302 100644 --- a/ttnn/cpp/ttnn/operations/experimental/transformer/create_qkv_heads/create_qkv_heads.cpp +++ b/ttnn/cpp/ttnn/operations/experimental/transformer/create_qkv_heads/create_qkv_heads.cpp @@ -36,7 +36,7 @@ std::tuple CreateQKVHeadsOperation::in } else { optional_outputs = {}; } - auto output_tensors = operation::run( + 
auto output_tensors = tt::tt_metal::operation::run( CreateQKVHeadsDeviceOperation{num_q_heads, num_kv_heads_val, head_dim, transpose_k_heads, output_mem_config}, {input_tensor}, {}, diff --git a/ttnn/cpp/ttnn/operations/experimental/transformer/create_qkv_heads/device/create_qkv_heads_device_operation.cpp b/ttnn/cpp/ttnn/operations/experimental/transformer/create_qkv_heads/device/create_qkv_heads_device_operation.cpp index 0baabc209d3..a58264c1adc 100644 --- a/ttnn/cpp/ttnn/operations/experimental/transformer/create_qkv_heads/device/create_qkv_heads_device_operation.cpp +++ b/ttnn/cpp/ttnn/operations/experimental/transformer/create_qkv_heads/device/create_qkv_heads_device_operation.cpp @@ -87,9 +87,9 @@ std::vector CreateQKVHeadsDeviceOperation::compute_output_spec uint32_t v_shard_h = v_shape[0] * v_shape[1] * v_shape[2] / num_cores; // want the API to work for different sequence lengths - auto q_spec = ShardSpec(all_cores, {q_shard_h, q_shape[-1]}, shard_orientation); - auto k_spec = ShardSpec(all_cores, {k_shard_h, k_shape[-1]}, shard_orientation); - auto v_spec = ShardSpec(all_cores, {v_shard_h, v_shape[-1]}, shard_orientation); + auto q_spec = tt::tt_metal::ShardSpec(all_cores, {q_shard_h, q_shape[-1]}, shard_orientation); + auto k_spec = tt::tt_metal::ShardSpec(all_cores, {k_shard_h, k_shape[-1]}, shard_orientation); + auto v_spec = tt::tt_metal::ShardSpec(all_cores, {v_shard_h, v_shape[-1]}, shard_orientation); // create sharded tensors auto mem_config_q = this->output_mem_config; mem_config_q.shard_spec = q_spec; @@ -100,9 +100,15 @@ std::vector CreateQKVHeadsDeviceOperation::compute_output_spec auto mem_config_v = this->output_mem_config; mem_config_v.shard_spec = v_spec; - TensorSpec out_tensor_q(q_shape, TensorLayout(input_tensor.get_dtype(), PageConfig(Layout::TILE), mem_config_q)); - TensorSpec out_tensor_k(k_shape, TensorLayout(input_tensor.get_dtype(), PageConfig(Layout::TILE), mem_config_k)); - TensorSpec out_tensor_v(v_shape, 
TensorLayout(input_tensor.get_dtype(), PageConfig(Layout::TILE), mem_config_v)); + TensorSpec out_tensor_q( + q_shape, + tt::tt_metal::TensorLayout(input_tensor.get_dtype(), tt::tt_metal::PageConfig(Layout::TILE), mem_config_q)); + TensorSpec out_tensor_k( + k_shape, + tt::tt_metal::TensorLayout(input_tensor.get_dtype(), tt::tt_metal::PageConfig(Layout::TILE), mem_config_k)); + TensorSpec out_tensor_v( + v_shape, + tt::tt_metal::TensorLayout(input_tensor.get_dtype(), tt::tt_metal::PageConfig(Layout::TILE), mem_config_v)); return {out_tensor_q, out_tensor_k, out_tensor_v}; } @@ -116,7 +122,7 @@ std::vector CreateQKVHeadsDeviceOperation::create_output_tensors( }; } -operation::ProgramWithCallbacks CreateQKVHeadsDeviceOperation::create_program( +tt::tt_metal::operation::ProgramWithCallbacks CreateQKVHeadsDeviceOperation::create_program( const std::vector& input_tensors, std::vector& output_tensors) const { const auto& input_tensor = input_tensors.at(0); diff --git a/ttnn/cpp/ttnn/operations/experimental/transformer/create_qkv_heads/device/create_qkv_heads_device_operation.hpp b/ttnn/cpp/ttnn/operations/experimental/transformer/create_qkv_heads/device/create_qkv_heads_device_operation.hpp index 4b79deafdd2..ab01cdc282a 100644 --- a/ttnn/cpp/ttnn/operations/experimental/transformer/create_qkv_heads/device/create_qkv_heads_device_operation.hpp +++ b/ttnn/cpp/ttnn/operations/experimental/transformer/create_qkv_heads/device/create_qkv_heads_device_operation.hpp @@ -11,7 +11,7 @@ namespace ttnn::operations::experimental::transformer { -operation::ProgramWithCallbacks multi_core_create_qkv_heads_sharded( +tt::tt_metal::operation::ProgramWithCallbacks multi_core_create_qkv_heads_sharded( const Tensor& input_tensor_qkv, const uint32_t num_q_heads, const uint32_t num_kv_heads, @@ -31,7 +31,7 @@ struct CreateQKVHeadsDeviceOperation { const std::vector& input_tensors, const std::vector>& output_tensors) const; std::vector create_output_tensors( const std::vector& input_tensors, 
const std::vector>& output_tensors) const; - operation::ProgramWithCallbacks create_program( + tt::tt_metal::operation::ProgramWithCallbacks create_program( const std::vector& input_tensors, std::vector& output_tensors) const; }; } // namespace ttnn::operations::experimental::transformer diff --git a/ttnn/cpp/ttnn/operations/experimental/transformer/create_qkv_heads/device/create_qkv_heads_program_factory.cpp b/ttnn/cpp/ttnn/operations/experimental/transformer/create_qkv_heads/device/create_qkv_heads_program_factory.cpp index 759fac680c8..d049e18ad79 100644 --- a/ttnn/cpp/ttnn/operations/experimental/transformer/create_qkv_heads/device/create_qkv_heads_program_factory.cpp +++ b/ttnn/cpp/ttnn/operations/experimental/transformer/create_qkv_heads/device/create_qkv_heads_program_factory.cpp @@ -12,7 +12,7 @@ using namespace tt; namespace ttnn::operations::experimental::transformer { -static inline operation::ProgramWithCallbacks create_heads_combined_qkv_sharded( +static inline tt::tt_metal::operation::ProgramWithCallbacks create_heads_combined_qkv_sharded( const Tensor& input_tensor, const std::vector&& heads_per_group, const uint32_t head_dim, @@ -105,7 +105,7 @@ static inline operation::ProgramWithCallbacks create_heads_combined_qkv_sharded( num_tiles_per_group.push_back(heads * head_dim / TILE_WIDTH); } - Program program = CreateProgram(); + Program program = tt::tt_metal::CreateProgram(); std::vector reader_compile_time_args = { (std::uint32_t)heads_per_group[0], // q heads in group @@ -159,30 +159,30 @@ static inline operation::ProgramWithCallbacks create_heads_combined_qkv_sharded( uint32_t v_size = block_ht * num_tiles_per_group[2] * single_tile_size * groups_per_block; // qkv tensor - auto c_in0_config = CircularBufferConfig(input_size, {{CBIndex::c_0, data_format}}) + auto c_in0_config = tt::tt_metal::CircularBufferConfig(input_size, {{CBIndex::c_0, data_format}}) .set_page_size(CBIndex::c_0, single_tile_size) 
.set_globally_allocated_address(*input_tensor.buffer()); auto cb_in0_id = CreateCircularBuffer(program, all_cores, c_in0_config); // q sharded - auto c_out0_config = CircularBufferConfig(q_size, {{CBIndex::c_16, data_format}}) + auto c_out0_config = tt::tt_metal::CircularBufferConfig(q_size, {{CBIndex::c_16, data_format}}) .set_page_size(CBIndex::c_16, single_tile_size) .set_globally_allocated_address(*output[0].buffer()); auto cb_out0_id = CreateCircularBuffer(program, all_cores, c_out0_config); // k sharded - auto c_out1_config = CircularBufferConfig(k_size, {{CBIndex::c_17, data_format}}) + auto c_out1_config = tt::tt_metal::CircularBufferConfig(k_size, {{CBIndex::c_17, data_format}}) .set_page_size(CBIndex::c_17, single_tile_size) .set_globally_allocated_address(*output[1].buffer()); auto cb_out1_id = CreateCircularBuffer(program, all_cores, c_out1_config); // v sharded - auto c_out2_config = CircularBufferConfig(v_size, {{CBIndex::c_18, data_format}}) + auto c_out2_config = tt::tt_metal::CircularBufferConfig(v_size, {{CBIndex::c_18, data_format}}) .set_page_size(CBIndex::c_18, single_tile_size) .set_globally_allocated_address(*output[2].buffer()); auto cb_out2_id = CreateCircularBuffer(program, all_cores, c_out2_config); if (transpose_k) { - auto c_im0_config = - CircularBufferConfig(k_size, {{CBIndex::c_24, data_format}}).set_page_size(CBIndex::c_24, single_tile_size); + auto c_im0_config = tt::tt_metal::CircularBufferConfig(k_size, {{CBIndex::c_24, data_format}}) + .set_page_size(CBIndex::c_24, single_tile_size); auto cb_im0_id = CreateCircularBuffer(program, all_cores, c_im0_config); } @@ -235,7 +235,7 @@ static inline operation::ProgramWithCallbacks create_heads_combined_qkv_sharded( * * Combined batch/sequence sharding is possible too...that may best be left as an extension */ -operation::ProgramWithCallbacks multi_core_create_qkv_heads_sharded( +tt::tt_metal::operation::ProgramWithCallbacks multi_core_create_qkv_heads_sharded( const Tensor& 
input_tensor_qkv, const uint32_t num_q_heads, const uint32_t num_kv_heads, diff --git a/ttnn/cpp/ttnn/operations/experimental/transformer/create_qkv_heads_from_separate_tensors/create_qkv_heads_from_separate_tensors.cpp b/ttnn/cpp/ttnn/operations/experimental/transformer/create_qkv_heads_from_separate_tensors/create_qkv_heads_from_separate_tensors.cpp index bc78ada8d7b..5746c9d7f97 100644 --- a/ttnn/cpp/ttnn/operations/experimental/transformer/create_qkv_heads_from_separate_tensors/create_qkv_heads_from_separate_tensors.cpp +++ b/ttnn/cpp/ttnn/operations/experimental/transformer/create_qkv_heads_from_separate_tensors/create_qkv_heads_from_separate_tensors.cpp @@ -34,7 +34,7 @@ std::tuple CreateQKVHeadsSeparateTenso } else { optional_outputs = {}; } - auto output_tensors = operation::run( + auto output_tensors = tt::tt_metal::operation::run( CreateQKVHeadsSeparateTensorsDeviceOperation{ num_q_heads, num_kv_heads_val, diff --git a/ttnn/cpp/ttnn/operations/experimental/transformer/create_qkv_heads_from_separate_tensors/device/create_qkv_heads_from_separate_tensors_device_operation.cpp b/ttnn/cpp/ttnn/operations/experimental/transformer/create_qkv_heads_from_separate_tensors/device/create_qkv_heads_from_separate_tensors_device_operation.cpp index 7e9f0743e69..c3ffba46281 100644 --- a/ttnn/cpp/ttnn/operations/experimental/transformer/create_qkv_heads_from_separate_tensors/device/create_qkv_heads_from_separate_tensors_device_operation.cpp +++ b/ttnn/cpp/ttnn/operations/experimental/transformer/create_qkv_heads_from_separate_tensors/device/create_qkv_heads_from_separate_tensors_device_operation.cpp @@ -161,9 +161,9 @@ std::vector CreateQKVHeadsSeparateTensorsDeviceOperation::comp uint32_t v_shard_h = v_shape[0] * v_shape[1] * v_shape[2] / num_cores; // want the API to work for different sequence lengths - auto q_spec = ShardSpec(all_cores, {q_shard_h, q_shape[-1]}, shard_orientation); - auto k_spec = ShardSpec(all_cores, {k_shard_h, k_shape[-1]}, shard_orientation); - 
auto v_spec = ShardSpec(all_cores, {v_shard_h, v_shape[-1]}, shard_orientation); + auto q_spec = tt::tt_metal::ShardSpec(all_cores, {q_shard_h, q_shape[-1]}, shard_orientation); + auto k_spec = tt::tt_metal::ShardSpec(all_cores, {k_shard_h, k_shape[-1]}, shard_orientation); + auto v_spec = tt::tt_metal::ShardSpec(all_cores, {v_shard_h, v_shape[-1]}, shard_orientation); // create sharded tensors auto mem_config_q = this->output_mem_config; mem_config_q.shard_spec = q_spec; @@ -174,16 +174,19 @@ std::vector CreateQKVHeadsSeparateTensorsDeviceOperation::comp auto mem_config_v = this->output_mem_config; mem_config_v.shard_spec = v_spec; - auto out_tensor_q = - TensorSpec(q_shape, TensorLayout(input_tensor.get_dtype(), PageConfig(Layout::TILE), mem_config_q)); - auto out_tensor_k = - TensorSpec(k_shape, TensorLayout(input_tensor.get_dtype(), PageConfig(Layout::TILE), mem_config_k)); - auto out_tensor_v = - TensorSpec(v_shape, TensorLayout(input_tensor.get_dtype(), PageConfig(Layout::TILE), mem_config_v)); + auto out_tensor_q = TensorSpec( + q_shape, + tt::tt_metal::TensorLayout(input_tensor.get_dtype(), tt::tt_metal::PageConfig(Layout::TILE), mem_config_q)); + auto out_tensor_k = TensorSpec( + k_shape, + tt::tt_metal::TensorLayout(input_tensor.get_dtype(), tt::tt_metal::PageConfig(Layout::TILE), mem_config_k)); + auto out_tensor_v = TensorSpec( + v_shape, + tt::tt_metal::TensorLayout(input_tensor.get_dtype(), tt::tt_metal::PageConfig(Layout::TILE), mem_config_v)); return {out_tensor_q, out_tensor_k, out_tensor_v}; } -operation::ProgramWithCallbacks CreateQKVHeadsSeparateTensorsDeviceOperation::create_program( +tt::tt_metal::operation::ProgramWithCallbacks CreateQKVHeadsSeparateTensorsDeviceOperation::create_program( const std::vector& input_tensors, std::vector& output_tensors) const { const auto& input_tensor_q = input_tensors.at(0); const auto& input_tensor_kv = input_tensors.at(1); diff --git 
a/ttnn/cpp/ttnn/operations/experimental/transformer/create_qkv_heads_from_separate_tensors/device/create_qkv_heads_from_separate_tensors_device_operation.hpp b/ttnn/cpp/ttnn/operations/experimental/transformer/create_qkv_heads_from_separate_tensors/device/create_qkv_heads_from_separate_tensors_device_operation.hpp index 9321193353d..44b880f56c6 100644 --- a/ttnn/cpp/ttnn/operations/experimental/transformer/create_qkv_heads_from_separate_tensors/device/create_qkv_heads_from_separate_tensors_device_operation.hpp +++ b/ttnn/cpp/ttnn/operations/experimental/transformer/create_qkv_heads_from_separate_tensors/device/create_qkv_heads_from_separate_tensors_device_operation.hpp @@ -11,7 +11,7 @@ namespace ttnn::operations::experimental::transformer { -operation::ProgramWithCallbacks multi_core_create_q_and_kv_heads_sharded( +tt::tt_metal::operation::ProgramWithCallbacks multi_core_create_q_and_kv_heads_sharded( const Tensor& input_tensor_q, const Tensor& input_tensor_kv, const uint32_t num_q_heads, @@ -29,7 +29,7 @@ struct CreateQKVHeadsSeparateTensorsDeviceOperation { MemoryConfig output_mem_config; void validate(const std::vector& input_tensors) const; std::vector compute_output_specs(const std::vector& input_tensors) const; - operation::ProgramWithCallbacks create_program( + tt::tt_metal::operation::ProgramWithCallbacks create_program( const std::vector& input_tensors, std::vector& output_tensors) const; }; } // namespace ttnn::operations::experimental::transformer diff --git a/ttnn/cpp/ttnn/operations/experimental/transformer/create_qkv_heads_from_separate_tensors/device/create_qkv_heads_from_separate_tensors_program_factory.cpp b/ttnn/cpp/ttnn/operations/experimental/transformer/create_qkv_heads_from_separate_tensors/device/create_qkv_heads_from_separate_tensors_program_factory.cpp index dac5820e591..08079ad6baf 100644 --- 
a/ttnn/cpp/ttnn/operations/experimental/transformer/create_qkv_heads_from_separate_tensors/device/create_qkv_heads_from_separate_tensors_program_factory.cpp +++ b/ttnn/cpp/ttnn/operations/experimental/transformer/create_qkv_heads_from_separate_tensors/device/create_qkv_heads_from_separate_tensors_program_factory.cpp @@ -7,12 +7,12 @@ #include #include +namespace ttnn::operations::experimental::transformer { + using namespace tt::constants; using namespace tt; -namespace ttnn::operations::experimental::transformer { - -static inline operation::ProgramWithCallbacks create_qkv_separate( +static inline tt::tt_metal::operation::ProgramWithCallbacks create_qkv_separate( const Tensor& input_tensor_q, const Tensor& input_tensor_kv, const uint32_t num_q_heads, @@ -49,7 +49,7 @@ static inline operation::ProgramWithCallbacks create_qkv_separate( uint32_t q_heads_per_core = num_q_heads / num_w_cores; uint32_t k_heads_per_core = num_kv_heads / num_w_cores; - Program program = CreateProgram(); + Program program = tt::tt_metal::CreateProgram(); std::vector reader_compile_time_args = { (std::uint32_t)q_shard_ht, (std::uint32_t)q_shard_wt, @@ -89,34 +89,34 @@ static inline operation::ProgramWithCallbacks create_qkv_separate( uint32_t kv_size = 2 * k_size; // qkv tensor - auto c_in0_config = CircularBufferConfig(q_size, {{CBIndex::c_0, q_data_format}}) + auto c_in0_config = tt::tt_metal::CircularBufferConfig(q_size, {{CBIndex::c_0, q_data_format}}) .set_page_size(CBIndex::c_0, single_tile_size) .set_globally_allocated_address(*input_tensor_q.buffer()); auto cb_in0_id = CreateCircularBuffer(program, all_cores, c_in0_config); - auto c_in1_config = CircularBufferConfig(kv_size, {{CBIndex::c_1, kv_data_format}}) + auto c_in1_config = tt::tt_metal::CircularBufferConfig(kv_size, {{CBIndex::c_1, kv_data_format}}) .set_page_size(CBIndex::c_1, single_tile_size) .set_globally_allocated_address(*input_tensor_kv.buffer()); auto cb_in1_id = CreateCircularBuffer(program, all_cores, c_in1_config); 
// q sharded - auto c_out0_config = CircularBufferConfig(q_size, {{CBIndex::c_16, q_data_format}}) + auto c_out0_config = tt::tt_metal::CircularBufferConfig(q_size, {{CBIndex::c_16, q_data_format}}) .set_page_size(CBIndex::c_16, single_tile_size) .set_globally_allocated_address(*output[0].buffer()); auto cb_out0_id = CreateCircularBuffer(program, all_cores, c_out0_config); // k sharded - auto c_out1_config = CircularBufferConfig(k_size, {{CBIndex::c_17, kv_data_format}}) + auto c_out1_config = tt::tt_metal::CircularBufferConfig(k_size, {{CBIndex::c_17, kv_data_format}}) .set_page_size(CBIndex::c_17, single_tile_size) .set_globally_allocated_address(*output[1].buffer()); auto cb_out1_id = CreateCircularBuffer(program, all_cores, c_out1_config); // v sharded - auto c_out2_config = CircularBufferConfig(v_size, {{CBIndex::c_18, kv_data_format}}) + auto c_out2_config = tt::tt_metal::CircularBufferConfig(v_size, {{CBIndex::c_18, kv_data_format}}) .set_page_size(CBIndex::c_18, single_tile_size) .set_globally_allocated_address(*output[2].buffer()); auto cb_out2_id = CreateCircularBuffer(program, all_cores, c_out2_config); if (transpose_k) { - auto c_im0_config = CircularBufferConfig(k_size, {{CBIndex::c_24, kv_data_format}}) + auto c_im0_config = tt::tt_metal::CircularBufferConfig(k_size, {{CBIndex::c_24, kv_data_format}}) .set_page_size(CBIndex::c_24, single_tile_size); auto cb_im0_id = CreateCircularBuffer(program, all_cores, c_im0_config); } @@ -172,7 +172,7 @@ static inline operation::ProgramWithCallbacks create_qkv_separate( * * Combined batch/sequence sharding is possible too...that may best be left as an extension */ -operation::ProgramWithCallbacks multi_core_create_q_and_kv_heads_sharded( +tt::tt_metal::operation::ProgramWithCallbacks multi_core_create_q_and_kv_heads_sharded( const Tensor& input_tensor_q, const Tensor& input_tensor_kv, const uint32_t num_q_heads, diff --git 
a/ttnn/cpp/ttnn/operations/experimental/transformer/nlp_concat_heads/device/nlp_concat_heads_device_operation.cpp b/ttnn/cpp/ttnn/operations/experimental/transformer/nlp_concat_heads/device/nlp_concat_heads_device_operation.cpp index 1f0b2829c9e..36fbed5507d 100644 --- a/ttnn/cpp/ttnn/operations/experimental/transformer/nlp_concat_heads/device/nlp_concat_heads_device_operation.cpp +++ b/ttnn/cpp/ttnn/operations/experimental/transformer/nlp_concat_heads/device/nlp_concat_heads_device_operation.cpp @@ -4,8 +4,6 @@ #include "nlp_concat_heads_device_operation.hpp" -using namespace tt::tt_metal; - namespace ttnn::operations::experimental::transformer { // Generic NLP ConcatHeads op @@ -13,26 +11,27 @@ void NLPConcatHeadsDeviceOperation::validate(const std::vector& input_te const auto& input_tensor = input_tensors.at(0); const auto input_shape = input_tensor.get_padded_shape(); - TT_FATAL(input_tensor.storage_type() == StorageType::DEVICE, "Operands to TM need to be on device!"); + TT_FATAL(input_tensor.storage_type() == tt::tt_metal::StorageType::DEVICE, "Operands to TM need to be on device!"); TT_FATAL(input_tensor.buffer() != nullptr, "Operands to TM need to be allocated in buffers on device!"); TT_FATAL( input_tensor.get_dtype() == tt::tt_metal::DataType::FLOAT32 || input_tensor.get_dtype() == tt::tt_metal::DataType::BFLOAT16 || input_tensor.get_dtype() == tt::tt_metal::DataType::BFLOAT8_B, "Unsupported data format"); - TT_FATAL(input_tensor.get_layout() == Layout::TILE, "Error"); + TT_FATAL(input_tensor.get_layout() == tt::tt_metal::Layout::TILE, "Error"); if (input_tensor.is_sharded()) { - TT_FATAL(input_tensor.memory_config().memory_layout != TensorMemoryLayout::WIDTH_SHARDED, "Error"); + TT_FATAL( + input_tensor.memory_config().memory_layout != tt::tt_metal::TensorMemoryLayout::WIDTH_SHARDED, "Error"); auto shard_spec = input_tensor.shard_spec().value(); TT_FATAL(shard_spec.shape[1] == input_tensor.get_padded_shape()[-1], "Error"); TT_FATAL(shard_spec.shape[0] % 
input_tensor.get_padded_shape()[-2] == 0, "Error"); TT_FATAL( input_tensor.get_padded_shape()[1] % (shard_spec.shape[0] / input_tensor.get_padded_shape()[-2]) == 0, "Error"); - TT_FATAL(this->output_mem_config.memory_layout != TensorMemoryLayout::HEIGHT_SHARDED, "Error"); + TT_FATAL(this->output_mem_config.memory_layout != tt::tt_metal::TensorMemoryLayout::HEIGHT_SHARDED, "Error"); } else { - TT_FATAL(this->output_mem_config.memory_layout == TensorMemoryLayout::INTERLEAVED, "Error"); + TT_FATAL(this->output_mem_config.memory_layout == tt::tt_metal::TensorMemoryLayout::INTERLEAVED, "Error"); } } @@ -50,19 +49,24 @@ std::vector NLPConcatHeadsDeviceOperation::compute_output_spec Shape output_shape({input_shape[0], 1, sequence_length, hidden_dim}); if (this->output_mem_config.is_sharded()) { - ShardSpec shard_spec = input_tensor.shard_spec().value(); + tt::tt_metal::ShardSpec shard_spec = input_tensor.shard_spec().value(); uint32_t heads_per_shard = shard_spec.shape[0] / input_tensor.get_padded_shape()[-2]; shard_spec.shape = {shard_spec.shape[0] / heads_per_shard, shard_spec.shape[1] * heads_per_shard}; auto mem_config = this->output_mem_config; mem_config.shard_spec = shard_spec; - return {TensorSpec(output_shape, TensorLayout(input_tensor.get_dtype(), PageConfig(Layout::TILE), mem_config))}; + return {TensorSpec( + output_shape, + tt::tt_metal::TensorLayout( + input_tensor.get_dtype(), tt::tt_metal::PageConfig(tt::tt_metal::Layout::TILE), mem_config))}; } - return { - TensorSpec(output_shape, TensorLayout(input_tensor.get_dtype(), PageConfig(Layout::TILE), output_mem_config))}; + return {TensorSpec( + output_shape, + tt::tt_metal::TensorLayout( + input_tensor.get_dtype(), tt::tt_metal::PageConfig(tt::tt_metal::Layout::TILE), output_mem_config))}; } -operation::ProgramWithCallbacks NLPConcatHeadsDeviceOperation::create_program( +tt::tt_metal::operation::ProgramWithCallbacks NLPConcatHeadsDeviceOperation::create_program( const std::vector& input_tensors, std::vector& 
output_tensors) const { const auto& input_tensor = input_tensors.at(0); auto& output_tensor = output_tensors.at(0); diff --git a/ttnn/cpp/ttnn/operations/experimental/transformer/nlp_concat_heads/device/nlp_concat_heads_device_operation.hpp b/ttnn/cpp/ttnn/operations/experimental/transformer/nlp_concat_heads/device/nlp_concat_heads_device_operation.hpp index ddfafc6d76a..26cf505a0ce 100644 --- a/ttnn/cpp/ttnn/operations/experimental/transformer/nlp_concat_heads/device/nlp_concat_heads_device_operation.hpp +++ b/ttnn/cpp/ttnn/operations/experimental/transformer/nlp_concat_heads/device/nlp_concat_heads_device_operation.hpp @@ -12,15 +12,15 @@ namespace ttnn::operations::experimental::transformer { -operation::ProgramWithCallbacks multi_core_nlp_concat_heads( +tt::tt_metal::operation::ProgramWithCallbacks multi_core_nlp_concat_heads( const Tensor& input_tensor_a, Tensor& output, CoreCoord compute_with_storage_grid_size); struct NLPConcatHeadsDeviceOperation { - MemoryConfig output_mem_config; + tt::tt_metal::MemoryConfig output_mem_config; void validate(const std::vector& input_tensors) const; std::vector compute_output_specs(const std::vector& input_tensors) const; - operation::ProgramWithCallbacks create_program( + tt::tt_metal::operation::ProgramWithCallbacks create_program( const std::vector& input_tensors, std::vector& output_tensors) const; }; diff --git a/ttnn/cpp/ttnn/operations/experimental/transformer/nlp_concat_heads/device/nlp_concat_heads_program_factory.cpp b/ttnn/cpp/ttnn/operations/experimental/transformer/nlp_concat_heads/device/nlp_concat_heads_program_factory.cpp index c4757a8f0c3..47292ba7a51 100644 --- a/ttnn/cpp/ttnn/operations/experimental/transformer/nlp_concat_heads/device/nlp_concat_heads_program_factory.cpp +++ b/ttnn/cpp/ttnn/operations/experimental/transformer/nlp_concat_heads/device/nlp_concat_heads_program_factory.cpp @@ -8,14 +8,12 @@ #include "nlp_concat_heads_device_operation.hpp" #include -using namespace tt::tt_metal; - namespace 
ttnn::operations::experimental::transformer { using namespace tt::constants; using namespace tt; -operation::ProgramWithCallbacks multi_core_nlp_concat_heads( +tt::tt_metal::operation::ProgramWithCallbacks multi_core_nlp_concat_heads( const Tensor& a, Tensor& output, CoreCoord compute_with_storage_grid_size) { const auto& ashape = a.get_padded_shape(); @@ -54,7 +52,7 @@ operation::ProgramWithCallbacks multi_core_nlp_concat_heads( core_group_1 = all_cores; num_blocks_per_core_group_1 = a.shard_spec().value().shape[0] / a.get_padded_shape()[-2]; per_tensor_tiles = a.shard_spec().value().shape[0] * a.shard_spec().value().shape[1] / TILE_HW; - row_major = a.shard_spec().value().orientation == ShardOrientation::ROW_MAJOR; + row_major = a.shard_spec().value().orientation == tt::tt_metal::ShardOrientation::ROW_MAJOR; } else { std::tie( num_cores, @@ -83,7 +81,7 @@ operation::ProgramWithCallbacks multi_core_nlp_concat_heads( bool in0_is_dram = in0_buffer->buffer_type() == tt_metal::BufferType::DRAM ? 1 : 0; bool out_is_dram = out_buffer->buffer_type() == tt_metal::BufferType::DRAM ? 
1 : 0; - KernelHandle reader_kernel_id = 0, writer_kernel_id = 0; + tt::tt_metal::KernelHandle reader_kernel_id = 0, writer_kernel_id = 0; if (in_sharded) { std::vector compile_time_args = { (std::uint32_t)src0_cb_index, @@ -134,7 +132,7 @@ operation::ProgramWithCallbacks multi_core_nlp_concat_heads( } // Create circular buffers - CBHandle cb_src0 = 0, cb_out = 0; + tt::tt_metal::CBHandle cb_src0 = 0, cb_out = 0; uint32_t cb_src0_num_tiles = per_tensor_tiles; if (!in_sharded) { cb_src0_num_tiles *= 2; // double buffer @@ -202,7 +200,7 @@ operation::ProgramWithCallbacks multi_core_nlp_concat_heads( auto override_runtime_arguments_callback = [reader_kernel_id, writer_kernel_id, cb_src0, cb_out, cores]( const void* operation, - Program& program, + tt::tt_metal::Program& program, const std::vector& input_tensors, const std::vector>&, const std::vector& output_tensors) { diff --git a/ttnn/cpp/ttnn/operations/experimental/transformer/nlp_concat_heads/nlp_concat_heads.cpp b/ttnn/cpp/ttnn/operations/experimental/transformer/nlp_concat_heads/nlp_concat_heads.cpp index 2c7b211b3c9..2816bd511c6 100644 --- a/ttnn/cpp/ttnn/operations/experimental/transformer/nlp_concat_heads/nlp_concat_heads.cpp +++ b/ttnn/cpp/ttnn/operations/experimental/transformer/nlp_concat_heads/nlp_concat_heads.cpp @@ -16,7 +16,7 @@ ttnn::Tensor NLPConcatHeadsOperation::invoke( const Tensor& input_tensor, const std::optional& memory_config, std::optional optional_output_tensor) { - return operation::run( + return tt::tt_metal::operation::run( NLPConcatHeadsDeviceOperation{memory_config.value_or(input_tensor.memory_config())}, {input_tensor}, {}, diff --git a/ttnn/cpp/ttnn/operations/experimental/transformer/nlp_concat_heads_decode/device/nlp_concat_heads_decode_device_operation.cpp b/ttnn/cpp/ttnn/operations/experimental/transformer/nlp_concat_heads_decode/device/nlp_concat_heads_decode_device_operation.cpp index 39b62aa58bd..b5cfe52e83c 100644 --- 
a/ttnn/cpp/ttnn/operations/experimental/transformer/nlp_concat_heads_decode/device/nlp_concat_heads_decode_device_operation.cpp +++ b/ttnn/cpp/ttnn/operations/experimental/transformer/nlp_concat_heads_decode/device/nlp_concat_heads_decode_device_operation.cpp @@ -5,8 +5,6 @@ #include "nlp_concat_heads_decode_device_operation.hpp" #include -using namespace tt::tt_metal; - namespace ttnn::operations::experimental::transformer { // NLP ConcatHeads op for decode @@ -15,13 +13,13 @@ void NLPConcatHeadsDecodeDeviceOperation::validate(const std::vector& in const auto input_shape = input_tensor.get_padded_shape(); // input tensor and shape - TT_FATAL(input_tensor.storage_type() == StorageType::DEVICE, "Operands to TM need to be on device!"); + TT_FATAL(input_tensor.storage_type() == tt::tt_metal::StorageType::DEVICE, "Operands to TM need to be on device!"); TT_FATAL(input_tensor.buffer() != nullptr, "Operands to TM need to be allocated in buffers on device!"); TT_FATAL( input_tensor.get_dtype() == tt::tt_metal::DataType::FLOAT32 || input_tensor.get_dtype() == tt::tt_metal::DataType::BFLOAT16, "Unsupported data format"); - TT_FATAL(input_tensor.get_layout() == Layout::TILE, "Error"); + TT_FATAL(input_tensor.get_layout() == tt::tt_metal::Layout::TILE, "Error"); TT_FATAL(input_shape[0] == 1, "seqlen=1 for decode"); TT_FATAL(input_shape[1] <= 32, "currently only support less than 32 users"); TT_FATAL(input_shape[2] == 32, "currently only support 32 padded heads"); @@ -29,7 +27,7 @@ void NLPConcatHeadsDecodeDeviceOperation::validate(const std::vector& in // input tensor shard spec TT_FATAL(input_tensor.is_sharded(), "Error"); - TT_FATAL(input_tensor.memory_config().memory_layout == TensorMemoryLayout::HEIGHT_SHARDED, "Error"); + TT_FATAL(input_tensor.memory_config().memory_layout == tt::tt_metal::TensorMemoryLayout::HEIGHT_SHARDED, "Error"); auto shard_spec = input_tensor.shard_spec().value(); TT_FATAL(shard_spec.shape[1] == input_tensor.get_padded_shape()[-1], "Error"); 
TT_FATAL(shard_spec.shape[0] == input_tensor.get_padded_shape()[-2], "Error"); @@ -63,20 +61,23 @@ std::vector NLPConcatHeadsDecodeDeviceOperation::compute_outpu const auto input_core_ranges = input_tensor.shard_spec().value().grid.ranges(); CoreRangeSet input_core_grid = input_tensor.shard_spec().value().grid; const auto start_coord = input_core_ranges[0].start_coord; - output_core_grid = num_cores_to_corerangeset_in_subcoregrids(start_coord, num_heads, input_core_grid, true); - } else { output_core_grid = - num_cores_to_corerangeset(num_heads, input_tensor.device()->compute_with_storage_grid_size(), true); + tt::tt_metal::num_cores_to_corerangeset_in_subcoregrids(start_coord, num_heads, input_core_grid, true); + } else { + output_core_grid = tt::tt_metal::num_cores_to_corerangeset( + num_heads, input_tensor.device()->compute_with_storage_grid_size(), true); } - ShardSpec shard_spec{output_core_grid, {batch, head_dim}}; - auto mem_config = tt::tt_metal::MemoryConfig{TensorMemoryLayout::WIDTH_SHARDED, BufferType::L1}; + tt::tt_metal::ShardSpec shard_spec{output_core_grid, {batch, head_dim}}; + auto mem_config = + tt::tt_metal::MemoryConfig{tt::tt_metal::TensorMemoryLayout::WIDTH_SHARDED, tt::tt_metal::BufferType::L1}; mem_config.shard_spec = shard_spec; - return {TensorSpec(output_shape, TensorLayout(input_tensor.get_dtype(), Layout::TILE, mem_config))}; + return {TensorSpec( + output_shape, tt::tt_metal::TensorLayout(input_tensor.get_dtype(), tt::tt_metal::Layout::TILE, mem_config))}; } -operation::ProgramWithCallbacks NLPConcatHeadsDecodeDeviceOperation::create_program( +tt::tt_metal::operation::ProgramWithCallbacks NLPConcatHeadsDecodeDeviceOperation::create_program( const std::vector& input_tensors, std::vector& output_tensors) const { const auto& input_tensor = input_tensors.at(0); auto& output_tensor = output_tensors.at(0); diff --git 
a/ttnn/cpp/ttnn/operations/experimental/transformer/nlp_concat_heads_decode/device/nlp_concat_heads_decode_device_operation.hpp b/ttnn/cpp/ttnn/operations/experimental/transformer/nlp_concat_heads_decode/device/nlp_concat_heads_decode_device_operation.hpp index 6f928ceee9d..b35948815a4 100644 --- a/ttnn/cpp/ttnn/operations/experimental/transformer/nlp_concat_heads_decode/device/nlp_concat_heads_decode_device_operation.hpp +++ b/ttnn/cpp/ttnn/operations/experimental/transformer/nlp_concat_heads_decode/device/nlp_concat_heads_decode_device_operation.hpp @@ -12,10 +12,10 @@ namespace ttnn::operations::experimental::transformer { -operation::ProgramWithCallbacks multi_core_nlp_concat_heads_decode( +tt::tt_metal::operation::ProgramWithCallbacks multi_core_nlp_concat_heads_decode( const Tensor& input_tensor, Tensor& output, CoreCoord compute_with_storage_grid_size); -operation::ProgramWithCallbacks multi_core_nlp_concat_heads_decode_subcoregrids( +tt::tt_metal::operation::ProgramWithCallbacks multi_core_nlp_concat_heads_decode_subcoregrids( const Tensor& input_tensor, Tensor& output, CoreCoord compute_with_storage_grid_size); struct NLPConcatHeadsDecodeDeviceOperation { @@ -24,7 +24,7 @@ struct NLPConcatHeadsDecodeDeviceOperation { void validate(const std::vector& input_tensors) const; std::vector compute_output_specs(const std::vector& input_tensors) const; - operation::ProgramWithCallbacks create_program( + tt::tt_metal::operation::ProgramWithCallbacks create_program( const std::vector& input_tensors, std::vector& output_tensors) const; }; diff --git a/ttnn/cpp/ttnn/operations/experimental/transformer/nlp_concat_heads_decode/device/nlp_concat_heads_decode_program_factory.cpp b/ttnn/cpp/ttnn/operations/experimental/transformer/nlp_concat_heads_decode/device/nlp_concat_heads_decode_program_factory.cpp index 367e56a1fcb..981621aaa41 100644 --- a/ttnn/cpp/ttnn/operations/experimental/transformer/nlp_concat_heads_decode/device/nlp_concat_heads_decode_program_factory.cpp +++ 
b/ttnn/cpp/ttnn/operations/experimental/transformer/nlp_concat_heads_decode/device/nlp_concat_heads_decode_program_factory.cpp @@ -12,9 +12,8 @@ namespace ttnn::operations::experimental::transformer { using namespace tt; using namespace tt::constants; -using namespace tt::tt_metal; -operation::ProgramWithCallbacks multi_core_nlp_concat_heads_decode( +tt::tt_metal::operation::ProgramWithCallbacks multi_core_nlp_concat_heads_decode( const Tensor& input_tensor, Tensor& output, CoreCoord compute_with_storage_grid_size) { tt_metal::Program program = tt_metal::CreateProgram(); @@ -120,7 +119,7 @@ operation::ProgramWithCallbacks multi_core_nlp_concat_heads_decode( auto override_runtime_arguments_callback = [reader_kernel_id, writer_kernel_id, num_cores, cb_q_output, cores, element_size, sub_tile_line_bytes]( const void* operation, - Program& program, + tt::tt_metal::Program& program, const std::vector& input_tensors, const std::vector>& optional_input_tensors, const std::vector& output_tensors) { @@ -150,7 +149,8 @@ operation::ProgramWithCallbacks multi_core_nlp_concat_heads_decode( return {.program = std::move(program), .override_runtime_arguments_callback = override_runtime_arguments_callback}; } -operation::ProgramWithCallbacks multi_core_nlp_concat_heads_decode_subcoregrids( + +tt::tt_metal::operation::ProgramWithCallbacks multi_core_nlp_concat_heads_decode_subcoregrids( const Tensor& input_tensor, Tensor& output, CoreCoord compute_with_storage_grid_size) { tt_metal::Program program = tt_metal::CreateProgram(); @@ -268,7 +268,7 @@ operation::ProgramWithCallbacks multi_core_nlp_concat_heads_decode_subcoregrids( face_h, tile_w]( const void* operation, - Program& program, + tt::tt_metal::Program& program, const std::vector& input_tensors, const std::vector>& optional_input_tensors, const std::vector& output_tensors) { diff --git a/ttnn/cpp/ttnn/operations/experimental/transformer/nlp_concat_heads_decode/nlp_concat_heads_decode.cpp 
b/ttnn/cpp/ttnn/operations/experimental/transformer/nlp_concat_heads_decode/nlp_concat_heads_decode.cpp index 1c58a79db04..fd0c511cfd0 100644 --- a/ttnn/cpp/ttnn/operations/experimental/transformer/nlp_concat_heads_decode/nlp_concat_heads_decode.cpp +++ b/ttnn/cpp/ttnn/operations/experimental/transformer/nlp_concat_heads_decode/nlp_concat_heads_decode.cpp @@ -24,7 +24,7 @@ ttnn::Tensor NLPConcatHeadsDecodeOperation::invoke( on_subcoregrids = true; } } - return operation::run( + return tt::tt_metal::operation::run( NLPConcatHeadsDecodeDeviceOperation{num_heads, on_subcoregrids}, {input_tensor}, {}, diff --git a/ttnn/cpp/ttnn/operations/experimental/transformer/nlp_create_qkv_heads/device/nlp_create_qkv_heads_device_operation.cpp b/ttnn/cpp/ttnn/operations/experimental/transformer/nlp_create_qkv_heads/device/nlp_create_qkv_heads_device_operation.cpp index e0230b4e1cc..0c9c9e72b20 100644 --- a/ttnn/cpp/ttnn/operations/experimental/transformer/nlp_create_qkv_heads/device/nlp_create_qkv_heads_device_operation.cpp +++ b/ttnn/cpp/ttnn/operations/experimental/transformer/nlp_create_qkv_heads/device/nlp_create_qkv_heads_device_operation.cpp @@ -137,36 +137,49 @@ NlpCreateHeadsDeviceOperation::spec_return_value_t NlpCreateHeadsDeviceOperation if (operation_attributes.output_mem_config.is_sharded()) { auto core_grid = input_tensor.device()->compute_with_storage_grid_size(); - auto q_shard_grid = num_cores_to_corerangeset(operation_attributes.num_q_heads, core_grid, true); - ShardSpec q_shard_spec{q_shard_grid, {TILE_HEIGHT, operation_attributes.head_dim}}; + auto q_shard_grid = tt::tt_metal::num_cores_to_corerangeset(operation_attributes.num_q_heads, core_grid, true); + tt::tt_metal::ShardSpec q_shard_spec{q_shard_grid, {TILE_HEIGHT, operation_attributes.head_dim}}; auto q_mem_config = operation_attributes.output_mem_config; q_mem_config.shard_spec = q_shard_spec; - auto kv_shard_grid = num_cores_to_corerangeset(operation_attributes.num_kv_heads, core_grid, true); - ShardSpec 
kv_shard_spec{kv_shard_grid, {TILE_HEIGHT, operation_attributes.head_dim}}; + auto kv_shard_grid = + tt::tt_metal::num_cores_to_corerangeset(operation_attributes.num_kv_heads, core_grid, true); + tt::tt_metal::ShardSpec kv_shard_spec{kv_shard_grid, {TILE_HEIGHT, operation_attributes.head_dim}}; auto kv_mem_config = operation_attributes.output_mem_config; kv_mem_config.shard_spec = kv_shard_spec; return { TensorSpec( q_output_shape, - TensorLayout(input_tensor.get_dtype(), PageConfig(input_tensor.get_layout()), q_mem_config)), + tt::tt_metal::TensorLayout( + input_tensor.get_dtype(), tt::tt_metal::PageConfig(input_tensor.get_layout()), q_mem_config)), TensorSpec( k_output_shape, - TensorLayout(input_tensor.get_dtype(), PageConfig(input_tensor.get_layout()), kv_mem_config)), + tt::tt_metal::TensorLayout( + input_tensor.get_dtype(), tt::tt_metal::PageConfig(input_tensor.get_layout()), kv_mem_config)), TensorSpec( v_output_shape, - TensorLayout(input_tensor.get_dtype(), PageConfig(input_tensor.get_layout()), kv_mem_config))}; + tt::tt_metal::TensorLayout( + input_tensor.get_dtype(), tt::tt_metal::PageConfig(input_tensor.get_layout()), kv_mem_config))}; } return { TensorSpec( q_output_shape, - TensorLayout(input_tensor.get_dtype(), PageConfig(Layout::TILE), operation_attributes.output_mem_config)), + tt::tt_metal::TensorLayout( + input_tensor.get_dtype(), + tt::tt_metal::PageConfig(Layout::TILE), + operation_attributes.output_mem_config)), TensorSpec( k_output_shape, - TensorLayout(input_tensor.get_dtype(), PageConfig(Layout::TILE), operation_attributes.output_mem_config)), + tt::tt_metal::TensorLayout( + input_tensor.get_dtype(), + tt::tt_metal::PageConfig(Layout::TILE), + operation_attributes.output_mem_config)), TensorSpec( v_output_shape, - TensorLayout(input_tensor.get_dtype(), PageConfig(Layout::TILE), operation_attributes.output_mem_config))}; + tt::tt_metal::TensorLayout( + input_tensor.get_dtype(), + tt::tt_metal::PageConfig(Layout::TILE), + 
operation_attributes.output_mem_config))}; } NlpCreateHeadsDeviceOperation::tensor_return_value_t NlpCreateHeadsDeviceOperation::create_output_tensors( diff --git a/ttnn/cpp/ttnn/operations/experimental/transformer/nlp_create_qkv_heads/device/nlp_create_qkv_heads_device_operation.hpp b/ttnn/cpp/ttnn/operations/experimental/transformer/nlp_create_qkv_heads/device/nlp_create_qkv_heads_device_operation.hpp index 86cfc534c5e..0818e2a485a 100644 --- a/ttnn/cpp/ttnn/operations/experimental/transformer/nlp_create_qkv_heads/device/nlp_create_qkv_heads_device_operation.hpp +++ b/ttnn/cpp/ttnn/operations/experimental/transformer/nlp_create_qkv_heads/device/nlp_create_qkv_heads_device_operation.hpp @@ -36,8 +36,8 @@ struct NlpCreateHeadsDeviceOperation { struct Interleaved { struct shared_variables_t { - KernelHandle reader_kernel_id; - KernelHandle writer_kernel_id; + tt::tt_metal::KernelHandle reader_kernel_id; + tt::tt_metal::KernelHandle writer_kernel_id; std::size_t num_cores; std::size_t num_cores_y; bool read_from_input_tensor_kv; @@ -59,14 +59,14 @@ struct NlpCreateHeadsDeviceOperation { struct Sharded { struct shared_variables_t { - KernelHandle reader_kernel_id; - KernelHandle writer_kernel_id; + tt::tt_metal::KernelHandle reader_kernel_id; + tt::tt_metal::KernelHandle writer_kernel_id; std::size_t num_cores; std::size_t num_cores_y; bool read_from_input_tensor_kv; - CBHandle cb_q_output; - CBHandle cb_k_output; - CBHandle cb_v_output; + tt::tt_metal::CBHandle cb_q_output; + tt::tt_metal::CBHandle cb_k_output; + tt::tt_metal::CBHandle cb_v_output; std::vector cores; uint32_t head_size; uint32_t per_risc0_out_q_heads; diff --git a/ttnn/cpp/ttnn/operations/experimental/transformer/nlp_create_qkv_heads/device/nlp_create_qkv_heads_program_factory.cpp b/ttnn/cpp/ttnn/operations/experimental/transformer/nlp_create_qkv_heads/device/nlp_create_qkv_heads_program_factory.cpp index f7b0ce780b7..6c4cbbfee07 100644 --- 
a/ttnn/cpp/ttnn/operations/experimental/transformer/nlp_create_qkv_heads/device/nlp_create_qkv_heads_program_factory.cpp +++ b/ttnn/cpp/ttnn/operations/experimental/transformer/nlp_create_qkv_heads/device/nlp_create_qkv_heads_program_factory.cpp @@ -12,7 +12,6 @@ namespace ttnn::operations::experimental::transformer { using namespace tt::constants; using namespace tt; -using namespace tt::tt_metal; NlpCreateHeadsDeviceOperation::Interleaved::cached_program_t NlpCreateHeadsDeviceOperation::Interleaved::create( const operation_attributes_t& operation_attributes, diff --git a/ttnn/cpp/ttnn/operations/experimental/transformer/nlp_create_qkv_heads_decode/device/nlp_create_qkv_heads_decode_device_operation.cpp b/ttnn/cpp/ttnn/operations/experimental/transformer/nlp_create_qkv_heads_decode/device/nlp_create_qkv_heads_decode_device_operation.cpp index a39d7c03160..1f3e8564566 100644 --- a/ttnn/cpp/ttnn/operations/experimental/transformer/nlp_create_qkv_heads_decode/device/nlp_create_qkv_heads_decode_device_operation.cpp +++ b/ttnn/cpp/ttnn/operations/experimental/transformer/nlp_create_qkv_heads_decode/device/nlp_create_qkv_heads_decode_device_operation.cpp @@ -129,50 +129,55 @@ std::vector NLPCreateHeadsDecodeDeviceOperation::compute_outpu CoreRangeSet q_shard_grid, k_shard_grid, v_shard_grid; if (!this->input_on_subcoregrids) { auto core_grid = input_tensor.device()->compute_with_storage_grid_size(); - q_shard_grid = num_cores_to_corerangeset(batch, core_grid, true); + q_shard_grid = tt::tt_metal::num_cores_to_corerangeset(batch, core_grid, true); if (this->overlap_qk_coregrid) { k_shard_grid = q_shard_grid; } else { - k_shard_grid = - num_cores_to_corerangeset(CoreCoord{batch % core_grid.x, batch / core_grid.x}, batch, core_grid, true); + k_shard_grid = tt::tt_metal::num_cores_to_corerangeset( + CoreCoord{batch % core_grid.x, batch / core_grid.x}, batch, core_grid, true); } v_shard_grid = q_shard_grid; } else { auto input_core_grid = 
input_tensor.shard_spec().value().grid; auto start_core_coord = input_core_grid.bounding_box().start_coord; - q_shard_grid = num_cores_to_corerangeset_in_subcoregrids(start_core_coord, batch, input_core_grid, true); + q_shard_grid = + tt::tt_metal::num_cores_to_corerangeset_in_subcoregrids(start_core_coord, batch, input_core_grid, true); if (this->overlap_qk_coregrid) { k_shard_grid = q_shard_grid; } else { - CoreRangeSet q_plus_one_grid = - num_cores_to_corerangeset_in_subcoregrids(start_core_coord, batch + 1, input_core_grid, true); + CoreRangeSet q_plus_one_grid = tt::tt_metal::num_cores_to_corerangeset_in_subcoregrids( + start_core_coord, batch + 1, input_core_grid, true); if (!q_plus_one_grid.ranges().empty()) { start_core_coord = q_plus_one_grid.ranges().back().end_coord; } - k_shard_grid = num_cores_to_corerangeset_in_subcoregrids(start_core_coord, batch, input_core_grid, true); + k_shard_grid = + tt::tt_metal::num_cores_to_corerangeset_in_subcoregrids(start_core_coord, batch, input_core_grid, true); } v_shard_grid = q_shard_grid; } - ShardSpec q_shard_spec{q_shard_grid, {num_q_heads_padded, this->head_dim}}; + tt::tt_metal::ShardSpec q_shard_spec{q_shard_grid, {num_q_heads_padded, this->head_dim}}; q_mem_config.shard_spec = q_shard_spec; - ShardSpec k_shard_spec{k_shard_grid, {num_kv_heads_padded, this->head_dim}}; + tt::tt_metal::ShardSpec k_shard_spec{k_shard_grid, {num_kv_heads_padded, this->head_dim}}; k_mem_config.shard_spec = k_shard_spec; - ShardSpec v_shard_spec{v_shard_grid, {num_kv_heads_padded, this->head_dim}}; + tt::tt_metal::ShardSpec v_shard_spec{v_shard_grid, {num_kv_heads_padded, this->head_dim}}; v_mem_config.shard_spec = v_shard_spec; return { TensorSpec( q_output_shape, - TensorLayout(input_tensor.get_dtype(), PageConfig(input_tensor.get_layout()), q_mem_config)), + tt::tt_metal::TensorLayout( + input_tensor.get_dtype(), tt::tt_metal::PageConfig(input_tensor.get_layout()), q_mem_config)), TensorSpec( k_output_shape, - 
TensorLayout(input_tensor.get_dtype(), PageConfig(input_tensor.get_layout()), k_mem_config)), + tt::tt_metal::TensorLayout( + input_tensor.get_dtype(), tt::tt_metal::PageConfig(input_tensor.get_layout()), k_mem_config)), TensorSpec( v_output_shape, - TensorLayout(input_tensor.get_dtype(), PageConfig(input_tensor.get_layout()), v_mem_config))}; + tt::tt_metal::TensorLayout( + input_tensor.get_dtype(), tt::tt_metal::PageConfig(input_tensor.get_layout()), v_mem_config))}; } -operation::ProgramWithCallbacks NLPCreateHeadsDecodeDeviceOperation::create_program( +tt::tt_metal::operation::ProgramWithCallbacks NLPCreateHeadsDecodeDeviceOperation::create_program( const std::vector& input_tensors, const std::vector>& optional_input_tensors, std::vector& output_tensors) const { diff --git a/ttnn/cpp/ttnn/operations/experimental/transformer/nlp_create_qkv_heads_decode/device/nlp_create_qkv_heads_decode_device_operation.hpp b/ttnn/cpp/ttnn/operations/experimental/transformer/nlp_create_qkv_heads_decode/device/nlp_create_qkv_heads_decode_device_operation.hpp index e2d1740ca48..c4263a3eddb 100644 --- a/ttnn/cpp/ttnn/operations/experimental/transformer/nlp_create_qkv_heads_decode/device/nlp_create_qkv_heads_decode_device_operation.hpp +++ b/ttnn/cpp/ttnn/operations/experimental/transformer/nlp_create_qkv_heads_decode/device/nlp_create_qkv_heads_decode_device_operation.hpp @@ -11,7 +11,7 @@ namespace ttnn::operations::experimental::transformer { -operation::ProgramWithCallbacks multi_core_nlp_create_qkv_heads_decode( +tt::tt_metal::operation::ProgramWithCallbacks multi_core_nlp_create_qkv_heads_decode( const Tensor& input_tensor, const uint32_t num_q_heads, const uint32_t num_kv_heads, @@ -22,14 +22,14 @@ operation::ProgramWithCallbacks multi_core_nlp_create_qkv_heads_decode( std::optional slice_size, std::vector& output, CoreCoord compute_with_storage_grid_size); -operation::ProgramWithCallbacks multi_core_nlp_create_qkv_heads_decode_interleaved_input( 
+tt::tt_metal::operation::ProgramWithCallbacks multi_core_nlp_create_qkv_heads_decode_interleaved_input( const Tensor& input_tensor, const uint32_t num_q_heads, const uint32_t num_kv_heads, const uint32_t head_dim, std::vector& output, CoreCoord compute_with_storage_grid_size); -operation::ProgramWithCallbacks multi_core_nlp_create_qkv_heads_decode_sharded_input( +tt::tt_metal::operation::ProgramWithCallbacks multi_core_nlp_create_qkv_heads_decode_sharded_input( const Tensor& input_tensor, const uint32_t num_q_heads, const uint32_t num_kv_heads, @@ -39,7 +39,7 @@ operation::ProgramWithCallbacks multi_core_nlp_create_qkv_heads_decode_sharded_i std::optional slice_size, std::vector& output, CoreCoord compute_with_storage_grid_size); -operation::ProgramWithCallbacks multi_core_nlp_create_qkv_heads_decode_sharded_input_subcoregrid( +tt::tt_metal::operation::ProgramWithCallbacks multi_core_nlp_create_qkv_heads_decode_sharded_input_subcoregrid( const Tensor& input_tensor, const uint32_t num_q_heads, const uint32_t num_kv_heads, @@ -63,7 +63,7 @@ struct NLPCreateHeadsDecodeDeviceOperation { const std::vector& input_tensors, const std::vector>& optional_input_tensors) const; std::vector compute_output_specs(const std::vector& input_tensors) const; - operation::ProgramWithCallbacks create_program( + tt::tt_metal::operation::ProgramWithCallbacks create_program( const std::vector& input_tensors, const std::vector>& optional_input_tensors, std::vector& output_tensors) const; diff --git a/ttnn/cpp/ttnn/operations/experimental/transformer/nlp_create_qkv_heads_decode/device/nlp_create_qkv_heads_decode_program_factory.cpp b/ttnn/cpp/ttnn/operations/experimental/transformer/nlp_create_qkv_heads_decode/device/nlp_create_qkv_heads_decode_program_factory.cpp index 998c8328d53..b83274220f0 100644 --- a/ttnn/cpp/ttnn/operations/experimental/transformer/nlp_create_qkv_heads_decode/device/nlp_create_qkv_heads_decode_program_factory.cpp +++ 
b/ttnn/cpp/ttnn/operations/experimental/transformer/nlp_create_qkv_heads_decode/device/nlp_create_qkv_heads_decode_program_factory.cpp @@ -7,12 +7,13 @@ #include #include #include + using namespace tt::constants; using namespace tt; namespace ttnn::operations::experimental::transformer { -operation::ProgramWithCallbacks multi_core_nlp_create_qkv_heads_decode( +tt::tt_metal::operation::ProgramWithCallbacks multi_core_nlp_create_qkv_heads_decode( const Tensor& input_tensor, const uint32_t num_q_heads, const uint32_t num_kv_heads, @@ -54,7 +55,7 @@ operation::ProgramWithCallbacks multi_core_nlp_create_qkv_heads_decode( } } -operation::ProgramWithCallbacks multi_core_nlp_create_qkv_heads_decode_interleaved_input( +tt::tt_metal::operation::ProgramWithCallbacks multi_core_nlp_create_qkv_heads_decode_interleaved_input( const Tensor& input_tensor, const uint32_t num_q_heads, const uint32_t num_kv_heads, @@ -211,7 +212,7 @@ operation::ProgramWithCallbacks multi_core_nlp_create_qkv_heads_decode_interleav return {.program = std::move(program), .override_runtime_arguments_callback = override_runtime_arguments_callback}; } -operation::ProgramWithCallbacks multi_core_nlp_create_qkv_heads_decode_sharded_input( +tt::tt_metal::operation::ProgramWithCallbacks multi_core_nlp_create_qkv_heads_decode_sharded_input( const Tensor& input_tensor, const uint32_t num_q_heads, const uint32_t num_kv_heads, @@ -229,8 +230,8 @@ operation::ProgramWithCallbacks multi_core_nlp_create_qkv_heads_decode_sharded_i // Create CBs for reader/writer for batch_offset uint32_t batch_offset_cb_index_reader = CBIndex::c_15; uint32_t batch_offset_cb_index_writer = CBIndex::c_14; - CBHandle cb_batch_offset_reader = 0; - CBHandle cb_batch_offset_writer = 0; + tt::tt_metal::CBHandle cb_batch_offset_reader = 0; + tt::tt_metal::CBHandle cb_batch_offset_writer = 0; tt::DataFormat cb_data_format = tt_metal::datatype_to_dataformat_converter(input_tensor.get_dtype()); @@ -378,7 +379,7 @@ operation::ProgramWithCallbacks 
multi_core_nlp_create_qkv_heads_decode_sharded_i q_cores, tt_metal::WriterDataMovementConfig(q_writer_compile_time_args)); - KernelHandle k_reader_kernel_id = 0, k_writer_kernel_id = 0; + tt::tt_metal::KernelHandle k_reader_kernel_id = 0, k_writer_kernel_id = 0; if (!overlap_qk_coregrid) { // Switch process_qv and process_k for k kernels process_qv = 0; @@ -504,7 +505,7 @@ operation::ProgramWithCallbacks multi_core_nlp_create_qkv_heads_decode_sharded_i } // namespace ttnn::operations::experimental::transformer -operation::ProgramWithCallbacks multi_core_nlp_create_qkv_heads_decode_sharded_input_subcoregrid( +tt::tt_metal::operation::ProgramWithCallbacks multi_core_nlp_create_qkv_heads_decode_sharded_input_subcoregrid( const Tensor& input_tensor, const uint32_t num_q_heads, const uint32_t num_kv_heads, @@ -522,8 +523,8 @@ operation::ProgramWithCallbacks multi_core_nlp_create_qkv_heads_decode_sharded_i // Create CBs for reader/writer for batch_offset uint32_t batch_offset_cb_index_reader = CBIndex::c_15; uint32_t batch_offset_cb_index_writer = CBIndex::c_14; - CBHandle cb_batch_offset_reader = 0; - CBHandle cb_batch_offset_writer = 0; + tt::tt_metal::CBHandle cb_batch_offset_reader = 0; + tt::tt_metal::CBHandle cb_batch_offset_writer = 0; tt::DataFormat cb_data_format = tt_metal::datatype_to_dataformat_converter(input_tensor.get_dtype()); @@ -667,7 +668,7 @@ operation::ProgramWithCallbacks multi_core_nlp_create_qkv_heads_decode_sharded_i q_cores, tt_metal::WriterDataMovementConfig(q_writer_compile_time_args)); - KernelHandle k_reader_kernel_id = 0, k_writer_kernel_id = 0; + tt::tt_metal::KernelHandle k_reader_kernel_id = 0, k_writer_kernel_id = 0; if (!overlap_qk_coregrid) { // Switch process_qv and process_k for k kernels process_qv = 0; diff --git a/ttnn/cpp/ttnn/operations/experimental/transformer/nlp_create_qkv_heads_decode/nlp_create_qkv_heads_decode.cpp 
b/ttnn/cpp/ttnn/operations/experimental/transformer/nlp_create_qkv_heads_decode/nlp_create_qkv_heads_decode.cpp index 502f186e2d3..5f9c724b276 100644 --- a/ttnn/cpp/ttnn/operations/experimental/transformer/nlp_create_qkv_heads_decode/nlp_create_qkv_heads_decode.cpp +++ b/ttnn/cpp/ttnn/operations/experimental/transformer/nlp_create_qkv_heads_decode/nlp_create_qkv_heads_decode.cpp @@ -46,7 +46,7 @@ std::tuple NLPCreateHeadsDecodeOperati } else { optional_outputs = {}; } - auto out = operation::run( + auto out = tt::tt_metal::operation::run( NLPCreateHeadsDecodeDeviceOperation{ num_heads, num_kv_heads_val, diff --git a/ttnn/cpp/ttnn/operations/experimental/transformer/nlp_create_qkv_heads_falcon7b/device/nlp_create_qkv_heads_falcon7b_device_operation.cpp b/ttnn/cpp/ttnn/operations/experimental/transformer/nlp_create_qkv_heads_falcon7b/device/nlp_create_qkv_heads_falcon7b_device_operation.cpp index 23e27570ede..27ce1e7de56 100644 --- a/ttnn/cpp/ttnn/operations/experimental/transformer/nlp_create_qkv_heads_falcon7b/device/nlp_create_qkv_heads_falcon7b_device_operation.cpp +++ b/ttnn/cpp/ttnn/operations/experimental/transformer/nlp_create_qkv_heads_falcon7b/device/nlp_create_qkv_heads_falcon7b_device_operation.cpp @@ -6,8 +6,6 @@ #include -using namespace tt::tt_metal; - namespace ttnn::operations::experimental::transformer { // Hard-coded for Falcon7B @@ -38,14 +36,15 @@ std::vector NlpCreateHeadsFalcon7BDeviceOperation::compute_out const auto& input_tensor = input_tensors.at(0); const auto input_shape = input_tensor.get_padded_shape(); - TensorLayout layout(input_tensor.get_dtype(), PageConfig(Layout::TILE), output_mem_config); + tt::tt_metal::TensorLayout layout( + input_tensor.get_dtype(), tt::tt_metal::PageConfig(Layout::TILE), output_mem_config); return { TensorSpec(Shape({input_shape[0], 71, input_shape[2], 64}), layout), TensorSpec(Shape({input_shape[0], 1, input_shape[2], 64}), layout), TensorSpec(Shape({input_shape[0], 1, input_shape[2], 64}), layout)}; } 
-operation::ProgramWithCallbacks NlpCreateHeadsFalcon7BDeviceOperation::create_program( +tt::tt_metal::operation::ProgramWithCallbacks NlpCreateHeadsFalcon7BDeviceOperation::create_program( const std::vector& input_tensors, std::vector& output_tensors) const { const auto& input_tensor = input_tensors.at(0); auto& output_tensor = output_tensors.at(0); diff --git a/ttnn/cpp/ttnn/operations/experimental/transformer/nlp_create_qkv_heads_falcon7b/device/nlp_create_qkv_heads_falcon7b_device_operation.hpp b/ttnn/cpp/ttnn/operations/experimental/transformer/nlp_create_qkv_heads_falcon7b/device/nlp_create_qkv_heads_falcon7b_device_operation.hpp index 1f9e2ecfe52..a9956a66fc8 100644 --- a/ttnn/cpp/ttnn/operations/experimental/transformer/nlp_create_qkv_heads_falcon7b/device/nlp_create_qkv_heads_falcon7b_device_operation.hpp +++ b/ttnn/cpp/ttnn/operations/experimental/transformer/nlp_create_qkv_heads_falcon7b/device/nlp_create_qkv_heads_falcon7b_device_operation.hpp @@ -15,7 +15,7 @@ namespace ttnn::operations::experimental::transformer { -operation::ProgramWithCallbacks multi_core_nlp_create_qkv_heads_falcon7b( +tt::tt_metal::operation::ProgramWithCallbacks multi_core_nlp_create_qkv_heads_falcon7b( const Tensor& input_tensor_a, std::vector& output, CoreCoord compute_with_storage_grid_size); struct NlpCreateHeadsFalcon7BDeviceOperation { @@ -23,7 +23,7 @@ struct NlpCreateHeadsFalcon7BDeviceOperation { void validate(const std::vector& input_tensors) const; std::vector compute_output_specs(const std::vector& input_tensors) const; - operation::ProgramWithCallbacks create_program( + tt::tt_metal::operation::ProgramWithCallbacks create_program( const std::vector& input_tensors, std::vector& output_tensors) const; }; diff --git a/ttnn/cpp/ttnn/operations/experimental/transformer/nlp_create_qkv_heads_falcon7b/device/nlp_create_qkv_heads_falcon7b_program_factory.cpp 
b/ttnn/cpp/ttnn/operations/experimental/transformer/nlp_create_qkv_heads_falcon7b/device/nlp_create_qkv_heads_falcon7b_program_factory.cpp index 0421d7cd78e..6d541be0637 100644 --- a/ttnn/cpp/ttnn/operations/experimental/transformer/nlp_create_qkv_heads_falcon7b/device/nlp_create_qkv_heads_falcon7b_program_factory.cpp +++ b/ttnn/cpp/ttnn/operations/experimental/transformer/nlp_create_qkv_heads_falcon7b/device/nlp_create_qkv_heads_falcon7b_program_factory.cpp @@ -12,9 +12,8 @@ namespace ttnn::operations::experimental::transformer { using namespace tt::constants; using namespace tt; -using namespace tt::tt_metal; -operation::ProgramWithCallbacks multi_core_nlp_create_qkv_heads_falcon7b( +tt::tt_metal::operation::ProgramWithCallbacks multi_core_nlp_create_qkv_heads_falcon7b( const Tensor& a, std::vector& output, CoreCoord compute_with_storage_grid_size) { const auto& ashape = a.get_padded_shape(); diff --git a/ttnn/cpp/ttnn/operations/experimental/transformer/nlp_create_qkv_heads_falcon7b/nlp_create_qkv_heads_falcon7b.cpp b/ttnn/cpp/ttnn/operations/experimental/transformer/nlp_create_qkv_heads_falcon7b/nlp_create_qkv_heads_falcon7b.cpp index e899b817946..3e875c54ce1 100644 --- a/ttnn/cpp/ttnn/operations/experimental/transformer/nlp_create_qkv_heads_falcon7b/nlp_create_qkv_heads_falcon7b.cpp +++ b/ttnn/cpp/ttnn/operations/experimental/transformer/nlp_create_qkv_heads_falcon7b/nlp_create_qkv_heads_falcon7b.cpp @@ -20,7 +20,7 @@ std::tuple NLPCreateHeadsFalcon7bOpera } else { optional_outputs = {}; } - auto outputs = operation::run( + auto outputs = tt::tt_metal::operation::run( NlpCreateHeadsFalcon7BDeviceOperation{output_mem_config}, {input_tensor_q}, {}, optional_outputs); return {outputs[0], outputs[1], outputs[2]}; }; diff --git a/ttnn/cpp/ttnn/operations/experimental/transformer/nlp_create_qkv_heads_segformer/device/nlp_create_qkv_heads_segformer_device_operation.cpp 
b/ttnn/cpp/ttnn/operations/experimental/transformer/nlp_create_qkv_heads_segformer/device/nlp_create_qkv_heads_segformer_device_operation.cpp index 46f7763ae46..c298e1a0793 100644 --- a/ttnn/cpp/ttnn/operations/experimental/transformer/nlp_create_qkv_heads_segformer/device/nlp_create_qkv_heads_segformer_device_operation.cpp +++ b/ttnn/cpp/ttnn/operations/experimental/transformer/nlp_create_qkv_heads_segformer/device/nlp_create_qkv_heads_segformer_device_operation.cpp @@ -6,8 +6,6 @@ #include -using namespace tt::tt_metal; - namespace ttnn::operations::experimental::transformer { // Hard-coded for Segformer @@ -44,11 +42,12 @@ std::vector NlpCreateHeadsSegformerDeviceOperation::compute_ou auto num_heads = input_shape[3] / tt::constants::TILE_HEIGHT; // head_dim is hard-coded = 32 TensorSpec spec( Shape({input_shape[0], num_heads, input_shape[2], head_dim}), - TensorLayout(input_tensor.get_dtype(), PageConfig(Layout::TILE), output_mem_config)); + tt::tt_metal::TensorLayout( + input_tensor.get_dtype(), tt::tt_metal::PageConfig(Layout::TILE), output_mem_config)); return {spec, spec, spec}; } -operation::ProgramWithCallbacks NlpCreateHeadsSegformerDeviceOperation::create_program( +tt::tt_metal::operation::ProgramWithCallbacks NlpCreateHeadsSegformerDeviceOperation::create_program( const std::vector& input_tensors, std::vector& output_tensors) const { const auto& input_tensor = input_tensors.at(0); auto& output_tensor = output_tensors.at(0); diff --git a/ttnn/cpp/ttnn/operations/experimental/transformer/nlp_create_qkv_heads_segformer/device/nlp_create_qkv_heads_segformer_device_operation.hpp b/ttnn/cpp/ttnn/operations/experimental/transformer/nlp_create_qkv_heads_segformer/device/nlp_create_qkv_heads_segformer_device_operation.hpp index 37acf28eb27..75db81d19b5 100644 --- a/ttnn/cpp/ttnn/operations/experimental/transformer/nlp_create_qkv_heads_segformer/device/nlp_create_qkv_heads_segformer_device_operation.hpp +++ 
b/ttnn/cpp/ttnn/operations/experimental/transformer/nlp_create_qkv_heads_segformer/device/nlp_create_qkv_heads_segformer_device_operation.hpp @@ -15,7 +15,7 @@ namespace ttnn::operations::experimental::transformer { -operation::ProgramWithCallbacks multi_core_nlp_create_qkv_heads_segformer( +tt::tt_metal::operation::ProgramWithCallbacks multi_core_nlp_create_qkv_heads_segformer( const Tensor& input_tensor_a, std::vector& output, CoreCoord compute_with_storage_grid_size); struct NlpCreateHeadsSegformerDeviceOperation { @@ -23,7 +23,7 @@ struct NlpCreateHeadsSegformerDeviceOperation { void validate(const std::vector& input_tensors) const; std::vector compute_output_specs(const std::vector& input_tensors) const; - operation::ProgramWithCallbacks create_program( + tt::tt_metal::operation::ProgramWithCallbacks create_program( const std::vector& input_tensors, std::vector& output_tensors) const; }; diff --git a/ttnn/cpp/ttnn/operations/experimental/transformer/nlp_create_qkv_heads_segformer/device/nlp_create_qkv_heads_segformer_program_factory.cpp b/ttnn/cpp/ttnn/operations/experimental/transformer/nlp_create_qkv_heads_segformer/device/nlp_create_qkv_heads_segformer_program_factory.cpp index 0a2787ebfcf..fb7e40ebb68 100644 --- a/ttnn/cpp/ttnn/operations/experimental/transformer/nlp_create_qkv_heads_segformer/device/nlp_create_qkv_heads_segformer_program_factory.cpp +++ b/ttnn/cpp/ttnn/operations/experimental/transformer/nlp_create_qkv_heads_segformer/device/nlp_create_qkv_heads_segformer_program_factory.cpp @@ -12,9 +12,8 @@ namespace ttnn::operations::experimental::transformer { using namespace tt::constants; using namespace tt; -using namespace tt::tt_metal; -operation::ProgramWithCallbacks multi_core_nlp_create_qkv_heads_segformer( +tt::tt_metal::operation::ProgramWithCallbacks multi_core_nlp_create_qkv_heads_segformer( const Tensor& a, std::vector& output, CoreCoord compute_with_storage_grid_size) { const auto& ashape = a.get_padded_shape(); diff --git 
a/ttnn/cpp/ttnn/operations/experimental/transformer/nlp_create_qkv_heads_segformer/nlp_create_qkv_heads_segformer.cpp b/ttnn/cpp/ttnn/operations/experimental/transformer/nlp_create_qkv_heads_segformer/nlp_create_qkv_heads_segformer.cpp index 2bc5c409dbf..977bdbb57e4 100644 --- a/ttnn/cpp/ttnn/operations/experimental/transformer/nlp_create_qkv_heads_segformer/nlp_create_qkv_heads_segformer.cpp +++ b/ttnn/cpp/ttnn/operations/experimental/transformer/nlp_create_qkv_heads_segformer/nlp_create_qkv_heads_segformer.cpp @@ -20,7 +20,7 @@ std::tuple NLPCreateHeadsSegformerOper } else { optional_outputs = {}; } - auto outputs = operation::run( + auto outputs = tt::tt_metal::operation::run( NlpCreateHeadsSegformerDeviceOperation{output_mem_config}, {input_tensor_q}, {}, optional_outputs); return {outputs[0], outputs[1], outputs[2]}; // return {outputs[0]} diff --git a/ttnn/cpp/ttnn/operations/experimental/transformer/nlp_create_qkv_heads_vit/device/nlp_create_qkv_heads_vit_device_operation.cpp b/ttnn/cpp/ttnn/operations/experimental/transformer/nlp_create_qkv_heads_vit/device/nlp_create_qkv_heads_vit_device_operation.cpp index cf9e4bb735a..cc8baba1d95 100644 --- a/ttnn/cpp/ttnn/operations/experimental/transformer/nlp_create_qkv_heads_vit/device/nlp_create_qkv_heads_vit_device_operation.cpp +++ b/ttnn/cpp/ttnn/operations/experimental/transformer/nlp_create_qkv_heads_vit/device/nlp_create_qkv_heads_vit_device_operation.cpp @@ -6,8 +6,6 @@ #include -using namespace tt::tt_metal; - namespace ttnn::operations::experimental::transformer { // Hard-coded for Vit @@ -39,11 +37,12 @@ std::vector NlpCreateHeadsVitDeviceOperation::compute_output_s const auto input_shape = input_tensor.get_padded_shape(); TensorSpec spec( Shape({input_shape[0], 12, input_shape[2], 64}), - TensorLayout(input_tensor.get_dtype(), PageConfig(Layout::TILE), output_mem_config)); + tt::tt_metal::TensorLayout( + input_tensor.get_dtype(), tt::tt_metal::PageConfig(Layout::TILE), output_mem_config)); return {spec, 
spec, spec}; } -operation::ProgramWithCallbacks NlpCreateHeadsVitDeviceOperation::create_program( +tt::tt_metal::operation::ProgramWithCallbacks NlpCreateHeadsVitDeviceOperation::create_program( const std::vector& input_tensors, std::vector& output_tensors) const { const auto& input_tensor = input_tensors.at(0); auto& output_tensor = output_tensors.at(0); diff --git a/ttnn/cpp/ttnn/operations/experimental/transformer/nlp_create_qkv_heads_vit/device/nlp_create_qkv_heads_vit_device_operation.hpp b/ttnn/cpp/ttnn/operations/experimental/transformer/nlp_create_qkv_heads_vit/device/nlp_create_qkv_heads_vit_device_operation.hpp index fb938937037..c8567d64961 100644 --- a/ttnn/cpp/ttnn/operations/experimental/transformer/nlp_create_qkv_heads_vit/device/nlp_create_qkv_heads_vit_device_operation.hpp +++ b/ttnn/cpp/ttnn/operations/experimental/transformer/nlp_create_qkv_heads_vit/device/nlp_create_qkv_heads_vit_device_operation.hpp @@ -15,7 +15,7 @@ namespace ttnn::operations::experimental::transformer { -operation::ProgramWithCallbacks multi_core_nlp_create_qkv_heads_vit( +tt::tt_metal::operation::ProgramWithCallbacks multi_core_nlp_create_qkv_heads_vit( const Tensor& input_tensor_a, std::vector& output, CoreCoord compute_with_storage_grid_size); struct NlpCreateHeadsVitDeviceOperation { @@ -23,7 +23,7 @@ struct NlpCreateHeadsVitDeviceOperation { void validate(const std::vector& input_tensors) const; std::vector compute_output_specs(const std::vector& input_tensors) const; - operation::ProgramWithCallbacks create_program( + tt::tt_metal::operation::ProgramWithCallbacks create_program( const std::vector& input_tensors, std::vector& output_tensors) const; }; diff --git a/ttnn/cpp/ttnn/operations/experimental/transformer/nlp_create_qkv_heads_vit/device/nlp_create_qkv_heads_vit_program_factory.cpp b/ttnn/cpp/ttnn/operations/experimental/transformer/nlp_create_qkv_heads_vit/device/nlp_create_qkv_heads_vit_program_factory.cpp index 87284252881..4822b363283 100644 --- 
a/ttnn/cpp/ttnn/operations/experimental/transformer/nlp_create_qkv_heads_vit/device/nlp_create_qkv_heads_vit_program_factory.cpp +++ b/ttnn/cpp/ttnn/operations/experimental/transformer/nlp_create_qkv_heads_vit/device/nlp_create_qkv_heads_vit_program_factory.cpp @@ -12,9 +12,8 @@ namespace ttnn::operations::experimental::transformer { using namespace tt::constants; using namespace tt; -using namespace tt::tt_metal; -operation::ProgramWithCallbacks multi_core_nlp_create_qkv_heads_vit( +tt::tt_metal::operation::ProgramWithCallbacks multi_core_nlp_create_qkv_heads_vit( const Tensor& a, std::vector& output, CoreCoord compute_with_storage_grid_size) { const auto& ashape = a.get_padded_shape(); diff --git a/ttnn/cpp/ttnn/operations/experimental/transformer/nlp_create_qkv_heads_vit/nlp_create_qkv_heads_vit.cpp b/ttnn/cpp/ttnn/operations/experimental/transformer/nlp_create_qkv_heads_vit/nlp_create_qkv_heads_vit.cpp index a7577184fcd..1dff97b814f 100644 --- a/ttnn/cpp/ttnn/operations/experimental/transformer/nlp_create_qkv_heads_vit/nlp_create_qkv_heads_vit.cpp +++ b/ttnn/cpp/ttnn/operations/experimental/transformer/nlp_create_qkv_heads_vit/nlp_create_qkv_heads_vit.cpp @@ -20,8 +20,8 @@ std::tuple NLPCreateHeadsVitOperation: } else { optional_outputs = {}; } - auto outputs = - operation::run(NlpCreateHeadsVitDeviceOperation{output_mem_config}, {input_tensor_q}, {}, optional_outputs); + auto outputs = tt::tt_metal::operation::run( + NlpCreateHeadsVitDeviceOperation{output_mem_config}, {input_tensor_q}, {}, optional_outputs); return {outputs[0], outputs[1], outputs[2]}; }; diff --git a/ttnn/cpp/ttnn/operations/experimental/transformer/nlp_kv_cache_load_slice/device/nlp_kv_cache_load_slice_device_operation.cpp b/ttnn/cpp/ttnn/operations/experimental/transformer/nlp_kv_cache_load_slice/device/nlp_kv_cache_load_slice_device_operation.cpp index 67ce7d857f3..46153baac77 100644 --- 
a/ttnn/cpp/ttnn/operations/experimental/transformer/nlp_kv_cache_load_slice/device/nlp_kv_cache_load_slice_device_operation.cpp +++ b/ttnn/cpp/ttnn/operations/experimental/transformer/nlp_kv_cache_load_slice/device/nlp_kv_cache_load_slice_device_operation.cpp @@ -5,17 +5,16 @@ #include "nlp_kv_cache_load_slice_device_operation.hpp" #include -using namespace tt::tt_metal; - namespace ttnn::operations::experimental::transformer { // NLP KV Cache Unpad To Sharded op void NlpKVCacheLoadSliceDeviceOperation::validate(const std::vector& input_tensors) const { using namespace tt::constants; const auto& input_tensor_a = input_tensors.at(0); - TT_FATAL(input_tensor_a.storage_type() == StorageType::DEVICE, "Operands to unpad need to be on device!"); + TT_FATAL( + input_tensor_a.storage_type() == tt::tt_metal::StorageType::DEVICE, "Operands to unpad need to be on device!"); TT_FATAL(input_tensor_a.buffer() != nullptr, "Operands to unpad need to be allocated in buffers on device!"); - TT_FATAL(input_tensor_a.get_layout() == Layout::TILE, "Error"); + TT_FATAL(input_tensor_a.get_layout() == tt::tt_metal::Layout::TILE, "Error"); for (uint32_t i = 0; i < input_tensor_a.get_padded_shape().rank(); i++) { TT_FATAL(this->output_tensor_start[i] < input_tensor_a.get_padded_shape()[i], "Error"); @@ -62,16 +61,18 @@ std::vector NlpKVCacheLoadSliceDeviceOperation::compute_output auto fused_batch_heads = dim0 * dim1; auto core_grid = input_tensor_a.device()->compute_with_storage_grid_size(); - auto shard_grid = num_cores_to_corerangeset(fused_batch_heads, core_grid, true); - ShardSpec shard_spec{shard_grid, {unpad_length, head_dim}}; - auto mem_config = tt::tt_metal::MemoryConfig{TensorMemoryLayout::HEIGHT_SHARDED, BufferType::L1}; + auto shard_grid = tt::tt_metal::num_cores_to_corerangeset(fused_batch_heads, core_grid, true); + tt::tt_metal::ShardSpec shard_spec{shard_grid, {unpad_length, head_dim}}; + auto mem_config = + 
tt::tt_metal::MemoryConfig{tt::tt_metal::TensorMemoryLayout::HEIGHT_SHARDED, tt::tt_metal::BufferType::L1}; mem_config.shard_spec = shard_spec; return {TensorSpec( Shape(out_shape), - TensorLayout(input_tensor_a.get_dtype(), PageConfig(input_tensor_a.get_layout()), mem_config))}; + tt::tt_metal::TensorLayout( + input_tensor_a.get_dtype(), tt::tt_metal::PageConfig(input_tensor_a.get_layout()), mem_config))}; } -operation::ProgramWithCallbacks NlpKVCacheLoadSliceDeviceOperation::create_program( +tt::tt_metal::operation::ProgramWithCallbacks NlpKVCacheLoadSliceDeviceOperation::create_program( const std::vector& input_tensors, std::vector& output_tensors) const { const auto& input_tensor_a = input_tensors.at(0); auto& output_tensor = output_tensors.at(0); diff --git a/ttnn/cpp/ttnn/operations/experimental/transformer/nlp_kv_cache_load_slice/device/nlp_kv_cache_load_slice_device_operation.hpp b/ttnn/cpp/ttnn/operations/experimental/transformer/nlp_kv_cache_load_slice/device/nlp_kv_cache_load_slice_device_operation.hpp index 9788f53f272..98a4ba192a9 100644 --- a/ttnn/cpp/ttnn/operations/experimental/transformer/nlp_kv_cache_load_slice/device/nlp_kv_cache_load_slice_device_operation.hpp +++ b/ttnn/cpp/ttnn/operations/experimental/transformer/nlp_kv_cache_load_slice/device/nlp_kv_cache_load_slice_device_operation.hpp @@ -12,7 +12,7 @@ namespace ttnn::operations::experimental::transformer { -operation::ProgramWithCallbacks multi_core_nlp_kv_cache_load_slice( +tt::tt_metal::operation::ProgramWithCallbacks multi_core_nlp_kv_cache_load_slice( const Tensor& a, Tensor& output, const ttnn::Shape& output_tensor_start, const ttnn::Shape& output_tensor_end); struct NlpKVCacheLoadSliceDeviceOperation { @@ -23,7 +23,7 @@ struct NlpKVCacheLoadSliceDeviceOperation { void validate(const std::vector& input_tensors) const; std::vector compute_output_specs(const std::vector& input_tensors) const; - operation::ProgramWithCallbacks create_program( + 
tt::tt_metal::operation::ProgramWithCallbacks create_program( const std::vector& input_tensors, std::vector& output_tensors) const; }; diff --git a/ttnn/cpp/ttnn/operations/experimental/transformer/nlp_kv_cache_load_slice/device/nlp_kv_cache_load_slice_program_factory.cpp b/ttnn/cpp/ttnn/operations/experimental/transformer/nlp_kv_cache_load_slice/device/nlp_kv_cache_load_slice_program_factory.cpp index 19fa636fff1..b132226e4dd 100644 --- a/ttnn/cpp/ttnn/operations/experimental/transformer/nlp_kv_cache_load_slice/device/nlp_kv_cache_load_slice_program_factory.cpp +++ b/ttnn/cpp/ttnn/operations/experimental/transformer/nlp_kv_cache_load_slice/device/nlp_kv_cache_load_slice_program_factory.cpp @@ -9,8 +9,6 @@ #include #include "ttnn/operations/data_movement/slice/device/slice_op.hpp" -using namespace tt::tt_metal; - namespace ttnn::operations::experimental::transformer { using namespace tt::constants; @@ -50,7 +48,7 @@ std::vector, std::vector>> get_unpad_r return ret_val; } -operation::ProgramWithCallbacks multi_core_nlp_kv_cache_load_slice( +tt::tt_metal::operation::ProgramWithCallbacks multi_core_nlp_kv_cache_load_slice( const Tensor& a, Tensor& output, const ttnn::Shape& output_tensor_start, const ttnn::Shape& output_tensor_end) { const auto output_shape = output.get_padded_shape(); const auto input_shape = a.get_padded_shape(); @@ -135,7 +133,7 @@ operation::ProgramWithCallbacks multi_core_nlp_kv_cache_load_slice( auto override_runtime_args_callback = [unary_reader_kernel_id, unary_writer_kernel_id, cb_src0]( const void* operation, - Program& program, + tt::tt_metal::Program& program, const std::vector& input_tensors, const std::vector>&, const std::vector& output_tensors) { @@ -161,8 +159,12 @@ operation::ProgramWithCallbacks multi_core_nlp_kv_cache_load_slice( for (uint32_t i = 0; i < num_cores_total; i++) { CoreCoord core = {i % num_cores_x, i / num_cores_x}; - { SetRuntimeArgs(program, unary_reader_kernel_id, core, all_runtime_args[i].first); } - { 
SetRuntimeArgs(program, unary_writer_kernel_id, core, all_runtime_args[i].second); } + { + SetRuntimeArgs(program, unary_reader_kernel_id, core, all_runtime_args[i].first); + } + { + SetRuntimeArgs(program, unary_writer_kernel_id, core, all_runtime_args[i].second); + } } }; diff --git a/ttnn/cpp/ttnn/operations/experimental/transformer/nlp_kv_cache_load_slice/nlp_kv_cache_load_slice.cpp b/ttnn/cpp/ttnn/operations/experimental/transformer/nlp_kv_cache_load_slice/nlp_kv_cache_load_slice.cpp index 6df2cde1478..47c344871b0 100644 --- a/ttnn/cpp/ttnn/operations/experimental/transformer/nlp_kv_cache_load_slice/nlp_kv_cache_load_slice.cpp +++ b/ttnn/cpp/ttnn/operations/experimental/transformer/nlp_kv_cache_load_slice/nlp_kv_cache_load_slice.cpp @@ -43,7 +43,7 @@ ttnn::Tensor NLPKVCacheLoadSliceOperation::invoke( output_tensor_end[2] - output_tensor_start[2] + 1, output_tensor_end[3] - output_tensor_start[3] + 1, }); - return operation::run( + return tt::tt_metal::operation::run( NlpKVCacheLoadSliceDeviceOperation{ output_tensor_start, output_tensor_end, output_tensor_shape, input_tensor_shape}, {input_tensor}, diff --git a/ttnn/cpp/ttnn/operations/experimental/transformer/rotary_embedding/device/rotary_embedding_device_operation.hpp b/ttnn/cpp/ttnn/operations/experimental/transformer/rotary_embedding/device/rotary_embedding_device_operation.hpp index 6da81c8ed91..a151614e510 100644 --- a/ttnn/cpp/ttnn/operations/experimental/transformer/rotary_embedding/device/rotary_embedding_device_operation.hpp +++ b/ttnn/cpp/ttnn/operations/experimental/transformer/rotary_embedding/device/rotary_embedding_device_operation.hpp @@ -29,7 +29,7 @@ struct RotaryEmbedding { void validate(const std::vector& input_tensors) const; std::vector compute_output_specs(const std::vector& input_tensors) const; - operation::ProgramWithCallbacks create_program( + tt::tt_metal::operation::ProgramWithCallbacks create_program( const std::vector& input_tensors, std::vector& output_tensors) const; const 
operation::Hash compute_program_hash(const std::vector& input_tensors) const; diff --git a/ttnn/cpp/ttnn/operations/experimental/transformer/rotary_embedding/device/rotary_embedding_program_factory.hpp b/ttnn/cpp/ttnn/operations/experimental/transformer/rotary_embedding/device/rotary_embedding_program_factory.hpp index 96ce7d93f74..10739105589 100644 --- a/ttnn/cpp/ttnn/operations/experimental/transformer/rotary_embedding/device/rotary_embedding_program_factory.hpp +++ b/ttnn/cpp/ttnn/operations/experimental/transformer/rotary_embedding/device/rotary_embedding_program_factory.hpp @@ -15,7 +15,7 @@ namespace tt { namespace tt_metal { -operation::ProgramWithCallbacks rotary_embedding_multi_core( +tt::tt_metal::operation::ProgramWithCallbacks rotary_embedding_multi_core( const Tensor& input, const Tensor& cos, const Tensor& sin, diff --git a/ttnn/cpp/ttnn/operations/experimental/transformer/rotary_embedding/rotary_embedding.cpp b/ttnn/cpp/ttnn/operations/experimental/transformer/rotary_embedding/rotary_embedding.cpp index a747b637259..88f84df318e 100644 --- a/ttnn/cpp/ttnn/operations/experimental/transformer/rotary_embedding/rotary_embedding.cpp +++ b/ttnn/cpp/ttnn/operations/experimental/transformer/rotary_embedding/rotary_embedding.cpp @@ -8,8 +8,6 @@ #include "ttnn/operation.hpp" #include "ttnn/operations/experimental/auto_format/auto_format.hpp" -using namespace tt::tt_metal; - namespace ttnn::operations::experimental::transformer { ttnn::Tensor RotaryEmbeddingOperation::invoke( @@ -68,7 +66,7 @@ ttnn::Tensor RotaryEmbeddingOperation::invoke( auto kernel_config_val = init_device_compute_kernel_config(arch, compute_kernel_config, MathFidelity::HiFi4, true, false, false); - tt::tt_metal::MemoryConfig default_memory_config = operation::DEFAULT_OUTPUT_MEMORY_CONFIG; + tt::tt_metal::MemoryConfig default_memory_config = tt::tt_metal::operation::DEFAULT_OUTPUT_MEMORY_CONFIG; if (input_tensor.storage_type() == StorageType::DEVICE) { default_memory_config = 
input_tensor.memory_config(); } @@ -88,8 +86,9 @@ ttnn::Tensor RotaryEmbeddingOperation::invoke( ttnn::operations::experimental::auto_format::FormatParams sin_format_params = { .pad_shape = sin_pad_shape, .pad_value = 0.0, .target_layout = Layout::TILE}; - return operation::run_with_autoformat( - RotaryEmbedding{seq_len, token_index, memory_config.value_or(default_memory_config), kernel_config_val}, + return tt::tt_metal::operation::run_with_autoformat( + tt::tt_metal::RotaryEmbedding{ + seq_len, token_index, memory_config.value_or(default_memory_config), kernel_config_val}, {input_tensor, cos_cache, sin_cache}, {input_format_params, cos_format_params, sin_format_params}, {Layout::TILE}) diff --git a/ttnn/cpp/ttnn/operations/experimental/transformer/rotary_embedding_llama/device/rotary_embedding_llama_device_operation.hpp b/ttnn/cpp/ttnn/operations/experimental/transformer/rotary_embedding_llama/device/rotary_embedding_llama_device_operation.hpp index fbd008c20fa..bf620ed1578 100644 --- a/ttnn/cpp/ttnn/operations/experimental/transformer/rotary_embedding_llama/device/rotary_embedding_llama_device_operation.hpp +++ b/ttnn/cpp/ttnn/operations/experimental/transformer/rotary_embedding_llama/device/rotary_embedding_llama_device_operation.hpp @@ -22,7 +22,7 @@ struct RotaryEmbeddingLlama { void validate(const std::vector& input_tensors) const; std::vector compute_output_specs(const std::vector& input_tensors) const; - operation::ProgramWithCallbacks create_program( + tt::tt_metal::operation::ProgramWithCallbacks create_program( const std::vector& input_tensors, std::vector& output_tensors) const; }; diff --git a/ttnn/cpp/ttnn/operations/experimental/transformer/rotary_embedding_llama/device/rotary_embedding_llama_program_factory.hpp b/ttnn/cpp/ttnn/operations/experimental/transformer/rotary_embedding_llama/device/rotary_embedding_llama_program_factory.hpp index 43ef87cc173..af822e1e76e 100644 --- 
a/ttnn/cpp/ttnn/operations/experimental/transformer/rotary_embedding_llama/device/rotary_embedding_llama_program_factory.hpp +++ b/ttnn/cpp/ttnn/operations/experimental/transformer/rotary_embedding_llama/device/rotary_embedding_llama_program_factory.hpp @@ -13,7 +13,7 @@ namespace tt { namespace tt_metal { -operation::ProgramWithCallbacks rotary_embedding_llama_multi_core( +tt::tt_metal::operation::ProgramWithCallbacks rotary_embedding_llama_multi_core( const Tensor& input, const Tensor& cos, const Tensor& sin, @@ -21,7 +21,7 @@ operation::ProgramWithCallbacks rotary_embedding_llama_multi_core( Tensor& output, ttnn::DeviceComputeKernelConfig compute_kernel_config); -operation::ProgramWithCallbacks rotary_embedding_llama_multi_core_sharded( +tt::tt_metal::operation::ProgramWithCallbacks rotary_embedding_llama_multi_core_sharded( const Tensor& input, const Tensor& cos, const Tensor& sin, diff --git a/ttnn/cpp/ttnn/operations/experimental/transformer/rotary_embedding_llama/rotary_embedding_llama.cpp b/ttnn/cpp/ttnn/operations/experimental/transformer/rotary_embedding_llama/rotary_embedding_llama.cpp index 20abef8b6e1..2c50471ac02 100644 --- a/ttnn/cpp/ttnn/operations/experimental/transformer/rotary_embedding_llama/rotary_embedding_llama.cpp +++ b/ttnn/cpp/ttnn/operations/experimental/transformer/rotary_embedding_llama/rotary_embedding_llama.cpp @@ -6,8 +6,6 @@ #include "device/rotary_embedding_llama_device_operation.hpp" -using namespace tt::tt_metal; - namespace ttnn::operations::experimental::transformer { Tensor RotaryEmbeddingLlamaOperation::invoke( @@ -19,8 +17,8 @@ Tensor RotaryEmbeddingLlamaOperation::invoke( const std::optional& memory_config, std::optional compute_kernel_config) { std::vector output_tensors = { - Tensor(operation::get_workers_for_op_output({input_tensor, cos_cache, sin_cache, trans_mat}))}; - operation::launch_op( + Tensor(tt::tt_metal::operation::get_workers_for_op_output({input_tensor, cos_cache, sin_cache, trans_mat}))}; + 
tt::tt_metal::operation::launch_op( [is_decode_mode, memory_config, compute_kernel_config]( const std::vector& input_tensors, const std::vector>& optional_input_tensors, @@ -33,13 +31,14 @@ Tensor RotaryEmbeddingLlamaOperation::invoke( auto kernel_config_val = init_device_compute_kernel_config(arch, compute_kernel_config, MathFidelity::HiFi4, true, false, false); - tt::tt_metal::MemoryConfig default_memory_config = operation::DEFAULT_OUTPUT_MEMORY_CONFIG; + tt::tt_metal::MemoryConfig default_memory_config = tt::tt_metal::operation::DEFAULT_OUTPUT_MEMORY_CONFIG; if (input_tensor.storage_type() == StorageType::DEVICE) { default_memory_config = input_tensor.memory_config(); } - return operation::run( - RotaryEmbeddingLlama{is_decode_mode, memory_config.value_or(default_memory_config), kernel_config_val}, + return tt::tt_metal::operation::run( + tt::tt_metal::RotaryEmbeddingLlama{ + is_decode_mode, memory_config.value_or(default_memory_config), kernel_config_val}, input_tensors); }, {input_tensor, cos_cache, sin_cache, trans_mat}, diff --git a/ttnn/cpp/ttnn/operations/experimental/transformer/rotary_embedding_llama_fused_qk/device/rotary_embedding_llama_fused_qk_device_operation.hpp b/ttnn/cpp/ttnn/operations/experimental/transformer/rotary_embedding_llama_fused_qk/device/rotary_embedding_llama_fused_qk_device_operation.hpp index 49ef7ec1b9f..216da6f019c 100644 --- a/ttnn/cpp/ttnn/operations/experimental/transformer/rotary_embedding_llama_fused_qk/device/rotary_embedding_llama_fused_qk_device_operation.hpp +++ b/ttnn/cpp/ttnn/operations/experimental/transformer/rotary_embedding_llama_fused_qk/device/rotary_embedding_llama_fused_qk_device_operation.hpp @@ -22,7 +22,7 @@ struct RotaryEmbeddingLlamaFusedQK { void validate(const std::vector& input_tensors) const; std::vector compute_output_specs(const std::vector& input_tensors) const; - operation::ProgramWithCallbacks create_program( + tt::tt_metal::operation::ProgramWithCallbacks create_program( const std::vector& 
input_tensors, std::vector& output_tensors) const; }; diff --git a/ttnn/cpp/ttnn/operations/experimental/transformer/rotary_embedding_llama_fused_qk/device/rotary_embedding_llama_fused_qk_program_factory.hpp b/ttnn/cpp/ttnn/operations/experimental/transformer/rotary_embedding_llama_fused_qk/device/rotary_embedding_llama_fused_qk_program_factory.hpp index 8da42cd9cd8..356f249eaca 100644 --- a/ttnn/cpp/ttnn/operations/experimental/transformer/rotary_embedding_llama_fused_qk/device/rotary_embedding_llama_fused_qk_program_factory.hpp +++ b/ttnn/cpp/ttnn/operations/experimental/transformer/rotary_embedding_llama_fused_qk/device/rotary_embedding_llama_fused_qk_program_factory.hpp @@ -13,7 +13,7 @@ namespace tt { namespace tt_metal { -operation::ProgramWithCallbacks rotary_embedding_llama_fused_qk_multi_core_sharded( +tt::tt_metal::operation::ProgramWithCallbacks rotary_embedding_llama_fused_qk_multi_core_sharded( const Tensor& q_input, const Tensor& k_input, const Tensor& cos, diff --git a/ttnn/cpp/ttnn/operations/experimental/transformer/rotary_embedding_llama_fused_qk/rotary_embedding_llama_fused_qk.cpp b/ttnn/cpp/ttnn/operations/experimental/transformer/rotary_embedding_llama_fused_qk/rotary_embedding_llama_fused_qk.cpp index 60368f77fb6..8394ca12683 100644 --- a/ttnn/cpp/ttnn/operations/experimental/transformer/rotary_embedding_llama_fused_qk/rotary_embedding_llama_fused_qk.cpp +++ b/ttnn/cpp/ttnn/operations/experimental/transformer/rotary_embedding_llama_fused_qk/rotary_embedding_llama_fused_qk.cpp @@ -6,8 +6,6 @@ #include "device/rotary_embedding_llama_fused_qk_device_operation.hpp" -using namespace tt::tt_metal; - namespace ttnn::operations::experimental::transformer { std::tuple RotaryEmbeddingLlamaFusedQKOperation::invoke( @@ -18,10 +16,11 @@ std::tuple RotaryEmbeddingLlamaFusedQKOperation::inv const Tensor& trans_mat, std::optional compute_kernel_config) { std::vector output_tensors = { - Tensor(operation::get_workers_for_op_output({q_input_tensor, 
k_input_tensor, cos_cache, sin_cache, trans_mat})), - Tensor( - operation::get_workers_for_op_output({q_input_tensor, k_input_tensor, cos_cache, sin_cache, trans_mat}))}; - operation::launch_op( + Tensor(tt::tt_metal::operation::get_workers_for_op_output( + {q_input_tensor, k_input_tensor, cos_cache, sin_cache, trans_mat})), + Tensor(tt::tt_metal::operation::get_workers_for_op_output( + {q_input_tensor, k_input_tensor, cos_cache, sin_cache, trans_mat}))}; + tt::tt_metal::operation::launch_op( [compute_kernel_config]( const std::vector& input_tensors, const std::vector>& optional_input_tensors, @@ -35,8 +34,8 @@ std::tuple RotaryEmbeddingLlamaFusedQKOperation::inv auto kernel_config_val = init_device_compute_kernel_config(arch, compute_kernel_config, MathFidelity::HiFi4, true, false, false); - tt::tt_metal::MemoryConfig q_output_memory_config = operation::DEFAULT_OUTPUT_MEMORY_CONFIG; - tt::tt_metal::MemoryConfig k_output_memory_config = operation::DEFAULT_OUTPUT_MEMORY_CONFIG; + tt::tt_metal::MemoryConfig q_output_memory_config = tt::tt_metal::operation::DEFAULT_OUTPUT_MEMORY_CONFIG; + tt::tt_metal::MemoryConfig k_output_memory_config = tt::tt_metal::operation::DEFAULT_OUTPUT_MEMORY_CONFIG; if (q_input_tensor.storage_type() == StorageType::DEVICE) { q_output_memory_config = q_input_tensor.memory_config(); } @@ -44,8 +43,9 @@ std::tuple RotaryEmbeddingLlamaFusedQKOperation::inv k_output_memory_config = k_input_tensor.memory_config(); } - return operation::run( - RotaryEmbeddingLlamaFusedQK{q_output_memory_config, k_output_memory_config, kernel_config_val}, + return tt::tt_metal::operation::run( + tt::tt_metal::RotaryEmbeddingLlamaFusedQK{ + q_output_memory_config, k_output_memory_config, kernel_config_val}, input_tensors); }, {q_input_tensor, k_input_tensor, cos_cache, sin_cache, trans_mat}, diff --git a/ttnn/cpp/ttnn/operations/experimental/transformer/rotate_half/device/rotate_half_device_operation.hpp 
b/ttnn/cpp/ttnn/operations/experimental/transformer/rotate_half/device/rotate_half_device_operation.hpp index db143109df8..ce0ff69b75d 100644 --- a/ttnn/cpp/ttnn/operations/experimental/transformer/rotate_half/device/rotate_half_device_operation.hpp +++ b/ttnn/cpp/ttnn/operations/experimental/transformer/rotate_half/device/rotate_half_device_operation.hpp @@ -11,14 +11,14 @@ namespace ttnn::operations::experimental::transformer { enum class RotateHalfOpParallelizationStrategy { SINGLE_CORE }; struct RotateHalf { - const MemoryConfig output_mem_config; + const tt::tt_metal::MemoryConfig output_mem_config; RotateHalfOpParallelizationStrategy get_parallelization_strategy(const std::vector& input_tensors) const; void validate(const std::vector& input_tensors) const; std::vector compute_output_specs(const std::vector& input_tensors) const; - operation::ProgramWithCallbacks create_program( + tt::tt_metal::operation::ProgramWithCallbacks create_program( const std::vector& input_tensors, std::vector& output_tensors) const; }; diff --git a/ttnn/cpp/ttnn/operations/experimental/transformer/rotate_half/device/single_core/rotate_half_program_factory.hpp b/ttnn/cpp/ttnn/operations/experimental/transformer/rotate_half/device/single_core/rotate_half_program_factory.hpp index 5a40a034d9a..23ba877a1f1 100644 --- a/ttnn/cpp/ttnn/operations/experimental/transformer/rotate_half/device/single_core/rotate_half_program_factory.hpp +++ b/ttnn/cpp/ttnn/operations/experimental/transformer/rotate_half/device/single_core/rotate_half_program_factory.hpp @@ -10,6 +10,7 @@ namespace ttnn::operations::experimental::transformer::detail { -operation::ProgramWithCallbacks rotate_half_single_core(const Tensor& input_tensor, Tensor& output_tensor); +tt::tt_metal::operation::ProgramWithCallbacks rotate_half_single_core( + const Tensor& input_tensor, Tensor& output_tensor); } // namespace ttnn::operations::experimental::transformer::detail diff --git 
a/ttnn/cpp/ttnn/operations/experimental/transformer/rotate_half/rotate_half.cpp b/ttnn/cpp/ttnn/operations/experimental/transformer/rotate_half/rotate_half.cpp index f60559f9a9e..114192b934b 100644 --- a/ttnn/cpp/ttnn/operations/experimental/transformer/rotate_half/rotate_half.cpp +++ b/ttnn/cpp/ttnn/operations/experimental/transformer/rotate_half/rotate_half.cpp @@ -6,8 +6,6 @@ #include "device/rotate_half_device_operation.hpp" -using namespace tt::tt_metal; - namespace ttnn::operations::experimental::transformer { Tensor RotateHalfOperation::invoke(const Tensor& input_tensor, const std::optional& memory_config) { @@ -26,7 +24,7 @@ Tensor RotateHalfOperation::invoke(const Tensor& input_tensor, const std::option ttnn::operations::experimental::auto_format::AutoFormat::pad_to_tile_shape(input_tensor.get_padded_shape()); ttnn::operations::experimental::auto_format::FormatParams input_format_params = { .pad_shape = pad_shape, .pad_value = 0.0, .target_layout = Layout::TILE}; - return operation::run_with_autoformat( + return tt::tt_metal::operation::run_with_autoformat( RotateHalf{memory_config.value_or(input_tensor.memory_config())}, {input_tensor}, {input_format_params}, diff --git a/ttnn/cpp/ttnn/operations/experimental/transformer/split_query_key_value_and_split_heads/device/split_query_key_value_and_split_heads_device_operation.cpp b/ttnn/cpp/ttnn/operations/experimental/transformer/split_query_key_value_and_split_heads/device/split_query_key_value_and_split_heads_device_operation.cpp index 90eb579abd3..f8a2cb8cd63 100644 --- a/ttnn/cpp/ttnn/operations/experimental/transformer/split_query_key_value_and_split_heads/device/split_query_key_value_and_split_heads_device_operation.cpp +++ b/ttnn/cpp/ttnn/operations/experimental/transformer/split_query_key_value_and_split_heads/device/split_query_key_value_and_split_heads_device_operation.cpp @@ -6,8 +6,6 @@ #include "split_query_key_value_and_split_heads_program_factory.hpp" -using namespace tt::tt_metal; - namespace 
ttnn::operations::experimental::transformer { void SplitFusedQKVAndSplitHeadsDeviceOperation::validate_with_output_tensors( @@ -16,7 +14,7 @@ void SplitFusedQKVAndSplitHeadsDeviceOperation::validate_with_output_tensors( const auto batch_size = input_tensor.get_padded_shape()[0]; // TODO: See issue #1744 TT_FATAL((input_tensor.get_padded_shape() == ttnn::Shape({batch_size, 1, 384, 3072})), "Unsupported input shape"); - TT_FATAL(input_tensor.storage_type() == StorageType::DEVICE, "Operands to TM need to be on device!"); + TT_FATAL(input_tensor.storage_type() == tt::tt_metal::StorageType::DEVICE, "Operands to TM need to be on device!"); TT_FATAL(input_tensor.buffer() != nullptr, "Operands to TM need to be allocated in buffers on device!"); TT_FATAL( input_tensor.get_dtype() == tt::tt_metal::DataType::BFLOAT16 || @@ -32,7 +30,8 @@ void SplitFusedQKVAndSplitHeadsDeviceOperation::validate_with_output_tensors( bbox.end_coord.y < this->compute_with_storage_grid_size.y), "Error"); TT_FATAL(input_tensor.shard_spec().value().grid.ranges().size() == 1, "Error"); - TT_FATAL(input_tensor.memory_config().memory_layout == TensorMemoryLayout::BLOCK_SHARDED, "Error"); + TT_FATAL( + input_tensor.memory_config().memory_layout == tt::tt_metal::TensorMemoryLayout::BLOCK_SHARDED, "Error"); } if (!output_tensors.empty()) { @@ -42,6 +41,10 @@ void SplitFusedQKVAndSplitHeadsDeviceOperation::validate_with_output_tensors( std::vector SplitFusedQKVAndSplitHeadsDeviceOperation::compute_output_specs( const std::vector& input_tensors, const std::vector>& output_tensors) const { + using tt::tt_metal::Layout; + using tt::tt_metal::PageConfig; + using tt::tt_metal::TensorLayout; + if (output_tensors.size() == 3 && output_tensors[0].has_value() && output_tensors[1].has_value() && output_tensors[2].has_value()) { return { @@ -60,17 +63,17 @@ std::vector SplitFusedQKVAndSplitHeadsDeviceOperation::compute if (input_tensor.is_sharded()) { // core range CoreRangeSet all_cores = 
input_tensor.shard_spec().value().grid; - ShardOrientation shard_orientation = input_tensor.shard_spec().value().orientation; + tt::tt_metal::ShardOrientation shard_orientation = input_tensor.shard_spec().value().orientation; auto bbox = all_cores.bounding_box(); - uint32_t num_M_cores = - shard_orientation == ShardOrientation::ROW_MAJOR ? bbox.end_coord.x + 1 : bbox.end_coord.y + 1; + uint32_t num_M_cores = shard_orientation == tt::tt_metal::ShardOrientation::ROW_MAJOR ? bbox.end_coord.x + 1 + : bbox.end_coord.y + 1; // shard spec uint32_t per_core_M_qv = (num_heads / num_M_cores) * M; // 768 uint32_t per_core_N_qv = K; // 64 - ShardSpec shard_spec_qv = ShardSpec{all_cores, {per_core_M_qv, per_core_N_qv}, shard_orientation}; + auto shard_spec_qv = tt::tt_metal::ShardSpec{all_cores, {per_core_M_qv, per_core_N_qv}, shard_orientation}; uint32_t per_core_M_k = (num_heads / num_M_cores) * K; // 128 uint32_t per_core_N_k = M; // 384 - ShardSpec shard_spec_k = ShardSpec{all_cores, {per_core_M_k, per_core_N_k}, shard_orientation}; + auto shard_spec_k = tt::tt_metal::ShardSpec{all_cores, {per_core_M_k, per_core_N_k}, shard_orientation}; // create sharded tensors auto mem_config_qv = this->output_mem_config; mem_config_qv.shard_spec = shard_spec_qv; @@ -106,7 +109,7 @@ std::vector SplitFusedQKVAndSplitHeadsDeviceOperation::create_output_ten }; } -operation::ProgramWithCallbacks SplitFusedQKVAndSplitHeadsDeviceOperation::create_program( +tt::tt_metal::operation::ProgramWithCallbacks SplitFusedQKVAndSplitHeadsDeviceOperation::create_program( const std::vector& input_tensors, std::vector& output_tensors) const { const auto& input_tensor = input_tensors.at(0); auto& output_tensor = output_tensors.at(0); diff --git a/ttnn/cpp/ttnn/operations/experimental/transformer/split_query_key_value_and_split_heads/device/split_query_key_value_and_split_heads_device_operation.hpp 
b/ttnn/cpp/ttnn/operations/experimental/transformer/split_query_key_value_and_split_heads/device/split_query_key_value_and_split_heads_device_operation.hpp index 6ed228d4bcb..0ce07cf4916 100644 --- a/ttnn/cpp/ttnn/operations/experimental/transformer/split_query_key_value_and_split_heads/device/split_query_key_value_and_split_heads_device_operation.hpp +++ b/ttnn/cpp/ttnn/operations/experimental/transformer/split_query_key_value_and_split_heads/device/split_query_key_value_and_split_heads_device_operation.hpp @@ -14,7 +14,7 @@ namespace ttnn::operations::experimental::transformer { struct SplitFusedQKVAndSplitHeadsDeviceOperation { CoreCoord compute_with_storage_grid_size; - MemoryConfig output_mem_config; + tt::tt_metal::MemoryConfig output_mem_config; uint32_t num_heads; void validate_with_output_tensors( @@ -23,7 +23,7 @@ struct SplitFusedQKVAndSplitHeadsDeviceOperation { const std::vector& input_tensors, const std::vector>& output_tensors) const; std::vector create_output_tensors( const std::vector& input_tensors, const std::vector>& output_tensors) const; - operation::ProgramWithCallbacks create_program( + tt::tt_metal::operation::ProgramWithCallbacks create_program( const std::vector& input_tensors, std::vector& output_tensors) const; }; diff --git a/ttnn/cpp/ttnn/operations/experimental/transformer/split_query_key_value_and_split_heads/device/split_query_key_value_and_split_heads_program_factory.hpp b/ttnn/cpp/ttnn/operations/experimental/transformer/split_query_key_value_and_split_heads/device/split_query_key_value_and_split_heads_program_factory.hpp index 9f2c6682a33..1321853bca2 100644 --- a/ttnn/cpp/ttnn/operations/experimental/transformer/split_query_key_value_and_split_heads/device/split_query_key_value_and_split_heads_program_factory.hpp +++ b/ttnn/cpp/ttnn/operations/experimental/transformer/split_query_key_value_and_split_heads/device/split_query_key_value_and_split_heads_program_factory.hpp @@ -12,7 +12,7 @@ using namespace tt::constants; using 
namespace tt; using namespace tt_metal; -operation::ProgramWithCallbacks multi_core_split_query_key_value_and_split_heads( +tt::tt_metal::operation::ProgramWithCallbacks multi_core_split_query_key_value_and_split_heads( const Tensor& a, std::vector& output, CoreCoord compute_with_storage_grid_size) { const auto& ashape = a.get_padded_shape(); @@ -207,7 +207,7 @@ operation::ProgramWithCallbacks multi_core_split_query_key_value_and_split_heads return {std::move(program), override_runtime_args_callback}; } -operation::ProgramWithCallbacks multi_core_split_query_key_value_and_split_heads_sharded( +tt::tt_metal::operation::ProgramWithCallbacks multi_core_split_query_key_value_and_split_heads_sharded( const Tensor& a, std::vector& output, CoreCoord compute_with_storage_grid_size) { tt::DataFormat cb_data_format = tt_metal::datatype_to_dataformat_converter(a.get_dtype()); uint32_t single_tile_size = tt_metal::detail::TileSize(cb_data_format); diff --git a/ttnn/cpp/ttnn/operations/experimental/transformer/split_query_key_value_and_split_heads/split_query_key_value_and_split_heads.hpp b/ttnn/cpp/ttnn/operations/experimental/transformer/split_query_key_value_and_split_heads/split_query_key_value_and_split_heads.hpp index 0c3a12c4be0..efe7747d618 100644 --- a/ttnn/cpp/ttnn/operations/experimental/transformer/split_query_key_value_and_split_heads/split_query_key_value_and_split_heads.hpp +++ b/ttnn/cpp/ttnn/operations/experimental/transformer/split_query_key_value_and_split_heads/split_query_key_value_and_split_heads.hpp @@ -19,7 +19,7 @@ struct SplitFusedQKVAndSplitHeadsOperation { const std::optional& memory_config = std::nullopt, const uint32_t num_heads = 16, std::optional>> optional_output_tensors = std::nullopt) { - auto result = operation::run( + auto result = tt::tt_metal::operation::run( SplitFusedQKVAndSplitHeadsDeviceOperation{ compute_with_storage_grid_size, memory_config.value_or(input_tensor.memory_config()), num_heads}, {input_tensor}, @@ -48,9 +48,9 @@ struct 
SplitFusedQKVAndSplitHeadsOperation { const std::vector& input_tensors, const std::vector>& optional_inputs) { const auto& input_tensor = input_tensors.at(0); return { - Tensor(operation::get_workers_for_op_output({input_tensor})), - Tensor(operation::get_workers_for_op_output({input_tensor})), - Tensor(operation::get_workers_for_op_output({input_tensor}))}; + Tensor(tt::tt_metal::operation::get_workers_for_op_output({input_tensor})), + Tensor(tt::tt_metal::operation::get_workers_for_op_output({input_tensor})), + Tensor(tt::tt_metal::operation::get_workers_for_op_output({input_tensor}))}; } }; diff --git a/ttnn/cpp/ttnn/operations/full/device/full_device_operation.cpp b/ttnn/cpp/ttnn/operations/full/device/full_device_operation.cpp index 020e01753c1..f67a73a230c 100644 --- a/ttnn/cpp/ttnn/operations/full/device/full_device_operation.cpp +++ b/ttnn/cpp/ttnn/operations/full/device/full_device_operation.cpp @@ -54,8 +54,10 @@ FullOperation::spec_return_value_t FullOperation::compute_output_specs( const operation_attributes_t& operation_attributes, const tensor_args_t&) { return TensorSpec( Shape(operation_attributes.shape), - TensorLayout( - operation_attributes.dtype, PageConfig(operation_attributes.layout), operation_attributes.memory_config)); + tt::tt_metal::TensorLayout( + operation_attributes.dtype, + tt::tt_metal::PageConfig(operation_attributes.layout), + operation_attributes.memory_config)); }; FullOperation::tensor_return_value_t FullOperation::create_output_tensors( diff --git a/ttnn/cpp/ttnn/operations/full/device/full_device_operation.hpp b/ttnn/cpp/ttnn/operations/full/device/full_device_operation.hpp index 7cf76d2c7ca..a015b639b17 100644 --- a/ttnn/cpp/ttnn/operations/full/device/full_device_operation.hpp +++ b/ttnn/cpp/ttnn/operations/full/device/full_device_operation.hpp @@ -29,7 +29,7 @@ struct FullOperation { struct ProgramFactory { struct shared_variables_t { - KernelHandle writer_id; + tt::tt_metal::KernelHandle writer_id; std::size_t num_cores; 
std::size_t core_h; }; diff --git a/ttnn/cpp/ttnn/operations/full/full.cpp b/ttnn/cpp/ttnn/operations/full/full.cpp index 92cf4a6e146..b2ec8ae378a 100644 --- a/ttnn/cpp/ttnn/operations/full/full.cpp +++ b/ttnn/cpp/ttnn/operations/full/full.cpp @@ -9,6 +9,7 @@ #include "ttnn/tensor/types.hpp" namespace ttnn::operations::full { + Tensor Full::invoke( const ttnn::SmallVector& shape, const std::variant fill_value, diff --git a/ttnn/cpp/ttnn/operations/full_like/device/full_like_device_operation.cpp b/ttnn/cpp/ttnn/operations/full_like/device/full_like_device_operation.cpp index be79166991a..2d701937461 100644 --- a/ttnn/cpp/ttnn/operations/full_like/device/full_like_device_operation.cpp +++ b/ttnn/cpp/ttnn/operations/full_like/device/full_like_device_operation.cpp @@ -46,8 +46,10 @@ FullLikeOperation::spec_return_value_t FullLikeOperation::compute_output_specs( const operation_attributes_t& operation_attributes, const tensor_args_t& tensor_args) { return TensorSpec( tensor_args.input.get_logical_shape(), - TensorLayout( - operation_attributes.dtype, PageConfig(operation_attributes.layout), operation_attributes.memory_config)); + tt::tt_metal::TensorLayout( + operation_attributes.dtype, + tt::tt_metal::PageConfig(operation_attributes.layout), + operation_attributes.memory_config)); } FullLikeOperation::tensor_return_value_t FullLikeOperation::create_output_tensors( diff --git a/ttnn/cpp/ttnn/operations/full_like/device/full_like_device_operation.hpp b/ttnn/cpp/ttnn/operations/full_like/device/full_like_device_operation.hpp index 905ba6dbe93..52c5be927e5 100644 --- a/ttnn/cpp/ttnn/operations/full_like/device/full_like_device_operation.hpp +++ b/ttnn/cpp/ttnn/operations/full_like/device/full_like_device_operation.hpp @@ -31,7 +31,7 @@ struct FullLikeOperation { struct ProgramFactory { struct shared_variables_t { - KernelHandle writer_kernel_id; + tt::tt_metal::KernelHandle writer_kernel_id; std::size_t num_cores; std::size_t num_cores_y; }; diff --git 
a/ttnn/cpp/ttnn/operations/full_like/device/full_like_factory.cpp b/ttnn/cpp/ttnn/operations/full_like/device/full_like_factory.cpp index 0e1bd2c7df1..fbfa6ec5622 100644 --- a/ttnn/cpp/ttnn/operations/full_like/device/full_like_factory.cpp +++ b/ttnn/cpp/ttnn/operations/full_like/device/full_like_factory.cpp @@ -14,7 +14,6 @@ namespace ttnn::operations::full_like { using namespace tt; -using namespace tt::tt_metal; using namespace tt::constants; using namespace tt::tt_metal::detail; @@ -72,8 +71,8 @@ FullLikeOperation::ProgramFactory::cached_program_t FullLikeOperation::ProgramFa constexpr CBIndex cb_fill_value_id = CBIndex::c_24; - CircularBufferConfig cb_value_config = CircularBufferConfig(single_tile_size, {{cb_fill_value_id, data_format}}) - .set_page_size(cb_fill_value_id, single_tile_size); + auto cb_value_config = tt::tt_metal::CircularBufferConfig(single_tile_size, {{cb_fill_value_id, data_format}}) + .set_page_size(cb_fill_value_id, single_tile_size); auto cb_fill_value = CreateCircularBuffer(program, all_cores, cb_value_config); std::map writer_defines; @@ -101,7 +100,7 @@ FullLikeOperation::ProgramFactory::cached_program_t FullLikeOperation::ProgramFa program, "ttnn/cpp/ttnn/operations/full/device/kernels/writer_full.cpp", all_cores, - WriterDataMovementConfig(writer_compile_time_args, writer_defines)); + tt::tt_metal::WriterDataMovementConfig(writer_compile_time_args, writer_defines)); uint32_t tiles_offset = 0; for (uint32_t i = 0; i < num_cores; i++) { diff --git a/ttnn/cpp/ttnn/operations/functions.hpp b/ttnn/cpp/ttnn/operations/functions.hpp index f70c7bac474..eb4b966dd24 100644 --- a/ttnn/cpp/ttnn/operations/functions.hpp +++ b/ttnn/cpp/ttnn/operations/functions.hpp @@ -22,8 +22,11 @@ using tt::tt_metal::IDevice; using tt::tt_metal::Layout; using tt::tt_metal::MemoryConfig; using tt::tt_metal::OwnedStorage; +using tt::tt_metal::PageConfig; using tt::tt_metal::StorageType; using tt::tt_metal::Tensor; +using tt::tt_metal::TensorLayout; +using 
tt::tt_metal::TensorMemoryLayout; template static Tensor index_trilu( @@ -255,7 +258,7 @@ static Tensor fill_first_val_into_tensor( } else { tt::tt_metal::tensor_impl::read_data_from_device_buffer(device_buffer, data_vec); } - auto input_buffer = owned_buffer::create(std::move(data_vec)); + auto input_buffer = tt::tt_metal::owned_buffer::create(std::move(data_vec)); const ttnn::Shape input_tensor_strides = input_tensor.strides(); for (uint32_t i = 0; i < physical_volume; i++) { owned_buffer[i] = input_buffer[0]; @@ -298,7 +301,7 @@ static Tensor prod_result_computation_GS( } else { tt::tt_metal::tensor_impl::read_data_from_device_buffer(device_buffer, data_vec); } - auto input_buffer = owned_buffer::create(std::move(data_vec)); + auto input_buffer = tt::tt_metal::owned_buffer::create(std::move(data_vec)); const ttnn::Shape input_tensor_strides = input_tensor.strides(); auto result = static_cast(1.0f); for (uint32_t i = s_a[0] - 1; i < s_a[0]; i++) { @@ -357,7 +360,7 @@ static Tensor prod_result_computation_WH_B0( } else { tt::tt_metal::tensor_impl::read_data_from_device_buffer(device_buffer, data_vec); } - auto input_buffer = owned_buffer::create(std::move(data_vec)); + auto input_buffer = tt::tt_metal::owned_buffer::create(std::move(data_vec)); const ttnn::Shape input_tensor_strides = input_tensor.strides(); auto result = static_cast(1.0f); // need to access the last 4 rows and alternating columns of index 17 ,19, 21, 23, 25, 27, 29, 31 @@ -507,7 +510,7 @@ static Tensor manual_insertion( } else { tt::tt_metal::tensor_impl::read_data_from_device_buffer(device_buffer, data_vec); } - auto owned_buffer = owned_buffer::create(std::move(data_vec)); + auto owned_buffer = tt::tt_metal::owned_buffer::create(std::move(data_vec)); auto output = Tensor( OwnedStorage{owned_buffer}, TensorSpec( diff --git a/ttnn/cpp/ttnn/operations/index_fill/device/index_fill_device_operation.hpp b/ttnn/cpp/ttnn/operations/index_fill/device/index_fill_device_operation.hpp index 
75dd75d3c86..e0b07ead662 100644 --- a/ttnn/cpp/ttnn/operations/index_fill/device/index_fill_device_operation.hpp +++ b/ttnn/cpp/ttnn/operations/index_fill/device/index_fill_device_operation.hpp @@ -25,8 +25,8 @@ struct IndexFillOperation { using tensor_return_value_t = Tensor; struct MultiCore { struct shared_variables_t { - KernelHandle reader_kernel_id; - KernelHandle writer_kernel_id; + tt::tt_metal::KernelHandle reader_kernel_id; + tt::tt_metal::KernelHandle writer_kernel_id; std::size_t num_cores; std::size_t num_cores_y; }; diff --git a/ttnn/cpp/ttnn/operations/kv_cache/kv_cache.cpp b/ttnn/cpp/ttnn/operations/kv_cache/kv_cache.cpp index ee8e0eb5044..ea8dc7ebeef 100644 --- a/ttnn/cpp/ttnn/operations/kv_cache/kv_cache.cpp +++ b/ttnn/cpp/ttnn/operations/kv_cache/kv_cache.cpp @@ -16,7 +16,7 @@ ttnn::Tensor ExecuteUpdateCache::invoke( const uint32_t batch_offset, std::optional compute_kernel_config) { auto kernel_config_val = init_device_compute_kernel_config(input.device()->arch(), compute_kernel_config); - operation::run( + tt::tt_metal::operation::run( UpdateCache{0, update_index, batch_offset, UpdateCacheOpType::UPDATE, kernel_config_val}, std::vector{cache, input}); return cache; @@ -24,7 +24,8 @@ ttnn::Tensor ExecuteUpdateCache::invoke( ttnn::Tensor ExecuteFillCache::invoke( const ttnn::Tensor& cache, const ttnn::Tensor& input, const uint32_t batch_index) { - operation::run(UpdateCache{batch_index, 0, 0, UpdateCacheOpType::FILL}, std::vector{cache, input}); + tt::tt_metal::operation::run( + UpdateCache{batch_index, 0, 0, UpdateCacheOpType::FILL}, std::vector{cache, input}); return cache; } diff --git a/ttnn/cpp/ttnn/operations/matmul/device/matmul_op.cpp b/ttnn/cpp/ttnn/operations/matmul/device/matmul_op.cpp index 6f8df9d82ff..3d72eb55267 100644 --- a/ttnn/cpp/ttnn/operations/matmul/device/matmul_op.cpp +++ b/ttnn/cpp/ttnn/operations/matmul/device/matmul_op.cpp @@ -1291,7 +1291,7 @@ Matmul create_matmul_struct( if (is_optional_output_tensor) { const auto& 
optional_output_tensor = optional_output_tensors.at(0); - if (output_mem_config == operation::DEFAULT_OUTPUT_MEMORY_CONFIG) { + if (output_mem_config == tt::tt_metal::operation::DEFAULT_OUTPUT_MEMORY_CONFIG) { output_mem_config = optional_output_tensor->memory_config(); } else { TT_FATAL( @@ -2229,7 +2229,7 @@ operation::ProgramWithCallbacks Matmul::create_program( get_program_config(input_tensor_a, input_tensor_b, bias_single_tile_size, this); return std::visit( - [&](const auto& program_config) -> operation::ProgramWithCallbacks { + [&](const auto& program_config) -> tt::tt_metal::operation::ProgramWithCallbacks { using ProgramConfigType = std::decay_t; if constexpr (std::is_same_v) { TT_FATAL(!bias.has_value(), "Bias is not supported for MatmulMultiCoreReuseProgramConfig!"); diff --git a/ttnn/cpp/ttnn/operations/matmul/device/matmul_op.hpp b/ttnn/cpp/ttnn/operations/matmul/device/matmul_op.hpp index 969d458b52e..c3f99b37056 100644 --- a/ttnn/cpp/ttnn/operations/matmul/device/matmul_op.hpp +++ b/ttnn/cpp/ttnn/operations/matmul/device/matmul_op.hpp @@ -179,7 +179,7 @@ struct Matmul { const bool transpose_a = false; const bool transpose_b = false; const std::optional output_tile; - const std::optional global_cb; + const std::optional global_cb; void validate( const std::vector& input_tensors, diff --git a/ttnn/cpp/ttnn/operations/matmul/device/matmul_op_multi_core_program_factory.cpp b/ttnn/cpp/ttnn/operations/matmul/device/matmul_op_multi_core_program_factory.cpp index bc7e7af1671..e88f16904ed 100644 --- a/ttnn/cpp/ttnn/operations/matmul/device/matmul_op_multi_core_program_factory.cpp +++ b/ttnn/cpp/ttnn/operations/matmul/device/matmul_op_multi_core_program_factory.cpp @@ -10,7 +10,6 @@ using namespace tt; using namespace tt::constants; -using namespace tt::tt_metal; namespace ttnn { @@ -18,7 +17,8 @@ namespace operations { namespace matmul { -operation::ProgramWithCallbacks matmul_multi_core(const Tensor& a, const Tensor& b, Tensor& output, bool bcast_batch) { 
+tt::tt_metal::operation::ProgramWithCallbacks matmul_multi_core( + const Tensor& a, const Tensor& b, Tensor& output, bool bcast_batch) { tt_metal::Program program{}; const auto &ashape = a.get_padded_shape(), bshape = b.get_padded_shape(); diff --git a/ttnn/cpp/ttnn/operations/matmul/device/matmul_op_multi_core_reuse_mcast_1d_program_factory.cpp b/ttnn/cpp/ttnn/operations/matmul/device/matmul_op_multi_core_reuse_mcast_1d_program_factory.cpp index 63ce0c232a1..6263a44359d 100644 --- a/ttnn/cpp/ttnn/operations/matmul/device/matmul_op_multi_core_reuse_mcast_1d_program_factory.cpp +++ b/ttnn/cpp/ttnn/operations/matmul/device/matmul_op_multi_core_reuse_mcast_1d_program_factory.cpp @@ -18,7 +18,6 @@ using namespace tt; using namespace tt::constants; -using namespace tt_metal; using ttnn::operations::unary::UnaryOpType; using ttnn::operations::unary::UnaryWithParam; @@ -59,9 +58,9 @@ uint32_t get_preferred_noc( return use_hop_cores ? 1 : noc; } -operation::ProgramWithCallbacks create_program_mcast_in0( +tt::tt_metal::operation::ProgramWithCallbacks create_program_mcast_in0( tt_metal::Program& program, - const Tensor& a, + const tt::tt_metal::Tensor& a, tt_metal::IDevice* device, MathFidelity math_fidelity, bool fp32_dest_acc_en, @@ -99,6 +98,8 @@ operation::ProgramWithCallbacks create_program_mcast_in0( bool output_is_sharded, bool untilize_out, std::optional& fused_op_signaler) { + using tt::tt_metal::num_cores_to_corerangeset; + // currently only support transpose of the full tile bool in1_transpose_tile = in1_tile.get_transpose_of_faces() && in1_tile.get_transpose_within_face(); @@ -454,8 +455,8 @@ operation::ProgramWithCallbacks create_program_mcast_in0( mm_kernel_in1_sender_writer_defines["SKIP_MCAST"] = "1"; // in1 is the reader of weights/output writer, and we choose to make it use the optimized reader noc - tt_metal::NOC in0_noc = detail::GetPreferredNOCForDRAMWrite(device->arch()); - tt_metal::NOC in1_noc = detail::GetPreferredNOCForDRAMRead(device->arch()); + 
tt_metal::NOC in0_noc = tt::tt_metal::detail::GetPreferredNOCForDRAMWrite(device->arch()); + tt_metal::NOC in1_noc = tt::tt_metal::detail::GetPreferredNOCForDRAMRead(device->arch()); if (fuse_op) { // Create semaphores @@ -480,8 +481,8 @@ operation::ProgramWithCallbacks create_program_mcast_in0( .compile_args = in0_sender_compile_time_args, .defines = mm_kernel_in0_sender_writer_defines}); - KernelHandle mm_kernel_in0_mcast_cores_without_work_and_in_receiver_grid_id = 0; - KernelHandle mm_kernel_in0_mcast_cores_without_work_and_not_in_receiver_grid_id = 0; + tt::tt_metal::KernelHandle mm_kernel_in0_mcast_cores_without_work_and_in_receiver_grid_id = 0; + tt::tt_metal::KernelHandle mm_kernel_in0_mcast_cores_without_work_and_not_in_receiver_grid_id = 0; if (in0_is_sharded) { if (in0_mcast_cores_without_work_and_in_receiver_grid.num_cores() > 0) { in0_sender_compile_time_args[0] = 0; // core_has_output_block_work @@ -513,7 +514,7 @@ operation::ProgramWithCallbacks create_program_mcast_in0( } } - KernelHandle mm_kernel_in0_receiver_id = 0; + tt::tt_metal::KernelHandle mm_kernel_in0_receiver_id = 0; if (!in0_is_sharded and in0_mcast_receivers.num_cores() > 0) { mm_kernel_in0_receiver_id = tt_metal::CreateKernel( program, @@ -618,7 +619,7 @@ operation::ProgramWithCallbacks create_program_mcast_in0( in1_CB_size); uint32_t src2_cb_index = tt::CBIndex::c_2; - CBHandle cb_src2 = 0; + tt::tt_metal::CBHandle cb_src2 = 0; if (in0_is_sharded) { tt_metal::CircularBufferConfig src2_cb_config = tt_metal::CircularBufferConfig(in2_CB_size, {{src2_cb_index, in0_data_format}}) @@ -636,8 +637,9 @@ operation::ProgramWithCallbacks create_program_mcast_in0( // Local L1 to store temp vars uint32_t l1_cb_index = tt::CBIndex::c_6; - CircularBufferConfig cb_for_l1_array_config = - CircularBufferConfig(32 * 2, {{l1_cb_index, tt::DataFormat::Float16_b}}).set_page_size(l1_cb_index, 32 * 2); + tt::tt_metal::CircularBufferConfig cb_for_l1_array_config = + tt::tt_metal::CircularBufferConfig(32 * 2, 
{{l1_cb_index, tt::DataFormat::Float16_b}}) + .set_page_size(l1_cb_index, 32 * 2); tt_metal::CreateCircularBuffer(program, all_cores, cb_for_l1_array_config); } @@ -732,7 +734,7 @@ operation::ProgramWithCallbacks create_program_mcast_in0( CoreCoord start_core_noc = top_left_core_physical; CoreCoord end_core_noc = bottom_right_core_physical; - if (in0_noc == NOC::NOC_1) { + if (in0_noc == tt::tt_metal::NOC::NOC_1) { std::swap(start_core_noc, end_core_noc); } @@ -890,10 +892,10 @@ operation::ProgramWithCallbacks create_program_mcast_in0( cores, num_cores_with_work]( const void* operation, - Program& program, - const std::vector& input_tensors, - const std::vector>& optional_input_tensors, - const std::vector& output_tensors) { + tt::tt_metal::Program& program, + const std::vector& input_tensors, + const std::vector>& optional_input_tensors, + const std::vector& output_tensors) { TT_ASSERT(input_tensors.size() + optional_input_tensors.size() == 3); TT_ASSERT(output_tensors.size() == 1); @@ -901,7 +903,7 @@ operation::ProgramWithCallbacks create_program_mcast_in0( auto src_buffer_b = input_tensors.at(1).buffer(); auto bias_tensor = optional_input_tensors.at(0); - std::optional bias_buffer; + std::optional bias_buffer; if (bias_tensor.has_value()) { bias_buffer = bias_tensor.value().buffer(); } @@ -953,7 +955,7 @@ operation::ProgramWithCallbacks create_program_mcast_in0( return {.program = std::move(program), .override_runtime_arguments_callback = override_runtime_arguments_callback}; } -operation::ProgramWithCallbacks create_program_mcast_in1( +tt::tt_metal::operation::ProgramWithCallbacks create_program_mcast_in1( tt_metal::IDevice* device, MathFidelity math_fidelity, bool fp32_dest_acc_en, @@ -1081,7 +1083,7 @@ operation::ProgramWithCallbacks create_program_mcast_in1( constexpr bool row_major = true; CoreRangeSet all_cores = - num_cores_to_corerangeset(start_core, num_cores, compute_with_storage_grid_size, row_major); + 
tt::tt_metal::num_cores_to_corerangeset(start_core, num_cores, compute_with_storage_grid_size, row_major); CoreRange in1_mcast_receiver_cores_bounding_box = all_cores.bounding_box(); uint32_t in1_mcast_receiver_num_cores = in1_mcast_receiver_cores_bounding_box.size(); // always mcast to full grid @@ -1091,8 +1093,8 @@ operation::ProgramWithCallbacks create_program_mcast_in1( auto receiver_start_core = start_core.x != (compute_with_storage_grid_size.x - 1) ? CoreCoord{start_core.x + 1, start_core.y} : CoreCoord{start_core.x, start_core.y + 1}; - in1_mcast_receivers = - num_cores_to_corerangeset(receiver_start_core, num_cores - 1, compute_with_storage_grid_size, row_major); + in1_mcast_receivers = tt::tt_metal::num_cores_to_corerangeset( + receiver_start_core, num_cores - 1, compute_with_storage_grid_size, row_major); } // Mcast args @@ -1278,8 +1280,8 @@ operation::ProgramWithCallbacks create_program_mcast_in1( } // in1 is the reader of weights/output writer, and we choose to make it use the optimized reader noc - tt_metal::NOC in0_noc = detail::GetPreferredNOCForDRAMWrite(device->arch()); - tt_metal::NOC in1_noc = detail::GetPreferredNOCForDRAMRead(device->arch()); + tt_metal::NOC in0_noc = tt::tt_metal::detail::GetPreferredNOCForDRAMWrite(device->arch()); + tt_metal::NOC in1_noc = tt::tt_metal::detail::GetPreferredNOCForDRAMRead(device->arch()); auto mm_kernel_in0_sender_id = tt_metal::CreateKernel( program, @@ -1301,7 +1303,7 @@ operation::ProgramWithCallbacks create_program_mcast_in1( .compile_args = in1_sender_writer_compile_time_args, .defines = mm_kernel_in1_sender_writer_defines}); - KernelHandle mm_kernel_in1_receiver_writer_id = 0; + tt::tt_metal::KernelHandle mm_kernel_in1_receiver_writer_id = 0; if (in1_mcast_receivers.num_cores() > 0) { mm_kernel_in1_receiver_writer_id = tt_metal::CreateKernel( program, @@ -1384,7 +1386,7 @@ operation::ProgramWithCallbacks create_program_mcast_in1( in0_CB_size); uint32_t src2_cb_index = tt::CBIndex::c_2; - CBHandle 
cb_src2 = 0; + tt::tt_metal::CBHandle cb_src2 = 0; if (in0_is_sharded and extract_shard_sub_blocks) { // in0_is_sharded is technically redundant tt_metal::CircularBufferConfig src2_cb_config = tt_metal::CircularBufferConfig(in2_CB_size, {{src2_cb_index, in0_data_format}}) @@ -1498,7 +1500,7 @@ operation::ProgramWithCallbacks create_program_mcast_in1( CoreCoord start_core_noc = bottom_right_core_physical; CoreCoord end_core_noc = top_left_core_physical; - if (in1_noc == NOC::NOC_0) { + if (in1_noc == tt::tt_metal::NOC::NOC_0) { std::swap(start_core_noc, end_core_noc); } @@ -1623,10 +1625,10 @@ operation::ProgramWithCallbacks create_program_mcast_in1( start_core, cores]( const void* operation, - Program& program, - const std::vector& input_tensors, - const std::vector>& optional_input_tensors, - const std::vector& output_tensors) { + tt::tt_metal::Program& program, + const std::vector& input_tensors, + const std::vector>& optional_input_tensors, + const std::vector& output_tensors) { TT_ASSERT(input_tensors.size() + optional_input_tensors.size() == 3); TT_ASSERT(output_tensors.size() == 1); @@ -1634,7 +1636,7 @@ operation::ProgramWithCallbacks create_program_mcast_in1( auto src_buffer_b = input_tensors.at(1).buffer(); auto bias_tensor = optional_input_tensors.at(0); - std::optional bias_buffer; + std::optional bias_buffer; if (bias_tensor.has_value()) { bias_buffer = bias_tensor.value().buffer(); } @@ -1691,10 +1693,10 @@ operation::ProgramWithCallbacks create_program_mcast_in1( return {.program = std::move(program), .override_runtime_arguments_callback = override_runtime_arguments_callback}; } -operation::ProgramWithCallbacks create_program_gather_in0( +tt::tt_metal::operation::ProgramWithCallbacks create_program_gather_in0( tt_metal::Program& program, - const Tensor& a, - const Tensor& b, + const tt::tt_metal::Tensor& a, + const tt::tt_metal::Tensor& b, tt_metal::IDevice* device, MathFidelity math_fidelity, bool fp32_dest_acc_en, @@ -1885,8 +1887,8 @@ 
operation::ProgramWithCallbacks create_program_gather_in0( bmm_op_utils::add_stagger_defines_if_needed(device->arch(), num_cores, mm_kernel_defines); // in1 is the reader of weights/output writer, and we choose to make it use the optimized reader noc - tt_metal::NOC in0_noc = detail::GetPreferredNOCForDRAMWrite(device->arch()); - tt_metal::NOC in1_noc = detail::GetPreferredNOCForDRAMRead(device->arch()); + tt_metal::NOC in0_noc = tt::tt_metal::detail::GetPreferredNOCForDRAMWrite(device->arch()); + tt_metal::NOC in1_noc = tt::tt_metal::detail::GetPreferredNOCForDRAMRead(device->arch()); /* Create the kernels */ auto mm_kernel_in0_id = tt_metal::CreateKernel( @@ -1932,7 +1934,7 @@ operation::ProgramWithCallbacks create_program_gather_in0( auto cb_src0 = tt_metal::CreateCircularBuffer(program, ring_cores, src0_cb_config); uint32_t src1_cb_index = tt::CBIndex::c_1; - CBHandle cb_src1; + tt::tt_metal::CBHandle cb_src1; if (use_global_cb) { uint32_t in1_block_size_bytes = in1_single_tile_size * in1_block_num_tiles; uint32_t remote_cb_index = tt::CBIndex::c_31; @@ -2092,10 +2094,10 @@ operation::ProgramWithCallbacks create_program_gather_in0( auto override_runtime_arguments_callback = [mm_kernel_in0_id, mm_kernel_in1_sender_writer_id, cb_src0, cb_src1, cb_output, num_cores, cores, global_cb]( const void* operation, - Program& program, - const std::vector& input_tensors, - const std::vector>& optional_input_tensors, - const std::vector& output_tensors) { + tt::tt_metal::Program& program, + const std::vector& input_tensors, + const std::vector>& optional_input_tensors, + const std::vector& output_tensors) { TT_ASSERT(input_tensors.size() + optional_input_tensors.size() == 3); TT_ASSERT(output_tensors.size() == 1); @@ -2141,7 +2143,7 @@ namespace operations { namespace matmul { -operation::ProgramWithCallbacks matmul_multi_core_reuse_mcast_1d_optimized_( +tt::tt_metal::operation::ProgramWithCallbacks matmul_multi_core_reuse_mcast_1d_optimized_( tt::tt_metal::Program& 
program, const Tensor& a, const Tensor& b, @@ -2378,7 +2380,7 @@ operation::ProgramWithCallbacks matmul_multi_core_reuse_mcast_1d_optimized_( } } -operation::ProgramWithCallbacks matmul_multi_core_reuse_mcast_1d_optimized( +tt::tt_metal::operation::ProgramWithCallbacks matmul_multi_core_reuse_mcast_1d_optimized( const Tensor& a, const Tensor& b, const std::optional& bias, @@ -2431,7 +2433,7 @@ operation::ProgramWithCallbacks matmul_multi_core_reuse_mcast_1d_optimized( num_global_cb_receivers); } -operation::ProgramWithCallbacks matmul_multi_core_reuse_mcast_1d_optimized_helper( +tt::tt_metal::operation::ProgramWithCallbacks matmul_multi_core_reuse_mcast_1d_optimized_helper( tt::tt_metal::Program& program, const Tensor& a, const Tensor& b, diff --git a/ttnn/cpp/ttnn/operations/matmul/device/matmul_op_multi_core_reuse_mcast_2d_program_factory.cpp b/ttnn/cpp/ttnn/operations/matmul/device/matmul_op_multi_core_reuse_mcast_2d_program_factory.cpp index 0b8c289aaf8..6490e54bed7 100644 --- a/ttnn/cpp/ttnn/operations/matmul/device/matmul_op_multi_core_reuse_mcast_2d_program_factory.cpp +++ b/ttnn/cpp/ttnn/operations/matmul/device/matmul_op_multi_core_reuse_mcast_2d_program_factory.cpp @@ -10,6 +10,7 @@ #include #include #include +#include "tt-metalium/buffer_constants.hpp" #include "ttnn/operation.hpp" #include "ttnn/operations/matmul/device/matmul_op.hpp" #include "ttnn/operations/eltwise/unary/common/unary_op_utils.hpp" @@ -20,11 +21,8 @@ using ttnn::operations::unary::UnaryOpType; using ttnn::operations::unary::UnaryWithParam; namespace reuse_mcast_optimized_helpers { -using namespace tt::constants; -using namespace tt; -using namespace tt_metal; -operation::ProgramWithCallbacks create_program_mcast_in0_in1( +tt::tt_metal::operation::ProgramWithCallbacks create_program_mcast_in0_in1( tt_metal::Program& program, tt_metal::IDevice* device, MathFidelity math_fidelity, @@ -59,6 +57,8 @@ operation::ProgramWithCallbacks create_program_mcast_in0_in1( tt::DataFormat 
output_data_format, bool untilize_out, std::optional& fused_op_signaler) { + using tt::tt_metal::TensorMemoryLayout; + // currently only support transpose of the full tile bool in1_transpose_tile = in1_tile.get_transpose_of_faces() && in1_tile.get_transpose_within_face(); @@ -550,13 +550,13 @@ operation::ProgramWithCallbacks create_program_mcast_in0_in1( } // in1 is the reader of weights/output writer, and we choose to make it use the optimized reader noc - tt_metal::NOC in0_noc = detail::GetPreferredNOCForDRAMWrite(device->arch()); - tt_metal::NOC in1_noc = detail::GetPreferredNOCForDRAMRead(device->arch()); - tt_metal::NOC in0_split_noc = detail::GetPreferredNOCForDRAMRead(device->arch()); - tt_metal::NOC in1_split_noc = detail::GetPreferredNOCForDRAMWrite(device->arch()); + tt_metal::NOC in0_noc = tt::tt_metal::detail::GetPreferredNOCForDRAMWrite(device->arch()); + tt_metal::NOC in1_noc = tt::tt_metal::detail::GetPreferredNOCForDRAMRead(device->arch()); + tt_metal::NOC in0_split_noc = tt::tt_metal::detail::GetPreferredNOCForDRAMRead(device->arch()); + tt_metal::NOC in1_split_noc = tt::tt_metal::detail::GetPreferredNOCForDRAMWrite(device->arch()); - KernelHandle mm_kernel_in0_sender_id = 0; - KernelHandle mm_kernel_in0_mcast_cores_without_work_and_not_in_receiver_grid_id = 0; + tt::tt_metal::KernelHandle mm_kernel_in0_sender_id = 0; + tt::tt_metal::KernelHandle mm_kernel_in0_mcast_cores_without_work_and_not_in_receiver_grid_id = 0; if (in0_block_sharded) { mm_kernel_in0_sender_id = tt_metal::CreateKernel( program, @@ -609,7 +609,7 @@ operation::ProgramWithCallbacks create_program_mcast_in0_in1( .compile_args = in1_sender_writer_compile_time_args, .defines = mm_kernel_in1_sender_writer_defines}); - KernelHandle mm_kernel_in1_receiver_writer_id = 0; + tt::tt_metal::KernelHandle mm_kernel_in1_receiver_writer_id = 0; if (in1_receiver.num_cores() > 0) { mm_kernel_in1_receiver_writer_id = tt_metal::CreateKernel( program, @@ -624,7 +624,7 @@ 
operation::ProgramWithCallbacks create_program_mcast_in0_in1( .defines = mm_kernel_in1_receiver_writer_defines}); } - KernelHandle mm_kernel_in0_receiver_id = 0; + tt::tt_metal::KernelHandle mm_kernel_in0_receiver_id = 0; if (!in0_block_sharded and in0_receiver_interleaved.num_cores() > 0) { mm_kernel_in0_receiver_id = tt_metal::CreateKernel( program, @@ -637,8 +637,8 @@ operation::ProgramWithCallbacks create_program_mcast_in0_in1( .compile_args = in0_receiver_compile_time_args}); } - KernelHandle mm_kernel_in1_receiver_writer_other_noc_setup_id = mm_kernel_in1_receiver_writer_id; - KernelHandle mm_kernel_in0_receiver_other_noc_setup_id = mm_kernel_in0_receiver_id; + tt::tt_metal::KernelHandle mm_kernel_in1_receiver_writer_other_noc_setup_id = mm_kernel_in1_receiver_writer_id; + tt::tt_metal::KernelHandle mm_kernel_in0_receiver_other_noc_setup_id = mm_kernel_in0_receiver_id; if (in0_receiver_in1_receiver_interleaved_other_cores.has_value()) { mm_kernel_in1_receiver_writer_other_noc_setup_id = tt_metal::CreateKernel( @@ -745,7 +745,7 @@ operation::ProgramWithCallbacks create_program_mcast_in0_in1( in1_CB_size); uint32_t src2_cb_index = tt::CBIndex::c_2; - CBHandle cb_src2 = 0; + tt::tt_metal::CBHandle cb_src2 = 0; if (in0_block_sharded) { tt_metal::CircularBufferConfig src2_cb_config = tt_metal::CircularBufferConfig(in2_CB_size, {{src2_cb_index, in0_data_format}}) @@ -763,8 +763,9 @@ operation::ProgramWithCallbacks create_program_mcast_in0_in1( // Local L1 to store temp vars uint32_t l1_cb_index = tt::CBIndex::c_6; - CircularBufferConfig cb_for_l1_array_config = - CircularBufferConfig(32 * 2, {{l1_cb_index, tt::DataFormat::Float16_b}}).set_page_size(l1_cb_index, 32 * 2); + tt::tt_metal::CircularBufferConfig cb_for_l1_array_config = + tt::tt_metal::CircularBufferConfig(32 * 2, {{l1_cb_index, tt::DataFormat::Float16_b}}) + .set_page_size(l1_cb_index, 32 * 2); tt_metal::CreateCircularBuffer(program, all_cores, cb_for_l1_array_config); } @@ -861,7 +862,7 @@ 
operation::ProgramWithCallbacks create_program_mcast_in0_in1( (out_block_h / out_subblock_h - last_block_num_nonzero_subblocks_h) * (out_block_w * out_subblock_h); if (in0_block_sharded) { - if (in0_noc == NOC::NOC_1) { + if (in0_noc == tt::tt_metal::NOC::NOC_1) { std::swap(in0_mcast_receiver_grid_diff_coord_start, in0_mcast_receiver_grid_diff_coord_end); } } @@ -910,14 +911,14 @@ operation::ProgramWithCallbacks create_program_mcast_in0_in1( // Assuming in0 is NOC0 auto in0_mcast_start = left_core_plus_one_physical; auto in0_mcast_end = right_core_physical; - if (in0_noc == NOC::NOC_1) { + if (in0_noc == tt::tt_metal::NOC::NOC_1) { std::swap(in0_mcast_start, in0_mcast_end); } // Assuming in1 is NOC1 auto in1_mcast_start = bottom_core_physical; auto in1_mcast_end = top_core_plus_one_physical; - if (in1_noc == NOC::NOC_0) { + if (in1_noc == tt::tt_metal::NOC::NOC_0) { std::swap(in1_mcast_start, in1_mcast_end); } @@ -1230,10 +1231,10 @@ operation::ProgramWithCallbacks create_program_mcast_in0_in1( transpose_mcast, cores]( const void* operation, - Program& program, - const std::vector& input_tensors, - const std::vector>& optional_input_tensors, - const std::vector& output_tensors) { + tt::tt_metal::Program& program, + const std::vector& input_tensors, + const std::vector>& optional_input_tensors, + const std::vector& output_tensors) { TT_ASSERT(input_tensors.size() + optional_input_tensors.size() == 3); TT_ASSERT(output_tensors.size() == 1); @@ -1246,7 +1247,7 @@ operation::ProgramWithCallbacks create_program_mcast_in0_in1( bool src0_sharded = input_tensors[0].memory_config().is_sharded(); bool out_sharded = output_tensors[0].memory_config().is_sharded(); - std::optional bias_buffer; + std::optional bias_buffer; if (bias_tensor.has_value()) { bias_buffer = bias_tensor.value().buffer(); } @@ -1304,7 +1305,7 @@ namespace operations { namespace matmul { -operation::ProgramWithCallbacks matmul_multi_core_reuse_mcast_2d_optimized_( 
+tt::tt_metal::operation::ProgramWithCallbacks matmul_multi_core_reuse_mcast_2d_optimized_( tt::tt_metal::Program& program, const Tensor& a, const Tensor& b, @@ -1456,7 +1457,7 @@ operation::ProgramWithCallbacks matmul_multi_core_reuse_mcast_2d_optimized_( fused_op_signaler); } -operation::ProgramWithCallbacks matmul_multi_core_reuse_mcast_2d_optimized( +tt::tt_metal::operation::ProgramWithCallbacks matmul_multi_core_reuse_mcast_2d_optimized( const Tensor& a, const Tensor& b, const std::optional& bias, @@ -1501,7 +1502,7 @@ operation::ProgramWithCallbacks matmul_multi_core_reuse_mcast_2d_optimized( empty_fused_op_signaler); } -operation::ProgramWithCallbacks matmul_multi_core_reuse_mcast_2d_optimized_helper( +tt::tt_metal::operation::ProgramWithCallbacks matmul_multi_core_reuse_mcast_2d_optimized_helper( tt_metal::Program& program, /* Take programa as input by reference */ const Tensor& a, const Tensor& b, diff --git a/ttnn/cpp/ttnn/operations/matmul/device/matmul_op_multi_core_reuse_mcast_dram_sharded_program_factory.cpp b/ttnn/cpp/ttnn/operations/matmul/device/matmul_op_multi_core_reuse_mcast_dram_sharded_program_factory.cpp index 9b634d10a40..dbc9a4f1c78 100644 --- a/ttnn/cpp/ttnn/operations/matmul/device/matmul_op_multi_core_reuse_mcast_dram_sharded_program_factory.cpp +++ b/ttnn/cpp/ttnn/operations/matmul/device/matmul_op_multi_core_reuse_mcast_dram_sharded_program_factory.cpp @@ -44,7 +44,8 @@ void move_common_entries(std::vector& v1, std::vector& v2, } } -void get_optimal_dram_bank_to_reader_assignment(IDevice* device, std::vector& all_worker_cores_ordered, CoreRangeSet& all_worker_cores) { +void get_optimal_dram_bank_to_reader_assignment( + tt::tt_metal::IDevice* device, std::vector& all_worker_cores_ordered, CoreRangeSet& all_worker_cores) { all_worker_cores_ordered = device->get_optimal_dram_bank_to_logical_worker_assignment(); std::set all_cores_set; for (const auto& worker_core : all_worker_cores_ordered) { @@ -53,7 +54,7 @@ void 
get_optimal_dram_bank_to_reader_assignment(IDevice* device, std::vectorarch()); - tt_metal::NOC in1_noc = detail::GetPreferredNOCForDRAMRead(device->arch()); + tt_metal::NOC in0_noc = tt::tt_metal::detail::GetPreferredNOCForDRAMWrite(device->arch()); + tt_metal::NOC in1_noc = tt::tt_metal::detail::GetPreferredNOCForDRAMRead(device->arch()); CoreCoord start_core_noc = top_left_core_physical; CoreCoord end_core_noc = bottom_right_core_physical; - if (in0_noc == NOC::NOC_1) { + if (in0_noc == tt::tt_metal::NOC::NOC_1) { std::swap(start_core_noc, end_core_noc); } @@ -564,8 +565,8 @@ operation::ProgramWithCallbacks create_program_dram_sharded( in3_CB_size); } - std::vector reader_kernel_ids; - std::vector writer_kernel_ids; + std::vector reader_kernel_ids; + std::vector writer_kernel_ids; std::vector in0_mcast_sender_noc_x; std::vector in0_mcast_sender_noc_y; @@ -850,10 +851,10 @@ operation::ProgramWithCallbacks create_program_dram_sharded( auto override_runtime_arguments_callback = [writer_kernel_ids, all_worker_cores_ordered, cb_src2, cb_output_reshard]( const void* operation, - Program& program, - const std::vector& input_tensors, - const std::vector>& optional_input_tensors, - const std::vector& output_tensors) { + tt::tt_metal::Program& program, + const std::vector& input_tensors, + const std::vector>& optional_input_tensors, + const std::vector& output_tensors) { TT_FATAL(input_tensors.size() + optional_input_tensors.size() == 3, "Error"); TT_FATAL(output_tensors.size() == 1, "Error"); @@ -889,7 +890,7 @@ namespace operations { namespace matmul { -operation::ProgramWithCallbacks matmul_multi_core_reuse_dram_sharded_optimized_( +tt::tt_metal::operation::ProgramWithCallbacks matmul_multi_core_reuse_dram_sharded_optimized_( const Tensor& a, const Tensor& b, const std::optional& bias, @@ -1006,7 +1007,7 @@ operation::ProgramWithCallbacks matmul_multi_core_reuse_dram_sharded_optimized_( skip_write_back); } -operation::ProgramWithCallbacks 
matmul_multi_core_reuse_dram_sharded_optimized( +tt::tt_metal::operation::ProgramWithCallbacks matmul_multi_core_reuse_dram_sharded_optimized( const Tensor& a, const Tensor& b, const std::optional& bias, diff --git a/ttnn/cpp/ttnn/operations/matmul/device/matmul_op_multi_core_reuse_optimized_program_factory.cpp b/ttnn/cpp/ttnn/operations/matmul/device/matmul_op_multi_core_reuse_optimized_program_factory.cpp index a25875c5395..b2fb2c8ae3a 100644 --- a/ttnn/cpp/ttnn/operations/matmul/device/matmul_op_multi_core_reuse_optimized_program_factory.cpp +++ b/ttnn/cpp/ttnn/operations/matmul/device/matmul_op_multi_core_reuse_optimized_program_factory.cpp @@ -14,10 +14,10 @@ using namespace tt::constants; using namespace tt; namespace reuse_optimized_helpers { -using namespace tt::constants; -using namespace tt; -using namespace tt_metal; -operation::ProgramWithCallbacks create_program( + +using tt::tt_metal::Tensor; + +tt::tt_metal::operation::ProgramWithCallbacks create_program( tt_metal::IDevice* device, MathFidelity math_fidelity, bool fp32_dest_acc_en, @@ -115,7 +115,7 @@ operation::ProgramWithCallbacks create_program( uint32_t num_tiles_per_block_in1 = K * per_core_N; uint32_t num_tiles_per_block_out = per_core_M_per_batch * per_core_N; uint32_t num_output_blocks_total = (B * M / per_core_M) * (N / per_core_N); - std::optional shard_spec = std::nullopt; + std::optional shard_spec = std::nullopt; if (in0_is_sharded) { shard_spec = in0.shard_spec().value(); } else if (in1_is_sharded) { @@ -178,17 +178,17 @@ operation::ProgramWithCallbacks create_program( mm_kernel_in1_reader_writer_defines["OUT_SHARDED"] = "1"; } - KernelHandle mm_kernel_in0_reader_id = tt_metal::CreateKernel( + tt::tt_metal::KernelHandle mm_kernel_in0_reader_id = tt_metal::CreateKernel( program, "ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0.cpp", all_cores, - ReaderDataMovementConfig(reader_compile_time_args, mm_kernel_in0_reader_defines)); + 
tt::tt_metal::ReaderDataMovementConfig(reader_compile_time_args, mm_kernel_in0_reader_defines)); - KernelHandle mm_kernel_in1_reader_writer_id = tt_metal::CreateKernel( + tt::tt_metal::KernelHandle mm_kernel_in1_reader_writer_id = tt_metal::CreateKernel( program, "ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_writer_bmm_tile_layout_in1.cpp", all_cores, - WriterDataMovementConfig(reader_writer_compile_time_args, mm_kernel_in1_reader_writer_defines)); + tt::tt_metal::WriterDataMovementConfig(reader_writer_compile_time_args, mm_kernel_in1_reader_writer_defines)); std::vector compute_kernel_args_group_1 = { in0_block_w, // in0_block_w @@ -388,7 +388,7 @@ operation::ProgramWithCallbacks create_program( }; bool row_major = false; if (shard_spec.has_value()) { - row_major = shard_spec.value().orientation == ShardOrientation::ROW_MAJOR; + row_major = shard_spec.value().orientation == tt::tt_metal::ShardOrientation::ROW_MAJOR; } const auto cores = grid_to_cores(num_cores, core_range.x, core_range.y, row_major); @@ -417,7 +417,7 @@ operation::ProgramWithCallbacks create_program( auto override_runtime_arguments_callback = [mm_kernel_in0_reader_id, mm_kernel_in1_reader_writer_id, cb_src0, cb_src1, cb_output, num_cores, cores]( const void* operation, - Program& program, + tt::tt_metal::Program& program, const std::vector& input_tensors, const std::vector>& optional_input_tensors, const std::vector& output_tensors) { @@ -476,7 +476,7 @@ namespace operations { namespace matmul { -operation::ProgramWithCallbacks matmul_multi_core_reuse_optimized_( +tt::tt_metal::operation::ProgramWithCallbacks matmul_multi_core_reuse_optimized_( const Tensor& a, const Tensor& b, Tensor& output, @@ -578,7 +578,7 @@ operation::ProgramWithCallbacks matmul_multi_core_reuse_optimized_( // TODO: Get rid of no-op reshapes when we generalize // matmul_multi_core_reuse_optimized_bert_large not used -operation::ProgramWithCallbacks bmm_multi_core_reuse_optimized( 
+tt::tt_metal::operation::ProgramWithCallbacks bmm_multi_core_reuse_optimized( const Tensor& a, const Tensor& b, Tensor& output, diff --git a/ttnn/cpp/ttnn/operations/matmul/device/matmul_op_multi_core_reuse_program_factory.cpp b/ttnn/cpp/ttnn/operations/matmul/device/matmul_op_multi_core_reuse_program_factory.cpp index b9c9299ce36..df2e1e28af7 100644 --- a/ttnn/cpp/ttnn/operations/matmul/device/matmul_op_multi_core_reuse_program_factory.cpp +++ b/ttnn/cpp/ttnn/operations/matmul/device/matmul_op_multi_core_reuse_program_factory.cpp @@ -84,8 +84,8 @@ tt_metal::operation::ProgramWithCallbacks create_program( uint32_t num_blocks_y = M / per_core_M; uint32_t num_blocks_x = N / per_core_N; - CoreRangeSet all_cores( - num_cores_to_corerangeset(num_blocks_x * num_blocks_y, device->compute_with_storage_grid_size(), true)); + CoreRangeSet all_cores(tt::tt_metal::num_cores_to_corerangeset( + num_blocks_x * num_blocks_y, device->compute_with_storage_grid_size(), true)); // Create circular buffers uint32_t src0_cb_index = 0; @@ -242,7 +242,7 @@ namespace operations { namespace matmul { -operation::ProgramWithCallbacks matmul_multi_core_reuse( +tt::tt_metal::operation::ProgramWithCallbacks matmul_multi_core_reuse( const Tensor& a, const Tensor& b, Tensor& output, bool bcast_batch) { const auto &ashape = a.get_padded_shape(), bshape = b.get_padded_shape(); diff --git a/ttnn/cpp/ttnn/operations/matmul/matmul.cpp b/ttnn/cpp/ttnn/operations/matmul/matmul.cpp index e3a7b866bc1..f5d8060ff2b 100644 --- a/ttnn/cpp/ttnn/operations/matmul/matmul.cpp +++ b/ttnn/cpp/ttnn/operations/matmul/matmul.cpp @@ -118,7 +118,7 @@ Tensor MatmulOperation::invoke( const std::optional core_grid, const std::optional& output_tile, std::optional optional_output_tensor, - const std::optional& global_cb) { + const std::optional& global_cb) { std::optional user_core_coord; if (core_grid.has_value()) { user_core_coord = CoreCoord(core_grid->x, core_grid->y); @@ -160,7 +160,7 @@ Tensor LinearOperation::invoke( 
const std::optional core_grid, const std::optional& output_tile, std::optional optional_output_tensor, - const std::optional& global_cb) { + const std::optional& global_cb) { std::optional user_core_coord; if (core_grid.has_value()) { user_core_coord = CoreCoord(core_grid->x, core_grid->y); diff --git a/ttnn/cpp/ttnn/operations/matmul/matmul.hpp b/ttnn/cpp/ttnn/operations/matmul/matmul.hpp index 8baeb93db42..abf819d5e63 100644 --- a/ttnn/cpp/ttnn/operations/matmul/matmul.hpp +++ b/ttnn/cpp/ttnn/operations/matmul/matmul.hpp @@ -50,7 +50,7 @@ struct MatmulOperation { const std::optional core_grid = std::nullopt, const std::optional& output_tile = std::nullopt, std::optional optional_output_tensor = std::nullopt, - const std::optional& global_cb = std::nullopt); + const std::optional& global_cb = std::nullopt); }; struct LinearOperation { @@ -68,7 +68,7 @@ struct LinearOperation { const std::optional core_grid = std::nullopt, const std::optional& output_tile = std::nullopt, std::optional optional_output_tensor = std::nullopt, - const std::optional& global_cb = std::nullopt); + const std::optional& global_cb = std::nullopt); }; } // namespace matmul diff --git a/ttnn/cpp/ttnn/operations/matmul/matmul_pybind.cpp b/ttnn/cpp/ttnn/operations/matmul/matmul_pybind.cpp index 143bd7b2367..5dff12923c7 100644 --- a/ttnn/cpp/ttnn/operations/matmul/matmul_pybind.cpp +++ b/ttnn/cpp/ttnn/operations/matmul/matmul_pybind.cpp @@ -344,9 +344,9 @@ void py_module(py::module& module) { const std::optional& activation, const std::optional compute_kernel_config, const std::optional core_grid, - const std::optional& output_tile, + const std::optional& output_tile, std::optional& optional_output_tensor, - const std::optional& global_cb) -> ttnn::Tensor { + const std::optional& global_cb) -> ttnn::Tensor { return self( input_tensor_a, input_tensor_b, @@ -428,9 +428,9 @@ void py_module(py::module& module) { const std::optional& activation, const std::optional compute_kernel_config, const 
std::optional core_grid, - const std::optional& output_tile, + const std::optional& output_tile, std::optional& optional_output_tensor, - const std::optional& global_cb) -> ttnn::Tensor { + const std::optional& global_cb) -> ttnn::Tensor { return self( input_tensor_a, input_tensor_b, diff --git a/ttnn/cpp/ttnn/operations/moreh/moreh_abs_pow/device/moreh_abs_pow_device_operation.hpp b/ttnn/cpp/ttnn/operations/moreh/moreh_abs_pow/device/moreh_abs_pow_device_operation.hpp index d7117e0d6b5..2300001b1dc 100644 --- a/ttnn/cpp/ttnn/operations/moreh/moreh_abs_pow/device/moreh_abs_pow_device_operation.hpp +++ b/ttnn/cpp/ttnn/operations/moreh/moreh_abs_pow/device/moreh_abs_pow_device_operation.hpp @@ -13,8 +13,8 @@ #define MOREH_ABS_POW_FACTORY_H(name) \ struct name { \ struct shared_variables_t { \ - KernelHandle reader_kernels_id; \ - KernelHandle writer_kernels_id; \ + tt::tt_metal::KernelHandle reader_kernels_id; \ + tt::tt_metal::KernelHandle writer_kernels_id; \ std::size_t num_cores_to_be_used; \ std::size_t num_cores_y; \ }; \ diff --git a/ttnn/cpp/ttnn/operations/moreh/moreh_adam/device/moreh_adam_device_operation.hpp b/ttnn/cpp/ttnn/operations/moreh/moreh_adam/device/moreh_adam_device_operation.hpp index 17eab21fb79..10360bbfbd8 100644 --- a/ttnn/cpp/ttnn/operations/moreh/moreh_adam/device/moreh_adam_device_operation.hpp +++ b/ttnn/cpp/ttnn/operations/moreh/moreh_adam/device/moreh_adam_device_operation.hpp @@ -42,10 +42,10 @@ struct MorehAdamOperation { struct ProgramFactory { struct shared_variables_t { - KernelHandle unary_reader_kernel_id; - KernelHandle unary_writer_kernel_id; - KernelHandle compute_kernel_group1_id; - KernelHandle compute_kernel_group2_id; + tt::tt_metal::KernelHandle unary_reader_kernel_id; + tt::tt_metal::KernelHandle unary_writer_kernel_id; + tt::tt_metal::KernelHandle compute_kernel_group1_id; + tt::tt_metal::KernelHandle compute_kernel_group2_id; CoreRangeSet core_group_1; CoreRangeSet core_group_2; std::size_t num_cores; diff --git 
a/ttnn/cpp/ttnn/operations/moreh/moreh_adamw/device/moreh_adamw_device_operation.hpp b/ttnn/cpp/ttnn/operations/moreh/moreh_adamw/device/moreh_adamw_device_operation.hpp index 5ef19ca4bb6..d63c8adc04f 100644 --- a/ttnn/cpp/ttnn/operations/moreh/moreh_adamw/device/moreh_adamw_device_operation.hpp +++ b/ttnn/cpp/ttnn/operations/moreh/moreh_adamw/device/moreh_adamw_device_operation.hpp @@ -47,10 +47,10 @@ struct MorehAdamWDeviceOperation { struct MultiCore { struct shared_variables_t { - KernelHandle unary_reader_kernel_id; - KernelHandle unary_writer_kernel_id; - KernelHandle compute_kernel_group1_id; - KernelHandle compute_kernel_group2_id; + tt::tt_metal::KernelHandle unary_reader_kernel_id; + tt::tt_metal::KernelHandle unary_writer_kernel_id; + tt::tt_metal::KernelHandle compute_kernel_group1_id; + tt::tt_metal::KernelHandle compute_kernel_group2_id; CoreRangeSet core_group_1; CoreRangeSet core_group_2; std::size_t num_cores; diff --git a/ttnn/cpp/ttnn/operations/moreh/moreh_arange/device/moreh_arange_device_operation.hpp b/ttnn/cpp/ttnn/operations/moreh/moreh_arange/device/moreh_arange_device_operation.hpp index 5e4d31aa3dc..18fd27db579 100644 --- a/ttnn/cpp/ttnn/operations/moreh/moreh_arange/device/moreh_arange_device_operation.hpp +++ b/ttnn/cpp/ttnn/operations/moreh/moreh_arange/device/moreh_arange_device_operation.hpp @@ -28,7 +28,7 @@ struct MorehArangeOperation { struct ProgramFactory { struct shared_variables_t { - KernelHandle kernel_id; + tt::tt_metal::KernelHandle kernel_id; std::size_t num_cores; std::size_t core_h; }; diff --git a/ttnn/cpp/ttnn/operations/moreh/moreh_clip_grad_norm/moreh_clip_grad_norm.cpp b/ttnn/cpp/ttnn/operations/moreh/moreh_clip_grad_norm/moreh_clip_grad_norm.cpp index 571c68a0f8b..3f33fa52b42 100644 --- a/ttnn/cpp/ttnn/operations/moreh/moreh_clip_grad_norm/moreh_clip_grad_norm.cpp +++ b/ttnn/cpp/ttnn/operations/moreh/moreh_clip_grad_norm/moreh_clip_grad_norm.cpp @@ -80,8 +80,9 @@ Tensor MorehClipGradNorm::invoke( 
init_device_compute_kernel_config(inputs.at(0).device()->arch(), compute_kernel_config, MathFidelity::HiFi4)); if (error_if_nonfinite) { - const auto fp32_total_norm = - tensor_impl::cast_vec(owned_buffer::get_as(output_total_norm.cpu())).at(0); + const auto fp32_total_norm = tt::tt_metal::tensor_impl::cast_vec( + tt::tt_metal::owned_buffer::get_as(output_total_norm.cpu())) + .at(0); TT_FATAL( std::isfinite(fp32_total_norm), "The total norm of order {} for gradients from `parameters` is non-finite, so it cannot be " diff --git a/ttnn/cpp/ttnn/operations/moreh/moreh_clip_grad_norm/moreh_clip_grad_norm_step1/device/moreh_clip_grad_norm_step1_device_operation.hpp b/ttnn/cpp/ttnn/operations/moreh/moreh_clip_grad_norm/moreh_clip_grad_norm_step1/device/moreh_clip_grad_norm_step1_device_operation.hpp index 80afbe6dd6a..5e06c99999e 100644 --- a/ttnn/cpp/ttnn/operations/moreh/moreh_clip_grad_norm/moreh_clip_grad_norm_step1/device/moreh_clip_grad_norm_step1_device_operation.hpp +++ b/ttnn/cpp/ttnn/operations/moreh/moreh_clip_grad_norm/moreh_clip_grad_norm_step1/device/moreh_clip_grad_norm_step1_device_operation.hpp @@ -31,9 +31,9 @@ struct MorehClipGradNormStep1Operation { struct ProgramFactory { struct shared_variables_t { - KernelHandle reader_kernel_id; - KernelHandle writer_kernel_id; - KernelHandle compute_kernel_id; + tt::tt_metal::KernelHandle reader_kernel_id; + tt::tt_metal::KernelHandle writer_kernel_id; + tt::tt_metal::KernelHandle compute_kernel_id; uint32_t num_cores_to_be_used; size_t num_cores_y; }; diff --git a/ttnn/cpp/ttnn/operations/moreh/moreh_clip_grad_norm/moreh_clip_grad_norm_step2/device/moreh_clip_grad_norm_step2_device_operation.hpp b/ttnn/cpp/ttnn/operations/moreh/moreh_clip_grad_norm/moreh_clip_grad_norm_step2/device/moreh_clip_grad_norm_step2_device_operation.hpp index b6191de303d..c0d0434858d 100644 --- a/ttnn/cpp/ttnn/operations/moreh/moreh_clip_grad_norm/moreh_clip_grad_norm_step2/device/moreh_clip_grad_norm_step2_device_operation.hpp +++ 
b/ttnn/cpp/ttnn/operations/moreh/moreh_clip_grad_norm/moreh_clip_grad_norm_step2/device/moreh_clip_grad_norm_step2_device_operation.hpp @@ -32,9 +32,9 @@ struct MorehClipGradNormStep2Operation { struct ProgramFactory { struct shared_variables_t { - KernelHandle reader_kernel_id; - KernelHandle writer_kernel_id; - KernelHandle compute_kernel_id; + tt::tt_metal::KernelHandle reader_kernel_id; + tt::tt_metal::KernelHandle writer_kernel_id; + tt::tt_metal::KernelHandle compute_kernel_id; CoreCoord single_core; }; diff --git a/ttnn/cpp/ttnn/operations/moreh/moreh_clip_grad_norm/moreh_clip_grad_norm_step3/device/moreh_clip_grad_norm_step3_device_operation.hpp b/ttnn/cpp/ttnn/operations/moreh/moreh_clip_grad_norm/moreh_clip_grad_norm_step3/device/moreh_clip_grad_norm_step3_device_operation.hpp index 65dc5df14f8..918966266cd 100644 --- a/ttnn/cpp/ttnn/operations/moreh/moreh_clip_grad_norm/moreh_clip_grad_norm_step3/device/moreh_clip_grad_norm_step3_device_operation.hpp +++ b/ttnn/cpp/ttnn/operations/moreh/moreh_clip_grad_norm/moreh_clip_grad_norm_step3/device/moreh_clip_grad_norm_step3_device_operation.hpp @@ -31,8 +31,8 @@ struct MorehClipGradNormStep3Operation { struct ProgramFactory { struct shared_variables_t { - KernelHandle reader_kernel_id; - KernelHandle writer_kernel_id; + tt::tt_metal::KernelHandle reader_kernel_id; + tt::tt_metal::KernelHandle writer_kernel_id; uint32_t num_cores_to_be_used; size_t num_cores_y; }; diff --git a/ttnn/cpp/ttnn/operations/moreh/moreh_cumsum/device/moreh_cumsum_device_operation.cpp b/ttnn/cpp/ttnn/operations/moreh/moreh_cumsum/device/moreh_cumsum_device_operation.cpp index b4eaf903cdb..5f96306d305 100644 --- a/ttnn/cpp/ttnn/operations/moreh/moreh_cumsum/device/moreh_cumsum_device_operation.cpp +++ b/ttnn/cpp/ttnn/operations/moreh/moreh_cumsum/device/moreh_cumsum_device_operation.cpp @@ -53,7 +53,8 @@ MorehCumsumDeviceOperation::spec_return_value_t MorehCumsumDeviceOperation::comp const auto& input = tensor_args.input; return 
TensorSpec( - input.get_logical_shape(), TensorLayout(input.dtype(), PageConfig(input.layout()), MemoryConfig{})); + input.get_logical_shape(), + tt::tt_metal::TensorLayout(input.dtype(), tt::tt_metal::PageConfig(input.layout()), MemoryConfig{})); } MorehCumsumDeviceOperation::tensor_return_value_t MorehCumsumDeviceOperation::create_output_tensors( diff --git a/ttnn/cpp/ttnn/operations/moreh/moreh_cumsum/device/moreh_cumsum_device_operation.hpp b/ttnn/cpp/ttnn/operations/moreh/moreh_cumsum/device/moreh_cumsum_device_operation.hpp index 29ce107fa76..8f0e3f900fc 100644 --- a/ttnn/cpp/ttnn/operations/moreh/moreh_cumsum/device/moreh_cumsum_device_operation.hpp +++ b/ttnn/cpp/ttnn/operations/moreh/moreh_cumsum/device/moreh_cumsum_device_operation.hpp @@ -28,8 +28,8 @@ struct MorehCumsumDeviceOperation { struct ProgramFactory { struct shared_variables_t { - KernelHandle unary_reader_kernel_id; - KernelHandle unary_writer_kernel_id; + tt::tt_metal::KernelHandle unary_reader_kernel_id; + tt::tt_metal::KernelHandle unary_writer_kernel_id; std::size_t num_cores; std::size_t num_cores_y; }; diff --git a/ttnn/cpp/ttnn/operations/moreh/moreh_dot/device/moreh_dot_device_operation.hpp b/ttnn/cpp/ttnn/operations/moreh/moreh_dot/device/moreh_dot_device_operation.hpp index 72568bf1f8a..88aeb2452b3 100644 --- a/ttnn/cpp/ttnn/operations/moreh/moreh_dot/device/moreh_dot_device_operation.hpp +++ b/ttnn/cpp/ttnn/operations/moreh/moreh_dot/device/moreh_dot_device_operation.hpp @@ -30,8 +30,8 @@ struct MorehDotOperation { struct SingleCore { struct shared_variables_t { - KernelHandle unary_reader_kernel_id; - KernelHandle unary_writer_kernel_id; + tt::tt_metal::KernelHandle unary_reader_kernel_id; + tt::tt_metal::KernelHandle unary_writer_kernel_id; }; using cached_program_t = ttnn::device_operation::CachedProgram; diff --git a/ttnn/cpp/ttnn/operations/moreh/moreh_dot_backward/device/moreh_dot_backward_device_operation.hpp 
b/ttnn/cpp/ttnn/operations/moreh/moreh_dot_backward/device/moreh_dot_backward_device_operation.hpp index 2b19874b1d5..890e2eba533 100644 --- a/ttnn/cpp/ttnn/operations/moreh/moreh_dot_backward/device/moreh_dot_backward_device_operation.hpp +++ b/ttnn/cpp/ttnn/operations/moreh/moreh_dot_backward/device/moreh_dot_backward_device_operation.hpp @@ -33,8 +33,8 @@ struct MorehDotBackwardOperation { struct SingleCore { struct shared_variables_t { - KernelHandle unary_reader_kernel_id; - KernelHandle unary_writer_kernel_id; + tt::tt_metal::KernelHandle unary_reader_kernel_id; + tt::tt_metal::KernelHandle unary_writer_kernel_id; }; using cached_program_t = ttnn::device_operation::CachedProgram; diff --git a/ttnn/cpp/ttnn/operations/moreh/moreh_fold/device/fold_device_operation.cpp b/ttnn/cpp/ttnn/operations/moreh/moreh_fold/device/fold_device_operation.cpp index 00da177e3f6..29d2bac1657 100644 --- a/ttnn/cpp/ttnn/operations/moreh/moreh_fold/device/fold_device_operation.cpp +++ b/ttnn/cpp/ttnn/operations/moreh/moreh_fold/device/fold_device_operation.cpp @@ -86,9 +86,9 @@ MorehFoldOperation::spec_return_value_t MorehFoldOperation::compute_output_specs }(); return TensorSpec( output_shape, - TensorLayout( + tt::tt_metal::TensorLayout( tensor_args.input.get_dtype(), - PageConfig(tensor_args.input.get_layout()), + tt::tt_metal::PageConfig(tensor_args.input.get_layout()), operation_attributes.memory_config)); }; diff --git a/ttnn/cpp/ttnn/operations/moreh/moreh_fold/device/fold_device_operation.hpp b/ttnn/cpp/ttnn/operations/moreh/moreh_fold/device/fold_device_operation.hpp index 2087327dd76..84f02fdcf2b 100644 --- a/ttnn/cpp/ttnn/operations/moreh/moreh_fold/device/fold_device_operation.hpp +++ b/ttnn/cpp/ttnn/operations/moreh/moreh_fold/device/fold_device_operation.hpp @@ -30,8 +30,8 @@ struct MorehFoldOperation { struct ProgramFactory { struct shared_variables_t { - KernelHandle unary_reader_kernel_id; - KernelHandle unary_writer_kernel_id; + tt::tt_metal::KernelHandle 
unary_reader_kernel_id; + tt::tt_metal::KernelHandle unary_writer_kernel_id; std::vector cores; }; diff --git a/ttnn/cpp/ttnn/operations/moreh/moreh_getitem/device/moreh_getitem_device_operation.hpp b/ttnn/cpp/ttnn/operations/moreh/moreh_getitem/device/moreh_getitem_device_operation.hpp index 49beda5c4da..532f896d353 100644 --- a/ttnn/cpp/ttnn/operations/moreh/moreh_getitem/device/moreh_getitem_device_operation.hpp +++ b/ttnn/cpp/ttnn/operations/moreh/moreh_getitem/device/moreh_getitem_device_operation.hpp @@ -32,8 +32,8 @@ struct MorehGetItemOperation { struct MorehGetItemRmFactory { struct shared_variables_t { - KernelHandle unary_reader_kernel_id; - KernelHandle unary_writer_kernel_id; + tt::tt_metal::KernelHandle unary_reader_kernel_id; + tt::tt_metal::KernelHandle unary_writer_kernel_id; std::size_t num_cores; uint32_t core_h; ttnn::SmallVector index_dims; @@ -56,8 +56,8 @@ struct MorehGetItemOperation { struct MorehGetItemTilizedFactory { struct shared_variables_t { - KernelHandle unary_reader_kernel_id; - KernelHandle unary_writer_kernel_id; + tt::tt_metal::KernelHandle unary_reader_kernel_id; + tt::tt_metal::KernelHandle unary_writer_kernel_id; std::size_t num_cores; uint32_t core_h; ttnn::SmallVector index_dims; diff --git a/ttnn/cpp/ttnn/operations/moreh/moreh_group_norm/device/moreh_group_norm_device_operation.hpp b/ttnn/cpp/ttnn/operations/moreh/moreh_group_norm/device/moreh_group_norm_device_operation.hpp index 554acf5bb73..befb6e14833 100644 --- a/ttnn/cpp/ttnn/operations/moreh/moreh_group_norm/device/moreh_group_norm_device_operation.hpp +++ b/ttnn/cpp/ttnn/operations/moreh/moreh_group_norm/device/moreh_group_norm_device_operation.hpp @@ -34,8 +34,8 @@ struct MorehGroupNormOperation { struct MorehGroupNormFactory { struct shared_variables_t { - KernelHandle reader_kernels_id; - KernelHandle writer_kernels_id; + tt::tt_metal::KernelHandle reader_kernels_id; + tt::tt_metal::KernelHandle writer_kernels_id; uint32_t num_cores_to_be_used; std::size_t 
num_cores_y; }; diff --git a/ttnn/cpp/ttnn/operations/moreh/moreh_group_norm_backward/device/gamma_beta_grad/moreh_group_norm_backward_gamma_beta_grad_device_operation.hpp b/ttnn/cpp/ttnn/operations/moreh/moreh_group_norm_backward/device/gamma_beta_grad/moreh_group_norm_backward_gamma_beta_grad_device_operation.hpp index 58588762567..7c1d9b05417 100644 --- a/ttnn/cpp/ttnn/operations/moreh/moreh_group_norm_backward/device/gamma_beta_grad/moreh_group_norm_backward_gamma_beta_grad_device_operation.hpp +++ b/ttnn/cpp/ttnn/operations/moreh/moreh_group_norm_backward/device/gamma_beta_grad/moreh_group_norm_backward_gamma_beta_grad_device_operation.hpp @@ -37,8 +37,8 @@ struct MorehGroupNormBackwardGammaBetaGradOperation { struct MorehGroupNormBackwardGammaBetaGradFactory { struct shared_variables_t { - KernelHandle reader_kernels_id; - KernelHandle writer_kernels_id; + tt::tt_metal::KernelHandle reader_kernels_id; + tt::tt_metal::KernelHandle writer_kernels_id; uint32_t num_cores_to_be_used; std::size_t num_cores_y; }; diff --git a/ttnn/cpp/ttnn/operations/moreh/moreh_group_norm_backward/device/input_grad/moreh_group_norm_backward_input_grad_device_operation.hpp b/ttnn/cpp/ttnn/operations/moreh/moreh_group_norm_backward/device/input_grad/moreh_group_norm_backward_input_grad_device_operation.hpp index 6d1269e2479..baf4a4b363f 100644 --- a/ttnn/cpp/ttnn/operations/moreh/moreh_group_norm_backward/device/input_grad/moreh_group_norm_backward_input_grad_device_operation.hpp +++ b/ttnn/cpp/ttnn/operations/moreh/moreh_group_norm_backward/device/input_grad/moreh_group_norm_backward_input_grad_device_operation.hpp @@ -28,8 +28,8 @@ struct MorehGroupNormBackwardInputGradOperation { struct MorehGroupNormBackwardInputGradFactory { struct shared_variables_t { - KernelHandle reader_kernels_id; - KernelHandle writer_kernels_id; + tt::tt_metal::KernelHandle reader_kernels_id; + tt::tt_metal::KernelHandle writer_kernels_id; uint32_t num_cores_to_be_used; std::size_t num_cores_y; }; diff 
--git a/ttnn/cpp/ttnn/operations/moreh/moreh_layer_norm/device/moreh_layer_norm_device_operation.hpp b/ttnn/cpp/ttnn/operations/moreh/moreh_layer_norm/device/moreh_layer_norm_device_operation.hpp index d8179129bc5..c844c178e69 100644 --- a/ttnn/cpp/ttnn/operations/moreh/moreh_layer_norm/device/moreh_layer_norm_device_operation.hpp +++ b/ttnn/cpp/ttnn/operations/moreh/moreh_layer_norm/device/moreh_layer_norm_device_operation.hpp @@ -34,8 +34,8 @@ struct MorehLayerNormOperation { struct ProgramFactory { struct shared_variables_t { - KernelHandle unary_reader_kernel_id; - KernelHandle unary_writer_kernel_id; + tt::tt_metal::KernelHandle unary_reader_kernel_id; + tt::tt_metal::KernelHandle unary_writer_kernel_id; std::size_t num_cores; std::size_t num_cores_y; }; diff --git a/ttnn/cpp/ttnn/operations/moreh/moreh_layer_norm_backward/device/moreh_layer_norm_backward_gamma_beta_grad_device_operation.hpp b/ttnn/cpp/ttnn/operations/moreh/moreh_layer_norm_backward/device/moreh_layer_norm_backward_gamma_beta_grad_device_operation.hpp index 1cd516391c9..4383b77d446 100644 --- a/ttnn/cpp/ttnn/operations/moreh/moreh_layer_norm_backward/device/moreh_layer_norm_backward_gamma_beta_grad_device_operation.hpp +++ b/ttnn/cpp/ttnn/operations/moreh/moreh_layer_norm_backward/device/moreh_layer_norm_backward_gamma_beta_grad_device_operation.hpp @@ -32,8 +32,8 @@ struct MorehLayerNormBackwardGammaBetaGradOperation { struct ProgramFactory { struct shared_variables_t { - KernelHandle unary_reader_kernel_id; - KernelHandle unary_writer_kernel_id; + tt::tt_metal::KernelHandle unary_reader_kernel_id; + tt::tt_metal::KernelHandle unary_writer_kernel_id; std::size_t num_cores; std::size_t num_cores_y; }; diff --git a/ttnn/cpp/ttnn/operations/moreh/moreh_layer_norm_backward/device/moreh_layer_norm_backward_input_grad_device_operation.hpp b/ttnn/cpp/ttnn/operations/moreh/moreh_layer_norm_backward/device/moreh_layer_norm_backward_input_grad_device_operation.hpp index c5cd3c4b06e..7a097a67d05 100644 
--- a/ttnn/cpp/ttnn/operations/moreh/moreh_layer_norm_backward/device/moreh_layer_norm_backward_input_grad_device_operation.hpp +++ b/ttnn/cpp/ttnn/operations/moreh/moreh_layer_norm_backward/device/moreh_layer_norm_backward_input_grad_device_operation.hpp @@ -31,8 +31,8 @@ struct MorehLayerNormBackwardInputGradOperation { struct ProgramFactory { struct shared_variables_t { - KernelHandle unary_reader_kernel_id; - KernelHandle unary_writer_kernel_id; + tt::tt_metal::KernelHandle unary_reader_kernel_id; + tt::tt_metal::KernelHandle unary_writer_kernel_id; std::size_t num_cores; std::size_t num_cores_y; }; diff --git a/ttnn/cpp/ttnn/operations/moreh/moreh_linear_backward/device/moreh_linear_backward_device_operation.hpp b/ttnn/cpp/ttnn/operations/moreh/moreh_linear_backward/device/moreh_linear_backward_device_operation.hpp index 64c21089f64..e688605af63 100644 --- a/ttnn/cpp/ttnn/operations/moreh/moreh_linear_backward/device/moreh_linear_backward_device_operation.hpp +++ b/ttnn/cpp/ttnn/operations/moreh/moreh_linear_backward/device/moreh_linear_backward_device_operation.hpp @@ -31,8 +31,8 @@ struct MorehBiasAddBackwardOperation { struct SingleCoreProgramFactory { struct shared_variables_t { - KernelHandle unary_reader_kernel_id; - KernelHandle unary_writer_kernel_id; + tt::tt_metal::KernelHandle unary_reader_kernel_id; + tt::tt_metal::KernelHandle unary_writer_kernel_id; }; using cached_program_t = ttnn::device_operation::CachedProgram; @@ -51,8 +51,8 @@ struct MorehBiasAddBackwardOperation { struct MultiCoreProgramFactory { struct shared_variables_t { - KernelHandle unary_reader_kernel_id; - KernelHandle unary_writer_kernel_id; + tt::tt_metal::KernelHandle unary_reader_kernel_id; + tt::tt_metal::KernelHandle unary_writer_kernel_id; std::size_t num_cores_to_be_used; std::size_t num_cores_y; }; diff --git a/ttnn/cpp/ttnn/operations/moreh/moreh_matmul/device/moreh_matmul_device_operation.hpp 
b/ttnn/cpp/ttnn/operations/moreh/moreh_matmul/device/moreh_matmul_device_operation.hpp index e61e5cf51c8..f58c6b3c16d 100644 --- a/ttnn/cpp/ttnn/operations/moreh/moreh_matmul/device/moreh_matmul_device_operation.hpp +++ b/ttnn/cpp/ttnn/operations/moreh/moreh_matmul/device/moreh_matmul_device_operation.hpp @@ -35,8 +35,8 @@ struct MorehMatmulOperation { struct MultiCoreProgramFactory { struct shared_variable_t { - KernelHandle reader_kernel_id; - KernelHandle writer_kernel_id; + tt::tt_metal::KernelHandle reader_kernel_id; + tt::tt_metal::KernelHandle writer_kernel_id; std::size_t num_cores; std::size_t num_cores_y; }; diff --git a/ttnn/cpp/ttnn/operations/moreh/moreh_mean/device/moreh_mean_device_operation.hpp b/ttnn/cpp/ttnn/operations/moreh/moreh_mean/device/moreh_mean_device_operation.hpp index 0c45c50f66d..1f2cb4bca96 100644 --- a/ttnn/cpp/ttnn/operations/moreh/moreh_mean/device/moreh_mean_device_operation.hpp +++ b/ttnn/cpp/ttnn/operations/moreh/moreh_mean/device/moreh_mean_device_operation.hpp @@ -33,8 +33,8 @@ struct MorehMeanOperation { struct MorehMeanHFactory { struct shared_variables_t { - KernelHandle unary_reader_kernel_id; - KernelHandle unary_writer_kernel_id; + tt::tt_metal::KernelHandle unary_reader_kernel_id; + tt::tt_metal::KernelHandle unary_writer_kernel_id; std::uint32_t num_cores; std::uint32_t core_h; }; @@ -55,8 +55,8 @@ struct MorehMeanOperation { struct MorehMeanNCFactory { struct shared_variables_t { - KernelHandle unary_reader_kernel_id; - KernelHandle unary_writer_kernel_id; + tt::tt_metal::KernelHandle unary_reader_kernel_id; + tt::tt_metal::KernelHandle unary_writer_kernel_id; std::uint32_t num_cores; std::uint32_t core_h; }; @@ -77,8 +77,8 @@ struct MorehMeanOperation { struct MorehMeanWFactory { struct shared_variables_t { - KernelHandle unary_reader_kernel_id; - KernelHandle unary_writer_kernel_id; + tt::tt_metal::KernelHandle unary_reader_kernel_id; + tt::tt_metal::KernelHandle unary_writer_kernel_id; std::uint32_t num_cores; 
std::uint32_t core_h; }; diff --git a/ttnn/cpp/ttnn/operations/moreh/moreh_mean_backward/device/moreh_mean_backward_device_operation.hpp b/ttnn/cpp/ttnn/operations/moreh/moreh_mean_backward/device/moreh_mean_backward_device_operation.hpp index a0df31f6d4e..28be26e404a 100644 --- a/ttnn/cpp/ttnn/operations/moreh/moreh_mean_backward/device/moreh_mean_backward_device_operation.hpp +++ b/ttnn/cpp/ttnn/operations/moreh/moreh_mean_backward/device/moreh_mean_backward_device_operation.hpp @@ -33,8 +33,8 @@ struct MorehMeanBackwardOperation { struct MorehMeanBackwardFactory { struct shared_variables_t { - KernelHandle unary_reader_kernel_id; - KernelHandle unary_writer_kernel_id; + tt::tt_metal::KernelHandle unary_reader_kernel_id; + tt::tt_metal::KernelHandle unary_writer_kernel_id; std::uint32_t num_cores_to_be_used; std::uint32_t num_cores_y; }; diff --git a/ttnn/cpp/ttnn/operations/moreh/moreh_nll_loss/moreh_nll_loss_step1/device/moreh_nll_loss_step1_device_operation.cpp b/ttnn/cpp/ttnn/operations/moreh/moreh_nll_loss/moreh_nll_loss_step1/device/moreh_nll_loss_step1_device_operation.cpp index 1db8255ec9e..e5867c03246 100644 --- a/ttnn/cpp/ttnn/operations/moreh/moreh_nll_loss/moreh_nll_loss_step1/device/moreh_nll_loss_step1_device_operation.cpp +++ b/ttnn/cpp/ttnn/operations/moreh/moreh_nll_loss/moreh_nll_loss_step1/device/moreh_nll_loss_step1_device_operation.cpp @@ -45,7 +45,8 @@ MorehNllLossStep1DeviceOperation::spec_return_value_t MorehNllLossStep1DeviceOpe const auto& target_tensor = tensor_args.target_tensor; return TensorSpec( target_tensor.get_logical_shape(), - TensorLayout(operation_attributes.dtype, PageConfig(Layout::TILE), operation_attributes.memory_config)); + tt::tt_metal::TensorLayout( + operation_attributes.dtype, tt::tt_metal::PageConfig(Layout::TILE), operation_attributes.memory_config)); } MorehNllLossStep1DeviceOperation::tensor_return_value_t MorehNllLossStep1DeviceOperation::create_output_tensors( diff --git 
a/ttnn/cpp/ttnn/operations/moreh/moreh_nll_loss/moreh_nll_loss_step1/device/moreh_nll_loss_step1_device_operation.hpp b/ttnn/cpp/ttnn/operations/moreh/moreh_nll_loss/moreh_nll_loss_step1/device/moreh_nll_loss_step1_device_operation.hpp index 62c52826a49..c656c236ebd 100644 --- a/ttnn/cpp/ttnn/operations/moreh/moreh_nll_loss/moreh_nll_loss_step1/device/moreh_nll_loss_step1_device_operation.hpp +++ b/ttnn/cpp/ttnn/operations/moreh/moreh_nll_loss/moreh_nll_loss_step1/device/moreh_nll_loss_step1_device_operation.hpp @@ -32,8 +32,8 @@ struct MorehNllLossStep1DeviceOperation { struct Factory { struct shared_variables_t { - KernelHandle unary_reader_kernel_id; - KernelHandle unary_writer_kernel_id; + tt::tt_metal::KernelHandle unary_reader_kernel_id; + tt::tt_metal::KernelHandle unary_writer_kernel_id; std::size_t num_cores; std::size_t num_cores_y; }; diff --git a/ttnn/cpp/ttnn/operations/moreh/moreh_nll_loss/moreh_nll_loss_step2/device/moreh_nll_loss_step2_device_operation.hpp b/ttnn/cpp/ttnn/operations/moreh/moreh_nll_loss/moreh_nll_loss_step2/device/moreh_nll_loss_step2_device_operation.hpp index ec811680ee8..36d7366b325 100644 --- a/ttnn/cpp/ttnn/operations/moreh/moreh_nll_loss/moreh_nll_loss_step2/device/moreh_nll_loss_step2_device_operation.hpp +++ b/ttnn/cpp/ttnn/operations/moreh/moreh_nll_loss/moreh_nll_loss_step2/device/moreh_nll_loss_step2_device_operation.hpp @@ -33,8 +33,8 @@ struct MorehNllLossStep2DeviceOperation { struct Factory { struct shared_variables_t { - KernelHandle unary_reader_kernel_id; - KernelHandle unary_writer_kernel_id; + tt::tt_metal::KernelHandle unary_reader_kernel_id; + tt::tt_metal::KernelHandle unary_writer_kernel_id; std::size_t num_cores; std::size_t num_cores_y; }; diff --git a/ttnn/cpp/ttnn/operations/moreh/moreh_nll_loss_backward/device/moreh_nll_loss_backward_device_operation.hpp b/ttnn/cpp/ttnn/operations/moreh/moreh_nll_loss_backward/device/moreh_nll_loss_backward_device_operation.hpp index c2a369a8d0b..5b1ba818487 100644 --- 
a/ttnn/cpp/ttnn/operations/moreh/moreh_nll_loss_backward/device/moreh_nll_loss_backward_device_operation.hpp +++ b/ttnn/cpp/ttnn/operations/moreh/moreh_nll_loss_backward/device/moreh_nll_loss_backward_device_operation.hpp @@ -37,8 +37,8 @@ struct MorehNllLossBackwardDeviceOperation { struct Factory { struct shared_variables_t { - KernelHandle unary_reader_kernel_id; - KernelHandle unary_writer_kernel_id; + tt::tt_metal::KernelHandle unary_reader_kernel_id; + tt::tt_metal::KernelHandle unary_writer_kernel_id; std::size_t num_cores; std::size_t num_cores_y; }; diff --git a/ttnn/cpp/ttnn/operations/moreh/moreh_nll_loss_unreduced_backward/device/moreh_nll_loss_unreduced_backward_device_operation.hpp b/ttnn/cpp/ttnn/operations/moreh/moreh_nll_loss_unreduced_backward/device/moreh_nll_loss_unreduced_backward_device_operation.hpp index b6a1cd9cfcc..41baf74f548 100644 --- a/ttnn/cpp/ttnn/operations/moreh/moreh_nll_loss_unreduced_backward/device/moreh_nll_loss_unreduced_backward_device_operation.hpp +++ b/ttnn/cpp/ttnn/operations/moreh/moreh_nll_loss_unreduced_backward/device/moreh_nll_loss_unreduced_backward_device_operation.hpp @@ -34,8 +34,8 @@ struct MorehNllLossUnreducedBackwardDeviceOperation { struct Factory { struct shared_variables_t { - KernelHandle unary_reader_kernel_id; - KernelHandle unary_writer_kernel_id; + tt::tt_metal::KernelHandle unary_reader_kernel_id; + tt::tt_metal::KernelHandle unary_writer_kernel_id; std::size_t num_cores; std::size_t num_cores_y; }; diff --git a/ttnn/cpp/ttnn/operations/moreh/moreh_norm/device/moreh_norm_device_operation.hpp b/ttnn/cpp/ttnn/operations/moreh/moreh_norm/device/moreh_norm_device_operation.hpp index d465478d001..485585a332b 100644 --- a/ttnn/cpp/ttnn/operations/moreh/moreh_norm/device/moreh_norm_device_operation.hpp +++ b/ttnn/cpp/ttnn/operations/moreh/moreh_norm/device/moreh_norm_device_operation.hpp @@ -10,8 +10,8 @@ #define DEFINE_PROGRAM_FACTORY(FactoryName) \ struct FactoryName { \ struct shared_variables_t { \ - 
KernelHandle reader_kernels_id; \ - KernelHandle writer_kernels_id; \ + tt::tt_metal::KernelHandle reader_kernels_id; \ + tt::tt_metal::KernelHandle writer_kernels_id; \ std::size_t num_cores_to_be_used; \ std::size_t num_cores_y; \ }; \ diff --git a/ttnn/cpp/ttnn/operations/moreh/moreh_norm_backward/device/moreh_norm_backward_device_operation.hpp b/ttnn/cpp/ttnn/operations/moreh/moreh_norm_backward/device/moreh_norm_backward_device_operation.hpp index cdccde4aaf3..f24d726ff8b 100644 --- a/ttnn/cpp/ttnn/operations/moreh/moreh_norm_backward/device/moreh_norm_backward_device_operation.hpp +++ b/ttnn/cpp/ttnn/operations/moreh/moreh_norm_backward/device/moreh_norm_backward_device_operation.hpp @@ -10,8 +10,8 @@ #define DEFINE_PROGRAM_FACTORY(FactoryName) \ struct FactoryName { \ struct shared_variables_t { \ - KernelHandle reader_kernels_id; \ - KernelHandle writer_kernels_id; \ + tt::tt_metal::KernelHandle reader_kernels_id; \ + tt::tt_metal::KernelHandle writer_kernels_id; \ std::size_t num_cores_to_be_used; \ std::size_t num_cores_y; \ }; \ diff --git a/ttnn/cpp/ttnn/operations/moreh/moreh_sgd/device/moreh_sgd_device_operation.hpp b/ttnn/cpp/ttnn/operations/moreh/moreh_sgd/device/moreh_sgd_device_operation.hpp index c01f8c90234..cdb0f07c31a 100644 --- a/ttnn/cpp/ttnn/operations/moreh/moreh_sgd/device/moreh_sgd_device_operation.hpp +++ b/ttnn/cpp/ttnn/operations/moreh/moreh_sgd/device/moreh_sgd_device_operation.hpp @@ -36,8 +36,8 @@ struct MorehSgdOperation { struct ProgramFactory { struct shared_variables_t { - KernelHandle unary_reader_kernel_id; - KernelHandle unary_writer_kernel_id; + tt::tt_metal::KernelHandle unary_reader_kernel_id; + tt::tt_metal::KernelHandle unary_writer_kernel_id; std::size_t num_cores; std::size_t core_h; bool has_momentum_buffer_out; diff --git a/ttnn/cpp/ttnn/operations/moreh/moreh_softmax/device/moreh_softmax_device_operation.cpp b/ttnn/cpp/ttnn/operations/moreh/moreh_softmax/device/moreh_softmax_device_operation.cpp index 
36b6630fdf5..d13eae4035c 100644 --- a/ttnn/cpp/ttnn/operations/moreh/moreh_softmax/device/moreh_softmax_device_operation.cpp +++ b/ttnn/cpp/ttnn/operations/moreh/moreh_softmax/device/moreh_softmax_device_operation.cpp @@ -4,6 +4,8 @@ #include "moreh_softmax_device_operation.hpp" +using namespace tt::tt_metal; + namespace ttnn::operations::moreh::moreh_softmax { #define L1_512KB (512 * 1024) diff --git a/ttnn/cpp/ttnn/operations/moreh/moreh_softmax/device/moreh_softmax_device_operation.hpp b/ttnn/cpp/ttnn/operations/moreh/moreh_softmax/device/moreh_softmax_device_operation.hpp index dd2a713252f..ae8c4889c1e 100644 --- a/ttnn/cpp/ttnn/operations/moreh/moreh_softmax/device/moreh_softmax_device_operation.hpp +++ b/ttnn/cpp/ttnn/operations/moreh/moreh_softmax/device/moreh_softmax_device_operation.hpp @@ -47,8 +47,8 @@ struct MorehSoftmaxOperation { #define DEFINE_SOFTMAX_FACTORY(factory_name) \ struct factory_name { \ struct shared_variables_t { \ - KernelHandle unary_reader_kernel_id; \ - KernelHandle unary_writer_kernel_id; \ + tt::tt_metal::KernelHandle unary_reader_kernel_id; \ + tt::tt_metal::KernelHandle unary_writer_kernel_id; \ std::size_t num_cores; \ std::size_t num_cores_y; \ }; \ diff --git a/ttnn/cpp/ttnn/operations/moreh/moreh_softmax_backward/device/moreh_softmax_backward_device_operation.cpp b/ttnn/cpp/ttnn/operations/moreh/moreh_softmax_backward/device/moreh_softmax_backward_device_operation.cpp index 5477d1fa3ea..993c5e24f15 100644 --- a/ttnn/cpp/ttnn/operations/moreh/moreh_softmax_backward/device/moreh_softmax_backward_device_operation.cpp +++ b/ttnn/cpp/ttnn/operations/moreh/moreh_softmax_backward/device/moreh_softmax_backward_device_operation.cpp @@ -26,7 +26,7 @@ bool is_moreh_softmax_backward_w_small_available(const Tensor& tensor) { cb_usage += 1 * tile_size; // reduce cb_usage += 1 * tile_size; // dy - sum - return (tensor.device()->allocator()->get_base_allocator_addr(HalMemType::L1) + cb_usage <= L1_512KB); + return 
(tensor.device()->allocator()->get_base_allocator_addr(tt::tt_metal::HalMemType::L1) + cb_usage <= L1_512KB); } bool is_moreh_softmax_backward_h_small_available(const Tensor& tensor) { @@ -47,7 +47,7 @@ bool is_moreh_softmax_backward_h_small_available(const Tensor& tensor) { cb_usage += 1 * tile_size; // reduce cb_usage += 1 * tile_size; // dy - sum - return (tensor.device()->allocator()->get_base_allocator_addr(HalMemType::L1) + cb_usage <= L1_512KB); + return (tensor.device()->allocator()->get_base_allocator_addr(tt::tt_metal::HalMemType::L1) + cb_usage <= L1_512KB); } MorehSoftmaxBackwardOperation::program_factory_t MorehSoftmaxBackwardOperation::select_program_factory( @@ -102,9 +102,9 @@ MorehSoftmaxBackwardOperation::spec_return_value_t MorehSoftmaxBackwardOperation } return TensorSpec( tensor_args.output_tensor.get_logical_shape(), - TensorLayout( + tt::tt_metal::TensorLayout( tensor_args.output_tensor.get_dtype(), - PageConfig(tensor_args.output_tensor.get_layout()), + tt::tt_metal::PageConfig(tensor_args.output_tensor.get_layout()), operation_attributes.memory_config)); } diff --git a/ttnn/cpp/ttnn/operations/moreh/moreh_softmax_backward/device/moreh_softmax_backward_device_operation.hpp b/ttnn/cpp/ttnn/operations/moreh/moreh_softmax_backward/device/moreh_softmax_backward_device_operation.hpp index 317c99ef9a4..2a6e76adfb7 100644 --- a/ttnn/cpp/ttnn/operations/moreh/moreh_softmax_backward/device/moreh_softmax_backward_device_operation.hpp +++ b/ttnn/cpp/ttnn/operations/moreh/moreh_softmax_backward/device/moreh_softmax_backward_device_operation.hpp @@ -48,8 +48,8 @@ struct MorehSoftmaxBackwardOperation { #define DEFINE_SOFTMAX_BACKWARD_FACTORY(factory_name) \ struct factory_name { \ struct shared_variables_t { \ - KernelHandle unary_reader_kernel_id; \ - KernelHandle unary_writer_kernel_id; \ + tt::tt_metal::KernelHandle unary_reader_kernel_id; \ + tt::tt_metal::KernelHandle unary_writer_kernel_id; \ std::size_t num_cores; \ std::size_t num_cores_y; \ }; \ 
diff --git a/ttnn/cpp/ttnn/operations/moreh/moreh_sum/device/moreh_sum_device_operation.hpp b/ttnn/cpp/ttnn/operations/moreh/moreh_sum/device/moreh_sum_device_operation.hpp index 8e14e460fc3..83c670896f3 100644 --- a/ttnn/cpp/ttnn/operations/moreh/moreh_sum/device/moreh_sum_device_operation.hpp +++ b/ttnn/cpp/ttnn/operations/moreh/moreh_sum/device/moreh_sum_device_operation.hpp @@ -13,8 +13,8 @@ #define MOREH_SUM_FACTORY_H(name) \ struct name { \ struct shared_variables_t { \ - KernelHandle unary_reader_kernel_id; \ - KernelHandle unary_writer_kernel_id; \ + tt::tt_metal::KernelHandle unary_reader_kernel_id; \ + tt::tt_metal::KernelHandle unary_writer_kernel_id; \ std::size_t num_cores; \ std::size_t num_cores_y; \ }; \ diff --git a/ttnn/cpp/ttnn/operations/moreh/moreh_sum_backward/device/moreh_sum_backward_device_operation.hpp b/ttnn/cpp/ttnn/operations/moreh/moreh_sum_backward/device/moreh_sum_backward_device_operation.hpp index 8c3eeef65b1..09d7080d0af 100644 --- a/ttnn/cpp/ttnn/operations/moreh/moreh_sum_backward/device/moreh_sum_backward_device_operation.hpp +++ b/ttnn/cpp/ttnn/operations/moreh/moreh_sum_backward/device/moreh_sum_backward_device_operation.hpp @@ -28,8 +28,8 @@ struct MorehSumBackwardOperation { struct ProgramFactory { struct shared_variables_t { - KernelHandle unary_reader_kernel_id; - KernelHandle unary_writer_kernel_id; + tt::tt_metal::KernelHandle unary_reader_kernel_id; + tt::tt_metal::KernelHandle unary_writer_kernel_id; std::size_t num_cores; std::size_t num_cores_y; }; diff --git a/ttnn/cpp/ttnn/operations/normalization/batch_norm/device/batch_norm_program_factory.cpp b/ttnn/cpp/ttnn/operations/normalization/batch_norm/device/batch_norm_program_factory.cpp index 4c347a6cfed..96ddddc62d1 100644 --- a/ttnn/cpp/ttnn/operations/normalization/batch_norm/device/batch_norm_program_factory.cpp +++ b/ttnn/cpp/ttnn/operations/normalization/batch_norm/device/batch_norm_program_factory.cpp @@ -12,7 +12,7 @@ namespace CMAKE_UNIQUE_NAMESPACE { using 
namespace ttnn::operations::normalization; -std::tuple extract_shape_dims(const Tensor& x) { +std::tuple extract_shape_dims(const tt::tt_metal::Tensor& x) { const auto& shape = x.padded_shape(); const auto& tile = x.tensor_spec().tile(); return {shape[-4], shape[-3], shape[-2] / tile.get_height(), shape[-1] / tile.get_width()}; @@ -20,10 +20,10 @@ std::tuple extract_shape_dims(const Tens template void set_or_update_runtime_arguments( - Program& program, - KernelHandle reader_kernel_id, - KernelHandle writer_kernel_id, - KernelHandle compute_kernel_id, + tt::tt_metal::Program& program, + tt::tt_metal::KernelHandle reader_kernel_id, + tt::tt_metal::KernelHandle writer_kernel_id, + tt::tt_metal::KernelHandle compute_kernel_id, CoreCoord compute_with_storage_grid_size, const BatchNormOperation::operation_attributes_t& operation_attributes, const BatchNormOperation::tensor_args_t& tensor_args, @@ -74,7 +74,7 @@ void set_or_update_runtime_arguments( uint32_t cHtWt = cHt * cWt; const auto scalar = eps; - const auto packed_scalar_eps = input_tensor.get_dtype() == DataType::FLOAT32 + const auto packed_scalar_eps = input_tensor.get_dtype() == tt::tt_metal::DataType::FLOAT32 ? 
std::bit_cast(scalar) : pack_two_bfloat16_into_uint32({scalar, scalar}); @@ -329,11 +329,12 @@ void BatchNormOperation::BatchNormFactory::override_runtime_arguments( const operation_attributes_t& operation_attributes, const tensor_args_t& tensor_args, tensor_return_value_t& output) { - auto update_args = [](Program& program, KernelHandle kernel_id, CoreCoord core, auto&& args) { - auto& all_args = GetRuntimeArgs(program, kernel_id); - auto& core_args = all_args.at(core.x).at(core.y); - std::copy(args.begin(), args.end(), core_args.data()); - }; + auto update_args = + [](tt::tt_metal::Program& program, tt::tt_metal::KernelHandle kernel_id, CoreCoord core, auto&& args) { + auto& all_args = GetRuntimeArgs(program, kernel_id); + auto& core_args = all_args.at(core.x).at(core.y); + std::copy(args.begin(), args.end(), core_args.data()); + }; CMAKE_UNIQUE_NAMESPACE::set_or_update_runtime_arguments( cached_program.program, diff --git a/ttnn/cpp/ttnn/operations/normalization/batch_norm/device/running_statistics_program_factory.cpp b/ttnn/cpp/ttnn/operations/normalization/batch_norm/device/running_statistics_program_factory.cpp index 3263f995fd3..9749500ac03 100644 --- a/ttnn/cpp/ttnn/operations/normalization/batch_norm/device/running_statistics_program_factory.cpp +++ b/ttnn/cpp/ttnn/operations/normalization/batch_norm/device/running_statistics_program_factory.cpp @@ -13,7 +13,7 @@ namespace CMAKE_UNIQUE_NAMESPACE { using namespace ttnn::operations::normalization; -std::tuple extract_shape_dims(const Tensor& x) { +std::tuple extract_shape_dims(const tt::tt_metal::Tensor& x) { const auto& shape = x.padded_shape(); const auto& tile = x.tensor_spec().tile(); return {shape[-4], shape[-3], shape[-2] / tile.get_height(), shape[-1] / tile.get_width()}; @@ -21,10 +21,10 @@ std::tuple extract_shape_dims(const Tens template void set_or_update_runtime_arguments( - Program& program, - KernelHandle reader_kernel_id, - KernelHandle writer_kernel_id, - KernelHandle compute_kernel_id, + 
tt::tt_metal::Program& program, + tt::tt_metal::KernelHandle reader_kernel_id, + tt::tt_metal::KernelHandle writer_kernel_id, + tt::tt_metal::KernelHandle compute_kernel_id, CoreCoord compute_with_storage_grid_size, const RunningStatistics::operation_attributes_t& operation_attributes, const RunningStatistics::tensor_args_t& tensor_args, @@ -75,7 +75,7 @@ void set_or_update_runtime_arguments( uint32_t cHtWt = cHt * cWt; const auto scalar = momentum; - const auto packed_scalar_momentum = batch_mean_tensor.get_dtype() == DataType::FLOAT32 + const auto packed_scalar_momentum = batch_mean_tensor.get_dtype() == tt::tt_metal::DataType::FLOAT32 ? std::bit_cast(scalar) : pack_two_bfloat16_into_uint32({scalar, scalar}); std::array reader_runtime_args = { @@ -360,11 +360,12 @@ void RunningStatistics::RunningStatisticsProgramFactory::override_runtime_argume const operation_attributes_t& operation_attributes, const tensor_args_t& tensor_args, tensor_return_value_t& output) { - auto update_args = [](Program& program, KernelHandle kernel_id, CoreCoord core, auto&& args) { - auto& all_args = GetRuntimeArgs(program, kernel_id); - auto& core_args = all_args.at(core.x).at(core.y); - std::copy(args.begin(), args.end(), core_args.data()); - }; + auto update_args = + [](tt::tt_metal::Program& program, tt::tt_metal::KernelHandle kernel_id, CoreCoord core, auto&& args) { + auto& all_args = GetRuntimeArgs(program, kernel_id); + auto& core_args = all_args.at(core.x).at(core.y); + std::copy(args.begin(), args.end(), core_args.data()); + }; CMAKE_UNIQUE_NAMESPACE::set_or_update_runtime_arguments( cached_program.program, diff --git a/ttnn/cpp/ttnn/operations/normalization/groupnorm/device/groupnorm_op.hpp b/ttnn/cpp/ttnn/operations/normalization/groupnorm/device/groupnorm_op.hpp index 5fd2878e7e1..e1d530adc05 100644 --- a/ttnn/cpp/ttnn/operations/normalization/groupnorm/device/groupnorm_op.hpp +++ b/ttnn/cpp/ttnn/operations/normalization/groupnorm/device/groupnorm_op.hpp @@ -29,13 +29,13 @@ 
Ref: https://pytorch.org/docs/stable/generated/torch.nn.GroupNorm.html struct GroupNormShardedMultiCoreProgramConfig { CoreCoord compute_with_storage_grid_size; MathFidelity math_fidelity; - DataType im_data_format; - DataType out_data_format; + tt::tt_metal::DataType im_data_format; + tt::tt_metal::DataType out_data_format; bool inplace; - Layout output_layout; + tt::tt_metal::Layout output_layout; }; -operation::ProgramWithCallbacks groupnorm_multi_core_sharded( +tt::tt_metal::operation::ProgramWithCallbacks groupnorm_multi_core_sharded( const Tensor& a, const std::optional& gamma, const std::optional& beta, @@ -45,14 +45,14 @@ operation::ProgramWithCallbacks groupnorm_multi_core_sharded( const uint32_t num_groups, const uint32_t num_batches, MathFidelity fidelity, - DataType im_data_format, + tt::tt_metal::DataType im_data_format, CoreCoord grid_size, bool inplace); struct GroupNorm { float eps; uint32_t num_groups; - MemoryConfig output_mem_config; + tt::tt_metal::MemoryConfig output_mem_config; GroupNormShardedMultiCoreProgramConfig program_config; void validate( @@ -60,7 +60,7 @@ struct GroupNorm { const std::vector>& optional_input_tensors) const; std::vector compute_output_specs(const std::vector& input_tensors) const; std::vector create_output_tensors(const std::vector& input_tensors) const; - operation::ProgramWithCallbacks create_program( + tt::tt_metal::operation::ProgramWithCallbacks create_program( const std::vector& input_tensors, const std::vector>& optional_input_tensors, std::vector& output_tensors) const; diff --git a/ttnn/cpp/ttnn/operations/normalization/groupnorm/groupnorm.cpp b/ttnn/cpp/ttnn/operations/normalization/groupnorm/groupnorm.cpp index 59b2dcefbd6..44851f65984 100644 --- a/ttnn/cpp/ttnn/operations/normalization/groupnorm/groupnorm.cpp +++ b/ttnn/cpp/ttnn/operations/normalization/groupnorm/groupnorm.cpp @@ -87,7 +87,7 @@ ttnn::Tensor ExecuteGroupNorm::invoke( .inplace = inplace.value_or(false), .output_layout = 
output_layout.value_or(input_tensor.get_layout())}; - return operation::run( + return tt::tt_metal::operation::run( GroupNorm{ .eps = epsilon, .num_groups = static_cast(num_groups), diff --git a/ttnn/cpp/ttnn/operations/normalization/layernorm/device/layernorm_op.cpp b/ttnn/cpp/ttnn/operations/normalization/layernorm/device/layernorm_op.cpp index 7ccca09b68c..f5a2a7ae552 100644 --- a/ttnn/cpp/ttnn/operations/normalization/layernorm/device/layernorm_op.cpp +++ b/ttnn/cpp/ttnn/operations/normalization/layernorm/device/layernorm_op.cpp @@ -13,6 +13,7 @@ using uint32_t = std::uint32_t; using namespace tt::constants; +using namespace tt::tt_metal; namespace ttnn::operations::normalization { @@ -306,7 +307,7 @@ operation::ProgramWithCallbacks LayerNorm::create_program( auto& output_tensor = output_tensors.at(0); return std::visit( - [&](const auto& program_config) -> operation::ProgramWithCallbacks { + [&](const auto& program_config) -> tt::tt_metal::operation::ProgramWithCallbacks { using ProgramConfigType = std::decay_t; if constexpr (std::is_same_v) { uint32_t num_cores_x = program_config.compute_with_storage_grid_size.x; diff --git a/ttnn/cpp/ttnn/operations/normalization/layernorm/device/layernorm_op.hpp b/ttnn/cpp/ttnn/operations/normalization/layernorm/device/layernorm_op.hpp index 0decbdf33d1..ee753b19bf8 100644 --- a/ttnn/cpp/ttnn/operations/normalization/layernorm/device/layernorm_op.hpp +++ b/ttnn/cpp/ttnn/operations/normalization/layernorm/device/layernorm_op.hpp @@ -15,7 +15,7 @@ namespace ttnn::operations::normalization { -operation::ProgramWithCallbacks layernorm_multi_core( +tt::tt_metal::operation::ProgramWithCallbacks layernorm_multi_core( const Tensor& a, const std::optional& b, const std::optional& gamma, @@ -25,7 +25,7 @@ operation::ProgramWithCallbacks layernorm_multi_core( float eps, DeviceComputeKernelConfig compute_kernel_config); -operation::ProgramWithCallbacks layernorm_multi_core_sharded( +tt::tt_metal::operation::ProgramWithCallbacks 
layernorm_multi_core_sharded( const Tensor& a, const std::optional& b, const std::optional& gamma, @@ -55,7 +55,7 @@ struct LayerNorm { const std::vector>& optional_input_tensors) const; std::vector compute_output_specs(const std::vector& input_tensors) const; std::vector create_output_tensors(const std::vector& input_tensors) const; - operation::ProgramWithCallbacks create_program( + tt::tt_metal::operation::ProgramWithCallbacks create_program( const std::vector& input_tensors, const std::vector>& optional_input_tensors, std::vector& output_tensors) const; diff --git a/ttnn/cpp/ttnn/operations/normalization/layernorm/device/multi_core/layernorm_op_multi_core.cpp b/ttnn/cpp/ttnn/operations/normalization/layernorm/device/multi_core/layernorm_op_multi_core.cpp index 2055b4eb078..25b32a89cfc 100644 --- a/ttnn/cpp/ttnn/operations/normalization/layernorm/device/multi_core/layernorm_op_multi_core.cpp +++ b/ttnn/cpp/ttnn/operations/normalization/layernorm/device/multi_core/layernorm_op_multi_core.cpp @@ -15,6 +15,7 @@ using uint32_t = std::uint32_t; using namespace tt::constants; +using namespace tt::tt_metal; namespace ttnn::operations::normalization { diff --git a/ttnn/cpp/ttnn/operations/normalization/layernorm/layernorm.cpp b/ttnn/cpp/ttnn/operations/normalization/layernorm/layernorm.cpp index 87d4973daa9..4678f720766 100644 --- a/ttnn/cpp/ttnn/operations/normalization/layernorm/layernorm.cpp +++ b/ttnn/cpp/ttnn/operations/normalization/layernorm/layernorm.cpp @@ -23,7 +23,7 @@ ttnn::Tensor ExecuteLayerNorm::invoke( : ttnn::operations::experimental::auto_format::AutoFormat::GetDefaultDevice()->arch(); auto kernel_config_val = init_device_compute_kernel_config(arch, compute_kernel_config, MathFidelity::HiFi4, true, false, false); - return operation::run( + return tt::tt_metal::operation::run( LayerNorm{ .norm_type = LayerNormType::LAYERNORM, .distributed_norm_stage = DistributedLayerNormStage::NOT_DISTRIBUTED, diff --git 
a/ttnn/cpp/ttnn/operations/normalization/layernorm_distributed/device/layernorm_post_all_gather_op.cpp b/ttnn/cpp/ttnn/operations/normalization/layernorm_distributed/device/layernorm_post_all_gather_op.cpp index 56611052b8d..4749ebc0c77 100644 --- a/ttnn/cpp/ttnn/operations/normalization/layernorm_distributed/device/layernorm_post_all_gather_op.cpp +++ b/ttnn/cpp/ttnn/operations/normalization/layernorm_distributed/device/layernorm_post_all_gather_op.cpp @@ -16,7 +16,6 @@ using uint32_t = std::uint32_t; using namespace tt::constants; -using namespace tt::tt_metal; namespace ttnn::operations::normalization { @@ -99,10 +98,11 @@ std::vector LayerNormPostAllGather::compute_output_specs(const std:: auto& input_tensor = input_tensors.at(0); return {TensorSpec( input_tensor.get_logical_shape(), - TensorLayout(this->dtype.value_or(input_tensor.get_dtype()), PageConfig(Layout::TILE), memory_config))}; + tt::tt_metal::TensorLayout( + this->dtype.value_or(input_tensor.get_dtype()), tt::tt_metal::PageConfig(Layout::TILE), memory_config))}; } -operation::ProgramWithCallbacks LayerNormPostAllGather::create_program( +tt::tt_metal::operation::ProgramWithCallbacks LayerNormPostAllGather::create_program( const std::vector& input_tensors, const std::vector>& optional_input_tensors, std::vector& output_tensors) const { diff --git a/ttnn/cpp/ttnn/operations/normalization/layernorm_distributed/device/layernorm_post_all_gather_op.hpp b/ttnn/cpp/ttnn/operations/normalization/layernorm_distributed/device/layernorm_post_all_gather_op.hpp index 56092a848a2..42cce76852d 100644 --- a/ttnn/cpp/ttnn/operations/normalization/layernorm_distributed/device/layernorm_post_all_gather_op.hpp +++ b/ttnn/cpp/ttnn/operations/normalization/layernorm_distributed/device/layernorm_post_all_gather_op.hpp @@ -16,7 +16,7 @@ using namespace tt::constants; namespace ttnn::operations::normalization { -operation::ProgramWithCallbacks layernorm_post_allgather_multi_core( +tt::tt_metal::operation::ProgramWithCallbacks 
layernorm_post_allgather_multi_core( const Tensor& a, const Tensor& stats, const std::optional& gamma, @@ -37,7 +37,7 @@ struct LayerNormPostAllGather { const std::vector& input_tensors, const std::vector>& optional_input_tensors) const; std::vector compute_output_specs(const std::vector& input_tensors) const; - operation::ProgramWithCallbacks create_program( + tt::tt_metal::operation::ProgramWithCallbacks create_program( const std::vector& input_tensors, const std::vector>& optional_input_tensors, std::vector& output_tensors) const; diff --git a/ttnn/cpp/ttnn/operations/normalization/layernorm_distributed/device/layernorm_pre_all_gather_op.hpp b/ttnn/cpp/ttnn/operations/normalization/layernorm_distributed/device/layernorm_pre_all_gather_op.hpp index 299f0328952..29ba554b11a 100644 --- a/ttnn/cpp/ttnn/operations/normalization/layernorm_distributed/device/layernorm_pre_all_gather_op.hpp +++ b/ttnn/cpp/ttnn/operations/normalization/layernorm_distributed/device/layernorm_pre_all_gather_op.hpp @@ -16,7 +16,7 @@ using namespace tt::tt_metal; namespace ttnn::operations::normalization { -operation::ProgramWithCallbacks layernorm_pre_allgather_multi_core( +tt::tt_metal::operation::ProgramWithCallbacks layernorm_pre_allgather_multi_core( const Tensor& a, Tensor& output, LayerNormDistributedType norm_type, @@ -29,7 +29,7 @@ struct LayerNormPreAllGather { void validate(const std::vector& input_tensors) const; std::vector compute_output_specs(const std::vector& input_tensors) const; - operation::ProgramWithCallbacks create_program( + tt::tt_metal::operation::ProgramWithCallbacks create_program( const std::vector& input_tensors, std::vector& output_tensors) const; }; diff --git a/ttnn/cpp/ttnn/operations/normalization/layernorm_distributed/device/multi_core/layernorm_post_all_gather_op_multi_core.cpp b/ttnn/cpp/ttnn/operations/normalization/layernorm_distributed/device/multi_core/layernorm_post_all_gather_op_multi_core.cpp index d27e46ebf35..0fc55ecd04b 100644 --- 
a/ttnn/cpp/ttnn/operations/normalization/layernorm_distributed/device/multi_core/layernorm_post_all_gather_op_multi_core.cpp +++ b/ttnn/cpp/ttnn/operations/normalization/layernorm_distributed/device/multi_core/layernorm_post_all_gather_op_multi_core.cpp @@ -4,6 +4,7 @@ #include "cpp/ttnn/operations/normalization/layernorm_distributed/device/layernorm_post_all_gather_op.hpp" #include +#include "tt-metalium/circular_buffer_types.hpp" #include "ttnn/operations/math.hpp" #include @@ -15,7 +16,6 @@ using uint32_t = std::uint32_t; using namespace tt::constants; -using namespace tt::tt_metal; namespace ttnn::operations::normalization { @@ -47,7 +47,7 @@ inline uint32_t pack_two_bfloat16_into_uint32(std::pair two_ } // namespace // computes layernorm(a)*gamma + beta -operation::ProgramWithCallbacks layernorm_post_allgather_multi_core( +tt::tt_metal::operation::ProgramWithCallbacks layernorm_post_allgather_multi_core( const Tensor& a, const Tensor& stats, const std::optional& gamma, @@ -57,6 +57,10 @@ operation::ProgramWithCallbacks layernorm_post_allgather_multi_core( float eps, ttnn::DeviceComputeKernelConfig compute_kernel_config) { using namespace CMAKE_UNIQUE_NAMESPACE; + using tt::tt_metal::CBHandle; + using tt::tt_metal::CircularBuffer; + using tt::tt_metal::CircularBufferConfig; + const bool is_rmsnorm = norm_type == LayerNormDistributedType::RMSNORM; const auto shape = a.get_padded_shape(); const uint32_t W = shape[-1], H = shape[-2]; @@ -97,7 +101,8 @@ operation::ProgramWithCallbacks layernorm_post_allgather_multi_core( auto [math_fidelity, math_approx_mode, fp32_dest_acc_en, packer_l1_acc, dst_full_sync_en] = get_compute_kernel_config_args(device->arch(), compute_kernel_config); - uint32_t block_size = fp32_dest_acc_en ? find_max_divisor(Wt, 4) : find_max_divisor(Wt, 8); + uint32_t block_size = + fp32_dest_acc_en ? 
tt::tt_metal::find_max_divisor(Wt, 4) : tt::tt_metal::find_max_divisor(Wt, 8); tt::DataFormat in_data_format = tt::tt_metal::datatype_to_dataformat_converter(a.get_dtype()); tt::DataFormat stats_data_format = tt::tt_metal::datatype_to_dataformat_converter(stats.get_dtype()); @@ -238,7 +243,7 @@ operation::ProgramWithCallbacks layernorm_post_allgather_multi_core( //////////////////////////////////////////////////////////////////////////// // Application Setup //////////////////////////////////////////////////////////////////////////// - Program program = CreateProgram(); + Program program = tt::tt_metal::CreateProgram(); std::vector reader_compile_time_args = { // interleaved accessor args @@ -252,7 +257,7 @@ operation::ProgramWithCallbacks layernorm_post_allgather_multi_core( if (gamma.has_value() and gamma.value().get_layout() == Layout::ROW_MAJOR) { auto gamma_stick_size = gamma.value().get_padded_shape()[-1] * gamma.value().element_size(); - bool gamma_stick_size_is_power_of_two = is_power_of_two_at_least_32(gamma_stick_size); + bool gamma_stick_size_is_power_of_two = tt::tt_metal::is_power_of_two_at_least_32(gamma_stick_size); TT_FATAL(gamma_stick_size_is_power_of_two, "Only power of 2 gammas are supported"); reader_compile_time_args.push_back((std::uint32_t)gamma_stick_size_is_power_of_two); // if (gamma_stick_size_is_power_of_two) { diff --git a/ttnn/cpp/ttnn/operations/normalization/layernorm_distributed/layernorm_post_all_gather.cpp b/ttnn/cpp/ttnn/operations/normalization/layernorm_distributed/layernorm_post_all_gather.cpp index 9310ad01ca6..3a735b995ec 100644 --- a/ttnn/cpp/ttnn/operations/normalization/layernorm_distributed/layernorm_post_all_gather.cpp +++ b/ttnn/cpp/ttnn/operations/normalization/layernorm_distributed/layernorm_post_all_gather.cpp @@ -7,6 +7,7 @@ #include "device/layernorm_post_all_gather_op.hpp" #include "ttnn/operations/normalization/layernorm/device/layernorm_op.hpp" + namespace ttnn::operations::normalization { ttnn::Tensor 
ExecuteLayerNormPostAllGather::invoke( @@ -25,7 +26,7 @@ ttnn::Tensor ExecuteLayerNormPostAllGather::invoke( auto kernel_config_val = init_device_compute_kernel_config(arch, compute_kernel_config, MathFidelity::HiFi4, true, false, false); if (input_tensor.is_sharded()) { - return operation::run( + return tt::tt_metal::operation::run( LayerNorm{ .norm_type = LayerNormType::LAYERNORM, .distributed_norm_stage = DistributedLayerNormStage::POST_ALL_GATHER, @@ -38,7 +39,7 @@ ttnn::Tensor ExecuteLayerNormPostAllGather::invoke( {std::nullopt, weight, bias, stats}) .at(0); } else { - return operation::run( + return tt::tt_metal::operation::run( LayerNormPostAllGather{ .norm_type = LayerNormDistributedType::LAYERNORM, .eps = epsilon, diff --git a/ttnn/cpp/ttnn/operations/normalization/rmsnorm/rmsnorm.cpp b/ttnn/cpp/ttnn/operations/normalization/rmsnorm/rmsnorm.cpp index f0867097211..8767bebe1a3 100644 --- a/ttnn/cpp/ttnn/operations/normalization/rmsnorm/rmsnorm.cpp +++ b/ttnn/cpp/ttnn/operations/normalization/rmsnorm/rmsnorm.cpp @@ -22,7 +22,7 @@ ttnn::Tensor ExecuteRMSNorm::invoke( : ttnn::operations::experimental::auto_format::AutoFormat::GetDefaultDevice()->arch(); auto kernel_config_val = init_device_compute_kernel_config(arch, compute_kernel_config, MathFidelity::HiFi4, true, false, false); - return operation::run( + return tt::tt_metal::operation::run( LayerNorm{ .norm_type = LayerNormType::RMSNORM, .eps = epsilon, diff --git a/ttnn/cpp/ttnn/operations/normalization/rmsnorm_distributed/rmsnorm_post_all_gather.cpp b/ttnn/cpp/ttnn/operations/normalization/rmsnorm_distributed/rmsnorm_post_all_gather.cpp index fd7ce317df5..6402f6b3874 100644 --- a/ttnn/cpp/ttnn/operations/normalization/rmsnorm_distributed/rmsnorm_post_all_gather.cpp +++ b/ttnn/cpp/ttnn/operations/normalization/rmsnorm_distributed/rmsnorm_post_all_gather.cpp @@ -25,7 +25,7 @@ ttnn::Tensor ExecuteRMSNormPostAllGather::invoke( auto kernel_config_val = init_device_compute_kernel_config(arch, 
compute_kernel_config, MathFidelity::HiFi4, true, false, false); if (input_tensor.is_sharded()) { - return operation::run( + return tt::tt_metal::operation::run( LayerNorm{ .norm_type = LayerNormType::RMSNORM, .distributed_norm_stage = DistributedLayerNormStage::POST_ALL_GATHER, @@ -38,7 +38,7 @@ ttnn::Tensor ExecuteRMSNormPostAllGather::invoke( {std::nullopt, weight, bias, stats}) .at(0); } else { - return operation::run( + return tt::tt_metal::operation::run( LayerNormPostAllGather{ .norm_type = LayerNormDistributedType::RMSNORM, .eps = epsilon, diff --git a/ttnn/cpp/ttnn/operations/normalization/softmax/device/multi_core/softmax_op_multi_core.cpp b/ttnn/cpp/ttnn/operations/normalization/softmax/device/multi_core/softmax_op_multi_core.cpp index 2dd81b44ddc..a3e0bb8c2eb 100644 --- a/ttnn/cpp/ttnn/operations/normalization/softmax/device/multi_core/softmax_op_multi_core.cpp +++ b/ttnn/cpp/ttnn/operations/normalization/softmax/device/multi_core/softmax_op_multi_core.cpp @@ -4,6 +4,8 @@ #include #include +#include "tt-metalium/circular_buffer.hpp" +#include "tt-metalium/circular_buffer_types.hpp" #include "ttnn/operation.hpp" #include "ttnn/operations/normalization/softmax/device/softmax_op.hpp" #include "ttnn/operations/math.hpp" @@ -22,16 +24,18 @@ namespace ttnn::operations::normalization { namespace { namespace CMAKE_UNIQUE_NAMESPACE { -inline bool is_dram(const Tensor& input_tensor) { return input_tensor.memory_config().buffer_type == BufferType::DRAM; } +inline bool is_dram(const Tensor& input_tensor) { + return input_tensor.memory_config().buffer_type == tt::tt_metal::BufferType::DRAM; +} inline bool is_dram(const std::optional& input_tensor) { return input_tensor.has_value() ? 
is_dram(input_tensor.value()) : true; } -inline bool is_dram(const Buffer* b) { return b->buffer_type() == BufferType::DRAM; } +inline bool is_dram(const tt::tt_metal::Buffer* b) { return b->buffer_type() == tt::tt_metal::BufferType::DRAM; } } // namespace CMAKE_UNIQUE_NAMESPACE } // namespace // implementation of softmax with optional scale/mask (see the header for input_tensor more detailed description) -operation::ProgramWithCallbacks scale_mask_softmax_multi_core( +tt::tt_metal::operation::ProgramWithCallbacks scale_mask_softmax_multi_core( const Tensor& input_tensor, const Tensor& output_tensor, const std::optional& mask, @@ -62,10 +66,10 @@ operation::ProgramWithCallbacks scale_mask_softmax_multi_core( } uint32_t mask_Ht = mask_H / TILE_HEIGHT; - Program program = CreateProgram(); + auto program = tt::tt_metal::CreateProgram(); // This should allocate input_tensor DRAM buffer on the device - IDevice* device = input_tensor.device(); + auto* device = input_tensor.device(); tt::DataFormat in0_cb_data_format = tt::tt_metal::datatype_to_dataformat_converter(input_tensor.get_dtype()); uint32_t in0_tile_size = tt::tt_metal::detail::TileSize(in0_cb_data_format); @@ -100,7 +104,8 @@ operation::ProgramWithCallbacks scale_mask_softmax_multi_core( uint32_t num_tiles = input_tensor.volume() / TILE_HW; - uint32_t block_size = fp32_dest_acc_en ? find_max_divisor(Wt, 4) : find_max_divisor(Wt, 8); + uint32_t block_size = + fp32_dest_acc_en ? tt::tt_metal::find_max_divisor(Wt, 4) : tt::tt_metal::find_max_divisor(Wt, 8); // These tile capacity counts for CBs need to match the number of tiles expected by the kernel (softmax.cpp) uint32_t in0_t = numeric_stable ? 
tt::div_up(Wt, block_size) * block_size : block_size * 2; @@ -206,7 +211,9 @@ operation::ProgramWithCallbacks scale_mask_softmax_multi_core( // Create circular buffers // see softmax.cpp for which buffers are needed - + using tt::tt_metal::CBHandle; + using tt::tt_metal::CircularBuffer; + using tt::tt_metal::CircularBufferConfig; auto c_in0_config = CircularBufferConfig(in0_t * in0_tile_size, {{tt::CBIndex::c_0, in0_cb_data_format}}) .set_page_size(tt::CBIndex::c_0, in0_tile_size); auto cb_in0_id = CreateCircularBuffer(program, all_device_cores, c_in0_config); @@ -374,7 +381,7 @@ operation::ProgramWithCallbacks scale_mask_softmax_multi_core( cb_intermed2_id, cb_intermed4_id]( const void* operation, - Program& program, + tt::tt_metal::Program& program, const std::vector& input_tensors, const std::vector>& optional_input_tensors, const std::vector& output_tensors) { @@ -403,7 +410,8 @@ operation::ProgramWithCallbacks scale_mask_softmax_multi_core( } int32_t num_tiles = input_tensors.at(0).volume() / TILE_HW; - uint32_t block_size = fp32_dest_acc_en ? find_max_divisor(Wt, 4) : find_max_divisor(Wt, 8); + uint32_t block_size = + fp32_dest_acc_en ? tt::tt_metal::find_max_divisor(Wt, 4) : tt::tt_metal::find_max_divisor(Wt, 8); // These tile capacity counts for CBs need to match the number of tiles expected by the kernel (softmax.cpp) uint32_t in0_t = numeric_stable ? 
tt::div_up(Wt, block_size) * block_size : block_size * 2; @@ -547,7 +555,7 @@ operation::ProgramWithCallbacks scale_mask_softmax_multi_core( } // scale_mask_softmax_multi_core // implementation of softmax with optional scale/mask (see the header for input_tensor more detailed description) -operation::ProgramWithCallbacks scale_mask_softmax_sharded_multi_core( +tt::tt_metal::operation::ProgramWithCallbacks scale_mask_softmax_sharded_multi_core( const Tensor& input_tensor, const Tensor& output_tensor, const std::optional& mask, @@ -560,10 +568,13 @@ operation::ProgramWithCallbacks scale_mask_softmax_sharded_multi_core( uint32_t block_wt, DeviceComputeKernelConfig compute_kernel_config, bool numeric_stable) { + using tt::tt_metal::CBHandle; + using tt::tt_metal::CircularBuffer; + using tt::tt_metal::CircularBufferConfig; //////////////////////////////////////////////////////////////////////////// // Device Setup //////////////////////////////////////////////////////////////////////////// - IDevice* device = input_tensor.device(); + auto* device = input_tensor.device(); // convert data format tt::DataFormat in0_cb_data_format = tt::tt_metal::datatype_to_dataformat_converter(input_tensor.get_dtype()); @@ -662,7 +673,7 @@ operation::ProgramWithCallbacks scale_mask_softmax_sharded_multi_core( //////////////////////////////////////////////////////////////////////////// // Application Setup //////////////////////////////////////////////////////////////////////////// - Program program = CreateProgram(); + auto program = tt::tt_metal::CreateProgram(); // define core ranges uint32_t start_core_x = 0; uint32_t start_core_y = 0; @@ -680,10 +691,10 @@ operation::ProgramWithCallbacks scale_mask_softmax_sharded_multi_core( std::vector reader_compile_time_args = {(std::uint32_t)block_wt, (std::uint32_t)is_dram_mask}; std::map softmax_defines; // hw_dims_only_causal_mask does not support RM Layout atm - bool use_row_major_kernel = (mask.has_value() and mask->get_layout() == 
Layout::ROW_MAJOR); + bool use_row_major_kernel = (mask.has_value() and mask->get_layout() == tt::tt_metal::Layout::ROW_MAJOR); if (use_row_major_kernel) { auto mask_stick_size = mask->get_padded_shape()[3] * mask->element_size(); - bool mask_stick_size_is_power_of_two = is_power_of_two_at_least_32(mask_stick_size); + bool mask_stick_size_is_power_of_two = tt::tt_metal::is_power_of_two_at_least_32(mask_stick_size); reader_compile_time_args.push_back((std::uint32_t)mask_stick_size_is_power_of_two); if (mask_stick_size_is_power_of_two) { uint32_t mask_log2_stick_size = (std::uint32_t)log2(mask_stick_size); @@ -832,7 +843,7 @@ operation::ProgramWithCallbacks scale_mask_softmax_sharded_multi_core( } uint32_t num_cores_per_batch_index = 0; - if (shard_orient == ShardOrientation::COL_MAJOR) { + if (shard_orient == tt::tt_metal::ShardOrientation::COL_MAJOR) { for (int core_idx_x = 0; core_idx_x < num_cores_c; core_idx_x++) { for (int core_idx_y = 0; core_idx_y < num_cores_r; core_idx_y++) { CoreCoord core = {(std::size_t)start_core_x + core_idx_x, (std::size_t)start_core_y + core_idx_y}; @@ -915,7 +926,7 @@ operation::ProgramWithCallbacks scale_mask_softmax_sharded_multi_core( auto override_runtime_arguments_callback = [reader_kernels_id, cb_in0_id, cb_out0_id, cb_in3_id, num_cores, grid_size]( const void* operation, - Program& program, + tt::tt_metal::Program& program, const std::vector& input_tensors, const std::vector>& optional_input_tensors, const std::vector& output_tensors) { diff --git a/ttnn/cpp/ttnn/operations/normalization/softmax/device/softmax_op.cpp b/ttnn/cpp/ttnn/operations/normalization/softmax/device/softmax_op.cpp index 13fcf2f479f..16be9643b3d 100644 --- a/ttnn/cpp/ttnn/operations/normalization/softmax/device/softmax_op.cpp +++ b/ttnn/cpp/ttnn/operations/normalization/softmax/device/softmax_op.cpp @@ -5,6 +5,7 @@ #include "softmax_op.hpp" #include #include +#include "ttnn/tensor/storage.hpp" #include "ttnn/tensor/types.hpp" #include 
"ttnn/operations/math.hpp" #include @@ -25,6 +26,10 @@ namespace ttnn::operations::normalization { void Softmax::validate( const std::vector& input_tensors, const std::vector>& optional_input_tensors) const { + using tt::tt_metal::DataType; + using tt::tt_metal::Layout; + using tt::tt_metal::StorageType; + TT_FATAL(input_tensors.size() == 1 and optional_input_tensors.size() <= 1, "Must have 1 or 2 input tensors"); auto& input_tensor = input_tensors.at(0); TT_FATAL(input_tensor.storage_type() == StorageType::DEVICE, "Operands to softmax need to be on device!"); @@ -94,7 +99,9 @@ void Softmax::validate( TT_FATAL(mask.is_sharded() == false, "Error"); TT_FATAL(input_tensor.get_layout() == Layout::TILE, "Error"); TT_FATAL(input_tensor.is_sharded(), "Error"); - TT_FATAL(input_tensor.shard_spec()->orientation == ShardOrientation::ROW_MAJOR, "Error"); + TT_FATAL( + input_tensor.shard_spec()->orientation == tt::tt_metal::ShardOrientation::ROW_MAJOR, + "Error"); TT_FATAL(this->scale.has_value(), "Error"); } } @@ -116,7 +123,8 @@ std::vector Softmax::compute_output_specs(const std::vector& } return {TensorSpec( input_tensor.get_logical_shape(), - TensorLayout(input_tensor.get_dtype(), PageConfig(Layout::TILE), output_mem_config))}; + tt::tt_metal::TensorLayout( + input_tensor.get_dtype(), tt::tt_metal::PageConfig(tt::tt_metal::Layout::TILE), output_mem_config))}; } std::vector Softmax::create_output_tensors(const std::vector& input_tensors) const { @@ -127,7 +135,7 @@ std::vector Softmax::create_output_tensors(const std::vector& in return {create_device_tensor(compute_output_specs(input_tensors)[0], input_tensors.at(0).device())}; } -operation::ProgramWithCallbacks Softmax::create_program( +tt::tt_metal::operation::ProgramWithCallbacks Softmax::create_program( const std::vector& input_tensors, const std::vector>& optional_input_tensors, std::vector& output_tensors) const { @@ -139,7 +147,7 @@ operation::ProgramWithCallbacks Softmax::create_program( bool causal_mask = 
this->is_causal_mask; return std::visit( - [&](const auto& program_config) -> operation::ProgramWithCallbacks { + [&](const auto& program_config) -> tt::tt_metal::operation::ProgramWithCallbacks { using ProgramConfigType = std::decay_t; if constexpr (std::is_same_v) { return scale_mask_softmax_sharded_multi_core( @@ -169,14 +177,15 @@ operation::ProgramWithCallbacks Softmax::create_program( this->program_config); } -const operation::Hash Softmax::compute_program_hash( +const tt::tt_metal::operation::Hash Softmax::compute_program_hash( const std::vector& input_tensors, const std::vector>& optional_input_tensors) const { - return operation::hash_operation( - std::get(input_tensors.at(0).storage()).memory_config(), + return tt::tt_metal::operation::hash_operation( + std::get(input_tensors.at(0).storage()).memory_config(), input_tensors.at(0).dtype(), optional_input_tensors.at(0).has_value() - ? std::optional{std::get(optional_input_tensors.at(0).value().storage()).memory_config()} + ? std::optional{std::get(optional_input_tensors.at(0).value().storage()) + .memory_config()} : std::nullopt, optional_input_tensors.at(0).has_value() ? 
std::optional{optional_input_tensors.at(0).value().dtype()} : std::nullopt, @@ -200,8 +209,9 @@ Tensor scale_mask_softmax_in_place( const bool is_causal_mask, std::optional compute_kernel_config, const bool numeric_stable) { - std::vector dummy_output_tensors = {Tensor(operation::get_workers_for_op_output({input_tensor}))}; - operation::launch_op( + std::vector dummy_output_tensors = { + Tensor(tt::tt_metal::operation::get_workers_for_op_output({input_tensor}))}; + tt::tt_metal::operation::launch_op( [scale, mask, program_config, is_causal_mask, compute_kernel_config, numeric_stable]( const std::vector& input_tensors, const std::vector>& optional_input_tensors, @@ -210,7 +220,7 @@ Tensor scale_mask_softmax_in_place( auto& mask = optional_input_tensors.at(0); auto kernel_config_val = init_device_compute_kernel_config( input_tensor.device()->arch(), compute_kernel_config, MathFidelity::HiFi4, true, false, false); - return operation::run( + return tt::tt_metal::operation::run( Softmax{ .scale = scale, .inplace = true, @@ -235,8 +245,9 @@ Tensor scale_causal_mask_hw_dims_softmax_in_place( const SoftmaxProgramConfig& program_config, std::optional compute_kernel_config, const bool numeric_stable) { - std::vector dummy_output_tensors = {Tensor(operation::get_workers_for_op_output({input_tensor}))}; - operation::launch_op( + std::vector dummy_output_tensors = { + Tensor(tt::tt_metal::operation::get_workers_for_op_output({input_tensor}))}; + tt::tt_metal::operation::launch_op( [scale, mask, program_config, compute_kernel_config, numeric_stable]( const std::vector& input_tensors, const std::vector>& optional_input_tensors, @@ -245,7 +256,7 @@ Tensor scale_causal_mask_hw_dims_softmax_in_place( auto& mask = optional_input_tensors.at(0); auto kernel_config_val = init_device_compute_kernel_config( input_tensor.device()->arch(), compute_kernel_config, MathFidelity::HiFi4, true, false, false); - return operation::run( + return tt::tt_metal::operation::run( Softmax{ .scale = scale, 
.inplace = true, @@ -266,7 +277,7 @@ Tensor scale_causal_mask_hw_dims_softmax_in_place( Tensor softmax( const Tensor& input_tensor, - const MemoryConfig& output_mem_config, + const tt::tt_metal::MemoryConfig& output_mem_config, std::optional compute_kernel_config, const bool numeric_stable) { return scale_mask_softmax( @@ -277,12 +288,12 @@ Tensor scale_mask_softmax( const Tensor& input_tensor, std::optional scale, const std::optional& mask, - const MemoryConfig& output_mem_config, + const tt::tt_metal::MemoryConfig& output_mem_config, const bool is_causal_mask, std::optional compute_kernel_config, const bool numeric_stable) { - std::vector output_tensors = {Tensor(operation::get_workers_for_op_output({input_tensor}))}; - operation::launch_with_autoformat( + std::vector output_tensors = {Tensor(tt::tt_metal::operation::get_workers_for_op_output({input_tensor}))}; + tt::tt_metal::operation::launch_with_autoformat( [scale, mask, output_mem_config, is_causal_mask, compute_kernel_config, numeric_stable]( const std::vector& input_tensors, const std::vector>& optional_input_tensors, @@ -294,7 +305,7 @@ Tensor scale_mask_softmax( ttnn::operations::experimental::auto_format::FormatParams input_format_params = { .pad_shape = input_pad_shape, .pad_value = -std::numeric_limits::infinity(), - .target_layout = Layout::TILE}; + .target_layout = tt::tt_metal::Layout::TILE}; std::optional mask_format_params = std::nullopt; if (mask.has_value()) { TT_FATAL(input_tensor.get_padded_shape()[-1] == mask.value().get_padded_shape()[-1], "Error"); @@ -310,11 +321,11 @@ Tensor scale_mask_softmax( mask_format_params = { .pad_shape = mask_pad_shape, .pad_value = -std::numeric_limits::infinity(), - .target_layout = Layout::TILE}; + .target_layout = tt::tt_metal::Layout::TILE}; } auto kernel_config_val = init_device_compute_kernel_config( input_tensor.device()->arch(), compute_kernel_config, MathFidelity::HiFi4, true, false, false); - return operation::run_with_autoformat( + return 
tt::tt_metal::operation::run_with_autoformat( Softmax{ .scale = scale, .inplace = false, @@ -324,7 +335,7 @@ Tensor scale_mask_softmax( .numeric_stable = numeric_stable}, {input_tensor}, {input_format_params}, - {Layout::TILE}, + {tt::tt_metal::Layout::TILE}, {mask}, {mask_format_params}); }, diff --git a/ttnn/cpp/ttnn/operations/normalization/softmax/device/softmax_op.hpp b/ttnn/cpp/ttnn/operations/normalization/softmax/device/softmax_op.hpp index 67a1ffe7dbe..2b4250c4efd 100644 --- a/ttnn/cpp/ttnn/operations/normalization/softmax/device/softmax_op.hpp +++ b/ttnn/cpp/ttnn/operations/normalization/softmax/device/softmax_op.hpp @@ -20,7 +20,7 @@ namespace ttnn::operations::normalization { struct Softmax { const std::optional scale; const bool inplace; - const MemoryConfig output_mem_config; + const tt::tt_metal::MemoryConfig output_mem_config; const SoftmaxProgramConfig program_config; const bool is_causal_mask; const DeviceComputeKernelConfig compute_kernel_config; @@ -32,17 +32,17 @@ struct Softmax { const std::vector>& optional_input_tensors) const; std::vector compute_output_specs(const std::vector& input_tensors) const; std::vector create_output_tensors(const std::vector& input_tensors) const; - operation::ProgramWithCallbacks create_program( + tt::tt_metal::operation::ProgramWithCallbacks create_program( const std::vector& input_tensors, const std::vector>& optional_input_tensors, std::vector& output_tensors) const; - const operation::Hash compute_program_hash( + const tt::tt_metal::operation::Hash compute_program_hash( const std::vector& input_tensors, const std::vector>& optional_input_tensors) const; }; -operation::ProgramWithCallbacks scale_mask_softmax_multi_core( +tt::tt_metal::operation::ProgramWithCallbacks scale_mask_softmax_multi_core( const Tensor& input_tensor, const Tensor& output_tensor, const std::optional& mask, @@ -53,7 +53,7 @@ operation::ProgramWithCallbacks scale_mask_softmax_multi_core( // hw_dims_only_causal_mask - represents if the 
causal mask is of shape [1, 1, h, w] // valid only if causal_mask == true, and is interleaved -operation::ProgramWithCallbacks scale_mask_softmax_sharded_multi_core( +tt::tt_metal::operation::ProgramWithCallbacks scale_mask_softmax_sharded_multi_core( const Tensor& input_tensor, const Tensor& output_tensor, const std::optional& mask, @@ -70,7 +70,7 @@ operation::ProgramWithCallbacks scale_mask_softmax_sharded_multi_core( // softmax Tensor softmax( const Tensor& input_tensor, - const MemoryConfig& output_mem_config = operation::DEFAULT_OUTPUT_MEMORY_CONFIG, + const tt::tt_metal::MemoryConfig& output_mem_config = tt::tt_metal::operation::DEFAULT_OUTPUT_MEMORY_CONFIG, std::optional compute_kernel_config = std::nullopt, const bool numeric_stable = false); // const ref prevents in-place @@ -111,7 +111,7 @@ Tensor scale_mask_softmax( const Tensor& input_tensor, std::optional scale, const std::optional& mask, - const MemoryConfig& output_mem_config = operation::DEFAULT_OUTPUT_MEMORY_CONFIG, + const tt::tt_metal::MemoryConfig& output_mem_config = tt::tt_metal::operation::DEFAULT_OUTPUT_MEMORY_CONFIG, const bool is_causal_mask = false, std::optional compute_kernel_config = std::nullopt, const bool numeric_stable = false); diff --git a/ttnn/cpp/ttnn/operations/pool/downsample/device/downsample_op.hpp b/ttnn/cpp/ttnn/operations/pool/downsample/device/downsample_op.hpp index eae1360e2e4..5264ec27a13 100644 --- a/ttnn/cpp/ttnn/operations/pool/downsample/device/downsample_op.hpp +++ b/ttnn/cpp/ttnn/operations/pool/downsample/device/downsample_op.hpp @@ -17,17 +17,19 @@ namespace downsample { struct Downsample { std::array downsample_params; - DataType dtype; + tt::tt_metal::DataType dtype; void validate(const std::vector& input_tensors) const; std::vector compute_output_specs(const std::vector& input_tensors) const; - operation::ProgramWithCallbacks create_program( + tt::tt_metal::operation::ProgramWithCallbacks create_program( const std::vector& input_tensors, std::vector& 
output_tensors) const; }; Tensor downsample( - const Tensor& a, std::array downsample_params, std::optional dtype = std::nullopt); -// operation::ProgramWithCallbacks downsample_multi_core(const Tensor &a, Tensor& output); -operation::ProgramWithCallbacks downsample_single_core( + const Tensor& a, + std::array downsample_params, + std::optional dtype = std::nullopt); +// tt::tt_metal::operation::ProgramWithCallbacks downsample_multi_core(const Tensor &a, Tensor& output); +tt::tt_metal::operation::ProgramWithCallbacks downsample_single_core( const Tensor& a, std::array downsample_params, Tensor& output); // namespace downsample_helpers { diff --git a/ttnn/cpp/ttnn/operations/pool/downsample/device/downsample_program_factory.hpp b/ttnn/cpp/ttnn/operations/pool/downsample/device/downsample_program_factory.hpp index e6491646496..5c2e07387a8 100644 --- a/ttnn/cpp/ttnn/operations/pool/downsample/device/downsample_program_factory.hpp +++ b/ttnn/cpp/ttnn/operations/pool/downsample/device/downsample_program_factory.hpp @@ -5,13 +5,16 @@ #include "ttnn/operation.hpp" +// TODO: DELETE Do not use using namespace in header file using namespace tt::constants; namespace ttnn::operations::downsample::detail { std::pair get_num_cores_height_width_sliced( - const CoreRangeSet& all_cores, TensorMemoryLayout memory_layout, ShardOrientation shard_orientation); -operation::ProgramWithCallbacks downsample_single_core( + const CoreRangeSet& all_cores, + tt::tt_metal::TensorMemoryLayout memory_layout, + tt::tt_metal::ShardOrientation shard_orientation); +tt::tt_metal::operation::ProgramWithCallbacks downsample_single_core( const Tensor& a, std::array downsample_params, Tensor& output); } // namespace ttnn::operations::downsample::detail diff --git a/ttnn/cpp/ttnn/operations/pool/generic/device/pool_multi_core_program_factory.cpp b/ttnn/cpp/ttnn/operations/pool/generic/device/pool_multi_core_program_factory.cpp index 1ca36790eb8..b176432e2ae 100644 --- 
a/ttnn/cpp/ttnn/operations/pool/generic/device/pool_multi_core_program_factory.cpp +++ b/ttnn/cpp/ttnn/operations/pool/generic/device/pool_multi_core_program_factory.cpp @@ -3,6 +3,8 @@ // SPDX-License-Identifier: Apache-2.0 #include "pool_op.hpp" +#include "tt-metalium/circular_buffer.hpp" +#include "tt-metalium/circular_buffer_types.hpp" #include "ttnn/operations/reduction/generic/device/reduce_op.hpp" // for reduce_op_utils #include @@ -111,6 +113,9 @@ Pool2D::MultiCore::cached_program_t pool2d_multi_core_sharded_with_halo_v2_impl_ uint32_t split_reader = 1; // scalar CB as coefficient of reduce + using tt::tt_metal::CBHandle; + using tt::tt_metal::CircularBuffer; + using tt::tt_metal::CircularBufferConfig; uint32_t in_scalar_cb_id = tt::CBIndex::c_4; uint32_t in_scalar_cb_pagesize = tile_size(in_df); uint32_t in_scalar_cb_npages = 1; @@ -326,12 +331,16 @@ Pool2D::MultiCore::cached_program_t pool2d_multi_core_sharded_with_halo_v2_impl_ "reader_max_pool_2d_multi_core_sharded_with_halo_v2.cpp"; } - auto reader0_config = DataMovementConfig{ - .processor = DataMovementProcessor::RISCV_0, .noc = NOC::RISCV_0_default, .compile_args = reader0_ct_args}; + auto reader0_config = tt::tt_metal::DataMovementConfig{ + .processor = tt::tt_metal::DataMovementProcessor::RISCV_0, + .noc = tt::tt_metal::NOC::RISCV_0_default, + .compile_args = reader0_ct_args}; auto reader0_kernel = CreateKernel(program, reader_kernel_fname, all_cores, reader0_config); - auto reader1_config = DataMovementConfig{ - .processor = DataMovementProcessor::RISCV_1, .noc = NOC::RISCV_1_default, .compile_args = reader1_ct_args}; + auto reader1_config = tt::tt_metal::DataMovementConfig{ + .processor = tt::tt_metal::DataMovementProcessor::RISCV_1, + .noc = tt::tt_metal::NOC::RISCV_1_default, + .compile_args = reader1_ct_args}; auto reader1_kernel = split_reader ? 
CreateKernel(program, reader_kernel_fname, all_cores, reader1_config) : 0; /** @@ -359,7 +368,7 @@ Pool2D::MultiCore::cached_program_t pool2d_multi_core_sharded_with_halo_v2_impl_ auto reduce_op = tt::tt_metal::ReduceOpMath::MAX; auto reduce_dim = tt::tt_metal::ReduceOpDim::H; - auto compute_config = ComputeConfig{ + auto compute_config = tt::tt_metal::ComputeConfig{ .math_fidelity = MathFidelity::HiFi4, .fp32_dest_acc_en = false, .math_approx_mode = false, diff --git a/ttnn/cpp/ttnn/operations/pool/generic/device/pool_op.cpp b/ttnn/cpp/ttnn/operations/pool/generic/device/pool_op.cpp index 800a8a77054..9ddbf404980 100644 --- a/ttnn/cpp/ttnn/operations/pool/generic/device/pool_op.cpp +++ b/ttnn/cpp/ttnn/operations/pool/generic/device/pool_op.cpp @@ -95,13 +95,13 @@ Pool2D::spec_return_value_t Pool2D::compute_output_specs( uint32_t out_nhw_per_core = output_shape[0] * output_shape[1] * output_shape[2] / ncores; CoreRangeSet shard_grid = sliding_window_config.core_range_set; std::array shard_shape = {out_nhw_per_core, input.get_padded_shape()[-1]}; - mem_config.shard_spec = ShardSpec{shard_grid, shard_shape, ShardOrientation::ROW_MAJOR}; + mem_config.shard_spec = tt::tt_metal::ShardSpec{shard_grid, shard_shape, ShardOrientation::ROW_MAJOR}; } return TensorSpec( output_shape, - TensorLayout::fromPaddedShape( - output_dtype, PageConfig(input.get_layout()), mem_config, output_shape, padded_output_shape)); + tt::tt_metal::TensorLayout::fromPaddedShape( + output_dtype, tt::tt_metal::PageConfig(input.get_layout()), mem_config, output_shape, padded_output_shape)); } Pool2D::tensor_return_value_t Pool2D::create_output_tensors( @@ -114,11 +114,11 @@ tt::stl::hash::hash_t Pool2D::compute_program_hash( const operation_attributes_t& op_attr, const tensor_args_t& tensors) { auto input_mem_config = tensors.input_tensor_.memory_config(); auto dtype = tensors.input_tensor_.dtype(); - return operation::hash_operation( + return tt::tt_metal::operation::hash_operation( 
op_attr.sliding_window_config_.get_hash(), op_attr.pool_type_, op_attr.memory_config_, input_mem_config, dtype); } -operation::OpPerformanceModel Pool2D::create_op_performance_model( +tt::tt_metal::operation::OpPerformanceModel Pool2D::create_op_performance_model( const operation_attributes_t& op_attr, const tensor_args_t& inputs, const Tensor& output) { const auto& input = inputs.input_tensor_; const auto& input_shape = input.get_logical_shape(); @@ -150,7 +150,7 @@ operation::OpPerformanceModel Pool2D::create_op_performance_model( int ideal_dev_clock_cycles = std::ceil((float)num_mul_adds / (float)(num_cores * tensix_mul_adds_per_cycle_lofi)); - operation::OpPerformanceModel result({input}, {output}, ideal_dev_clock_cycles); + tt::tt_metal::operation::OpPerformanceModel result({input}, {output}, ideal_dev_clock_cycles); return result; } diff --git a/ttnn/cpp/ttnn/operations/pool/generic/device/pool_op.hpp b/ttnn/cpp/ttnn/operations/pool/generic/device/pool_op.hpp index 7077436c97c..c125c099012 100644 --- a/ttnn/cpp/ttnn/operations/pool/generic/device/pool_op.hpp +++ b/ttnn/cpp/ttnn/operations/pool/generic/device/pool_op.hpp @@ -40,10 +40,10 @@ struct Pool2D { struct MultiCore { struct shared_variables_t { - KernelHandle reader0_kernel; - KernelHandle reader1_kernel; - CBHandle raw_in_cb; - CBHandle cb_out; + tt::tt_metal::KernelHandle reader0_kernel; + tt::tt_metal::KernelHandle reader1_kernel; + tt::tt_metal::CBHandle raw_in_cb; + tt::tt_metal::CBHandle cb_out; uint32_t ncores; uint32_t ncores_w; std::shared_ptr reader_indices_buffer; @@ -70,7 +70,7 @@ struct Pool2D { static spec_return_value_t compute_output_specs(const operation_attributes_t&, const tensor_args_t&); static Tensor create_output_tensors(const operation_attributes_t&, const tensor_args_t&); static tt::stl::hash::hash_t compute_program_hash(const operation_attributes_t&, const tensor_args_t&); - static operation::OpPerformanceModel create_op_performance_model( + static 
tt::tt_metal::operation::OpPerformanceModel create_op_performance_model( const operation_attributes_t&, const tensor_args_t&, const Tensor&); static std::tuple invoke( diff --git a/ttnn/cpp/ttnn/operations/pool/generic/generic_pools.cpp b/ttnn/cpp/ttnn/operations/pool/generic/generic_pools.cpp index 0c62e5c91f8..d6c04b0d593 100644 --- a/ttnn/cpp/ttnn/operations/pool/generic/generic_pools.cpp +++ b/ttnn/cpp/ttnn/operations/pool/generic/generic_pools.cpp @@ -113,7 +113,7 @@ Tensor Pool2DOp::invoke( uint32_t output_nhw_padded = tt::round_up(output_nhw, num_cores_nhw * (is_out_tiled ? tt::constants::TILE_HEIGHT : 1)); uint32_t output_shard_height_padded = output_nhw_padded / num_cores_nhw; log_debug(tt::LogOp, "output_nhw: {}, output_nhw_padded: {}, output_shard_height_padded: {}, output_shard_width_padded: {}", output_nhw, output_nhw_padded, output_shard_height_padded, output_shard_width_padded); - out_memory_config.shard_spec = ShardSpec{shard_spec.grid, {output_shard_height_padded, output_shard_width_padded}, ShardOrientation::ROW_MAJOR}; + out_memory_config.shard_spec = tt::tt_metal::ShardSpec{shard_spec.grid, {output_shard_height_padded, output_shard_width_padded}, ShardOrientation::ROW_MAJOR}; sliding_window_config = sliding_window::SlidingWindowConfig{ .batch_size = batch_size, diff --git a/ttnn/cpp/ttnn/operations/pool/global_avg_pool/global_avg_pool.hpp b/ttnn/cpp/ttnn/operations/pool/global_avg_pool/global_avg_pool.hpp index c14ab6c97b6..80f1987555f 100644 --- a/ttnn/cpp/ttnn/operations/pool/global_avg_pool/global_avg_pool.hpp +++ b/ttnn/cpp/ttnn/operations/pool/global_avg_pool/global_avg_pool.hpp @@ -15,7 +15,7 @@ enum class PoolType { AVG }; Tensor global_avg_pool2d( const Tensor& input, - const MemoryConfig& memory_config = operation::DEFAULT_OUTPUT_MEMORY_CONFIG, + const MemoryConfig& memory_config = tt::tt_metal::operation::DEFAULT_OUTPUT_MEMORY_CONFIG, const std::optional& output_dtype = std::nullopt); } // namespace tt_metal diff --git 
a/ttnn/cpp/ttnn/operations/pool/upsample/device/upsample_bilinear_program_factory_multicore.cpp b/ttnn/cpp/ttnn/operations/pool/upsample/device/upsample_bilinear_program_factory_multicore.cpp index d65937c9b2f..86e7364f9a3 100644 --- a/ttnn/cpp/ttnn/operations/pool/upsample/device/upsample_bilinear_program_factory_multicore.cpp +++ b/ttnn/cpp/ttnn/operations/pool/upsample/device/upsample_bilinear_program_factory_multicore.cpp @@ -4,6 +4,8 @@ #include +#include "tt-metalium/circular_buffer.hpp" +#include "tt-metalium/circular_buffer_types.hpp" #include "upsample_op.hpp" #include "ttnn/operations/math.hpp" @@ -59,13 +61,13 @@ Tensor HaloTensorCreation(const Tensor& input) { return halo_output; } -operation::ProgramWithCallbacks bilinear_multi_core( +tt::tt_metal::operation::ProgramWithCallbacks bilinear_multi_core( const Tensor& input, Tensor& output, const uint32_t scale_factor_h, const uint32_t scale_factor_w, const DeviceComputeKernelConfig compute_kernel_config) { - Program program = CreateProgram(); + Program program = tt::tt_metal::CreateProgram(); IDevice* device = input.device(); auto input_shape = input.get_padded_shape(); @@ -137,6 +139,10 @@ operation::ProgramWithCallbacks bilinear_multi_core( auto halo_shard_shape = halo_in.shard_spec().value().shape; // CBs + using tt::tt_metal::CBHandle; + using tt::tt_metal::CircularBuffer; + using tt::tt_metal::CircularBufferConfig; + uint32_t buffering_factor = 2; // input data is in a sharded CB @@ -261,14 +267,14 @@ operation::ProgramWithCallbacks bilinear_multi_core( output_nsticks_per_core, // loop count with blocks }; - auto reader_kernel = - CreateKernel(program, reader_kernel_fname, all_cores, ReaderDataMovementConfig(reader_compile_time_args)); - auto writer_kernel = - CreateKernel(program, writer_kernel_fname, all_cores, WriterDataMovementConfig(writer_compile_time_args)); + auto reader_kernel = CreateKernel( + program, reader_kernel_fname, all_cores, 
tt::tt_metal::ReaderDataMovementConfig(reader_compile_time_args)); + auto writer_kernel = CreateKernel( + program, writer_kernel_fname, all_cores, tt::tt_metal::WriterDataMovementConfig(writer_compile_time_args)); TT_FATAL(fp32_dest_acc_en == false, "fp32_dest_acc_en as true not supported. #12787 issue raised"); - auto reduce_op = ReduceOpMath::SUM; - auto reduce_dim = ReduceOpDim::H; - auto compute_config = ComputeConfig{ + auto reduce_op = tt::tt_metal::ReduceOpMath::SUM; + auto reduce_dim = tt::tt_metal::ReduceOpDim::H; + auto compute_config = tt::tt_metal::ComputeConfig{ .math_fidelity = math_fidelity, .fp32_dest_acc_en = fp32_dest_acc_en, .math_approx_mode = math_approx_mode, diff --git a/ttnn/cpp/ttnn/operations/pool/upsample/device/upsample_op.hpp b/ttnn/cpp/ttnn/operations/pool/upsample/device/upsample_op.hpp index be183519d98..5b3d25651d9 100644 --- a/ttnn/cpp/ttnn/operations/pool/upsample/device/upsample_op.hpp +++ b/ttnn/cpp/ttnn/operations/pool/upsample/device/upsample_op.hpp @@ -16,21 +16,21 @@ struct UpSample { const int scale_factor_h_; const int scale_factor_w_; const string mode_; - const MemoryConfig output_mem_config_; + const tt::tt_metal::MemoryConfig output_mem_config_; const DeviceComputeKernelConfig compute_kernel_config_; void validate(const std::vector& input_tensors) const; std::vector compute_output_specs(const std::vector& input_tensors) const; - operation::ProgramWithCallbacks create_program( + tt::tt_metal::operation::ProgramWithCallbacks create_program( const std::vector& input_tensors, std::vector& output_tensors) const; UpSampleParallelizationStrategy get_parallelization_strategy(const std::vector& input_tensors) const; }; -operation::ProgramWithCallbacks upsample_single_core( +tt::tt_metal::operation::ProgramWithCallbacks upsample_single_core( const Tensor& input, Tensor& output, const uint32_t scale_factor_h, const uint32_t scale_factor_w); -operation::ProgramWithCallbacks upsample_multi_core( 
+tt::tt_metal::operation::ProgramWithCallbacks upsample_multi_core( const Tensor& input, Tensor& output, const uint32_t scale_factor_h, const uint32_t scale_factor_w); -operation::ProgramWithCallbacks bilinear_multi_core( +tt::tt_metal::operation::ProgramWithCallbacks bilinear_multi_core( const Tensor& input, Tensor& output, const uint32_t scale_factor_h, diff --git a/ttnn/cpp/ttnn/operations/pool/upsample/upsample.cpp b/ttnn/cpp/ttnn/operations/pool/upsample/upsample.cpp index cad2cc12345..60ab8bd2733 100644 --- a/ttnn/cpp/ttnn/operations/pool/upsample/upsample.cpp +++ b/ttnn/cpp/ttnn/operations/pool/upsample/upsample.cpp @@ -53,7 +53,8 @@ ttnn::Tensor ExecuteUpSample::invoke( } // return ttnn::upsample(input_tensor, scale_h, scale_w, mem_config); - auto output_tensor = operation::run(UpSample{scale_h, scale_w, mode, mem_config, config}, {input_tensor}).front(); + auto output_tensor = + tt::tt_metal::operation::run(UpSample{scale_h, scale_w, mode, mem_config, config}, {input_tensor}).front(); return output_tensor; } diff --git a/ttnn/cpp/ttnn/operations/prefetcher/prefetcher/device/dram_prefetcher_op.cpp b/ttnn/cpp/ttnn/operations/prefetcher/prefetcher/device/dram_prefetcher_op.cpp index 1789b4b1b2c..b5adf303e8e 100644 --- a/ttnn/cpp/ttnn/operations/prefetcher/prefetcher/device/dram_prefetcher_op.cpp +++ b/ttnn/cpp/ttnn/operations/prefetcher/prefetcher/device/dram_prefetcher_op.cpp @@ -21,7 +21,7 @@ void DramPrefetcher::validate(const std::vector& input_tensors) const { TT_FATAL(global_cb.has_value(), "Global circular buffer must be provided"); ttnn::Tensor tensor_addrs = input_tensors.back(); // Last tensor is tensor_addrs - auto global_cb = get_global_circular_buffer(*this->global_cb, input_tensors[0].device()->id()); + auto global_cb = tt::tt_metal::get_global_circular_buffer(*this->global_cb, input_tensors[0].device()->id()); uint32_t num_receiver_cores = global_cb.receiver_cores().num_cores(); uint32_t num_sender_cores = global_cb.sender_cores().num_cores(); 
@@ -75,11 +75,12 @@ void DramPrefetcher::validate(const std::vector& input_tensors) const { std::vector DramPrefetcher::compute_output_specs(const std::vector& input_tensors) const { return {TensorSpec( ttnn::Shape{32, 32}, - TensorLayout(input_tensors[0].get_dtype(), PageConfig(input_tensors[0].get_layout()), MemoryConfig{}))}; + tt::tt_metal::TensorLayout( + input_tensors[0].get_dtype(), tt::tt_metal::PageConfig(input_tensors[0].get_layout()), MemoryConfig{}))}; } -operation::ProgramWithCallbacks DramPrefetcher::create_program( +tt::tt_metal::operation::ProgramWithCallbacks DramPrefetcher::create_program( const std::vector& input_tensors, std::vector& output_tensors) const { - auto global_cb = get_global_circular_buffer(*this->global_cb, input_tensors[0].device()->id()); + auto global_cb = tt::tt_metal::get_global_circular_buffer(*this->global_cb, input_tensors[0].device()->id()); return dram_prefetcher_multi_core(input_tensors, this->num_layers, global_cb); } diff --git a/ttnn/cpp/ttnn/operations/prefetcher/prefetcher/device/dram_prefetcher_op.hpp b/ttnn/cpp/ttnn/operations/prefetcher/prefetcher/device/dram_prefetcher_op.hpp index 6b320d8663e..1497f479828 100644 --- a/ttnn/cpp/ttnn/operations/prefetcher/prefetcher/device/dram_prefetcher_op.hpp +++ b/ttnn/cpp/ttnn/operations/prefetcher/prefetcher/device/dram_prefetcher_op.hpp @@ -17,18 +17,18 @@ namespace ttnn::operations::dram_prefetcher { -operation::ProgramWithCallbacks dram_prefetcher_multi_core( +tt::tt_metal::operation::ProgramWithCallbacks dram_prefetcher_multi_core( const std::vector& input_tensors, const uint32_t num_layers, const tt::tt_metal::v1::experimental::GlobalCircularBuffer& global_cb); struct DramPrefetcher { - const std::optional global_cb; + const std::optional global_cb; const uint32_t num_layers; void validate(const std::vector& input_tensors) const; std::vector compute_output_specs(const std::vector& input_tensors) const; - operation::ProgramWithCallbacks create_program( + 
tt::tt_metal::operation::ProgramWithCallbacks create_program( const std::vector& input_tensors, std::vector& output_tensors) const; }; diff --git a/ttnn/cpp/ttnn/operations/prefetcher/prefetcher/dram_prefetcher.cpp b/ttnn/cpp/ttnn/operations/prefetcher/prefetcher/dram_prefetcher.cpp index db379545bc9..09b018160fe 100644 --- a/ttnn/cpp/ttnn/operations/prefetcher/prefetcher/dram_prefetcher.cpp +++ b/ttnn/cpp/ttnn/operations/prefetcher/prefetcher/dram_prefetcher.cpp @@ -14,14 +14,14 @@ namespace ttnn::operations::dram_prefetcher { Tensor ExecuteDramPrefetcher::invoke( std::vector& tensors, const uint32_t num_layers, - const std::optional& global_cb) { - std::vector output_tensors = {Tensor(operation::get_workers_for_op_output(tensors))}; - operation::launch_op( + const std::optional& global_cb) { + std::vector output_tensors = {Tensor(tt::tt_metal::operation::get_workers_for_op_output(tensors))}; + tt::tt_metal::operation::launch_op( [num_layers, global_cb]( const std::vector& input_tensors, const std::vector>& optional_input_tensors, const std::vector>& optional_output_tensors) mutable -> std::vector { - return operation::run(DramPrefetcher{global_cb, num_layers}, input_tensors); + return tt::tt_metal::operation::run(DramPrefetcher{global_cb, num_layers}, input_tensors); }, tensors, output_tensors); diff --git a/ttnn/cpp/ttnn/operations/prefetcher/prefetcher/dram_prefetcher.hpp b/ttnn/cpp/ttnn/operations/prefetcher/prefetcher/dram_prefetcher.hpp index 8841bc75f2d..4144fe64abc 100644 --- a/ttnn/cpp/ttnn/operations/prefetcher/prefetcher/dram_prefetcher.hpp +++ b/ttnn/cpp/ttnn/operations/prefetcher/prefetcher/dram_prefetcher.hpp @@ -17,7 +17,7 @@ struct ExecuteDramPrefetcher { static ttnn::Tensor invoke( std::vector& tensors, const uint32_t num_layers, - const std::optional& global_cb); + const std::optional& global_cb); }; } // namespace operations::dram_prefetcher diff --git a/ttnn/cpp/ttnn/operations/reduction/argmax/argmax.cpp 
b/ttnn/cpp/ttnn/operations/reduction/argmax/argmax.cpp index b94832bcb55..366cda19e8e 100644 --- a/ttnn/cpp/ttnn/operations/reduction/argmax/argmax.cpp +++ b/ttnn/cpp/ttnn/operations/reduction/argmax/argmax.cpp @@ -20,7 +20,7 @@ ttnn::Tensor ArgMaxOperation::invoke( const bool use_muticore, const std::optional& memory_config, std::optional optional_output_tensor) { - return operation::run( + return tt::tt_metal::operation::run( ArgMax{ tt::tt_metal::DataType::UINT32, dim, diff --git a/ttnn/cpp/ttnn/operations/reduction/argmax/device/argmax_op.hpp b/ttnn/cpp/ttnn/operations/reduction/argmax/device/argmax_op.hpp index bd6bc5ed104..deacf7ecd15 100644 --- a/ttnn/cpp/ttnn/operations/reduction/argmax/device/argmax_op.hpp +++ b/ttnn/cpp/ttnn/operations/reduction/argmax/device/argmax_op.hpp @@ -13,10 +13,10 @@ namespace ttnn::operations::reduction { struct ArgMax { - const DataType output_dtype; + const tt::tt_metal::DataType output_dtype; const std::optional dim; const bool use_multicore; - const MemoryConfig output_mem_config; + const tt::tt_metal::MemoryConfig output_mem_config; void validate_with_output_tensors( const std::vector& input_tensors, const std::vector>& output_tensors) const; @@ -24,7 +24,7 @@ struct ArgMax { const std::vector& input_tensors, const std::vector>& output_tensors) const; std::vector create_output_tensors( const std::vector& input_tensors, const std::vector>& output_tensors) const; - operation::ProgramWithCallbacks create_program( + tt::tt_metal::operation::ProgramWithCallbacks create_program( const std::vector& input_tensors, std::vector& output_tensors) const; }; diff --git a/ttnn/cpp/ttnn/operations/reduction/argmax/device/argmax_program_factory.hpp b/ttnn/cpp/ttnn/operations/reduction/argmax/device/argmax_program_factory.hpp index dfdcbf91065..a6d4527967f 100644 --- a/ttnn/cpp/ttnn/operations/reduction/argmax/device/argmax_program_factory.hpp +++ b/ttnn/cpp/ttnn/operations/reduction/argmax/device/argmax_program_factory.hpp @@ -7,10 +7,10 @@ 
namespace ttnn::operations::reduction::detail { using namespace tt::constants; -operation::ProgramWithCallbacks argmax_single_core( +tt::tt_metal::operation::ProgramWithCallbacks argmax_single_core( const Tensor& input, const Tensor& output, const std::optional dim); -operation::ProgramWithCallbacks argmax_multi_core( +tt::tt_metal::operation::ProgramWithCallbacks argmax_multi_core( const Tensor& input, const Tensor& output, const std::optional dim); } // namespace ttnn::operations::reduction::detail diff --git a/ttnn/cpp/ttnn/operations/reduction/generic/device/reduce_op.cpp b/ttnn/cpp/ttnn/operations/reduction/generic/device/reduce_op.cpp index b793645b5da..3830d528a7b 100644 --- a/ttnn/cpp/ttnn/operations/reduction/generic/device/reduce_op.cpp +++ b/ttnn/cpp/ttnn/operations/reduction/generic/device/reduce_op.cpp @@ -14,22 +14,20 @@ using namespace tt::constants; namespace reduce_op_utils { -using namespace tt::tt_metal; - -std::map get_defines(ReduceOpMath reduce_op, ReduceOpDim reduce_dim) { +std::map get_defines(tt::tt_metal::ReduceOpMath reduce_op, tt::tt_metal::ReduceOpDim reduce_dim) { std::map defines; // TOOD(AP): need a sync with Reduce::Max from HLK headers - bool do_max = reduce_op == ReduceOpMath::MAX; + bool do_max = reduce_op == tt::tt_metal::ReduceOpMath::MAX; string reduce_dim_str; switch (reduce_dim) { - case ReduceOpDim::W: reduce_dim_str = "ReduceDim::REDUCE_ROW"; break; - case ReduceOpDim::H: reduce_dim_str = "ReduceDim::REDUCE_COL"; break; - case ReduceOpDim::HW: reduce_dim_str = "ReduceDim::REDUCE_SCALAR"; break; + case tt::tt_metal::ReduceOpDim::W: reduce_dim_str = "ReduceDim::REDUCE_ROW"; break; + case tt::tt_metal::ReduceOpDim::H: reduce_dim_str = "ReduceDim::REDUCE_COL"; break; + case tt::tt_metal::ReduceOpDim::HW: reduce_dim_str = "ReduceDim::REDUCE_SCALAR"; break; default: TT_ASSERT(false && "Invalid reduce_op!"); } defines["REDUCE_OP"] = (do_max ? 
"PoolType::MAX" : "PoolType::SUM"); defines["REDUCE_DIM"] = reduce_dim_str; - if (reduce_dim == ReduceOpDim::W && reduce_op == ReduceOpMath::SUM) { + if (reduce_dim == tt::tt_metal::ReduceOpDim::W && reduce_op == tt::tt_metal::ReduceOpMath::SUM) { defines["REDUCE_ROW_SUM_VIA_MM"] = 1; } return defines; @@ -158,7 +156,7 @@ Tensor reduce_min( const Tensor& input_tensor, ReduceOpDim reduce_dim, float scaler = 1.0f, - const MemoryConfig& output_mem_config = operation::DEFAULT_OUTPUT_MEMORY_CONFIG, + const MemoryConfig& output_mem_config = tt::tt_metal::operation::DEFAULT_OUTPUT_MEMORY_CONFIG, const std::optional& compute_kernel_config = std::nullopt) { Tensor input = input_tensor; if (input.get_layout() == Layout::ROW_MAJOR && input.storage_type() == StorageType::DEVICE) { diff --git a/ttnn/cpp/ttnn/operations/reduction/generic/device/reduce_op.hpp b/ttnn/cpp/ttnn/operations/reduction/generic/device/reduce_op.hpp index 503f4cd7d2f..26f98b467e4 100644 --- a/ttnn/cpp/ttnn/operations/reduction/generic/device/reduce_op.hpp +++ b/ttnn/cpp/ttnn/operations/reduction/generic/device/reduce_op.hpp @@ -16,19 +16,19 @@ namespace tt { namespace tt_metal { // TODO: Accept parallelization -operation::ProgramWithCallbacks reduce_single_core_hw( +tt::tt_metal::operation::ProgramWithCallbacks reduce_single_core_hw( const Tensor& input_tensor, Tensor& output_tensor, ReduceOpMath reduce_math, const ttnn::DeviceComputeKernelConfig& compute_kernel_config, float scaler = 1.0f); -operation::ProgramWithCallbacks reduce_multi_core_h( +tt::tt_metal::operation::ProgramWithCallbacks reduce_multi_core_h( const Tensor& input_tensor, Tensor& output_tensor, ReduceOpMath reduce_math, const ttnn::DeviceComputeKernelConfig& compute_kernel_config, float scaler = 1.0f); -operation::ProgramWithCallbacks reduce_multi_core_w( +tt::tt_metal::operation::ProgramWithCallbacks reduce_multi_core_w( const Tensor& input_tensor, Tensor& output_tensor, ReduceOpMath reduce_math, @@ -45,7 +45,7 @@ struct Reduce { void 
validate(const std::vector& input_tensors) const; std::vector compute_output_specs(const std::vector& input_tensors) const; - operation::ProgramWithCallbacks create_program( + tt::tt_metal::operation::ProgramWithCallbacks create_program( const std::vector& input_tensors, std::vector& output_tensors) const; ReduceOpParallelizationStrategy get_parallelization_strategy(const std::vector& input_tensors) const; }; @@ -55,7 +55,7 @@ Tensor reduce( ReduceOpMath reduce_math, ReduceOpDim reduce_dim, float scaler = 1.0f, - const MemoryConfig& output_mem_config = operation::DEFAULT_OUTPUT_MEMORY_CONFIG, + const MemoryConfig& output_mem_config = tt::tt_metal::operation::DEFAULT_OUTPUT_MEMORY_CONFIG, const std::optional& output_dtype = std::nullopt, const std::optional& compute_kernel_config = std::nullopt); @@ -65,6 +65,6 @@ Tensor reduce( namespace reduce_op_utils { -std::map get_defines(ReduceOpMath reduce_op, ReduceOpDim reduce_dim); +std::map get_defines(tt::tt_metal::ReduceOpMath reduce_op, tt::tt_metal::ReduceOpDim reduce_dim); } // namespace reduce_op_utils diff --git a/ttnn/cpp/ttnn/operations/reduction/moe/device/moe_op.hpp b/ttnn/cpp/ttnn/operations/reduction/moe/device/moe_op.hpp index f380dbb0a87..4efdd1a03fa 100644 --- a/ttnn/cpp/ttnn/operations/reduction/moe/device/moe_op.hpp +++ b/ttnn/cpp/ttnn/operations/reduction/moe/device/moe_op.hpp @@ -13,7 +13,7 @@ namespace ttnn::operations::reduction { struct MoeDeviceOperation { const uint16_t k; - const MemoryConfig output_mem_config; + const tt::tt_metal::MemoryConfig output_mem_config; void validate_with_output_tensors( const std::vector& input_tensors, const std::vector>& output_tensors) const; @@ -21,7 +21,7 @@ struct MoeDeviceOperation { const std::vector& input_tensors, const std::vector>& output_tensors) const; std::vector create_output_tensors( const std::vector& input_tensors, const std::vector>& output_tensors) const; - operation::ProgramWithCallbacks create_program( + 
tt::tt_metal::operation::ProgramWithCallbacks create_program( const std::vector& input_tensors, std::vector& output_tensors) const; }; diff --git a/ttnn/cpp/ttnn/operations/reduction/moe/device/moe_program_factory.hpp b/ttnn/cpp/ttnn/operations/reduction/moe/device/moe_program_factory.hpp index 079e51a94da..68b4e72b9a6 100644 --- a/ttnn/cpp/ttnn/operations/reduction/moe/device/moe_program_factory.hpp +++ b/ttnn/cpp/ttnn/operations/reduction/moe/device/moe_program_factory.hpp @@ -6,11 +6,10 @@ namespace ttnn::operations::reduction::detail { -operation::ProgramWithCallbacks moe_single_core_interleaved( +tt::tt_metal::operation::ProgramWithCallbacks moe_single_core_interleaved( const Tensor& input_tensor, const Tensor& expert_mask_tensor, const Tensor& topk_mask_tensor, const uint16_t k, Tensor& out_tensor); - } diff --git a/ttnn/cpp/ttnn/operations/reduction/moe/moe.cpp b/ttnn/cpp/ttnn/operations/reduction/moe/moe.cpp index a230aea3d7d..d088aadf834 100644 --- a/ttnn/cpp/ttnn/operations/reduction/moe/moe.cpp +++ b/ttnn/cpp/ttnn/operations/reduction/moe/moe.cpp @@ -23,7 +23,7 @@ ttnn::Tensor MoeOperation::invoke( const uint16_t k, const std::optional& memory_config, std::optional optional_output_tensor) { - return operation::run( + return tt::tt_metal::operation::run( MoeDeviceOperation{k, memory_config.value_or(input_tensor.memory_config())}, {input_tensor, expert_mask_tensor, topk_mask_tensor}, {}, @@ -54,7 +54,8 @@ std::vector MoeOperation::create_async_output_tensors( const auto& input_tensor = input_tensors.at(0); const auto& expert_mask_tensor = input_tensors.at(1); const auto& topk_mask_tensor = input_tensors.at(2); - return {Tensor(operation::get_workers_for_op_output({input_tensor, expert_mask_tensor, topk_mask_tensor}))}; + return {Tensor( + tt::tt_metal::operation::get_workers_for_op_output({input_tensor, expert_mask_tensor, topk_mask_tensor}))}; } } // namespace ttnn::operations::reduction diff --git 
a/ttnn/cpp/ttnn/operations/reduction/prod/device/prod_all_program_factory.cpp b/ttnn/cpp/ttnn/operations/reduction/prod/device/prod_all_program_factory.cpp index 8015799a2b8..390fab8ca98 100644 --- a/ttnn/cpp/ttnn/operations/reduction/prod/device/prod_all_program_factory.cpp +++ b/ttnn/cpp/ttnn/operations/reduction/prod/device/prod_all_program_factory.cpp @@ -12,8 +12,9 @@ using namespace constants; namespace operations { namespace primary { -operation::ProgramWithCallbacks prod_single_core(const Tensor& a, const Tensor& output) { - Program program{}; +tt::tt_metal::operation::ProgramWithCallbacks prod_single_core( + const tt::tt_metal::Tensor& a, const tt::tt_metal::Tensor& output) { + tt::tt_metal::Program program{}; CoreRange core({0, 0}, {0, 0}); @@ -86,9 +87,9 @@ operation::ProgramWithCallbacks prod_single_core(const Tensor& a, const Tensor& SetRuntimeArgs(program, unary_writer_kernel_id, core, {dst_buffer->address(), num_tiles, 0}); auto override_runtime_args_callback = [unary_reader_kernel_id, unary_writer_kernel_id]( - const Program& program, - const std::vector& input_buffers, - const std::vector& output_buffers) { + const tt::tt_metal::Program& program, + const std::vector& input_buffers, + const std::vector& output_buffers) { auto src_buffer = input_buffers.at(0); auto dst_buffer = output_buffers.at(0); diff --git a/ttnn/cpp/ttnn/operations/reduction/prod/device/prod_nc_op.hpp b/ttnn/cpp/ttnn/operations/reduction/prod/device/prod_nc_op.hpp index e061c1ddeb1..49974dfce0e 100644 --- a/ttnn/cpp/ttnn/operations/reduction/prod/device/prod_nc_op.hpp +++ b/ttnn/cpp/ttnn/operations/reduction/prod/device/prod_nc_op.hpp @@ -25,23 +25,23 @@ struct Prod { void validate(const std::vector& inputs) const; std::vector compute_output_specs(const std::vector& inputs) const; std::vector create_output_tensors(const std::vector& inputs) const; - operation::ProgramWithCallbacks create_program( + tt::tt_metal::operation::ProgramWithCallbacks create_program( const std::vector& 
inputs, std::vector& outputs) const; }; -operation::ProgramWithCallbacks prod_nc_format(const Tensor& input, const Tensor& output, int64_t dim); +tt::tt_metal::operation::ProgramWithCallbacks prod_nc_format(const Tensor& input, const Tensor& output, int64_t dim); Tensor prod_( const Tensor& input, std::optional> output, const int64_t& dim, - const MemoryConfig& mem_config = operation::DEFAULT_OUTPUT_MEMORY_CONFIG); + const MemoryConfig& mem_config = tt::tt_metal::operation::DEFAULT_OUTPUT_MEMORY_CONFIG); Tensor prod_nc( const Tensor& input, const Tensor& output, ttnn::SmallVector& dims, - const MemoryConfig& output_mem_config = operation::DEFAULT_OUTPUT_MEMORY_CONFIG); + const MemoryConfig& output_mem_config = tt::tt_metal::operation::DEFAULT_OUTPUT_MEMORY_CONFIG); } // namespace primary diff --git a/ttnn/cpp/ttnn/operations/reduction/prod/device/prod_nc_program_factory.cpp b/ttnn/cpp/ttnn/operations/reduction/prod/device/prod_nc_program_factory.cpp index 38d4f885d43..5f2220bc7d4 100644 --- a/ttnn/cpp/ttnn/operations/reduction/prod/device/prod_nc_program_factory.cpp +++ b/ttnn/cpp/ttnn/operations/reduction/prod/device/prod_nc_program_factory.cpp @@ -9,28 +9,27 @@ #include #include "ttnn/operation.hpp" -using namespace tt::tt_metal; - namespace tt { using namespace constants; namespace operations { namespace primary { -operation::ProgramWithCallbacks prod_nc_format(const Tensor& input, const Tensor& output, int64_t dim) { +tt::tt_metal::operation::ProgramWithCallbacks prod_nc_format( + const tt::tt_metal::Tensor& input, const tt::tt_metal::Tensor& output, int64_t dim) { TT_ASSERT(dim == 0 || dim == 1); //////////////////////////////////////////////////////////////////////////// // Device Setup //////////////////////////////////////////////////////////////////////////// auto* device = input.device(); - auto program = Program(); + auto program = tt::tt_metal::Program(); //////////////////////////////////////////////////////////////////////////// // Parameters Setup 
//////////////////////////////////////////////////////////////////////////// const auto cb_data_format = datatype_to_dataformat_converter(output.get_dtype()); - const auto single_tile_size = detail::TileSize(cb_data_format); + const auto single_tile_size = tt::tt_metal::detail::TileSize(cb_data_format); const auto input_shape = input.get_padded_shape(); const auto input_shape_without_padding = input.get_logical_shape(); @@ -118,7 +117,7 @@ operation::ProgramWithCallbacks prod_nc_format(const Tensor& input, const Tensor const auto compute_kernel_1_id = ttnn::operations::CreateComputeKernel( program, compute_kernel_file, {core_group_1, num_cols_per_core_group_1, compute_args_group_1}, compute_defines); - std::optional compute_kernel_2_id = std::nullopt; + std::optional compute_kernel_2_id = std::nullopt; if (!core_group_2.ranges().empty()) { const std::vector compute_args_group_2{num_cols_per_core_group_2}; compute_kernel_2_id = ttnn::operations::CreateComputeKernel( @@ -179,10 +178,10 @@ operation::ProgramWithCallbacks prod_nc_format(const Tensor& input, const Tensor auto override_runtime_arguments_callback = [reader_kernel_id, writer_kernel_id, num_cores_to_be_used, num_cores_y]( const void* operation, - const Program& program, - const std::vector& input_tensors, - const std::vector>&, - const std::vector& output_tensors) { + const tt::tt_metal::Program& program, + const std::vector& input_tensors, + const std::vector>&, + const std::vector& output_tensors) { const auto* input_buffer = input_tensors.at(0).buffer(); const auto* output_buffer = input_tensors.at(1).buffer(); for (uint32_t i = 0; i < num_cores_to_be_used; ++i) { diff --git a/ttnn/cpp/ttnn/operations/reduction/prod/device/prod_op_all.cpp b/ttnn/cpp/ttnn/operations/reduction/prod/device/prod_op_all.cpp index d9bc6d24ece..7940ced124b 100644 --- a/ttnn/cpp/ttnn/operations/reduction/prod/device/prod_op_all.cpp +++ b/ttnn/cpp/ttnn/operations/reduction/prod/device/prod_op_all.cpp @@ -18,32 +18,35 @@ using 
namespace constants; namespace operations { namespace primary { -void Prod_op::validate(const std::vector& input_tensors) const { +void Prod_op::validate(const std::vector& input_tensors) const { const auto& input_tensor_a = input_tensors.at(0); - TT_FATAL(input_tensor_a.storage_type() == StorageType::DEVICE, "Operands need to be on device!"); + TT_FATAL(input_tensor_a.storage_type() == tt::tt_metal::StorageType::DEVICE, "Operands need to be on device!"); TT_FATAL(input_tensor_a.buffer() != nullptr, "Operands need to be allocated in buffers on device!"); - TT_FATAL((input_tensor_a.get_layout() == Layout::TILE), "Input Layout must be tilized"); - TT_FATAL(input_tensor_a.memory_config().memory_layout == TensorMemoryLayout::INTERLEAVED, "Error"); - TT_FATAL(input_tensor_a.get_dtype() == DataType::BFLOAT16, "Error"); + TT_FATAL((input_tensor_a.get_layout() == tt::tt_metal::Layout::TILE), "Input Layout must be tilized"); + TT_FATAL(input_tensor_a.memory_config().memory_layout == tt::tt_metal::TensorMemoryLayout::INTERLEAVED, "Error"); + TT_FATAL(input_tensor_a.get_dtype() == tt::tt_metal::DataType::BFLOAT16, "Error"); } -std::vector Prod_op::compute_output_specs(const std::vector& input_tensors) const { +std::vector Prod_op::compute_output_specs( + const std::vector& input_tensors) const { const auto& input_tensor = input_tensors.at(0); - return {TensorSpec( + return {tt::tt_metal::TensorSpec( input_tensor.get_logical_shape(), - TensorLayout(input_tensor.get_dtype(), PageConfig(Layout::TILE), output_mem_config))}; + tt::tt_metal::TensorLayout( + input_tensor.get_dtype(), tt::tt_metal::PageConfig(tt::tt_metal::Layout::TILE), output_mem_config))}; } -operation::ProgramWithCallbacks Prod_op::create_program( - const std::vector& input_tensors, std::vector& output_tensors) const { +tt::tt_metal::operation::ProgramWithCallbacks Prod_op::create_program( + const std::vector& input_tensors, std::vector& output_tensors) const { const auto& input_tensor_a = input_tensors.at(0); 
auto& output_tensor = output_tensors.at(0); return prod_single_core(input_tensor_a, output_tensor); } -Tensor prod_all(const Tensor& input, const MemoryConfig& output_mem_config) { - Tensor result = ttnn::tiled_prod( - operation::run(Prod_op{.output_mem_config = output_mem_config}, {input}).at(0), output_mem_config); +tt::tt_metal::Tensor prod_all(const tt::tt_metal::Tensor& input, const tt::tt_metal::MemoryConfig& output_mem_config) { + tt::tt_metal::Tensor result = ttnn::tiled_prod( + tt::tt_metal::operation::run(Prod_op{.output_mem_config = output_mem_config}, {input}).at(0), + output_mem_config); auto arch_env = tt_ClusterDescriptor::detect_arch((chip_id_t)0); if (arch_env == tt::ARCH::WORMHOLE_B0) { return ttnn::prod_result_computation_WH_B0( diff --git a/ttnn/cpp/ttnn/operations/reduction/prod/device/prod_op_all.hpp b/ttnn/cpp/ttnn/operations/reduction/prod/device/prod_op_all.hpp index aea785c5f25..dcde2680edd 100644 --- a/ttnn/cpp/ttnn/operations/reduction/prod/device/prod_op_all.hpp +++ b/ttnn/cpp/ttnn/operations/reduction/prod/device/prod_op_all.hpp @@ -19,17 +19,22 @@ namespace primary { */ struct Prod_op { - const MemoryConfig output_mem_config; - const DataType output_dtype; // TODO: Uplift output_dtype as an option for general dot/bmm - void validate(const std::vector& input_tensors) const; - std::vector compute_output_specs(const std::vector& input_tensors) const; - operation::ProgramWithCallbacks create_program( - const std::vector& input_tensors, std::vector& output_tensors) const; + const tt::tt_metal::MemoryConfig output_mem_config; + const tt::tt_metal::DataType output_dtype; // TODO: Uplift output_dtype as an option for general dot/bmm + void validate(const std::vector& input_tensors) const; + std::vector compute_output_specs( + const std::vector& input_tensors) const; + tt::tt_metal::operation::ProgramWithCallbacks create_program( + const std::vector& input_tensors, + std::vector& output_tensors) const; }; -operation::ProgramWithCallbacks 
prod_single_core(const Tensor& input_tensor_a, const Tensor& output_tensor); +tt::tt_metal::operation::ProgramWithCallbacks prod_single_core( + const tt::tt_metal::Tensor& input_tensor_a, const tt::tt_metal::Tensor& output_tensor); -Tensor prod_all(const Tensor& input, const MemoryConfig& mem_config = operation::DEFAULT_OUTPUT_MEMORY_CONFIG); +tt::tt_metal::Tensor prod_all( + const tt::tt_metal::Tensor& input, + const tt::tt_metal::MemoryConfig& mem_config = tt::tt_metal::operation::DEFAULT_OUTPUT_MEMORY_CONFIG); } // namespace primary } // namespace operations diff --git a/ttnn/cpp/ttnn/operations/reduction/sampling/device/sampling_op.hpp b/ttnn/cpp/ttnn/operations/reduction/sampling/device/sampling_op.hpp index 0841fb17245..a80f8072f31 100644 --- a/ttnn/cpp/ttnn/operations/reduction/sampling/device/sampling_op.hpp +++ b/ttnn/cpp/ttnn/operations/reduction/sampling/device/sampling_op.hpp @@ -24,7 +24,7 @@ struct Sampling { const std::vector& input_tensors, const std::vector>& output_tensors) const; std::vector create_output_tensors( const std::vector& input_tensors, const std::vector>& output_tensors) const; - operation::ProgramWithCallbacks create_program( + tt::tt_metal::operation::ProgramWithCallbacks create_program( const std::vector& input_tensors, std::vector& output_tensors) const; }; diff --git a/ttnn/cpp/ttnn/operations/reduction/sampling/device/sampling_program_factory.cpp b/ttnn/cpp/ttnn/operations/reduction/sampling/device/sampling_program_factory.cpp index 84164dd780b..3099d45ec18 100644 --- a/ttnn/cpp/ttnn/operations/reduction/sampling/device/sampling_program_factory.cpp +++ b/ttnn/cpp/ttnn/operations/reduction/sampling/device/sampling_program_factory.cpp @@ -13,7 +13,7 @@ namespace ttnn::operations::reduction::detail { -operation::ProgramWithCallbacks sampling_multicore_interleaved( +tt::tt_metal::operation::ProgramWithCallbacks sampling_multicore_interleaved( const Tensor& input_values_tensor, const Tensor& input_indices_tensor, const std::vector& 
k, @@ -54,7 +54,7 @@ operation::ProgramWithCallbacks sampling_multicore_interleaved( auto compute_with_storage_grid_size = device->compute_with_storage_grid_size(); uint32_t num_cores_x = compute_with_storage_grid_size.x; uint32_t num_cores_y = compute_with_storage_grid_size.y; - CoreRangeSet core_grid = num_cores_to_corerangeset(num_cores, compute_with_storage_grid_size, true); + CoreRangeSet core_grid = tt::tt_metal::num_cores_to_corerangeset(num_cores, compute_with_storage_grid_size, true); if (sub_core_grids.has_value()) { core_grid = sub_core_grids.value(); @@ -149,8 +149,8 @@ operation::ProgramWithCallbacks sampling_multicore_interleaved( // random number const uint32_t rand_tile_size = tile_size(tt::DataFormat::Float16_b); constexpr uint32_t rand_tile_index = tt::CBIndex::c_11; - CircularBufferConfig cb_rand_config = - CircularBufferConfig(rand_tile_size, {{rand_tile_index, tt::DataFormat::Float16_b}}) + tt::tt_metal::CircularBufferConfig cb_rand_config = + tt::tt_metal::CircularBufferConfig(rand_tile_size, {{rand_tile_index, tt::DataFormat::Float16_b}}) .set_page_size(rand_tile_index, rand_tile_size); auto cb_rand = tt::tt_metal::CreateCircularBuffer(program, core_grid, cb_rand_config); @@ -269,9 +269,9 @@ operation::ProgramWithCallbacks sampling_multicore_interleaved( } auto override_runtime_args_callback = [reader_kernel_id, writer_kernel_ids, cores]( - const Program& program, - const std::vector& input_buffers, - const std::vector& output_buffers) { + const tt::tt_metal::Program& program, + const std::vector& input_buffers, + const std::vector& output_buffers) { auto input_values_buffer = input_buffers.at(0); auto input_indices_buffer = input_buffers.at(1); auto output_buffer = output_buffers.at(0); diff --git a/ttnn/cpp/ttnn/operations/reduction/sampling/device/sampling_program_factory.hpp b/ttnn/cpp/ttnn/operations/reduction/sampling/device/sampling_program_factory.hpp index 0f226d057c0..cd0f04bca82 100644 --- 
a/ttnn/cpp/ttnn/operations/reduction/sampling/device/sampling_program_factory.hpp +++ b/ttnn/cpp/ttnn/operations/reduction/sampling/device/sampling_program_factory.hpp @@ -6,7 +6,7 @@ namespace ttnn::operations::reduction::detail { -operation::ProgramWithCallbacks sampling_multicore_interleaved( +tt::tt_metal::operation::ProgramWithCallbacks sampling_multicore_interleaved( const Tensor& input_values_tensor, const Tensor& input_indices_tensor, const std::vector& k, diff --git a/ttnn/cpp/ttnn/operations/reduction/sampling/sampling.cpp b/ttnn/cpp/ttnn/operations/reduction/sampling/sampling.cpp index 25102a5c799..3e1b468cf60 100644 --- a/ttnn/cpp/ttnn/operations/reduction/sampling/sampling.cpp +++ b/ttnn/cpp/ttnn/operations/reduction/sampling/sampling.cpp @@ -22,7 +22,7 @@ ttnn::Tensor SamplingOperation::invoke( const uint32_t seed, const std::optional& sub_core_grids, std::optional optional_output_tensor) { - return operation::run( + return tt::tt_metal::operation::run( Sampling{k, p, seed, sub_core_grids}, {input_values_tensor, input_indices_tensor}, {}, diff --git a/ttnn/cpp/ttnn/operations/reduction/topk/device/topk_op.cpp b/ttnn/cpp/ttnn/operations/reduction/topk/device/topk_op.cpp index a3bc59e8124..b22bbb05d65 100644 --- a/ttnn/cpp/ttnn/operations/reduction/topk/device/topk_op.cpp +++ b/ttnn/cpp/ttnn/operations/reduction/topk/device/topk_op.cpp @@ -60,7 +60,7 @@ void TopK::validate_with_output_tensors( input_shape[0] * input_shape[1] * input_shape[2]); TT_FATAL(this->output_mem_config.is_sharded() == false, "Sharded implementation not supported yet"); - TT_FATAL(input_tensors.at(0).get_layout() == Layout::TILE, "The input must be in tiled format"); + TT_FATAL(input_tensors.at(0).get_layout() == tt::tt_metal::Layout::TILE, "The input must be in tiled format"); if (input_shape[dim] >= topk_utils::multi_core_min_width) { // multicore implementation auto device = input_tensors.at(0).device(); @@ -95,10 +95,14 @@ std::vector TopK::compute_output_specs( auto 
output_shape = input_tensors.at(0).get_logical_shape(); output_shape[-1] = this->k; - auto values_spec = - TensorSpec(output_shape, TensorLayout(input_tensor.get_dtype(), PageConfig(Layout::TILE), output_mem_config)); - auto index_spec = - TensorSpec(output_shape, TensorLayout(DataType::UINT16, PageConfig(Layout::TILE), output_mem_config)); + auto values_spec = TensorSpec( + output_shape, + tt::tt_metal::TensorLayout( + input_tensor.get_dtype(), tt::tt_metal::PageConfig(tt::tt_metal::Layout::TILE), output_mem_config)); + auto index_spec = TensorSpec( + output_shape, + tt::tt_metal::TensorLayout( + tt::tt_metal::DataType::UINT16, tt::tt_metal::PageConfig(tt::tt_metal::Layout::TILE), output_mem_config)); return {values_spec, index_spec}; } @@ -117,7 +121,7 @@ std::vector TopK::create_output_tensors( }; } -operation::ProgramWithCallbacks TopK::create_program( +tt::tt_metal::operation::ProgramWithCallbacks TopK::create_program( const std::vector& input_tensors, std::vector& output_tensors) const { const auto& input_tensor = input_tensors.at(0); if (input_tensor.get_padded_shape()[dim] < topk_utils::multi_core_min_width) { diff --git a/ttnn/cpp/ttnn/operations/reduction/topk/device/topk_op.hpp b/ttnn/cpp/ttnn/operations/reduction/topk/device/topk_op.hpp index dac6dbc6766..2bcd18cb477 100644 --- a/ttnn/cpp/ttnn/operations/reduction/topk/device/topk_op.hpp +++ b/ttnn/cpp/ttnn/operations/reduction/topk/device/topk_op.hpp @@ -16,7 +16,7 @@ struct TopK { const int8_t dim; const bool largest; const bool sorted; - const MemoryConfig output_mem_config; + const tt::tt_metal::MemoryConfig output_mem_config; void validate_with_output_tensors( const std::vector& input_tensors, const std::vector>& output_tensors) const; @@ -24,7 +24,7 @@ struct TopK { const std::vector& input_tensors, const std::vector>& output_tensors) const; std::vector create_output_tensors( const std::vector& input_tensors, const std::vector>& output_tensors) const; - operation::ProgramWithCallbacks 
create_program( + tt::tt_metal::operation::ProgramWithCallbacks create_program( const std::vector& input_tensors, std::vector& output_tensors) const; }; diff --git a/ttnn/cpp/ttnn/operations/reduction/topk/device/topk_program_factory.hpp b/ttnn/cpp/ttnn/operations/reduction/topk/device/topk_program_factory.hpp index d70adcb00fe..198d538ee1a 100644 --- a/ttnn/cpp/ttnn/operations/reduction/topk/device/topk_program_factory.hpp +++ b/ttnn/cpp/ttnn/operations/reduction/topk/device/topk_program_factory.hpp @@ -10,7 +10,7 @@ namespace ttnn::operations::reduction::detail { -operation::ProgramWithCallbacks topk_single_core_interleaved( +tt::tt_metal::operation::ProgramWithCallbacks topk_single_core_interleaved( const Tensor& input_tensor, const uint16_t k, const int8_t dim, @@ -144,9 +144,9 @@ operation::ProgramWithCallbacks topk_single_core_interleaved( tt::tt_metal::ComputeConfig{.compile_args = compute_args}); auto override_runtime_args_callback = [unary_reader_kernel_id, binary_writer_kernel_id]( - const Program& program, - const std::vector& input_buffers, - const std::vector& output_buffers) { + const tt::tt_metal::Program& program, + const std::vector& input_buffers, + const std::vector& output_buffers) { auto input_buffer = input_buffers.at(0); auto values_buffer = output_buffers.at(0); auto index_buffer = output_buffers.at(1); @@ -207,7 +207,7 @@ static inline std::tuple cores_utilized( * Then gather the results of each split onto a single core, where the final topk values and indices are computed. 
* */ -operation::ProgramWithCallbacks topk_multicore_interleaved( +tt::tt_metal::operation::ProgramWithCallbacks topk_multicore_interleaved( const Tensor& input_tensor, const uint16_t k, const int8_t dim, @@ -484,9 +484,9 @@ operation::ProgramWithCallbacks topk_multicore_interleaved( } auto override_runtime_args_callback = [unary_reader_kernel_id, binary_writer_final_kernel_id, num_cores]( - const Program& program, - const std::vector& input_buffers, - const std::vector& output_buffers) { + const tt::tt_metal::Program& program, + const std::vector& input_buffers, + const std::vector& output_buffers) { auto input_buffer = input_buffers.at(0); auto values_buffer = output_buffers.at(0); auto index_buffer = output_buffers.at(1); diff --git a/ttnn/cpp/ttnn/operations/reduction/topk/topk.hpp b/ttnn/cpp/ttnn/operations/reduction/topk/topk.hpp index 9c4b17f659b..06ff3101142 100644 --- a/ttnn/cpp/ttnn/operations/reduction/topk/topk.hpp +++ b/ttnn/cpp/ttnn/operations/reduction/topk/topk.hpp @@ -32,7 +32,7 @@ struct ExecuteTopK { const bool sorted, const std::optional& memory_config, std::optional> optional_output_tensors = std::nullopt) { - return operation::run( + return tt::tt_metal::operation::run( TopK{k, dim, largest, sorted, memory_config.value_or(input_tensor.memory_config())}, {input_tensor}, {}, @@ -56,8 +56,8 @@ struct ExecuteTopK { const std::vector& input_tensors, const std::vector>& optional_inputs) { const auto& input_tensor = input_tensors.at(0); return { - Tensor(operation::get_workers_for_op_output({input_tensor})), - Tensor(operation::get_workers_for_op_output({input_tensor}))}; + Tensor(tt::tt_metal::operation::get_workers_for_op_output({input_tensor})), + Tensor(tt::tt_metal::operation::get_workers_for_op_output({input_tensor}))}; } }; diff --git a/ttnn/cpp/ttnn/operations/sliding_window/halo/device/halo_device_operation.hpp b/ttnn/cpp/ttnn/operations/sliding_window/halo/device/halo_device_operation.hpp index 88e502181f1..df5e9c68a31 100644 --- 
a/ttnn/cpp/ttnn/operations/sliding_window/halo/device/halo_device_operation.hpp +++ b/ttnn/cpp/ttnn/operations/sliding_window/halo/device/halo_device_operation.hpp @@ -23,12 +23,12 @@ struct HaloDeviceOperation { bool transpose_mcast_; uint32_t reshard_num_cores_nhw_; uint32_t max_out_nsticks_per_core_; - MemoryConfig output_memory_config_; + tt::tt_metal::MemoryConfig output_memory_config_; bool is_out_tiled_; void validate(const std::vector& input_tensors) const; std::vector compute_output_specs(const std::vector& input_tensors) const; - operation::ProgramWithCallbacks create_program( + tt::tt_metal::operation::ProgramWithCallbacks create_program( const std::vector& input_tensors, std::vector& output_tensors) const; // const operation::Hash compute_program_hash(const std::vector &input_tensors) const; @@ -63,7 +63,7 @@ Tensor halo_op( bool remote_read = false, bool transpose_mcast = true, uint32_t reshard_num_cores_nhw = 0, - const MemoryConfig& output_memory_config = operation::DEFAULT_OUTPUT_MEMORY_CONFIG, + const tt::tt_metal::MemoryConfig& output_memory_config = tt::tt_metal::operation::DEFAULT_OUTPUT_MEMORY_CONFIG, bool is_out_tiled = true); } // namespace halo diff --git a/ttnn/cpp/ttnn/operations/sliding_window/halo/halo.hpp b/ttnn/cpp/ttnn/operations/sliding_window/halo/halo.hpp index 31df09955ea..190577be174 100644 --- a/ttnn/cpp/ttnn/operations/sliding_window/halo/halo.hpp +++ b/ttnn/cpp/ttnn/operations/sliding_window/halo/halo.hpp @@ -19,7 +19,7 @@ struct HaloOperation { bool remote_read = false, bool transpose_mcast = true, uint32_t reshard_num_cores_nhw = 0, - const MemoryConfig& output_memory_config = operation::DEFAULT_OUTPUT_MEMORY_CONFIG, + const MemoryConfig& output_memory_config = tt::tt_metal::operation::DEFAULT_OUTPUT_MEMORY_CONFIG, bool is_out_tiled = true); // invoke can be overloaded as many times as needed to provide all desired APIs diff --git a/ttnn/cpp/ttnn/operations/sliding_window/reference_sliding_window.hpp 
b/ttnn/cpp/ttnn/operations/sliding_window/reference_sliding_window.hpp index ea5e6cbb90a..c91c1b8bc75 100644 --- a/ttnn/cpp/ttnn/operations/sliding_window/reference_sliding_window.hpp +++ b/ttnn/cpp/ttnn/operations/sliding_window/reference_sliding_window.hpp @@ -19,7 +19,7 @@ using tt::tt_metal::Tensor; namespace ttnn::operations::sliding_window { // Calculate Convolution on padded input buffer. -owned_buffer::Buffer ref_conv_op( +tt::tt_metal::owned_buffer::Buffer ref_conv_op( const Tensor& input_padded_tensor, const ttnn::Shape& input_nchw_shape, uint32_t stride_h, @@ -29,8 +29,8 @@ owned_buffer::Buffer ref_conv_op( const ttnn::Shape& out_golden_pyt_tensor_shape); // Calculate convolution using op_trace_metadata on padded input buffer. -owned_buffer::Buffer conv_using_op_trace_metadata( - const owned_buffer::Buffer& input_padded_tensor_buf, +tt::tt_metal::owned_buffer::Buffer conv_using_op_trace_metadata( + const tt::tt_metal::owned_buffer::Buffer& input_padded_tensor_buf, const std::vector& filter_vector, const std::vector& op_trace_metadata, uint32_t stride_h, @@ -41,8 +41,8 @@ owned_buffer::Buffer conv_using_op_trace_metadata( uint32_t out_tensor_size); // Calculate convolution using shards on padded input buffer. -owned_buffer::Buffer conv_using_shard_boundaries( - const owned_buffer::Buffer& input_padded_tensor_buf, +tt::tt_metal::owned_buffer::Buffer conv_using_shard_boundaries( + const tt::tt_metal::owned_buffer::Buffer& input_padded_tensor_buf, const std::vector& filter_vector, const std::vector& shard_boundaries, uint32_t stride_h, @@ -56,8 +56,8 @@ owned_buffer::Buffer conv_using_shard_boundaries( uint32_t out_tensor_size); // Calculate convolution using sliding window op configs on padded input buffer. 
-owned_buffer::Buffer conv_using_sliding_window_op_config( - const owned_buffer::Buffer& input_padded_tensor_buf, +tt::tt_metal::owned_buffer::Buffer conv_using_sliding_window_op_config( + const tt::tt_metal::owned_buffer::Buffer& input_padded_tensor_buf, const std::vector& filter_vector, const std::vector& op_trace_metadata, const std::vector& shard_boundaries, diff --git a/ttnn/cpp/ttnn/operations/trace.cpp b/ttnn/cpp/ttnn/operations/trace.cpp index 00f7c13253c..b3bc3298366 100644 --- a/ttnn/cpp/ttnn/operations/trace.cpp +++ b/ttnn/cpp/ttnn/operations/trace.cpp @@ -13,7 +13,7 @@ namespace ttnn::operations::trace { uint32_t begin_trace_capture(IDevice* device, QueueId cq_id) { ZoneScoped; - uint32_t trace_id = Trace::next_id(); + uint32_t trace_id = tt::tt_metal::Trace::next_id(); device->begin_trace(*cq_id, trace_id); return trace_id; } diff --git a/ttnn/cpp/ttnn/operations/transformer/attention_softmax/attention_softmax.cpp b/ttnn/cpp/ttnn/operations/transformer/attention_softmax/attention_softmax.cpp index 622aae44ef7..76af0720357 100644 --- a/ttnn/cpp/ttnn/operations/transformer/attention_softmax/attention_softmax.cpp +++ b/ttnn/cpp/ttnn/operations/transformer/attention_softmax/attention_softmax.cpp @@ -33,7 +33,7 @@ ttnn::Tensor ExecuteAttentionSoftmax::invoke( std::optional compute_kernel_config = std::nullopt; auto kernel_config_val = init_device_compute_kernel_config( input_tensor.device()->arch(), compute_kernel_config, MathFidelity::HiFi4, true, false, false); - auto output_tensor = operation::run( + auto output_tensor = tt::tt_metal::operation::run( ttnn::operations::normalization::Softmax{ head_size, in_place, diff --git a/ttnn/cpp/ttnn/operations/transformer/sdpa/device/joint_sdpa_op.hpp b/ttnn/cpp/ttnn/operations/transformer/sdpa/device/joint_sdpa_op.hpp index 6f67131e55b..e40e4d40ca9 100644 --- a/ttnn/cpp/ttnn/operations/transformer/sdpa/device/joint_sdpa_op.hpp +++ b/ttnn/cpp/ttnn/operations/transformer/sdpa/device/joint_sdpa_op.hpp @@ -16,7 +16,7 
@@ namespace ttnn::operations::transformer { struct JointScaledDotProductAttention { const std::string joint_strategy; const std::optional scale; - const MemoryConfig output_mem_config; + const tt::tt_metal::MemoryConfig output_mem_config; const std::optional program_config; const DeviceComputeKernelConfig compute_kernel_config; @@ -24,7 +24,7 @@ struct JointScaledDotProductAttention { std::vector compute_output_specs(const std::vector& input_tensors) const; - operation::ProgramWithCallbacks create_program( + tt::tt_metal::operation::ProgramWithCallbacks create_program( const std::vector& input_tensors, std::vector& output_tensors) const; std::uint32_t get_q_chunk_size() const; diff --git a/ttnn/cpp/ttnn/operations/transformer/sdpa/device/joint_sdpa_program_factory.hpp b/ttnn/cpp/ttnn/operations/transformer/sdpa/device/joint_sdpa_program_factory.hpp index 285be3dfb2c..67dfb532554 100644 --- a/ttnn/cpp/ttnn/operations/transformer/sdpa/device/joint_sdpa_program_factory.hpp +++ b/ttnn/cpp/ttnn/operations/transformer/sdpa/device/joint_sdpa_program_factory.hpp @@ -10,7 +10,7 @@ namespace ttnn::operations::transformer::detail { -operation::ProgramWithCallbacks joint_sdpa( +tt::tt_metal::operation::ProgramWithCallbacks joint_sdpa( const Tensor& input_tensor_q, const Tensor& input_tensor_k, const Tensor& input_tensor_v, diff --git a/ttnn/cpp/ttnn/operations/transformer/sdpa/device/sdpa_op.hpp b/ttnn/cpp/ttnn/operations/transformer/sdpa/device/sdpa_op.hpp index 75358ec3e4c..0639b613ce4 100644 --- a/ttnn/cpp/ttnn/operations/transformer/sdpa/device/sdpa_op.hpp +++ b/ttnn/cpp/ttnn/operations/transformer/sdpa/device/sdpa_op.hpp @@ -15,7 +15,7 @@ namespace ttnn::operations::transformer { struct ScaledDotProductAttention { const std::optional scale; - const MemoryConfig output_mem_config; + const tt::tt_metal::MemoryConfig output_mem_config; const std::optional program_config; const bool is_causal; const std::optional chunk_start_idx; @@ -27,12 +27,12 @@ struct 
ScaledDotProductAttention { std::vector compute_output_specs(const std::vector& input_tensors) const; - operation::ProgramWithCallbacks create_program( + tt::tt_metal::operation::ProgramWithCallbacks create_program( const std::vector& input_tensors, const std::vector>& optional_input_tensors, std::vector& output_tensors) const; - operation::Hash compute_program_hash( + tt::tt_metal::operation::Hash compute_program_hash( const std::vector& input_tensors, const std::vector>& optional_input_tensors) const; diff --git a/ttnn/cpp/ttnn/operations/transformer/sdpa/device/sdpa_program_factory.hpp b/ttnn/cpp/ttnn/operations/transformer/sdpa/device/sdpa_program_factory.hpp index f9fcb254419..44c1f82e7c6 100644 --- a/ttnn/cpp/ttnn/operations/transformer/sdpa/device/sdpa_program_factory.hpp +++ b/ttnn/cpp/ttnn/operations/transformer/sdpa/device/sdpa_program_factory.hpp @@ -10,7 +10,7 @@ namespace ttnn::operations::transformer::detail { -operation::ProgramWithCallbacks sdpa_multi_core( +tt::tt_metal::operation::ProgramWithCallbacks sdpa_multi_core( const Tensor& input_tensor_q, const Tensor& input_tensor_k, const Tensor& input_tensor_v, diff --git a/ttnn/cpp/ttnn/operations/transformer/sdpa/sdpa.cpp b/ttnn/cpp/ttnn/operations/transformer/sdpa/sdpa.cpp index 6d8bc2723fe..926782177fe 100644 --- a/ttnn/cpp/ttnn/operations/transformer/sdpa/sdpa.cpp +++ b/ttnn/cpp/ttnn/operations/transformer/sdpa/sdpa.cpp @@ -11,8 +11,6 @@ #include "ttnn/common/queue_id.hpp" #include "ttnn/run_operation.hpp" -using namespace tt::tt_metal; - namespace ttnn::operations::transformer { ttnn::Tensor ExecuteScaledDotProductAttention::invoke( @@ -32,10 +30,10 @@ ttnn::Tensor ExecuteScaledDotProductAttention::invoke( auto kernel_config_val = init_device_compute_kernel_config( input_tensor_q.device()->arch(), compute_kernel_config, MathFidelity::HiFi2, true, false, false); - return operation::run( + return tt::tt_metal::operation::run( ScaledDotProductAttention{ .scale = scale, - .output_mem_config = 
memory_config.value_or(operation::DEFAULT_OUTPUT_MEMORY_CONFIG), + .output_mem_config = memory_config.value_or(tt::tt_metal::operation::DEFAULT_OUTPUT_MEMORY_CONFIG), .program_config = std::move(program_config), .is_causal = is_causal, .chunk_start_idx = std::nullopt, @@ -87,10 +85,10 @@ ttnn::Tensor ExecuteChunkedScaledDotProductAttention::invoke( auto kernel_config_val = init_device_compute_kernel_config( input_tensor_q.device()->arch(), compute_kernel_config, MathFidelity::HiFi2, true, false, false); - return operation::run( + return tt::tt_metal::operation::run( ScaledDotProductAttention{ .scale = scale, - .output_mem_config = memory_config.value_or(operation::DEFAULT_OUTPUT_MEMORY_CONFIG), + .output_mem_config = memory_config.value_or(tt::tt_metal::operation::DEFAULT_OUTPUT_MEMORY_CONFIG), .program_config = std::move(program_config), .is_causal = true, // Always causal for chunked version .chunk_start_idx = chunk_start_idx, @@ -143,11 +141,11 @@ std::tuple ExecuteJointAttention::invoke( auto kernel_config_val = init_device_compute_kernel_config( input_tensor_q.device()->arch(), compute_kernel_config, MathFidelity::HiFi2, true, false, false); - auto results = operation::run( + auto results = tt::tt_metal::operation::run( JointScaledDotProductAttention{ .joint_strategy = joint_strategy, .scale = scale, - .output_mem_config = operation::DEFAULT_OUTPUT_MEMORY_CONFIG, + .output_mem_config = tt::tt_metal::operation::DEFAULT_OUTPUT_MEMORY_CONFIG, .program_config = std::move(program_config), .compute_kernel_config = kernel_config_val}, {input_tensor_q, input_tensor_k, input_tensor_v, joint_tensor_q, joint_tensor_k, joint_tensor_v}, diff --git a/ttnn/cpp/ttnn/operations/transformer/sdpa_decode/device/sdpa_decode_op.hpp b/ttnn/cpp/ttnn/operations/transformer/sdpa_decode/device/sdpa_decode_op.hpp index 30c5c4ed999..0406b4268b4 100644 --- a/ttnn/cpp/ttnn/operations/transformer/sdpa_decode/device/sdpa_decode_op.hpp +++ 
b/ttnn/cpp/ttnn/operations/transformer/sdpa_decode/device/sdpa_decode_op.hpp @@ -17,7 +17,7 @@ struct ScaledDotProductAttentionDecode { const bool is_causal; std::vector cur_pos; const std::optional scale; - const MemoryConfig output_mem_config; + const tt::tt_metal::MemoryConfig output_mem_config; const std::optional program_config; const DeviceComputeKernelConfig compute_kernel_config; const uint32_t k_chunk_size; @@ -30,12 +30,12 @@ struct ScaledDotProductAttentionDecode { std::vector compute_output_specs(const std::vector& input_tensors) const; - operation::ProgramWithCallbacks create_program( + tt::tt_metal::operation::ProgramWithCallbacks create_program( const std::vector& input_tensors, const std::vector>& optional_input_tensors, std::vector& output_tensors) const; - operation::Hash compute_program_hash( + tt::tt_metal::operation::Hash compute_program_hash( const std::vector& input_tensors, const std::vector>& optional_input_tensors) const; }; diff --git a/ttnn/cpp/ttnn/operations/transformer/sdpa_decode/device/sdpa_decode_program_factory.hpp b/ttnn/cpp/ttnn/operations/transformer/sdpa_decode/device/sdpa_decode_program_factory.hpp index 61e01d9a2ba..6c3d90b81eb 100644 --- a/ttnn/cpp/ttnn/operations/transformer/sdpa_decode/device/sdpa_decode_program_factory.hpp +++ b/ttnn/cpp/ttnn/operations/transformer/sdpa_decode/device/sdpa_decode_program_factory.hpp @@ -10,7 +10,7 @@ namespace ttnn::operations::transformer::detail { -operation::ProgramWithCallbacks sdpa_decode_multi_core( +tt::tt_metal::operation::ProgramWithCallbacks sdpa_decode_multi_core( const Tensor& input_tensor_q, const Tensor& input_tensor_k, const Tensor& input_tensor_v, diff --git a/ttnn/cpp/ttnn/operations/uniform/device/uniform_device_operation.hpp b/ttnn/cpp/ttnn/operations/uniform/device/uniform_device_operation.hpp index cbd0427938a..8908d02f030 100644 --- a/ttnn/cpp/ttnn/operations/uniform/device/uniform_device_operation.hpp +++ 
b/ttnn/cpp/ttnn/operations/uniform/device/uniform_device_operation.hpp @@ -27,8 +27,8 @@ struct UniformDeviceOperation { struct ProgramFactory { struct shared_variables_t { - KernelHandle compute_kernel_id; - KernelHandle writer_kernel_id; + tt::tt_metal::KernelHandle compute_kernel_id; + tt::tt_metal::KernelHandle writer_kernel_id; std::vector cores; }; diff --git a/ttnn/cpp/ttnn/reports.hpp b/ttnn/cpp/ttnn/reports.hpp index 5603c684dce..6fe13c8bbf3 100644 --- a/ttnn/cpp/ttnn/reports.hpp +++ b/ttnn/cpp/ttnn/reports.hpp @@ -33,9 +33,10 @@ struct DeviceInfo { size_t cb_limit; }; -DeviceInfo get_device_info(const IDevice& device) { +DeviceInfo get_device_info(const tt::tt_metal::IDevice& device) { DeviceInfo info{}; - const auto& dispatch_core_config = dispatch_core_manager::instance().get_dispatch_core_config(device.id()); + const auto& dispatch_core_config = + tt::tt_metal::dispatch_core_manager::instance().get_dispatch_core_config(device.id()); const auto descriptor = tt::get_core_descriptor_config(device.id(), device.num_hw_cqs(), dispatch_core_config); const auto& device_allocator = device.allocator(); info.num_y_cores = device.logical_grid_size().y; @@ -43,10 +44,10 @@ DeviceInfo get_device_info(const IDevice& device) { info.num_y_compute_cores = descriptor.compute_grid_size.y; info.num_x_compute_cores = descriptor.compute_grid_size.x; info.worker_l1_size = device_allocator->get_config().worker_l1_size; - info.l1_num_banks = device_allocator->get_num_banks(BufferType::L1); - info.l1_bank_size = device_allocator->get_bank_size(BufferType::L1); - info.address_at_first_l1_bank = device_allocator->get_bank_offset(BufferType::L1, 0); - info.address_at_first_l1_cb_buffer = device_allocator->get_base_allocator_addr(HalMemType::L1); + info.l1_num_banks = device_allocator->get_num_banks(tt::tt_metal::BufferType::L1); + info.l1_bank_size = device_allocator->get_bank_size(tt::tt_metal::BufferType::L1); + info.address_at_first_l1_bank = 
device_allocator->get_bank_offset(tt::tt_metal::BufferType::L1, 0); + info.address_at_first_l1_cb_buffer = device_allocator->get_base_allocator_addr(tt::tt_metal::HalMemType::L1); info.num_banks_per_storage_core = device_allocator->get_config().worker_l1_size / info.l1_bank_size; info.num_storage_cores = descriptor.relative_storage_cores.size(); info.num_compute_cores = descriptor.relative_compute_cores.size(); @@ -56,8 +57,8 @@ DeviceInfo get_device_info(const IDevice& device) { (info.num_storage_cores + info.num_compute_cores + (info.num_banks_per_storage_core * info.num_storage_cores)) * info.l1_bank_size; info.total_l1_for_sharded_buffers = info.num_compute_cores * info.l1_bank_size; - info.cb_limit = - device_allocator->get_config().worker_l1_size - device_allocator->get_base_allocator_addr(HalMemType::L1); + info.cb_limit = device_allocator->get_config().worker_l1_size - + device_allocator->get_base_allocator_addr(tt::tt_metal::HalMemType::L1); return info; } @@ -65,7 +66,7 @@ struct BufferInfo { uint32_t device_id; uint32_t address; uint32_t max_size_per_bank; - BufferType buffer_type; + tt::tt_metal::BufferType buffer_type; }; std::vector get_buffers() { @@ -129,7 +130,7 @@ struct BufferPageInfo { uint32_t page_index; uint32_t page_address; uint32_t page_size; - BufferType buffer_type; + tt::tt_metal::BufferType buffer_type; }; std::vector get_buffer_pages() { @@ -150,7 +151,7 @@ std::vector get_buffer_pages() { uint32_t bank_id = 0; for (int page_index = 0; page_index < num_pages; page_index++) { CoreCoord core; - DeviceAddr page_address = 0; + tt::tt_metal::DeviceAddr page_address = 0; if (buffer->buffer_layout() == tt::tt_metal::TensorMemoryLayout::INTERLEAVED) { page_address = buffer->page_address(bank_id, page_index); diff --git a/ttnn/cpp/ttnn/tensor/flatbuffer/tensor_types_from_flatbuffer.cpp b/ttnn/cpp/ttnn/tensor/flatbuffer/tensor_types_from_flatbuffer.cpp index 9c187e5d418..cab815e3874 100644 --- 
a/ttnn/cpp/ttnn/tensor/flatbuffer/tensor_types_from_flatbuffer.cpp +++ b/ttnn/cpp/ttnn/tensor/flatbuffer/tensor_types_from_flatbuffer.cpp @@ -6,81 +6,81 @@ namespace ttnn { -BufferType from_flatbuffer(flatbuffer::BufferType type) { +tt::tt_metal::BufferType from_flatbuffer(flatbuffer::BufferType type) { switch (type) { - case flatbuffer::BufferType::DRAM: return BufferType::DRAM; - case flatbuffer::BufferType::L1: return BufferType::L1; - case flatbuffer::BufferType::SystemMemory: return BufferType::SYSTEM_MEMORY; - case flatbuffer::BufferType::L1Small: return BufferType::L1_SMALL; - case flatbuffer::BufferType::Trace: return BufferType::TRACE; + case flatbuffer::BufferType::DRAM: return tt::tt_metal::BufferType::DRAM; + case flatbuffer::BufferType::L1: return tt::tt_metal::BufferType::L1; + case flatbuffer::BufferType::SystemMemory: return tt::tt_metal::BufferType::SYSTEM_MEMORY; + case flatbuffer::BufferType::L1Small: return tt::tt_metal::BufferType::L1_SMALL; + case flatbuffer::BufferType::Trace: return tt::tt_metal::BufferType::TRACE; } TT_THROW("Unsupported BufferType from flatbuffer."); } -TensorMemoryLayout from_flatbuffer(flatbuffer::TensorMemoryLayout layout) { +tt::tt_metal::TensorMemoryLayout from_flatbuffer(flatbuffer::TensorMemoryLayout layout) { switch (layout) { - case flatbuffer::TensorMemoryLayout::Interleaved: return TensorMemoryLayout::INTERLEAVED; - case flatbuffer::TensorMemoryLayout::SingleBank: return TensorMemoryLayout::SINGLE_BANK; - case flatbuffer::TensorMemoryLayout::HeightSharded: return TensorMemoryLayout::HEIGHT_SHARDED; - case flatbuffer::TensorMemoryLayout::WidthSharded: return TensorMemoryLayout::WIDTH_SHARDED; - case flatbuffer::TensorMemoryLayout::BlockSharded: return TensorMemoryLayout::BLOCK_SHARDED; + case flatbuffer::TensorMemoryLayout::Interleaved: return tt::tt_metal::TensorMemoryLayout::INTERLEAVED; + case flatbuffer::TensorMemoryLayout::SingleBank: return tt::tt_metal::TensorMemoryLayout::SINGLE_BANK; + case 
flatbuffer::TensorMemoryLayout::HeightSharded: return tt::tt_metal::TensorMemoryLayout::HEIGHT_SHARDED; + case flatbuffer::TensorMemoryLayout::WidthSharded: return tt::tt_metal::TensorMemoryLayout::WIDTH_SHARDED; + case flatbuffer::TensorMemoryLayout::BlockSharded: return tt::tt_metal::TensorMemoryLayout::BLOCK_SHARDED; } TT_THROW("Unsupported TensorMemoryLayout from flatbuffer."); } -DataType from_flatbuffer(flatbuffer::DataType type) { +tt::tt_metal::DataType from_flatbuffer(flatbuffer::DataType type) { switch (type) { - case flatbuffer::DataType::BFloat16: return DataType::BFLOAT16; - case flatbuffer::DataType::Float32: return DataType::FLOAT32; - case flatbuffer::DataType::UInt32: return DataType::UINT32; - case flatbuffer::DataType::BFloat8B: return DataType::BFLOAT8_B; - case flatbuffer::DataType::BFloat4B: return DataType::BFLOAT4_B; - case flatbuffer::DataType::UInt8: return DataType::UINT8; - case flatbuffer::DataType::UInt16: return DataType::UINT16; - case flatbuffer::DataType::Int32: return DataType::INT32; - case flatbuffer::DataType::Invalid: return DataType::INVALID; + case flatbuffer::DataType::BFloat16: return tt::tt_metal::DataType::BFLOAT16; + case flatbuffer::DataType::Float32: return tt::tt_metal::DataType::FLOAT32; + case flatbuffer::DataType::UInt32: return tt::tt_metal::DataType::UINT32; + case flatbuffer::DataType::BFloat8B: return tt::tt_metal::DataType::BFLOAT8_B; + case flatbuffer::DataType::BFloat4B: return tt::tt_metal::DataType::BFLOAT4_B; + case flatbuffer::DataType::UInt8: return tt::tt_metal::DataType::UINT8; + case flatbuffer::DataType::UInt16: return tt::tt_metal::DataType::UINT16; + case flatbuffer::DataType::Int32: return tt::tt_metal::DataType::INT32; + case flatbuffer::DataType::Invalid: return tt::tt_metal::DataType::INVALID; } TT_THROW("Unsupported DataType from flatbuffer."); } -MemoryConfig from_flatbuffer(const flatbuffer::MemoryConfig* config) { - std::optional shard_spec; +tt::tt_metal::MemoryConfig 
from_flatbuffer(const flatbuffer::MemoryConfig* config) { + std::optional shard_spec; if (config->shard_spec()) { shard_spec = from_flatbuffer(config->shard_spec()); } - return MemoryConfig{ + return tt::tt_metal::MemoryConfig{ from_flatbuffer(config->memory_layout()), from_flatbuffer(config->buffer_type()), shard_spec, }; } -ShardOrientation from_flatbuffer(flatbuffer::ShardOrientation orientation) { +tt::tt_metal::ShardOrientation from_flatbuffer(flatbuffer::ShardOrientation orientation) { switch (orientation) { - case flatbuffer::ShardOrientation::RowMajor: return ShardOrientation::ROW_MAJOR; - case flatbuffer::ShardOrientation::ColMajor: return ShardOrientation::COL_MAJOR; + case flatbuffer::ShardOrientation::RowMajor: return tt::tt_metal::ShardOrientation::ROW_MAJOR; + case flatbuffer::ShardOrientation::ColMajor: return tt::tt_metal::ShardOrientation::COL_MAJOR; } TT_THROW("Unsupported ShardOrientation from flatbuffer."); } -ShardMode from_flatbuffer(flatbuffer::ShardMode mode) { +tt::tt_metal::ShardMode from_flatbuffer(flatbuffer::ShardMode mode) { switch (mode) { - case flatbuffer::ShardMode::Physical: return ShardMode::PHYSICAL; - case flatbuffer::ShardMode::Logical: return ShardMode::LOGICAL; + case flatbuffer::ShardMode::Physical: return tt::tt_metal::ShardMode::PHYSICAL; + case flatbuffer::ShardMode::Logical: return tt::tt_metal::ShardMode::LOGICAL; } TT_THROW("Unsupported ShardMode from flatbuffer."); } -ShardSpec from_flatbuffer(const flatbuffer::ShardSpec* spec) { +tt::tt_metal::ShardSpec from_flatbuffer(const flatbuffer::ShardSpec* spec) { CoreRangeSet grid = from_flatbuffer(spec->grid()); std::array shape = {spec->shape_h(), spec->shape_w()}; - ShardOrientation orientation = from_flatbuffer(spec->orientation()); - ShardMode mode = from_flatbuffer(spec->shard_mode()); + tt::tt_metal::ShardOrientation orientation = from_flatbuffer(spec->orientation()); + tt::tt_metal::ShardMode mode = from_flatbuffer(spec->shard_mode()); if (const auto* fb_shard_shape 
= spec->physical_shard_shape()) { std::array physical_shard_shape = {fb_shard_shape->height(), fb_shard_shape->width()}; - return ShardSpec(grid, shape, physical_shard_shape, orientation); + return tt::tt_metal::ShardSpec(grid, shape, physical_shard_shape, orientation); } - return ShardSpec(grid, shape, orientation, mode); + return tt::tt_metal::ShardSpec(grid, shape, orientation, mode); } CoreCoord from_flatbuffer(const flatbuffer::CoreCoord* core_coord) { @@ -101,26 +101,26 @@ CoreRangeSet from_flatbuffer(const flatbuffer::CoreRangeSet* core_range_set) { return CoreRangeSet{ranges}; } -TensorLayout from_flatbuffer(const flatbuffer::TensorLayout* layout) { - PageConfig page_config = [&] { +tt::tt_metal::TensorLayout from_flatbuffer(const flatbuffer::TensorLayout* layout) { + tt::tt_metal::PageConfig page_config = [&] { switch (layout->page_config_type()) { - case flatbuffer::PageConfig::row_major: return PageConfig(Layout::ROW_MAJOR); + case flatbuffer::PageConfig::row_major: return tt::tt_metal::PageConfig(tt::tt_metal::Layout::ROW_MAJOR); case flatbuffer::PageConfig::tile: { const auto* tile_page_config = layout->page_config_as_tile(); const auto* flat_tile = tile_page_config->tile(); - Tile tile( + tt::tt_metal::Tile tile( std::array{flat_tile->tile_shape_h(), flat_tile->tile_shape_w()}, flat_tile->transpose_tile()); - return PageConfig(Layout::TILE, tile); + return tt::tt_metal::PageConfig(tt::tt_metal::Layout::TILE, tile); } default: TT_THROW("Unsupported PageConfig type from flatbuffer."); } }(); - return TensorLayout::restore_from_serialized( + return tt::tt_metal::TensorLayout::restore_from_serialized( from_flatbuffer(layout->data_type()), page_config, from_flatbuffer(layout->memory_config()), - Alignment(SmallVector(layout->alignment()->cbegin(), layout->alignment()->cend()))); + tt::tt_metal::Alignment(SmallVector(layout->alignment()->cbegin(), layout->alignment()->cend()))); } TensorSpec from_flatbuffer(const flatbuffer::TensorSpec* spec) { diff --git 
a/ttnn/cpp/ttnn/tensor/flatbuffer/tensor_types_from_flatbuffer.hpp b/ttnn/cpp/ttnn/tensor/flatbuffer/tensor_types_from_flatbuffer.hpp index 906b0d8940e..b35e24f28bf 100644 --- a/ttnn/cpp/ttnn/tensor/flatbuffer/tensor_types_from_flatbuffer.hpp +++ b/ttnn/cpp/ttnn/tensor/flatbuffer/tensor_types_from_flatbuffer.hpp @@ -10,17 +10,17 @@ namespace ttnn { -BufferType from_flatbuffer(flatbuffer::BufferType type); -TensorMemoryLayout from_flatbuffer(flatbuffer::TensorMemoryLayout layout); -DataType from_flatbuffer(flatbuffer::DataType type); -ShardOrientation from_flatbuffer(flatbuffer::ShardOrientation orientation); -ShardMode from_flatbuffer(flatbuffer::ShardMode mode); +tt::tt_metal::BufferType from_flatbuffer(flatbuffer::BufferType type); +tt::tt_metal::TensorMemoryLayout from_flatbuffer(flatbuffer::TensorMemoryLayout layout); +tt::tt_metal::DataType from_flatbuffer(flatbuffer::DataType type); +tt::tt_metal::ShardOrientation from_flatbuffer(flatbuffer::ShardOrientation orientation); +tt::tt_metal::ShardMode from_flatbuffer(flatbuffer::ShardMode mode); CoreCoord from_flatbuffer(const flatbuffer::CoreCoord* fb_coord); CoreRange from_flatbuffer(const flatbuffer::CoreRange* fb_coord); CoreRangeSet from_flatbuffer(const flatbuffer::CoreRangeSet* fb_coord); -ShardSpec from_flatbuffer(const flatbuffer::ShardSpec* spec); -MemoryConfig from_flatbuffer(const flatbuffer::MemoryConfig* config); -TensorLayout from_flatbuffer(const flatbuffer::TensorLayout* layout); -TensorSpec from_flatbuffer(const flatbuffer::TensorSpec* spec); +tt::tt_metal::ShardSpec from_flatbuffer(const flatbuffer::ShardSpec* spec); +tt::tt_metal::MemoryConfig from_flatbuffer(const flatbuffer::MemoryConfig* config); +tt::tt_metal::TensorLayout from_flatbuffer(const flatbuffer::TensorLayout* layout); +tt::tt_metal::TensorSpec from_flatbuffer(const flatbuffer::TensorSpec* spec); } // namespace ttnn diff --git a/ttnn/cpp/ttnn/tensor/flatbuffer/tensor_types_to_flatbuffer.cpp 
b/ttnn/cpp/ttnn/tensor/flatbuffer/tensor_types_to_flatbuffer.cpp index dce51ca4177..d60d1591bbc 100644 --- a/ttnn/cpp/ttnn/tensor/flatbuffer/tensor_types_to_flatbuffer.cpp +++ b/ttnn/cpp/ttnn/tensor/flatbuffer/tensor_types_to_flatbuffer.cpp @@ -6,24 +6,24 @@ namespace ttnn { -flatbuffer::ShardOrientation to_flatbuffer(ShardOrientation orientation) { +flatbuffer::ShardOrientation to_flatbuffer(tt::tt_metal::ShardOrientation orientation) { switch (orientation) { - case ShardOrientation::ROW_MAJOR: return flatbuffer::ShardOrientation::RowMajor; - case ShardOrientation::COL_MAJOR: return flatbuffer::ShardOrientation::ColMajor; + case tt::tt_metal::ShardOrientation::ROW_MAJOR: return flatbuffer::ShardOrientation::RowMajor; + case tt::tt_metal::ShardOrientation::COL_MAJOR: return flatbuffer::ShardOrientation::ColMajor; } TT_THROW("Unsupported ShardOrientation to flatbuffer."); } -flatbuffer::ShardMode to_flatbuffer(ShardMode shard_mode) { +flatbuffer::ShardMode to_flatbuffer(tt::tt_metal::ShardMode shard_mode) { switch (shard_mode) { - case ShardMode::LOGICAL: return flatbuffer::ShardMode::Logical; - case ShardMode::PHYSICAL: return flatbuffer::ShardMode::Physical; + case tt::tt_metal::ShardMode::LOGICAL: return flatbuffer::ShardMode::Logical; + case tt::tt_metal::ShardMode::PHYSICAL: return flatbuffer::ShardMode::Physical; } TT_THROW("Unsupported ShardMode to flatbuffer."); } flatbuffers::Offset to_flatbuffer( - const ShardSpec& spec, flatbuffers::FlatBufferBuilder& builder) { + const tt::tt_metal::ShardSpec& spec, flatbuffers::FlatBufferBuilder& builder) { flatbuffers::Offset physical_shard_shape = 0; if (spec.physical_shard_shape.has_value()) { const auto& phys_shape = *spec.physical_shard_shape; @@ -63,45 +63,45 @@ flatbuffers::Offset to_flatbuffer( return flatbuffer::CreateCoreRangeSet(builder, ranges_vector); } -flatbuffer::TensorMemoryLayout to_flatbuffer(TensorMemoryLayout layout) { +flatbuffer::TensorMemoryLayout to_flatbuffer(tt::tt_metal::TensorMemoryLayout 
layout) { switch (layout) { - case TensorMemoryLayout::INTERLEAVED: return flatbuffer::TensorMemoryLayout::Interleaved; - case TensorMemoryLayout::SINGLE_BANK: return flatbuffer::TensorMemoryLayout::SingleBank; - case TensorMemoryLayout::HEIGHT_SHARDED: return flatbuffer::TensorMemoryLayout::HeightSharded; - case TensorMemoryLayout::WIDTH_SHARDED: return flatbuffer::TensorMemoryLayout::WidthSharded; - case TensorMemoryLayout::BLOCK_SHARDED: return flatbuffer::TensorMemoryLayout::BlockSharded; + case tt::tt_metal::TensorMemoryLayout::INTERLEAVED: return flatbuffer::TensorMemoryLayout::Interleaved; + case tt::tt_metal::TensorMemoryLayout::SINGLE_BANK: return flatbuffer::TensorMemoryLayout::SingleBank; + case tt::tt_metal::TensorMemoryLayout::HEIGHT_SHARDED: return flatbuffer::TensorMemoryLayout::HeightSharded; + case tt::tt_metal::TensorMemoryLayout::WIDTH_SHARDED: return flatbuffer::TensorMemoryLayout::WidthSharded; + case tt::tt_metal::TensorMemoryLayout::BLOCK_SHARDED: return flatbuffer::TensorMemoryLayout::BlockSharded; } TT_THROW("Unsupported TensorMemoryLayout to flatbuffer."); } -flatbuffer::BufferType to_flatbuffer(BufferType type) { +flatbuffer::BufferType to_flatbuffer(tt::tt_metal::BufferType type) { switch (type) { - case BufferType::DRAM: return flatbuffer::BufferType::DRAM; - case BufferType::L1: return flatbuffer::BufferType::L1; - case BufferType::SYSTEM_MEMORY: return flatbuffer::BufferType::SystemMemory; - case BufferType::L1_SMALL: return flatbuffer::BufferType::L1Small; - case BufferType::TRACE: return flatbuffer::BufferType::Trace; + case tt::tt_metal::BufferType::DRAM: return flatbuffer::BufferType::DRAM; + case tt::tt_metal::BufferType::L1: return flatbuffer::BufferType::L1; + case tt::tt_metal::BufferType::SYSTEM_MEMORY: return flatbuffer::BufferType::SystemMemory; + case tt::tt_metal::BufferType::L1_SMALL: return flatbuffer::BufferType::L1Small; + case tt::tt_metal::BufferType::TRACE: return flatbuffer::BufferType::Trace; } 
TT_THROW("Unsupported BufferType to flatbuffer."); } -flatbuffer::DataType to_flatbuffer(DataType type) { +flatbuffer::DataType to_flatbuffer(tt::tt_metal::DataType type) { switch (type) { - case DataType::BFLOAT16: return flatbuffer::DataType::BFloat16; - case DataType::FLOAT32: return flatbuffer::DataType::Float32; - case DataType::UINT32: return flatbuffer::DataType::UInt32; - case DataType::BFLOAT8_B: return flatbuffer::DataType::BFloat8B; - case DataType::BFLOAT4_B: return flatbuffer::DataType::BFloat4B; - case DataType::UINT8: return flatbuffer::DataType::UInt8; - case DataType::UINT16: return flatbuffer::DataType::UInt16; - case DataType::INT32: return flatbuffer::DataType::Int32; - case DataType::INVALID: return flatbuffer::DataType::Invalid; + case tt::tt_metal::DataType::BFLOAT16: return flatbuffer::DataType::BFloat16; + case tt::tt_metal::DataType::FLOAT32: return flatbuffer::DataType::Float32; + case tt::tt_metal::DataType::UINT32: return flatbuffer::DataType::UInt32; + case tt::tt_metal::DataType::BFLOAT8_B: return flatbuffer::DataType::BFloat8B; + case tt::tt_metal::DataType::BFLOAT4_B: return flatbuffer::DataType::BFloat4B; + case tt::tt_metal::DataType::UINT8: return flatbuffer::DataType::UInt8; + case tt::tt_metal::DataType::UINT16: return flatbuffer::DataType::UInt16; + case tt::tt_metal::DataType::INT32: return flatbuffer::DataType::Int32; + case tt::tt_metal::DataType::INVALID: return flatbuffer::DataType::Invalid; } TT_THROW("Unsupported DataType to flatbuffer."); } flatbuffers::Offset to_flatbuffer( - const MemoryConfig& config, flatbuffers::FlatBufferBuilder& builder) { + const tt::tt_metal::MemoryConfig& config, flatbuffers::FlatBufferBuilder& builder) { flatbuffers::Offset shard_spec = 0; if (config.shard_spec.has_value()) { shard_spec = to_flatbuffer(*config.shard_spec, builder); @@ -111,11 +111,11 @@ flatbuffers::Offset to_flatbuffer( } flatbuffers::Offset to_flatbuffer( - const TensorLayout& layout, flatbuffers::FlatBufferBuilder& 
builder) { + const tt::tt_metal::TensorLayout& layout, flatbuffers::FlatBufferBuilder& builder) { const auto& alignment = layout.get_alignment(); auto flat_alignment = builder.CreateVector(alignment.view().data(), alignment.size()); auto page_config = layout.get_page_config(); - if (page_config.get_layout() == Layout::TILE) { + if (page_config.get_layout() == tt::tt_metal::Layout::TILE) { auto tile = page_config.get_tile(); auto flat_tile = flatbuffer::CreateTile(builder, tile.get_height(), tile.get_width(), tile.get_transpose_of_faces()); @@ -126,7 +126,7 @@ flatbuffers::Offset to_flatbuffer( flatbuffer::CreateTilePageConfig(builder, flat_tile).Union(), to_flatbuffer(layout.get_memory_config(), builder), flat_alignment); - } else if (page_config.get_layout() == Layout::ROW_MAJOR) { + } else if (page_config.get_layout() == tt::tt_metal::Layout::ROW_MAJOR) { return flatbuffer::CreateTensorLayout( builder, to_flatbuffer(layout.get_data_type()), @@ -139,7 +139,7 @@ flatbuffers::Offset to_flatbuffer( } flatbuffers::Offset to_flatbuffer( - const TensorSpec& spec, flatbuffers::FlatBufferBuilder& builder) { + const tt::tt_metal::TensorSpec& spec, flatbuffers::FlatBufferBuilder& builder) { const auto& shape = spec.logical_shape(); auto flat_shape = builder.CreateVector(shape.view().data(), shape.rank()); return flatbuffer::CreateTensorSpec(builder, flat_shape, to_flatbuffer(spec.tensor_layout(), builder)); diff --git a/ttnn/cpp/ttnn/tensor/flatbuffer/tensor_types_to_flatbuffer.hpp b/ttnn/cpp/ttnn/tensor/flatbuffer/tensor_types_to_flatbuffer.hpp index ab7e3a2533e..69500eb3f74 100644 --- a/ttnn/cpp/ttnn/tensor/flatbuffer/tensor_types_to_flatbuffer.hpp +++ b/ttnn/cpp/ttnn/tensor/flatbuffer/tensor_types_to_flatbuffer.hpp @@ -18,20 +18,20 @@ flatbuffers::Offset to_flatbuffer( flatbuffers::Offset to_flatbuffer( flatbuffers::FlatBufferBuilder& builder, const CoreRangeSet& core_range_set); -flatbuffer::ShardOrientation to_flatbuffer(ShardOrientation orientation); 
-flatbuffer::ShardMode to_flatbuffer(ShardMode shard_mode); +flatbuffer::ShardOrientation to_flatbuffer(tt::tt_metal::ShardOrientation orientation); +flatbuffer::ShardMode to_flatbuffer(tt::tt_metal::ShardMode shard_mode); flatbuffers::Offset to_flatbuffer( - const ShardSpec& spec, flatbuffers::FlatBufferBuilder& builder); + const tt::tt_metal::ShardSpec& spec, flatbuffers::FlatBufferBuilder& builder); -flatbuffer::TensorMemoryLayout to_flatbuffer(TensorMemoryLayout layout); -flatbuffer::BufferType to_flatbuffer(BufferType type); -flatbuffer::DataType to_flatbuffer(DataType type); +flatbuffer::TensorMemoryLayout to_flatbuffer(tt::tt_metal::TensorMemoryLayout layout); +flatbuffer::BufferType to_flatbuffer(tt::tt_metal::BufferType type); +flatbuffer::DataType to_flatbuffer(tt::tt_metal::DataType type); flatbuffers::Offset to_flatbuffer( - const MemoryConfig& config, flatbuffers::FlatBufferBuilder& builder); + const tt::tt_metal::MemoryConfig& config, flatbuffers::FlatBufferBuilder& builder); flatbuffers::Offset to_flatbuffer( - const TensorLayout& layout, flatbuffers::FlatBufferBuilder& builder); + const tt::tt_metal::TensorLayout& layout, flatbuffers::FlatBufferBuilder& builder); flatbuffers::Offset to_flatbuffer( - const TensorSpec& spec, flatbuffers::FlatBufferBuilder& builder); + const tt::tt_metal::TensorSpec& spec, flatbuffers::FlatBufferBuilder& builder); } // namespace ttnn diff --git a/ttnn/cpp/ttnn/tensor/xtensor/partition.cpp b/ttnn/cpp/ttnn/tensor/xtensor/partition.cpp index 69a6d96504b..77028c8fc06 100644 --- a/ttnn/cpp/ttnn/tensor/xtensor/partition.cpp +++ b/ttnn/cpp/ttnn/tensor/xtensor/partition.cpp @@ -122,7 +122,7 @@ namespace adaptor { namespace { template -Tensor concat_impl(const std::vector& tensors, const TensorLayout& layout, int dim) { +Tensor concat_impl(const std::vector& tensors, const tt::tt_metal::TensorLayout& layout, int dim) { std::vector> xtensors; for (const auto& tensor : tensors) { xtensors.push_back(to_xtensor(tensor)); @@ -132,7 
+132,8 @@ Tensor concat_impl(const std::vector& tensors, const TensorLayout& layou } template -std::vector chunk_impl(const Tensor& tensor, const TensorLayout& layout, int num_chunks, int dim) { +std::vector chunk_impl( + const Tensor& tensor, const tt::tt_metal::TensorLayout& layout, int num_chunks, int dim) { xt::xarray xtensor = to_xtensor(tensor); auto xtensor_chunks = chunk(xtensor, num_chunks, dim); @@ -151,14 +152,19 @@ std::vector chunk_impl(const Tensor& tensor, const TensorLayout& layout, std::vector chunk(const Tensor& tensor, int num_chunks, int dim) { const auto& reference_layout = tensor.tensor_spec().tensor_layout(); switch (reference_layout.get_data_type()) { - case DataType::BFLOAT4_B: - case DataType::BFLOAT8_B: - case DataType::BFLOAT16: - case DataType::FLOAT32: return adaptor::chunk_impl(tensor, reference_layout, num_chunks, dim); - case DataType::INT32: return adaptor::chunk_impl(tensor, reference_layout, num_chunks, dim); - case DataType::UINT8: return adaptor::chunk_impl(tensor, reference_layout, num_chunks, dim); - case DataType::UINT16: return adaptor::chunk_impl(tensor, reference_layout, num_chunks, dim); - case DataType::UINT32: return adaptor::chunk_impl(tensor, reference_layout, num_chunks, dim); + case tt::tt_metal::DataType::BFLOAT4_B: + case tt::tt_metal::DataType::BFLOAT8_B: + case tt::tt_metal::DataType::BFLOAT16: + case tt::tt_metal::DataType::FLOAT32: + return adaptor::chunk_impl(tensor, reference_layout, num_chunks, dim); + case tt::tt_metal::DataType::INT32: + return adaptor::chunk_impl(tensor, reference_layout, num_chunks, dim); + case tt::tt_metal::DataType::UINT8: + return adaptor::chunk_impl(tensor, reference_layout, num_chunks, dim); + case tt::tt_metal::DataType::UINT16: + return adaptor::chunk_impl(tensor, reference_layout, num_chunks, dim); + case tt::tt_metal::DataType::UINT32: + return adaptor::chunk_impl(tensor, reference_layout, num_chunks, dim); default: TT_THROW("Unsupported data type: {}", 
reference_layout.get_data_type()); } } @@ -167,14 +173,14 @@ Tensor concat(const std::vector& tensors, int dim) { TT_FATAL(tensors.size() > 0, "Cannot concatenate an empty list of tensors"); const auto& reference_layout = tensors.front().tensor_spec().tensor_layout(); switch (reference_layout.get_data_type()) { - case DataType::BFLOAT4_B: - case DataType::BFLOAT8_B: - case DataType::BFLOAT16: - case DataType::FLOAT32: return adaptor::concat_impl(tensors, reference_layout, dim); - case DataType::INT32: return adaptor::concat_impl(tensors, reference_layout, dim); - case DataType::UINT8: return adaptor::concat_impl(tensors, reference_layout, dim); - case DataType::UINT16: return adaptor::concat_impl(tensors, reference_layout, dim); - case DataType::UINT32: return adaptor::concat_impl(tensors, reference_layout, dim); + case tt::tt_metal::DataType::BFLOAT4_B: + case tt::tt_metal::DataType::BFLOAT8_B: + case tt::tt_metal::DataType::BFLOAT16: + case tt::tt_metal::DataType::FLOAT32: return adaptor::concat_impl(tensors, reference_layout, dim); + case tt::tt_metal::DataType::INT32: return adaptor::concat_impl(tensors, reference_layout, dim); + case tt::tt_metal::DataType::UINT8: return adaptor::concat_impl(tensors, reference_layout, dim); + case tt::tt_metal::DataType::UINT16: return adaptor::concat_impl(tensors, reference_layout, dim); + case tt::tt_metal::DataType::UINT32: return adaptor::concat_impl(tensors, reference_layout, dim); default: TT_THROW("Unsupported data type: {}", reference_layout.get_data_type()); } } diff --git a/ttnn/cpp/ttnn/types.hpp b/ttnn/cpp/ttnn/types.hpp index aa19295ec5f..cd584c585aa 100644 --- a/ttnn/cpp/ttnn/types.hpp +++ b/ttnn/cpp/ttnn/types.hpp @@ -9,6 +9,7 @@ #include #include #include +#include #include "ttnn/distributed/types.hpp" #include "ttnn/tensor/tensor.hpp" @@ -18,12 +19,15 @@ namespace ttnn { namespace types { using IDevice = tt::tt_metal::IDevice; +using Program = tt::tt_metal::Program; constexpr auto TILE_SIZE = 32; using 
tt::tt_metal::BufferType; using tt::tt_metal::DataType; using tt::tt_metal::MemoryConfig; +using tt::tt_metal::ShardMode; +using tt::tt_metal::ShardOrientation; using tt::tt_metal::TensorMemoryLayout; static const auto DRAM_MEMORY_CONFIG = MemoryConfig{TensorMemoryLayout::INTERLEAVED, BufferType::DRAM}; diff --git a/ttnn/tools/profiler/op_profiler.hpp b/ttnn/tools/profiler/op_profiler.hpp index e8faca20043..597e077d9b3 100644 --- a/ttnn/tools/profiler/op_profiler.hpp +++ b/ttnn/tools/profiler/op_profiler.hpp @@ -391,7 +391,7 @@ inline std::string op_meta_data_serialized_json( return device_operation_t::create_op_performance_model( operation_attributes, tensor_args, tensor_return_value); } else { - return operation::OpPerformanceModel{}; + return tt::tt_metal::operation::OpPerformanceModel{}; } }(); j["performance_model"]["compute_ns"] = perfModel.get_compute_ns(); @@ -418,16 +418,16 @@ inline std::string op_meta_data_serialized_json( #define TracyOpTTNNDevice( \ operation, operation_id, device_id, program, operation_attributes, tensor_args, tensor_return_value) \ - std::string op_message = op_profiler::op_meta_data_serialized_json( \ + std::string op_message = tt::tt_metal::op_profiler::op_meta_data_serialized_json( \ operation, operation_id, device_id, program, operation_attributes, tensor_args, tensor_return_value); \ std::string op_text = fmt::format("id:{}", operation_id); \ ZoneText(op_text.c_str(), op_text.size()); \ TracyMessage(op_message.c_str(), op_message.size()); -#define TracyOpTTNNExternal(op_id, op, input_tensors) \ - std::string op_message = op_profiler::op_meta_data_serialized_json(op_id, op, input_tensors); \ - std::string op_text = fmt::format("id:{}", op_id); \ - ZoneText(op_text.c_str(), op_text.size()); \ +#define TracyOpTTNNExternal(op_id, op, input_tensors) \ + std::string op_message = tt::tt_metal::op_profiler::op_meta_data_serialized_json(op_id, op, input_tensors); \ + std::string op_text = fmt::format("id:{}", op_id); \ + 
ZoneText(op_text.c_str(), op_text.size()); \ TracyMessage(op_message.c_str(), op_message.size()); #else