Skip to content

Commit

Permalink
Enable multi-buffer per channel in EDM (#11387)
Browse files Browse the repository at this point in the history
#6300: Add multi-buffering per EDM channel

Adds the option to use multiple buffers (e.g. double buffering) per EDM
channel. This is useful for improving the performance of CCL operations.

To simplify the worker <-> EDM interface to allow a kernel to
automatically support multi-buffered channels, new adapter components
are added:
 - WorkerToEdmReader: for a worker pulling data from EDM
 - WorkerToEdmSender: for a worker pushing data to the EDM

These hide details such as buffer offsets in the channel and any other
details that may only be relevant to the EDM. Additionally, their use
encapsulates the worker <-> EDM data movement protocol, allowing future
low level changes to buffer layouts and allocations on the EDM without
requiring worker kernel changes.

As a coinciding, required step to enable this functionality, the EDM
channel count limit has been lifted to unlimited (limited only by how
many buffers can fit into L1). This provides additional flexibility for
op writers and lets `erisc_info::channels` be shrunk back to a
single entry.

Note that this commit only adds this feature, but does not yet enable it
for CCL ops.
  • Loading branch information
SeanNijjar authored Aug 18, 2024
1 parent 6ca0fbb commit 6c566aa
Show file tree
Hide file tree
Showing 34 changed files with 1,900 additions and 500 deletions.
2 changes: 2 additions & 0 deletions .github/workflows/ttnn-post-commit.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,8 @@ jobs:
fast_runtime_mode_off: true
- name: ttnn examples and cpp tests
cmd: ./build/test/ttnn/unit_tests_ttnn && ./tests/scripts/run_ttnn_examples.sh
- name: ttnn ccl cpp unit tests
cmd: ./build/test/ttnn/unit_tests_ttnn_ccl
name: ${{ matrix.test-group.name }} ${{ inputs.arch }} ${{ inputs.runner-label }}
env:
TT_METAL_ENV: ${{ vars.TT_METAL_ENV }}
Expand Down
2 changes: 1 addition & 1 deletion tests/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -8,5 +8,5 @@ add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/tt_metal/tt_metal)
add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/tt_eager) # this should go away and be replaced with link to ttnn
add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/ttnn/unit_tests/gtests)

set(TESTS_DEPENDS_LIST metal_tests eager_tests unit_tests_ttnn test_multi_device galaxy_unit_tests_ttnn ttnn watcher_dump)
set(TESTS_DEPENDS_LIST metal_tests eager_tests unit_tests_ttnn unit_tests_ttnn_ccl test_multi_device galaxy_unit_tests_ttnn ttnn watcher_dump)
add_custom_target(tests DEPENDS ${TESTS_DEPENDS_LIST})
1 change: 1 addition & 0 deletions tests/scripts/t3000/run_t3000_unit_tests.sh
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@ run_t3000_ttnn_tests() {
echo "LOG_METAL: Running run_t3000_ttnn_tests"
WH_ARCH_YAML=wormhole_b0_80_arch_eth_dispatch.yaml ./build/test/ttnn/test_multi_device
WH_ARCH_YAML=wormhole_b0_80_arch_eth_dispatch.yaml ./build/test/ttnn/unit_tests_ttnn
./build/test/ttnn/unit_tests_ttnn_ccl
WH_ARCH_YAML=wormhole_b0_80_arch_eth_dispatch.yaml pytest tests/ttnn/unit_tests/test_multi_device_trace.py ; fail+=$?
WH_ARCH_YAML=wormhole_b0_80_arch_eth_dispatch.yaml pytest tests/ttnn/unit_tests/test_multi_device_events.py ; fail+=$?
pytest -n auto tests/ttnn/unit_tests/test_multi_device.py ; fail+=$?
Expand Down
3 changes: 2 additions & 1 deletion tests/tt_eager/ops/ccl/test_ccl_helpers.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,8 @@ TEST(CclHelpers, CreateEriscDatamoverBuilder_Chan4_PageSize2048_RRBufferSharingM
ttnn::ccl::EriscDataMoverBufferSharingMode buffer_sharing_mode = ttnn::ccl::EriscDataMoverBufferSharingMode::ROUND_ROBIN;
ttnn::ccl::EriscDataMoverTerminationMode termination_mode = ttnn::ccl::EriscDataMoverTerminationMode::MESSAGE_COUNT_REACHED;

auto edm_builder = create_erisc_datamover_builder(num_channels, page_size, buffer_sharing_mode, termination_mode);
std::size_t num_buffers_per_channel = 1;
auto edm_builder = create_erisc_datamover_builder(num_channels, page_size, num_buffers_per_channel, buffer_sharing_mode, termination_mode);
std::vector<uint32_t> worker_semaphore_ids = {0, 1, 2, 3};
std::vector<uint32_t> message_counts = {256, 512, 24, 1};
std::vector<std::vector<ttnn::ccl::WorkerXY>> const& worker_coords = {
Expand Down

This file was deleted.

This file was deleted.

This file was deleted.

Original file line number Diff line number Diff line change
Expand Up @@ -120,7 +120,6 @@ void kernel_main() {
channel_addrs[sender_channel],
channel_addrs[sender_channel],
message_size_payload,
sender_channel,
message_size_payload,
message_size_payload_eth_words + 1);
ready_to_send_payload &= ~(1 << s_i);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,6 @@ FORCE_INLINE void run_loop_iteration(
reinterpret_cast<uint32_t>(channel_sync_addrs[i]),
reinterpret_cast<uint32_t>(channel_sync_addrs[i]),
sizeof(eth_channel_sync_t),
i, // remove this field - it's superfluous
sizeof(eth_channel_sync_t),
sizeof(eth_channel_sync_t) >> 4);
}
Expand All @@ -67,7 +66,6 @@ FORCE_INLINE void run_loop_iteration(
reinterpret_cast<uint32_t>(channel_sync_addrs[i]),
reinterpret_cast<uint32_t>(channel_sync_addrs[i]),
sizeof(eth_channel_sync_t),
i, // remove this field - it's superfluous
sizeof(eth_channel_sync_t),
sizeof(eth_channel_sync_t) >> 4);
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,6 @@ FORCE_INLINE void run_loop_iteration(
channel_addrs[i],
channel_addrs[i],
full_payload_size,
i,
full_payload_size,
full_payload_size_eth_words);
}
Expand All @@ -60,7 +59,6 @@ FORCE_INLINE void run_loop_iteration(
channel_addrs[i],
channel_addrs[i],
full_payload_size,
i,
full_payload_size,
full_payload_size_eth_words);
}
Expand Down
5 changes: 5 additions & 0 deletions tests/ttnn/unit_tests/gtests/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,12 @@ set(TTNN_UNIT_TESTS_SRC
${CMAKE_CURRENT_SOURCE_DIR}/test_reflect.cpp
${CMAKE_CURRENT_SOURCE_DIR}/test_to_and_from_json.cpp
)
set(TTNN_CCL_UNIT_TESTS_SRC
${CMAKE_CURRENT_SOURCE_DIR}/ccl/test_erisc_data_mover_with_workers.cpp
)

add_executable(unit_tests_ttnn ${TTNN_UNIT_TESTS_SRC})
add_executable(unit_tests_ttnn_ccl ${TTNN_CCL_UNIT_TESTS_SRC})
add_executable(test_multi_device ${CMAKE_CURRENT_SOURCE_DIR}/test_multi_device.cpp)
add_executable(galaxy_unit_tests_ttnn ${CMAKE_CURRENT_SOURCE_DIR}/test_ccl_on_tg.cpp)

Expand All @@ -28,5 +32,6 @@ endfunction()

# Set up properties for both targets
setup_ttnn_test_target(unit_tests_ttnn)
setup_ttnn_test_target(unit_tests_ttnn_ccl)
setup_ttnn_test_target(test_multi_device)
setup_ttnn_test_target(galaxy_unit_tests_ttnn)
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc.
//
// SPDX-License-Identifier: Apache-2.0

#include <cstdint>
#include <array>

#include "dataflow_api.h"
#include "ttnn/cpp/ttnn/operations/ccl/kernel_common/worker_edm_utils.hpp"
#include "ttnn/cpp/ttnn/operations/ccl/kernel_common/worker_edm_adapters.hpp"

// Worker-side test kernel: pulls pages from an erisc datamover (EDM) channel
// into circular buffer c_in0 through a ccl::edm::WorkerToEdmReader.
//
// Compile-time args:
//   0: eth_receiver_l1_base_addr - forwarded to the reader
//   1: eth_receiver_l1_sem_addr  - forwarded to the reader
//   2: num_buffers_per_channel   - forwarded to the reader
//   3: EDM termination mode (ttnn::ccl::EriscDataMoverTerminationMode)
//
// Runtime args:
//   0: num_pages_per_read_chunk
//   1: total_pages_to_read
//   2: page_size (presumably in bytes - TODO confirm against the host code)
//   3: receiver_erisc_datamover_noc_x
//   4: receiver_erisc_datamover_noc_y
//   5: id passed to get_semaphore() to obtain the worker-local semaphore
//
// NOTE: runtime arg 6 used to be read into a local that was never used. The
// buffer count handed to the reader comes from compile-time arg 2, so that
// dead read has been dropped to avoid two similarly named buffer counts.
void kernel_main() {
    constexpr uint32_t eth_receiver_l1_base_addr = get_compile_time_arg_val(0);
    constexpr uint32_t eth_receiver_l1_sem_addr = get_compile_time_arg_val(1);
    constexpr uint32_t num_buffers_per_channel = get_compile_time_arg_val(2);
    constexpr ttnn::ccl::EriscDataMoverTerminationMode termination_mode =
        static_cast<ttnn::ccl::EriscDataMoverTerminationMode>(get_compile_time_arg_val(3));

    const uint32_t num_pages_per_read_chunk = get_arg_val<uint32_t>(0);
    const uint32_t total_pages_to_read = get_arg_val<uint32_t>(1);
    const uint32_t page_size = get_arg_val<uint32_t>(2);
    const uint32_t receiver_erisc_datamover_noc_x = get_arg_val<uint32_t>(3);
    const uint32_t receiver_erisc_datamover_noc_y = get_arg_val<uint32_t>(4);
    // Worker local L1 semaphore that erisc datamover signals to
    volatile uint32_t* const receiver_read_sem_addr =
        reinterpret_cast<volatile uint32_t*>(get_semaphore(get_arg_val<uint32_t>(5)));

    // The reader is sized for one full chunk (pages per chunk * page size).
    ccl::edm::WorkerToEdmReader<termination_mode> reader(
        ttnn::ccl::WorkerXY(receiver_erisc_datamover_noc_x, receiver_erisc_datamover_noc_y),
        eth_receiver_l1_base_addr,
        num_buffers_per_channel,
        eth_receiver_l1_sem_addr,
        num_pages_per_read_chunk * page_size,
        receiver_read_sem_addr);

    constexpr uint32_t cb_id_in0 = tt::CB::c_in0;

    // Consume the stream one chunk at a time; the final chunk may be partial.
    for (uint32_t i = 0; i < total_pages_to_read; i += num_pages_per_read_chunk) {
        // True only for the iteration that covers the last remaining pages.
        const bool last_message = (i + num_pages_per_read_chunk) >= total_pages_to_read;
        // Clamp so the trailing chunk never reads past total_pages_to_read.
        const uint32_t num_pages_to_read = std::min(total_pages_to_read - i, num_pages_per_read_chunk);
        reader.wait_for_payload_available();
        reader.fetch_payload_blocking(cb_id_in0, num_pages_to_read, page_size, last_message);
    }

    reader.close();
}
Loading

0 comments on commit 6c566aa

Please sign in to comment.