From 9bc2211ee10b186aae092a8d131845ecc1fe9097 Mon Sep 17 00:00:00 2001
From: pjanevski
Date: Tue, 25 Feb 2025 13:04:48 +0000
Subject: [PATCH] Implement wait for Blackhole DRAM training

---
 device/api/umd/device/chip/chip.h             |  2 +
 device/api/umd/device/chip/local_chip.h       |  2 +
 .../device/tt_device/blackhole_tt_device.h    |  2 +
 device/api/umd/device/tt_device/tt_device.h   |  4 ++
 device/chip/chip.cpp                          |  7 ++-
 device/chip/local_chip.cpp                    | 59 +++++++++++++++++++
 device/tt_device/blackhole_tt_device.cpp      | 10 ++++
 device/tt_device/tt_device.cpp                |  2 +
 8 files changed, 87 insertions(+), 1 deletion(-)

diff --git a/device/api/umd/device/chip/chip.h b/device/api/umd/device/chip/chip.h
index 25460896..d3378768 100644
--- a/device/api/umd/device/chip/chip.h
+++ b/device/api/umd/device/chip/chip.h
@@ -52,6 +52,8 @@ class Chip {
     void wait_chip_to_be_ready();
 
     virtual void wait_eth_cores_training(const uint32_t timeout_ms = 60000);
+
+    virtual void wait_dram_cores_training(const uint32_t timeout_ms = 60000);
 };
 
 } // namespace tt::umd
diff --git a/device/api/umd/device/chip/local_chip.h b/device/api/umd/device/chip/local_chip.h
index 450f0158..b9f7b4a4 100644
--- a/device/api/umd/device/chip/local_chip.h
+++ b/device/api/umd/device/chip/local_chip.h
@@ -29,5 +29,7 @@ class LocalChip : public Chip {
 
 protected:
     void wait_eth_cores_training(const uint32_t timeout_ms = 60000) override;
+
+    void wait_dram_cores_training(const uint32_t timeout_ms = 60000) override;
 };
 } // namespace tt::umd
diff --git a/device/api/umd/device/tt_device/blackhole_tt_device.h b/device/api/umd/device/tt_device/blackhole_tt_device.h
index d373192f..afb6974b 100644
--- a/device/api/umd/device/tt_device/blackhole_tt_device.h
+++ b/device/api/umd/device/tt_device/blackhole_tt_device.h
@@ -23,6 +23,8 @@ class BlackholeTTDevice : public TTDevice {
 
     void wait_arc_core_start(const tt_xy_pair arc_core, const uint32_t timeout_ms = 1000) override;
 
+    std::pair<bool, uint32_t> get_dram_training_status() override;
+
 private:
     static constexpr uint64_t ATU_OFFSET_IN_BH_BAR2 = 0x1200;
     std::set iatu_regions_;
diff --git a/device/api/umd/device/tt_device/tt_device.h b/device/api/umd/device/tt_device/tt_device.h
index 81799931..f4e3c7af 100644
--- a/device/api/umd/device/tt_device/tt_device.h
+++ b/device/api/umd/device/tt_device/tt_device.h
@@ -131,6 +131,10 @@ class TTDevice {
 
     virtual void wait_arc_core_start(const tt_xy_pair arc_core, const uint32_t timeout_ms = 1000);
 
+    // TODO: find a way to expose this in a better way, probably through getting telemetry reader and reading the
+    // required fields. Returns the information whether DRAM training status is available and the status value.
+    virtual std::pair<bool, uint32_t> get_dram_training_status();
+
 protected:
     std::unique_ptr pci_device_;
     std::unique_ptr architecture_impl_;
diff --git a/device/chip/chip.cpp b/device/chip/chip.cpp
index 53016957..8b4d2c1b 100644
--- a/device/chip/chip.cpp
+++ b/device/chip/chip.cpp
@@ -51,8 +51,13 @@ void Chip::set_barrier_address_params(const barrier_address_params& barrier_addr
 
 const ChipInfo& Chip::get_chip_info() { return chip_info_; }
 
-void Chip::wait_chip_to_be_ready() { wait_eth_cores_training(); }
+void Chip::wait_chip_to_be_ready() {
+    wait_eth_cores_training();
+    wait_dram_cores_training();
+}
 
 void Chip::wait_eth_cores_training(const uint32_t timeout_ms) {}
 
+void Chip::wait_dram_cores_training(const uint32_t timeout_ms) {}
+
 } // namespace tt::umd
diff --git a/device/chip/local_chip.cpp b/device/chip/local_chip.cpp
index 60e08c81..e5b724b2 100644
--- a/device/chip/local_chip.cpp
+++ b/device/chip/local_chip.cpp
@@ -7,6 +7,7 @@
 #include "umd/device/chip/local_chip.h"
 
 #include "logger.hpp"
+#include "umd/device/blackhole_implementation.h"
 #include "umd/device/tt_device/tlb_manager.h"
 #include "umd/device/tt_device/tt_device.h"
 #include "umd/device/types/blackhole_eth.h"
@@ -82,4 +83,62 @@ void LocalChip::wait_eth_cores_training(const uint32_t timeout_ms) {
     }
 }
 
+// Polls the DRAM training status until every non-harvested DRAM channel reports that it is trained.
+// Returns immediately on non-Blackhole chips, and stops waiting if the device reports no training status.
+// Throws std::runtime_error if any checked channel reports a training error, or if the channels are not
+// all trained within timeout_ms milliseconds.
+void LocalChip::wait_dram_cores_training(const uint32_t timeout_ms) {
+    if (get_tt_device()->get_arch() != tt::ARCH::BLACKHOLE) {
+        return;
+    }
+
+    TTDevice* tt_device = get_tt_device();
+
+    // Measure the timeout on a monotonic clock so that wall-clock adjustments cannot shorten or extend it.
+    const auto start = std::chrono::steady_clock::now();
+    while (true) {
+        const auto [dram_training_info_available, dram_training_status] = tt_device->get_dram_training_status();
+
+        if (!dram_training_info_available) {
+            // DRAM training status is not available, breaking the wait for DRAM training.
+            break;
+        }
+
+        bool all_dram_channels_trained = true;
+        // Format of the dram training status is as follows:
+        // Each channel gets two bits in the 32-bit value (16 bits used). The lower bits are for lower channels.
+        // Lower of the two bits is for training error and higher of the two bits is for training status.
+        // Example: 0b 00 00 00 00 00 00 01 10
+        // would mean that only channel 0 is trained, channel 1 has the error and other are not trained and don't have
+        // errors. If some channel is harvested the bits are always going to be zero.
+        const uint32_t dram_harvesting_mask = get_soc_descriptor().harvesting_masks.dram_harvesting_mask;
+        for (uint32_t dram_channel = 0; dram_channel < blackhole::NUM_DRAM_BANKS; dram_channel++) {
+            // Skip the check for harvested channels.
+            if (dram_harvesting_mask & (1u << dram_channel)) {
+                continue;
+            }
+
+            // Check if there is an error in training for the channel.
+            if (dram_training_status & (1u << (2 * dram_channel))) {
+                throw std::runtime_error("DRAM training failed");
+            }
+
+            // Verify whether the channel is trained. The masked value is converted to bool explicitly:
+            // and-ing the raw mask into the flag would promote the flag to 1 and test it against a bit
+            // that is never bit 0, which always yields false.
+            all_dram_channels_trained &= (dram_training_status & (1u << (2 * dram_channel + 1))) != 0;
+        }
+
+        if (all_dram_channels_trained) {
+            break;
+        }
+
+        const auto end = std::chrono::steady_clock::now();
+        const auto duration = std::chrono::duration_cast<std::chrono::milliseconds>(end - start);
+        if (duration.count() > timeout_ms) {
+            throw std::runtime_error(fmt::format("DRAM training timed out after {} ms", timeout_ms));
+        }
+    }
+}
+
 } // namespace tt::umd
diff --git a/device/tt_device/blackhole_tt_device.cpp b/device/tt_device/blackhole_tt_device.cpp
index 72c00682..f15c28b7 100644
--- a/device/tt_device/blackhole_tt_device.cpp
+++ b/device/tt_device/blackhole_tt_device.cpp
@@ -141,4 +141,14 @@ void BlackholeTTDevice::wait_arc_core_start(const tt_xy_pair arc_core, const uin
     }
 }
 
+std::pair<bool, uint32_t> BlackholeTTDevice::get_dram_training_status() {
+    if (telemetry->is_entry_available(tt::umd::blackhole::TAG_DDR_STATUS)) {
+        uint32_t ddr_status = telemetry->read_entry(tt::umd::blackhole::TAG_DDR_STATUS);
+
+        return {true, ddr_status};
+    }
+
+    return {false, 0};
+}
+
 } // namespace tt::umd
diff --git a/device/tt_device/tt_device.cpp b/device/tt_device/tt_device.cpp
index 5941d137..6f362118 100644
--- a/device/tt_device/tt_device.cpp
+++ b/device/tt_device/tt_device.cpp
@@ -351,4 +351,6 @@ void TTDevice::wait_arc_core_start(const tt_xy_pair arc_core, const uint32_t tim
     throw std::runtime_error("Waiting for ARC core to start is supported only for Blackhole TTDevice.");
 }
 
+std::pair<bool, uint32_t> TTDevice::get_dram_training_status() { return {false, 0}; }
+
 } // namespace tt::umd