Skip to content

Commit

Permalink
Implement wait for Blackhole DRAM training
Browse files Browse the repository at this point in the history
  • Loading branch information
pjanevskiTT committed Feb 28, 2025
1 parent 7366ec8 commit 9bc2211
Show file tree
Hide file tree
Showing 8 changed files with 81 additions and 1 deletion.
2 changes: 2 additions & 0 deletions device/api/umd/device/chip/chip.h
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,8 @@ class Chip {
void wait_chip_to_be_ready();

virtual void wait_eth_cores_training(const uint32_t timeout_ms = 60000);

// Hook for waiting until DRAM training has finished; called from wait_chip_to_be_ready() after
// wait_eth_cores_training(). The base implementation in chip.cpp is a no-op, derived chips override it.
// timeout_ms is the maximum wait in milliseconds. Default arguments of virtual functions are bound by the
// static type at the call site, so overrides should keep the same default value.
virtual void wait_dram_cores_training(const uint32_t timeout_ms = 60000);
};

} // namespace tt::umd
2 changes: 2 additions & 0 deletions device/api/umd/device/chip/local_chip.h
Original file line number Diff line number Diff line change
Expand Up @@ -29,5 +29,7 @@ class LocalChip : public Chip {

protected:
void wait_eth_cores_training(const uint32_t timeout_ms = 60000) override;

// Waits for DRAM training on a locally attached chip. The implementation in local_chip.cpp returns
// immediately for non-Blackhole architectures and when the device reports no DRAM training status; it
// throws std::runtime_error on a reported training error or when timeout_ms (milliseconds) is exceeded.
void wait_dram_cores_training(const uint32_t timeout_ms = 60000) override;
};
} // namespace tt::umd
2 changes: 2 additions & 0 deletions device/api/umd/device/tt_device/blackhole_tt_device.h
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,8 @@ class BlackholeTTDevice : public TTDevice {

void wait_arc_core_start(const tt_xy_pair arc_core, const uint32_t timeout_ms = 1000) override;

// Returns {true, status} when the TAG_DDR_STATUS telemetry entry is available, where status is the raw
// value read from telemetry; returns {false, 0} otherwise.
std::pair<bool, uint32_t> get_dram_training_status() override;

private:
static constexpr uint64_t ATU_OFFSET_IN_BH_BAR2 = 0x1200;
std::set<size_t> iatu_regions_;
Expand Down
4 changes: 4 additions & 0 deletions device/api/umd/device/tt_device/tt_device.h
Original file line number Diff line number Diff line change
Expand Up @@ -131,6 +131,10 @@ class TTDevice {

virtual void wait_arc_core_start(const tt_xy_pair arc_core, const uint32_t timeout_ms = 1000);

// Returns a pair {available, status}: `available` tells whether the DRAM training status could be obtained,
// and `status` is the raw status value (meaningful only when `available` is true). The base implementation
// in tt_device.cpp always returns {false, 0}.
// TODO: find a better way to expose this, probably by handing out the telemetry reader and letting the
// caller read the required fields.
virtual std::pair<bool, uint32_t> get_dram_training_status();

protected:
std::unique_ptr<PCIDevice> pci_device_;
std::unique_ptr<architecture_implementation> architecture_impl_;
Expand Down
7 changes: 6 additions & 1 deletion device/chip/chip.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -51,8 +51,13 @@ void Chip::set_barrier_address_params(const barrier_address_params& barrier_addr

// Accessor for the chip information stored in this Chip; hands back a reference to the member, not a copy.
const ChipInfo& Chip::get_chip_info() {
    return this->chip_info_;
}

void Chip::wait_chip_to_be_ready() { wait_eth_cores_training(); }
void Chip::wait_chip_to_be_ready() {
wait_eth_cores_training();
wait_dram_cores_training();
}

// Base implementation of the Ethernet training wait: intentionally does nothing and ignores timeout_ms.
void Chip::wait_eth_cores_training(const uint32_t timeout_ms) {
    // Nothing to wait for in the generic Chip.
}

// Base implementation of the DRAM training wait: intentionally does nothing and ignores timeout_ms.
void Chip::wait_dram_cores_training(const uint32_t timeout_ms) {
    // Nothing to wait for in the generic Chip.
}

} // namespace tt::umd
53 changes: 53 additions & 0 deletions device/chip/local_chip.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
#include "umd/device/chip/local_chip.h"

#include "logger.hpp"
#include "umd/device/blackhole_implementation.h"
#include "umd/device/tt_device/tlb_manager.h"
#include "umd/device/tt_device/tt_device.h"
#include "umd/device/types/blackhole_eth.h"
Expand Down Expand Up @@ -82,4 +83,56 @@ void LocalChip::wait_eth_cores_training(const uint32_t timeout_ms) {
}
}

// Polls the device until every non-harvested Blackhole DRAM channel reports that training has completed.
//
// Returns immediately on non-Blackhole architectures, and also when the device reports that the DRAM
// training status is not available.
//
// timeout_ms: maximum time to keep polling, in milliseconds.
// Throws std::runtime_error if a non-harvested channel reports a training error, or if training has not
// completed once more than timeout_ms milliseconds have elapsed.
void LocalChip::wait_dram_cores_training(const uint32_t timeout_ms) {
    TTDevice* tt_device = get_tt_device();

    if (tt_device->get_arch() != tt::ARCH::BLACKHOLE) {
        return;
    }

    // A monotonic clock is used so that wall-clock adjustments cannot shorten or extend the timeout.
    const auto start = std::chrono::steady_clock::now();
    while (true) {
        const auto [dram_training_info_available, dram_training_status] = tt_device->get_dram_training_status();

        if (!dram_training_info_available) {
            // DRAM training status is not available, so there is nothing to wait for.
            break;
        }

        bool all_dram_channels_trained = true;
        // Format of the DRAM training status:
        // Each channel gets two bits in the 32-bit value (16 bits used). The lower bits are for lower channels.
        // The lower of the two bits signals a training error, the higher one signals that training is done.
        // Example: 0b 00 00 00 00 00 00 01 10
        // means that channel 0 is trained, channel 1 has an error, and the other channels are neither trained
        // nor in error. For a harvested channel both bits are always zero.
        const uint32_t dram_harvesting_mask = get_soc_descriptor().harvesting_masks.dram_harvesting_mask;
        for (uint32_t dram_channel = 0; dram_channel < blackhole::NUM_DRAM_BANKS; dram_channel++) {
            // Skip the check for harvested channels.
            if (dram_harvesting_mask & (1u << dram_channel)) {
                continue;
            }

            const uint32_t training_error_bit = 1u << (2 * dram_channel);
            const uint32_t training_done_bit = 1u << (2 * dram_channel + 1);

            // Check if there is an error in training for the channel.
            if (dram_training_status & training_error_bit) {
                throw std::runtime_error("DRAM training failed");
            }

            // Verify whether the channel is trained. The masked value must be compared against zero
            // explicitly: `bool &= (status & mask)` would AND the promoted bool (value 1) with a mask that
            // never has bit 0 set, which always yields false. The loop deliberately keeps going after an
            // untrained channel so that errors on later channels are still detected.
            if ((dram_training_status & training_done_bit) == 0) {
                all_dram_channels_trained = false;
            }
        }

        if (all_dram_channels_trained) {
            break;
        }

        const auto end = std::chrono::steady_clock::now();
        const auto duration = std::chrono::duration_cast<std::chrono::milliseconds>(end - start);
        if (duration.count() > timeout_ms) {
            throw std::runtime_error(fmt::format("DRAM training timed out after {} ms", timeout_ms));
        }
    }
}

} // namespace tt::umd
10 changes: 10 additions & 0 deletions device/tt_device/blackhole_tt_device.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -141,4 +141,14 @@ void BlackholeTTDevice::wait_arc_core_start(const tt_xy_pair arc_core, const uin
}
}

// Reads the DDR status from telemetry.
// Returns {false, 0} when the TAG_DDR_STATUS entry is not available, otherwise {true, <value read>}.
std::pair<bool, uint32_t> BlackholeTTDevice::get_dram_training_status() {
    // Guard: without the telemetry entry there is no status to report.
    if (!telemetry->is_entry_available(tt::umd::blackhole::TAG_DDR_STATUS)) {
        return {false, 0};
    }

    const uint32_t status_value = telemetry->read_entry(tt::umd::blackhole::TAG_DDR_STATUS);
    return {true, status_value};
}

} // namespace tt::umd
2 changes: 2 additions & 0 deletions device/tt_device/tt_device.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -351,4 +351,6 @@ void TTDevice::wait_arc_core_start(const tt_xy_pair arc_core, const uint32_t tim
throw std::runtime_error("Waiting for ARC core to start is supported only for Blackhole TTDevice.");
}

// Default implementation: reports that no DRAM training status is available (first element false) together
// with a zero status value.
std::pair<bool, uint32_t> TTDevice::get_dram_training_status() {
    const bool status_available = false;
    const uint32_t status_value = 0;
    return {status_available, status_value};
}

} // namespace tt::umd

0 comments on commit 9bc2211

Please sign in to comment.