Skip to content

Commit

Permalink
Add dram training status enum
Browse files Browse the repository at this point in the history
  • Loading branch information
pjanevskiTT committed Mar 3, 2025
1 parent d91bb76 commit e7bc2ba
Show file tree
Hide file tree
Showing 6 changed files with 43 additions and 18 deletions.
2 changes: 1 addition & 1 deletion device/api/umd/device/tt_device/blackhole_tt_device.h
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ class BlackholeTTDevice : public TTDevice {

void wait_arc_core_start(const tt_xy_pair arc_core, const uint32_t timeout_ms = 1000) override;

std::optional<uint32_t> get_dram_training_status() override;
std::vector<DramTrainingStatus> get_dram_training_status() override;

private:
static constexpr uint64_t ATU_OFFSET_IN_BH_BAR2 = 0x1200;
Expand Down
2 changes: 1 addition & 1 deletion device/api/umd/device/tt_device/tt_device.h
Original file line number Diff line number Diff line change
Expand Up @@ -133,7 +133,7 @@ class TTDevice {

// TODO: find a better way to expose this, probably by getting the telemetry reader and reading the
// required fields. Returns the DRAM training status of each DRAM channel; an empty vector means that the
// DRAM training status is not available.
virtual std::optional<uint32_t> get_dram_training_status();
virtual std::vector<DramTrainingStatus> get_dram_training_status();

protected:
std::unique_ptr<PCIDevice> pci_device_;
Expand Down
6 changes: 6 additions & 0 deletions device/api/umd/device/types/cluster_descriptor_types.h
Original file line number Diff line number Diff line change
Expand Up @@ -147,6 +147,12 @@ struct ChipInfo {
bool noc_translation_enabled;
};

// Training state of a single DRAM channel. TTDevice::get_dram_training_status() returns one value of this type per
// DRAM channel.
enum class DramTrainingStatus : uint8_t {
// Training has not finished yet: neither an error nor a completed training is reported for the channel.
IN_PROGRESS = 0,
// Training of the channel reported an error.
FAIL = 1,
// Training of the channel completed without an error.
SUCCESS = 2,
};

namespace std {
template <>
struct hash<eth_coord_t> {
Expand Down
18 changes: 7 additions & 11 deletions device/chip/local_chip.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -92,34 +92,30 @@ void LocalChip::wait_dram_cores_training(const uint32_t timeout_ms) {

auto start = std::chrono::system_clock::now();
while (true) {
std::optional<uint32_t> dram_training_status = tt_device->get_dram_training_status();
std::vector<DramTrainingStatus> dram_training_status = tt_device->get_dram_training_status();

if (!dram_training_status) {
if (dram_training_status.empty()) {
// DRAM training status is not available, breaking the wait for DRAM training.
break;
}

bool all_dram_channels_trained = true;
// Format of the dram training status is as follows:
// Each channel gets two bits in the 32-bit value (16 bits used). The lower bits are for lower channels.
// Lower of the two bits is for training error and higher of the two bits is for training status.
// Example: 0b 00 00 00 00 00 00 01 10
// would mean that only channel 0 is trained, channel 1 has the error and other are not trained and don't have
// errors. If some channel is harvested the bits are always going to be zero.
const uint32_t chip_num_dram_channels =
std::min(dram_training_status.size(), get_soc_descriptor().get_dram_cores().size());
const uint32_t dram_harvesting_mask = get_soc_descriptor().harvesting_masks.dram_harvesting_mask;
for (uint32_t dram_channel = 0; dram_channel < blackhole::NUM_DRAM_BANKS; dram_channel++) {
for (uint32_t dram_channel = 0; dram_channel < chip_num_dram_channels; dram_channel++) {
// Skip the check for harvested channels.
if (dram_harvesting_mask & (1 << dram_channel)) {
continue;
}

// Check if there is an error in training for the channel.
if (dram_training_status.value() & (1 << (2 * dram_channel))) {
if (dram_training_status[dram_channel] == DramTrainingStatus::FAIL) {
throw std::runtime_error("DRAM training failed");
}

// Verify whether the channel is trained.
all_dram_channels_trained &= (dram_training_status.value() & (1 << (2 * dram_channel + 1)));
all_dram_channels_trained &= (dram_training_status[dram_channel] == DramTrainingStatus::SUCCESS);
}

if (all_dram_channels_trained) {
Expand Down
31 changes: 27 additions & 4 deletions device/tt_device/blackhole_tt_device.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -141,12 +141,35 @@ void BlackholeTTDevice::wait_arc_core_start(const tt_xy_pair arc_core, const uin
}
}

std::optional<uint32_t> BlackholeTTDevice::get_dram_training_status() {
if (telemetry->is_entry_available(tt::umd::blackhole::TAG_DDR_STATUS)) {
return telemetry->read_entry(tt::umd::blackhole::TAG_DDR_STATUS);
// Reads the DRAM training status from telemetry and decodes it into one DramTrainingStatus per DRAM channel.
//
// Returns a vector with blackhole::NUM_DRAM_BANKS entries, indexed by DRAM channel, or an empty vector when the
// TAG_DDR_STATUS telemetry entry is not available.
std::vector<DramTrainingStatus> BlackholeTTDevice::get_dram_training_status() {
if (!telemetry->is_entry_available(tt::umd::blackhole::TAG_DDR_STATUS)) {
// No DDR status entry in telemetry: an empty vector signals "status not available" to the caller.
return {};
}

// NOTE(review): this `return std::nullopt;` looks like a leftover from the previous std::optional<uint32_t>
// version of this function (this view interleaves removed and added lines). It does not match the
// std::vector return type and would make the code below unreachable - confirm it is absent from the real file.
return std::nullopt;
uint32_t telemetry_data = telemetry->read_entry(tt::umd::blackhole::TAG_DDR_STATUS);
std::vector<DramTrainingStatus> dram_training_status;
const uint32_t num_dram_channels = blackhole::NUM_DRAM_BANKS;
// Format of the DRAM training status is as follows:
// Each channel gets two bits in the 32-bit value (16 bits used). The lower bits are for lower channels.
// The lower of the two bits is the training error flag and the higher of the two bits is the training status.
// Example: 0b 00 00 00 00 00 00 01 10
// would mean that only channel 0 is trained, channel 1 has an error and the other channels are neither trained
// nor in error. If a channel is harvested, its bits are always zero.
for (uint32_t dram_channel = 0; dram_channel < num_dram_channels; dram_channel++) {
// The error bit is checked first, so a channel with the error bit set is reported as FAIL regardless of its
// status bit.
if (telemetry_data & (1 << (2 * dram_channel))) {
dram_training_status.push_back(DramTrainingStatus::FAIL);
continue;
}

// Status bit set and no error: the channel finished training.
if (telemetry_data & (1 << (2 * dram_channel + 1))) {
dram_training_status.push_back(DramTrainingStatus::SUCCESS);
continue;
}

// Neither bit is set: training has not completed. Harvested channels also land here because their bits stay
// zero.
dram_training_status.push_back(DramTrainingStatus::IN_PROGRESS);
}

return dram_training_status;
}

} // namespace tt::umd
2 changes: 1 addition & 1 deletion device/tt_device/tt_device.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -351,6 +351,6 @@ void TTDevice::wait_arc_core_start(const tt_xy_pair arc_core, const uint32_t tim
throw std::runtime_error("Waiting for ARC core to start is supported only for Blackhole TTDevice.");
}

std::optional<uint32_t> TTDevice::get_dram_training_status() { return std::nullopt; }
std::vector<DramTrainingStatus> TTDevice::get_dram_training_status() { return {}; }

} // namespace tt::umd

0 comments on commit e7bc2ba

Please sign in to comment.