diff --git a/device/api/umd/device/tt_device/blackhole_tt_device.h b/device/api/umd/device/tt_device/blackhole_tt_device.h index 3e4e421f..a764d33f 100644 --- a/device/api/umd/device/tt_device/blackhole_tt_device.h +++ b/device/api/umd/device/tt_device/blackhole_tt_device.h @@ -23,7 +23,7 @@ class BlackholeTTDevice : public TTDevice { void wait_arc_core_start(const tt_xy_pair arc_core, const uint32_t timeout_ms = 1000) override; - std::optional get_dram_training_status() override; + std::vector get_dram_training_status() override; private: static constexpr uint64_t ATU_OFFSET_IN_BH_BAR2 = 0x1200; diff --git a/device/api/umd/device/tt_device/tt_device.h b/device/api/umd/device/tt_device/tt_device.h index 888ef564..095cceb0 100644 --- a/device/api/umd/device/tt_device/tt_device.h +++ b/device/api/umd/device/tt_device/tt_device.h @@ -133,7 +133,7 @@ class TTDevice { // TODO: find a way to expose this in a better way, probably through getting telemetry reader and reading the // required fields. Returns the information whether DRAM training status is available and the status value. - virtual std::optional get_dram_training_status(); + virtual std::vector get_dram_training_status(); protected: std::unique_ptr pci_device_; diff --git a/device/api/umd/device/types/cluster_descriptor_types.h b/device/api/umd/device/types/cluster_descriptor_types.h index 0cc6d2d7..c7cf005a 100644 --- a/device/api/umd/device/types/cluster_descriptor_types.h +++ b/device/api/umd/device/types/cluster_descriptor_types.h @@ -147,6 +147,12 @@ struct ChipInfo { bool noc_translation_enabled; }; +enum class DramTrainingStatus : uint8_t { + IN_PROGRESS = 0, + FAIL = 1, + SUCCESS = 2, +}; + namespace std { template <> struct hash { diff --git a/device/chip/local_chip.cpp b/device/chip/local_chip.cpp index 7ee3820e..6a2c27be 100644 --- a/device/chip/local_chip.cpp +++ b/device/chip/local_chip.cpp @@ -92,34 +92,30 @@ void LocalChip::wait_dram_cores_training(const uint32_t timeout_ms) { auto start = std::chrono::system_clock::now(); while (true) { - std::optional dram_training_status = tt_device->get_dram_training_status(); + std::vector dram_training_status = tt_device->get_dram_training_status(); - if (!dram_training_status) { + if (dram_training_status.empty()) { // DRAM training status is not available, breaking the wait for DRAM training. break; } bool all_dram_channels_trained = true; - // Format of the dram training status is as follows: - // Each channel gets two bits in the 32-bit value (16 bits used). The lower bits are for lower channels. - // Lower of the two bits is for training error and higher of the two bits is for training status. - // Example: 0b 00 00 00 00 00 00 01 10 - // would mean that only channel 0 is trained, channel 1 has the error and other are not trained and don't have - // errors. If some channel is harvested the bits are always going to be zero. + const uint32_t chip_num_dram_channels = + std::min(dram_training_status.size(), get_soc_descriptor().get_dram_cores().size()); const uint32_t dram_harvesting_mask = get_soc_descriptor().harvesting_masks.dram_harvesting_mask; - for (uint32_t dram_channel = 0; dram_channel < blackhole::NUM_DRAM_BANKS; dram_channel++) { + for (uint32_t dram_channel = 0; dram_channel < chip_num_dram_channels; dram_channel++) { // Skip the check for harvested channels. if (dram_harvesting_mask & (1 << dram_channel)) { continue; } // Check if there is an error in training for the channel. - if (dram_training_status.value() & (1 << (2 * dram_channel))) { + if (dram_training_status[dram_channel] == DramTrainingStatus::FAIL) { throw std::runtime_error("DRAM training failed"); } // Verify whether the channel is trained. - all_dram_channels_trained &= (dram_training_status.value() & (1 << (2 * dram_channel + 1))); + all_dram_channels_trained &= (dram_training_status[dram_channel] == DramTrainingStatus::SUCCESS); } if (all_dram_channels_trained) { diff --git a/device/tt_device/blackhole_tt_device.cpp b/device/tt_device/blackhole_tt_device.cpp index d7d42f95..8fac0214 100644 --- a/device/tt_device/blackhole_tt_device.cpp +++ b/device/tt_device/blackhole_tt_device.cpp @@ -141,12 +141,35 @@ void BlackholeTTDevice::wait_arc_core_start(const tt_xy_pair arc_core, const uin } } -std::optional BlackholeTTDevice::get_dram_training_status() { - if (telemetry->is_entry_available(tt::umd::blackhole::TAG_DDR_STATUS)) { - return telemetry->read_entry(tt::umd::blackhole::TAG_DDR_STATUS); +std::vector BlackholeTTDevice::get_dram_training_status() { + if (!telemetry->is_entry_available(tt::umd::blackhole::TAG_DDR_STATUS)) { + return {}; } - return std::nullopt; + uint32_t telemetry_data = telemetry->read_entry(tt::umd::blackhole::TAG_DDR_STATUS); + std::vector dram_training_status; + const uint32_t num_dram_channels = blackhole::NUM_DRAM_BANKS; + // Format of the dram training status is as follows: + // Each channel gets two bits in the 32-bit value (16 bits used). The lower bits are for lower channels. + // Lower of the two bits is for training error and higher of the two bits is for training status. + // Example: 0b 00 00 00 00 00 00 01 10 + // would mean that only channel 0 is trained, channel 1 has the error and other are not trained and don't have + // errors. If some channel is harvested the bits are always going to be zero. + for (uint32_t dram_channel = 0; dram_channel < num_dram_channels; dram_channel++) { + if (telemetry_data & (1 << (2 * dram_channel))) { + dram_training_status.push_back(DramTrainingStatus::FAIL); + continue; + } + + if (telemetry_data & (1 << (2 * dram_channel + 1))) { + dram_training_status.push_back(DramTrainingStatus::SUCCESS); + continue; + } + + dram_training_status.push_back(DramTrainingStatus::IN_PROGRESS); + } + + return dram_training_status; } } // namespace tt::umd diff --git a/device/tt_device/tt_device.cpp b/device/tt_device/tt_device.cpp index 9bb8059b..96873e61 100644 --- a/device/tt_device/tt_device.cpp +++ b/device/tt_device/tt_device.cpp @@ -351,6 +351,6 @@ void TTDevice::wait_arc_core_start(const tt_xy_pair arc_core, const uint32_t tim throw std::runtime_error("Waiting for ARC core to start is supported only for Blackhole TTDevice."); } -std::optional TTDevice::get_dram_training_status() { return std::nullopt; } +std::vector TTDevice::get_dram_training_status() { return {}; } } // namespace tt::umd