Remove setup_core_to_tlb_map #403

Merged
7 commits merged on Dec 17, 2024

Changes from all commits
27 changes: 8 additions & 19 deletions device/api/umd/device/cluster.h
@@ -83,18 +83,6 @@ class tt_device {
throw std::runtime_error("---- tt_device::set_fallback_tlb_ordering_mode is not implemented\n");
}

/**
* Give UMD a 1:1 function mapping a core to its appropriate static TLB (currently only support a single TLB per
* core).
*
* @param logical_device_id MMIO chip being targeted.
* @param mapping_function Function which maps core to TLB index.
*/
virtual void setup_core_to_tlb_map(
const chip_id_t logical_device_id, std::function<std::int32_t(tt_xy_pair)> mapping_function) {
throw std::runtime_error("---- tt_device::setup_core_to_tlb_map is not implemented\n");
}

/**
* Pass in ethernet cores with active links for a specific MMIO chip. When called, this function will force UMD to
* use a subset of cores from the active_eth_cores_per_chip set for all host->cluster non-MMIO transfers. If this
@@ -531,8 +519,6 @@ class Cluster : public tt_device {
uint64_t address,
uint64_t ordering = TLB_DATA::Posted);
virtual void set_fallback_tlb_ordering_mode(const std::string& fallback_tlb, uint64_t ordering = TLB_DATA::Posted);
virtual void setup_core_to_tlb_map(
const chip_id_t logical_device_id, std::function<std::int32_t(tt_xy_pair)> mapping_function);
virtual void configure_active_ethernet_cores_for_mmio_device(
chip_id_t mmio_chip, const std::unordered_set<tt_xy_pair>& active_eth_cores_per_chip);
virtual void start_device(const tt_device_params& device_params);
@@ -752,8 +738,16 @@ class Cluster : public tt_device {
int timeout = 1,
uint32_t* return_3 = nullptr,
uint32_t* return_4 = nullptr);

// TODO: These will be moved to a dedicated class for TLB management
bool address_in_tlb_space(
uint64_t address, uint32_t size_in_bytes, int32_t tlb_index, uint64_t tlb_size, uint32_t chip);
bool is_tlb_mapped(tt_cxy_pair target);
bool is_tlb_mapped(tt_cxy_pair target, uint64_t address, uint32_t size_in_bytes);
// Note that these maps hold only entries for local PCIe chips.
std::map<chip_id_t, std::unordered_map<int32_t, uint64_t>> tlb_config_map = {};
std::unordered_map<chip_id_t, std::unordered_map<tt_xy_pair, std::int32_t>> map_core_to_tlb_per_chip = {};

std::shared_ptr<boost::interprocess::named_mutex> get_mutex(const std::string& tlb_name, int pci_interface_id);
virtual uint32_t get_harvested_noc_rows_for_chip(
int logical_device_id); // Returns one-hot encoded harvesting mask for PCIe mapped chips
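
With the callback removed, all static-TLB bookkeeping lives in the two member maps added just above. A rough sketch of their shape, inferred from how configure_tlb() fills them later in this diff:

// map_core_to_tlb_per_chip : chip_id_t -> (tt_xy_pair core -> int32_t static TLB index)
// tlb_config_map           : chip_id_t -> (int32_t TLB index -> uint64_t TLB base address)
// Both are populated by configure_tlb() (device/cluster.cpp below) and consulted by
// is_tlb_mapped() / get_tlb_data_from_target() in place of the removed mapping callback.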
@@ -820,11 +814,6 @@ class Cluster : public tt_device {
std::unordered_map<chip_id_t, std::unordered_set<tt_xy_pair>> workers_per_chip = {};
std::unordered_set<tt_xy_pair> eth_cores = {};
std::unordered_set<tt_xy_pair> dram_cores = {};
std::map<chip_id_t, std::unordered_map<int32_t, uint64_t>> tlb_config_map = {};

// Note that these maps holds only entries for local PCIe chips.
std::unordered_map<chip_id_t, std::function<std::int32_t(tt_xy_pair)>> map_core_to_tlb_per_chip = {};
std::unordered_map<chip_id_t, bool> tlbs_init_per_chip = {};

std::unordered_map<std::string, std::int32_t> dynamic_tlb_config = {};
std::unordered_map<std::string, uint64_t> dynamic_tlb_ordering_modes = {};
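For callers the migration is mechanical, as the test changes further down show: configure_tlb() now records the core-to-TLB mapping itself, so the separate registration call simply disappears. A minimal before/after sketch (chip, base_addr and get_static_tlb_index are placeholder names taken from the tests in this PR):

// Before: configure each static TLB, then hand UMD a core -> TLB-index callback.
for (tt_xy_pair core : soc_desc.workers) {
    cluster.configure_tlb(chip, core, get_static_tlb_index(core), base_addr);
}
cluster.setup_core_to_tlb_map(chip, get_static_tlb_index);  // removed by this PR

// After: configure_tlb() alone is enough; UMD remembers the core -> TLB index mapping.
for (tt_xy_pair core : soc_desc.workers) {
    cluster.configure_tlb(chip, core, get_static_tlb_index(core), base_addr);
}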
106 changes: 60 additions & 46 deletions device/cluster.cpp
@@ -153,6 +153,31 @@ bool Cluster::address_in_tlb_space(
return false;
}

bool Cluster::is_tlb_mapped(tt_cxy_pair target) {
if (map_core_to_tlb_per_chip.find(target.chip) == map_core_to_tlb_per_chip.end()) {
return false;
}

auto& map_core_to_tlb = map_core_to_tlb_per_chip.at(target.chip);
tt_xy_pair target_core = tt_xy_pair(target.x, target.y);

return map_core_to_tlb.find(target_core) != map_core_to_tlb.end();
}

bool Cluster::is_tlb_mapped(tt_cxy_pair target, uint64_t address, uint32_t size_in_bytes) {
if (!is_tlb_mapped(target)) {
return false;
}

auto* dev = get_tt_device(target.chip);

int32_t tlb_index = map_core_to_tlb_per_chip.at(target.chip).at(tt_xy_pair(target.x, target.y));
auto tlb_description = dev->get_architecture_implementation()->describe_tlb(tlb_index);

return tlb_description.has_value() &&
address_in_tlb_space(address, size_in_bytes, tlb_index, std::get<1>(tlb_description.value()), target.chip);
}

void Cluster::initialize_interprocess_mutexes(int pci_interface_id, bool cleanup_mutexes_in_shm) {
// These mutexes are intended to be based on physical devices/pci-intf not logical. Set these up ahead of time here
// (during device init) since its unsafe to modify shared state during multithreaded runtime. cleanup_mutexes_in_shm
@@ -1040,23 +1065,18 @@ tt::Writer Cluster::get_static_tlb_writer(tt_cxy_pair target) {
throw std::runtime_error(fmt::format("Target not in MMIO chip: {}", target.str()));
}

if (!tlbs_init_per_chip[target.chip] || !map_core_to_tlb_per_chip[target.chip]) {
throw std::runtime_error("TLBs not initialized");
if (!is_tlb_mapped(target)) {
throw std::runtime_error(fmt::format("TLBs not initialized for core: {}", target.str()));
}

auto* dev = get_tt_device(target.chip);

if (!dev->get_pci_device()->bar0_wc) {
throw std::runtime_error("No write-combined mapping for BAR0");
}

auto tlb_index = map_core_to_tlb_per_chip[target.chip](tt_xy_pair(target.x, target.y));
auto tlb_index = map_core_to_tlb_per_chip.at(target.chip).at(tt_xy_pair(target.x, target.y));
auto tlb_data = dev->get_architecture_implementation()->describe_tlb(tlb_index);

if (!tlb_data.has_value()) {
throw std::runtime_error(fmt::format("No TLB mapped to core {}", target.str()));
}

auto [tlb_offset, tlb_size] = tlb_data.value();
auto* base = reinterpret_cast<uint8_t*>(dev->get_pci_device()->bar0_wc);

@@ -1082,16 +1102,10 @@ void Cluster::write_device_memory(
size_in_bytes,
small_access);

std::int32_t tlb_index = 0;
std::optional<std::tuple<std::uint64_t, std::uint64_t>> tlb_data = std::nullopt;
if (tlbs_init_per_chip[target.chip]) {
tlb_index = map_core_to_tlb_per_chip[target.chip](tt_xy_pair(target.x, target.y));
tlb_data = dev->get_architecture_implementation()->describe_tlb(tlb_index);
}

if (tlb_data.has_value() &&
address_in_tlb_space(address, size_in_bytes, tlb_index, std::get<1>(tlb_data.value()), target.chip)) {
auto [tlb_offset, tlb_size] = tlb_data.value();
if (is_tlb_mapped(target, address, size_in_bytes)) {
auto tlb_description = dev->get_architecture_implementation()->describe_tlb(
map_core_to_tlb_per_chip.at(target.chip).at(tt_xy_pair(target.x, target.y)));
auto [tlb_offset, tlb_size] = tlb_description.value();
if (dev->get_pci_device()->bar4_wc != nullptr && tlb_size == BH_4GB_TLB_SIZE) {
// This is only for Blackhole. If we want to write to DRAM (BAR4 space), we add offset
// to which we write so write_block knows it needs to target BAR4
@@ -1132,20 +1146,14 @@ void Cluster::read_device_memory(
address,
size_in_bytes);
TTDevice* dev = get_tt_device(target.chip);

uint8_t* buffer_addr = static_cast<uint8_t*>(mem_ptr);

std::int32_t tlb_index = 0;
std::optional<std::tuple<std::uint64_t, std::uint64_t>> tlb_data = std::nullopt;
if (tlbs_init_per_chip[target.chip]) {
tlb_index = map_core_to_tlb_per_chip[target.chip](tt_xy_pair(target.x, target.y));
tlb_data = dev->get_architecture_implementation()->describe_tlb(tlb_index);
}
log_debug(LogSiliconDriver, " tlb_index: {}, tlb_data.has_value(): {}", tlb_index, tlb_data.has_value());

if (tlb_data.has_value() &&
address_in_tlb_space(address, size_in_bytes, tlb_index, std::get<1>(tlb_data.value()), target.chip)) {
auto [tlb_offset, tlb_size] = tlb_data.value();
if (is_tlb_mapped(target, address, size_in_bytes)) {
auto tlb_description = dev->get_architecture_implementation()->describe_tlb(
map_core_to_tlb_per_chip.at(target.chip).at(tt_xy_pair(target.x, target.y)));
auto [tlb_offset, tlb_size] = tlb_description.value();
if (dev->get_pci_device()->bar4_wc != nullptr && tlb_size == BH_4GB_TLB_SIZE) {
// This is only for Blackhole. If we want to read from DRAM (BAR4 space), we add offset
// from which we read so read_block knows it needs to target BAR4
@@ -1314,29 +1322,41 @@ Cluster::~Cluster() {
}

std::optional<std::tuple<uint32_t, uint32_t>> Cluster::get_tlb_data_from_target(const tt_cxy_pair& target) {
std::int32_t tlb_index = 0;
std::optional<std::tuple<std::uint32_t, std::uint32_t>> tlb_data;

if (tlbs_init_per_chip[target.chip]) {
tlb_index = map_core_to_tlb_per_chip[target.chip](tt_xy_pair(target.x, target.y));
auto architecture_implementation = tt::umd::architecture_implementation::create(arch_name);
tlb_data = architecture_implementation->describe_tlb(tlb_index);
if (!is_tlb_mapped(target)) {
return std::nullopt;
}
return tlb_data;

int tlb_index = map_core_to_tlb_per_chip.at(target.chip).at(tt_xy_pair(target.x, target.y));
return get_tt_device(target.chip)->get_architecture_implementation()->describe_tlb(tlb_index);
}
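
get_tlb_data_from_target() now answers straight from the per-chip map and returns std::nullopt for any core without a configured static TLB, instead of invoking the old callback. A short caller-side sketch (chip and core are placeholders):

if (auto tlb_data = cluster.get_tlb_data_from_target(tt_cxy_pair(chip, core.x, core.y))) {
    auto [tlb_offset, tlb_size] = tlb_data.value();
    // The core has a static TLB window of tlb_size bytes starting at tlb_offset.
} else {
    // No static TLB configured for this core; fall back to a dynamic TLB.
}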

void Cluster::configure_tlb(
chip_id_t logical_device_id, tt_xy_pair core, int32_t tlb_index, uint64_t address, uint64_t ordering) {
log_assert(
ordering == TLB_DATA::Strict || ordering == TLB_DATA::Posted || ordering == TLB_DATA::Relaxed,
"Invalid ordering specified in Cluster::configure_tlb");
TTDevice* tt_device = get_tt_device(logical_device_id);
tt_device->set_dynamic_tlb(tlb_index, core, address, harvested_coord_translation.at(logical_device_id), ordering);
auto tlb_size = std::get<1>(tt_device->get_architecture_implementation()->describe_tlb(tlb_index).value());
if (tlb_config_map.find(logical_device_id) == tlb_config_map.end()) {
tlb_config_map.insert({logical_device_id, {}});
map_core_to_tlb_per_chip.insert({logical_device_id, {}});
}
tlb_config_map[logical_device_id].insert({tlb_index, (address / tlb_size) * tlb_size});
log_debug(
LogSiliconDriver,
"Configuring TLB for chip: {} core: {} tlb_index: {} address: {} ordering: {}",
logical_device_id,
core.str(),
tlb_index,
address,
ordering);
log_assert(
tlb_config_map.at(logical_device_id).find(tlb_index) == tlb_config_map.at(logical_device_id).end(),
"TLB index already configured {}",
tlb_index);

TTDevice* tt_device = get_tt_device(logical_device_id);
tt_device->set_dynamic_tlb(tlb_index, core, address, harvested_coord_translation.at(logical_device_id), ordering);
auto tlb_size = std::get<1>(tt_device->get_architecture_implementation()->describe_tlb(tlb_index).value());
tlb_config_map.at(logical_device_id).insert({tlb_index, (address / tlb_size) * tlb_size});
map_core_to_tlb_per_chip.at(logical_device_id).insert({core, tlb_index});
}
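
Besides filling both maps, the rewritten configure_tlb() adds a log_assert against reusing a TLB index on the same chip, something the previous version did not check. Roughly, with illustrative values:

cluster.configure_tlb(/*logical_device_id*/ 0, tt_xy_pair(1, 1), /*tlb_index*/ 1, /*address*/ 0);
// Reusing TLB index 1 on chip 0 now trips the new "TLB index already configured"
// assert instead of being silently ignored by the old insert():
// cluster.configure_tlb(0, tt_xy_pair(2, 2), 1, 0);  // would assert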

void Cluster::set_fallback_tlb_ordering_mode(const std::string& fallback_tlb, uint64_t ordering) {
@@ -3264,12 +3284,6 @@ void Cluster::close_device() {
broadcast_tensix_risc_reset_to_cluster(TENSIX_ASSERT_SOFT_RESET);
}

void Cluster::setup_core_to_tlb_map(
const chip_id_t logical_device_id, std::function<std::int32_t(tt_xy_pair)> mapping_function) {
map_core_to_tlb_per_chip[logical_device_id] = mapping_function;
tlbs_init_per_chip[logical_device_id] = true;
}

std::uint32_t Cluster::get_num_dram_channels(std::uint32_t device_id) {
log_assert(
all_chip_ids_.find(device_id) != all_chip_ids_.end(),
2 changes: 0 additions & 2 deletions tests/api/test_chip.cpp
@@ -80,8 +80,6 @@ TEST(ApiChipTest, ManualTLBConfiguration) {
for (tt_xy_pair core : soc_desc.workers) {
umd_cluster->configure_tlb(mmio_chip, core, get_static_tlb_index(core), c_zero_address);
}

umd_cluster->setup_core_to_tlb_map(mmio_chip, get_static_tlb_index);
}

// Expect not to throw for now configured mmio chip, same one as before.
4 changes: 0 additions & 4 deletions tests/blackhole/test_cluster_bh.cpp
@@ -204,7 +204,6 @@ TEST(SiliconDriverBH, CreateDestroy) {
// }
// }
// }
// cluster.setup_core_to_tlb_map(get_static_tlb_index_callback);

// tt_device_params default_params;
// cluster.start_device(default_params);
@@ -292,7 +291,6 @@ TEST(SiliconDriverBH, UnalignedStaticTLB_RW) {
cluster.configure_tlb(
i, core, get_static_tlb_index_callback(core), l1_mem::address_map::NCRISC_FIRMWARE_BASE);
}
cluster.setup_core_to_tlb_map(i, get_static_tlb_index_callback);
}
}

@@ -349,7 +347,6 @@ TEST(SiliconDriverBH, StaticTLB_RW) {
cluster.configure_tlb(
i, core, get_static_tlb_index_callback(core), l1_mem::address_map::NCRISC_FIRMWARE_BASE);
}
cluster.setup_core_to_tlb_map(i, get_static_tlb_index_callback);
}
}

@@ -570,7 +567,6 @@ TEST(SiliconDriverBH, MultiThreadedMemBar) {
// Statically mapping a 1MB TLB to this core, starting from address DATA_BUFFER_SPACE_BASE.
cluster.configure_tlb(i, core, get_static_tlb_index_callback(core), base_addr);
}
cluster.setup_core_to_tlb_map(i, get_static_tlb_index_callback);
}

tt_device_params default_params;
11 changes: 4 additions & 7 deletions tests/grayskull/test_cluster_gs.cpp
@@ -9,9 +9,9 @@
#include "tests/test_utils/device_test_utils.hpp"
#include "tests/test_utils/generate_cluster_desc.hpp"
#include "umd/device/cluster.h"
#include "umd/device/grayskull_implementation.h"
#include "umd/device/tt_cluster_descriptor.h"
#include "umd/device/tt_soc_descriptor.h"
#include "umd/device/wormhole_implementation.h"

using namespace tt::umd;

@@ -95,7 +95,7 @@ TEST(SiliconDriverGS, CustomSocDesc) {

TEST(SiliconDriverGS, HarvestingRuntime) {
auto get_static_tlb_index = [](tt_xy_pair target) {
int flat_index = target.y * tt::umd::wormhole::GRID_SIZE_X + target.x;
int flat_index = target.y * tt::umd::grayskull::GRID_SIZE_X + target.x;
if (flat_index == 0) {
return -1;
}
@@ -114,7 +114,6 @@ TEST(SiliconDriverGS, HarvestingRuntime) {
// Statically mapping a 1MB TLB to this core, starting from address DATA_BUFFER_SPACE_BASE.
cluster.configure_tlb(i, core, get_static_tlb_index(core), l1_mem::address_map::DATA_BUFFER_SPACE_BASE);
}
cluster.setup_core_to_tlb_map(i, get_static_tlb_index);
}

tt_device_params default_params;
@@ -189,7 +188,7 @@

TEST(SiliconDriverGS, StaticTLB_RW) {
auto get_static_tlb_index = [](tt_xy_pair target) {
int flat_index = target.y * tt::umd::wormhole::GRID_SIZE_X + target.x;
int flat_index = target.y * tt::umd::grayskull::GRID_SIZE_X + target.x;
if (flat_index == 0) {
return -1;
}
@@ -207,7 +206,6 @@ TEST(SiliconDriverGS, StaticTLB_RW) {
cluster.configure_tlb(
i, core, get_static_tlb_index(core), l1_mem::address_map::DATA_BUFFER_SPACE_BASE, TLB_DATA::Posted);
}
cluster.setup_core_to_tlb_map(i, get_static_tlb_index);
}

tt_device_params default_params;
@@ -405,7 +403,7 @@ TEST(SiliconDriverGS, MultiThreadedMemBar) { // this tests takes ~5 mins to run
// Memory barrier flags get sent to address 0 for all channels in this test

auto get_static_tlb_index = [](tt_xy_pair target) {
int flat_index = target.y * tt::umd::wormhole::GRID_SIZE_X + target.x;
int flat_index = target.y * tt::umd::grayskull::GRID_SIZE_X + target.x;
if (flat_index == 0) {
return -1;
}
@@ -425,7 +423,6 @@ TEST(SiliconDriverGS, MultiThreadedMemBar) { // this tests takes ~5 mins to run
// Statically mapping a 1MB TLB to this core, starting from address DATA_BUFFER_SPACE_BASE.
cluster.configure_tlb(i, core, get_static_tlb_index(core), base_addr);
}
cluster.setup_core_to_tlb_map(i, get_static_tlb_index);
}

tt_device_params default_params;
13 changes: 4 additions & 9 deletions tests/wormhole/test_cluster_wh.cpp
@@ -172,7 +172,6 @@ TEST(SiliconDriverWH, HarvestingRuntime) {
}
}
}
cluster.setup_core_to_tlb_map(get_static_tlb_index_callback);

tt_device_params default_params;
cluster.start_device(default_params);
@@ -233,7 +232,6 @@ TEST(SiliconDriverWH, UnalignedStaticTLB_RW) {
cluster.configure_tlb(
i, core, get_static_tlb_index_callback(core), l1_mem::address_map::NCRISC_FIRMWARE_BASE);
}
cluster.setup_core_to_tlb_map(i, get_static_tlb_index_callback);
}
}

@@ -289,7 +287,6 @@ TEST(SiliconDriverWH, StaticTLB_RW) {
cluster.configure_tlb(
i, core, get_static_tlb_index_callback(core), l1_mem::address_map::NCRISC_FIRMWARE_BASE);
}
cluster.setup_core_to_tlb_map(i, get_static_tlb_index_callback);
}
}

@@ -474,7 +471,6 @@ TEST(SiliconDriverWH, MultiThreadedMemBar) {
// Statically mapping a 1MB TLB to this core, starting from address DATA_BUFFER_SPACE_BASE.
cluster.configure_tlb(i, core, get_static_tlb_index_callback(core), base_addr);
}
cluster.setup_core_to_tlb_map(i, get_static_tlb_index_callback);
}
}

@@ -947,14 +943,13 @@ TEST(SiliconDriverWH, LargeAddressTlb) {
true, // clean system resources - yes
true); // perform harvesting - yes

const auto ARC = cluster.get_soc_descriptor(0).arc_cores.at(0);
const tt_cxy_pair ARC_CORE(0, ARC.x, ARC.y);
const tt_xy_pair ARC_CORE = cluster.get_soc_descriptor(0).arc_cores.at(0);
const tt_cxy_pair ARC_CORE_CHIP(0, ARC_CORE.x, ARC_CORE.y);

set_barrier_params(cluster);
cluster.start_device(tt_device_params{});

auto get_static_tlb_index_callback = [](tt_xy_pair target) { return 0; };
cluster.setup_core_to_tlb_map(0, get_static_tlb_index_callback);

// Address of the reset unit in ARC core:
uint64_t arc_reset_noc = 0x880030000ULL;
@@ -976,10 +971,10 @@
value0 = cluster.bar_read32(0, 0x1ff30060);

// Read the scratch register via the TLB:
cluster.read_from_device(&value1, ARC_CORE, addr, sizeof(uint32_t), "LARGE_READ_TLB");
cluster.read_from_device(&value1, ARC_CORE_CHIP, addr, sizeof(uint32_t), "LARGE_READ_TLB");

// Read the scratch register via a different TLB, different code path:
cluster.read_from_device(&value2, ARC_CORE, addr, sizeof(uint32_t), "REG_TLB");
cluster.read_from_device(&value2, ARC_CORE_CHIP, addr, sizeof(uint32_t), "REG_TLB");

// Mask off lower 16 bits; FW changes these dynamically:
value0 &= 0xffff0000;