Skip to content

Commit

Permalink
clean up some comments and log messages
Browse files (browse the repository at this point in the history)
  • Loading branch information
joelsmithTT committed Dec 6, 2024
1 parent 1003a47 commit c03bac4
Show file tree
Hide file tree
Showing 2 changed files with 20 additions and 23 deletions.
22 changes: 14 additions & 8 deletions device/cluster.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -268,18 +268,24 @@ void Cluster::create_device(
!(arch_name == tt::ARCH::BLACKHOLE && num_host_mem_channels > 1),
"More channels are not yet supported for Blackhole");
// Same number of host channels per device for now
bool hugepages_initialized = m_pci_device_map.at(logical_device_id)->init_hugepage(num_host_mem_channels);
// Large writes to remote chips require hugepages to be initialized.
// Conservative assert - end workload if remote chips present but hugepages not initialized (failures caused
// if using remote only for small transactions)
if (target_remote_chips.size()) {
bool hugepages_initialized = dev->init_hugepage(num_host_mem_channels);

if (!hugepages_initialized) {
log_warning(
LogSiliconDriver,
"Hugepages not initialized for device {} (logical_device_id: {} pci_interface_id: {})",
dev->get_device_num(),
logical_device_id,
pci_interface_id);
}

// Large writes to remote chips require at least one hugepage.
bool no_hugepages = (dev->get_hugepage_mapping(0).mapping == nullptr);
if (target_remote_chips.size() && no_hugepages) {
log_assert(
hugepages_initialized,
"Hugepages must be successfully initialized if workload contains remote chips!");
}
if (not m_pci_device_map.at(logical_device_id)->get_hugepage_mapping(0).mapping) {
log_warning(LogSiliconDriver, "No hugepage mapping at device {}.", logical_device_id);
}
}
// translation layer for harvested coords. Default is identity map
harvested_coord_translation.insert({logical_device_id, create_harvested_coord_translation(arch_name, true)});
Expand Down
21 changes: 6 additions & 15 deletions device/pcie/pci_device.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -723,10 +723,7 @@ bool PCIDevice::init_hugepage(uint32_t num_host_mem_channels) {
auto physical_device_id = get_device_num();
std::string hugepage_dir = find_hugepage_dir(hugepage_size);
if (hugepage_dir.empty()) {
log_warning(
LogSiliconDriver,
"ttSiliconDevice::init_hugepage: no huge page mount found for hugepage_size: {}.",
hugepage_size);
log_warning(LogSiliconDriver, "init_hugepage: no huge page mount found for hugepage_size: {}.", hugepage_size);
return false;
}

Expand All @@ -741,7 +738,7 @@ bool PCIDevice::init_hugepage(uint32_t num_host_mem_channels) {
// Probably a permissions problem.
log_warning(
LogSiliconDriver,
"ttSiliconDevice::init_hugepage: physical_device_id: {} ch: {} creating hugepage mapping file failed.",
"init_hugepage: physical_device_id: {} ch: {} creating hugepage mapping file failed.",
physical_device_id,
ch);
success = false;
Expand All @@ -762,10 +759,10 @@ bool PCIDevice::init_hugepage(uint32_t num_host_mem_channels) {
if (mapping == MAP_FAILED) {
log_warning(
LogSiliconDriver,
"UMD: Mapping a hugepage failed. (device: {}, {}/{} errno: {}).",
"Mapping a hugepage failed. (device: {}, channel {}/{} errno: {}).",
physical_device_id,
ch,
num_host_mem_channels,
num_host_mem_channels - 1,
strerror(errno));
if (hugepage_st.st_size == 0) {
log_warning(
Expand All @@ -790,13 +787,7 @@ bool PCIDevice::init_hugepage(uint32_t num_host_mem_channels) {
auto fd = get_fd();

if (ioctl(fd, TENSTORRENT_IOCTL_PIN_PAGES, &pin_pages) == -1) {
log_warning(
LogSiliconDriver,
"---- ttSiliconDevice::init_hugepage: physical_device_id: {} ch: {} TENSTORRENT_IOCTL_PIN_PAGES failed "
"(errno: {}). Common Issue: Requires TTMKD >= 1.11, see following file contents...",
physical_device_id,
ch,
strerror(errno));
log_warning(LogSiliconDriver, "Failed to pin pages (errno: {}).", strerror(errno));
munmap(mapping, hugepage_size);
print_file_contents("/sys/module/tenstorrent/version", "(TTKMD version)");
print_file_contents("/proc/meminfo");
Expand All @@ -809,7 +800,7 @@ bool PCIDevice::init_hugepage(uint32_t num_host_mem_channels) {

log_debug(
LogSiliconDriver,
"ttSiliconDevice::init_hugepage: physical_device_id: {} ch: {} mapping_size: {} physical address 0x{:x}",
"init_hugepage: physical_device_id: {} ch: {} mapping_size: {} physical address 0x{:x}",
physical_device_id,
ch,
hugepage_size,
Expand Down

0 comments on commit c03bac4

Please sign in to comment.