Skip to content

Commit

Permalink
TTDevice init (#359)
Browse files Browse the repository at this point in the history
### Issue
Related to #98 

### Description
Initial TTDevice class. It holds PCIDevice and arch implementation.
Gradually, I'd like to move all branches on arch type from
cluster and pci_device to tt_device, so that this class becomes the only one
in the stack which offers a different implementation for each arch.
According to:
https://docs.google.com/drawings/d/1-m1azdsBqMA0A6ATYRMfkhyeuOJuGCEI62N5a96LXj0/edit

### List of the changes
- Created the TTDevice class.
- Created arch-specific TTDevice classes, which currently don't hold
much implementation.
- architecture_implementation is moved to TTDevice, and ~half of
PCIDevice is moved. PCIDevice should hold only non-arch specific code,
except getting the arch itself. There were only mild, compile related
changes in the functions moved from PCIDevice to TTDevice.
- cluster.cpp and tests changed accordingly.
- read_checking_offset moved to architecture_implementation
- Blackhole destructor specific code moved to BlackholeTTDevice from
PCIDevice.


### Testing
Existing CI tests should be enough.

### API Changes
There are no API changes in this PR.
But I scheduled post commit tests just to be sure.
- [x] All post-commit tests :
https://github.com/tenstorrent/tt-metal/actions/runs/12156329972
- [x] Blackhole post-commit tests :
https://github.com/tenstorrent/tt-metal/actions/runs/12156332357
  • Loading branch information
broskoTT authored Dec 6, 2024
1 parent a98ddd2 commit c853fdf
Show file tree
Hide file tree
Showing 18 changed files with 701 additions and 541 deletions.
4 changes: 4 additions & 0 deletions device/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,10 @@ target_sources(
simulation/tt_simulation_host.cpp
tlb.cpp
tt_cluster_descriptor.cpp
tt_device/blackhole_tt_device.cpp
tt_device/grayskull_tt_device.cpp
tt_device/tt_device.cpp
tt_device/wormhole_tt_device.cpp
tt_silicon_driver_common.cpp
tt_soc_descriptor.cpp
grayskull/grayskull_coordinate_manager.cpp
Expand Down
1 change: 1 addition & 0 deletions device/api/umd/device/architecture_implementation.h
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,7 @@ class architecture_implementation {
virtual uint32_t get_mem_large_write_tlb() const = 0;
virtual uint32_t get_static_tlb_cfg_addr() const = 0;
virtual uint32_t get_static_tlb_size() const = 0;
virtual uint32_t get_read_checking_offset() const = 0;
virtual uint32_t get_reg_tlb() const = 0;
virtual uint32_t get_tlb_base_index_16m() const = 0;
virtual uint32_t get_tensix_soft_reset_addr() const = 0;
Expand Down
4 changes: 4 additions & 0 deletions device/api/umd/device/blackhole_implementation.h
Original file line number Diff line number Diff line change
Expand Up @@ -174,6 +174,8 @@ static constexpr uint32_t TENSIX_SOFT_RESET_ADDR = 0xFFB121B0;

static constexpr uint32_t MSG_TYPE_SETUP_IATU_FOR_PEER_TO_PEER = 0x97;

// Read-checking offset for Blackhole, returned by blackhole_implementation::get_read_checking_offset().
// Named after the NOC node ID register. constexpr to match the other offset constants in this namespace.
static constexpr uint32_t BH_NOC_NODE_ID_OFFSET = 0x1FD04044;

static const size_t eth_translated_coordinate_start_x = 20;
static const size_t eth_translated_coordinate_start_y = 25;

Expand Down Expand Up @@ -265,6 +267,8 @@ class blackhole_implementation : public architecture_implementation {

uint32_t get_static_tlb_size() const override { return blackhole::STATIC_TLB_SIZE; }

// Read-checking offset for Blackhole: reports blackhole::BH_NOC_NODE_ID_OFFSET.
uint32_t get_read_checking_offset() const override { return blackhole::BH_NOC_NODE_ID_OFFSET; }

uint32_t get_reg_tlb() const override { return blackhole::REG_TLB; }

uint32_t get_tlb_base_index_16m() const override {
Expand Down
11 changes: 6 additions & 5 deletions device/api/umd/device/cluster.h
Original file line number Diff line number Diff line change
Expand Up @@ -17,9 +17,9 @@
#include "tt_silicon_driver_common.hpp"
#include "tt_soc_descriptor.h"
#include "tt_xy_pair.h"
#include "umd/device/pci_device.hpp"
#include "umd/device/tlb.h"
#include "umd/device/tt_cluster_descriptor_types.h"
#include "umd/device/tt_device/tt_device.h"
#include "umd/device/tt_io.hpp"

using TLB_DATA = tt::umd::tlb_data;
Expand Down Expand Up @@ -818,8 +818,8 @@ class Cluster : public tt_device {
virtual std::uint32_t get_host_channel_size(std::uint32_t device_id, std::uint32_t channel);
virtual std::uint32_t get_numa_node_for_pcie_device(std::uint32_t device_id);
virtual tt_version get_ethernet_fw_version() const;
// TODO: This should be accessible through public API, probably to be moved to tt_device.
PCIDevice* get_pci_device(int device_id) const;

TTDevice* get_tt_device(chip_id_t device_id) const;

// Destructor
virtual ~Cluster();
Expand Down Expand Up @@ -973,8 +973,9 @@ class Cluster : public tt_device {
std::set<chip_id_t> target_devices_in_cluster = {};
std::set<chip_id_t> target_remote_chips = {};
tt::ARCH arch_name;
std::unordered_map<chip_id_t, std::unique_ptr<PCIDevice>> m_pci_device_map; // Map of enabled pci devices
int m_num_pci_devices; // Number of pci devices in system (enabled or disabled)

// Map of enabled tt devices
std::unordered_map<chip_id_t, std::unique_ptr<TTDevice>> m_tt_device_map;
std::shared_ptr<tt_ClusterDescriptor> cluster_desc;

// remote eth transfer setup
Expand Down
4 changes: 4 additions & 0 deletions device/api/umd/device/grayskull_implementation.h
Original file line number Diff line number Diff line change
Expand Up @@ -197,6 +197,8 @@ static constexpr uint32_t ARC_CSM_MAILBOX_SIZE_OFFSET = 0x1FEF84BC;

static constexpr uint32_t TENSIX_SOFT_RESET_ADDR = 0xFFB121B0;

static constexpr uint32_t ARC_SCRATCH_6_OFFSET = 0x1FF30078;

} // namespace grayskull

class grayskull_implementation : public architecture_implementation {
Expand Down Expand Up @@ -267,6 +269,8 @@ class grayskull_implementation : public architecture_implementation {

uint32_t get_static_tlb_size() const override { return grayskull::STATIC_TLB_SIZE; }

// Read-checking offset for Grayskull: reports grayskull::ARC_SCRATCH_6_OFFSET.
uint32_t get_read_checking_offset() const override { return grayskull::ARC_SCRATCH_6_OFFSET; }

uint32_t get_reg_tlb() const override { return grayskull::REG_TLB; }

uint32_t get_tlb_base_index_16m() const override { return grayskull::TLB_BASE_INDEX_16M; }
Expand Down
85 changes: 16 additions & 69 deletions device/api/umd/device/pci_device.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -19,28 +19,10 @@
#include "umd/device/tt_cluster_descriptor_types.h"
#include "umd/device/tt_xy_pair.h"

// TODO: this is used up in cluster.cpp but that logic ought to be
// lowered into the PCIDevice class since it is specific to PCIe cards.
// See /vendor_ip/synopsys/052021/bh_pcie_ctl_gen5/export/configuration/DWC_pcie_ctl.h
static const uint64_t UNROLL_ATU_OFFSET_BAR = 0x1200;

// TODO: this is a bit of a hack... something to revisit when we formalize an
// abstraction for IO.
// BAR0 size for Blackhole, used to determine whether write block should use BAR0 or BAR4
static const uint64_t BAR0_BH_SIZE = 512 * 1024 * 1024;

constexpr unsigned int c_hang_read_value = 0xffffffffu;

namespace tt::umd {
class architecture_implementation;
struct semver_t;
} // namespace tt::umd

struct dynamic_tlb {
uint64_t bar_offset; // Offset that address is mapped to, within the PCI BAR.
uint64_t remaining_size; // Bytes remaining between bar_offset and end of the TLB.
};

// These are not necessarily hugepages if IOMMU is enabled.
struct hugepage_mapping {
void *mapping = nullptr;
Expand Down Expand Up @@ -74,7 +56,6 @@ class PCIDevice {
const tt::ARCH arch; // e.g. Grayskull, Wormhole, Blackhole
const semver_t kmd_version; // KMD version
const bool iommu_enabled; // Whether the system is protected from this device by an IOMMU
std::unique_ptr<tt::umd::architecture_implementation> architecture_implementation;

public:
/**
Expand Down Expand Up @@ -150,52 +131,6 @@ class PCIDevice {
*/
bool is_iommu_enabled() const { return iommu_enabled; }

// Note: byte_addr is (mostly but not always) offset into BAR0. This
// interface assumes the caller knows what they are doing - but it's unclear
// how to use this interface correctly without knowing details of the chip
// and its state.
// TODO: build a proper abstraction for IO. At this level, that is access
// to registers in BAR0 (although possibly the right abstraction is to add
// methods that perform specific operations as opposed to generic register
// read/write methods) and access to segments of BAR0/4 that are mapped to
// NOC endpoints. Probably worth waiting for the KMD to start owning the
// resource management aspect of these PCIe->NOC mappings (the "TLBs")
// before doing too much work here...
void write_block(uint64_t byte_addr, uint64_t num_bytes, const uint8_t *buffer_addr);
void read_block(uint64_t byte_addr, uint64_t num_bytes, uint8_t *buffer_addr);
void write_regs(uint32_t byte_addr, uint32_t word_len, const void *data);
void write_regs(volatile uint32_t *dest, const uint32_t *src, uint32_t word_len);
void read_regs(uint32_t byte_addr, uint32_t word_len, void *data);

// TLB related functions.
// TODO: These are architecture specific, and will be moved out of the class.
void write_tlb_reg(
uint32_t byte_addr, std::uint64_t value_lower, std::uint64_t value_upper, std::uint32_t tlb_cfg_reg_size);
dynamic_tlb set_dynamic_tlb(
unsigned int tlb_index,
tt_xy_pair start,
tt_xy_pair end,
std::uint64_t address,
bool multicast,
std::unordered_map<tt_xy_pair, tt_xy_pair> &harvested_coord_translation,
std::uint64_t ordering);
dynamic_tlb set_dynamic_tlb(
unsigned int tlb_index,
tt_xy_pair target,
std::uint64_t address,
std::unordered_map<tt_xy_pair, tt_xy_pair> &harvested_coord_translation,
std::uint64_t ordering = tt::umd::tlb_data::Relaxed);
dynamic_tlb set_dynamic_tlb_broadcast(
unsigned int tlb_index,
std::uint64_t address,
std::unordered_map<tt_xy_pair, tt_xy_pair> &harvested_coord_translation,
tt_xy_pair start,
tt_xy_pair end,
std::uint64_t ordering = tt::umd::tlb_data::Relaxed);

tt::umd::architecture_implementation *get_architecture_implementation() const;
void detect_hang_read(uint32_t data_read = c_hang_read_value);

// TODO: this also probably has more sense to live in the future TTDevice class.
bool init_hugepage(uint32_t num_host_mem_channels);

Expand Down Expand Up @@ -248,12 +183,24 @@ class PCIDevice {

uint32_t read_checking_offset;

private:
bool is_hardware_hung();

template <typename T>
T *get_register_address(uint32_t register_offset);
T *get_register_address(uint32_t register_offset) {
    // The offset may refer either to a register exposed directly in a BAR or to a TLB window
    // in BAR0 (BAR4 for Blackhole). This interface should be clarified.
    //
    // Pick the mapping that backs the offset, rebasing the offset where that mapping
    // does not start at offset zero.
    const bool targets_system_regs =
        system_reg_mapping != nullptr && register_offset >= system_reg_start_offset;
    const bool targets_bar0_wc =
        !targets_system_regs && bar0_wc != bar0_uc && register_offset < bar0_wc_size;

    void *window_base = nullptr;
    if (targets_system_regs) {
        window_base = system_reg_mapping;
        register_offset -= system_reg_offset_adjust;
    } else if (targets_bar0_wc) {
        // The write-combined BAR0 mapping is addressed with the offset as given.
        window_base = bar0_wc;
    } else {
        window_base = bar0_uc;
        register_offset -= bar0_uc_offset;
    }

    uint8_t *const byte_base = static_cast<uint8_t *>(window_base);
    return reinterpret_cast<T *>(byte_base + register_offset);
}

private:
// For debug purposes when various stages fails.
void print_file_contents(std::string filename, std::string hint = "");

Expand Down
17 changes: 17 additions & 0 deletions device/api/umd/device/tt_device/blackhole_tt_device.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
/*
* SPDX-FileCopyrightText: (c) 2024 Tenstorrent Inc.
*
* SPDX-License-Identifier: Apache-2.0
*/

#pragma once

#include "umd/device/tt_device/tt_device.h"

namespace tt::umd {
// Blackhole-specific TTDevice.
// Unlike the other arch-specific device classes it declares its own destructor: the
// Blackhole-specific destructor code that used to live in PCIDevice now belongs here.
class BlackholeTTDevice : public TTDevice {
public:
// Takes ownership of the given PCIDevice.
BlackholeTTDevice(std::unique_ptr<PCIDevice> pci_device);
// Blackhole-specific cleanup; defined out of line (implementation not part of this header).
~BlackholeTTDevice();
};
} // namespace tt::umd
16 changes: 16 additions & 0 deletions device/api/umd/device/tt_device/grayskull_tt_device.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
/*
* SPDX-FileCopyrightText: (c) 2024 Tenstorrent Inc.
*
* SPDX-License-Identifier: Apache-2.0
*/

#pragma once

#include "umd/device/tt_device/tt_device.h"

namespace tt::umd {
// Grayskull-specific TTDevice.
// Currently only declares a constructor; no Grayskull-specific members are declared here.
class GrayskullTTDevice : public TTDevice {
public:
// Takes ownership of the given PCIDevice.
GrayskullTTDevice(std::unique_ptr<PCIDevice> pci_device);
};
} // namespace tt::umd
108 changes: 108 additions & 0 deletions device/api/umd/device/tt_device/tt_device.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,108 @@
/*
* SPDX-FileCopyrightText: (c) 2024 Tenstorrent Inc.
*
* SPDX-License-Identifier: Apache-2.0
*/

#pragma once

#include "umd/device/architecture_implementation.h"
#include "umd/device/pci_device.hpp"

// TODO: Should be moved to blackhole_architecture_implementation.h
// See /vendor_ip/synopsys/052021/bh_pcie_ctl_gen5/export/configuration/DWC_pcie_ctl.h
static constexpr uint64_t UNROLL_ATU_OFFSET_BAR = 0x1200;

// TODO: should be removed from tt_device.h, and put into blackhole_tt_device.h
// TODO: this is a bit of a hack... something to revisit when we formalize an
// abstraction for IO.
// BAR0 size for Blackhole (512 MiB), used to determine whether write block should use BAR0 or BAR4.
// The product is computed in 64-bit so the expression stays safe if the size is ever increased.
static constexpr uint64_t BAR0_BH_SIZE = 512ULL * 1024 * 1024;

// Default value of the data_read argument of TTDevice::detect_hang_read(): all 32 bits set.
static constexpr unsigned int c_hang_read_value = 0xffffffffu;

// Result type of TTDevice::set_dynamic_tlb() / set_dynamic_tlb_broadcast().
// Members are zero-initialised by default so a default-constructed value never carries
// indeterminate data; the struct remains an aggregate, so `dynamic_tlb{offset, size}` still works.
struct dynamic_tlb {
    uint64_t bar_offset = 0;      // Offset that address is mapped to, within the PCI BAR.
    uint64_t remaining_size = 0;  // Bytes remaining between bar_offset and end of the TLB.
};

namespace tt::umd {

// Device abstraction that holds a PCIDevice together with the architecture_implementation
// for that device. Arch-specific device classes derive from it, and it is intended to
// become the single place in the stack that differs per architecture.
class TTDevice {
public:
/**
* Creates a proper TTDevice object for the given PCI device number.
* The caller receives ownership of the returned object.
*/
static std::unique_ptr<TTDevice> create(int pci_device_number);
// Takes ownership of both the PCI device and the architecture implementation.
TTDevice(std::unique_ptr<PCIDevice> pci_device, std::unique_ptr<architecture_implementation> architecture_impl);
// Virtual so that arch-specific subclasses are destroyed correctly through a TTDevice pointer.
virtual ~TTDevice() = default;

// Non-owning accessors. Presumably they expose the objects held in architecture_impl_ and
// pci_device_ (definitions are not visible in this header).
architecture_implementation *get_architecture_implementation();
PCIDevice *get_pci_device();

// Hang check for a value read from the device; data_read defaults to c_hang_read_value.
// NOTE(review): the exact reaction to a detected hang is defined out of line - confirm there.
void detect_hang_read(uint32_t data_read = c_hang_read_value);

// Note: byte_addr is (mostly but not always) offset into BAR0. This
// interface assumes the caller knows what they are doing - but it's unclear
// how to use this interface correctly without knowing details of the chip
// and its state.
// TODO: build a proper abstraction for IO. At this level, that is access
// to registers in BAR0 (although possibly the right abstraction is to add
// methods that perform specific operations as opposed to generic register
// read/write methods) and access to segments of BAR0/4 that are mapped to
// NOC endpoints. Probably worth waiting for the KMD to start owning the
// resource management aspect of these PCIe->NOC mappings (the "TLBs")
// before doing too much work here...
void write_block(uint64_t byte_addr, uint64_t num_bytes, const uint8_t *buffer_addr);
void read_block(uint64_t byte_addr, uint64_t num_bytes, uint8_t *buffer_addr);
// Register accessors; lengths are given in 32-bit words (word_len), not bytes.
void write_regs(volatile uint32_t *dest, const uint32_t *src, uint32_t word_len);
void write_regs(uint32_t byte_addr, uint32_t word_len, const void *data);
void read_regs(uint32_t byte_addr, uint32_t word_len, void *data);

// TLB related functions.
// TODO: These are architecture specific, and will be moved out of the class.
void write_tlb_reg(
uint32_t byte_addr, std::uint64_t value_lower, std::uint64_t value_upper, std::uint32_t tlb_cfg_reg_size);
// Full form: start/end core pair plus an explicit multicast flag and ordering.
dynamic_tlb set_dynamic_tlb(
unsigned int tlb_index,
tt_xy_pair start,
tt_xy_pair end,
std::uint64_t address,
bool multicast,
std::unordered_map<tt_xy_pair, tt_xy_pair> &harvested_coord_translation,
std::uint64_t ordering);
// Single-target form; ordering defaults to tt::umd::tlb_data::Relaxed.
dynamic_tlb set_dynamic_tlb(
unsigned int tlb_index,
tt_xy_pair target,
std::uint64_t address,
std::unordered_map<tt_xy_pair, tt_xy_pair> &harvested_coord_translation,
std::uint64_t ordering = tt::umd::tlb_data::Relaxed);
// Broadcast form over the start..end range; ordering defaults to tt::umd::tlb_data::Relaxed.
dynamic_tlb set_dynamic_tlb_broadcast(
unsigned int tlb_index,
std::uint64_t address,
std::unordered_map<tt_xy_pair, tt_xy_pair> &harvested_coord_translation,
tt_xy_pair start,
tt_xy_pair end,
std::uint64_t ordering = tt::umd::tlb_data::Relaxed);

protected:
// Owned state. Members are destroyed in reverse declaration order, so pci_device_ is
// released before architecture_impl_.
std::unique_ptr<architecture_implementation> architecture_impl_;
std::unique_ptr<PCIDevice> pci_device_;
tt::ARCH arch;

// NOTE(review): presumably used by detect_hang_read() - confirm in the implementation.
bool is_hardware_hung();

// Declared only; the definition must be visible wherever the template is instantiated.
template <typename T>
T *get_register_address(uint32_t register_offset);

// Custom device memcpy. This is only safe for memory-like regions on the device (Tensix L1, DRAM, ARC CSM).
// Both routines assume that misaligned accesses are permitted on host memory.
//
// 1. AARCH64 device memory does not allow unaligned accesses (including pair loads/stores),
// which glibc's memcpy may perform when unrolling. This affects from and to device.
// 2. syseng#3487 WH GDDR5 controller has a bug when 1-byte writes are temporarily adjacent
// to 2-byte writes. We avoid ever performing a 1-byte write to the device. This only affects to device.
void memcpy_to_device(void *dest, const void *src, std::size_t num_bytes);
void memcpy_from_device(void *dest, const void *src, std::size_t num_bytes);
};
} // namespace tt::umd
16 changes: 16 additions & 0 deletions device/api/umd/device/tt_device/wormhole_tt_device.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
/*
* SPDX-FileCopyrightText: (c) 2024 Tenstorrent Inc.
*
* SPDX-License-Identifier: Apache-2.0
*/

#pragma once

#include "umd/device/tt_device/tt_device.h"

namespace tt::umd {
// Wormhole-specific TTDevice.
// Currently only declares a constructor; no Wormhole-specific members are declared here.
class WormholeTTDevice : public TTDevice {
public:
// Takes ownership of the given PCIDevice.
WormholeTTDevice(std::unique_ptr<PCIDevice> pci_device);
};
} // namespace tt::umd
4 changes: 4 additions & 0 deletions device/api/umd/device/wormhole_implementation.h
Original file line number Diff line number Diff line change
Expand Up @@ -230,6 +230,8 @@ static constexpr uint32_t ARC_CSM_MAILBOX_SIZE_OFFSET = 0x1FEF84C4;

static constexpr uint32_t TENSIX_SOFT_RESET_ADDR = 0xFFB121B0;

static constexpr uint32_t ARC_SCRATCH_6_OFFSET = 0x1FF30078;

static const size_t tensix_translated_coordinate_start_x = 18;
static const size_t tensix_translated_coordinate_start_y = 18;

Expand Down Expand Up @@ -304,6 +306,8 @@ class wormhole_implementation : public architecture_implementation {

uint32_t get_static_tlb_cfg_addr() const override { return wormhole::STATIC_TLB_CFG_ADDR; }

// Read-checking offset for Wormhole: reports wormhole::ARC_SCRATCH_6_OFFSET.
uint32_t get_read_checking_offset() const override { return wormhole::ARC_SCRATCH_6_OFFSET; }

uint32_t get_static_tlb_size() const override { return wormhole::STATIC_TLB_SIZE; }

uint32_t get_reg_tlb() const override { return wormhole::REG_TLB; }
Expand Down
Loading

0 comments on commit c853fdf

Please sign in to comment.