Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

TTDevice init #359

Merged
merged 2 commits into from
Dec 6, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions device/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,10 @@ target_sources(
simulation/tt_simulation_host.cpp
tlb.cpp
tt_cluster_descriptor.cpp
tt_device/blackhole_tt_device.cpp
tt_device/grayskull_tt_device.cpp
tt_device/tt_device.cpp
tt_device/wormhole_tt_device.cpp
tt_silicon_driver_common.cpp
tt_soc_descriptor.cpp
grayskull/grayskull_coordinate_manager.cpp
Expand Down
1 change: 1 addition & 0 deletions device/api/umd/device/architecture_implementation.h
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,7 @@ class architecture_implementation {
virtual uint32_t get_mem_large_write_tlb() const = 0;
virtual uint32_t get_static_tlb_cfg_addr() const = 0;
virtual uint32_t get_static_tlb_size() const = 0;
// Returns the architecture-specific register offset used for read checking.
// NOTE(review): presumably the register that is re-read to tell a hung device apart from a
// legitimate all-ones read — confirm against the hang-detection code.
virtual uint32_t get_read_checking_offset() const = 0;
virtual uint32_t get_reg_tlb() const = 0;
virtual uint32_t get_tlb_base_index_16m() const = 0;
virtual uint32_t get_tensix_soft_reset_addr() const = 0;
Expand Down
4 changes: 4 additions & 0 deletions device/api/umd/device/blackhole_implementation.h
Original file line number Diff line number Diff line change
Expand Up @@ -174,6 +174,8 @@ static constexpr uint32_t TENSIX_SOFT_RESET_ADDR = 0xFFB121B0;

static constexpr uint32_t MSG_TYPE_SETUP_IATU_FOR_PEER_TO_PEER = 0x97;

// Offset returned by blackhole_implementation::get_read_checking_offset().
// Declared constexpr to match the other register-offset constants in this namespace.
static constexpr uint32_t BH_NOC_NODE_ID_OFFSET = 0x1FD04044;

static const size_t eth_translated_coordinate_start_x = 20;
static const size_t eth_translated_coordinate_start_y = 25;

Expand Down Expand Up @@ -265,6 +267,8 @@ class blackhole_implementation : public architecture_implementation {

uint32_t get_static_tlb_size() const override { return blackhole::STATIC_TLB_SIZE; }

// Blackhole reports BH_NOC_NODE_ID_OFFSET as its read-checking offset.
uint32_t get_read_checking_offset() const override { return blackhole::BH_NOC_NODE_ID_OFFSET; }

uint32_t get_reg_tlb() const override { return blackhole::REG_TLB; }

uint32_t get_tlb_base_index_16m() const override {
Expand Down
11 changes: 6 additions & 5 deletions device/api/umd/device/cluster.h
Original file line number Diff line number Diff line change
Expand Up @@ -17,9 +17,9 @@
#include "tt_silicon_driver_common.hpp"
#include "tt_soc_descriptor.h"
#include "tt_xy_pair.h"
#include "umd/device/pci_device.hpp"
#include "umd/device/tlb.h"
#include "umd/device/tt_cluster_descriptor_types.h"
#include "umd/device/tt_device/tt_device.h"
#include "umd/device/tt_io.hpp"

using TLB_DATA = tt::umd::tlb_data;
Expand Down Expand Up @@ -818,8 +818,8 @@ class Cluster : public tt_device {
virtual std::uint32_t get_host_channel_size(std::uint32_t device_id, std::uint32_t channel);
virtual std::uint32_t get_numa_node_for_pcie_device(std::uint32_t device_id);
virtual tt_version get_ethernet_fw_version() const;
// TODO: This should be accessible through public API, probably to be moved to tt_device.
PCIDevice* get_pci_device(int device_id) const;

// Returns the TTDevice associated with device_id.
// NOTE(review): presumably a non-owning pointer into m_tt_device_map — confirm in cluster.cpp.
TTDevice* get_tt_device(chip_id_t device_id) const;

// Destructor
virtual ~Cluster();
Expand Down Expand Up @@ -973,8 +973,9 @@ class Cluster : public tt_device {
std::set<chip_id_t> target_devices_in_cluster = {};
std::set<chip_id_t> target_remote_chips = {};
tt::ARCH arch_name;
std::unordered_map<chip_id_t, std::unique_ptr<PCIDevice>> m_pci_device_map; // Map of enabled pci devices
int m_num_pci_devices; // Number of pci devices in system (enabled or disabled)

// Map of enabled tt devices
std::unordered_map<chip_id_t, std::unique_ptr<TTDevice>> m_tt_device_map;
std::shared_ptr<tt_ClusterDescriptor> cluster_desc;

// remote eth transfer setup
Expand Down
4 changes: 4 additions & 0 deletions device/api/umd/device/grayskull_implementation.h
Original file line number Diff line number Diff line change
Expand Up @@ -197,6 +197,8 @@ static constexpr uint32_t ARC_CSM_MAILBOX_SIZE_OFFSET = 0x1FEF84BC;

static constexpr uint32_t TENSIX_SOFT_RESET_ADDR = 0xFFB121B0;

static constexpr uint32_t ARC_SCRATCH_6_OFFSET = 0x1FF30078;

} // namespace grayskull

class grayskull_implementation : public architecture_implementation {
Expand Down Expand Up @@ -267,6 +269,8 @@ class grayskull_implementation : public architecture_implementation {

uint32_t get_static_tlb_size() const override { return grayskull::STATIC_TLB_SIZE; }

// Grayskull reports ARC_SCRATCH_6_OFFSET as its read-checking offset.
uint32_t get_read_checking_offset() const override { return grayskull::ARC_SCRATCH_6_OFFSET; }

uint32_t get_reg_tlb() const override { return grayskull::REG_TLB; }

uint32_t get_tlb_base_index_16m() const override { return grayskull::TLB_BASE_INDEX_16M; }
Expand Down
85 changes: 16 additions & 69 deletions device/api/umd/device/pci_device.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -19,28 +19,10 @@
#include "umd/device/tt_cluster_descriptor_types.h"
#include "umd/device/tt_xy_pair.h"

// TODO: this is used up in cluster.cpp but that logic ought to be
// lowered into the PCIDevice class since it is specific to PCIe cards.
// See /vendor_ip/synopsys/052021/bh_pcie_ctl_gen5/export/configuration/DWC_pcie_ctl.h
static const uint64_t UNROLL_ATU_OFFSET_BAR = 0x1200;

// TODO: this is a bit of a hack... something to revisit when we formalize an
// abstraction for IO.
// BAR0 size for Blackhole, used to determine whether write block should use BAR0 or BAR4
static const uint64_t BAR0_BH_SIZE = 512 * 1024 * 1024;

constexpr unsigned int c_hang_read_value = 0xffffffffu;

namespace tt::umd {
class architecture_implementation;
struct semver_t;
} // namespace tt::umd

struct dynamic_tlb {
uint64_t bar_offset; // Offset that address is mapped to, within the PCI BAR.
uint64_t remaining_size; // Bytes remaining between bar_offset and end of the TLB.
};

// These are not necessarily hugepages if IOMMU is enabled.
struct hugepage_mapping {
void *mapping = nullptr;
Expand Down Expand Up @@ -74,7 +56,6 @@ class PCIDevice {
const tt::ARCH arch; // e.g. Grayskull, Wormhole, Blackhole
const semver_t kmd_version; // KMD version
const bool iommu_enabled; // Whether the system is protected from this device by an IOMMU
std::unique_ptr<tt::umd::architecture_implementation> architecture_implementation;

public:
/**
Expand Down Expand Up @@ -150,52 +131,6 @@ class PCIDevice {
*/
bool is_iommu_enabled() const { return iommu_enabled; }

// Note: byte_addr is (mostly but not always) offset into BAR0. This
// interface assumes the caller knows what they are doing - but it's unclear
// how to use this interface correctly without knowing details of the chip
// and its state.
// TODO: build a proper abstraction for IO. At this level, that is access
// to registers in BAR0 (although possibly the right abstraction is to add
// methods that perform specific operations as opposed to generic register
// read/write methods) and access to segments of BAR0/4 that are mapped to
// NOC endpoints. Probably worth waiting for the KMD to start owning the
// resource management aspect of these PCIe->NOC mappings (the "TLBs")
// before doing too much work here...
void write_block(uint64_t byte_addr, uint64_t num_bytes, const uint8_t *buffer_addr);
void read_block(uint64_t byte_addr, uint64_t num_bytes, uint8_t *buffer_addr);
void write_regs(uint32_t byte_addr, uint32_t word_len, const void *data);
void write_regs(volatile uint32_t *dest, const uint32_t *src, uint32_t word_len);
void read_regs(uint32_t byte_addr, uint32_t word_len, void *data);

// TLB related functions.
// TODO: These are architecture specific, and will be moved out of the class.
void write_tlb_reg(
uint32_t byte_addr, std::uint64_t value_lower, std::uint64_t value_upper, std::uint32_t tlb_cfg_reg_size);
dynamic_tlb set_dynamic_tlb(
unsigned int tlb_index,
tt_xy_pair start,
tt_xy_pair end,
std::uint64_t address,
bool multicast,
std::unordered_map<tt_xy_pair, tt_xy_pair> &harvested_coord_translation,
std::uint64_t ordering);
dynamic_tlb set_dynamic_tlb(
unsigned int tlb_index,
tt_xy_pair target,
std::uint64_t address,
std::unordered_map<tt_xy_pair, tt_xy_pair> &harvested_coord_translation,
std::uint64_t ordering = tt::umd::tlb_data::Relaxed);
dynamic_tlb set_dynamic_tlb_broadcast(
unsigned int tlb_index,
std::uint64_t address,
std::unordered_map<tt_xy_pair, tt_xy_pair> &harvested_coord_translation,
tt_xy_pair start,
tt_xy_pair end,
std::uint64_t ordering = tt::umd::tlb_data::Relaxed);

tt::umd::architecture_implementation *get_architecture_implementation() const;
void detect_hang_read(uint32_t data_read = c_hang_read_value);

// TODO: this also probably has more sense to live in the future TTDevice class.
bool init_hugepage(uint32_t num_host_mem_channels);

Expand Down Expand Up @@ -248,12 +183,24 @@ class PCIDevice {

uint32_t read_checking_offset;

private:
bool is_hardware_hung();

template <typename T>
T *get_register_address(uint32_t register_offset);
T *get_register_address(uint32_t register_offset) {
    // The offset may name a register exposed directly in a BAR or a TLB window in BAR0
    // (BAR4 for Blackhole). Pick the mapping that backs it and rebase the offset where
    // that mapping does not start at offset zero. This interface should be clarified.
    void *window = bar0_uc;
    if (system_reg_mapping != nullptr && register_offset >= system_reg_start_offset) {
        // System register range: served by its own mapping.
        window = system_reg_mapping;
        register_offset -= system_reg_offset_adjust;
    } else if (bar0_wc != bar0_uc && register_offset < bar0_wc_size) {
        // Falls inside the separate write-combined mapping; the offset is used as-is.
        window = bar0_wc;
    } else {
        // Everything else goes through the uncached mapping, which is offset within the BAR.
        register_offset -= bar0_uc_offset;
    }

    auto *const base = static_cast<uint8_t *>(window);
    return reinterpret_cast<T *>(base + register_offset);
}

private:
// For debug purposes when various stages fails.
void print_file_contents(std::string filename, std::string hint = "");

Expand Down
17 changes: 17 additions & 0 deletions device/api/umd/device/tt_device/blackhole_tt_device.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
/*
* SPDX-FileCopyrightText: (c) 2024 Tenstorrent Inc.
*
* SPDX-License-Identifier: Apache-2.0
*/

#pragma once

#include "umd/device/tt_device/tt_device.h"

namespace tt::umd {
// TTDevice specialization for Blackhole.
class BlackholeTTDevice : public TTDevice {
public:
    // Takes ownership of pci_device. Marked explicit so a std::unique_ptr<PCIDevice> is never
    // implicitly converted into a device object.
    explicit BlackholeTTDevice(std::unique_ptr<PCIDevice> pci_device);

    // Defined out of line.
    // NOTE(review): presumably performs Blackhole-specific teardown — confirm in blackhole_tt_device.cpp.
    ~BlackholeTTDevice();
};
} // namespace tt::umd
16 changes: 16 additions & 0 deletions device/api/umd/device/tt_device/grayskull_tt_device.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
/*
* SPDX-FileCopyrightText: (c) 2024 Tenstorrent Inc.
*
* SPDX-License-Identifier: Apache-2.0
*/

#pragma once

#include "umd/device/tt_device/tt_device.h"

namespace tt::umd {
// TTDevice specialization for Grayskull.
class GrayskullTTDevice : public TTDevice {
public:
    // Takes ownership of pci_device. Marked explicit so a std::unique_ptr<PCIDevice> is never
    // implicitly converted into a device object.
    explicit GrayskullTTDevice(std::unique_ptr<PCIDevice> pci_device);
};
} // namespace tt::umd
108 changes: 108 additions & 0 deletions device/api/umd/device/tt_device/tt_device.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,108 @@
/*
* SPDX-FileCopyrightText: (c) 2024 Tenstorrent Inc.
*
* SPDX-License-Identifier: Apache-2.0
*/

#pragma once

#include "umd/device/architecture_implementation.h"
#include "umd/device/pci_device.hpp"

// TODO: Should be moved to blackhole_architecture_implementation.h
// See /vendor_ip/synopsys/052021/bh_pcie_ctl_gen5/export/configuration/DWC_pcie_ctl.h
// Compile-time constant, so it is declared constexpr like c_hang_read_value in this header.
static constexpr uint64_t UNROLL_ATU_OFFSET_BAR = 0x1200;

// TODO: should be removed from tt_device.h, and put into blackhole_tt_device.h
// TODO: this is a bit of a hack... something to revisit when we formalize an
// abstraction for IO.
// BAR0 size for Blackhole (512 MiB), used to determine whether write block should use BAR0 or BAR4.
// The product is computed in 64-bit arithmetic so the expression stays safe if the size ever grows.
static constexpr uint64_t BAR0_BH_SIZE = 512ULL * 1024 * 1024;

// All-ones 32-bit value; used as the default data_read argument of TTDevice::detect_hang_read().
constexpr unsigned int c_hang_read_value = 0xffffffffu;

// Result type of the set_dynamic_tlb* functions: where the requested address ended up in the BAR.
// Members are zero-initialized so a default-constructed value never carries indeterminate data;
// brace initialization with two values keeps working as before.
struct dynamic_tlb {
    uint64_t bar_offset = 0;      // Offset that address is mapped to, within the PCI BAR.
    uint64_t remaining_size = 0;  // Bytes remaining between bar_offset and end of the TLB.
};

namespace tt::umd {

// Device object that owns a PCIDevice together with the architecture_implementation that
// describes it (both are held as std::unique_ptr members). BlackholeTTDevice, GrayskullTTDevice
// and WormholeTTDevice derive from this class.
class TTDevice {
public:
/**
* Creates a proper TTDevice object for the given PCI device number.
* NOTE(review): presumably selects the Blackhole/Grayskull/Wormhole subclass from the detected
* architecture — confirm in tt_device.cpp.
*/
static std::unique_ptr<TTDevice> create(int pci_device_number);
// Takes ownership of both the PCI device and the architecture implementation.
TTDevice(std::unique_ptr<PCIDevice> pci_device, std::unique_ptr<architecture_implementation> architecture_impl);
// Virtual so that derived devices can be destroyed through a TTDevice pointer.
virtual ~TTDevice() = default;

// Raw-pointer accessors.
// NOTE(review): presumably non-owning views of architecture_impl_ / pci_device_ — confirm.
architecture_implementation *get_architecture_implementation();
PCIDevice *get_pci_device();

// data_read defaults to c_hang_read_value (0xffffffff).
// NOTE(review): presumably reports an error when the value read indicates a hung device; the
// behavior is defined in tt_device.cpp — confirm.
void detect_hang_read(uint32_t data_read = c_hang_read_value);

// Note: byte_addr is (mostly but not always) offset into BAR0. This
// interface assumes the caller knows what they are doing - but it's unclear
// how to use this interface correctly without knowing details of the chip
// and its state.
// TODO: build a proper abstraction for IO. At this level, that is access
// to registers in BAR0 (although possibly the right abstraction is to add
// methods that perform specific operations as opposed to generic register
// read/write methods) and access to segments of BAR0/4 that are mapped to
// NOC endpoints. Probably worth waiting for the KMD to start owning the
// resource management aspect of these PCIe->NOC mappings (the "TLBs")
// before doing too much work here...
// Block transfers: num_bytes bytes between buffer_addr (host) and byte_addr (device).
void write_block(uint64_t byte_addr, uint64_t num_bytes, const uint8_t *buffer_addr);
void read_block(uint64_t byte_addr, uint64_t num_bytes, uint8_t *buffer_addr);
// Register transfers: lengths are given in 32-bit words (word_len), not bytes.
// The first overload writes to an already-resolved destination pointer, the second to a byte address.
void write_regs(volatile uint32_t *dest, const uint32_t *src, uint32_t word_len);
void write_regs(uint32_t byte_addr, uint32_t word_len, const void *data);
void read_regs(uint32_t byte_addr, uint32_t word_len, void *data);

// TLB related functions.
// TODO: These are architecture specific, and will be moved out of the class.
void write_tlb_reg(
uint32_t byte_addr, std::uint64_t value_lower, std::uint64_t value_upper, std::uint32_t tlb_cfg_reg_size);
// General form: takes a start/end coordinate pair and an explicit multicast flag; ordering has no default.
dynamic_tlb set_dynamic_tlb(
unsigned int tlb_index,
tt_xy_pair start,
tt_xy_pair end,
std::uint64_t address,
bool multicast,
std::unordered_map<tt_xy_pair, tt_xy_pair> &harvested_coord_translation,
std::uint64_t ordering);
// Single-target form; ordering defaults to tlb_data::Relaxed.
dynamic_tlb set_dynamic_tlb(
unsigned int tlb_index,
tt_xy_pair target,
std::uint64_t address,
std::unordered_map<tt_xy_pair, tt_xy_pair> &harvested_coord_translation,
std::uint64_t ordering = tt::umd::tlb_data::Relaxed);
// Broadcast form over the start..end coordinate range; ordering defaults to tlb_data::Relaxed.
dynamic_tlb set_dynamic_tlb_broadcast(
unsigned int tlb_index,
std::uint64_t address,
std::unordered_map<tt_xy_pair, tt_xy_pair> &harvested_coord_translation,
tt_xy_pair start,
tt_xy_pair end,
std::uint64_t ordering = tt::umd::tlb_data::Relaxed);

protected:
// Owned collaborators. Declaration order fixes construction order (and the reverse order for
// destruction), so do not reorder these members casually.
std::unique_ptr<architecture_implementation> architecture_impl_;
std::unique_ptr<PCIDevice> pci_device_;
// NOTE(review): presumably the architecture of pci_device_, cached at construction — confirm.
tt::ARCH arch;

// NOTE(review): presumably the check behind detect_hang_read() — confirm in tt_device.cpp.
bool is_hardware_hung();

// Declared here without a body; the definition is not part of this header, so it has to be
// visible wherever the template is instantiated.
template <typename T>
T *get_register_address(uint32_t register_offset);

// Custom device memcpy. This is only safe for memory-like regions on the device (Tensix L1, DRAM, ARC CSM).
// Both routines assume that misaligned accesses are permitted on host memory.
//
// 1. AARCH64 device memory does not allow unaligned accesses (including pair loads/stores),
// which glibc's memcpy may perform when unrolling. This affects from and to device.
// 2. syseng#3487 WH GDDR5 controller has a bug when 1-byte writes are temporarily adjacent
// to 2-byte writes. We avoid ever performing a 1-byte write to the device. This only affects to device.
void memcpy_to_device(void *dest, const void *src, std::size_t num_bytes);
void memcpy_from_device(void *dest, const void *src, std::size_t num_bytes);
};
} // namespace tt::umd
16 changes: 16 additions & 0 deletions device/api/umd/device/tt_device/wormhole_tt_device.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
/*
* SPDX-FileCopyrightText: (c) 2024 Tenstorrent Inc.
*
* SPDX-License-Identifier: Apache-2.0
*/

#pragma once

#include "umd/device/tt_device/tt_device.h"

namespace tt::umd {
// TTDevice specialization for Wormhole.
class WormholeTTDevice : public TTDevice {
public:
    // Takes ownership of pci_device. Marked explicit so a std::unique_ptr<PCIDevice> is never
    // implicitly converted into a device object.
    explicit WormholeTTDevice(std::unique_ptr<PCIDevice> pci_device);
};
} // namespace tt::umd
4 changes: 4 additions & 0 deletions device/api/umd/device/wormhole_implementation.h
Original file line number Diff line number Diff line change
Expand Up @@ -230,6 +230,8 @@ static constexpr uint32_t ARC_CSM_MAILBOX_SIZE_OFFSET = 0x1FEF84C4;

static constexpr uint32_t TENSIX_SOFT_RESET_ADDR = 0xFFB121B0;

static constexpr uint32_t ARC_SCRATCH_6_OFFSET = 0x1FF30078;

static const size_t tensix_translated_coordinate_start_x = 18;
static const size_t tensix_translated_coordinate_start_y = 18;

Expand Down Expand Up @@ -304,6 +306,8 @@ class wormhole_implementation : public architecture_implementation {

uint32_t get_static_tlb_cfg_addr() const override { return wormhole::STATIC_TLB_CFG_ADDR; }

// Wormhole reports ARC_SCRATCH_6_OFFSET as its read-checking offset.
uint32_t get_read_checking_offset() const override { return wormhole::ARC_SCRATCH_6_OFFSET; }

uint32_t get_static_tlb_size() const override { return wormhole::STATIC_TLB_SIZE; }

uint32_t get_reg_tlb() const override { return wormhole::REG_TLB; }
Expand Down
Loading
Loading