Skip to content

Commit

Permalink
Remove MultiDeviceGlobalCircularBuffer
Browse files Browse the repository at this point in the history
  - TT-Metal already supports Global CBs backed by MeshBuffer
  - Expose this concept to TTNN, and remove the concept of the
    MultiDeviceGlobalCircularBuffer
  - create_global_circular_buffer(mesh_device...) is now hooked
    up to a GlobalCircularBuffer
  - Add T3K prefetcher test using Global CBs to track regressions
  • Loading branch information
tt-asaigal committed Mar 9, 2025
1 parent 6da7594 commit 2833697
Show file tree
Hide file tree
Showing 15 changed files with 73 additions and 88 deletions.
1 change: 1 addition & 0 deletions tests/scripts/t3000/run_t3000_unit_tests.sh
Original file line number Diff line number Diff line change
Expand Up @@ -83,6 +83,7 @@ run_t3000_ttnn_tests() {
./build/test/ttnn/unit_tests_ttnn_ccl
WH_ARCH_YAML=wormhole_b0_80_arch_eth_dispatch.yaml pytest tests/ttnn/unit_tests/test_multi_device_trace.py ; fail+=$?
WH_ARCH_YAML=wormhole_b0_80_arch_eth_dispatch.yaml pytest tests/ttnn/unit_tests/test_multi_device_events.py ; fail+=$?
WH_ARCH_YAML=wormhole_b0_80_arch_eth_dispatch.yaml pytest tests/ttnn/unit_tests/operations/test_prefetcher.py::test_run_prefetcher_post_commit_multi_device ; fail+=$?
pytest -n auto tests/ttnn/unit_tests/test_multi_device.py ; fail+=$?
pytest -n auto tests/ttnn/unit_tests/test_multi_device_async.py ; fail+=$?
pytest tests/ttnn/distributed/test_tensor_parallel_example_T3000.py ; fail+=$?
Expand Down
54 changes: 54 additions & 0 deletions tests/ttnn/unit_tests/operations/test_prefetcher.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,3 +56,57 @@ def test_run_prefetcher_post_commit(
dtypes,
is_functional_test=True,
)


# This is not tested on main, but we need to test Global CB functionality through
# TT-Mesh.
# Test DRAM Prefetcher x Matmul on T3K. Can be removed once we bringup support on Galaxy
# since we have tests for that system.
@pytest.mark.skipif(is_grayskull(), reason="GS not supported")
@pytest.mark.parametrize(
"num_reader_cores, num_tensors, input_shapes, dtypes, num_layers",
[
(2, 2, [(256, 512), (256, 512)], [ttnn.bfloat4_b] * 2, 5),
(2, 2, [(1024, 256), (1024, 256)], [ttnn.bfloat4_b] * 2, 5),
(2, 2, [(128, 128), (128, 128)], [ttnn.bfloat4_b] * 2, 2),
(2, 2, [(256, 1024), (256, 2048)], [ttnn.bfloat4_b, ttnn.bfloat8_b], 1),
(2, 3, [(256, 1024), (256, 2048), (512, 256)], [ttnn.bfloat4_b, ttnn.bfloat8_b, ttnn.bfloat4_b], 5),
(2, 2, [(256, 1024), (128, 128)], [ttnn.bfloat4_b, ttnn.bfloat8_b], 5),
(2, 3, [(256, 1024), (128, 128), (1024, 256)], [ttnn.bfloat4_b, ttnn.bfloat8_b, ttnn.bfloat4_b], 5),
# Padding check
(
2,
3,
[(256 + 32, 512 + 224), (128, 128 + 64), (512 + 256, 224)],
[ttnn.bfloat4_b, ttnn.bfloat8_b, ttnn.bfloat4_b],
5,
),
],
)
@pytest.mark.parametrize(
"device_params",
[{"trace_region_size": 23887872}],
indirect=True,
)
def test_run_prefetcher_post_commit_multi_device(
mesh_device,
num_tensors,
input_shapes,
num_layers,
num_reader_cores,
dtypes,
use_program_cache,
function_level_defaults,
):
if mesh_device.get_num_devices() <= 1:
pytest.skip("This test requires multiple devices")

run_prefetcher_mm(
mesh_device,
num_tensors,
input_shapes,
num_layers,
num_reader_cores,
dtypes,
is_functional_test=True,
)
1 change: 0 additions & 1 deletion ttnn/cpp/pybind11/global_circular_buffer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,6 @@ namespace ttnn::global_circular_buffer {

void py_module_types(py::module& module) {
py::class_<GlobalCircularBuffer, std::shared_ptr<GlobalCircularBuffer>>(module, "global_circular_buffer");
py::class_<MultiDeviceGlobalCircularBuffer>(module, "multi_device_global_circular_buffer");
}

void py_module(py::module& module) {
Expand Down
22 changes: 4 additions & 18 deletions ttnn/cpp/ttnn/global_circular_buffer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -10,13 +10,6 @@

namespace ttnn::global_circular_buffer {

MultiDeviceGlobalCircularBuffer::MultiDeviceGlobalCircularBuffer(MeshDevice* mesh_device) {
TT_ASSERT(
mesh_device != nullptr,
"Must provide a valid mesh_device when initializing a global circular buffer on multiple devices.");
this->global_circular_buffers.reserve(mesh_device->num_devices());
}

GlobalCircularBuffer create_global_circular_buffer(
IDevice* device,
const std::vector<std::pair<CoreCoord, CoreRangeSet>>& sender_receiver_core_mapping,
Expand All @@ -26,20 +19,13 @@ GlobalCircularBuffer create_global_circular_buffer(
device, sender_receiver_core_mapping, size, buffer_type);
}

MultiDeviceGlobalCircularBuffer create_global_circular_buffer(
MeshDevice* mesh_device,
GlobalCircularBuffer create_global_circular_buffer(
MeshDevice* device,
const std::vector<std::pair<CoreCoord, CoreRangeSet>>& sender_receiver_core_mapping,
uint32_t size,
BufferType buffer_type) {
MultiDeviceGlobalCircularBuffer multi_device_global_cb(mesh_device);
auto& global_circular_buffers = multi_device_global_cb.global_circular_buffers;
const auto& devices = mesh_device->get_devices();
for (uint32_t i = 0; i < devices.size(); ++i) {
auto* device = devices[i];
global_circular_buffers.push_back(
create_global_circular_buffer(device, sender_receiver_core_mapping, size, buffer_type));
}
return multi_device_global_cb;
return tt::tt_metal::v1::experimental::CreateGlobalCircularBuffer(
device, sender_receiver_core_mapping, size, buffer_type);
}

} // namespace ttnn::global_circular_buffer
10 changes: 1 addition & 9 deletions ttnn/cpp/ttnn/global_circular_buffer.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -10,14 +10,6 @@

namespace ttnn::global_circular_buffer {

struct MultiDeviceGlobalCircularBuffer {
MultiDeviceGlobalCircularBuffer(MeshDevice* mesh_device);
std::vector<GlobalCircularBuffer> global_circular_buffers;

static constexpr auto attribute_names = std::forward_as_tuple("global_circular_buffers");
const auto attribute_values() const { return std::forward_as_tuple(this->global_circular_buffers); }
};

// Single Device APIs
GlobalCircularBuffer create_global_circular_buffer(
IDevice* device,
Expand All @@ -26,7 +18,7 @@ GlobalCircularBuffer create_global_circular_buffer(
BufferType buffer_type = BufferType::L1);

// Multi Device APIs
MultiDeviceGlobalCircularBuffer create_global_circular_buffer(
GlobalCircularBuffer create_global_circular_buffer(
MeshDevice* mesh_device,
const std::vector<std::pair<CoreCoord, CoreRangeSet>>& sender_receiver_core_mapping,
uint32_t size,
Expand Down
40 changes: 0 additions & 40 deletions ttnn/cpp/ttnn/operations/global_cb_utils.hpp

This file was deleted.

6 changes: 1 addition & 5 deletions ttnn/cpp/ttnn/operations/matmul/device/matmul_op.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2271,10 +2271,6 @@ operation::ProgramWithCallbacks Matmul::create_program(
program_config.fused_activation,
this->untilize_out);
} else if constexpr (std::is_same_v<ProgramConfigType, MatmulMultiCoreReuseMultiCast1DProgramConfig>) {
std::optional<tt::tt_metal::v1::experimental::GlobalCircularBuffer> global_cb = std::nullopt;
if (this->global_cb.has_value()) {
global_cb = get_global_circular_buffer(*this->global_cb, input_tensor_a.device()->id());
}
return matmul_multi_core_reuse_mcast_1d_optimized(
input_tensor_a,
input_tensor_b,
Expand All @@ -2296,7 +2292,7 @@ operation::ProgramWithCallbacks Matmul::create_program(
program_config.gather_in0,
program_config.hop_cores,
this->untilize_out,
global_cb,
this->global_cb,
program_config.num_global_cb_receivers);
} else if constexpr (std::is_same_v<
ProgramConfigType,
Expand Down
3 changes: 1 addition & 2 deletions ttnn/cpp/ttnn/operations/matmul/device/matmul_op.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,6 @@
#include "ttnn/operations/ccl/ccl_op_fusion.hpp"
#include "ttnn/operations/core/compute_kernel/compute_kernel_config.hpp"
#include "ttnn/operations/eltwise/unary/common/unary_op_types.hpp"
#include "ttnn/operations/global_cb_utils.hpp"
#include "ttnn/run_operation.hpp"
#include "ttnn/tensor/tensor.hpp"
#include "ttnn/tensor/tensor_utils.hpp"
Expand Down Expand Up @@ -179,7 +178,7 @@ struct Matmul {
const bool transpose_a = false;
const bool transpose_b = false;
const std::optional<const tt::tt_metal::Tile> output_tile;
const std::optional<const tt::tt_metal::DeviceGlobalCircularBuffer> global_cb;
const std::optional<const GlobalCircularBuffer> global_cb;

void validate(
const std::vector<Tensor>& input_tensors,
Expand Down
4 changes: 2 additions & 2 deletions ttnn/cpp/ttnn/operations/matmul/matmul.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -118,7 +118,7 @@ Tensor MatmulOperation::invoke(
const std::optional<const CoreGrid> core_grid,
const std::optional<const tt::tt_metal::Tile>& output_tile,
std::optional<Tensor> optional_output_tensor,
const std::optional<const tt::tt_metal::DeviceGlobalCircularBuffer>& global_cb) {
const std::optional<const GlobalCircularBuffer>& global_cb) {
std::optional<CoreCoord> user_core_coord;
if (core_grid.has_value()) {
user_core_coord = CoreCoord(core_grid->x, core_grid->y);
Expand Down Expand Up @@ -160,7 +160,7 @@ Tensor LinearOperation::invoke(
const std::optional<const CoreGrid> core_grid,
const std::optional<const tt::tt_metal::Tile>& output_tile,
std::optional<ttnn::Tensor> optional_output_tensor,
const std::optional<const tt::tt_metal::DeviceGlobalCircularBuffer>& global_cb) {
const std::optional<const GlobalCircularBuffer>& global_cb) {
std::optional<CoreCoord> user_core_coord;
if (core_grid.has_value()) {
user_core_coord = CoreCoord(core_grid->x, core_grid->y);
Expand Down
4 changes: 2 additions & 2 deletions ttnn/cpp/ttnn/operations/matmul/matmul.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,7 @@ struct MatmulOperation {
const std::optional<const CoreGrid> core_grid = std::nullopt,
const std::optional<const tt::tt_metal::Tile>& output_tile = std::nullopt,
std::optional<Tensor> optional_output_tensor = std::nullopt,
const std::optional<const tt::tt_metal::DeviceGlobalCircularBuffer>& global_cb = std::nullopt);
const std::optional<const GlobalCircularBuffer>& global_cb = std::nullopt);
};

struct LinearOperation {
Expand All @@ -68,7 +68,7 @@ struct LinearOperation {
const std::optional<const CoreGrid> core_grid = std::nullopt,
const std::optional<const tt::tt_metal::Tile>& output_tile = std::nullopt,
std::optional<Tensor> optional_output_tensor = std::nullopt,
const std::optional<const tt::tt_metal::DeviceGlobalCircularBuffer>& global_cb = std::nullopt);
const std::optional<const GlobalCircularBuffer>& global_cb = std::nullopt);
};

} // namespace matmul
Expand Down
4 changes: 2 additions & 2 deletions ttnn/cpp/ttnn/operations/matmul/matmul_pybind.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -346,7 +346,7 @@ void py_module(py::module& module) {
const std::optional<const ttnn::CoreGrid> core_grid,
const std::optional<const tt::tt_metal::Tile>& output_tile,
std::optional<Tensor>& optional_output_tensor,
const std::optional<const tt::tt_metal::DeviceGlobalCircularBuffer>& global_cb) -> ttnn::Tensor {
const std::optional<const GlobalCircularBuffer>& global_cb) -> ttnn::Tensor {
return self(
input_tensor_a,
input_tensor_b,
Expand Down Expand Up @@ -430,7 +430,7 @@ void py_module(py::module& module) {
const std::optional<const ttnn::CoreGrid> core_grid,
const std::optional<const tt::tt_metal::Tile>& output_tile,
std::optional<Tensor>& optional_output_tensor,
const std::optional<const tt::tt_metal::DeviceGlobalCircularBuffer>& global_cb) -> ttnn::Tensor {
const std::optional<const GlobalCircularBuffer>& global_cb) -> ttnn::Tensor {
return self(
input_tensor_a,
input_tensor_b,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ void DramPrefetcher::validate(const std::vector<Tensor>& input_tensors) const {
TT_FATAL(global_cb.has_value(), "Global circular buffer must be provided");
ttnn::Tensor tensor_addrs = input_tensors.back(); // Last tensor is tensor_addrs

auto global_cb = tt::tt_metal::get_global_circular_buffer(*this->global_cb, input_tensors[0].device()->id());
auto global_cb = *this->global_cb;
uint32_t num_receiver_cores = global_cb.receiver_cores().num_cores();
uint32_t num_sender_cores = global_cb.sender_cores().num_cores();

Expand Down Expand Up @@ -80,8 +80,7 @@ std::vector<ttnn::TensorSpec> DramPrefetcher::compute_output_specs(const std::ve
}
tt::tt_metal::operation::ProgramWithCallbacks DramPrefetcher::create_program(
const std::vector<Tensor>& input_tensors, std::vector<Tensor>& output_tensors) const {
auto global_cb = tt::tt_metal::get_global_circular_buffer(*this->global_cb, input_tensors[0].device()->id());
return dram_prefetcher_multi_core(input_tensors, this->num_layers, global_cb);
return dram_prefetcher_multi_core(input_tensors, this->num_layers, *this->global_cb);
}

} // namespace ttnn::operations::dram_prefetcher
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,6 @@
#include "ttnn/run_operation.hpp"
#include "ttnn/tensor/tensor.hpp"
#include "ttnn/operations/core/core.hpp"
#include "ttnn/operations/global_cb_utils.hpp"

#include <tt-metalium/global_circular_buffer_impl.hpp>
#include <tt-metalium/global_circular_buffer.hpp>
Expand All @@ -23,7 +22,7 @@ tt::tt_metal::operation::ProgramWithCallbacks dram_prefetcher_multi_core(
const tt::tt_metal::v1::experimental::GlobalCircularBuffer& global_cb);

struct DramPrefetcher {
const std::optional<const tt::tt_metal::DeviceGlobalCircularBuffer> global_cb;
const std::optional<const GlobalCircularBuffer> global_cb;
const uint32_t num_layers;

void validate(const std::vector<Tensor>& input_tensors) const;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ namespace ttnn::operations::dram_prefetcher {
Tensor ExecuteDramPrefetcher::invoke(
std::vector<ttnn::Tensor>& tensors,
const uint32_t num_layers,
const std::optional<const tt::tt_metal::DeviceGlobalCircularBuffer>& global_cb) {
const std::optional<const GlobalCircularBuffer>& global_cb) {
std::vector<Tensor> output_tensors = {Tensor(tt::tt_metal::operation::get_workers_for_op_output(tensors))};
tt::tt_metal::operation::launch_op(
[num_layers, global_cb](
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ struct ExecuteDramPrefetcher {
static ttnn::Tensor invoke(
std::vector<ttnn::Tensor>& tensors,
const uint32_t num_layers,
const std::optional<const tt::tt_metal::DeviceGlobalCircularBuffer>& global_cb);
const std::optional<const GlobalCircularBuffer>& global_cb);
};

} // namespace operations::dram_prefetcher
Expand Down

0 comments on commit 2833697

Please sign in to comment.