Remove MultiDeviceGlobalCircularBuffer

- TT-Metal already supports Global CBs backed by MeshBuffer - Expose this concept to TTNN, and remove the concept of the MultiDeviceGlobalCircularBuffer - create_global_circular_buffer(mesh_device...) is now hooked up to a GlobalCircularBuffer - Add T3K prefetcher test using Global CBs to track regressions
tenstorrent · Mar 9, 2025 · 2833697 · 2833697
1 parent 6da7594
commit 2833697
Show file tree

Hide file tree

Showing 15 changed files with 73 additions and 88 deletions.
diff --git a/tests/scripts/t3000/run_t3000_unit_tests.sh b/tests/scripts/t3000/run_t3000_unit_tests.sh
@@ -83,6 +83,7 @@ run_t3000_ttnn_tests() {
   ./build/test/ttnn/unit_tests_ttnn_ccl
   WH_ARCH_YAML=wormhole_b0_80_arch_eth_dispatch.yaml pytest tests/ttnn/unit_tests/test_multi_device_trace.py ; fail+=$?
   WH_ARCH_YAML=wormhole_b0_80_arch_eth_dispatch.yaml pytest tests/ttnn/unit_tests/test_multi_device_events.py ; fail+=$?
+  WH_ARCH_YAML=wormhole_b0_80_arch_eth_dispatch.yaml pytest tests/ttnn/unit_tests/operations/test_prefetcher.py::test_run_prefetcher_post_commit_multi_device ; fail+=$?
   pytest -n auto tests/ttnn/unit_tests/test_multi_device.py ; fail+=$?
   pytest -n auto tests/ttnn/unit_tests/test_multi_device_async.py ; fail+=$?
   pytest tests/ttnn/distributed/test_tensor_parallel_example_T3000.py ; fail+=$?

diff --git a/tests/ttnn/unit_tests/operations/test_prefetcher.py b/tests/ttnn/unit_tests/operations/test_prefetcher.py
@@ -56,3 +56,57 @@ def test_run_prefetcher_post_commit(
         dtypes,
         is_functional_test=True,
     )
+
+
+# This is not tested on main, but we need to test Global CB functionality through
+# TT-Mesh.
+# Test DRAM Prefetcher x Matmul on T3K. Can be removed once we bringup support on Galaxy
+# since we have tests for that system.
+@pytest.mark.skipif(is_grayskull(), reason="GS not supported")
+@pytest.mark.parametrize(
+    "num_reader_cores, num_tensors, input_shapes, dtypes, num_layers",
+    [
+        (2, 2, [(256, 512), (256, 512)], [ttnn.bfloat4_b] * 2, 5),
+        (2, 2, [(1024, 256), (1024, 256)], [ttnn.bfloat4_b] * 2, 5),
+        (2, 2, [(128, 128), (128, 128)], [ttnn.bfloat4_b] * 2, 2),
+        (2, 2, [(256, 1024), (256, 2048)], [ttnn.bfloat4_b, ttnn.bfloat8_b], 1),
+        (2, 3, [(256, 1024), (256, 2048), (512, 256)], [ttnn.bfloat4_b, ttnn.bfloat8_b, ttnn.bfloat4_b], 5),
+        (2, 2, [(256, 1024), (128, 128)], [ttnn.bfloat4_b, ttnn.bfloat8_b], 5),
+        (2, 3, [(256, 1024), (128, 128), (1024, 256)], [ttnn.bfloat4_b, ttnn.bfloat8_b, ttnn.bfloat4_b], 5),
+        # Padding check
+        (
+            2,
+            3,
+            [(256 + 32, 512 + 224), (128, 128 + 64), (512 + 256, 224)],
+            [ttnn.bfloat4_b, ttnn.bfloat8_b, ttnn.bfloat4_b],
+            5,
+        ),
+    ],
+)
+@pytest.mark.parametrize(
+    "device_params",
+    [{"trace_region_size": 23887872}],
+    indirect=True,
+)
+def test_run_prefetcher_post_commit_multi_device(
+    mesh_device,
+    num_tensors,
+    input_shapes,
+    num_layers,
+    num_reader_cores,
+    dtypes,
+    use_program_cache,
+    function_level_defaults,
+):
+    if mesh_device.get_num_devices() <= 1:
+        pytest.skip("This test requires multiple devices")
+
+    run_prefetcher_mm(
+        mesh_device,
+        num_tensors,
+        input_shapes,
+        num_layers,
+        num_reader_cores,
+        dtypes,
+        is_functional_test=True,
+    )
diff --git a/ttnn/cpp/pybind11/global_circular_buffer.cpp b/ttnn/cpp/pybind11/global_circular_buffer.cpp
@@ -12,7 +12,6 @@ namespace ttnn::global_circular_buffer {
 
 void py_module_types(py::module& module) {
     py::class_<GlobalCircularBuffer, std::shared_ptr<GlobalCircularBuffer>>(module, "global_circular_buffer");
-    py::class_<MultiDeviceGlobalCircularBuffer>(module, "multi_device_global_circular_buffer");
 }
 
 void py_module(py::module& module) {

diff --git a/ttnn/cpp/ttnn/global_circular_buffer.cpp b/ttnn/cpp/ttnn/global_circular_buffer.cpp
@@ -10,13 +10,6 @@
 
 namespace ttnn::global_circular_buffer {
 
-MultiDeviceGlobalCircularBuffer::MultiDeviceGlobalCircularBuffer(MeshDevice* mesh_device) {
-    TT_ASSERT(
-        mesh_device != nullptr,
-        "Must provide a valid mesh_device when initializing a global circular buffer on multiple devices.");
-    this->global_circular_buffers.reserve(mesh_device->num_devices());
-}
-
 GlobalCircularBuffer create_global_circular_buffer(
     IDevice* device,
     const std::vector<std::pair<CoreCoord, CoreRangeSet>>& sender_receiver_core_mapping,
@@ -26,20 +19,13 @@ GlobalCircularBuffer create_global_circular_buffer(
         device, sender_receiver_core_mapping, size, buffer_type);
 }
 
-MultiDeviceGlobalCircularBuffer create_global_circular_buffer(
-    MeshDevice* mesh_device,
+GlobalCircularBuffer create_global_circular_buffer(
+    MeshDevice* device,
     const std::vector<std::pair<CoreCoord, CoreRangeSet>>& sender_receiver_core_mapping,
     uint32_t size,
     BufferType buffer_type) {
-    MultiDeviceGlobalCircularBuffer multi_device_global_cb(mesh_device);
-    auto& global_circular_buffers = multi_device_global_cb.global_circular_buffers;
-    const auto& devices = mesh_device->get_devices();
-    for (uint32_t i = 0; i < devices.size(); ++i) {
-        auto* device = devices[i];
-        global_circular_buffers.push_back(
-            create_global_circular_buffer(device, sender_receiver_core_mapping, size, buffer_type));
-    }
-    return multi_device_global_cb;
+    return tt::tt_metal::v1::experimental::CreateGlobalCircularBuffer(
+        device, sender_receiver_core_mapping, size, buffer_type);
 }
 
 }  // namespace ttnn::global_circular_buffer
diff --git a/ttnn/cpp/ttnn/global_circular_buffer.hpp b/ttnn/cpp/ttnn/global_circular_buffer.hpp
@@ -10,14 +10,6 @@
 
 namespace ttnn::global_circular_buffer {
 
-struct MultiDeviceGlobalCircularBuffer {
-    MultiDeviceGlobalCircularBuffer(MeshDevice* mesh_device);
-    std::vector<GlobalCircularBuffer> global_circular_buffers;
-
-    static constexpr auto attribute_names = std::forward_as_tuple("global_circular_buffers");
-    const auto attribute_values() const { return std::forward_as_tuple(this->global_circular_buffers); }
-};
-
 // Single Device APIs
 GlobalCircularBuffer create_global_circular_buffer(
     IDevice* device,
@@ -26,7 +18,7 @@ GlobalCircularBuffer create_global_circular_buffer(
     BufferType buffer_type = BufferType::L1);
 
 // Multi Device APIs
-MultiDeviceGlobalCircularBuffer create_global_circular_buffer(
+GlobalCircularBuffer create_global_circular_buffer(
     MeshDevice* mesh_device,
     const std::vector<std::pair<CoreCoord, CoreRangeSet>>& sender_receiver_core_mapping,
     uint32_t size,

diff --git a/ttnn/cpp/ttnn/operations/global_cb_utils.hpp b/ttnn/cpp/ttnn/operations/global_cb_utils.hpp
diff --git a/ttnn/cpp/ttnn/operations/matmul/device/matmul_op.cpp b/ttnn/cpp/ttnn/operations/matmul/device/matmul_op.cpp
@@ -2271,10 +2271,6 @@ operation::ProgramWithCallbacks Matmul::create_program(
                     program_config.fused_activation,
                     this->untilize_out);
             } else if constexpr (std::is_same_v<ProgramConfigType, MatmulMultiCoreReuseMultiCast1DProgramConfig>) {
-                std::optional<tt::tt_metal::v1::experimental::GlobalCircularBuffer> global_cb = std::nullopt;
-                if (this->global_cb.has_value()) {
-                    global_cb = get_global_circular_buffer(*this->global_cb, input_tensor_a.device()->id());
-                }
                 return matmul_multi_core_reuse_mcast_1d_optimized(
                     input_tensor_a,
                     input_tensor_b,
@@ -2296,7 +2292,7 @@ operation::ProgramWithCallbacks Matmul::create_program(
                     program_config.gather_in0,
                     program_config.hop_cores,
                     this->untilize_out,
-                    global_cb,
+                    this->global_cb,
                     program_config.num_global_cb_receivers);
             } else if constexpr (std::is_same_v<
                                      ProgramConfigType,

diff --git a/ttnn/cpp/ttnn/operations/matmul/device/matmul_op.hpp b/ttnn/cpp/ttnn/operations/matmul/device/matmul_op.hpp
@@ -8,7 +8,6 @@
 #include "ttnn/operations/ccl/ccl_op_fusion.hpp"
 #include "ttnn/operations/core/compute_kernel/compute_kernel_config.hpp"
 #include "ttnn/operations/eltwise/unary/common/unary_op_types.hpp"
-#include "ttnn/operations/global_cb_utils.hpp"
 #include "ttnn/run_operation.hpp"
 #include "ttnn/tensor/tensor.hpp"
 #include "ttnn/tensor/tensor_utils.hpp"
@@ -179,7 +178,7 @@ struct Matmul {
     const bool transpose_a = false;
     const bool transpose_b = false;
     const std::optional<const tt::tt_metal::Tile> output_tile;
-    const std::optional<const tt::tt_metal::DeviceGlobalCircularBuffer> global_cb;
+    const std::optional<const GlobalCircularBuffer> global_cb;
 
     void validate(
         const std::vector<Tensor>& input_tensors,

diff --git a/ttnn/cpp/ttnn/operations/matmul/matmul.cpp b/ttnn/cpp/ttnn/operations/matmul/matmul.cpp
@@ -118,7 +118,7 @@ Tensor MatmulOperation::invoke(
     const std::optional<const CoreGrid> core_grid,
     const std::optional<const tt::tt_metal::Tile>& output_tile,
     std::optional<Tensor> optional_output_tensor,
-    const std::optional<const tt::tt_metal::DeviceGlobalCircularBuffer>& global_cb) {
+    const std::optional<const GlobalCircularBuffer>& global_cb) {
     std::optional<CoreCoord> user_core_coord;
     if (core_grid.has_value()) {
         user_core_coord = CoreCoord(core_grid->x, core_grid->y);
@@ -160,7 +160,7 @@ Tensor LinearOperation::invoke(
     const std::optional<const CoreGrid> core_grid,
     const std::optional<const tt::tt_metal::Tile>& output_tile,
     std::optional<ttnn::Tensor> optional_output_tensor,
-    const std::optional<const tt::tt_metal::DeviceGlobalCircularBuffer>& global_cb) {
+    const std::optional<const GlobalCircularBuffer>& global_cb) {
     std::optional<CoreCoord> user_core_coord;
     if (core_grid.has_value()) {
         user_core_coord = CoreCoord(core_grid->x, core_grid->y);

diff --git a/ttnn/cpp/ttnn/operations/matmul/matmul.hpp b/ttnn/cpp/ttnn/operations/matmul/matmul.hpp
@@ -50,7 +50,7 @@ struct MatmulOperation {
         const std::optional<const CoreGrid> core_grid = std::nullopt,
         const std::optional<const tt::tt_metal::Tile>& output_tile = std::nullopt,
         std::optional<Tensor> optional_output_tensor = std::nullopt,
-        const std::optional<const tt::tt_metal::DeviceGlobalCircularBuffer>& global_cb = std::nullopt);
+        const std::optional<const GlobalCircularBuffer>& global_cb = std::nullopt);
 };
 
 struct LinearOperation {
@@ -68,7 +68,7 @@ struct LinearOperation {
         const std::optional<const CoreGrid> core_grid = std::nullopt,
         const std::optional<const tt::tt_metal::Tile>& output_tile = std::nullopt,
         std::optional<Tensor> optional_output_tensor = std::nullopt,
-        const std::optional<const tt::tt_metal::DeviceGlobalCircularBuffer>& global_cb = std::nullopt);
+        const std::optional<const GlobalCircularBuffer>& global_cb = std::nullopt);
 };
 
 }  // namespace matmul

diff --git a/ttnn/cpp/ttnn/operations/matmul/matmul_pybind.cpp b/ttnn/cpp/ttnn/operations/matmul/matmul_pybind.cpp
@@ -346,7 +346,7 @@ void py_module(py::module& module) {
                const std::optional<const ttnn::CoreGrid> core_grid,
                const std::optional<const tt::tt_metal::Tile>& output_tile,
                std::optional<Tensor>& optional_output_tensor,
-               const std::optional<const tt::tt_metal::DeviceGlobalCircularBuffer>& global_cb) -> ttnn::Tensor {
+               const std::optional<const GlobalCircularBuffer>& global_cb) -> ttnn::Tensor {
                 return self(
                     input_tensor_a,
                     input_tensor_b,
@@ -430,7 +430,7 @@ void py_module(py::module& module) {
                const std::optional<const ttnn::CoreGrid> core_grid,
                const std::optional<const tt::tt_metal::Tile>& output_tile,
                std::optional<Tensor>& optional_output_tensor,
-               const std::optional<const tt::tt_metal::DeviceGlobalCircularBuffer>& global_cb) -> ttnn::Tensor {
+               const std::optional<const GlobalCircularBuffer>& global_cb) -> ttnn::Tensor {
                 return self(
                     input_tensor_a,
                     input_tensor_b,

diff --git a/ttnn/cpp/ttnn/operations/prefetcher/prefetcher/device/dram_prefetcher_op.cpp b/ttnn/cpp/ttnn/operations/prefetcher/prefetcher/device/dram_prefetcher_op.cpp
@@ -21,7 +21,7 @@ void DramPrefetcher::validate(const std::vector<Tensor>& input_tensors) const {
     TT_FATAL(global_cb.has_value(), "Global circular buffer must be provided");
     ttnn::Tensor tensor_addrs = input_tensors.back();  // Last tensor is tensor_addrs
 
-    auto global_cb = tt::tt_metal::get_global_circular_buffer(*this->global_cb, input_tensors[0].device()->id());
+    auto global_cb = *this->global_cb;
     uint32_t num_receiver_cores = global_cb.receiver_cores().num_cores();
     uint32_t num_sender_cores = global_cb.sender_cores().num_cores();
 
@@ -80,8 +80,7 @@ std::vector<ttnn::TensorSpec> DramPrefetcher::compute_output_specs(const std::ve
 }
 tt::tt_metal::operation::ProgramWithCallbacks DramPrefetcher::create_program(
     const std::vector<Tensor>& input_tensors, std::vector<Tensor>& output_tensors) const {
-    auto global_cb = tt::tt_metal::get_global_circular_buffer(*this->global_cb, input_tensors[0].device()->id());
-    return dram_prefetcher_multi_core(input_tensors, this->num_layers, global_cb);
+    return dram_prefetcher_multi_core(input_tensors, this->num_layers, *this->global_cb);
 }
 
 }  // namespace ttnn::operations::dram_prefetcher
diff --git a/ttnn/cpp/ttnn/operations/prefetcher/prefetcher/device/dram_prefetcher_op.hpp b/ttnn/cpp/ttnn/operations/prefetcher/prefetcher/device/dram_prefetcher_op.hpp
@@ -9,7 +9,6 @@
 #include "ttnn/run_operation.hpp"
 #include "ttnn/tensor/tensor.hpp"
 #include "ttnn/operations/core/core.hpp"
-#include "ttnn/operations/global_cb_utils.hpp"
 
 #include <tt-metalium/global_circular_buffer_impl.hpp>
 #include <tt-metalium/global_circular_buffer.hpp>
@@ -23,7 +22,7 @@ tt::tt_metal::operation::ProgramWithCallbacks dram_prefetcher_multi_core(
     const tt::tt_metal::v1::experimental::GlobalCircularBuffer& global_cb);
 
 struct DramPrefetcher {
-    const std::optional<const tt::tt_metal::DeviceGlobalCircularBuffer> global_cb;
+    const std::optional<const GlobalCircularBuffer> global_cb;
     const uint32_t num_layers;
 
     void validate(const std::vector<Tensor>& input_tensors) const;

diff --git a/ttnn/cpp/ttnn/operations/prefetcher/prefetcher/dram_prefetcher.cpp b/ttnn/cpp/ttnn/operations/prefetcher/prefetcher/dram_prefetcher.cpp
@@ -14,7 +14,7 @@ namespace ttnn::operations::dram_prefetcher {
 Tensor ExecuteDramPrefetcher::invoke(
     std::vector<ttnn::Tensor>& tensors,
     const uint32_t num_layers,
-    const std::optional<const tt::tt_metal::DeviceGlobalCircularBuffer>& global_cb) {
+    const std::optional<const GlobalCircularBuffer>& global_cb) {
     std::vector<Tensor> output_tensors = {Tensor(tt::tt_metal::operation::get_workers_for_op_output(tensors))};
     tt::tt_metal::operation::launch_op(
         [num_layers, global_cb](

diff --git a/ttnn/cpp/ttnn/operations/prefetcher/prefetcher/dram_prefetcher.hpp b/ttnn/cpp/ttnn/operations/prefetcher/prefetcher/dram_prefetcher.hpp
@@ -17,7 +17,7 @@ struct ExecuteDramPrefetcher {
     static ttnn::Tensor invoke(
         std::vector<ttnn::Tensor>& tensors,
         const uint32_t num_layers,
-        const std::optional<const tt::tt_metal::DeviceGlobalCircularBuffer>& global_cb);
+        const std::optional<const GlobalCircularBuffer>& global_cb);
 };
 
 }  // namespace operations::dram_prefetcher