diff --git a/tests/scripts/t3000/run_t3000_unit_tests.sh b/tests/scripts/t3000/run_t3000_unit_tests.sh
index beed6455b8e..2b0cdf8cb57 100755
--- a/tests/scripts/t3000/run_t3000_unit_tests.sh
+++ b/tests/scripts/t3000/run_t3000_unit_tests.sh
@@ -83,6 +83,7 @@ run_t3000_ttnn_tests() {
   ./build/test/ttnn/unit_tests_ttnn_ccl
   WH_ARCH_YAML=wormhole_b0_80_arch_eth_dispatch.yaml pytest tests/ttnn/unit_tests/test_multi_device_trace.py ; fail+=$?
   WH_ARCH_YAML=wormhole_b0_80_arch_eth_dispatch.yaml pytest tests/ttnn/unit_tests/test_multi_device_events.py ; fail+=$?
+  WH_ARCH_YAML=wormhole_b0_80_arch_eth_dispatch.yaml pytest tests/ttnn/unit_tests/operations/test_prefetcher.py::test_run_prefetcher_post_commit_multi_device ; fail+=$?
   pytest -n auto tests/ttnn/unit_tests/test_multi_device.py ; fail+=$?
   pytest -n auto tests/ttnn/unit_tests/test_multi_device_async.py ; fail+=$?
   pytest tests/ttnn/distributed/test_tensor_parallel_example_T3000.py ; fail+=$?
diff --git a/tests/ttnn/unit_tests/operations/test_prefetcher.py b/tests/ttnn/unit_tests/operations/test_prefetcher.py
index 83bb0dbfabc..37e78e144c7 100644
--- a/tests/ttnn/unit_tests/operations/test_prefetcher.py
+++ b/tests/ttnn/unit_tests/operations/test_prefetcher.py
@@ -56,3 +56,57 @@ def test_run_prefetcher_post_commit(
         dtypes,
         is_functional_test=True,
     )
+
+
+# This is not tested on main, but we need to test Global CB functionality through
+# TT-Mesh.
+# Tests DRAM Prefetcher x Matmul on T3K. Can be removed once we bring up support
+# on Galaxy, since we have tests for that system.
+@pytest.mark.skipif(is_grayskull(), reason="GS not supported")
+@pytest.mark.parametrize(
+    "num_reader_cores, num_tensors, input_shapes, dtypes, num_layers",
+    [
+        (2, 2, [(256, 512), (256, 512)], [ttnn.bfloat4_b] * 2, 5),
+        (2, 2, [(1024, 256), (1024, 256)], [ttnn.bfloat4_b] * 2, 5),
+        (2, 2, [(128, 128), (128, 128)], [ttnn.bfloat4_b] * 2, 2),
+        (2, 2, [(256, 1024), (256, 2048)], [ttnn.bfloat4_b, ttnn.bfloat8_b], 1),
+        (2, 3, [(256, 1024), (256, 2048), (512, 256)], [ttnn.bfloat4_b, ttnn.bfloat8_b, ttnn.bfloat4_b], 5),
+        (2, 2, [(256, 1024), (128, 128)], [ttnn.bfloat4_b, ttnn.bfloat8_b], 5),
+        (2, 3, [(256, 1024), (128, 128), (1024, 256)], [ttnn.bfloat4_b, ttnn.bfloat8_b, ttnn.bfloat4_b], 5),
+        # Padding check
+        (
+            2,
+            3,
+            [(256 + 32, 512 + 224), (128, 128 + 64), (512 + 256, 224)],
+            [ttnn.bfloat4_b, ttnn.bfloat8_b, ttnn.bfloat4_b],
+            5,
+        ),
+    ],
+)
+@pytest.mark.parametrize(
+    "device_params",
+    [{"trace_region_size": 23887872}],
+    indirect=True,
+)
+def test_run_prefetcher_post_commit_multi_device(
+    mesh_device,
+    num_tensors,
+    input_shapes,
+    num_layers,
+    num_reader_cores,
+    dtypes,
+    use_program_cache,
+    function_level_defaults,
+):
+    if mesh_device.get_num_devices() <= 1:
+        pytest.skip("This test requires multiple devices")
+
+    run_prefetcher_mm(
+        mesh_device,
+        num_tensors,
+        input_shapes,
+        num_layers,
+        num_reader_cores,
+        dtypes,
+        is_functional_test=True,
+    )
diff --git a/ttnn/cpp/pybind11/global_circular_buffer.cpp b/ttnn/cpp/pybind11/global_circular_buffer.cpp
index ce73ea3f358..cdee6a139f8 100644
--- a/ttnn/cpp/pybind11/global_circular_buffer.cpp
+++ b/ttnn/cpp/pybind11/global_circular_buffer.cpp
@@ -12,7 +12,6 @@ namespace ttnn::global_circular_buffer {
 
 void py_module_types(py::module& module) {
     py::class_<GlobalCircularBuffer, std::shared_ptr<GlobalCircularBuffer>>(module, "global_circular_buffer");
-    py::class_<MultiDeviceGlobalCircularBuffer>(module, "multi_device_global_circular_buffer");
 }
 
 void py_module(py::module& module) {
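The pybind change above is the Python-visible half of the unification: with the multi-device wrapper gone from C++, only one registered class remains. A self-contained sketch of the registration pattern with a stand-in type (not the file's real code; the real binding registers the tt-metal class shown above):

// Minimal pybind11 sketch of the surviving py_module_types pattern.
#include <memory>
#include <pybind11/pybind11.h>

namespace py = pybind11;

struct GlobalCircularBufferStandIn {};  // placeholder for the bound C++ type

PYBIND11_MODULE(example_gcb, m) {
    // One class registration now serves both the single-device and the mesh
    // path; the removed py::class_ line bound a wrapper type that no longer
    // exists on the C++ side.
    py::class_<GlobalCircularBufferStandIn, std::shared_ptr<GlobalCircularBufferStandIn>>(
        m, "global_circular_buffer");
}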
diff --git a/ttnn/cpp/ttnn/global_circular_buffer.cpp b/ttnn/cpp/ttnn/global_circular_buffer.cpp
index 757f3feb350..34f2ce0610a 100644
--- a/ttnn/cpp/ttnn/global_circular_buffer.cpp
+++ b/ttnn/cpp/ttnn/global_circular_buffer.cpp
@@ -10,13 +10,6 @@
 
 namespace ttnn::global_circular_buffer {
 
-MultiDeviceGlobalCircularBuffer::MultiDeviceGlobalCircularBuffer(MeshDevice* mesh_device) {
-    TT_ASSERT(
-        mesh_device != nullptr,
-        "Must provide a valid mesh_device when initializing a global circular buffer on multiple devices.");
-    this->global_circular_buffers.reserve(mesh_device->num_devices());
-}
-
 GlobalCircularBuffer create_global_circular_buffer(
     IDevice* device,
     const std::vector<std::pair<CoreCoord, CoreRangeSet>>& sender_receiver_core_mapping,
@@ -26,20 +19,13 @@ GlobalCircularBuffer create_global_circular_buffer(
         device, sender_receiver_core_mapping, size, buffer_type);
 }
 
-MultiDeviceGlobalCircularBuffer create_global_circular_buffer(
-    MeshDevice* mesh_device,
+GlobalCircularBuffer create_global_circular_buffer(
+    MeshDevice* device,
     const std::vector<std::pair<CoreCoord, CoreRangeSet>>& sender_receiver_core_mapping,
     uint32_t size,
     BufferType buffer_type) {
-    MultiDeviceGlobalCircularBuffer multi_device_global_cb(mesh_device);
-    auto& global_circular_buffers = multi_device_global_cb.global_circular_buffers;
-    const auto& devices = mesh_device->get_devices();
-    for (uint32_t i = 0; i < devices.size(); ++i) {
-        auto* device = devices[i];
-        global_circular_buffers.push_back(
-            create_global_circular_buffer(device, sender_receiver_core_mapping, size, buffer_type));
-    }
-    return multi_device_global_cb;
+    return tt::tt_metal::v1::experimental::CreateGlobalCircularBuffer(
+        device, sender_receiver_core_mapping, size, buffer_type);
 }
 
 }  // namespace ttnn::global_circular_buffer
diff --git a/ttnn/cpp/ttnn/global_circular_buffer.hpp b/ttnn/cpp/ttnn/global_circular_buffer.hpp
index 6235e4d9461..2e1733e80ad 100644
--- a/ttnn/cpp/ttnn/global_circular_buffer.hpp
+++ b/ttnn/cpp/ttnn/global_circular_buffer.hpp
@@ -10,14 +10,6 @@
 
 namespace ttnn::global_circular_buffer {
 
-struct MultiDeviceGlobalCircularBuffer {
-    MultiDeviceGlobalCircularBuffer(MeshDevice* mesh_device);
-    std::vector<GlobalCircularBuffer> global_circular_buffers;
-
-    static constexpr auto attribute_names = std::forward_as_tuple("global_circular_buffers");
-    const auto attribute_values() const { return std::forward_as_tuple(this->global_circular_buffers); }
-};
-
 // Single Device APIs
 GlobalCircularBuffer create_global_circular_buffer(
     IDevice* device,
@@ -26,7 +18,7 @@
     BufferType buffer_type = BufferType::L1);
 
 // Multi Device APIs
-MultiDeviceGlobalCircularBuffer create_global_circular_buffer(
+GlobalCircularBuffer create_global_circular_buffer(
     MeshDevice* mesh_device,
     const std::vector<std::pair<CoreCoord, CoreRangeSet>>& sender_receiver_core_mapping,
     uint32_t size,
     BufferType buffer_type = BufferType::L1);
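Both declarations above now form one overload set with a common return type. A minimal usage sketch, assuming valid `IDevice*`/`MeshDevice*` handles and that `GlobalCircularBuffer` is in scope as in the header; the core mapping, the sizes, and the single-range `CoreRangeSet` constructor are illustrative assumptions, not taken from this diff:

#include "ttnn/global_circular_buffer.hpp"

using namespace ttnn::global_circular_buffer;

GlobalCircularBuffer make_global_cbs(IDevice* device, MeshDevice* mesh_device) {
    // One sender core feeding two receiver cores; coordinates are made up.
    std::vector<std::pair<CoreCoord, CoreRangeSet>> mapping = {
        {CoreCoord(0, 0), CoreRangeSet(CoreRange(CoreCoord(1, 0), CoreCoord(2, 0)))},
    };

    // Single-device overload: unchanged by this diff (buffer_type defaults to L1).
    GlobalCircularBuffer single_cb = create_global_circular_buffer(device, mapping, /*size=*/4096);

    // Mesh overload: same name and, after this diff, the same return type; it
    // forwards to tt::tt_metal::v1::experimental::CreateGlobalCircularBuffer.
    GlobalCircularBuffer mesh_cb = create_global_circular_buffer(mesh_device, mapping, /*size=*/4096);
    return mesh_cb;
}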
diff --git a/ttnn/cpp/ttnn/operations/global_cb_utils.hpp b/ttnn/cpp/ttnn/operations/global_cb_utils.hpp
deleted file mode 100644
index 536ce6e723c..00000000000
--- a/ttnn/cpp/ttnn/operations/global_cb_utils.hpp
+++ /dev/null
@@ -1,40 +0,0 @@
-// SPDX-FileCopyrightText: © 2025 Tenstorrent Inc.
-//
-// SPDX-License-Identifier: Apache-2.0
-
-#pragma once
-
-#include "ttnn/global_circular_buffer.hpp"
-
-#include <tt-metalium/global_circular_buffer_impl.hpp>
-#include <variant>
-
-namespace tt::tt_metal {
-
-// TODO: remove once the multidevice/single device objects are unified
-using DeviceGlobalCircularBuffer = std::variant<
-    std::monostate,
-    tt::tt_metal::v1::experimental::GlobalCircularBuffer,
-    ttnn::global_circular_buffer::MultiDeviceGlobalCircularBuffer>;
-
-inline tt::tt_metal::v1::experimental::GlobalCircularBuffer get_global_circular_buffer(
-    DeviceGlobalCircularBuffer device_global_cb, chip_id_t device_id) {
-    if (std::holds_alternative<tt::tt_metal::v1::experimental::GlobalCircularBuffer>(device_global_cb)) {
-        return std::get<tt::tt_metal::v1::experimental::GlobalCircularBuffer>(device_global_cb);
-    } else {
-        auto& multi_device_global_cb =
-            std::get<ttnn::global_circular_buffer::MultiDeviceGlobalCircularBuffer>(device_global_cb);
-
-        for (auto& global_cb_ : multi_device_global_cb.global_circular_buffers) {
-            tt::tt_metal::IDevice* global_cb_device = global_cb_.get_device();
-            auto global_device_id = global_cb_device->id();
-            if (device_id == global_device_id) {
-                return global_cb_;
-            }
-        }
-
-        TT_THROW("Error finding a device for a GlobalCB in MultiDeviceGlobalCircularBuffer");
-    }
-}
-
-}  // namespace tt::tt_metal
diff --git a/ttnn/cpp/ttnn/operations/matmul/device/matmul_op.cpp b/ttnn/cpp/ttnn/operations/matmul/device/matmul_op.cpp
index 3d72eb55267..7a03cc81b5a 100644
--- a/ttnn/cpp/ttnn/operations/matmul/device/matmul_op.cpp
+++ b/ttnn/cpp/ttnn/operations/matmul/device/matmul_op.cpp
@@ -2271,10 +2271,6 @@ operation::ProgramWithCallbacks Matmul::create_program(
             program_config.fused_activation,
             this->untilize_out);
     } else if constexpr (std::is_same_v<ProgramConfigType, MatmulMultiCoreReuseMultiCast1DProgramConfig>) {
-        std::optional<tt::tt_metal::v1::experimental::GlobalCircularBuffer> global_cb = std::nullopt;
-        if (this->global_cb.has_value()) {
-            global_cb = get_global_circular_buffer(*this->global_cb, input_tensor_a.device()->id());
-        }
         return matmul_multi_core_reuse_mcast_1d_optimized(
             input_tensor_a,
             input_tensor_b,
@@ -2296,7 +2292,7 @@ operation::ProgramWithCallbacks Matmul::create_program(
             program_config.gather_in0,
             program_config.hop_cores,
             this->untilize_out,
-            global_cb,
+            this->global_cb,
             program_config.num_global_cb_receivers);
     } else if constexpr (std::is_same_v<
                              ProgramConfigType,
diff --git a/ttnn/cpp/ttnn/operations/matmul/device/matmul_op.hpp b/ttnn/cpp/ttnn/operations/matmul/device/matmul_op.hpp
index c3f99b37056..9c55a665f35 100644
--- a/ttnn/cpp/ttnn/operations/matmul/device/matmul_op.hpp
+++ b/ttnn/cpp/ttnn/operations/matmul/device/matmul_op.hpp
@@ -8,7 +8,6 @@
 #include "ttnn/operations/ccl/ccl_op_fusion.hpp"
 #include "ttnn/operations/core/compute_kernel/compute_kernel_config.hpp"
 #include "ttnn/operations/eltwise/unary/common/unary_op_types.hpp"
-#include "ttnn/operations/global_cb_utils.hpp"
 #include "ttnn/run_operation.hpp"
 #include "ttnn/tensor/tensor.hpp"
 #include "ttnn/tensor/tensor_utils.hpp"
@@ -179,7 +178,7 @@ struct Matmul {
     const bool transpose_a = false;
     const bool transpose_b = false;
     const std::optional<const tt::tt_metal::Tile> output_tile;
-    const std::optional<tt::tt_metal::DeviceGlobalCircularBuffer> global_cb;
+    const std::optional<tt::tt_metal::v1::experimental::GlobalCircularBuffer> global_cb;
 
     void validate(
         const std::vector<Tensor>& input_tensors,
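For context on the deletion above: `global_cb_utils.hpp` existed only to dispatch over a variant and fish out the per-device buffer. A self-contained sketch of that pattern with stand-in types (plain C++, not tt-metal code) shows the lookup every call site needed before the unification:

#include <stdexcept>
#include <variant>
#include <vector>

struct GlobalCB { int device_id; };                         // stand-in for the metal type
struct MultiDeviceGlobalCB { std::vector<GlobalCB> cbs; };  // stand-in for the removed wrapper

// Mirrors the deleted DeviceGlobalCircularBuffer alias.
using DeviceGlobalCB = std::variant<std::monostate, GlobalCB, MultiDeviceGlobalCB>;

// Mirrors the deleted get_global_circular_buffer(): pick the single buffer,
// or scan the multi-device wrapper for the entry matching the device id.
GlobalCB get_global_cb(const DeviceGlobalCB& cb, int device_id) {
    if (const auto* single = std::get_if<GlobalCB>(&cb)) {
        return *single;
    }
    for (const auto& entry : std::get<MultiDeviceGlobalCB>(cb).cbs) {
        if (entry.device_id == device_id) {
            return entry;
        }
    }
    throw std::runtime_error("no GlobalCB allocated for this device");
}

Storing a single `GlobalCircularBuffer` on the op, as the matmul changes above and below do, makes this lookup disappear entirely.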
diff --git a/ttnn/cpp/ttnn/operations/matmul/matmul.cpp b/ttnn/cpp/ttnn/operations/matmul/matmul.cpp
index f5d8060ff2b..2581dc706fd 100644
--- a/ttnn/cpp/ttnn/operations/matmul/matmul.cpp
+++ b/ttnn/cpp/ttnn/operations/matmul/matmul.cpp
@@ -118,7 +118,7 @@ Tensor MatmulOperation::invoke(
     const std::optional<const CoreGrid> core_grid,
     const std::optional<const tt::tt_metal::Tile>& output_tile,
     std::optional<Tensor> optional_output_tensor,
-    const std::optional<tt::tt_metal::DeviceGlobalCircularBuffer>& global_cb) {
+    const std::optional<tt::tt_metal::v1::experimental::GlobalCircularBuffer>& global_cb) {
     std::optional<CoreCoord> user_core_coord;
     if (core_grid.has_value()) {
         user_core_coord = CoreCoord(core_grid->x, core_grid->y);
@@ -160,7 +160,7 @@ Tensor LinearOperation::invoke(
     const std::optional<const CoreGrid> core_grid,
     const std::optional<const tt::tt_metal::Tile>& output_tile,
     std::optional<Tensor> optional_output_tensor,
-    const std::optional<tt::tt_metal::DeviceGlobalCircularBuffer>& global_cb) {
+    const std::optional<tt::tt_metal::v1::experimental::GlobalCircularBuffer>& global_cb) {
     std::optional<CoreCoord> user_core_coord;
     if (core_grid.has_value()) {
         user_core_coord = CoreCoord(core_grid->x, core_grid->y);
diff --git a/ttnn/cpp/ttnn/operations/matmul/matmul.hpp b/ttnn/cpp/ttnn/operations/matmul/matmul.hpp
index abf819d5e63..30d42648143 100644
--- a/ttnn/cpp/ttnn/operations/matmul/matmul.hpp
+++ b/ttnn/cpp/ttnn/operations/matmul/matmul.hpp
@@ -50,7 +50,7 @@ struct MatmulOperation {
         const std::optional<const CoreGrid> core_grid = std::nullopt,
         const std::optional<const tt::tt_metal::Tile>& output_tile = std::nullopt,
         std::optional<Tensor> optional_output_tensor = std::nullopt,
-        const std::optional<tt::tt_metal::DeviceGlobalCircularBuffer>& global_cb = std::nullopt);
+        const std::optional<tt::tt_metal::v1::experimental::GlobalCircularBuffer>& global_cb = std::nullopt);
 };
 
 struct LinearOperation {
@@ -68,7 +68,7 @@ struct LinearOperation {
         const std::optional<const CoreGrid> core_grid = std::nullopt,
         const std::optional<const tt::tt_metal::Tile>& output_tile = std::nullopt,
         std::optional<Tensor> optional_output_tensor = std::nullopt,
-        const std::optional<tt::tt_metal::DeviceGlobalCircularBuffer>& global_cb = std::nullopt);
+        const std::optional<tt::tt_metal::v1::experimental::GlobalCircularBuffer>& global_cb = std::nullopt);
 };
 
 }  // namespace matmul
diff --git a/ttnn/cpp/ttnn/operations/matmul/matmul_pybind.cpp b/ttnn/cpp/ttnn/operations/matmul/matmul_pybind.cpp
index 5dff12923c7..f94363aa0aa 100644
--- a/ttnn/cpp/ttnn/operations/matmul/matmul_pybind.cpp
+++ b/ttnn/cpp/ttnn/operations/matmul/matmul_pybind.cpp
@@ -346,7 +346,7 @@ void py_module(py::module& module) {
                const std::optional<const CoreGrid> core_grid,
                const std::optional<const tt::tt_metal::Tile>& output_tile,
                std::optional<ttnn::Tensor>& optional_output_tensor,
-               const std::optional<tt::tt_metal::DeviceGlobalCircularBuffer>& global_cb) -> ttnn::Tensor {
+               const std::optional<tt::tt_metal::v1::experimental::GlobalCircularBuffer>& global_cb) -> ttnn::Tensor {
                 return self(
                     input_tensor_a,
                     input_tensor_b,
@@ -430,7 +430,7 @@ void py_module(py::module& module) {
                const std::optional<const CoreGrid> core_grid,
                const std::optional<const tt::tt_metal::Tile>& output_tile,
                std::optional<ttnn::Tensor>& optional_output_tensor,
-               const std::optional<tt::tt_metal::DeviceGlobalCircularBuffer>& global_cb) -> ttnn::Tensor {
+               const std::optional<tt::tt_metal::v1::experimental::GlobalCircularBuffer>& global_cb) -> ttnn::Tensor {
                 return self(
                     input_tensor_a,
                     input_tensor_b,
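The binding changes above are type-only; the lambda still forwards the optional untouched. A reduced, self-contained sketch of that pass-through with stand-in types (the real lambda takes many more arguments, as the diff shows):

#include <optional>
#include <pybind11/pybind11.h>
#include <pybind11/stl.h>

namespace py = pybind11;

struct Tensor {};
struct GlobalCB {};  // stand-in for tt::tt_metal::v1::experimental::GlobalCircularBuffer

Tensor matmul_impl(const Tensor& a, const Tensor& b, const std::optional<GlobalCB>& global_cb) {
    // A real implementation hands global_cb to the program factory; the point
    // is that no per-device resolution happens at the binding layer anymore.
    return Tensor{};
}

PYBIND11_MODULE(example_matmul, m) {
    py::class_<Tensor>(m, "Tensor").def(py::init<>());
    py::class_<GlobalCB>(m, "GlobalCB").def(py::init<>());
    m.def(
        "matmul",
        [](const Tensor& a, const Tensor& b, const std::optional<GlobalCB>& global_cb) -> Tensor {
            return matmul_impl(a, b, global_cb);  // straight pass-through
        },
        py::arg("input_tensor_a"),
        py::arg("input_tensor_b"),
        py::kw_only(),
        py::arg("global_cb") = std::nullopt);
}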
diff --git a/ttnn/cpp/ttnn/operations/prefetcher/prefetcher/device/dram_prefetcher_op.cpp b/ttnn/cpp/ttnn/operations/prefetcher/prefetcher/device/dram_prefetcher_op.cpp
index b5adf303e8e..8ada061a34a 100644
--- a/ttnn/cpp/ttnn/operations/prefetcher/prefetcher/device/dram_prefetcher_op.cpp
+++ b/ttnn/cpp/ttnn/operations/prefetcher/prefetcher/device/dram_prefetcher_op.cpp
@@ -21,7 +21,7 @@ void DramPrefetcher::validate(const std::vector<Tensor>& input_tensors) const {
     TT_FATAL(global_cb.has_value(), "Global circular buffer must be provided");
     ttnn::Tensor tensor_addrs = input_tensors.back();  // Last tensor is tensor_addrs
 
-    auto global_cb = tt::tt_metal::get_global_circular_buffer(*this->global_cb, input_tensors[0].device()->id());
+    auto global_cb = *this->global_cb;
 
     uint32_t num_receiver_cores = global_cb.receiver_cores().num_cores();
     uint32_t num_sender_cores = global_cb.sender_cores().num_cores();
@@ -80,8 +80,7 @@ std::vector<TensorSpec> DramPrefetcher::compute_output_specs(const std::ve
 }
 tt::tt_metal::operation::ProgramWithCallbacks DramPrefetcher::create_program(
     const std::vector<Tensor>& input_tensors, std::vector<Tensor>& output_tensors) const {
-    auto global_cb = tt::tt_metal::get_global_circular_buffer(*this->global_cb, input_tensors[0].device()->id());
-    return dram_prefetcher_multi_core(input_tensors, this->num_layers, global_cb);
+    return dram_prefetcher_multi_core(input_tensors, this->num_layers, *this->global_cb);
 }
 
 }  // namespace ttnn::operations::dram_prefetcher
diff --git a/ttnn/cpp/ttnn/operations/prefetcher/prefetcher/device/dram_prefetcher_op.hpp b/ttnn/cpp/ttnn/operations/prefetcher/prefetcher/device/dram_prefetcher_op.hpp
index 1497f479828..1a28ca57409 100644
--- a/ttnn/cpp/ttnn/operations/prefetcher/prefetcher/device/dram_prefetcher_op.hpp
+++ b/ttnn/cpp/ttnn/operations/prefetcher/prefetcher/device/dram_prefetcher_op.hpp
@@ -9,7 +9,6 @@
 #include "ttnn/run_operation.hpp"
 #include "ttnn/tensor/tensor.hpp"
 #include "ttnn/operations/core/core.hpp"
-#include "ttnn/operations/global_cb_utils.hpp"
 
 #include <tt-metalium/global_circular_buffer_impl.hpp>
 #include <tt-metalium/global_circular_buffer.hpp>
@@ -23,7 +22,7 @@ tt::tt_metal::operation::ProgramWithCallbacks dram_prefetcher_multi_core(
     const tt::tt_metal::v1::experimental::GlobalCircularBuffer& global_cb);
 
 struct DramPrefetcher {
-    const std::optional<tt::tt_metal::DeviceGlobalCircularBuffer> global_cb;
+    const std::optional<tt::tt_metal::v1::experimental::GlobalCircularBuffer> global_cb;
     const uint32_t num_layers;
 
     void validate(const std::vector<Tensor>& input_tensors) const;
diff --git a/ttnn/cpp/ttnn/operations/prefetcher/prefetcher/dram_prefetcher.cpp b/ttnn/cpp/ttnn/operations/prefetcher/prefetcher/dram_prefetcher.cpp
index 09b018160fe..6505e557e28 100644
--- a/ttnn/cpp/ttnn/operations/prefetcher/prefetcher/dram_prefetcher.cpp
+++ b/ttnn/cpp/ttnn/operations/prefetcher/prefetcher/dram_prefetcher.cpp
@@ -14,7 +14,7 @@ namespace ttnn::operations::dram_prefetcher {
 Tensor ExecuteDramPrefetcher::invoke(
     std::vector<ttnn::Tensor>& tensors,
     const uint32_t num_layers,
-    const std::optional<tt::tt_metal::DeviceGlobalCircularBuffer>& global_cb) {
+    const std::optional<tt::tt_metal::v1::experimental::GlobalCircularBuffer>& global_cb) {
     std::vector<Tensor> output_tensors = {Tensor(tt::tt_metal::operation::get_workers_for_op_output(tensors))};
     tt::tt_metal::operation::launch_op(
         [num_layers, global_cb](
diff --git a/ttnn/cpp/ttnn/operations/prefetcher/prefetcher/dram_prefetcher.hpp b/ttnn/cpp/ttnn/operations/prefetcher/prefetcher/dram_prefetcher.hpp
index 4144fe64abc..3e829a7c997 100644
--- a/ttnn/cpp/ttnn/operations/prefetcher/prefetcher/dram_prefetcher.hpp
+++ b/ttnn/cpp/ttnn/operations/prefetcher/prefetcher/dram_prefetcher.hpp
@@ -17,7 +17,7 @@ struct ExecuteDramPrefetcher {
     static ttnn::Tensor invoke(
         std::vector<ttnn::Tensor>& tensors,
         const uint32_t num_layers,
-        const std::optional<tt::tt_metal::DeviceGlobalCircularBuffer>& global_cb);
+        const std::optional<tt::tt_metal::v1::experimental::GlobalCircularBuffer>& global_cb);
 };
 
 }  // namespace operations::dram_prefetcher
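Taken together, the post-change flow on a mesh reduces to one buffer creation plus a direct hand-off to the op. A hedged end-to-end sketch: the two calls and their signatures come from this diff, while the include paths, the unqualified MeshDevice handle, the core mapping, and the sizes are assumptions for illustration:

#include "ttnn/global_circular_buffer.hpp"
#include "ttnn/operations/prefetcher/prefetcher/dram_prefetcher.hpp"

ttnn::Tensor prefetch_on_mesh(MeshDevice* mesh_device, std::vector<ttnn::Tensor>& tensors) {
    // Illustrative mapping; the real tests derive senders/receivers from the
    // DRAM bank layout and the requested number of reader cores.
    std::vector<std::pair<CoreCoord, CoreRangeSet>> mapping = {
        {CoreCoord(0, 0), CoreRangeSet(CoreRange(CoreCoord(1, 0), CoreCoord(1, 0)))},
    };

    // One call now covers every device in the mesh and yields a single handle.
    auto global_cb = ttnn::global_circular_buffer::create_global_circular_buffer(
        mesh_device, mapping, /*size=*/100352, tt::tt_metal::BufferType::L1);

    // The op consumes the handle directly; the per-device lookup that the
    // deleted global_cb_utils.hpp performed is gone.
    return ttnn::operations::dram_prefetcher::ExecuteDramPrefetcher::invoke(
        tensors, /*num_layers=*/1, global_cb);
}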