Change determine_parallel_config logic #18601

Open · wants to merge 3 commits into base: main
@@ -63,7 +63,6 @@ def get_conv_input_memory_config(
compute_grid_size=compute_grid,
block_shard_orientation=ttnn.ShardOrientation.ROW_MAJOR,
enable_channels_padding=True,
is_out_tiled=True,
)

if override_num_cores:
@@ -97,7 +97,6 @@ def __init__(
compute_grid_size=self.device.compute_with_storage_grid_size(),
block_shard_orientation=ttnn.ShardOrientation.ROW_MAJOR,
enable_channels_padding=False,
is_out_tiled=True,
),
tile_size=32,
)
@@ -129,7 +129,6 @@ def __init__(
compute_grid_size=self.device.compute_with_storage_grid_size(),
block_shard_orientation=ttnn.ShardOrientation.ROW_MAJOR,
enable_channels_padding=False,
is_out_tiled=True,
),
tile_size=32,
)
@@ -207,7 +206,6 @@ def __init__(
compute_grid_size=self.device.compute_with_storage_grid_size(),
block_shard_orientation=ttnn.ShardOrientation.ROW_MAJOR,
enable_channels_padding=False,
is_out_tiled=True,
),
tile_size=32,
)
@@ -191,7 +191,8 @@
compute_grid_size=device.compute_with_storage_grid_size(),
block_shard_orientation=ttnn.ShardOrientation.ROW_MAJOR,
enable_channels_padding=False,
is_out_tiled=False,
is_shard_height_tile_multiple=False,
is_shard_width_tile_multiple=False,
)
sharded_memory_config = ttnn._ttnn.operations.conv.create_sharded_memory_config_from_parallel_config(
tensor_shape=ttact_device.shape,
@@ -452,7 +453,7 @@
use_program_cache,
ceil_mode,
):
run_max_pool(

Check failure on line 456 in tests/ttnn/nightly/unit_tests/operations/max_pool2d/test_maxpool2d.py
GitHub Actions / ttnn nightly tests

test_run_max_pool_width_shard[ceil_mode=False-dtype=DataType.BFLOAT16-dilation=(1, 1)-stride=(1, 1)-padding=(4, 4)-kernel_size=(9, 9)-act_shape=[4, 1024, 40, 40]-device_params={'l1_small_size': 24576}]

RuntimeError: TT_THROW @ /work/tt_metal/impl/allocator/bank_manager.cpp:132: tt::exception
info: Out of Memory: Not enough space to allocate 13107200 B L1 buffer across 32 banks, where each bank needs to store 409600 B
backtrace (abridged; raw library offsets omitted):
--- tt::tt_metal::BankManager::allocate_buffer(unsigned long, unsigned long, bool, CoreRangeSet const&, std::__1::optional<unsigned int>)
--- tt::tt_metal::Allocator::allocate_buffer(tt::tt_metal::v0::Buffer*)
--- tt::tt_metal::v0::Buffer::allocate_impl()
--- tt::tt_metal::v0::Buffer::create(tt::tt_metal::v0::IDevice*, unsigned long, unsigned long, tt::tt_metal::BufferType, tt::tt_metal::TensorMemoryLayout, std::__1::optional<tt::tt_metal::ShardSpecBuffer> const&, std::__1::optional<bool>, std::__1::optional<tt::stl::StrongType<unsigned char, tt::tt_metal::SubDeviceIdTag>>)
--- tt::tt_metal::tensor_impl::allocate_buffer_on_device(tt::tt_metal::v0::IDevice*, tt::tt_metal::TensorSpec const&)
--- tt::tt_metal::create_device_tensor(tt::tt_metal::TensorSpec const&, tt::tt_metal::v0::IDevice*)
--- ttnn::operations::pool::Pool2D::create_output_tensors(ttnn::operations::pool::Pool2D::operation_attributes_t const&, ttnn::operations::pool::Pool2D::tensor_args_t const&)
--- ttnn::device_operation::detail::launch_on_single_device<ttnn::operations::pool::Pool2D>(...)
--- ttnn::device_operation::detail::invoke<ttnn::operations::pool::Pool2D>(...)
--- ttnn::operations::pool::Pool2DOp<(ttnn::operations::pool::Pool2DType)0>::invoke(...)
--- tt::tt_metal::operation::launch_op_func<std::__1::vector<tt::tt_metal::Tensor>>(...) (truncated)
act_shape,
kernel_size,
padding,
@@ -834,7 +835,8 @@
compute_grid_size=device.compute_with_storage_grid_size(),
block_shard_orientation=ttnn.ShardOrientation.ROW_MAJOR,
enable_channels_padding=False,
is_out_tiled=True,
is_shard_height_tile_multiple=True,
is_shard_width_tile_multiple=True,
)
sharded_memory_config = ttnn._ttnn.operations.conv.create_sharded_memory_config_from_parallel_config(
tensor_shape=ttact_device.shape,
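For context on the pooling test changes above, the sketch below shows the shape of such a call end to end. It is only an illustration: it assumes the binding is exposed as ttnn._ttnn.operations.conv.determine_parallel_config next to create_sharded_memory_config_from_parallel_config, and it guesses the keyword names that are hidden by the truncated diff (input_channels, output_height, output_width, output_channels, parallel_config). It is not a verbatim excerpt of the test.

# Hedged sketch: how a max-pool test might build a sharded memory config with the
# two new flags replacing is_out_tiled. Kwarg names marked "assumed" are guesses
# based on the C++ signature and may differ from the actual binding.
import ttnn

def pool_sharded_memory_config(device, act_tensor, in_n, in_c, out_h, out_w):
    parallel_config = ttnn._ttnn.operations.conv.determine_parallel_config(
        shard_layout=ttnn.TensorMemoryLayout.HEIGHT_SHARDED,
        batch_size=in_n,
        input_channels=in_c,      # assumed kwarg name
        output_height=out_h,      # assumed kwarg name
        output_width=out_w,       # assumed kwarg name
        output_channels=in_c,     # pooling keeps the channel count; assumed kwarg name
        compute_grid_size=device.compute_with_storage_grid_size(),
        block_shard_orientation=ttnn.ShardOrientation.ROW_MAJOR,
        enable_channels_padding=False,
        # Pool output is row-major: the shard height need not be a tile multiple
        # and the shard width may end up as half a tile.
        is_shard_height_tile_multiple=False,
        is_shard_width_tile_multiple=False,
    )
    return ttnn._ttnn.operations.conv.create_sharded_memory_config_from_parallel_config(
        tensor_shape=act_tensor.shape,
        parallel_config=parallel_config,  # assumed kwarg name
        tile_size=32,
    )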
9 changes: 6 additions & 3 deletions ttnn/cpp/ttnn/operations/conv/conv2d/conv2d_pybind.cpp
@@ -284,7 +284,8 @@ void py_bind_conv2d(py::module& module) {
const CoreCoord& compute_grid_size,
tt::tt_metal::ShardOrientation block_shard_orientation,
bool enable_channels_padding,
bool is_out_tiled) -> ttnn::operations::sliding_window::ParallelConfig {
bool is_shard_height_tile_multiple,
bool is_shard_width_tile_multiple) -> ttnn::operations::sliding_window::ParallelConfig {
return determine_parallel_config(
shard_layout,
batch_size,
@@ -295,7 +296,8 @@
compute_grid_size,
block_shard_orientation,
enable_channels_padding,
is_out_tiled);
is_shard_height_tile_multiple,
is_shard_width_tile_multiple);
},
py::arg("shard_layout"),
py::arg("batch_size"),
@@ -306,7 +308,8 @@
py::arg("compute_grid_size"),
py::arg("block_shard_orientation"),
py::arg("enable_channels_padding"),
py::arg("is_out_tiled") = true);
py::arg("is_shard_height_tile_multiple") = true,
Contributor: Is there a way to eliminate these args? My only concern is that you are relying on the user to do the right thing; it is not too bad, but if it can be avoided, please do so.

Contributor (Author): determine_parallel_config is the same function we use in our ops to choose which cores an op will execute on, so I guess these cannot be avoided, but the args are optional, so the user can skip them.

Contributor: Not the function, the args. And yes, the user can omit them in the tile-multiple case, but for a non-tile-multiple shard height and width the user will have to calculate explicitly, right (or am I missing something)?

Contributor (Author): I think there is no intuitive way to avoid them: if we just call this function, we don't know whether it is being used for conv or for pool, and it will behave differently depending on the type of op. A more intuitive parameter might be the op type, but we don't know whether, for example, some pool operations will work on whole tiles and some won't.

py::arg("is_shard_width_tile_multiple") = true);

module.def(
"create_sharded_memory_config_from_parallel_config",
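As a usage note on the review thread above: both new keyword arguments default to true, so conv-style callers can simply leave them out, and only pooling call sites override them. A minimal sketch, again assuming the binding name ttnn._ttnn.operations.conv.determine_parallel_config and passing the remaining arguments through unchanged:

# Hedged sketch of the two call styles enabled by the defaults; not library code.
import ttnn

def conv_parallel_config(**kwargs):
    # Conv: omit both flags and rely on the defaults (True, True), i.e. shard
    # height and width are tile multiples.
    return ttnn._ttnn.operations.conv.determine_parallel_config(**kwargs)

def pool_parallel_config(**kwargs):
    # Pool: override both flags, allowing any shard height and a half-tile width.
    return ttnn._ttnn.operations.conv.determine_parallel_config(
        is_shard_height_tile_multiple=False,
        is_shard_width_tile_multiple=False,
        **kwargs,
    )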
32 changes: 25 additions & 7 deletions ttnn/cpp/ttnn/operations/conv/conv2d/conv2d_utils.cpp
@@ -81,6 +81,16 @@ uint32_t find_closest_largest_divisor_with_num_padding(uint32_t num1, uint32_t n
return divisor;
}

// If the shard width is exactly one tile, half-tile shard width is allowed, and there are enough cores,
// double the number of cores so that each shard ends up half a tile wide
static void set_shard_width_to_half_tile_if_possible(
uint32_t& num_cores, uint32_t channels_ntiles, uint32_t max_num_cores, bool width_shard_half_tile_possible) {
if (width_shard_half_tile_possible && (div_up(channels_ntiles, num_cores) == 1) &&
(2 * num_cores <= max_num_cores)) {
num_cores *= 2;
}
}

ParallelConfig determine_parallel_config(
const TensorMemoryLayout shard_layout,
uint32_t batch_size,
@@ -91,12 +101,15 @@ ParallelConfig determine_parallel_config(
const CoreCoord& compute_grid_size,
ShardOrientation block_shard_orientation,
bool enable_channels_padding,
bool is_out_tiled,
bool is_shard_height_tile_multiple,
bool is_shard_width_tile_multiple,
uint32_t act_block_h_override) {
uint32_t effective_tile_height = is_out_tiled ? tt::constants::TILE_HEIGHT : 1;
uint32_t effective_tile_width = is_out_tiled ? tt::constants::TILE_WIDTH : 1;
uint32_t out_nhw_ntiles =
tt::round_up(batch_size * output_height * output_width, tt::constants::TILE_HEIGHT) / effective_tile_height;
// Currently, convolution requires multiples of the tile size for both shard height and width,
// while pooling can accept any height and either a tile multiple or half a tile for width.
// This approach needs to be modified when other shard dimensions are supported.
uint32_t effective_tile_height = is_shard_height_tile_multiple ? tt::constants::TILE_HEIGHT : 1;
uint32_t effective_tile_width = tt::constants::TILE_WIDTH;
uint32_t out_nhw_ntiles = tt::div_up(batch_size * output_height * output_width, effective_tile_height);
uint32_t input_channles_ntiles = tt::div_up(input_channels, effective_tile_width);
uint32_t out_channels_ntiles = tt::div_up(output_channels, effective_tile_width);
// In case non native activation block height is used, we need to ensure that the amount
@@ -123,6 +136,8 @@
? find_closest_largest_divisor_with_num_padding(
out_channels_ntiles, input_channles_ntiles, start_divisor_c)
: find_closest_largest_divisor(out_channels_ntiles, input_channles_ntiles, start_divisor_c);
set_shard_width_to_half_tile_if_possible(
Contributor: Why half a tile? I feel this will eliminate the non-tile-multiple shard width scenario in conv in some cases. You can refer to this. Please make sure that we do not bypass that.

Contributor (Author): I think that feature was removed in #17937, so we expect tile multiples for conv.

Contributor: Ohh, I did not know that :|

num_cores_c, input_channles_ntiles, start_divisor_c, !is_shard_width_tile_multiple);
uint32_t cores_x = block_shard_orientation == ShardOrientation::COL_MAJOR ? num_cores_nhw : num_cores_c;
uint32_t cores_y = block_shard_orientation == ShardOrientation::COL_MAJOR ? num_cores_c : num_cores_nhw;
CoreRange core_range = CoreRange(CoreCoord({0, 0}), CoreCoord({cores_x - 1, cores_y - 1}));
@@ -131,6 +146,8 @@
uint32_t num_cores_c = enable_channels_padding
? find_closest_largest_divisor_with_num_padding(input_channles_ntiles, max_num_cores)
: find_closest_largest_divisor(input_channles_ntiles, max_num_cores);
set_shard_width_to_half_tile_if_possible(
num_cores_c, input_channles_ntiles, max_num_cores, !is_shard_width_tile_multiple);
grid = tt::tt_metal::num_cores_to_corerangeset(num_cores_c, compute_grid_size, true);
} else {
TT_THROW("Conv2d supports Height, Block or Width Sharded Layouts but got {}", shard_layout);
@@ -475,6 +492,7 @@ static std::tuple<ttnn::Shape, ttnn::MemoryConfig, bool> get_conv_padded_input_s
block_shard_orientation,
!is_mm_conv,
true,
true,
conv_config.act_block_h_override);

if (conv_config.override_sharding_config) {
@@ -703,7 +721,6 @@ Conv2dConfig determine_conv_config_for_auto_shard(

ShardOrientation shard_orientation =
conv_config.transpose_shards ? ShardOrientation::COL_MAJOR : ShardOrientation::ROW_MAJOR;
const bool is_out_tiled = conv_config.output_layout == Layout::TILE;
Contributor: This condition also seems more intuitive to the user (we usually have output layouts defined). In the latest version, the user will have to find the shard shape and then work out whether it is a tile multiple or not. Well, I know I am not solving anything here, just putting my thoughts down so you can consider all aspects before mainlining.

Contributor (Author): For conv, the shard shape is always a tile multiple in both dimensions.


struct core_count_and_size {
uint32_t core_count;
@@ -753,7 +770,8 @@ Conv2dConfig determine_conv_config_for_auto_shard(
compute_grid_size,
shard_orientation,
!is_mm_conv,
is_out_tiled,
true,
true,
conv_config.act_block_h_override);

const ParallelConfig output_parallel_config = determine_output_parallel_config(
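To make the new selection rule above easier to reason about, here is a pure-Python re-expression of set_shard_width_to_half_tile_if_possible with a worked example. It assumes TILE_WIDTH = 32 (as in tt::constants) and simplifies the divisor search; treat it as an illustrative sketch of the C++ helper, not the shipped implementation.

# Illustrative sketch of the half-tile shard width rule added in conv2d_utils.cpp.
TILE_WIDTH = 32

def div_up(a, b):
    return -(-a // b)

def set_shard_width_to_half_tile_if_possible(num_cores_c, channels_ntiles, max_num_cores,
                                             width_shard_half_tile_possible):
    # If each core already holds exactly one channel tile, half-tile shard width is
    # allowed, and the grid has room, double the channel core count so every shard
    # becomes half a tile wide.
    if (width_shard_half_tile_possible
            and div_up(channels_ntiles, num_cores_c) == 1
            and 2 * num_cores_c <= max_num_cores):
        return 2 * num_cores_c
    return num_cores_c

# Worked example: a width-sharded pool op on a 64-core grid with 512 input channels.
channels_ntiles = div_up(512, TILE_WIDTH)   # 16 channel tiles
num_cores_c = 16                            # simplified: one tile per core
# is_shard_width_tile_multiple=False (pool) doubles the cores to get half-tile shards.
print(set_shard_width_to_half_tile_if_possible(num_cores_c, channels_ntiles, 64, True))   # 32
print(set_shard_width_to_half_tile_if_possible(num_cores_c, channels_ntiles, 64, False))  # 16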
3 changes: 2 additions & 1 deletion ttnn/cpp/ttnn/operations/conv/conv2d/conv2d_utils.hpp
@@ -47,7 +47,8 @@ sliding_window::ParallelConfig determine_parallel_config(
const CoreCoord& compute_grid_size,
tt::tt_metal::ShardOrientation block_shard_orientation,
bool enable_channels_padding,
bool is_out_tiled = true,
bool is_shard_height_tile_multiple = true,
bool is_shard_width_tile_multiple = true,
uint32_t act_block_h_override = 0);

sliding_window::ParallelConfig determine_output_parallel_config(
@@ -599,6 +599,7 @@ static OptimizedConvBlockConfig get_opt_block_config(
shard_orientation,
!mm_conv,
true,
true,
conv_config.act_block_h_override);
}
auto output_parallel_config = parallel_config;
@@ -839,6 +840,7 @@ ttnn::Tensor prepare_conv_weights(
shard_orientation,
!mm_conv,
true,
true,
conv_config.act_block_h_override);
}

@@ -941,6 +943,7 @@ ttnn::Tensor prepare_conv_bias(
shard_orientation,
!mm_conv,
true,
true,
conv_config.act_block_h_override);
}
