Changing determining parallel config logic #18601
base: main
@@ -81,6 +81,16 @@ uint32_t find_closest_largest_divisor_with_num_padding(uint32_t num1, uint32_t n
     return divisor;
 }
 
+// If shard width is tile width, and it is allowed to have half tile shard width, and we have enough cores to do it,
+// double number of cores
+static void set_shard_width_to_half_tile_if_possible(
+    uint32_t& num_cores, uint32_t channels_ntiles, uint32_t max_num_cores, bool width_shard_half_tile_possible) {
+    if (width_shard_half_tile_possible && (div_up(channels_ntiles, num_cores) == 1) &&
+        (2 * num_cores <= max_num_cores)) {
+        num_cores *= 2;
+    }
+}
+
 ParallelConfig determine_parallel_config(
     const TensorMemoryLayout shard_layout,
     uint32_t batch_size,
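For readers skimming the diff, here is a minimal standalone sketch of how the new helper behaves. The div_up here is assumed to be the usual ceiling division (mirroring tt::div_up), and the main driver and its values are hypothetical, not part of the PR:

#include <cstdint>
#include <iostream>

// Assumed ceiling-division helper, standing in for the div_up used in the diff.
static uint32_t div_up(uint32_t a, uint32_t b) { return (a + b - 1) / b; }

// Copy of the helper from the diff: when each core currently holds exactly one
// tile of width and half-tile shard width is allowed, split the work across
// twice as many cores, provided the grid has room.
static void set_shard_width_to_half_tile_if_possible(
    uint32_t& num_cores, uint32_t channels_ntiles, uint32_t max_num_cores, bool width_shard_half_tile_possible) {
    if (width_shard_half_tile_possible && (div_up(channels_ntiles, num_cores) == 1) &&
        (2 * num_cores <= max_num_cores)) {
        num_cores *= 2;
    }
}

int main() {
    uint32_t num_cores = 4;        // each core currently gets one tile of channel width
    uint32_t channels_ntiles = 4;  // four tiles of channels in total
    uint32_t max_num_cores = 64;   // hypothetical 8x8 compute grid
    set_shard_width_to_half_tile_if_possible(num_cores, channels_ntiles, max_num_cores, /*half tile allowed=*/true);
    std::cout << num_cores << "\n"; // prints 8: each core now holds half a tile of width
}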
@@ -91,12 +101,15 @@ ParallelConfig determine_parallel_config(
     const CoreCoord& compute_grid_size,
     ShardOrientation block_shard_orientation,
     bool enable_channels_padding,
-    bool is_out_tiled,
+    bool is_shard_height_tile_multiple,
+    bool is_shard_width_tile_multiple,
     uint32_t act_block_h_override) {
-    uint32_t effective_tile_height = is_out_tiled ? tt::constants::TILE_HEIGHT : 1;
-    uint32_t effective_tile_width = is_out_tiled ? tt::constants::TILE_WIDTH : 1;
-    uint32_t out_nhw_ntiles =
-        tt::round_up(batch_size * output_height * output_width, tt::constants::TILE_HEIGHT) / effective_tile_height;
+    // Currently, convolution requires multiples of the tile size for both shard height and width,
+    // while pooling can accept any height and either a tile multiple or half a tile for width.
+    // This approach needs to be modified when other shard dimensions are supported.
+    uint32_t effective_tile_height = is_shard_height_tile_multiple ? tt::constants::TILE_HEIGHT : 1;
+    uint32_t effective_tile_width = tt::constants::TILE_WIDTH;
+    uint32_t out_nhw_ntiles = tt::div_up(batch_size * output_height * output_width, effective_tile_height);
     uint32_t input_channles_ntiles = tt::div_up(input_channels, effective_tile_width);
     uint32_t out_channels_ntiles = tt::div_up(output_channels, effective_tile_width);
     // In case non native activation block height is used, we need to ensure that the amount
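To make the effective-tile arithmetic concrete, here is a hedged sketch assuming the usual 32x32 tile size (standing in for tt::constants::TILE_HEIGHT/TILE_WIDTH) and a hypothetical batch-1, 28x28, 64-channel output:

#include <cstdint>
#include <iostream>

// Assumed tile dimensions; the real values come from tt::constants.
constexpr uint32_t TILE_HEIGHT = 32;
constexpr uint32_t TILE_WIDTH = 32;

static uint32_t div_up(uint32_t a, uint32_t b) { return (a + b - 1) / b; }

int main() {
    uint32_t batch_size = 1, output_height = 28, output_width = 28, input_channels = 64;

    // Conv-style: shard height must be a tile multiple, so count whole tiles of NHW.
    uint32_t nhw_ntiles_tiled = div_up(batch_size * output_height * output_width, TILE_HEIGHT); // 25
    // Pool-style: any shard height is allowed, so the effective tile height is 1 (count rows).
    uint32_t nhw_ntiles_rows = div_up(batch_size * output_height * output_width, 1u);           // 784

    // Channel tiles are counted against the full tile width in both cases.
    uint32_t channels_ntiles = div_up(input_channels, TILE_WIDTH);                               // 2

    std::cout << nhw_ntiles_tiled << " " << nhw_ntiles_rows << " " << channels_ntiles << "\n";
}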
@@ -123,6 +136,8 @@ ParallelConfig determine_parallel_config(
                   ? find_closest_largest_divisor_with_num_padding(
                         out_channels_ntiles, input_channles_ntiles, start_divisor_c)
                   : find_closest_largest_divisor(out_channels_ntiles, input_channles_ntiles, start_divisor_c);
+        set_shard_width_to_half_tile_if_possible(
+            num_cores_c, input_channles_ntiles, start_divisor_c, !is_shard_width_tile_multiple);
         uint32_t cores_x = block_shard_orientation == ShardOrientation::COL_MAJOR ? num_cores_nhw : num_cores_c;
         uint32_t cores_y = block_shard_orientation == ShardOrientation::COL_MAJOR ? num_cores_c : num_cores_nhw;
         CoreRange core_range = CoreRange(CoreCoord({0, 0}), CoreCoord({cores_x - 1, cores_y - 1}));

Review thread on the added set_shard_width_to_half_tile_if_possible call:
- why half tile??
- I think that feature was removed in #17937, so we expect tile multiples for conv.
- ohh.. I did not know that :|
@@ -131,6 +146,8 @@ ParallelConfig determine_parallel_config(
         uint32_t num_cores_c = enable_channels_padding
                                    ? find_closest_largest_divisor_with_num_padding(input_channles_ntiles, max_num_cores)
                                    : find_closest_largest_divisor(input_channles_ntiles, max_num_cores);
+        set_shard_width_to_half_tile_if_possible(
+            num_cores_c, input_channles_ntiles, max_num_cores, !is_shard_width_tile_multiple);
         grid = tt::tt_metal::num_cores_to_corerangeset(num_cores_c, compute_grid_size, true);
     } else {
         TT_THROW("Conv2d supports Height, Block or Width Sharded Layouts but got {}", shard_layout);
@@ -475,6 +492,7 @@ static std::tuple<ttnn::Shape, ttnn::MemoryConfig, bool> get_conv_padded_input_s
         block_shard_orientation,
         !is_mm_conv,
         true,
+        true,
         conv_config.act_block_h_override);
 
     if (conv_config.override_sharding_config) {
@@ -703,7 +721,6 @@ Conv2dConfig determine_conv_config_for_auto_shard(
 
     ShardOrientation shard_orientation =
         conv_config.transpose_shards ? ShardOrientation::COL_MAJOR : ShardOrientation::ROW_MAJOR;
-    const bool is_out_tiled = conv_config.output_layout == Layout::TILE;
 
     struct core_count_and_size {
         uint32_t core_count;

Review thread on the removed is_out_tiled check:
- This condition seems more intuitive to the user (we usually have output layouts defined). With the latest change, the user has to find the shard shape and then work out whether it is a tile multiple or not. I know I am not solving anything here, just putting my thoughts down so you can consider all aspects before mainlining.
- For conv, the shard shape is always a tile multiple in both dimensions.
@@ -753,7 +770,8 @@ Conv2dConfig determine_conv_config_for_auto_shard(
         compute_grid_size,
         shard_orientation,
         !is_mm_conv,
-        is_out_tiled,
+        true,
+        true,
         conv_config.act_block_h_override);
 
     const ParallelConfig output_parallel_config = determine_output_parallel_config(
General review thread on the new is_shard_height_tile_multiple / is_shard_width_tile_multiple arguments:
- Is there a way to eliminate these args? My only concern is that you are relying on the user to do the right thing; it is not too bad, but if it can be avoided, please do so.
- determine_parallel_config is the same function we use in our ops to choose which cores an op runs on, so I think these cannot be avoided; but the args are optional, so the user can skip them.
- Not the function, the args. Yes, the user can skip them in the tile-multiple case, but with a non-tile-multiple shard height or width the user will have to calculate them explicitly, right (or am I missing something)?
- I think there is no intuitive way to avoid them: if we just call this function, we don't know whether it is used for conv or for pool, and it will behave differently depending on the type of op. A more intuitive parameter might be the op type, but we don't know whether, for example, some pool operations will work on whole tiles and some won't.
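One way to read the "type of op" idea floated above, purely as a hedged sketch: the enum and wrapper below are hypothetical and not part of this PR. It derives the two flags the way the updated conv call sites do, and the way the thread describes pooling:

#include <cstdio>

// Hypothetical op-kind switch discussed in the thread: instead of asking callers to
// pass the two tile-multiple flags, a wrapper could derive them per op type.
enum class OpKind { Conv2d, Pool2d };

struct ShardTileFlags {
    bool is_shard_height_tile_multiple;
    bool is_shard_width_tile_multiple;
};

// Conv always shards in whole tiles in both dimensions (per the reply above);
// pooling may use any shard height and a half-tile shard width.
static ShardTileFlags shard_flags_for(OpKind kind) {
    return kind == OpKind::Conv2d ? ShardTileFlags{true, true} : ShardTileFlags{false, false};
}

int main() {
    ShardTileFlags conv = shard_flags_for(OpKind::Conv2d);
    ShardTileFlags pool = shard_flags_for(OpKind::Pool2d);
    std::printf("conv: %d %d, pool: %d %d\n",
                conv.is_shard_height_tile_multiple, conv.is_shard_width_tile_multiple,
                pool.is_shard_height_tile_multiple, pool.is_shard_width_tile_multiple);
}

As the thread notes, this only helps if every pool op really does accept non-tile-multiple shards, which is exactly the open question.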