Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

#18250: Reverting changes from CFGSHIFTMASK PR #21

Merged
merged 1 commit into from
Feb 25, 2025
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
52 changes: 33 additions & 19 deletions tt_llk_blackhole/llk_lib/llk_unpack_AB_matmul.h
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ inline void _llk_unpack_AB_matmul_mop_config_(const bool transpose, const std::u
// in1/inB - loaded to SrcA

const bool reuse_a = ct_dim >= rt_dim;
const std::uint32_t replay_buf_prog_len = (reuse_a && unpA_partial_face) ? 12 : ((!reuse_a && unpB_partial_face) ? 12 : 6);
const std::uint32_t replay_buf_prog_len = (reuse_a && unpA_partial_face) ? 18 : ((!reuse_a && unpB_partial_face) ? 18 : 12);
const std::uint32_t replay_buf_run_len = replay_buf_prog_len/2;

if (reuse_a) {
Expand All @@ -43,9 +43,14 @@ inline void _llk_unpack_AB_matmul_mop_config_(const bool transpose, const std::u
}
if constexpr (kernel_broadcast_b==1) {
TTI_NOP;
TTI_NOP;
TTI_NOP;
TTI_NOP;
} else {
// THCON_SEC0_REG3_Base_address_ADDR32 = THCON_SEC0_REG3_Base_address_ADDR32 + SCRATCH_SEC0_val_ADDR32
TTI_CFGSHIFTMASK(1, 0b011, 32 - 1, 0, 0b11, THCON_SEC0_REG3_Base_address_ADDR32);
TTI_RDCFG(p_gpr_unpack::TMP0, THCON_SEC0_REG3_Base_address_ADDR32);
TTI_ADDDMAREG(0, p_gpr_unpack::TMP0, p_gpr_unpack::TMP0, p_gpr_unpack::TILE_SIZE_A);
TTI_STALLWAIT(p_stall::STALL_CFG, p_stall::THCON);
TTI_WRCFG(p_gpr_unpack::TMP0,0,THCON_SEC0_REG3_Base_address_ADDR32);
}
// Added to ensure WRCFG instruction has finished, since it takes 2 cycles.
TTI_NOP;
Expand All @@ -60,9 +65,14 @@ inline void _llk_unpack_AB_matmul_mop_config_(const bool transpose, const std::u
}
if constexpr (kernel_broadcast_b==1) {
TTI_NOP;
TTI_NOP;
TTI_NOP;
TTI_NOP;
} else {
// THCON_SEC0_REG3_Base_cntx1_address_ADDR32 = THCON_SEC0_REG3_Base_cntx1_address_ADDR32 + SCRATCH_SEC0_val_ADDR32
TTI_CFGSHIFTMASK(1, 0b011, 32 - 1, 0, 0b11, THCON_SEC0_REG3_Base_cntx1_address_ADDR32);
TTI_RDCFG(p_gpr_unpack::TMP0, THCON_SEC0_REG3_Base_cntx1_address_ADDR32);
TTI_ADDDMAREG(0, p_gpr_unpack::TMP0, p_gpr_unpack::TMP0, p_gpr_unpack::TILE_SIZE_A);
TTI_STALLWAIT(p_stall::STALL_CFG, p_stall::THCON);
TTI_WRCFG(p_gpr_unpack::TMP0,0,THCON_SEC0_REG3_Base_cntx1_address_ADDR32);
}
// Added to ensure WRCFG instruction has finished, since it takes 2 cycles.
TTI_NOP;
Expand All @@ -89,9 +99,14 @@ inline void _llk_unpack_AB_matmul_mop_config_(const bool transpose, const std::u
}
if constexpr (kernel_broadcast_a==1) {
TTI_NOP;
TTI_NOP;
TTI_NOP;
TTI_NOP;
} else {
// THCON_SEC1_REG3_Base_address_ADDR32 = THCON_SEC1_REG3_Base_address_ADDR32 + SCRATCH_SEC0_val_ADDR32
TTI_CFGSHIFTMASK(1, 0b011, 32 - 1, 0, 0b11, THCON_SEC1_REG3_Base_address_ADDR32);
TTI_RDCFG(p_gpr_unpack::TMP0, THCON_SEC1_REG3_Base_address_ADDR32);
TTI_ADDDMAREG(0, p_gpr_unpack::TMP0, p_gpr_unpack::TMP0, p_gpr_unpack::TMP_LO);
TTI_STALLWAIT(p_stall::STALL_CFG, p_stall::THCON);
TTI_WRCFG(p_gpr_unpack::TMP0,0,THCON_SEC1_REG3_Base_address_ADDR32);
}
// Added to ensure WRCFG instruction has finished, since it takes 2 cycles.
TTI_NOP;
Expand All @@ -106,9 +121,14 @@ inline void _llk_unpack_AB_matmul_mop_config_(const bool transpose, const std::u
}
if constexpr (kernel_broadcast_a==1) {
TTI_NOP;
TTI_NOP;
TTI_NOP;
TTI_NOP;
} else {
// THCON_SEC1_REG3_Base_cntx1_address_ADDR32 = THCON_SEC1_REG3_Base_cntx1_address_ADDR32 + SCRATCH_SEC0_val_ADDR32
TTI_CFGSHIFTMASK(1, 0b011, 32 - 1, 0, 0b11, THCON_SEC1_REG3_Base_cntx1_address_ADDR32);
TTI_RDCFG(p_gpr_unpack::TMP0, THCON_SEC1_REG3_Base_cntx1_address_ADDR32);
TTI_ADDDMAREG(0, p_gpr_unpack::TMP0, p_gpr_unpack::TMP0, p_gpr_unpack::TMP_LO);
TTI_STALLWAIT(p_stall::STALL_CFG, p_stall::THCON);
TTI_WRCFG(p_gpr_unpack::TMP0,0,THCON_SEC1_REG3_Base_cntx1_address_ADDR32);
}
// Added to ensure WRCFG instruction has finished, since it takes 2 cycles.
TTI_NOP;
Expand Down Expand Up @@ -199,16 +219,6 @@ __attribute__((always_inline)) inline void _llk_unpack_AB_matmul_init_(const std

TT_SETDMAREG(0, LOWER_HALFWORD(kt_dim), 0, LO_16(p_gpr_unpack::KT_DIM)); // store kt_dim to gpr for scaling tile size

// Write to scratch cfg register L1 address increment
if (reuse_a) {
TTI_WRCFG(p_gpr_unpack::TILE_SIZE_A, 0, SCRATCH_SEC0_val_ADDR32);
} else {
TTI_MULDMAREG(0, p_gpr_unpack::TMP_LO, p_gpr_unpack::TILE_SIZE_B, p_gpr_unpack::KT_DIM);
TTI_STALLWAIT(p_stall::STALL_CFG, p_stall::THCON);
TTI_WRCFG(p_gpr_unpack::TMP_LO, 0, SCRATCH_SEC0_val_ADDR32);
}
TTI_NOP;

_llk_unpack_AB_matmul_mop_config_<kernel_broadcast_a, kernel_broadcast_b>(transpose != 0, ct_dim, rt_dim, kt_dim, unpA_partial_face, unpB_partial_face);
}

Expand All @@ -224,6 +234,10 @@ inline void _llk_unpack_AB_matmul_(
const bool reuse_a = ct_dim >= rt_dim;
const std::uint32_t t_dim = reuse_a ? rt_dim : ct_dim;

if (!reuse_a) {
TTI_MULDMAREG(0, p_gpr_unpack::TMP_LO, p_gpr_unpack::TILE_SIZE_B, p_gpr_unpack::KT_DIM);
}

for (uint t = 0; t < t_dim; t++) {

std::uint32_t offset_address_a =tile_size_a*(tile_index_a + (reuse_a ? (t*kt_dim) : (0)));
Expand Down