Skip to content

Commit

Permalink
Add more vlogs to p2p pipeliner to aid debugging
Browse files (browse the repository at this point in the history)
PiperOrigin-RevId: 728206464
  • Loading branch information
frgossen authored and Google-ML-Automation committed Feb 18, 2025
1 parent f416615 commit 8e9c867
Showing 1 changed file with 7 additions and 1 deletion.
8 changes: 7 additions & 1 deletion xla/service/gpu/gpu_p2p_pipeliner.cc
Original file line number Diff line number Diff line change
Expand Up @@ -353,6 +353,8 @@ static HloInstruction* FindSendRecvDoneInstruction(HloInstruction* instr) {
static absl::Status AddControlDependencies(
std::vector<HloInstruction*>& from_instructions, HloInstruction* to_instr) {
for (HloInstruction* from_instr : from_instructions) {
VLOG(5) << "Adding control dependency from " << from_instr->ToShortString()
<< " to " << to_instr->ToShortString();
TF_RETURN_IF_ERROR(from_instr->AddControlDependencyTo(to_instr));
}
return absl::OkStatus();
Expand All @@ -362,6 +364,8 @@ static absl::Status AddControlDependencies(
HloInstruction* from_instr,
absl::flat_hash_set<HloInstruction*>& to_instructions) {
for (HloInstruction* to_instr : to_instructions) {
VLOG(5) << "Adding control dependency from " << from_instr->ToShortString()
<< " to " << to_instr->ToShortString();
TF_RETURN_IF_ERROR(from_instr->AddControlDependencyTo(to_instr));
}
return absl::OkStatus();
Expand Down Expand Up @@ -398,7 +402,7 @@ static absl::Status PostProcessPeeledSendRecvOps(
if (peeled_send_recvs_set.contains(instr)) continue;
unpeeled_conflicting_collectives.insert(instr);
}
- VLOG(5) << "#Conflicting collectives: "
+ VLOG(5) << "Conflicting collectives: "
<< unpeeled_conflicting_collectives.size();

// Find the while loop.
Expand Down Expand Up @@ -432,6 +436,7 @@ static absl::Status PostProcessPeeledSendRecvOps(
// peeled send/recv instruction. This guarantees that the conflicting
// collectives cannot slip in between the peeled send/recv instructions
// where it could cause a deadlock.
VLOG(5) << "Adding control dependencies FROM dominating conflicting";
TF_RETURN_IF_ERROR(AddControlDependencies(
dominating_unpeeled_conflicting_collectives, peeled_instr));

Expand All @@ -440,6 +445,7 @@ static absl::Status PostProcessPeeledSendRecvOps(
// while loop. This guarantees that the conflicting collectives cannot slip
// in between the peeled send/recv instructions where it could cause a
// deadlock.
VLOG(5) << "Adding control dependencies TO dominating conflicting";
HloInstruction* done_op = FindSendRecvDoneInstruction(peeled_instr);
CHECK_NE(done_op, nullptr);
TF_RETURN_IF_ERROR(
Expand Down

0 comments on commit 8e9c867

Please sign in to comment.