Skip to content

Commit

Permalink
[NVPTX] Switch to imm offset variants for LDG and LDU (llvm#128270)
Browse files Browse the repository at this point in the history
  • Loading branch information
AlexMaclean authored Feb 25, 2025
1 parent f5d80c3 commit d3dae84
Show file tree
Hide file tree
Showing 4 changed files with 339 additions and 211 deletions.
86 changes: 38 additions & 48 deletions llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1106,9 +1106,6 @@ bool NVPTXDAGToDAGISel::tryLoadVector(SDNode *N) {
std::nullopt, NVPTX::LDV_f32_v4_asi, std::nullopt);
break;
}
if (!Opcode)
return false;
Ops.append({Base, Offset, Chain});
} else {
if (PointerSize == 64) {
SelectADDRri64(Op1.getNode(), Op1, Base, Offset);
Expand Down Expand Up @@ -1148,10 +1145,10 @@ bool NVPTXDAGToDAGISel::tryLoadVector(SDNode *N) {
break;
}
}
if (!Opcode)
return false;
Ops.append({Base, Offset, Chain});
}
if (!Opcode)
return false;
Ops.append({Base, Offset, Chain});
LD = CurDAG->getMachineNode(*Opcode, DL, N->getVTList(), Ops);

MachineMemOperand *MemRef = cast<MemSDNode>(N)->getMemOperand();
Expand Down Expand Up @@ -1202,63 +1199,59 @@ bool NVPTXDAGToDAGISel::tryLDGLDU(SDNode *N) {
std::optional<unsigned> Opcode;
SDLoc DL(N);
SDNode *LD;
SDValue Base, Offset, Addr;
SDValue Base, Offset;

if (SelectDirectAddr(Op1, Addr)) {
if (SelectADDRsi(Op1.getNode(), Op1, Base, Offset)) {
switch (N->getOpcode()) {
default:
return false;
case ISD::LOAD:
Opcode = pickOpcodeForVT(
EltVT.getSimpleVT().SimpleTy, NVPTX::INT_PTX_LDG_GLOBAL_i8avar,
NVPTX::INT_PTX_LDG_GLOBAL_i16avar, NVPTX::INT_PTX_LDG_GLOBAL_i32avar,
NVPTX::INT_PTX_LDG_GLOBAL_i64avar, NVPTX::INT_PTX_LDG_GLOBAL_f32avar,
NVPTX::INT_PTX_LDG_GLOBAL_f64avar);
EltVT.getSimpleVT().SimpleTy, NVPTX::INT_PTX_LDG_GLOBAL_i8asi,
NVPTX::INT_PTX_LDG_GLOBAL_i16asi, NVPTX::INT_PTX_LDG_GLOBAL_i32asi,
NVPTX::INT_PTX_LDG_GLOBAL_i64asi, NVPTX::INT_PTX_LDG_GLOBAL_f32asi,
NVPTX::INT_PTX_LDG_GLOBAL_f64asi);
break;
case ISD::INTRINSIC_W_CHAIN:
Opcode = pickOpcodeForVT(
EltVT.getSimpleVT().SimpleTy, NVPTX::INT_PTX_LDU_GLOBAL_i8avar,
NVPTX::INT_PTX_LDU_GLOBAL_i16avar, NVPTX::INT_PTX_LDU_GLOBAL_i32avar,
NVPTX::INT_PTX_LDU_GLOBAL_i64avar, NVPTX::INT_PTX_LDU_GLOBAL_f32avar,
NVPTX::INT_PTX_LDU_GLOBAL_f64avar);
EltVT.getSimpleVT().SimpleTy, NVPTX::INT_PTX_LDU_GLOBAL_i8asi,
NVPTX::INT_PTX_LDU_GLOBAL_i16asi, NVPTX::INT_PTX_LDU_GLOBAL_i32asi,
NVPTX::INT_PTX_LDU_GLOBAL_i64asi, NVPTX::INT_PTX_LDU_GLOBAL_f32asi,
NVPTX::INT_PTX_LDU_GLOBAL_f64asi);
break;
case NVPTXISD::LoadV2:
Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
NVPTX::INT_PTX_LDG_G_v2i8_ELE_avar,
NVPTX::INT_PTX_LDG_G_v2i16_ELE_avar,
NVPTX::INT_PTX_LDG_G_v2i32_ELE_avar,
NVPTX::INT_PTX_LDG_G_v2i64_ELE_avar,
NVPTX::INT_PTX_LDG_G_v2f32_ELE_avar,
NVPTX::INT_PTX_LDG_G_v2f64_ELE_avar);
NVPTX::INT_PTX_LDG_G_v2i8_ELE_asi,
NVPTX::INT_PTX_LDG_G_v2i16_ELE_asi,
NVPTX::INT_PTX_LDG_G_v2i32_ELE_asi,
NVPTX::INT_PTX_LDG_G_v2i64_ELE_asi,
NVPTX::INT_PTX_LDG_G_v2f32_ELE_asi,
NVPTX::INT_PTX_LDG_G_v2f64_ELE_asi);
break;
case NVPTXISD::LDUV2:
Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
NVPTX::INT_PTX_LDU_G_v2i8_ELE_avar,
NVPTX::INT_PTX_LDU_G_v2i16_ELE_avar,
NVPTX::INT_PTX_LDU_G_v2i32_ELE_avar,
NVPTX::INT_PTX_LDU_G_v2i64_ELE_avar,
NVPTX::INT_PTX_LDU_G_v2f32_ELE_avar,
NVPTX::INT_PTX_LDU_G_v2f64_ELE_avar);
NVPTX::INT_PTX_LDU_G_v2i8_ELE_asi,
NVPTX::INT_PTX_LDU_G_v2i16_ELE_asi,
NVPTX::INT_PTX_LDU_G_v2i32_ELE_asi,
NVPTX::INT_PTX_LDU_G_v2i64_ELE_asi,
NVPTX::INT_PTX_LDU_G_v2f32_ELE_asi,
NVPTX::INT_PTX_LDU_G_v2f64_ELE_asi);
break;
case NVPTXISD::LoadV4:
Opcode = pickOpcodeForVT(
EltVT.getSimpleVT().SimpleTy, NVPTX::INT_PTX_LDG_G_v4i8_ELE_avar,
NVPTX::INT_PTX_LDG_G_v4i16_ELE_avar,
NVPTX::INT_PTX_LDG_G_v4i32_ELE_avar, std::nullopt,
NVPTX::INT_PTX_LDG_G_v4f32_ELE_avar, std::nullopt);
EltVT.getSimpleVT().SimpleTy, NVPTX::INT_PTX_LDG_G_v4i8_ELE_asi,
NVPTX::INT_PTX_LDG_G_v4i16_ELE_asi,
NVPTX::INT_PTX_LDG_G_v4i32_ELE_asi, std::nullopt,
NVPTX::INT_PTX_LDG_G_v4f32_ELE_asi, std::nullopt);
break;
case NVPTXISD::LDUV4:
Opcode = pickOpcodeForVT(
EltVT.getSimpleVT().SimpleTy, NVPTX::INT_PTX_LDU_G_v4i8_ELE_avar,
NVPTX::INT_PTX_LDU_G_v4i16_ELE_avar,
NVPTX::INT_PTX_LDU_G_v4i32_ELE_avar, std::nullopt,
NVPTX::INT_PTX_LDU_G_v4f32_ELE_avar, std::nullopt);
EltVT.getSimpleVT().SimpleTy, NVPTX::INT_PTX_LDU_G_v4i8_ELE_asi,
NVPTX::INT_PTX_LDU_G_v4i16_ELE_asi,
NVPTX::INT_PTX_LDU_G_v4i32_ELE_asi, std::nullopt,
NVPTX::INT_PTX_LDU_G_v4f32_ELE_asi, std::nullopt);
break;
}
if (!Opcode)
return false;
SDValue Ops[] = { Addr, Chain };
LD = CurDAG->getMachineNode(*Opcode, DL, InstVTList, Ops);
} else {
if (TM.is64Bit()) {
SelectADDRri64(Op1.getNode(), Op1, Base, Offset);
Expand Down Expand Up @@ -1369,11 +1362,11 @@ bool NVPTXDAGToDAGISel::tryLDGLDU(SDNode *N) {
break;
}
}
if (!Opcode)
return false;
SDValue Ops[] = {Base, Offset, Chain};
LD = CurDAG->getMachineNode(*Opcode, DL, InstVTList, Ops);
}
if (!Opcode)
return false;
SDValue Ops[] = {Base, Offset, Chain};
LD = CurDAG->getMachineNode(*Opcode, DL, InstVTList, Ops);

// For automatic generation of LDG (through SelectLoad[Vector], not the
// intrinsics), we may have an extending load like:
Expand Down Expand Up @@ -1577,7 +1570,6 @@ bool NVPTXDAGToDAGISel::tryStoreVector(SDNode *N) {
std::nullopt, NVPTX::STV_f32_v4_asi, std::nullopt);
break;
}
Ops.append({Base, Offset});
} else {
if (PointerSize == 64) {
SelectADDRri64(N2.getNode(), N2, Base, Offset);
Expand Down Expand Up @@ -1617,12 +1609,10 @@ bool NVPTXDAGToDAGISel::tryStoreVector(SDNode *N) {
break;
}
}
Ops.append({Base, Offset});
}
if (!Opcode)
return false;

Ops.push_back(Chain);
Ops.append({Base, Offset, Chain});

ST = CurDAG->getMachineNode(*Opcode, DL, MVT::Other, Ops);

Expand Down
169 changes: 66 additions & 103 deletions llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
Original file line number Diff line number Diff line change
Expand Up @@ -2718,80 +2718,64 @@ defm INT_PTX_SATOM_XOR : ATOM2_bitwise_impl<"xor">;
// Scalar

multiclass LDU_G<string TyStr, NVPTXRegClass regclass> {
def avar: NVPTXInst<(outs regclass:$result), (ins imemAny:$src),
!strconcat("ldu.global.", TyStr),
def asi: NVPTXInst<(outs regclass:$result), (ins imemAny:$src, Offseti32imm:$offset),
"ldu.global." # TyStr # " \t$result, [$src$offset];",
[]>, Requires<[hasLDU]>;
def ari : NVPTXInst<(outs regclass:$result), (ins MEMri:$src),
!strconcat("ldu.global.", TyStr),
"ldu.global." # TyStr # " \t$result, [$src];",
[]>, Requires<[hasLDU]>;
def ari64 : NVPTXInst<(outs regclass:$result), (ins MEMri64:$src),
!strconcat("ldu.global.", TyStr),
"ldu.global." # TyStr # " \t$result, [$src];",
[]>, Requires<[hasLDU]>;
}

defm INT_PTX_LDU_GLOBAL_i8 : LDU_G<"u8 \t$result, [$src];", Int16Regs>;
defm INT_PTX_LDU_GLOBAL_i16 : LDU_G<"u16 \t$result, [$src];", Int16Regs>;
defm INT_PTX_LDU_GLOBAL_i32 : LDU_G<"u32 \t$result, [$src];", Int32Regs>;
defm INT_PTX_LDU_GLOBAL_i64 : LDU_G<"u64 \t$result, [$src];", Int64Regs>;
defm INT_PTX_LDU_GLOBAL_f32 : LDU_G<"f32 \t$result, [$src];", Float32Regs>;
defm INT_PTX_LDU_GLOBAL_f64 : LDU_G<"f64 \t$result, [$src];", Float64Regs>;
defm INT_PTX_LDU_GLOBAL_i8 : LDU_G<"u8", Int16Regs>;
defm INT_PTX_LDU_GLOBAL_i16 : LDU_G<"u16", Int16Regs>;
defm INT_PTX_LDU_GLOBAL_i32 : LDU_G<"u32", Int32Regs>;
defm INT_PTX_LDU_GLOBAL_i64 : LDU_G<"u64", Int64Regs>;
defm INT_PTX_LDU_GLOBAL_f32 : LDU_G<"f32", Float32Regs>;
defm INT_PTX_LDU_GLOBAL_f64 : LDU_G<"f64", Float64Regs>;

// vector

// Elementized vector ldu
multiclass VLDU_G_ELE_V2<string TyStr, NVPTXRegClass regclass> {
def _ari32: NVPTXInst<(outs regclass:$dst1, regclass:$dst2),
(ins MEMri:$src),
!strconcat("ldu.global.", TyStr), []>;
"ldu.global.v2." # TyStr # " \t{{$dst1, $dst2}}, [$src];", []>;
def _ari64: NVPTXInst<(outs regclass:$dst1, regclass:$dst2),
(ins MEMri64:$src),
!strconcat("ldu.global.", TyStr), []>;
def _avar: NVPTXInst<(outs regclass:$dst1, regclass:$dst2),
(ins imemAny:$src),
!strconcat("ldu.global.", TyStr), []>;
"ldu.global.v2." # TyStr # " \t{{$dst1, $dst2}}, [$src];", []>;
def _asi: NVPTXInst<(outs regclass:$dst1, regclass:$dst2),
(ins imemAny:$src, Offseti32imm:$offset),
"ldu.global.v2." # TyStr # " \t{{$dst1, $dst2}}, [$src$offset];", []>;
}

multiclass VLDU_G_ELE_V4<string TyStr, NVPTXRegClass regclass> {
def _ari32: NVPTXInst<(outs regclass:$dst1, regclass:$dst2, regclass:$dst3,
regclass:$dst4), (ins MEMri:$src),
!strconcat("ldu.global.", TyStr), []>;
"ldu.global.v4." # TyStr # " \t{{$dst1, $dst2, $dst3, $dst4}}, [$src];", []>;
def _ari64: NVPTXInst<(outs regclass:$dst1, regclass:$dst2, regclass:$dst3,
regclass:$dst4), (ins MEMri64:$src),
!strconcat("ldu.global.", TyStr), []>;
def _avar: NVPTXInst<(outs regclass:$dst1, regclass:$dst2, regclass:$dst3,
regclass:$dst4), (ins imemAny:$src),
!strconcat("ldu.global.", TyStr), []>;
}

defm INT_PTX_LDU_G_v2i8_ELE
: VLDU_G_ELE_V2<"v2.u8 \t{{$dst1, $dst2}}, [$src];", Int16Regs>;
defm INT_PTX_LDU_G_v2i16_ELE
: VLDU_G_ELE_V2<"v2.u16 \t{{$dst1, $dst2}}, [$src];", Int16Regs>;
defm INT_PTX_LDU_G_v2i32_ELE
: VLDU_G_ELE_V2<"v2.u32 \t{{$dst1, $dst2}}, [$src];", Int32Regs>;
defm INT_PTX_LDU_G_v2f32_ELE
: VLDU_G_ELE_V2<"v2.f32 \t{{$dst1, $dst2}}, [$src];", Float32Regs>;
defm INT_PTX_LDU_G_v2i64_ELE
: VLDU_G_ELE_V2<"v2.u64 \t{{$dst1, $dst2}}, [$src];", Int64Regs>;
defm INT_PTX_LDU_G_v2f64_ELE
: VLDU_G_ELE_V2<"v2.f64 \t{{$dst1, $dst2}}, [$src];", Float64Regs>;
defm INT_PTX_LDU_G_v4i8_ELE
: VLDU_G_ELE_V4<"v4.u8 \t{{$dst1, $dst2, $dst3, $dst4}}, [$src];", Int16Regs>;
defm INT_PTX_LDU_G_v4i16_ELE
: VLDU_G_ELE_V4<"v4.u16 \t{{$dst1, $dst2, $dst3, $dst4}}, [$src];",
Int16Regs>;
defm INT_PTX_LDU_G_v4i32_ELE
: VLDU_G_ELE_V4<"v4.u32 \t{{$dst1, $dst2, $dst3, $dst4}}, [$src];",
Int32Regs>;
defm INT_PTX_LDU_G_v4f16_ELE
: VLDU_G_ELE_V4<"v4.b16 \t{{$dst1, $dst2, $dst3, $dst4}}, [$src];",
Int16Regs>;
defm INT_PTX_LDU_G_v4f16x2_ELE
: VLDU_G_ELE_V4<"v4.b32 \t{{$dst1, $dst2, $dst3, $dst4}}, [$src];",
Int32Regs>;
defm INT_PTX_LDU_G_v4f32_ELE
: VLDU_G_ELE_V4<"v4.f32 \t{{$dst1, $dst2, $dst3, $dst4}}, [$src];",
Float32Regs>;
"ldu.global.v4." # TyStr # " \t{{$dst1, $dst2, $dst3, $dst4}}, [$src];", []>;
def _asi: NVPTXInst<(outs regclass:$dst1, regclass:$dst2, regclass:$dst3,
regclass:$dst4), (ins imemAny:$src, Offseti32imm:$offset),
"ldu.global.v4." # TyStr # " \t{{$dst1, $dst2, $dst3, $dst4}}, [$src$offset];", []>;
}

defm INT_PTX_LDU_G_v2i8_ELE : VLDU_G_ELE_V2<"u8", Int16Regs>;
defm INT_PTX_LDU_G_v2i16_ELE : VLDU_G_ELE_V2<"u16", Int16Regs>;
defm INT_PTX_LDU_G_v2i32_ELE : VLDU_G_ELE_V2<"u32", Int32Regs>;
defm INT_PTX_LDU_G_v2f32_ELE : VLDU_G_ELE_V2<"f32", Float32Regs>;
defm INT_PTX_LDU_G_v2i64_ELE : VLDU_G_ELE_V2<"u64", Int64Regs>;
defm INT_PTX_LDU_G_v2f64_ELE : VLDU_G_ELE_V2<"f64", Float64Regs>;

defm INT_PTX_LDU_G_v4i8_ELE : VLDU_G_ELE_V4<"u8", Int16Regs>;
defm INT_PTX_LDU_G_v4i16_ELE : VLDU_G_ELE_V4<"u16", Int16Regs>;
defm INT_PTX_LDU_G_v4i32_ELE : VLDU_G_ELE_V4<"u32", Int32Regs>;
defm INT_PTX_LDU_G_v4f16_ELE : VLDU_G_ELE_V4<"b16", Int16Regs>;
defm INT_PTX_LDU_G_v4f16x2_ELE : VLDU_G_ELE_V4<"b32", Int32Regs>;
defm INT_PTX_LDU_G_v4f32_ELE : VLDU_G_ELE_V4<"f32", Float32Regs>;


//-----------------------------------
Expand All @@ -2803,84 +2787,63 @@ defm INT_PTX_LDU_G_v4f32_ELE
// during the lifetime of the kernel.

multiclass LDG_G<string TyStr, NVPTXRegClass regclass> {
def avar: NVPTXInst<(outs regclass:$result), (ins imemAny:$src),
!strconcat("ld.global.nc.", TyStr),
def asi: NVPTXInst<(outs regclass:$result), (ins imemAny:$src, Offseti32imm:$offset),
"ld.global.nc." # TyStr # " \t$result, [$src$offset];",
[]>, Requires<[hasLDG]>;
def ari : NVPTXInst<(outs regclass:$result), (ins MEMri:$src),
!strconcat("ld.global.nc.", TyStr),
"ld.global.nc." # TyStr # " \t$result, [$src];",
[]>, Requires<[hasLDG]>;
def ari64 : NVPTXInst<(outs regclass:$result), (ins MEMri64:$src),
!strconcat("ld.global.nc.", TyStr),
"ld.global.nc." # TyStr # " \t$result, [$src];",
[]>, Requires<[hasLDG]>;
}

defm INT_PTX_LDG_GLOBAL_i8
: LDG_G<"u8 \t$result, [$src];", Int16Regs>;
defm INT_PTX_LDG_GLOBAL_i16
: LDG_G<"u16 \t$result, [$src];", Int16Regs>;
defm INT_PTX_LDG_GLOBAL_i32
: LDG_G<"u32 \t$result, [$src];", Int32Regs>;
defm INT_PTX_LDG_GLOBAL_i64
: LDG_G<"u64 \t$result, [$src];", Int64Regs>;
defm INT_PTX_LDG_GLOBAL_f32
: LDG_G<"f32 \t$result, [$src];", Float32Regs>;
defm INT_PTX_LDG_GLOBAL_f64
: LDG_G<"f64 \t$result, [$src];", Float64Regs>;
defm INT_PTX_LDG_GLOBAL_i8 : LDG_G<"u8", Int16Regs>;
defm INT_PTX_LDG_GLOBAL_i16 : LDG_G<"u16", Int16Regs>;
defm INT_PTX_LDG_GLOBAL_i32 : LDG_G<"u32", Int32Regs>;
defm INT_PTX_LDG_GLOBAL_i64 : LDG_G<"u64", Int64Regs>;
defm INT_PTX_LDG_GLOBAL_f32 : LDG_G<"f32", Float32Regs>;
defm INT_PTX_LDG_GLOBAL_f64 : LDG_G<"f64", Float64Regs>;

// vector

// Elementized vector ldg
multiclass VLDG_G_ELE_V2<string TyStr, NVPTXRegClass regclass> {
def _ari32: NVPTXInst<(outs regclass:$dst1, regclass:$dst2),
(ins MEMri:$src),
!strconcat("ld.global.nc.", TyStr), []>;
"ld.global.nc.v2." # TyStr # " \t{{$dst1, $dst2}}, [$src];", []>;
def _ari64: NVPTXInst<(outs regclass:$dst1, regclass:$dst2),
(ins MEMri64:$src),
!strconcat("ld.global.nc.", TyStr), []>;
def _avar: NVPTXInst<(outs regclass:$dst1, regclass:$dst2),
(ins imemAny:$src),
!strconcat("ld.global.nc.", TyStr), []>;
"ld.global.nc.v2." # TyStr # " \t{{$dst1, $dst2}}, [$src];", []>;
def _asi: NVPTXInst<(outs regclass:$dst1, regclass:$dst2),
(ins imemAny:$src, Offseti32imm:$offset),
"ld.global.nc.v2." # TyStr # " \t{{$dst1, $dst2}}, [$src$offset];", []>;
}

multiclass VLDG_G_ELE_V4<string TyStr, NVPTXRegClass regclass> {
def _areg32: NVPTXInst<(outs regclass:$dst1, regclass:$dst2, regclass:$dst3,
regclass:$dst4), (ins Int32Regs:$src),
!strconcat("ld.global.nc.", TyStr), []>;
def _areg64: NVPTXInst<(outs regclass:$dst1, regclass:$dst2, regclass:$dst3,
regclass:$dst4), (ins Int64Regs:$src),
!strconcat("ld.global.nc.", TyStr), []>;
def _ari32: NVPTXInst<(outs regclass:$dst1, regclass:$dst2, regclass:$dst3,
regclass:$dst4), (ins MEMri:$src),
!strconcat("ld.global.nc.", TyStr), []>;
"ld.global.nc.v4." # TyStr # " \t{{$dst1, $dst2, $dst3, $dst4}}, [$src];", []>;
def _ari64: NVPTXInst<(outs regclass:$dst1, regclass:$dst2, regclass:$dst3,
regclass:$dst4), (ins MEMri64:$src),
!strconcat("ld.global.nc.", TyStr), []>;
def _avar: NVPTXInst<(outs regclass:$dst1, regclass:$dst2, regclass:$dst3,
regclass:$dst4), (ins imemAny:$src),
!strconcat("ld.global.nc.", TyStr), []>;
"ld.global.nc.v4." # TyStr # " \t{{$dst1, $dst2, $dst3, $dst4}}, [$src];", []>;
def _asi: NVPTXInst<(outs regclass:$dst1, regclass:$dst2, regclass:$dst3,
regclass:$dst4), (ins imemAny:$src, Offseti32imm:$offset),
"ld.global.nc.v4." # TyStr # " \t{{$dst1, $dst2, $dst3, $dst4}}, [$src$offset];", []>;
}

// FIXME: 8-bit LDG should be fixed once LDG/LDU nodes are made into proper loads.
defm INT_PTX_LDG_G_v2i8_ELE
: VLDG_G_ELE_V2<"v2.u8 \t{{$dst1, $dst2}}, [$src];", Int16Regs>;
defm INT_PTX_LDG_G_v2i16_ELE
: VLDG_G_ELE_V2<"v2.u16 \t{{$dst1, $dst2}}, [$src];", Int16Regs>;
defm INT_PTX_LDG_G_v2i32_ELE
: VLDG_G_ELE_V2<"v2.u32 \t{{$dst1, $dst2}}, [$src];", Int32Regs>;
defm INT_PTX_LDG_G_v2f32_ELE
: VLDG_G_ELE_V2<"v2.f32 \t{{$dst1, $dst2}}, [$src];", Float32Regs>;
defm INT_PTX_LDG_G_v2i64_ELE
: VLDG_G_ELE_V2<"v2.u64 \t{{$dst1, $dst2}}, [$src];", Int64Regs>;
defm INT_PTX_LDG_G_v2f64_ELE
: VLDG_G_ELE_V2<"v2.f64 \t{{$dst1, $dst2}}, [$src];", Float64Regs>;
defm INT_PTX_LDG_G_v4i8_ELE
: VLDG_G_ELE_V4<"v4.u8 \t{{$dst1, $dst2, $dst3, $dst4}}, [$src];", Int16Regs>;
defm INT_PTX_LDG_G_v4i16_ELE
: VLDG_G_ELE_V4<"v4.u16 \t{{$dst1, $dst2, $dst3, $dst4}}, [$src];", Int16Regs>;
defm INT_PTX_LDG_G_v4i32_ELE
: VLDG_G_ELE_V4<"v4.u32 \t{{$dst1, $dst2, $dst3, $dst4}}, [$src];", Int32Regs>;
defm INT_PTX_LDG_G_v4f32_ELE
: VLDG_G_ELE_V4<"v4.f32 \t{{$dst1, $dst2, $dst3, $dst4}}, [$src];", Float32Regs>;
defm INT_PTX_LDG_G_v2i8_ELE : VLDG_G_ELE_V2<"u8", Int16Regs>;
defm INT_PTX_LDG_G_v2i16_ELE : VLDG_G_ELE_V2<"u16", Int16Regs>;
defm INT_PTX_LDG_G_v2i32_ELE : VLDG_G_ELE_V2<"u32", Int32Regs>;
defm INT_PTX_LDG_G_v2f32_ELE : VLDG_G_ELE_V2<"f32", Float32Regs>;
defm INT_PTX_LDG_G_v2i64_ELE : VLDG_G_ELE_V2<"u64", Int64Regs>;
defm INT_PTX_LDG_G_v2f64_ELE : VLDG_G_ELE_V2<"f64", Float64Regs>;

defm INT_PTX_LDG_G_v4i8_ELE : VLDG_G_ELE_V4<"u8", Int16Regs>;
defm INT_PTX_LDG_G_v4i16_ELE : VLDG_G_ELE_V4<"u16", Int16Regs>;
defm INT_PTX_LDG_G_v4i32_ELE : VLDG_G_ELE_V4<"u32", Int32Regs>;
defm INT_PTX_LDG_G_v4f32_ELE : VLDG_G_ELE_V4<"f32", Float32Regs>;


multiclass NG_TO_G<string Str> {
Expand Down
Loading

0 comments on commit d3dae84

Please sign in to comment.