dbgs() << "\n");
SDValue R = SDValue();
+ // See if the target wants to custom expand this node.
+ if (CustomLowerNode(N, N->getValueType(ResNo), true)) {
+ LLVM_DEBUG(dbgs() << "Node has been custom expanded, done\n");
+ return;
+ }
+
switch (N->getOpcode()) {
// These opcodes cannot appear if promotion of FP16 is done in the backend
// instead of Clang
defm : MUBUF_LoadIntrinsicPat<SIbuffer_load, f32, "BUFFER_LOAD_DWORD">;
defm : MUBUF_LoadIntrinsicPat<SIbuffer_load, i32, "BUFFER_LOAD_DWORD">;
+defm : MUBUF_LoadIntrinsicPat<SIbuffer_load, v2i16, "BUFFER_LOAD_DWORD">;
+defm : MUBUF_LoadIntrinsicPat<SIbuffer_load, v2f16, "BUFFER_LOAD_DWORD">;
defm : MUBUF_LoadIntrinsicPat<SIbuffer_load, v2f32, "BUFFER_LOAD_DWORDX2">;
defm : MUBUF_LoadIntrinsicPat<SIbuffer_load, v2i32, "BUFFER_LOAD_DWORDX2">;
+defm : MUBUF_LoadIntrinsicPat<SIbuffer_load, v4i16, "BUFFER_LOAD_DWORDX2">;
+defm : MUBUF_LoadIntrinsicPat<SIbuffer_load, v4f16, "BUFFER_LOAD_DWORDX2">;
defm : MUBUF_LoadIntrinsicPat<SIbuffer_load, v3f32, "BUFFER_LOAD_DWORDX3">;
defm : MUBUF_LoadIntrinsicPat<SIbuffer_load, v3i32, "BUFFER_LOAD_DWORDX3">;
defm : MUBUF_LoadIntrinsicPat<SIbuffer_load, v4f32, "BUFFER_LOAD_DWORDX4">;
setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::v2f16, Custom);
setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::v2f16, Custom);
+ setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::v2i16, Custom);
setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::v4f16, Custom);
+ setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::v4i16, Custom);
setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::v8f16, Custom);
setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom);
+ setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::f16, Custom);
setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::i16, Custom);
setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::i8, Custom);
return DAG.getMergeValues({ Adjusted, Load.getValue(1) }, DL);
}
+SDValue SITargetLowering::lowerIntrinsicLoad(MemSDNode *M, bool IsFormat,
+ SelectionDAG &DAG,
+ ArrayRef<SDValue> Ops) const {
+ SDLoc DL(M);
+ EVT LoadVT = M->getValueType(0);
+ EVT EltType = LoadVT.getScalarType();
+ EVT IntVT = LoadVT.changeTypeToInteger();
+
+ bool IsD16 = IsFormat && (EltType.getSizeInBits() == 16);
+
+ unsigned Opc =
+ IsFormat ? AMDGPUISD::BUFFER_LOAD_FORMAT : AMDGPUISD::BUFFER_LOAD;
+
+ if (IsD16) {
+ return adjustLoadValueType(AMDGPUISD::BUFFER_LOAD_FORMAT_D16, M, DAG, Ops);
+ }
+
+ // Handle BUFFER_LOAD_BYTE/UBYTE/SHORT/USHORT overloaded intrinsics
+ if (!IsD16 && !LoadVT.isVector() && EltType.getSizeInBits() < 32)
+ return handleByteShortBufferLoads(DAG, LoadVT, DL, Ops, M);
+
+ if (isTypeLegal(LoadVT)) {
+ return getMemIntrinsicNode(Opc, DL, M->getVTList(), Ops, IntVT,
+ M->getMemOperand(), DAG);
+ }
+
+ EVT CastVT = getEquivalentMemType(*DAG.getContext(), LoadVT);
+ SDVTList VTList = DAG.getVTList(CastVT, MVT::Other);
+ SDValue MemNode = getMemIntrinsicNode(Opc, DL, VTList, Ops, CastVT,
+ M->getMemOperand(), DAG);
+ return DAG.getMergeValues(
+ {DAG.getNode(ISD::BITCAST, DL, LoadVT, MemNode), MemNode.getValue(1)},
+ DL);
+}
+
static SDValue lowerICMPIntrinsic(const SITargetLowering &TLI,
SDNode *N, SelectionDAG &DAG) {
EVT VT = N->getValueType(0);
}
case ISD::INTRINSIC_W_CHAIN: {
if (SDValue Res = LowerINTRINSIC_W_CHAIN(SDValue(N, 0), DAG)) {
- Results.push_back(Res);
- Results.push_back(Res.getValue(1));
+ if (Res.getOpcode() == ISD::MERGE_VALUES) {
+ // FIXME: Hacky
+ Results.push_back(Res.getOperand(0));
+ Results.push_back(Res.getOperand(1));
+ } else {
+ Results.push_back(Res);
+ Results.push_back(Res.getValue(1));
+ }
return;
}
}
case Intrinsic::amdgcn_raw_buffer_load:
case Intrinsic::amdgcn_raw_buffer_load_format: {
+ const bool IsFormat = IntrID == Intrinsic::amdgcn_raw_buffer_load_format;
+
auto Offsets = splitBufferOffsets(Op.getOperand(3), DAG);
SDValue Ops[] = {
Op.getOperand(0), // Chain
DAG.getConstant(0, DL, MVT::i1), // idxen
};
- unsigned Opc = (IntrID == Intrinsic::amdgcn_raw_buffer_load) ?
- AMDGPUISD::BUFFER_LOAD : AMDGPUISD::BUFFER_LOAD_FORMAT;
-
- EVT VT = Op.getValueType();
- EVT IntVT = VT.changeTypeToInteger();
- auto *M = cast<MemSDNode>(Op);
- EVT LoadVT = Op.getValueType();
-
- if (LoadVT.getScalarType() == MVT::f16)
- return adjustLoadValueType(AMDGPUISD::BUFFER_LOAD_FORMAT_D16,
- M, DAG, Ops);
-
- // Handle BUFFER_LOAD_BYTE/UBYTE/SHORT/USHORT overloaded intrinsics
- if (LoadVT.getScalarType() == MVT::i8 ||
- LoadVT.getScalarType() == MVT::i16)
- return handleByteShortBufferLoads(DAG, LoadVT, DL, Ops, M);
-
- return getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops, IntVT,
- M->getMemOperand(), DAG);
+ return lowerIntrinsicLoad(cast<MemSDNode>(Op), IsFormat, DAG, Ops);
}
case Intrinsic::amdgcn_struct_buffer_load:
case Intrinsic::amdgcn_struct_buffer_load_format: {
+ const bool IsFormat = IntrID == Intrinsic::amdgcn_struct_buffer_load_format;
+
auto Offsets = splitBufferOffsets(Op.getOperand(4), DAG);
SDValue Ops[] = {
Op.getOperand(0), // Chain
DAG.getConstant(1, DL, MVT::i1), // idxen
};
- unsigned Opc = (IntrID == Intrinsic::amdgcn_struct_buffer_load) ?
- AMDGPUISD::BUFFER_LOAD : AMDGPUISD::BUFFER_LOAD_FORMAT;
-
- EVT VT = Op.getValueType();
- EVT IntVT = VT.changeTypeToInteger();
- auto *M = cast<MemSDNode>(Op);
- EVT LoadVT = Op.getValueType();
-
- if (LoadVT.getScalarType() == MVT::f16)
- return adjustLoadValueType(AMDGPUISD::BUFFER_LOAD_FORMAT_D16,
- M, DAG, Ops);
-
- // Handle BUFFER_LOAD_BYTE/UBYTE/SHORT/USHORT overloaded intrinsics
- if (LoadVT.getScalarType() == MVT::i8 ||
- LoadVT.getScalarType() == MVT::i16)
- return handleByteShortBufferLoads(DAG, LoadVT, DL, Ops, M);
-
- return getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops, IntVT,
- M->getMemOperand(), DAG);
+ return lowerIntrinsicLoad(cast<MemSDNode>(Op), IsFormat, DAG, Ops);
}
case Intrinsic::amdgcn_tbuffer_load: {
MemSDNode *M = cast<MemSDNode>(Op);
SDValue BufferLoad = DAG.getMemIntrinsicNode(Opc, DL, ResList,
Ops, IntVT,
M->getMemOperand());
- SDValue BufferLoadTrunc = DAG.getNode(ISD::TRUNCATE, DL,
- LoadVT.getScalarType(), BufferLoad);
- return DAG.getMergeValues({BufferLoadTrunc, BufferLoad.getValue(1)}, DL);
+ SDValue LoadVal = DAG.getNode(ISD::TRUNCATE, DL, IntVT, BufferLoad);
+ LoadVal = DAG.getNode(ISD::BITCAST, DL, LoadVT, LoadVal);
+
+ return DAG.getMergeValues({LoadVal, BufferLoad.getValue(1)}, DL);
}
// Handle 8 bit and 16 bit buffer stores
SelectionDAG &DAG, ArrayRef<SDValue> Ops,
bool IsIntrinsic = false) const;
+ SDValue lowerIntrinsicLoad(MemSDNode *M, bool IsFormat, SelectionDAG &DAG,
+ ArrayRef<SDValue> Ops) const;
+
// Call DAG.getMemIntrinsicNode for a load, but first widen a dwordx3 type to
// dwordx4 if on SI.
SDValue getMemIntrinsicNode(unsigned Opcode, const SDLoc &DL, SDVTList VTList,
setOperationAction(ISD::FP_ROUND, MVT::f32, Custom);
}
- if (!Subtarget->hasFP64() || !Subtarget->hasFPARMv8Base()){
+ if (!Subtarget->hasFP64() || !Subtarget->hasFPARMv8Base()) {
setOperationAction(ISD::FP_EXTEND, MVT::f64, Custom);
- setOperationAction(ISD::FP_ROUND, MVT::f16, Custom);
+ if (Subtarget->hasFullFP16())
+ setOperationAction(ISD::FP_ROUND, MVT::f16, Custom);
}
if (!Subtarget->hasFP16())
ret float %val
}
-;CHECK-LABEL: {{^}}raw_buffer_load_ushort:
+;CHECK-LABEL: {{^}}raw_buffer_load_i16:
;CHECK-NEXT: %bb.
;CHECK-NEXT: buffer_load_ushort v{{[0-9]}}, off, s[0:3], 0
;CHECK: s_waitcnt vmcnt(0)
;CHECK-NEXT: v_cvt_f32_u32_e32 v0, v0
;CHECK-NEXT: ; return to shader part epilog
-define amdgpu_ps float @raw_buffer_load_ushort(<4 x i32> inreg %rsrc) {
+define amdgpu_ps float @raw_buffer_load_i16(<4 x i32> inreg %rsrc) {
main_body:
%tmp = call i16 @llvm.amdgcn.raw.buffer.load.i16(<4 x i32> %rsrc, i32 0, i32 0, i32 0)
%tmp2 = zext i16 %tmp to i32
ret float %val
}
+;CHECK-LABEL: {{^}}raw_buffer_load_f16:
+;CHECK-NEXT: %bb.
+;CHECK-NEXT: buffer_load_ushort [[VAL:v[0-9]+]], off, s[0:3], 0
+;CHECK: s_waitcnt vmcnt(0)
+;CHECK: ds_write_b16 v0, [[VAL]]
+define amdgpu_ps void @raw_buffer_load_f16(<4 x i32> inreg %rsrc, half addrspace(3)* %ptr) {
+main_body:
+ %val = call half @llvm.amdgcn.raw.buffer.load.f16(<4 x i32> %rsrc, i32 0, i32 0, i32 0)
+ store half %val, half addrspace(3)* %ptr
+ ret void
+}
+
+;CHECK-LABEL: {{^}}raw_buffer_load_v2f16:
+;CHECK-NEXT: %bb.
+;CHECK-NEXT: buffer_load_dword [[VAL:v[0-9]+]], off, s[0:3], 0
+;CHECK: s_waitcnt vmcnt(0)
+;CHECK: ds_write_b32 v0, [[VAL]]
+define amdgpu_ps void @raw_buffer_load_v2f16(<4 x i32> inreg %rsrc, <2 x half> addrspace(3)* %ptr) {
+main_body:
+ %val = call <2 x half> @llvm.amdgcn.raw.buffer.load.v2f16(<4 x i32> %rsrc, i32 0, i32 0, i32 0)
+ store <2 x half> %val, <2 x half> addrspace(3)* %ptr
+ ret void
+}
+
+;CHECK-LABEL: {{^}}raw_buffer_load_v4f16:
+;CHECK-NEXT: %bb.
+;CHECK-NEXT: buffer_load_dwordx2 [[VAL:v\[[0-9]+:[0-9]+\]]], off, s[0:3], 0
+;CHECK: s_waitcnt vmcnt(0)
+;CHECK: ds_write_b64 v0, [[VAL]]
+define amdgpu_ps void @raw_buffer_load_v4f16(<4 x i32> inreg %rsrc, <4 x half> addrspace(3)* %ptr) {
+main_body:
+ %val = call <4 x half> @llvm.amdgcn.raw.buffer.load.v4f16(<4 x i32> %rsrc, i32 0, i32 0, i32 0)
+ store <4 x half> %val, <4 x half> addrspace(3)* %ptr
+ ret void
+}
+
+;CHECK-LABEL: {{^}}raw_buffer_load_v2i16:
+;CHECK-NEXT: %bb.
+;CHECK-NEXT: buffer_load_dword [[VAL:v[0-9]+]], off, s[0:3], 0
+;CHECK: s_waitcnt vmcnt(0)
+;CHECK: ds_write_b32 v0, [[VAL]]
+define amdgpu_ps void @raw_buffer_load_v2i16(<4 x i32> inreg %rsrc, <2 x i16> addrspace(3)* %ptr) {
+main_body:
+ %val = call <2 x i16> @llvm.amdgcn.raw.buffer.load.v2i16(<4 x i32> %rsrc, i32 0, i32 0, i32 0)
+ store <2 x i16> %val, <2 x i16> addrspace(3)* %ptr
+ ret void
+}
+
+;CHECK-LABEL: {{^}}raw_buffer_load_v4i16:
+;CHECK-NEXT: %bb.
+;CHECK-NEXT: buffer_load_dwordx2 [[VAL:v\[[0-9]+:[0-9]+\]]], off, s[0:3], 0
+;CHECK: s_waitcnt vmcnt(0)
+;CHECK: ds_write_b64 v0, [[VAL]]
+define amdgpu_ps void @raw_buffer_load_v4i16(<4 x i32> inreg %rsrc, <4 x i16> addrspace(3)* %ptr) {
+main_body:
+ %val = call <4 x i16> @llvm.amdgcn.raw.buffer.load.v4i16(<4 x i32> %rsrc, i32 0, i32 0, i32 0)
+ store <4 x i16> %val, <4 x i16> addrspace(3)* %ptr
+ ret void
+}
+
declare float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32>, i32, i32, i32) #0
declare <2 x float> @llvm.amdgcn.raw.buffer.load.v2f32(<4 x i32>, i32, i32, i32) #0
declare <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(<4 x i32>, i32, i32, i32) #0
declare void @llvm.amdgcn.exp.f32(i32, i32, float, float, float, float, i1, i1) #0
declare i8 @llvm.amdgcn.raw.buffer.load.i8(<4 x i32>, i32, i32, i32) #0
declare i16 @llvm.amdgcn.raw.buffer.load.i16(<4 x i32>, i32, i32, i32) #0
+declare <2 x i16> @llvm.amdgcn.raw.buffer.load.v2i16(<4 x i32>, i32, i32, i32) #0
+declare <4 x i16> @llvm.amdgcn.raw.buffer.load.v4i16(<4 x i32>, i32, i32, i32) #0
+declare half @llvm.amdgcn.raw.buffer.load.f16(<4 x i32>, i32, i32, i32) #0
+declare <2 x half> @llvm.amdgcn.raw.buffer.load.v2f16(<4 x i32>, i32, i32, i32) #0
+declare <4 x half> @llvm.amdgcn.raw.buffer.load.v4f16(<4 x i32>, i32, i32, i32) #0
attributes #0 = { nounwind readonly }
ret float %val
}
+;CHECK-LABEL: {{^}}struct_buffer_load_f16:
+;CHECK-NEXT: %bb.
+;CHECK-NEXT: buffer_load_ushort [[VAL:v[0-9]+]], v1, s[0:3], 0 idxen
+;CHECK: s_waitcnt vmcnt(0)
+;CHECK: ds_write_b16 v0, [[VAL]]
+define amdgpu_ps void @struct_buffer_load_f16(<4 x i32> inreg %rsrc, half addrspace(3)* %ptr, i32 %idx) {
+main_body:
+ %val = call half @llvm.amdgcn.struct.buffer.load.f16(<4 x i32> %rsrc, i32 %idx, i32 0, i32 0, i32 0)
+ store half %val, half addrspace(3)* %ptr
+ ret void
+}
+
+;CHECK-LABEL: {{^}}struct_buffer_load_v2f16:
+;CHECK-NEXT: %bb.
+;CHECK-NEXT: buffer_load_dword [[VAL:v[0-9]+]], v1, s[0:3], 0 idxen
+;CHECK: s_waitcnt vmcnt(0)
+;CHECK: ds_write_b32 v0, [[VAL]]
+define amdgpu_ps void @struct_buffer_load_v2f16(<4 x i32> inreg %rsrc, <2 x half> addrspace(3)* %ptr, i32 %idx) {
+main_body:
+ %val = call <2 x half> @llvm.amdgcn.struct.buffer.load.v2f16(<4 x i32> %rsrc, i32 %idx, i32 0, i32 0, i32 0)
+ store <2 x half> %val, <2 x half> addrspace(3)* %ptr
+ ret void
+}
+
+;CHECK-LABEL: {{^}}struct_buffer_load_v4f16:
+;CHECK-NEXT: %bb.
+;CHECK-NEXT: buffer_load_dwordx2 [[VAL:v\[[0-9]+:[0-9]+\]]], v1, s[0:3], 0 idxen
+;CHECK: s_waitcnt vmcnt(0)
+;CHECK: ds_write_b64 v0, [[VAL]]
+define amdgpu_ps void @struct_buffer_load_v4f16(<4 x i32> inreg %rsrc, <4 x half> addrspace(3)* %ptr, i32 %idx) {
+main_body:
+ %val = call <4 x half> @llvm.amdgcn.struct.buffer.load.v4f16(<4 x i32> %rsrc, i32 %idx, i32 0, i32 0, i32 0)
+ store <4 x half> %val, <4 x half> addrspace(3)* %ptr
+ ret void
+}
+
+;CHECK-LABEL: {{^}}struct_buffer_load_i16:
+;CHECK-NEXT: %bb.
+;CHECK-NEXT: buffer_load_ushort [[VAL:v[0-9]+]], v1, s[0:3], 0 idxen
+;CHECK: s_waitcnt vmcnt(0)
+;CHECK: ds_write_b16 v0, [[VAL]]
+define amdgpu_ps void @struct_buffer_load_i16(<4 x i32> inreg %rsrc, i16 addrspace(3)* %ptr, i32 %idx) {
+main_body:
+ %val = call i16 @llvm.amdgcn.struct.buffer.load.i16(<4 x i32> %rsrc, i32 %idx, i32 0, i32 0, i32 0)
+ store i16 %val, i16 addrspace(3)* %ptr
+ ret void
+}
+
+;CHECK-LABEL: {{^}}struct_buffer_load_v2i16:
+;CHECK-NEXT: %bb.
+;CHECK-NEXT: buffer_load_dword [[VAL:v[0-9]+]], v1, s[0:3], 0 idxen
+;CHECK: s_waitcnt vmcnt(0)
+;CHECK: ds_write_b32 v0, [[VAL]]
+define amdgpu_ps void @struct_buffer_load_v2i16(<4 x i32> inreg %rsrc, <2 x i16> addrspace(3)* %ptr, i32 %idx) {
+main_body:
+ %val = call <2 x i16> @llvm.amdgcn.struct.buffer.load.v2i16(<4 x i32> %rsrc, i32 %idx, i32 0, i32 0, i32 0)
+ store <2 x i16> %val, <2 x i16> addrspace(3)* %ptr
+ ret void
+}
+
+;CHECK-LABEL: {{^}}struct_buffer_load_v4i16:
+;CHECK-NEXT: %bb.
+;CHECK-NEXT: buffer_load_dwordx2 [[VAL:v\[[0-9]+:[0-9]+\]]], v1, s[0:3], 0 idxen
+;CHECK: s_waitcnt vmcnt(0)
+;CHECK: ds_write_b64 v0, [[VAL]]
+define amdgpu_ps void @struct_buffer_load_v4i16(<4 x i32> inreg %rsrc, <4 x i16> addrspace(3)* %ptr, i32 %idx) {
+main_body:
+ %val = call <4 x i16> @llvm.amdgcn.struct.buffer.load.v4i16(<4 x i32> %rsrc, i32 %idx, i32 0, i32 0, i32 0)
+ store <4 x i16> %val, <4 x i16> addrspace(3)* %ptr
+ ret void
+}
+
declare float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32>, i32, i32, i32, i32) #0
declare <2 x float> @llvm.amdgcn.struct.buffer.load.v2f32(<4 x i32>, i32, i32, i32, i32) #0
declare <4 x float> @llvm.amdgcn.struct.buffer.load.v4f32(<4 x i32>, i32, i32, i32, i32) #0
declare <4 x i32> @llvm.amdgcn.struct.buffer.load.v4i32(<4 x i32>, i32, i32, i32, i32) #0
declare void @llvm.amdgcn.exp.f32(i32, i32, float, float, float, float, i1, i1) #0
declare i8 @llvm.amdgcn.struct.buffer.load.i8(<4 x i32>, i32, i32, i32, i32) #0
+
+declare half @llvm.amdgcn.struct.buffer.load.f16(<4 x i32>, i32, i32, i32, i32) #0
+declare <2 x half> @llvm.amdgcn.struct.buffer.load.v2f16(<4 x i32>, i32, i32, i32, i32) #0
+declare <4 x half> @llvm.amdgcn.struct.buffer.load.v4f16(<4 x i32>, i32, i32, i32, i32) #0
+
declare i16 @llvm.amdgcn.struct.buffer.load.i16(<4 x i32>, i32, i32, i32, i32) #0
+declare <2 x i16> @llvm.amdgcn.struct.buffer.load.v2i16(<4 x i32>, i32, i32, i32, i32) #0
+declare <4 x i16> @llvm.amdgcn.struct.buffer.load.v4i16(<4 x i32>, i32, i32, i32, i32) #0
attributes #0 = { nounwind readonly }