From: Tim Renouf Date: Fri, 22 Mar 2019 14:58:02 +0000 (+0000) Subject: [AMDGPU] Implemented dwordx3 variants of buffer/tbuffer load/store intrinsics X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=c89bf6fdf363051a396c007712215050b8ad2eef;p=llvm [AMDGPU] Implemented dwordx3 variants of buffer/tbuffer load/store intrinsics Now we have vec3 MVTs, this commit implements dwordx3 variants of the buffer intrinsics. On gfx6, a dwordx3 buffer load intrinsic is implemented as a dwordx4 instruction, and a dwordx3 buffer store intrinsic is not supported. We need to support the dwordx3 load intrinsic because it is generated by subtarget-unaware code in InstCombine. Differential Revision: https://reviews.llvm.org/D58904 Change-Id: I016729d8557b98a52f529638ae97c340a5922a4e git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@356755 91177308-0d34-0410-b5e6-96231b3b80d8 --- diff --git a/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/lib/Target/AMDGPU/AMDGPUISelLowering.cpp index 3d460199d94..623d9817f45 100644 --- a/lib/Target/AMDGPU/AMDGPUISelLowering.cpp +++ b/lib/Target/AMDGPU/AMDGPUISelLowering.cpp @@ -4295,7 +4295,6 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const { NODE_NAME_CASE(STORE_MSKOR) NODE_NAME_CASE(LOAD_CONSTANT) NODE_NAME_CASE(TBUFFER_STORE_FORMAT) - NODE_NAME_CASE(TBUFFER_STORE_FORMAT_X3) NODE_NAME_CASE(TBUFFER_STORE_FORMAT_D16) NODE_NAME_CASE(TBUFFER_LOAD_FORMAT) NODE_NAME_CASE(TBUFFER_LOAD_FORMAT_D16) diff --git a/lib/Target/AMDGPU/AMDGPUISelLowering.h b/lib/Target/AMDGPU/AMDGPUISelLowering.h index 0f79e019300..93240d8f1a5 100644 --- a/lib/Target/AMDGPU/AMDGPUISelLowering.h +++ b/lib/Target/AMDGPU/AMDGPUISelLowering.h @@ -494,7 +494,6 @@ enum NodeType : unsigned { STORE_MSKOR, LOAD_CONSTANT, TBUFFER_STORE_FORMAT, - TBUFFER_STORE_FORMAT_X3, TBUFFER_STORE_FORMAT_D16, TBUFFER_LOAD_FORMAT, TBUFFER_LOAD_FORMAT_D16, diff --git a/lib/Target/AMDGPU/BUFInstructions.td b/lib/Target/AMDGPU/BUFInstructions.td index d720eb9d46f..030fd459b0f 100644 --- a/lib/Target/AMDGPU/BUFInstructions.td +++ b/lib/Target/AMDGPU/BUFInstructions.td @@ -1011,11 +1011,11 @@ def BUFFER_WBINVL1 : MUBUF_Invalidate <"buffer_wbinvl1", defm TBUFFER_LOAD_FORMAT_X : MTBUF_Pseudo_Loads <"tbuffer_load_format_x", VGPR_32>; defm TBUFFER_LOAD_FORMAT_XY : MTBUF_Pseudo_Loads <"tbuffer_load_format_xy", VReg_64>; -defm TBUFFER_LOAD_FORMAT_XYZ : MTBUF_Pseudo_Loads <"tbuffer_load_format_xyz", VReg_128>; +defm TBUFFER_LOAD_FORMAT_XYZ : MTBUF_Pseudo_Loads <"tbuffer_load_format_xyz", VReg_96>; defm TBUFFER_LOAD_FORMAT_XYZW : MTBUF_Pseudo_Loads <"tbuffer_load_format_xyzw", VReg_128>; defm TBUFFER_STORE_FORMAT_X : MTBUF_Pseudo_Stores <"tbuffer_store_format_x", VGPR_32>; defm TBUFFER_STORE_FORMAT_XY : MTBUF_Pseudo_Stores <"tbuffer_store_format_xy", VReg_64>; -defm TBUFFER_STORE_FORMAT_XYZ : MTBUF_Pseudo_Stores <"tbuffer_store_format_xyz", VReg_128>; +defm TBUFFER_STORE_FORMAT_XYZ : MTBUF_Pseudo_Stores <"tbuffer_store_format_xyz", VReg_96>; defm TBUFFER_STORE_FORMAT_XYZW : MTBUF_Pseudo_Stores <"tbuffer_store_format_xyzw", VReg_128>; let SubtargetPredicate = HasUnpackedD16VMem, D16Buf = 1 in { @@ -1104,6 +1104,8 @@ defm : MUBUF_LoadIntrinsicPat defm : MUBUF_LoadIntrinsicPat; defm : MUBUF_LoadIntrinsicPat; defm : MUBUF_LoadIntrinsicPat; +defm : MUBUF_LoadIntrinsicPat; +defm : MUBUF_LoadIntrinsicPat; defm : MUBUF_LoadIntrinsicPat; defm : MUBUF_LoadIntrinsicPat; @@ -1127,6 +1129,8 @@ defm : MUBUF_LoadIntrinsicPat; defm : MUBUF_LoadIntrinsicPat; defm : MUBUF_LoadIntrinsicPat; defm : MUBUF_LoadIntrinsicPat; +defm : MUBUF_LoadIntrinsicPat; +defm : MUBUF_LoadIntrinsicPat; defm : MUBUF_LoadIntrinsicPat; defm : MUBUF_LoadIntrinsicPat; defm : MUBUF_LoadIntrinsicPat; @@ -1172,6 +1176,8 @@ defm : MUBUF_StoreIntrinsicPat; defm : MUBUF_StoreIntrinsicPat; defm : MUBUF_StoreIntrinsicPat; +defm : MUBUF_StoreIntrinsicPat; +defm : MUBUF_StoreIntrinsicPat; defm : MUBUF_StoreIntrinsicPat; defm : MUBUF_StoreIntrinsicPat; @@ -1195,6 +1201,8 @@ defm : MUBUF_StoreIntrinsicPat; defm : MUBUF_StoreIntrinsicPat; defm : MUBUF_StoreIntrinsicPat; defm : MUBUF_StoreIntrinsicPat; +defm : MUBUF_StoreIntrinsicPat; +defm : MUBUF_StoreIntrinsicPat; defm : MUBUF_StoreIntrinsicPat; defm : MUBUF_StoreIntrinsicPat; defm : MUBUF_StoreIntrinsicPat; @@ -1535,9 +1543,11 @@ multiclass MTBUF_LoadIntrinsicPat; defm : MTBUF_LoadIntrinsicPat; +defm : MTBUF_LoadIntrinsicPat; defm : MTBUF_LoadIntrinsicPat; defm : MTBUF_LoadIntrinsicPat; defm : MTBUF_LoadIntrinsicPat; +defm : MTBUF_LoadIntrinsicPat; defm : MTBUF_LoadIntrinsicPat; let SubtargetPredicate = HasUnpackedD16VMem in { @@ -1591,11 +1601,11 @@ multiclass MTBUF_StoreIntrinsicPat; defm : MTBUF_StoreIntrinsicPat; -defm : MTBUF_StoreIntrinsicPat; +defm : MTBUF_StoreIntrinsicPat; defm : MTBUF_StoreIntrinsicPat; defm : MTBUF_StoreIntrinsicPat; defm : MTBUF_StoreIntrinsicPat; -defm : MTBUF_StoreIntrinsicPat; +defm : MTBUF_StoreIntrinsicPat; defm : MTBUF_StoreIntrinsicPat; let SubtargetPredicate = HasUnpackedD16VMem in { diff --git a/lib/Target/AMDGPU/SIISelLowering.cpp b/lib/Target/AMDGPU/SIISelLowering.cpp index 7280edd3fcc..4155a013ad8 100644 --- a/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/lib/Target/AMDGPU/SIISelLowering.cpp @@ -5625,8 +5625,8 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op, LoadVT.getScalarType() == MVT::i16) return handleByteShortBufferLoads(DAG, LoadVT, DL, Ops, M); - return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops, IntVT, - M->getMemOperand()); + return getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops, IntVT, + M->getMemOperand(), DAG); } case Intrinsic::amdgcn_raw_buffer_load: case Intrinsic::amdgcn_raw_buffer_load_format: { @@ -5659,8 +5659,8 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op, LoadVT.getScalarType() == MVT::i16) return handleByteShortBufferLoads(DAG, LoadVT, DL, Ops, M); - return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops, IntVT, - M->getMemOperand()); + return getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops, IntVT, + M->getMemOperand(), DAG); } case Intrinsic::amdgcn_struct_buffer_load: case Intrinsic::amdgcn_struct_buffer_load_format: { @@ -5693,8 +5693,8 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op, LoadVT.getScalarType() == MVT::i16) return handleByteShortBufferLoads(DAG, LoadVT, DL, Ops, M); - return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops, IntVT, - M->getMemOperand()); + return getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops, IntVT, + M->getMemOperand(), DAG); } case Intrinsic::amdgcn_tbuffer_load: { MemSDNode *M = cast(Op); @@ -5722,9 +5722,9 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op, if (LoadVT.getScalarType() == MVT::f16) return adjustLoadValueType(AMDGPUISD::TBUFFER_LOAD_FORMAT_D16, M, DAG, Ops); - return DAG.getMemIntrinsicNode(AMDGPUISD::TBUFFER_LOAD_FORMAT, DL, - Op->getVTList(), Ops, LoadVT, - M->getMemOperand()); + return getMemIntrinsicNode(AMDGPUISD::TBUFFER_LOAD_FORMAT, DL, + Op->getVTList(), Ops, LoadVT, M->getMemOperand(), + DAG); } case Intrinsic::amdgcn_raw_tbuffer_load: { MemSDNode *M = cast(Op); @@ -5746,9 +5746,9 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op, if (LoadVT.getScalarType() == MVT::f16) return adjustLoadValueType(AMDGPUISD::TBUFFER_LOAD_FORMAT_D16, M, DAG, Ops); - return DAG.getMemIntrinsicNode(AMDGPUISD::TBUFFER_LOAD_FORMAT, DL, - Op->getVTList(), Ops, LoadVT, - M->getMemOperand()); + return getMemIntrinsicNode(AMDGPUISD::TBUFFER_LOAD_FORMAT, DL, + Op->getVTList(), Ops, LoadVT, M->getMemOperand(), + DAG); } case Intrinsic::amdgcn_struct_tbuffer_load: { MemSDNode *M = cast(Op); @@ -5770,9 +5770,9 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op, if (LoadVT.getScalarType() == MVT::f16) return adjustLoadValueType(AMDGPUISD::TBUFFER_LOAD_FORMAT_D16, M, DAG, Ops); - return DAG.getMemIntrinsicNode(AMDGPUISD::TBUFFER_LOAD_FORMAT, DL, - Op->getVTList(), Ops, LoadVT, - M->getMemOperand()); + return getMemIntrinsicNode(AMDGPUISD::TBUFFER_LOAD_FORMAT, DL, + Op->getVTList(), Ops, LoadVT, M->getMemOperand(), + DAG); } case Intrinsic::amdgcn_buffer_atomic_swap: case Intrinsic::amdgcn_buffer_atomic_add: @@ -6047,6 +6047,39 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op, } } +// Call DAG.getMemIntrinsicNode for a load, but first widen a dwordx3 type to +// dwordx4 if on SI. +SDValue SITargetLowering::getMemIntrinsicNode(unsigned Opcode, const SDLoc &DL, + SDVTList VTList, + ArrayRef Ops, EVT MemVT, + MachineMemOperand *MMO, + SelectionDAG &DAG) const { + EVT VT = VTList.VTs[0]; + EVT WidenedVT = VT; + EVT WidenedMemVT = MemVT; + if (!Subtarget->hasDwordx3LoadStores() && + (WidenedVT == MVT::v3i32 || WidenedVT == MVT::v3f32)) { + WidenedVT = EVT::getVectorVT(*DAG.getContext(), + WidenedVT.getVectorElementType(), 4); + WidenedMemVT = EVT::getVectorVT(*DAG.getContext(), + WidenedMemVT.getVectorElementType(), 4); + MMO = DAG.getMachineFunction().getMachineMemOperand(MMO, 0, 16); + } + + assert(VTList.NumVTs == 2); + SDVTList WidenedVTList = DAG.getVTList(WidenedVT, VTList.VTs[1]); + + auto NewOp = DAG.getMemIntrinsicNode(Opcode, DL, WidenedVTList, Ops, + WidenedMemVT, MMO); + if (WidenedVT != VT) { + auto Extract = DAG.getNode( + ISD::EXTRACT_SUBVECTOR, DL, VT, NewOp, + DAG.getConstant(0, DL, getVectorIdxTy(DAG.getDataLayout()))); + NewOp = DAG.getMergeValues({ Extract, SDValue(NewOp.getNode(), 1) }, DL); + } + return NewOp; +} + SDValue SITargetLowering::handleD16VData(SDValue VData, SelectionDAG &DAG) const { EVT StoreVT = VData.getValueType(); diff --git a/lib/Target/AMDGPU/SIISelLowering.h b/lib/Target/AMDGPU/SIISelLowering.h index 01684867cb0..b75f609a507 100644 --- a/lib/Target/AMDGPU/SIISelLowering.h +++ b/lib/Target/AMDGPU/SIISelLowering.h @@ -94,6 +94,12 @@ private: SelectionDAG &DAG, ArrayRef Ops, bool IsIntrinsic = false) const; + // Call DAG.getMemIntrinsicNode for a load, but first widen a dwordx3 type to + // dwordx4 if on SI. + SDValue getMemIntrinsicNode(unsigned Opcode, const SDLoc &DL, SDVTList VTList, + ArrayRef Ops, EVT MemVT, + MachineMemOperand *MMO, SelectionDAG &DAG) const; + SDValue handleD16VData(SDValue VData, SelectionDAG &DAG) const; /// Converts \p Op, which must be of floating point type, to the diff --git a/lib/Target/AMDGPU/SIInstrInfo.td b/lib/Target/AMDGPU/SIInstrInfo.td index e10c45cf9ea..13903d70ec5 100644 --- a/lib/Target/AMDGPU/SIInstrInfo.td +++ b/lib/Target/AMDGPU/SIInstrInfo.td @@ -108,9 +108,6 @@ def SDTtbuffer_store : SDTypeProfile<0, 9, def SItbuffer_store : SDNode<"AMDGPUISD::TBUFFER_STORE_FORMAT", SDTtbuffer_store, [SDNPMayStore, SDNPMemOperand, SDNPHasChain]>; -def SItbuffer_store_x3 : SDNode<"AMDGPUISD::TBUFFER_STORE_FORMAT_X3", - SDTtbuffer_store, - [SDNPMayStore, SDNPMemOperand, SDNPHasChain]>; def SItbuffer_store_d16 : SDNode<"AMDGPUISD::TBUFFER_STORE_FORMAT_D16", SDTtbuffer_store, [SDNPMayStore, SDNPMemOperand, SDNPHasChain]>; diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.load.dwordx3.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.load.dwordx3.ll new file mode 100644 index 00000000000..7c077f4b933 --- /dev/null +++ b/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.load.dwordx3.ll @@ -0,0 +1,60 @@ +;RUN: llc < %s -march=amdgcn -mcpu=gfx600 -verify-machineinstrs | FileCheck %s -check-prefixes=CHECK,SI +;RUN: llc < %s -march=amdgcn -mcpu=gfx700 -verify-machineinstrs | FileCheck %s -check-prefixes=CHECK,GCNX3 + +;CHECK-LABEL: {{^}}buffer_load_format_immoffs_x3: +;SI: buffer_load_format_xyzw v[0:3], off, s[0:3], 0 offset:42 +;GCNX3: buffer_load_format_xyz v[0:2], off, s[0:3], 0 offset:42 +;CHECK: s_waitcnt +define amdgpu_ps <3 x float> @buffer_load_format_immoffs_x3(<4 x i32> inreg) { +main_body: + %data = call <3 x float> @llvm.amdgcn.buffer.load.format.v3f32(<4 x i32> %0, i32 0, i32 42, i1 0, i1 0) + ret <3 x float> %data +} + +;CHECK-LABEL: {{^}}buffer_load_immoffs_x3: +;SI: buffer_load_dwordx4 v[0:3], off, s[0:3], 0 offset:40 +;GCNX3: buffer_load_dwordx3 v[0:2], off, s[0:3], 0 offset:40 +;CHECK: s_waitcnt +define amdgpu_ps <3 x float> @buffer_load_immoffs_x3(<4 x i32> inreg) { +main_body: + %data = call <3 x float> @llvm.amdgcn.buffer.load.v3f32(<4 x i32> %0, i32 0, i32 40, i1 0, i1 0) + ret <3 x float> %data +} + +;CHECK-LABEL: {{^}}buffer_raw_load_immoffs_x3: +;SI: buffer_load_dwordx4 v[0:3], off, s[0:3], 0 offset:40 +;GCNX3: buffer_load_dwordx3 v[0:2], off, s[0:3], 0 offset:40 +;CHECK: s_waitcnt +define amdgpu_ps <3 x float> @buffer_raw_load_immoffs_x3(<4 x i32> inreg) { +main_body: + %data = call <3 x float> @llvm.amdgcn.raw.buffer.load.v3f32(<4 x i32> %0, i32 40, i32 0, i32 0) + ret <3 x float> %data +} + +;CHECK-LABEL: {{^}}buffer_struct_load_format_immoffs_x3: +;SI: buffer_load_format_xyzw v[0:3], {{v[0-9]+}}, s[0:3], 0 idxen offset:42 +;GCNX3: buffer_load_format_xyz v[0:2], {{v[0-9]+}}, s[0:3], 0 idxen offset:42 +;CHECK: s_waitcnt +define amdgpu_ps <3 x float> @buffer_struct_load_format_immoffs_x3(<4 x i32> inreg) { +main_body: + %data = call <3 x float> @llvm.amdgcn.struct.buffer.load.format.v3f32(<4 x i32> %0, i32 0, i32 42, i32 0, i32 0) + ret <3 x float> %data +} + +;CHECK-LABEL: {{^}}struct_buffer_load_immoffs_x3: +;SI: buffer_load_dwordx4 v[0:3], {{v[0-9]+}}, s[0:3], 0 idxen offset:40 +;GCNX3: buffer_load_dwordx3 v[0:2], {{v[0-9]+}}, s[0:3], 0 idxen offset:40 +;CHECK: s_waitcnt +define amdgpu_ps <3 x float> @struct_buffer_load_immoffs_x3(<4 x i32> inreg) { +main_body: + %data = call <3 x float> @llvm.amdgcn.struct.buffer.load.v3f32(<4 x i32> %0, i32 0, i32 40, i32 0, i32 0) + ret <3 x float> %data +} + +declare <3 x float> @llvm.amdgcn.buffer.load.format.v3f32(<4 x i32>, i32, i32, i1, i1) #0 +declare <3 x float> @llvm.amdgcn.buffer.load.v3f32(<4 x i32>, i32, i32, i1, i1) #0 +declare <3 x float> @llvm.amdgcn.raw.buffer.load.format.v3f32(<4 x i32>, i32, i32, i32) #0 +declare <3 x float> @llvm.amdgcn.raw.buffer.load.v3f32(<4 x i32>, i32, i32, i32) #0 +declare <3 x float> @llvm.amdgcn.struct.buffer.load.format.v3f32(<4 x i32>, i32, i32, i32, i32) #0 +declare <3 x float> @llvm.amdgcn.struct.buffer.load.v3f32(<4 x i32>, i32, i32, i32, i32) #0 + diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.store.dwordx3.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.store.dwordx3.ll new file mode 100644 index 00000000000..b44b07fd393 --- /dev/null +++ b/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.store.dwordx3.ll @@ -0,0 +1,53 @@ +;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s -check-prefix=CHECK + +;CHECK-LABEL: {{^}}buffer_store_format_immoffs_x3: +;CHECK-NOT: s_waitcnt +;CHECK: buffer_store_format_xyz v[0:2], off, s[0:3], 0 offset:42 +define amdgpu_ps void @buffer_store_format_immoffs_x3(<4 x i32> inreg, <3 x float>) { +main_body: + call void @llvm.amdgcn.buffer.store.format.v3f32(<3 x float> %1, <4 x i32> %0, i32 0, i32 42, i1 0, i1 0) + ret void +} + +;CHECK-LABEL: {{^}}buffer_store_immoffs_x3: +;CHECK-NOT: s_waitcnt +;CHECK: buffer_store_dwordx3 v[0:2], off, s[0:3], 0 offset:42 +define amdgpu_ps void @buffer_store_immoffs_x3(<4 x i32> inreg, <3 x float>) { +main_body: + call void @llvm.amdgcn.buffer.store.v3f32(<3 x float> %1, <4 x i32> %0, i32 0, i32 42, i1 0, i1 0) + ret void +} + +;CHECK-LABEL: {{^}}raw_buffer_store_format_immoffs_x3: +;CHECK-NOT: s_waitcnt +;CHECK: buffer_store_format_xyz v[0:2], off, s[0:3], 0 offset:42 +define amdgpu_ps void @raw_buffer_store_format_immoffs_x3(<4 x i32> inreg, <3 x float>) { +main_body: + call void @llvm.amdgcn.raw.buffer.store.format.v3f32(<3 x float> %1, <4 x i32> %0, i32 42, i32 0, i32 0) + ret void +} + +;CHECK-LABEL: {{^}}raw_buffer_store_immoffs_x3: +;CHECK-NOT: s_waitcnt +;CHECK: buffer_store_dwordx3 v[0:2], off, s[0:3], 0 offset:42 +define amdgpu_ps void @raw_buffer_store_immoffs_x3(<4 x i32> inreg, <3 x float>) { +main_body: + call void @llvm.amdgcn.raw.buffer.store.v3f32(<3 x float> %1, <4 x i32> %0, i32 42, i32 0, i32 0) + ret void +} + +;CHECK-LABEL: {{^}}struct_buffer_store_immoffs_x3: +;CHECK-NOT: s_waitcnt +;CHECK: buffer_store_dwordx3 v[0:2], {{v[0-9]+}}, s[0:3], 0 idxen offset:42 +define amdgpu_ps void @struct_buffer_store_immoffs_x3(<4 x i32> inreg, <3 x float>) { +main_body: + call void @llvm.amdgcn.struct.buffer.store.v3f32(<3 x float> %1, <4 x i32> %0, i32 0, i32 42, i32 0, i32 0) + ret void +} + +declare void @llvm.amdgcn.buffer.store.v3f32(<3 x float>, <4 x i32>, i32, i32, i1, i1) #0 +declare void @llvm.amdgcn.buffer.store.format.v3f32(<3 x float>, <4 x i32>, i32, i32, i1, i1) #0 +declare void @llvm.amdgcn.raw.buffer.store.format.v3f32(<3 x float>, <4 x i32>, i32, i32, i32) #0 +declare void @llvm.amdgcn.raw.buffer.store.v3f32(<3 x float>, <4 x i32>, i32, i32, i32) #0 +declare void @llvm.amdgcn.struct.buffer.store.format.v3f32(<3 x float>, <4 x i32>, i32, i32, i32, i32) #0 +declare void @llvm.amdgcn.struct.buffer.store.v3f32(<3 x float>, <4 x i32>, i32, i32, i32, i32) #0 diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.tbuffer.load.dwordx3.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.tbuffer.load.dwordx3.ll new file mode 100644 index 00000000000..0193d973989 --- /dev/null +++ b/test/CodeGen/AMDGPU/llvm.amdgcn.tbuffer.load.dwordx3.ll @@ -0,0 +1,40 @@ +;RUN: llc < %s -march=amdgcn -mcpu=gfx600 -verify-machineinstrs | FileCheck %s -check-prefixes=GCN,SI +;RUN: llc < %s -march=amdgcn -mcpu=gfx700 -verify-machineinstrs | FileCheck %s -check-prefixes=GCN,GCNX3 + +; GCN-LABEL: {{^}}tbuffer_raw_load_immoffs_x3: +; SI: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, off, {{s\[[0-9]+:[0-9]+\]}}, dfmt:14, nfmt:4, 0 offset:42 +; GCNX3: tbuffer_load_format_xyz {{v\[[0-9]+:[0-9]+\]}}, off, {{s\[[0-9]+:[0-9]+\]}}, dfmt:14, nfmt:4, 0 offset:42 +define amdgpu_vs <3 x float> @tbuffer_raw_load_immoffs_x3(<4 x i32> inreg) { +main_body: + %vdata = call <3 x i32> @llvm.amdgcn.raw.tbuffer.load.v3i32(<4 x i32> %0, i32 42, i32 0, i32 78, i32 0) + %vdata.f = bitcast <3 x i32> %vdata to <3 x float> + ret <3 x float> %vdata.f +} + + +; GCN-LABEL: {{^}}tbuffer_struct_load_immoffs_x3: +; GCN: v_mov_b32_e32 [[ZEROREG:v[0-9]+]], 0 +; SI: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, [[ZEROREG]], {{s\[[0-9]+:[0-9]+\]}}, dfmt:14, nfmt:4, 0 idxen offset:42 +; GCNX3: tbuffer_load_format_xyz {{v\[[0-9]+:[0-9]+\]}}, [[ZEROREG]], {{s\[[0-9]+:[0-9]+\]}}, dfmt:14, nfmt:4, 0 idxen offset:42 +define amdgpu_vs <3 x float> @tbuffer_struct_load_immoffs_x3(<4 x i32> inreg) { +main_body: + %vdata = call <3 x i32> @llvm.amdgcn.struct.tbuffer.load.v3i32(<4 x i32> %0, i32 0, i32 42, i32 0, i32 78, i32 0) + %vdata.f = bitcast <3 x i32> %vdata to <3 x float> + ret <3 x float> %vdata.f +} + + +; GCN-LABEL: {{^}}tbuffer_load_format_immoffs_x3: +; SI: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, off, {{s\[[0-9]+:[0-9]+\]}}, dfmt:14, nfmt:4, 0 offset:42 +; GCNX3: tbuffer_load_format_xyz {{v\[[0-9]+:[0-9]+\]}}, off, {{s\[[0-9]+:[0-9]+\]}}, dfmt:14, nfmt:4, 0 offset:42 +define amdgpu_vs <3 x float> @tbuffer_load_format_immoffs_x3(<4 x i32> inreg) { +main_body: + %vdata = call <3 x i32> @llvm.amdgcn.tbuffer.load.v3i32(<4 x i32> %0, i32 0, i32 0, i32 0, i32 42, i32 14, i32 4, i1 0, i1 0) + %vdata.f = bitcast <3 x i32> %vdata to <3 x float> + ret <3 x float> %vdata.f +} + +declare <3 x i32> @llvm.amdgcn.raw.tbuffer.load.v3i32(<4 x i32>, i32, i32, i32, i32) +declare <3 x i32> @llvm.amdgcn.struct.tbuffer.load.v3i32(<4 x i32>, i32, i32, i32, i32, i32) +declare <3 x i32> @llvm.amdgcn.tbuffer.load.v3i32(<4 x i32>, i32, i32, i32, i32, i32, i32, i1, i1) + diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.tbuffer.store.dwordx3.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.tbuffer.store.dwordx3.ll new file mode 100644 index 00000000000..a39614c1cf0 --- /dev/null +++ b/test/CodeGen/AMDGPU/llvm.amdgcn.tbuffer.store.dwordx3.ll @@ -0,0 +1,35 @@ +;RUN: llc < %s -march=amdgcn -mcpu=gfx700 -verify-machineinstrs | FileCheck %s -check-prefixes=GCN + +; GCN-LABEL: {{^}}tbuffer_raw_store_immoffs_x3: +; GCN: tbuffer_store_format_xyz v[0:2], off, s[0:3], dfmt:5, nfmt:7, 0 offset:42 +define amdgpu_ps void @tbuffer_raw_store_immoffs_x3(<4 x i32> inreg, <3 x float>) { +main_body: + %in1 = bitcast <3 x float> %1 to <3 x i32> + call void @llvm.amdgcn.raw.tbuffer.store.v3i32(<3 x i32> %in1, <4 x i32> %0, i32 42, i32 0, i32 117, i32 0) + ret void +} + + +; GCN-LABEL: {{^}}tbuffer_struct_store_immoffs_x3: +; GCN: v_mov_b32_e32 [[ZEROREG:v[0-9]+]], 0 +; GCN: tbuffer_store_format_xyz v[0:2], [[ZEROREG]], s[0:3], dfmt:5, nfmt:7, 0 idxen offset:42 +define amdgpu_ps void @tbuffer_struct_store_immoffs_x3(<4 x i32> inreg, <3 x float>) { +main_body: + %in1 = bitcast <3 x float> %1 to <3 x i32> + call void @llvm.amdgcn.struct.tbuffer.store.v3i32(<3 x i32> %in1, <4 x i32> %0, i32 0, i32 42, i32 0, i32 117, i32 0) + ret void +} + +; GCN-LABEL: {{^}}tbuffer_store_immoffs_x3: +; GCN: tbuffer_store_format_xyz v[0:2], off, s[0:3], dfmt:5, nfmt:7, 0 offset:42 +define amdgpu_ps void @tbuffer_store_immoffs_x3(<4 x i32> inreg, <3 x float>) { +main_body: + %in1 = bitcast <3 x float> %1 to <3 x i32> + call void @llvm.amdgcn.tbuffer.store.v3i32(<3 x i32> %in1, <4 x i32> %0, i32 0, i32 0, i32 0, i32 42, i32 5, i32 7, i1 0, i1 0) + ret void +} + +declare void @llvm.amdgcn.raw.tbuffer.store.v3i32(<3 x i32>, <4 x i32>, i32, i32, i32, i32) #0 +declare void @llvm.amdgcn.struct.tbuffer.store.v3i32(<3 x i32>, <4 x i32>, i32, i32, i32, i32, i32) #0 +declare void @llvm.amdgcn.tbuffer.store.v3i32(<3 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i1, i1) #0 + diff --git a/test/MC/AMDGPU/mtbuf.s b/test/MC/AMDGPU/mtbuf.s index 381b4f73c55..bfffb67bd76 100644 --- a/test/MC/AMDGPU/mtbuf.s +++ b/test/MC/AMDGPU/mtbuf.s @@ -14,9 +14,9 @@ tbuffer_load_format_xy v[1:2], off, s[4:7], dfmt:15, nfmt:2, s1 // SICI: tbuffer_load_format_xy v[1:2], off, s[4:7], dfmt:15, nfmt:2, s1 ; encoding: [0x00,0x00,0x79,0xe9,0x00,0x01,0x01,0x01] // VI: tbuffer_load_format_xy v[1:2], off, s[4:7], dfmt:15, nfmt:2, s1 ; encoding: [0x00,0x80,0x78,0xe9,0x00,0x01,0x01,0x01] -tbuffer_load_format_xyz v[1:4], off, s[4:7], dfmt:15, nfmt:2, s1 -// SICI: tbuffer_load_format_xyz v[1:4], off, s[4:7], dfmt:15, nfmt:2, s1 ; encoding: [0x00,0x00,0x7a,0xe9,0x00,0x01,0x01,0x01] -// VI: tbuffer_load_format_xyz v[1:4], off, s[4:7], dfmt:15, nfmt:2, s1 ; encoding: [0x00,0x00,0x79,0xe9,0x00,0x01,0x01,0x01] +tbuffer_load_format_xyz v[1:3], off, s[4:7], dfmt:15, nfmt:2, s1 +// SICI: tbuffer_load_format_xyz v[1:3], off, s[4:7], dfmt:15, nfmt:2, s1 ; encoding: [0x00,0x00,0x7a,0xe9,0x00,0x01,0x01,0x01] +// VI: tbuffer_load_format_xyz v[1:3], off, s[4:7], dfmt:15, nfmt:2, s1 ; encoding: [0x00,0x00,0x79,0xe9,0x00,0x01,0x01,0x01] tbuffer_load_format_xyzw v[1:4], off, s[4:7], dfmt:15, nfmt:2, s1 // SICI: tbuffer_load_format_xyzw v[1:4], off, s[4:7], dfmt:15, nfmt:2, s1 ; encoding: [0x00,0x00,0x7b,0xe9,0x00,0x01,0x01,0x01]