From: Matt Arsenault Date: Fri, 8 Dec 2017 20:00:57 +0000 (+0000) Subject: AMDGPU: image_getlod and image_getresinfo do not read memory X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=75d7256af49f395bd565ba9e2898d27525727633;p=llvm AMDGPU: image_getlod and image_getresinfo do not read memory git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@320187 91177308-0d34-0410-b5e6-96231b3b80d8 --- diff --git a/include/llvm/IR/IntrinsicsAMDGPU.td b/include/llvm/IR/IntrinsicsAMDGPU.td index f507f9c1668..d72a8c6798a 100644 --- a/include/llvm/IR/IntrinsicsAMDGPU.td +++ b/include/llvm/IR/IntrinsicsAMDGPU.td @@ -294,7 +294,7 @@ class AMDGPUAtomicIncIntrin : Intrinsic<[llvm_anyint_ty], def int_amdgcn_atomic_inc : AMDGPUAtomicIncIntrin; def int_amdgcn_atomic_dec : AMDGPUAtomicIncIntrin; -class AMDGPUImageLoad : Intrinsic < +class AMDGPUImageLoad : Intrinsic < [llvm_anyfloat_ty], // vdata(VGPR) [llvm_anyint_ty, // vaddr(VGPR) llvm_anyint_ty, // rsrc(SGPR) @@ -303,11 +303,11 @@ class AMDGPUImageLoad : Intrinsic < llvm_i1_ty, // slc(imm) llvm_i1_ty, // lwe(imm) llvm_i1_ty], // da(imm) - [IntrReadMem]>; + !if(NoMem, [IntrNoMem], [IntrReadMem])>; def int_amdgcn_image_load : AMDGPUImageLoad; def int_amdgcn_image_load_mip : AMDGPUImageLoad; -def int_amdgcn_image_getresinfo : AMDGPUImageLoad; +def int_amdgcn_image_getresinfo : AMDGPUImageLoad<1>; class AMDGPUImageStore : Intrinsic < [], @@ -324,7 +324,7 @@ class AMDGPUImageStore : Intrinsic < def int_amdgcn_image_store : AMDGPUImageStore; def int_amdgcn_image_store_mip : AMDGPUImageStore; -class AMDGPUImageSample : Intrinsic < +class AMDGPUImageSample : Intrinsic < [llvm_anyfloat_ty], // vdata(VGPR) [llvm_anyfloat_ty, // vaddr(VGPR) llvm_anyint_ty, // rsrc(SGPR) @@ -335,7 +335,7 @@ class AMDGPUImageSample : Intrinsic < llvm_i1_ty, // slc(imm) llvm_i1_ty, // lwe(imm) llvm_i1_ty], // da(imm) - [IntrReadMem]>; + !if(NoMem, [IntrNoMem], [IntrReadMem])>; // Basic sample def int_amdgcn_image_sample : AMDGPUImageSample; @@ -417,7 +417,7 @@ def int_amdgcn_image_gather4_c_b_o : AMDGPUImageSample; def int_amdgcn_image_gather4_c_b_cl_o : AMDGPUImageSample; def int_amdgcn_image_gather4_c_lz_o : AMDGPUImageSample; -def int_amdgcn_image_getlod : AMDGPUImageSample; +def int_amdgcn_image_getlod : AMDGPUImageSample<1>; class AMDGPUImageAtomic : Intrinsic < [llvm_i32_ty], diff --git a/lib/Target/AMDGPU/MIMGInstructions.td b/lib/Target/AMDGPU/MIMGInstructions.td index 99a018d2e24..9622bccddf9 100644 --- a/lib/Target/AMDGPU/MIMGInstructions.td +++ b/lib/Target/AMDGPU/MIMGInstructions.td @@ -257,7 +257,11 @@ defm IMAGE_STORE : MIMG_Store <0x00000008, "image_store">; defm IMAGE_STORE_MIP : MIMG_Store <0x00000009, "image_store_mip">; //def IMAGE_STORE_PCK : MIMG_NoPattern_ <"image_store_pck", 0x0000000a>; //def IMAGE_STORE_MIP_PCK : MIMG_NoPattern_ <"image_store_mip_pck", 0x0000000b>; + +let mayLoad = 0, mayStore = 0 in { defm IMAGE_GET_RESINFO : MIMG_NoSampler <0x0000000e, "image_get_resinfo">; +} + defm IMAGE_ATOMIC_SWAP : MIMG_Atomic , "image_atomic_swap">; defm IMAGE_ATOMIC_CMPSWAP : MIMG_Atomic , "image_atomic_cmpswap", VReg_64>; defm IMAGE_ATOMIC_ADD : MIMG_Atomic , "image_atomic_add">; @@ -331,7 +335,11 @@ defm IMAGE_GATHER4_C_L_O : MIMG_Gather <0x0000005c, "image_gather4_c_l_o">; defm IMAGE_GATHER4_C_B_O : MIMG_Gather_WQM <0x0000005d, "image_gather4_c_b_o">; defm IMAGE_GATHER4_C_B_CL_O : MIMG_Gather_WQM <0x0000005e, "image_gather4_c_b_cl_o">; defm IMAGE_GATHER4_C_LZ_O : MIMG_Gather <0x0000005f, "image_gather4_c_lz_o">; + +let mayLoad = 0, mayStore = 0 in { defm IMAGE_GET_LOD : MIMG_Sampler_WQM <0x00000060, "image_get_lod">; +} + defm IMAGE_SAMPLE_CD : MIMG_Sampler <0x00000068, "image_sample_cd">; defm IMAGE_SAMPLE_CD_CL : MIMG_Sampler <0x00000069, "image_sample_cd_cl">; defm IMAGE_SAMPLE_C_CD : MIMG_Sampler <0x0000006a, "image_sample_c_cd">; diff --git a/lib/Target/AMDGPU/SIISelLowering.cpp b/lib/Target/AMDGPU/SIISelLowering.cpp index 9f9f0a9007f..92087623cec 100644 --- a/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/lib/Target/AMDGPU/SIISelLowering.cpp @@ -2957,8 +2957,11 @@ MachineBasicBlock *SITargetLowering::EmitInstrWithCustomInserter( if (MI.mayLoad()) Flags |= MachineMemOperand::MOLoad; - auto MMO = MF->getMachineMemOperand(PtrInfo, Flags, 0, 0); - MI.addMemOperand(*MF, MMO); + if (Flags != MachineMemOperand::MODereferenceable) { + auto MMO = MF->getMachineMemOperand(PtrInfo, Flags, 0, 0); + MI.addMemOperand(*MF, MMO); + } + return BB; } @@ -4233,6 +4236,16 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, return SDValue(DAG.getMachineNode(AMDGPU::WWM, DL, Src.getValueType(), Src), 0); } + case Intrinsic::amdgcn_image_getlod: + case Intrinsic::amdgcn_image_getresinfo: { + unsigned Idx = (IntrinsicID == Intrinsic::amdgcn_image_getresinfo) ? 3 : 4; + + // Replace dmask with everything disabled with undef. + const ConstantSDNode *DMask = dyn_cast(Op.getOperand(Idx)); + if (!DMask || DMask->isNullValue()) + return DAG.getUNDEF(Op.getValueType()); + return SDValue(); + } default: return Op; } @@ -4441,9 +4454,7 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op, case Intrinsic::amdgcn_image_sample_c_b_cl_o: case Intrinsic::amdgcn_image_sample_c_lz_o: case Intrinsic::amdgcn_image_sample_c_cd_o: - case Intrinsic::amdgcn_image_sample_c_cd_cl_o: - - case Intrinsic::amdgcn_image_getlod: { + case Intrinsic::amdgcn_image_sample_c_cd_cl_o: { // Replace dmask with everything disabled with undef. const ConstantSDNode *DMask = dyn_cast(Op.getOperand(5)); if (!DMask || DMask->isNullValue()) { @@ -6593,6 +6604,12 @@ SDNode *SITargetLowering::adjustWritemask(MachineSDNode *&Node, unsigned DmaskIdx = (Node->getNumOperands() - Node->getNumValues() == 9) ? 2 : 3; unsigned OldDmask = Node->getConstantOperandVal(DmaskIdx); unsigned NewDmask = 0; + bool HasChain = Node->getNumValues() > 1; + + if (OldDmask == 0) { + // These are folded out, but on the chance it happens don't assert. + return Node; + } // Try to figure out the used register components for (SDNode::use_iterator I = Node->use_begin(), E = Node->use_end(); @@ -6616,7 +6633,6 @@ SDNode *SITargetLowering::adjustWritemask(MachineSDNode *&Node, // Set which texture component corresponds to the lane. unsigned Comp; for (unsigned i = 0, Dmask = OldDmask; i <= Lane; i++) { - assert(Dmask); Comp = countTrailingZeros(Dmask); Dmask &= ~(1 << Comp); } @@ -6649,17 +6665,20 @@ SDNode *SITargetLowering::adjustWritemask(MachineSDNode *&Node, MVT SVT = Node->getValueType(0).getVectorElementType().getSimpleVT(); - auto NewVTList = - DAG.getVTList(BitsSet == 1 ? - SVT : MVT::getVectorVT(SVT, BitsSet == 3 ? 4 : BitsSet), - MVT::Other); + MVT ResultVT = BitsSet == 1 ? + SVT : MVT::getVectorVT(SVT, BitsSet == 3 ? 4 : BitsSet); + SDVTList NewVTList = HasChain ? + DAG.getVTList(ResultVT, MVT::Other) : DAG.getVTList(ResultVT); + MachineSDNode *NewNode = DAG.getMachineNode(NewOpcode, SDLoc(Node), NewVTList, Ops); - NewNode->setMemRefs(Node->memoperands_begin(), Node->memoperands_end()); - // Update chain. - DAG.ReplaceAllUsesOfValueWith(SDValue(Node, 1), SDValue(NewNode, 1)); + if (HasChain) { + // Update chain. + NewNode->setMemRefs(Node->memoperands_begin(), Node->memoperands_end()); + DAG.ReplaceAllUsesOfValueWith(SDValue(Node, 1), SDValue(NewNode, 1)); + } if (BitsSet == 1) { assert(Node->hasNUsesOfValue(1, 0)); diff --git a/test/CodeGen/AMDGPU/adjust-writemask-invalid-copy.ll b/test/CodeGen/AMDGPU/adjust-writemask-invalid-copy.ll index d8cf67af7b0..e967723384b 100644 --- a/test/CodeGen/AMDGPU/adjust-writemask-invalid-copy.ll +++ b/test/CodeGen/AMDGPU/adjust-writemask-invalid-copy.ll @@ -1,11 +1,11 @@ ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s -; GCN-LABEL: {{^}}adjust_writemask_crash_0: +; GCN-LABEL: {{^}}adjust_writemask_crash_0_nochain: ; GCN: image_get_lod v0, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}} dmask:0x2 ; GCN-NOT: v1 ; GCN-NOT: v0 ; GCN: buffer_store_dword v0 -define amdgpu_ps void @adjust_writemask_crash_0() #0 { +define amdgpu_ps void @adjust_writemask_crash_0_nochain() #0 { main_body: %tmp = call <2 x float> @llvm.amdgcn.image.getlod.v2f32.v2f32.v8i32(<2 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 3, i1 false, i1 false, i1 false, i1 false, i1 false) %tmp1 = bitcast <2 x float> %tmp to <2 x i32> @@ -16,12 +16,12 @@ main_body: ret void } -; GCN-LABEL: {{^}}adjust_writemask_crash_1: +; GCN-LABEL: {{^}}adjust_writemask_crash_1_nochain: ; GCN: image_get_lod v0, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}} dmask:0x1 ; GCN-NOT: v1 ; GCN-NOT: v0 ; GCN: buffer_store_dword v0 -define amdgpu_ps void @adjust_writemask_crash_1() #0 { +define amdgpu_ps void @adjust_writemask_crash_1_nochain() #0 { main_body: %tmp = call <2 x float> @llvm.amdgcn.image.getlod.v2f32.v2f32.v8i32(<2 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 3, i1 false, i1 false, i1 false, i1 false, i1 false) %tmp1 = bitcast <2 x float> %tmp to <2 x i32> @@ -32,6 +32,38 @@ main_body: ret void } +; GCN-LABEL: {{^}}adjust_writemask_crash_0_chain: +; GCN: image_sample v0, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}} dmask:0x2 +; GCN-NOT: v1 +; GCN-NOT: v0 +; GCN: buffer_store_dword v0 +define amdgpu_ps void @adjust_writemask_crash_0_chain() #0 { +main_body: + %tmp = call <2 x float> @llvm.amdgcn.image.sample.v2f32.v2f32.v8i32(<2 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 3, i1 false, i1 false, i1 false, i1 false, i1 false) + %tmp1 = bitcast <2 x float> %tmp to <2 x i32> + %tmp2 = shufflevector <2 x i32> %tmp1, <2 x i32> undef, <4 x i32> + %tmp3 = bitcast <4 x i32> %tmp2 to <4 x float> + %tmp4 = extractelement <4 x float> %tmp3, i32 0 + store volatile float %tmp4, float addrspace(1)* undef + ret void +} + +; GCN-LABEL: {{^}}adjust_writemask_crash_1_chain: +; GCN: image_sample v0, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}} dmask:0x1 +; GCN-NOT: v1 +; GCN-NOT: v0 +; GCN: buffer_store_dword v0 +define amdgpu_ps void @adjust_writemask_crash_1_chain() #0 { +main_body: + %tmp = call <2 x float> @llvm.amdgcn.image.sample.v2f32.v2f32.v8i32(<2 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 3, i1 false, i1 false, i1 false, i1 false, i1 false) + %tmp1 = bitcast <2 x float> %tmp to <2 x i32> + %tmp2 = shufflevector <2 x i32> %tmp1, <2 x i32> undef, <4 x i32> + %tmp3 = bitcast <4 x i32> %tmp2 to <4 x float> + %tmp4 = extractelement <4 x float> %tmp3, i32 1 + store volatile float %tmp4, float addrspace(1)* undef + ret void +} + define amdgpu_ps void @adjust_writemask_crash_0_v4() #0 { main_body: %tmp = call <4 x float> @llvm.amdgcn.image.getlod.v4f32.v2f32.v8i32(<2 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 5, i1 false, i1 false, i1 false, i1 false, i1 false) @@ -44,6 +76,7 @@ main_body: } +declare <2 x float> @llvm.amdgcn.image.sample.v2f32.v2f32.v8i32(<2 x float>, <8 x i32>, <4 x i32>, i32, i1, i1, i1, i1, i1) #1 declare <2 x float> @llvm.amdgcn.image.getlod.v2f32.v2f32.v8i32(<2 x float>, <8 x i32>, <4 x i32>, i32, i1, i1, i1, i1, i1) #1 declare <4 x float> @llvm.amdgcn.image.getlod.v4f32.v2f32.v8i32(<2 x float>, <8 x i32>, <4 x i32>, i32, i1, i1, i1, i1, i1) #1 diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.image.getlod.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.image.getlod.ll index 2e78e2a4c6f..dfe4aff7bc1 100644 --- a/test/CodeGen/AMDGPU/llvm.amdgcn.image.getlod.ll +++ b/test/CodeGen/AMDGPU/llvm.amdgcn.image.getlod.ll @@ -3,6 +3,8 @@ ; GCN-LABEL: {{^}}getlod: ; GCN: image_get_lod {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf da +; GCN: s_waitcnt vmcnt(0) +; GCN: store_dwordx4 define amdgpu_kernel void @getlod(<4 x float> addrspace(1)* %out) { main_body: %r = call <4 x float> @llvm.amdgcn.image.getlod.v4f32.f32.v8i32(float undef, <8 x i32> undef, <4 x i32> undef, i32 15, i1 0, i1 0, i1 0, i1 0, i1 1) @@ -12,6 +14,8 @@ main_body: ; GCN-LABEL: {{^}}getlod_v2: ; GCN: image_get_lod {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf da +; GCN: s_waitcnt vmcnt(0) +; GCN: store_dwordx4 define amdgpu_kernel void @getlod_v2(<4 x float> addrspace(1)* %out) { main_body: %r = call <4 x float> @llvm.amdgcn.image.getlod.v4f32.v2f32.v8i32(<2 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i1 0, i1 0, i1 0, i1 0, i1 1) @@ -21,6 +25,8 @@ main_body: ; GCN-LABEL: {{^}}getlod_v4: ; GCN: image_get_lod {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf da +; GCN: s_waitcnt vmcnt(0) +; GCN: store_dwordx4 define amdgpu_kernel void @getlod_v4(<4 x float> addrspace(1)* %out) { main_body: %r = call <4 x float> @llvm.amdgcn.image.getlod.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i1 0, i1 0, i1 0, i1 0, i1 1) diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.image.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.image.ll index 42c87056746..d9be4a4d019 100644 --- a/test/CodeGen/AMDGPU/llvm.amdgcn.image.ll +++ b/test/CodeGen/AMDGPU/llvm.amdgcn.image.ll @@ -129,6 +129,8 @@ main_body: ; GCN-LABEL: {{^}}getresinfo: ; GCN-NOT: s_waitcnt ; GCN: image_get_resinfo {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf +; GCN: s_waitcnt vmcnt(0) +; GCN: exp define amdgpu_ps void @getresinfo() #0 { main_body: %r = call <4 x float> @llvm.amdgcn.image.getresinfo.v4f32.i32.v8i32(i32 undef, <8 x i32> undef, i32 15, i1 false, i1 false, i1 false, i1 false) @@ -140,6 +142,19 @@ main_body: ret void } +; GCN-LABEL: {{^}}getresinfo_dmask0: +; GCN-NOT: image_get_resinfo +define amdgpu_ps void @getresinfo_dmask0() #0 { +main_body: + %r = call <4 x float> @llvm.amdgcn.image.getresinfo.v4f32.i32.v8i32(i32 undef, <8 x i32> undef, i32 0, i1 false, i1 false, i1 false, i1 false) + %r0 = extractelement <4 x float> %r, i32 0 + %r1 = extractelement <4 x float> %r, i32 1 + %r2 = extractelement <4 x float> %r, i32 2 + %r3 = extractelement <4 x float> %r, i32 3 + call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %r0, float %r1, float %r2, float %r3, i1 true, i1 true) #0 + ret void +} + ; Ideally, the register allocator would avoid the wait here ; ; GCN-LABEL: {{^}}image_store_wait: @@ -186,9 +201,10 @@ declare <4 x float> @llvm.amdgcn.image.load.v4f32.i32.v8i32(i32, <8 x i32>, i32, declare <4 x float> @llvm.amdgcn.image.load.v4f32.v2i32.v8i32(<2 x i32>, <8 x i32>, i32, i1, i1, i1, i1) #1 declare <4 x float> @llvm.amdgcn.image.load.v4f32.v4i32.v8i32(<4 x i32>, <8 x i32>, i32, i1, i1, i1, i1) #1 declare <4 x float> @llvm.amdgcn.image.load.mip.v4f32.v4i32.v8i32(<4 x i32>, <8 x i32>, i32, i1, i1, i1, i1) #1 -declare <4 x float> @llvm.amdgcn.image.getresinfo.v4f32.i32.v8i32(i32, <8 x i32>, i32, i1, i1, i1, i1) #1 +declare <4 x float> @llvm.amdgcn.image.getresinfo.v4f32.i32.v8i32(i32, <8 x i32>, i32, i1, i1, i1, i1) #2 declare void @llvm.amdgcn.exp.f32(i32, i32, float, float, float, float, i1, i1) #0 attributes #0 = { nounwind } attributes #1 = { nounwind readonly } +attributes #2 = { nounwind readnone }