From: Tim Renouf Date: Mon, 11 Sep 2017 13:55:39 +0000 (+0000) Subject: [AMDGPU] exp should not be in WQM mode X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=2c5cb5f335cb5af7aaf0955b204c3c3a66891cc8;p=llvm [AMDGPU] exp should not be in WQM mode An MRT exp with vm=1 must be in exact (non-WQM) mode, as it also exports the exec mask as the valid mask to determine which pixels to render. This commit marks any exp as needing to be in exact mode. Strictly, if there are multiple MRT exps, only one needs to have vm=1, and only that one needs to be in exact mode, but that is an optimization for another day. Differential Revision: https://reviews.llvm.org/D36305 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@312915 91177308-0d34-0410-b5e6-96231b3b80d8 --- diff --git a/lib/Target/AMDGPU/SIInstrInfo.td b/lib/Target/AMDGPU/SIInstrInfo.td index eb94c00e1ca..f908abb3ed8 100644 --- a/lib/Target/AMDGPU/SIInstrInfo.td +++ b/lib/Target/AMDGPU/SIInstrInfo.td @@ -849,7 +849,7 @@ class EXP_Helper : EXPCommon< // Split EXP instruction into EXP and EXP_DONE so we can set // mayLoad for done=1. 
multiclass EXP_m { - let mayLoad = done in { + let mayLoad = done, DisableWQM = 1 in { let isPseudo = 1, isCodeGenOnly = 1 in { def "" : EXP_Helper, SIMCInstr <"exp"#!if(done, "_done", ""), SIEncodingFamily.NONE>; diff --git a/test/CodeGen/AMDGPU/spill-m0.ll b/test/CodeGen/AMDGPU/spill-m0.ll index 1147464c1dd..a7b522165ab 100644 --- a/test/CodeGen/AMDGPU/spill-m0.ll +++ b/test/CodeGen/AMDGPU/spill-m0.ll @@ -66,13 +66,13 @@ endif: ; TOSMEM-NOT: s_m0 ; TOSMEM: s_add_u32 m0, s7, 0x100 -; TOSMEM-NEXT: s_buffer_store_dword s{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, m0 ; 4-byte Folded Spill +; TOSMEM-NEXT: s_buffer_store_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, m0 ; 8-byte Folded Spill ; FIXME: RegScavenger::isRegUsed() always returns true if m0 is reserved, so we have to save and restore it ; FIXME-TOSMEM-NOT: m0 ; FIXME-TOSMEM-NOT: m0 -; TOSMEM: s_add_u32 m0, s7, 0x200 -; TOSMEM: s_buffer_store_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, m0 ; 8-byte Folded Spill +; TOSMEM: s_add_u32 m0, s7, 0x300 +; TOSMEM: s_buffer_store_dword s{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, m0 ; 4-byte Folded Spill ; FIXME-TOSMEM-NOT: m0 ; TOSMEM: s_mov_b64 exec, @@ -80,7 +80,7 @@ endif: ; TOSMEM: s_branch ; TOSMEM: BB{{[0-9]+_[0-9]+}}: -; TOSMEM: s_add_u32 m0, s7, 0x200 +; TOSMEM: s_add_u32 m0, s7, 0x400 ; TOSMEM-NEXT: s_buffer_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, m0 ; 8-byte Folded Reload diff --git a/test/CodeGen/AMDGPU/wqm.ll b/test/CodeGen/AMDGPU/wqm.ll index 12fb6cd4d6f..e403ad128a0 100644 --- a/test/CodeGen/AMDGPU/wqm.ll +++ b/test/CodeGen/AMDGPU/wqm.ll @@ -12,24 +12,33 @@ main_body: ret <4 x float> %tex } -; Check that WQM is triggered by image samples and left untouched for loads... 
+; Check that WQM is triggered by code calculating inputs to image samples and is disabled as soon as possible ; ;CHECK-LABEL: {{^}}test2: ;CHECK-NEXT: ; %main_body +;CHECK-NEXT: s_mov_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], exec ;CHECK-NEXT: s_wqm_b64 exec, exec +;CHECK: interp +;CHECK: s_and_b64 exec, exec, [[ORIG]] +;CHECK-NOT: interp +;CHECK: image_sample ;CHECK-NOT: exec -define amdgpu_ps void @test2(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, float addrspace(1)* inreg %ptr, <4 x float> %c) { +;CHECK: .size test2 +define amdgpu_ps <4 x float> @test2(i32 inreg, i32 inreg, i32 inreg, i32 inreg %m0, <8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, <2 x float> %pos) #6 { main_body: - %c.1 = call <4 x float> @llvm.amdgcn.image.sample.v4f32.v4f32.v8i32(<4 x float> %c, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false) #0 - %c.2 = bitcast <4 x float> %c.1 to <4 x i32> - %c.3 = extractelement <4 x i32> %c.2, i32 0 - %gep = getelementptr float, float addrspace(1)* %ptr, i32 %c.3 - %data = load float, float addrspace(1)* %gep - call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %data, float undef, float undef, float undef, i1 true, i1 true) #1 - ret void + %inst23 = extractelement <2 x float> %pos, i32 0 + %inst24 = extractelement <2 x float> %pos, i32 1 + %inst25 = tail call float @llvm.amdgcn.interp.p1(float %inst23, i32 0, i32 0, i32 %m0) + %inst26 = tail call float @llvm.amdgcn.interp.p2(float %inst25, float %inst24, i32 0, i32 0, i32 %m0) + %inst27 = insertelement <2 x float> undef, float %inst26, i32 0 + %inst28 = tail call float @llvm.amdgcn.interp.p1(float %inst23, i32 1, i32 0, i32 %m0) + %inst29 = tail call float @llvm.amdgcn.interp.p2(float %inst28, float %inst24, i32 1, i32 0, i32 %m0) + %inst30 = insertelement <2 x float> %inst27, float %inst29, i32 1 + %tex = call <4 x float> @llvm.amdgcn.image.sample.v4f32.v2f32.v8i32(<2 x float> %inst30, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i1 false, i1 false, i1 false, 
i1 false, i1 false) #0 + ret <4 x float> %tex } -; ... but disabled for stores (and, in this simple case, not re-enabled). +; ... but disabled for stores (and, in this simple case, not re-enabled) ... ; ;CHECK-LABEL: {{^}}test3: ;CHECK-NEXT: ; %main_body @@ -51,6 +60,36 @@ main_body: ret <4 x float> %tex } +; ... and disabled for export. +; +;CHECK-LABEL: {{^}}test3x: +;CHECK-NEXT: ; %main_body +;CHECK-NEXT: s_mov_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], exec +;CHECK-NEXT: s_wqm_b64 exec, exec +;CHECK: s_and_b64 exec, exec, [[ORIG]] +;CHECK: image_sample +;CHECK: exp +;CHECK-NOT: exec +;CHECK: .size test3x +define amdgpu_ps void @test3x(i32 inreg, i32 inreg, i32 inreg, i32 inreg %m0, <8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, <2 x float> %pos) #6 { +main_body: + %inst23 = extractelement <2 x float> %pos, i32 0 + %inst24 = extractelement <2 x float> %pos, i32 1 + %inst25 = tail call float @llvm.amdgcn.interp.p1(float %inst23, i32 0, i32 0, i32 %m0) + %inst26 = tail call float @llvm.amdgcn.interp.p2(float %inst25, float %inst24, i32 0, i32 0, i32 %m0) + %inst27 = insertelement <2 x float> undef, float %inst26, i32 0 + %inst28 = tail call float @llvm.amdgcn.interp.p1(float %inst23, i32 1, i32 0, i32 %m0) + %inst29 = tail call float @llvm.amdgcn.interp.p2(float %inst28, float %inst24, i32 1, i32 0, i32 %m0) + %inst30 = insertelement <2 x float> %inst27, float %inst29, i32 1 + %tex = call <4 x float> @llvm.amdgcn.image.sample.v4f32.v2f32.v8i32(<2 x float> %inst30, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false) #0 + %tex.0 = extractelement <4 x float> %tex, i32 0 + %tex.1 = extractelement <4 x float> %tex, i32 1 + %tex.2 = extractelement <4 x float> %tex, i32 2 + %tex.3 = extractelement <4 x float> %tex, i32 3 + call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %tex.0, float %tex.1, float %tex.2, float %tex.3, i1 true, i1 true) + ret void +} + ; Check that WQM is re-enabled when required. 
; ;CHECK-LABEL: {{^}}test4: @@ -724,9 +763,14 @@ declare i32 @llvm.amdgcn.wwm.i32(i32) #3 declare i32 @llvm.amdgcn.set.inactive.i32(i32, i32) #4 declare i32 @llvm.amdgcn.mbcnt.lo(i32, i32) #3 declare i32 @llvm.amdgcn.mbcnt.hi(i32, i32) #3 +declare <2 x half> @llvm.amdgcn.cvt.pkrtz(float, float) #3 +declare void @llvm.amdgcn.exp.compr.v2f16(i32, i32, <2 x half>, <2 x half>, i1, i1) #1 +declare float @llvm.amdgcn.interp.p1(float, i32, i32, i32) #2 +declare float @llvm.amdgcn.interp.p2(float, float, i32, i32, i32) #2 attributes #1 = { nounwind } attributes #2 = { nounwind readonly } attributes #3 = { nounwind readnone } attributes #4 = { nounwind readnone convergent } attributes #5 = { "amdgpu-ps-wqm-outputs" } +attributes #6 = { nounwind "InitialPSInputAddr"="2" }