[AMDGPU] exp should not be in WQM mode

author Tim Renouf <tim.renouf@amd.com>

Mon, 11 Sep 2017 13:55:39 +0000 (13:55 +0000)

committer Tim Renouf <tim.renouf@amd.com>

Mon, 11 Sep 2017 13:55:39 +0000 (13:55 +0000)
author Tim Renouf <tim.renouf@amd.com>
Mon, 11 Sep 2017 13:55:39 +0000 (13:55 +0000)
committer Tim Renouf <tim.renouf@amd.com>
Mon, 11 Sep 2017 13:55:39 +0000 (13:55 +0000)
diff --git a/lib/Target/AMDGPU/SIInstrInfo.td b/lib/Target/AMDGPU/SIInstrInfo.td

index eb94c00e1caf5b4fbd98169b723f7436e155b43f..f908abb3ed8c0d4d02749d653fa5ed7f682f9796 100644 (file)
--- a/lib/Target/AMDGPU/SIInstrInfo.td
+++ b/lib/Target/AMDGPU/SIInstrInfo.td
@@ -849,7 +849,7 @@ class EXP_Helper<bit done, SDPatternOperator node = null_frag> : EXPCommon<
  // Split EXP instruction into EXP and EXP_DONE so we can set
  // mayLoad for done=1.
  multiclass EXP_m<bit done, SDPatternOperator node> {
-  let mayLoad = done in {
+  let mayLoad = done, DisableWQM = 1 in {
      let isPseudo = 1, isCodeGenOnly = 1 in {
        def "" : EXP_Helper<done, node>,
                 SIMCInstr <"exp"#!if(done, "_done", ""), SIEncodingFamily.NONE>;
diff --git a/test/CodeGen/AMDGPU/spill-m0.ll b/test/CodeGen/AMDGPU/spill-m0.ll

index 1147464c1ddae2652b555c96266ccbd8cd1ed29e..a7b522165ab891b714c3262f02d5425e30326d3a 100644 (file)
--- a/test/CodeGen/AMDGPU/spill-m0.ll
+++ b/test/CodeGen/AMDGPU/spill-m0.ll
@@ -66,13 +66,13 @@ endif:
  
  ; TOSMEM-NOT: s_m0
  ; TOSMEM: s_add_u32 m0, s7, 0x100
-; TOSMEM-NEXT: s_buffer_store_dword s{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, m0 ; 4-byte Folded Spill
+; TOSMEM-NEXT: s_buffer_store_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, m0 ; 8-byte Folded Spill
  ; FIXME: RegScavenger::isRegUsed() always returns true if m0 is reserved, so we have to save and restore it
  ; FIXME-TOSMEM-NOT: m0
  
  ; FIXME-TOSMEM-NOT: m0
-; TOSMEM: s_add_u32 m0, s7, 0x200
-; TOSMEM: s_buffer_store_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, m0 ; 8-byte Folded Spill
+; TOSMEM: s_add_u32 m0, s7, 0x300
+; TOSMEM: s_buffer_store_dword s{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, m0 ; 4-byte Folded Spill
  ; FIXME-TOSMEM-NOT: m0
  
  ; TOSMEM: s_mov_b64 exec,
@@ -80,7 +80,7 @@ endif:
  ; TOSMEM: s_branch
  
  ; TOSMEM: BB{{[0-9]+_[0-9]+}}:
-; TOSMEM: s_add_u32 m0, s7, 0x200
+; TOSMEM: s_add_u32 m0, s7, 0x400
  ; TOSMEM-NEXT: s_buffer_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, m0 ; 8-byte Folded Reload
  
  
diff --git a/test/CodeGen/AMDGPU/wqm.ll b/test/CodeGen/AMDGPU/wqm.ll

index 12fb6cd4d6f3b9637bedb2ec35cacb38e8cb4bed..e403ad128a06ca530763d3ae56d9a4bd402550d4 100644 (file)
--- a/test/CodeGen/AMDGPU/wqm.ll
+++ b/test/CodeGen/AMDGPU/wqm.ll
@@ -12,24 +12,33 @@ main_body:
    ret <4 x float> %tex
  }
  
-; Check that WQM is triggered by image samples and left untouched for loads...
+; Check that WQM is triggered by code calculating inputs to image samples and is disabled as soon as possible
  ;
  ;CHECK-LABEL: {{^}}test2:
  ;CHECK-NEXT: ; %main_body
+;CHECK-NEXT: s_mov_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], exec
  ;CHECK-NEXT: s_wqm_b64 exec, exec
+;CHECK: interp
+;CHECK: s_and_b64 exec, exec, [[ORIG]]
+;CHECK-NOT: interp
+;CHECK: image_sample
  ;CHECK-NOT: exec
-define amdgpu_ps void @test2(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, float addrspace(1)* inreg %ptr, <4 x float> %c) {
+;CHECK: .size test2
+define amdgpu_ps <4 x float> @test2(i32 inreg, i32 inreg, i32 inreg, i32 inreg %m0, <8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, <2 x float> %pos) #6 {
  main_body:
-  %c.1 = call <4 x float> @llvm.amdgcn.image.sample.v4f32.v4f32.v8i32(<4 x float> %c, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false) #0
-  %c.2 = bitcast <4 x float> %c.1 to <4 x i32>
-  %c.3 = extractelement <4 x i32> %c.2, i32 0
-  %gep = getelementptr float, float addrspace(1)* %ptr, i32 %c.3
-  %data = load float, float addrspace(1)* %gep
-  call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %data, float undef, float undef, float undef, i1 true, i1 true) #1
-  ret void
+  %inst23 = extractelement <2 x float> %pos, i32 0
+  %inst24 = extractelement <2 x float> %pos, i32 1
+  %inst25 = tail call float @llvm.amdgcn.interp.p1(float %inst23, i32 0, i32 0, i32 %m0)
+  %inst26 = tail call float @llvm.amdgcn.interp.p2(float %inst25, float %inst24, i32 0, i32 0, i32 %m0)
+  %inst27 = insertelement <2 x float> undef, float %inst26, i32 0
+  %inst28 = tail call float @llvm.amdgcn.interp.p1(float %inst23, i32 1, i32 0, i32 %m0)
+  %inst29 = tail call float @llvm.amdgcn.interp.p2(float %inst28, float %inst24, i32 1, i32 0, i32 %m0)
+  %inst30 = insertelement <2 x float> %inst27, float %inst29, i32 1
+  %tex = call <4 x float> @llvm.amdgcn.image.sample.v4f32.v2f32.v8i32(<2 x float> %inst30, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false) #0
+  ret <4 x float> %tex
  }
  
-; ... but disabled for stores (and, in this simple case, not re-enabled).
+; ... but disabled for stores (and, in this simple case, not re-enabled) ...
  ;
  ;CHECK-LABEL: {{^}}test3:
  ;CHECK-NEXT: ; %main_body
@@ -51,6 +60,36 @@ main_body:
    ret <4 x float> %tex
  }
  
+; ... and disabled for export.
+;
+;CHECK-LABEL: {{^}}test3x:
+;CHECK-NEXT: ; %main_body
+;CHECK-NEXT: s_mov_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], exec
+;CHECK-NEXT: s_wqm_b64 exec, exec
+;CHECK: s_and_b64 exec, exec, [[ORIG]]
+;CHECK: image_sample
+;CHECK: exp
+;CHECK-NOT: exec
+;CHECK: .size test3x
+define amdgpu_ps void @test3x(i32 inreg, i32 inreg, i32 inreg, i32 inreg %m0, <8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, <2 x float> %pos) #6 {
+main_body:
+  %inst23 = extractelement <2 x float> %pos, i32 0
+  %inst24 = extractelement <2 x float> %pos, i32 1
+  %inst25 = tail call float @llvm.amdgcn.interp.p1(float %inst23, i32 0, i32 0, i32 %m0)
+  %inst26 = tail call float @llvm.amdgcn.interp.p2(float %inst25, float %inst24, i32 0, i32 0, i32 %m0)
+  %inst27 = insertelement <2 x float> undef, float %inst26, i32 0
+  %inst28 = tail call float @llvm.amdgcn.interp.p1(float %inst23, i32 1, i32 0, i32 %m0)
+  %inst29 = tail call float @llvm.amdgcn.interp.p2(float %inst28, float %inst24, i32 1, i32 0, i32 %m0)
+  %inst30 = insertelement <2 x float> %inst27, float %inst29, i32 1
+  %tex = call <4 x float> @llvm.amdgcn.image.sample.v4f32.v2f32.v8i32(<2 x float> %inst30, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false) #0
+  %tex.0 = extractelement <4 x float> %tex, i32 0
+  %tex.1 = extractelement <4 x float> %tex, i32 1
+  %tex.2 = extractelement <4 x float> %tex, i32 2
+  %tex.3 = extractelement <4 x float> %tex, i32 3
+  call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %tex.0, float %tex.1, float %tex.2, float %tex.3, i1 true, i1 true)
+  ret void
+}
+
  ; Check that WQM is re-enabled when required.
  ;
  ;CHECK-LABEL: {{^}}test4:
@@ -724,9 +763,14 @@ declare i32 @llvm.amdgcn.wwm.i32(i32) #3
  declare i32 @llvm.amdgcn.set.inactive.i32(i32, i32) #4
  declare i32 @llvm.amdgcn.mbcnt.lo(i32, i32) #3
  declare i32 @llvm.amdgcn.mbcnt.hi(i32, i32) #3
+declare <2 x half> @llvm.amdgcn.cvt.pkrtz(float, float) #3
+declare void @llvm.amdgcn.exp.compr.v2f16(i32, i32, <2 x half>, <2 x half>, i1, i1) #1
+declare float @llvm.amdgcn.interp.p1(float, i32, i32, i32) #2
+declare float @llvm.amdgcn.interp.p2(float, float, i32, i32, i32) #2
  
  attributes #1 = { nounwind }
  attributes #2 = { nounwind readonly }
  attributes #3 = { nounwind readnone }
  attributes #4 = { nounwind readnone convergent }
  attributes #5 = { "amdgpu-ps-wqm-outputs" }
+attributes #6 = { nounwind "InitialPSInputAddr"="2" }
author	Tim Renouf <tim.renouf@amd.com>
	Mon, 11 Sep 2017 13:55:39 +0000 (13:55 +0000)
committer	Tim Renouf <tim.renouf@amd.com>
	Mon, 11 Sep 2017 13:55:39 +0000 (13:55 +0000)
lib/Target/AMDGPU/SIInstrInfo.td		patch \| blob \| history
test/CodeGen/AMDGPU/spill-m0.ll		patch \| blob \| history
test/CodeGen/AMDGPU/wqm.ll		patch \| blob \| history