AMDGPU: Undo sub x, c -> add x, -c canonicalization

author Matt Arsenault <Matthew.Arsenault@amd.com>

Mon, 30 Jan 2017 19:30:24 +0000 (19:30 +0000)

committer Matt Arsenault <Matthew.Arsenault@amd.com>

Mon, 30 Jan 2017 19:30:24 +0000 (19:30 +0000)
author Matt Arsenault <Matthew.Arsenault@amd.com>
Mon, 30 Jan 2017 19:30:24 +0000 (19:30 +0000)
committer Matt Arsenault <Matthew.Arsenault@amd.com>
Mon, 30 Jan 2017 19:30:24 +0000 (19:30 +0000)
diff --git a/lib/Target/AMDGPU/SIInstrInfo.td b/lib/Target/AMDGPU/SIInstrInfo.td

index cfa17739829730dff0bddd02d67e85c5be6a2ec3..6f4746b7559a3170cb08465a3abc7036a19d70d3 100644 (file)
--- a/lib/Target/AMDGPU/SIInstrInfo.td
+++ b/lib/Target/AMDGPU/SIInstrInfo.td
@@ -295,6 +295,19 @@ class VGPRImm <dag frag> : PatLeaf<frag, [{
    return Limit < 10;
  }]>;
  
+def NegateImm : SDNodeXForm<imm, [{
+  return CurDAG->getConstant(-N->getSExtValue(), SDLoc(N), MVT::i32);
+}]>;
+
+// TODO: Extend this once FP inline immediate values work?
+def NegSubInlineConst32 : ImmLeaf<i32, [{
+  return Imm < -16 && Imm >= -64;
+}], NegateImm>;
+
+def NegSubInlineConst16 : ImmLeaf<i16, [{
+  return Imm < -16 && Imm >= -64;
+}], NegateImm>;
+
  //===----------------------------------------------------------------------===//
  // Custom Operands
  //===----------------------------------------------------------------------===//
diff --git a/lib/Target/AMDGPU/SIInstructions.td b/lib/Target/AMDGPU/SIInstructions.td

index 813abc915d914547135dd3ffb87c55c39912a1e8..06516b24f329f60a3f99dad05b3f09b06f3a0b55 100644 (file)
--- a/lib/Target/AMDGPU/SIInstructions.td
+++ b/lib/Target/AMDGPU/SIInstructions.td
@@ -1125,6 +1125,15 @@ def : SHA256MaPattern <V_BFI_B32, V_XOR_B32_e64>;
  def : IntMed3Pat<V_MED3_I32, smax, smax_oneuse, smin_oneuse>;
  def : IntMed3Pat<V_MED3_U32, umax, umax_oneuse, umin_oneuse>;
  
+
+// Undo sub x, c -> add x, -c canonicalization since c is more likely
+// an inline immediate than -c.
+// TODO: Also do for 64-bit.
+def : Pat<
+  (add i32:$src0, (i32 NegSubInlineConst32:$src1)),
+  (S_SUB_I32 $src0, NegSubInlineConst32:$src1)
+>;
+
  //============================================================================//
  // Assembler aliases
  //============================================================================//
diff --git a/lib/Target/AMDGPU/VOP2Instructions.td b/lib/Target/AMDGPU/VOP2Instructions.td

index 78b4e2ec49a4646d0759a8220c43bb6f448d1866..cd5f044ecd5388f74f0dc7eb38d3e64206cb3980 100644 (file)
--- a/lib/Target/AMDGPU/VOP2Instructions.td
+++ b/lib/Target/AMDGPU/VOP2Instructions.td
@@ -494,6 +494,14 @@ def : Pat <
    (V_CNDMASK_B32_e64 (i32 0), (i32 -1), $src)
  >;
  
+// Undo sub x, c -> add x, -c canonicalization since c is more likely
+// an inline immediate than -c.
+// TODO: Also do for 64-bit.
+def : Pat<
+  (add i16:$src0, (i16 NegSubInlineConst16:$src1)),
+  (V_SUB_U16_e64 $src0, NegSubInlineConst16:$src1)
+>;
+
  } // End Predicates = [isVI]
  
  //===----------------------------------------------------------------------===//
diff --git a/test/CodeGen/AMDGPU/s_addk_i32.ll b/test/CodeGen/AMDGPU/s_addk_i32.ll

index f776faca8397316c10d76032eae6014db37ed861..acceb3272fc3d5c85aecb6d55c1d38214acf1136 100644 (file)
--- a/test/CodeGen/AMDGPU/s_addk_i32.ll
+++ b/test/CodeGen/AMDGPU/s_addk_i32.ll
@@ -37,7 +37,7 @@ define void @s_addk_i32_k1(i32 addrspace(1)* %out, i32 %b) {
  }
  
  ; SI-LABEL: {{^}}s_addk_i32_k2:
-; SI: s_addk_i32 {{s[0-9]+}}, 0xffef{{$}}
+; SI: s_sub_i32 s{{[0-9]+}}, s{{[0-9]+}}, 17
  ; SI: s_endpgm
  define void @s_addk_i32_k2(i32 addrspace(1)* %out, i32 %b) {
    %add = add i32 %b, -17
@@ -45,6 +45,15 @@ define void @s_addk_i32_k2(i32 addrspace(1)* %out, i32 %b) {
    ret void
  }
  
+; SI-LABEL: {{^}}s_addk_i32_k3:
+; SI: s_addk_i32 {{s[0-9]+}}, 0xffbf{{$}}
+; SI: s_endpgm
+define void @s_addk_i32_k3(i32 addrspace(1)* %out, i32 %b) {
+  %add = add i32 %b, -65
+  store i32 %add, i32 addrspace(1)* %out
+  ret void
+}
+
  ; SI-LABEL: {{^}}s_addk_v2i32_k0:
  ; SI-DAG: s_addk_i32 {{s[0-9]+}}, 0x41
  ; SI-DAG: s_addk_i32 {{s[0-9]+}}, 0x42
diff --git a/test/CodeGen/AMDGPU/shrink-add-sub-constant.ll b/test/CodeGen/AMDGPU/shrink-add-sub-constant.ll

new file mode 100644 (file)

index 0000000..08d44cc
--- /dev/null
+++ b/test/CodeGen/AMDGPU/shrink-add-sub-constant.ll
@@ -0,0 +1,186 @@
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
+
+; Test that add/sub of a constant is swapped to sub/add of the negated
+; constant when only the negated value is an inline immediate, reducing code size.
+
+; GCN-LABEL: {{^}}v_test_i32_x_sub_64:
+; GCN: {{buffer|flat}}_load_dword [[X:v[0-9]+]]
+; GCN: v_subrev_i32_e32 v{{[0-9]+}}, vcc, 64, [[X]]
+define void @v_test_i32_x_sub_64(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %tid.ext = sext i32 %tid to i64
+  %gep = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %tid.ext
+  %gep.out = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %tid.ext
+  %x = load i32, i32 addrspace(1)* %gep
+  %result = sub i32 %x, 64
+  store i32 %result, i32 addrspace(1)* %gep.out
+  ret void
+}
+
+; GCN-LABEL: {{^}}v_test_i32_x_sub_64_multi_use:
+; GCN: {{buffer|flat}}_load_dword [[X:v[0-9]+]]
+; GCN: {{buffer|flat}}_load_dword [[Y:v[0-9]+]]
+; GCN-DAG: v_subrev_i32_e32 v{{[0-9]+}}, vcc, 64, [[X]]
+; GCN-DAG: v_subrev_i32_e32 v{{[0-9]+}}, vcc, 64, [[Y]]
+define void @v_test_i32_x_sub_64_multi_use(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %tid.ext = sext i32 %tid to i64
+  %gep = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %tid.ext
+  %gep.out = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %tid.ext
+  %x = load volatile i32, i32 addrspace(1)* %gep
+  %y = load volatile i32, i32 addrspace(1)* %gep
+  %result0 = sub i32 %x, 64
+  %result1 = sub i32 %y, 64
+  store volatile i32 %result0, i32 addrspace(1)* %gep.out
+  store volatile i32 %result1, i32 addrspace(1)* %gep.out
+  ret void
+}
+
+; GCN-LABEL: {{^}}v_test_i32_64_sub_x:
+; GCN: {{buffer|flat}}_load_dword [[X:v[0-9]+]]
+; GCN: v_sub_i32_e32 v{{[0-9]+}}, vcc, 64, [[X]]
+define void @v_test_i32_64_sub_x(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %tid.ext = sext i32 %tid to i64
+  %gep = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %tid.ext
+  %gep.out = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %tid.ext
+  %x = load i32, i32 addrspace(1)* %gep
+  %result = sub i32 64, %x
+  store i32 %result, i32 addrspace(1)* %gep.out
+  ret void
+}
+
+; GCN-LABEL: {{^}}v_test_i32_x_sub_65:
+; GCN: {{buffer|flat}}_load_dword [[X:v[0-9]+]]
+; GCN: v_add_i32_e32 v{{[0-9]+}}, vcc, 0xffffffbf, [[X]]
+define void @v_test_i32_x_sub_65(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %tid.ext = sext i32 %tid to i64
+  %gep = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %tid.ext
+  %gep.out = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %tid.ext
+  %x = load i32, i32 addrspace(1)* %gep
+  %result = sub i32 %x, 65
+  store i32 %result, i32 addrspace(1)* %gep.out
+  ret void
+}
+
+; GCN-LABEL: {{^}}v_test_i32_65_sub_x:
+; GCN: {{buffer|flat}}_load_dword [[X:v[0-9]+]]
+; GCN: v_sub_i32_e32 v{{[0-9]+}}, vcc, 0x41, [[X]]
+define void @v_test_i32_65_sub_x(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %tid.ext = sext i32 %tid to i64
+  %gep = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %tid.ext
+  %gep.out = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %tid.ext
+  %x = load i32, i32 addrspace(1)* %gep
+  %result = sub i32 65, %x
+  store i32 %result, i32 addrspace(1)* %gep.out
+  ret void
+}
+
+; GCN-LABEL: {{^}}v_test_i32_x_sub_neg16:
+; GCN: {{buffer|flat}}_load_dword [[X:v[0-9]+]]
+; GCN: v_add_i32_e32 v{{[0-9]+}}, vcc, 16, [[X]]
+define void @v_test_i32_x_sub_neg16(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %tid.ext = sext i32 %tid to i64
+  %gep = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %tid.ext
+  %gep.out = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %tid.ext
+  %x = load i32, i32 addrspace(1)* %gep
+  %result = sub i32 %x, -16
+  store i32 %result, i32 addrspace(1)* %gep.out
+  ret void
+}
+
+; GCN-LABEL: {{^}}v_test_i32_neg16_sub_x:
+; GCN: {{buffer|flat}}_load_dword [[X:v[0-9]+]]
+; GCN: v_sub_i32_e32 v{{[0-9]+}}, vcc, -16, [[X]]
+define void @v_test_i32_neg16_sub_x(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %tid.ext = sext i32 %tid to i64
+  %gep = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %tid.ext
+  %gep.out = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %tid.ext
+  %x = load i32, i32 addrspace(1)* %gep
+  %result = sub i32 -16, %x
+  store i32 %result, i32 addrspace(1)* %gep.out
+  ret void
+}
+
+; GCN-LABEL: {{^}}v_test_i32_x_sub_neg17:
+; GCN: {{buffer|flat}}_load_dword [[X:v[0-9]+]]
+; GCN: v_add_i32_e32 v{{[0-9]+}}, vcc, 17, [[X]]
+define void @v_test_i32_x_sub_neg17(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %tid.ext = sext i32 %tid to i64
+  %gep = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %tid.ext
+  %gep.out = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %tid.ext
+  %x = load i32, i32 addrspace(1)* %gep
+  %result = sub i32 %x, -17
+  store i32 %result, i32 addrspace(1)* %gep.out
+  ret void
+}
+
+; GCN-LABEL: {{^}}v_test_i32_neg17_sub_x:
+; GCN: {{buffer|flat}}_load_dword [[X:v[0-9]+]]
+; GCN: v_sub_i32_e32 v{{[0-9]+}}, vcc, 0xffffffef, [[X]]
+define void @v_test_i32_neg17_sub_x(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %tid.ext = sext i32 %tid to i64
+  %gep = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %tid.ext
+  %gep.out = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %tid.ext
+  %x = load i32, i32 addrspace(1)* %gep
+  %result = sub i32 -17, %x
+  store i32 %result, i32 addrspace(1)* %gep.out
+  ret void
+}
+
+; GCN-LABEL: {{^}}s_test_i32_x_sub_64:
+; GCN: s_load_dword [[X:s[0-9]+]]
+; GCN: s_sub_i32 s{{[0-9]+}}, [[X]], 64
+define void @s_test_i32_x_sub_64(i32 %x) #0 {
+  %result = sub i32 %x, 64
+  call void asm sideeffect "; use $0", "s"(i32 %result)
+  ret void
+}
+
+; GCN-LABEL: {{^}}v_test_i16_x_sub_64:
+; VI: {{buffer|flat}}_load_ushort [[X:v[0-9]+]]
+; VI: v_subrev_u16_e32 v{{[0-9]+}}, 64, [[X]]
+define void @v_test_i16_x_sub_64(i16 addrspace(1)* %out, i16 addrspace(1)* %in) #0 {
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %tid.ext = sext i32 %tid to i64
+  %gep = getelementptr inbounds i16, i16 addrspace(1)* %in, i64 %tid.ext
+  %gep.out = getelementptr inbounds i16, i16 addrspace(1)* %out, i64 %tid.ext
+  %x = load i16, i16 addrspace(1)* %gep
+  %result = sub i16 %x, 64
+  store i16 %result, i16 addrspace(1)* %gep.out
+  ret void
+}
+
+; GCN-LABEL: {{^}}v_test_i16_x_sub_64_multi_use:
+; GCN: {{buffer|flat}}_load_ushort [[X:v[0-9]+]]
+; GCN: {{buffer|flat}}_load_ushort [[Y:v[0-9]+]]
+; VI-DAG: v_subrev_u16_e32 v{{[0-9]+}}, 64, [[X]]
+; VI-DAG: v_subrev_u16_e32 v{{[0-9]+}}, 64, [[Y]]
+
+; SI-DAG: v_subrev_i32_e32 v{{[0-9]+}}, vcc, 64, [[X]]
+; SI-DAG: v_subrev_i32_e32 v{{[0-9]+}}, vcc, 64, [[Y]]
+define void @v_test_i16_x_sub_64_multi_use(i16 addrspace(1)* %out, i16 addrspace(1)* %in) #0 {
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %tid.ext = sext i32 %tid to i64
+  %gep = getelementptr inbounds i16, i16 addrspace(1)* %in, i64 %tid.ext
+  %gep.out = getelementptr inbounds i16, i16 addrspace(1)* %out, i64 %tid.ext
+  %x = load volatile i16, i16 addrspace(1)* %gep
+  %y = load volatile i16, i16 addrspace(1)* %gep
+  %result0 = sub i16 %x, 64
+  %result1 = sub i16 %y, 64
+  store volatile i16 %result0, i16 addrspace(1)* %gep.out
+  store volatile i16 %result1, i16 addrspace(1)* %gep.out
+  ret void
+}
+
+declare i32 @llvm.amdgcn.workitem.id.x() #1
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readnone }
diff --git a/test/CodeGen/AMDGPU/sub.i16.ll b/test/CodeGen/AMDGPU/sub.i16.ll

index b5d5f56b27963729e4cf56bb5b5e9a0dd4d6c2b4..b2797ceecf3d3432f24547fe24e6512d332f21d6 100644 (file)
--- a/test/CodeGen/AMDGPU/sub.i16.ll
+++ b/test/CodeGen/AMDGPU/sub.i16.ll
@@ -52,7 +52,7 @@ define void @v_test_sub_i16_neg_constant(i16 addrspace(1)* %out, i16 addrspace(1
  ; FIXME: Need to handle non-uniform case for function below (load without gep).
  ; GCN-LABEL: {{^}}v_test_sub_i16_inline_63:
  ; VI: flat_load_ushort [[A:v[0-9]+]]
-; VI: v_add_u16_e32 [[ADD:v[0-9]+]], 0xffffffc1, [[A]]
+; VI: v_subrev_u16_e32 [[ADD:v[0-9]+]], 63, [[A]]
  ; VI-NEXT: buffer_store_short [[ADD]]
  define void @v_test_sub_i16_inline_63(i16 addrspace(1)* %out, i16 addrspace(1)* %in0) #1 {
    %tid = call i32 @llvm.amdgcn.workitem.id.x()
author	Matt Arsenault <Matthew.Arsenault@amd.com>
	Mon, 30 Jan 2017 19:30:24 +0000 (19:30 +0000)
committer	Matt Arsenault <Matthew.Arsenault@amd.com>
	Mon, 30 Jan 2017 19:30:24 +0000 (19:30 +0000)
lib/Target/AMDGPU/SIInstrInfo.td		patch \| blob \| history
lib/Target/AMDGPU/SIInstructions.td		patch \| blob \| history
lib/Target/AMDGPU/VOP2Instructions.td		patch \| blob \| history
test/CodeGen/AMDGPU/s_addk_i32.ll		patch \| blob \| history
test/CodeGen/AMDGPU/shrink-add-sub-constant.ll	[new file with mode: 0644]	patch \| blob
test/CodeGen/AMDGPU/sub.i16.ll		patch \| blob \| history