[AMDGPU] Fix dwordx3/southern-islands failures.

author Neil Henning <neil.henning@amd.com>

Thu, 10 Jan 2019 16:21:08 +0000 (16:21 +0000)

committer Neil Henning <neil.henning@amd.com>

Thu, 10 Jan 2019 16:21:08 +0000 (16:21 +0000)
author Neil Henning <neil.henning@amd.com>
Thu, 10 Jan 2019 16:21:08 +0000 (16:21 +0000)
committer Neil Henning <neil.henning@amd.com>
Thu, 10 Jan 2019 16:21:08 +0000 (16:21 +0000)
diff --git a/lib/Target/AMDGPU/AMDGPUSubtarget.h b/lib/Target/AMDGPU/AMDGPUSubtarget.h

index 51be81a7a81f6b963f74bad8139360dafca9b8d4..43c83efe47b4598c850ea642de5777cf66260ef2 100644 (file)
--- a/lib/Target/AMDGPU/AMDGPUSubtarget.h
+++ b/lib/Target/AMDGPU/AMDGPUSubtarget.h
@@ -822,6 +822,11 @@ public:
      return getGeneration() != AMDGPUSubtarget::SOUTHERN_ISLANDS;
    }
  
+  /// \returns true if the subtarget supports DWORDX3 load/store instructions.
+  bool hasDwordx3LoadStores() const {
+    return CIInsts;
+  }
+
    bool hasSMovFedHazard() const {
      return getGeneration() >= AMDGPUSubtarget::GFX9;
    }
diff --git a/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp b/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp

index 52bbe5c03450b0137d8d8fab666f0abc3b13bcaf..be291b127301db31d27da75a9422834f50800717 100644 (file)
--- a/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
+++ b/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
@@ -160,7 +160,7 @@ private:
    bool OptimizeAgain;
  
    static bool offsetsCanBeCombined(CombineInfo &CI);
-  static bool widthsFit(const CombineInfo &CI);
+  static bool widthsFit(const GCNSubtarget &STM, const CombineInfo &CI);
    static unsigned getNewOpcode(const CombineInfo &CI);
    static std::pair<unsigned, unsigned> getSubRegIdxs(const CombineInfo &CI);
    const TargetRegisterClass *getTargetRegisterClass(const CombineInfo &CI);
@@ -367,11 +367,12 @@ bool SILoadStoreOptimizer::offsetsCanBeCombined(CombineInfo &CI) {
    return false;
  }
  
-bool SILoadStoreOptimizer::widthsFit(const CombineInfo &CI) {
+bool SILoadStoreOptimizer::widthsFit(const GCNSubtarget &STM,
+                                     const CombineInfo &CI) {
    const unsigned Width = (CI.Width0 + CI.Width1);
    switch (CI.InstClass) {
    default:
-    return Width <= 4;
+    return (Width <= 4) && (STM.hasDwordx3LoadStores() || (Width != 3));
    case S_BUFFER_LOAD_IMM:
      switch (Width) {
      default:
@@ -645,7 +646,7 @@ bool SILoadStoreOptimizer::findMatchingInst(CombineInfo &CI) {
        // We also need to go through the list of instructions that we plan to
        // move and make sure they are all safe to move down past the merged
        // instruction.
-      if (widthsFit(CI) && offsetsCanBeCombined(CI))
+      if (widthsFit(*STM, CI) && offsetsCanBeCombined(CI))
          if (canMoveInstsAcrossMemOp(*MBBI, CI.InstsToMove, TII, AA))
            return true;
      }
diff --git a/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll b/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll

index 20d2e9edd825849da2407b003398680449bfc9d7..4c41565dca9598a319236c6f85c2e43e8eea1c5c 100644 (file)
--- a/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll
+++ b/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll
@@ -37,9 +37,10 @@ define amdgpu_kernel void @load_v2i8_to_v2f32(<2 x float> addrspace(1)* noalias
  ; GCN: {{buffer|flat}}_load_dword [[VAL:v[0-9]+]]
  ; GCN-NOT: v_cvt_f32_ubyte3_e32
  ; GCN-DAG: v_cvt_f32_ubyte2_e32 v[[HIRESULT:[0-9]+]], [[VAL]]
-; GCN-DAG: v_cvt_f32_ubyte1_e32 v{{[0-9]+}}, [[VAL]]
+; GCN-DAG: v_cvt_f32_ubyte1_e32 v[[MDRESULT:[0-9]+]], [[VAL]]
  ; GCN-DAG: v_cvt_f32_ubyte0_e32 v[[LORESULT:[0-9]+]], [[VAL]]
-; GCN: buffer_store_dwordx3 v{{\[}}[[LORESULT]]:[[HIRESULT]]{{\]}},
+; SI: buffer_store_dwordx2 v{{\[}}[[LORESULT]]:[[MDRESULT]]{{\]}},
+; VI: buffer_store_dwordx3 v{{\[}}[[LORESULT]]:[[HIRESULT]]{{\]}},
  define amdgpu_kernel void @load_v3i8_to_v3f32(<3 x float> addrspace(1)* noalias %out, <3 x i8> addrspace(1)* noalias %in) nounwind {
    %tid = call i32 @llvm.amdgcn.workitem.id.x()
    %gep = getelementptr <3 x i8>, <3 x i8> addrspace(1)* %in, i32 %tid
diff --git a/test/CodeGen/AMDGPU/early-if-convert-cost.ll b/test/CodeGen/AMDGPU/early-if-convert-cost.ll

index b8bb10632acb61d6e81db8b39b275ffaa06a7d03..626a6e2c5b82854815e57ab6bf6e31300616c660 100644 (file)
--- a/test/CodeGen/AMDGPU/early-if-convert-cost.ll
+++ b/test/CodeGen/AMDGPU/early-if-convert-cost.ll
@@ -60,7 +60,8 @@ endif:
  ; GCN: v_cndmask_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, vcc
  ; GCN: v_cndmask_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, vcc
  
-; GCN-DAG: buffer_store_dwordx3
+; GCN-DAG: buffer_store_dword v
+; GCN-DAG: buffer_store_dwordx2
  define amdgpu_kernel void @test_vccnz_ifcvt_triangle96(<3 x i32> addrspace(1)* %out, <3 x i32> addrspace(1)* %in, float %cnd) #0 {
  entry:
    %v = load <3 x i32>, <3 x i32> addrspace(1)* %in
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.load.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.load.ll

index ba9f206f1512ca0110c0bce7a746c65ca526933e..bcde25ade0fca55ecb59aaf51d5c841548861ee8 100644 (file)
--- a/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.load.ll
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.load.ll
@@ -195,7 +195,7 @@ main_body:
  
  ;CHECK-LABEL: {{^}}buffer_load_x3_offen_merged:
  ;CHECK-NEXT: %bb.
-;CHECK-NEXT: buffer_load_dwordx3 v[{{[0-9]}}:{{[0-9]}}], v0, s[0:3], 0 offen offset:4
+;VI-NEXT: buffer_load_dwordx3 v[{{[0-9]}}:{{[0-9]}}], v0, s[0:3], 0 offen offset:4
  ;CHECK: s_waitcnt
  define amdgpu_ps void @buffer_load_x3_offen_merged(<4 x i32> inreg %rsrc, i32 %a) {
  main_body:
@@ -245,7 +245,7 @@ main_body:
  
  ;CHECK-LABEL: {{^}}buffer_load_x3_offset_merged:
  ;CHECK-NEXT: %bb.
-;CHECK-NEXT: buffer_load_dwordx3 v[{{[0-9]}}:{{[0-9]}}], off, s[0:3], 0 offset:4
+;VI-NEXT: buffer_load_dwordx3 v[{{[0-9]}}:{{[0-9]}}], off, s[0:3], 0 offset:4
  ;CHECK: s_waitcnt
  define amdgpu_ps void @buffer_load_x3_offset_merged(<4 x i32> inreg %rsrc) {
  main_body:
diff --git a/test/CodeGen/AMDGPU/merge-stores.ll b/test/CodeGen/AMDGPU/merge-stores.ll

index 7d0c0db22b7e3750630fe230265dcd7619bf1111..7fb0e354df2c14e5aa6f3cc0531306562c423413 100644 (file)
--- a/test/CodeGen/AMDGPU/merge-stores.ll
+++ b/test/CodeGen/AMDGPU/merge-stores.ll
@@ -1,5 +1,5 @@
  ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tahiti -verify-machineinstrs -amdgpu-load-store-vectorizer=0 < %s | FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=GCN-AA %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=bonaire -verify-machineinstrs -amdgpu-load-store-vectorizer=0 < %s | FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=GCN-AA %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=bonaire -verify-machineinstrs -amdgpu-load-store-vectorizer=0 < %s | FileCheck -check-prefix=CI -check-prefix=GCN -check-prefix=GCN-AA %s
  
  ; This test is mostly to test DAG store merging, so disable the vectorizer.
  ; Run with devices with different unaligned load restrictions.
@@ -65,8 +65,8 @@ define amdgpu_kernel void @merge_global_store_2_constants_i16_natural_align(i16
  }
  
  ; GCN-LABEL: {{^}}merge_global_store_2_constants_i32:
-; SI-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], 0x1c8
-; SI-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0x7b
+; GCN-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], 0x1c8
+; GCN-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0x7b
  ; GCN: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}}
  define amdgpu_kernel void @merge_global_store_2_constants_i32(i32 addrspace(1)* %out) #0 {
    %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
@@ -87,8 +87,8 @@ define amdgpu_kernel void @merge_global_store_2_constants_i32_f32(i32 addrspace(
  }
  
  ; GCN-LABEL: {{^}}merge_global_store_2_constants_f32_i32:
-; SI-DAG: v_mov_b32_e32 v[[VLO:[0-9]+]], 4.0
-; SI-DAG: v_mov_b32_e32 v[[VHI:[0-9]+]], 0x7b
+; GCN-DAG: v_mov_b32_e32 v[[VLO:[0-9]+]], 4.0
+; GCN-DAG: v_mov_b32_e32 v[[VHI:[0-9]+]], 0x7b
  ; GCN: buffer_store_dwordx2 v{{\[}}[[VLO]]:[[VHI]]{{\]}}
  define amdgpu_kernel void @merge_global_store_2_constants_f32_i32(float addrspace(1)* %out) #0 {
    %out.gep.1 = getelementptr float, float addrspace(1)* %out, i32 1
@@ -164,9 +164,10 @@ define amdgpu_kernel void @merge_global_store_4_constants_mixed_i32_f32(float ad
  }
  
  ; GCN-LABEL: {{^}}merge_global_store_3_constants_i32:
-; SI-DAG: buffer_store_dwordx3
-; SI-NOT: buffer_store_dwordx2
-; SI-NOT: buffer_store_dword
+; SI-DAG: buffer_store_dwordx2
+; SI-DAG: buffer_store_dword v
+; CI-DAG: buffer_store_dwordx3
+; GCN-NOT: buffer_store_dword
  ; GCN: s_endpgm
  define amdgpu_kernel void @merge_global_store_3_constants_i32(i32 addrspace(1)* %out) #0 {
    %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
@@ -274,9 +275,13 @@ define amdgpu_kernel void @merge_global_store_4_adjacent_loads_i32(i32 addrspace
  }
  
  ; GCN-LABEL: {{^}}merge_global_store_3_adjacent_loads_i32:
-; SI-DAG: buffer_load_dwordx3
+; SI-DAG: buffer_load_dwordx2
+; SI-DAG: buffer_load_dword v
+; CI-DAG: buffer_load_dwordx3
  ; GCN: s_waitcnt
-; SI-DAG: buffer_store_dwordx3 v
+; SI-DAG: buffer_store_dwordx2
+; SI-DAG: buffer_store_dword v
+; CI-DAG: buffer_store_dwordx3
  ; GCN: s_endpgm
  define amdgpu_kernel void @merge_global_store_3_adjacent_loads_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
    %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
@@ -561,7 +566,9 @@ define amdgpu_kernel void @merge_global_store_6_constants_i32(i32 addrspace(1)*
  
  ; GCN-LABEL: {{^}}merge_global_store_7_constants_i32:
  ; GCN: buffer_store_dwordx4
-; GCN: buffer_store_dwordx3
+; SI: buffer_store_dwordx2
+; SI: buffer_store_dword v
+; CI: buffer_store_dwordx3
  define amdgpu_kernel void @merge_global_store_7_constants_i32(i32 addrspace(1)* %out) {
    store i32 34, i32 addrspace(1)* %out, align 4
    %idx1 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 1
@@ -608,11 +615,15 @@ define amdgpu_kernel void @merge_global_store_8_constants_i32(i32 addrspace(1)*
  
  ; GCN-LABEL: {{^}}copy_v3i32_align4:
  ; GCN-NOT: SCRATCH_RSRC_DWORD
-; GCN-DAG: buffer_load_dwordx3 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
+; SI-DAG: buffer_load_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:8
+; SI-DAG: buffer_load_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
+; CI-DAG: buffer_load_dwordx3 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
  ; GCN-NOT: offen
  ; GCN: s_waitcnt vmcnt
  ; GCN-NOT: offen
-; GCN-DAG: buffer_store_dwordx3 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
+; SI-DAG: buffer_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
+; SI-DAG: buffer_store_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:8
+; CI-DAG: buffer_store_dwordx3 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
  
  ; GCN: ScratchSize: 0{{$}}
  define amdgpu_kernel void @copy_v3i32_align4(<3 x i32> addrspace(1)* noalias %out, <3 x i32> addrspace(1)* noalias %in) #0 {
@@ -639,11 +650,15 @@ define amdgpu_kernel void @copy_v3i64_align4(<3 x i64> addrspace(1)* noalias %ou
  
  ; GCN-LABEL: {{^}}copy_v3f32_align4:
  ; GCN-NOT: SCRATCH_RSRC_DWORD
-; GCN-DAG: buffer_load_dwordx3 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
+; SI-DAG: buffer_load_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:8
+; SI-DAG: buffer_load_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
+; CI-DAG: buffer_load_dwordx3 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
  ; GCN-NOT: offen
  ; GCN: s_waitcnt vmcnt
  ; GCN-NOT: offen
-; GCN-DAG: buffer_store_dwordx3 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
+; SI-DAG: buffer_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
+; SI-DAG: buffer_store_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:8
+; CI-DAG: buffer_store_dwordx3 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
  ; GCN: ScratchSize: 0{{$}}
  define amdgpu_kernel void @copy_v3f32_align4(<3 x float> addrspace(1)* noalias %out, <3 x float> addrspace(1)* noalias %in) #0 {
    %vec = load <3 x float>, <3 x float> addrspace(1)* %in, align 4
diff --git a/test/CodeGen/AMDGPU/store-global.ll b/test/CodeGen/AMDGPU/store-global.ll

index 50f6b81d7ac71cadbc229b1fd9b9fea9b15cf5a8..c0b9f6cdc333b744d8af7ec4dea044e5801c271c 100644 (file)
--- a/test/CodeGen/AMDGPU/store-global.ll
+++ b/test/CodeGen/AMDGPU/store-global.ll
@@ -1,5 +1,5 @@
-; RUN: llc -march=amdgcn -mtriple=amdgcn-- -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SIVI -check-prefix=FUNC %s
-; RUN: llc -march=amdgcn -mtriple=amdgcn-- -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SIVI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mtriple=amdgcn-- -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SIVI -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mtriple=amdgcn-- -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SIVI -check-prefix=VI -check-prefix=FUNC %s
  ; RUN: llc -march=amdgcn -mtriple=amdgcn-- -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 -check-prefix=FUNC %s
  ; RUN: llc -march=r600 -mtriple=r600-- -mcpu=redwood -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
  ; RUN: llc -march=r600 -mtriple=r600-- -mcpu=cayman -verify-machineinstrs < %s | FileCheck -check-prefix=CM -check-prefix=FUNC %s
@@ -273,7 +273,10 @@ entry:
  }
  
  ; FUNC-LABEL: {{^}}store_v3i32:
-; SIVI-DAG: buffer_store_dwordx3
+; SI-DAG: buffer_store_dwordx2
+; SI-DAG: buffer_store_dword v
+
+; VI-DAG: buffer_store_dwordx3
  
  ; GFX9-DAG: global_store_dwordx2
  ; GFX9-DAG: global_store_dword v
diff --git a/test/CodeGen/AMDGPU/store-v3i64.ll b/test/CodeGen/AMDGPU/store-v3i64.ll

index 534347f3fae96b0674e938d22151ce9044c5884d..7af1736b3207dc3b42b47e6d6eb091bc0927236d 100644 (file)
--- a/test/CodeGen/AMDGPU/store-v3i64.ll
+++ b/test/CodeGen/AMDGPU/store-v3i64.ll
@@ -89,7 +89,9 @@ define amdgpu_kernel void @local_store_v3i64_unaligned(<3 x i64> addrspace(3)* %
  }
  
  ; GCN-LABEL: {{^}}global_truncstore_v3i64_to_v3i32:
-; GCN-DAG: buffer_store_dwordx3
+; SI-DAG: buffer_store_dwordx2
+; SI-DAG: buffer_store_dword v
+; VI-DAG: buffer_store_dwordx3
  define amdgpu_kernel void @global_truncstore_v3i64_to_v3i32(<3 x i32> addrspace(1)* %out, <3 x i64> %x) {
    %trunc = trunc <3 x i64> %x to <3 x i32>
    store <3 x i32> %trunc, <3 x i32> addrspace(1)* %out
author	Neil Henning <neil.henning@amd.com>
	Thu, 10 Jan 2019 16:21:08 +0000 (16:21 +0000)
committer	Neil Henning <neil.henning@amd.com>
	Thu, 10 Jan 2019 16:21:08 +0000 (16:21 +0000)
lib/Target/AMDGPU/AMDGPUSubtarget.h		patch \| blob \| history
lib/Target/AMDGPU/SILoadStoreOptimizer.cpp		patch \| blob \| history
test/CodeGen/AMDGPU/cvt_f32_ubyte.ll		patch \| blob \| history
test/CodeGen/AMDGPU/early-if-convert-cost.ll		patch \| blob \| history
test/CodeGen/AMDGPU/llvm.amdgcn.buffer.load.ll		patch \| blob \| history
test/CodeGen/AMDGPU/merge-stores.ll		patch \| blob \| history
test/CodeGen/AMDGPU/store-global.ll		patch \| blob \| history
test/CodeGen/AMDGPU/store-v3i64.ll		patch \| blob \| history