[AMDGPU] Prevent post-RA scheduler from breaking memory clauses

author Stanislav Mekhanoshin <Stanislav.Mekhanoshin@amd.com>

Tue, 19 Sep 2017 20:54:38 +0000 (20:54 +0000)

committer Stanislav Mekhanoshin <Stanislav.Mekhanoshin@amd.com>

Tue, 19 Sep 2017 20:54:38 +0000 (20:54 +0000)
author Stanislav Mekhanoshin <Stanislav.Mekhanoshin@amd.com>
Tue, 19 Sep 2017 20:54:38 +0000 (20:54 +0000)
committer Stanislav Mekhanoshin <Stanislav.Mekhanoshin@amd.com>
Tue, 19 Sep 2017 20:54:38 +0000 (20:54 +0000)
diff --git a/lib/Target/AMDGPU/AMDGPUSubtarget.cpp b/lib/Target/AMDGPU/AMDGPUSubtarget.cpp

index 18fdaf441e0948e8bf4a7326c5863eb584800eb2..59f9baf9af04f83520fa956601877a6b1d547b8b 100644 (file)
--- a/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
+++ b/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
@@ -524,3 +524,57 @@ unsigned SISubtarget::getMaxNumVGPRs(const MachineFunction &MF) const {
  
    return MaxNumVGPRs - getReservedNumVGPRs(MF);
  }
+
+struct MemOpClusterMutation : ScheduleDAGMutation { // Links adjacent mem ops.
+  const SIInstrInfo *TII; // Used to classify memory instructions by kind.
+
+  MemOpClusterMutation(const SIInstrInfo *tii) : TII(tii) {}
+  // Adds DAG edges intended to keep adjacent same-kind memory ops together.
+  void apply(ScheduleDAGInstrs *DAGInstrs) override {
+    ScheduleDAGMI *DAG = static_cast<ScheduleDAGMI*>(DAGInstrs);
+    // NOTE(review): unchecked downcast above; only SUnits/ExitSU are used.
+    SUnit *SUa = nullptr; // Previous memory operation, if any.
+    // Walk the SUnits and link each pair of consecutive memory operations of
+    // the same kind so that the scheduler cannot move them apart.
+    // During DAG pre-processing the SUnits are still in the original
+    // instruction order, i.e. the order before scheduling.
+    for (SUnit &SU : DAG->SUnits) {
+      MachineInstr &MI2 = *SU.getInstr();
+      if (!MI2.mayLoad() && !MI2.mayStore()) {
+        SUa = nullptr; // A non-memory instruction breaks the run.
+        continue;
+      }
+      if (!SUa) {
+        SUa = &SU; // First memory operation of a potential pair.
+        continue;
+      }
+      // Link only when both operations belong to the same memory class.
+      MachineInstr &MI1 = *SUa->getInstr();
+      if ((TII->isVMEM(MI1) && TII->isVMEM(MI2)) ||
+          (TII->isFLAT(MI1) && TII->isFLAT(MI2)) ||
+          (TII->isSMRD(MI1) && TII->isSMRD(MI2)) ||
+          (TII->isDS(MI1)   && TII->isDS(MI2))) {
+        SU.addPredBarrier(SUa); // Makes SUa a barrier predecessor of SU.
+        // Give SUa every other predecessor of SU as an artificial edge.
+        for (const SDep &SI : SU.Preds) {
+          if (SI.getSUnit() != SUa)
+            SUa->addPred(SDep(SI.getSUnit(), SDep::Artificial));
+        }
+        // Unless SU is ExitSU, make every other successor of SUa depend on SU.
+        if (&SU != &DAG->ExitSU) {
+          for (const SDep &SI : SUa->Succs) {
+            if (SI.getSUnit() != &SU)
+              SI.getSUnit()->addPred(SDep(&SU, SDep::Artificial));
+          }
+        }
+      }
+      // The current operation becomes the candidate for the next pair.
+      SUa = &SU;
+    }
+  }
+};
+
+void SISubtarget::getPostRAMutations( // Appends the mem-op cluster mutation.
+    std::vector<std::unique_ptr<ScheduleDAGMutation>> &Mutations) const {
+  Mutations.push_back(llvm::make_unique<MemOpClusterMutation>(&InstrInfo));
+}
diff --git a/lib/Target/AMDGPU/AMDGPUSubtarget.h b/lib/Target/AMDGPU/AMDGPUSubtarget.h

index c2ae2227830f05df2261b41a0a380950d40cab0d..7e7a09648ed114c9e20c1c39f9a9ce998268ff7b 100644 (file)
--- a/lib/Target/AMDGPU/AMDGPUSubtarget.h
+++ b/lib/Target/AMDGPU/AMDGPUSubtarget.h
@@ -883,6 +883,10 @@ public:
    /// subtarget's specifications, or does not meet number of waves per execution
    /// unit requirement.
    unsigned getMaxNumVGPRs(const MachineFunction &MF) const;
+
+  void getPostRAMutations(
+      std::vector<std::unique_ptr<ScheduleDAGMutation>> &Mutations)
+      const override;
  };
  
  } // end namespace llvm
diff --git a/test/CodeGen/AMDGPU/and.ll b/test/CodeGen/AMDGPU/and.ll

index ee0190149e92eaf30182932670bba47f4efa063d..86930892f7c26a14b1cc9136251b5f2520a64000 100644 (file)
--- a/test/CodeGen/AMDGPU/and.ll
+++ b/test/CodeGen/AMDGPU/and.ll
@@ -219,10 +219,10 @@ define amdgpu_kernel void @s_and_32_bit_constant_i64(i64 addrspace(1)* %out, i64
  }
  
  ; FUNC-LABEL: {{^}}s_and_multi_use_inline_imm_i64:
+; SI: s_load_dwordx2
  ; SI: s_load_dword [[A:s[0-9]+]]
  ; SI: s_load_dword [[B:s[0-9]+]]
  ; SI: s_load_dwordx2
-; SI: s_load_dwordx2
  ; SI-NOT: and
  ; SI: s_lshl_b32 [[A]], [[A]], 1
  ; SI: s_lshl_b32 [[B]], [[B]], 1
diff --git a/test/CodeGen/AMDGPU/ashr.v2i16.ll b/test/CodeGen/AMDGPU/ashr.v2i16.ll

index e87a36afa80cac0d03a205f679f5364c5ddd2b30..073d71ebad050d741c331f154a8b047cca675c41 100644 (file)
--- a/test/CodeGen/AMDGPU/ashr.v2i16.ll
+++ b/test/CodeGen/AMDGPU/ashr.v2i16.ll
@@ -11,10 +11,10 @@
  ; VI: v_ashrrev_i32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
  ; VI: v_or_b32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
  
-; CI: v_ashrrev_i32_e32
-; CI: v_and_b32_e32 v{{[0-9]+}}, 0xffff, v{{[0-9]+}}
-; CI: v_ashrrev_i32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-; CI: v_lshlrev_b32_e32 v{{[0-9]+}}, 16, v{{[0-9]+}}
+; CI-DAG: v_ashrrev_i32_e32
+; CI-DAG: v_and_b32_e32 v{{[0-9]+}}, 0xffff, v{{[0-9]+}}
+; CI-DAG: v_ashrrev_i32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+; CI-DAG: v_lshlrev_b32_e32 v{{[0-9]+}}, 16, v{{[0-9]+}}
  ; CI: v_or_b32_e32
  define amdgpu_kernel void @s_ashr_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> %lhs, <2 x i16> %rhs) #0 {
    %result = ashr <2 x i16> %lhs, %rhs
diff --git a/test/CodeGen/AMDGPU/br_cc.f16.ll b/test/CodeGen/AMDGPU/br_cc.f16.ll

index 283384dc29855658d6246535cacf99b4ac9cfa32..566c361142832d22d42f8166beec2225a4ad4aa2 100644 (file)
--- a/test/CodeGen/AMDGPU/br_cc.f16.ll
+++ b/test/CodeGen/AMDGPU/br_cc.f16.ll
@@ -5,19 +5,19 @@
  ; GCN: buffer_load_ushort v[[A_F16:[0-9]+]]
  ; GCN: buffer_load_ushort v[[B_F16:[0-9]+]]
  
-; SI-DAG:  v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]]
-; SI-DAG:  v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], v[[B_F16]]
-; SI:  v_cmp_nlt_f32_e32 vcc, v[[B_F32]], v[[A_F32]]
+; SI:  v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]]
+; SI:  v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], v[[B_F16]]
+; SI:  v_cmp_nlt_f32_e32 vcc, v[[A_F32]], v[[B_F32]]
  ; VI:  v_cmp_nlt_f16_e32 vcc, v[[A_F16]], v[[B_F16]]
  ; GCN: s_cbranch_vccnz
  
  ; GCN: one{{$}}
-; SI: v_cvt_f16_f32_e32 v[[A_F16:[0-9]+]], v[[B_F32]]
+; SI: v_cvt_f16_f32_e32 v[[A_F16:[0-9]+]], v[[A_F32]]
  ; GCN: buffer_store_short
  ; GCN: s_endpgm
  
  ; GCN: two{{$}}
-; SI:  v_cvt_f16_f32_e32 v[[B_F16:[0-9]+]], v[[A_F32]]
+; SI:  v_cvt_f16_f32_e32 v[[B_F16:[0-9]+]], v[[B_F32]]
  ; GCN: buffer_store_short v[[B_F16]]
  ; GCN: s_endpgm
  define amdgpu_kernel void @br_cc_f16(
diff --git a/test/CodeGen/AMDGPU/call-argument-types.ll b/test/CodeGen/AMDGPU/call-argument-types.ll

index 60c0480eaa71c24b459e23471d6af4fdf0f993ca..e42546e60775b079f1e3ce0bd75049a10b5ecc7e 100644 (file)
--- a/test/CodeGen/AMDGPU/call-argument-types.ll
+++ b/test/CodeGen/AMDGPU/call-argument-types.ll
@@ -400,9 +400,9 @@ define amdgpu_kernel void @test_call_external_void_func_v32i32() #0 {
  ; GCN-DAG: buffer_load_dwordx4 v[24:27], off
  ; GCN-DAG: buffer_load_dwordx4 v[28:31], off
  
-; GCN: buffer_store_dword [[VAL1]], off, s[{{[0-9]+}}:{{[0-9]+}}], s32 offset:4{{$}}
  ; GCN: s_waitcnt
-; GCN-NEXT: s_swappc_b64
+; GCN: buffer_store_dword [[VAL1]], off, s[{{[0-9]+}}:{{[0-9]+}}], s32 offset:4{{$}}
+; GCN: s_swappc_b64
  ; GCN-NEXT: s_endpgm
  define amdgpu_kernel void @test_call_external_void_func_v32i32_i32(i32) #0 {
    %ptr0 = load <32 x i32> addrspace(1)*, <32 x i32> addrspace(1)* addrspace(2)* undef
@@ -452,15 +452,15 @@ define amdgpu_kernel void @test_call_external_void_func_struct_i8_i32() #0 {
  ; HSA: buffer_load_dword [[RELOAD_VAL0:v[0-9]+]], off, s[0:3], s33 offset:8
  ; HSA: buffer_load_dword [[RELOAD_VAL1:v[0-9]+]], off, s[0:3], s33 offset:12
  
-; HSA: buffer_store_dword [[RELOAD_VAL0]], off, s[0:3], [[SP]] offset:4
-; HSA: buffer_store_dword [[RELOAD_VAL1]], off, s[0:3], [[SP]] offset:8
+; HSA-DAG: buffer_store_dword [[RELOAD_VAL0]], off, s[0:3], [[SP]] offset:4
+; HSA-DAG: buffer_store_dword [[RELOAD_VAL1]], off, s[0:3], [[SP]] offset:8
  
  
  ; MESA: buffer_load_dword [[RELOAD_VAL0:v[0-9]+]], off, s[36:39], s33 offset:8
  ; MESA: buffer_load_dword [[RELOAD_VAL1:v[0-9]+]], off, s[36:39], s33 offset:12
  
-; MESA: buffer_store_dword [[RELOAD_VAL0]], off, s[36:39], [[SP]] offset:4
-; MESA: buffer_store_dword [[RELOAD_VAL1]], off, s[36:39], [[SP]] offset:8
+; MESA-DAG: buffer_store_dword [[RELOAD_VAL0]], off, s[36:39], [[SP]] offset:4
+; MESA-DAG: buffer_store_dword [[RELOAD_VAL1]], off, s[36:39], [[SP]] offset:8
  
  ; GCN-NEXT: s_swappc_b64
  ; GCN-NOT: [[SP]]
@@ -487,8 +487,8 @@ define amdgpu_kernel void @test_call_external_void_func_byval_struct_i8_i32() #0
  ; GCN-DAG: buffer_load_dword [[RELOAD_VAL1:v[0-9]+]], off, s{{\[[0-9]+:[0-9]+\]}}, [[FP_REG]] offset:12
  
  ; GCN-NOT: s_add_u32 [[SP]]
-; GCN: buffer_store_dword [[RELOAD_VAL0]], off, s{{\[[0-9]+:[0-9]+\]}}, [[SP]] offset:4
-; GCN: buffer_store_dword [[RELOAD_VAL1]], off, s{{\[[0-9]+:[0-9]+\]}}, [[SP]] offset:8
+; GCN-DAG: buffer_store_dword [[RELOAD_VAL0]], off, s{{\[[0-9]+:[0-9]+\]}}, [[SP]] offset:4
+; GCN-DAG: buffer_store_dword [[RELOAD_VAL1]], off, s{{\[[0-9]+:[0-9]+\]}}, [[SP]] offset:8
  ; GCN-NEXT: s_swappc_b64
  ; GCN-DAG: buffer_load_ubyte [[LOAD_OUT_VAL0:v[0-9]+]], off, s{{\[[0-9]+:[0-9]+\]}}, [[FP_REG]] offset:16
  ; GCN-DAG: buffer_load_dword [[LOAD_OUT_VAL1:v[0-9]+]], off, s{{\[[0-9]+:[0-9]+\]}}, [[FP_REG]] offset:20
diff --git a/test/CodeGen/AMDGPU/callee-special-input-vgprs.ll b/test/CodeGen/AMDGPU/callee-special-input-vgprs.ll

index f2db57336892b1130a58f1c2c2a43bc5c010161d..9b3bb69dc9ce1c499673881f882051cb04115c04 100644 (file)
--- a/test/CodeGen/AMDGPU/callee-special-input-vgprs.ll
+++ b/test/CodeGen/AMDGPU/callee-special-input-vgprs.ll
@@ -327,8 +327,8 @@ define void @func_call_too_many_args_use_workitem_id_x(i32 %arg0) #1 {
  ; Requires loading and storing to stack slot.
  ; GCN-LABEL: {{^}}too_many_args_call_too_many_args_use_workitem_id_x:
  ; GCN: buffer_store_dword v32, off, s[0:3], s5 offset:8 ; 4-byte Folded Spill
-; GCN: buffer_load_dword v32, off, s[0:3], s5 offset:4
  ; GCN: s_add_u32 s32, s32, 0x400{{$}}
+; GCN: buffer_load_dword v32, off, s[0:3], s5 offset:4
  
  ; GCN: buffer_store_dword v32, off, s[0:3], s32 offset:8{{$}}
  
diff --git a/test/CodeGen/AMDGPU/cluster-flat-loads-postra.mir b/test/CodeGen/AMDGPU/cluster-flat-loads-postra.mir

new file mode 100644 (file)

index 0000000..4c528b5
--- /dev/null
+++ b/test/CodeGen/AMDGPU/cluster-flat-loads-postra.mir
@@ -0,0 +1,31 @@
+# RUN: llc -march=amdgcn -mcpu=tonga -run-pass post-RA-sched -verify-machineinstrs -o - %s | FileCheck -check-prefix=GCN %s
+
+# GCN:      FLAT_LOAD_DWORD
+# GCN-NEXT: FLAT_LOAD_DWORD
+# GCN:      FLAT_STORE_DWORD
+# GCN-NEXT: FLAT_STORE_DWORD
+
+---
+name:            cluster_loads_post_ra
+tracksRegLiveness: true
+registers:
+liveins:
+  - { reg: '%vgpr0' }
+body:             |
+  bb.0:
+    liveins: %vgpr0
+
+    %vgpr0_vgpr1 = IMPLICIT_DEF
+    %vgpr4_vgpr5 = IMPLICIT_DEF
+    %vgpr0 = FLAT_LOAD_DWORD %vgpr0_vgpr1, 0, 0, 0, implicit %exec, implicit %flat_scr :: (load 4)
+    %vgpr4 = FLAT_LOAD_DWORD %vgpr4_vgpr5, 0, 0, 0, implicit %exec, implicit %flat_scr :: (load 4)
+    %vgpr2 = IMPLICIT_DEF
+    %vgpr3 = IMPLICIT_DEF
+    %vgpr6 = IMPLICIT_DEF
+    %vgpr0 = V_ADD_I32_e32 16, %vgpr2, implicit-def %vcc, implicit %exec
+    %vgpr1 = V_ADDC_U32_e32 %vgpr3, killed %vgpr6, implicit-def dead %vcc, implicit %vcc, implicit %exec
+    FLAT_STORE_DWORD %vgpr2_vgpr3, killed %vgpr0, 0, 0, 0, implicit %exec, implicit %flat_scr :: (store 4)
+    FLAT_STORE_DWORD %vgpr0_vgpr1, killed %vgpr4, 0, 0, 0, implicit %exec, implicit %flat_scr :: (store 4)
+    S_ENDPGM
+
+...
diff --git a/test/CodeGen/AMDGPU/fcopysign.f16.ll b/test/CodeGen/AMDGPU/fcopysign.f16.ll

index 2f24f5f51cec3b51073ff0dcbf277f8f4bf9c172..b14f4c85ba618ab27de096653fba027ce4aa8a67 100644 (file)
--- a/test/CodeGen/AMDGPU/fcopysign.f16.ll
+++ b/test/CodeGen/AMDGPU/fcopysign.f16.ll
@@ -12,15 +12,15 @@ declare <4 x half> @llvm.copysign.v4f16(<4 x half>, <4 x half>)
  declare i32 @llvm.amdgcn.workitem.id.x()
  
  ; GCN-LABEL: {{^}}test_copysign_f16:
-; SI: {{buffer|flat}}_load_ushort v[[SIGN:[0-9]+]]
  ; SI: {{buffer|flat}}_load_ushort v[[MAG:[0-9]+]]
+; SI: {{buffer|flat}}_load_ushort v[[SIGN:[0-9]+]]
  ; SI: s_brev_b32 s[[CONST:[0-9]+]], -2
  ; SI-DAG: v_cvt_f32_f16_e32 v[[MAG_F32:[0-9]+]], v[[MAG]]
  ; SI-DAG: v_cvt_f32_f16_e32 v[[SIGN_F32:[0-9]+]], v[[SIGN]]
  ; SI: v_bfi_b32 v[[OUT_F32:[0-9]+]], s[[CONST]], v[[MAG_F32]], v[[SIGN_F32]]
  ; SI: v_cvt_f16_f32_e32 v[[OUT:[0-9]+]], v[[OUT_F32]]
-; GFX89: {{buffer|flat}}_load_ushort v[[SIGN:[0-9]+]]
  ; GFX89: {{buffer|flat}}_load_ushort v[[MAG:[0-9]+]]
+; GFX89: {{buffer|flat}}_load_ushort v[[SIGN:[0-9]+]]
  ; GFX89: s_movk_i32 s[[CONST:[0-9]+]], 0x7fff
  ; GFX89: v_bfi_b32 v[[OUT:[0-9]+]], s[[CONST]], v[[MAG]], v[[SIGN]]
  ; GCN: buffer_store_short v[[OUT]]
diff --git a/test/CodeGen/AMDGPU/fcopysign.f64.ll b/test/CodeGen/AMDGPU/fcopysign.f64.ll

index 67779a8ff3b9b7767460bf4b6f386e2d8c15aa39..161b7ad18e5dbd1aaffb0789cd274e6179212eab 100644 (file)
--- a/test/CodeGen/AMDGPU/fcopysign.f64.ll
+++ b/test/CodeGen/AMDGPU/fcopysign.f64.ll
@@ -24,7 +24,8 @@ define amdgpu_kernel void @test_copysign_f64(double addrspace(1)* %out, double %
  }
  
  ; FUNC-LABEL: {{^}}test_copysign_f64_f32:
-; GCN-DAG: s_load_dwordx2 s{{\[}}[[SMAG_LO:[0-9]+]]:[[SMAG_HI:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}
+; SI-DAG: s_load_dwordx2 s{{\[}}[[SMAG_LO:[0-9]+]]:[[SMAG_HI:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0xb
+; VI-DAG: s_load_dwordx2 s{{\[}}[[SMAG_LO:[0-9]+]]:[[SMAG_HI:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x2c
  ; GCN-DAG: s_load_dword s[[SSIGN:[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}
  ; GCN-DAG: s_brev_b32 [[SCONST:s[0-9]+]], -2{{$}}
  ; GCN-DAG: v_mov_b32_e32 v[[VMAG_HI:[0-9]+]], s[[SMAG_HI]]
diff --git a/test/CodeGen/AMDGPU/frame-index-amdgiz.ll b/test/CodeGen/AMDGPU/frame-index-amdgiz.ll

index 47716a40ffeb57bd641d97ddf48f8946e63689a5..e2b1950f8855dc0b5a27df970a7b21fcb2b974ab 100644 (file)
--- a/test/CodeGen/AMDGPU/frame-index-amdgiz.ll
+++ b/test/CodeGen/AMDGPU/frame-index-amdgiz.ll
@@ -12,8 +12,8 @@ target datalayout = "e-p:64:64-p1:64:64-p2:64:64-p3:32:32-p4:32:32-p5:32:32-i64:
  
  define amdgpu_kernel void @f(i32 addrspace(1)* nocapture %a, i32 %i, i32 %j) local_unnamed_addr #0 {
  entry:
-; CHECK: s_load_dword s2, s[0:1], 0xb
  ; CHECK: s_load_dwordx2 s[4:5], s[0:1], 0x9
+; CHECK: s_load_dword s2, s[0:1], 0xb
  ; CHECK: s_load_dword s0, s[0:1], 0xc
  ; CHECK: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
  ; CHECK: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
@@ -27,9 +27,9 @@ entry:
  ; CHECK: s_lshl_b32 s0, s0, 2
  ; CHECK: buffer_store_dword v2, v1, s[8:11], s3 offen
  ; CHECK: v_add_i32_e32 v0, vcc, s0, v0
-; CHECK: buffer_load_dword v0, v0, s[8:11], s3 offen
  ; CHECK: s_mov_b32 s7, 0xf000
  ; CHECK: s_mov_b32 s6, -1
+; CHECK: buffer_load_dword v0, v0, s[8:11], s3 offen
  ; CHECK: s_waitcnt vmcnt(0)
  ; CHECK: buffer_store_dword v0, off, s[4:7], 0
  ; CHECK: s_endpgm
diff --git a/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll b/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll

index bed238cf2e0717b770ae78227a612afcec413735..7a6919c7a1551e061d12be863aafaf2620453086 100644 (file)
--- a/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll
+++ b/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll
@@ -421,11 +421,11 @@ define amdgpu_kernel void @v_insertelement_v2i16_dynamic_sgpr(<2 x i16> addrspac
  }
  
  ; GCN-LABEL: {{^}}v_insertelement_v2i16_dynamic_vgpr:
+; GFX89-DAG: s_mov_b32 [[MASKK:s[0-9]+]], 0xffff{{$}}
  ; GCN: {{flat|global}}_load_dword [[IDX:v[0-9]+]]
  ; GCN: {{flat|global}}_load_dword [[VEC:v[0-9]+]]
  ; GCN-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0x3e7
  
-; GFX89-DAG: s_mov_b32 [[MASKK:s[0-9]+]], 0xffff{{$}}
  ; GCN-DAG:   v_mov_b32_e32 [[K:v[0-9]+]], 0x3e7
  
  ; GFX89-DAG: v_lshlrev_b32_e32 [[SCALED_IDX:v[0-9]+]], 16, [[IDX]]
@@ -450,11 +450,11 @@ define amdgpu_kernel void @v_insertelement_v2i16_dynamic_vgpr(<2 x i16> addrspac
  }
  
  ; GCN-LABEL: {{^}}v_insertelement_v2f16_dynamic_vgpr:
+; GFX89-DAG: s_mov_b32 [[MASKK:s[0-9]+]], 0xffff{{$}}
  ; GCN: {{flat|global}}_load_dword [[IDX:v[0-9]+]]
  ; GCN: {{flat|global}}_load_dword [[VEC:v[0-9]+]]
  ; GCN-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0x1234
  
-; GFX89-DAG: s_mov_b32 [[MASKK:s[0-9]+]], 0xffff{{$}}
  ; GCN-DAG:   v_mov_b32_e32 [[K:v[0-9]+]], 0x1234
  
  ; GFX89-DAG: v_lshlrev_b32_e32 [[SCALED_IDX:v[0-9]+]], 16, [[IDX]]
diff --git a/test/CodeGen/AMDGPU/llvm.fmuladd.f16.ll b/test/CodeGen/AMDGPU/llvm.fmuladd.f16.ll

index add22dbfab578c5f92e1636502771f9ccaa82be8..ebf8bafd3c8872fe15f5bffebe29450b186b6023 100644 (file)
--- a/test/CodeGen/AMDGPU/llvm.fmuladd.f16.ll
+++ b/test/CodeGen/AMDGPU/llvm.fmuladd.f16.ll
@@ -95,8 +95,9 @@ define amdgpu_kernel void @fmuladd_f16_imm_b(
  }
  
  ; GCN-LABEL: {{^}}fmuladd_v2f16
+; VI:  buffer_load_dword v[[B_V2_F16:[0-9]+]]
  ; GCN: buffer_load_dword v[[A_V2_F16:[0-9]+]]
-; GCN: buffer_load_dword v[[B_V2_F16:[0-9]+]]
+; SI:  buffer_load_dword v[[B_V2_F16:[0-9]+]]
  ; GCN: buffer_load_dword v[[C_V2_F16:[0-9]+]]
  
  ; SI:  v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
@@ -124,11 +125,11 @@ define amdgpu_kernel void @fmuladd_f16_imm_b(
  ; VI-FLUSH-NOT: v_and_b32
  ; VI-FLUSH:     v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[A_V2_F16]], v[[R_F16_HI]]
  
-; VI-DENORM: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
-; VI-DENORM: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]]
-; VI-DENORM: v_lshrrev_b32_e32 v[[C_F16_1:[0-9]+]], 16, v[[C_V2_F16]]
-; VI-DENORM-DAG: v_fma_f16 v[[RES0:[0-9]+]], v[[A_V2_F16]], v[[B_V2_F16]], v[[C_V2_F16]]
-; VI-DENORM-DAG: v_fma_f16 v[[RES1:[0-9]+]], v[[A_F16_1]], v[[B_F16_1]], v[[C_F16_1]]
+; VI-DENORM-DAG: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
+; VI-DENORM-DAG: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]]
+; VI-DENORM-DAG: v_lshrrev_b32_e32 v[[C_F16_1:[0-9]+]], 16, v[[C_V2_F16]]
+; VI-DENORM-DAG: v_fma_f16 v[[RES0:[0-9]+]], v[[B_V2_F16]], v[[A_V2_F16]], v[[C_V2_F16]]
+; VI-DENORM-DAG: v_fma_f16 v[[RES1:[0-9]+]], v[[B_F16_1]], v[[A_F16_1]], v[[C_F16_1]]
  ; VI-DENORM-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[RES1]]
  ; VI-DENORM-NOT: v_and_b32
  ; VI-DENORM: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[RES0]], v[[R_F16_HI]]
diff --git a/test/CodeGen/AMDGPU/load-global-i32.ll b/test/CodeGen/AMDGPU/load-global-i32.ll

index 6360d39666c775e067a857d8385ec6f9fabb9f2e..246969303b04c523e7a038853b636ee7dfb55c9a 100644 (file)
--- a/test/CodeGen/AMDGPU/load-global-i32.ll
+++ b/test/CodeGen/AMDGPU/load-global-i32.ll
@@ -424,25 +424,25 @@ define amdgpu_kernel void @global_zextload_v16i32_to_v16i64(<16 x i64> addrspace
  ; GCN-NOHSA: buffer_store_dwordx4
  ; GCN-NOHSA: buffer_store_dwordx4
  
-; GCN-HSA: flat_store_dwordx4
-; GCN-HSA: flat_store_dwordx4
-; GCN-HSA: flat_store_dwordx4
-; GCN-HSA: flat_store_dwordx4
+; GCN-HSA-DAG: flat_store_dwordx4
+; GCN-HSA-DAG: flat_store_dwordx4
+; GCN-HSA-DAG: flat_store_dwordx4
+; GCN-HSA-DAG: flat_store_dwordx4
  
-; GCN-HSA: flat_store_dwordx4
-; GCN-HSA: flat_store_dwordx4
-; GCN-HSA: flat_store_dwordx4
-; GCN-HSA: flat_store_dwordx4
+; GCN-HSA-DAG: flat_store_dwordx4
+; GCN-HSA-DAG: flat_store_dwordx4
+; GCN-HSA-DAG: flat_store_dwordx4
+; GCN-HSA-DAG: flat_store_dwordx4
  
-; GCN-HSA: flat_store_dwordx4
-; GCN-HSA: flat_store_dwordx4
-; GCN-HSA: flat_store_dwordx4
-; GCN-HSA: flat_store_dwordx4
+; GCN-HSA-DAG: flat_store_dwordx4
+; GCN-HSA-DAG: flat_store_dwordx4
+; GCN-HSA-DAG: flat_store_dwordx4
+; GCN-HSA-DAG: flat_store_dwordx4
  
-; GCN-HSA: flat_store_dwordx4
-; GCN-HSA: flat_store_dwordx4
-; GCN-HSA: flat_store_dwordx4
-; GCN-HSA: flat_store_dwordx4
+; GCN-HSA-DAG: flat_store_dwordx4
+; GCN-HSA-DAG: flat_store_dwordx4
+; GCN-HSA-DAG: flat_store_dwordx4
+; GCN-HSA-DAG: flat_store_dwordx4
  
  define amdgpu_kernel void @global_sextload_v32i32_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i32> addrspace(1)* %in) #0 {
    %ld = load <32 x i32>, <32 x i32> addrspace(1)* %in
diff --git a/test/CodeGen/AMDGPU/load-weird-sizes.ll b/test/CodeGen/AMDGPU/load-weird-sizes.ll

index f9ba6241fe0675a3ac970d719edc8103fe0a8fab..fff0d3520d74231a785dacc609c261df16d0c9e3 100644 (file)
--- a/test/CodeGen/AMDGPU/load-weird-sizes.ll
+++ b/test/CodeGen/AMDGPU/load-weird-sizes.ll
@@ -5,8 +5,8 @@
  ; RUN:  llc -amdgpu-scalarize-global-loads=false  -march=r600 -mcpu=cayman < %s | FileCheck -check-prefix=R600 -check-prefix=CM -check-prefix=FUNC %s
  
  ; FUNC-LABEL: {{^}}load_i24:
-; SI: {{flat|buffer}}_load_ubyte
-; SI: {{flat|buffer}}_load_ushort
+; SI-DAG: {{flat|buffer}}_load_ubyte
+; SI-DAG: {{flat|buffer}}_load_ushort
  ; SI: {{flat|buffer}}_store_dword
  define amdgpu_kernel void @load_i24(i32 addrspace(1)* %out, i24 addrspace(1)* %in) #0 {
    %1 = load i24, i24 addrspace(1)* %in
diff --git a/test/CodeGen/AMDGPU/lshr.v2i16.ll b/test/CodeGen/AMDGPU/lshr.v2i16.ll

index bf731dbbe888b55dda7f9a10c292aebcb498b2ac..54969768f8826e0ac539601170acc3749d5ab925 100644 (file)
--- a/test/CodeGen/AMDGPU/lshr.v2i16.ll
+++ b/test/CodeGen/AMDGPU/lshr.v2i16.ll
@@ -10,9 +10,9 @@
  
  ; VI: v_lshrrev_b32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
  ; CI: v_lshrrev_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-; CI: v_lshlrev_b32_e32 v{{[0-9]+}}, 16, v{{[0-9]+}}
-; CIVI: v_bfe_u32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, 16
-; CIVI: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+; CI-DAG:   v_lshlrev_b32_e32 v{{[0-9]+}}, 16, v{{[0-9]+}}
+; CIVI-DAG: v_bfe_u32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, 16
+; CIVI:     v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
  define amdgpu_kernel void @s_lshr_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> %lhs, <2 x i16> %rhs) #0 {
    %result = lshr <2 x i16> %lhs, %rhs
    store <2 x i16> %result, <2 x i16> addrspace(1)* %out
diff --git a/test/CodeGen/AMDGPU/move-addr64-rsrc-dead-subreg-writes.ll b/test/CodeGen/AMDGPU/move-addr64-rsrc-dead-subreg-writes.ll

index e1fb00a1de307c3b8b6227bb7bac1a294700aa1c..5bc59c9f636b25db34051a610912332208d11dce 100644 (file)
--- a/test/CodeGen/AMDGPU/move-addr64-rsrc-dead-subreg-writes.ll
+++ b/test/CodeGen/AMDGPU/move-addr64-rsrc-dead-subreg-writes.ll
@@ -6,11 +6,11 @@
  ; FIXME: We should be able to use the SGPR directly as src0 to v_add_i32
  
  ; GCN-LABEL: {{^}}clobber_vgpr_pair_pointer_add:
-; GCN-DAG: s_load_dwordx2 s{{\[}}[[ARG1LO:[0-9]+]]:[[ARG1HI:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x0{{$}}
-; GCN-DAG: buffer_load_dwordx2 v{{\[}}[[LDPTRLO:[0-9]+]]:[[LDPTRHI:[0-9]+]]{{\]}}
+; GCN: s_load_dwordx2 s{{\[}}[[ARG1LO:[0-9]+]]:[[ARG1HI:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x0{{$}}
  
  ; GCN-NOT: v_mov_b32
  ; GCN: v_mov_b32_e32 v[[VARG1LO:[0-9]+]], s[[ARG1LO]]
+; GCN: buffer_load_dwordx2 v{{\[}}[[LDPTRLO:[0-9]+]]:[[LDPTRHI:[0-9]+]]{{\]}}
  ; GCN-NOT: v_mov_b32
  ; GCN: v_mov_b32_e32 v[[VARG1HI:[0-9]+]], s[[ARG1HI]]
  ; GCN-NOT: v_mov_b32
diff --git a/test/CodeGen/AMDGPU/salu-to-valu.ll b/test/CodeGen/AMDGPU/salu-to-valu.ll

index d5b2fa0b6754011a4e0bdf27307809ab99515812..a826661442e1075942159fd5f2830021bc3d4b1a 100644 (file)
--- a/test/CodeGen/AMDGPU/salu-to-valu.ll
+++ b/test/CodeGen/AMDGPU/salu-to-valu.ll
@@ -170,10 +170,10 @@ entry:
  ; CI.
  
  ; GCN-LABEL: {{^}}smrd_valu_ci_offset_x8:
-; GCN-NOHSA: s_mov_b32 [[OFFSET1:s[0-9]+]], 0x9a50{{$}}
-; GCN-NOHSA-NOT: v_add
  ; GCN-NOHSA: s_mov_b32 [[OFFSET0:s[0-9]+]], 0x9a40{{$}}
  ; GCN-NOHSA-NOT: v_add
+; GCN-NOHSA: s_mov_b32 [[OFFSET1:s[0-9]+]], 0x9a50{{$}}
+; GCN-NOHSA-NOT: v_add
  ; GCN-NOHSA: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s[{{[0-9]+:[0-9]+}}], [[OFFSET1]] addr64{{$}}
  ; GCN-NOHSA: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s[{{[0-9]+:[0-9]+}}], [[OFFSET0]] addr64{{$}}
  
diff --git a/test/CodeGen/AMDGPU/schedule-kernel-arg-loads.ll b/test/CodeGen/AMDGPU/schedule-kernel-arg-loads.ll

index fd34978dc07d18fe748119b63f4a9c366c3f3e18..118bb9aa66e8ff924d0378355ca2b62429cebc50 100644 (file)
--- a/test/CodeGen/AMDGPU/schedule-kernel-arg-loads.ll
+++ b/test/CodeGen/AMDGPU/schedule-kernel-arg-loads.ll
@@ -2,16 +2,14 @@
  ; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=FUNC -check-prefix=VI -check-prefix=GCN %s
  
  ; FUNC-LABEL: {{^}}cluster_arg_loads:
-; FIXME: Due to changes in the load clustering heuristics.  We no longer
-;        cluster all argument loads together on SI.
-; SI: s_load_dword s{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, 0xd
  ; SI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x9
  ; SI-NEXT: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0xb
+; SI-NEXT: s_load_dword s{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, 0xd
  ; SI-NEXT: s_load_dword s{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, 0xe
-; VI: s_load_dword s{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, 0x34
  ; VI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x24
-; VI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x2c
-; VI: s_load_dword s{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, 0x38
+; VI-NEXT: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x2c
+; VI-NEXT: s_load_dword s{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, 0x34
+; VI-NEXT: s_load_dword s{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, 0x38
  define amdgpu_kernel void @cluster_arg_loads(i32 addrspace(1)* %out0, i32 addrspace(1)* %out1, i32 %x, i32 %y) nounwind {
    store i32 %x, i32 addrspace(1)* %out0, align 4
    store i32 %y, i32 addrspace(1)* %out1, align 4
diff --git a/test/CodeGen/AMDGPU/select-vectors.ll b/test/CodeGen/AMDGPU/select-vectors.ll

index 4f47bffaf4e692970942954d04a8e36377d2eedc..e5c8191b24b99786a569eda102699db49431d1b9 100644 (file)
--- a/test/CodeGen/AMDGPU/select-vectors.ll
+++ b/test/CodeGen/AMDGPU/select-vectors.ll
@@ -215,9 +215,9 @@ define amdgpu_kernel void @select_v8i32(<8 x i32> addrspace(1)* %out, <8 x i32>
  ; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, s[[ALO]]
  ; GCN-DAG: v_cmp_eq_u32_e64 vcc, s{{[0-9]+}}, 0{{$}}
  
-; GCN: v_cndmask_b32_e32
-; GCN: v_mov_b32_e32 v{{[0-9]+}}, s[[BLO]]
-; GCN: v_cndmask_b32_e32
+; GCN-DAG: v_cndmask_b32_e32
+; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, s[[BLO]]
+; GCN-DAG: v_cndmask_b32_e32
  ; GCN: buffer_store_dwordx2
  define amdgpu_kernel void @s_select_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b, i32 %c) #0 {
    %cmp = icmp eq i32 %c, 0
diff --git a/test/CodeGen/AMDGPU/select.f16.ll b/test/CodeGen/AMDGPU/select.f16.ll

index ce024edf51035ae928b6c2dc7ed5cf8cc6eb7fb0..9030baa04c584c44436176515a1e336460b3e98a 100644 (file)
--- a/test/CodeGen/AMDGPU/select.f16.ll
+++ b/test/CodeGen/AMDGPU/select.f16.ll
@@ -8,9 +8,9 @@
  ; GCN: buffer_load_ushort v[[D_F16:[0-9]+]]
  ; SI:  v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]]
  ; SI:  v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], v[[B_F16]]
-; SI:  v_cmp_lt_f32_e32 vcc, v[[A_F32]], v[[B_F32]]
-; SI:  v_cvt_f32_f16_e32 v[[C_F32:[0-9]+]], v[[C_F16]]
-; SI:  v_cvt_f32_f16_e32 v[[D_F32:[0-9]+]], v[[D_F16]]
+; SI-DAG:  v_cmp_lt_f32_e32 vcc, v[[A_F32]], v[[B_F32]]
+; SI-DAG:  v_cvt_f32_f16_e32 v[[C_F32:[0-9]+]], v[[C_F16]]
+; SI-DAG:  v_cvt_f32_f16_e32 v[[D_F32:[0-9]+]], v[[D_F16]]
  ; SI:  v_cndmask_b32_e32 v[[R_F32:[0-9]+]], v[[D_F32]], v[[C_F32]]
  ; SI:  v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[R_F32]]
  ; VI:  v_cmp_lt_f16_e32 vcc, v[[A_F16]], v[[B_F16]]
@@ -39,9 +39,9 @@ entry:
  ; GCN: buffer_load_ushort v[[C_F16:[0-9]+]]
  ; GCN: buffer_load_ushort v[[D_F16:[0-9]+]]
  ; SI:  v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], v[[B_F16]]
-; SI:  v_cmp_lt_f32_e32 vcc, 0.5, v[[B_F32]]
-; SI:  v_cvt_f32_f16_e32 v[[C_F32:[0-9]+]], v[[C_F16]]
-; SI:  v_cvt_f32_f16_e32 v[[D_F32:[0-9]+]], v[[D_F16]]
+; SI-DAG:  v_cmp_lt_f32_e32 vcc, 0.5, v[[B_F32]]
+; SI-DAG:  v_cvt_f32_f16_e32 v[[C_F32:[0-9]+]], v[[C_F16]]
+; SI-DAG:  v_cvt_f32_f16_e32 v[[D_F32:[0-9]+]], v[[D_F16]]
  ; SI:  v_cndmask_b32_e32 v[[R_F32:[0-9]+]], v[[D_F32]], v[[C_F32]]
  ; SI:  v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[R_F32]]
  ; VI:  v_cmp_lt_f16_e32 vcc, 0.5, v[[B_F16]]
@@ -68,9 +68,9 @@ entry:
  ; GCN: buffer_load_ushort v[[C_F16:[0-9]+]]
  ; GCN: buffer_load_ushort v[[D_F16:[0-9]+]]
  ; SI:  v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]]
-; SI:  v_cmp_gt_f32_e32 vcc, 0.5, v[[A_F32]]
-; SI:  v_cvt_f32_f16_e32 v[[C_F32:[0-9]+]], v[[C_F16]]
-; SI:  v_cvt_f32_f16_e32 v[[D_F32:[0-9]+]], v[[D_F16]]
+; SI-DAG:  v_cmp_gt_f32_e32 vcc, 0.5, v[[A_F32]]
+; SI-DAG:  v_cvt_f32_f16_e32 v[[C_F32:[0-9]+]], v[[C_F16]]
+; SI-DAG:  v_cvt_f32_f16_e32 v[[D_F32:[0-9]+]], v[[D_F16]]
  ; SI:  v_cndmask_b32_e32 v[[R_F32:[0-9]+]], v[[D_F32]], v[[C_F32]]
  ; SI:  v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[R_F32]]
  
diff --git a/test/CodeGen/AMDGPU/shl.v2i16.ll b/test/CodeGen/AMDGPU/shl.v2i16.ll

index 0f722723b0a6410c520e804db6941c711c5e22b0..6bbf9363888fb062791621bd077244b41558bbf8 100644 (file)
--- a/test/CodeGen/AMDGPU/shl.v2i16.ll
+++ b/test/CodeGen/AMDGPU/shl.v2i16.ll
@@ -12,10 +12,10 @@
  ; VI: v_lshlrev_b32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
  ; VI: v_or_b32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
  
-; CI: v_lshlrev_b32_e32
-; CI: v_and_b32_e32 v{{[0-9]+}}, 0xffff, v{{[0-9]+}}
-; CI: v_lshlrev_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-; CI: v_lshlrev_b32_e32 v{{[0-9]+}}, 16, v{{[0-9]+}}
+; CI-DAG: v_lshlrev_b32_e32
+; CI-DAG: v_and_b32_e32 v{{[0-9]+}}, 0xffff, v{{[0-9]+}}
+; CI-DAG: v_lshlrev_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+; CI-DAG: v_lshlrev_b32_e32 v{{[0-9]+}}, 16, v{{[0-9]+}}
  ; CI: v_or_b32_e32
  define amdgpu_kernel void @s_shl_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> %lhs, <2 x i16> %rhs) #0 {
    %result = shl <2 x i16> %lhs, %rhs
diff --git a/test/CodeGen/AMDGPU/si-triv-disjoint-mem-access.ll b/test/CodeGen/AMDGPU/si-triv-disjoint-mem-access.ll

index 1109f60528e661c46cd817c26936f8dbc273520e..6d2f89ab96da52c7661fa1d1411a2368e51c3c29 100644 (file)
--- a/test/CodeGen/AMDGPU/si-triv-disjoint-mem-access.ll
+++ b/test/CodeGen/AMDGPU/si-triv-disjoint-mem-access.ll
@@ -63,11 +63,11 @@ define amdgpu_kernel void @no_reorder_local_load_volatile_global_store_local_loa
  ; CI: ds_read_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:12
  ; CI: buffer_store_dword
  
-; GFX9: global_store_dword
-; GFX9: ds_read_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:4
+; GFX9-DAG: global_store_dword
+; GFX9-DAG: ds_read_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:4
  ; GFX9: s_barrier
-; GFX9: ds_read_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:12
-; GFX9: global_store_dword
+; GFX9-DAG: ds_read_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:12
+; GFX9-DAG: global_store_dword
  define amdgpu_kernel void @no_reorder_barrier_local_load_global_store_local_load(i32 addrspace(1)* %out, i32 addrspace(1)* %gptr) #0 {
    %ptr0 = load i32 addrspace(3)*, i32 addrspace(3)* addrspace(3)* @stored_lds_ptr, align 4
  
@@ -256,11 +256,12 @@ define amdgpu_kernel void @reorder_global_offsets(i32 addrspace(1)* nocapture %o
  ; CI: v_mov_b32
  
  ; CI: buffer_store_dword v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
-; CI-NEXT: buffer_store_dword v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:20{{$}}
  
  ; CI: v_add_i32
  ; CI: v_add_i32
  
+; CI: buffer_store_dword v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:20{{$}}
+
  ; CI: buffer_store_dword v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:36{{$}}
  ; CI-NEXT: buffer_store_dword v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:52{{$}}
  
diff --git a/test/CodeGen/AMDGPU/v_cndmask.ll b/test/CodeGen/AMDGPU/v_cndmask.ll

index 5cbfae34e1bb510ae8b29682395659f12cb61c10..4fe30aeb2f1e643dbfd08ceb19480b567b7b7264 100644 (file)
--- a/test/CodeGen/AMDGPU/v_cndmask.ll
+++ b/test/CodeGen/AMDGPU/v_cndmask.ll
@@ -122,7 +122,7 @@ define amdgpu_kernel void @fcmp_sgprX_k0_select_k0_vgprZ_f32(float addrspace(1)*
  ; GCN-LABEL: {{^}}fcmp_sgprX_k0_select_k1_vgprZ_f32:
  ; GCN-DAG: {{buffer|flat}}_load_dword [[Z:v[0-9]+]]
  ; GCN-DAG: s_load_dword [[X:s[0-9]+]]
-; GCN: v_cmp_nlg_f32_e64 [[COND:vcc|s\[[0-9]+:[0-9]+\]]], [[X]], 0
+; GCN-DAG: v_cmp_nlg_f32_e64 [[COND:vcc|s\[[0-9]+:[0-9]+\]]], [[X]], 0
  ; GCN: v_cndmask_b32_e{{32|64}} v{{[0-9]+}}, 1.0, [[Z]], [[COND]]
  define amdgpu_kernel void @fcmp_sgprX_k0_select_k1_vgprZ_f32(float addrspace(1)* %out, float %x, float addrspace(1)* %z.ptr) #0 {
    %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
diff --git a/test/CodeGen/AMDGPU/wqm.ll b/test/CodeGen/AMDGPU/wqm.ll

index e403ad128a06ca530763d3ae56d9a4bd402550d4..915877fc44a7e21822caa16e7d6c13aa3d4af610 100644 (file)
--- a/test/CodeGen/AMDGPU/wqm.ll
+++ b/test/CodeGen/AMDGPU/wqm.ll
@@ -473,8 +473,8 @@ END:
  ;CHECK: image_sample
  ;CHECK: s_and_b64 exec, exec, [[ORIG]]
  ;CHECK: image_sample
-;CHECK: v_cmp
-;CHECK: store
+;CHECK-DAG: v_cmp
+;CHECK-DAG: store
  define amdgpu_ps float @test_control_flow_3(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, i32 %idx, float %coord) {
  main_body:
    %tex = call <4 x float> @llvm.amdgcn.image.sample.v4f32.f32.v8i32(float %coord, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false) #0
author	Stanislav Mekhanoshin <Stanislav.Mekhanoshin@amd.com>
	Tue, 19 Sep 2017 20:54:38 +0000 (20:54 +0000)
committer	Stanislav Mekhanoshin <Stanislav.Mekhanoshin@amd.com>
	Tue, 19 Sep 2017 20:54:38 +0000 (20:54 +0000)
lib/Target/AMDGPU/AMDGPUSubtarget.cpp		patch \| blob \| history
lib/Target/AMDGPU/AMDGPUSubtarget.h		patch \| blob \| history
test/CodeGen/AMDGPU/and.ll		patch \| blob \| history
test/CodeGen/AMDGPU/ashr.v2i16.ll		patch \| blob \| history
test/CodeGen/AMDGPU/br_cc.f16.ll		patch \| blob \| history
test/CodeGen/AMDGPU/call-argument-types.ll		patch \| blob \| history
test/CodeGen/AMDGPU/callee-special-input-vgprs.ll		patch \| blob \| history
test/CodeGen/AMDGPU/cluster-flat-loads-postra.mir	[new file with mode: 0644]	patch \| blob
test/CodeGen/AMDGPU/fcopysign.f16.ll		patch \| blob \| history
test/CodeGen/AMDGPU/fcopysign.f64.ll		patch \| blob \| history
test/CodeGen/AMDGPU/frame-index-amdgiz.ll		patch \| blob \| history
test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll		patch \| blob \| history
test/CodeGen/AMDGPU/llvm.fmuladd.f16.ll		patch \| blob \| history
test/CodeGen/AMDGPU/load-global-i32.ll		patch \| blob \| history
test/CodeGen/AMDGPU/load-weird-sizes.ll		patch \| blob \| history
test/CodeGen/AMDGPU/lshr.v2i16.ll		patch \| blob \| history
test/CodeGen/AMDGPU/move-addr64-rsrc-dead-subreg-writes.ll		patch \| blob \| history
test/CodeGen/AMDGPU/salu-to-valu.ll		patch \| blob \| history
test/CodeGen/AMDGPU/schedule-kernel-arg-loads.ll		patch \| blob \| history
test/CodeGen/AMDGPU/select-vectors.ll		patch \| blob \| history
test/CodeGen/AMDGPU/select.f16.ll		patch \| blob \| history
test/CodeGen/AMDGPU/shl.v2i16.ll		patch \| blob \| history
test/CodeGen/AMDGPU/si-triv-disjoint-mem-access.ll		patch \| blob \| history
test/CodeGen/AMDGPU/v_cndmask.ll		patch \| blob \| history
test/CodeGen/AMDGPU/wqm.ll		patch \| blob \| history