[AMDGPU] Switch scalarize global loads ON by default

author Alexander Timofeev <Alexander.Timofeev@amd.com>

Mon, 3 Jul 2017 14:54:11 +0000 (14:54 +0000)

committer Alexander Timofeev <Alexander.Timofeev@amd.com>

Mon, 3 Jul 2017 14:54:11 +0000 (14:54 +0000)
author Alexander Timofeev <Alexander.Timofeev@amd.com>
Mon, 3 Jul 2017 14:54:11 +0000 (14:54 +0000)
committer Alexander Timofeev <Alexander.Timofeev@amd.com>
Mon, 3 Jul 2017 14:54:11 +0000 (14:54 +0000)
diff --git a/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp

index 425fd35d47de666623a4cf18e005084ececcd962..0e1b6ea4d9a0495dae064014d71b1d61cbbc5752 100644 (file)
--- a/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -85,7 +85,7 @@ static cl::opt<bool> EnableLoadStoreVectorizer(
  static cl::opt<bool> ScalarizeGlobal(
    "amdgpu-scalarize-global-loads",
    cl::desc("Enable global load scalarization"),
-  cl::init(false),
+  cl::init(true),
    cl::Hidden);
  
  // Option to run internalize pass.
diff --git a/test/CodeGen/AMDGPU/add.ll b/test/CodeGen/AMDGPU/add.ll

index 7e4546d2cfb3f0b2c54007450a4212f9f3bd4ad0..6dcd7c234dc6d6d54c0f36f6db52ad5a6eb83154 100644 (file)
--- a/test/CodeGen/AMDGPU/add.ll
+++ b/test/CodeGen/AMDGPU/add.ll
@@ -5,9 +5,9 @@
  ;FUNC-LABEL: {{^}}test1:
  ;EG: ADD_INT {{[* ]*}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
  
-;SI: v_add_i32_e32 [[REG:v[0-9]+]], vcc, {{v[0-9]+, v[0-9]+}}
-;SI-NOT: [[REG]]
-;SI: buffer_store_dword [[REG]],
+;SI: s_add_i32 s[[REG:[0-9]+]], {{s[0-9]+, s[0-9]+}}
+;SI: v_mov_b32_e32 v[[REG]], s[[REG]]
+;SI: buffer_store_dword v[[REG]],
  define amdgpu_kernel void @test1(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
    %b_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1
    %a = load i32, i32 addrspace(1)* %in
@@ -21,8 +21,8 @@ define amdgpu_kernel void @test1(i32 addrspace(1)* %out, i32 addrspace(1)* %in)
  ;EG: ADD_INT {{[* ]*}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
  ;EG: ADD_INT {{[* ]*}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
  
-;SI: v_add_i32_e32 v{{[0-9]+, vcc, v[0-9]+, v[0-9]+}}
-;SI: v_add_i32_e32 v{{[0-9]+, vcc, v[0-9]+, v[0-9]+}}
+;SI: s_add_i32 s{{[0-9]+, s[0-9]+, s[0-9]+}}
+;SI: s_add_i32 s{{[0-9]+, s[0-9]+, s[0-9]+}}
  
  define amdgpu_kernel void @test2(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) {
    %b_ptr = getelementptr <2 x i32>, <2 x i32> addrspace(1)* %in, i32 1
@@ -39,10 +39,10 @@ define amdgpu_kernel void @test2(<2 x i32> addrspace(1)* %out, <2 x i32> addrspa
  ;EG: ADD_INT {{[* ]*}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
  ;EG: ADD_INT {{[* ]*}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
  
-;SI: v_add_i32_e32 v{{[0-9]+, vcc, v[0-9]+, v[0-9]+}}
-;SI: v_add_i32_e32 v{{[0-9]+, vcc, v[0-9]+, v[0-9]+}}
-;SI: v_add_i32_e32 v{{[0-9]+, vcc, v[0-9]+, v[0-9]+}}
-;SI: v_add_i32_e32 v{{[0-9]+, vcc, v[0-9]+, v[0-9]+}}
+;SI: s_add_i32 s{{[0-9]+, s[0-9]+, s[0-9]+}}
+;SI: s_add_i32 s{{[0-9]+, s[0-9]+, s[0-9]+}}
+;SI: s_add_i32 s{{[0-9]+, s[0-9]+, s[0-9]+}}
+;SI: s_add_i32 s{{[0-9]+, s[0-9]+, s[0-9]+}}
  
  define amdgpu_kernel void @test4(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
    %b_ptr = getelementptr <4 x i32>, <4 x i32> addrspace(1)* %in, i32 1
diff --git a/test/CodeGen/AMDGPU/add_i128.ll b/test/CodeGen/AMDGPU/add_i128.ll

index 00a125c2e44fb16f846f8c7d6aa678edc9db341e..d33965d4dda7a47755e32d78d9ad5fd36650b2dd 100644 (file)
--- a/test/CodeGen/AMDGPU/add_i128.ll
+++ b/test/CodeGen/AMDGPU/add_i128.ll
@@ -19,10 +19,10 @@ define amdgpu_kernel void @test_i128_vreg(i128 addrspace(1)* noalias %out, i128
  
  ; Check that the SGPR add operand is correctly moved to a VGPR.
  ; GCN-LABEL: {{^}}sgpr_operand:
-; GCN: v_add_i32
-; GCN: v_addc_u32
-; GCN: v_addc_u32
-; GCN: v_addc_u32
+; GCN: s_add_u32
+; GCN: s_addc_u32
+; GCN: s_addc_u32
+; GCN: s_addc_u32
  define amdgpu_kernel void @sgpr_operand(i128 addrspace(1)* noalias %out, i128 addrspace(1)* noalias %in, i128 %a) {
    %foo = load i128, i128 addrspace(1)* %in, align 8
    %result = add i128 %foo, %a
@@ -31,10 +31,10 @@ define amdgpu_kernel void @sgpr_operand(i128 addrspace(1)* noalias %out, i128 ad
  }
  
  ; GCN-LABEL: {{^}}sgpr_operand_reversed:
-; GCN: v_add_i32
-; GCN: v_addc_u32
-; GCN: v_addc_u32
-; GCN: v_addc_u32
+; GCN: s_add_u32
+; GCN: s_addc_u32
+; GCN: s_addc_u32
+; GCN: s_addc_u32
  define amdgpu_kernel void @sgpr_operand_reversed(i128 addrspace(1)* noalias %out, i128 addrspace(1)* noalias %in, i128 %a) {
    %foo = load i128, i128 addrspace(1)* %in, align 8
    %result = add i128 %a, %foo
diff --git a/test/CodeGen/AMDGPU/add_i64.ll b/test/CodeGen/AMDGPU/add_i64.ll

index 62733d5bfb6c93aaa87101c5ac42b279fdc57fdd..f673d91192b847ca46dec33613dd89adbf5120fc 100644 (file)
--- a/test/CodeGen/AMDGPU/add_i64.ll
+++ b/test/CodeGen/AMDGPU/add_i64.ll
@@ -19,8 +19,8 @@ define amdgpu_kernel void @test_i64_vreg(i64 addrspace(1)* noalias %out, i64 add
  
  ; Check that the SGPR add operand is correctly moved to a VGPR.
  ; SI-LABEL: {{^}}sgpr_operand:
-; SI: v_add_i32
-; SI: v_addc_u32
+; SI: s_add_u32
+; SI: s_addc_u32
  define amdgpu_kernel void @sgpr_operand(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in, i64 addrspace(1)* noalias %in_bar, i64 %a) {
    %foo = load i64, i64 addrspace(1)* %in, align 8
    %result = add i64 %foo, %a
@@ -32,8 +32,8 @@ define amdgpu_kernel void @sgpr_operand(i64 addrspace(1)* noalias %out, i64 addr
  ; SGPR as other operand.
  ;
  ; SI-LABEL: {{^}}sgpr_operand_reversed:
-; SI: v_add_i32
-; SI: v_addc_u32
+; SI: s_add_u32
+; SI: s_addc_u32
  define amdgpu_kernel void @sgpr_operand_reversed(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in, i64 %a) {
    %foo = load i64, i64 addrspace(1)* %in, align 8
    %result = add i64 %a, %foo
diff --git a/test/CodeGen/AMDGPU/and-gcn.ll b/test/CodeGen/AMDGPU/and-gcn.ll

index 2aec03aff8a3aad43987050fb5a2f46babbaf3a1..ef11ae87267eb13fd887b5765a6d804b612ddd15 100644 (file)
--- a/test/CodeGen/AMDGPU/and-gcn.ll
+++ b/test/CodeGen/AMDGPU/and-gcn.ll
@@ -2,8 +2,7 @@
  ; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
  
  ; FUNC-LABEL: {{^}}v_and_i64_br:
-; SI: v_and_b32
-; SI: v_and_b32
+; SI: s_and_b64
  define amdgpu_kernel void @v_and_i64_br(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr) {
  entry:
    %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
diff --git a/test/CodeGen/AMDGPU/and.ll b/test/CodeGen/AMDGPU/and.ll

index c356f8b87cfc6d920dcfe81a54b2bb77a288de55..ee0190149e92eaf30182932670bba47f4efa063d 100644 (file)
--- a/test/CodeGen/AMDGPU/and.ll
+++ b/test/CodeGen/AMDGPU/and.ll
@@ -8,8 +8,8 @@ declare i32 @llvm.r600.read.tidig.x() #0
  ; EG: AND_INT {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
  ; EG: AND_INT {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
  
-; SI: v_and_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
-; SI: v_and_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
+; SI: s_and_b32 s{{[0-9]+, s[0-9]+, s[0-9]+}}
+; SI: s_and_b32 s{{[0-9]+, s[0-9]+, s[0-9]+}}
  
  define amdgpu_kernel void @test2(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) {
    %b_ptr = getelementptr <2 x i32>, <2 x i32> addrspace(1)* %in, i32 1
@@ -26,10 +26,11 @@ define amdgpu_kernel void @test2(<2 x i32> addrspace(1)* %out, <2 x i32> addrspa
  ; EG: AND_INT {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
  ; EG: AND_INT {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
  
-; SI: v_and_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
-; SI: v_and_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
-; SI: v_and_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
-; SI: v_and_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
+
+; SI: s_and_b32 s{{[0-9]+, s[0-9]+, s[0-9]+}}
+; SI: s_and_b32 s{{[0-9]+, s[0-9]+, s[0-9]+}}
+; SI: s_and_b32 s{{[0-9]+, s[0-9]+, s[0-9]+}}
+; SI: s_and_b32 s{{[0-9]+, s[0-9]+, s[0-9]+}}
  
  define amdgpu_kernel void @test4(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
    %b_ptr = getelementptr <4 x i32>, <4 x i32> addrspace(1)* %in, i32 1
@@ -136,7 +137,9 @@ define amdgpu_kernel void @v_and_i32_vgpr_sgpr(i32 addrspace(1)* %out, i32 addrs
  ; FUNC-LABEL: {{^}}v_and_constant_i32
  ; SI: v_and_b32_e32 v{{[0-9]+}}, 0x12d687, v{{[0-9]+}}
  define amdgpu_kernel void @v_and_constant_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr) {
-  %a = load i32, i32 addrspace(1)* %aptr, align 4
+  %tid = call i32 @llvm.r600.read.tidig.x() #0
+  %gep = getelementptr i32, i32 addrspace(1)* %aptr, i32 %tid
+  %a = load i32, i32 addrspace(1)* %gep, align 4
    %and = and i32 %a, 1234567
    store i32 %and, i32 addrspace(1)* %out, align 4
    ret void
@@ -145,7 +148,9 @@ define amdgpu_kernel void @v_and_constant_i32(i32 addrspace(1)* %out, i32 addrsp
  ; FUNC-LABEL: {{^}}v_and_inline_imm_64_i32
  ; SI: v_and_b32_e32 v{{[0-9]+}}, 64, v{{[0-9]+}}
  define amdgpu_kernel void @v_and_inline_imm_64_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr) {
-  %a = load i32, i32 addrspace(1)* %aptr, align 4
+  %tid = call i32 @llvm.r600.read.tidig.x() #0
+  %gep = getelementptr i32, i32 addrspace(1)* %aptr, i32 %tid
+  %a = load i32, i32 addrspace(1)* %gep, align 4
    %and = and i32 %a, 64
    store i32 %and, i32 addrspace(1)* %out, align 4
    ret void
@@ -154,7 +159,9 @@ define amdgpu_kernel void @v_and_inline_imm_64_i32(i32 addrspace(1)* %out, i32 a
  ; FUNC-LABEL: {{^}}v_and_inline_imm_neg_16_i32
  ; SI: v_and_b32_e32 v{{[0-9]+}}, -16, v{{[0-9]+}}
  define amdgpu_kernel void @v_and_inline_imm_neg_16_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr) {
-  %a = load i32, i32 addrspace(1)* %aptr, align 4
+  %tid = call i32 @llvm.r600.read.tidig.x() #0
+  %gep = getelementptr i32, i32 addrspace(1)* %aptr, i32 %tid
+  %a = load i32, i32 addrspace(1)* %gep, align 4
    %and = and i32 %a, -16
    store i32 %and, i32 addrspace(1)* %out, align 4
    ret void
@@ -239,8 +246,11 @@ define amdgpu_kernel void @s_and_multi_use_inline_imm_i64(i64 addrspace(1)* %out
  ; SI: v_and_b32
  ; SI: v_and_b32
  define amdgpu_kernel void @v_and_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr) {
-  %a = load i64, i64 addrspace(1)* %aptr, align 8
-  %b = load i64, i64 addrspace(1)* %bptr, align 8
+  %tid = call i32 @llvm.r600.read.tidig.x() #0
+  %gep.a = getelementptr i64, i64 addrspace(1)* %aptr, i32 %tid
+  %a = load i64, i64 addrspace(1)* %gep.a, align 8
+  %gep.b = getelementptr i64, i64 addrspace(1)* %bptr, i32 %tid
+  %b = load i64, i64 addrspace(1)* %gep.b, align 8
    %and = and i64 %a, %b
    store i64 %and, i64 addrspace(1)* %out, align 8
    ret void
@@ -251,7 +261,9 @@ define amdgpu_kernel void @v_and_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %
  ; SI-DAG: v_and_b32_e32 {{v[0-9]+}}, 0x11e, {{v[0-9]+}}
  ; SI: buffer_store_dwordx2
  define amdgpu_kernel void @v_and_constant_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr) {
-  %a = load i64, i64 addrspace(1)* %aptr, align 8
+  %tid = call i32 @llvm.r600.read.tidig.x() #0
+  %gep.a = getelementptr i64, i64 addrspace(1)* %aptr, i32 %tid
+  %a = load i64, i64 addrspace(1)* %gep.a, align 8
    %and = and i64 %a, 1231231234567
    store i64 %and, i64 addrspace(1)* %out, align 8
    ret void
@@ -299,26 +311,30 @@ define amdgpu_kernel void @v_and_multi_use_inline_imm_i64(i64 addrspace(1)* %out
  }
  
  ; FUNC-LABEL: {{^}}v_and_i64_32_bit_constant:
-; SI: buffer_load_dword [[VAL:v[0-9]+]]
+; SI: {{buffer|flat}}_load_dword [[VAL:v[0-9]+]]
  ; SI-NOT: and
  ; SI: v_and_b32_e32 {{v[0-9]+}}, 0x12d687, [[VAL]]
  ; SI-NOT: and
  ; SI: buffer_store_dwordx2
  define amdgpu_kernel void @v_and_i64_32_bit_constant(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr) {
-  %a = load i64, i64 addrspace(1)* %aptr, align 8
+  %tid = call i32 @llvm.r600.read.tidig.x() #0
+  %gep.a = getelementptr i64, i64 addrspace(1)* %aptr, i32 %tid
+  %a = load i64, i64 addrspace(1)* %gep.a, align 8
    %and = and i64 %a, 1234567
    store i64 %and, i64 addrspace(1)* %out, align 8
    ret void
  }
  
  ; FUNC-LABEL: {{^}}v_and_inline_imm_i64:
-; SI: buffer_load_dword v{{[0-9]+}}
+; SI: {{buffer|flat}}_load_dword v{{[0-9]+}}
  ; SI-NOT: and
  ; SI: v_and_b32_e32 {{v[0-9]+}}, 64, {{v[0-9]+}}
  ; SI-NOT: and
  ; SI: buffer_store_dwordx2
  define amdgpu_kernel void @v_and_inline_imm_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr) {
-  %a = load i64, i64 addrspace(1)* %aptr, align 8
+  %tid = call i32 @llvm.r600.read.tidig.x() #0
+  %gep.a = getelementptr i64, i64 addrspace(1)* %aptr, i32 %tid
+  %a = load i64, i64 addrspace(1)* %gep.a, align 8
    %and = and i64 %a, 64
    store i64 %and, i64 addrspace(1)* %out, align 8
    ret void
@@ -326,13 +342,15 @@ define amdgpu_kernel void @v_and_inline_imm_i64(i64 addrspace(1)* %out, i64 addr
  
  ; FIXME: Should be able to reduce load width
  ; FUNC-LABEL: {{^}}v_and_inline_neg_imm_i64:
-; SI: buffer_load_dwordx2 v{{\[}}[[VAL_LO:[0-9]+]]:[[VAL_HI:[0-9]+]]{{\]}}
+; SI: {{buffer|flat}}_load_dwordx2 v{{\[}}[[VAL_LO:[0-9]+]]:[[VAL_HI:[0-9]+]]{{\]}}
  ; SI-NOT: and
  ; SI: v_and_b32_e32 v[[VAL_LO]], -8, v[[VAL_LO]]
  ; SI-NOT: and
  ; SI: buffer_store_dwordx2 v{{\[}}[[VAL_LO]]:[[VAL_HI]]{{\]}}
  define amdgpu_kernel void @v_and_inline_neg_imm_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr) {
-  %a = load i64, i64 addrspace(1)* %aptr, align 8
+  %tid = call i32 @llvm.r600.read.tidig.x() #0
+  %gep.a = getelementptr i64, i64 addrspace(1)* %aptr, i32 %tid
+  %a = load i64, i64 addrspace(1)* %gep.a, align 8
    %and = and i64 %a, -8
    store i64 %and, i64 addrspace(1)* %out, align 8
    ret void
@@ -549,5 +567,4 @@ define amdgpu_kernel void @s_and_inline_high_imm_f32_neg_4.0_i64(i64 addrspace(1
    store i64 %and, i64 addrspace(1)* %out, align 8
    ret void
  }
-
  attributes #0 = { nounwind readnone }
diff --git a/test/CodeGen/AMDGPU/any_extend_vector_inreg.ll b/test/CodeGen/AMDGPU/any_extend_vector_inreg.ll

index c61c23222bc7e4aebf73c1a0dcb08447bd0a5157..cdc60ab504e014a4efe3201b2c77dae3f6125084 100644 (file)
--- a/test/CodeGen/AMDGPU/any_extend_vector_inreg.ll
+++ b/test/CodeGen/AMDGPU/any_extend_vector_inreg.ll
@@ -2,9 +2,9 @@
  ; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
  
  ; GCN-LABEL: {{^}}any_extend_vector_inreg_v16i8_to_v4i32:
-; GCN: {{buffer|flat}}_load_dwordx4
-; GCN-DAG: {{buffer|flat}}_load_dwordx4
-; GCN-DAG: {{buffer|flat}}_load_dword
+; GCN: s_load_dwordx4
+; GCN-DAG: s_load_dwordx4
+; GCN-DAG: s_load_dword
  
  ; GCN: {{buffer|flat}}_store_byte
  ; GCN: {{buffer|flat}}_store_byte
diff --git a/test/CodeGen/AMDGPU/bitreverse.ll b/test/CodeGen/AMDGPU/bitreverse.ll

index 539373f7bdeb4e27faffcfb68135603cc5556ef5..f29bfb46b94bdb92fe6d1a1d0d5337fef962bd7e 100644 (file)
--- a/test/CodeGen/AMDGPU/bitreverse.ll
+++ b/test/CodeGen/AMDGPU/bitreverse.ll
@@ -2,6 +2,8 @@
  ; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
  ; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=FUNC %s
  
+declare i32 @llvm.amdgcn.workitem.id.x() #1
+
  declare i16 @llvm.bitreverse.i16(i16) #1
  declare i32 @llvm.bitreverse.i32(i32) #1
  declare i64 @llvm.bitreverse.i64(i64) #1
@@ -42,12 +44,14 @@ define amdgpu_kernel void @s_brev_i32(i32 addrspace(1)* noalias %out, i32 %val)
  }
  
  ; FUNC-LABEL: {{^}}v_brev_i32:
-; SI: buffer_load_dword [[VAL:v[0-9]+]],
+; SI: {{buffer|flat}}_load_dword [[VAL:v[0-9]+]],
  ; SI: v_bfrev_b32_e32 [[RESULT:v[0-9]+]], [[VAL]]
  ; SI: buffer_store_dword [[RESULT]],
  ; SI: s_endpgm
  define amdgpu_kernel void @v_brev_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) #0 {
-  %val = load i32, i32 addrspace(1)* %valptr
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %gep = getelementptr i32, i32 addrspace(1)* %valptr, i32 %tid
+  %val = load i32, i32 addrspace(1)* %gep
    %brev = call i32 @llvm.bitreverse.i32(i32 %val) #1
    store i32 %brev, i32 addrspace(1)* %out
    ret void
@@ -66,7 +70,9 @@ define amdgpu_kernel void @s_brev_v2i32(<2 x i32> addrspace(1)* noalias %out, <2
  ; SI: v_bfrev_b32_e32
  ; SI: v_bfrev_b32_e32
  define amdgpu_kernel void @v_brev_v2i32(<2 x i32> addrspace(1)* noalias %out, <2 x i32> addrspace(1)* noalias %valptr) #0 {
-  %val = load <2 x i32>, <2 x i32> addrspace(1)* %valptr
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %gep = getelementptr <2 x i32>, <2 x i32> addrspace(1)* %valptr, i32 %tid
+  %val = load <2 x i32>, <2 x i32> addrspace(1)* %gep
    %brev = call <2 x i32> @llvm.bitreverse.v2i32(<2 x i32> %val) #1
    store <2 x i32> %brev, <2 x i32> addrspace(1)* %out
    ret void
@@ -82,7 +88,9 @@ define amdgpu_kernel void @s_brev_i64(i64 addrspace(1)* noalias %out, i64 %val)
  ; FUNC-LABEL: {{^}}v_brev_i64:
  ; SI-NOT: v_or_b32_e64 v{{[0-9]+}}, 0, 0
  define amdgpu_kernel void @v_brev_i64(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %valptr) #0 {
-  %val = load i64, i64 addrspace(1)* %valptr
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %gep = getelementptr i64, i64 addrspace(1)* %valptr, i32 %tid
+  %val = load i64, i64 addrspace(1)* %gep
    %brev = call i64 @llvm.bitreverse.i64(i64 %val) #1
    store i64 %brev, i64 addrspace(1)* %out
    ret void
@@ -97,7 +105,9 @@ define amdgpu_kernel void @s_brev_v2i64(<2 x i64> addrspace(1)* noalias %out, <2
  
  ; FUNC-LABEL: {{^}}v_brev_v2i64:
  define amdgpu_kernel void @v_brev_v2i64(<2 x i64> addrspace(1)* noalias %out, <2 x i64> addrspace(1)* noalias %valptr) #0 {
-  %val = load <2 x i64>, <2 x i64> addrspace(1)* %valptr
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %gep = getelementptr <2 x i64> , <2 x i64> addrspace(1)* %valptr, i32 %tid
+  %val = load <2 x i64>, <2 x i64> addrspace(1)* %gep
    %brev = call <2 x i64> @llvm.bitreverse.v2i64(<2 x i64> %val) #1
    store <2 x i64> %brev, <2 x i64> addrspace(1)* %out
    ret void
diff --git a/test/CodeGen/AMDGPU/bswap.ll b/test/CodeGen/AMDGPU/bswap.ll

index d2dacd7c17b3f119a20119e92b25172f5e6e6a76..eb3fc2fab34fdb998f818251e63fc622d9e4ef2e 100644 (file)
--- a/test/CodeGen/AMDGPU/bswap.ll
+++ b/test/CodeGen/AMDGPU/bswap.ll
@@ -10,7 +10,7 @@ declare <2 x i64> @llvm.bswap.v2i64(<2 x i64>) nounwind readnone
  declare <4 x i64> @llvm.bswap.v4i64(<4 x i64>) nounwind readnone
  
  ; FUNC-LABEL: @test_bswap_i32
-; SI: buffer_load_dword [[VAL:v[0-9]+]]
+; SI: s_load_dword [[VAL:s[0-9]+]]
  ; SI-DAG: v_alignbit_b32 [[TMP0:v[0-9]+]], [[VAL]], [[VAL]], 8
  ; SI-DAG: v_alignbit_b32 [[TMP1:v[0-9]+]], [[VAL]], [[VAL]], 24
  ; SI-DAG: s_mov_b32 [[K:s[0-9]+]], 0xff00ff
diff --git a/test/CodeGen/AMDGPU/cgp-addressing-modes-flat.ll b/test/CodeGen/AMDGPU/cgp-addressing-modes-flat.ll

index 5dec3e35ab3d0d74a51bd887295e754f5e7a4adc..c114332a58872c7f17614d2344a132ae88a51b97 100644 (file)
--- a/test/CodeGen/AMDGPU/cgp-addressing-modes-flat.ll
+++ b/test/CodeGen/AMDGPU/cgp-addressing-modes-flat.ll
@@ -1,9 +1,9 @@
  ; RUN: opt -S -codegenprepare -mtriple=amdgcn-unknown-unknown -mcpu=bonaire < %s | FileCheck -check-prefix=OPT -check-prefix=OPT-CI -check-prefix=OPT-CIVI %s
  ; RUN: opt -S -codegenprepare -mtriple=amdgcn-unknown-unknown -mcpu=tonga -mattr=-flat-for-global < %s | FileCheck -check-prefix=OPT -check-prefix=OPT-VI -check-prefix=OPT-CIVI %s
  ; RUN: opt -S -codegenprepare -mtriple=amdgcn-unknown-unknown -mcpu=gfx900 -mattr=-flat-for-global < %s | FileCheck -check-prefix=OPT -check-prefix=OPT-GFX9 %s
-; RUN: llc -march=amdgcn -mcpu=bonaire -mattr=-promote-alloca < %s | FileCheck -check-prefix=GCN -check-prefix=CI -check-prefix=CIVI %s
-; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -mattr=-promote-alloca < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=CIVI %s
-; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -mattr=-promote-alloca < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 %s
+; RUN: llc -march=amdgcn -amdgpu-scalarize-global-loads=false -mcpu=bonaire -mattr=-promote-alloca < %s | FileCheck -check-prefix=GCN -check-prefix=CI -check-prefix=CIVI %s
+; RUN: llc -march=amdgcn -amdgpu-scalarize-global-loads=false -mcpu=tonga -mattr=-flat-for-global -mattr=-promote-alloca < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=CIVI %s
+; RUN: llc -march=amdgcn -amdgpu-scalarize-global-loads=false -mcpu=gfx900 -mattr=-flat-for-global -mattr=-promote-alloca < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 %s
  
  ; OPT-LABEL: @test_no_sink_flat_small_offset_i32(
  ; OPT-CIVI: getelementptr i32, i32 addrspace(4)* %in
@@ -40,7 +40,7 @@ done:
  
  ; OPT-LABEL: @test_sink_noop_addrspacecast_flat_to_global_i32(
  ; OPT: getelementptr i32, i32 addrspace(4)* %out,
-; OPT-CI-NOT: getelementptr
+; rOPT-CI-NOT: getelementptr
  ; OPT: br i1
  
  ; OPT-CI: addrspacecast
diff --git a/test/CodeGen/AMDGPU/cgp-addressing-modes.ll b/test/CodeGen/AMDGPU/cgp-addressing-modes.ll

index c1cf56e5058eca39dab6af53816c79e141b647db..c01d834bc33d676321be2c1c2eaa2d865d4da9d1 100644 (file)
--- a/test/CodeGen/AMDGPU/cgp-addressing-modes.ll
+++ b/test/CodeGen/AMDGPU/cgp-addressing-modes.ll
@@ -1,9 +1,9 @@
  ; RUN: opt -S -codegenprepare -mtriple=amdgcn-unknown-unknown -mcpu=tahiti < %s | FileCheck -check-prefix=OPT -check-prefix=OPT-SI %s
  ; RUN: opt -S -codegenprepare -mtriple=amdgcn-unknown-unknown -mcpu=bonaire < %s | FileCheck -check-prefix=OPT -check-prefix=OPT-CI %s
  ; RUN: opt -S -codegenprepare -mtriple=amdgcn-unknown-unknown -mcpu=tonga -mattr=-flat-for-global < %s | FileCheck -check-prefix=OPT -check-prefix=OPT-VI %s
-; RUN: llc -march=amdgcn -mcpu=tahiti -mattr=-promote-alloca -amdgpu-sroa=0 < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
-; RUN: llc -march=amdgcn -mcpu=bonaire -mattr=-promote-alloca -amdgpu-sroa=0 < %s | FileCheck -check-prefix=GCN -check-prefix=CI %s
-; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -mattr=-promote-alloca -amdgpu-sroa=0 < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
+; RUN: llc -march=amdgcn -mcpu=tahiti -amdgpu-scalarize-global-loads=false -mattr=-promote-alloca -amdgpu-sroa=0 < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
+; RUN: llc -march=amdgcn -mcpu=bonaire -amdgpu-scalarize-global-loads=false -mattr=-promote-alloca -amdgpu-sroa=0 < %s | FileCheck -check-prefix=GCN -check-prefix=CI %s
+; RUN: llc -march=amdgcn -mcpu=tonga -amdgpu-scalarize-global-loads=false -mattr=-flat-for-global -mattr=-promote-alloca -amdgpu-sroa=0 < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
  
  target datalayout = "e-p:32:32-p1:64:64-p2:64:64-p3:32:32-p4:64:64-p5:32:32-p24:64:64-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64"
  
diff --git a/test/CodeGen/AMDGPU/coalescer_remat.ll b/test/CodeGen/AMDGPU/coalescer_remat.ll

index 3e1b76a1df09453533c7990fea4022e572c852b1..14b798ba822b7a2635c76571dc165222e6d1d9b8 100644 (file)
--- a/test/CodeGen/AMDGPU/coalescer_remat.ll
+++ b/test/CodeGen/AMDGPU/coalescer_remat.ll
@@ -12,7 +12,7 @@ declare float @llvm.fma.f32(float, float, float)
  ; CHECK:  v_mov_b32_e32 v{{[0-9]+}}, 0
  ; CHECK:  v_mov_b32_e32 v{{[0-9]+}}, 0
  ; It's probably OK if this is slightly higher:
-; CHECK: ; NumVgprs: 8
+; CHECK: ; NumVgprs: 4
  define amdgpu_kernel void @foobar(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in, i32 %flag) {
  entry:
    %cmpflag = icmp eq i32 %flag, 1
diff --git a/test/CodeGen/AMDGPU/copy-illegal-type.ll b/test/CodeGen/AMDGPU/copy-illegal-type.ll

index d772d1b679369800398bdf64a83a11a321e6b16b..e39bd60a1cc88cffd9e28b2ea45067f6a9cf64c9 100644 (file)
--- a/test/CodeGen/AMDGPU/copy-illegal-type.ll
+++ b/test/CodeGen/AMDGPU/copy-illegal-type.ll
@@ -5,35 +5,41 @@ declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
  declare i32 @llvm.amdgcn.workitem.id.y() nounwind readnone
  
  ; FUNC-LABEL: {{^}}test_copy_v4i8:
-; GCN: buffer_load_dword [[REG:v[0-9]+]]
+; GCN: {{buffer|flat}}_load_dword [[REG:v[0-9]+]]
  ; GCN: buffer_store_dword [[REG]]
  ; GCN: s_endpgm
  define amdgpu_kernel void @test_copy_v4i8(<4 x i8> addrspace(1)* %out, <4 x i8> addrspace(1)* %in) nounwind {
-  %val = load <4 x i8>, <4 x i8> addrspace(1)* %in, align 4
+  %tid.x = call i32 @llvm.amdgcn.workitem.id.x()
+  %gep = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %in, i32 %tid.x
+  %val = load <4 x i8>, <4 x i8> addrspace(1)* %gep, align 4
    store <4 x i8> %val, <4 x i8> addrspace(1)* %out, align 4
    ret void
  }
  
  ; FUNC-LABEL: {{^}}test_copy_v4i8_x2:
-; GCN: buffer_load_dword [[REG:v[0-9]+]]
+; GCN: {{buffer|flat}}_load_dword [[REG:v[0-9]+]]
  ; GCN: buffer_store_dword [[REG]]
  ; GCN: buffer_store_dword [[REG]]
  ; GCN: s_endpgm
  define amdgpu_kernel void @test_copy_v4i8_x2(<4 x i8> addrspace(1)* %out0, <4 x i8> addrspace(1)* %out1, <4 x i8> addrspace(1)* %in) nounwind {
-  %val = load <4 x i8>, <4 x i8> addrspace(1)* %in, align 4
+  %tid.x = call i32 @llvm.amdgcn.workitem.id.x()
+  %gep = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %in, i32 %tid.x
+  %val = load <4 x i8>, <4 x i8> addrspace(1)* %gep, align 4
    store <4 x i8> %val, <4 x i8> addrspace(1)* %out0, align 4
    store <4 x i8> %val, <4 x i8> addrspace(1)* %out1, align 4
    ret void
  }
  
  ; FUNC-LABEL: {{^}}test_copy_v4i8_x3:
-; GCN: buffer_load_dword [[REG:v[0-9]+]]
+; GCN: {{buffer|flat}}_load_dword [[REG:v[0-9]+]]
  ; GCN: buffer_store_dword [[REG]]
  ; GCN: buffer_store_dword [[REG]]
  ; GCN: buffer_store_dword [[REG]]
  ; GCN: s_endpgm
  define amdgpu_kernel void @test_copy_v4i8_x3(<4 x i8> addrspace(1)* %out0, <4 x i8> addrspace(1)* %out1, <4 x i8> addrspace(1)* %out2, <4 x i8> addrspace(1)* %in) nounwind {
-  %val = load <4 x i8>, <4 x i8> addrspace(1)* %in, align 4
+  %tid.x = call i32 @llvm.amdgcn.workitem.id.x()
+  %gep = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %in, i32 %tid.x
+  %val = load <4 x i8>, <4 x i8> addrspace(1)* %gep, align 4
    store <4 x i8> %val, <4 x i8> addrspace(1)* %out0, align 4
    store <4 x i8> %val, <4 x i8> addrspace(1)* %out1, align 4
    store <4 x i8> %val, <4 x i8> addrspace(1)* %out2, align 4
@@ -41,14 +47,16 @@ define amdgpu_kernel void @test_copy_v4i8_x3(<4 x i8> addrspace(1)* %out0, <4 x
  }
  
  ; FUNC-LABEL: {{^}}test_copy_v4i8_x4:
-; GCN: buffer_load_dword [[REG:v[0-9]+]]
+; GCN: {{buffer|flat}}_load_dword [[REG:v[0-9]+]]
  ; GCN: buffer_store_dword [[REG]]
  ; GCN: buffer_store_dword [[REG]]
  ; GCN: buffer_store_dword [[REG]]
  ; GCN: buffer_store_dword [[REG]]
  ; GCN: s_endpgm
  define amdgpu_kernel void @test_copy_v4i8_x4(<4 x i8> addrspace(1)* %out0, <4 x i8> addrspace(1)* %out1, <4 x i8> addrspace(1)* %out2, <4 x i8> addrspace(1)* %out3, <4 x i8> addrspace(1)* %in) nounwind {
-  %val = load <4 x i8>, <4 x i8> addrspace(1)* %in, align 4
+  %tid.x = call i32 @llvm.amdgcn.workitem.id.x()
+  %gep = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %in, i32 %tid.x
+  %val = load <4 x i8>, <4 x i8> addrspace(1)* %gep, align 4
    store <4 x i8> %val, <4 x i8> addrspace(1)* %out0, align 4
    store <4 x i8> %val, <4 x i8> addrspace(1)* %out1, align 4
    store <4 x i8> %val, <4 x i8> addrspace(1)* %out2, align 4
@@ -57,7 +65,7 @@ define amdgpu_kernel void @test_copy_v4i8_x4(<4 x i8> addrspace(1)* %out0, <4 x
  }
  
  ; FUNC-LABEL: {{^}}test_copy_v4i8_extra_use:
-; GCN: buffer_load_dword
+; GCN: {{buffer|flat}}_load_dword
  ; GCN-DAG: v_lshrrev_b32
  ; GCN: v_and_b32
  ; GCN: v_or_b32
@@ -66,7 +74,9 @@ define amdgpu_kernel void @test_copy_v4i8_x4(<4 x i8> addrspace(1)* %out0, <4 x
  
  ; GCN: s_endpgm
  define amdgpu_kernel void @test_copy_v4i8_extra_use(<4 x i8> addrspace(1)* %out0, <4 x i8> addrspace(1)* %out1, <4 x i8> addrspace(1)* %in) nounwind {
-  %val = load <4 x i8>, <4 x i8> addrspace(1)* %in, align 4
+  %tid.x = call i32 @llvm.amdgcn.workitem.id.x()
+  %gep = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %in, i32 %tid.x
+  %val = load <4 x i8>, <4 x i8> addrspace(1)* %gep, align 4
    %add = add <4 x i8> %val, <i8 9, i8 9, i8 9, i8 9>
    store <4 x i8> %val, <4 x i8> addrspace(1)* %out0, align 4
    store <4 x i8> %add, <4 x i8> addrspace(1)* %out1, align 4
@@ -97,19 +107,21 @@ define amdgpu_kernel void @test_copy_v4i8_x2_extra_use(<4 x i8> addrspace(1)* %o
  }
  
  ; FUNC-LABEL: {{^}}test_copy_v3i8_align4:
-; GCN: buffer_load_dword
+; GCN: {{buffer|flat}}_load_dword
  ; GCN-DAG: buffer_store_short v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
  ; GCN-DAG: buffer_store_byte v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:2{{$}}
  ; GCN: s_endpgm
  define amdgpu_kernel void @test_copy_v3i8_align4(<3 x i8> addrspace(1)* %out, <3 x i8> addrspace(1)* %in) nounwind {
-  %val = load <3 x i8>, <3 x i8> addrspace(1)* %in, align 4
+  %tid.x = call i32 @llvm.amdgcn.workitem.id.x()
+  %gep = getelementptr <3 x i8>, <3 x i8> addrspace(1)* %in, i32 %tid.x
+  %val = load <3 x i8>, <3 x i8> addrspace(1)* %gep, align 4
    store <3 x i8> %val, <3 x i8> addrspace(1)* %out, align 4
    ret void
  }
  
  ; FUNC-LABEL: {{^}}test_copy_v3i8_align2:
-; GCN-DAG: buffer_load_ushort v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
-; GCN-DAG: buffer_load_ubyte v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:2{{$}}
+; GCN-DAG: {{buffer|flat}}_load_ushort v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
+; GCN-DAG: {{buffer|flat}}_load_ubyte v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:2{{$}}
  ; GCN-DAG: buffer_store_short v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
  ; GCN-DAG: buffer_store_byte v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:2{{$}}
  ; GCN: s_endpgm
@@ -120,9 +132,9 @@ define amdgpu_kernel void @test_copy_v3i8_align2(<3 x i8> addrspace(1)* %out, <3
  }
  
  ; FUNC-LABEL: {{^}}test_copy_v3i8_align1:
-; GCN: buffer_load_ubyte
-; GCN: buffer_load_ubyte
-; GCN: buffer_load_ubyte
+; GCN: {{buffer|flat}}_load_ubyte
+; GCN: {{buffer|flat}}_load_ubyte
+; GCN: {{buffer|flat}}_load_ubyte
  
  ; GCN: buffer_store_byte
  ; GCN: buffer_store_byte
@@ -135,10 +147,10 @@ define amdgpu_kernel void @test_copy_v3i8_align1(<3 x i8> addrspace(1)* %out, <3
  }
  
  ; FUNC-LABEL: {{^}}test_copy_v4i8_volatile_load:
-; GCN: buffer_load_ubyte
-; GCN: buffer_load_ubyte
-; GCN: buffer_load_ubyte
-; GCN: buffer_load_ubyte
+; GCN: {{buffer|flat}}_load_ubyte
+; GCN: {{buffer|flat}}_load_ubyte
+; GCN: {{buffer|flat}}_load_ubyte
+; GCN: {{buffer|flat}}_load_ubyte
  ; GCN: buffer_store_dword
  ; GCN: s_endpgm
  define amdgpu_kernel void @test_copy_v4i8_volatile_load(<4 x i8> addrspace(1)* %out, <4 x i8> addrspace(1)* %in) nounwind {
@@ -148,10 +160,10 @@ define amdgpu_kernel void @test_copy_v4i8_volatile_load(<4 x i8> addrspace(1)* %
  }
  
  ; FUNC-LABEL: {{^}}test_copy_v4i8_volatile_store:
-; GCN: buffer_load_ubyte
-; GCN: buffer_load_ubyte
-; GCN: buffer_load_ubyte
-; GCN: buffer_load_ubyte
+; GCN: {{buffer|flat}}_load_ubyte
+; GCN: {{buffer|flat}}_load_ubyte
+; GCN: {{buffer|flat}}_load_ubyte
+; GCN: {{buffer|flat}}_load_ubyte
  ; GCN: buffer_store_byte
  ; GCN: buffer_store_byte
  ; GCN: buffer_store_byte
diff --git a/test/CodeGen/AMDGPU/ctlz.ll b/test/CodeGen/AMDGPU/ctlz.ll

index 149c50685b1db3a02fe7c68a06a3bbc09c4e7df2..3031ee8bbecd4a6d5fbddd4a91ff0afac6a26f24 100644 (file)
--- a/test/CodeGen/AMDGPU/ctlz.ll
+++ b/test/CodeGen/AMDGPU/ctlz.ll
@@ -34,7 +34,7 @@ define amdgpu_kernel void @s_ctlz_i32(i32 addrspace(1)* noalias %out, i32 %val)
  }
  
  ; FUNC-LABEL: {{^}}v_ctlz_i32:
-; GCN: buffer_load_dword [[VAL:v[0-9]+]],
+; GCN: {{buffer|flat}}_load_dword [[VAL:v[0-9]+]],
  ; GCN-DAG: v_ffbh_u32_e32 [[CTLZ:v[0-9]+]], [[VAL]]
  ; GCN-DAG: v_cmp_ne_u32_e32 vcc, 0, [[CTLZ]]
  ; GCN: v_cndmask_b32_e32 [[RESULT:v[0-9]+]], 32, [[CTLZ]], vcc
@@ -44,14 +44,16 @@ define amdgpu_kernel void @s_ctlz_i32(i32 addrspace(1)* noalias %out, i32 %val)
  ; EG: FFBH_UINT
  ; EG: CNDE_INT
  define amdgpu_kernel void @v_ctlz_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind {
-  %val = load i32, i32 addrspace(1)* %valptr, align 4
+  %tid = call i32 @llvm.r600.read.tidig.x()
+  %in.gep = getelementptr i32, i32 addrspace(1)* %valptr, i32 %tid
+  %val = load i32, i32 addrspace(1)* %in.gep, align 4
    %ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 false) nounwind readnone
    store i32 %ctlz, i32 addrspace(1)* %out, align 4
    ret void
  }
  
  ; FUNC-LABEL: {{^}}v_ctlz_v2i32:
-; GCN: buffer_load_dwordx2
+; GCN: {{buffer|flat}}_load_dwordx2
  ; GCN: v_ffbh_u32_e32
  ; GCN: v_ffbh_u32_e32
  ; GCN: buffer_store_dwordx2
@@ -62,14 +64,16 @@ define amdgpu_kernel void @v_ctlz_i32(i32 addrspace(1)* noalias %out, i32 addrsp
  ; EG: FFBH_UINT
  ; EG: CNDE_INT
  define amdgpu_kernel void @v_ctlz_v2i32(<2 x i32> addrspace(1)* noalias %out, <2 x i32> addrspace(1)* noalias %valptr) nounwind {
-  %val = load <2 x i32>, <2 x i32> addrspace(1)* %valptr, align 8
+  %tid = call i32 @llvm.r600.read.tidig.x()
+  %in.gep = getelementptr <2 x i32>, <2 x i32> addrspace(1)* %valptr, i32 %tid
+  %val = load <2 x i32>, <2 x i32> addrspace(1)* %in.gep, align 8
    %ctlz = call <2 x i32> @llvm.ctlz.v2i32(<2 x i32> %val, i1 false) nounwind readnone
    store <2 x i32> %ctlz, <2 x i32> addrspace(1)* %out, align 8
    ret void
  }
  
  ; FUNC-LABEL: {{^}}v_ctlz_v4i32:
-; GCN: buffer_load_dwordx4
+; GCN: {{buffer|flat}}_load_dwordx4
  ; GCN: v_ffbh_u32_e32
  ; GCN: v_ffbh_u32_e32
  ; GCN: v_ffbh_u32_e32
@@ -90,14 +94,16 @@ define amdgpu_kernel void @v_ctlz_v2i32(<2 x i32> addrspace(1)* noalias %out, <2
  ; EG-DAG: FFBH_UINT
  ; EG-DAG: CNDE_INT
  define amdgpu_kernel void @v_ctlz_v4i32(<4 x i32> addrspace(1)* noalias %out, <4 x i32> addrspace(1)* noalias %valptr) nounwind {
-  %val = load <4 x i32>, <4 x i32> addrspace(1)* %valptr, align 16
+  %tid = call i32 @llvm.r600.read.tidig.x()
+  %in.gep = getelementptr <4 x i32>, <4 x i32> addrspace(1)* %valptr, i32 %tid
+  %val = load <4 x i32>, <4 x i32> addrspace(1)* %in.gep, align 16
    %ctlz = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> %val, i1 false) nounwind readnone
    store <4 x i32> %ctlz, <4 x i32> addrspace(1)* %out, align 16
    ret void
  }
  
  ; FUNC-LABEL: {{^}}v_ctlz_i8:
-; GCN: buffer_load_ubyte [[VAL:v[0-9]+]],
+; GCN: {{buffer|flat}}_load_ubyte [[VAL:v[0-9]+]],
  ; SI-DAG: v_ffbh_u32_e32 [[RESULT:v[0-9]+]], [[VAL]]
  ; VI-DAG: v_ffbh_u32_sdwa [[RESULT:v[0-9]+]], [[VAL]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
  ; GCN: buffer_store_byte [[RESULT]],
@@ -168,12 +174,14 @@ define amdgpu_kernel void @v_ctlz_i64_trunc(i32 addrspace(1)* noalias %out, i64
  }
  
  ; FUNC-LABEL: {{^}}v_ctlz_i32_sel_eq_neg1:
-; GCN: buffer_load_dword [[VAL:v[0-9]+]],
+; GCN: {{buffer|flat}}_load_dword [[VAL:v[0-9]+]],
  ; GCN: v_ffbh_u32_e32 [[RESULT:v[0-9]+]], [[VAL]]
  ; GCN: buffer_store_dword [[RESULT]],
  ; GCN: s_endpgm
- define amdgpu_kernel void @v_ctlz_i32_sel_eq_neg1(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind {
-  %val = load i32, i32 addrspace(1)* %valptr
+define amdgpu_kernel void @v_ctlz_i32_sel_eq_neg1(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind {
+  %tid = call i32 @llvm.r600.read.tidig.x()
+  %in.gep = getelementptr i32, i32 addrspace(1)* %valptr, i32 %tid
+  %val = load i32, i32 addrspace(1)* %in.gep
    %ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 false) nounwind readnone
    %cmp = icmp eq i32 %val, 0
    %sel = select i1 %cmp, i32 -1, i32 %ctlz
@@ -182,12 +190,14 @@ define amdgpu_kernel void @v_ctlz_i64_trunc(i32 addrspace(1)* noalias %out, i64
  }
  
  ; FUNC-LABEL: {{^}}v_ctlz_i32_sel_ne_neg1:
-; GCN: buffer_load_dword [[VAL:v[0-9]+]],
+; GCN: {{buffer|flat}}_load_dword [[VAL:v[0-9]+]],
  ; GCN: v_ffbh_u32_e32 [[RESULT:v[0-9]+]], [[VAL]]
  ; GCN: buffer_store_dword [[RESULT]],
  ; GCN: s_endpgm
  define amdgpu_kernel void @v_ctlz_i32_sel_ne_neg1(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind {
-  %val = load i32, i32 addrspace(1)* %valptr
+  %tid = call i32 @llvm.r600.read.tidig.x()
+  %in.gep = getelementptr i32, i32 addrspace(1)* %valptr, i32 %tid
+  %val = load i32, i32 addrspace(1)* %in.gep
    %ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 false) nounwind readnone
    %cmp = icmp ne i32 %val, 0
    %sel = select i1 %cmp, i32 %ctlz, i32 -1
@@ -197,13 +207,15 @@ define amdgpu_kernel void @v_ctlz_i32_sel_ne_neg1(i32 addrspace(1)* noalias %out
  
  ; TODO: Should be able to eliminate select here as well.
  ; FUNC-LABEL: {{^}}v_ctlz_i32_sel_eq_bitwidth:
-; GCN: buffer_load_dword
+; GCN: {{buffer|flat}}_load_dword
  ; GCN: v_ffbh_u32_e32
  ; GCN: v_cmp
  ; GCN: v_cndmask
  ; GCN: s_endpgm
  define amdgpu_kernel void @v_ctlz_i32_sel_eq_bitwidth(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind {
-  %val = load i32, i32 addrspace(1)* %valptr
+  %tid = call i32 @llvm.r600.read.tidig.x()
+  %in.gep = getelementptr i32, i32 addrspace(1)* %valptr, i32 %tid
+  %val = load i32, i32 addrspace(1)* %in.gep
    %ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 false) nounwind readnone
    %cmp = icmp eq i32 %ctlz, 32
    %sel = select i1 %cmp, i32 -1, i32 %ctlz
@@ -212,13 +224,15 @@ define amdgpu_kernel void @v_ctlz_i32_sel_eq_bitwidth(i32 addrspace(1)* noalias
  }
  
  ; FUNC-LABEL: {{^}}v_ctlz_i32_sel_ne_bitwidth:
-; GCN: buffer_load_dword
+; GCN: {{buffer|flat}}_load_dword
  ; GCN: v_ffbh_u32_e32
  ; GCN: v_cmp
  ; GCN: v_cndmask
  ; GCN: s_endpgm
  define amdgpu_kernel void @v_ctlz_i32_sel_ne_bitwidth(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind {
-  %val = load i32, i32 addrspace(1)* %valptr
+  %tid = call i32 @llvm.r600.read.tidig.x()
+  %in.gep = getelementptr i32, i32 addrspace(1)* %valptr, i32 %tid
+  %val = load i32, i32 addrspace(1)* %in.gep
    %ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 false) nounwind readnone
    %cmp = icmp ne i32 %ctlz, 32
    %sel = select i1 %cmp, i32 %ctlz, i32 -1
@@ -242,7 +256,7 @@ define amdgpu_kernel void @v_ctlz_i32_sel_ne_bitwidth(i32 addrspace(1)* noalias
  }
  
  ; FUNC-LABEL: {{^}}v_ctlz_i16_sel_eq_neg1:
-; SI: buffer_load_ushort [[VAL:v[0-9]+]],
+; SI: {{buffer|flat}}_load_ushort [[VAL:v[0-9]+]],
  ; SI: v_ffbh_u32_e32 [[FFBH:v[0-9]+]], [[VAL]]
  ; SI: buffer_store_short [[FFBH]],
   define amdgpu_kernel void @v_ctlz_i16_sel_eq_neg1(i16 addrspace(1)* noalias %out, i16 addrspace(1)* noalias %valptr) nounwind {
diff --git a/test/CodeGen/AMDGPU/ctlz_zero_undef.ll b/test/CodeGen/AMDGPU/ctlz_zero_undef.ll

index 48f3e4401f1a8b0136f78378ca80eb8e9f34f9af..c99ce8659e77875f0639a55ca39b65f2115f7619 100644 (file)
--- a/test/CodeGen/AMDGPU/ctlz_zero_undef.ll
+++ b/test/CodeGen/AMDGPU/ctlz_zero_undef.ll
@@ -29,21 +29,23 @@ define amdgpu_kernel void @s_ctlz_zero_undef_i32(i32 addrspace(1)* noalias %out,
  }
  
  ; FUNC-LABEL: {{^}}v_ctlz_zero_undef_i32:
-; GCN: buffer_load_dword [[VAL:v[0-9]+]],
+; GCN: {{buffer|flat}}_load_dword [[VAL:v[0-9]+]],
  ; GCN: v_ffbh_u32_e32 [[RESULT:v[0-9]+]], [[VAL]]
  ; GCN: buffer_store_dword [[RESULT]],
  ; GCN: s_endpgm
  ; EG: MEM_RAT_CACHELESS STORE_RAW [[RESULT:T[0-9]+\.[XYZW]]]
  ; EG: FFBH_UINT {{\*? *}}[[RESULT]]
  define amdgpu_kernel void @v_ctlz_zero_undef_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind {
-  %val = load i32, i32 addrspace(1)* %valptr, align 4
+  %tid = call i32 @llvm.r600.read.tidig.x()
+  %in.gep = getelementptr i32, i32 addrspace(1)* %valptr, i32 %tid
+  %val = load i32, i32 addrspace(1)* %in.gep, align 4
    %ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 true) nounwind readnone
    store i32 %ctlz, i32 addrspace(1)* %out, align 4
    ret void
  }
  
  ; FUNC-LABEL: {{^}}v_ctlz_zero_undef_v2i32:
-; GCN: buffer_load_dwordx2
+; GCN: {{buffer|flat}}_load_dwordx2
  ; GCN: v_ffbh_u32_e32
  ; GCN: v_ffbh_u32_e32
  ; GCN: buffer_store_dwordx2
@@ -52,14 +54,16 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i32(i32 addrspace(1)* noalias %out,
  ; EG: FFBH_UINT {{\*? *}}[[RESULT]]
  ; EG: FFBH_UINT {{\*? *}}[[RESULT]]
  define amdgpu_kernel void @v_ctlz_zero_undef_v2i32(<2 x i32> addrspace(1)* noalias %out, <2 x i32> addrspace(1)* noalias %valptr) nounwind {
-  %val = load <2 x i32>, <2 x i32> addrspace(1)* %valptr, align 8
+  %tid = call i32 @llvm.r600.read.tidig.x()
+  %in.gep = getelementptr <2 x i32>, <2 x i32> addrspace(1)* %valptr, i32 %tid
+  %val = load <2 x i32>, <2 x i32> addrspace(1)* %in.gep, align 8
    %ctlz = call <2 x i32> @llvm.ctlz.v2i32(<2 x i32> %val, i1 true) nounwind readnone
    store <2 x i32> %ctlz, <2 x i32> addrspace(1)* %out, align 8
    ret void
  }
  
  ; FUNC-LABEL: {{^}}v_ctlz_zero_undef_v4i32:
-; GCN: buffer_load_dwordx4
+; GCN: {{buffer|flat}}_load_dwordx4
  ; GCN: v_ffbh_u32_e32
  ; GCN: v_ffbh_u32_e32
  ; GCN: v_ffbh_u32_e32
@@ -72,18 +76,22 @@ define amdgpu_kernel void @v_ctlz_zero_undef_v2i32(<2 x i32> addrspace(1)* noali
  ; EG: FFBH_UINT {{\*? *}}[[RESULT]]
  ; EG: FFBH_UINT {{\*? *}}[[RESULT]]
  define amdgpu_kernel void @v_ctlz_zero_undef_v4i32(<4 x i32> addrspace(1)* noalias %out, <4 x i32> addrspace(1)* noalias %valptr) nounwind {
-  %val = load <4 x i32>, <4 x i32> addrspace(1)* %valptr, align 16
+  %tid = call i32 @llvm.r600.read.tidig.x()
+  %in.gep = getelementptr <4 x i32>, <4 x i32> addrspace(1)* %valptr, i32 %tid
+  %val = load <4 x i32>, <4 x i32> addrspace(1)* %in.gep, align 16
    %ctlz = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> %val, i1 true) nounwind readnone
    store <4 x i32> %ctlz, <4 x i32> addrspace(1)* %out, align 16
    ret void
  }
  
  ; FUNC-LABEL: {{^}}v_ctlz_zero_undef_i8:
-; GCN: buffer_load_ubyte [[VAL:v[0-9]+]],
+; GCN: {{buffer|flat}}_load_ubyte [[VAL:v[0-9]+]],
  ; GCN: v_ffbh_u32_e32 [[RESULT:v[0-9]+]], [[VAL]]
  ; GCN: buffer_store_byte [[RESULT]],
  define amdgpu_kernel void @v_ctlz_zero_undef_i8(i8 addrspace(1)* noalias %out, i8 addrspace(1)* noalias %valptr) nounwind {
-  %val = load i8, i8 addrspace(1)* %valptr
+  %tid = call i32 @llvm.r600.read.tidig.x()
+  %in.gep = getelementptr i8, i8 addrspace(1)* %valptr, i32 %tid
+  %val = load i8, i8 addrspace(1)* %in.gep
    %ctlz = call i8 @llvm.ctlz.i8(i8 %val, i1 true) nounwind readnone
    store i8 %ctlz, i8 addrspace(1)* %out
    ret void
@@ -145,11 +153,13 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i64_trunc(i32 addrspace(1)* noalias
  }
  
  ; FUNC-LABEL: {{^}}v_ctlz_zero_undef_i32_sel_eq_neg1:
-; GCN: buffer_load_dword [[VAL:v[0-9]+]],
+; GCN: {{buffer|flat}}_load_dword [[VAL:v[0-9]+]],
  ; GCN: v_ffbh_u32_e32 [[RESULT:v[0-9]+]], [[VAL]]
  ; GCN: buffer_store_dword [[RESULT]],
- define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_eq_neg1(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind {
-  %val = load i32, i32 addrspace(1)* %valptr
+define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_eq_neg1(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind {
+  %tid = call i32 @llvm.r600.read.tidig.x()
+  %in.gep = getelementptr i32, i32 addrspace(1)* %valptr, i32 %tid
+  %val = load i32, i32 addrspace(1)* %in.gep
    %ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 true) nounwind readnone
    %cmp = icmp eq i32 %val, 0
    %sel = select i1 %cmp, i32 -1, i32 %ctlz
@@ -158,11 +168,13 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i64_trunc(i32 addrspace(1)* noalias
  }
  
  ; FUNC-LABEL: {{^}}v_ctlz_zero_undef_i32_sel_ne_neg1:
-; GCN: buffer_load_dword [[VAL:v[0-9]+]],
+; GCN: {{buffer|flat}}_load_dword [[VAL:v[0-9]+]],
  ; GCN: v_ffbh_u32_e32 [[RESULT:v[0-9]+]], [[VAL]]
  ; GCN: buffer_store_dword [[RESULT]],
  define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_ne_neg1(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind {
-  %val = load i32, i32 addrspace(1)* %valptr
+  %tid = call i32 @llvm.r600.read.tidig.x()
+  %in.gep = getelementptr i32, i32 addrspace(1)* %valptr, i32 %tid
+  %val = load i32, i32 addrspace(1)* %in.gep
    %ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 true) nounwind readnone
    %cmp = icmp ne i32 %val, 0
    %sel = select i1 %cmp, i32 %ctlz, i32 -1
@@ -186,15 +198,17 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i8_sel_eq_neg1(i8 addrspace(1)* noa
  }
  
  ; FUNC-LABEL: {{^}}v_ctlz_zero_undef_i32_sel_eq_neg1_two_use:
-; GCN: buffer_load_dword [[VAL:v[0-9]+]],
+; GCN: {{buffer|flat}}_load_dword [[VAL:v[0-9]+]],
  ; GCN-DAG: v_ffbh_u32_e32 [[RESULT0:v[0-9]+]], [[VAL]]
  ; GCN-DAG: v_cmp_eq_u32_e32 vcc, 0, [[VAL]]
  ; GCN-DAG: v_cndmask_b32_e64 [[RESULT1:v[0-9]+]], 0, 1, vcc
  ; GCN-DAG: buffer_store_dword [[RESULT0]]
  ; GCN-DAG: buffer_store_byte [[RESULT1]]
  ; GCN: s_endpgm
- define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_eq_neg1_two_use(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind {
-  %val = load i32, i32 addrspace(1)* %valptr
+define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_eq_neg1_two_use(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind {
+  %tid = call i32 @llvm.r600.read.tidig.x()
+  %in.gep = getelementptr i32, i32 addrspace(1)* %valptr, i32 %tid
+  %val = load i32, i32 addrspace(1)* %in.gep
    %ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 true) nounwind readnone
    %cmp = icmp eq i32 %val, 0
    %sel = select i1 %cmp, i32 -1, i32 %ctlz
@@ -205,13 +219,15 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i8_sel_eq_neg1(i8 addrspace(1)* noa
  
  ; Selected on wrong constant
  ; FUNC-LABEL: {{^}}v_ctlz_zero_undef_i32_sel_eq_0:
-; GCN: buffer_load_dword
+; GCN: {{buffer|flat}}_load_dword
  ; GCN: v_ffbh_u32_e32
  ; GCN: v_cmp
  ; GCN: v_cndmask
  ; GCN: buffer_store_dword
- define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_eq_0(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind {
-  %val = load i32, i32 addrspace(1)* %valptr
+define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_eq_0(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind {
+  %tid = call i32 @llvm.r600.read.tidig.x()
+  %in.gep = getelementptr i32, i32 addrspace(1)* %valptr, i32 %tid
+  %val = load i32, i32 addrspace(1)* %in.gep
    %ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 true) nounwind readnone
    %cmp = icmp eq i32 %val, 0
    %sel = select i1 %cmp, i32 0, i32 %ctlz
@@ -221,13 +237,15 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i8_sel_eq_neg1(i8 addrspace(1)* noa
  
  ; Selected on wrong constant
  ; FUNC-LABEL: {{^}}v_ctlz_zero_undef_i32_sel_ne_0:
-; GCN: buffer_load_dword
+; GCN: {{buffer|flat}}_load_dword
  ; GCN: v_ffbh_u32_e32
  ; GCN: v_cmp
  ; GCN: v_cndmask
  ; GCN: buffer_store_dword
  define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_ne_0(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind {
-  %val = load i32, i32 addrspace(1)* %valptr
+  %tid = call i32 @llvm.r600.read.tidig.x()
+  %in.gep = getelementptr i32, i32 addrspace(1)* %valptr, i32 %tid
+  %val = load i32, i32 addrspace(1)* %in.gep
    %ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 true) nounwind readnone
    %cmp = icmp ne i32 %val, 0
    %sel = select i1 %cmp, i32 %ctlz, i32 0
@@ -237,13 +255,15 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_ne_0(i32 addrspace(1)* noal
  
  ; Compare on wrong constant
  ; FUNC-LABEL: {{^}}v_ctlz_zero_undef_i32_sel_eq_cmp_non0:
-; GCN: buffer_load_dword
+; GCN: {{buffer|flat}}_load_dword
  ; GCN: v_ffbh_u32_e32
  ; GCN: v_cmp
  ; GCN: v_cndmask
  ; GCN: buffer_store_dword
- define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_eq_cmp_non0(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind {
-  %val = load i32, i32 addrspace(1)* %valptr
+define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_eq_cmp_non0(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind {
+  %tid = call i32 @llvm.r600.read.tidig.x()
+  %in.gep = getelementptr i32, i32 addrspace(1)* %valptr, i32 %tid
+  %val = load i32, i32 addrspace(1)* %in.gep
    %ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 true) nounwind readnone
    %cmp = icmp eq i32 %val, 1
    %sel = select i1 %cmp, i32 0, i32 %ctlz
@@ -253,13 +273,15 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_ne_0(i32 addrspace(1)* noal
  
  ; Selected on wrong constant
  ; FUNC-LABEL: {{^}}v_ctlz_zero_undef_i32_sel_ne_cmp_non0:
-; GCN: buffer_load_dword
+; GCN: {{buffer|flat}}_load_dword
  ; GCN: v_ffbh_u32_e32
  ; GCN: v_cmp
  ; GCN: v_cndmask
  ; GCN: buffer_store_dword
  define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_ne_cmp_non0(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind {
-  %val = load i32, i32 addrspace(1)* %valptr
+  %tid = call i32 @llvm.r600.read.tidig.x()
+  %in.gep = getelementptr i32, i32 addrspace(1)* %valptr, i32 %tid
+  %val = load i32, i32 addrspace(1)* %in.gep
    %ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 true) nounwind readnone
    %cmp = icmp ne i32 %val, 1
    %sel = select i1 %cmp, i32 %ctlz, i32 0
diff --git a/test/CodeGen/AMDGPU/ctpop.ll b/test/CodeGen/AMDGPU/ctpop.ll

index aa913ad406d2bc9af48b13859d38257b7cc6c117..68b39bad2bc12c7f4de5ec4a17be5d8340460713 100644 (file)
--- a/test/CodeGen/AMDGPU/ctpop.ll
+++ b/test/CodeGen/AMDGPU/ctpop.ll
@@ -8,6 +8,8 @@ declare <4 x i32> @llvm.ctpop.v4i32(<4 x i32>) nounwind readnone
  declare <8 x i32> @llvm.ctpop.v8i32(<8 x i32>) nounwind readnone
  declare <16 x i32> @llvm.ctpop.v16i32(<16 x i32>) nounwind readnone
  
+declare i32 @llvm.r600.read.tidig.x() nounwind readnone
+
  ; FUNC-LABEL: {{^}}s_ctpop_i32:
  ; GCN: s_load_dword [[SVAL:s[0-9]+]],
  ; GCN: s_bcnt1_i32_b32 [[SRESULT:s[0-9]+]], [[SVAL]]
@@ -24,22 +26,24 @@ define amdgpu_kernel void @s_ctpop_i32(i32 addrspace(1)* noalias %out, i32 %val)
  
  ; XXX - Why 0 in register?
  ; FUNC-LABEL: {{^}}v_ctpop_i32:
-; GCN: buffer_load_dword [[VAL:v[0-9]+]],
+; GCN: {{buffer|flat}}_load_dword [[VAL:v[0-9]+]],
  ; GCN: v_bcnt_u32_b32{{(_e64)*}} [[RESULT:v[0-9]+]], [[VAL]], 0
  ; GCN: buffer_store_dword [[RESULT]],
  ; GCN: s_endpgm
  
  ; EG: BCNT_INT
  define amdgpu_kernel void @v_ctpop_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind {
-  %val = load i32, i32 addrspace(1)* %in, align 4
+  %tid = call i32 @llvm.r600.read.tidig.x()
+  %in.gep = getelementptr i32, i32 addrspace(1)* %in, i32 %tid
+  %val = load i32, i32 addrspace(1)* %in.gep, align 4
    %ctpop = call i32 @llvm.ctpop.i32(i32 %val) nounwind readnone
    store i32 %ctpop, i32 addrspace(1)* %out, align 4
    ret void
  }
  
  ; FUNC-LABEL: {{^}}v_ctpop_add_chain_i32:
-; GCN: buffer_load_dword [[VAL1:v[0-9]+]],
-; GCN: buffer_load_dword [[VAL0:v[0-9]+]],
+; GCN: {{buffer|flat}}_load_dword [[VAL0:v[0-9]+]],
+; GCN: {{buffer|flat}}_load_dword [[VAL1:v[0-9]+]],
  ; GCN: v_bcnt_u32_b32{{(_e64)*}} [[MIDRESULT:v[0-9]+]], [[VAL1]], 0
  ; SI: v_bcnt_u32_b32_e32 [[RESULT:v[0-9]+]], [[VAL0]], [[MIDRESULT]]
  ; VI: v_bcnt_u32_b32 [[RESULT:v[0-9]+]], [[VAL0]], [[MIDRESULT]]
@@ -49,8 +53,11 @@ define amdgpu_kernel void @v_ctpop_i32(i32 addrspace(1)* noalias %out, i32 addrs
  ; EG: BCNT_INT
  ; EG: BCNT_INT
  define amdgpu_kernel void @v_ctpop_add_chain_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in0, i32 addrspace(1)* noalias %in1) nounwind {
-  %val0 = load i32, i32 addrspace(1)* %in0, align 4
-  %val1 = load i32, i32 addrspace(1)* %in1, align 4
+  %tid = call i32 @llvm.r600.read.tidig.x()
+  %in0.gep = getelementptr i32, i32 addrspace(1)* %in0, i32 %tid
+  %in1.gep = getelementptr i32, i32 addrspace(1)* %in1, i32 %tid
+  %val0 = load i32, i32 addrspace(1)* %in0.gep, align 4
+  %val1 = load i32, i32 addrspace(1)* %in1.gep, align 4
    %ctpop0 = call i32 @llvm.ctpop.i32(i32 %val0) nounwind readnone
    %ctpop1 = call i32 @llvm.ctpop.i32(i32 %val1) nounwind readnone
    %add = add i32 %ctpop0, %ctpop1
@@ -59,15 +66,17 @@ define amdgpu_kernel void @v_ctpop_add_chain_i32(i32 addrspace(1)* noalias %out,
  }
  
  ; FUNC-LABEL: {{^}}v_ctpop_add_sgpr_i32:
-; GCN: buffer_load_dword [[VAL0:v[0-9]+]],
+; GCN: {{buffer|flat}}_load_dword [[VAL0:v[0-9]+]],
  ; GCN: s_waitcnt
  ; GCN-NEXT: v_bcnt_u32_b32{{(_e64)*}} [[RESULT:v[0-9]+]], [[VAL0]], s{{[0-9]+}}
  ; GCN: buffer_store_dword [[RESULT]],
  ; GCN: s_endpgm
-define amdgpu_kernel void @v_ctpop_add_sgpr_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in0, i32 addrspace(1)* noalias %in1, i32 %sval) nounwind {
-  %val0 = load i32, i32 addrspace(1)* %in0, align 4
-  %ctpop0 = call i32 @llvm.ctpop.i32(i32 %val0) nounwind readnone
-  %add = add i32 %ctpop0, %sval
+define amdgpu_kernel void @v_ctpop_add_sgpr_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in, i32 %sval) nounwind {
+  %tid = call i32 @llvm.r600.read.tidig.x()
+  %in.gep = getelementptr i32, i32 addrspace(1)* %in, i32 %tid
+  %val = load i32, i32 addrspace(1)* %in.gep, align 4
+  %ctpop = call i32 @llvm.ctpop.i32(i32 %val) nounwind readnone
+  %add = add i32 %ctpop, %sval
    store i32 %add, i32 addrspace(1)* %out, align 4
    ret void
  }
@@ -80,7 +89,9 @@ define amdgpu_kernel void @v_ctpop_add_sgpr_i32(i32 addrspace(1)* noalias %out,
  ; EG: BCNT_INT
  ; EG: BCNT_INT
  define amdgpu_kernel void @v_ctpop_v2i32(<2 x i32> addrspace(1)* noalias %out, <2 x i32> addrspace(1)* noalias %in) nounwind {
-  %val = load <2 x i32>, <2 x i32> addrspace(1)* %in, align 8
+  %tid = call i32 @llvm.r600.read.tidig.x()
+  %in.gep = getelementptr <2 x i32>, <2 x i32> addrspace(1)* %in, i32 %tid
+  %val = load <2 x i32>, <2 x i32> addrspace(1)* %in.gep, align 8
    %ctpop = call <2 x i32> @llvm.ctpop.v2i32(<2 x i32> %val) nounwind readnone
    store <2 x i32> %ctpop, <2 x i32> addrspace(1)* %out, align 8
    ret void
@@ -98,7 +109,9 @@ define amdgpu_kernel void @v_ctpop_v2i32(<2 x i32> addrspace(1)* noalias %out, <
  ; EG: BCNT_INT
  ; EG: BCNT_INT
  define amdgpu_kernel void @v_ctpop_v4i32(<4 x i32> addrspace(1)* noalias %out, <4 x i32> addrspace(1)* noalias %in) nounwind {
-  %val = load <4 x i32>, <4 x i32> addrspace(1)* %in, align 16
+  %tid = call i32 @llvm.r600.read.tidig.x()
+  %in.gep = getelementptr <4 x i32>, <4 x i32> addrspace(1)* %in, i32 %tid
+  %val = load <4 x i32>, <4 x i32> addrspace(1)* %in.gep, align 16
    %ctpop = call <4 x i32> @llvm.ctpop.v4i32(<4 x i32> %val) nounwind readnone
    store <4 x i32> %ctpop, <4 x i32> addrspace(1)* %out, align 16
    ret void
@@ -124,7 +137,9 @@ define amdgpu_kernel void @v_ctpop_v4i32(<4 x i32> addrspace(1)* noalias %out, <
  ; EG: BCNT_INT
  ; EG: BCNT_INT
  define amdgpu_kernel void @v_ctpop_v8i32(<8 x i32> addrspace(1)* noalias %out, <8 x i32> addrspace(1)* noalias %in) nounwind {
-  %val = load <8 x i32>, <8 x i32> addrspace(1)* %in, align 32
+  %tid = call i32 @llvm.r600.read.tidig.x()
+  %in.gep = getelementptr <8 x i32>, <8 x i32> addrspace(1)* %in, i32 %tid
+  %val = load <8 x i32>, <8 x i32> addrspace(1)* %in.gep, align 32
    %ctpop = call <8 x i32> @llvm.ctpop.v8i32(<8 x i32> %val) nounwind readnone
    store <8 x i32> %ctpop, <8 x i32> addrspace(1)* %out, align 32
    ret void
@@ -166,21 +181,25 @@ define amdgpu_kernel void @v_ctpop_v8i32(<8 x i32> addrspace(1)* noalias %out, <
  ; EG: BCNT_INT
  ; EG: BCNT_INT
  define amdgpu_kernel void @v_ctpop_v16i32(<16 x i32> addrspace(1)* noalias %out, <16 x i32> addrspace(1)* noalias %in) nounwind {
-  %val = load <16 x i32>, <16 x i32> addrspace(1)* %in, align 32
+  %tid = call i32 @llvm.r600.read.tidig.x()
+  %in.gep = getelementptr <16 x i32>, <16 x i32> addrspace(1)* %in, i32 %tid
+  %val = load <16 x i32>, <16 x i32> addrspace(1)* %in.gep, align 32
    %ctpop = call <16 x i32> @llvm.ctpop.v16i32(<16 x i32> %val) nounwind readnone
    store <16 x i32> %ctpop, <16 x i32> addrspace(1)* %out, align 32
    ret void
  }
  
  ; FUNC-LABEL: {{^}}v_ctpop_i32_add_inline_constant:
-; GCN: buffer_load_dword [[VAL:v[0-9]+]],
+; GCN: {{buffer|flat}}_load_dword [[VAL:v[0-9]+]],
  ; GCN: v_bcnt_u32_b32{{(_e64)*}} [[RESULT:v[0-9]+]], [[VAL]], 4
  ; GCN: buffer_store_dword [[RESULT]],
  ; GCN: s_endpgm
  
  ; EG: BCNT_INT
  define amdgpu_kernel void @v_ctpop_i32_add_inline_constant(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind {
-  %val = load i32, i32 addrspace(1)* %in, align 4
+  %tid = call i32 @llvm.r600.read.tidig.x()
+  %in.gep = getelementptr i32, i32 addrspace(1)* %in, i32 %tid
+  %val = load i32, i32 addrspace(1)* %in.gep, align 4
    %ctpop = call i32 @llvm.ctpop.i32(i32 %val) nounwind readnone
    %add = add i32 %ctpop, 4
    store i32 %add, i32 addrspace(1)* %out, align 4
@@ -188,14 +207,16 @@ define amdgpu_kernel void @v_ctpop_i32_add_inline_constant(i32 addrspace(1)* noa
  }
  
  ; FUNC-LABEL: {{^}}v_ctpop_i32_add_inline_constant_inv:
-; GCN: buffer_load_dword [[VAL:v[0-9]+]],
+; GCN: {{buffer|flat}}_load_dword [[VAL:v[0-9]+]],
  ; GCN: v_bcnt_u32_b32{{(_e64)*}} [[RESULT:v[0-9]+]], [[VAL]], 4
  ; GCN: buffer_store_dword [[RESULT]],
  ; GCN: s_endpgm
  
  ; EG: BCNT_INT
  define amdgpu_kernel void @v_ctpop_i32_add_inline_constant_inv(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind {
-  %val = load i32, i32 addrspace(1)* %in, align 4
+  %tid = call i32 @llvm.r600.read.tidig.x()
+  %in.gep = getelementptr i32, i32 addrspace(1)* %in, i32 %tid
+  %val = load i32, i32 addrspace(1)* %in.gep, align 4
    %ctpop = call i32 @llvm.ctpop.i32(i32 %val) nounwind readnone
    %add = add i32 4, %ctpop
    store i32 %add, i32 addrspace(1)* %out, align 4
@@ -203,14 +224,16 @@ define amdgpu_kernel void @v_ctpop_i32_add_inline_constant_inv(i32 addrspace(1)*
  }
  
  ; FUNC-LABEL: {{^}}v_ctpop_i32_add_literal:
-; GCN-DAG: buffer_load_dword [[VAL:v[0-9]+]],
+; GCN-DAG: {{buffer|flat}}_load_dword [[VAL:v[0-9]+]],
  ; GCN-DAG: v_mov_b32_e32 [[LIT:v[0-9]+]], 0x1869f
  ; SI: v_bcnt_u32_b32_e32 [[RESULT:v[0-9]+]], [[VAL]], [[LIT]]
  ; VI: v_bcnt_u32_b32 [[RESULT:v[0-9]+]], [[VAL]], [[LIT]]
  ; GCN: buffer_store_dword [[RESULT]],
  ; GCN: s_endpgm
  define amdgpu_kernel void @v_ctpop_i32_add_literal(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind {
-  %val = load i32, i32 addrspace(1)* %in, align 4
+  %tid = call i32 @llvm.r600.read.tidig.x()
+  %in.gep = getelementptr i32, i32 addrspace(1)* %in, i32 %tid
+  %val = load i32, i32 addrspace(1)* %in.gep, align 4
    %ctpop = call i32 @llvm.ctpop.i32(i32 %val) nounwind readnone
    %add = add i32 %ctpop, 99999
    store i32 %add, i32 addrspace(1)* %out, align 4
@@ -218,7 +241,7 @@ define amdgpu_kernel void @v_ctpop_i32_add_literal(i32 addrspace(1)* noalias %ou
  }
  
  ; FUNC-LABEL: {{^}}v_ctpop_i32_add_var:
-; GCN-DAG: buffer_load_dword [[VAL:v[0-9]+]],
+; GCN-DAG: {{buffer|flat}}_load_dword [[VAL:v[0-9]+]],
  ; GCN-DAG: s_load_dword [[VAR:s[0-9]+]],
  ; GCN: v_bcnt_u32_b32{{(_e64)*}} [[RESULT:v[0-9]+]], [[VAL]], [[VAR]]
  ; GCN: buffer_store_dword [[RESULT]],
@@ -226,7 +249,9 @@ define amdgpu_kernel void @v_ctpop_i32_add_literal(i32 addrspace(1)* noalias %ou
  
  ; EG: BCNT_INT
  define amdgpu_kernel void @v_ctpop_i32_add_var(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in, i32 %const) nounwind {
-  %val = load i32, i32 addrspace(1)* %in, align 4
+  %tid = call i32 @llvm.r600.read.tidig.x()
+  %in.gep = getelementptr i32, i32 addrspace(1)* %in, i32 %tid
+  %val = load i32, i32 addrspace(1)* %in.gep, align 4
    %ctpop = call i32 @llvm.ctpop.i32(i32 %val) nounwind readnone
    %add = add i32 %ctpop, %const
    store i32 %add, i32 addrspace(1)* %out, align 4
@@ -234,7 +259,7 @@ define amdgpu_kernel void @v_ctpop_i32_add_var(i32 addrspace(1)* noalias %out, i
  }
  
  ; FUNC-LABEL: {{^}}v_ctpop_i32_add_var_inv:
-; GCN-DAG: buffer_load_dword [[VAL:v[0-9]+]],
+; GCN-DAG: {{buffer|flat}}_load_dword [[VAL:v[0-9]+]],
  ; GCN-DAG: s_load_dword [[VAR:s[0-9]+]],
  ; GCN: v_bcnt_u32_b32{{(_e64)*}} [[RESULT:v[0-9]+]], [[VAL]], [[VAR]]
  ; GCN: buffer_store_dword [[RESULT]],
@@ -242,7 +267,9 @@ define amdgpu_kernel void @v_ctpop_i32_add_var(i32 addrspace(1)* noalias %out, i
  
  ; EG: BCNT_INT
  define amdgpu_kernel void @v_ctpop_i32_add_var_inv(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in, i32 %const) nounwind {
-  %val = load i32, i32 addrspace(1)* %in, align 4
+  %tid = call i32 @llvm.r600.read.tidig.x()
+  %in.gep = getelementptr i32, i32 addrspace(1)* %in, i32 %tid
+  %val = load i32, i32 addrspace(1)* %in.gep, align 4
    %ctpop = call i32 @llvm.ctpop.i32(i32 %val) nounwind readnone
    %add = add i32 %const, %ctpop
    store i32 %add, i32 addrspace(1)* %out, align 4
@@ -250,18 +277,22 @@ define amdgpu_kernel void @v_ctpop_i32_add_var_inv(i32 addrspace(1)* noalias %ou
  }
  
  ; FUNC-LABEL: {{^}}v_ctpop_i32_add_vvar_inv:
-; GCN-DAG: buffer_load_dword [[VAL:v[0-9]+]], off, s[{{[0-9]+:[0-9]+}}], {{0$}}
-; GCN-DAG: buffer_load_dword [[VAR:v[0-9]+]], off, s[{{[0-9]+:[0-9]+}}], 0 offset:16
-; SI: v_bcnt_u32_b32_e32 [[RESULT:v[0-9]+]], [[VAL]], [[VAR]]
+; SI: buffer_load_dword [[VAR:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], 0 addr64
+; SI: buffer_load_dword [[VAL:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], 0 addr64
+; SI: v_bcnt_u32_b32_e32 [[RESULT:v[0-9]+]], [[VAR]], [[VAL]]
+; VI: flat_load_dword [[VAL:v[0-9]+]], v[{{[0-9]+:[0-9]+}}]
+; VI: flat_load_dword [[VAR:v[0-9]+]], v[{{[0-9]+:[0-9]+}}]
  ; VI: v_bcnt_u32_b32 [[RESULT:v[0-9]+]], [[VAL]], [[VAR]]
  ; GCN: buffer_store_dword [[RESULT]],
  ; GCN: s_endpgm
  
  ; EG: BCNT_INT
  define amdgpu_kernel void @v_ctpop_i32_add_vvar_inv(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in, i32 addrspace(1)* noalias %constptr) nounwind {
-  %val = load i32, i32 addrspace(1)* %in, align 4
+  %tid = call i32 @llvm.r600.read.tidig.x()
+  %in.gep = getelementptr i32, i32 addrspace(1)* %in, i32 %tid
+  %val = load i32, i32 addrspace(1)* %in.gep, align 4
    %ctpop = call i32 @llvm.ctpop.i32(i32 %val) nounwind readnone
-  %gep = getelementptr i32, i32 addrspace(1)* %constptr, i32 4
+  %gep = getelementptr i32, i32 addrspace(1)* %constptr, i32 %tid
    %const = load i32, i32 addrspace(1)* %gep, align 4
    %add = add i32 %const, %ctpop
    store i32 %add, i32 addrspace(1)* %out, align 4
diff --git a/test/CodeGen/AMDGPU/ctpop64.ll b/test/CodeGen/AMDGPU/ctpop64.ll

index f18bd9fd8174b1c5fdc9e1d38397b1683932dab6..4850370851f63cd16ec035f2d03f28a0d373a24d 100644 (file)
--- a/test/CodeGen/AMDGPU/ctpop64.ll
+++ b/test/CodeGen/AMDGPU/ctpop64.ll
@@ -1,6 +1,8 @@
  ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=FUNC %s
  ; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=GCN -check-prefix=FUNC %s
  
+declare i32 @llvm.r600.read.tidig.x() nounwind readnone
+
  declare i64 @llvm.ctpop.i64(i64) nounwind readnone
  declare <2 x i64> @llvm.ctpop.v2i64(<2 x i64>) nounwind readnone
  declare <4 x i64> @llvm.ctpop.v4i64(<4 x i64>) nounwind readnone
@@ -25,14 +27,16 @@ define amdgpu_kernel void @s_ctpop_i64(i32 addrspace(1)* noalias %out, i64 %val)
  }
  
  ; FUNC-LABEL: {{^}}v_ctpop_i64:
-; GCN: buffer_load_dwordx2 v{{\[}}[[LOVAL:[0-9]+]]:[[HIVAL:[0-9]+]]{{\]}},
+; GCN: {{buffer|flat}}_load_dwordx2 v{{\[}}[[LOVAL:[0-9]+]]:[[HIVAL:[0-9]+]]{{\]}},
  ; GCN: v_bcnt_u32_b32{{(_e64)*}} [[MIDRESULT:v[0-9]+]], v[[LOVAL]], 0
  ; SI-NEXT: v_bcnt_u32_b32_e32 [[RESULT:v[0-9]+]], v[[HIVAL]], [[MIDRESULT]]
  ; VI-NEXT: v_bcnt_u32_b32 [[RESULT:v[0-9]+]], v[[HIVAL]], [[MIDRESULT]]
  ; GCN: buffer_store_dword [[RESULT]],
  ; GCN: s_endpgm
  define amdgpu_kernel void @v_ctpop_i64(i32 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in) nounwind {
-  %val = load i64, i64 addrspace(1)* %in, align 8
+  %tid = call i32 @llvm.r600.read.tidig.x()
+  %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %tid
+  %val = load i64, i64 addrspace(1)* %in.gep, align 8
    %ctpop = call i64 @llvm.ctpop.i64(i64 %val) nounwind readnone
    %truncctpop = trunc i64 %ctpop to i32
    store i32 %truncctpop, i32 addrspace(1)* %out, align 4
@@ -40,7 +44,7 @@ define amdgpu_kernel void @v_ctpop_i64(i32 addrspace(1)* noalias %out, i64 addrs
  }
  
  ; FUNC-LABEL: {{^}}v_ctpop_i64_user:
-; GCN: buffer_load_dwordx2 v{{\[}}[[LOVAL:[0-9]+]]:[[HIVAL:[0-9]+]]{{\]}},
+; GCN: {{buffer|flat}}_load_dwordx2 v{{\[}}[[LOVAL:[0-9]+]]:[[HIVAL:[0-9]+]]{{\]}},
  ; GCN: v_bcnt_u32_b32{{(_e64)*}} [[MIDRESULT:v[0-9]+]], v[[LOVAL]], 0
  ; SI-NEXT: v_bcnt_u32_b32_e32 [[RESULT:v[0-9]+]], v[[HIVAL]], [[MIDRESULT]]
  ; VI-NEXT: v_bcnt_u32_b32 [[RESULT:v[0-9]+]], v[[HIVAL]], [[MIDRESULT]]
@@ -49,7 +53,9 @@ define amdgpu_kernel void @v_ctpop_i64(i32 addrspace(1)* noalias %out, i64 addrs
  ; GCN: buffer_store_dwordx2 v{{\[}}[[RESULT_LO]]:[[RESULT_HI]]{{\]}}
  ; GCN: s_endpgm
  define amdgpu_kernel void @v_ctpop_i64_user(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in, i64 %s.val) nounwind {
-  %val = load i64, i64 addrspace(1)* %in, align 8
+  %tid = call i32 @llvm.r600.read.tidig.x()
+  %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %tid
+  %val = load i64, i64 addrspace(1)* %in.gep, align 8
    %ctpop = call i64 @llvm.ctpop.i64(i64 %val) nounwind readnone
    %or = or i64 %ctpop, %s.val
    store i64 %or, i64 addrspace(1)* %out
@@ -87,7 +93,9 @@ define amdgpu_kernel void @s_ctpop_v4i64(<4 x i32> addrspace(1)* noalias %out, <
  ; GCN: v_bcnt_u32_b32
  ; GCN: s_endpgm
  define amdgpu_kernel void @v_ctpop_v2i64(<2 x i32> addrspace(1)* noalias %out, <2 x i64> addrspace(1)* noalias %in) nounwind {
-  %val = load <2 x i64>, <2 x i64> addrspace(1)* %in, align 16
+  %tid = call i32 @llvm.r600.read.tidig.x()
+  %in.gep = getelementptr <2 x i64>, <2 x i64> addrspace(1)* %in, i32 %tid
+  %val = load <2 x i64>, <2 x i64> addrspace(1)* %in.gep, align 16
    %ctpop = call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %val) nounwind readnone
    %truncctpop = trunc <2 x i64> %ctpop to <2 x i32>
    store <2 x i32> %truncctpop, <2 x i32> addrspace(1)* %out, align 8
@@ -105,7 +113,9 @@ define amdgpu_kernel void @v_ctpop_v2i64(<2 x i32> addrspace(1)* noalias %out, <
  ; GCN: v_bcnt_u32_b32
  ; GCN: s_endpgm
  define amdgpu_kernel void @v_ctpop_v4i64(<4 x i32> addrspace(1)* noalias %out, <4 x i64> addrspace(1)* noalias %in) nounwind {
-  %val = load <4 x i64>, <4 x i64> addrspace(1)* %in, align 32
+  %tid = call i32 @llvm.r600.read.tidig.x()
+  %in.gep = getelementptr <4 x i64>, <4 x i64> addrspace(1)* %in, i32 %tid
+  %val = load <4 x i64>, <4 x i64> addrspace(1)* %in.gep, align 32
    %ctpop = call <4 x i64> @llvm.ctpop.v4i64(<4 x i64> %val) nounwind readnone
    %truncctpop = trunc <4 x i64> %ctpop to <4 x i32>
    store <4 x i32> %truncctpop, <4 x i32> addrspace(1)* %out, align 16
@@ -169,7 +179,8 @@ define amdgpu_kernel void @s_ctpop_i65(i32 addrspace(1)* noalias %out, i65 %val)
  ; FIXME: Should not have extra add
  
  ; FUNC-LABEL: {{^}}v_ctpop_i128:
-; GCN: buffer_load_dwordx4 v{{\[}}[[VAL0:[0-9]+]]:[[VAL3:[0-9]+]]{{\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
+; SI: buffer_load_dwordx4 v{{\[}}[[VAL0:[0-9]+]]:[[VAL3:[0-9]+]]{{\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64
+; VI: flat_load_dwordx4   v{{\[}}[[VAL0:[0-9]+]]:[[VAL3:[0-9]+]]{{\]}}, v{{\[[0-9]+:[0-9]+\]}}
  
  ; GCN-DAG: v_bcnt_u32_b32{{(_e64)*}} [[MIDRESULT0:v[0-9]+]], v{{[0-9]+}}, 0
  ; GCN-DAG: v_bcnt_u32_b32{{(_e32)*(_e64)*}} [[MIDRESULT1:v[0-9]+]], v[[VAL3]], [[MIDRESULT0]]
@@ -182,7 +193,9 @@ define amdgpu_kernel void @s_ctpop_i65(i32 addrspace(1)* noalias %out, i65 %val)
  ; GCN: buffer_store_dword [[RESULT]],
  ; GCN: s_endpgm
  define amdgpu_kernel void @v_ctpop_i128(i32 addrspace(1)* noalias %out, i128 addrspace(1)* noalias %in) nounwind {
-  %val = load i128, i128 addrspace(1)* %in, align 8
+  %tid = call i32 @llvm.r600.read.tidig.x()
+  %in.gep = getelementptr i128, i128 addrspace(1)* %in, i32 %tid
+  %val = load i128, i128 addrspace(1)* %in.gep, align 8
    %ctpop = call i128 @llvm.ctpop.i128(i128 %val) nounwind readnone
    %truncctpop = trunc i128 %ctpop to i32
    store i32 %truncctpop, i32 addrspace(1)* %out, align 4
diff --git a/test/CodeGen/AMDGPU/cttz_zero_undef.ll b/test/CodeGen/AMDGPU/cttz_zero_undef.ll

index 1fa6407647eb8be0b8a1e692c44844e3f96e8255..1bfd38d94bfdf93504ce62fb06fc156cf6561a4e 100644 (file)
--- a/test/CodeGen/AMDGPU/cttz_zero_undef.ll
+++ b/test/CodeGen/AMDGPU/cttz_zero_undef.ll
@@ -5,6 +5,7 @@
  declare i32 @llvm.cttz.i32(i32, i1) nounwind readnone
  declare <2 x i32> @llvm.cttz.v2i32(<2 x i32>, i1) nounwind readnone
  declare <4 x i32> @llvm.cttz.v4i32(<4 x i32>, i1) nounwind readnone
+declare i32 @llvm.r600.read.tidig.x() nounwind readnone
  
  ; FUNC-LABEL: {{^}}s_cttz_zero_undef_i32:
  ; SI: s_load_dword [[VAL:s[0-9]+]],
@@ -21,21 +22,23 @@ define amdgpu_kernel void @s_cttz_zero_undef_i32(i32 addrspace(1)* noalias %out,
  }
  
  ; FUNC-LABEL: {{^}}v_cttz_zero_undef_i32:
-; SI: buffer_load_dword [[VAL:v[0-9]+]],
+; SI: {{buffer|flat}}_load_dword [[VAL:v[0-9]+]],
  ; SI: v_ffbl_b32_e32 [[RESULT:v[0-9]+]], [[VAL]]
  ; SI: buffer_store_dword [[RESULT]],
  ; SI: s_endpgm
  ; EG: MEM_RAT_CACHELESS STORE_RAW [[RESULT:T[0-9]+\.[XYZW]]]
  ; EG: FFBL_INT {{\*? *}}[[RESULT]]
  define amdgpu_kernel void @v_cttz_zero_undef_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind {
-  %val = load i32, i32 addrspace(1)* %valptr, align 4
+  %tid = call i32 @llvm.r600.read.tidig.x()
+  %in.gep = getelementptr i32, i32 addrspace(1)* %valptr, i32 %tid
+  %val = load i32, i32 addrspace(1)* %in.gep, align 4
    %cttz = call i32 @llvm.cttz.i32(i32 %val, i1 true) nounwind readnone
    store i32 %cttz, i32 addrspace(1)* %out, align 4
    ret void
  }
  
  ; FUNC-LABEL: {{^}}v_cttz_zero_undef_v2i32:
-; SI: buffer_load_dwordx2
+; SI: {{buffer|flat}}_load_dwordx2
  ; SI: v_ffbl_b32_e32
  ; SI: v_ffbl_b32_e32
  ; SI: buffer_store_dwordx2
@@ -44,14 +47,16 @@ define amdgpu_kernel void @v_cttz_zero_undef_i32(i32 addrspace(1)* noalias %out,
  ; EG: FFBL_INT {{\*? *}}[[RESULT]]
  ; EG: FFBL_INT {{\*? *}}[[RESULT]]
  define amdgpu_kernel void @v_cttz_zero_undef_v2i32(<2 x i32> addrspace(1)* noalias %out, <2 x i32> addrspace(1)* noalias %valptr) nounwind {
-  %val = load <2 x i32>, <2 x i32> addrspace(1)* %valptr, align 8
+  %tid = call i32 @llvm.r600.read.tidig.x()
+  %in.gep = getelementptr <2 x i32>, <2 x i32> addrspace(1)* %valptr, i32 %tid
+  %val = load <2 x i32>, <2 x i32> addrspace(1)* %in.gep, align 8
    %cttz = call <2 x i32> @llvm.cttz.v2i32(<2 x i32> %val, i1 true) nounwind readnone
    store <2 x i32> %cttz, <2 x i32> addrspace(1)* %out, align 8
    ret void
  }
  
  ; FUNC-LABEL: {{^}}v_cttz_zero_undef_v4i32:
-; SI: buffer_load_dwordx4
+; SI: {{buffer|flat}}_load_dwordx4
  ; SI: v_ffbl_b32_e32
  ; SI: v_ffbl_b32_e32
  ; SI: v_ffbl_b32_e32
@@ -64,7 +69,9 @@ define amdgpu_kernel void @v_cttz_zero_undef_v2i32(<2 x i32> addrspace(1)* noali
  ; EG: FFBL_INT {{\*? *}}[[RESULT]]
  ; EG: FFBL_INT {{\*? *}}[[RESULT]]
  define amdgpu_kernel void @v_cttz_zero_undef_v4i32(<4 x i32> addrspace(1)* noalias %out, <4 x i32> addrspace(1)* noalias %valptr) nounwind {
-  %val = load <4 x i32>, <4 x i32> addrspace(1)* %valptr, align 16
+  %tid = call i32 @llvm.r600.read.tidig.x()
+  %in.gep = getelementptr <4 x i32>, <4 x i32> addrspace(1)* %valptr, i32 %tid
+  %val = load <4 x i32>, <4 x i32> addrspace(1)* %in.gep, align 16
    %cttz = call <4 x i32> @llvm.cttz.v4i32(<4 x i32> %val, i1 true) nounwind readnone
    store <4 x i32> %cttz, <4 x i32> addrspace(1)* %out, align 16
    ret void
diff --git a/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll b/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll

index 0328ce31002df9aace971435d844ea1cd52e69fc..f839129fc3d871d45b658b29fca1c7bea615d71c 100644 (file)
--- a/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll
+++ b/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll
@@ -5,46 +5,52 @@ declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
  declare i32 @llvm.amdgcn.workitem.id.y() nounwind readnone
  
  ; GCN-LABEL: {{^}}load_i8_to_f32:
-; GCN: buffer_load_ubyte [[LOADREG:v[0-9]+]],
+; GCN: {{buffer|flat}}_load_ubyte [[LOADREG:v[0-9]+]],
  ; GCN-NOT: bfe
  ; GCN-NOT: lshr
  ; GCN: v_cvt_f32_ubyte0_e32 [[CONV:v[0-9]+]], [[LOADREG]]
  ; GCN: buffer_store_dword [[CONV]],
  define amdgpu_kernel void @load_i8_to_f32(float addrspace(1)* noalias %out, i8 addrspace(1)* noalias %in) nounwind {
-  %load = load i8, i8 addrspace(1)* %in, align 1
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %gep = getelementptr i8, i8 addrspace(1)* %in, i32 %tid 
+  %load = load i8, i8 addrspace(1)* %gep, align 1
    %cvt = uitofp i8 %load to float
    store float %cvt, float addrspace(1)* %out, align 4
    ret void
  }
  
  ; GCN-LABEL: {{^}}load_v2i8_to_v2f32:
-; GCN: buffer_load_ushort [[LD:v[0-9]+]]
+; GCN: {{buffer|flat}}_load_ushort [[LD:v[0-9]+]]
  ; GCN-DAG: v_cvt_f32_ubyte1_e32 v[[HIRESULT:[0-9]+]], [[LD]]
  ; GCN-DAG: v_cvt_f32_ubyte0_e32 v[[LORESULT:[0-9]+]], [[LD]]
  ; GCN: buffer_store_dwordx2 v{{\[}}[[LORESULT]]:[[HIRESULT]]{{\]}},
  define amdgpu_kernel void @load_v2i8_to_v2f32(<2 x float> addrspace(1)* noalias %out, <2 x i8> addrspace(1)* noalias %in) nounwind {
-  %load = load <2 x i8>, <2 x i8> addrspace(1)* %in, align 2
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %gep = getelementptr <2 x i8>, <2 x i8> addrspace(1)* %in, i32 %tid 
+  %load = load <2 x i8>, <2 x i8> addrspace(1)* %gep, align 2
    %cvt = uitofp <2 x i8> %load to <2 x float>
    store <2 x float> %cvt, <2 x float> addrspace(1)* %out, align 16
    ret void
  }
  
  ; GCN-LABEL: {{^}}load_v3i8_to_v3f32:
-; GCN: buffer_load_dword [[VAL:v[0-9]+]]
+; GCN: {{buffer|flat}}_load_dword [[VAL:v[0-9]+]]
  ; GCN-NOT: v_cvt_f32_ubyte3_e32
  ; GCN-DAG: v_cvt_f32_ubyte2_e32 v{{[0-9]+}}, [[VAL]]
  ; GCN-DAG: v_cvt_f32_ubyte1_e32 v[[HIRESULT:[0-9]+]], [[VAL]]
  ; GCN-DAG: v_cvt_f32_ubyte0_e32 v[[LORESULT:[0-9]+]], [[VAL]]
  ; GCN: buffer_store_dwordx2 v{{\[}}[[LORESULT]]:[[HIRESULT]]{{\]}},
  define amdgpu_kernel void @load_v3i8_to_v3f32(<3 x float> addrspace(1)* noalias %out, <3 x i8> addrspace(1)* noalias %in) nounwind {
-  %load = load <3 x i8>, <3 x i8> addrspace(1)* %in, align 4
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %gep = getelementptr <3 x i8>, <3 x i8> addrspace(1)* %in, i32 %tid 
+  %load = load <3 x i8>, <3 x i8> addrspace(1)* %gep, align 4
    %cvt = uitofp <3 x i8> %load to <3 x float>
    store <3 x float> %cvt, <3 x float> addrspace(1)* %out, align 16
    ret void
  }
  
  ; GCN-LABEL: {{^}}load_v4i8_to_v4f32:
-; GCN: buffer_load_dword [[LOADREG:v[0-9]+]]
+; GCN: {{buffer|flat}}_load_dword [[LOADREG:v[0-9]+]]
  ; GCN-NOT: bfe
  ; GCN-NOT: lshr
  ; GCN-DAG: v_cvt_f32_ubyte3_e32 v[[HIRESULT:[0-9]+]], [[LOADREG]]
@@ -53,7 +59,9 @@ define amdgpu_kernel void @load_v3i8_to_v3f32(<3 x float> addrspace(1)* noalias
  ; GCN-DAG: v_cvt_f32_ubyte0_e32 v[[LORESULT:[0-9]+]], [[LOADREG]]
  ; GCN: buffer_store_dwordx4 v{{\[}}[[LORESULT]]:[[HIRESULT]]{{\]}},
  define amdgpu_kernel void @load_v4i8_to_v4f32(<4 x float> addrspace(1)* noalias %out, <4 x i8> addrspace(1)* noalias %in) nounwind {
-  %load = load <4 x i8>, <4 x i8> addrspace(1)* %in, align 4
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %gep = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %in, i32 %tid 
+  %load = load <4 x i8>, <4 x i8> addrspace(1)* %gep, align 4
    %cvt = uitofp <4 x i8> %load to <4 x float>
    store <4 x float> %cvt, <4 x float> addrspace(1)* %out, align 16
    ret void
@@ -64,10 +72,10 @@ define amdgpu_kernel void @load_v4i8_to_v4f32(<4 x float> addrspace(1)* noalias
  
  ; FIXME: Packing bytes
  ; GCN-LABEL: {{^}}load_v4i8_to_v4f32_unaligned:
-; GCN: buffer_load_ubyte [[LOADREG3:v[0-9]+]]
-; GCN: buffer_load_ubyte [[LOADREG2:v[0-9]+]]
-; GCN: buffer_load_ubyte [[LOADREG1:v[0-9]+]]
-; GCN: buffer_load_ubyte [[LOADREG0:v[0-9]+]]
+; GCN: {{buffer|flat}}_load_ubyte [[LOADREG3:v[0-9]+]]
+; GCN: {{buffer|flat}}_load_ubyte [[LOADREG2:v[0-9]+]]
+; GCN: {{buffer|flat}}_load_ubyte [[LOADREG1:v[0-9]+]]
+; GCN: {{buffer|flat}}_load_ubyte [[LOADREG0:v[0-9]+]]
  ; GCN-DAG: v_lshlrev_b32
  ; GCN-DAG: v_or_b32
  ; GCN-DAG: v_cvt_f32_ubyte0_e32 v[[LORESULT:[0-9]+]],
@@ -77,7 +85,9 @@ define amdgpu_kernel void @load_v4i8_to_v4f32(<4 x float> addrspace(1)* noalias
  
  ; GCN: buffer_store_dwordx4
  define amdgpu_kernel void @load_v4i8_to_v4f32_unaligned(<4 x float> addrspace(1)* noalias %out, <4 x i8> addrspace(1)* noalias %in) nounwind {
-  %load = load <4 x i8>, <4 x i8> addrspace(1)* %in, align 1
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %gep = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %in, i32 %tid 
+  %load = load <4 x i8>, <4 x i8> addrspace(1)* %gep, align 1
    %cvt = uitofp <4 x i8> %load to <4 x float>
    store <4 x float> %cvt, <4 x float> addrspace(1)* %out, align 16
    ret void
@@ -124,14 +134,16 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_2_uses(<4 x float> addrspace(1)* n
  ; GCN-LABEL: {{^}}load_v7i8_to_v7f32:
  ; GCN: s_endpgm
  define amdgpu_kernel void @load_v7i8_to_v7f32(<7 x float> addrspace(1)* noalias %out, <7 x i8> addrspace(1)* noalias %in) nounwind {
-  %load = load <7 x i8>, <7 x i8> addrspace(1)* %in, align 1
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %gep = getelementptr <7 x i8>, <7 x i8> addrspace(1)* %in, i32 %tid 
+  %load = load <7 x i8>, <7 x i8> addrspace(1)* %gep, align 1
    %cvt = uitofp <7 x i8> %load to <7 x float>
    store <7 x float> %cvt, <7 x float> addrspace(1)* %out, align 16
    ret void
  }
  
  ; GCN-LABEL: {{^}}load_v8i8_to_v8f32:
-; GCN: buffer_load_dwordx2 v{{\[}}[[LOLOAD:[0-9]+]]:[[HILOAD:[0-9]+]]{{\]}},
+; GCN: {{buffer|flat}}_load_dwordx2 v{{\[}}[[LOLOAD:[0-9]+]]:[[HILOAD:[0-9]+]]{{\]}},
  ; GCN-NOT: bfe
  ; GCN-NOT: lshr
  ; GCN-DAG: v_cvt_f32_ubyte3_e32 v{{[0-9]+}}, v[[LOLOAD]]
@@ -147,19 +159,23 @@ define amdgpu_kernel void @load_v7i8_to_v7f32(<7 x float> addrspace(1)* noalias
  ; GCN: buffer_store_dwordx4
  ; GCN: buffer_store_dwordx4
  define amdgpu_kernel void @load_v8i8_to_v8f32(<8 x float> addrspace(1)* noalias %out, <8 x i8> addrspace(1)* noalias %in) nounwind {
-  %load = load <8 x i8>, <8 x i8> addrspace(1)* %in, align 8
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %gep = getelementptr <8 x i8>, <8 x i8> addrspace(1)* %in, i32 %tid 
+  %load = load <8 x i8>, <8 x i8> addrspace(1)* %gep, align 8
    %cvt = uitofp <8 x i8> %load to <8 x float>
    store <8 x float> %cvt, <8 x float> addrspace(1)* %out, align 16
    ret void
  }
  
  ; GCN-LABEL: {{^}}i8_zext_inreg_i32_to_f32:
-; GCN: buffer_load_dword [[LOADREG:v[0-9]+]],
+; GCN: {{buffer|flat}}_load_dword [[LOADREG:v[0-9]+]],
  ; GCN: v_add_i32_e32 [[ADD:v[0-9]+]], vcc, 2, [[LOADREG]]
  ; GCN-NEXT: v_cvt_f32_ubyte0_e32 [[CONV:v[0-9]+]], [[ADD]]
  ; GCN: buffer_store_dword [[CONV]],
  define amdgpu_kernel void @i8_zext_inreg_i32_to_f32(float addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind {
-  %load = load i32, i32 addrspace(1)* %in, align 4
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %gep = getelementptr i32, i32 addrspace(1)* %in, i32 %tid 
+  %load = load i32, i32 addrspace(1)* %gep, align 4
    %add = add i32 %load, 2
    %inreg = and i32 %add, 255
    %cvt = uitofp i32 %inreg to float
@@ -169,7 +185,9 @@ define amdgpu_kernel void @i8_zext_inreg_i32_to_f32(float addrspace(1)* noalias
  
  ; GCN-LABEL: {{^}}i8_zext_inreg_hi1_to_f32:
  define amdgpu_kernel void @i8_zext_inreg_hi1_to_f32(float addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind {
-  %load = load i32, i32 addrspace(1)* %in, align 4
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %gep = getelementptr i32, i32 addrspace(1)* %in, i32 %tid 
+  %load = load i32, i32 addrspace(1)* %gep, align 4
    %inreg = and i32 %load, 65280
    %shr = lshr i32 %inreg, 8
    %cvt = uitofp i32 %shr to float
@@ -181,7 +199,9 @@ define amdgpu_kernel void @i8_zext_inreg_hi1_to_f32(float addrspace(1)* noalias
  ; them so it shouldn't really matter.
  ; GCN-LABEL: {{^}}i8_zext_i32_to_f32:
  define amdgpu_kernel void @i8_zext_i32_to_f32(float addrspace(1)* noalias %out, i8 addrspace(1)* noalias %in) nounwind {
-  %load = load i8, i8 addrspace(1)* %in, align 1
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %gep = getelementptr i8, i8 addrspace(1)* %in, i32 %tid 
+  %load = load i8, i8 addrspace(1)* %gep, align 1
    %ext = zext i8 %load to i32
    %cvt = uitofp i32 %ext to float
    store float %cvt, float addrspace(1)* %out, align 4
@@ -190,7 +210,9 @@ define amdgpu_kernel void @i8_zext_i32_to_f32(float addrspace(1)* noalias %out,
  
  ; GCN-LABEL: {{^}}v4i8_zext_v4i32_to_v4f32:
  define amdgpu_kernel void @v4i8_zext_v4i32_to_v4f32(<4 x float> addrspace(1)* noalias %out, <4 x i8> addrspace(1)* noalias %in) nounwind {
-  %load = load <4 x i8>, <4 x i8> addrspace(1)* %in, align 1
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %gep = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %in, i32 %tid 
+  %load = load <4 x i8>, <4 x i8> addrspace(1)* %gep, align 1
    %ext = zext <4 x i8> %load to <4 x i32>
    %cvt = uitofp <4 x i32> %ext to <4 x float>
    store <4 x float> %cvt, <4 x float> addrspace(1)* %out, align 16
@@ -198,12 +220,14 @@ define amdgpu_kernel void @v4i8_zext_v4i32_to_v4f32(<4 x float> addrspace(1)* no
  }
  
  ; GCN-LABEL: {{^}}extract_byte0_to_f32:
-; GCN: buffer_load_dword [[VAL:v[0-9]+]]
+; GCN: {{buffer|flat}}_load_dword [[VAL:v[0-9]+]]
  ; GCN-NOT: [[VAL]]
  ; GCN: v_cvt_f32_ubyte0_e32 [[CONV:v[0-9]+]], [[VAL]]
  ; GCN: buffer_store_dword [[CONV]]
  define amdgpu_kernel void @extract_byte0_to_f32(float addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind {
-  %val = load i32, i32 addrspace(1)* %in
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %gep = getelementptr i32, i32 addrspace(1)* %in, i32 %tid 
+  %val = load i32, i32 addrspace(1)* %gep
    %and = and i32 %val, 255
    %cvt = uitofp i32 %and to float
    store float %cvt, float addrspace(1)* %out
@@ -211,12 +235,14 @@ define amdgpu_kernel void @extract_byte0_to_f32(float addrspace(1)* noalias %out
  }
  
  ; GCN-LABEL: {{^}}extract_byte1_to_f32:
-; GCN: buffer_load_dword [[VAL:v[0-9]+]]
+; GCN: {{buffer|flat}}_load_dword [[VAL:v[0-9]+]]
  ; GCN-NOT: [[VAL]]
  ; GCN: v_cvt_f32_ubyte1_e32 [[CONV:v[0-9]+]], [[VAL]]
  ; GCN: buffer_store_dword [[CONV]]
  define amdgpu_kernel void @extract_byte1_to_f32(float addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind {
-  %val = load i32, i32 addrspace(1)* %in
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %gep = getelementptr i32, i32 addrspace(1)* %in, i32 %tid 
+  %val = load i32, i32 addrspace(1)* %gep
    %srl = lshr i32 %val, 8
    %and = and i32 %srl, 255
    %cvt = uitofp i32 %and to float
@@ -225,12 +251,14 @@ define amdgpu_kernel void @extract_byte1_to_f32(float addrspace(1)* noalias %out
  }
  
  ; GCN-LABEL: {{^}}extract_byte2_to_f32:
-; GCN: buffer_load_dword [[VAL:v[0-9]+]]
+; GCN: {{buffer|flat}}_load_dword [[VAL:v[0-9]+]]
  ; GCN-NOT: [[VAL]]
  ; GCN: v_cvt_f32_ubyte2_e32 [[CONV:v[0-9]+]], [[VAL]]
  ; GCN: buffer_store_dword [[CONV]]
  define amdgpu_kernel void @extract_byte2_to_f32(float addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind {
-  %val = load i32, i32 addrspace(1)* %in
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %gep = getelementptr i32, i32 addrspace(1)* %in, i32 %tid 
+  %val = load i32, i32 addrspace(1)* %gep
    %srl = lshr i32 %val, 16
    %and = and i32 %srl, 255
    %cvt = uitofp i32 %and to float
@@ -239,12 +267,14 @@ define amdgpu_kernel void @extract_byte2_to_f32(float addrspace(1)* noalias %out
  }
  
  ; GCN-LABEL: {{^}}extract_byte3_to_f32:
-; GCN: buffer_load_dword [[VAL:v[0-9]+]]
+; GCN: {{buffer|flat}}_load_dword [[VAL:v[0-9]+]]
  ; GCN-NOT: [[VAL]]
  ; GCN: v_cvt_f32_ubyte3_e32 [[CONV:v[0-9]+]], [[VAL]]
  ; GCN: buffer_store_dword [[CONV]]
  define amdgpu_kernel void @extract_byte3_to_f32(float addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind {
-  %val = load i32, i32 addrspace(1)* %in
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %gep = getelementptr i32, i32 addrspace(1)* %in, i32 %tid 
+  %val = load i32, i32 addrspace(1)* %gep
    %srl = lshr i32 %val, 24
    %and = and i32 %srl, 255
    %cvt = uitofp i32 %and to float
diff --git a/test/CodeGen/AMDGPU/early-if-convert-cost.ll b/test/CodeGen/AMDGPU/early-if-convert-cost.ll

index ace01593808b7eef238357c9eef4437f0d44ee5a..74404989f8c71e55f98397145ac15e283a580ff7 100644 (file)
--- a/test/CodeGen/AMDGPU/early-if-convert-cost.ll
+++ b/test/CodeGen/AMDGPU/early-if-convert-cost.ll
@@ -1,4 +1,4 @@
-; RUN: llc -stress-early-ifcvt -amdgpu-early-ifcvt=1 -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+; RUN:  llc -amdgpu-scalarize-global-loads=false  -stress-early-ifcvt -amdgpu-early-ifcvt=1 -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
  
  ; FIXME: Most of these cases that don't trigger because of broken cost
  ; heuristics. Should not need -stress-early-ifcvt
diff --git a/test/CodeGen/AMDGPU/early-if-convert.ll b/test/CodeGen/AMDGPU/early-if-convert.ll

index 9439130deb9ef79499a300b6826cfa83be163201..792f0b1eaef46ad4896873b38004a407590a8f75 100644 (file)
--- a/test/CodeGen/AMDGPU/early-if-convert.ll
+++ b/test/CodeGen/AMDGPU/early-if-convert.ll
@@ -1,4 +1,4 @@
-; RUN: llc -march=amdgcn -mcpu=verde -amdgpu-early-ifcvt=1 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+; RUN:  llc -amdgpu-scalarize-global-loads=false  -march=amdgcn -mcpu=verde -amdgpu-early-ifcvt=1 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
  ; XUN: llc -march=amdgcn -mcpu=tonga -amdgpu-early-ifcvt=1 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
  
  ; FIXME: This leaves behind a now unnecessary and with exec
diff --git a/test/CodeGen/AMDGPU/enable-no-signed-zeros-fp-math.ll b/test/CodeGen/AMDGPU/enable-no-signed-zeros-fp-math.ll

index 6eb1fc1d0cc29a3d12fa279699a0cfe88090498e..4e52a5f80d8e62c198a01e48831a0c6b9ccfcec2 100644 (file)
--- a/test/CodeGen/AMDGPU/enable-no-signed-zeros-fp-math.ll
+++ b/test/CodeGen/AMDGPU/enable-no-signed-zeros-fp-math.ll
@@ -2,6 +2,8 @@
  ; RUN: llc -march=amdgcn -enable-no-signed-zeros-fp-math=1 < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-UNSAFE %s
  ; RUN: llc -march=amdgcn -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-UNSAFE %s
  
+declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
+
  ; Test that the -enable-no-signed-zeros-fp-math flag works
  
  ; GCN-LABEL: {{^}}fneg_fsub_f32:
@@ -10,8 +12,11 @@
  
  ; GCN-UNSAFE-NOT: xor
  define amdgpu_kernel void @fneg_fsub_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
-  %b_ptr = getelementptr float, float addrspace(1)* %in, i32 1
-  %a = load float, float addrspace(1)* %in, align 4
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %add = add i32 %tid, 1
+  %gep = getelementptr float, float addrspace(1)* %in, i32 %tid
+  %b_ptr = getelementptr float, float addrspace(1)* %in, i32 %add
+  %a = load float, float addrspace(1)* %gep, align 4
    %b = load float, float addrspace(1)* %b_ptr, align 4
    %result = fsub float %a, %b
    %neg.result = fsub float -0.0, %result
diff --git a/test/CodeGen/AMDGPU/extractelt-to-trunc.ll b/test/CodeGen/AMDGPU/extractelt-to-trunc.ll

index 34999fa3aea43584e11aa6a55ae39f1a09cb1e93..3fb452de1ccf458e5945c923e8add6d9f65b72dc 100644 (file)
--- a/test/CodeGen/AMDGPU/extractelt-to-trunc.ll
+++ b/test/CodeGen/AMDGPU/extractelt-to-trunc.ll
@@ -1,5 +1,7 @@
  ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
  
+declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
+
  ; Make sure the add and load are reduced to 32-bits even with the
  ; bitcast to vector.
  ; GCN-LABEL: {{^}}bitcast_int_to_vector_extract_0:
@@ -8,7 +10,9 @@
  ; GCN: v_add_i32_e32 [[ADD:v[0-9]+]], vcc, [[B]], [[A]]
  ; GCN: buffer_store_dword [[ADD]]
  define amdgpu_kernel void @bitcast_int_to_vector_extract_0(i32 addrspace(1)* %out, i64 addrspace(1)* %in, i64 %b) {
-   %a = load i64, i64 addrspace(1)* %in
+   %tid = call i32 @llvm.amdgcn.workitem.id.x()
+   %gep = getelementptr i64, i64 addrspace(1)* %in, i32 %tid 
+   %a = load i64, i64 addrspace(1)* %gep
     %add = add i64 %a, %b
     %val.bc = bitcast i64 %add to <2 x i32>
     %extract = extractelement <2 x i32> %val.bc, i32 0
@@ -21,7 +25,9 @@ define amdgpu_kernel void @bitcast_int_to_vector_extract_0(i32 addrspace(1)* %ou
  ; GCN: v_add_f64
  ; GCN: buffer_store_dword v
  define amdgpu_kernel void @bitcast_fp_to_vector_extract_0(i32 addrspace(1)* %out, double addrspace(1)* %in, double %b) {
-   %a = load double, double addrspace(1)* %in
+   %tid = call i32 @llvm.amdgcn.workitem.id.x()
+   %gep = getelementptr double, double addrspace(1)* %in, i32 %tid 
+   %a = load double, double addrspace(1)* %gep
     %add = fadd double %a, %b
     %val.bc = bitcast double %add to <2 x i32>
     %extract = extractelement <2 x i32> %val.bc, i32 0
@@ -34,7 +40,9 @@ define amdgpu_kernel void @bitcast_fp_to_vector_extract_0(i32 addrspace(1)* %out
  ; GCN: v_add_i32
  ; GCN: buffer_store_dword
  define amdgpu_kernel void @bitcast_int_to_fpvector_extract_0(float addrspace(1)* %out, i64 addrspace(1)* %in, i64 %b) {
-   %a = load i64, i64 addrspace(1)* %in
+   %tid = call i32 @llvm.amdgcn.workitem.id.x()
+   %gep = getelementptr i64, i64 addrspace(1)* %in, i32 %tid 
+   %a = load i64, i64 addrspace(1)* %gep
     %add = add i64 %a, %b
     %val.bc = bitcast i64 %add to <2 x float>
     %extract = extractelement <2 x float> %val.bc, i32 0
diff --git a/test/CodeGen/AMDGPU/fabs.f16.ll b/test/CodeGen/AMDGPU/fabs.f16.ll

index 4e2ec4b3054fed2c63d33ec7b1e65b0ef0abeff0..158e64454aa688606adfb67ee9acb9690a67ff3d 100644 (file)
--- a/test/CodeGen/AMDGPU/fabs.f16.ll
+++ b/test/CodeGen/AMDGPU/fabs.f16.ll
@@ -134,7 +134,9 @@ define amdgpu_kernel void @fabs_free_v2f16(<2 x half> addrspace(1)* %out, i32 %i
  ; GFX9: v_and_b32_e32 [[FABS:v[0-9]+]], 0x7fff7fff, [[VAL]]
  ; GFX9: v_pk_mul_f16 v{{[0-9]+}}, [[FABS]], v{{[0-9]+$}}
  define amdgpu_kernel void @v_fabs_fold_v2f16(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %in) #0 {
-  %val = load <2 x half>, <2 x half> addrspace(1)* %in
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %gep = getelementptr <2 x half>, <2 x half> addrspace(1)* %in, i32 %tid 
+  %val = load <2 x half>, <2 x half> addrspace(1)* %gep
    %fabs = call <2 x half> @llvm.fabs.v2f16(<2 x half> %val)
    %fmul = fmul <2 x half> %fabs, %val
    store <2 x half> %fmul, <2 x half> addrspace(1)* %out
diff --git a/test/CodeGen/AMDGPU/fadd.f16.ll b/test/CodeGen/AMDGPU/fadd.f16.ll

index 08199be144f4914b8d860e62587875faf9f77a04..1d7652f80657a7bd310e5fb98e7701fb4e73e1ff 100644 (file)
--- a/test/CodeGen/AMDGPU/fadd.f16.ll
+++ b/test/CodeGen/AMDGPU/fadd.f16.ll
@@ -2,8 +2,8 @@
  ; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
  
  ; GCN-LABEL: {{^}}fadd_f16
-; GCN: buffer_load_ushort v[[A_F16:[0-9]+]]
-; GCN: buffer_load_ushort v[[B_F16:[0-9]+]]
+; GCN: {{buffer|flat}}_load_ushort v[[A_F16:[0-9]+]]
+; GCN: {{buffer|flat}}_load_ushort v[[B_F16:[0-9]+]]
  ; SI:  v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]]
  ; SI:  v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], v[[B_F16]]
  ; SI:  v_add_f32_e32 v[[R_F32:[0-9]+]], v[[B_F32]], v[[A_F32]]
@@ -24,7 +24,7 @@ entry:
  }
  
  ; GCN-LABEL: {{^}}fadd_f16_imm_a
-; GCN: buffer_load_ushort v[[B_F16:[0-9]+]]
+; GCN: {{buffer|flat}}_load_ushort v[[B_F16:[0-9]+]]
  ; SI:  v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], v[[B_F16]]
  ; SI:  v_add_f32_e32 v[[R_F32:[0-9]+]], 1.0, v[[B_F32]]
  ; SI:  v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[R_F32]]
@@ -42,7 +42,7 @@ entry:
  }
  
  ; GCN-LABEL: {{^}}fadd_f16_imm_b
-; GCN: buffer_load_ushort v[[A_F16:[0-9]+]]
+; GCN: {{buffer|flat}}_load_ushort v[[A_F16:[0-9]+]]
  ; SI:  v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]]
  ; SI:  v_add_f32_e32 v[[R_F32:[0-9]+]], 2.0, v[[A_F32]]
  ; SI:  v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[R_F32]]
@@ -60,8 +60,8 @@ entry:
  }
  
  ; GCN-LABEL: {{^}}fadd_v2f16:
-; GCN: buffer_load_dword v[[A_V2_F16:[0-9]+]]
-; GCN: buffer_load_dword v[[B_V2_F16:[0-9]+]]
+; GCN: {{buffer|flat}}_load_dword v[[A_V2_F16:[0-9]+]]
+; GCN: {{buffer|flat}}_load_dword v[[B_V2_F16:[0-9]+]]
  
  ; SI:  v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
  ; SI: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
@@ -70,8 +70,8 @@ entry:
  
  ; SI-DAG:  v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
  ; SI-DAG:  v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]]
-; SI:  v_add_f32_e32 v[[R_F32_0:[0-9]+]], v[[B_F32_0]], v[[A_F32_0]]
-; SI:  v_add_f32_e32 v[[R_F32_1:[0-9]+]], v[[B_F32_1]], v[[A_F32_1]]
+; SI-DAG:  v_add_f32_e32 v[[R_F32_0:[0-9]+]], v[[B_F32_0]], v[[A_F32_0]]
+; SI-DAG:  v_add_f32_e32 v[[R_F32_1:[0-9]+]], v[[B_F32_1]], v[[A_F32_1]]
  ; SI-DAG:  v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]]
  ; SI-DAG:  v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
  ; SI: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
@@ -88,15 +88,18 @@ define amdgpu_kernel void @fadd_v2f16(
      <2 x half> addrspace(1)* %a,
      <2 x half> addrspace(1)* %b) {
  entry:
-  %a.val = load <2 x half>, <2 x half> addrspace(1)* %a
-  %b.val = load <2 x half>, <2 x half> addrspace(1)* %b
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %gep.a = getelementptr inbounds <2 x half>, <2 x half> addrspace(1)* %a, i32 %tid
+  %gep.b = getelementptr inbounds <2 x half>, <2 x half> addrspace(1)* %b, i32 %tid
+  %a.val = load <2 x half>, <2 x half> addrspace(1)* %gep.a
+  %b.val = load <2 x half>, <2 x half> addrspace(1)* %gep.b
    %r.val = fadd <2 x half> %a.val, %b.val
    store <2 x half> %r.val, <2 x half> addrspace(1)* %r
    ret void
  }
  
  ; GCN-LABEL: {{^}}fadd_v2f16_imm_a:
-; GCN-DAG: buffer_load_dword v[[B_V2_F16:[0-9]+]]
+; GCN-DAG: {{buffer|flat}}_load_dword v[[B_V2_F16:[0-9]+]]
  ; SI:  v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]]
  ; SI:  v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]]
  ; SI:  v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]]
@@ -118,14 +121,16 @@ define amdgpu_kernel void @fadd_v2f16_imm_a(
      <2 x half> addrspace(1)* %r,
      <2 x half> addrspace(1)* %b) {
  entry:
-  %b.val = load <2 x half>, <2 x half> addrspace(1)* %b
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %gep.b = getelementptr inbounds <2 x half>, <2 x half> addrspace(1)* %b, i32 %tid
+  %b.val = load <2 x half>, <2 x half> addrspace(1)* %gep.b
    %r.val = fadd <2 x half> <half 1.0, half 2.0>, %b.val
    store <2 x half> %r.val, <2 x half> addrspace(1)* %r
    ret void
  }
  
  ; GCN-LABEL: {{^}}fadd_v2f16_imm_b:
-; GCN-DAG: buffer_load_dword v[[A_V2_F16:[0-9]+]]
+; GCN-DAG: {{buffer|flat}}_load_dword v[[A_V2_F16:[0-9]+]]
  ; SI:  v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
  ; SI-DAG: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
  ; SI:  v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
@@ -147,8 +152,15 @@ define amdgpu_kernel void @fadd_v2f16_imm_b(
      <2 x half> addrspace(1)* %r,
      <2 x half> addrspace(1)* %a) {
  entry:
-  %a.val = load <2 x half>, <2 x half> addrspace(1)* %a
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %gep.a = getelementptr inbounds <2 x half>, <2 x half> addrspace(1)* %a, i32 %tid
+  %a.val = load <2 x half>, <2 x half> addrspace(1)* %gep.a
    %r.val = fadd <2 x half> %a.val, <half 2.0, half 1.0>
    store <2 x half> %r.val, <2 x half> addrspace(1)* %r
    ret void
  }
+
+declare i32 @llvm.amdgcn.workitem.id.x() #1
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readnone }
diff --git a/test/CodeGen/AMDGPU/fadd64.ll b/test/CodeGen/AMDGPU/fadd64.ll

index c936d98673ba1dbdd9ac2384950e6f2dc5acfe3a..8fd1f52006fbbf9550501107b3ef4453a7eb8171 100644 (file)
--- a/test/CodeGen/AMDGPU/fadd64.ll
+++ b/test/CodeGen/AMDGPU/fadd64.ll
@@ -5,8 +5,11 @@
  ; CHECK: v_add_f64 {{v[[0-9]+:[0-9]+]}}, {{v[[0-9]+:[0-9]+]}}, {{v[[0-9]+:[0-9]+]}}
  define amdgpu_kernel void @v_fadd_f64(double addrspace(1)* %out, double addrspace(1)* %in1,
                          double addrspace(1)* %in2) {
-  %r0 = load double, double addrspace(1)* %in1
-  %r1 = load double, double addrspace(1)* %in2
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %gep1 = getelementptr inbounds double, double addrspace(1)* %in1, i32 %tid
+  %gep2 = getelementptr inbounds double, double addrspace(1)* %in2, i32 %tid
+  %r0 = load double, double addrspace(1)* %gep1
+  %r1 = load double, double addrspace(1)* %gep2
    %r2 = fadd double %r0, %r1
    store double %r2, double addrspace(1)* %out
    ret void
@@ -42,3 +45,8 @@ define amdgpu_kernel void @s_fadd_v2f64(<2 x double> addrspace(1)* %out, <2 x do
    store <2 x double> %r2, <2 x double> addrspace(1)* %out
    ret void
  }
+
+declare i32 @llvm.amdgcn.workitem.id.x() #1
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readnone }
diff --git a/test/CodeGen/AMDGPU/fcanonicalize.f16.ll b/test/CodeGen/AMDGPU/fcanonicalize.f16.ll

index 404358f0ecb98e5b4c9e58e4c6c5aa6ec3e6bfc3..dd8e277c1c75f6d31135b087dcbe54ee7b5f4c18 100644 (file)
--- a/test/CodeGen/AMDGPU/fcanonicalize.f16.ll
+++ b/test/CodeGen/AMDGPU/fcanonicalize.f16.ll
@@ -5,6 +5,8 @@ declare half @llvm.fabs.f16(half) #0
  declare half @llvm.canonicalize.f16(half) #0
  declare <2 x half> @llvm.fabs.v2f16(<2 x half>) #0
  declare <2 x half> @llvm.canonicalize.v2f16(<2 x half>) #0
+declare i32 @llvm.amdgcn.workitem.id.x() #0
+
  
  ; GCN-LABEL: {{^}}v_test_canonicalize_var_f16:
  ; GCN: v_mul_f16_e32 [[REG:v[0-9]+]], 1.0, {{v[0-9]+}}
@@ -213,7 +215,9 @@ define amdgpu_kernel void @test_fold_canonicalize_snan3_value_f16(half addrspace
  ; GFX9: v_pk_mul_f16 [[REG:v[0-9]+]], 1.0, {{v[0-9]+$}}
  ; GFX9: buffer_store_dword [[REG]]
  define amdgpu_kernel void @v_test_canonicalize_var_v2f16(<2 x half> addrspace(1)* %out) #1 {
-  %val = load <2 x half>, <2 x half> addrspace(1)* %out
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %gep = getelementptr <2 x half>, <2 x half> addrspace(1)* %out, i32 %tid
+  %val = load <2 x half>, <2 x half> addrspace(1)* %gep
    %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> %val)
    store <2 x half> %canonicalized, <2 x half> addrspace(1)* %out
    ret void
@@ -233,7 +237,9 @@ define amdgpu_kernel void @v_test_canonicalize_var_v2f16(<2 x half> addrspace(1)
  ; GFX9: v_pk_mul_f16 [[REG:v[0-9]+]], 1.0, [[ABS]]{{$}}
  ; GCN: buffer_store_dword
  define amdgpu_kernel void @v_test_canonicalize_fabs_var_v2f16(<2 x half> addrspace(1)* %out) #1 {
-  %val = load <2 x half>, <2 x half> addrspace(1)* %out
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %gep = getelementptr <2 x half>, <2 x half> addrspace(1)* %out, i32 %tid
+  %val = load <2 x half>, <2 x half> addrspace(1)* %gep
    %val.fabs = call <2 x half> @llvm.fabs.v2f16(<2 x half> %val)
    %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> %val.fabs)
    store <2 x half> %canonicalized, <2 x half> addrspace(1)* %out
@@ -251,7 +257,9 @@ define amdgpu_kernel void @v_test_canonicalize_fabs_var_v2f16(<2 x half> addrspa
  ; GFX9: v_pk_mul_f16 [[REG:v[0-9]+]], 1.0, [[ABS]] neg_lo:[0,1] neg_hi:[0,1]{{$}}
  ; GCN: buffer_store_dword
  define amdgpu_kernel void @v_test_canonicalize_fneg_fabs_var_v2f16(<2 x half> addrspace(1)* %out) #1 {
-  %val = load <2 x half>, <2 x half> addrspace(1)* %out
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %gep = getelementptr <2 x half>, <2 x half> addrspace(1)* %out, i32 %tid
+  %val = load <2 x half>, <2 x half> addrspace(1)* %gep
    %val.fabs = call <2 x half> @llvm.fabs.v2f16(<2 x half> %val)
    %val.fabs.fneg = fsub <2 x half> <half -0.0, half -0.0>, %val.fabs
    %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> %val.fabs.fneg)
@@ -270,7 +278,9 @@ define amdgpu_kernel void @v_test_canonicalize_fneg_fabs_var_v2f16(<2 x half> ad
  ; GFX9: v_pk_mul_f16 [[REG:v[0-9]+]], 1.0, {{v[0-9]+}} neg_lo:[0,1] neg_hi:[0,1]{{$}}
  ; GFX9: buffer_store_dword [[REG]]
  define amdgpu_kernel void @v_test_canonicalize_fneg_var_v2f16(<2 x half> addrspace(1)* %out) #1 {
-  %val = load <2 x half>, <2 x half> addrspace(1)* %out
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %gep = getelementptr <2 x half>, <2 x half> addrspace(1)* %out, i32 %tid
+  %val = load <2 x half>, <2 x half> addrspace(1)* %gep
    %fneg.val = fsub <2 x half> <half -0.0, half -0.0>, %val
    %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> %fneg.val)
    store <2 x half> %canonicalized, <2 x half> addrspace(1)* %out
diff --git a/test/CodeGen/AMDGPU/fcanonicalize.ll b/test/CodeGen/AMDGPU/fcanonicalize.ll

index 8c385f40b1c5f9676732d75c46d29196aae102e1..feb4c7bd4a183816cbf111c026a3974ddfc577e7 100644 (file)
--- a/test/CodeGen/AMDGPU/fcanonicalize.ll
+++ b/test/CodeGen/AMDGPU/fcanonicalize.ll
@@ -1,4 +1,4 @@
-; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+; RUN:  llc -amdgpu-scalarize-global-loads=false  -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
  
  declare float @llvm.fabs.f32(float) #0
  declare float @llvm.canonicalize.f32(float) #0
diff --git a/test/CodeGen/AMDGPU/fcmp.f16.ll b/test/CodeGen/AMDGPU/fcmp.f16.ll

index 7916226462f774312cb98b03ae5b440f17850ea5..13d5f7e91c7e487a2d7796906fe249ddaf676fad 100644 (file)
--- a/test/CodeGen/AMDGPU/fcmp.f16.ll
+++ b/test/CodeGen/AMDGPU/fcmp.f16.ll
@@ -1,5 +1,5 @@
-; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
-; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
+; RUN:  llc -amdgpu-scalarize-global-loads=false  -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
+; RUN:  llc -amdgpu-scalarize-global-loads=false  -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
  
  ; GCN-LABEL: {{^}}fcmp_f16_lt
  ; GCN: buffer_load_ushort v[[A_F16:[0-9]+]]
diff --git a/test/CodeGen/AMDGPU/fcmp64.ll b/test/CodeGen/AMDGPU/fcmp64.ll

index b9e1921d4c45589454a62c324576084883725e56..95f7e0be7d9c9c75e49e9cebe6d78097bbcca3fe 100644 (file)
--- a/test/CodeGen/AMDGPU/fcmp64.ll
+++ b/test/CodeGen/AMDGPU/fcmp64.ll
@@ -2,7 +2,7 @@
  ; RUN: llc < %s -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs | FileCheck %s
  
  ; CHECK-LABEL: {{^}}flt_f64:
-; CHECK: v_cmp_nge_f64_e32 vcc, {{v[[0-9]+:[0-9]+], v[[0-9]+:[0-9]+]}}
+; CHECK: v_cmp_nge_f64_e32 vcc, {{s[[0-9]+:[0-9]+], v[[0-9]+:[0-9]+]}}
  define amdgpu_kernel void @flt_f64(i32 addrspace(1)* %out, double addrspace(1)* %in1,
                       double addrspace(1)* %in2) {
     %r0 = load double, double addrspace(1)* %in1
@@ -14,7 +14,7 @@ define amdgpu_kernel void @flt_f64(i32 addrspace(1)* %out, double addrspace(1)*
  }
  
  ; CHECK-LABEL: {{^}}fle_f64:
-; CHECK: v_cmp_ngt_f64_e32 vcc, {{v[[0-9]+:[0-9]+], v[[0-9]+:[0-9]+]}}
+; CHECK: v_cmp_ngt_f64_e32 vcc, {{s[[0-9]+:[0-9]+], v[[0-9]+:[0-9]+]}}
  define amdgpu_kernel void @fle_f64(i32 addrspace(1)* %out, double addrspace(1)* %in1,
                       double addrspace(1)* %in2) {
     %r0 = load double, double addrspace(1)* %in1
@@ -26,7 +26,7 @@ define amdgpu_kernel void @fle_f64(i32 addrspace(1)* %out, double addrspace(1)*
  }
  
  ; CHECK-LABEL: {{^}}fgt_f64:
-; CHECK: v_cmp_nle_f64_e32 vcc, {{v[[0-9]+:[0-9]+], v[[0-9]+:[0-9]+]}}
+; CHECK: v_cmp_nle_f64_e32 vcc, {{s[[0-9]+:[0-9]+], v[[0-9]+:[0-9]+]}}
  define amdgpu_kernel void @fgt_f64(i32 addrspace(1)* %out, double addrspace(1)* %in1,
                       double addrspace(1)* %in2) {
     %r0 = load double, double addrspace(1)* %in1
@@ -38,7 +38,7 @@ define amdgpu_kernel void @fgt_f64(i32 addrspace(1)* %out, double addrspace(1)*
  }
  
  ; CHECK-LABEL: {{^}}fge_f64:
-; CHECK: v_cmp_nlt_f64_e32 vcc, {{v[[0-9]+:[0-9]+], v[[0-9]+:[0-9]+]}}
+; CHECK: v_cmp_nlt_f64_e32 vcc, {{s[[0-9]+:[0-9]+], v[[0-9]+:[0-9]+]}}
  define amdgpu_kernel void @fge_f64(i32 addrspace(1)* %out, double addrspace(1)* %in1,
                       double addrspace(1)* %in2) {
     %r0 = load double, double addrspace(1)* %in1
@@ -50,7 +50,7 @@ define amdgpu_kernel void @fge_f64(i32 addrspace(1)* %out, double addrspace(1)*
  }
  
  ; CHECK-LABEL: {{^}}fne_f64:
-; CHECK: v_cmp_neq_f64_e32 vcc, {{v[[0-9]+:[0-9]+], v[[0-9]+:[0-9]+]}}
+; CHECK: v_cmp_neq_f64_e32 vcc, {{s[[0-9]+:[0-9]+], v[[0-9]+:[0-9]+]}}
  define amdgpu_kernel void @fne_f64(double addrspace(1)* %out, double addrspace(1)* %in1,
                       double addrspace(1)* %in2) {
     %r0 = load double, double addrspace(1)* %in1
@@ -62,7 +62,7 @@ define amdgpu_kernel void @fne_f64(double addrspace(1)* %out, double addrspace(1
  }
  
  ; CHECK-LABEL: {{^}}feq_f64:
-; CHECK: v_cmp_nlg_f64_e32 vcc, {{v[[0-9]+:[0-9]+], v[[0-9]+:[0-9]+]}}
+; CHECK: v_cmp_nlg_f64_e32 vcc, {{s[[0-9]+:[0-9]+], v[[0-9]+:[0-9]+]}}
  define amdgpu_kernel void @feq_f64(double addrspace(1)* %out, double addrspace(1)* %in1,
                       double addrspace(1)* %in2) {
     %r0 = load double, double addrspace(1)* %in1
diff --git a/test/CodeGen/AMDGPU/fconst64.ll b/test/CodeGen/AMDGPU/fconst64.ll

index 1255977962454b17aa05417140b4ca4b72efc3c4..ca313d80894a638f1befda79a58f8ca8b4ac2578 100644 (file)
--- a/test/CodeGen/AMDGPU/fconst64.ll
+++ b/test/CodeGen/AMDGPU/fconst64.ll
@@ -6,8 +6,15 @@
  ; CHECK-DAG: s_mov_b32 {{s[0-9]+}}, 0
  
  define amdgpu_kernel void @fconst_f64(double addrspace(1)* %out, double addrspace(1)* %in) {
-   %r1 = load double, double addrspace(1)* %in
+   %tid = call i32 @llvm.amdgcn.workitem.id.x()
+   %gep = getelementptr inbounds double, double addrspace(1)* %in, i32 %tid
+   %r1 = load double, double addrspace(1)* %gep
     %r2 = fadd double %r1, 5.000000e+00
     store double %r2, double addrspace(1)* %out
     ret void
  }
+
+declare i32 @llvm.amdgcn.workitem.id.x() #1
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readnone }
diff --git a/test/CodeGen/AMDGPU/fcopysign.f16.ll b/test/CodeGen/AMDGPU/fcopysign.f16.ll

index 4e2bf765cd95f193230d5cac05a5f4fc971388ea..8e984246cc94d80cf0765284576331f7f8b6efd3 100644 (file)
--- a/test/CodeGen/AMDGPU/fcopysign.f16.ll
+++ b/test/CodeGen/AMDGPU/fcopysign.f16.ll
@@ -1,6 +1,6 @@
-; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=SI %s
-; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=GFX89 -check-prefix=GFX8 %s
-; RUN: llc -march=amdgcn -mcpu=gfx901 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=GFX89 -check-prefix=GFX9 %s
+; RUN:  llc -amdgpu-scalarize-global-loads=false  -march=amdgcn -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=SI %s
+; RUN:  llc -amdgpu-scalarize-global-loads=false  -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=GFX89 -check-prefix=GFX8 %s
+; RUN:  llc -amdgpu-scalarize-global-loads=false  -march=amdgcn -mcpu=gfx901 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=GFX89 -check-prefix=GFX9 %s
  
  declare half @llvm.copysign.f16(half, half)
  declare float @llvm.copysign.f32(float, float)
@@ -9,16 +9,18 @@ declare <2 x half> @llvm.copysign.v2f16(<2 x half>, <2 x half>)
  declare <3 x half> @llvm.copysign.v3f16(<3 x half>, <3 x half>)
  declare <4 x half> @llvm.copysign.v4f16(<4 x half>, <4 x half>)
  
+declare i32 @llvm.amdgcn.workitem.id.x()
+
  ; GCN-LABEL: {{^}}test_copysign_f16:
-; SI: buffer_load_ushort v[[SIGN:[0-9]+]]
-; SI: buffer_load_ushort v[[MAG:[0-9]+]]
+; SI: {{buffer|flat}}_load_ushort v[[SIGN:[0-9]+]]
+; SI: {{buffer|flat}}_load_ushort v[[MAG:[0-9]+]]
  ; SI: s_brev_b32 s[[CONST:[0-9]+]], -2
  ; SI-DAG: v_cvt_f32_f16_e32 v[[MAG_F32:[0-9]+]], v[[MAG]]
  ; SI-DAG: v_cvt_f32_f16_e32 v[[SIGN_F32:[0-9]+]], v[[SIGN]]
  ; SI: v_bfi_b32 v[[OUT_F32:[0-9]+]], s[[CONST]], v[[MAG_F32]], v[[SIGN_F32]]
  ; SI: v_cvt_f16_f32_e32 v[[OUT:[0-9]+]], v[[OUT_F32]]
-; GFX89: buffer_load_ushort v[[SIGN:[0-9]+]]
-; GFX89: buffer_load_ushort v[[MAG:[0-9]+]]
+; GFX89: {{buffer|flat}}_load_ushort v[[SIGN:[0-9]+]]
+; GFX89: {{buffer|flat}}_load_ushort v[[MAG:[0-9]+]]
  ; GFX89: s_movk_i32 s[[CONST:[0-9]+]], 0x7fff
  ; GFX89: v_bfi_b32 v[[OUT:[0-9]+]], s[[CONST]], v[[MAG]], v[[SIGN]]
  ; GCN: buffer_store_short v[[OUT]]
@@ -36,8 +38,8 @@ entry:
  }
  
  ; GCN-LABEL: {{^}}test_copysign_out_f32_mag_f16_sign_f32:
-; GCN-DAG: buffer_load_ushort v[[MAG:[0-9]+]]
-; GCN-DAG: buffer_load_dword v[[SIGN:[0-9]+]]
+; GCN-DAG: {{buffer|flat}}_load_ushort v[[MAG:[0-9]+]]
+; GCN-DAG: {{buffer|flat}}_load_dword v[[SIGN:[0-9]+]]
  ; GCN-DAG: s_brev_b32 s[[CONST:[0-9]+]], -2
  ; GCN-DAG: v_cvt_f32_f16_e32 v[[MAG_EXT:[0-9]+]], v[[MAG]]
  ; GCN: v_bfi_b32 v[[OUT:[0-9]+]], s[[CONST]], v[[MAG_EXT]], v[[SIGN]]
@@ -48,17 +50,20 @@ define amdgpu_kernel void @test_copysign_out_f32_mag_f16_sign_f32(
    half addrspace(1)* %arg_mag,
    float addrspace(1)* %arg_sign) {
  entry:
-  %mag = load half, half addrspace(1)* %arg_mag
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %arg_mag_gep = getelementptr half, half addrspace(1)* %arg_mag, i32 %tid
+  %mag = load half, half addrspace(1)* %arg_mag_gep
    %mag.ext = fpext half %mag to float
-  %sign = load float, float addrspace(1)* %arg_sign
+  %arg_sign_gep = getelementptr float, float addrspace(1)* %arg_sign, i32 %tid
+  %sign = load float, float addrspace(1)* %arg_sign_gep
    %out = call float @llvm.copysign.f32(float %mag.ext, float %sign)
    store float %out, float addrspace(1)* %arg_out
    ret void
  }
  
  ; GCN-LABEL: {{^}}test_copysign_out_f64_mag_f16_sign_f64:
-; GCN-DAG: buffer_load_ushort v[[MAG:[0-9]+]]
-; GCN-DAG: buffer_load_dwordx2 v{{\[}}[[SIGN_LO:[0-9]+]]:[[SIGN_HI:[0-9]+]]{{\]}}
+; GCN-DAG: {{buffer|flat}}_load_ushort v[[MAG:[0-9]+]]
+; GCN-DAG: {{buffer|flat}}_load_dwordx2 v{{\[}}[[SIGN_LO:[0-9]+]]:[[SIGN_HI:[0-9]+]]{{\]}}
  ; GCN-DAG: s_brev_b32 s[[CONST:[0-9]+]], -2
  ; GCN-DAG: v_cvt_f32_f16_e32 v[[MAG_EXT:[0-9]+]], v[[MAG]]
  ; GCN-DAG: v_cvt_f64_f32_e32 v{{\[}}[[MAG_EXT_LO:[0-9]+]]:[[MAG_EXT_HI:[0-9]+]]{{\]}}, v[[MAG_EXT]]
@@ -70,17 +75,20 @@ define amdgpu_kernel void @test_copysign_out_f64_mag_f16_sign_f64(
    half addrspace(1)* %arg_mag,
    double addrspace(1)* %arg_sign) {
  entry:
-  %mag = load half, half addrspace(1)* %arg_mag
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %arg_mag_gep = getelementptr half, half addrspace(1)* %arg_mag, i32 %tid
+  %mag = load half, half addrspace(1)* %arg_mag_gep
    %mag.ext = fpext half %mag to double
-  %sign = load double, double addrspace(1)* %arg_sign
+  %arg_sign_gep = getelementptr double, double addrspace(1)* %arg_sign, i32 %tid
+  %sign = load double, double addrspace(1)* %arg_sign_gep
    %out = call double @llvm.copysign.f64(double %mag.ext, double %sign)
    store double %out, double addrspace(1)* %arg_out
    ret void
  }
  
  ; GCN-LABEL: {{^}}test_copysign_out_f32_mag_f32_sign_f16:
-; GCN-DAG: buffer_load_dword v[[MAG:[0-9]+]]
-; GCN-DAG: buffer_load_ushort v[[SIGN:[0-9]+]]
+; GCN-DAG: {{buffer|flat}}_load_dword v[[MAG:[0-9]+]]
+; GCN-DAG: {{buffer|flat}}_load_ushort v[[SIGN:[0-9]+]]
  ; GCN-DAG: s_brev_b32 s[[CONST:[0-9]+]], -2
  ; SI-DAG: v_cvt_f32_f16_e32 v[[SIGN_F32:[0-9]+]], v[[SIGN]]
  ; SI: v_bfi_b32 v[[OUT:[0-9]+]], s[[CONST]], v[[MAG]], v[[SIGN_F32]]
@@ -93,8 +101,11 @@ define amdgpu_kernel void @test_copysign_out_f32_mag_f32_sign_f16(
    float addrspace(1)* %arg_mag,
    half addrspace(1)* %arg_sign) {
  entry:
-  %mag = load float, float addrspace(1)* %arg_mag
-  %sign = load half, half addrspace(1)* %arg_sign
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %arg_mag_gep = getelementptr float, float addrspace(1)* %arg_mag, i32 %tid
+  %mag = load float, float addrspace(1)* %arg_mag_gep
+  %arg_sign_gep = getelementptr half, half addrspace(1)* %arg_sign, i32 %tid
+  %sign = load half, half addrspace(1)* %arg_sign_gep
    %sign.ext = fpext half %sign to float
    %out = call float @llvm.copysign.f32(float %mag, float %sign.ext)
    store float %out, float addrspace(1)* %arg_out
@@ -102,8 +113,8 @@ entry:
  }
  
  ; GCN-LABEL: {{^}}test_copysign_out_f64_mag_f64_sign_f16:
-; GCN-DAG: buffer_load_dwordx2 v{{\[}}[[MAG_LO:[0-9]+]]:[[MAG_HI:[0-9]+]]{{\]}}
-; GCN-DAG: buffer_load_ushort v[[SIGN:[0-9]+]]
+; GCN-DAG: {{buffer|flat}}_load_dwordx2 v{{\[}}[[MAG_LO:[0-9]+]]:[[MAG_HI:[0-9]+]]{{\]}}
+; GCN-DAG: {{buffer|flat}}_load_ushort v[[SIGN:[0-9]+]]
  ; GCN-DAG: s_brev_b32 s[[CONST:[0-9]+]], -2
  ; SI-DAG: v_cvt_f32_f16_e32 v[[SIGN_F32:[0-9]+]], v[[SIGN]]
  ; SI: v_bfi_b32 v[[OUT_HI:[0-9]+]], s[[CONST]], v[[MAG_HI]], v[[SIGN_F32]]
@@ -116,8 +127,11 @@ define amdgpu_kernel void @test_copysign_out_f64_mag_f64_sign_f16(
    double addrspace(1)* %arg_mag,
    half addrspace(1)* %arg_sign) {
  entry:
-  %mag = load double, double addrspace(1)* %arg_mag
-  %sign = load half, half addrspace(1)* %arg_sign
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %arg_mag_gep = getelementptr double, double addrspace(1)* %arg_mag, i32 %tid
+  %mag = load double, double addrspace(1)* %arg_mag_gep
+  %arg_sign_gep = getelementptr half, half addrspace(1)* %arg_sign, i32 %tid
+  %sign = load half, half addrspace(1)* %arg_sign_gep
    %sign.ext = fpext half %sign to double
    %out = call double @llvm.copysign.f64(double %mag, double %sign.ext)
    store double %out, double addrspace(1)* %arg_out
@@ -125,8 +139,8 @@ entry:
  }
  
  ; GCN-LABEL: {{^}}test_copysign_out_f16_mag_f16_sign_f32:
-; GCN-DAG: buffer_load_ushort v[[MAG:[0-9]+]]
-; GCN-DAG: buffer_load_dword v[[SIGN:[0-9]+]]
+; GCN-DAG: {{buffer|flat}}_load_ushort v[[MAG:[0-9]+]]
+; GCN-DAG: {{buffer|flat}}_load_dword v[[SIGN:[0-9]+]]
  ; SI-DAG: s_brev_b32 s[[CONST:[0-9]+]], -2
  ; SI-DAG: v_cvt_f32_f16_e32 v[[MAG_F32:[0-9]+]], v[[MAG]]
  ; SI: v_bfi_b32 v[[OUT_F32:[0-9]+]], s[[CONST]], v[[MAG_F32]], v[[SIGN]]
@@ -141,8 +155,11 @@ define amdgpu_kernel void @test_copysign_out_f16_mag_f16_sign_f32(
    half addrspace(1)* %arg_mag,
    float addrspace(1)* %arg_sign) {
  entry:
-  %mag = load half, half addrspace(1)* %arg_mag
-  %sign = load float, float addrspace(1)* %arg_sign
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %arg_mag_gep = getelementptr half, half addrspace(1)* %arg_mag, i32 %tid
+  %mag = load half, half addrspace(1)* %arg_mag_gep
+  %arg_sign_gep = getelementptr float, float addrspace(1)* %arg_sign, i32 %tid
+  %sign = load float, float addrspace(1)* %arg_sign_gep
    %sign.trunc = fptrunc float %sign to half
    %out = call half @llvm.copysign.f16(half %mag, half %sign.trunc)
    store half %out, half addrspace(1)* %arg_out
@@ -150,8 +167,8 @@ entry:
  }
  
  ; GCN-LABEL: {{^}}test_copysign_out_f16_mag_f16_sign_f64:
-; GCN-DAG: buffer_load_ushort v[[MAG:[0-9]+]]
-; GCN-DAG: buffer_load_dwordx2 v{{\[}}[[SIGN_LO:[0-9]+]]:[[SIGN_HI:[0-9]+]]{{\]}}
+; GCN-DAG: {{buffer|flat}}_load_ushort v[[MAG:[0-9]+]]
+; GCN-DAG: {{buffer|flat}}_load_dwordx2 v{{\[}}[[SIGN_LO:[0-9]+]]:[[SIGN_HI:[0-9]+]]{{\]}}
  ; SI-DAG: s_brev_b32 s[[CONST:[0-9]+]], -2
  ; SI-DAG: v_cvt_f32_f16_e32 v[[MAG_F32:[0-9]+]], v[[MAG]]
  ; SI: v_bfi_b32 v[[OUT_F32:[0-9]+]], s[[CONST]], v[[MAG_F32]], v[[SIGN_HI]]
@@ -166,8 +183,11 @@ define amdgpu_kernel void @test_copysign_out_f16_mag_f16_sign_f64(
    half addrspace(1)* %arg_mag,
    double addrspace(1)* %arg_sign) {
  entry:
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %arg_mag_gep = getelementptr half, half addrspace(1)* %arg_mag, i32 %tid
    %mag = load half, half addrspace(1)* %arg_mag
-  %sign = load double, double addrspace(1)* %arg_sign
+  %arg_sign_gep = getelementptr double, double addrspace(1)* %arg_sign, i32 %tid
+  %sign = load double, double addrspace(1)* %arg_sign_gep
    %sign.trunc = fptrunc double %sign to half
    %out = call half @llvm.copysign.f16(half %mag, half %sign.trunc)
    store half %out, half addrspace(1)* %arg_out
@@ -175,8 +195,8 @@ entry:
  }
  
  ; GCN-LABEL: {{^}}test_copysign_out_f16_mag_f32_sign_f16:
-; GCN-DAG: buffer_load_dword v[[MAG:[0-9]+]]
-; GCN-DAG: buffer_load_ushort v[[SIGN:[0-9]+]]
+; GCN-DAG: {{buffer|flat}}_load_dword v[[MAG:[0-9]+]]
+; GCN-DAG: {{buffer|flat}}_load_ushort v[[SIGN:[0-9]+]]
  ; SI-DAG: s_brev_b32 s[[CONST:[0-9]+]], -2
  ; SI-DAG: v_cvt_f16_f32_e32 v[[MAG_TRUNC:[0-9]+]], v[[MAG]]
  ; SI-DAG: v_cvt_f32_f16_e32 v[[SIGN_F32:[0-9]+]], v[[SIGN]]
@@ -193,9 +213,12 @@ define amdgpu_kernel void @test_copysign_out_f16_mag_f32_sign_f16(
    float addrspace(1)* %arg_mag,
    half addrspace(1)* %arg_sign) {
  entry:
-  %mag = load float, float addrspace(1)* %arg_mag
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %arg_mag_gep = getelementptr float, float addrspace(1)* %arg_mag, i32 %tid
+  %mag = load float, float addrspace(1)* %arg_mag_gep
    %mag.trunc = fptrunc float %mag to half
-  %sign = load half, half addrspace(1)* %arg_sign
+  %arg_sign_gep = getelementptr half, half addrspace(1)* %arg_sign, i32 %tid
+  %sign = load half, half addrspace(1)* %arg_sign_gep
    %out = call half @llvm.copysign.f16(half %mag.trunc, half %sign)
    store half %out, half addrspace(1)* %arg_out
    ret void
diff --git a/test/CodeGen/AMDGPU/fma-combine.ll b/test/CodeGen/AMDGPU/fma-combine.ll

index 4113ba8dc1f0798ff1c61f8355bb0fc46c654fc1..ba6b53bad6b792d669fe81f9f58575533a0cca76 100644 (file)
--- a/test/CodeGen/AMDGPU/fma-combine.ll
+++ b/test/CodeGen/AMDGPU/fma-combine.ll
@@ -1,6 +1,6 @@
-; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs -fp-contract=fast < %s | FileCheck -check-prefix=SI-NOFMA -check-prefix=SI-SAFE -check-prefix=SI -check-prefix=FUNC %s
-; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs -fp-contract=fast < %s | FileCheck -check-prefix=SI-NOFMA -check-prefix=SI-SAFE -check-prefix=SI -check-prefix=FUNC %s
-; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs -fp-contract=fast -enable-no-infs-fp-math -enable-unsafe-fp-math -mattr=+fp32-denormals < %s | FileCheck -check-prefix=SI-FMA -check-prefix=SI-UNSAFE -check-prefix=SI -check-prefix=FUNC %s
+; RUN:  llc -amdgpu-scalarize-global-loads=false  -march=amdgcn -mcpu=tahiti -verify-machineinstrs -fp-contract=fast < %s | FileCheck -check-prefix=SI-NOFMA -check-prefix=SI-SAFE -check-prefix=SI -check-prefix=FUNC %s
+; RUN:  llc -amdgpu-scalarize-global-loads=false  -march=amdgcn -mcpu=verde -verify-machineinstrs -fp-contract=fast < %s | FileCheck -check-prefix=SI-NOFMA -check-prefix=SI-SAFE -check-prefix=SI -check-prefix=FUNC %s
+; RUN:  llc -amdgpu-scalarize-global-loads=false  -march=amdgcn -mcpu=tahiti -verify-machineinstrs -fp-contract=fast -enable-no-infs-fp-math -enable-unsafe-fp-math -mattr=+fp32-denormals < %s | FileCheck -check-prefix=SI-FMA -check-prefix=SI-UNSAFE -check-prefix=SI -check-prefix=FUNC %s
  
  ; Note: The SI-FMA conversions of type x * (y + 1) --> x * y + x would be
  ; beneficial even without fp32 denormals, but they do require no-infs-fp-math
diff --git a/test/CodeGen/AMDGPU/fma.f64.ll b/test/CodeGen/AMDGPU/fma.f64.ll

index 4d3f3712621ef86461c10a06f99bfa7c8bb8bdd8..907121f1cd46bb3c10ec20f53a44006977857813 100644 (file)
--- a/test/CodeGen/AMDGPU/fma.f64.ll
+++ b/test/CodeGen/AMDGPU/fma.f64.ll
@@ -1,5 +1,5 @@
-; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
-; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN:  llc -amdgpu-scalarize-global-loads=false  -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN:  llc -amdgpu-scalarize-global-loads=false  -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
  
  declare double @llvm.fma.f64(double, double, double) nounwind readnone
  declare <2 x double> @llvm.fma.v2f64(<2 x double>, <2 x double>, <2 x double>) nounwind readnone
diff --git a/test/CodeGen/AMDGPU/fma.ll b/test/CodeGen/AMDGPU/fma.ll

index 659cecb59ebf7c1a1fd70c07cdc22268479ffb34..6be4c450a51edea007f124c9780882f769f97752 100644 (file)
--- a/test/CodeGen/AMDGPU/fma.ll
+++ b/test/CodeGen/AMDGPU/fma.ll
@@ -1,5 +1,5 @@
-; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
-; RUN: llc -march=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
+; RUN:  llc -amdgpu-scalarize-global-loads=false  -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN:  llc -amdgpu-scalarize-global-loads=false  -march=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
  
  declare float @llvm.fma.f32(float, float, float) nounwind readnone
  declare <2 x float> @llvm.fma.v2f32(<2 x float>, <2 x float>, <2 x float>) nounwind readnone
diff --git a/test/CodeGen/AMDGPU/fmul.f16.ll b/test/CodeGen/AMDGPU/fmul.f16.ll

index cd86409e20384803f648d4062d5f6fee876894ce..eaf95a62b5ac758e0c493e0389d392f314f6a5c2 100644 (file)
--- a/test/CodeGen/AMDGPU/fmul.f16.ll
+++ b/test/CodeGen/AMDGPU/fmul.f16.ll
@@ -1,5 +1,5 @@
-; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
-; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
+; RUN:  llc -amdgpu-scalarize-global-loads=false  -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
+; RUN:  llc -amdgpu-scalarize-global-loads=false  -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
  
  ; GCN-LABEL: {{^}}fmul_f16
  ; GCN: buffer_load_ushort v[[A_F16:[0-9]+]]
diff --git a/test/CodeGen/AMDGPU/fmul64.ll b/test/CodeGen/AMDGPU/fmul64.ll

index f14233f267b2be6f2ddf03658e0683f4551701b1..d37d432842f37d1b94fa73a78bb13bde38546ceb 100644 (file)
--- a/test/CodeGen/AMDGPU/fmul64.ll
+++ b/test/CodeGen/AMDGPU/fmul64.ll
@@ -1,5 +1,5 @@
-; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefix=FUNC -check-prefix=SI %s
-; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=FUNC -check-prefix=SI %s
+; RUN:  llc -amdgpu-scalarize-global-loads=false  -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefix=FUNC -check-prefix=SI %s
+; RUN:  llc -amdgpu-scalarize-global-loads=false  -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=FUNC -check-prefix=SI %s
  
  ; FUNC-LABEL: {{^}}fmul_f64:
  ; SI: v_mul_f64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\]}}
diff --git a/test/CodeGen/AMDGPU/fmuladd.f32.ll b/test/CodeGen/AMDGPU/fmuladd.f32.ll

index e422550266924e542de644927d256573f0f9c416..58c3d7fabd87406f0c6658b86605c877fe85ebf9 100644 (file)
--- a/test/CodeGen/AMDGPU/fmuladd.f32.ll
+++ b/test/CodeGen/AMDGPU/fmuladd.f32.ll
@@ -1,12 +1,12 @@
-; RUN: llc -verify-machineinstrs -mcpu=tahiti -mattr=-fp32-denormals,+fast-fmaf -fp-contract=on   < %s | FileCheck -check-prefixes=GCN,GCN-FLUSH-STRICT,GCN-FLUSH,SI-FLUSH,GCN-FLUSH-FASTFMA,GCN-FLUSH-FASTFMA-STRICT,SI %s
-; RUN: llc -verify-machineinstrs -mcpu=tahiti -mattr=+fp32-denormals,+fast-fmaf -fp-contract=on   < %s | FileCheck -check-prefixes=GCN,GCN-DENORM-STRICT,GCN-DENORM,SI-DENORM,GCN-DENORM-FASTFMA,GCN-DENORM-FASTFMA-STRICT,SI %s
-; RUN: llc -verify-machineinstrs -mcpu=verde  -mattr=-fp32-denormals,-fast-fmaf -fp-contract=on   < %s | FileCheck -check-prefixes=GCN,GCN-FLUSH-STRICT,GCN-FLUSH,SI-FLUSH,GCN-FLUSH-SLOWFMA,GCN-FLUSH-SLOWFMA-STRICT,SI %s
-; RUN: llc -verify-machineinstrs -mcpu=verde  -mattr=+fp32-denormals,-fast-fmaf -fp-contract=on   < %s | FileCheck -check-prefixes=GCN,GCN-DENORM-STRICT,GCN-DENORM,SI-DENORM,GCN-DENORM-SLOWFMA,GCN-DENORM-SLOWFMA-STRICT,SI %s
-
-; RUN: llc -verify-machineinstrs -mcpu=tahiti -mattr=-fp32-denormals,+fast-fmaf -fp-contract=fast < %s | FileCheck -check-prefixes=GCN,GCN-FLUSH-CONTRACT,GCN-FLUSH,SI-FLUSH,GCN-FLUSH-FASTFMA,GCN-FLUSH-FASTFMA-CONTRACT,SI %s
-; RUN: llc -verify-machineinstrs -mcpu=tahiti -mattr=+fp32-denormals,+fast-fmaf -fp-contract=fast < %s | FileCheck -check-prefixes=GCN,GCN-DENORM-CONTRACT,GCN-DENORM,SI-DENORM,GCN-DENORM-FASTFMA,GCN-DENORM-FASTFMA-CONTRACT,SI %s
-; RUN: llc -verify-machineinstrs -mcpu=verde  -mattr=-fp32-denormals,-fast-fmaf -fp-contract=fast < %s | FileCheck -check-prefixes=GCN,GCN-FLUSH-CONTRACT,GCN-FLUSH,SI-FLUSH,GCN-FLUSH-SLOWFMA,GCN-FLUSH-SLOWFMA-CONTRACT,SI %s
-; RUN: llc -verify-machineinstrs -mcpu=verde  -mattr=+fp32-denormals,-fast-fmaf -fp-contract=fast < %s | FileCheck -check-prefixes=GCN,GCN-DENORM-CONTRACT,GCN-DENORM,SI-DENORM,GCN-DENORM-SLOWFMA,GCN-DENORM-SLOWFMA-CONTRACT,SI %s
+; RUN:  llc -amdgpu-scalarize-global-loads=false  -verify-machineinstrs -mcpu=tahiti -mattr=-fp32-denormals,+fast-fmaf -fp-contract=on   < %s | FileCheck -check-prefixes=GCN,GCN-FLUSH-STRICT,GCN-FLUSH,SI-FLUSH,GCN-FLUSH-FASTFMA,GCN-FLUSH-FASTFMA-STRICT,SI %s
+; RUN:  llc -amdgpu-scalarize-global-loads=false  -verify-machineinstrs -mcpu=tahiti -mattr=+fp32-denormals,+fast-fmaf -fp-contract=on   < %s | FileCheck -check-prefixes=GCN,GCN-DENORM-STRICT,GCN-DENORM,SI-DENORM,GCN-DENORM-FASTFMA,GCN-DENORM-FASTFMA-STRICT,SI %s
+; RUN:  llc -amdgpu-scalarize-global-loads=false  -verify-machineinstrs -mcpu=verde  -mattr=-fp32-denormals,-fast-fmaf -fp-contract=on   < %s | FileCheck -check-prefixes=GCN,GCN-FLUSH-STRICT,GCN-FLUSH,SI-FLUSH,GCN-FLUSH-SLOWFMA,GCN-FLUSH-SLOWFMA-STRICT,SI %s
+; RUN:  llc -amdgpu-scalarize-global-loads=false  -verify-machineinstrs -mcpu=verde  -mattr=+fp32-denormals,-fast-fmaf -fp-contract=on   < %s | FileCheck -check-prefixes=GCN,GCN-DENORM-STRICT,GCN-DENORM,SI-DENORM,GCN-DENORM-SLOWFMA,GCN-DENORM-SLOWFMA-STRICT,SI %s
+
+; RUN:  llc -amdgpu-scalarize-global-loads=false  -verify-machineinstrs -mcpu=tahiti -mattr=-fp32-denormals,+fast-fmaf -fp-contract=fast < %s | FileCheck -check-prefixes=GCN,GCN-FLUSH-CONTRACT,GCN-FLUSH,SI-FLUSH,GCN-FLUSH-FASTFMA,GCN-FLUSH-FASTFMA-CONTRACT,SI %s
+; RUN:  llc -amdgpu-scalarize-global-loads=false  -verify-machineinstrs -mcpu=tahiti -mattr=+fp32-denormals,+fast-fmaf -fp-contract=fast < %s | FileCheck -check-prefixes=GCN,GCN-DENORM-CONTRACT,GCN-DENORM,SI-DENORM,GCN-DENORM-FASTFMA,GCN-DENORM-FASTFMA-CONTRACT,SI %s
+; RUN:  llc -amdgpu-scalarize-global-loads=false  -verify-machineinstrs -mcpu=verde  -mattr=-fp32-denormals,-fast-fmaf -fp-contract=fast < %s | FileCheck -check-prefixes=GCN,GCN-FLUSH-CONTRACT,GCN-FLUSH,SI-FLUSH,GCN-FLUSH-SLOWFMA,GCN-FLUSH-SLOWFMA-CONTRACT,SI %s
+; RUN:  llc -amdgpu-scalarize-global-loads=false  -verify-machineinstrs -mcpu=verde  -mattr=+fp32-denormals,-fast-fmaf -fp-contract=fast < %s | FileCheck -check-prefixes=GCN,GCN-DENORM-CONTRACT,GCN-DENORM,SI-DENORM,GCN-DENORM-SLOWFMA,GCN-DENORM-SLOWFMA-CONTRACT,SI %s
  
  ; Test all permutations of: fp32 denormals, fast fp contract, fp contract enabled for fmuladd, fmaf fast/slow.
  
diff --git a/test/CodeGen/AMDGPU/fmuladd.f64.ll b/test/CodeGen/AMDGPU/fmuladd.f64.ll

index 86e91e04b0fc3bc5c82afc4550e301fed26f5601..8d91a56ee421142a46eb7acacbd927273b6050da 100644 (file)
--- a/test/CodeGen/AMDGPU/fmuladd.f64.ll
+++ b/test/CodeGen/AMDGPU/fmuladd.f64.ll
@@ -1,9 +1,9 @@
-; RUN: llc -march=amdgcn -mcpu=tahiti -fp-contract=on -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-STRICTSI %s
-; RUN: llc -march=amdgcn -mcpu=verde  -fp-contract=on -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-STRICT,SI %s
-; RUN: llc -march=amdgcn -mcpu=tahiti -fp-contract=fast -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-CONTRACT,SI %s
-; RUN: llc -march=amdgcn -mcpu=verde  -fp-contract=fast -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-CONTRACT,SI %s
-; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global  -fp-contract=on -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-STRICT,VI %s
-; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global  -fp-contract=fast -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-CONTRACT,VI %s
+; RUN:  llc -amdgpu-scalarize-global-loads=false  -march=amdgcn -mcpu=tahiti -fp-contract=on -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-STRICTSI %s
+; RUN:  llc -amdgpu-scalarize-global-loads=false  -march=amdgcn -mcpu=verde  -fp-contract=on -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-STRICT,SI %s
+; RUN:  llc -amdgpu-scalarize-global-loads=false  -march=amdgcn -mcpu=tahiti -fp-contract=fast -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-CONTRACT,SI %s
+; RUN:  llc -amdgpu-scalarize-global-loads=false  -march=amdgcn -mcpu=verde  -fp-contract=fast -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-CONTRACT,SI %s
+; RUN:  llc -amdgpu-scalarize-global-loads=false  -march=amdgcn -mcpu=tonga -mattr=-flat-for-global  -fp-contract=on -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-STRICT,VI %s
+; RUN:  llc -amdgpu-scalarize-global-loads=false  -march=amdgcn -mcpu=tonga -mattr=-flat-for-global  -fp-contract=fast -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-CONTRACT,VI %s
  
  ; GCN-LABEL: {{^}}fmuladd_f64:
  ; GCN: v_fma_f64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\]}}
diff --git a/test/CodeGen/AMDGPU/fmuladd.v2f16.ll b/test/CodeGen/AMDGPU/fmuladd.v2f16.ll

index 624610096cbc5ea3f52ecd4975978fd12b8cc1db..b50a26c023ca3314d499280cf4332f3b4799ba67 100644 (file)
--- a/test/CodeGen/AMDGPU/fmuladd.v2f16.ll
+++ b/test/CodeGen/AMDGPU/fmuladd.v2f16.ll
@@ -1,12 +1,12 @@
-; RUN: llc -march=amdgcn -mcpu=gfx901 -mattr=-fp64-fp16-denormals -fp-contract=on -verify-machineinstrs -enable-packed-inlinable-literals < %s | FileCheck -check-prefixes=GCN,GCN-STRICT,GFX9-FLUSH,GFX9 %s
-; RUN: llc -march=amdgcn -mcpu=gfx901 -mattr=-fp64-fp16-denormals -fp-contract=on -verify-machineinstrs -enable-packed-inlinable-literals < %s | FileCheck -check-prefixes=GCN,GCN-STRICT,GFX9-FLUSH,GFX9 %s
-; RUN: llc -march=amdgcn -mcpu=gfx901 -mattr=-fp64-fp16-denormals -fp-contract=fast -verify-machineinstrs -enable-packed-inlinable-literals < %s | FileCheck -check-prefixes=GCN,GCN-CONTRACT,GFX9-FLUSH,GFX9 %s
-; RUN: llc -march=amdgcn -mcpu=gfx901 -mattr=-fp64-fp16-denormals -fp-contract=fast -verify-machineinstrs -enable-packed-inlinable-literals < %s | FileCheck -check-prefixes=GCN,GCN-CONTRACT,GFX9-FLUSH,GFX9 %s
-
-; RUN: llc -march=amdgcn -mcpu=gfx901 -mattr=+fp64-fp16-denormals -fp-contract=on -verify-machineinstrs -enable-packed-inlinable-literals < %s | FileCheck -check-prefixes=GCN,GCN-STRICT,GFX9-DENORM-STRICT,GFX9-DENORM,GFX9 %s
-; RUN: llc -march=amdgcn -mcpu=gfx901 -mattr=+fp64-fp16-denormals -fp-contract=on -verify-machineinstrs -enable-packed-inlinable-literals < %s | FileCheck -check-prefixes=GCN,GCN-STRICT,GFX9-DENORM-STRICT,GFX9-DENORM,GFX9 %s
-; RUN: llc -march=amdgcn -mcpu=gfx901 -mattr=+fp64-fp16-denormals -fp-contract=fast -verify-machineinstrs -enable-packed-inlinable-literals < %s | FileCheck -check-prefixes=GCN,GCN-CONTRACT,GFX9-DENORM-CONTRACT,GFX9-DENORM,GFX9 %s
-; RUN: llc -march=amdgcn -mcpu=gfx901 -mattr=+fp64-fp16-denormals -fp-contract=fast -verify-machineinstrs -enable-packed-inlinable-literals < %s | FileCheck -check-prefixes=GCN,GCN-CONTRACT,GFX9-DENORM-CONTRACT,GFX9-DENORM,GFX9 %s
+; RUN:  llc -amdgpu-scalarize-global-loads=false  -march=amdgcn -mcpu=gfx901 -mattr=-fp64-fp16-denormals -fp-contract=on -verify-machineinstrs -enable-packed-inlinable-literals < %s | FileCheck -check-prefixes=GCN,GCN-STRICT,GFX9-FLUSH,GFX9 %s
+; RUN:  llc -amdgpu-scalarize-global-loads=false  -march=amdgcn -mcpu=gfx901 -mattr=-fp64-fp16-denormals -fp-contract=on -verify-machineinstrs -enable-packed-inlinable-literals < %s | FileCheck -check-prefixes=GCN,GCN-STRICT,GFX9-FLUSH,GFX9 %s
+; RUN:  llc -amdgpu-scalarize-global-loads=false  -march=amdgcn -mcpu=gfx901 -mattr=-fp64-fp16-denormals -fp-contract=fast -verify-machineinstrs -enable-packed-inlinable-literals < %s | FileCheck -check-prefixes=GCN,GCN-CONTRACT,GFX9-FLUSH,GFX9 %s
+; RUN:  llc -amdgpu-scalarize-global-loads=false  -march=amdgcn -mcpu=gfx901 -mattr=-fp64-fp16-denormals -fp-contract=fast -verify-machineinstrs -enable-packed-inlinable-literals < %s | FileCheck -check-prefixes=GCN,GCN-CONTRACT,GFX9-FLUSH,GFX9 %s
+
+; RUN:  llc -amdgpu-scalarize-global-loads=false  -march=amdgcn -mcpu=gfx901 -mattr=+fp64-fp16-denormals -fp-contract=on -verify-machineinstrs -enable-packed-inlinable-literals < %s | FileCheck -check-prefixes=GCN,GCN-STRICT,GFX9-DENORM-STRICT,GFX9-DENORM,GFX9 %s
+; RUN:  llc -amdgpu-scalarize-global-loads=false  -march=amdgcn -mcpu=gfx901 -mattr=+fp64-fp16-denormals -fp-contract=on -verify-machineinstrs -enable-packed-inlinable-literals < %s | FileCheck -check-prefixes=GCN,GCN-STRICT,GFX9-DENORM-STRICT,GFX9-DENORM,GFX9 %s
+; RUN:  llc -amdgpu-scalarize-global-loads=false  -march=amdgcn -mcpu=gfx901 -mattr=+fp64-fp16-denormals -fp-contract=fast -verify-machineinstrs -enable-packed-inlinable-literals < %s | FileCheck -check-prefixes=GCN,GCN-CONTRACT,GFX9-DENORM-CONTRACT,GFX9-DENORM,GFX9 %s
+; RUN:  llc -amdgpu-scalarize-global-loads=false  -march=amdgcn -mcpu=gfx901 -mattr=+fp64-fp16-denormals -fp-contract=fast -verify-machineinstrs -enable-packed-inlinable-literals < %s | FileCheck -check-prefixes=GCN,GCN-CONTRACT,GFX9-DENORM-CONTRACT,GFX9-DENORM,GFX9 %s
  
  declare i32 @llvm.amdgcn.workitem.id.x() #1
  declare <2 x half> @llvm.fmuladd.v2f16(<2 x half>, <2 x half>, <2 x half>) #1
diff --git a/test/CodeGen/AMDGPU/fneg-fabs.ll b/test/CodeGen/AMDGPU/fneg-fabs.ll

index 0a7346f410c943d7cb13c286e3bc622194976f70..3f20ca73e9228d0c7e4e610780e398ba795cb21d 100644 (file)
--- a/test/CodeGen/AMDGPU/fneg-fabs.ll
+++ b/test/CodeGen/AMDGPU/fneg-fabs.ll
@@ -1,6 +1,6 @@
-; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
-; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
-; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=R600 -check-prefix=FUNC %s
+; RUN:  llc -amdgpu-scalarize-global-loads=false  -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN:  llc -amdgpu-scalarize-global-loads=false  -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN:  llc -amdgpu-scalarize-global-loads=false  -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=R600 -check-prefix=FUNC %s
  
  ; FUNC-LABEL: {{^}}fneg_fabs_fadd_f32:
  ; SI-NOT: and
diff --git a/test/CodeGen/AMDGPU/fneg.f16.ll b/test/CodeGen/AMDGPU/fneg.f16.ll

index 2d94726cbe204fa0b9bd5b6f4eaef685673f41d9..25ed2086732a0540238b361913ddb7f8e7b5dd18 100644 (file)
--- a/test/CodeGen/AMDGPU/fneg.f16.ll
+++ b/test/CodeGen/AMDGPU/fneg.f16.ll
@@ -1,6 +1,6 @@
-; RUN: llc -march=amdgcn -mcpu=kaveri -mtriple=amdgcn--amdhsa -verify-machineinstrs < %s | FileCheck -check-prefix=CI -check-prefix=CIVI -check-prefix=GCN %s
-; RUN: llc -march=amdgcn -mcpu=tonga -mtriple=amdgcn--amdhsa -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=CIVI -check-prefix=GCN -check-prefix=GFX89 %s
-; RUN: llc -march=amdgcn -mcpu=gfx901 -mtriple=amdgcn--amdhsa -verify-machineinstrs < %s | FileCheck -check-prefix=GFX9 -check-prefix=GCN -check-prefix=GFX89 %s
+; RUN:  llc -amdgpu-scalarize-global-loads=false  -march=amdgcn -mcpu=kaveri -mtriple=amdgcn--amdhsa -verify-machineinstrs < %s | FileCheck -check-prefix=CI -check-prefix=CIVI -check-prefix=GCN %s
+; RUN:  llc -amdgpu-scalarize-global-loads=false  -march=amdgcn -mcpu=tonga -mtriple=amdgcn--amdhsa -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=CIVI -check-prefix=GCN -check-prefix=GFX89 %s
+; RUN:  llc -amdgpu-scalarize-global-loads=false  -march=amdgcn -mcpu=gfx901 -mtriple=amdgcn--amdhsa -verify-machineinstrs < %s | FileCheck -check-prefix=GFX9 -check-prefix=GCN -check-prefix=GFX89 %s
  
  ; FIXME: Should be able to do scalar op
  ; GCN-LABEL: {{^}}s_fneg_f16:
diff --git a/test/CodeGen/AMDGPU/fp32_to_fp16.ll b/test/CodeGen/AMDGPU/fp32_to_fp16.ll

index 2c6b1cb18f7e64e414e82c9509e5ec0d38acc92b..579a1454dd9ae326cb0e30f219b28ed4ebedc706 100644 (file)
--- a/test/CodeGen/AMDGPU/fp32_to_fp16.ll
+++ b/test/CodeGen/AMDGPU/fp32_to_fp16.ll
@@ -1,6 +1,6 @@
-; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC %s
-; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC %s
-; RUN: llc -march=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
+; RUN:  llc -amdgpu-scalarize-global-loads=false  -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC %s
+; RUN:  llc -amdgpu-scalarize-global-loads=false  -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC %s
+; RUN:  llc -amdgpu-scalarize-global-loads=false  -march=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
  
  declare i16 @llvm.convert.to.fp16.f32(float) nounwind readnone
  
diff --git a/test/CodeGen/AMDGPU/fpext.f16.ll b/test/CodeGen/AMDGPU/fpext.f16.ll

index 15cc73b9ee53ea9e650f952bf14d4ac2222ab04f..c483feae3af7a6b972db50c09c9d39b26b05029f 100644 (file)
--- a/test/CodeGen/AMDGPU/fpext.f16.ll
+++ b/test/CodeGen/AMDGPU/fpext.f16.ll
@@ -1,6 +1,6 @@
-; RUN: llc -march=amdgcn -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=SI -check-prefix=SIVI %s
-; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=GFX89 %s
-; RUN: llc -march=amdgcn -mcpu=gfx901 -mattr=-flat-for-global -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 -check-prefix=GFX89 %s
+; RUN:  llc -amdgpu-scalarize-global-loads=false  -march=amdgcn -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=SI -check-prefix=SIVI %s
+; RUN:  llc -amdgpu-scalarize-global-loads=false  -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=GFX89 %s
+; RUN:  llc -amdgpu-scalarize-global-loads=false  -march=amdgcn -mcpu=gfx901 -mattr=-flat-for-global -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 -check-prefix=GFX89 %s
  
  ; GCN-LABEL: {{^}}fpext_f16_to_f32
  ; GCN: buffer_load_ushort v[[A_F16:[0-9]+]]
diff --git a/test/CodeGen/AMDGPU/fptosi.f16.ll b/test/CodeGen/AMDGPU/fptosi.f16.ll

index f310618d8bdb674ee6b1cfce512ef16704228f00..6f23df1eff9a0a0efea58747b4c54eb6d55c40c4 100644 (file)
--- a/test/CodeGen/AMDGPU/fptosi.f16.ll
+++ b/test/CodeGen/AMDGPU/fptosi.f16.ll
@@ -1,5 +1,5 @@
-; RUN: llc -march=amdgcn -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
-; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
+; RUN:  llc -amdgpu-scalarize-global-loads=false  -march=amdgcn -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
+; RUN:  llc -amdgpu-scalarize-global-loads=false  -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
  
  ; GCN-LABEL: {{^}}fptosi_f16_to_i16
  ; GCN: buffer_load_ushort v[[A_F16:[0-9]+]]
diff --git a/test/CodeGen/AMDGPU/fptoui.f16.ll b/test/CodeGen/AMDGPU/fptoui.f16.ll

index 7641c08e33c367b9eb07f5c479068b9c885da981..335e41f54c6933c91459f4283ba35d01e5a715e1 100644 (file)
--- a/test/CodeGen/AMDGPU/fptoui.f16.ll
+++ b/test/CodeGen/AMDGPU/fptoui.f16.ll
@@ -1,5 +1,5 @@
-; RUN: llc -march=amdgcn -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
-; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
+; RUN:  llc -amdgpu-scalarize-global-loads=false  -march=amdgcn -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
+; RUN:  llc -amdgpu-scalarize-global-loads=false  -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
  
  ; GCN-LABEL: {{^}}fptoui_f16_to_i16
  ; GCN: buffer_load_ushort v[[A_F16:[0-9]+]]
diff --git a/test/CodeGen/AMDGPU/fptrunc.f16.ll b/test/CodeGen/AMDGPU/fptrunc.f16.ll

index bc72f4424c98f04a52eeb57cea308a2987940c18..32ef5edb35936150115efc32207dacbbc08ab482 100644 (file)
--- a/test/CodeGen/AMDGPU/fptrunc.f16.ll
+++ b/test/CodeGen/AMDGPU/fptrunc.f16.ll
@@ -1,6 +1,6 @@
-; RUN: llc -march=amdgcn -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=SI -check-prefix=SIVI %s
-; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=SIVI %s
-; RUN: llc -march=amdgcn -mcpu=gfx901 -mattr=-flat-for-global,-fp64-fp16-denormals -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 %s
+; RUN:  llc -amdgpu-scalarize-global-loads=false  -march=amdgcn -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=SI -check-prefix=SIVI %s
+; RUN:  llc -amdgpu-scalarize-global-loads=false  -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=SIVI %s
+; RUN:  llc -amdgpu-scalarize-global-loads=false  -march=amdgcn -mcpu=gfx901 -mattr=-flat-for-global,-fp64-fp16-denormals -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 %s
  
  ; GCN-LABEL: {{^}}fptrunc_f32_to_f16:
  ; GCN: buffer_load_dword v[[A_F32:[0-9]+]]
diff --git a/test/CodeGen/AMDGPU/fract.f64.ll b/test/CodeGen/AMDGPU/fract.f64.ll

index 9a56cbe983cdd4198b6a7c74509acc96f3ce41a1..1314dfe3c7cab0a0b256bcf9af2315176ba673a7 100644 (file)
--- a/test/CodeGen/AMDGPU/fract.f64.ll
+++ b/test/CodeGen/AMDGPU/fract.f64.ll
@@ -1,9 +1,9 @@
-; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI -check-prefix=FUNC %s
-; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=CI -check-prefix=FUNC %s
-; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=CI -check-prefix=FUNC %s
+; RUN:  llc -amdgpu-scalarize-global-loads=false  -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI -check-prefix=FUNC %s
+; RUN:  llc -amdgpu-scalarize-global-loads=false  -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=CI -check-prefix=FUNC %s
+; RUN:  llc -amdgpu-scalarize-global-loads=false  -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=CI -check-prefix=FUNC %s
  
-; RUN: llc -march=amdgcn -enable-unsafe-fp-math -verify-machineinstrs < %s | FileCheck -check-prefix=GCN-UNSAFE -check-prefix=SI-UNSAFE -check-prefix=FUNC %s
-; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -enable-unsafe-fp-math -verify-machineinstrs < %s | FileCheck -check-prefix=GCN-UNSAFE -check-prefix=VI-UNSAFE -check-prefix=FUNC %s
+; RUN:  llc -amdgpu-scalarize-global-loads=false  -march=amdgcn -enable-unsafe-fp-math -verify-machineinstrs < %s | FileCheck -check-prefix=GCN-UNSAFE -check-prefix=SI-UNSAFE -check-prefix=FUNC %s
+; RUN:  llc -amdgpu-scalarize-global-loads=false  -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -enable-unsafe-fp-math -verify-machineinstrs < %s | FileCheck -check-prefix=GCN-UNSAFE -check-prefix=VI-UNSAFE -check-prefix=FUNC %s
  
  declare double @llvm.fabs.f64(double) #0
  declare double @llvm.floor.f64(double) #0
diff --git a/test/CodeGen/AMDGPU/fract.ll b/test/CodeGen/AMDGPU/fract.ll

index 207fe280c9a69d8e407920f78e1c686bcae6b9e6..023316527f8f5370d938f35033bc01c424d7aa39 100644 (file)
--- a/test/CodeGen/AMDGPU/fract.ll
+++ b/test/CodeGen/AMDGPU/fract.ll
@@ -1,8 +1,8 @@
-; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN-SAFE -check-prefix=GCN -check-prefix=SI %s
-; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=GCN-SAFE -check-prefix=GCN -check-prefix=CI %s
-; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN-SAFE -check-prefix=GCN -check-prefix=FUNC %s
-; RUN: llc -march=amdgcn -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN-UNSAFE -check-prefix=GCN %s
-; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN-UNSAFE -check-prefix=GCN %s
+; RUN:  llc -amdgpu-scalarize-global-loads=false  -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN-SAFE -check-prefix=GCN -check-prefix=SI %s
+; RUN:  llc -amdgpu-scalarize-global-loads=false  -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=GCN-SAFE -check-prefix=GCN -check-prefix=CI %s
+; RUN:  llc -amdgpu-scalarize-global-loads=false  -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN-SAFE -check-prefix=GCN -check-prefix=FUNC %s
+; RUN:  llc -amdgpu-scalarize-global-loads=false  -march=amdgcn -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN-UNSAFE -check-prefix=GCN %s
+; RUN:  llc -amdgpu-scalarize-global-loads=false  -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN-UNSAFE -check-prefix=GCN %s
  
  declare float @llvm.fabs.f32(float) #0
  declare float @llvm.floor.f32(float) #0
diff --git a/test/CodeGen/AMDGPU/frem.ll b/test/CodeGen/AMDGPU/frem.ll

index 9778069d0477be7597b199a6d53724b96146fb10..bf6312949f79f5d2855366e36418a6075f284fef 100644 (file)
--- a/test/CodeGen/AMDGPU/frem.ll
+++ b/test/CodeGen/AMDGPU/frem.ll
@@ -1,6 +1,6 @@
-; RUN: llc -march=amdgcn -verify-machineinstrs  < %s | FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=FUNC %s
-; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=CI -check-prefix=GCN -check-prefix=FUNC %s
-; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=CI -check-prefix=GCN -check-prefix=FUNC %s
+; RUN:  llc -amdgpu-scalarize-global-loads=false  -march=amdgcn -verify-machineinstrs  < %s | FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=FUNC %s
+; RUN:  llc -amdgpu-scalarize-global-loads=false  -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=CI -check-prefix=GCN -check-prefix=FUNC %s
+; RUN:  llc -amdgpu-scalarize-global-loads=false  -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=CI -check-prefix=GCN -check-prefix=FUNC %s
  
  ; FUNC-LABEL: {{^}}frem_f32:
  ; GCN-DAG: buffer_load_dword [[X:v[0-9]+]], {{.*$}}
diff --git a/test/CodeGen/AMDGPU/fsqrt.f64.ll b/test/CodeGen/AMDGPU/fsqrt.f64.ll

index 453d8fb37f2f480d781112e4320c46a72b8225b2..186757e4c5d8445daf16f0c6e737778e9c893c5a 100644 (file)
--- a/test/CodeGen/AMDGPU/fsqrt.f64.ll
+++ b/test/CodeGen/AMDGPU/fsqrt.f64.ll
@@ -1,5 +1,5 @@
-; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC %s
-; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC %s
+; RUN:  llc -amdgpu-scalarize-global-loads=false  -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC %s
+; RUN:  llc -amdgpu-scalarize-global-loads=false  -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC %s
  
  ; FUNC-LABEL: {{^}}v_safe_fsqrt_f64:
  ; GCN: v_sqrt_f64_e32 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\]}}
diff --git a/test/CodeGen/AMDGPU/fsqrt.ll b/test/CodeGen/AMDGPU/fsqrt.ll

index a0fd3411ca05c03974e90e94ce734bda68345094..6bd9a0db14f660ff0c62462a1ca225af729a9f48 100644 (file)
--- a/test/CodeGen/AMDGPU/fsqrt.ll
+++ b/test/CodeGen/AMDGPU/fsqrt.ll
@@ -1,6 +1,6 @@
-; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC %s
-; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC %s
-; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=R600 -check-prefix=FUNC %s
+; RUN:  llc -amdgpu-scalarize-global-loads=false  -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC %s
+; RUN:  llc -amdgpu-scalarize-global-loads=false  -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC %s
+; RUN:  llc -amdgpu-scalarize-global-loads=false  -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=R600 -check-prefix=FUNC %s
  
  
  ; Run with unsafe-fp-math to make sure nothing tries to turn this into 1 / rsqrt(x)
diff --git a/test/CodeGen/AMDGPU/fsub.f16.ll b/test/CodeGen/AMDGPU/fsub.f16.ll

index fa00c06546dbdcb4debde5d1a6c7f52649187bd5..4731b0600845eeaf103cc7e6c5aa1b8dc3ba115a 100644 (file)
--- a/test/CodeGen/AMDGPU/fsub.f16.ll
+++ b/test/CodeGen/AMDGPU/fsub.f16.ll
@@ -1,6 +1,6 @@
-; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI -check-prefix=SIVI %s
-; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GFX89 -check-prefix=VI -check-prefix=SIVI %s
-; RUN: llc -march=amdgcn -mcpu=gfx901 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GFX89 -check-prefix=GFX9 %s
+; RUN:  llc -amdgpu-scalarize-global-loads=false  -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI -check-prefix=SIVI %s
+; RUN:  llc -amdgpu-scalarize-global-loads=false  -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GFX89 -check-prefix=VI -check-prefix=SIVI %s
+; RUN:  llc -amdgpu-scalarize-global-loads=false  -march=amdgcn -mcpu=gfx901 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GFX89 -check-prefix=GFX9 %s
  
  ; GCN-LABEL: {{^}}fsub_f16:
  ; GCN: buffer_load_ushort v[[A_F16:[0-9]+]]
diff --git a/test/CodeGen/AMDGPU/fsub.ll b/test/CodeGen/AMDGPU/fsub.ll

index e7a92d95d48593e879bb2ece4c95835057153bcf..153b9ade77b8a60ccfdf3ccaee0c469f23b46a13 100644 (file)
--- a/test/CodeGen/AMDGPU/fsub.ll
+++ b/test/CodeGen/AMDGPU/fsub.ll
@@ -1,6 +1,6 @@
-; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
-; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
-; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=R600 -check-prefix=FUNC %s
+; RUN:  llc -amdgpu-scalarize-global-loads=false  -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN:  llc -amdgpu-scalarize-global-loads=false  -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN:  llc -amdgpu-scalarize-global-loads=false  -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=R600 -check-prefix=FUNC %s
  
  ; FUNC-LABEL: {{^}}v_fsub_f32:
  ; SI: v_subrev_f32_e32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}
diff --git a/test/CodeGen/AMDGPU/fsub64.ll b/test/CodeGen/AMDGPU/fsub64.ll

index dc332414a1527e742364268566b4a21679127f25..73f1a69eeb9d62b93b913b95893dc7287bd17c11 100644 (file)
--- a/test/CodeGen/AMDGPU/fsub64.ll
+++ b/test/CodeGen/AMDGPU/fsub64.ll
@@ -1,5 +1,5 @@
-; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
-; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
+; RUN:  llc -amdgpu-scalarize-global-loads=false  -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
+; RUN:  llc -amdgpu-scalarize-global-loads=false  -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
  
  declare double @llvm.fabs.f64(double) #0
  
diff --git a/test/CodeGen/AMDGPU/ftrunc.f64.ll b/test/CodeGen/AMDGPU/ftrunc.f64.ll

index 1f72ec65588ea0faf04ece9e6912edc98da722d8..bb2a6ba8e34832132068d380e4e8483e489bed4d 100644 (file)
--- a/test/CodeGen/AMDGPU/ftrunc.f64.ll
+++ b/test/CodeGen/AMDGPU/ftrunc.f64.ll
@@ -1,6 +1,6 @@
-; RUN: llc -march=amdgcn -mcpu=SI < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
-; RUN: llc -march=amdgcn -mcpu=bonaire < %s | FileCheck -check-prefix=CI -check-prefix=FUNC %s
-; RUN: llc -march=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=CI -check-prefix=FUNC %s
+; RUN:  llc -amdgpu-scalarize-global-loads=false  -march=amdgcn -mcpu=SI < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN:  llc -amdgpu-scalarize-global-loads=false  -march=amdgcn -mcpu=bonaire < %s | FileCheck -check-prefix=CI -check-prefix=FUNC %s
+; RUN:  llc -amdgpu-scalarize-global-loads=false  -march=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=CI -check-prefix=FUNC %s
  
  declare double @llvm.trunc.f64(double) nounwind readnone
  declare <2 x double> @llvm.trunc.v2f64(<2 x double>) nounwind readnone
diff --git a/test/CodeGen/AMDGPU/global-extload-i16.ll b/test/CodeGen/AMDGPU/global-extload-i16.ll

index 19e592f50beaf0d69b6342cf10ab98f30cd81d63..4e50f995d27e748d9e1c073d655b13fd99a5fdc9 100644 (file)
--- a/test/CodeGen/AMDGPU/global-extload-i16.ll
+++ b/test/CodeGen/AMDGPU/global-extload-i16.ll
@@ -1,5 +1,5 @@
-; RUN: llc -march=amdgcn -verify-machineinstrs< %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
-; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs< %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN:  llc -amdgpu-scalarize-global-loads=false  -march=amdgcn -verify-machineinstrs< %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN:  llc -amdgpu-scalarize-global-loads=false  -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs< %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
  ; XUN: llc -march=r600 -mcpu=cypress < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
  ; FIXME: cypress is broken because the bigger testcases spill and it's not implemented
  
diff --git a/test/CodeGen/AMDGPU/half.ll b/test/CodeGen/AMDGPU/half.ll

index 41ae5a4a0b00b79460ab88d62dece4842ded6cc4..64a495e0974e6bff291a3e5b9300d8e1fcabb920 100644 (file)
--- a/test/CodeGen/AMDGPU/half.ll
+++ b/test/CodeGen/AMDGPU/half.ll
@@ -1,5 +1,5 @@
-; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
-; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
+; RUN:  llc -amdgpu-scalarize-global-loads=false  -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
+; RUN:  llc -amdgpu-scalarize-global-loads=false  -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
  
  ; half args should be promoted to float for SI and lower.
  
diff --git a/test/CodeGen/AMDGPU/imm.ll b/test/CodeGen/AMDGPU/imm.ll

index c2668a077b0987fa90e4a5ceb6f67ae9b1ad8968..8cda01a10f76508b413d03951a4ba0fa9f2d1e6b 100644 (file)
--- a/test/CodeGen/AMDGPU/imm.ll
+++ b/test/CodeGen/AMDGPU/imm.ll
@@ -1,5 +1,5 @@
-; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
-; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
+; RUN:  llc -amdgpu-scalarize-global-loads=false  -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
+; RUN:  llc -amdgpu-scalarize-global-loads=false  -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
  
  ; Use a 64-bit value with lo bits that can be represented as an inline constant
  ; GCN-LABEL: {{^}}i64_imm_inline_lo:
diff --git a/test/CodeGen/AMDGPU/immv216.ll b/test/CodeGen/AMDGPU/immv216.ll

index cd3502baee7bef7444cb5ba35703762fe813e3d5..b29e93d13b8e59632f78cc04b808fe05ce61cae5 100644 (file)
--- a/test/CodeGen/AMDGPU/immv216.ll
+++ b/test/CodeGen/AMDGPU/immv216.ll
@@ -1,6 +1,6 @@
-; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx901 -mattr=-flat-for-global -verify-machineinstrs -enable-packed-inlinable-literals < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 %s
-; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
-; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=kaveri -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=CI %s
+; RUN:  llc -amdgpu-scalarize-global-loads=false  -mtriple=amdgcn--amdhsa -mcpu=gfx901 -mattr=-flat-for-global -verify-machineinstrs -enable-packed-inlinable-literals < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 %s
+; RUN:  llc -amdgpu-scalarize-global-loads=false  -mtriple=amdgcn--amdhsa -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
+; RUN:  llc -amdgpu-scalarize-global-loads=false  -mtriple=amdgcn--amdhsa -mcpu=kaveri -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=CI %s
  ; FIXME: Merge into imm.ll
  
  ; GCN-LABEL: {{^}}store_inline_imm_neg_0.0_v2i16:
diff --git a/test/CodeGen/AMDGPU/indirect-addressing-si.ll b/test/CodeGen/AMDGPU/indirect-addressing-si.ll

index 0d20c32a4770c35c07fc16866e67fcc4e6238bfb..62200b988bea3ab1c236a2fd3b3249e494913f94 100644 (file)
--- a/test/CodeGen/AMDGPU/indirect-addressing-si.ll
+++ b/test/CodeGen/AMDGPU/indirect-addressing-si.ll
@@ -1,7 +1,7 @@
-; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=MOVREL %s
-; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=MOVREL %s
-; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -amdgpu-vgpr-index-mode -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=IDXMODE %s
-; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=IDXMODE %s
+; RUN:  llc -amdgpu-scalarize-global-loads=false  -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=MOVREL %s
+; RUN:  llc -amdgpu-scalarize-global-loads=false  -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=MOVREL %s
+; RUN:  llc -amdgpu-scalarize-global-loads=false  -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -amdgpu-vgpr-index-mode -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=IDXMODE %s
+; RUN:  llc -amdgpu-scalarize-global-loads=false  -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=IDXMODE %s
  
  ; Tests for indirect addressing on SI, which is implemented using dynamic
  ; indexing of vectors.
diff --git a/test/CodeGen/AMDGPU/invariant-load-no-alias-store.ll b/test/CodeGen/AMDGPU/invariant-load-no-alias-store.ll

index 5cd965d2fa9c3bae6be93525569c5cf37be0223c..eea26192ed32229c77001469a28b97f73f6eb891 100644 (file)
--- a/test/CodeGen/AMDGPU/invariant-load-no-alias-store.ll
+++ b/test/CodeGen/AMDGPU/invariant-load-no-alias-store.ll
@@ -1,4 +1,4 @@
-; RUN: llc -march=amdgcn -amdgpu-load-store-vectorizer=0 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+; RUN:  llc -amdgpu-scalarize-global-loads=false  -march=amdgcn -amdgpu-load-store-vectorizer=0 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
  
  ; GatherAllAliases gives up on trying to analyze cases where the
  ; pointer may have been loaded from an aliased store, so make sure
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.class.f16.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.class.f16.ll

index f08d4b6c791567e19e31245a5e8c376a187944c8..06dc2cc8b90e1aa3f6f6ea728761b5717526ab4b 100644 (file)
--- a/test/CodeGen/AMDGPU/llvm.amdgcn.class.f16.ll
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.class.f16.ll
@@ -1,4 +1,4 @@
-; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
+; RUN:  llc -amdgpu-scalarize-global-loads=false  -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
  
  declare half @llvm.fabs.f16(half %a)
  declare i1 @llvm.amdgcn.class.f16(half %a, i32 %b)
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.class.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.class.ll

index 1fcdac537fba6ff3a6b0f9932ab992bfb7fb0003..f71b9752e9a10ea4ac23bcba83b7c262d86fb91b 100644 (file)
--- a/test/CodeGen/AMDGPU/llvm.amdgcn.class.ll
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.class.ll
@@ -1,4 +1,4 @@
-; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
+; RUN:  llc -amdgpu-scalarize-global-loads=false  -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
  
  declare i1 @llvm.amdgcn.class.f32(float, i32) #1
  declare i1 @llvm.amdgcn.class.f64(double, i32) #1
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.div.fmas.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.div.fmas.ll

index 2cc63ae74bf10c4b0603c03627be636c12790fcd..1b3e09a81e5a02248196ed0b74bc5b691ea1c8d5 100644 (file)
--- a/test/CodeGen/AMDGPU/llvm.amdgcn.div.fmas.ll
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.div.fmas.ll
@@ -1,4 +1,4 @@
-; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -strict-whitespace -check-prefix=GCN -check-prefix=SI %s
+; RUN:  llc -amdgpu-scalarize-global-loads=false  -march=amdgcn -verify-machineinstrs < %s | FileCheck -strict-whitespace -check-prefix=GCN -check-prefix=SI %s
  ; XUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -strict-whitespace -check-prefix=GCN -check-prefix=VI %s
  
  ; FIXME: Enable for VI.
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.ldexp.f16.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.ldexp.f16.ll

index fe211d356070c9a2b8b7f3ddc124c80298f180e9..7068f45590551410f4bc36bf61c6cf93dd14a096 100644 (file)
--- a/test/CodeGen/AMDGPU/llvm.amdgcn.ldexp.f16.ll
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.ldexp.f16.ll
@@ -1,4 +1,4 @@
-; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
+; RUN:  llc -amdgpu-scalarize-global-loads=false  -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
  
  declare half @llvm.amdgcn.ldexp.f16(half %a, i32 %b)
  
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.sbfe.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.sbfe.ll

index 593c95856811eb07d0ecb11c7d46f80f2ecc1901..871b8c4f99b998a30e610b4e68935d199f70f255 100644 (file)
--- a/test/CodeGen/AMDGPU/llvm.amdgcn.sbfe.ll
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.sbfe.ll
@@ -1,5 +1,5 @@
-; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
-; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
+; RUN:  llc -amdgpu-scalarize-global-loads=false  -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
+; RUN:  llc -amdgpu-scalarize-global-loads=false  -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
  
  ; GCN-LABEL: {{^}}bfe_i32_arg_arg_arg:
  ; GCN: v_bfe_i32
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.sffbh.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.sffbh.ll

index 495e36b09f8faee6300f37a0a4e253c05785400a..39370e41e8aa9953da65d52ee011dc61b79a84f1 100644 (file)
--- a/test/CodeGen/AMDGPU/llvm.amdgcn.sffbh.ll
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.sffbh.ll
@@ -1,5 +1,5 @@
-; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
-; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+; RUN:  llc -amdgpu-scalarize-global-loads=false  -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+; RUN:  llc -amdgpu-scalarize-global-loads=false  -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
  
  declare i32 @llvm.amdgcn.sffbh.i32(i32) #1
  
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.trig.preop.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.trig.preop.ll

index e0cec2134e70c49856cb7149488fdf7f17f7b73a..8468aa3a7b3ef33d8dd53a2199bd87cb876940bb 100644 (file)
--- a/test/CodeGen/AMDGPU/llvm.amdgcn.trig.preop.ll
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.trig.preop.ll
@@ -1,5 +1,5 @@
-; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
-; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
+; RUN:  llc -amdgpu-scalarize-global-loads=false  -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
+; RUN:  llc -amdgpu-scalarize-global-loads=false  -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
  
  declare double @llvm.amdgcn.trig.preop.f64(double, i32) nounwind readnone
  
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.ubfe.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.ubfe.ll

index 92e3a1099da0a92881a866d73ec4d2c305b2d01c..68fd08f778c43fb3977dea9a5cb54adbb0483339 100644 (file)
--- a/test/CodeGen/AMDGPU/llvm.amdgcn.ubfe.ll
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.ubfe.ll
@@ -1,5 +1,5 @@
-; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
-; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
+; RUN:  llc -amdgpu-scalarize-global-loads=false  -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
+; RUN:  llc -amdgpu-scalarize-global-loads=false  -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
  
  ; GCN-LABEL: {{^}}bfe_u32_arg_arg_arg:
  ; GCN: v_bfe_u32
diff --git a/test/CodeGen/AMDGPU/llvm.ceil.f16.ll b/test/CodeGen/AMDGPU/llvm.ceil.f16.ll

index 0604a49372a2b6585a06c732a85f06bcb3968d1e..e6dacfab565b0b775aa2f9e6dd9e4c11fc2a7060 100644 (file)
--- a/test/CodeGen/AMDGPU/llvm.ceil.f16.ll
+++ b/test/CodeGen/AMDGPU/llvm.ceil.f16.ll
@@ -1,5 +1,5 @@
-; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
-; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
+; RUN:  llc -amdgpu-scalarize-global-loads=false  -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
+; RUN:  llc -amdgpu-scalarize-global-loads=false  -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
  
  declare half @llvm.ceil.f16(half %a)
  declare <2 x half> @llvm.ceil.v2f16(<2 x half> %a)
diff --git a/test/CodeGen/AMDGPU/llvm.cos.f16.ll b/test/CodeGen/AMDGPU/llvm.cos.f16.ll

index d836ea36ef6320e94e67d8e202652f88614e33b1..0f4f6d9e45e7fc19e581ea5de4f1ec10aa3a3b79 100644 (file)
--- a/test/CodeGen/AMDGPU/llvm.cos.f16.ll
+++ b/test/CodeGen/AMDGPU/llvm.cos.f16.ll
@@ -1,5 +1,5 @@
-; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
-; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
+; RUN:  llc -amdgpu-scalarize-global-loads=false  -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
+; RUN:  llc -amdgpu-scalarize-global-loads=false  -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
  
  declare half @llvm.cos.f16(half %a)
  declare <2 x half> @llvm.cos.v2f16(<2 x half> %a)
diff --git a/test/CodeGen/AMDGPU/llvm.exp2.f16.ll b/test/CodeGen/AMDGPU/llvm.exp2.f16.ll

index 5757142b9e9544c78aee6bc61449c6db505c4add..8a20b9af658ff217e711a3661cdaeb6d22b0d21d 100644 (file)
--- a/test/CodeGen/AMDGPU/llvm.exp2.f16.ll
+++ b/test/CodeGen/AMDGPU/llvm.exp2.f16.ll
@@ -1,5 +1,5 @@
-; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
-; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
+; RUN:  llc -amdgpu-scalarize-global-loads=false  -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
+; RUN:  llc -amdgpu-scalarize-global-loads=false  -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
  
  declare half @llvm.exp2.f16(half %a)
  declare <2 x half> @llvm.exp2.v2f16(<2 x half> %a)
diff --git a/test/CodeGen/AMDGPU/llvm.floor.f16.ll b/test/CodeGen/AMDGPU/llvm.floor.f16.ll

index 6a18141d8035e23aace6901a4381dcb11591fe2e..a551f67d51d4185083fdaef654bcf38d9b140692 100644 (file)
--- a/test/CodeGen/AMDGPU/llvm.floor.f16.ll
+++ b/test/CodeGen/AMDGPU/llvm.floor.f16.ll
@@ -1,5 +1,5 @@
-; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
-; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
+; RUN:  llc -amdgpu-scalarize-global-loads=false  -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
+; RUN:  llc -amdgpu-scalarize-global-loads=false  -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
  
  declare half @llvm.floor.f16(half %a)
  declare <2 x half> @llvm.floor.v2f16(<2 x half> %a)
diff --git a/test/CodeGen/AMDGPU/llvm.fma.f16.ll b/test/CodeGen/AMDGPU/llvm.fma.f16.ll

index 3f4fba7d8ead02180825c0248239d48d64fff2e0..da0841d910d1f16ad2bcb53030090f2dacc35659 100644 (file)
--- a/test/CodeGen/AMDGPU/llvm.fma.f16.ll
+++ b/test/CodeGen/AMDGPU/llvm.fma.f16.ll
@@ -1,5 +1,5 @@
-; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
-; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
+; RUN:  llc -amdgpu-scalarize-global-loads=false  -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
+; RUN:  llc -amdgpu-scalarize-global-loads=false  -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
  
  declare half @llvm.fma.f16(half %a, half %b, half %c)
  declare <2 x half> @llvm.fma.v2f16(<2 x half> %a, <2 x half> %b, <2 x half> %c)
diff --git a/test/CodeGen/AMDGPU/llvm.fmuladd.f16.ll b/test/CodeGen/AMDGPU/llvm.fmuladd.f16.ll

index 806723e5136ca618fad08e6260baa019a4a8645a..72cd834dd692cb043871103a704c15a961f374a1 100644 (file)
--- a/test/CodeGen/AMDGPU/llvm.fmuladd.f16.ll
+++ b/test/CodeGen/AMDGPU/llvm.fmuladd.f16.ll
@@ -1,7 +1,7 @@
-; RUN: llc -march=amdgcn -mattr=-fp64-fp16-denormals -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI -check-prefix=SI-FLUSH %s
-; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-fp64-fp16-denormals,-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=VI-FLUSH %s
-; RUN: llc -march=amdgcn -mattr=+fp64-fp16-denormals -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI -check-prefix=SI-DENORM %s
-; RUN: llc -march=amdgcn -mcpu=fiji -mattr=+fp64-fp16-denormals,-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=VI-DENORM %s
+; RUN:  llc -amdgpu-scalarize-global-loads=false  -march=amdgcn -mattr=-fp64-fp16-denormals -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI -check-prefix=SI-FLUSH %s
+; RUN:  llc -amdgpu-scalarize-global-loads=false  -march=amdgcn -mcpu=fiji -mattr=-fp64-fp16-denormals,-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=VI-FLUSH %s
+; RUN:  llc -amdgpu-scalarize-global-loads=false  -march=amdgcn -mattr=+fp64-fp16-denormals -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI -check-prefix=SI-DENORM %s
+; RUN:  llc -amdgpu-scalarize-global-loads=false  -march=amdgcn -mcpu=fiji -mattr=+fp64-fp16-denormals,-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=VI-DENORM %s
  
  declare half @llvm.fmuladd.f16(half %a, half %b, half %c)
  declare <2 x half> @llvm.fmuladd.v2f16(<2 x half> %a, <2 x half> %b, <2 x half> %c)
diff --git a/test/CodeGen/AMDGPU/llvm.log2.f16.ll b/test/CodeGen/AMDGPU/llvm.log2.f16.ll

index 773eb55283e44d46f46c54566f4ef13b1fb2b932..7e5553ec53b0d02b179c95e561226506e2443dca 100644 (file)
--- a/test/CodeGen/AMDGPU/llvm.log2.f16.ll
+++ b/test/CodeGen/AMDGPU/llvm.log2.f16.ll
@@ -1,5 +1,5 @@
-; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
-; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
+; RUN:  llc -amdgpu-scalarize-global-loads=false  -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
+; RUN:  llc -amdgpu-scalarize-global-loads=false  -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
  
  declare half @llvm.log2.f16(half %a)
  declare <2 x half> @llvm.log2.v2f16(<2 x half> %a)
diff --git a/test/CodeGen/AMDGPU/llvm.maxnum.f16.ll b/test/CodeGen/AMDGPU/llvm.maxnum.f16.ll

index 8f4b314ffabb2b949b722e12aa20f29aeaf46880..393ee218eb9e5d1ec7e0c0475737dc090e38a7b2 100644 (file)
--- a/test/CodeGen/AMDGPU/llvm.maxnum.f16.ll
+++ b/test/CodeGen/AMDGPU/llvm.maxnum.f16.ll
@@ -1,5 +1,5 @@
-; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
-; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
+; RUN:  llc -amdgpu-scalarize-global-loads=false  -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
+; RUN:  llc -amdgpu-scalarize-global-loads=false  -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
  
  declare half @llvm.maxnum.f16(half %a, half %b)
  declare <2 x half> @llvm.maxnum.v2f16(<2 x half> %a, <2 x half> %b)
diff --git a/test/CodeGen/AMDGPU/llvm.minnum.f16.ll b/test/CodeGen/AMDGPU/llvm.minnum.f16.ll

index 1a86286f7136cffd37939bc547ef0ea07fb6acc9..afd05ecabfa5eca8392964d02f179a9878f02af3 100644 (file)
--- a/test/CodeGen/AMDGPU/llvm.minnum.f16.ll
+++ b/test/CodeGen/AMDGPU/llvm.minnum.f16.ll
@@ -1,5 +1,5 @@
-; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
-; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
+; RUN:  llc -amdgpu-scalarize-global-loads=false  -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
+; RUN:  llc -amdgpu-scalarize-global-loads=false  -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
  
  declare half @llvm.minnum.f16(half %a, half %b)
  declare <2 x half> @llvm.minnum.v2f16(<2 x half> %a, <2 x half> %b)
diff --git a/test/CodeGen/AMDGPU/llvm.rint.f16.ll b/test/CodeGen/AMDGPU/llvm.rint.f16.ll

index 30cb969a76e5afd3f1f6539f913801a47a34c070..07470630cc02278acf6c46ed323266b4110f3ff9 100644 (file)
--- a/test/CodeGen/AMDGPU/llvm.rint.f16.ll
+++ b/test/CodeGen/AMDGPU/llvm.rint.f16.ll
@@ -1,6 +1,6 @@
-; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI -check-prefix=SIVI %s
-; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SIVI -check-prefix=VI -check-prefix=GFX89 %s
-; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GFX89 -check-prefix=GFX9 %s
+; RUN:  llc -amdgpu-scalarize-global-loads=false  -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI -check-prefix=SIVI %s
+; RUN:  llc -amdgpu-scalarize-global-loads=false  -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SIVI -check-prefix=VI -check-prefix=GFX89 %s
+; RUN:  llc -amdgpu-scalarize-global-loads=false  -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GFX89 -check-prefix=GFX9 %s
  
  declare half @llvm.rint.f16(half %a)
  declare <2 x half> @llvm.rint.v2f16(<2 x half> %a)
diff --git a/test/CodeGen/AMDGPU/llvm.sin.f16.ll b/test/CodeGen/AMDGPU/llvm.sin.f16.ll

index eb1f32c981f88eae786d7a5c203e382812310f4c..d768f7f9561eac1d7de10b3b662e03c1826b9717 100644 (file)
--- a/test/CodeGen/AMDGPU/llvm.sin.f16.ll
+++ b/test/CodeGen/AMDGPU/llvm.sin.f16.ll
@@ -1,5 +1,5 @@
-; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
-; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
+; RUN:  llc -amdgpu-scalarize-global-loads=false  -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
+; RUN:  llc -amdgpu-scalarize-global-loads=false  -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
  
  declare half @llvm.sin.f16(half %a)
  declare <2 x half> @llvm.sin.v2f16(<2 x half> %a)
diff --git a/test/CodeGen/AMDGPU/llvm.sqrt.f16.ll b/test/CodeGen/AMDGPU/llvm.sqrt.f16.ll

index 46ee6526aca2f09505eacf02f17be5981a39536b..f7da626f683f23b6533b343b62bbd14e96dcc189 100644 (file)
--- a/test/CodeGen/AMDGPU/llvm.sqrt.f16.ll
+++ b/test/CodeGen/AMDGPU/llvm.sqrt.f16.ll
@@ -1,5 +1,5 @@
-; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
-; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
+; RUN:  llc -amdgpu-scalarize-global-loads=false  -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
+; RUN:  llc -amdgpu-scalarize-global-loads=false  -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
  
  declare half @llvm.sqrt.f16(half %a)
  declare <2 x half> @llvm.sqrt.v2f16(<2 x half> %a)
diff --git a/test/CodeGen/AMDGPU/llvm.trunc.f16.ll b/test/CodeGen/AMDGPU/llvm.trunc.f16.ll

index dc7182aa0d89a3b15d4bb24b35df01eb49c8602e..b22defb797dea889adea657a259cb171a3f1d0c0 100644 (file)
--- a/test/CodeGen/AMDGPU/llvm.trunc.f16.ll
+++ b/test/CodeGen/AMDGPU/llvm.trunc.f16.ll
@@ -1,5 +1,5 @@
-; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
-; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
+; RUN:  llc -amdgpu-scalarize-global-loads=false  -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
+; RUN:  llc -amdgpu-scalarize-global-loads=false  -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
  
  declare half @llvm.trunc.f16(half %a)
  declare <2 x half> @llvm.trunc.v2f16(<2 x half> %a)
diff --git a/test/CodeGen/AMDGPU/load-global-f32.ll b/test/CodeGen/AMDGPU/load-global-f32.ll

index bd6fea587b42fdb5b24a3d77f7c26530e7718da0..77557a584093f70bea12ce246bee95f075a3b6f2 100644 (file)
--- a/test/CodeGen/AMDGPU/load-global-f32.ll
+++ b/test/CodeGen/AMDGPU/load-global-f32.ll
@@ -1,9 +1,9 @@
-; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-NOHSA -check-prefix=FUNC %s
-; RUN: llc -mtriple=amdgcn-amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-HSA -check-prefix=FUNC %s
-; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-NOHSA -check-prefix=FUNC %s
+; RUN:  llc -amdgpu-scalarize-global-loads=false  -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-NOHSA -check-prefix=FUNC %s
+; RUN:  llc -amdgpu-scalarize-global-loads=false  -mtriple=amdgcn-amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-HSA -check-prefix=FUNC %s
+; RUN:  llc -amdgpu-scalarize-global-loads=false  -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-NOHSA -check-prefix=FUNC %s
  
-; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=R600 -check-prefix=FUNC %s
-; RUN: llc -march=r600 -mcpu=cayman < %s | FileCheck -check-prefix=R600 -check-prefix=FUNC %s
+; RUN:  llc -amdgpu-scalarize-global-loads=false  -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=R600 -check-prefix=FUNC %s
+; RUN:  llc -amdgpu-scalarize-global-loads=false  -march=r600 -mcpu=cayman < %s | FileCheck -check-prefix=R600 -check-prefix=FUNC %s
  
  ; FUNC-LABEL: {{^}}global_load_f32:
  ; GCN-NOHSA: buffer_load_dword v{{[0-9]+}}
diff --git a/test/CodeGen/AMDGPU/load-global-f64.ll b/test/CodeGen/AMDGPU/load-global-f64.ll

index 5b772e1fe5ee30da92933ca977a75816d92a1faa..84214b7dbc106e0f8290ad72c2a7422dfbf2145c 100644 (file)
--- a/test/CodeGen/AMDGPU/load-global-f64.ll
+++ b/test/CodeGen/AMDGPU/load-global-f64.ll
@@ -1,6 +1,6 @@
-; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-NOHSA -check-prefix=FUNC %s
-; RUN: llc -mtriple=amdgcn-amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-HSA -check-prefix=FUNC %s
-; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-NOHSA -check-prefix=FUNC %s
+; RUN:  llc -amdgpu-scalarize-global-loads=false  -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-NOHSA -check-prefix=FUNC %s
+; RUN:  llc -amdgpu-scalarize-global-loads=false  -mtriple=amdgcn-amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-HSA -check-prefix=FUNC %s
+; RUN:  llc -amdgpu-scalarize-global-loads=false  -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-NOHSA -check-prefix=FUNC %s
  
  ; FUNC-LABEL: {{^}}global_load_f64:
  ; GCN-NOHSA: buffer_load_dwordx2 [[VAL:v\[[0-9]+:[0-9]+\]]]
diff --git a/test/CodeGen/AMDGPU/load-global-i16.ll b/test/CodeGen/AMDGPU/load-global-i16.ll

index e3415b9c47dec596a0d083a81242ecaf1f3e88ae..cb2495d5fdcf7864cd6c331109b9b14572216d06 100644 (file)
--- a/test/CodeGen/AMDGPU/load-global-i16.ll
+++ b/test/CodeGen/AMDGPU/load-global-i16.ll
@@ -1,8 +1,8 @@
-; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-NOHSA,GCN-NOHSA-SI,FUNC %s
-; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-HSA,FUNC %s
-; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-NOHSA,GCN-NOHSA-VI,FUNC %s
-; RUN: llc -march=r600 -mcpu=redwood -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=EGCM -check-prefix=FUNC %s
-; RUN: llc -march=r600 -mcpu=cayman -verify-machineinstrs < %s | FileCheck -check-prefix=CM -check-prefix=EGCM -check-prefix=FUNC %s
+; RUN:  llc -amdgpu-scalarize-global-loads=false  -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-NOHSA,GCN-NOHSA-SI,FUNC %s
+; RUN:  llc -amdgpu-scalarize-global-loads=false  -mtriple=amdgcn--amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-HSA,FUNC %s
+; RUN:  llc -amdgpu-scalarize-global-loads=false  -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-NOHSA,GCN-NOHSA-VI,FUNC %s
+; RUN:  llc -amdgpu-scalarize-global-loads=false  -march=r600 -mcpu=redwood -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=EGCM -check-prefix=FUNC %s
+; RUN:  llc -amdgpu-scalarize-global-loads=false  -march=r600 -mcpu=cayman -verify-machineinstrs < %s | FileCheck -check-prefix=CM -check-prefix=EGCM -check-prefix=FUNC %s
  
  ; FIXME: r600 is broken because the bigger testcases spill and it's not implemented
  
diff --git a/test/CodeGen/AMDGPU/load-global-i32.ll b/test/CodeGen/AMDGPU/load-global-i32.ll

index 5df32c1e3120a60518a8f76f1bd215a446f82bd0..6360d39666c775e067a857d8385ec6f9fabb9f2e 100644 (file)
--- a/test/CodeGen/AMDGPU/load-global-i32.ll
+++ b/test/CodeGen/AMDGPU/load-global-i32.ll
@@ -1,7 +1,7 @@
-; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-NOHSA -check-prefix=FUNC %s
-; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-HSA -check-prefix=FUNC %s
-; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-NOHSA -check-prefix=FUNC %s
-; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
+; RUN:  llc -amdgpu-scalarize-global-loads=false  -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-NOHSA -check-prefix=FUNC %s
+; RUN:  llc -amdgpu-scalarize-global-loads=false  -mtriple=amdgcn--amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-HSA -check-prefix=FUNC %s
+; RUN:  llc -amdgpu-scalarize-global-loads=false  -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-NOHSA -check-prefix=FUNC %s
+; RUN:  llc -amdgpu-scalarize-global-loads=false  -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
  
  
  ; FUNC-LABEL: {{^}}global_load_i32:
diff --git a/test/CodeGen/AMDGPU/load-global-i64.ll b/test/CodeGen/AMDGPU/load-global-i64.ll

index de16b6c8997ef3850a2f7d2cae4e5e041a8169d0..c71db0b7357cdea1cfee4237852f5e1229b9974f 100644 (file)
--- a/test/CodeGen/AMDGPU/load-global-i64.ll
+++ b/test/CodeGen/AMDGPU/load-global-i64.ll
@@ -1,9 +1,9 @@
-; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-NOHSA -check-prefix=FUNC %s
-; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-HSA -check-prefix=FUNC %s
-; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-NOHSA -check-prefix=FUNC %s
+; RUN:  llc -amdgpu-scalarize-global-loads=false  -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-NOHSA -check-prefix=FUNC %s
+; RUN:  llc -amdgpu-scalarize-global-loads=false  -mtriple=amdgcn--amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-HSA -check-prefix=FUNC %s
+; RUN:  llc -amdgpu-scalarize-global-loads=false  -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-NOHSA -check-prefix=FUNC %s
  
-; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
-; RUN: llc -march=r600 -mcpu=cayman < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
+; RUN:  llc -amdgpu-scalarize-global-loads=false  -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
+; RUN:  llc -amdgpu-scalarize-global-loads=false  -march=r600 -mcpu=cayman < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
  
  ; FUNC-LABEL: {{^}}global_load_i64:
  ; GCN-NOHSA: buffer_load_dwordx2 [[VAL:v\[[0-9]+:[0-9]+\]]]
diff --git a/test/CodeGen/AMDGPU/load-global-i8.ll b/test/CodeGen/AMDGPU/load-global-i8.ll

index fc0cbf916b529c7a6ff612527aa7988f77b45440..3fe6bd26be14fca0ae78d44db520954beab38375 100644 (file)
--- a/test/CodeGen/AMDGPU/load-global-i8.ll
+++ b/test/CodeGen/AMDGPU/load-global-i8.ll
@@ -1,8 +1,8 @@
-; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-NOHSA,SI,FUNC %s
-; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-HSA,SI,FUNC %s
-; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-NOHSA,VI,FUNC %s
-; RUN: llc -march=r600 -mcpu=redwood -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
-; RUN: llc -march=r600 -mcpu=cayman -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
+; RUN:  llc -amdgpu-scalarize-global-loads=false  -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-NOHSA,SI,FUNC %s
+; RUN:  llc -amdgpu-scalarize-global-loads=false  -mtriple=amdgcn--amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-HSA,SI,FUNC %s
+; RUN:  llc -amdgpu-scalarize-global-loads=false  -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-NOHSA,VI,FUNC %s
+; RUN:  llc -amdgpu-scalarize-global-loads=false  -march=r600 -mcpu=redwood -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
+; RUN:  llc -amdgpu-scalarize-global-loads=false  -march=r600 -mcpu=cayman -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
  
  
  ; FUNC-LABEL: {{^}}global_load_i8:
diff --git a/test/CodeGen/AMDGPU/load-weird-sizes.ll b/test/CodeGen/AMDGPU/load-weird-sizes.ll

index d6162c388b5b103517229c17fb0303eee8f8fa95..f9ba6241fe0675a3ac970d719edc8103fe0a8fab 100644 (file)
--- a/test/CodeGen/AMDGPU/load-weird-sizes.ll
+++ b/test/CodeGen/AMDGPU/load-weird-sizes.ll
@@ -1,8 +1,8 @@
-; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI-NOHSA -check-prefix=SI -check-prefix=FUNC %s
-; RUN: llc -mtriple=amdgcn-amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefix=FUNC -check-prefix=CI-HSA -check-prefix=SI %s
-; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=SI-NOHSA -check-prefix=SI -check-prefix=FUNC %s
-; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=R600 -check-prefix=EG -check-prefix=FUNC %s
-; RUN: llc -march=r600 -mcpu=cayman < %s | FileCheck -check-prefix=R600 -check-prefix=CM -check-prefix=FUNC %s
+; RUN:  llc -amdgpu-scalarize-global-loads=false  -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI-NOHSA -check-prefix=SI -check-prefix=FUNC %s
+; RUN:  llc -amdgpu-scalarize-global-loads=false  -mtriple=amdgcn-amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefix=FUNC -check-prefix=CI-HSA -check-prefix=SI %s
+; RUN:  llc -amdgpu-scalarize-global-loads=false  -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=SI-NOHSA -check-prefix=SI -check-prefix=FUNC %s
+; RUN:  llc -amdgpu-scalarize-global-loads=false  -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=R600 -check-prefix=EG -check-prefix=FUNC %s
+; RUN:  llc -amdgpu-scalarize-global-loads=false  -march=r600 -mcpu=cayman < %s | FileCheck -check-prefix=R600 -check-prefix=CM -check-prefix=FUNC %s
  
  ; FUNC-LABEL: {{^}}load_i24:
  ; SI: {{flat|buffer}}_load_ubyte
diff --git a/test/CodeGen/AMDGPU/max.ll b/test/CodeGen/AMDGPU/max.ll

index ffcdac03bc74ca070cf49c75b0f7bb1fcbb32da0..6387c9ff6dfafda1b21578ab374eef057855029e 100644 (file)
--- a/test/CodeGen/AMDGPU/max.ll
+++ b/test/CodeGen/AMDGPU/max.ll
@@ -1,5 +1,5 @@
-; RUN: llc -march=amdgcn < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
-; RUN: llc -march=r600 -mcpu=cypress < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
+; RUN:  llc -amdgpu-scalarize-global-loads=false  -march=amdgcn < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN:  llc -amdgpu-scalarize-global-loads=false  -march=r600 -mcpu=cypress < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
  
  
  ; FUNC-LABEL: {{^}}v_test_imax_sge_i32:
diff --git a/test/CodeGen/AMDGPU/merge-stores.ll b/test/CodeGen/AMDGPU/merge-stores.ll

index dfd5b97fcc865c63f0a72c67ed23bc54616dd15e..6b0ec483247cff938a77e60bd31b473f0767bd7e 100644 (file)
--- a/test/CodeGen/AMDGPU/merge-stores.ll
+++ b/test/CodeGen/AMDGPU/merge-stores.ll
@@ -1,5 +1,5 @@
-; RUN: llc -march=amdgcn -verify-machineinstrs -amdgpu-load-store-vectorizer=0 < %s | FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=GCN-AA %s
-; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs -amdgpu-load-store-vectorizer=0 < %s | FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=GCN-AA %s
+; RUN:  llc -amdgpu-scalarize-global-loads=false  -march=amdgcn -verify-machineinstrs -amdgpu-load-store-vectorizer=0 < %s | FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=GCN-AA %s
+; RUN:  llc -amdgpu-scalarize-global-loads=false  -march=amdgcn -mcpu=bonaire -verify-machineinstrs -amdgpu-load-store-vectorizer=0 < %s | FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=GCN-AA %s
  
  ; This test is mostly to test DAG store merging, so disable the vectorizer.
  ; Run with devices with different unaligned load restrictions.
diff --git a/test/CodeGen/AMDGPU/mubuf.ll b/test/CodeGen/AMDGPU/mubuf.ll

index b23b21118aaa341305db420e7b07571d06960926..97666492e376f06f4defc0177396a05f4ba12998 100644 (file)
--- a/test/CodeGen/AMDGPU/mubuf.ll
+++ b/test/CodeGen/AMDGPU/mubuf.ll
@@ -1,4 +1,4 @@
-; RUN: llc -march=amdgcn -show-mc-encoding -verify-machineinstrs < %s | FileCheck %s
+; RUN:  llc -amdgpu-scalarize-global-loads=false  -march=amdgcn -show-mc-encoding -verify-machineinstrs < %s | FileCheck %s
  
  declare i32 @llvm.amdgcn.workitem.id.x() readnone
  
diff --git a/test/CodeGen/AMDGPU/mul.ll b/test/CodeGen/AMDGPU/mul.ll

index 57c50c9804e56a243338acf11d71b285e75fc2f4..a0290789175d3fe0836b6341f6027f955f9a5560 100644 (file)
--- a/test/CodeGen/AMDGPU/mul.ll
+++ b/test/CodeGen/AMDGPU/mul.ll
@@ -1,6 +1,6 @@
-; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
-; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
-; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG %s -check-prefix=FUNC
+; RUN:  llc -amdgpu-scalarize-global-loads=false  -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN:  llc -amdgpu-scalarize-global-loads=false  -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN:  llc -amdgpu-scalarize-global-loads=false  -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG %s -check-prefix=FUNC
  
  ; mul24 and mad24 are affected
  
diff --git a/test/CodeGen/AMDGPU/no-shrink-extloads.ll b/test/CodeGen/AMDGPU/no-shrink-extloads.ll

index 8a7bf6db5b8d4c73345ac1368fd339be0b840bdd..500e4cb3cc73e1e5187a2725b022eadfae3c6b56 100644 (file)
--- a/test/CodeGen/AMDGPU/no-shrink-extloads.ll
+++ b/test/CodeGen/AMDGPU/no-shrink-extloads.ll
@@ -1,4 +1,4 @@
-; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN:  llc -amdgpu-scalarize-global-loads=false  -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
  
  declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
  
diff --git a/test/CodeGen/AMDGPU/or.ll b/test/CodeGen/AMDGPU/or.ll

index eb082843fb829a4b50352565385094823520a778..8e6885c4fc5e82d93307587cbd6f3c674bc411a2 100644 (file)
--- a/test/CodeGen/AMDGPU/or.ll
+++ b/test/CodeGen/AMDGPU/or.ll
@@ -1,6 +1,6 @@
-; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
-; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
-; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
+; RUN:  llc -amdgpu-scalarize-global-loads=false  -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN:  llc -amdgpu-scalarize-global-loads=false  -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN:  llc -amdgpu-scalarize-global-loads=false  -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
  
  
  ; FUNC-LABEL: {{^}}or_v2i32:
diff --git a/test/CodeGen/AMDGPU/promote-alloca-invariant-markers.ll b/test/CodeGen/AMDGPU/promote-alloca-invariant-markers.ll

index f83eb56dc6edf4cb5e38d8c87f8265d32db5834d..776b151e301702dc98f054562a57e29a568278aa 100644 (file)
--- a/test/CodeGen/AMDGPU/promote-alloca-invariant-markers.ll
+++ b/test/CodeGen/AMDGPU/promote-alloca-invariant-markers.ll
@@ -1,4 +1,4 @@
-; RUN: llc -march=amdgcn -mattr=+promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+; RUN:  llc -amdgpu-scalarize-global-loads=false  -march=amdgcn -mattr=+promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
  
  declare {}* @llvm.invariant.start.p0i8(i64, i8* nocapture) #0
  declare void @llvm.invariant.end.p0i8({}*, i64, i8* nocapture) #0
diff --git a/test/CodeGen/AMDGPU/reduce-load-width-alignment.ll b/test/CodeGen/AMDGPU/reduce-load-width-alignment.ll

index ecb513cd80b6e483bfd779186b79e3b2f182cb8e..d8c7438e4d0da53ea5fa15981a492da1b4e48241 100644 (file)
--- a/test/CodeGen/AMDGPU/reduce-load-width-alignment.ll
+++ b/test/CodeGen/AMDGPU/reduce-load-width-alignment.ll
@@ -1,6 +1,6 @@
-; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
-; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=CI %s
-; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
+; RUN:  llc -amdgpu-scalarize-global-loads=false  -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
+; RUN:  llc -amdgpu-scalarize-global-loads=false  -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=CI %s
+; RUN:  llc -amdgpu-scalarize-global-loads=false  -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
  
  ; GCN-LABEL: {{^}}reduce_i64_load_align_4_width_to_i32:
  ; GCN: buffer_load_dword [[VAL:v[0-9]+]]
diff --git a/test/CodeGen/AMDGPU/reorder-stores.ll b/test/CodeGen/AMDGPU/reorder-stores.ll

index ff4069226a62bc5286991812330635bf02e11648..260b32ed3406c06cdb0b350de09498fcefeadddc 100644 (file)
--- a/test/CodeGen/AMDGPU/reorder-stores.ll
+++ b/test/CodeGen/AMDGPU/reorder-stores.ll
@@ -1,5 +1,5 @@
-; RUN: llc -march=amdgcn < %s | FileCheck -check-prefix=SI %s
-; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global < %s | FileCheck -check-prefix=SI %s
+; RUN:  llc -amdgpu-scalarize-global-loads=false  -march=amdgcn < %s | FileCheck -check-prefix=SI %s
+; RUN:  llc -amdgpu-scalarize-global-loads=false  -march=amdgcn -mcpu=tonga -mattr=-flat-for-global < %s | FileCheck -check-prefix=SI %s
  
  ; SI-LABEL: {{^}}no_reorder_v2f64_global_load_store:
  ; SI: buffer_load_dwordx4
diff --git a/test/CodeGen/AMDGPU/rotl.i64.ll b/test/CodeGen/AMDGPU/rotl.i64.ll

index 266490718dd182e6678b17af0c68f4a9a144cbc5..fa29d789cebeebc76f782dae99ee522682db1f9d 100644 (file)
--- a/test/CodeGen/AMDGPU/rotl.i64.ll
+++ b/test/CodeGen/AMDGPU/rotl.i64.ll
@@ -1,5 +1,5 @@
-; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=BOTH %s
-; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=BOTH %s
+; RUN:  llc -amdgpu-scalarize-global-loads=false  -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=BOTH %s
+; RUN:  llc -amdgpu-scalarize-global-loads=false  -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=BOTH %s
  
  ; BOTH-LABEL: {{^}}s_rotl_i64:
  ; BOTH-DAG: s_lshl_b64
diff --git a/test/CodeGen/AMDGPU/rotr.i64.ll b/test/CodeGen/AMDGPU/rotr.i64.ll

index 9eda479cd25c2d92da6ac1eb95f8661a59c1c49d..af58b404ca6c625b929e709cdda4831758de9ea5 100644 (file)
--- a/test/CodeGen/AMDGPU/rotr.i64.ll
+++ b/test/CodeGen/AMDGPU/rotr.i64.ll
@@ -1,5 +1,5 @@
-; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=BOTH %s
-; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=BOTH %s
+; RUN:  llc -amdgpu-scalarize-global-loads=false  -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=BOTH %s
+; RUN:  llc -amdgpu-scalarize-global-loads=false  -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=BOTH %s
  
  ; BOTH-LABEL: {{^}}s_rotr_i64:
  ; BOTH-DAG: s_sub_i32
diff --git a/test/CodeGen/AMDGPU/rsq.ll b/test/CodeGen/AMDGPU/rsq.ll

index 9462683efe0e83ee9137e81ab2c3a7cb90f6ebef..6cbc8c3aa6865240fd3ca9390204f7a4f6e538de 100644 (file)
--- a/test/CodeGen/AMDGPU/rsq.ll
+++ b/test/CodeGen/AMDGPU/rsq.ll
@@ -1,5 +1,5 @@
-; RUN: llc -march=amdgcn -mattr=-fp32-denormals -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=SI-UNSAFE -check-prefix=SI %s
-; RUN: llc -march=amdgcn -mattr=-fp32-denormals -verify-machineinstrs < %s | FileCheck -check-prefix=SI-SAFE -check-prefix=SI %s
+; RUN:  llc -amdgpu-scalarize-global-loads=false  -march=amdgcn -mattr=-fp32-denormals -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=SI-UNSAFE -check-prefix=SI %s
+; RUN:  llc -amdgpu-scalarize-global-loads=false  -march=amdgcn -mattr=-fp32-denormals -verify-machineinstrs < %s | FileCheck -check-prefix=SI-SAFE -check-prefix=SI %s
  
  declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
  declare float @llvm.sqrt.f32(float) nounwind readnone
diff --git a/test/CodeGen/AMDGPU/s_movk_i32.ll b/test/CodeGen/AMDGPU/s_movk_i32.ll

index a131aaa3dfb4f8c1e46cc728c64d09aa986968fd..797fbc2712b0f0a86de1e881903d92abc5b559cb 100644 (file)
--- a/test/CodeGen/AMDGPU/s_movk_i32.ll
+++ b/test/CodeGen/AMDGPU/s_movk_i32.ll
@@ -1,5 +1,5 @@
-; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
-; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
+; RUN:  llc -amdgpu-scalarize-global-loads=false  -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
+; RUN:  llc -amdgpu-scalarize-global-loads=false  -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
  
  ; SI-LABEL: {{^}}s_movk_i32_k0:
  ; SI-DAG: s_mov_b32 [[LO_S_IMM:s[0-9]+]], 0xffff{{$}}
diff --git a/test/CodeGen/AMDGPU/saddo.ll b/test/CodeGen/AMDGPU/saddo.ll

index 586a455b2b91ed918552d65de3d514e4baf98d47..09e87d524419e35b41b4d5f98905c838f4831814 100644 (file)
--- a/test/CodeGen/AMDGPU/saddo.ll
+++ b/test/CodeGen/AMDGPU/saddo.ll
@@ -1,6 +1,6 @@
-; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs< %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
-; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs< %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
-; RUN: llc -march=r600 -mcpu=cypress -verify-machineinstrs< %s
+; RUN:  llc -amdgpu-scalarize-global-loads=false  -march=amdgcn -mcpu=SI -verify-machineinstrs< %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN:  llc -amdgpu-scalarize-global-loads=false  -march=amdgcn -mcpu=tonga -verify-machineinstrs< %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN:  llc -amdgpu-scalarize-global-loads=false  -march=r600 -mcpu=cypress -verify-machineinstrs< %s
  
  declare { i32, i1 } @llvm.sadd.with.overflow.i32(i32, i32) nounwind readnone
  declare { i64, i1 } @llvm.sadd.with.overflow.i64(i64, i64) nounwind readnone
diff --git a/test/CodeGen/AMDGPU/salu-to-valu.ll b/test/CodeGen/AMDGPU/salu-to-valu.ll

index 6e1dd163833378628ccad6e8d3d9e21ef3487fe0..d5b2fa0b6754011a4e0bdf27307809ab99515812 100644 (file)
--- a/test/CodeGen/AMDGPU/salu-to-valu.ll
+++ b/test/CodeGen/AMDGPU/salu-to-valu.ll
@@ -1,6 +1,6 @@
-; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-NOHSA -check-prefix=SI %s
-; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-NOHSA -check-prefix=CI %s
-; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=CI --check-prefix=GCN-HSA %s
+; RUN:  llc -amdgpu-scalarize-global-loads=false  -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-NOHSA -check-prefix=SI %s
+; RUN:  llc -amdgpu-scalarize-global-loads=false  -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-NOHSA -check-prefix=CI %s
+; RUN:  llc -amdgpu-scalarize-global-loads=false  -mtriple=amdgcn--amdhsa -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=CI --check-prefix=GCN-HSA %s
  
  declare i32 @llvm.amdgcn.workitem.id.x() #0
  declare i32 @llvm.amdgcn.workitem.id.y() #0
diff --git a/test/CodeGen/AMDGPU/scalar_to_vector.ll b/test/CodeGen/AMDGPU/scalar_to_vector.ll

index 62d0d936788580b9f99af8f7d7e81da0c6868b00..8c74ec683d9f36696c5d4ba6d7152e5cd2b3e96f 100644 (file)
--- a/test/CodeGen/AMDGPU/scalar_to_vector.ll
+++ b/test/CodeGen/AMDGPU/scalar_to_vector.ll
@@ -1,5 +1,5 @@
-; RUN: llc -march=amdgcn -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
-; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
+; RUN:  llc -amdgpu-scalarize-global-loads=false  -march=amdgcn -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
+; RUN:  llc -amdgpu-scalarize-global-loads=false  -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
  
  ; XXX - Why the packing?
  ; GCN-LABEL: {{^}}scalar_to_vector_v2i32:
diff --git a/test/CodeGen/AMDGPU/schedule-global-loads.ll b/test/CodeGen/AMDGPU/schedule-global-loads.ll

index 44d46086f02af4b59bfe27b3b1558ff48aec1f86..2dddba8bccc76e6e2d67e7b85a7fd4fc3b498abd 100644 (file)
--- a/test/CodeGen/AMDGPU/schedule-global-loads.ll
+++ b/test/CodeGen/AMDGPU/schedule-global-loads.ll
@@ -1,4 +1,4 @@
-; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=FUNC -check-prefix=SI %s
+; RUN:  llc -amdgpu-scalarize-global-loads=false  -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=FUNC -check-prefix=SI %s
  
  ; FIXME: This currently doesn't do a great job of clustering the
  ; loads, which end up with extra moves between them. Right now, it
diff --git a/test/CodeGen/AMDGPU/scratch-buffer.ll b/test/CodeGen/AMDGPU/scratch-buffer.ll

index 6b1e85915a110f163cf5c46d8d150c005b6410b8..4ae9871865f5e738e0a3b43e9be0a6344bb5c271 100644 (file)
--- a/test/CodeGen/AMDGPU/scratch-buffer.ll
+++ b/test/CodeGen/AMDGPU/scratch-buffer.ll
@@ -1,5 +1,5 @@
-; RUN: llc -verify-machineinstrs -march=amdgcn < %s | FileCheck -check-prefix=GCN %s
-; RUN: llc -verify-machineinstrs -march=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=GCN %s
+; RUN:  llc -amdgpu-scalarize-global-loads=false  -verify-machineinstrs -march=amdgcn < %s | FileCheck -check-prefix=GCN %s
+; RUN:  llc -amdgpu-scalarize-global-loads=false  -verify-machineinstrs -march=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=GCN %s
  
  ; When a frame index offset is more than 12-bits, make sure we don't store
  ; it in mubuf's offset field.
diff --git a/test/CodeGen/AMDGPU/sdiv.ll b/test/CodeGen/AMDGPU/sdiv.ll

index 7ec6ca809b685c31d46862a58e790ea24f1c865f..305107f690fb836fd2521fb76749465ab16c8d1d 100644 (file)
--- a/test/CodeGen/AMDGPU/sdiv.ll
+++ b/test/CodeGen/AMDGPU/sdiv.ll
@@ -1,6 +1,6 @@
-; RUN: llc -march=amdgcn < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
-; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
-; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
+; RUN:  llc -amdgpu-scalarize-global-loads=false  -march=amdgcn < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN:  llc -amdgpu-scalarize-global-loads=false  -march=amdgcn -mcpu=tonga -mattr=-flat-for-global < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN:  llc -amdgpu-scalarize-global-loads=false  -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
  
  ; The code generated by sdiv is long and complex and may frequently change.
  ; The goal of this test is to make sure the ISel doesn't fail.
diff --git a/test/CodeGen/AMDGPU/sdwa-peephole.ll b/test/CodeGen/AMDGPU/sdwa-peephole.ll

index 0dc7cc309f7c9c8a18ef03ffcd470e0942ce7fd6..98922500f69d64cb88b2360494ab7d28f18a1e86 100644 (file)
--- a/test/CodeGen/AMDGPU/sdwa-peephole.ll
+++ b/test/CodeGen/AMDGPU/sdwa-peephole.ll
@@ -1,6 +1,6 @@
-; RUN: llc -march=amdgcn -mcpu=fiji -amdgpu-sdwa-peephole=0 -mattr=-fp64-fp16-denormals -verify-machineinstrs < %s | FileCheck -check-prefix=NOSDWA -check-prefix=GCN %s
-; RUN: llc -march=amdgcn -mcpu=fiji -amdgpu-sdwa-peephole -mattr=-fp64-fp16-denormals -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=SDWA -check-prefix=GCN %s
-; RUN: llc -march=amdgcn -mcpu=gfx900 -amdgpu-sdwa-peephole -mattr=-fp64-fp16-denormals -verify-machineinstrs < %s | FileCheck -check-prefix=GFX9 -check-prefix=SDWA -check-prefix=GCN %s
+; RUN:  llc -amdgpu-scalarize-global-loads=false  -march=amdgcn -mcpu=fiji -amdgpu-sdwa-peephole=0 -mattr=-fp64-fp16-denormals -verify-machineinstrs < %s | FileCheck -check-prefix=NOSDWA -check-prefix=GCN %s
+; RUN:  llc -amdgpu-scalarize-global-loads=false  -march=amdgcn -mcpu=fiji -amdgpu-sdwa-peephole -mattr=-fp64-fp16-denormals -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=SDWA -check-prefix=GCN %s
+; RUN:  llc -amdgpu-scalarize-global-loads=false  -march=amdgcn -mcpu=gfx900 -amdgpu-sdwa-peephole -mattr=-fp64-fp16-denormals -verify-machineinstrs < %s | FileCheck -check-prefix=GFX9 -check-prefix=SDWA -check-prefix=GCN %s
  
  ; GCN-LABEL: {{^}}add_shr_i32:
  ; NOSDWA: v_lshrrev_b32_e32 v[[DST:[0-9]+]], 16, v{{[0-9]+}}
diff --git a/test/CodeGen/AMDGPU/select-vectors.ll b/test/CodeGen/AMDGPU/select-vectors.ll

index ebbc675b2babe165a138b698ecfe23b64ed0d084..b77ebcf5bf5291148d66931e8ff54f328732fc43 100644 (file)
--- a/test/CodeGen/AMDGPU/select-vectors.ll
+++ b/test/CodeGen/AMDGPU/select-vectors.ll
@@ -1,6 +1,6 @@
-; RUN: llc -verify-machineinstrs -march=amdgcn < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
-; RUN: llc -verify-machineinstrs -march=amdgcn -mcpu=tonga -mattr=-flat-for-global < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
-; RUN: llc -verify-machineinstrs -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 %s
+; RUN:  llc -amdgpu-scalarize-global-loads=false  -verify-machineinstrs -march=amdgcn < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
+; RUN:  llc -amdgpu-scalarize-global-loads=false  -verify-machineinstrs -march=amdgcn -mcpu=tonga -mattr=-flat-for-global < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
+; RUN:  llc -amdgpu-scalarize-global-loads=false  -verify-machineinstrs -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 %s
  
  ; Test expansion of scalar selects on vectors.
  ; Evergreen not enabled since it seems to be having problems with doubles.
diff --git a/test/CodeGen/AMDGPU/select.f16.ll b/test/CodeGen/AMDGPU/select.f16.ll

index 92ee2eb7f403f43254cd37066c8f2c69540d0af5..62627e56aba4756a7ed1fe9c304c3b9b224bd800 100644 (file)
--- a/test/CodeGen/AMDGPU/select.f16.ll
+++ b/test/CodeGen/AMDGPU/select.f16.ll
@@ -1,5 +1,5 @@
-; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
-; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
+; RUN:  llc -amdgpu-scalarize-global-loads=false  -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
+; RUN:  llc -amdgpu-scalarize-global-loads=false  -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
  
  ; GCN-LABEL: {{^}}select_f16:
  ; GCN: buffer_load_ushort v[[A_F16:[0-9]+]]
diff --git a/test/CodeGen/AMDGPU/sext-in-reg.ll b/test/CodeGen/AMDGPU/sext-in-reg.ll

index 160fb6a038fed9cdb2174d5a65b9d7ffee68b229..5b4d9ed259b600db344e60b862ca4a52528ef14f 100644 (file)
--- a/test/CodeGen/AMDGPU/sext-in-reg.ll
+++ b/test/CodeGen/AMDGPU/sext-in-reg.ll
@@ -1,7 +1,7 @@
-; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI -check-prefix=FUNC %s
-; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=GFX89 -check-prefix=FUNC %s
-; RUN: llc -march=amdgcn -mcpu=gfx901 -mattr=-flat-for-global -verify-machineinstrs -enable-packed-inlinable-literals < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 -check-prefix=GFX89 -check-prefix=FUNC %s
-; RUN: llc -march=r600 -mcpu=cypress < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
+; RUN:  llc -amdgpu-scalarize-global-loads=false  -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI -check-prefix=FUNC %s
+; RUN:  llc -amdgpu-scalarize-global-loads=false  -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=GFX89 -check-prefix=FUNC %s
+; RUN:  llc -amdgpu-scalarize-global-loads=false  -march=amdgcn -mcpu=gfx901 -mattr=-flat-for-global -verify-machineinstrs -enable-packed-inlinable-literals < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 -check-prefix=GFX89 -check-prefix=FUNC %s
+; RUN:  llc -amdgpu-scalarize-global-loads=false  -march=r600 -mcpu=cypress < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
  
  ; FIXME: i16 promotion pass ruins the scalar cases when legal.
  ; FIXME: r600 fails verifier
diff --git a/test/CodeGen/AMDGPU/sgpr-copy-duplicate-operand.ll b/test/CodeGen/AMDGPU/sgpr-copy-duplicate-operand.ll

index fb0bbaa9cbf272f852dbe23d221c5ee45a088bfc..8250bad7b0a102f18e61a4d6f222ab25e2866b62 100644 (file)
--- a/test/CodeGen/AMDGPU/sgpr-copy-duplicate-operand.ll
+++ b/test/CodeGen/AMDGPU/sgpr-copy-duplicate-operand.ll
@@ -1,5 +1,5 @@
-; RUN: llc -march=amdgcn -verify-machineinstrs< %s | FileCheck -check-prefix=SI %s
-; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs< %s | FileCheck -check-prefix=SI %s
+; RUN:  llc -amdgpu-scalarize-global-loads=false  -march=amdgcn -verify-machineinstrs< %s | FileCheck -check-prefix=SI %s
+; RUN:  llc -amdgpu-scalarize-global-loads=false  -march=amdgcn -mcpu=tonga -verify-machineinstrs< %s | FileCheck -check-prefix=SI %s
  
  ; Copy VGPR -> SGPR used twice as an instruction operand, which is then
  ; used in an REG_SEQUENCE that also needs to be handled.
diff --git a/test/CodeGen/AMDGPU/sgpr-copy.ll b/test/CodeGen/AMDGPU/sgpr-copy.ll

index 931051102cd5c574379aa1ba7f0ed72ae4795e16..3b24cf82d783b8358eee12b21befcef8aae6e2a7 100644 (file)
--- a/test/CodeGen/AMDGPU/sgpr-copy.ll
+++ b/test/CodeGen/AMDGPU/sgpr-copy.ll
@@ -1,5 +1,5 @@
-; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck %s
-; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck %s
+; RUN:  llc -amdgpu-scalarize-global-loads=false  -march=amdgcn -verify-machineinstrs < %s | FileCheck %s
+; RUN:  llc -amdgpu-scalarize-global-loads=false  -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck %s
  
  ; CHECK-LABEL: {{^}}phi1:
  ; CHECK: s_buffer_load_dword [[DST:s[0-9]]], {{s\[[0-9]+:[0-9]+\]}}, 0x0
diff --git a/test/CodeGen/AMDGPU/shift-and-i128-ubfe.ll b/test/CodeGen/AMDGPU/shift-and-i128-ubfe.ll

index 4f7b61adc91d57df611ba1ecd52b3a4e1e68bec1..ac9cc982d456dd7b9c110fafb7c0a308f03600d1 100644 (file)
--- a/test/CodeGen/AMDGPU/shift-and-i128-ubfe.ll
+++ b/test/CodeGen/AMDGPU/shift-and-i128-ubfe.ll
@@ -1,4 +1,4 @@
-; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+; RUN:  llc -amdgpu-scalarize-global-loads=false  -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
  
  ; Extract the high bit of the 1st quarter
  ; GCN-LABEL: {{^}}v_uextract_bit_31_i128:
diff --git a/test/CodeGen/AMDGPU/shift-and-i64-ubfe.ll b/test/CodeGen/AMDGPU/shift-and-i64-ubfe.ll

index c70eb9b9c4a534d6b096d4c38981f69472f1d49e..670287ba79373f17c0cea6f2345758f79c442100 100644 (file)
--- a/test/CodeGen/AMDGPU/shift-and-i64-ubfe.ll
+++ b/test/CodeGen/AMDGPU/shift-and-i64-ubfe.ll
@@ -1,4 +1,4 @@
-; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+; RUN:  llc -amdgpu-scalarize-global-loads=false  -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
  
  ; Make sure 64-bit BFE pattern does a 32-bit BFE on the relevant half.
  
diff --git a/test/CodeGen/AMDGPU/shift-i64-opts.ll b/test/CodeGen/AMDGPU/shift-i64-opts.ll

index 5306e190a4f9c040afae88131555fd80dd7b33a9..f3faa39c64e68570e4a69873d8c1b1c4511d2c1e 100644 (file)
--- a/test/CodeGen/AMDGPU/shift-i64-opts.ll
+++ b/test/CodeGen/AMDGPU/shift-i64-opts.ll
@@ -1,5 +1,5 @@
-; RUN: llc -march=amdgcn -mcpu=tahiti < %s | FileCheck -check-prefix=FAST64 -check-prefix=GCN %s
-; RUN: llc -march=amdgcn -mcpu=bonaire < %s | FileCheck -check-prefix=SLOW64 -check-prefix=GCN %s
+; RUN:  llc -amdgpu-scalarize-global-loads=false  -march=amdgcn -mcpu=tahiti < %s | FileCheck -check-prefix=FAST64 -check-prefix=GCN %s
+; RUN:  llc -amdgpu-scalarize-global-loads=false  -march=amdgcn -mcpu=bonaire < %s | FileCheck -check-prefix=SLOW64 -check-prefix=GCN %s
  
  
  ; lshr (i64 x), c: c > 32 => reg_sequence lshr (i32 hi_32(x)), (c - 32), 0
diff --git a/test/CodeGen/AMDGPU/shl.ll b/test/CodeGen/AMDGPU/shl.ll

index edc313ee323bd5dfa8978f7dafae63f881ff949c..13ac9140b82734b2ad753d94db9fe9be511c45cc 100644 (file)
--- a/test/CodeGen/AMDGPU/shl.ll
+++ b/test/CodeGen/AMDGPU/shl.ll
@@ -1,6 +1,6 @@
-; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI -check-prefix=FUNC %s
+; RUN:  llc -amdgpu-scalarize-global-loads=false  -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI -check-prefix=FUNC %s
  ; XUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=FUNC %s
-; RUN: llc -march=r600 -mcpu=redwood -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
+; RUN:  llc -amdgpu-scalarize-global-loads=false  -march=r600 -mcpu=redwood -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
  
  declare i32 @llvm.r600.read.tidig.x() #0
  
diff --git a/test/CodeGen/AMDGPU/si-triv-disjoint-mem-access.ll b/test/CodeGen/AMDGPU/si-triv-disjoint-mem-access.ll

index 348c7200c0bc1887b62fa23ec0963bd066cac865..17109187d5387185bdc2d2e5f52d0a41b5a76be6 100644 (file)
--- a/test/CodeGen/AMDGPU/si-triv-disjoint-mem-access.ll
+++ b/test/CodeGen/AMDGPU/si-triv-disjoint-mem-access.ll
@@ -1,4 +1,4 @@
-; RUN: llc -march=amdgcn -mcpu=bonaire -enable-amdgpu-aa=0 -verify-machineinstrs -enable-misched -enable-aa-sched-mi < %s | FileCheck -check-prefix=FUNC -check-prefix=CI %s
+; RUN:  llc -amdgpu-scalarize-global-loads=false  -march=amdgcn -mcpu=bonaire -enable-amdgpu-aa=0 -verify-machineinstrs -enable-misched -enable-aa-sched-mi < %s | FileCheck -check-prefix=FUNC -check-prefix=CI %s
  
  declare void @llvm.amdgcn.tbuffer.store.i32(i32, <4 x i32>, i32, i32, i32, i32, i32, i32, i1, i1)
  declare void @llvm.amdgcn.tbuffer.store.v4i32(<4 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i1, i1)
diff --git a/test/CodeGen/AMDGPU/sign_extend.ll b/test/CodeGen/AMDGPU/sign_extend.ll

index 3e452c214e983afb4e7e8f4fde5ba82228723ef8..c80945f390bed54e7c8a6885f9d8de13e7659e0f 100644 (file)
--- a/test/CodeGen/AMDGPU/sign_extend.ll
+++ b/test/CodeGen/AMDGPU/sign_extend.ll
@@ -1,5 +1,5 @@
-; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SI %s
-; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,VI %s
+; RUN:  llc -amdgpu-scalarize-global-loads=false  -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SI %s
+; RUN:  llc -amdgpu-scalarize-global-loads=false  -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,VI %s
  
  ; GCN-LABEL: {{^}}s_sext_i1_to_i32:
  ; GCN: v_cndmask_b32_e64
diff --git a/test/CodeGen/AMDGPU/sitofp.f16.ll b/test/CodeGen/AMDGPU/sitofp.f16.ll

index 574d1c0b2c78ea230c94ed11c98fb2729e2ea9a5..0bcef99df39f6b754dc7efa4b14c86b4d33e9f8e 100644 (file)
--- a/test/CodeGen/AMDGPU/sitofp.f16.ll
+++ b/test/CodeGen/AMDGPU/sitofp.f16.ll
@@ -1,5 +1,5 @@
-; RUN: llc -march=amdgcn -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
-; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
+; RUN:  llc -amdgpu-scalarize-global-loads=false  -march=amdgcn -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
+; RUN:  llc -amdgpu-scalarize-global-loads=false  -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
  
  ; GCN-LABEL: {{^}}sitofp_i16_to_f16
  ; GCN: buffer_load_{{sshort|ushort}} v[[A_I16:[0-9]+]]
diff --git a/test/CodeGen/AMDGPU/sminmax.ll b/test/CodeGen/AMDGPU/sminmax.ll

index 827d672022eba389e799b9b42794b1558398d0f4..eced4a045343b99b3a309603a83ace6354b9ea98 100644 (file)
--- a/test/CodeGen/AMDGPU/sminmax.ll
+++ b/test/CodeGen/AMDGPU/sminmax.ll
@@ -1,6 +1,6 @@
-; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC %s
-; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC %s
-; RUN: llc -march=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
+; RUN:  llc -amdgpu-scalarize-global-loads=false  -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC %s
+; RUN:  llc -amdgpu-scalarize-global-loads=false  -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC %s
+; RUN:  llc -amdgpu-scalarize-global-loads=false  -march=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
  
  ; FUNC-LABEL: {{^}}s_abs_i32:
  ; GCN: s_abs_i32
diff --git a/test/CodeGen/AMDGPU/sminmax.v2i16.ll b/test/CodeGen/AMDGPU/sminmax.v2i16.ll

index a9aac2d8abb75e12e973c2a1b3a06797c48c8c19..27263429650d8726fd1b0b5a511bb0df380d8c59 100644 (file)
--- a/test/CodeGen/AMDGPU/sminmax.v2i16.ll
+++ b/test/CodeGen/AMDGPU/sminmax.v2i16.ll
@@ -1,6 +1,6 @@
-; RUN: llc -march=amdgcn -mcpu=gfx901 -mattr=-flat-for-global -verify-machineinstrs -enable-packed-inlinable-literals < %s | FileCheck -check-prefix=GFX9 -check-prefix=GCN %s
-; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=CIVI -check-prefix=GCN %s
-; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=CI -check-prefix=CIVI -check-prefix=GCN %s
+; RUN:  llc -amdgpu-scalarize-global-loads=false  -march=amdgcn -mcpu=gfx901 -mattr=-flat-for-global -verify-machineinstrs -enable-packed-inlinable-literals < %s | FileCheck -check-prefix=GFX9 -check-prefix=GCN %s
+; RUN:  llc -amdgpu-scalarize-global-loads=false  -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=CIVI -check-prefix=GCN %s
+; RUN:  llc -amdgpu-scalarize-global-loads=false  -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=CI -check-prefix=CIVI -check-prefix=GCN %s
  
  ; GCN-LABEL: {{^}}s_abs_v2i16:
  ; GFX9: s_load_dword [[VAL:s[0-9]+]]
diff --git a/test/CodeGen/AMDGPU/spill-cfg-position.ll b/test/CodeGen/AMDGPU/spill-cfg-position.ll

index 1ca0919258a8ef880c601702ad2c1a7b11de7f45..cbf9f37e29ef7a76f457d0db11c4432bd1e699b8 100644 (file)
--- a/test/CodeGen/AMDGPU/spill-cfg-position.ll
+++ b/test/CodeGen/AMDGPU/spill-cfg-position.ll
@@ -1,4 +1,4 @@
-; RUN: llc -march=amdgcn -verify-machineinstrs -stress-regalloc=6 < %s | FileCheck %s
+; RUN:  llc -amdgpu-scalarize-global-loads=false  -march=amdgcn -verify-machineinstrs -stress-regalloc=6 < %s | FileCheck %s
  
  ; Inline spiller can decide to move a spill as early as possible in the basic block.
  ; It will skip phis and label, but we also need to make sure it skips instructions
diff --git a/test/CodeGen/AMDGPU/sra.ll b/test/CodeGen/AMDGPU/sra.ll

index 44cfdf6398aef42e82d1b61b1142a4cce7578e5e..74618b263bad7175b2a05a6aa811c071436cf3a8 100644 (file)
--- a/test/CodeGen/AMDGPU/sra.ll
+++ b/test/CodeGen/AMDGPU/sra.ll
@@ -1,6 +1,6 @@
-; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI -check-prefix=FUNC %s
-; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=FUNC %s
-; RUN: llc -march=r600 -mcpu=redwood -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
+; RUN:  llc -amdgpu-scalarize-global-loads=false  -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI -check-prefix=FUNC %s
+; RUN:  llc -amdgpu-scalarize-global-loads=false  -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=FUNC %s
+; RUN:  llc -amdgpu-scalarize-global-loads=false  -march=r600 -mcpu=redwood -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
  
  declare i32 @llvm.r600.read.tidig.x() #0
  
diff --git a/test/CodeGen/AMDGPU/srem.ll b/test/CodeGen/AMDGPU/srem.ll

index e067258920892b8a1f4f14124412f88ac2627c31..51eaf9a960b0020f29bc8445d7c9ff9bcf1b8885 100644 (file)
--- a/test/CodeGen/AMDGPU/srem.ll
+++ b/test/CodeGen/AMDGPU/srem.ll
@@ -1,6 +1,6 @@
-; RUN: llc -march=amdgcn -mcpu=SI < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
-; RUN: llc -march=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
-; RUN: llc -march=r600 -mcpu=redwood < %s
+; RUN:  llc -amdgpu-scalarize-global-loads=false  -march=amdgcn -mcpu=SI < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN:  llc -amdgpu-scalarize-global-loads=false  -march=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN:  llc -amdgpu-scalarize-global-loads=false  -march=r600 -mcpu=redwood < %s
  
  define amdgpu_kernel void @srem_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
    %den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1
diff --git a/test/CodeGen/AMDGPU/srl.ll b/test/CodeGen/AMDGPU/srl.ll

index cb40ecf2de1ca4493e01f98bc4f58311338b68e7..8878b4538555630e40f3229dc95a5432b77f6a0a 100644 (file)
--- a/test/CodeGen/AMDGPU/srl.ll
+++ b/test/CodeGen/AMDGPU/srl.ll
@@ -1,6 +1,6 @@
-; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=FUNC %s
+; RUN:  llc -amdgpu-scalarize-global-loads=false  -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=FUNC %s
  ; XUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=GCN -check-prefix=FUNC %s
-; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
+; RUN:  llc -amdgpu-scalarize-global-loads=false  -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
  
  declare i32 @llvm.r600.read.tidig.x() #0
  
diff --git a/test/CodeGen/AMDGPU/ssubo.ll b/test/CodeGen/AMDGPU/ssubo.ll

index 135632343f9094054bee513c418526e785e5c086..d65c2adc7e202f0d2d1b925ae0f47c274862395a 100644 (file)
--- a/test/CodeGen/AMDGPU/ssubo.ll
+++ b/test/CodeGen/AMDGPU/ssubo.ll
@@ -1,6 +1,6 @@
-; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs< %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
-; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs< %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
-; RUN: llc -march=r600 -mcpu=cypress -verify-machineinstrs< %s
+; RUN:  llc -amdgpu-scalarize-global-loads=false  -march=amdgcn -mcpu=SI -verify-machineinstrs< %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN:  llc -amdgpu-scalarize-global-loads=false  -march=amdgcn -mcpu=tonga -verify-machineinstrs< %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN:  llc -amdgpu-scalarize-global-loads=false  -march=r600 -mcpu=cypress -verify-machineinstrs< %s
  
  declare { i32, i1 } @llvm.ssub.with.overflow.i32(i32, i32) nounwind readnone
  declare { i64, i1 } @llvm.ssub.with.overflow.i64(i64, i64) nounwind readnone
diff --git a/test/CodeGen/AMDGPU/sub.ll b/test/CodeGen/AMDGPU/sub.ll

index e7655df1552041bf75fed0c3ff58d30d49a62c0d..46f1b120f21277e13f9dffe82eb2503dc06a163c 100644 (file)
--- a/test/CodeGen/AMDGPU/sub.ll
+++ b/test/CodeGen/AMDGPU/sub.ll
@@ -1,5 +1,5 @@
-; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
-; RUN: llc -march=r600 -mcpu=redwood -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
+; RUN:  llc -amdgpu-scalarize-global-loads=false  -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN:  llc -amdgpu-scalarize-global-loads=false  -march=r600 -mcpu=redwood -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
  
  
  declare i32 @llvm.r600.read.tidig.x() readnone
diff --git a/test/CodeGen/AMDGPU/trunc-bitcast-vector.ll b/test/CodeGen/AMDGPU/trunc-bitcast-vector.ll

index f90040385f7532a43b710209fa819a5011b85769..77a6820713d6dc58dd3e04896be3608f1850b9c3 100644 (file)
--- a/test/CodeGen/AMDGPU/trunc-bitcast-vector.ll
+++ b/test/CodeGen/AMDGPU/trunc-bitcast-vector.ll
@@ -1,5 +1,5 @@
-; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck --check-prefix=SI %s
-; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck --check-prefix=VI %s
+; RUN:  llc -amdgpu-scalarize-global-loads=false  -march=amdgcn -verify-machineinstrs < %s | FileCheck --check-prefix=SI %s
+; RUN:  llc -amdgpu-scalarize-global-loads=false  -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck --check-prefix=VI %s
  
  ; CHECK-LABEL: {{^}}trunc_i64_bitcast_v2i32:
  ; CHECK: buffer_load_dword v
diff --git a/test/CodeGen/AMDGPU/trunc.ll b/test/CodeGen/AMDGPU/trunc.ll

index 0c91d52df0c086d5380e54b0b01245d243de9254..da038f4b05972fe0a597e340000552f450ad63b4 100644 (file)
--- a/test/CodeGen/AMDGPU/trunc.ll
+++ b/test/CodeGen/AMDGPU/trunc.ll
@@ -1,6 +1,6 @@
-; RUN: llc -march=amdgcn -verify-machineinstrs< %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
-; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs< %s | FileCheck -check-prefix=GCN -check-prefix=VI  %s
-; RUN: llc -march=r600 -mcpu=cypress < %s | FileCheck -check-prefix=EG %s
+; RUN:  llc -amdgpu-scalarize-global-loads=false  -march=amdgcn -verify-machineinstrs< %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
+; RUN:  llc -amdgpu-scalarize-global-loads=false  -march=amdgcn -mcpu=fiji -verify-machineinstrs< %s | FileCheck -check-prefix=GCN -check-prefix=VI  %s
+; RUN:  llc -amdgpu-scalarize-global-loads=false  -march=r600 -mcpu=cypress < %s | FileCheck -check-prefix=EG %s
  
  declare i32 @llvm.r600.read.tidig.x() nounwind readnone
  
diff --git a/test/CodeGen/AMDGPU/uaddo.ll b/test/CodeGen/AMDGPU/uaddo.ll

index 632ccaa7e612451dc4377bad9335ed5584d5ee2d..f6a72633d418668c50a94ddb1e03dcb9e7f0d935 100644 (file)
--- a/test/CodeGen/AMDGPU/uaddo.ll
+++ b/test/CodeGen/AMDGPU/uaddo.ll
@@ -1,6 +1,6 @@
-; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SI,FUNC %s
-; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,VI,FUNC %s
-; RUN: llc -march=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefixes=EG,FUNC %s
+; RUN:  llc -amdgpu-scalarize-global-loads=false  -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SI,FUNC %s
+; RUN:  llc -amdgpu-scalarize-global-loads=false  -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,VI,FUNC %s
+; RUN:  llc -amdgpu-scalarize-global-loads=false  -march=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefixes=EG,FUNC %s
  
  ; FUNC-LABEL: {{^}}s_uaddo_i64_zext:
  ; GCN: s_add_u32
diff --git a/test/CodeGen/AMDGPU/udiv.ll b/test/CodeGen/AMDGPU/udiv.ll

index d9dab0d40acf652702675959cd9be2531c891528..1d683776bfd5a76047bd92cd5ff83068d9fba18c 100644 (file)
--- a/test/CodeGen/AMDGPU/udiv.ll
+++ b/test/CodeGen/AMDGPU/udiv.ll
@@ -1,9 +1,9 @@
-; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
-; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs -mattr=-fp32-denormals < %s | FileCheck -check-prefix=SI -check-prefix=FUNC -check-prefix=VI %s
+; RUN:  llc -amdgpu-scalarize-global-loads=false  -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN:  llc -amdgpu-scalarize-global-loads=false  -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs -mattr=-fp32-denormals < %s | FileCheck -check-prefix=SI -check-prefix=FUNC -check-prefix=VI %s
  
-; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=fiji -mattr=+fp32-denormals < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
+; RUN:  llc -amdgpu-scalarize-global-loads=false  -mtriple=amdgcn--amdhsa -mcpu=fiji -mattr=+fp32-denormals < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
  
-; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
+; RUN:  llc -amdgpu-scalarize-global-loads=false  -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
  
  ; FUNC-LABEL: {{^}}udiv_i32:
  ; EG-NOT: SETGE_INT
diff --git a/test/CodeGen/AMDGPU/uitofp.f16.ll b/test/CodeGen/AMDGPU/uitofp.f16.ll

index 0c3b0fcaf85492d1a9cebd5938d877c449bf76a3..eaa1d073cafb479bf747e19f2512cd2579856be8 100644 (file)
--- a/test/CodeGen/AMDGPU/uitofp.f16.ll
+++ b/test/CodeGen/AMDGPU/uitofp.f16.ll
@@ -1,5 +1,5 @@
-; RUN: llc -march=amdgcn -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
-; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
+; RUN:  llc -amdgpu-scalarize-global-loads=false  -march=amdgcn -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
+; RUN:  llc -amdgpu-scalarize-global-loads=false  -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
  
  ; GCN-LABEL: {{^}}uitofp_i16_to_f16
  ; GCN: buffer_load_ushort v[[A_I16:[0-9]+]]
diff --git a/test/CodeGen/AMDGPU/urem.ll b/test/CodeGen/AMDGPU/urem.ll

index fb4eab43a2d66a660075688d6008e0ca8f565e71..823c918dcda7038cbb8394d63afa032cf4b5e95f 100644 (file)
--- a/test/CodeGen/AMDGPU/urem.ll
+++ b/test/CodeGen/AMDGPU/urem.ll
@@ -1,6 +1,6 @@
-; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
-; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
-; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
+; RUN:  llc -amdgpu-scalarize-global-loads=false  -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN:  llc -amdgpu-scalarize-global-loads=false  -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN:  llc -amdgpu-scalarize-global-loads=false  -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
  
  ; The code generated by urem is long and complex and may frequently
  ; change.  The goal of this test is to make sure the ISel doesn't fail
diff --git a/test/CodeGen/AMDGPU/usubo.ll b/test/CodeGen/AMDGPU/usubo.ll

index d1f454f0bc65551b1d3bd4039a4001c5f68b62a0..99c759a1787581ab84a82e50b84e5afc11002ec4 100644 (file)
--- a/test/CodeGen/AMDGPU/usubo.ll
+++ b/test/CodeGen/AMDGPU/usubo.ll
@@ -1,6 +1,6 @@
-; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SI,FUNC %s
-; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,VI,FUNC %s
-; RUN: llc -march=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefixes=EG,FUNC %s
+; RUN:  llc -amdgpu-scalarize-global-loads=false  -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SI,FUNC %s
+; RUN:  llc -amdgpu-scalarize-global-loads=false  -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,VI,FUNC %s
+; RUN:  llc -amdgpu-scalarize-global-loads=false  -march=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefixes=EG,FUNC %s
  
  ; FUNC-LABEL: {{^}}s_usubo_i64_zext:
  ; GCN: s_sub_u32
diff --git a/test/CodeGen/AMDGPU/v_mac.ll b/test/CodeGen/AMDGPU/v_mac.ll

index 2b96f7d50076a396ccd9bf1470fa66366d3c7c43..5f3c6a25cdf7988acb06dee1031983b46f7816ce 100644 (file)
--- a/test/CodeGen/AMDGPU/v_mac.ll
+++ b/test/CodeGen/AMDGPU/v_mac.ll
@@ -1,6 +1,6 @@
-; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN %s
-; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-fp64-fp16-denormals,-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=VI-FLUSH -check-prefix=GCN %s
-; RUN: llc -march=amdgcn -mcpu=tonga -mattr=+fp64-fp16-denormals,-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=VI-DENORM -check-prefix=GCN %s
+; RUN:  llc -amdgpu-scalarize-global-loads=false  -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN %s
+; RUN:  llc -amdgpu-scalarize-global-loads=false  -march=amdgcn -mcpu=tonga -mattr=-fp64-fp16-denormals,-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=VI-FLUSH -check-prefix=GCN %s
+; RUN:  llc -amdgpu-scalarize-global-loads=false  -march=amdgcn -mcpu=tonga -mattr=+fp64-fp16-denormals,-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=VI-DENORM -check-prefix=GCN %s
  
  ; GCN-LABEL: {{^}}mac_vvv:
  ; GCN: buffer_load_dword [[A:v[0-9]+]], off, s[{{[0-9]+:[0-9]+}}], 0{{$}}
diff --git a/test/CodeGen/AMDGPU/v_mac_f16.ll b/test/CodeGen/AMDGPU/v_mac_f16.ll

index ce4a69db350607300d857220fb26c61db8276ba7..3f18b95eb078bf3b966d85d25992fc596464fa1b 100644 (file)
--- a/test/CodeGen/AMDGPU/v_mac_f16.ll
+++ b/test/CodeGen/AMDGPU/v_mac_f16.ll
@@ -1,5 +1,5 @@
-; RUN: llc -march=amdgcn -mattr=-fp64-fp16-denormals -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
-; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-fp64-fp16-denormals,-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
+; RUN:  llc -amdgpu-scalarize-global-loads=false  -march=amdgcn -mattr=-fp64-fp16-denormals -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
+; RUN:  llc -amdgpu-scalarize-global-loads=false  -march=amdgcn -mcpu=fiji -mattr=-fp64-fp16-denormals,-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
  
  ; GCN-LABEL: {{^}}mac_f16:
  ; GCN: {{buffer|flat}}_load_ushort v[[A_F16:[0-9]+]]
diff --git a/test/CodeGen/AMDGPU/vectorize-global-local.ll b/test/CodeGen/AMDGPU/vectorize-global-local.ll

index 90cf34e609f6ec13def5e491769d687ad2081c18..381ff5b1b518a1ca0ee9d3ca77c51b5de6b6a863 100644 (file)
--- a/test/CodeGen/AMDGPU/vectorize-global-local.ll
+++ b/test/CodeGen/AMDGPU/vectorize-global-local.ll
@@ -1,4 +1,4 @@
-; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck %s
+; RUN:  llc -amdgpu-scalarize-global-loads=false  -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck %s
  ; CHECK-DAG: flat_load_dwordx4
  ; CHECK-DAG: flat_load_dwordx4
  ; CHECK-DAG: flat_load_dwordx4
diff --git a/test/CodeGen/AMDGPU/xor.ll b/test/CodeGen/AMDGPU/xor.ll

index 57a082a0170c35861bb3b9f5d1c2149783481f3b..db3e42a138e11ad71441ca0838a49271b5f2aac6 100644 (file)
--- a/test/CodeGen/AMDGPU/xor.ll
+++ b/test/CodeGen/AMDGPU/xor.ll
@@ -1,6 +1,6 @@
-; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
-; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
-; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
+; RUN:  llc -amdgpu-scalarize-global-loads=false  -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN:  llc -amdgpu-scalarize-global-loads=false  -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN:  llc -amdgpu-scalarize-global-loads=false  -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
  
  
  ; FUNC-LABEL: {{^}}xor_v2i32:
author	Alexander Timofeev <Alexander.Timofeev@amd.com>
	Mon, 3 Jul 2017 14:54:11 +0000 (14:54 +0000)
committer	Alexander Timofeev <Alexander.Timofeev@amd.com>
	Mon, 3 Jul 2017 14:54:11 +0000 (14:54 +0000)
lib/Target/AMDGPU/AMDGPUTargetMachine.cpp		patch \| blob \| history
test/CodeGen/AMDGPU/add.ll		patch \| blob \| history
test/CodeGen/AMDGPU/add_i128.ll		patch \| blob \| history
test/CodeGen/AMDGPU/add_i64.ll		patch \| blob \| history
test/CodeGen/AMDGPU/and-gcn.ll		patch \| blob \| history
test/CodeGen/AMDGPU/and.ll		patch \| blob \| history
test/CodeGen/AMDGPU/any_extend_vector_inreg.ll		patch \| blob \| history
test/CodeGen/AMDGPU/bitreverse.ll		patch \| blob \| history
test/CodeGen/AMDGPU/bswap.ll		patch \| blob \| history
test/CodeGen/AMDGPU/cgp-addressing-modes-flat.ll		patch \| blob \| history
test/CodeGen/AMDGPU/cgp-addressing-modes.ll		patch \| blob \| history
test/CodeGen/AMDGPU/coalescer_remat.ll		patch \| blob \| history
test/CodeGen/AMDGPU/copy-illegal-type.ll		patch \| blob \| history
test/CodeGen/AMDGPU/ctlz.ll		patch \| blob \| history
test/CodeGen/AMDGPU/ctlz_zero_undef.ll		patch \| blob \| history
test/CodeGen/AMDGPU/ctpop.ll		patch \| blob \| history
test/CodeGen/AMDGPU/ctpop64.ll		patch \| blob \| history
test/CodeGen/AMDGPU/cttz_zero_undef.ll		patch \| blob \| history
test/CodeGen/AMDGPU/cvt_f32_ubyte.ll		patch \| blob \| history
test/CodeGen/AMDGPU/early-if-convert-cost.ll		patch \| blob \| history
test/CodeGen/AMDGPU/early-if-convert.ll		patch \| blob \| history
test/CodeGen/AMDGPU/enable-no-signed-zeros-fp-math.ll		patch \| blob \| history
test/CodeGen/AMDGPU/extractelt-to-trunc.ll		patch \| blob \| history
test/CodeGen/AMDGPU/fabs.f16.ll		patch \| blob \| history
test/CodeGen/AMDGPU/fadd.f16.ll		patch \| blob \| history
test/CodeGen/AMDGPU/fadd64.ll		patch \| blob \| history
test/CodeGen/AMDGPU/fcanonicalize.f16.ll		patch \| blob \| history
test/CodeGen/AMDGPU/fcanonicalize.ll		patch \| blob \| history
test/CodeGen/AMDGPU/fcmp.f16.ll		patch \| blob \| history
test/CodeGen/AMDGPU/fcmp64.ll		patch \| blob \| history
test/CodeGen/AMDGPU/fconst64.ll		patch \| blob \| history
test/CodeGen/AMDGPU/fcopysign.f16.ll		patch \| blob \| history
test/CodeGen/AMDGPU/fma-combine.ll		patch \| blob \| history
test/CodeGen/AMDGPU/fma.f64.ll		patch \| blob \| history
test/CodeGen/AMDGPU/fma.ll		patch \| blob \| history
test/CodeGen/AMDGPU/fmul.f16.ll		patch \| blob \| history
test/CodeGen/AMDGPU/fmul64.ll		patch \| blob \| history
test/CodeGen/AMDGPU/fmuladd.f32.ll		patch \| blob \| history
test/CodeGen/AMDGPU/fmuladd.f64.ll		patch \| blob \| history
test/CodeGen/AMDGPU/fmuladd.v2f16.ll		patch \| blob \| history
test/CodeGen/AMDGPU/fneg-fabs.ll		patch \| blob \| history
test/CodeGen/AMDGPU/fneg.f16.ll		patch \| blob \| history
test/CodeGen/AMDGPU/fp32_to_fp16.ll		patch \| blob \| history
test/CodeGen/AMDGPU/fpext.f16.ll		patch \| blob \| history
test/CodeGen/AMDGPU/fptosi.f16.ll		patch \| blob \| history
test/CodeGen/AMDGPU/fptoui.f16.ll		patch \| blob \| history
test/CodeGen/AMDGPU/fptrunc.f16.ll		patch \| blob \| history
test/CodeGen/AMDGPU/fract.f64.ll		patch \| blob \| history
test/CodeGen/AMDGPU/fract.ll		patch \| blob \| history
test/CodeGen/AMDGPU/frem.ll		patch \| blob \| history
test/CodeGen/AMDGPU/fsqrt.f64.ll		patch \| blob \| history
test/CodeGen/AMDGPU/fsqrt.ll		patch \| blob \| history
test/CodeGen/AMDGPU/fsub.f16.ll		patch \| blob \| history
test/CodeGen/AMDGPU/fsub.ll		patch \| blob \| history
test/CodeGen/AMDGPU/fsub64.ll		patch \| blob \| history
test/CodeGen/AMDGPU/ftrunc.f64.ll		patch \| blob \| history
test/CodeGen/AMDGPU/global-extload-i16.ll		patch \| blob \| history
test/CodeGen/AMDGPU/half.ll		patch \| blob \| history
test/CodeGen/AMDGPU/imm.ll		patch \| blob \| history
test/CodeGen/AMDGPU/immv216.ll		patch \| blob \| history
test/CodeGen/AMDGPU/indirect-addressing-si.ll		patch \| blob \| history
test/CodeGen/AMDGPU/invariant-load-no-alias-store.ll		patch \| blob \| history
test/CodeGen/AMDGPU/llvm.amdgcn.class.f16.ll		patch \| blob \| history
test/CodeGen/AMDGPU/llvm.amdgcn.class.ll		patch \| blob \| history
test/CodeGen/AMDGPU/llvm.amdgcn.div.fmas.ll		patch \| blob \| history
test/CodeGen/AMDGPU/llvm.amdgcn.ldexp.f16.ll		patch \| blob \| history
test/CodeGen/AMDGPU/llvm.amdgcn.sbfe.ll		patch \| blob \| history
test/CodeGen/AMDGPU/llvm.amdgcn.sffbh.ll		patch \| blob \| history
test/CodeGen/AMDGPU/llvm.amdgcn.trig.preop.ll		patch \| blob \| history
test/CodeGen/AMDGPU/llvm.amdgcn.ubfe.ll		patch \| blob \| history
test/CodeGen/AMDGPU/llvm.ceil.f16.ll		patch \| blob \| history
test/CodeGen/AMDGPU/llvm.cos.f16.ll		patch \| blob \| history
test/CodeGen/AMDGPU/llvm.exp2.f16.ll		patch \| blob \| history
test/CodeGen/AMDGPU/llvm.floor.f16.ll		patch \| blob \| history
test/CodeGen/AMDGPU/llvm.fma.f16.ll		patch \| blob \| history
test/CodeGen/AMDGPU/llvm.fmuladd.f16.ll		patch \| blob \| history
test/CodeGen/AMDGPU/llvm.log2.f16.ll		patch \| blob \| history
test/CodeGen/AMDGPU/llvm.maxnum.f16.ll		patch \| blob \| history
test/CodeGen/AMDGPU/llvm.minnum.f16.ll		patch \| blob \| history
test/CodeGen/AMDGPU/llvm.rint.f16.ll		patch \| blob \| history
test/CodeGen/AMDGPU/llvm.sin.f16.ll		patch \| blob \| history
test/CodeGen/AMDGPU/llvm.sqrt.f16.ll		patch \| blob \| history
test/CodeGen/AMDGPU/llvm.trunc.f16.ll		patch \| blob \| history
test/CodeGen/AMDGPU/load-global-f32.ll		patch \| blob \| history
test/CodeGen/AMDGPU/load-global-f64.ll		patch \| blob \| history
test/CodeGen/AMDGPU/load-global-i16.ll		patch \| blob \| history
test/CodeGen/AMDGPU/load-global-i32.ll		patch \| blob \| history
test/CodeGen/AMDGPU/load-global-i64.ll		patch \| blob \| history
test/CodeGen/AMDGPU/load-global-i8.ll		patch \| blob \| history
test/CodeGen/AMDGPU/load-weird-sizes.ll		patch \| blob \| history
test/CodeGen/AMDGPU/max.ll		patch \| blob \| history
test/CodeGen/AMDGPU/merge-stores.ll		patch \| blob \| history
test/CodeGen/AMDGPU/mubuf.ll		patch \| blob \| history
test/CodeGen/AMDGPU/mul.ll		patch \| blob \| history
test/CodeGen/AMDGPU/no-shrink-extloads.ll		patch \| blob \| history
test/CodeGen/AMDGPU/or.ll		patch \| blob \| history
test/CodeGen/AMDGPU/promote-alloca-invariant-markers.ll		patch \| blob \| history
test/CodeGen/AMDGPU/reduce-load-width-alignment.ll		patch \| blob \| history
test/CodeGen/AMDGPU/reorder-stores.ll		patch \| blob \| history
test/CodeGen/AMDGPU/rotl.i64.ll		patch \| blob \| history
test/CodeGen/AMDGPU/rotr.i64.ll		patch \| blob \| history
test/CodeGen/AMDGPU/rsq.ll		patch \| blob \| history
test/CodeGen/AMDGPU/s_movk_i32.ll		patch \| blob \| history
test/CodeGen/AMDGPU/saddo.ll		patch \| blob \| history
test/CodeGen/AMDGPU/salu-to-valu.ll		patch \| blob \| history
test/CodeGen/AMDGPU/scalar_to_vector.ll		patch \| blob \| history
test/CodeGen/AMDGPU/schedule-global-loads.ll		patch \| blob \| history
test/CodeGen/AMDGPU/scratch-buffer.ll		patch \| blob \| history
test/CodeGen/AMDGPU/sdiv.ll		patch \| blob \| history
test/CodeGen/AMDGPU/sdwa-peephole.ll		patch \| blob \| history
test/CodeGen/AMDGPU/select-vectors.ll		patch \| blob \| history
test/CodeGen/AMDGPU/select.f16.ll		patch \| blob \| history
test/CodeGen/AMDGPU/sext-in-reg.ll		patch \| blob \| history
test/CodeGen/AMDGPU/sgpr-copy-duplicate-operand.ll		patch \| blob \| history
test/CodeGen/AMDGPU/sgpr-copy.ll		patch \| blob \| history
test/CodeGen/AMDGPU/shift-and-i128-ubfe.ll		patch \| blob \| history
test/CodeGen/AMDGPU/shift-and-i64-ubfe.ll		patch \| blob \| history
test/CodeGen/AMDGPU/shift-i64-opts.ll		patch \| blob \| history
test/CodeGen/AMDGPU/shl.ll		patch \| blob \| history
test/CodeGen/AMDGPU/si-triv-disjoint-mem-access.ll		patch \| blob \| history
test/CodeGen/AMDGPU/sign_extend.ll		patch \| blob \| history
test/CodeGen/AMDGPU/sitofp.f16.ll		patch \| blob \| history
test/CodeGen/AMDGPU/sminmax.ll		patch \| blob \| history
test/CodeGen/AMDGPU/sminmax.v2i16.ll		patch \| blob \| history
test/CodeGen/AMDGPU/spill-cfg-position.ll		patch \| blob \| history
test/CodeGen/AMDGPU/sra.ll		patch \| blob \| history
test/CodeGen/AMDGPU/srem.ll		patch \| blob \| history
test/CodeGen/AMDGPU/srl.ll		patch \| blob \| history
test/CodeGen/AMDGPU/ssubo.ll		patch \| blob \| history
test/CodeGen/AMDGPU/sub.ll		patch \| blob \| history
test/CodeGen/AMDGPU/trunc-bitcast-vector.ll		patch \| blob \| history
test/CodeGen/AMDGPU/trunc.ll		patch \| blob \| history
test/CodeGen/AMDGPU/uaddo.ll		patch \| blob \| history
test/CodeGen/AMDGPU/udiv.ll		patch \| blob \| history
test/CodeGen/AMDGPU/uitofp.f16.ll		patch \| blob \| history
test/CodeGen/AMDGPU/urem.ll		patch \| blob \| history
test/CodeGen/AMDGPU/usubo.ll		patch \| blob \| history
test/CodeGen/AMDGPU/v_mac.ll		patch \| blob \| history
test/CodeGen/AMDGPU/v_mac_f16.ll		patch \| blob \| history
test/CodeGen/AMDGPU/vectorize-global-local.ll		patch \| blob \| history
test/CodeGen/AMDGPU/xor.ll		patch \| blob \| history