From 138d42906522a5fff55a52154d245b5fc5fac9b3 Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Wed, 22 Feb 2017 21:05:25 +0000 Subject: [PATCH] AMDGPU: Always allocate emergency stack slot at offset 0 This allows us to ensure that 0 is never a valid pointer to a user object, and ensures that the offset is always legal without needing a register to access it. This comes at the cost of usable offsets and wasted stack space. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@295877 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Target/AMDGPU/SIFrameLowering.cpp | 24 ++++-- test/CodeGen/AMDGPU/amdgpu.private-memory.ll | 12 +-- test/CodeGen/AMDGPU/array-ptr-calc-i32.ll | 6 +- test/CodeGen/AMDGPU/captured-frame-index.ll | 50 +++++------ test/CodeGen/AMDGPU/cgp-addressing-modes.ll | 45 +++++++++- test/CodeGen/AMDGPU/commute-compares.ll | 2 +- .../AMDGPU/control-flow-fastregalloc.ll | 32 ++++--- test/CodeGen/AMDGPU/extload-private.ll | 8 +- test/CodeGen/AMDGPU/insert_vector_elt.ll | 52 ++++++------ .../CodeGen/AMDGPU/kernarg-stack-alignment.ll | 10 +-- test/CodeGen/AMDGPU/local-stack-slot-bug.ll | 7 +- test/CodeGen/AMDGPU/private-element-size.ll | 84 +++++++++---------- test/CodeGen/AMDGPU/scratch-buffer.ll | 8 +- test/CodeGen/AMDGPU/si-spill-sgpr-stack.ll | 6 +- test/CodeGen/AMDGPU/spill-m0.ll | 26 +++--- test/CodeGen/AMDGPU/spill-wide-sgpr.ll | 16 ++-- ...vgpr-spill-emergency-stack-slot-compute.ll | 4 +- .../AMDGPU/vgpr-spill-emergency-stack-slot.ll | 2 +- test/CodeGen/AMDGPU/wqm.ll | 2 +- 19 files changed, 222 insertions(+), 174 deletions(-) diff --git a/lib/Target/AMDGPU/SIFrameLowering.cpp b/lib/Target/AMDGPU/SIFrameLowering.cpp index 03de4bf508f..80ba15a64a5 100644 --- a/lib/Target/AMDGPU/SIFrameLowering.cpp +++ b/lib/Target/AMDGPU/SIFrameLowering.cpp @@ -391,11 +391,25 @@ void SIFrameLowering::processFunctionBeforeFrameFinalized( if (!MFI.hasStackObjects()) return; - assert(RS && "RegScavenger required if spilling"); - int ScavengeFI = MFI.CreateStackObject( - AMDGPU::SGPR_32RegClass.getSize(), - AMDGPU::SGPR_32RegClass.getAlignment(), false); - RS->addScavengingFrameIndex(ScavengeFI); + bool MayNeedScavengingEmergencySlot = MFI.hasStackObjects(); + if (MayNeedScavengingEmergencySlot) { + // We force this to be at offset 0 so no user object ever has 0 as an + // address, so we may use 0 as an invalid pointer value. This is because + // LLVM assumes 0 is an invalid pointer in address space 0. Because alloca + // is required to be address space 0, we are forced to accept this for + // now. Ideally we could have the stack in another address space with 0 as a + // valid pointer, and -1 as the null value. + // + // This will also waste additional space when user stack objects require > 4 + // byte alignment. + // + // The main cost here is losing the offset for addressing modes. However + // this also ensures we shouldn't need a register for the offset when + // emergency scavenging. + int ScavengeFI = MFI.CreateFixedObject( + AMDGPU::SGPR_32RegClass.getSize(), 0, false); + RS->addScavengingFrameIndex(ScavengeFI); + } const SISubtarget &ST = MF.getSubtarget(); const SIInstrInfo *TII = ST.getInstrInfo(); diff --git a/test/CodeGen/AMDGPU/amdgpu.private-memory.ll b/test/CodeGen/AMDGPU/amdgpu.private-memory.ll index ef2ca647cc3..f76b94cd9a0 100644 --- a/test/CodeGen/AMDGPU/amdgpu.private-memory.ll +++ b/test/CodeGen/AMDGPU/amdgpu.private-memory.ll @@ -227,8 +227,8 @@ for.end: ; R600: MOVA_INT -; SI-ALLOCA-DAG: buffer_store_short v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} ; encoding: [0x00,0x00,0x68,0xe0 -; SI-ALLOCA-DAG: buffer_store_short v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offset:2 ; encoding: [0x02,0x00,0x68,0xe0 +; SI-ALLOCA-DAG: buffer_store_short v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offset:6 ; encoding: [0x06,0x00,0x68,0xe0 +; SI-ALLOCA-DAG: buffer_store_short v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offset:4 ; encoding: [0x04,0x00,0x68,0xe0 ; Loaded value is 0 or 1, so sext will become zext, so we get buffer_load_ushort instead of buffer_load_sshort. ; SI-ALLOCA: buffer_load_sshort v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} @@ -253,11 +253,11 @@ entry: ; R600: MOVA_INT -; SI-PROMOTE-DAG: buffer_store_byte v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} ; encoding: -; SI-PROMOTE-DAG: buffer_store_byte v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offset:1 ; encoding: +; SI-PROMOTE-DAG: buffer_store_byte v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offset:4 ; encoding: +; SI-PROMOTE-DAG: buffer_store_byte v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offset:5 ; encoding: -; SI-ALLOCA-DAG: buffer_store_byte v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} ; encoding: [0x00,0x00,0x60,0xe0 -; SI-ALLOCA-DAG: buffer_store_byte v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offset:1 ; encoding: [0x01,0x00,0x60,0xe0 +; SI-ALLOCA-DAG: buffer_store_byte v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offset:4 ; encoding: [0x04,0x00,0x60,0xe0 +; SI-ALLOCA-DAG: buffer_store_byte v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offset:5 ; encoding: [0x05,0x00,0x60,0xe0 define void @char_array(i32 addrspace(1)* %out, i32 %index) #0 { entry: %0 = alloca [2 x i8] diff --git a/test/CodeGen/AMDGPU/array-ptr-calc-i32.ll b/test/CodeGen/AMDGPU/array-ptr-calc-i32.ll index f190bd0cb01..7dfe342a2e9 100644 --- a/test/CodeGen/AMDGPU/array-ptr-calc-i32.ll +++ b/test/CodeGen/AMDGPU/array-ptr-calc-i32.ll @@ -12,11 +12,7 @@ declare void @llvm.amdgcn.s.barrier() #2 ; SI-LABEL: {{^}}test_private_array_ptr_calc: -; FIXME: We end up with zero argument for ADD, because -; SIRegisterInfo::eliminateFrameIndex() blindly replaces the frame index -; with the appropriate offset. We should fold this into the store. - -; SI-ALLOCA: v_add_i32_e32 [[PTRREG:v[0-9]+]], vcc, 0, v{{[0-9]+}} +; SI-ALLOCA: v_add_i32_e32 [[PTRREG:v[0-9]+]], vcc, 16, v{{[0-9]+}} ; SI-ALLOCA: buffer_store_dword {{v[0-9]+}}, [[PTRREG]], s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offen offset:64 ; SI-ALLOCA: s_barrier ; SI-ALLOCA: buffer_load_dword {{v[0-9]+}}, [[PTRREG]], s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offen offset:64 diff --git a/test/CodeGen/AMDGPU/captured-frame-index.ll b/test/CodeGen/AMDGPU/captured-frame-index.ll index 49af159581f..e2e4f164fe8 100644 --- a/test/CodeGen/AMDGPU/captured-frame-index.ll +++ b/test/CodeGen/AMDGPU/captured-frame-index.ll @@ -1,7 +1,7 @@ ; RUN: llc -mtriple=amdgcn-- -mattr=-promote-alloca -amdgpu-sroa=0 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s ; GCN-LABEL: {{^}}store_fi_lifetime: -; GCN: v_mov_b32_e32 [[FI:v[0-9]+]], 0{{$}} +; GCN: v_mov_b32_e32 [[FI:v[0-9]+]], 4{{$}} ; GCN: buffer_store_dword [[FI]] define void @store_fi_lifetime(i32 addrspace(1)* %out, i32 %in) #0 { entry: @@ -15,7 +15,7 @@ entry: ; GCN-LABEL: {{^}}stored_fi_to_lds: ; GCN: s_load_dword [[LDSPTR:s[0-9]+]] ; GCN: buffer_store_dword v{{[0-9]+}}, off, -; GCN: v_mov_b32_e32 [[ZERO0:v[0-9]+]], 0{{$}} +; GCN: v_mov_b32_e32 [[ZERO0:v[0-9]+]], 4{{$}} ; GCN: v_mov_b32_e32 [[VLDSPTR:v[0-9]+]], [[LDSPTR]] ; GCN: ds_write_b32 [[VLDSPTR]], [[ZERO0]] define void @stored_fi_to_lds(float* addrspace(3)* %ptr) #0 { @@ -27,16 +27,16 @@ define void @stored_fi_to_lds(float* addrspace(3)* %ptr) #0 { ; Offset is applied ; GCN-LABEL: {{^}}stored_fi_to_lds_2_small_objects: -; GCN-DAG: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0{{$}} -; GCN-DAG: buffer_store_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}}{{$}} +; GCN-DAG: v_mov_b32_e32 [[ZERO:v[0-9]+]], 4{{$}} ; GCN-DAG: buffer_store_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:4{{$}} +; GCN-DAG: buffer_store_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:8{{$}} ; GCN-DAG: s_load_dword [[LDSPTR:s[0-9]+]] ; GCN-DAG: v_mov_b32_e32 [[VLDSPTR:v[0-9]+]], [[LDSPTR]] ; GCN: ds_write_b32 [[VLDSPTR]], [[ZERO]] -; GCN-DAG: v_mov_b32_e32 [[FI1:v[0-9]+]], 4{{$}} +; GCN-DAG: v_mov_b32_e32 [[FI1:v[0-9]+]], 8{{$}} ; GCN: ds_write_b32 [[VLDSPTR]], [[FI1]] define void @stored_fi_to_lds_2_small_objects(float* addrspace(3)* %ptr) #0 { %tmp0 = alloca float @@ -51,9 +51,9 @@ define void @stored_fi_to_lds_2_small_objects(float* addrspace(3)* %ptr) #0 { ; Same frame index is used multiple times in the store ; GCN-LABEL: {{^}}stored_fi_to_self: ; GCN-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0x4d2{{$}} -; GCN: buffer_store_dword [[K]], off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}}{{$}} -; GCN-DAG: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0{{$}} -; GCN: buffer_store_dword [[ZERO]], off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}}{{$}} +; GCN: buffer_store_dword [[K]], off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:4{{$}} +; GCN-DAG: v_mov_b32_e32 [[ZERO:v[0-9]+]], 4{{$}} +; GCN: buffer_store_dword [[ZERO]], off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:4{{$}} define void @stored_fi_to_self() #0 { %tmp = alloca i32* @@ -66,13 +66,13 @@ define void @stored_fi_to_self() #0 { ; GCN-LABEL: {{^}}stored_fi_to_self_offset: ; GCN-DAG: v_mov_b32_e32 [[K0:v[0-9]+]], 32{{$}} -; GCN: buffer_store_dword [[K0]], off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}}{{$}} +; GCN: buffer_store_dword [[K0]], off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:4{{$}} ; GCN-DAG: v_mov_b32_e32 [[K1:v[0-9]+]], 0x4d2{{$}} -; GCN: buffer_store_dword [[K1]], off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:2048{{$}} +; GCN: buffer_store_dword [[K1]], off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:2052{{$}} -; GCN: v_mov_b32_e32 [[OFFSETK:v[0-9]+]], 0x800{{$}} -; GCN: buffer_store_dword [[OFFSETK]], off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:2048{{$}} +; GCN: v_mov_b32_e32 [[OFFSETK:v[0-9]+]], 0x804{{$}} +; GCN: buffer_store_dword [[OFFSETK]], off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:2052{{$}} define void @stored_fi_to_self_offset() #0 { %tmp0 = alloca [512 x i32] %tmp1 = alloca i32* @@ -89,15 +89,15 @@ define void @stored_fi_to_self_offset() #0 { } ; GCN-LABEL: {{^}}stored_fi_to_fi: -; GCN: buffer_store_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}}{{$}} ; GCN: buffer_store_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:4{{$}} ; GCN: buffer_store_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:8{{$}} +; GCN: buffer_store_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:12{{$}} -; GCN: v_mov_b32_e32 [[FI1:v[0-9]+]], 4{{$}} -; GCN: buffer_store_dword [[FI1]], off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:8{{$}} +; GCN: v_mov_b32_e32 [[FI1:v[0-9]+]], 8{{$}} +; GCN: buffer_store_dword [[FI1]], off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:12{{$}} -; GCN: v_mov_b32_e32 [[FI2:v[0-9]+]], 8{{$}} -; GCN: buffer_store_dword [[FI2]], off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:4{{$}} +; GCN: v_mov_b32_e32 [[FI2:v[0-9]+]], 12{{$}} +; GCN: buffer_store_dword [[FI2]], off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:8{{$}} define void @stored_fi_to_fi() #0 { %tmp0 = alloca i32* %tmp1 = alloca i32* @@ -115,8 +115,8 @@ define void @stored_fi_to_fi() #0 { } ; GCN-LABEL: {{^}}stored_fi_to_global: -; GCN: buffer_store_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+$}} -; GCN: v_mov_b32_e32 [[FI:v[0-9]+]], 0{{$}} +; GCN: buffer_store_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:4{{$}} +; GCN: v_mov_b32_e32 [[FI:v[0-9]+]], 4{{$}} ; GCN: buffer_store_dword [[FI]] define void @stored_fi_to_global(float* addrspace(1)* %ptr) #0 { %tmp = alloca float @@ -127,14 +127,14 @@ define void @stored_fi_to_global(float* addrspace(1)* %ptr) #0 { ; Offset is applied ; GCN-LABEL: {{^}}stored_fi_to_global_2_small_objects: -; GCN: buffer_store_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}}{{$}} ; GCN: buffer_store_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:4{{$}} ; GCN: buffer_store_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:8{{$}} +; GCN: buffer_store_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:12{{$}} -; GCN: v_mov_b32_e32 [[FI1:v[0-9]+]], 4{{$}} +; GCN: v_mov_b32_e32 [[FI1:v[0-9]+]], 8{{$}} ; GCN: buffer_store_dword [[FI1]], off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}} -; GCN-DAG: v_mov_b32_e32 [[FI2:v[0-9]+]], 8{{$}} +; GCN-DAG: v_mov_b32_e32 [[FI2:v[0-9]+]], 12{{$}} ; GCN: buffer_store_dword [[FI2]], off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}} define void @stored_fi_to_global_2_small_objects(float* addrspace(1)* %ptr) #0 { %tmp0 = alloca float @@ -150,10 +150,10 @@ define void @stored_fi_to_global_2_small_objects(float* addrspace(1)* %ptr) #0 { ; GCN-LABEL: {{^}}stored_fi_to_global_huge_frame_offset: ; GCN: v_mov_b32_e32 [[BASE_0:v[0-9]+]], 0{{$}} -; GCN: buffer_store_dword [[BASE_0]], off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+$}} +; GCN: buffer_store_dword [[BASE_0]], off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:4{{$}} ; FIXME: Re-initialize -; GCN: v_mov_b32_e32 [[BASE_0_1:v[0-9]+]], 0{{$}} +; GCN: v_mov_b32_e32 [[BASE_0_1:v[0-9]+]], 4{{$}} ; GCN-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0x3e7{{$}} ; GCN-DAG: v_add_i32_e32 [[BASE_1_OFF_1:v[0-9]+]], vcc, 0x3ffc, [[BASE_0_1]] @@ -184,7 +184,7 @@ define void @stored_fi_to_global_huge_frame_offset(i32* addrspace(1)* %ptr) #0 { ; GCN: s_getpc_b64 s{{\[}}[[PC_LO:[0-9]+]]:[[PC_HI:[0-9]+]]{{\]}} ; GCN: s_add_u32 s{{[0-9]+}}, s[[PC_LO]], g1@gotpcrel32@lo+4 ; GCN: s_addc_u32 s{{[0-9]+}}, s[[PC_HI]], g1@gotpcrel32@hi+4 -; GCN: v_mov_b32_e32 [[FI:v[0-9]+]], 0{{$}} +; GCN: v_mov_b32_e32 [[FI:v[0-9]+]], 4{{$}} ; GCN: buffer_store_dword [[FI]] define void @cannot_select_assertzext_valuetype(i32 addrspace(1)* %out, i32 %idx) #0 { entry: diff --git a/test/CodeGen/AMDGPU/cgp-addressing-modes.ll b/test/CodeGen/AMDGPU/cgp-addressing-modes.ll index 2ed2857ff34..12c6b8ea969 100644 --- a/test/CodeGen/AMDGPU/cgp-addressing-modes.ll +++ b/test/CodeGen/AMDGPU/cgp-addressing-modes.ll @@ -130,6 +130,47 @@ done: ; GCN: buffer_load_dword {{v[0-9]+}}, off, {{s\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offset:4092{{$}} ; GCN: {{^}}BB4_2: define void @test_sink_scratch_small_offset_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %arg) { +entry: + %alloca = alloca [512 x i32], align 4 + %out.gep.0 = getelementptr i32, i32 addrspace(1)* %out, i64 999998 + %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i64 999999 + %add.arg = add i32 %arg, 8 + %alloca.gep = getelementptr [512 x i32], [512 x i32]* %alloca, i32 0, i32 1022 + %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0 + %tmp0 = icmp eq i32 %tid, 0 + br i1 %tmp0, label %endif, label %if + +if: + store volatile i32 123, i32* %alloca.gep + %tmp1 = load volatile i32, i32* %alloca.gep + br label %endif + +endif: + %x = phi i32 [ %tmp1, %if ], [ 0, %entry ] + store i32 %x, i32 addrspace(1)* %out.gep.0 + %load = load volatile i32, i32* %alloca.gep + store i32 %load, i32 addrspace(1)* %out.gep.1 + br label %done + +done: + ret void +} + +; This ends up not fitting due to the reserved 4 bytes at offset 0 +; OPT-LABEL: @test_sink_scratch_small_offset_i32_reserved( +; OPT-NOT: getelementptr [512 x i32] +; OPT: br i1 +; OPT: ptrtoint + +; GCN-LABEL: {{^}}test_sink_scratch_small_offset_i32_reserved: +; GCN: s_and_saveexec_b64 +; GCN: v_mov_b32_e32 [[BASE_FI0:v[0-9]+]], 4 +; GCN: buffer_store_dword {{v[0-9]+}}, [[BASE_FI0]], {{s\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offen offset:4092{{$}} +; GCN: v_mov_b32_e32 [[BASE_FI1:v[0-9]+]], 4 +; GCN: buffer_load_dword {{v[0-9]+}}, [[BASE_FI1]], {{s\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offen offset:4092{{$}} +; GCN: {{^BB[0-9]+}}_2: + +define void @test_sink_scratch_small_offset_i32_reserved(i32 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %arg) { entry: %alloca = alloca [512 x i32], align 4 %out.gep.0 = getelementptr i32, i32 addrspace(1)* %out, i64 999998 @@ -165,7 +206,7 @@ done: ; GCN: s_and_saveexec_b64 ; GCN: buffer_store_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offen{{$}} ; GCN: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offen{{$}} -; GCN: {{^}}BB5_2: +; GCN: {{^BB[0-9]+}}_2: define void @test_no_sink_scratch_large_offset_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %arg) { entry: %alloca = alloca [512 x i32], align 4 @@ -197,7 +238,7 @@ done: ; GCN: s_and_saveexec_b64 ; CI: buffer_load_dword {{v[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} ; VI: flat_load_dword v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}] -; GCN: {{^}}BB6_2: +; GCN: {{^BB[0-9]+}}_2: define void @test_sink_global_vreg_sreg_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %offset) { entry: %offset.ext = zext i32 %offset to i64 diff --git a/test/CodeGen/AMDGPU/commute-compares.ll b/test/CodeGen/AMDGPU/commute-compares.ll index a4c51b233f4..43a0e58a5ed 100644 --- a/test/CodeGen/AMDGPU/commute-compares.ll +++ b/test/CodeGen/AMDGPU/commute-compares.ll @@ -701,7 +701,7 @@ define void @commute_uno_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %i ; GCN-LABEL: {{^}}commute_frameindex: ; XGCN: v_cmp_eq_u32_e32 vcc, 0, v{{[0-9]+}} -; GCN: v_mov_b32_e32 [[FI:v[0-9]+]], 0{{$}} +; GCN: v_mov_b32_e32 [[FI:v[0-9]+]], 4{{$}} ; GCN: v_cmp_eq_u32_e32 vcc, [[FI]], v{{[0-9]+}} define void @commute_frameindex(i32 addrspace(1)* nocapture %out) #0 { entry: diff --git a/test/CodeGen/AMDGPU/control-flow-fastregalloc.ll b/test/CodeGen/AMDGPU/control-flow-fastregalloc.ll index c6fb2da788c..013e89fcdb5 100644 --- a/test/CodeGen/AMDGPU/control-flow-fastregalloc.ll +++ b/test/CodeGen/AMDGPU/control-flow-fastregalloc.ll @@ -28,13 +28,12 @@ ; VMEM: v_mov_b32_e32 v[[V_SAVEEXEC_LO:[0-9]+]], s[[SAVEEXEC_LO]] -; VMEM: buffer_store_dword v[[V_SAVEEXEC_LO]], off, s[0:3], s7 ; 4-byte Folded Spill +; VMEM: buffer_store_dword v[[V_SAVEEXEC_LO]], off, s[0:3], s7 offset:4 ; 4-byte Folded Spill ; VMEM: v_mov_b32_e32 v[[V_SAVEEXEC_HI:[0-9]+]], s[[SAVEEXEC_HI]] -; VMEM: buffer_store_dword v[[V_SAVEEXEC_HI]], off, s[0:3], s7 offset:4 ; 4-byte Folded Spill +; VMEM: buffer_store_dword v[[V_SAVEEXEC_HI]], off, s[0:3], s7 offset:8 ; 4-byte Folded Spill ; Spill load -; VMEM: buffer_store_dword [[LOAD0]], off, s[0:3], s7 offset:[[LOAD0_OFFSET:[0-9]+]] ; 4-byte Folded Spill -; VGPR: buffer_store_dword [[LOAD0]], off, s[0:3], s7 ; 4-byte Folded Spill +; GCN: buffer_store_dword [[LOAD0]], off, s[0:3], s7 offset:[[LOAD0_OFFSET:[0-9]+]] ; 4-byte Folded Spill ; GCN: s_mov_b64 exec, s{{\[}}[[ANDEXEC_LO]]:[[ANDEXEC_HI]]{{\]}} @@ -44,8 +43,7 @@ ; GCN: {{^}}BB{{[0-9]+}}_1: ; %if ; GCN: s_mov_b32 m0, -1 ; GCN: ds_read_b32 [[LOAD1:v[0-9]+]] -; VMEM: buffer_load_dword [[RELOAD_LOAD0:v[0-9]+]], off, s[0:3], s7 offset:[[LOAD0_OFFSET]] ; 4-byte Folded Reload -; VGPR: buffer_load_dword [[RELOAD_LOAD0:v[0-9]+]], off, s[0:3], s7 ; 4-byte Folded Reload +; GCN: buffer_load_dword [[RELOAD_LOAD0:v[0-9]+]], off, s[0:3], s7 offset:[[LOAD0_OFFSET]] ; 4-byte Folded Reload ; GCN: s_waitcnt vmcnt(0) ; Spill val register @@ -60,11 +58,11 @@ -; VMEM: buffer_load_dword v[[V_RELOAD_SAVEEXEC_LO:[0-9]+]], off, s[0:3], s7 ; 4-byte Folded Reload +; VMEM: buffer_load_dword v[[V_RELOAD_SAVEEXEC_LO:[0-9]+]], off, s[0:3], s7 offset:4 ; 4-byte Folded Reload ; VMEM: s_waitcnt vmcnt(0) ; VMEM: v_readfirstlane_b32 s[[S_RELOAD_SAVEEXEC_LO:[0-9]+]], v[[V_RELOAD_SAVEEXEC_LO]] -; VMEM: buffer_load_dword v[[V_RELOAD_SAVEEXEC_HI:[0-9]+]], off, s[0:3], s7 offset:4 ; 4-byte Folded Reload +; VMEM: buffer_load_dword v[[V_RELOAD_SAVEEXEC_HI:[0-9]+]], off, s[0:3], s7 offset:8 ; 4-byte Folded Reload ; VMEM: s_waitcnt vmcnt(0) ; VMEM: v_readfirstlane_b32 s[[S_RELOAD_SAVEEXEC_HI:[0-9]+]], v[[V_RELOAD_SAVEEXEC_HI]] @@ -107,7 +105,7 @@ endif: ; GCN: s_xor_b64 s{{\[}}[[SAVEEXEC_LO]]:[[SAVEEXEC_HI]]{{\]}}, s{{\[}}[[ANDEXEC_LO]]:[[ANDEXEC_HI]]{{\]}}, s{{\[}}[[SAVEEXEC_LO]]:[[SAVEEXEC_HI]]{{\]}} ; Spill load -; GCN: buffer_store_dword [[LOAD0]], off, s[0:3], s7 ; 4-byte Folded Spill +; GCN: buffer_store_dword [[LOAD0]], off, s[0:3], s7 offset:4 ; 4-byte Folded Spill ; Spill saved exec ; VGPR: v_writelane_b32 [[SPILL_VGPR:v[0-9]+]], s[[SAVEEXEC_LO]], [[SAVEEXEC_LO_LANE:[0-9]+]] @@ -115,9 +113,9 @@ endif: ; VMEM: v_mov_b32_e32 v[[V_SAVEEXEC_LO:[0-9]+]], s[[SAVEEXEC_LO]] -; VMEM: buffer_store_dword v[[V_SAVEEXEC_LO]], off, s[0:3], s7 offset:16 ; 4-byte Folded Spill +; VMEM: buffer_store_dword v[[V_SAVEEXEC_LO]], off, s[0:3], s7 offset:20 ; 4-byte Folded Spill ; VMEM: v_mov_b32_e32 v[[V_SAVEEXEC_HI:[0-9]+]], s[[SAVEEXEC_HI]] -; VMEM: buffer_store_dword v[[V_SAVEEXEC_HI]], off, s[0:3], s7 offset:20 ; 4-byte Folded Spill +; VMEM: buffer_store_dword v[[V_SAVEEXEC_HI]], off, s[0:3], s7 offset:24 ; 4-byte Folded Spill ; GCN: s_mov_b64 exec, s{{\[}}[[ANDEXEC_LO]]:[[ANDEXEC_HI]]{{\]}} @@ -127,7 +125,7 @@ endif: ; GCN: [[LOOP:BB[0-9]+_[0-9]+]]: -; GCN: buffer_load_dword v[[VAL_LOOP_RELOAD:[0-9]+]], off, s[0:3], s7 ; 4-byte Folded Reload +; GCN: buffer_load_dword v[[VAL_LOOP_RELOAD:[0-9]+]], off, s[0:3], s7 offset:4 ; 4-byte Folded Reload ; GCN: v_subrev_i32_e32 [[VAL_LOOP:v[0-9]+]], vcc, v{{[0-9]+}}, v[[VAL_LOOP_RELOAD]] ; GCN: v_cmp_ne_u32_e32 vcc, ; GCN: s_and_b64 vcc, exec, vcc @@ -140,11 +138,11 @@ endif: ; VGPR: v_readlane_b32 s[[S_RELOAD_SAVEEXEC_LO:[0-9]+]], [[SPILL_VGPR]], [[SAVEEXEC_LO_LANE]] ; VGPR: v_readlane_b32 s[[S_RELOAD_SAVEEXEC_HI:[0-9]+]], [[SPILL_VGPR]], [[SAVEEXEC_HI_LANE]] -; VMEM: buffer_load_dword v[[V_RELOAD_SAVEEXEC_LO:[0-9]+]], off, s[0:3], s7 offset:16 ; 4-byte Folded Reload +; VMEM: buffer_load_dword v[[V_RELOAD_SAVEEXEC_LO:[0-9]+]], off, s[0:3], s7 offset:20 ; 4-byte Folded Reload ; VMEM: s_waitcnt vmcnt(0) ; VMEM: v_readfirstlane_b32 s[[S_RELOAD_SAVEEXEC_LO:[0-9]+]], v[[V_RELOAD_SAVEEXEC_LO]] -; VMEM: buffer_load_dword v[[V_RELOAD_SAVEEXEC_HI:[0-9]+]], off, s[0:3], s7 offset:20 ; 4-byte Folded Reload +; VMEM: buffer_load_dword v[[V_RELOAD_SAVEEXEC_HI:[0-9]+]], off, s[0:3], s7 offset:24 ; 4-byte Folded Reload ; VMEM: s_waitcnt vmcnt(0) ; VMEM: v_readfirstlane_b32 s[[S_RELOAD_SAVEEXEC_HI:[0-9]+]], v[[V_RELOAD_SAVEEXEC_HI]] @@ -187,7 +185,7 @@ end: ; GCN: s_xor_b64 s{{\[}}[[SAVEEXEC_LO]]:[[SAVEEXEC_HI]]{{\]}}, s{{\[}}[[ANDEXEC_LO]]:[[ANDEXEC_HI]]{{\]}}, s{{\[}}[[SAVEEXEC_LO]]:[[SAVEEXEC_HI]]{{\]}} ; Spill load -; GCN: buffer_store_dword [[LOAD0]], off, s[0:3], s7 ; 4-byte Folded Spill +; GCN: buffer_store_dword [[LOAD0]], off, s[0:3], s7 offset:4 ; 4-byte Folded Spill ; Spill saved exec ; VGPR: v_writelane_b32 [[SPILL_VGPR:v[0-9]+]], s[[SAVEEXEC_LO]], [[SAVEEXEC_LO_LANE:[0-9]+]] @@ -244,14 +242,14 @@ end: ; GCN: BB{{[0-9]+}}_2: ; %if ; GCN: ds_read_b32 -; GCN: buffer_load_dword v[[LOAD0_RELOAD:[0-9]+]], off, s[0:3], s7 ; 4-byte Folded Reload +; GCN: buffer_load_dword v[[LOAD0_RELOAD:[0-9]+]], off, s[0:3], s7 offset:4 ; 4-byte Folded Reload ; GCN: v_add_i32_e32 [[ADD:v[0-9]+]], vcc, v{{[0-9]+}}, v[[LOAD0_RELOAD]] ; GCN: buffer_store_dword [[ADD]], off, s[0:3], s7 offset:[[RESULT_OFFSET]] ; 4-byte Folded Spill ; GCN: s_waitcnt vmcnt(0) expcnt(0) ; GCN-NEXT: s_branch [[ENDIF:BB[0-9]+_[0-9]+]] ; GCN: [[ELSE]]: ; %else -; GCN: buffer_load_dword v[[LOAD0_RELOAD:[0-9]+]], off, s[0:3], s7 ; 4-byte Folded Reload +; GCN: buffer_load_dword v[[LOAD0_RELOAD:[0-9]+]], off, s[0:3], s7 offset:4 ; 4-byte Folded Reload ; GCN: v_subrev_i32_e32 [[SUB:v[0-9]+]], vcc, v{{[0-9]+}}, v[[LOAD0_RELOAD]] ; GCN: buffer_store_dword [[ADD]], off, s[0:3], s7 offset:[[FLOW_RESULT_OFFSET:[0-9]+]] ; 4-byte Folded Spill ; GCN: s_waitcnt vmcnt(0) expcnt(0) diff --git a/test/CodeGen/AMDGPU/extload-private.ll b/test/CodeGen/AMDGPU/extload-private.ll index 6cebe5f495c..505aef3258c 100644 --- a/test/CodeGen/AMDGPU/extload-private.ll +++ b/test/CodeGen/AMDGPU/extload-private.ll @@ -2,7 +2,7 @@ ; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-promote-alloca -amdgpu-sroa=0 -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s ; FUNC-LABEL: {{^}}load_i8_sext_private: -; SI: buffer_load_sbyte v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+$}} +; SI: buffer_load_sbyte v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offset:4{{$}} define void @load_i8_sext_private(i32 addrspace(1)* %out) { entry: %tmp0 = alloca i8 @@ -13,7 +13,7 @@ entry: } ; FUNC-LABEL: {{^}}load_i8_zext_private: -; SI: buffer_load_ubyte v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+$}} +; SI: buffer_load_ubyte v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offset:4{{$}} define void @load_i8_zext_private(i32 addrspace(1)* %out) { entry: %tmp0 = alloca i8 @@ -24,7 +24,7 @@ entry: } ; FUNC-LABEL: {{^}}load_i16_sext_private: -; SI: buffer_load_sshort v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+$}} +; SI: buffer_load_sshort v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offset:4{{$}} define void @load_i16_sext_private(i32 addrspace(1)* %out) { entry: %tmp0 = alloca i16 @@ -35,7 +35,7 @@ entry: } ; FUNC-LABEL: {{^}}load_i16_zext_private: -; SI: buffer_load_ushort v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+$}} +; SI: buffer_load_ushort v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offset:4{{$}} define void @load_i16_zext_private(i32 addrspace(1)* %out) { entry: %tmp0 = alloca i16 diff --git a/test/CodeGen/AMDGPU/insert_vector_elt.ll b/test/CodeGen/AMDGPU/insert_vector_elt.ll index 65ac693a4f4..d7da673ee84 100644 --- a/test/CodeGen/AMDGPU/insert_vector_elt.ll +++ b/test/CodeGen/AMDGPU/insert_vector_elt.ll @@ -207,14 +207,14 @@ define void @dynamic_insertelement_v3i16(<3 x i16> addrspace(1)* %out, <3 x i16> ; GCN: buffer_load_ushort v{{[0-9]+}}, off ; GCN: buffer_load_ushort v{{[0-9]+}}, off -; GCN-DAG: v_mov_b32_e32 [[BASE_FI:v[0-9]+]], 0{{$}} +; GCN-DAG: v_mov_b32_e32 [[BASE_FI:v[0-9]+]], 8{{$}} ; GCN-DAG: s_and_b32 [[MASK_IDX:s[0-9]+]], s{{[0-9]+}}, 3{{$}} ; GCN-DAG: v_or_b32_e32 [[IDX:v[0-9]+]], [[MASK_IDX]], [[BASE_FI]]{{$}} -; GCN-DAG: buffer_store_short v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:6 -; GCN-DAG: buffer_store_short v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:4 -; GCN-DAG: buffer_store_short v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:2 -; GCN-DAG: buffer_store_short v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+$}} +; GCN-DAG: buffer_store_short v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:14 +; GCN-DAG: buffer_store_short v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:12 +; GCN-DAG: buffer_store_short v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:10 +; GCN-DAG: buffer_store_short v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:8 ; GCN: buffer_store_short v{{[0-9]+}}, [[IDX]], s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offen{{$}} ; GCN: s_waitcnt @@ -235,8 +235,8 @@ define void @dynamic_insertelement_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> ; GCN: buffer_load_ubyte v{{[0-9]+}}, off ; GCN: buffer_load_ubyte v{{[0-9]+}}, off -; GCN-DAG: buffer_store_byte v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:1 -; GCN-DAG: buffer_store_byte v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}}{{$}} +; GCN-DAG: buffer_store_byte v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:5 +; GCN-DAG: buffer_store_byte v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:4 ; GCN: buffer_store_byte v{{[0-9]+}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offen{{$}} @@ -255,9 +255,9 @@ define void @dynamic_insertelement_v2i8(<2 x i8> addrspace(1)* %out, <2 x i8> %a ; GCN: buffer_load_ubyte v{{[0-9]+}}, off ; GCN: buffer_load_ubyte v{{[0-9]+}}, off -; GCN-DAG: buffer_store_byte v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:2 -; GCN-DAG: buffer_store_byte v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:1 -; GCN-DAG: buffer_store_byte v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}}{{$}} +; GCN-DAG: buffer_store_byte v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:6 +; GCN-DAG: buffer_store_byte v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:5 +; GCN-DAG: buffer_store_byte v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:4 ; GCN: buffer_store_byte v{{[0-9]+}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offen{{$}} @@ -279,10 +279,10 @@ define void @dynamic_insertelement_v3i8(<3 x i8> addrspace(1)* %out, <3 x i8> %a ; GCN: buffer_load_ubyte v{{[0-9]+}}, off ; GCN: buffer_load_ubyte v{{[0-9]+}}, off -; GCN-DAG: buffer_store_byte v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:3 -; GCN-DAG: buffer_store_byte v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:2 -; GCN-DAG: buffer_store_byte v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:1 -; GCN-DAG: buffer_store_byte v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}}{{$}} +; GCN-DAG: buffer_store_byte v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:7 +; GCN-DAG: buffer_store_byte v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:6 +; GCN-DAG: buffer_store_byte v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:5 +; GCN-DAG: buffer_store_byte v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:4 ; GCN: buffer_store_byte v{{[0-9]+}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offen{{$}} @@ -396,15 +396,15 @@ define void @dynamic_insertelement_v3i64(<3 x i64> addrspace(1)* %out, <3 x i64> ; Stack store -; GCN-DAG: buffer_store_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}}{{$}} -; GCN-DAG: buffer_store_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offset:16{{$}} +; GCN-DAG: buffer_store_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offset:32{{$}} +; GCN-DAG: buffer_store_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offset:48{{$}} ; Write element ; GCN: buffer_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offen{{$}} ; Stack reload -; GCN-DAG: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offset:16{{$}} -; GCN-DAG: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}}{{$}} +; GCN-DAG: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offset:32{{$}} +; GCN-DAG: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offset:48{{$}} ; Store result ; GCN: buffer_store_dwordx4 @@ -421,17 +421,17 @@ define void @dynamic_insertelement_v4f64(<4 x double> addrspace(1)* %out, <4 x d ; GCN-LABEL: {{^}}dynamic_insertelement_v8f64: ; GCN-DAG: SCRATCH_RSRC_DWORD -; GCN-DAG: buffer_store_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offset:16{{$}} -; GCN-DAG: buffer_store_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offset:32{{$}} -; GCN-DAG: buffer_store_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offset:48{{$}} -; GCN: buffer_store_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}}{{$}} +; GCN-DAG: buffer_store_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offset:64{{$}} +; GCN-DAG: buffer_store_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offset:80{{$}} +; GCN-DAG: buffer_store_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offset:96{{$}} +; GCN-DAG: buffer_store_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offset:112{{$}} ; GCN: buffer_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offen{{$}} -; GCN-DAG: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offset:48{{$}} -; GCN-DAG: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offset:32{{$}} -; GCN-DAG: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offset:16{{$}} -; GCN-DAG: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}}{{$}} +; GCN-DAG: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offset:64{{$}} +; GCN-DAG: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offset:80{{$}} +; GCN-DAG: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offset:96{{$}} +; GCN-DAG: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offset:112{{$}} ; GCN: buffer_store_dwordx4 ; GCN: buffer_store_dwordx4 diff --git a/test/CodeGen/AMDGPU/kernarg-stack-alignment.ll b/test/CodeGen/AMDGPU/kernarg-stack-alignment.ll index 21c92dbc909..aaa63f7196b 100644 --- a/test/CodeGen/AMDGPU/kernarg-stack-alignment.ll +++ b/test/CodeGen/AMDGPU/kernarg-stack-alignment.ll @@ -4,7 +4,7 @@ ; alignment of the stack ; CHECK-LABEL: {{^}}no_args: -; CHECK: ScratchSize: 8{{$}} +; CHECK: ScratchSize: 5{{$}} define void @no_args() { %alloca = alloca i8 store volatile i8 0, i8* %alloca @@ -12,7 +12,7 @@ define void @no_args() { } ; CHECK-LABEL: {{^}}force_align32: -; CHECK: ScratchSize: 8{{$}} +; CHECK: ScratchSize: 5{{$}} define void @force_align32(<8 x i32>) { %alloca = alloca i8 store volatile i8 0, i8* %alloca @@ -20,7 +20,7 @@ define void @force_align32(<8 x i32>) { } ; CHECK-LABEL: {{^}}force_align64: -; CHECK: ScratchSize: 8{{$}} +; CHECK: ScratchSize: 5{{$}} define void @force_align64(<16 x i32>) { %alloca = alloca i8 store volatile i8 0, i8* %alloca @@ -28,7 +28,7 @@ define void @force_align64(<16 x i32>) { } ; CHECK-LABEL: {{^}}force_align128: -; CHECK: ScratchSize: 8{{$}} +; CHECK: ScratchSize: 5{{$}} define void @force_align128(<32 x i32>) { %alloca = alloca i8 store volatile i8 0, i8* %alloca @@ -36,7 +36,7 @@ define void @force_align128(<32 x i32>) { } ; CHECK-LABEL: {{^}}force_align256: -; CHECK: ScratchSize: 8{{$}} +; CHECK: ScratchSize: 5{{$}} define void @force_align256(<64 x i32>) { %alloca = alloca i8 store volatile i8 0, i8* %alloca diff --git a/test/CodeGen/AMDGPU/local-stack-slot-bug.ll b/test/CodeGen/AMDGPU/local-stack-slot-bug.ll index dc43e8613dd..d3e0f0be4b5 100644 --- a/test/CodeGen/AMDGPU/local-stack-slot-bug.ll +++ b/test/CodeGen/AMDGPU/local-stack-slot-bug.ll @@ -8,13 +8,12 @@ ; CHECK-LABEL: {{^}}main: ; CHECK-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0x200 -; CHECK-DAG: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0{{$}} +; CHECK-DAG: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0x400{{$}} ; CHECK-DAG: v_lshlrev_b32_e32 [[BYTES:v[0-9]+]], 2, v0 ; CHECK-DAG: v_and_b32_e32 [[CLAMP_IDX:v[0-9]+]], 0x1fc, [[BYTES]] -; TODO: add 0? -; CHECK-DAG: v_or_b32_e32 [[LO_OFF:v[0-9]+]], [[CLAMP_IDX]], [[ZERO]] -; CHECK-DAG: v_or_b32_e32 [[HI_OFF:v[0-9]+]], [[CLAMP_IDX]], [[K]] +; CHECK-DAG: v_or_b32_e32 [[LO_OFF:v[0-9]+]], [[CLAMP_IDX]], [[K]] +; CHECK-DAG: v_or_b32_e32 [[HI_OFF:v[0-9]+]], [[CLAMP_IDX]], [[ZERO]] ; CHECK: buffer_load_dword {{v[0-9]+}}, [[LO_OFF]], {{s\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offen ; CHECK: buffer_load_dword {{v[0-9]+}}, [[HI_OFF]], {{s\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offen diff --git a/test/CodeGen/AMDGPU/private-element-size.ll b/test/CodeGen/AMDGPU/private-element-size.ll index de9a8f75512..eb487637ce9 100644 --- a/test/CodeGen/AMDGPU/private-element-size.ll +++ b/test/CodeGen/AMDGPU/private-element-size.ll @@ -10,27 +10,27 @@ ; HSA-ELT4: private_element_size = 1 -; HSA-ELT16-DAG: buffer_store_dwordx4 {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], s9{{$}} ; HSA-ELT16-DAG: buffer_store_dwordx4 {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], s9 offset:16 +; HSA-ELT16-DAG: buffer_store_dwordx4 {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], s9 offset:32 ; HSA-ELT16-DAG: buffer_load_dwordx4 {{v\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s[0:3], s9 offen{{$}} -; HSA-ELT8-DAG: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], s9{{$}} -; HSA-ELT8-DAG: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], s9 offset:8 +; HSA-ELT8-DAG: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], s9 offset:24{{$}} ; HSA-ELT8-DAG: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], s9 offset:16 -; HSA-ELT8-DAG: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], s9 offset:24 +; HSA-ELT8-DAG: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], s9 offset:32 +; HSA-ELT8-DAG: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], s9 offset:40 ; HSA-ELT8: buffer_load_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s[0:3], s9 offen ; HSA-ELT8: buffer_load_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s[0:3], s9 offen -; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], s9{{$}} -; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], s9 offset:4{{$}} -; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], s9 offset:8{{$}} -; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], s9 offset:12{{$}} ; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], s9 offset:16{{$}} ; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], s9 offset:20{{$}} ; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], s9 offset:24{{$}} ; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], s9 offset:28{{$}} +; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], s9 offset:32{{$}} +; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], s9 offset:36{{$}} +; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], s9 offset:40{{$}} +; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], s9 offset:44{{$}} ; HSA-ELT4: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen{{$}} ; HSA-ELT4: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen offset:4{{$}} @@ -59,36 +59,28 @@ entry: ; HSA-ELT8: private_element_size = 2 ; HSA-ELT4: private_element_size = 1 -; HSA-ELT16-DAG: buffer_store_dwordx4 {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], s9{{$}} -; HSA-ELT16-DAG: buffer_store_dwordx4 {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], s9 offset:16 ; HSA-ELT16-DAG: buffer_store_dwordx4 {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], s9 offset:32 ; HSA-ELT16-DAG: buffer_store_dwordx4 {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], s9 offset:48 +; HSA-ELT16-DAG: buffer_store_dwordx4 {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], s9 offset:64 +; HSA-ELT16-DAG: buffer_store_dwordx4 {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], s9 offset:80 ; HSA-ELT16-DAG: buffer_load_dwordx4 {{v\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s[0:3], s9 offen{{$}} ; HSA-ELT16-DAG: buffer_load_dwordx4 {{v\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s[0:3], s9 offen{{$}} -; HSA-ELT8-DAG: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], s9{{$}} -; HSA-ELT8-DAG: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], s9 offset:8 -; HSA-ELT8-DAG: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], s9 offset:16 -; HSA-ELT8-DAG: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], s9 offset:24 ; HSA-ELT8-DAG: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], s9 offset:32 ; HSA-ELT8-DAG: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], s9 offset:40 ; HSA-ELT8-DAG: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], s9 offset:48 ; HSA-ELT8-DAG: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], s9 offset:56 +; HSA-ELT8-DAG: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], s9 offset:88 +; HSA-ELT8-DAG: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], s9 offset:80 +; HSA-ELT8-DAG: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], s9 offset:72 +; HSA-ELT8-DAG: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], s9 offset:64 ; HSA-ELT8: buffer_load_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s[0:3], s9 offen ; HSA-ELT8: buffer_load_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s[0:3], s9 offen -; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], s9{{$}} -; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], s9 offset:4{{$}} -; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], s9 offset:8{{$}} -; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], s9 offset:12{{$}} -; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], s9 offset:16{{$}} -; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], s9 offset:20{{$}} -; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], s9 offset:24{{$}} -; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], s9 offset:28{{$}} ; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], s9 offset:32{{$}} ; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], s9 offset:36{{$}} ; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], s9 offset:40{{$}} @@ -97,6 +89,14 @@ entry: ; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], s9 offset:52{{$}} ; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], s9 offset:56{{$}} ; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], s9 offset:60{{$}} +; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], s9 offset:64{{$}} +; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], s9 offset:68{{$}} +; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], s9 offset:72{{$}} +; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], s9 offset:76{{$}} +; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], s9 offset:80{{$}} +; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], s9 offset:84{{$}} +; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], s9 offset:88{{$}} +; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], s9 offset:92{{$}} ; HSA-ELT4-DAG: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen{{$}} ; HSA-ELT4-DAG: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen offset:4{{$}} @@ -130,16 +130,16 @@ entry: ; HSA-ELT8: private_element_size = 2 ; HSA-ELT4: private_element_size = 1 -; HSA-ELTGE8-DAG: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], s9{{$}} -; HSA-ELTGE8-DAG: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], s9 offset:8 +; HSA-ELTGE8-DAG: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], s9 offset:16 +; HSA-ELTGE8-DAG: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], s9 offset:24 ; HSA-ELTGE8: buffer_load_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s[0:3], s9 offen -; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], s9{{$}} -; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], s9 offset:4{{$}} -; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], s9 offset:8{{$}} -; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], s9 offset:12{{$}} +; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], s9 offset:16{{$}} +; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], s9 offset:20{{$}} +; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], s9 offset:24{{$}} +; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], s9 offset:28{{$}} ; HSA-ELT4: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen{{$}} ; HSA-ELT4: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen offset:4{{$}} @@ -166,16 +166,16 @@ entry: ; HSA-ELT8: private_element_size = 2 ; HSA-ELT4: private_element_size = 1 -; HSA-ELTGE8-DAG: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], s9{{$}} -; HSA-ELTGE8-DAG: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], s9 offset:8 +; HSA-ELTGE8-DAG: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], s9 offset:16 +; HSA-ELTGE8-DAG: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], s9 offset:24 ; HSA-ELTGE8: buffer_load_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s[0:3], s9 offen -; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], s9{{$}} -; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], s9 offset:4{{$}} -; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], s9 offset:8{{$}} -; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], s9 offset:12{{$}} +; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], s9 offset:16{{$}} +; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], s9 offset:20{{$}} +; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], s9 offset:24{{$}} +; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], s9 offset:28{{$}} ; HSA-ELT4: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen{{$}} ; HSA-ELT4: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen offset:4{{$}} @@ -202,27 +202,27 @@ entry: ; HSA-ELT8: private_element_size = 2 ; HSA-ELT4: private_element_size = 1 -; HSA-ELT16-DAG: buffer_store_dwordx4 {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], s9{{$}} ; HSA-ELT16-DAG: buffer_store_dwordx4 {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], s9 offset:16 +; HSA-ELT16-DAG: buffer_store_dwordx4 {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], s9 offset:32 ; HSA-ELT16-DAG: buffer_load_dwordx4 {{v\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s[0:3], s9 offen{{$}} -; HSA-ELT8-DAG: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], s9{{$}} -; HSA-ELT8-DAG: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], s9 offset:8 -; HSA-ELT8-DAG: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], s9 offset:16 +; HSA-ELT8-DAG: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], s9 offset:16{{$}} ; HSA-ELT8-DAG: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], s9 offset:24 +; HSA-ELT8-DAG: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], s9 offset:40 +; HSA-ELT8-DAG: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], s9 offset:32 ; HSA-ELT8: buffer_load_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s[0:3], s9 offen ; HSA-ELT8: buffer_load_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s[0:3], s9 offen -; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], s9{{$}} -; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], s9 offset:4{{$}} -; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], s9 offset:8{{$}} -; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], s9 offset:12{{$}} ; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], s9 offset:16{{$}} ; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], s9 offset:20{{$}} ; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], s9 offset:24{{$}} ; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], s9 offset:28{{$}} +; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], s9 offset:32{{$}} +; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], s9 offset:36{{$}} +; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], s9 offset:40{{$}} +; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], s9 offset:44{{$}} ; HSA-ELT4: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen{{$}} ; HSA-ELT4: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen offset:4{{$}} diff --git a/test/CodeGen/AMDGPU/scratch-buffer.ll b/test/CodeGen/AMDGPU/scratch-buffer.ll index 94101f0b92b..b4672904302 100644 --- a/test/CodeGen/AMDGPU/scratch-buffer.ll +++ b/test/CodeGen/AMDGPU/scratch-buffer.ll @@ -9,8 +9,8 @@ ; should be able to reuse the same regiser for each scratch buffer access. ; GCN-LABEL: {{^}}legal_offset_fi: -; GCN: buffer_store_dword v{{[0-9]+}}, off, s[{{[0-9]+}}:{{[0-9]+}}], s{{[0-9]+$}} -; GCN: v_mov_b32_e32 [[OFFSET:v[0-9]+]], 0x8000 +; GCN: buffer_store_dword v{{[0-9]+}}, off, s[{{[0-9]+}}:{{[0-9]+}}], s{{[0-9]+}} offset:4{{$}} +; GCN: v_mov_b32_e32 [[OFFSET:v[0-9]+]], 0x8004 ; GCN: buffer_store_dword v{{[0-9]+}}, [[OFFSET]], s[{{[0-9]+}}:{{[0-9]+}}], s{{[0-9]+}} offen{{$}} define void @legal_offset_fi(i32 addrspace(1)* %out, i32 %cond, i32 %if_offset, i32 %else_offset) { @@ -49,7 +49,7 @@ done: ; GCN-LABEL: {{^}}legal_offset_fi_offset: ; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], s{{[0-9]+}} offen{{$}} ; This constant isn't folded, because it has multiple uses. -; GCN-DAG: v_mov_b32_e32 [[K8000:v[0-9]+]], 0x8000 +; GCN-DAG: v_mov_b32_e32 [[K8000:v[0-9]+]], 0x8004 ; GCN-DAG: v_add_i32_e32 [[OFFSET:v[0-9]+]], vcc, [[K8000]] ; GCN: buffer_store_dword v{{[0-9]+}}, [[OFFSET]], s[{{[0-9]+}}:{{[0-9]+}}], s{{[0-9]+}} offen{{$}} @@ -98,7 +98,7 @@ entry: } ; GCN-LABEL: {{^}}pos_vaddr_offset: -; GCN: buffer_store_dword v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offset:16 +; GCN: buffer_store_dword v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offset:20 define void @pos_vaddr_offset(i32 addrspace(1)* %out, i32 %offset) { entry: %array = alloca [8192 x i32] diff --git a/test/CodeGen/AMDGPU/si-spill-sgpr-stack.ll b/test/CodeGen/AMDGPU/si-spill-sgpr-stack.ll index 062f5245af1..f5ae4f5fb73 100644 --- a/test/CodeGen/AMDGPU/si-spill-sgpr-stack.ll +++ b/test/CodeGen/AMDGPU/si-spill-sgpr-stack.ll @@ -8,7 +8,7 @@ ; ALL: s_mov_b32 s[[HI:[0-9]+]], 0xe80000 ; Make sure we are handling hazards correctly. -; SGPR: buffer_load_dword [[VHI:v[0-9]+]], off, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offset:12 +; SGPR: buffer_load_dword [[VHI:v[0-9]+]], off, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offset:16 ; SGPR-NEXT: s_waitcnt vmcnt(0) ; SGPR-NEXT: v_readfirstlane_b32 s[[HI:[0-9]+]], [[VHI]] ; SGPR-NEXT: s_nop 4 @@ -16,10 +16,10 @@ ; Make sure scratch wave offset register is correctly incremented and ; then restored. -; SMEM: s_mov_b32 m0, s[[OFF]]{{$}} +; SMEM: s_add_u32 m0, s[[OFF]], 0x100{{$}} ; SMEM: s_buffer_store_dwordx4 s{{\[[0-9]+:[0-9]+\]}}, s{{\[}}[[LO]]:[[HI]]], m0 ; 16-byte Folded Spill -; SMEM: s_mov_b32 m0, s[[OFF]]{{$}} +; SMEM: s_add_u32 m0, s[[OFF]], 0x100{{$}} ; SMEM: s_buffer_load_dwordx4 s{{\[[0-9]+:[0-9]+\]}}, s{{\[}}[[LO]]:[[HI]]], m0 ; 16-byte Folded Reload ; SMEM: s_dcache_wb diff --git a/test/CodeGen/AMDGPU/spill-m0.ll b/test/CodeGen/AMDGPU/spill-m0.ll index c2e7e8ae53f..3d88b9aa3da 100644 --- a/test/CodeGen/AMDGPU/spill-m0.ll +++ b/test/CodeGen/AMDGPU/spill-m0.ll @@ -17,11 +17,11 @@ ; TOVMEM-DAG: s_mov_b32 [[M0_COPY:s[0-9]+]], m0 ; TOVMEM-DAG: v_mov_b32_e32 [[SPILL_VREG:v[0-9]+]], [[M0_COPY]] -; TOVMEM: buffer_store_dword [[SPILL_VREG]], off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} ; 4-byte Folded Spill +; TOVMEM: buffer_store_dword [[SPILL_VREG]], off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:4 ; 4-byte Folded Spill ; TOVMEM: s_waitcnt vmcnt(0) ; TOSMEM-DAG: s_mov_b32 [[M0_COPY:s[0-9]+]], m0 -; TOSMEM: s_mov_b32 m0, s3{{$}} +; TOSMEM: s_add_u32 m0, s3, 0x100{{$}} ; TOSMEM-NOT: [[M0_COPY]] ; TOSMEM: s_buffer_store_dword [[M0_COPY]], s{{\[}}[[LO]]:[[HI]]], m0 ; 4-byte Folded Spill ; TOSMEM: s_waitcnt lgkmcnt(0) @@ -32,12 +32,12 @@ ; TOVGPR: v_readlane_b32 [[M0_RESTORE:s[0-9]+]], [[SPILL_VREG]], 0 ; TOVGPR: s_mov_b32 m0, [[M0_RESTORE]] -; TOVMEM: buffer_load_dword [[RELOAD_VREG:v[0-9]+]], off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} ; 4-byte Folded Reload +; TOVMEM: buffer_load_dword [[RELOAD_VREG:v[0-9]+]], off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:4 ; 4-byte Folded Reload ; TOVMEM: s_waitcnt vmcnt(0) ; TOVMEM: v_readfirstlane_b32 [[M0_RESTORE:s[0-9]+]], [[RELOAD_VREG]] ; TOVMEM: s_mov_b32 m0, [[M0_RESTORE]] -; TOSMEM: s_mov_b32 m0, s3{{$}} +; TOSMEM: s_add_u32 m0, s3, 0x100{{$}} ; TOSMEM: s_buffer_load_dword [[M0_RESTORE:s[0-9]+]], s{{\[}}[[LO]]:[[HI]]], m0 ; 4-byte Folded Reload ; TOSMEM-NOT: [[M0_RESTORE]] ; TOSMEM: s_mov_b32 m0, [[M0_RESTORE]] @@ -67,12 +67,12 @@ endif: ; GCN: v_interp_mov_f32 ; TOSMEM-NOT: s_m0 -; TOSMEM: s_mov_b32 m0, s7 +; TOSMEM: s_add_u32 m0, s7, 0x100 ; TOSMEM-NEXT: s_buffer_store_dword s{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, m0 ; 4-byte Folded Spill ; TOSMEM-NOT: m0 ; TOSMEM-NOT: m0 -; TOSMEM: s_add_u32 m0, s7, 0x100 +; TOSMEM: s_add_u32 m0, s7, 0x200 ; TOSMEM: s_buffer_store_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, m0 ; 8-byte Folded Spill ; TOSMEM-NOT: m0 @@ -81,7 +81,7 @@ endif: ; TOSMEM: s_branch ; TOSMEM: BB{{[0-9]+_[0-9]+}}: -; TOSMEM-NEXT: s_add_u32 m0, s7, 0x100 +; TOSMEM-NEXT: s_add_u32 m0, s7, 0x200 ; TOSMEM-NEXT: s_buffer_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, m0 ; 8-byte Folded Reload @@ -121,7 +121,7 @@ endif: ; preds = %else, %if ; GCN: ; clobber m0 ; TOSMEM: s_mov_b32 vcc_hi, m0 -; TOSMEM: s_mov_b32 m0, s3 +; TOSMEM: s_add_u32 m0, s3, 0x100 ; TOSMEM-NEXT: s_buffer_store_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, m0 ; 8-byte Folded Spill ; TOSMEM: s_mov_b32 m0, vcc_hi @@ -130,7 +130,7 @@ endif: ; preds = %else, %if ; TOSMEM: s_branch ; TOSMEM: BB{{[0-9]+_[0-9]+}}: -; TOSMEM-NEXT: s_mov_b32 m0, s3 +; TOSMEM-NEXT: s_add_u32 m0, s3, 0x100 ; TOSMEM-NEXT: s_buffer_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, m0 ; 8-byte Folded Reload ; GCN-NOT: v_readlane_b32 m0 @@ -160,10 +160,10 @@ endif: ; TOSMEM: s_load_dwordx2 [[REG:s\[[0-9]+:[0-9]+\]]] ; TOSMEM: s_cmp_eq_u32 ; TOSMEM-NOT: m0 -; TOSMEM: s_mov_b32 m0, s3 +; TOSMEM: s_add_u32 m0, s3, 0x100 ; TOSMEM: s_buffer_store_dwordx2 [[REG]], s[88:91], m0 ; 8-byte Folded Spill ; TOSMEM-NOT: m0 -; TOSMEM: s_add_u32 m0, s3, 0x200 +; TOSMEM: s_add_u32 m0, s3, 0x300 ; TOSMEM: s_buffer_store_dword s{{[0-9]+}}, s[88:91], m0 ; 4-byte Folded Spill ; TOSMEM-NOT: m0 ; TOSMEM: s_cbranch_scc1 @@ -171,7 +171,7 @@ endif: ; TOSMEM: s_mov_b32 m0, -1 ; TOSMEM: s_mov_b32 vcc_hi, m0 -; TOSMEM: s_mov_b32 m0, s3 +; TOSMEM: s_add_u32 m0, s3, 0x100 ; TOSMEM: s_buffer_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[88:91], m0 ; 8-byte Folded Reload ; TOSMEM: s_mov_b32 m0, vcc_hi ; TOSMEM: s_waitcnt lgkmcnt(0) @@ -179,7 +179,7 @@ endif: ; TOSMEM: ds_write_b64 ; TOSMEM-NOT: m0 -; TOSMEM: s_add_u32 m0, s3, 0x200 +; TOSMEM: s_add_u32 m0, s3, 0x300 ; TOSMEM: s_buffer_load_dword s0, s[88:91], m0 ; 4-byte Folded Reload ; TOSMEM-NOT: m0 ; TOSMEM: s_waitcnt lgkmcnt(0) diff --git a/test/CodeGen/AMDGPU/spill-wide-sgpr.ll b/test/CodeGen/AMDGPU/spill-wide-sgpr.ll index cab45be8da5..ebba35a6689 100644 --- a/test/CodeGen/AMDGPU/spill-wide-sgpr.ll +++ b/test/CodeGen/AMDGPU/spill-wide-sgpr.ll @@ -3,11 +3,11 @@ ; RUN: llc -O0 -march=amdgcn -mcpu=fiji -amdgpu-spill-sgpr-to-smem=0 -amdgpu-spill-sgpr-to-vgpr=0 -verify-machineinstrs < %s | FileCheck -check-prefix=ALL -check-prefix=VMEM %s ; ALL-LABEL: {{^}}spill_sgpr_x2: -; SMEM: s_mov_b32 m0, s3{{$}} +; SMEM: s_add_u32 m0, s3, 0x100{{$}} ; SMEM: s_buffer_store_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[8:11], m0 ; 8-byte Folded Spill ; SMEM: s_cbranch_scc1 -; SMEM: s_mov_b32 m0, s3{{$}} +; SMEM: s_add_u32 m0, s3, 0x100{{$}} ; SMEM: s_buffer_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[8:11], m0 ; 8-byte Folded Reload ; SMEM: s_dcache_wb @@ -44,11 +44,11 @@ ret: } ; ALL-LABEL: {{^}}spill_sgpr_x4: -; SMEM: s_mov_b32 m0, s3{{$}} +; SMEM: s_add_u32 m0, s3, 0x100{{$}} ; SMEM: s_buffer_store_dwordx4 s{{\[[0-9]+:[0-9]+\]}}, s[12:15], m0 ; 16-byte Folded Spill ; SMEM: s_cbranch_scc1 -; SMEM: s_mov_b32 m0, s3{{$}} +; SMEM: s_add_u32 m0, s3, 0x100{{$}} ; SMEM: s_buffer_load_dwordx4 s{{\[[0-9]+:[0-9]+\]}}, s[12:15], m0 ; 16-byte Folded Reload ; SMEM: s_dcache_wb ; SMEM: s_endpgm @@ -93,15 +93,15 @@ ret: ; ALL-LABEL: {{^}}spill_sgpr_x8: -; SMEM: s_mov_b32 m0, s3{{$}} +; SMEM: s_add_u32 m0, s3, 0x100{{$}} ; SMEM: s_buffer_store_dwordx4 s{{\[[0-9]+:[0-9]+\]}}, s[16:19], m0 ; 16-byte Folded Spill -; SMEM: s_add_u32 m0, s3, 16 +; SMEM: s_add_u32 m0, s3, 0x110{{$}} ; SMEM: s_buffer_store_dwordx4 s{{\[[0-9]+:[0-9]+\]}}, s[16:19], m0 ; 16-byte Folded Spill ; SMEM: s_cbranch_scc1 -; SMEM: s_mov_b32 m0, s3{{$}} +; SMEM: s_add_u32 m0, s3, 0x100{{$}} ; SMEM: s_buffer_load_dwordx4 s{{\[[0-9]+:[0-9]+\]}}, s[16:19], m0 ; 16-byte Folded Reload -; SMEM: s_add_u32 m0, s3, 16 +; SMEM: s_add_u32 m0, s3, 0x110{{$}} ; SMEM: s_buffer_load_dwordx4 s{{\[[0-9]+:[0-9]+\]}}, s[16:19], m0 ; 16-byte Folded Reload ; SMEM: s_dcache_wb diff --git a/test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot-compute.ll b/test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot-compute.ll index a8908f87fbf..53b6c5c64de 100644 --- a/test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot-compute.ll +++ b/test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot-compute.ll @@ -15,7 +15,7 @@ ; HSA: enable_sgpr_private_segment_buffer = 1 ; HSA: enable_sgpr_flat_scratch_init = 0 -; HSA: workitem_private_segment_byte_size = 1024 +; HSA: workitem_private_segment_byte_size = 1536 ; GCN-NOT: flat_scr @@ -40,7 +40,7 @@ ; GCN: buffer_load_dword {{v[0-9]+}}, off, s[12:15], s16 offset:{{[0-9]+}} ; GCN: NumVgprs: 256 -; GCN: ScratchSize: 1024 +; GCN: ScratchSize: 1536 ; s[0:3] input user SGPRs. s4,s5,s6 = workgroup IDs. s8 scratch offset. define void @spill_vgpr_compute(<4 x float> %arg6, float addrspace(1)* %arg, i32 %arg1, i32 %arg2, float %arg3, float %arg4, float %arg5) #0 { diff --git a/test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot.ll b/test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot.ll index 0a35834709b..3c7b32afbc1 100644 --- a/test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot.ll +++ b/test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot.ll @@ -23,7 +23,7 @@ ; GCN: buffer_store_dword {{v[0-9]+}}, off, s{{\[}}[[DESC0]]:[[DESC3]]], s[[OFFREG]] offset:{{[0-9]+}} ; 4-byte Folded Spill ; GCN: buffer_load_dword v{{[0-9]+}}, off, s{{\[}}[[DESC0]]:[[DESC3]]], s[[OFFREG]] offset:{{[0-9]+}} ; 4-byte Folded Reload ; GCN: NumVgprs: 256 -; GCN: ScratchSize: 1024 +; GCN: ScratchSize: 1536 define amdgpu_vs void @main([9 x <16 x i8>] addrspace(2)* byval %arg, [17 x <16 x i8>] addrspace(2)* byval %arg1, [17 x <4 x i32>] addrspace(2)* byval %arg2, [34 x <8 x i32>] addrspace(2)* byval %arg3, [16 x <16 x i8>] addrspace(2)* byval %arg4, i32 inreg %arg5, i32 inreg %arg6, i32 %arg7, i32 %arg8, i32 %arg9, i32 %arg10) #0 { bb: diff --git a/test/CodeGen/AMDGPU/wqm.ll b/test/CodeGen/AMDGPU/wqm.ll index ff48429d438..e06116ce734 100644 --- a/test/CodeGen/AMDGPU/wqm.ll +++ b/test/CodeGen/AMDGPU/wqm.ll @@ -392,7 +392,7 @@ break: ; CHECK: s_and_b64 exec, exec, [[LIVE]] ; CHECK: buffer_store_dword {{v[0-9]+}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0 ; CHECK: s_wqm_b64 exec, exec -; CHECK: buffer_store_dword {{v[0-9]+}}, off, {{s\[[0-9]+:[0-9]+\]}}, {{s[0-9]+$}} +; CHECK: buffer_store_dword {{v[0-9]+}}, off, {{s\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offset:4{{$}} ; CHECK: s_and_b64 exec, exec, [[LIVE]] ; CHECK: buffer_store_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 idxen ; CHECK: s_wqm_b64 exec, exec -- 2.50.1