From 682fc3e780e05c2bc038872ce5ed05b13148b841 Mon Sep 17 00:00:00 2001 From: Nicolai Haehnle Date: Wed, 15 Jun 2016 07:13:05 +0000 Subject: [PATCH] AMDGPU: Fix MUBUF offset bugs affecting llvm.amdgcn.buffer.* intrinsics Summary: This fixes two related bugs. First, the generic optimization passes unfortunately generate negative constant offsets but the hardware treats SOffset as an unsigned value. Second, there is a hardware bug on SI and CI, where address clamping in MUBUF instructions does not work correctly when SOffset is larger than the buffer size. This patch works around this bug by never using SOffset. An alternative workaround would be to do the clamping manually when SOffset is too large, but generating the required code sequence during instruction selection would be rather involved, and in any case the resulting code would probably be worse. Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=96360 Reviewers: arsenm, tstellarAMD Subscribers: arsenm, llvm-commits, kzhuravl Differential Revision: http://reviews.llvm.org/D21326 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@272761 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp | 43 +++++++++++++------ .../AMDGPU/llvm.amdgcn.buffer.atomic.ll | 14 +++--- .../AMDGPU/llvm.amdgcn.buffer.load.format.ll | 28 ++++++------ .../CodeGen/AMDGPU/llvm.amdgcn.buffer.load.ll | 19 ++++++-- 4 files changed, 69 insertions(+), 35 deletions(-) diff --git a/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp index 8858803ac4b..bba92c32195 100644 --- a/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp +++ b/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp @@ -131,7 +131,7 @@ private: SDValue &Offset, SDValue &SLC) const; bool SelectMUBUFOffset(SDValue Addr, SDValue &SRsrc, SDValue &Soffset, SDValue &Offset) const; - void SelectMUBUFConstant(SDValue Constant, + bool SelectMUBUFConstant(SDValue Constant, SDValue &SOffset, SDValue &ImmOffset) const; bool SelectMUBUFIntrinsicOffset(SDValue Offset, SDValue &SOffset, @@ -1168,7 +1168,7 @@ bool AMDGPUDAGToDAGISel::SelectMUBUFOffset(SDValue Addr, SDValue &SRsrc, return SelectMUBUFOffset(Addr, SRsrc, Soffset, Offset, GLC, SLC, TFE); } -void AMDGPUDAGToDAGISel::SelectMUBUFConstant(SDValue Constant, +bool AMDGPUDAGToDAGISel::SelectMUBUFConstant(SDValue Constant, SDValue &SOffset, SDValue &ImmOffset) const { SDLoc DL(Constant); @@ -1193,6 +1193,13 @@ void AMDGPUDAGToDAGISel::SelectMUBUFConstant(SDValue Constant, } } + // There is a hardware bug in SI and CI which prevents address clamping in + // MUBUF instructions from working correctly with SOffsets. The immediate + // offset is unaffected. + if (Overflow > 0 && + Subtarget->getGeneration() <= AMDGPUSubtarget::SEA_ISLANDS) + return false; + ImmOffset = CurDAG->getTargetConstant(Imm, DL, MVT::i16); if (Overflow <= 64) @@ -1201,6 +1208,8 @@ void AMDGPUDAGToDAGISel::SelectMUBUFConstant(SDValue Constant, SOffset = SDValue(CurDAG->getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, CurDAG->getTargetConstant(Overflow, DL, MVT::i32)), 0); + + return true; } bool AMDGPUDAGToDAGISel::SelectMUBUFIntrinsicOffset(SDValue Offset, @@ -1211,9 +1220,7 @@ bool AMDGPUDAGToDAGISel::SelectMUBUFIntrinsicOffset(SDValue Offset, if (!isa(Offset)) return false; - SelectMUBUFConstant(Offset, SOffset, ImmOffset); - - return true; + return SelectMUBUFConstant(Offset, SOffset, ImmOffset); } bool AMDGPUDAGToDAGISel::SelectMUBUFIntrinsicVOffset(SDValue Offset, @@ -1223,20 +1230,30 @@ bool AMDGPUDAGToDAGISel::SelectMUBUFIntrinsicVOffset(SDValue Offset, SDLoc DL(Offset); // Don't generate an unnecessary voffset for constant offsets. - if (isa(Offset)) - return false; + if (isa(Offset)) { + SDValue Tmp1, Tmp2; + + // When necessary, use a voffset in <= CI anyway to work around a hardware + // bug. + if (Subtarget->getGeneration() > AMDGPUSubtarget::SEA_ISLANDS || + SelectMUBUFConstant(Offset, Tmp1, Tmp2)) + return false; + } if (CurDAG->isBaseWithConstantOffset(Offset)) { SDValue N0 = Offset.getOperand(0); SDValue N1 = Offset.getOperand(1); - SelectMUBUFConstant(N1, SOffset, ImmOffset); - VOffset = N0; - } else { - SOffset = CurDAG->getTargetConstant(0, DL, MVT::i32); - ImmOffset = CurDAG->getTargetConstant(0, DL, MVT::i16); - VOffset = Offset; + if (cast(N1)->getSExtValue() >= 0 && + SelectMUBUFConstant(N1, SOffset, ImmOffset)) { + VOffset = N0; + return true; + } } + SOffset = CurDAG->getTargetConstant(0, DL, MVT::i32); + ImmOffset = CurDAG->getTargetConstant(0, DL, MVT::i16); + VOffset = Offset; + return true; } diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.atomic.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.atomic.ll index a45b5378ac4..98f7058b5ef 100644 --- a/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.atomic.ll +++ b/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.atomic.ll @@ -1,9 +1,9 @@ -;RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck %s -;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s +;RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck %s -check-prefix=CHECK -check-prefix=SICI +;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s -check-prefix=CHECK -check-prefix=VI ;CHECK-LABEL: {{^}}test1: ;CHECK: buffer_atomic_swap v0, off, s[0:3], 0 glc -;CHECK: s_movk_i32 [[SOFS:s[0-9]+]], 0x1fff +;VI: s_movk_i32 [[SOFS:s[0-9]+]], 0x1fff ;CHECK: s_waitcnt vmcnt(0) ;CHECK: buffer_atomic_swap v0, v1, s[0:3], 0 idxen glc ;CHECK: s_waitcnt vmcnt(0) @@ -13,7 +13,8 @@ ;CHECK: s_waitcnt vmcnt(0) ;CHECK: buffer_atomic_swap v0, v2, s[0:3], 0 offen offset:42 glc ;CHECK-DAG: s_waitcnt vmcnt(0) -;CHECK: buffer_atomic_swap v0, off, s[0:3], [[SOFS]] offset:1 glc +;SICI: buffer_atomic_swap v0, v1, s[0:3], 0 offen glc +;VI: buffer_atomic_swap v0, off, s[0:3], [[SOFS]] offset:1 glc ;CHECK: s_waitcnt vmcnt(0) ;CHECK: buffer_atomic_swap v0, off, s[0:3], 0{{$}} define amdgpu_ps float @test1(<4 x i32> inreg %rsrc, i32 %data, i32 %vindex, i32 %voffset) { @@ -70,7 +71,7 @@ main_body: ;CHECK-LABEL: {{^}}test3: ;CHECK: buffer_atomic_cmpswap {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], 0 glc ;CHECK: s_waitcnt vmcnt(0) -;CHECK: s_movk_i32 [[SOFS:s[0-9]+]], 0x1fff +;VI: s_movk_i32 [[SOFS:s[0-9]+]], 0x1fff ;CHECK: buffer_atomic_cmpswap {{v\[[0-9]+:[0-9]+\]}}, v2, s[0:3], 0 idxen glc ;CHECK: s_waitcnt vmcnt(0) ;CHECK: buffer_atomic_cmpswap {{v\[[0-9]+:[0-9]+\]}}, v3, s[0:3], 0 offen glc @@ -79,7 +80,8 @@ main_body: ;CHECK: s_waitcnt vmcnt(0) ;CHECK: buffer_atomic_cmpswap {{v\[[0-9]+:[0-9]+\]}}, v3, s[0:3], 0 offen offset:42 glc ;CHECK-DAG: s_waitcnt vmcnt(0) -;CHECK: buffer_atomic_cmpswap {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], [[SOFS]] offset:1 glc +;SICI: buffer_atomic_cmpswap {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}}, s[0:3], 0 offen glc +;VI: buffer_atomic_cmpswap {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], [[SOFS]] offset:1 glc define amdgpu_ps float @test3(<4 x i32> inreg %rsrc, i32 %data, i32 %cmp, i32 %vindex, i32 %voffset) { main_body: %o1 = call i32 @llvm.amdgcn.buffer.atomic.cmpswap(i32 %data, i32 %cmp, <4 x i32> %rsrc, i32 0, i32 0, i1 0) diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.load.format.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.load.format.ll index 9fe7161394a..67c7baba3e1 100644 --- a/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.load.format.ll +++ b/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.load.format.ll @@ -1,5 +1,5 @@ -;RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck %s -;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s +;RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck %s -check-prefix=CHECK -check-prefix=SICI +;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s -check-prefix=CHECK -check-prefix=VI ;CHECK-LABEL: {{^}}buffer_load: ;CHECK: buffer_load_format_xyzw v[0:3], off, s[0:3], 0 @@ -27,11 +27,15 @@ main_body: } ;CHECK-LABEL: {{^}}buffer_load_immoffs_large: -;CHECK-DAG: buffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], 61 offset:4095 -;CHECK-DAG: s_movk_i32 [[OFS1:s[0-9]+]], 0x7fff -;CHECK: buffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], [[OFS1]] offset:4093 -;CHECK: s_mov_b32 [[OFS2:s[0-9]+]], 0x8fff -;CHECK: buffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], [[OFS2]] offset:1 +;SICI: v_mov_b32_e32 [[VOFS:v[0-9]+]], 0x103c +;SICI: buffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, [[VOFS]], s[0:3], 0 offen +;SICI: buffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}}, s[0:3], 0 offen +;VI-DAG: buffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], 61 offset:4095 +;VI-DAG: s_movk_i32 [[OFS1:s[0-9]+]], 0x7fff +;VI: buffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], [[OFS1]] offset:4093 +;SICI: buffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}}, s[0:3], 0 offen +;VI: s_mov_b32 [[OFS2:s[0-9]+]], 0x8fff +;VI: buffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], [[OFS2]] offset:1 ;CHECK: s_waitcnt define amdgpu_ps <4 x float> @buffer_load_immoffs_large(<4 x i32> inreg) { main_body: @@ -44,11 +48,11 @@ main_body: } ;CHECK-LABEL: {{^}}buffer_load_immoffs_reuse: -;CHECK: s_movk_i32 [[OFS:s[0-9]+]], 0xfff -;CHECK: buffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], [[OFS]] offset:65 -;CHECK-NOT: s_mov -;CHECK: buffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], [[OFS]] offset:81 -;CHECK: s_waitcnt +;VI: s_movk_i32 [[OFS:s[0-9]+]], 0xfff +;VI: buffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], [[OFS]] offset:65 +;VI-NOT: s_mov +;VI: buffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], [[OFS]] offset:81 +;VI: s_waitcnt define amdgpu_ps <4 x float> @buffer_load_immoffs_reuse(<4 x i32> inreg) { main_body: %d.0 = call <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(<4 x i32> %0, i32 0, i32 4160, i1 0, i1 0) diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.load.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.load.ll index a547f007ce2..010ad276da1 100644 --- a/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.load.ll +++ b/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.load.ll @@ -1,5 +1,5 @@ -;RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck %s -;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s +;RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck %s -check-prefix=CHECK -check-prefix=SICI +;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s -check-prefix=CHECK -check-prefix=VI ;CHECK-LABEL: {{^}}buffer_load: ;CHECK: buffer_load_dwordx4 v[0:3], off, s[0:3], 0 @@ -27,8 +27,9 @@ main_body: } ;CHECK-LABEL: {{^}}buffer_load_immoffs_large: -;CHECK: s_movk_i32 [[OFFSET:s[0-9]+]], 0x1fff -;CHECK: buffer_load_dwordx4 v[0:3], off, s[0:3], [[OFFSET]] offset:1 +;SICI: buffer_load_dwordx4 v[0:3], {{v[0-9]+}}, s[0:3], 0 offen +;VI: s_movk_i32 [[OFFSET:s[0-9]+]], 0x1fff +;VI: buffer_load_dwordx4 v[0:3], off, s[0:3], [[OFFSET]] offset:1 ;CHECK: s_waitcnt define amdgpu_ps <4 x float> @buffer_load_immoffs_large(<4 x i32> inreg) { main_body: @@ -101,6 +102,16 @@ main_body: ret <2 x float> %data } +;CHECK-LABEL: {{^}}buffer_load_negative_offset: +;CHECK: v_add_i32_e32 [[VOFS:v[0-9]+]], vcc, -16, v0 +;CHECK: buffer_load_dwordx4 v[0:3], [[VOFS]], s[0:3], 0 offen +define amdgpu_ps <4 x float> @buffer_load_negative_offset(<4 x i32> inreg, i32 %ofs) { +main_body: + %ofs.1 = add i32 %ofs, -16 + %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %0, i32 0, i32 %ofs.1, i1 0, i1 0) + ret <4 x float> %data +} + declare float @llvm.amdgcn.buffer.load.f32(<4 x i32>, i32, i32, i1, i1) #0 declare <2 x float> @llvm.amdgcn.buffer.load.v2f32(<4 x i32>, i32, i32, i1, i1) #0 declare <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32>, i32, i32, i1, i1) #0 -- 2.50.1