From 7935774fce6df0588fdfde83e7c8991c753e545b Mon Sep 17 00:00:00 2001 From: Jay Foad Date: Tue, 16 Jul 2019 17:44:54 +0000 Subject: [PATCH] [AMDGPU] Optimize atomic max/min Summary: Extend the atomic optimizer to handle signed and unsigned max and min operations, as well as add and subtract. Reviewers: arsenm, sheredom, critson, rampitec Subscribers: kzhuravl, jvesely, wdng, nhaehnle, yaxunl, dstuttard, tpr, t-tye, hiraditya, jfb, llvm-commits Tags: #llvm Differential Revision: https://reviews.llvm.org/D64328 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@366235 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp | 177 ++++++++++++++---- .../atomic_optimizations_local_pointer.ll | 108 +++++++++++ 2 files changed, 249 insertions(+), 36 deletions(-) diff --git a/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp b/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp index 810861503be..c65a49b7c5b 100644 --- a/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp +++ b/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp @@ -40,7 +40,7 @@ enum DPP_CTRL { struct ReplacementInfo { Instruction *I; - Instruction::BinaryOps Op; + AtomicRMWInst::BinOp Op; unsigned ValIdx; bool ValDivergent; }; @@ -55,8 +55,8 @@ private: bool HasDPP; bool IsPixelShader; - void optimizeAtomic(Instruction &I, Instruction::BinaryOps Op, - unsigned ValIdx, bool ValDivergent) const; + void optimizeAtomic(Instruction &I, AtomicRMWInst::BinOp Op, unsigned ValIdx, + bool ValDivergent) const; public: static char ID; @@ -120,16 +120,17 @@ void AMDGPUAtomicOptimizer::visitAtomicRMWInst(AtomicRMWInst &I) { break; } - Instruction::BinaryOps Op; + AtomicRMWInst::BinOp Op = I.getOperation(); - switch (I.getOperation()) { + switch (Op) { default: return; case AtomicRMWInst::Add: - Op = Instruction::Add; - break; case AtomicRMWInst::Sub: - Op = Instruction::Sub; + case AtomicRMWInst::Max: + case AtomicRMWInst::Min: + case AtomicRMWInst::UMax: + case AtomicRMWInst::UMin: break; } @@ -161,7 +162,7 @@ void AMDGPUAtomicOptimizer::visitAtomicRMWInst(AtomicRMWInst &I) { } void AMDGPUAtomicOptimizer::visitIntrinsicInst(IntrinsicInst &I) { - Instruction::BinaryOps Op; + AtomicRMWInst::BinOp Op; switch (I.getIntrinsicID()) { default: @@ -169,12 +170,32 @@ void AMDGPUAtomicOptimizer::visitIntrinsicInst(IntrinsicInst &I) { case Intrinsic::amdgcn_buffer_atomic_add: case Intrinsic::amdgcn_struct_buffer_atomic_add: case Intrinsic::amdgcn_raw_buffer_atomic_add: - Op = Instruction::Add; + Op = AtomicRMWInst::Add; break; case Intrinsic::amdgcn_buffer_atomic_sub: case Intrinsic::amdgcn_struct_buffer_atomic_sub: case Intrinsic::amdgcn_raw_buffer_atomic_sub: - Op = Instruction::Sub; + Op = AtomicRMWInst::Sub; + break; + case Intrinsic::amdgcn_buffer_atomic_smin: + case Intrinsic::amdgcn_struct_buffer_atomic_smin: + case Intrinsic::amdgcn_raw_buffer_atomic_smin: + Op = AtomicRMWInst::Min; + break; + case Intrinsic::amdgcn_buffer_atomic_umin: + case Intrinsic::amdgcn_struct_buffer_atomic_umin: + case Intrinsic::amdgcn_raw_buffer_atomic_umin: + Op = AtomicRMWInst::UMin; + break; + case Intrinsic::amdgcn_buffer_atomic_smax: + case Intrinsic::amdgcn_struct_buffer_atomic_smax: + case Intrinsic::amdgcn_raw_buffer_atomic_smax: + Op = AtomicRMWInst::Max; + break; + case Intrinsic::amdgcn_buffer_atomic_umax: + case Intrinsic::amdgcn_struct_buffer_atomic_umax: + case Intrinsic::amdgcn_raw_buffer_atomic_umax: + Op = AtomicRMWInst::UMax; break; } @@ -206,8 +227,57 @@ void AMDGPUAtomicOptimizer::visitIntrinsicInst(IntrinsicInst &I) { ToReplace.push_back(Info); } +// Use the builder to create the non-atomic counterpart of the specified +// atomicrmw binary op. +static Value *buildNonAtomicBinOp(IRBuilder<> &B, AtomicRMWInst::BinOp Op, + Value *LHS, Value *RHS) { + CmpInst::Predicate Pred; + + switch (Op) { + default: + llvm_unreachable("Unhandled atomic op"); + case AtomicRMWInst::Add: + return B.CreateBinOp(Instruction::Add, LHS, RHS); + case AtomicRMWInst::Sub: + return B.CreateBinOp(Instruction::Sub, LHS, RHS); + + case AtomicRMWInst::Max: + Pred = CmpInst::ICMP_SGT; + break; + case AtomicRMWInst::Min: + Pred = CmpInst::ICMP_SLT; + break; + case AtomicRMWInst::UMax: + Pred = CmpInst::ICMP_UGT; + break; + case AtomicRMWInst::UMin: + Pred = CmpInst::ICMP_ULT; + break; + } + Value *Cond = B.CreateICmp(Pred, LHS, RHS); + return B.CreateSelect(Cond, LHS, RHS); +} + +static APInt getIdentityValueForAtomicOp(AtomicRMWInst::BinOp Op, + unsigned BitWidth) { + switch (Op) { + default: + llvm_unreachable("Unhandled atomic op"); + case AtomicRMWInst::Add: + case AtomicRMWInst::Sub: + case AtomicRMWInst::UMax: + return APInt::getMinValue(BitWidth); + case AtomicRMWInst::UMin: + return APInt::getMaxValue(BitWidth); + case AtomicRMWInst::Max: + return APInt::getSignedMinValue(BitWidth); + case AtomicRMWInst::Min: + return APInt::getSignedMaxValue(BitWidth); + } +} + void AMDGPUAtomicOptimizer::optimizeAtomic(Instruction &I, - Instruction::BinaryOps Op, + AtomicRMWInst::BinOp Op, unsigned ValIdx, bool ValDivergent) const { // Start building just before the instruction. @@ -266,16 +336,16 @@ void AMDGPUAtomicOptimizer::optimizeAtomic(Instruction &I, Value *const MbcntCast = B.CreateIntCast(Mbcnt, Ty, false); - Value *LaneOffset = nullptr; + Value *const Identity = B.getInt(getIdentityValueForAtomicOp(Op, TyBitWidth)); + + Value *ExclScan = nullptr; Value *NewV = nullptr; // If we have a divergent value in each lane, we need to combine the value // using DPP. if (ValDivergent) { - Value *const Identity = B.getIntN(TyBitWidth, 0); - - // First we need to set all inactive invocations to 0, so that they can - // correctly contribute to the final result. + // First we need to set all inactive invocations to the identity value, so + // that they can correctly contribute to the final result. CallInst *const SetInactive = B.CreateIntrinsic(Intrinsic::amdgcn_set_inactive, Ty, {V, Identity}); @@ -283,7 +353,7 @@ void AMDGPUAtomicOptimizer::optimizeAtomic(Instruction &I, B.CreateIntrinsic(Intrinsic::amdgcn_update_dpp, Ty, {Identity, SetInactive, B.getInt32(DPP_WF_SR1), B.getInt32(0xf), B.getInt32(0xf), B.getFalse()}); - NewV = FirstDPP; + ExclScan = FirstDPP; const unsigned Iters = 7; const unsigned DPPCtrl[Iters] = { @@ -295,21 +365,20 @@ void AMDGPUAtomicOptimizer::optimizeAtomic(Instruction &I, // This loop performs an exclusive scan across the wavefront, with all lanes // active (by using the WWM intrinsic). for (unsigned Idx = 0; Idx < Iters; Idx++) { - Value *const UpdateValue = Idx < 3 ? FirstDPP : NewV; + Value *const UpdateValue = Idx < 3 ? FirstDPP : ExclScan; CallInst *const DPP = B.CreateIntrinsic( Intrinsic::amdgcn_update_dpp, Ty, {Identity, UpdateValue, B.getInt32(DPPCtrl[Idx]), B.getInt32(RowMask[Idx]), B.getInt32(BankMask[Idx]), B.getFalse()}); - NewV = B.CreateBinOp(Op, NewV, DPP); + ExclScan = buildNonAtomicBinOp(B, Op, ExclScan, DPP); } - LaneOffset = B.CreateIntrinsic(Intrinsic::amdgcn_wwm, Ty, NewV); - NewV = B.CreateBinOp(Op, SetInactive, NewV); + NewV = buildNonAtomicBinOp(B, Op, SetInactive, ExclScan); // Read the value from the last lane, which has accumlated the values of - // each active lane in the wavefront. This will be our new value with which - // we will provide to the atomic operation. + // each active lane in the wavefront. This will be our new value which we + // will provide to the atomic operation. if (TyBitWidth == 64) { Value *const ExtractLo = B.CreateTrunc(NewV, B.getInt32Ty()); Value *const ExtractHi = @@ -324,9 +393,8 @@ void AMDGPUAtomicOptimizer::optimizeAtomic(Instruction &I, B.CreateInsertElement(PartialInsert, ReadLaneHi, B.getInt32(1)); NewV = B.CreateBitCast(Insert, Ty); } else if (TyBitWidth == 32) { - CallInst *const ReadLane = B.CreateIntrinsic(Intrinsic::amdgcn_readlane, - {}, {NewV, B.getInt32(63)}); - NewV = ReadLane; + NewV = B.CreateIntrinsic(Intrinsic::amdgcn_readlane, {}, + {NewV, B.getInt32(63)}); } else { llvm_unreachable("Unhandled atomic bit width"); } @@ -334,14 +402,32 @@ void AMDGPUAtomicOptimizer::optimizeAtomic(Instruction &I, // Finally mark the readlanes in the WWM section. NewV = B.CreateIntrinsic(Intrinsic::amdgcn_wwm, Ty, NewV); } else { - // Get the total number of active lanes we have by using popcount. - Instruction *const Ctpop = B.CreateUnaryIntrinsic(Intrinsic::ctpop, Ballot); - Value *const CtpopCast = B.CreateIntCast(Ctpop, Ty, false); - - // Calculate the new value we will be contributing to the atomic operation - // for the entire wavefront. - NewV = B.CreateMul(V, CtpopCast); - LaneOffset = B.CreateMul(V, MbcntCast); + switch (Op) { + default: + llvm_unreachable("Unhandled atomic op"); + + case AtomicRMWInst::Add: + case AtomicRMWInst::Sub: { + // Get the total number of active lanes we have by using popcount. + Instruction *const Ctpop = + B.CreateUnaryIntrinsic(Intrinsic::ctpop, Ballot); + Value *const CtpopCast = B.CreateIntCast(Ctpop, Ty, false); + + // Calculate the new value we will be contributing to the atomic operation + // for the entire wavefront. + NewV = B.CreateMul(V, CtpopCast); + break; + } + + case AtomicRMWInst::Max: + case AtomicRMWInst::Min: + case AtomicRMWInst::UMax: + case AtomicRMWInst::UMin: + // Max/min with a uniform value is idempotent: doing the atomic operation + // multiple times has the same effect as doing it once. + NewV = V; + break; + } } // We only want a single lane to enter our new control flow, and we do this @@ -407,7 +493,26 @@ void AMDGPUAtomicOptimizer::optimizeAtomic(Instruction &I, // get our individual lane's slice into the result. We use the lane offset we // previously calculated combined with the atomic result value we got from the // first lane, to get our lane's index into the atomic result. - Value *const Result = B.CreateBinOp(Op, BroadcastI, LaneOffset); + Value *LaneOffset = nullptr; + if (ValDivergent) { + LaneOffset = B.CreateIntrinsic(Intrinsic::amdgcn_wwm, Ty, ExclScan); + } else { + switch (Op) { + default: + llvm_unreachable("Unhandled atomic op"); + case AtomicRMWInst::Add: + case AtomicRMWInst::Sub: + LaneOffset = B.CreateMul(V, MbcntCast); + break; + case AtomicRMWInst::Max: + case AtomicRMWInst::Min: + case AtomicRMWInst::UMax: + case AtomicRMWInst::UMin: + LaneOffset = B.CreateSelect(Cond, Identity, V); + break; + } + } + Value *const Result = buildNonAtomicBinOp(B, Op, BroadcastI, LaneOffset); if (IsPixelShader) { // Need a final PHI to reconverge to above the helper lane branch mask. diff --git a/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll b/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll index f3d50c9c490..5f7649c1c0e 100644 --- a/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll +++ b/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll @@ -194,3 +194,111 @@ entry: store i64 %old, i64 addrspace(1)* %out ret void } + +; GCN-LABEL: max_i32_varying: +; GFX8MORE: v_readlane_b32 s[[scalar_value:[0-9]+]], v{{[0-9]+}}, 63 +; GFX8MORE: v_mov_b32{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[scalar_value]] +; GFX8MORE: ds_max_rtn_i32 v{{[0-9]+}}, v{{[0-9]+}}, v[[value]] +define amdgpu_kernel void @max_i32_varying(i32 addrspace(1)* %out) { +entry: + %lane = call i32 @llvm.amdgcn.workitem.id.x() + %old = atomicrmw max i32 addrspace(3)* @local_var32, i32 %lane acq_rel + store i32 %old, i32 addrspace(1)* %out + ret void +} + +; GCN-LABEL: max_i64_constant: +; GCN: v_cmp_ne_u32_e64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, 1, 0 +; GCN: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_lo:[0-9]+]], s[[exec_lo]], 0 +; GCN: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_hi:[0-9]+]], s[[exec_hi]], v[[mbcnt_lo]] +; GCN: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc, 0, v[[mbcnt_hi]] +; GCN: v_mov_b32{{(_e[0-9]+)?}} v[[value_lo:[0-9]+]], 5 +; GCN: v_mov_b32{{(_e[0-9]+)?}} v[[value_hi:[0-9]+]], 0 +; GCN: ds_max_rtn_i64 v{{\[}}{{[0-9]+}}:{{[0-9]+}}{{\]}}, v{{[0-9]+}}, v{{\[}}[[value_lo]]:[[value_hi]]{{\]}} +define amdgpu_kernel void @max_i64_constant(i64 addrspace(1)* %out) { +entry: + %old = atomicrmw max i64 addrspace(3)* @local_var64, i64 5 acq_rel + store i64 %old, i64 addrspace(1)* %out + ret void +} + +; GCN-LABEL: min_i32_varying: +; GFX8MORE: v_readlane_b32 s[[scalar_value:[0-9]+]], v{{[0-9]+}}, 63 +; GFX8MORE: v_mov_b32{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[scalar_value]] +; GFX8MORE: ds_min_rtn_i32 v{{[0-9]+}}, v{{[0-9]+}}, v[[value]] +define amdgpu_kernel void @min_i32_varying(i32 addrspace(1)* %out) { +entry: + %lane = call i32 @llvm.amdgcn.workitem.id.x() + %old = atomicrmw min i32 addrspace(3)* @local_var32, i32 %lane acq_rel + store i32 %old, i32 addrspace(1)* %out + ret void +} + +; GCN-LABEL: min_i64_constant: +; GCN: v_cmp_ne_u32_e64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, 1, 0 +; GCN: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_lo:[0-9]+]], s[[exec_lo]], 0 +; GCN: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_hi:[0-9]+]], s[[exec_hi]], v[[mbcnt_lo]] +; GCN: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc, 0, v[[mbcnt_hi]] +; GCN: v_mov_b32{{(_e[0-9]+)?}} v[[value_lo:[0-9]+]], 5 +; GCN: v_mov_b32{{(_e[0-9]+)?}} v[[value_hi:[0-9]+]], 0 +; GCN: ds_min_rtn_i64 v{{\[}}{{[0-9]+}}:{{[0-9]+}}{{\]}}, v{{[0-9]+}}, v{{\[}}[[value_lo]]:[[value_hi]]{{\]}} +define amdgpu_kernel void @min_i64_constant(i64 addrspace(1)* %out) { +entry: + %old = atomicrmw min i64 addrspace(3)* @local_var64, i64 5 acq_rel + store i64 %old, i64 addrspace(1)* %out + ret void +} + +; GCN-LABEL: umax_i32_varying: +; GFX8MORE: v_readlane_b32 s[[scalar_value:[0-9]+]], v{{[0-9]+}}, 63 +; GFX8MORE: v_mov_b32{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[scalar_value]] +; GFX8MORE: ds_max_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, v[[value]] +define amdgpu_kernel void @umax_i32_varying(i32 addrspace(1)* %out) { +entry: + %lane = call i32 @llvm.amdgcn.workitem.id.x() + %old = atomicrmw umax i32 addrspace(3)* @local_var32, i32 %lane acq_rel + store i32 %old, i32 addrspace(1)* %out + ret void +} + +; GCN-LABEL: umax_i64_constant: +; GCN: v_cmp_ne_u32_e64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, 1, 0 +; GCN: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_lo:[0-9]+]], s[[exec_lo]], 0 +; GCN: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_hi:[0-9]+]], s[[exec_hi]], v[[mbcnt_lo]] +; GCN: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc, 0, v[[mbcnt_hi]] +; GCN: v_mov_b32{{(_e[0-9]+)?}} v[[value_lo:[0-9]+]], 5 +; GCN: v_mov_b32{{(_e[0-9]+)?}} v[[value_hi:[0-9]+]], 0 +; GCN: ds_max_rtn_u64 v{{\[}}{{[0-9]+}}:{{[0-9]+}}{{\]}}, v{{[0-9]+}}, v{{\[}}[[value_lo]]:[[value_hi]]{{\]}} +define amdgpu_kernel void @umax_i64_constant(i64 addrspace(1)* %out) { +entry: + %old = atomicrmw umax i64 addrspace(3)* @local_var64, i64 5 acq_rel + store i64 %old, i64 addrspace(1)* %out + ret void +} + +; GCN-LABEL: umin_i32_varying: +; GFX8MORE: v_readlane_b32 s[[scalar_value:[0-9]+]], v{{[0-9]+}}, 63 +; GFX8MORE: v_mov_b32{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[scalar_value]] +; GFX8MORE: ds_min_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, v[[value]] +define amdgpu_kernel void @umin_i32_varying(i32 addrspace(1)* %out) { +entry: + %lane = call i32 @llvm.amdgcn.workitem.id.x() + %old = atomicrmw umin i32 addrspace(3)* @local_var32, i32 %lane acq_rel + store i32 %old, i32 addrspace(1)* %out + ret void +} + +; GCN-LABEL: umin_i64_constant: +; GCN: v_cmp_ne_u32_e64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, 1, 0 +; GCN: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_lo:[0-9]+]], s[[exec_lo]], 0 +; GCN: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_hi:[0-9]+]], s[[exec_hi]], v[[mbcnt_lo]] +; GCN: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc, 0, v[[mbcnt_hi]] +; GCN: v_mov_b32{{(_e[0-9]+)?}} v[[value_lo:[0-9]+]], 5 +; GCN: v_mov_b32{{(_e[0-9]+)?}} v[[value_hi:[0-9]+]], 0 +; GCN: ds_min_rtn_u64 v{{\[}}{{[0-9]+}}:{{[0-9]+}}{{\]}}, v{{[0-9]+}}, v{{\[}}[[value_lo]]:[[value_hi]]{{\]}} +define amdgpu_kernel void @umin_i64_constant(i64 addrspace(1)* %out) { +entry: + %old = atomicrmw umin i64 addrspace(3)* @local_var64, i64 5 acq_rel + store i64 %old, i64 addrspace(1)* %out + ret void +} -- 2.40.0