From: Jay Foad <jay.foad@gmail.com> Date: Fri, 5 Jul 2019 14:52:48 +0000 (+0000) Subject: [AMDGPU] DPP combiner: recognize identities for more opcodes X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=9ec87924ff79788806056d9474aab671f464fa6d;p=llvm [AMDGPU] DPP combiner: recognize identities for more opcodes Summary: This allows the DPP combiner to kick in more often. For example the exclusive scan generated by the atomic optimizer for a divergent atomic add used to look like this: v_mov_b32_e32 v3, v1 v_mov_b32_e32 v5, v1 v_mov_b32_e32 v6, v1 v_mov_b32_dpp v3, v2 wave_shr:1 row_mask:0xf bank_mask:0xf s_nop 1 v_add_u32_dpp v4, v3, v3 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0 v_mov_b32_dpp v5, v3 row_shr:2 row_mask:0xf bank_mask:0xf v_mov_b32_dpp v6, v3 row_shr:3 row_mask:0xf bank_mask:0xf v_add3_u32 v3, v4, v5, v6 v_mov_b32_e32 v4, v1 s_nop 1 v_mov_b32_dpp v4, v3 row_shr:4 row_mask:0xf bank_mask:0xe v_add_u32_e32 v3, v3, v4 v_mov_b32_e32 v4, v1 s_nop 1 v_mov_b32_dpp v4, v3 row_shr:8 row_mask:0xf bank_mask:0xc v_add_u32_e32 v3, v3, v4 v_mov_b32_e32 v4, v1 s_nop 1 v_mov_b32_dpp v4, v3 row_bcast:15 row_mask:0xa bank_mask:0xf v_add_u32_e32 v3, v3, v4 s_nop 1 v_mov_b32_dpp v1, v3 row_bcast:31 row_mask:0xc bank_mask:0xf v_add_u32_e32 v1, v3, v1 v_add_u32_e32 v1, v2, v1 v_readlane_b32 s0, v1, 63 But now most of the dpp movs are combined into adds: v_mov_b32_e32 v3, v1 v_mov_b32_e32 v5, v1 s_nop 0 v_mov_b32_dpp v3, v2 wave_shr:1 row_mask:0xf bank_mask:0xf s_nop 1 v_add_u32_dpp v4, v3, v3 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0 v_mov_b32_dpp v5, v3 row_shr:2 row_mask:0xf bank_mask:0xf v_mov_b32_dpp v1, v3 row_shr:3 row_mask:0xf bank_mask:0xf v_add3_u32 v1, v4, v5, v1 s_nop 1 v_add_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xe s_nop 1 v_add_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xc s_nop 1 v_add_u32_dpp v1, v1, v1 row_bcast:15 row_mask:0xa bank_mask:0xf s_nop 1 v_add_u32_dpp v1, v1, v1 row_bcast:31 row_mask:0xc bank_mask:0xf v_add_u32_e32 v1, v2, v1 v_readlane_b32 s0, v1, 63 Reviewers: arsenm, vpykhtin Subscribers: kzhuravl, nemanjai, jvesely, wdng, nhaehnle, yaxunl, dstuttard, tpr, t-tye, hiraditya, kbarton, MaskRay, jfb, llvm-commits Tags: #llvm Differential Revision: https://reviews.llvm.org/D64207 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@365211 91177308-0d34-0410-b5e6-96231b3b80d8 --- diff --git a/lib/Target/AMDGPU/GCNDPPCombine.cpp b/lib/Target/AMDGPU/GCNDPPCombine.cpp index 536dc54a65c..7348b5b56c8 100644 --- a/lib/Target/AMDGPU/GCNDPPCombine.cpp +++ b/lib/Target/AMDGPU/GCNDPPCombine.cpp @@ -253,33 +253,46 @@ static bool isIdentityValue(unsigned OrigMIOp, MachineOperand *OldOpnd) { switch (OrigMIOp) { default: break; case AMDGPU::V_ADD_U32_e32: + case AMDGPU::V_ADD_U32_e64: case AMDGPU::V_ADD_I32_e32: + case AMDGPU::V_ADD_I32_e64: case AMDGPU::V_OR_B32_e32: + case AMDGPU::V_OR_B32_e64: case AMDGPU::V_SUBREV_U32_e32: + case AMDGPU::V_SUBREV_U32_e64: case AMDGPU::V_SUBREV_I32_e32: + case AMDGPU::V_SUBREV_I32_e64: case AMDGPU::V_MAX_U32_e32: + case AMDGPU::V_MAX_U32_e64: case AMDGPU::V_XOR_B32_e32: + case AMDGPU::V_XOR_B32_e64: if (OldOpnd->getImm() == 0) return true; break; case AMDGPU::V_AND_B32_e32: + case AMDGPU::V_AND_B32_e64: case AMDGPU::V_MIN_U32_e32: + case AMDGPU::V_MIN_U32_e64: if (static_cast<uint32_t>(OldOpnd->getImm()) == std::numeric_limits<uint32_t>::max()) return true; break; case AMDGPU::V_MIN_I32_e32: + case AMDGPU::V_MIN_I32_e64: if (static_cast<int32_t>(OldOpnd->getImm()) == std::numeric_limits<int32_t>::max()) return true; break; case AMDGPU::V_MAX_I32_e32: + case AMDGPU::V_MAX_I32_e64: if (static_cast<int32_t>(OldOpnd->getImm()) == std::numeric_limits<int32_t>::min()) return true; break; case AMDGPU::V_MUL_I32_I24_e32: + case AMDGPU::V_MUL_I32_I24_e64: case AMDGPU::V_MUL_U32_U24_e32: + case AMDGPU::V_MUL_U32_U24_e64: if (OldOpnd->getImm() == 1) return true; break; diff --git a/test/CodeGen/AMDGPU/dpp_combine.mir b/test/CodeGen/AMDGPU/dpp_combine.mir index fbeadcf2e00..d98cde5cff4 100644 --- a/test/CodeGen/AMDGPU/dpp_combine.mir +++ b/test/CodeGen/AMDGPU/dpp_combine.mir @@ -1,4 +1,4 @@ -# RUN: llc -march=amdgcn -mcpu=tonga -run-pass=gcn-dpp-combine -o - %s | FileCheck %s +# RUN: llc -march=amdgcn -mcpu=gfx900 -run-pass=gcn-dpp-combine -o - %s | FileCheck %s --- # old is undefined: only combine when masks are fully enabled and @@ -328,6 +328,30 @@ body: | %10:vgpr_32 = V_ADD_F32_e64 4, %9, 8, %0, 0, 0, implicit $exec ... +# check for e64 modifiers +# CHECK-LABEL: name: add_u32_e64 +# CHECK: %4:vgpr_32 = V_ADD_U32_dpp %2, %0, %1, 1, 15, 15, 1, implicit $exec +# CHECK: %6:vgpr_32 = V_ADD_U32_e64 %5, %1, 1, implicit $exec + +name: add_u32_e64 +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0, $vgpr1 + + %0:vgpr_32 = COPY $vgpr0 + %1:vgpr_32 = COPY $vgpr1 + %2:vgpr_32 = IMPLICIT_DEF + + ; this should be combined as all modifiers are default + %3:vgpr_32 = V_MOV_B32_dpp undef %2, %0, 1, 15, 15, 1, implicit $exec + %4:vgpr_32 = V_ADD_U32_e64 %3, %1, 0, implicit $exec + + ; this shouldn't be combined as clamp is set + %5:vgpr_32 = V_MOV_B32_dpp undef %2, %0, 1, 15, 15, 1, implicit $exec + %6:vgpr_32 = V_ADD_U32_e64 %5, %1, 1, implicit $exec +... + # tests on sequences of dpp consumers # CHECK-LABEL: name: dpp_seq # CHECK: %4:vgpr_32 = V_ADD_I32_dpp %1, %0, %1, 1, 14, 15, 0, implicit-def $vcc, implicit $exec