Summary:
This allows the DPP combiner to kick in more often. For example the
exclusive scan generated by the atomic optimizer for a divergent atomic
add used to look like this:
v_mov_b32_e32 v3, v1
v_mov_b32_e32 v5, v1
v_mov_b32_e32 v6, v1
v_mov_b32_dpp v3, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
s_nop 1
v_add_u32_dpp v4, v3, v3 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0
v_mov_b32_dpp v5, v3 row_shr:2 row_mask:0xf bank_mask:0xf
v_mov_b32_dpp v6, v3 row_shr:3 row_mask:0xf bank_mask:0xf
v_add3_u32 v3, v4, v5, v6
v_mov_b32_e32 v4, v1
s_nop 1
v_mov_b32_dpp v4, v3 row_shr:4 row_mask:0xf bank_mask:0xe
v_add_u32_e32 v3, v3, v4
v_mov_b32_e32 v4, v1
s_nop 1
v_mov_b32_dpp v4, v3 row_shr:8 row_mask:0xf bank_mask:0xc
v_add_u32_e32 v3, v3, v4
v_mov_b32_e32 v4, v1
s_nop 1
v_mov_b32_dpp v4, v3 row_bcast:15 row_mask:0xa bank_mask:0xf
v_add_u32_e32 v3, v3, v4
s_nop 1
v_mov_b32_dpp v1, v3 row_bcast:31 row_mask:0xc bank_mask:0xf
v_add_u32_e32 v1, v3, v1
v_add_u32_e32 v1, v2, v1
v_readlane_b32 s0, v1, 63
But now most of the dpp movs are combined into adds:
v_mov_b32_e32 v3, v1
v_mov_b32_e32 v5, v1
s_nop 0
v_mov_b32_dpp v3, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
s_nop 1
v_add_u32_dpp v4, v3, v3 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0
v_mov_b32_dpp v5, v3 row_shr:2 row_mask:0xf bank_mask:0xf
v_mov_b32_dpp v1, v3 row_shr:3 row_mask:0xf bank_mask:0xf
v_add3_u32 v1, v4, v5, v1
s_nop 1
v_add_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xe
s_nop 1
v_add_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xc
s_nop 1
v_add_u32_dpp v1, v1, v1 row_bcast:15 row_mask:0xa bank_mask:0xf
s_nop 1
v_add_u32_dpp v1, v1, v1 row_bcast:31 row_mask:0xc bank_mask:0xf
v_add_u32_e32 v1, v2, v1
v_readlane_b32 s0, v1, 63
Reviewers: arsenm, vpykhtin
Subscribers: kzhuravl, nemanjai, jvesely, wdng, nhaehnle, yaxunl, dstuttard, tpr, t-tye, hiraditya, kbarton, MaskRay, jfb, llvm-commits
Tags: #llvm
Differential Revision: https://reviews.llvm.org/D64207
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@365211
91177308-0d34-0410-b5e6-
96231b3b80d8
switch (OrigMIOp) {
default: break;
case AMDGPU::V_ADD_U32_e32:
+ case AMDGPU::V_ADD_U32_e64:
case AMDGPU::V_ADD_I32_e32:
+ case AMDGPU::V_ADD_I32_e64:
case AMDGPU::V_OR_B32_e32:
+ case AMDGPU::V_OR_B32_e64:
case AMDGPU::V_SUBREV_U32_e32:
+ case AMDGPU::V_SUBREV_U32_e64:
case AMDGPU::V_SUBREV_I32_e32:
+ case AMDGPU::V_SUBREV_I32_e64:
case AMDGPU::V_MAX_U32_e32:
+ case AMDGPU::V_MAX_U32_e64:
case AMDGPU::V_XOR_B32_e32:
+ case AMDGPU::V_XOR_B32_e64:
if (OldOpnd->getImm() == 0)
return true;
break;
case AMDGPU::V_AND_B32_e32:
+ case AMDGPU::V_AND_B32_e64:
case AMDGPU::V_MIN_U32_e32:
+ case AMDGPU::V_MIN_U32_e64:
if (static_cast<uint32_t>(OldOpnd->getImm()) ==
std::numeric_limits<uint32_t>::max())
return true;
break;
case AMDGPU::V_MIN_I32_e32:
+ case AMDGPU::V_MIN_I32_e64:
if (static_cast<int32_t>(OldOpnd->getImm()) ==
std::numeric_limits<int32_t>::max())
return true;
break;
case AMDGPU::V_MAX_I32_e32:
+ case AMDGPU::V_MAX_I32_e64:
if (static_cast<int32_t>(OldOpnd->getImm()) ==
std::numeric_limits<int32_t>::min())
return true;
break;
case AMDGPU::V_MUL_I32_I24_e32:
+ case AMDGPU::V_MUL_I32_I24_e64:
case AMDGPU::V_MUL_U32_U24_e32:
+ case AMDGPU::V_MUL_U32_U24_e64:
if (OldOpnd->getImm() == 1)
return true;
break;
-# RUN: llc -march=amdgcn -mcpu=tonga -run-pass=gcn-dpp-combine -o - %s | FileCheck %s
+# RUN: llc -march=amdgcn -mcpu=gfx900 -run-pass=gcn-dpp-combine -o - %s | FileCheck %s
---
# old is undefined: only combine when masks are fully enabled and
%10:vgpr_32 = V_ADD_F32_e64 4, %9, 8, %0, 0, 0, implicit $exec
...
+# check for e64 modifiers
+# CHECK-LABEL: name: add_u32_e64
+# CHECK: %4:vgpr_32 = V_ADD_U32_dpp %2, %0, %1, 1, 15, 15, 1, implicit $exec
+# CHECK: %6:vgpr_32 = V_ADD_U32_e64 %5, %1, 1, implicit $exec
+
+name: add_u32_e64
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $vgpr0, $vgpr1
+
+ %0:vgpr_32 = COPY $vgpr0
+ %1:vgpr_32 = COPY $vgpr1
+ %2:vgpr_32 = IMPLICIT_DEF
+
+ ; this should be combined as all modifiers are default
+ %3:vgpr_32 = V_MOV_B32_dpp undef %2, %0, 1, 15, 15, 1, implicit $exec
+ %4:vgpr_32 = V_ADD_U32_e64 %3, %1, 0, implicit $exec
+
+ ; this shouldn't be combined as clamp is set
+ %5:vgpr_32 = V_MOV_B32_dpp undef %2, %0, 1, 15, 15, 1, implicit $exec
+ %6:vgpr_32 = V_ADD_U32_e64 %5, %1, 1, implicit $exec
+...
+
# tests on sequences of dpp consumers
# CHECK-LABEL: name: dpp_seq
# CHECK: %4:vgpr_32 = V_ADD_I32_dpp %1, %0, %1, 1, 14, 15, 0, implicit-def $vcc, implicit $exec