From: Stanislav Mekhanoshin
Date: Wed, 26 Jul 2017 21:29:15 +0000 (+0000)
Subject: [AMDGPU] Optimize SI_IF lowering for simple if regions
X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=ddb10d2e51d6a55b9cff98797d275163ed14a5c3;p=llvm

[AMDGPU] Optimize SI_IF lowering for simple if regions

Currently SI_IF lowering produces an s_and_saveexec_b64 followed by an
s_xor_b64. The xor is used to extract only the bits that were cleared
from the exec mask. In the case of a simple if region, where the only use
of that value is the SI_END_CF that restores the old exec mask, we can
omit the xor and instead or the exec mask with the original exec value
saved by the s_and_saveexec_b64.
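To make the effect concrete, here is a rough before/after sketch of the
lowered code for a divergent if; the SGPR pair s[2:3] and the vcc
condition are illustrative placeholders, not values taken from any of the
tests below:

  ; before: SI_IF always emitted the save/and plus an xor
  s_and_saveexec_b64 s[2:3], vcc   ; s[2:3] = exec, exec = exec & vcc
  s_xor_b64 s[2:3], exec, s[2:3]   ; s[2:3] = bits cleared from exec
  ; ... if block ...
  s_or_b64 exec, exec, s[2:3]      ; SI_END_CF: set the cleared bits again

  ; after: simple if region, saved value only used by SI_END_CF
  s_and_saveexec_b64 s[2:3], vcc   ; s[2:3] = old exec, exec = exec & vcc
  ; ... if block ...
  s_or_b64 exec, exec, s[2:3]      ; SI_END_CF: or with the full old exec mask

Both sequences leave the original exec mask in place after SI_END_CF; the
simple-if form just drops the xor.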
Differential Revision: https://reviews.llvm.org/D35861

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@309185 91177308-0d34-0410-b5e6-96231b3b80d8
---

diff --git a/lib/Target/AMDGPU/SILowerControlFlow.cpp b/lib/Target/AMDGPU/SILowerControlFlow.cpp
index 5f1c7f1fc42..de86c19a752 100644
--- a/lib/Target/AMDGPU/SILowerControlFlow.cpp
+++ b/lib/Target/AMDGPU/SILowerControlFlow.cpp
@@ -149,9 +149,19 @@ void SILowerControlFlow::emitIf(MachineInstr &MI) {
   MachineOperand &ImpDefSCC = MI.getOperand(4);
   assert(ImpDefSCC.getReg() == AMDGPU::SCC && ImpDefSCC.isDef());
 
+  // If there is only one use of save exec register and that use is SI_END_CF,
+  // we can optimize SI_IF by returning the full saved exec mask instead of
+  // just cleared bits.
+  bool SimpleIf = false;
+  auto U = MRI->use_instr_nodbg_begin(SaveExecReg);
+  SimpleIf = U != MRI->use_instr_nodbg_end() &&
+             std::next(U) == MRI->use_instr_nodbg_end() &&
+             U->getOpcode() == AMDGPU::SI_END_CF;
+
   // Add an implicit def of exec to discourage scheduling VALU after this which
   // will interfere with trying to form s_and_saveexec_b64 later.
-  unsigned CopyReg = MRI->createVirtualRegister(&AMDGPU::SReg_64RegClass);
+  unsigned CopyReg = SimpleIf ? SaveExecReg
+                     : MRI->createVirtualRegister(&AMDGPU::SReg_64RegClass);
   MachineInstr *CopyExec =
     BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), CopyReg)
     .addReg(AMDGPU::EXEC)
@@ -166,11 +176,14 @@ void SILowerControlFlow::emitIf(MachineInstr &MI) {
     .addReg(Cond.getReg());
   setImpSCCDefDead(*And, true);
 
-  MachineInstr *Xor =
-    BuildMI(MBB, I, DL, TII->get(AMDGPU::S_XOR_B64), SaveExecReg)
-    .addReg(Tmp)
-    .addReg(CopyReg);
-  setImpSCCDefDead(*Xor, ImpDefSCC.isDead());
+  MachineInstr *Xor = nullptr;
+  if (!SimpleIf) {
+    Xor =
+      BuildMI(MBB, I, DL, TII->get(AMDGPU::S_XOR_B64), SaveExecReg)
+      .addReg(Tmp)
+      .addReg(CopyReg);
+    setImpSCCDefDead(*Xor, ImpDefSCC.isDead());
+  }
 
   // Use a copy that is a terminator to get correct spill code placement it with
   // fast regalloc.
@@ -194,7 +207,8 @@ void SILowerControlFlow::emitIf(MachineInstr &MI) {
   // register.
   LIS->ReplaceMachineInstrInMaps(MI, *And);
 
-  LIS->InsertMachineInstrInMaps(*Xor);
+  if (!SimpleIf)
+    LIS->InsertMachineInstrInMaps(*Xor);
   LIS->InsertMachineInstrInMaps(*SetExec);
   LIS->InsertMachineInstrInMaps(*NewBr);
 
@@ -207,7 +221,8 @@ void SILowerControlFlow::emitIf(MachineInstr &MI) {
   LIS->removeInterval(SaveExecReg);
   LIS->createAndComputeVirtRegInterval(SaveExecReg);
   LIS->createAndComputeVirtRegInterval(Tmp);
-  LIS->createAndComputeVirtRegInterval(CopyReg);
+  if (!SimpleIf)
+    LIS->createAndComputeVirtRegInterval(CopyReg);
 }
 
 void SILowerControlFlow::emitElse(MachineInstr &MI) {
diff --git a/test/CodeGen/AMDGPU/branch-condition-and.ll b/test/CodeGen/AMDGPU/branch-condition-and.ll
index 662ea37a2b9..51b91ee286f 100644
--- a/test/CodeGen/AMDGPU/branch-condition-and.ll
+++ b/test/CodeGen/AMDGPU/branch-condition-and.ll
@@ -14,7 +14,6 @@
 ; GCN-DAG: v_cmp_lt_f32_e32 vcc,
 ; GCN: s_and_b64 [[AND:s\[[0-9]+:[0-9]+\]]], vcc, [[OTHERCC]]
 ; GCN: s_and_saveexec_b64 [[SAVED:s\[[0-9]+:[0-9]+\]]], [[AND]]
-; GCN: s_xor_b64 {{s\[[0-9]+:[0-9]+\]}}, exec, [[SAVED]]
 ; GCN: ; mask branch [[BB5:BB[0-9]+_[0-9]+]]
 
 ; GCN-NEXT: BB{{[0-9]+_[0-9]+}}: ; %bb4
diff --git a/test/CodeGen/AMDGPU/branch-relaxation.ll b/test/CodeGen/AMDGPU/branch-relaxation.ll
index 15f579eb06d..233d5a5822f 100644
--- a/test/CodeGen/AMDGPU/branch-relaxation.ll
+++ b/test/CodeGen/AMDGPU/branch-relaxation.ll
@@ -141,7 +141,6 @@ bb3:
 ; GCN: buffer_load_dword
 ; GCN: v_cmp_ne_u32_e32 vcc, 0, v{{[0-9]+}}
 ; GCN: s_and_saveexec_b64 [[SAVE:s\[[0-9]+:[0-9]+\]]], vcc
-; GCN: s_xor_b64 [[SAVE]], exec, [[SAVE]]
 
 ; GCN: v_nop_e64
 ; GCN: v_nop_e64
@@ -385,7 +384,6 @@ bb3:
 ; GCN-LABEL: {{^}}uniform_inside_divergent:
 ; GCN: v_cmp_gt_u32_e32 vcc, 16, v{{[0-9]+}}
 ; GCN-NEXT: s_and_saveexec_b64 [[MASK:s\[[0-9]+:[0-9]+\]]], vcc
-; GCN-NEXT: s_xor_b64 [[MASK1:s\[[0-9]+:[0-9]+\]]], exec, [[MASK]]
 ; GCN-NEXT: ; mask branch [[ENDIF:BB[0-9]+_[0-9]+]]
 ; GCN-NEXT: s_cbranch_execnz [[IF:BB[0-9]+_[0-9]+]]
 
@@ -436,7 +434,6 @@ endif:
 ; GCN-LABEL: {{^}}analyze_mask_branch:
 ; GCN: v_cmp_lt_f32_e32 vcc
 ; GCN-NEXT: s_and_saveexec_b64 [[MASK:s\[[0-9]+:[0-9]+\]]], vcc
-; GCN-NEXT: s_xor_b64 [[MASK]], exec, [[MASK]]
 ; GCN-NEXT: ; mask branch [[RET:BB[0-9]+_[0-9]+]]
 ; GCN-NEXT: s_cbranch_execz [[BRANCH_SKIP:BB[0-9]+_[0-9]+]]
 ; GCN-NEXT: s_branch [[LOOP_BODY:BB[0-9]+_[0-9]+]]
diff --git a/test/CodeGen/AMDGPU/control-flow-fastregalloc.ll b/test/CodeGen/AMDGPU/control-flow-fastregalloc.ll
index 79d9b169187..08a31ac1847 100644
--- a/test/CodeGen/AMDGPU/control-flow-fastregalloc.ll
+++ b/test/CodeGen/AMDGPU/control-flow-fastregalloc.ll
@@ -20,7 +20,6 @@
 ; GCN: v_cmp_eq_u32_e64 [[CMP0:s\[[0-9]+:[0-9]\]]], s{{[0-9]+}}, v0
 ; GCN: s_mov_b64 s{{\[}}[[SAVEEXEC_LO:[0-9]+]]:[[SAVEEXEC_HI:[0-9]+]]{{\]}}, exec
 ; GCN: s_and_b64 s{{\[}}[[ANDEXEC_LO:[0-9]+]]:[[ANDEXEC_HI:[0-9]+]]{{\]}}, s{{\[}}[[SAVEEXEC_LO]]:[[SAVEEXEC_HI]]{{\]}}, [[CMP0]]
-; GCN: s_xor_b64 s{{\[}}[[SAVEEXEC_LO]]:[[SAVEEXEC_HI]]{{\]}}, s{{\[}}[[ANDEXEC_LO]]:[[ANDEXEC_HI]]{{\]}}, s{{\[}}[[SAVEEXEC_LO]]:[[SAVEEXEC_HI]]{{\]}}
 
 ; Spill saved exec
 ; VGPR: v_writelane_b32 [[SPILL_VGPR:v[0-9]+]], s[[SAVEEXEC_LO]], [[SAVEEXEC_LO_LANE:[0-9]+]]
@@ -101,7 +100,6 @@ endif:
 ; GCN: s_mov_b64 s{{\[}}[[SAVEEXEC_LO:[0-9]+]]:[[SAVEEXEC_HI:[0-9]+]]{{\]}}, exec
 ; GCN: s_and_b64 s{{\[}}[[ANDEXEC_LO:[0-9]+]]:[[ANDEXEC_HI:[0-9]+]]{{\]}}, s{{\[}}[[SAVEEXEC_LO:[0-9]+]]:[[SAVEEXEC_HI:[0-9]+]]{{\]}}, [[CMP0]]
-; GCN: s_xor_b64 s{{\[}}[[SAVEEXEC_LO]]:[[SAVEEXEC_HI]]{{\]}}, s{{\[}}[[ANDEXEC_LO]]:[[ANDEXEC_HI]]{{\]}}, s{{\[}}[[SAVEEXEC_LO]]:[[SAVEEXEC_HI]]{{\]}}
 
 ; Spill load
 ; GCN: buffer_store_dword [[LOAD0]], off, s[0:3], s7 offset:4 ; 4-byte Folded Spill
diff --git a/test/CodeGen/AMDGPU/i1-copy-phi.ll b/test/CodeGen/AMDGPU/i1-copy-phi.ll
index 4a0213dd1de..63a9f1feb6d 100644
--- a/test/CodeGen/AMDGPU/i1-copy-phi.ll
+++ b/test/CodeGen/AMDGPU/i1-copy-phi.ll
@@ -4,11 +4,9 @@
 ; SI-LABEL: {{^}}br_i1_phi:
 ; SI: v_mov_b32_e32 [[REG:v[0-9]+]], 0{{$}}
 ; SI: s_and_saveexec_b64
-; SI: s_xor_b64
 ; SI: v_mov_b32_e32 [[REG]], -1{{$}}
 ; SI: v_cmp_ne_u32_e32 vcc, 0, [[REG]]
 ; SI: s_and_saveexec_b64
-; SI: s_xor_b64
 ; SI: s_endpgm
 define amdgpu_kernel void @br_i1_phi(i32 %arg) {
 bb:
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.div.fmas.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.div.fmas.ll
index 1b3e09a81e5..25b0ad54421 100644
--- a/test/CodeGen/AMDGPU/llvm.amdgcn.div.fmas.ll
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.div.fmas.ll
@@ -137,7 +137,6 @@ define amdgpu_kernel void @test_div_fmas_f32_logical_cond_to_vcc(float addrspace
 ; GCN-LABEL: {{^}}test_div_fmas_f32_i1_phi_vcc:
 ; SI: v_cmp_eq_u32_e32 vcc, 0, v{{[0-9]+}}
 ; SI: s_and_saveexec_b64 [[SAVE:s\[[0-9]+:[0-9]+\]]], vcc
-; SI: s_xor_b64 [[SAVE]], exec, [[SAVE]]
 
 ; SI: buffer_load_dword [[LOAD:v[0-9]+]]
 ; SI: v_cmp_ne_u32_e32 vcc, 0, [[LOAD]]
diff --git a/test/CodeGen/AMDGPU/multi-divergent-exit-region.ll b/test/CodeGen/AMDGPU/multi-divergent-exit-region.ll
index ba3ff0b08bc..943beb6bb73 100644
--- a/test/CodeGen/AMDGPU/multi-divergent-exit-region.ll
+++ b/test/CodeGen/AMDGPU/multi-divergent-exit-region.ll
@@ -81,7 +81,6 @@
 ; GCN-NEXT: s_or_b64 exec, exec
 ; GCN: v_cmp_ne_u32_e32 vcc, 0
 ; GCN-NEXT: s_and_saveexec_b64
-; GCN-NEXT: s_xor_b64
 
 ; GCN: ; %exit0
 ; GCN: buffer_store_dword
diff --git a/test/CodeGen/AMDGPU/ret_jump.ll b/test/CodeGen/AMDGPU/ret_jump.ll
index 1acae60f305..7c2e28108df 100644
--- a/test/CodeGen/AMDGPU/ret_jump.ll
+++ b/test/CodeGen/AMDGPU/ret_jump.ll
@@ -11,7 +11,6 @@
 
 ; GCN-NEXT: ; %else
 ; GCN: s_and_saveexec_b64 [[SAVE_EXEC:s\[[0-9]+:[0-9]+\]]], vcc
-; GCN-NEXT: s_xor_b64 [[XOR_EXEC:s\[[0-9]+:[0-9]+\]]], exec, [[SAVE_EXEC]]
 ; GCN-NEXT: ; mask branch [[FLOW:BB[0-9]+_[0-9]+]]
 
 ; GCN: BB{{[0-9]+_[0-9]+}}: ; %unreachable.bb
@@ -60,7 +59,6 @@ ret.bb: ; preds = %else, %main_body
 
 ; GCN: ; BB#{{[0-9]+}}: ; %else
 ; GCN: s_and_saveexec_b64 [[SAVE_EXEC:s\[[0-9]+:[0-9]+\]]], vcc
-; GCN-NEXT: s_xor_b64 [[XOR_EXEC:s\[[0-9]+:[0-9]+\]]], exec, [[SAVE_EXEC]]
 ; GCN-NEXT: ; mask branch [[FLOW1:BB[0-9]+_[0-9]+]]
 ; GCN-NEXT: ; %unreachable.bb
diff --git a/test/CodeGen/AMDGPU/si-lower-control-flow-unreachable-block.ll b/test/CodeGen/AMDGPU/si-lower-control-flow-unreachable-block.ll
index 5b0d5274d5b..f82f7731227 100644
--- a/test/CodeGen/AMDGPU/si-lower-control-flow-unreachable-block.ll
+++ b/test/CodeGen/AMDGPU/si-lower-control-flow-unreachable-block.ll
@@ -3,7 +3,6 @@
 ; GCN-LABEL: {{^}}lower_control_flow_unreachable_terminator:
 ; GCN: v_cmp_eq_u32
 ; GCN: s_and_saveexec_b64
-; GCN: s_xor_b64
 ; GCN: ; mask branch [[RET:BB[0-9]+_[0-9]+]]
 
 ; GCN-NEXT: BB{{[0-9]+_[0-9]+}}: ; %unreachable
@@ -31,7 +30,6 @@ ret:
 ; GCN-LABEL: {{^}}lower_control_flow_unreachable_terminator_swap_block_order:
 ; GCN: v_cmp_ne_u32
 ; GCN: s_and_saveexec_b64
-; GCN: s_xor_b64
 ; GCN: ; mask branch [[RETURN:BB[0-9]+_[0-9]+]]
 
 ; GCN-NEXT: {{^BB[0-9]+_[0-9]+}}: ; %unreachable
diff --git a/test/CodeGen/AMDGPU/skip-if-dead.ll b/test/CodeGen/AMDGPU/skip-if-dead.ll
index ea8b87f1dee..0c052ae0a5d 100644
--- a/test/CodeGen/AMDGPU/skip-if-dead.ll
+++ b/test/CodeGen/AMDGPU/skip-if-dead.ll
@@ -202,7 +202,6 @@ exit:
 ; CHECK-LABEL: {{^}}test_kill_divergent_loop:
 ; CHECK: v_cmp_eq_u32_e32 vcc, 0, v0
 ; CHECK-NEXT: s_and_saveexec_b64 [[SAVEEXEC:s\[[0-9]+:[0-9]+\]]], vcc
-; CHECK-NEXT: s_xor_b64 [[SAVEEXEC]], exec, [[SAVEEXEC]]
 ; CHECK-NEXT: ; mask branch [[EXIT:BB[0-9]+_[0-9]+]]
 ; CHECK-NEXT: s_cbranch_execz [[EXIT]]
 
@@ -337,7 +336,6 @@ bb7: ; preds = %bb4
 ; CHECK-LABEL: {{^}}if_after_kill_block:
 ; CHECK: ; BB#0:
 ; CHECK: s_and_saveexec_b64
-; CHECK: s_xor_b64
 ; CHECK-NEXT: mask branch [[BB4:BB[0-9]+_[0-9]+]]
 
 ; CHECK: v_cmpx_le_f32_e32 vcc, 0,
@@ -347,7 +345,6 @@ bb7: ; preds = %bb4
 
 ; CHECK: v_cmp_neq_f32_e32 vcc, 0,
 ; CHECK: s_and_saveexec_b64 s{{\[[0-9]+:[0-9]+\]}}, vcc
-; CHECK: s_xor_b64 s{{\[[0-9]+:[0-9]+\]}}, exec
 ; CHECK: mask branch [[END:BB[0-9]+_[0-9]+]]
 ; CHECK-NOT: branch
 
diff --git a/test/CodeGen/AMDGPU/subreg-coalescer-undef-use.ll b/test/CodeGen/AMDGPU/subreg-coalescer-undef-use.ll
index 35615c40d49..fb9628cf7b9 100644
--- a/test/CodeGen/AMDGPU/subreg-coalescer-undef-use.ll
+++ b/test/CodeGen/AMDGPU/subreg-coalescer-undef-use.ll
@@ -10,7 +10,6 @@ target triple="amdgcn--"
 ; CHECK: v_mbcnt_lo_u32_b32_e64
 ; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
 ; CHECK-NEXT: s_and_saveexec_b64 s[2:3], vcc
-; CHECK-NEXT: s_xor_b64 s[2:3], exec, s[2:3]
 ; BB0_1:
 ; CHECK: s_load_dword s0, s[0:1], 0xa
 ; CHECK-NEXT: s_waitcnt lgkmcnt(0)
diff --git a/test/CodeGen/AMDGPU/uniform-cfg.ll b/test/CodeGen/AMDGPU/uniform-cfg.ll
index a9d45d71fa2..9416336a44e 100644
--- a/test/CodeGen/AMDGPU/uniform-cfg.ll
+++ b/test/CodeGen/AMDGPU/uniform-cfg.ll
@@ -303,7 +303,6 @@ done:
 ; GCN-LABEL: {{^}}uniform_inside_divergent:
 ; GCN: v_cmp_gt_u32_e32 vcc, 16, v{{[0-9]+}}
 ; GCN: s_and_saveexec_b64 [[MASK:s\[[0-9]+:[0-9]+\]]], vcc
-; GCN: s_xor_b64 [[MASK1:s\[[0-9]+:[0-9]+\]]], exec, [[MASK]]
 ; GCN: s_cmp_lg_u32 {{s[0-9]+}}, 0
 ; GCN: s_cbranch_scc0 [[IF_UNIFORM_LABEL:[A-Z0-9_a-z]+]]
 ; GCN: s_endpgm
@@ -335,7 +334,6 @@ endif:
 ; GCN: [[IF_LABEL]]:
 ; GCN: v_cmp_gt_u32_e32 vcc, 16, v{{[0-9]+}}
 ; GCN: s_and_saveexec_b64 [[MASK:s\[[0-9]+:[0-9]+\]]], vcc
-; GCN: s_xor_b64 [[MASK1:s\[[0-9]+:[0-9]+\]]], exec, [[MASK]]
 ; GCN: v_mov_b32_e32 [[ONE:v[0-9]+]], 1
 ; GCN: buffer_store_dword [[ONE]]
 define amdgpu_kernel void @divergent_inside_uniform(i32 addrspace(1)* %out, i32 %cond) {
@@ -360,7 +358,6 @@ endif:
 ; GCN-LABEL: {{^}}divergent_if_uniform_if:
 ; GCN: v_cmp_eq_u32_e32 vcc, 0, v0
 ; GCN: s_and_saveexec_b64 [[MASK:s\[[0-9]+:[0-9]+\]]], vcc
-; GCN: s_xor_b64 [[MASK:s\[[0-9]+:[0-9]+\]]], exec, [[MASK]]
 ; GCN: v_mov_b32_e32 [[ONE:v[0-9]+]], 1
 ; GCN: buffer_store_dword [[ONE]]
 ; GCN: s_or_b64 exec, exec, [[MASK]]
diff --git a/test/CodeGen/AMDGPU/uniform-loop-inside-nonuniform.ll b/test/CodeGen/AMDGPU/uniform-loop-inside-nonuniform.ll
index 8a08f9d8bb0..82283f39792 100644
--- a/test/CodeGen/AMDGPU/uniform-loop-inside-nonuniform.ll
+++ b/test/CodeGen/AMDGPU/uniform-loop-inside-nonuniform.ll
@@ -5,7 +5,6 @@
 ; CHECK-LABEL: {{^}}test1:
 ; CHECK: v_cmp_ne_u32_e32 vcc, 0
 ; CHECK: s_and_saveexec_b64
-; CHECK-NEXT: s_xor_b64
 ; CHECK-NEXT: ; mask branch
 ; CHECK-NEXT: s_cbranch_execz BB{{[0-9]+_[0-9]+}}
 ; CHECK-NEXT: BB{{[0-9]+_[0-9]+}}: ; %loop_body.preheader
@@ -35,7 +34,6 @@ out:
 
 ; CHECK-LABEL: {{^}}test2:
 ; CHECK: s_and_saveexec_b64
-; CHECK-NEXT: s_xor_b64
 ; CHECK-NEXT: ; mask branch
 ; CHECK-NEXT: s_cbranch_execz
 define amdgpu_kernel void @test2(i32 addrspace(1)* %out, i32 %a, i32 %b) {
diff --git a/test/CodeGen/AMDGPU/valu-i1.ll b/test/CodeGen/AMDGPU/valu-i1.ll
index a67f36d0a7e..a5d810de769 100644
--- a/test/CodeGen/AMDGPU/valu-i1.ll
+++ b/test/CodeGen/AMDGPU/valu-i1.ll
@@ -18,7 +18,6 @@ declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
 ; SI-NOT: s_mov_b64 s[{{[0-9]:[0-9]}}], -1
 ; SI: v_mov_b32_e32 v{{[0-9]}}, -1
 ; SI: s_and_saveexec_b64
-; SI-NEXT: s_xor_b64
 ; SI-NEXT: ; mask branch
 
 ; v_mov should be after exec modification
@@ -66,8 +65,7 @@ end:
 ; SI-LABEL: {{^}}simple_test_v_if:
 ; SI: v_cmp_ne_u32_e32 vcc, 0, v{{[0-9]+}}
 ; SI: s_and_saveexec_b64 [[BR_SREG:s\[[0-9]+:[0-9]+\]]], vcc
-; SI: s_xor_b64 [[BR_SREG]], exec, [[BR_SREG]]
-; SI: ; mask branch [[EXIT:BB[0-9]+_[0-9]+]]
+; SI-NEXT: ; mask branch [[EXIT:BB[0-9]+_[0-9]+]]
 
 ; SI-NEXT: BB{{[0-9]+_[0-9]+}}:
 ; SI: buffer_store_dword
@@ -94,8 +92,7 @@ exit:
 ; SI-LABEL: {{^}}simple_test_v_if_ret_else_ret:
 ; SI: v_cmp_ne_u32_e32 vcc, 0, v{{[0-9]+}}
 ; SI: s_and_saveexec_b64 [[BR_SREG:s\[[0-9]+:[0-9]+\]]], vcc
-; SI: s_xor_b64 [[BR_SREG]], exec, [[BR_SREG]]
-; SI: ; mask branch [[EXIT:BB[0-9]+_[0-9]+]]
+; SI-NEXT: ; mask branch [[EXIT:BB[0-9]+_[0-9]+]]
 
 ; SI-NEXT: BB{{[0-9]+_[0-9]+}}:
 ; SI: buffer_store_dword
@@ -160,8 +157,8 @@ exit:
 ; SI-LABEL: {{^}}simple_test_v_loop:
 ; SI: v_cmp_ne_u32_e32 vcc, 0, v{{[0-9]+}}
 ; SI: s_and_saveexec_b64 [[BR_SREG:s\[[0-9]+:[0-9]+\]]], vcc
-; SI: s_xor_b64 [[BR_SREG]], exec, [[BR_SREG]]
-; SI: s_cbranch_execz [[LABEL_EXIT:BB[0-9]+_[0-9]+]]
+; SI-NEXT: ; mask branch
+; SI-NEXT: s_cbranch_execz [[LABEL_EXIT:BB[0-9]+_[0-9]+]]
 
 ; SI: s_mov_b64 {{s\[[0-9]+:[0-9]+\]}}, 0{{$}}
 
@@ -202,8 +199,8 @@ exit:
 ; SI: buffer_load_dword [[VBOUND:v[0-9]+]]
 ; SI: v_cmp_lt_i32_e32 vcc
 ; SI: s_and_saveexec_b64 [[OUTER_CMP_SREG:s\[[0-9]+:[0-9]+\]]], vcc
-; SI: s_xor_b64 [[OUTER_CMP_SREG]], exec, [[OUTER_CMP_SREG]]
-; SI: s_cbranch_execz [[LABEL_EXIT:BB[0-9]+_[0-9]+]]
+; SI-NEXT: ; mask branch
+; SI-NEXT: s_cbranch_execz [[LABEL_EXIT:BB[0-9]+_[0-9]+]]
 
 ; Initialize inner condition to false
 ; SI: BB{{[0-9]+_[0-9]+}}: ; %bb10.preheader