From: Stanislav Mekhanoshin
Date: Wed, 26 Jul 2017 21:29:15 +0000 (+0000)
Subject: [AMDGPU] Optimize SI_IF lowering for simple if regions
X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=ddb10d2e51d6a55b9cff98797d275163ed14a5c3;p=llvm

[AMDGPU] Optimize SI_IF lowering for simple if regions

Currently SI_IF lowering produces an s_and_saveexec_b64 followed by an
s_xor_b64. The xor is used to extract only the bits that were cleared
from the exec mask. In the case of a simple if region, where the only use
of that value is the SI_END_CF that restores the old exec mask, we can
omit the xor and instead or the exec mask with the original exec value
saved by the s_and_saveexec_b64.
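To make the effect concrete, here is a rough before/after sketch of the
lowered code for a divergent if; the SGPR pair s[2:3] and the vcc
condition are illustrative placeholders, not values taken from any of the
tests below:

  ; before: SI_IF always emitted the save/and plus an xor
  s_and_saveexec_b64 s[2:3], vcc   ; s[2:3] = exec, exec = exec & vcc
  s_xor_b64 s[2:3], exec, s[2:3]   ; s[2:3] = bits cleared from exec
  ; ... if block ...
  s_or_b64 exec, exec, s[2:3]      ; SI_END_CF: set the cleared bits again

  ; after: simple if region, saved value only used by SI_END_CF
  s_and_saveexec_b64 s[2:3], vcc   ; s[2:3] = old exec, exec = exec & vcc
  ; ... if block ...
  s_or_b64 exec, exec, s[2:3]      ; SI_END_CF: or with the full old exec mask

Both sequences leave the original exec mask in place after SI_END_CF; the
simple-if form just drops the xor.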
Differential Revision: https://reviews.llvm.org/D35861

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@309185 91177308-0d34-0410-b5e6-96231b3b80d8
---

diff --git a/lib/Target/AMDGPU/SILowerControlFlow.cpp b/lib/Target/AMDGPU/SILowerControlFlow.cpp
index 5f1c7f1fc42..de86c19a752 100644
--- a/lib/Target/AMDGPU/SILowerControlFlow.cpp
+++ b/lib/Target/AMDGPU/SILowerControlFlow.cpp
@@ -149,9 +149,19 @@ void SILowerControlFlow::emitIf(MachineInstr &MI) {
   MachineOperand &ImpDefSCC = MI.getOperand(4);
   assert(ImpDefSCC.getReg() == AMDGPU::SCC && ImpDefSCC.isDef());
 
+  // If there is only one use of save exec register and that use is SI_END_CF,
+  // we can optimize SI_IF by returning the full saved exec mask instead of
+  // just cleared bits.
+  bool SimpleIf = false;
+  auto U = MRI->use_instr_nodbg_begin(SaveExecReg);
+  SimpleIf = U != MRI->use_instr_nodbg_end() &&
+             std::next(U) == MRI->use_instr_nodbg_end() &&
+             U->getOpcode() == AMDGPU::SI_END_CF;
+
   // Add an implicit def of exec to discourage scheduling VALU after this which
   // will interfere with trying to form s_and_saveexec_b64 later.
-  unsigned CopyReg = MRI->createVirtualRegister(&AMDGPU::SReg_64RegClass);
+  unsigned CopyReg = SimpleIf ? SaveExecReg
+                     : MRI->createVirtualRegister(&AMDGPU::SReg_64RegClass);
   MachineInstr *CopyExec =
     BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), CopyReg)
     .addReg(AMDGPU::EXEC)
@@ -166,11 +176,14 @@ void SILowerControlFlow::emitIf(MachineInstr &MI) {
     .addReg(Cond.getReg());
   setImpSCCDefDead(*And, true);
 
-  MachineInstr *Xor =
-    BuildMI(MBB, I, DL, TII->get(AMDGPU::S_XOR_B64), SaveExecReg)
-    .addReg(Tmp)
-    .addReg(CopyReg);
-  setImpSCCDefDead(*Xor, ImpDefSCC.isDead());
+  MachineInstr *Xor = nullptr;
+  if (!SimpleIf) {
+    Xor =
+      BuildMI(MBB, I, DL, TII->get(AMDGPU::S_XOR_B64), SaveExecReg)
+      .addReg(Tmp)
+      .addReg(CopyReg);
+    setImpSCCDefDead(*Xor, ImpDefSCC.isDead());
+  }
 
   // Use a copy that is a terminator to get correct spill code placement it with
   // fast regalloc.
@@ -194,7 +207,8 @@ void SILowerControlFlow::emitIf(MachineInstr &MI) {
   // register.
   LIS->ReplaceMachineInstrInMaps(MI, *And);
 
-  LIS->InsertMachineInstrInMaps(*Xor);
+  if (!SimpleIf)
+    LIS->InsertMachineInstrInMaps(*Xor);
   LIS->InsertMachineInstrInMaps(*SetExec);
   LIS->InsertMachineInstrInMaps(*NewBr);
 
@@ -207,7 +221,8 @@ void SILowerControlFlow::emitIf(MachineInstr &MI) {
   LIS->removeInterval(SaveExecReg);
   LIS->createAndComputeVirtRegInterval(SaveExecReg);
   LIS->createAndComputeVirtRegInterval(Tmp);
-  LIS->createAndComputeVirtRegInterval(CopyReg);
+  if (!SimpleIf)
+    LIS->createAndComputeVirtRegInterval(CopyReg);
 }
 
 void SILowerControlFlow::emitElse(MachineInstr &MI) {
diff --git a/test/CodeGen/AMDGPU/branch-condition-and.ll b/test/CodeGen/AMDGPU/branch-condition-and.ll
index 662ea37a2b9..51b91ee286f 100644
--- a/test/CodeGen/AMDGPU/branch-condition-and.ll
+++ b/test/CodeGen/AMDGPU/branch-condition-and.ll
@@ -14,7 +14,6 @@
 ; GCN-DAG: v_cmp_lt_f32_e32 vcc,
 ; GCN: s_and_b64 [[AND:s\[[0-9]+:[0-9]+\]]], vcc, [[OTHERCC]]
 ; GCN: s_and_saveexec_b64 [[SAVED:s\[[0-9]+:[0-9]+\]]], [[AND]]
-; GCN: s_xor_b64 {{s\[[0-9]+:[0-9]+\]}}, exec, [[SAVED]]
 ; GCN: ; mask branch [[BB5:BB[0-9]+_[0-9]+]]
 
 ; GCN-NEXT: BB{{[0-9]+_[0-9]+}}: ; %bb4
diff --git a/test/CodeGen/AMDGPU/branch-relaxation.ll b/test/CodeGen/AMDGPU/branch-relaxation.ll
index 15f579eb06d..233d5a5822f 100644
--- a/test/CodeGen/AMDGPU/branch-relaxation.ll
+++ b/test/CodeGen/AMDGPU/branch-relaxation.ll
@@ -141,7 +141,6 @@ bb3:
 ; GCN: buffer_load_dword
 ; GCN: v_cmp_ne_u32_e32 vcc, 0, v{{[0-9]+}}
 ; GCN: s_and_saveexec_b64 [[SAVE:s\[[0-9]+:[0-9]+\]]], vcc
-; GCN: s_xor_b64 [[SAVE]], exec, [[SAVE]]
 
 ; GCN: v_nop_e64
 ; GCN: v_nop_e64
@@ -385,7 +384,6 @@ bb3:
 ; GCN-LABEL: {{^}}uniform_inside_divergent:
 ; GCN: v_cmp_gt_u32_e32 vcc, 16, v{{[0-9]+}}
 ; GCN-NEXT: s_and_saveexec_b64 [[MASK:s\[[0-9]+:[0-9]+\]]], vcc
-; GCN-NEXT: s_xor_b64 [[MASK1:s\[[0-9]+:[0-9]+\]]], exec, [[MASK]]
 ; GCN-NEXT: ; mask branch [[ENDIF:BB[0-9]+_[0-9]+]]
 ; GCN-NEXT: s_cbranch_execnz [[IF:BB[0-9]+_[0-9]+]]
 
@@ -436,7 +434,6 @@ endif:
 ; GCN-LABEL: {{^}}analyze_mask_branch:
 ; GCN: v_cmp_lt_f32_e32 vcc
 ; GCN-NEXT: s_and_saveexec_b64 [[MASK:s\[[0-9]+:[0-9]+\]]], vcc
-; GCN-NEXT: s_xor_b64 [[MASK]], exec, [[MASK]]
 ; GCN-NEXT: ; mask branch [[RET:BB[0-9]+_[0-9]+]]
 ; GCN-NEXT: s_cbranch_execz [[BRANCH_SKIP:BB[0-9]+_[0-9]+]]
 ; GCN-NEXT: s_branch [[LOOP_BODY:BB[0-9]+_[0-9]+]]
diff --git a/test/CodeGen/AMDGPU/control-flow-fastregalloc.ll b/test/CodeGen/AMDGPU/control-flow-fastregalloc.ll
index 79d9b169187..08a31ac1847 100644
--- a/test/CodeGen/AMDGPU/control-flow-fastregalloc.ll
+++ b/test/CodeGen/AMDGPU/control-flow-fastregalloc.ll
@@ -20,7 +20,6 @@
 ; GCN: v_cmp_eq_u32_e64 [[CMP0:s\[[0-9]+:[0-9]\]]], s{{[0-9]+}}, v0
 ; GCN: s_mov_b64 s{{\[}}[[SAVEEXEC_LO:[0-9]+]]:[[SAVEEXEC_HI:[0-9]+]]{{\]}}, exec
 ; GCN: s_and_b64 s{{\[}}[[ANDEXEC_LO:[0-9]+]]:[[ANDEXEC_HI:[0-9]+]]{{\]}}, s{{\[}}[[SAVEEXEC_LO]]:[[SAVEEXEC_HI]]{{\]}}, [[CMP0]]
-; GCN: s_xor_b64 s{{\[}}[[SAVEEXEC_LO]]:[[SAVEEXEC_HI]]{{\]}}, s{{\[}}[[ANDEXEC_LO]]:[[ANDEXEC_HI]]{{\]}}, s{{\[}}[[SAVEEXEC_LO]]:[[SAVEEXEC_HI]]{{\]}}
 
 ; Spill saved exec
 ; VGPR: v_writelane_b32 [[SPILL_VGPR:v[0-9]+]], s[[SAVEEXEC_LO]], [[SAVEEXEC_LO_LANE:[0-9]+]]
@@ -101,7 +100,6 @@ endif:
 ; GCN: s_mov_b64 s{{\[}}[[SAVEEXEC_LO:[0-9]+]]:[[SAVEEXEC_HI:[0-9]+]]{{\]}}, exec
 ; GCN: s_and_b64 s{{\[}}[[ANDEXEC_LO:[0-9]+]]:[[ANDEXEC_HI:[0-9]+]]{{\]}}, s{{\[}}[[SAVEEXEC_LO:[0-9]+]]:[[SAVEEXEC_HI:[0-9]+]]{{\]}}, [[CMP0]]
-; GCN: s_xor_b64 s{{\[}}[[SAVEEXEC_LO]]:[[SAVEEXEC_HI]]{{\]}}, s{{\[}}[[ANDEXEC_LO]]:[[ANDEXEC_HI]]{{\]}}, s{{\[}}[[SAVEEXEC_LO]]:[[SAVEEXEC_HI]]{{\]}}
 
 ; Spill load
 ; GCN: buffer_store_dword [[LOAD0]], off, s[0:3], s7 offset:4 ; 4-byte Folded Spill
diff --git a/test/CodeGen/AMDGPU/i1-copy-phi.ll b/test/CodeGen/AMDGPU/i1-copy-phi.ll
index 4a0213dd1de..63a9f1feb6d 100644
--- a/test/CodeGen/AMDGPU/i1-copy-phi.ll
+++ b/test/CodeGen/AMDGPU/i1-copy-phi.ll
@@ -4,11 +4,9 @@
 ; SI-LABEL: {{^}}br_i1_phi:
 ; SI: v_mov_b32_e32 [[REG:v[0-9]+]], 0{{$}}
 ; SI: s_and_saveexec_b64
-; SI: s_xor_b64
 ; SI: v_mov_b32_e32 [[REG]], -1{{$}}
 ; SI: v_cmp_ne_u32_e32 vcc, 0, [[REG]]
 ; SI: s_and_saveexec_b64
-; SI: s_xor_b64
 ; SI: s_endpgm
 define amdgpu_kernel void @br_i1_phi(i32 %arg) {
 bb:
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.div.fmas.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.div.fmas.ll
index 1b3e09a81e5..25b0ad54421 100644
--- a/test/CodeGen/AMDGPU/llvm.amdgcn.div.fmas.ll
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.div.fmas.ll
@@ -137,7 +137,6 @@ define amdgpu_kernel void @test_div_fmas_f32_logical_cond_to_vcc(float addrspace
 ; GCN-LABEL: {{^}}test_div_fmas_f32_i1_phi_vcc:
 ; SI: v_cmp_eq_u32_e32 vcc, 0, v{{[0-9]+}}
 ; SI: s_and_saveexec_b64 [[SAVE:s\[[0-9]+:[0-9]+\]]], vcc
-; SI: s_xor_b64 [[SAVE]], exec, [[SAVE]]
 
 ; SI: buffer_load_dword [[LOAD:v[0-9]+]]
 ; SI: v_cmp_ne_u32_e32 vcc, 0, [[LOAD]]
diff --git a/test/CodeGen/AMDGPU/multi-divergent-exit-region.ll b/test/CodeGen/AMDGPU/multi-divergent-exit-region.ll
index ba3ff0b08bc..943beb6bb73 100644
--- a/test/CodeGen/AMDGPU/multi-divergent-exit-region.ll
+++ b/test/CodeGen/AMDGPU/multi-divergent-exit-region.ll
@@ -81,7 +81,6 @@
 ; GCN-NEXT: s_or_b64 exec, exec
 ; GCN: v_cmp_ne_u32_e32 vcc, 0
 ; GCN-NEXT: s_and_saveexec_b64
-; GCN-NEXT: s_xor_b64
 
 ; GCN: ; %exit0
 ; GCN: buffer_store_dword
diff --git a/test/CodeGen/AMDGPU/ret_jump.ll b/test/CodeGen/AMDGPU/ret_jump.ll
index 1acae60f305..7c2e28108df 100644
--- a/test/CodeGen/AMDGPU/ret_jump.ll
+++ b/test/CodeGen/AMDGPU/ret_jump.ll
@@ -11,7 +11,6 @@
 
 ; GCN-NEXT: ; %else
 ; GCN: s_and_saveexec_b64 [[SAVE_EXEC:s\[[0-9]+:[0-9]+\]]], vcc
-; GCN-NEXT: s_xor_b64 [[XOR_EXEC:s\[[0-9]+:[0-9]+\]]], exec, [[SAVE_EXEC]]
 ; GCN-NEXT: ; mask branch [[FLOW:BB[0-9]+_[0-9]+]]
 
 ; GCN: BB{{[0-9]+_[0-9]+}}: ; %unreachable.bb
@@ -60,7 +59,6 @@ ret.bb: ; preds = %else, %main_body
 
 ; GCN: ; BB#{{[0-9]+}}: ; %else
 ; GCN: s_and_saveexec_b64 [[SAVE_EXEC:s\[[0-9]+:[0-9]+\]]], vcc
-; GCN-NEXT: s_xor_b64 [[XOR_EXEC:s\[[0-9]+:[0-9]+\]]], exec, [[SAVE_EXEC]]
 ; GCN-NEXT: ; mask branch [[FLOW1:BB[0-9]+_[0-9]+]]
 ; GCN-NEXT: ; %unreachable.bb
diff --git a/test/CodeGen/AMDGPU/si-lower-control-flow-unreachable-block.ll b/test/CodeGen/AMDGPU/si-lower-control-flow-unreachable-block.ll
index 5b0d5274d5b..f82f7731227 100644
--- a/test/CodeGen/AMDGPU/si-lower-control-flow-unreachable-block.ll
+++ b/test/CodeGen/AMDGPU/si-lower-control-flow-unreachable-block.ll
@@ -3,7 +3,6 @@
 ; GCN-LABEL: {{^}}lower_control_flow_unreachable_terminator:
 ; GCN: v_cmp_eq_u32
 ; GCN: s_and_saveexec_b64
-; GCN: s_xor_b64
 ; GCN: ; mask branch [[RET:BB[0-9]+_[0-9]+]]
 
 ; GCN-NEXT: BB{{[0-9]+_[0-9]+}}: ; %unreachable
@@ -31,7 +30,6 @@ ret:
 ; GCN-LABEL: {{^}}lower_control_flow_unreachable_terminator_swap_block_order:
 ; GCN: v_cmp_ne_u32
 ; GCN: s_and_saveexec_b64
-; GCN: s_xor_b64
 ; GCN: ; mask branch [[RETURN:BB[0-9]+_[0-9]+]]
 
 ; GCN-NEXT: {{^BB[0-9]+_[0-9]+}}: ; %unreachable
diff --git a/test/CodeGen/AMDGPU/skip-if-dead.ll b/test/CodeGen/AMDGPU/skip-if-dead.ll
index ea8b87f1dee..0c052ae0a5d 100644
--- a/test/CodeGen/AMDGPU/skip-if-dead.ll
+++ b/test/CodeGen/AMDGPU/skip-if-dead.ll
@@ -202,7 +202,6 @@ exit:
 ; CHECK-LABEL: {{^}}test_kill_divergent_loop:
 ; CHECK: v_cmp_eq_u32_e32 vcc, 0, v0
 ; CHECK-NEXT: s_and_saveexec_b64 [[SAVEEXEC:s\[[0-9]+:[0-9]+\]]], vcc
-; CHECK-NEXT: s_xor_b64 [[SAVEEXEC]], exec, [[SAVEEXEC]]
 ; CHECK-NEXT: ; mask branch [[EXIT:BB[0-9]+_[0-9]+]]
 ; CHECK-NEXT: s_cbranch_execz [[EXIT]]
 
@@ -337,7 +336,6 @@ bb7: ; preds = %bb4
 ; CHECK-LABEL: {{^}}if_after_kill_block:
 ; CHECK: ; BB#0:
 ; CHECK: s_and_saveexec_b64
-; CHECK: s_xor_b64
 ; CHECK-NEXT: mask branch [[BB4:BB[0-9]+_[0-9]+]]
 
 ; CHECK: v_cmpx_le_f32_e32 vcc, 0,
@@ -347,7 +345,6 @@ bb7: ; preds = %bb4
 
 ; CHECK: v_cmp_neq_f32_e32 vcc, 0,
 ; CHECK: s_and_saveexec_b64 s{{\[[0-9]+:[0-9]+\]}}, vcc
-; CHECK: s_xor_b64 s{{\[[0-9]+:[0-9]+\]}}, exec
 ; CHECK: mask branch [[END:BB[0-9]+_[0-9]+]]
 ; CHECK-NOT: branch
 
diff --git a/test/CodeGen/AMDGPU/subreg-coalescer-undef-use.ll b/test/CodeGen/AMDGPU/subreg-coalescer-undef-use.ll
index 35615c40d49..fb9628cf7b9 100644
--- a/test/CodeGen/AMDGPU/subreg-coalescer-undef-use.ll
+++ b/test/CodeGen/AMDGPU/subreg-coalescer-undef-use.ll
@@ -10,7 +10,6 @@ target triple="amdgcn--"
 ; CHECK: v_mbcnt_lo_u32_b32_e64
 ; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
 ; CHECK-NEXT: s_and_saveexec_b64 s[2:3], vcc
-; CHECK-NEXT: s_xor_b64 s[2:3], exec, s[2:3]
 ; BB0_1:
 ; CHECK: s_load_dword s0, s[0:1], 0xa
 ; CHECK-NEXT: s_waitcnt lgkmcnt(0)
diff --git a/test/CodeGen/AMDGPU/uniform-cfg.ll b/test/CodeGen/AMDGPU/uniform-cfg.ll
index a9d45d71fa2..9416336a44e 100644
--- a/test/CodeGen/AMDGPU/uniform-cfg.ll
+++ b/test/CodeGen/AMDGPU/uniform-cfg.ll
@@ -303,7 +303,6 @@ done:
 ; GCN-LABEL: {{^}}uniform_inside_divergent:
 ; GCN: v_cmp_gt_u32_e32 vcc, 16, v{{[0-9]+}}
 ; GCN: s_and_saveexec_b64 [[MASK:s\[[0-9]+:[0-9]+\]]], vcc
-; GCN: s_xor_b64 [[MASK1:s\[[0-9]+:[0-9]+\]]], exec, [[MASK]]
 ; GCN: s_cmp_lg_u32 {{s[0-9]+}}, 0
 ; GCN: s_cbranch_scc0 [[IF_UNIFORM_LABEL:[A-Z0-9_a-z]+]]
 ; GCN: s_endpgm
@@ -335,7 +334,6 @@ endif:
 ; GCN: [[IF_LABEL]]:
 ; GCN: v_cmp_gt_u32_e32 vcc, 16, v{{[0-9]+}}
 ; GCN: s_and_saveexec_b64 [[MASK:s\[[0-9]+:[0-9]+\]]], vcc
-; GCN: s_xor_b64 [[MASK1:s\[[0-9]+:[0-9]+\]]], exec, [[MASK]]
 ; GCN: v_mov_b32_e32 [[ONE:v[0-9]+]], 1
 ; GCN: buffer_store_dword [[ONE]]
 define amdgpu_kernel void @divergent_inside_uniform(i32 addrspace(1)* %out, i32 %cond) {
@@ -360,7 +358,6 @@ endif:
 ; GCN-LABEL: {{^}}divergent_if_uniform_if:
 ; GCN: v_cmp_eq_u32_e32 vcc, 0, v0
 ; GCN: s_and_saveexec_b64 [[MASK:s\[[0-9]+:[0-9]+\]]], vcc
-; GCN: s_xor_b64 [[MASK:s\[[0-9]+:[0-9]+\]]], exec, [[MASK]]
 ; GCN: v_mov_b32_e32 [[ONE:v[0-9]+]], 1
 ; GCN: buffer_store_dword [[ONE]]
 ; GCN: s_or_b64 exec, exec, [[MASK]]
diff --git a/test/CodeGen/AMDGPU/uniform-loop-inside-nonuniform.ll b/test/CodeGen/AMDGPU/uniform-loop-inside-nonuniform.ll
index 8a08f9d8bb0..82283f39792 100644
--- a/test/CodeGen/AMDGPU/uniform-loop-inside-nonuniform.ll
+++ b/test/CodeGen/AMDGPU/uniform-loop-inside-nonuniform.ll
@@ -5,7 +5,6 @@
 ; CHECK-LABEL: {{^}}test1:
 ; CHECK: v_cmp_ne_u32_e32 vcc, 0
 ; CHECK: s_and_saveexec_b64
-; CHECK-NEXT: s_xor_b64
 ; CHECK-NEXT: ; mask branch
 ; CHECK-NEXT: s_cbranch_execz BB{{[0-9]+_[0-9]+}}
 ; CHECK-NEXT: BB{{[0-9]+_[0-9]+}}: ; %loop_body.preheader
@@ -35,7 +34,6 @@ out:
 
 ; CHECK-LABEL: {{^}}test2:
 ; CHECK: s_and_saveexec_b64
-; CHECK-NEXT: s_xor_b64
 ; CHECK-NEXT: ; mask branch
 ; CHECK-NEXT: s_cbranch_execz
 define amdgpu_kernel void @test2(i32 addrspace(1)* %out, i32 %a, i32 %b) {
diff --git a/test/CodeGen/AMDGPU/valu-i1.ll b/test/CodeGen/AMDGPU/valu-i1.ll
index a67f36d0a7e..a5d810de769 100644
--- a/test/CodeGen/AMDGPU/valu-i1.ll
+++ b/test/CodeGen/AMDGPU/valu-i1.ll
@@ -18,7 +18,6 @@ declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
 ; SI-NOT: s_mov_b64 s[{{[0-9]:[0-9]}}], -1
 ; SI: v_mov_b32_e32 v{{[0-9]}}, -1
 ; SI: s_and_saveexec_b64
-; SI-NEXT: s_xor_b64
 ; SI-NEXT: ; mask branch
 
 ; v_mov should be after exec modification
@@ -66,8 +65,7 @@ end:
 ; SI-LABEL: {{^}}simple_test_v_if:
 ; SI: v_cmp_ne_u32_e32 vcc, 0, v{{[0-9]+}}
 ; SI: s_and_saveexec_b64 [[BR_SREG:s\[[0-9]+:[0-9]+\]]], vcc
-; SI: s_xor_b64 [[BR_SREG]], exec, [[BR_SREG]]
-; SI: ; mask branch [[EXIT:BB[0-9]+_[0-9]+]]
+; SI-NEXT: ; mask branch [[EXIT:BB[0-9]+_[0-9]+]]
 
 ; SI-NEXT: BB{{[0-9]+_[0-9]+}}:
 ; SI: buffer_store_dword
@@ -94,8 +92,7 @@ exit:
 ; SI-LABEL: {{^}}simple_test_v_if_ret_else_ret:
 ; SI: v_cmp_ne_u32_e32 vcc, 0, v{{[0-9]+}}
 ; SI: s_and_saveexec_b64 [[BR_SREG:s\[[0-9]+:[0-9]+\]]], vcc
-; SI: s_xor_b64 [[BR_SREG]], exec, [[BR_SREG]]
-; SI: ; mask branch [[EXIT:BB[0-9]+_[0-9]+]]
+; SI-NEXT: ; mask branch [[EXIT:BB[0-9]+_[0-9]+]]
 
 ; SI-NEXT: BB{{[0-9]+_[0-9]+}}:
 ; SI: buffer_store_dword
@@ -160,8 +157,8 @@ exit:
 ; SI-LABEL: {{^}}simple_test_v_loop:
 ; SI: v_cmp_ne_u32_e32 vcc, 0, v{{[0-9]+}}
 ; SI: s_and_saveexec_b64 [[BR_SREG:s\[[0-9]+:[0-9]+\]]], vcc
-; SI: s_xor_b64 [[BR_SREG]], exec, [[BR_SREG]]
-; SI: s_cbranch_execz [[LABEL_EXIT:BB[0-9]+_[0-9]+]]
+; SI-NEXT: ; mask branch
+; SI-NEXT: s_cbranch_execz [[LABEL_EXIT:BB[0-9]+_[0-9]+]]
 
 ; SI: s_mov_b64 {{s\[[0-9]+:[0-9]+\]}}, 0{{$}}
 
@@ -202,8 +199,8 @@ exit:
 ; SI: buffer_load_dword [[VBOUND:v[0-9]+]]
 ; SI: v_cmp_lt_i32_e32 vcc
 ; SI: s_and_saveexec_b64 [[OUTER_CMP_SREG:s\[[0-9]+:[0-9]+\]]], vcc
-; SI: s_xor_b64 [[OUTER_CMP_SREG]], exec, [[OUTER_CMP_SREG]]
-; SI: s_cbranch_execz [[LABEL_EXIT:BB[0-9]+_[0-9]+]]
+; SI-NEXT: ; mask branch
+; SI-NEXT: s_cbranch_execz [[LABEL_EXIT:BB[0-9]+_[0-9]+]]
 
 ; Initialize inner condition to false
 ; SI: BB{{[0-9]+_[0-9]+}}: ; %bb10.preheader