From: Matt Arsenault
Date: Mon, 7 Nov 2016 19:09:27 +0000 (+0000)
Subject: AMDGPU: Preserve vcc undef flags when inverting branch
X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=e5fd9c09add0c5ec235071fb6b8e337d64641c3f;p=llvm

AMDGPU: Preserve vcc undef flags when inverting branch

If the branch was on a read-undef of vcc, passes that used analyzeBranch
to invert the branch condition wouldn't preserve the undef flag, resulting
in a verifier error.

Fixes verifier failures in a future commit.

Also fix a verifier error when inserting the copy for the vccz
corruption bug.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@286133 91177308-0d34-0410-b5e6-96231b3b80d8
---
diff --git a/lib/Target/AMDGPU/SIInsertWaits.cpp b/lib/Target/AMDGPU/SIInsertWaits.cpp
index 6c4a2a4d210..a9e693917bf 100644
--- a/lib/Target/AMDGPU/SIInsertWaits.cpp
+++ b/lib/Target/AMDGPU/SIInsertWaits.cpp
@@ -178,8 +178,10 @@ FunctionPass *llvm::createSIInsertWaitsPass() {
 
 const Counters SIInsertWaits::ZeroCounts = { { 0, 0, 0 } };
 
-static bool readsVCCZ(unsigned Opcode) {
-  return Opcode == AMDGPU::S_CBRANCH_VCCNZ || Opcode == AMDGPU::S_CBRANCH_VCCZ;
+static bool readsVCCZ(const MachineInstr &MI) {
+  unsigned Opc = MI.getOpcode();
+  return (Opc == AMDGPU::S_CBRANCH_VCCNZ || Opc == AMDGPU::S_CBRANCH_VCCZ) &&
+         !MI.getOperand(1).isUndef();
 }
 
 bool SIInsertWaits::hasOutstandingLGKM() const {
@@ -574,7 +576,7 @@ bool SIInsertWaits::runOnMachineFunction(MachineFunction &MF) {
       }
 
       // Check if we need to apply the bug work-around
-      if (readsVCCZ(I->getOpcode()) && VCCZCorrupt) {
+      if (VCCZCorrupt && readsVCCZ(*I)) {
         DEBUG(dbgs() << "Inserting vccz bug work-around before: " << *I << '\n');
 
         // Wait on everything, not just LGKM.  vccz reads usually come from
@@ -589,7 +591,7 @@ bool SIInsertWaits::runOnMachineFunction(MachineFunction &MF) {
         // vcc and then writing it back to the register.
         BuildMI(MBB, I, I->getDebugLoc(), TII->get(AMDGPU::S_MOV_B64),
                 AMDGPU::VCC)
-                .addReg(AMDGPU::VCC);
+          .addReg(AMDGPU::VCC);
       }
     }
 
diff --git a/lib/Target/AMDGPU/SIInstrInfo.cpp b/lib/Target/AMDGPU/SIInstrInfo.cpp
index 94b484ed0b3..02cbc882bf8 100644
--- a/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -1196,6 +1196,7 @@ bool SIInstrInfo::analyzeBranchImpl(MachineBasicBlock &MBB,
 
   MachineBasicBlock *CondBB = I->getOperand(0).getMBB();
   Cond.push_back(MachineOperand::CreateImm(Pred));
+  Cond.push_back(I->getOperand(1)); // Save the branch register.
 
   ++I;
 
@@ -1298,9 +1299,16 @@ unsigned SIInstrInfo::insertBranch(MachineBasicBlock &MBB,
     = getBranchOpcode(static_cast<BranchPredicate>(Cond[0].getImm()));
 
   if (!FBB) {
-    BuildMI(&MBB, DL, get(Opcode))
+    Cond[1].isUndef();
+    MachineInstr *CondBr =
+      BuildMI(&MBB, DL, get(Opcode))
       .addMBB(TBB);
 
+    // Copy the flags onto the implicit condition register operand.
+    MachineOperand &CondReg = CondBr->getOperand(1);
+    CondReg.setIsUndef(Cond[1].isUndef());
+    CondReg.setIsKill(Cond[1].isKill());
+
     if (BytesAdded)
       *BytesAdded = 4;
     return 1;
@@ -1308,11 +1316,16 @@ unsigned SIInstrInfo::insertBranch(MachineBasicBlock &MBB,
 
   assert(TBB && FBB);
 
-  BuildMI(&MBB, DL, get(Opcode))
+  MachineInstr *CondBr =
+    BuildMI(&MBB, DL, get(Opcode))
     .addMBB(TBB);
   BuildMI(&MBB, DL, get(AMDGPU::S_BRANCH))
     .addMBB(FBB);
 
+  MachineOperand &CondReg = CondBr->getOperand(1);
+  CondReg.setIsUndef(Cond[1].isUndef());
+  CondReg.setIsKill(Cond[1].isKill());
+
   if (BytesAdded)
     *BytesAdded = 8;
 
@@ -1321,7 +1334,7 @@ unsigned SIInstrInfo::insertBranch(MachineBasicBlock &MBB,
 
 bool SIInstrInfo::reverseBranchCondition(
   SmallVectorImpl<MachineOperand> &Cond) const {
-  assert(Cond.size() == 1);
+  assert(Cond.size() == 2);
   Cond[0].setImm(-Cond[0].getImm());
   return false;
 }
diff --git a/test/CodeGen/MIR/AMDGPU/invert-br-undef-vcc.mir b/test/CodeGen/MIR/AMDGPU/invert-br-undef-vcc.mir
new file mode 100644
index 00000000000..66182d09289
--- /dev/null
+++ b/test/CodeGen/MIR/AMDGPU/invert-br-undef-vcc.mir
@@ -0,0 +1,89 @@
+# RUN: llc -run-pass block-placement -march=amdgcn -verify-machineinstrs -o - %s | FileCheck %s
+--- |
+
+  define void @invert_br_undef_vcc(float %cond, i32 addrspace(1)* %out) #0 {
+  entry:
+    br i1 undef, label %if, label %else, !structurizecfg.uniform !0, !amdgpu.uniform !0
+
+  else:                                             ; preds = %entry
+    store volatile i32 100, i32 addrspace(1)* undef
+    br label %done, !structurizecfg.uniform !0
+
+  if:                                               ; preds = %entry
+    store volatile i32 9, i32 addrspace(1)* undef
+    br label %done, !structurizecfg.uniform !0
+
+  done:                                             ; preds = %if, %else
+    %value = phi i32 [ 0, %if ], [ 1, %else ]
+    store i32 %value, i32 addrspace(1)* %out
+    ret void
+  }
+
+  attributes #0 = { nounwind }
+
+  !0 = !{}
+
+...
+---
+# CHECK-LABEL: name: invert_br_undef_vcc
+# CHECK: S_CBRANCH_VCCZ %bb.1.else, implicit undef %vcc
+
+name: invert_br_undef_vcc
+alignment: 0
+exposesReturnsTwice: false
+legalized: false
+regBankSelected: false
+selected: false
+tracksRegLiveness: true
+liveins:
+  - { reg: '%sgpr0_sgpr1' }
+frameInfo:
+  isFrameAddressTaken: false
+  isReturnAddressTaken: false
+  hasStackMap: false
+  hasPatchPoint: false
+  stackSize: 0
+  offsetAdjustment: 0
+  maxAlignment: 0
+  adjustsStack: false
+  hasCalls: false
+  maxCallFrameSize: 0
+  hasOpaqueSPAdjustment: false
+  hasVAStart: false
+  hasMustTailInVarArgFunc: false
+body: |
+  bb.0.entry:
+    successors: %bb.2.if, %bb.1.else
+    liveins: %sgpr0_sgpr1
+
+    %sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed %sgpr0_sgpr1, 11, 0 :: (non-temporal dereferenceable invariant load 8 from `i64 addrspace(2)* undef`)
+    %sgpr7 = S_MOV_B32 61440
+    %sgpr6 = S_MOV_B32 -1
+    S_CBRANCH_VCCNZ %bb.2.if, implicit undef %vcc
+
+  bb.1.else:
+    successors: %bb.3.done
+    liveins: %sgpr6, %sgpr7, %sgpr0_sgpr1_sgpr2_sgpr3:0x00000003
+
+    %vgpr0 = V_MOV_B32_e32 100, implicit %exec
+    BUFFER_STORE_DWORD_OFFSET killed %vgpr0, killed %sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, 0, implicit %exec :: (volatile store 4 into `i32 addrspace(1)* undef`)
+    %vgpr0 = V_MOV_B32_e32 1, implicit %exec
+    S_BRANCH %bb.3.done
+
+  bb.2.if:
+    successors: %bb.3.done
+    liveins: %sgpr6, %sgpr7, %sgpr0_sgpr1_sgpr2_sgpr3:0x00000003
+
+    %vgpr0 = V_MOV_B32_e32 9, implicit %exec
+    BUFFER_STORE_DWORD_OFFSET killed %vgpr0, killed %sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, 0, implicit %exec :: (volatile store 4 into `i32 addrspace(1)* undef`)
+    %vgpr0 = V_MOV_B32_e32 0, implicit %exec
+
+  bb.3.done:
+    liveins: %vgpr0, %sgpr0_sgpr1_sgpr2_sgpr3:0x00000003
+
+    %sgpr3 = S_MOV_B32 61440
+    %sgpr2 = S_MOV_B32 -1
+    BUFFER_STORE_DWORD_OFFSET killed %vgpr0, killed %sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, implicit %exec :: (store 4 into %ir.out)
+    S_ENDPGM
+
+...
diff --git a/test/CodeGen/MIR/AMDGPU/vccz-corrupt-bug-workaround.mir b/test/CodeGen/MIR/AMDGPU/vccz-corrupt-bug-workaround.mir
new file mode 100644
index 00000000000..03e473e3a0c
--- /dev/null
+++ b/test/CodeGen/MIR/AMDGPU/vccz-corrupt-bug-workaround.mir
@@ -0,0 +1,177 @@
+# RUN: llc -run-pass si-insert-waits -march=amdgcn -mcpu=tahiti -o - %s | FileCheck %s
+--- |
+
+  define void @vccz_corrupt_workaround(float %cond, i32 addrspace(1)* %out) #0 {
+  entry:
+    %cmp0 = fcmp oeq float %cond, 0.000000e+00
+    br i1 %cmp0, label %if, label %else, !structurizecfg.uniform !0, !amdgpu.uniform !0
+
+  else:                                             ; preds = %entry
+    store volatile i32 100, i32 addrspace(1)* undef
+    br label %done, !structurizecfg.uniform !0
+
+  if:                                               ; preds = %entry
+    store volatile i32 9, i32 addrspace(1)* undef
+    br label %done, !structurizecfg.uniform !0
+
+  done:                                             ; preds = %if, %else
+    %value = phi i32 [ 0, %if ], [ 1, %else ]
+    store i32 %value, i32 addrspace(1)* %out
+    ret void
+  }
+
+  define void @vccz_corrupt_undef_vcc(float %cond, i32 addrspace(1)* %out) #0 {
+  entry:
+    br i1 undef, label %if, label %else, !structurizecfg.uniform !0, !amdgpu.uniform !0
+
+  else:                                             ; preds = %entry
+    store volatile i32 100, i32 addrspace(1)* undef
+    br label %done, !structurizecfg.uniform !0
+
+  if:                                               ; preds = %entry
+    store volatile i32 9, i32 addrspace(1)* undef
+    br label %done, !structurizecfg.uniform !0
+
+  done:                                             ; preds = %if, %else
+    %value = phi i32 [ 0, %if ], [ 1, %else ]
+    store i32 %value, i32 addrspace(1)* %out
+    ret void
+  }
+
+  attributes #0 = { nounwind }
+  attributes #1 = { readnone }
+
+  !0 = !{}
+
+...
+---
+# CHECK-LABEL: name: vccz_corrupt_workaround
+# CHECK: %vcc = V_CMP_EQ_F32
+# CHECK-NEXT: %vcc = S_MOV_B64 %vcc
+# CHECK-NEXT: S_CBRANCH_VCCZ %bb.2.else, implicit killed %vcc
+
+name: vccz_corrupt_workaround
+alignment: 0
+exposesReturnsTwice: false
+legalized: false
+regBankSelected: false
+selected: false
+tracksRegLiveness: true
+liveins:
+  - { reg: '%sgpr0_sgpr1' }
+frameInfo:
+  isFrameAddressTaken: false
+  isReturnAddressTaken: false
+  hasStackMap: false
+  hasPatchPoint: false
+  stackSize: 0
+  offsetAdjustment: 0
+  maxAlignment: 0
+  adjustsStack: false
+  hasCalls: false
+  maxCallFrameSize: 0
+  hasOpaqueSPAdjustment: false
+  hasVAStart: false
+  hasMustTailInVarArgFunc: false
+body: |
+  bb.0.entry:
+    successors: %bb.2.if, %bb.1.else
+    liveins: %sgpr0_sgpr1
+
+    %sgpr2 = S_LOAD_DWORD_IMM %sgpr0_sgpr1, 9, 0 :: (non-temporal dereferenceable invariant load 4 from `float addrspace(2)* undef`)
+    %sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed %sgpr0_sgpr1, 11, 0 :: (non-temporal dereferenceable invariant load 8 from `i64 addrspace(2)* undef`)
+    %sgpr7 = S_MOV_B32 61440
+    %sgpr6 = S_MOV_B32 -1
+    %vcc = V_CMP_EQ_F32_e64 0, 0, 0, %sgpr2, 0, 0, implicit %exec
+    S_CBRANCH_VCCZ %bb.1.else, implicit killed %vcc
+
+  bb.2.if:
+    successors: %bb.3.done
+    liveins: %sgpr6, %sgpr7, %sgpr0_sgpr1_sgpr2_sgpr3:0x00000003
+
+    %vgpr0 = V_MOV_B32_e32 9, implicit %exec
+    BUFFER_STORE_DWORD_OFFSET killed %vgpr0, killed %sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, 0, implicit %exec :: (volatile store 4 into `i32 addrspace(1)* undef`)
+    %vgpr0 = V_MOV_B32_e32 0, implicit %exec
+    S_BRANCH %bb.3.done
+
+  bb.1.else:
+    successors: %bb.3.done
+    liveins: %sgpr6, %sgpr7, %sgpr0_sgpr1_sgpr2_sgpr3:0x00000003
+
+    %vgpr0 = V_MOV_B32_e32 100, implicit %exec
+    BUFFER_STORE_DWORD_OFFSET killed %vgpr0, killed %sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, 0, implicit %exec :: (volatile store 4 into `i32 addrspace(1)* undef`)
+    %vgpr0 = V_MOV_B32_e32 1, implicit %exec
+
+  bb.3.done:
+    liveins: %vgpr0, %sgpr0_sgpr1_sgpr2_sgpr3:0x00000003
+
+    %sgpr3 = S_MOV_B32 61440
+    %sgpr2 = S_MOV_B32 -1
+    BUFFER_STORE_DWORD_OFFSET killed %vgpr0, killed %sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, implicit %exec :: (store 4 into %ir.out)
+    S_ENDPGM
+
+...
+---
+# CHECK-LABEL: name: vccz_corrupt_undef_vcc
+# CHECK: S_WAITCNT
+# CHECK-NEXT: S_CBRANCH_VCCZ %bb.2.else, implicit undef %vcc
+
+name: vccz_corrupt_undef_vcc
+alignment: 0
+exposesReturnsTwice: false
+legalized: false
+regBankSelected: false
+selected: false
+tracksRegLiveness: true
+liveins:
+  - { reg: '%sgpr0_sgpr1' }
+frameInfo:
+  isFrameAddressTaken: false
+  isReturnAddressTaken: false
+  hasStackMap: false
+  hasPatchPoint: false
+  stackSize: 0
+  offsetAdjustment: 0
+  maxAlignment: 0
+  adjustsStack: false
+  hasCalls: false
+  maxCallFrameSize: 0
+  hasOpaqueSPAdjustment: false
+  hasVAStart: false
+  hasMustTailInVarArgFunc: false
+body: |
+  bb.0.entry:
+    successors: %bb.2.if, %bb.1.else
+    liveins: %sgpr0_sgpr1
+
+    %sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed %sgpr0_sgpr1, 11, 0 :: (non-temporal dereferenceable invariant load 8 from `i64 addrspace(2)* undef`)
+    %sgpr7 = S_MOV_B32 61440
+    %sgpr6 = S_MOV_B32 -1
+    S_CBRANCH_VCCZ %bb.1.else, implicit undef %vcc
+
+  bb.2.if:
+    successors: %bb.3.done
+    liveins: %sgpr6, %sgpr7, %sgpr0_sgpr1_sgpr2_sgpr3:0x00000003
+
+    %vgpr0 = V_MOV_B32_e32 9, implicit %exec
+    BUFFER_STORE_DWORD_OFFSET killed %vgpr0, killed %sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, 0, implicit %exec :: (volatile store 4 into `i32 addrspace(1)* undef`)
+    %vgpr0 = V_MOV_B32_e32 0, implicit %exec
+    S_BRANCH %bb.3.done
+
+  bb.1.else:
+    successors: %bb.3.done
+    liveins: %sgpr6, %sgpr7, %sgpr0_sgpr1_sgpr2_sgpr3:0x00000003
+
+    %vgpr0 = V_MOV_B32_e32 100, implicit %exec
+    BUFFER_STORE_DWORD_OFFSET killed %vgpr0, killed %sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, 0, implicit %exec :: (volatile store 4 into `i32 addrspace(1)* undef`)
+    %vgpr0 = V_MOV_B32_e32 1, implicit %exec
+
+  bb.3.done:
+    liveins: %vgpr0, %sgpr0_sgpr1_sgpr2_sgpr3:0x00000003
+
+    %sgpr3 = S_MOV_B32 61440
+    %sgpr2 = S_MOV_B32 -1
+    BUFFER_STORE_DWORD_OFFSET killed %vgpr0, killed %sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, implicit %exec :: (store 4 into %ir.out)
+    S_ENDPGM
+
+...
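
For context, here is a minimal sketch (not part of the patch) of how a pass such as
block placement round-trips a conditional branch through the analyzeBranch,
reverseBranchCondition, and insertBranch hooks touched above. After this change,
Cond[1] carries the implicit vcc operand saved by analyzeBranchImpl, so insertBranch
can copy its undef/kill flags onto the re-emitted S_CBRANCH_VCCZ/S_CBRANCH_VCCNZ.
The helper name and the "both targets explicit" restriction below are assumptions
made for illustration, not LLVM API.

    #include "llvm/ADT/SmallVector.h"
    #include "llvm/CodeGen/MachineBasicBlock.h"
    #include "llvm/CodeGen/MachineOperand.h"
    #include "llvm/IR/DebugLoc.h"
    #include "llvm/Target/TargetInstrInfo.h"

    using namespace llvm;

    // Invert the conditional branch terminating MBB, swapping its two targets.
    // Returns false if the block does not end in an analyzable two-way branch.
    static bool invertTerminatorBranch(MachineBasicBlock &MBB,
                                       const TargetInstrInfo &TII) {
      MachineBasicBlock *TBB = nullptr; // target when the condition holds
      MachineBasicBlock *FBB = nullptr; // target otherwise
      SmallVector<MachineOperand, 4> Cond;
      if (TII.analyzeBranch(MBB, TBB, FBB, Cond) || Cond.empty() || !FBB)
        return false;

      // For SI this negates the predicate in Cond[0], flipping
      // S_CBRANCH_VCCNZ <-> S_CBRANCH_VCCZ; Cond[1] still holds the vcc
      // register operand, including any undef flag from a read-undef branch.
      if (TII.reverseBranchCondition(Cond))
        return false;

      // Re-emit the terminators with the targets swapped. insertBranch is
      // responsible for transferring the undef/kill flags from Cond[1] onto
      // the implicit vcc use of the new branch, which is what this patch adds.
      TII.removeBranch(MBB);
      TII.insertBranch(MBB, FBB, TBB, Cond, DebugLoc());
      return true;
    }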