From: Sanjay Patel Date: Wed, 7 Jun 2017 00:17:08 +0000 (+0000) Subject: [CGP / PowerPC] use direct compares if there's only one load per block in memcmp... X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=756819c52ab1e6e1ec004e22d69aa93c3b3a2509;p=llvm [CGP / PowerPC] use direct compares if there's only one load per block in memcmp() expansion I'd like to enable CGP memcmp expansion for x86, but the output from CGP would regress the special cases (memcmp(x,y,N) != 0 for N=1,2,4,8,16,32 bytes) that we already handle. I'm not sure if we'll actually be able to produce the optimal code given the block-at-a-time limitation in the DAG. We might have to just avoid those special-cases here in CGP. But regardless of that, I think this is a win for the more general cases. http://rise4fun.com/Alive/cbQ Differential Revision: https://reviews.llvm.org/D33963 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@304849 91177308-0d34-0410-b5e6-96231b3b80d8 --- diff --git a/lib/CodeGen/CodeGenPrepare.cpp b/lib/CodeGen/CodeGenPrepare.cpp index 2d6d3ada212..12b00eaf6d7 100644 --- a/lib/CodeGen/CodeGenPrepare.cpp +++ b/lib/CodeGen/CodeGenPrepare.cpp @@ -1812,7 +1812,7 @@ void MemCmpExpansion::emitLoadCompareBlockMultipleLoads( unsigned NumLoads = std::min(NumLoadsRemaining, NumLoadsPerBlock); Builder.SetInsertPoint(LoadCmpBlocks[Index]); - + Value *Cmp = nullptr; for (unsigned i = 0; i < NumLoads; ++i) { unsigned LoadSize = getLoadSize(RemainingBytes); unsigned GEPIndex = NumBytesProcessed / LoadSize; @@ -1846,9 +1846,16 @@ void MemCmpExpansion::emitLoadCompareBlockMultipleLoads( LoadSrc1 = Builder.CreateZExtOrTrunc(LoadSrc1, MaxLoadType); LoadSrc2 = Builder.CreateZExtOrTrunc(LoadSrc2, MaxLoadType); } - Diff = Builder.CreateXor(LoadSrc1, LoadSrc2); - Diff = Builder.CreateZExtOrTrunc(Diff, MaxLoadType); - XorList.push_back(Diff); + if (NumLoads != 1) { + // If we have multiple loads per block, we need to generate a composite + // comparison using xor+or. + Diff = Builder.CreateXor(LoadSrc1, LoadSrc2); + Diff = Builder.CreateZExtOrTrunc(Diff, MaxLoadType); + XorList.push_back(Diff); + } else { + // If there's only one load per block, we just compare the loaded values. + Cmp = Builder.CreateICmpNE(LoadSrc1, LoadSrc2); + } } auto pairWiseOr = [&](std::vector &InList) -> std::vector { @@ -1862,16 +1869,17 @@ void MemCmpExpansion::emitLoadCompareBlockMultipleLoads( return OutList; }; - // Pairwise OR the XOR results. - OrList = pairWiseOr(XorList); + if (!Cmp) { + // Pairwise OR the XOR results. + OrList = pairWiseOr(XorList); - // Pairwise OR the OR results until one result left. - while (OrList.size() != 1) { - OrList = pairWiseOr(OrList); + // Pairwise OR the OR results until one result left. + while (OrList.size() != 1) { + OrList = pairWiseOr(OrList); + } + Cmp = Builder.CreateICmpNE(OrList[0], ConstantInt::get(Diff->getType(), 0)); } - Value *Cmp = Builder.CreateICmp(ICmpInst::ICMP_NE, OrList[0], - ConstantInt::get(Diff->getType(), 0)); BasicBlock *NextBB = (Index == (LoadCmpBlocks.size() - 1)) ? EndBlock : LoadCmpBlocks[Index + 1]; diff --git a/test/CodeGen/PowerPC/memCmpUsedInZeroEqualityComparison.ll b/test/CodeGen/PowerPC/memCmpUsedInZeroEqualityComparison.ll index 584c9ea40f5..a48a42ee918 100644 --- a/test/CodeGen/PowerPC/memCmpUsedInZeroEqualityComparison.ll +++ b/test/CodeGen/PowerPC/memCmpUsedInZeroEqualityComparison.ll @@ -21,8 +21,7 @@ define signext i32 @zeroEqualityTest02(i8* %x, i8* %y) { ; CHECK-NEXT: lwz 3, 0(3) ; CHECK-NEXT: lwz 4, 0(4) ; CHECK-NEXT: li 5, 1 -; CHECK-NEXT: xor 3, 3, 4 -; CHECK-NEXT: cmplwi 3, 0 +; CHECK-NEXT: cmpld 3, 4 ; CHECK-NEXT: isel 3, 0, 5, 2 ; CHECK-NEXT: clrldi 3, 3, 32 ; CHECK-NEXT: blr @@ -38,19 +37,19 @@ define signext i32 @zeroEqualityTest01(i8* %x, i8* %y) { ; CHECK: # BB#0: # %loadbb ; CHECK-NEXT: ld 5, 0(3) ; CHECK-NEXT: ld 6, 0(4) -; CHECK-NEXT: xor. 5, 5, 6 +; CHECK-NEXT: cmpld 5, 6 ; CHECK-NEXT: bne 0, .LBB1_2 ; CHECK-NEXT: # BB#1: # %loadbb1 ; CHECK-NEXT: ld 3, 8(3) ; CHECK-NEXT: ld 4, 8(4) -; CHECK-NEXT: xor. 3, 3, 4 +; CHECK-NEXT: cmpld 3, 4 +; CHECK-NEXT: li 3, 0 ; CHECK-NEXT: beq 0, .LBB1_3 ; CHECK-NEXT: .LBB1_2: # %res_block ; CHECK-NEXT: li 3, 1 ; CHECK-NEXT: clrldi 3, 3, 32 ; CHECK-NEXT: blr -; CHECK-NEXT: .LBB1_3: -; CHECK-NEXT: li 3, 0 +; CHECK-NEXT: .LBB1_3: # %endblock ; CHECK-NEXT: clrldi 3, 3, 32 ; CHECK-NEXT: blr %call = tail call signext i32 @memcmp(i8* %x, i8* %y, i64 16) @@ -65,27 +64,24 @@ define signext i32 @zeroEqualityTest03(i8* %x, i8* %y) { ; CHECK: # BB#0: # %loadbb ; CHECK-NEXT: lwz 5, 0(3) ; CHECK-NEXT: lwz 6, 0(4) -; CHECK-NEXT: xor 5, 5, 6 -; CHECK-NEXT: cmplwi 5, 0 +; CHECK-NEXT: cmpld 5, 6 ; CHECK-NEXT: bne 0, .LBB2_3 ; CHECK-NEXT: # BB#1: # %loadbb1 ; CHECK-NEXT: lhz 5, 4(3) ; CHECK-NEXT: lhz 6, 4(4) -; CHECK-NEXT: xor 5, 5, 6 -; CHECK-NEXT: rlwinm. 5, 5, 0, 16, 31 +; CHECK-NEXT: cmpld 5, 6 ; CHECK-NEXT: bne 0, .LBB2_3 ; CHECK-NEXT: # BB#2: # %loadbb2 ; CHECK-NEXT: lbz 3, 6(3) ; CHECK-NEXT: lbz 4, 6(4) -; CHECK-NEXT: xor 3, 3, 4 -; CHECK-NEXT: rlwinm. 3, 3, 0, 24, 31 +; CHECK-NEXT: cmpld 3, 4 +; CHECK-NEXT: li 3, 0 ; CHECK-NEXT: beq 0, .LBB2_4 ; CHECK-NEXT: .LBB2_3: # %res_block ; CHECK-NEXT: li 3, 1 ; CHECK-NEXT: clrldi 3, 3, 32 ; CHECK-NEXT: blr -; CHECK-NEXT: .LBB2_4: -; CHECK-NEXT: li 3, 0 +; CHECK-NEXT: .LBB2_4: # %endblock ; CHECK-NEXT: clrldi 3, 3, 32 ; CHECK-NEXT: blr %call = tail call signext i32 @memcmp(i8* %x, i8* %y, i64 7) @@ -178,24 +174,22 @@ define signext i32 @zeroEqualityTest06() { ; CHECK-NEXT: addis 4, 2, .LzeroEqualityTest04.buffer2@toc@ha ; CHECK-NEXT: ld 3, .LzeroEqualityTest04.buffer1@toc@l(3) ; CHECK-NEXT: ld 4, .LzeroEqualityTest04.buffer2@toc@l(4) -; CHECK-NEXT: xor. 3, 3, 4 +; CHECK-NEXT: cmpld 3, 4 ; CHECK-NEXT: bne 0, .LBB5_2 ; CHECK-NEXT: # BB#1: # %loadbb1 ; CHECK-NEXT: addis 3, 2, .LzeroEqualityTest04.buffer1@toc@ha+8 ; CHECK-NEXT: addis 4, 2, .LzeroEqualityTest04.buffer2@toc@ha+8 ; CHECK-NEXT: ld 3, .LzeroEqualityTest04.buffer1@toc@l+8(3) ; CHECK-NEXT: ld 4, .LzeroEqualityTest04.buffer2@toc@l+8(4) -; CHECK-NEXT: xor. 3, 3, 4 -; CHECK-NEXT: beq 0, .LBB5_4 +; CHECK-NEXT: cmpld 3, 4 +; CHECK-NEXT: li 3, 0 +; CHECK-NEXT: beq 0, .LBB5_3 ; CHECK-NEXT: .LBB5_2: # %res_block ; CHECK-NEXT: li 3, 1 ; CHECK-NEXT: .LBB5_3: # %endblock ; CHECK-NEXT: cntlzw 3, 3 ; CHECK-NEXT: srwi 3, 3, 5 ; CHECK-NEXT: blr -; CHECK-NEXT: .LBB5_4: -; CHECK-NEXT: li 3, 0 -; CHECK-NEXT: b .LBB5_3 %call = tail call signext i32 @memcmp(i8* bitcast ([15 x i32]* @zeroEqualityTest04.buffer1 to i8*), i8* bitcast ([15 x i32]* @zeroEqualityTest04.buffer2 to i8*), i64 16) %not.tobool = icmp eq i32 %call, 0 %cond = zext i1 %not.tobool to i32