From: Sanjay Patel Date: Wed, 14 Jun 2017 17:00:57 +0000 (+0000) Subject: [x86] replace div/rem with shift/mask for better shuffle combining perf X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=b76b866ea10fe9d593596c2776a85cb8e6a58128;p=llvm [x86] replace div/rem with shift/mask for better shuffle combining perf We know that shuffle masks are power-of-2 sizes, but there's no way (?) for LLVM to know that, so hack combineX86ShufflesRecursively() to be much faster by replacing div/rem with shift/mask. This makes the motivating compile-time bug in PR32037 ( https://bugs.llvm.org/show_bug.cgi?id=32037 ) about 9% faster overall. Differential Revision: https://reviews.llvm.org/D34174 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@305398 91177308-0d34-0410-b5e6-96231b3b80d8 --- diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp index 77de1a89977..c7e4f4dc14e 100644 --- a/lib/Target/X86/X86ISelLowering.cpp +++ b/lib/Target/X86/X86ISelLowering.cpp @@ -27970,28 +27970,45 @@ static bool combineX86ShufflesRecursively(ArrayRef SrcOps, OpMask.size() % RootMask.size() == 0) || OpMask.size() == RootMask.size()) && "The smaller number of elements must divide the larger."); - int MaskWidth = std::max(OpMask.size(), RootMask.size()); - int RootRatio = std::max(1, OpMask.size() / RootMask.size()); - int OpRatio = std::max(1, RootMask.size() / OpMask.size()); - assert(((RootRatio == 1 && OpRatio == 1) || - (RootRatio == 1) != (OpRatio == 1)) && + + // This function can be performance-critical, so we rely on the power-of-2 + // knowledge that we have about the mask sizes to replace div/rem ops with + // bit-masks and shifts. + assert(isPowerOf2_32(RootMask.size()) && "Non-power-of-2 shuffle mask sizes"); + assert(isPowerOf2_32(OpMask.size()) && "Non-power-of-2 shuffle mask sizes"); + unsigned RootMaskSizeLog2 = countTrailingZeros(RootMask.size()); + unsigned OpMaskSizeLog2 = countTrailingZeros(OpMask.size()); + + unsigned MaskWidth = std::max(OpMask.size(), RootMask.size()); + unsigned RootRatio = std::max(1, OpMask.size() >> RootMaskSizeLog2); + unsigned OpRatio = std::max(1, RootMask.size() >> OpMaskSizeLog2); + assert((RootRatio == 1 || OpRatio == 1) && "Must not have a ratio for both incoming and op masks!"); - SmallVector Mask((unsigned)MaskWidth, SM_SentinelUndef); + assert(isPowerOf2_32(MaskWidth) && "Non-power-of-2 shuffle mask sizes"); + assert(isPowerOf2_32(RootRatio) && "Non-power-of-2 shuffle mask sizes"); + assert(isPowerOf2_32(OpRatio) && "Non-power-of-2 shuffle mask sizes"); + unsigned RootRatioLog2 = countTrailingZeros(RootRatio); + unsigned OpRatioLog2 = countTrailingZeros(OpRatio); + + SmallVector Mask(MaskWidth, SM_SentinelUndef); // Merge this shuffle operation's mask into our accumulated mask. Note that // this shuffle's mask will be the first applied to the input, followed by the // root mask to get us all the way to the root value arrangement. The reason // for this order is that we are recursing up the operation chain. - for (int i = 0; i < MaskWidth; ++i) { - int RootIdx = i / RootRatio; + for (unsigned i = 0; i < MaskWidth; ++i) { + unsigned RootIdx = i >> RootRatioLog2; if (RootMask[RootIdx] < 0) { // This is a zero or undef lane, we're done. Mask[i] = RootMask[RootIdx]; continue; } - int RootMaskedIdx = RootMask[RootIdx] * RootRatio + i % RootRatio; + // TODO: Here and below, we could convert multiply to shift-left for + // performance because we know that our mask sizes are power-of-2. + unsigned RootMaskedIdx = + RootMask[RootIdx] * RootRatio + (i & (RootRatio - 1)); // Just insert the scaled root mask value if it references an input other // than the SrcOp we're currently inserting. @@ -28001,9 +28018,9 @@ static bool combineX86ShufflesRecursively(ArrayRef SrcOps, continue; } - RootMaskedIdx %= MaskWidth; + RootMaskedIdx = RootMaskedIdx & (MaskWidth - 1); - int OpIdx = RootMaskedIdx / OpRatio; + unsigned OpIdx = RootMaskedIdx >> OpRatioLog2; if (OpMask[OpIdx] < 0) { // The incoming lanes are zero or undef, it doesn't matter which ones we // are using. @@ -28012,8 +28029,9 @@ static bool combineX86ShufflesRecursively(ArrayRef SrcOps, } // Ok, we have non-zero lanes, map them through to one of the Op's inputs. - int OpMaskedIdx = OpMask[OpIdx] * OpRatio + RootMaskedIdx % OpRatio; - OpMaskedIdx %= MaskWidth; + unsigned OpMaskedIdx = + OpMask[OpIdx] * OpRatio + (RootMaskedIdx & (OpRatio - 1)); + OpMaskedIdx = OpMaskedIdx & (MaskWidth - 1); if (OpMask[OpIdx] < (int)OpMask.size()) { assert(0 <= InputIdx0 && "Unknown target shuffle input");