From f1bc4aa5fa5b44ded9991dfa0df5d1a9d13ff5b7 Mon Sep 17 00:00:00 2001
From: Clement Courbet
Date: Fri, 27 Oct 2017 12:34:18 +0000
Subject: [PATCH] [CodeGen][ExpandMemCmp][NFC] Simplify load sequence generation.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@316763 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/CodeGen/CodeGenPrepare.cpp |  73 ++++++++----------
 test/CodeGen/X86/memcmp.ll     | 135 ++++++++++++++-------
 2 files changed, 115 insertions(+), 93 deletions(-)

diff --git a/lib/CodeGen/CodeGenPrepare.cpp b/lib/CodeGen/CodeGenPrepare.cpp
index 1e5f15397bb..346248ffc06 100644
--- a/lib/CodeGen/CodeGenPrepare.cpp
+++ b/lib/CodeGen/CodeGenPrepare.cpp
@@ -1743,7 +1743,6 @@ class MemCmpExpansion {
     const uint64_t Offset;
   };
   SmallVector<LoadEntry, 8> LoadSequence;
-  void computeLoadSequence();

   void createLoadCmpBlocks();
   void createResultBlock();
@@ -1759,18 +1758,13 @@ class MemCmpExpansion {
   Value *getMemCmpEqZeroOneBlock();
   Value *getMemCmpOneBlock();

-  // Computes the decomposition. THis is the common code to compute the number
-  // of loads and the actual load sequence. `callback` is called with each load
-  // size and number of loads for the block size.
-  template <typename CallBackT>
-  void getDecomposition(CallBackT callback) const;
-
 public:
   MemCmpExpansion(CallInst *CI, uint64_t Size, unsigned MaxLoadSize,
-                  unsigned NumLoadsPerBlock, const DataLayout &DL);
+                  unsigned MaxNumLoads, unsigned NumLoadsPerBlock,
+                  const DataLayout &DL);

   unsigned getNumBlocks();
-  uint64_t getNumLoads() const { return NumLoads; }
+  uint64_t getNumLoads() const { return LoadSequence.size(); }

   Value *getMemCmpExpansion();
 };
@@ -1787,6 +1781,7 @@ class MemCmpExpansion {
 // LoadCmpBlock finds a difference.
 MemCmpExpansion::MemCmpExpansion(CallInst *const CI, uint64_t Size,
                                  const unsigned MaxLoadSize,
+                                 const unsigned MaxNumLoads,
                                  const unsigned LoadsPerBlock,
                                  const DataLayout &TheDataLayout)
     : CI(CI),
@@ -1798,27 +1793,34 @@ MemCmpExpansion::MemCmpExpansion(CallInst *const CI, uint64_t Size,
       IsUsedForZeroCmp(isOnlyUsedInZeroEqualityComparison(CI)),
       DL(TheDataLayout), Builder(CI) {
+  assert(Size > 0 && "zero blocks");
   // Scale the max size down if the target can load more bytes than we need.
   while (this->MaxLoadSize > Size) {
     this->MaxLoadSize /= 2;
   }
-  // Compute the number of loads. At that point we don't want to compute the
-  // actual decomposition because it might be too large to fit in memory.
-  getDecomposition([this](unsigned LoadSize, uint64_t NumLoadsForSize) {
-    NumLoads += NumLoadsForSize;
-  });
-}
-
-template <typename CallBackT>
-void MemCmpExpansion::getDecomposition(CallBackT callback) const {
+  // Compute the decomposition.
   unsigned LoadSize = this->MaxLoadSize;
-  assert(Size > 0 && "zero blocks");
   uint64_t CurSize = Size;
+  uint64_t Offset = 0;
   while (CurSize) {
     assert(LoadSize > 0 && "zero load size");
     const uint64_t NumLoadsForThisSize = CurSize / LoadSize;
+    if (LoadSequence.size() + NumLoadsForThisSize > MaxNumLoads) {
+      // Do not expand if the total number of loads is larger than what the
+      // target allows. Note that it's important that we exit before completing
+      // the expansion to avoid using a ton of memory to store the expansion for
+      // large sizes.
+      LoadSequence.clear();
+      return;
+    }
     if (NumLoadsForThisSize > 0) {
-      callback(LoadSize, NumLoadsForThisSize);
+      for (uint64_t I = 0; I < NumLoadsForThisSize; ++I) {
+        LoadSequence.push_back({LoadSize, Offset});
+        Offset += LoadSize;
+      }
+      if (LoadSize > 1) {
+        ++NumLoadsNonOneByte;
+      }
       CurSize = CurSize % LoadSize;
     }
     // FIXME: This can result in a non-native load size (e.g. X86-32+SSE can
@@ -1827,21 +1829,7 @@ void MemCmpExpansion::getDecomposition(CallBackT callback) const {
     // 4).
     LoadSize /= 2;
   }
-}
-
-void MemCmpExpansion::computeLoadSequence() {
-  uint64_t Offset = 0;
-  getDecomposition(
-      [this, &Offset](unsigned LoadSize, uint64_t NumLoadsForSize) {
-        for (uint64_t I = 0; I < NumLoadsForSize; ++I) {
-          LoadSequence.push_back({LoadSize, Offset});
-          Offset += LoadSize;
-        }
-        if (LoadSize > 1) {
-          ++NumLoadsNonOneByte;
-        }
-      });
-  assert(LoadSequence.size() == getNumLoads() && "mismatch in numbe rof loads");
+  assert(LoadSequence.size() <= MaxNumLoads && "broken invariant");
 }

 unsigned MemCmpExpansion::getNumBlocks() {
@@ -2241,7 +2229,6 @@ Value *MemCmpExpansion::getMemCmpOneBlock() {
 // This function expands the memcmp call into an inline expansion and returns
 // the memcmp result.
 Value *MemCmpExpansion::getMemCmpExpansion() {
-  computeLoadSequence();
   // A memcmp with zero-comparison with only one block of load and compare does
   // not need to set up any extra blocks. This case could be handled in the DAG,
   // but since we have all of the machinery to flexibly expand any memcpy here,
@@ -2372,17 +2359,23 @@ static bool expandMemCmp(CallInst *CI, const TargetTransformInfo *TTI,
   }
   const uint64_t SizeVal = SizeCast->getZExtValue();

+  if (SizeVal == 0) {
+    return false;
+  }
+
   // TTI call to check if target would like to expand memcmp. Also, get the
   // max LoadSize.
   unsigned MaxLoadSize;
   if (!TTI->enableMemCmpExpansion(MaxLoadSize)) return false;

-  MemCmpExpansion Expansion(CI, SizeVal, MaxLoadSize, MemCmpNumLoadsPerBlock,
-                            *DL);
+  const unsigned MaxNumLoads =
+      TLI->getMaxExpandSizeMemcmp(CI->getFunction()->optForSize());
+
+  MemCmpExpansion Expansion(CI, SizeVal, MaxLoadSize, MaxNumLoads,
+                            MemCmpNumLoadsPerBlock, *DL);

   // Don't expand if this will require more loads than desired by the target.
-  if (Expansion.getNumLoads() >
-      TLI->getMaxExpandSizeMemcmp(CI->getFunction()->optForSize())) {
+  if (Expansion.getNumLoads() == 0) {
     NumMemCmpGreaterThanMax++;
     return false;
   }
diff --git a/test/CodeGen/X86/memcmp.ll b/test/CodeGen/X86/memcmp.ll
index 3cf26788123..64e62dae47a 100644
--- a/test/CodeGen/X86/memcmp.ll
+++ b/test/CodeGen/X86/memcmp.ll
@@ -13,6 +13,35 @@

 declare i32 @memcmp(i8*, i8*, i64)

+define i32 @length0(i8* %X, i8* %Y) nounwind {
+; X86-LABEL: length0:
+; X86:       # BB#0:
+; X86-NEXT:    xorl %eax, %eax
+; X86-NEXT:    retl
+;
+; X64-LABEL: length0:
+; X64:       # BB#0:
+; X64-NEXT:    xorl %eax, %eax
+; X64-NEXT:    retq
+  %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 0) nounwind
+  ret i32 %m
+}
+
+define i1 @length0_eq(i8* %X, i8* %Y) nounwind {
+; X86-LABEL: length0_eq:
+; X86:       # BB#0:
+; X86-NEXT:    movb $1, %al
+; X86-NEXT:    retl
+;
+; X64-LABEL: length0_eq:
+; X64:       # BB#0:
+; X64-NEXT:    movb $1, %al
+; X64-NEXT:    retq
+  %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 0) nounwind
+  %c = icmp eq i32 %m, 0
+  ret i1 %c
+}
+
 define i32 @length2(i8* %X, i8* %Y) nounwind {
 ; X86-LABEL: length2:
 ; X86:       # BB#0:
@@ -120,14 +149,14 @@ define i32 @length3(i8* %X, i8* %Y) nounwind {
 ; X86-NEXT:    rolw $8, %dx
 ; X86-NEXT:    rolw $8, %si
 ; X86-NEXT:    cmpw %si, %dx
-; X86-NEXT:    jne .LBB4_1
+; X86-NEXT:    jne .LBB6_1
 ; X86-NEXT:  # BB#2: # %loadbb1
 ; X86-NEXT:    movzbl 2(%eax), %eax
 ; X86-NEXT:    movzbl 2(%ecx), %ecx
 ; X86-NEXT:    subl %ecx, %eax
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    retl
-; X86-NEXT:  .LBB4_1: # %res_block
+; X86-NEXT:  .LBB6_1: # %res_block
 ; X86-NEXT:    setae %al
 ; X86-NEXT:    movzbl %al, %eax
 ; X86-NEXT:    leal -1(%eax,%eax), %eax
@@ -141,13 +170,13 @@ define i32 @length3(i8* %X, i8* %Y) nounwind {
 ; X64-NEXT:    rolw $8, %ax
 ; X64-NEXT:    rolw $8, %cx
 ; X64-NEXT:    cmpw %cx, %ax
-; X64-NEXT:    jne .LBB4_1
+; X64-NEXT:    jne .LBB6_1
 ; X64-NEXT:  # BB#2: # %loadbb1
 ; X64-NEXT:    movzbl 2(%rdi), %eax
 ; X64-NEXT:    movzbl 2(%rsi), %ecx
 ; X64-NEXT:    subl %ecx, %eax
 ; X64-NEXT:    retq
-; X64-NEXT:  .LBB4_1: # %res_block
+; X64-NEXT:  .LBB6_1: # %res_block
 ; X64-NEXT:    setae %al
 ; X64-NEXT:    movzbl %al, %eax
 ; X64-NEXT:    leal -1(%rax,%rax), %eax
@@ -163,15 +192,15 @@ define i1 @length3_eq(i8* %X, i8* %Y) nounwind {
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movzwl (%eax), %edx
 ; X86-NEXT:    cmpw (%ecx), %dx
-; X86-NEXT:    jne .LBB5_1
+; X86-NEXT:    jne .LBB7_1
 ; X86-NEXT:  # BB#2: # %loadbb1
 ; X86-NEXT:    movb 2(%eax), %dl
 ; X86-NEXT:    xorl %eax, %eax
 ; X86-NEXT:    cmpb 2(%ecx), %dl
-; X86-NEXT:    je .LBB5_3
-; X86-NEXT:  .LBB5_1: # %res_block
+; X86-NEXT:    je .LBB7_3
+; X86-NEXT:  .LBB7_1: # %res_block
 ; X86-NEXT:    movl $1, %eax
-; X86-NEXT:  .LBB5_3: # %endblock
+; X86-NEXT:  .LBB7_3: # %endblock
 ; X86-NEXT:    testl %eax, %eax
 ; X86-NEXT:    setne %al
 ; X86-NEXT:    retl
@@ -180,15 +209,15 @@ define i1 @length3_eq(i8* %X, i8* %Y) nounwind {
 ; X64:       # BB#0: # %loadbb
 ; X64-NEXT:    movzwl (%rdi), %eax
 ; X64-NEXT:    cmpw (%rsi), %ax
-; X64-NEXT:    jne .LBB5_1
+; X64-NEXT:    jne .LBB7_1
 ; X64-NEXT:  # BB#2: # %loadbb1
 ; X64-NEXT:    movb 2(%rdi), %cl
 ; X64-NEXT:    xorl %eax, %eax
 ; X64-NEXT:    cmpb 2(%rsi), %cl
-; X64-NEXT:    je .LBB5_3
-; X64-NEXT:  .LBB5_1: # %res_block
+; X64-NEXT:    je .LBB7_3
+; X64-NEXT:  .LBB7_1: # %res_block
 ; X64-NEXT:    movl $1, %eax
-; X64-NEXT:  .LBB5_3: # %endblock
+; X64-NEXT:  .LBB7_3: # %endblock
 ; X64-NEXT:    testl %eax, %eax
 ; X64-NEXT:    setne %al
 ; X64-NEXT:    retq
@@ -277,14 +306,14 @@ define i32 @length5(i8* %X, i8* %Y) nounwind {
 ; X86-NEXT:    bswapl %edx
 ; X86-NEXT:    bswapl %esi
 ; X86-NEXT:    cmpl %esi, %edx
-; X86-NEXT:    jne .LBB9_1
+; X86-NEXT:    jne .LBB11_1
 ; X86-NEXT:  # BB#2: # %loadbb1
 ; X86-NEXT:    movzbl 4(%eax), %eax
 ; X86-NEXT:    movzbl 4(%ecx), %ecx
 ; X86-NEXT:    subl %ecx, %eax
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    retl
-; X86-NEXT:  .LBB9_1: # %res_block
+; X86-NEXT:  .LBB11_1: # %res_block
 ; X86-NEXT:    setae %al
 ; X86-NEXT:    movzbl %al, %eax
 ; X86-NEXT:    leal -1(%eax,%eax), %eax
@@ -298,13 +327,13 @@ define i32 @length5(i8* %X, i8* %Y) nounwind {
 ; X64-NEXT:    bswapl %eax
 ; X64-NEXT:    bswapl %ecx
 ; X64-NEXT:    cmpl %ecx, %eax
-; X64-NEXT:    jne .LBB9_1
+; X64-NEXT:    jne .LBB11_1
 ; X64-NEXT:  # BB#2: # %loadbb1
 ; X64-NEXT:    movzbl 4(%rdi), %eax
 ; X64-NEXT:    movzbl 4(%rsi), %ecx
 ; X64-NEXT:    subl %ecx, %eax
 ; X64-NEXT:    retq
-; X64-NEXT:  .LBB9_1: # %res_block
+; X64-NEXT:  .LBB11_1: # %res_block
 ; X64-NEXT:    setae %al
 ; X64-NEXT:    movzbl %al, %eax
 ; X64-NEXT:    leal -1(%rax,%rax), %eax
@@ -320,15 +349,15 @@ define i1 @length5_eq(i8* %X, i8* %Y) nounwind {
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl (%eax), %edx
 ; X86-NEXT:    cmpl (%ecx), %edx
-; X86-NEXT:    jne .LBB10_1
+; X86-NEXT:    jne .LBB12_1
 ; X86-NEXT:  # BB#2: # %loadbb1
 ; X86-NEXT:    movb 4(%eax), %dl
 ; X86-NEXT:    xorl %eax, %eax
 ; X86-NEXT:    cmpb 4(%ecx), %dl
-; X86-NEXT:    je .LBB10_3
-; X86-NEXT:  .LBB10_1: # %res_block
+; X86-NEXT:    je .LBB12_3
+; X86-NEXT:  .LBB12_1: # %res_block
 ; X86-NEXT:    movl $1, %eax
-; X86-NEXT:  .LBB10_3: # %endblock
+; X86-NEXT:  .LBB12_3: # %endblock
 ; X86-NEXT:    testl %eax, %eax
 ; X86-NEXT:    setne %al
 ; X86-NEXT:    retl
@@ -337,15 +366,15 @@ define i1 @length5_eq(i8* %X, i8* %Y) nounwind {
 ; X64:       # BB#0: # %loadbb
 ; X64-NEXT:    movl (%rdi), %eax
 ; X64-NEXT:    cmpl (%rsi), %eax
-; X64-NEXT:    jne .LBB10_1
+; X64-NEXT:    jne .LBB12_1
 ; X64-NEXT:  # BB#2: # %loadbb1
 ; X64-NEXT:    movb 4(%rdi), %cl
 ; X64-NEXT:    xorl %eax, %eax
 ; X64-NEXT:    cmpb 4(%rsi), %cl
-; X64-NEXT:    je .LBB10_3
-; X64-NEXT:  .LBB10_1: # %res_block
+; X64-NEXT:    je .LBB12_3
+; X64-NEXT:  .LBB12_1: # %res_block
 ; X64-NEXT:    movl $1, %eax
-; X64-NEXT:  .LBB10_3: # %endblock
+; X64-NEXT:  .LBB12_3: # %endblock
 ; X64-NEXT:    testl %eax, %eax
 ; X64-NEXT:    setne %al
 ; X64-NEXT:    retq
@@ -365,7 +394,7 @@ define i32 @length8(i8* %X, i8* %Y) nounwind {
 ; X86-NEXT:    bswapl %ecx
 ; X86-NEXT:    bswapl %edx
 ; X86-NEXT:    cmpl %edx, %ecx
-; X86-NEXT:    jne .LBB11_1
+; X86-NEXT:    jne .LBB13_1
 ; X86-NEXT:  # BB#2: # %loadbb1
 ; X86-NEXT:    movl 4(%esi), %ecx
 ; X86-NEXT:    movl 4(%eax), %edx
@@ -373,11 +402,11 @@ define i32 @length8(i8* %X, i8* %Y) nounwind {
 ; X86-NEXT:    bswapl %edx
 ; X86-NEXT:    xorl %eax, %eax
 ; X86-NEXT:    cmpl %edx, %ecx
-; X86-NEXT:    jne .LBB11_1
+; X86-NEXT:    jne .LBB13_1
 ; X86-NEXT:  # BB#3: # %endblock
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    retl
-; X86-NEXT:  .LBB11_1: # %res_block
+; X86-NEXT:  .LBB13_1: # %res_block
 ; X86-NEXT:    xorl %eax, %eax
 ; X86-NEXT:    cmpl %edx, %ecx
 ; X86-NEXT:    setae %al
@@ -407,15 +436,15 @@ define i1 @length8_eq(i8* %X, i8* %Y) nounwind {
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl (%eax), %edx
 ; X86-NEXT:    cmpl (%ecx), %edx
-; X86-NEXT:    jne .LBB12_1
+; X86-NEXT:    jne .LBB14_1
 ; X86-NEXT:  # BB#2: # %loadbb1
 ; X86-NEXT:    movl 4(%eax), %edx
 ; X86-NEXT:    xorl %eax, %eax
 ; X86-NEXT:    cmpl 4(%ecx), %edx
-; X86-NEXT:    je .LBB12_3
-; X86-NEXT:  .LBB12_1: # %res_block
+; X86-NEXT:    je .LBB14_3
+; X86-NEXT:  .LBB14_1: # %res_block
 ; X86-NEXT:    movl $1, %eax
-; X86-NEXT:  .LBB12_3: # %endblock
+; X86-NEXT:  .LBB14_3: # %endblock
 ; X86-NEXT:    testl %eax, %eax
 ; X86-NEXT:    sete %al
 ; X86-NEXT:    retl
@@ -436,14 +465,14 @@ define i1 @length8_eq_const(i8* %X) nounwind {
 ; X86:       # BB#0: # %loadbb
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    cmpl $858927408, (%ecx) # imm = 0x33323130
-; X86-NEXT:    jne .LBB13_1
+; X86-NEXT:    jne .LBB15_1
 ; X86-NEXT:  # BB#2: # %loadbb1
 ; X86-NEXT:    xorl %eax, %eax
 ; X86-NEXT:    cmpl $926299444, 4(%ecx) # imm = 0x37363534
-; X86-NEXT:    je .LBB13_3
-; X86-NEXT:  .LBB13_1: # %res_block
+; X86-NEXT:    je .LBB15_3
+; X86-NEXT:  .LBB15_1: # %res_block
 ; X86-NEXT:    movl $1, %eax
-; X86-NEXT:  .LBB13_3: # %endblock
+; X86-NEXT:  .LBB15_3: # %endblock
 ; X86-NEXT:    testl %eax, %eax
 ; X86-NEXT:    setne %al
 ; X86-NEXT:    retl
@@ -476,15 +505,15 @@ define i1 @length12_eq(i8* %X, i8* %Y) nounwind {
 ; X64:       # BB#0: # %loadbb
 ; X64-NEXT:    movq (%rdi), %rax
 ; X64-NEXT:    cmpq (%rsi), %rax
-; X64-NEXT:    jne .LBB14_1
+; X64-NEXT:    jne .LBB16_1
 ; X64-NEXT:  # BB#2: # %loadbb1
 ; X64-NEXT:    movl 8(%rdi), %ecx
 ; X64-NEXT:    xorl %eax, %eax
 ; X64-NEXT:    cmpl 8(%rsi), %ecx
-; X64-NEXT:    je .LBB14_3
-; X64-NEXT:  .LBB14_1: # %res_block
+; X64-NEXT:    je .LBB16_3
+; X64-NEXT:  .LBB16_1: # %res_block
 ; X64-NEXT:    movl $1, %eax
-; X64-NEXT:  .LBB14_3: # %endblock
+; X64-NEXT:  .LBB16_3: # %endblock
 ; X64-NEXT:    testl %eax, %eax
 ; X64-NEXT:    setne %al
 ; X64-NEXT:    retq
@@ -511,7 +540,7 @@ define i32 @length12(i8* %X, i8* %Y) nounwind {
 ; X64-NEXT:    bswapq %rcx
 ; X64-NEXT:    bswapq %rdx
 ; X64-NEXT:    cmpq %rdx, %rcx
-; X64-NEXT:    jne .LBB15_1
+; X64-NEXT:    jne .LBB17_1
 ; X64-NEXT:  # BB#2: # %loadbb1
 ; X64-NEXT:    movl 8(%rdi), %ecx
 ; X64-NEXT:    movl 8(%rsi), %edx
@@ -519,10 +548,10 @@ define i32 @length12(i8* %X, i8* %Y) nounwind {
 ; X64-NEXT:    bswapl %edx
 ; X64-NEXT:    xorl %eax, %eax
 ; X64-NEXT:    cmpq %rdx, %rcx
-; X64-NEXT:    jne .LBB15_1
+; X64-NEXT:    jne .LBB17_1
 ; X64-NEXT:  # BB#3: # %endblock
 ; X64-NEXT:    retq
-; X64-NEXT:  .LBB15_1: # %res_block
+; X64-NEXT:  .LBB17_1: # %res_block
 ; X64-NEXT:    xorl %eax, %eax
 ; X64-NEXT:    cmpq %rdx, %rcx
 ; X64-NEXT:    setae %al
@@ -552,7 +581,7 @@ define i32 @length16(i8* %X, i8* %Y) nounwind {
 ; X64-NEXT:    bswapq %rcx
 ; X64-NEXT:    bswapq %rdx
 ; X64-NEXT:    cmpq %rdx, %rcx
-; X64-NEXT:    jne .LBB16_1
+; X64-NEXT:    jne .LBB18_1
 ; X64-NEXT:  # BB#2: # %loadbb1
 ; X64-NEXT:    movq 8(%rdi), %rcx
 ; X64-NEXT:    movq 8(%rsi), %rdx
@@ -560,10 +589,10 @@ define i32 @length16(i8* %X, i8* %Y) nounwind {
 ; X64-NEXT:    bswapq %rdx
 ; X64-NEXT:    xorl %eax, %eax
 ; X64-NEXT:    cmpq %rdx, %rcx
-; X64-NEXT:    jne .LBB16_1
+; X64-NEXT:    jne .LBB18_1
 ; X64-NEXT:  # BB#3: # %endblock
 ; X64-NEXT:    retq
-; X64-NEXT:  .LBB16_1: # %res_block
+; X64-NEXT:  .LBB18_1: # %res_block
 ; X64-NEXT:    xorl %eax, %eax
 ; X64-NEXT:    cmpq %rdx, %rcx
 ; X64-NEXT:    setae %al
@@ -614,15 +643,15 @@ define i1 @length16_eq(i8* %x, i8* %y) nounwind {
 ; X64:       # BB#0: # %loadbb
 ; X64-NEXT:    movq (%rdi), %rax
 ; X64-NEXT:    cmpq (%rsi), %rax
-; X64-NEXT:    jne .LBB17_1
+; X64-NEXT:    jne .LBB19_1
 ; X64-NEXT:  # BB#2: # %loadbb1
 ; X64-NEXT:    movq 8(%rdi), %rcx
 ; X64-NEXT:    xorl %eax, %eax
 ; X64-NEXT:    cmpq 8(%rsi), %rcx
-; X64-NEXT:    je .LBB17_3
-; X64-NEXT:  .LBB17_1: # %res_block
+; X64-NEXT:    je .LBB19_3
+; X64-NEXT:  .LBB19_1: # %res_block
 ; X64-NEXT:    movl $1, %eax
-; X64-NEXT:  .LBB17_3: # %endblock
+; X64-NEXT:  .LBB19_3: # %endblock
 ; X64-NEXT:    testl %eax, %eax
 ; X64-NEXT:    setne %al
 ; X64-NEXT:    retq
@@ -670,15 +699,15 @@ define i1 @length16_eq_const(i8* %X) nounwind {
 ; X64:       # BB#0: # %loadbb
 ; X64-NEXT:    movabsq $3978425819141910832, %rax # imm = 0x3736353433323130
 ; X64-NEXT:    cmpq %rax, (%rdi)
-; X64-NEXT:    jne .LBB18_1
+; X64-NEXT:    jne .LBB20_1
 ; X64-NEXT:  # BB#2: # %loadbb1
 ; X64-NEXT:    xorl %eax, %eax
 ; X64-NEXT:    movabsq $3833745473465760056, %rcx # imm = 0x3534333231303938
 ; X64-NEXT:    cmpq %rcx, 8(%rdi)
-; X64-NEXT:    je .LBB18_3
-; X64-NEXT:  .LBB18_1: # %res_block
+; X64-NEXT:    je .LBB20_3
+; X64-NEXT:  .LBB20_1: # %res_block
 ; X64-NEXT:    movl $1, %eax
-; X64-NEXT:  .LBB18_3: # %endblock
+; X64-NEXT:  .LBB20_3: # %endblock
 ; X64-NEXT:    testl %eax, %eax
 ; X64-NEXT:    sete %al
 ; X64-NEXT:    retq
-- 
2.40.0
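
P.S. For readers following the constructor change: the old getDecomposition/
computeLoadSequence pair is now a single greedy pass that covers Size bytes
with loads of decreasing power-of-two sizes and clears the sequence as soon
as MaxNumLoads would be exceeded, which is what lets expandMemCmp treat
getNumLoads() == 0 as "do not expand". The standalone C++ sketch below mirrors
that logic outside of LLVM so it can be compiled and experimented with; the
free function computeLoadSequence, the use of std::vector in place of
SmallVector, and the values in main are assumptions for illustration only,
not code from this patch.

#include <cstdint>
#include <cstdio>
#include <vector>

struct LoadEntry {
  unsigned LoadSize; // Size of this load, in bytes.
  uint64_t Offset;   // Byte offset of this load from the start of the buffers.
};

// Greedily covers `Size` bytes with loads of decreasing power-of-two sizes,
// starting at `MaxLoadSize`. Returns an empty sequence when more than
// `MaxNumLoads` loads would be needed, mirroring the early bail-out in the
// patched constructor.
static std::vector<LoadEntry> computeLoadSequence(uint64_t Size,
                                                  unsigned MaxLoadSize,
                                                  unsigned MaxNumLoads) {
  std::vector<LoadEntry> Seq;
  // Scale the max load size down if it is larger than the total size.
  while (MaxLoadSize > Size)
    MaxLoadSize /= 2;
  uint64_t CurSize = Size;
  uint64_t Offset = 0;
  for (unsigned LoadSize = MaxLoadSize; CurSize != 0; LoadSize /= 2) {
    const uint64_t NumLoadsForThisSize = CurSize / LoadSize;
    if (Seq.size() + NumLoadsForThisSize > MaxNumLoads)
      return {}; // Too many loads: give up before materializing the rest.
    for (uint64_t I = 0; I < NumLoadsForThisSize; ++I) {
      Seq.push_back({LoadSize, Offset});
      Offset += LoadSize;
    }
    CurSize %= LoadSize; // The remainder is covered by smaller load sizes.
  }
  return Seq;
}

int main() {
  // A 7-byte memcmp with at most 4-byte loads decomposes as 4@0, 2@4, 1@6.
  for (const LoadEntry &E : computeLoadSequence(7, 4, 8))
    std::printf("load %u bytes at offset %llu\n", E.LoadSize,
                (unsigned long long)E.Offset);
  return 0;
}

Note how, as in the patch, the bail-out leaves the sequence empty rather than
partially built, which both bounds memory use for very large sizes and doubles
as the "too many loads" signal checked by the caller.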