From: Sanjay Patel Date: Tue, 27 Jun 2017 23:15:01 +0000 (+0000) Subject: [CGP] add specialization for memcmp expansion with only one basic block X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=dbbccbae9753529ecdf61a4792b66c0666c020f7;p=llvm [CGP] add specialization for memcmp expansion with only one basic block git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@306485 91177308-0d34-0410-b5e6-96231b3b80d8 --- diff --git a/lib/CodeGen/CodeGenPrepare.cpp b/lib/CodeGen/CodeGenPrepare.cpp index 474a214723e..892b85fe239 100644 --- a/lib/CodeGen/CodeGenPrepare.cpp +++ b/lib/CodeGen/CodeGenPrepare.cpp @@ -1679,6 +1679,7 @@ class MemCmpExpansion { void emitMemCmpResultBlock(); Value *getMemCmpExpansionZeroCase(unsigned Size); Value *getMemCmpEqZeroOneBlock(unsigned Size); + Value *getMemCmpOneBlock(unsigned Size); unsigned getLoadSize(unsigned Size); unsigned getNumLoads(unsigned Size); @@ -1711,7 +1712,7 @@ MemCmpExpansion::MemCmpExpansion(CallInst *CI, uint64_t Size, // we choose to handle this case too to avoid fragmented lowering. IsUsedForZeroCmp = isOnlyUsedInZeroEqualityComparison(CI); NumBlocks = calculateNumBlocks(Size); - if (!IsUsedForZeroCmp || NumBlocks != 1) { + if ((!IsUsedForZeroCmp && NumLoadsPerBlock != 1) || NumBlocks != 1) { BasicBlock *StartBlock = CI->getParent(); EndBlock = StartBlock->splitBasicBlock(CI, "endblock"); setupEndBlockPHINodes(); @@ -2090,6 +2091,41 @@ Value *MemCmpExpansion::getMemCmpEqZeroOneBlock(unsigned Size) { return Builder.CreateZExt(Cmp, Type::getInt32Ty(CI->getContext())); } +/// A memcmp expansion that only has one block of load and compare can bypass +/// the compare, branch, and phi IR that is required in the general case. 
+Value *MemCmpExpansion::getMemCmpOneBlock(unsigned Size) {
+  assert(NumLoadsPerBlock == 1 && "Only handles one load pair per block");
+  Type *LoadSizeType = IntegerType::get(CI->getContext(), Size * 8);
+  Value *Source1 = CI->getArgOperand(0);
+  Value *Source2 = CI->getArgOperand(1);
+
+  // Cast each source to LoadSizeType* unless it already has that pointer type.
+  if (Source1->getType() != LoadSizeType->getPointerTo())
+    Source1 = Builder.CreateBitCast(Source1, LoadSizeType->getPointerTo());
+  if (Source2->getType() != LoadSizeType->getPointerTo())
+    Source2 = Builder.CreateBitCast(Source2, LoadSizeType->getPointerTo());
+  // Load LoadSizeType from the base address.
+  Value *LoadSrc1 = Builder.CreateLoad(LoadSizeType, Source1);
+  Value *LoadSrc2 = Builder.CreateLoad(LoadSizeType, Source2);
+
+  // memcmp orders by the lowest-addressed differing byte, so on little-endian
+  // targets byte-swap multi-byte loads before the unsigned integer compare.
+  if (DL.isLittleEndian() && Size != 1) {
+    Function *Bswap = Intrinsic::getDeclaration(CI->getModule(),
+                                                Intrinsic::bswap, LoadSizeType);
+    LoadSrc1 = Builder.CreateCall(Bswap, LoadSrc1);
+    LoadSrc2 = Builder.CreateCall(Bswap, LoadSrc2);
+  }
+
+  // TODO: Instead of comparing ULT, just subtract and return the difference?
+  Value *CmpNE = Builder.CreateICmpNE(LoadSrc1, LoadSrc2);
+  Value *CmpULT = Builder.CreateICmpULT(LoadSrc1, LoadSrc2);
+  Type *I32 = Builder.getInt32Ty();
+  Value *Sel1 = Builder.CreateSelect(CmpULT, ConstantInt::get(I32, -1),
+                                     ConstantInt::get(I32, 1));
+  return Builder.CreateSelect(CmpNE, Sel1, ConstantInt::get(I32, 0));
+}
+
 // This function expands the memcmp call into an inline expansion and returns
 // the memcmp result.
 Value *MemCmpExpansion::getMemCmpExpansion(uint64_t Size) {
@@ -2097,6 +2133,10 @@ Value *MemCmpExpansion::getMemCmpExpansion(uint64_t Size) {
     return NumBlocks == 1 ? getMemCmpEqZeroOneBlock(Size)
                           : getMemCmpExpansionZeroCase(Size);
 
+  // TODO: Handle more than one load pair per block in getMemCmpOneBlock().
+  if (NumBlocks == 1 && NumLoadsPerBlock == 1)
+    return getMemCmpOneBlock(Size);
+
   // This loop calls emitLoadCompareBlock for comparing Size bytes of the two
   // memcmp sources.
It starts with loading using the maximum load size set by // the target. It processes any remaining bytes using a load size which is the diff --git a/test/CodeGen/PowerPC/memcmp.ll b/test/CodeGen/PowerPC/memcmp.ll index 039c48b2a96..7cec2a1331d 100644 --- a/test/CodeGen/PowerPC/memcmp.ll +++ b/test/CodeGen/PowerPC/memcmp.ll @@ -13,11 +13,10 @@ entry: ; CHECK: ldbrx [[LOAD1:[0-9]+]] ; CHECK-NEXT: ldbrx [[LOAD2:[0-9]+]] ; CHECK-NEXT: li [[LI:[0-9]+]], 1 -; CHECK-NEXT: cmpld [[LOAD1]], [[LOAD2]] ; CHECK-NEXT: li [[LI2:[0-9]+]], -1 +; CHECK-NEXT: cmpld [[LOAD1]], [[LOAD2]] ; CHECK-NEXT: isel [[ISEL:[0-9]+]], [[LI2]], [[LI]], 0 ; CHECK-NEXT: isel [[ISEL2:[0-9]+]], 0, [[ISEL]], 2 -; CHECK-NEXT: extsw 3, [[ISEL2]] ; CHECK-NEXT: blr } @@ -34,11 +33,10 @@ entry: ; CHECK: lwbrx [[LOAD1:[0-9]+]] ; CHECK-NEXT: lwbrx [[LOAD2:[0-9]+]] ; CHECK-NEXT: li [[LI:[0-9]+]], 1 -; CHECK-NEXT: cmpld [[LOAD1]], [[LOAD2]] ; CHECK-NEXT: li [[LI2:[0-9]+]], -1 +; CHECK-NEXT: cmplw [[LOAD1]], [[LOAD2]] ; CHECK-NEXT: isel [[ISEL:[0-9]+]], [[LI2]], [[LI]], 0 ; CHECK-NEXT: isel [[ISEL2:[0-9]+]], 0, [[ISEL]], 2 -; CHECK-NEXT: extsw 3, [[ISEL2]] ; CHECK-NEXT: blr } @@ -55,11 +53,10 @@ entry: ; CHECK: lhbrx [[LOAD1:[0-9]+]] ; CHECK-NEXT: lhbrx [[LOAD2:[0-9]+]] ; CHECK-NEXT: li [[LI:[0-9]+]], 1 -; CHECK-NEXT: cmpld [[LOAD1]], [[LOAD2]] ; CHECK-NEXT: li [[LI2:[0-9]+]], -1 +; CHECK-NEXT: cmplw [[LOAD1]], [[LOAD2]] ; CHECK-NEXT: isel [[ISEL:[0-9]+]], [[LI2]], [[LI]], 0 ; CHECK-NEXT: isel [[ISEL2:[0-9]+]], 0, [[ISEL]], 2 -; CHECK-NEXT: extsw 3, [[ISEL2]] ; CHECK-NEXT: blr } @@ -75,8 +72,11 @@ entry: ; CHECK-LABEL: @test4 ; CHECK: lbz [[LOAD1:[0-9]+]] ; CHECK-NEXT: lbz [[LOAD2:[0-9]+]] -; CHECK-NEXT: subf [[SUB:[0-9]+]], [[LOAD2]], [[LOAD1]] -; CHECK-NEXT: extsw 3, [[SUB]] +; CHECK-NEXT: li [[LI:[0-9]+]], 1 +; CHECK-NEXT: li [[LI2:[0-9]+]], -1 +; CHECK-NEXT: cmplw [[LOAD1]], [[LOAD2]] +; CHECK-NEXT: isel [[ISEL:[0-9]+]], [[LI2]], [[LI]], 0 +; CHECK-NEXT: isel [[ISEL2:[0-9]+]], 0, [[ISEL]], 2 ; CHECK-NEXT: 
blr } diff --git a/test/CodeGen/PowerPC/memcmpIR.ll b/test/CodeGen/PowerPC/memcmpIR.ll index 044c8d5aa33..55f48ad19a6 100644 --- a/test/CodeGen/PowerPC/memcmpIR.ll +++ b/test/CodeGen/PowerPC/memcmpIR.ll @@ -59,28 +59,20 @@ define signext i32 @test2(i32* nocapture readonly %buffer1, i32* nocapture reado ; CHECK-NEXT: [[LOAD2:%[0-9]+]] = load i32, i32* ; CHECK-NEXT: [[BSWAP1:%[0-9]+]] = call i32 @llvm.bswap.i32(i32 [[LOAD1]]) ; CHECK-NEXT: [[BSWAP2:%[0-9]+]] = call i32 @llvm.bswap.i32(i32 [[LOAD2]]) - ; CHECK-NEXT: [[ZEXT1:%[0-9]+]] = zext i32 [[BSWAP1]] to i64 - ; CHECK-NEXT: [[ZEXT2:%[0-9]+]] = zext i32 [[BSWAP2]] to i64 - ; CHECK-NEXT: [[ICMP:%[0-9]+]] = icmp eq i64 [[ZEXT1]], [[ZEXT2]] - ; CHECK-NEXT: br i1 [[ICMP]], label %endblock, label %res_block - - ; CHECK-LABEL: res_block:{{.*}} - ; CHECK: [[ICMP2:%[0-9]+]] = icmp ult i64 - ; CHECK-NEXT: [[SELECT:%[0-9]+]] = select i1 [[ICMP2]], i32 -1, i32 1 - ; CHECK-NEXT: br label %endblock + ; CHECK-NEXT: [[CMP1:%[0-9]+]] = icmp ne i32 [[BSWAP1]], [[BSWAP2]] + ; CHECK-NEXT: [[CMP2:%[0-9]+]] = icmp ult i32 [[BSWAP1]], [[BSWAP2]] + ; CHECK-NEXT: [[SELECT1:%[0-9]+]] = select i1 [[CMP2]], i32 -1, i32 1 + ; CHECK-NEXT: [[SELECT2:%[0-9]+]] = select i1 [[CMP1]], i32 [[SELECT1]], i32 0 + ; CHECK-NEXT: ret i32 [[SELECT2]] ; CHECK-BE-LABEL: @test2( ; CHECK-BE: [[LOAD1:%[0-9]+]] = load i32, i32* ; CHECK-BE-NEXT: [[LOAD2:%[0-9]+]] = load i32, i32* - ; CHECK-BE-NEXT: [[ZEXT1:%[0-9]+]] = zext i32 [[LOAD1]] to i64 - ; CHECK-BE-NEXT: [[ZEXT2:%[0-9]+]] = zext i32 [[LOAD2]] to i64 - ; CHECK-BE-NEXT: [[ICMP:%[0-9]+]] = icmp eq i64 [[ZEXT1]], [[ZEXT2]] - ; CHECK-BE-NEXT: br i1 [[ICMP]], label %endblock, label %res_block - - ; CHECK-BE-LABEL: res_block:{{.*}} - ; CHECK-BE: [[ICMP2:%[0-9]+]] = icmp ult i64 - ; CHECK-BE-NEXT: [[SELECT:%[0-9]+]] = select i1 [[ICMP2]], i32 -1, i32 1 - ; CHECK-BE-NEXT: br label %endblock + ; CHECK-BE-NEXT: [[CMP1:%[0-9]+]] = icmp ne i32 [[LOAD1]], [[LOAD2]] + ; CHECK-BE-NEXT: [[CMP2:%[0-9]+]] = icmp ult i32 
[[LOAD1]], [[LOAD2]] + ; CHECK-BE-NEXT: [[SELECT1:%[0-9]+]] = select i1 [[CMP2]], i32 -1, i32 1 + ; CHECK-BE-NEXT: [[SELECT2:%[0-9]+]] = select i1 [[CMP1]], i32 [[SELECT1]], i32 0 + ; CHECK-BE-NEXT: ret i32 [[SELECT2]] entry: %0 = bitcast i32* %buffer1 to i8* diff --git a/test/CodeGen/X86/memcmp.ll b/test/CodeGen/X86/memcmp.ll index 608f68fb479..52ef4b0dca5 100644 --- a/test/CodeGen/X86/memcmp.ll +++ b/test/CodeGen/X86/memcmp.ll @@ -12,42 +12,39 @@ declare i32 @memcmp(i8*, i8*, i64) define i32 @length2(i8* %X, i8* %Y) nounwind { ; X32-LABEL: length2: -; X32: # BB#0: # %loadbb +; X32: # BB#0: ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax ; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X32-NEXT: movzwl (%ecx), %ecx ; X32-NEXT: movzwl (%eax), %eax ; X32-NEXT: rolw $8, %cx ; X32-NEXT: rolw $8, %ax -; X32-NEXT: movzwl %cx, %ecx -; X32-NEXT: movzwl %ax, %edx -; X32-NEXT: xorl %eax, %eax -; X32-NEXT: cmpl %edx, %ecx -; X32-NEXT: je .LBB0_3 -; X32-NEXT: # BB#1: # %res_block +; X32-NEXT: cmpw %ax, %cx ; X32-NEXT: movl $-1, %eax -; X32-NEXT: jb .LBB0_3 -; X32-NEXT: # BB#2: # %res_block +; X32-NEXT: jae .LBB0_1 +; X32-NEXT: # BB#2: +; X32-NEXT: je .LBB0_3 +; X32-NEXT: .LBB0_4: +; X32-NEXT: retl +; X32-NEXT: .LBB0_1: ; X32-NEXT: movl $1, %eax -; X32-NEXT: .LBB0_3: # %endblock +; X32-NEXT: jne .LBB0_4 +; X32-NEXT: .LBB0_3: +; X32-NEXT: xorl %eax, %eax ; X32-NEXT: retl ; ; X64-LABEL: length2: -; X64: # BB#0: # %loadbb +; X64: # BB#0: ; X64-NEXT: movzwl (%rdi), %eax ; X64-NEXT: movzwl (%rsi), %ecx ; X64-NEXT: rolw $8, %ax ; X64-NEXT: rolw $8, %cx -; X64-NEXT: movzwl %ax, %edx -; X64-NEXT: movzwl %cx, %ecx -; X64-NEXT: xorl %eax, %eax -; X64-NEXT: cmpq %rcx, %rdx -; X64-NEXT: je .LBB0_2 -; X64-NEXT: # BB#1: # %res_block +; X64-NEXT: xorl %edx, %edx +; X64-NEXT: cmpw %cx, %ax ; X64-NEXT: movl $-1, %ecx ; X64-NEXT: movl $1, %eax ; X64-NEXT: cmovbl %ecx, %eax -; X64-NEXT: .LBB0_2: # %endblock +; X64-NEXT: cmovel %edx, %eax ; X64-NEXT: retq %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 2) 
nounwind ret i32 %m @@ -169,38 +166,39 @@ define i1 @length3_eq(i8* %X, i8* %Y) nounwind { define i32 @length4(i8* %X, i8* %Y) nounwind { ; X32-LABEL: length4: -; X32: # BB#0: # %loadbb +; X32: # BB#0: ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax ; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X32-NEXT: movl (%ecx), %ecx -; X32-NEXT: movl (%eax), %edx +; X32-NEXT: movl (%eax), %eax ; X32-NEXT: bswapl %ecx -; X32-NEXT: bswapl %edx -; X32-NEXT: xorl %eax, %eax -; X32-NEXT: cmpl %edx, %ecx -; X32-NEXT: je .LBB6_3 -; X32-NEXT: # BB#1: # %res_block +; X32-NEXT: bswapl %eax +; X32-NEXT: cmpl %eax, %ecx ; X32-NEXT: movl $-1, %eax -; X32-NEXT: jb .LBB6_3 -; X32-NEXT: # BB#2: # %res_block +; X32-NEXT: jae .LBB6_1 +; X32-NEXT: # BB#2: +; X32-NEXT: je .LBB6_3 +; X32-NEXT: .LBB6_4: +; X32-NEXT: retl +; X32-NEXT: .LBB6_1: ; X32-NEXT: movl $1, %eax -; X32-NEXT: .LBB6_3: # %endblock +; X32-NEXT: jne .LBB6_4 +; X32-NEXT: .LBB6_3: +; X32-NEXT: xorl %eax, %eax ; X32-NEXT: retl ; ; X64-LABEL: length4: -; X64: # BB#0: # %loadbb -; X64-NEXT: movl (%rdi), %ecx -; X64-NEXT: movl (%rsi), %edx +; X64: # BB#0: +; X64-NEXT: movl (%rdi), %eax +; X64-NEXT: movl (%rsi), %ecx +; X64-NEXT: bswapl %eax ; X64-NEXT: bswapl %ecx -; X64-NEXT: bswapl %edx -; X64-NEXT: xorl %eax, %eax -; X64-NEXT: cmpq %rdx, %rcx -; X64-NEXT: je .LBB6_2 -; X64-NEXT: # BB#1: # %res_block +; X64-NEXT: xorl %edx, %edx +; X64-NEXT: cmpl %ecx, %eax ; X64-NEXT: movl $-1, %ecx ; X64-NEXT: movl $1, %eax ; X64-NEXT: cmovbl %ecx, %eax -; X64-NEXT: .LBB6_2: # %endblock +; X64-NEXT: cmovel %edx, %eax ; X64-NEXT: retq %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 4) nounwind ret i32 %m @@ -303,19 +301,17 @@ define i32 @length8(i8* %X, i8* %Y) nounwind { ; X32-NEXT: retl ; ; X64-LABEL: length8: -; X64: # BB#0: # %loadbb -; X64-NEXT: movq (%rdi), %rcx -; X64-NEXT: movq (%rsi), %rdx +; X64: # BB#0: +; X64-NEXT: movq (%rdi), %rax +; X64-NEXT: movq (%rsi), %rcx +; X64-NEXT: bswapq %rax ; X64-NEXT: bswapq %rcx -; X64-NEXT: bswapq %rdx -; X64-NEXT: 
xorl %eax, %eax -; X64-NEXT: cmpq %rdx, %rcx -; X64-NEXT: je .LBB11_2 -; X64-NEXT: # BB#1: # %res_block +; X64-NEXT: xorl %edx, %edx +; X64-NEXT: cmpq %rcx, %rax ; X64-NEXT: movl $-1, %ecx ; X64-NEXT: movl $1, %eax ; X64-NEXT: cmovbl %ecx, %eax -; X64-NEXT: .LBB11_2: # %endblock +; X64-NEXT: cmovel %edx, %eax ; X64-NEXT: retq %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 8) nounwind ret i32 %m diff --git a/test/Transforms/CodeGenPrepare/X86/memcmp.ll b/test/Transforms/CodeGenPrepare/X86/memcmp.ll index b0335ee3450..2435cd7d0a8 100644 --- a/test/Transforms/CodeGenPrepare/X86/memcmp.ll +++ b/test/Transforms/CodeGenPrepare/X86/memcmp.ll @@ -4,45 +4,18 @@ declare i32 @memcmp(i8* nocapture, i8* nocapture, i64) define i32 @cmp2(i8* nocapture readonly %x, i8* nocapture readonly %y) { -; X32-LABEL: @cmp2( -; X32-NEXT: loadbb: -; X32-NEXT: [[TMP0:%.*]] = bitcast i8* %x to i16* -; X32-NEXT: [[TMP1:%.*]] = bitcast i8* %y to i16* -; X32-NEXT: [[TMP2:%.*]] = load i16, i16* [[TMP0]] -; X32-NEXT: [[TMP3:%.*]] = load i16, i16* [[TMP1]] -; X32-NEXT: [[TMP4:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP2]]) -; X32-NEXT: [[TMP5:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP3]]) -; X32-NEXT: [[TMP6:%.*]] = zext i16 [[TMP4]] to i32 -; X32-NEXT: [[TMP7:%.*]] = zext i16 [[TMP5]] to i32 -; X32-NEXT: [[TMP8:%.*]] = icmp eq i32 [[TMP6]], [[TMP7]] -; X32-NEXT: br i1 [[TMP8]], label %endblock, label %res_block -; X32: res_block: -; X32-NEXT: [[TMP9:%.*]] = icmp ult i32 [[TMP6]], [[TMP7]] -; X32-NEXT: [[TMP10:%.*]] = select i1 [[TMP9]], i32 -1, i32 1 -; X32-NEXT: br label %endblock -; X32: endblock: -; X32-NEXT: [[PHI_RES:%.*]] = phi i32 [ 0, %loadbb ], [ [[TMP10]], %res_block ] -; X32-NEXT: ret i32 [[PHI_RES]] -; -; X64-LABEL: @cmp2( -; X64-NEXT: loadbb: -; X64-NEXT: [[TMP0:%.*]] = bitcast i8* %x to i16* -; X64-NEXT: [[TMP1:%.*]] = bitcast i8* %y to i16* -; X64-NEXT: [[TMP2:%.*]] = load i16, i16* [[TMP0]] -; X64-NEXT: [[TMP3:%.*]] = load i16, i16* [[TMP1]] -; X64-NEXT: [[TMP4:%.*]] = call i16 
@llvm.bswap.i16(i16 [[TMP2]]) -; X64-NEXT: [[TMP5:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP3]]) -; X64-NEXT: [[TMP6:%.*]] = zext i16 [[TMP4]] to i64 -; X64-NEXT: [[TMP7:%.*]] = zext i16 [[TMP5]] to i64 -; X64-NEXT: [[TMP8:%.*]] = icmp eq i64 [[TMP6]], [[TMP7]] -; X64-NEXT: br i1 [[TMP8]], label %endblock, label %res_block -; X64: res_block: -; X64-NEXT: [[TMP9:%.*]] = icmp ult i64 [[TMP6]], [[TMP7]] -; X64-NEXT: [[TMP10:%.*]] = select i1 [[TMP9]], i32 -1, i32 1 -; X64-NEXT: br label %endblock -; X64: endblock: -; X64-NEXT: [[PHI_RES:%.*]] = phi i32 [ 0, %loadbb ], [ [[TMP10]], %res_block ] -; X64-NEXT: ret i32 [[PHI_RES]] +; ALL-LABEL: @cmp2( +; ALL-NEXT: [[TMP1:%.*]] = bitcast i8* %x to i16* +; ALL-NEXT: [[TMP2:%.*]] = bitcast i8* %y to i16* +; ALL-NEXT: [[TMP3:%.*]] = load i16, i16* [[TMP1]] +; ALL-NEXT: [[TMP4:%.*]] = load i16, i16* [[TMP2]] +; ALL-NEXT: [[TMP5:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP3]]) +; ALL-NEXT: [[TMP6:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP4]]) +; ALL-NEXT: [[TMP7:%.*]] = icmp ne i16 [[TMP5]], [[TMP6]] +; ALL-NEXT: [[TMP8:%.*]] = icmp ult i16 [[TMP5]], [[TMP6]] +; ALL-NEXT: [[TMP9:%.*]] = select i1 [[TMP8]], i32 -1, i32 1 +; ALL-NEXT: [[TMP10:%.*]] = select i1 [[TMP7]], i32 [[TMP9]], i32 0 +; ALL-NEXT: ret i32 [[TMP10]] ; %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 2) ret i32 %call @@ -58,43 +31,18 @@ define i32 @cmp3(i8* nocapture readonly %x, i8* nocapture readonly %y) { } define i32 @cmp4(i8* nocapture readonly %x, i8* nocapture readonly %y) { -; X32-LABEL: @cmp4( -; X32-NEXT: loadbb: -; X32-NEXT: [[TMP0:%.*]] = bitcast i8* %x to i32* -; X32-NEXT: [[TMP1:%.*]] = bitcast i8* %y to i32* -; X32-NEXT: [[TMP2:%.*]] = load i32, i32* [[TMP0]] -; X32-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP1]] -; X32-NEXT: [[TMP4:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP2]]) -; X32-NEXT: [[TMP5:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP3]]) -; X32-NEXT: [[TMP6:%.*]] = icmp eq i32 [[TMP4]], [[TMP5]] -; X32-NEXT: br i1 [[TMP6]], label %endblock, 
label %res_block -; X32: res_block: -; X32-NEXT: [[TMP7:%.*]] = icmp ult i32 [[TMP4]], [[TMP5]] -; X32-NEXT: [[TMP8:%.*]] = select i1 [[TMP7]], i32 -1, i32 1 -; X32-NEXT: br label %endblock -; X32: endblock: -; X32-NEXT: [[PHI_RES:%.*]] = phi i32 [ 0, %loadbb ], [ [[TMP8]], %res_block ] -; X32-NEXT: ret i32 [[PHI_RES]] -; -; X64-LABEL: @cmp4( -; X64-NEXT: loadbb: -; X64-NEXT: [[TMP0:%.*]] = bitcast i8* %x to i32* -; X64-NEXT: [[TMP1:%.*]] = bitcast i8* %y to i32* -; X64-NEXT: [[TMP2:%.*]] = load i32, i32* [[TMP0]] -; X64-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP1]] -; X64-NEXT: [[TMP4:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP2]]) -; X64-NEXT: [[TMP5:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP3]]) -; X64-NEXT: [[TMP6:%.*]] = zext i32 [[TMP4]] to i64 -; X64-NEXT: [[TMP7:%.*]] = zext i32 [[TMP5]] to i64 -; X64-NEXT: [[TMP8:%.*]] = icmp eq i64 [[TMP6]], [[TMP7]] -; X64-NEXT: br i1 [[TMP8]], label %endblock, label %res_block -; X64: res_block: -; X64-NEXT: [[TMP9:%.*]] = icmp ult i64 [[TMP6]], [[TMP7]] -; X64-NEXT: [[TMP10:%.*]] = select i1 [[TMP9]], i32 -1, i32 1 -; X64-NEXT: br label %endblock -; X64: endblock: -; X64-NEXT: [[PHI_RES:%.*]] = phi i32 [ 0, %loadbb ], [ [[TMP10]], %res_block ] -; X64-NEXT: ret i32 [[PHI_RES]] +; ALL-LABEL: @cmp4( +; ALL-NEXT: [[TMP1:%.*]] = bitcast i8* %x to i32* +; ALL-NEXT: [[TMP2:%.*]] = bitcast i8* %y to i32* +; ALL-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP1]] +; ALL-NEXT: [[TMP4:%.*]] = load i32, i32* [[TMP2]] +; ALL-NEXT: [[TMP5:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP3]]) +; ALL-NEXT: [[TMP6:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP4]]) +; ALL-NEXT: [[TMP7:%.*]] = icmp ne i32 [[TMP5]], [[TMP6]] +; ALL-NEXT: [[TMP8:%.*]] = icmp ult i32 [[TMP5]], [[TMP6]] +; ALL-NEXT: [[TMP9:%.*]] = select i1 [[TMP8]], i32 -1, i32 1 +; ALL-NEXT: [[TMP10:%.*]] = select i1 [[TMP7]], i32 [[TMP9]], i32 0 +; ALL-NEXT: ret i32 [[TMP10]] ; %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 4) ret i32 %call @@ -133,22 +81,17 @@ define i32 @cmp8(i8* 
nocapture readonly %x, i8* nocapture readonly %y) { ; X32-NEXT: ret i32 [[CALL]] ; ; X64-LABEL: @cmp8( -; X64-NEXT: loadbb: -; X64-NEXT: [[TMP0:%.*]] = bitcast i8* %x to i64* -; X64-NEXT: [[TMP1:%.*]] = bitcast i8* %y to i64* -; X64-NEXT: [[TMP2:%.*]] = load i64, i64* [[TMP0]] +; X64-NEXT: [[TMP1:%.*]] = bitcast i8* %x to i64* +; X64-NEXT: [[TMP2:%.*]] = bitcast i8* %y to i64* ; X64-NEXT: [[TMP3:%.*]] = load i64, i64* [[TMP1]] -; X64-NEXT: [[TMP4:%.*]] = call i64 @llvm.bswap.i64(i64 [[TMP2]]) +; X64-NEXT: [[TMP4:%.*]] = load i64, i64* [[TMP2]] ; X64-NEXT: [[TMP5:%.*]] = call i64 @llvm.bswap.i64(i64 [[TMP3]]) -; X64-NEXT: [[TMP6:%.*]] = icmp eq i64 [[TMP4]], [[TMP5]] -; X64-NEXT: br i1 [[TMP6]], label %endblock, label %res_block -; X64: res_block: -; X64-NEXT: [[TMP7:%.*]] = icmp ult i64 [[TMP4]], [[TMP5]] -; X64-NEXT: [[TMP8:%.*]] = select i1 [[TMP7]], i32 -1, i32 1 -; X64-NEXT: br label %endblock -; X64: endblock: -; X64-NEXT: [[PHI_RES:%.*]] = phi i32 [ 0, %loadbb ], [ [[TMP8]], %res_block ] -; X64-NEXT: ret i32 [[PHI_RES]] +; X64-NEXT: [[TMP6:%.*]] = call i64 @llvm.bswap.i64(i64 [[TMP4]]) +; X64-NEXT: [[TMP7:%.*]] = icmp ne i64 [[TMP5]], [[TMP6]] +; X64-NEXT: [[TMP8:%.*]] = icmp ult i64 [[TMP5]], [[TMP6]] +; X64-NEXT: [[TMP9:%.*]] = select i1 [[TMP8]], i32 -1, i32 1 +; X64-NEXT: [[TMP10:%.*]] = select i1 [[TMP7]], i32 [[TMP9]], i32 0 +; X64-NEXT: ret i32 [[TMP10]] ; %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 8) ret i32 %call