From: Sanjay Patel
Date: Tue, 20 Jun 2017 15:58:30 +0000 (+0000)
Subject: [x86] enable CGP memcmp() expansion for 2/4/8 byte sizes
X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=aab686b3f7f5ac6d7e5dd815ce7dd869615a38e1;p=llvm

[x86] enable CGP memcmp() expansion for 2/4/8 byte sizes

There are a couple of potential improvements as seen in the IR and asm:
1. We're unnecessarily extending to a larger type to compare values.
2. The codegen for (select cond, -1, 1) could avoid a cmov.
(or we could change the order of the compares, so we have a select with a 0 operand)

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@305802 91177308-0d34-0410-b5e6-96231b3b80d8
---

diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index eacf2e55143..8dfaf3f080e 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -1662,6 +1662,12 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
   MaxStoresPerMemcpyOptSize = 4;
   MaxStoresPerMemmove = 8; // For @llvm.memmove -> sequence of stores
   MaxStoresPerMemmoveOptSize = 4;
+
+  // TODO: These control memcmp expansion in CGP and are set low to prevent
+  // altering the vector expansion for 16/32 byte memcmp in SelectionDAGBuilder.
+  MaxLoadsPerMemcmp = 1;
+  MaxLoadsPerMemcmpOptSize = 1;
+
   // Set loop alignment to 2^ExperimentalPrefLoopAlignment bytes (default: 2^4).
   setPrefLoopAlignment(ExperimentalPrefLoopAlignment);

diff --git a/lib/Target/X86/X86TargetTransformInfo.cpp b/lib/Target/X86/X86TargetTransformInfo.cpp
index 1d58cccc308..f13933e9288 100644
--- a/lib/Target/X86/X86TargetTransformInfo.cpp
+++ b/lib/Target/X86/X86TargetTransformInfo.cpp
@@ -2232,6 +2232,12 @@ bool X86TTIImpl::areInlineCompatible(const Function *Caller,
   return (CallerBits & CalleeBits) == CalleeBits;
 }

+bool X86TTIImpl::expandMemCmp(Instruction *I, unsigned &MaxLoadSize) {
+  // TODO: We can increase these based on available vector ops.
+  MaxLoadSize = ST->is64Bit() ? 8 : 4;
+  return true;
+}
+
 bool X86TTIImpl::enableInterleavedAccessVectorization() {
   // TODO: We expect this to be beneficial regardless of arch,
   // but there are currently some unexplained performance artifacts on Atom.
diff --git a/lib/Target/X86/X86TargetTransformInfo.h b/lib/Target/X86/X86TargetTransformInfo.h
index b907b7556a1..375fb924c2c 100644
--- a/lib/Target/X86/X86TargetTransformInfo.h
+++ b/lib/Target/X86/X86TargetTransformInfo.h
@@ -107,7 +107,7 @@ public:
   bool isLegalMaskedScatter(Type *DataType);
   bool areInlineCompatible(const Function *Caller,
                            const Function *Callee) const;
-
+  bool expandMemCmp(Instruction *I, unsigned &MaxLoadSize);
   bool enableInterleavedAccessVectorization();
 private:
   int getGSScalarCost(unsigned Opcode, Type *DataTy, bool VariableMask,
diff --git a/test/CodeGen/X86/memcmp.ll b/test/CodeGen/X86/memcmp.ll
index 4e2475b1c67..9d26aee2e8b 100644
--- a/test/CodeGen/X86/memcmp.ll
+++ b/test/CodeGen/X86/memcmp.ll
@@ -12,19 +12,46 @@ declare i32 @memcmp(i8*, i8*, i64)

 define i32 @length2(i8* %X, i8* %Y) nounwind {
 ; X32-LABEL: length2:
-; X32: # BB#0:
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl $2
-; X32-NEXT: pushl {{[0-9]+}}(%esp)
-; X32-NEXT: pushl {{[0-9]+}}(%esp)
-; X32-NEXT: calll memcmp
-; X32-NEXT: addl $16, %esp
+; X32: # BB#0: # %loadbb
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT: movzwl (%ecx), %ecx
+; X32-NEXT: movzwl (%eax), %eax
+; X32-NEXT: rolw $8, %cx
+; X32-NEXT: rolw $8, %ax
+; X32-NEXT: movzwl %cx, %ecx
+; X32-NEXT: movzwl %ax, %eax
+; X32-NEXT: cmpl %eax, %ecx
+; X32-NEXT: je .LBB0_1
+; X32-NEXT: # BB#2: # %res_block
+; X32-NEXT: movl $-1, %eax
+; X32-NEXT: jb .LBB0_4
+; X32-NEXT: # BB#3: # %res_block
+; X32-NEXT: movl $1, %eax
+; X32-NEXT: .LBB0_4: # %endblock
+; X32-NEXT: retl
+; X32-NEXT: .LBB0_1:
+; X32-NEXT: xorl %eax, %eax
 ; X32-NEXT: retl
 ;
 ; X64-LABEL: length2:
-; X64: # BB#0:
-; X64-NEXT: movl $2, %edx
-; X64-NEXT: jmp memcmp # TAILCALL
+; X64: # BB#0: # %loadbb
+; X64-NEXT: movzwl (%rdi), %eax
+; X64-NEXT: movzwl (%rsi), %ecx
+; X64-NEXT: rolw $8, %ax
+; X64-NEXT: rolw $8, %cx
+; X64-NEXT: movzwl %ax, %eax
+; X64-NEXT: movzwl %cx, %ecx
+; X64-NEXT: cmpq %rcx, %rax
+; X64-NEXT: je .LBB0_1
+; X64-NEXT: # BB#2: # %res_block
+; X64-NEXT: movl $-1, %ecx
+; X64-NEXT: movl $1, %eax
+; X64-NEXT: cmovbl %ecx, %eax
+; X64-NEXT: retq
+; X64-NEXT: .LBB0_1:
+; X64-NEXT: xorl %eax, %eax
+; X64-NEXT: retq
   %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 2) nounwind
   ret i32 %m
 }
@@ -145,19 +172,42 @@ define i1 @length3_eq(i8* %X, i8* %Y) nounwind {

 define i32 @length4(i8* %X, i8* %Y) nounwind {
 ; X32-LABEL: length4:
-; X32: # BB#0:
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl $4
-; X32-NEXT: pushl {{[0-9]+}}(%esp)
-; X32-NEXT: pushl {{[0-9]+}}(%esp)
-; X32-NEXT: calll memcmp
-; X32-NEXT: addl $16, %esp
+; X32: # BB#0: # %loadbb
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT: movl (%ecx), %ecx
+; X32-NEXT: movl (%eax), %eax
+; X32-NEXT: bswapl %ecx
+; X32-NEXT: bswapl %eax
+; X32-NEXT: cmpl %eax, %ecx
+; X32-NEXT: je .LBB6_1
+; X32-NEXT: # BB#2: # %res_block
+; X32-NEXT: movl $-1, %eax
+; X32-NEXT: jb .LBB6_4
+; X32-NEXT: # BB#3: # %res_block
+; X32-NEXT: movl $1, %eax
+; X32-NEXT: .LBB6_4: # %endblock
+; X32-NEXT: retl
+; X32-NEXT: .LBB6_1:
+; X32-NEXT: xorl %eax, %eax
 ; X32-NEXT: retl
 ;
 ; X64-LABEL: length4:
-; X64: # BB#0:
-; X64-NEXT: movl $4, %edx
-; X64-NEXT: jmp memcmp # TAILCALL
+; X64: # BB#0: # %loadbb
+; X64-NEXT: movl (%rdi), %eax
+; X64-NEXT: movl (%rsi), %ecx
+; X64-NEXT: bswapl %eax
+; X64-NEXT: bswapl %ecx
+; X64-NEXT: cmpq %rcx, %rax
+; X64-NEXT: je .LBB6_1
+; X64-NEXT: # BB#2: # %res_block
+; X64-NEXT: movl $-1, %ecx
+; X64-NEXT: movl $1, %eax
+; X64-NEXT: cmovbl %ecx, %eax
+; X64-NEXT: retq
+; X64-NEXT: .LBB6_1:
+; X64-NEXT: xorl %eax, %eax
+; X64-NEXT: retq
   %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 4) nounwind
   ret i32 %m
 }
@@ -259,9 +309,21 @@ define i32 @length8(i8* %X, i8* %Y) nounwind {
 ; X32-NEXT: retl
 ;
 ; X64-LABEL: length8:
-; X64: # BB#0:
-; X64-NEXT: movl $8, %edx
-; X64-NEXT: jmp memcmp # TAILCALL
+; X64: # BB#0: # %loadbb
+; X64-NEXT: movq (%rdi), %rax
+; X64-NEXT: movq (%rsi), %rcx
+; X64-NEXT: bswapq %rax
+; X64-NEXT: bswapq %rcx
+; X64-NEXT: cmpq %rcx, %rax
+; X64-NEXT: je .LBB11_1
+; X64-NEXT: # BB#2: # %res_block
+; X64-NEXT: movl $-1, %ecx
+; X64-NEXT: movl $1, %eax
+; X64-NEXT: cmovbl %ecx, %eax
+; X64-NEXT: retq
+; X64-NEXT: .LBB11_1:
+; X64-NEXT: xorl %eax, %eax
+; X64-NEXT: retq
   %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 8) nounwind
   ret i32 %m
 }
diff --git a/test/Transforms/CodeGenPrepare/X86/memcmp.ll b/test/Transforms/CodeGenPrepare/X86/memcmp.ll
index 328e8cc2907..337889b34d6 100644
--- a/test/Transforms/CodeGenPrepare/X86/memcmp.ll
+++ b/test/Transforms/CodeGenPrepare/X86/memcmp.ll
@@ -6,9 +6,47 @@ target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
 declare i32 @memcmp(i8* nocapture, i8* nocapture, i64)

 define i32 @cmp2(i8* nocapture readonly %x, i8* nocapture readonly %y) {
-; ALL-LABEL: @cmp2(
-; ALL-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* %x, i8* %y, i64 2)
-; ALL-NEXT: ret i32 [[CALL]]
+; X32-LABEL: @cmp2(
+; X32-NEXT: loadbb:
+; X32-NEXT: [[TMP0:%.*]] = bitcast i8* %x to i16*
+; X32-NEXT: [[TMP1:%.*]] = bitcast i8* %y to i16*
+; X32-NEXT: [[TMP2:%.*]] = load i16, i16* [[TMP0]]
+; X32-NEXT: [[TMP3:%.*]] = load i16, i16* [[TMP1]]
+; X32-NEXT: [[TMP4:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP2]])
+; X32-NEXT: [[TMP5:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP3]])
+; X32-NEXT: [[TMP6:%.*]] = zext i16 [[TMP4]] to i32
+; X32-NEXT: [[TMP7:%.*]] = zext i16 [[TMP5]] to i32
+; X32-NEXT: [[TMP8:%.*]] = sub i32 [[TMP6]], [[TMP7]]
+; X32-NEXT: [[TMP9:%.*]] = icmp ne i32 [[TMP8]], 0
+; X32-NEXT: br i1 [[TMP9]], label %res_block, label %endblock
+; X32: res_block:
+; X32-NEXT: [[TMP10:%.*]] = icmp ult i32 [[TMP6]], [[TMP7]]
+; X32-NEXT: [[TMP11:%.*]] = select i1 [[TMP10]], i32 -1, i32 1
+; X32-NEXT: br label %endblock
+; X32: endblock:
+; X32-NEXT: [[PHI_RES:%.*]] = phi i32 [ 0, %loadbb ], [ [[TMP11]], %res_block ]
+; X32-NEXT: ret i32 [[PHI_RES]]
+;
+; X64-LABEL: @cmp2(
+; X64-NEXT: loadbb:
+; X64-NEXT: [[TMP0:%.*]] = bitcast i8* %x to i16*
+; X64-NEXT: [[TMP1:%.*]] = bitcast i8* %y to i16*
+; X64-NEXT: [[TMP2:%.*]] = load i16, i16* [[TMP0]]
+; X64-NEXT: [[TMP3:%.*]] = load i16, i16* [[TMP1]]
+; X64-NEXT: [[TMP4:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP2]])
+; X64-NEXT: [[TMP5:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP3]])
+; X64-NEXT: [[TMP6:%.*]] = zext i16 [[TMP4]] to i64
+; X64-NEXT: [[TMP7:%.*]] = zext i16 [[TMP5]] to i64
+; X64-NEXT: [[TMP8:%.*]] = sub i64 [[TMP6]], [[TMP7]]
+; X64-NEXT: [[TMP9:%.*]] = icmp ne i64 [[TMP8]], 0
+; X64-NEXT: br i1 [[TMP9]], label %res_block, label %endblock
+; X64: res_block:
+; X64-NEXT: [[TMP10:%.*]] = icmp ult i64 [[TMP6]], [[TMP7]]
+; X64-NEXT: [[TMP11:%.*]] = select i1 [[TMP10]], i32 -1, i32 1
+; X64-NEXT: br label %endblock
+; X64: endblock:
+; X64-NEXT: [[PHI_RES:%.*]] = phi i32 [ 0, %loadbb ], [ [[TMP11]], %res_block ]
+; X64-NEXT: ret i32 [[PHI_RES]]
 ;
   %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 2)
   ret i32 %call
@@ -24,9 +62,45 @@ define i32 @cmp3(i8* nocapture readonly %x, i8* nocapture readonly %y) {
 }

 define i32 @cmp4(i8* nocapture readonly %x, i8* nocapture readonly %y) {
-; ALL-LABEL: @cmp4(
-; ALL-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* %x, i8* %y, i64 4)
-; ALL-NEXT: ret i32 [[CALL]]
+; X32-LABEL: @cmp4(
+; X32-NEXT: loadbb:
+; X32-NEXT: [[TMP0:%.*]] = bitcast i8* %x to i32*
+; X32-NEXT: [[TMP1:%.*]] = bitcast i8* %y to i32*
+; X32-NEXT: [[TMP2:%.*]] = load i32, i32* [[TMP0]]
+; X32-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP1]]
+; X32-NEXT: [[TMP4:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP2]])
+; X32-NEXT: [[TMP5:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP3]])
+; X32-NEXT: [[TMP6:%.*]] = sub i32 [[TMP4]], [[TMP5]]
+; X32-NEXT: [[TMP7:%.*]] = icmp ne i32 [[TMP6]], 0
+; X32-NEXT: br i1 [[TMP7]], label %res_block, label %endblock
+; X32: res_block:
+; X32-NEXT: [[TMP8:%.*]] = icmp ult i32 [[TMP4]], [[TMP5]]
+; X32-NEXT: [[TMP9:%.*]] = select i1 [[TMP8]], i32 -1, i32 1
+; X32-NEXT: br label %endblock
+; X32: endblock:
+; X32-NEXT: [[PHI_RES:%.*]] = phi i32 [ 0, %loadbb ], [ [[TMP9]], %res_block ]
+; X32-NEXT: ret i32 [[PHI_RES]]
+;
+; X64-LABEL: @cmp4(
+; X64-NEXT: loadbb:
+; X64-NEXT: [[TMP0:%.*]] = bitcast i8* %x to i32*
+; X64-NEXT: [[TMP1:%.*]] = bitcast i8* %y to i32*
+; X64-NEXT: [[TMP2:%.*]] = load i32, i32* [[TMP0]]
+; X64-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP1]]
+; X64-NEXT: [[TMP4:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP2]])
+; X64-NEXT: [[TMP5:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP3]])
+; X64-NEXT: [[TMP6:%.*]] = zext i32 [[TMP4]] to i64
+; X64-NEXT: [[TMP7:%.*]] = zext i32 [[TMP5]] to i64
+; X64-NEXT: [[TMP8:%.*]] = sub i64 [[TMP6]], [[TMP7]]
+; X64-NEXT: [[TMP9:%.*]] = icmp ne i64 [[TMP8]], 0
+; X64-NEXT: br i1 [[TMP9]], label %res_block, label %endblock
+; X64: res_block:
+; X64-NEXT: [[TMP10:%.*]] = icmp ult i64 [[TMP6]], [[TMP7]]
+; X64-NEXT: [[TMP11:%.*]] = select i1 [[TMP10]], i32 -1, i32 1
+; X64-NEXT: br label %endblock
+; X64: endblock:
+; X64-NEXT: [[PHI_RES:%.*]] = phi i32 [ 0, %loadbb ], [ [[TMP11]], %res_block ]
+; X64-NEXT: ret i32 [[PHI_RES]]
 ;
   %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 4)
   ret i32 %call
@@ -60,9 +134,28 @@ define i32 @cmp7(i8* nocapture readonly %x, i8* nocapture readonly %y) {
 }

 define i32 @cmp8(i8* nocapture readonly %x, i8* nocapture readonly %y) {
-; ALL-LABEL: @cmp8(
-; ALL-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* %x, i8* %y, i64 8)
-; ALL-NEXT: ret i32 [[CALL]]
+; X32-LABEL: @cmp8(
+; X32-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* %x, i8* %y, i64 8)
+; X32-NEXT: ret i32 [[CALL]]
+;
+; X64-LABEL: @cmp8(
+; X64-NEXT: loadbb:
+; X64-NEXT: [[TMP0:%.*]] = bitcast i8* %x to i64*
+; X64-NEXT: [[TMP1:%.*]] = bitcast i8* %y to i64*
+; X64-NEXT: [[TMP2:%.*]] = load i64, i64* [[TMP0]]
+; X64-NEXT: [[TMP3:%.*]] = load i64, i64* [[TMP1]]
+; X64-NEXT: [[TMP4:%.*]] = call i64 @llvm.bswap.i64(i64 [[TMP2]])
+; X64-NEXT: [[TMP5:%.*]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-NEXT: [[TMP6:%.*]] = sub i64 [[TMP4]], [[TMP5]]
+; X64-NEXT: [[TMP7:%.*]] = icmp ne i64 [[TMP6]], 0
+; X64-NEXT: br i1 [[TMP7]], label %res_block, label %endblock
+; X64: res_block:
+; X64-NEXT: [[TMP8:%.*]] = icmp ult i64 [[TMP4]], [[TMP5]]
+; X64-NEXT: [[TMP9:%.*]] = select i1 [[TMP8]], i32 -1, i32 1
+; X64-NEXT: br label %endblock
+; X64: endblock:
+; X64-NEXT: [[PHI_RES:%.*]] = phi i32 [ 0, %loadbb ], [ [[TMP9]], %res_block ]
+; X64-NEXT: ret i32 [[PHI_RES]]
 ;
   %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 8)
   ret i32 %call
@@ -142,8 +235,13 @@ define i32 @cmp16(i8* nocapture readonly %x, i8* nocapture readonly %y) {

 define i32 @cmp_eq2(i8* nocapture readonly %x, i8* nocapture readonly %y) {
 ; ALL-LABEL: @cmp_eq2(
-; ALL-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* %x, i8* %y, i64 2)
-; ALL-NEXT: [[CMP:%.*]] = icmp eq i32 [[CALL]], 0
+; ALL-NEXT: [[TMP1:%.*]] = bitcast i8* %x to i16*
+; ALL-NEXT: [[TMP2:%.*]] = bitcast i8* %y to i16*
+; ALL-NEXT: [[TMP3:%.*]] = load i16, i16* [[TMP1]]
+; ALL-NEXT: [[TMP4:%.*]] = load i16, i16* [[TMP2]]
+; ALL-NEXT: [[TMP5:%.*]] = icmp ne i16 [[TMP3]], [[TMP4]]
+; ALL-NEXT: [[TMP6:%.*]] = zext i1 [[TMP5]] to i32
+; ALL-NEXT: [[CMP:%.*]] = icmp eq i32 [[TMP6]], 0
 ; ALL-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32
 ; ALL-NEXT: ret i32 [[CONV]]
 ;
@@ -168,8 +266,13 @@ define i32 @cmp_eq3(i8* nocapture readonly %x, i8* nocapture readonly %y) {

 define i32 @cmp_eq4(i8* nocapture readonly %x, i8* nocapture readonly %y) {
 ; ALL-LABEL: @cmp_eq4(
-; ALL-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* %x, i8* %y, i64 4)
-; ALL-NEXT: [[CMP:%.*]] = icmp eq i32 [[CALL]], 0
+; ALL-NEXT: [[TMP1:%.*]] = bitcast i8* %x to i32*
+; ALL-NEXT: [[TMP2:%.*]] = bitcast i8* %y to i32*
+; ALL-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP1]]
+; ALL-NEXT: [[TMP4:%.*]] = load i32, i32* [[TMP2]]
+; ALL-NEXT: [[TMP5:%.*]] = icmp ne i32 [[TMP3]], [[TMP4]]
+; ALL-NEXT: [[TMP6:%.*]] = zext i1 [[TMP5]] to i32
+; ALL-NEXT: [[CMP:%.*]] = icmp eq i32 [[TMP6]], 0
 ; ALL-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32
 ; ALL-NEXT: ret i32 [[CONV]]
 ;
@@ -219,11 +322,22 @@ define i32 @cmp_eq7(i8* nocapture readonly %x, i8* nocapture readonly %y) {
 }

 define i32 @cmp_eq8(i8* nocapture readonly %x, i8* nocapture readonly %y) {
-; ALL-LABEL: @cmp_eq8(
-; ALL-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* %x, i8* %y, i64 8)
-; ALL-NEXT: [[CMP:%.*]] = icmp eq i32 [[CALL]], 0
-; ALL-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32
-; ALL-NEXT: ret i32 [[CONV]]
+; X32-LABEL: @cmp_eq8(
+; X32-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* %x, i8* %y, i64 8)
+; X32-NEXT: [[CMP:%.*]] = icmp eq i32 [[CALL]], 0
+; X32-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32
+; X32-NEXT: ret i32 [[CONV]]
+;
+; X64-LABEL: @cmp_eq8(
+; X64-NEXT: [[TMP1:%.*]] = bitcast i8* %x to i64*
+; X64-NEXT: [[TMP2:%.*]] = bitcast i8* %y to i64*
+; X64-NEXT: [[TMP3:%.*]] = load i64, i64* [[TMP1]]
+; X64-NEXT: [[TMP4:%.*]] = load i64, i64* [[TMP2]]
+; X64-NEXT: [[TMP5:%.*]] = icmp ne i64 [[TMP3]], [[TMP4]]
+; X64-NEXT: [[TMP6:%.*]] = zext i1 [[TMP5]] to i32
+; X64-NEXT: [[CMP:%.*]] = icmp eq i32 [[TMP6]], 0
+; X64-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32
+; X64-NEXT: ret i32 [[CONV]]
 ;
   %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 8)
   %cmp = icmp eq i32 %call, 0
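
For readers skimming the generated checks above, here is a rough C-level model of what the enabled expansion does for a 4-byte memcmp on little-endian x86. This is only an illustrative sketch, not code from the patch; the helper name memcmp4 and the use of the GCC/Clang __builtin_bswap32 builtin are assumptions made for the example.

// Illustrative sketch (assumed names, not part of the patch): the scalar
// equivalent of the expansion CGP now emits for a 4-byte memcmp.
#include <cstdint>
#include <cstring>

static int memcmp4(const void *X, const void *Y) {
  uint32_t A, B;
  std::memcpy(&A, X, sizeof(A));   // one 32-bit load per operand (the "loadbb")
  std::memcpy(&B, Y, sizeof(B));
  A = __builtin_bswap32(A);        // byte-swap so an unsigned integer compare
  B = __builtin_bswap32(B);        // matches memcmp's lexicographic byte order
  if (A == B)
    return 0;                      // the "endblock" phi of 0
  return A < B ? -1 : 1;           // the (select cond, -1, 1) in "res_block"
}

When only equality is needed, as in the cmp_eq2/cmp_eq4/cmp_eq8 tests, the byte swaps are unnecessary and the expansion reduces to a plain load/load/icmp-ne sequence, which is exactly what the ALL-prefixed checks show.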