From e029500a635fecfb7cb0e99e89a041bf170ad091 Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Tue, 18 Jul 2017 15:55:30 +0000 Subject: [PATCH] [x86, CGP] increase memcmp() expansion up to 4 load pairs It should be a win to avoid going out to the system lib for all small memcmp() calls using scalar ops. For x86 32-bit, this means most everything up to 16 bytes. For 64-bit, that doubles because we can do 8-byte loads. Notes: Reduced from 4 to 2 loads for -Os behavior, which might not be optimal in all cases. It's effectively a question of how much do we trust the system implementation. Linux and macOS (and Windows I assume, but did not test) have optimized memcmp() code for x86, so it's probably not bad either way? PPC is using 8/4 for defaults on these. We do not expand at all for -Oz. There are still potential improvements to make for the CGP expansion IR and/or lowering such as avoiding select-of-constants (D34904) and not doing zexts to the max load type before doing a compare. We have special-case SSE/AVX codegen for (memcmp(x, y, 16/32) == 0) that will no longer be produced after this patch. I've shown the experimental justification for that change in PR33329: https://bugs.llvm.org/show_bug.cgi?id=33329#c12 TLDR: While the vector code is a likely winner, we can't guarantee that it's a winner in all cases on all CPUs, so I'm willing to sacrifice it for the greater good of expanding all small memcmp(). If we want to resurrect that codegen, it can be done by adjusting the CGP params or poking a hole to let those fall-through the CGP expansion. Committed on behalf of Sanjay Patel Differential Revision: https://reviews.llvm.org/D35067 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@308322 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Target/X86/X86ISelLowering.cpp | 9 +- test/CodeGen/X86/memcmp-optsize.ll | 395 +++-- test/CodeGen/X86/memcmp.ll | 719 +++++--- test/Transforms/CodeGenPrepare/X86/memcmp.ll | 1635 +++++++++++++++++- 4 files changed, 2336 insertions(+), 422 deletions(-) diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp index 9ecca53bbf4..44eecd66471 100644 --- a/lib/Target/X86/X86ISelLowering.cpp +++ b/lib/Target/X86/X86ISelLowering.cpp @@ -1670,10 +1670,11 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, MaxStoresPerMemmove = 8; // For @llvm.memmove -> sequence of stores MaxStoresPerMemmoveOptSize = 4; - // TODO: These control memcmp expansion in CGP and are set low to prevent - // altering the vector expansion for 16/32 byte memcmp in SelectionDAGBuilder. - MaxLoadsPerMemcmp = 1; - MaxLoadsPerMemcmpOptSize = 1; + // TODO: These control memcmp expansion in CGP and could be raised higher, but + // that needs to benchmarked and balanced with the potential use of vector + // load/store types (PR33329). + MaxLoadsPerMemcmp = 4; + MaxLoadsPerMemcmpOptSize = 2; // Set loop alignment to 2^ExperimentalPrefLoopAlignment bytes (default: 2^4). setPrefLoopAlignment(ExperimentalPrefLoopAlignment); diff --git a/test/CodeGen/X86/memcmp-optsize.ll b/test/CodeGen/X86/memcmp-optsize.ll index 30674b86059..450205a966d 100644 --- a/test/CodeGen/X86/memcmp-optsize.ll +++ b/test/CodeGen/X86/memcmp-optsize.ll @@ -121,44 +121,94 @@ define i1 @length2_eq_nobuiltin_attr(i8* %X, i8* %Y) nounwind optsize { define i32 @length3(i8* %X, i8* %Y) nounwind optsize { ; X86-LABEL: length3: -; X86: # BB#0: -; X86-NEXT: pushl $0 -; X86-NEXT: pushl $3 -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: calll memcmp -; X86-NEXT: addl $16, %esp +; X86: # BB#0: # %loadbb +; X86-NEXT: pushl %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movzwl (%eax), %edx +; X86-NEXT: movzwl (%ecx), %esi +; X86-NEXT: rolw $8, %dx +; X86-NEXT: rolw $8, %si +; X86-NEXT: movzwl %dx, %edx +; X86-NEXT: movzwl %si, %esi +; X86-NEXT: cmpl %esi, %edx +; X86-NEXT: jne .LBB4_1 +; X86-NEXT: # BB#2: # %loadbb1 +; X86-NEXT: movzbl 2(%eax), %eax +; X86-NEXT: movzbl 2(%ecx), %ecx +; X86-NEXT: subl %ecx, %eax +; X86-NEXT: jmp .LBB4_3 +; X86-NEXT: .LBB4_1: # %res_block +; X86-NEXT: xorl %ecx, %ecx +; X86-NEXT: incl %ecx +; X86-NEXT: xorl %eax, %eax +; X86-NEXT: decl %eax +; X86-NEXT: cmpl %esi, %edx +; X86-NEXT: cmovael %ecx, %eax +; X86-NEXT: .LBB4_3: # %endblock +; X86-NEXT: popl %esi ; X86-NEXT: retl ; ; X64-LABEL: length3: -; X64: # BB#0: -; X64-NEXT: movl $3, %edx -; X64-NEXT: jmp memcmp # TAILCALL +; X64: # BB#0: # %loadbb +; X64-NEXT: movzwl (%rdi), %eax +; X64-NEXT: movzwl (%rsi), %ecx +; X64-NEXT: rolw $8, %ax +; X64-NEXT: rolw $8, %cx +; X64-NEXT: movzwl %ax, %eax +; X64-NEXT: movzwl %cx, %ecx +; X64-NEXT: cmpq %rcx, %rax +; X64-NEXT: jne .LBB4_1 +; X64-NEXT: # BB#2: # %loadbb1 +; X64-NEXT: movzbl 2(%rdi), %eax +; X64-NEXT: movzbl 2(%rsi), %ecx +; X64-NEXT: subl %ecx, %eax +; X64-NEXT: retq +; X64-NEXT: .LBB4_1: # %res_block +; X64-NEXT: movl $-1, %ecx +; X64-NEXT: movl $1, %eax +; X64-NEXT: cmovbl %ecx, %eax +; X64-NEXT: retq %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 3) nounwind ret i32 %m } define i1 @length3_eq(i8* %X, i8* %Y) nounwind optsize { ; X86-LABEL: length3_eq: -; X86: # BB#0: -; X86-NEXT: pushl $0 -; X86-NEXT: pushl $3 -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: calll memcmp -; X86-NEXT: addl $16, %esp +; X86: # BB#0: # %loadbb +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movzwl (%eax), %edx +; X86-NEXT: cmpw (%ecx), %dx +; X86-NEXT: jne .LBB5_1 +; X86-NEXT: # BB#2: # %loadbb1 +; X86-NEXT: movb 2(%eax), %dl +; X86-NEXT: xorl %eax, %eax +; X86-NEXT: cmpb 2(%ecx), %dl +; X86-NEXT: je .LBB5_3 +; X86-NEXT: .LBB5_1: # %res_block +; X86-NEXT: xorl %eax, %eax +; X86-NEXT: incl %eax +; X86-NEXT: .LBB5_3: # %endblock ; X86-NEXT: testl %eax, %eax ; X86-NEXT: setne %al ; X86-NEXT: retl ; ; X64-LABEL: length3_eq: -; X64: # BB#0: -; X64-NEXT: pushq %rax -; X64-NEXT: movl $3, %edx -; X64-NEXT: callq memcmp +; X64: # BB#0: # %loadbb +; X64-NEXT: movzwl (%rdi), %eax +; X64-NEXT: cmpw (%rsi), %ax +; X64-NEXT: jne .LBB5_1 +; X64-NEXT: # BB#2: # %loadbb1 +; X64-NEXT: movb 2(%rdi), %cl +; X64-NEXT: xorl %eax, %eax +; X64-NEXT: cmpb 2(%rsi), %cl +; X64-NEXT: je .LBB5_3 +; X64-NEXT: .LBB5_1: # %res_block +; X64-NEXT: movl $1, %eax +; X64-NEXT: .LBB5_3: # %endblock ; X64-NEXT: testl %eax, %eax ; X64-NEXT: setne %al -; X64-NEXT: popq %rcx ; X64-NEXT: retq %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 3) nounwind %c = icmp ne i32 %m, 0 @@ -246,44 +296,90 @@ define i1 @length4_eq_const(i8* %X) nounwind optsize { define i32 @length5(i8* %X, i8* %Y) nounwind optsize { ; X86-LABEL: length5: -; X86: # BB#0: -; X86-NEXT: pushl $0 -; X86-NEXT: pushl $5 -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: calll memcmp -; X86-NEXT: addl $16, %esp +; X86: # BB#0: # %loadbb +; X86-NEXT: pushl %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl (%eax), %edx +; X86-NEXT: movl (%ecx), %esi +; X86-NEXT: bswapl %edx +; X86-NEXT: bswapl %esi +; X86-NEXT: cmpl %esi, %edx +; X86-NEXT: jne .LBB9_1 +; X86-NEXT: # BB#2: # %loadbb1 +; X86-NEXT: movzbl 4(%eax), %eax +; X86-NEXT: movzbl 4(%ecx), %ecx +; X86-NEXT: subl %ecx, %eax +; X86-NEXT: jmp .LBB9_3 +; X86-NEXT: .LBB9_1: # %res_block +; X86-NEXT: xorl %ecx, %ecx +; X86-NEXT: incl %ecx +; X86-NEXT: xorl %eax, %eax +; X86-NEXT: decl %eax +; X86-NEXT: cmpl %esi, %edx +; X86-NEXT: cmovael %ecx, %eax +; X86-NEXT: .LBB9_3: # %endblock +; X86-NEXT: popl %esi ; X86-NEXT: retl ; ; X64-LABEL: length5: -; X64: # BB#0: -; X64-NEXT: movl $5, %edx -; X64-NEXT: jmp memcmp # TAILCALL +; X64: # BB#0: # %loadbb +; X64-NEXT: movl (%rdi), %eax +; X64-NEXT: movl (%rsi), %ecx +; X64-NEXT: bswapl %eax +; X64-NEXT: bswapl %ecx +; X64-NEXT: cmpq %rcx, %rax +; X64-NEXT: jne .LBB9_1 +; X64-NEXT: # BB#2: # %loadbb1 +; X64-NEXT: movzbl 4(%rdi), %eax +; X64-NEXT: movzbl 4(%rsi), %ecx +; X64-NEXT: subl %ecx, %eax +; X64-NEXT: retq +; X64-NEXT: .LBB9_1: # %res_block +; X64-NEXT: movl $-1, %ecx +; X64-NEXT: movl $1, %eax +; X64-NEXT: cmovbl %ecx, %eax +; X64-NEXT: retq %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 5) nounwind ret i32 %m } define i1 @length5_eq(i8* %X, i8* %Y) nounwind optsize { ; X86-LABEL: length5_eq: -; X86: # BB#0: -; X86-NEXT: pushl $0 -; X86-NEXT: pushl $5 -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: calll memcmp -; X86-NEXT: addl $16, %esp +; X86: # BB#0: # %loadbb +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl (%eax), %edx +; X86-NEXT: cmpl (%ecx), %edx +; X86-NEXT: jne .LBB10_1 +; X86-NEXT: # BB#2: # %loadbb1 +; X86-NEXT: movb 4(%eax), %dl +; X86-NEXT: xorl %eax, %eax +; X86-NEXT: cmpb 4(%ecx), %dl +; X86-NEXT: je .LBB10_3 +; X86-NEXT: .LBB10_1: # %res_block +; X86-NEXT: xorl %eax, %eax +; X86-NEXT: incl %eax +; X86-NEXT: .LBB10_3: # %endblock ; X86-NEXT: testl %eax, %eax ; X86-NEXT: setne %al ; X86-NEXT: retl ; ; X64-LABEL: length5_eq: -; X64: # BB#0: -; X64-NEXT: pushq %rax -; X64-NEXT: movl $5, %edx -; X64-NEXT: callq memcmp +; X64: # BB#0: # %loadbb +; X64-NEXT: movl (%rdi), %eax +; X64-NEXT: cmpl (%rsi), %eax +; X64-NEXT: jne .LBB10_1 +; X64-NEXT: # BB#2: # %loadbb1 +; X64-NEXT: movb 4(%rdi), %cl +; X64-NEXT: xorl %eax, %eax +; X64-NEXT: cmpb 4(%rsi), %cl +; X64-NEXT: je .LBB10_3 +; X64-NEXT: .LBB10_1: # %res_block +; X64-NEXT: movl $1, %eax +; X64-NEXT: .LBB10_3: # %endblock ; X64-NEXT: testl %eax, %eax ; X64-NEXT: setne %al -; X64-NEXT: popq %rcx ; X64-NEXT: retq %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 5) nounwind %c = icmp ne i32 %m, 0 @@ -292,13 +388,33 @@ define i1 @length5_eq(i8* %X, i8* %Y) nounwind optsize { define i32 @length8(i8* %X, i8* %Y) nounwind optsize { ; X86-LABEL: length8: -; X86: # BB#0: -; X86-NEXT: pushl $0 -; X86-NEXT: pushl $8 -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: calll memcmp -; X86-NEXT: addl $16, %esp +; X86: # BB#0: # %loadbb +; X86-NEXT: pushl %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: movl (%esi), %ecx +; X86-NEXT: movl (%eax), %edx +; X86-NEXT: bswapl %ecx +; X86-NEXT: bswapl %edx +; X86-NEXT: cmpl %edx, %ecx +; X86-NEXT: jne .LBB11_1 +; X86-NEXT: # BB#2: # %loadbb1 +; X86-NEXT: movl 4(%esi), %ecx +; X86-NEXT: movl 4(%eax), %edx +; X86-NEXT: bswapl %ecx +; X86-NEXT: bswapl %edx +; X86-NEXT: xorl %eax, %eax +; X86-NEXT: cmpl %edx, %ecx +; X86-NEXT: je .LBB11_3 +; X86-NEXT: .LBB11_1: # %res_block +; X86-NEXT: xorl %esi, %esi +; X86-NEXT: incl %esi +; X86-NEXT: xorl %eax, %eax +; X86-NEXT: decl %eax +; X86-NEXT: cmpl %edx, %ecx +; X86-NEXT: cmovael %esi, %eax +; X86-NEXT: .LBB11_3: # %endblock +; X86-NEXT: popl %esi ; X86-NEXT: retl ; ; X64-LABEL: length8: @@ -320,13 +436,21 @@ define i32 @length8(i8* %X, i8* %Y) nounwind optsize { define i1 @length8_eq(i8* %X, i8* %Y) nounwind optsize { ; X86-LABEL: length8_eq: -; X86: # BB#0: -; X86-NEXT: pushl $0 -; X86-NEXT: pushl $8 -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: calll memcmp -; X86-NEXT: addl $16, %esp +; X86: # BB#0: # %loadbb +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl (%eax), %edx +; X86-NEXT: cmpl (%ecx), %edx +; X86-NEXT: jne .LBB12_1 +; X86-NEXT: # BB#2: # %loadbb1 +; X86-NEXT: movl 4(%eax), %edx +; X86-NEXT: xorl %eax, %eax +; X86-NEXT: cmpl 4(%ecx), %edx +; X86-NEXT: je .LBB12_3 +; X86-NEXT: .LBB12_1: # %res_block +; X86-NEXT: xorl %eax, %eax +; X86-NEXT: incl %eax +; X86-NEXT: .LBB12_3: # %endblock ; X86-NEXT: testl %eax, %eax ; X86-NEXT: sete %al ; X86-NEXT: retl @@ -344,13 +468,18 @@ define i1 @length8_eq(i8* %X, i8* %Y) nounwind optsize { define i1 @length8_eq_const(i8* %X) nounwind optsize { ; X86-LABEL: length8_eq_const: -; X86: # BB#0: -; X86-NEXT: pushl $0 -; X86-NEXT: pushl $8 -; X86-NEXT: pushl $.L.str -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: calll memcmp -; X86-NEXT: addl $16, %esp +; X86: # BB#0: # %loadbb +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: cmpl $858927408, (%ecx) # imm = 0x33323130 +; X86-NEXT: jne .LBB13_1 +; X86-NEXT: # BB#2: # %loadbb1 +; X86-NEXT: xorl %eax, %eax +; X86-NEXT: cmpl $926299444, 4(%ecx) # imm = 0x37363534 +; X86-NEXT: je .LBB13_3 +; X86-NEXT: .LBB13_1: # %res_block +; X86-NEXT: xorl %eax, %eax +; X86-NEXT: incl %eax +; X86-NEXT: .LBB13_3: # %endblock ; X86-NEXT: testl %eax, %eax ; X86-NEXT: setne %al ; X86-NEXT: retl @@ -380,13 +509,20 @@ define i1 @length12_eq(i8* %X, i8* %Y) nounwind optsize { ; X86-NEXT: retl ; ; X64-LABEL: length12_eq: -; X64: # BB#0: -; X64-NEXT: pushq %rax -; X64-NEXT: movl $12, %edx -; X64-NEXT: callq memcmp +; X64: # BB#0: # %loadbb +; X64-NEXT: movq (%rdi), %rax +; X64-NEXT: cmpq (%rsi), %rax +; X64-NEXT: jne .LBB14_1 +; X64-NEXT: # BB#2: # %loadbb1 +; X64-NEXT: movl 8(%rdi), %ecx +; X64-NEXT: xorl %eax, %eax +; X64-NEXT: cmpl 8(%rsi), %ecx +; X64-NEXT: je .LBB14_3 +; X64-NEXT: .LBB14_1: # %res_block +; X64-NEXT: movl $1, %eax +; X64-NEXT: .LBB14_3: # %endblock ; X64-NEXT: testl %eax, %eax ; X64-NEXT: setne %al -; X64-NEXT: popq %rcx ; X64-NEXT: retq %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 12) nounwind %c = icmp ne i32 %m, 0 @@ -405,9 +541,29 @@ define i32 @length12(i8* %X, i8* %Y) nounwind optsize { ; X86-NEXT: retl ; ; X64-LABEL: length12: -; X64: # BB#0: -; X64-NEXT: movl $12, %edx -; X64-NEXT: jmp memcmp # TAILCALL +; X64: # BB#0: # %loadbb +; X64-NEXT: movq (%rdi), %rcx +; X64-NEXT: movq (%rsi), %rdx +; X64-NEXT: bswapq %rcx +; X64-NEXT: bswapq %rdx +; X64-NEXT: cmpq %rdx, %rcx +; X64-NEXT: jne .LBB15_1 +; X64-NEXT: # BB#2: # %loadbb1 +; X64-NEXT: movl 8(%rdi), %ecx +; X64-NEXT: movl 8(%rsi), %edx +; X64-NEXT: bswapl %ecx +; X64-NEXT: bswapl %edx +; X64-NEXT: xorl %eax, %eax +; X64-NEXT: cmpq %rdx, %rcx +; X64-NEXT: jne .LBB15_1 +; X64-NEXT: # BB#3: # %endblock +; X64-NEXT: retq +; X64-NEXT: .LBB15_1: # %res_block +; X64-NEXT: cmpq %rdx, %rcx +; X64-NEXT: movl $-1, %ecx +; X64-NEXT: movl $1, %eax +; X64-NEXT: cmovbl %ecx, %eax +; X64-NEXT: retq %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 12) nounwind ret i32 %m } @@ -426,9 +582,29 @@ define i32 @length16(i8* %X, i8* %Y) nounwind optsize { ; X86-NEXT: retl ; ; X64-LABEL: length16: -; X64: # BB#0: -; X64-NEXT: movl $16, %edx -; X64-NEXT: jmp memcmp # TAILCALL +; X64: # BB#0: # %loadbb +; X64-NEXT: movq (%rdi), %rcx +; X64-NEXT: movq (%rsi), %rdx +; X64-NEXT: bswapq %rcx +; X64-NEXT: bswapq %rdx +; X64-NEXT: cmpq %rdx, %rcx +; X64-NEXT: jne .LBB16_1 +; X64-NEXT: # BB#2: # %loadbb1 +; X64-NEXT: movq 8(%rdi), %rcx +; X64-NEXT: movq 8(%rsi), %rdx +; X64-NEXT: bswapq %rcx +; X64-NEXT: bswapq %rdx +; X64-NEXT: xorl %eax, %eax +; X64-NEXT: cmpq %rdx, %rcx +; X64-NEXT: jne .LBB16_1 +; X64-NEXT: # BB#3: # %endblock +; X64-NEXT: retq +; X64-NEXT: .LBB16_1: # %res_block +; X64-NEXT: cmpq %rdx, %rcx +; X64-NEXT: movl $-1, %ecx +; X64-NEXT: movl $1, %eax +; X64-NEXT: cmovbl %ecx, %eax +; X64-NEXT: retq %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 16) nounwind ret i32 %m } @@ -458,24 +634,22 @@ define i1 @length16_eq(i8* %x, i8* %y) nounwind optsize { ; X86-SSE2-NEXT: setne %al ; X86-SSE2-NEXT: retl ; -; X64-SSE2-LABEL: length16_eq: -; X64-SSE2: # BB#0: -; X64-SSE2-NEXT: movdqu (%rsi), %xmm0 -; X64-SSE2-NEXT: movdqu (%rdi), %xmm1 -; X64-SSE2-NEXT: pcmpeqb %xmm0, %xmm1 -; X64-SSE2-NEXT: pmovmskb %xmm1, %eax -; X64-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF -; X64-SSE2-NEXT: setne %al -; X64-SSE2-NEXT: retq -; -; X64-AVX2-LABEL: length16_eq: -; X64-AVX2: # BB#0: -; X64-AVX2-NEXT: vmovdqu (%rdi), %xmm0 -; X64-AVX2-NEXT: vpcmpeqb (%rsi), %xmm0, %xmm0 -; X64-AVX2-NEXT: vpmovmskb %xmm0, %eax -; X64-AVX2-NEXT: cmpl $65535, %eax # imm = 0xFFFF -; X64-AVX2-NEXT: setne %al -; X64-AVX2-NEXT: retq +; X64-LABEL: length16_eq: +; X64: # BB#0: # %loadbb +; X64-NEXT: movq (%rdi), %rax +; X64-NEXT: cmpq (%rsi), %rax +; X64-NEXT: jne .LBB17_1 +; X64-NEXT: # BB#2: # %loadbb1 +; X64-NEXT: movq 8(%rdi), %rcx +; X64-NEXT: xorl %eax, %eax +; X64-NEXT: cmpq 8(%rsi), %rcx +; X64-NEXT: je .LBB17_3 +; X64-NEXT: .LBB17_1: # %res_block +; X64-NEXT: movl $1, %eax +; X64-NEXT: .LBB17_3: # %endblock +; X64-NEXT: testl %eax, %eax +; X64-NEXT: setne %al +; X64-NEXT: retq %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 16) nounwind %cmp = icmp ne i32 %call, 0 ret i1 %cmp @@ -504,23 +678,22 @@ define i1 @length16_eq_const(i8* %X) nounwind optsize { ; X86-SSE2-NEXT: sete %al ; X86-SSE2-NEXT: retl ; -; X64-SSE2-LABEL: length16_eq_const: -; X64-SSE2: # BB#0: -; X64-SSE2-NEXT: movdqu (%rdi), %xmm0 -; X64-SSE2-NEXT: pcmpeqb {{.*}}(%rip), %xmm0 -; X64-SSE2-NEXT: pmovmskb %xmm0, %eax -; X64-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF -; X64-SSE2-NEXT: sete %al -; X64-SSE2-NEXT: retq -; -; X64-AVX2-LABEL: length16_eq_const: -; X64-AVX2: # BB#0: -; X64-AVX2-NEXT: vmovdqu (%rdi), %xmm0 -; X64-AVX2-NEXT: vpcmpeqb {{.*}}(%rip), %xmm0, %xmm0 -; X64-AVX2-NEXT: vpmovmskb %xmm0, %eax -; X64-AVX2-NEXT: cmpl $65535, %eax # imm = 0xFFFF -; X64-AVX2-NEXT: sete %al -; X64-AVX2-NEXT: retq +; X64-LABEL: length16_eq_const: +; X64: # BB#0: # %loadbb +; X64-NEXT: movabsq $3978425819141910832, %rax # imm = 0x3736353433323130 +; X64-NEXT: cmpq %rax, (%rdi) +; X64-NEXT: jne .LBB18_1 +; X64-NEXT: # BB#2: # %loadbb1 +; X64-NEXT: xorl %eax, %eax +; X64-NEXT: movabsq $3833745473465760056, %rcx # imm = 0x3534333231303938 +; X64-NEXT: cmpq %rcx, 8(%rdi) +; X64-NEXT: je .LBB18_3 +; X64-NEXT: .LBB18_1: # %res_block +; X64-NEXT: movl $1, %eax +; X64-NEXT: .LBB18_3: # %endblock +; X64-NEXT: testl %eax, %eax +; X64-NEXT: sete %al +; X64-NEXT: retq %m = tail call i32 @memcmp(i8* %X, i8* getelementptr inbounds ([65 x i8], [65 x i8]* @.str, i32 0, i32 0), i64 16) nounwind %c = icmp eq i32 %m, 0 ret i1 %c diff --git a/test/CodeGen/X86/memcmp.ll b/test/CodeGen/X86/memcmp.ll index e7ba32cab0a..2e678276546 100644 --- a/test/CodeGen/X86/memcmp.ll +++ b/test/CodeGen/X86/memcmp.ll @@ -115,44 +115,90 @@ define i1 @length2_eq_nobuiltin_attr(i8* %X, i8* %Y) nounwind { define i32 @length3(i8* %X, i8* %Y) nounwind { ; X86-LABEL: length3: -; X86: # BB#0: -; X86-NEXT: pushl $0 -; X86-NEXT: pushl $3 -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: calll memcmp -; X86-NEXT: addl $16, %esp +; X86: # BB#0: # %loadbb +; X86-NEXT: pushl %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movzwl (%eax), %edx +; X86-NEXT: movzwl (%ecx), %esi +; X86-NEXT: rolw $8, %dx +; X86-NEXT: rolw $8, %si +; X86-NEXT: movzwl %dx, %edx +; X86-NEXT: movzwl %si, %esi +; X86-NEXT: cmpl %esi, %edx +; X86-NEXT: jne .LBB4_1 +; X86-NEXT: # BB#2: # %loadbb1 +; X86-NEXT: movzbl 2(%eax), %eax +; X86-NEXT: movzbl 2(%ecx), %ecx +; X86-NEXT: subl %ecx, %eax +; X86-NEXT: popl %esi +; X86-NEXT: retl +; X86-NEXT: .LBB4_1: # %res_block +; X86-NEXT: movl $-1, %ecx +; X86-NEXT: movl $1, %eax +; X86-NEXT: cmovbl %ecx, %eax +; X86-NEXT: popl %esi ; X86-NEXT: retl ; ; X64-LABEL: length3: -; X64: # BB#0: -; X64-NEXT: movl $3, %edx -; X64-NEXT: jmp memcmp # TAILCALL +; X64: # BB#0: # %loadbb +; X64-NEXT: movzwl (%rdi), %eax +; X64-NEXT: movzwl (%rsi), %ecx +; X64-NEXT: rolw $8, %ax +; X64-NEXT: rolw $8, %cx +; X64-NEXT: movzwl %ax, %eax +; X64-NEXT: movzwl %cx, %ecx +; X64-NEXT: cmpq %rcx, %rax +; X64-NEXT: jne .LBB4_1 +; X64-NEXT: # BB#2: # %loadbb1 +; X64-NEXT: movzbl 2(%rdi), %eax +; X64-NEXT: movzbl 2(%rsi), %ecx +; X64-NEXT: subl %ecx, %eax +; X64-NEXT: retq +; X64-NEXT: .LBB4_1: # %res_block +; X64-NEXT: movl $-1, %ecx +; X64-NEXT: movl $1, %eax +; X64-NEXT: cmovbl %ecx, %eax +; X64-NEXT: retq %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 3) nounwind ret i32 %m } define i1 @length3_eq(i8* %X, i8* %Y) nounwind { ; X86-LABEL: length3_eq: -; X86: # BB#0: -; X86-NEXT: pushl $0 -; X86-NEXT: pushl $3 -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: calll memcmp -; X86-NEXT: addl $16, %esp +; X86: # BB#0: # %loadbb +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movzwl (%eax), %edx +; X86-NEXT: cmpw (%ecx), %dx +; X86-NEXT: jne .LBB5_1 +; X86-NEXT: # BB#2: # %loadbb1 +; X86-NEXT: movb 2(%eax), %dl +; X86-NEXT: xorl %eax, %eax +; X86-NEXT: cmpb 2(%ecx), %dl +; X86-NEXT: je .LBB5_3 +; X86-NEXT: .LBB5_1: # %res_block +; X86-NEXT: movl $1, %eax +; X86-NEXT: .LBB5_3: # %endblock ; X86-NEXT: testl %eax, %eax ; X86-NEXT: setne %al ; X86-NEXT: retl ; ; X64-LABEL: length3_eq: -; X64: # BB#0: -; X64-NEXT: pushq %rax -; X64-NEXT: movl $3, %edx -; X64-NEXT: callq memcmp +; X64: # BB#0: # %loadbb +; X64-NEXT: movzwl (%rdi), %eax +; X64-NEXT: cmpw (%rsi), %ax +; X64-NEXT: jne .LBB5_1 +; X64-NEXT: # BB#2: # %loadbb1 +; X64-NEXT: movb 2(%rdi), %cl +; X64-NEXT: xorl %eax, %eax +; X64-NEXT: cmpb 2(%rsi), %cl +; X64-NEXT: je .LBB5_3 +; X64-NEXT: .LBB5_1: # %res_block +; X64-NEXT: movl $1, %eax +; X64-NEXT: .LBB5_3: # %endblock ; X64-NEXT: testl %eax, %eax ; X64-NEXT: setne %al -; X64-NEXT: popq %rcx ; X64-NEXT: retq %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 3) nounwind %c = icmp ne i32 %m, 0 @@ -234,44 +280,86 @@ define i1 @length4_eq_const(i8* %X) nounwind { define i32 @length5(i8* %X, i8* %Y) nounwind { ; X86-LABEL: length5: -; X86: # BB#0: -; X86-NEXT: pushl $0 -; X86-NEXT: pushl $5 -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: calll memcmp -; X86-NEXT: addl $16, %esp +; X86: # BB#0: # %loadbb +; X86-NEXT: pushl %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl (%eax), %edx +; X86-NEXT: movl (%ecx), %esi +; X86-NEXT: bswapl %edx +; X86-NEXT: bswapl %esi +; X86-NEXT: cmpl %esi, %edx +; X86-NEXT: jne .LBB9_1 +; X86-NEXT: # BB#2: # %loadbb1 +; X86-NEXT: movzbl 4(%eax), %eax +; X86-NEXT: movzbl 4(%ecx), %ecx +; X86-NEXT: subl %ecx, %eax +; X86-NEXT: popl %esi +; X86-NEXT: retl +; X86-NEXT: .LBB9_1: # %res_block +; X86-NEXT: movl $-1, %ecx +; X86-NEXT: movl $1, %eax +; X86-NEXT: cmovbl %ecx, %eax +; X86-NEXT: popl %esi ; X86-NEXT: retl ; ; X64-LABEL: length5: -; X64: # BB#0: -; X64-NEXT: movl $5, %edx -; X64-NEXT: jmp memcmp # TAILCALL +; X64: # BB#0: # %loadbb +; X64-NEXT: movl (%rdi), %eax +; X64-NEXT: movl (%rsi), %ecx +; X64-NEXT: bswapl %eax +; X64-NEXT: bswapl %ecx +; X64-NEXT: cmpq %rcx, %rax +; X64-NEXT: jne .LBB9_1 +; X64-NEXT: # BB#2: # %loadbb1 +; X64-NEXT: movzbl 4(%rdi), %eax +; X64-NEXT: movzbl 4(%rsi), %ecx +; X64-NEXT: subl %ecx, %eax +; X64-NEXT: retq +; X64-NEXT: .LBB9_1: # %res_block +; X64-NEXT: movl $-1, %ecx +; X64-NEXT: movl $1, %eax +; X64-NEXT: cmovbl %ecx, %eax +; X64-NEXT: retq %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 5) nounwind ret i32 %m } define i1 @length5_eq(i8* %X, i8* %Y) nounwind { ; X86-LABEL: length5_eq: -; X86: # BB#0: -; X86-NEXT: pushl $0 -; X86-NEXT: pushl $5 -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: calll memcmp -; X86-NEXT: addl $16, %esp +; X86: # BB#0: # %loadbb +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl (%eax), %edx +; X86-NEXT: cmpl (%ecx), %edx +; X86-NEXT: jne .LBB10_1 +; X86-NEXT: # BB#2: # %loadbb1 +; X86-NEXT: movb 4(%eax), %dl +; X86-NEXT: xorl %eax, %eax +; X86-NEXT: cmpb 4(%ecx), %dl +; X86-NEXT: je .LBB10_3 +; X86-NEXT: .LBB10_1: # %res_block +; X86-NEXT: movl $1, %eax +; X86-NEXT: .LBB10_3: # %endblock ; X86-NEXT: testl %eax, %eax ; X86-NEXT: setne %al ; X86-NEXT: retl ; ; X64-LABEL: length5_eq: -; X64: # BB#0: -; X64-NEXT: pushq %rax -; X64-NEXT: movl $5, %edx -; X64-NEXT: callq memcmp +; X64: # BB#0: # %loadbb +; X64-NEXT: movl (%rdi), %eax +; X64-NEXT: cmpl (%rsi), %eax +; X64-NEXT: jne .LBB10_1 +; X64-NEXT: # BB#2: # %loadbb1 +; X64-NEXT: movb 4(%rdi), %cl +; X64-NEXT: xorl %eax, %eax +; X64-NEXT: cmpb 4(%rsi), %cl +; X64-NEXT: je .LBB10_3 +; X64-NEXT: .LBB10_1: # %res_block +; X64-NEXT: movl $1, %eax +; X64-NEXT: .LBB10_3: # %endblock ; X64-NEXT: testl %eax, %eax ; X64-NEXT: setne %al -; X64-NEXT: popq %rcx ; X64-NEXT: retq %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 5) nounwind %c = icmp ne i32 %m, 0 @@ -280,13 +368,33 @@ define i1 @length5_eq(i8* %X, i8* %Y) nounwind { define i32 @length8(i8* %X, i8* %Y) nounwind { ; X86-LABEL: length8: -; X86: # BB#0: -; X86-NEXT: pushl $0 -; X86-NEXT: pushl $8 -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: calll memcmp -; X86-NEXT: addl $16, %esp +; X86: # BB#0: # %loadbb +; X86-NEXT: pushl %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: movl (%esi), %ecx +; X86-NEXT: movl (%eax), %edx +; X86-NEXT: bswapl %ecx +; X86-NEXT: bswapl %edx +; X86-NEXT: cmpl %edx, %ecx +; X86-NEXT: jne .LBB11_1 +; X86-NEXT: # BB#2: # %loadbb1 +; X86-NEXT: movl 4(%esi), %ecx +; X86-NEXT: movl 4(%eax), %edx +; X86-NEXT: bswapl %ecx +; X86-NEXT: bswapl %edx +; X86-NEXT: xorl %eax, %eax +; X86-NEXT: cmpl %edx, %ecx +; X86-NEXT: jne .LBB11_1 +; X86-NEXT: # BB#3: # %endblock +; X86-NEXT: popl %esi +; X86-NEXT: retl +; X86-NEXT: .LBB11_1: # %res_block +; X86-NEXT: cmpl %edx, %ecx +; X86-NEXT: movl $-1, %ecx +; X86-NEXT: movl $1, %eax +; X86-NEXT: cmovbl %ecx, %eax +; X86-NEXT: popl %esi ; X86-NEXT: retl ; ; X64-LABEL: length8: @@ -308,13 +416,20 @@ define i32 @length8(i8* %X, i8* %Y) nounwind { define i1 @length8_eq(i8* %X, i8* %Y) nounwind { ; X86-LABEL: length8_eq: -; X86: # BB#0: -; X86-NEXT: pushl $0 -; X86-NEXT: pushl $8 -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: calll memcmp -; X86-NEXT: addl $16, %esp +; X86: # BB#0: # %loadbb +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl (%eax), %edx +; X86-NEXT: cmpl (%ecx), %edx +; X86-NEXT: jne .LBB12_1 +; X86-NEXT: # BB#2: # %loadbb1 +; X86-NEXT: movl 4(%eax), %edx +; X86-NEXT: xorl %eax, %eax +; X86-NEXT: cmpl 4(%ecx), %edx +; X86-NEXT: je .LBB12_3 +; X86-NEXT: .LBB12_1: # %res_block +; X86-NEXT: movl $1, %eax +; X86-NEXT: .LBB12_3: # %endblock ; X86-NEXT: testl %eax, %eax ; X86-NEXT: sete %al ; X86-NEXT: retl @@ -332,13 +447,17 @@ define i1 @length8_eq(i8* %X, i8* %Y) nounwind { define i1 @length8_eq_const(i8* %X) nounwind { ; X86-LABEL: length8_eq_const: -; X86: # BB#0: -; X86-NEXT: pushl $0 -; X86-NEXT: pushl $8 -; X86-NEXT: pushl $.L.str -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: calll memcmp -; X86-NEXT: addl $16, %esp +; X86: # BB#0: # %loadbb +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: cmpl $858927408, (%ecx) # imm = 0x33323130 +; X86-NEXT: jne .LBB13_1 +; X86-NEXT: # BB#2: # %loadbb1 +; X86-NEXT: xorl %eax, %eax +; X86-NEXT: cmpl $926299444, 4(%ecx) # imm = 0x37363534 +; X86-NEXT: je .LBB13_3 +; X86-NEXT: .LBB13_1: # %res_block +; X86-NEXT: movl $1, %eax +; X86-NEXT: .LBB13_3: # %endblock ; X86-NEXT: testl %eax, %eax ; X86-NEXT: setne %al ; X86-NEXT: retl @@ -356,25 +475,43 @@ define i1 @length8_eq_const(i8* %X) nounwind { define i1 @length12_eq(i8* %X, i8* %Y) nounwind { ; X86-LABEL: length12_eq: -; X86: # BB#0: -; X86-NEXT: pushl $0 -; X86-NEXT: pushl $12 -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: calll memcmp -; X86-NEXT: addl $16, %esp -; X86-NEXT: testl %eax, %eax +; X86: # BB#0: # %loadbb +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl (%ecx), %edx +; X86-NEXT: cmpl (%eax), %edx +; X86-NEXT: jne .LBB14_1 +; X86-NEXT: # BB#2: # %loadbb1 +; X86-NEXT: movl 4(%ecx), %edx +; X86-NEXT: cmpl 4(%eax), %edx +; X86-NEXT: jne .LBB14_1 +; X86-NEXT: # BB#3: # %loadbb2 +; X86-NEXT: movl 8(%ecx), %edx +; X86-NEXT: xorl %ecx, %ecx +; X86-NEXT: cmpl 8(%eax), %edx +; X86-NEXT: je .LBB14_4 +; X86-NEXT: .LBB14_1: # %res_block +; X86-NEXT: movl $1, %ecx +; X86-NEXT: .LBB14_4: # %endblock +; X86-NEXT: testl %ecx, %ecx ; X86-NEXT: setne %al ; X86-NEXT: retl ; ; X64-LABEL: length12_eq: -; X64: # BB#0: -; X64-NEXT: pushq %rax -; X64-NEXT: movl $12, %edx -; X64-NEXT: callq memcmp +; X64: # BB#0: # %loadbb +; X64-NEXT: movq (%rdi), %rax +; X64-NEXT: cmpq (%rsi), %rax +; X64-NEXT: jne .LBB14_1 +; X64-NEXT: # BB#2: # %loadbb1 +; X64-NEXT: movl 8(%rdi), %ecx +; X64-NEXT: xorl %eax, %eax +; X64-NEXT: cmpl 8(%rsi), %ecx +; X64-NEXT: je .LBB14_3 +; X64-NEXT: .LBB14_1: # %res_block +; X64-NEXT: movl $1, %eax +; X64-NEXT: .LBB14_3: # %endblock ; X64-NEXT: testl %eax, %eax ; X64-NEXT: setne %al -; X64-NEXT: popq %rcx ; X64-NEXT: retq %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 12) nounwind %c = icmp ne i32 %m, 0 @@ -383,19 +520,66 @@ define i1 @length12_eq(i8* %X, i8* %Y) nounwind { define i32 @length12(i8* %X, i8* %Y) nounwind { ; X86-LABEL: length12: -; X86: # BB#0: -; X86-NEXT: pushl $0 -; X86-NEXT: pushl $12 -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: calll memcmp -; X86-NEXT: addl $16, %esp +; X86: # BB#0: # %loadbb +; X86-NEXT: pushl %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: movl (%esi), %ecx +; X86-NEXT: movl (%eax), %edx +; X86-NEXT: bswapl %ecx +; X86-NEXT: bswapl %edx +; X86-NEXT: cmpl %edx, %ecx +; X86-NEXT: jne .LBB15_1 +; X86-NEXT: # BB#2: # %loadbb1 +; X86-NEXT: movl 4(%esi), %ecx +; X86-NEXT: movl 4(%eax), %edx +; X86-NEXT: bswapl %ecx +; X86-NEXT: bswapl %edx +; X86-NEXT: cmpl %edx, %ecx +; X86-NEXT: jne .LBB15_1 +; X86-NEXT: # BB#3: # %loadbb2 +; X86-NEXT: movl 8(%esi), %ecx +; X86-NEXT: movl 8(%eax), %edx +; X86-NEXT: bswapl %ecx +; X86-NEXT: bswapl %edx +; X86-NEXT: xorl %eax, %eax +; X86-NEXT: cmpl %edx, %ecx +; X86-NEXT: jne .LBB15_1 +; X86-NEXT: # BB#4: # %endblock +; X86-NEXT: popl %esi +; X86-NEXT: retl +; X86-NEXT: .LBB15_1: # %res_block +; X86-NEXT: cmpl %edx, %ecx +; X86-NEXT: movl $-1, %ecx +; X86-NEXT: movl $1, %eax +; X86-NEXT: cmovbl %ecx, %eax +; X86-NEXT: popl %esi ; X86-NEXT: retl ; ; X64-LABEL: length12: -; X64: # BB#0: -; X64-NEXT: movl $12, %edx -; X64-NEXT: jmp memcmp # TAILCALL +; X64: # BB#0: # %loadbb +; X64-NEXT: movq (%rdi), %rcx +; X64-NEXT: movq (%rsi), %rdx +; X64-NEXT: bswapq %rcx +; X64-NEXT: bswapq %rdx +; X64-NEXT: cmpq %rdx, %rcx +; X64-NEXT: jne .LBB15_1 +; X64-NEXT: # BB#2: # %loadbb1 +; X64-NEXT: movl 8(%rdi), %ecx +; X64-NEXT: movl 8(%rsi), %edx +; X64-NEXT: bswapl %ecx +; X64-NEXT: bswapl %edx +; X64-NEXT: xorl %eax, %eax +; X64-NEXT: cmpq %rdx, %rcx +; X64-NEXT: jne .LBB15_1 +; X64-NEXT: # BB#3: # %endblock +; X64-NEXT: retq +; X64-NEXT: .LBB15_1: # %res_block +; X64-NEXT: cmpq %rdx, %rcx +; X64-NEXT: movl $-1, %ecx +; X64-NEXT: movl $1, %eax +; X64-NEXT: cmovbl %ecx, %eax +; X64-NEXT: retq %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 12) nounwind ret i32 %m } @@ -404,111 +588,165 @@ define i32 @length12(i8* %X, i8* %Y) nounwind { define i32 @length16(i8* %X, i8* %Y) nounwind { ; X86-LABEL: length16: -; X86: # BB#0: -; X86-NEXT: pushl $0 -; X86-NEXT: pushl $16 -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: calll memcmp -; X86-NEXT: addl $16, %esp +; X86: # BB#0: # %loadbb +; X86-NEXT: pushl %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: movl (%esi), %ecx +; X86-NEXT: movl (%eax), %edx +; X86-NEXT: bswapl %ecx +; X86-NEXT: bswapl %edx +; X86-NEXT: cmpl %edx, %ecx +; X86-NEXT: jne .LBB16_1 +; X86-NEXT: # BB#2: # %loadbb1 +; X86-NEXT: movl 4(%esi), %ecx +; X86-NEXT: movl 4(%eax), %edx +; X86-NEXT: bswapl %ecx +; X86-NEXT: bswapl %edx +; X86-NEXT: cmpl %edx, %ecx +; X86-NEXT: jne .LBB16_1 +; X86-NEXT: # BB#3: # %loadbb2 +; X86-NEXT: movl 8(%esi), %ecx +; X86-NEXT: movl 8(%eax), %edx +; X86-NEXT: bswapl %ecx +; X86-NEXT: bswapl %edx +; X86-NEXT: cmpl %edx, %ecx +; X86-NEXT: jne .LBB16_1 +; X86-NEXT: # BB#4: # %loadbb3 +; X86-NEXT: movl 12(%esi), %ecx +; X86-NEXT: movl 12(%eax), %edx +; X86-NEXT: bswapl %ecx +; X86-NEXT: bswapl %edx +; X86-NEXT: xorl %eax, %eax +; X86-NEXT: cmpl %edx, %ecx +; X86-NEXT: jne .LBB16_1 +; X86-NEXT: # BB#5: # %endblock +; X86-NEXT: popl %esi +; X86-NEXT: retl +; X86-NEXT: .LBB16_1: # %res_block +; X86-NEXT: cmpl %edx, %ecx +; X86-NEXT: movl $-1, %ecx +; X86-NEXT: movl $1, %eax +; X86-NEXT: cmovbl %ecx, %eax +; X86-NEXT: popl %esi ; X86-NEXT: retl ; ; X64-LABEL: length16: -; X64: # BB#0: -; X64-NEXT: movl $16, %edx -; X64-NEXT: jmp memcmp # TAILCALL +; X64: # BB#0: # %loadbb +; X64-NEXT: movq (%rdi), %rcx +; X64-NEXT: movq (%rsi), %rdx +; X64-NEXT: bswapq %rcx +; X64-NEXT: bswapq %rdx +; X64-NEXT: cmpq %rdx, %rcx +; X64-NEXT: jne .LBB16_1 +; X64-NEXT: # BB#2: # %loadbb1 +; X64-NEXT: movq 8(%rdi), %rcx +; X64-NEXT: movq 8(%rsi), %rdx +; X64-NEXT: bswapq %rcx +; X64-NEXT: bswapq %rdx +; X64-NEXT: xorl %eax, %eax +; X64-NEXT: cmpq %rdx, %rcx +; X64-NEXT: jne .LBB16_1 +; X64-NEXT: # BB#3: # %endblock +; X64-NEXT: retq +; X64-NEXT: .LBB16_1: # %res_block +; X64-NEXT: cmpq %rdx, %rcx +; X64-NEXT: movl $-1, %ecx +; X64-NEXT: movl $1, %eax +; X64-NEXT: cmovbl %ecx, %eax +; X64-NEXT: retq %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 16) nounwind ret i32 %m } define i1 @length16_eq(i8* %x, i8* %y) nounwind { -; X86-NOSSE-LABEL: length16_eq: -; X86-NOSSE: # BB#0: -; X86-NOSSE-NEXT: pushl $0 -; X86-NOSSE-NEXT: pushl $16 -; X86-NOSSE-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NOSSE-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NOSSE-NEXT: calll memcmp -; X86-NOSSE-NEXT: addl $16, %esp -; X86-NOSSE-NEXT: testl %eax, %eax -; X86-NOSSE-NEXT: setne %al -; X86-NOSSE-NEXT: retl -; -; X86-SSE2-LABEL: length16_eq: -; X86-SSE2: # BB#0: -; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-SSE2-NEXT: movdqu (%ecx), %xmm0 -; X86-SSE2-NEXT: movdqu (%eax), %xmm1 -; X86-SSE2-NEXT: pcmpeqb %xmm0, %xmm1 -; X86-SSE2-NEXT: pmovmskb %xmm1, %eax -; X86-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF -; X86-SSE2-NEXT: setne %al -; X86-SSE2-NEXT: retl -; -; X64-SSE2-LABEL: length16_eq: -; X64-SSE2: # BB#0: -; X64-SSE2-NEXT: movdqu (%rsi), %xmm0 -; X64-SSE2-NEXT: movdqu (%rdi), %xmm1 -; X64-SSE2-NEXT: pcmpeqb %xmm0, %xmm1 -; X64-SSE2-NEXT: pmovmskb %xmm1, %eax -; X64-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF -; X64-SSE2-NEXT: setne %al -; X64-SSE2-NEXT: retq +; X86-LABEL: length16_eq: +; X86: # BB#0: # %loadbb +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl (%ecx), %edx +; X86-NEXT: cmpl (%eax), %edx +; X86-NEXT: jne .LBB17_1 +; X86-NEXT: # BB#2: # %loadbb1 +; X86-NEXT: movl 4(%ecx), %edx +; X86-NEXT: cmpl 4(%eax), %edx +; X86-NEXT: jne .LBB17_1 +; X86-NEXT: # BB#3: # %loadbb2 +; X86-NEXT: movl 8(%ecx), %edx +; X86-NEXT: cmpl 8(%eax), %edx +; X86-NEXT: jne .LBB17_1 +; X86-NEXT: # BB#4: # %loadbb3 +; X86-NEXT: movl 12(%ecx), %edx +; X86-NEXT: xorl %ecx, %ecx +; X86-NEXT: cmpl 12(%eax), %edx +; X86-NEXT: je .LBB17_5 +; X86-NEXT: .LBB17_1: # %res_block +; X86-NEXT: movl $1, %ecx +; X86-NEXT: .LBB17_5: # %endblock +; X86-NEXT: testl %ecx, %ecx +; X86-NEXT: setne %al +; X86-NEXT: retl ; -; X64-AVX2-LABEL: length16_eq: -; X64-AVX2: # BB#0: -; X64-AVX2-NEXT: vmovdqu (%rdi), %xmm0 -; X64-AVX2-NEXT: vpcmpeqb (%rsi), %xmm0, %xmm0 -; X64-AVX2-NEXT: vpmovmskb %xmm0, %eax -; X64-AVX2-NEXT: cmpl $65535, %eax # imm = 0xFFFF -; X64-AVX2-NEXT: setne %al -; X64-AVX2-NEXT: retq +; X64-LABEL: length16_eq: +; X64: # BB#0: # %loadbb +; X64-NEXT: movq (%rdi), %rax +; X64-NEXT: cmpq (%rsi), %rax +; X64-NEXT: jne .LBB17_1 +; X64-NEXT: # BB#2: # %loadbb1 +; X64-NEXT: movq 8(%rdi), %rcx +; X64-NEXT: xorl %eax, %eax +; X64-NEXT: cmpq 8(%rsi), %rcx +; X64-NEXT: je .LBB17_3 +; X64-NEXT: .LBB17_1: # %res_block +; X64-NEXT: movl $1, %eax +; X64-NEXT: .LBB17_3: # %endblock +; X64-NEXT: testl %eax, %eax +; X64-NEXT: setne %al +; X64-NEXT: retq %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 16) nounwind %cmp = icmp ne i32 %call, 0 ret i1 %cmp } define i1 @length16_eq_const(i8* %X) nounwind { -; X86-NOSSE-LABEL: length16_eq_const: -; X86-NOSSE: # BB#0: -; X86-NOSSE-NEXT: pushl $0 -; X86-NOSSE-NEXT: pushl $16 -; X86-NOSSE-NEXT: pushl $.L.str -; X86-NOSSE-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NOSSE-NEXT: calll memcmp -; X86-NOSSE-NEXT: addl $16, %esp -; X86-NOSSE-NEXT: testl %eax, %eax -; X86-NOSSE-NEXT: sete %al -; X86-NOSSE-NEXT: retl -; -; X86-SSE2-LABEL: length16_eq_const: -; X86-SSE2: # BB#0: -; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-SSE2-NEXT: movdqu (%eax), %xmm0 -; X86-SSE2-NEXT: pcmpeqb {{\.LCPI.*}}, %xmm0 -; X86-SSE2-NEXT: pmovmskb %xmm0, %eax -; X86-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF -; X86-SSE2-NEXT: sete %al -; X86-SSE2-NEXT: retl -; -; X64-SSE2-LABEL: length16_eq_const: -; X64-SSE2: # BB#0: -; X64-SSE2-NEXT: movdqu (%rdi), %xmm0 -; X64-SSE2-NEXT: pcmpeqb {{.*}}(%rip), %xmm0 -; X64-SSE2-NEXT: pmovmskb %xmm0, %eax -; X64-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF -; X64-SSE2-NEXT: sete %al -; X64-SSE2-NEXT: retq +; X86-LABEL: length16_eq_const: +; X86: # BB#0: # %loadbb +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpl $858927408, (%eax) # imm = 0x33323130 +; X86-NEXT: jne .LBB18_1 +; X86-NEXT: # BB#2: # %loadbb1 +; X86-NEXT: cmpl $926299444, 4(%eax) # imm = 0x37363534 +; X86-NEXT: jne .LBB18_1 +; X86-NEXT: # BB#3: # %loadbb2 +; X86-NEXT: cmpl $825243960, 8(%eax) # imm = 0x31303938 +; X86-NEXT: jne .LBB18_1 +; X86-NEXT: # BB#4: # %loadbb3 +; X86-NEXT: xorl %ecx, %ecx +; X86-NEXT: cmpl $892613426, 12(%eax) # imm = 0x35343332 +; X86-NEXT: je .LBB18_5 +; X86-NEXT: .LBB18_1: # %res_block +; X86-NEXT: movl $1, %ecx +; X86-NEXT: .LBB18_5: # %endblock +; X86-NEXT: testl %ecx, %ecx +; X86-NEXT: sete %al +; X86-NEXT: retl ; -; X64-AVX2-LABEL: length16_eq_const: -; X64-AVX2: # BB#0: -; X64-AVX2-NEXT: vmovdqu (%rdi), %xmm0 -; X64-AVX2-NEXT: vpcmpeqb {{.*}}(%rip), %xmm0, %xmm0 -; X64-AVX2-NEXT: vpmovmskb %xmm0, %eax -; X64-AVX2-NEXT: cmpl $65535, %eax # imm = 0xFFFF -; X64-AVX2-NEXT: sete %al -; X64-AVX2-NEXT: retq +; X64-LABEL: length16_eq_const: +; X64: # BB#0: # %loadbb +; X64-NEXT: movabsq $3978425819141910832, %rax # imm = 0x3736353433323130 +; X64-NEXT: cmpq %rax, (%rdi) +; X64-NEXT: jne .LBB18_1 +; X64-NEXT: # BB#2: # %loadbb1 +; X64-NEXT: xorl %eax, %eax +; X64-NEXT: movabsq $3833745473465760056, %rcx # imm = 0x3534333231303938 +; X64-NEXT: cmpq %rcx, 8(%rdi) +; X64-NEXT: je .LBB18_3 +; X64-NEXT: .LBB18_1: # %res_block +; X64-NEXT: movl $1, %eax +; X64-NEXT: .LBB18_3: # %endblock +; X64-NEXT: testl %eax, %eax +; X64-NEXT: sete %al +; X64-NEXT: retq %m = tail call i32 @memcmp(i8* %X, i8* getelementptr inbounds ([65 x i8], [65 x i8]* @.str, i32 0, i32 0), i64 16) nounwind %c = icmp eq i32 %m, 0 ret i1 %c @@ -526,9 +764,43 @@ define i32 @length32(i8* %X, i8* %Y) nounwind { ; X86-NEXT: retl ; ; X64-LABEL: length32: -; X64: # BB#0: -; X64-NEXT: movl $32, %edx -; X64-NEXT: jmp memcmp # TAILCALL +; X64: # BB#0: # %loadbb +; X64-NEXT: movq (%rdi), %rcx +; X64-NEXT: movq (%rsi), %rdx +; X64-NEXT: bswapq %rcx +; X64-NEXT: bswapq %rdx +; X64-NEXT: cmpq %rdx, %rcx +; X64-NEXT: jne .LBB19_1 +; X64-NEXT: # BB#2: # %loadbb1 +; X64-NEXT: movq 8(%rdi), %rcx +; X64-NEXT: movq 8(%rsi), %rdx +; X64-NEXT: bswapq %rcx +; X64-NEXT: bswapq %rdx +; X64-NEXT: cmpq %rdx, %rcx +; X64-NEXT: jne .LBB19_1 +; X64-NEXT: # BB#3: # %loadbb2 +; X64-NEXT: movq 16(%rdi), %rcx +; X64-NEXT: movq 16(%rsi), %rdx +; X64-NEXT: bswapq %rcx +; X64-NEXT: bswapq %rdx +; X64-NEXT: cmpq %rdx, %rcx +; X64-NEXT: jne .LBB19_1 +; X64-NEXT: # BB#4: # %loadbb3 +; X64-NEXT: movq 24(%rdi), %rcx +; X64-NEXT: movq 24(%rsi), %rdx +; X64-NEXT: bswapq %rcx +; X64-NEXT: bswapq %rdx +; X64-NEXT: xorl %eax, %eax +; X64-NEXT: cmpq %rdx, %rcx +; X64-NEXT: jne .LBB19_1 +; X64-NEXT: # BB#5: # %endblock +; X64-NEXT: retq +; X64-NEXT: .LBB19_1: # %res_block +; X64-NEXT: cmpq %rdx, %rcx +; X64-NEXT: movl $-1, %ecx +; X64-NEXT: movl $1, %eax +; X64-NEXT: cmovbl %ecx, %eax +; X64-NEXT: retq %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 32) nounwind ret i32 %m } @@ -548,25 +820,30 @@ define i1 @length32_eq(i8* %x, i8* %y) nounwind { ; X86-NEXT: sete %al ; X86-NEXT: retl ; -; X64-SSE2-LABEL: length32_eq: -; X64-SSE2: # BB#0: -; X64-SSE2-NEXT: pushq %rax -; X64-SSE2-NEXT: movl $32, %edx -; X64-SSE2-NEXT: callq memcmp -; X64-SSE2-NEXT: testl %eax, %eax -; X64-SSE2-NEXT: sete %al -; X64-SSE2-NEXT: popq %rcx -; X64-SSE2-NEXT: retq -; -; X64-AVX2-LABEL: length32_eq: -; X64-AVX2: # BB#0: -; X64-AVX2-NEXT: vmovdqu (%rdi), %ymm0 -; X64-AVX2-NEXT: vpcmpeqb (%rsi), %ymm0, %ymm0 -; X64-AVX2-NEXT: vpmovmskb %ymm0, %eax -; X64-AVX2-NEXT: cmpl $-1, %eax -; X64-AVX2-NEXT: sete %al -; X64-AVX2-NEXT: vzeroupper -; X64-AVX2-NEXT: retq +; X64-LABEL: length32_eq: +; X64: # BB#0: # %loadbb +; X64-NEXT: movq (%rdi), %rax +; X64-NEXT: cmpq (%rsi), %rax +; X64-NEXT: jne .LBB20_1 +; X64-NEXT: # BB#2: # %loadbb1 +; X64-NEXT: movq 8(%rdi), %rax +; X64-NEXT: cmpq 8(%rsi), %rax +; X64-NEXT: jne .LBB20_1 +; X64-NEXT: # BB#3: # %loadbb2 +; X64-NEXT: movq 16(%rdi), %rax +; X64-NEXT: cmpq 16(%rsi), %rax +; X64-NEXT: jne .LBB20_1 +; X64-NEXT: # BB#4: # %loadbb3 +; X64-NEXT: movq 24(%rdi), %rcx +; X64-NEXT: xorl %eax, %eax +; X64-NEXT: cmpq 24(%rsi), %rcx +; X64-NEXT: je .LBB20_5 +; X64-NEXT: .LBB20_1: # %res_block +; X64-NEXT: movl $1, %eax +; X64-NEXT: .LBB20_5: # %endblock +; X64-NEXT: testl %eax, %eax +; X64-NEXT: sete %al +; X64-NEXT: retq %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 32) nounwind %cmp = icmp eq i32 %call, 0 ret i1 %cmp @@ -585,26 +862,30 @@ define i1 @length32_eq_const(i8* %X) nounwind { ; X86-NEXT: setne %al ; X86-NEXT: retl ; -; X64-SSE2-LABEL: length32_eq_const: -; X64-SSE2: # BB#0: -; X64-SSE2-NEXT: pushq %rax -; X64-SSE2-NEXT: movl $.L.str, %esi -; X64-SSE2-NEXT: movl $32, %edx -; X64-SSE2-NEXT: callq memcmp -; X64-SSE2-NEXT: testl %eax, %eax -; X64-SSE2-NEXT: setne %al -; X64-SSE2-NEXT: popq %rcx -; X64-SSE2-NEXT: retq -; -; X64-AVX2-LABEL: length32_eq_const: -; X64-AVX2: # BB#0: -; X64-AVX2-NEXT: vmovdqu (%rdi), %ymm0 -; X64-AVX2-NEXT: vpcmpeqb {{.*}}(%rip), %ymm0, %ymm0 -; X64-AVX2-NEXT: vpmovmskb %ymm0, %eax -; X64-AVX2-NEXT: cmpl $-1, %eax -; X64-AVX2-NEXT: setne %al -; X64-AVX2-NEXT: vzeroupper -; X64-AVX2-NEXT: retq +; X64-LABEL: length32_eq_const: +; X64: # BB#0: # %loadbb +; X64-NEXT: movabsq $3978425819141910832, %rax # imm = 0x3736353433323130 +; X64-NEXT: cmpq %rax, (%rdi) +; X64-NEXT: jne .LBB21_1 +; X64-NEXT: # BB#2: # %loadbb1 +; X64-NEXT: movabsq $3833745473465760056, %rax # imm = 0x3534333231303938 +; X64-NEXT: cmpq %rax, 8(%rdi) +; X64-NEXT: jne .LBB21_1 +; X64-NEXT: # BB#3: # %loadbb2 +; X64-NEXT: movabsq $3689065127958034230, %rax # imm = 0x3332313039383736 +; X64-NEXT: cmpq %rax, 16(%rdi) +; X64-NEXT: jne .LBB21_1 +; X64-NEXT: # BB#4: # %loadbb3 +; X64-NEXT: xorl %eax, %eax +; X64-NEXT: movabsq $3544395820347831604, %rcx # imm = 0x3130393837363534 +; X64-NEXT: cmpq %rcx, 24(%rdi) +; X64-NEXT: je .LBB21_5 +; X64-NEXT: .LBB21_1: # %res_block +; X64-NEXT: movl $1, %eax +; X64-NEXT: .LBB21_5: # %endblock +; X64-NEXT: testl %eax, %eax +; X64-NEXT: setne %al +; X64-NEXT: retq %m = tail call i32 @memcmp(i8* %X, i8* getelementptr inbounds ([65 x i8], [65 x i8]* @.str, i32 0, i32 0), i64 32) nounwind %c = icmp ne i32 %m, 0 ret i1 %c diff --git a/test/Transforms/CodeGenPrepare/X86/memcmp.ll b/test/Transforms/CodeGenPrepare/X86/memcmp.ll index 4b9e7c3956f..1dfc0876196 100644 --- a/test/Transforms/CodeGenPrepare/X86/memcmp.ll +++ b/test/Transforms/CodeGenPrepare/X86/memcmp.ll @@ -23,9 +23,63 @@ define i32 @cmp2(i8* nocapture readonly %x, i8* nocapture readonly %y) { } define i32 @cmp3(i8* nocapture readonly %x, i8* nocapture readonly %y) { -; ALL-LABEL: @cmp3( -; ALL-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* [[X:%.*]], i8* [[Y:%.*]], i64 3) -; ALL-NEXT: ret i32 [[CALL]] +; X32-LABEL: @cmp3( +; X32-NEXT: loadbb: +; X32-NEXT: [[TMP0:%.*]] = bitcast i8* [[X:%.*]] to i16* +; X32-NEXT: [[TMP1:%.*]] = bitcast i8* [[Y:%.*]] to i16* +; X32-NEXT: [[TMP2:%.*]] = load i16, i16* [[TMP0]] +; X32-NEXT: [[TMP3:%.*]] = load i16, i16* [[TMP1]] +; X32-NEXT: [[TMP4:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP2]]) +; X32-NEXT: [[TMP5:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP3]]) +; X32-NEXT: [[TMP6:%.*]] = zext i16 [[TMP4]] to i32 +; X32-NEXT: [[TMP7:%.*]] = zext i16 [[TMP5]] to i32 +; X32-NEXT: [[TMP8:%.*]] = icmp eq i32 [[TMP6]], [[TMP7]] +; X32-NEXT: br i1 [[TMP8]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]] +; X32: res_block: +; X32-NEXT: [[TMP9:%.*]] = icmp ult i32 [[TMP6]], [[TMP7]] +; X32-NEXT: [[TMP10:%.*]] = select i1 [[TMP9]], i32 -1, i32 1 +; X32-NEXT: br label [[ENDBLOCK:%.*]] +; X32: loadbb1: +; X32-NEXT: [[TMP11:%.*]] = getelementptr i8, i8* [[X]], i8 2 +; X32-NEXT: [[TMP12:%.*]] = getelementptr i8, i8* [[Y]], i8 2 +; X32-NEXT: [[TMP13:%.*]] = load i8, i8* [[TMP11]] +; X32-NEXT: [[TMP14:%.*]] = load i8, i8* [[TMP12]] +; X32-NEXT: [[TMP15:%.*]] = zext i8 [[TMP13]] to i32 +; X32-NEXT: [[TMP16:%.*]] = zext i8 [[TMP14]] to i32 +; X32-NEXT: [[TMP17:%.*]] = sub i32 [[TMP15]], [[TMP16]] +; X32-NEXT: br label [[ENDBLOCK]] +; X32: endblock: +; X32-NEXT: [[PHI_RES:%.*]] = phi i32 [ [[TMP17]], [[LOADBB1]] ], [ [[TMP10]], [[RES_BLOCK]] ] +; X32-NEXT: ret i32 [[PHI_RES]] +; +; X64-LABEL: @cmp3( +; X64-NEXT: loadbb: +; X64-NEXT: [[TMP0:%.*]] = bitcast i8* [[X:%.*]] to i16* +; X64-NEXT: [[TMP1:%.*]] = bitcast i8* [[Y:%.*]] to i16* +; X64-NEXT: [[TMP2:%.*]] = load i16, i16* [[TMP0]] +; X64-NEXT: [[TMP3:%.*]] = load i16, i16* [[TMP1]] +; X64-NEXT: [[TMP4:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP2]]) +; X64-NEXT: [[TMP5:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP3]]) +; X64-NEXT: [[TMP6:%.*]] = zext i16 [[TMP4]] to i64 +; X64-NEXT: [[TMP7:%.*]] = zext i16 [[TMP5]] to i64 +; X64-NEXT: [[TMP8:%.*]] = icmp eq i64 [[TMP6]], [[TMP7]] +; X64-NEXT: br i1 [[TMP8]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]] +; X64: res_block: +; X64-NEXT: [[TMP9:%.*]] = icmp ult i64 [[TMP6]], [[TMP7]] +; X64-NEXT: [[TMP10:%.*]] = select i1 [[TMP9]], i32 -1, i32 1 +; X64-NEXT: br label [[ENDBLOCK:%.*]] +; X64: loadbb1: +; X64-NEXT: [[TMP11:%.*]] = getelementptr i8, i8* [[X]], i8 2 +; X64-NEXT: [[TMP12:%.*]] = getelementptr i8, i8* [[Y]], i8 2 +; X64-NEXT: [[TMP13:%.*]] = load i8, i8* [[TMP11]] +; X64-NEXT: [[TMP14:%.*]] = load i8, i8* [[TMP12]] +; X64-NEXT: [[TMP15:%.*]] = zext i8 [[TMP13]] to i32 +; X64-NEXT: [[TMP16:%.*]] = zext i8 [[TMP14]] to i32 +; X64-NEXT: [[TMP17:%.*]] = sub i32 [[TMP15]], [[TMP16]] +; X64-NEXT: br label [[ENDBLOCK]] +; X64: endblock: +; X64-NEXT: [[PHI_RES:%.*]] = phi i32 [ [[TMP17]], [[LOADBB1]] ], [ [[TMP10]], [[RES_BLOCK]] ] +; X64-NEXT: ret i32 [[PHI_RES]] ; %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 3) ret i32 %call @@ -50,27 +104,225 @@ define i32 @cmp4(i8* nocapture readonly %x, i8* nocapture readonly %y) { } define i32 @cmp5(i8* nocapture readonly %x, i8* nocapture readonly %y) { -; ALL-LABEL: @cmp5( -; ALL-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* [[X:%.*]], i8* [[Y:%.*]], i64 5) -; ALL-NEXT: ret i32 [[CALL]] +; X32-LABEL: @cmp5( +; X32-NEXT: loadbb: +; X32-NEXT: [[TMP0:%.*]] = bitcast i8* [[X:%.*]] to i32* +; X32-NEXT: [[TMP1:%.*]] = bitcast i8* [[Y:%.*]] to i32* +; X32-NEXT: [[TMP2:%.*]] = load i32, i32* [[TMP0]] +; X32-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP1]] +; X32-NEXT: [[TMP4:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP2]]) +; X32-NEXT: [[TMP5:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP3]]) +; X32-NEXT: [[TMP6:%.*]] = icmp eq i32 [[TMP4]], [[TMP5]] +; X32-NEXT: br i1 [[TMP6]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]] +; X32: res_block: +; X32-NEXT: [[TMP7:%.*]] = icmp ult i32 [[TMP4]], [[TMP5]] +; X32-NEXT: [[TMP8:%.*]] = select i1 [[TMP7]], i32 -1, i32 1 +; X32-NEXT: br label [[ENDBLOCK:%.*]] +; X32: loadbb1: +; X32-NEXT: [[TMP9:%.*]] = getelementptr i8, i8* [[X]], i8 4 +; X32-NEXT: [[TMP10:%.*]] = getelementptr i8, i8* [[Y]], i8 4 +; X32-NEXT: [[TMP11:%.*]] = load i8, i8* [[TMP9]] +; X32-NEXT: [[TMP12:%.*]] = load i8, i8* [[TMP10]] +; X32-NEXT: [[TMP13:%.*]] = zext i8 [[TMP11]] to i32 +; X32-NEXT: [[TMP14:%.*]] = zext i8 [[TMP12]] to i32 +; X32-NEXT: [[TMP15:%.*]] = sub i32 [[TMP13]], [[TMP14]] +; X32-NEXT: br label [[ENDBLOCK]] +; X32: endblock: +; X32-NEXT: [[PHI_RES:%.*]] = phi i32 [ [[TMP15]], [[LOADBB1]] ], [ [[TMP8]], [[RES_BLOCK]] ] +; X32-NEXT: ret i32 [[PHI_RES]] +; +; X64-LABEL: @cmp5( +; X64-NEXT: loadbb: +; X64-NEXT: [[TMP0:%.*]] = bitcast i8* [[X:%.*]] to i32* +; X64-NEXT: [[TMP1:%.*]] = bitcast i8* [[Y:%.*]] to i32* +; X64-NEXT: [[TMP2:%.*]] = load i32, i32* [[TMP0]] +; X64-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP1]] +; X64-NEXT: [[TMP4:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP2]]) +; X64-NEXT: [[TMP5:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP3]]) +; X64-NEXT: [[TMP6:%.*]] = zext i32 [[TMP4]] to i64 +; X64-NEXT: [[TMP7:%.*]] = zext i32 [[TMP5]] to i64 +; X64-NEXT: [[TMP8:%.*]] = icmp eq i64 [[TMP6]], [[TMP7]] +; X64-NEXT: br i1 [[TMP8]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]] +; X64: res_block: +; X64-NEXT: [[TMP9:%.*]] = icmp ult i64 [[TMP6]], [[TMP7]] +; X64-NEXT: [[TMP10:%.*]] = select i1 [[TMP9]], i32 -1, i32 1 +; X64-NEXT: br label [[ENDBLOCK:%.*]] +; X64: loadbb1: +; X64-NEXT: [[TMP11:%.*]] = getelementptr i8, i8* [[X]], i8 4 +; X64-NEXT: [[TMP12:%.*]] = getelementptr i8, i8* [[Y]], i8 4 +; X64-NEXT: [[TMP13:%.*]] = load i8, i8* [[TMP11]] +; X64-NEXT: [[TMP14:%.*]] = load i8, i8* [[TMP12]] +; X64-NEXT: [[TMP15:%.*]] = zext i8 [[TMP13]] to i32 +; X64-NEXT: [[TMP16:%.*]] = zext i8 [[TMP14]] to i32 +; X64-NEXT: [[TMP17:%.*]] = sub i32 [[TMP15]], [[TMP16]] +; X64-NEXT: br label [[ENDBLOCK]] +; X64: endblock: +; X64-NEXT: [[PHI_RES:%.*]] = phi i32 [ [[TMP17]], [[LOADBB1]] ], [ [[TMP10]], [[RES_BLOCK]] ] +; X64-NEXT: ret i32 [[PHI_RES]] ; %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 5) ret i32 %call } define i32 @cmp6(i8* nocapture readonly %x, i8* nocapture readonly %y) { -; ALL-LABEL: @cmp6( -; ALL-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* [[X:%.*]], i8* [[Y:%.*]], i64 6) -; ALL-NEXT: ret i32 [[CALL]] +; X32-LABEL: @cmp6( +; X32-NEXT: loadbb: +; X32-NEXT: [[TMP0:%.*]] = bitcast i8* [[X:%.*]] to i32* +; X32-NEXT: [[TMP1:%.*]] = bitcast i8* [[Y:%.*]] to i32* +; X32-NEXT: [[TMP2:%.*]] = load i32, i32* [[TMP0]] +; X32-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP1]] +; X32-NEXT: [[TMP4:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP2]]) +; X32-NEXT: [[TMP5:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP3]]) +; X32-NEXT: [[TMP6:%.*]] = icmp eq i32 [[TMP4]], [[TMP5]] +; X32-NEXT: br i1 [[TMP6]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]] +; X32: res_block: +; X32-NEXT: [[PHI_SRC1:%.*]] = phi i32 [ [[TMP4]], [[LOADBB:%.*]] ], [ [[TMP17:%.*]], [[LOADBB1]] ] +; X32-NEXT: [[PHI_SRC2:%.*]] = phi i32 [ [[TMP5]], [[LOADBB]] ], [ [[TMP18:%.*]], [[LOADBB1]] ] +; X32-NEXT: [[TMP7:%.*]] = icmp ult i32 [[PHI_SRC1]], [[PHI_SRC2]] +; X32-NEXT: [[TMP8:%.*]] = select i1 [[TMP7]], i32 -1, i32 1 +; X32-NEXT: br label [[ENDBLOCK:%.*]] +; X32: loadbb1: +; X32-NEXT: [[TMP9:%.*]] = bitcast i8* [[X]] to i16* +; X32-NEXT: [[TMP10:%.*]] = bitcast i8* [[Y]] to i16* +; X32-NEXT: [[TMP11:%.*]] = getelementptr i16, i16* [[TMP9]], i16 2 +; X32-NEXT: [[TMP12:%.*]] = getelementptr i16, i16* [[TMP10]], i16 2 +; X32-NEXT: [[TMP13:%.*]] = load i16, i16* [[TMP11]] +; X32-NEXT: [[TMP14:%.*]] = load i16, i16* [[TMP12]] +; X32-NEXT: [[TMP15:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP13]]) +; X32-NEXT: [[TMP16:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP14]]) +; X32-NEXT: [[TMP17]] = zext i16 [[TMP15]] to i32 +; X32-NEXT: [[TMP18]] = zext i16 [[TMP16]] to i32 +; X32-NEXT: [[TMP19:%.*]] = icmp eq i32 [[TMP17]], [[TMP18]] +; X32-NEXT: br i1 [[TMP19]], label [[ENDBLOCK]], label [[RES_BLOCK]] +; X32: endblock: +; X32-NEXT: [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP8]], [[RES_BLOCK]] ] +; X32-NEXT: ret i32 [[PHI_RES]] +; +; X64-LABEL: @cmp6( +; X64-NEXT: loadbb: +; X64-NEXT: [[TMP0:%.*]] = bitcast i8* [[X:%.*]] to i32* +; X64-NEXT: [[TMP1:%.*]] = bitcast i8* [[Y:%.*]] to i32* +; X64-NEXT: [[TMP2:%.*]] = load i32, i32* [[TMP0]] +; X64-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP1]] +; X64-NEXT: [[TMP4:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP2]]) +; X64-NEXT: [[TMP5:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP3]]) +; X64-NEXT: [[TMP6:%.*]] = zext i32 [[TMP4]] to i64 +; X64-NEXT: [[TMP7:%.*]] = zext i32 [[TMP5]] to i64 +; X64-NEXT: [[TMP8:%.*]] = icmp eq i64 [[TMP6]], [[TMP7]] +; X64-NEXT: br i1 [[TMP8]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]] +; X64: res_block: +; X64-NEXT: [[PHI_SRC1:%.*]] = phi i64 [ [[TMP6]], [[LOADBB:%.*]] ], [ [[TMP19:%.*]], [[LOADBB1]] ] +; X64-NEXT: [[PHI_SRC2:%.*]] = phi i64 [ [[TMP7]], [[LOADBB]] ], [ [[TMP20:%.*]], [[LOADBB1]] ] +; X64-NEXT: [[TMP9:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]] +; X64-NEXT: [[TMP10:%.*]] = select i1 [[TMP9]], i32 -1, i32 1 +; X64-NEXT: br label [[ENDBLOCK:%.*]] +; X64: loadbb1: +; X64-NEXT: [[TMP11:%.*]] = bitcast i8* [[X]] to i16* +; X64-NEXT: [[TMP12:%.*]] = bitcast i8* [[Y]] to i16* +; X64-NEXT: [[TMP13:%.*]] = getelementptr i16, i16* [[TMP11]], i16 2 +; X64-NEXT: [[TMP14:%.*]] = getelementptr i16, i16* [[TMP12]], i16 2 +; X64-NEXT: [[TMP15:%.*]] = load i16, i16* [[TMP13]] +; X64-NEXT: [[TMP16:%.*]] = load i16, i16* [[TMP14]] +; X64-NEXT: [[TMP17:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP15]]) +; X64-NEXT: [[TMP18:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP16]]) +; X64-NEXT: [[TMP19]] = zext i16 [[TMP17]] to i64 +; X64-NEXT: [[TMP20]] = zext i16 [[TMP18]] to i64 +; X64-NEXT: [[TMP21:%.*]] = icmp eq i64 [[TMP19]], [[TMP20]] +; X64-NEXT: br i1 [[TMP21]], label [[ENDBLOCK]], label [[RES_BLOCK]] +; X64: endblock: +; X64-NEXT: [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP10]], [[RES_BLOCK]] ] +; X64-NEXT: ret i32 [[PHI_RES]] ; %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 6) ret i32 %call } define i32 @cmp7(i8* nocapture readonly %x, i8* nocapture readonly %y) { -; ALL-LABEL: @cmp7( -; ALL-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* [[X:%.*]], i8* [[Y:%.*]], i64 7) -; ALL-NEXT: ret i32 [[CALL]] +; X32-LABEL: @cmp7( +; X32-NEXT: loadbb: +; X32-NEXT: [[TMP0:%.*]] = bitcast i8* [[X:%.*]] to i32* +; X32-NEXT: [[TMP1:%.*]] = bitcast i8* [[Y:%.*]] to i32* +; X32-NEXT: [[TMP2:%.*]] = load i32, i32* [[TMP0]] +; X32-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP1]] +; X32-NEXT: [[TMP4:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP2]]) +; X32-NEXT: [[TMP5:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP3]]) +; X32-NEXT: [[TMP6:%.*]] = icmp eq i32 [[TMP4]], [[TMP5]] +; X32-NEXT: br i1 [[TMP6]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]] +; X32: res_block: +; X32-NEXT: [[PHI_SRC1:%.*]] = phi i32 [ [[TMP4]], [[LOADBB:%.*]] ], [ [[TMP17:%.*]], [[LOADBB1]] ] +; X32-NEXT: [[PHI_SRC2:%.*]] = phi i32 [ [[TMP5]], [[LOADBB]] ], [ [[TMP18:%.*]], [[LOADBB1]] ] +; X32-NEXT: [[TMP7:%.*]] = icmp ult i32 [[PHI_SRC1]], [[PHI_SRC2]] +; X32-NEXT: [[TMP8:%.*]] = select i1 [[TMP7]], i32 -1, i32 1 +; X32-NEXT: br label [[ENDBLOCK:%.*]] +; X32: loadbb1: +; X32-NEXT: [[TMP9:%.*]] = bitcast i8* [[X]] to i16* +; X32-NEXT: [[TMP10:%.*]] = bitcast i8* [[Y]] to i16* +; X32-NEXT: [[TMP11:%.*]] = getelementptr i16, i16* [[TMP9]], i16 2 +; X32-NEXT: [[TMP12:%.*]] = getelementptr i16, i16* [[TMP10]], i16 2 +; X32-NEXT: [[TMP13:%.*]] = load i16, i16* [[TMP11]] +; X32-NEXT: [[TMP14:%.*]] = load i16, i16* [[TMP12]] +; X32-NEXT: [[TMP15:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP13]]) +; X32-NEXT: [[TMP16:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP14]]) +; X32-NEXT: [[TMP17]] = zext i16 [[TMP15]] to i32 +; X32-NEXT: [[TMP18]] = zext i16 [[TMP16]] to i32 +; X32-NEXT: [[TMP19:%.*]] = icmp eq i32 [[TMP17]], [[TMP18]] +; X32-NEXT: br i1 [[TMP19]], label [[LOADBB2:%.*]], label [[RES_BLOCK]] +; X32: loadbb2: +; X32-NEXT: [[TMP20:%.*]] = getelementptr i8, i8* [[X]], i8 6 +; X32-NEXT: [[TMP21:%.*]] = getelementptr i8, i8* [[Y]], i8 6 +; X32-NEXT: [[TMP22:%.*]] = load i8, i8* [[TMP20]] +; X32-NEXT: [[TMP23:%.*]] = load i8, i8* [[TMP21]] +; X32-NEXT: [[TMP24:%.*]] = zext i8 [[TMP22]] to i32 +; X32-NEXT: [[TMP25:%.*]] = zext i8 [[TMP23]] to i32 +; X32-NEXT: [[TMP26:%.*]] = sub i32 [[TMP24]], [[TMP25]] +; X32-NEXT: br label [[ENDBLOCK]] +; X32: endblock: +; X32-NEXT: [[PHI_RES:%.*]] = phi i32 [ [[TMP26]], [[LOADBB2]] ], [ [[TMP8]], [[RES_BLOCK]] ] +; X32-NEXT: ret i32 [[PHI_RES]] +; +; X64-LABEL: @cmp7( +; X64-NEXT: loadbb: +; X64-NEXT: [[TMP0:%.*]] = bitcast i8* [[X:%.*]] to i32* +; X64-NEXT: [[TMP1:%.*]] = bitcast i8* [[Y:%.*]] to i32* +; X64-NEXT: [[TMP2:%.*]] = load i32, i32* [[TMP0]] +; X64-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP1]] +; X64-NEXT: [[TMP4:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP2]]) +; X64-NEXT: [[TMP5:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP3]]) +; X64-NEXT: [[TMP6:%.*]] = zext i32 [[TMP4]] to i64 +; X64-NEXT: [[TMP7:%.*]] = zext i32 [[TMP5]] to i64 +; X64-NEXT: [[TMP8:%.*]] = icmp eq i64 [[TMP6]], [[TMP7]] +; X64-NEXT: br i1 [[TMP8]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]] +; X64: res_block: +; X64-NEXT: [[PHI_SRC1:%.*]] = phi i64 [ [[TMP6]], [[LOADBB:%.*]] ], [ [[TMP19:%.*]], [[LOADBB1]] ] +; X64-NEXT: [[PHI_SRC2:%.*]] = phi i64 [ [[TMP7]], [[LOADBB]] ], [ [[TMP20:%.*]], [[LOADBB1]] ] +; X64-NEXT: [[TMP9:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]] +; X64-NEXT: [[TMP10:%.*]] = select i1 [[TMP9]], i32 -1, i32 1 +; X64-NEXT: br label [[ENDBLOCK:%.*]] +; X64: loadbb1: +; X64-NEXT: [[TMP11:%.*]] = bitcast i8* [[X]] to i16* +; X64-NEXT: [[TMP12:%.*]] = bitcast i8* [[Y]] to i16* +; X64-NEXT: [[TMP13:%.*]] = getelementptr i16, i16* [[TMP11]], i16 2 +; X64-NEXT: [[TMP14:%.*]] = getelementptr i16, i16* [[TMP12]], i16 2 +; X64-NEXT: [[TMP15:%.*]] = load i16, i16* [[TMP13]] +; X64-NEXT: [[TMP16:%.*]] = load i16, i16* [[TMP14]] +; X64-NEXT: [[TMP17:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP15]]) +; X64-NEXT: [[TMP18:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP16]]) +; X64-NEXT: [[TMP19]] = zext i16 [[TMP17]] to i64 +; X64-NEXT: [[TMP20]] = zext i16 [[TMP18]] to i64 +; X64-NEXT: [[TMP21:%.*]] = icmp eq i64 [[TMP19]], [[TMP20]] +; X64-NEXT: br i1 [[TMP21]], label [[LOADBB2:%.*]], label [[RES_BLOCK]] +; X64: loadbb2: +; X64-NEXT: [[TMP22:%.*]] = getelementptr i8, i8* [[X]], i8 6 +; X64-NEXT: [[TMP23:%.*]] = getelementptr i8, i8* [[Y]], i8 6 +; X64-NEXT: [[TMP24:%.*]] = load i8, i8* [[TMP22]] +; X64-NEXT: [[TMP25:%.*]] = load i8, i8* [[TMP23]] +; X64-NEXT: [[TMP26:%.*]] = zext i8 [[TMP24]] to i32 +; X64-NEXT: [[TMP27:%.*]] = zext i8 [[TMP25]] to i32 +; X64-NEXT: [[TMP28:%.*]] = sub i32 [[TMP26]], [[TMP27]] +; X64-NEXT: br label [[ENDBLOCK]] +; X64: endblock: +; X64-NEXT: [[PHI_RES:%.*]] = phi i32 [ [[TMP28]], [[LOADBB2]] ], [ [[TMP10]], [[RES_BLOCK]] ] +; X64-NEXT: ret i32 [[PHI_RES]] ; %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 7) ret i32 %call @@ -78,8 +330,35 @@ define i32 @cmp7(i8* nocapture readonly %x, i8* nocapture readonly %y) { define i32 @cmp8(i8* nocapture readonly %x, i8* nocapture readonly %y) { ; X32-LABEL: @cmp8( -; X32-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* [[X:%.*]], i8* [[Y:%.*]], i64 8) -; X32-NEXT: ret i32 [[CALL]] +; X32-NEXT: loadbb: +; X32-NEXT: [[TMP0:%.*]] = bitcast i8* [[X:%.*]] to i32* +; X32-NEXT: [[TMP1:%.*]] = bitcast i8* [[Y:%.*]] to i32* +; X32-NEXT: [[TMP2:%.*]] = load i32, i32* [[TMP0]] +; X32-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP1]] +; X32-NEXT: [[TMP4:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP2]]) +; X32-NEXT: [[TMP5:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP3]]) +; X32-NEXT: [[TMP6:%.*]] = icmp eq i32 [[TMP4]], [[TMP5]] +; X32-NEXT: br i1 [[TMP6]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]] +; X32: res_block: +; X32-NEXT: [[PHI_SRC1:%.*]] = phi i32 [ [[TMP4]], [[LOADBB:%.*]] ], [ [[TMP15:%.*]], [[LOADBB1]] ] +; X32-NEXT: [[PHI_SRC2:%.*]] = phi i32 [ [[TMP5]], [[LOADBB]] ], [ [[TMP16:%.*]], [[LOADBB1]] ] +; X32-NEXT: [[TMP7:%.*]] = icmp ult i32 [[PHI_SRC1]], [[PHI_SRC2]] +; X32-NEXT: [[TMP8:%.*]] = select i1 [[TMP7]], i32 -1, i32 1 +; X32-NEXT: br label [[ENDBLOCK:%.*]] +; X32: loadbb1: +; X32-NEXT: [[TMP9:%.*]] = bitcast i8* [[X]] to i32* +; X32-NEXT: [[TMP10:%.*]] = bitcast i8* [[Y]] to i32* +; X32-NEXT: [[TMP11:%.*]] = getelementptr i32, i32* [[TMP9]], i32 1 +; X32-NEXT: [[TMP12:%.*]] = getelementptr i32, i32* [[TMP10]], i32 1 +; X32-NEXT: [[TMP13:%.*]] = load i32, i32* [[TMP11]] +; X32-NEXT: [[TMP14:%.*]] = load i32, i32* [[TMP12]] +; X32-NEXT: [[TMP15]] = call i32 @llvm.bswap.i32(i32 [[TMP13]]) +; X32-NEXT: [[TMP16]] = call i32 @llvm.bswap.i32(i32 [[TMP14]]) +; X32-NEXT: [[TMP17:%.*]] = icmp eq i32 [[TMP15]], [[TMP16]] +; X32-NEXT: br i1 [[TMP17]], label [[ENDBLOCK]], label [[RES_BLOCK]] +; X32: endblock: +; X32-NEXT: [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP8]], [[RES_BLOCK]] ] +; X32-NEXT: ret i32 [[PHI_RES]] ; ; X64-LABEL: @cmp8( ; X64-NEXT: [[TMP1:%.*]] = bitcast i8* [[X:%.*]] to i64* @@ -99,72 +378,691 @@ define i32 @cmp8(i8* nocapture readonly %x, i8* nocapture readonly %y) { } define i32 @cmp9(i8* nocapture readonly %x, i8* nocapture readonly %y) { -; ALL-LABEL: @cmp9( -; ALL-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* [[X:%.*]], i8* [[Y:%.*]], i64 9) -; ALL-NEXT: ret i32 [[CALL]] +; X32-LABEL: @cmp9( +; X32-NEXT: loadbb: +; X32-NEXT: [[TMP0:%.*]] = bitcast i8* [[X:%.*]] to i32* +; X32-NEXT: [[TMP1:%.*]] = bitcast i8* [[Y:%.*]] to i32* +; X32-NEXT: [[TMP2:%.*]] = load i32, i32* [[TMP0]] +; X32-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP1]] +; X32-NEXT: [[TMP4:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP2]]) +; X32-NEXT: [[TMP5:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP3]]) +; X32-NEXT: [[TMP6:%.*]] = icmp eq i32 [[TMP4]], [[TMP5]] +; X32-NEXT: br i1 [[TMP6]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]] +; X32: res_block: +; X32-NEXT: [[PHI_SRC1:%.*]] = phi i32 [ [[TMP4]], [[LOADBB:%.*]] ], [ [[TMP15:%.*]], [[LOADBB1]] ] +; X32-NEXT: [[PHI_SRC2:%.*]] = phi i32 [ [[TMP5]], [[LOADBB]] ], [ [[TMP16:%.*]], [[LOADBB1]] ] +; X32-NEXT: [[TMP7:%.*]] = icmp ult i32 [[PHI_SRC1]], [[PHI_SRC2]] +; X32-NEXT: [[TMP8:%.*]] = select i1 [[TMP7]], i32 -1, i32 1 +; X32-NEXT: br label [[ENDBLOCK:%.*]] +; X32: loadbb1: +; X32-NEXT: [[TMP9:%.*]] = bitcast i8* [[X]] to i32* +; X32-NEXT: [[TMP10:%.*]] = bitcast i8* [[Y]] to i32* +; X32-NEXT: [[TMP11:%.*]] = getelementptr i32, i32* [[TMP9]], i32 1 +; X32-NEXT: [[TMP12:%.*]] = getelementptr i32, i32* [[TMP10]], i32 1 +; X32-NEXT: [[TMP13:%.*]] = load i32, i32* [[TMP11]] +; X32-NEXT: [[TMP14:%.*]] = load i32, i32* [[TMP12]] +; X32-NEXT: [[TMP15]] = call i32 @llvm.bswap.i32(i32 [[TMP13]]) +; X32-NEXT: [[TMP16]] = call i32 @llvm.bswap.i32(i32 [[TMP14]]) +; X32-NEXT: [[TMP17:%.*]] = icmp eq i32 [[TMP15]], [[TMP16]] +; X32-NEXT: br i1 [[TMP17]], label [[LOADBB2:%.*]], label [[RES_BLOCK]] +; X32: loadbb2: +; X32-NEXT: [[TMP18:%.*]] = getelementptr i8, i8* [[X]], i8 8 +; X32-NEXT: [[TMP19:%.*]] = getelementptr i8, i8* [[Y]], i8 8 +; X32-NEXT: [[TMP20:%.*]] = load i8, i8* [[TMP18]] +; X32-NEXT: [[TMP21:%.*]] = load i8, i8* [[TMP19]] +; X32-NEXT: [[TMP22:%.*]] = zext i8 [[TMP20]] to i32 +; X32-NEXT: [[TMP23:%.*]] = zext i8 [[TMP21]] to i32 +; X32-NEXT: [[TMP24:%.*]] = sub i32 [[TMP22]], [[TMP23]] +; X32-NEXT: br label [[ENDBLOCK]] +; X32: endblock: +; X32-NEXT: [[PHI_RES:%.*]] = phi i32 [ [[TMP24]], [[LOADBB2]] ], [ [[TMP8]], [[RES_BLOCK]] ] +; X32-NEXT: ret i32 [[PHI_RES]] +; +; X64-LABEL: @cmp9( +; X64-NEXT: loadbb: +; X64-NEXT: [[TMP0:%.*]] = bitcast i8* [[X:%.*]] to i64* +; X64-NEXT: [[TMP1:%.*]] = bitcast i8* [[Y:%.*]] to i64* +; X64-NEXT: [[TMP2:%.*]] = load i64, i64* [[TMP0]] +; X64-NEXT: [[TMP3:%.*]] = load i64, i64* [[TMP1]] +; X64-NEXT: [[TMP4:%.*]] = call i64 @llvm.bswap.i64(i64 [[TMP2]]) +; X64-NEXT: [[TMP5:%.*]] = call i64 @llvm.bswap.i64(i64 [[TMP3]]) +; X64-NEXT: [[TMP6:%.*]] = icmp eq i64 [[TMP4]], [[TMP5]] +; X64-NEXT: br i1 [[TMP6]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]] +; X64: res_block: +; X64-NEXT: [[TMP7:%.*]] = icmp ult i64 [[TMP4]], [[TMP5]] +; X64-NEXT: [[TMP8:%.*]] = select i1 [[TMP7]], i32 -1, i32 1 +; X64-NEXT: br label [[ENDBLOCK:%.*]] +; X64: loadbb1: +; X64-NEXT: [[TMP9:%.*]] = getelementptr i8, i8* [[X]], i8 8 +; X64-NEXT: [[TMP10:%.*]] = getelementptr i8, i8* [[Y]], i8 8 +; X64-NEXT: [[TMP11:%.*]] = load i8, i8* [[TMP9]] +; X64-NEXT: [[TMP12:%.*]] = load i8, i8* [[TMP10]] +; X64-NEXT: [[TMP13:%.*]] = zext i8 [[TMP11]] to i32 +; X64-NEXT: [[TMP14:%.*]] = zext i8 [[TMP12]] to i32 +; X64-NEXT: [[TMP15:%.*]] = sub i32 [[TMP13]], [[TMP14]] +; X64-NEXT: br label [[ENDBLOCK]] +; X64: endblock: +; X64-NEXT: [[PHI_RES:%.*]] = phi i32 [ [[TMP15]], [[LOADBB1]] ], [ [[TMP8]], [[RES_BLOCK]] ] +; X64-NEXT: ret i32 [[PHI_RES]] ; %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 9) ret i32 %call } define i32 @cmp10(i8* nocapture readonly %x, i8* nocapture readonly %y) { -; ALL-LABEL: @cmp10( -; ALL-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* [[X:%.*]], i8* [[Y:%.*]], i64 10) -; ALL-NEXT: ret i32 [[CALL]] +; X32-LABEL: @cmp10( +; X32-NEXT: loadbb: +; X32-NEXT: [[TMP0:%.*]] = bitcast i8* [[X:%.*]] to i32* +; X32-NEXT: [[TMP1:%.*]] = bitcast i8* [[Y:%.*]] to i32* +; X32-NEXT: [[TMP2:%.*]] = load i32, i32* [[TMP0]] +; X32-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP1]] +; X32-NEXT: [[TMP4:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP2]]) +; X32-NEXT: [[TMP5:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP3]]) +; X32-NEXT: [[TMP6:%.*]] = icmp eq i32 [[TMP4]], [[TMP5]] +; X32-NEXT: br i1 [[TMP6]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]] +; X32: res_block: +; X32-NEXT: [[PHI_SRC1:%.*]] = phi i32 [ [[TMP4]], [[LOADBB:%.*]] ], [ [[TMP15:%.*]], [[LOADBB1]] ], [ [[TMP26:%.*]], [[LOADBB2:%.*]] ] +; X32-NEXT: [[PHI_SRC2:%.*]] = phi i32 [ [[TMP5]], [[LOADBB]] ], [ [[TMP16:%.*]], [[LOADBB1]] ], [ [[TMP27:%.*]], [[LOADBB2]] ] +; X32-NEXT: [[TMP7:%.*]] = icmp ult i32 [[PHI_SRC1]], [[PHI_SRC2]] +; X32-NEXT: [[TMP8:%.*]] = select i1 [[TMP7]], i32 -1, i32 1 +; X32-NEXT: br label [[ENDBLOCK:%.*]] +; X32: loadbb1: +; X32-NEXT: [[TMP9:%.*]] = bitcast i8* [[X]] to i32* +; X32-NEXT: [[TMP10:%.*]] = bitcast i8* [[Y]] to i32* +; X32-NEXT: [[TMP11:%.*]] = getelementptr i32, i32* [[TMP9]], i32 1 +; X32-NEXT: [[TMP12:%.*]] = getelementptr i32, i32* [[TMP10]], i32 1 +; X32-NEXT: [[TMP13:%.*]] = load i32, i32* [[TMP11]] +; X32-NEXT: [[TMP14:%.*]] = load i32, i32* [[TMP12]] +; X32-NEXT: [[TMP15]] = call i32 @llvm.bswap.i32(i32 [[TMP13]]) +; X32-NEXT: [[TMP16]] = call i32 @llvm.bswap.i32(i32 [[TMP14]]) +; X32-NEXT: [[TMP17:%.*]] = icmp eq i32 [[TMP15]], [[TMP16]] +; X32-NEXT: br i1 [[TMP17]], label [[LOADBB2]], label [[RES_BLOCK]] +; X32: loadbb2: +; X32-NEXT: [[TMP18:%.*]] = bitcast i8* [[X]] to i16* +; X32-NEXT: [[TMP19:%.*]] = bitcast i8* [[Y]] to i16* +; X32-NEXT: [[TMP20:%.*]] = getelementptr i16, i16* [[TMP18]], i16 4 +; X32-NEXT: [[TMP21:%.*]] = getelementptr i16, i16* [[TMP19]], i16 4 +; X32-NEXT: [[TMP22:%.*]] = load i16, i16* [[TMP20]] +; X32-NEXT: [[TMP23:%.*]] = load i16, i16* [[TMP21]] +; X32-NEXT: [[TMP24:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP22]]) +; X32-NEXT: [[TMP25:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP23]]) +; X32-NEXT: [[TMP26]] = zext i16 [[TMP24]] to i32 +; X32-NEXT: [[TMP27]] = zext i16 [[TMP25]] to i32 +; X32-NEXT: [[TMP28:%.*]] = icmp eq i32 [[TMP26]], [[TMP27]] +; X32-NEXT: br i1 [[TMP28]], label [[ENDBLOCK]], label [[RES_BLOCK]] +; X32: endblock: +; X32-NEXT: [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB2]] ], [ [[TMP8]], [[RES_BLOCK]] ] +; X32-NEXT: ret i32 [[PHI_RES]] +; +; X64-LABEL: @cmp10( +; X64-NEXT: loadbb: +; X64-NEXT: [[TMP0:%.*]] = bitcast i8* [[X:%.*]] to i64* +; X64-NEXT: [[TMP1:%.*]] = bitcast i8* [[Y:%.*]] to i64* +; X64-NEXT: [[TMP2:%.*]] = load i64, i64* [[TMP0]] +; X64-NEXT: [[TMP3:%.*]] = load i64, i64* [[TMP1]] +; X64-NEXT: [[TMP4:%.*]] = call i64 @llvm.bswap.i64(i64 [[TMP2]]) +; X64-NEXT: [[TMP5:%.*]] = call i64 @llvm.bswap.i64(i64 [[TMP3]]) +; X64-NEXT: [[TMP6:%.*]] = icmp eq i64 [[TMP4]], [[TMP5]] +; X64-NEXT: br i1 [[TMP6]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]] +; X64: res_block: +; X64-NEXT: [[PHI_SRC1:%.*]] = phi i64 [ [[TMP4]], [[LOADBB:%.*]] ], [ [[TMP17:%.*]], [[LOADBB1]] ] +; X64-NEXT: [[PHI_SRC2:%.*]] = phi i64 [ [[TMP5]], [[LOADBB]] ], [ [[TMP18:%.*]], [[LOADBB1]] ] +; X64-NEXT: [[TMP7:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]] +; X64-NEXT: [[TMP8:%.*]] = select i1 [[TMP7]], i32 -1, i32 1 +; X64-NEXT: br label [[ENDBLOCK:%.*]] +; X64: loadbb1: +; X64-NEXT: [[TMP9:%.*]] = bitcast i8* [[X]] to i16* +; X64-NEXT: [[TMP10:%.*]] = bitcast i8* [[Y]] to i16* +; X64-NEXT: [[TMP11:%.*]] = getelementptr i16, i16* [[TMP9]], i16 4 +; X64-NEXT: [[TMP12:%.*]] = getelementptr i16, i16* [[TMP10]], i16 4 +; X64-NEXT: [[TMP13:%.*]] = load i16, i16* [[TMP11]] +; X64-NEXT: [[TMP14:%.*]] = load i16, i16* [[TMP12]] +; X64-NEXT: [[TMP15:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP13]]) +; X64-NEXT: [[TMP16:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP14]]) +; X64-NEXT: [[TMP17]] = zext i16 [[TMP15]] to i64 +; X64-NEXT: [[TMP18]] = zext i16 [[TMP16]] to i64 +; X64-NEXT: [[TMP19:%.*]] = icmp eq i64 [[TMP17]], [[TMP18]] +; X64-NEXT: br i1 [[TMP19]], label [[ENDBLOCK]], label [[RES_BLOCK]] +; X64: endblock: +; X64-NEXT: [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP8]], [[RES_BLOCK]] ] +; X64-NEXT: ret i32 [[PHI_RES]] ; %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 10) ret i32 %call } define i32 @cmp11(i8* nocapture readonly %x, i8* nocapture readonly %y) { -; ALL-LABEL: @cmp11( -; ALL-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* [[X:%.*]], i8* [[Y:%.*]], i64 11) -; ALL-NEXT: ret i32 [[CALL]] +; X32-LABEL: @cmp11( +; X32-NEXT: loadbb: +; X32-NEXT: [[TMP0:%.*]] = bitcast i8* [[X:%.*]] to i32* +; X32-NEXT: [[TMP1:%.*]] = bitcast i8* [[Y:%.*]] to i32* +; X32-NEXT: [[TMP2:%.*]] = load i32, i32* [[TMP0]] +; X32-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP1]] +; X32-NEXT: [[TMP4:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP2]]) +; X32-NEXT: [[TMP5:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP3]]) +; X32-NEXT: [[TMP6:%.*]] = icmp eq i32 [[TMP4]], [[TMP5]] +; X32-NEXT: br i1 [[TMP6]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]] +; X32: res_block: +; X32-NEXT: [[PHI_SRC1:%.*]] = phi i32 [ [[TMP4]], [[LOADBB:%.*]] ], [ [[TMP15:%.*]], [[LOADBB1]] ], [ [[TMP26:%.*]], [[LOADBB2:%.*]] ] +; X32-NEXT: [[PHI_SRC2:%.*]] = phi i32 [ [[TMP5]], [[LOADBB]] ], [ [[TMP16:%.*]], [[LOADBB1]] ], [ [[TMP27:%.*]], [[LOADBB2]] ] +; X32-NEXT: [[TMP7:%.*]] = icmp ult i32 [[PHI_SRC1]], [[PHI_SRC2]] +; X32-NEXT: [[TMP8:%.*]] = select i1 [[TMP7]], i32 -1, i32 1 +; X32-NEXT: br label [[ENDBLOCK:%.*]] +; X32: loadbb1: +; X32-NEXT: [[TMP9:%.*]] = bitcast i8* [[X]] to i32* +; X32-NEXT: [[TMP10:%.*]] = bitcast i8* [[Y]] to i32* +; X32-NEXT: [[TMP11:%.*]] = getelementptr i32, i32* [[TMP9]], i32 1 +; X32-NEXT: [[TMP12:%.*]] = getelementptr i32, i32* [[TMP10]], i32 1 +; X32-NEXT: [[TMP13:%.*]] = load i32, i32* [[TMP11]] +; X32-NEXT: [[TMP14:%.*]] = load i32, i32* [[TMP12]] +; X32-NEXT: [[TMP15]] = call i32 @llvm.bswap.i32(i32 [[TMP13]]) +; X32-NEXT: [[TMP16]] = call i32 @llvm.bswap.i32(i32 [[TMP14]]) +; X32-NEXT: [[TMP17:%.*]] = icmp eq i32 [[TMP15]], [[TMP16]] +; X32-NEXT: br i1 [[TMP17]], label [[LOADBB2]], label [[RES_BLOCK]] +; X32: loadbb2: +; X32-NEXT: [[TMP18:%.*]] = bitcast i8* [[X]] to i16* +; X32-NEXT: [[TMP19:%.*]] = bitcast i8* [[Y]] to i16* +; X32-NEXT: [[TMP20:%.*]] = getelementptr i16, i16* [[TMP18]], i16 4 +; X32-NEXT: [[TMP21:%.*]] = getelementptr i16, i16* [[TMP19]], i16 4 +; X32-NEXT: [[TMP22:%.*]] = load i16, i16* [[TMP20]] +; X32-NEXT: [[TMP23:%.*]] = load i16, i16* [[TMP21]] +; X32-NEXT: [[TMP24:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP22]]) +; X32-NEXT: [[TMP25:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP23]]) +; X32-NEXT: [[TMP26]] = zext i16 [[TMP24]] to i32 +; X32-NEXT: [[TMP27]] = zext i16 [[TMP25]] to i32 +; X32-NEXT: [[TMP28:%.*]] = icmp eq i32 [[TMP26]], [[TMP27]] +; X32-NEXT: br i1 [[TMP28]], label [[LOADBB3:%.*]], label [[RES_BLOCK]] +; X32: loadbb3: +; X32-NEXT: [[TMP29:%.*]] = getelementptr i8, i8* [[X]], i8 10 +; X32-NEXT: [[TMP30:%.*]] = getelementptr i8, i8* [[Y]], i8 10 +; X32-NEXT: [[TMP31:%.*]] = load i8, i8* [[TMP29]] +; X32-NEXT: [[TMP32:%.*]] = load i8, i8* [[TMP30]] +; X32-NEXT: [[TMP33:%.*]] = zext i8 [[TMP31]] to i32 +; X32-NEXT: [[TMP34:%.*]] = zext i8 [[TMP32]] to i32 +; X32-NEXT: [[TMP35:%.*]] = sub i32 [[TMP33]], [[TMP34]] +; X32-NEXT: br label [[ENDBLOCK]] +; X32: endblock: +; X32-NEXT: [[PHI_RES:%.*]] = phi i32 [ [[TMP35]], [[LOADBB3]] ], [ [[TMP8]], [[RES_BLOCK]] ] +; X32-NEXT: ret i32 [[PHI_RES]] +; +; X64-LABEL: @cmp11( +; X64-NEXT: loadbb: +; X64-NEXT: [[TMP0:%.*]] = bitcast i8* [[X:%.*]] to i64* +; X64-NEXT: [[TMP1:%.*]] = bitcast i8* [[Y:%.*]] to i64* +; X64-NEXT: [[TMP2:%.*]] = load i64, i64* [[TMP0]] +; X64-NEXT: [[TMP3:%.*]] = load i64, i64* [[TMP1]] +; X64-NEXT: [[TMP4:%.*]] = call i64 @llvm.bswap.i64(i64 [[TMP2]]) +; X64-NEXT: [[TMP5:%.*]] = call i64 @llvm.bswap.i64(i64 [[TMP3]]) +; X64-NEXT: [[TMP6:%.*]] = icmp eq i64 [[TMP4]], [[TMP5]] +; X64-NEXT: br i1 [[TMP6]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]] +; X64: res_block: +; X64-NEXT: [[PHI_SRC1:%.*]] = phi i64 [ [[TMP4]], [[LOADBB:%.*]] ], [ [[TMP17:%.*]], [[LOADBB1]] ] +; X64-NEXT: [[PHI_SRC2:%.*]] = phi i64 [ [[TMP5]], [[LOADBB]] ], [ [[TMP18:%.*]], [[LOADBB1]] ] +; X64-NEXT: [[TMP7:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]] +; X64-NEXT: [[TMP8:%.*]] = select i1 [[TMP7]], i32 -1, i32 1 +; X64-NEXT: br label [[ENDBLOCK:%.*]] +; X64: loadbb1: +; X64-NEXT: [[TMP9:%.*]] = bitcast i8* [[X]] to i16* +; X64-NEXT: [[TMP10:%.*]] = bitcast i8* [[Y]] to i16* +; X64-NEXT: [[TMP11:%.*]] = getelementptr i16, i16* [[TMP9]], i16 4 +; X64-NEXT: [[TMP12:%.*]] = getelementptr i16, i16* [[TMP10]], i16 4 +; X64-NEXT: [[TMP13:%.*]] = load i16, i16* [[TMP11]] +; X64-NEXT: [[TMP14:%.*]] = load i16, i16* [[TMP12]] +; X64-NEXT: [[TMP15:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP13]]) +; X64-NEXT: [[TMP16:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP14]]) +; X64-NEXT: [[TMP17]] = zext i16 [[TMP15]] to i64 +; X64-NEXT: [[TMP18]] = zext i16 [[TMP16]] to i64 +; X64-NEXT: [[TMP19:%.*]] = icmp eq i64 [[TMP17]], [[TMP18]] +; X64-NEXT: br i1 [[TMP19]], label [[LOADBB2:%.*]], label [[RES_BLOCK]] +; X64: loadbb2: +; X64-NEXT: [[TMP20:%.*]] = getelementptr i8, i8* [[X]], i8 10 +; X64-NEXT: [[TMP21:%.*]] = getelementptr i8, i8* [[Y]], i8 10 +; X64-NEXT: [[TMP22:%.*]] = load i8, i8* [[TMP20]] +; X64-NEXT: [[TMP23:%.*]] = load i8, i8* [[TMP21]] +; X64-NEXT: [[TMP24:%.*]] = zext i8 [[TMP22]] to i32 +; X64-NEXT: [[TMP25:%.*]] = zext i8 [[TMP23]] to i32 +; X64-NEXT: [[TMP26:%.*]] = sub i32 [[TMP24]], [[TMP25]] +; X64-NEXT: br label [[ENDBLOCK]] +; X64: endblock: +; X64-NEXT: [[PHI_RES:%.*]] = phi i32 [ [[TMP26]], [[LOADBB2]] ], [ [[TMP8]], [[RES_BLOCK]] ] +; X64-NEXT: ret i32 [[PHI_RES]] ; %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 11) ret i32 %call } define i32 @cmp12(i8* nocapture readonly %x, i8* nocapture readonly %y) { -; ALL-LABEL: @cmp12( -; ALL-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* [[X:%.*]], i8* [[Y:%.*]], i64 12) -; ALL-NEXT: ret i32 [[CALL]] +; X32-LABEL: @cmp12( +; X32-NEXT: loadbb: +; X32-NEXT: [[TMP0:%.*]] = bitcast i8* [[X:%.*]] to i32* +; X32-NEXT: [[TMP1:%.*]] = bitcast i8* [[Y:%.*]] to i32* +; X32-NEXT: [[TMP2:%.*]] = load i32, i32* [[TMP0]] +; X32-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP1]] +; X32-NEXT: [[TMP4:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP2]]) +; X32-NEXT: [[TMP5:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP3]]) +; X32-NEXT: [[TMP6:%.*]] = icmp eq i32 [[TMP4]], [[TMP5]] +; X32-NEXT: br i1 [[TMP6]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]] +; X32: res_block: +; X32-NEXT: [[PHI_SRC1:%.*]] = phi i32 [ [[TMP4]], [[LOADBB:%.*]] ], [ [[TMP15:%.*]], [[LOADBB1]] ], [ [[TMP24:%.*]], [[LOADBB2:%.*]] ] +; X32-NEXT: [[PHI_SRC2:%.*]] = phi i32 [ [[TMP5]], [[LOADBB]] ], [ [[TMP16:%.*]], [[LOADBB1]] ], [ [[TMP25:%.*]], [[LOADBB2]] ] +; X32-NEXT: [[TMP7:%.*]] = icmp ult i32 [[PHI_SRC1]], [[PHI_SRC2]] +; X32-NEXT: [[TMP8:%.*]] = select i1 [[TMP7]], i32 -1, i32 1 +; X32-NEXT: br label [[ENDBLOCK:%.*]] +; X32: loadbb1: +; X32-NEXT: [[TMP9:%.*]] = bitcast i8* [[X]] to i32* +; X32-NEXT: [[TMP10:%.*]] = bitcast i8* [[Y]] to i32* +; X32-NEXT: [[TMP11:%.*]] = getelementptr i32, i32* [[TMP9]], i32 1 +; X32-NEXT: [[TMP12:%.*]] = getelementptr i32, i32* [[TMP10]], i32 1 +; X32-NEXT: [[TMP13:%.*]] = load i32, i32* [[TMP11]] +; X32-NEXT: [[TMP14:%.*]] = load i32, i32* [[TMP12]] +; X32-NEXT: [[TMP15]] = call i32 @llvm.bswap.i32(i32 [[TMP13]]) +; X32-NEXT: [[TMP16]] = call i32 @llvm.bswap.i32(i32 [[TMP14]]) +; X32-NEXT: [[TMP17:%.*]] = icmp eq i32 [[TMP15]], [[TMP16]] +; X32-NEXT: br i1 [[TMP17]], label [[LOADBB2]], label [[RES_BLOCK]] +; X32: loadbb2: +; X32-NEXT: [[TMP18:%.*]] = bitcast i8* [[X]] to i32* +; X32-NEXT: [[TMP19:%.*]] = bitcast i8* [[Y]] to i32* +; X32-NEXT: [[TMP20:%.*]] = getelementptr i32, i32* [[TMP18]], i32 2 +; X32-NEXT: [[TMP21:%.*]] = getelementptr i32, i32* [[TMP19]], i32 2 +; X32-NEXT: [[TMP22:%.*]] = load i32, i32* [[TMP20]] +; X32-NEXT: [[TMP23:%.*]] = load i32, i32* [[TMP21]] +; X32-NEXT: [[TMP24]] = call i32 @llvm.bswap.i32(i32 [[TMP22]]) +; X32-NEXT: [[TMP25]] = call i32 @llvm.bswap.i32(i32 [[TMP23]]) +; X32-NEXT: [[TMP26:%.*]] = icmp eq i32 [[TMP24]], [[TMP25]] +; X32-NEXT: br i1 [[TMP26]], label [[ENDBLOCK]], label [[RES_BLOCK]] +; X32: endblock: +; X32-NEXT: [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB2]] ], [ [[TMP8]], [[RES_BLOCK]] ] +; X32-NEXT: ret i32 [[PHI_RES]] +; +; X64-LABEL: @cmp12( +; X64-NEXT: loadbb: +; X64-NEXT: [[TMP0:%.*]] = bitcast i8* [[X:%.*]] to i64* +; X64-NEXT: [[TMP1:%.*]] = bitcast i8* [[Y:%.*]] to i64* +; X64-NEXT: [[TMP2:%.*]] = load i64, i64* [[TMP0]] +; X64-NEXT: [[TMP3:%.*]] = load i64, i64* [[TMP1]] +; X64-NEXT: [[TMP4:%.*]] = call i64 @llvm.bswap.i64(i64 [[TMP2]]) +; X64-NEXT: [[TMP5:%.*]] = call i64 @llvm.bswap.i64(i64 [[TMP3]]) +; X64-NEXT: [[TMP6:%.*]] = icmp eq i64 [[TMP4]], [[TMP5]] +; X64-NEXT: br i1 [[TMP6]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]] +; X64: res_block: +; X64-NEXT: [[PHI_SRC1:%.*]] = phi i64 [ [[TMP4]], [[LOADBB:%.*]] ], [ [[TMP17:%.*]], [[LOADBB1]] ] +; X64-NEXT: [[PHI_SRC2:%.*]] = phi i64 [ [[TMP5]], [[LOADBB]] ], [ [[TMP18:%.*]], [[LOADBB1]] ] +; X64-NEXT: [[TMP7:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]] +; X64-NEXT: [[TMP8:%.*]] = select i1 [[TMP7]], i32 -1, i32 1 +; X64-NEXT: br label [[ENDBLOCK:%.*]] +; X64: loadbb1: +; X64-NEXT: [[TMP9:%.*]] = bitcast i8* [[X]] to i32* +; X64-NEXT: [[TMP10:%.*]] = bitcast i8* [[Y]] to i32* +; X64-NEXT: [[TMP11:%.*]] = getelementptr i32, i32* [[TMP9]], i32 2 +; X64-NEXT: [[TMP12:%.*]] = getelementptr i32, i32* [[TMP10]], i32 2 +; X64-NEXT: [[TMP13:%.*]] = load i32, i32* [[TMP11]] +; X64-NEXT: [[TMP14:%.*]] = load i32, i32* [[TMP12]] +; X64-NEXT: [[TMP15:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP13]]) +; X64-NEXT: [[TMP16:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP14]]) +; X64-NEXT: [[TMP17]] = zext i32 [[TMP15]] to i64 +; X64-NEXT: [[TMP18]] = zext i32 [[TMP16]] to i64 +; X64-NEXT: [[TMP19:%.*]] = icmp eq i64 [[TMP17]], [[TMP18]] +; X64-NEXT: br i1 [[TMP19]], label [[ENDBLOCK]], label [[RES_BLOCK]] +; X64: endblock: +; X64-NEXT: [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP8]], [[RES_BLOCK]] ] +; X64-NEXT: ret i32 [[PHI_RES]] ; %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 12) ret i32 %call } define i32 @cmp13(i8* nocapture readonly %x, i8* nocapture readonly %y) { -; ALL-LABEL: @cmp13( -; ALL-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* [[X:%.*]], i8* [[Y:%.*]], i64 13) -; ALL-NEXT: ret i32 [[CALL]] +; X32-LABEL: @cmp13( +; X32-NEXT: loadbb: +; X32-NEXT: [[TMP0:%.*]] = bitcast i8* [[X:%.*]] to i32* +; X32-NEXT: [[TMP1:%.*]] = bitcast i8* [[Y:%.*]] to i32* +; X32-NEXT: [[TMP2:%.*]] = load i32, i32* [[TMP0]] +; X32-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP1]] +; X32-NEXT: [[TMP4:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP2]]) +; X32-NEXT: [[TMP5:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP3]]) +; X32-NEXT: [[TMP6:%.*]] = icmp eq i32 [[TMP4]], [[TMP5]] +; X32-NEXT: br i1 [[TMP6]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]] +; X32: res_block: +; X32-NEXT: [[PHI_SRC1:%.*]] = phi i32 [ [[TMP4]], [[LOADBB:%.*]] ], [ [[TMP15:%.*]], [[LOADBB1]] ], [ [[TMP24:%.*]], [[LOADBB2:%.*]] ] +; X32-NEXT: [[PHI_SRC2:%.*]] = phi i32 [ [[TMP5]], [[LOADBB]] ], [ [[TMP16:%.*]], [[LOADBB1]] ], [ [[TMP25:%.*]], [[LOADBB2]] ] +; X32-NEXT: [[TMP7:%.*]] = icmp ult i32 [[PHI_SRC1]], [[PHI_SRC2]] +; X32-NEXT: [[TMP8:%.*]] = select i1 [[TMP7]], i32 -1, i32 1 +; X32-NEXT: br label [[ENDBLOCK:%.*]] +; X32: loadbb1: +; X32-NEXT: [[TMP9:%.*]] = bitcast i8* [[X]] to i32* +; X32-NEXT: [[TMP10:%.*]] = bitcast i8* [[Y]] to i32* +; X32-NEXT: [[TMP11:%.*]] = getelementptr i32, i32* [[TMP9]], i32 1 +; X32-NEXT: [[TMP12:%.*]] = getelementptr i32, i32* [[TMP10]], i32 1 +; X32-NEXT: [[TMP13:%.*]] = load i32, i32* [[TMP11]] +; X32-NEXT: [[TMP14:%.*]] = load i32, i32* [[TMP12]] +; X32-NEXT: [[TMP15]] = call i32 @llvm.bswap.i32(i32 [[TMP13]]) +; X32-NEXT: [[TMP16]] = call i32 @llvm.bswap.i32(i32 [[TMP14]]) +; X32-NEXT: [[TMP17:%.*]] = icmp eq i32 [[TMP15]], [[TMP16]] +; X32-NEXT: br i1 [[TMP17]], label [[LOADBB2]], label [[RES_BLOCK]] +; X32: loadbb2: +; X32-NEXT: [[TMP18:%.*]] = bitcast i8* [[X]] to i32* +; X32-NEXT: [[TMP19:%.*]] = bitcast i8* [[Y]] to i32* +; X32-NEXT: [[TMP20:%.*]] = getelementptr i32, i32* [[TMP18]], i32 2 +; X32-NEXT: [[TMP21:%.*]] = getelementptr i32, i32* [[TMP19]], i32 2 +; X32-NEXT: [[TMP22:%.*]] = load i32, i32* [[TMP20]] +; X32-NEXT: [[TMP23:%.*]] = load i32, i32* [[TMP21]] +; X32-NEXT: [[TMP24]] = call i32 @llvm.bswap.i32(i32 [[TMP22]]) +; X32-NEXT: [[TMP25]] = call i32 @llvm.bswap.i32(i32 [[TMP23]]) +; X32-NEXT: [[TMP26:%.*]] = icmp eq i32 [[TMP24]], [[TMP25]] +; X32-NEXT: br i1 [[TMP26]], label [[LOADBB3:%.*]], label [[RES_BLOCK]] +; X32: loadbb3: +; X32-NEXT: [[TMP27:%.*]] = getelementptr i8, i8* [[X]], i8 12 +; X32-NEXT: [[TMP28:%.*]] = getelementptr i8, i8* [[Y]], i8 12 +; X32-NEXT: [[TMP29:%.*]] = load i8, i8* [[TMP27]] +; X32-NEXT: [[TMP30:%.*]] = load i8, i8* [[TMP28]] +; X32-NEXT: [[TMP31:%.*]] = zext i8 [[TMP29]] to i32 +; X32-NEXT: [[TMP32:%.*]] = zext i8 [[TMP30]] to i32 +; X32-NEXT: [[TMP33:%.*]] = sub i32 [[TMP31]], [[TMP32]] +; X32-NEXT: br label [[ENDBLOCK]] +; X32: endblock: +; X32-NEXT: [[PHI_RES:%.*]] = phi i32 [ [[TMP33]], [[LOADBB3]] ], [ [[TMP8]], [[RES_BLOCK]] ] +; X32-NEXT: ret i32 [[PHI_RES]] +; +; X64-LABEL: @cmp13( +; X64-NEXT: loadbb: +; X64-NEXT: [[TMP0:%.*]] = bitcast i8* [[X:%.*]] to i64* +; X64-NEXT: [[TMP1:%.*]] = bitcast i8* [[Y:%.*]] to i64* +; X64-NEXT: [[TMP2:%.*]] = load i64, i64* [[TMP0]] +; X64-NEXT: [[TMP3:%.*]] = load i64, i64* [[TMP1]] +; X64-NEXT: [[TMP4:%.*]] = call i64 @llvm.bswap.i64(i64 [[TMP2]]) +; X64-NEXT: [[TMP5:%.*]] = call i64 @llvm.bswap.i64(i64 [[TMP3]]) +; X64-NEXT: [[TMP6:%.*]] = icmp eq i64 [[TMP4]], [[TMP5]] +; X64-NEXT: br i1 [[TMP6]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]] +; X64: res_block: +; X64-NEXT: [[PHI_SRC1:%.*]] = phi i64 [ [[TMP4]], [[LOADBB:%.*]] ], [ [[TMP17:%.*]], [[LOADBB1]] ] +; X64-NEXT: [[PHI_SRC2:%.*]] = phi i64 [ [[TMP5]], [[LOADBB]] ], [ [[TMP18:%.*]], [[LOADBB1]] ] +; X64-NEXT: [[TMP7:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]] +; X64-NEXT: [[TMP8:%.*]] = select i1 [[TMP7]], i32 -1, i32 1 +; X64-NEXT: br label [[ENDBLOCK:%.*]] +; X64: loadbb1: +; X64-NEXT: [[TMP9:%.*]] = bitcast i8* [[X]] to i32* +; X64-NEXT: [[TMP10:%.*]] = bitcast i8* [[Y]] to i32* +; X64-NEXT: [[TMP11:%.*]] = getelementptr i32, i32* [[TMP9]], i32 2 +; X64-NEXT: [[TMP12:%.*]] = getelementptr i32, i32* [[TMP10]], i32 2 +; X64-NEXT: [[TMP13:%.*]] = load i32, i32* [[TMP11]] +; X64-NEXT: [[TMP14:%.*]] = load i32, i32* [[TMP12]] +; X64-NEXT: [[TMP15:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP13]]) +; X64-NEXT: [[TMP16:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP14]]) +; X64-NEXT: [[TMP17]] = zext i32 [[TMP15]] to i64 +; X64-NEXT: [[TMP18]] = zext i32 [[TMP16]] to i64 +; X64-NEXT: [[TMP19:%.*]] = icmp eq i64 [[TMP17]], [[TMP18]] +; X64-NEXT: br i1 [[TMP19]], label [[LOADBB2:%.*]], label [[RES_BLOCK]] +; X64: loadbb2: +; X64-NEXT: [[TMP20:%.*]] = getelementptr i8, i8* [[X]], i8 12 +; X64-NEXT: [[TMP21:%.*]] = getelementptr i8, i8* [[Y]], i8 12 +; X64-NEXT: [[TMP22:%.*]] = load i8, i8* [[TMP20]] +; X64-NEXT: [[TMP23:%.*]] = load i8, i8* [[TMP21]] +; X64-NEXT: [[TMP24:%.*]] = zext i8 [[TMP22]] to i32 +; X64-NEXT: [[TMP25:%.*]] = zext i8 [[TMP23]] to i32 +; X64-NEXT: [[TMP26:%.*]] = sub i32 [[TMP24]], [[TMP25]] +; X64-NEXT: br label [[ENDBLOCK]] +; X64: endblock: +; X64-NEXT: [[PHI_RES:%.*]] = phi i32 [ [[TMP26]], [[LOADBB2]] ], [ [[TMP8]], [[RES_BLOCK]] ] +; X64-NEXT: ret i32 [[PHI_RES]] ; %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 13) ret i32 %call } define i32 @cmp14(i8* nocapture readonly %x, i8* nocapture readonly %y) { -; ALL-LABEL: @cmp14( -; ALL-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* [[X:%.*]], i8* [[Y:%.*]], i64 14) -; ALL-NEXT: ret i32 [[CALL]] +; X32-LABEL: @cmp14( +; X32-NEXT: loadbb: +; X32-NEXT: [[TMP0:%.*]] = bitcast i8* [[X:%.*]] to i32* +; X32-NEXT: [[TMP1:%.*]] = bitcast i8* [[Y:%.*]] to i32* +; X32-NEXT: [[TMP2:%.*]] = load i32, i32* [[TMP0]] +; X32-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP1]] +; X32-NEXT: [[TMP4:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP2]]) +; X32-NEXT: [[TMP5:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP3]]) +; X32-NEXT: [[TMP6:%.*]] = icmp eq i32 [[TMP4]], [[TMP5]] +; X32-NEXT: br i1 [[TMP6]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]] +; X32: res_block: +; X32-NEXT: [[PHI_SRC1:%.*]] = phi i32 [ [[TMP4]], [[LOADBB:%.*]] ], [ [[TMP15:%.*]], [[LOADBB1]] ], [ [[TMP24:%.*]], [[LOADBB2:%.*]] ], [ [[TMP35:%.*]], [[LOADBB3:%.*]] ] +; X32-NEXT: [[PHI_SRC2:%.*]] = phi i32 [ [[TMP5]], [[LOADBB]] ], [ [[TMP16:%.*]], [[LOADBB1]] ], [ [[TMP25:%.*]], [[LOADBB2]] ], [ [[TMP36:%.*]], [[LOADBB3]] ] +; X32-NEXT: [[TMP7:%.*]] = icmp ult i32 [[PHI_SRC1]], [[PHI_SRC2]] +; X32-NEXT: [[TMP8:%.*]] = select i1 [[TMP7]], i32 -1, i32 1 +; X32-NEXT: br label [[ENDBLOCK:%.*]] +; X32: loadbb1: +; X32-NEXT: [[TMP9:%.*]] = bitcast i8* [[X]] to i32* +; X32-NEXT: [[TMP10:%.*]] = bitcast i8* [[Y]] to i32* +; X32-NEXT: [[TMP11:%.*]] = getelementptr i32, i32* [[TMP9]], i32 1 +; X32-NEXT: [[TMP12:%.*]] = getelementptr i32, i32* [[TMP10]], i32 1 +; X32-NEXT: [[TMP13:%.*]] = load i32, i32* [[TMP11]] +; X32-NEXT: [[TMP14:%.*]] = load i32, i32* [[TMP12]] +; X32-NEXT: [[TMP15]] = call i32 @llvm.bswap.i32(i32 [[TMP13]]) +; X32-NEXT: [[TMP16]] = call i32 @llvm.bswap.i32(i32 [[TMP14]]) +; X32-NEXT: [[TMP17:%.*]] = icmp eq i32 [[TMP15]], [[TMP16]] +; X32-NEXT: br i1 [[TMP17]], label [[LOADBB2]], label [[RES_BLOCK]] +; X32: loadbb2: +; X32-NEXT: [[TMP18:%.*]] = bitcast i8* [[X]] to i32* +; X32-NEXT: [[TMP19:%.*]] = bitcast i8* [[Y]] to i32* +; X32-NEXT: [[TMP20:%.*]] = getelementptr i32, i32* [[TMP18]], i32 2 +; X32-NEXT: [[TMP21:%.*]] = getelementptr i32, i32* [[TMP19]], i32 2 +; X32-NEXT: [[TMP22:%.*]] = load i32, i32* [[TMP20]] +; X32-NEXT: [[TMP23:%.*]] = load i32, i32* [[TMP21]] +; X32-NEXT: [[TMP24]] = call i32 @llvm.bswap.i32(i32 [[TMP22]]) +; X32-NEXT: [[TMP25]] = call i32 @llvm.bswap.i32(i32 [[TMP23]]) +; X32-NEXT: [[TMP26:%.*]] = icmp eq i32 [[TMP24]], [[TMP25]] +; X32-NEXT: br i1 [[TMP26]], label [[LOADBB3]], label [[RES_BLOCK]] +; X32: loadbb3: +; X32-NEXT: [[TMP27:%.*]] = bitcast i8* [[X]] to i16* +; X32-NEXT: [[TMP28:%.*]] = bitcast i8* [[Y]] to i16* +; X32-NEXT: [[TMP29:%.*]] = getelementptr i16, i16* [[TMP27]], i16 6 +; X32-NEXT: [[TMP30:%.*]] = getelementptr i16, i16* [[TMP28]], i16 6 +; X32-NEXT: [[TMP31:%.*]] = load i16, i16* [[TMP29]] +; X32-NEXT: [[TMP32:%.*]] = load i16, i16* [[TMP30]] +; X32-NEXT: [[TMP33:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP31]]) +; X32-NEXT: [[TMP34:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP32]]) +; X32-NEXT: [[TMP35]] = zext i16 [[TMP33]] to i32 +; X32-NEXT: [[TMP36]] = zext i16 [[TMP34]] to i32 +; X32-NEXT: [[TMP37:%.*]] = icmp eq i32 [[TMP35]], [[TMP36]] +; X32-NEXT: br i1 [[TMP37]], label [[ENDBLOCK]], label [[RES_BLOCK]] +; X32: endblock: +; X32-NEXT: [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB3]] ], [ [[TMP8]], [[RES_BLOCK]] ] +; X32-NEXT: ret i32 [[PHI_RES]] +; +; X64-LABEL: @cmp14( +; X64-NEXT: loadbb: +; X64-NEXT: [[TMP0:%.*]] = bitcast i8* [[X:%.*]] to i64* +; X64-NEXT: [[TMP1:%.*]] = bitcast i8* [[Y:%.*]] to i64* +; X64-NEXT: [[TMP2:%.*]] = load i64, i64* [[TMP0]] +; X64-NEXT: [[TMP3:%.*]] = load i64, i64* [[TMP1]] +; X64-NEXT: [[TMP4:%.*]] = call i64 @llvm.bswap.i64(i64 [[TMP2]]) +; X64-NEXT: [[TMP5:%.*]] = call i64 @llvm.bswap.i64(i64 [[TMP3]]) +; X64-NEXT: [[TMP6:%.*]] = icmp eq i64 [[TMP4]], [[TMP5]] +; X64-NEXT: br i1 [[TMP6]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]] +; X64: res_block: +; X64-NEXT: [[PHI_SRC1:%.*]] = phi i64 [ [[TMP4]], [[LOADBB:%.*]] ], [ [[TMP17:%.*]], [[LOADBB1]] ], [ [[TMP28:%.*]], [[LOADBB2:%.*]] ] +; X64-NEXT: [[PHI_SRC2:%.*]] = phi i64 [ [[TMP5]], [[LOADBB]] ], [ [[TMP18:%.*]], [[LOADBB1]] ], [ [[TMP29:%.*]], [[LOADBB2]] ] +; X64-NEXT: [[TMP7:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]] +; X64-NEXT: [[TMP8:%.*]] = select i1 [[TMP7]], i32 -1, i32 1 +; X64-NEXT: br label [[ENDBLOCK:%.*]] +; X64: loadbb1: +; X64-NEXT: [[TMP9:%.*]] = bitcast i8* [[X]] to i32* +; X64-NEXT: [[TMP10:%.*]] = bitcast i8* [[Y]] to i32* +; X64-NEXT: [[TMP11:%.*]] = getelementptr i32, i32* [[TMP9]], i32 2 +; X64-NEXT: [[TMP12:%.*]] = getelementptr i32, i32* [[TMP10]], i32 2 +; X64-NEXT: [[TMP13:%.*]] = load i32, i32* [[TMP11]] +; X64-NEXT: [[TMP14:%.*]] = load i32, i32* [[TMP12]] +; X64-NEXT: [[TMP15:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP13]]) +; X64-NEXT: [[TMP16:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP14]]) +; X64-NEXT: [[TMP17]] = zext i32 [[TMP15]] to i64 +; X64-NEXT: [[TMP18]] = zext i32 [[TMP16]] to i64 +; X64-NEXT: [[TMP19:%.*]] = icmp eq i64 [[TMP17]], [[TMP18]] +; X64-NEXT: br i1 [[TMP19]], label [[LOADBB2]], label [[RES_BLOCK]] +; X64: loadbb2: +; X64-NEXT: [[TMP20:%.*]] = bitcast i8* [[X]] to i16* +; X64-NEXT: [[TMP21:%.*]] = bitcast i8* [[Y]] to i16* +; X64-NEXT: [[TMP22:%.*]] = getelementptr i16, i16* [[TMP20]], i16 6 +; X64-NEXT: [[TMP23:%.*]] = getelementptr i16, i16* [[TMP21]], i16 6 +; X64-NEXT: [[TMP24:%.*]] = load i16, i16* [[TMP22]] +; X64-NEXT: [[TMP25:%.*]] = load i16, i16* [[TMP23]] +; X64-NEXT: [[TMP26:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP24]]) +; X64-NEXT: [[TMP27:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP25]]) +; X64-NEXT: [[TMP28]] = zext i16 [[TMP26]] to i64 +; X64-NEXT: [[TMP29]] = zext i16 [[TMP27]] to i64 +; X64-NEXT: [[TMP30:%.*]] = icmp eq i64 [[TMP28]], [[TMP29]] +; X64-NEXT: br i1 [[TMP30]], label [[ENDBLOCK]], label [[RES_BLOCK]] +; X64: endblock: +; X64-NEXT: [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB2]] ], [ [[TMP8]], [[RES_BLOCK]] ] +; X64-NEXT: ret i32 [[PHI_RES]] ; %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 14) ret i32 %call } define i32 @cmp15(i8* nocapture readonly %x, i8* nocapture readonly %y) { -; ALL-LABEL: @cmp15( -; ALL-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* [[X:%.*]], i8* [[Y:%.*]], i64 15) -; ALL-NEXT: ret i32 [[CALL]] +; X32-LABEL: @cmp15( +; X32-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* [[X:%.*]], i8* [[Y:%.*]], i64 15) +; X32-NEXT: ret i32 [[CALL]] +; +; X64-LABEL: @cmp15( +; X64-NEXT: loadbb: +; X64-NEXT: [[TMP0:%.*]] = bitcast i8* [[X:%.*]] to i64* +; X64-NEXT: [[TMP1:%.*]] = bitcast i8* [[Y:%.*]] to i64* +; X64-NEXT: [[TMP2:%.*]] = load i64, i64* [[TMP0]] +; X64-NEXT: [[TMP3:%.*]] = load i64, i64* [[TMP1]] +; X64-NEXT: [[TMP4:%.*]] = call i64 @llvm.bswap.i64(i64 [[TMP2]]) +; X64-NEXT: [[TMP5:%.*]] = call i64 @llvm.bswap.i64(i64 [[TMP3]]) +; X64-NEXT: [[TMP6:%.*]] = icmp eq i64 [[TMP4]], [[TMP5]] +; X64-NEXT: br i1 [[TMP6]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]] +; X64: res_block: +; X64-NEXT: [[PHI_SRC1:%.*]] = phi i64 [ [[TMP4]], [[LOADBB:%.*]] ], [ [[TMP17:%.*]], [[LOADBB1]] ], [ [[TMP28:%.*]], [[LOADBB2:%.*]] ] +; X64-NEXT: [[PHI_SRC2:%.*]] = phi i64 [ [[TMP5]], [[LOADBB]] ], [ [[TMP18:%.*]], [[LOADBB1]] ], [ [[TMP29:%.*]], [[LOADBB2]] ] +; X64-NEXT: [[TMP7:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]] +; X64-NEXT: [[TMP8:%.*]] = select i1 [[TMP7]], i32 -1, i32 1 +; X64-NEXT: br label [[ENDBLOCK:%.*]] +; X64: loadbb1: +; X64-NEXT: [[TMP9:%.*]] = bitcast i8* [[X]] to i32* +; X64-NEXT: [[TMP10:%.*]] = bitcast i8* [[Y]] to i32* +; X64-NEXT: [[TMP11:%.*]] = getelementptr i32, i32* [[TMP9]], i32 2 +; X64-NEXT: [[TMP12:%.*]] = getelementptr i32, i32* [[TMP10]], i32 2 +; X64-NEXT: [[TMP13:%.*]] = load i32, i32* [[TMP11]] +; X64-NEXT: [[TMP14:%.*]] = load i32, i32* [[TMP12]] +; X64-NEXT: [[TMP15:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP13]]) +; X64-NEXT: [[TMP16:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP14]]) +; X64-NEXT: [[TMP17]] = zext i32 [[TMP15]] to i64 +; X64-NEXT: [[TMP18]] = zext i32 [[TMP16]] to i64 +; X64-NEXT: [[TMP19:%.*]] = icmp eq i64 [[TMP17]], [[TMP18]] +; X64-NEXT: br i1 [[TMP19]], label [[LOADBB2]], label [[RES_BLOCK]] +; X64: loadbb2: +; X64-NEXT: [[TMP20:%.*]] = bitcast i8* [[X]] to i16* +; X64-NEXT: [[TMP21:%.*]] = bitcast i8* [[Y]] to i16* +; X64-NEXT: [[TMP22:%.*]] = getelementptr i16, i16* [[TMP20]], i16 6 +; X64-NEXT: [[TMP23:%.*]] = getelementptr i16, i16* [[TMP21]], i16 6 +; X64-NEXT: [[TMP24:%.*]] = load i16, i16* [[TMP22]] +; X64-NEXT: [[TMP25:%.*]] = load i16, i16* [[TMP23]] +; X64-NEXT: [[TMP26:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP24]]) +; X64-NEXT: [[TMP27:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP25]]) +; X64-NEXT: [[TMP28]] = zext i16 [[TMP26]] to i64 +; X64-NEXT: [[TMP29]] = zext i16 [[TMP27]] to i64 +; X64-NEXT: [[TMP30:%.*]] = icmp eq i64 [[TMP28]], [[TMP29]] +; X64-NEXT: br i1 [[TMP30]], label [[LOADBB3:%.*]], label [[RES_BLOCK]] +; X64: loadbb3: +; X64-NEXT: [[TMP31:%.*]] = getelementptr i8, i8* [[X]], i8 14 +; X64-NEXT: [[TMP32:%.*]] = getelementptr i8, i8* [[Y]], i8 14 +; X64-NEXT: [[TMP33:%.*]] = load i8, i8* [[TMP31]] +; X64-NEXT: [[TMP34:%.*]] = load i8, i8* [[TMP32]] +; X64-NEXT: [[TMP35:%.*]] = zext i8 [[TMP33]] to i32 +; X64-NEXT: [[TMP36:%.*]] = zext i8 [[TMP34]] to i32 +; X64-NEXT: [[TMP37:%.*]] = sub i32 [[TMP35]], [[TMP36]] +; X64-NEXT: br label [[ENDBLOCK]] +; X64: endblock: +; X64-NEXT: [[PHI_RES:%.*]] = phi i32 [ [[TMP37]], [[LOADBB3]] ], [ [[TMP8]], [[RES_BLOCK]] ] +; X64-NEXT: ret i32 [[PHI_RES]] ; %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 15) ret i32 %call } define i32 @cmp16(i8* nocapture readonly %x, i8* nocapture readonly %y) { -; ALL-LABEL: @cmp16( -; ALL-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* [[X:%.*]], i8* [[Y:%.*]], i64 16) -; ALL-NEXT: ret i32 [[CALL]] +; X32-LABEL: @cmp16( +; X32-NEXT: loadbb: +; X32-NEXT: [[TMP0:%.*]] = bitcast i8* [[X:%.*]] to i32* +; X32-NEXT: [[TMP1:%.*]] = bitcast i8* [[Y:%.*]] to i32* +; X32-NEXT: [[TMP2:%.*]] = load i32, i32* [[TMP0]] +; X32-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP1]] +; X32-NEXT: [[TMP4:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP2]]) +; X32-NEXT: [[TMP5:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP3]]) +; X32-NEXT: [[TMP6:%.*]] = icmp eq i32 [[TMP4]], [[TMP5]] +; X32-NEXT: br i1 [[TMP6]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]] +; X32: res_block: +; X32-NEXT: [[PHI_SRC1:%.*]] = phi i32 [ [[TMP4]], [[LOADBB:%.*]] ], [ [[TMP15:%.*]], [[LOADBB1]] ], [ [[TMP24:%.*]], [[LOADBB2:%.*]] ], [ [[TMP33:%.*]], [[LOADBB3:%.*]] ] +; X32-NEXT: [[PHI_SRC2:%.*]] = phi i32 [ [[TMP5]], [[LOADBB]] ], [ [[TMP16:%.*]], [[LOADBB1]] ], [ [[TMP25:%.*]], [[LOADBB2]] ], [ [[TMP34:%.*]], [[LOADBB3]] ] +; X32-NEXT: [[TMP7:%.*]] = icmp ult i32 [[PHI_SRC1]], [[PHI_SRC2]] +; X32-NEXT: [[TMP8:%.*]] = select i1 [[TMP7]], i32 -1, i32 1 +; X32-NEXT: br label [[ENDBLOCK:%.*]] +; X32: loadbb1: +; X32-NEXT: [[TMP9:%.*]] = bitcast i8* [[X]] to i32* +; X32-NEXT: [[TMP10:%.*]] = bitcast i8* [[Y]] to i32* +; X32-NEXT: [[TMP11:%.*]] = getelementptr i32, i32* [[TMP9]], i32 1 +; X32-NEXT: [[TMP12:%.*]] = getelementptr i32, i32* [[TMP10]], i32 1 +; X32-NEXT: [[TMP13:%.*]] = load i32, i32* [[TMP11]] +; X32-NEXT: [[TMP14:%.*]] = load i32, i32* [[TMP12]] +; X32-NEXT: [[TMP15]] = call i32 @llvm.bswap.i32(i32 [[TMP13]]) +; X32-NEXT: [[TMP16]] = call i32 @llvm.bswap.i32(i32 [[TMP14]]) +; X32-NEXT: [[TMP17:%.*]] = icmp eq i32 [[TMP15]], [[TMP16]] +; X32-NEXT: br i1 [[TMP17]], label [[LOADBB2]], label [[RES_BLOCK]] +; X32: loadbb2: +; X32-NEXT: [[TMP18:%.*]] = bitcast i8* [[X]] to i32* +; X32-NEXT: [[TMP19:%.*]] = bitcast i8* [[Y]] to i32* +; X32-NEXT: [[TMP20:%.*]] = getelementptr i32, i32* [[TMP18]], i32 2 +; X32-NEXT: [[TMP21:%.*]] = getelementptr i32, i32* [[TMP19]], i32 2 +; X32-NEXT: [[TMP22:%.*]] = load i32, i32* [[TMP20]] +; X32-NEXT: [[TMP23:%.*]] = load i32, i32* [[TMP21]] +; X32-NEXT: [[TMP24]] = call i32 @llvm.bswap.i32(i32 [[TMP22]]) +; X32-NEXT: [[TMP25]] = call i32 @llvm.bswap.i32(i32 [[TMP23]]) +; X32-NEXT: [[TMP26:%.*]] = icmp eq i32 [[TMP24]], [[TMP25]] +; X32-NEXT: br i1 [[TMP26]], label [[LOADBB3]], label [[RES_BLOCK]] +; X32: loadbb3: +; X32-NEXT: [[TMP27:%.*]] = bitcast i8* [[X]] to i32* +; X32-NEXT: [[TMP28:%.*]] = bitcast i8* [[Y]] to i32* +; X32-NEXT: [[TMP29:%.*]] = getelementptr i32, i32* [[TMP27]], i32 3 +; X32-NEXT: [[TMP30:%.*]] = getelementptr i32, i32* [[TMP28]], i32 3 +; X32-NEXT: [[TMP31:%.*]] = load i32, i32* [[TMP29]] +; X32-NEXT: [[TMP32:%.*]] = load i32, i32* [[TMP30]] +; X32-NEXT: [[TMP33]] = call i32 @llvm.bswap.i32(i32 [[TMP31]]) +; X32-NEXT: [[TMP34]] = call i32 @llvm.bswap.i32(i32 [[TMP32]]) +; X32-NEXT: [[TMP35:%.*]] = icmp eq i32 [[TMP33]], [[TMP34]] +; X32-NEXT: br i1 [[TMP35]], label [[ENDBLOCK]], label [[RES_BLOCK]] +; X32: endblock: +; X32-NEXT: [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB3]] ], [ [[TMP8]], [[RES_BLOCK]] ] +; X32-NEXT: ret i32 [[PHI_RES]] +; +; X64-LABEL: @cmp16( +; X64-NEXT: loadbb: +; X64-NEXT: [[TMP0:%.*]] = bitcast i8* [[X:%.*]] to i64* +; X64-NEXT: [[TMP1:%.*]] = bitcast i8* [[Y:%.*]] to i64* +; X64-NEXT: [[TMP2:%.*]] = load i64, i64* [[TMP0]] +; X64-NEXT: [[TMP3:%.*]] = load i64, i64* [[TMP1]] +; X64-NEXT: [[TMP4:%.*]] = call i64 @llvm.bswap.i64(i64 [[TMP2]]) +; X64-NEXT: [[TMP5:%.*]] = call i64 @llvm.bswap.i64(i64 [[TMP3]]) +; X64-NEXT: [[TMP6:%.*]] = icmp eq i64 [[TMP4]], [[TMP5]] +; X64-NEXT: br i1 [[TMP6]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]] +; X64: res_block: +; X64-NEXT: [[PHI_SRC1:%.*]] = phi i64 [ [[TMP4]], [[LOADBB:%.*]] ], [ [[TMP15:%.*]], [[LOADBB1]] ] +; X64-NEXT: [[PHI_SRC2:%.*]] = phi i64 [ [[TMP5]], [[LOADBB]] ], [ [[TMP16:%.*]], [[LOADBB1]] ] +; X64-NEXT: [[TMP7:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]] +; X64-NEXT: [[TMP8:%.*]] = select i1 [[TMP7]], i32 -1, i32 1 +; X64-NEXT: br label [[ENDBLOCK:%.*]] +; X64: loadbb1: +; X64-NEXT: [[TMP9:%.*]] = bitcast i8* [[X]] to i64* +; X64-NEXT: [[TMP10:%.*]] = bitcast i8* [[Y]] to i64* +; X64-NEXT: [[TMP11:%.*]] = getelementptr i64, i64* [[TMP9]], i64 1 +; X64-NEXT: [[TMP12:%.*]] = getelementptr i64, i64* [[TMP10]], i64 1 +; X64-NEXT: [[TMP13:%.*]] = load i64, i64* [[TMP11]] +; X64-NEXT: [[TMP14:%.*]] = load i64, i64* [[TMP12]] +; X64-NEXT: [[TMP15]] = call i64 @llvm.bswap.i64(i64 [[TMP13]]) +; X64-NEXT: [[TMP16]] = call i64 @llvm.bswap.i64(i64 [[TMP14]]) +; X64-NEXT: [[TMP17:%.*]] = icmp eq i64 [[TMP15]], [[TMP16]] +; X64-NEXT: br i1 [[TMP17]], label [[ENDBLOCK]], label [[RES_BLOCK]] +; X64: endblock: +; X64-NEXT: [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP8]], [[RES_BLOCK]] ] +; X64-NEXT: ret i32 [[PHI_RES]] ; %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 16) ret i32 %call @@ -190,8 +1088,25 @@ define i32 @cmp_eq2(i8* nocapture readonly %x, i8* nocapture readonly %y) { define i32 @cmp_eq3(i8* nocapture readonly %x, i8* nocapture readonly %y) { ; ALL-LABEL: @cmp_eq3( -; ALL-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* [[X:%.*]], i8* [[Y:%.*]], i64 3) -; ALL-NEXT: [[CMP:%.*]] = icmp eq i32 [[CALL]], 0 +; ALL-NEXT: loadbb: +; ALL-NEXT: [[TMP0:%.*]] = bitcast i8* [[X:%.*]] to i16* +; ALL-NEXT: [[TMP1:%.*]] = bitcast i8* [[Y:%.*]] to i16* +; ALL-NEXT: [[TMP2:%.*]] = load i16, i16* [[TMP0]] +; ALL-NEXT: [[TMP3:%.*]] = load i16, i16* [[TMP1]] +; ALL-NEXT: [[TMP4:%.*]] = icmp ne i16 [[TMP2]], [[TMP3]] +; ALL-NEXT: br i1 [[TMP4]], label [[RES_BLOCK:%.*]], label [[LOADBB1:%.*]] +; ALL: res_block: +; ALL-NEXT: br label [[ENDBLOCK:%.*]] +; ALL: loadbb1: +; ALL-NEXT: [[TMP5:%.*]] = getelementptr i8, i8* [[X]], i8 2 +; ALL-NEXT: [[TMP6:%.*]] = getelementptr i8, i8* [[Y]], i8 2 +; ALL-NEXT: [[TMP7:%.*]] = load i8, i8* [[TMP5]] +; ALL-NEXT: [[TMP8:%.*]] = load i8, i8* [[TMP6]] +; ALL-NEXT: [[TMP9:%.*]] = icmp ne i8 [[TMP7]], [[TMP8]] +; ALL-NEXT: br i1 [[TMP9]], label [[RES_BLOCK]], label [[ENDBLOCK]] +; ALL: endblock: +; ALL-NEXT: [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ 1, [[RES_BLOCK]] ] +; ALL-NEXT: [[CMP:%.*]] = icmp eq i32 [[PHI_RES]], 0 ; ALL-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32 ; ALL-NEXT: ret i32 [[CONV]] ; @@ -221,8 +1136,25 @@ define i32 @cmp_eq4(i8* nocapture readonly %x, i8* nocapture readonly %y) { define i32 @cmp_eq5(i8* nocapture readonly %x, i8* nocapture readonly %y) { ; ALL-LABEL: @cmp_eq5( -; ALL-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* [[X:%.*]], i8* [[Y:%.*]], i64 5) -; ALL-NEXT: [[CMP:%.*]] = icmp eq i32 [[CALL]], 0 +; ALL-NEXT: loadbb: +; ALL-NEXT: [[TMP0:%.*]] = bitcast i8* [[X:%.*]] to i32* +; ALL-NEXT: [[TMP1:%.*]] = bitcast i8* [[Y:%.*]] to i32* +; ALL-NEXT: [[TMP2:%.*]] = load i32, i32* [[TMP0]] +; ALL-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP1]] +; ALL-NEXT: [[TMP4:%.*]] = icmp ne i32 [[TMP2]], [[TMP3]] +; ALL-NEXT: br i1 [[TMP4]], label [[RES_BLOCK:%.*]], label [[LOADBB1:%.*]] +; ALL: res_block: +; ALL-NEXT: br label [[ENDBLOCK:%.*]] +; ALL: loadbb1: +; ALL-NEXT: [[TMP5:%.*]] = getelementptr i8, i8* [[X]], i8 4 +; ALL-NEXT: [[TMP6:%.*]] = getelementptr i8, i8* [[Y]], i8 4 +; ALL-NEXT: [[TMP7:%.*]] = load i8, i8* [[TMP5]] +; ALL-NEXT: [[TMP8:%.*]] = load i8, i8* [[TMP6]] +; ALL-NEXT: [[TMP9:%.*]] = icmp ne i8 [[TMP7]], [[TMP8]] +; ALL-NEXT: br i1 [[TMP9]], label [[RES_BLOCK]], label [[ENDBLOCK]] +; ALL: endblock: +; ALL-NEXT: [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ 1, [[RES_BLOCK]] ] +; ALL-NEXT: [[CMP:%.*]] = icmp eq i32 [[PHI_RES]], 0 ; ALL-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32 ; ALL-NEXT: ret i32 [[CONV]] ; @@ -234,8 +1166,27 @@ define i32 @cmp_eq5(i8* nocapture readonly %x, i8* nocapture readonly %y) { define i32 @cmp_eq6(i8* nocapture readonly %x, i8* nocapture readonly %y) { ; ALL-LABEL: @cmp_eq6( -; ALL-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* [[X:%.*]], i8* [[Y:%.*]], i64 6) -; ALL-NEXT: [[CMP:%.*]] = icmp eq i32 [[CALL]], 0 +; ALL-NEXT: loadbb: +; ALL-NEXT: [[TMP0:%.*]] = bitcast i8* [[X:%.*]] to i32* +; ALL-NEXT: [[TMP1:%.*]] = bitcast i8* [[Y:%.*]] to i32* +; ALL-NEXT: [[TMP2:%.*]] = load i32, i32* [[TMP0]] +; ALL-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP1]] +; ALL-NEXT: [[TMP4:%.*]] = icmp ne i32 [[TMP2]], [[TMP3]] +; ALL-NEXT: br i1 [[TMP4]], label [[RES_BLOCK:%.*]], label [[LOADBB1:%.*]] +; ALL: res_block: +; ALL-NEXT: br label [[ENDBLOCK:%.*]] +; ALL: loadbb1: +; ALL-NEXT: [[TMP5:%.*]] = bitcast i8* [[X]] to i16* +; ALL-NEXT: [[TMP6:%.*]] = bitcast i8* [[Y]] to i16* +; ALL-NEXT: [[TMP7:%.*]] = getelementptr i16, i16* [[TMP5]], i16 2 +; ALL-NEXT: [[TMP8:%.*]] = getelementptr i16, i16* [[TMP6]], i16 2 +; ALL-NEXT: [[TMP9:%.*]] = load i16, i16* [[TMP7]] +; ALL-NEXT: [[TMP10:%.*]] = load i16, i16* [[TMP8]] +; ALL-NEXT: [[TMP11:%.*]] = icmp ne i16 [[TMP9]], [[TMP10]] +; ALL-NEXT: br i1 [[TMP11]], label [[RES_BLOCK]], label [[ENDBLOCK]] +; ALL: endblock: +; ALL-NEXT: [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ 1, [[RES_BLOCK]] ] +; ALL-NEXT: [[CMP:%.*]] = icmp eq i32 [[PHI_RES]], 0 ; ALL-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32 ; ALL-NEXT: ret i32 [[CONV]] ; @@ -247,8 +1198,34 @@ define i32 @cmp_eq6(i8* nocapture readonly %x, i8* nocapture readonly %y) { define i32 @cmp_eq7(i8* nocapture readonly %x, i8* nocapture readonly %y) { ; ALL-LABEL: @cmp_eq7( -; ALL-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* [[X:%.*]], i8* [[Y:%.*]], i64 7) -; ALL-NEXT: [[CMP:%.*]] = icmp eq i32 [[CALL]], 0 +; ALL-NEXT: loadbb: +; ALL-NEXT: [[TMP0:%.*]] = bitcast i8* [[X:%.*]] to i32* +; ALL-NEXT: [[TMP1:%.*]] = bitcast i8* [[Y:%.*]] to i32* +; ALL-NEXT: [[TMP2:%.*]] = load i32, i32* [[TMP0]] +; ALL-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP1]] +; ALL-NEXT: [[TMP4:%.*]] = icmp ne i32 [[TMP2]], [[TMP3]] +; ALL-NEXT: br i1 [[TMP4]], label [[RES_BLOCK:%.*]], label [[LOADBB1:%.*]] +; ALL: res_block: +; ALL-NEXT: br label [[ENDBLOCK:%.*]] +; ALL: loadbb1: +; ALL-NEXT: [[TMP5:%.*]] = bitcast i8* [[X]] to i16* +; ALL-NEXT: [[TMP6:%.*]] = bitcast i8* [[Y]] to i16* +; ALL-NEXT: [[TMP7:%.*]] = getelementptr i16, i16* [[TMP5]], i16 2 +; ALL-NEXT: [[TMP8:%.*]] = getelementptr i16, i16* [[TMP6]], i16 2 +; ALL-NEXT: [[TMP9:%.*]] = load i16, i16* [[TMP7]] +; ALL-NEXT: [[TMP10:%.*]] = load i16, i16* [[TMP8]] +; ALL-NEXT: [[TMP11:%.*]] = icmp ne i16 [[TMP9]], [[TMP10]] +; ALL-NEXT: br i1 [[TMP11]], label [[RES_BLOCK]], label [[LOADBB2:%.*]] +; ALL: loadbb2: +; ALL-NEXT: [[TMP12:%.*]] = getelementptr i8, i8* [[X]], i8 6 +; ALL-NEXT: [[TMP13:%.*]] = getelementptr i8, i8* [[Y]], i8 6 +; ALL-NEXT: [[TMP14:%.*]] = load i8, i8* [[TMP12]] +; ALL-NEXT: [[TMP15:%.*]] = load i8, i8* [[TMP13]] +; ALL-NEXT: [[TMP16:%.*]] = icmp ne i8 [[TMP14]], [[TMP15]] +; ALL-NEXT: br i1 [[TMP16]], label [[RES_BLOCK]], label [[ENDBLOCK]] +; ALL: endblock: +; ALL-NEXT: [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB2]] ], [ 1, [[RES_BLOCK]] ] +; ALL-NEXT: [[CMP:%.*]] = icmp eq i32 [[PHI_RES]], 0 ; ALL-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32 ; ALL-NEXT: ret i32 [[CONV]] ; @@ -260,8 +1237,27 @@ define i32 @cmp_eq7(i8* nocapture readonly %x, i8* nocapture readonly %y) { define i32 @cmp_eq8(i8* nocapture readonly %x, i8* nocapture readonly %y) { ; X32-LABEL: @cmp_eq8( -; X32-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* [[X:%.*]], i8* [[Y:%.*]], i64 8) -; X32-NEXT: [[CMP:%.*]] = icmp eq i32 [[CALL]], 0 +; X32-NEXT: loadbb: +; X32-NEXT: [[TMP0:%.*]] = bitcast i8* [[X:%.*]] to i32* +; X32-NEXT: [[TMP1:%.*]] = bitcast i8* [[Y:%.*]] to i32* +; X32-NEXT: [[TMP2:%.*]] = load i32, i32* [[TMP0]] +; X32-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP1]] +; X32-NEXT: [[TMP4:%.*]] = icmp ne i32 [[TMP2]], [[TMP3]] +; X32-NEXT: br i1 [[TMP4]], label [[RES_BLOCK:%.*]], label [[LOADBB1:%.*]] +; X32: res_block: +; X32-NEXT: br label [[ENDBLOCK:%.*]] +; X32: loadbb1: +; X32-NEXT: [[TMP5:%.*]] = bitcast i8* [[X]] to i32* +; X32-NEXT: [[TMP6:%.*]] = bitcast i8* [[Y]] to i32* +; X32-NEXT: [[TMP7:%.*]] = getelementptr i32, i32* [[TMP5]], i32 1 +; X32-NEXT: [[TMP8:%.*]] = getelementptr i32, i32* [[TMP6]], i32 1 +; X32-NEXT: [[TMP9:%.*]] = load i32, i32* [[TMP7]] +; X32-NEXT: [[TMP10:%.*]] = load i32, i32* [[TMP8]] +; X32-NEXT: [[TMP11:%.*]] = icmp ne i32 [[TMP9]], [[TMP10]] +; X32-NEXT: br i1 [[TMP11]], label [[RES_BLOCK]], label [[ENDBLOCK]] +; X32: endblock: +; X32-NEXT: [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ 1, [[RES_BLOCK]] ] +; X32-NEXT: [[CMP:%.*]] = icmp eq i32 [[PHI_RES]], 0 ; X32-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32 ; X32-NEXT: ret i32 [[CONV]] ; @@ -283,11 +1279,60 @@ define i32 @cmp_eq8(i8* nocapture readonly %x, i8* nocapture readonly %y) { } define i32 @cmp_eq9(i8* nocapture readonly %x, i8* nocapture readonly %y) { -; ALL-LABEL: @cmp_eq9( -; ALL-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* [[X:%.*]], i8* [[Y:%.*]], i64 9) -; ALL-NEXT: [[CMP:%.*]] = icmp eq i32 [[CALL]], 0 -; ALL-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32 -; ALL-NEXT: ret i32 [[CONV]] +; X32-LABEL: @cmp_eq9( +; X32-NEXT: loadbb: +; X32-NEXT: [[TMP0:%.*]] = bitcast i8* [[X:%.*]] to i32* +; X32-NEXT: [[TMP1:%.*]] = bitcast i8* [[Y:%.*]] to i32* +; X32-NEXT: [[TMP2:%.*]] = load i32, i32* [[TMP0]] +; X32-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP1]] +; X32-NEXT: [[TMP4:%.*]] = icmp ne i32 [[TMP2]], [[TMP3]] +; X32-NEXT: br i1 [[TMP4]], label [[RES_BLOCK:%.*]], label [[LOADBB1:%.*]] +; X32: res_block: +; X32-NEXT: br label [[ENDBLOCK:%.*]] +; X32: loadbb1: +; X32-NEXT: [[TMP5:%.*]] = bitcast i8* [[X]] to i32* +; X32-NEXT: [[TMP6:%.*]] = bitcast i8* [[Y]] to i32* +; X32-NEXT: [[TMP7:%.*]] = getelementptr i32, i32* [[TMP5]], i32 1 +; X32-NEXT: [[TMP8:%.*]] = getelementptr i32, i32* [[TMP6]], i32 1 +; X32-NEXT: [[TMP9:%.*]] = load i32, i32* [[TMP7]] +; X32-NEXT: [[TMP10:%.*]] = load i32, i32* [[TMP8]] +; X32-NEXT: [[TMP11:%.*]] = icmp ne i32 [[TMP9]], [[TMP10]] +; X32-NEXT: br i1 [[TMP11]], label [[RES_BLOCK]], label [[LOADBB2:%.*]] +; X32: loadbb2: +; X32-NEXT: [[TMP12:%.*]] = getelementptr i8, i8* [[X]], i8 8 +; X32-NEXT: [[TMP13:%.*]] = getelementptr i8, i8* [[Y]], i8 8 +; X32-NEXT: [[TMP14:%.*]] = load i8, i8* [[TMP12]] +; X32-NEXT: [[TMP15:%.*]] = load i8, i8* [[TMP13]] +; X32-NEXT: [[TMP16:%.*]] = icmp ne i8 [[TMP14]], [[TMP15]] +; X32-NEXT: br i1 [[TMP16]], label [[RES_BLOCK]], label [[ENDBLOCK]] +; X32: endblock: +; X32-NEXT: [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB2]] ], [ 1, [[RES_BLOCK]] ] +; X32-NEXT: [[CMP:%.*]] = icmp eq i32 [[PHI_RES]], 0 +; X32-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32 +; X32-NEXT: ret i32 [[CONV]] +; +; X64-LABEL: @cmp_eq9( +; X64-NEXT: loadbb: +; X64-NEXT: [[TMP0:%.*]] = bitcast i8* [[X:%.*]] to i64* +; X64-NEXT: [[TMP1:%.*]] = bitcast i8* [[Y:%.*]] to i64* +; X64-NEXT: [[TMP2:%.*]] = load i64, i64* [[TMP0]] +; X64-NEXT: [[TMP3:%.*]] = load i64, i64* [[TMP1]] +; X64-NEXT: [[TMP4:%.*]] = icmp ne i64 [[TMP2]], [[TMP3]] +; X64-NEXT: br i1 [[TMP4]], label [[RES_BLOCK:%.*]], label [[LOADBB1:%.*]] +; X64: res_block: +; X64-NEXT: br label [[ENDBLOCK:%.*]] +; X64: loadbb1: +; X64-NEXT: [[TMP5:%.*]] = getelementptr i8, i8* [[X]], i8 8 +; X64-NEXT: [[TMP6:%.*]] = getelementptr i8, i8* [[Y]], i8 8 +; X64-NEXT: [[TMP7:%.*]] = load i8, i8* [[TMP5]] +; X64-NEXT: [[TMP8:%.*]] = load i8, i8* [[TMP6]] +; X64-NEXT: [[TMP9:%.*]] = icmp ne i8 [[TMP7]], [[TMP8]] +; X64-NEXT: br i1 [[TMP9]], label [[RES_BLOCK]], label [[ENDBLOCK]] +; X64: endblock: +; X64-NEXT: [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ 1, [[RES_BLOCK]] ] +; X64-NEXT: [[CMP:%.*]] = icmp eq i32 [[PHI_RES]], 0 +; X64-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32 +; X64-NEXT: ret i32 [[CONV]] ; %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 9) %cmp = icmp eq i32 %call, 0 @@ -296,11 +1341,64 @@ define i32 @cmp_eq9(i8* nocapture readonly %x, i8* nocapture readonly %y) { } define i32 @cmp_eq10(i8* nocapture readonly %x, i8* nocapture readonly %y) { -; ALL-LABEL: @cmp_eq10( -; ALL-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* [[X:%.*]], i8* [[Y:%.*]], i64 10) -; ALL-NEXT: [[CMP:%.*]] = icmp eq i32 [[CALL]], 0 -; ALL-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32 -; ALL-NEXT: ret i32 [[CONV]] +; X32-LABEL: @cmp_eq10( +; X32-NEXT: loadbb: +; X32-NEXT: [[TMP0:%.*]] = bitcast i8* [[X:%.*]] to i32* +; X32-NEXT: [[TMP1:%.*]] = bitcast i8* [[Y:%.*]] to i32* +; X32-NEXT: [[TMP2:%.*]] = load i32, i32* [[TMP0]] +; X32-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP1]] +; X32-NEXT: [[TMP4:%.*]] = icmp ne i32 [[TMP2]], [[TMP3]] +; X32-NEXT: br i1 [[TMP4]], label [[RES_BLOCK:%.*]], label [[LOADBB1:%.*]] +; X32: res_block: +; X32-NEXT: br label [[ENDBLOCK:%.*]] +; X32: loadbb1: +; X32-NEXT: [[TMP5:%.*]] = bitcast i8* [[X]] to i32* +; X32-NEXT: [[TMP6:%.*]] = bitcast i8* [[Y]] to i32* +; X32-NEXT: [[TMP7:%.*]] = getelementptr i32, i32* [[TMP5]], i32 1 +; X32-NEXT: [[TMP8:%.*]] = getelementptr i32, i32* [[TMP6]], i32 1 +; X32-NEXT: [[TMP9:%.*]] = load i32, i32* [[TMP7]] +; X32-NEXT: [[TMP10:%.*]] = load i32, i32* [[TMP8]] +; X32-NEXT: [[TMP11:%.*]] = icmp ne i32 [[TMP9]], [[TMP10]] +; X32-NEXT: br i1 [[TMP11]], label [[RES_BLOCK]], label [[LOADBB2:%.*]] +; X32: loadbb2: +; X32-NEXT: [[TMP12:%.*]] = bitcast i8* [[X]] to i16* +; X32-NEXT: [[TMP13:%.*]] = bitcast i8* [[Y]] to i16* +; X32-NEXT: [[TMP14:%.*]] = getelementptr i16, i16* [[TMP12]], i16 4 +; X32-NEXT: [[TMP15:%.*]] = getelementptr i16, i16* [[TMP13]], i16 4 +; X32-NEXT: [[TMP16:%.*]] = load i16, i16* [[TMP14]] +; X32-NEXT: [[TMP17:%.*]] = load i16, i16* [[TMP15]] +; X32-NEXT: [[TMP18:%.*]] = icmp ne i16 [[TMP16]], [[TMP17]] +; X32-NEXT: br i1 [[TMP18]], label [[RES_BLOCK]], label [[ENDBLOCK]] +; X32: endblock: +; X32-NEXT: [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB2]] ], [ 1, [[RES_BLOCK]] ] +; X32-NEXT: [[CMP:%.*]] = icmp eq i32 [[PHI_RES]], 0 +; X32-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32 +; X32-NEXT: ret i32 [[CONV]] +; +; X64-LABEL: @cmp_eq10( +; X64-NEXT: loadbb: +; X64-NEXT: [[TMP0:%.*]] = bitcast i8* [[X:%.*]] to i64* +; X64-NEXT: [[TMP1:%.*]] = bitcast i8* [[Y:%.*]] to i64* +; X64-NEXT: [[TMP2:%.*]] = load i64, i64* [[TMP0]] +; X64-NEXT: [[TMP3:%.*]] = load i64, i64* [[TMP1]] +; X64-NEXT: [[TMP4:%.*]] = icmp ne i64 [[TMP2]], [[TMP3]] +; X64-NEXT: br i1 [[TMP4]], label [[RES_BLOCK:%.*]], label [[LOADBB1:%.*]] +; X64: res_block: +; X64-NEXT: br label [[ENDBLOCK:%.*]] +; X64: loadbb1: +; X64-NEXT: [[TMP5:%.*]] = bitcast i8* [[X]] to i16* +; X64-NEXT: [[TMP6:%.*]] = bitcast i8* [[Y]] to i16* +; X64-NEXT: [[TMP7:%.*]] = getelementptr i16, i16* [[TMP5]], i16 4 +; X64-NEXT: [[TMP8:%.*]] = getelementptr i16, i16* [[TMP6]], i16 4 +; X64-NEXT: [[TMP9:%.*]] = load i16, i16* [[TMP7]] +; X64-NEXT: [[TMP10:%.*]] = load i16, i16* [[TMP8]] +; X64-NEXT: [[TMP11:%.*]] = icmp ne i16 [[TMP9]], [[TMP10]] +; X64-NEXT: br i1 [[TMP11]], label [[RES_BLOCK]], label [[ENDBLOCK]] +; X64: endblock: +; X64-NEXT: [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ 1, [[RES_BLOCK]] ] +; X64-NEXT: [[CMP:%.*]] = icmp eq i32 [[PHI_RES]], 0 +; X64-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32 +; X64-NEXT: ret i32 [[CONV]] ; %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 10) %cmp = icmp eq i32 %call, 0 @@ -309,11 +1407,78 @@ define i32 @cmp_eq10(i8* nocapture readonly %x, i8* nocapture readonly %y) { } define i32 @cmp_eq11(i8* nocapture readonly %x, i8* nocapture readonly %y) { -; ALL-LABEL: @cmp_eq11( -; ALL-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* [[X:%.*]], i8* [[Y:%.*]], i64 11) -; ALL-NEXT: [[CMP:%.*]] = icmp eq i32 [[CALL]], 0 -; ALL-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32 -; ALL-NEXT: ret i32 [[CONV]] +; X32-LABEL: @cmp_eq11( +; X32-NEXT: loadbb: +; X32-NEXT: [[TMP0:%.*]] = bitcast i8* [[X:%.*]] to i32* +; X32-NEXT: [[TMP1:%.*]] = bitcast i8* [[Y:%.*]] to i32* +; X32-NEXT: [[TMP2:%.*]] = load i32, i32* [[TMP0]] +; X32-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP1]] +; X32-NEXT: [[TMP4:%.*]] = icmp ne i32 [[TMP2]], [[TMP3]] +; X32-NEXT: br i1 [[TMP4]], label [[RES_BLOCK:%.*]], label [[LOADBB1:%.*]] +; X32: res_block: +; X32-NEXT: br label [[ENDBLOCK:%.*]] +; X32: loadbb1: +; X32-NEXT: [[TMP5:%.*]] = bitcast i8* [[X]] to i32* +; X32-NEXT: [[TMP6:%.*]] = bitcast i8* [[Y]] to i32* +; X32-NEXT: [[TMP7:%.*]] = getelementptr i32, i32* [[TMP5]], i32 1 +; X32-NEXT: [[TMP8:%.*]] = getelementptr i32, i32* [[TMP6]], i32 1 +; X32-NEXT: [[TMP9:%.*]] = load i32, i32* [[TMP7]] +; X32-NEXT: [[TMP10:%.*]] = load i32, i32* [[TMP8]] +; X32-NEXT: [[TMP11:%.*]] = icmp ne i32 [[TMP9]], [[TMP10]] +; X32-NEXT: br i1 [[TMP11]], label [[RES_BLOCK]], label [[LOADBB2:%.*]] +; X32: loadbb2: +; X32-NEXT: [[TMP12:%.*]] = bitcast i8* [[X]] to i16* +; X32-NEXT: [[TMP13:%.*]] = bitcast i8* [[Y]] to i16* +; X32-NEXT: [[TMP14:%.*]] = getelementptr i16, i16* [[TMP12]], i16 4 +; X32-NEXT: [[TMP15:%.*]] = getelementptr i16, i16* [[TMP13]], i16 4 +; X32-NEXT: [[TMP16:%.*]] = load i16, i16* [[TMP14]] +; X32-NEXT: [[TMP17:%.*]] = load i16, i16* [[TMP15]] +; X32-NEXT: [[TMP18:%.*]] = icmp ne i16 [[TMP16]], [[TMP17]] +; X32-NEXT: br i1 [[TMP18]], label [[RES_BLOCK]], label [[LOADBB3:%.*]] +; X32: loadbb3: +; X32-NEXT: [[TMP19:%.*]] = getelementptr i8, i8* [[X]], i8 10 +; X32-NEXT: [[TMP20:%.*]] = getelementptr i8, i8* [[Y]], i8 10 +; X32-NEXT: [[TMP21:%.*]] = load i8, i8* [[TMP19]] +; X32-NEXT: [[TMP22:%.*]] = load i8, i8* [[TMP20]] +; X32-NEXT: [[TMP23:%.*]] = icmp ne i8 [[TMP21]], [[TMP22]] +; X32-NEXT: br i1 [[TMP23]], label [[RES_BLOCK]], label [[ENDBLOCK]] +; X32: endblock: +; X32-NEXT: [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB3]] ], [ 1, [[RES_BLOCK]] ] +; X32-NEXT: [[CMP:%.*]] = icmp eq i32 [[PHI_RES]], 0 +; X32-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32 +; X32-NEXT: ret i32 [[CONV]] +; +; X64-LABEL: @cmp_eq11( +; X64-NEXT: loadbb: +; X64-NEXT: [[TMP0:%.*]] = bitcast i8* [[X:%.*]] to i64* +; X64-NEXT: [[TMP1:%.*]] = bitcast i8* [[Y:%.*]] to i64* +; X64-NEXT: [[TMP2:%.*]] = load i64, i64* [[TMP0]] +; X64-NEXT: [[TMP3:%.*]] = load i64, i64* [[TMP1]] +; X64-NEXT: [[TMP4:%.*]] = icmp ne i64 [[TMP2]], [[TMP3]] +; X64-NEXT: br i1 [[TMP4]], label [[RES_BLOCK:%.*]], label [[LOADBB1:%.*]] +; X64: res_block: +; X64-NEXT: br label [[ENDBLOCK:%.*]] +; X64: loadbb1: +; X64-NEXT: [[TMP5:%.*]] = bitcast i8* [[X]] to i16* +; X64-NEXT: [[TMP6:%.*]] = bitcast i8* [[Y]] to i16* +; X64-NEXT: [[TMP7:%.*]] = getelementptr i16, i16* [[TMP5]], i16 4 +; X64-NEXT: [[TMP8:%.*]] = getelementptr i16, i16* [[TMP6]], i16 4 +; X64-NEXT: [[TMP9:%.*]] = load i16, i16* [[TMP7]] +; X64-NEXT: [[TMP10:%.*]] = load i16, i16* [[TMP8]] +; X64-NEXT: [[TMP11:%.*]] = icmp ne i16 [[TMP9]], [[TMP10]] +; X64-NEXT: br i1 [[TMP11]], label [[RES_BLOCK]], label [[LOADBB2:%.*]] +; X64: loadbb2: +; X64-NEXT: [[TMP12:%.*]] = getelementptr i8, i8* [[X]], i8 10 +; X64-NEXT: [[TMP13:%.*]] = getelementptr i8, i8* [[Y]], i8 10 +; X64-NEXT: [[TMP14:%.*]] = load i8, i8* [[TMP12]] +; X64-NEXT: [[TMP15:%.*]] = load i8, i8* [[TMP13]] +; X64-NEXT: [[TMP16:%.*]] = icmp ne i8 [[TMP14]], [[TMP15]] +; X64-NEXT: br i1 [[TMP16]], label [[RES_BLOCK]], label [[ENDBLOCK]] +; X64: endblock: +; X64-NEXT: [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB2]] ], [ 1, [[RES_BLOCK]] ] +; X64-NEXT: [[CMP:%.*]] = icmp eq i32 [[PHI_RES]], 0 +; X64-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32 +; X64-NEXT: ret i32 [[CONV]] ; %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 11) %cmp = icmp eq i32 %call, 0 @@ -322,11 +1487,64 @@ define i32 @cmp_eq11(i8* nocapture readonly %x, i8* nocapture readonly %y) { } define i32 @cmp_eq12(i8* nocapture readonly %x, i8* nocapture readonly %y) { -; ALL-LABEL: @cmp_eq12( -; ALL-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* [[X:%.*]], i8* [[Y:%.*]], i64 12) -; ALL-NEXT: [[CMP:%.*]] = icmp eq i32 [[CALL]], 0 -; ALL-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32 -; ALL-NEXT: ret i32 [[CONV]] +; X32-LABEL: @cmp_eq12( +; X32-NEXT: loadbb: +; X32-NEXT: [[TMP0:%.*]] = bitcast i8* [[X:%.*]] to i32* +; X32-NEXT: [[TMP1:%.*]] = bitcast i8* [[Y:%.*]] to i32* +; X32-NEXT: [[TMP2:%.*]] = load i32, i32* [[TMP0]] +; X32-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP1]] +; X32-NEXT: [[TMP4:%.*]] = icmp ne i32 [[TMP2]], [[TMP3]] +; X32-NEXT: br i1 [[TMP4]], label [[RES_BLOCK:%.*]], label [[LOADBB1:%.*]] +; X32: res_block: +; X32-NEXT: br label [[ENDBLOCK:%.*]] +; X32: loadbb1: +; X32-NEXT: [[TMP5:%.*]] = bitcast i8* [[X]] to i32* +; X32-NEXT: [[TMP6:%.*]] = bitcast i8* [[Y]] to i32* +; X32-NEXT: [[TMP7:%.*]] = getelementptr i32, i32* [[TMP5]], i32 1 +; X32-NEXT: [[TMP8:%.*]] = getelementptr i32, i32* [[TMP6]], i32 1 +; X32-NEXT: [[TMP9:%.*]] = load i32, i32* [[TMP7]] +; X32-NEXT: [[TMP10:%.*]] = load i32, i32* [[TMP8]] +; X32-NEXT: [[TMP11:%.*]] = icmp ne i32 [[TMP9]], [[TMP10]] +; X32-NEXT: br i1 [[TMP11]], label [[RES_BLOCK]], label [[LOADBB2:%.*]] +; X32: loadbb2: +; X32-NEXT: [[TMP12:%.*]] = bitcast i8* [[X]] to i32* +; X32-NEXT: [[TMP13:%.*]] = bitcast i8* [[Y]] to i32* +; X32-NEXT: [[TMP14:%.*]] = getelementptr i32, i32* [[TMP12]], i32 2 +; X32-NEXT: [[TMP15:%.*]] = getelementptr i32, i32* [[TMP13]], i32 2 +; X32-NEXT: [[TMP16:%.*]] = load i32, i32* [[TMP14]] +; X32-NEXT: [[TMP17:%.*]] = load i32, i32* [[TMP15]] +; X32-NEXT: [[TMP18:%.*]] = icmp ne i32 [[TMP16]], [[TMP17]] +; X32-NEXT: br i1 [[TMP18]], label [[RES_BLOCK]], label [[ENDBLOCK]] +; X32: endblock: +; X32-NEXT: [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB2]] ], [ 1, [[RES_BLOCK]] ] +; X32-NEXT: [[CMP:%.*]] = icmp eq i32 [[PHI_RES]], 0 +; X32-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32 +; X32-NEXT: ret i32 [[CONV]] +; +; X64-LABEL: @cmp_eq12( +; X64-NEXT: loadbb: +; X64-NEXT: [[TMP0:%.*]] = bitcast i8* [[X:%.*]] to i64* +; X64-NEXT: [[TMP1:%.*]] = bitcast i8* [[Y:%.*]] to i64* +; X64-NEXT: [[TMP2:%.*]] = load i64, i64* [[TMP0]] +; X64-NEXT: [[TMP3:%.*]] = load i64, i64* [[TMP1]] +; X64-NEXT: [[TMP4:%.*]] = icmp ne i64 [[TMP2]], [[TMP3]] +; X64-NEXT: br i1 [[TMP4]], label [[RES_BLOCK:%.*]], label [[LOADBB1:%.*]] +; X64: res_block: +; X64-NEXT: br label [[ENDBLOCK:%.*]] +; X64: loadbb1: +; X64-NEXT: [[TMP5:%.*]] = bitcast i8* [[X]] to i32* +; X64-NEXT: [[TMP6:%.*]] = bitcast i8* [[Y]] to i32* +; X64-NEXT: [[TMP7:%.*]] = getelementptr i32, i32* [[TMP5]], i32 2 +; X64-NEXT: [[TMP8:%.*]] = getelementptr i32, i32* [[TMP6]], i32 2 +; X64-NEXT: [[TMP9:%.*]] = load i32, i32* [[TMP7]] +; X64-NEXT: [[TMP10:%.*]] = load i32, i32* [[TMP8]] +; X64-NEXT: [[TMP11:%.*]] = icmp ne i32 [[TMP9]], [[TMP10]] +; X64-NEXT: br i1 [[TMP11]], label [[RES_BLOCK]], label [[ENDBLOCK]] +; X64: endblock: +; X64-NEXT: [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ 1, [[RES_BLOCK]] ] +; X64-NEXT: [[CMP:%.*]] = icmp eq i32 [[PHI_RES]], 0 +; X64-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32 +; X64-NEXT: ret i32 [[CONV]] ; %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 12) %cmp = icmp eq i32 %call, 0 @@ -335,11 +1553,78 @@ define i32 @cmp_eq12(i8* nocapture readonly %x, i8* nocapture readonly %y) { } define i32 @cmp_eq13(i8* nocapture readonly %x, i8* nocapture readonly %y) { -; ALL-LABEL: @cmp_eq13( -; ALL-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* [[X:%.*]], i8* [[Y:%.*]], i64 13) -; ALL-NEXT: [[CMP:%.*]] = icmp eq i32 [[CALL]], 0 -; ALL-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32 -; ALL-NEXT: ret i32 [[CONV]] +; X32-LABEL: @cmp_eq13( +; X32-NEXT: loadbb: +; X32-NEXT: [[TMP0:%.*]] = bitcast i8* [[X:%.*]] to i32* +; X32-NEXT: [[TMP1:%.*]] = bitcast i8* [[Y:%.*]] to i32* +; X32-NEXT: [[TMP2:%.*]] = load i32, i32* [[TMP0]] +; X32-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP1]] +; X32-NEXT: [[TMP4:%.*]] = icmp ne i32 [[TMP2]], [[TMP3]] +; X32-NEXT: br i1 [[TMP4]], label [[RES_BLOCK:%.*]], label [[LOADBB1:%.*]] +; X32: res_block: +; X32-NEXT: br label [[ENDBLOCK:%.*]] +; X32: loadbb1: +; X32-NEXT: [[TMP5:%.*]] = bitcast i8* [[X]] to i32* +; X32-NEXT: [[TMP6:%.*]] = bitcast i8* [[Y]] to i32* +; X32-NEXT: [[TMP7:%.*]] = getelementptr i32, i32* [[TMP5]], i32 1 +; X32-NEXT: [[TMP8:%.*]] = getelementptr i32, i32* [[TMP6]], i32 1 +; X32-NEXT: [[TMP9:%.*]] = load i32, i32* [[TMP7]] +; X32-NEXT: [[TMP10:%.*]] = load i32, i32* [[TMP8]] +; X32-NEXT: [[TMP11:%.*]] = icmp ne i32 [[TMP9]], [[TMP10]] +; X32-NEXT: br i1 [[TMP11]], label [[RES_BLOCK]], label [[LOADBB2:%.*]] +; X32: loadbb2: +; X32-NEXT: [[TMP12:%.*]] = bitcast i8* [[X]] to i32* +; X32-NEXT: [[TMP13:%.*]] = bitcast i8* [[Y]] to i32* +; X32-NEXT: [[TMP14:%.*]] = getelementptr i32, i32* [[TMP12]], i32 2 +; X32-NEXT: [[TMP15:%.*]] = getelementptr i32, i32* [[TMP13]], i32 2 +; X32-NEXT: [[TMP16:%.*]] = load i32, i32* [[TMP14]] +; X32-NEXT: [[TMP17:%.*]] = load i32, i32* [[TMP15]] +; X32-NEXT: [[TMP18:%.*]] = icmp ne i32 [[TMP16]], [[TMP17]] +; X32-NEXT: br i1 [[TMP18]], label [[RES_BLOCK]], label [[LOADBB3:%.*]] +; X32: loadbb3: +; X32-NEXT: [[TMP19:%.*]] = getelementptr i8, i8* [[X]], i8 12 +; X32-NEXT: [[TMP20:%.*]] = getelementptr i8, i8* [[Y]], i8 12 +; X32-NEXT: [[TMP21:%.*]] = load i8, i8* [[TMP19]] +; X32-NEXT: [[TMP22:%.*]] = load i8, i8* [[TMP20]] +; X32-NEXT: [[TMP23:%.*]] = icmp ne i8 [[TMP21]], [[TMP22]] +; X32-NEXT: br i1 [[TMP23]], label [[RES_BLOCK]], label [[ENDBLOCK]] +; X32: endblock: +; X32-NEXT: [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB3]] ], [ 1, [[RES_BLOCK]] ] +; X32-NEXT: [[CMP:%.*]] = icmp eq i32 [[PHI_RES]], 0 +; X32-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32 +; X32-NEXT: ret i32 [[CONV]] +; +; X64-LABEL: @cmp_eq13( +; X64-NEXT: loadbb: +; X64-NEXT: [[TMP0:%.*]] = bitcast i8* [[X:%.*]] to i64* +; X64-NEXT: [[TMP1:%.*]] = bitcast i8* [[Y:%.*]] to i64* +; X64-NEXT: [[TMP2:%.*]] = load i64, i64* [[TMP0]] +; X64-NEXT: [[TMP3:%.*]] = load i64, i64* [[TMP1]] +; X64-NEXT: [[TMP4:%.*]] = icmp ne i64 [[TMP2]], [[TMP3]] +; X64-NEXT: br i1 [[TMP4]], label [[RES_BLOCK:%.*]], label [[LOADBB1:%.*]] +; X64: res_block: +; X64-NEXT: br label [[ENDBLOCK:%.*]] +; X64: loadbb1: +; X64-NEXT: [[TMP5:%.*]] = bitcast i8* [[X]] to i32* +; X64-NEXT: [[TMP6:%.*]] = bitcast i8* [[Y]] to i32* +; X64-NEXT: [[TMP7:%.*]] = getelementptr i32, i32* [[TMP5]], i32 2 +; X64-NEXT: [[TMP8:%.*]] = getelementptr i32, i32* [[TMP6]], i32 2 +; X64-NEXT: [[TMP9:%.*]] = load i32, i32* [[TMP7]] +; X64-NEXT: [[TMP10:%.*]] = load i32, i32* [[TMP8]] +; X64-NEXT: [[TMP11:%.*]] = icmp ne i32 [[TMP9]], [[TMP10]] +; X64-NEXT: br i1 [[TMP11]], label [[RES_BLOCK]], label [[LOADBB2:%.*]] +; X64: loadbb2: +; X64-NEXT: [[TMP12:%.*]] = getelementptr i8, i8* [[X]], i8 12 +; X64-NEXT: [[TMP13:%.*]] = getelementptr i8, i8* [[Y]], i8 12 +; X64-NEXT: [[TMP14:%.*]] = load i8, i8* [[TMP12]] +; X64-NEXT: [[TMP15:%.*]] = load i8, i8* [[TMP13]] +; X64-NEXT: [[TMP16:%.*]] = icmp ne i8 [[TMP14]], [[TMP15]] +; X64-NEXT: br i1 [[TMP16]], label [[RES_BLOCK]], label [[ENDBLOCK]] +; X64: endblock: +; X64-NEXT: [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB2]] ], [ 1, [[RES_BLOCK]] ] +; X64-NEXT: [[CMP:%.*]] = icmp eq i32 [[PHI_RES]], 0 +; X64-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32 +; X64-NEXT: ret i32 [[CONV]] ; %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 13) %cmp = icmp eq i32 %call, 0 @@ -348,11 +1633,82 @@ define i32 @cmp_eq13(i8* nocapture readonly %x, i8* nocapture readonly %y) { } define i32 @cmp_eq14(i8* nocapture readonly %x, i8* nocapture readonly %y) { -; ALL-LABEL: @cmp_eq14( -; ALL-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* [[X:%.*]], i8* [[Y:%.*]], i64 14) -; ALL-NEXT: [[CMP:%.*]] = icmp eq i32 [[CALL]], 0 -; ALL-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32 -; ALL-NEXT: ret i32 [[CONV]] +; X32-LABEL: @cmp_eq14( +; X32-NEXT: loadbb: +; X32-NEXT: [[TMP0:%.*]] = bitcast i8* [[X:%.*]] to i32* +; X32-NEXT: [[TMP1:%.*]] = bitcast i8* [[Y:%.*]] to i32* +; X32-NEXT: [[TMP2:%.*]] = load i32, i32* [[TMP0]] +; X32-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP1]] +; X32-NEXT: [[TMP4:%.*]] = icmp ne i32 [[TMP2]], [[TMP3]] +; X32-NEXT: br i1 [[TMP4]], label [[RES_BLOCK:%.*]], label [[LOADBB1:%.*]] +; X32: res_block: +; X32-NEXT: br label [[ENDBLOCK:%.*]] +; X32: loadbb1: +; X32-NEXT: [[TMP5:%.*]] = bitcast i8* [[X]] to i32* +; X32-NEXT: [[TMP6:%.*]] = bitcast i8* [[Y]] to i32* +; X32-NEXT: [[TMP7:%.*]] = getelementptr i32, i32* [[TMP5]], i32 1 +; X32-NEXT: [[TMP8:%.*]] = getelementptr i32, i32* [[TMP6]], i32 1 +; X32-NEXT: [[TMP9:%.*]] = load i32, i32* [[TMP7]] +; X32-NEXT: [[TMP10:%.*]] = load i32, i32* [[TMP8]] +; X32-NEXT: [[TMP11:%.*]] = icmp ne i32 [[TMP9]], [[TMP10]] +; X32-NEXT: br i1 [[TMP11]], label [[RES_BLOCK]], label [[LOADBB2:%.*]] +; X32: loadbb2: +; X32-NEXT: [[TMP12:%.*]] = bitcast i8* [[X]] to i32* +; X32-NEXT: [[TMP13:%.*]] = bitcast i8* [[Y]] to i32* +; X32-NEXT: [[TMP14:%.*]] = getelementptr i32, i32* [[TMP12]], i32 2 +; X32-NEXT: [[TMP15:%.*]] = getelementptr i32, i32* [[TMP13]], i32 2 +; X32-NEXT: [[TMP16:%.*]] = load i32, i32* [[TMP14]] +; X32-NEXT: [[TMP17:%.*]] = load i32, i32* [[TMP15]] +; X32-NEXT: [[TMP18:%.*]] = icmp ne i32 [[TMP16]], [[TMP17]] +; X32-NEXT: br i1 [[TMP18]], label [[RES_BLOCK]], label [[LOADBB3:%.*]] +; X32: loadbb3: +; X32-NEXT: [[TMP19:%.*]] = bitcast i8* [[X]] to i16* +; X32-NEXT: [[TMP20:%.*]] = bitcast i8* [[Y]] to i16* +; X32-NEXT: [[TMP21:%.*]] = getelementptr i16, i16* [[TMP19]], i16 6 +; X32-NEXT: [[TMP22:%.*]] = getelementptr i16, i16* [[TMP20]], i16 6 +; X32-NEXT: [[TMP23:%.*]] = load i16, i16* [[TMP21]] +; X32-NEXT: [[TMP24:%.*]] = load i16, i16* [[TMP22]] +; X32-NEXT: [[TMP25:%.*]] = icmp ne i16 [[TMP23]], [[TMP24]] +; X32-NEXT: br i1 [[TMP25]], label [[RES_BLOCK]], label [[ENDBLOCK]] +; X32: endblock: +; X32-NEXT: [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB3]] ], [ 1, [[RES_BLOCK]] ] +; X32-NEXT: [[CMP:%.*]] = icmp eq i32 [[PHI_RES]], 0 +; X32-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32 +; X32-NEXT: ret i32 [[CONV]] +; +; X64-LABEL: @cmp_eq14( +; X64-NEXT: loadbb: +; X64-NEXT: [[TMP0:%.*]] = bitcast i8* [[X:%.*]] to i64* +; X64-NEXT: [[TMP1:%.*]] = bitcast i8* [[Y:%.*]] to i64* +; X64-NEXT: [[TMP2:%.*]] = load i64, i64* [[TMP0]] +; X64-NEXT: [[TMP3:%.*]] = load i64, i64* [[TMP1]] +; X64-NEXT: [[TMP4:%.*]] = icmp ne i64 [[TMP2]], [[TMP3]] +; X64-NEXT: br i1 [[TMP4]], label [[RES_BLOCK:%.*]], label [[LOADBB1:%.*]] +; X64: res_block: +; X64-NEXT: br label [[ENDBLOCK:%.*]] +; X64: loadbb1: +; X64-NEXT: [[TMP5:%.*]] = bitcast i8* [[X]] to i32* +; X64-NEXT: [[TMP6:%.*]] = bitcast i8* [[Y]] to i32* +; X64-NEXT: [[TMP7:%.*]] = getelementptr i32, i32* [[TMP5]], i32 2 +; X64-NEXT: [[TMP8:%.*]] = getelementptr i32, i32* [[TMP6]], i32 2 +; X64-NEXT: [[TMP9:%.*]] = load i32, i32* [[TMP7]] +; X64-NEXT: [[TMP10:%.*]] = load i32, i32* [[TMP8]] +; X64-NEXT: [[TMP11:%.*]] = icmp ne i32 [[TMP9]], [[TMP10]] +; X64-NEXT: br i1 [[TMP11]], label [[RES_BLOCK]], label [[LOADBB2:%.*]] +; X64: loadbb2: +; X64-NEXT: [[TMP12:%.*]] = bitcast i8* [[X]] to i16* +; X64-NEXT: [[TMP13:%.*]] = bitcast i8* [[Y]] to i16* +; X64-NEXT: [[TMP14:%.*]] = getelementptr i16, i16* [[TMP12]], i16 6 +; X64-NEXT: [[TMP15:%.*]] = getelementptr i16, i16* [[TMP13]], i16 6 +; X64-NEXT: [[TMP16:%.*]] = load i16, i16* [[TMP14]] +; X64-NEXT: [[TMP17:%.*]] = load i16, i16* [[TMP15]] +; X64-NEXT: [[TMP18:%.*]] = icmp ne i16 [[TMP16]], [[TMP17]] +; X64-NEXT: br i1 [[TMP18]], label [[RES_BLOCK]], label [[ENDBLOCK]] +; X64: endblock: +; X64-NEXT: [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB2]] ], [ 1, [[RES_BLOCK]] ] +; X64-NEXT: [[CMP:%.*]] = icmp eq i32 [[PHI_RES]], 0 +; X64-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32 +; X64-NEXT: ret i32 [[CONV]] ; %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 14) %cmp = icmp eq i32 %call, 0 @@ -361,11 +1717,52 @@ define i32 @cmp_eq14(i8* nocapture readonly %x, i8* nocapture readonly %y) { } define i32 @cmp_eq15(i8* nocapture readonly %x, i8* nocapture readonly %y) { -; ALL-LABEL: @cmp_eq15( -; ALL-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* [[X:%.*]], i8* [[Y:%.*]], i64 15) -; ALL-NEXT: [[CMP:%.*]] = icmp eq i32 [[CALL]], 0 -; ALL-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32 -; ALL-NEXT: ret i32 [[CONV]] +; X32-LABEL: @cmp_eq15( +; X32-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* [[X:%.*]], i8* [[Y:%.*]], i64 15) +; X32-NEXT: [[CMP:%.*]] = icmp eq i32 [[CALL]], 0 +; X32-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32 +; X32-NEXT: ret i32 [[CONV]] +; +; X64-LABEL: @cmp_eq15( +; X64-NEXT: loadbb: +; X64-NEXT: [[TMP0:%.*]] = bitcast i8* [[X:%.*]] to i64* +; X64-NEXT: [[TMP1:%.*]] = bitcast i8* [[Y:%.*]] to i64* +; X64-NEXT: [[TMP2:%.*]] = load i64, i64* [[TMP0]] +; X64-NEXT: [[TMP3:%.*]] = load i64, i64* [[TMP1]] +; X64-NEXT: [[TMP4:%.*]] = icmp ne i64 [[TMP2]], [[TMP3]] +; X64-NEXT: br i1 [[TMP4]], label [[RES_BLOCK:%.*]], label [[LOADBB1:%.*]] +; X64: res_block: +; X64-NEXT: br label [[ENDBLOCK:%.*]] +; X64: loadbb1: +; X64-NEXT: [[TMP5:%.*]] = bitcast i8* [[X]] to i32* +; X64-NEXT: [[TMP6:%.*]] = bitcast i8* [[Y]] to i32* +; X64-NEXT: [[TMP7:%.*]] = getelementptr i32, i32* [[TMP5]], i32 2 +; X64-NEXT: [[TMP8:%.*]] = getelementptr i32, i32* [[TMP6]], i32 2 +; X64-NEXT: [[TMP9:%.*]] = load i32, i32* [[TMP7]] +; X64-NEXT: [[TMP10:%.*]] = load i32, i32* [[TMP8]] +; X64-NEXT: [[TMP11:%.*]] = icmp ne i32 [[TMP9]], [[TMP10]] +; X64-NEXT: br i1 [[TMP11]], label [[RES_BLOCK]], label [[LOADBB2:%.*]] +; X64: loadbb2: +; X64-NEXT: [[TMP12:%.*]] = bitcast i8* [[X]] to i16* +; X64-NEXT: [[TMP13:%.*]] = bitcast i8* [[Y]] to i16* +; X64-NEXT: [[TMP14:%.*]] = getelementptr i16, i16* [[TMP12]], i16 6 +; X64-NEXT: [[TMP15:%.*]] = getelementptr i16, i16* [[TMP13]], i16 6 +; X64-NEXT: [[TMP16:%.*]] = load i16, i16* [[TMP14]] +; X64-NEXT: [[TMP17:%.*]] = load i16, i16* [[TMP15]] +; X64-NEXT: [[TMP18:%.*]] = icmp ne i16 [[TMP16]], [[TMP17]] +; X64-NEXT: br i1 [[TMP18]], label [[RES_BLOCK]], label [[LOADBB3:%.*]] +; X64: loadbb3: +; X64-NEXT: [[TMP19:%.*]] = getelementptr i8, i8* [[X]], i8 14 +; X64-NEXT: [[TMP20:%.*]] = getelementptr i8, i8* [[Y]], i8 14 +; X64-NEXT: [[TMP21:%.*]] = load i8, i8* [[TMP19]] +; X64-NEXT: [[TMP22:%.*]] = load i8, i8* [[TMP20]] +; X64-NEXT: [[TMP23:%.*]] = icmp ne i8 [[TMP21]], [[TMP22]] +; X64-NEXT: br i1 [[TMP23]], label [[RES_BLOCK]], label [[ENDBLOCK]] +; X64: endblock: +; X64-NEXT: [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB3]] ], [ 1, [[RES_BLOCK]] ] +; X64-NEXT: [[CMP:%.*]] = icmp eq i32 [[PHI_RES]], 0 +; X64-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32 +; X64-NEXT: ret i32 [[CONV]] ; %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 15) %cmp = icmp eq i32 %call, 0 @@ -374,11 +1771,73 @@ define i32 @cmp_eq15(i8* nocapture readonly %x, i8* nocapture readonly %y) { } define i32 @cmp_eq16(i8* nocapture readonly %x, i8* nocapture readonly %y) { -; ALL-LABEL: @cmp_eq16( -; ALL-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* [[X:%.*]], i8* [[Y:%.*]], i64 16) -; ALL-NEXT: [[CMP:%.*]] = icmp eq i32 [[CALL]], 0 -; ALL-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32 -; ALL-NEXT: ret i32 [[CONV]] +; X32-LABEL: @cmp_eq16( +; X32-NEXT: loadbb: +; X32-NEXT: [[TMP0:%.*]] = bitcast i8* [[X:%.*]] to i32* +; X32-NEXT: [[TMP1:%.*]] = bitcast i8* [[Y:%.*]] to i32* +; X32-NEXT: [[TMP2:%.*]] = load i32, i32* [[TMP0]] +; X32-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP1]] +; X32-NEXT: [[TMP4:%.*]] = icmp ne i32 [[TMP2]], [[TMP3]] +; X32-NEXT: br i1 [[TMP4]], label [[RES_BLOCK:%.*]], label [[LOADBB1:%.*]] +; X32: res_block: +; X32-NEXT: br label [[ENDBLOCK:%.*]] +; X32: loadbb1: +; X32-NEXT: [[TMP5:%.*]] = bitcast i8* [[X]] to i32* +; X32-NEXT: [[TMP6:%.*]] = bitcast i8* [[Y]] to i32* +; X32-NEXT: [[TMP7:%.*]] = getelementptr i32, i32* [[TMP5]], i32 1 +; X32-NEXT: [[TMP8:%.*]] = getelementptr i32, i32* [[TMP6]], i32 1 +; X32-NEXT: [[TMP9:%.*]] = load i32, i32* [[TMP7]] +; X32-NEXT: [[TMP10:%.*]] = load i32, i32* [[TMP8]] +; X32-NEXT: [[TMP11:%.*]] = icmp ne i32 [[TMP9]], [[TMP10]] +; X32-NEXT: br i1 [[TMP11]], label [[RES_BLOCK]], label [[LOADBB2:%.*]] +; X32: loadbb2: +; X32-NEXT: [[TMP12:%.*]] = bitcast i8* [[X]] to i32* +; X32-NEXT: [[TMP13:%.*]] = bitcast i8* [[Y]] to i32* +; X32-NEXT: [[TMP14:%.*]] = getelementptr i32, i32* [[TMP12]], i32 2 +; X32-NEXT: [[TMP15:%.*]] = getelementptr i32, i32* [[TMP13]], i32 2 +; X32-NEXT: [[TMP16:%.*]] = load i32, i32* [[TMP14]] +; X32-NEXT: [[TMP17:%.*]] = load i32, i32* [[TMP15]] +; X32-NEXT: [[TMP18:%.*]] = icmp ne i32 [[TMP16]], [[TMP17]] +; X32-NEXT: br i1 [[TMP18]], label [[RES_BLOCK]], label [[LOADBB3:%.*]] +; X32: loadbb3: +; X32-NEXT: [[TMP19:%.*]] = bitcast i8* [[X]] to i32* +; X32-NEXT: [[TMP20:%.*]] = bitcast i8* [[Y]] to i32* +; X32-NEXT: [[TMP21:%.*]] = getelementptr i32, i32* [[TMP19]], i32 3 +; X32-NEXT: [[TMP22:%.*]] = getelementptr i32, i32* [[TMP20]], i32 3 +; X32-NEXT: [[TMP23:%.*]] = load i32, i32* [[TMP21]] +; X32-NEXT: [[TMP24:%.*]] = load i32, i32* [[TMP22]] +; X32-NEXT: [[TMP25:%.*]] = icmp ne i32 [[TMP23]], [[TMP24]] +; X32-NEXT: br i1 [[TMP25]], label [[RES_BLOCK]], label [[ENDBLOCK]] +; X32: endblock: +; X32-NEXT: [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB3]] ], [ 1, [[RES_BLOCK]] ] +; X32-NEXT: [[CMP:%.*]] = icmp eq i32 [[PHI_RES]], 0 +; X32-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32 +; X32-NEXT: ret i32 [[CONV]] +; +; X64-LABEL: @cmp_eq16( +; X64-NEXT: loadbb: +; X64-NEXT: [[TMP0:%.*]] = bitcast i8* [[X:%.*]] to i64* +; X64-NEXT: [[TMP1:%.*]] = bitcast i8* [[Y:%.*]] to i64* +; X64-NEXT: [[TMP2:%.*]] = load i64, i64* [[TMP0]] +; X64-NEXT: [[TMP3:%.*]] = load i64, i64* [[TMP1]] +; X64-NEXT: [[TMP4:%.*]] = icmp ne i64 [[TMP2]], [[TMP3]] +; X64-NEXT: br i1 [[TMP4]], label [[RES_BLOCK:%.*]], label [[LOADBB1:%.*]] +; X64: res_block: +; X64-NEXT: br label [[ENDBLOCK:%.*]] +; X64: loadbb1: +; X64-NEXT: [[TMP5:%.*]] = bitcast i8* [[X]] to i64* +; X64-NEXT: [[TMP6:%.*]] = bitcast i8* [[Y]] to i64* +; X64-NEXT: [[TMP7:%.*]] = getelementptr i64, i64* [[TMP5]], i64 1 +; X64-NEXT: [[TMP8:%.*]] = getelementptr i64, i64* [[TMP6]], i64 1 +; X64-NEXT: [[TMP9:%.*]] = load i64, i64* [[TMP7]] +; X64-NEXT: [[TMP10:%.*]] = load i64, i64* [[TMP8]] +; X64-NEXT: [[TMP11:%.*]] = icmp ne i64 [[TMP9]], [[TMP10]] +; X64-NEXT: br i1 [[TMP11]], label [[RES_BLOCK]], label [[ENDBLOCK]] +; X64: endblock: +; X64-NEXT: [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ 1, [[RES_BLOCK]] ] +; X64-NEXT: [[CMP:%.*]] = icmp eq i32 [[PHI_RES]], 0 +; X64-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32 +; X64-NEXT: ret i32 [[CONV]] ; %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 16) %cmp = icmp eq i32 %call, 0 -- 2.40.0