return false;
}
- /// Should we merge stores after Legalization (generally
- /// better quality) or before (simpler)
+ /// Allow store merging after legalization in addition to before legalization.
+ /// This may catch stores that do not exist earlier (e.g., stores created from
+ /// intrinsics).
virtual bool mergeStoresAfterLegalization() const { return false; }
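
A target opts in to the extra pass with a one-line override of the hook. A minimal sketch, assuming a hypothetical MyTargetLowering subclass (illustrative only, not part of this patch):

#include "llvm/Target/TargetLowering.h"

class MyTargetLowering : public llvm::TargetLowering {
public:
  explicit MyTargetLowering(const llvm::TargetMachine &TM)
      : TargetLowering(TM) {}

  // Opt in: DAGCombiner will also run store merging after legalization,
  // catching stores that are materialized late (e.g. by intrinsic lowering).
  bool mergeStoresAfterLegalization() const override { return true; }
};
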
/// Returns if it's reasonable to merge stores to MemVT size.
Ptr, ST->getMemoryVT(), ST->getMemOperand());
}
- // Only perform this optimization before the types are legal, because we
- // don't want to perform this optimization on every DAGCombine invocation.
- if ((TLI.mergeStoresAfterLegalization()) ? Level == AfterLegalizeDAG
- : !LegalTypes) {
+ // Always perform this optimization before the types are legal. If the
+ // target opts in, also try it after legalization to catch stores that
+ // were created by intrinsics or other nodes.
+ if (!LegalTypes || TLI.mergeStoresAfterLegalization()) {
for (;;) {
// There can be multiple store sequences on the same chain.
// Keep trying to merge store sequences until we are unable to do so
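
To make the gating change explicit, here are the old and new predicates side by side as standalone helpers (hypothetical helper names; CombineLevel, AfterLegalizeDAG, LegalTypes, and the TLI hook are the existing DAGCombiner names):

// Old gating: merging ran at exactly one point, selected by the hook --
// either only at the AfterLegalizeDAG combine, or only before type
// legalization, never both.
static bool shouldMergeStoresOld(const llvm::TargetLowering &TLI,
                                 llvm::CombineLevel Level, bool LegalTypes) {
  return TLI.mergeStoresAfterLegalization() ? Level == llvm::AfterLegalizeDAG
                                            : !LegalTypes;
}

// New gating: always run before type legalization, and additionally on
// every later combine when the target opts in.
static bool shouldMergeStoresNew(const llvm::TargetLowering &TLI,
                                 bool LegalTypes) {
  return !LegalTypes || TLI.mergeStoresAfterLegalization();
}
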
; AVX1-NEXT: pushq %r13
; AVX1-NEXT: pushq %r12
; AVX1-NEXT: pushq %rbx
-; AVX1-NEXT: vmovq %xmm0, %rcx
+; AVX1-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; AVX1-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
+; AVX1-NEXT: movq -{{[0-9]+}}(%rsp), %rdx
; AVX1-NEXT: movq %rcx, %r8
; AVX1-NEXT: movq %rcx, %r9
; AVX1-NEXT: movq %rcx, %r10
; AVX1-NEXT: movq %rcx, %r11
; AVX1-NEXT: movq %rcx, %r14
; AVX1-NEXT: movq %rcx, %r15
-; AVX1-NEXT: vpextrq $1, %xmm0, %rdx
; AVX1-NEXT: movq %rdx, %r12
; AVX1-NEXT: movq %rdx, %r13
-; AVX1-NEXT: movq %rdx, %rbx
-; AVX1-NEXT: movq %rdx, %rax
; AVX1-NEXT: movq %rdx, %rdi
+; AVX1-NEXT: movq %rdx, %rax
; AVX1-NEXT: movq %rdx, %rsi
+; AVX1-NEXT: movq %rdx, %rbx
; AVX1-NEXT: movq %rdx, %rbp
; AVX1-NEXT: andb $15, %dl
; AVX1-NEXT: movb %dl, -{{[0-9]+}}(%rsp)
; AVX1-NEXT: shrq $56, %rbp
; AVX1-NEXT: andb $15, %bpl
; AVX1-NEXT: movb %bpl, -{{[0-9]+}}(%rsp)
-; AVX1-NEXT: shrq $48, %rsi
+; AVX1-NEXT: shrq $48, %rbx
+; AVX1-NEXT: andb $15, %bl
+; AVX1-NEXT: movb %bl, -{{[0-9]+}}(%rsp)
+; AVX1-NEXT: shrq $40, %rsi
; AVX1-NEXT: andb $15, %sil
; AVX1-NEXT: movb %sil, -{{[0-9]+}}(%rsp)
-; AVX1-NEXT: shrq $40, %rdi
-; AVX1-NEXT: andb $15, %dil
-; AVX1-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
; AVX1-NEXT: shrq $32, %rax
; AVX1-NEXT: andb $15, %al
; AVX1-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; AVX1-NEXT: shrq $24, %rbx
-; AVX1-NEXT: andb $15, %bl
-; AVX1-NEXT: movb %bl, -{{[0-9]+}}(%rsp)
+; AVX1-NEXT: shrq $24, %rdi
+; AVX1-NEXT: andb $15, %dil
+; AVX1-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
; AVX1-NEXT: shrq $16, %r13
; AVX1-NEXT: andb $15, %r13b
; AVX1-NEXT: movb %r13b, -{{[0-9]+}}(%rsp)
; AVX2-NEXT: pushq %r13
; AVX2-NEXT: pushq %r12
; AVX2-NEXT: pushq %rbx
-; AVX2-NEXT: vmovq %xmm0, %rcx
+; AVX2-NEXT: vmovdqa %xmm0, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
+; AVX2-NEXT: movq -{{[0-9]+}}(%rsp), %rdx
; AVX2-NEXT: movq %rcx, %r8
; AVX2-NEXT: movq %rcx, %r9
; AVX2-NEXT: movq %rcx, %r10
; AVX2-NEXT: movq %rcx, %r11
; AVX2-NEXT: movq %rcx, %r14
; AVX2-NEXT: movq %rcx, %r15
-; AVX2-NEXT: vpextrq $1, %xmm0, %rdx
; AVX2-NEXT: movq %rdx, %r12
; AVX2-NEXT: movq %rdx, %r13
-; AVX2-NEXT: movq %rdx, %rbx
-; AVX2-NEXT: movq %rdx, %rax
; AVX2-NEXT: movq %rdx, %rdi
+; AVX2-NEXT: movq %rdx, %rax
; AVX2-NEXT: movq %rdx, %rsi
+; AVX2-NEXT: movq %rdx, %rbx
; AVX2-NEXT: movq %rdx, %rbp
; AVX2-NEXT: andb $15, %dl
; AVX2-NEXT: movb %dl, -{{[0-9]+}}(%rsp)
; AVX2-NEXT: shrq $56, %rbp
; AVX2-NEXT: andb $15, %bpl
; AVX2-NEXT: movb %bpl, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT: shrq $48, %rsi
+; AVX2-NEXT: shrq $48, %rbx
+; AVX2-NEXT: andb $15, %bl
+; AVX2-NEXT: movb %bl, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT: shrq $40, %rsi
; AVX2-NEXT: andb $15, %sil
; AVX2-NEXT: movb %sil, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT: shrq $40, %rdi
-; AVX2-NEXT: andb $15, %dil
-; AVX2-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
; AVX2-NEXT: shrq $32, %rax
; AVX2-NEXT: andb $15, %al
; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT: shrq $24, %rbx
-; AVX2-NEXT: andb $15, %bl
-; AVX2-NEXT: movb %bl, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT: shrq $24, %rdi
+; AVX2-NEXT: andb $15, %dil
+; AVX2-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
; AVX2-NEXT: shrq $16, %r13
; AVX2-NEXT: andb $15, %r13b
; AVX2-NEXT: movb %r13b, -{{[0-9]+}}(%rsp)
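
Note: in both AVX runs the substantive change is at the top of the function. The two 64-bit lane reads (vmovq %xmm0, %rcx and vpextrq $1, %xmm0, %rdx) are replaced by a single 16-byte spill of %xmm0 to the stack plus two scalar movq reloads. The shift/mask/store-byte sequence that follows is unchanged except that the register allocator hands out different GPRs for the shifted copies, which accounts for the reordered movq/shrq lines.
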
; X32-SSE1-NEXT: movzwl 18(%ecx), %ecx
; X32-SSE1-NEXT: movw %di, 10(%eax)
; X32-SSE1-NEXT: movw %cx, 14(%eax)
-; X32-SSE1-NEXT: movl %edx, (%eax)
; X32-SSE1-NEXT: movl %esi, 6(%eax)
+; X32-SSE1-NEXT: movl %edx, (%eax)
; X32-SSE1-NEXT: popl %esi
; X32-SSE1-NEXT: popl %edi
; X32-SSE1-NEXT: retl $4
; X32-SSE1-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X32-SSE1-NEXT: movl 8(%ecx), %edx
; X32-SSE1-NEXT: movzwl 14(%ecx), %ecx
-; X32-SSE1-NEXT: movl %edx, (%eax)
; X32-SSE1-NEXT: movw %cx, 6(%eax)
+; X32-SSE1-NEXT: movl %edx, (%eax)
; X32-SSE1-NEXT: movl $0, 12(%eax)
; X32-SSE1-NEXT: movl $0, 8(%eax)
; X32-SSE1-NEXT: retl $4
; X32-SSE1-NEXT: movb %cl, 15(%eax)
; X32-SSE1-NEXT: movw %bx, 11(%eax)
; X32-SSE1-NEXT: movl %edi, 7(%eax)
-; X32-SSE1-NEXT: movw %bp, (%eax)
; X32-SSE1-NEXT: movl %esi, 3(%eax)
+; X32-SSE1-NEXT: movw %bp, (%eax)
; X32-SSE1-NEXT: popl %esi
; X32-SSE1-NEXT: popl %edi
; X32-SSE1-NEXT: popl %ebx
; X32-SSE1-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X32-SSE1-NEXT: movzwl (%ecx), %edx
; X32-SSE1-NEXT: movb 3(%ecx), %cl
-; X32-SSE1-NEXT: movw %dx, (%eax)
; X32-SSE1-NEXT: movb %cl, 3(%eax)
+; X32-SSE1-NEXT: movw %dx, (%eax)
; X32-SSE1-NEXT: movb $0, 15(%eax)
; X32-SSE1-NEXT: movw $0, 13(%eax)
; X32-SSE1-NEXT: movw $0, 6(%eax)
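
Note: the X32-SSE1 diffs add or remove no instructions; the merged stores are simply emitted in a different order, with the store to the lowest address, (%eax), now issued after the higher-offset stores.
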
define void @redundant_stores_merging() {
; CHECK-LABEL: redundant_stores_merging:
; CHECK: # BB#0:
-; CHECK-NEXT: movabsq $1958505086977, %rax # imm = 0x1C800000001
+; CHECK-NEXT: movabsq $528280977409, %rax # imm = 0x7B00000001
; CHECK-NEXT: movq %rax, e+{{.*}}(%rip)
+; CHECK-NEXT: movl $456, e+{{.*}}(%rip) # imm = 0x1C8
; CHECK-NEXT: retq
store i32 1, i32* getelementptr inbounds (%structTy, %structTy* @e, i64 0, i32 1), align 4
store i32 123, i32* getelementptr inbounds (%structTy, %structTy* @e, i64 0, i32 2), align 4
define void @redundant_stores_merging_reverse() {
; CHECK-LABEL: redundant_stores_merging_reverse:
; CHECK: # BB#0:
-; CHECK-NEXT: movabsq $1958505086977, %rax # imm = 0x1C800000001
+; CHECK-NEXT: movabsq $528280977409, %rax # imm = 0x7B00000001
; CHECK-NEXT: movq %rax, e+{{.*}}(%rip)
+; CHECK-NEXT: movl $456, e+{{.*}}(%rip) # imm = 0x1C8
; CHECK-NEXT: retq
store i32 123, i32* getelementptr inbounds (%structTy, %structTy* @e, i64 0, i32 2), align 4
store i32 456, i32* getelementptr inbounds (%structTy, %structTy* @e, i64 0, i32 2), align 4
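
Worked decoding of the immediates (both functions produce identical code): the old merged constant 0x1C800000001 is (456 << 32) | 1, i.e. 456 * 2^32 + 1 = 1958505086977, so the dead store of 123 had been eliminated before merging. The new constant 0x7B00000001 is (123 << 32) | 1 = 528280977409, and the live value 456 (0x1C8) is then written over the upper word by the extra movl. The output is still correct, but the redundant store is no longer folded into the merge.
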
define void @extract_vector_store_16_consecutive_bytes(<2 x i64> %v, i8* %ptr) #0 {
; CHECK-LABEL: extract_vector_store_16_consecutive_bytes:
; CHECK: # BB#0:
-; CHECK-NEXT: vpextrb $0, %xmm0, (%rdi)
-; CHECK-NEXT: vpextrb $1, %xmm0, 1(%rdi)
-; CHECK-NEXT: vpextrb $2, %xmm0, 2(%rdi)
-; CHECK-NEXT: vpextrb $3, %xmm0, 3(%rdi)
-; CHECK-NEXT: vpextrb $4, %xmm0, 4(%rdi)
-; CHECK-NEXT: vpextrb $5, %xmm0, 5(%rdi)
-; CHECK-NEXT: vpextrb $6, %xmm0, 6(%rdi)
-; CHECK-NEXT: vpextrb $7, %xmm0, 7(%rdi)
-; CHECK-NEXT: vpextrb $8, %xmm0, 8(%rdi)
-; CHECK-NEXT: vpextrb $9, %xmm0, 9(%rdi)
-; CHECK-NEXT: vpextrb $10, %xmm0, 10(%rdi)
-; CHECK-NEXT: vpextrb $11, %xmm0, 11(%rdi)
-; CHECK-NEXT: vpextrb $12, %xmm0, 12(%rdi)
-; CHECK-NEXT: vpextrb $13, %xmm0, 13(%rdi)
-; CHECK-NEXT: vpextrb $14, %xmm0, 14(%rdi)
-; CHECK-NEXT: vpextrb $15, %xmm0, 15(%rdi)
+; CHECK-NEXT: vmovups %xmm0, (%rdi)
; CHECK-NEXT: retq
%bc = bitcast <2 x i64> %v to <16 x i8>
%ext00 = extractelement <16 x i8> %bc, i32 0
define void @extract_vector_store_32_consecutive_bytes(<4 x i64> %v, i8* %ptr) #0 {
; CHECK-LABEL: extract_vector_store_32_consecutive_bytes:
; CHECK: # BB#0:
-; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm1
-; CHECK-NEXT: vpextrb $0, %xmm0, (%rdi)
-; CHECK-NEXT: vpextrb $1, %xmm0, 1(%rdi)
-; CHECK-NEXT: vpextrb $2, %xmm0, 2(%rdi)
-; CHECK-NEXT: vpextrb $3, %xmm0, 3(%rdi)
-; CHECK-NEXT: vpextrb $4, %xmm0, 4(%rdi)
-; CHECK-NEXT: vpextrb $5, %xmm0, 5(%rdi)
-; CHECK-NEXT: vpextrb $6, %xmm0, 6(%rdi)
-; CHECK-NEXT: vpextrb $7, %xmm0, 7(%rdi)
-; CHECK-NEXT: vpextrb $8, %xmm0, 8(%rdi)
-; CHECK-NEXT: vpextrb $9, %xmm0, 9(%rdi)
-; CHECK-NEXT: vpextrb $10, %xmm0, 10(%rdi)
-; CHECK-NEXT: vpextrb $11, %xmm0, 11(%rdi)
-; CHECK-NEXT: vpextrb $12, %xmm0, 12(%rdi)
-; CHECK-NEXT: vpextrb $13, %xmm0, 13(%rdi)
-; CHECK-NEXT: vpextrb $14, %xmm0, 14(%rdi)
-; CHECK-NEXT: vpextrb $15, %xmm0, 15(%rdi)
-; CHECK-NEXT: vpextrb $0, %xmm1, 16(%rdi)
-; CHECK-NEXT: vpextrb $1, %xmm1, 17(%rdi)
-; CHECK-NEXT: vpextrb $2, %xmm1, 18(%rdi)
-; CHECK-NEXT: vpextrb $3, %xmm1, 19(%rdi)
-; CHECK-NEXT: vpextrb $4, %xmm1, 20(%rdi)
-; CHECK-NEXT: vpextrb $5, %xmm1, 21(%rdi)
-; CHECK-NEXT: vpextrb $6, %xmm1, 22(%rdi)
-; CHECK-NEXT: vpextrb $7, %xmm1, 23(%rdi)
-; CHECK-NEXT: vpextrb $8, %xmm1, 24(%rdi)
-; CHECK-NEXT: vpextrb $9, %xmm1, 25(%rdi)
-; CHECK-NEXT: vpextrb $10, %xmm1, 26(%rdi)
-; CHECK-NEXT: vpextrb $11, %xmm1, 27(%rdi)
-; CHECK-NEXT: vpextrb $12, %xmm1, 28(%rdi)
-; CHECK-NEXT: vpextrb $13, %xmm1, 29(%rdi)
-; CHECK-NEXT: vpextrb $14, %xmm1, 30(%rdi)
-; CHECK-NEXT: vpextrb $15, %xmm1, 31(%rdi)
+; CHECK-NEXT: vmovups %ymm0, (%rdi)
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
%bc = bitcast <4 x i64> %v to <32 x i8>
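
Note: both extract-store tests now collapse to a single unaligned vector store. The sixteen (respectively thirty-two) single-byte vpextrb stores to consecutive offsets are recognized as one contiguous run and merged into a vmovups of the source register; in the 256-bit case this also removes the vextractf128 that previously split off the upper half.
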