From 154874adc5349d2c70926e53d5fcb7e82b0a661b Mon Sep 17 00:00:00 2001
From: Nirav Dave
Date: Wed, 13 Mar 2019 17:07:09 +0000
Subject: [PATCH] [DAGCombiner] If a TokenFactor would be merged into its user,
 consider the user later.

Summary:
A number of optimizations are inhibited by single-use TokenFactors not being
merged into the TokenFactor that uses them. This change makes us consider
whether we can do the merge immediately.

Most test changes here are due to the change in visitation order, which causes
minor reorderings and the associated reassociation of paired memory operations.

CodeGen tests with non-reordering changes:

  X86/aligned-variadic.ll -- memory-based add folded into the stored leaq
  value.

  X86/constant-combines.ll -- optimizes out the overlap between stores.

  X86/pr40631_deadstore_elision.ll -- folds a constant byte store into the
  preceding quad-word constant store.

Reviewers: RKSimon, craig.topper, spatel, efriedma, courbet

Reviewed By: courbet

Subscribers: dylanmckay, sdardis, nemanjai, jvesely, nhaehnle, javed.absar,
eraman, hiraditya, kbarton, jrtc27, atanasyan, jsji, llvm-commits

Tags: #llvm

Differential Revision: https://reviews.llvm.org/D59260

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@356068 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 6 +
 .../CodeGen/AArch64/aarch64_win64cc_vararg.ll | 8 +-
 test/CodeGen/AArch64/addr-of-ret-addr.ll | 2 +-
 test/CodeGen/AArch64/alloca.ll | 14 +-
 test/CodeGen/AArch64/arm64-memcpy-inline.ll | 16 +-
 test/CodeGen/AArch64/arm64-variadic-aapcs.ll | 27 +-
 test/CodeGen/AArch64/win64_vararg.ll | 32 +--
 test/CodeGen/AMDGPU/call-argument-types.ll | 2 +-
 .../ARM/2012-10-04-AAPCS-byval-align8.ll | 4 +-
 .../ARM/2012-10-04-FixedFrame-vs-byval.ll | 4 +-
 .../2014-02-21-byval-reg-split-alignment.ll | 8 +-
 test/CodeGen/ARM/memset-inline.ll | 12 +-
 test/CodeGen/ARM/thumb1_return_sequence.ll | 4 +-
 test/CodeGen/ARM/unaligned_load_store.ll | 16 +-
 test/CodeGen/AVR/calling-conv/c/basic.ll | 32 +--
 test/CodeGen/AVR/directmem.ll | 24 +-
 test/CodeGen/BPF/undef.ll | 28 +--
 test/CodeGen/MSP430/cc_args.ll | 8 +-
 test/CodeGen/Mips/v2i16tof32.ll | 22 +-
 test/CodeGen/PowerPC/f128-aggregates.ll | 86 +++---
 test/CodeGen/PowerPC/ppc64-byval-align.ll | 2 +-
 test/CodeGen/Thumb/frame-access.ll | 6 +-
 test/CodeGen/Thumb/mvn.ll | 12 +-
 test/CodeGen/X86/aligned-variadic.ll | 2 +-
 test/CodeGen/X86/atomic-idempotent.ll | 6 +-
 test/CodeGen/X86/avx-load-store.ll | 4 +-
 test/CodeGen/X86/btc_bts_btr.ll | 6 +-
 test/CodeGen/X86/combine-sbb.ll | 2 +-
 test/CodeGen/X86/constant-combines.ll | 2 +-
 test/CodeGen/X86/min-legal-vector-width.ll | 38 +--
 test/CodeGen/X86/musttail-varargs.ll | 20 +-
 test/CodeGen/X86/musttail.ll | 8 +-
 test/CodeGen/X86/nosse-vector.ll | 8 +-
 test/CodeGen/X86/oddshuffles.ll | 238 +++++++++---------
 test/CodeGen/X86/pr40631_deadstore_elision.ll | 3 +-
 test/CodeGen/X86/rotate.ll | 2 +-
 test/CodeGen/X86/rotate4.ll | 24 +-
 test/CodeGen/X86/sadd_sat_vec.ll | 4 +-
 test/CodeGen/X86/shift-and.ll | 2 +-
 test/CodeGen/X86/shrink_vmul-widen.ll | 64 ++---
 test/CodeGen/X86/shrink_vmul.ll | 64 ++---
 test/CodeGen/X86/ssub_sat_vec.ll | 4 +-
 test/CodeGen/X86/uadd_sat_vec.ll | 4 +-
 test/CodeGen/X86/usub_sat_vec.ll | 4 +-
 test/CodeGen/X86/vastart-defs-eflags.ll | 8 +-
 test/CodeGen/X86/vec_fpext.ll | 32 +--
 test/CodeGen/X86/widen_cast-2.ll | 4 +-
 test/CodeGen/X86/widen_load-2.ll | 40 +--
 test/CodeGen/X86/win64_frame.ll | 4 +-
 test/CodeGen/X86/win64_vararg.ll | 6 +-
 test/CodeGen/X86/x86-64-ms_abi-vararg.ll | 10 +-
 test/CodeGen/XCore/byVal.ll | 4 +-
 52 files changed, 498
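The functional change itself is only a few lines; the sketch below restates it
in context to make the DAGCombiner.cpp hunk easier to read. It is a paraphrase,
not the authoritative source: the surrounding code is abbreviated, and the
SmallVector/SmallPtrSet template arguments (element types and inline sizes) are
assumptions, since the flattened diff context in this patch does not preserve
them.

    // Sketch of the prologue of DAGCombiner::visitTokenFactor() after this
    // patch. Container template arguments are assumed; the new lines are the
    // hasOneUse() check and the AddToWorklist() call.
    SDValue DAGCombiner::visitTokenFactor(SDNode *N) {
      // ... single-operand and CSE handling unchanged and elided ...

      // No chain simplification at -O0.
      if (OptLevel == CodeGenOpt::None)
        return SDValue();

      // If this TokenFactor's only user is itself a TokenFactor, revisit that
      // user so the two get a chance to merge; otherwise single-use
      // TokenFactor chains can inhibit later load/store combines.
      if (N->hasOneUse() && N->use_begin()->getOpcode() == ISD::TokenFactor)
        AddToWorklist(*(N->use_begin()));

      SmallVector<SDNode *, 8> TFs;  // List of token factors to visit.
      SmallVector<SDValue, 8> Ops;   // Ops for replacing token factor.
      SmallPtrSet<SDNode *, 16> SeenOps;
      // ... existing merging loop continues unchanged ...
    }

Revisiting the user TokenFactor lets the two factors merge before other
combines see the chained memory operations, which is why most of the test
deltas below are only reorderings of paired loads and stores.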
insertions(+), 494 deletions(-) diff --git a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index 9ff62ad7fec..1095b413072 100644 --- a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -1709,6 +1709,12 @@ SDValue DAGCombiner::visitTokenFactor(SDNode *N) { if (OptLevel == CodeGenOpt::None) return SDValue(); + // If this is used only a single token factor, we should make sure we have a + // chance to merge them together. This prevents TF chains from inhibiting + // optimizations. + if (N->hasOneUse() && N->use_begin()->getOpcode() == ISD::TokenFactor) + AddToWorklist(*(N->use_begin())); + SmallVector TFs; // List of token factors to visit. SmallVector Ops; // Ops for replacing token factor. SmallPtrSet SeenOps; diff --git a/test/CodeGen/AArch64/aarch64_win64cc_vararg.ll b/test/CodeGen/AArch64/aarch64_win64cc_vararg.ll index 43b821fa37c..a45ae74ac49 100644 --- a/test/CodeGen/AArch64/aarch64_win64cc_vararg.ll +++ b/test/CodeGen/AArch64/aarch64_win64cc_vararg.ll @@ -5,10 +5,10 @@ entry: ; CHECK: str x30, [sp, #-80]! ; CHECK: add x8, sp, #24 ; CHECK: add x0, sp, #24 -; CHECK: stp x6, x7, [sp, #64] -; CHECK: stp x4, x5, [sp, #48] -; CHECK: stp x2, x3, [sp, #32] -; CHECK: str x1, [sp, #24] +; CHECK: stp x1, x2, [sp, #24] +; CHECK: stp x3, x4, [sp, #40] +; CHECK: stp x5, x6, [sp, #56] +; CHECK: str x7, [sp, #72] ; CHECK: str x8, [sp, #8] ; CHECK: bl other_func ; CHECK: ldr x30, [sp], #80 diff --git a/test/CodeGen/AArch64/addr-of-ret-addr.ll b/test/CodeGen/AArch64/addr-of-ret-addr.ll index b099b18362e..a6bc36441b1 100644 --- a/test/CodeGen/AArch64/addr-of-ret-addr.ll +++ b/test/CodeGen/AArch64/addr-of-ret-addr.ll @@ -44,7 +44,7 @@ entry: ; CHECK: sub sp, sp, #96 ; CHECK: stp x29, x30, [sp, #16] ; CHECK: add x29, sp, #16 -; CHECK: str x1, [x29, #24] +; CHECK: stp x1, x2, [x29, #24] ; CHECK: add x1, x29, #8 ; CHECK: ldp x29, x30, [sp, #16] ; CHECK: add sp, sp, #96 diff --git a/test/CodeGen/AArch64/alloca.ll b/test/CodeGen/AArch64/alloca.ll index ab7a631dc24..25bb3c8ba89 100644 --- a/test/CodeGen/AArch64/alloca.ll +++ b/test/CodeGen/AArch64/alloca.ll @@ -78,22 +78,22 @@ define void @test_variadic_alloca(i64 %n, ...) { ; CHECK: stp x29, x30, [sp, #-16]! ; CHECK: mov x29, sp ; CHECK: sub sp, sp, #192 -; CHECK: stp q6, q7, [x29, #-96] +; CHECK-DAG: stp q6, q7, [x29, #-96] ; [...] -; CHECK: stp q0, q1, [x29, #-192] +; CHECK-DAG: stp q0, q1, [x29, #-192] -; CHECK: stp x6, x7, [x29, #-16] +; CHECK-DAG: stp x5, x6, [x29, #-24] ; [...] -; CHECK: stp x2, x3, [x29, #-48] +; CHECK-DAG: stp x1, x2, [x29, #-56] ; CHECK-NOFP-ARM64: stp x29, x30, [sp, #-16]! ; CHECK-NOFP-ARM64: mov x29, sp ; CHECK-NOFP-ARM64: sub sp, sp, #64 -; CHECK-NOFP-ARM64: stp x6, x7, [x29, #-16] +; CHECK-NOFP-ARM64-DAG: stp x5, x6, [x29, #-24] ; [...] -; CHECK-NOFP-ARM64: stp x4, x5, [x29, #-32] +; CHECK-NOFP-ARM64-DAG: stp x3, x4, [x29, #-40] ; [...] -; CHECK-NOFP-ARM64: stp x2, x3, [x29, #-48] +; CHECK-NOFP-ARM64-DAG: stp x1, x2, [x29, #-56] ; [...] 
; CHECK-NOFP-ARM64: mov x8, sp diff --git a/test/CodeGen/AArch64/arm64-memcpy-inline.ll b/test/CodeGen/AArch64/arm64-memcpy-inline.ll index 629cf37926a..f6d66b692c3 100644 --- a/test/CodeGen/AArch64/arm64-memcpy-inline.ll +++ b/test/CodeGen/AArch64/arm64-memcpy-inline.ll @@ -16,10 +16,10 @@ define i32 @t0() { entry: ; CHECK-LABEL: t0: -; CHECK: ldur [[REG0:w[0-9]+]], [x[[BASEREG:[0-9]+]], #7] -; CHECK: stur [[REG0]], [x[[BASEREG2:[0-9]+]], #7] -; CHECK: ldr [[REG2:x[0-9]+]], -; CHECK: str [[REG2]], +; CHECK-DAG: ldur [[REG0:w[0-9]+]], [x[[BASEREG:[0-9]+]], #7] +; CHECK-DAG: stur [[REG0]], [x[[BASEREG2:[0-9]+]], #7] +; CHECK-DAG: ldr [[REG2:x[0-9]+]], +; CHECK-DAG: str [[REG2]], call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 getelementptr inbounds (%struct.x, %struct.x* @dst, i32 0, i32 0), i8* align 8 getelementptr inbounds (%struct.x, %struct.x* @src, i32 0, i32 0), i32 11, i1 false) ret i32 0 } @@ -85,10 +85,10 @@ entry: define void @t6() nounwind { entry: ; CHECK-LABEL: t6: -; CHECK: ldur [[REG9:x[0-9]+]], [x{{[0-9]+}}, #6] -; CHECK: stur [[REG9]], [x{{[0-9]+}}, #6] -; CHECK: ldr -; CHECK: str +; CHECK-DAG: ldur [[REG9:x[0-9]+]], [x{{[0-9]+}}, #6] +; CHECK-DAG: stur [[REG9]], [x{{[0-9]+}}, #6] +; CHECK-DAG: ldr +; CHECK-DAG: str call void @llvm.memcpy.p0i8.p0i8.i64(i8* getelementptr inbounds ([512 x i8], [512 x i8]* @spool.splbuf, i64 0, i64 0), i8* getelementptr inbounds ([14 x i8], [14 x i8]* @.str6, i64 0, i64 0), i64 14, i1 false) ret void } diff --git a/test/CodeGen/AArch64/arm64-variadic-aapcs.ll b/test/CodeGen/AArch64/arm64-variadic-aapcs.ll index 09125293cac..db87d7fae80 100644 --- a/test/CodeGen/AArch64/arm64-variadic-aapcs.ll +++ b/test/CodeGen/AArch64/arm64-variadic-aapcs.ll @@ -14,13 +14,13 @@ define void @test_simple(i32 %n, ...) { ; CHECK: adrp x[[VA_LIST_HI:[0-9]+]], var ; CHECK: add x[[VA_LIST:[0-9]+]], {{x[0-9]+}}, :lo12:var -; CHECK: stp x1, x2, [sp, #[[GR_BASE:[0-9]+]]] +; CHECK-DAG: stp x6, x7, [sp, # ; ... omit middle ones ... -; CHECK: str x7, [sp, # +; CHECK-DAG: str x1, [sp, #[[GR_BASE:[0-9]+]]] -; CHECK: stp q0, q1, [sp] +; CHECK-DAG: stp q0, q1, [sp] ; ... omit middle ones ... -; CHECK: stp q6, q7, [sp, # +; CHECK-DAG: stp q6, q7, [sp, # ; CHECK: str [[STACK_TOP]], [x[[VA_LIST]]] @@ -50,13 +50,13 @@ define void @test_fewargs(i32 %n, i32 %n1, i32 %n2, float %m, ...) { ; CHECK: adrp x[[VA_LIST_HI:[0-9]+]], var ; CHECK: add x[[VA_LIST:[0-9]+]], {{x[0-9]+}}, :lo12:var -; CHECK: stp x3, x4, [sp, #[[GR_BASE:[0-9]+]]] +; CHECK-DAG: stp x6, x7, [sp, # ; ... omit middle ones ... -; CHECK: str x7, [sp, # +; CHECK-DAG: str x3, [sp, #[[GR_BASE:[0-9]+]]] -; CHECK: stp q1, q2, [sp] +; CHECK-DAG: stp q6, q7, [sp, #80] ; ... omit middle ones ... -; CHECK: str q7, [sp, # +; CHECK-DAG: str q1, [sp] ; CHECK: str [[STACK_TOP]], [x[[VA_LIST]]] @@ -95,10 +95,13 @@ define void @test_nospare([8 x i64], [8 x float], ...) { ; __stack field should point just past them. define void @test_offsetstack([8 x i64], [2 x i64], [3 x float], ...) { ; CHECK-LABEL: test_offsetstack: -; CHECK: stp {{q[0-9]+}}, {{q[0-9]+}}, [sp, #-80]! 
-; CHECK: add [[STACK_TOP:x[0-9]+]], sp, #96 -; CHECK: add x[[VAR:[0-9]+]], {{x[0-9]+}}, :lo12:var -; CHECK: str [[STACK_TOP]], [x[[VAR]]] + +; CHECK-DAG: stp {{q[0-9]+}}, {{q[0-9]+}}, [sp, #48] +; CHECK-DAG: stp {{q[0-9]+}}, {{q[0-9]+}}, [sp, #16] +; CHECK-DAG: str {{q[0-9]+}}, [sp] +; CHECK-DAG: add [[STACK_TOP:x[0-9]+]], sp, #96 +; CHECK-DAG: add x[[VAR:[0-9]+]], {{x[0-9]+}}, :lo12:var +; CHECK-DAG: str [[STACK_TOP]], [x[[VAR]]] %addr = bitcast %va_list* @var to i8* call void @llvm.va_start(i8* %addr) diff --git a/test/CodeGen/AArch64/win64_vararg.ll b/test/CodeGen/AArch64/win64_vararg.ll index 38da60b81a5..d9bb2ff6b12 100644 --- a/test/CodeGen/AArch64/win64_vararg.ll +++ b/test/CodeGen/AArch64/win64_vararg.ll @@ -5,10 +5,10 @@ entry: ; CHECK: str x30, [sp, #-80]! ; CHECK: add x8, sp, #24 ; CHECK: add x0, sp, #24 -; CHECK: stp x6, x7, [sp, #64] -; CHECK: stp x4, x5, [sp, #48] -; CHECK: stp x2, x3, [sp, #32] -; CHECK: str x1, [sp, #24] +; CHECK: stp x1, x2, [sp, #24] +; CHECK: stp x3, x4, [sp, #40] +; CHECK: stp x5, x6, [sp, #56] +; CHECK: str x7, [sp, #72] ; CHECK: str x8, [sp, #8] ; CHECK: bl other_func ; CHECK: ldr x30, [sp], #80 @@ -78,10 +78,10 @@ entry: ; CHECK-LABEL: copy1: ; CHECK: sub sp, sp, #80 ; CHECK: add x8, sp, #24 -; CHECK: stp x6, x7, [sp, #64] -; CHECK: stp x4, x5, [sp, #48] -; CHECK: stp x2, x3, [sp, #32] -; CHECK: str x1, [sp, #24] +; CHECK: stp x1, x2, [sp, #24] +; CHECK: stp x3, x4, [sp, #40] +; CHECK: stp x5, x6, [sp, #56] +; CHECK: str x7, [sp, #72] ; CHECK: stp x8, x8, [sp], #80 ; CHECK: ret define void @copy1(i64 %a0, ...) nounwind { @@ -111,9 +111,9 @@ declare i64* @__local_stdio_printf_options() local_unnamed_addr #4 ; CHECK: mov x19, x2 ; CHECK: mov x20, x1 ; CHECK: mov x21, x0 -; CHECK: stp x6, x7, [x29, #48] -; CHECK: stp x4, x5, [x29, #32] -; CHECK: str x3, [x29, #24] +; CHECK: stp x3, x4, [x29, #24] +; CHECK: stp x5, x6, [x29, #40] +; CHECK: str x7, [x29, #56] ; CHECK: str x8, [sp, #8] ; CHECK: bl __local_stdio_printf_options ; CHECK: ldr x8, [x0] @@ -162,9 +162,9 @@ attributes #6 = { "no-frame-pointer-elim"="true" } ; CHECK: lsr x15, x8, #4 ; CHECK: mov x19, x1 ; CHECK: mov [[REG2:x[0-9]+]], sp -; CHECK: stp x6, x7, [x29, #48] -; CHECK: stp x4, x5, [x29, #32] ; CHECK: stp x2, x3, [x29, #16] +; CHECK: stp x4, x5, [x29, #32] +; CHECK: stp x6, x7, [x29, #48] ; CHECK: bl __chkstk ; CHECK: mov x8, sp ; CHECK: sub [[REG:x[0-9]+]], x8, x15, lsl #4 @@ -219,9 +219,9 @@ declare void @llvm.stackrestore(i8*) ; CHECK-DAG: mov x19, x2 ; CHECK-DAG: mov x20, x1 ; CHECK-DAG: mov x21, x0 -; CHECK-DAG: stp x6, x7, [sp, #80] -; CHECK-DAG: stp x4, x5, [sp, #64] -; CHECK-DAG: str x3, [sp, #56] +; CHECK-DAG: stp x3, x4, [sp, #56] +; CHECK-DAG: stp x5, x6, [sp, #72] +; CHECK-DAG: str x7, [sp, #88] ; CHECK-DAG: str x8, [sp, #8] ; CHECK-DAG: bl __local_stdio_printf_options ; CHECK-DAG: ldr x8, [x0] diff --git a/test/CodeGen/AMDGPU/call-argument-types.ll b/test/CodeGen/AMDGPU/call-argument-types.ll index 78ccc8cf267..4a4bb0fd252 100644 --- a/test/CodeGen/AMDGPU/call-argument-types.ll +++ b/test/CodeGen/AMDGPU/call-argument-types.ll @@ -752,8 +752,8 @@ entry: ; GCN: buffer_store_dword v33, off, s[0:3], s5 offset:12 ; 4-byte Folded Spill ; GCN: buffer_load_dword v32, off, s[0:3], s5 offset:4 ; GCN: buffer_load_dword v33, off, s[0:3], s5 offset:8 -; GCN: buffer_store_dword v33, off, s[0:3], s5 offset:8 ; GCN: buffer_store_dword v32, off, s[0:3], s5 offset:4 +; GCN: buffer_store_dword v33, off, s[0:3], s5 offset:8 ; GCN: s_getpc_b64 ; GCN: buffer_load_dword v33, off, s[0:3], s5 
offset:12 ; 4-byte Folded Reload ; GCN: buffer_load_dword v32, off, s[0:3], s5 offset:16 ; 4-byte Folded Reload diff --git a/test/CodeGen/ARM/2012-10-04-AAPCS-byval-align8.ll b/test/CodeGen/ARM/2012-10-04-AAPCS-byval-align8.ll index 954860219d1..1b0dbe9f47f 100644 --- a/test/CodeGen/ARM/2012-10-04-AAPCS-byval-align8.ll +++ b/test/CodeGen/ARM/2012-10-04-AAPCS-byval-align8.ll @@ -42,8 +42,8 @@ declare void @f(double); ; CHECK-LABEL: test_byval_8_bytes_alignment_fixed_arg: ; CHECK-NOT: str r1 -; CHECK: str r3, [sp, #12] -; CHECK: str r2, [sp, #8] +; CHECK-DAG: str r3, [sp, #12] +; CHECK-DAG: str r2, [sp, #8] ; CHECK-NOT: str r1 define void @test_byval_8_bytes_alignment_fixed_arg(i32 %n1, %struct_t* byval %val) nounwind { entry: diff --git a/test/CodeGen/ARM/2012-10-04-FixedFrame-vs-byval.ll b/test/CodeGen/ARM/2012-10-04-FixedFrame-vs-byval.ll index 34af9026b52..1530d645620 100644 --- a/test/CodeGen/ARM/2012-10-04-FixedFrame-vs-byval.ll +++ b/test/CodeGen/ARM/2012-10-04-FixedFrame-vs-byval.ll @@ -7,8 +7,8 @@ declare i32 @printf(i8*, ...) ; CHECK-LABEL: test_byval_usage_scheduling: -; CHECK: str r3, [sp, #12] -; CHECK: str r2, [sp, #8] +; CHECK-DAG: str r3, [sp, #12] +; CHECK-DAG: str r2, [sp, #8] ; CHECK: vldr d16, [sp, #8] define void @test_byval_usage_scheduling(i32 %n1, i32 %n2, %struct_t* byval %val) nounwind { entry: diff --git a/test/CodeGen/ARM/2014-02-21-byval-reg-split-alignment.ll b/test/CodeGen/ARM/2014-02-21-byval-reg-split-alignment.ll index 5b2fc57359a..f8c4d5d8db8 100644 --- a/test/CodeGen/ARM/2014-02-21-byval-reg-split-alignment.ll +++ b/test/CodeGen/ARM/2014-02-21-byval-reg-split-alignment.ll @@ -35,8 +35,8 @@ define void @foo2(i32 %a, %struct8bytes8align* byval %b) { ; CHECK: sub sp, sp, #8 ; CHECK: push {r11, lr} ; CHECK: add r0, sp, #8 -; CHECK: str r3, [sp, #12] -; CHECK: str r2, [sp, #8] +; CHECK-DAG: str r3, [sp, #12] +; CHECK-DAG: str r2, [sp, #8] ; CHECK: bl usePtr ; CHECK: pop {r11, lr} ; CHECK: add sp, sp, #8 @@ -70,8 +70,8 @@ define void @foo4(%struct4bytes* byval %a, %struct8bytes8align* byval %b) { ; CHECK: push {r11, lr} ; CHECK: str r0, [sp, #8] ; CHECK: add r0, sp, #16 -; CHECK: str r3, [sp, #20] -; CHECK: str r2, [sp, #16] +; CHECK-DAG: str r3, [sp, #20] +; CHECK-DAG: str r2, [sp, #16] ; CHECK: bl usePtr ; CHECK: pop {r11, lr} ; CHECK: add sp, sp, #16 diff --git a/test/CodeGen/ARM/memset-inline.ll b/test/CodeGen/ARM/memset-inline.ll index 01b21e9d387..1b88539211c 100644 --- a/test/CodeGen/ARM/memset-inline.ll +++ b/test/CodeGen/ARM/memset-inline.ll @@ -25,12 +25,12 @@ entry: ; CHECK-7A: vst1.16 {d{{[0-9]+}}, d{{[0-9]+}}}, [r2] ; CHECK-6M-LABEL: t2: ; CHECK-6M: movs [[REG:r[0-9]+]], #0 -; CHECK-6M: str [[REG]], [sp, #20] -; CHECK-6M: str [[REG]], [sp, #16] -; CHECK-6M: str [[REG]], [sp, #12] -; CHECK-6M: str [[REG]], [sp, #8] -; CHECK-6M: str [[REG]], [sp, #4] -; CHECK-6M: str [[REG]], [sp] +; CHECK-6M-DAG: str [[REG]], [sp, #20] +; CHECK-6M-DAG: str [[REG]], [sp, #16] +; CHECK-6M-DAG: str [[REG]], [sp, #12] +; CHECK-6M-DAG: str [[REG]], [sp, #8] +; CHECK-6M-DAG: str [[REG]], [sp, #4] +; CHECK-6M-DAG: str [[REG]], [sp] %buf = alloca [26 x i8], align 1 %0 = getelementptr inbounds [26 x i8], [26 x i8]* %buf, i32 0, i32 0 call void @llvm.memset.p0i8.i32(i8* %0, i8 0, i32 26, i1 false) diff --git a/test/CodeGen/ARM/thumb1_return_sequence.ll b/test/CodeGen/ARM/thumb1_return_sequence.ll index 11e18f1347a..a7f78c74f7f 100644 --- a/test/CodeGen/ARM/thumb1_return_sequence.ll +++ b/test/CodeGen/ARM/thumb1_return_sequence.ll @@ -57,14 +57,14 @@ entry: ; Epilogue ; 
-------- -; CHECK-V4T: ldr [[POP:r[4567]]], [sp, #12] +; CHECK-V4T: ldr [[POP:r[4567]]], [sp, #16] ; CHECK-V4T-NEXT: mov lr, [[POP]] ; CHECK-V4T-NEXT: pop {[[SAVED]]} ; CHECK-V4T-NEXT: add sp, #16 ; CHECK-V4T-NEXT: bx lr ; CHECK-V5T: lsls r4 ; CHECK-V5T-NEXT: mov sp, r4 -; CHECK-V5T: ldr [[POP:r[4567]]], [sp, #12] +; CHECK-V5T: ldr [[POP:r[4567]]], [sp, #16] ; CHECK-V5T-NEXT: mov lr, [[POP]] ; CHECK-V5T-NEXT: pop {[[SAVED]]} ; CHECK-V5T-NEXT: add sp, #16 diff --git a/test/CodeGen/ARM/unaligned_load_store.ll b/test/CodeGen/ARM/unaligned_load_store.ll index 4e16bda6c4d..75098e18fc9 100644 --- a/test/CodeGen/ARM/unaligned_load_store.ll +++ b/test/CodeGen/ARM/unaligned_load_store.ll @@ -13,14 +13,14 @@ define void @t(i8* nocapture %a, i8* nocapture %b) nounwind { entry: ; EXPANDED-LABEL: t: -; EXPANDED: ldrb [[R2:r[0-9]+]] -; EXPANDED: ldrb [[R3:r[0-9]+]] -; EXPANDED: ldrb [[R12:r[0-9]+]] -; EXPANDED: ldrb [[R1:r[0-9]+]] -; EXPANDED: strb [[R1]] -; EXPANDED: strb [[R12]] -; EXPANDED: strb [[R3]] -; EXPANDED: strb [[R2]] +; EXPANDED-DAG: ldrb [[R2:r[0-9]+]] +; EXPANDED-DAG: ldrb [[R3:r[0-9]+]] +; EXPANDED-DAG: ldrb [[R12:r[0-9]+]] +; EXPANDED-DAG: ldrb [[R1:r[0-9]+]] +; EXPANDED-DAG: strb [[R1]] +; EXPANDED-DAG: strb [[R12]] +; EXPANDED-DAG: strb [[R3]] +; EXPANDED-DAG: strb [[R2]] ; UNALIGNED-LABEL: t: ; UNALIGNED: ldr r1 diff --git a/test/CodeGen/AVR/calling-conv/c/basic.ll b/test/CodeGen/AVR/calling-conv/c/basic.ll index a5d4676b9b3..80a61a47cb2 100644 --- a/test/CodeGen/AVR/calling-conv/c/basic.ll +++ b/test/CodeGen/AVR/calling-conv/c/basic.ll @@ -66,24 +66,24 @@ define void @ret_void_args_i64(i64 %a) { ; CHECK-LABEL: ret_void_args_i64_i64 define void @ret_void_args_i64_i64(i64 %a, i64 %b) { - ; CHECK: sts 11, r25 - ; CHECK-NEXT: sts 10, r24 - ; CHECK-NEXT: sts 9, r23 - ; CHECK-NEXT: sts 8, r22 - ; CHECK-NEXT: sts 7, r21 - ; CHECK-NEXT: sts 6, r20 - ; CHECK-NEXT: sts 5, r19 - ; CHECK-NEXT: sts 4, r18 + ; CHECK-DAG: sts 11, r25 + ; CHECK-DAG: sts 10, r24 + ; CHECK-DAG: sts 9, r23 + ; CHECK-DAG: sts 8, r22 + ; CHECK-DAG: sts 7, r21 + ; CHECK-DAG: sts 6, r20 + ; CHECK-DAG: sts 5, r19 + ; CHECK-DAG: sts 4, r18 store volatile i64 %a, i64* inttoptr (i64 4 to i64*) - ; CHECK-NEXT: sts 11, r17 - ; CHECK-NEXT: sts 10, r16 - ; CHECK-NEXT: sts 9, r15 - ; CHECK-NEXT: sts 8, r14 - ; CHECK-NEXT: sts 7, r13 - ; CHECK-NEXT: sts 6, r12 - ; CHECK-NEXT: sts 5, r11 - ; CHECK-NEXT: sts 4, r10 + ; CHECK-DAG: sts 11, r17 + ; CHECK-DAG: sts 10, r16 + ; CHECK-DAG: sts 9, r15 + ; CHECK-DAG: sts 8, r14 + ; CHECK-DAG: sts 7, r13 + ; CHECK-DAG: sts 6, r12 + ; CHECK-DAG: sts 5, r11 + ; CHECK-DAG: sts 4, r10 store volatile i64 %b, i64* inttoptr (i64 4 to i64*) ret void } diff --git a/test/CodeGen/AVR/directmem.ll b/test/CodeGen/AVR/directmem.ll index 6d2ddc536d2..6e1f72eceb8 100644 --- a/test/CodeGen/AVR/directmem.ll +++ b/test/CodeGen/AVR/directmem.ll @@ -207,10 +207,10 @@ define i32 @static32_inc() { ; CHECK: sbci r23, 255 ; CHECK: sbci r24, 255 ; CHECK: sbci r25, 255 -; CHECK: sts long.static+3, r25 -; CHECK: sts long.static+2, r24 -; CHECK: sts long.static+1, r23 -; CHECK: sts long.static, r22 +; CHECK-DAG: sts long.static+3, r25 +; CHECK-DAG: sts long.static+2, r24 +; CHECK-DAG: sts long.static+1, r23 +; CHECK-DAG: sts long.static, r22 %1 = load i32, i32* @long.static %inc = add nsw i32 %1, 1 store i32 %inc, i32* @long.static @@ -309,14 +309,14 @@ define i64 @static64_inc() { ; CHECK: sbci r23, 255 ; CHECK: sbci r24, 255 ; CHECK: sbci r25, 255 -; CHECK: sts longlong.static+7, r25 -; CHECK: sts longlong.static+6, r24 
-; CHECK: sts longlong.static+5, r23 -; CHECK: sts longlong.static+4, r22 -; CHECK: sts longlong.static+3, r21 -; CHECK: sts longlong.static+2, r20 -; CHECK: sts longlong.static+1, r19 -; CHECK: sts longlong.static, r18 +; CHECK-DAG: sts longlong.static+7, r25 +; CHECK-DAG: sts longlong.static+6, r24 +; CHECK-DAG: sts longlong.static+5, r23 +; CHECK-DAG: sts longlong.static+4, r22 +; CHECK-DAG: sts longlong.static+3, r21 +; CHECK-DAG: sts longlong.static+2, r20 +; CHECK-DAG: sts longlong.static+1, r19 +; CHECK-DAG: sts longlong.static, r18 %1 = load i64, i64* @longlong.static %inc = add nsw i64 %1, 1 store i64 %inc, i64* @longlong.static diff --git a/test/CodeGen/BPF/undef.ll b/test/CodeGen/BPF/undef.ll index 3736cb7a61d..099c2f8ac76 100644 --- a/test/CodeGen/BPF/undef.ll +++ b/test/CodeGen/BPF/undef.ll @@ -20,20 +20,20 @@ define i32 @ebpf_filter(%struct.__sk_buff* nocapture readnone %ebpf_packet) #0 s ; CHECK: *(u64 *)(r10 - 8) = r1 ; CHECK: r1 = 0 -; CHECK: *(u16 *)(r10 + 24) = r1 -; CHECK: *(u16 *)(r10 + 22) = r1 -; CHECK: *(u16 *)(r10 + 20) = r1 -; CHECK: *(u16 *)(r10 + 18) = r1 -; CHECK: *(u16 *)(r10 + 16) = r1 -; CHECK: *(u16 *)(r10 + 14) = r1 -; CHECK: *(u16 *)(r10 + 12) = r1 -; CHECK: *(u16 *)(r10 + 10) = r1 -; CHECK: *(u16 *)(r10 + 8) = r1 -; CHECK: *(u16 *)(r10 + 6) = r1 -; CHECK: *(u16 *)(r10 + 4) = r1 -; CHECK: *(u16 *)(r10 + 2) = r1 -; CHECK: *(u16 *)(r10 + 0) = r1 -; CHECK: *(u16 *)(r10 + 26) = r1 +; CHECK-DAG: *(u16 *)(r10 + 24) = r1 +; CHECK-DAG: *(u16 *)(r10 + 22) = r1 +; CHECK-DAG: *(u16 *)(r10 + 20) = r1 +; CHECK-DAG: *(u16 *)(r10 + 18) = r1 +; CHECK-DAG: *(u16 *)(r10 + 16) = r1 +; CHECK-DAG: *(u16 *)(r10 + 14) = r1 +; CHECK-DAG: *(u16 *)(r10 + 12) = r1 +; CHECK-DAG: *(u16 *)(r10 + 10) = r1 +; CHECK-DAG: *(u16 *)(r10 + 8) = r1 +; CHECK-DAG: *(u16 *)(r10 + 6) = r1 +; CHECK-DAG: *(u16 *)(r10 + 4) = r1 +; CHECK-DAG: *(u16 *)(r10 + 2) = r1 +; CHECK-DAG: *(u16 *)(r10 + 0) = r1 +; CHECK-DAG: *(u16 *)(r10 + 26) = r1 ; CHECK: r2 = r10 ; CHECK: r2 += -8 diff --git a/test/CodeGen/MSP430/cc_args.ll b/test/CodeGen/MSP430/cc_args.ll index c8164f1291d..6695a98b2ac 100644 --- a/test/CodeGen/MSP430/cc_args.ll +++ b/test/CodeGen/MSP430/cc_args.ll @@ -166,10 +166,10 @@ define void @f_i64_i64(i64 %a, i64 %b) #0 { ; CHECK: mov r13, &g_i64+2 ; CHECK: mov r12, &g_i64 store volatile i64 %a, i64* @g_i64, align 2 -; CHECK: mov 10(r4), &g_i64+6 -; CHECK: mov 8(r4), &g_i64+4 -; CHECK: mov 6(r4), &g_i64+2 -; CHECK: mov 4(r4), &g_i64 +; CHECK-DAG: mov 10(r4), &g_i64+6 +; CHECK-DAG: mov 8(r4), &g_i64+4 +; CHECK-DAG: mov 6(r4), &g_i64+2 +; CHECK-DAG: mov 4(r4), &g_i64 store volatile i64 %b, i64* @g_i64, align 2 ret void } diff --git a/test/CodeGen/Mips/v2i16tof32.ll b/test/CodeGen/Mips/v2i16tof32.ll index 7e5591ee9cb..334413b03d5 100644 --- a/test/CodeGen/Mips/v2i16tof32.ll +++ b/test/CodeGen/Mips/v2i16tof32.ll @@ -15,23 +15,19 @@ define float @f(<8 x i16>* %a) { ; CHECK-NEXT: .cfi_def_cfa_register 30 ; CHECK-NEXT: addiu $1, $zero, -16 ; CHECK-NEXT: and $sp, $sp, $1 -; CHECK-NEXT: lw $1, 8($4) -; CHECK-NEXT: lw $2, 4($4) -; CHECK-NEXT: lw $3, 12($4) -; CHECK-NEXT: sw $3, 12($sp) -; CHECK-NEXT: sw $1, 8($sp) -; CHECK-NEXT: sw $2, 4($sp) -; CHECK-NEXT: lw $1, 0($4) -; CHECK-NEXT: sw $1, 0($sp) -; CHECK-NEXT: mtc1 $1, $f0 +; CHECK-NEXT: lw $1, 12($4) +; CHECK-NEXT: lw $2, 0($4) +; CHECK-NEXT: lw $3, 8($4) +; CHECK-NEXT: sw $3, 8($sp) +; CHECK-NEXT: sw $1, 12($sp) +; CHECK-NEXT: sw $2, 0($sp) +; CHECK-NEXT: lw $1, 4($4) +; CHECK-NEXT: sw $1, 4($sp) +; CHECK-NEXT: mtc1 $2, $f0 ; CHECK-NEXT: move $sp, $fp ; 
CHECK-NEXT: lw $fp, 28($sp) # 4-byte Folded Reload ; CHECK-NEXT: jr $ra ; CHECK-NEXT: addiu $sp, $sp, 32 -; CHECK-NEXT: .set at -; CHECK-NEXT: .set macro -; CHECK-NEXT: .set reorder -; CHECK-NEXT: .end f entry: %m = alloca <8 x i16> %0 = load <8 x i16>, <8 x i16>* %a diff --git a/test/CodeGen/PowerPC/f128-aggregates.ll b/test/CodeGen/PowerPC/f128-aggregates.ll index 9d161037763..8a8c7f17a3e 100644 --- a/test/CodeGen/PowerPC/f128-aggregates.ll +++ b/test/CodeGen/PowerPC/f128-aggregates.ll @@ -82,27 +82,27 @@ define fp128 @testStruct_03(%struct.With9fp128params* byval nocapture readonly align 16 %a) { ; CHECK-LABEL: testStruct_03: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: std r10, 88(r1) -; CHECK-NEXT: std r9, 80(r1) -; CHECK-NEXT: std r8, 72(r1) -; CHECK-NEXT: std r7, 64(r1) -; CHECK-NEXT: std r6, 56(r1) -; CHECK-NEXT: std r5, 48(r1) -; CHECK-NEXT: std r4, 40(r1) -; CHECK-NEXT: std r3, 32(r1) +; CHECK-DAG: std r10, 88(r1) +; CHECK-DAG: std r9, 80(r1) +; CHECK-DAG: std r8, 72(r1) +; CHECK-DAG: std r7, 64(r1) +; CHECK-DAG: std r6, 56(r1) +; CHECK-DAG: std r5, 48(r1) +; CHECK-DAG: std r4, 40(r1) +; CHECK-DAG: std r3, 32(r1) ; CHECK-NEXT: lxv v2, 128(r1) ; CHECK-NEXT: blr ; CHECK-BE-LABEL: testStruct_03: ; CHECK-BE: # %bb.0: # %entry -; CHECK-BE-NEXT: std r10, 104(r1) -; CHECK-BE-NEXT: std r9, 96(r1) -; CHECK-BE-NEXT: std r8, 88(r1) -; CHECK-BE-NEXT: std r7, 80(r1) -; CHECK-BE-NEXT: std r6, 72(r1) -; CHECK-BE-NEXT: std r5, 64(r1) -; CHECK-BE-NEXT: std r4, 56(r1) -; CHECK-BE-NEXT: std r3, 48(r1) +; CHECK-BE-DAG: std r10, 104(r1) +; CHECK-BE-DAG: std r9, 96(r1) +; CHECK-BE-DAG: std r8, 88(r1) +; CHECK-BE-DAG: std r7, 80(r1) +; CHECK-BE-DAG: std r6, 72(r1) +; CHECK-BE-DAG: std r5, 64(r1) +; CHECK-BE-DAG: std r4, 56(r1) +; CHECK-BE-DAG: std r3, 48(r1) ; CHECK-BE-NEXT: lxv v2, 144(r1) ; CHECK-BE-NEXT: blr entry: @@ -256,27 +256,27 @@ entry: define fp128 @testNestedAggregate(%struct.MixedC* byval nocapture readonly align 16 %a) { ; CHECK-LABEL: testNestedAggregate: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: std r8, 72(r1) -; CHECK-NEXT: std r7, 64(r1) -; CHECK-NEXT: std r10, 88(r1) -; CHECK-NEXT: std r9, 80(r1) -; CHECK-NEXT: std r6, 56(r1) -; CHECK-NEXT: std r5, 48(r1) -; CHECK-NEXT: std r4, 40(r1) -; CHECK-NEXT: std r3, 32(r1) +; CHECK-DAG: std r10, 88(r1) +; CHECK-DAG: std r9, 80(r1) +; CHECK-DAG: std r8, 72(r1) +; CHECK-DAG: std r7, 64(r1) +; CHECK-DAG: std r6, 56(r1) +; CHECK-DAG: std r5, 48(r1) +; CHECK-DAG: std r4, 40(r1) +; CHECK-DAG: std r3, 32(r1) ; CHECK-NEXT: lxv v2, 64(r1) ; CHECK-NEXT: blr ; CHECK-BE-LABEL: testNestedAggregate: ; CHECK-BE: # %bb.0: # %entry -; CHECK-BE-NEXT: std r8, 88(r1) -; CHECK-BE-NEXT: std r7, 80(r1) -; CHECK-BE-NEXT: std r10, 104(r1) -; CHECK-BE-NEXT: std r9, 96(r1) -; CHECK-BE-NEXT: std r6, 72(r1) -; CHECK-BE-NEXT: std r5, 64(r1) -; CHECK-BE-NEXT: std r4, 56(r1) -; CHECK-BE-NEXT: std r3, 48(r1) +; CHECK-BE-DAG: std r8, 88(r1) +; CHECK-BE-DAG: std r7, 80(r1) +; CHECK-BE-DAG: std r10, 104(r1) +; CHECK-BE-DAG: std r9, 96(r1) +; CHECK-BE-DAG: std r6, 72(r1) +; CHECK-BE-DAG: std r5, 64(r1) +; CHECK-BE-DAG: std r4, 56(r1) +; CHECK-BE-DAG: std r3, 48(r1) ; CHECK-BE-NEXT: lxv v2, 80(r1) ; CHECK-BE-NEXT: blr entry: @@ -337,17 +337,17 @@ entry: define fp128 @sum_float128(i32 signext %count, ...) 
{ ; CHECK-LABEL: sum_float128: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: std r10, 88(r1) -; CHECK-NEXT: std r9, 80(r1) -; CHECK-NEXT: std r8, 72(r1) -; CHECK-NEXT: std r7, 64(r1) -; CHECK-NEXT: std r6, 56(r1) -; CHECK-NEXT: cmpwi cr0, r3, 1 -; CHECK-NEXT: std r4, 40(r1) -; CHECK-NEXT: addis [[REG:r[0-9]+]], r2, .LCPI17_0@toc@ha -; CHECK-NEXT: addi [[REG1:r[0-9]+]], [[REG]], .LCPI17_0@toc@l -; CHECK-NEXT: lxvx v2, 0, [[REG1]] -; CHECK-NEXT: std r5, 48(r1) +; CHECK-DAG: std r10, 88(r1) +; CHECK-DAG: std r9, 80(r1) +; CHECK-DAG: std r8, 72(r1) +; CHECK-DAG: std r7, 64(r1) +; CHECK-DAG: std r6, 56(r1) +; CHECK-DAG: std r4, 40(r1) +; CHECK-DAG: cmpwi cr0, r3, 1 +; CHECK-DAG: std r5, 48(r1) +; CHECK-DAG: addis [[REG:r[0-9]+]], r2, .LCPI17_0@toc@ha +; CHECK-DAG: addi [[REG1:r[0-9]+]], [[REG]], .LCPI17_0@toc@l +; CHECK-DAG: lxvx v2, 0, [[REG1]] ; CHECK-NEXT: bltlr cr0 ; CHECK-NEXT: # %bb.1: # %if.end ; CHECK-NEXT: addi r3, r1, 40 diff --git a/test/CodeGen/PowerPC/ppc64-byval-align.ll b/test/CodeGen/PowerPC/ppc64-byval-align.ll index f91da59a3ac..db0cd86995a 100644 --- a/test/CodeGen/PowerPC/ppc64-byval-align.ll +++ b/test/CodeGen/PowerPC/ppc64-byval-align.ll @@ -34,7 +34,7 @@ entry: ret i64 %0 } ; CHECK-LABEL: @callee2 -; CHECK: ld 3, 128(1) +; CHECK: ld {{[0-9]+}}, 128(1) ; CHECK: blr declare i64 @test2(%struct.pad* byval, i32 signext, %struct.test* byval align 16) diff --git a/test/CodeGen/Thumb/frame-access.ll b/test/CodeGen/Thumb/frame-access.ll index 9cbed5ed33f..a9d2999c050 100644 --- a/test/CodeGen/Thumb/frame-access.ll +++ b/test/CodeGen/Thumb/frame-access.ll @@ -173,9 +173,9 @@ entry: ; Setup frame pointer ; CHECK: add r7, sp, #8 ; Register varargs stored via FP -; CHECK: str r3, [r7, #16] -; CHECK-NEXT: str r2, [r7, #12] -; CHECK-NEXT: str r1, [r7, #8] +; CHECK-DAG: str r3, [r7, #16] +; CHECK-DAG: str r2, [r7, #12] +; CHECK-DAG: str r1, [r7, #8] ; Moving SP, access via SP ; int test_args_moving_sp(int a, int b, int c, int d, int e) { diff --git a/test/CodeGen/Thumb/mvn.ll b/test/CodeGen/Thumb/mvn.ll index 1e16effc259..a108bfd124f 100644 --- a/test/CodeGen/Thumb/mvn.ll +++ b/test/CodeGen/Thumb/mvn.ll @@ -194,26 +194,26 @@ for.cond.cleanup: define void @test128(i128* %a) { ; CHECK-LABEL: test128: -; CHECK: ldr r1, [r0, #4] +; CHECK: ldr r1, [r0, #8] ; CHECK-NEXT: ldr r2, .LCPI8_0 ; CHECK-NEXT: eors r2, r1 -; CHECK-NEXT: str r2, [r0, #4] +; CHECK-NEXT: str r2, [r0, #8] ; CHECK-NEXT: ldr r1, [r0] ; CHECK-NEXT: ldr r2, .LCPI8_1 ; CHECK-NEXT: eors r2, r1 ; CHECK-NEXT: str r2, [r0] -; CHECK-NEXT: ldr r1, [r0, #8] +; CHECK-NEXT: ldr r1, [r0, #4] ; CHECK-NEXT: ldr r2, .LCPI8_2 ; CHECK-NEXT: eors r2, r1 -; CHECK-NEXT: str r2, [r0, #8] +; CHECK-NEXT: str r2, [r0, #4] ; CHECK-NEXT: bx lr ; CHECK-NEXT: .p2align 2 ; CHECK-NEXT: .LCPI8_0: -; CHECK-NEXT: .long 4075008415 +; CHECK-NEXT: .long 6692605 ; CHECK-NEXT: .LCPI8_1: ; CHECK-NEXT: .long 2080661269 ; CHECK-NEXT: .LCPI8_2: -; CHECK-NEXT: .long 6692605 +; CHECK-NEXT: .long 4075008415 %x = load i128, i128* %a %xn = xor i128 %x, 123456789123456789123456789 store i128 %xn, i128* %a diff --git a/test/CodeGen/X86/aligned-variadic.ll b/test/CodeGen/X86/aligned-variadic.ll index 1ea57296a70..d8274443e3e 100644 --- a/test/CodeGen/X86/aligned-variadic.ll +++ b/test/CodeGen/X86/aligned-variadic.ll @@ -17,7 +17,7 @@ entry: store i8* %overflow_arg_area.next, i8** %overflow_arg_area_p, align 8 ; X32: leal 68(%esp), [[REG:%.*]] ; X32: movl [[REG]], 16(%esp) -; X64: leaq 232(%rsp), [[REG:%.*]] +; X64: leaq 256(%rsp), [[REG:%.*]] ; X64: movq [[REG]], 184(%rsp) ; 
X64: leaq 176(%rsp), %rdi call void @qux(%struct.__va_list_tag* %arraydecay) diff --git a/test/CodeGen/X86/atomic-idempotent.ll b/test/CodeGen/X86/atomic-idempotent.ll index c67a9269c13..e7e2430597e 100644 --- a/test/CodeGen/X86/atomic-idempotent.ll +++ b/test/CodeGen/X86/atomic-idempotent.ll @@ -132,10 +132,10 @@ define i128 @or128(i128* %p) { ; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X32-NEXT: movl {{[0-9]+}}(%esp), %edx ; X32-NEXT: movl {{[0-9]+}}(%esp), %edi -; X32-NEXT: movl %edi, 12(%esi) -; X32-NEXT: movl %edx, 8(%esi) -; X32-NEXT: movl %ecx, 4(%esi) +; X32-NEXT: movl %edi, 8(%esi) +; X32-NEXT: movl %edx, 12(%esi) ; X32-NEXT: movl %eax, (%esi) +; X32-NEXT: movl %ecx, 4(%esi) ; X32-NEXT: movl %esi, %eax ; X32-NEXT: leal -8(%ebp), %esp ; X32-NEXT: popl %esi diff --git a/test/CodeGen/X86/avx-load-store.ll b/test/CodeGen/X86/avx-load-store.ll index eabe82de444..1b28c729de4 100644 --- a/test/CodeGen/X86/avx-load-store.ll +++ b/test/CodeGen/X86/avx-load-store.ll @@ -245,8 +245,8 @@ define void @add8i32(<8 x i32>* %ret, <8 x i32>* %bp) nounwind { ; CHECK: # %bb.0: ; CHECK-NEXT: vmovups (%rsi), %xmm0 ; CHECK-NEXT: vmovups 16(%rsi), %xmm1 -; CHECK-NEXT: vmovups %xmm0, (%rdi) ; CHECK-NEXT: vmovups %xmm1, 16(%rdi) +; CHECK-NEXT: vmovups %xmm0, (%rdi) ; CHECK-NEXT: retq ; ; CHECK_O0-LABEL: add8i32: @@ -290,8 +290,8 @@ define void @add4i64a16(<4 x i64>* %ret, <4 x i64>* %bp) nounwind { ; CHECK: # %bb.0: ; CHECK-NEXT: vmovaps (%rsi), %xmm0 ; CHECK-NEXT: vmovaps 16(%rsi), %xmm1 -; CHECK-NEXT: vmovaps %xmm0, (%rdi) ; CHECK-NEXT: vmovaps %xmm1, 16(%rdi) +; CHECK-NEXT: vmovaps %xmm0, (%rdi) ; CHECK-NEXT: retq ; ; CHECK_O0-LABEL: add4i64a16: diff --git a/test/CodeGen/X86/btc_bts_btr.ll b/test/CodeGen/X86/btc_bts_btr.ll index 6f43cf752c0..5e64be94b97 100644 --- a/test/CodeGen/X86/btc_bts_btr.ll +++ b/test/CodeGen/X86/btc_bts_btr.ll @@ -859,8 +859,8 @@ define void @btr_64_dont_fold(i64* %x, i64 %n) { ; X86-NEXT: .LBB33_2: ; X86-NEXT: notl %esi ; X86-NEXT: notl %edx -; X86-NEXT: andl %esi, 4(%eax) ; X86-NEXT: andl %edx, (%eax) +; X86-NEXT: andl %esi, 4(%eax) ; X86-NEXT: popl %esi ; X86-NEXT: .cfi_def_cfa_offset 4 ; X86-NEXT: retl @@ -899,8 +899,8 @@ define void @bts_64_dont_fold(i64* %x, i64 %n) { ; X86-NEXT: movl %edx, %esi ; X86-NEXT: xorl %edx, %edx ; X86-NEXT: .LBB34_2: -; X86-NEXT: orl %esi, 4(%eax) ; X86-NEXT: orl %edx, (%eax) +; X86-NEXT: orl %esi, 4(%eax) ; X86-NEXT: popl %esi ; X86-NEXT: .cfi_def_cfa_offset 4 ; X86-NEXT: retl @@ -938,8 +938,8 @@ define void @btc_64_dont_fold(i64* %x, i64 %n) { ; X86-NEXT: movl %edx, %esi ; X86-NEXT: xorl %edx, %edx ; X86-NEXT: .LBB35_2: -; X86-NEXT: xorl %esi, 4(%eax) ; X86-NEXT: xorl %edx, (%eax) +; X86-NEXT: xorl %esi, 4(%eax) ; X86-NEXT: popl %esi ; X86-NEXT: .cfi_def_cfa_offset 4 ; X86-NEXT: retl diff --git a/test/CodeGen/X86/combine-sbb.ll b/test/CodeGen/X86/combine-sbb.ll index fcdba851ad0..43011b033be 100644 --- a/test/CodeGen/X86/combine-sbb.ll +++ b/test/CodeGen/X86/combine-sbb.ll @@ -77,8 +77,8 @@ define void @PR25858_i64(%WideUInt64* sret, %WideUInt64*, %WideUInt64*) nounwind ; X86-NEXT: movzbl %bl, %ecx ; X86-NEXT: subl %ecx, %edx ; X86-NEXT: sbbl $0, %ebp -; X86-NEXT: movl %edi, 4(%eax) ; X86-NEXT: movl %esi, (%eax) +; X86-NEXT: movl %edi, 4(%eax) ; X86-NEXT: movl %edx, 8(%eax) ; X86-NEXT: movl %ebp, 12(%eax) ; X86-NEXT: popl %esi diff --git a/test/CodeGen/X86/constant-combines.ll b/test/CodeGen/X86/constant-combines.ll index f3d2df63a76..20fbedb1574 100644 --- a/test/CodeGen/X86/constant-combines.ll +++ b/test/CodeGen/X86/constant-combines.ll @@ 
-19,7 +19,7 @@ define void @PR22524({ float, float }* %arg) { ; CHECK-NEXT: movd %eax, %xmm0 ; CHECK-NEXT: xorps %xmm1, %xmm1 ; CHECK-NEXT: mulss %xmm0, %xmm1 -; CHECK-NEXT: movq $0, (%rdi) +; CHECK-NEXT: movl $0, (%rdi) ; CHECK-NEXT: movss %xmm1, 4(%rdi) ; CHECK-NEXT: retq entry: diff --git a/test/CodeGen/X86/min-legal-vector-width.ll b/test/CodeGen/X86/min-legal-vector-width.ll index 1a28852f5aa..e5ff6014edc 100644 --- a/test/CodeGen/X86/min-legal-vector-width.ll +++ b/test/CodeGen/X86/min-legal-vector-width.ll @@ -8,10 +8,10 @@ define void @add256(<16 x i32>* %a, <16 x i32>* %b, <16 x i32>* %c) "min-legal-v ; CHECK: # %bb.0: ; CHECK-NEXT: vmovdqa (%rdi), %ymm0 ; CHECK-NEXT: vmovdqa 32(%rdi), %ymm1 -; CHECK-NEXT: vpaddd (%rsi), %ymm0, %ymm0 ; CHECK-NEXT: vpaddd 32(%rsi), %ymm1, %ymm1 -; CHECK-NEXT: vmovdqa %ymm1, 32(%rdx) +; CHECK-NEXT: vpaddd (%rsi), %ymm0, %ymm0 ; CHECK-NEXT: vmovdqa %ymm0, (%rdx) +; CHECK-NEXT: vmovdqa %ymm1, 32(%rdx) ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %d = load <16 x i32>, <16 x i32>* %a @@ -85,10 +85,10 @@ define void @pmaddwd_32_256(<32 x i16>* %APtr, <32 x i16>* %BPtr, <16 x i32>* %C ; CHECK: # %bb.0: ; CHECK-NEXT: vmovdqa (%rdi), %ymm0 ; CHECK-NEXT: vmovdqa 32(%rdi), %ymm1 -; CHECK-NEXT: vpmaddwd (%rsi), %ymm0, %ymm0 ; CHECK-NEXT: vpmaddwd 32(%rsi), %ymm1, %ymm1 -; CHECK-NEXT: vmovdqa %ymm1, 32(%rdx) +; CHECK-NEXT: vpmaddwd (%rsi), %ymm0, %ymm0 ; CHECK-NEXT: vmovdqa %ymm0, (%rdx) +; CHECK-NEXT: vmovdqa %ymm1, 32(%rdx) ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %A = load <32 x i16>, <32 x i16>* %APtr @@ -128,10 +128,10 @@ define void @psubus_64i8_max_256(<64 x i8>* %xptr, <64 x i8>* %yptr, <64 x i8>* ; CHECK: # %bb.0: ; CHECK-NEXT: vmovdqa (%rdi), %ymm0 ; CHECK-NEXT: vmovdqa 32(%rdi), %ymm1 -; CHECK-NEXT: vpsubusb (%rsi), %ymm0, %ymm0 ; CHECK-NEXT: vpsubusb 32(%rsi), %ymm1, %ymm1 -; CHECK-NEXT: vmovdqa %ymm1, 32(%rdx) +; CHECK-NEXT: vpsubusb (%rsi), %ymm0, %ymm0 ; CHECK-NEXT: vmovdqa %ymm0, (%rdx) +; CHECK-NEXT: vmovdqa %ymm1, 32(%rdx) ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %x = load <64 x i8>, <64 x i8>* %xptr @@ -652,27 +652,27 @@ define void @mul256(<64 x i8>* %a, <64 x i8>* %b, <64 x i8>* %c) "min-legal-vect ; CHECK-NEXT: vmovdqa 32(%rdi), %ymm1 ; CHECK-NEXT: vmovdqa (%rsi), %ymm2 ; CHECK-NEXT: vmovdqa 32(%rsi), %ymm3 -; CHECK-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm2[8],ymm0[8],ymm2[9],ymm0[9],ymm2[10],ymm0[10],ymm2[11],ymm0[11],ymm2[12],ymm0[12],ymm2[13],ymm0[13],ymm2[14],ymm0[14],ymm2[15],ymm0[15],ymm2[24],ymm0[24],ymm2[25],ymm0[25],ymm2[26],ymm0[26],ymm2[27],ymm0[27],ymm2[28],ymm0[28],ymm2[29],ymm0[29],ymm2[30],ymm0[30],ymm2[31],ymm0[31] -; CHECK-NEXT: vpunpckhbw {{.*#+}} ymm5 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] +; CHECK-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm3[8],ymm0[8],ymm3[9],ymm0[9],ymm3[10],ymm0[10],ymm3[11],ymm0[11],ymm3[12],ymm0[12],ymm3[13],ymm0[13],ymm3[14],ymm0[14],ymm3[15],ymm0[15],ymm3[24],ymm0[24],ymm3[25],ymm0[25],ymm3[26],ymm0[26],ymm3[27],ymm0[27],ymm3[28],ymm0[28],ymm3[29],ymm0[29],ymm3[30],ymm0[30],ymm3[31],ymm0[31] +; CHECK-NEXT: vpunpckhbw {{.*#+}} ymm5 = ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15],ymm1[24],ymm0[24],ymm1[25],ymm0[25],ymm1[26],ymm0[26],ymm1[27],ymm0[27],ymm1[28],ymm0[28],ymm1[29],ymm0[29],ymm1[30],ymm0[30],ymm1[31],ymm0[31] ; CHECK-NEXT: vpmullw %ymm4, %ymm5, %ymm4 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm5 = 
[255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] ; CHECK-NEXT: vpand %ymm5, %ymm4, %ymm4 -; CHECK-NEXT: vpunpcklbw {{.*#+}} ymm2 = ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[4],ymm0[4],ymm2[5],ymm0[5],ymm2[6],ymm0[6],ymm2[7],ymm0[7],ymm2[16],ymm0[16],ymm2[17],ymm0[17],ymm2[18],ymm0[18],ymm2[19],ymm0[19],ymm2[20],ymm0[20],ymm2[21],ymm0[21],ymm2[22],ymm0[22],ymm2[23],ymm0[23] -; CHECK-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] -; CHECK-NEXT: vpmullw %ymm2, %ymm0, %ymm0 -; CHECK-NEXT: vpand %ymm5, %ymm0, %ymm0 -; CHECK-NEXT: vpackuswb %ymm4, %ymm0, %ymm0 -; CHECK-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm3[8],ymm0[8],ymm3[9],ymm0[9],ymm3[10],ymm0[10],ymm3[11],ymm0[11],ymm3[12],ymm0[12],ymm3[13],ymm0[13],ymm3[14],ymm0[14],ymm3[15],ymm0[15],ymm3[24],ymm0[24],ymm3[25],ymm0[25],ymm3[26],ymm0[26],ymm3[27],ymm0[27],ymm3[28],ymm0[28],ymm3[29],ymm0[29],ymm3[30],ymm0[30],ymm3[31],ymm0[31] -; CHECK-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15],ymm1[24],ymm0[24],ymm1[25],ymm0[25],ymm1[26],ymm0[26],ymm1[27],ymm0[27],ymm1[28],ymm0[28],ymm1[29],ymm0[29],ymm1[30],ymm0[30],ymm1[31],ymm0[31] -; CHECK-NEXT: vpmullw %ymm2, %ymm4, %ymm2 -; CHECK-NEXT: vpand %ymm5, %ymm2, %ymm2 ; CHECK-NEXT: vpunpcklbw {{.*#+}} ymm3 = ymm3[0],ymm0[0],ymm3[1],ymm0[1],ymm3[2],ymm0[2],ymm3[3],ymm0[3],ymm3[4],ymm0[4],ymm3[5],ymm0[5],ymm3[6],ymm0[6],ymm3[7],ymm0[7],ymm3[16],ymm0[16],ymm3[17],ymm0[17],ymm3[18],ymm0[18],ymm3[19],ymm0[19],ymm3[20],ymm0[20],ymm3[21],ymm0[21],ymm3[22],ymm0[22],ymm3[23],ymm0[23] ; CHECK-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23] ; CHECK-NEXT: vpmullw %ymm3, %ymm1, %ymm1 ; CHECK-NEXT: vpand %ymm5, %ymm1, %ymm1 -; CHECK-NEXT: vpackuswb %ymm2, %ymm1, %ymm1 -; CHECK-NEXT: vmovdqa %ymm1, 32(%rdx) +; CHECK-NEXT: vpackuswb %ymm4, %ymm1, %ymm1 +; CHECK-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm2[8],ymm0[8],ymm2[9],ymm0[9],ymm2[10],ymm0[10],ymm2[11],ymm0[11],ymm2[12],ymm0[12],ymm2[13],ymm0[13],ymm2[14],ymm0[14],ymm2[15],ymm0[15],ymm2[24],ymm0[24],ymm2[25],ymm0[25],ymm2[26],ymm0[26],ymm2[27],ymm0[27],ymm2[28],ymm0[28],ymm2[29],ymm0[29],ymm2[30],ymm0[30],ymm2[31],ymm0[31] +; CHECK-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] +; CHECK-NEXT: vpmullw %ymm3, %ymm4, %ymm3 +; CHECK-NEXT: vpand %ymm5, %ymm3, %ymm3 +; CHECK-NEXT: vpunpcklbw {{.*#+}} ymm2 = ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[4],ymm0[4],ymm2[5],ymm0[5],ymm2[6],ymm0[6],ymm2[7],ymm0[7],ymm2[16],ymm0[16],ymm2[17],ymm0[17],ymm2[18],ymm0[18],ymm2[19],ymm0[19],ymm2[20],ymm0[20],ymm2[21],ymm0[21],ymm2[22],ymm0[22],ymm2[23],ymm0[23] +; CHECK-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] +; CHECK-NEXT: vpmullw %ymm2, %ymm0, %ymm0 +; CHECK-NEXT: vpand %ymm5, %ymm0, %ymm0 +; CHECK-NEXT: vpackuswb %ymm3, %ymm0, %ymm0 ; CHECK-NEXT: vmovdqa %ymm0, (%rdx) +; CHECK-NEXT: vmovdqa %ymm1, 32(%rdx) ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %d = load <64 x i8>, <64 x i8>* %a diff --git 
a/test/CodeGen/X86/musttail-varargs.ll b/test/CodeGen/X86/musttail-varargs.ll index 6a338c5c7da..b62343fc82a 100644 --- a/test/CodeGen/X86/musttail-varargs.ll +++ b/test/CodeGen/X86/musttail-varargs.ll @@ -56,11 +56,11 @@ define void @f_thunk(i8* %this, ...) { ; LINUX-NEXT: movaps %xmm6, {{[0-9]+}}(%rsp) ; LINUX-NEXT: movaps %xmm7, {{[0-9]+}}(%rsp) ; LINUX-NEXT: .LBB0_2: -; LINUX-NEXT: movq %r15, {{[0-9]+}}(%rsp) -; LINUX-NEXT: movq %r12, {{[0-9]+}}(%rsp) -; LINUX-NEXT: movq %r13, {{[0-9]+}}(%rsp) -; LINUX-NEXT: movq %rbp, {{[0-9]+}}(%rsp) ; LINUX-NEXT: movq %rbx, {{[0-9]+}}(%rsp) +; LINUX-NEXT: movq %rbp, {{[0-9]+}}(%rsp) +; LINUX-NEXT: movq %r13, {{[0-9]+}}(%rsp) +; LINUX-NEXT: movq %r12, {{[0-9]+}}(%rsp) +; LINUX-NEXT: movq %r15, {{[0-9]+}}(%rsp) ; LINUX-NEXT: leaq {{[0-9]+}}(%rsp), %rax ; LINUX-NEXT: movq %rax, {{[0-9]+}}(%rsp) ; LINUX-NEXT: leaq {{[0-9]+}}(%rsp), %rax @@ -150,11 +150,11 @@ define void @f_thunk(i8* %this, ...) { ; LINUX-X32-NEXT: movaps %xmm6, {{[0-9]+}}(%esp) ; LINUX-X32-NEXT: movaps %xmm7, {{[0-9]+}}(%esp) ; LINUX-X32-NEXT: .LBB0_2: -; LINUX-X32-NEXT: movq %r15, {{[0-9]+}}(%esp) -; LINUX-X32-NEXT: movq %r12, {{[0-9]+}}(%esp) -; LINUX-X32-NEXT: movq %r13, {{[0-9]+}}(%esp) -; LINUX-X32-NEXT: movq %rbp, {{[0-9]+}}(%esp) ; LINUX-X32-NEXT: movq %rbx, {{[0-9]+}}(%esp) +; LINUX-X32-NEXT: movq %rbp, {{[0-9]+}}(%esp) +; LINUX-X32-NEXT: movq %r13, {{[0-9]+}}(%esp) +; LINUX-X32-NEXT: movq %r12, {{[0-9]+}}(%esp) +; LINUX-X32-NEXT: movq %r15, {{[0-9]+}}(%esp) ; LINUX-X32-NEXT: leal {{[0-9]+}}(%rsp), %eax ; LINUX-X32-NEXT: movl %eax, {{[0-9]+}}(%esp) ; LINUX-X32-NEXT: leal {{[0-9]+}}(%rsp), %eax @@ -223,9 +223,9 @@ define void @f_thunk(i8* %this, ...) { ; WINDOWS-NEXT: movq %r8, %rdi ; WINDOWS-NEXT: movq %rdx, %rbx ; WINDOWS-NEXT: movq %rcx, %rbp -; WINDOWS-NEXT: movq %r9, {{[0-9]+}}(%rsp) -; WINDOWS-NEXT: movq %r8, {{[0-9]+}}(%rsp) ; WINDOWS-NEXT: movq %rdx, {{[0-9]+}}(%rsp) +; WINDOWS-NEXT: movq %r8, {{[0-9]+}}(%rsp) +; WINDOWS-NEXT: movq %r9, {{[0-9]+}}(%rsp) ; WINDOWS-NEXT: leaq {{[0-9]+}}(%rsp), %rax ; WINDOWS-NEXT: movq %rax, {{[0-9]+}}(%rsp) ; WINDOWS-NEXT: callq get_f diff --git a/test/CodeGen/X86/musttail.ll b/test/CodeGen/X86/musttail.ll index 927322b5723..6192d3109ff 100644 --- a/test/CodeGen/X86/musttail.ll +++ b/test/CodeGen/X86/musttail.ll @@ -46,8 +46,8 @@ define i32 @t4({}* %fn, i32 %n, i32 %r) { ; CHECK-LABEL: t4: ; CHECK: incl %[[r:.*]] ; CHECK: decl %[[n:.*]] -; CHECK: movl %[[r]], {{[0-9]+}}(%esp) -; CHECK: movl %[[n]], {{[0-9]+}}(%esp) +; CHECK-DAG: movl %[[r]], {{[0-9]+}}(%esp) +; CHECK-DAG: movl %[[n]], {{[0-9]+}}(%esp) ; CHECK: jmpl *%{{.*}} entry: @@ -71,8 +71,8 @@ define i32 @t5({}* %fn, i32 %n, i32 %r) alignstack(32) { ; CHECK: incl %[[r:.*]] ; CHECK: decl %[[n:.*]] ; Store them through ebp, since that's the only stable arg pointer. -; CHECK: movl %[[r]], {{[0-9]+}}(%ebp) -; CHECK: movl %[[n]], {{[0-9]+}}(%ebp) +; CHECK-DAG: movl %[[r]], {{[0-9]+}}(%ebp) +; CHECK-DAG: movl %[[n]], {{[0-9]+}}(%ebp) ; Epilogue. 
; CHECK: leal {{[-0-9]+}}(%ebp), %esp ; CHECK: popl %esi diff --git a/test/CodeGen/X86/nosse-vector.ll b/test/CodeGen/X86/nosse-vector.ll index ec97b1ed9c0..ef2b40a8741 100644 --- a/test/CodeGen/X86/nosse-vector.ll +++ b/test/CodeGen/X86/nosse-vector.ll @@ -146,7 +146,7 @@ define void @sitofp_4i64_4f32_mem(<4 x i64>* %p0, <4 x float>* %p1) nounwind { ; X32-NEXT: subl $48, %esp ; X32-NEXT: movl 8(%ebp), %eax ; X32-NEXT: movl 24(%eax), %ecx -; X32-NEXT: movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill +; X32-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl 28(%eax), %ecx ; X32-NEXT: movl %ecx, (%esp) # 4-byte Spill ; X32-NEXT: movl 16(%eax), %esi @@ -163,7 +163,7 @@ define void @sitofp_4i64_4f32_mem(<4 x i64>* %p0, <4 x float>* %p1) nounwind { ; X32-NEXT: movl %esi, {{[0-9]+}}(%esp) ; X32-NEXT: movl (%esp), %eax # 4-byte Reload ; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X32-NEXT: movl {{[0-9]+}}(%esp), %eax # 4-byte Reload +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) ; X32-NEXT: movl 12(%ebp), %eax ; X32-NEXT: fildll {{[0-9]+}}(%esp) @@ -277,10 +277,10 @@ define void @add_2i64_mem(<2 x i64>* %p0, <2 x i64>* %p1, <2 x i64>* %p2) nounwi ; X32-NEXT: adcl 4(%ecx), %edx ; X32-NEXT: addl 8(%ecx), %edi ; X32-NEXT: adcl 12(%ecx), %esi -; X32-NEXT: movl %esi, 12(%eax) ; X32-NEXT: movl %edi, 8(%eax) -; X32-NEXT: movl %edx, 4(%eax) +; X32-NEXT: movl %esi, 12(%eax) ; X32-NEXT: movl %ebx, (%eax) +; X32-NEXT: movl %edx, 4(%eax) ; X32-NEXT: popl %esi ; X32-NEXT: popl %edi ; X32-NEXT: popl %ebx diff --git a/test/CodeGen/X86/oddshuffles.ll b/test/CodeGen/X86/oddshuffles.ll index 5f15f886aba..a4b8f58fcc7 100644 --- a/test/CodeGen/X86/oddshuffles.ll +++ b/test/CodeGen/X86/oddshuffles.ll @@ -1497,111 +1497,111 @@ define void @interleave_24i32_out(<24 x i32>* %p, <8 x i32>* %q1, <8 x i32>* %q2 define void @interleave_24i32_in(<24 x i32>* %p, <8 x i32>* %q1, <8 x i32>* %q2, <8 x i32>* %q3) nounwind { ; SSE2-LABEL: interleave_24i32_in: ; SSE2: # %bb.0: -; SSE2-NEXT: movups (%rsi), %xmm5 -; SSE2-NEXT: movups 16(%rsi), %xmm8 -; SSE2-NEXT: movups (%rdx), %xmm6 -; SSE2-NEXT: movups 16(%rdx), %xmm3 -; SSE2-NEXT: movups (%rcx), %xmm0 -; SSE2-NEXT: movups 16(%rcx), %xmm4 -; SSE2-NEXT: movaps %xmm0, %xmm7 -; SSE2-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,0],xmm5[1,0] -; SSE2-NEXT: movaps %xmm5, %xmm1 -; SSE2-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1] -; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm7[0,2] +; SSE2-NEXT: movups (%rsi), %xmm1 +; SSE2-NEXT: movups 16(%rsi), %xmm0 +; SSE2-NEXT: movups (%rdx), %xmm8 +; SSE2-NEXT: movups 16(%rdx), %xmm5 +; SSE2-NEXT: movups (%rcx), %xmm3 +; SSE2-NEXT: movups 16(%rcx), %xmm6 +; SSE2-NEXT: movaps %xmm3, %xmm7 +; SSE2-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,0],xmm1[1,0] +; SSE2-NEXT: movaps %xmm1, %xmm9 +; SSE2-NEXT: unpcklps {{.*#+}} xmm9 = xmm9[0],xmm8[0],xmm9[1],xmm8[1] +; SSE2-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,1],xmm7[0,2] ; SSE2-NEXT: movaps %xmm5, %xmm7 -; SSE2-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,0],xmm6[2,1] -; SSE2-NEXT: movaps %xmm0, %xmm2 -; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,0],xmm6[1,0] +; SSE2-NEXT: shufps {{.*#+}} xmm7 = xmm7[3,2],xmm6[3,2] +; SSE2-NEXT: movaps %xmm6, %xmm4 +; SSE2-NEXT: unpckhps {{.*#+}} xmm4 = xmm4[2],xmm0[2],xmm4[3],xmm0[3] +; SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,3],xmm7[0,2] +; SSE2-NEXT: movaps %xmm0, %xmm7 +; SSE2-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,0],xmm5[2,1] +; SSE2-NEXT: movaps %xmm6, %xmm2 +; SSE2-NEXT: shufps {{.*#+}} xmm2 = 
xmm2[1,0],xmm5[1,0] ; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm7[0,2] -; SSE2-NEXT: shufps {{.*#+}} xmm6 = xmm6[3,2],xmm0[3,2] -; SSE2-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm5[2],xmm0[3],xmm5[3] -; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,3],xmm6[0,2] -; SSE2-NEXT: movaps %xmm4, %xmm5 -; SSE2-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,0],xmm8[1,0] -; SSE2-NEXT: movaps %xmm8, %xmm6 -; SSE2-NEXT: unpcklps {{.*#+}} xmm6 = xmm6[0],xmm3[0],xmm6[1],xmm3[1] -; SSE2-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,1],xmm5[0,2] +; SSE2-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,0],xmm0[1,0] +; SSE2-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1] +; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm6[0,2] ; SSE2-NEXT: movaps %xmm8, %xmm5 -; SSE2-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,0],xmm3[2,1] -; SSE2-NEXT: movaps %xmm4, %xmm7 -; SSE2-NEXT: shufps {{.*#+}} xmm7 = xmm7[1,0],xmm3[1,0] -; SSE2-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,0],xmm5[0,2] -; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,2],xmm4[3,2] -; SSE2-NEXT: unpckhps {{.*#+}} xmm4 = xmm4[2],xmm8[2],xmm4[3],xmm8[3] -; SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,3],xmm3[0,2] +; SSE2-NEXT: shufps {{.*#+}} xmm5 = xmm5[3,2],xmm3[3,2] +; SSE2-NEXT: movaps %xmm3, %xmm6 +; SSE2-NEXT: unpckhps {{.*#+}} xmm6 = xmm6[2],xmm1[2],xmm6[3],xmm1[3] +; SSE2-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,3],xmm5[0,2] +; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm8[2,1] +; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,0],xmm8[1,0] +; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,0],xmm1[0,2] +; SSE2-NEXT: movups %xmm3, 16(%rdi) +; SSE2-NEXT: movups %xmm6, 32(%rdi) +; SSE2-NEXT: movups %xmm0, 48(%rdi) +; SSE2-NEXT: movups %xmm2, 64(%rdi) ; SSE2-NEXT: movups %xmm4, 80(%rdi) -; SSE2-NEXT: movups %xmm7, 64(%rdi) -; SSE2-NEXT: movups %xmm6, 48(%rdi) -; SSE2-NEXT: movups %xmm0, 32(%rdi) -; SSE2-NEXT: movups %xmm2, 16(%rdi) -; SSE2-NEXT: movups %xmm1, (%rdi) +; SSE2-NEXT: movups %xmm9, (%rdi) ; SSE2-NEXT: retq ; ; SSE42-LABEL: interleave_24i32_in: ; SSE42: # %bb.0: -; SSE42-NEXT: movdqu (%rsi), %xmm5 -; SSE42-NEXT: movdqu 16(%rsi), %xmm2 -; SSE42-NEXT: movdqu (%rdx), %xmm6 -; SSE42-NEXT: movdqu 16(%rdx), %xmm1 -; SSE42-NEXT: movdqu (%rcx), %xmm7 -; SSE42-NEXT: movdqu 16(%rcx), %xmm4 -; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,1,1] -; SSE42-NEXT: pshufd {{.*#+}} xmm3 = xmm5[0,1,0,1] -; SSE42-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1],xmm0[2,3],xmm3[4,5,6,7] -; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm7[0,1,0,1] -; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm3[0,1,2,3],xmm0[4,5],xmm3[6,7] -; SSE42-NEXT: pshufd {{.*#+}} xmm3 = xmm6[1,1,2,2] -; SSE42-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1],xmm7[2,3],xmm3[4,5,6,7] -; SSE42-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm5[4,5],xmm3[6,7] -; SSE42-NEXT: pshufd {{.*#+}} xmm5 = xmm5[2,3,0,1] -; SSE42-NEXT: pshufd {{.*#+}} xmm7 = xmm7[2,3,2,3] -; SSE42-NEXT: pblendw {{.*#+}} xmm7 = xmm7[0,1],xmm5[2,3],xmm7[4,5,6,7] -; SSE42-NEXT: pshufd {{.*#+}} xmm5 = xmm6[2,2,3,3] -; SSE42-NEXT: pblendw {{.*#+}} xmm5 = xmm7[0,1,2,3],xmm5[4,5],xmm7[6,7] -; SSE42-NEXT: pshufd {{.*#+}} xmm6 = xmm1[0,0,1,1] -; SSE42-NEXT: pshufd {{.*#+}} xmm7 = xmm2[0,1,0,1] -; SSE42-NEXT: pblendw {{.*#+}} xmm7 = xmm7[0,1],xmm6[2,3],xmm7[4,5,6,7] -; SSE42-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,1,0,1] -; SSE42-NEXT: pblendw {{.*#+}} xmm6 = xmm7[0,1,2,3],xmm6[4,5],xmm7[6,7] -; SSE42-NEXT: pshufd {{.*#+}} xmm7 = xmm1[1,1,2,2] -; SSE42-NEXT: pblendw {{.*#+}} xmm7 = xmm7[0,1],xmm4[2,3],xmm7[4,5,6,7] -; SSE42-NEXT: pblendw {{.*#+}} xmm7 = xmm7[0,1,2,3],xmm2[4,5],xmm7[6,7] -; SSE42-NEXT: pshufd {{.*#+}} xmm2 = 
xmm2[2,3,0,1] -; SSE42-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,3,2,3] -; SSE42-NEXT: pblendw {{.*#+}} xmm4 = xmm4[0,1],xmm2[2,3],xmm4[4,5,6,7] -; SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,2,3,3] -; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm4[0,1,2,3],xmm1[4,5],xmm4[6,7] -; SSE42-NEXT: movdqu %xmm1, 80(%rdi) -; SSE42-NEXT: movdqu %xmm7, 64(%rdi) -; SSE42-NEXT: movdqu %xmm6, 48(%rdi) -; SSE42-NEXT: movdqu %xmm5, 32(%rdi) -; SSE42-NEXT: movdqu %xmm3, 16(%rdi) -; SSE42-NEXT: movdqu %xmm0, (%rdi) +; SSE42-NEXT: movdqu (%rsi), %xmm8 +; SSE42-NEXT: movdqu 16(%rsi), %xmm4 +; SSE42-NEXT: movdqu (%rdx), %xmm2 +; SSE42-NEXT: movdqu 16(%rdx), %xmm5 +; SSE42-NEXT: movdqu (%rcx), %xmm3 +; SSE42-NEXT: movdqu 16(%rcx), %xmm6 +; SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,0,1,1] +; SSE42-NEXT: pshufd {{.*#+}} xmm7 = xmm8[0,1,0,1] +; SSE42-NEXT: pblendw {{.*#+}} xmm7 = xmm7[0,1],xmm1[2,3],xmm7[4,5,6,7] +; SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm3[0,1,0,1] +; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm7[0,1,2,3],xmm1[4,5],xmm7[6,7] +; SSE42-NEXT: pshufd {{.*#+}} xmm7 = xmm4[2,3,0,1] +; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm6[2,3,2,3] +; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm7[2,3],xmm0[4,5,6,7] +; SSE42-NEXT: pshufd {{.*#+}} xmm7 = xmm5[2,2,3,3] +; SSE42-NEXT: pblendw {{.*#+}} xmm7 = xmm0[0,1,2,3],xmm7[4,5],xmm0[6,7] +; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm5[1,1,2,2] +; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm6[2,3],xmm0[4,5,6,7] +; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm4[4,5],xmm0[6,7] +; SSE42-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,0,1,1] +; SSE42-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,1,0,1] +; SSE42-NEXT: pblendw {{.*#+}} xmm4 = xmm4[0,1],xmm5[2,3],xmm4[4,5,6,7] +; SSE42-NEXT: pshufd {{.*#+}} xmm5 = xmm6[0,1,0,1] +; SSE42-NEXT: pblendw {{.*#+}} xmm5 = xmm4[0,1,2,3],xmm5[4,5],xmm4[6,7] +; SSE42-NEXT: pshufd {{.*#+}} xmm4 = xmm8[2,3,0,1] +; SSE42-NEXT: pshufd {{.*#+}} xmm6 = xmm3[2,3,2,3] +; SSE42-NEXT: pblendw {{.*#+}} xmm6 = xmm6[0,1],xmm4[2,3],xmm6[4,5,6,7] +; SSE42-NEXT: pshufd {{.*#+}} xmm4 = xmm2[2,2,3,3] +; SSE42-NEXT: pblendw {{.*#+}} xmm4 = xmm6[0,1,2,3],xmm4[4,5],xmm6[6,7] +; SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,2,2] +; SSE42-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3],xmm2[4,5,6,7] +; SSE42-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm8[4,5],xmm2[6,7] +; SSE42-NEXT: movdqu %xmm2, 16(%rdi) +; SSE42-NEXT: movdqu %xmm4, 32(%rdi) +; SSE42-NEXT: movdqu %xmm5, 48(%rdi) +; SSE42-NEXT: movdqu %xmm0, 64(%rdi) +; SSE42-NEXT: movdqu %xmm7, 80(%rdi) +; SSE42-NEXT: movdqu %xmm1, (%rdi) ; SSE42-NEXT: retq ; ; AVX1-LABEL: interleave_24i32_in: ; AVX1: # %bb.0: ; AVX1-NEXT: vmovupd (%rsi), %ymm0 ; AVX1-NEXT: vmovupd (%rcx), %ymm1 -; AVX1-NEXT: vmovups (%rdx), %xmm2 -; AVX1-NEXT: vmovups 16(%rdx), %xmm3 -; AVX1-NEXT: vmovups (%rsi), %xmm4 -; AVX1-NEXT: vshufps {{.*#+}} xmm5 = xmm4[2,0],xmm2[2,0] -; AVX1-NEXT: vshufps {{.*#+}} xmm5 = xmm2[1,1],xmm5[0,2] -; AVX1-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,0],xmm4[0,0] -; AVX1-NEXT: vshufps {{.*#+}} xmm2 = xmm2[2,0],xmm4[2,1] +; AVX1-NEXT: vmovups 16(%rcx), %xmm2 +; AVX1-NEXT: vmovups (%rdx), %xmm3 +; AVX1-NEXT: vmovups 16(%rdx), %xmm4 +; AVX1-NEXT: vshufps {{.*#+}} xmm5 = xmm4[3,0],xmm2[3,0] +; AVX1-NEXT: vshufps {{.*#+}} xmm5 = xmm2[2,1],xmm5[0,2] +; AVX1-NEXT: vshufps {{.*#+}} xmm2 = xmm2[1,0],xmm4[1,0] +; AVX1-NEXT: vshufps {{.*#+}} xmm2 = xmm2[2,0],xmm4[2,2] ; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm2, %ymm2 -; AVX1-NEXT: vmovddup {{.*#+}} xmm4 = xmm1[0,0] -; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm4, %ymm4 -; AVX1-NEXT: vblendps {{.*#+}} 
ymm2 = ymm2[0,1],ymm4[2],ymm2[3,4],ymm4[5],ymm2[6,7] -; AVX1-NEXT: vmovups 16(%rcx), %xmm4 -; AVX1-NEXT: vshufps {{.*#+}} xmm5 = xmm3[3,0],xmm4[3,0] -; AVX1-NEXT: vshufps {{.*#+}} xmm5 = xmm4[2,1],xmm5[0,2] -; AVX1-NEXT: vshufps {{.*#+}} xmm4 = xmm4[1,0],xmm3[1,0] -; AVX1-NEXT: vshufps {{.*#+}} xmm3 = xmm4[2,0],xmm3[2,2] -; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm3, %ymm3 ; AVX1-NEXT: vpermilpd {{.*#+}} ymm4 = ymm0[1,1,3,3] ; AVX1-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm4[2,3,2,3] +; AVX1-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm4[2],ymm2[3,4],ymm4[5],ymm2[6,7] +; AVX1-NEXT: vmovups (%rsi), %xmm4 +; AVX1-NEXT: vshufps {{.*#+}} xmm5 = xmm4[2,0],xmm3[2,0] +; AVX1-NEXT: vshufps {{.*#+}} xmm5 = xmm3[1,1],xmm5[0,2] +; AVX1-NEXT: vshufps {{.*#+}} xmm3 = xmm3[0,0],xmm4[0,0] +; AVX1-NEXT: vshufps {{.*#+}} xmm3 = xmm3[2,0],xmm4[2,1] +; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm3, %ymm3 +; AVX1-NEXT: vmovddup {{.*#+}} xmm4 = xmm1[0,0] +; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm4, %ymm4 ; AVX1-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm4[2],ymm3[3,4],ymm4[5],ymm3[6,7] ; AVX1-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[1,0,2,2] ; AVX1-NEXT: vpermilpd {{.*#+}} ymm1 = ymm1[1,1,2,2] @@ -1609,8 +1609,8 @@ define void @interleave_24i32_in(<24 x i32>* %p, <8 x i32>* %q1, <8 x i32>* %q2, ; AVX1-NEXT: vpermilps {{.*#+}} ymm1 = mem[0,0,3,3,4,4,7,7] ; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] ; AVX1-NEXT: vmovups %ymm0, 32(%rdi) -; AVX1-NEXT: vmovups %ymm3, 64(%rdi) -; AVX1-NEXT: vmovups %ymm2, (%rdi) +; AVX1-NEXT: vmovups %ymm3, (%rdi) +; AVX1-NEXT: vmovups %ymm2, 64(%rdi) ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; @@ -1653,19 +1653,19 @@ define void @interleave_24i32_in(<24 x i32>* %p, <8 x i32>* %q1, <8 x i32>* %q2, ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2,3],ymm4[4],ymm3[5,6],ymm4[7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm4 = ymm0[0,3,3,3] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm4[2],ymm3[3,4],ymm4[5],ymm3[6,7] -; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm4 = [1,0,2,2,1,0,2,2] -; AVX2-FAST-NEXT: vpermps %ymm1, %ymm4, %ymm4 -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm5 = ymm0[0,0,2,1] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0],ymm4[1],ymm5[2,3],ymm4[4],ymm5[5,6],ymm4[7] -; AVX2-FAST-NEXT: vbroadcastsd (%rcx), %ymm5 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm5[2],ymm4[3,4],ymm5[5],ymm4[6,7] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[1,1,2,2] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm4 = ymm0[1,1,2,2] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[1,1,2,2] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0],ymm0[1],ymm2[2,3],ymm0[4],ymm2[5,6],ymm0[7] -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,0,3,3,4,4,7,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm4[1],ymm2[2,3],ymm4[4],ymm2[5,6],ymm4[7] +; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm4 = ymm1[0,0,3,3,4,4,7,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm4[2],ymm2[3,4],ymm4[5],ymm2[6,7] +; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm4 = [1,0,2,2,1,0,2,2] +; AVX2-FAST-NEXT: vpermps %ymm1, %ymm4, %ymm1 +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,0,2,1] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7] +; AVX2-FAST-NEXT: vbroadcastsd (%rcx), %ymm1 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] -; AVX2-FAST-NEXT: vmovups %ymm0, 32(%rdi) -; AVX2-FAST-NEXT: vmovups %ymm4, (%rdi) +; AVX2-FAST-NEXT: vmovups %ymm0, (%rdi) +; AVX2-FAST-NEXT: vmovups %ymm2, 32(%rdi) ; 
AVX2-FAST-NEXT: vmovups %ymm3, 64(%rdi) ; AVX2-FAST-NEXT: vzeroupper ; AVX2-FAST-NEXT: retq @@ -1674,32 +1674,32 @@ define void @interleave_24i32_in(<24 x i32>* %p, <8 x i32>* %q1, <8 x i32>* %q2, ; XOP: # %bb.0: ; XOP-NEXT: vmovupd (%rsi), %ymm0 ; XOP-NEXT: vmovups (%rcx), %ymm1 -; XOP-NEXT: vmovups (%rdx), %xmm2 -; XOP-NEXT: vmovups 16(%rdx), %xmm3 -; XOP-NEXT: vmovups (%rsi), %xmm4 -; XOP-NEXT: vshufps {{.*#+}} xmm5 = xmm4[2,0],xmm2[2,0] -; XOP-NEXT: vshufps {{.*#+}} xmm5 = xmm2[1,1],xmm5[0,2] -; XOP-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,0],xmm4[0,0] -; XOP-NEXT: vshufps {{.*#+}} xmm2 = xmm2[2,0],xmm4[2,1] +; XOP-NEXT: vmovups 16(%rcx), %xmm2 +; XOP-NEXT: vmovups (%rdx), %xmm3 +; XOP-NEXT: vmovups 16(%rdx), %xmm4 +; XOP-NEXT: vshufps {{.*#+}} xmm5 = xmm4[3,0],xmm2[3,0] +; XOP-NEXT: vshufps {{.*#+}} xmm5 = xmm2[2,1],xmm5[0,2] +; XOP-NEXT: vshufps {{.*#+}} xmm2 = xmm2[1,0],xmm4[1,0] +; XOP-NEXT: vshufps {{.*#+}} xmm2 = xmm2[2,0],xmm4[2,2] ; XOP-NEXT: vinsertf128 $1, %xmm5, %ymm2, %ymm2 -; XOP-NEXT: vmovddup {{.*#+}} xmm4 = xmm1[0,0] -; XOP-NEXT: vinsertf128 $1, %xmm4, %ymm4, %ymm4 -; XOP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm4[2],ymm2[3,4],ymm4[5],ymm2[6,7] -; XOP-NEXT: vmovups 16(%rcx), %xmm4 -; XOP-NEXT: vshufps {{.*#+}} xmm5 = xmm3[3,0],xmm4[3,0] -; XOP-NEXT: vshufps {{.*#+}} xmm5 = xmm4[2,1],xmm5[0,2] -; XOP-NEXT: vshufps {{.*#+}} xmm4 = xmm4[1,0],xmm3[1,0] -; XOP-NEXT: vshufps {{.*#+}} xmm3 = xmm4[2,0],xmm3[2,2] -; XOP-NEXT: vinsertf128 $1, %xmm5, %ymm3, %ymm3 ; XOP-NEXT: vpermilpd {{.*#+}} ymm4 = ymm0[1,1,3,3] ; XOP-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm4[2,3,2,3] +; XOP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm4[2],ymm2[3,4],ymm4[5],ymm2[6,7] +; XOP-NEXT: vmovups (%rsi), %xmm4 +; XOP-NEXT: vshufps {{.*#+}} xmm5 = xmm4[2,0],xmm3[2,0] +; XOP-NEXT: vshufps {{.*#+}} xmm5 = xmm3[1,1],xmm5[0,2] +; XOP-NEXT: vshufps {{.*#+}} xmm3 = xmm3[0,0],xmm4[0,0] +; XOP-NEXT: vshufps {{.*#+}} xmm3 = xmm3[2,0],xmm4[2,1] +; XOP-NEXT: vinsertf128 $1, %xmm5, %ymm3, %ymm3 +; XOP-NEXT: vmovddup {{.*#+}} xmm4 = xmm1[0,0] +; XOP-NEXT: vinsertf128 $1, %xmm4, %ymm4, %ymm4 ; XOP-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm4[2],ymm3[3,4],ymm4[5],ymm3[6,7] ; XOP-NEXT: vpermil2ps {{.*#+}} ymm0 = ymm1[2],ymm0[3],ymm1[2,3],ymm0[4],ymm1[5,4],ymm0[5] ; XOP-NEXT: vpermilps {{.*#+}} ymm1 = mem[0,0,3,3,4,4,7,7] ; XOP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] ; XOP-NEXT: vmovups %ymm0, 32(%rdi) -; XOP-NEXT: vmovups %ymm3, 64(%rdi) -; XOP-NEXT: vmovups %ymm2, (%rdi) +; XOP-NEXT: vmovups %ymm3, (%rdi) +; XOP-NEXT: vmovups %ymm2, 64(%rdi) ; XOP-NEXT: vzeroupper ; XOP-NEXT: retq %s1 = load <8 x i32>, <8 x i32>* %q1, align 4 diff --git a/test/CodeGen/X86/pr40631_deadstore_elision.ll b/test/CodeGen/X86/pr40631_deadstore_elision.ll index c742ce4bd94..f330e0f1578 100644 --- a/test/CodeGen/X86/pr40631_deadstore_elision.ll +++ b/test/CodeGen/X86/pr40631_deadstore_elision.ll @@ -12,13 +12,12 @@ define i32 @ipt_do_table(%struct.sk_buff* noalias nocapture readonly) { ; CHECK-NEXT: movq (%rdi), %rax ; CHECK-NEXT: xorps %xmm0, %xmm0 ; CHECK-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; CHECK-NEXT: movq $0, {{[0-9]+}}(%rsp) +; CHECK-NEXT: movq $170, {{[0-9]+}}(%rsp) ; CHECK-NEXT: movaps {{.*#+}} xmm0 = [12297829382473034410,12297829382473034410] ; CHECK-NEXT: movaps %xmm0, (%rsp) ; CHECK-NEXT: movabsq $-6148914691236517206, %rcx # imm = 0xAAAAAAAAAAAAAAAA ; CHECK-NEXT: movq %rcx, {{[0-9]+}}(%rsp) ; CHECK-NEXT: movq %rcx, {{[0-9]+}}(%rsp) -; CHECK-NEXT: movb $-86, {{[0-9]+}}(%rsp) ; CHECK-NEXT: movzwl 
2(%rax), %ecx ; CHECK-NEXT: andl $8191, %ecx # imm = 0x1FFF ; CHECK-NEXT: movl %ecx, {{[0-9]+}}(%rsp) diff --git a/test/CodeGen/X86/rotate.ll b/test/CodeGen/X86/rotate.ll index 0d92e267f98..d2ecc28d01d 100644 --- a/test/CodeGen/X86/rotate.ll +++ b/test/CodeGen/X86/rotate.ll @@ -572,8 +572,8 @@ define void @rotr1_64_mem(i64* %Aptr) nounwind { ; X86-NEXT: movl %edx, %esi ; X86-NEXT: shldl $31, %ecx, %esi ; X86-NEXT: shldl $31, %edx, %ecx -; X86-NEXT: movl %ecx, 4(%eax) ; X86-NEXT: movl %esi, (%eax) +; X86-NEXT: movl %ecx, 4(%eax) ; X86-NEXT: popl %esi ; X86-NEXT: retl ; diff --git a/test/CodeGen/X86/rotate4.ll b/test/CodeGen/X86/rotate4.ll index 7f5e426e999..fa7f550fd11 100644 --- a/test/CodeGen/X86/rotate4.ll +++ b/test/CodeGen/X86/rotate4.ll @@ -244,32 +244,32 @@ define void @rotate_left_m64(i64 *%pa, i64 %b) { ; X86-NEXT: .cfi_offset %ebp, -8 ; X86-NEXT: movb {{[0-9]+}}(%esp), %cl ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl (%eax), %edx +; X86-NEXT: movl (%eax), %esi ; X86-NEXT: movl 4(%eax), %ebx -; X86-NEXT: movl %edx, %esi -; X86-NEXT: shll %cl, %esi +; X86-NEXT: movl %esi, %edx +; X86-NEXT: shll %cl, %edx ; X86-NEXT: movl %ebx, %edi -; X86-NEXT: shldl %cl, %edx, %edi +; X86-NEXT: shldl %cl, %esi, %edi ; X86-NEXT: testb $32, %cl ; X86-NEXT: je .LBB6_2 ; X86-NEXT: # %bb.1: -; X86-NEXT: movl %esi, %edi -; X86-NEXT: xorl %esi, %esi +; X86-NEXT: movl %edx, %edi +; X86-NEXT: xorl %edx, %edx ; X86-NEXT: .LBB6_2: ; X86-NEXT: negb %cl ; X86-NEXT: movl %ebx, %ebp ; X86-NEXT: shrl %cl, %ebp -; X86-NEXT: shrdl %cl, %ebx, %edx +; X86-NEXT: shrdl %cl, %ebx, %esi ; X86-NEXT: testb $32, %cl ; X86-NEXT: je .LBB6_4 ; X86-NEXT: # %bb.3: -; X86-NEXT: movl %ebp, %edx +; X86-NEXT: movl %ebp, %esi ; X86-NEXT: xorl %ebp, %ebp ; X86-NEXT: .LBB6_4: +; X86-NEXT: orl %esi, %edx ; X86-NEXT: orl %ebp, %edi -; X86-NEXT: orl %edx, %esi +; X86-NEXT: movl %edx, (%eax) ; X86-NEXT: movl %edi, 4(%eax) -; X86-NEXT: movl %esi, (%eax) ; X86-NEXT: popl %esi ; X86-NEXT: .cfi_def_cfa_offset 16 ; X86-NEXT: popl %edi @@ -336,10 +336,10 @@ define void @rotate_right_m64(i64 *%pa, i64 %b) { ; X86-NEXT: movl %ebp, %esi ; X86-NEXT: xorl %ebp, %ebp ; X86-NEXT: .LBB7_4: -; X86-NEXT: orl %esi, %edx ; X86-NEXT: orl %ebp, %edi -; X86-NEXT: movl %edx, 4(%eax) +; X86-NEXT: orl %esi, %edx ; X86-NEXT: movl %edi, (%eax) +; X86-NEXT: movl %edx, 4(%eax) ; X86-NEXT: popl %esi ; X86-NEXT: .cfi_def_cfa_offset 16 ; X86-NEXT: popl %edi diff --git a/test/CodeGen/X86/sadd_sat_vec.ll b/test/CodeGen/X86/sadd_sat_vec.ll index 320cc076e5f..186141aa6ae 100644 --- a/test/CodeGen/X86/sadd_sat_vec.ll +++ b/test/CodeGen/X86/sadd_sat_vec.ll @@ -460,10 +460,10 @@ define void @v12i16(<12 x i16>* %px, <12 x i16>* %py, <12 x i16>* %pz) nounwind ; AVX1: # %bb.0: ; AVX1-NEXT: vmovdqa (%rdi), %xmm0 ; AVX1-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX1-NEXT: vpaddsw (%rsi), %xmm0, %xmm0 ; AVX1-NEXT: vpaddsw 16(%rsi), %xmm1, %xmm1 -; AVX1-NEXT: vmovq %xmm1, 16(%rdx) +; AVX1-NEXT: vpaddsw (%rsi), %xmm0, %xmm0 ; AVX1-NEXT: vmovdqa %xmm0, (%rdx) +; AVX1-NEXT: vmovq %xmm1, 16(%rdx) ; AVX1-NEXT: retq ; ; AVX2-LABEL: v12i16: diff --git a/test/CodeGen/X86/shift-and.ll b/test/CodeGen/X86/shift-and.ll index fc8eb2ff0ce..00dc6358ed0 100644 --- a/test/CodeGen/X86/shift-and.ll +++ b/test/CodeGen/X86/shift-and.ll @@ -144,8 +144,8 @@ define void @t5ptr(i64 %t, i64* %ptr) nounwind { ; X32-NEXT: movl %esi, %edx ; X32-NEXT: xorl %esi, %esi ; X32-NEXT: .LBB5_2: -; X32-NEXT: movl %esi, 4(%eax) ; X32-NEXT: movl %edx, (%eax) +; X32-NEXT: movl %esi, 4(%eax) ; X32-NEXT: popl %esi ; 
X32-NEXT: popl %edi ; X32-NEXT: retl diff --git a/test/CodeGen/X86/shrink_vmul-widen.ll b/test/CodeGen/X86/shrink_vmul-widen.ll index d0fad23394b..0ed79ea4af7 100644 --- a/test/CodeGen/X86/shrink_vmul-widen.ll +++ b/test/CodeGen/X86/shrink_vmul-widen.ll @@ -746,18 +746,18 @@ define void @mul_16xi16(i8* nocapture readonly %a, i8* nocapture readonly %b, i6 ; X86-SSE-NEXT: pmulhuw %xmm0, %xmm4 ; X86-SSE-NEXT: pmullw %xmm0, %xmm2 ; X86-SSE-NEXT: movdqa %xmm2, %xmm0 -; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3] -; X86-SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7] +; X86-SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7] +; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3] ; X86-SSE-NEXT: movdqa %xmm3, %xmm4 ; X86-SSE-NEXT: pmulhuw %xmm1, %xmm4 ; X86-SSE-NEXT: pmullw %xmm1, %xmm3 ; X86-SSE-NEXT: movdqa %xmm3, %xmm1 -; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3] -; X86-SSE-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] -; X86-SSE-NEXT: movdqu %xmm3, 48(%esi,%ecx,4) -; X86-SSE-NEXT: movdqu %xmm1, 32(%esi,%ecx,4) -; X86-SSE-NEXT: movdqu %xmm2, 16(%esi,%ecx,4) -; X86-SSE-NEXT: movdqu %xmm0, (%esi,%ecx,4) +; X86-SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7] +; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] +; X86-SSE-NEXT: movdqu %xmm3, 32(%esi,%ecx,4) +; X86-SSE-NEXT: movdqu %xmm1, 48(%esi,%ecx,4) +; X86-SSE-NEXT: movdqu %xmm2, (%esi,%ecx,4) +; X86-SSE-NEXT: movdqu %xmm0, 16(%esi,%ecx,4) ; X86-SSE-NEXT: popl %esi ; X86-SSE-NEXT: retl ; @@ -818,18 +818,18 @@ define void @mul_16xi16(i8* nocapture readonly %a, i8* nocapture readonly %b, i6 ; X64-SSE-NEXT: pmulhuw %xmm0, %xmm4 ; X64-SSE-NEXT: pmullw %xmm0, %xmm2 ; X64-SSE-NEXT: movdqa %xmm2, %xmm0 -; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3] -; X64-SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7] +; X64-SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7] +; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3] ; X64-SSE-NEXT: movdqa %xmm3, %xmm4 ; X64-SSE-NEXT: pmulhuw %xmm1, %xmm4 ; X64-SSE-NEXT: pmullw %xmm1, %xmm3 ; X64-SSE-NEXT: movdqa %xmm3, %xmm1 -; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3] -; X64-SSE-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] -; X64-SSE-NEXT: movdqu %xmm3, 48(%rax,%rdx,4) -; X64-SSE-NEXT: movdqu %xmm1, 32(%rax,%rdx,4) -; X64-SSE-NEXT: movdqu %xmm2, 16(%rax,%rdx,4) -; X64-SSE-NEXT: movdqu %xmm0, (%rax,%rdx,4) +; X64-SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7] +; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] +; X64-SSE-NEXT: movdqu %xmm3, 32(%rax,%rdx,4) +; X64-SSE-NEXT: movdqu %xmm1, 48(%rax,%rdx,4) +; X64-SSE-NEXT: movdqu %xmm2, (%rax,%rdx,4) +; X64-SSE-NEXT: movdqu %xmm0, 16(%rax,%rdx,4) ; X64-SSE-NEXT: retq ; ; X64-AVX1-LABEL: mul_16xi16: @@ -1262,18 +1262,18 @@ 
define void @mul_16xi16_sext(i8* nocapture readonly %a, i8* nocapture readonly % ; X86-SSE-NEXT: pmulhw %xmm0, %xmm4 ; X86-SSE-NEXT: pmullw %xmm0, %xmm2 ; X86-SSE-NEXT: movdqa %xmm2, %xmm0 -; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3] -; X86-SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7] +; X86-SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7] +; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3] ; X86-SSE-NEXT: movdqa %xmm3, %xmm4 ; X86-SSE-NEXT: pmulhw %xmm1, %xmm4 ; X86-SSE-NEXT: pmullw %xmm1, %xmm3 ; X86-SSE-NEXT: movdqa %xmm3, %xmm1 -; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3] -; X86-SSE-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] -; X86-SSE-NEXT: movdqu %xmm3, 48(%esi,%ecx,4) -; X86-SSE-NEXT: movdqu %xmm1, 32(%esi,%ecx,4) -; X86-SSE-NEXT: movdqu %xmm2, 16(%esi,%ecx,4) -; X86-SSE-NEXT: movdqu %xmm0, (%esi,%ecx,4) +; X86-SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7] +; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] +; X86-SSE-NEXT: movdqu %xmm3, 32(%esi,%ecx,4) +; X86-SSE-NEXT: movdqu %xmm1, 48(%esi,%ecx,4) +; X86-SSE-NEXT: movdqu %xmm2, (%esi,%ecx,4) +; X86-SSE-NEXT: movdqu %xmm0, 16(%esi,%ecx,4) ; X86-SSE-NEXT: popl %esi ; X86-SSE-NEXT: retl ; @@ -1334,18 +1334,18 @@ define void @mul_16xi16_sext(i8* nocapture readonly %a, i8* nocapture readonly % ; X64-SSE-NEXT: pmulhw %xmm0, %xmm4 ; X64-SSE-NEXT: pmullw %xmm0, %xmm2 ; X64-SSE-NEXT: movdqa %xmm2, %xmm0 -; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3] -; X64-SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7] +; X64-SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7] +; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3] ; X64-SSE-NEXT: movdqa %xmm3, %xmm4 ; X64-SSE-NEXT: pmulhw %xmm1, %xmm4 ; X64-SSE-NEXT: pmullw %xmm1, %xmm3 ; X64-SSE-NEXT: movdqa %xmm3, %xmm1 -; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3] -; X64-SSE-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] -; X64-SSE-NEXT: movdqu %xmm3, 48(%rax,%rdx,4) -; X64-SSE-NEXT: movdqu %xmm1, 32(%rax,%rdx,4) -; X64-SSE-NEXT: movdqu %xmm2, 16(%rax,%rdx,4) -; X64-SSE-NEXT: movdqu %xmm0, (%rax,%rdx,4) +; X64-SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7] +; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] +; X64-SSE-NEXT: movdqu %xmm3, 32(%rax,%rdx,4) +; X64-SSE-NEXT: movdqu %xmm1, 48(%rax,%rdx,4) +; X64-SSE-NEXT: movdqu %xmm2, (%rax,%rdx,4) +; X64-SSE-NEXT: movdqu %xmm0, 16(%rax,%rdx,4) ; X64-SSE-NEXT: retq ; ; X64-AVX1-LABEL: mul_16xi16_sext: diff --git a/test/CodeGen/X86/shrink_vmul.ll b/test/CodeGen/X86/shrink_vmul.ll index 8a8a396775b..85ffce8fc39 100644 --- a/test/CodeGen/X86/shrink_vmul.ll +++ b/test/CodeGen/X86/shrink_vmul.ll @@ -740,18 +740,18 @@ define void @mul_16xi16(i8* nocapture readonly %a, i8* nocapture readonly %b, i6 ; 
X86-SSE-NEXT: pmulhuw %xmm0, %xmm4 ; X86-SSE-NEXT: pmullw %xmm0, %xmm2 ; X86-SSE-NEXT: movdqa %xmm2, %xmm0 -; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3] -; X86-SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7] +; X86-SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7] +; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3] ; X86-SSE-NEXT: movdqa %xmm3, %xmm4 ; X86-SSE-NEXT: pmulhuw %xmm1, %xmm4 ; X86-SSE-NEXT: pmullw %xmm1, %xmm3 ; X86-SSE-NEXT: movdqa %xmm3, %xmm1 -; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3] -; X86-SSE-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] -; X86-SSE-NEXT: movdqu %xmm3, 48(%esi,%ecx,4) -; X86-SSE-NEXT: movdqu %xmm1, 32(%esi,%ecx,4) -; X86-SSE-NEXT: movdqu %xmm2, 16(%esi,%ecx,4) -; X86-SSE-NEXT: movdqu %xmm0, (%esi,%ecx,4) +; X86-SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7] +; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] +; X86-SSE-NEXT: movdqu %xmm3, 32(%esi,%ecx,4) +; X86-SSE-NEXT: movdqu %xmm1, 48(%esi,%ecx,4) +; X86-SSE-NEXT: movdqu %xmm2, (%esi,%ecx,4) +; X86-SSE-NEXT: movdqu %xmm0, 16(%esi,%ecx,4) ; X86-SSE-NEXT: popl %esi ; X86-SSE-NEXT: retl ; @@ -812,18 +812,18 @@ define void @mul_16xi16(i8* nocapture readonly %a, i8* nocapture readonly %b, i6 ; X64-SSE-NEXT: pmulhuw %xmm0, %xmm4 ; X64-SSE-NEXT: pmullw %xmm0, %xmm2 ; X64-SSE-NEXT: movdqa %xmm2, %xmm0 -; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3] -; X64-SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7] +; X64-SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7] +; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3] ; X64-SSE-NEXT: movdqa %xmm3, %xmm4 ; X64-SSE-NEXT: pmulhuw %xmm1, %xmm4 ; X64-SSE-NEXT: pmullw %xmm1, %xmm3 ; X64-SSE-NEXT: movdqa %xmm3, %xmm1 -; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3] -; X64-SSE-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] -; X64-SSE-NEXT: movdqu %xmm3, 48(%rax,%rdx,4) -; X64-SSE-NEXT: movdqu %xmm1, 32(%rax,%rdx,4) -; X64-SSE-NEXT: movdqu %xmm2, 16(%rax,%rdx,4) -; X64-SSE-NEXT: movdqu %xmm0, (%rax,%rdx,4) +; X64-SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7] +; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] +; X64-SSE-NEXT: movdqu %xmm3, 32(%rax,%rdx,4) +; X64-SSE-NEXT: movdqu %xmm1, 48(%rax,%rdx,4) +; X64-SSE-NEXT: movdqu %xmm2, (%rax,%rdx,4) +; X64-SSE-NEXT: movdqu %xmm0, 16(%rax,%rdx,4) ; X64-SSE-NEXT: retq ; ; X64-AVX1-LABEL: mul_16xi16: @@ -1240,18 +1240,18 @@ define void @mul_16xi16_sext(i8* nocapture readonly %a, i8* nocapture readonly % ; X86-SSE-NEXT: pmulhw %xmm0, %xmm4 ; X86-SSE-NEXT: pmullw %xmm0, %xmm2 ; X86-SSE-NEXT: movdqa %xmm2, %xmm0 -; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3] -; X86-SSE-NEXT: punpckhwd {{.*#+}} xmm2 = 
xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7] +; X86-SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7] +; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3] ; X86-SSE-NEXT: movdqa %xmm3, %xmm4 ; X86-SSE-NEXT: pmulhw %xmm1, %xmm4 ; X86-SSE-NEXT: pmullw %xmm1, %xmm3 ; X86-SSE-NEXT: movdqa %xmm3, %xmm1 -; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3] -; X86-SSE-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] -; X86-SSE-NEXT: movdqu %xmm3, 48(%esi,%ecx,4) -; X86-SSE-NEXT: movdqu %xmm1, 32(%esi,%ecx,4) -; X86-SSE-NEXT: movdqu %xmm2, 16(%esi,%ecx,4) -; X86-SSE-NEXT: movdqu %xmm0, (%esi,%ecx,4) +; X86-SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7] +; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] +; X86-SSE-NEXT: movdqu %xmm3, 32(%esi,%ecx,4) +; X86-SSE-NEXT: movdqu %xmm1, 48(%esi,%ecx,4) +; X86-SSE-NEXT: movdqu %xmm2, (%esi,%ecx,4) +; X86-SSE-NEXT: movdqu %xmm0, 16(%esi,%ecx,4) ; X86-SSE-NEXT: popl %esi ; X86-SSE-NEXT: retl ; @@ -1312,18 +1312,18 @@ define void @mul_16xi16_sext(i8* nocapture readonly %a, i8* nocapture readonly % ; X64-SSE-NEXT: pmulhw %xmm0, %xmm4 ; X64-SSE-NEXT: pmullw %xmm0, %xmm2 ; X64-SSE-NEXT: movdqa %xmm2, %xmm0 -; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3] -; X64-SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7] +; X64-SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7] +; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3] ; X64-SSE-NEXT: movdqa %xmm3, %xmm4 ; X64-SSE-NEXT: pmulhw %xmm1, %xmm4 ; X64-SSE-NEXT: pmullw %xmm1, %xmm3 ; X64-SSE-NEXT: movdqa %xmm3, %xmm1 -; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3] -; X64-SSE-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] -; X64-SSE-NEXT: movdqu %xmm3, 48(%rax,%rdx,4) -; X64-SSE-NEXT: movdqu %xmm1, 32(%rax,%rdx,4) -; X64-SSE-NEXT: movdqu %xmm2, 16(%rax,%rdx,4) -; X64-SSE-NEXT: movdqu %xmm0, (%rax,%rdx,4) +; X64-SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7] +; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] +; X64-SSE-NEXT: movdqu %xmm3, 32(%rax,%rdx,4) +; X64-SSE-NEXT: movdqu %xmm1, 48(%rax,%rdx,4) +; X64-SSE-NEXT: movdqu %xmm2, (%rax,%rdx,4) +; X64-SSE-NEXT: movdqu %xmm0, 16(%rax,%rdx,4) ; X64-SSE-NEXT: retq ; ; X64-AVX1-LABEL: mul_16xi16_sext: diff --git a/test/CodeGen/X86/ssub_sat_vec.ll b/test/CodeGen/X86/ssub_sat_vec.ll index 5ba7b173c51..b9adde1226e 100644 --- a/test/CodeGen/X86/ssub_sat_vec.ll +++ b/test/CodeGen/X86/ssub_sat_vec.ll @@ -460,10 +460,10 @@ define void @v12i16(<12 x i16>* %px, <12 x i16>* %py, <12 x i16>* %pz) nounwind ; AVX1: # %bb.0: ; AVX1-NEXT: vmovdqa (%rdi), %xmm0 ; AVX1-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX1-NEXT: vpsubsw (%rsi), %xmm0, %xmm0 ; AVX1-NEXT: vpsubsw 16(%rsi), %xmm1, %xmm1 -; AVX1-NEXT: vmovq %xmm1, 16(%rdx) +; AVX1-NEXT: vpsubsw (%rsi), %xmm0, %xmm0 ; AVX1-NEXT: vmovdqa %xmm0, (%rdx) +; AVX1-NEXT: vmovq %xmm1, 16(%rdx) ; AVX1-NEXT: 
retq ; ; AVX2-LABEL: v12i16: diff --git a/test/CodeGen/X86/uadd_sat_vec.ll b/test/CodeGen/X86/uadd_sat_vec.ll index c52c489bcad..83fe8c10989 100644 --- a/test/CodeGen/X86/uadd_sat_vec.ll +++ b/test/CodeGen/X86/uadd_sat_vec.ll @@ -460,10 +460,10 @@ define void @v12i16(<12 x i16>* %px, <12 x i16>* %py, <12 x i16>* %pz) nounwind ; AVX1: # %bb.0: ; AVX1-NEXT: vmovdqa (%rdi), %xmm0 ; AVX1-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX1-NEXT: vpaddusw (%rsi), %xmm0, %xmm0 ; AVX1-NEXT: vpaddusw 16(%rsi), %xmm1, %xmm1 -; AVX1-NEXT: vmovq %xmm1, 16(%rdx) +; AVX1-NEXT: vpaddusw (%rsi), %xmm0, %xmm0 ; AVX1-NEXT: vmovdqa %xmm0, (%rdx) +; AVX1-NEXT: vmovq %xmm1, 16(%rdx) ; AVX1-NEXT: retq ; ; AVX2-LABEL: v12i16: diff --git a/test/CodeGen/X86/usub_sat_vec.ll b/test/CodeGen/X86/usub_sat_vec.ll index 72c0c51ab74..e3d47d2ab6a 100644 --- a/test/CodeGen/X86/usub_sat_vec.ll +++ b/test/CodeGen/X86/usub_sat_vec.ll @@ -460,10 +460,10 @@ define void @v12i16(<12 x i16>* %px, <12 x i16>* %py, <12 x i16>* %pz) nounwind ; AVX1: # %bb.0: ; AVX1-NEXT: vmovdqa (%rdi), %xmm0 ; AVX1-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX1-NEXT: vpsubusw (%rsi), %xmm0, %xmm0 ; AVX1-NEXT: vpsubusw 16(%rsi), %xmm1, %xmm1 -; AVX1-NEXT: vmovq %xmm1, 16(%rdx) +; AVX1-NEXT: vpsubusw (%rsi), %xmm0, %xmm0 ; AVX1-NEXT: vmovdqa %xmm0, (%rdx) +; AVX1-NEXT: vmovq %xmm1, 16(%rdx) ; AVX1-NEXT: retq ; ; AVX2-LABEL: v12i16: diff --git a/test/CodeGen/X86/vastart-defs-eflags.ll b/test/CodeGen/X86/vastart-defs-eflags.ll index 6ef691552aa..00e605ae516 100644 --- a/test/CodeGen/X86/vastart-defs-eflags.ll +++ b/test/CodeGen/X86/vastart-defs-eflags.ll @@ -21,11 +21,11 @@ define i32 @check_flag(i32 %flags, ...) nounwind { ; CHECK-NEXT: movaps %xmm6, {{[0-9]+}}(%rsp) ; CHECK-NEXT: movaps %xmm7, {{[0-9]+}}(%rsp) ; CHECK-NEXT: LBB0_2: ## %entry -; CHECK-NEXT: movq %r9, -{{[0-9]+}}(%rsp) -; CHECK-NEXT: movq %r8, -{{[0-9]+}}(%rsp) -; CHECK-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) -; CHECK-NEXT: movq %rdx, -{{[0-9]+}}(%rsp) ; CHECK-NEXT: movq %rsi, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movq %rdx, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movq %r8, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movq %r9, -{{[0-9]+}}(%rsp) ; CHECK-NEXT: xorl %eax, %eax ; CHECK-NEXT: testl $512, %edi ## imm = 0x200 ; CHECK-NEXT: je LBB0_4 diff --git a/test/CodeGen/X86/vec_fpext.ll b/test/CodeGen/X86/vec_fpext.ll index b66d5d1bfff..3007c8d71dc 100644 --- a/test/CodeGen/X86/vec_fpext.ll +++ b/test/CodeGen/X86/vec_fpext.ll @@ -186,14 +186,14 @@ define void @fpext_frommem8(<8 x float>* %in, <8 x double>* %out) { ; X32-SSE: # %bb.0: # %entry ; X32-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x08] ; X32-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx # encoding: [0x8b,0x4c,0x24,0x04] -; X32-SSE-NEXT: cvtps2pd (%ecx), %xmm0 # encoding: [0x0f,0x5a,0x01] -; X32-SSE-NEXT: cvtps2pd 8(%ecx), %xmm1 # encoding: [0x0f,0x5a,0x49,0x08] -; X32-SSE-NEXT: cvtps2pd 16(%ecx), %xmm2 # encoding: [0x0f,0x5a,0x51,0x10] -; X32-SSE-NEXT: cvtps2pd 24(%ecx), %xmm3 # encoding: [0x0f,0x5a,0x59,0x18] -; X32-SSE-NEXT: movups %xmm3, 48(%eax) # encoding: [0x0f,0x11,0x58,0x30] -; X32-SSE-NEXT: movups %xmm2, 32(%eax) # encoding: [0x0f,0x11,0x50,0x20] -; X32-SSE-NEXT: movups %xmm1, 16(%eax) # encoding: [0x0f,0x11,0x48,0x10] -; X32-SSE-NEXT: movups %xmm0, (%eax) # encoding: [0x0f,0x11,0x00] +; X32-SSE-NEXT: cvtps2pd 8(%ecx), %xmm0 # encoding: [0x0f,0x5a,0x41,0x08] +; X32-SSE-NEXT: cvtps2pd (%ecx), %xmm1 # encoding: [0x0f,0x5a,0x09] +; X32-SSE-NEXT: cvtps2pd 24(%ecx), %xmm2 # encoding: [0x0f,0x5a,0x51,0x18] +; 
X32-SSE-NEXT: cvtps2pd 16(%ecx), %xmm3 # encoding: [0x0f,0x5a,0x59,0x10] +; X32-SSE-NEXT: movups %xmm3, 32(%eax) # encoding: [0x0f,0x11,0x58,0x20] +; X32-SSE-NEXT: movups %xmm2, 48(%eax) # encoding: [0x0f,0x11,0x50,0x30] +; X32-SSE-NEXT: movups %xmm1, (%eax) # encoding: [0x0f,0x11,0x08] +; X32-SSE-NEXT: movups %xmm0, 16(%eax) # encoding: [0x0f,0x11,0x40,0x10] ; X32-SSE-NEXT: retl # encoding: [0xc3] ; ; X32-AVX-LABEL: fpext_frommem8: @@ -218,14 +218,14 @@ define void @fpext_frommem8(<8 x float>* %in, <8 x double>* %out) { ; ; X64-SSE-LABEL: fpext_frommem8: ; X64-SSE: # %bb.0: # %entry -; X64-SSE-NEXT: cvtps2pd (%rdi), %xmm0 # encoding: [0x0f,0x5a,0x07] -; X64-SSE-NEXT: cvtps2pd 8(%rdi), %xmm1 # encoding: [0x0f,0x5a,0x4f,0x08] -; X64-SSE-NEXT: cvtps2pd 16(%rdi), %xmm2 # encoding: [0x0f,0x5a,0x57,0x10] -; X64-SSE-NEXT: cvtps2pd 24(%rdi), %xmm3 # encoding: [0x0f,0x5a,0x5f,0x18] -; X64-SSE-NEXT: movups %xmm3, 48(%rsi) # encoding: [0x0f,0x11,0x5e,0x30] -; X64-SSE-NEXT: movups %xmm2, 32(%rsi) # encoding: [0x0f,0x11,0x56,0x20] -; X64-SSE-NEXT: movups %xmm1, 16(%rsi) # encoding: [0x0f,0x11,0x4e,0x10] -; X64-SSE-NEXT: movups %xmm0, (%rsi) # encoding: [0x0f,0x11,0x06] +; X64-SSE-NEXT: cvtps2pd 8(%rdi), %xmm0 # encoding: [0x0f,0x5a,0x47,0x08] +; X64-SSE-NEXT: cvtps2pd (%rdi), %xmm1 # encoding: [0x0f,0x5a,0x0f] +; X64-SSE-NEXT: cvtps2pd 24(%rdi), %xmm2 # encoding: [0x0f,0x5a,0x57,0x18] +; X64-SSE-NEXT: cvtps2pd 16(%rdi), %xmm3 # encoding: [0x0f,0x5a,0x5f,0x10] +; X64-SSE-NEXT: movups %xmm3, 32(%rsi) # encoding: [0x0f,0x11,0x5e,0x20] +; X64-SSE-NEXT: movups %xmm2, 48(%rsi) # encoding: [0x0f,0x11,0x56,0x30] +; X64-SSE-NEXT: movups %xmm1, (%rsi) # encoding: [0x0f,0x11,0x0e] +; X64-SSE-NEXT: movups %xmm0, 16(%rsi) # encoding: [0x0f,0x11,0x46,0x10] ; X64-SSE-NEXT: retq # encoding: [0xc3] ; ; X64-AVX-LABEL: fpext_frommem8: diff --git a/test/CodeGen/X86/widen_cast-2.ll b/test/CodeGen/X86/widen_cast-2.ll index 03d4700c064..e7780912cd9 100644 --- a/test/CodeGen/X86/widen_cast-2.ll +++ b/test/CodeGen/X86/widen_cast-2.ll @@ -21,9 +21,9 @@ define void @convert(<7 x i32>* %dst, <14 x i16>* %src) nounwind { ; CHECK-NEXT: movdqa 16(%edx,%eax), %xmm2 ; CHECK-NEXT: psubw %xmm0, %xmm1 ; CHECK-NEXT: psubw %xmm0, %xmm2 -; CHECK-NEXT: pextrd $2, %xmm2, 24(%ecx,%eax) -; CHECK-NEXT: pextrd $1, %xmm2, 20(%ecx,%eax) ; CHECK-NEXT: movd %xmm2, 16(%ecx,%eax) +; CHECK-NEXT: pextrd $1, %xmm2, 20(%ecx,%eax) +; CHECK-NEXT: pextrd $2, %xmm2, 24(%ecx,%eax) ; CHECK-NEXT: movdqa %xmm1, (%ecx,%eax) ; CHECK-NEXT: incl (%esp) ; CHECK-NEXT: cmpl $3, (%esp) diff --git a/test/CodeGen/X86/widen_load-2.ll b/test/CodeGen/X86/widen_load-2.ll index 5147db07cb8..23b68b26980 100644 --- a/test/CodeGen/X86/widen_load-2.ll +++ b/test/CodeGen/X86/widen_load-2.ll @@ -47,8 +47,8 @@ define void @add3i32_2(%i32vec3* sret %ret, %i32vec3* %ap, %i32vec3* %bp) { ; X86-NEXT: pinsrd $1, 4(%ecx), %xmm1 ; X86-NEXT: pinsrd $2, 8(%ecx), %xmm1 ; X86-NEXT: paddd %xmm0, %xmm1 -; X86-NEXT: pextrd $2, %xmm1, 8(%eax) ; X86-NEXT: pextrd $1, %xmm1, 4(%eax) +; X86-NEXT: pextrd $2, %xmm1, 8(%eax) ; X86-NEXT: movd %xmm1, (%eax) ; X86-NEXT: retl $4 ; @@ -81,9 +81,9 @@ define void @add7i32(%i32vec7* sret %ret, %i32vec7* %ap, %i32vec7* %bp) { ; X86-NEXT: movdqa 16(%edx), %xmm1 ; X86-NEXT: paddd (%ecx), %xmm0 ; X86-NEXT: paddd 16(%ecx), %xmm1 -; X86-NEXT: pextrd $2, %xmm1, 24(%eax) -; X86-NEXT: pextrd $1, %xmm1, 20(%eax) ; X86-NEXT: movd %xmm1, 16(%eax) +; X86-NEXT: pextrd $1, %xmm1, 20(%eax) +; X86-NEXT: pextrd $2, %xmm1, 24(%eax) ; X86-NEXT: movdqa %xmm0, (%eax) ; X86-NEXT: retl 
$4 ; @@ -94,8 +94,8 @@ define void @add7i32(%i32vec7* sret %ret, %i32vec7* %ap, %i32vec7* %bp) { ; X64-NEXT: movdqa 16(%rsi), %xmm1 ; X64-NEXT: paddd (%rdx), %xmm0 ; X64-NEXT: paddd 16(%rdx), %xmm1 -; X64-NEXT: pextrd $2, %xmm1, 24(%rdi) ; X64-NEXT: movq %xmm1, 16(%rdi) +; X64-NEXT: pextrd $2, %xmm1, 24(%rdi) ; X64-NEXT: movdqa %xmm0, (%rdi) ; X64-NEXT: retq %a = load %i32vec7, %i32vec7* %ap, align 16 @@ -116,10 +116,10 @@ define void @add12i32(%i32vec12* sret %ret, %i32vec12* %ap, %i32vec12* %bp) { ; X86-NEXT: movdqa (%edx), %xmm1 ; X86-NEXT: movdqa 16(%edx), %xmm2 ; X86-NEXT: paddd (%ecx), %xmm1 -; X86-NEXT: paddd 16(%ecx), %xmm2 ; X86-NEXT: paddd 32(%ecx), %xmm0 -; X86-NEXT: movdqa %xmm0, 32(%eax) +; X86-NEXT: paddd 16(%ecx), %xmm2 ; X86-NEXT: movdqa %xmm2, 16(%eax) +; X86-NEXT: movdqa %xmm0, 32(%eax) ; X86-NEXT: movdqa %xmm1, (%eax) ; X86-NEXT: retl $4 ; @@ -130,10 +130,10 @@ define void @add12i32(%i32vec12* sret %ret, %i32vec12* %ap, %i32vec12* %bp) { ; X64-NEXT: movdqa 16(%rsi), %xmm1 ; X64-NEXT: movdqa 32(%rsi), %xmm2 ; X64-NEXT: paddd (%rdx), %xmm0 -; X64-NEXT: paddd 16(%rdx), %xmm1 ; X64-NEXT: paddd 32(%rdx), %xmm2 -; X64-NEXT: movdqa %xmm2, 32(%rdi) +; X64-NEXT: paddd 16(%rdx), %xmm1 ; X64-NEXT: movdqa %xmm1, 16(%rdi) +; X64-NEXT: movdqa %xmm2, 32(%rdi) ; X64-NEXT: movdqa %xmm0, (%rdi) ; X64-NEXT: retq %a = load %i32vec12, %i32vec12* %ap, align 16 @@ -225,8 +225,8 @@ define void @add12i16(%i16vec12* nocapture sret %ret, %i16vec12* %ap, %i16vec12* ; X86-NEXT: movdqa 16(%edx), %xmm1 ; X86-NEXT: paddw (%ecx), %xmm0 ; X86-NEXT: paddw 16(%ecx), %xmm1 -; X86-NEXT: pextrd $1, %xmm1, 20(%eax) ; X86-NEXT: movd %xmm1, 16(%eax) +; X86-NEXT: pextrd $1, %xmm1, 20(%eax) ; X86-NEXT: movdqa %xmm0, (%eax) ; X86-NEXT: retl $4 ; @@ -258,10 +258,10 @@ define void @add18i16(%i16vec18* nocapture sret %ret, %i16vec18* %ap, %i16vec18* ; X86-NEXT: movdqa (%edx), %xmm1 ; X86-NEXT: movdqa 16(%edx), %xmm2 ; X86-NEXT: paddw (%ecx), %xmm1 -; X86-NEXT: paddw 16(%ecx), %xmm2 ; X86-NEXT: paddw 32(%ecx), %xmm0 -; X86-NEXT: movd %xmm0, 32(%eax) +; X86-NEXT: paddw 16(%ecx), %xmm2 ; X86-NEXT: movdqa %xmm2, 16(%eax) +; X86-NEXT: movd %xmm0, 32(%eax) ; X86-NEXT: movdqa %xmm1, (%eax) ; X86-NEXT: retl $4 ; @@ -272,10 +272,10 @@ define void @add18i16(%i16vec18* nocapture sret %ret, %i16vec18* %ap, %i16vec18* ; X64-NEXT: movdqa 16(%rsi), %xmm1 ; X64-NEXT: movdqa 32(%rsi), %xmm2 ; X64-NEXT: paddw (%rdx), %xmm0 -; X64-NEXT: paddw 16(%rdx), %xmm1 ; X64-NEXT: paddw 32(%rdx), %xmm2 -; X64-NEXT: movd %xmm2, 32(%rdi) +; X64-NEXT: paddw 16(%rdx), %xmm1 ; X64-NEXT: movdqa %xmm1, 16(%rdi) +; X64-NEXT: movd %xmm2, 32(%rdi) ; X64-NEXT: movdqa %xmm0, (%rdi) ; X64-NEXT: retq %a = load %i16vec18, %i16vec18* %ap, align 16 @@ -331,11 +331,11 @@ define void @add31i8(%i8vec31* nocapture sret %ret, %i8vec31* %ap, %i8vec31* %bp ; X86-NEXT: movdqa 16(%edx), %xmm1 ; X86-NEXT: paddb (%ecx), %xmm0 ; X86-NEXT: paddb 16(%ecx), %xmm1 -; X86-NEXT: pextrb $14, %xmm1, 30(%eax) -; X86-NEXT: pextrw $6, %xmm1, 28(%eax) -; X86-NEXT: pextrd $2, %xmm1, 24(%eax) -; X86-NEXT: pextrd $1, %xmm1, 20(%eax) ; X86-NEXT: movd %xmm1, 16(%eax) +; X86-NEXT: pextrd $1, %xmm1, 20(%eax) +; X86-NEXT: pextrd $2, %xmm1, 24(%eax) +; X86-NEXT: pextrw $6, %xmm1, 28(%eax) +; X86-NEXT: pextrb $14, %xmm1, 30(%eax) ; X86-NEXT: movdqa %xmm0, (%eax) ; X86-NEXT: retl $4 ; @@ -346,10 +346,10 @@ define void @add31i8(%i8vec31* nocapture sret %ret, %i8vec31* %ap, %i8vec31* %bp ; X64-NEXT: movdqa 16(%rsi), %xmm1 ; X64-NEXT: paddb (%rdx), %xmm0 ; X64-NEXT: paddb 16(%rdx), %xmm1 -; X64-NEXT: 
pextrb $14, %xmm1, 30(%rdi) -; X64-NEXT: pextrw $6, %xmm1, 28(%rdi) -; X64-NEXT: pextrd $2, %xmm1, 24(%rdi) ; X64-NEXT: movq %xmm1, 16(%rdi) +; X64-NEXT: pextrd $2, %xmm1, 24(%rdi) +; X64-NEXT: pextrw $6, %xmm1, 28(%rdi) +; X64-NEXT: pextrb $14, %xmm1, 30(%rdi) ; X64-NEXT: movdqa %xmm0, (%rdi) ; X64-NEXT: retq %a = load %i8vec31, %i8vec31* %ap, align 16 diff --git a/test/CodeGen/X86/win64_frame.ll b/test/CodeGen/X86/win64_frame.ll index f95b04242aa..eae02dafa71 100644 --- a/test/CodeGen/X86/win64_frame.ll +++ b/test/CodeGen/X86/win64_frame.ll @@ -29,9 +29,9 @@ define void @f2(i32 %p, ...) "no-frame-pointer-elim"="true" { ; ALL-NEXT: movq %rsp, %rbp ; ALL-NEXT: .seh_setframe 5, 0 ; ALL-NEXT: .seh_endprologue -; ALL-NEXT: movq %r9, 48(%rbp) -; ALL-NEXT: movq %r8, 40(%rbp) ; ALL-NEXT: movq %rdx, 32(%rbp) +; ALL-NEXT: movq %r8, 40(%rbp) +; ALL-NEXT: movq %r9, 48(%rbp) ; ALL-NEXT: leaq 32(%rbp), %rax ; ALL-NEXT: movq %rax, (%rbp) ; ALL-NEXT: addq $8, %rsp diff --git a/test/CodeGen/X86/win64_vararg.ll b/test/CodeGen/X86/win64_vararg.ll index f0aff6f89bc..91841ced39f 100644 --- a/test/CodeGen/X86/win64_vararg.ll +++ b/test/CodeGen/X86/win64_vararg.ll @@ -6,9 +6,9 @@ define void @average_va(i32 %count, ...) nounwind { entry: ; CHECK: pushq -; CHECK: movq %r9, 40(%rsp) -; CHECK: movq %r8, 32(%rsp) -; CHECK: movq %rdx, 24(%rsp) +; CHECK-DAG: movq %r9, 40(%rsp) +; CHECK-DAG: movq %r8, 32(%rsp) +; CHECK-DAG: movq %rdx, 24(%rsp) ; CHECK: leaq 24(%rsp), %rax %ap = alloca i8*, align 8 ; [#uses=1] diff --git a/test/CodeGen/X86/x86-64-ms_abi-vararg.ll b/test/CodeGen/X86/x86-64-ms_abi-vararg.ll index e3387a2709c..016f18cb381 100644 --- a/test/CodeGen/X86/x86-64-ms_abi-vararg.ll +++ b/test/CodeGen/X86/x86-64-ms_abi-vararg.ll @@ -6,9 +6,9 @@ define win64cc void @average_va(i32 %count, ...) nounwind { entry: ; CHECK: pushq -; CHECK: movq %r9, 40(%rsp) -; CHECK: movq %r8, 32(%rsp) -; CHECK: movq %rdx, 24(%rsp) +; CHECK-DAG: movq %r9, 40(%rsp) +; CHECK-DAG: movq %r8, 32(%rsp) +; CHECK-DAG: movq %rdx, 24(%rsp) ; CHECK: leaq 24(%rsp), %rax %ap = alloca i8*, align 8 ; [#uses=1] @@ -59,8 +59,8 @@ entry: ; CHECK-LABEL: copy1: ; CHECK: leaq 32(%rsp), [[REG_copy1:%[a-z]+]] -; CHECK: movq [[REG_copy1]], 8(%rsp) -; CHECK: movq [[REG_copy1]], (%rsp) +; CHECK-DAG: movq [[REG_copy1]], 8(%rsp) +; CHECK-DAG: movq [[REG_copy1]], (%rsp) ; CHECK: ret define win64cc void @copy1(i64 %a0, ...) nounwind { entry: diff --git a/test/CodeGen/XCore/byVal.ll b/test/CodeGen/XCore/byVal.ll index 2c2a6e251d6..fde63f41e0d 100644 --- a/test/CodeGen/XCore/byVal.ll +++ b/test/CodeGen/XCore/byVal.ll @@ -39,8 +39,8 @@ entry: ; CHECK: extsp 4 ; CHECK: stw lr, sp[1] ; CHECK: mov r11, r1 -; CHECK: stw r2, sp[3] -; CHECK: stw r3, sp[4] +; CHECK-DAG: stw r2, sp[3] +; CHECK-DAG: stw r3, sp[4] ; CHECK: ldw r0, r0[0] ; CHECK: stw r0, sp[2] ; CHECK: ldaw r1, sp[2] -- 2.40.0