From 3cef9810b2ae6dd91c1cf211b47c8d838783196f Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Sun, 3 Sep 2017 17:52:25 +0000 Subject: [PATCH] [X86] Add patterns to turn an insert into lower subvector of a zero vector into a move instruction which will implicitly zero the upper elements. Ideally we'd be able to emit the SUBREG_TO_REG without the explicit register->register move, but we'd need to be sure the producing operation would select something that guaranteed the upper bits were already zeroed. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@312450 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Target/X86/X86InstrAVX512.td | 181 ++++++++++++++++++ lib/Target/X86/X86InstrSSE.td | 45 +++++ test/CodeGen/X86/avx-intrinsics-fast-isel.ll | 24 +-- .../X86/avx512-intrinsics-fast-isel.ll | 58 ++---- test/CodeGen/X86/compress_expand.ll | 3 +- test/CodeGen/X86/madd.ll | 41 ++-- test/CodeGen/X86/masked_gather_scatter.ll | 58 ++---- .../X86/merge-consecutive-loads-256.ll | 30 +-- .../X86/merge-consecutive-loads-512.ll | 18 +- .../X86/vector-shuffle-variable-256.ll | 51 ++--- 10 files changed, 323 insertions(+), 186 deletions(-) diff --git a/lib/Target/X86/X86InstrAVX512.td b/lib/Target/X86/X86InstrAVX512.td index b208f816800..b1778329fba 100644 --- a/lib/Target/X86/X86InstrAVX512.td +++ b/lib/Target/X86/X86InstrAVX512.td @@ -3903,6 +3903,187 @@ let Predicates = [HasVLX] in { def : Pat<(store (v32i8 (extract_subvector (v64i8 VR512:$src), (iPTR 0))), addr:$dst), (VMOVDQU32Z256mr addr:$dst, (v32i8 (EXTRACT_SUBREG VR512:$src,sub_ymm)))>; + + // If we're inserting into an all zeros vector, just use a plain move which + // will zero the upper bits. + // TODO: Is there a safe way to detect whether the producing instruction + // already zeroed the upper bits? + + // 128->256 register form. + def : Pat<(v4f64 (insert_subvector (bitconvert (v8i32 immAllZerosV)), + (v2f64 VR128:$src), (iPTR 0))), + (SUBREG_TO_REG (i64 0), (VMOVAPDZ128rr VR128:$src), sub_xmm)>; + def : Pat<(v8f32 (insert_subvector (bitconvert (v8i32 immAllZerosV)), + (v4f32 VR128:$src), (iPTR 0))), + (SUBREG_TO_REG (i64 0), (VMOVAPSZ128rr VR128:$src), sub_xmm)>; + def : Pat<(v4i64 (insert_subvector (bitconvert (v8i32 immAllZerosV)), + (v2i64 VR128:$src), (iPTR 0))), + (SUBREG_TO_REG (i64 0), (VMOVDQA64Z128rr VR128:$src), sub_xmm)>; + def : Pat<(v8i32 (insert_subvector (bitconvert (v8i32 immAllZerosV)), + (v4i32 VR128:$src), (iPTR 0))), + (SUBREG_TO_REG (i64 0), (VMOVDQA64Z128rr VR128:$src), sub_xmm)>; + def : Pat<(v16i16 (insert_subvector (bitconvert (v8i32 immAllZerosV)), + (v8i16 VR128:$src), (iPTR 0))), + (SUBREG_TO_REG (i64 0), (VMOVDQA64Z128rr VR128:$src), sub_xmm)>; + def : Pat<(v32i8 (insert_subvector (bitconvert (v8i32 immAllZerosV)), + (v16i8 VR128:$src), (iPTR 0))), + (SUBREG_TO_REG (i64 0), (VMOVDQA64Z128rr VR128:$src), sub_xmm)>; + + // 128->256 memory form. 
+ def : Pat<(v4f64 (insert_subvector (bitconvert (v8i32 immAllZerosV)), + (loadv2f64 addr:$src), (iPTR 0))), + (SUBREG_TO_REG (i64 0), (VMOVAPDZ128rm addr:$src), sub_xmm)>; + def : Pat<(v8f32 (insert_subvector (bitconvert (v8i32 immAllZerosV)), + (loadv4f32 addr:$src), (iPTR 0))), + (SUBREG_TO_REG (i64 0), (VMOVAPSZ128rm addr:$src), sub_xmm)>; + def : Pat<(v4i64 (insert_subvector (bitconvert (v8i32 immAllZerosV)), + (loadv2i64 addr:$src), (iPTR 0))), + (SUBREG_TO_REG (i64 0), (VMOVDQA64Z128rm addr:$src), sub_xmm)>; + def : Pat<(v8i32 (insert_subvector (bitconvert (v8i32 immAllZerosV)), + (bc_v4i32 (loadv2i64 addr:$src)), + (iPTR 0))), + (SUBREG_TO_REG (i64 0), (VMOVDQA64Z128rm addr:$src), sub_xmm)>; + def : Pat<(v16i16 (insert_subvector (bitconvert (v8i32 immAllZerosV)), + (bc_v8i16 (loadv2i64 addr:$src)), + (iPTR 0))), + (SUBREG_TO_REG (i64 0), (VMOVDQA64Z128rm addr:$src), sub_xmm)>; + def : Pat<(v32i8 (insert_subvector (bitconvert (v8i32 immAllZerosV)), + (bc_v16i8 (loadv2i64 addr:$src)), + (iPTR 0))), + (SUBREG_TO_REG (i64 0), (VMOVDQA64Z128rm addr:$src), sub_xmm)>; + + // 128->512 register form. + def : Pat<(v8f64 (insert_subvector (bitconvert (v16i32 immAllZerosV)), + (v2f64 VR128X:$src), (iPTR 0))), + (SUBREG_TO_REG (i64 0), (VMOVAPDZ128rr VR128X:$src), sub_xmm)>; + def : Pat<(v16f32 (insert_subvector (bitconvert (v16i32 immAllZerosV)), + (v4f32 VR128X:$src), (iPTR 0))), + (SUBREG_TO_REG (i64 0), (VMOVAPSZ128rr VR128X:$src), sub_xmm)>; + def : Pat<(v8i64 (insert_subvector (bitconvert (v16i32 immAllZerosV)), + (v2i64 VR128X:$src), (iPTR 0))), + (SUBREG_TO_REG (i64 0), (VMOVDQA64Z128rr VR128X:$src), sub_xmm)>; + def : Pat<(v16i32 (insert_subvector (bitconvert (v16i32 immAllZerosV)), + (v4i32 VR128X:$src), (iPTR 0))), + (SUBREG_TO_REG (i64 0), (VMOVDQA64Z128rr VR128X:$src), sub_xmm)>; + def : Pat<(v32i16 (insert_subvector (bitconvert (v16i32 immAllZerosV)), + (v8i16 VR128X:$src), (iPTR 0))), + (SUBREG_TO_REG (i64 0), (VMOVDQA64Z128rr VR128X:$src), sub_xmm)>; + def : Pat<(v64i8 (insert_subvector (bitconvert (v16i32 immAllZerosV)), + (v16i8 VR128X:$src), (iPTR 0))), + (SUBREG_TO_REG (i64 0), (VMOVDQA64Z128rr VR128X:$src), sub_xmm)>; + + // 128->512 memory form. + def : Pat<(v8f64 (insert_subvector (bitconvert (v16i32 immAllZerosV)), + (loadv2f64 addr:$src), (iPTR 0))), + (SUBREG_TO_REG (i64 0), (VMOVAPDZ128rm addr:$src), sub_xmm)>; + def : Pat<(v16f32 (insert_subvector (bitconvert (v16i32 immAllZerosV)), + (loadv4f32 addr:$src), (iPTR 0))), + (SUBREG_TO_REG (i64 0), (VMOVAPSZ128rm addr:$src), sub_xmm)>; + def : Pat<(v8i64 (insert_subvector (bitconvert (v16i32 immAllZerosV)), + (loadv2i64 addr:$src), (iPTR 0))), + (SUBREG_TO_REG (i64 0), (VMOVDQA64Z128rm addr:$src), sub_xmm)>; + def : Pat<(v16i32 (insert_subvector (bitconvert (v16i32 immAllZerosV)), + (bc_v4i32 (loadv2i64 addr:$src)), + (iPTR 0))), + (SUBREG_TO_REG (i64 0), (VMOVDQA64Z128rm addr:$src), sub_xmm)>; + def : Pat<(v32i16 (insert_subvector (bitconvert (v16i32 immAllZerosV)), + (bc_v8i16 (loadv2i64 addr:$src)), + (iPTR 0))), + (SUBREG_TO_REG (i64 0), (VMOVDQA64Z128rm addr:$src), sub_xmm)>; + def : Pat<(v64i8 (insert_subvector (bitconvert (v16i32 immAllZerosV)), + (bc_v16i8 (loadv2i64 addr:$src)), + (iPTR 0))), + (SUBREG_TO_REG (i64 0), (VMOVDQA64Z128rm addr:$src), sub_xmm)>; + + // 256->512 register form. 
+ def : Pat<(v8f64 (insert_subvector (bitconvert (v16i32 immAllZerosV)), + (v4f64 VR256X:$src), (iPTR 0))), + (SUBREG_TO_REG (i64 0), (VMOVAPDZ256rr VR256X:$src), sub_ymm)>; + def : Pat<(v16f32 (insert_subvector (bitconvert (v16i32 immAllZerosV)), + (v8f32 VR256X:$src), (iPTR 0))), + (SUBREG_TO_REG (i64 0), (VMOVAPSZ256rr VR256X:$src), sub_ymm)>; + def : Pat<(v8i64 (insert_subvector (bitconvert (v16i32 immAllZerosV)), + (v4i64 VR256X:$src), (iPTR 0))), + (SUBREG_TO_REG (i64 0), (VMOVDQA64Z256rr VR256X:$src), sub_ymm)>; + def : Pat<(v16i32 (insert_subvector (bitconvert (v16i32 immAllZerosV)), + (v8i32 VR256X:$src), (iPTR 0))), + (SUBREG_TO_REG (i64 0), (VMOVDQA64Z256rr VR256X:$src), sub_ymm)>; + def : Pat<(v32i16 (insert_subvector (bitconvert (v16i32 immAllZerosV)), + (v16i16 VR256X:$src), (iPTR 0))), + (SUBREG_TO_REG (i64 0), (VMOVDQA64Z256rr VR256X:$src), sub_ymm)>; + def : Pat<(v64i8 (insert_subvector (bitconvert (v16i32 immAllZerosV)), + (v32i8 VR256X:$src), (iPTR 0))), + (SUBREG_TO_REG (i64 0), (VMOVDQA64Z256rr VR256X:$src), sub_ymm)>; + + // 256->512 memory form. + def : Pat<(v8f64 (insert_subvector (bitconvert (v16i32 immAllZerosV)), + (loadv4f64 addr:$src), (iPTR 0))), + (SUBREG_TO_REG (i64 0), (VMOVAPDZ256rm addr:$src), sub_ymm)>; + def : Pat<(v16f32 (insert_subvector (bitconvert (v16i32 immAllZerosV)), + (loadv8f32 addr:$src), (iPTR 0))), + (SUBREG_TO_REG (i64 0), (VMOVAPSZ256rm addr:$src), sub_ymm)>; + def : Pat<(v8i64 (insert_subvector (bitconvert (v16i32 immAllZerosV)), + (loadv4i64 addr:$src), (iPTR 0))), + (SUBREG_TO_REG (i64 0), (VMOVDQA64Z256rm addr:$src), sub_ymm)>; + def : Pat<(v16i32 (insert_subvector (bitconvert (v16i32 immAllZerosV)), + (bc_v8i32 (loadv4i64 addr:$src)), + (iPTR 0))), + (SUBREG_TO_REG (i64 0), (VMOVDQA64Z256rm addr:$src), sub_ymm)>; + def : Pat<(v32i16 (insert_subvector (bitconvert (v16i32 immAllZerosV)), + (bc_v16i16 (loadv4i64 addr:$src)), + (iPTR 0))), + (SUBREG_TO_REG (i64 0), (VMOVDQA64Z256rm addr:$src), sub_ymm)>; + def : Pat<(v64i8 (insert_subvector (bitconvert (v16i32 immAllZerosV)), + (bc_v32i8 (loadv4i64 addr:$src)), + (iPTR 0))), + (SUBREG_TO_REG (i64 0), (VMOVDQA64Z256rm addr:$src), sub_ymm)>; +} + +let Predicates = [HasAVX512, NoVLX] in { + // If we're inserting into an all zeros vector, just use a plain move which + // will zero the upper bits. + // TODO: Is there a safe way to detect whether the producing instruction + // already zeroed the upper bits? 
+ def : Pat<(v8f64 (insert_subvector (bitconvert (v16i32 immAllZerosV)), + (v4f64 VR256:$src), (iPTR 0))), + (SUBREG_TO_REG (i64 0), (VMOVAPDYrr VR256:$src), sub_ymm)>; + def : Pat<(v16f32 (insert_subvector (bitconvert (v16i32 immAllZerosV)), + (v8f32 VR256:$src), (iPTR 0))), + (SUBREG_TO_REG (i64 0), (VMOVAPSYrr VR256:$src), sub_ymm)>; + def : Pat<(v8i64 (insert_subvector (bitconvert (v16i32 immAllZerosV)), + (v4i64 VR256:$src), (iPTR 0))), + (SUBREG_TO_REG (i64 0), (VMOVDQAYrr VR256:$src), sub_ymm)>; + def : Pat<(v16i32 (insert_subvector (bitconvert (v16i32 immAllZerosV)), + (v8i32 VR256:$src), (iPTR 0))), + (SUBREG_TO_REG (i64 0), (VMOVDQAYrr VR256:$src), sub_ymm)>; + def : Pat<(v32i16 (insert_subvector (bitconvert (v16i32 immAllZerosV)), + (v16i16 VR256:$src), (iPTR 0))), + (SUBREG_TO_REG (i64 0), (VMOVDQAYrr VR256:$src), sub_ymm)>; + def : Pat<(v64i8 (insert_subvector (bitconvert (v16i32 immAllZerosV)), + (v32i8 VR256:$src), (iPTR 0))), + (SUBREG_TO_REG (i64 0), (VMOVDQAYrr VR256:$src), sub_ymm)>; + + def : Pat<(v8f64 (insert_subvector (bitconvert (v16i32 immAllZerosV)), + (loadv4f64 addr:$src), (iPTR 0))), + (SUBREG_TO_REG (i64 0), (VMOVAPDYrm addr:$src), sub_ymm)>; + def : Pat<(v16f32 (insert_subvector (bitconvert (v16i32 immAllZerosV)), + (loadv8f32 addr:$src), (iPTR 0))), + (SUBREG_TO_REG (i64 0), (VMOVAPSYrm addr:$src), sub_ymm)>; + def : Pat<(v8i64 (insert_subvector (bitconvert (v16i32 immAllZerosV)), + (loadv4i64 addr:$src), (iPTR 0))), + (SUBREG_TO_REG (i64 0), (VMOVDQAYrm addr:$src), sub_ymm)>; + def : Pat<(v16i32 (insert_subvector (bitconvert (v16i32 immAllZerosV)), + (bc_v8i32 (loadv4i64 addr:$src)), + (iPTR 0))), + (SUBREG_TO_REG (i64 0), (VMOVDQAYrm addr:$src), sub_ymm)>; + def : Pat<(v32i16 (insert_subvector (bitconvert (v16i32 immAllZerosV)), + (bc_v16i16 (loadv4i64 addr:$src)), + (iPTR 0))), + (SUBREG_TO_REG (i64 0), (VMOVDQAYrm addr:$src), sub_ymm)>; + def : Pat<(v64i8 (insert_subvector (bitconvert (v16i32 immAllZerosV)), + (bc_v32i8 (loadv4i64 addr:$src)), + (iPTR 0))), + (SUBREG_TO_REG (i64 0), (VMOVDQAYrm addr:$src), sub_ymm)>; } multiclass masked_move_for_extract; + + // If we're inserting into an all zeros vector, just use a plain move which + // will zero the upper bits. + // TODO: Is there a safe way to detect whether the producing instruction + // already zeroed the upper bits? 
+ def : Pat<(v4f64 (insert_subvector (bitconvert (v8i32 immAllZerosV)), + (v2f64 VR128:$src), (iPTR 0))), + (SUBREG_TO_REG (i64 0), (VMOVAPDrr VR128:$src), sub_xmm)>; + def : Pat<(v8f32 (insert_subvector (bitconvert (v8i32 immAllZerosV)), + (v4f32 VR128:$src), (iPTR 0))), + (SUBREG_TO_REG (i64 0), (VMOVAPSrr VR128:$src), sub_xmm)>; + def : Pat<(v4i64 (insert_subvector (bitconvert (v8i32 immAllZerosV)), + (v2i64 VR128:$src), (iPTR 0))), + (SUBREG_TO_REG (i64 0), (VMOVDQArr VR128:$src), sub_xmm)>; + def : Pat<(v8i32 (insert_subvector (bitconvert (v8i32 immAllZerosV)), + (v4i32 VR128:$src), (iPTR 0))), + (SUBREG_TO_REG (i64 0), (VMOVDQArr VR128:$src), sub_xmm)>; + def : Pat<(v16i16 (insert_subvector (bitconvert (v8i32 immAllZerosV)), + (v8i16 VR128:$src), (iPTR 0))), + (SUBREG_TO_REG (i64 0), (VMOVDQArr VR128:$src), sub_xmm)>; + def : Pat<(v32i8 (insert_subvector (bitconvert (v8i32 immAllZerosV)), + (v16i8 VR128:$src), (iPTR 0))), + (SUBREG_TO_REG (i64 0), (VMOVDQArr VR128:$src), sub_xmm)>; + + def : Pat<(v4f64 (insert_subvector (bitconvert (v8i32 immAllZerosV)), + (loadv2f64 addr:$src), (iPTR 0))), + (SUBREG_TO_REG (i64 0), (VMOVAPDrm addr:$src), sub_xmm)>; + def : Pat<(v8f32 (insert_subvector (bitconvert (v8i32 immAllZerosV)), + (loadv4f32 addr:$src), (iPTR 0))), + (SUBREG_TO_REG (i64 0), (VMOVAPSrm addr:$src), sub_xmm)>; + def : Pat<(v4i64 (insert_subvector (bitconvert (v8i32 immAllZerosV)), + (loadv2i64 addr:$src), (iPTR 0))), + (SUBREG_TO_REG (i64 0), (VMOVDQArm addr:$src), sub_xmm)>; + def : Pat<(v8i32 (insert_subvector (bitconvert (v8i32 immAllZerosV)), + (bc_v4i32 (loadv2i64 addr:$src)), + (iPTR 0))), + (SUBREG_TO_REG (i64 0), (VMOVDQArm addr:$src), sub_xmm)>; + def : Pat<(v16i16 (insert_subvector (bitconvert (v8i32 immAllZerosV)), + (bc_v8i16 (loadv2i64 addr:$src)), + (iPTR 0))), + (SUBREG_TO_REG (i64 0), (VMOVDQArm addr:$src), sub_xmm)>; + def : Pat<(v32i8 (insert_subvector (bitconvert (v8i32 immAllZerosV)), + (bc_v16i8 (loadv2i64 addr:$src)), + (iPTR 0))), + (SUBREG_TO_REG (i64 0), (VMOVDQArm addr:$src), sub_xmm)>; } //===---------------------------------------------------------------------===// diff --git a/test/CodeGen/X86/avx-intrinsics-fast-isel.ll b/test/CodeGen/X86/avx-intrinsics-fast-isel.ll index 082061c099c..2f33cf78de1 100644 --- a/test/CodeGen/X86/avx-intrinsics-fast-isel.ll +++ b/test/CodeGen/X86/avx-intrinsics-fast-isel.ll @@ -3777,16 +3777,12 @@ declare void @llvm.x86.avx.vzeroupper() nounwind readnone define <4 x double> @test_mm256_zextpd128_pd256(<2 x double> %a0) nounwind { ; X32-LABEL: test_mm256_zextpd128_pd256: ; X32: # BB#0: -; X32-NEXT: # kill: %XMM0 %XMM0 %YMM0 -; X32-NEXT: vxorpd %xmm1, %xmm1, %xmm1 -; X32-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3] +; X32-NEXT: vmovaps %xmm0, %xmm0 ; X32-NEXT: retl ; ; X64-LABEL: test_mm256_zextpd128_pd256: ; X64: # BB#0: -; X64-NEXT: # kill: %XMM0 %XMM0 %YMM0 -; X64-NEXT: vxorpd %xmm1, %xmm1, %xmm1 -; X64-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3] +; X64-NEXT: vmovaps %xmm0, %xmm0 ; X64-NEXT: retq %res = shufflevector <2 x double> %a0, <2 x double> zeroinitializer, <4 x i32> ret <4 x double> %res @@ -3795,16 +3791,12 @@ define <4 x double> @test_mm256_zextpd128_pd256(<2 x double> %a0) nounwind { define <8 x float> @test_mm256_zextps128_ps256(<4 x float> %a0) nounwind { ; X32-LABEL: test_mm256_zextps128_ps256: ; X32: # BB#0: -; X32-NEXT: # kill: %XMM0 %XMM0 %YMM0 -; X32-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; X32-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; X32-NEXT: vmovaps %xmm0, %xmm0 ; 
X32-NEXT: retl ; ; X64-LABEL: test_mm256_zextps128_ps256: ; X64: # BB#0: -; X64-NEXT: # kill: %XMM0 %XMM0 %YMM0 -; X64-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; X64-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; X64-NEXT: vmovaps %xmm0, %xmm0 ; X64-NEXT: retq %res = shufflevector <4 x float> %a0, <4 x float> zeroinitializer, <8 x i32> ret <8 x float> %res @@ -3813,16 +3805,12 @@ define <8 x float> @test_mm256_zextps128_ps256(<4 x float> %a0) nounwind { define <4 x i64> @test_mm256_zextsi128_si256(<2 x i64> %a0) nounwind { ; X32-LABEL: test_mm256_zextsi128_si256: ; X32: # BB#0: -; X32-NEXT: # kill: %XMM0 %XMM0 %YMM0 -; X32-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; X32-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; X32-NEXT: vmovaps %xmm0, %xmm0 ; X32-NEXT: retl ; ; X64-LABEL: test_mm256_zextsi128_si256: ; X64: # BB#0: -; X64-NEXT: # kill: %XMM0 %XMM0 %YMM0 -; X64-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; X64-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; X64-NEXT: vmovaps %xmm0, %xmm0 ; X64-NEXT: retq %res = shufflevector <2 x i64> %a0, <2 x i64> zeroinitializer, <4 x i32> ret <4 x i64> %res diff --git a/test/CodeGen/X86/avx512-intrinsics-fast-isel.ll b/test/CodeGen/X86/avx512-intrinsics-fast-isel.ll index d599c479ee2..18afae90ad9 100644 --- a/test/CodeGen/X86/avx512-intrinsics-fast-isel.ll +++ b/test/CodeGen/X86/avx512-intrinsics-fast-isel.ll @@ -1133,21 +1133,17 @@ define <16 x float> @test_mm512_maskz_unpacklo_ps(i16 %a0, <16 x float> %a1, <16 define <8 x double> @test_mm512_zextpd128_pd512(<2 x double> %a0) nounwind { ; X32-LABEL: test_mm512_zextpd128_pd512: ; X32: # BB#0: -; X32-NEXT: # kill: %XMM0 %XMM0 %YMM0 -; X32-NEXT: vxorpd %xmm1, %xmm1, %xmm1 -; X32-NEXT: vxorpd %xmm2, %xmm2, %xmm2 -; X32-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3] -; X32-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3] +; X32-NEXT: vmovaps %xmm0, %xmm0 +; X32-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; X32-NEXT: vmovaps %xmm1, %xmm1 ; X32-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0 ; X32-NEXT: retl ; ; X64-LABEL: test_mm512_zextpd128_pd512: ; X64: # BB#0: -; X64-NEXT: # kill: %XMM0 %XMM0 %YMM0 -; X64-NEXT: vxorpd %xmm1, %xmm1, %xmm1 -; X64-NEXT: vxorpd %xmm2, %xmm2, %xmm2 -; X64-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3] -; X64-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3] +; X64-NEXT: vmovaps %xmm0, %xmm0 +; X64-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; X64-NEXT: vmovaps %xmm1, %xmm1 ; X64-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0 ; X64-NEXT: retq %res = shufflevector <2 x double> %a0, <2 x double> zeroinitializer, <8 x i32> @@ -1157,14 +1153,12 @@ define <8 x double> @test_mm512_zextpd128_pd512(<2 x double> %a0) nounwind { define <8 x double> @test_mm512_zextpd256_pd512(<4 x double> %a0) nounwind { ; X32-LABEL: test_mm512_zextpd256_pd512: ; X32: # BB#0: -; X32-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; X32-NEXT: vinsertf64x4 $0, %ymm0, %zmm1, %zmm0 +; X32-NEXT: vmovaps %ymm0, %ymm0 ; X32-NEXT: retl ; ; X64-LABEL: test_mm512_zextpd256_pd512: ; X64: # BB#0: -; X64-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; X64-NEXT: vinsertf64x4 $0, %ymm0, %zmm1, %zmm0 +; X64-NEXT: vmovaps %ymm0, %ymm0 ; X64-NEXT: retq %res = shufflevector <4 x double> %a0, <4 x double> zeroinitializer, <8 x i32> ret <8 x double> %res @@ -1173,21 +1167,17 @@ define <8 x double> @test_mm512_zextpd256_pd512(<4 x double> %a0) nounwind { define <16 x float> @test_mm512_zextps128_ps512(<4 x float> %a0) nounwind { ; X32-LABEL: test_mm512_zextps128_ps512: ; X32: # BB#0: -; X32-NEXT: # kill: %XMM0 %XMM0 %YMM0 +; X32-NEXT: vmovaps 
%xmm0, %xmm0 ; X32-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; X32-NEXT: vxorps %xmm2, %xmm2, %xmm2 -; X32-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] -; X32-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] +; X32-NEXT: vmovaps %xmm1, %xmm1 ; X32-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0 ; X32-NEXT: retl ; ; X64-LABEL: test_mm512_zextps128_ps512: ; X64: # BB#0: -; X64-NEXT: # kill: %XMM0 %XMM0 %YMM0 +; X64-NEXT: vmovaps %xmm0, %xmm0 ; X64-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; X64-NEXT: vxorps %xmm2, %xmm2, %xmm2 -; X64-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] -; X64-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] +; X64-NEXT: vmovaps %xmm1, %xmm1 ; X64-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0 ; X64-NEXT: retq %res = shufflevector <4 x float> %a0, <4 x float> zeroinitializer, <16 x i32> @@ -1197,14 +1187,12 @@ define <16 x float> @test_mm512_zextps128_ps512(<4 x float> %a0) nounwind { define <16 x float> @test_mm512_zextps256_ps512(<8 x float> %a0) nounwind { ; X32-LABEL: test_mm512_zextps256_ps512: ; X32: # BB#0: -; X32-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; X32-NEXT: vinsertf64x4 $0, %ymm0, %zmm1, %zmm0 +; X32-NEXT: vmovaps %ymm0, %ymm0 ; X32-NEXT: retl ; ; X64-LABEL: test_mm512_zextps256_ps512: ; X64: # BB#0: -; X64-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; X64-NEXT: vinsertf64x4 $0, %ymm0, %zmm1, %zmm0 +; X64-NEXT: vmovaps %ymm0, %ymm0 ; X64-NEXT: retq %res = shufflevector <8 x float> %a0, <8 x float> zeroinitializer, <16 x i32> ret <16 x float> %res @@ -1213,21 +1201,17 @@ define <16 x float> @test_mm512_zextps256_ps512(<8 x float> %a0) nounwind { define <8 x i64> @test_mm512_zextsi128_si512(<2 x i64> %a0) nounwind { ; X32-LABEL: test_mm512_zextsi128_si512: ; X32: # BB#0: -; X32-NEXT: # kill: %XMM0 %XMM0 %YMM0 +; X32-NEXT: vmovaps %xmm0, %xmm0 ; X32-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; X32-NEXT: vxorps %xmm2, %xmm2, %xmm2 -; X32-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] -; X32-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] +; X32-NEXT: vmovaps %xmm1, %xmm1 ; X32-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0 ; X32-NEXT: retl ; ; X64-LABEL: test_mm512_zextsi128_si512: ; X64: # BB#0: -; X64-NEXT: # kill: %XMM0 %XMM0 %YMM0 +; X64-NEXT: vmovaps %xmm0, %xmm0 ; X64-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; X64-NEXT: vxorps %xmm2, %xmm2, %xmm2 -; X64-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] -; X64-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] +; X64-NEXT: vmovaps %xmm1, %xmm1 ; X64-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0 ; X64-NEXT: retq %res = shufflevector <2 x i64> %a0, <2 x i64> zeroinitializer, <8 x i32> @@ -1237,14 +1221,12 @@ define <8 x i64> @test_mm512_zextsi128_si512(<2 x i64> %a0) nounwind { define <8 x i64> @test_mm512_zextsi256_si512(<4 x i64> %a0) nounwind { ; X32-LABEL: test_mm512_zextsi256_si512: ; X32: # BB#0: -; X32-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; X32-NEXT: vinsertf64x4 $0, %ymm0, %zmm1, %zmm0 +; X32-NEXT: vmovaps %ymm0, %ymm0 ; X32-NEXT: retl ; ; X64-LABEL: test_mm512_zextsi256_si512: ; X64: # BB#0: -; X64-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; X64-NEXT: vinsertf64x4 $0, %ymm0, %zmm1, %zmm0 +; X64-NEXT: vmovaps %ymm0, %ymm0 ; X64-NEXT: retq %res = shufflevector <4 x i64> %a0, <4 x i64> zeroinitializer, <8 x i32> ret <8 x i64> %res diff --git a/test/CodeGen/X86/compress_expand.ll b/test/CodeGen/X86/compress_expand.ll index 746d5169f7b..81d36f2f54a 100644 --- a/test/CodeGen/X86/compress_expand.ll +++ b/test/CodeGen/X86/compress_expand.ll @@ -204,8 +204,7 @@ define void @test10(i64* 
%base, <4 x i64> %V, <4 x i1> %mask) { ; KNL-NEXT: vpslld $31, %xmm1, %xmm1 ; KNL-NEXT: vpsrad $31, %xmm1, %xmm1 ; KNL-NEXT: vpmovsxdq %xmm1, %ymm1 -; KNL-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; KNL-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 +; KNL-NEXT: vmovdqa %ymm1, %ymm1 ; KNL-NEXT: vpsllq $63, %zmm1, %zmm1 ; KNL-NEXT: vptestmq %zmm1, %zmm1, %k1 ; KNL-NEXT: vpcompressq %zmm0, (%rdi) {%k1} diff --git a/test/CodeGen/X86/madd.ll b/test/CodeGen/X86/madd.ll index 0523fa78699..d378727630c 100644 --- a/test/CodeGen/X86/madd.ll +++ b/test/CodeGen/X86/madd.ll @@ -35,20 +35,19 @@ define i32 @_Z10test_shortPsS_i(i16* nocapture readonly, i16* nocapture readonly ; AVX2-NEXT: movl %edx, %eax ; AVX2-NEXT: vpxor %xmm0, %xmm0, %xmm0 ; AVX2-NEXT: xorl %ecx, %ecx -; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: .p2align 4, 0x90 ; AVX2-NEXT: .LBB0_1: # %vector.body ; AVX2-NEXT: # =>This Inner Loop Header: Depth=1 -; AVX2-NEXT: vmovdqu (%rsi,%rcx,2), %xmm2 -; AVX2-NEXT: vpmaddwd (%rdi,%rcx,2), %xmm2, %xmm2 -; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm0[4,5,6,7] -; AVX2-NEXT: vpaddd %ymm1, %ymm2, %ymm1 +; AVX2-NEXT: vmovdqu (%rsi,%rcx,2), %xmm1 +; AVX2-NEXT: vpmaddwd (%rdi,%rcx,2), %xmm1, %xmm1 +; AVX2-NEXT: vmovdqa %xmm1, %xmm1 +; AVX2-NEXT: vpaddd %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: addq $8, %rcx ; AVX2-NEXT: cmpq %rcx, %rax ; AVX2-NEXT: jne .LBB0_1 ; AVX2-NEXT: # BB#2: # %middle.block -; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm0 -; AVX2-NEXT: vpaddd %ymm0, %ymm1, %ymm0 +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] ; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vphaddd %ymm0, %ymm0, %ymm0 @@ -61,20 +60,19 @@ define i32 @_Z10test_shortPsS_i(i16* nocapture readonly, i16* nocapture readonly ; AVX512-NEXT: movl %edx, %eax ; AVX512-NEXT: vpxor %xmm0, %xmm0, %xmm0 ; AVX512-NEXT: xorl %ecx, %ecx -; AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512-NEXT: .p2align 4, 0x90 ; AVX512-NEXT: .LBB0_1: # %vector.body ; AVX512-NEXT: # =>This Inner Loop Header: Depth=1 -; AVX512-NEXT: vmovdqu (%rsi,%rcx,2), %xmm2 -; AVX512-NEXT: vpmaddwd (%rdi,%rcx,2), %xmm2, %xmm2 -; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm0[4,5,6,7] -; AVX512-NEXT: vpaddd %ymm1, %ymm2, %ymm1 +; AVX512-NEXT: vmovdqu (%rsi,%rcx,2), %xmm1 +; AVX512-NEXT: vpmaddwd (%rdi,%rcx,2), %xmm1, %xmm1 +; AVX512-NEXT: vmovdqa %xmm1, %xmm1 +; AVX512-NEXT: vpaddd %ymm0, %ymm1, %ymm0 ; AVX512-NEXT: addq $8, %rcx ; AVX512-NEXT: cmpq %rcx, %rax ; AVX512-NEXT: jne .LBB0_1 ; AVX512-NEXT: # BB#2: # %middle.block -; AVX512-NEXT: vextracti128 $1, %ymm1, %xmm0 -; AVX512-NEXT: vpaddd %ymm0, %ymm1, %ymm0 +; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] ; AVX512-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ; AVX512-NEXT: vphaddd %ymm0, %ymm0, %ymm0 @@ -310,21 +308,20 @@ define i32 @_Z9test_charPcS_i(i8* nocapture readonly, i8* nocapture readonly, i3 ; AVX512-NEXT: movl %edx, %eax ; AVX512-NEXT: vpxor %xmm0, %xmm0, %xmm0 ; AVX512-NEXT: xorl %ecx, %ecx -; AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512-NEXT: .p2align 4, 0x90 ; AVX512-NEXT: .LBB2_1: # %vector.body ; AVX512-NEXT: # =>This Inner Loop Header: Depth=1 -; AVX512-NEXT: vpmovsxbw (%rdi,%rcx), %ymm2 -; AVX512-NEXT: vpmovsxbw (%rsi,%rcx), %ymm3 -; AVX512-NEXT: vpmaddwd %ymm2, %ymm3, %ymm2 -; AVX512-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm2 -; AVX512-NEXT: vpaddd %zmm1, %zmm2, %zmm1 +; AVX512-NEXT: vpmovsxbw (%rdi,%rcx), %ymm1 +; 
AVX512-NEXT: vpmovsxbw (%rsi,%rcx), %ymm2 +; AVX512-NEXT: vpmaddwd %ymm1, %ymm2, %ymm1 +; AVX512-NEXT: vmovdqa %ymm1, %ymm1 +; AVX512-NEXT: vpaddd %zmm0, %zmm1, %zmm0 ; AVX512-NEXT: addq $16, %rcx ; AVX512-NEXT: cmpq %rcx, %rax ; AVX512-NEXT: jne .LBB2_1 ; AVX512-NEXT: # BB#2: # %middle.block -; AVX512-NEXT: vextracti64x4 $1, %zmm1, %ymm0 -; AVX512-NEXT: vpaddd %zmm0, %zmm1, %zmm0 +; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] diff --git a/test/CodeGen/X86/masked_gather_scatter.ll b/test/CodeGen/X86/masked_gather_scatter.ll index fdd94beb1c0..9e58f1dcf74 100644 --- a/test/CodeGen/X86/masked_gather_scatter.ll +++ b/test/CodeGen/X86/masked_gather_scatter.ll @@ -776,10 +776,8 @@ declare <2 x double> @llvm.masked.gather.v2f64.v2p0f64(<2 x double*>, i32, <2 x define <4 x float> @test15(float* %base, <4 x i32> %ind, <4 x i1> %mask) { ; KNL_64-LABEL: test15: ; KNL_64: # BB#0: -; KNL_64-NEXT: # kill: %XMM1 %XMM1 %YMM1 ; KNL_64-NEXT: # kill: %XMM0 %XMM0 %YMM0 -; KNL_64-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; KNL_64-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; KNL_64-NEXT: vmovdqa %xmm1, %xmm1 ; KNL_64-NEXT: vpmovsxdq %ymm0, %zmm2 ; KNL_64-NEXT: vpslld $31, %ymm1, %ymm0 ; KNL_64-NEXT: vptestmd %zmm0, %zmm0, %k1 @@ -790,10 +788,8 @@ define <4 x float> @test15(float* %base, <4 x i32> %ind, <4 x i1> %mask) { ; ; KNL_32-LABEL: test15: ; KNL_32: # BB#0: -; KNL_32-NEXT: # kill: %XMM1 %XMM1 %YMM1 ; KNL_32-NEXT: # kill: %XMM0 %XMM0 %YMM0 -; KNL_32-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; KNL_32-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; KNL_32-NEXT: vmovdqa %xmm1, %xmm1 ; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax ; KNL_32-NEXT: vpmovsxdq %ymm0, %zmm2 ; KNL_32-NEXT: vpslld $31, %ymm1, %ymm0 @@ -835,8 +831,7 @@ define <4 x double> @test16(double* %base, <4 x i32> %ind, <4 x i1> %mask, <4 x ; KNL_64-NEXT: vpslld $31, %xmm1, %xmm1 ; KNL_64-NEXT: vpsrad $31, %xmm1, %xmm1 ; KNL_64-NEXT: vpmovsxdq %xmm1, %ymm1 -; KNL_64-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; KNL_64-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 +; KNL_64-NEXT: vmovdqa %ymm1, %ymm1 ; KNL_64-NEXT: vpmovsxdq %ymm0, %zmm0 ; KNL_64-NEXT: vpsllq $63, %zmm1, %zmm1 ; KNL_64-NEXT: vptestmq %zmm1, %zmm1, %k1 @@ -851,8 +846,7 @@ define <4 x double> @test16(double* %base, <4 x i32> %ind, <4 x i1> %mask, <4 x ; KNL_32-NEXT: vpslld $31, %xmm1, %xmm1 ; KNL_32-NEXT: vpsrad $31, %xmm1, %xmm1 ; KNL_32-NEXT: vpmovsxdq %xmm1, %ymm1 -; KNL_32-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; KNL_32-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 +; KNL_32-NEXT: vmovdqa %ymm1, %ymm1 ; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax ; KNL_32-NEXT: vpmovsxdq %ymm0, %zmm0 ; KNL_32-NEXT: vpsllq $63, %zmm1, %zmm1 @@ -944,11 +938,9 @@ declare void @llvm.masked.scatter.v2f32.v2p0f32(<2 x float> , <2 x float*> , i32 define void @test18(<4 x i32>%a1, <4 x i32*> %ptr, <4 x i1>%mask) { ; KNL_64-LABEL: test18: ; KNL_64: # BB#0: -; KNL_64-NEXT: # kill: %XMM2 %XMM2 %YMM2 ; KNL_64-NEXT: # kill: %YMM1 %YMM1 %ZMM1 ; KNL_64-NEXT: # kill: %XMM0 %XMM0 %YMM0 -; KNL_64-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; KNL_64-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] +; KNL_64-NEXT: vmovdqa %xmm2, %xmm2 ; KNL_64-NEXT: vpslld $31, %ymm2, %ymm2 ; KNL_64-NEXT: vptestmd %zmm2, %zmm2, %k1 ; KNL_64-NEXT: vpscatterqd %ymm0, (,%zmm1) {%k1} @@ -957,11 +949,9 @@ define void @test18(<4 x i32>%a1, <4 x i32*> %ptr, <4 x i1>%mask) { ; 
; KNL_32-LABEL: test18: ; KNL_32: # BB#0: -; KNL_32-NEXT: # kill: %XMM2 %XMM2 %YMM2 ; KNL_32-NEXT: # kill: %XMM1 %XMM1 %YMM1 ; KNL_32-NEXT: # kill: %XMM0 %XMM0 %YMM0 -; KNL_32-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; KNL_32-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] +; KNL_32-NEXT: vmovdqa %xmm2, %xmm2 ; KNL_32-NEXT: vpmovsxdq %ymm1, %zmm1 ; KNL_32-NEXT: vpslld $31, %ymm2, %ymm2 ; KNL_32-NEXT: vptestmd %zmm2, %zmm2, %k1 @@ -995,8 +985,7 @@ define void @test19(<4 x double>%a1, double* %ptr, <4 x i1>%mask, <4 x i64> %ind ; KNL_64-NEXT: vpslld $31, %xmm1, %xmm1 ; KNL_64-NEXT: vpsrad $31, %xmm1, %xmm1 ; KNL_64-NEXT: vpmovsxdq %xmm1, %ymm1 -; KNL_64-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; KNL_64-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 +; KNL_64-NEXT: vmovdqa %ymm1, %ymm1 ; KNL_64-NEXT: vpsllq $63, %zmm1, %zmm1 ; KNL_64-NEXT: vptestmq %zmm1, %zmm1, %k1 ; KNL_64-NEXT: vscatterqpd %zmm0, (%rdi,%zmm2,8) {%k1} @@ -1010,8 +999,7 @@ define void @test19(<4 x double>%a1, double* %ptr, <4 x i1>%mask, <4 x i64> %ind ; KNL_32-NEXT: vpslld $31, %xmm1, %xmm1 ; KNL_32-NEXT: vpsrad $31, %xmm1, %xmm1 ; KNL_32-NEXT: vpmovsxdq %xmm1, %ymm1 -; KNL_32-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; KNL_32-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 +; KNL_32-NEXT: vmovdqa %ymm1, %ymm1 ; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax ; KNL_32-NEXT: vpsllq $63, %zmm1, %zmm1 ; KNL_32-NEXT: vptestmq %zmm1, %zmm1, %k1 @@ -1047,8 +1035,7 @@ define void @test20(<2 x float>%a1, <2 x float*> %ptr, <2 x i1> %mask) { ; KNL_64-NEXT: # kill: %XMM1 %XMM1 %ZMM1 ; KNL_64-NEXT: # kill: %XMM0 %XMM0 %YMM0 ; KNL_64-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,2],zero,zero -; KNL_64-NEXT: vxorps %xmm3, %xmm3, %xmm3 -; KNL_64-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] +; KNL_64-NEXT: vmovaps %xmm2, %xmm2 ; KNL_64-NEXT: vpslld $31, %ymm2, %ymm2 ; KNL_64-NEXT: vptestmd %zmm2, %zmm2, %k1 ; KNL_64-NEXT: vscatterqps %ymm0, (,%zmm1) {%k1} @@ -1058,10 +1045,9 @@ define void @test20(<2 x float>%a1, <2 x float*> %ptr, <2 x i1> %mask) { ; KNL_32-LABEL: test20: ; KNL_32: # BB#0: ; KNL_32-NEXT: # kill: %XMM0 %XMM0 %YMM0 -; KNL_32-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,2],zero,zero -; KNL_32-NEXT: vxorps %xmm3, %xmm3, %xmm3 -; KNL_32-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] ; KNL_32-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; KNL_32-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,2],zero,zero +; KNL_32-NEXT: vmovaps %xmm2, %xmm2 ; KNL_32-NEXT: vpmovsxdq %ymm1, %zmm1 ; KNL_32-NEXT: vpslld $31, %ymm2, %ymm2 ; KNL_32-NEXT: vptestmd %zmm2, %zmm2, %k1 @@ -1153,10 +1139,9 @@ define <2 x float> @test22(float* %base, <2 x i32> %ind, <2 x i1> %mask, <2 x fl ; KNL_64-LABEL: test22: ; KNL_64: # BB#0: ; KNL_64-NEXT: # kill: %XMM2 %XMM2 %YMM2 -; KNL_64-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,2],zero,zero -; KNL_64-NEXT: vxorps %xmm3, %xmm3, %xmm3 -; KNL_64-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7] ; KNL_64-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; KNL_64-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,2],zero,zero +; KNL_64-NEXT: vmovaps %xmm1, %xmm1 ; KNL_64-NEXT: vpmovsxdq %ymm0, %zmm0 ; KNL_64-NEXT: vpslld $31, %ymm1, %ymm1 ; KNL_64-NEXT: vptestmd %zmm1, %zmm1, %k1 @@ -1168,10 +1153,9 @@ define <2 x float> @test22(float* %base, <2 x i32> %ind, <2 x i1> %mask, <2 x fl ; KNL_32-LABEL: test22: ; KNL_32: # BB#0: ; KNL_32-NEXT: # kill: %XMM2 %XMM2 %YMM2 -; KNL_32-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,2],zero,zero -; KNL_32-NEXT: vxorps %xmm3, %xmm3, %xmm3 -; KNL_32-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7] ; KNL_32-NEXT: 
vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; KNL_32-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,2],zero,zero +; KNL_32-NEXT: vmovaps %xmm1, %xmm1 ; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax ; KNL_32-NEXT: vpmovsxdq %ymm0, %zmm0 ; KNL_32-NEXT: vpslld $31, %ymm1, %ymm1 @@ -1215,8 +1199,7 @@ define <2 x float> @test22a(float* %base, <2 x i64> %ind, <2 x i1> %mask, <2 x f ; KNL_64-NEXT: # kill: %XMM2 %XMM2 %YMM2 ; KNL_64-NEXT: # kill: %XMM0 %XMM0 %ZMM0 ; KNL_64-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,2],zero,zero -; KNL_64-NEXT: vxorps %xmm3, %xmm3, %xmm3 -; KNL_64-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7] +; KNL_64-NEXT: vmovaps %xmm1, %xmm1 ; KNL_64-NEXT: vpslld $31, %ymm1, %ymm1 ; KNL_64-NEXT: vptestmd %zmm1, %zmm1, %k1 ; KNL_64-NEXT: vgatherqps (%rdi,%zmm0,4), %ymm2 {%k1} @@ -1229,8 +1212,7 @@ define <2 x float> @test22a(float* %base, <2 x i64> %ind, <2 x i1> %mask, <2 x f ; KNL_32-NEXT: # kill: %XMM2 %XMM2 %YMM2 ; KNL_32-NEXT: # kill: %XMM0 %XMM0 %ZMM0 ; KNL_32-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,2],zero,zero -; KNL_32-NEXT: vxorps %xmm3, %xmm3, %xmm3 -; KNL_32-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7] +; KNL_32-NEXT: vmovaps %xmm1, %xmm1 ; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax ; KNL_32-NEXT: vpslld $31, %ymm1, %ymm1 ; KNL_32-NEXT: vptestmd %zmm1, %zmm1, %k1 @@ -2150,8 +2132,7 @@ define <4 x i64> @test_pr28312(<4 x i64*> %p1, <4 x i1> %k, <4 x i1> %k2,<4 x i6 ; KNL_64-NEXT: vpslld $31, %xmm1, %xmm1 ; KNL_64-NEXT: vpsrad $31, %xmm1, %xmm1 ; KNL_64-NEXT: vpmovsxdq %xmm1, %ymm1 -; KNL_64-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; KNL_64-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 +; KNL_64-NEXT: vmovdqa %ymm1, %ymm1 ; KNL_64-NEXT: vpsllq $63, %zmm1, %zmm1 ; KNL_64-NEXT: vptestmq %zmm1, %zmm1, %k1 ; KNL_64-NEXT: vpgatherqq (,%zmm0), %zmm1 {%k1} @@ -2175,8 +2156,7 @@ define <4 x i64> @test_pr28312(<4 x i64*> %p1, <4 x i1> %k, <4 x i1> %k2,<4 x i6 ; KNL_32-NEXT: vpslld $31, %xmm1, %xmm1 ; KNL_32-NEXT: vpsrad $31, %xmm1, %xmm1 ; KNL_32-NEXT: vpmovsxdq %xmm1, %ymm1 -; KNL_32-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; KNL_32-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 +; KNL_32-NEXT: vmovdqa %ymm1, %ymm1 ; KNL_32-NEXT: vpmovsxdq %ymm0, %zmm0 ; KNL_32-NEXT: vpsllq $63, %zmm1, %zmm1 ; KNL_32-NEXT: vptestmq %zmm1, %zmm1, %k1 diff --git a/test/CodeGen/X86/merge-consecutive-loads-256.ll b/test/CodeGen/X86/merge-consecutive-loads-256.ll index 8a8d88a4329..618e316bd07 100644 --- a/test/CodeGen/X86/merge-consecutive-loads-256.ll +++ b/test/CodeGen/X86/merge-consecutive-loads-256.ll @@ -28,15 +28,13 @@ define <4 x double> @merge_4f64_2f64_23(<2 x double>* %ptr) nounwind uwtable noi define <4 x double> @merge_4f64_2f64_2z(<2 x double>* %ptr) nounwind uwtable noinline ssp { ; AVX-LABEL: merge_4f64_2f64_2z: ; AVX: # BB#0: -; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; AVX-NEXT: vinsertf128 $0, 32(%rdi), %ymm0, %ymm0 +; AVX-NEXT: vmovaps 32(%rdi), %xmm0 ; AVX-NEXT: retq ; ; X32-AVX-LABEL: merge_4f64_2f64_2z: ; X32-AVX: # BB#0: ; X32-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; X32-AVX-NEXT: vinsertf128 $0, 32(%eax), %ymm0, %ymm0 +; X32-AVX-NEXT: vmovaps 32(%eax), %xmm0 ; X32-AVX-NEXT: retl %ptr0 = getelementptr inbounds <2 x double>, <2 x double>* %ptr, i64 2 %val0 = load <2 x double>, <2 x double>* %ptr0 @@ -111,15 +109,13 @@ define <4 x double> @merge_4f64_f64_34uu(double* %ptr) nounwind uwtable noinline define <4 x double> @merge_4f64_f64_45zz(double* %ptr) nounwind uwtable noinline ssp { ; AVX-LABEL: merge_4f64_f64_45zz: ; AVX: # BB#0: -; AVX-NEXT: vxorps 
%xmm0, %xmm0, %xmm0 -; AVX-NEXT: vinsertf128 $0, 32(%rdi), %ymm0, %ymm0 +; AVX-NEXT: vmovaps 32(%rdi), %xmm0 ; AVX-NEXT: retq ; ; X32-AVX-LABEL: merge_4f64_f64_45zz: ; X32-AVX: # BB#0: ; X32-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; X32-AVX-NEXT: vinsertf128 $0, 32(%eax), %ymm0, %ymm0 +; X32-AVX-NEXT: vmovaps 32(%eax), %xmm0 ; X32-AVX-NEXT: retl %ptr0 = getelementptr inbounds double, double* %ptr, i64 4 %ptr1 = getelementptr inbounds double, double* %ptr, i64 5 @@ -159,15 +155,13 @@ define <4 x double> @merge_4f64_f64_34z6(double* %ptr) nounwind uwtable noinline define <4 x i64> @merge_4i64_2i64_3z(<2 x i64>* %ptr) nounwind uwtable noinline ssp { ; AVX-LABEL: merge_4i64_2i64_3z: ; AVX: # BB#0: -; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; AVX-NEXT: vinsertf128 $0, 48(%rdi), %ymm0, %ymm0 +; AVX-NEXT: vmovaps 48(%rdi), %xmm0 ; AVX-NEXT: retq ; ; X32-AVX-LABEL: merge_4i64_2i64_3z: ; X32-AVX: # BB#0: ; X32-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; X32-AVX-NEXT: vinsertf128 $0, 48(%eax), %ymm0, %ymm0 +; X32-AVX-NEXT: vmovaps 48(%eax), %xmm0 ; X32-AVX-NEXT: retl %ptr0 = getelementptr inbounds <2 x i64>, <2 x i64>* %ptr, i64 3 %val0 = load <2 x i64>, <2 x i64>* %ptr0 @@ -223,15 +217,13 @@ define <4 x i64> @merge_4i64_i64_1zzu(i64* %ptr) nounwind uwtable noinline ssp { define <4 x i64> @merge_4i64_i64_23zz(i64* %ptr) nounwind uwtable noinline ssp { ; AVX-LABEL: merge_4i64_i64_23zz: ; AVX: # BB#0: -; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; AVX-NEXT: vinsertf128 $0, 16(%rdi), %ymm0, %ymm0 +; AVX-NEXT: vmovaps 16(%rdi), %xmm0 ; AVX-NEXT: retq ; ; X32-AVX-LABEL: merge_4i64_i64_23zz: ; X32-AVX: # BB#0: ; X32-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; X32-AVX-NEXT: vinsertf128 $0, 16(%eax), %ymm0, %ymm0 +; X32-AVX-NEXT: vmovaps 16(%eax), %xmm0 ; X32-AVX-NEXT: retl %ptr0 = getelementptr inbounds i64, i64* %ptr, i64 2 %ptr1 = getelementptr inbounds i64, i64* %ptr, i64 3 @@ -605,8 +597,7 @@ define <4 x double> @merge_4f64_f64_34uz_volatile(double* %ptr) nounwind uwtable ; AVX: # BB#0: ; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero ; AVX-NEXT: vmovhpd {{.*#+}} xmm0 = xmm0[0],mem[0] -; AVX-NEXT: vxorpd %xmm1, %xmm1, %xmm1 -; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3] +; AVX-NEXT: vmovapd %xmm0, %xmm0 ; AVX-NEXT: retq ; ; X32-AVX-LABEL: merge_4f64_f64_34uz_volatile: @@ -614,8 +605,7 @@ define <4 x double> @merge_4f64_f64_34uz_volatile(double* %ptr) nounwind uwtable ; X32-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax ; X32-AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero ; X32-AVX-NEXT: vmovhpd {{.*#+}} xmm0 = xmm0[0],mem[0] -; X32-AVX-NEXT: vxorpd %xmm1, %xmm1, %xmm1 -; X32-AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3] +; X32-AVX-NEXT: vmovapd %xmm0, %xmm0 ; X32-AVX-NEXT: retl %ptr0 = getelementptr inbounds double, double* %ptr, i64 3 %ptr1 = getelementptr inbounds double, double* %ptr, i64 4 diff --git a/test/CodeGen/X86/merge-consecutive-loads-512.ll b/test/CodeGen/X86/merge-consecutive-loads-512.ll index e95df0dcbd9..7049a72518a 100644 --- a/test/CodeGen/X86/merge-consecutive-loads-512.ll +++ b/test/CodeGen/X86/merge-consecutive-loads-512.ll @@ -106,19 +106,15 @@ define <8 x double> @merge_8f64_f64_23uuuuu9(double* %ptr) nounwind uwtable noin define <8 x double> @merge_8f64_f64_12zzuuzz(double* %ptr) nounwind uwtable noinline ssp { ; ALL-LABEL: merge_8f64_f64_12zzuuzz: ; ALL: # BB#0: -; ALL-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; ALL-NEXT: vinsertf128 $0, 8(%rdi), %ymm0, %ymm0 
-; ALL-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; ALL-NEXT: vinsertf64x4 $0, %ymm0, %zmm1, %zmm0 +; ALL-NEXT: vmovaps 8(%rdi), %xmm0 +; ALL-NEXT: vmovaps %ymm0, %ymm0 ; ALL-NEXT: retq ; ; X32-AVX512F-LABEL: merge_8f64_f64_12zzuuzz: ; X32-AVX512F: # BB#0: ; X32-AVX512F-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-AVX512F-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; X32-AVX512F-NEXT: vinsertf128 $0, 8(%eax), %ymm0, %ymm0 -; X32-AVX512F-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; X32-AVX512F-NEXT: vinsertf64x4 $0, %ymm0, %zmm1, %zmm0 +; X32-AVX512F-NEXT: vmovaps 8(%eax), %xmm0 +; X32-AVX512F-NEXT: vmovaps %ymm0, %ymm0 ; X32-AVX512F-NEXT: retl %ptr0 = getelementptr inbounds double, double* %ptr, i64 1 %ptr1 = getelementptr inbounds double, double* %ptr, i64 2 @@ -196,8 +192,7 @@ define <8 x i64> @merge_8i64_4i64_z3(<4 x i64>* %ptr) nounwind uwtable noinline define <8 x i64> @merge_8i64_i64_56zz9uzz(i64* %ptr) nounwind uwtable noinline ssp { ; ALL-LABEL: merge_8i64_i64_56zz9uzz: ; ALL: # BB#0: -; ALL-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; ALL-NEXT: vinsertf128 $0, 40(%rdi), %ymm0, %ymm0 +; ALL-NEXT: vmovaps 40(%rdi), %xmm0 ; ALL-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero ; ALL-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0 ; ALL-NEXT: retq @@ -205,8 +200,7 @@ define <8 x i64> @merge_8i64_i64_56zz9uzz(i64* %ptr) nounwind uwtable noinline s ; X32-AVX512F-LABEL: merge_8i64_i64_56zz9uzz: ; X32-AVX512F: # BB#0: ; X32-AVX512F-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-AVX512F-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; X32-AVX512F-NEXT: vinsertf128 $0, 40(%eax), %ymm0, %ymm0 +; X32-AVX512F-NEXT: vmovaps 40(%eax), %xmm0 ; X32-AVX512F-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero ; X32-AVX512F-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0 ; X32-AVX512F-NEXT: retl diff --git a/test/CodeGen/X86/vector-shuffle-variable-256.ll b/test/CodeGen/X86/vector-shuffle-variable-256.ll index ba63fa27ca4..70c44c435e4 100644 --- a/test/CodeGen/X86/vector-shuffle-variable-256.ll +++ b/test/CodeGen/X86/vector-shuffle-variable-256.ll @@ -146,41 +146,22 @@ define <4 x i64> @var_shuffle_v4i64_v4i64_xxxx_i64(<4 x i64> %x, i64 %i0, i64 %i } define <4 x i64> @var_shuffle_v4i64_v4i64_xx00_i64(<4 x i64> %x, i64 %i0, i64 %i1, i64 %i2, i64 %i3) nounwind { -; AVX1-LABEL: var_shuffle_v4i64_v4i64_xx00_i64: -; AVX1: # BB#0: -; AVX1-NEXT: pushq %rbp -; AVX1-NEXT: movq %rsp, %rbp -; AVX1-NEXT: andq $-32, %rsp -; AVX1-NEXT: subq $64, %rsp -; AVX1-NEXT: andl $3, %edi -; AVX1-NEXT: andl $3, %esi -; AVX1-NEXT: vmovaps %ymm0, (%rsp) -; AVX1-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero -; AVX1-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero -; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] -; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX1-NEXT: movq %rbp, %rsp -; AVX1-NEXT: popq %rbp -; AVX1-NEXT: retq -; -; AVX2-LABEL: var_shuffle_v4i64_v4i64_xx00_i64: -; AVX2: # BB#0: -; AVX2-NEXT: pushq %rbp -; AVX2-NEXT: movq %rsp, %rbp -; AVX2-NEXT: andq $-32, %rsp -; AVX2-NEXT: subq $64, %rsp -; AVX2-NEXT: andl $3, %edi -; AVX2-NEXT: andl $3, %esi -; AVX2-NEXT: vmovaps %ymm0, (%rsp) -; AVX2-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero -; AVX2-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero -; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] -; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-NEXT: movq %rbp, %rsp -; AVX2-NEXT: popq %rbp -; AVX2-NEXT: retq +; ALL-LABEL: var_shuffle_v4i64_v4i64_xx00_i64: +; ALL: # BB#0: +; ALL-NEXT: pushq %rbp +; ALL-NEXT: movq %rsp, %rbp +; ALL-NEXT: andq 
$-32, %rsp +; ALL-NEXT: subq $64, %rsp +; ALL-NEXT: andl $3, %edi +; ALL-NEXT: andl $3, %esi +; ALL-NEXT: vmovaps %ymm0, (%rsp) +; ALL-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; ALL-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero +; ALL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; ALL-NEXT: vmovdqa %xmm0, %xmm0 +; ALL-NEXT: movq %rbp, %rsp +; ALL-NEXT: popq %rbp +; ALL-NEXT: retq %x0 = extractelement <4 x i64> %x, i64 %i0 %x1 = extractelement <4 x i64> %x, i64 %i1 %x2 = extractelement <4 x i64> %x, i64 %i2 -- 2.50.1
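
For reference, a minimal sketch of the codegen effect, using a hypothetical standalone function modeled on the test_mm256_zextpd128_pd256 test updated above (the function name is illustrative, not part of the patch). Fed to llc with AVX enabled, the zero-extending shufflevector should now select to a single 128-bit move instead of an explicit zero plus blend, because a VEX-encoded XMM move already clears bits 255:128 of the destination YMM register:

    ; Hypothetical reproducer (not from the patch); mirrors the
    ; test_mm256_zextpd128_pd256 case in avx-intrinsics-fast-isel.ll.
    ; Mask elements 2 and 3 select from the zero operand, so the upper
    ; 128 bits of the 256-bit result are all zeroes.
    define <4 x double> @zext_v2f64_v4f64(<2 x double> %a0) nounwind {
      %res = shufflevector <2 x double> %a0, <2 x double> zeroinitializer, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
      ret <4 x double> %res
    }

Before this patch the AVX lowering materialized a zero vector and blended the 128-bit value into it, per the check lines removed above:

    vxorpd   %xmm1, %xmm1, %xmm1
    vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3]

With the new insert_subvector-into-immAllZerosV patterns this collapses to the single move shown in the updated checks, wrapped in a SUBREG_TO_REG at the MI level:

    vmovaps  %xmm0, %xmm0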