From 9204acdbd3f9574ec99c4c2e6f0023d3ec7ff191 Mon Sep 17 00:00:00 2001
From: Sanjay Patel <spatel@91177308-0d34-0410-b5e6-96231b3b80d8>
Date: Thu, 24 Jan 2019 14:12:34 +0000
Subject: [PATCH] [x86] add tests for unpack shuffle lowering; NFC

https://bugs.llvm.org/show_bug.cgi?id=40434

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@352048 91177308-0d34-0410-b5e6-96231b3b80d8
---
 test/CodeGen/X86/vector-shuffle-256-v16.ll |  17 ++++
 test/CodeGen/X86/vector-shuffle-256-v32.ll |  17 ++++
 test/CodeGen/X86/vector-shuffle-256-v4.ll  |  61 ++++++++++++
 test/CodeGen/X86/vector-shuffle-256-v8.ll  | 105 +++++++++++++++++++++
 4 files changed, 200 insertions(+)

diff --git a/test/CodeGen/X86/vector-shuffle-256-v16.ll b/test/CodeGen/X86/vector-shuffle-256-v16.ll
index 461246d80a8..4e447d9429d 100644
--- a/test/CodeGen/X86/vector-shuffle-256-v16.ll
+++ b/test/CodeGen/X86/vector-shuffle-256-v16.ll
@@ -4605,3 +4605,20 @@ define <16 x i16> @insert_dup_elt3_mem_v16i16_i32(i32* %ptr) #0 {
   %tmp3 = shufflevector <8 x i16> %tmp2, <8 x i16> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
   ret <16 x i16> %tmp3
 }
+
+define <16 x i16> @unpckh_v16i16(<16 x i16> %x, <16 x i16> %y) {
+; AVX1-LABEL: unpckh_v16i16:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
+; AVX1-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; AVX1-NEXT:    retq
+;
+; AVX2OR512VL-LABEL: unpckh_v16i16:
+; AVX2OR512VL:       # %bb.0:
+; AVX2OR512VL-NEXT:    vextracti128 $1, %ymm1, %xmm1
+; AVX2OR512VL-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; AVX2OR512VL-NEXT:    retq
+  %unpckh = shufflevector <16 x i16> %x, <16 x i16> %y, <16 x i32> <i32 4, i32 28, i32 5, i32 29, i32 6, i32 30, i32 7, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  ret <16 x i16> %unpckh
+}
+
diff --git a/test/CodeGen/X86/vector-shuffle-256-v32.ll b/test/CodeGen/X86/vector-shuffle-256-v32.ll
index ba0e2086f48..d268364fda8 100644
--- a/test/CodeGen/X86/vector-shuffle-256-v32.ll
+++ b/test/CodeGen/X86/vector-shuffle-256-v32.ll
@@ -3321,3 +3321,20 @@ define <32 x i8> @zeroable_src_to_zext(<32 x i8> %a0) {
   %2 = shufflevector <32 x i8> %1, <32 x i8> <i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, <32 x i32> <i32 0, i32 1, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 2, i32 3, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 4, i32 5, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 6, i32 7, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32>
   ret <32 x i8> %2
 }
+
+define <32 x i8> @unpckh_v32i8(<32 x i8> %x, <32 x i8> %y) {
+; AVX1-LABEL: unpckh_v32i8:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
+; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
+; AVX1-NEXT:    retq
+;
+; AVX2OR512VL-LABEL: unpckh_v32i8:
+; AVX2OR512VL:       # %bb.0:
+; AVX2OR512VL-NEXT:    vextracti128 $1, %ymm1, %xmm1
+; AVX2OR512VL-NEXT:    vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
+; AVX2OR512VL-NEXT:    retq
+  %unpckh = shufflevector <32 x i8> %x, <32 x i8> %y, <32 x i32> <i32 8, i32 56, i32 9, i32 57, i32 10, i32 58, i32 11, i32 59, i32 12, i32 60, i32 13, i32 61, i32 14, i32 62, i32 15, i32 63, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  ret <32 x i8> %unpckh
+}
+
diff --git a/test/CodeGen/X86/vector-shuffle-256-v4.ll b/test/CodeGen/X86/vector-shuffle-256-v4.ll
index b3750b74ad3..83316d0a80c 100644
--- a/test/CodeGen/X86/vector-shuffle-256-v4.ll
+++ b/test/CodeGen/X86/vector-shuffle-256-v4.ll
@@ -1903,3 +1903,64 @@ define <8 x i32> @shuffle_v8i32_0zzzzzzz_optsize(<8 x i32> %a) optsize {
   %b = shufflevector <8 x i32> %a, <8 x i32> zeroinitializer, <8 x i32> <i32 0, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8>
   ret <8 x i32> %b
 }
+
+; FIXME: AVX1 lowering is better than AVX2 (and AVX512?)
+
+define <4 x i64> @unpckh_v4i64(<4 x i64> %x, <4 x i64> %y) {
+; AVX1-LABEL: unpckh_v4i64:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
+; AVX1-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: unpckh_v4i64:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
+; AVX2-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3]
+; AVX2-NEXT:    retq
+;
+; AVX512VL-SLOW-LABEL: unpckh_v4i64:
+; AVX512VL-SLOW:       # %bb.0:
+; AVX512VL-SLOW-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
+; AVX512VL-SLOW-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3]
+; AVX512VL-SLOW-NEXT:    retq
+;
+; AVX512VL-FAST-LABEL: unpckh_v4i64:
+; AVX512VL-FAST:       # %bb.0:
+; AVX512VL-FAST-NEXT:    vmovdqa {{.*#+}} ymm2 = [1,7,3,7]
+; AVX512VL-FAST-NEXT:    vpermt2q %ymm1, %ymm2, %ymm0
+; AVX512VL-FAST-NEXT:    retq
+  %unpckh = shufflevector <4 x i64> %x, <4 x i64> %y, <4 x i32> <i32 1, i32 7, i32 undef, i32 undef>
+  ret <4 x i64> %unpckh
+}
+
+; FIXME: AVX1 lowering is better than AVX2 (and AVX512?)
+
+define <4 x double> @unpckh_v4f64(<4 x double> %x, <4 x double> %y) {
+; AVX1-LABEL: unpckh_v4f64:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
+; AVX1-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: unpckh_v4f64:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
+; AVX2-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3]
+; AVX2-NEXT:    retq
+;
+; AVX512VL-SLOW-LABEL: unpckh_v4f64:
+; AVX512VL-SLOW:       # %bb.0:
+; AVX512VL-SLOW-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
+; AVX512VL-SLOW-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3]
+; AVX512VL-SLOW-NEXT:    retq
+;
+; AVX512VL-FAST-LABEL: unpckh_v4f64:
+; AVX512VL-FAST:       # %bb.0:
+; AVX512VL-FAST-NEXT:    vmovapd {{.*#+}} ymm2 = [1,7,3,7]
+; AVX512VL-FAST-NEXT:    vpermt2pd %ymm1, %ymm2, %ymm0
+; AVX512VL-FAST-NEXT:    retq
+  %unpckh = shufflevector <4 x double> %x, <4 x double> %y, <4 x i32> <i32 1, i32 7, i32 undef, i32 undef>
+  ret <4 x double> %unpckh
+}
+
diff --git a/test/CodeGen/X86/vector-shuffle-256-v8.ll b/test/CodeGen/X86/vector-shuffle-256-v8.ll
index addf2d2563f..b2187397bb1 100644
--- a/test/CodeGen/X86/vector-shuffle-256-v8.ll
+++ b/test/CodeGen/X86/vector-shuffle-256-v8.ll
@@ -2877,3 +2877,108 @@ entry:
   %tmp6 = insertelement <8 x float> %tmp5, float %z, i32 5
   ret <8 x float> %tmp6
 }
+
+; FIXME: AVX1 lowering is better than AVX2 (and AVX512?)
+; PR40434: https://bugs.llvm.org/show_bug.cgi?id=40434
+
+define <8 x i32> @unpckh_v8i32(<8 x i32> %x, <8 x i32> %y) {
+; AVX1-LABEL: unpckh_v8i32:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
+; AVX1-NEXT:    vunpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: unpckh_v8i32:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
+; AVX2-NEXT:    vmovaps {{.*#+}} ymm1 = <2,6,3,7,u,u,u,u>
+; AVX2-NEXT:    vpermps %ymm0, %ymm1, %ymm0
+; AVX2-NEXT:    retq
+;
+; AVX512VL-LABEL: unpckh_v8i32:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm2 = <2,14,3,15,u,u,u,u>
+; AVX512VL-NEXT:    vpermt2d %ymm1, %ymm2, %ymm0
+; AVX512VL-NEXT:    retq
+  %unpckh = shufflevector <8 x i32> %x, <8 x i32> %y, <8 x i32> <i32 2, i32 14, i32 3, i32 15, i32 undef, i32 undef, i32 undef, i32 undef>
+  ret <8 x i32> %unpckh
+}
+
+; FIXME: Same as above but with floats. AVX1 lowering is better than AVX2 (and AVX512?)
+
+define <8 x float> @unpckh_v8f32(<8 x float> %x, <8 x float> %y) {
+; AVX1-LABEL: unpckh_v8f32:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
+; AVX1-NEXT:    vunpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: unpckh_v8f32:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
+; AVX2-NEXT:    vmovaps {{.*#+}} ymm1 = <2,6,3,7,u,u,u,u>
+; AVX2-NEXT:    vpermps %ymm0, %ymm1, %ymm0
+; AVX2-NEXT:    retq
+;
+; AVX512VL-LABEL: unpckh_v8f32:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vmovaps {{.*#+}} ymm2 = <2,14,3,15,u,u,u,u>
+; AVX512VL-NEXT:    vpermt2ps %ymm1, %ymm2, %ymm0
+; AVX512VL-NEXT:    retq
+  %unpckh = shufflevector <8 x float> %x, <8 x float> %y, <8 x i32> <i32 2, i32 14, i32 3, i32 15, i32 undef, i32 undef, i32 undef, i32 undef>
+  ret <8 x float> %unpckh
+}
+
+; FIXME: Why are integer and FP (below) lowering different for AVX1?
+
+define <8 x i32> @lowhalf_v8i32(<8 x i32> %x, <8 x i32> %y) {
+; AVX1-LABEL: lowhalf_v8i32:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
+; AVX1-NEXT:    vpermilps {{.*#+}} xmm1 = xmm1[0,2,2,2]
+; AVX1-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[2,2,3,3]
+; AVX1-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: lowhalf_v8i32:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
+; AVX2-NEXT:    vmovaps {{.*#+}} ymm1 = <2,6,3,6,u,u,u,u>
+; AVX2-NEXT:    vpermps %ymm0, %ymm1, %ymm0
+; AVX2-NEXT:    retq
+;
+; AVX512VL-LABEL: lowhalf_v8i32:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm2 = <2,14,3,14,u,u,u,u>
+; AVX512VL-NEXT:    vpermt2d %ymm1, %ymm2, %ymm0
+; AVX512VL-NEXT:    retq
+  %r = shufflevector <8 x i32> %x, <8 x i32> %y, <8 x i32> <i32 2, i32 14, i32 3, i32 14, i32 undef, i32 undef, i32 undef, i32 undef>
+  ret <8 x i32> %r
+}
+
+; FIXME: AVX1 lowering is better than AVX2 (and AVX512?)
+
+define <8 x float> @lowhalf_v8f32(<8 x float> %x, <8 x float> %y) {
+; AVX1-LABEL: lowhalf_v8f32:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
+; AVX1-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[2,3],xmm1[2,2]
+; AVX1-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[0,2,1,3]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: lowhalf_v8f32:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
+; AVX2-NEXT:    vmovaps {{.*#+}} ymm1 = <2,6,3,6,u,u,u,u>
+; AVX2-NEXT:    vpermps %ymm0, %ymm1, %ymm0
+; AVX2-NEXT:    retq
+;
+; AVX512VL-LABEL: lowhalf_v8f32:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vmovaps {{.*#+}} ymm2 = <2,14,3,14,u,u,u,u>
+; AVX512VL-NEXT:    vpermt2ps %ymm1, %ymm2, %ymm0
+; AVX512VL-NEXT:    retq
+  %r = shufflevector <8 x float> %x, <8 x float> %y, <8 x i32> <i32 2, i32 14, i32 3, i32 14, i32 undef, i32 undef, i32 undef, i32 undef>
+  ret <8 x float> %r
+}
+
-- 
2.50.1