From 04d4034ee6d9b43e34c95468f1baa14533d47add Mon Sep 17 00:00:00 2001
From: Sanjay Patel
Date: Wed, 6 Feb 2019 19:40:11 +0000
Subject: [PATCH] [x86] add tests for horizontal ops (PR38971, PR33758); NFC

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@353332 91177308-0d34-0410-b5e6-96231b3b80d8
---
 test/CodeGen/X86/haddsub.ll          |  88 ++++++++++
 test/CodeGen/X86/phaddsub-extract.ll | 254 +++++++++++++++++++++++++++
 2 files changed, 342 insertions(+)

diff --git a/test/CodeGen/X86/haddsub.ll b/test/CodeGen/X86/haddsub.ll
index aa92f03d77a..c0d6fec7be3 100644
--- a/test/CodeGen/X86/haddsub.ll
+++ b/test/CodeGen/X86/haddsub.ll
@@ -1352,3 +1352,91 @@ define float @extract_extract_v4f32_fadd_f32_uses3(<4 x float> %x, float* %p1, f
   ret float %x01
 }
 
+; Repeat tests from general reductions to verify output for hoppy targets:
+; PR38971: https://bugs.llvm.org/show_bug.cgi?id=38971
+
+declare float @llvm.experimental.vector.reduce.fadd.f32.f32.v8f32(float, <8 x float>)
+declare double @llvm.experimental.vector.reduce.fadd.f64.f64.v4f64(double, <4 x double>)
+
+define float @fadd_reduce_v8f32(float %a0, <8 x float> %a1) {
+; SSE3-SLOW-LABEL: fadd_reduce_v8f32:
+; SSE3-SLOW:       # %bb.0:
+; SSE3-SLOW-NEXT:    addps %xmm2, %xmm1
+; SSE3-SLOW-NEXT:    movaps %xmm1, %xmm2
+; SSE3-SLOW-NEXT:    unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1]
+; SSE3-SLOW-NEXT:    addps %xmm1, %xmm2
+; SSE3-SLOW-NEXT:    movshdup {{.*#+}} xmm0 = xmm2[1,1,3,3]
+; SSE3-SLOW-NEXT:    addps %xmm2, %xmm0
+; SSE3-SLOW-NEXT:    retq
+;
+; SSE3-FAST-LABEL: fadd_reduce_v8f32:
+; SSE3-FAST:       # %bb.0:
+; SSE3-FAST-NEXT:    addps %xmm2, %xmm1
+; SSE3-FAST-NEXT:    movaps %xmm1, %xmm0
+; SSE3-FAST-NEXT:    unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
+; SSE3-FAST-NEXT:    addps %xmm1, %xmm0
+; SSE3-FAST-NEXT:    haddps %xmm0, %xmm0
+; SSE3-FAST-NEXT:    retq
+;
+; AVX-SLOW-LABEL: fadd_reduce_v8f32:
+; AVX-SLOW:       # %bb.0:
+; AVX-SLOW-NEXT:    vextractf128 $1, %ymm1, %xmm0
+; AVX-SLOW-NEXT:    vaddps %xmm0, %xmm1, %xmm0
+; AVX-SLOW-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
+; AVX-SLOW-NEXT:    vaddps %xmm1, %xmm0, %xmm0
+; AVX-SLOW-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; AVX-SLOW-NEXT:    vaddps %xmm1, %xmm0, %xmm0
+; AVX-SLOW-NEXT:    vzeroupper
+; AVX-SLOW-NEXT:    retq
+;
+; AVX-FAST-LABEL: fadd_reduce_v8f32:
+; AVX-FAST:       # %bb.0:
+; AVX-FAST-NEXT:    vextractf128 $1, %ymm1, %xmm0
+; AVX-FAST-NEXT:    vaddps %ymm0, %ymm1, %ymm0
+; AVX-FAST-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
+; AVX-FAST-NEXT:    vaddps %ymm1, %ymm0, %ymm0
+; AVX-FAST-NEXT:    vhaddps %ymm0, %ymm0, %ymm0
+; AVX-FAST-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX-FAST-NEXT:    vzeroupper
+; AVX-FAST-NEXT:    retq
+  %r = call fast float @llvm.experimental.vector.reduce.fadd.f32.f32.v8f32(float %a0, <8 x float> %a1)
+  ret float %r
+}
+
+define double @fadd_reduce_v4f64(double %a0, <4 x double> %a1) {
+; SSE3-SLOW-LABEL: fadd_reduce_v4f64:
+; SSE3-SLOW:       # %bb.0:
+; SSE3-SLOW-NEXT:    addpd %xmm2, %xmm1
+; SSE3-SLOW-NEXT:    movapd %xmm1, %xmm0
+; SSE3-SLOW-NEXT:    unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
+; SSE3-SLOW-NEXT:    addpd %xmm1, %xmm0
+; SSE3-SLOW-NEXT:    retq
+;
+; SSE3-FAST-LABEL: fadd_reduce_v4f64:
+; SSE3-FAST:       # %bb.0:
+; SSE3-FAST-NEXT:    movapd %xmm1, %xmm0
+; SSE3-FAST-NEXT:    addpd %xmm2, %xmm0
+; SSE3-FAST-NEXT:    haddpd %xmm0, %xmm0
+; SSE3-FAST-NEXT:    retq
+;
+; AVX-SLOW-LABEL: fadd_reduce_v4f64:
+; AVX-SLOW:       # %bb.0:
+; AVX-SLOW-NEXT:    vextractf128 $1, %ymm1, %xmm0
+; AVX-SLOW-NEXT:    vaddpd %xmm0, %xmm1, %xmm0
+; AVX-SLOW-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
+; AVX-SLOW-NEXT:    vaddpd %xmm1, %xmm0, %xmm0
+; AVX-SLOW-NEXT:    vzeroupper
+; AVX-SLOW-NEXT:    retq
+;
+; AVX-FAST-LABEL: fadd_reduce_v4f64:
+; AVX-FAST:       # %bb.0:
+; AVX-FAST-NEXT:    vextractf128 $1, %ymm1, %xmm0
+; AVX-FAST-NEXT:    vaddpd %ymm0, %ymm1, %ymm0
+; AVX-FAST-NEXT:    vhaddpd %ymm0, %ymm0, %ymm0
+; AVX-FAST-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX-FAST-NEXT:    vzeroupper
+; AVX-FAST-NEXT:    retq
+  %r = call fast double @llvm.experimental.vector.reduce.fadd.f64.f64.v4f64(double %a0, <4 x double> %a1)
+  ret double %r
+}
+
diff --git a/test/CodeGen/X86/phaddsub-extract.ll b/test/CodeGen/X86/phaddsub-extract.ll
index 57978e424cb..c989f65cd35 100644
--- a/test/CodeGen/X86/phaddsub-extract.ll
+++ b/test/CodeGen/X86/phaddsub-extract.ll
@@ -902,3 +902,257 @@ define i32 @extract_extract_v4i32_add_i32_uses3(<4 x i32> %x, i32* %p1, i32* %p2
   ret i32 %x01
 }
 
+; PR33758: https://bugs.llvm.org/show_bug.cgi?id=33758
+
+define i32 @partial_reduction_add_v8i32(<8 x i32> %x) {
+; SSE3-SLOW-LABEL: partial_reduction_add_v8i32:
+; SSE3-SLOW:       # %bb.0:
+; SSE3-SLOW-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE3-SLOW-NEXT:    paddd %xmm0, %xmm1
+; SSE3-SLOW-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; SSE3-SLOW-NEXT:    paddd %xmm1, %xmm0
+; SSE3-SLOW-NEXT:    movd %xmm0, %eax
+; SSE3-SLOW-NEXT:    retq
+;
+; SSE3-FAST-LABEL: partial_reduction_add_v8i32:
+; SSE3-FAST:       # %bb.0:
+; SSE3-FAST-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE3-FAST-NEXT:    paddd %xmm0, %xmm1
+; SSE3-FAST-NEXT:    phaddd %xmm1, %xmm1
+; SSE3-FAST-NEXT:    movd %xmm1, %eax
+; SSE3-FAST-NEXT:    retq
+;
+; AVX-SLOW-LABEL: partial_reduction_add_v8i32:
+; AVX-SLOW:       # %bb.0:
+; AVX-SLOW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX-SLOW-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
+; AVX-SLOW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX-SLOW-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
+; AVX-SLOW-NEXT:    vmovd %xmm0, %eax
+; AVX-SLOW-NEXT:    vzeroupper
+; AVX-SLOW-NEXT:    retq
+;
+; AVX1-FAST-LABEL: partial_reduction_add_v8i32:
+; AVX1-FAST:       # %bb.0:
+; AVX1-FAST-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX1-FAST-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
+; AVX1-FAST-NEXT:    vphaddd %xmm0, %xmm0, %xmm0
+; AVX1-FAST-NEXT:    vmovd %xmm0, %eax
+; AVX1-FAST-NEXT:    vzeroupper
+; AVX1-FAST-NEXT:    retq
+;
+; AVX2-FAST-LABEL: partial_reduction_add_v8i32:
+; AVX2-FAST:       # %bb.0:
+; AVX2-FAST-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX2-FAST-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
+; AVX2-FAST-NEXT:    vphaddd %ymm0, %ymm0, %ymm0
+; AVX2-FAST-NEXT:    vmovd %xmm0, %eax
+; AVX2-FAST-NEXT:    vzeroupper
+; AVX2-FAST-NEXT:    retq
+;
+; AVX512-FAST-LABEL: partial_reduction_add_v8i32:
+; AVX512-FAST:       # %bb.0:
+; AVX512-FAST-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512-FAST-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
+; AVX512-FAST-NEXT:    vphaddd %ymm0, %ymm0, %ymm0
+; AVX512-FAST-NEXT:    vmovd %xmm0, %eax
+; AVX512-FAST-NEXT:    vzeroupper
+; AVX512-FAST-NEXT:    retq
+  %x23 = shufflevector <8 x i32> %x, <8 x i32> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %x0213 = add <8 x i32> %x, %x23
+  %x13 = shufflevector <8 x i32> %x0213, <8 x i32> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %x0123 = add <8 x i32> %x0213, %x13
+  %r = extractelement <8 x i32> %x0123, i32 0
+  ret i32 %r
+}
+
+define i32 @partial_reduction_add_v16i32(<16 x i32> %x) {
+; SSE3-SLOW-LABEL: partial_reduction_add_v16i32:
+; SSE3-SLOW:       # %bb.0:
+; SSE3-SLOW-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE3-SLOW-NEXT:    paddd %xmm0, %xmm1
+; SSE3-SLOW-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; SSE3-SLOW-NEXT:    paddd %xmm1, %xmm0
+; SSE3-SLOW-NEXT:    movd %xmm0, %eax
+; SSE3-SLOW-NEXT:    retq
+;
+; SSE3-FAST-LABEL: partial_reduction_add_v16i32:
+; SSE3-FAST:       # %bb.0:
+; SSE3-FAST-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE3-FAST-NEXT:    paddd %xmm0, %xmm1
+; SSE3-FAST-NEXT:    phaddd %xmm1, %xmm1
+; SSE3-FAST-NEXT:    movd %xmm1, %eax
+; SSE3-FAST-NEXT:    retq
+;
+; AVX-SLOW-LABEL: partial_reduction_add_v16i32:
+; AVX-SLOW:       # %bb.0:
+; AVX-SLOW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX-SLOW-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
+; AVX-SLOW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX-SLOW-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
+; AVX-SLOW-NEXT:    vmovd %xmm0, %eax
+; AVX-SLOW-NEXT:    vzeroupper
+; AVX-SLOW-NEXT:    retq
+;
+; AVX1-FAST-LABEL: partial_reduction_add_v16i32:
+; AVX1-FAST:       # %bb.0:
+; AVX1-FAST-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX1-FAST-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
+; AVX1-FAST-NEXT:    vphaddd %xmm0, %xmm0, %xmm0
+; AVX1-FAST-NEXT:    vmovd %xmm0, %eax
+; AVX1-FAST-NEXT:    vzeroupper
+; AVX1-FAST-NEXT:    retq
+;
+; AVX2-FAST-LABEL: partial_reduction_add_v16i32:
+; AVX2-FAST:       # %bb.0:
+; AVX2-FAST-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX2-FAST-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
+; AVX2-FAST-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX2-FAST-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
+; AVX2-FAST-NEXT:    vmovd %xmm0, %eax
+; AVX2-FAST-NEXT:    vzeroupper
+; AVX2-FAST-NEXT:    retq
+;
+; AVX512-FAST-LABEL: partial_reduction_add_v16i32:
+; AVX512-FAST:       # %bb.0:
+; AVX512-FAST-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512-FAST-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
+; AVX512-FAST-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX512-FAST-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
+; AVX512-FAST-NEXT:    vmovd %xmm0, %eax
+; AVX512-FAST-NEXT:    vzeroupper
+; AVX512-FAST-NEXT:    retq
+  %x23 = shufflevector <16 x i32> %x, <16 x i32> undef, <16 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %x0213 = add <16 x i32> %x, %x23
+  %x13 = shufflevector <16 x i32> %x0213, <16 x i32> undef, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %x0123 = add <16 x i32> %x0213, %x13
+  %r = extractelement <16 x i32> %x0123, i32 0
+  ret i32 %r
+}
+
+define i32 @partial_reduction_sub_v8i32(<8 x i32> %x) {
+; SSE3-SLOW-LABEL: partial_reduction_sub_v8i32:
+; SSE3-SLOW:       # %bb.0:
+; SSE3-SLOW-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE3-SLOW-NEXT:    psubd %xmm1, %xmm0
+; SSE3-SLOW-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; SSE3-SLOW-NEXT:    psubd %xmm1, %xmm0
+; SSE3-SLOW-NEXT:    movd %xmm0, %eax
+; SSE3-SLOW-NEXT:    retq
+;
+; SSE3-FAST-LABEL: partial_reduction_sub_v8i32:
+; SSE3-FAST:       # %bb.0:
+; SSE3-FAST-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE3-FAST-NEXT:    psubd %xmm1, %xmm0
+; SSE3-FAST-NEXT:    phsubd %xmm0, %xmm0
+; SSE3-FAST-NEXT:    movd %xmm0, %eax
+; SSE3-FAST-NEXT:    retq
+;
+; AVX-SLOW-LABEL: partial_reduction_sub_v8i32:
+; AVX-SLOW:       # %bb.0:
+; AVX-SLOW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX-SLOW-NEXT:    vpsubd %xmm1, %xmm0, %xmm0
+; AVX-SLOW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX-SLOW-NEXT:    vpsubd %xmm1, %xmm0, %xmm0
+; AVX-SLOW-NEXT:    vmovd %xmm0, %eax
+; AVX-SLOW-NEXT:    vzeroupper
+; AVX-SLOW-NEXT:    retq
+;
+; AVX1-FAST-LABEL: partial_reduction_sub_v8i32:
+; AVX1-FAST:       # %bb.0:
+; AVX1-FAST-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX1-FAST-NEXT:    vpsubd %xmm1, %xmm0, %xmm0
+; AVX1-FAST-NEXT:    vphsubd %xmm0, %xmm0, %xmm0
+; AVX1-FAST-NEXT:    vmovd %xmm0, %eax
+; AVX1-FAST-NEXT:    vzeroupper
+; AVX1-FAST-NEXT:    retq
+;
+; AVX2-FAST-LABEL: partial_reduction_sub_v8i32:
+; AVX2-FAST:       # %bb.0:
+; AVX2-FAST-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX2-FAST-NEXT:    vpsubd %ymm1, %ymm0, %ymm0
+; AVX2-FAST-NEXT:    vphsubd %ymm0, %ymm0, %ymm0
+; AVX2-FAST-NEXT:    vmovd %xmm0, %eax
+; AVX2-FAST-NEXT:    vzeroupper
+; AVX2-FAST-NEXT:    retq
+;
+; AVX512-FAST-LABEL: partial_reduction_sub_v8i32:
+; AVX512-FAST:       # %bb.0:
+; AVX512-FAST-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512-FAST-NEXT:    vpsubd %ymm1, %ymm0, %ymm0
+; AVX512-FAST-NEXT:    vphsubd %ymm0, %ymm0, %ymm0
+; AVX512-FAST-NEXT:    vmovd %xmm0, %eax
+; AVX512-FAST-NEXT:    vzeroupper
+; AVX512-FAST-NEXT:    retq
+  %x23 = shufflevector <8 x i32> %x, <8 x i32> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %x0213 = sub <8 x i32> %x, %x23
+  %x13 = shufflevector <8 x i32> %x0213, <8 x i32> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %x0123 = sub <8 x i32> %x0213, %x13
+  %r = extractelement <8 x i32> %x0123, i32 0
+  ret i32 %r
+}
+
+define i32 @partial_reduction_sub_v16i32(<16 x i32> %x) {
+; SSE3-SLOW-LABEL: partial_reduction_sub_v16i32:
+; SSE3-SLOW:       # %bb.0:
+; SSE3-SLOW-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE3-SLOW-NEXT:    psubd %xmm1, %xmm0
+; SSE3-SLOW-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; SSE3-SLOW-NEXT:    psubd %xmm1, %xmm0
+; SSE3-SLOW-NEXT:    movd %xmm0, %eax
+; SSE3-SLOW-NEXT:    retq
+;
+; SSE3-FAST-LABEL: partial_reduction_sub_v16i32:
+; SSE3-FAST:       # %bb.0:
+; SSE3-FAST-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE3-FAST-NEXT:    psubd %xmm1, %xmm0
+; SSE3-FAST-NEXT:    phsubd %xmm0, %xmm0
+; SSE3-FAST-NEXT:    movd %xmm0, %eax
+; SSE3-FAST-NEXT:    retq
+;
+; AVX-SLOW-LABEL: partial_reduction_sub_v16i32:
+; AVX-SLOW:       # %bb.0:
+; AVX-SLOW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX-SLOW-NEXT:    vpsubd %xmm1, %xmm0, %xmm0
+; AVX-SLOW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX-SLOW-NEXT:    vpsubd %xmm1, %xmm0, %xmm0
+; AVX-SLOW-NEXT:    vmovd %xmm0, %eax
+; AVX-SLOW-NEXT:    vzeroupper
+; AVX-SLOW-NEXT:    retq
+;
+; AVX1-FAST-LABEL: partial_reduction_sub_v16i32:
+; AVX1-FAST:       # %bb.0:
+; AVX1-FAST-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX1-FAST-NEXT:    vpsubd %xmm1, %xmm0, %xmm0
+; AVX1-FAST-NEXT:    vphsubd %xmm0, %xmm0, %xmm0
+; AVX1-FAST-NEXT:    vmovd %xmm0, %eax
+; AVX1-FAST-NEXT:    vzeroupper
+; AVX1-FAST-NEXT:    retq
+;
+; AVX2-FAST-LABEL: partial_reduction_sub_v16i32:
+; AVX2-FAST:       # %bb.0:
+; AVX2-FAST-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX2-FAST-NEXT:    vpsubd %xmm1, %xmm0, %xmm0
+; AVX2-FAST-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX2-FAST-NEXT:    vpsubd %xmm1, %xmm0, %xmm0
+; AVX2-FAST-NEXT:    vmovd %xmm0, %eax
+; AVX2-FAST-NEXT:    vzeroupper
+; AVX2-FAST-NEXT:    retq
+;
+; AVX512-FAST-LABEL: partial_reduction_sub_v16i32:
+; AVX512-FAST:       # %bb.0:
+; AVX512-FAST-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512-FAST-NEXT:    vpsubd %xmm1, %xmm0, %xmm0
+; AVX512-FAST-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX512-FAST-NEXT:    vpsubd %xmm1, %xmm0, %xmm0
+; AVX512-FAST-NEXT:    vmovd %xmm0, %eax
+; AVX512-FAST-NEXT:    vzeroupper
+; AVX512-FAST-NEXT:    retq
+  %x23 = shufflevector <16 x i32> %x, <16 x i32> undef, <16 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %x0213 = sub <16 x i32> %x, %x23
+  %x13 = shufflevector <16 x i32> %x0213, <16 x i32> undef, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %x0123 = sub <16 x i32> %x0213, %x13
+  %r = extractelement <16 x i32> %x0123, i32 0
+  ret i32 %r
+}
+
-- 
2.40.0
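
Note on regenerating these checks: the SLOW/FAST prefix pairs above distinguish llc runs without and with the fast-hops target attribute, and the CHECK lines in both files are the kind produced by the update_llc_test_checks.py script. The actual RUN lines live in the unchanged headers of haddsub.ll and phaddsub-extract.ll and are not part of this diff; the sketch below is an assumed, illustrative configuration only (the exact prefix grouping and extra AVX1/AVX2/AVX512 runs in phaddsub-extract.ll differ).

; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse3           | FileCheck %s --check-prefixes=SSE3-SLOW
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse3,fast-hops | FileCheck %s --check-prefixes=SSE3-FAST
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx            | FileCheck %s --check-prefixes=AVX-SLOW
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx,fast-hops  | FileCheck %s --check-prefixes=AVX-FAST
;
; After changing the IR, the assembly CHECK lines are refreshed with:
;   utils/update_llc_test_checks.py test/CodeGen/X86/haddsub.ll test/CodeGen/X86/phaddsub-extract.ll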