From a22f9f24054f0525fe1090b3fa665bd4fc0921c3 Mon Sep 17 00:00:00 2001
From: Simon Pilgrim
Date: Sat, 9 Sep 2017 18:18:17 +0000
Subject: [PATCH] [X86][SSE] i32 vector multiplications test cases from PR6399

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@312868 91177308-0d34-0410-b5e6-96231b3b80d8
---
 test/CodeGen/X86/mulvi32.ll | 472 ++++++++++++++++++++++++++++++++++++
 1 file changed, 472 insertions(+)
 create mode 100644 test/CodeGen/X86/mulvi32.ll

diff --git a/test/CodeGen/X86/mulvi32.ll b/test/CodeGen/X86/mulvi32.ll
new file mode 100644
index 00000000000..58744ad553f
--- /dev/null
+++ b/test/CodeGen/X86/mulvi32.ll
@@ -0,0 +1,472 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+sse2 | FileCheck %s --check-prefix=SSE --check-prefix=SSE2
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+sse4.2 | FileCheck %s --check-prefix=SSE --check-prefix=SSE42
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx | FileCheck %s --check-prefix=AVX --check-prefix=AVX1
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX --check-prefix=AVX2
+
+; PR6399
+
+define <2 x i32> @_mul2xi32a(<2 x i32>, <2 x i32>) {
+; SSE-LABEL: _mul2xi32a:
+; SSE: # BB#0:
+; SSE-NEXT: movdqa %xmm0, %xmm2
+; SSE-NEXT: psrlq $32, %xmm2
+; SSE-NEXT: pmuludq %xmm1, %xmm2
+; SSE-NEXT: movdqa %xmm1, %xmm3
+; SSE-NEXT: psrlq $32, %xmm3
+; SSE-NEXT: pmuludq %xmm0, %xmm3
+; SSE-NEXT: paddq %xmm2, %xmm3
+; SSE-NEXT: psllq $32, %xmm3
+; SSE-NEXT: pmuludq %xmm1, %xmm0
+; SSE-NEXT: paddq %xmm3, %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: _mul2xi32a:
+; AVX: # BB#0:
+; AVX-NEXT: vpsrlq $32, %xmm0, %xmm2
+; AVX-NEXT: vpmuludq %xmm1, %xmm2, %xmm2
+; AVX-NEXT: vpsrlq $32, %xmm1, %xmm3
+; AVX-NEXT: vpmuludq %xmm3, %xmm0, %xmm3
+; AVX-NEXT: vpaddq %xmm2, %xmm3, %xmm2
+; AVX-NEXT: vpsllq $32, %xmm2, %xmm2
+; AVX-NEXT: vpmuludq %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vpaddq %xmm2, %xmm0, %xmm0
+; AVX-NEXT: retq
+  %r = mul <2 x i32> %0, %1
+  ret <2 x i32> %r
+}
+
+define <2 x i32> @_mul2xi32b(<2 x i32>, <2 x i32>) {
+; SSE2-LABEL: _mul2xi32b:
+; SSE2: # BB#0:
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; SSE2-NEXT: pmuludq %xmm0, %xmm1
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,1,1,3]
+; SSE2-NEXT: retq
+;
+; SSE42-LABEL: _mul2xi32b:
+; SSE42: # BB#0:
+; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; SSE42-NEXT: pmuludq %xmm0, %xmm1
+; SSE42-NEXT: pmovzxdq {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero
+; SSE42-NEXT: retq
+;
+; AVX-LABEL: _mul2xi32b:
+; AVX: # BB#0:
+; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; AVX-NEXT: vpmuludq %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
+; AVX-NEXT: retq
+  %factor0 = shufflevector <2 x i32> %0, <2 x i32> undef, <4 x i32> <i32 0, i32 undef, i32 1, i32 undef>
+  %factor1 = shufflevector <2 x i32> %1, <2 x i32> undef, <4 x i32> <i32 0, i32 undef, i32 1, i32 undef>
+  %product64 = call <2 x i64> @llvm.x86.sse2.pmulu.dq(<4 x i32> %factor0, <4 x i32> %factor1) readnone
+  %product = bitcast <2 x i64> %product64 to <4 x i32>
+  %r = shufflevector <4 x i32> %product, <4 x i32> undef, <2 x i32> <i32 0, i32 2>
+  ret <2 x i32> %r
+}
+
+define <4 x i32> @_mul4xi32a(<4 x i32>, <4 x i32>) {
+; SSE2-LABEL: _mul4xi32a:
+; SSE2: # BB#0:
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
+; SSE2-NEXT: pmuludq %xmm1, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; SSE2-NEXT: pmuludq %xmm2, %xmm1
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE2-NEXT: retq
+;
+; SSE42-LABEL: _mul4xi32a:
+; SSE42: # BB#0:
+; SSE42-NEXT: pmulld %xmm1, %xmm0
+; SSE42-NEXT: retq
+;
+; AVX-LABEL: _mul4xi32a:
+; AVX: # BB#0:
+; AVX-NEXT: vpmulld %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retq
+  %r = mul <4 x i32> %0, %1
+  ret <4 x i32> %r
+}
+
+define <4 x i32> @_mul4xi32b(<4 x i32>, <4 x i32>) {
+; SSE2-LABEL: _mul4xi32b:
+; SSE2: # BB#0:
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
+; SSE2-NEXT: pmuludq %xmm1, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; SSE2-NEXT: pmuludq %xmm2, %xmm1
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE2-NEXT: retq
+;
+; SSE42-LABEL: _mul4xi32b:
+; SSE42: # BB#0:
+; SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
+; SSE42-NEXT: pmuludq %xmm1, %xmm0
+; SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; SSE42-NEXT: pmuludq %xmm2, %xmm1
+; SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,2,2]
+; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
+; SSE42-NEXT: retq
+;
+; AVX1-LABEL: _mul4xi32b:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpmuludq %xmm1, %xmm0, %xmm2
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; AVX1-NEXT: vpmuludq %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,2,2]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3],xmm2[4,5],xmm0[6,7]
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: _mul4xi32b:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpmuludq %xmm1, %xmm0, %xmm2
+; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; AVX2-NEXT: vpmuludq %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,2,2]
+; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2],xmm0[3]
+; AVX2-NEXT: retq
+  %even0 = shufflevector <4 x i32> %0, <4 x i32> undef, <4 x i32> <i32 0, i32 undef, i32 2, i32 undef>
+  %even1 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 0, i32 undef, i32 2, i32 undef>
+  %evenMul64 = call <2 x i64> @llvm.x86.sse2.pmulu.dq(<4 x i32> %even0, <4 x i32> %even1) readnone
+  %evenMul = bitcast <2 x i64> %evenMul64 to <4 x i32>
+  %odd0 = shufflevector <4 x i32> %0, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 3, i32 undef>
+  %odd1 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 3, i32 undef>
+  %oddMul64 = call <2 x i64> @llvm.x86.sse2.pmulu.dq(<4 x i32> %odd0, <4 x i32> %odd1) readnone
+  %oddMul = bitcast <2 x i64> %oddMul64 to <4 x i32>
+  %r = shufflevector <4 x i32> %evenMul, <4 x i32> %oddMul, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
+  ret <4 x i32> %r
+}
+
+; the following extractelements and insertelements
+; are just an unrolled 'zext' on a vector
+; %ext0 = zext <4 x i32> %0 to <4 x i64>
+; %ext1 = zext <4 x i32> %1 to <4 x i64>
+define <4 x i64> @_mul4xi32toi64a(<4 x i32>, <4 x i32>) {
+; SSE2-LABEL: _mul4xi32toi64a:
+; SSE2: # BB#0:
+; SSE2-NEXT: movq %xmm1, %rax
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; SSE2-NEXT: movq %xmm1, %rcx
+; SSE2-NEXT: movd %ecx, %xmm1
+; SSE2-NEXT: shrq $32, %rcx
+; SSE2-NEXT: movq %xmm0, %rdx
+; SSE2-NEXT: movd %edx, %xmm2
+; SSE2-NEXT: shrq $32, %rdx
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; SSE2-NEXT: movq %xmm0, %rsi
+; SSE2-NEXT: movd %esi, %xmm3
+; SSE2-NEXT: shrq $32, %rsi
+; SSE2-NEXT: movd %esi, %xmm0
+; SSE2-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm0[0]
+; SSE2-NEXT: movd %edx, %xmm0
+; SSE2-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm0[0]
+; SSE2-NEXT: movd %ecx, %xmm0
+; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
+; SSE2-NEXT: movd %eax, %xmm0
+; SSE2-NEXT: shrq $32, %rax
+; SSE2-NEXT: pmuludq %xmm3, %xmm1
+; SSE2-NEXT: movd %eax, %xmm3
+; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm3[0]
+; SSE2-NEXT: pmuludq %xmm2, %xmm0
+; SSE2-NEXT: retq
+;
+; SSE42-LABEL: _mul4xi32toi64a:
+; SSE42: # BB#0:
+; SSE42-NEXT: movq %xmm1, %rax
+; SSE42-NEXT: pextrq $1, %xmm1, %rcx
+; SSE42-NEXT: movd %ecx, %xmm1
+; SSE42-NEXT: shrq $32, %rcx
+; SSE42-NEXT: movq %xmm0, %rdx
+; SSE42-NEXT: movd %edx, %xmm2
+; SSE42-NEXT: shrq $32, %rdx
+; SSE42-NEXT: pextrq $1, %xmm0, %rsi
+; SSE42-NEXT: movd %esi, %xmm3
+; SSE42-NEXT: shrq $32, %rsi
+; SSE42-NEXT: movd %esi, %xmm0
+; SSE42-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm0[0]
+; SSE42-NEXT: movd %edx, %xmm0
+; SSE42-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm0[0]
+; SSE42-NEXT: movd %ecx, %xmm0
+; SSE42-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
+; SSE42-NEXT: movd %eax, %xmm0
+; SSE42-NEXT: shrq $32, %rax
+; SSE42-NEXT: pmuludq %xmm3, %xmm1
+; SSE42-NEXT: movd %eax, %xmm3
+; SSE42-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm3[0]
+; SSE42-NEXT: pmuludq %xmm2, %xmm0
+; SSE42-NEXT: retq
+;
+; AVX1-LABEL: _mul4xi32toi64a:
+; AVX1: # BB#0:
+; AVX1-NEXT: vmovq %xmm0, %rax
+; AVX1-NEXT: vmovd %eax, %xmm2
+; AVX1-NEXT: shrq $32, %rax
+; AVX1-NEXT: vmovq %xmm1, %rcx
+; AVX1-NEXT: vpextrq $1, %xmm0, %rdx
+; AVX1-NEXT: vmovd %edx, %xmm0
+; AVX1-NEXT: shrq $32, %rdx
+; AVX1-NEXT: vpextrq $1, %xmm1, %rsi
+; AVX1-NEXT: vmovd %esi, %xmm1
+; AVX1-NEXT: shrq $32, %rsi
+; AVX1-NEXT: vmovd %esi, %xmm3
+; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0]
+; AVX1-NEXT: vmovd %edx, %xmm3
+; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm3[0]
+; AVX1-NEXT: vmovd %ecx, %xmm3
+; AVX1-NEXT: shrq $32, %rcx
+; AVX1-NEXT: vpmuludq %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vmovd %ecx, %xmm1
+; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm3[0],xmm1[0]
+; AVX1-NEXT: vmovd %eax, %xmm3
+; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
+; AVX1-NEXT: vpmuludq %xmm1, %xmm2, %xmm1
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: _mul4xi32toi64a:
+; AVX2: # BB#0:
+; AVX2-NEXT: vmovq %xmm1, %rax
+; AVX2-NEXT: vmovd %eax, %xmm2
+; AVX2-NEXT: shrq $32, %rax
+; AVX2-NEXT: vpextrq $1, %xmm1, %rcx
+; AVX2-NEXT: vmovq %xmm0, %rdx
+; AVX2-NEXT: vmovd %edx, %xmm1
+; AVX2-NEXT: shrq $32, %rdx
+; AVX2-NEXT: vpextrq $1, %xmm0, %rsi
+; AVX2-NEXT: vmovd %esi, %xmm0
+; AVX2-NEXT: shrq $32, %rsi
+; AVX2-NEXT: vmovd %esi, %xmm3
+; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm3[0]
+; AVX2-NEXT: vmovd %edx, %xmm3
+; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0]
+; AVX2-NEXT: vmovd %ecx, %xmm3
+; AVX2-NEXT: shrq $32, %rcx
+; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX2-NEXT: vmovd %ecx, %xmm1
+; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm3[0],xmm1[0]
+; AVX2-NEXT: vmovd %eax, %xmm3
+; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
+; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1
+; AVX2-NEXT: vpmuludq %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: retq
+  %f00 = extractelement <4 x i32> %0, i32 0
+  %f01 = extractelement <4 x i32> %0, i32 1
+  %f02 = extractelement <4 x i32> %0, i32 2
+  %f03 = extractelement <4 x i32> %0, i32 3
+  %f10 = extractelement <4 x i32> %1, i32 0
+  %f11 = extractelement <4 x i32> %1, i32 1
+  %f12 = extractelement <4 x i32> %1, i32 2
+  %f13 = extractelement <4 x i32> %1, i32 3
+  %ext00 = zext i32 %f00 to i64
+  %ext01 = zext i32 %f01 to i64
+  %ext02 = zext i32 %f02 to i64
+  %ext03 = zext i32 %f03 to i64
+  %ext10 = zext i32 %f10 to i64
+  %ext11 = zext i32 %f11 to i64
+  %ext12 = zext i32 %f12 to i64
+  %ext13 = zext i32 %f13 to i64
+  %extv00 = insertelement <4 x i64> undef, i64 %ext00, i32 0
+  %extv01 = insertelement <4 x i64> %extv00, i64 %ext01, i32 1
+  %extv02 = insertelement <4 x i64> %extv01, i64 %ext02, i32 2
+  %extv03 = insertelement <4 x i64> %extv02, i64 %ext03, i32 3
+  %extv10 = insertelement <4 x i64> undef, i64 %ext10, i32 0
+  %extv11 = insertelement <4 x i64> %extv10, i64 %ext11, i32 1
+  %extv12 = insertelement <4 x i64> %extv11, i64 %ext12, i32 2
+  %extv13 = insertelement <4 x i64> %extv12, i64 %ext13, i32 3
+  %r = mul <4 x i64> %extv03, %extv13
+  ret <4 x i64> %r
+}
+
+; very similar to _mul4xi32b above
+; there is no bitcast and the final shuffle is a little different
+define <4 x i64> @_mul4xi32toi64b(<4 x i32>, <4 x i32>) {
+; SSE-LABEL: _mul4xi32toi64b:
+; SSE: # BB#0:
+; SSE-NEXT: movdqa %xmm0, %xmm2
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3]
+; SSE-NEXT: pmuludq %xmm1, %xmm2
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; SSE-NEXT: pmuludq %xmm0, %xmm1
+; SSE-NEXT: movdqa %xmm2, %xmm0
+; SSE-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE-NEXT: punpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm1[1]
+; SSE-NEXT: movdqa %xmm2, %xmm1
+; SSE-NEXT: retq
+;
+; AVX1-LABEL: _mul4xi32toi64b:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpmuludq %xmm1, %xmm0, %xmm2
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; AVX1-NEXT: vpmuludq %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm2[1],xmm0[1]
+; AVX1-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm2[0],xmm0[0]
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: _mul4xi32toi64b:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpmuludq %xmm1, %xmm0, %xmm2
+; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; AVX2-NEXT: vpmuludq %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm2, %ymm0
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
+; AVX2-NEXT: retq
+  %even0 = shufflevector <4 x i32> %0, <4 x i32> undef, <4 x i32> <i32 0, i32 undef, i32 2, i32 undef>
+  %even1 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 0, i32 undef, i32 2, i32 undef>
+  %evenMul = call <2 x i64> @llvm.x86.sse2.pmulu.dq(<4 x i32> %even0, <4 x i32> %even1) readnone
+  %odd0 = shufflevector <4 x i32> %0, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 3, i32 undef>
+  %odd1 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 3, i32 undef>
+  %oddMul = call <2 x i64> @llvm.x86.sse2.pmulu.dq(<4 x i32> %odd0, <4 x i32> %odd1) readnone
+  %r = shufflevector <2 x i64> %evenMul, <2 x i64> %oddMul, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
+  ret <4 x i64> %r
+}
+
+; Here we do not split into even and odd indexed elements
+; but into the lower and the upper half of the factor vectors.
+; This makes the initial shuffle more complicated,
+; but the final shuffle is a no-op.
+define <4 x i64> @_mul4xi32toi64c(<4 x i32>, <4 x i32>) {
+; SSE2-LABEL: _mul4xi32toi64c:
+; SSE2: # BB#0:
+; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[0,1,1,3]
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,1,1,3]
+; SSE2-NEXT: pmuludq %xmm3, %xmm2
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,3,3]
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,3,3]
+; SSE2-NEXT: pmuludq %xmm0, %xmm1
+; SSE2-NEXT: movdqa %xmm2, %xmm0
+; SSE2-NEXT: retq
+;
+; SSE42-LABEL: _mul4xi32toi64c:
+; SSE42: # BB#0:
+; SSE42-NEXT: pmovzxdq {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero
+; SSE42-NEXT: pmovzxdq {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero
+; SSE42-NEXT: pmuludq %xmm3, %xmm2
+; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,3,3]
+; SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,2,3,3]
+; SSE42-NEXT: pmuludq %xmm0, %xmm1
+; SSE42-NEXT: movdqa %xmm2, %xmm0
+; SSE42-NEXT: retq
+;
+; AVX1-LABEL: _mul4xi32toi64c:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero
+; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm1[0],zero,xmm1[1],zero
+; AVX1-NEXT: vpmuludq %xmm3, %xmm2, %xmm2
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,2,3,3]
+; AVX1-NEXT: vpmuludq %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: _mul4xi32toi64c:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero
+; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm1[0],zero,xmm1[1],zero
+; AVX2-NEXT: vpmuludq %xmm3, %xmm2, %xmm2
+; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3]
+; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,2,3,3]
+; AVX2-NEXT: vpmuludq %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm2, %ymm0
+; AVX2-NEXT: retq
+  %lower0 = shufflevector <4 x i32> %0, <4 x i32> undef, <4 x i32> <i32 0, i32 undef, i32 1, i32 undef>
+  %lower1 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 0, i32 undef, i32 1, i32 undef>
+  %lowerMul = call <2 x i64> @llvm.x86.sse2.pmulu.dq(<4 x i32> %lower0, <4 x i32> %lower1) readnone
+  %upper0 = shufflevector <4 x i32> %0, <4 x i32> undef, <4 x i32> <i32 2, i32 undef, i32 3, i32 undef>
+  %upper1 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 undef, i32 3, i32 undef>
+  %upperMul = call <2 x i64> @llvm.x86.sse2.pmulu.dq(<4 x i32> %upper0, <4 x i32> %upper1) readnone
+  %r = shufflevector <2 x i64> %lowerMul, <2 x i64> %upperMul, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  ret <4 x i64> %r
+}
+
+; If we know that the most significant half of each i64 element is zero,
+; then the multiplication can be simplified drastically.
+; In the following example I assert a zero upper half
+; by 'trunc' followed by 'zext'.
+;
+; the following extractelements and insertelements
+; are just an unrolled 'trunc' plus 'zext' on a vector
+; %trunc0 = trunc <2 x i64> %0 to <2 x i32>
+; %trunc1 = trunc <2 x i64> %1 to <2 x i32>
+; %ext0 = zext <2 x i32> %trunc0 to <2 x i64>
+; %ext1 = zext <2 x i32> %trunc1 to <2 x i64>
+define <2 x i64> @_mul2xi64toi64a(<2 x i64>, <2 x i64>) {
+; SSE2-LABEL: _mul2xi64toi64a:
+; SSE2: # BB#0:
+; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [4294967295,4294967295]
+; SSE2-NEXT: pand %xmm2, %xmm0
+; SSE2-NEXT: pand %xmm2, %xmm1
+; SSE2-NEXT: pmuludq %xmm1, %xmm0
+; SSE2-NEXT: retq
+;
+; SSE42-LABEL: _mul2xi64toi64a:
+; SSE42: # BB#0:
+; SSE42-NEXT: pxor %xmm2, %xmm2
+; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
+; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
+; SSE42-NEXT: pmuludq %xmm1, %xmm0
+; SSE42-NEXT: retq
+;
+; AVX1-LABEL: _mul2xi64toi64a:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
+; AVX1-NEXT: vpmuludq %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: _mul2xi64toi64a:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
+; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
+; AVX2-NEXT: vpmuludq %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: retq
+  %f00 = extractelement <2 x i64> %0, i32 0
+  %f01 = extractelement <2 x i64> %0, i32 1
+  %f10 = extractelement <2 x i64> %1, i32 0
+  %f11 = extractelement <2 x i64> %1, i32 1
+  %trunc00 = trunc i64 %f00 to i32
+  %trunc01 = trunc i64 %f01 to i32
+  %ext00 = zext i32 %trunc00 to i64
+  %ext01 = zext i32 %trunc01 to i64
+  %trunc10 = trunc i64 %f10 to i32
+  %trunc11 = trunc i64 %f11 to i32
+  %ext10 = zext i32 %trunc10 to i64
+  %ext11 = zext i32 %trunc11 to i64
+  %extv00 = insertelement <2 x i64> undef, i64 %ext00, i32 0
+  %extv01 = insertelement <2 x i64> %extv00, i64 %ext01, i32 1
+  %extv10 = insertelement <2 x i64> undef, i64 %ext10, i32 0
+  %extv11 = insertelement <2 x i64> %extv10, i64 %ext11, i32 1
+  %r = mul <2 x i64> %extv01, %extv11
+  ret <2 x i64> %r
+}
+
+define <2 x i64> @_mul2xi64toi64b(<2 x i64>, <2 x i64>) {
+; SSE-LABEL: _mul2xi64toi64b:
+; SSE: # BB#0:
+; SSE-NEXT: pmuludq %xmm1, %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: _mul2xi64toi64b:
+; AVX: # BB#0:
+; AVX-NEXT: vpmuludq %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retq
+  %f0 = bitcast <2 x i64> %0 to <4 x i32>
+  %f1 = bitcast <2 x i64> %1 to <4 x i32>
+  %r = call <2 x i64> @llvm.x86.sse2.pmulu.dq(<4 x i32> %f0, <4 x i32> %f1) readnone
+  ret <2 x i64> %r
+}
+
+declare <2 x i64> @llvm.x86.sse2.pmulu.dq(<4 x i32>, <4 x i32>) nounwind readnone
-- 
2.40.0
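
For reference (not part of the patch itself): the unrolled extractelement/insertelement sequences in @_mul4xi32toi64a and @_mul2xi64toi64a encode the vector-level patterns described in their comments. A compact IR sketch of those two patterns is given below; the function names are illustrative only and do not appear in mulvi32.ll.

; Compact form of the unrolled 'zext' pattern exercised by @_mul4xi32toi64a.
define <4 x i64> @zext_mul_sketch(<4 x i32> %a, <4 x i32> %b) {
  %ea = zext <4 x i32> %a to <4 x i64>
  %eb = zext <4 x i32> %b to <4 x i64>
  %r = mul <4 x i64> %ea, %eb
  ret <4 x i64> %r
}

; Compact form of the unrolled 'trunc' plus 'zext' pattern exercised by
; @_mul2xi64toi64a; the known-zero upper halves let the multiply lower to
; pmuludq plus input masking, as the checks above show.
define <2 x i64> @masked_mul_sketch(<2 x i64> %a, <2 x i64> %b) {
  %ta = trunc <2 x i64> %a to <2 x i32>
  %tb = trunc <2 x i64> %b to <2 x i32>
  %ea = zext <2 x i32> %ta to <2 x i64>
  %eb = zext <2 x i32> %tb to <2 x i64>
  %r = mul <2 x i64> %ea, %eb
  ret <2 x i64> %r
}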