From afebd3ffb54e5250d2a44a7a8c47f3dd0d5a604b Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Sun, 2 Jul 2017 19:32:37 +0000 Subject: [PATCH] [X86][AVX512VPOPCNTDQ] Improve support for v16i8/v8i16/v16i16/ CTPOP Zero extend to v16i32/v8i64, use VPOPCNTDQ instructions and truncate back. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@306990 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Target/X86/X86ISelLowering.cpp | 14 ++++ test/CodeGen/X86/vector-popcnt-128.ll | 93 ++++++++++++++++++++------- test/CodeGen/X86/vector-popcnt-256.ll | 14 +--- test/CodeGen/X86/vector-popcnt-512.ll | 26 ++------ test/CodeGen/X86/vector-tzcnt-128.ll | 54 +++++----------- test/CodeGen/X86/vector-tzcnt-256.ll | 28 ++------ test/CodeGen/X86/vector-tzcnt-512.ll | 52 ++++----------- 7 files changed, 125 insertions(+), 156 deletions(-) diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp index 45407114541..be80ecd50d5 100644 --- a/lib/Target/X86/X86ISelLowering.cpp +++ b/lib/Target/X86/X86ISelLowering.cpp @@ -23203,6 +23203,20 @@ static SDValue LowerVectorCTPOP(SDValue Op, const X86Subtarget &Subtarget, SDLoc DL(Op.getNode()); SDValue Op0 = Op.getOperand(0); + // TRUNC(CTPOP(ZEXT(X))) to make use of vXi32/vXi64 VPOPCNT instructions. + if (Subtarget.hasVPOPCNTDQ()) { + if (VT == MVT::v8i16) { + Op = DAG.getNode(X86ISD::VZEXT, DL, MVT::v8i64, Op0); + Op = DAG.getNode(ISD::CTPOP, DL, MVT::v8i64, Op); + return DAG.getNode(X86ISD::VTRUNC, DL, VT, Op); + } + if (VT == MVT::v16i8 || VT == MVT::v16i16) { + Op = DAG.getNode(X86ISD::VZEXT, DL, MVT::v16i32, Op0); + Op = DAG.getNode(ISD::CTPOP, DL, MVT::v16i32, Op); + return DAG.getNode(X86ISD::VTRUNC, DL, VT, Op); + } + } + if (!Subtarget.hasSSSE3()) { // We can't use the fast LUT approach, so fall back on vectorized bitmath. assert(VT.is128BitVector() && "Only 128-bit vectors supported in SSE!"); diff --git a/test/CodeGen/X86/vector-popcnt-128.ll b/test/CodeGen/X86/vector-popcnt-128.ll index adda108bdc7..d2f33785530 100644 --- a/test/CodeGen/X86/vector-popcnt-128.ll +++ b/test/CodeGen/X86/vector-popcnt-128.ll @@ -344,20 +344,43 @@ define <8 x i16> @testv8i16(<8 x i16> %in) nounwind { ; SSE41-NEXT: psrlw $8, %xmm0 ; SSE41-NEXT: retq ; -; AVX-LABEL: testv8i16: -; AVX: # BB#0: -; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX-NEXT: vpand %xmm1, %xmm0, %xmm2 -; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX-NEXT: vpshufb %xmm2, %xmm3, %xmm2 -; AVX-NEXT: vpsrlw $4, %xmm0, %xmm0 -; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpshufb %xmm0, %xmm3, %xmm0 -; AVX-NEXT: vpaddb %xmm2, %xmm0, %xmm0 -; AVX-NEXT: vpsllw $8, %xmm0, %xmm1 -; AVX-NEXT: vpaddb %xmm0, %xmm1, %xmm0 -; AVX-NEXT: vpsrlw $8, %xmm0, %xmm0 -; AVX-NEXT: retq +; AVX1-LABEL: testv8i16: +; AVX1: # BB#0: +; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0 +; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 +; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpsllw $8, %xmm0, %xmm1 +; AVX1-NEXT: vpaddb %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: testv8i16: +; AVX2: # BB#0: +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 +; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 +; AVX2-NEXT: vpsrlw $4, %xmm0, %xmm0 +; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpshufb %xmm0, %xmm3, %xmm0 +; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpsllw $8, %xmm0, %xmm1 +; AVX2-NEXT: vpaddb %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: vpsrlw $8, %xmm0, %xmm0 +; AVX2-NEXT: retq +; +; AVX512VPOPCNTDQ-LABEL: testv8i16: +; AVX512VPOPCNTDQ: # BB#0: +; AVX512VPOPCNTDQ-NEXT: vpmovzxwq {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero +; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 +; AVX512VPOPCNTDQ-NEXT: vpmovqw %zmm0, %xmm0 +; AVX512VPOPCNTDQ-NEXT: vzeroupper +; AVX512VPOPCNTDQ-NEXT: retq %out = call <8 x i16> @llvm.ctpop.v8i16(<8 x i16> %in) ret <8 x i16> %out } @@ -431,17 +454,37 @@ define <16 x i8> @testv16i8(<16 x i8> %in) nounwind { ; SSE41-NEXT: movdqa %xmm1, %xmm0 ; SSE41-NEXT: retq ; -; AVX-LABEL: testv16i8: -; AVX: # BB#0: -; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX-NEXT: vpand %xmm1, %xmm0, %xmm2 -; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX-NEXT: vpshufb %xmm2, %xmm3, %xmm2 -; AVX-NEXT: vpsrlw $4, %xmm0, %xmm0 -; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpshufb %xmm0, %xmm3, %xmm0 -; AVX-NEXT: vpaddb %xmm2, %xmm0, %xmm0 -; AVX-NEXT: retq +; AVX1-LABEL: testv16i8: +; AVX1: # BB#0: +; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0 +; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 +; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: testv16i8: +; AVX2: # BB#0: +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 +; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 +; AVX2-NEXT: vpsrlw $4, %xmm0, %xmm0 +; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpshufb %xmm0, %xmm3, %xmm0 +; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: retq +; +; AVX512VPOPCNTDQ-LABEL: testv16i8: +; AVX512VPOPCNTDQ: # BB#0: +; AVX512VPOPCNTDQ-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero +; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm0, %zmm0 +; AVX512VPOPCNTDQ-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512VPOPCNTDQ-NEXT: vzeroupper +; AVX512VPOPCNTDQ-NEXT: retq %out = call <16 x i8> @llvm.ctpop.v16i8(<16 x i8> %in) ret <16 x i8> %out } diff --git a/test/CodeGen/X86/vector-popcnt-256.ll b/test/CodeGen/X86/vector-popcnt-256.ll index accbad35e9d..4c5de2fed38 100644 --- a/test/CodeGen/X86/vector-popcnt-256.ll +++ b/test/CodeGen/X86/vector-popcnt-256.ll @@ -155,17 +155,9 @@ define <16 x i16> @testv16i16(<16 x i16> %in) nounwind { ; ; AVX512VPOPCNTDQ-LABEL: testv16i16: ; AVX512VPOPCNTDQ: # BB#0: -; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX512VPOPCNTDQ-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm2, %ymm3, %ymm2 -; AVX512VPOPCNTDQ-NEXT: vpsrlw $4, %ymm0, %ymm0 -; AVX512VPOPCNTDQ-NEXT: vpand %ymm1, %ymm0, %ymm0 -; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm0, %ymm3, %ymm0 -; AVX512VPOPCNTDQ-NEXT: vpaddb %ymm2, %ymm0, %ymm0 -; AVX512VPOPCNTDQ-NEXT: vpsllw $8, %ymm0, %ymm1 -; AVX512VPOPCNTDQ-NEXT: vpaddb %ymm0, %ymm1, %ymm0 -; AVX512VPOPCNTDQ-NEXT: vpsrlw $8, %ymm0, %ymm0 +; AVX512VPOPCNTDQ-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero +; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm0, %zmm0 +; AVX512VPOPCNTDQ-NEXT: vpmovdw %zmm0, %ymm0 ; AVX512VPOPCNTDQ-NEXT: retq %out = call <16 x i16> @llvm.ctpop.v16i16(<16 x i16> %in) ret <16 x i16> %out diff --git a/test/CodeGen/X86/vector-popcnt-512.ll b/test/CodeGen/X86/vector-popcnt-512.ll index 735487141ef..1e3f81a9483 100644 --- a/test/CodeGen/X86/vector-popcnt-512.ll +++ b/test/CodeGen/X86/vector-popcnt-512.ll @@ -149,26 +149,12 @@ define <32 x i16> @testv32i16(<32 x i16> %in) nounwind { ; ; AVX512VPOPCNTDQ-LABEL: testv32i16: ; AVX512VPOPCNTDQ: # BB#0: -; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX512VPOPCNTDQ-NEXT: vpand %ymm2, %ymm0, %ymm3 -; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm3, %ymm4, %ymm3 -; AVX512VPOPCNTDQ-NEXT: vpsrlw $4, %ymm0, %ymm0 -; AVX512VPOPCNTDQ-NEXT: vpand %ymm2, %ymm0, %ymm0 -; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm0, %ymm4, %ymm0 -; AVX512VPOPCNTDQ-NEXT: vpaddb %ymm3, %ymm0, %ymm0 -; AVX512VPOPCNTDQ-NEXT: vpsllw $8, %ymm0, %ymm3 -; AVX512VPOPCNTDQ-NEXT: vpaddb %ymm0, %ymm3, %ymm0 -; AVX512VPOPCNTDQ-NEXT: vpsrlw $8, %ymm0, %ymm0 -; AVX512VPOPCNTDQ-NEXT: vpand %ymm2, %ymm1, %ymm3 -; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm3, %ymm4, %ymm3 -; AVX512VPOPCNTDQ-NEXT: vpsrlw $4, %ymm1, %ymm1 -; AVX512VPOPCNTDQ-NEXT: vpand %ymm2, %ymm1, %ymm1 -; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm1, %ymm4, %ymm1 -; AVX512VPOPCNTDQ-NEXT: vpaddb %ymm3, %ymm1, %ymm1 -; AVX512VPOPCNTDQ-NEXT: vpsllw $8, %ymm1, %ymm2 -; AVX512VPOPCNTDQ-NEXT: vpaddb %ymm1, %ymm2, %ymm1 -; AVX512VPOPCNTDQ-NEXT: vpsrlw $8, %ymm1, %ymm1 +; AVX512VPOPCNTDQ-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero +; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm0, %zmm0 +; AVX512VPOPCNTDQ-NEXT: vpmovdw %zmm0, %ymm0 +; AVX512VPOPCNTDQ-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero +; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm1, %zmm1 +; AVX512VPOPCNTDQ-NEXT: vpmovdw %zmm1, %ymm1 ; AVX512VPOPCNTDQ-NEXT: retq %out = call <32 x i16> @llvm.ctpop.v32i16(<32 x i16> %in) ret <32 x i16> %out diff --git a/test/CodeGen/X86/vector-tzcnt-128.ll b/test/CodeGen/X86/vector-tzcnt-128.ll index 4b5a00a30d0..820178d2d99 100644 --- a/test/CodeGen/X86/vector-tzcnt-128.ll +++ b/test/CodeGen/X86/vector-tzcnt-128.ll @@ -928,17 +928,10 @@ define <8 x i16> @testv8i16(<8 x i16> %in) nounwind { ; AVX512VPOPCNTDQ-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX512VPOPCNTDQ-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 ; AVX512VPOPCNTDQ-NEXT: vpaddw %xmm1, %xmm0, %xmm0 -; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX512VPOPCNTDQ-NEXT: vpand %xmm1, %xmm0, %xmm2 -; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512VPOPCNTDQ-NEXT: vpshufb %xmm2, %xmm3, %xmm2 -; AVX512VPOPCNTDQ-NEXT: vpsrlw $4, %xmm0, %xmm0 -; AVX512VPOPCNTDQ-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX512VPOPCNTDQ-NEXT: vpshufb %xmm0, %xmm3, %xmm0 -; AVX512VPOPCNTDQ-NEXT: vpaddb %xmm2, %xmm0, %xmm0 -; AVX512VPOPCNTDQ-NEXT: vpsllw $8, %xmm0, %xmm1 -; AVX512VPOPCNTDQ-NEXT: vpaddb %xmm0, %xmm1, %xmm0 -; AVX512VPOPCNTDQ-NEXT: vpsrlw $8, %xmm0, %xmm0 +; AVX512VPOPCNTDQ-NEXT: vpmovzxwq {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero +; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 +; AVX512VPOPCNTDQ-NEXT: vpmovqw %zmm0, %xmm0 +; AVX512VPOPCNTDQ-NEXT: vzeroupper ; AVX512VPOPCNTDQ-NEXT: retq ; ; X32-SSE-LABEL: testv8i16: @@ -1095,17 +1088,10 @@ define <8 x i16> @testv8i16u(<8 x i16> %in) nounwind { ; AVX512VPOPCNTDQ-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX512VPOPCNTDQ-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 ; AVX512VPOPCNTDQ-NEXT: vpaddw %xmm1, %xmm0, %xmm0 -; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX512VPOPCNTDQ-NEXT: vpand %xmm1, %xmm0, %xmm2 -; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512VPOPCNTDQ-NEXT: vpshufb %xmm2, %xmm3, %xmm2 -; AVX512VPOPCNTDQ-NEXT: vpsrlw $4, %xmm0, %xmm0 -; AVX512VPOPCNTDQ-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX512VPOPCNTDQ-NEXT: vpshufb %xmm0, %xmm3, %xmm0 -; AVX512VPOPCNTDQ-NEXT: vpaddb %xmm2, %xmm0, %xmm0 -; AVX512VPOPCNTDQ-NEXT: vpsllw $8, %xmm0, %xmm1 -; AVX512VPOPCNTDQ-NEXT: vpaddb %xmm0, %xmm1, %xmm0 -; AVX512VPOPCNTDQ-NEXT: vpsrlw $8, %xmm0, %xmm0 +; AVX512VPOPCNTDQ-NEXT: vpmovzxwq {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero +; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 +; AVX512VPOPCNTDQ-NEXT: vpmovqw %zmm0, %xmm0 +; AVX512VPOPCNTDQ-NEXT: vzeroupper ; AVX512VPOPCNTDQ-NEXT: retq ; ; X32-SSE-LABEL: testv8i16u: @@ -1243,14 +1229,10 @@ define <16 x i8> @testv16i8(<16 x i8> %in) nounwind { ; AVX512VPOPCNTDQ-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX512VPOPCNTDQ-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 ; AVX512VPOPCNTDQ-NEXT: vpaddb %xmm1, %xmm0, %xmm0 -; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX512VPOPCNTDQ-NEXT: vpand %xmm1, %xmm0, %xmm2 -; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512VPOPCNTDQ-NEXT: vpshufb %xmm2, %xmm3, %xmm2 -; AVX512VPOPCNTDQ-NEXT: vpsrlw $4, %xmm0, %xmm0 -; AVX512VPOPCNTDQ-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX512VPOPCNTDQ-NEXT: vpshufb %xmm0, %xmm3, %xmm0 -; AVX512VPOPCNTDQ-NEXT: vpaddb %xmm2, %xmm0, %xmm0 +; AVX512VPOPCNTDQ-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero +; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm0, %zmm0 +; AVX512VPOPCNTDQ-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512VPOPCNTDQ-NEXT: vzeroupper ; AVX512VPOPCNTDQ-NEXT: retq ; ; X32-SSE-LABEL: testv16i8: @@ -1384,14 +1366,10 @@ define <16 x i8> @testv16i8u(<16 x i8> %in) nounwind { ; AVX512VPOPCNTDQ-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX512VPOPCNTDQ-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 ; AVX512VPOPCNTDQ-NEXT: vpaddb %xmm1, %xmm0, %xmm0 -; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX512VPOPCNTDQ-NEXT: vpand %xmm1, %xmm0, %xmm2 -; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512VPOPCNTDQ-NEXT: vpshufb %xmm2, %xmm3, %xmm2 -; AVX512VPOPCNTDQ-NEXT: vpsrlw $4, %xmm0, %xmm0 -; AVX512VPOPCNTDQ-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX512VPOPCNTDQ-NEXT: vpshufb %xmm0, %xmm3, %xmm0 -; AVX512VPOPCNTDQ-NEXT: vpaddb %xmm2, %xmm0, %xmm0 +; AVX512VPOPCNTDQ-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero +; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm0, %zmm0 +; AVX512VPOPCNTDQ-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512VPOPCNTDQ-NEXT: vzeroupper ; AVX512VPOPCNTDQ-NEXT: retq ; ; X32-SSE-LABEL: testv16i8u: diff --git a/test/CodeGen/X86/vector-tzcnt-256.ll b/test/CodeGen/X86/vector-tzcnt-256.ll index 16192ec61a5..30e5661d548 100644 --- a/test/CodeGen/X86/vector-tzcnt-256.ll +++ b/test/CodeGen/X86/vector-tzcnt-256.ll @@ -584,17 +584,9 @@ define <16 x i16> @testv16i16(<16 x i16> %in) nounwind { ; AVX512VPOPCNTDQ-NEXT: vpand %ymm1, %ymm0, %ymm0 ; AVX512VPOPCNTDQ-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 ; AVX512VPOPCNTDQ-NEXT: vpaddw %ymm1, %ymm0, %ymm0 -; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX512VPOPCNTDQ-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm2, %ymm3, %ymm2 -; AVX512VPOPCNTDQ-NEXT: vpsrlw $4, %ymm0, %ymm0 -; AVX512VPOPCNTDQ-NEXT: vpand %ymm1, %ymm0, %ymm0 -; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm0, %ymm3, %ymm0 -; AVX512VPOPCNTDQ-NEXT: vpaddb %ymm2, %ymm0, %ymm0 -; AVX512VPOPCNTDQ-NEXT: vpsllw $8, %ymm0, %ymm1 -; AVX512VPOPCNTDQ-NEXT: vpaddb %ymm0, %ymm1, %ymm0 -; AVX512VPOPCNTDQ-NEXT: vpsrlw $8, %ymm0, %ymm0 +; AVX512VPOPCNTDQ-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero +; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm0, %zmm0 +; AVX512VPOPCNTDQ-NEXT: vpmovdw %zmm0, %ymm0 ; AVX512VPOPCNTDQ-NEXT: retq ; ; X32-AVX-LABEL: testv16i16: @@ -722,17 +714,9 @@ define <16 x i16> @testv16i16u(<16 x i16> %in) nounwind { ; AVX512VPOPCNTDQ-NEXT: vpand %ymm1, %ymm0, %ymm0 ; AVX512VPOPCNTDQ-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 ; AVX512VPOPCNTDQ-NEXT: vpaddw %ymm1, %ymm0, %ymm0 -; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX512VPOPCNTDQ-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm2, %ymm3, %ymm2 -; AVX512VPOPCNTDQ-NEXT: vpsrlw $4, %ymm0, %ymm0 -; AVX512VPOPCNTDQ-NEXT: vpand %ymm1, %ymm0, %ymm0 -; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm0, %ymm3, %ymm0 -; AVX512VPOPCNTDQ-NEXT: vpaddb %ymm2, %ymm0, %ymm0 -; AVX512VPOPCNTDQ-NEXT: vpsllw $8, %ymm0, %ymm1 -; AVX512VPOPCNTDQ-NEXT: vpaddb %ymm0, %ymm1, %ymm0 -; AVX512VPOPCNTDQ-NEXT: vpsrlw $8, %ymm0, %ymm0 +; AVX512VPOPCNTDQ-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero +; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm0, %zmm0 +; AVX512VPOPCNTDQ-NEXT: vpmovdw %zmm0, %ymm0 ; AVX512VPOPCNTDQ-NEXT: retq ; ; X32-AVX-LABEL: testv16i16u: diff --git a/test/CodeGen/X86/vector-tzcnt-512.ll b/test/CodeGen/X86/vector-tzcnt-512.ll index a196d5ee0c0..3bf677aadf1 100644 --- a/test/CodeGen/X86/vector-tzcnt-512.ll +++ b/test/CodeGen/X86/vector-tzcnt-512.ll @@ -364,29 +364,15 @@ define <32 x i16> @testv32i16(<32 x i16> %in) nounwind { ; AVX512VPOPCNTDQ-NEXT: vpand %ymm3, %ymm0, %ymm0 ; AVX512VPOPCNTDQ-NEXT: vpcmpeqd %ymm3, %ymm3, %ymm3 ; AVX512VPOPCNTDQ-NEXT: vpaddw %ymm3, %ymm0, %ymm0 -; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX512VPOPCNTDQ-NEXT: vpand %ymm4, %ymm0, %ymm5 -; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm6 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm5, %ymm6, %ymm5 -; AVX512VPOPCNTDQ-NEXT: vpsrlw $4, %ymm0, %ymm0 -; AVX512VPOPCNTDQ-NEXT: vpand %ymm4, %ymm0, %ymm0 -; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm0, %ymm6, %ymm0 -; AVX512VPOPCNTDQ-NEXT: vpaddb %ymm5, %ymm0, %ymm0 -; AVX512VPOPCNTDQ-NEXT: vpsllw $8, %ymm0, %ymm5 -; AVX512VPOPCNTDQ-NEXT: vpaddb %ymm0, %ymm5, %ymm0 -; AVX512VPOPCNTDQ-NEXT: vpsrlw $8, %ymm0, %ymm0 +; AVX512VPOPCNTDQ-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero +; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm0, %zmm0 +; AVX512VPOPCNTDQ-NEXT: vpmovdw %zmm0, %ymm0 ; AVX512VPOPCNTDQ-NEXT: vpsubw %ymm1, %ymm2, %ymm2 ; AVX512VPOPCNTDQ-NEXT: vpand %ymm2, %ymm1, %ymm1 ; AVX512VPOPCNTDQ-NEXT: vpaddw %ymm3, %ymm1, %ymm1 -; AVX512VPOPCNTDQ-NEXT: vpand %ymm4, %ymm1, %ymm2 -; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm2, %ymm6, %ymm2 -; AVX512VPOPCNTDQ-NEXT: vpsrlw $4, %ymm1, %ymm1 -; AVX512VPOPCNTDQ-NEXT: vpand %ymm4, %ymm1, %ymm1 -; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm1, %ymm6, %ymm1 -; AVX512VPOPCNTDQ-NEXT: vpaddb %ymm2, %ymm1, %ymm1 -; AVX512VPOPCNTDQ-NEXT: vpsllw $8, %ymm1, %ymm2 -; AVX512VPOPCNTDQ-NEXT: vpaddb %ymm1, %ymm2, %ymm1 -; AVX512VPOPCNTDQ-NEXT: vpsrlw $8, %ymm1, %ymm1 +; AVX512VPOPCNTDQ-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero +; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm1, %zmm1 +; AVX512VPOPCNTDQ-NEXT: vpmovdw %zmm1, %ymm1 ; AVX512VPOPCNTDQ-NEXT: retq %out = call <32 x i16> @llvm.cttz.v32i16(<32 x i16> %in, i1 0) ret <32 x i16> %out @@ -472,29 +458,15 @@ define <32 x i16> @testv32i16u(<32 x i16> %in) nounwind { ; AVX512VPOPCNTDQ-NEXT: vpand %ymm3, %ymm0, %ymm0 ; AVX512VPOPCNTDQ-NEXT: vpcmpeqd %ymm3, %ymm3, %ymm3 ; AVX512VPOPCNTDQ-NEXT: vpaddw %ymm3, %ymm0, %ymm0 -; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX512VPOPCNTDQ-NEXT: vpand %ymm4, %ymm0, %ymm5 -; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm6 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm5, %ymm6, %ymm5 -; AVX512VPOPCNTDQ-NEXT: vpsrlw $4, %ymm0, %ymm0 -; AVX512VPOPCNTDQ-NEXT: vpand %ymm4, %ymm0, %ymm0 -; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm0, %ymm6, %ymm0 -; AVX512VPOPCNTDQ-NEXT: vpaddb %ymm5, %ymm0, %ymm0 -; AVX512VPOPCNTDQ-NEXT: vpsllw $8, %ymm0, %ymm5 -; AVX512VPOPCNTDQ-NEXT: vpaddb %ymm0, %ymm5, %ymm0 -; AVX512VPOPCNTDQ-NEXT: vpsrlw $8, %ymm0, %ymm0 +; AVX512VPOPCNTDQ-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero +; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm0, %zmm0 +; AVX512VPOPCNTDQ-NEXT: vpmovdw %zmm0, %ymm0 ; AVX512VPOPCNTDQ-NEXT: vpsubw %ymm1, %ymm2, %ymm2 ; AVX512VPOPCNTDQ-NEXT: vpand %ymm2, %ymm1, %ymm1 ; AVX512VPOPCNTDQ-NEXT: vpaddw %ymm3, %ymm1, %ymm1 -; AVX512VPOPCNTDQ-NEXT: vpand %ymm4, %ymm1, %ymm2 -; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm2, %ymm6, %ymm2 -; AVX512VPOPCNTDQ-NEXT: vpsrlw $4, %ymm1, %ymm1 -; AVX512VPOPCNTDQ-NEXT: vpand %ymm4, %ymm1, %ymm1 -; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm1, %ymm6, %ymm1 -; AVX512VPOPCNTDQ-NEXT: vpaddb %ymm2, %ymm1, %ymm1 -; AVX512VPOPCNTDQ-NEXT: vpsllw $8, %ymm1, %ymm2 -; AVX512VPOPCNTDQ-NEXT: vpaddb %ymm1, %ymm2, %ymm1 -; AVX512VPOPCNTDQ-NEXT: vpsrlw $8, %ymm1, %ymm1 +; AVX512VPOPCNTDQ-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero +; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm1, %zmm1 +; AVX512VPOPCNTDQ-NEXT: vpmovdw %zmm1, %ymm1 ; AVX512VPOPCNTDQ-NEXT: retq %out = call <32 x i16> @llvm.cttz.v32i16(<32 x i16> %in, i1 -1) ret <32 x i16> %out -- 2.40.0