From eb6715931ac5732b39ac1ebc9617396f95892bd7 Mon Sep 17 00:00:00 2001
From: Craig Topper
Date: Mon, 30 Sep 2019 03:14:38 +0000
Subject: [PATCH] [X86] Split v16i32/v8i64 bitreverse on avx512f targets
 without avx512bw to enable the use of vpshufb on the 256-bit halves.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@373177 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/X86/X86ISelLowering.cpp    | 13 +++-
 test/CodeGen/X86/vector-bitreverse.ll | 93 ++++++++++++---------------
 2 files changed, 52 insertions(+), 54 deletions(-)

diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index 3a862590e6b..089c7ebb19a 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -1508,10 +1508,14 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
       setOperationAction(ISD::MGATHER, VT, Custom);
       setOperationAction(ISD::MSCATTER, VT, Custom);
     }
-    // Need to custom split v32i16/v64i8 bitcasts.
     if (!Subtarget.hasBWI()) {
+      // Need to custom split v32i16/v64i8 bitcasts.
       setOperationAction(ISD::BITCAST, MVT::v32i16, Custom);
       setOperationAction(ISD::BITCAST, MVT::v64i8, Custom);
+
+      // Better to split these into two 256-bit ops.
+      setOperationAction(ISD::BITREVERSE, MVT::v8i64, Custom);
+      setOperationAction(ISD::BITREVERSE, MVT::v16i32, Custom);
     }
 
     if (Subtarget.hasVBMI2()) {
@@ -26919,6 +26923,13 @@ static SDValue LowerBITREVERSE(SDValue Op, const X86Subtarget &Subtarget,
   SDValue In = Op.getOperand(0);
   SDLoc DL(Op);
 
+  // Split v8i64/v16i32 without BWI so that we can still use the PSHUFB
+  // lowering.
+  if (VT == MVT::v8i64 || VT == MVT::v16i32) {
+    assert(!Subtarget.hasBWI() && "BWI should Expand BITREVERSE");
+    return Lower512IntUnary(Op, DAG);
+  }
+
   unsigned NumElts = VT.getVectorNumElements();
   assert(VT.getScalarType() == MVT::i8 &&
          "Only byte vector BITREVERSE supported");
diff --git a/test/CodeGen/X86/vector-bitreverse.ll b/test/CodeGen/X86/vector-bitreverse.ll
index ddb90c7a557..954d7a2c52a 100644
--- a/test/CodeGen/X86/vector-bitreverse.ll
+++ b/test/CodeGen/X86/vector-bitreverse.ll
@@ -1911,28 +1911,26 @@ define <16 x i32> @test_bitreverse_v16i32(<16 x i32> %a) nounwind {
 ;
 ; AVX512F-LABEL: test_bitreverse_v16i32:
 ; AVX512F: # %bb.0:
-; AVX512F-NEXT: vpslld $8, %zmm0, %zmm1
-; AVX512F-NEXT: vpslld $24, %zmm0, %zmm2
-; AVX512F-NEXT: vpternlogd $248, {{.*}}(%rip){1to16}, %zmm1, %zmm2
-; AVX512F-NEXT: vpsrld $24, %zmm0, %zmm1
-; AVX512F-NEXT: vpsrld $8, %zmm0, %zmm0
-; AVX512F-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm0, %zmm0
-; AVX512F-NEXT: vpternlogd $254, %zmm1, %zmm2, %zmm0
-; AVX512F-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm0, %zmm1
-; AVX512F-NEXT: vpslld $4, %zmm1, %zmm1
-; AVX512F-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm0, %zmm0
-; AVX512F-NEXT: vpsrld $4, %zmm0, %zmm0
-; AVX512F-NEXT: vpord %zmm1, %zmm0, %zmm0
-; AVX512F-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm0, %zmm1
-; AVX512F-NEXT: vpslld $2, %zmm1, %zmm1
-; AVX512F-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm0, %zmm0
-; AVX512F-NEXT: vpsrld $2, %zmm0, %zmm0
-; AVX512F-NEXT: vpord %zmm1, %zmm0, %zmm0
-; AVX512F-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm0, %zmm1
-; AVX512F-NEXT: vpslld $1, %zmm1, %zmm1
-; AVX512F-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm0, %zmm0
-; AVX512F-NEXT: vpsrld $1, %zmm0, %zmm0
-; AVX512F-NEXT: vpord %zmm1, %zmm0, %zmm0
+; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
+; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12,3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
+; AVX512F-NEXT: vpshufb %ymm2, %ymm1, %ymm1
+; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT: vpand %ymm3, %ymm1, %ymm4
+; AVX512F-NEXT: vmovdqa {{.*#+}} ymm5 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
+; AVX512F-NEXT: vpshufb %ymm4, %ymm5, %ymm4
+; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1
+; AVX512F-NEXT: vpand %ymm3, %ymm1, %ymm1
+; AVX512F-NEXT: vmovdqa {{.*#+}} ymm6 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
+; AVX512F-NEXT: vpshufb %ymm1, %ymm6, %ymm1
+; AVX512F-NEXT: vpor %ymm1, %ymm4, %ymm1
+; AVX512F-NEXT: vpshufb %ymm2, %ymm0, %ymm0
+; AVX512F-NEXT: vpand %ymm3, %ymm0, %ymm2
+; AVX512F-NEXT: vpshufb %ymm2, %ymm5, %ymm2
+; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm0
+; AVX512F-NEXT: vpand %ymm3, %ymm0, %ymm0
+; AVX512F-NEXT: vpshufb %ymm0, %ymm6, %ymm0
+; AVX512F-NEXT: vpor %ymm0, %ymm2, %ymm0
+; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
 ; AVX512F-NEXT: retq
 ;
 ; AVX512BW-LABEL: test_bitreverse_v16i32:
@@ -2217,37 +2215,26 @@ define <8 x i64> @test_bitreverse_v8i64(<8 x i64> %a) nounwind {
 ;
 ; AVX512F-LABEL: test_bitreverse_v8i64:
 ; AVX512F: # %bb.0:
-; AVX512F-NEXT: vpsllq $40, %zmm0, %zmm1
-; AVX512F-NEXT: vpsllq $56, %zmm0, %zmm2
-; AVX512F-NEXT: vpternlogq $248, {{.*}}(%rip){1to8}, %zmm1, %zmm2
-; AVX512F-NEXT: vpsllq $8, %zmm0, %zmm1
-; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm1, %zmm1
-; AVX512F-NEXT: vpsllq $24, %zmm0, %zmm3
-; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm3, %zmm3
-; AVX512F-NEXT: vpternlogq $254, %zmm1, %zmm2, %zmm3
-; AVX512F-NEXT: vpsrlq $40, %zmm0, %zmm1
-; AVX512F-NEXT: vpsrlq $56, %zmm0, %zmm2
-; AVX512F-NEXT: vpternlogq $248, {{.*}}(%rip){1to8}, %zmm1, %zmm2
-; AVX512F-NEXT: vpsrlq $24, %zmm0, %zmm1
-; AVX512F-NEXT: vpsrlq $8, %zmm0, %zmm0
-; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm0, %zmm0
-; AVX512F-NEXT: vpternlogq $248, {{.*}}(%rip){1to8}, %zmm1, %zmm0
-; AVX512F-NEXT: vpternlogq $254, %zmm2, %zmm3, %zmm0
-; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm0, %zmm1
-; AVX512F-NEXT: vpsllq $4, %zmm1, %zmm1
-; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm0, %zmm0
-; AVX512F-NEXT: vpsrlq $4, %zmm0, %zmm0
-; AVX512F-NEXT: vporq %zmm1, %zmm0, %zmm0
-; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm0, %zmm1
-; AVX512F-NEXT: vpsllq $2, %zmm1, %zmm1
-; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm0, %zmm0
-; AVX512F-NEXT: vpsrlq $2, %zmm0, %zmm0
-; AVX512F-NEXT: vporq %zmm1, %zmm0, %zmm0
-; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm0, %zmm1
-; AVX512F-NEXT: vpsllq $1, %zmm1, %zmm1
-; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm0, %zmm0
-; AVX512F-NEXT: vpsrlq $1, %zmm0, %zmm0
-; AVX512F-NEXT: vporq %zmm1, %zmm0, %zmm0
+; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
+; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8]
+; AVX512F-NEXT: vpshufb %ymm2, %ymm1, %ymm1
+; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT: vpand %ymm3, %ymm1, %ymm4
+; AVX512F-NEXT: vmovdqa {{.*#+}} ymm5 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
+; AVX512F-NEXT: vpshufb %ymm4, %ymm5, %ymm4
+; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1
+; AVX512F-NEXT: vpand %ymm3, %ymm1, %ymm1
+; AVX512F-NEXT: vmovdqa {{.*#+}} ymm6 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
+; AVX512F-NEXT: vpshufb %ymm1, %ymm6, %ymm1
+; AVX512F-NEXT: vpor %ymm1, %ymm4, %ymm1
+; AVX512F-NEXT: vpshufb %ymm2, %ymm0, %ymm0
+; AVX512F-NEXT: vpand %ymm3, %ymm0, %ymm2
+; AVX512F-NEXT: vpshufb %ymm2, %ymm5, %ymm2
+; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm0
+; AVX512F-NEXT: vpand %ymm3, %ymm0, %ymm0
+; AVX512F-NEXT: vpshufb %ymm0, %ymm6, %ymm0
+; AVX512F-NEXT: vpor %ymm0, %ymm2, %ymm0
+; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
 ; AVX512F-NEXT: retq
 ;
 ; AVX512BW-LABEL: test_bitreverse_v8i64:
-- 
2.50.1
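
For reference, the custom lowering added above amounts to: extract the two 256-bit halves of the 512-bit input, emit BITREVERSE on each half (where the existing vpshufb-based byte-vector pattern applies), and concatenate the results. Below is a minimal sketch of that shape using only generic SelectionDAG nodes, assuming the usual X86ISelLowering.cpp includes; the helper name splitUnaryIntOp512 is hypothetical, and the patch itself simply reuses the existing Lower512IntUnary helper rather than this code.

// Illustrative sketch only, not the in-tree Lower512IntUnary: split a
// 512-bit unary integer op into two 256-bit halves and reassemble.
static SDValue splitUnaryIntOp512(SDValue Op, SelectionDAG &DAG) {
  SDLoc DL(Op);
  MVT VT = Op.getSimpleValueType();                  // e.g. v16i32 or v8i64
  MVT HalfVT = MVT::getVectorVT(VT.getVectorElementType(),
                                VT.getVectorNumElements() / 2);
  unsigned HalfElts = HalfVT.getVectorNumElements();

  SDValue In = Op.getOperand(0);
  // Extract the low and high 256-bit halves of the 512-bit source.
  SDValue Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, In,
                           DAG.getIntPtrConstant(0, DL));
  SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, In,
                           DAG.getIntPtrConstant(HalfElts, DL));

  // Re-emit the same opcode (here ISD::BITREVERSE) at 256-bit width so the
  // PSHUFB-based lowering can handle each half, then concatenate.
  Lo = DAG.getNode(Op.getOpcode(), DL, HalfVT, Lo);
  Hi = DAG.getNode(Op.getOpcode(), DL, HalfVT, Hi);
  return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi);
}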