From a4ee850374e53a8de531d02ddc86a7adab4c3f81 Mon Sep 17 00:00:00 2001
From: Simon Pilgrim
Date: Tue, 28 Mar 2017 16:40:38 +0000
Subject: [PATCH] [X86][AVX2] Add support for combining v16i16 shuffles to
 VPBLENDW

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@298929 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/X86/X86ISelLowering.cpp          | 75 ++++++++++++-------
 .../X86/clear_upper_vector_element_bits.ll  |  5 +-
 test/CodeGen/X86/vec_uint_to_fp-fastmath.ll |  7 +-
 .../X86/vector-shuffle-combining-avx2.ll    | 12 ++-
 4 files changed, 59 insertions(+), 40 deletions(-)

diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index cb38a85b83b..88e09cd56f8 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -27140,43 +27140,62 @@ static bool matchBinaryPermuteVectorShuffle(MVT MaskVT, ArrayRef<int> Mask,
   }
 
   // Attempt to combine to X86ISD::BLENDI.
-  // TODO - add 16i16 support (requires lane duplication).
-  if (NumMaskElts <= 8 && ((Subtarget.hasSSE41() && MaskVT.is128BitVector()) ||
-                           (Subtarget.hasAVX() && MaskVT.is256BitVector()))) {
+  if ((NumMaskElts <= 8 && ((Subtarget.hasSSE41() && MaskVT.is128BitVector()) ||
+                            (Subtarget.hasAVX() && MaskVT.is256BitVector()))) ||
+      (MaskVT == MVT::v16i16 && Subtarget.hasAVX2())) {
     uint64_t BlendMask = 0;
     bool ForceV1Zero = false, ForceV2Zero = false;
     SmallVector<int, 8> TargetMask(Mask.begin(), Mask.end());
     if (matchVectorShuffleAsBlend(V1, V2, TargetMask, ForceV1Zero, ForceV2Zero,
                                   BlendMask)) {
-      // Determine a type compatible with X86ISD::BLENDI.
-      ShuffleVT = MaskVT;
-      if (Subtarget.hasAVX2()) {
-        if (ShuffleVT == MVT::v4i64)
-          ShuffleVT = MVT::v8i32;
-        else if (ShuffleVT == MVT::v2i64)
-          ShuffleVT = MVT::v4i32;
+      if (MaskVT == MVT::v16i16) {
+        // We can only use v16i16 PBLENDW if the lanes are repeated.
+        SmallVector<int, 8> RepeatedMask;
+        if (isRepeatedTargetShuffleMask(128, MaskVT, TargetMask,
+                                        RepeatedMask)) {
+          assert(RepeatedMask.size() == 8 &&
+                 "Repeated mask size doesn't match!");
+          PermuteImm = 0;
+          for (int i = 0; i < 8; ++i)
+            if (RepeatedMask[i] >= 8)
+              PermuteImm |= 1 << i;
+          V1 = ForceV1Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V1;
+          V2 = ForceV2Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V2;
+          Shuffle = X86ISD::BLENDI;
+          ShuffleVT = MaskVT;
+          return true;
+        }
       } else {
-        if (ShuffleVT == MVT::v2i64 || ShuffleVT == MVT::v4i32)
-          ShuffleVT = MVT::v8i16;
-        else if (ShuffleVT == MVT::v4i64)
-          ShuffleVT = MVT::v4f64;
-        else if (ShuffleVT == MVT::v8i32)
-          ShuffleVT = MVT::v8f32;
-      }
+        // Determine a type compatible with X86ISD::BLENDI.
+        ShuffleVT = MaskVT;
+        if (Subtarget.hasAVX2()) {
+          if (ShuffleVT == MVT::v4i64)
+            ShuffleVT = MVT::v8i32;
+          else if (ShuffleVT == MVT::v2i64)
+            ShuffleVT = MVT::v4i32;
+        } else {
+          if (ShuffleVT == MVT::v2i64 || ShuffleVT == MVT::v4i32)
+            ShuffleVT = MVT::v8i16;
+          else if (ShuffleVT == MVT::v4i64)
+            ShuffleVT = MVT::v4f64;
+          else if (ShuffleVT == MVT::v8i32)
+            ShuffleVT = MVT::v8f32;
+        }
 
-      V1 = ForceV1Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V1;
-      V2 = ForceV2Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V2;
+        if (!ShuffleVT.isFloatingPoint()) {
+          int Scale = EltSizeInBits / ShuffleVT.getScalarSizeInBits();
+          BlendMask =
+              scaleVectorShuffleBlendMask(BlendMask, NumMaskElts, Scale);
+          ShuffleVT = MVT::getIntegerVT(EltSizeInBits / Scale);
+          ShuffleVT = MVT::getVectorVT(ShuffleVT, NumMaskElts * Scale);
+        }
 
-      if (!ShuffleVT.isFloatingPoint()) {
-        int Scale = EltSizeInBits / ShuffleVT.getScalarSizeInBits();
-        BlendMask = scaleVectorShuffleBlendMask(BlendMask, NumMaskElts, Scale);
-        ShuffleVT = MVT::getIntegerVT(EltSizeInBits / Scale);
-        ShuffleVT = MVT::getVectorVT(ShuffleVT, NumMaskElts * Scale);
+        V1 = ForceV1Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V1;
+        V2 = ForceV2Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V2;
+        PermuteImm = (unsigned)BlendMask;
+        Shuffle = X86ISD::BLENDI;
+        return true;
       }
-
-      PermuteImm = (unsigned)BlendMask;
-      Shuffle = X86ISD::BLENDI;
-      return true;
     }
   }
 
diff --git a/test/CodeGen/X86/clear_upper_vector_element_bits.ll b/test/CodeGen/X86/clear_upper_vector_element_bits.ll
index eb51657ea61..c425e3a92d1 100644
--- a/test/CodeGen/X86/clear_upper_vector_element_bits.ll
+++ b/test/CodeGen/X86/clear_upper_vector_element_bits.ll
@@ -103,7 +103,6 @@ define <4 x i32> @_clearupper4xi32a(<4 x i32>) nounwind {
   ret <4 x i32> %v3
 }
 
-; FIXME: Missed vpblendw on AVX2 target
 define <8 x i32> @_clearupper8xi32a(<8 x i32>) nounwind {
 ; SSE-LABEL: _clearupper8xi32a:
 ; SSE:       # BB#0:
@@ -119,8 +118,8 @@ define <8 x i32> @_clearupper8xi32a(<8 x i32>) nounwind {
 ;
 ; AVX2-LABEL: _clearupper8xi32a:
 ; AVX2:       # BB#0:
-; AVX2-NEXT:    vbroadcastss {{.*}}(%rip), %ymm1
-; AVX2-NEXT:    vandps %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpxor %ymm1, %ymm1, %ymm1
+; AVX2-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7],ymm0[8],ymm1[9],ymm0[10],ymm1[11],ymm0[12],ymm1[13],ymm0[14],ymm1[15]
 ; AVX2-NEXT:    retq
   %x0 = extractelement <8 x i32> %0, i32 0
   %x1 = extractelement <8 x i32> %0, i32 1
diff --git a/test/CodeGen/X86/vec_uint_to_fp-fastmath.ll b/test/CodeGen/X86/vec_uint_to_fp-fastmath.ll
index d17db7d9779..7df3c307042 100644
--- a/test/CodeGen/X86/vec_uint_to_fp-fastmath.ll
+++ b/test/CodeGen/X86/vec_uint_to_fp-fastmath.ll
@@ -103,9 +103,6 @@ define <4 x float> @test_uitofp_v4i32_to_v4f32(<4 x i32> %arg) {
 ; AVX2: [[FPMASKCSTADDR_v8:.LCPI[0-9_]+]]:
 ; AVX2-NEXT: .long 1199570944 # float 65536
 
-; AVX2: [[MASKCSTADDR_v8:.LCPI[0-9_]+]]:
-; AVX2-NEXT: .long 65535 # 0xffff
-
 define <8 x float> @test_uitofp_v8i32_to_v8f32(<8 x i32> %arg) {
 ; SSE2-LABEL: test_uitofp_v8i32_to_v8f32:
 ; SSE2:       # BB#0:
@@ -166,8 +163,8 @@ define <8 x float> @test_uitofp_v8i32_to_v8f32(<8 x i32> %arg) {
 ; AVX2-NEXT:    vcvtdq2ps %ymm1, %ymm1
 ; AVX2-NEXT:    vbroadcastss [[FPMASKCSTADDR_v8]](%rip), %ymm2
 ; AVX2-NEXT:    vmulps %ymm2, %ymm1, %ymm1
-; AVX2-NEXT:    vpbroadcastd [[MASKCSTADDR_v8]](%rip), %ymm2
-; AVX2-NEXT:    vpand %ymm2, %ymm0, %ymm0
+; AVX2-NEXT:    vxorps %ymm2, %ymm2, %ymm2
+; AVX2-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2],ymm2[3],ymm0[4],ymm2[5],ymm0[6],ymm2[7],ymm0[8],ymm2[9],ymm0[10],ymm2[11],ymm0[12],ymm2[13],ymm0[14],ymm2[15]
 ; AVX2-NEXT:    vcvtdq2ps %ymm0, %ymm0
 ; AVX2-NEXT:    vaddps %ymm0, %ymm1, %ymm0
 ; AVX2-NEXT:    retq
diff --git a/test/CodeGen/X86/vector-shuffle-combining-avx2.ll b/test/CodeGen/X86/vector-shuffle-combining-avx2.ll
index 03609f6d22c..1385929ab8c 100644
--- a/test/CodeGen/X86/vector-shuffle-combining-avx2.ll
+++ b/test/CodeGen/X86/vector-shuffle-combining-avx2.ll
@@ -74,12 +74,14 @@ define <32 x i8> @combine_pshufb_vpermps(<8 x float> %a) {
 define <32 x i8> @combine_and_pshufb(<32 x i8> %a0) {
 ; X32-LABEL: combine_and_pshufb:
 ; X32:       # BB#0:
-; X32-NEXT:    vandps {{\.LCPI.*}}, %ymm0, %ymm0
+; X32-NEXT:    vpxor %ymm1, %ymm1, %ymm1
+; X32-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7],ymm0[8],ymm1[9,10,11],ymm0[12],ymm1[13,14,15]
 ; X32-NEXT:    retl
 ;
 ; X64-LABEL: combine_and_pshufb:
 ; X64:       # BB#0:
-; X64-NEXT:    vandps {{.*}}(%rip), %ymm0, %ymm0
+; X64-NEXT:    vpxor %ymm1, %ymm1, %ymm1
+; X64-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7],ymm0[8],ymm1[9,10,11],ymm0[12],ymm1[13,14,15]
 ; X64-NEXT:    retq
   %1 = shufflevector <32 x i8> %a0, <32 x i8> zeroinitializer, <32 x i32>
   %2 = call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %1, <32 x i8> )
@@ -89,12 +91,14 @@ define <32 x i8> @combine_and_pshufb(<32 x i8> %a0) {
 define <32 x i8> @combine_pshufb_and(<32 x i8> %a0) {
 ; X32-LABEL: combine_pshufb_and:
 ; X32:       # BB#0:
-; X32-NEXT:    vandps {{\.LCPI.*}}, %ymm0, %ymm0
+; X32-NEXT:    vpxor %ymm1, %ymm1, %ymm1
+; X32-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7],ymm0[8],ymm1[9,10,11],ymm0[12],ymm1[13,14,15]
 ; X32-NEXT:    retl
 ;
 ; X64-LABEL: combine_pshufb_and:
 ; X64:       # BB#0:
-; X64-NEXT:    vandps {{.*}}(%rip), %ymm0, %ymm0
+; X64-NEXT:    vpxor %ymm1, %ymm1, %ymm1
+; X64-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7],ymm0[8],ymm1[9,10,11],ymm0[12],ymm1[13,14,15]
 ; X64-NEXT:    retq
   %1 = call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %a0, <32 x i8> )
   %2 = shufflevector <32 x i8> %1, <32 x i8> zeroinitializer, <32 x i32>
-- 
2.40.0
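
As a quick sanity check of the new PBLENDW immediate computation, the
standalone model below mirrors the patched logic. It is only an illustrative
sketch: blendwImmForV16i16 is a hypothetical helper, not part of this patch,
and it assumes its input is already a pure two-input blend mask (element i is
either i, picking from V1, or i + 16, picking from V2). Like the patched
matchBinaryPermuteVectorShuffle, it gives up when the two 128-bit lanes
disagree, since VPBLENDW applies one 8-bit immediate to both lanes.

#include <array>
#include <cstdint>
#include <cstdio>
#include <optional>

// Model of the v16i16 PBLENDW match: given a 16-element blend mask where
// element i is i (from V1) or i + 16 (from V2), return the 8-bit VPBLENDW
// immediate, or nullopt when the two 128-bit lanes (elements 0-7 and 8-15)
// don't repeat the same per-lane pattern.
std::optional<uint8_t> blendwImmForV16i16(const std::array<int, 16> &Mask) {
  uint8_t Imm = 0;
  for (int i = 0; i < 8; ++i) {
    bool Lo = Mask[i] >= 16;     // lane 0: word i taken from V2?
    bool Hi = Mask[i + 8] >= 16; // lane 1: word i taken from V2?
    if (Lo != Hi)
      return std::nullopt; // lanes differ: one immediate can't express it
    if (Lo)
      Imm |= 1 << i; // bit i set => word i of each lane comes from V2
  }
  return Imm;
}

int main() {
  // The blend pattern from the updated _clearupper8xi32a checks: the odd
  // word of every dword pair is taken from the zero vector.
  std::array<int, 16> Mask = {0, 17, 2,  19, 4,  21, 6,  23,
                              8, 25, 10, 27, 12, 29, 14, 31};
  if (auto Imm = blendwImmForV16i16(Mask))
    std::printf("vpblendw imm = 0x%02X\n", static_cast<unsigned>(*Imm));
  return 0;
}

Built with any C++17 compiler, this prints "vpblendw imm = 0xAA", matching
the ymm0[0],ymm1[1],...,ymm0[14],ymm1[15] interleave in the new CHECK lines.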