From d46db47633d3d10555454ce0d83c22791fb2391d Mon Sep 17 00:00:00 2001
From: Craig Topper
Date: Mon, 13 Feb 2017 04:53:29 +0000
Subject: [PATCH] [X86] Genericize the handling of INSERT_SUBVECTOR from an
 EXTRACT_SUBVECTOR to support 512-bit vectors with 128-bit or 256-bit
 subvectors.

We now detect that both the extract and insert indices are non-zero and
convert to a shuffle. This will be lowered as a blend for 256-bit vectors
or as vshuf operations for 512-bit vectors.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@294931 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/X86/X86ISelLowering.cpp         | 39 +++++++++----------
 test/CodeGen/X86/avx-vperm2x128.ll         |  2 +-
 test/CodeGen/X86/avx512-mask-op.ll         |  6 +--
 .../X86/clear_upper_vector_element_bits.ll | 10 ++---
 test/CodeGen/X86/vector-shuffle-256-v16.ll |  2 +-
 test/CodeGen/X86/vector-shuffle-256-v32.ll |  2 +-
 6 files changed, 28 insertions(+), 33 deletions(-)

diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index 9d2b3fcd039..bc1b7bc1894 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -34160,28 +34160,25 @@ static SDValue combineInsertSubvector(SDNode *N, SelectionDAG &DAG,
   MVT OpVT = N->getSimpleValueType(0);
   MVT SubVecVT = SubVec.getSimpleValueType();
 
-  // If we're inserting into the upper half of a 256-bit vector with a vector
-  // that was extracted from the upper half of a 256-bit vector, we should
-  // use a blend instead.
-  if (SubVec.getOpcode() == ISD::EXTRACT_SUBVECTOR && OpVT.is256BitVector() &&
+  // If this is an insert of an extract, combine to a shuffle. Don't do this
+  // if the insert or extract can be represented with a subvector operation.
+  if (SubVec.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
       SubVec.getOperand(0).getSimpleValueType() == OpVT &&
-      Idx == SubVec.getOperand(1) && IdxVal == OpVT.getVectorNumElements()/2) {
-
-    // Integers must be cast to 32-bit because there is only vpblendd;
-    // vpblendw can't be used for this because it has a handicapped mask.
-    // If we don't have AVX2, then cast to float. Using a wrong domain blend
-    // is still more efficient than using the wrong domain vinsertf128 that
-    // will be created by InsertSubVector().
-    MVT CastVT = OpVT;
-    if (OpVT.isInteger())
-      CastVT = Subtarget.hasAVX2() ? MVT::v8i32 : MVT::v8f32;
-
-    // The blend instruction, and therefore its mask, depend on the data type.
-    unsigned MaskVal = CastVT.getScalarSizeInBits() == 64 ? 0x0c : 0xf0;
-    SDValue Mask = DAG.getConstant(MaskVal, dl, MVT::i8);
-    Vec = DAG.getNode(X86ISD::BLENDI, dl, CastVT, DAG.getBitcast(CastVT, Vec),
-                      DAG.getBitcast(CastVT, SubVec.getOperand(0)), Mask);
-    return DAG.getBitcast(OpVT, Vec);
+      (IdxVal != 0 || !Vec.isUndef())) {
+    int ExtIdxVal = cast<ConstantSDNode>(SubVec.getOperand(1))->getZExtValue();
+    if (ExtIdxVal != 0) {
+      int VecNumElts = OpVT.getVectorNumElements();
+      int SubVecNumElts = SubVecVT.getVectorNumElements();
+      SmallVector<int, 64> Mask(VecNumElts);
+      // First create an identity shuffle mask.
+      for (int i = 0; i != VecNumElts; ++i)
+        Mask[i] = i;
+      // Now insert the extracted portion.
+      for (int i = 0; i != SubVecNumElts; ++i)
+        Mask[i + IdxVal] = i + ExtIdxVal + VecNumElts;
+
+      return DAG.getVectorShuffle(OpVT, dl, Vec, SubVec.getOperand(0), Mask);
+    }
   }
 
   // Fold two 16-byte or 32-byte subvector loads into one 32-byte or 64-byte
diff --git a/test/CodeGen/X86/avx-vperm2x128.ll b/test/CodeGen/X86/avx-vperm2x128.ll
index 280b218b8b5..f4a77c370db 100644
--- a/test/CodeGen/X86/avx-vperm2x128.ll
+++ b/test/CodeGen/X86/avx-vperm2x128.ll
@@ -466,7 +466,7 @@ define <4 x i64> @shuffle_v4i64_67zz(<4 x i64> %a, <4 x i64> %b) {
 ; AVX1:       ## BB#0:
 ; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],zero,zero
 ; AVX1-NEXT:    vpaddq %xmm0, %xmm1, %xmm0
-; AVX1-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; AVX1-NEXT:    vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3]
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: shuffle_v4i64_67zz:
diff --git a/test/CodeGen/X86/avx512-mask-op.ll b/test/CodeGen/X86/avx512-mask-op.ll
index 6d0c7c2be96..1f8a5992691 100644
--- a/test/CodeGen/X86/avx512-mask-op.ll
+++ b/test/CodeGen/X86/avx512-mask-op.ll
@@ -542,8 +542,7 @@ define <64 x i8> @test16(i64 %x) {
 ; SKX-NEXT:    movl $32, %eax
 ; SKX-NEXT:    kmovd %eax, %k1
 ; SKX-NEXT:    vpblendmb %ymm0, %ymm1, %ymm0 {%k1}
-; SKX-NEXT:    vextracti64x4 $1, %zmm1, %ymm1
-; SKX-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; SKX-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm1[4,5,6,7]
 ; SKX-NEXT:    vpmovb2m %zmm0, %k0
 ; SKX-NEXT:    vpmovm2b %k0, %zmm0
 ; SKX-NEXT:    retq
@@ -609,8 +608,7 @@ define <64 x i8> @test17(i64 %x, i32 %y, i32 %z) {
 ; SKX-NEXT:    movl $32, %eax
 ; SKX-NEXT:    kmovd %eax, %k1
 ; SKX-NEXT:    vpblendmb %ymm0, %ymm1, %ymm0 {%k1}
-; SKX-NEXT:    vextracti64x4 $1, %zmm1, %ymm1
-; SKX-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; SKX-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm1[4,5,6,7]
 ; SKX-NEXT:    vpmovb2m %zmm0, %k0
 ; SKX-NEXT:    vpmovm2b %k0, %zmm0
 ; SKX-NEXT:    retq
diff --git a/test/CodeGen/X86/clear_upper_vector_element_bits.ll b/test/CodeGen/X86/clear_upper_vector_element_bits.ll
index 4ad865e19aa..14ef67884a4 100644
--- a/test/CodeGen/X86/clear_upper_vector_element_bits.ll
+++ b/test/CodeGen/X86/clear_upper_vector_element_bits.ll
@@ -46,7 +46,7 @@ define <4 x i64> @_clearupper4xi64a(<4 x i64>) nounwind {
 ;
 ; AVX1-LABEL: _clearupper4xi64a:
 ; AVX1:       # BB#0:
-; AVX1-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6,7]
+; AVX1-NEXT:    vblendpd {{.*#+}} ymm0 = ymm0[0,1,2,3]
 ; AVX1-NEXT:    vxorps %ymm1, %ymm1, %ymm1
 ; AVX1-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7]
 ; AVX1-NEXT:    retq
@@ -118,8 +118,8 @@ define <8 x i32> @_clearupper8xi32a(<8 x i32>) nounwind {
 ;
 ; AVX1-LABEL: _clearupper8xi32a:
 ; AVX1:       # BB#0:
-; AVX1-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6,7]
-; AVX1-NEXT:    vandps {{.*}}(%rip), %ymm0, %ymm0
+; AVX1-NEXT:    vblendpd {{.*#+}} ymm0 = ymm0[0,1,2,3]
+; AVX1-NEXT:    vandpd {{.*}}(%rip), %ymm0, %ymm0
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: _clearupper8xi32a:
@@ -292,8 +292,8 @@ define <16 x i16> @_clearupper16xi16a(<16 x i16>) nounwind {
 ;
 ; AVX1-LABEL: _clearupper16xi16a:
 ; AVX1:       # BB#0:
-; AVX1-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6,7]
-; AVX1-NEXT:    vandps {{.*}}(%rip), %ymm0, %ymm0
+; AVX1-NEXT:    vblendpd {{.*#+}} ymm0 = ymm0[0,1,2,3]
+; AVX1-NEXT:    vandpd {{.*}}(%rip), %ymm0, %ymm0
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: _clearupper16xi16a:
diff --git a/test/CodeGen/X86/vector-shuffle-256-v16.ll b/test/CodeGen/X86/vector-shuffle-256-v16.ll
index eb036353eaf..951756c256d 100644
--- a/test/CodeGen/X86/vector-shuffle-256-v16.ll
+++ b/test/CodeGen/X86/vector-shuffle-256-v16.ll
@@ -2653,7 +2653,7 @@ define <16 x i16> @shuffle_v16i16_01_zz_02_zz_04_uu_06_07_08_09_10_11_12_13_14_1
 ; AVX1-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm0[1,1,2,3,4,5,6,7]
 ; AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
 ; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4,5,6,7]
-; AVX1-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
+; AVX1-NEXT:    vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3]
 ; AVX1-NEXT:    retq
 ;
 ; AVX2OR512VL-LABEL: shuffle_v16i16_01_zz_02_zz_04_uu_06_07_08_09_10_11_12_13_14_15:
diff --git a/test/CodeGen/X86/vector-shuffle-256-v32.ll b/test/CodeGen/X86/vector-shuffle-256-v32.ll
index 31563e82d37..abca69a6de6 100644
--- a/test/CodeGen/X86/vector-shuffle-256-v32.ll
+++ b/test/CodeGen/X86/vector-shuffle-256-v32.ll
@@ -1089,7 +1089,7 @@ define <32 x i8> @shuffle_v32i8_01_zz_02_zz_04_uu_06_07_08_09_10_11_12_13_14_15_
 ; AVX1-LABEL: shuffle_v32i8_01_zz_02_zz_04_uu_06_07_08_09_10_11_12_13_14_15_u6_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31:
 ; AVX1:       # BB#0:
 ; AVX1-NEXT:    vpshufb {{.*#+}} xmm1 = xmm0[1],zero,xmm0[2],zero,xmm0[4,u,6,7,8,9,10,11,12,13,14,15]
-; AVX1-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
+; AVX1-NEXT:    vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3]
 ; AVX1-NEXT:    retq
 ;
 ; AVX2OR512VL-LABEL: shuffle_v32i8_01_zz_02_zz_04_uu_06_07_08_09_10_11_12_13_14_15_u6_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31:
-- 
2.50.1
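
A minimal standalone sketch of the mask construction in the new combine, under stated assumptions: buildInsertExtractMask and the driver values are illustrative, not LLVM API. Using a v8i64 destination whose upper four elements were extracted from element 4 and are re-inserted at element 4 (the situation in the avx512-mask-op.ll tests above), the identity-then-overlay loops produce the mask [0,1,2,3,12,13,14,15], which is the vshufi64x2 zmm0 = zmm0[0,1,2,3],zmm1[4,5,6,7] pattern in the updated CHECK lines.

    // build_mask.cpp: hypothetical standalone illustration, not LLVM code.
    #include <cstdio>
    #include <vector>

    // Mirrors the two loops added to combineInsertSubvector. In a two-operand
    // shuffle mask, elements of the second operand (the source vector of the
    // extract) are numbered starting at VecNumElts.
    static std::vector<int> buildInsertExtractMask(int VecNumElts,
                                                   int SubVecNumElts,
                                                   int IdxVal, int ExtIdxVal) {
      std::vector<int> Mask(VecNumElts);
      // First create an identity shuffle mask over the destination vector.
      for (int i = 0; i != VecNumElts; ++i)
        Mask[i] = i;
      // Now overlay the extracted portion at the insert position.
      for (int i = 0; i != SubVecNumElts; ++i)
        Mask[i + IdxVal] = i + ExtIdxVal + VecNumElts;
      return Mask;
    }

    int main() {
      // v8i64 destination; 4-element subvector extracted at element 4 and
      // re-inserted at element 4. Prints: 0 1 2 3 12 13 14 15
      for (int M : buildInsertExtractMask(8, 4, 4, 4))
        std::printf("%d ", M);
      std::printf("\n");
      return 0;
    }

For the 256-bit AVX1 cases the same construction with VecNumElts = 4, SubVecNumElts = 2, IdxVal = 2, ExtIdxVal = 2 yields [0,1,6,7], matching the vblendpd ymm0 = ymm0[0,1],ymm1[2,3] in the avx-vperm2x128.ll test.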