From 5b6f1112aa0724b6efbd64a8d2275e647ab4a038 Mon Sep 17 00:00:00 2001
From: Craig Topper
Date: Thu, 2 Feb 2017 22:02:57 +0000
Subject: [PATCH] [X86] Move turning 256-bit INSERT_SUBVECTORS into BLENDI from
 legalize to DAG combine.

On one test this seems to have given DAG combine more of a chance to perform
other INSERT_SUBVECTOR/EXTRACT_SUBVECTOR combines before the BLENDI was
created.

It looks like we can still improve this further by teaching DAG combine to
optimize INSERT_SUBVECTOR/EXTRACT_SUBVECTOR together with BLENDI.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@293944 91177308-0d34-0410-b5e6-96231b3b80d8
---
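
Note (kept below the --- cut, so it is not part of the commit message): a rough
sketch of the combine this patch moves, written in the same shorthand the
comments in X86ISelLowering.cpp use for such folds. Only the v8f32/v8i32 case
is shown; for f64 elements the immediate is 0x03, and integer types without
AVX2 are bitcast to v8f32 so a (wrong-domain) vblendps is used instead of a
wrong-domain vinsertf128.

  // (insert_subvector Vec:v8f32, Sub:v4f32, 0)
  //   --> (X86ISD::BLENDI Vec, (insert_subvector undef:v8f32, Sub, 0), 0x0f)
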
 lib/Target/X86/X86ISelLowering.cpp     | 83 ++++++++++++--------
 test/CodeGen/X86/insertelement-zero.ll | 16 ++---
 2 files changed, 45 insertions(+), 54 deletions(-)

diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index bc4c2a842a2..c76164ecd79 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -4917,50 +4917,6 @@ static SDValue insertSubVector(SDValue Result, SDValue Vec, unsigned IdxVal,
 static SDValue insert128BitVector(SDValue Result, SDValue Vec, unsigned IdxVal,
                                   SelectionDAG &DAG, const SDLoc &dl) {
   assert(Vec.getValueType().is128BitVector() && "Unexpected vector size!");
-
-  // For insertion into the zero index (low half) of a 256-bit vector, it is
-  // more efficient to generate a blend with immediate instead of an insert*128.
-  // We are still creating an INSERT_SUBVECTOR below with an undef node to
-  // extend the subvector to the size of the result vector. Make sure that
-  // we are not recursing on that node by checking for undef here.
-  if (IdxVal == 0 && Result.getValueType().is256BitVector() &&
-      !Result.isUndef()) {
-    EVT ResultVT = Result.getValueType();
-    SDValue ZeroIndex = DAG.getIntPtrConstant(0, dl);
-    SDValue Undef = DAG.getUNDEF(ResultVT);
-    SDValue Vec256 = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResultVT, Undef,
-                                 Vec, ZeroIndex);
-
-    // The blend instruction, and therefore its mask, depend on the data type.
-    MVT ScalarType = ResultVT.getVectorElementType().getSimpleVT();
-    if (ScalarType.isFloatingPoint()) {
-      // Choose either vblendps (float) or vblendpd (double).
-      unsigned ScalarSize = ScalarType.getSizeInBits();
-      assert((ScalarSize == 64 || ScalarSize == 32) && "Unknown float type");
-      unsigned MaskVal = (ScalarSize == 64) ? 0x03 : 0x0f;
-      SDValue Mask = DAG.getConstant(MaskVal, dl, MVT::i8);
-      return DAG.getNode(X86ISD::BLENDI, dl, ResultVT, Result, Vec256, Mask);
-    }
-
-    const X86Subtarget &Subtarget =
-        static_cast<const X86Subtarget &>(DAG.getSubtarget());
-
-    // AVX2 is needed for 256-bit integer blend support.
-    // Integers must be cast to 32-bit because there is only vpblendd;
-    // vpblendw can't be used for this because it has a handicapped mask.
-
-    // If we don't have AVX2, then cast to float. Using a wrong domain blend
-    // is still more efficient than using the wrong domain vinsertf128 that
-    // will be created by InsertSubVector().
-    MVT CastVT = Subtarget.hasAVX2() ? MVT::v8i32 : MVT::v8f32;
-
-    SDValue Mask = DAG.getConstant(0x0f, dl, MVT::i8);
-    Result = DAG.getBitcast(CastVT, Result);
-    Vec256 = DAG.getBitcast(CastVT, Vec256);
-    Vec256 = DAG.getNode(X86ISD::BLENDI, dl, CastVT, Result, Vec256, Mask);
-    return DAG.getBitcast(ResultVT, Vec256);
-  }
-
   return insertSubVector(Result, Vec, IdxVal, DAG, dl, 128);
 }
 
@@ -34165,6 +34121,45 @@ static SDValue combineInsertSubvector(SDNode *N, SelectionDAG &DAG,
   MVT OpVT = N->getSimpleValueType(0);
   MVT SubVecVT = SubVec.getSimpleValueType();
 
+  // For insertion into the zero index (low half) of a 256-bit vector, it is
+  // more efficient to generate a blend with immediate instead of an insert*128.
+  // We are still creating an INSERT_SUBVECTOR below with an undef node to
+  // extend the subvector to the size of the result vector. Make sure that
+  // we are not recursing on that node by checking for undef here.
+  if (IdxVal == 0 && OpVT.is256BitVector() && SubVecVT.is128BitVector() &&
+      !Vec.isUndef()) {
+    SDValue ZeroIndex = DAG.getIntPtrConstant(0, dl);
+    SDValue Undef = DAG.getUNDEF(OpVT);
+    SDValue Vec256 = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT, Undef,
+                                 SubVec, ZeroIndex);
+
+    // The blend instruction, and therefore its mask, depend on the data type.
+    MVT ScalarType = OpVT.getVectorElementType();
+    if (ScalarType.isFloatingPoint()) {
+      // Choose either vblendps (float) or vblendpd (double).
+      unsigned ScalarSize = ScalarType.getSizeInBits();
+      assert((ScalarSize == 64 || ScalarSize == 32) && "Unknown float type");
+      unsigned MaskVal = (ScalarSize == 64) ? 0x03 : 0x0f;
+      SDValue Mask = DAG.getConstant(MaskVal, dl, MVT::i8);
+      return DAG.getNode(X86ISD::BLENDI, dl, OpVT, Vec, Vec256, Mask);
+    }
+
+    // AVX2 is needed for 256-bit integer blend support.
+    // Integers must be cast to 32-bit because there is only vpblendd;
+    // vpblendw can't be used for this because it has a handicapped mask.
+
+    // If we don't have AVX2, then cast to float. Using a wrong domain blend
+    // is still more efficient than using the wrong domain vinsertf128 that
+    // will be created by InsertSubVector().
+    MVT CastVT = Subtarget.hasAVX2() ? MVT::v8i32 : MVT::v8f32;
+
+    SDValue Mask = DAG.getConstant(0x0f, dl, MVT::i8);
+    Vec = DAG.getBitcast(CastVT, Vec);
+    Vec256 = DAG.getBitcast(CastVT, Vec256);
+    Vec256 = DAG.getNode(X86ISD::BLENDI, dl, CastVT, Vec, Vec256, Mask);
+    return DAG.getBitcast(OpVT, Vec256);
+  }
+
   // Fold two 16-byte or 32-byte subvector loads into one 32-byte or 64-byte
   // load:
   // (insert_subvector (insert_subvector undef, (load16 addr), 0),
diff --git a/test/CodeGen/X86/insertelement-zero.ll b/test/CodeGen/X86/insertelement-zero.ll
index 64c586fcb68..3571fa72738 100644
--- a/test/CodeGen/X86/insertelement-zero.ll
+++ b/test/CodeGen/X86/insertelement-zero.ll
@@ -409,9 +409,8 @@ define <16 x i16> @insert_v16i16_z12345z789ABZDEz(<16 x i16> %a) {
 ; AVX1:       # BB#0:
 ; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 ; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm1[0],xmm0[1,2,3,4,5,6,7]
-; AVX1-NEXT:    vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
-; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm0[0,1,2,3,4,5],xmm1[6],xmm0[7]
-; AVX1-NEXT:    vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5],xmm1[6],xmm2[7]
+; AVX1-NEXT:    vblendpd {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3]
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
 ; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3,4,5,6],xmm1[7]
 ; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
@@ -421,8 +420,7 @@ define <16 x i16> @insert_v16i16_z12345z789ABZDEz(<16 x i16> %a) {
 ; AVX2:       # BB#0:
 ; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 ; AVX2-NEXT:    vpblendw {{.*#+}} xmm2 = xmm1[0],xmm0[1,2,3,4,5,6,7]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
-; AVX2-NEXT:    vpblendw {{.*#+}} xmm2 = xmm0[0,1,2,3,4,5],xmm1[6],xmm0[7]
+; AVX2-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5],xmm1[6],xmm2[7]
 ; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
 ; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm2
 ; AVX2-NEXT:    vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3,4,5,6],xmm1[7]
@@ -500,9 +498,8 @@ define <32 x i8> @insert_v32i8_z123456789ABCDEzGHIJKLMNOPQRSTzz(<32 x i8> %a) {
 ; AVX1:       # BB#0:
 ; AVX1-NEXT:    xorl %eax, %eax
 ; AVX1-NEXT:    vpinsrb $0, %eax, %xmm0, %xmm1
-; AVX1-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
-; AVX1-NEXT:    vpinsrb $15, %eax, %xmm0, %xmm1
-; AVX1-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
+; AVX1-NEXT:    vpinsrb $15, %eax, %xmm1, %xmm1
+; AVX1-NEXT:    vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3]
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
 ; AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
 ; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6],xmm2[7]
@@ -513,8 +510,7 @@ define <32 x i8> @insert_v32i8_z123456789ABCDEzGHIJKLMNOPQRSTzz(<32 x i8> %a) {
 ; AVX2:       # BB#0:
 ; AVX2-NEXT:    xorl %eax, %eax
 ; AVX2-NEXT:    vpinsrb $0, %eax, %xmm0, %xmm1
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
-; AVX2-NEXT:    vpinsrb $15, %eax, %xmm0, %xmm1
+; AVX2-NEXT:    vpinsrb $15, %eax, %xmm1, %xmm1
 ; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
 ; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; AVX2-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-- 
2.50.1
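
Aside, not part of the patch: a minimal standalone sketch of the immediate
arithmetic the combine relies on; the helper name below is made up for
illustration and does not exist in the tree. Inserting a 128-bit subvector at
index 0 of a 256-bit vector with NumElts lanes takes the low NumElts/2 lanes
from the widened subvector, so the blend immediate simply sets the low
NumElts/2 bits.

  #include <cassert>
  #include <cstdint>

  // Blend immediate for inserting a 128-bit subvector into the low half of a
  // 256-bit vector with NumElts elements: set the low NumElts/2 bits so the
  // blend takes those lanes from the widened subvector operand.
  static uint8_t lowHalfBlendImm(unsigned NumElts) {
    assert(NumElts == 4 || NumElts == 8);
    return static_cast<uint8_t>((1u << (NumElts / 2)) - 1);
  }

  int main() {
    assert(lowHalfBlendImm(4) == 0x03); // v4f64 -> vblendpd $3
    assert(lowHalfBlendImm(8) == 0x0f); // v8f32/v8i32 -> vblendps/vpblendd $15
    return 0;
  }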