From 97ab1ab89a20a8c1feffcc463567736e0b17b895 Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Fri, 25 Nov 2016 16:48:05 +0000 Subject: [PATCH] [AVX-512] Add support for changing VSHUFF64x2 to VSHUFF32x4 when it's feeding a vselect with 32-bit element size. Summary: Shuffle lowering may have widened the element size of an i32 shuffle to i64 before selecting X86ISD::SHUF128. If this shuffle was used by a vselect, this can prevent us from selecting masked operations. This patch detects this and changes the element size to match the vselect. I don't handle changing integer to floating point or vice versa as it's not clear if it's better to push such a bitcast to the inputs of the shuffle or to the user of the vselect. So I'm ignoring that case for now. Reviewers: delena, zvi, RKSimon Subscribers: llvm-commits Differential Revision: https://reviews.llvm.org/D27087 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@287939 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Target/X86/X86ISelLowering.cpp | 34 ++++++++++++++++------ test/CodeGen/X86/vector-shuffle-512-v16.ll | 16 +++++----- 2 files changed, 33 insertions(+), 17 deletions(-) diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp index 44eae351529..2074456bf8e 100644 --- a/lib/Target/X86/X86ISelLowering.cpp +++ b/lib/Target/X86/X86ISelLowering.cpp @@ -27834,11 +27834,24 @@ static bool combineBitcastForMaskedOp(SDValue OrigOp, SelectionDAG &DAG, MVT EltVT = VT.getVectorElementType(); SDLoc DL(Op.getNode()); - switch (Op.getOpcode()) { + auto BitcastAndCombineShuffle = [&](unsigned Opcode, SDValue Op0, SDValue Op1, + SDValue Op2) { + Op0 = DAG.getBitcast(VT, Op0); + DCI.AddToWorklist(Op0.getNode()); + Op1 = DAG.getBitcast(VT, Op1); + DCI.AddToWorklist(Op1.getNode()); + DCI.CombineTo(OrigOp.getNode(), + DAG.getNode(Opcode, DL, VT, Op0, Op1, Op2)); + return true; + }; + + unsigned Opcode = Op.getOpcode(); + switch (Opcode) { case X86ISD::PALIGNR: // PALIGNR can be converted to VALIGND/Q 
for 128-bit vectors. if (!VT.is128BitVector()) return false; + Opcode = X86ISD::VALIGN; LLVM_FALLTHROUGH; case X86ISD::VALIGN: { if (EltVT != MVT::i32 && EltVT != MVT::i64) @@ -27851,14 +27864,17 @@ static bool combineBitcastForMaskedOp(SDValue OrigOp, SelectionDAG &DAG, if ((ShiftAmt % EltSize) != 0) return false; Imm = ShiftAmt / EltSize; - SDValue Op0 = DAG.getBitcast(VT, Op.getOperand(0)); - DCI.AddToWorklist(Op0.getNode()); - SDValue Op1 = DAG.getBitcast(VT, Op.getOperand(1)); - DCI.AddToWorklist(Op1.getNode()); - DCI.CombineTo(OrigOp.getNode(), - DAG.getNode(X86ISD::VALIGN, DL, VT, Op0, Op1, - DAG.getConstant(Imm, DL, MVT::i8))); - return true; + return BitcastAndCombineShuffle(Opcode, Op.getOperand(0), Op.getOperand(1), + DAG.getConstant(Imm, DL, MVT::i8)); + } + case X86ISD::SHUF128: { + if (EltVT.getSizeInBits() != 32 && EltVT.getSizeInBits() != 64) + return false; + // Only change element size, not type. + if (VT.isInteger() != Op.getSimpleValueType().isInteger()) + return false; + return BitcastAndCombineShuffle(Opcode, Op.getOperand(0), Op.getOperand(1), + Op.getOperand(2)); } } diff --git a/test/CodeGen/X86/vector-shuffle-512-v16.ll b/test/CodeGen/X86/vector-shuffle-512-v16.ll index 66442595054..d2077786d6e 100644 --- a/test/CodeGen/X86/vector-shuffle-512-v16.ll +++ b/test/CodeGen/X86/vector-shuffle-512-v16.ll @@ -431,16 +431,16 @@ define <16 x float> @test_vshuff32x4_512_mask(<16 x float> %x, <16 x float> %x1, ; AVX512F-NEXT: vpmovsxbd %xmm3, %zmm3 ; AVX512F-NEXT: vpslld $31, %zmm3, %zmm3 ; AVX512F-NEXT: vptestmd %zmm3, %zmm3, %k1 -; AVX512F-NEXT: vshuff64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm1[2,3,0,1] -; AVX512F-NEXT: vblendmps %zmm0, %zmm2, %zmm0 {%k1} +; AVX512F-NEXT: vshuff32x4 {{.*#+}} zmm2 {%k1} = zmm0[0,1,2,3,4,5,6,7],zmm1[4,5,6,7,0,1,2,3] +; AVX512F-NEXT: vmovaps %zmm2, %zmm0 ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: test_vshuff32x4_512_mask: ; AVX512BW: # BB#0: ; AVX512BW-NEXT: vpsllw $7, %xmm3, %xmm3 ; AVX512BW-NEXT: vpmovb2m %zmm3, %k1 -; 
AVX512BW-NEXT: vshuff64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm1[2,3,0,1] -; AVX512BW-NEXT: vblendmps %zmm0, %zmm2, %zmm0 {%k1} +; AVX512BW-NEXT: vshuff32x4 {{.*#+}} zmm2 {%k1} = zmm0[0,1,2,3,4,5,6,7],zmm1[4,5,6,7,0,1,2,3] +; AVX512BW-NEXT: vmovaps %zmm2, %zmm0 ; AVX512BW-NEXT: retq %x2 = shufflevector <16 x float> %x, <16 x float> %x1, <16 x i32> %res = select <16 x i1> %mask, <16 x float> %x2, <16 x float> %y @@ -453,16 +453,16 @@ define <16 x i32> @test_vshufi32x4_512_mask(<16 x i32> %x, <16 x i32> %x1, <16 x ; AVX512F-NEXT: vpmovsxbd %xmm3, %zmm3 ; AVX512F-NEXT: vpslld $31, %zmm3, %zmm3 ; AVX512F-NEXT: vptestmd %zmm3, %zmm3, %k1 -; AVX512F-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm1[2,3,0,1] -; AVX512F-NEXT: vpblendmd %zmm0, %zmm2, %zmm0 {%k1} +; AVX512F-NEXT: vshufi32x4 {{.*#+}} zmm2 {%k1} = zmm0[0,1,2,3,4,5,6,7],zmm1[4,5,6,7,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm0 ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: test_vshufi32x4_512_mask: ; AVX512BW: # BB#0: ; AVX512BW-NEXT: vpsllw $7, %xmm3, %xmm3 ; AVX512BW-NEXT: vpmovb2m %zmm3, %k1 -; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm1[2,3,0,1] -; AVX512BW-NEXT: vpblendmd %zmm0, %zmm2, %zmm0 {%k1} +; AVX512BW-NEXT: vshufi32x4 {{.*#+}} zmm2 {%k1} = zmm0[0,1,2,3,4,5,6,7],zmm1[4,5,6,7,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm0 ; AVX512BW-NEXT: retq %x2 = shufflevector <16 x i32> %x, <16 x i32> %x1, <16 x i32> %res = select <16 x i1> %mask, <16 x i32> %x2, <16 x i32> %y -- 2.50.1