From: Simon Pilgrim
Date: Tue, 22 Nov 2016 17:50:06 +0000 (+0000)
Subject: [X86][SSE] Combine UNPCKL(FHADD,FHADD) -> FHADD for v2f64 shuffles.
X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=fcc1f76b4d065552630b28ec1d7959b692680c18;p=llvm

[X86][SSE] Combine UNPCKL(FHADD,FHADD) -> FHADD for v2f64 shuffles.

This occurs during UINT_TO_FP v2f64 lowering.

We can easily generalize this to other horizontal ops (FHSUB, PACKSS, PACKUS) as
required - we already do something similar with PACKUS in lowerV2I64VectorShuffle.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@287676 91177308-0d34-0410-b5e6-96231b3b80d8
---

diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index f7834061824..0a4fa41954f 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -26601,6 +26601,17 @@ static SDValue combineTargetShuffle(SDValue N, SelectionDAG &DAG,
     assert(Mask.size() == 4);
     break;
   case X86ISD::UNPCKL: {
+    auto Op0 = N.getOperand(0);
+    auto Op1 = N.getOperand(1);
+    unsigned Opcode0 = Op0.getOpcode();
+    unsigned Opcode1 = Op1.getOpcode();
+
+    // Combine X86ISD::UNPCKL with 2 X86ISD::FHADD inputs into a single
+    // X86ISD::FHADD. This is generated by UINT_TO_FP v2f64 scalarization.
+    // TODO: Add other horizontal operations as required.
+    if (VT == MVT::v2f64 && Opcode0 == Opcode1 && Opcode0 == X86ISD::FHADD)
+      return DAG.getNode(Opcode0, DL, VT, Op0.getOperand(0), Op1.getOperand(0));
+
     // Combine X86ISD::UNPCKL and ISD::VECTOR_SHUFFLE into X86ISD::UNPCKH, in
     // which X86ISD::UNPCKL has a ISD::UNDEF operand, and ISD::VECTOR_SHUFFLE
     // moves upper half elements into the lower half part. For example:
@@ -26618,9 +26629,7 @@ static SDValue combineTargetShuffle(SDValue N, SelectionDAG &DAG,
     if (!VT.is128BitVector())
       return SDValue();
 
-    auto Op0 = N.getOperand(0);
-    auto Op1 = N.getOperand(1);
-    if (Op0.isUndef() && Op1.getNode()->getOpcode() == ISD::VECTOR_SHUFFLE) {
+    if (Op0.isUndef() && Opcode1 == ISD::VECTOR_SHUFFLE) {
       ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(Op1.getNode())->getMask();
 
       unsigned NumElts = VT.getVectorNumElements();
diff --git a/test/CodeGen/X86/vec_int_to_fp.ll b/test/CodeGen/X86/vec_int_to_fp.ll
index 2094a213e4b..303971643c4 100644
--- a/test/CodeGen/X86/vec_int_to_fp.ll
+++ b/test/CodeGen/X86/vec_int_to_fp.ll
@@ -425,12 +425,10 @@ define <2 x double> @uitofp_2i64_to_2f64(<2 x i64> %a) {
 ; VEX-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
 ; VEX-NEXT: vmovapd {{.*#+}} xmm3 = [4.503600e+15,1.934281e+25]
 ; VEX-NEXT: vsubpd %xmm3, %xmm2, %xmm2
-; VEX-NEXT: vhaddpd %xmm2, %xmm2, %xmm2
 ; VEX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
 ; VEX-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
 ; VEX-NEXT: vsubpd %xmm3, %xmm0, %xmm0
-; VEX-NEXT: vhaddpd %xmm0, %xmm0, %xmm0
-; VEX-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm2[0],xmm0[0]
+; VEX-NEXT: vhaddpd %xmm0, %xmm2, %xmm0
 ; VEX-NEXT: retq
 ;
 ; AVX512-LABEL: uitofp_2i64_to_2f64:
@@ -471,12 +469,10 @@ define <2 x double> @uitofp_2i32_to_2f64(<4 x i32> %a) {
 ; VEX-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
 ; VEX-NEXT: vmovapd {{.*#+}} xmm3 = [4.503600e+15,1.934281e+25]
 ; VEX-NEXT: vsubpd %xmm3, %xmm2, %xmm2
-; VEX-NEXT: vhaddpd %xmm2, %xmm2, %xmm2
 ; VEX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
 ; VEX-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
 ; VEX-NEXT: vsubpd %xmm3, %xmm0, %xmm0
-; VEX-NEXT: vhaddpd %xmm0, %xmm0, %xmm0
-; VEX-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm2[0],xmm0[0]
+; VEX-NEXT: vhaddpd %xmm0, %xmm2, %xmm0
 ; VEX-NEXT: retq
 ;
 ; AVX512F-LABEL: uitofp_2i32_to_2f64:
@@ -699,20 +695,16 @@ define <4 x double> @uitofp_4i64_to_4f64(<4 x i64> %a) {
 ; AVX1-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
 ; AVX1-NEXT: vmovapd {{.*#+}} xmm4 = [4.503600e+15,1.934281e+25]
 ; AVX1-NEXT: vsubpd %xmm4, %xmm3, %xmm3
-; AVX1-NEXT: vhaddpd %xmm3, %xmm3, %xmm3
 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
 ; AVX1-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
 ; AVX1-NEXT: vsubpd %xmm4, %xmm1, %xmm1
-; AVX1-NEXT: vhaddpd %xmm1, %xmm1, %xmm1
-; AVX1-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm3[0],xmm1[0]
+; AVX1-NEXT: vhaddpd %xmm1, %xmm3, %xmm1
 ; AVX1-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
 ; AVX1-NEXT: vsubpd %xmm4, %xmm3, %xmm3
-; AVX1-NEXT: vhaddpd %xmm3, %xmm3, %xmm3
 ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
 ; AVX1-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
 ; AVX1-NEXT: vsubpd %xmm4, %xmm0, %xmm0
-; AVX1-NEXT: vhaddpd %xmm0, %xmm0, %xmm0
-; AVX1-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm3[0],xmm0[0]
+; AVX1-NEXT: vhaddpd %xmm0, %xmm3, %xmm0
 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
 ; AVX1-NEXT: retq
 ;
@@ -723,20 +715,16 @@ define <4 x double> @uitofp_4i64_to_4f64(<4 x i64> %a) {
 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
 ; AVX2-NEXT: vmovapd {{.*#+}} xmm4 = [4.503600e+15,1.934281e+25]
 ; AVX2-NEXT: vsubpd %xmm4, %xmm3, %xmm3
-; AVX2-NEXT: vhaddpd %xmm3, %xmm3, %xmm3
 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
 ; AVX2-NEXT: vsubpd %xmm4, %xmm1, %xmm1
-; AVX2-NEXT: vhaddpd %xmm1, %xmm1, %xmm1
-; AVX2-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm3[0],xmm1[0]
+; AVX2-NEXT: vhaddpd %xmm1, %xmm3, %xmm1
 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
 ; AVX2-NEXT: vsubpd %xmm4, %xmm3, %xmm3
-; AVX2-NEXT: vhaddpd %xmm3, %xmm3, %xmm3
 ; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
 ; AVX2-NEXT: vsubpd %xmm4, %xmm0, %xmm0
-; AVX2-NEXT: vhaddpd %xmm0, %xmm0, %xmm0
-; AVX2-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm3[0],xmm0[0]
+; AVX2-NEXT: vhaddpd %xmm0, %xmm3, %xmm0
 ; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
 ; AVX2-NEXT: retq
 ;
@@ -2456,12 +2444,10 @@ define <2 x double> @uitofp_load_2i64_to_2f64(<2 x i64> *%a) {
 ; VEX-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
 ; VEX-NEXT: vmovapd {{.*#+}} xmm3 = [4.503600e+15,1.934281e+25]
 ; VEX-NEXT: vsubpd %xmm3, %xmm2, %xmm2
-; VEX-NEXT: vhaddpd %xmm2, %xmm2, %xmm2
 ; VEX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
 ; VEX-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
 ; VEX-NEXT: vsubpd %xmm3, %xmm0, %xmm0
-; VEX-NEXT: vhaddpd %xmm0, %xmm0, %xmm0
-; VEX-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm2[0],xmm0[0]
+; VEX-NEXT: vhaddpd %xmm0, %xmm2, %xmm0
 ; VEX-NEXT: retq
 ;
 ; AVX512F-LABEL: uitofp_load_2i64_to_2f64:
@@ -2515,12 +2501,10 @@ define <2 x double> @uitofp_load_2i32_to_2f64(<2 x i32> *%a) {
 ; VEX-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
 ; VEX-NEXT: vmovapd {{.*#+}} xmm3 = [4.503600e+15,1.934281e+25]
 ; VEX-NEXT: vsubpd %xmm3, %xmm2, %xmm2
-; VEX-NEXT: vhaddpd %xmm2, %xmm2, %xmm2
 ; VEX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
 ; VEX-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
 ; VEX-NEXT: vsubpd %xmm3, %xmm0, %xmm0
-; VEX-NEXT: vhaddpd %xmm0, %xmm0, %xmm0
-; VEX-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm2[0],xmm0[0]
+; VEX-NEXT: vhaddpd %xmm0, %xmm2, %xmm0
 ; VEX-NEXT: retq
 ;
 ; AVX512F-LABEL: uitofp_load_2i32_to_2f64:
@@ -2652,20 +2636,16 @@ define <4 x double> @uitofp_load_4i64_to_4f64(<4 x i64> *%a) {
 ; AVX1-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
 ; AVX1-NEXT: vmovapd {{.*#+}} xmm4 = [4.503600e+15,1.934281e+25]
 ; AVX1-NEXT: vsubpd %xmm4, %xmm3, %xmm3
-; AVX1-NEXT: vhaddpd %xmm3, %xmm3, %xmm3
 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
 ; AVX1-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
 ; AVX1-NEXT: vsubpd %xmm4, %xmm1, %xmm1
-; AVX1-NEXT: vhaddpd %xmm1, %xmm1, %xmm1
-; AVX1-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm3[0],xmm1[0]
+; AVX1-NEXT: vhaddpd %xmm1, %xmm3, %xmm1
 ; AVX1-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
 ; AVX1-NEXT: vsubpd %xmm4, %xmm3, %xmm3
-; AVX1-NEXT: vhaddpd %xmm3, %xmm3, %xmm3
 ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
 ; AVX1-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
 ; AVX1-NEXT: vsubpd %xmm4, %xmm0, %xmm0
-; AVX1-NEXT: vhaddpd %xmm0, %xmm0, %xmm0
-; AVX1-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm3[0],xmm0[0]
+; AVX1-NEXT: vhaddpd %xmm0, %xmm3, %xmm0
 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
 ; AVX1-NEXT: retq
 ;
@@ -2677,20 +2657,16 @@ define <4 x double> @uitofp_load_4i64_to_4f64(<4 x i64> *%a) {
 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
 ; AVX2-NEXT: vmovapd {{.*#+}} xmm4 = [4.503600e+15,1.934281e+25]
 ; AVX2-NEXT: vsubpd %xmm4, %xmm3, %xmm3
-; AVX2-NEXT: vhaddpd %xmm3, %xmm3, %xmm3
 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
 ; AVX2-NEXT: vsubpd %xmm4, %xmm1, %xmm1
-; AVX2-NEXT: vhaddpd %xmm1, %xmm1, %xmm1
-; AVX2-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm3[0],xmm1[0]
+; AVX2-NEXT: vhaddpd %xmm1, %xmm3, %xmm1
 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
 ; AVX2-NEXT: vsubpd %xmm4, %xmm3, %xmm3
-; AVX2-NEXT: vhaddpd %xmm3, %xmm3, %xmm3
 ; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
 ; AVX2-NEXT: vsubpd %xmm4, %xmm0, %xmm0
-; AVX2-NEXT: vhaddpd %xmm0, %xmm0, %xmm0
-; AVX2-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm3[0],xmm0[0]
+; AVX2-NEXT: vhaddpd %xmm0, %xmm3, %xmm0
 ; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
 ; AVX2-NEXT: retq
 ;
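
For reference, a minimal standalone sketch (not part of this commit) of the identity the new combine relies on, written with SSE3 intrinsics; the variable names and sample values are illustrative only. Each vhaddpd in the old code was a single-input horizontal add whose result was only consumed in lane 0, so unpcklpd of the two results computes the same vector as one two-input horizontal add:

// Illustrative sketch only; build with an SSE3-capable compiler (e.g. -msse3).
#include <immintrin.h>
#include <cstdio>

int main() {
  __m128d a = _mm_set_pd(2.0, 1.0);   // a = {1.0, 2.0}
  __m128d b = _mm_set_pd(40.0, 30.0); // b = {30.0, 40.0}

  // Old sequence: two single-input horizontal adds, then unpcklpd keeps
  // lane 0 of each result: {a0+a1, b0+b1}.
  __m128d ha = _mm_hadd_pd(a, a);
  __m128d hb = _mm_hadd_pd(b, b);
  __m128d unpck = _mm_unpacklo_pd(ha, hb);

  // New sequence: a single two-input horizontal add produces the same lanes.
  __m128d hadd = _mm_hadd_pd(a, b);

  double u[2], h[2];
  _mm_storeu_pd(u, unpck);
  _mm_storeu_pd(h, hadd);
  printf("unpcklpd(hadd(a,a), hadd(b,b)) = {%g, %g}\n", u[0], u[1]);
  printf("hadd(a,b)                      = {%g, %g}\n", h[0], h[1]);
  return 0;
}

Both vectors come out as {3, 70}, which is why the vhaddpd/vhaddpd/vunpcklpd triples in the CHECK lines above collapse into a single vhaddpd.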