[DAGCombiner] narrow vector binop with 2 insert subvector operands

author Sanjay Patel <spatel@rotateright.com>

Tue, 22 Jan 2019 14:24:13 +0000 (14:24 +0000)

committer Sanjay Patel <spatel@rotateright.com>

Tue, 22 Jan 2019 14:24:13 +0000 (14:24 +0000)
author Sanjay Patel <spatel@rotateright.com>
Tue, 22 Jan 2019 14:24:13 +0000 (14:24 +0000)
committer Sanjay Patel <spatel@rotateright.com>
Tue, 22 Jan 2019 14:24:13 +0000 (14:24 +0000)
diff --git a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp

index 3437f0debae14eae5b22e8f12ff8ba438515bd90..b440bbe29fdc580a677168ee2049875f444b16b5 100644 (file)
--- a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -18174,6 +18174,7 @@ SDValue DAGCombiner::SimplifyVBinOp(SDNode *N) {
    SDValue LHS = N->getOperand(0);
    SDValue RHS = N->getOperand(1);
    SDValue Ops[] = {LHS, RHS};
+  EVT VT = N->getValueType(0);
  
    // See if we can constant fold the vector operation.
    if (SDValue Fold = DAG.FoldConstantVectorArithmetic(
@@ -18191,7 +18192,6 @@ SDValue DAGCombiner::SimplifyVBinOp(SDNode *N) {
      ShuffleVectorSDNode *SVN1 = cast<ShuffleVectorSDNode>(RHS);
  
      if (SVN0->getMask().equals(SVN1->getMask())) {
-      EVT VT = N->getValueType(0);
        SDValue UndefVector = LHS.getOperand(1);
        SDValue NewBinOp = DAG.getNode(N->getOpcode(), SDLoc(N), VT,
                                       LHS.getOperand(0), RHS.getOperand(0),
@@ -18202,6 +18202,29 @@ SDValue DAGCombiner::SimplifyVBinOp(SDNode *N) {
      }
    }
  
+  // The following pattern is likely to emerge with vector reduction ops. Moving
+  // the binary operation ahead of insertion may allow using a narrower vector
+  // instruction that has better performance than the wide version of the op:
+  // VBinOp (ins undef, X, Z), (ins undef, Y, Z) --> ins VecC, (VBinOp X, Y), Z
+  if (LHS.getOpcode() == ISD::INSERT_SUBVECTOR && LHS.getOperand(0).isUndef() &&
+      RHS.getOpcode() == ISD::INSERT_SUBVECTOR && RHS.getOperand(0).isUndef() &&
+      LHS.getOperand(2) == RHS.getOperand(2) &&
+      (LHS.hasOneUse() || RHS.hasOneUse())) {
+    SDValue X = LHS.getOperand(1);
+    SDValue Y = RHS.getOperand(1);
+    SDValue Z = LHS.getOperand(2);
+    EVT NarrowVT = X.getValueType();
+    if (NarrowVT == Y.getValueType() &&
+        TLI.isOperationLegalOrCustomOrPromote(N->getOpcode(), NarrowVT)) {
+      // (binop undef, undef) may not return undef, so compute that result.
+      SDLoc DL(N);
+      SDValue VecC = DAG.getNode(N->getOpcode(), DL, VT, DAG.getUNDEF(VT),
+                                 DAG.getUNDEF(VT));
+      SDValue NarrowBO = DAG.getNode(N->getOpcode(), DL, NarrowVT, X, Y);
+      return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, VecC, NarrowBO, Z);
+    }
+  }
+
    return SDValue();
  }
  
diff --git a/test/CodeGen/X86/avx512-hadd-hsub.ll b/test/CodeGen/X86/avx512-hadd-hsub.ll

index 00063521c6d13745d92343b4f9edc804893699ce..0979657e0f6c50bdb7a83722e4e91c08db3be294 100644 (file)
--- a/test/CodeGen/X86/avx512-hadd-hsub.ll
+++ b/test/CodeGen/X86/avx512-hadd-hsub.ll
@@ -111,14 +111,14 @@ define <16 x i32> @hadd_16_3(<16 x i32> %x225, <16 x i32> %x227) {
  ; KNL:       # %bb.0:
  ; KNL-NEXT:    vshufps {{.*#+}} ymm2 = ymm0[0,2],ymm1[0,2],ymm0[4,6],ymm1[4,6]
  ; KNL-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[1,3],ymm1[1,3],ymm0[5,7],ymm1[5,7]
-; KNL-NEXT:    vpaddd %zmm0, %zmm2, %zmm0
+; KNL-NEXT:    vpaddd %ymm0, %ymm2, %ymm0
  ; KNL-NEXT:    retq
  ;
  ; SKX-LABEL: hadd_16_3:
  ; SKX:       # %bb.0:
  ; SKX-NEXT:    vshufps {{.*#+}} ymm2 = ymm0[0,2],ymm1[0,2],ymm0[4,6],ymm1[4,6]
  ; SKX-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[1,3],ymm1[1,3],ymm0[5,7],ymm1[5,7]
-; SKX-NEXT:    vpaddd %zmm0, %zmm2, %zmm0
+; SKX-NEXT:    vpaddd %ymm0, %ymm2, %ymm0
  ; SKX-NEXT:    retq
    %x226 = shufflevector <16 x i32> %x225, <16 x i32> %x227, <16 x i32> <i32 0, i32 2, i32 16, i32 18
  , i32 4, i32 6, i32 20, i32 22, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
@@ -134,14 +134,14 @@ define <16 x float> @fhadd_16_3(<16 x float> %x225, <16 x float> %x227) {
  ; KNL:       # %bb.0:
  ; KNL-NEXT:    vshufps {{.*#+}} ymm2 = ymm0[0,2],ymm1[0,2],ymm0[4,6],ymm1[4,6]
  ; KNL-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[1,3],ymm1[1,3],ymm0[5,7],ymm1[5,7]
-; KNL-NEXT:    vaddps %zmm0, %zmm2, %zmm0
+; KNL-NEXT:    vaddps %ymm0, %ymm2, %ymm0
  ; KNL-NEXT:    retq
  ;
  ; SKX-LABEL: fhadd_16_3:
  ; SKX:       # %bb.0:
  ; SKX-NEXT:    vshufps {{.*#+}} ymm2 = ymm0[0,2],ymm1[0,2],ymm0[4,6],ymm1[4,6]
  ; SKX-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[1,3],ymm1[1,3],ymm0[5,7],ymm1[5,7]
-; SKX-NEXT:    vaddps %zmm0, %zmm2, %zmm0
+; SKX-NEXT:    vaddps %ymm0, %ymm2, %ymm0
  ; SKX-NEXT:    retq
    %x226 = shufflevector <16 x float> %x225, <16 x float> %x227, <16 x i32> <i32 0, i32 2, i32 16, i32 18
  , i32 4, i32 6, i32 20, i32 22, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
@@ -156,14 +156,14 @@ define <8 x double> @fhadd_16_4(<8 x double> %x225, <8 x double> %x227) {
  ; KNL:       # %bb.0:
  ; KNL-NEXT:    vunpcklpd {{.*#+}} ymm2 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
  ; KNL-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
-; KNL-NEXT:    vaddpd %zmm0, %zmm2, %zmm0
+; KNL-NEXT:    vaddpd %ymm0, %ymm2, %ymm0
  ; KNL-NEXT:    retq
  ;
  ; SKX-LABEL: fhadd_16_4:
  ; SKX:       # %bb.0:
  ; SKX-NEXT:    vunpcklpd {{.*#+}} ymm2 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
  ; SKX-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
-; SKX-NEXT:    vaddpd %zmm0, %zmm2, %zmm0
+; SKX-NEXT:    vaddpd %ymm0, %ymm2, %ymm0
  ; SKX-NEXT:    retq
    %x226 = shufflevector <8 x double> %x225, <8 x double> %x227, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 undef, i32 undef, i32 undef, i32 undef>
    %x228 = shufflevector <8 x double> %x225, <8 x double> %x227, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 undef ,i32 undef, i32 undef, i32 undef>
diff --git a/test/CodeGen/X86/scalarize-fp.ll b/test/CodeGen/X86/scalarize-fp.ll

index d9606b8735254ac3aff05e2555fcf888e0f3d1dd..5cf526dbff5388c9e0e48c8090c08d094ef027a1 100644 (file)
--- a/test/CodeGen/X86/scalarize-fp.ll
+++ b/test/CodeGen/X86/scalarize-fp.ll
@@ -198,9 +198,8 @@ define <4 x double> @fadd_op1_constant_v4f64(double %x) nounwind {
  ;
  ; AVX-LABEL: fadd_op1_constant_v4f64:
  ; AVX:       # %bb.0:
-; AVX-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
  ; AVX-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
-; AVX-NEXT:    vaddpd %ymm1, %ymm0, %ymm0
+; AVX-NEXT:    vaddpd %xmm1, %xmm0, %xmm0
  ; AVX-NEXT:    retq
    %v = insertelement <4 x double> undef, double %x, i32 0
    %b = fadd <4 x double> %v, <double 42.0, double undef, double undef, double undef>
@@ -219,7 +218,7 @@ define <4 x double> @load_fadd_op1_constant_v4f64(double* %p) nounwind {
  ; AVX:       # %bb.0:
  ; AVX-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
  ; AVX-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
-; AVX-NEXT:    vaddpd %ymm1, %ymm0, %ymm0
+; AVX-NEXT:    vaddpd %xmm1, %xmm0, %xmm0
  ; AVX-NEXT:    retq
    %x = load double, double* %p
    %v = insertelement <4 x double> undef, double %x, i32 0
@@ -237,9 +236,8 @@ define <4 x double> @fsub_op0_constant_v4f64(double %x) nounwind {
  ;
  ; AVX-LABEL: fsub_op0_constant_v4f64:
  ; AVX:       # %bb.0:
-; AVX-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
  ; AVX-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
-; AVX-NEXT:    vsubpd %ymm0, %ymm1, %ymm0
+; AVX-NEXT:    vsubpd %xmm0, %xmm1, %xmm0
  ; AVX-NEXT:    retq
    %v = insertelement <4 x double> undef, double %x, i32 0
    %b = fsub <4 x double> <double 42.0, double undef, double undef, double undef>, %v
@@ -258,7 +256,7 @@ define <4 x double> @load_fsub_op0_constant_v4f64(double* %p) nounwind {
  ; AVX:       # %bb.0:
  ; AVX-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
  ; AVX-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
-; AVX-NEXT:    vsubpd %ymm0, %ymm1, %ymm0
+; AVX-NEXT:    vsubpd %xmm0, %xmm1, %xmm0
  ; AVX-NEXT:    retq
    %x = load double, double* %p
    %v = insertelement <4 x double> undef, double %x, i32 0
@@ -275,9 +273,8 @@ define <4 x double> @fmul_op1_constant_v4f64(double %x) nounwind {
  ;
  ; AVX-LABEL: fmul_op1_constant_v4f64:
  ; AVX:       # %bb.0:
-; AVX-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
  ; AVX-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
-; AVX-NEXT:    vmulpd %ymm1, %ymm0, %ymm0
+; AVX-NEXT:    vmulpd %xmm1, %xmm0, %xmm0
  ; AVX-NEXT:    retq
    %v = insertelement <4 x double> undef, double %x, i32 0
    %b = fmul <4 x double> %v, <double 42.0, double undef, double undef, double undef>
@@ -296,7 +293,7 @@ define <4 x double> @load_fmul_op1_constant_v4f64(double* %p) nounwind {
  ; AVX:       # %bb.0:
  ; AVX-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
  ; AVX-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
-; AVX-NEXT:    vmulpd %ymm1, %ymm0, %ymm0
+; AVX-NEXT:    vmulpd %xmm1, %xmm0, %xmm0
  ; AVX-NEXT:    retq
    %x = load double, double* %p
    %v = insertelement <4 x double> undef, double %x, i32 0
@@ -313,9 +310,8 @@ define <4 x double> @fdiv_op1_constant_v4f64(double %x) nounwind {
  ;
  ; AVX-LABEL: fdiv_op1_constant_v4f64:
  ; AVX:       # %bb.0:
-; AVX-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
  ; AVX-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
-; AVX-NEXT:    vdivpd %ymm1, %ymm0, %ymm0
+; AVX-NEXT:    vdivpd %xmm1, %xmm0, %xmm0
  ; AVX-NEXT:    retq
    %v = insertelement <4 x double> undef, double %x, i32 0
    %b = fdiv <4 x double> %v, <double 42.0, double undef, double undef, double undef>
@@ -334,7 +330,7 @@ define <4 x double> @load_fdiv_op1_constant_v4f64(double* %p) nounwind {
  ; AVX:       # %bb.0:
  ; AVX-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
  ; AVX-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
-; AVX-NEXT:    vdivpd %ymm1, %ymm0, %ymm0
+; AVX-NEXT:    vdivpd %xmm1, %xmm0, %xmm0
  ; AVX-NEXT:    retq
    %x = load double, double* %p
    %v = insertelement <4 x double> undef, double %x, i32 0
@@ -352,9 +348,8 @@ define <4 x double> @fdiv_op0_constant_v4f64(double %x) nounwind {
  ;
  ; AVX-LABEL: fdiv_op0_constant_v4f64:
  ; AVX:       # %bb.0:
-; AVX-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
  ; AVX-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
-; AVX-NEXT:    vdivpd %ymm0, %ymm1, %ymm0
+; AVX-NEXT:    vdivpd %xmm0, %xmm1, %xmm0
  ; AVX-NEXT:    retq
    %v = insertelement <4 x double> undef, double %x, i32 0
    %b = fdiv <4 x double> <double 42.0, double undef, double undef, double undef>, %v
@@ -373,7 +368,7 @@ define <4 x double> @load_fdiv_op0_constant_v4f64(double* %p) nounwind {
  ; AVX:       # %bb.0:
  ; AVX-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
  ; AVX-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
-; AVX-NEXT:    vdivpd %ymm0, %ymm1, %ymm0
+; AVX-NEXT:    vdivpd %xmm0, %xmm1, %xmm0
  ; AVX-NEXT:    retq
    %x = load double, double* %p
    %v = insertelement <4 x double> undef, double %x, i32 0
diff --git a/test/CodeGen/X86/vector-partial-undef.ll b/test/CodeGen/X86/vector-partial-undef.ll

index 1cd3415d082f74187521d5535802d7d24918c602..2b4ab11fea5b89e5233787b6342de0ed69acc436 100644 (file)
--- a/test/CodeGen/X86/vector-partial-undef.ll
+++ b/test/CodeGen/X86/vector-partial-undef.ll
@@ -13,9 +13,7 @@ define <4 x i64> @xor_insert_insert(<2 x i64> %x, <2 x i64> %y) {
  ;
  ; AVX-LABEL: xor_insert_insert:
  ; AVX:       # %bb.0:
-; AVX-NEXT:    # kill: def $xmm1 killed $xmm1 def $ymm1
-; AVX-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
-; AVX-NEXT:    vxorps %ymm1, %ymm0, %ymm0
+; AVX-NEXT:    vxorps %xmm1, %xmm0, %xmm0
  ; AVX-NEXT:    retq
    %xw = shufflevector <2 x i64> %x, <2 x i64> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
    %yw = shufflevector <2 x i64> %y, <2 x i64> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
@@ -32,9 +30,9 @@ define <4 x i64> @xor_insert_insert_high_half(<2 x i64> %x, <2 x i64> %y) {
  ;
  ; AVX-LABEL: xor_insert_insert_high_half:
  ; AVX:       # %bb.0:
-; AVX-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
-; AVX-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm1
-; AVX-NEXT:    vxorps %ymm1, %ymm0, %ymm0
+; AVX-NEXT:    vxorps %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
+; AVX-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
  ; AVX-NEXT:    retq
    %xw = shufflevector <2 x i64> %x, <2 x i64> undef, <4 x i32> <i32 undef, i32 undef, i32 0, i32 1>
    %yw = shufflevector <2 x i64> %y, <2 x i64> undef, <4 x i32> <i32 undef, i32 undef, i32 0, i32 1>
author	Sanjay Patel <spatel@rotateright.com>
	Tue, 22 Jan 2019 14:24:13 +0000 (14:24 +0000)
committer	Sanjay Patel <spatel@rotateright.com>
	Tue, 22 Jan 2019 14:24:13 +0000 (14:24 +0000)
lib/CodeGen/SelectionDAG/DAGCombiner.cpp		patch | blob | history
test/CodeGen/X86/avx512-hadd-hsub.ll		patch | blob | history
test/CodeGen/X86/scalarize-fp.ll		patch | blob | history
test/CodeGen/X86/vector-partial-undef.ll		patch | blob | history