SDValue LHS = N->getOperand(0);
SDValue RHS = N->getOperand(1);
SDValue Ops[] = {LHS, RHS};
+ EVT VT = N->getValueType(0);
// See if we can constant fold the vector operation.
if (SDValue Fold = DAG.FoldConstantVectorArithmetic(
ShuffleVectorSDNode *SVN1 = cast<ShuffleVectorSDNode>(RHS);
if (SVN0->getMask().equals(SVN1->getMask())) {
- EVT VT = N->getValueType(0);
SDValue UndefVector = LHS.getOperand(1);
SDValue NewBinOp = DAG.getNode(N->getOpcode(), SDLoc(N), VT,
LHS.getOperand(0), RHS.getOperand(0),
}
}
+ // The following pattern is likely to emerge with vector reduction ops. Moving
+ // the binary operation ahead of insertion may allow using a narrower vector
+ // instruction that has better performance than the wide version of the op:
+ // VBinOp (ins undef, X, Z), (ins undef, Y, Z) --> ins VecC, (VBinOp X, Y), Z
+ if (LHS.getOpcode() == ISD::INSERT_SUBVECTOR && LHS.getOperand(0).isUndef() &&
+ RHS.getOpcode() == ISD::INSERT_SUBVECTOR && RHS.getOperand(0).isUndef() &&
+ LHS.getOperand(2) == RHS.getOperand(2) &&
+ (LHS.hasOneUse() || RHS.hasOneUse())) {
+ SDValue X = LHS.getOperand(1);
+ SDValue Y = RHS.getOperand(1);
+ SDValue Z = LHS.getOperand(2);
+ EVT NarrowVT = X.getValueType();
+ if (NarrowVT == Y.getValueType() &&
+ TLI.isOperationLegalOrCustomOrPromote(N->getOpcode(), NarrowVT)) {
+ // (binop undef, undef) may not return undef, so compute that result.
+ SDLoc DL(N);
+ SDValue VecC = DAG.getNode(N->getOpcode(), DL, VT, DAG.getUNDEF(VT),
+ DAG.getUNDEF(VT));
+ SDValue NarrowBO = DAG.getNode(N->getOpcode(), DL, NarrowVT, X, Y);
+ return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, VecC, NarrowBO, Z);
+ }
+ }
+
return SDValue();
}
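For context, a self-contained IR sketch of the shape this combine narrows (it is essentially the xor_insert_insert test quoted further down, completed with an illustrative binop and return; the function name is not from the patch): both operands of the wide binop are inserts of narrow vectors into undef at the same position, so the binop can be performed at the narrow width.

define <4 x i64> @narrow_binop_sketch(<2 x i64> %x, <2 x i64> %y) {
  ; Each 128-bit input is widened by inserting it into undef (low half).
  %xw = shufflevector <2 x i64> %x, <2 x i64> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
  %yw = shufflevector <2 x i64> %y, <2 x i64> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
  ; Wide binop of the widened values; with the combine, the xor is performed
  ; at 128 bits (see the xor_insert_insert AVX checks below).
  %r = xor <4 x i64> %xw, %yw
  ret <4 x i64> %r
}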
; KNL: # %bb.0:
; KNL-NEXT: vshufps {{.*#+}} ymm2 = ymm0[0,2],ymm1[0,2],ymm0[4,6],ymm1[4,6]
; KNL-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,3],ymm1[1,3],ymm0[5,7],ymm1[5,7]
-; KNL-NEXT: vpaddd %zmm0, %zmm2, %zmm0
+; KNL-NEXT: vpaddd %ymm0, %ymm2, %ymm0
; KNL-NEXT: retq
;
; SKX-LABEL: hadd_16_3:
; SKX: # %bb.0:
; SKX-NEXT: vshufps {{.*#+}} ymm2 = ymm0[0,2],ymm1[0,2],ymm0[4,6],ymm1[4,6]
; SKX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,3],ymm1[1,3],ymm0[5,7],ymm1[5,7]
-; SKX-NEXT: vpaddd %zmm0, %zmm2, %zmm0
+; SKX-NEXT: vpaddd %ymm0, %ymm2, %ymm0
; SKX-NEXT: retq
%x226 = shufflevector <16 x i32> %x225, <16 x i32> %x227, <16 x i32> <i32 0, i32 2, i32 16, i32 18, i32 4, i32 6, i32 20, i32 22, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
; KNL: # %bb.0:
; KNL-NEXT: vshufps {{.*#+}} ymm2 = ymm0[0,2],ymm1[0,2],ymm0[4,6],ymm1[4,6]
; KNL-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,3],ymm1[1,3],ymm0[5,7],ymm1[5,7]
-; KNL-NEXT: vaddps %zmm0, %zmm2, %zmm0
+; KNL-NEXT: vaddps %ymm0, %ymm2, %ymm0
; KNL-NEXT: retq
;
; SKX-LABEL: fhadd_16_3:
; SKX: # %bb.0:
; SKX-NEXT: vshufps {{.*#+}} ymm2 = ymm0[0,2],ymm1[0,2],ymm0[4,6],ymm1[4,6]
; SKX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,3],ymm1[1,3],ymm0[5,7],ymm1[5,7]
-; SKX-NEXT: vaddps %zmm0, %zmm2, %zmm0
+; SKX-NEXT: vaddps %ymm0, %ymm2, %ymm0
; SKX-NEXT: retq
%x226 = shufflevector <16 x float> %x225, <16 x float> %x227, <16 x i32> <i32 0, i32 2, i32 16, i32 18, i32 4, i32 6, i32 20, i32 22, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
; KNL: # %bb.0:
; KNL-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
; KNL-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
-; KNL-NEXT: vaddpd %zmm0, %zmm2, %zmm0
+; KNL-NEXT: vaddpd %ymm0, %ymm2, %ymm0
; KNL-NEXT: retq
;
; SKX-LABEL: fhadd_16_4:
; SKX: # %bb.0:
; SKX-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
; SKX-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
-; SKX-NEXT: vaddpd %zmm0, %zmm2, %zmm0
+; SKX-NEXT: vaddpd %ymm0, %ymm2, %ymm0
; SKX-NEXT: retq
%x226 = shufflevector <8 x double> %x225, <8 x double> %x227, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 undef, i32 undef, i32 undef, i32 undef>
%x228 = shufflevector <8 x double> %x225, <8 x double> %x227, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 undef ,i32 undef, i32 undef, i32 undef>
;
; AVX-LABEL: fadd_op1_constant_v4f64:
; AVX: # %bb.0:
-; AVX-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
; AVX-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
-; AVX-NEXT: vaddpd %ymm1, %ymm0, %ymm0
+; AVX-NEXT: vaddpd %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
%v = insertelement <4 x double> undef, double %x, i32 0
%b = fadd <4 x double> %v, <double 42.0, double undef, double undef, double undef>
; AVX: # %bb.0:
; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
; AVX-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
-; AVX-NEXT: vaddpd %ymm1, %ymm0, %ymm0
+; AVX-NEXT: vaddpd %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
%x = load double, double* %p
%v = insertelement <4 x double> undef, double %x, i32 0
;
; AVX-LABEL: fsub_op0_constant_v4f64:
; AVX: # %bb.0:
-; AVX-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
; AVX-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
-; AVX-NEXT: vsubpd %ymm0, %ymm1, %ymm0
+; AVX-NEXT: vsubpd %xmm0, %xmm1, %xmm0
; AVX-NEXT: retq
%v = insertelement <4 x double> undef, double %x, i32 0
%b = fsub <4 x double> <double 42.0, double undef, double undef, double undef>, %v
; AVX: # %bb.0:
; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
; AVX-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
-; AVX-NEXT: vsubpd %ymm0, %ymm1, %ymm0
+; AVX-NEXT: vsubpd %xmm0, %xmm1, %xmm0
; AVX-NEXT: retq
%x = load double, double* %p
%v = insertelement <4 x double> undef, double %x, i32 0
;
; AVX-LABEL: fmul_op1_constant_v4f64:
; AVX: # %bb.0:
-; AVX-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
; AVX-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
-; AVX-NEXT: vmulpd %ymm1, %ymm0, %ymm0
+; AVX-NEXT: vmulpd %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
%v = insertelement <4 x double> undef, double %x, i32 0
%b = fmul <4 x double> %v, <double 42.0, double undef, double undef, double undef>
; AVX: # %bb.0:
; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
; AVX-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
-; AVX-NEXT: vmulpd %ymm1, %ymm0, %ymm0
+; AVX-NEXT: vmulpd %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
%x = load double, double* %p
%v = insertelement <4 x double> undef, double %x, i32 0
;
; AVX-LABEL: fdiv_op1_constant_v4f64:
; AVX: # %bb.0:
-; AVX-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
; AVX-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
-; AVX-NEXT: vdivpd %ymm1, %ymm0, %ymm0
+; AVX-NEXT: vdivpd %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
%v = insertelement <4 x double> undef, double %x, i32 0
%b = fdiv <4 x double> %v, <double 42.0, double undef, double undef, double undef>
; AVX: # %bb.0:
; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
; AVX-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
-; AVX-NEXT: vdivpd %ymm1, %ymm0, %ymm0
+; AVX-NEXT: vdivpd %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
%x = load double, double* %p
%v = insertelement <4 x double> undef, double %x, i32 0
;
; AVX-LABEL: fdiv_op0_constant_v4f64:
; AVX: # %bb.0:
-; AVX-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
; AVX-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
-; AVX-NEXT: vdivpd %ymm0, %ymm1, %ymm0
+; AVX-NEXT: vdivpd %xmm0, %xmm1, %xmm0
; AVX-NEXT: retq
%v = insertelement <4 x double> undef, double %x, i32 0
%b = fdiv <4 x double> <double 42.0, double undef, double undef, double undef>, %v
; AVX: # %bb.0:
; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
; AVX-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
-; AVX-NEXT: vdivpd %ymm0, %ymm1, %ymm0
+; AVX-NEXT: vdivpd %xmm0, %xmm1, %xmm0
; AVX-NEXT: retq
%x = load double, double* %p
%v = insertelement <4 x double> undef, double %x, i32 0
;
; AVX-LABEL: xor_insert_insert:
; AVX: # %bb.0:
-; AVX-NEXT: # kill: def $xmm1 killed $xmm1 def $ymm1
-; AVX-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
-; AVX-NEXT: vxorps %ymm1, %ymm0, %ymm0
+; AVX-NEXT: vxorps %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
%xw = shufflevector <2 x i64> %x, <2 x i64> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
%yw = shufflevector <2 x i64> %y, <2 x i64> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
;
; AVX-LABEL: xor_insert_insert_high_half:
; AVX: # %bb.0:
-; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
-; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
-; AVX-NEXT: vxorps %ymm1, %ymm0, %ymm0
+; AVX-NEXT: vxorps %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX-NEXT: retq
%xw = shufflevector <2 x i64> %x, <2 x i64> undef, <4 x i32> <i32 undef, i32 undef, i32 0, i32 1>
%yw = shufflevector <2 x i64> %y, <2 x i64> undef, <4 x i32> <i32 undef, i32 undef, i32 0, i32 1>