IsUnary = true;
break;
case X86ISD::VBROADCAST: {
- // We only decode broadcasts of same-sized vectors at the moment.
- if (N->getOperand(0).getValueType() == VT) {
+ SDValue N0 = N->getOperand(0);
+ // See if we're broadcasting from index 0 of an EXTRACT_SUBVECTOR. If so,
+ // add the pre-extracted value to the Ops vector.
+ if (N0.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
+ N0.getOperand(0).getValueType() == VT &&
+ N0.getConstantOperandVal(1) == 0)
+ Ops.push_back(N0.getOperand(0));
+
+ // We only decode broadcasts of same-sized vectors, unless the broadcast
+ // came from an extract of the original-width vector. If we found one, we
+ // pushed it onto the Ops vector above.
+ if (N0.getValueType() == VT || !Ops.empty()) {
DecodeVectorBroadcast(VT, Mask);
IsUnary = true;
break;
}
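
As a reading aid, the logic of the hunk above can be sketched standalone. Everything below is illustrative only: Node, Op, and decodeBroadcastMask are invented toy names modeling the SelectionDAG logic, not LLVM's real SDNode API.

// Illustrative sketch only; these are toy types, not LLVM's API.
#include <cassert>
#include <vector>

enum class Op { Broadcast, ExtractSubvector, Other };

struct Node {
  Op Opcode;
  unsigned NumElts;            // lanes in this node's result vector
  std::vector<Node *> Operands;
  unsigned ExtractIndex = 0;   // meaningful only for ExtractSubvector
};

// A broadcast reads element 0 of its source into every result lane, so the
// decoded mask is all zeros. When the source is an extract from index 0 of
// a wider vector matching the result type, record the pre-extracted vector
// as the shuffle operand, as the hunk above does by pushing it onto Ops.
bool decodeBroadcastMask(const Node &N, unsigned ResultNumElts,
                         std::vector<int> &Mask,
                         std::vector<Node *> &Ops) {
  assert(N.Opcode == Op::Broadcast);
  Node *Src = N.Operands[0];
  if (Src->Opcode == Op::ExtractSubvector && Src->ExtractIndex == 0 &&
      Src->Operands[0]->NumElts == ResultNumElts)
    Ops.push_back(Src->Operands[0]);
  // Decodable only for a same-sized source or a successful peek-through.
  if (Src->NumElts != ResultNumElts && Ops.empty())
    return false;
  Mask.assign(ResultNumElts, 0); // every lane reads element 0
  return true;
}

The peek-through is sound because element 0 of an index-0 extract is element 0 of the wider vector, so the all-zeros broadcast mask remains valid against the pre-extracted operand.
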
BroadcastVT = MVT::getVectorVT(MVT::f64, NumBroadcastElts);
}
+ // We only support broadcasting from 128-bit vectors to minimize the
+ // number of patterns we need to deal with in isel. So extract down to
+ // 128 bits.
+ if (SrcVT.getSizeInBits() > 128)
+ V = extract128BitVector(V, 0, DAG, DL);
+
return DAG.getBitcast(VT, DAG.getNode(Opcode, DL, BroadcastVT, V));
}
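
The same reasoning explains the 128-bit restriction in the second hunk: a broadcast reads only element 0, and element 0 always lives in the low 128 bits of any wider source, so narrowing the source first is value-preserving while shrinking the isel patterns to xmm sources. A minimal toy sketch follows, assuming invented names (VecInfo, extractLow128) that model extract128BitVector(V, 0, DAG, DL) from the patch.

// Illustrative sketch only; VecInfo and extractLow128 are invented names.
#include <cstdio>

struct VecInfo {
  unsigned NumElts;
  unsigned EltBits;
  unsigned bits() const { return NumElts * EltBits; }
};

// Keep only the low 128 bits. Element 0 is unchanged, so broadcasting
// element 0 afterwards yields the same value as from the wide source.
VecInfo extractLow128(VecInfo V) {
  return VecInfo{128 / V.EltBits, V.EltBits};
}

int main() {
  VecInfo Src{8, 64}; // e.g. a 512-bit <8 x i64> broadcast source
  if (Src.bits() > 128)
    Src = extractLow128(Src); // now <2 x i64>: one xmm register
  std::printf("broadcast source: <%u x i%u>\n", Src.NumElts, Src.EltBits);
  return 0;
}
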
define <16 x float> @test14(float* %base, i32 %ind, <16 x float*> %vec) {
; KNL_64-LABEL: test14:
; KNL_64: # BB#0:
-; KNL_64-NEXT: vpinsrq $1, %rdi, %xmm0, %xmm1
-; KNL_64-NEXT: vinserti32x4 $0, %xmm1, %zmm0, %zmm0
+; KNL_64-NEXT: vpinsrq $1, %rdi, %xmm0, %xmm0
; KNL_64-NEXT: vpbroadcastq %xmm0, %zmm0
; KNL_64-NEXT: vmovd %esi, %xmm1
; KNL_64-NEXT: vpbroadcastd %xmm1, %ymm1
;
; KNL_32-LABEL: test14:
; KNL_32: # BB#0:
-; KNL_32-NEXT: vpinsrd $1, {{[0-9]+}}(%esp), %xmm0, %xmm1
-; KNL_32-NEXT: vinserti32x4 $0, %xmm1, %zmm0, %zmm0
+; KNL_32-NEXT: vpinsrd $1, {{[0-9]+}}(%esp), %xmm0, %xmm0
; KNL_32-NEXT: vpbroadcastd %xmm0, %zmm0
; KNL_32-NEXT: vpslld $2, {{[0-9]+}}(%esp){1to16}, %zmm1
; KNL_32-NEXT: vpaddd %zmm1, %zmm0, %zmm1
;
; SKX-LABEL: test14:
; SKX: # BB#0:
-; SKX-NEXT: vpinsrq $1, %rdi, %xmm0, %xmm1
-; SKX-NEXT: vinserti64x2 $0, %xmm1, %zmm0, %zmm0
+; SKX-NEXT: vpinsrq $1, %rdi, %xmm0, %xmm0
; SKX-NEXT: vpbroadcastq %xmm0, %zmm0
; SKX-NEXT: vpbroadcastd %esi, %ymm1
; SKX-NEXT: vpmovsxdq %ymm1, %zmm1
;
; SKX_32-LABEL: test14:
; SKX_32: # BB#0:
-; SKX_32-NEXT: vpinsrd $1, {{[0-9]+}}(%esp), %xmm0, %xmm1
-; SKX_32-NEXT: vinserti32x4 $0, %xmm1, %zmm0, %zmm0
+; SKX_32-NEXT: vpinsrd $1, {{[0-9]+}}(%esp), %xmm0, %xmm0
; SKX_32-NEXT: vpbroadcastd %xmm0, %zmm0
; SKX_32-NEXT: vpslld $2, {{[0-9]+}}(%esp){1to16}, %zmm1
; SKX_32-NEXT: vpaddd %zmm1, %zmm0, %zmm1
;
; AVX2-LABEL: load_splat_8f32_4f32_01010101:
; AVX2: # BB#0: # %entry
-; AVX2-NEXT: vmovaps (%rdi), %xmm0
-; AVX2-NEXT: vbroadcastsd %xmm0, %ymm0
+; AVX2-NEXT: vbroadcastsd (%rdi), %ymm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: load_splat_8f32_4f32_01010101:
; AVX512: # BB#0: # %entry
-; AVX512-NEXT: vmovaps (%rdi), %xmm0
-; AVX512-NEXT: vbroadcastsd %xmm0, %ymm0
+; AVX512-NEXT: vbroadcastsd (%rdi), %ymm0
; AVX512-NEXT: retq
entry:
%ld = load <4 x float>, <4 x float>* %ptr
;
; AVX2-LABEL: load_splat_8i32_4i32_01010101:
; AVX2: # BB#0: # %entry
-; AVX2-NEXT: vmovaps (%rdi), %xmm0
-; AVX2-NEXT: vbroadcastsd %xmm0, %ymm0
+; AVX2-NEXT: vbroadcastsd (%rdi), %ymm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: load_splat_8i32_4i32_01010101:
; AVX512: # BB#0: # %entry
-; AVX512-NEXT: vmovaps (%rdi), %xmm0
-; AVX512-NEXT: vbroadcastsd %xmm0, %ymm0
+; AVX512-NEXT: vbroadcastsd (%rdi), %ymm0
; AVX512-NEXT: retq
entry:
%ld = load <4 x i32>, <4 x i32>* %ptr
;
; AVX2-LABEL: load_splat_16i16_8i16_0101010101010101:
; AVX2: # BB#0: # %entry
-; AVX2-NEXT: vmovaps (%rdi), %xmm0
-; AVX2-NEXT: vbroadcastss %xmm0, %ymm0
+; AVX2-NEXT: vbroadcastss (%rdi), %ymm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: load_splat_16i16_8i16_0101010101010101:
; AVX512: # BB#0: # %entry
-; AVX512-NEXT: vmovaps (%rdi), %xmm0
-; AVX512-NEXT: vbroadcastss %xmm0, %ymm0
+; AVX512-NEXT: vbroadcastss (%rdi), %ymm0
; AVX512-NEXT: retq
entry:
%ld = load <8 x i16>, <8 x i16>* %ptr
;
; AVX2-LABEL: load_splat_16i16_8i16_0123012301230123:
; AVX2: # BB#0: # %entry
-; AVX2-NEXT: vmovaps (%rdi), %xmm0
-; AVX2-NEXT: vbroadcastsd %xmm0, %ymm0
+; AVX2-NEXT: vbroadcastsd (%rdi), %ymm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: load_splat_16i16_8i16_0123012301230123:
; AVX512: # BB#0: # %entry
-; AVX512-NEXT: vmovaps (%rdi), %xmm0
-; AVX512-NEXT: vbroadcastsd %xmm0, %ymm0
+; AVX512-NEXT: vbroadcastsd (%rdi), %ymm0
; AVX512-NEXT: retq
entry:
%ld = load <8 x i16>, <8 x i16>* %ptr
;
; AVX2-LABEL: load_splat_32i8_16i8_01010101010101010101010101010101:
; AVX2: # BB#0: # %entry
-; AVX2-NEXT: vmovdqa (%rdi), %xmm0
-; AVX2-NEXT: vpbroadcastw %xmm0, %ymm0
+; AVX2-NEXT: vpbroadcastw (%rdi), %ymm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: load_splat_32i8_16i8_01010101010101010101010101010101:
; AVX512: # BB#0: # %entry
-; AVX512-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512-NEXT: vpbroadcastw %xmm0, %ymm0
+; AVX512-NEXT: vpbroadcastw (%rdi), %ymm0
; AVX512-NEXT: retq
entry:
%ld = load <16 x i8>, <16 x i8>* %ptr
;
; AVX2-LABEL: load_splat_32i8_16i8_01230123012301230123012301230123:
; AVX2: # BB#0: # %entry
-; AVX2-NEXT: vmovaps (%rdi), %xmm0
-; AVX2-NEXT: vbroadcastss %xmm0, %ymm0
+; AVX2-NEXT: vbroadcastss (%rdi), %ymm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: load_splat_32i8_16i8_01230123012301230123012301230123:
; AVX512: # BB#0: # %entry
-; AVX512-NEXT: vmovaps (%rdi), %xmm0
-; AVX512-NEXT: vbroadcastss %xmm0, %ymm0
+; AVX512-NEXT: vbroadcastss (%rdi), %ymm0
; AVX512-NEXT: retq
entry:
%ld = load <16 x i8>, <16 x i8>* %ptr
;
; AVX2-LABEL: load_splat_32i8_16i8_01234567012345670123456701234567:
; AVX2: # BB#0: # %entry
-; AVX2-NEXT: vmovaps (%rdi), %xmm0
-; AVX2-NEXT: vbroadcastsd %xmm0, %ymm0
+; AVX2-NEXT: vbroadcastsd (%rdi), %ymm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: load_splat_32i8_16i8_01234567012345670123456701234567:
; AVX512: # BB#0: # %entry
-; AVX512-NEXT: vmovaps (%rdi), %xmm0
-; AVX512-NEXT: vbroadcastsd %xmm0, %ymm0
+; AVX512-NEXT: vbroadcastsd (%rdi), %ymm0
; AVX512-NEXT: retq
entry:
%ld = load <16 x i8>, <16 x i8>* %ptr