// With MOVDDUP (v2f64) we can broadcast from a register or a load, otherwise
// we can only broadcast from a register with AVX2.
unsigned NumElts = Mask.size();
- unsigned Opcode = VT == MVT::v2f64 ? X86ISD::MOVDDUP : X86ISD::VBROADCAST;
+ unsigned Opcode = (VT == MVT::v2f64 && !Subtarget.hasAVX2())
+ ? X86ISD::MOVDDUP
+ : X86ISD::VBROADCAST;
bool BroadcastFromReg = (Opcode == X86ISD::MOVDDUP) || Subtarget.hasAVX2();
// Check that the mask is a broadcast.
// 32-bit targets need to load i64 as a f64 and then bitcast the result.
if (!Subtarget.is64Bit() && VT.getScalarType() == MVT::i64) {
BroadcastVT = MVT::getVectorVT(MVT::f64, VT.getVectorNumElements());
- Opcode = (BroadcastVT.is128BitVector() ? X86ISD::MOVDDUP : Opcode);
+ Opcode = (BroadcastVT.is128BitVector() && !Subtarget.hasAVX2())
+ ? X86ISD::MOVDDUP
+ : Opcode;
}
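
For reference, here is a hand-written IR sketch (not part of the patch; the exact
instruction chosen is subtarget-dependent) of the two splat shapes this selection
code handles: a v2f64 splat of element 0, and a 64-bit-element splat that 32-bit
targets must route through f64.

; With SSE3/AVX1 the code above picks X86ISD::MOVDDUP; with AVX2 or AVX512VL
; it now prefers X86ISD::VBROADCAST, which can broadcast from a register or
; fold a load.
define <2 x double> @splat_v2f64(<2 x double> %v) {
  %s = shufflevector <2 x double> %v, <2 x double> undef, <2 x i32> zeroinitializer
  ret <2 x double> %s
}

; A 32-bit target has no 64-bit GPR, so an i64 splat from memory is broadcast
; as f64 and the result bitcast back, per the block above.
define <2 x i64> @splat_v2i64_from_mem(i64* %p) {
  %x = load i64, i64* %p, align 8
  %i = insertelement <2 x i64> undef, i64 %x, i32 0
  %s = shufflevector <2 x i64> %i, <2 x i64> undef, <2 x i32> zeroinitializer
  ret <2 x i64> %s
}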
// If we are broadcasting a load that is only used by the shuffle, prefer
// MOVDDUP: it is no slower than UNPCKLPD but has the option to fold the
// input operand into even an unaligned memory load.
if (MaskVT.is128BitVector() && Subtarget.hasSSE3() && AllowFloatDomain) {
- if (isTargetShuffleEquivalent(Mask, {0, 0})) {
+ if (!Subtarget.hasAVX2() && isTargetShuffleEquivalent(Mask, {0, 0})) {
Shuffle = X86ISD::MOVDDUP;
SrcVT = DstVT = MVT::v2f64;
return true;
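
To illustrate the comparison in the comment above, a minimal hand-written sketch
(function name hypothetical): splatting a loaded scalar, where MOVDDUP can fold
the f64 load into its memory operand even when unaligned, which is why it is
matched here before a generic unpack.

define <2 x double> @splat_unaligned_load(double* %p) {
  %d = load double, double* %p, align 1
  %i = insertelement <2 x double> undef, double %d, i32 0
  %s = shufflevector <2 x double> %i, <2 x double> undef, <2 x i32> zeroinitializer
  ret <2 x double> %s
}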
case X86ISD::MOVDDUP:
case X86ISD::MOVSS:
case X86ISD::MOVSD:
+ case X86ISD::VBROADCAST:
case X86ISD::VPPERM:
case X86ISD::VPERMI:
case X86ISD::VPERMV:
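
This appears to be the target-shuffle opcode list; adding VBROADCAST lets shuffle
combining look through broadcast nodes. A hand-written sketch of the kind of
input that benefits, assuming the combine fires: any shuffle of a splat is the
splat itself, so the second shuffle below can fold away.

define <2 x double> @shuffle_of_splat(double %d) {
  %i = insertelement <2 x double> undef, double %d, i32 0
  %b = shufflevector <2 x double> %i, <2 x double> undef, <2 x i32> zeroinitializer
  %r = shufflevector <2 x double> %b, <2 x double> undef, <2 x i32> <i32 1, i32 0>
  ret <2 x double> %r
}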
//===----------------------------------------------------------------------===//
multiclass avx512_movddup_128<bits<8> opc, string OpcodeStr, SDNode OpNode,
- X86VectorVTInfo _> {
+ X86VectorVTInfo _> {
let ExeDomain = _.ExeDomain in {
defm rr : AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst),
                            (ins _.RC:$src), OpcodeStr, "$src", "$src",
                            (_.VT (OpNode (_.VT _.RC:$src)))>, EVEX;
multiclass avx512_movddup_common<bits<8> opc, string OpcodeStr, SDNode OpNode,
AVX512VLVectorVTInfo VTInfo> {
- defm Z : avx512_unary_rm<opc, OpcodeStr, OpNode, VTInfo.info512>, EVEX_V512;
+ defm Z : avx512_unary_rm<opc, OpcodeStr, X86Movddup, VTInfo.info512>, EVEX_V512;
let Predicates = [HasAVX512, HasVLX] in {
- defm Z256 : avx512_unary_rm<opc, OpcodeStr, OpNode, VTInfo.info256>,
+ defm Z256 : avx512_unary_rm<opc, OpcodeStr, X86Movddup, VTInfo.info256>,
EVEX_V256;
- defm Z128 : avx512_movddup_128<opc, OpcodeStr, OpNode, VTInfo.info128>,
- EVEX_V128;
+ defm Z128 : avx512_movddup_128<opc, OpcodeStr, X86VBroadcast, VTInfo.info128>,
+ EVEX_V128;
}
}
defm VMOVDDUP : avx512_movddup<0x12, "vmovddup", X86Movddup>;
let Predicates = [HasVLX] in {
-def : Pat<(X86Movddup (loadv2f64 addr:$src)),
- (VMOVDDUPZ128rm addr:$src)>;
def : Pat<(v2f64 (X86VBroadcast (loadf64 addr:$src))),
(VMOVDDUPZ128rm addr:$src)>;
def : Pat<(v2f64 (X86VBroadcast f64:$src)),
(VMOVDDUPZ128rr (COPY_TO_REGCLASS FR64X:$src, VR128X))>;
-
-def : Pat<(vselect (v2i1 VK2WM:$mask), (X86Movddup (loadv2f64 addr:$src)),
- (v2f64 VR128X:$src0)),
- (VMOVDDUPZ128rmk VR128X:$src0, VK2WM:$mask, addr:$src)>;
-def : Pat<(vselect (v2i1 VK2WM:$mask), (X86Movddup (loadv2f64 addr:$src)),
- (bitconvert (v4i32 immAllZerosV))),
- (VMOVDDUPZ128rmkz VK2WM:$mask, addr:$src)>;
+def : Pat<(v2f64 (X86VBroadcast (loadv2f64 addr:$src))),
+ (VMOVDDUPZ128rm addr:$src)>;
def : Pat<(vselect (v2i1 VK2WM:$mask), (v2f64 (X86VBroadcast f64:$src)),
          (v2f64 VR128X:$src0)),
          (VMOVDDUPZ128rrk VR128X:$src0, VK2WM:$mask,
                           (COPY_TO_REGCLASS FR64X:$src, VR128X))>;
def : Pat<(vselect (v2i1 VK2WM:$mask), (v2f64 (X86VBroadcast (loadf64 addr:$src))),
(bitconvert (v4i32 immAllZerosV))),
(VMOVDDUPZ128rmkz VK2WM:$mask, addr:$src)>;
+
+def : Pat<(vselect (v2i1 VK2WM:$mask), (v2f64 (X86VBroadcast (loadv2f64 addr:$src))),
+ (v2f64 VR128X:$src0)),
+ (VMOVDDUPZ128rmk VR128X:$src0, VK2WM:$mask, addr:$src)>;
+def : Pat<(vselect (v2i1 VK2WM:$mask), (v2f64 (X86VBroadcast (loadv2f64 addr:$src))),
+ (bitconvert (v4i32 immAllZerosV))),
+ (VMOVDDUPZ128rmkz VK2WM:$mask, addr:$src)>;
}
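
With the patterns above retargeted to X86VBroadcast, a splat of a whole-vector
load should now select the folded VMOVDDUPZ128rm form (and, as the tests below
show, an EVEX {1to2} embedded broadcast when the splat feeds an arithmetic op).
A minimal hand-written sketch:

define <2 x double> @splat_vector_load(<2 x double>* %p) {
  %v = load <2 x double>, <2 x double>* %p, align 16
  %s = shufflevector <2 x double> %v, <2 x double> undef, <2 x i32> zeroinitializer
  ret <2 x double> %s
}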
//===----------------------------------------------------------------------===//
-; NOTE: Assertions have been autogenerated by update_llc_test_checks.py
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512f -mattr=+avx512vl | FileCheck %s
declare void @func_f32(float)
%r = select <4 x i1> %mask, <4 x double> %c, <4 x double> zeroinitializer
ret <4 x double> %r
}
+
+define <2 x double> @test_v2f64_broadcast_fold(<2 x double> *%a0, <2 x double> %a1) {
+; CHECK-LABEL: test_v2f64_broadcast_fold:
+; CHECK: # BB#0:
+; CHECK-NEXT: vaddpd (%rdi){1to2}, %xmm0, %xmm0
+; CHECK-NEXT: retq
+ %1 = load <2 x double>, <2 x double> *%a0, align 16
+ %2 = shufflevector <2 x double> %1, <2 x double> undef, <2 x i32> zeroinitializer
+ %3 = fadd <2 x double> %2, %a1
+ ret <2 x double> %3
+}
+
+define <2 x double> @test_v2f64_broadcast_fold_mask(<2 x double> *%a0, <2 x double> %a1, <2 x i64> %mask1, <2 x double> %a2) {
+; CHECK-LABEL: test_v2f64_broadcast_fold_mask:
+; CHECK: # BB#0:
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vpcmpneqq %xmm3, %xmm1, %k1
+; CHECK-NEXT: vaddpd (%rdi){1to2}, %xmm0, %xmm2 {%k1}
+; CHECK-NEXT: vmovapd %xmm2, %xmm0
+; CHECK-NEXT: retq
+ %mask = icmp ne <2 x i64> %mask1, zeroinitializer
+ %1 = load <2 x double>, <2 x double> *%a0, align 16
+ %2 = shufflevector <2 x double> %1, <2 x double> undef, <2 x i32> zeroinitializer
+ %3 = fadd <2 x double> %2, %a1
+ %4 = select <2 x i1> %mask, <2 x double> %3, <2 x double> %a2
+ ret <2 x double> %4
+}