From eb7fbe2299795670534d4fd010136d7b3288c8d6 Mon Sep 17 00:00:00 2001
From: Craig Topper
Date: Fri, 13 Oct 2017 21:56:48 +0000
Subject: [PATCH] [X86] Use X86ISD::VBROADCAST in place of v2f64 X86ISD::MOVDDUP when AVX2 is available

This is particularly important for AVX512VL, where we are better able to
recognize the VBROADCAST loads and fold them with other operations.

For AVX512VL we now use X86ISD::VBROADCAST for all of the patterns and
remove the 128-bit X86ISD::MOVDDUP patterns. We may be able to use this for
AVX1 as well, which would allow us to remove more isel patterns.

I also had to add X86ISD::VBROADCAST to the set of nodes for which we call
combineShuffle, so that it is treated similarly to X86ISD::MOVDDUP.

Differential Revision: https://reviews.llvm.org/D38836

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@315768 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/X86/X86ISelLowering.cpp      | 11 +++++++---
 lib/Target/X86/X86InstrAVX512.td        | 28 ++++++++++++------------
 lib/Target/X86/X86InstrSSE.td           |  5 +++++
 test/CodeGen/X86/avx512vl-vbroadcast.ll | 29 ++++++++++++++++++++++++-
 4 files changed, 55 insertions(+), 18 deletions(-)

diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index c45f9951940..6644baf8fa5 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -10000,7 +10000,9 @@ static SDValue lowerVectorShuffleAsBroadcast(const SDLoc &DL, MVT VT,
   // With MOVDDUP (v2f64) we can broadcast from a register or a load, otherwise
   // we can only broadcast from a register with AVX2.
   unsigned NumElts = Mask.size();
-  unsigned Opcode = VT == MVT::v2f64 ? X86ISD::MOVDDUP : X86ISD::VBROADCAST;
+  unsigned Opcode = (VT == MVT::v2f64 && !Subtarget.hasAVX2())
+                        ? X86ISD::MOVDDUP
+                        : X86ISD::VBROADCAST;
   bool BroadcastFromReg = (Opcode == X86ISD::MOVDDUP) || Subtarget.hasAVX2();
 
   // Check that the mask is a broadcast.
@@ -10086,7 +10088,9 @@ static SDValue lowerVectorShuffleAsBroadcast(const SDLoc &DL, MVT VT,
   // 32-bit targets need to load i64 as a f64 and then bitcast the result.
   if (!Subtarget.is64Bit() && VT.getScalarType() == MVT::i64) {
     BroadcastVT = MVT::getVectorVT(MVT::f64, VT.getVectorNumElements());
-    Opcode = (BroadcastVT.is128BitVector() ? X86ISD::MOVDDUP : Opcode);
+    Opcode = (BroadcastVT.is128BitVector() && !Subtarget.hasAVX2())
+                 ? X86ISD::MOVDDUP
+                 : Opcode;
   }
 
   // If we are broadcasting a load that is only used by the shuffle
@@ -27317,7 +27321,7 @@ static bool matchUnaryVectorShuffle(MVT MaskVT, ArrayRef<int> Mask,
   // instructions are no slower than UNPCKLPD but has the option to
   // fold the input operand into even an unaligned memory load.
   if (MaskVT.is128BitVector() && Subtarget.hasSSE3() && AllowFloatDomain) {
-    if (isTargetShuffleEquivalent(Mask, {0, 0})) {
+    if (!Subtarget.hasAVX2() && isTargetShuffleEquivalent(Mask, {0, 0})) {
       Shuffle = X86ISD::MOVDDUP;
       SrcVT = DstVT = MVT::v2f64;
       return true;
@@ -36412,6 +36416,7 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
   case X86ISD::MOVDDUP:
   case X86ISD::MOVSS:
   case X86ISD::MOVSD:
+  case X86ISD::VBROADCAST:
   case X86ISD::VPPERM:
   case X86ISD::VPERMI:
   case X86ISD::VPERMV:
diff --git a/lib/Target/X86/X86InstrAVX512.td b/lib/Target/X86/X86InstrAVX512.td
index bfb91702c3a..b3eb34bf44c 100644
--- a/lib/Target/X86/X86InstrAVX512.td
+++ b/lib/Target/X86/X86InstrAVX512.td
@@ -9100,7 +9100,7 @@ defm VMOVSLDUP : avx512_replicate<0x12, "vmovsldup", X86Movsldup>;
 //===----------------------------------------------------------------------===//
 
 multiclass avx512_movddup_128<bits<8> opc, string OpcodeStr, SDNode OpNode,
-                              X86VectorVTInfo _> {
+                              X86VectorVTInfo _> {
   let ExeDomain = _.ExeDomain in {
   defm rr : AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst),
@@ ... @@ multiclass avx512_movddup_128<bits<8> opc, string OpcodeStr, SDNode OpNode,
 multiclass avx512_movddup_common<bits<8> opc, string OpcodeStr, SDNode OpNode,
                                  AVX512VLVectorVTInfo VTInfo> {
-  defm Z : avx512_unary_rm<...>, EVEX_V512;
+  defm Z : avx512_unary_rm<...>, EVEX_V512;
 
   let Predicates = [HasAVX512, HasVLX] in {
-    defm Z256 : avx512_unary_rm<...>,
+    defm Z256 : avx512_unary_rm<...>,
                                 EVEX_V256;
-    defm Z128 : avx512_movddup_128<...>,
-                                   EVEX_V128;
+    defm Z128 : avx512_movddup_128<...>,
+                                   EVEX_V128;
   }
 }
@@ -9134,19 +9134,12 @@ multiclass avx512_movddup<bits<8> opc, string OpcodeStr, SDNode OpNode>{
 defm VMOVDDUP : avx512_movddup<0x12, "vmovddup", X86Movddup>;
 
 let Predicates = [HasVLX] in {
-def : Pat<(X86Movddup (loadv2f64 addr:$src)),
-          (VMOVDDUPZ128rm addr:$src)>;
 def : Pat<(v2f64 (X86VBroadcast (loadf64 addr:$src))),
           (VMOVDDUPZ128rm addr:$src)>;
 def : Pat<(v2f64 (X86VBroadcast f64:$src)),
           (VMOVDDUPZ128rr (COPY_TO_REGCLASS FR64X:$src, VR128X))>;
-
-def : Pat<(vselect (v2i1 VK2WM:$mask), (X86Movddup (loadv2f64 addr:$src)),
-                   (v2f64 VR128X:$src0)),
-          (VMOVDDUPZ128rmk VR128X:$src0, VK2WM:$mask, addr:$src)>;
-def : Pat<(vselect (v2i1 VK2WM:$mask), (X86Movddup (loadv2f64 addr:$src)),
-                   (bitconvert (v4i32 immAllZerosV))),
-          (VMOVDDUPZ128rmkz VK2WM:$mask, addr:$src)>;
+def : Pat<(v2f64 (X86VBroadcast (loadv2f64 addr:$src))),
+          (VMOVDDUPZ128rm addr:$src)>;
 def : Pat<(vselect (v2i1 VK2WM:$mask), (v2f64 (X86VBroadcast f64:$src)),
                    (v2f64 VR128X:$src0)),
           (VMOVDDUPZ128rmk VR128X:$src0, VK2WM:$mask, addr:$src)>;
@@ -9162,6 +9155,13 @@ def : Pat<(vselect (v2i1 VK2WM:$mask), (v2f64 (X86VBroadcast (loadf64 addr:$src)
 def : Pat<(vselect (v2i1 VK2WM:$mask), (v2f64 (X86VBroadcast (loadf64 addr:$src))),
                    (bitconvert (v4i32 immAllZerosV))),
           (VMOVDDUPZ128rmkz VK2WM:$mask, addr:$src)>;
+
+def : Pat<(vselect (v2i1 VK2WM:$mask), (v2f64 (X86VBroadcast (loadv2f64 addr:$src))),
+                   (v2f64 VR128X:$src0)),
+          (VMOVDDUPZ128rmk VR128X:$src0, VK2WM:$mask, addr:$src)>;
+def : Pat<(vselect (v2i1 VK2WM:$mask), (v2f64 (X86VBroadcast (loadv2f64 addr:$src))),
+                   (bitconvert (v4i32 immAllZerosV))),
+          (VMOVDDUPZ128rmkz VK2WM:$mask, addr:$src)>;
 }
 
 //===----------------------------------------------------------------------===//
diff --git a/lib/Target/X86/X86InstrSSE.td b/lib/Target/X86/X86InstrSSE.td
index 3bbe31071e0..a7d348b28e5 100644
--- a/lib/Target/X86/X86InstrSSE.td
+++ b/lib/Target/X86/X86InstrSSE.td
@@ -7969,6 +7969,11 @@ let Predicates = [HasAVX, NoVLX] in {
             (VMOVDDUPrr (COPY_TO_REGCLASS FR64:$src, VR128))>;
   def : Pat<(v2f64 (X86VBroadcast (loadf64 addr:$src))),
             (VMOVDDUPrm addr:$src)>;
+
+  def : Pat<(v2f64 (X86VBroadcast v2f64:$src)),
+            (VMOVDDUPrr VR128:$src)>;
+  def : Pat<(v2f64 (X86VBroadcast (loadv2f64 addr:$src))),
+            (VMOVDDUPrm addr:$src)>;
 }
 
 let Predicates = [HasAVX1Only] in {
diff --git a/test/CodeGen/X86/avx512vl-vbroadcast.ll b/test/CodeGen/X86/avx512vl-vbroadcast.ll
index f73825b509c..9fc957297e2 100644
--- a/test/CodeGen/X86/avx512vl-vbroadcast.ll
+++ b/test/CodeGen/X86/avx512vl-vbroadcast.ll
@@ -1,4 +1,4 @@
-; NOTE: Assertions have been autogenerated by update_llc_test_checks.py
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512f -mattr=+avx512vl| FileCheck %s
 
 declare void @func_f32(float)
@@ -170,3 +170,30 @@ define <4 x double> @_ss4xdouble_maskz(double %a, <4 x i32> %mask1) {
   %r = select <4 x i1> %mask, <4 x double> %c, <4 x double> zeroinitializer
   ret <4 x double> %r
 }
+
+define <2 x double> @test_v2f64_broadcast_fold(<2 x double> *%a0, <2 x double> %a1) {
+; CHECK-LABEL: test_v2f64_broadcast_fold:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vaddpd (%rdi){1to2}, %xmm0, %xmm0
+; CHECK-NEXT:    retq
+  %1 = load <2 x double>, <2 x double> *%a0, align 16
+  %2 = shufflevector <2 x double> %1, <2 x double> undef, <2 x i32> zeroinitializer
+  %3 = fadd <2 x double> %2, %a1
+  ret <2 x double> %3
+}
+
+define <2 x double> @test_v2f64_broadcast_fold_mask(<2 x double> *%a0, <2 x double> %a1, <2 x i64> %mask1, <2 x double> %a2) {
+; CHECK-LABEL: test_v2f64_broadcast_fold_mask:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT:    vpcmpneqq %xmm3, %xmm1, %k1
+; CHECK-NEXT:    vaddpd (%rdi){1to2}, %xmm0, %xmm2 {%k1}
+; CHECK-NEXT:    vmovapd %xmm2, %xmm0
+; CHECK-NEXT:    retq
+  %mask = icmp ne <2 x i64> %mask1, zeroinitializer
+  %1 = load <2 x double>, <2 x double> *%a0, align 16
+  %2 = shufflevector <2 x double> %1, <2 x double> undef, <2 x i32> zeroinitializer
+  %3 = fadd <2 x double> %2, %a1
+  %4 = select <2 x i1> %mask, <2 x double> %3, <2 x double> %a2
+  ret <2 x double> %4
+}
-- 
2.40.0
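
As a quick illustration of the codegen effect, here is a minimal LLVM IR sketch. It is not part of the patch; the function @splat_load_add and its value names are invented for this example, and it simply mirrors the test_v2f64_broadcast_fold test added above. Built with something like llc -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512vl, the splatted load should now be selected as an embedded broadcast operand (vaddpd (%rdi){1to2}, %xmm0, %xmm0), whereas before this change the v2f64 splat lowered to X86ISD::MOVDDUP and the load was typically folded into a separate vmovddup rather than into the add.

; Illustrative sketch only, not part of the patch; names are invented.
; Compile with, e.g.: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512vl
define <2 x double> @splat_load_add(<2 x double> *%p, <2 x double> %x) {
  ; Load a v2f64, splat lane 0, and add it to %x. With AVX512VL the broadcast
  ; load can fold into the vaddpd as an embedded {1to2} broadcast operand.
  %v = load <2 x double>, <2 x double> *%p, align 16
  %splat = shufflevector <2 x double> %v, <2 x double> undef, <2 x i32> zeroinitializer
  %sum = fadd <2 x double> %splat, %x
  ret <2 x double> %sum
}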