From: Simon Pilgrim Date: Thu, 6 Jun 2019 17:04:13 +0000 (+0000) Subject: [DAGCombine] MergeConsecutiveStores - improve non-temporal load\store handling (PR42123) X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=deea5e593e435f2f734ad4df24e5aa0cd6038884;p=llvm [DAGCombine] MergeConsecutiveStores - improve non-temporal load\store handling (PR42123) This patch is the first step towards ensuring MergeConsecutiveStores correctly handles non-temporal loads\stores: 1 - When merging load\stores we must ensure that they all have the same non-temporal flag. This is unlikely to occur, but can in strange cases where we're storing at the end of one page and the beginning of another. 2 - The merged load\store node must retain the non-temporal flag. Differential Revision: https://reviews.llvm.org/D62910 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@362723 91177308-0d34-0410-b5e6-96231b3b80d8 --- diff --git a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index 940874b43c3..f78f775017a 100644 --- a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -14958,14 +14958,18 @@ void DAGCombiner::getStoreMergeCandidates( // Loads must only have one use. if (!Ld->hasNUsesOfValue(1, 0)) return; - // The memory operands must not be volatile. + // The memory operands must not be volatile/indexed. if (Ld->isVolatile() || Ld->isIndexed()) return; } auto CandidateMatch = [&](StoreSDNode *Other, BaseIndexOffset &Ptr, int64_t &Offset) -> bool { + // The memory operands must not be volatile/indexed. if (Other->isVolatile() || Other->isIndexed()) return false; + // Don't mix temporal stores with non-temporal stores. + if (St->isNonTemporal() != Other->isNonTemporal()) + return false; SDValue OtherBC = peekThroughBitcasts(Other->getValue()); // Allow merging constants of different types as integers. bool NoTypeMatch = (MemVT.isInteger()) ? !MemVT.bitsEq(Other->getMemoryVT()) @@ -14975,15 +14979,18 @@ void DAGCombiner::getStoreMergeCandidates( return false; // The Load's Base Ptr must also match if (LoadSDNode *OtherLd = dyn_cast(OtherBC)) { - auto LPtr = BaseIndexOffset::match(OtherLd, DAG); + BaseIndexOffset LPtr = BaseIndexOffset::match(OtherLd, DAG); if (LoadVT != OtherLd->getMemoryVT()) return false; // Loads must only have one use. if (!OtherLd->hasNUsesOfValue(1, 0)) return false; - // The memory operands must not be volatile. + // The memory operands must not be volatile/indexed. if (OtherLd->isVolatile() || OtherLd->isIndexed()) return false; + // Don't mix temporal loads with non-temporal loads. + if (cast(Val)->isNonTemporal() != OtherLd->isNonTemporal()) + return false; if (!(LBasePtr.equalBaseIndex(LPtr, DAG))) return false; } else @@ -15140,6 +15147,9 @@ bool DAGCombiner::MergeConsecutiveStores(StoreSDNode *St) { isa(StoredVal); bool IsExtractVecSrc = (StoredVal.getOpcode() == ISD::EXTRACT_VECTOR_ELT || StoredVal.getOpcode() == ISD::EXTRACT_SUBVECTOR); + bool IsNonTemporalStore = St->isNonTemporal(); + bool IsNonTemporalLoad = + IsLoadSrc && cast(StoredVal)->isNonTemporal(); if (!IsConstantSrc && !IsLoadSrc && !IsExtractVecSrc) return false; @@ -15583,26 +15593,32 @@ bool DAGCombiner::MergeConsecutiveStores(StoreSDNode *St) { SDValue NewStoreChain = getMergeStoreChains(StoreNodes, NumElem); AddToWorklist(NewStoreChain.getNode()); - MachineMemOperand::Flags MMOFlags = + MachineMemOperand::Flags LdMMOFlags = isDereferenceable ? MachineMemOperand::MODereferenceable : MachineMemOperand::MONone; + if (IsNonTemporalLoad) + LdMMOFlags |= MachineMemOperand::MONonTemporal; + + MachineMemOperand::Flags StMMOFlags = + IsNonTemporalStore ? MachineMemOperand::MONonTemporal + : MachineMemOperand::MONone; SDValue NewLoad, NewStore; if (UseVectorTy || !DoIntegerTruncate) { NewLoad = DAG.getLoad(JointMemOpVT, LoadDL, FirstLoad->getChain(), FirstLoad->getBasePtr(), FirstLoad->getPointerInfo(), - FirstLoadAlign, MMOFlags); + FirstLoadAlign, LdMMOFlags); NewStore = DAG.getStore( NewStoreChain, StoreDL, NewLoad, FirstInChain->getBasePtr(), - FirstInChain->getPointerInfo(), FirstStoreAlign); + FirstInChain->getPointerInfo(), FirstStoreAlign, StMMOFlags); } else { // This must be the truncstore/extload case EVT ExtendedTy = TLI.getTypeToTransformTo(*DAG.getContext(), JointMemOpVT); NewLoad = DAG.getExtLoad(ISD::EXTLOAD, LoadDL, ExtendedTy, FirstLoad->getChain(), FirstLoad->getBasePtr(), FirstLoad->getPointerInfo(), JointMemOpVT, - FirstLoadAlign, MMOFlags); + FirstLoadAlign, LdMMOFlags); NewStore = DAG.getTruncStore(NewStoreChain, StoreDL, NewLoad, FirstInChain->getBasePtr(), FirstInChain->getPointerInfo(), diff --git a/test/CodeGen/X86/merge-consecutive-stores-nt.ll b/test/CodeGen/X86/merge-consecutive-stores-nt.ll index 9ef0ecb5fbc..714b5b5c65e 100644 --- a/test/CodeGen/X86/merge-consecutive-stores-nt.ll +++ b/test/CodeGen/X86/merge-consecutive-stores-nt.ll @@ -10,10 +10,6 @@ ; PR42123 ; -; FIXME: AVX doesn't retain NT flag on load/store. -; AVX1 load should be 2 x VMOVNTDQA xmm. -; AVX2 load should be VMOVNTDQA ymm. -; AVX store should be VMOVNTPS ymm. define void @merge_2_v4f32_align32(<4 x float>* %a0, <4 x float>* %a1) { ; X86-LABEL: merge_2_v4f32_align32: ; X86: # %bb.0: @@ -49,12 +45,20 @@ define void @merge_2_v4f32_align32(<4 x float>* %a0, <4 x float>* %a1) { ; X64-SSE41-NEXT: movntdq %xmm1, 16(%rsi) ; X64-SSE41-NEXT: retq ; -; X64-AVX-LABEL: merge_2_v4f32_align32: -; X64-AVX: # %bb.0: -; X64-AVX-NEXT: vmovaps (%rdi), %ymm0 -; X64-AVX-NEXT: vmovaps %ymm0, (%rsi) -; X64-AVX-NEXT: vzeroupper -; X64-AVX-NEXT: retq +; X64-AVX1-LABEL: merge_2_v4f32_align32: +; X64-AVX1: # %bb.0: +; X64-AVX1-NEXT: vmovntdqa (%rdi), %xmm0 +; X64-AVX1-NEXT: vmovntdqa 16(%rdi), %xmm1 +; X64-AVX1-NEXT: vmovntdq %xmm1, 16(%rsi) +; X64-AVX1-NEXT: vmovntdq %xmm0, (%rsi) +; X64-AVX1-NEXT: retq +; +; X64-AVX2-LABEL: merge_2_v4f32_align32: +; X64-AVX2: # %bb.0: +; X64-AVX2-NEXT: vmovntdqa (%rdi), %ymm0 +; X64-AVX2-NEXT: vmovntdq %ymm0, (%rsi) +; X64-AVX2-NEXT: vzeroupper +; X64-AVX2-NEXT: retq %1 = getelementptr inbounds <4 x float>, <4 x float>* %a0, i64 1, i64 0 %2 = bitcast float* %1 to <4 x float>* %3 = load <4 x float>, <4 x float>* %a0, align 32, !nontemporal !0 @@ -66,8 +70,7 @@ define void @merge_2_v4f32_align32(<4 x float>* %a0, <4 x float>* %a1) { ret void } -; FIXME: shouldn't attempt to merge nt and non-nt loads even if aligned. -; Must be kept seperate as VMOVNTDQA xmm + VMOVDQA xmm. +; Don't merge nt and non-nt loads even if aligned. define void @merge_2_v4f32_align32_mix_ntload(<4 x float>* %a0, <4 x float>* %a1) { ; X86-LABEL: merge_2_v4f32_align32_mix_ntload: ; X86: # %bb.0: @@ -105,9 +108,10 @@ define void @merge_2_v4f32_align32_mix_ntload(<4 x float>* %a0, <4 x float>* %a1 ; ; X64-AVX-LABEL: merge_2_v4f32_align32_mix_ntload: ; X64-AVX: # %bb.0: -; X64-AVX-NEXT: vmovaps (%rdi), %ymm0 -; X64-AVX-NEXT: vmovaps %ymm0, (%rsi) -; X64-AVX-NEXT: vzeroupper +; X64-AVX-NEXT: vmovntdqa (%rdi), %xmm0 +; X64-AVX-NEXT: vmovaps 16(%rdi), %xmm1 +; X64-AVX-NEXT: vmovdqa %xmm0, (%rsi) +; X64-AVX-NEXT: vmovaps %xmm1, 16(%rsi) ; X64-AVX-NEXT: retq %1 = getelementptr inbounds <4 x float>, <4 x float>* %a0, i64 1, i64 0 %2 = bitcast float* %1 to <4 x float>* @@ -120,8 +124,7 @@ define void @merge_2_v4f32_align32_mix_ntload(<4 x float>* %a0, <4 x float>* %a1 ret void } -; FIXME: shouldn't attempt to merge nt and non-nt stores even if aligned. -; Must be kept seperate as VMOVNTPS xmm + VMOVAPS xmm. +; Don't merge nt and non-nt stores even if aligned. define void @merge_2_v4f32_align32_mix_ntstore(<4 x float>* %a0, <4 x float>* %a1) { ; X86-LABEL: merge_2_v4f32_align32_mix_ntstore: ; X86: # %bb.0: @@ -143,9 +146,10 @@ define void @merge_2_v4f32_align32_mix_ntstore(<4 x float>* %a0, <4 x float>* %a ; ; X64-AVX-LABEL: merge_2_v4f32_align32_mix_ntstore: ; X64-AVX: # %bb.0: -; X64-AVX-NEXT: vmovaps (%rdi), %ymm0 -; X64-AVX-NEXT: vmovaps %ymm0, (%rsi) -; X64-AVX-NEXT: vzeroupper +; X64-AVX-NEXT: vmovaps (%rdi), %xmm0 +; X64-AVX-NEXT: vmovaps 16(%rdi), %xmm1 +; X64-AVX-NEXT: vmovntps %xmm0, (%rsi) +; X64-AVX-NEXT: vmovaps %xmm1, 16(%rsi) ; X64-AVX-NEXT: retq %1 = getelementptr inbounds <4 x float>, <4 x float>* %a0, i64 1, i64 0 %2 = bitcast float* %1 to <4 x float>* @@ -158,7 +162,7 @@ define void @merge_2_v4f32_align32_mix_ntstore(<4 x float>* %a0, <4 x float>* %a ret void } -; FIXME: AVX can't perform NT-load-ymm on 16-byte aligned memory. +; FIXME: AVX2 can't perform NT-load-ymm on 16-byte aligned memory. ; Must be kept seperate as VMOVNTDQA xmm. define void @merge_2_v4f32_align16_ntload(<4 x float>* %a0, <4 x float>* %a1) { ; X86-LABEL: merge_2_v4f32_align16_ntload: @@ -195,12 +199,20 @@ define void @merge_2_v4f32_align16_ntload(<4 x float>* %a0, <4 x float>* %a1) { ; X64-SSE41-NEXT: movdqa %xmm1, 16(%rsi) ; X64-SSE41-NEXT: retq ; -; X64-AVX-LABEL: merge_2_v4f32_align16_ntload: -; X64-AVX: # %bb.0: -; X64-AVX-NEXT: vmovups (%rdi), %ymm0 -; X64-AVX-NEXT: vmovups %ymm0, (%rsi) -; X64-AVX-NEXT: vzeroupper -; X64-AVX-NEXT: retq +; X64-AVX1-LABEL: merge_2_v4f32_align16_ntload: +; X64-AVX1: # %bb.0: +; X64-AVX1-NEXT: vmovntdqa (%rdi), %xmm0 +; X64-AVX1-NEXT: vmovntdqa 16(%rdi), %xmm1 +; X64-AVX1-NEXT: vmovdqa %xmm1, 16(%rsi) +; X64-AVX1-NEXT: vmovdqa %xmm0, (%rsi) +; X64-AVX1-NEXT: retq +; +; X64-AVX2-LABEL: merge_2_v4f32_align16_ntload: +; X64-AVX2: # %bb.0: +; X64-AVX2-NEXT: vmovups (%rdi), %ymm0 +; X64-AVX2-NEXT: vmovups %ymm0, (%rsi) +; X64-AVX2-NEXT: vzeroupper +; X64-AVX2-NEXT: retq %1 = getelementptr inbounds <4 x float>, <4 x float>* %a0, i64 1, i64 0 %2 = bitcast float* %1 to <4 x float>* %3 = load <4 x float>, <4 x float>* %a0, align 16, !nontemporal !0