From: Craig Topper
Date: Mon, 1 Jul 2019 07:09:31 +0000 (+0000)
Subject: [X86] Add a DAG combine to replace vector loads feeding a v4i32->v2f64 CVTSI2FP/CVTUI...
X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=59a3445a99adc43eae0cda24a29b4905f1a4eeef;p=llvm

[X86] Add a DAG combine to replace vector loads feeding a v4i32->v2f64
CVTSI2FP/CVTUI2FP node with a vzload. But only when the load isn't volatile.

This improves load folding during isel where we only have vzload and
scalar_to_vector+load patterns. We can't have full vector load isel patterns
for the same volatile load issue.

Also add some missing masked cvtsi2fp/cvtui2fp with vzload patterns.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@364728 91177308-0d34-0410-b5e6-96231b3b80d8
---

diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index 0c0b788231c..9286ef4f066 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -41101,6 +41101,34 @@ static SDValue combineX86INT_TO_FP(SDNode *N, SelectionDAG &DAG,
                                      KnownZero, DCI))
     return SDValue(N, 0);
 
+  // Convert a full vector load into vzload when not all bits are needed.
+  SDValue In = N->getOperand(0);
+  MVT InVT = In.getSimpleValueType();
+  if (VT.getVectorNumElements() < InVT.getVectorNumElements() &&
+      ISD::isNormalLoad(In.getNode()) && In.hasOneUse()) {
+    assert(InVT.is128BitVector() && "Expected 128-bit input vector");
+    LoadSDNode *LN = cast<LoadSDNode>(N->getOperand(0));
+    // Unless the load is volatile.
+    if (!LN->isVolatile()) {
+      SDLoc dl(N);
+      unsigned NumBits = InVT.getScalarSizeInBits() * VT.getVectorNumElements();
+      MVT MemVT = MVT::getIntegerVT(NumBits);
+      MVT LoadVT = MVT::getVectorVT(MemVT, 128 / NumBits);
+      SDVTList Tys = DAG.getVTList(LoadVT, MVT::Other);
+      SDValue Ops[] = { LN->getChain(), LN->getBasePtr() };
+      SDValue VZLoad =
+          DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, dl, Tys, Ops, MemVT,
+                                  LN->getPointerInfo(),
+                                  LN->getAlignment(),
+                                  LN->getMemOperand()->getFlags());
+      SDValue Convert = DAG.getNode(N->getOpcode(), dl, VT,
+                                    DAG.getBitcast(InVT, VZLoad));
+      DCI.CombineTo(N, Convert);
+      DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), VZLoad.getValue(1));
+      return SDValue(N, 0);
+    }
+  }
+
   return SDValue();
 }
 
diff --git a/lib/Target/X86/X86InstrAVX512.td b/lib/Target/X86/X86InstrAVX512.td
index 6452e632f40..2c9b6f127a5 100644
--- a/lib/Target/X86/X86InstrAVX512.td
+++ b/lib/Target/X86/X86InstrAVX512.td
@@ -8429,9 +8429,25 @@ def : Pat<(v2f64 (X86VUintToFP (v4i32 VR128X:$src1))),
 
 let Predicates = [HasVLX] in {
   def : Pat<(v2f64 (X86VSintToFP (bc_v4i32 (v2i64 (X86vzload addr:$src))))),
             (VCVTDQ2PDZ128rm addr:$src)>;
+  def : Pat<(v2f64 (vselect VK2WM:$mask,
+                            (X86VSintToFP (bc_v4i32 (v2i64 (X86vzload addr:$src)))),
+                            VR128X:$src0)),
+            (VCVTDQ2PDZ128rmk VR128X:$src0, VK2WM:$mask, addr:$src)>;
+  def : Pat<(v2f64 (vselect VK2WM:$mask,
+                            (X86VSintToFP (bc_v4i32 (v2i64 (X86vzload addr:$src)))),
+                            v2f64x_info.ImmAllZerosV)),
+            (VCVTDQ2PDZ128rmkz VK2WM:$mask, addr:$src)>;
   def : Pat<(v2f64 (X86VUintToFP (bc_v4i32 (v2i64 (X86vzload addr:$src))))),
             (VCVTUDQ2PDZ128rm addr:$src)>;
+  def : Pat<(v2f64 (vselect VK2WM:$mask,
+                            (X86VUintToFP (bc_v4i32 (v2i64 (X86vzload addr:$src)))),
+                            VR128X:$src0)),
+            (VCVTUDQ2PDZ128rmk VR128X:$src0, VK2WM:$mask, addr:$src)>;
+  def : Pat<(v2f64 (vselect VK2WM:$mask,
+                            (X86VUintToFP (bc_v4i32 (v2i64 (X86vzload addr:$src)))),
+                            v2f64x_info.ImmAllZerosV)),
+            (VCVTUDQ2PDZ128rmkz VK2WM:$mask, addr:$src)>;
 }
 
 let Predicates = [HasDQI, HasVLX] in {
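For illustration, the kind of case this combine targets looks roughly like the
reproducer below; the function name and the trailing shufflevector are
illustrative here, not copied verbatim from the test files. A full <4 x i32>
load feeds a v4i32->v2f64 conversion whose result only needs values from the
low two input elements, so the load can be shrunk to a 64-bit vzload and then
folded into the conversion during isel:

    ; Full 128-bit load, but only the low two i32 elements reach the result.
    define <2 x double> @sitofp_low2_from_4i32_load(<4 x i32>* %p) {
      %v = load <4 x i32>, <4 x i32>* %p
      %f = sitofp <4 x i32> %v to <4 x double>
      ; Keep only the low two converted lanes.
      %lo = shufflevector <4 x double> %f, <4 x double> undef, <2 x i32> <i32 0, i32 1>
      ret <2 x double> %lo
    }

In the test updates below, the AVX output for such cases goes from
"vmovaps (%rdi), %xmm0" + "vcvtdq2pd %xmm0, %xmm0" to a single folded
"vcvtdq2pd (%rdi), %xmm0", and the AVX512F/AVX512DQ paths load only 64 bits
(vmovsd) instead of the full vector.
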
diff --git a/test/CodeGen/X86/vec_int_to_fp-widen.ll b/test/CodeGen/X86/vec_int_to_fp-widen.ll
index 0f2e35f2e29..6891a3d0245 100644
--- a/test/CodeGen/X86/vec_int_to_fp-widen.ll
+++ b/test/CodeGen/X86/vec_int_to_fp-widen.ll
@@ -3122,14 +3122,12 @@ define <2 x double> @sitofp_volatile_load_4i32_to_2f64(<4 x i32> *%a) {
 define <2 x double> @sitofp_load_4i32_to_2f64_2(<4 x i32>* %x) {
 ; SSE-LABEL: sitofp_load_4i32_to_2f64_2:
 ; SSE:       # %bb.0:
-; SSE-NEXT:    movaps (%rdi), %xmm0
-; SSE-NEXT:    cvtdq2pd %xmm0, %xmm0
+; SSE-NEXT:    cvtdq2pd (%rdi), %xmm0
 ; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: sitofp_load_4i32_to_2f64_2:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    vmovaps (%rdi), %xmm0
-; AVX-NEXT:    vcvtdq2pd %xmm0, %xmm0
+; AVX-NEXT:    vcvtdq2pd (%rdi), %xmm0
 ; AVX-NEXT:    retq
   %a = load <4 x i32>, <4 x i32>* %x
   %b = sitofp <4 x i32> %a to <4 x double>
@@ -3597,7 +3595,7 @@ define <2 x double> @uitofp_load_4i32_to_2f64_2(<4 x i32>* %x) {
 ;
 ; AVX512F-LABEL: uitofp_load_4i32_to_2f64_2:
 ; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    vmovaps (%rdi), %xmm0
+; AVX512F-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
 ; AVX512F-NEXT:    vcvtudq2pd %ymm0, %zmm0
 ; AVX512F-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
 ; AVX512F-NEXT:    vzeroupper
@@ -3605,13 +3603,12 @@ define <2 x double> @uitofp_load_4i32_to_2f64_2(<4 x i32>* %x) {
 ;
 ; AVX512VL-LABEL: uitofp_load_4i32_to_2f64_2:
 ; AVX512VL:       # %bb.0:
-; AVX512VL-NEXT:    vmovaps (%rdi), %xmm0
-; AVX512VL-NEXT:    vcvtudq2pd %xmm0, %xmm0
+; AVX512VL-NEXT:    vcvtudq2pd (%rdi), %xmm0
 ; AVX512VL-NEXT:    retq
 ;
 ; AVX512DQ-LABEL: uitofp_load_4i32_to_2f64_2:
 ; AVX512DQ:       # %bb.0:
-; AVX512DQ-NEXT:    vmovaps (%rdi), %xmm0
+; AVX512DQ-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
 ; AVX512DQ-NEXT:    vcvtudq2pd %ymm0, %zmm0
 ; AVX512DQ-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
 ; AVX512DQ-NEXT:    vzeroupper
@@ -3619,8 +3616,7 @@ define <2 x double> @uitofp_load_4i32_to_2f64_2(<4 x i32>* %x) {
 ;
 ; AVX512VLDQ-LABEL: uitofp_load_4i32_to_2f64_2:
 ; AVX512VLDQ:       # %bb.0:
-; AVX512VLDQ-NEXT:    vmovaps (%rdi), %xmm0
-; AVX512VLDQ-NEXT:    vcvtudq2pd %xmm0, %xmm0
+; AVX512VLDQ-NEXT:    vcvtudq2pd (%rdi), %xmm0
 ; AVX512VLDQ-NEXT:    retq
   %a = load <4 x i32>, <4 x i32>* %x
   %b = uitofp <4 x i32> %a to <4 x double>
diff --git a/test/CodeGen/X86/vec_int_to_fp.ll b/test/CodeGen/X86/vec_int_to_fp.ll
index 18d3cab9df0..ceacb6c7be1 100644
--- a/test/CodeGen/X86/vec_int_to_fp.ll
+++ b/test/CodeGen/X86/vec_int_to_fp.ll
@@ -3122,14 +3122,12 @@ define <2 x double> @sitofp_volatile_load_4i32_to_2f64(<4 x i32> *%a) {
 define <2 x double> @sitofp_load_4i32_to_2f64_2(<4 x i32>* %x) {
 ; SSE-LABEL: sitofp_load_4i32_to_2f64_2:
 ; SSE:       # %bb.0:
-; SSE-NEXT:    movaps (%rdi), %xmm0
-; SSE-NEXT:    cvtdq2pd %xmm0, %xmm0
+; SSE-NEXT:    cvtdq2pd (%rdi), %xmm0
 ; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: sitofp_load_4i32_to_2f64_2:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    vmovaps (%rdi), %xmm0
-; AVX-NEXT:    vcvtdq2pd %xmm0, %xmm0
+; AVX-NEXT:    vcvtdq2pd (%rdi), %xmm0
 ; AVX-NEXT:    retq
   %a = load <4 x i32>, <4 x i32>* %x
   %b = sitofp <4 x i32> %a to <4 x double>
@@ -3595,7 +3593,7 @@ define <2 x double> @uitofp_load_4i32_to_2f64_2(<4 x i32>* %x) {
 ;
 ; AVX512F-LABEL: uitofp_load_4i32_to_2f64_2:
 ; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    vmovaps (%rdi), %xmm0
+; AVX512F-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
 ; AVX512F-NEXT:    vcvtudq2pd %ymm0, %zmm0
 ; AVX512F-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
 ; AVX512F-NEXT:    vzeroupper
@@ -3603,13 +3601,12 @@ define <2 x double> @uitofp_load_4i32_to_2f64_2(<4 x i32>* %x) {
 ;
 ; AVX512VL-LABEL: uitofp_load_4i32_to_2f64_2:
 ; AVX512VL:       # %bb.0:
-; AVX512VL-NEXT:    vmovaps (%rdi), %xmm0
-; AVX512VL-NEXT:    vcvtudq2pd %xmm0, %xmm0
+; AVX512VL-NEXT:    vcvtudq2pd (%rdi), %xmm0
 ; AVX512VL-NEXT:    retq
 ;
 ; AVX512DQ-LABEL: uitofp_load_4i32_to_2f64_2:
 ; AVX512DQ:       # %bb.0:
-; AVX512DQ-NEXT:    vmovaps (%rdi), %xmm0
+; AVX512DQ-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
 ; AVX512DQ-NEXT:    vcvtudq2pd %ymm0, %zmm0
 ; AVX512DQ-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
 ; AVX512DQ-NEXT:    vzeroupper
@@ -3617,8 +3614,7 @@ define <2 x double> @uitofp_load_4i32_to_2f64_2(<4 x i32>* %x) {
 ;
 ; AVX512VLDQ-LABEL: uitofp_load_4i32_to_2f64_2:
 ; AVX512VLDQ:       # %bb.0:
-; AVX512VLDQ-NEXT:    vmovaps (%rdi), %xmm0
-; AVX512VLDQ-NEXT:    vcvtudq2pd %xmm0, %xmm0
+; AVX512VLDQ-NEXT:    vcvtudq2pd (%rdi), %xmm0
 ; AVX512VLDQ-NEXT:    retq
   %a = load <4 x i32>, <4 x i32>* %x
   %b = uitofp <4 x i32> %a to <4 x double>