From 88f49fa5ef0ce7ac29ab88d1883c2a41b586f394 Mon Sep 17 00:00:00 2001
From: Sanjay Patel
Date: Sun, 28 Apr 2019 12:23:43 +0000
Subject: [PATCH] [DAGCombiner] try repeated fdiv divisor transform before
 building estimate

This was originally part of D61028, but it's an independent diff.

If we try the repeated divisor reciprocal transform before producing an
estimate sequence, then we have an opportunity to use scalar fdiv. On x86,
the trade-off is 1 divss vs. 5 vector FP ops in the default estimate
sequence. On recent chips (Skylake, Ryzen), the full-precision division is
only 3 cycle throughput, so that's probably the better perf default option
and avoids problems from x86's inaccurate estimates.

The last 2 tests show that users still have the option to override the
defaults by using the function attributes for reciprocal estimates, but
those patterns are potentially made faster by converting the vector ops
(including ymm ops) to scalar math.

Differential Revision: https://reviews.llvm.org/D61149

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@359398 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/CodeGen/SelectionDAG/DAGCombiner.cpp |  6 +--
 test/CodeGen/X86/fdiv-combine-vec.ll     | 66 ++++++++++--------------
 2 files changed, 30 insertions(+), 42 deletions(-)

diff --git a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 661df98212a..5d2804c711a 100644
--- a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -11992,6 +11992,9 @@ SDValue DAGCombiner::visitFDIV(SDNode *N) {
   if (SDValue NewSel = foldBinOpIntoSelect(N))
     return NewSel;
 
+  if (SDValue V = combineRepeatedFPDivisors(N))
+    return V;
+
   if (Options.UnsafeFPMath || Flags.hasAllowReciprocal()) {
     // fold (fdiv X, c2) -> fmul X, 1/c2 if losing precision is acceptable.
     if (N1CFP) {
@@ -12081,9 +12084,6 @@ SDValue DAGCombiner::visitFDIV(SDNode *N) {
     }
   }
 
-  if (SDValue CombineRepeatedDivisors = combineRepeatedFPDivisors(N))
-    return CombineRepeatedDivisors;
-
   return SDValue();
 }
 
diff --git a/test/CodeGen/X86/fdiv-combine-vec.ll b/test/CodeGen/X86/fdiv-combine-vec.ll
index 6de3f31892d..825f8a50f96 100644
--- a/test/CodeGen/X86/fdiv-combine-vec.ll
+++ b/test/CodeGen/X86/fdiv-combine-vec.ll
@@ -51,25 +51,17 @@ define <4 x double> @splat_fdiv_v4f64(<4 x double> %x, double %y) {
 define <4 x float> @splat_fdiv_v4f32(<4 x float> %x, float %y) {
 ; SSE-LABEL: splat_fdiv_v4f32:
 ; SSE:       # %bb.0:
-; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,0,0,0]
-; SSE-NEXT:    rcpps %xmm1, %xmm2
-; SSE-NEXT:    mulps %xmm2, %xmm1
-; SSE-NEXT:    movaps {{.*#+}} xmm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
-; SSE-NEXT:    subps %xmm1, %xmm3
-; SSE-NEXT:    mulps %xmm2, %xmm3
-; SSE-NEXT:    addps %xmm2, %xmm3
-; SSE-NEXT:    mulps %xmm3, %xmm0
+; SSE-NEXT:    movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; SSE-NEXT:    divss %xmm1, %xmm2
+; SSE-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,0,0,0]
+; SSE-NEXT:    mulps %xmm2, %xmm0
 ; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: splat_fdiv_v4f32:
 ; AVX:       # %bb.0:
+; AVX-NEXT:    vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; AVX-NEXT:    vdivss %xmm1, %xmm2, %xmm1
 ; AVX-NEXT:    vpermilps {{.*#+}} xmm1 = xmm1[0,0,0,0]
-; AVX-NEXT:    vrcpps %xmm1, %xmm2
-; AVX-NEXT:    vmulps %xmm2, %xmm1, %xmm1
-; AVX-NEXT:    vmovaps {{.*#+}} xmm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
-; AVX-NEXT:    vsubps %xmm1, %xmm3, %xmm1
-; AVX-NEXT:    vmulps %xmm1, %xmm2, %xmm1
-; AVX-NEXT:    vaddps %xmm1, %xmm2, %xmm1
 ; AVX-NEXT:    vmulps %xmm1, %xmm0, %xmm0
 ; AVX-NEXT:    retq
   %vy = insertelement <4 x float> undef, float %y, i32 0
@@ -90,14 +82,10 @@ define <8 x float> @splat_fdiv_v8f32(<8 x float> %x, float %y) {
 ;
 ; AVX-LABEL: splat_fdiv_v8f32:
 ; AVX:       # %bb.0:
+; AVX-NEXT:    vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; AVX-NEXT:    vdivss %xmm1, %xmm2, %xmm1
 ; AVX-NEXT:    vpermilps {{.*#+}} xmm1 = xmm1[0,0,0,0]
 ; AVX-NEXT:    vinsertf128 $1, %xmm1, %ymm1, %ymm1
-; AVX-NEXT:    vrcpps %ymm1, %ymm2
-; AVX-NEXT:    vmulps %ymm2, %ymm1, %ymm1
-; AVX-NEXT:    vmovaps {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
-; AVX-NEXT:    vsubps %ymm1, %ymm3, %ymm1
-; AVX-NEXT:    vmulps %ymm1, %ymm2, %ymm1
-; AVX-NEXT:    vaddps %ymm1, %ymm2, %ymm1
 ; AVX-NEXT:    vmulps %ymm1, %ymm0, %ymm0
 ; AVX-NEXT:    retq
   %vy = insertelement <8 x float> undef, float %y, i32 0
@@ -109,25 +97,25 @@ define <8 x float> @splat_fdiv_v8f32(<8 x float> %x, float %y) {
 define <4 x float> @splat_fdiv_v4f32_estimate(<4 x float> %x, float %y) #0 {
 ; SSE-LABEL: splat_fdiv_v4f32_estimate:
 ; SSE:       # %bb.0:
-; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,0,0,0]
-; SSE-NEXT:    rcpps %xmm1, %xmm2
-; SSE-NEXT:    mulps %xmm2, %xmm1
-; SSE-NEXT:    movaps {{.*#+}} xmm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
-; SSE-NEXT:    subps %xmm1, %xmm3
-; SSE-NEXT:    mulps %xmm2, %xmm3
-; SSE-NEXT:    addps %xmm2, %xmm3
+; SSE-NEXT:    rcpss %xmm1, %xmm2
+; SSE-NEXT:    mulss %xmm2, %xmm1
+; SSE-NEXT:    movss {{.*#+}} xmm3 = mem[0],zero,zero,zero
+; SSE-NEXT:    subss %xmm1, %xmm3
+; SSE-NEXT:    mulss %xmm2, %xmm3
+; SSE-NEXT:    addss %xmm2, %xmm3
+; SSE-NEXT:    shufps {{.*#+}} xmm3 = xmm3[0,0,0,0]
 ; SSE-NEXT:    mulps %xmm3, %xmm0
 ; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: splat_fdiv_v4f32_estimate:
 ; AVX:       # %bb.0:
+; AVX-NEXT:    vrcpss %xmm1, %xmm1, %xmm2
+; AVX-NEXT:    vmulss %xmm2, %xmm1, %xmm1
+; AVX-NEXT:    vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero
+; AVX-NEXT:    vsubss %xmm1, %xmm3, %xmm1
+; AVX-NEXT:    vmulss %xmm1, %xmm2, %xmm1
+; AVX-NEXT:    vaddss %xmm1, %xmm2, %xmm1
 ; AVX-NEXT:    vpermilps {{.*#+}} xmm1 = xmm1[0,0,0,0]
-; AVX-NEXT:    vrcpps %xmm1, %xmm2
-; AVX-NEXT:    vmulps %xmm2, %xmm1, %xmm1
-; AVX-NEXT:    vmovaps {{.*#+}} xmm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
-; AVX-NEXT:    vsubps %xmm1, %xmm3, %xmm1
-; AVX-NEXT:    vmulps %xmm1, %xmm2, %xmm1
-; AVX-NEXT:    vaddps %xmm1, %xmm2, %xmm1
 ; AVX-NEXT:    vmulps %xmm1, %xmm0, %xmm0
 ; AVX-NEXT:    retq
   %vy = insertelement <4 x float> undef, float %y, i32 0
@@ -152,14 +140,14 @@ define <8 x float> @splat_fdiv_v8f32_estimate(<8 x float> %x, float %y) #0 {
 ;
 ; AVX-LABEL: splat_fdiv_v8f32_estimate:
 ; AVX:       # %bb.0:
+; AVX-NEXT:    vrcpss %xmm1, %xmm1, %xmm2
+; AVX-NEXT:    vmulss %xmm2, %xmm1, %xmm1
+; AVX-NEXT:    vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero
+; AVX-NEXT:    vsubss %xmm1, %xmm3, %xmm1
+; AVX-NEXT:    vmulss %xmm1, %xmm2, %xmm1
+; AVX-NEXT:    vaddss %xmm1, %xmm2, %xmm1
 ; AVX-NEXT:    vpermilps {{.*#+}} xmm1 = xmm1[0,0,0,0]
 ; AVX-NEXT:    vinsertf128 $1, %xmm1, %ymm1, %ymm1
-; AVX-NEXT:    vrcpps %ymm1, %ymm2
-; AVX-NEXT:    vmulps %ymm2, %ymm1, %ymm1
-; AVX-NEXT:    vmovaps {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
-; AVX-NEXT:    vsubps %ymm1, %ymm3, %ymm1
-; AVX-NEXT:    vmulps %ymm1, %ymm2, %ymm1
-; AVX-NEXT:    vaddps %ymm1, %ymm2, %ymm1
 ; AVX-NEXT:    vmulps %ymm1, %ymm0, %ymm0
 ; AVX-NEXT:    retq
   %vy = insertelement <8 x float> undef, float %y, i32 0
-- 
2.50.1
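
For reference, a minimal standalone reproducer in the spirit of the tests
above (the function name here is illustrative, and the `arcp` flag on the
fdiv -- or an unsafe-fp-math setting -- is assumed, since
combineRepeatedFPDivisors only fires when reciprocal math is allowed).
Compiled with llc for an AVX-capable x86-64 target, the expectation after
this change is one scalar vdivss plus a splat and vmulps rather than a
vrcpps-based estimate sequence; the exact output depends on the target and
function attributes.

  ; Divide a vector by a splatted scalar divisor; the divisor splat lets the
  ; combiner replace the vector fdiv with a scalar reciprocal and a multiply.
  define <4 x float> @splat_fdiv_repro(<4 x float> %x, float %y) {
    %vy = insertelement <4 x float> undef, float %y, i32 0
    %splaty = shufflevector <4 x float> %vy, <4 x float> undef, <4 x i32> zeroinitializer
    %r = fdiv arcp <4 x float> %x, %splaty
    ret <4 x float> %r
  }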