From 09b0be25dc15d7a1a16f5801693f2da98f31491b Mon Sep 17 00:00:00 2001 From: Sanjay Patel Date: Wed, 16 Nov 2016 22:34:05 +0000 Subject: [PATCH] [x86] allow FP-logic ops when one operand is FP and result is FP We save an inter-register file move this way. If there's any CPU where the FP logic is slower, we could transform this back to int-logic in MachineCombiner. This helps, but doesn't solve, PR6137: https://llvm.org/bugs/show_bug.cgi?id=6137 The 'andn' test shows that we're missing a pattern match to recognize the xor with -1 constant as a 'not' op. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@287171 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Target/X86/X86ISelLowering.cpp | 40 ++++++++++++++++++---------- test/CodeGen/X86/fp-logic-replace.ll | 16 +++++------ test/CodeGen/X86/fp-logic.ll | 18 +++++-------- 3 files changed, 38 insertions(+), 36 deletions(-) diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp index 9397f8c9cf5..01624a76a8e 100644 --- a/lib/Target/X86/X86ISelLowering.cpp +++ b/lib/Target/X86/X86ISelLowering.cpp @@ -26971,11 +26971,10 @@ static SDValue combineBitcast(SDNode *N, SelectionDAG &DAG, } // Convert a bitcasted integer logic operation that has one bitcasted - // floating-point operand and one constant operand into a floating-point - // logic operation. This may create a load of the constant, but that is - // cheaper than materializing the constant in an integer register and - // transferring it to an SSE register or transferring the SSE operand to - // integer register and back. + // floating-point operand into a floating-point logic operation. This may + // create a load of a constant, but that is cheaper than materializing the + // constant in an integer register and transferring it to an SSE register or + // transferring the SSE operand to integer register and back. 
unsigned FPOpcode; switch (N0.getOpcode()) { case ISD::AND: FPOpcode = X86ISD::FAND; break; @@ -26983,20 +26982,33 @@ static SDValue combineBitcast(SDNode *N, SelectionDAG &DAG, case ISD::XOR: FPOpcode = X86ISD::FXOR; break; default: return SDValue(); } - if (((Subtarget.hasSSE1() && VT == MVT::f32) || - (Subtarget.hasSSE2() && VT == MVT::f64)) && - isa<ConstantSDNode>(N0.getOperand(1)) && - N0.getOperand(0).getOpcode() == ISD::BITCAST && - N0.getOperand(0).getOperand(0).getValueType() == VT) { - SDValue N000 = N0.getOperand(0).getOperand(0); - SDValue FPConst = DAG.getBitcast(VT, N0.getOperand(1)); - return DAG.getNode(FPOpcode, SDLoc(N0), VT, N000, FPConst); + + if (!((Subtarget.hasSSE1() && VT == MVT::f32) || + (Subtarget.hasSSE2() && VT == MVT::f64))) + return SDValue(); + + SDValue LogicOp0 = N0.getOperand(0); + SDValue LogicOp1 = N0.getOperand(1); + SDLoc DL0(N0); + + // bitcast(logic(bitcast(X), Y)) --> logic'(X, bitcast(Y)) + if (N0.hasOneUse() && LogicOp0.getOpcode() == ISD::BITCAST && + LogicOp0.hasOneUse() && LogicOp0.getOperand(0).getValueType() == VT && + !isa<ConstantSDNode>(LogicOp0.getOperand(0))) { + SDValue CastedOp1 = DAG.getBitcast(VT, LogicOp1); + return DAG.getNode(FPOpcode, DL0, VT, LogicOp0.getOperand(0), CastedOp1); + } + // bitcast(logic(X, bitcast(Y))) --> logic'(bitcast(X), Y) + if (N0.hasOneUse() && LogicOp1.getOpcode() == ISD::BITCAST && + LogicOp1.hasOneUse() && LogicOp1.getOperand(0).getValueType() == VT && + !isa<ConstantSDNode>(LogicOp1.getOperand(0))) { + SDValue CastedOp0 = DAG.getBitcast(VT, LogicOp0); + return DAG.getNode(FPOpcode, DL0, VT, LogicOp1.getOperand(0), CastedOp0); } return SDValue(); } - // Match a binop + shuffle pyramid that represents a horizontal reduction over // the elements of a vector. 
// Returns the vector that is being reduced on, or SDValue() if a reduction diff --git a/test/CodeGen/X86/fp-logic-replace.ll b/test/CodeGen/X86/fp-logic-replace.ll index 47e07688702..50e2c1b2029 100644 --- a/test/CodeGen/X86/fp-logic-replace.ll +++ b/test/CodeGen/X86/fp-logic-replace.ll @@ -29,20 +29,16 @@ define double @FsANDPSrr(double %x, double %y) { define double @FsANDNPSrr(double %x, double %y) { ; SSE-LABEL: FsANDNPSrr: ; SSE: # BB#0: -; SSE-NEXT: movd %xmm0, %rax -; SSE-NEXT: movd %xmm1, %rcx -; SSE-NEXT: notq %rcx -; SSE-NEXT: andq %rax, %rcx -; SSE-NEXT: movd %rcx, %xmm0 +; SSE-NEXT: movsd {{.*#+}} xmm2 = mem[0],zero +; SSE-NEXT: xorpd %xmm1, %xmm2 +; SSE-NEXT: andpd %xmm2, %xmm0 ; SSE-NEXT: retq ; ; AVX-LABEL: FsANDNPSrr: ; AVX: # BB#0: -; AVX-NEXT: vmovq %xmm0, %rax -; AVX-NEXT: vmovq %xmm1, %rcx -; AVX-NEXT: notq %rcx -; AVX-NEXT: andq %rax, %rcx -; AVX-NEXT: vmovq %rcx, %xmm0 +; AVX-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero +; AVX-NEXT: vxorpd %xmm2, %xmm1, %xmm1 +; AVX-NEXT: vandpd %xmm1, %xmm0, %xmm0 ; AVX-NEXT: retq ; %bc1 = bitcast double %x to i64 diff --git a/test/CodeGen/X86/fp-logic.ll b/test/CodeGen/X86/fp-logic.ll index 2c6698fb120..301fa8f4137 100644 --- a/test/CodeGen/X86/fp-logic.ll +++ b/test/CodeGen/X86/fp-logic.ll @@ -3,13 +3,9 @@ ; PR22428: https://llvm.org/bugs/show_bug.cgi?id=22428 ; f1, f2, f3, and f4 should use an integer logic instruction. -; f9 and f10 should use an FP (SSE) logic instruction. +; f5, f6, f9, and f10 should use an FP (SSE) logic instruction. ; -; f5, f6, f7, and f8 are less clear. -; -; For f5 and f6, we can save a register move by using an FP logic instruction, -; but we may need to calculate the relative costs of an SSE op vs. int op vs. -; scalar <-> SSE register moves. +; f7 and f8 are less clear. 
; ; For f7 and f8, the SSE instructions don't take immediate operands, so if we ; use one of those, we either have to load a constant from memory or move the @@ -79,9 +75,8 @@ define i32 @f4(float %x) { define float @f5(float %x, i32 %y) { ; CHECK-LABEL: f5: ; CHECK: # BB#0: -; CHECK-NEXT: movd %xmm0, %eax -; CHECK-NEXT: andl %edi, %eax -; CHECK-NEXT: movd %eax, %xmm0 +; CHECK-NEXT: movd %edi, %xmm1 +; CHECK-NEXT: andps %xmm1, %xmm0 ; CHECK-NEXT: retq ; %bc1 = bitcast float %x to i32 @@ -95,9 +90,8 @@ define float @f5(float %x, i32 %y) { define float @f6(float %x, i32 %y) { ; CHECK-LABEL: f6: ; CHECK: # BB#0: -; CHECK-NEXT: movd %xmm0, %eax -; CHECK-NEXT: andl %edi, %eax -; CHECK-NEXT: movd %eax, %xmm0 +; CHECK-NEXT: movd %edi, %xmm1 +; CHECK-NEXT: andps %xmm1, %xmm0 ; CHECK-NEXT: retq ; %bc1 = bitcast float %x to i32 -- 2.40.0