return Result;
}
+/// Horizontal vector math instructions may be slower than normal math with
+/// shuffles. Limit horizontal op codegen based on size/speed trade-offs, uarch
+/// implementation, and likely shuffle complexity of the alternate sequence.
+static bool shouldUseHorizontalOp(bool IsSingleSource, SelectionDAG &DAG,
+ const X86Subtarget &Subtarget) {
+ bool IsOptimizingSize = DAG.getMachineFunction().getFunction().hasOptSize();
+ bool HasFastHOps = Subtarget.hasFastHorizontalOps();
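+  // Prefer the horizontal op when it merges two different sources (the
+  // shuffle-based alternative would need extra cross-operand shuffles), when
+  // optimizing for size, or when the target reports fast horizontal ops.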
+ return !IsSingleSource || IsOptimizingSize || HasFastHOps;
+}
+
/// 64-bit unsigned integer to double expansion.
static SDValue LowerUINT_TO_FP_i64(SDValue Op, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, XR2F, CLod1);
SDValue Result;
- if (Subtarget.hasSSE3()) {
- // FIXME: The 'haddpd' instruction may be slower than 'shuffle + addsd'.
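+  // Form FHADD only when SSE3 is available and a horizontal op is likely
+  // profitable for this target; otherwise fall through to the shuffle + add
+  // expansion below.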
+  if (Subtarget.hasSSE3() && shouldUseHorizontalOp(true, DAG, Subtarget)) {
Result = DAG.getNode(X86ISD::FHADD, dl, MVT::v2f64, Sub, Sub);
} else {
SDValue Shuffle = DAG.getVectorShuffle(MVT::v2f64, dl, Sub, Sub, {1,-1});
    Result = DAG.getNode(ISD::FADD, dl, MVT::v2f64, Shuffle, Sub);
}
-/// Horizontal vector math instructions may be slower than normal math with
-/// shuffles. Limit horizontal op codegen based on size/speed trade-offs, uarch
-/// implementation, and likely shuffle complexity of the alternate sequence.
-static bool shouldUseHorizontalOp(bool IsSingleSource, SelectionDAG &DAG,
- const X86Subtarget &Subtarget) {
- bool IsOptimizingSize = DAG.getMachineFunction().getFunction().hasOptSize();
- bool HasFastHOps = Subtarget.hasFastHorizontalOps();
- return !IsSingleSource || IsOptimizingSize || HasFastHOps;
-}
-
/// Depending on uarch and/or optimizing for size, we might prefer to use a
/// vector operation in place of the typical scalar operation.
static SDValue lowerAddSubToHorizontalOp(SDValue Op, SelectionDAG &DAG,
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=SSE,SSE2
-; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ssse3 | FileCheck %s --check-prefixes=SSE,SSSE3
-; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1
-; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=SSE,SSE2
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ssse3 | FileCheck %s --check-prefixes=SSE,SSSE3,SSSE3-SLOW
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ssse3,fast-hops | FileCheck %s --check-prefixes=SSE,SSSE3,SSSE3-FAST
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1,AVX1-SLOW
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx,fast-hops | FileCheck %s --check-prefixes=AVX,AVX1,AVX1-FAST
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2
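+; The *-SLOW prefixes check the shuffle + add expansion used when horizontal
+; ops are not considered fast; the *-FAST prefixes (fast-hops) check that
+; haddpd/vhaddpd is still formed.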
define float @pr26491(<4 x float> %a0) {
; SSE2-LABEL: pr26491:
; SSE2-NEXT: addpd %xmm2, %xmm1
; SSE2-NEXT: retq
;
-; SSSE3-LABEL: PR41414:
-; SSSE3: # %bb.0:
-; SSSE3-NEXT: movq %rdi, %xmm2
-; SSSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[1],mem[1]
-; SSSE3-NEXT: subpd {{.*}}(%rip), %xmm2
-; SSSE3-NEXT: haddpd %xmm2, %xmm2
-; SSSE3-NEXT: divpd %xmm2, %xmm1
-; SSSE3-NEXT: divpd %xmm2, %xmm0
-; SSSE3-NEXT: xorpd %xmm2, %xmm2
-; SSSE3-NEXT: addpd %xmm2, %xmm0
-; SSSE3-NEXT: addpd %xmm2, %xmm1
-; SSSE3-NEXT: retq
+; SSSE3-SLOW-LABEL: PR41414:
+; SSSE3-SLOW: # %bb.0:
+; SSSE3-SLOW-NEXT: movq %rdi, %xmm2
+; SSSE3-SLOW-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[1],mem[1]
+; SSSE3-SLOW-NEXT: subpd {{.*}}(%rip), %xmm2
+; SSSE3-SLOW-NEXT: movapd %xmm2, %xmm3
+; SSSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm2[1]
+; SSSE3-SLOW-NEXT: addpd %xmm2, %xmm3
+; SSSE3-SLOW-NEXT: movddup {{.*#+}} xmm2 = xmm3[0,0]
+; SSSE3-SLOW-NEXT: divpd %xmm2, %xmm1
+; SSSE3-SLOW-NEXT: divpd %xmm2, %xmm0
+; SSSE3-SLOW-NEXT: xorpd %xmm2, %xmm2
+; SSSE3-SLOW-NEXT: addpd %xmm2, %xmm0
+; SSSE3-SLOW-NEXT: addpd %xmm2, %xmm1
+; SSSE3-SLOW-NEXT: retq
+;
+; SSSE3-FAST-LABEL: PR41414:
+; SSSE3-FAST: # %bb.0:
+; SSSE3-FAST-NEXT: movq %rdi, %xmm2
+; SSSE3-FAST-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[1],mem[1]
+; SSSE3-FAST-NEXT: subpd {{.*}}(%rip), %xmm2
+; SSSE3-FAST-NEXT: haddpd %xmm2, %xmm2
+; SSSE3-FAST-NEXT: divpd %xmm2, %xmm1
+; SSSE3-FAST-NEXT: divpd %xmm2, %xmm0
+; SSSE3-FAST-NEXT: xorpd %xmm2, %xmm2
+; SSSE3-FAST-NEXT: addpd %xmm2, %xmm0
+; SSSE3-FAST-NEXT: addpd %xmm2, %xmm1
+; SSSE3-FAST-NEXT: retq
+;
+; AVX1-SLOW-LABEL: PR41414:
+; AVX1-SLOW: # %bb.0:
+; AVX1-SLOW-NEXT: vmovq %rdi, %xmm1
+; AVX1-SLOW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1]
+; AVX1-SLOW-NEXT: vsubpd {{.*}}(%rip), %xmm1, %xmm1
+; AVX1-SLOW-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
+; AVX1-SLOW-NEXT: vaddpd %xmm1, %xmm2, %xmm1
+; AVX1-SLOW-NEXT: vmovddup {{.*#+}} xmm1 = xmm1[0,0]
+; AVX1-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm1
+; AVX1-SLOW-NEXT: vdivpd %ymm1, %ymm0, %ymm0
+; AVX1-SLOW-NEXT: vxorpd %xmm1, %xmm1, %xmm1
+; AVX1-SLOW-NEXT: vaddpd %ymm1, %ymm0, %ymm0
+; AVX1-SLOW-NEXT: retq
;
-; AVX1-LABEL: PR41414:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vmovq %rdi, %xmm1
-; AVX1-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1]
-; AVX1-NEXT: vsubpd {{.*}}(%rip), %xmm1, %xmm1
-; AVX1-NEXT: vhaddpd %xmm1, %xmm1, %xmm1
-; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm1
-; AVX1-NEXT: vdivpd %ymm1, %ymm0, %ymm0
-; AVX1-NEXT: vxorpd %xmm1, %xmm1, %xmm1
-; AVX1-NEXT: vaddpd %ymm1, %ymm0, %ymm0
-; AVX1-NEXT: retq
+; AVX1-FAST-LABEL: PR41414:
+; AVX1-FAST: # %bb.0:
+; AVX1-FAST-NEXT: vmovq %rdi, %xmm1
+; AVX1-FAST-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1]
+; AVX1-FAST-NEXT: vsubpd {{.*}}(%rip), %xmm1, %xmm1
+; AVX1-FAST-NEXT: vhaddpd %xmm1, %xmm1, %xmm1
+; AVX1-FAST-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm1
+; AVX1-FAST-NEXT: vdivpd %ymm1, %ymm0, %ymm0
+; AVX1-FAST-NEXT: vxorpd %xmm1, %xmm1, %xmm1
+; AVX1-FAST-NEXT: vaddpd %ymm1, %ymm0, %ymm0
+; AVX1-FAST-NEXT: retq
;
; AVX2-LABEL: PR41414:
; AVX2: # %bb.0:
; AVX2-NEXT: vmovq %rdi, %xmm1
; AVX2-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1]
; AVX2-NEXT: vsubpd {{.*}}(%rip), %xmm1, %xmm1
-; AVX2-NEXT: vhaddpd %xmm1, %xmm1, %xmm1
+; AVX2-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
+; AVX2-NEXT: vaddsd %xmm1, %xmm2, %xmm1
; AVX2-NEXT: vbroadcastsd %xmm1, %ymm1
; AVX2-NEXT: vdivpd %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vxorpd %xmm1, %xmm1, %xmm1