From a1e1f01699d27a5905c917a06a96b57625963d4e Mon Sep 17 00:00:00 2001
From: Andrea Di Biagio
Date: Mon, 24 Nov 2014 12:23:15 +0000
Subject: [PATCH] [X86] Improved target specific combine on VSELECT dag nodes.

This patch teaches function 'transformVSELECTtoBlendVECTOR_SHUFFLE' how to
convert VSELECT dag nodes to shuffles on targets that do not have SSE4.1.
On pre-SSE4.1 targets, we can still perform blend operations using
movss/movsd.

Also, removed a target specific combine that performed a premature lowering
of VSELECT nodes to target specific MOVSS/MOVSD nodes.
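
For example (mirroring the updated 'vsel_float2' test from
test/CodeGen/X86/vector-blend.ll below):

  define <4 x float> @vsel_float2(<4 x float> %v1, <4 x float> %v2) {
  entry:
    %vsel = select <4 x i1> <i1 true, i1 false, i1 false, i1 false>, <4 x float> %v1, <4 x float> %v2
    ret <4 x float> %vsel
  }

The constant condition becomes the shuffle mask <0,5,6,7> (lane i of the
first operand where the condition element is true, lane NumElems + i of the
second operand where it is false). That is the commuted MOVL pattern, which
isShuffleMaskLegal now accepts via isCommutedMOVLMask, so on an SSE2-only
target this compiles to:

    movss %xmm0, %xmm1
    movaps %xmm1, %xmm0

while SSE4.1 and AVX targets still get a single blendps/vblendps with an
immediate control mask.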

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@222647 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/X86/X86ISelLowering.cpp |  97 ++------------
 test/CodeGen/X86/vector-blend.ll   | 195 +++++++++++++++++++++--------
 test/CodeGen/X86/vselect-2.ll      |  53 ++++++--
 3 files changed, 192 insertions(+), 153 deletions(-)

diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index 0347a517444..7f987ccca6f 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -19980,6 +19980,7 @@ X86TargetLowering::isShuffleMaskLegal(const SmallVectorImpl<int> &M,
   return (SVT.getVectorNumElements() == 2 ||
           ShuffleVectorSDNode::isSplatMask(&M[0], VT) ||
           isMOVLMask(M, SVT) ||
+          isCommutedMOVLMask(M, SVT) ||
           isMOVHLPSMask(M, SVT) ||
           isSHUFPMask(M, SVT) ||
           isSHUFPMask(M, SVT, /* Commuted */ true) ||
@@ -22693,7 +22694,7 @@ matchIntegerMINMAX(SDValue Cond, EVT VT, SDValue LHS, SDValue RHS,
 }
 
 static SDValue
-TransformVSELECTtoBlendVECTOR_SHUFFLE(SDNode *N, SelectionDAG &DAG,
+transformVSELECTtoBlendVECTOR_SHUFFLE(SDNode *N, SelectionDAG &DAG,
                                       const X86Subtarget *Subtarget) {
   SDLoc dl(N);
   SDValue Cond = N->getOperand(0);
@@ -22706,18 +22707,6 @@ TransformVSELECTtoBlendVECTOR_SHUFFLE(SDNode *N, SelectionDAG &DAG,
     Cond = CondSrc->getOperand(0);
   }
 
-  MVT VT = N->getSimpleValueType(0);
-  MVT EltVT = VT.getVectorElementType();
-  unsigned NumElems = VT.getVectorNumElements();
-  // There is no blend with immediate in AVX-512.
-  if (VT.is512BitVector())
-    return SDValue();
-
-  if (!Subtarget->hasSSE41() || EltVT == MVT::i8)
-    return SDValue();
-  if (!Subtarget->hasInt256() && VT == MVT::v16i16)
-    return SDValue();
-
   if (!ISD::isBuildVectorOfConstantSDNodes(Cond.getNode()))
     return SDValue();
 
@@ -22731,6 +22720,8 @@ TransformVSELECTtoBlendVECTOR_SHUFFLE(SDNode *N, SelectionDAG &DAG,
   if (!BUILD_VECTORtoBlendMask(cast<BuildVectorSDNode>(Cond), MaskValue))
     return SDValue();
 
+  MVT VT = N->getSimpleValueType(0);
+  unsigned NumElems = VT.getVectorNumElements();
   SmallVector<int, 8> ShuffleMask(NumElems, -1);
   for (unsigned i = 0; i < NumElems; ++i) {
     // Be sure we emit undef where we can.
@@ -22740,6 +22731,9 @@ TransformVSELECTtoBlendVECTOR_SHUFFLE(SDNode *N, SelectionDAG &DAG,
       ShuffleMask[i] = i + NumElems * ((MaskValue >> i) & 1);
   }
 
+  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+  if (!TLI.isShuffleMaskLegal(ShuffleMask, VT))
+    return SDValue();
   return DAG.getVectorShuffle(VT, dl, LHS, RHS, &ShuffleMask[0]);
 }
 
@@ -23179,81 +23173,6 @@ static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG,
     }
   }
 
-  // Try to fold this VSELECT into a MOVSS/MOVSD
-  if (N->getOpcode() == ISD::VSELECT &&
-      Cond.getOpcode() == ISD::BUILD_VECTOR && !DCI.isBeforeLegalize()) {
-    if (VT == MVT::v4i32 || VT == MVT::v4f32 ||
-        (Subtarget->hasSSE2() && (VT == MVT::v2i64 || VT == MVT::v2f64))) {
-      bool CanFold = false;
-      unsigned NumElems = Cond.getNumOperands();
-      SDValue A = LHS;
-      SDValue B = RHS;
-
-      if (isZero(Cond.getOperand(0))) {
-        CanFold = true;
-
-        // fold (vselect <0,-1,-1,-1>, A, B) -> (movss A, B)
-        // fold (vselect <0,-1> -> (movsd A, B)
-        for (unsigned i = 1, e = NumElems; i != e && CanFold; ++i)
-          CanFold = isAllOnes(Cond.getOperand(i));
-      } else if (isAllOnes(Cond.getOperand(0))) {
-        CanFold = true;
-        std::swap(A, B);
-
-        // fold (vselect <-1,0,0,0>, A, B) -> (movss B, A)
-        // fold (vselect <-1,0> -> (movsd B, A)
-        for (unsigned i = 1, e = NumElems; i != e && CanFold; ++i)
-          CanFold = isZero(Cond.getOperand(i));
-      }
-
-      if (CanFold) {
-        if (VT == MVT::v4i32 || VT == MVT::v4f32)
-          return getTargetShuffleNode(X86ISD::MOVSS, DL, VT, A, B, DAG);
-        return getTargetShuffleNode(X86ISD::MOVSD, DL, VT, A, B, DAG);
-      }
-
-      if (Subtarget->hasSSE2() && (VT == MVT::v4i32 || VT == MVT::v4f32)) {
-        // fold (v4i32: vselect <0,0,-1,-1>, A, B) ->
-        //      (v4i32 (bitcast (movsd (v2i64 (bitcast A)),
-        //                             (v2i64 (bitcast B)))))
-        //
-        // fold (v4f32: vselect <0,0,-1,-1>, A, B) ->
-        //      (v4f32 (bitcast (movsd (v2f64 (bitcast A)),
-        //                             (v2f64 (bitcast B)))))
-        //
-        // fold (v4i32: vselect <-1,-1,0,0>, A, B) ->
-        //      (v4i32 (bitcast (movsd (v2i64 (bitcast B)),
-        //                             (v2i64 (bitcast A)))))
-        //
-        // fold (v4f32: vselect <-1,-1,0,0>, A, B) ->
-        //      (v4f32 (bitcast (movsd (v2f64 (bitcast B)),
-        //                             (v2f64 (bitcast A)))))
-
-        CanFold = (isZero(Cond.getOperand(0)) &&
-                   isZero(Cond.getOperand(1)) &&
-                   isAllOnes(Cond.getOperand(2)) &&
-                   isAllOnes(Cond.getOperand(3)));
-
-        if (!CanFold && isAllOnes(Cond.getOperand(0)) &&
-            isAllOnes(Cond.getOperand(1)) &&
-            isZero(Cond.getOperand(2)) &&
-            isZero(Cond.getOperand(3))) {
-          CanFold = true;
-          std::swap(LHS, RHS);
-        }
-
-        if (CanFold) {
-          EVT NVT = (VT == MVT::v4i32) ? MVT::v2i64 : MVT::v2f64;
-          SDValue NewA = DAG.getNode(ISD::BITCAST, DL, NVT, LHS);
-          SDValue NewB = DAG.getNode(ISD::BITCAST, DL, NVT, RHS);
-          SDValue Select = getTargetShuffleNode(X86ISD::MOVSD, DL, NVT, NewA,
-                                                NewB, DAG);
-          return DAG.getNode(ISD::BITCAST, DL, VT, Select);
-        }
-      }
-    }
-  }
-
   // If we know that this node is legal then we know that it is going to be
   // matched by one of the SSE/AVX BLEND instructions. These instructions only
   // depend on the highest bit in each word. Try to use SimplifyDemandedBits
@@ -23338,7 +23257,7 @@ static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG,
   if ((N->getOpcode() == ISD::VSELECT ||
        N->getOpcode() == X86ISD::SHRUNKBLEND) &&
       !DCI.isBeforeLegalize()) {
-    SDValue Shuffle = TransformVSELECTtoBlendVECTOR_SHUFFLE(N, DAG, Subtarget);
+    SDValue Shuffle = transformVSELECTtoBlendVECTOR_SHUFFLE(N, DAG, Subtarget);
     if (Shuffle.getNode())
       return Shuffle;
   }
diff --git a/test/CodeGen/X86/vector-blend.ll b/test/CodeGen/X86/vector-blend.ll
index 0a3ed7e4b32..f23b8288385 100644
--- a/test/CodeGen/X86/vector-blend.ll
+++ b/test/CodeGen/X86/vector-blend.ll
@@ -1,6 +1,6 @@
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+sse2 | FileCheck %s --check-prefix=SSE --check-prefix=SSE2
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+ssse3 | FileCheck %s --check-prefix=SSE --check-prefix=SSSE3
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+sse4.1 | FileCheck %s --check-prefix=SSE --check-prefix=SSE41
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+sse2 | FileCheck %s --check-prefix=SSE2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+ssse3 | FileCheck %s --check-prefix=SSSE3
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+sse4.1 | FileCheck %s --check-prefix=SSE41
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+avx | FileCheck %s --check-prefix=AVX --check-prefix=AVX1
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+avx2 | FileCheck %s --check-prefix=AVX --check-prefix=AVX2
 
@@ -36,15 +36,26 @@ entry:
 }
 
 define <4 x float> @vsel_float2(<4 x float> %v1, <4 x float> %v2) {
-; SSE-LABEL: vsel_float2:
-; SSE:       # BB#0: # %entry
-; SSE-NEXT:    movss %xmm0, %xmm1
-; SSE-NEXT:    movaps %xmm1, %xmm0
-; SSE-NEXT:    retq
+; SSE2-LABEL: vsel_float2:
+; SSE2:       # BB#0: # %entry
+; SSE2-NEXT:    movss %xmm0, %xmm1
+; SSE2-NEXT:    movaps %xmm1, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: vsel_float2:
+; SSSE3:       # BB#0: # %entry
+; SSSE3-NEXT:    movss %xmm0, %xmm1
+; SSSE3-NEXT:    movaps %xmm1, %xmm0
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: vsel_float2:
+; SSE41:       # BB#0: # %entry
+; SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; SSE41-NEXT:    retq
 ;
 ; AVX-LABEL: vsel_float2:
 ; AVX:       # BB#0: # %entry
-; AVX-NEXT:    vmovss %xmm0, %xmm1, %xmm0
+; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
 ; AVX-NEXT:    retq
 entry:
   %vsel = select <4 x i1> <i1 true, i1 false, i1 false, i1 false>, <4 x float> %v1, <4 x float> %v2
@@ -154,15 +165,26 @@ entry:
 }
 
 define <2 x double> @vsel_double(<2 x double> %v1, <2 x double> %v2) {
-; SSE-LABEL: vsel_double:
-; SSE:       # BB#0: # %entry
-; SSE-NEXT:    movsd %xmm0, %xmm1
-; SSE-NEXT:    movaps %xmm1, %xmm0
-; SSE-NEXT:    retq
+; SSE2-LABEL: vsel_double:
+; SSE2:       # BB#0: # %entry
+; SSE2-NEXT:    movsd %xmm0, %xmm1
+; SSE2-NEXT:    movaps %xmm1, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: vsel_double:
+; SSSE3:       # BB#0: # %entry
+; SSSE3-NEXT:    movsd %xmm0, %xmm1
+; SSSE3-NEXT:    movaps %xmm1, %xmm0
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: vsel_double:
+; SSE41:       # BB#0: # %entry
+; SSE41-NEXT:    blendpd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
+; SSE41-NEXT:    retq
 ;
 ; AVX-LABEL: vsel_double:
 ; AVX:       # BB#0: # %entry
-; AVX-NEXT:    vmovsd %xmm0, %xmm1, %xmm0
+; AVX-NEXT:    vblendpd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
 ; AVX-NEXT:    retq
 entry:
   %vsel = select <2 x i1> <i1 true, i1 false>, <2 x double> %v1, <2 x double> %v2
@@ -170,16 +192,32 @@ entry:
 }
 
 define <2 x i64> @vsel_i64(<2 x i64> %v1, <2 x i64> %v2) {
-; SSE-LABEL: vsel_i64:
-; SSE:       # BB#0: # %entry
-; SSE-NEXT:    movsd %xmm0, %xmm1
-; SSE-NEXT:    movaps %xmm1, %xmm0
-; SSE-NEXT:    retq
+; SSE2-LABEL: vsel_i64:
+; SSE2:       # BB#0: # %entry
+; SSE2-NEXT:    movsd %xmm0, %xmm1
+; SSE2-NEXT:    movaps %xmm1, %xmm0
+; SSE2-NEXT:    retq
 ;
-; AVX-LABEL: vsel_i64:
-; AVX:       # BB#0: # %entry
-; AVX-NEXT:    vmovsd %xmm0, %xmm1, %xmm0
-; AVX-NEXT:    retq
+; SSSE3-LABEL: vsel_i64:
+; SSSE3:       # BB#0: # %entry
+; SSSE3-NEXT:    movsd %xmm0, %xmm1
+; SSSE3-NEXT:    movaps %xmm1, %xmm0
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: vsel_i64:
+; SSE41:       # BB#0: # %entry
+; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: vsel_i64:
+; AVX1:       # BB#0: # %entry
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: vsel_i64:
+; AVX2:       # BB#0: # %entry
+; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
+; AVX2-NEXT:    retq
 entry:
   %vsel = select <2 x i1> <i1 true, i1 false>, <2 x i64> %v1, <2 x i64> %v2
   ret <2 x i64> %vsel
@@ -251,13 +289,27 @@ entry:
 ; AVX256 tests:
 
 define <8 x float> @vsel_float8(<8 x float> %v1, <8 x float> %v2) {
-; SSE-LABEL: vsel_float8:
-; SSE:       # BB#0: # %entry
-; SSE-NEXT:    movss %xmm0, %xmm2
-; SSE-NEXT:    movss %xmm1, %xmm3
-; SSE-NEXT:    movaps %xmm2, %xmm0
-; SSE-NEXT:    movaps %xmm3, %xmm1
-; SSE-NEXT:    retq
+; SSE2-LABEL: vsel_float8:
+; SSE2:       # BB#0: # %entry
+; SSE2-NEXT:    movss %xmm0, %xmm2
+; SSE2-NEXT:    movss %xmm1, %xmm3
+; SSE2-NEXT:    movaps %xmm2, %xmm0
+; SSE2-NEXT:    movaps %xmm3, %xmm1
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: vsel_float8:
+; SSSE3:       # BB#0: # %entry
+; SSSE3-NEXT:    movss %xmm0, %xmm2
+; SSSE3-NEXT:    movss %xmm1, %xmm3
+; SSSE3-NEXT:    movaps %xmm2, %xmm0
+; SSSE3-NEXT:    movaps %xmm3, %xmm1
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: vsel_float8:
+; SSE41:       # BB#0: # %entry
+; SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3]
+; SSE41-NEXT:    blendps {{.*#+}} xmm1 = xmm1[0],xmm3[1,2,3]
+; SSE41-NEXT:    retq
 ;
 ; AVX-LABEL: vsel_float8:
 ; AVX:       # BB#0: # %entry
@@ -269,13 +321,27 @@ entry:
 }
 
 define <8 x i32> @vsel_i328(<8 x i32> %v1, <8 x i32> %v2) {
-; SSE-LABEL: vsel_i328:
-; SSE:       # BB#0: # %entry
-; SSE-NEXT:    movss %xmm0, %xmm2
-; SSE-NEXT:    movss %xmm1, %xmm3
-; SSE-NEXT:    movaps %xmm2, %xmm0
-; SSE-NEXT:    movaps %xmm3, %xmm1
-; SSE-NEXT:    retq
+; SSE2-LABEL: vsel_i328:
+; SSE2:       # BB#0: # %entry
+; SSE2-NEXT:    movss %xmm0, %xmm2
+; SSE2-NEXT:    movss %xmm1, %xmm3
+; SSE2-NEXT:    movaps %xmm2, %xmm0
+; SSE2-NEXT:    movaps %xmm3, %xmm1
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: vsel_i328:
+; SSSE3:       # BB#0: # %entry
+; SSSE3-NEXT:    movss %xmm0, %xmm2
+; SSSE3-NEXT:    movss %xmm1, %xmm3
+; SSSE3-NEXT:    movaps %xmm2, %xmm0
+; SSSE3-NEXT:    movaps %xmm3, %xmm1
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: vsel_i328:
+; SSE41:       # BB#0: # %entry
+; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3,4,5,6,7]
+; SSE41-NEXT:    pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3,4,5,6,7]
+; SSE41-NEXT:    retq
 ;
 ; AVX1-LABEL: vsel_i328:
 ; AVX1:       # BB#0: # %entry
@@ -376,13 +442,27 @@ entry:
 }
 
 define <4 x double> @vsel_double4(<4 x double> %v1, <4 x double> %v2) {
-; SSE-LABEL: vsel_double4:
-; SSE:       # BB#0: # %entry
-; SSE-NEXT:    movsd %xmm0, %xmm2
-; SSE-NEXT:    movsd %xmm1, %xmm3
-; SSE-NEXT:    movaps %xmm2, %xmm0
-; SSE-NEXT:    movaps %xmm3, %xmm1
-; SSE-NEXT:    retq
+; SSE2-LABEL: vsel_double4:
+; SSE2:       # BB#0: # %entry
+; SSE2-NEXT:    movsd %xmm0, %xmm2
+; SSE2-NEXT:    movsd %xmm1, %xmm3
+; SSE2-NEXT:    movaps %xmm2, %xmm0
+; SSE2-NEXT:    movaps %xmm3, %xmm1
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: vsel_double4:
+; SSSE3:       # BB#0: # %entry
+; SSSE3-NEXT:    movsd %xmm0, %xmm2
+; SSSE3-NEXT:    movsd %xmm1, %xmm3
+; SSSE3-NEXT:    movaps %xmm2, %xmm0
+; SSSE3-NEXT:    movaps %xmm3, %xmm1
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: vsel_double4:
+; SSE41:       # BB#0: # %entry
+; SSE41-NEXT:    blendpd {{.*#+}} xmm0 = xmm0[0],xmm2[1]
+; SSE41-NEXT:    blendpd {{.*#+}} xmm1 = xmm1[0],xmm3[1]
+; SSE41-NEXT:    retq
 ;
 ; AVX-LABEL: vsel_double4:
 ; AVX:       # BB#0: # %entry
@@ -474,12 +554,25 @@ entry:
 ; If we can figure out a blend has a constant mask, we should emit the
 ; blend instruction with an immediate mask
 define <4 x double> @constant_blendvpd_avx(<4 x double> %xy, <4 x double> %ab) {
-; SSE-LABEL: constant_blendvpd_avx:
-; SSE:       # BB#0: # %entry
-; SSE-NEXT:    movsd %xmm1, %xmm3
-; SSE-NEXT:    movaps %xmm2, %xmm0
-; SSE-NEXT:    movaps %xmm3, %xmm1
-; SSE-NEXT:    retq
+; SSE2-LABEL: constant_blendvpd_avx:
+; SSE2:       # BB#0: # %entry
+; SSE2-NEXT:    movsd %xmm1, %xmm3
+; SSE2-NEXT:    movaps %xmm2, %xmm0
+; SSE2-NEXT:    movaps %xmm3, %xmm1
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: constant_blendvpd_avx:
+; SSSE3:       # BB#0: # %entry
+; SSSE3-NEXT:    movsd %xmm1, %xmm3
+; SSSE3-NEXT:    movaps %xmm2, %xmm0
+; SSSE3-NEXT:    movaps %xmm3, %xmm1
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: constant_blendvpd_avx:
+; SSE41:       # BB#0: # %entry
+; SSE41-NEXT:    blendpd {{.*#+}} xmm1 = xmm1[0],xmm3[1]
+; SSE41-NEXT:    movaps %xmm2, %xmm0
+; SSE41-NEXT:    retq
 ;
 ; AVX-LABEL: constant_blendvpd_avx:
 ; AVX:       # BB#0: # %entry
diff --git a/test/CodeGen/X86/vselect-2.ll b/test/CodeGen/X86/vselect-2.ll
index 50da32c67a3..0991bdacd9c 100644
--- a/test/CodeGen/X86/vselect-2.ll
+++ b/test/CodeGen/X86/vselect-2.ll
@@ -1,33 +1,60 @@
-; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=corei7 -mattr=sse2 | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=SSE2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=SSE41
 
 define <4 x i32> @test1(<4 x i32> %A, <4 x i32> %B) {
+; SSE2-LABEL: test1
+; SSE2:       # BB#0:
+; SSE2-NEXT:    movsd %xmm0, %xmm1
+; SSE2-NEXT:    movaps %xmm1, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: test1
+; SSE41:       # BB#0:
+; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
+; SSE41-NEXT:    retq
   %select = select <4 x i1> <i1 true, i1 true, i1 false, i1 false>, <4 x i32> %A, <4 x i32> %B
   ret <4 x i32> %select
 }
-; CHECK-LABEL: test1
-; CHECK: movsd
-; CHECK: ret
 
 define <4 x i32> @test2(<4 x i32> %A, <4 x i32> %B) {
+; SSE2-LABEL: test2
+; SSE2:       # BB#0:
+; SSE2-NEXT:    movsd %xmm1, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: test2
+; SSE41:       # BB#0:
+; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
+; SSE41-NEXT:    retq
   %select = select <4 x i1> <i1 false, i1 false, i1 true, i1 true>, <4 x i32> %A, <4 x i32> %B
   ret <4 x i32> %select
 }
-; CHECK-LABEL: test2
-; CHECK: movsd
-; CHECK-NEXT: ret
 
 define <4 x float> @test3(<4 x float> %A, <4 x float> %B) {
+; SSE2-LABEL: test3
+; SSE2:       # BB#0:
+; SSE2-NEXT:    movsd %xmm0, %xmm1
+; SSE2-NEXT:    movaps %xmm1, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: test3
+; SSE41:       # BB#0:
+; SSE41-NEXT:    blendpd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
+; SSE41-NEXT:    retq
   %select = select <4 x i1> <i1 true, i1 true, i1 false, i1 false>, <4 x float> %A, <4 x float> %B
   ret <4 x float> %select
 }
-; CHECK-LABEL: test3
-; CHECK: movsd
-; CHECK: ret
 
 define <4 x float> @test4(<4 x float> %A, <4 x float> %B) {
+; SSE2-LABEL: test4
+; SSE2:       # BB#0:
+; SSE2-NEXT:    movsd %xmm1, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: test4
+; SSE41:       # BB#0:
+; SSE41-NEXT:    blendpd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; SSE41-NEXT:    retq
   %select = select <4 x i1> <i1 false, i1 false, i1 true, i1 true>, <4 x float> %A, <4 x float> %B
   ret <4 x float> %select
}
-; CHECK-LABEL: test4
-; CHECK: movsd
-; CHECK-NEXT: ret
-- 
2.40.0