From 10e92dd797d04724c6d7db4e3266316eb1f00f70 Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Thu, 29 Aug 2019 05:48:48 +0000 Subject: [PATCH] [X86] Add a DAG combine to combine INSERTPS and VBROADCAST of a scalar load. Remove corresponding isel patterns. We had an isel pattern to perform this, but it's better to do it in DAG combine as a simplification. This also fixes the lack of patterns for AVX512 targets. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@370294 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Target/X86/X86ISelLowering.cpp | 79 +++++++++++++++++------------- lib/Target/X86/X86InstrSSE.td | 13 ----- test/CodeGen/X86/sse41.ll | 20 +++----- 3 files changed, 53 insertions(+), 59 deletions(-) diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp index 05de73de0de..6453d3ad0bc 100644 --- a/lib/Target/X86/X86ISelLowering.cpp +++ b/lib/Target/X86/X86ISelLowering.cpp @@ -33550,46 +33550,57 @@ static SDValue combineTargetShuffle(SDValue N, SelectionDAG &DAG, // Attempt to merge insertps Op0 with an inner target shuffle node. SmallVector TargetMask0; SmallVector Ops0; - if (!setTargetShuffleZeroElements(Op0, TargetMask0, Ops0)) - return SDValue(); + if (setTargetShuffleZeroElements(Op0, TargetMask0, Ops0)) { + bool Updated = false; + bool UseInput00 = false; + bool UseInput01 = false; + for (int i = 0; i != 4; ++i) { + int M = TargetMask0[i]; + if ((InsertPSMask & (1u << i)) || (i == (int)DstIdx)) { + // No change if element is already zero or the inserted element. + continue; + } else if (isUndefOrZero(M)) { + // If the target mask is undef/zero then we must zero the element. + InsertPSMask |= (1u << i); + Updated = true; + continue; + } - bool Updated = false; - bool UseInput00 = false; - bool UseInput01 = false; - for (int i = 0; i != 4; ++i) { - int M = TargetMask0[i]; - if ((InsertPSMask & (1u << i)) || (i == (int)DstIdx)) { - // No change if element is already zero or the inserted element. 
- continue; - } else if (isUndefOrZero(M)) { - // If the target mask is undef/zero then we must zero the element. - InsertPSMask |= (1u << i); - Updated = true; - continue; - } + // The input vector element must be inline. + if (M != i && M != (i + 4)) + return SDValue(); - // The input vector element must be inline. - if (M != i && M != (i + 4)) - return SDValue(); + // Determine which inputs of the target shuffle we're using. + UseInput00 |= (0 <= M && M < 4); + UseInput01 |= (4 <= M); + } - // Determine which inputs of the target shuffle we're using. - UseInput00 |= (0 <= M && M < 4); - UseInput01 |= (4 <= M); - } + // If we're not using both inputs of the target shuffle then use the + // referenced input directly. + if (UseInput00 && !UseInput01) { + Updated = true; + Op0 = Ops0[0]; + } else if (!UseInput00 && UseInput01) { + Updated = true; + Op0 = Ops0[1]; + } - // If we're not using both inputs of the target shuffle then use the - // referenced input directly. - if (UseInput00 && !UseInput01) { - Updated = true; - Op0 = Ops0[0]; - } else if (!UseInput00 && UseInput01) { - Updated = true; - Op0 = Ops0[1]; + if (Updated) + return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, Op1, + DAG.getConstant(InsertPSMask, DL, MVT::i8)); } - if (Updated) - return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, Op1, - DAG.getConstant(InsertPSMask, DL, MVT::i8)); + // If we're inserting an element from a vbroadcast of a load, fold the + // load into the X86insertps instruction. We need to convert the scalar + // load to a vector and clear the source lane of the INSERTPS control. 
+ if (Op1.getOpcode() == X86ISD::VBROADCAST && Op1.hasOneUse() && + Op1.getOperand(0).hasOneUse() && + !Op1.getOperand(0).getValueType().isVector() && + ISD::isNormalLoad(Op1.getOperand(0).getNode())) + return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, + DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT, + Op1.getOperand(0)), + DAG.getConstant(InsertPSMask & 0x3f, DL, MVT::i8)); return SDValue(); } diff --git a/lib/Target/X86/X86InstrSSE.td b/lib/Target/X86/X86InstrSSE.td index 3d0e749f745..34ef4c2d81f 100644 --- a/lib/Target/X86/X86InstrSSE.td +++ b/lib/Target/X86/X86InstrSSE.td @@ -5323,19 +5323,6 @@ let ExeDomain = SSEPackedSingle in { defm INSERTPS : SS41I_insertf32<0x21, "insertps", 1>; } -let Predicates = [UseAVX] in { - // If we're inserting an element from a vbroadcast of a load, fold the - // load into the X86insertps instruction. - // FIXME: Why are these here? This looks like a demanded bits issue. - // FIXME: Missing AVX512 equivalents. - def : Pat<(v4f32 (X86insertps (v4f32 VR128:$src1), - (X86VBroadcast (loadf32 addr:$src2)), imm:$src3)), - (VINSERTPSrm VR128:$src1, addr:$src2, imm:$src3)>; - def : Pat<(v4f32 (X86insertps (v4f32 VR128:$src1), - (X86VBroadcast (v4f32 (nonvolatile_load addr:$src2))), imm:$src3)), - (VINSERTPSrm VR128:$src1, addr:$src2, imm:$src3)>; -} - //===----------------------------------------------------------------------===// // SSE4.1 - Round Instructions //===----------------------------------------------------------------------===// diff --git a/test/CodeGen/X86/sse41.ll b/test/CodeGen/X86/sse41.ll index d55c1d23a1d..4e80f8f92d8 100644 --- a/test/CodeGen/X86/sse41.ll +++ b/test/CodeGen/X86/sse41.ll @@ -1559,9 +1559,8 @@ define <4 x float> @insertps_from_broadcast_loadf32(<4 x float> %a, float* nocap ; X86-AVX512: ## %bb.0: ; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x08] ; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %ecx ## encoding: [0x8b,0x4c,0x24,0x04] -; X86-AVX512-NEXT: vbroadcastss (%ecx,%eax,4), 
%xmm1 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x18,0x0c,0x81] -; X86-AVX512-NEXT: vinsertps $48, %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x21,0xc1,0x30] -; X86-AVX512-NEXT: ## xmm0 = xmm0[0,1,2],xmm1[0] +; X86-AVX512-NEXT: vinsertps $48, (%ecx,%eax,4), %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x21,0x04,0x81,0x30] +; X86-AVX512-NEXT: ## xmm0 = xmm0[0,1,2],mem[0] ; X86-AVX512-NEXT: retl ## encoding: [0xc3] ; ; X64-SSE-LABEL: insertps_from_broadcast_loadf32: @@ -1578,9 +1577,8 @@ define <4 x float> @insertps_from_broadcast_loadf32(<4 x float> %a, float* nocap ; ; X64-AVX512-LABEL: insertps_from_broadcast_loadf32: ; X64-AVX512: ## %bb.0: -; X64-AVX512-NEXT: vbroadcastss (%rdi,%rsi,4), %xmm1 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x18,0x0c,0xb7] -; X64-AVX512-NEXT: vinsertps $48, %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x21,0xc1,0x30] -; X64-AVX512-NEXT: ## xmm0 = xmm0[0,1,2],xmm1[0] +; X64-AVX512-NEXT: vinsertps $48, (%rdi,%rsi,4), %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x21,0x04,0xb7,0x30] +; X64-AVX512-NEXT: ## xmm0 = xmm0[0,1,2],mem[0] ; X64-AVX512-NEXT: retq ## encoding: [0xc3] %1 = getelementptr inbounds float, float* %fb, i64 %index %2 = load float, float* %1, align 4 @@ -1611,9 +1609,8 @@ define <4 x float> @insertps_from_broadcast_loadv4f32(<4 x float> %a, <4 x float ; X86-AVX512-LABEL: insertps_from_broadcast_loadv4f32: ; X86-AVX512: ## %bb.0: ; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04] -; X86-AVX512-NEXT: vbroadcastss (%eax), %xmm1 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x18,0x08] -; X86-AVX512-NEXT: vinsertps $48, %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x21,0xc1,0x30] -; X86-AVX512-NEXT: ## xmm0 = xmm0[0,1,2],xmm1[0] +; X86-AVX512-NEXT: vinsertps $48, (%eax), %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: 
[0xc4,0xe3,0x79,0x21,0x00,0x30] +; X86-AVX512-NEXT: ## xmm0 = xmm0[0,1,2],mem[0] ; X86-AVX512-NEXT: retl ## encoding: [0xc3] ; ; X64-SSE-LABEL: insertps_from_broadcast_loadv4f32: @@ -1631,9 +1628,8 @@ define <4 x float> @insertps_from_broadcast_loadv4f32(<4 x float> %a, <4 x float ; ; X64-AVX512-LABEL: insertps_from_broadcast_loadv4f32: ; X64-AVX512: ## %bb.0: -; X64-AVX512-NEXT: vbroadcastss (%rdi), %xmm1 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x18,0x0f] -; X64-AVX512-NEXT: vinsertps $48, %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x21,0xc1,0x30] -; X64-AVX512-NEXT: ## xmm0 = xmm0[0,1,2],xmm1[0] +; X64-AVX512-NEXT: vinsertps $48, (%rdi), %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x21,0x07,0x30] +; X64-AVX512-NEXT: ## xmm0 = xmm0[0,1,2],mem[0] ; X64-AVX512-NEXT: retq ## encoding: [0xc3] %1 = load <4 x float>, <4 x float>* %b, align 4 %2 = extractelement <4 x float> %1, i32 0 -- 2.40.0