From: Craig Topper Date: Fri, 11 Oct 2019 04:16:49 +0000 (+0000) Subject: [X86] Add a DAG combine to turn v16i16->v16i8 VTRUNCUS+store into a saturating truncating store. X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=cc3c27eb634556d2e09e8de9d30cdba1016b7d90;p=llvm [X86] Add a DAG combine to turn v16i16->v16i8 VTRUNCUS+store into a saturating truncating store. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@374509 91177308-0d34-0410-b5e6-96231b3b80d8 --- diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp index 275e876644c..0e119415303 100644 --- a/lib/Target/X86/X86ISelLowering.cpp +++ b/lib/Target/X86/X86ISelLowering.cpp @@ -40448,6 +40448,19 @@ static SDValue combineStore(SDNode *N, SelectionDAG &DAG, MVT::v16i8, St->getMemOperand()); } + // Try to fold a vpmovuswb 256->128 into a truncating store. + // FIXME: Generalize this to other types. + // FIXME: Do the same for signed saturation. + if (!St->isTruncatingStore() && VT == MVT::v16i8 && + St->getValue().getOpcode() == X86ISD::VTRUNCUS && + St->getValue().getOperand(0).getValueType() == MVT::v16i16 && + TLI.isTruncStoreLegal(MVT::v16i16, MVT::v16i8) && + St->getValue().hasOneUse()) { + return EmitTruncSStore(false /* Unsigned saturation */, St->getChain(), + dl, St->getValue().getOperand(0), St->getBasePtr(), + MVT::v16i8, St->getMemOperand(), DAG); + } + // Optimize trunc store (of multiple scalars) to shuffle and store. // First, pack all of the elements in one place. Next, store to memory // in fewer chunks. 
diff --git a/test/CodeGen/X86/min-legal-vector-width.ll b/test/CodeGen/X86/min-legal-vector-width.ll index 95e6d3e7ae7..eb90a2ae634 100644 --- a/test/CodeGen/X86/min-legal-vector-width.ll +++ b/test/CodeGen/X86/min-legal-vector-width.ll @@ -1104,8 +1104,7 @@ define void @trunc_packus_v16i32_v16i8_store(<16 x i32>* %p, <16 x i8>* %q) "min ; CHECK-NEXT: vmovdqa (%rdi), %ymm0 ; CHECK-NEXT: vpackusdw 32(%rdi), %ymm0, %ymm0 ; CHECK-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] -; CHECK-NEXT: vpmovuswb %ymm0, %xmm0 -; CHECK-NEXT: vmovdqa %xmm0, (%rsi) +; CHECK-NEXT: vpmovuswb %ymm0, (%rsi) ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %a = load <16 x i32>, <16 x i32>* %p