From: Simon Pilgrim
Date: Mon, 17 Dec 2018 22:09:47 +0000 (+0000)
Subject: [X86][SSE] Improve immediate vector shift known bits handling.
X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=aa3674df5b5e7550427cb30fdc8fa6e2ddb5d86b;p=llvm

[X86][SSE] Improve immediate vector shift known bits handling.

Convert VSRAI to VSRLI if the sign bit is known zero and improve KnownBits
output for all shift instructions.

Fixes the poor codegen comments in D55768.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@349407 91177308-0d34-0410-b5e6-96231b3b80d8
---

diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index 92542198677..cdd4e1dd9a3 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -32379,6 +32379,7 @@ bool X86TargetLowering::SimplifyDemandedBitsForTargetNode(
 SDValue Op, const APInt &OriginalDemandedBits,
 const APInt &OriginalDemandedElts, KnownBits &Known,
 TargetLoweringOpt &TLO, unsigned Depth) const {
+ EVT VT = Op.getValueType();
 unsigned BitWidth = OriginalDemandedBits.getBitWidth();
 unsigned Opc = Op.getOpcode();
 switch(Opc) {
@@ -32401,12 +32402,19 @@ bool X86TargetLowering::SimplifyDemandedBitsForTargetNode(
 if (ShiftImm->getAPIntValue().uge(BitWidth))
 break;
- KnownBits KnownOp;
 unsigned ShAmt = ShiftImm->getZExtValue();
 APInt DemandedMask = OriginalDemandedBits.lshr(ShAmt);
+
 if (SimplifyDemandedBits(Op.getOperand(0), DemandedMask,
- OriginalDemandedElts, KnownOp, TLO, Depth + 1))
+ OriginalDemandedElts, Known, TLO, Depth + 1))
 return true;
+
+ assert(!Known.hasConflict() && "Bits known to be one AND zero?");
+ Known.Zero <<= ShAmt;
+ Known.One <<= ShAmt;
+
+ // Low bits known zero.
+ Known.Zero.setLowBits(ShAmt);
 }
 break;
 }
@@ -32415,22 +32423,30 @@ bool X86TargetLowering::SimplifyDemandedBitsForTargetNode(
 if (ShiftImm->getAPIntValue().uge(BitWidth))
 break;
- KnownBits KnownOp;
 unsigned ShAmt = ShiftImm->getZExtValue();
 APInt DemandedMask = OriginalDemandedBits << ShAmt;
 if (SimplifyDemandedBits(Op.getOperand(0), DemandedMask,
- OriginalDemandedElts, KnownOp, TLO, Depth + 1))
+ OriginalDemandedElts, Known, TLO, Depth + 1))
 return true;
+
+ assert(!Known.hasConflict() && "Bits known to be one AND zero?");
+ Known.Zero.lshrInPlace(ShAmt);
+ Known.One.lshrInPlace(ShAmt);
+
+ // High bits known zero.
+ Known.Zero.setHighBits(ShAmt);
 }
 break;
 }
 case X86ISD::VSRAI: {
- if (auto *ShiftImm = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
+ SDValue Op0 = Op.getOperand(0);
+ SDValue Op1 = Op.getOperand(1);
+
+ if (auto *ShiftImm = dyn_cast<ConstantSDNode>(Op1)) {
 if (ShiftImm->getAPIntValue().uge(BitWidth))
 break;
- KnownBits KnownOp;
 unsigned ShAmt = ShiftImm->getZExtValue();
 APInt DemandedMask = OriginalDemandedBits << ShAmt;
@@ -32439,15 +32455,29 @@ bool X86TargetLowering::SimplifyDemandedBitsForTargetNode(
 if (OriginalDemandedBits.countLeadingZeros() < ShAmt)
 DemandedMask.setSignBit();
- if (SimplifyDemandedBits(Op.getOperand(0), DemandedMask,
- OriginalDemandedElts, KnownOp, TLO, Depth + 1))
+ if (SimplifyDemandedBits(Op0, DemandedMask, OriginalDemandedElts, Known,
+ TLO, Depth + 1))
 return true;
+
+ assert(!Known.hasConflict() && "Bits known to be one AND zero?");
+ Known.Zero.lshrInPlace(ShAmt);
+ Known.One.lshrInPlace(ShAmt);
+
+ // If the input sign bit is known to be zero, or if none of the top bits
+ // are demanded, turn this into an unsigned shift right.
+ if (Known.Zero[BitWidth - ShAmt - 1] || + OriginalDemandedBits.countLeadingZeros() >= ShAmt) + return TLO.CombineTo( + Op, TLO.DAG.getNode(X86ISD::VSRLI, SDLoc(Op), VT, Op0, Op1)); + + // High bits are known one. + if (Known.One[BitWidth - ShAmt - 1]) + Known.One.setHighBits(ShAmt); } break; } case X86ISD::MOVMSK: { SDValue Src = Op.getOperand(0); - MVT VT = Op.getSimpleValueType(); MVT SrcVT = Src.getSimpleValueType(); unsigned SrcBits = SrcVT.getScalarSizeInBits(); unsigned NumElts = SrcVT.getVectorNumElements(); diff --git a/test/CodeGen/X86/combine-sdiv.ll b/test/CodeGen/X86/combine-sdiv.ll index a82563cc826..6fff298dbb3 100644 --- a/test/CodeGen/X86/combine-sdiv.ll +++ b/test/CodeGen/X86/combine-sdiv.ll @@ -2986,53 +2986,47 @@ define <16 x i8> @pr38658(<16 x i8> %x) { ; SSE2-NEXT: psraw $8, %xmm2 ; SSE2-NEXT: pmullw {{.*}}(%rip), %xmm2 ; SSE2-NEXT: psrlw $8, %xmm2 +; SSE2-NEXT: pxor %xmm3, %xmm3 ; SSE2-NEXT: pxor %xmm1, %xmm1 ; SSE2-NEXT: packuswb %xmm2, %xmm1 ; SSE2-NEXT: paddb %xmm0, %xmm1 ; SSE2-NEXT: movdqa %xmm1, %xmm0 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] -; SSE2-NEXT: psraw $8, %xmm0 -; SSE2-NEXT: pmullw {{.*}}(%rip), %xmm0 -; SSE2-NEXT: psrlw $8, %xmm0 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3],xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7] ; SSE2-NEXT: movdqa %xmm1, %xmm2 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] +; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15] ; SSE2-NEXT: psraw $8, %xmm2 -; SSE2-NEXT: psllw $8, %xmm2 +; SSE2-NEXT: pmullw {{.*}}(%rip), %xmm2 ; SSE2-NEXT: psrlw $8, %xmm2 -; SSE2-NEXT: packuswb %xmm0, %xmm2 +; SSE2-NEXT: packuswb %xmm2, %xmm0 ; SSE2-NEXT: psrlw $7, %xmm1 ; SSE2-NEXT: pand {{.*}}(%rip), %xmm1 -; SSE2-NEXT: paddb %xmm2, %xmm1 +; SSE2-NEXT: paddb %xmm0, %xmm1 ; SSE2-NEXT: movdqa %xmm1, %xmm0 ; SSE2-NEXT: retq ; ; SSE41-LABEL: pr38658: ; SSE41: # %bb.0: -; SSE41-NEXT: movdqa %xmm0, %xmm1 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] -; SSE41-NEXT: pmovsxbw %xmm0, %xmm2 +; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; SSE41-NEXT: pmovsxbw %xmm1, %xmm2 ; SSE41-NEXT: pmullw {{.*}}(%rip), %xmm2 ; SSE41-NEXT: psrlw $8, %xmm2 -; SSE41-NEXT: pxor %xmm0, %xmm0 -; SSE41-NEXT: packuswb %xmm2, %xmm0 -; SSE41-NEXT: paddb %xmm1, %xmm0 -; SSE41-NEXT: movdqa %xmm0, %xmm1 -; SSE41-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] -; SSE41-NEXT: psraw $8, %xmm1 -; SSE41-NEXT: movdqa %xmm1, %xmm2 -; SSE41-NEXT: psllw $6, %xmm2 -; SSE41-NEXT: psllw $8, %xmm1 -; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6],xmm2[7] -; SSE41-NEXT: psrlw $8, %xmm1 +; SSE41-NEXT: pxor %xmm1, %xmm1 +; SSE41-NEXT: packuswb %xmm2, %xmm1 +; SSE41-NEXT: paddb %xmm0, %xmm1 +; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] +; SSE41-NEXT: psraw $8, %xmm0 ; SSE41-NEXT: movdqa %xmm0, %xmm2 -; SSE41-NEXT: punpcklbw {{.*#+}} 
xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] -; SSE41-NEXT: psraw $8, %xmm2 -; SSE41-NEXT: psllw $8, %xmm2 -; SSE41-NEXT: psrlw $8, %xmm2 -; SSE41-NEXT: packuswb %xmm1, %xmm2 -; SSE41-NEXT: psrlw $7, %xmm0 -; SSE41-NEXT: pand {{.*}}(%rip), %xmm0 -; SSE41-NEXT: paddb %xmm2, %xmm0 +; SSE41-NEXT: psllw $6, %xmm2 +; SSE41-NEXT: psllw $8, %xmm0 +; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm2[7] +; SSE41-NEXT: psrlw $8, %xmm0 +; SSE41-NEXT: pmovzxbw {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero +; SSE41-NEXT: packuswb %xmm0, %xmm2 +; SSE41-NEXT: psrlw $7, %xmm1 +; SSE41-NEXT: pand {{.*}}(%rip), %xmm1 +; SSE41-NEXT: paddb %xmm2, %xmm1 +; SSE41-NEXT: movdqa %xmm1, %xmm0 ; SSE41-NEXT: retq ; ; AVX1-LABEL: pr38658: @@ -3050,10 +3044,7 @@ define <16 x i8> @pr38658(<16 x i8> %x) { ; AVX1-NEXT: vpsllw $8, %xmm1, %xmm1 ; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6],xmm2[7] ; AVX1-NEXT: vpsrlw $8, %xmm1, %xmm1 -; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; AVX1-NEXT: vpsraw $8, %xmm2, %xmm2 -; AVX1-NEXT: vpsllw $8, %xmm2, %xmm2 -; AVX1-NEXT: vpsrlw $8, %xmm2, %xmm2 +; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; AVX1-NEXT: vpackuswb %xmm1, %xmm2, %xmm1 ; AVX1-NEXT: vpsrlw $7, %xmm0, %xmm0 ; AVX1-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 diff --git a/test/CodeGen/X86/combine-srem.ll b/test/CodeGen/X86/combine-srem.ll index 36a151e6cad..0ed2f63282a 100644 --- a/test/CodeGen/X86/combine-srem.ll +++ b/test/CodeGen/X86/combine-srem.ll @@ -226,7 +226,7 @@ define <4 x i32> @combine_vec_srem_by_pow2a_neg(<4 x i32> %x) { ; SSE-NEXT: psrad $31, %xmm1 ; SSE-NEXT: psrld $30, %xmm1 ; SSE-NEXT: paddd %xmm0, %xmm1 -; SSE-NEXT: psrad $2, %xmm1 +; SSE-NEXT: psrld $2, %xmm1 ; SSE-NEXT: pxor %xmm2, %xmm2 ; SSE-NEXT: psubd %xmm1, %xmm2 ; SSE-NEXT: pslld $2, %xmm2 @@ -238,7 +238,7 @@ define <4 x i32> @combine_vec_srem_by_pow2a_neg(<4 x i32> %x) { ; AVX-NEXT: vpsrad $31, %xmm0, %xmm1 ; AVX-NEXT: vpsrld $30, %xmm1, %xmm1 ; AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm1 -; AVX-NEXT: vpsrad $2, %xmm1, %xmm1 +; AVX-NEXT: vpsrld $2, %xmm1, %xmm1 ; AVX-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX-NEXT: vpsubd %xmm1, %xmm2, %xmm1 ; AVX-NEXT: vpslld $2, %xmm1, %xmm1 diff --git a/test/CodeGen/X86/promote-vec3.ll b/test/CodeGen/X86/promote-vec3.ll index 29832a23104..db337b2f557 100644 --- a/test/CodeGen/X86/promote-vec3.ll +++ b/test/CodeGen/X86/promote-vec3.ll @@ -78,11 +78,9 @@ define <3 x i16> @sext_i8(<3 x i8>) { ; SSE3-NEXT: pinsrw $2, %eax, %xmm0 ; SSE3-NEXT: psllw $8, %xmm0 ; SSE3-NEXT: psraw $8, %xmm0 -; SSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] -; SSE3-NEXT: psrad $16, %xmm0 -; SSE3-NEXT: movd %xmm0, %eax -; SSE3-NEXT: pextrw $2, %xmm0, %edx -; SSE3-NEXT: pextrw $4, %xmm0, %ecx +; SSE3-NEXT: pextrw $0, %xmm0, %eax +; SSE3-NEXT: pextrw $1, %xmm0, %edx +; SSE3-NEXT: pextrw $2, %xmm0, %ecx ; SSE3-NEXT: # kill: def $ax killed $ax killed $eax ; SSE3-NEXT: # kill: def $dx killed $dx killed $edx ; SSE3-NEXT: # kill: def $cx killed $cx killed $ecx diff --git a/test/CodeGen/X86/vector-shift-ashr-sub128-widen.ll b/test/CodeGen/X86/vector-shift-ashr-sub128-widen.ll index e9bccd2c994..e910c9c74da 100644 --- a/test/CodeGen/X86/vector-shift-ashr-sub128-widen.ll +++ b/test/CodeGen/X86/vector-shift-ashr-sub128-widen.ll @@ 
-1975,24 +1975,20 @@ define <2 x i16> @constant_shift_v2i16(<2 x i16> %a) nounwind { define <8 x i8> @constant_shift_v8i8(<8 x i8> %a) nounwind { ; SSE-LABEL: constant_shift_v8i8: ; SSE: # %bb.0: -; SSE-NEXT: movdqa %xmm0, %xmm1 -; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] -; SSE-NEXT: psraw $8, %xmm1 -; SSE-NEXT: psllw $8, %xmm1 -; SSE-NEXT: psrlw $8, %xmm1 +; SSE-NEXT: pxor %xmm1, %xmm1 +; SSE-NEXT: movdqa %xmm0, %xmm2 +; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15] ; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSE-NEXT: psraw $8, %xmm0 ; SSE-NEXT: pmullw {{.*}}(%rip), %xmm0 ; SSE-NEXT: psrlw $8, %xmm0 -; SSE-NEXT: packuswb %xmm1, %xmm0 +; SSE-NEXT: packuswb %xmm2, %xmm0 ; SSE-NEXT: retq ; ; AVX1-LABEL: constant_shift_v8i8: ; AVX1: # %bb.0: -; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; AVX1-NEXT: vpsraw $8, %xmm1, %xmm1 -; AVX1-NEXT: vpsllw $8, %xmm1, %xmm1 -; AVX1-NEXT: vpsrlw $8, %xmm1, %xmm1 +; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] ; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; AVX1-NEXT: vpsraw $8, %xmm0, %xmm0 ; AVX1-NEXT: vpmullw {{.*}}(%rip), %xmm0, %xmm0 @@ -2051,16 +2047,14 @@ define <8 x i8> @constant_shift_v8i8(<8 x i8> %a) nounwind { ; ; X32-SSE-LABEL: constant_shift_v8i8: ; X32-SSE: # %bb.0: -; X32-SSE-NEXT: movdqa %xmm0, %xmm1 -; X32-SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] -; X32-SSE-NEXT: psraw $8, %xmm1 -; X32-SSE-NEXT: psllw $8, %xmm1 -; X32-SSE-NEXT: psrlw $8, %xmm1 +; X32-SSE-NEXT: pxor %xmm1, %xmm1 +; X32-SSE-NEXT: movdqa %xmm0, %xmm2 +; X32-SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15] ; X32-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; X32-SSE-NEXT: psraw $8, %xmm0 ; X32-SSE-NEXT: pmullw {{\.LCPI.*}}, %xmm0 ; X32-SSE-NEXT: psrlw $8, %xmm0 -; X32-SSE-NEXT: packuswb %xmm1, %xmm0 +; X32-SSE-NEXT: packuswb %xmm2, %xmm0 ; X32-SSE-NEXT: retl %shift = ashr <8 x i8> %a, ret <8 x i8> %shift @@ -2069,24 +2063,20 @@ define <8 x i8> @constant_shift_v8i8(<8 x i8> %a) nounwind { define <4 x i8> @constant_shift_v4i8(<4 x i8> %a) nounwind { ; SSE-LABEL: constant_shift_v4i8: ; SSE: # %bb.0: -; SSE-NEXT: movdqa %xmm0, %xmm1 -; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] -; SSE-NEXT: psraw $8, %xmm1 -; SSE-NEXT: psllw $8, %xmm1 -; SSE-NEXT: psrlw $8, %xmm1 +; SSE-NEXT: pxor %xmm1, %xmm1 +; SSE-NEXT: movdqa %xmm0, %xmm2 +; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15] ; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSE-NEXT: psraw $8, %xmm0 ; SSE-NEXT: pmullw 
{{.*}}(%rip), %xmm0 ; SSE-NEXT: psrlw $8, %xmm0 -; SSE-NEXT: packuswb %xmm1, %xmm0 +; SSE-NEXT: packuswb %xmm2, %xmm0 ; SSE-NEXT: retq ; ; AVX1-LABEL: constant_shift_v4i8: ; AVX1: # %bb.0: -; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; AVX1-NEXT: vpsraw $8, %xmm1, %xmm1 -; AVX1-NEXT: vpsllw $8, %xmm1, %xmm1 -; AVX1-NEXT: vpsrlw $8, %xmm1, %xmm1 +; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] ; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; AVX1-NEXT: vpsraw $8, %xmm0, %xmm0 ; AVX1-NEXT: vpmullw {{.*}}(%rip), %xmm0, %xmm0 @@ -2145,16 +2135,14 @@ define <4 x i8> @constant_shift_v4i8(<4 x i8> %a) nounwind { ; ; X32-SSE-LABEL: constant_shift_v4i8: ; X32-SSE: # %bb.0: -; X32-SSE-NEXT: movdqa %xmm0, %xmm1 -; X32-SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] -; X32-SSE-NEXT: psraw $8, %xmm1 -; X32-SSE-NEXT: psllw $8, %xmm1 -; X32-SSE-NEXT: psrlw $8, %xmm1 +; X32-SSE-NEXT: pxor %xmm1, %xmm1 +; X32-SSE-NEXT: movdqa %xmm0, %xmm2 +; X32-SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15] ; X32-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; X32-SSE-NEXT: psraw $8, %xmm0 ; X32-SSE-NEXT: pmullw {{\.LCPI.*}}, %xmm0 ; X32-SSE-NEXT: psrlw $8, %xmm0 -; X32-SSE-NEXT: packuswb %xmm1, %xmm0 +; X32-SSE-NEXT: packuswb %xmm2, %xmm0 ; X32-SSE-NEXT: retl %shift = ashr <4 x i8> %a, ret <4 x i8> %shift @@ -2163,24 +2151,20 @@ define <4 x i8> @constant_shift_v4i8(<4 x i8> %a) nounwind { define <2 x i8> @constant_shift_v2i8(<2 x i8> %a) nounwind { ; SSE-LABEL: constant_shift_v2i8: ; SSE: # %bb.0: -; SSE-NEXT: movdqa %xmm0, %xmm1 -; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] -; SSE-NEXT: psraw $8, %xmm1 -; SSE-NEXT: psllw $8, %xmm1 -; SSE-NEXT: psrlw $8, %xmm1 +; SSE-NEXT: pxor %xmm1, %xmm1 +; SSE-NEXT: movdqa %xmm0, %xmm2 +; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15] ; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSE-NEXT: psraw $8, %xmm0 ; SSE-NEXT: pmullw {{.*}}(%rip), %xmm0 ; SSE-NEXT: psrlw $8, %xmm0 -; SSE-NEXT: packuswb %xmm1, %xmm0 +; SSE-NEXT: packuswb %xmm2, %xmm0 ; SSE-NEXT: retq ; ; AVX1-LABEL: constant_shift_v2i8: ; AVX1: # %bb.0: -; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; AVX1-NEXT: vpsraw $8, %xmm1, %xmm1 -; AVX1-NEXT: vpsllw $8, %xmm1, %xmm1 -; AVX1-NEXT: vpsrlw $8, %xmm1, %xmm1 +; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] ; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; AVX1-NEXT: vpsraw $8, %xmm0, %xmm0 ; AVX1-NEXT: vpmullw {{.*}}(%rip), %xmm0, %xmm0 @@ -2239,16 +2223,14 @@ define <2 x i8> @constant_shift_v2i8(<2 x i8> %a) nounwind { 
; ; X32-SSE-LABEL: constant_shift_v2i8: ; X32-SSE: # %bb.0: -; X32-SSE-NEXT: movdqa %xmm0, %xmm1 -; X32-SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] -; X32-SSE-NEXT: psraw $8, %xmm1 -; X32-SSE-NEXT: psllw $8, %xmm1 -; X32-SSE-NEXT: psrlw $8, %xmm1 +; X32-SSE-NEXT: pxor %xmm1, %xmm1 +; X32-SSE-NEXT: movdqa %xmm0, %xmm2 +; X32-SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15] ; X32-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; X32-SSE-NEXT: psraw $8, %xmm0 ; X32-SSE-NEXT: pmullw {{\.LCPI.*}}, %xmm0 ; X32-SSE-NEXT: psrlw $8, %xmm0 -; X32-SSE-NEXT: packuswb %xmm1, %xmm0 +; X32-SSE-NEXT: packuswb %xmm2, %xmm0 ; X32-SSE-NEXT: retl %shift = ashr <2 x i8> %a, ret <2 x i8> %shift
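
For readers who want to see the idea behind the VSRAI -> VSRLI fold outside of the SelectionDAG machinery, here is a minimal standalone sketch. It is not part of the patch: the Known16 struct and the helper names are invented for illustration and merely stand in for llvm::KnownBits/llvm::APInt. It shows the two things the patch does for immediate right shifts: shift the known-zero/known-one masks along with the value while marking the vacated high bits as known zero, and replace an arithmetic shift right with a logical shift right once the source's sign bit is known zero. (The VSHLI case is the mirror image: the masks shift left and the vacated low bits become known zero.)

// Minimal sketch of the known-bits rules used in the patch above.
// Known16 and the helper names are invented for this illustration only.
#include <cassert>
#include <cstdint>
#include <cstdio>

struct Known16 {
  uint16_t Zero = 0; // mask of bits known to be 0
  uint16_t One = 0;  // mask of bits known to be 1
};

// VSRLI-style propagation: the known masks shift right with the value and
// the vacated high bits become known zero (Known.Zero.setHighBits(ShAmt)).
static Known16 knownAfterLogicalShiftRight(Known16 In, unsigned Amt) {
  Known16 Out;
  Out.Zero = static_cast<uint16_t>(In.Zero >> Amt);
  Out.One = static_cast<uint16_t>(In.One >> Amt);
  Out.Zero |= static_cast<uint16_t>(~(0xFFFFu >> Amt)); // high bits known zero
  return Out;
}

// VSRAI -> VSRLI: when the source sign bit is known zero, an arithmetic
// shift right copies in zeros anyway, so a logical shift right is identical.
static uint16_t arithmeticOrLogicalShiftRight(uint16_t Val, unsigned Amt,
                                              Known16 SrcKnown) {
  bool SignBitKnownZero = (SrcKnown.Zero >> 15) & 1u;
  if (SignBitKnownZero)
    return static_cast<uint16_t>(Val >> Amt); // fold to the logical shift
  return static_cast<uint16_t>(static_cast<int16_t>(Val) >> Amt);
}

int main() {
  Known16 Src;
  Src.Zero = 0x8000; // sign bit known zero, e.g. the value was zero-extended
  Known16 Out = knownAfterLogicalShiftRight(Src, 4);
  assert((Out.Zero & 0xF000) == 0xF000); // the top four bits are now known zero
  unsigned Res = arithmeticOrLogicalShiftRight(0x7FF0, 4, Src);
  std::printf("0x%x\n", Res); // prints 0x7ff
  return 0;
}

The "none of the top bits are demanded" half of the fold is presumably what fires in the combine-srem.ll change above: the psrad $2 result only feeds a psubd whose result is then shifted left by pslld $2, so the top two bits of the shift result are never demanded and the arithmetic shift can be emitted as psrld $2.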