From: Simon Pilgrim
Date: Sun, 22 Jan 2017 22:21:44 +0000 (+0000)
Subject: [X86][SSE] Improve shuffle combining with zero insertions
X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=160800e73a270b2ecd68718fe7906e9a9b95e3ed;p=llvm

[X86][SSE] Improve shuffle combining with zero insertions

Add support for handling shuffles with scalar_to_vector(0)

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@292766 91177308-0d34-0410-b5e6-96231b3b80d8
---
diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index ead6e95b89c..cb522389765 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -5686,6 +5686,15 @@ static bool setTargetShuffleZeroElements(SDValue N,
       continue;
     }
 
+    // SCALAR_TO_VECTOR - only the first element is defined, and the rest UNDEF.
+    if (V.getOpcode() == ISD::SCALAR_TO_VECTOR &&
+        (Size % V.getValueType().getVectorNumElements()) == 0) {
+      int Scale = Size / V.getValueType().getVectorNumElements();
+      if (((M / Scale) == 0) && X86::isZeroNode(V.getOperand(0)))
+        Mask[i] = SM_SentinelZero;
+      continue;
+    }
+
     // Currently we can only search BUILD_VECTOR for UNDEF/ZERO elements.
     if (V.getOpcode() != ISD::BUILD_VECTOR)
       continue;
diff --git a/test/CodeGen/X86/clear_upper_vector_element_bits.ll b/test/CodeGen/X86/clear_upper_vector_element_bits.ll
index 693bf2e17d5..6a4d83d392d 100644
--- a/test/CodeGen/X86/clear_upper_vector_element_bits.ll
+++ b/test/CodeGen/X86/clear_upper_vector_element_bits.ll
@@ -390,21 +390,18 @@ define <8 x i16> @_clearupper8xi16b(<8 x i16>) nounwind {
 ; SSE-NEXT: por %xmm2, %xmm0
 ; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255]
 ; SSE-NEXT: pand %xmm2, %xmm0
-; SSE-NEXT: movdqa %xmm1, %xmm3
-; SSE-NEXT: pslldq {{.*#+}} xmm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm3[0,1,2,3,4]
-; SSE-NEXT: pandn %xmm3, %xmm2
-; SSE-NEXT: por %xmm2, %xmm0
-; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255]
-; SSE-NEXT: pand %xmm2, %xmm0
-; SSE-NEXT: movdqa %xmm1, %xmm3
-; SSE-NEXT: pslldq {{.*#+}} xmm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm3[0,1,2]
-; SSE-NEXT: pandn %xmm3, %xmm2
-; SSE-NEXT: por %xmm2, %xmm0
-; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0]
-; SSE-NEXT: pand %xmm2, %xmm0
-; SSE-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0]
+; SSE-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4]
 ; SSE-NEXT: pandn %xmm1, %xmm2
 ; SSE-NEXT: por %xmm2, %xmm0
+; SSE-NEXT: movdqa {{.*#+}} xmm1 = [255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255]
+; SSE-NEXT: pand %xmm1, %xmm0
+; SSE-NEXT: pxor %xmm2, %xmm2
+; SSE-NEXT: pandn %xmm2, %xmm1
+; SSE-NEXT: por %xmm1, %xmm0
+; SSE-NEXT: movdqa {{.*#+}} xmm1 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0]
+; SSE-NEXT: pand %xmm1, %xmm0
+; SSE-NEXT: pandn %xmm2, %xmm1
+; SSE-NEXT: por %xmm1, %xmm0
 ; SSE-NEXT: retq
 ;
 ; AVX-LABEL: _clearupper8xi16b:
diff --git a/test/CodeGen/X86/insertelement-zero.ll b/test/CodeGen/X86/insertelement-zero.ll
index f6a9dd92d34..13414a57f58 100644
--- a/test/CodeGen/X86/insertelement-zero.ll
+++ b/test/CodeGen/X86/insertelement-zero.ll
@@ -46,22 +46,22 @@ define <2 x double> @insert_v2f64_z1(<2 x double> %a) {
 define <4 x double> @insert_v4f64_0zz3(<4 x double> %a) {
 ; SSE2-LABEL: insert_v4f64_0zz3:
 ; SSE2: # BB#0:
+; SSE2-NEXT: movq {{.*#+}} xmm0 = xmm0[0],zero
 ; SSE2-NEXT: xorpd %xmm2, %xmm2
-; SSE2-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm2[0]
 ; SSE2-NEXT: movsd {{.*#+}} xmm1 = xmm2[0],xmm1[1]
 ; SSE2-NEXT: retq
 ;
 ; SSE3-LABEL: insert_v4f64_0zz3:
 ; SSE3: # BB#0:
+; SSE3-NEXT: movq {{.*#+}} xmm0 = xmm0[0],zero
 ; SSE3-NEXT: xorpd %xmm2, %xmm2
-; SSE3-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm2[0]
 ; SSE3-NEXT: movsd {{.*#+}} xmm1 = xmm2[0],xmm1[1]
 ; SSE3-NEXT: retq
 ;
 ; SSSE3-LABEL: insert_v4f64_0zz3:
 ; SSSE3: # BB#0:
+; SSSE3-NEXT: movq {{.*#+}} xmm0 = xmm0[0],zero
 ; SSSE3-NEXT: xorpd %xmm2, %xmm2
-; SSSE3-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm2[0]
 ; SSSE3-NEXT: movsd {{.*#+}} xmm1 = xmm2[0],xmm1[1]
 ; SSSE3-NEXT: retq
 ;
@@ -451,7 +451,7 @@ define <16 x i8> @insert_v16i8_z123456789ABCDEz(<16 x i8> %a) {
 ; SSE2-NEXT: por %xmm1, %xmm0
 ; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0]
 ; SSE2-NEXT: pand %xmm1, %xmm0
-; SSE2-NEXT: pslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0]
+; SSE2-NEXT: pxor %xmm2, %xmm2
 ; SSE2-NEXT: pandn %xmm2, %xmm1
 ; SSE2-NEXT: por %xmm1, %xmm0
 ; SSE2-NEXT: retq
@@ -466,22 +466,14 @@ define <16 x i8> @insert_v16i8_z123456789ABCDEz(<16 x i8> %a) {
 ; SSE3-NEXT: por %xmm1, %xmm0
 ; SSE3-NEXT: movdqa {{.*#+}} xmm1 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0]
 ; SSE3-NEXT: pand %xmm1, %xmm0
-; SSE3-NEXT: pslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0]
+; SSE3-NEXT: pxor %xmm2, %xmm2
 ; SSE3-NEXT: pandn %xmm2, %xmm1
 ; SSE3-NEXT: por %xmm1, %xmm0
 ; SSE3-NEXT: retq
 ;
 ; SSSE3-LABEL: insert_v16i8_z123456789ABCDEz:
 ; SSSE3: # BB#0:
-; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = zero,xmm0[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
-; SSSE3-NEXT: xorl %eax, %eax
-; SSSE3-NEXT: movd %eax, %xmm1
-; SSSE3-NEXT: movdqa %xmm1, %xmm2
-; SSSE3-NEXT: pshufb {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; SSSE3-NEXT: por %xmm2, %xmm0
-; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14],zero
-; SSSE3-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0]
-; SSSE3-NEXT: por %xmm1, %xmm0
+; SSSE3-NEXT: andps {{.*}}(%rip), %xmm0
 ; SSSE3-NEXT: retq
 ;
 ; SSE41-LABEL: insert_v16i8_z123456789ABCDEz:
@@ -513,15 +505,13 @@ define <32 x i8> @insert_v32i8_z123456789ABCDEzGHIJKLMNOPQRSTzz(<32 x i8> %a) {
 ; SSE2-NEXT: por %xmm2, %xmm0
 ; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0]
 ; SSE2-NEXT: pand %xmm2, %xmm0
-; SSE2-NEXT: movdqa %xmm3, %xmm4
-; SSE2-NEXT: pslldq {{.*#+}} xmm4 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm4[0]
-; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255]
-; SSE2-NEXT: pand %xmm5, %xmm1
-; SSE2-NEXT: pslldq {{.*#+}} xmm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm3[0,1]
-; SSE2-NEXT: pandn %xmm3, %xmm5
-; SSE2-NEXT: por %xmm5, %xmm1
+; SSE2-NEXT: pxor %xmm3, %xmm3
+; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255]
+; SSE2-NEXT: pand %xmm4, %xmm1
+; SSE2-NEXT: pandn %xmm3, %xmm4
+; SSE2-NEXT: por %xmm4, %xmm1
 ; SSE2-NEXT: pand %xmm2, %xmm1
-; SSE2-NEXT: pandn %xmm4, %xmm2
+; SSE2-NEXT: pandn %xmm3, %xmm2
 ; SSE2-NEXT: por %xmm2, %xmm0
 ; SSE2-NEXT: por %xmm2, %xmm1
 ; SSE2-NEXT: retq
@@ -536,37 +526,21 @@ define <32 x i8> @insert_v32i8_z123456789ABCDEzGHIJKLMNOPQRSTzz(<32 x i8> %a) {
 ; SSE3-NEXT: por %xmm2, %xmm0
 ; SSE3-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0]
 ; SSE3-NEXT: pand %xmm2, %xmm0
-; SSE3-NEXT: movdqa %xmm3, %xmm4
-; SSE3-NEXT: pslldq {{.*#+}} xmm4 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm4[0]
-; SSE3-NEXT: movdqa {{.*#+}} xmm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255]
-; SSE3-NEXT: pand %xmm5, %xmm1
-; SSE3-NEXT: pslldq {{.*#+}} xmm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm3[0,1]
-; SSE3-NEXT: pandn %xmm3, %xmm5
-; SSE3-NEXT: por %xmm5, %xmm1
+; SSE3-NEXT: pxor %xmm3, %xmm3
+; SSE3-NEXT: movdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255]
+; SSE3-NEXT: pand %xmm4, %xmm1
+; SSE3-NEXT: pandn %xmm3, %xmm4
+; SSE3-NEXT: por %xmm4, %xmm1
 ; SSE3-NEXT: pand %xmm2, %xmm1
-; SSE3-NEXT: pandn %xmm4, %xmm2
+; SSE3-NEXT: pandn %xmm3, %xmm2
 ; SSE3-NEXT: por %xmm2, %xmm0
 ; SSE3-NEXT: por %xmm2, %xmm1
 ; SSE3-NEXT: retq
 ;
 ; SSSE3-LABEL: insert_v32i8_z123456789ABCDEzGHIJKLMNOPQRSTzz:
 ; SSSE3: # BB#0:
-; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = zero,xmm0[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
-; SSSE3-NEXT: xorl %eax, %eax
-; SSSE3-NEXT: movd %eax, %xmm2
-; SSSE3-NEXT: movdqa %xmm2, %xmm3
-; SSSE3-NEXT: pshufb {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; SSSE3-NEXT: por %xmm3, %xmm0
-; SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,128]
-; SSSE3-NEXT: pshufb %xmm3, %xmm0
-; SSSE3-NEXT: movdqa %xmm2, %xmm4
-; SSSE3-NEXT: pslldq {{.*#+}} xmm4 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm4[0]
-; SSSE3-NEXT: por %xmm4, %xmm0
-; SSSE3-NEXT: pshufb {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6,7,8,9,10,11,12,13],zero,xmm1[15]
-; SSSE3-NEXT: pshufb {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0],zero
-; SSSE3-NEXT: por %xmm2, %xmm1
-; SSSE3-NEXT: pshufb %xmm3, %xmm1
-; SSSE3-NEXT: por %xmm4, %xmm1
+; SSSE3-NEXT: andps {{.*}}(%rip), %xmm0
+; SSSE3-NEXT: andps {{.*}}(%rip), %xmm1
 ; SSSE3-NEXT: retq
 ;
 ; SSE41-LABEL: insert_v32i8_z123456789ABCDEzGHIJKLMNOPQRSTzz:
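
The idea behind the X86ISelLowering.cpp hunk above is that a shuffle operand produced by SCALAR_TO_VECTOR of the zero constant makes every mask element reading lane 0 of that operand a known zero, so later shuffle combines can treat it like any other zeroable element; that is what lets the pslldq/pshufb sequences in the tests collapse into pxor/andps. The sketch below is a minimal standalone illustration of that mask rewrite, not LLVM's actual API: the ScalarToVectorSrc struct and the markScalarToVectorZeros helper are hypothetical names, and the real setTargetShuffleZeroElements works per shuffle operand on SDValues and also handles BUILD_VECTOR sources.

// Minimal standalone sketch (illustrative only, not LLVM's real API): it mirrors
// the scaling and zero test from the setTargetShuffleZeroElements hunk above for
// a single shuffle operand that is a SCALAR_TO_VECTOR of the zero constant.
#include <cassert>
#include <cstdio>
#include <vector>

constexpr int SM_SentinelUndef = -1; // mask element is undefined
constexpr int SM_SentinelZero = -2;  // mask element is known to be zero

// Toy stand-in for a SCALAR_TO_VECTOR source: only element 0 is defined.
struct ScalarToVectorSrc {
  int NumElts;       // vector element count of the source type
  bool ScalarIsZero; // true if the inserted scalar is the zero constant
};

// Rewrite every mask element that reads lane 0 of Src to SM_SentinelZero.
void markScalarToVectorZeros(std::vector<int> &Mask,
                             const ScalarToVectorSrc &Src) {
  int Size = static_cast<int>(Mask.size());
  if (!Src.ScalarIsZero || Size % Src.NumElts != 0)
    return;
  int Scale = Size / Src.NumElts; // mask lanes per source lane
  for (int &M : Mask) {
    if (M < 0)
      continue; // already SM_SentinelUndef or SM_SentinelZero
    if ((M / Scale) == 0)
      M = SM_SentinelZero; // reads lane 0, which holds the zero scalar
  }
}

int main() {
  // Mask values 0..3 would read the scalar_to_vector(0) operand, 4..7 some
  // other operand; the two reads of lane 0 become known zeros.
  std::vector<int> Mask = {0, 5, 0, 7};
  markScalarToVectorZeros(Mask, {/*NumElts=*/4, /*ScalarIsZero=*/true});
  assert(Mask[0] == SM_SentinelZero && Mask[2] == SM_SentinelZero);
  for (int M : Mask)
    std::printf("%d ", M); // prints: -2 5 -2 7
  std::printf("\n");
  return 0;
}

Compiled as a plain C++ program this prints "-2 5 -2 7", i.e. the two reads of lane 0 have been rewritten to the known-zero sentinel, which is the state the real combiner then exploits when merging masks.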