From 57e133c4e0dc10492116bb4fbd675314b7ce6676 Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev@redking.me.uk>
Date: Tue, 3 Oct 2017 09:41:00 +0000
Subject: [PATCH] [X86][SSE] Add support for PACKSS/PACKUS constant folding

Pulled out of D38472

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@314776 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/X86/X86ISelLowering.cpp       | 85 ++++++++++++++++++++++++
 test/CodeGen/X86/avx2-intrinsics-x86.ll  | 44 +++++-------
 test/CodeGen/X86/sse2-intrinsics-x86.ll  | 54 ++++++---------
 test/CodeGen/X86/sse41-intrinsics-x86.ll | 18 ++---
 4 files changed, 133 insertions(+), 68 deletions(-)

diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index 0981d39fe5c..3dd4d74ca40 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -5350,6 +5350,13 @@ static bool getTargetConstantBitsFromNode(SDValue Op, unsigned EltSizeInBits,
     return false;
   };
 
+  // Handle UNDEFs.
+  if (Op.isUndef()) {
+    APInt UndefSrcElts = APInt::getAllOnesValue(NumElts);
+    SmallVector<APInt, 64> SrcEltBits(NumElts, APInt(EltSizeInBits, 0));
+    return CastBitData(UndefSrcElts, SrcEltBits);
+  }
+
   // Extract constant bits from build vector.
   if (ISD::isBuildVectorOfConstantSDNodes(Op.getNode())) {
     unsigned SrcEltSizeInBits = VT.getScalarSizeInBits();
@@ -31838,6 +31845,82 @@ static SDValue combineShift(SDNode* N, SelectionDAG &DAG,
   return SDValue();
 }
 
+static SDValue combineVectorPack(SDNode *N, SelectionDAG &DAG,
+                                 TargetLowering::DAGCombinerInfo &DCI,
+                                 const X86Subtarget &Subtarget) {
+  unsigned Opcode = N->getOpcode();
+  assert((X86ISD::PACKSS == Opcode || X86ISD::PACKUS == Opcode) &&
+         "Unexpected pack opcode");
+
+  EVT VT = N->getValueType(0);
+  EVT SVT = VT.getScalarType();
+  SDValue N0 = N->getOperand(0);
+  SDValue N1 = N->getOperand(1);
+  unsigned DstBitsPerElt = VT.getScalarSizeInBits();
+  unsigned SrcBitsPerElt = 2 * DstBitsPerElt;
+  assert(N0.getScalarValueSizeInBits() == SrcBitsPerElt &&
+         N1.getScalarValueSizeInBits() == SrcBitsPerElt &&
+         "Unexpected PACKSS/PACKUS input type");
+
+  // Constant Folding.
+  APInt UndefElts0, UndefElts1;
+  SmallVector<APInt, 32> EltBits0, EltBits1;
+  if ((N0->isUndef() || N->isOnlyUserOf(N0.getNode())) &&
+      (N1->isUndef() || N->isOnlyUserOf(N1.getNode())) &&
+      getTargetConstantBitsFromNode(N0, SrcBitsPerElt, UndefElts0, EltBits0) &&
+      getTargetConstantBitsFromNode(N1, SrcBitsPerElt, UndefElts1, EltBits1)) {
+    unsigned NumLanes = VT.getSizeInBits() / 128;
+    unsigned NumDstElts = VT.getVectorNumElements();
+    unsigned NumSrcElts = NumDstElts / 2;
+    unsigned NumDstEltsPerLane = NumDstElts / NumLanes;
+    unsigned NumSrcEltsPerLane = NumSrcElts / NumLanes;
+    bool IsSigned = (X86ISD::PACKSS == Opcode);
+
+    APInt Undefs(NumDstElts, 0);
+    SmallVector<APInt, 32> Bits(NumDstElts, APInt::getNullValue(DstBitsPerElt));
+    for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
+      for (unsigned Elt = 0; Elt != NumDstEltsPerLane; ++Elt) {
+        unsigned SrcIdx = Lane * NumSrcEltsPerLane + Elt % NumSrcEltsPerLane;
+        auto &UndefElts = (Elt >= NumSrcEltsPerLane ? UndefElts1 : UndefElts0);
+        auto &EltBits = (Elt >= NumSrcEltsPerLane ? EltBits1 : EltBits0);
+
+        if (UndefElts[SrcIdx]) {
+          Undefs.setBit(Lane * NumDstEltsPerLane + Elt);
+          continue;
+        }
+
+        APInt &Val = EltBits[SrcIdx];
+        if (IsSigned) {
+          // PACKSS: Truncate signed value with signed saturation.
+          // Source values less than dst minint are saturated to minint.
+          // Source values greater than dst maxint are saturated to maxint.
+          if (Val.isSignedIntN(DstBitsPerElt))
+            Val = Val.trunc(DstBitsPerElt);
+          else if (Val.isNegative())
+            Val = APInt::getSignedMinValue(DstBitsPerElt);
+          else
+            Val = APInt::getSignedMaxValue(DstBitsPerElt);
+        } else {
+          // PACKUS: Truncate signed value with unsigned saturation.
+          // Source values less than zero are saturated to zero.
+          // Source values greater than dst maxuint are saturated to maxuint.
+          if (Val.isIntN(DstBitsPerElt))
+            Val = Val.trunc(DstBitsPerElt);
+          else if (Val.isNegative())
+            Val = APInt::getNullValue(DstBitsPerElt);
+          else
+            Val = APInt::getAllOnesValue(DstBitsPerElt);
+        }
+        Bits[Lane * NumDstEltsPerLane + Elt] = Val;
+      }
+    }
+
+    return getConstVector(Bits, Undefs, VT.getSimpleVT(), DAG, SDLoc(N));
+  }
+
+  return SDValue();
+}
+
 static SDValue combineVectorShiftImm(SDNode *N, SelectionDAG &DAG,
                                      TargetLowering::DAGCombinerInfo &DCI,
                                      const X86Subtarget &Subtarget) {
@@ -36069,6 +36152,8 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
   case ISD::SETCC:          return combineSetCC(N, DAG, Subtarget);
   case X86ISD::SETCC:       return combineX86SetCC(N, DAG, Subtarget);
   case X86ISD::BRCOND:      return combineBrCond(N, DAG, Subtarget);
+  case X86ISD::PACKSS:
+  case X86ISD::PACKUS:      return combineVectorPack(N, DAG, DCI, Subtarget);
   case X86ISD::VSHLI:
   case X86ISD::VSRAI:
   case X86ISD::VSRLI:
diff --git a/test/CodeGen/X86/avx2-intrinsics-x86.ll b/test/CodeGen/X86/avx2-intrinsics-x86.ll
index 7c9874e9a48..1329c243924 100644
--- a/test/CodeGen/X86/avx2-intrinsics-x86.ll
+++ b/test/CodeGen/X86/avx2-intrinsics-x86.ll
@@ -21,15 +21,15 @@ declare <16 x i16> @llvm.x86.avx2.packssdw(<8 x i32>, <8 x i32>) nounwind readno
 define <16 x i16> @test_x86_avx2_packssdw_fold() {
 ; AVX2-LABEL: test_x86_avx2_packssdw_fold:
 ; AVX2:       ## BB#0:
-; AVX2-NEXT:    vpxor %xmm0, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xef,0xc0]
-; AVX2-NEXT:    vpackssdw LCPI1_0, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0x6b,0x05,A,A,A,A]
+; AVX2-NEXT:    vmovaps {{.*#+}} ymm0 = [0,0,0,0,255,32767,32767,65535,0,0,0,0,32769,32768,0,65280]
+; AVX2-NEXT:    ## encoding: [0xc5,0xfc,0x28,0x05,A,A,A,A]
 ; AVX2-NEXT:    ## fixup A - offset: 4, value: LCPI1_0, kind: FK_Data_4
 ; AVX2-NEXT:    retl ## encoding: [0xc3]
 ;
 ; AVX512VL-LABEL: test_x86_avx2_packssdw_fold:
 ; AVX512VL:       ## BB#0:
-; AVX512VL-NEXT:    vpxor %xmm0, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xef,0xc0]
-; AVX512VL-NEXT:    vpackssdw LCPI1_0, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6b,0x05,A,A,A,A]
+; AVX512VL-NEXT:    vmovaps LCPI1_0, %ymm0 ## EVEX TO VEX Compression ymm0 = [0,0,0,0,255,32767,32767,65535,0,0,0,0,32769,32768,0,65280]
+; AVX512VL-NEXT:    ## encoding: [0xc5,0xfc,0x28,0x05,A,A,A,A]
 ; AVX512VL-NEXT:    ## fixup A - offset: 4, value: LCPI1_0, kind: FK_Data_4
 ; AVX512VL-NEXT:    retl ## encoding: [0xc3]
   %res = call <16 x i16> @llvm.x86.avx2.packssdw(<8 x i32> zeroinitializer, <8 x i32> <i32 255, i32 32767, i32 65535, i32 -1, i32 -32767, i32 -65536, i32 0, i32 -256>)
@@ -56,20 +56,16 @@ declare <32 x i8> @llvm.x86.avx2.packsswb(<16 x i16>, <16 x i16>) nounwind readn
 define <32 x i8> @test_x86_avx2_packsswb_fold() {
 ; AVX2-LABEL: test_x86_avx2_packsswb_fold:
 ; AVX2:       ## BB#0:
-; AVX2-NEXT:    vpxor %xmm0, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xef,0xc0]
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [0,255,256,65535,65535,65281,65280,32858,0,255,256,65535,65535,65281,65280,32858]
-; AVX2-NEXT:    ## encoding: [0xc5,0xfd,0x6f,0x0d,A,A,A,A]
+; AVX2-NEXT:    vmovaps {{.*#+}} ymm0 = [0,127,127,255,255,128,128,128,0,0,0,0,0,0,0,0,0,127,127,255,255,128,128,128,0,0,0,0,0,0,0,0]
+; AVX2-NEXT:    ## encoding: [0xc5,0xfc,0x28,0x05,A,A,A,A]
 ; AVX2-NEXT:    ## fixup A - offset: 4, value: LCPI3_0, kind: FK_Data_4
-; AVX2-NEXT:    vpacksswb %ymm0, %ymm1, %ymm0 ## encoding: [0xc5,0xf5,0x63,0xc0]
 ; AVX2-NEXT:    retl ## encoding: [0xc3]
 ;
 ; AVX512VL-LABEL: test_x86_avx2_packsswb_fold:
 ; AVX512VL:       ## BB#0:
-; AVX512VL-NEXT:    vpxor %xmm0, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xef,0xc0]
-; AVX512VL-NEXT:    vmovdqa LCPI3_0, %ymm1 ## EVEX TO VEX Compression ymm1 = [0,255,256,65535,65535,65281,65280,32858,0,255,256,65535,65535,65281,65280,32858]
-; AVX512VL-NEXT:    ## encoding: [0xc5,0xfd,0x6f,0x0d,A,A,A,A]
+; AVX512VL-NEXT:    vmovaps LCPI3_0, %ymm0 ## EVEX TO VEX Compression ymm0 = [0,127,127,255,255,128,128,128,0,0,0,0,0,0,0,0,0,127,127,255,255,128,128,128,0,0,0,0,0,0,0,0]
+; AVX512VL-NEXT:    ## encoding: [0xc5,0xfc,0x28,0x05,A,A,A,A]
 ; AVX512VL-NEXT:    ## fixup A - offset: 4, value: LCPI3_0, kind: FK_Data_4
-; AVX512VL-NEXT:    vpacksswb %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0x63,0xc0]
 ; AVX512VL-NEXT:    retl ## encoding: [0xc3]
   %res = call <32 x i8> @llvm.x86.avx2.packsswb(<16 x i16> <i16 0, i16 255, i16 256, i16 65535, i16 -1, i16 -255, i16 -256, i16 -32678, i16 0, i16 255, i16 256, i16 65535, i16 -1, i16 -255, i16 -256, i16 -32678>, <16 x i16> zeroinitializer)
   ret <32 x i8> %res
@@ -95,20 +91,16 @@ declare <32 x i8> @llvm.x86.avx2.packuswb(<16 x i16>, <16 x i16>) nounwind readn
 define <32 x i8> @test_x86_avx2_packuswb_fold() {
 ; AVX2-LABEL: test_x86_avx2_packuswb_fold:
 ; AVX2:       ## BB#0:
-; AVX2-NEXT:    vpxor %xmm0, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xef,0xc0]
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [0,255,256,65535,65535,65281,65280,32858,0,255,256,65535,65535,65281,65280,32858]
-; AVX2-NEXT:    ## encoding: [0xc5,0xfd,0x6f,0x0d,A,A,A,A]
+; AVX2-NEXT:    vmovaps {{.*#+}} ymm0 = [0,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0]
+; AVX2-NEXT:    ## encoding: [0xc5,0xfc,0x28,0x05,A,A,A,A]
 ; AVX2-NEXT:    ## fixup A - offset: 4, value: LCPI5_0, kind: FK_Data_4
-; AVX2-NEXT:    vpackuswb %ymm0, %ymm1, %ymm0 ## encoding: [0xc5,0xf5,0x67,0xc0]
 ; AVX2-NEXT:    retl ## encoding: [0xc3]
 ;
 ; AVX512VL-LABEL: test_x86_avx2_packuswb_fold:
 ; AVX512VL:       ## BB#0:
-; AVX512VL-NEXT:    vpxor %xmm0, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xef,0xc0]
-; AVX512VL-NEXT:    vmovdqa LCPI5_0, %ymm1 ## EVEX TO VEX Compression ymm1 = [0,255,256,65535,65535,65281,65280,32858,0,255,256,65535,65535,65281,65280,32858]
-; AVX512VL-NEXT:    ## encoding: [0xc5,0xfd,0x6f,0x0d,A,A,A,A]
+; AVX512VL-NEXT:    vmovaps LCPI5_0, %ymm0 ## EVEX TO VEX Compression ymm0 = [0,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0]
+; AVX512VL-NEXT:    ## encoding: [0xc5,0xfc,0x28,0x05,A,A,A,A]
 ; AVX512VL-NEXT:    ## fixup A - offset: 4, value: LCPI5_0, kind: FK_Data_4
-; AVX512VL-NEXT:    vpackuswb %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0x67,0xc0]
 ; AVX512VL-NEXT:    retl ## encoding: [0xc3]
   %res = call <32 x i8> @llvm.x86.avx2.packuswb(<16 x i16> <i16 0, i16 255, i16 256, i16 65535, i16 -1, i16 -255, i16 -256, i16 -32678, i16 0, i16 255, i16 256, i16 65535, i16 -1, i16 -255, i16 -256, i16 -32678>, <16 x i16> zeroinitializer)
   ret <32 x i8> %res
@@ -850,16 +842,16 @@ declare <16 x i16> @llvm.x86.avx2.packusdw(<8 x i32>, <8 x i32>) nounwind readno
 define <16 x i16> @test_x86_avx2_packusdw_fold() {
 ; AVX2-LABEL: test_x86_avx2_packusdw_fold:
 ; AVX2:       ## BB#0:
-; AVX2-NEXT:    vpxor %xmm0, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xef,0xc0]
-; AVX2-NEXT:    vpackusdw LCPI55_0, %ymm0, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x2b,0x05,A,A,A,A]
-; AVX2-NEXT:    ## fixup A - offset: 5, value: LCPI55_0, kind: FK_Data_4
+; AVX2-NEXT:    vmovaps {{.*#+}} ymm0 = [0,0,0,0,255,32767,65535,0,0,0,0,0,0,0,0,0]
+; AVX2-NEXT:    ## encoding: [0xc5,0xfc,0x28,0x05,A,A,A,A]
+; AVX2-NEXT:    ## fixup A - offset: 4, value: LCPI55_0, kind: FK_Data_4
 ; AVX2-NEXT:    retl ## encoding: [0xc3]
 ;
 ; AVX512VL-LABEL: test_x86_avx2_packusdw_fold:
 ; AVX512VL:       ## BB#0:
-; AVX512VL-NEXT:    vpxor %xmm0, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xef,0xc0]
-; AVX512VL-NEXT:    vpackusdw LCPI55_0, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x2b,0x05,A,A,A,A]
-; AVX512VL-NEXT:    ## fixup A - offset: 5, value: LCPI55_0, kind: FK_Data_4
+; AVX512VL-NEXT:    vmovaps LCPI55_0, %ymm0 ## EVEX TO VEX Compression ymm0 = [0,0,0,0,255,32767,65535,0,0,0,0,0,0,0,0,0]
+; AVX512VL-NEXT:    ## encoding: [0xc5,0xfc,0x28,0x05,A,A,A,A]
+; AVX512VL-NEXT:    ## fixup A - offset: 4, value: LCPI55_0, kind: FK_Data_4
 ; AVX512VL-NEXT:    retl ## encoding: [0xc3]
   %res = call <16 x i16> @llvm.x86.avx2.packusdw(<8 x i32> zeroinitializer, <8 x i32> <i32 255, i32 32767, i32 65535, i32 -1, i32 -32767, i32 -65536, i32 0, i32 -256>)
   ret <16 x i16> %res
diff --git a/test/CodeGen/X86/sse2-intrinsics-x86.ll b/test/CodeGen/X86/sse2-intrinsics-x86.ll
index e6d3057fc5d..7324e855088 100644
--- a/test/CodeGen/X86/sse2-intrinsics-x86.ll
+++ b/test/CodeGen/X86/sse2-intrinsics-x86.ll
@@ -684,22 +684,22 @@ declare <8 x i16> @llvm.x86.sse2.packssdw.128(<4 x i32>, <4 x i32>) nounwind rea
 define <8 x i16> @test_x86_sse2_packssdw_128_fold() {
 ; SSE-LABEL: test_x86_sse2_packssdw_128_fold:
 ; SSE:       ## BB#0:
-; SSE-NEXT:    pxor %xmm0, %xmm0 ## encoding: [0x66,0x0f,0xef,0xc0]
-; SSE-NEXT:    packssdw LCPI32_0, %xmm0 ## encoding: [0x66,0x0f,0x6b,0x05,A,A,A,A]
-; SSE-NEXT:    ## fixup A - offset: 4, value: LCPI32_0, kind: FK_Data_4
+; SSE-NEXT:    movaps {{.*#+}} xmm0 = [0,0,0,0,32767,32767,65535,32768]
+; SSE-NEXT:    ## encoding: [0x0f,0x28,0x05,A,A,A,A]
+; SSE-NEXT:    ## fixup A - offset: 3, value: LCPI32_0, kind: FK_Data_4
 ; SSE-NEXT:    retl ## encoding: [0xc3]
 ;
 ; AVX2-LABEL: test_x86_sse2_packssdw_128_fold:
 ; AVX2:       ## BB#0:
-; AVX2-NEXT:    vpxor %xmm0, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xef,0xc0]
-; AVX2-NEXT:    vpackssdw LCPI32_0, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0x6b,0x05,A,A,A,A]
+; AVX2-NEXT:    vmovaps {{.*#+}} xmm0 = [0,0,0,0,32767,32767,65535,32768]
+; AVX2-NEXT:    ## encoding: [0xc5,0xf8,0x28,0x05,A,A,A,A]
 ; AVX2-NEXT:    ## fixup A - offset: 4, value: LCPI32_0, kind: FK_Data_4
 ; AVX2-NEXT:    retl ## encoding: [0xc3]
 ;
 ; SKX-LABEL: test_x86_sse2_packssdw_128_fold:
 ; SKX:       ## BB#0:
-; SKX-NEXT:    vpxor %xmm0, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xef,0xc0]
-; SKX-NEXT:    vpackssdw LCPI32_0, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6b,0x05,A,A,A,A]
+; SKX-NEXT:    vmovaps LCPI32_0, %xmm0 ## EVEX TO VEX Compression xmm0 = [0,0,0,0,32767,32767,65535,32768]
+; SKX-NEXT:    ## encoding: [0xc5,0xf8,0x28,0x05,A,A,A,A]
 ; SKX-NEXT:    ## fixup A - offset: 4, value: LCPI32_0, kind: FK_Data_4
 ; SKX-NEXT:    retl ## encoding: [0xc3]
   %res = call <8 x i16> @llvm.x86.sse2.packssdw.128(<4 x i32> zeroinitializer, <4 x i32> <i32 65535, i32 65536, i32 -1, i32 -131072>)
@@ -731,29 +731,23 @@ declare <16 x i8> @llvm.x86.sse2.packsswb.128(<8 x i16>, <8 x i16>) nounwind rea
 define <16 x i8> @test_x86_sse2_packsswb_128_fold() {
 ; SSE-LABEL: test_x86_sse2_packsswb_128_fold:
 ; SSE:       ## BB#0:
-; SSE-NEXT:    pxor %xmm1, %xmm1 ## encoding: [0x66,0x0f,0xef,0xc9]
-; SSE-NEXT:    movdqa {{.*#+}} xmm0 = [0,255,256,65535,65535,65281,65280,32858]
-; SSE-NEXT:    ## encoding: [0x66,0x0f,0x6f,0x05,A,A,A,A]
-; SSE-NEXT:    ## fixup A - offset: 4, value: LCPI34_0, kind: FK_Data_4
-; SSE-NEXT:    packsswb %xmm1, %xmm0 ## encoding: [0x66,0x0f,0x63,0xc1]
+; SSE-NEXT:    movaps {{.*#+}} xmm0 = [0,127,127,255,255,128,128,128,0,0,0,0,0,0,0,0]
+; SSE-NEXT:    ## encoding: [0x0f,0x28,0x05,A,A,A,A]
+; SSE-NEXT:    ## fixup A - offset: 3, value: LCPI34_0, kind: FK_Data_4
 ; SSE-NEXT:    retl ## encoding: [0xc3]
 ;
 ; AVX2-LABEL: test_x86_sse2_packsswb_128_fold:
 ; AVX2:       ## BB#0:
-; AVX2-NEXT:    vpxor %xmm0, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xef,0xc0]
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [0,255,256,65535,65535,65281,65280,32858]
-; AVX2-NEXT:    ## encoding: [0xc5,0xf9,0x6f,0x0d,A,A,A,A]
+; AVX2-NEXT:    vmovaps {{.*#+}} xmm0 = [0,127,127,255,255,128,128,128,0,0,0,0,0,0,0,0]
+; AVX2-NEXT:    ## encoding: [0xc5,0xf8,0x28,0x05,A,A,A,A]
 ; AVX2-NEXT:    ## fixup A - offset: 4, value: LCPI34_0, kind: FK_Data_4
-; AVX2-NEXT:    vpacksswb %xmm0, %xmm1, %xmm0 ## encoding: [0xc5,0xf1,0x63,0xc0]
 ; AVX2-NEXT:    retl ## encoding: [0xc3]
 ;
 ; SKX-LABEL: test_x86_sse2_packsswb_128_fold:
 ; SKX:       ## BB#0:
-; SKX-NEXT:    vpxor %xmm0, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xef,0xc0]
-; SKX-NEXT:    vmovdqa LCPI34_0, %xmm1 ## EVEX TO VEX Compression xmm1 = [0,255,256,65535,65535,65281,65280,32858]
-; SKX-NEXT:    ## encoding: [0xc5,0xf9,0x6f,0x0d,A,A,A,A]
+; SKX-NEXT:    vmovaps LCPI34_0, %xmm0 ## EVEX TO VEX Compression xmm0 = [0,127,127,255,255,128,128,128,0,0,0,0,0,0,0,0]
+; SKX-NEXT:    ## encoding: [0xc5,0xf8,0x28,0x05,A,A,A,A]
 ; SKX-NEXT:    ## fixup A - offset: 4, value: LCPI34_0, kind: FK_Data_4
-; SKX-NEXT:    vpacksswb %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0x63,0xc0]
 ; SKX-NEXT:    retl ## encoding: [0xc3]
   %res = call <16 x i8> @llvm.x86.sse2.packsswb.128(<8 x i16> <i16 0, i16 255, i16 256, i16 65535, i16 -1, i16 -255, i16 -256, i16 -32678>, <8 x i16> zeroinitializer)
   ret <16 x i8> %res
@@ -784,29 +778,23 @@ declare <16 x i8> @llvm.x86.sse2.packuswb.128(<8 x i16>, <8 x i16>) nounwind rea
 define <16 x i8> @test_x86_sse2_packuswb_128_fold() {
 ; SSE-LABEL: test_x86_sse2_packuswb_128_fold:
 ; SSE:       ## BB#0:
-; SSE-NEXT:    pxor %xmm1, %xmm1 ## encoding: [0x66,0x0f,0xef,0xc9]
-; SSE-NEXT:    movdqa {{.*#+}} xmm0 = [0,255,256,65535,65535,65281,65280,32858]
-; SSE-NEXT:    ## encoding: [0x66,0x0f,0x6f,0x05,A,A,A,A]
-; SSE-NEXT:    ## fixup A - offset: 4, value: LCPI36_0, kind: FK_Data_4
-; SSE-NEXT:    packuswb %xmm1, %xmm0 ## encoding: [0x66,0x0f,0x67,0xc1]
+; SSE-NEXT:    movaps {{.*#+}} xmm0 = [0,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0]
+; SSE-NEXT:    ## encoding: [0x0f,0x28,0x05,A,A,A,A]
+; SSE-NEXT:    ## fixup A - offset: 3, value: LCPI36_0, kind: FK_Data_4
 ; SSE-NEXT:    retl ## encoding: [0xc3]
 ;
 ; AVX2-LABEL: test_x86_sse2_packuswb_128_fold:
 ; AVX2:       ## BB#0:
-; AVX2-NEXT:    vpxor %xmm0, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xef,0xc0]
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [0,255,256,65535,65535,65281,65280,32858]
-; AVX2-NEXT:    ## encoding: [0xc5,0xf9,0x6f,0x0d,A,A,A,A]
+; AVX2-NEXT:    vmovaps {{.*#+}} xmm0 = [0,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0]
+; AVX2-NEXT:    ## encoding: [0xc5,0xf8,0x28,0x05,A,A,A,A]
 ; AVX2-NEXT:    ## fixup A - offset: 4, value: LCPI36_0, kind: FK_Data_4
-; AVX2-NEXT:    vpackuswb %xmm0, %xmm1, %xmm0 ## encoding: [0xc5,0xf1,0x67,0xc0]
 ; AVX2-NEXT:    retl ## encoding: [0xc3]
 ;
 ; SKX-LABEL: test_x86_sse2_packuswb_128_fold:
 ; SKX:       ## BB#0:
-; SKX-NEXT:    vpxor %xmm0, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xef,0xc0]
-; SKX-NEXT:    vmovdqa LCPI36_0, %xmm1 ## EVEX TO VEX Compression xmm1 = [0,255,256,65535,65535,65281,65280,32858]
-; SKX-NEXT:    ## encoding: [0xc5,0xf9,0x6f,0x0d,A,A,A,A]
+; SKX-NEXT:    vmovaps LCPI36_0, %xmm0 ## EVEX TO VEX Compression xmm0 = [0,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0]
+; SKX-NEXT:    ## encoding: [0xc5,0xf8,0x28,0x05,A,A,A,A]
 ; SKX-NEXT:    ## fixup A - offset: 4, value: LCPI36_0, kind: FK_Data_4
-; SKX-NEXT:    vpackuswb %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0x67,0xc0]
 ; SKX-NEXT:    retl ## encoding: [0xc3]
   %res = call <16 x i8> @llvm.x86.sse2.packuswb.128(<8 x i16> <i16 0, i16 255, i16 256, i16 65535, i16 -1, i16 -255, i16 -256, i16 -32678>, <8 x i16> zeroinitializer)
   ret <16 x i8> %res
diff --git a/test/CodeGen/X86/sse41-intrinsics-x86.ll b/test/CodeGen/X86/sse41-intrinsics-x86.ll
index 39acde0b5cd..98300a526a9 100644
--- a/test/CodeGen/X86/sse41-intrinsics-x86.ll
+++ b/test/CodeGen/X86/sse41-intrinsics-x86.ll
@@ -138,23 +138,23 @@ declare <8 x i16> @llvm.x86.sse41.packusdw(<4 x i32>, <4 x i32>) nounwind readno
 define <8 x i16> @test_x86_sse41_packusdw_fold() {
 ; SSE41-LABEL: test_x86_sse41_packusdw_fold:
 ; SSE41:       ## BB#0:
-; SSE41-NEXT:    pxor %xmm0, %xmm0 ## encoding: [0x66,0x0f,0xef,0xc0]
-; SSE41-NEXT:    packusdw LCPI7_0, %xmm0 ## encoding: [0x66,0x0f,0x38,0x2b,0x05,A,A,A,A]
-; SSE41-NEXT:    ## fixup A - offset: 5, value: LCPI7_0, kind: FK_Data_4
+; SSE41-NEXT:    movaps {{.*#+}} xmm0 = [0,0,0,0,65535,65535,0,0]
+; SSE41-NEXT:    ## encoding: [0x0f,0x28,0x05,A,A,A,A]
+; SSE41-NEXT:    ## fixup A - offset: 3, value: LCPI7_0, kind: FK_Data_4
 ; SSE41-NEXT:    retl ## encoding: [0xc3]
 ;
 ; AVX2-LABEL: test_x86_sse41_packusdw_fold:
 ; AVX2:       ## BB#0:
-; AVX2-NEXT:    vpxor %xmm0, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xef,0xc0]
-; AVX2-NEXT:    vpackusdw LCPI7_0, %xmm0, %xmm0 ## encoding: [0xc4,0xe2,0x79,0x2b,0x05,A,A,A,A]
-; AVX2-NEXT:    ## fixup A - offset: 5, value: LCPI7_0, kind: FK_Data_4
+; AVX2-NEXT:    vmovaps {{.*#+}} xmm0 = [0,0,0,0,65535,65535,0,0]
+; AVX2-NEXT:    ## encoding: [0xc5,0xf8,0x28,0x05,A,A,A,A]
+; AVX2-NEXT:    ## fixup A - offset: 4, value: LCPI7_0, kind: FK_Data_4
 ; AVX2-NEXT:    retl ## encoding: [0xc3]
 ;
 ; SKX-LABEL: test_x86_sse41_packusdw_fold:
 ; SKX:       ## BB#0:
-; SKX-NEXT:    vpxor %xmm0, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xef,0xc0]
-; SKX-NEXT:    vpackusdw LCPI7_0, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x2b,0x05,A,A,A,A]
-; SKX-NEXT:    ## fixup A - offset: 5, value: LCPI7_0, kind: FK_Data_4
+; SKX-NEXT:    vmovaps LCPI7_0, %xmm0 ## EVEX TO VEX Compression xmm0 = [0,0,0,0,65535,65535,0,0]
+; SKX-NEXT:    ## encoding: [0xc5,0xf8,0x28,0x05,A,A,A,A]
+; SKX-NEXT:    ## fixup A - offset: 4, value: LCPI7_0, kind: FK_Data_4
 ; SKX-NEXT:    retl ## encoding: [0xc3]
   %res = call <8 x i16> @llvm.x86.sse41.packusdw(<4 x i32> zeroinitializer, <4 x i32> <i32 65535, i32 65536, i32 -1, i32 -131072>)
   ret <8 x i16> %res
-- 
2.40.0
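
A quick way to sanity-check the new CHECK constants: the fold mirrors the PACK
instructions' per-lane semantics, where each 128-bit lane of the result takes
NumSrcEltsPerLane saturated elements from operand 0 followed by the same count
from operand 1, exactly the indexing performed by the Lane/Elt loop in
combineVectorPack. The standalone C++ sketch below models that
saturate-and-interleave with plain fixed-width integers in place of APInt (all
names are illustrative; none of this is LLVM API) and reprints the expected
bytes for the 128-bit packsswb/packuswb tests:

#include <cstdint>
#include <cstdio>
#include <vector>

// Signed saturation used by PACKSS: clamp a signed i16 into [-128, 127].
static int8_t SatS16To8(int16_t V) {
  if (V < INT8_MIN) return INT8_MIN;
  if (V > INT8_MAX) return INT8_MAX;
  return static_cast<int8_t>(V);
}

// Unsigned saturation used by PACKUS: clamp a signed i16 into [0, 255].
static uint8_t SatU16To8(int16_t V) {
  if (V < 0) return 0;
  if (V > UINT8_MAX) return UINT8_MAX;
  return static_cast<uint8_t>(V);
}

// Pack two i16 vectors into one i8 vector. Per 128-bit lane (8 source
// elements per operand, 16 destination elements), the result holds operand
// A's saturated lane elements followed by operand B's.
static std::vector<uint8_t> Pack(const std::vector<int16_t> &A,
                                 const std::vector<int16_t> &B,
                                 bool IsSigned) {
  const unsigned NumSrcEltsPerLane = 8, NumDstEltsPerLane = 16;
  const unsigned NumLanes = static_cast<unsigned>(A.size()) / NumSrcEltsPerLane;
  std::vector<uint8_t> R(NumLanes * NumDstEltsPerLane);
  for (unsigned Lane = 0; Lane != NumLanes; ++Lane)
    for (unsigned Elt = 0; Elt != NumDstEltsPerLane; ++Elt) {
      const std::vector<int16_t> &Src = (Elt >= NumSrcEltsPerLane) ? B : A;
      int16_t V = Src[Lane * NumSrcEltsPerLane + Elt % NumSrcEltsPerLane];
      R[Lane * NumDstEltsPerLane + Elt] =
          IsSigned ? static_cast<uint8_t>(SatS16To8(V)) : SatU16To8(V);
    }
  return R;
}

int main() {
  // The <8 x i16> constant from the sse2 pack tests, written as signed
  // values (65535 and -1 are the same i16 bit pattern).
  std::vector<int16_t> A = {0, 255, 256, -1, -1, -255, -256, -32678};
  std::vector<int16_t> Zero(8, 0);
  // Signed prints 0,127,127,255,255,128,128,128 then eight zeros, matching
  // the packsswb_128 CHECK constant; unsigned prints 0,255,255 then zeros,
  // matching the packuswb_128 CHECK constant.
  for (bool IsSigned : {true, false}) {
    std::printf(IsSigned ? "packsswb:" : "packuswb:");
    for (uint8_t E : Pack(A, Zero, IsSigned))
      std::printf(" %u", static_cast<unsigned>(E));
    std::printf("\n");
  }
  return 0;
}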