From b9c01fbcf77569dc5fd076ebfc85d155a275d2bb Mon Sep 17 00:00:00 2001
From: Simon Pilgrim
Date: Tue, 24 Jan 2017 11:46:13 +0000
Subject: [PATCH] [X86][SSE] Add support for constant folding vector arithmetic shift by immediates

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@292919 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/X86/X86ISelLowering.cpp       | 25 ++++++++++---
 test/CodeGen/X86/pmul.ll                 | 47 +++++++++++-------------
 test/CodeGen/X86/vector-idiv-sdiv-128.ll | 33 ++++++++---------
 test/CodeGen/X86/vector-idiv-udiv-128.ll |  3 +-
 4 files changed, 57 insertions(+), 51 deletions(-)

diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index 834a04c61d3..8cf7383b8d7 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -30464,8 +30464,10 @@ static SDValue combineVectorShift(SDNode *N, SelectionDAG &DAG,
                                   TargetLowering::DAGCombinerInfo &DCI,
                                   const X86Subtarget &Subtarget) {
   unsigned Opcode = N->getOpcode();
-  assert((X86ISD::VSHLI == Opcode || X86ISD::VSRLI == Opcode) &&
+  assert((X86ISD::VSHLI == Opcode || X86ISD::VSRAI == Opcode ||
+          X86ISD::VSRLI == Opcode) &&
          "Unexpected shift opcode");
+  bool LogicalShift = X86ISD::VSHLI == Opcode || X86ISD::VSRLI == Opcode;
   EVT VT = N->getValueType(0);
   unsigned NumElts = VT.getVectorNumElements();
   unsigned NumBitsPerElt = VT.getScalarSizeInBits();
@@ -30475,9 +30477,13 @@ static SDValue combineVectorShift(SDNode *N, SelectionDAG &DAG,
     return SDValue();

   // Out of range logical bit shifts are guaranteed to be zero.
+  // Out of range arithmetic bit shifts splat the sign bit.
   APInt ShiftVal = cast<ConstantSDNode>(N->getOperand(1))->getAPIntValue();
   if (ShiftVal.zextOrTrunc(8).uge(NumBitsPerElt))
-    return getZeroVector(VT.getSimpleVT(), Subtarget, DAG, SDLoc(N));
+    if (LogicalShift)
+      return getZeroVector(VT.getSimpleVT(), Subtarget, DAG, SDLoc(N));
+    else
+      ShiftVal = NumBitsPerElt - 1;

   SDValue N0 = N->getOperand(0);

@@ -30490,7 +30496,7 @@ static SDValue combineVectorShift(SDNode *N, SelectionDAG &DAG,
     return getZeroVector(VT.getSimpleVT(), Subtarget, DAG, SDLoc(N));

   // We can decode 'whole byte' logical bit shifts as shuffles.
-  if ((ShiftVal.getZExtValue() % 8) == 0) {
+  if (LogicalShift && (ShiftVal.getZExtValue() % 8) == 0) {
     SDValue Op(N, 0);
     SmallVector<int, 1> NonceMask; // Just a placeholder.
     NonceMask.push_back(0);
@@ -30506,9 +30512,15 @@ static SDValue combineVectorShift(SDNode *N, SelectionDAG &DAG,
   if (N->isOnlyUserOf(N0.getNode()) &&
       getTargetConstantBitsFromNode(N0, NumBitsPerElt, UndefElts, EltBits)) {
     assert(EltBits.size() == NumElts && "Unexpected shift value type");
-    for (APInt &Elt : EltBits)
-      Elt = X86ISD::VSHLI == Opcode ?
Elt.shl(ShiftVal.getZExtValue()) - : Elt.lshr(ShiftVal.getZExtValue()); + unsigned ShiftImm = ShiftVal.getZExtValue(); + for (APInt &Elt : EltBits) { + if (X86ISD::VSHLI == Opcode) + Elt = Elt.shl(ShiftImm); + else if (X86ISD::VSRAI == Opcode) + Elt = Elt.ashr(ShiftImm); + else + Elt = Elt.lshr(ShiftImm); + } return getConstVector(EltBits, UndefElts, VT.getSimpleVT(), DAG, SDLoc(N)); } @@ -34086,6 +34098,7 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N, case X86ISD::SETCC: return combineX86SetCC(N, DAG, DCI, Subtarget); case X86ISD::BRCOND: return combineBrCond(N, DAG, DCI, Subtarget); case X86ISD::VSHLI: + case X86ISD::VSRAI: case X86ISD::VSRLI: return combineVectorShift(N, DAG, DCI, Subtarget); case X86ISD::VSEXT: case X86ISD::VZEXT: return combineVSZext(N, DAG, DCI, Subtarget); diff --git a/test/CodeGen/X86/pmul.ll b/test/CodeGen/X86/pmul.ll index 7d9ef28a090..ac83653efbd 100644 --- a/test/CodeGen/X86/pmul.ll +++ b/test/CodeGen/X86/pmul.ll @@ -8,19 +8,18 @@ define <16 x i8> @mul_v16i8c(<16 x i8> %i) nounwind { ; SSE2-LABEL: mul_v16i8c: ; SSE2: # BB#0: # %entry -; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117] +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; SSE2-NEXT: psraw $8, %xmm1 -; SSE2-NEXT: movdqa %xmm0, %xmm2 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; SSE2-NEXT: psraw $8, %xmm2 -; SSE2-NEXT: pmullw %xmm1, %xmm2 +; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [117,117,117,117,117,117,117,117] +; SSE2-NEXT: pmullw %xmm2, %xmm1 ; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255] -; SSE2-NEXT: pand %xmm3, %xmm2 +; SSE2-NEXT: pand %xmm3, %xmm1 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSE2-NEXT: psraw $8, %xmm0 -; SSE2-NEXT: pmullw %xmm1, %xmm0 +; SSE2-NEXT: pmullw %xmm2, %xmm0 ; SSE2-NEXT: pand %xmm3, %xmm0 -; SSE2-NEXT: packuswb %xmm2, %xmm0 +; SSE2-NEXT: packuswb %xmm1, %xmm0 ; SSE2-NEXT: retq ; ; SSE41-LABEL: mul_v16i8c: @@ -382,29 +381,28 @@ entry: define <32 x i8> @mul_v32i8c(<32 x i8> %i) nounwind { ; SSE2-LABEL: mul_v32i8c: ; SSE2: # BB#0: # %entry -; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117] +; SSE2-NEXT: movdqa %xmm0, %xmm2 +; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; SSE2-NEXT: psraw $8, %xmm2 -; SSE2-NEXT: movdqa %xmm0, %xmm3 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; SSE2-NEXT: psraw $8, %xmm3 -; SSE2-NEXT: pmullw %xmm2, %xmm3 +; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [117,117,117,117,117,117,117,117] +; SSE2-NEXT: pmullw %xmm3, %xmm2 ; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255] -; SSE2-NEXT: pand %xmm4, %xmm3 +; SSE2-NEXT: pand %xmm4, %xmm2 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSE2-NEXT: psraw $8, %xmm0 -; SSE2-NEXT: pmullw %xmm2, %xmm0 +; SSE2-NEXT: pmullw %xmm3, %xmm0 ; SSE2-NEXT: pand %xmm4, %xmm0 -; SSE2-NEXT: packuswb %xmm3, %xmm0 -; SSE2-NEXT: movdqa %xmm1, %xmm3 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; SSE2-NEXT: psraw $8, %xmm3 -; SSE2-NEXT: pmullw %xmm2, %xmm3 -; SSE2-NEXT: pand %xmm4, %xmm3 +; SSE2-NEXT: packuswb %xmm2, %xmm0 +; SSE2-NEXT: movdqa %xmm1, %xmm2 +; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; SSE2-NEXT: 
psraw $8, %xmm2 +; SSE2-NEXT: pmullw %xmm3, %xmm2 +; SSE2-NEXT: pand %xmm4, %xmm2 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSE2-NEXT: psraw $8, %xmm1 -; SSE2-NEXT: pmullw %xmm2, %xmm1 +; SSE2-NEXT: pmullw %xmm3, %xmm1 ; SSE2-NEXT: pand %xmm4, %xmm1 -; SSE2-NEXT: packuswb %xmm3, %xmm1 +; SSE2-NEXT: packuswb %xmm2, %xmm1 ; SSE2-NEXT: retq ; ; SSE41-LABEL: mul_v32i8c: @@ -771,11 +769,10 @@ entry: define <64 x i8> @mul_v64i8c(<64 x i8> %i) nounwind { ; SSE2-LABEL: mul_v64i8c: ; SSE2: # BB#0: # %entry -; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117] -; SSE2-NEXT: psraw $8, %xmm4 ; SSE2-NEXT: movdqa %xmm0, %xmm6 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; SSE2-NEXT: psraw $8, %xmm6 +; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [117,117,117,117,117,117,117,117] ; SSE2-NEXT: pmullw %xmm4, %xmm6 ; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [255,255,255,255,255,255,255,255] ; SSE2-NEXT: pand %xmm5, %xmm6 diff --git a/test/CodeGen/X86/vector-idiv-sdiv-128.ll b/test/CodeGen/X86/vector-idiv-sdiv-128.ll index dfbfe06678f..933359fa084 100644 --- a/test/CodeGen/X86/vector-idiv-sdiv-128.ll +++ b/test/CodeGen/X86/vector-idiv-sdiv-128.ll @@ -175,19 +175,18 @@ define <8 x i16> @test_div7_8i16(<8 x i16> %a) nounwind { define <16 x i8> @test_div7_16i8(<16 x i8> %a) nounwind { ; SSE2-LABEL: test_div7_16i8: ; SSE2: # BB#0: -; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [147,147,147,147,147,147,147,147,147,147,147,147,147,147,147,147] +; SSE2-NEXT: movdqa %xmm0, %xmm2 +; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; SSE2-NEXT: psraw $8, %xmm2 -; SSE2-NEXT: movdqa %xmm0, %xmm3 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; SSE2-NEXT: psraw $8, %xmm3 -; SSE2-NEXT: pmullw %xmm2, %xmm3 -; SSE2-NEXT: psrlw $8, %xmm3 +; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [65427,65427,65427,65427,65427,65427,65427,65427] +; SSE2-NEXT: pmullw %xmm3, %xmm2 +; SSE2-NEXT: psrlw $8, %xmm2 ; SSE2-NEXT: movdqa %xmm0, %xmm1 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSE2-NEXT: psraw $8, %xmm1 -; SSE2-NEXT: pmullw %xmm2, %xmm1 +; SSE2-NEXT: pmullw %xmm3, %xmm1 ; SSE2-NEXT: psrlw $8, %xmm1 -; SSE2-NEXT: packuswb %xmm3, %xmm1 +; SSE2-NEXT: packuswb %xmm2, %xmm1 ; SSE2-NEXT: paddb %xmm0, %xmm1 ; SSE2-NEXT: movdqa %xmm1, %xmm0 ; SSE2-NEXT: psrlw $2, %xmm0 @@ -483,19 +482,18 @@ define <8 x i16> @test_rem7_8i16(<8 x i16> %a) nounwind { define <16 x i8> @test_rem7_16i8(<16 x i8> %a) nounwind { ; SSE2-LABEL: test_rem7_16i8: ; SSE2: # BB#0: -; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [147,147,147,147,147,147,147,147,147,147,147,147,147,147,147,147] +; SSE2-NEXT: movdqa %xmm0, %xmm2 +; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; SSE2-NEXT: psraw $8, %xmm2 -; SSE2-NEXT: movdqa %xmm0, %xmm3 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; SSE2-NEXT: psraw $8, %xmm3 -; SSE2-NEXT: pmullw %xmm2, %xmm3 -; SSE2-NEXT: psrlw $8, %xmm3 +; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [65427,65427,65427,65427,65427,65427,65427,65427] +; SSE2-NEXT: pmullw %xmm3, %xmm2 +; SSE2-NEXT: psrlw $8, %xmm2 ; SSE2-NEXT: movdqa %xmm0, %xmm1 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSE2-NEXT: psraw $8, %xmm1 -; SSE2-NEXT: pmullw %xmm2, %xmm1 +; SSE2-NEXT: pmullw %xmm3, %xmm1 ; SSE2-NEXT: psrlw $8, %xmm1 -; SSE2-NEXT: packuswb %xmm3, %xmm1 +; 
SSE2-NEXT: packuswb %xmm2, %xmm1 ; SSE2-NEXT: paddb %xmm0, %xmm1 ; SSE2-NEXT: movdqa %xmm1, %xmm2 ; SSE2-NEXT: psrlw $2, %xmm2 @@ -509,8 +507,7 @@ define <16 x i8> @test_rem7_16i8(<16 x i8> %a) nounwind { ; SSE2-NEXT: movdqa %xmm1, %xmm2 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; SSE2-NEXT: psraw $8, %xmm2 -; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] -; SSE2-NEXT: psraw $8, %xmm3 +; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [7,7,7,7,7,7,7,7] ; SSE2-NEXT: pmullw %xmm3, %xmm2 ; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255] ; SSE2-NEXT: pand %xmm4, %xmm2 diff --git a/test/CodeGen/X86/vector-idiv-udiv-128.ll b/test/CodeGen/X86/vector-idiv-udiv-128.ll index c4ecaa4a1bc..7857e585dca 100644 --- a/test/CodeGen/X86/vector-idiv-udiv-128.ll +++ b/test/CodeGen/X86/vector-idiv-udiv-128.ll @@ -486,8 +486,7 @@ define <16 x i8> @test_rem7_16i8(<16 x i8> %a) nounwind { ; SSE2-NEXT: movdqa %xmm1, %xmm2 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; SSE2-NEXT: psraw $8, %xmm2 -; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] -; SSE2-NEXT: psraw $8, %xmm3 +; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [7,7,7,7,7,7,7,7] ; SSE2-NEXT: pmullw %xmm3, %xmm2 ; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255] ; SSE2-NEXT: pand %xmm4, %xmm2 -- 2.50.1
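
For reference, below is a minimal standalone sketch (not part of the patch) of the folding rules the combineVectorShift change implements: out-of-range logical shifts (VSHLI/VSRLI) fold to zero, out-of-range arithmetic shifts (VSRAI) clamp to NumBitsPerElt - 1 so the sign bit is splatted, and in-range shifts fold element-wise with shl/lshr/ashr. It models a single 8-bit lane with plain integers instead of APInt; the helper name foldLane and the ShiftOp enum are illustrative only.

// Standalone C++ illustration of the constant-folding rules above, using one
// 8-bit lane and plain integers in place of APInt (names are illustrative).
#include <cstdint>
#include <cstdio>

enum ShiftOp { VSHLI, VSRLI, VSRAI }; // vector shift-by-immediate node kinds

static uint8_t foldLane(uint8_t Lane, unsigned Amt, ShiftOp Op) {
  const unsigned NumBitsPerElt = 8;
  const bool LogicalShift = (Op == VSHLI || Op == VSRLI);
  if (Amt >= NumBitsPerElt) {
    if (LogicalShift)
      return 0;                // out-of-range logical shifts fold to zero
    Amt = NumBitsPerElt - 1;   // arithmetic shifts clamp and splat the sign bit
  }
  switch (Op) {
  case VSHLI: return uint8_t(Lane << Amt);
  case VSRLI: return uint8_t(Lane >> Amt);
  case VSRAI: return uint8_t(int8_t(Lane) >> Amt); // sign-extending shift
  }
  return 0;
}

int main() {
  // 0x90 is -112 as a signed byte.
  printf("%02x\n", (unsigned)foldLane(0x90, 9, VSRAI)); // ff: sign bit splatted
  printf("%02x\n", (unsigned)foldLane(0x90, 9, VSRLI)); // 00: logical shift zeroes out
  printf("%02x\n", (unsigned)foldLane(0x90, 2, VSRAI)); // e4: -112 >> 2 == -28
  return 0;
}

The clamp to NumBitsPerElt - 1 for the arithmetic case mirrors the hardware behaviour of psraw/psrad, where shift counts at or above the element width still fill each element with copies of its sign bit rather than producing zero.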