STATISTIC(NumTailCalls, "Number of tail calls");
static cl::opt<bool> ExperimentalVectorWideningLegalization(
- "x86-experimental-vector-widening-legalization", cl::init(false),
+ "x86-experimental-vector-widening-legalization", cl::init(true),
cl::desc("Enable an experimental vector type legalization through widening "
"rather than promotion."),
cl::Hidden);
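With cl::init(true) the widening path becomes the default. The option is still a hidden cl::opt, so the old promotion-based legalization remains reachable for comparison; as a sketch (assuming the usual cl::opt plumbing), something like
  opt -x86-experimental-vector-widening-legalization=false ...
or, via the clang driver, -mllvm -x86-experimental-vector-widening-legalization=false.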
bool NoImplicitFloatOps = F.hasFnAttribute(Attribute::NoImplicitFloat);
bool F64IsLegal =
!Subtarget.useSoftFloat() && !NoImplicitFloatOps && Subtarget.hasSSE2();
- if (((VT.isVector() && !VT.isFloatingPoint()) ||
- (VT == MVT::i64 && F64IsLegal && !Subtarget.is64Bit())) &&
+ if (VT == MVT::i64 && F64IsLegal && !Subtarget.is64Bit() &&
isa<LoadSDNode>(St->getValue()) &&
!cast<LoadSDNode>(St->getValue())->isVolatile() &&
St->getChain().hasOneUse() && !St->isVolatile()) {
int X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index,
Type *SubTp) {
// 64-bit packed float vectors (v2f32) are widened to type v4f32.
- // 64-bit packed integer vectors (v2i32) are promoted to type v2i64.
+ // 64-bit packed integer vectors (v2i32) are widened to type v4i32.
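+ // For example, a <2 x i32> shuffle is costed as the corresponding <4 x i32>
+ // shuffle, with the extra widened lanes treated as undef.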
std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp);
// Treat Transpose as 2-op shuffles - there's no difference in lowering.
int X86TTIImpl::getArithmeticReductionCost(unsigned Opcode, Type *ValTy,
bool IsPairwise) {
-
- std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, ValTy);
-
- MVT MTy = LT.second;
-
- int ISD = TLI->InstructionOpcodeToISD(Opcode);
- assert(ISD && "Invalid opcode");
-
// We use the Intel Architecture Code Analyzer (IACA) to measure the throughput
// and use that measurement as the cost.
{ ISD::FADD, MVT::v2f64, 2 },
{ ISD::FADD, MVT::v4f32, 4 },
{ ISD::ADD, MVT::v2i64, 2 }, // The data reported by the IACA tool is "1.6".
+ { ISD::ADD, MVT::v2i32, 2 }, // FIXME: chosen to be less than v4i32.
{ ISD::ADD, MVT::v4i32, 3 }, // The data reported by the IACA tool is "3.5".
+ { ISD::ADD, MVT::v2i16, 3 }, // FIXME: chosen to be less than v4i16
+ { ISD::ADD, MVT::v4i16, 4 }, // FIXME: chosen to be less than v8i16
{ ISD::ADD, MVT::v8i16, 5 },
};
{ ISD::FADD, MVT::v4f64, 5 },
{ ISD::FADD, MVT::v8f32, 7 },
{ ISD::ADD, MVT::v2i64, 1 }, // The data reported by the IACA tool is "1.5".
+ { ISD::ADD, MVT::v2i32, 2 }, // FIXME: chosen to be less than v4i32
{ ISD::ADD, MVT::v4i32, 3 }, // The data reported by the IACA tool is "3.5".
{ ISD::ADD, MVT::v4i64, 5 }, // The data reported by the IACA tool is "4.8".
+ { ISD::ADD, MVT::v2i16, 3 }, // FIXME: chosen to be less than v4i16
+ { ISD::ADD, MVT::v4i16, 4 }, // FIXME: chosen to be less than v8i16
{ ISD::ADD, MVT::v8i16, 5 },
{ ISD::ADD, MVT::v8i32, 5 },
};
{ ISD::FADD, MVT::v2f64, 2 },
{ ISD::FADD, MVT::v4f32, 4 },
{ ISD::ADD, MVT::v2i64, 2 }, // The data reported by the IACA tool is "1.6".
+ { ISD::ADD, MVT::v2i32, 2 }, // FIXME: chosen to be less than v4i32
{ ISD::ADD, MVT::v4i32, 3 }, // The data reported by the IACA tool is "3.3".
+ { ISD::ADD, MVT::v2i16, 2 }, // The data reported by the IACA tool is "4.3".
+ { ISD::ADD, MVT::v4i16, 3 }, // The data reported by the IACA tool is "4.3".
{ ISD::ADD, MVT::v8i16, 4 }, // The data reported by the IACA tool is "4.3".
};
{ ISD::FADD, MVT::v4f64, 3 },
{ ISD::FADD, MVT::v8f32, 4 },
{ ISD::ADD, MVT::v2i64, 1 }, // The data reported by the IACA tool is "1.5".
+ { ISD::ADD, MVT::v2i32, 2 }, // FIXME: chosen to be less than v4i32
{ ISD::ADD, MVT::v4i32, 3 }, // The data reported by the IACA tool is "2.8".
{ ISD::ADD, MVT::v4i64, 3 },
+ { ISD::ADD, MVT::v2i16, 2 }, // The data reported by the IACA tool is "4.3".
+ { ISD::ADD, MVT::v4i16, 3 }, // The data reported by the IACA tool is "4.3".
{ ISD::ADD, MVT::v8i16, 4 },
{ ISD::ADD, MVT::v8i32, 5 },
};
+ int ISD = TLI->InstructionOpcodeToISD(Opcode);
+ assert(ISD && "Invalid opcode");
+
+ // Before legalizing the type, give a chance to look up illegal narrow types
+ // in the table.
+ // FIXME: Is there a better way to do this?
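+ // For example, a v2i16 or v4i16 reduction can hit the narrow entries added
+ // to the tables above directly, instead of being costed via the legalized
+ // v8i16 type.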
+ EVT VT = TLI->getValueType(DL, ValTy);
+ if (VT.isSimple()) {
+ MVT MTy = VT.getSimpleVT();
+ if (IsPairwise) {
+ if (ST->hasAVX())
+ if (const auto *Entry = CostTableLookup(AVX1CostTblPairWise, ISD, MTy))
+ return Entry->Cost;
+
+ if (ST->hasSSE42())
+ if (const auto *Entry = CostTableLookup(SSE42CostTblPairWise, ISD, MTy))
+ return Entry->Cost;
+ } else {
+ if (ST->hasAVX())
+ if (const auto *Entry = CostTableLookup(AVX1CostTblNoPairWise, ISD, MTy))
+ return Entry->Cost;
+
+ if (ST->hasSSE42())
+ if (const auto *Entry = CostTableLookup(SSE42CostTblNoPairWise, ISD, MTy))
+ return Entry->Cost;
+ }
+ }
+
+ std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, ValTy);
+
+ MVT MTy = LT.second;
+
if (IsPairwise) {
if (ST->hasAVX())
if (const auto *Entry = CostTableLookup(AVX1CostTblPairWise, ISD, MTy))
; 64-bit packed float vectors (v2f32) are widened to type v4f32.
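; Likewise, 64-bit packed integer vectors (v2i32) are widened to type v4i32,
; so the v2i32 shuffles below are costed on v4i32.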
define <2 x i32> @test_v2i32(<2 x i32> %a, <2 x i32> %b) {
-; CHECK-LABEL: 'test_v2i32'
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %1 = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> <i32 0, i32 3>
-; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <2 x i32> %1
+; SSE2-LABEL: 'test_v2i32'
+; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %1 = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> <i32 0, i32 3>
+; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <2 x i32> %1
+;
+; SSSE3-LABEL: 'test_v2i32'
+; SSSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %1 = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> <i32 0, i32 3>
+; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <2 x i32> %1
+;
+; SSE42-LABEL: 'test_v2i32'
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %1 = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> <i32 0, i32 3>
+; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <2 x i32> %1
+;
+; AVX-LABEL: 'test_v2i32'
+; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %1 = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> <i32 0, i32 3>
+; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <2 x i32> %1
;
; BTVER2-LABEL: 'test_v2i32'
; BTVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %1 = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> <i32 0, i32 3>
}
define <2 x i32> @test_v2i32_2(<2 x i32> %a, <2 x i32> %b) {
-; CHECK-LABEL: 'test_v2i32_2'
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %1 = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> <i32 2, i32 1>
-; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <2 x i32> %1
+; SSE2-LABEL: 'test_v2i32_2'
+; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %1 = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> <i32 2, i32 1>
+; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <2 x i32> %1
+;
+; SSSE3-LABEL: 'test_v2i32_2'
+; SSSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %1 = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> <i32 2, i32 1>
+; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <2 x i32> %1
+;
+; SSE42-LABEL: 'test_v2i32_2'
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %1 = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> <i32 2, i32 1>
+; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <2 x i32> %1
+;
+; AVX-LABEL: 'test_v2i32_2'
+; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %1 = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> <i32 2, i32 1>
+; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <2 x i32> %1
;
; BTVER2-LABEL: 'test_v2i32_2'
; BTVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %1 = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> <i32 2, i32 1>
; A <2 x i64> vector multiply is implemented using
; 3 PMULUDQ and 2 PADDS and 4 shifts.
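; Roughly (sketch): with a = [a1:a0] and b = [b1:b0] split into 32-bit halves,
; a*b = a0*b0 + ((a0*b1 + a1*b0) << 32) mod 2^64, where each 32x32->64 partial
; product maps to one PMULUDQ and the two additions to the PADDs above.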
define void @mul_2i32() {
-; SSE-LABEL: 'mul_2i32'
-; SSE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %A0 = mul <2 x i32> undef, undef
-; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+; SSSE3-LABEL: 'mul_2i32'
+; SSSE3-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %A0 = mul <2 x i32> undef, undef
+; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; SSE42-LABEL: 'mul_2i32'
+; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %A0 = mul <2 x i32> undef, undef
+; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
;
; AVX-LABEL: 'mul_2i32'
-; AVX-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %A0 = mul <2 x i32> undef, undef
+; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %A0 = mul <2 x i32> undef, undef
; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
;
-; AVX512F-LABEL: 'mul_2i32'
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %A0 = mul <2 x i32> undef, undef
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
-;
-; AVX512BW-LABEL: 'mul_2i32'
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %A0 = mul <2 x i32> undef, undef
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
-;
-; AVX512DQ-LABEL: 'mul_2i32'
-; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %A0 = mul <2 x i32> undef, undef
-; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+; AVX512-LABEL: 'mul_2i32'
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %A0 = mul <2 x i32> undef, undef
+; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
;
; SLM-LABEL: 'mul_2i32'
-; SLM-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %A0 = mul <2 x i32> undef, undef
+; SLM-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %A0 = mul <2 x i32> undef, undef
; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
;
; GLM-LABEL: 'mul_2i32'
-; GLM-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %A0 = mul <2 x i32> undef, undef
+; GLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %A0 = mul <2 x i32> undef, undef
; GLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
;
; BTVER2-LABEL: 'mul_2i32'
-; BTVER2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %A0 = mul <2 x i32> undef, undef
+; BTVER2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %A0 = mul <2 x i32> undef, undef
; BTVER2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
;
%A0 = mul <2 x i32> undef, undef
; SSE-LABEL: 'sitofp4'
; SSE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %A1 = sitofp <4 x i1> %a to <4 x float>
; SSE-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %A2 = sitofp <4 x i1> %a to <4 x double>
-; SSE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %B1 = sitofp <4 x i8> %b to <4 x float>
-; SSE-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %B2 = sitofp <4 x i8> %b to <4 x double>
-; SSE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %C1 = sitofp <4 x i16> %c to <4 x float>
-; SSE-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %C2 = sitofp <4 x i16> %c to <4 x double>
+; SSE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %B1 = sitofp <4 x i8> %b to <4 x float>
+; SSE-NEXT: Cost Model: Found an estimated cost of 160 for instruction: %B2 = sitofp <4 x i8> %b to <4 x double>
+; SSE-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %C1 = sitofp <4 x i16> %c to <4 x float>
+; SSE-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %C2 = sitofp <4 x i16> %c to <4 x double>
; SSE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %D1 = sitofp <4 x i32> %d to <4 x float>
; SSE-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %D2 = sitofp <4 x i32> %d to <4 x double>
; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
define void @sitofp8(<8 x i1> %a, <8 x i8> %b, <8 x i16> %c, <8 x i32> %d) {
; SSE-LABEL: 'sitofp8'
; SSE-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %A1 = sitofp <8 x i1> %a to <8 x float>
-; SSE-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %B1 = sitofp <8 x i8> %b to <8 x float>
+; SSE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %B1 = sitofp <8 x i8> %b to <8 x float>
; SSE-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %C1 = sitofp <8 x i16> %c to <8 x float>
; SSE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %D1 = sitofp <8 x i32> %d to <8 x float>
; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
; SSE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %A1 = uitofp <4 x i1> %a to <4 x float>
; SSE-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %A2 = uitofp <4 x i1> %a to <4 x double>
; SSE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %B1 = uitofp <4 x i8> %b to <4 x float>
-; SSE-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %B2 = uitofp <4 x i8> %b to <4 x double>
-; SSE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %C1 = uitofp <4 x i16> %c to <4 x float>
-; SSE-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %C2 = uitofp <4 x i16> %c to <4 x double>
+; SSE-NEXT: Cost Model: Found an estimated cost of 160 for instruction: %B2 = uitofp <4 x i8> %b to <4 x double>
+; SSE-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %C1 = uitofp <4 x i16> %c to <4 x float>
+; SSE-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %C2 = uitofp <4 x i16> %c to <4 x double>
; SSE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %D1 = uitofp <4 x i32> %d to <4 x float>
; SSE-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %D2 = uitofp <4 x i32> %d to <4 x double>
; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
define void @uitofp8(<8 x i1> %a, <8 x i8> %b, <8 x i16> %c, <8 x i32> %d) {
; SSE-LABEL: 'uitofp8'
; SSE-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %A1 = uitofp <8 x i1> %a to <8 x float>
-; SSE-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %B1 = uitofp <8 x i8> %b to <8 x float>
+; SSE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %B1 = uitofp <8 x i8> %b to <8 x float>
; SSE-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %C1 = uitofp <8 x i16> %c to <8 x float>
; SSE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %D1 = uitofp <8 x i32> %d to <8 x float>
; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
define i32 @fptosi_double_i16(i32 %arg) {
; SSE-LABEL: 'fptosi_double_i16'
; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = fptosi double undef to i16
-; SSE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I16 = fptosi <2 x double> undef to <2 x i16>
-; SSE-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V4I16 = fptosi <4 x double> undef to <4 x i16>
-; SSE-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %V8I16 = fptosi <8 x double> undef to <8 x i16>
+; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I16 = fptosi <2 x double> undef to <2 x i16>
+; SSE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4I16 = fptosi <4 x double> undef to <4 x i16>
+; SSE-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8I16 = fptosi <8 x double> undef to <8 x i16>
; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
;
; AVX-LABEL: 'fptosi_double_i16'
; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = fptosi double undef to i16
-; AVX-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I16 = fptosi <2 x double> undef to <2 x i16>
+; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I16 = fptosi <2 x double> undef to <2 x i16>
; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I16 = fptosi <4 x double> undef to <4 x i16>
; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V8I16 = fptosi <8 x double> undef to <8 x i16>
; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
;
-; AVX512F-LABEL: 'fptosi_double_i16'
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = fptosi double undef to i16
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I16 = fptosi <2 x double> undef to <2 x i16>
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I16 = fptosi <4 x double> undef to <4 x i16>
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = fptosi <8 x double> undef to <8 x i16>
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
-;
-; AVX512DQ-LABEL: 'fptosi_double_i16'
-; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = fptosi double undef to i16
-; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I16 = fptosi <2 x double> undef to <2 x i16>
-; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I16 = fptosi <4 x double> undef to <4 x i16>
-; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = fptosi <8 x double> undef to <8 x i16>
-; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+; AVX512-LABEL: 'fptosi_double_i16'
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = fptosi double undef to i16
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I16 = fptosi <2 x double> undef to <2 x i16>
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I16 = fptosi <4 x double> undef to <4 x i16>
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = fptosi <8 x double> undef to <8 x i16>
+; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
;
; BTVER2-LABEL: 'fptosi_double_i16'
; BTVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = fptosi double undef to i16
-; BTVER2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I16 = fptosi <2 x double> undef to <2 x i16>
+; BTVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I16 = fptosi <2 x double> undef to <2 x i16>
; BTVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I16 = fptosi <4 x double> undef to <4 x i16>
; BTVER2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V8I16 = fptosi <8 x double> undef to <8 x i16>
; BTVER2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
; AVX-LABEL: 'fptosi_double_i8'
; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = fptosi double undef to i8
; AVX-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I8 = fptosi <2 x double> undef to <2 x i8>
-; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I8 = fptosi <4 x double> undef to <4 x i8>
-; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V8I8 = fptosi <8 x double> undef to <8 x i8>
+; AVX-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I8 = fptosi <4 x double> undef to <4 x i8>
+; AVX-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %V8I8 = fptosi <8 x double> undef to <8 x i8>
; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
;
-; AVX512F-LABEL: 'fptosi_double_i8'
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = fptosi double undef to i8
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I8 = fptosi <2 x double> undef to <2 x i8>
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I8 = fptosi <4 x double> undef to <4 x i8>
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I8 = fptosi <8 x double> undef to <8 x i8>
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
-;
-; AVX512DQ-LABEL: 'fptosi_double_i8'
-; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = fptosi double undef to i8
-; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I8 = fptosi <2 x double> undef to <2 x i8>
-; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I8 = fptosi <4 x double> undef to <4 x i8>
-; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I8 = fptosi <8 x double> undef to <8 x i8>
-; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+; AVX512-LABEL: 'fptosi_double_i8'
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = fptosi double undef to i8
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I8 = fptosi <2 x double> undef to <2 x i8>
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I8 = fptosi <4 x double> undef to <4 x i8>
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I8 = fptosi <8 x double> undef to <8 x i8>
+; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
;
; BTVER2-LABEL: 'fptosi_double_i8'
; BTVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = fptosi double undef to i8
; BTVER2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I8 = fptosi <2 x double> undef to <2 x i8>
-; BTVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I8 = fptosi <4 x double> undef to <4 x i8>
-; BTVER2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V8I8 = fptosi <8 x double> undef to <8 x i8>
+; BTVER2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I8 = fptosi <4 x double> undef to <4 x i8>
+; BTVER2-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %V8I8 = fptosi <8 x double> undef to <8 x i8>
; BTVER2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
;
%I8 = fptosi double undef to i8
define i32 @fptosi_float_i8(i32 %arg) {
; SSE-LABEL: 'fptosi_float_i8'
; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = fptosi float undef to i8
-; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I8 = fptosi <4 x float> undef to <4 x i8>
-; SSE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V8I8 = fptosi <8 x float> undef to <8 x i8>
-; SSE-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V16I8 = fptosi <16 x float> undef to <16 x i8>
+; SSE-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I8 = fptosi <4 x float> undef to <4 x i8>
+; SSE-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %V8I8 = fptosi <8 x float> undef to <8 x i8>
+; SSE-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %V16I8 = fptosi <16 x float> undef to <16 x i8>
; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
;
; AVX-LABEL: 'fptosi_float_i8'
; AVX-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %V8I32 = fptoui <8 x double> undef to <8 x i32>
; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
;
-; AVX512F-LABEL: 'fptoui_double_i32'
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = fptoui double undef to i32
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I32 = fptoui <2 x double> undef to <2 x i32>
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = fptoui <4 x double> undef to <4 x i32>
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = fptoui <8 x double> undef to <8 x i32>
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
-;
-; AVX512DQ-LABEL: 'fptoui_double_i32'
-; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = fptoui double undef to i32
-; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I32 = fptoui <2 x double> undef to <2 x i32>
-; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = fptoui <4 x double> undef to <4 x i32>
-; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = fptoui <8 x double> undef to <8 x i32>
-; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+; AVX512-LABEL: 'fptoui_double_i32'
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = fptoui double undef to i32
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I32 = fptoui <2 x double> undef to <2 x i32>
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = fptoui <4 x double> undef to <4 x i32>
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = fptoui <8 x double> undef to <8 x i32>
+; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
;
; BTVER2-LABEL: 'fptoui_double_i32'
; BTVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = fptoui double undef to i32
;
; AVX-LABEL: 'fptoui_double_i16'
; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = fptoui double undef to i16
-; AVX-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I16 = fptoui <2 x double> undef to <2 x i16>
-; AVX-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I16 = fptoui <4 x double> undef to <4 x i16>
-; AVX-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %V8I16 = fptoui <8 x double> undef to <8 x i16>
+; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I16 = fptoui <2 x double> undef to <2 x i16>
+; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I16 = fptoui <4 x double> undef to <4 x i16>
+; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V8I16 = fptoui <8 x double> undef to <8 x i16>
; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
;
-; AVX512F-LABEL: 'fptoui_double_i16'
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = fptoui double undef to i16
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I16 = fptoui <2 x double> undef to <2 x i16>
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I16 = fptoui <4 x double> undef to <4 x i16>
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = fptoui <8 x double> undef to <8 x i16>
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
-;
-; AVX512DQ-LABEL: 'fptoui_double_i16'
-; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = fptoui double undef to i16
-; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I16 = fptoui <2 x double> undef to <2 x i16>
-; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I16 = fptoui <4 x double> undef to <4 x i16>
-; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = fptoui <8 x double> undef to <8 x i16>
-; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+; AVX512-LABEL: 'fptoui_double_i16'
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = fptoui double undef to i16
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I16 = fptoui <2 x double> undef to <2 x i16>
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I16 = fptoui <4 x double> undef to <4 x i16>
+; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = fptoui <8 x double> undef to <8 x i16>
+; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
;
; BTVER2-LABEL: 'fptoui_double_i16'
; BTVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = fptoui double undef to i16
-; BTVER2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I16 = fptoui <2 x double> undef to <2 x i16>
-; BTVER2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I16 = fptoui <4 x double> undef to <4 x i16>
-; BTVER2-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %V8I16 = fptoui <8 x double> undef to <8 x i16>
+; BTVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I16 = fptoui <2 x double> undef to <2 x i16>
+; BTVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I16 = fptoui <4 x double> undef to <4 x i16>
+; BTVER2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V8I16 = fptoui <8 x double> undef to <8 x i16>
; BTVER2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
;
%I16 = fptoui double undef to i16
; AVX-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %V8I8 = fptoui <8 x double> undef to <8 x i8>
; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
;
-; AVX512F-LABEL: 'fptoui_double_i8'
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = fptoui double undef to i8
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I8 = fptoui <2 x double> undef to <2 x i8>
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I8 = fptoui <4 x double> undef to <4 x i8>
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I8 = fptoui <8 x double> undef to <8 x i8>
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
-;
-; AVX512DQ-LABEL: 'fptoui_double_i8'
-; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = fptoui double undef to i8
-; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I8 = fptoui <2 x double> undef to <2 x i8>
-; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I8 = fptoui <4 x double> undef to <4 x i8>
-; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I8 = fptoui <8 x double> undef to <8 x i8>
-; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+; AVX512-LABEL: 'fptoui_double_i8'
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = fptoui double undef to i8
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I8 = fptoui <2 x double> undef to <2 x i8>
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I8 = fptoui <4 x double> undef to <4 x i8>
+; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I8 = fptoui <8 x double> undef to <8 x i8>
+; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
;
; BTVER2-LABEL: 'fptoui_double_i8'
; BTVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = fptoui double undef to i8
;
; AVX-LABEL: 'fptoui_float_i16'
; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = fptoui float undef to i16
-; AVX-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I16 = fptoui <4 x float> undef to <4 x i16>
+; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I16 = fptoui <4 x float> undef to <4 x i16>
; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = fptoui <8 x float> undef to <8 x i16>
; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V16I16 = fptoui <16 x float> undef to <16 x i16>
; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
;
; BTVER2-LABEL: 'fptoui_float_i16'
; BTVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = fptoui float undef to i16
-; BTVER2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I16 = fptoui <4 x float> undef to <4 x i16>
+; BTVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I16 = fptoui <4 x float> undef to <4 x i16>
; BTVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = fptoui <8 x float> undef to <8 x i16>
; BTVER2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V16I16 = fptoui <16 x float> undef to <16 x i16>
; BTVER2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
; AVX-LABEL: 'fptoui_float_i8'
; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = fptoui float undef to i8
; AVX-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I8 = fptoui <4 x float> undef to <4 x i8>
-; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I8 = fptoui <8 x float> undef to <8 x i8>
-; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V16I8 = fptoui <16 x float> undef to <16 x i8>
+; AVX-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I8 = fptoui <8 x float> undef to <8 x i8>
+; AVX-NEXT: Cost Model: Found an estimated cost of 49 for instruction: %V16I8 = fptoui <16 x float> undef to <16 x i8>
; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
;
; AVX512-LABEL: 'fptoui_float_i8'
; BTVER2-LABEL: 'fptoui_float_i8'
; BTVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = fptoui float undef to i8
; BTVER2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I8 = fptoui <4 x float> undef to <4 x i8>
-; BTVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I8 = fptoui <8 x float> undef to <8 x i8>
-; BTVER2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V16I8 = fptoui <16 x float> undef to <16 x i8>
+; BTVER2-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I8 = fptoui <8 x float> undef to <8 x i8>
+; BTVER2-NEXT: Cost Model: Found an estimated cost of 49 for instruction: %V16I8 = fptoui <16 x float> undef to <16 x i8>
; BTVER2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
;
%I8 = fptoui float undef to i8
; AVX-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I32 = call <16 x i32> @llvm.masked.load.v16i32.p0v16i32(<16 x i32>* undef, i32 1, <16 x i1> undef, <16 x i32> undef)
; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I32 = call <8 x i32> @llvm.masked.load.v8i32.p0v8i32(<8 x i32>* undef, i32 1, <8 x i1> undef, <8 x i32> undef)
; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* undef, i32 1, <4 x i1> undef, <4 x i32> undef)
-; AVX-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I32 = call <2 x i32> @llvm.masked.load.v2i32.p0v2i32(<2 x i32>* undef, i32 1, <2 x i1> undef, <2 x i32> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I32 = call <2 x i32> @llvm.masked.load.v2i32.p0v2i32(<2 x i32>* undef, i32 1, <2 x i1> undef, <2 x i32> undef)
; AVX-NEXT: Cost Model: Found an estimated cost of 128 for instruction: %V32I16 = call <32 x i16> @llvm.masked.load.v32i16.p0v32i16(<32 x i16>* undef, i32 1, <32 x i1> undef, <32 x i16> undef)
; AVX-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V16I16 = call <16 x i16> @llvm.masked.load.v16i16.p0v16i16(<16 x i16>* undef, i32 1, <16 x i1> undef, <16 x i16> undef)
; AVX-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V8I16 = call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* undef, i32 1, <8 x i1> undef, <8 x i16> undef)
; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I32 = call <16 x i32> @llvm.masked.load.v16i32.p0v16i32(<16 x i32>* undef, i32 1, <16 x i1> undef, <16 x i32> undef)
; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = call <8 x i32> @llvm.masked.load.v8i32.p0v8i32(<8 x i32>* undef, i32 1, <8 x i1> undef, <8 x i32> undef)
; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* undef, i32 1, <4 x i1> undef, <4 x i32> undef)
-; KNL-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2I32 = call <2 x i32> @llvm.masked.load.v2i32.p0v2i32(<2 x i32>* undef, i32 1, <2 x i1> undef, <2 x i32> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2I32 = call <2 x i32> @llvm.masked.load.v2i32.p0v2i32(<2 x i32>* undef, i32 1, <2 x i1> undef, <2 x i32> undef)
; KNL-NEXT: Cost Model: Found an estimated cost of 128 for instruction: %V32I16 = call <32 x i16> @llvm.masked.load.v32i16.p0v32i16(<32 x i16>* undef, i32 1, <32 x i1> undef, <32 x i16> undef)
; KNL-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V16I16 = call <16 x i16> @llvm.masked.load.v16i16.p0v16i16(<16 x i16>* undef, i32 1, <16 x i1> undef, <16 x i16> undef)
; KNL-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V8I16 = call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* undef, i32 1, <8 x i1> undef, <8 x i16> undef)
; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I32 = call <16 x i32> @llvm.masked.load.v16i32.p0v16i32(<16 x i32>* undef, i32 1, <16 x i1> undef, <16 x i32> undef)
; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = call <8 x i32> @llvm.masked.load.v8i32.p0v8i32(<8 x i32>* undef, i32 1, <8 x i1> undef, <8 x i32> undef)
; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* undef, i32 1, <4 x i1> undef, <4 x i32> undef)
-; SKX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2I32 = call <2 x i32> @llvm.masked.load.v2i32.p0v2i32(<2 x i32>* undef, i32 1, <2 x i1> undef, <2 x i32> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2I32 = call <2 x i32> @llvm.masked.load.v2i32.p0v2i32(<2 x i32>* undef, i32 1, <2 x i1> undef, <2 x i32> undef)
; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I16 = call <32 x i16> @llvm.masked.load.v32i16.p0v32i16(<32 x i16>* undef, i32 1, <32 x i1> undef, <32 x i16> undef)
; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = call <16 x i16> @llvm.masked.load.v16i16.p0v16i16(<16 x i16>* undef, i32 1, <16 x i1> undef, <16 x i16> undef)
; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* undef, i32 1, <8 x i1> undef, <8 x i16> undef)
-; SKX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4I16 = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* undef, i32 1, <4 x i1> undef, <4 x i16> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V4I16 = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* undef, i32 1, <4 x i1> undef, <4 x i16> undef)
; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64I8 = call <64 x i8> @llvm.masked.load.v64i8.p0v64i8(<64 x i8>* undef, i32 1, <64 x i1> undef, <64 x i8> undef)
; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I8 = call <32 x i8> @llvm.masked.load.v32i8.p0v32i8(<32 x i8>* undef, i32 1, <32 x i1> undef, <32 x i8> undef)
; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* undef, i32 1, <16 x i1> undef, <16 x i8> undef)
-; SKX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V8I8 = call <8 x i8> @llvm.masked.load.v8i8.p0v8i8(<8 x i8>* undef, i32 1, <8 x i1> undef, <8 x i8> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V8I8 = call <8 x i8> @llvm.masked.load.v8i8.p0v8i8(<8 x i8>* undef, i32 1, <8 x i1> undef, <8 x i8> undef)
; SKX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 0
;
%V8F64 = call <8 x double> @llvm.masked.load.v8f64.p0v8f64(<8 x double>* undef, i32 1, <8 x i1> undef, <8 x double> undef)
; AVX-NEXT: Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.store.v16i32.p0v16i32(<16 x i32> undef, <16 x i32>* undef, i32 1, <16 x i1> undef)
; AVX-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.store.v8i32.p0v8i32(<8 x i32> undef, <8 x i32>* undef, i32 1, <8 x i1> undef)
; AVX-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> undef, <4 x i32>* undef, i32 1, <4 x i1> undef)
-; AVX-NEXT: Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.store.v2i32.p0v2i32(<2 x i32> undef, <2 x i32>* undef, i32 1, <2 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.store.v2i32.p0v2i32(<2 x i32> undef, <2 x i32>* undef, i32 1, <2 x i1> undef)
; AVX-NEXT: Cost Model: Found an estimated cost of 128 for instruction: call void @llvm.masked.store.v32i16.p0v32i16(<32 x i16> undef, <32 x i16>* undef, i32 1, <32 x i1> undef)
; AVX-NEXT: Cost Model: Found an estimated cost of 64 for instruction: call void @llvm.masked.store.v16i16.p0v16i16(<16 x i16> undef, <16 x i16>* undef, i32 1, <16 x i1> undef)
; AVX-NEXT: Cost Model: Found an estimated cost of 32 for instruction: call void @llvm.masked.store.v8i16.p0v8i16(<8 x i16> undef, <8 x i16>* undef, i32 1, <8 x i1> undef)
; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v16i32.p0v16i32(<16 x i32> undef, <16 x i32>* undef, i32 1, <16 x i1> undef)
; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v8i32.p0v8i32(<8 x i32> undef, <8 x i32>* undef, i32 1, <8 x i1> undef)
; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> undef, <4 x i32>* undef, i32 1, <4 x i1> undef)
-; KNL-NEXT: Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.store.v2i32.p0v2i32(<2 x i32> undef, <2 x i32>* undef, i32 1, <2 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.store.v2i32.p0v2i32(<2 x i32> undef, <2 x i32>* undef, i32 1, <2 x i1> undef)
; KNL-NEXT: Cost Model: Found an estimated cost of 128 for instruction: call void @llvm.masked.store.v32i16.p0v32i16(<32 x i16> undef, <32 x i16>* undef, i32 1, <32 x i1> undef)
; KNL-NEXT: Cost Model: Found an estimated cost of 64 for instruction: call void @llvm.masked.store.v16i16.p0v16i16(<16 x i16> undef, <16 x i16>* undef, i32 1, <16 x i1> undef)
; KNL-NEXT: Cost Model: Found an estimated cost of 32 for instruction: call void @llvm.masked.store.v8i16.p0v8i16(<8 x i16> undef, <8 x i16>* undef, i32 1, <8 x i1> undef)
; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v16i32.p0v16i32(<16 x i32> undef, <16 x i32>* undef, i32 1, <16 x i1> undef)
; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v8i32.p0v8i32(<8 x i32> undef, <8 x i32>* undef, i32 1, <8 x i1> undef)
; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> undef, <4 x i32>* undef, i32 1, <4 x i1> undef)
-; SKX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.store.v2i32.p0v2i32(<2 x i32> undef, <2 x i32>* undef, i32 1, <2 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.store.v2i32.p0v2i32(<2 x i32> undef, <2 x i32>* undef, i32 1, <2 x i1> undef)
; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v32i16.p0v32i16(<32 x i16> undef, <32 x i16>* undef, i32 1, <32 x i1> undef)
; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v16i16.p0v16i16(<16 x i16> undef, <16 x i16>* undef, i32 1, <16 x i1> undef)
; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v8i16.p0v8i16(<8 x i16> undef, <8 x i16>* undef, i32 1, <8 x i1> undef)
-; SKX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.store.v4i16.p0v4i16(<4 x i16> undef, <4 x i16>* undef, i32 1, <4 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.store.v4i16.p0v4i16(<4 x i16> undef, <4 x i16>* undef, i32 1, <4 x i1> undef)
; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v64i8.p0v64i8(<64 x i8> undef, <64 x i8>* undef, i32 1, <64 x i1> undef)
; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v32i8.p0v32i8(<32 x i8> undef, <32 x i8>* undef, i32 1, <32 x i1> undef)
; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v16i8.p0v16i8(<16 x i8> undef, <16 x i8>* undef, i32 1, <16 x i1> undef)
-; SKX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.store.v8i8.p0v8i8(<8 x i8> undef, <8 x i8>* undef, i32 1, <8 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v8i8.p0v8i8(<8 x i8> undef, <8 x i8>* undef, i32 1, <8 x i1> undef)
; SKX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 0
;
call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> undef, <8 x double>* undef, i32 1, <8 x i1> undef)
}
define void @test5(<2 x i32> %trigger, <2 x float>* %addr, <2 x float> %val) {
-; SSE2-LABEL: 'test5'
-; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %mask = icmp eq <2 x i32> %trigger, zeroinitializer
-; SSE2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: call void @llvm.masked.store.v2f32.p0v2f32(<2 x float> %val, <2 x float>* %addr, i32 4, <2 x i1> %mask)
-; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
-;
-; SSE42-LABEL: 'test5'
-; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <2 x i32> %trigger, zeroinitializer
-; SSE42-NEXT: Cost Model: Found an estimated cost of 7 for instruction: call void @llvm.masked.store.v2f32.p0v2f32(<2 x float> %val, <2 x float>* %addr, i32 4, <2 x i1> %mask)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+; SSE-LABEL: 'test5'
+; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <2 x i32> %trigger, zeroinitializer
+; SSE-NEXT: Cost Model: Found an estimated cost of 7 for instruction: call void @llvm.masked.store.v2f32.p0v2f32(<2 x float> %val, <2 x float>* %addr, i32 4, <2 x i1> %mask)
+; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
;
; AVX-LABEL: 'test5'
; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <2 x i32> %trigger, zeroinitializer
}
define void @test6(<2 x i32> %trigger, <2 x i32>* %addr, <2 x i32> %val) {
-; SSE2-LABEL: 'test6'
-; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %mask = icmp eq <2 x i32> %trigger, zeroinitializer
-; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.store.v2i32.p0v2i32(<2 x i32> %val, <2 x i32>* %addr, i32 4, <2 x i1> %mask)
-; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
-;
-; SSE42-LABEL: 'test6'
-; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <2 x i32> %trigger, zeroinitializer
-; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.store.v2i32.p0v2i32(<2 x i32> %val, <2 x i32>* %addr, i32 4, <2 x i1> %mask)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+; SSE-LABEL: 'test6'
+; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <2 x i32> %trigger, zeroinitializer
+; SSE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.store.v2i32.p0v2i32(<2 x i32> %val, <2 x i32>* %addr, i32 4, <2 x i1> %mask)
+; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
;
; AVX-LABEL: 'test6'
; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <2 x i32> %trigger, zeroinitializer
-; AVX-NEXT: Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.store.v2i32.p0v2i32(<2 x i32> %val, <2 x i32>* %addr, i32 4, <2 x i1> %mask)
+; AVX-NEXT: Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.store.v2i32.p0v2i32(<2 x i32> %val, <2 x i32>* %addr, i32 4, <2 x i1> %mask)
; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
;
; AVX512-LABEL: 'test6'
; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <2 x i32> %trigger, zeroinitializer
-; AVX512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.store.v2i32.p0v2i32(<2 x i32> %val, <2 x i32>* %addr, i32 4, <2 x i1> %mask)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.store.v2i32.p0v2i32(<2 x i32> %val, <2 x i32>* %addr, i32 4, <2 x i1> %mask)
; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
;
%mask = icmp eq <2 x i32> %trigger, zeroinitializer
}
define <2 x float> @test7(<2 x i32> %trigger, <2 x float>* %addr, <2 x float> %dst) {
-; SSE2-LABEL: 'test7'
-; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %mask = icmp eq <2 x i32> %trigger, zeroinitializer
-; SSE2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %res = call <2 x float> @llvm.masked.load.v2f32.p0v2f32(<2 x float>* %addr, i32 4, <2 x i1> %mask, <2 x float> %dst)
-; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <2 x float> %res
-;
-; SSE42-LABEL: 'test7'
-; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <2 x i32> %trigger, zeroinitializer
-; SSE42-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %res = call <2 x float> @llvm.masked.load.v2f32.p0v2f32(<2 x float>* %addr, i32 4, <2 x i1> %mask, <2 x float> %dst)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <2 x float> %res
+; SSE-LABEL: 'test7'
+; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <2 x i32> %trigger, zeroinitializer
+; SSE-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %res = call <2 x float> @llvm.masked.load.v2f32.p0v2f32(<2 x float>* %addr, i32 4, <2 x i1> %mask, <2 x float> %dst)
+; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <2 x float> %res
;
; AVX-LABEL: 'test7'
; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <2 x i32> %trigger, zeroinitializer
}
define <2 x i32> @test8(<2 x i32> %trigger, <2 x i32>* %addr, <2 x i32> %dst) {
-; SSE2-LABEL: 'test8'
-; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %mask = icmp eq <2 x i32> %trigger, zeroinitializer
-; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %res = call <2 x i32> @llvm.masked.load.v2i32.p0v2i32(<2 x i32>* %addr, i32 4, <2 x i1> %mask, <2 x i32> %dst)
-; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <2 x i32> %res
-;
-; SSE42-LABEL: 'test8'
-; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <2 x i32> %trigger, zeroinitializer
-; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %res = call <2 x i32> @llvm.masked.load.v2i32.p0v2i32(<2 x i32>* %addr, i32 4, <2 x i1> %mask, <2 x i32> %dst)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <2 x i32> %res
+; SSE-LABEL: 'test8'
+; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <2 x i32> %trigger, zeroinitializer
+; SSE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %res = call <2 x i32> @llvm.masked.load.v2i32.p0v2i32(<2 x i32>* %addr, i32 4, <2 x i1> %mask, <2 x i32> %dst)
+; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <2 x i32> %res
;
; AVX-LABEL: 'test8'
; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <2 x i32> %trigger, zeroinitializer
-; AVX-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %res = call <2 x i32> @llvm.masked.load.v2i32.p0v2i32(<2 x i32>* %addr, i32 4, <2 x i1> %mask, <2 x i32> %dst)
+; AVX-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %res = call <2 x i32> @llvm.masked.load.v2i32.p0v2i32(<2 x i32>* %addr, i32 4, <2 x i1> %mask, <2 x i32> %dst)
; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <2 x i32> %res
;
; AVX512-LABEL: 'test8'
; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <2 x i32> %trigger, zeroinitializer
-; AVX512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %res = call <2 x i32> @llvm.masked.load.v2i32.p0v2i32(<2 x i32>* %addr, i32 4, <2 x i1> %mask, <2 x i32> %dst)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %res = call <2 x i32> @llvm.masked.load.v2i32.p0v2i32(<2 x i32>* %addr, i32 4, <2 x i1> %mask, <2 x i32> %dst)
; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <2 x i32> %res
;
%mask = icmp eq <2 x i32> %trigger, zeroinitializer
+++ /dev/null
-; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py
-; RUN: opt < %s -x86-experimental-vector-widening-legalization -cost-model -mtriple=x86_64-apple-darwin -analyze -mattr=+sse2 | FileCheck %s --check-prefixes=CHECK,SSE,SSE2
-; RUN: opt < %s -x86-experimental-vector-widening-legalization -cost-model -mtriple=x86_64-apple-darwin -analyze -mattr=+ssse3 | FileCheck %s --check-prefixes=CHECK,SSE,SSSE3
-; RUN: opt < %s -x86-experimental-vector-widening-legalization -cost-model -mtriple=x86_64-apple-darwin -analyze -mattr=+sse4.2 | FileCheck %s --check-prefixes=CHECK,SSE,SSE42
-; RUN: opt < %s -x86-experimental-vector-widening-legalization -cost-model -mtriple=x86_64-apple-darwin -analyze -mattr=+avx | FileCheck %s --check-prefixes=CHECK,AVX,AVX1
-; RUN: opt < %s -x86-experimental-vector-widening-legalization -cost-model -mtriple=x86_64-apple-darwin -analyze -mattr=+avx2 | FileCheck %s --check-prefixes=CHECK,AVX,AVX2
-; RUN: opt < %s -x86-experimental-vector-widening-legalization -cost-model -mtriple=x86_64-apple-darwin -analyze -mattr=+avx512f | FileCheck %s --check-prefixes=CHECK,AVX512,AVX512F
-; RUN: opt < %s -x86-experimental-vector-widening-legalization -cost-model -mtriple=x86_64-apple-darwin -analyze -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefixes=CHECK,AVX512,AVX512BW
-; RUN: opt < %s -x86-experimental-vector-widening-legalization -cost-model -mtriple=x86_64-apple-darwin -analyze -mattr=+avx512f,+avx512dq | FileCheck %s --check-prefixes=CHECK,AVX512,AVX512DQ
-
-define i32 @reduce_i64(i32 %arg) {
-; SSE2-LABEL: 'reduce_i64'
-; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.add.v1i64(<1 x i64> undef)
-; SSE2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.add.v2i64(<2 x i64> undef)
-; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.add.v4i64(<4 x i64> undef)
-; SSE2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.add.v8i64(<8 x i64> undef)
-; SSE2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.add.v16i64(<16 x i64> undef)
-; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
-;
-; SSSE3-LABEL: 'reduce_i64'
-; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.add.v1i64(<1 x i64> undef)
-; SSSE3-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.add.v2i64(<2 x i64> undef)
-; SSSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.add.v4i64(<4 x i64> undef)
-; SSSE3-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.add.v8i64(<8 x i64> undef)
-; SSSE3-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.add.v16i64(<16 x i64> undef)
-; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
-;
-; SSE42-LABEL: 'reduce_i64'
-; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.add.v1i64(<1 x i64> undef)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.add.v2i64(<2 x i64> undef)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.add.v4i64(<4 x i64> undef)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.add.v8i64(<8 x i64> undef)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.add.v16i64(<16 x i64> undef)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
-;
-; AVX-LABEL: 'reduce_i64'
-; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.add.v1i64(<1 x i64> undef)
-; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.add.v2i64(<2 x i64> undef)
-; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.add.v4i64(<4 x i64> undef)
-; AVX-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.add.v8i64(<8 x i64> undef)
-; AVX-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.add.v16i64(<16 x i64> undef)
-; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
-;
-; AVX512-LABEL: 'reduce_i64'
-; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.add.v1i64(<1 x i64> undef)
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.add.v2i64(<2 x i64> undef)
-; AVX512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.add.v4i64(<4 x i64> undef)
-; AVX512-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.add.v8i64(<8 x i64> undef)
-; AVX512-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.add.v16i64(<16 x i64> undef)
-; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
-;
- %V1 = call i64 @llvm.experimental.vector.reduce.add.v1i64(<1 x i64> undef)
- %V2 = call i64 @llvm.experimental.vector.reduce.add.v2i64(<2 x i64> undef)
- %V4 = call i64 @llvm.experimental.vector.reduce.add.v4i64(<4 x i64> undef)
- %V8 = call i64 @llvm.experimental.vector.reduce.add.v8i64(<8 x i64> undef)
- %V16 = call i64 @llvm.experimental.vector.reduce.add.v16i64(<16 x i64> undef)
- ret i32 undef
-}
-
-define i32 @reduce_i32(i32 %arg) {
-; SSE2-LABEL: 'reduce_i32'
-; SSE2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.add.v2i32(<2 x i32> undef)
-; SSE2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> undef)
-; SSE2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.add.v8i32(<8 x i32> undef)
-; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.add.v16i32(<16 x i32> undef)
-; SSE2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.add.v32i32(<32 x i32> undef)
-; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
-;
-; SSSE3-LABEL: 'reduce_i32'
-; SSSE3-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.add.v2i32(<2 x i32> undef)
-; SSSE3-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> undef)
-; SSSE3-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.add.v8i32(<8 x i32> undef)
-; SSSE3-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.add.v16i32(<16 x i32> undef)
-; SSSE3-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.add.v32i32(<32 x i32> undef)
-; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
-;
-; SSE42-LABEL: 'reduce_i32'
-; SSE42-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.add.v2i32(<2 x i32> undef)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> undef)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.add.v8i32(<8 x i32> undef)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.add.v16i32(<16 x i32> undef)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.add.v32i32(<32 x i32> undef)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
-;
-; AVX-LABEL: 'reduce_i32'
-; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.add.v2i32(<2 x i32> undef)
-; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> undef)
-; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.add.v8i32(<8 x i32> undef)
-; AVX-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.add.v16i32(<16 x i32> undef)
-; AVX-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.add.v32i32(<32 x i32> undef)
-; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
-;
-; AVX512-LABEL: 'reduce_i32'
-; AVX512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.add.v2i32(<2 x i32> undef)
-; AVX512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> undef)
-; AVX512-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.add.v8i32(<8 x i32> undef)
-; AVX512-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.add.v16i32(<16 x i32> undef)
-; AVX512-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.add.v32i32(<32 x i32> undef)
-; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
-;
- %V2 = call i32 @llvm.experimental.vector.reduce.add.v2i32(<2 x i32> undef)
- %V4 = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> undef)
- %V8 = call i32 @llvm.experimental.vector.reduce.add.v8i32(<8 x i32> undef)
- %V16 = call i32 @llvm.experimental.vector.reduce.add.v16i32(<16 x i32> undef)
- %V32 = call i32 @llvm.experimental.vector.reduce.add.v32i32(<32 x i32> undef)
- ret i32 undef
-}
-
-define i32 @reduce_i16(i32 %arg) {
-; SSE2-LABEL: 'reduce_i16'
-; SSE2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.add.v2i16(<2 x i16> undef)
-; SSE2-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.add.v4i16(<4 x i16> undef)
-; SSE2-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.add.v8i16(<8 x i16> undef)
-; SSE2-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.add.v16i16(<16 x i16> undef)
-; SSE2-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.add.v32i16(<32 x i16> undef)
-; SSE2-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.add.v64i16(<64 x i16> undef)
-; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
-;
-; SSSE3-LABEL: 'reduce_i16'
-; SSSE3-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.add.v2i16(<2 x i16> undef)
-; SSSE3-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.add.v4i16(<4 x i16> undef)
-; SSSE3-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.add.v8i16(<8 x i16> undef)
-; SSSE3-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.add.v16i16(<16 x i16> undef)
-; SSSE3-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.add.v32i16(<32 x i16> undef)
-; SSSE3-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.add.v64i16(<64 x i16> undef)
-; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
-;
-; SSE42-LABEL: 'reduce_i16'
-; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.add.v2i16(<2 x i16> undef)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.add.v4i16(<4 x i16> undef)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.add.v8i16(<8 x i16> undef)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.add.v16i16(<16 x i16> undef)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.add.v32i16(<32 x i16> undef)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.add.v64i16(<64 x i16> undef)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
-;
-; AVX1-LABEL: 'reduce_i16'
-; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.add.v2i16(<2 x i16> undef)
-; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.add.v4i16(<4 x i16> undef)
-; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.add.v8i16(<8 x i16> undef)
-; AVX1-NEXT: Cost Model: Found an estimated cost of 49 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.add.v16i16(<16 x i16> undef)
-; AVX1-NEXT: Cost Model: Found an estimated cost of 53 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.add.v32i16(<32 x i16> undef)
-; AVX1-NEXT: Cost Model: Found an estimated cost of 61 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.add.v64i16(<64 x i16> undef)
-; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
-;
-; AVX2-LABEL: 'reduce_i16'
-; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.add.v2i16(<2 x i16> undef)
-; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.add.v4i16(<4 x i16> undef)
-; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.add.v8i16(<8 x i16> undef)
-; AVX2-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.add.v16i16(<16 x i16> undef)
-; AVX2-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.add.v32i16(<32 x i16> undef)
-; AVX2-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.add.v64i16(<64 x i16> undef)
-; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
-;
-; AVX512F-LABEL: 'reduce_i16'
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.add.v2i16(<2 x i16> undef)
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.add.v4i16(<4 x i16> undef)
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.add.v8i16(<8 x i16> undef)
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.add.v16i16(<16 x i16> undef)
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.add.v32i16(<32 x i16> undef)
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.add.v64i16(<64 x i16> undef)
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
-;
-; AVX512BW-LABEL: 'reduce_i16'
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.add.v2i16(<2 x i16> undef)
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.add.v4i16(<4 x i16> undef)
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.add.v8i16(<8 x i16> undef)
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.add.v16i16(<16 x i16> undef)
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.add.v32i16(<32 x i16> undef)
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.add.v64i16(<64 x i16> undef)
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
-;
-; AVX512DQ-LABEL: 'reduce_i16'
-; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.add.v2i16(<2 x i16> undef)
-; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.add.v4i16(<4 x i16> undef)
-; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.add.v8i16(<8 x i16> undef)
-; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.add.v16i16(<16 x i16> undef)
-; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.add.v32i16(<32 x i16> undef)
-; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.add.v64i16(<64 x i16> undef)
-; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
-;
- %V2 = call i16 @llvm.experimental.vector.reduce.add.v2i16(<2 x i16> undef)
- %V4 = call i16 @llvm.experimental.vector.reduce.add.v4i16(<4 x i16> undef)
- %V8 = call i16 @llvm.experimental.vector.reduce.add.v8i16(<8 x i16> undef)
- %V16 = call i16 @llvm.experimental.vector.reduce.add.v16i16(<16 x i16> undef)
- %V32 = call i16 @llvm.experimental.vector.reduce.add.v32i16(<32 x i16> undef)
- %V64 = call i16 @llvm.experimental.vector.reduce.add.v64i16(<64 x i16> undef)
- ret i32 undef
-}
-
-define i32 @reduce_i8(i32 %arg) {
-; SSE2-LABEL: 'reduce_i8'
-; SSE2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.add.v2i8(<2 x i8> undef)
-; SSE2-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.add.v4i8(<4 x i8> undef)
-; SSE2-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.add.v8i8(<8 x i8> undef)
-; SSE2-NEXT: Cost Model: Found an estimated cost of 45 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.add.v16i8(<16 x i8> undef)
-; SSE2-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.add.v32i8(<32 x i8> undef)
-; SSE2-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.add.v64i8(<64 x i8> undef)
-; SSE2-NEXT: Cost Model: Found an estimated cost of 52 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.add.v128i8(<128 x i8> undef)
-; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
-;
-; SSSE3-LABEL: 'reduce_i8'
-; SSSE3-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.add.v2i8(<2 x i8> undef)
-; SSSE3-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.add.v4i8(<4 x i8> undef)
-; SSSE3-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.add.v8i8(<8 x i8> undef)
-; SSSE3-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.add.v16i8(<16 x i8> undef)
-; SSSE3-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.add.v32i8(<32 x i8> undef)
-; SSSE3-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.add.v64i8(<64 x i8> undef)
-; SSSE3-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.add.v128i8(<128 x i8> undef)
-; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
-;
-; SSE42-LABEL: 'reduce_i8'
-; SSE42-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.add.v2i8(<2 x i8> undef)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.add.v4i8(<4 x i8> undef)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.add.v8i8(<8 x i8> undef)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.add.v16i8(<16 x i8> undef)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.add.v32i8(<32 x i8> undef)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.add.v64i8(<64 x i8> undef)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.add.v128i8(<128 x i8> undef)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
-;
-; AVX1-LABEL: 'reduce_i8'
-; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.add.v2i8(<2 x i8> undef)
-; AVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.add.v4i8(<4 x i8> undef)
-; AVX1-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.add.v8i8(<8 x i8> undef)
-; AVX1-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.add.v16i8(<16 x i8> undef)
-; AVX1-NEXT: Cost Model: Found an estimated cost of 61 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.add.v32i8(<32 x i8> undef)
-; AVX1-NEXT: Cost Model: Found an estimated cost of 65 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.add.v64i8(<64 x i8> undef)
-; AVX1-NEXT: Cost Model: Found an estimated cost of 73 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.add.v128i8(<128 x i8> undef)
-; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
-;
-; AVX2-LABEL: 'reduce_i8'
-; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.add.v2i8(<2 x i8> undef)
-; AVX2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.add.v4i8(<4 x i8> undef)
-; AVX2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.add.v8i8(<8 x i8> undef)
-; AVX2-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.add.v16i8(<16 x i8> undef)
-; AVX2-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.add.v32i8(<32 x i8> undef)
-; AVX2-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.add.v64i8(<64 x i8> undef)
-; AVX2-NEXT: Cost Model: Found an estimated cost of 29 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.add.v128i8(<128 x i8> undef)
-; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
-;
-; AVX512F-LABEL: 'reduce_i8'
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.add.v2i8(<2 x i8> undef)
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.add.v4i8(<4 x i8> undef)
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.add.v8i8(<8 x i8> undef)
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.add.v16i8(<16 x i8> undef)
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.add.v32i8(<32 x i8> undef)
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.add.v64i8(<64 x i8> undef)
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 29 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.add.v128i8(<128 x i8> undef)
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
-;
-; AVX512BW-LABEL: 'reduce_i8'
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.add.v2i8(<2 x i8> undef)
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.add.v4i8(<4 x i8> undef)
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.add.v8i8(<8 x i8> undef)
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.add.v16i8(<16 x i8> undef)
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.add.v32i8(<32 x i8> undef)
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 55 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.add.v64i8(<64 x i8> undef)
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 56 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.add.v128i8(<128 x i8> undef)
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
-;
-; AVX512DQ-LABEL: 'reduce_i8'
-; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.add.v2i8(<2 x i8> undef)
-; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.add.v4i8(<4 x i8> undef)
-; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.add.v8i8(<8 x i8> undef)
-; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.add.v16i8(<16 x i8> undef)
-; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.add.v32i8(<32 x i8> undef)
-; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.add.v64i8(<64 x i8> undef)
-; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 29 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.add.v128i8(<128 x i8> undef)
-; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
-;
- %V2 = call i8 @llvm.experimental.vector.reduce.add.v2i8(<2 x i8> undef)
- %V4 = call i8 @llvm.experimental.vector.reduce.add.v4i8(<4 x i8> undef)
- %V8 = call i8 @llvm.experimental.vector.reduce.add.v8i8(<8 x i8> undef)
- %V16 = call i8 @llvm.experimental.vector.reduce.add.v16i8(<16 x i8> undef)
- %V32 = call i8 @llvm.experimental.vector.reduce.add.v32i8(<32 x i8> undef)
- %V64 = call i8 @llvm.experimental.vector.reduce.add.v64i8(<64 x i8> undef)
- %V128 = call i8 @llvm.experimental.vector.reduce.add.v128i8(<128 x i8> undef)
- ret i32 undef
-}
-
-declare i64 @llvm.experimental.vector.reduce.add.v1i64(<1 x i64>)
-declare i64 @llvm.experimental.vector.reduce.add.v2i64(<2 x i64>)
-declare i64 @llvm.experimental.vector.reduce.add.v4i64(<4 x i64>)
-declare i64 @llvm.experimental.vector.reduce.add.v8i64(<8 x i64>)
-declare i64 @llvm.experimental.vector.reduce.add.v16i64(<16 x i64>)
-
-declare i32 @llvm.experimental.vector.reduce.add.v2i32(<2 x i32>)
-declare i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32>)
-declare i32 @llvm.experimental.vector.reduce.add.v8i32(<8 x i32>)
-declare i32 @llvm.experimental.vector.reduce.add.v16i32(<16 x i32>)
-declare i32 @llvm.experimental.vector.reduce.add.v32i32(<32 x i32>)
-
-declare i16 @llvm.experimental.vector.reduce.add.v2i16(<2 x i16>)
-declare i16 @llvm.experimental.vector.reduce.add.v4i16(<4 x i16>)
-declare i16 @llvm.experimental.vector.reduce.add.v8i16(<8 x i16>)
-declare i16 @llvm.experimental.vector.reduce.add.v16i16(<16 x i16>)
-declare i16 @llvm.experimental.vector.reduce.add.v32i16(<32 x i16>)
-declare i16 @llvm.experimental.vector.reduce.add.v64i16(<64 x i16>)
-
-declare i8 @llvm.experimental.vector.reduce.add.v2i8(<2 x i8>)
-declare i8 @llvm.experimental.vector.reduce.add.v4i8(<4 x i8>)
-declare i8 @llvm.experimental.vector.reduce.add.v8i8(<8 x i8>)
-declare i8 @llvm.experimental.vector.reduce.add.v16i8(<16 x i8>)
-declare i8 @llvm.experimental.vector.reduce.add.v32i8(<32 x i8>)
-declare i8 @llvm.experimental.vector.reduce.add.v64i8(<64 x i8>)
-declare i8 @llvm.experimental.vector.reduce.add.v128i8(<128 x i8>)
; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
;
; AVX-LABEL: 'reduce_i32'
-; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.add.v2i32(<2 x i32> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.add.v2i32(<2 x i32> undef)
; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> undef)
; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.add.v8i32(<8 x i32> undef)
; AVX-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.add.v16i32(<16 x i32> undef)
; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
;
; AVX512-LABEL: 'reduce_i32'
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.add.v2i32(<2 x i32> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.add.v2i32(<2 x i32> undef)
; AVX512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> undef)
; AVX512-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.add.v8i32(<8 x i32> undef)
; AVX512-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.add.v16i32(<16 x i32> undef)
define i32 @reduce_i16(i32 %arg) {
; SSE2-LABEL: 'reduce_i16'
-; SSE2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.add.v2i16(<2 x i16> undef)
-; SSE2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.add.v4i16(<4 x i16> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.add.v2i16(<2 x i16> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.add.v4i16(<4 x i16> undef)
; SSE2-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.add.v8i16(<8 x i16> undef)
; SSE2-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.add.v16i16(<16 x i16> undef)
; SSE2-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.add.v32i16(<32 x i16> undef)
; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
;
; AVX1-LABEL: 'reduce_i16'
-; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.add.v2i16(<2 x i16> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.add.v2i16(<2 x i16> undef)
; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.add.v4i16(<4 x i16> undef)
; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.add.v8i16(<8 x i16> undef)
; AVX1-NEXT: Cost Model: Found an estimated cost of 49 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.add.v16i16(<16 x i16> undef)
; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
;
; AVX2-LABEL: 'reduce_i16'
-; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.add.v2i16(<2 x i16> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.add.v2i16(<2 x i16> undef)
; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.add.v4i16(<4 x i16> undef)
; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.add.v8i16(<8 x i16> undef)
; AVX2-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.add.v16i16(<16 x i16> undef)
; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
;
; AVX512F-LABEL: 'reduce_i16'
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.add.v2i16(<2 x i16> undef)
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.add.v2i16(<2 x i16> undef)
; AVX512F-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.add.v4i16(<4 x i16> undef)
; AVX512F-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.add.v8i16(<8 x i16> undef)
; AVX512F-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.add.v16i16(<16 x i16> undef)
; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
;
; AVX512BW-LABEL: 'reduce_i16'
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.add.v2i16(<2 x i16> undef)
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.add.v2i16(<2 x i16> undef)
; AVX512BW-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.add.v4i16(<4 x i16> undef)
; AVX512BW-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.add.v8i16(<8 x i16> undef)
; AVX512BW-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.add.v16i16(<16 x i16> undef)
; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
;
; AVX512DQ-LABEL: 'reduce_i16'
-; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.add.v2i16(<2 x i16> undef)
+; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.add.v2i16(<2 x i16> undef)
; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.add.v4i16(<4 x i16> undef)
; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.add.v8i16(<8 x i16> undef)
; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.add.v16i16(<16 x i16> undef)
define i32 @reduce_i8(i32 %arg) {
; SSE2-LABEL: 'reduce_i8'
-; SSE2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.add.v2i8(<2 x i8> undef)
-; SSE2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.add.v4i8(<4 x i8> undef)
-; SSE2-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.add.v8i8(<8 x i8> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.add.v2i8(<2 x i8> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.add.v4i8(<4 x i8> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.add.v8i8(<8 x i8> undef)
; SSE2-NEXT: Cost Model: Found an estimated cost of 45 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.add.v16i8(<16 x i8> undef)
; SSE2-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.add.v32i8(<32 x i8> undef)
; SSE2-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.add.v64i8(<64 x i8> undef)
; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
;
; SSE42-LABEL: 'reduce_i8'
-; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.add.v2i8(<2 x i8> undef)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.add.v4i8(<4 x i8> undef)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.add.v8i8(<8 x i8> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.add.v2i8(<2 x i8> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.add.v4i8(<4 x i8> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.add.v8i8(<8 x i8> undef)
; SSE42-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.add.v16i8(<16 x i8> undef)
; SSE42-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.add.v32i8(<32 x i8> undef)
; SSE42-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.add.v64i8(<64 x i8> undef)
; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
;
; AVX1-LABEL: 'reduce_i8'
-; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.add.v2i8(<2 x i8> undef)
-; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.add.v4i8(<4 x i8> undef)
-; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.add.v8i8(<8 x i8> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.add.v2i8(<2 x i8> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.add.v4i8(<4 x i8> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.add.v8i8(<8 x i8> undef)
; AVX1-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.add.v16i8(<16 x i8> undef)
; AVX1-NEXT: Cost Model: Found an estimated cost of 61 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.add.v32i8(<32 x i8> undef)
; AVX1-NEXT: Cost Model: Found an estimated cost of 65 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.add.v64i8(<64 x i8> undef)
; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
;
; AVX2-LABEL: 'reduce_i8'
-; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.add.v2i8(<2 x i8> undef)
-; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.add.v4i8(<4 x i8> undef)
-; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.add.v8i8(<8 x i8> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.add.v2i8(<2 x i8> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.add.v4i8(<4 x i8> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.add.v8i8(<8 x i8> undef)
; AVX2-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.add.v16i8(<16 x i8> undef)
; AVX2-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.add.v32i8(<32 x i8> undef)
; AVX2-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.add.v64i8(<64 x i8> undef)
; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
;
; AVX512F-LABEL: 'reduce_i8'
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.add.v2i8(<2 x i8> undef)
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.add.v4i8(<4 x i8> undef)
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.add.v8i8(<8 x i8> undef)
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.add.v2i8(<2 x i8> undef)
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.add.v4i8(<4 x i8> undef)
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.add.v8i8(<8 x i8> undef)
; AVX512F-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.add.v16i8(<16 x i8> undef)
; AVX512F-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.add.v32i8(<32 x i8> undef)
; AVX512F-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.add.v64i8(<64 x i8> undef)
; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
;
; AVX512BW-LABEL: 'reduce_i8'
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.add.v2i8(<2 x i8> undef)
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.add.v4i8(<4 x i8> undef)
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.add.v8i8(<8 x i8> undef)
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.add.v2i8(<2 x i8> undef)
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.add.v4i8(<4 x i8> undef)
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.add.v8i8(<8 x i8> undef)
; AVX512BW-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.add.v16i8(<16 x i8> undef)
; AVX512BW-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.add.v32i8(<32 x i8> undef)
; AVX512BW-NEXT: Cost Model: Found an estimated cost of 55 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.add.v64i8(<64 x i8> undef)
; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
;
; AVX512DQ-LABEL: 'reduce_i8'
-; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.add.v2i8(<2 x i8> undef)
-; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.add.v4i8(<4 x i8> undef)
-; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.add.v8i8(<8 x i8> undef)
+; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.add.v2i8(<2 x i8> undef)
+; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.add.v4i8(<4 x i8> undef)
+; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.add.v8i8(<8 x i8> undef)
; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.add.v16i8(<16 x i8> undef)
; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.add.v32i8(<32 x i8> undef)
; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.add.v64i8(<64 x i8> undef)
define i32 @reduce_i16(i32 %arg) {
; SSE2-LABEL: 'reduce_i16'
-; SSE2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.and.v2i16(<2 x i16> undef)
-; SSE2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.and.v4i16(<4 x i16> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.and.v2i16(<2 x i16> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.and.v4i16(<4 x i16> undef)
; SSE2-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.and.v8i16(<8 x i16> undef)
; SSE2-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.and.v16i16(<16 x i16> undef)
; SSE2-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.and.v32i16(<32 x i16> undef)
define i32 @reduce_i8(i32 %arg) {
; SSE2-LABEL: 'reduce_i8'
-; SSE2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.and.v2i8(<2 x i8> undef)
-; SSE2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.and.v4i8(<4 x i8> undef)
-; SSE2-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.and.v8i8(<8 x i8> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.and.v2i8(<2 x i8> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.and.v4i8(<4 x i8> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.and.v8i8(<8 x i8> undef)
; SSE2-NEXT: Cost Model: Found an estimated cost of 45 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.and.v16i8(<16 x i8> undef)
; SSE2-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.and.v32i8(<32 x i8> undef)
; SSE2-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.and.v64i8(<64 x i8> undef)
define i32 @reduce_i32(i32 %arg) {
; SSE2-LABEL: 'reduce_i32'
-; SSE2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.mul.v2i32(<2 x i32> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.mul.v2i32(<2 x i32> undef)
; SSE2-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.mul.v4i32(<4 x i32> undef)
; SSE2-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.mul.v8i32(<8 x i32> undef)
; SSE2-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.mul.v16i32(<16 x i32> undef)
; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
;
; SSSE3-LABEL: 'reduce_i32'
-; SSSE3-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.mul.v2i32(<2 x i32> undef)
+; SSSE3-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.mul.v2i32(<2 x i32> undef)
; SSSE3-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.mul.v4i32(<4 x i32> undef)
; SSSE3-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.mul.v8i32(<8 x i32> undef)
; SSSE3-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.mul.v16i32(<16 x i32> undef)
; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
;
; SSE42-LABEL: 'reduce_i32'
-; SSE42-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.mul.v2i32(<2 x i32> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.mul.v2i32(<2 x i32> undef)
; SSE42-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.mul.v4i32(<4 x i32> undef)
; SSE42-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.mul.v8i32(<8 x i32> undef)
; SSE42-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.mul.v16i32(<16 x i32> undef)
; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
;
; AVX1-LABEL: 'reduce_i32'
-; AVX1-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.mul.v2i32(<2 x i32> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.mul.v2i32(<2 x i32> undef)
; AVX1-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.mul.v4i32(<4 x i32> undef)
; AVX1-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.mul.v8i32(<8 x i32> undef)
; AVX1-NEXT: Cost Model: Found an estimated cost of 29 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.mul.v16i32(<16 x i32> undef)
; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
;
; AVX2-LABEL: 'reduce_i32'
-; AVX2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.mul.v2i32(<2 x i32> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.mul.v2i32(<2 x i32> undef)
; AVX2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.mul.v4i32(<4 x i32> undef)
; AVX2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.mul.v8i32(<8 x i32> undef)
; AVX2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.mul.v16i32(<16 x i32> undef)
; AVX2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.mul.v32i32(<32 x i32> undef)
; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
;
-; AVX512F-LABEL: 'reduce_i32'
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.mul.v2i32(<2 x i32> undef)
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.mul.v4i32(<4 x i32> undef)
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.mul.v8i32(<8 x i32> undef)
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.mul.v16i32(<16 x i32> undef)
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.mul.v32i32(<32 x i32> undef)
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
-;
-; AVX512BW-LABEL: 'reduce_i32'
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.mul.v2i32(<2 x i32> undef)
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.mul.v4i32(<4 x i32> undef)
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.mul.v8i32(<8 x i32> undef)
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.mul.v16i32(<16 x i32> undef)
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.mul.v32i32(<32 x i32> undef)
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
-;
-; AVX512DQ-LABEL: 'reduce_i32'
-; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.mul.v2i32(<2 x i32> undef)
-; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.mul.v4i32(<4 x i32> undef)
-; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.mul.v8i32(<8 x i32> undef)
-; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.mul.v16i32(<16 x i32> undef)
-; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.mul.v32i32(<32 x i32> undef)
-; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+; AVX512-LABEL: 'reduce_i32'
+; AVX512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.mul.v2i32(<2 x i32> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.mul.v4i32(<4 x i32> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.mul.v8i32(<8 x i32> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.mul.v16i32(<16 x i32> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.mul.v32i32(<32 x i32> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
;
%V2 = call i32 @llvm.experimental.vector.reduce.mul.v2i32(<2 x i32> undef)
%V4 = call i32 @llvm.experimental.vector.reduce.mul.v4i32(<4 x i32> undef)
define i32 @reduce_i16(i32 %arg) {
; SSE2-LABEL: 'reduce_i16'
-; SSE2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.mul.v2i16(<2 x i16> undef)
-; SSE2-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.mul.v4i16(<4 x i16> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.mul.v2i16(<2 x i16> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.mul.v4i16(<4 x i16> undef)
; SSE2-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.mul.v8i16(<8 x i16> undef)
; SSE2-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.mul.v16i16(<16 x i16> undef)
; SSE2-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.mul.v32i16(<32 x i16> undef)
; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
;
; SSSE3-LABEL: 'reduce_i16'
-; SSSE3-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.mul.v2i16(<2 x i16> undef)
-; SSSE3-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.mul.v4i16(<4 x i16> undef)
+; SSSE3-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.mul.v2i16(<2 x i16> undef)
+; SSSE3-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.mul.v4i16(<4 x i16> undef)
; SSSE3-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.mul.v8i16(<8 x i16> undef)
; SSSE3-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.mul.v16i16(<16 x i16> undef)
; SSSE3-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.mul.v32i16(<32 x i16> undef)
; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
;
; SSE42-LABEL: 'reduce_i16'
-; SSE42-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.mul.v2i16(<2 x i16> undef)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.mul.v4i16(<4 x i16> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.mul.v2i16(<2 x i16> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.mul.v4i16(<4 x i16> undef)
; SSE42-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.mul.v8i16(<8 x i16> undef)
; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.mul.v16i16(<16 x i16> undef)
; SSE42-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.mul.v32i16(<32 x i16> undef)
; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
;
; AVX1-LABEL: 'reduce_i16'
-; AVX1-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.mul.v2i16(<2 x i16> undef)
-; AVX1-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.mul.v4i16(<4 x i16> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.mul.v2i16(<2 x i16> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.mul.v4i16(<4 x i16> undef)
; AVX1-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.mul.v8i16(<8 x i16> undef)
; AVX1-NEXT: Cost Model: Found an estimated cost of 49 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.mul.v16i16(<16 x i16> undef)
; AVX1-NEXT: Cost Model: Found an estimated cost of 53 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.mul.v32i16(<32 x i16> undef)
; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
;
; AVX2-LABEL: 'reduce_i16'
-; AVX2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.mul.v2i16(<2 x i16> undef)
-; AVX2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.mul.v4i16(<4 x i16> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.mul.v2i16(<2 x i16> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.mul.v4i16(<4 x i16> undef)
; AVX2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.mul.v8i16(<8 x i16> undef)
; AVX2-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.mul.v16i16(<16 x i16> undef)
; AVX2-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.mul.v32i16(<32 x i16> undef)
; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
;
; AVX512F-LABEL: 'reduce_i16'
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.mul.v2i16(<2 x i16> undef)
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.mul.v2i16(<2 x i16> undef)
; AVX512F-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.mul.v4i16(<4 x i16> undef)
; AVX512F-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.mul.v8i16(<8 x i16> undef)
; AVX512F-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.mul.v16i16(<16 x i16> undef)
; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
;
; AVX512BW-LABEL: 'reduce_i16'
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.mul.v2i16(<2 x i16> undef)
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.mul.v2i16(<2 x i16> undef)
; AVX512BW-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.mul.v4i16(<4 x i16> undef)
; AVX512BW-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.mul.v8i16(<8 x i16> undef)
; AVX512BW-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.mul.v16i16(<16 x i16> undef)
define i32 @reduce_i8(i32 %arg) {
; SSE2-LABEL: 'reduce_i8'
-; SSE2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.mul.v2i8(<2 x i8> undef)
-; SSE2-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.mul.v4i8(<4 x i8> undef)
-; SSE2-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.mul.v8i8(<8 x i8> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.mul.v2i8(<2 x i8> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 45 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.mul.v4i8(<4 x i8> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 67 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.mul.v8i8(<8 x i8> undef)
; SSE2-NEXT: Cost Model: Found an estimated cost of 89 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.mul.v16i8(<16 x i8> undef)
; SSE2-NEXT: Cost Model: Found an estimated cost of 101 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.mul.v32i8(<32 x i8> undef)
; SSE2-NEXT: Cost Model: Found an estimated cost of 125 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.mul.v64i8(<64 x i8> undef)
; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
;
; SSSE3-LABEL: 'reduce_i8'
-; SSSE3-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.mul.v2i8(<2 x i8> undef)
-; SSSE3-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.mul.v4i8(<4 x i8> undef)
-; SSSE3-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.mul.v8i8(<8 x i8> undef)
+; SSSE3-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.mul.v2i8(<2 x i8> undef)
+; SSSE3-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.mul.v4i8(<4 x i8> undef)
+; SSSE3-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.mul.v8i8(<8 x i8> undef)
; SSSE3-NEXT: Cost Model: Found an estimated cost of 53 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.mul.v16i8(<16 x i8> undef)
; SSSE3-NEXT: Cost Model: Found an estimated cost of 65 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.mul.v32i8(<32 x i8> undef)
; SSSE3-NEXT: Cost Model: Found an estimated cost of 89 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.mul.v64i8(<64 x i8> undef)
; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
;
; SSE42-LABEL: 'reduce_i8'
-; SSE42-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.mul.v2i8(<2 x i8> undef)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.mul.v4i8(<4 x i8> undef)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.mul.v8i8(<8 x i8> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.mul.v2i8(<2 x i8> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.mul.v4i8(<4 x i8> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.mul.v8i8(<8 x i8> undef)
; SSE42-NEXT: Cost Model: Found an estimated cost of 53 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.mul.v16i8(<16 x i8> undef)
; SSE42-NEXT: Cost Model: Found an estimated cost of 65 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.mul.v32i8(<32 x i8> undef)
; SSE42-NEXT: Cost Model: Found an estimated cost of 89 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.mul.v64i8(<64 x i8> undef)
; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
;
; AVX1-LABEL: 'reduce_i8'
-; AVX1-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.mul.v2i8(<2 x i8> undef)
-; AVX1-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.mul.v4i8(<4 x i8> undef)
-; AVX1-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.mul.v8i8(<8 x i8> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.mul.v2i8(<2 x i8> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.mul.v4i8(<4 x i8> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.mul.v8i8(<8 x i8> undef)
; AVX1-NEXT: Cost Model: Found an estimated cost of 53 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.mul.v16i8(<16 x i8> undef)
; AVX1-NEXT: Cost Model: Found an estimated cost of 171 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.mul.v32i8(<32 x i8> undef)
; AVX1-NEXT: Cost Model: Found an estimated cost of 197 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.mul.v64i8(<64 x i8> undef)
; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
;
; AVX2-LABEL: 'reduce_i8'
-; AVX2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.mul.v2i8(<2 x i8> undef)
-; AVX2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.mul.v4i8(<4 x i8> undef)
-; AVX2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.mul.v8i8(<8 x i8> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.mul.v2i8(<2 x i8> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.mul.v4i8(<4 x i8> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.mul.v8i8(<8 x i8> undef)
; AVX2-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.mul.v16i8(<16 x i8> undef)
; AVX2-NEXT: Cost Model: Found an estimated cost of 106 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.mul.v32i8(<32 x i8> undef)
; AVX2-NEXT: Cost Model: Found an estimated cost of 123 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.mul.v64i8(<64 x i8> undef)
; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
;
; AVX512F-LABEL: 'reduce_i8'
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.mul.v2i8(<2 x i8> undef)
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.mul.v4i8(<4 x i8> undef)
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.mul.v8i8(<8 x i8> undef)
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.mul.v2i8(<2 x i8> undef)
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.mul.v4i8(<4 x i8> undef)
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.mul.v8i8(<8 x i8> undef)
; AVX512F-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.mul.v16i8(<16 x i8> undef)
; AVX512F-NEXT: Cost Model: Found an estimated cost of 86 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.mul.v32i8(<32 x i8> undef)
; AVX512F-NEXT: Cost Model: Found an estimated cost of 99 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.mul.v64i8(<64 x i8> undef)
; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
;
; AVX512BW-LABEL: 'reduce_i8'
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.mul.v2i8(<2 x i8> undef)
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.mul.v4i8(<4 x i8> undef)
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.mul.v8i8(<8 x i8> undef)
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.mul.v2i8(<2 x i8> undef)
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.mul.v4i8(<4 x i8> undef)
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.mul.v8i8(<8 x i8> undef)
; AVX512BW-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.mul.v16i8(<16 x i8> undef)
; AVX512BW-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.mul.v32i8(<32 x i8> undef)
; AVX512BW-NEXT: Cost Model: Found an estimated cost of 115 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.mul.v64i8(<64 x i8> undef)
; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
;
; AVX512DQ-LABEL: 'reduce_i8'
-; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.mul.v2i8(<2 x i8> undef)
-; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.mul.v4i8(<4 x i8> undef)
-; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.mul.v8i8(<8 x i8> undef)
+; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.mul.v2i8(<2 x i8> undef)
+; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.mul.v4i8(<4 x i8> undef)
+; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.mul.v8i8(<8 x i8> undef)
; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.mul.v16i8(<16 x i8> undef)
; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 86 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.mul.v32i8(<32 x i8> undef)
; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 99 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.mul.v64i8(<64 x i8> undef)
define i32 @reduce_i16(i32 %arg) {
; SSE2-LABEL: 'reduce_i16'
-; SSE2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.or.v2i16(<2 x i16> undef)
-; SSE2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.or.v4i16(<4 x i16> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.or.v2i16(<2 x i16> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.or.v4i16(<4 x i16> undef)
; SSE2-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.or.v8i16(<8 x i16> undef)
; SSE2-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.or.v16i16(<16 x i16> undef)
; SSE2-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.or.v32i16(<32 x i16> undef)
define i32 @reduce_i8(i32 %arg) {
; SSE2-LABEL: 'reduce_i8'
-; SSE2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.or.v2i8(<2 x i8> undef)
-; SSE2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.or.v4i8(<4 x i8> undef)
-; SSE2-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.or.v8i8(<8 x i8> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.or.v2i8(<2 x i8> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.or.v4i8(<4 x i8> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.or.v8i8(<8 x i8> undef)
; SSE2-NEXT: Cost Model: Found an estimated cost of 45 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.or.v16i8(<16 x i8> undef)
; SSE2-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.or.v32i8(<32 x i8> undef)
; SSE2-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.or.v64i8(<64 x i8> undef)
; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
;
; SSE42-LABEL: 'reduce_i32'
-; SSE42-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.smax.v2i32(<2 x i32> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.smax.v2i32(<2 x i32> undef)
; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.smax.v4i32(<4 x i32> undef)
; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.smax.v8i32(<8 x i32> undef)
; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.smax.v16i32(<16 x i32> undef)
; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
;
; AVX1-LABEL: 'reduce_i32'
-; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.smax.v2i32(<2 x i32> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.smax.v2i32(<2 x i32> undef)
; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.smax.v4i32(<4 x i32> undef)
; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.smax.v8i32(<8 x i32> undef)
; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.smax.v16i32(<16 x i32> undef)
; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
;
; AVX2-LABEL: 'reduce_i32'
-; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.smax.v2i32(<2 x i32> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.smax.v2i32(<2 x i32> undef)
; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.smax.v4i32(<4 x i32> undef)
; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.smax.v8i32(<8 x i32> undef)
; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.smax.v16i32(<16 x i32> undef)
; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
;
; AVX512-LABEL: 'reduce_i32'
-; AVX512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.smax.v2i32(<2 x i32> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.smax.v2i32(<2 x i32> undef)
; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.smax.v4i32(<4 x i32> undef)
; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.smax.v8i32(<8 x i32> undef)
; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.smax.v16i32(<16 x i32> undef)
define i32 @reduce_i16(i32 %arg) {
; SSE2-LABEL: 'reduce_i16'
-; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.smax.v2i16(<2 x i16> undef)
-; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.smax.v4i16(<4 x i16> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.smax.v2i16(<2 x i16> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.smax.v4i16(<4 x i16> undef)
; SSE2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.smax.v8i16(<8 x i16> undef)
; SSE2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.smax.v16i16(<16 x i16> undef)
; SSE2-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.smax.v32i16(<32 x i16> undef)
; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
;
; SSSE3-LABEL: 'reduce_i16'
-; SSSE3-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.smax.v2i16(<2 x i16> undef)
-; SSSE3-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.smax.v4i16(<4 x i16> undef)
+; SSSE3-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.smax.v2i16(<2 x i16> undef)
+; SSSE3-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.smax.v4i16(<4 x i16> undef)
; SSSE3-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.smax.v8i16(<8 x i16> undef)
; SSSE3-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.smax.v16i16(<16 x i16> undef)
; SSSE3-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.smax.v32i16(<32 x i16> undef)
; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
;
; SSE42-LABEL: 'reduce_i16'
-; SSE42-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.smax.v2i16(<2 x i16> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.smax.v2i16(<2 x i16> undef)
; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.smax.v4i16(<4 x i16> undef)
; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.smax.v8i16(<8 x i16> undef)
; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.smax.v16i16(<16 x i16> undef)
; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
;
; AVX1-LABEL: 'reduce_i16'
-; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.smax.v2i16(<2 x i16> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.smax.v2i16(<2 x i16> undef)
; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.smax.v4i16(<4 x i16> undef)
; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.smax.v8i16(<8 x i16> undef)
; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.smax.v16i16(<16 x i16> undef)
; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
;
; AVX2-LABEL: 'reduce_i16'
-; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.smax.v2i16(<2 x i16> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.smax.v2i16(<2 x i16> undef)
; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.smax.v4i16(<4 x i16> undef)
; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.smax.v8i16(<8 x i16> undef)
; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.smax.v16i16(<16 x i16> undef)
; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
;
; AVX512F-LABEL: 'reduce_i16'
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.smax.v2i16(<2 x i16> undef)
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.smax.v2i16(<2 x i16> undef)
; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.smax.v4i16(<4 x i16> undef)
; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.smax.v8i16(<8 x i16> undef)
; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.smax.v16i16(<16 x i16> undef)
; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
;
; AVX512BW-LABEL: 'reduce_i16'
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.smax.v2i16(<2 x i16> undef)
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.smax.v2i16(<2 x i16> undef)
; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.smax.v4i16(<4 x i16> undef)
; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.smax.v8i16(<8 x i16> undef)
; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.smax.v16i16(<16 x i16> undef)
; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
;
; AVX512DQ-LABEL: 'reduce_i16'
-; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.smax.v2i16(<2 x i16> undef)
+; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.smax.v2i16(<2 x i16> undef)
; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.smax.v4i16(<4 x i16> undef)
; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.smax.v8i16(<8 x i16> undef)
; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.smax.v16i16(<16 x i16> undef)
define i32 @reduce_i8(i32 %arg) {
; SSE2-LABEL: 'reduce_i8'
-; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.smax.v2i8(<2 x i8> undef)
-; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.smax.v4i8(<4 x i8> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.smax.v2i8(<2 x i8> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.smax.v4i8(<4 x i8> undef)
; SSE2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.smax.v8i8(<8 x i8> undef)
; SSE2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.smax.v16i8(<16 x i8> undef)
; SSE2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.smax.v32i8(<32 x i8> undef)
; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
;
; SSSE3-LABEL: 'reduce_i8'
-; SSSE3-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.smax.v2i8(<2 x i8> undef)
-; SSSE3-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.smax.v4i8(<4 x i8> undef)
+; SSSE3-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.smax.v2i8(<2 x i8> undef)
+; SSSE3-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.smax.v4i8(<4 x i8> undef)
; SSSE3-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.smax.v8i8(<8 x i8> undef)
; SSSE3-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.smax.v16i8(<16 x i8> undef)
; SSSE3-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.smax.v32i8(<32 x i8> undef)
; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
;
; SSE42-LABEL: 'reduce_i8'
-; SSE42-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.smax.v2i8(<2 x i8> undef)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.smax.v4i8(<4 x i8> undef)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.smax.v8i8(<8 x i8> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.smax.v2i8(<2 x i8> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.smax.v4i8(<4 x i8> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.smax.v8i8(<8 x i8> undef)
; SSE42-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.smax.v16i8(<16 x i8> undef)
; SSE42-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.smax.v32i8(<32 x i8> undef)
; SSE42-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.smax.v64i8(<64 x i8> undef)
; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
;
; AVX1-LABEL: 'reduce_i8'
-; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.smax.v2i8(<2 x i8> undef)
-; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.smax.v4i8(<4 x i8> undef)
-; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.smax.v8i8(<8 x i8> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.smax.v2i8(<2 x i8> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.smax.v4i8(<4 x i8> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.smax.v8i8(<8 x i8> undef)
; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.smax.v16i8(<16 x i8> undef)
; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.smax.v32i8(<32 x i8> undef)
; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.smax.v64i8(<64 x i8> undef)
; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
;
; AVX2-LABEL: 'reduce_i8'
-; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.smax.v2i8(<2 x i8> undef)
-; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.smax.v4i8(<4 x i8> undef)
-; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.smax.v8i8(<8 x i8> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.smax.v2i8(<2 x i8> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.smax.v4i8(<4 x i8> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.smax.v8i8(<8 x i8> undef)
; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.smax.v16i8(<16 x i8> undef)
; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.smax.v32i8(<32 x i8> undef)
; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.smax.v64i8(<64 x i8> undef)
; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
;
; AVX512F-LABEL: 'reduce_i8'
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.smax.v2i8(<2 x i8> undef)
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.smax.v4i8(<4 x i8> undef)
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.smax.v8i8(<8 x i8> undef)
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.smax.v2i8(<2 x i8> undef)
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.smax.v4i8(<4 x i8> undef)
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.smax.v8i8(<8 x i8> undef)
; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.smax.v16i8(<16 x i8> undef)
; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.smax.v32i8(<32 x i8> undef)
; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.smax.v64i8(<64 x i8> undef)
; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
;
; AVX512BW-LABEL: 'reduce_i8'
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.smax.v2i8(<2 x i8> undef)
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.smax.v4i8(<4 x i8> undef)
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.smax.v8i8(<8 x i8> undef)
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.smax.v2i8(<2 x i8> undef)
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.smax.v4i8(<4 x i8> undef)
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.smax.v8i8(<8 x i8> undef)
; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.smax.v16i8(<16 x i8> undef)
; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.smax.v32i8(<32 x i8> undef)
; AVX512BW-NEXT: Cost Model: Found an estimated cost of 61 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.smax.v64i8(<64 x i8> undef)
; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
;
; AVX512DQ-LABEL: 'reduce_i8'
-; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.smax.v2i8(<2 x i8> undef)
-; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.smax.v4i8(<4 x i8> undef)
-; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.smax.v8i8(<8 x i8> undef)
+; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.smax.v2i8(<2 x i8> undef)
+; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.smax.v4i8(<4 x i8> undef)
+; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.smax.v8i8(<8 x i8> undef)
; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.smax.v16i8(<16 x i8> undef)
; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.smax.v32i8(<32 x i8> undef)
; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.smax.v64i8(<64 x i8> undef)
; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
;
; SSE42-LABEL: 'reduce_i32'
-; SSE42-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.smin.v2i32(<2 x i32> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.smin.v2i32(<2 x i32> undef)
; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.smin.v4i32(<4 x i32> undef)
; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.smin.v8i32(<8 x i32> undef)
; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.smin.v16i32(<16 x i32> undef)
; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
;
; AVX1-LABEL: 'reduce_i32'
-; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.smin.v2i32(<2 x i32> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.smin.v2i32(<2 x i32> undef)
; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.smin.v4i32(<4 x i32> undef)
; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.smin.v8i32(<8 x i32> undef)
; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.smin.v16i32(<16 x i32> undef)
; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
;
; AVX2-LABEL: 'reduce_i32'
-; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.smin.v2i32(<2 x i32> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.smin.v2i32(<2 x i32> undef)
; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.smin.v4i32(<4 x i32> undef)
; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.smin.v8i32(<8 x i32> undef)
; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.smin.v16i32(<16 x i32> undef)
; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
;
; AVX512-LABEL: 'reduce_i32'
-; AVX512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.smin.v2i32(<2 x i32> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.smin.v2i32(<2 x i32> undef)
; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.smin.v4i32(<4 x i32> undef)
; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.smin.v8i32(<8 x i32> undef)
; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.smin.v16i32(<16 x i32> undef)
define i32 @reduce_i16(i32 %arg) {
; SSE2-LABEL: 'reduce_i16'
-; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.smin.v2i16(<2 x i16> undef)
-; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.smin.v4i16(<4 x i16> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.smin.v2i16(<2 x i16> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.smin.v4i16(<4 x i16> undef)
; SSE2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.smin.v8i16(<8 x i16> undef)
; SSE2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.smin.v16i16(<16 x i16> undef)
; SSE2-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.smin.v32i16(<32 x i16> undef)
; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
;
; SSSE3-LABEL: 'reduce_i16'
-; SSSE3-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.smin.v2i16(<2 x i16> undef)
-; SSSE3-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.smin.v4i16(<4 x i16> undef)
+; SSSE3-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.smin.v2i16(<2 x i16> undef)
+; SSSE3-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.smin.v4i16(<4 x i16> undef)
; SSSE3-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.smin.v8i16(<8 x i16> undef)
; SSSE3-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.smin.v16i16(<16 x i16> undef)
; SSSE3-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.smin.v32i16(<32 x i16> undef)
; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
;
; SSE42-LABEL: 'reduce_i16'
-; SSE42-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.smin.v2i16(<2 x i16> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.smin.v2i16(<2 x i16> undef)
; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.smin.v4i16(<4 x i16> undef)
; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.smin.v8i16(<8 x i16> undef)
; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.smin.v16i16(<16 x i16> undef)
; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
;
; AVX1-LABEL: 'reduce_i16'
-; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.smin.v2i16(<2 x i16> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.smin.v2i16(<2 x i16> undef)
; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.smin.v4i16(<4 x i16> undef)
; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.smin.v8i16(<8 x i16> undef)
; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.smin.v16i16(<16 x i16> undef)
; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
;
; AVX2-LABEL: 'reduce_i16'
-; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.smin.v2i16(<2 x i16> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.smin.v2i16(<2 x i16> undef)
; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.smin.v4i16(<4 x i16> undef)
; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.smin.v8i16(<8 x i16> undef)
; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.smin.v16i16(<16 x i16> undef)
; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
;
; AVX512F-LABEL: 'reduce_i16'
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.smin.v2i16(<2 x i16> undef)
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.smin.v2i16(<2 x i16> undef)
; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.smin.v4i16(<4 x i16> undef)
; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.smin.v8i16(<8 x i16> undef)
; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.smin.v16i16(<16 x i16> undef)
; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
;
; AVX512BW-LABEL: 'reduce_i16'
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.smin.v2i16(<2 x i16> undef)
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.smin.v2i16(<2 x i16> undef)
; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.smin.v4i16(<4 x i16> undef)
; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.smin.v8i16(<8 x i16> undef)
; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.smin.v16i16(<16 x i16> undef)
; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
;
; AVX512DQ-LABEL: 'reduce_i16'
-; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.smin.v2i16(<2 x i16> undef)
+; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.smin.v2i16(<2 x i16> undef)
; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.smin.v4i16(<4 x i16> undef)
; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.smin.v8i16(<8 x i16> undef)
; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.smin.v16i16(<16 x i16> undef)
define i32 @reduce_i8(i32 %arg) {
; SSE2-LABEL: 'reduce_i8'
-; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.smin.v2i8(<2 x i8> undef)
-; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.smin.v4i8(<4 x i8> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.smin.v2i8(<2 x i8> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.smin.v4i8(<4 x i8> undef)
; SSE2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.smin.v8i8(<8 x i8> undef)
; SSE2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.smin.v16i8(<16 x i8> undef)
; SSE2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.smin.v32i8(<32 x i8> undef)
; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
;
; SSSE3-LABEL: 'reduce_i8'
-; SSSE3-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.smin.v2i8(<2 x i8> undef)
-; SSSE3-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.smin.v4i8(<4 x i8> undef)
+; SSSE3-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.smin.v2i8(<2 x i8> undef)
+; SSSE3-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.smin.v4i8(<4 x i8> undef)
; SSSE3-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.smin.v8i8(<8 x i8> undef)
; SSSE3-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.smin.v16i8(<16 x i8> undef)
; SSSE3-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.smin.v32i8(<32 x i8> undef)
; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
;
; SSE42-LABEL: 'reduce_i8'
-; SSE42-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.smin.v2i8(<2 x i8> undef)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.smin.v4i8(<4 x i8> undef)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.smin.v8i8(<8 x i8> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.smin.v2i8(<2 x i8> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.smin.v4i8(<4 x i8> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.smin.v8i8(<8 x i8> undef)
; SSE42-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.smin.v16i8(<16 x i8> undef)
; SSE42-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.smin.v32i8(<32 x i8> undef)
; SSE42-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.smin.v64i8(<64 x i8> undef)
; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
;
; AVX1-LABEL: 'reduce_i8'
-; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.smin.v2i8(<2 x i8> undef)
-; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.smin.v4i8(<4 x i8> undef)
-; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.smin.v8i8(<8 x i8> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.smin.v2i8(<2 x i8> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.smin.v4i8(<4 x i8> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.smin.v8i8(<8 x i8> undef)
; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.smin.v16i8(<16 x i8> undef)
; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.smin.v32i8(<32 x i8> undef)
; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.smin.v64i8(<64 x i8> undef)
; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
;
; AVX2-LABEL: 'reduce_i8'
-; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.smin.v2i8(<2 x i8> undef)
-; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.smin.v4i8(<4 x i8> undef)
-; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.smin.v8i8(<8 x i8> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.smin.v2i8(<2 x i8> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.smin.v4i8(<4 x i8> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.smin.v8i8(<8 x i8> undef)
; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.smin.v16i8(<16 x i8> undef)
; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.smin.v32i8(<32 x i8> undef)
; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.smin.v64i8(<64 x i8> undef)
; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
;
; AVX512F-LABEL: 'reduce_i8'
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.smin.v2i8(<2 x i8> undef)
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.smin.v4i8(<4 x i8> undef)
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.smin.v8i8(<8 x i8> undef)
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.smin.v2i8(<2 x i8> undef)
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.smin.v4i8(<4 x i8> undef)
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.smin.v8i8(<8 x i8> undef)
; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.smin.v16i8(<16 x i8> undef)
; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.smin.v32i8(<32 x i8> undef)
; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.smin.v64i8(<64 x i8> undef)
; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
;
; AVX512BW-LABEL: 'reduce_i8'
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.smin.v2i8(<2 x i8> undef)
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.smin.v4i8(<4 x i8> undef)
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.smin.v8i8(<8 x i8> undef)
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.smin.v2i8(<2 x i8> undef)
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.smin.v4i8(<4 x i8> undef)
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.smin.v8i8(<8 x i8> undef)
; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.smin.v16i8(<16 x i8> undef)
; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.smin.v32i8(<32 x i8> undef)
; AVX512BW-NEXT: Cost Model: Found an estimated cost of 61 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.smin.v64i8(<64 x i8> undef)
; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
;
; AVX512DQ-LABEL: 'reduce_i8'
-; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.smin.v2i8(<2 x i8> undef)
-; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.smin.v4i8(<4 x i8> undef)
-; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.smin.v8i8(<8 x i8> undef)
+; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.smin.v2i8(<2 x i8> undef)
+; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.smin.v4i8(<4 x i8> undef)
+; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.smin.v8i8(<8 x i8> undef)
; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.smin.v16i8(<16 x i8> undef)
; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.smin.v32i8(<32 x i8> undef)
; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.smin.v64i8(<64 x i8> undef)
; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
;
; SSE42-LABEL: 'reduce_i32'
-; SSE42-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.umax.v2i32(<2 x i32> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.umax.v2i32(<2 x i32> undef)
; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.umax.v4i32(<4 x i32> undef)
; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.umax.v8i32(<8 x i32> undef)
; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.umax.v16i32(<16 x i32> undef)
; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
;
; AVX1-LABEL: 'reduce_i32'
-; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.umax.v2i32(<2 x i32> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.umax.v2i32(<2 x i32> undef)
; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.umax.v4i32(<4 x i32> undef)
; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.umax.v8i32(<8 x i32> undef)
; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.umax.v16i32(<16 x i32> undef)
; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
;
; AVX2-LABEL: 'reduce_i32'
-; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.umax.v2i32(<2 x i32> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.umax.v2i32(<2 x i32> undef)
; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.umax.v4i32(<4 x i32> undef)
; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.umax.v8i32(<8 x i32> undef)
; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.umax.v16i32(<16 x i32> undef)
; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
;
; AVX512-LABEL: 'reduce_i32'
-; AVX512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.umax.v2i32(<2 x i32> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.umax.v2i32(<2 x i32> undef)
; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.umax.v4i32(<4 x i32> undef)
; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.umax.v8i32(<8 x i32> undef)
; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.umax.v16i32(<16 x i32> undef)
define i32 @reduce_i16(i32 %arg) {
; SSE2-LABEL: 'reduce_i16'
-; SSE2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.umax.v2i16(<2 x i16> undef)
-; SSE2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.umax.v4i16(<4 x i16> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.umax.v2i16(<2 x i16> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.umax.v4i16(<4 x i16> undef)
; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.umax.v8i16(<8 x i16> undef)
; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.umax.v16i16(<16 x i16> undef)
; SSE2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.umax.v32i16(<32 x i16> undef)
; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
;
; SSSE3-LABEL: 'reduce_i16'
-; SSSE3-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.umax.v2i16(<2 x i16> undef)
-; SSSE3-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.umax.v4i16(<4 x i16> undef)
+; SSSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.umax.v2i16(<2 x i16> undef)
+; SSSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.umax.v4i16(<4 x i16> undef)
; SSSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.umax.v8i16(<8 x i16> undef)
; SSSE3-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.umax.v16i16(<16 x i16> undef)
; SSSE3-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.umax.v32i16(<32 x i16> undef)
; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
;
; SSE42-LABEL: 'reduce_i16'
-; SSE42-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.umax.v2i16(<2 x i16> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.umax.v2i16(<2 x i16> undef)
; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.umax.v4i16(<4 x i16> undef)
; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.umax.v8i16(<8 x i16> undef)
; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.umax.v16i16(<16 x i16> undef)
; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
;
; AVX1-LABEL: 'reduce_i16'
-; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.umax.v2i16(<2 x i16> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.umax.v2i16(<2 x i16> undef)
; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.umax.v4i16(<4 x i16> undef)
; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.umax.v8i16(<8 x i16> undef)
; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.umax.v16i16(<16 x i16> undef)
; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
;
; AVX2-LABEL: 'reduce_i16'
-; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.umax.v2i16(<2 x i16> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.umax.v2i16(<2 x i16> undef)
; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.umax.v4i16(<4 x i16> undef)
; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.umax.v8i16(<8 x i16> undef)
; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.umax.v16i16(<16 x i16> undef)
; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
;
; AVX512F-LABEL: 'reduce_i16'
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.umax.v2i16(<2 x i16> undef)
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.umax.v2i16(<2 x i16> undef)
; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.umax.v4i16(<4 x i16> undef)
; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.umax.v8i16(<8 x i16> undef)
; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.umax.v16i16(<16 x i16> undef)
; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
;
; AVX512BW-LABEL: 'reduce_i16'
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.umax.v2i16(<2 x i16> undef)
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.umax.v2i16(<2 x i16> undef)
; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.umax.v4i16(<4 x i16> undef)
; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.umax.v8i16(<8 x i16> undef)
; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.umax.v16i16(<16 x i16> undef)
; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
;
; AVX512DQ-LABEL: 'reduce_i16'
-; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.umax.v2i16(<2 x i16> undef)
+; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.umax.v2i16(<2 x i16> undef)
; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.umax.v4i16(<4 x i16> undef)
; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.umax.v8i16(<8 x i16> undef)
; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.umax.v16i16(<16 x i16> undef)
define i32 @reduce_i8(i32 %arg) {
; SSE2-LABEL: 'reduce_i8'
-; SSE2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.umax.v2i8(<2 x i8> undef)
-; SSE2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.umax.v4i8(<4 x i8> undef)
-; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.umax.v8i8(<8 x i8> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.umax.v2i8(<2 x i8> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.umax.v4i8(<4 x i8> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.umax.v8i8(<8 x i8> undef)
; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.umax.v16i8(<16 x i8> undef)
; SSE2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.umax.v32i8(<32 x i8> undef)
; SSE2-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.umax.v64i8(<64 x i8> undef)
; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
;
; SSSE3-LABEL: 'reduce_i8'
-; SSSE3-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.umax.v2i8(<2 x i8> undef)
-; SSSE3-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.umax.v4i8(<4 x i8> undef)
-; SSSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.umax.v8i8(<8 x i8> undef)
+; SSSE3-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.umax.v2i8(<2 x i8> undef)
+; SSSE3-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.umax.v4i8(<4 x i8> undef)
+; SSSE3-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.umax.v8i8(<8 x i8> undef)
; SSSE3-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.umax.v16i8(<16 x i8> undef)
; SSSE3-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.umax.v32i8(<32 x i8> undef)
; SSSE3-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.umax.v64i8(<64 x i8> undef)
; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
;
; SSE42-LABEL: 'reduce_i8'
-; SSE42-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.umax.v2i8(<2 x i8> undef)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.umax.v4i8(<4 x i8> undef)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.umax.v8i8(<8 x i8> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.umax.v2i8(<2 x i8> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.umax.v4i8(<4 x i8> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.umax.v8i8(<8 x i8> undef)
; SSE42-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.umax.v16i8(<16 x i8> undef)
; SSE42-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.umax.v32i8(<32 x i8> undef)
; SSE42-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.umax.v64i8(<64 x i8> undef)
; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
;
; AVX1-LABEL: 'reduce_i8'
-; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.umax.v2i8(<2 x i8> undef)
-; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.umax.v4i8(<4 x i8> undef)
-; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.umax.v8i8(<8 x i8> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.umax.v2i8(<2 x i8> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.umax.v4i8(<4 x i8> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.umax.v8i8(<8 x i8> undef)
; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.umax.v16i8(<16 x i8> undef)
; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.umax.v32i8(<32 x i8> undef)
; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.umax.v64i8(<64 x i8> undef)
; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
;
; AVX2-LABEL: 'reduce_i8'
-; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.umax.v2i8(<2 x i8> undef)
-; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.umax.v4i8(<4 x i8> undef)
-; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.umax.v8i8(<8 x i8> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.umax.v2i8(<2 x i8> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.umax.v4i8(<4 x i8> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.umax.v8i8(<8 x i8> undef)
; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.umax.v16i8(<16 x i8> undef)
; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.umax.v32i8(<32 x i8> undef)
; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.umax.v64i8(<64 x i8> undef)
; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
;
; AVX512F-LABEL: 'reduce_i8'
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.umax.v2i8(<2 x i8> undef)
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.umax.v4i8(<4 x i8> undef)
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.umax.v8i8(<8 x i8> undef)
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.umax.v2i8(<2 x i8> undef)
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.umax.v4i8(<4 x i8> undef)
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.umax.v8i8(<8 x i8> undef)
; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.umax.v16i8(<16 x i8> undef)
; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.umax.v32i8(<32 x i8> undef)
; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.umax.v64i8(<64 x i8> undef)
; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
;
; AVX512BW-LABEL: 'reduce_i8'
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.umax.v2i8(<2 x i8> undef)
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.umax.v4i8(<4 x i8> undef)
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.umax.v8i8(<8 x i8> undef)
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.umax.v2i8(<2 x i8> undef)
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.umax.v4i8(<4 x i8> undef)
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.umax.v8i8(<8 x i8> undef)
; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.umax.v16i8(<16 x i8> undef)
; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.umax.v32i8(<32 x i8> undef)
; AVX512BW-NEXT: Cost Model: Found an estimated cost of 61 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.umax.v64i8(<64 x i8> undef)
; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
;
; AVX512DQ-LABEL: 'reduce_i8'
-; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.umax.v2i8(<2 x i8> undef)
-; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.umax.v4i8(<4 x i8> undef)
-; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.umax.v8i8(<8 x i8> undef)
+; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.umax.v2i8(<2 x i8> undef)
+; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.umax.v4i8(<4 x i8> undef)
+; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.umax.v8i8(<8 x i8> undef)
; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.umax.v16i8(<16 x i8> undef)
; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.umax.v32i8(<32 x i8> undef)
; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.umax.v64i8(<64 x i8> undef)
; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
;
; SSE42-LABEL: 'reduce_i32'
-; SSE42-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.umin.v2i32(<2 x i32> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.umin.v2i32(<2 x i32> undef)
; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.umin.v4i32(<4 x i32> undef)
; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.umin.v8i32(<8 x i32> undef)
; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.umin.v16i32(<16 x i32> undef)
; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
;
; AVX1-LABEL: 'reduce_i32'
-; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.umin.v2i32(<2 x i32> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.umin.v2i32(<2 x i32> undef)
; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.umin.v4i32(<4 x i32> undef)
; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.umin.v8i32(<8 x i32> undef)
; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.umin.v16i32(<16 x i32> undef)
; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
;
; AVX2-LABEL: 'reduce_i32'
-; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.umin.v2i32(<2 x i32> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.umin.v2i32(<2 x i32> undef)
; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.umin.v4i32(<4 x i32> undef)
; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.umin.v8i32(<8 x i32> undef)
; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.umin.v16i32(<16 x i32> undef)
; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
;
; AVX512-LABEL: 'reduce_i32'
-; AVX512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.umin.v2i32(<2 x i32> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.umin.v2i32(<2 x i32> undef)
; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.umin.v4i32(<4 x i32> undef)
; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.umin.v8i32(<8 x i32> undef)
; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.umin.v16i32(<16 x i32> undef)
define i32 @reduce_i16(i32 %arg) {
; SSE2-LABEL: 'reduce_i16'
-; SSE2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.umin.v2i16(<2 x i16> undef)
-; SSE2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.umin.v4i16(<4 x i16> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.umin.v2i16(<2 x i16> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.umin.v4i16(<4 x i16> undef)
; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.umin.v8i16(<8 x i16> undef)
; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.umin.v16i16(<16 x i16> undef)
; SSE2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.umin.v32i16(<32 x i16> undef)
; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
;
; SSSE3-LABEL: 'reduce_i16'
-; SSSE3-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.umin.v2i16(<2 x i16> undef)
-; SSSE3-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.umin.v4i16(<4 x i16> undef)
+; SSSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.umin.v2i16(<2 x i16> undef)
+; SSSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.umin.v4i16(<4 x i16> undef)
; SSSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.umin.v8i16(<8 x i16> undef)
; SSSE3-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.umin.v16i16(<16 x i16> undef)
; SSSE3-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.umin.v32i16(<32 x i16> undef)
; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
;
; SSE42-LABEL: 'reduce_i16'
-; SSE42-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.umin.v2i16(<2 x i16> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.umin.v2i16(<2 x i16> undef)
; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.umin.v4i16(<4 x i16> undef)
; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.umin.v8i16(<8 x i16> undef)
; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.umin.v16i16(<16 x i16> undef)
; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
;
; AVX1-LABEL: 'reduce_i16'
-; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.umin.v2i16(<2 x i16> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.umin.v2i16(<2 x i16> undef)
; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.umin.v4i16(<4 x i16> undef)
; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.umin.v8i16(<8 x i16> undef)
; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.umin.v16i16(<16 x i16> undef)
; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
;
; AVX2-LABEL: 'reduce_i16'
-; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.umin.v2i16(<2 x i16> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.umin.v2i16(<2 x i16> undef)
; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.umin.v4i16(<4 x i16> undef)
; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.umin.v8i16(<8 x i16> undef)
; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.umin.v16i16(<16 x i16> undef)
; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
;
; AVX512F-LABEL: 'reduce_i16'
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.umin.v2i16(<2 x i16> undef)
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.umin.v2i16(<2 x i16> undef)
; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.umin.v4i16(<4 x i16> undef)
; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.umin.v8i16(<8 x i16> undef)
; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.umin.v16i16(<16 x i16> undef)
; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
;
; AVX512BW-LABEL: 'reduce_i16'
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.umin.v2i16(<2 x i16> undef)
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.umin.v2i16(<2 x i16> undef)
; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.umin.v4i16(<4 x i16> undef)
; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.umin.v8i16(<8 x i16> undef)
; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.umin.v16i16(<16 x i16> undef)
; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
;
; AVX512DQ-LABEL: 'reduce_i16'
-; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.umin.v2i16(<2 x i16> undef)
+; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.umin.v2i16(<2 x i16> undef)
; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.umin.v4i16(<4 x i16> undef)
; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.umin.v8i16(<8 x i16> undef)
; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.umin.v16i16(<16 x i16> undef)
define i32 @reduce_i8(i32 %arg) {
; SSE2-LABEL: 'reduce_i8'
-; SSE2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.umin.v2i8(<2 x i8> undef)
-; SSE2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.umin.v4i8(<4 x i8> undef)
-; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.umin.v8i8(<8 x i8> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.umin.v2i8(<2 x i8> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.umin.v4i8(<4 x i8> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.umin.v8i8(<8 x i8> undef)
; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.umin.v16i8(<16 x i8> undef)
; SSE2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.umin.v32i8(<32 x i8> undef)
; SSE2-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.umin.v64i8(<64 x i8> undef)
; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
;
; SSSE3-LABEL: 'reduce_i8'
-; SSSE3-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.umin.v2i8(<2 x i8> undef)
-; SSSE3-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.umin.v4i8(<4 x i8> undef)
-; SSSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.umin.v8i8(<8 x i8> undef)
+; SSSE3-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.umin.v2i8(<2 x i8> undef)
+; SSSE3-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.umin.v4i8(<4 x i8> undef)
+; SSSE3-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.umin.v8i8(<8 x i8> undef)
; SSSE3-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.umin.v16i8(<16 x i8> undef)
; SSSE3-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.umin.v32i8(<32 x i8> undef)
; SSSE3-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.umin.v64i8(<64 x i8> undef)
; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
;
; SSE42-LABEL: 'reduce_i8'
-; SSE42-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.umin.v2i8(<2 x i8> undef)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.umin.v4i8(<4 x i8> undef)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.umin.v8i8(<8 x i8> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.umin.v2i8(<2 x i8> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.umin.v4i8(<4 x i8> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.umin.v8i8(<8 x i8> undef)
; SSE42-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.umin.v16i8(<16 x i8> undef)
; SSE42-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.umin.v32i8(<32 x i8> undef)
; SSE42-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.umin.v64i8(<64 x i8> undef)
; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
;
; AVX1-LABEL: 'reduce_i8'
-; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.umin.v2i8(<2 x i8> undef)
-; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.umin.v4i8(<4 x i8> undef)
-; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.umin.v8i8(<8 x i8> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.umin.v2i8(<2 x i8> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.umin.v4i8(<4 x i8> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.umin.v8i8(<8 x i8> undef)
; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.umin.v16i8(<16 x i8> undef)
; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.umin.v32i8(<32 x i8> undef)
; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.umin.v64i8(<64 x i8> undef)
; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
;
; AVX2-LABEL: 'reduce_i8'
-; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.umin.v2i8(<2 x i8> undef)
-; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.umin.v4i8(<4 x i8> undef)
-; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.umin.v8i8(<8 x i8> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.umin.v2i8(<2 x i8> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.umin.v4i8(<4 x i8> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.umin.v8i8(<8 x i8> undef)
; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.umin.v16i8(<16 x i8> undef)
; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.umin.v32i8(<32 x i8> undef)
; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.umin.v64i8(<64 x i8> undef)
; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
;
; AVX512F-LABEL: 'reduce_i8'
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.umin.v2i8(<2 x i8> undef)
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.umin.v4i8(<4 x i8> undef)
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.umin.v8i8(<8 x i8> undef)
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.umin.v2i8(<2 x i8> undef)
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.umin.v4i8(<4 x i8> undef)
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.umin.v8i8(<8 x i8> undef)
; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.umin.v16i8(<16 x i8> undef)
; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.umin.v32i8(<32 x i8> undef)
; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.umin.v64i8(<64 x i8> undef)
; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
;
; AVX512BW-LABEL: 'reduce_i8'
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.umin.v2i8(<2 x i8> undef)
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.umin.v4i8(<4 x i8> undef)
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.umin.v8i8(<8 x i8> undef)
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.umin.v2i8(<2 x i8> undef)
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.umin.v4i8(<4 x i8> undef)
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.umin.v8i8(<8 x i8> undef)
; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.umin.v16i8(<16 x i8> undef)
; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.umin.v32i8(<32 x i8> undef)
; AVX512BW-NEXT: Cost Model: Found an estimated cost of 61 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.umin.v64i8(<64 x i8> undef)
; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
;
; AVX512DQ-LABEL: 'reduce_i8'
-; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.umin.v2i8(<2 x i8> undef)
-; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.umin.v4i8(<4 x i8> undef)
-; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.umin.v8i8(<8 x i8> undef)
+; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.umin.v2i8(<2 x i8> undef)
+; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.umin.v4i8(<4 x i8> undef)
+; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.umin.v8i8(<8 x i8> undef)
; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.umin.v16i8(<16 x i8> undef)
; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.umin.v32i8(<32 x i8> undef)
; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.umin.v64i8(<64 x i8> undef)
define i32 @reduce_i16(i32 %arg) {
; SSE2-LABEL: 'reduce_i16'
-; SSE2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.xor.v2i16(<2 x i16> undef)
-; SSE2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.xor.v4i16(<4 x i16> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.xor.v2i16(<2 x i16> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.xor.v4i16(<4 x i16> undef)
; SSE2-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.xor.v8i16(<8 x i16> undef)
; SSE2-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.xor.v16i16(<16 x i16> undef)
; SSE2-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.xor.v32i16(<32 x i16> undef)
define i32 @reduce_i8(i32 %arg) {
; SSE2-LABEL: 'reduce_i8'
-; SSE2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.xor.v2i8(<2 x i8> undef)
-; SSE2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.xor.v4i8(<4 x i8> undef)
-; SSE2-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.xor.v8i8(<8 x i8> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.xor.v2i8(<2 x i8> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.xor.v4i8(<4 x i8> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.xor.v8i8(<8 x i8> undef)
; SSE2-NEXT: Cost Model: Found an estimated cost of 45 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.xor.v16i8(<16 x i8> undef)
; SSE2-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.xor.v32i8(<32 x i8> undef)
; SSE2-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.xor.v64i8(<64 x i8> undef)
define void @test_vXi32(<2 x i32> %a64, <2 x i32> %b64, <4 x i32> %a128, <4 x i32> %b128, <8 x i32> %a256, <8 x i32> %b256, <16 x i32> %a512, <16 x i32> %b512) {
; SSE-LABEL: 'test_vXi32'
-; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <2 x i32> %a64, <2 x i32> %b64, <2 x i32> <i32 0, i32 2>
+; SSE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V64 = shufflevector <2 x i32> %a64, <2 x i32> %b64, <2 x i32> <i32 0, i32 2>
; SSE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V128 = shufflevector <4 x i32> %a128, <4 x i32> %b128, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
; SSE-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V256 = shufflevector <8 x i32> %a256, <8 x i32> %b256, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
; SSE-NEXT: Cost Model: Found an estimated cost of 56 for instruction: %V512 = shufflevector <16 x i32> %a512, <16 x i32> %b512, <16 x i32> <i32 0, i32 16, i32 2, i32 18, i32 4, i32 20, i32 6, i32 22, i32 8, i32 24, i32 10, i32 26, i32 12, i32 28, i32 14, i32 30>
; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
;
; AVX1-LABEL: 'test_vXi32'
-; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <2 x i32> %a64, <2 x i32> %b64, <2 x i32> <i32 0, i32 2>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V64 = shufflevector <2 x i32> %a64, <2 x i32> %b64, <2 x i32> <i32 0, i32 2>
; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V128 = shufflevector <4 x i32> %a128, <4 x i32> %b128, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V256 = shufflevector <8 x i32> %a256, <8 x i32> %b256, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
; AVX1-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V512 = shufflevector <16 x i32> %a512, <16 x i32> %b512, <16 x i32> <i32 0, i32 16, i32 2, i32 18, i32 4, i32 20, i32 6, i32 22, i32 8, i32 24, i32 10, i32 26, i32 12, i32 28, i32 14, i32 30>
; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
;
; AVX2-LABEL: 'test_vXi32'
-; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <2 x i32> %a64, <2 x i32> %b64, <2 x i32> <i32 0, i32 2>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V64 = shufflevector <2 x i32> %a64, <2 x i32> %b64, <2 x i32> <i32 0, i32 2>
; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V128 = shufflevector <4 x i32> %a128, <4 x i32> %b128, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V256 = shufflevector <8 x i32> %a256, <8 x i32> %b256, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
; AVX2-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V512 = shufflevector <16 x i32> %a512, <16 x i32> %b512, <16 x i32> <i32 0, i32 16, i32 2, i32 18, i32 4, i32 20, i32 6, i32 22, i32 8, i32 24, i32 10, i32 26, i32 12, i32 28, i32 14, i32 30>
; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
;
; BTVER2-LABEL: 'test_vXi32'
-; BTVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <2 x i32> %a64, <2 x i32> %b64, <2 x i32> <i32 0, i32 2>
+; BTVER2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V64 = shufflevector <2 x i32> %a64, <2 x i32> %b64, <2 x i32> <i32 0, i32 2>
; BTVER2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V128 = shufflevector <4 x i32> %a128, <4 x i32> %b128, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
; BTVER2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V256 = shufflevector <8 x i32> %a256, <8 x i32> %b256, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
; BTVER2-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V512 = shufflevector <16 x i32> %a512, <16 x i32> %b512, <16 x i32> <i32 0, i32 16, i32 2, i32 18, i32 4, i32 20, i32 6, i32 22, i32 8, i32 24, i32 10, i32 26, i32 12, i32 28, i32 14, i32 30>
define i32 @sitofp_i8_double() {
; SSE-LABEL: 'sitofp_i8_double'
; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %cvt_i8_f64 = sitofp i8 undef to double
-; SSE-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %cvt_v2i8_v2f64 = sitofp <2 x i8> undef to <2 x double>
-; SSE-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %cvt_v4i8_v4f64 = sitofp <4 x i8> undef to <4 x double>
-; SSE-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %cvt_v8i8_v8f64 = sitofp <8 x i8> undef to <8 x double>
+; SSE-NEXT: Cost Model: Found an estimated cost of 160 for instruction: %cvt_v2i8_v2f64 = sitofp <2 x i8> undef to <2 x double>
+; SSE-NEXT: Cost Model: Found an estimated cost of 160 for instruction: %cvt_v4i8_v4f64 = sitofp <4 x i8> undef to <4 x double>
+; SSE-NEXT: Cost Model: Found an estimated cost of 160 for instruction: %cvt_v8i8_v8f64 = sitofp <8 x i8> undef to <8 x double>
; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
;
; AVX-LABEL: 'sitofp_i8_double'
define i32 @sitofp_i16_double() {
; SSE-LABEL: 'sitofp_i16_double'
; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %cvt_i16_f64 = sitofp i16 undef to double
-; SSE-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %cvt_v2i16_v2f64 = sitofp <2 x i16> undef to <2 x double>
-; SSE-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %cvt_v4i16_v4f64 = sitofp <4 x i16> undef to <4 x double>
+; SSE-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %cvt_v2i16_v2f64 = sitofp <2 x i16> undef to <2 x double>
+; SSE-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %cvt_v4i16_v4f64 = sitofp <4 x i16> undef to <4 x double>
; SSE-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %cvt_v8i16_v8f64 = sitofp <8 x i16> undef to <8 x double>
; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
;
define i32 @sitofp_i32_double() {
; SSE-LABEL: 'sitofp_i32_double'
; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %cvt_i32_f64 = sitofp i32 undef to double
-; SSE-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %cvt_v2i32_v2f64 = sitofp <2 x i32> undef to <2 x double>
+; SSE-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %cvt_v2i32_v2f64 = sitofp <2 x i32> undef to <2 x double>
; SSE-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %cvt_v4i32_v4f64 = sitofp <4 x i32> undef to <4 x double>
; SSE-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %cvt_v8i32_v8f64 = sitofp <8 x i32> undef to <8 x double>
; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
define i32 @sitofp_i8_float() {
; SSE-LABEL: 'sitofp_i8_float'
; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %cvt_i8_f32 = sitofp i8 undef to float
-; SSE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %cvt_v4i8_v4f32 = sitofp <4 x i8> undef to <4 x float>
-; SSE-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %cvt_v8i8_v8f32 = sitofp <8 x i8> undef to <8 x float>
+; SSE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %cvt_v4i8_v4f32 = sitofp <4 x i8> undef to <4 x float>
+; SSE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %cvt_v8i8_v8f32 = sitofp <8 x i8> undef to <8 x float>
; SSE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %cvt_v16i8_v16f32 = sitofp <16 x i8> undef to <16 x float>
; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
;
define i32 @sitofp_i16_float() {
; SSE-LABEL: 'sitofp_i16_float'
; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %cvt_i16_f32 = sitofp i16 undef to float
-; SSE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %cvt_v4i16_v4f32 = sitofp <4 x i16> undef to <4 x float>
+; SSE-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %cvt_v4i16_v4f32 = sitofp <4 x i16> undef to <4 x float>
; SSE-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %cvt_v8i16_v8f32 = sitofp <8 x i16> undef to <8 x float>
; SSE-NEXT: Cost Model: Found an estimated cost of 30 for instruction: %cvt_v16i16_v16f32 = sitofp <16 x i16> undef to <16 x float>
; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
define <2 x i8> @slm-costs_8_v2_mul(<2 x i8> %a, <2 x i8> %b) {
; SLM-LABEL: 'slm-costs_8_v2_mul'
-; SLM-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %res = mul nsw <2 x i8> %a, %b
+; SLM-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %res = mul nsw <2 x i8> %a, %b
; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <2 x i8> %res
;
; GLM-LABEL: 'slm-costs_8_v2_mul'
-; GLM-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %res = mul nsw <2 x i8> %a, %b
+; GLM-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %res = mul nsw <2 x i8> %a, %b
; GLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <2 x i8> %res
;
entry:
define <4 x i8> @slm-costs_8_v4_mul(<4 x i8> %a, <4 x i8> %b) {
; SLM-LABEL: 'slm-costs_8_v4_mul'
-; SLM-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %res = mul nsw <4 x i8> %a, %b
+; SLM-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %res = mul nsw <4 x i8> %a, %b
; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i8> %res
;
; GLM-LABEL: 'slm-costs_8_v4_mul'
-; GLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %res = mul nsw <4 x i8> %a, %b
+; GLM-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %res = mul nsw <4 x i8> %a, %b
; GLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i8> %res
;
entry:
define <8 x i8> @slm-costs_8_v8_mul(<8 x i8> %a, <8 x i8> %b) {
; SLM-LABEL: 'slm-costs_8_v8_mul'
-; SLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %res = mul nsw <8 x i8> %a, %b
+; SLM-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %res = mul nsw <8 x i8> %a, %b
; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i8> %res
;
; GLM-LABEL: 'slm-costs_8_v8_mul'
-; GLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res = mul nsw <8 x i8> %a, %b
+; GLM-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %res = mul nsw <8 x i8> %a, %b
; GLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i8> %res
;
entry:
define <2 x i16> @slm-costs_16_v2_mul(<2 x i16> %a, <2 x i16> %b) {
; SLM-LABEL: 'slm-costs_16_v2_mul'
-; SLM-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %res = mul nsw <2 x i16> %a, %b
+; SLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %res = mul nsw <2 x i16> %a, %b
; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <2 x i16> %res
;
; GLM-LABEL: 'slm-costs_16_v2_mul'
-; GLM-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %res = mul nsw <2 x i16> %a, %b
+; GLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res = mul nsw <2 x i16> %a, %b
; GLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <2 x i16> %res
;
entry:
define <4 x i16> @slm-costs_16_v4_mul(<4 x i16> %a, <4 x i16> %b) {
; SLM-LABEL: 'slm-costs_16_v4_mul'
-; SLM-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %res = mul nsw <4 x i16> %a, %b
+; SLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %res = mul nsw <4 x i16> %a, %b
; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i16> %res
;
; GLM-LABEL: 'slm-costs_16_v4_mul'
-; GLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %res = mul nsw <4 x i16> %a, %b
+; GLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res = mul nsw <4 x i16> %a, %b
; GLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i16> %res
;
entry:
define <2 x i32> @slm-costs_32_v2_mul(<2 x i32> %a, <2 x i32> %b) {
; SLM-LABEL: 'slm-costs_32_v2_mul'
-; SLM-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %res = mul nsw <2 x i32> %a, %b
+; SLM-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %res = mul nsw <2 x i32> %a, %b
; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <2 x i32> %res
;
; GLM-LABEL: 'slm-costs_32_v2_mul'
-; GLM-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %res = mul nsw <2 x i32> %a, %b
+; GLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %res = mul nsw <2 x i32> %a, %b
; GLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <2 x i32> %res
;
entry:
define %shifttype @shift2i16(%shifttype %a, %shifttype %b) {
entry:
; SSE2-LABEL: shift2i16
- ; SSE2: cost of 12 {{.*}} ashr
+ ; SSE2: cost of 32 {{.*}} ashr
; SSE2-CODEGEN-LABEL: shift2i16
- ; SSE2-CODEGEN: psrlq
+ ; SSE2-CODEGEN: psraw
%0 = ashr %shifttype %a , %b
ret %shifttype %0
define %shifttype4i16 @shift4i16(%shifttype4i16 %a, %shifttype4i16 %b) {
entry:
; SSE2-LABEL: shift4i16
- ; SSE2: cost of 16 {{.*}} ashr
+ ; SSE2: cost of 32 {{.*}} ashr
; SSE2-CODEGEN-LABEL: shift4i16
- ; SSE2-CODEGEN: psrad
+ ; SSE2-CODEGEN: psraw
%0 = ashr %shifttype4i16 %a , %b
ret %shifttype4i16 %0
define %shifttype2i32 @shift2i32(%shifttype2i32 %a, %shifttype2i32 %b) {
entry:
; SSE2-LABEL: shift2i32
- ; SSE2: cost of 12 {{.*}} ashr
+ ; SSE2: cost of 16 {{.*}} ashr
; SSE2-CODEGEN-LABEL: shift2i32
- ; SSE2-CODEGEN: psrlq
+ ; SSE2-CODEGEN: psrad
%0 = ashr %shifttype2i32 %a , %b
ret %shifttype2i32 %0
define %shifttype2i8 @shift2i8(%shifttype2i8 %a, %shifttype2i8 %b) {
entry:
; SSE2-LABEL: shift2i8
- ; SSE2: cost of 12 {{.*}} ashr
+ ; SSE2: cost of 54 {{.*}} ashr
; SSE2-CODEGEN-LABEL: shift2i8
- ; SSE2-CODEGEN: psrlq
+ ; SSE2-CODEGEN: psrlw
%0 = ashr %shifttype2i8 %a , %b
ret %shifttype2i8 %0
define %shifttype4i8 @shift4i8(%shifttype4i8 %a, %shifttype4i8 %b) {
entry:
; SSE2-LABEL: shift4i8
- ; SSE2: cost of 16 {{.*}} ashr
+ ; SSE2: cost of 54 {{.*}} ashr
; SSE2-CODEGEN-LABEL: shift4i8
- ; SSE2-CODEGEN: psrad
+ ; SSE2-CODEGEN: psraw
%0 = ashr %shifttype4i8 %a , %b
ret %shifttype4i8 %0
define %shifttype8i8 @shift8i8(%shifttype8i8 %a, %shifttype8i8 %b) {
entry:
; SSE2-LABEL: shift8i8
- ; SSE2: cost of 32 {{.*}} ashr
+ ; SSE2: cost of 54 {{.*}} ashr
; SSE2-CODEGEN-LABEL: shift8i8
; SSE2-CODEGEN: psraw
define %shifttypec @shift2i16const(%shifttypec %a, %shifttypec %b) {
entry:
; SSE2-LABEL: shift2i16const
- ; SSE2: cost of 4 {{.*}} ashr
+ ; SSE2: cost of 1 {{.*}} ashr
; SSE2-CODEGEN-LABEL: shift2i16const
- ; SSE2-CODEGEN: psrad $3
+ ; SSE2-CODEGEN: psraw $3
%0 = ashr %shifttypec %a , <i16 3, i16 3>
ret %shifttypec %0
; SSE2-LABEL: shift4i16const
; SSE2: cost of 1 {{.*}} ashr
; SSE2-CODEGEN-LABEL: shift4i16const
- ; SSE2-CODEGEN: psrad $19
+ ; SSE2-CODEGEN: psraw $3
%0 = ashr %shifttypec4i16 %a , <i16 3, i16 3, i16 3, i16 3>
ret %shifttypec4i16 %0
define %shifttypec2i32 @shift2i32c(%shifttypec2i32 %a, %shifttypec2i32 %b) {
entry:
; SSE2-LABEL: shift2i32c
- ; SSE2: cost of 4 {{.*}} ashr
+ ; SSE2: cost of 1 {{.*}} ashr
; SSE2-CODEGEN-LABEL: shift2i32c
; SSE2-CODEGEN: psrad $3
; SSE2-LABEL: shift2i8c
; SSE2: cost of 4 {{.*}} ashr
; SSE2-CODEGEN-LABEL: shift2i8c
- ; SSE2-CODEGEN: psrad $3
+ ; SSE2-CODEGEN: psrlw $3
%0 = ashr %shifttypec2i8 %a , <i8 3, i8 3>
ret %shifttypec2i8 %0
define %shifttypec4i8 @shift4i8c(%shifttypec4i8 %a, %shifttypec4i8 %b) {
entry:
; SSE2-LABEL: shift4i8c
- ; SSE2: cost of 1 {{.*}} ashr
+ ; SSE2: cost of 4 {{.*}} ashr
; SSE2-CODEGEN-LABEL: shift4i8c
- ; SSE2-CODEGEN: psrad $27
+ ; SSE2-CODEGEN: psrlw $3
%0 = ashr %shifttypec4i8 %a , <i8 3, i8 3, i8 3, i8 3>
ret %shifttypec4i8 %0
define %shifttypec8i8 @shift8i8c(%shifttypec8i8 %a, %shifttypec8i8 %b) {
entry:
; SSE2-LABEL: shift8i8c
- ; SSE2: cost of 1 {{.*}} ashr
+ ; SSE2: cost of 4 {{.*}} ashr
; SSE2-CODEGEN-LABEL: shift8i8c
- ; SSE2-CODEGEN: psraw $11
+ ; SSE2-CODEGEN: psrlw $3
%0 = ashr %shifttypec8i8 %a , <i8 3, i8 3, i8 3, i8 3,
i8 3, i8 3, i8 3, i8 3>
define %shifttype @shift2i16(%shifttype %a, %shifttype %b) {
entry:
; SSE2-LABEL: shift2i16
- ; SSE2: cost of 4 {{.*}} lshr
+ ; SSE2: cost of 32 {{.*}} lshr
; SSE2-CODEGEN-LABEL: shift2i16
- ; SSE2-CODEGEN: psrlq
+ ; SSE2-CODEGEN: psrlw
%0 = lshr %shifttype %a , %b
ret %shifttype %0
define %shifttype4i16 @shift4i16(%shifttype4i16 %a, %shifttype4i16 %b) {
entry:
; SSE2-LABEL: shift4i16
- ; SSE2: cost of 16 {{.*}} lshr
+ ; SSE2: cost of 32 {{.*}} lshr
; SSE2-CODEGEN-LABEL: shift4i16
- ; SSE2-CODEGEN: psrld
+ ; SSE2-CODEGEN: psrlw
%0 = lshr %shifttype4i16 %a , %b
ret %shifttype4i16 %0
define %shifttype2i32 @shift2i32(%shifttype2i32 %a, %shifttype2i32 %b) {
entry:
; SSE2-LABEL: shift2i32
- ; SSE2: cost of 4 {{.*}} lshr
+ ; SSE2: cost of 16 {{.*}} lshr
; SSE2-CODEGEN-LABEL: shift2i32
- ; SSE2-CODEGEN: psrlq
+ ; SSE2-CODEGEN: psrld
%0 = lshr %shifttype2i32 %a , %b
ret %shifttype2i32 %0
define %shifttype2i8 @shift2i8(%shifttype2i8 %a, %shifttype2i8 %b) {
entry:
; SSE2-LABEL: shift2i8
- ; SSE2: cost of 4 {{.*}} lshr
+ ; SSE2: cost of 26 {{.*}} lshr
; SSE2-CODEGEN-LABEL: shift2i8
- ; SSE2-CODEGEN: psrlq
+ ; SSE2-CODEGEN: psrlw
%0 = lshr %shifttype2i8 %a , %b
ret %shifttype2i8 %0
define %shifttype4i8 @shift4i8(%shifttype4i8 %a, %shifttype4i8 %b) {
entry:
; SSE2-LABEL: shift4i8
- ; SSE2: cost of 16 {{.*}} lshr
+ ; SSE2: cost of 26 {{.*}} lshr
; SSE2-CODEGEN-LABEL: shift4i8
- ; SSE2-CODEGEN: psrld
+ ; SSE2-CODEGEN: psrlw
%0 = lshr %shifttype4i8 %a , %b
ret %shifttype4i8 %0
define %shifttype8i8 @shift8i8(%shifttype8i8 %a, %shifttype8i8 %b) {
entry:
; SSE2-LABEL: shift8i8
- ; SSE2: cost of 32 {{.*}} lshr
+ ; SSE2: cost of 26 {{.*}} lshr
; SSE2-CODEGEN-LABEL: shift8i8
; SSE2-CODEGEN: psrlw
; SSE2-LABEL: shift2i16const
; SSE2: cost of 1 {{.*}} lshr
; SSE2-CODEGEN-LABEL: shift2i16const
- ; SSE2-CODEGEN: psrlq $3
+ ; SSE2-CODEGEN: psrlw $3
%0 = lshr %shifttypec %a , <i16 3, i16 3>
ret %shifttypec %0
; SSE2-LABEL: shift4i16const
; SSE2: cost of 1 {{.*}} lshr
; SSE2-CODEGEN-LABEL: shift4i16const
- ; SSE2-CODEGEN: psrld $3
+ ; SSE2-CODEGEN: psrlw $3
%0 = lshr %shifttypec4i16 %a , <i16 3, i16 3, i16 3, i16 3>
ret %shifttypec4i16 %0
; SSE2-LABEL: shift2i32c
; SSE2: cost of 1 {{.*}} lshr
; SSE2-CODEGEN-LABEL: shift2i32c
- ; SSE2-CODEGEN: psrlq $3
+ ; SSE2-CODEGEN: psrld $3
%0 = lshr %shifttypec2i32 %a , <i32 3, i32 3>
ret %shifttypec2i32 %0
define %shifttypec2i8 @shift2i8c(%shifttypec2i8 %a, %shifttypec2i8 %b) {
entry:
; SSE2-LABEL: shift2i8c
- ; SSE2: cost of 1 {{.*}} lshr
+ ; SSE2: cost of 2 {{.*}} lshr
; SSE2-CODEGEN-LABEL: shift2i8c
- ; SSE2-CODEGEN: psrlq $3
+ ; SSE2-CODEGEN: psrlw $3
%0 = lshr %shifttypec2i8 %a , <i8 3, i8 3>
ret %shifttypec2i8 %0
define %shifttypec4i8 @shift4i8c(%shifttypec4i8 %a, %shifttypec4i8 %b) {
entry:
; SSE2-LABEL: shift4i8c
- ; SSE2: cost of 1 {{.*}} lshr
+ ; SSE2: cost of 2 {{.*}} lshr
; SSE2-CODEGEN-LABEL: shift4i8c
- ; SSE2-CODEGEN: psrld $3
+ ; SSE2-CODEGEN: psrlw $3
%0 = lshr %shifttypec4i8 %a , <i8 3, i8 3, i8 3, i8 3>
ret %shifttypec4i8 %0
define %shifttypec8i8 @shift8i8c(%shifttypec8i8 %a, %shifttypec8i8 %b) {
entry:
; SSE2-LABEL: shift8i8c
- ; SSE2: cost of 1 {{.*}} lshr
+ ; SSE2: cost of 2 {{.*}} lshr
; SSE2-CODEGEN-LABEL: shift8i8c
; SSE2-CODEGEN: psrlw $3
define %shifttype @shift2i16(%shifttype %a, %shifttype %b) {
entry:
; SSE2-LABEL: shift2i16
- ; SSE2: cost of 4 {{.*}} shl
+ ; SSE2: cost of 32 {{.*}} shl
; SSE2-CODEGEN-LABEL: shift2i16
- ; SSE2-CODEGEN: psllq
+ ; SSE2-CODEGEN: pmullw
%0 = shl %shifttype %a , %b
ret %shifttype %0
define %shifttype4i16 @shift4i16(%shifttype4i16 %a, %shifttype4i16 %b) {
entry:
; SSE2-LABEL: shift4i16
- ; SSE2: cost of 10 {{.*}} shl
+ ; SSE2: cost of 32 {{.*}} shl
; SSE2-CODEGEN-LABEL: shift4i16
- ; SSE2-CODEGEN: pmuludq
+ ; SSE2-CODEGEN: pmullw
%0 = shl %shifttype4i16 %a , %b
ret %shifttype4i16 %0
define %shifttype2i32 @shift2i32(%shifttype2i32 %a, %shifttype2i32 %b) {
entry:
; SSE2-LABEL: shift2i32
- ; SSE2: cost of 4 {{.*}} shl
+ ; SSE2: cost of 10 {{.*}} shl
; SSE2-CODEGEN-LABEL: shift2i32
- ; SSE2-CODEGEN: psllq
+ ; SSE2-CODEGEN: pmuludq
%0 = shl %shifttype2i32 %a , %b
ret %shifttype2i32 %0
define %shifttype2i8 @shift2i8(%shifttype2i8 %a, %shifttype2i8 %b) {
entry:
; SSE2-LABEL: shift2i8
- ; SSE2: cost of 4 {{.*}} shl
+ ; SSE2: cost of 26 {{.*}} shl
; SSE2-CODEGEN-LABEL: shift2i8
- ; SSE2-CODEGEN: psllq
+ ; SSE2-CODEGEN: psllw
%0 = shl %shifttype2i8 %a , %b
ret %shifttype2i8 %0
define %shifttype4i8 @shift4i8(%shifttype4i8 %a, %shifttype4i8 %b) {
entry:
; SSE2-LABEL: shift4i8
- ; SSE2: cost of 10 {{.*}} shl
+ ; SSE2: cost of 26 {{.*}} shl
; SSE2-CODEGEN-LABEL: shift4i8
- ; SSE2-CODEGEN: pmuludq
+ ; SSE2-CODEGEN: psllw
%0 = shl %shifttype4i8 %a , %b
ret %shifttype4i8 %0
define %shifttype8i8 @shift8i8(%shifttype8i8 %a, %shifttype8i8 %b) {
entry:
; SSE2-LABEL: shift8i8
- ; SSE2: cost of 32 {{.*}} shl
+ ; SSE2: cost of 26 {{.*}} shl
; SSE2-CODEGEN-LABEL: shift8i8
- ; SSE2-CODEGEN: pmullw
+ ; SSE2-CODEGEN: psllw
%0 = shl %shifttype8i8 %a , %b
ret %shifttype8i8 %0
; SSE2-LABEL: shift2i16const
; SSE2: cost of 1 {{.*}} shl
; SSE2-CODEGEN-LABEL: shift2i16const
- ; SSE2-CODEGEN: psllq $3
+ ; SSE2-CODEGEN: psllw $3
%0 = shl %shifttypec %a , <i16 3, i16 3>
ret %shifttypec %0
; SSE2-LABEL: shift4i16const
; SSE2: cost of 1 {{.*}} shl
; SSE2-CODEGEN-LABEL: shift4i16const
- ; SSE2-CODEGEN: pslld $3
+ ; SSE2-CODEGEN: psllw $3
%0 = shl %shifttypec4i16 %a , <i16 3, i16 3, i16 3, i16 3>
ret %shifttypec4i16 %0
; SSE2-LABEL: shift2i32c
; SSE2: cost of 1 {{.*}} shl
; SSE2-CODEGEN-LABEL: shift2i32c
- ; SSE2-CODEGEN: psllq $3
+ ; SSE2-CODEGEN: pslld $3
%0 = shl %shifttypec2i32 %a , <i32 3, i32 3>
ret %shifttypec2i32 %0
define %shifttypec2i8 @shift2i8c(%shifttypec2i8 %a, %shifttypec2i8 %b) {
entry:
; SSE2-LABEL: shift2i8c
- ; SSE2: cost of 1 {{.*}} shl
+ ; SSE2: cost of 2 {{.*}} shl
; SSE2-CODEGEN-LABEL: shift2i8c
- ; SSE2-CODEGEN: psllq $3
+ ; SSE2-CODEGEN: psllw $3
%0 = shl %shifttypec2i8 %a , <i8 3, i8 3>
ret %shifttypec2i8 %0
define %shifttypec4i8 @shift4i8c(%shifttypec4i8 %a, %shifttypec4i8 %b) {
entry:
; SSE2-LABEL: shift4i8c
- ; SSE2: cost of 1 {{.*}} shl
+ ; SSE2: cost of 2 {{.*}} shl
; SSE2-CODEGEN-LABEL: shift4i8c
- ; SSE2-CODEGEN: pslld $3
+ ; SSE2-CODEGEN: psllw $3
%0 = shl %shifttypec4i8 %a , <i8 3, i8 3, i8 3, i8 3>
ret %shifttypec4i8 %0
define %shifttypec8i8 @shift8i8c(%shifttypec8i8 %a, %shifttypec8i8 %b) {
entry:
; SSE2-LABEL: shift8i8c
- ; SSE2: cost of 1 {{.*}} shl
+ ; SSE2: cost of 2 {{.*}} shl
; SSE2-CODEGEN-LABEL: shift8i8c
; SSE2-CODEGEN: psllw $3
define i32 @uitofp_i8_double() {
; SSE-LABEL: 'uitofp_i8_double'
; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %cvt_i8_f64 = uitofp i8 undef to double
-; SSE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %cvt_v2i8_v2f64 = uitofp <2 x i8> undef to <2 x double>
-; SSE-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %cvt_v4i8_v4f64 = uitofp <4 x i8> undef to <4 x double>
-; SSE-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %cvt_v8i8_v8f64 = uitofp <8 x i8> undef to <8 x double>
+; SSE-NEXT: Cost Model: Found an estimated cost of 160 for instruction: %cvt_v2i8_v2f64 = uitofp <2 x i8> undef to <2 x double>
+; SSE-NEXT: Cost Model: Found an estimated cost of 160 for instruction: %cvt_v4i8_v4f64 = uitofp <4 x i8> undef to <4 x double>
+; SSE-NEXT: Cost Model: Found an estimated cost of 160 for instruction: %cvt_v8i8_v8f64 = uitofp <8 x i8> undef to <8 x double>
; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
;
; AVX-LABEL: 'uitofp_i8_double'
define i32 @uitofp_i16_double() {
; SSE-LABEL: 'uitofp_i16_double'
; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %cvt_i16_f64 = uitofp i16 undef to double
-; SSE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %cvt_v2i16_v2f64 = uitofp <2 x i16> undef to <2 x double>
-; SSE-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %cvt_v4i16_v4f64 = uitofp <4 x i16> undef to <4 x double>
+; SSE-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %cvt_v2i16_v2f64 = uitofp <2 x i16> undef to <2 x double>
+; SSE-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %cvt_v4i16_v4f64 = uitofp <4 x i16> undef to <4 x double>
; SSE-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %cvt_v8i16_v8f64 = uitofp <8 x i16> undef to <8 x double>
; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
;
define i32 @uitofp_i32_double() {
; SSE-LABEL: 'uitofp_i32_double'
; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %cvt_i32_f64 = uitofp i32 undef to double
-; SSE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %cvt_v2i32_v2f64 = uitofp <2 x i32> undef to <2 x double>
+; SSE-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %cvt_v2i32_v2f64 = uitofp <2 x i32> undef to <2 x double>
; SSE-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %cvt_v4i32_v4f64 = uitofp <4 x i32> undef to <4 x double>
; SSE-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %cvt_v8i32_v8f64 = uitofp <8 x i32> undef to <8 x double>
; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
; SSE-LABEL: 'uitofp_i8_float'
; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %cvt_i8_f32 = uitofp i8 undef to float
; SSE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %cvt_v4i8_v4f32 = uitofp <4 x i8> undef to <4 x float>
-; SSE-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %cvt_v8i8_v8f32 = uitofp <8 x i8> undef to <8 x float>
+; SSE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %cvt_v8i8_v8f32 = uitofp <8 x i8> undef to <8 x float>
; SSE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %cvt_v16i8_v16f32 = uitofp <16 x i8> undef to <16 x float>
; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
;
define i32 @uitofp_i16_float() {
; SSE-LABEL: 'uitofp_i16_float'
; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %cvt_i16_f32 = uitofp i16 undef to float
-; SSE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %cvt_v4i16_v4f32 = uitofp <4 x i16> undef to <4 x float>
+; SSE-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %cvt_v4i16_v4f32 = uitofp <4 x i16> undef to <4 x float>
; SSE-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %cvt_v8i16_v8f32 = uitofp <8 x i16> undef to <8 x float>
; SSE-NEXT: Cost Model: Found an estimated cost of 30 for instruction: %cvt_v16i16_v16f32 = uitofp <16 x i16> undef to <16 x float>
; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
define <2 x double> @a(<2 x i32> %x) nounwind {
; CHECK-LABEL: a:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; CHECK-NEXT: cvtdq2pd %xmm0, %xmm0
; CHECK-NEXT: retl
entry:
; CHECK-LABEL: b:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: cvttpd2dq %xmm0, %xmm0
-; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3]
; CHECK-NEXT: retl
entry:
%y = fptosi <2 x double> %x to <2 x i32>
; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
; CHECK-NEXT: movl (%eax), %eax
; CHECK-NEXT: shrl %eax
+; CHECK-NEXT: movzwl %ax, %eax
; CHECK-NEXT: movd %eax, %xmm0
; CHECK-NEXT: retl
; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
; CHECK-NEXT: movl (%eax), %eax
; CHECK-NEXT: shrl %eax
-; CHECK-NEXT: movzwl %ax, %eax
+; CHECK-NEXT: movzbl %al, %eax
; CHECK-NEXT: movd %eax, %xmm0
; CHECK-NEXT: retl
define i32 @main() nounwind uwtable {
; CHECK-LABEL: main:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: pmovsxbq {{.*}}(%rip), %xmm0
-; CHECK-NEXT: pmovsxbq {{.*}}(%rip), %xmm1
-; CHECK-NEXT: pextrq $1, %xmm1, %rax
-; CHECK-NEXT: pextrq $1, %xmm0, %rcx
-; CHECK-NEXT: cqto
-; CHECK-NEXT: idivq %rcx
-; CHECK-NEXT: movq %rax, %xmm2
-; CHECK-NEXT: movq %xmm1, %rax
-; CHECK-NEXT: movq %xmm0, %rcx
-; CHECK-NEXT: cqto
-; CHECK-NEXT: idivq %rcx
-; CHECK-NEXT: movq %rax, %xmm0
-; CHECK-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
+; CHECK-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT: pextrb $1, %xmm0, %eax
+; CHECK-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
+; CHECK-NEXT: pextrb $1, %xmm1, %ecx
+; CHECK-NEXT: # kill: def $al killed $al killed $eax
+; CHECK-NEXT: cbtw
+; CHECK-NEXT: idivb %cl
+; CHECK-NEXT: movl %eax, %ecx
+; CHECK-NEXT: pextrb $0, %xmm0, %eax
+; CHECK-NEXT: # kill: def $al killed $al killed $eax
+; CHECK-NEXT: cbtw
+; CHECK-NEXT: pextrb $0, %xmm1, %edx
+; CHECK-NEXT: idivb %dl
+; CHECK-NEXT: movzbl %cl, %ecx
+; CHECK-NEXT: movzbl %al, %eax
+; CHECK-NEXT: movd %eax, %xmm0
+; CHECK-NEXT: pinsrb $1, %ecx, %xmm0
; CHECK-NEXT: pextrw $0, %xmm0, {{.*}}(%rip)
; CHECK-NEXT: xorl %eax, %eax
; CHECK-NEXT: retq
define void @foo8(float* nocapture %RET) nounwind {
; CHECK-LABEL: foo8:
; CHECK: ## %bb.0: ## %allocas
-; CHECK-NEXT: movaps {{.*#+}} xmm0 = [1.0E+2,2.0E+0,1.0E+2,4.0E+0]
-; CHECK-NEXT: movaps {{.*#+}} xmm1 = [1.0E+2,6.0E+0,1.0E+2,8.0E+0]
-; CHECK-NEXT: movups %xmm1, 16(%rdi)
-; CHECK-NEXT: movups %xmm0, (%rdi)
+; CHECK-NEXT: pmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
+; CHECK-NEXT: cvtdq2ps %xmm0, %xmm0
+; CHECK-NEXT: movaps {{.*#+}} xmm1 = [1.0E+2,2.0E+0,1.0E+2,4.0E+0]
+; CHECK-NEXT: movups %xmm1, (%rdi)
+; CHECK-NEXT: movups %xmm0, 16(%rdi)
; CHECK-NEXT: retq
allocas:
%resultvec.i = select <8 x i1> <i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true>, <8 x i8> <i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8>, <8 x i8> <i8 100, i8 100, i8 100, i8 100, i8 100, i8 100, i8 100, i8 100>
define void @prom_bug(<4 x i8> %t, i16* %p) {
; SSE2-LABEL: prom_bug:
; SSE2: ## %bb.0:
-; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
-; SSE2-NEXT: packuswb %xmm0, %xmm0
-; SSE2-NEXT: packuswb %xmm0, %xmm0
-; SSE2-NEXT: pextrw $0, %xmm0, %eax
+; SSE2-NEXT: movd %xmm0, %eax
; SSE2-NEXT: movw %ax, (%rdi)
; SSE2-NEXT: retq
;
; SSE41-LABEL: prom_bug:
; SSE41: ## %bb.0:
-; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u]
; SSE41-NEXT: pextrw $0, %xmm0, (%rdi)
; SSE41-NEXT: retq
%r = bitcast <4 x i8> %t to <2 x i16>
define <2 x i32> @vcast(<2 x float> %a, <2 x float> %b) {
; CHECK-LABEL: vcast:
; CHECK: # %bb.0:
-; CHECK-NEXT: pmovzxdq {{.*#+}} xmm0 = mem[0],zero,mem[1],zero
-; CHECK-NEXT: pmovzxdq {{.*#+}} xmm1 = mem[0],zero,mem[1],zero
-; CHECK-NEXT: psubq %xmm1, %xmm0
+; CHECK-NEXT: movdqa (%rcx), %xmm0
+; CHECK-NEXT: psubd (%rdx), %xmm0
; CHECK-NEXT: retq
%af = bitcast <2 x float> %a to <2 x i32>
%bf = bitcast <2 x float> %b to <2 x i32>
define <4 x i8> @build_vector_again(<16 x i8> %in) nounwind readnone {
; CHECK-LABEL: build_vector_again:
; CHECK: ## %bb.0: ## %entry
-; CHECK-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; CHECK-NEXT: retq
entry:
%out = shufflevector <16 x i8> %in, <16 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
; CHECK-LABEL: load_64:
; CHECK: # %bb.0: # %BB
; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
-; CHECK-NEXT: pmovzxdq {{.*#+}} xmm0 = mem[0],zero,mem[1],zero
+; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
; CHECK-NEXT: retl
BB:
%t = load <2 x i32>, <2 x i32>* %ptr
; X64: # %bb.0: # %entry
; X64-NEXT: pavgusb %mm1, %mm0
; X64-NEXT: movq %mm0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
-; X64-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; X64-NEXT: movaps -{{[0-9]+}}(%rsp), %xmm0
; X64-NEXT: retq
entry:
%0 = bitcast x86_mmx %a.coerce to <8 x i8>
; X64-NEXT: movdq2q %xmm0, %mm0
; X64-NEXT: pf2id %mm0, %mm0
; X64-NEXT: movq %mm0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
-; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3]
+; X64-NEXT: movaps -{{[0-9]+}}(%rsp), %xmm0
; X64-NEXT: retq
entry:
%0 = bitcast <2 x float> %a to x86_mmx
; X64-NEXT: movdq2q %xmm0, %mm1
; X64-NEXT: pfcmpeq %mm0, %mm1
; X64-NEXT: movq %mm1, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
-; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3]
+; X64-NEXT: movaps -{{[0-9]+}}(%rsp), %xmm0
; X64-NEXT: retq
entry:
%0 = bitcast <2 x float> %a to x86_mmx
; X64-NEXT: movdq2q %xmm0, %mm1
; X64-NEXT: pfcmpge %mm0, %mm1
; X64-NEXT: movq %mm1, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
-; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3]
+; X64-NEXT: movaps -{{[0-9]+}}(%rsp), %xmm0
; X64-NEXT: retq
entry:
%0 = bitcast <2 x float> %a to x86_mmx
; X64-NEXT: movdq2q %xmm0, %mm1
; X64-NEXT: pfcmpgt %mm0, %mm1
; X64-NEXT: movq %mm1, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
-; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3]
+; X64-NEXT: movaps -{{[0-9]+}}(%rsp), %xmm0
; X64-NEXT: retq
entry:
%0 = bitcast <2 x float> %a to x86_mmx
; X64: # %bb.0: # %entry
; X64-NEXT: pmulhrw %mm1, %mm0
; X64-NEXT: movq %mm0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
-; X64-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
+; X64-NEXT: movaps -{{[0-9]+}}(%rsp), %xmm0
; X64-NEXT: retq
entry:
%0 = bitcast x86_mmx %a.coerce to <4 x i16>
; X64-NEXT: movdq2q %xmm0, %mm0
; X64-NEXT: pf2iw %mm0, %mm0
; X64-NEXT: movq %mm0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
-; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3]
+; X64-NEXT: movaps -{{[0-9]+}}(%rsp), %xmm0
; X64-NEXT: retq
entry:
%0 = bitcast <2 x float> %a to x86_mmx
;
; X64-LABEL: test_pswapdsi:
; X64: # %bb.0: # %entry
-; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; X64-NEXT: movdq2q %xmm0, %mm0
; X64-NEXT: pswapd %mm0, %mm0 # mm0 = mm0[1,0]
; X64-NEXT: movq %mm0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
-; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3]
+; X64-NEXT: movaps -{{[0-9]+}}(%rsp), %xmm0
; X64-NEXT: retq
entry:
%0 = bitcast <2 x i32> %a to x86_mmx
define <4 x i8> @foo(<4 x i8> %x, <4 x i8> %y) {
; CHECK-LABEL: foo:
; CHECK: ## %bb.0: ## %entry
-; CHECK-NEXT: pmulld %xmm0, %xmm1
-; CHECK-NEXT: paddd %xmm1, %xmm0
+; CHECK-NEXT: pmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
+; CHECK-NEXT: pmovzxbw {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; CHECK-NEXT: pmullw %xmm1, %xmm2
+; CHECK-NEXT: pshufb {{.*#+}} xmm2 = xmm2[0,2,4,6,u,u,u,u,u,u,u,u,u,u,u,u]
+; CHECK-NEXT: paddb %xmm2, %xmm0
; CHECK-NEXT: retq
entry:
%binop = mul <4 x i8> %x, %y
define i8 @foo(<4 x i8>* %V) {
; CHECK-LABEL: foo:
; CHECK: # %bb.0:
-; CHECK-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; CHECK-NEXT: pextrw $1, %xmm0, %eax
+; CHECK-NEXT: movb 2(%rdi), %al
; CHECK-NEXT: andb $95, %al
-; CHECK-NEXT: # kill: def $al killed $al killed $eax
; CHECK-NEXT: retq
%Vp = bitcast <4 x i8>* %V to <3 x i8>*
%V3i8 = load <3 x i8>, <3 x i8>* %Vp, align 4
; CHECK-O0-LABEL: vec_store:
; CHECK-O0: # %bb.0:
; CHECK-O0-NEXT: vmovd %xmm0, %eax
-; CHECK-O0-NEXT: vpextrd $2, %xmm0, %ecx
+; CHECK-O0-NEXT: vpextrd $1, %xmm0, %ecx
; CHECK-O0-NEXT: movl %eax, (%rdi)
; CHECK-O0-NEXT: movl %ecx, 4(%rdi)
; CHECK-O0-NEXT: retq
; CHECK-O3-LABEL: vec_store:
; CHECK-O3: # %bb.0:
; CHECK-O3-NEXT: vmovd %xmm0, %eax
-; CHECK-O3-NEXT: vpextrd $2, %xmm0, %ecx
+; CHECK-O3-NEXT: vpextrd $1, %xmm0, %ecx
; CHECK-O3-NEXT: movl %eax, (%rdi)
; CHECK-O3-NEXT: movl %ecx, 4(%rdi)
; CHECK-O3-NEXT: retq
; CHECK-O0-LABEL: vec_store_unaligned:
; CHECK-O0: # %bb.0:
; CHECK-O0-NEXT: vmovd %xmm0, %eax
-; CHECK-O0-NEXT: vpextrd $2, %xmm0, %ecx
+; CHECK-O0-NEXT: vpextrd $1, %xmm0, %ecx
; CHECK-O0-NEXT: movl %eax, (%rdi)
; CHECK-O0-NEXT: movl %ecx, 4(%rdi)
; CHECK-O0-NEXT: retq
; CHECK-O3-LABEL: vec_store_unaligned:
; CHECK-O3: # %bb.0:
; CHECK-O3-NEXT: vmovd %xmm0, %eax
-; CHECK-O3-NEXT: vpextrd $2, %xmm0, %ecx
+; CHECK-O3-NEXT: vpextrd $1, %xmm0, %ecx
; CHECK-O3-NEXT: movl %eax, (%rdi)
; CHECK-O3-NEXT: movl %ecx, 4(%rdi)
; CHECK-O3-NEXT: retq
; AVX2-LABEL: avg_v48i8:
; AVX2: # %bb.0:
; AVX2-NEXT: vmovdqa (%rdi), %xmm0
-; AVX2-NEXT: vmovdqa 32(%rdi), %xmm1
-; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
-; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero,xmm2[4],zero,zero,zero,xmm2[5],zero,zero,zero,xmm2[6],zero,zero,zero,xmm2[7],zero,zero,zero
-; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
-; AVX2-NEXT: vpbroadcastq 24(%rdi), %xmm3
+; AVX2-NEXT: vmovdqa 16(%rdi), %xmm1
+; AVX2-NEXT: vmovdqa 32(%rdi), %xmm2
+; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[2,3,0,1]
; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero,xmm3[4],zero,zero,zero,xmm3[5],zero,zero,zero,xmm3[6],zero,zero,zero,xmm3[7],zero,zero,zero
-; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
-; AVX2-NEXT: vpshufd {{.*#+}} xmm5 = xmm1[2,3,0,1]
-; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm5 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero,xmm5[2],zero,zero,zero,xmm5[3],zero,zero,zero,xmm5[4],zero,zero,zero,xmm5[5],zero,zero,zero,xmm5[6],zero,zero,zero,xmm5[7],zero,zero,zero
-; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm8 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero
-; AVX2-NEXT: vmovdqa (%rsi), %xmm6
-; AVX2-NEXT: vmovdqa 32(%rsi), %xmm7
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm6[2,3,0,1]
-; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero
-; AVX2-NEXT: vpaddd %ymm1, %ymm2, %ymm1
-; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm2 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero,xmm6[2],zero,zero,zero,xmm6[3],zero,zero,zero,xmm6[4],zero,zero,zero,xmm6[5],zero,zero,zero,xmm6[6],zero,zero,zero,xmm6[7],zero,zero,zero
-; AVX2-NEXT: vpaddd %ymm2, %ymm0, %ymm0
-; AVX2-NEXT: vpbroadcastq 24(%rsi), %xmm2
-; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero,xmm2[4],zero,zero,zero,xmm2[5],zero,zero,zero,xmm2[6],zero,zero,zero,xmm2[7],zero,zero,zero
-; AVX2-NEXT: vpaddd %ymm2, %ymm3, %ymm2
-; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm3 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
-; AVX2-NEXT: vpaddd %ymm3, %ymm4, %ymm3
-; AVX2-NEXT: vpshufd {{.*#+}} xmm4 = xmm7[2,3,0,1]
+; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
+; AVX2-NEXT: vpshufd {{.*#+}} xmm4 = xmm1[2,3,0,1]
; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero,xmm4[4],zero,zero,zero,xmm4[5],zero,zero,zero,xmm4[6],zero,zero,zero,xmm4[7],zero,zero,zero
-; AVX2-NEXT: vpaddd %ymm4, %ymm5, %ymm4
+; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero
+; AVX2-NEXT: vpshufd {{.*#+}} xmm5 = xmm2[2,3,0,1]
+; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm9 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero,xmm5[2],zero,zero,zero,xmm5[3],zero,zero,zero,xmm5[4],zero,zero,zero,xmm5[5],zero,zero,zero,xmm5[6],zero,zero,zero,xmm5[7],zero,zero,zero
+; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm8 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero,xmm2[4],zero,zero,zero,xmm2[5],zero,zero,zero,xmm2[6],zero,zero,zero,xmm2[7],zero,zero,zero
+; AVX2-NEXT: vmovdqa (%rsi), %xmm6
+; AVX2-NEXT: vmovdqa 16(%rsi), %xmm7
+; AVX2-NEXT: vmovdqa 32(%rsi), %xmm2
+; AVX2-NEXT: vpshufd {{.*#+}} xmm5 = xmm6[2,3,0,1]
+; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm5 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero,xmm5[2],zero,zero,zero,xmm5[3],zero,zero,zero,xmm5[4],zero,zero,zero,xmm5[5],zero,zero,zero,xmm5[6],zero,zero,zero,xmm5[7],zero,zero,zero
+; AVX2-NEXT: vpaddd %ymm5, %ymm3, %ymm3
+; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm5 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero,xmm6[2],zero,zero,zero,xmm6[3],zero,zero,zero,xmm6[4],zero,zero,zero,xmm6[5],zero,zero,zero,xmm6[6],zero,zero,zero,xmm6[7],zero,zero,zero
+; AVX2-NEXT: vpaddd %ymm5, %ymm0, %ymm0
+; AVX2-NEXT: vpshufd {{.*#+}} xmm5 = xmm7[2,3,0,1]
+; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm5 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero,xmm5[2],zero,zero,zero,xmm5[3],zero,zero,zero,xmm5[4],zero,zero,zero,xmm5[5],zero,zero,zero,xmm5[6],zero,zero,zero,xmm5[7],zero,zero,zero
+; AVX2-NEXT: vpaddd %ymm5, %ymm4, %ymm4
; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm5 = xmm7[0],zero,zero,zero,xmm7[1],zero,zero,zero,xmm7[2],zero,zero,zero,xmm7[3],zero,zero,zero,xmm7[4],zero,zero,zero,xmm7[5],zero,zero,zero,xmm7[6],zero,zero,zero,xmm7[7],zero,zero,zero
-; AVX2-NEXT: vpaddd %ymm5, %ymm8, %ymm5
+; AVX2-NEXT: vpaddd %ymm5, %ymm1, %ymm1
+; AVX2-NEXT: vpshufd {{.*#+}} xmm5 = xmm2[2,3,0,1]
+; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm5 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero,xmm5[2],zero,zero,zero,xmm5[3],zero,zero,zero,xmm5[4],zero,zero,zero,xmm5[5],zero,zero,zero,xmm5[6],zero,zero,zero,xmm5[7],zero,zero,zero
+; AVX2-NEXT: vpaddd %ymm5, %ymm9, %ymm5
+; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero,xmm2[4],zero,zero,zero,xmm2[5],zero,zero,zero,xmm2[6],zero,zero,zero,xmm2[7],zero,zero,zero
+; AVX2-NEXT: vpaddd %ymm2, %ymm8, %ymm2
; AVX2-NEXT: vpcmpeqd %ymm6, %ymm6, %ymm6
-; AVX2-NEXT: vpsubd %ymm6, %ymm1, %ymm1
-; AVX2-NEXT: vpsubd %ymm6, %ymm0, %ymm0
-; AVX2-NEXT: vpsubd %ymm6, %ymm2, %ymm2
; AVX2-NEXT: vpsubd %ymm6, %ymm3, %ymm3
+; AVX2-NEXT: vpsubd %ymm6, %ymm0, %ymm0
; AVX2-NEXT: vpsubd %ymm6, %ymm4, %ymm4
+; AVX2-NEXT: vpsubd %ymm6, %ymm1, %ymm1
; AVX2-NEXT: vpsubd %ymm6, %ymm5, %ymm5
+; AVX2-NEXT: vpsubd %ymm6, %ymm2, %ymm2
+; AVX2-NEXT: vpsrld $1, %ymm2, %ymm2
; AVX2-NEXT: vpsrld $1, %ymm5, %ymm5
+; AVX2-NEXT: vpsrld $1, %ymm1, %ymm1
; AVX2-NEXT: vpsrld $1, %ymm4, %ymm4
-; AVX2-NEXT: vpsrld $1, %ymm3, %ymm3
-; AVX2-NEXT: vpsrld $1, %ymm2, %ymm2
; AVX2-NEXT: vpsrld $1, %ymm0, %ymm0
-; AVX2-NEXT: vpsrld $1, %ymm1, %ymm1
-; AVX2-NEXT: vperm2i128 {{.*#+}} ymm6 = ymm0[2,3],ymm1[2,3]
-; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-NEXT: vpsrld $1, %ymm3, %ymm3
+; AVX2-NEXT: vperm2i128 {{.*#+}} ymm6 = ymm0[2,3],ymm3[2,3]
+; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm0
; AVX2-NEXT: vpackusdw %ymm6, %ymm0, %ymm0
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
-; AVX2-NEXT: vperm2i128 {{.*#+}} ymm6 = ymm3[2,3],ymm2[2,3]
-; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm3, %ymm2
-; AVX2-NEXT: vpackusdw %ymm6, %ymm2, %ymm2
-; AVX2-NEXT: vpand %ymm1, %ymm2, %ymm2
-; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm3
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX2-NEXT: vpand %ymm3, %ymm0, %ymm0
+; AVX2-NEXT: vperm2i128 {{.*#+}} ymm6 = ymm1[2,3],ymm4[2,3]
+; AVX2-NEXT: vinserti128 $1, %xmm4, %ymm1, %ymm1
+; AVX2-NEXT: vpackusdw %ymm6, %ymm1, %ymm1
+; AVX2-NEXT: vpand %ymm3, %ymm1, %ymm1
+; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm4
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
-; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
-; AVX2-NEXT: vpackuswb %ymm0, %ymm3, %ymm0
-; AVX2-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm5[2,3],ymm4[2,3]
-; AVX2-NEXT: vinserti128 $1, %xmm4, %ymm5, %ymm3
-; AVX2-NEXT: vpackusdw %ymm2, %ymm3, %ymm2
-; AVX2-NEXT: vpand %ymm1, %ymm2, %ymm1
+; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; AVX2-NEXT: vpackuswb %ymm0, %ymm4, %ymm0
+; AVX2-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm2[2,3],ymm5[2,3]
+; AVX2-NEXT: vinserti128 $1, %xmm5, %ymm2, %ymm2
+; AVX2-NEXT: vpackusdw %ymm1, %ymm2, %ymm1
+; AVX2-NEXT: vpand %ymm3, %ymm1, %ymm1
; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
; AVX2-NEXT: vpackuswb %xmm2, %xmm1, %xmm1
; AVX2-NEXT: vmovdqu %xmm1, (%rax)
; SSE2-NEXT: pushq %r13
; SSE2-NEXT: pushq %r12
; SSE2-NEXT: pushq %rbx
-; SSE2-NEXT: movaps (%rdi), %xmm0
-; SSE2-NEXT: movaps (%rsi), %xmm1
-; SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps (%rdi), %xmm1
+; SSE2-NEXT: movaps (%rsi), %xmm0
+; SSE2-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
; SSE2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %r13d
; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
; SSE2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
; SSE2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %r14d
-; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %r15d
+; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; SSE2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; SSE2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %r13d
; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %r12d
+; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %r15d
; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %r11d
; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %r10d
; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %r9d
+; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %r8d
+; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %edx
; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %ecx
; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
-; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %edi
+; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %esi
+; SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %ebp
+; SSE2-NEXT: addq %r11, %rbp
+; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %r14d
+; SSE2-NEXT: addq %r10, %r14
; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %ebx
+; SSE2-NEXT: addq %r9, %rbx
+; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %r11d
+; SSE2-NEXT: addq %r8, %r11
+; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %r10d
+; SSE2-NEXT: addq %rdx, %r10
+; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %r8d
+; SSE2-NEXT: addq %rcx, %r8
+; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %edi
+; SSE2-NEXT: addq %rax, %rdi
; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %edx
-; SSE2-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; SSE2-NEXT: addq %rsi, %rdx
; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %esi
-; SSE2-NEXT: leal -1(%rdx,%rsi), %edx
-; SSE2-NEXT: movl %edx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %edx
-; SSE2-NEXT: leal -1(%rbx,%rdx), %edx
-; SSE2-NEXT: movl %edx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %edx
-; SSE2-NEXT: leal -1(%rbp,%rdx), %edx
-; SSE2-NEXT: movl %edx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %edx
-; SSE2-NEXT: leal -1(%rdi,%rdx), %r8d
-; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %edx
-; SSE2-NEXT: leal -1(%rax,%rdx), %edi
-; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
-; SSE2-NEXT: leal -1(%rcx,%rax), %edx
-; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
-; SSE2-NEXT: leal -1(%r9,%rax), %ecx
+; SSE2-NEXT: leaq -1(%r15,%rsi), %rax
+; SSE2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %esi
-; SSE2-NEXT: leal -1(%r10,%rsi), %eax
+; SSE2-NEXT: leaq -1(%r12,%rsi), %rax
+; SSE2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %esi
-; SSE2-NEXT: leaq -1(%r11,%rsi), %rsi
-; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %ebx
-; SSE2-NEXT: leaq -1(%r12,%rbx), %r12
-; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %ebx
-; SSE2-NEXT: leaq -1(%r15,%rbx), %r15
-; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %ebx
-; SSE2-NEXT: leaq -1(%r14,%rbx), %r14
-; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %ebx
-; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Reload
-; SSE2-NEXT: leaq -1(%rbp,%rbx), %r11
-; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %ebx
-; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Reload
-; SSE2-NEXT: leaq -1(%rbp,%rbx), %r10
-; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %ebx
-; SSE2-NEXT: leaq -1(%r13,%rbx), %r9
-; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %ebx
-; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload
-; SSE2-NEXT: leaq -1(%r13,%rbx), %rbx
-; SSE2-NEXT: shrl %eax
-; SSE2-NEXT: movd %eax, %xmm8
-; SSE2-NEXT: shrl %ecx
-; SSE2-NEXT: movd %ecx, %xmm15
-; SSE2-NEXT: shrl %edx
-; SSE2-NEXT: movd %edx, %xmm9
-; SSE2-NEXT: shrl %edi
-; SSE2-NEXT: movd %edi, %xmm2
-; SSE2-NEXT: shrl %r8d
-; SSE2-NEXT: movd %r8d, %xmm10
-; SSE2-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload
-; SSE2-NEXT: shrl %eax
-; SSE2-NEXT: movd %eax, %xmm6
-; SSE2-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload
-; SSE2-NEXT: shrl %eax
-; SSE2-NEXT: movd %eax, %xmm11
-; SSE2-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload
-; SSE2-NEXT: shrl %eax
-; SSE2-NEXT: movd %eax, %xmm4
-; SSE2-NEXT: shrq %rsi
-; SSE2-NEXT: movd %esi, %xmm12
-; SSE2-NEXT: shrq %r12
-; SSE2-NEXT: movd %r12d, %xmm3
-; SSE2-NEXT: shrq %r15
-; SSE2-NEXT: movd %r15d, %xmm13
-; SSE2-NEXT: shrq %r14
-; SSE2-NEXT: movd %r14d, %xmm7
-; SSE2-NEXT: shrq %r11
-; SSE2-NEXT: movd %r11d, %xmm14
-; SSE2-NEXT: shrq %r10
-; SSE2-NEXT: movd %r10d, %xmm5
-; SSE2-NEXT: shrq %r9
-; SSE2-NEXT: movd %r9d, %xmm0
-; SSE2-NEXT: shrq %rbx
-; SSE2-NEXT: movd %ebx, %xmm1
+; SSE2-NEXT: leaq -1(%r13,%rsi), %rax
+; SSE2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %esi
+; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; SSE2-NEXT: leaq -1(%rax,%rsi), %rax
+; SSE2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %esi
+; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; SSE2-NEXT: leaq -1(%rax,%rsi), %rax
+; SSE2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %esi
+; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; SSE2-NEXT: leaq -1(%rax,%rsi), %rax
+; SSE2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %esi
+; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; SSE2-NEXT: leaq -1(%rax,%rsi), %rsi
+; SSE2-NEXT: movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %esi
+; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; SSE2-NEXT: leaq -1(%rax,%rsi), %rsi
+; SSE2-NEXT: movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE2-NEXT: addq $-1, %rbp
+; SSE2-NEXT: movl $0, %r9d
+; SSE2-NEXT: adcq $-1, %r9
+; SSE2-NEXT: addq $-1, %r14
+; SSE2-NEXT: movl $0, %esi
+; SSE2-NEXT: adcq $-1, %rsi
+; SSE2-NEXT: addq $-1, %rbx
+; SSE2-NEXT: movl $0, %eax
+; SSE2-NEXT: adcq $-1, %rax
+; SSE2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE2-NEXT: addq $-1, %r11
+; SSE2-NEXT: movl $0, %r12d
+; SSE2-NEXT: adcq $-1, %r12
+; SSE2-NEXT: addq $-1, %r10
+; SSE2-NEXT: movl $0, %r13d
+; SSE2-NEXT: adcq $-1, %r13
+; SSE2-NEXT: addq $-1, %r8
+; SSE2-NEXT: movl $0, %r15d
+; SSE2-NEXT: adcq $-1, %r15
+; SSE2-NEXT: addq $-1, %rdi
+; SSE2-NEXT: movl $0, %ecx
+; SSE2-NEXT: adcq $-1, %rcx
+; SSE2-NEXT: addq $-1, %rdx
+; SSE2-NEXT: movl $0, %eax
+; SSE2-NEXT: adcq $-1, %rax
+; SSE2-NEXT: shldq $63, %rdx, %rax
+; SSE2-NEXT: shldq $63, %rdi, %rcx
+; SSE2-NEXT: movq %rcx, %rdx
+; SSE2-NEXT: shldq $63, %r8, %r15
+; SSE2-NEXT: shldq $63, %r10, %r13
+; SSE2-NEXT: shldq $63, %r11, %r12
+; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload
+; SSE2-NEXT: shldq $63, %rbx, %rdi
+; SSE2-NEXT: shldq $63, %r14, %rsi
+; SSE2-NEXT: shldq $63, %rbp, %r9
+; SSE2-NEXT: movq %r9, %xmm8
+; SSE2-NEXT: movq %rsi, %xmm15
+; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
+; SSE2-NEXT: shrq %rcx
+; SSE2-NEXT: movq %rcx, %xmm9
+; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
+; SSE2-NEXT: shrq %rcx
+; SSE2-NEXT: movq %rcx, %xmm2
+; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
+; SSE2-NEXT: shrq %rcx
+; SSE2-NEXT: movq %rcx, %xmm10
+; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
+; SSE2-NEXT: shrq %rcx
+; SSE2-NEXT: movq %rcx, %xmm4
+; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
+; SSE2-NEXT: shrq %rcx
+; SSE2-NEXT: movq %rcx, %xmm11
+; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
+; SSE2-NEXT: shrq %rcx
+; SSE2-NEXT: movq %rcx, %xmm7
+; SSE2-NEXT: movq %rdi, %xmm12
+; SSE2-NEXT: movq %r12, %xmm0
+; SSE2-NEXT: movq %r13, %xmm13
+; SSE2-NEXT: movq %r15, %xmm6
+; SSE2-NEXT: movq %rdx, %xmm14
+; SSE2-NEXT: movq %rax, %xmm5
+; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; SSE2-NEXT: shrq %rax
+; SSE2-NEXT: movq %rax, %xmm3
+; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; SSE2-NEXT: shrq %rax
+; SSE2-NEXT: movq %rax, %xmm1
; SSE2-NEXT: punpcklbw {{.*#+}} xmm15 = xmm15[0],xmm8[0],xmm15[1],xmm8[1],xmm15[2],xmm8[2],xmm15[3],xmm8[3],xmm15[4],xmm8[4],xmm15[5],xmm8[5],xmm15[6],xmm8[6],xmm15[7],xmm8[7]
; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm9[0],xmm2[1],xmm9[1],xmm2[2],xmm9[2],xmm2[3],xmm9[3],xmm2[4],xmm9[4],xmm2[5],xmm9[5],xmm2[6],xmm9[6],xmm2[7],xmm9[7]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm15[0],xmm2[1],xmm15[1],xmm2[2],xmm15[2],xmm2[3],xmm15[3]
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm10[0],xmm6[1],xmm10[1],xmm6[2],xmm10[2],xmm6[3],xmm10[3],xmm6[4],xmm10[4],xmm6[5],xmm10[5],xmm6[6],xmm10[6],xmm6[7],xmm10[7]
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm11[0],xmm4[1],xmm11[1],xmm4[2],xmm11[2],xmm4[3],xmm11[3],xmm4[4],xmm11[4],xmm4[5],xmm11[5],xmm4[6],xmm11[6],xmm4[7],xmm11[7]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1],xmm4[2],xmm6[2],xmm4[3],xmm6[3]
-; SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1]
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm12[0],xmm3[1],xmm12[1],xmm3[2],xmm12[2],xmm3[3],xmm12[3],xmm3[4],xmm12[4],xmm3[5],xmm12[5],xmm3[6],xmm12[6],xmm3[7],xmm12[7]
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm13[0],xmm7[1],xmm13[1],xmm7[2],xmm13[2],xmm7[3],xmm13[3],xmm7[4],xmm13[4],xmm7[5],xmm13[5],xmm7[6],xmm13[6],xmm7[7],xmm13[7]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm3[0],xmm7[1],xmm3[1],xmm7[2],xmm3[2],xmm7[3],xmm3[3]
+; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm15[0,1,2,0]
+; SSE2-NEXT: pand {{.*}}(%rip), %xmm8
+; SSE2-NEXT: pslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1]
+; SSE2-NEXT: por %xmm8, %xmm2
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm10[0],xmm4[1],xmm10[1],xmm4[2],xmm10[2],xmm4[3],xmm10[3],xmm4[4],xmm10[4],xmm4[5],xmm10[5],xmm4[6],xmm10[6],xmm4[7],xmm10[7]
+; SSE2-NEXT: pslldq {{.*#+}} xmm4 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm4[0,1,2,3,4,5]
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm11[0],xmm7[1],xmm11[1],xmm7[2],xmm11[2],xmm7[3],xmm11[3],xmm7[4],xmm11[4],xmm7[5],xmm11[5],xmm7[6],xmm11[6],xmm7[7],xmm11[7]
+; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [65535,65535,65535,65535,65535,0,65535,65535]
+; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm7[0,1,0,1]
+; SSE2-NEXT: pand %xmm8, %xmm7
+; SSE2-NEXT: pandn %xmm4, %xmm8
+; SSE2-NEXT: por %xmm7, %xmm8
+; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm8[0,1,2,2]
+; SSE2-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm2[2],xmm4[3],xmm2[3]
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm12[0],xmm0[1],xmm12[1],xmm0[2],xmm12[2],xmm0[3],xmm12[3],xmm0[4],xmm12[4],xmm0[5],xmm12[5],xmm0[6],xmm12[6],xmm0[7],xmm12[7]
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm13[0],xmm6[1],xmm13[1],xmm6[2],xmm13[2],xmm6[3],xmm13[3],xmm6[4],xmm13[4],xmm6[5],xmm13[5],xmm6[6],xmm13[6],xmm6[7],xmm13[7]
+; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [65535,0,65535,65535,65535,65535,65535,65535]
+; SSE2-NEXT: pand %xmm2, %xmm0
+; SSE2-NEXT: pslld $16, %xmm6
+; SSE2-NEXT: pandn %xmm6, %xmm2
+; SSE2-NEXT: por %xmm0, %xmm2
; SSE2-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm14[0],xmm5[1],xmm14[1],xmm5[2],xmm14[2],xmm5[3],xmm14[3],xmm5[4],xmm14[4],xmm5[5],xmm14[5],xmm5[6],xmm14[6],xmm5[7],xmm14[7]
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1],xmm1[2],xmm5[2],xmm1[3],xmm5[3]
-; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm7[0],xmm1[1],xmm7[1]
-; SSE2-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm1[0]
-; SSE2-NEXT: movdqu %xmm4, (%rax)
+; SSE2-NEXT: psllq $48, %xmm5
+; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [65535,65535,65535,0,65535,65535,65535,65535]
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3],xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7]
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1]
+; SSE2-NEXT: pand %xmm0, %xmm1
+; SSE2-NEXT: pandn %xmm5, %xmm0
+; SSE2-NEXT: por %xmm1, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
+; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
+; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,3]
+; SSE2-NEXT: movups %xmm2, (%rax)
; SSE2-NEXT: popq %rbx
; SSE2-NEXT: popq %r12
; SSE2-NEXT: popq %r13
; AVX1-NEXT: pushq %r13
; AVX1-NEXT: pushq %r12
; AVX1-NEXT: pushq %rbx
-; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
-; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm3 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
+; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
-; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm5 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
-; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
-; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
-; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm7 = xmm4[2],xmm2[2],xmm4[3],xmm2[3]
-; AVX1-NEXT: vpextrq $1, %xmm7, %r15
-; AVX1-NEXT: vmovq %xmm7, %r14
-; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero
-; AVX1-NEXT: vpextrq $1, %xmm4, %r11
-; AVX1-NEXT: vmovq %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
+; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
+; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
+; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm5 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm5[2],xmm3[2],xmm5[3],xmm3[3]
+; AVX1-NEXT: vmovq %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
+; AVX1-NEXT: vpextrq $1, %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
+; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7]
+; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm6 = xmm5[2],xmm3[2],xmm5[3],xmm3[3]
+; AVX1-NEXT: vmovq %xmm6, %r10
+; AVX1-NEXT: vpextrq $1, %xmm6, %r9
+; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7]
+; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm7 = xmm6[0],zero,xmm6[1],zero
+; AVX1-NEXT: vmovq %xmm7, %r8
+; AVX1-NEXT: vpextrq $1, %xmm7, %rdi
+; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm6 = xmm6[2],xmm3[2],xmm6[3],xmm3[3]
+; AVX1-NEXT: vpextrq $1, %xmm6, %rcx
+; AVX1-NEXT: vmovq %xmm6, %r14
+; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm6 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero
+; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm6 = xmm6[2],xmm3[2],xmm6[3],xmm3[3]
+; AVX1-NEXT: vpextrq $1, %xmm6, %rax
+; AVX1-NEXT: vmovq %xmm6, %rbp
+; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm5 = xmm5[0],zero,xmm5[1],zero
+; AVX1-NEXT: vpextrq $1, %xmm5, %r11
+; AVX1-NEXT: vmovq %xmm5, %r15
+; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm8 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
+; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero
+; AVX1-NEXT: vpextrq $1, %xmm4, %rbx
+; AVX1-NEXT: vmovq %xmm4, %rdx
; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm4 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
-; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm4 = xmm4[2],xmm2[2],xmm4[3],xmm2[3]
-; AVX1-NEXT: vpextrq $1, %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
-; AVX1-NEXT: vmovq %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
-; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm4 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero
-; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm7 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero
-; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm8 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
-; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm5[4],xmm2[4],xmm5[5],xmm2[5],xmm5[6],xmm2[6],xmm5[7],xmm2[7]
-; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
-; AVX1-NEXT: vmovd %xmm6, %ecx
-; AVX1-NEXT: vpextrd $1, %xmm6, %edx
-; AVX1-NEXT: vpextrd $2, %xmm6, %r13d
-; AVX1-NEXT: vpextrd $3, %xmm6, %r12d
-; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm6 = xmm3[2],xmm2[2],xmm3[3],xmm2[3]
-; AVX1-NEXT: vmovd %xmm1, %ebx
-; AVX1-NEXT: vpextrd $1, %xmm1, %ebp
-; AVX1-NEXT: vpextrd $2, %xmm1, %esi
-; AVX1-NEXT: vpextrd $3, %xmm1, %edi
-; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero
-; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm5 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero
-; AVX1-NEXT: vmovd %xmm7, %r8d
-; AVX1-NEXT: leal -1(%r12,%rdi), %eax
-; AVX1-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; AVX1-NEXT: vpextrd $2, %xmm7, %eax
-; AVX1-NEXT: leal -1(%r13,%rsi), %esi
-; AVX1-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; AVX1-NEXT: vpextrd $2, %xmm4, %edi
-; AVX1-NEXT: leal -1(%rdx,%rbp), %edx
-; AVX1-NEXT: movl %edx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; AVX1-NEXT: vpextrd $3, %xmm4, %edx
-; AVX1-NEXT: leal -1(%rcx,%rbx), %r10d
-; AVX1-NEXT: vpextrd $3, %xmm1, %ecx
-; AVX1-NEXT: leal -1(%rdx,%rcx), %r9d
-; AVX1-NEXT: vpextrd $2, %xmm1, %ecx
-; AVX1-NEXT: leal -1(%rdi,%rcx), %edi
-; AVX1-NEXT: vpextrd $2, %xmm5, %ecx
-; AVX1-NEXT: leal -1(%rax,%rcx), %eax
-; AVX1-NEXT: vmovd %xmm5, %ecx
-; AVX1-NEXT: leal -1(%r8,%rcx), %r8d
-; AVX1-NEXT: vpextrq $1, %xmm6, %rdx
-; AVX1-NEXT: leal -1(%r15,%rdx), %r15d
-; AVX1-NEXT: vmovq %xmm6, %rdx
-; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm3[0],zero,xmm3[1],zero
-; AVX1-NEXT: leal -1(%r14,%rdx), %r14d
-; AVX1-NEXT: vpextrq $1, %xmm1, %rdx
-; AVX1-NEXT: leal -1(%r11,%rdx), %edx
-; AVX1-NEXT: vmovq %xmm1, %rcx
-; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3]
-; AVX1-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
-; AVX1-NEXT: leal -1(%rsi,%rcx), %ecx
-; AVX1-NEXT: vpextrq $1, %xmm1, %rsi
-; AVX1-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Reload
-; AVX1-NEXT: leal -1(%rbp,%rsi), %esi
-; AVX1-NEXT: vmovq %xmm1, %rbx
-; AVX1-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Reload
-; AVX1-NEXT: leal -1(%rbp,%rbx), %ebx
-; AVX1-NEXT: vpextrq $1, %xmm8, %r11
-; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
-; AVX1-NEXT: vpextrq $1, %xmm0, %r12
-; AVX1-NEXT: leal -1(%r11,%r12), %r11d
-; AVX1-NEXT: vmovq %xmm8, %r12
+; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm4 = xmm4[2],xmm3[2],xmm4[3],xmm3[3]
+; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7]
+; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm7[2],xmm3[2],xmm7[3],xmm3[3]
+; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7]
+; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm6 = xmm0[0],zero,xmm0[1],zero
+; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm3[2],xmm0[3],xmm3[3]
+; AVX1-NEXT: vpextrq $1, %xmm0, %rsi
+; AVX1-NEXT: addq %rcx, %rsi
; AVX1-NEXT: vmovq %xmm0, %r13
-; AVX1-NEXT: leal -1(%r12,%r13), %ebp
-; AVX1-NEXT: shrl %ebp
-; AVX1-NEXT: vmovd %ebp, %xmm0
-; AVX1-NEXT: shrl %r11d
-; AVX1-NEXT: vpinsrb $1, %r11d, %xmm0, %xmm0
-; AVX1-NEXT: shrl %ebx
-; AVX1-NEXT: vpinsrb $2, %ebx, %xmm0, %xmm0
-; AVX1-NEXT: shrl %esi
-; AVX1-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
-; AVX1-NEXT: shrl %ecx
-; AVX1-NEXT: vpinsrb $4, %ecx, %xmm0, %xmm0
-; AVX1-NEXT: shrl %edx
-; AVX1-NEXT: vpinsrb $5, %edx, %xmm0, %xmm0
-; AVX1-NEXT: shrl %r14d
-; AVX1-NEXT: vpinsrb $6, %r14d, %xmm0, %xmm0
-; AVX1-NEXT: shrl %r15d
-; AVX1-NEXT: vpinsrb $7, %r15d, %xmm0, %xmm0
-; AVX1-NEXT: shrl %r8d
-; AVX1-NEXT: vpinsrb $8, %r8d, %xmm0, %xmm0
-; AVX1-NEXT: shrl %eax
-; AVX1-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0
-; AVX1-NEXT: shrl %edi
-; AVX1-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
-; AVX1-NEXT: shrl %r9d
-; AVX1-NEXT: vpinsrb $11, %r9d, %xmm0, %xmm0
-; AVX1-NEXT: shrl %r10d
-; AVX1-NEXT: vpinsrb $12, %r10d, %xmm0, %xmm0
-; AVX1-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload
-; AVX1-NEXT: shrl %eax
-; AVX1-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0
-; AVX1-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload
-; AVX1-NEXT: shrl %eax
-; AVX1-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0
-; AVX1-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload
-; AVX1-NEXT: shrl %eax
-; AVX1-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; AVX1-NEXT: addq %r14, %r13
+; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
+; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm3[2],xmm0[3],xmm3[3]
+; AVX1-NEXT: vpextrq $1, %xmm0, %r12
+; AVX1-NEXT: addq %rax, %r12
+; AVX1-NEXT: vmovq %xmm0, %r14
+; AVX1-NEXT: addq %rbp, %r14
+; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm7[0],zero,xmm7[1],zero
+; AVX1-NEXT: vpextrq $1, %xmm0, %rbp
+; AVX1-NEXT: addq %r11, %rbp
+; AVX1-NEXT: vmovq %xmm0, %r11
+; AVX1-NEXT: addq %r15, %r11
+; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
+; AVX1-NEXT: vpextrq $1, %xmm0, %r15
+; AVX1-NEXT: addq %rbx, %r15
+; AVX1-NEXT: vmovq %xmm0, %rbx
+; AVX1-NEXT: addq %rdx, %rbx
+; AVX1-NEXT: vpextrq $1, %xmm6, %rax
+; AVX1-NEXT: leaq -1(%rdi,%rax), %rax
+; AVX1-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX1-NEXT: vmovq %xmm6, %rax
+; AVX1-NEXT: leaq -1(%r8,%rax), %rax
+; AVX1-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX1-NEXT: vpextrq $1, %xmm5, %rax
+; AVX1-NEXT: leaq -1(%r9,%rax), %rax
+; AVX1-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX1-NEXT: vmovq %xmm5, %rax
+; AVX1-NEXT: leaq -1(%r10,%rax), %rax
+; AVX1-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX1-NEXT: vpextrq $1, %xmm4, %rax
+; AVX1-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
+; AVX1-NEXT: leaq -1(%rcx,%rax), %rax
+; AVX1-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX1-NEXT: vmovq %xmm4, %rax
+; AVX1-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
+; AVX1-NEXT: leaq -1(%rcx,%rax), %rax
+; AVX1-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX1-NEXT: vpextrq $1, %xmm8, %rax
+; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
+; AVX1-NEXT: vpextrq $1, %xmm0, %rcx
+; AVX1-NEXT: leaq -1(%rax,%rcx), %rax
+; AVX1-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX1-NEXT: vmovq %xmm8, %rax
+; AVX1-NEXT: vmovq %xmm0, %rcx
+; AVX1-NEXT: leaq -1(%rax,%rcx), %rax
+; AVX1-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX1-NEXT: xorl %r10d, %r10d
+; AVX1-NEXT: addq $-1, %rsi
+; AVX1-NEXT: movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX1-NEXT: movl $0, %ecx
+; AVX1-NEXT: adcq $-1, %rcx
+; AVX1-NEXT: addq $-1, %r13
+; AVX1-NEXT: movl $0, %eax
+; AVX1-NEXT: adcq $-1, %rax
+; AVX1-NEXT: addq $-1, %r12
+; AVX1-NEXT: movl $0, %edi
+; AVX1-NEXT: adcq $-1, %rdi
+; AVX1-NEXT: addq $-1, %r14
+; AVX1-NEXT: movl $0, %esi
+; AVX1-NEXT: adcq $-1, %rsi
+; AVX1-NEXT: addq $-1, %rbp
+; AVX1-NEXT: movl $0, %r9d
+; AVX1-NEXT: adcq $-1, %r9
+; AVX1-NEXT: addq $-1, %r11
+; AVX1-NEXT: movl $0, %r8d
+; AVX1-NEXT: adcq $-1, %r8
+; AVX1-NEXT: addq $-1, %r15
+; AVX1-NEXT: movl $0, %edx
+; AVX1-NEXT: adcq $-1, %rdx
+; AVX1-NEXT: addq $-1, %rbx
+; AVX1-NEXT: adcq $-1, %r10
+; AVX1-NEXT: shldq $63, %r11, %r8
+; AVX1-NEXT: shldq $63, %rbp, %r9
+; AVX1-NEXT: shldq $63, %r14, %rsi
+; AVX1-NEXT: shldq $63, %r12, %rdi
+; AVX1-NEXT: shldq $63, %r13, %rax
+; AVX1-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Reload
+; AVX1-NEXT: shldq $63, %rbp, %rcx
+; AVX1-NEXT: shldq $63, %rbx, %r10
+; AVX1-NEXT: shldq $63, %r15, %rdx
+; AVX1-NEXT: vmovq %rcx, %xmm8
+; AVX1-NEXT: vmovq %rax, %xmm9
+; AVX1-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; AVX1-NEXT: shrq %rax
+; AVX1-NEXT: vmovq %rax, %xmm0
+; AVX1-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; AVX1-NEXT: shrq %rax
+; AVX1-NEXT: vmovq %rax, %xmm11
+; AVX1-NEXT: vmovq %rdi, %xmm12
+; AVX1-NEXT: vmovq %rsi, %xmm13
+; AVX1-NEXT: vmovq %rdx, %xmm14
+; AVX1-NEXT: vmovq %r10, %xmm15
+; AVX1-NEXT: vmovq %r9, %xmm10
+; AVX1-NEXT: vmovq %r8, %xmm1
+; AVX1-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; AVX1-NEXT: shrq %rax
+; AVX1-NEXT: vmovq %rax, %xmm2
+; AVX1-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; AVX1-NEXT: shrq %rax
+; AVX1-NEXT: vmovq %rax, %xmm3
+; AVX1-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; AVX1-NEXT: shrq %rax
+; AVX1-NEXT: vmovq %rax, %xmm4
+; AVX1-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; AVX1-NEXT: shrq %rax
+; AVX1-NEXT: vmovq %rax, %xmm5
+; AVX1-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; AVX1-NEXT: shrq %rax
+; AVX1-NEXT: vmovq %rax, %xmm6
+; AVX1-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; AVX1-NEXT: shrq %rax
+; AVX1-NEXT: vmovq %rax, %xmm7
+; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3],xmm9[4],xmm8[4],xmm9[5],xmm8[5],xmm9[6],xmm8[6],xmm9[7],xmm8[7]
+; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm9 = xmm11[0],xmm0[0],xmm11[1],xmm0[1],xmm11[2],xmm0[2],xmm11[3],xmm0[3],xmm11[4],xmm0[4],xmm11[5],xmm0[5],xmm11[6],xmm0[6],xmm11[7],xmm0[7]
+; AVX1-NEXT: vpsllq $48, %xmm8, %xmm8
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm9[0,0,1,1]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm8 = xmm0[0,1,2],xmm8[3],xmm0[4,5,6,7]
+; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3],xmm13[4],xmm12[4],xmm13[5],xmm12[5],xmm13[6],xmm12[6],xmm13[7],xmm12[7]
+; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm9 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3],xmm15[4],xmm14[4],xmm15[5],xmm14[5],xmm15[6],xmm14[6],xmm15[7],xmm14[7]
+; AVX1-NEXT: vpslld $16, %xmm0, %xmm0
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm9[0],xmm0[1],xmm9[2,3,4,5,6,7]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm8[2,3],xmm0[4,5,6,7]
+; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm1[0],xmm10[0],xmm1[1],xmm10[1],xmm1[2],xmm10[2],xmm1[3],xmm10[3],xmm1[4],xmm10[4],xmm1[5],xmm10[5],xmm1[6],xmm10[6],xmm1[7],xmm10[7]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,2,0]
+; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
+; AVX1-NEXT: vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6],xmm2[7]
+; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3],xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7]
+; AVX1-NEXT: vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4,5]
+; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3],xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,1,0,1]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3,4],xmm2[5],xmm3[6,7]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3,4,5],xmm1[6,7]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
; AVX1-NEXT: vmovdqu %xmm0, (%rax)
; AVX1-NEXT: popq %rbx
; AVX1-NEXT: popq %r12
; AVX2-NEXT: pushq %r13
; AVX2-NEXT: pushq %r12
; AVX2-NEXT: pushq %rbx
+; AVX2-NEXT: subq $16, %rsp
; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
-; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
-; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm10 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
-; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm1
+; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
+; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm3
+; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero
+; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm4
+; AVX2-NEXT: vpextrq $1, %xmm4, %rbx
+; AVX2-NEXT: vmovq %xmm4, %rbp
+; AVX2-NEXT: vpextrq $1, %xmm3, %rdi
+; AVX2-NEXT: vmovq %xmm3, %rcx
+; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
+; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm3
+; AVX2-NEXT: vpextrq $1, %xmm3, %rdx
+; AVX2-NEXT: vmovq %xmm3, %r9
+; AVX2-NEXT: vpextrq $1, %xmm2, %r13
+; AVX2-NEXT: vmovq %xmm2, %r12
; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
-; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm3
-; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm5 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero
-; AVX2-NEXT: vextracti128 $1, %ymm5, %xmm4
-; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm9 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
-; AVX2-NEXT: vextracti128 $1, %ymm9, %xmm7
-; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm1
-; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
-; AVX2-NEXT: vpextrq $1, %xmm2, %r15
-; AVX2-NEXT: vmovq %xmm2, %r14
+; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
+; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm3
+; AVX2-NEXT: vpextrq $1, %xmm3, %r14
+; AVX2-NEXT: vmovq %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
+; AVX2-NEXT: vpextrq $1, %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
+; AVX2-NEXT: vmovq %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
+; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
; AVX2-NEXT: vpextrq $1, %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
-; AVX2-NEXT: vmovq %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
-; AVX2-NEXT: vextracti128 $1, %ymm10, %xmm1
-; AVX2-NEXT: vpextrq $1, %xmm1, %r13
-; AVX2-NEXT: vmovq %xmm1, %r11
-; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm11 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
+; AVX2-NEXT: vmovq %xmm1, %r10
+; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm1
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2
+; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
+; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm3
+; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero
+; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm4
+; AVX2-NEXT: vpextrq $1, %xmm4, %rax
+; AVX2-NEXT: addq %rbx, %rax
+; AVX2-NEXT: movq %rax, %rbx
+; AVX2-NEXT: vmovq %xmm4, %rsi
+; AVX2-NEXT: addq %rbp, %rsi
+; AVX2-NEXT: vpextrq $1, %xmm3, %rax
+; AVX2-NEXT: addq %rdi, %rax
+; AVX2-NEXT: movq %rax, %rdi
+; AVX2-NEXT: vmovq %xmm3, %r11
+; AVX2-NEXT: addq %rcx, %r11
+; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
+; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm3
+; AVX2-NEXT: vpextrq $1, %xmm3, %rcx
+; AVX2-NEXT: addq %rdx, %rcx
+; AVX2-NEXT: vmovq %xmm3, %r8
+; AVX2-NEXT: addq %r9, %r8
+; AVX2-NEXT: vpextrq $1, %xmm2, %r9
+; AVX2-NEXT: addq %r13, %r9
+; AVX2-NEXT: vmovq %xmm2, %r15
+; AVX2-NEXT: addq %r12, %r15
; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm8 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
-; AVX2-NEXT: vextracti128 $1, %ymm8, %xmm1
-; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm6
-; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm0
-; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; AVX2-NEXT: vmovd %xmm9, %r12d
-; AVX2-NEXT: vpextrd $2, %xmm9, %r9d
-; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm0
-; AVX2-NEXT: vmovd %xmm7, %ecx
-; AVX2-NEXT: vpextrd $2, %xmm7, %edi
-; AVX2-NEXT: vmovd %xmm5, %ebx
-; AVX2-NEXT: vpextrd $2, %xmm5, %esi
-; AVX2-NEXT: vmovd %xmm4, %edx
-; AVX2-NEXT: vpextrd $2, %xmm4, %ebp
-; AVX2-NEXT: vpextrd $2, %xmm1, %eax
-; AVX2-NEXT: leal -1(%rbp,%rax), %eax
-; AVX2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; AVX2-NEXT: vmovd %xmm1, %eax
-; AVX2-NEXT: leal -1(%rdx,%rax), %eax
-; AVX2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; AVX2-NEXT: vpextrd $2, %xmm8, %eax
-; AVX2-NEXT: leal -1(%rsi,%rax), %eax
-; AVX2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; AVX2-NEXT: vmovd %xmm8, %eax
-; AVX2-NEXT: leal -1(%rbx,%rax), %r10d
-; AVX2-NEXT: vpextrd $2, %xmm6, %eax
-; AVX2-NEXT: leal -1(%rdi,%rax), %r8d
-; AVX2-NEXT: vmovd %xmm6, %eax
-; AVX2-NEXT: leal -1(%rcx,%rax), %edi
-; AVX2-NEXT: vpextrd $2, %xmm3, %eax
-; AVX2-NEXT: leal -1(%r9,%rax), %r9d
-; AVX2-NEXT: vmovd %xmm3, %ecx
-; AVX2-NEXT: leal -1(%r12,%rcx), %r12d
-; AVX2-NEXT: vpextrq $1, %xmm0, %rcx
-; AVX2-NEXT: leal -1(%r15,%rcx), %r15d
-; AVX2-NEXT: vmovq %xmm0, %rcx
-; AVX2-NEXT: leal -1(%r14,%rcx), %r14d
-; AVX2-NEXT: vpextrq $1, %xmm2, %rdx
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; AVX2-NEXT: leal -1(%rax,%rdx), %edx
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2
+; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
+; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm3
+; AVX2-NEXT: vpextrq $1, %xmm3, %rax
+; AVX2-NEXT: addq %r14, %rax
+; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT: vmovq %xmm3, %rax
+; AVX2-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Folded Reload
+; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT: vpextrq $1, %xmm2, %rax
+; AVX2-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Folded Reload
+; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; AVX2-NEXT: vmovq %xmm2, %rax
-; AVX2-NEXT: vextracti128 $1, %ymm11, %xmm0
+; AVX2-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Folded Reload
+; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; AVX2-NEXT: vpextrq $1, %xmm0, %rbp
+; AVX2-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Folded Reload
+; AVX2-NEXT: vmovq %xmm0, %r12
+; AVX2-NEXT: addq %r10, %r12
+; AVX2-NEXT: vpextrq $1, %xmm1, %rax
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
+; AVX2-NEXT: vpextrq $1, %xmm0, %r10
+; AVX2-NEXT: addq %rax, %r10
+; AVX2-NEXT: vmovq %xmm1, %rax
+; AVX2-NEXT: vmovq %xmm0, %rdx
+; AVX2-NEXT: addq %rax, %rdx
+; AVX2-NEXT: addq $-1, %rbx
+; AVX2-NEXT: movq %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT: movl $0, %eax
+; AVX2-NEXT: adcq $-1, %rax
+; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT: addq $-1, %rsi
+; AVX2-NEXT: movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT: movl $0, %eax
+; AVX2-NEXT: adcq $-1, %rax
+; AVX2-NEXT: movq %rax, (%rsp) # 8-byte Spill
+; AVX2-NEXT: addq $-1, %rdi
+; AVX2-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT: movl $0, %eax
+; AVX2-NEXT: adcq $-1, %rax
+; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT: addq $-1, %r11
+; AVX2-NEXT: movq %r11, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT: movl $0, %eax
+; AVX2-NEXT: adcq $-1, %rax
+; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT: addq $-1, %rcx
+; AVX2-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT: movl $0, %eax
+; AVX2-NEXT: adcq $-1, %rax
+; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT: addq $-1, %r8
+; AVX2-NEXT: movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT: movl $0, %eax
+; AVX2-NEXT: adcq $-1, %rax
+; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT: addq $-1, %r9
+; AVX2-NEXT: movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT: movl $0, %eax
+; AVX2-NEXT: adcq $-1, %rax
+; AVX2-NEXT: movq %rax, %rsi
+; AVX2-NEXT: addq $-1, %r15
+; AVX2-NEXT: movq %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT: movl $0, %r15d
+; AVX2-NEXT: adcq $-1, %r15
+; AVX2-NEXT: addq $-1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
+; AVX2-NEXT: movl $0, %r13d
+; AVX2-NEXT: adcq $-1, %r13
+; AVX2-NEXT: addq $-1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
+; AVX2-NEXT: movl $0, %r14d
+; AVX2-NEXT: adcq $-1, %r14
+; AVX2-NEXT: addq $-1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
+; AVX2-NEXT: movl $0, %ebx
+; AVX2-NEXT: adcq $-1, %rbx
+; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; AVX2-NEXT: addq $-1, %rax
+; AVX2-NEXT: movl $0, %r11d
+; AVX2-NEXT: adcq $-1, %r11
+; AVX2-NEXT: addq $-1, %rbp
+; AVX2-NEXT: movl $0, %r9d
+; AVX2-NEXT: adcq $-1, %r9
+; AVX2-NEXT: addq $-1, %r12
+; AVX2-NEXT: movl $0, %r8d
+; AVX2-NEXT: adcq $-1, %r8
+; AVX2-NEXT: addq $-1, %r10
+; AVX2-NEXT: movl $0, %edi
+; AVX2-NEXT: adcq $-1, %rdi
+; AVX2-NEXT: addq $-1, %rdx
+; AVX2-NEXT: movl $0, %ecx
+; AVX2-NEXT: adcq $-1, %rcx
+; AVX2-NEXT: shldq $63, %rdx, %rcx
+; AVX2-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT: shldq $63, %r10, %rdi
+; AVX2-NEXT: shldq $63, %r12, %r8
+; AVX2-NEXT: shldq $63, %rbp, %r9
+; AVX2-NEXT: shldq $63, %rax, %r11
+; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
+; AVX2-NEXT: shldq $63, %rdx, %rbx
+; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
+; AVX2-NEXT: shldq $63, %rdx, %r14
+; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
+; AVX2-NEXT: shldq $63, %rdx, %r13
+; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; AVX2-NEXT: shldq $63, %rax, %r15
+; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; AVX2-NEXT: shldq $63, %rax, %rsi
+; AVX2-NEXT: movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
+; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; AVX2-NEXT: shldq $63, %rax, %rsi
+; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Reload
+; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; AVX2-NEXT: shldq $63, %rax, %r12
; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
-; AVX2-NEXT: leal -1(%rcx,%rax), %eax
-; AVX2-NEXT: vpextrq $1, %xmm0, %rsi
-; AVX2-NEXT: leal -1(%r13,%rsi), %esi
-; AVX2-NEXT: vmovq %xmm0, %rbx
-; AVX2-NEXT: leal -1(%r11,%rbx), %ebx
-; AVX2-NEXT: vpextrq $1, %xmm10, %rcx
-; AVX2-NEXT: vpextrq $1, %xmm11, %r13
-; AVX2-NEXT: leal -1(%rcx,%r13), %ecx
-; AVX2-NEXT: vmovq %xmm10, %r13
-; AVX2-NEXT: vmovq %xmm11, %r11
-; AVX2-NEXT: leaq -1(%r13,%r11), %rbp
-; AVX2-NEXT: shrq %rbp
-; AVX2-NEXT: vmovd %ebp, %xmm0
-; AVX2-NEXT: shrl %ecx
-; AVX2-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
-; AVX2-NEXT: shrl %ebx
-; AVX2-NEXT: vpinsrb $2, %ebx, %xmm0, %xmm0
-; AVX2-NEXT: shrl %esi
-; AVX2-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
-; AVX2-NEXT: shrl %eax
-; AVX2-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0
-; AVX2-NEXT: shrl %edx
-; AVX2-NEXT: vpinsrb $5, %edx, %xmm0, %xmm0
-; AVX2-NEXT: shrl %r14d
-; AVX2-NEXT: vpinsrb $6, %r14d, %xmm0, %xmm0
-; AVX2-NEXT: shrl %r15d
-; AVX2-NEXT: vpinsrb $7, %r15d, %xmm0, %xmm0
-; AVX2-NEXT: shrl %r12d
-; AVX2-NEXT: vpinsrb $8, %r12d, %xmm0, %xmm0
-; AVX2-NEXT: shrl %r9d
-; AVX2-NEXT: vpinsrb $9, %r9d, %xmm0, %xmm0
-; AVX2-NEXT: shrl %edi
-; AVX2-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
-; AVX2-NEXT: shrl %r8d
-; AVX2-NEXT: vpinsrb $11, %r8d, %xmm0, %xmm0
-; AVX2-NEXT: shrl %r10d
-; AVX2-NEXT: vpinsrb $12, %r10d, %xmm0, %xmm0
-; AVX2-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload
-; AVX2-NEXT: shrl %eax
-; AVX2-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0
-; AVX2-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload
-; AVX2-NEXT: shrl %eax
-; AVX2-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0
-; AVX2-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload
-; AVX2-NEXT: shrl %eax
-; AVX2-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; AVX2-NEXT: shldq $63, %rax, %rcx
+; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload
+; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; AVX2-NEXT: shldq $63, %rax, %r10
+; AVX2-NEXT: movq (%rsp), %rax # 8-byte Reload
+; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
+; AVX2-NEXT: shldq $63, %rdx, %rax
+; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
+; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Reload
+; AVX2-NEXT: shldq $63, %rdx, %rbp
+; AVX2-NEXT: vmovq %rbp, %xmm8
+; AVX2-NEXT: vmovq %rax, %xmm9
+; AVX2-NEXT: vmovq %r10, %xmm0
+; AVX2-NEXT: vmovq %rcx, %xmm1
+; AVX2-NEXT: vmovq %r12, %xmm12
+; AVX2-NEXT: vmovq %rsi, %xmm13
+; AVX2-NEXT: vmovq {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 8-byte Folded Reload
+; AVX2-NEXT: # xmm14 = mem[0],zero
+; AVX2-NEXT: vmovq %r15, %xmm15
+; AVX2-NEXT: vmovq %r13, %xmm10
+; AVX2-NEXT: vmovq %r14, %xmm11
+; AVX2-NEXT: vmovq %rbx, %xmm2
+; AVX2-NEXT: vmovq %r11, %xmm3
+; AVX2-NEXT: vmovq %r9, %xmm4
+; AVX2-NEXT: vmovq %r8, %xmm5
+; AVX2-NEXT: vmovq %rdi, %xmm6
+; AVX2-NEXT: vmovq {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 8-byte Folded Reload
+; AVX2-NEXT: # xmm7 = mem[0],zero
+; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3],xmm9[4],xmm8[4],xmm9[5],xmm8[5],xmm9[6],xmm8[6],xmm9[7],xmm8[7]
+; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm9 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; AVX2-NEXT: vpbroadcastw %xmm8, %xmm8
+; AVX2-NEXT: vpbroadcastw %xmm9, %xmm0
+; AVX2-NEXT: vpblendw {{.*#+}} xmm8 = xmm0[0,1,2,3,4,5,6],xmm8[7]
+; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3],xmm13[4],xmm12[4],xmm13[5],xmm12[5],xmm13[6],xmm12[6],xmm13[7],xmm12[7]
+; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm9 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3],xmm15[4],xmm14[4],xmm15[5],xmm14[5],xmm15[6],xmm14[6],xmm15[7],xmm14[7]
+; AVX2-NEXT: vpbroadcastw %xmm0, %xmm0
+; AVX2-NEXT: vpbroadcastw %xmm9, %xmm1
+; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3,4],xmm0[5],xmm1[6,7]
+; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],xmm8[3]
+; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3],xmm11[4],xmm10[4],xmm11[5],xmm10[5],xmm11[6],xmm10[6],xmm11[7],xmm10[7]
+; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
+; AVX2-NEXT: vpbroadcastw %xmm1, %xmm1
+; AVX2-NEXT: vpbroadcastw %xmm2, %xmm2
+; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3],xmm2[4,5,6,7]
+; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3],xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7]
+; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3],xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7]
+; AVX2-NEXT: vpbroadcastw %xmm3, %xmm3
+; AVX2-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2,3,4,5,6,7]
+; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2,3]
+; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
; AVX2-NEXT: vmovdqu %xmm0, (%rax)
+; AVX2-NEXT: addq $16, %rsp
; AVX2-NEXT: popq %rbx
; AVX2-NEXT: popq %r12
; AVX2-NEXT: popq %r13
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
-; AVX512-LABEL: not_avg_v16i8_wide_constants:
-; AVX512: # %bb.0:
-; AVX512-NEXT: pushq %rbp
-; AVX512-NEXT: pushq %r15
-; AVX512-NEXT: pushq %r14
-; AVX512-NEXT: pushq %r13
-; AVX512-NEXT: pushq %r12
-; AVX512-NEXT: pushq %rbx
-; AVX512-NEXT: vpmovzxbw {{.*#+}} ymm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
-; AVX512-NEXT: vpmovzxbw {{.*#+}} ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
-; AVX512-NEXT: vpmovzxwd {{.*#+}} ymm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
-; AVX512-NEXT: vpmovzxdq {{.*#+}} ymm10 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
-; AVX512-NEXT: vextracti128 $1, %ymm1, %xmm1
-; AVX512-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
-; AVX512-NEXT: vextracti128 $1, %ymm1, %xmm3
-; AVX512-NEXT: vpmovzxdq {{.*#+}} ymm5 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero
-; AVX512-NEXT: vextracti128 $1, %ymm5, %xmm4
-; AVX512-NEXT: vpmovzxdq {{.*#+}} ymm9 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
-; AVX512-NEXT: vextracti128 $1, %ymm9, %xmm7
-; AVX512-NEXT: vextracti128 $1, %ymm2, %xmm1
-; AVX512-NEXT: vpmovzxdq {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
-; AVX512-NEXT: vextracti128 $1, %ymm1, %xmm2
-; AVX512-NEXT: vpextrq $1, %xmm2, %r15
-; AVX512-NEXT: vmovq %xmm2, %r14
-; AVX512-NEXT: vpextrq $1, %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
-; AVX512-NEXT: vmovq %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
-; AVX512-NEXT: vextracti128 $1, %ymm10, %xmm1
-; AVX512-NEXT: vpextrq $1, %xmm1, %r13
-; AVX512-NEXT: vmovq %xmm1, %r11
-; AVX512-NEXT: vpmovzxwd {{.*#+}} ymm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; AVX512-NEXT: vpmovzxdq {{.*#+}} ymm11 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
-; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm0
-; AVX512-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX512-NEXT: vpmovzxdq {{.*#+}} ymm8 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
-; AVX512-NEXT: vextracti128 $1, %ymm8, %xmm1
-; AVX512-NEXT: vpmovzxdq {{.*#+}} ymm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; AVX512-NEXT: vextracti128 $1, %ymm3, %xmm6
-; AVX512-NEXT: vextracti128 $1, %ymm2, %xmm0
-; AVX512-NEXT: vpmovzxdq {{.*#+}} ymm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; AVX512-NEXT: vmovd %xmm9, %r12d
-; AVX512-NEXT: vpextrd $2, %xmm9, %r9d
-; AVX512-NEXT: vextracti128 $1, %ymm2, %xmm0
-; AVX512-NEXT: vmovd %xmm7, %ecx
-; AVX512-NEXT: vpextrd $2, %xmm7, %edi
-; AVX512-NEXT: vmovd %xmm5, %ebx
-; AVX512-NEXT: vpextrd $2, %xmm5, %esi
-; AVX512-NEXT: vmovd %xmm4, %edx
-; AVX512-NEXT: vpextrd $2, %xmm4, %ebp
-; AVX512-NEXT: vpextrd $2, %xmm1, %eax
-; AVX512-NEXT: leal -1(%rbp,%rax), %eax
-; AVX512-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; AVX512-NEXT: vmovd %xmm1, %eax
-; AVX512-NEXT: leal -1(%rdx,%rax), %eax
-; AVX512-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; AVX512-NEXT: vpextrd $2, %xmm8, %eax
-; AVX512-NEXT: leal -1(%rsi,%rax), %eax
-; AVX512-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; AVX512-NEXT: vmovd %xmm8, %eax
-; AVX512-NEXT: leal -1(%rbx,%rax), %r10d
-; AVX512-NEXT: vpextrd $2, %xmm6, %eax
-; AVX512-NEXT: leal -1(%rdi,%rax), %r8d
-; AVX512-NEXT: vmovd %xmm6, %eax
-; AVX512-NEXT: leal -1(%rcx,%rax), %edi
-; AVX512-NEXT: vpextrd $2, %xmm3, %eax
-; AVX512-NEXT: leal -1(%r9,%rax), %r9d
-; AVX512-NEXT: vmovd %xmm3, %ecx
-; AVX512-NEXT: leal -1(%r12,%rcx), %r12d
-; AVX512-NEXT: vpextrq $1, %xmm0, %rcx
-; AVX512-NEXT: leal -1(%r15,%rcx), %r15d
-; AVX512-NEXT: vmovq %xmm0, %rcx
-; AVX512-NEXT: leal -1(%r14,%rcx), %r14d
-; AVX512-NEXT: vpextrq $1, %xmm2, %rdx
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; AVX512-NEXT: leal -1(%rax,%rdx), %edx
-; AVX512-NEXT: vmovq %xmm2, %rax
-; AVX512-NEXT: vextracti128 $1, %ymm11, %xmm0
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
-; AVX512-NEXT: leal -1(%rcx,%rax), %eax
-; AVX512-NEXT: vpextrq $1, %xmm0, %rsi
-; AVX512-NEXT: leal -1(%r13,%rsi), %esi
-; AVX512-NEXT: vmovq %xmm0, %rbx
-; AVX512-NEXT: leal -1(%r11,%rbx), %ebx
-; AVX512-NEXT: vpextrq $1, %xmm10, %rcx
-; AVX512-NEXT: vpextrq $1, %xmm11, %r13
-; AVX512-NEXT: leal -1(%rcx,%r13), %ecx
-; AVX512-NEXT: vmovq %xmm10, %r13
-; AVX512-NEXT: vmovq %xmm11, %r11
-; AVX512-NEXT: leaq -1(%r13,%r11), %rbp
-; AVX512-NEXT: shrq %rbp
-; AVX512-NEXT: vmovd %ebp, %xmm0
-; AVX512-NEXT: shrl %ecx
-; AVX512-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
-; AVX512-NEXT: shrl %ebx
-; AVX512-NEXT: vpinsrb $2, %ebx, %xmm0, %xmm0
-; AVX512-NEXT: shrl %esi
-; AVX512-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
-; AVX512-NEXT: shrl %eax
-; AVX512-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0
-; AVX512-NEXT: shrl %edx
-; AVX512-NEXT: vpinsrb $5, %edx, %xmm0, %xmm0
-; AVX512-NEXT: shrl %r14d
-; AVX512-NEXT: vpinsrb $6, %r14d, %xmm0, %xmm0
-; AVX512-NEXT: shrl %r15d
-; AVX512-NEXT: vpinsrb $7, %r15d, %xmm0, %xmm0
-; AVX512-NEXT: shrl %r12d
-; AVX512-NEXT: vpinsrb $8, %r12d, %xmm0, %xmm0
-; AVX512-NEXT: shrl %r9d
-; AVX512-NEXT: vpinsrb $9, %r9d, %xmm0, %xmm0
-; AVX512-NEXT: shrl %edi
-; AVX512-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
-; AVX512-NEXT: shrl %r8d
-; AVX512-NEXT: vpinsrb $11, %r8d, %xmm0, %xmm0
-; AVX512-NEXT: shrl %r10d
-; AVX512-NEXT: vpinsrb $12, %r10d, %xmm0, %xmm0
-; AVX512-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload
-; AVX512-NEXT: shrl %eax
-; AVX512-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0
-; AVX512-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload
-; AVX512-NEXT: shrl %eax
-; AVX512-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0
-; AVX512-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload
-; AVX512-NEXT: shrl %eax
-; AVX512-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
-; AVX512-NEXT: vmovdqu %xmm0, (%rax)
-; AVX512-NEXT: popq %rbx
-; AVX512-NEXT: popq %r12
-; AVX512-NEXT: popq %r13
-; AVX512-NEXT: popq %r14
-; AVX512-NEXT: popq %r15
-; AVX512-NEXT: popq %rbp
-; AVX512-NEXT: vzeroupper
-; AVX512-NEXT: retq
+; AVX512F-LABEL: not_avg_v16i8_wide_constants:
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: pushq %rbp
+; AVX512F-NEXT: pushq %r15
+; AVX512F-NEXT: pushq %r14
+; AVX512F-NEXT: pushq %r13
+; AVX512F-NEXT: pushq %r12
+; AVX512F-NEXT: pushq %rbx
+; AVX512F-NEXT: vpmovzxbw {{.*#+}} ymm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
+; AVX512F-NEXT: vpmovzxbw {{.*#+}} ymm3 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
+; AVX512F-NEXT: vpmovzxwd {{.*#+}} ymm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
+; AVX512F-NEXT: vpmovzxdq {{.*#+}} ymm0 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
+; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm1
+; AVX512F-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
+; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm4
+; AVX512F-NEXT: vpmovzxdq {{.*#+}} ymm4 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero
+; AVX512F-NEXT: vextracti128 $1, %ymm4, %xmm5
+; AVX512F-NEXT: vpextrq $1, %xmm5, %rdx
+; AVX512F-NEXT: vmovq %xmm5, %rcx
+; AVX512F-NEXT: vpextrq $1, %xmm4, %rax
+; AVX512F-NEXT: vmovq %xmm4, %rbx
+; AVX512F-NEXT: vpmovzxdq {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
+; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm4
+; AVX512F-NEXT: vpextrq $1, %xmm4, %rdi
+; AVX512F-NEXT: vmovq %xmm4, %rsi
+; AVX512F-NEXT: vpextrq $1, %xmm1, %r13
+; AVX512F-NEXT: vmovq %xmm1, %r15
+; AVX512F-NEXT: vextracti128 $1, %ymm2, %xmm1
+; AVX512F-NEXT: vpmovzxdq {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
+; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX512F-NEXT: vpextrq $1, %xmm2, %r12
+; AVX512F-NEXT: vmovq %xmm2, %r14
+; AVX512F-NEXT: vpextrq $1, %xmm1, %r11
+; AVX512F-NEXT: vmovq %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
+; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX512F-NEXT: vpextrq $1, %xmm1, %r10
+; AVX512F-NEXT: vmovq %xmm1, %r9
+; AVX512F-NEXT: vpmovzxwd {{.*#+}} ymm2 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero
+; AVX512F-NEXT: vpmovzxdq {{.*#+}} ymm1 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
+; AVX512F-NEXT: vextracti128 $1, %ymm3, %xmm3
+; AVX512F-NEXT: vpmovzxwd {{.*#+}} ymm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero
+; AVX512F-NEXT: vextracti128 $1, %ymm3, %xmm4
+; AVX512F-NEXT: vpmovzxdq {{.*#+}} ymm4 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero
+; AVX512F-NEXT: vextracti128 $1, %ymm4, %xmm5
+; AVX512F-NEXT: vpextrq $1, %xmm5, %rbp
+; AVX512F-NEXT: leal -1(%rdx,%rbp), %edx
+; AVX512F-NEXT: movl %edx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; AVX512F-NEXT: vmovq %xmm5, %rbp
+; AVX512F-NEXT: leal -1(%rcx,%rbp), %ecx
+; AVX512F-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; AVX512F-NEXT: vpextrq $1, %xmm4, %rbp
+; AVX512F-NEXT: leal -1(%rax,%rbp), %eax
+; AVX512F-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; AVX512F-NEXT: vmovq %xmm4, %rbp
+; AVX512F-NEXT: vpmovzxdq {{.*#+}} ymm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero
+; AVX512F-NEXT: vextracti128 $1, %ymm3, %xmm4
+; AVX512F-NEXT: leal -1(%rbx,%rbp), %r8d
+; AVX512F-NEXT: vpextrq $1, %xmm4, %rbp
+; AVX512F-NEXT: leal -1(%rdi,%rbp), %edi
+; AVX512F-NEXT: vmovq %xmm4, %rbp
+; AVX512F-NEXT: leal -1(%rsi,%rbp), %esi
+; AVX512F-NEXT: vpextrq $1, %xmm3, %rbp
+; AVX512F-NEXT: leal -1(%r13,%rbp), %r13d
+; AVX512F-NEXT: vmovq %xmm3, %rbp
+; AVX512F-NEXT: vextracti128 $1, %ymm2, %xmm2
+; AVX512F-NEXT: vpmovzxdq {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
+; AVX512F-NEXT: vextracti128 $1, %ymm2, %xmm3
+; AVX512F-NEXT: leal -1(%r15,%rbp), %r15d
+; AVX512F-NEXT: vpextrq $1, %xmm3, %rbp
+; AVX512F-NEXT: leal -1(%r12,%rbp), %r12d
+; AVX512F-NEXT: vmovq %xmm3, %rbp
+; AVX512F-NEXT: leal -1(%r14,%rbp), %r14d
+; AVX512F-NEXT: vpextrq $1, %xmm2, %rdx
+; AVX512F-NEXT: leal -1(%r11,%rdx), %r11d
+; AVX512F-NEXT: vmovq %xmm2, %rbp
+; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX512F-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; AVX512F-NEXT: leal -1(%rax,%rbp), %ebp
+; AVX512F-NEXT: vpextrq $1, %xmm2, %rcx
+; AVX512F-NEXT: leal -1(%r10,%rcx), %ecx
+; AVX512F-NEXT: vmovq %xmm2, %rax
+; AVX512F-NEXT: leal -1(%r9,%rax), %eax
+; AVX512F-NEXT: vpextrq $1, %xmm0, %rdx
+; AVX512F-NEXT: vpextrq $1, %xmm1, %r10
+; AVX512F-NEXT: leal -1(%rdx,%r10), %edx
+; AVX512F-NEXT: vmovq %xmm0, %r10
+; AVX512F-NEXT: vmovq %xmm1, %r9
+; AVX512F-NEXT: leaq -1(%r10,%r9), %rbx
+; AVX512F-NEXT: shrq %rbx
+; AVX512F-NEXT: vmovd %ebx, %xmm0
+; AVX512F-NEXT: shrl %edx
+; AVX512F-NEXT: vpinsrb $1, %edx, %xmm0, %xmm0
+; AVX512F-NEXT: shrl %eax
+; AVX512F-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
+; AVX512F-NEXT: shrl %ecx
+; AVX512F-NEXT: vpinsrb $3, %ecx, %xmm0, %xmm0
+; AVX512F-NEXT: shrl %ebp
+; AVX512F-NEXT: vpinsrb $4, %ebp, %xmm0, %xmm0
+; AVX512F-NEXT: shrl %r11d
+; AVX512F-NEXT: vpinsrb $5, %r11d, %xmm0, %xmm0
+; AVX512F-NEXT: shrl %r14d
+; AVX512F-NEXT: vpinsrb $6, %r14d, %xmm0, %xmm0
+; AVX512F-NEXT: shrl %r12d
+; AVX512F-NEXT: vpinsrb $7, %r12d, %xmm0, %xmm0
+; AVX512F-NEXT: shrl %r15d
+; AVX512F-NEXT: vpinsrb $8, %r15d, %xmm0, %xmm0
+; AVX512F-NEXT: shrl %r13d
+; AVX512F-NEXT: vpinsrb $9, %r13d, %xmm0, %xmm0
+; AVX512F-NEXT: shrl %esi
+; AVX512F-NEXT: vpinsrb $10, %esi, %xmm0, %xmm0
+; AVX512F-NEXT: shrl %edi
+; AVX512F-NEXT: vpinsrb $11, %edi, %xmm0, %xmm0
+; AVX512F-NEXT: shrl %r8d
+; AVX512F-NEXT: vpinsrb $12, %r8d, %xmm0, %xmm0
+; AVX512F-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload
+; AVX512F-NEXT: shrl %eax
+; AVX512F-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0
+; AVX512F-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload
+; AVX512F-NEXT: shrl %eax
+; AVX512F-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0
+; AVX512F-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload
+; AVX512F-NEXT: shrl %eax
+; AVX512F-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; AVX512F-NEXT: vmovdqu %xmm0, (%rax)
+; AVX512F-NEXT: popq %rbx
+; AVX512F-NEXT: popq %r12
+; AVX512F-NEXT: popq %r13
+; AVX512F-NEXT: popq %r14
+; AVX512F-NEXT: popq %r15
+; AVX512F-NEXT: popq %rbp
+; AVX512F-NEXT: vzeroupper
+; AVX512F-NEXT: retq
+;
+; AVX512BW-LABEL: not_avg_v16i8_wide_constants:
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: pushq %rbp
+; AVX512BW-NEXT: pushq %r15
+; AVX512BW-NEXT: pushq %r14
+; AVX512BW-NEXT: pushq %r13
+; AVX512BW-NEXT: pushq %r12
+; AVX512BW-NEXT: pushq %rbx
+; AVX512BW-NEXT: subq $24, %rsp
+; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
+; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
+; AVX512BW-NEXT: vpmovzxwd {{.*#+}} ymm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX512BW-NEXT: vpmovzxdq {{.*#+}} ymm3 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
+; AVX512BW-NEXT: vextracti128 $1, %ymm3, %xmm4
+; AVX512BW-NEXT: vmovq %xmm4, %rbx
+; AVX512BW-NEXT: vpextrq $1, %xmm4, %rbp
+; AVX512BW-NEXT: vmovq %xmm3, %rdi
+; AVX512BW-NEXT: vpextrq $1, %xmm3, %rsi
+; AVX512BW-NEXT: vextracti128 $1, %ymm2, %xmm2
+; AVX512BW-NEXT: vpmovzxdq {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
+; AVX512BW-NEXT: vextracti128 $1, %ymm2, %xmm3
+; AVX512BW-NEXT: vmovq %xmm3, %rdx
+; AVX512BW-NEXT: vpextrq $1, %xmm3, %r15
+; AVX512BW-NEXT: vmovq %xmm2, %r8
+; AVX512BW-NEXT: vpextrq $1, %xmm2, %r14
+; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm0
+; AVX512BW-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX512BW-NEXT: vpmovzxdq {{.*#+}} ymm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; AVX512BW-NEXT: vextracti128 $1, %ymm2, %xmm3
+; AVX512BW-NEXT: vmovq %xmm3, %r9
+; AVX512BW-NEXT: vpextrq $1, %xmm3, %r10
+; AVX512BW-NEXT: vmovq %xmm2, %r11
+; AVX512BW-NEXT: vpextrq $1, %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
+; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm0
+; AVX512BW-NEXT: vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm2
+; AVX512BW-NEXT: vmovq %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
+; AVX512BW-NEXT: vpextrq $1, %xmm2, %r13
+; AVX512BW-NEXT: vpmovzxwd {{.*#+}} ymm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
+; AVX512BW-NEXT: vpmovzxdq {{.*#+}} ymm3 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
+; AVX512BW-NEXT: vextracti128 $1, %ymm3, %xmm4
+; AVX512BW-NEXT: vmovq %xmm4, %rax
+; AVX512BW-NEXT: addq %rbx, %rax
+; AVX512BW-NEXT: movq %rax, %rbx
+; AVX512BW-NEXT: vpextrq $1, %xmm4, %rax
+; AVX512BW-NEXT: addq %rbp, %rax
+; AVX512BW-NEXT: movq %rax, %rbp
+; AVX512BW-NEXT: vmovq %xmm3, %rcx
+; AVX512BW-NEXT: addq %rdi, %rcx
+; AVX512BW-NEXT: vpextrq $1, %xmm3, %r12
+; AVX512BW-NEXT: addq %rsi, %r12
+; AVX512BW-NEXT: vextracti128 $1, %ymm2, %xmm2
+; AVX512BW-NEXT: vpmovzxdq {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
+; AVX512BW-NEXT: vextracti128 $1, %ymm2, %xmm3
+; AVX512BW-NEXT: vmovq %xmm3, %rax
+; AVX512BW-NEXT: addq %rdx, %rax
+; AVX512BW-NEXT: movq %rax, %rdx
+; AVX512BW-NEXT: vpextrq $1, %xmm3, %rax
+; AVX512BW-NEXT: addq %r15, %rax
+; AVX512BW-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512BW-NEXT: vmovq %xmm2, %rax
+; AVX512BW-NEXT: addq %r8, %rax
+; AVX512BW-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512BW-NEXT: vpextrq $1, %xmm2, %rax
+; AVX512BW-NEXT: addq %r14, %rax
+; AVX512BW-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512BW-NEXT: vextracti128 $1, %ymm1, %xmm1
+; AVX512BW-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
+; AVX512BW-NEXT: vpmovzxdq {{.*#+}} ymm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
+; AVX512BW-NEXT: vextracti128 $1, %ymm2, %xmm3
+; AVX512BW-NEXT: vmovq %xmm3, %rax
+; AVX512BW-NEXT: addq %r9, %rax
+; AVX512BW-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512BW-NEXT: vpextrq $1, %xmm3, %rax
+; AVX512BW-NEXT: addq %r10, %rax
+; AVX512BW-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512BW-NEXT: vmovq %xmm2, %rax
+; AVX512BW-NEXT: addq %r11, %rax
+; AVX512BW-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512BW-NEXT: vpextrq $1, %xmm2, %r14
+; AVX512BW-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Folded Reload
+; AVX512BW-NEXT: vextracti128 $1, %ymm1, %xmm1
+; AVX512BW-NEXT: vpmovzxdq {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
+; AVX512BW-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX512BW-NEXT: vmovq %xmm2, %r10
+; AVX512BW-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Folded Reload
+; AVX512BW-NEXT: vpextrq $1, %xmm2, %r9
+; AVX512BW-NEXT: addq %r13, %r9
+; AVX512BW-NEXT: vmovq %xmm0, %rax
+; AVX512BW-NEXT: vmovq %xmm1, %r8
+; AVX512BW-NEXT: addq %rax, %r8
+; AVX512BW-NEXT: vpextrq $1, %xmm0, %rdi
+; AVX512BW-NEXT: vpextrq $1, %xmm1, %rsi
+; AVX512BW-NEXT: addq %rdi, %rsi
+; AVX512BW-NEXT: addq $-1, %rbx
+; AVX512BW-NEXT: movq %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512BW-NEXT: movl $0, %r15d
+; AVX512BW-NEXT: adcq $-1, %r15
+; AVX512BW-NEXT: addq $-1, %rbp
+; AVX512BW-NEXT: movq %rbp, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512BW-NEXT: movl $0, %ebx
+; AVX512BW-NEXT: adcq $-1, %rbx
+; AVX512BW-NEXT: addq $-1, %rcx
+; AVX512BW-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512BW-NEXT: movl $0, %r11d
+; AVX512BW-NEXT: adcq $-1, %r11
+; AVX512BW-NEXT: addq $-1, %r12
+; AVX512BW-NEXT: movq %r12, (%rsp) # 8-byte Spill
+; AVX512BW-NEXT: movl $0, %edi
+; AVX512BW-NEXT: adcq $-1, %rdi
+; AVX512BW-NEXT: addq $-1, %rdx
+; AVX512BW-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512BW-NEXT: movl $0, %eax
+; AVX512BW-NEXT: adcq $-1, %rax
+; AVX512BW-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512BW-NEXT: addq $-1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
+; AVX512BW-NEXT: movl $0, %eax
+; AVX512BW-NEXT: adcq $-1, %rax
+; AVX512BW-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512BW-NEXT: addq $-1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
+; AVX512BW-NEXT: movl $0, %r13d
+; AVX512BW-NEXT: adcq $-1, %r13
+; AVX512BW-NEXT: addq $-1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
+; AVX512BW-NEXT: movl $0, %r12d
+; AVX512BW-NEXT: adcq $-1, %r12
+; AVX512BW-NEXT: addq $-1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
+; AVX512BW-NEXT: movl $0, %eax
+; AVX512BW-NEXT: adcq $-1, %rax
+; AVX512BW-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512BW-NEXT: addq $-1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
+; AVX512BW-NEXT: movl $0, %eax
+; AVX512BW-NEXT: adcq $-1, %rax
+; AVX512BW-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512BW-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
+; AVX512BW-NEXT: addq $-1, %rcx
+; AVX512BW-NEXT: movl $0, %eax
+; AVX512BW-NEXT: adcq $-1, %rax
+; AVX512BW-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512BW-NEXT: addq $-1, %r14
+; AVX512BW-NEXT: movl $0, %eax
+; AVX512BW-NEXT: adcq $-1, %rax
+; AVX512BW-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512BW-NEXT: addq $-1, %r10
+; AVX512BW-NEXT: movl $0, %eax
+; AVX512BW-NEXT: adcq $-1, %rax
+; AVX512BW-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512BW-NEXT: addq $-1, %r9
+; AVX512BW-NEXT: movl $0, %edx
+; AVX512BW-NEXT: adcq $-1, %rdx
+; AVX512BW-NEXT: addq $-1, %r8
+; AVX512BW-NEXT: movl $0, %eax
+; AVX512BW-NEXT: adcq $-1, %rax
+; AVX512BW-NEXT: addq $-1, %rsi
+; AVX512BW-NEXT: movl $0, %ebp
+; AVX512BW-NEXT: adcq $-1, %rbp
+; AVX512BW-NEXT: shldq $63, %rsi, %rbp
+; AVX512BW-NEXT: movq %rbp, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512BW-NEXT: shldq $63, %r8, %rax
+; AVX512BW-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512BW-NEXT: shldq $63, %r9, %rdx
+; AVX512BW-NEXT: movq %rdx, %rbp
+; AVX512BW-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
+; AVX512BW-NEXT: shldq $63, %r10, %r8
+; AVX512BW-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload
+; AVX512BW-NEXT: shldq $63, %r14, %r10
+; AVX512BW-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload
+; AVX512BW-NEXT: shldq $63, %rcx, %r9
+; AVX512BW-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload
+; AVX512BW-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; AVX512BW-NEXT: shldq $63, %rax, %r14
+; AVX512BW-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; AVX512BW-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
+; AVX512BW-NEXT: shldq $63, %rax, %rsi
+; AVX512BW-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; AVX512BW-NEXT: shldq $63, %rax, %r12
+; AVX512BW-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; AVX512BW-NEXT: shldq $63, %rax, %r13
+; AVX512BW-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; AVX512BW-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
+; AVX512BW-NEXT: shldq $63, %rax, %rdx
+; AVX512BW-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
+; AVX512BW-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; AVX512BW-NEXT: shldq $63, %rax, %rcx
+; AVX512BW-NEXT: movq (%rsp), %rax # 8-byte Reload
+; AVX512BW-NEXT: shldq $63, %rax, %rdi
+; AVX512BW-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; AVX512BW-NEXT: shldq $63, %rax, %r11
+; AVX512BW-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; AVX512BW-NEXT: shldq $63, %rax, %rbx
+; AVX512BW-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; AVX512BW-NEXT: shldq $63, %rax, %r15
+; AVX512BW-NEXT: vmovq %r15, %xmm0
+; AVX512BW-NEXT: vmovq %rbx, %xmm1
+; AVX512BW-NEXT: vmovq %r11, %xmm2
+; AVX512BW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX512BW-NEXT: vmovq %rdi, %xmm1
+; AVX512BW-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1
+; AVX512BW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX512BW-NEXT: vpextrb $0, %xmm0, %eax
+; AVX512BW-NEXT: vmovd %eax, %xmm2
+; AVX512BW-NEXT: vpextrb $0, %xmm1, %eax
+; AVX512BW-NEXT: vpinsrb $1, %eax, %xmm2, %xmm1
+; AVX512BW-NEXT: vextracti32x4 $2, %zmm0, %xmm2
+; AVX512BW-NEXT: vpextrb $0, %xmm2, %eax
+; AVX512BW-NEXT: vpinsrb $2, %eax, %xmm1, %xmm1
+; AVX512BW-NEXT: vextracti32x4 $3, %zmm0, %xmm0
+; AVX512BW-NEXT: vpextrb $0, %xmm0, %eax
+; AVX512BW-NEXT: vpinsrb $3, %eax, %xmm1, %xmm0
+; AVX512BW-NEXT: vmovq %rcx, %xmm1
+; AVX512BW-NEXT: vmovq %rdx, %xmm2
+; AVX512BW-NEXT: vmovq %r13, %xmm3
+; AVX512BW-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
+; AVX512BW-NEXT: vmovq %r12, %xmm2
+; AVX512BW-NEXT: vinserti128 $1, %xmm2, %ymm3, %ymm2
+; AVX512BW-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm1
+; AVX512BW-NEXT: vpextrb $0, %xmm1, %eax
+; AVX512BW-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0
+; AVX512BW-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX512BW-NEXT: vpextrb $0, %xmm2, %eax
+; AVX512BW-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; AVX512BW-NEXT: vextracti32x4 $2, %zmm1, %xmm2
+; AVX512BW-NEXT: vpextrb $0, %xmm2, %eax
+; AVX512BW-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0
+; AVX512BW-NEXT: vextracti32x4 $3, %zmm1, %xmm1
+; AVX512BW-NEXT: vpextrb $0, %xmm1, %eax
+; AVX512BW-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; AVX512BW-NEXT: vmovq %rsi, %xmm1
+; AVX512BW-NEXT: vmovq %r14, %xmm2
+; AVX512BW-NEXT: vmovq %r9, %xmm3
+; AVX512BW-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
+; AVX512BW-NEXT: vmovq %r10, %xmm2
+; AVX512BW-NEXT: vinserti128 $1, %xmm2, %ymm3, %ymm2
+; AVX512BW-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm1
+; AVX512BW-NEXT: vpextrb $0, %xmm1, %eax
+; AVX512BW-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0
+; AVX512BW-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX512BW-NEXT: vpextrb $0, %xmm2, %eax
+; AVX512BW-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0
+; AVX512BW-NEXT: vextracti32x4 $2, %zmm1, %xmm2
+; AVX512BW-NEXT: vpextrb $0, %xmm2, %eax
+; AVX512BW-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0
+; AVX512BW-NEXT: vextracti32x4 $3, %zmm1, %xmm1
+; AVX512BW-NEXT: vpextrb $0, %xmm1, %eax
+; AVX512BW-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
+; AVX512BW-NEXT: vmovq %r8, %xmm1
+; AVX512BW-NEXT: vmovq %rbp, %xmm2
+; AVX512BW-NEXT: vmovq {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 8-byte Folded Reload
+; AVX512BW-NEXT: # xmm3 = mem[0],zero
+; AVX512BW-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
+; AVX512BW-NEXT: vmovq {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 8-byte Folded Reload
+; AVX512BW-NEXT: # xmm2 = mem[0],zero
+; AVX512BW-NEXT: vinserti128 $1, %xmm2, %ymm3, %ymm2
+; AVX512BW-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm1
+; AVX512BW-NEXT: vpextrb $0, %xmm1, %eax
+; AVX512BW-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0
+; AVX512BW-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX512BW-NEXT: vpextrb $0, %xmm2, %eax
+; AVX512BW-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0
+; AVX512BW-NEXT: vextracti32x4 $2, %zmm1, %xmm2
+; AVX512BW-NEXT: vpextrb $0, %xmm2, %eax
+; AVX512BW-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0
+; AVX512BW-NEXT: vextracti32x4 $3, %zmm1, %xmm1
+; AVX512BW-NEXT: vpextrb $0, %xmm1, %eax
+; AVX512BW-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; AVX512BW-NEXT: vmovdqu %xmm0, (%rax)
+; AVX512BW-NEXT: addq $24, %rsp
+; AVX512BW-NEXT: popq %rbx
+; AVX512BW-NEXT: popq %r12
+; AVX512BW-NEXT: popq %r13
+; AVX512BW-NEXT: popq %r14
+; AVX512BW-NEXT: popq %r15
+; AVX512BW-NEXT: popq %rbp
+; AVX512BW-NEXT: vzeroupper
+; AVX512BW-NEXT: retq
%1 = load <16 x i8>, <16 x i8>* %a
%2 = load <16 x i8>, <16 x i8>* %b
%3 = zext <16 x i8> %1 to <16 x i128>
; CHECK: # %bb.0:
; CHECK-NEXT: vcvttps2dq %ymm0, %ymm0
; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm1
-; CHECK-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: vpackssdw %xmm1, %xmm0, %xmm0
; CHECK-NEXT: vpackuswb %xmm0, %xmm0, %xmm0
; CHECK-NEXT: vmovq %xmm0, (%rdi)
; CHECK-NEXT: vzeroupper
; CHECK-LABEL: test1:
; CHECK: ## %bb.0:
; CHECK-NEXT: vcvttpd2dq %ymm0, %xmm0
+; CHECK-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u]
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retl
%c = fptoui <4 x double> %d to <4 x i8>
; CHECK-LABEL: test2:
; CHECK: ## %bb.0:
; CHECK-NEXT: vcvttpd2dq %ymm0, %xmm0
+; CHECK-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u]
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retl
%c = fptosi <4 x double> %d to <4 x i8>
define <8 x i32> @zext_8i8_8i32(<8 x i8> %A) nounwind {
; X32-LABEL: zext_8i8_8i32:
; X32: # %bb.0:
-; X32-NEXT: vpand {{\.LCPI.*}}, %xmm0, %xmm0
-; X32-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; X32-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
; X32-NEXT: retl
;
; X64-LABEL: zext_8i8_8i32:
; X64: # %bb.0:
-; X64-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
-; X64-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; X64-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
; X64-NEXT: retq
%B = zext <8 x i8> %A to <8 x i32>
 ret <8 x i32> %B
define <2 x i32> @masked_gather_v2i32(<2 x i32*>* %ptr, <2 x i1> %masks, <2 x i32> %passthro) {
; X86-LABEL: masked_gather_v2i32:
; X86: # %bb.0: # %entry
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
-; X86-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; X86-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero
; X86-NEXT: vpslld $31, %xmm0, %xmm0
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
; X86-NEXT: vpgatherdd %xmm0, (,%xmm2), %xmm1
-; X86-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero
+; X86-NEXT: vmovdqa %xmm1, %xmm0
; X86-NEXT: retl
;
; X64-LABEL: masked_gather_v2i32:
; X64: # %bb.0: # %entry
; X64-NEXT: vmovdqa (%rdi), %xmm2
-; X64-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; X64-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; X64-NEXT: vpslld $31, %xmm0, %xmm0
; X64-NEXT: vpgatherqd %xmm0, (,%xmm2), %xmm1
-; X64-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero
+; X64-NEXT: vmovdqa %xmm1, %xmm0
; X64-NEXT: retq
;
; NOGATHER-LABEL: masked_gather_v2i32:
; NOGATHER-NEXT: retq
; NOGATHER-NEXT: .LBB0_1: # %cond.load
; NOGATHER-NEXT: vmovq %xmm2, %rcx
-; NOGATHER-NEXT: movl (%rcx), %ecx
-; NOGATHER-NEXT: vpinsrq $0, %rcx, %xmm1, %xmm1
+; NOGATHER-NEXT: vpinsrd $0, (%rcx), %xmm1, %xmm1
; NOGATHER-NEXT: testb $2, %al
; NOGATHER-NEXT: je .LBB0_4
; NOGATHER-NEXT: .LBB0_3: # %cond.load1
; NOGATHER-NEXT: vpextrq $1, %xmm2, %rax
-; NOGATHER-NEXT: movl (%rax), %eax
-; NOGATHER-NEXT: vpinsrq $1, %rax, %xmm1, %xmm1
+; NOGATHER-NEXT: vpinsrd $1, (%rax), %xmm1, %xmm1
; NOGATHER-NEXT: vmovdqa %xmm1, %xmm0
; NOGATHER-NEXT: retq
entry:
define <4 x i32> @masked_gather_v2i32_concat(<2 x i32*>* %ptr, <2 x i1> %masks, <2 x i32> %passthro) {
; X86-LABEL: masked_gather_v2i32_concat:
; X86: # %bb.0: # %entry
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
-; X86-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; X86-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero
; X86-NEXT: vpslld $31, %xmm0, %xmm0
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
; X86-NEXT: vpgatherdd %xmm0, (,%xmm2), %xmm1
; X86-NEXT: vmovdqa %xmm1, %xmm0
; X86-NEXT: retl
; X64-LABEL: masked_gather_v2i32_concat:
; X64: # %bb.0: # %entry
; X64-NEXT: vmovdqa (%rdi), %xmm2
-; X64-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; X64-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; X64-NEXT: vpslld $31, %xmm0, %xmm0
; X64-NEXT: vpgatherqd %xmm0, (,%xmm2), %xmm1
; NOGATHER-NEXT: testb $2, %al
; NOGATHER-NEXT: jne .LBB1_3
; NOGATHER-NEXT: .LBB1_4: # %else2
-; NOGATHER-NEXT: vpshufd {{.*#+}} xmm0 = xmm1[0,2,2,3]
+; NOGATHER-NEXT: vmovdqa %xmm1, %xmm0
; NOGATHER-NEXT: retq
; NOGATHER-NEXT: .LBB1_1: # %cond.load
; NOGATHER-NEXT: vmovq %xmm2, %rcx
-; NOGATHER-NEXT: movl (%rcx), %ecx
-; NOGATHER-NEXT: vpinsrq $0, %rcx, %xmm1, %xmm1
+; NOGATHER-NEXT: vpinsrd $0, (%rcx), %xmm1, %xmm1
; NOGATHER-NEXT: testb $2, %al
; NOGATHER-NEXT: je .LBB1_4
; NOGATHER-NEXT: .LBB1_3: # %cond.load1
; NOGATHER-NEXT: vpextrq $1, %xmm2, %rax
-; NOGATHER-NEXT: movl (%rax), %eax
-; NOGATHER-NEXT: vpinsrq $1, %rax, %xmm1, %xmm1
-; NOGATHER-NEXT: vpshufd {{.*#+}} xmm0 = xmm1[0,2,2,3]
+; NOGATHER-NEXT: vpinsrd $1, (%rax), %xmm1, %xmm1
+; NOGATHER-NEXT: vmovdqa %xmm1, %xmm0
; NOGATHER-NEXT: retq
entry:
%ld = load <2 x i32*>, <2 x i32*>* %ptr
define <2 x i64> @masked_gather_v2i64(<2 x i64*>* %ptr, <2 x i1> %masks, <2 x i64> %passthro) {
; X86-LABEL: masked_gather_v2i64:
; X86: # %bb.0: # %entry
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: vpmovsxdq (%eax), %xmm2
; X86-NEXT: vpsllq $63, %xmm0, %xmm0
-; X86-NEXT: vpgatherqq %xmm0, (,%xmm2), %xmm1
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
+; X86-NEXT: vpgatherdq %xmm0, (,%xmm2), %xmm1
; X86-NEXT: vmovdqa %xmm1, %xmm0
; X86-NEXT: retl
;
define <2 x double> @masked_gather_v2double(<2 x double*>* %ptr, <2 x i1> %masks, <2 x double> %passthro) {
; X86-LABEL: masked_gather_v2double:
; X86: # %bb.0: # %entry
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: vpmovsxdq (%eax), %xmm2
; X86-NEXT: vpsllq $63, %xmm0, %xmm0
-; X86-NEXT: vgatherqpd %xmm0, (,%xmm2), %xmm1
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero
+; X86-NEXT: vgatherdpd %xmm0, (,%xmm2), %xmm1
; X86-NEXT: vmovapd %xmm1, %xmm0
; X86-NEXT: retl
;
define <8 x i8> @_e4(i8* %ptr) nounwind uwtable readnone ssp {
; X32-LABEL: _e4:
; X32: ## %bb.0:
-; X32-NEXT: vmovaps {{.*#+}} xmm0 = [52,52,52,52,52,52,52,52]
+; X32-NEXT: vmovaps {{.*#+}} xmm0 = <52,52,52,52,52,52,52,52,u,u,u,u,u,u,u,u>
; X32-NEXT: retl
;
; X64-LABEL: _e4:
; X64: ## %bb.0:
-; X64-NEXT: vmovaps {{.*#+}} xmm0 = [52,52,52,52,52,52,52,52]
+; X64-NEXT: vmovaps {{.*#+}} xmm0 = <52,52,52,52,52,52,52,52,u,u,u,u,u,u,u,u>
; X64-NEXT: retq
%vecinit0.i = insertelement <8 x i8> undef, i8 52, i32 0
%vecinit1.i = insertelement <8 x i8> %vecinit0.i, i8 52, i32 1
define void @any_extend_load_v8i64(<8 x i8> * %ptr) {
-; ALL-LABEL: any_extend_load_v8i64:
-; ALL: # %bb.0:
-; ALL-NEXT: vpmovzxbq {{.*#+}} zmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero,mem[2],zero,zero,zero,zero,zero,zero,zero,mem[3],zero,zero,zero,zero,zero,zero,zero,mem[4],zero,zero,zero,zero,zero,zero,zero,mem[5],zero,zero,zero,zero,zero,zero,zero,mem[6],zero,zero,zero,zero,zero,zero,zero,mem[7],zero,zero,zero,zero,zero,zero,zero
-; ALL-NEXT: vpaddq {{.*}}(%rip){1to8}, %zmm0, %zmm0
-; ALL-NEXT: vpmovqb %zmm0, (%rdi)
-; ALL-NEXT: vzeroupper
-; ALL-NEXT: retq
+; KNL-LABEL: any_extend_load_v8i64:
+; KNL: # %bb.0:
+; KNL-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
+; KNL-NEXT: vpmovzxbq {{.*#+}} ymm1 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero
+; KNL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
+; KNL-NEXT: vpmovzxbq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero
+; KNL-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; KNL-NEXT: vpaddq {{.*}}(%rip){1to8}, %zmm0, %zmm0
+; KNL-NEXT: vpmovqb %zmm0, (%rdi)
+; KNL-NEXT: vzeroupper
+; KNL-NEXT: retq
+;
+; SKX-LABEL: any_extend_load_v8i64:
+; SKX: # %bb.0:
+; SKX-NEXT: vpmovzxbq {{.*#+}} zmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero,mem[2],zero,zero,zero,zero,zero,zero,zero,mem[3],zero,zero,zero,zero,zero,zero,zero,mem[4],zero,zero,zero,zero,zero,zero,zero,mem[5],zero,zero,zero,zero,zero,zero,zero,mem[6],zero,zero,zero,zero,zero,zero,zero,mem[7],zero,zero,zero,zero,zero,zero,zero
+; SKX-NEXT: vpaddq {{.*}}(%rip){1to8}, %zmm0, %zmm0
+; SKX-NEXT: vpmovqb %zmm0, (%rdi)
+; SKX-NEXT: vzeroupper
+; SKX-NEXT: retq
%wide.load = load <8 x i8>, <8 x i8>* %ptr, align 1
%1 = zext <8 x i8> %wide.load to <8 x i64>
%2 = add nuw nsw <8 x i64> %1, <i64 4, i64 4, i64 4, i64 4, i64 4, i64 4, i64 4, i64 4>
define void @any_extend_load_v8i32(<8 x i8> * %ptr) {
; KNL-LABEL: any_extend_load_v8i32:
; KNL: # %bb.0:
-; KNL-NEXT: vpmovzxbw {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
-; KNL-NEXT: vpaddw {{.*}}(%rip), %xmm0, %xmm0
-; KNL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
+; KNL-NEXT: vpmovzxbd {{.*#+}} ymm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
+; KNL-NEXT: vpbroadcastd {{.*#+}} ymm1 = [4,4,4,4,4,4,4,4]
+; KNL-NEXT: vpaddd %ymm1, %ymm0, %ymm0
+; KNL-NEXT: vpmovdb %zmm0, %xmm0
; KNL-NEXT: vmovq %xmm0, (%rdi)
+; KNL-NEXT: vzeroupper
; KNL-NEXT: retq
;
; SKX-LABEL: any_extend_load_v8i32:
; NOVL-LABEL: f64to8uc:
; NOVL: # %bb.0:
; NOVL-NEXT: vcvttpd2dq %zmm0, %ymm0
-; NOVL-NEXT: vpmovdw %zmm0, %ymm0
-; NOVL-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
+; NOVL-NEXT: vpmovdb %zmm0, %xmm0
; NOVL-NEXT: vzeroupper
; NOVL-NEXT: retq
;
; VL-LABEL: f64to8uc:
; VL: # %bb.0:
; VL-NEXT: vcvttpd2dq %zmm0, %ymm0
-; VL-NEXT: vpmovdw %ymm0, %xmm0
+; VL-NEXT: vpmovdb %ymm0, %xmm0
; VL-NEXT: vzeroupper
; VL-NEXT: retq
%res = fptoui <8 x double> %f to <8 x i8>
; NOVL-LABEL: f64to8sc:
; NOVL: # %bb.0:
; NOVL-NEXT: vcvttpd2dq %zmm0, %ymm0
-; NOVL-NEXT: vpmovdw %zmm0, %ymm0
-; NOVL-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
+; NOVL-NEXT: vpmovdb %zmm0, %xmm0
; NOVL-NEXT: vzeroupper
; NOVL-NEXT: retq
;
; VL-LABEL: f64to8sc:
; VL: # %bb.0:
; VL-NEXT: vcvttpd2dq %zmm0, %ymm0
-; VL-NEXT: vpmovdw %ymm0, %xmm0
+; VL-NEXT: vpmovdb %ymm0, %xmm0
; VL-NEXT: vzeroupper
; VL-NEXT: retq
%res = fptosi <8 x double> %f to <8 x i8>
define <8 x double> @scto8f64(<8 x i8> %a) {
; ALL-LABEL: scto8f64:
; ALL: # %bb.0:
-; ALL-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; ALL-NEXT: vpslld $24, %ymm0, %ymm0
-; ALL-NEXT: vpsrad $24, %ymm0, %ymm0
+; ALL-NEXT: vpmovsxbd %xmm0, %ymm0
; ALL-NEXT: vcvtdq2pd %ymm0, %zmm0
; ALL-NEXT: retq
%1 = sitofp <8 x i8> %a to <8 x double>
}
define <2 x double> @sbto2f64(<2 x double> %a) {
-; ALL-LABEL: sbto2f64:
-; ALL: # %bb.0:
-; ALL-NEXT: vxorpd %xmm1, %xmm1, %xmm1
-; ALL-NEXT: vcmpltpd %xmm0, %xmm1, %xmm0
-; ALL-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; ALL-NEXT: vcvtdq2pd %xmm0, %xmm0
-; ALL-NEXT: retq
+; NOVL-LABEL: sbto2f64:
+; NOVL: # %bb.0:
+; NOVL-NEXT: vxorpd %xmm1, %xmm1, %xmm1
+; NOVL-NEXT: vcmpltpd %xmm0, %xmm1, %xmm0
+; NOVL-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; NOVL-NEXT: vcvtdq2pd %xmm0, %xmm0
+; NOVL-NEXT: retq
+;
+; VLDQ-LABEL: sbto2f64:
+; VLDQ: # %bb.0:
+; VLDQ-NEXT: vxorpd %xmm1, %xmm1, %xmm1
+; VLDQ-NEXT: vcmpltpd %xmm0, %xmm1, %k0
+; VLDQ-NEXT: vpmovm2d %k0, %xmm0
+; VLDQ-NEXT: vcvtdq2pd %xmm0, %xmm0
+; VLDQ-NEXT: retq
+;
+; VLNODQ-LABEL: sbto2f64:
+; VLNODQ: # %bb.0:
+; VLNODQ-NEXT: vxorpd %xmm1, %xmm1, %xmm1
+; VLNODQ-NEXT: vcmpltpd %xmm0, %xmm1, %k1
+; VLNODQ-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
+; VLNODQ-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z}
+; VLNODQ-NEXT: vcvtdq2pd %xmm0, %xmm0
+; VLNODQ-NEXT: retq
%cmpres = fcmp ogt <2 x double> %a, zeroinitializer
%1 = sitofp <2 x i1> %cmpres to <2 x double>
ret <2 x double> %1
define <8 x double> @ucto8f64(<8 x i8> %a) {
; ALL-LABEL: ucto8f64:
; ALL: # %bb.0:
-; ALL-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
-; ALL-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; ALL-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
; ALL-NEXT: vcvtdq2pd %ymm0, %zmm0
; ALL-NEXT: retq
%b = uitofp <8 x i8> %a to <8 x double>
}
define <2 x float> @ubto2f32(<2 x i32> %a) {
-; ALL-LABEL: ubto2f32:
-; ALL: # %bb.0:
-; ALL-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; ALL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
-; ALL-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0
-; ALL-NEXT: vpandn {{.*}}(%rip), %xmm0, %xmm0
-; ALL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; ALL-NEXT: retq
+; NOVL-LABEL: ubto2f32:
+; NOVL: # %bb.0:
+; NOVL-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; NOVL-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
+; NOVL-NEXT: vpbroadcastd {{.*#+}} xmm1 = [1065353216,1065353216,1065353216,1065353216]
+; NOVL-NEXT: vpandn %xmm1, %xmm0, %xmm0
+; NOVL-NEXT: retq
+;
+; VL-LABEL: ubto2f32:
+; VL: # %bb.0:
+; VL-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; VL-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
+; VL-NEXT: vpandnd {{.*}}(%rip){1to4}, %xmm0, %xmm0
+; VL-NEXT: retq
%mask = icmp ne <2 x i32> %a, zeroinitializer
%1 = uitofp <2 x i1> %mask to <2 x float>
ret <2 x float> %1
}
define <2 x double> @ubto2f64(<2 x i32> %a) {
-; ALL-LABEL: ubto2f64:
-; ALL: # %bb.0:
-; ALL-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; ALL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
-; ALL-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0
-; ALL-NEXT: vpandn {{.*}}(%rip), %xmm0, %xmm0
-; ALL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; ALL-NEXT: vcvtdq2pd %xmm0, %xmm0
-; ALL-NEXT: retq
+; NOVL-LABEL: ubto2f64:
+; NOVL: # %bb.0:
+; NOVL-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; NOVL-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
+; NOVL-NEXT: vpbroadcastd {{.*#+}} xmm1 = [1,1,1,1]
+; NOVL-NEXT: vpandn %xmm1, %xmm0, %xmm0
+; NOVL-NEXT: vcvtdq2pd %xmm0, %xmm0
+; NOVL-NEXT: retq
+;
+; VL-LABEL: ubto2f64:
+; VL: # %bb.0:
+; VL-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; VL-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
+; VL-NEXT: vpandnd {{.*}}(%rip){1to4}, %xmm0, %xmm0
+; VL-NEXT: vcvtdq2pd %xmm0, %xmm0
+; VL-NEXT: retq
%mask = icmp ne <2 x i32> %a, zeroinitializer
%1 = uitofp <2 x i1> %mask to <2 x double>
ret <2 x double> %1
}
define <4 x i32> @zext_4xi1_to_4x32(<4 x i8> %x, <4 x i8> %y) #0 {
-; ALL-LABEL: zext_4xi1_to_4x32:
-; ALL: # %bb.0:
-; ALL-NEXT: vpbroadcastd {{.*#+}} xmm2 = [255,255,255,255]
-; ALL-NEXT: vpand %xmm2, %xmm1, %xmm1
-; ALL-NEXT: vpand %xmm2, %xmm0, %xmm0
-; ALL-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
-; ALL-NEXT: vpsrld $31, %xmm0, %xmm0
-; ALL-NEXT: retq
+; KNL-LABEL: zext_4xi1_to_4x32:
+; KNL: # %bb.0:
+; KNL-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm0
+; KNL-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; KNL-NEXT: vpbroadcastd {{.*#+}} xmm1 = [1,1,1,1]
+; KNL-NEXT: vpand %xmm1, %xmm0, %xmm0
+; KNL-NEXT: retq
+;
+; SKX-LABEL: zext_4xi1_to_4x32:
+; SKX: # %bb.0:
+; SKX-NEXT: vpcmpeqb %xmm1, %xmm0, %k0
+; SKX-NEXT: vpmovm2d %k0, %xmm0
+; SKX-NEXT: vpsrld $31, %xmm0, %xmm0
+; SKX-NEXT: retq
+;
+; AVX512DQNOBW-LABEL: zext_4xi1_to_4x32:
+; AVX512DQNOBW: # %bb.0:
+; AVX512DQNOBW-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm0
+; AVX512DQNOBW-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; AVX512DQNOBW-NEXT: vpandd {{.*}}(%rip){1to4}, %xmm0, %xmm0
+; AVX512DQNOBW-NEXT: retq
%mask = icmp eq <4 x i8> %x, %y
%1 = zext <4 x i1> %mask to <4 x i32>
ret <4 x i32> %1
}
define <2 x i64> @zext_2xi1_to_2xi64(<2 x i8> %x, <2 x i8> %y) #0 {
-; ALL-LABEL: zext_2xi1_to_2xi64:
-; ALL: # %bb.0:
-; ALL-NEXT: vpbroadcastq {{.*#+}} xmm2 = [255,255]
-; ALL-NEXT: vpand %xmm2, %xmm1, %xmm1
-; ALL-NEXT: vpand %xmm2, %xmm0, %xmm0
-; ALL-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0
-; ALL-NEXT: vpsrlq $63, %xmm0, %xmm0
-; ALL-NEXT: retq
+; KNL-LABEL: zext_2xi1_to_2xi64:
+; KNL: # %bb.0:
+; KNL-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm0
+; KNL-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
+; KNL-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
+; KNL-NEXT: retq
+;
+; SKX-LABEL: zext_2xi1_to_2xi64:
+; SKX: # %bb.0:
+; SKX-NEXT: vpcmpeqb %xmm1, %xmm0, %k0
+; SKX-NEXT: vpmovm2q %k0, %xmm0
+; SKX-NEXT: vpsrlq $63, %xmm0, %xmm0
+; SKX-NEXT: retq
+;
+; AVX512DQNOBW-LABEL: zext_2xi1_to_2xi64:
+; AVX512DQNOBW: # %bb.0:
+; AVX512DQNOBW-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm0
+; AVX512DQNOBW-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
+; AVX512DQNOBW-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
+; AVX512DQNOBW-NEXT: retq
%mask = icmp eq <2 x i8> %x, %y
%1 = zext <2 x i1> %mask to <2 x i64>
ret <2 x i64> %1
; CHECK-NEXT: vpcmpgtq %zmm1, %zmm0, %k5 ## encoding: [0x62,0xf2,0xfd,0x48,0x37,0xe9]
; CHECK-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0]
; CHECK-NEXT: vpxor %xmm0, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xef,0xc0]
-; CHECK-NEXT: vpinsrw $0, %eax, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xc4,0xc0,0x00]
+; CHECK-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x00]
; CHECK-NEXT: kmovw %k1, %eax ## encoding: [0xc5,0xf8,0x93,0xc1]
-; CHECK-NEXT: vpinsrw $1, %eax, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xc4,0xc0,0x01]
+; CHECK-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x01]
; CHECK-NEXT: kmovw %k2, %eax ## encoding: [0xc5,0xf8,0x93,0xc2]
-; CHECK-NEXT: vpinsrw $2, %eax, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xc4,0xc0,0x02]
+; CHECK-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x02]
; CHECK-NEXT: kmovw %k3, %eax ## encoding: [0xc5,0xf8,0x93,0xc3]
-; CHECK-NEXT: vpinsrw $4, %eax, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xc4,0xc0,0x04]
+; CHECK-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x04]
; CHECK-NEXT: kmovw %k4, %eax ## encoding: [0xc5,0xf8,0x93,0xc4]
-; CHECK-NEXT: vpinsrw $5, %eax, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xc4,0xc0,0x05]
+; CHECK-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x05]
; CHECK-NEXT: kmovw %k5, %eax ## encoding: [0xc5,0xf8,0x93,0xc5]
-; CHECK-NEXT: vpinsrw $6, %eax, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xc4,0xc0,0x06]
+; CHECK-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x06]
; CHECK-NEXT: movl $255, %eax ## encoding: [0xb8,0xff,0x00,0x00,0x00]
-; CHECK-NEXT: vpinsrw $7, %eax, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xc4,0xc0,0x07]
+; CHECK-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x07]
; CHECK-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77]
; CHECK-NEXT: ret{{[l|q]}} ## encoding: [0xc3]
%res0 = call i8 @llvm.x86.avx512.mask.cmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 0, i8 -1)
define <8 x i8> @test_mask_cmp_q_512(<8 x i64> %a0, <8 x i64> %a1, i8 %mask) {
; X86-LABEL: test_mask_cmp_q_512:
; X86: ## %bb.0:
-; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax ## encoding: [0x0f,0xb7,0x44,0x24,0x04]
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
; X86-NEXT: kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8]
; X86-NEXT: vpcmpeqq %zmm1, %zmm0, %k0 {%k1} ## encoding: [0x62,0xf2,0xfd,0x49,0x29,0xc1]
; X86-NEXT: vpcmpgtq %zmm0, %zmm1, %k2 {%k1} ## encoding: [0x62,0xf2,0xf5,0x49,0x37,0xd0]
; X86-NEXT: vpcmpgtq %zmm1, %zmm0, %k1 {%k1} ## encoding: [0x62,0xf2,0xfd,0x49,0x37,0xc9]
; X86-NEXT: kmovw %k0, %ecx ## encoding: [0xc5,0xf8,0x93,0xc8]
; X86-NEXT: vpxor %xmm0, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xef,0xc0]
-; X86-NEXT: vpinsrw $0, %ecx, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xc4,0xc1,0x00]
+; X86-NEXT: vpinsrb $0, %ecx, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc1,0x00]
; X86-NEXT: kmovw %k2, %ecx ## encoding: [0xc5,0xf8,0x93,0xca]
-; X86-NEXT: vpinsrw $1, %ecx, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xc4,0xc1,0x01]
+; X86-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc1,0x01]
; X86-NEXT: kmovw %k3, %ecx ## encoding: [0xc5,0xf8,0x93,0xcb]
-; X86-NEXT: vpinsrw $2, %ecx, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xc4,0xc1,0x02]
+; X86-NEXT: vpinsrb $2, %ecx, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc1,0x02]
; X86-NEXT: kmovw %k4, %ecx ## encoding: [0xc5,0xf8,0x93,0xcc]
-; X86-NEXT: vpinsrw $4, %ecx, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xc4,0xc1,0x04]
+; X86-NEXT: vpinsrb $4, %ecx, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc1,0x04]
; X86-NEXT: kmovw %k5, %ecx ## encoding: [0xc5,0xf8,0x93,0xcd]
-; X86-NEXT: vpinsrw $5, %ecx, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xc4,0xc1,0x05]
+; X86-NEXT: vpinsrb $5, %ecx, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc1,0x05]
; X86-NEXT: kmovw %k1, %ecx ## encoding: [0xc5,0xf8,0x93,0xc9]
-; X86-NEXT: vpinsrw $6, %ecx, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xc4,0xc1,0x06]
-; X86-NEXT: vpinsrw $7, %eax, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xc4,0xc0,0x07]
+; X86-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc1,0x06]
+; X86-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x07]
; X86-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77]
; X86-NEXT: retl ## encoding: [0xc3]
;
; X64-NEXT: vpcmpgtq %zmm1, %zmm0, %k1 {%k1} ## encoding: [0x62,0xf2,0xfd,0x49,0x37,0xc9]
; X64-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0]
; X64-NEXT: vpxor %xmm0, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xef,0xc0]
-; X64-NEXT: vpinsrw $0, %eax, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xc4,0xc0,0x00]
+; X64-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x00]
; X64-NEXT: kmovw %k2, %eax ## encoding: [0xc5,0xf8,0x93,0xc2]
-; X64-NEXT: vpinsrw $1, %eax, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xc4,0xc0,0x01]
+; X64-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x01]
; X64-NEXT: kmovw %k3, %eax ## encoding: [0xc5,0xf8,0x93,0xc3]
-; X64-NEXT: vpinsrw $2, %eax, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xc4,0xc0,0x02]
+; X64-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x02]
; X64-NEXT: kmovw %k4, %eax ## encoding: [0xc5,0xf8,0x93,0xc4]
-; X64-NEXT: vpinsrw $4, %eax, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xc4,0xc0,0x04]
+; X64-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x04]
; X64-NEXT: kmovw %k5, %eax ## encoding: [0xc5,0xf8,0x93,0xc5]
-; X64-NEXT: vpinsrw $5, %eax, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xc4,0xc0,0x05]
+; X64-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x05]
; X64-NEXT: kmovw %k1, %eax ## encoding: [0xc5,0xf8,0x93,0xc1]
-; X64-NEXT: vpinsrw $6, %eax, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xc4,0xc0,0x06]
-; X64-NEXT: vpinsrw $7, %edi, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xc4,0xc7,0x07]
+; X64-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x06]
+; X64-NEXT: vpinsrb $7, %edi, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc7,0x07]
; X64-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77]
; X64-NEXT: retq ## encoding: [0xc3]
%res0 = call i8 @llvm.x86.avx512.mask.cmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 0, i8 %mask)
; CHECK-NEXT: vpcmpnleuq %zmm1, %zmm0, %k5 ## encoding: [0x62,0xf3,0xfd,0x48,0x1e,0xe9,0x06]
; CHECK-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0]
; CHECK-NEXT: vpxor %xmm0, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xef,0xc0]
-; CHECK-NEXT: vpinsrw $0, %eax, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xc4,0xc0,0x00]
+; CHECK-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x00]
; CHECK-NEXT: kmovw %k1, %eax ## encoding: [0xc5,0xf8,0x93,0xc1]
-; CHECK-NEXT: vpinsrw $1, %eax, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xc4,0xc0,0x01]
+; CHECK-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x01]
; CHECK-NEXT: kmovw %k2, %eax ## encoding: [0xc5,0xf8,0x93,0xc2]
-; CHECK-NEXT: vpinsrw $2, %eax, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xc4,0xc0,0x02]
+; CHECK-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x02]
; CHECK-NEXT: kmovw %k3, %eax ## encoding: [0xc5,0xf8,0x93,0xc3]
-; CHECK-NEXT: vpinsrw $4, %eax, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xc4,0xc0,0x04]
+; CHECK-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x04]
; CHECK-NEXT: kmovw %k4, %eax ## encoding: [0xc5,0xf8,0x93,0xc4]
-; CHECK-NEXT: vpinsrw $5, %eax, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xc4,0xc0,0x05]
+; CHECK-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x05]
; CHECK-NEXT: kmovw %k5, %eax ## encoding: [0xc5,0xf8,0x93,0xc5]
-; CHECK-NEXT: vpinsrw $6, %eax, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xc4,0xc0,0x06]
+; CHECK-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x06]
; CHECK-NEXT: movl $255, %eax ## encoding: [0xb8,0xff,0x00,0x00,0x00]
-; CHECK-NEXT: vpinsrw $7, %eax, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xc4,0xc0,0x07]
+; CHECK-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x07]
; CHECK-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77]
; CHECK-NEXT: ret{{[l|q]}} ## encoding: [0xc3]
%res0 = call i8 @llvm.x86.avx512.mask.ucmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 0, i8 -1)
define <8 x i8> @test_mask_ucmp_q_512(<8 x i64> %a0, <8 x i64> %a1, i8 %mask) {
; X86-LABEL: test_mask_ucmp_q_512:
; X86: ## %bb.0:
-; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax ## encoding: [0x0f,0xb7,0x44,0x24,0x04]
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
; X86-NEXT: kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8]
; X86-NEXT: vpcmpeqq %zmm1, %zmm0, %k0 {%k1} ## encoding: [0x62,0xf2,0xfd,0x49,0x29,0xc1]
; X86-NEXT: vpcmpltuq %zmm1, %zmm0, %k2 {%k1} ## encoding: [0x62,0xf3,0xfd,0x49,0x1e,0xd1,0x01]
; X86-NEXT: vpcmpnleuq %zmm1, %zmm0, %k1 {%k1} ## encoding: [0x62,0xf3,0xfd,0x49,0x1e,0xc9,0x06]
; X86-NEXT: kmovw %k0, %ecx ## encoding: [0xc5,0xf8,0x93,0xc8]
; X86-NEXT: vpxor %xmm0, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xef,0xc0]
-; X86-NEXT: vpinsrw $0, %ecx, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xc4,0xc1,0x00]
+; X86-NEXT: vpinsrb $0, %ecx, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc1,0x00]
; X86-NEXT: kmovw %k2, %ecx ## encoding: [0xc5,0xf8,0x93,0xca]
-; X86-NEXT: vpinsrw $1, %ecx, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xc4,0xc1,0x01]
+; X86-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc1,0x01]
; X86-NEXT: kmovw %k3, %ecx ## encoding: [0xc5,0xf8,0x93,0xcb]
-; X86-NEXT: vpinsrw $2, %ecx, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xc4,0xc1,0x02]
+; X86-NEXT: vpinsrb $2, %ecx, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc1,0x02]
; X86-NEXT: kmovw %k4, %ecx ## encoding: [0xc5,0xf8,0x93,0xcc]
-; X86-NEXT: vpinsrw $4, %ecx, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xc4,0xc1,0x04]
+; X86-NEXT: vpinsrb $4, %ecx, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc1,0x04]
; X86-NEXT: kmovw %k5, %ecx ## encoding: [0xc5,0xf8,0x93,0xcd]
-; X86-NEXT: vpinsrw $5, %ecx, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xc4,0xc1,0x05]
+; X86-NEXT: vpinsrb $5, %ecx, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc1,0x05]
; X86-NEXT: kmovw %k1, %ecx ## encoding: [0xc5,0xf8,0x93,0xc9]
-; X86-NEXT: vpinsrw $6, %ecx, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xc4,0xc1,0x06]
-; X86-NEXT: vpinsrw $7, %eax, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xc4,0xc0,0x07]
+; X86-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc1,0x06]
+; X86-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x07]
; X86-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77]
; X86-NEXT: retl ## encoding: [0xc3]
;
; X64-NEXT: vpcmpnleuq %zmm1, %zmm0, %k1 {%k1} ## encoding: [0x62,0xf3,0xfd,0x49,0x1e,0xc9,0x06]
; X64-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0]
; X64-NEXT: vpxor %xmm0, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xef,0xc0]
-; X64-NEXT: vpinsrw $0, %eax, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xc4,0xc0,0x00]
+; X64-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x00]
; X64-NEXT: kmovw %k2, %eax ## encoding: [0xc5,0xf8,0x93,0xc2]
-; X64-NEXT: vpinsrw $1, %eax, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xc4,0xc0,0x01]
+; X64-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x01]
; X64-NEXT: kmovw %k3, %eax ## encoding: [0xc5,0xf8,0x93,0xc3]
-; X64-NEXT: vpinsrw $2, %eax, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xc4,0xc0,0x02]
+; X64-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x02]
; X64-NEXT: kmovw %k4, %eax ## encoding: [0xc5,0xf8,0x93,0xc4]
-; X64-NEXT: vpinsrw $4, %eax, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xc4,0xc0,0x04]
+; X64-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x04]
; X64-NEXT: kmovw %k5, %eax ## encoding: [0xc5,0xf8,0x93,0xc5]
-; X64-NEXT: vpinsrw $5, %eax, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xc4,0xc0,0x05]
+; X64-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x05]
; X64-NEXT: kmovw %k1, %eax ## encoding: [0xc5,0xf8,0x93,0xc1]
-; X64-NEXT: vpinsrw $6, %eax, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xc4,0xc0,0x06]
-; X64-NEXT: vpinsrw $7, %edi, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xc4,0xc7,0x07]
+; X64-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x06]
+; X64-NEXT: vpinsrb $7, %edi, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc7,0x07]
; X64-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77]
; X64-NEXT: retq ## encoding: [0xc3]
%res0 = call i8 @llvm.x86.avx512.mask.ucmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 0, i8 %mask)
; KNL-LABEL: load_2i1:
; KNL: ## %bb.0:
; KNL-NEXT: kmovw (%rdi), %k1
-; KNL-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; KNL-NEXT: ## kill: def $xmm0 killed $xmm0 killed $zmm0
+; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; KNL-NEXT: vpmovdw %zmm0, %ymm0
+; KNL-NEXT: ## kill: def $xmm0 killed $xmm0 killed $ymm0
; KNL-NEXT: vzeroupper
; KNL-NEXT: retq
;
; SKX-LABEL: load_2i1:
; SKX: ## %bb.0:
; SKX-NEXT: kmovb (%rdi), %k0
-; SKX-NEXT: vpmovm2q %k0, %xmm0
+; SKX-NEXT: vpmovm2w %k0, %xmm0
; SKX-NEXT: retq
;
; AVX512BW-LABEL: load_2i1:
; AVX512BW: ## %bb.0:
-; AVX512BW-NEXT: kmovw (%rdi), %k1
-; AVX512BW-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; AVX512BW-NEXT: kmovw (%rdi), %k0
+; AVX512BW-NEXT: vpmovm2w %k0, %zmm0
; AVX512BW-NEXT: ## kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
; AVX512DQ-LABEL: load_2i1:
; AVX512DQ: ## %bb.0:
; AVX512DQ-NEXT: kmovb (%rdi), %k0
-; AVX512DQ-NEXT: vpmovm2q %k0, %zmm0
-; AVX512DQ-NEXT: ## kill: def $xmm0 killed $xmm0 killed $zmm0
+; AVX512DQ-NEXT: vpmovm2d %k0, %zmm0
+; AVX512DQ-NEXT: vpmovdw %zmm0, %ymm0
+; AVX512DQ-NEXT: ## kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
; X86: ## %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovb (%eax), %k0
-; X86-NEXT: vpmovm2q %k0, %xmm0
+; X86-NEXT: vpmovm2w %k0, %xmm0
; X86-NEXT: retl
%b = load <2 x i1>, <2 x i1>* %a
%c = sext <2 x i1> %b to <2 x i16>
; KNL: ## %bb.0:
; KNL-NEXT: kmovw (%rdi), %k1
; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; KNL-NEXT: ## kill: def $xmm0 killed $xmm0 killed $zmm0
+; KNL-NEXT: vpmovdw %zmm0, %ymm0
+; KNL-NEXT: ## kill: def $xmm0 killed $xmm0 killed $ymm0
; KNL-NEXT: vzeroupper
; KNL-NEXT: retq
;
; SKX-LABEL: load_4i1:
; SKX: ## %bb.0:
; SKX-NEXT: kmovb (%rdi), %k0
-; SKX-NEXT: vpmovm2d %k0, %xmm0
+; SKX-NEXT: vpmovm2w %k0, %xmm0
; SKX-NEXT: retq
;
; AVX512BW-LABEL: load_4i1:
; AVX512BW: ## %bb.0:
-; AVX512BW-NEXT: kmovw (%rdi), %k1
-; AVX512BW-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; AVX512BW-NEXT: kmovw (%rdi), %k0
+; AVX512BW-NEXT: vpmovm2w %k0, %zmm0
; AVX512BW-NEXT: ## kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
; AVX512DQ: ## %bb.0:
; AVX512DQ-NEXT: kmovb (%rdi), %k0
; AVX512DQ-NEXT: vpmovm2d %k0, %zmm0
-; AVX512DQ-NEXT: ## kill: def $xmm0 killed $xmm0 killed $zmm0
+; AVX512DQ-NEXT: vpmovdw %zmm0, %ymm0
+; AVX512DQ-NEXT: ## kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
; X86: ## %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovb (%eax), %k0
-; X86-NEXT: vpmovm2d %k0, %xmm0
+; X86-NEXT: vpmovm2w %k0, %xmm0
; X86-NEXT: retl
%b = load <4 x i1>, <4 x i1>* %a
%c = sext <4 x i1> %b to <4 x i16>
define <8 x i8> @trunc_qb_512(<8 x i64> %i) #0 {
; ALL-LABEL: trunc_qb_512:
; ALL: ## %bb.0:
-; ALL-NEXT: vpmovqw %zmm0, %xmm0
+; ALL-NEXT: vpmovqb %zmm0, %xmm0
; ALL-NEXT: vzeroupper
; ALL-NEXT: retq
%x = trunc <8 x i64> %i to <8 x i8>
; KNL-LABEL: trunc_qb_256:
; KNL: ## %bb.0:
; KNL-NEXT: ## kill: def $ymm0 killed $ymm0 def $zmm0
-; KNL-NEXT: vpmovqd %zmm0, %ymm0
-; KNL-NEXT: ## kill: def $xmm0 killed $xmm0 killed $ymm0
+; KNL-NEXT: vpmovqb %zmm0, %xmm0
; KNL-NEXT: vzeroupper
; KNL-NEXT: retq
;
; SKX-LABEL: trunc_qb_256:
; SKX: ## %bb.0:
-; SKX-NEXT: vpmovqd %ymm0, %xmm0
+; SKX-NEXT: vpmovqb %ymm0, %xmm0
; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
%x = trunc <4 x i64> %i to <4 x i8>
; KNL-LABEL: trunc_qb_256_mem:
; KNL: ## %bb.0:
; KNL-NEXT: ## kill: def $ymm0 killed $ymm0 def $zmm0
-; KNL-NEXT: vpmovqd %zmm0, %ymm0
-; KNL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u]
+; KNL-NEXT: vpmovqb %zmm0, %xmm0
; KNL-NEXT: vmovd %xmm0, (%rdi)
; KNL-NEXT: vzeroupper
; KNL-NEXT: retq
define <2 x i8> @trunc_qb_128(<2 x i64> %i) #0 {
; ALL-LABEL: trunc_qb_128:
; ALL: ## %bb.0:
+; ALL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
; ALL-NEXT: retq
%x = trunc <2 x i64> %i to <2 x i8>
ret <2 x i8> %x
; KNL-LABEL: trunc_qw_256:
; KNL: ## %bb.0:
; KNL-NEXT: ## kill: def $ymm0 killed $ymm0 def $zmm0
-; KNL-NEXT: vpmovqd %zmm0, %ymm0
-; KNL-NEXT: ## kill: def $xmm0 killed $xmm0 killed $ymm0
+; KNL-NEXT: vpmovqw %zmm0, %xmm0
; KNL-NEXT: vzeroupper
; KNL-NEXT: retq
;
; SKX-LABEL: trunc_qw_256:
; SKX: ## %bb.0:
-; SKX-NEXT: vpmovqd %ymm0, %xmm0
+; SKX-NEXT: vpmovqw %ymm0, %xmm0
; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
%x = trunc <4 x i64> %i to <4 x i16>
; KNL-LABEL: trunc_qw_256_mem:
; KNL: ## %bb.0:
; KNL-NEXT: ## kill: def $ymm0 killed $ymm0 def $zmm0
-; KNL-NEXT: vpmovqd %zmm0, %ymm0
-; KNL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; KNL-NEXT: vpmovqw %zmm0, %xmm0
; KNL-NEXT: vmovq %xmm0, (%rdi)
; KNL-NEXT: vzeroupper
; KNL-NEXT: retq
}
define <2 x i16> @trunc_qw_128(<2 x i64> %i) #0 {
-; ALL-LABEL: trunc_qw_128:
-; ALL: ## %bb.0:
-; ALL-NEXT: retq
+; KNL-LABEL: trunc_qw_128:
+; KNL: ## %bb.0:
+; KNL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; KNL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
+; KNL-NEXT: retq
+;
+; SKX-LABEL: trunc_qw_128:
+; SKX: ## %bb.0:
+; SKX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,8,9,8,9,10,11,8,9,10,11,12,13,14,15]
+; SKX-NEXT: retq
%x = trunc <2 x i64> %i to <2 x i16>
ret <2 x i16> %x
}
define <2 x i32> @trunc_qd_128(<2 x i64> %i) #0 {
; ALL-LABEL: trunc_qd_128:
; ALL: ## %bb.0:
+; ALL-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,2,3]
; ALL-NEXT: retq
%x = trunc <2 x i64> %i to <2 x i32>
ret <2 x i32> %x
; KNL-LABEL: trunc_db_256:
; KNL: ## %bb.0:
; KNL-NEXT: ## kill: def $ymm0 killed $ymm0 def $zmm0
-; KNL-NEXT: vpmovdw %zmm0, %ymm0
-; KNL-NEXT: ## kill: def $xmm0 killed $xmm0 killed $ymm0
+; KNL-NEXT: vpmovdb %zmm0, %xmm0
; KNL-NEXT: vzeroupper
; KNL-NEXT: retq
;
; SKX-LABEL: trunc_db_256:
; SKX: ## %bb.0:
-; SKX-NEXT: vpmovdw %ymm0, %xmm0
+; SKX-NEXT: vpmovdb %ymm0, %xmm0
; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
%x = trunc <8 x i32> %i to <8 x i8>
; KNL-LABEL: trunc_db_256_mem:
; KNL: ## %bb.0:
; KNL-NEXT: ## kill: def $ymm0 killed $ymm0 def $zmm0
-; KNL-NEXT: vpmovdw %zmm0, %ymm0
-; KNL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
+; KNL-NEXT: vpmovdb %zmm0, %xmm0
; KNL-NEXT: vmovq %xmm0, (%rdi)
; KNL-NEXT: vzeroupper
; KNL-NEXT: retq
define <4 x i8> @trunc_db_128(<4 x i32> %i) #0 {
; ALL-LABEL: trunc_db_128:
; ALL: ## %bb.0:
+; ALL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u]
; ALL-NEXT: retq
%x = trunc <4 x i32> %i to <4 x i8>
ret <4 x i8> %x
define <8 x i8> @trunc_wb_128(<8 x i16> %i) #0 {
; ALL-LABEL: trunc_wb_128:
; ALL: ## %bb.0:
+; ALL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
; ALL-NEXT: retq
%x = trunc <8 x i16> %i to <8 x i8>
ret <8 x i8> %x
; ALL-LABEL: usat_trunc_wb_128:
; ALL: ## %bb.0:
; ALL-NEXT: vpminuw {{.*}}(%rip), %xmm0, %xmm0
+; ALL-NEXT: vpackuswb %xmm0, %xmm0, %xmm0
; ALL-NEXT: retq
%x3 = icmp ult <8 x i16> %i, <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255>
%x5 = select <8 x i1> %x3, <8 x i16> %i, <8 x i16> <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255>
; KNL: ## %bb.0:
; KNL-NEXT: vpbroadcastd {{.*#+}} ymm1 = [255,255,255,255,255,255,255,255]
; KNL-NEXT: vpminud %ymm1, %ymm0, %ymm0
-; KNL-NEXT: vpmovdw %zmm0, %ymm0
-; KNL-NEXT: vpackuswb %xmm0, %xmm0, %xmm0
+; KNL-NEXT: vpmovdb %zmm0, %xmm0
; KNL-NEXT: vzeroupper
; KNL-NEXT: retq
;
; SKX-LABEL: usat_trunc_db_256:
; SKX: ## %bb.0:
; SKX-NEXT: vpminud {{.*}}(%rip){1to8}, %ymm0, %ymm0
-; SKX-NEXT: vpmovdw %ymm0, %xmm0
-; SKX-NEXT: vpackuswb %xmm0, %xmm0, %xmm0
+; SKX-NEXT: vpmovdb %ymm0, %xmm0
; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
%tmp1 = icmp ult <8 x i32> %x, <i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255>
define <4 x i32> @test44(<4 x i16> %x, <4 x i16> %y) #0 {
; AVX512-LABEL: test44:
; AVX512: ## %bb.0:
-; AVX512-NEXT: vpxor %xmm2, %xmm2, %xmm2 ## encoding: [0xc5,0xe9,0xef,0xd2]
-; AVX512-NEXT: vpblendw $170, %xmm2, %xmm1, %xmm1 ## encoding: [0xc4,0xe3,0x71,0x0e,0xca,0xaa]
-; AVX512-NEXT: ## xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4],xmm2[5],xmm1[6],xmm2[7]
-; AVX512-NEXT: vpblendw $170, %xmm2, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0e,0xc2,0xaa]
-; AVX512-NEXT: ## xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4],xmm2[5],xmm0[6],xmm2[7]
-; AVX512-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0x76,0xc1]
+; AVX512-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0x75,0xc1]
+; AVX512-NEXT: vpmovsxwd %xmm0, %xmm0 ## encoding: [0xc4,0xe2,0x79,0x23,0xc0]
; AVX512-NEXT: retq ## encoding: [0xc3]
;
; SKX-LABEL: test44:
; SKX: ## %bb.0:
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xef,0xd2]
-; SKX-NEXT: vpblendw $170, %xmm2, %xmm1, %xmm1 ## encoding: [0xc4,0xe3,0x71,0x0e,0xca,0xaa]
-; SKX-NEXT: ## xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4],xmm2[5],xmm1[6],xmm2[7]
-; SKX-NEXT: vpblendw $170, %xmm2, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0e,0xc2,0xaa]
-; SKX-NEXT: ## xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4],xmm2[5],xmm0[6],xmm2[7]
-; SKX-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0x76,0xc1]
+; SKX-NEXT: vpcmpeqw %xmm1, %xmm0, %k0 ## encoding: [0x62,0xf1,0x7d,0x08,0x75,0xc1]
+; SKX-NEXT: vpmovm2d %k0, %xmm0 ## encoding: [0x62,0xf2,0x7e,0x08,0x38,0xc0]
; SKX-NEXT: retq ## encoding: [0xc3]
%mask = icmp eq <4 x i16> %x, %y
%1 = sext <4 x i1> %mask to <4 x i32>
define <2 x i64> @test45(<2 x i16> %x, <2 x i16> %y) #0 {
; AVX512-LABEL: test45:
; AVX512: ## %bb.0:
-; AVX512-NEXT: vpxor %xmm2, %xmm2, %xmm2 ## encoding: [0xc5,0xe9,0xef,0xd2]
-; AVX512-NEXT: vpblendw $17, %xmm1, %xmm2, %xmm1 ## encoding: [0xc4,0xe3,0x69,0x0e,0xc9,0x11]
-; AVX512-NEXT: ## xmm1 = xmm1[0],xmm2[1,2,3],xmm1[4],xmm2[5,6,7]
-; AVX512-NEXT: vpblendw $17, %xmm0, %xmm2, %xmm0 ## encoding: [0xc4,0xe3,0x69,0x0e,0xc0,0x11]
-; AVX512-NEXT: ## xmm0 = xmm0[0],xmm2[1,2,3],xmm0[4],xmm2[5,6,7]
-; AVX512-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe2,0x79,0x29,0xc1]
-; AVX512-NEXT: vpsrlq $63, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0x73,0xd0,0x3f]
+; AVX512-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0x75,0xc1]
+; AVX512-NEXT: vpmovzxwq %xmm0, %xmm0 ## encoding: [0xc4,0xe2,0x79,0x34,0xc0]
+; AVX512-NEXT: ## xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
+; AVX512-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xdb,0x05,A,A,A,A]
+; AVX512-NEXT: ## fixup A - offset: 4, value: LCPI46_0-4, kind: reloc_riprel_4byte
; AVX512-NEXT: retq ## encoding: [0xc3]
;
; SKX-LABEL: test45:
; SKX: ## %bb.0:
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xef,0xd2]
-; SKX-NEXT: vpblendw $17, %xmm1, %xmm2, %xmm1 ## encoding: [0xc4,0xe3,0x69,0x0e,0xc9,0x11]
-; SKX-NEXT: ## xmm1 = xmm1[0],xmm2[1,2,3],xmm1[4],xmm2[5,6,7]
-; SKX-NEXT: vpblendw $17, %xmm0, %xmm2, %xmm0 ## encoding: [0xc4,0xe3,0x69,0x0e,0xc0,0x11]
-; SKX-NEXT: ## xmm0 = xmm0[0],xmm2[1,2,3],xmm0[4],xmm2[5,6,7]
-; SKX-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe2,0x79,0x29,0xc1]
+; SKX-NEXT: vpcmpeqw %xmm1, %xmm0, %k0 ## encoding: [0x62,0xf1,0x7d,0x08,0x75,0xc1]
+; SKX-NEXT: vpmovm2q %k0, %xmm0 ## encoding: [0x62,0xf2,0xfe,0x08,0x38,0xc0]
; SKX-NEXT: vpsrlq $63, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x73,0xd0,0x3f]
; SKX-NEXT: retq ## encoding: [0xc3]
%mask = icmp eq <2 x i16> %x, %y
; CHECK-LABEL: foo:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovd %edi, %xmm0
-; CHECK-NEXT: vpinsrd $1, %esi, %xmm0, %xmm0
-; CHECK-NEXT: vpinsrd $2, %edx, %xmm0, %xmm0
-; CHECK-NEXT: vpslld $24, %xmm0, %xmm0
+; CHECK-NEXT: vpinsrb $1, %esi, %xmm0, %xmm0
+; CHECK-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
; CHECK-NEXT: vmovd %ecx, %xmm1
-; CHECK-NEXT: vpinsrd $1, %r8d, %xmm1, %xmm1
-; CHECK-NEXT: vpsrad $24, %xmm0, %xmm0
-; CHECK-NEXT: vpinsrd $2, %r9d, %xmm1, %xmm1
-; CHECK-NEXT: vpslld $24, %xmm1, %xmm1
-; CHECK-NEXT: vpsrad $24, %xmm1, %xmm1
-; CHECK-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0
+; CHECK-NEXT: vpinsrb $1, %r8d, %xmm1, %xmm1
+; CHECK-NEXT: vpinsrb $2, %r9d, %xmm1, %xmm1
+; CHECK-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm0
; CHECK-NEXT: vpextrb $0, %xmm0, %eax
-; CHECK-NEXT: vpextrb $4, %xmm0, %edx
-; CHECK-NEXT: vpextrb $8, %xmm0, %ecx
+; CHECK-NEXT: vpextrb $1, %xmm0, %edx
+; CHECK-NEXT: vpextrb $2, %xmm0, %ecx
; CHECK-NEXT: # kill: def $al killed $al killed $eax
; CHECK-NEXT: # kill: def $dl killed $dl killed $edx
; CHECK-NEXT: # kill: def $cl killed $cl killed $ecx
; CHECK-NEXT: vpcmpgtw %xmm1, %xmm0, %k5 # encoding: [0x62,0xf1,0x7d,0x08,0x65,0xe9]
; CHECK-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0]
; CHECK-NEXT: vpxor %xmm0, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xef,0xc0]
-; CHECK-NEXT: vpinsrw $0, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc0,0x00]
+; CHECK-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x00]
; CHECK-NEXT: kmovd %k1, %eax # encoding: [0xc5,0xfb,0x93,0xc1]
-; CHECK-NEXT: vpinsrw $1, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc0,0x01]
+; CHECK-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x01]
; CHECK-NEXT: kmovd %k2, %eax # encoding: [0xc5,0xfb,0x93,0xc2]
-; CHECK-NEXT: vpinsrw $2, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc0,0x02]
+; CHECK-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x02]
; CHECK-NEXT: kmovd %k3, %eax # encoding: [0xc5,0xfb,0x93,0xc3]
-; CHECK-NEXT: vpinsrw $4, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc0,0x04]
+; CHECK-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x04]
; CHECK-NEXT: kmovd %k4, %eax # encoding: [0xc5,0xfb,0x93,0xc4]
-; CHECK-NEXT: vpinsrw $5, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc0,0x05]
+; CHECK-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x05]
; CHECK-NEXT: kmovd %k5, %eax # encoding: [0xc5,0xfb,0x93,0xc5]
-; CHECK-NEXT: vpinsrw $6, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc0,0x06]
+; CHECK-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x06]
; CHECK-NEXT: movl $255, %eax # encoding: [0xb8,0xff,0x00,0x00,0x00]
-; CHECK-NEXT: vpinsrw $7, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc0,0x07]
+; CHECK-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x07]
; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3]
%res0 = call i8 @llvm.x86.avx512.mask.cmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 0, i8 -1)
%vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0
define <8 x i8> @test_mask_cmp_w_128(<8 x i16> %a0, <8 x i16> %a1, i8 %mask) {
; X86-LABEL: test_mask_cmp_w_128:
; X86: # %bb.0:
-; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb7,0x44,0x24,0x04]
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
; X86-NEXT: kmovd %eax, %k1 # encoding: [0xc5,0xfb,0x92,0xc8]
; X86-NEXT: vpcmpeqw %xmm1, %xmm0, %k0 {%k1} # encoding: [0x62,0xf1,0x7d,0x09,0x75,0xc1]
; X86-NEXT: vpcmpgtw %xmm0, %xmm1, %k2 {%k1} # encoding: [0x62,0xf1,0x75,0x09,0x65,0xd0]
; X86-NEXT: vpcmpgtw %xmm1, %xmm0, %k1 {%k1} # encoding: [0x62,0xf1,0x7d,0x09,0x65,0xc9]
; X86-NEXT: kmovd %k0, %ecx # encoding: [0xc5,0xfb,0x93,0xc8]
; X86-NEXT: vpxor %xmm0, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xef,0xc0]
-; X86-NEXT: vpinsrw $0, %ecx, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc1,0x00]
+; X86-NEXT: vpinsrb $0, %ecx, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x20,0xc1,0x00]
; X86-NEXT: kmovd %k2, %ecx # encoding: [0xc5,0xfb,0x93,0xca]
-; X86-NEXT: vpinsrw $1, %ecx, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc1,0x01]
+; X86-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x20,0xc1,0x01]
; X86-NEXT: kmovd %k3, %ecx # encoding: [0xc5,0xfb,0x93,0xcb]
-; X86-NEXT: vpinsrw $2, %ecx, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc1,0x02]
+; X86-NEXT: vpinsrb $2, %ecx, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x20,0xc1,0x02]
; X86-NEXT: kmovd %k4, %ecx # encoding: [0xc5,0xfb,0x93,0xcc]
-; X86-NEXT: vpinsrw $4, %ecx, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc1,0x04]
+; X86-NEXT: vpinsrb $4, %ecx, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x20,0xc1,0x04]
; X86-NEXT: kmovd %k5, %ecx # encoding: [0xc5,0xfb,0x93,0xcd]
-; X86-NEXT: vpinsrw $5, %ecx, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc1,0x05]
+; X86-NEXT: vpinsrb $5, %ecx, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x20,0xc1,0x05]
; X86-NEXT: kmovd %k1, %ecx # encoding: [0xc5,0xfb,0x93,0xc9]
-; X86-NEXT: vpinsrw $6, %ecx, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc1,0x06]
-; X86-NEXT: vpinsrw $7, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc0,0x07]
+; X86-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x20,0xc1,0x06]
+; X86-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x07]
; X86-NEXT: retl # encoding: [0xc3]
;
; X64-LABEL: test_mask_cmp_w_128:
; X64-NEXT: vpcmpgtw %xmm1, %xmm0, %k1 {%k1} # encoding: [0x62,0xf1,0x7d,0x09,0x65,0xc9]
; X64-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0]
; X64-NEXT: vpxor %xmm0, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xef,0xc0]
-; X64-NEXT: vpinsrw $0, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc0,0x00]
+; X64-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x00]
; X64-NEXT: kmovd %k2, %eax # encoding: [0xc5,0xfb,0x93,0xc2]
-; X64-NEXT: vpinsrw $1, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc0,0x01]
+; X64-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x01]
; X64-NEXT: kmovd %k3, %eax # encoding: [0xc5,0xfb,0x93,0xc3]
-; X64-NEXT: vpinsrw $2, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc0,0x02]
+; X64-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x02]
; X64-NEXT: kmovd %k4, %eax # encoding: [0xc5,0xfb,0x93,0xc4]
-; X64-NEXT: vpinsrw $4, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc0,0x04]
+; X64-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x04]
; X64-NEXT: kmovd %k5, %eax # encoding: [0xc5,0xfb,0x93,0xc5]
-; X64-NEXT: vpinsrw $5, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc0,0x05]
+; X64-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x05]
; X64-NEXT: kmovd %k1, %eax # encoding: [0xc5,0xfb,0x93,0xc1]
-; X64-NEXT: vpinsrw $6, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc0,0x06]
-; X64-NEXT: vpinsrw $7, %edi, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc7,0x07]
+; X64-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x06]
+; X64-NEXT: vpinsrb $7, %edi, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x20,0xc7,0x07]
; X64-NEXT: retq # encoding: [0xc3]
%res0 = call i8 @llvm.x86.avx512.mask.cmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 0, i8 %mask)
%vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0
; CHECK-NEXT: vpcmpnleuw %xmm1, %xmm0, %k5 # encoding: [0x62,0xf3,0xfd,0x08,0x3e,0xe9,0x06]
; CHECK-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0]
; CHECK-NEXT: vpxor %xmm0, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xef,0xc0]
-; CHECK-NEXT: vpinsrw $0, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc0,0x00]
+; CHECK-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x00]
; CHECK-NEXT: kmovd %k1, %eax # encoding: [0xc5,0xfb,0x93,0xc1]
-; CHECK-NEXT: vpinsrw $1, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc0,0x01]
+; CHECK-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x01]
; CHECK-NEXT: kmovd %k2, %eax # encoding: [0xc5,0xfb,0x93,0xc2]
-; CHECK-NEXT: vpinsrw $2, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc0,0x02]
+; CHECK-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x02]
; CHECK-NEXT: kmovd %k3, %eax # encoding: [0xc5,0xfb,0x93,0xc3]
-; CHECK-NEXT: vpinsrw $4, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc0,0x04]
+; CHECK-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x04]
; CHECK-NEXT: kmovd %k4, %eax # encoding: [0xc5,0xfb,0x93,0xc4]
-; CHECK-NEXT: vpinsrw $5, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc0,0x05]
+; CHECK-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x05]
; CHECK-NEXT: kmovd %k5, %eax # encoding: [0xc5,0xfb,0x93,0xc5]
-; CHECK-NEXT: vpinsrw $6, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc0,0x06]
+; CHECK-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x06]
; CHECK-NEXT: movl $255, %eax # encoding: [0xb8,0xff,0x00,0x00,0x00]
-; CHECK-NEXT: vpinsrw $7, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc0,0x07]
+; CHECK-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x07]
; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3]
%res0 = call i8 @llvm.x86.avx512.mask.ucmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 0, i8 -1)
%vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0
define <8 x i8> @test_mask_ucmp_w_128(<8 x i16> %a0, <8 x i16> %a1, i8 %mask) {
; X86-LABEL: test_mask_ucmp_w_128:
; X86: # %bb.0:
-; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb7,0x44,0x24,0x04]
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
; X86-NEXT: kmovd %eax, %k1 # encoding: [0xc5,0xfb,0x92,0xc8]
; X86-NEXT: vpcmpeqw %xmm1, %xmm0, %k0 {%k1} # encoding: [0x62,0xf1,0x7d,0x09,0x75,0xc1]
; X86-NEXT: vpcmpltuw %xmm1, %xmm0, %k2 {%k1} # encoding: [0x62,0xf3,0xfd,0x09,0x3e,0xd1,0x01]
; X86-NEXT: vpcmpnleuw %xmm1, %xmm0, %k1 {%k1} # encoding: [0x62,0xf3,0xfd,0x09,0x3e,0xc9,0x06]
; X86-NEXT: kmovd %k0, %ecx # encoding: [0xc5,0xfb,0x93,0xc8]
; X86-NEXT: vpxor %xmm0, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xef,0xc0]
-; X86-NEXT: vpinsrw $0, %ecx, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc1,0x00]
+; X86-NEXT: vpinsrb $0, %ecx, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x20,0xc1,0x00]
; X86-NEXT: kmovd %k2, %ecx # encoding: [0xc5,0xfb,0x93,0xca]
-; X86-NEXT: vpinsrw $1, %ecx, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc1,0x01]
+; X86-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x20,0xc1,0x01]
; X86-NEXT: kmovd %k3, %ecx # encoding: [0xc5,0xfb,0x93,0xcb]
-; X86-NEXT: vpinsrw $2, %ecx, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc1,0x02]
+; X86-NEXT: vpinsrb $2, %ecx, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x20,0xc1,0x02]
; X86-NEXT: kmovd %k4, %ecx # encoding: [0xc5,0xfb,0x93,0xcc]
-; X86-NEXT: vpinsrw $4, %ecx, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc1,0x04]
+; X86-NEXT: vpinsrb $4, %ecx, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x20,0xc1,0x04]
; X86-NEXT: kmovd %k5, %ecx # encoding: [0xc5,0xfb,0x93,0xcd]
-; X86-NEXT: vpinsrw $5, %ecx, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc1,0x05]
+; X86-NEXT: vpinsrb $5, %ecx, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x20,0xc1,0x05]
; X86-NEXT: kmovd %k1, %ecx # encoding: [0xc5,0xfb,0x93,0xc9]
-; X86-NEXT: vpinsrw $6, %ecx, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc1,0x06]
-; X86-NEXT: vpinsrw $7, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc0,0x07]
+; X86-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x20,0xc1,0x06]
+; X86-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x07]
; X86-NEXT: retl # encoding: [0xc3]
;
; X64-LABEL: test_mask_ucmp_w_128:
; X64-NEXT: vpcmpnleuw %xmm1, %xmm0, %k1 {%k1} # encoding: [0x62,0xf3,0xfd,0x09,0x3e,0xc9,0x06]
; X64-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0]
; X64-NEXT: vpxor %xmm0, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xef,0xc0]
-; X64-NEXT: vpinsrw $0, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc0,0x00]
+; X64-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x00]
; X64-NEXT: kmovd %k2, %eax # encoding: [0xc5,0xfb,0x93,0xc2]
-; X64-NEXT: vpinsrw $1, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc0,0x01]
+; X64-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x01]
; X64-NEXT: kmovd %k3, %eax # encoding: [0xc5,0xfb,0x93,0xc3]
-; X64-NEXT: vpinsrw $2, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc0,0x02]
+; X64-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x02]
; X64-NEXT: kmovd %k4, %eax # encoding: [0xc5,0xfb,0x93,0xc4]
-; X64-NEXT: vpinsrw $4, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc0,0x04]
+; X64-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x04]
; X64-NEXT: kmovd %k5, %eax # encoding: [0xc5,0xfb,0x93,0xc5]
-; X64-NEXT: vpinsrw $5, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc0,0x05]
+; X64-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x05]
; X64-NEXT: kmovd %k1, %eax # encoding: [0xc5,0xfb,0x93,0xc1]
-; X64-NEXT: vpinsrw $6, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc0,0x06]
-; X64-NEXT: vpinsrw $7, %edi, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc7,0x07]
+; X64-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x06]
+; X64-NEXT: vpinsrb $7, %edi, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x20,0xc7,0x07]
; X64-NEXT: retq # encoding: [0xc3]
%res0 = call i8 @llvm.x86.avx512.mask.ucmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 0, i8 %mask)
%vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0
; CHECK-LABEL: test_mm256_cvtepi64_epi8:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vpmovqb %ymm0, %xmm0
+; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7]
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: ret{{[l|q]}}
entry:
; CHECK-LABEL: test_mm256_cvtepi64_epi16:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vpmovqw %ymm0, %xmm0
+; CHECK-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: ret{{[l|q]}}
entry:
; CHECK-LABEL: test_mm256_cvtepi32_epi8:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vpmovdb %ymm0, %xmm0
+; CHECK-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: ret{{[l|q]}}
entry:
; CHECK-NEXT: vpcmpgtd %ymm1, %ymm0, %k5 # encoding: [0x62,0xf1,0x7d,0x28,0x66,0xe9]
; CHECK-NEXT: kmovw %k0, %eax # encoding: [0xc5,0xf8,0x93,0xc0]
; CHECK-NEXT: vpxor %xmm0, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xef,0xc0]
-; CHECK-NEXT: vpinsrw $0, %eax, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xc4,0xc0,0x00]
+; CHECK-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x00]
; CHECK-NEXT: kmovw %k1, %eax # encoding: [0xc5,0xf8,0x93,0xc1]
-; CHECK-NEXT: vpinsrw $1, %eax, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xc4,0xc0,0x01]
+; CHECK-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x01]
; CHECK-NEXT: kmovw %k2, %eax # encoding: [0xc5,0xf8,0x93,0xc2]
-; CHECK-NEXT: vpinsrw $2, %eax, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xc4,0xc0,0x02]
+; CHECK-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x02]
; CHECK-NEXT: kmovw %k3, %eax # encoding: [0xc5,0xf8,0x93,0xc3]
-; CHECK-NEXT: vpinsrw $4, %eax, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xc4,0xc0,0x04]
+; CHECK-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x04]
; CHECK-NEXT: kmovw %k4, %eax # encoding: [0xc5,0xf8,0x93,0xc4]
-; CHECK-NEXT: vpinsrw $5, %eax, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xc4,0xc0,0x05]
+; CHECK-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x05]
; CHECK-NEXT: kmovw %k5, %eax # encoding: [0xc5,0xf8,0x93,0xc5]
-; CHECK-NEXT: vpinsrw $6, %eax, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xc4,0xc0,0x06]
+; CHECK-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x06]
; CHECK-NEXT: movl $255, %eax # encoding: [0xb8,0xff,0x00,0x00,0x00]
-; CHECK-NEXT: vpinsrw $7, %eax, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xc4,0xc0,0x07]
+; CHECK-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x07]
; CHECK-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77]
; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3]
%res0 = call i8 @llvm.x86.avx512.mask.cmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i32 0, i8 -1)
define <8 x i8> @test_mask_cmp_d_256(<8 x i32> %a0, <8 x i32> %a1, i8 %mask) {
; X86-LABEL: test_mask_cmp_d_256:
; X86: # %bb.0:
-; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb7,0x44,0x24,0x04]
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
; X86-NEXT: vpcmpeqd %ymm1, %ymm0, %k0 {%k1} # encoding: [0x62,0xf1,0x7d,0x29,0x76,0xc1]
; X86-NEXT: vpcmpgtd %ymm0, %ymm1, %k2 {%k1} # encoding: [0x62,0xf1,0x75,0x29,0x66,0xd0]
; X86-NEXT: vpcmpgtd %ymm1, %ymm0, %k1 {%k1} # encoding: [0x62,0xf1,0x7d,0x29,0x66,0xc9]
; X86-NEXT: kmovw %k0, %ecx # encoding: [0xc5,0xf8,0x93,0xc8]
; X86-NEXT: vpxor %xmm0, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xef,0xc0]
-; X86-NEXT: vpinsrw $0, %ecx, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xc4,0xc1,0x00]
+; X86-NEXT: vpinsrb $0, %ecx, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc1,0x00]
; X86-NEXT: kmovw %k2, %ecx # encoding: [0xc5,0xf8,0x93,0xca]
-; X86-NEXT: vpinsrw $1, %ecx, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xc4,0xc1,0x01]
+; X86-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc1,0x01]
; X86-NEXT: kmovw %k3, %ecx # encoding: [0xc5,0xf8,0x93,0xcb]
-; X86-NEXT: vpinsrw $2, %ecx, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xc4,0xc1,0x02]
+; X86-NEXT: vpinsrb $2, %ecx, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc1,0x02]
; X86-NEXT: kmovw %k4, %ecx # encoding: [0xc5,0xf8,0x93,0xcc]
-; X86-NEXT: vpinsrw $4, %ecx, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xc4,0xc1,0x04]
+; X86-NEXT: vpinsrb $4, %ecx, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc1,0x04]
; X86-NEXT: kmovw %k5, %ecx # encoding: [0xc5,0xf8,0x93,0xcd]
-; X86-NEXT: vpinsrw $5, %ecx, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xc4,0xc1,0x05]
+; X86-NEXT: vpinsrb $5, %ecx, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc1,0x05]
; X86-NEXT: kmovw %k1, %ecx # encoding: [0xc5,0xf8,0x93,0xc9]
-; X86-NEXT: vpinsrw $6, %ecx, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xc4,0xc1,0x06]
-; X86-NEXT: vpinsrw $7, %eax, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xc4,0xc0,0x07]
+; X86-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc1,0x06]
+; X86-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x07]
; X86-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77]
; X86-NEXT: retl # encoding: [0xc3]
;
; X64-NEXT: vpcmpgtd %ymm1, %ymm0, %k1 {%k1} # encoding: [0x62,0xf1,0x7d,0x29,0x66,0xc9]
; X64-NEXT: kmovw %k0, %eax # encoding: [0xc5,0xf8,0x93,0xc0]
; X64-NEXT: vpxor %xmm0, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xef,0xc0]
-; X64-NEXT: vpinsrw $0, %eax, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xc4,0xc0,0x00]
+; X64-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x00]
; X64-NEXT: kmovw %k2, %eax # encoding: [0xc5,0xf8,0x93,0xc2]
-; X64-NEXT: vpinsrw $1, %eax, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xc4,0xc0,0x01]
+; X64-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x01]
; X64-NEXT: kmovw %k3, %eax # encoding: [0xc5,0xf8,0x93,0xc3]
-; X64-NEXT: vpinsrw $2, %eax, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xc4,0xc0,0x02]
+; X64-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x02]
; X64-NEXT: kmovw %k4, %eax # encoding: [0xc5,0xf8,0x93,0xc4]
-; X64-NEXT: vpinsrw $4, %eax, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xc4,0xc0,0x04]
+; X64-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x04]
; X64-NEXT: kmovw %k5, %eax # encoding: [0xc5,0xf8,0x93,0xc5]
-; X64-NEXT: vpinsrw $5, %eax, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xc4,0xc0,0x05]
+; X64-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x05]
; X64-NEXT: kmovw %k1, %eax # encoding: [0xc5,0xf8,0x93,0xc1]
-; X64-NEXT: vpinsrw $6, %eax, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xc4,0xc0,0x06]
-; X64-NEXT: vpinsrw $7, %edi, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xc4,0xc7,0x07]
+; X64-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x06]
+; X64-NEXT: vpinsrb $7, %edi, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc7,0x07]
; X64-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77]
; X64-NEXT: retq # encoding: [0xc3]
%res0 = call i8 @llvm.x86.avx512.mask.cmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i32 0, i8 %mask)
; CHECK-NEXT: vpcmpnleud %ymm1, %ymm0, %k5 # encoding: [0x62,0xf3,0x7d,0x28,0x1e,0xe9,0x06]
; CHECK-NEXT: kmovw %k0, %eax # encoding: [0xc5,0xf8,0x93,0xc0]
; CHECK-NEXT: vpxor %xmm0, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xef,0xc0]
-; CHECK-NEXT: vpinsrw $0, %eax, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xc4,0xc0,0x00]
+; CHECK-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x00]
; CHECK-NEXT: kmovw %k1, %eax # encoding: [0xc5,0xf8,0x93,0xc1]
-; CHECK-NEXT: vpinsrw $1, %eax, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xc4,0xc0,0x01]
+; CHECK-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x01]
; CHECK-NEXT: kmovw %k2, %eax # encoding: [0xc5,0xf8,0x93,0xc2]
-; CHECK-NEXT: vpinsrw $2, %eax, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xc4,0xc0,0x02]
+; CHECK-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x02]
; CHECK-NEXT: kmovw %k3, %eax # encoding: [0xc5,0xf8,0x93,0xc3]
-; CHECK-NEXT: vpinsrw $4, %eax, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xc4,0xc0,0x04]
+; CHECK-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x04]
; CHECK-NEXT: kmovw %k4, %eax # encoding: [0xc5,0xf8,0x93,0xc4]
-; CHECK-NEXT: vpinsrw $5, %eax, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xc4,0xc0,0x05]
+; CHECK-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x05]
; CHECK-NEXT: kmovw %k5, %eax # encoding: [0xc5,0xf8,0x93,0xc5]
-; CHECK-NEXT: vpinsrw $6, %eax, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xc4,0xc0,0x06]
+; CHECK-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x06]
; CHECK-NEXT: movl $255, %eax # encoding: [0xb8,0xff,0x00,0x00,0x00]
-; CHECK-NEXT: vpinsrw $7, %eax, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xc4,0xc0,0x07]
+; CHECK-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x07]
; CHECK-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77]
; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3]
%res0 = call i8 @llvm.x86.avx512.mask.ucmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i32 0, i8 -1)
define <8 x i8> @test_mask_ucmp_d_256(<8 x i32> %a0, <8 x i32> %a1, i8 %mask) {
; X86-LABEL: test_mask_ucmp_d_256:
; X86: # %bb.0:
-; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb7,0x44,0x24,0x04]
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
; X86-NEXT: vpcmpeqd %ymm1, %ymm0, %k0 {%k1} # encoding: [0x62,0xf1,0x7d,0x29,0x76,0xc1]
; X86-NEXT: vpcmpltud %ymm1, %ymm0, %k2 {%k1} # encoding: [0x62,0xf3,0x7d,0x29,0x1e,0xd1,0x01]
; X86-NEXT: vpcmpnleud %ymm1, %ymm0, %k1 {%k1} # encoding: [0x62,0xf3,0x7d,0x29,0x1e,0xc9,0x06]
; X86-NEXT: kmovw %k0, %ecx # encoding: [0xc5,0xf8,0x93,0xc8]
; X86-NEXT: vpxor %xmm0, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xef,0xc0]
-; X86-NEXT: vpinsrw $0, %ecx, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xc4,0xc1,0x00]
+; X86-NEXT: vpinsrb $0, %ecx, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc1,0x00]
; X86-NEXT: kmovw %k2, %ecx # encoding: [0xc5,0xf8,0x93,0xca]
-; X86-NEXT: vpinsrw $1, %ecx, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xc4,0xc1,0x01]
+; X86-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc1,0x01]
; X86-NEXT: kmovw %k3, %ecx # encoding: [0xc5,0xf8,0x93,0xcb]
-; X86-NEXT: vpinsrw $2, %ecx, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xc4,0xc1,0x02]
+; X86-NEXT: vpinsrb $2, %ecx, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc1,0x02]
; X86-NEXT: kmovw %k4, %ecx # encoding: [0xc5,0xf8,0x93,0xcc]
-; X86-NEXT: vpinsrw $4, %ecx, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xc4,0xc1,0x04]
+; X86-NEXT: vpinsrb $4, %ecx, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc1,0x04]
; X86-NEXT: kmovw %k5, %ecx # encoding: [0xc5,0xf8,0x93,0xcd]
-; X86-NEXT: vpinsrw $5, %ecx, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xc4,0xc1,0x05]
+; X86-NEXT: vpinsrb $5, %ecx, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc1,0x05]
; X86-NEXT: kmovw %k1, %ecx # encoding: [0xc5,0xf8,0x93,0xc9]
-; X86-NEXT: vpinsrw $6, %ecx, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xc4,0xc1,0x06]
-; X86-NEXT: vpinsrw $7, %eax, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xc4,0xc0,0x07]
+; X86-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc1,0x06]
+; X86-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x07]
; X86-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77]
; X86-NEXT: retl # encoding: [0xc3]
;
; X64-NEXT: vpcmpnleud %ymm1, %ymm0, %k1 {%k1} # encoding: [0x62,0xf3,0x7d,0x29,0x1e,0xc9,0x06]
; X64-NEXT: kmovw %k0, %eax # encoding: [0xc5,0xf8,0x93,0xc0]
; X64-NEXT: vpxor %xmm0, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xef,0xc0]
-; X64-NEXT: vpinsrw $0, %eax, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xc4,0xc0,0x00]
+; X64-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x00]
; X64-NEXT: kmovw %k2, %eax # encoding: [0xc5,0xf8,0x93,0xc2]
-; X64-NEXT: vpinsrw $1, %eax, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xc4,0xc0,0x01]
+; X64-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x01]
; X64-NEXT: kmovw %k3, %eax # encoding: [0xc5,0xf8,0x93,0xc3]
-; X64-NEXT: vpinsrw $2, %eax, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xc4,0xc0,0x02]
+; X64-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x02]
; X64-NEXT: kmovw %k4, %eax # encoding: [0xc5,0xf8,0x93,0xc4]
-; X64-NEXT: vpinsrw $4, %eax, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xc4,0xc0,0x04]
+; X64-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x04]
; X64-NEXT: kmovw %k5, %eax # encoding: [0xc5,0xf8,0x93,0xc5]
-; X64-NEXT: vpinsrw $5, %eax, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xc4,0xc0,0x05]
+; X64-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x05]
; X64-NEXT: kmovw %k1, %eax # encoding: [0xc5,0xf8,0x93,0xc1]
-; X64-NEXT: vpinsrw $6, %eax, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xc4,0xc0,0x06]
-; X64-NEXT: vpinsrw $7, %edi, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xc4,0xc7,0x07]
+; X64-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x06]
+; X64-NEXT: vpinsrb $7, %edi, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc7,0x07]
; X64-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77]
; X64-NEXT: retq # encoding: [0xc3]
%res0 = call i8 @llvm.x86.avx512.mask.ucmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i32 0, i8 %mask)
; CHECK-NEXT: vpcmpgtq %ymm1, %ymm0, %k5 # encoding: [0x62,0xf2,0xfd,0x28,0x37,0xe9]
; CHECK-NEXT: kmovw %k0, %eax # encoding: [0xc5,0xf8,0x93,0xc0]
; CHECK-NEXT: vpxor %xmm0, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xef,0xc0]
-; CHECK-NEXT: vpinsrw $0, %eax, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xc4,0xc0,0x00]
+; CHECK-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x00]
; CHECK-NEXT: kmovw %k1, %eax # encoding: [0xc5,0xf8,0x93,0xc1]
-; CHECK-NEXT: vpinsrw $1, %eax, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xc4,0xc0,0x01]
+; CHECK-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x01]
; CHECK-NEXT: kmovw %k2, %eax # encoding: [0xc5,0xf8,0x93,0xc2]
-; CHECK-NEXT: vpinsrw $2, %eax, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xc4,0xc0,0x02]
+; CHECK-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x02]
; CHECK-NEXT: kmovw %k3, %eax # encoding: [0xc5,0xf8,0x93,0xc3]
-; CHECK-NEXT: vpinsrw $4, %eax, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xc4,0xc0,0x04]
+; CHECK-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x04]
; CHECK-NEXT: kmovw %k4, %eax # encoding: [0xc5,0xf8,0x93,0xc4]
-; CHECK-NEXT: vpinsrw $5, %eax, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xc4,0xc0,0x05]
+; CHECK-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x05]
; CHECK-NEXT: kmovw %k5, %eax # encoding: [0xc5,0xf8,0x93,0xc5]
-; CHECK-NEXT: vpinsrw $6, %eax, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xc4,0xc0,0x06]
+; CHECK-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x06]
; CHECK-NEXT: movl $15, %eax # encoding: [0xb8,0x0f,0x00,0x00,0x00]
-; CHECK-NEXT: vpinsrw $7, %eax, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xc4,0xc0,0x07]
+; CHECK-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x07]
; CHECK-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77]
; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3]
%res0 = call i8 @llvm.x86.avx512.mask.cmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i32 0, i8 -1)
; X86-NEXT: kshiftrw $12, %k2, %k2 # encoding: [0xc4,0xe3,0xf9,0x30,0xd2,0x0c]
; X86-NEXT: kmovw %k0, %eax # encoding: [0xc5,0xf8,0x93,0xc0]
; X86-NEXT: vpxor %xmm0, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xef,0xc0]
-; X86-NEXT: vpinsrw $0, %eax, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xc4,0xc0,0x00]
+; X86-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x00]
; X86-NEXT: kmovw %k1, %eax # encoding: [0xc5,0xf8,0x93,0xc1]
-; X86-NEXT: vpinsrw $1, %eax, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xc4,0xc0,0x01]
+; X86-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x01]
; X86-NEXT: kmovw %k3, %eax # encoding: [0xc5,0xf8,0x93,0xc3]
-; X86-NEXT: vpinsrw $2, %eax, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xc4,0xc0,0x02]
+; X86-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x02]
; X86-NEXT: kmovw %k4, %eax # encoding: [0xc5,0xf8,0x93,0xc4]
-; X86-NEXT: vpinsrw $4, %eax, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xc4,0xc0,0x04]
+; X86-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x04]
; X86-NEXT: kmovw %k5, %eax # encoding: [0xc5,0xf8,0x93,0xc5]
-; X86-NEXT: vpinsrw $5, %eax, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xc4,0xc0,0x05]
+; X86-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x05]
; X86-NEXT: kmovw %k6, %eax # encoding: [0xc5,0xf8,0x93,0xc6]
-; X86-NEXT: vpinsrw $6, %eax, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xc4,0xc0,0x06]
+; X86-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x06]
; X86-NEXT: kmovw %k2, %eax # encoding: [0xc5,0xf8,0x93,0xc2]
-; X86-NEXT: vpinsrw $7, %eax, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xc4,0xc0,0x07]
+; X86-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x07]
; X86-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77]
; X86-NEXT: retl # encoding: [0xc3]
;
; X64-NEXT: kshiftrw $12, %k2, %k2 # encoding: [0xc4,0xe3,0xf9,0x30,0xd2,0x0c]
; X64-NEXT: kmovw %k0, %eax # encoding: [0xc5,0xf8,0x93,0xc0]
; X64-NEXT: vpxor %xmm0, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xef,0xc0]
-; X64-NEXT: vpinsrw $0, %eax, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xc4,0xc0,0x00]
+; X64-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x00]
; X64-NEXT: kmovw %k1, %eax # encoding: [0xc5,0xf8,0x93,0xc1]
-; X64-NEXT: vpinsrw $1, %eax, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xc4,0xc0,0x01]
+; X64-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x01]
; X64-NEXT: kmovw %k3, %eax # encoding: [0xc5,0xf8,0x93,0xc3]
-; X64-NEXT: vpinsrw $2, %eax, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xc4,0xc0,0x02]
+; X64-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x02]
; X64-NEXT: kmovw %k4, %eax # encoding: [0xc5,0xf8,0x93,0xc4]
-; X64-NEXT: vpinsrw $4, %eax, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xc4,0xc0,0x04]
+; X64-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x04]
; X64-NEXT: kmovw %k5, %eax # encoding: [0xc5,0xf8,0x93,0xc5]
-; X64-NEXT: vpinsrw $5, %eax, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xc4,0xc0,0x05]
+; X64-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x05]
; X64-NEXT: kmovw %k6, %eax # encoding: [0xc5,0xf8,0x93,0xc6]
-; X64-NEXT: vpinsrw $6, %eax, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xc4,0xc0,0x06]
+; X64-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x06]
; X64-NEXT: kmovw %k2, %eax # encoding: [0xc5,0xf8,0x93,0xc2]
-; X64-NEXT: vpinsrw $7, %eax, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xc4,0xc0,0x07]
+; X64-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x07]
; X64-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77]
; X64-NEXT: retq # encoding: [0xc3]
%res0 = call i8 @llvm.x86.avx512.mask.cmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i32 0, i8 %mask)
; CHECK-NEXT: vpcmpnleuq %ymm1, %ymm0, %k5 # encoding: [0x62,0xf3,0xfd,0x28,0x1e,0xe9,0x06]
; CHECK-NEXT: kmovw %k0, %eax # encoding: [0xc5,0xf8,0x93,0xc0]
; CHECK-NEXT: vpxor %xmm0, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xef,0xc0]
-; CHECK-NEXT: vpinsrw $0, %eax, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xc4,0xc0,0x00]
+; CHECK-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x00]
; CHECK-NEXT: kmovw %k1, %eax # encoding: [0xc5,0xf8,0x93,0xc1]
-; CHECK-NEXT: vpinsrw $1, %eax, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xc4,0xc0,0x01]
+; CHECK-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x01]
; CHECK-NEXT: kmovw %k2, %eax # encoding: [0xc5,0xf8,0x93,0xc2]
-; CHECK-NEXT: vpinsrw $2, %eax, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xc4,0xc0,0x02]
+; CHECK-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x02]
; CHECK-NEXT: kmovw %k3, %eax # encoding: [0xc5,0xf8,0x93,0xc3]
-; CHECK-NEXT: vpinsrw $4, %eax, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xc4,0xc0,0x04]
+; CHECK-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x04]
; CHECK-NEXT: kmovw %k4, %eax # encoding: [0xc5,0xf8,0x93,0xc4]
-; CHECK-NEXT: vpinsrw $5, %eax, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xc4,0xc0,0x05]
+; CHECK-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x05]
; CHECK-NEXT: kmovw %k5, %eax # encoding: [0xc5,0xf8,0x93,0xc5]
-; CHECK-NEXT: vpinsrw $6, %eax, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xc4,0xc0,0x06]
+; CHECK-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x06]
; CHECK-NEXT: movl $15, %eax # encoding: [0xb8,0x0f,0x00,0x00,0x00]
-; CHECK-NEXT: vpinsrw $7, %eax, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xc4,0xc0,0x07]
+; CHECK-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x07]
; CHECK-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77]
; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3]
%res0 = call i8 @llvm.x86.avx512.mask.ucmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i32 0, i8 -1)
; X86-NEXT: kshiftrw $12, %k2, %k2 # encoding: [0xc4,0xe3,0xf9,0x30,0xd2,0x0c]
; X86-NEXT: kmovw %k0, %eax # encoding: [0xc5,0xf8,0x93,0xc0]
; X86-NEXT: vpxor %xmm0, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xef,0xc0]
-; X86-NEXT: vpinsrw $0, %eax, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xc4,0xc0,0x00]
+; X86-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x00]
; X86-NEXT: kmovw %k1, %eax # encoding: [0xc5,0xf8,0x93,0xc1]
-; X86-NEXT: vpinsrw $1, %eax, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xc4,0xc0,0x01]
+; X86-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x01]
; X86-NEXT: kmovw %k3, %eax # encoding: [0xc5,0xf8,0x93,0xc3]
-; X86-NEXT: vpinsrw $2, %eax, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xc4,0xc0,0x02]
+; X86-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x02]
; X86-NEXT: kmovw %k4, %eax # encoding: [0xc5,0xf8,0x93,0xc4]
-; X86-NEXT: vpinsrw $4, %eax, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xc4,0xc0,0x04]
+; X86-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x04]
; X86-NEXT: kmovw %k5, %eax # encoding: [0xc5,0xf8,0x93,0xc5]
-; X86-NEXT: vpinsrw $5, %eax, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xc4,0xc0,0x05]
+; X86-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x05]
; X86-NEXT: kmovw %k6, %eax # encoding: [0xc5,0xf8,0x93,0xc6]
-; X86-NEXT: vpinsrw $6, %eax, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xc4,0xc0,0x06]
+; X86-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x06]
; X86-NEXT: kmovw %k2, %eax # encoding: [0xc5,0xf8,0x93,0xc2]
-; X86-NEXT: vpinsrw $7, %eax, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xc4,0xc0,0x07]
+; X86-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x07]
; X86-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77]
; X86-NEXT: retl # encoding: [0xc3]
;
; X64-NEXT: kshiftrw $12, %k2, %k2 # encoding: [0xc4,0xe3,0xf9,0x30,0xd2,0x0c]
; X64-NEXT: kmovw %k0, %eax # encoding: [0xc5,0xf8,0x93,0xc0]
; X64-NEXT: vpxor %xmm0, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xef,0xc0]
-; X64-NEXT: vpinsrw $0, %eax, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xc4,0xc0,0x00]
+; X64-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x00]
; X64-NEXT: kmovw %k1, %eax # encoding: [0xc5,0xf8,0x93,0xc1]
-; X64-NEXT: vpinsrw $1, %eax, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xc4,0xc0,0x01]
+; X64-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x01]
; X64-NEXT: kmovw %k3, %eax # encoding: [0xc5,0xf8,0x93,0xc3]
-; X64-NEXT: vpinsrw $2, %eax, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xc4,0xc0,0x02]
+; X64-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x02]
; X64-NEXT: kmovw %k4, %eax # encoding: [0xc5,0xf8,0x93,0xc4]
-; X64-NEXT: vpinsrw $4, %eax, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xc4,0xc0,0x04]
+; X64-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x04]
; X64-NEXT: kmovw %k5, %eax # encoding: [0xc5,0xf8,0x93,0xc5]
-; X64-NEXT: vpinsrw $5, %eax, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xc4,0xc0,0x05]
+; X64-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x05]
; X64-NEXT: kmovw %k6, %eax # encoding: [0xc5,0xf8,0x93,0xc6]
-; X64-NEXT: vpinsrw $6, %eax, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xc4,0xc0,0x06]
+; X64-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x06]
; X64-NEXT: kmovw %k2, %eax # encoding: [0xc5,0xf8,0x93,0xc2]
-; X64-NEXT: vpinsrw $7, %eax, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xc4,0xc0,0x07]
+; X64-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x07]
; X64-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77]
; X64-NEXT: retq # encoding: [0xc3]
%res0 = call i8 @llvm.x86.avx512.mask.ucmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i32 0, i8 %mask)
; CHECK-NEXT: vpcmpgtd %xmm1, %xmm0, %k5 # encoding: [0x62,0xf1,0x7d,0x08,0x66,0xe9]
; CHECK-NEXT: kmovw %k0, %eax # encoding: [0xc5,0xf8,0x93,0xc0]
; CHECK-NEXT: vpxor %xmm0, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xef,0xc0]
-; CHECK-NEXT: vpinsrw $0, %eax, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xc4,0xc0,0x00]
+; CHECK-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x00]
; CHECK-NEXT: kmovw %k1, %eax # encoding: [0xc5,0xf8,0x93,0xc1]
-; CHECK-NEXT: vpinsrw $1, %eax, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xc4,0xc0,0x01]
+; CHECK-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x01]
; CHECK-NEXT: kmovw %k2, %eax # encoding: [0xc5,0xf8,0x93,0xc2]
-; CHECK-NEXT: vpinsrw $2, %eax, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xc4,0xc0,0x02]
+; CHECK-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x02]
; CHECK-NEXT: kmovw %k3, %eax # encoding: [0xc5,0xf8,0x93,0xc3]
-; CHECK-NEXT: vpinsrw $4, %eax, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xc4,0xc0,0x04]
+; CHECK-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x04]
; CHECK-NEXT: kmovw %k4, %eax # encoding: [0xc5,0xf8,0x93,0xc4]
-; CHECK-NEXT: vpinsrw $5, %eax, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xc4,0xc0,0x05]
+; CHECK-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x05]
; CHECK-NEXT: kmovw %k5, %eax # encoding: [0xc5,0xf8,0x93,0xc5]
-; CHECK-NEXT: vpinsrw $6, %eax, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xc4,0xc0,0x06]
+; CHECK-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x06]
; CHECK-NEXT: movl $15, %eax # encoding: [0xb8,0x0f,0x00,0x00,0x00]
-; CHECK-NEXT: vpinsrw $7, %eax, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xc4,0xc0,0x07]
+; CHECK-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x07]
; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3]
%res0 = call i8 @llvm.x86.avx512.mask.cmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i32 0, i8 -1)
%vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0
; X86-NEXT: kshiftrw $12, %k2, %k2 # encoding: [0xc4,0xe3,0xf9,0x30,0xd2,0x0c]
; X86-NEXT: kmovw %k0, %eax # encoding: [0xc5,0xf8,0x93,0xc0]
; X86-NEXT: vpxor %xmm0, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xef,0xc0]
-; X86-NEXT: vpinsrw $0, %eax, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xc4,0xc0,0x00]
+; X86-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x00]
; X86-NEXT: kmovw %k1, %eax # encoding: [0xc5,0xf8,0x93,0xc1]
-; X86-NEXT: vpinsrw $1, %eax, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xc4,0xc0,0x01]
+; X86-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x01]
; X86-NEXT: kmovw %k3, %eax # encoding: [0xc5,0xf8,0x93,0xc3]
-; X86-NEXT: vpinsrw $2, %eax, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xc4,0xc0,0x02]
+; X86-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x02]
; X86-NEXT: kmovw %k4, %eax # encoding: [0xc5,0xf8,0x93,0xc4]
-; X86-NEXT: vpinsrw $4, %eax, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xc4,0xc0,0x04]
+; X86-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x04]
; X86-NEXT: kmovw %k5, %eax # encoding: [0xc5,0xf8,0x93,0xc5]
-; X86-NEXT: vpinsrw $5, %eax, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xc4,0xc0,0x05]
+; X86-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x05]
; X86-NEXT: kmovw %k6, %eax # encoding: [0xc5,0xf8,0x93,0xc6]
-; X86-NEXT: vpinsrw $6, %eax, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xc4,0xc0,0x06]
+; X86-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x06]
; X86-NEXT: kmovw %k2, %eax # encoding: [0xc5,0xf8,0x93,0xc2]
-; X86-NEXT: vpinsrw $7, %eax, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xc4,0xc0,0x07]
+; X86-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x07]
; X86-NEXT: retl # encoding: [0xc3]
;
; X64-LABEL: test_mask_cmp_d_128:
; X64-NEXT: kshiftrw $12, %k2, %k2 # encoding: [0xc4,0xe3,0xf9,0x30,0xd2,0x0c]
; X64-NEXT: kmovw %k0, %eax # encoding: [0xc5,0xf8,0x93,0xc0]
; X64-NEXT: vpxor %xmm0, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xef,0xc0]
-; X64-NEXT: vpinsrw $0, %eax, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xc4,0xc0,0x00]
+; X64-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x00]
; X64-NEXT: kmovw %k1, %eax # encoding: [0xc5,0xf8,0x93,0xc1]
-; X64-NEXT: vpinsrw $1, %eax, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xc4,0xc0,0x01]
+; X64-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x01]
; X64-NEXT: kmovw %k3, %eax # encoding: [0xc5,0xf8,0x93,0xc3]
-; X64-NEXT: vpinsrw $2, %eax, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xc4,0xc0,0x02]
+; X64-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x02]
; X64-NEXT: kmovw %k4, %eax # encoding: [0xc5,0xf8,0x93,0xc4]
-; X64-NEXT: vpinsrw $4, %eax, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xc4,0xc0,0x04]
+; X64-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x04]
; X64-NEXT: kmovw %k5, %eax # encoding: [0xc5,0xf8,0x93,0xc5]
-; X64-NEXT: vpinsrw $5, %eax, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xc4,0xc0,0x05]
+; X64-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x05]
; X64-NEXT: kmovw %k6, %eax # encoding: [0xc5,0xf8,0x93,0xc6]
-; X64-NEXT: vpinsrw $6, %eax, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xc4,0xc0,0x06]
+; X64-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x06]
; X64-NEXT: kmovw %k2, %eax # encoding: [0xc5,0xf8,0x93,0xc2]
-; X64-NEXT: vpinsrw $7, %eax, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xc4,0xc0,0x07]
+; X64-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x07]
; X64-NEXT: retq # encoding: [0xc3]
%res0 = call i8 @llvm.x86.avx512.mask.cmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i32 0, i8 %mask)
%vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0
; CHECK-NEXT: vpcmpnleud %xmm1, %xmm0, %k5 # encoding: [0x62,0xf3,0x7d,0x08,0x1e,0xe9,0x06]
; CHECK-NEXT: kmovw %k0, %eax # encoding: [0xc5,0xf8,0x93,0xc0]
; CHECK-NEXT: vpxor %xmm0, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xef,0xc0]
-; CHECK-NEXT: vpinsrw $0, %eax, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xc4,0xc0,0x00]
+; CHECK-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x00]
; CHECK-NEXT: kmovw %k1, %eax # encoding: [0xc5,0xf8,0x93,0xc1]
-; CHECK-NEXT: vpinsrw $1, %eax, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xc4,0xc0,0x01]
+; CHECK-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x01]
; CHECK-NEXT: kmovw %k2, %eax # encoding: [0xc5,0xf8,0x93,0xc2]
-; CHECK-NEXT: vpinsrw $2, %eax, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xc4,0xc0,0x02]
+; CHECK-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x02]
; CHECK-NEXT: kmovw %k3, %eax # encoding: [0xc5,0xf8,0x93,0xc3]
-; CHECK-NEXT: vpinsrw $4, %eax, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xc4,0xc0,0x04]
+; CHECK-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x04]
; CHECK-NEXT: kmovw %k4, %eax # encoding: [0xc5,0xf8,0x93,0xc4]
-; CHECK-NEXT: vpinsrw $5, %eax, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xc4,0xc0,0x05]
+; CHECK-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x05]
; CHECK-NEXT: kmovw %k5, %eax # encoding: [0xc5,0xf8,0x93,0xc5]
-; CHECK-NEXT: vpinsrw $6, %eax, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xc4,0xc0,0x06]
+; CHECK-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x06]
; CHECK-NEXT: movl $15, %eax # encoding: [0xb8,0x0f,0x00,0x00,0x00]
-; CHECK-NEXT: vpinsrw $7, %eax, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xc4,0xc0,0x07]
+; CHECK-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x07]
; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3]
%res0 = call i8 @llvm.x86.avx512.mask.ucmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i32 0, i8 -1)
%vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0
; X86-NEXT: kshiftrw $12, %k2, %k2 # encoding: [0xc4,0xe3,0xf9,0x30,0xd2,0x0c]
; X86-NEXT: kmovw %k0, %eax # encoding: [0xc5,0xf8,0x93,0xc0]
; X86-NEXT: vpxor %xmm0, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xef,0xc0]
-; X86-NEXT: vpinsrw $0, %eax, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xc4,0xc0,0x00]
+; X86-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x00]
; X86-NEXT: kmovw %k1, %eax # encoding: [0xc5,0xf8,0x93,0xc1]
-; X86-NEXT: vpinsrw $1, %eax, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xc4,0xc0,0x01]
+; X86-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x01]
; X86-NEXT: kmovw %k3, %eax # encoding: [0xc5,0xf8,0x93,0xc3]
-; X86-NEXT: vpinsrw $2, %eax, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xc4,0xc0,0x02]
+; X86-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x02]
; X86-NEXT: kmovw %k4, %eax # encoding: [0xc5,0xf8,0x93,0xc4]
-; X86-NEXT: vpinsrw $4, %eax, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xc4,0xc0,0x04]
+; X86-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x04]
; X86-NEXT: kmovw %k5, %eax # encoding: [0xc5,0xf8,0x93,0xc5]
-; X86-NEXT: vpinsrw $5, %eax, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xc4,0xc0,0x05]
+; X86-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x05]
; X86-NEXT: kmovw %k6, %eax # encoding: [0xc5,0xf8,0x93,0xc6]
-; X86-NEXT: vpinsrw $6, %eax, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xc4,0xc0,0x06]
+; X86-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x06]
; X86-NEXT: kmovw %k2, %eax # encoding: [0xc5,0xf8,0x93,0xc2]
-; X86-NEXT: vpinsrw $7, %eax, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xc4,0xc0,0x07]
+; X86-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x07]
; X86-NEXT: retl # encoding: [0xc3]
;
; X64-LABEL: test_mask_ucmp_d_128:
; X64-NEXT: kshiftrw $12, %k2, %k2 # encoding: [0xc4,0xe3,0xf9,0x30,0xd2,0x0c]
; X64-NEXT: kmovw %k0, %eax # encoding: [0xc5,0xf8,0x93,0xc0]
; X64-NEXT: vpxor %xmm0, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xef,0xc0]
-; X64-NEXT: vpinsrw $0, %eax, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xc4,0xc0,0x00]
+; X64-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x00]
; X64-NEXT: kmovw %k1, %eax # encoding: [0xc5,0xf8,0x93,0xc1]
-; X64-NEXT: vpinsrw $1, %eax, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xc4,0xc0,0x01]
+; X64-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x01]
; X64-NEXT: kmovw %k3, %eax # encoding: [0xc5,0xf8,0x93,0xc3]
-; X64-NEXT: vpinsrw $2, %eax, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xc4,0xc0,0x02]
+; X64-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x02]
; X64-NEXT: kmovw %k4, %eax # encoding: [0xc5,0xf8,0x93,0xc4]
-; X64-NEXT: vpinsrw $4, %eax, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xc4,0xc0,0x04]
+; X64-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x04]
; X64-NEXT: kmovw %k5, %eax # encoding: [0xc5,0xf8,0x93,0xc5]
-; X64-NEXT: vpinsrw $5, %eax, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xc4,0xc0,0x05]
+; X64-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x05]
; X64-NEXT: kmovw %k6, %eax # encoding: [0xc5,0xf8,0x93,0xc6]
-; X64-NEXT: vpinsrw $6, %eax, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xc4,0xc0,0x06]
+; X64-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x06]
; X64-NEXT: kmovw %k2, %eax # encoding: [0xc5,0xf8,0x93,0xc2]
-; X64-NEXT: vpinsrw $7, %eax, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xc4,0xc0,0x07]
+; X64-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x07]
; X64-NEXT: retq # encoding: [0xc3]
%res0 = call i8 @llvm.x86.avx512.mask.ucmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i32 0, i8 %mask)
%vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0
; CHECK-NEXT: vpcmpgtq %xmm1, %xmm0, %k5 # encoding: [0x62,0xf2,0xfd,0x08,0x37,0xe9]
; CHECK-NEXT: kmovw %k0, %eax # encoding: [0xc5,0xf8,0x93,0xc0]
; CHECK-NEXT: vpxor %xmm0, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xef,0xc0]
-; CHECK-NEXT: vpinsrw $0, %eax, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xc4,0xc0,0x00]
+; CHECK-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x00]
; CHECK-NEXT: kmovw %k1, %eax # encoding: [0xc5,0xf8,0x93,0xc1]
-; CHECK-NEXT: vpinsrw $1, %eax, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xc4,0xc0,0x01]
+; CHECK-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x01]
; CHECK-NEXT: kmovw %k2, %eax # encoding: [0xc5,0xf8,0x93,0xc2]
-; CHECK-NEXT: vpinsrw $2, %eax, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xc4,0xc0,0x02]
+; CHECK-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x02]
; CHECK-NEXT: kmovw %k3, %eax # encoding: [0xc5,0xf8,0x93,0xc3]
-; CHECK-NEXT: vpinsrw $4, %eax, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xc4,0xc0,0x04]
+; CHECK-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x04]
; CHECK-NEXT: kmovw %k4, %eax # encoding: [0xc5,0xf8,0x93,0xc4]
-; CHECK-NEXT: vpinsrw $5, %eax, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xc4,0xc0,0x05]
+; CHECK-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x05]
; CHECK-NEXT: kmovw %k5, %eax # encoding: [0xc5,0xf8,0x93,0xc5]
-; CHECK-NEXT: vpinsrw $6, %eax, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xc4,0xc0,0x06]
+; CHECK-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x06]
; CHECK-NEXT: movl $3, %eax # encoding: [0xb8,0x03,0x00,0x00,0x00]
-; CHECK-NEXT: vpinsrw $7, %eax, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xc4,0xc0,0x07]
+; CHECK-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x07]
; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3]
%res0 = call i8 @llvm.x86.avx512.mask.cmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i32 0, i8 -1)
%vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0
; X86-NEXT: kshiftrw $14, %k2, %k2 # encoding: [0xc4,0xe3,0xf9,0x30,0xd2,0x0e]
; X86-NEXT: kmovw %k0, %eax # encoding: [0xc5,0xf8,0x93,0xc0]
; X86-NEXT: vpxor %xmm0, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xef,0xc0]
-; X86-NEXT: vpinsrw $0, %eax, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xc4,0xc0,0x00]
+; X86-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x00]
; X86-NEXT: kmovw %k1, %eax # encoding: [0xc5,0xf8,0x93,0xc1]
-; X86-NEXT: vpinsrw $1, %eax, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xc4,0xc0,0x01]
+; X86-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x01]
; X86-NEXT: kmovw %k3, %eax # encoding: [0xc5,0xf8,0x93,0xc3]
-; X86-NEXT: vpinsrw $2, %eax, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xc4,0xc0,0x02]
+; X86-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x02]
; X86-NEXT: kmovw %k4, %eax # encoding: [0xc5,0xf8,0x93,0xc4]
-; X86-NEXT: vpinsrw $4, %eax, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xc4,0xc0,0x04]
+; X86-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x04]
; X86-NEXT: kmovw %k5, %eax # encoding: [0xc5,0xf8,0x93,0xc5]
-; X86-NEXT: vpinsrw $5, %eax, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xc4,0xc0,0x05]
+; X86-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x05]
; X86-NEXT: kmovw %k6, %eax # encoding: [0xc5,0xf8,0x93,0xc6]
-; X86-NEXT: vpinsrw $6, %eax, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xc4,0xc0,0x06]
+; X86-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x06]
; X86-NEXT: kmovw %k2, %eax # encoding: [0xc5,0xf8,0x93,0xc2]
-; X86-NEXT: vpinsrw $7, %eax, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xc4,0xc0,0x07]
+; X86-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x07]
; X86-NEXT: retl # encoding: [0xc3]
;
; X64-LABEL: test_mask_cmp_q_128:
; X64-NEXT: kshiftrw $14, %k2, %k2 # encoding: [0xc4,0xe3,0xf9,0x30,0xd2,0x0e]
; X64-NEXT: kmovw %k0, %eax # encoding: [0xc5,0xf8,0x93,0xc0]
; X64-NEXT: vpxor %xmm0, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xef,0xc0]
-; X64-NEXT: vpinsrw $0, %eax, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xc4,0xc0,0x00]
+; X64-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x00]
; X64-NEXT: kmovw %k1, %eax # encoding: [0xc5,0xf8,0x93,0xc1]
-; X64-NEXT: vpinsrw $1, %eax, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xc4,0xc0,0x01]
+; X64-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x01]
; X64-NEXT: kmovw %k3, %eax # encoding: [0xc5,0xf8,0x93,0xc3]
-; X64-NEXT: vpinsrw $2, %eax, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xc4,0xc0,0x02]
+; X64-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x02]
; X64-NEXT: kmovw %k4, %eax # encoding: [0xc5,0xf8,0x93,0xc4]
-; X64-NEXT: vpinsrw $4, %eax, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xc4,0xc0,0x04]
+; X64-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x04]
; X64-NEXT: kmovw %k5, %eax # encoding: [0xc5,0xf8,0x93,0xc5]
-; X64-NEXT: vpinsrw $5, %eax, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xc4,0xc0,0x05]
+; X64-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x05]
; X64-NEXT: kmovw %k6, %eax # encoding: [0xc5,0xf8,0x93,0xc6]
-; X64-NEXT: vpinsrw $6, %eax, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xc4,0xc0,0x06]
+; X64-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x06]
; X64-NEXT: kmovw %k2, %eax # encoding: [0xc5,0xf8,0x93,0xc2]
-; X64-NEXT: vpinsrw $7, %eax, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xc4,0xc0,0x07]
+; X64-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x07]
; X64-NEXT: retq # encoding: [0xc3]
%res0 = call i8 @llvm.x86.avx512.mask.cmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i32 0, i8 %mask)
%vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0
; CHECK-NEXT: vpcmpnleuq %xmm1, %xmm0, %k5 # encoding: [0x62,0xf3,0xfd,0x08,0x1e,0xe9,0x06]
; CHECK-NEXT: kmovw %k0, %eax # encoding: [0xc5,0xf8,0x93,0xc0]
; CHECK-NEXT: vpxor %xmm0, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xef,0xc0]
-; CHECK-NEXT: vpinsrw $0, %eax, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xc4,0xc0,0x00]
+; CHECK-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x00]
; CHECK-NEXT: kmovw %k1, %eax # encoding: [0xc5,0xf8,0x93,0xc1]
-; CHECK-NEXT: vpinsrw $1, %eax, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xc4,0xc0,0x01]
+; CHECK-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x01]
; CHECK-NEXT: kmovw %k2, %eax # encoding: [0xc5,0xf8,0x93,0xc2]
-; CHECK-NEXT: vpinsrw $2, %eax, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xc4,0xc0,0x02]
+; CHECK-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x02]
; CHECK-NEXT: kmovw %k3, %eax # encoding: [0xc5,0xf8,0x93,0xc3]
-; CHECK-NEXT: vpinsrw $4, %eax, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xc4,0xc0,0x04]
+; CHECK-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x04]
; CHECK-NEXT: kmovw %k4, %eax # encoding: [0xc5,0xf8,0x93,0xc4]
-; CHECK-NEXT: vpinsrw $5, %eax, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xc4,0xc0,0x05]
+; CHECK-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x05]
; CHECK-NEXT: kmovw %k5, %eax # encoding: [0xc5,0xf8,0x93,0xc5]
-; CHECK-NEXT: vpinsrw $6, %eax, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xc4,0xc0,0x06]
+; CHECK-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x06]
; CHECK-NEXT: movl $3, %eax # encoding: [0xb8,0x03,0x00,0x00,0x00]
-; CHECK-NEXT: vpinsrw $7, %eax, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xc4,0xc0,0x07]
+; CHECK-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x07]
; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3]
%res0 = call i8 @llvm.x86.avx512.mask.ucmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i32 0, i8 -1)
%vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0
; X86-NEXT: kshiftrw $14, %k2, %k2 # encoding: [0xc4,0xe3,0xf9,0x30,0xd2,0x0e]
; X86-NEXT: kmovw %k0, %eax # encoding: [0xc5,0xf8,0x93,0xc0]
; X86-NEXT: vpxor %xmm0, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xef,0xc0]
-; X86-NEXT: vpinsrw $0, %eax, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xc4,0xc0,0x00]
+; X86-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x00]
; X86-NEXT: kmovw %k1, %eax # encoding: [0xc5,0xf8,0x93,0xc1]
-; X86-NEXT: vpinsrw $1, %eax, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xc4,0xc0,0x01]
+; X86-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x01]
; X86-NEXT: kmovw %k3, %eax # encoding: [0xc5,0xf8,0x93,0xc3]
-; X86-NEXT: vpinsrw $2, %eax, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xc4,0xc0,0x02]
+; X86-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x02]
; X86-NEXT: kmovw %k4, %eax # encoding: [0xc5,0xf8,0x93,0xc4]
-; X86-NEXT: vpinsrw $4, %eax, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xc4,0xc0,0x04]
+; X86-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x04]
; X86-NEXT: kmovw %k5, %eax # encoding: [0xc5,0xf8,0x93,0xc5]
-; X86-NEXT: vpinsrw $5, %eax, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xc4,0xc0,0x05]
+; X86-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x05]
; X86-NEXT: kmovw %k6, %eax # encoding: [0xc5,0xf8,0x93,0xc6]
-; X86-NEXT: vpinsrw $6, %eax, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xc4,0xc0,0x06]
+; X86-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x06]
; X86-NEXT: kmovw %k2, %eax # encoding: [0xc5,0xf8,0x93,0xc2]
-; X86-NEXT: vpinsrw $7, %eax, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xc4,0xc0,0x07]
+; X86-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x07]
; X86-NEXT: retl # encoding: [0xc3]
;
; X64-LABEL: test_mask_ucmp_q_128:
; X64-NEXT: kshiftrw $14, %k2, %k2 # encoding: [0xc4,0xe3,0xf9,0x30,0xd2,0x0e]
; X64-NEXT: kmovw %k0, %eax # encoding: [0xc5,0xf8,0x93,0xc0]
; X64-NEXT: vpxor %xmm0, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xef,0xc0]
-; X64-NEXT: vpinsrw $0, %eax, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xc4,0xc0,0x00]
+; X64-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x00]
; X64-NEXT: kmovw %k1, %eax # encoding: [0xc5,0xf8,0x93,0xc1]
-; X64-NEXT: vpinsrw $1, %eax, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xc4,0xc0,0x01]
+; X64-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x01]
; X64-NEXT: kmovw %k3, %eax # encoding: [0xc5,0xf8,0x93,0xc3]
-; X64-NEXT: vpinsrw $2, %eax, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xc4,0xc0,0x02]
+; X64-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x02]
; X64-NEXT: kmovw %k4, %eax # encoding: [0xc5,0xf8,0x93,0xc4]
-; X64-NEXT: vpinsrw $4, %eax, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xc4,0xc0,0x04]
+; X64-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x04]
; X64-NEXT: kmovw %k5, %eax # encoding: [0xc5,0xf8,0x93,0xc5]
-; X64-NEXT: vpinsrw $5, %eax, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xc4,0xc0,0x05]
+; X64-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x05]
; X64-NEXT: kmovw %k6, %eax # encoding: [0xc5,0xf8,0x93,0xc6]
-; X64-NEXT: vpinsrw $6, %eax, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xc4,0xc0,0x06]
+; X64-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x06]
; X64-NEXT: kmovw %k2, %eax # encoding: [0xc5,0xf8,0x93,0xc2]
-; X64-NEXT: vpinsrw $7, %eax, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xc4,0xc0,0x07]
+; X64-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x07]
; X64-NEXT: retq # encoding: [0xc3]
%res0 = call i8 @llvm.x86.avx512.mask.ucmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i32 0, i8 %mask)
%vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0
}
define i2 @v2i8(<2 x i8> %a, <2 x i8> %b, <2 x i8> %c, <2 x i8> %d) {
-; SSE2-SSSE3-LABEL: v2i8:
-; SSE2-SSSE3: # %bb.0:
-; SSE2-SSSE3-NEXT: psllq $56, %xmm2
-; SSE2-SSSE3-NEXT: movdqa %xmm2, %xmm4
-; SSE2-SSSE3-NEXT: psrad $31, %xmm4
-; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,3,2,3]
-; SSE2-SSSE3-NEXT: psrad $24, %xmm2
-; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3]
-; SSE2-SSSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1]
-; SSE2-SSSE3-NEXT: psllq $56, %xmm3
-; SSE2-SSSE3-NEXT: movdqa %xmm3, %xmm4
-; SSE2-SSSE3-NEXT: psrad $31, %xmm4
-; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,3,2,3]
-; SSE2-SSSE3-NEXT: psrad $24, %xmm3
-; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,3,2,3]
-; SSE2-SSSE3-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
-; SSE2-SSSE3-NEXT: psllq $56, %xmm0
-; SSE2-SSSE3-NEXT: movdqa %xmm0, %xmm4
-; SSE2-SSSE3-NEXT: psrad $31, %xmm4
-; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,3,2,3]
-; SSE2-SSSE3-NEXT: psrad $24, %xmm0
-; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
-; SSE2-SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1]
-; SSE2-SSSE3-NEXT: psllq $56, %xmm1
-; SSE2-SSSE3-NEXT: movdqa %xmm1, %xmm4
-; SSE2-SSSE3-NEXT: psrad $31, %xmm4
-; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,3,2,3]
-; SSE2-SSSE3-NEXT: psrad $24, %xmm1
-; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3]
-; SSE2-SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1]
-; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [2147483648,2147483648]
-; SSE2-SSSE3-NEXT: pxor %xmm4, %xmm1
-; SSE2-SSSE3-NEXT: pxor %xmm4, %xmm0
-; SSE2-SSSE3-NEXT: movdqa %xmm0, %xmm5
-; SSE2-SSSE3-NEXT: pcmpeqd %xmm1, %xmm5
-; SSE2-SSSE3-NEXT: pcmpgtd %xmm1, %xmm0
-; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2]
-; SSE2-SSSE3-NEXT: pand %xmm5, %xmm1
-; SSE2-SSSE3-NEXT: por %xmm0, %xmm1
-; SSE2-SSSE3-NEXT: pxor %xmm4, %xmm3
-; SSE2-SSSE3-NEXT: pxor %xmm4, %xmm2
-; SSE2-SSSE3-NEXT: movdqa %xmm2, %xmm0
-; SSE2-SSSE3-NEXT: pcmpeqd %xmm3, %xmm0
-; SSE2-SSSE3-NEXT: pcmpgtd %xmm3, %xmm2
-; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,0,2,2]
-; SSE2-SSSE3-NEXT: pand %xmm0, %xmm3
-; SSE2-SSSE3-NEXT: por %xmm2, %xmm3
-; SSE2-SSSE3-NEXT: pand %xmm1, %xmm3
-; SSE2-SSSE3-NEXT: movmskpd %xmm3, %eax
-; SSE2-SSSE3-NEXT: # kill: def $al killed $al killed $eax
-; SSE2-SSSE3-NEXT: retq
-;
-; AVX1-LABEL: v2i8:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vpsllq $56, %xmm3, %xmm3
-; AVX1-NEXT: vpsrad $31, %xmm3, %xmm4
-; AVX1-NEXT: vpsrad $24, %xmm3, %xmm3
-; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3],xmm3[4,5],xmm4[6,7]
-; AVX1-NEXT: vpsllq $56, %xmm2, %xmm2
-; AVX1-NEXT: vpsrad $31, %xmm2, %xmm4
-; AVX1-NEXT: vpsrad $24, %xmm2, %xmm2
-; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,3],xmm2[4,5],xmm4[6,7]
-; AVX1-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm2
-; AVX1-NEXT: vpsllq $56, %xmm1, %xmm1
-; AVX1-NEXT: vpsrad $31, %xmm1, %xmm3
-; AVX1-NEXT: vpsrad $24, %xmm1, %xmm1
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7]
-; AVX1-NEXT: vpsllq $56, %xmm0, %xmm0
-; AVX1-NEXT: vpsrad $31, %xmm0, %xmm3
-; AVX1-NEXT: vpsrad $24, %xmm0, %xmm0
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm3[2,3],xmm0[4,5],xmm3[6,7]
-; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0
-; AVX1-NEXT: vmovmskpd %xmm0, %eax
-; AVX1-NEXT: # kill: def $al killed $al killed $eax
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: v2i8:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpsllq $56, %xmm3, %xmm3
-; AVX2-NEXT: vpsrad $31, %xmm3, %xmm4
-; AVX2-NEXT: vpsrad $24, %xmm3, %xmm3
-; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
-; AVX2-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0],xmm4[1],xmm3[2],xmm4[3]
-; AVX2-NEXT: vpsllq $56, %xmm2, %xmm2
-; AVX2-NEXT: vpsrad $31, %xmm2, %xmm4
-; AVX2-NEXT: vpsrad $24, %xmm2, %xmm2
-; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; AVX2-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0],xmm4[1],xmm2[2],xmm4[3]
-; AVX2-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm2
-; AVX2-NEXT: vpsllq $56, %xmm1, %xmm1
-; AVX2-NEXT: vpsrad $31, %xmm1, %xmm3
-; AVX2-NEXT: vpsrad $24, %xmm1, %xmm1
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm3[1],xmm1[2],xmm3[3]
-; AVX2-NEXT: vpsllq $56, %xmm0, %xmm0
-; AVX2-NEXT: vpsrad $31, %xmm0, %xmm3
-; AVX2-NEXT: vpsrad $24, %xmm0, %xmm0
-; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm3[1],xmm0[2],xmm3[3]
-; AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0
-; AVX2-NEXT: vmovmskpd %xmm0, %eax
-; AVX2-NEXT: # kill: def $al killed $al killed $eax
-; AVX2-NEXT: retq
+; SSE2-LABEL: v2i8:
+; SSE2: # %bb.0:
+; SSE2-NEXT: pcmpgtb %xmm1, %xmm0
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
+; SSE2-NEXT: pcmpgtb %xmm3, %xmm2
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1]
+; SSE2-NEXT: pand %xmm0, %xmm1
+; SSE2-NEXT: movmskpd %xmm1, %eax
+; SSE2-NEXT: # kill: def $al killed $al killed $eax
+; SSE2-NEXT: retq
+;
+; SSSE3-LABEL: v2i8:
+; SSSE3: # %bb.0:
+; SSSE3-NEXT: pcmpgtb %xmm1, %xmm0
+; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = <u,u,0,0,u,u,0,0,u,u,1,1,u,u,1,1>
+; SSSE3-NEXT: pshufb %xmm1, %xmm0
+; SSSE3-NEXT: pcmpgtb %xmm3, %xmm2
+; SSSE3-NEXT: pshufb %xmm1, %xmm2
+; SSSE3-NEXT: pand %xmm0, %xmm2
+; SSSE3-NEXT: movmskpd %xmm2, %eax
+; SSSE3-NEXT: # kill: def $al killed $al killed $eax
+; SSSE3-NEXT: retq
+;
+; AVX12-LABEL: v2i8:
+; AVX12: # %bb.0:
+; AVX12-NEXT: vpcmpgtb %xmm1, %xmm0, %xmm0
+; AVX12-NEXT: vpmovsxbq %xmm0, %xmm0
+; AVX12-NEXT: vpcmpgtb %xmm3, %xmm2, %xmm1
+; AVX12-NEXT: vpmovsxbq %xmm1, %xmm1
+; AVX12-NEXT: vpand %xmm1, %xmm0, %xmm0
+; AVX12-NEXT: vmovmskpd %xmm0, %eax
+; AVX12-NEXT: # kill: def $al killed $al killed $eax
+; AVX12-NEXT: retq
;
; AVX512F-LABEL: v2i8:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vpsllq $56, %xmm3, %xmm3
-; AVX512F-NEXT: vpsraq $56, %xmm3, %xmm3
-; AVX512F-NEXT: vpsllq $56, %xmm2, %xmm2
-; AVX512F-NEXT: vpsraq $56, %xmm2, %xmm2
-; AVX512F-NEXT: vpsllq $56, %xmm1, %xmm1
-; AVX512F-NEXT: vpsraq $56, %xmm1, %xmm1
-; AVX512F-NEXT: vpsllq $56, %xmm0, %xmm0
-; AVX512F-NEXT: vpsraq $56, %xmm0, %xmm0
-; AVX512F-NEXT: vpcmpgtq %xmm1, %xmm0, %k1
-; AVX512F-NEXT: vpcmpgtq %xmm3, %xmm2, %k0 {%k1}
+; AVX512F-NEXT: vpcmpgtb %xmm1, %xmm0, %xmm0
+; AVX512F-NEXT: vpmovsxbd %xmm0, %zmm0
+; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k0
+; AVX512F-NEXT: vpcmpgtb %xmm3, %xmm2, %xmm0
+; AVX512F-NEXT: vpmovsxbd %xmm0, %zmm0
+; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k1
+; AVX512F-NEXT: kandw %k1, %k0, %k0
; AVX512F-NEXT: kmovw %k0, %eax
; AVX512F-NEXT: # kill: def $al killed $al killed $eax
+; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: v2i8:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vpsllq $56, %xmm3, %xmm3
-; AVX512BW-NEXT: vpsraq $56, %xmm3, %xmm3
-; AVX512BW-NEXT: vpsllq $56, %xmm2, %xmm2
-; AVX512BW-NEXT: vpsraq $56, %xmm2, %xmm2
-; AVX512BW-NEXT: vpsllq $56, %xmm1, %xmm1
-; AVX512BW-NEXT: vpsraq $56, %xmm1, %xmm1
-; AVX512BW-NEXT: vpsllq $56, %xmm0, %xmm0
-; AVX512BW-NEXT: vpsraq $56, %xmm0, %xmm0
-; AVX512BW-NEXT: vpcmpgtq %xmm1, %xmm0, %k1
-; AVX512BW-NEXT: vpcmpgtq %xmm3, %xmm2, %k0 {%k1}
+; AVX512BW-NEXT: vpcmpgtb %xmm1, %xmm0, %k0
+; AVX512BW-NEXT: vpcmpgtb %xmm3, %xmm2, %k1
+; AVX512BW-NEXT: kandw %k1, %k0, %k0
; AVX512BW-NEXT: kmovd %k0, %eax
; AVX512BW-NEXT: # kill: def $al killed $al killed $eax
; AVX512BW-NEXT: retq
define i2 @v2i16(<2 x i16> %a, <2 x i16> %b, <2 x i16> %c, <2 x i16> %d) {
; SSE2-SSSE3-LABEL: v2i16:
; SSE2-SSSE3: # %bb.0:
-; SSE2-SSSE3-NEXT: psllq $48, %xmm2
-; SSE2-SSSE3-NEXT: movdqa %xmm2, %xmm4
-; SSE2-SSSE3-NEXT: psrad $31, %xmm4
-; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,3,2,3]
-; SSE2-SSSE3-NEXT: psrad $16, %xmm2
-; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3]
-; SSE2-SSSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1]
-; SSE2-SSSE3-NEXT: psllq $48, %xmm3
-; SSE2-SSSE3-NEXT: movdqa %xmm3, %xmm4
-; SSE2-SSSE3-NEXT: psrad $31, %xmm4
-; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,3,2,3]
-; SSE2-SSSE3-NEXT: psrad $16, %xmm3
-; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,3,2,3]
-; SSE2-SSSE3-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
-; SSE2-SSSE3-NEXT: psllq $48, %xmm0
-; SSE2-SSSE3-NEXT: movdqa %xmm0, %xmm4
-; SSE2-SSSE3-NEXT: psrad $31, %xmm4
-; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,3,2,3]
-; SSE2-SSSE3-NEXT: psrad $16, %xmm0
-; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
-; SSE2-SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1]
-; SSE2-SSSE3-NEXT: psllq $48, %xmm1
-; SSE2-SSSE3-NEXT: movdqa %xmm1, %xmm4
-; SSE2-SSSE3-NEXT: psrad $31, %xmm4
-; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,3,2,3]
-; SSE2-SSSE3-NEXT: psrad $16, %xmm1
-; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3]
-; SSE2-SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1]
-; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [2147483648,2147483648]
-; SSE2-SSSE3-NEXT: pxor %xmm4, %xmm1
-; SSE2-SSSE3-NEXT: pxor %xmm4, %xmm0
-; SSE2-SSSE3-NEXT: movdqa %xmm0, %xmm5
-; SSE2-SSSE3-NEXT: pcmpeqd %xmm1, %xmm5
-; SSE2-SSSE3-NEXT: pcmpgtd %xmm1, %xmm0
-; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2]
-; SSE2-SSSE3-NEXT: pand %xmm5, %xmm1
-; SSE2-SSSE3-NEXT: por %xmm0, %xmm1
-; SSE2-SSSE3-NEXT: pxor %xmm4, %xmm3
-; SSE2-SSSE3-NEXT: pxor %xmm4, %xmm2
-; SSE2-SSSE3-NEXT: movdqa %xmm2, %xmm0
-; SSE2-SSSE3-NEXT: pcmpeqd %xmm3, %xmm0
-; SSE2-SSSE3-NEXT: pcmpgtd %xmm3, %xmm2
-; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,0,2,2]
-; SSE2-SSSE3-NEXT: pand %xmm0, %xmm3
-; SSE2-SSSE3-NEXT: por %xmm2, %xmm3
-; SSE2-SSSE3-NEXT: pand %xmm1, %xmm3
-; SSE2-SSSE3-NEXT: movmskpd %xmm3, %eax
+; SSE2-SSSE3-NEXT: pcmpgtw %xmm1, %xmm0
+; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
+; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
+; SSE2-SSSE3-NEXT: pcmpgtw %xmm3, %xmm2
+; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
+; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1]
+; SSE2-SSSE3-NEXT: pand %xmm0, %xmm1
+; SSE2-SSSE3-NEXT: movmskpd %xmm1, %eax
; SSE2-SSSE3-NEXT: # kill: def $al killed $al killed $eax
; SSE2-SSSE3-NEXT: retq
;
-; AVX1-LABEL: v2i16:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vpsllq $48, %xmm3, %xmm3
-; AVX1-NEXT: vpsrad $31, %xmm3, %xmm4
-; AVX1-NEXT: vpsrad $16, %xmm3, %xmm3
-; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3],xmm3[4,5],xmm4[6,7]
-; AVX1-NEXT: vpsllq $48, %xmm2, %xmm2
-; AVX1-NEXT: vpsrad $31, %xmm2, %xmm4
-; AVX1-NEXT: vpsrad $16, %xmm2, %xmm2
-; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,3],xmm2[4,5],xmm4[6,7]
-; AVX1-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm2
-; AVX1-NEXT: vpsllq $48, %xmm1, %xmm1
-; AVX1-NEXT: vpsrad $31, %xmm1, %xmm3
-; AVX1-NEXT: vpsrad $16, %xmm1, %xmm1
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7]
-; AVX1-NEXT: vpsllq $48, %xmm0, %xmm0
-; AVX1-NEXT: vpsrad $31, %xmm0, %xmm3
-; AVX1-NEXT: vpsrad $16, %xmm0, %xmm0
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm3[2,3],xmm0[4,5],xmm3[6,7]
-; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0
-; AVX1-NEXT: vmovmskpd %xmm0, %eax
-; AVX1-NEXT: # kill: def $al killed $al killed $eax
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: v2i16:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpsllq $48, %xmm3, %xmm3
-; AVX2-NEXT: vpsrad $31, %xmm3, %xmm4
-; AVX2-NEXT: vpsrad $16, %xmm3, %xmm3
-; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
-; AVX2-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0],xmm4[1],xmm3[2],xmm4[3]
-; AVX2-NEXT: vpsllq $48, %xmm2, %xmm2
-; AVX2-NEXT: vpsrad $31, %xmm2, %xmm4
-; AVX2-NEXT: vpsrad $16, %xmm2, %xmm2
-; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; AVX2-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0],xmm4[1],xmm2[2],xmm4[3]
-; AVX2-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm2
-; AVX2-NEXT: vpsllq $48, %xmm1, %xmm1
-; AVX2-NEXT: vpsrad $31, %xmm1, %xmm3
-; AVX2-NEXT: vpsrad $16, %xmm1, %xmm1
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm3[1],xmm1[2],xmm3[3]
-; AVX2-NEXT: vpsllq $48, %xmm0, %xmm0
-; AVX2-NEXT: vpsrad $31, %xmm0, %xmm3
-; AVX2-NEXT: vpsrad $16, %xmm0, %xmm0
-; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm3[1],xmm0[2],xmm3[3]
-; AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0
-; AVX2-NEXT: vmovmskpd %xmm0, %eax
-; AVX2-NEXT: # kill: def $al killed $al killed $eax
-; AVX2-NEXT: retq
+; AVX12-LABEL: v2i16:
+; AVX12: # %bb.0:
+; AVX12-NEXT: vpcmpgtw %xmm1, %xmm0, %xmm0
+; AVX12-NEXT: vpmovsxwq %xmm0, %xmm0
+; AVX12-NEXT: vpcmpgtw %xmm3, %xmm2, %xmm1
+; AVX12-NEXT: vpmovsxwq %xmm1, %xmm1
+; AVX12-NEXT: vpand %xmm1, %xmm0, %xmm0
+; AVX12-NEXT: vmovmskpd %xmm0, %eax
+; AVX12-NEXT: # kill: def $al killed $al killed $eax
+; AVX12-NEXT: retq
;
; AVX512F-LABEL: v2i16:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vpsllq $48, %xmm3, %xmm3
-; AVX512F-NEXT: vpsraq $48, %xmm3, %xmm3
-; AVX512F-NEXT: vpsllq $48, %xmm2, %xmm2
-; AVX512F-NEXT: vpsraq $48, %xmm2, %xmm2
-; AVX512F-NEXT: vpsllq $48, %xmm1, %xmm1
-; AVX512F-NEXT: vpsraq $48, %xmm1, %xmm1
-; AVX512F-NEXT: vpsllq $48, %xmm0, %xmm0
-; AVX512F-NEXT: vpsraq $48, %xmm0, %xmm0
-; AVX512F-NEXT: vpcmpgtq %xmm1, %xmm0, %k1
-; AVX512F-NEXT: vpcmpgtq %xmm3, %xmm2, %k0 {%k1}
+; AVX512F-NEXT: vpcmpgtw %xmm1, %xmm0, %xmm0
+; AVX512F-NEXT: vpmovsxwd %xmm0, %ymm0
+; AVX512F-NEXT: vptestmd %ymm0, %ymm0, %k0
+; AVX512F-NEXT: vpcmpgtw %xmm3, %xmm2, %xmm0
+; AVX512F-NEXT: vpmovsxwd %xmm0, %ymm0
+; AVX512F-NEXT: vptestmd %ymm0, %ymm0, %k1
+; AVX512F-NEXT: kandw %k1, %k0, %k0
; AVX512F-NEXT: kmovw %k0, %eax
; AVX512F-NEXT: # kill: def $al killed $al killed $eax
+; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: v2i16:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vpsllq $48, %xmm3, %xmm3
-; AVX512BW-NEXT: vpsraq $48, %xmm3, %xmm3
-; AVX512BW-NEXT: vpsllq $48, %xmm2, %xmm2
-; AVX512BW-NEXT: vpsraq $48, %xmm2, %xmm2
-; AVX512BW-NEXT: vpsllq $48, %xmm1, %xmm1
-; AVX512BW-NEXT: vpsraq $48, %xmm1, %xmm1
-; AVX512BW-NEXT: vpsllq $48, %xmm0, %xmm0
-; AVX512BW-NEXT: vpsraq $48, %xmm0, %xmm0
-; AVX512BW-NEXT: vpcmpgtq %xmm1, %xmm0, %k1
-; AVX512BW-NEXT: vpcmpgtq %xmm3, %xmm2, %k0 {%k1}
+; AVX512BW-NEXT: vpcmpgtw %xmm1, %xmm0, %k0
+; AVX512BW-NEXT: vpcmpgtw %xmm3, %xmm2, %k1
+; AVX512BW-NEXT: kandw %k1, %k0, %k0
; AVX512BW-NEXT: kmovd %k0, %eax
; AVX512BW-NEXT: # kill: def $al killed $al killed $eax
; AVX512BW-NEXT: retq
define i2 @v2i32(<2 x i32> %a, <2 x i32> %b, <2 x i32> %c, <2 x i32> %d) {
; SSE2-SSSE3-LABEL: v2i32:
; SSE2-SSSE3: # %bb.0:
-; SSE2-SSSE3-NEXT: psllq $32, %xmm2
-; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm2[1,3,2,3]
-; SSE2-SSSE3-NEXT: psrad $31, %xmm2
-; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3]
-; SSE2-SSSE3-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1]
-; SSE2-SSSE3-NEXT: psllq $32, %xmm3
-; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,3,2,3]
-; SSE2-SSSE3-NEXT: psrad $31, %xmm3
-; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,3,2,3]
-; SSE2-SSSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
-; SSE2-SSSE3-NEXT: psllq $32, %xmm0
-; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,3,2,3]
-; SSE2-SSSE3-NEXT: psrad $31, %xmm0
-; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
-; SSE2-SSSE3-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1]
-; SSE2-SSSE3-NEXT: psllq $32, %xmm1
-; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,3,2,3]
-; SSE2-SSSE3-NEXT: psrad $31, %xmm1
-; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3]
-; SSE2-SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [2147483648,2147483648]
-; SSE2-SSSE3-NEXT: pxor %xmm1, %xmm0
-; SSE2-SSSE3-NEXT: pxor %xmm1, %xmm3
-; SSE2-SSSE3-NEXT: movdqa %xmm3, %xmm5
-; SSE2-SSSE3-NEXT: pcmpeqd %xmm0, %xmm5
-; SSE2-SSSE3-NEXT: pcmpgtd %xmm0, %xmm3
-; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,0,2,2]
-; SSE2-SSSE3-NEXT: pand %xmm5, %xmm0
-; SSE2-SSSE3-NEXT: por %xmm3, %xmm0
-; SSE2-SSSE3-NEXT: pxor %xmm1, %xmm2
-; SSE2-SSSE3-NEXT: pxor %xmm1, %xmm4
-; SSE2-SSSE3-NEXT: movdqa %xmm4, %xmm1
-; SSE2-SSSE3-NEXT: pcmpeqd %xmm2, %xmm1
-; SSE2-SSSE3-NEXT: pcmpgtd %xmm2, %xmm4
-; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm4[0,0,2,2]
-; SSE2-SSSE3-NEXT: pand %xmm1, %xmm2
-; SSE2-SSSE3-NEXT: por %xmm4, %xmm2
-; SSE2-SSSE3-NEXT: pand %xmm0, %xmm2
-; SSE2-SSSE3-NEXT: movmskpd %xmm2, %eax
+; SSE2-SSSE3-NEXT: pcmpgtd %xmm1, %xmm0
+; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
+; SSE2-SSSE3-NEXT: pcmpgtd %xmm3, %xmm2
+; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,0,1,1]
+; SSE2-SSSE3-NEXT: pand %xmm0, %xmm1
+; SSE2-SSSE3-NEXT: movmskpd %xmm1, %eax
; SSE2-SSSE3-NEXT: # kill: def $al killed $al killed $eax
; SSE2-SSSE3-NEXT: retq
;
-; AVX1-LABEL: v2i32:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vpsllq $32, %xmm3, %xmm4
-; AVX1-NEXT: vpsrad $31, %xmm4, %xmm4
-; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3],xmm3[4,5],xmm4[6,7]
-; AVX1-NEXT: vpsllq $32, %xmm2, %xmm4
-; AVX1-NEXT: vpsrad $31, %xmm4, %xmm4
-; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,3],xmm2[4,5],xmm4[6,7]
-; AVX1-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm2
-; AVX1-NEXT: vpsllq $32, %xmm1, %xmm3
-; AVX1-NEXT: vpsrad $31, %xmm3, %xmm3
-; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7]
-; AVX1-NEXT: vpsllq $32, %xmm0, %xmm3
-; AVX1-NEXT: vpsrad $31, %xmm3, %xmm3
-; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm3[2,3],xmm0[4,5],xmm3[6,7]
-; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0
-; AVX1-NEXT: vmovmskpd %xmm0, %eax
-; AVX1-NEXT: # kill: def $al killed $al killed $eax
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: v2i32:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpsllq $32, %xmm3, %xmm4
-; AVX2-NEXT: vpsrad $31, %xmm4, %xmm4
-; AVX2-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0],xmm4[1],xmm3[2],xmm4[3]
-; AVX2-NEXT: vpsllq $32, %xmm2, %xmm4
-; AVX2-NEXT: vpsrad $31, %xmm4, %xmm4
-; AVX2-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0],xmm4[1],xmm2[2],xmm4[3]
-; AVX2-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm2
-; AVX2-NEXT: vpsllq $32, %xmm1, %xmm3
-; AVX2-NEXT: vpsrad $31, %xmm3, %xmm3
-; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm3[1],xmm1[2],xmm3[3]
-; AVX2-NEXT: vpsllq $32, %xmm0, %xmm3
-; AVX2-NEXT: vpsrad $31, %xmm3, %xmm3
-; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm3[1],xmm0[2],xmm3[3]
-; AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0
-; AVX2-NEXT: vmovmskpd %xmm0, %eax
-; AVX2-NEXT: # kill: def $al killed $al killed $eax
-; AVX2-NEXT: retq
+; AVX12-LABEL: v2i32:
+; AVX12: # %bb.0:
+; AVX12-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0
+; AVX12-NEXT: vpmovsxdq %xmm0, %xmm0
+; AVX12-NEXT: vpcmpgtd %xmm3, %xmm2, %xmm1
+; AVX12-NEXT: vpmovsxdq %xmm1, %xmm1
+; AVX12-NEXT: vpand %xmm1, %xmm0, %xmm0
+; AVX12-NEXT: vmovmskpd %xmm0, %eax
+; AVX12-NEXT: # kill: def $al killed $al killed $eax
+; AVX12-NEXT: retq
;
; AVX512F-LABEL: v2i32:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vpsllq $32, %xmm3, %xmm3
-; AVX512F-NEXT: vpsraq $32, %xmm3, %xmm3
-; AVX512F-NEXT: vpsllq $32, %xmm2, %xmm2
-; AVX512F-NEXT: vpsraq $32, %xmm2, %xmm2
-; AVX512F-NEXT: vpsllq $32, %xmm1, %xmm1
-; AVX512F-NEXT: vpsraq $32, %xmm1, %xmm1
-; AVX512F-NEXT: vpsllq $32, %xmm0, %xmm0
-; AVX512F-NEXT: vpsraq $32, %xmm0, %xmm0
-; AVX512F-NEXT: vpcmpgtq %xmm1, %xmm0, %k1
-; AVX512F-NEXT: vpcmpgtq %xmm3, %xmm2, %k0 {%k1}
+; AVX512F-NEXT: vpcmpgtd %xmm1, %xmm0, %k0
+; AVX512F-NEXT: vpcmpgtd %xmm3, %xmm2, %k1
+; AVX512F-NEXT: kandw %k1, %k0, %k0
; AVX512F-NEXT: kmovw %k0, %eax
; AVX512F-NEXT: # kill: def $al killed $al killed $eax
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: v2i32:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vpsllq $32, %xmm3, %xmm3
-; AVX512BW-NEXT: vpsraq $32, %xmm3, %xmm3
-; AVX512BW-NEXT: vpsllq $32, %xmm2, %xmm2
-; AVX512BW-NEXT: vpsraq $32, %xmm2, %xmm2
-; AVX512BW-NEXT: vpsllq $32, %xmm1, %xmm1
-; AVX512BW-NEXT: vpsraq $32, %xmm1, %xmm1
-; AVX512BW-NEXT: vpsllq $32, %xmm0, %xmm0
-; AVX512BW-NEXT: vpsraq $32, %xmm0, %xmm0
-; AVX512BW-NEXT: vpcmpgtq %xmm1, %xmm0, %k1
-; AVX512BW-NEXT: vpcmpgtq %xmm3, %xmm2, %k0 {%k1}
+; AVX512BW-NEXT: vpcmpgtd %xmm1, %xmm0, %k0
+; AVX512BW-NEXT: vpcmpgtd %xmm3, %xmm2, %k1
+; AVX512BW-NEXT: kandw %k1, %k0, %k0
; AVX512BW-NEXT: kmovd %k0, %eax
; AVX512BW-NEXT: # kill: def $al killed $al killed $eax
; AVX512BW-NEXT: retq
define i4 @v4i8(<4 x i8> %a, <4 x i8> %b, <4 x i8> %c, <4 x i8> %d) {
; SSE2-SSSE3-LABEL: v4i8:
; SSE2-SSSE3: # %bb.0:
-; SSE2-SSSE3-NEXT: pslld $24, %xmm3
-; SSE2-SSSE3-NEXT: psrad $24, %xmm3
-; SSE2-SSSE3-NEXT: pslld $24, %xmm2
-; SSE2-SSSE3-NEXT: psrad $24, %xmm2
-; SSE2-SSSE3-NEXT: pcmpgtd %xmm3, %xmm2
-; SSE2-SSSE3-NEXT: pslld $24, %xmm1
-; SSE2-SSSE3-NEXT: psrad $24, %xmm1
-; SSE2-SSSE3-NEXT: pslld $24, %xmm0
-; SSE2-SSSE3-NEXT: psrad $24, %xmm0
-; SSE2-SSSE3-NEXT: pcmpgtd %xmm1, %xmm0
-; SSE2-SSSE3-NEXT: pand %xmm2, %xmm0
-; SSE2-SSSE3-NEXT: movmskps %xmm0, %eax
+; SSE2-SSSE3-NEXT: pcmpgtb %xmm1, %xmm0
+; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
+; SSE2-SSSE3-NEXT: pcmpgtb %xmm3, %xmm2
+; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
+; SSE2-SSSE3-NEXT: pand %xmm0, %xmm1
+; SSE2-SSSE3-NEXT: movmskps %xmm1, %eax
; SSE2-SSSE3-NEXT: # kill: def $al killed $al killed $eax
; SSE2-SSSE3-NEXT: retq
;
; AVX12-LABEL: v4i8:
; AVX12: # %bb.0:
-; AVX12-NEXT: vpslld $24, %xmm3, %xmm3
-; AVX12-NEXT: vpsrad $24, %xmm3, %xmm3
-; AVX12-NEXT: vpslld $24, %xmm2, %xmm2
-; AVX12-NEXT: vpsrad $24, %xmm2, %xmm2
-; AVX12-NEXT: vpcmpgtd %xmm3, %xmm2, %xmm2
-; AVX12-NEXT: vpslld $24, %xmm1, %xmm1
-; AVX12-NEXT: vpsrad $24, %xmm1, %xmm1
-; AVX12-NEXT: vpslld $24, %xmm0, %xmm0
-; AVX12-NEXT: vpsrad $24, %xmm0, %xmm0
-; AVX12-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0
-; AVX12-NEXT: vpand %xmm2, %xmm0, %xmm0
+; AVX12-NEXT: vpcmpgtb %xmm1, %xmm0, %xmm0
+; AVX12-NEXT: vpmovsxbd %xmm0, %xmm0
+; AVX12-NEXT: vpcmpgtb %xmm3, %xmm2, %xmm1
+; AVX12-NEXT: vpmovsxbd %xmm1, %xmm1
+; AVX12-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX12-NEXT: vmovmskps %xmm0, %eax
; AVX12-NEXT: # kill: def $al killed $al killed $eax
; AVX12-NEXT: retq
;
; AVX512F-LABEL: v4i8:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vpslld $24, %xmm3, %xmm3
-; AVX512F-NEXT: vpsrad $24, %xmm3, %xmm3
-; AVX512F-NEXT: vpslld $24, %xmm2, %xmm2
-; AVX512F-NEXT: vpsrad $24, %xmm2, %xmm2
-; AVX512F-NEXT: vpslld $24, %xmm1, %xmm1
-; AVX512F-NEXT: vpsrad $24, %xmm1, %xmm1
-; AVX512F-NEXT: vpslld $24, %xmm0, %xmm0
-; AVX512F-NEXT: vpsrad $24, %xmm0, %xmm0
-; AVX512F-NEXT: vpcmpgtd %xmm1, %xmm0, %k1
-; AVX512F-NEXT: vpcmpgtd %xmm3, %xmm2, %k0 {%k1}
+; AVX512F-NEXT: vpcmpgtb %xmm1, %xmm0, %xmm0
+; AVX512F-NEXT: vpmovsxbd %xmm0, %zmm0
+; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k0
+; AVX512F-NEXT: vpcmpgtb %xmm3, %xmm2, %xmm0
+; AVX512F-NEXT: vpmovsxbd %xmm0, %zmm0
+; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k1
+; AVX512F-NEXT: kandw %k1, %k0, %k0
; AVX512F-NEXT: kmovw %k0, %eax
; AVX512F-NEXT: # kill: def $al killed $al killed $eax
+; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: v4i8:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vpslld $24, %xmm3, %xmm3
-; AVX512BW-NEXT: vpsrad $24, %xmm3, %xmm3
-; AVX512BW-NEXT: vpslld $24, %xmm2, %xmm2
-; AVX512BW-NEXT: vpsrad $24, %xmm2, %xmm2
-; AVX512BW-NEXT: vpslld $24, %xmm1, %xmm1
-; AVX512BW-NEXT: vpsrad $24, %xmm1, %xmm1
-; AVX512BW-NEXT: vpslld $24, %xmm0, %xmm0
-; AVX512BW-NEXT: vpsrad $24, %xmm0, %xmm0
-; AVX512BW-NEXT: vpcmpgtd %xmm1, %xmm0, %k1
-; AVX512BW-NEXT: vpcmpgtd %xmm3, %xmm2, %k0 {%k1}
+; AVX512BW-NEXT: vpcmpgtb %xmm1, %xmm0, %k0
+; AVX512BW-NEXT: vpcmpgtb %xmm3, %xmm2, %k1
+; AVX512BW-NEXT: kandw %k1, %k0, %k0
; AVX512BW-NEXT: kmovd %k0, %eax
; AVX512BW-NEXT: # kill: def $al killed $al killed $eax
; AVX512BW-NEXT: retq
define i4 @v4i16(<4 x i16> %a, <4 x i16> %b, <4 x i16> %c, <4 x i16> %d) {
; SSE2-SSSE3-LABEL: v4i16:
; SSE2-SSSE3: # %bb.0:
-; SSE2-SSSE3-NEXT: pslld $16, %xmm3
-; SSE2-SSSE3-NEXT: psrad $16, %xmm3
-; SSE2-SSSE3-NEXT: pslld $16, %xmm2
-; SSE2-SSSE3-NEXT: psrad $16, %xmm2
-; SSE2-SSSE3-NEXT: pcmpgtd %xmm3, %xmm2
-; SSE2-SSSE3-NEXT: pslld $16, %xmm1
-; SSE2-SSSE3-NEXT: psrad $16, %xmm1
-; SSE2-SSSE3-NEXT: pslld $16, %xmm0
-; SSE2-SSSE3-NEXT: psrad $16, %xmm0
-; SSE2-SSSE3-NEXT: pcmpgtd %xmm1, %xmm0
-; SSE2-SSSE3-NEXT: pand %xmm2, %xmm0
-; SSE2-SSSE3-NEXT: movmskps %xmm0, %eax
+; SSE2-SSSE3-NEXT: pcmpgtw %xmm1, %xmm0
+; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
+; SSE2-SSSE3-NEXT: pcmpgtw %xmm3, %xmm2
+; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
+; SSE2-SSSE3-NEXT: pand %xmm0, %xmm1
+; SSE2-SSSE3-NEXT: movmskps %xmm1, %eax
; SSE2-SSSE3-NEXT: # kill: def $al killed $al killed $eax
; SSE2-SSSE3-NEXT: retq
;
; AVX12-LABEL: v4i16:
; AVX12: # %bb.0:
-; AVX12-NEXT: vpslld $16, %xmm3, %xmm3
-; AVX12-NEXT: vpsrad $16, %xmm3, %xmm3
-; AVX12-NEXT: vpslld $16, %xmm2, %xmm2
-; AVX12-NEXT: vpsrad $16, %xmm2, %xmm2
-; AVX12-NEXT: vpcmpgtd %xmm3, %xmm2, %xmm2
-; AVX12-NEXT: vpslld $16, %xmm1, %xmm1
-; AVX12-NEXT: vpsrad $16, %xmm1, %xmm1
-; AVX12-NEXT: vpslld $16, %xmm0, %xmm0
-; AVX12-NEXT: vpsrad $16, %xmm0, %xmm0
-; AVX12-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0
-; AVX12-NEXT: vpand %xmm2, %xmm0, %xmm0
+; AVX12-NEXT: vpcmpgtw %xmm1, %xmm0, %xmm0
+; AVX12-NEXT: vpmovsxwd %xmm0, %xmm0
+; AVX12-NEXT: vpcmpgtw %xmm3, %xmm2, %xmm1
+; AVX12-NEXT: vpmovsxwd %xmm1, %xmm1
+; AVX12-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX12-NEXT: vmovmskps %xmm0, %eax
; AVX12-NEXT: # kill: def $al killed $al killed $eax
; AVX12-NEXT: retq
;
; AVX512F-LABEL: v4i16:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vpslld $16, %xmm3, %xmm3
-; AVX512F-NEXT: vpsrad $16, %xmm3, %xmm3
-; AVX512F-NEXT: vpslld $16, %xmm2, %xmm2
-; AVX512F-NEXT: vpsrad $16, %xmm2, %xmm2
-; AVX512F-NEXT: vpslld $16, %xmm1, %xmm1
-; AVX512F-NEXT: vpsrad $16, %xmm1, %xmm1
-; AVX512F-NEXT: vpslld $16, %xmm0, %xmm0
-; AVX512F-NEXT: vpsrad $16, %xmm0, %xmm0
-; AVX512F-NEXT: vpcmpgtd %xmm1, %xmm0, %k1
-; AVX512F-NEXT: vpcmpgtd %xmm3, %xmm2, %k0 {%k1}
+; AVX512F-NEXT: vpcmpgtw %xmm1, %xmm0, %xmm0
+; AVX512F-NEXT: vpmovsxwd %xmm0, %ymm0
+; AVX512F-NEXT: vptestmd %ymm0, %ymm0, %k0
+; AVX512F-NEXT: vpcmpgtw %xmm3, %xmm2, %xmm0
+; AVX512F-NEXT: vpmovsxwd %xmm0, %ymm0
+; AVX512F-NEXT: vptestmd %ymm0, %ymm0, %k1
+; AVX512F-NEXT: kandw %k1, %k0, %k0
; AVX512F-NEXT: kmovw %k0, %eax
; AVX512F-NEXT: # kill: def $al killed $al killed $eax
+; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: v4i16:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vpslld $16, %xmm3, %xmm3
-; AVX512BW-NEXT: vpsrad $16, %xmm3, %xmm3
-; AVX512BW-NEXT: vpslld $16, %xmm2, %xmm2
-; AVX512BW-NEXT: vpsrad $16, %xmm2, %xmm2
-; AVX512BW-NEXT: vpslld $16, %xmm1, %xmm1
-; AVX512BW-NEXT: vpsrad $16, %xmm1, %xmm1
-; AVX512BW-NEXT: vpslld $16, %xmm0, %xmm0
-; AVX512BW-NEXT: vpsrad $16, %xmm0, %xmm0
-; AVX512BW-NEXT: vpcmpgtd %xmm1, %xmm0, %k1
-; AVX512BW-NEXT: vpcmpgtd %xmm3, %xmm2, %k0 {%k1}
+; AVX512BW-NEXT: vpcmpgtw %xmm1, %xmm0, %k0
+; AVX512BW-NEXT: vpcmpgtw %xmm3, %xmm2, %k1
+; AVX512BW-NEXT: kandw %k1, %k0, %k0
; AVX512BW-NEXT: kmovd %k0, %eax
; AVX512BW-NEXT: # kill: def $al killed $al killed $eax
; AVX512BW-NEXT: retq
define i8 @v8i8(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c, <8 x i8> %d) {
; SSE2-SSSE3-LABEL: v8i8:
; SSE2-SSSE3: # %bb.0:
-; SSE2-SSSE3-NEXT: psllw $8, %xmm3
-; SSE2-SSSE3-NEXT: psraw $8, %xmm3
-; SSE2-SSSE3-NEXT: psllw $8, %xmm2
-; SSE2-SSSE3-NEXT: psraw $8, %xmm2
-; SSE2-SSSE3-NEXT: pcmpgtw %xmm3, %xmm2
-; SSE2-SSSE3-NEXT: psllw $8, %xmm1
-; SSE2-SSSE3-NEXT: psraw $8, %xmm1
-; SSE2-SSSE3-NEXT: psllw $8, %xmm0
-; SSE2-SSSE3-NEXT: psraw $8, %xmm0
-; SSE2-SSSE3-NEXT: pcmpgtw %xmm1, %xmm0
-; SSE2-SSSE3-NEXT: pand %xmm2, %xmm0
-; SSE2-SSSE3-NEXT: packsswb %xmm0, %xmm0
-; SSE2-SSSE3-NEXT: pmovmskb %xmm0, %eax
+; SSE2-SSSE3-NEXT: pcmpgtb %xmm1, %xmm0
+; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE2-SSSE3-NEXT: pcmpgtb %xmm3, %xmm2
+; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
+; SSE2-SSSE3-NEXT: pand %xmm0, %xmm1
+; SSE2-SSSE3-NEXT: packsswb %xmm0, %xmm1
+; SSE2-SSSE3-NEXT: pmovmskb %xmm1, %eax
; SSE2-SSSE3-NEXT: # kill: def $al killed $al killed $eax
; SSE2-SSSE3-NEXT: retq
;
; AVX12-LABEL: v8i8:
; AVX12: # %bb.0:
-; AVX12-NEXT: vpsllw $8, %xmm3, %xmm3
-; AVX12-NEXT: vpsraw $8, %xmm3, %xmm3
-; AVX12-NEXT: vpsllw $8, %xmm2, %xmm2
-; AVX12-NEXT: vpsraw $8, %xmm2, %xmm2
-; AVX12-NEXT: vpcmpgtw %xmm3, %xmm2, %xmm2
-; AVX12-NEXT: vpsllw $8, %xmm1, %xmm1
-; AVX12-NEXT: vpsraw $8, %xmm1, %xmm1
-; AVX12-NEXT: vpsllw $8, %xmm0, %xmm0
-; AVX12-NEXT: vpsraw $8, %xmm0, %xmm0
-; AVX12-NEXT: vpcmpgtw %xmm1, %xmm0, %xmm0
-; AVX12-NEXT: vpand %xmm2, %xmm0, %xmm0
+; AVX12-NEXT: vpcmpgtb %xmm1, %xmm0, %xmm0
+; AVX12-NEXT: vpmovsxbw %xmm0, %xmm0
+; AVX12-NEXT: vpcmpgtb %xmm3, %xmm2, %xmm1
+; AVX12-NEXT: vpmovsxbw %xmm1, %xmm1
+; AVX12-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX12-NEXT: vpacksswb %xmm0, %xmm0, %xmm0
; AVX12-NEXT: vpmovmskb %xmm0, %eax
; AVX12-NEXT: # kill: def $al killed $al killed $eax
;
; AVX512F-LABEL: v8i8:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vpsllw $8, %xmm3, %xmm3
-; AVX512F-NEXT: vpsraw $8, %xmm3, %xmm3
-; AVX512F-NEXT: vpsllw $8, %xmm2, %xmm2
-; AVX512F-NEXT: vpsraw $8, %xmm2, %xmm2
-; AVX512F-NEXT: vpcmpgtw %xmm3, %xmm2, %xmm2
-; AVX512F-NEXT: vpsllw $8, %xmm1, %xmm1
-; AVX512F-NEXT: vpsraw $8, %xmm1, %xmm1
-; AVX512F-NEXT: vpsllw $8, %xmm0, %xmm0
-; AVX512F-NEXT: vpsraw $8, %xmm0, %xmm0
-; AVX512F-NEXT: vpcmpgtw %xmm1, %xmm0, %xmm0
-; AVX512F-NEXT: vpand %xmm2, %xmm0, %xmm0
-; AVX512F-NEXT: vpmovsxwd %xmm0, %ymm0
-; AVX512F-NEXT: vptestmd %ymm0, %ymm0, %k0
+; AVX512F-NEXT: vpcmpgtb %xmm1, %xmm0, %xmm0
+; AVX512F-NEXT: vpmovsxbd %xmm0, %zmm0
+; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k0
+; AVX512F-NEXT: vpcmpgtb %xmm3, %xmm2, %xmm0
+; AVX512F-NEXT: vpmovsxbd %xmm0, %zmm0
+; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k1
+; AVX512F-NEXT: kandw %k1, %k0, %k0
; AVX512F-NEXT: kmovw %k0, %eax
; AVX512F-NEXT: # kill: def $al killed $al killed $eax
; AVX512F-NEXT: vzeroupper
;
; AVX512BW-LABEL: v8i8:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vpsllw $8, %xmm3, %xmm3
-; AVX512BW-NEXT: vpsraw $8, %xmm3, %xmm3
-; AVX512BW-NEXT: vpsllw $8, %xmm2, %xmm2
-; AVX512BW-NEXT: vpsraw $8, %xmm2, %xmm2
-; AVX512BW-NEXT: vpsllw $8, %xmm1, %xmm1
-; AVX512BW-NEXT: vpsraw $8, %xmm1, %xmm1
-; AVX512BW-NEXT: vpsllw $8, %xmm0, %xmm0
-; AVX512BW-NEXT: vpsraw $8, %xmm0, %xmm0
-; AVX512BW-NEXT: vpcmpgtw %xmm1, %xmm0, %k1
-; AVX512BW-NEXT: vpcmpgtw %xmm3, %xmm2, %k0 {%k1}
+; AVX512BW-NEXT: vpcmpgtb %xmm1, %xmm0, %k0
+; AVX512BW-NEXT: vpcmpgtb %xmm3, %xmm2, %k1
+; AVX512BW-NEXT: kandw %k1, %k0, %k0
; AVX512BW-NEXT: kmovd %k0, %eax
; AVX512BW-NEXT: # kill: def $al killed $al killed $eax
; AVX512BW-NEXT: retq
}
define i2 @v2i8(<2 x i8> %a, <2 x i8> %b) {
-; SSE2-SSSE3-LABEL: v2i8:
-; SSE2-SSSE3: # %bb.0:
-; SSE2-SSSE3-NEXT: psllq $56, %xmm0
-; SSE2-SSSE3-NEXT: movdqa %xmm0, %xmm2
-; SSE2-SSSE3-NEXT: psrad $31, %xmm2
-; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3]
-; SSE2-SSSE3-NEXT: psrad $24, %xmm0
-; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
-; SSE2-SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
-; SSE2-SSSE3-NEXT: psllq $56, %xmm1
-; SSE2-SSSE3-NEXT: movdqa %xmm1, %xmm2
-; SSE2-SSSE3-NEXT: psrad $31, %xmm2
-; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3]
-; SSE2-SSSE3-NEXT: psrad $24, %xmm1
-; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3]
-; SSE2-SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
-; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648]
-; SSE2-SSSE3-NEXT: pxor %xmm2, %xmm1
-; SSE2-SSSE3-NEXT: pxor %xmm2, %xmm0
-; SSE2-SSSE3-NEXT: movdqa %xmm0, %xmm2
-; SSE2-SSSE3-NEXT: pcmpeqd %xmm1, %xmm2
-; SSE2-SSSE3-NEXT: pcmpgtd %xmm1, %xmm0
-; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2]
-; SSE2-SSSE3-NEXT: pand %xmm2, %xmm1
-; SSE2-SSSE3-NEXT: por %xmm0, %xmm1
-; SSE2-SSSE3-NEXT: movmskpd %xmm1, %eax
-; SSE2-SSSE3-NEXT: # kill: def $al killed $al killed $eax
-; SSE2-SSSE3-NEXT: retq
-;
-; AVX1-LABEL: v2i8:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vpsllq $56, %xmm1, %xmm1
-; AVX1-NEXT: vpsrad $31, %xmm1, %xmm2
-; AVX1-NEXT: vpsrad $24, %xmm1, %xmm1
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
-; AVX1-NEXT: vpsllq $56, %xmm0, %xmm0
-; AVX1-NEXT: vpsrad $31, %xmm0, %xmm2
-; AVX1-NEXT: vpsrad $24, %xmm0, %xmm0
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
-; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vmovmskpd %xmm0, %eax
-; AVX1-NEXT: # kill: def $al killed $al killed $eax
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: v2i8:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpsllq $56, %xmm1, %xmm1
-; AVX2-NEXT: vpsrad $31, %xmm1, %xmm2
-; AVX2-NEXT: vpsrad $24, %xmm1, %xmm1
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
-; AVX2-NEXT: vpsllq $56, %xmm0, %xmm0
-; AVX2-NEXT: vpsrad $31, %xmm0, %xmm2
-; AVX2-NEXT: vpsrad $24, %xmm0, %xmm0
-; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
-; AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vmovmskpd %xmm0, %eax
-; AVX2-NEXT: # kill: def $al killed $al killed $eax
-; AVX2-NEXT: retq
+; SSE2-LABEL: v2i8:
+; SSE2: # %bb.0:
+; SSE2-NEXT: pcmpgtb %xmm1, %xmm0
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
+; SSE2-NEXT: movmskpd %xmm0, %eax
+; SSE2-NEXT: # kill: def $al killed $al killed $eax
+; SSE2-NEXT: retq
+;
+; SSSE3-LABEL: v2i8:
+; SSSE3: # %bb.0:
+; SSSE3-NEXT: pcmpgtb %xmm1, %xmm0
+; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[u,u,0,0,u,u,0,0,u,u,1,1,u,u,1,1]
+; SSSE3-NEXT: movmskpd %xmm0, %eax
+; SSSE3-NEXT: # kill: def $al killed $al killed $eax
+; SSSE3-NEXT: retq
+;
+; AVX12-LABEL: v2i8:
+; AVX12: # %bb.0:
+; AVX12-NEXT: vpcmpgtb %xmm1, %xmm0, %xmm0
+; AVX12-NEXT: vpmovsxbq %xmm0, %xmm0
+; AVX12-NEXT: vmovmskpd %xmm0, %eax
+; AVX12-NEXT: # kill: def $al killed $al killed $eax
+; AVX12-NEXT: retq
;
; AVX512F-LABEL: v2i8:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vpsllq $56, %xmm1, %xmm1
-; AVX512F-NEXT: vpsraq $56, %xmm1, %xmm1
-; AVX512F-NEXT: vpsllq $56, %xmm0, %xmm0
-; AVX512F-NEXT: vpsraq $56, %xmm0, %xmm0
-; AVX512F-NEXT: vpcmpgtq %xmm1, %xmm0, %k0
+; AVX512F-NEXT: vpcmpgtb %xmm1, %xmm0, %xmm0
+; AVX512F-NEXT: vpmovsxbd %xmm0, %zmm0
+; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k0
; AVX512F-NEXT: kmovw %k0, %eax
; AVX512F-NEXT: # kill: def $al killed $al killed $eax
+; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: v2i8:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vpsllq $56, %xmm1, %xmm1
-; AVX512BW-NEXT: vpsraq $56, %xmm1, %xmm1
-; AVX512BW-NEXT: vpsllq $56, %xmm0, %xmm0
-; AVX512BW-NEXT: vpsraq $56, %xmm0, %xmm0
-; AVX512BW-NEXT: vpcmpgtq %xmm1, %xmm0, %k0
+; AVX512BW-NEXT: vpcmpgtb %xmm1, %xmm0, %k0
; AVX512BW-NEXT: kmovd %k0, %eax
; AVX512BW-NEXT: # kill: def $al killed $al killed $eax
; AVX512BW-NEXT: retq
define i2 @v2i16(<2 x i16> %a, <2 x i16> %b) {
; SSE2-SSSE3-LABEL: v2i16:
; SSE2-SSSE3: # %bb.0:
-; SSE2-SSSE3-NEXT: psllq $48, %xmm0
-; SSE2-SSSE3-NEXT: movdqa %xmm0, %xmm2
-; SSE2-SSSE3-NEXT: psrad $31, %xmm2
-; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3]
-; SSE2-SSSE3-NEXT: psrad $16, %xmm0
-; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
-; SSE2-SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
-; SSE2-SSSE3-NEXT: psllq $48, %xmm1
-; SSE2-SSSE3-NEXT: movdqa %xmm1, %xmm2
-; SSE2-SSSE3-NEXT: psrad $31, %xmm2
-; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3]
-; SSE2-SSSE3-NEXT: psrad $16, %xmm1
-; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3]
-; SSE2-SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
-; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648]
-; SSE2-SSSE3-NEXT: pxor %xmm2, %xmm1
-; SSE2-SSSE3-NEXT: pxor %xmm2, %xmm0
-; SSE2-SSSE3-NEXT: movdqa %xmm0, %xmm2
-; SSE2-SSSE3-NEXT: pcmpeqd %xmm1, %xmm2
-; SSE2-SSSE3-NEXT: pcmpgtd %xmm1, %xmm0
-; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2]
-; SSE2-SSSE3-NEXT: pand %xmm2, %xmm1
-; SSE2-SSSE3-NEXT: por %xmm0, %xmm1
-; SSE2-SSSE3-NEXT: movmskpd %xmm1, %eax
+; SSE2-SSSE3-NEXT: pcmpgtw %xmm1, %xmm0
+; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
+; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
+; SSE2-SSSE3-NEXT: movmskpd %xmm0, %eax
; SSE2-SSSE3-NEXT: # kill: def $al killed $al killed $eax
; SSE2-SSSE3-NEXT: retq
;
-; AVX1-LABEL: v2i16:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vpsllq $48, %xmm1, %xmm1
-; AVX1-NEXT: vpsrad $31, %xmm1, %xmm2
-; AVX1-NEXT: vpsrad $16, %xmm1, %xmm1
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
-; AVX1-NEXT: vpsllq $48, %xmm0, %xmm0
-; AVX1-NEXT: vpsrad $31, %xmm0, %xmm2
-; AVX1-NEXT: vpsrad $16, %xmm0, %xmm0
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
-; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vmovmskpd %xmm0, %eax
-; AVX1-NEXT: # kill: def $al killed $al killed $eax
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: v2i16:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpsllq $48, %xmm1, %xmm1
-; AVX2-NEXT: vpsrad $31, %xmm1, %xmm2
-; AVX2-NEXT: vpsrad $16, %xmm1, %xmm1
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
-; AVX2-NEXT: vpsllq $48, %xmm0, %xmm0
-; AVX2-NEXT: vpsrad $31, %xmm0, %xmm2
-; AVX2-NEXT: vpsrad $16, %xmm0, %xmm0
-; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
-; AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vmovmskpd %xmm0, %eax
-; AVX2-NEXT: # kill: def $al killed $al killed $eax
-; AVX2-NEXT: retq
+; AVX12-LABEL: v2i16:
+; AVX12: # %bb.0:
+; AVX12-NEXT: vpcmpgtw %xmm1, %xmm0, %xmm0
+; AVX12-NEXT: vpmovsxwq %xmm0, %xmm0
+; AVX12-NEXT: vmovmskpd %xmm0, %eax
+; AVX12-NEXT: # kill: def $al killed $al killed $eax
+; AVX12-NEXT: retq
;
; AVX512F-LABEL: v2i16:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vpsllq $48, %xmm1, %xmm1
-; AVX512F-NEXT: vpsraq $48, %xmm1, %xmm1
-; AVX512F-NEXT: vpsllq $48, %xmm0, %xmm0
-; AVX512F-NEXT: vpsraq $48, %xmm0, %xmm0
-; AVX512F-NEXT: vpcmpgtq %xmm1, %xmm0, %k0
+; AVX512F-NEXT: vpcmpgtw %xmm1, %xmm0, %xmm0
+; AVX512F-NEXT: vpmovsxwd %xmm0, %ymm0
+; AVX512F-NEXT: vptestmd %ymm0, %ymm0, %k0
; AVX512F-NEXT: kmovw %k0, %eax
; AVX512F-NEXT: # kill: def $al killed $al killed $eax
+; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: v2i16:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vpsllq $48, %xmm1, %xmm1
-; AVX512BW-NEXT: vpsraq $48, %xmm1, %xmm1
-; AVX512BW-NEXT: vpsllq $48, %xmm0, %xmm0
-; AVX512BW-NEXT: vpsraq $48, %xmm0, %xmm0
-; AVX512BW-NEXT: vpcmpgtq %xmm1, %xmm0, %k0
+; AVX512BW-NEXT: vpcmpgtw %xmm1, %xmm0, %k0
; AVX512BW-NEXT: kmovd %k0, %eax
; AVX512BW-NEXT: # kill: def $al killed $al killed $eax
; AVX512BW-NEXT: retq
define i2 @v2i32(<2 x i32> %a, <2 x i32> %b) {
; SSE2-SSSE3-LABEL: v2i32:
; SSE2-SSSE3: # %bb.0:
-; SSE2-SSSE3-NEXT: psllq $32, %xmm0
-; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,3,2,3]
-; SSE2-SSSE3-NEXT: psrad $31, %xmm0
-; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
-; SSE2-SSSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
-; SSE2-SSSE3-NEXT: psllq $32, %xmm1
-; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,3,2,3]
-; SSE2-SSSE3-NEXT: psrad $31, %xmm1
-; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3]
-; SSE2-SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [2147483648,2147483648]
-; SSE2-SSSE3-NEXT: pxor %xmm1, %xmm0
-; SSE2-SSSE3-NEXT: pxor %xmm1, %xmm2
-; SSE2-SSSE3-NEXT: movdqa %xmm2, %xmm1
-; SSE2-SSSE3-NEXT: pcmpeqd %xmm0, %xmm1
-; SSE2-SSSE3-NEXT: pcmpgtd %xmm0, %xmm2
-; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,0,2,2]
-; SSE2-SSSE3-NEXT: pand %xmm1, %xmm0
-; SSE2-SSSE3-NEXT: por %xmm2, %xmm0
+; SSE2-SSSE3-NEXT: pcmpgtd %xmm1, %xmm0
+; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
; SSE2-SSSE3-NEXT: movmskpd %xmm0, %eax
; SSE2-SSSE3-NEXT: # kill: def $al killed $al killed $eax
; SSE2-SSSE3-NEXT: retq
;
-; AVX1-LABEL: v2i32:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vpsllq $32, %xmm1, %xmm2
-; AVX1-NEXT: vpsrad $31, %xmm2, %xmm2
-; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
-; AVX1-NEXT: vpsllq $32, %xmm0, %xmm2
-; AVX1-NEXT: vpsrad $31, %xmm2, %xmm2
-; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
-; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vmovmskpd %xmm0, %eax
-; AVX1-NEXT: # kill: def $al killed $al killed $eax
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: v2i32:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpsllq $32, %xmm1, %xmm2
-; AVX2-NEXT: vpsrad $31, %xmm2, %xmm2
-; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
-; AVX2-NEXT: vpsllq $32, %xmm0, %xmm2
-; AVX2-NEXT: vpsrad $31, %xmm2, %xmm2
-; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
-; AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vmovmskpd %xmm0, %eax
-; AVX2-NEXT: # kill: def $al killed $al killed $eax
-; AVX2-NEXT: retq
+; AVX12-LABEL: v2i32:
+; AVX12: # %bb.0:
+; AVX12-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0
+; AVX12-NEXT: vpmovsxdq %xmm0, %xmm0
+; AVX12-NEXT: vmovmskpd %xmm0, %eax
+; AVX12-NEXT: # kill: def $al killed $al killed $eax
+; AVX12-NEXT: retq
;
; AVX512F-LABEL: v2i32:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vpsllq $32, %xmm1, %xmm1
-; AVX512F-NEXT: vpsraq $32, %xmm1, %xmm1
-; AVX512F-NEXT: vpsllq $32, %xmm0, %xmm0
-; AVX512F-NEXT: vpsraq $32, %xmm0, %xmm0
-; AVX512F-NEXT: vpcmpgtq %xmm1, %xmm0, %k0
+; AVX512F-NEXT: vpcmpgtd %xmm1, %xmm0, %k0
; AVX512F-NEXT: kmovw %k0, %eax
; AVX512F-NEXT: # kill: def $al killed $al killed $eax
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: v2i32:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vpsllq $32, %xmm1, %xmm1
-; AVX512BW-NEXT: vpsraq $32, %xmm1, %xmm1
-; AVX512BW-NEXT: vpsllq $32, %xmm0, %xmm0
-; AVX512BW-NEXT: vpsraq $32, %xmm0, %xmm0
-; AVX512BW-NEXT: vpcmpgtq %xmm1, %xmm0, %k0
+; AVX512BW-NEXT: vpcmpgtd %xmm1, %xmm0, %k0
; AVX512BW-NEXT: kmovd %k0, %eax
; AVX512BW-NEXT: # kill: def $al killed $al killed $eax
; AVX512BW-NEXT: retq
define i4 @v4i8(<4 x i8> %a, <4 x i8> %b) {
; SSE2-SSSE3-LABEL: v4i8:
; SSE2-SSSE3: # %bb.0:
-; SSE2-SSSE3-NEXT: pslld $24, %xmm1
-; SSE2-SSSE3-NEXT: psrad $24, %xmm1
-; SSE2-SSSE3-NEXT: pslld $24, %xmm0
-; SSE2-SSSE3-NEXT: psrad $24, %xmm0
-; SSE2-SSSE3-NEXT: pcmpgtd %xmm1, %xmm0
+; SSE2-SSSE3-NEXT: pcmpgtb %xmm1, %xmm0
+; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
; SSE2-SSSE3-NEXT: movmskps %xmm0, %eax
; SSE2-SSSE3-NEXT: # kill: def $al killed $al killed $eax
; SSE2-SSSE3-NEXT: retq
;
; AVX12-LABEL: v4i8:
; AVX12: # %bb.0:
-; AVX12-NEXT: vpslld $24, %xmm1, %xmm1
-; AVX12-NEXT: vpsrad $24, %xmm1, %xmm1
-; AVX12-NEXT: vpslld $24, %xmm0, %xmm0
-; AVX12-NEXT: vpsrad $24, %xmm0, %xmm0
-; AVX12-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0
+; AVX12-NEXT: vpcmpgtb %xmm1, %xmm0, %xmm0
+; AVX12-NEXT: vpmovsxbd %xmm0, %xmm0
; AVX12-NEXT: vmovmskps %xmm0, %eax
; AVX12-NEXT: # kill: def $al killed $al killed $eax
; AVX12-NEXT: retq
;
; AVX512F-LABEL: v4i8:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vpslld $24, %xmm1, %xmm1
-; AVX512F-NEXT: vpsrad $24, %xmm1, %xmm1
-; AVX512F-NEXT: vpslld $24, %xmm0, %xmm0
-; AVX512F-NEXT: vpsrad $24, %xmm0, %xmm0
-; AVX512F-NEXT: vpcmpgtd %xmm1, %xmm0, %k0
+; AVX512F-NEXT: vpcmpgtb %xmm1, %xmm0, %xmm0
+; AVX512F-NEXT: vpmovsxbd %xmm0, %zmm0
+; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k0
; AVX512F-NEXT: kmovw %k0, %eax
; AVX512F-NEXT: # kill: def $al killed $al killed $eax
+; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: v4i8:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vpslld $24, %xmm1, %xmm1
-; AVX512BW-NEXT: vpsrad $24, %xmm1, %xmm1
-; AVX512BW-NEXT: vpslld $24, %xmm0, %xmm0
-; AVX512BW-NEXT: vpsrad $24, %xmm0, %xmm0
-; AVX512BW-NEXT: vpcmpgtd %xmm1, %xmm0, %k0
+; AVX512BW-NEXT: vpcmpgtb %xmm1, %xmm0, %k0
; AVX512BW-NEXT: kmovd %k0, %eax
; AVX512BW-NEXT: # kill: def $al killed $al killed $eax
; AVX512BW-NEXT: retq
define i4 @v4i16(<4 x i16> %a, <4 x i16> %b) {
; SSE2-SSSE3-LABEL: v4i16:
; SSE2-SSSE3: # %bb.0:
-; SSE2-SSSE3-NEXT: pslld $16, %xmm1
-; SSE2-SSSE3-NEXT: psrad $16, %xmm1
-; SSE2-SSSE3-NEXT: pslld $16, %xmm0
-; SSE2-SSSE3-NEXT: psrad $16, %xmm0
-; SSE2-SSSE3-NEXT: pcmpgtd %xmm1, %xmm0
+; SSE2-SSSE3-NEXT: pcmpgtw %xmm1, %xmm0
+; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
; SSE2-SSSE3-NEXT: movmskps %xmm0, %eax
; SSE2-SSSE3-NEXT: # kill: def $al killed $al killed $eax
; SSE2-SSSE3-NEXT: retq
;
; AVX12-LABEL: v4i16:
; AVX12: # %bb.0:
-; AVX12-NEXT: vpslld $16, %xmm1, %xmm1
-; AVX12-NEXT: vpsrad $16, %xmm1, %xmm1
-; AVX12-NEXT: vpslld $16, %xmm0, %xmm0
-; AVX12-NEXT: vpsrad $16, %xmm0, %xmm0
-; AVX12-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0
+; AVX12-NEXT: vpcmpgtw %xmm1, %xmm0, %xmm0
+; AVX12-NEXT: vpmovsxwd %xmm0, %xmm0
; AVX12-NEXT: vmovmskps %xmm0, %eax
; AVX12-NEXT: # kill: def $al killed $al killed $eax
; AVX12-NEXT: retq
;
; AVX512F-LABEL: v4i16:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vpslld $16, %xmm1, %xmm1
-; AVX512F-NEXT: vpsrad $16, %xmm1, %xmm1
-; AVX512F-NEXT: vpslld $16, %xmm0, %xmm0
-; AVX512F-NEXT: vpsrad $16, %xmm0, %xmm0
-; AVX512F-NEXT: vpcmpgtd %xmm1, %xmm0, %k0
+; AVX512F-NEXT: vpcmpgtw %xmm1, %xmm0, %xmm0
+; AVX512F-NEXT: vpmovsxwd %xmm0, %ymm0
+; AVX512F-NEXT: vptestmd %ymm0, %ymm0, %k0
; AVX512F-NEXT: kmovw %k0, %eax
; AVX512F-NEXT: # kill: def $al killed $al killed $eax
+; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: v4i16:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vpslld $16, %xmm1, %xmm1
-; AVX512BW-NEXT: vpsrad $16, %xmm1, %xmm1
-; AVX512BW-NEXT: vpslld $16, %xmm0, %xmm0
-; AVX512BW-NEXT: vpsrad $16, %xmm0, %xmm0
-; AVX512BW-NEXT: vpcmpgtd %xmm1, %xmm0, %k0
+; AVX512BW-NEXT: vpcmpgtw %xmm1, %xmm0, %k0
; AVX512BW-NEXT: kmovd %k0, %eax
; AVX512BW-NEXT: # kill: def $al killed $al killed $eax
; AVX512BW-NEXT: retq
define i8 @v8i8(<8 x i8> %a, <8 x i8> %b) {
; SSE2-SSSE3-LABEL: v8i8:
; SSE2-SSSE3: # %bb.0:
-; SSE2-SSSE3-NEXT: psllw $8, %xmm1
-; SSE2-SSSE3-NEXT: psraw $8, %xmm1
-; SSE2-SSSE3-NEXT: psllw $8, %xmm0
-; SSE2-SSSE3-NEXT: psraw $8, %xmm0
-; SSE2-SSSE3-NEXT: pcmpgtw %xmm1, %xmm0
+; SSE2-SSSE3-NEXT: pcmpgtb %xmm1, %xmm0
+; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-SSSE3-NEXT: packsswb %xmm0, %xmm0
; SSE2-SSSE3-NEXT: pmovmskb %xmm0, %eax
; SSE2-SSSE3-NEXT: # kill: def $al killed $al killed $eax
;
; AVX12-LABEL: v8i8:
; AVX12: # %bb.0:
-; AVX12-NEXT: vpsllw $8, %xmm1, %xmm1
-; AVX12-NEXT: vpsraw $8, %xmm1, %xmm1
-; AVX12-NEXT: vpsllw $8, %xmm0, %xmm0
-; AVX12-NEXT: vpsraw $8, %xmm0, %xmm0
-; AVX12-NEXT: vpcmpgtw %xmm1, %xmm0, %xmm0
+; AVX12-NEXT: vpcmpgtb %xmm1, %xmm0, %xmm0
+; AVX12-NEXT: vpmovsxbw %xmm0, %xmm0
; AVX12-NEXT: vpacksswb %xmm0, %xmm0, %xmm0
; AVX12-NEXT: vpmovmskb %xmm0, %eax
; AVX12-NEXT: # kill: def $al killed $al killed $eax
;
; AVX512F-LABEL: v8i8:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vpsllw $8, %xmm1, %xmm1
-; AVX512F-NEXT: vpsraw $8, %xmm1, %xmm1
-; AVX512F-NEXT: vpsllw $8, %xmm0, %xmm0
-; AVX512F-NEXT: vpsraw $8, %xmm0, %xmm0
-; AVX512F-NEXT: vpcmpgtw %xmm1, %xmm0, %xmm0
-; AVX512F-NEXT: vpmovsxwd %xmm0, %ymm0
-; AVX512F-NEXT: vptestmd %ymm0, %ymm0, %k0
+; AVX512F-NEXT: vpcmpgtb %xmm1, %xmm0, %xmm0
+; AVX512F-NEXT: vpmovsxbd %xmm0, %zmm0
+; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k0
; AVX512F-NEXT: kmovw %k0, %eax
; AVX512F-NEXT: # kill: def $al killed $al killed $eax
; AVX512F-NEXT: vzeroupper
;
; AVX512BW-LABEL: v8i8:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vpsllw $8, %xmm1, %xmm1
-; AVX512BW-NEXT: vpsraw $8, %xmm1, %xmm1
-; AVX512BW-NEXT: vpsllw $8, %xmm0, %xmm0
-; AVX512BW-NEXT: vpsraw $8, %xmm0, %xmm0
-; AVX512BW-NEXT: vpcmpgtw %xmm1, %xmm0, %k0
+; AVX512BW-NEXT: vpcmpgtb %xmm1, %xmm0, %k0
; AVX512BW-NEXT: kmovd %k0, %eax
; AVX512BW-NEXT: # kill: def $al killed $al killed $eax
; AVX512BW-NEXT: retq
}
define i8 @bitcast_v16i8_to_v2i8(<16 x i8> %a0) nounwind {
-; SSE2-LABEL: bitcast_v16i8_to_v2i8:
-; SSE2: # %bb.0:
-; SSE2-NEXT: pmovmskb %xmm0, %eax
-; SSE2-NEXT: movd %eax, %xmm0
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3]
-; SSE2-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp)
-; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
-; SSE2-NEXT: addb -{{[0-9]+}}(%rsp), %al
-; SSE2-NEXT: retq
-;
-; SSSE3-LABEL: bitcast_v16i8_to_v2i8:
-; SSSE3: # %bb.0:
-; SSSE3-NEXT: pmovmskb %xmm0, %eax
-; SSSE3-NEXT: movd %eax, %xmm0
-; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,u,u,u,u,u,u,u,1,u,u,u,u,u,u,u]
-; SSSE3-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp)
-; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al
-; SSSE3-NEXT: addb -{{[0-9]+}}(%rsp), %al
-; SSSE3-NEXT: retq
+; SSE2-SSSE3-LABEL: bitcast_v16i8_to_v2i8:
+; SSE2-SSSE3: # %bb.0:
+; SSE2-SSSE3-NEXT: pmovmskb %xmm0, %eax
+; SSE2-SSSE3-NEXT: movd %eax, %xmm0
+; SSE2-SSSE3-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp)
+; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-SSSE3-NEXT: addb -{{[0-9]+}}(%rsp), %al
+; SSE2-SSSE3-NEXT: retq
;
; AVX12-LABEL: bitcast_v16i8_to_v2i8:
; AVX12: # %bb.0:
; AVX512: # %bb.0:
; AVX512-NEXT: vpmovb2m %xmm0, %k0
; AVX512-NEXT: kmovw %k0, -{{[0-9]+}}(%rsp)
-; AVX512-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; AVX512-NEXT: vmovdqa -{{[0-9]+}}(%rsp), %xmm0
; AVX512-NEXT: vpextrb $0, %xmm0, %ecx
; AVX512-NEXT: vpextrb $1, %xmm0, %eax
; AVX512-NEXT: addb %cl, %al
}
define i8 @bitcast_v16i16_to_v2i8(<16 x i16> %a0) nounwind {
-; SSE2-LABEL: bitcast_v16i16_to_v2i8:
-; SSE2: # %bb.0:
-; SSE2-NEXT: packsswb %xmm1, %xmm0
-; SSE2-NEXT: pmovmskb %xmm0, %eax
-; SSE2-NEXT: movd %eax, %xmm0
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3]
-; SSE2-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp)
-; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
-; SSE2-NEXT: addb -{{[0-9]+}}(%rsp), %al
-; SSE2-NEXT: retq
-;
-; SSSE3-LABEL: bitcast_v16i16_to_v2i8:
-; SSSE3: # %bb.0:
-; SSSE3-NEXT: packsswb %xmm1, %xmm0
-; SSSE3-NEXT: pmovmskb %xmm0, %eax
-; SSSE3-NEXT: movd %eax, %xmm0
-; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,u,u,u,u,u,u,u,1,u,u,u,u,u,u,u]
-; SSSE3-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp)
-; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al
-; SSSE3-NEXT: addb -{{[0-9]+}}(%rsp), %al
-; SSSE3-NEXT: retq
+; SSE2-SSSE3-LABEL: bitcast_v16i16_to_v2i8:
+; SSE2-SSSE3: # %bb.0:
+; SSE2-SSSE3-NEXT: packsswb %xmm1, %xmm0
+; SSE2-SSSE3-NEXT: pmovmskb %xmm0, %eax
+; SSE2-SSSE3-NEXT: movd %eax, %xmm0
+; SSE2-SSSE3-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp)
+; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-SSSE3-NEXT: addb -{{[0-9]+}}(%rsp), %al
+; SSE2-SSSE3-NEXT: retq
;
; AVX1-LABEL: bitcast_v16i16_to_v2i8:
; AVX1: # %bb.0:
; AVX512: # %bb.0:
; AVX512-NEXT: vpmovw2m %ymm0, %k0
; AVX512-NEXT: kmovw %k0, -{{[0-9]+}}(%rsp)
-; AVX512-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; AVX512-NEXT: vmovdqa -{{[0-9]+}}(%rsp), %xmm0
; AVX512-NEXT: vpextrb $0, %xmm0, %ecx
; AVX512-NEXT: vpextrb $1, %xmm0, %eax
; AVX512-NEXT: addb %cl, %al
define i16 @bitcast_v32i8_to_v2i16(<32 x i8> %a0) nounwind {
; SSE2-SSSE3-LABEL: bitcast_v32i8_to_v2i16:
; SSE2-SSSE3: # %bb.0:
-; SSE2-SSSE3-NEXT: pmovmskb %xmm0, %eax
-; SSE2-SSSE3-NEXT: pmovmskb %xmm1, %ecx
-; SSE2-SSSE3-NEXT: shll $16, %ecx
-; SSE2-SSSE3-NEXT: orl %eax, %ecx
-; SSE2-SSSE3-NEXT: movd %ecx, %xmm0
-; SSE2-SSSE3-NEXT: pextrw $0, %xmm0, %ecx
+; SSE2-SSSE3-NEXT: pmovmskb %xmm0, %ecx
+; SSE2-SSSE3-NEXT: pmovmskb %xmm1, %eax
+; SSE2-SSSE3-NEXT: shll $16, %eax
+; SSE2-SSSE3-NEXT: movd %eax, %xmm0
; SSE2-SSSE3-NEXT: pextrw $1, %xmm0, %eax
; SSE2-SSSE3-NEXT: addl %ecx, %eax
; SSE2-SSSE3-NEXT: # kill: def $ax killed $ax killed $eax
; AVX1-NEXT: shll $16, %ecx
; AVX1-NEXT: orl %eax, %ecx
; AVX1-NEXT: vmovd %ecx, %xmm0
-; AVX1-NEXT: vpextrw $0, %xmm0, %ecx
; AVX1-NEXT: vpextrw $1, %xmm0, %eax
; AVX1-NEXT: addl %ecx, %eax
; AVX1-NEXT: # kill: def $ax killed $ax killed $eax
;
; AVX2-LABEL: bitcast_v32i8_to_v2i16:
; AVX2: # %bb.0:
-; AVX2-NEXT: vpmovmskb %ymm0, %eax
-; AVX2-NEXT: vmovd %eax, %xmm0
-; AVX2-NEXT: vpextrw $0, %xmm0, %ecx
+; AVX2-NEXT: vpmovmskb %ymm0, %ecx
+; AVX2-NEXT: vmovd %ecx, %xmm0
; AVX2-NEXT: vpextrw $1, %xmm0, %eax
; AVX2-NEXT: addl %ecx, %eax
; AVX2-NEXT: # kill: def $ax killed $ax killed $eax
; AVX512-NEXT: subq $32, %rsp
; AVX512-NEXT: vpmovb2m %ymm0, %k0
; AVX512-NEXT: kmovd %k0, (%rsp)
-; AVX512-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; AVX512-NEXT: vpextrw $0, %xmm0, %ecx
+; AVX512-NEXT: vmovdqa (%rsp), %xmm0
+; AVX512-NEXT: vmovd %xmm0, %ecx
; AVX512-NEXT: vpextrw $1, %xmm0, %eax
; AVX512-NEXT: addl %ecx, %eax
; AVX512-NEXT: # kill: def $ax killed $ax killed $eax
}
define i8 @bitcast_v16i32_to_v2i8(<16 x i32> %a0) nounwind {
-; SSE2-LABEL: bitcast_v16i32_to_v2i8:
-; SSE2: # %bb.0:
-; SSE2-NEXT: packssdw %xmm3, %xmm2
-; SSE2-NEXT: packssdw %xmm1, %xmm0
-; SSE2-NEXT: packsswb %xmm2, %xmm0
-; SSE2-NEXT: pmovmskb %xmm0, %eax
-; SSE2-NEXT: movd %eax, %xmm0
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3]
-; SSE2-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp)
-; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
-; SSE2-NEXT: addb -{{[0-9]+}}(%rsp), %al
-; SSE2-NEXT: retq
-;
-; SSSE3-LABEL: bitcast_v16i32_to_v2i8:
-; SSSE3: # %bb.0:
-; SSSE3-NEXT: packssdw %xmm3, %xmm2
-; SSSE3-NEXT: packssdw %xmm1, %xmm0
-; SSSE3-NEXT: packsswb %xmm2, %xmm0
-; SSSE3-NEXT: pmovmskb %xmm0, %eax
-; SSSE3-NEXT: movd %eax, %xmm0
-; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,u,u,u,u,u,u,u,1,u,u,u,u,u,u,u]
-; SSSE3-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp)
-; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al
-; SSSE3-NEXT: addb -{{[0-9]+}}(%rsp), %al
-; SSSE3-NEXT: retq
+; SSE2-SSSE3-LABEL: bitcast_v16i32_to_v2i8:
+; SSE2-SSSE3: # %bb.0:
+; SSE2-SSSE3-NEXT: packssdw %xmm3, %xmm2
+; SSE2-SSSE3-NEXT: packssdw %xmm1, %xmm0
+; SSE2-SSSE3-NEXT: packsswb %xmm2, %xmm0
+; SSE2-SSSE3-NEXT: pmovmskb %xmm0, %eax
+; SSE2-SSSE3-NEXT: movd %eax, %xmm0
+; SSE2-SSSE3-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp)
+; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-SSSE3-NEXT: addb -{{[0-9]+}}(%rsp), %al
+; SSE2-SSSE3-NEXT: retq
;
; AVX1-LABEL: bitcast_v16i32_to_v2i8:
; AVX1: # %bb.0:
; AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512-NEXT: vpcmpgtd %zmm0, %zmm1, %k0
; AVX512-NEXT: kmovw %k0, -{{[0-9]+}}(%rsp)
-; AVX512-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; AVX512-NEXT: vmovdqa -{{[0-9]+}}(%rsp), %xmm0
; AVX512-NEXT: vpextrb $0, %xmm0, %ecx
; AVX512-NEXT: vpextrb $1, %xmm0, %eax
; AVX512-NEXT: addb %cl, %al
; SSE2-SSSE3-LABEL: bitcast_v32i16_to_v2i16:
; SSE2-SSSE3: # %bb.0:
; SSE2-SSSE3-NEXT: packsswb %xmm1, %xmm0
-; SSE2-SSSE3-NEXT: pmovmskb %xmm0, %eax
+; SSE2-SSSE3-NEXT: pmovmskb %xmm0, %ecx
; SSE2-SSSE3-NEXT: packsswb %xmm3, %xmm2
-; SSE2-SSSE3-NEXT: pmovmskb %xmm2, %ecx
-; SSE2-SSSE3-NEXT: shll $16, %ecx
-; SSE2-SSSE3-NEXT: orl %eax, %ecx
-; SSE2-SSSE3-NEXT: movd %ecx, %xmm0
-; SSE2-SSSE3-NEXT: pextrw $0, %xmm0, %ecx
+; SSE2-SSSE3-NEXT: pmovmskb %xmm2, %eax
+; SSE2-SSSE3-NEXT: shll $16, %eax
+; SSE2-SSSE3-NEXT: movd %eax, %xmm0
; SSE2-SSSE3-NEXT: pextrw $1, %xmm0, %eax
; SSE2-SSSE3-NEXT: addl %ecx, %eax
; SSE2-SSSE3-NEXT: # kill: def $ax killed $ax killed $eax
; AVX1-NEXT: shll $16, %ecx
; AVX1-NEXT: orl %eax, %ecx
; AVX1-NEXT: vmovd %ecx, %xmm0
-; AVX1-NEXT: vpextrw $0, %xmm0, %ecx
; AVX1-NEXT: vpextrw $1, %xmm0, %eax
; AVX1-NEXT: addl %ecx, %eax
; AVX1-NEXT: # kill: def $ax killed $ax killed $eax
; AVX2: # %bb.0:
; AVX2-NEXT: vpacksswb %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
-; AVX2-NEXT: vpmovmskb %ymm0, %eax
-; AVX2-NEXT: vmovd %eax, %xmm0
-; AVX2-NEXT: vpextrw $0, %xmm0, %ecx
+; AVX2-NEXT: vpmovmskb %ymm0, %ecx
+; AVX2-NEXT: vmovd %ecx, %xmm0
; AVX2-NEXT: vpextrw $1, %xmm0, %eax
; AVX2-NEXT: addl %ecx, %eax
; AVX2-NEXT: # kill: def $ax killed $ax killed $eax
; AVX512-NEXT: subq $32, %rsp
; AVX512-NEXT: vpmovw2m %zmm0, %k0
; AVX512-NEXT: kmovd %k0, (%rsp)
-; AVX512-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; AVX512-NEXT: vpextrw $0, %xmm0, %ecx
+; AVX512-NEXT: vmovdqa (%rsp), %xmm0
+; AVX512-NEXT: vmovd %xmm0, %ecx
; AVX512-NEXT: vpextrw $1, %xmm0, %eax
; AVX512-NEXT: addl %ecx, %eax
; AVX512-NEXT: # kill: def $ax killed $ax killed $eax
; SSE2-SSSE3-NEXT: orl %ecx, %edx
; SSE2-SSSE3-NEXT: orl %eax, %edx
; SSE2-SSSE3-NEXT: movw %dx, -{{[0-9]+}}(%rsp)
-; SSE2-SSSE3-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
+; SSE2-SSSE3-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm0
; SSE2-SSSE3-NEXT: movd %xmm0, %ecx
-; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,0,1]
+; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
; SSE2-SSSE3-NEXT: movd %xmm0, %eax
; SSE2-SSSE3-NEXT: addl %ecx, %eax
; SSE2-SSSE3-NEXT: retq
; AVX1-NEXT: orl %ecx, %edx
; AVX1-NEXT: orl %eax, %edx
; AVX1-NEXT: movl %edx, -{{[0-9]+}}(%rsp)
-; AVX1-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
+; AVX1-NEXT: vmovdqa -{{[0-9]+}}(%rsp), %xmm0
; AVX1-NEXT: vmovd %xmm0, %ecx
; AVX1-NEXT: vpextrd $1, %xmm0, %eax
; AVX1-NEXT: addl %ecx, %eax
; AVX2-NEXT: orl %ecx, %edx
; AVX2-NEXT: orl %eax, %edx
; AVX2-NEXT: movl %edx, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
+; AVX2-NEXT: vmovdqa -{{[0-9]+}}(%rsp), %xmm0
; AVX2-NEXT: vmovd %xmm0, %ecx
; AVX2-NEXT: vpextrd $1, %xmm0, %eax
; AVX2-NEXT: addl %ecx, %eax
; AVX512: # %bb.0:
; AVX512-NEXT: vpmovb2m %zmm0, %k0
; AVX512-NEXT: kmovq %k0, -{{[0-9]+}}(%rsp)
-; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
+; AVX512-NEXT: vmovdqa -{{[0-9]+}}(%rsp), %xmm0
; AVX512-NEXT: vmovd %xmm0, %ecx
; AVX512-NEXT: vpextrd $1, %xmm0, %eax
; AVX512-NEXT: addl %ecx, %eax
; X64-NEXT: pxor %xmm1, %xmm1
; X64-NEXT: movdqa %xmm0, %xmm2
; X64-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
-; X64-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
-; X64-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[3,2,1,0,4,5,6,7]
-; X64-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,6,5,4]
+; X64-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[1,0,3,2,4,5,6,7]
+; X64-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,4,7,6]
; X64-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
-; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
-; X64-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7]
-; X64-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4]
+; X64-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,0,3,2,4,5,6,7]
+; X64-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,4,7,6]
; X64-NEXT: packuswb %xmm2, %xmm0
; X64-NEXT: movdqa %xmm0, %xmm1
; X64-NEXT: psllw $4, %xmm1
; X64-NEXT: pand {{.*}}(%rip), %xmm0
; X64-NEXT: psrlw $1, %xmm0
; X64-NEXT: por %xmm1, %xmm0
-; X64-NEXT: psrlq $48, %xmm0
; X64-NEXT: retq
%b = call <2 x i16> @llvm.bitreverse.v2i16(<2 x i16> %a)
ret <2 x i16> %b
;
; X64-LABEL: fold_v2i16:
; X64: # %bb.0:
-; X64-NEXT: movaps {{.*#+}} xmm0 = [61440,240]
+; X64-NEXT: movaps {{.*#+}} xmm0 = <61440,240,u,u,u,u,u,u>
; X64-NEXT: retq
%b = call <2 x i16> @llvm.bitreverse.v2i16(<2 x i16> <i16 15, i16 3840>)
ret <2 x i16> %b
; CHECK-NOSSSE3-NEXT: pxor %xmm1, %xmm1
; CHECK-NOSSSE3-NEXT: movdqa %xmm0, %xmm2
; CHECK-NOSSSE3-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
-; CHECK-NOSSSE3-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[3,2,1,0,4,5,6,7]
-; CHECK-NOSSSE3-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,6,5,4]
+; CHECK-NOSSSE3-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[1,0,3,2,4,5,6,7]
+; CHECK-NOSSSE3-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,4,7,6]
; CHECK-NOSSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
-; CHECK-NOSSSE3-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7]
-; CHECK-NOSSSE3-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4]
+; CHECK-NOSSSE3-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,0,3,2,4,5,6,7]
+; CHECK-NOSSSE3-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,4,7,6]
; CHECK-NOSSSE3-NEXT: packuswb %xmm2, %xmm0
-; CHECK-NOSSSE3-NEXT: psrld $16, %xmm0
; CHECK-NOSSSE3-NEXT: retq
;
; CHECK-SSSE3-LABEL: test7:
; CHECK-SSSE3: # %bb.0: # %entry
-; CHECK-SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[1,0],zero,zero,xmm0[5,4],zero,zero,xmm0[9,8],zero,zero,xmm0[13,12],zero,zero
+; CHECK-SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14]
; CHECK-SSSE3-NEXT: retq
;
; CHECK-AVX-LABEL: test7:
; CHECK-AVX: # %bb.0: # %entry
-; CHECK-AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[1,0],zero,zero,xmm0[5,4],zero,zero,xmm0[9,8],zero,zero,xmm0[13,12],zero,zero
+; CHECK-AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14]
; CHECK-AVX-NEXT: retq
;
; CHECK-WIDE-AVX-LABEL: test7:
; SSE2-LABEL: foo:
; SSE2: # %bb.0:
; SSE2-NEXT: cvttps2dq %xmm0, %xmm0
-; SSE2-NEXT: movl $255, %eax
-; SSE2-NEXT: movd %eax, %xmm1
-; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[2,0]
-; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,0]
-; SSE2-NEXT: andps {{.*}}(%rip), %xmm0
-; SSE2-NEXT: packuswb %xmm0, %xmm0
-; SSE2-NEXT: packuswb %xmm0, %xmm0
+; SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; SSE2-NEXT: movl -{{[0-9]+}}(%rsp), %ecx
+; SSE2-NEXT: shll $8, %ecx
+; SSE2-NEXT: orl %eax, %ecx
+; SSE2-NEXT: movd %ecx, %xmm0
+; SSE2-NEXT: movl $65280, %eax # imm = 0xFF00
+; SSE2-NEXT: orl -{{[0-9]+}}(%rsp), %eax
+; SSE2-NEXT: pinsrw $1, %eax, %xmm0
; SSE2-NEXT: movd %xmm0, (%rdi)
; SSE2-NEXT: retq
;
; SSE41-LABEL: foo:
; SSE41: # %bb.0:
; SSE41-NEXT: cvttps2dq %xmm0, %xmm0
+; SSE41-NEXT: pextrb $8, %xmm0, %eax
+; SSE41-NEXT: pextrb $4, %xmm0, %ecx
+; SSE41-NEXT: pextrb $0, %xmm0, %edx
+; SSE41-NEXT: movd %edx, %xmm0
+; SSE41-NEXT: pinsrb $1, %ecx, %xmm0
+; SSE41-NEXT: pinsrb $2, %eax, %xmm0
; SSE41-NEXT: movl $255, %eax
-; SSE41-NEXT: pinsrd $3, %eax, %xmm0
-; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u]
+; SSE41-NEXT: pinsrb $3, %eax, %xmm0
; SSE41-NEXT: movd %xmm0, (%rdi)
; SSE41-NEXT: retq
%t0 = fptoui <3 x float> %in to <3 x i8>
define double @test3_mul(double %A, double %B) {
; SSE41-LABEL: test3_mul:
; SSE41: # %bb.0:
-; SSE41-NEXT: pmovzxbw {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; SSE41-NEXT: pmovzxbw {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
-; SSE41-NEXT: pmullw %xmm2, %xmm0
+; SSE41-NEXT: pmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
+; SSE41-NEXT: pmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; SSE41-NEXT: pmullw %xmm1, %xmm0
; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
; SSE41-NEXT: retq
%1 = bitcast double %A to <8 x i8>
define <4 x i8> @test_crash(<4 x i8> %a, <4 x i8> %b) {
; CHECK-LABEL: test_crash:
; CHECK: # %bb.0:
-; CHECK-NEXT: blendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
+; CHECK-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3,4,5,6,7]
; CHECK-NEXT: retq
%shuf1 = shufflevector <4 x i8> %a, <4 x i8> zeroinitializer, <4 x i32><i32 4, i32 4, i32 2, i32 3>
%shuf2 = shufflevector <4 x i8> %b, <4 x i8> zeroinitializer, <4 x i32><i32 0, i32 1, i32 4, i32 4>
; FMA-NEXT: vaddss %xmm0, %xmm0, %xmm2
; FMA-NEXT: vmulss %xmm2, %xmm1, %xmm2
; FMA-NEXT: vmulss %xmm1, %xmm1, %xmm1
-; FMA-NEXT: vfmsub231ss %xmm0, %xmm0, %xmm1
+; FMA-NEXT: vfmsub231ss {{.*#+}} xmm1 = (xmm0 * xmm0) - xmm1
; FMA-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm2[0],xmm1[2,3]
; FMA-NEXT: retq
%2 = extractelement <2 x float> %0, i32 0
; FMA-NEXT: vaddsd %xmm0, %xmm0, %xmm2
; FMA-NEXT: vmulsd %xmm2, %xmm1, %xmm2
; FMA-NEXT: vmulsd %xmm1, %xmm1, %xmm1
-; FMA-NEXT: vfmsub231sd %xmm0, %xmm0, %xmm1
+; FMA-NEXT: vfmsub231sd {{.*#+}} xmm1 = (xmm0 * xmm0) - xmm1
; FMA-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm1[0],xmm2[0]
; FMA-NEXT: retq
%2 = extractelement <2 x double> %0, i32 0
; FMA-NEXT: vmovshdup {{.*#+}} xmm2 = xmm0[1,1,3,3]
; FMA-NEXT: vmovshdup {{.*#+}} xmm3 = xmm1[1,1,3,3]
; FMA-NEXT: vmulss %xmm2, %xmm1, %xmm4
-; FMA-NEXT: vfmadd231ss %xmm0, %xmm3, %xmm4
+; FMA-NEXT: vfmadd231ss {{.*#+}} xmm4 = (xmm3 * xmm0) + xmm4
; FMA-NEXT: vmulss %xmm2, %xmm3, %xmm2
-; FMA-NEXT: vfmsub231ss %xmm0, %xmm1, %xmm2
+; FMA-NEXT: vfmsub231ss {{.*#+}} xmm2 = (xmm1 * xmm0) - xmm2
; FMA-NEXT: vinsertps {{.*#+}} xmm0 = xmm2[0],xmm4[0],xmm2[2,3]
; FMA-NEXT: retq
%3 = extractelement <2 x float> %0, i32 0
; FMA-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0]
; FMA-NEXT: vpermilpd {{.*#+}} xmm3 = xmm1[1,0]
; FMA-NEXT: vmulsd %xmm2, %xmm1, %xmm4
-; FMA-NEXT: vfmadd231sd %xmm0, %xmm3, %xmm4
+; FMA-NEXT: vfmadd231sd {{.*#+}} xmm4 = (xmm3 * xmm0) + xmm4
; FMA-NEXT: vmulsd %xmm2, %xmm3, %xmm2
-; FMA-NEXT: vfmsub231sd %xmm0, %xmm1, %xmm2
+; FMA-NEXT: vfmsub231sd {{.*#+}} xmm2 = (xmm1 * xmm0) - xmm2
; FMA-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm2[0],xmm4[0]
; FMA-NEXT: retq
%3 = extractelement <2 x double> %0, i32 0
define <2 x float> @uitofp_2i32_buildvector_cvt(i32 %x, i32 %y, <2 x float> %v) {
; X32-LABEL: uitofp_2i32_buildvector_cvt:
; X32: # %bb.0:
-; X32-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; X32-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
-; X32-NEXT: unpcklpd {{.*#+}} xmm2 = xmm2[0],xmm1[0]
-; X32-NEXT: movapd {{.*#+}} xmm1 = [4.503599627370496E+15,4.503599627370496E+15]
-; X32-NEXT: orpd %xmm1, %xmm2
+; X32-NEXT: movdqa {{.*#+}} xmm1 = [4.503599627370496E+15,4.503599627370496E+15]
+; X32-NEXT: pmovzxdq {{.*#+}} xmm2 = mem[0],zero,mem[1],zero
+; X32-NEXT: por %xmm1, %xmm2
; X32-NEXT: subpd %xmm1, %xmm2
; X32-NEXT: cvtpd2ps %xmm2, %xmm1
; X32-NEXT: mulps %xmm1, %xmm0
;
; X64-LABEL: uitofp_2i32_buildvector_cvt:
; X64: # %bb.0:
-; X64-NEXT: movd %esi, %xmm1
-; X64-NEXT: movd %edi, %xmm2
-; X64-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm1[0]
-; X64-NEXT: movdqa {{.*#+}} xmm1 = [4.503599627370496E+15,4.503599627370496E+15]
-; X64-NEXT: por %xmm1, %xmm2
-; X64-NEXT: subpd %xmm1, %xmm2
-; X64-NEXT: cvtpd2ps %xmm2, %xmm1
+; X64-NEXT: movd %edi, %xmm1
+; X64-NEXT: pinsrd $1, %esi, %xmm1
+; X64-NEXT: pmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
+; X64-NEXT: movdqa {{.*#+}} xmm2 = [4.503599627370496E+15,4.503599627370496E+15]
+; X64-NEXT: por %xmm2, %xmm1
+; X64-NEXT: subpd %xmm2, %xmm1
+; X64-NEXT: cvtpd2ps %xmm1, %xmm1
; X64-NEXT: mulps %xmm1, %xmm0
; X64-NEXT: retq
%t1 = insertelement <2 x i32> undef, i32 %x, i32 0
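Only the first line of the IR body is quoted above; a plausible reconstruction of the whole test function is sketched below. The %t2..%t4 names and the final fmul are assumptions inferred from the trailing mulps in the checked output, not taken from the source.

define <2 x float> @uitofp_2i32_buildvector_cvt(i32 %x, i32 %y, <2 x float> %v) {
  ; Assemble a <2 x i32> from the two scalar arguments.
  %t1 = insertelement <2 x i32> undef, i32 %x, i32 0
  %t2 = insertelement <2 x i32> %t1, i32 %y, i32 1
  ; Unsigned conversion, then a multiply that keeps the result live.
  %t3 = uitofp <2 x i32> %t2 to <2 x float>
  %t4 = fmul <2 x float> %t3, %v
  ret <2 x float> %t4
}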
define <2 x float> @uitofp_2i32_legalized(<2 x i32> %in, <2 x float> %v) {
; X32-LABEL: uitofp_2i32_legalized:
; X32: # %bb.0:
-; X32-NEXT: xorps %xmm2, %xmm2
-; X32-NEXT: blendps {{.*#+}} xmm2 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
-; X32-NEXT: movaps {{.*#+}} xmm0 = [4.503599627370496E+15,4.503599627370496E+15]
-; X32-NEXT: orps %xmm0, %xmm2
-; X32-NEXT: subpd %xmm0, %xmm2
-; X32-NEXT: cvtpd2ps %xmm2, %xmm0
+; X32-NEXT: pmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
+; X32-NEXT: movdqa {{.*#+}} xmm2 = [4.503599627370496E+15,4.503599627370496E+15]
+; X32-NEXT: por %xmm2, %xmm0
+; X32-NEXT: subpd %xmm2, %xmm0
+; X32-NEXT: cvtpd2ps %xmm0, %xmm0
; X32-NEXT: mulps %xmm1, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: uitofp_2i32_legalized:
; X64: # %bb.0:
-; X64-NEXT: xorps %xmm2, %xmm2
-; X64-NEXT: blendps {{.*#+}} xmm2 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
-; X64-NEXT: movaps {{.*#+}} xmm0 = [4.503599627370496E+15,4.503599627370496E+15]
-; X64-NEXT: orps %xmm0, %xmm2
-; X64-NEXT: subpd %xmm0, %xmm2
-; X64-NEXT: cvtpd2ps %xmm2, %xmm0
+; X64-NEXT: pmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
+; X64-NEXT: movdqa {{.*#+}} xmm2 = [4.503599627370496E+15,4.503599627370496E+15]
+; X64-NEXT: por %xmm2, %xmm0
+; X64-NEXT: subpd %xmm2, %xmm0
+; X64-NEXT: cvtpd2ps %xmm0, %xmm0
; X64-NEXT: mulps %xmm1, %xmm0
; X64-NEXT: retq
%t1 = uitofp <2 x i32> %in to <2 x float>
; CHECK-LABEL: foo:
; CHECK: # %bb.0:
; CHECK-NEXT: cvttps2dq %xmm0, %xmm0
+; CHECK-NEXT: pextrb $8, %xmm0, %eax
+; CHECK-NEXT: pextrb $4, %xmm0, %ecx
+; CHECK-NEXT: pextrb $0, %xmm0, %edx
+; CHECK-NEXT: movd %edx, %xmm0
+; CHECK-NEXT: pinsrb $1, %ecx, %xmm0
+; CHECK-NEXT: pinsrb $2, %eax, %xmm0
; CHECK-NEXT: movl $255, %eax
-; CHECK-NEXT: pinsrd $3, %eax, %xmm0
-; CHECK-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u]
+; CHECK-NEXT: pinsrb $3, %eax, %xmm0
; CHECK-NEXT: movd %xmm0, (%rdi)
; CHECK-NEXT: retq
%t0 = fptosi <4 x float> %in to <4 x i32>
define i8 @extractelt_bitcast_extra_use(i32 %x, <4 x i8>* %p) nounwind {
; X86-LABEL: extractelt_bitcast_extra_use:
; X86: # %bb.0:
-; X86-NEXT: pushl %eax
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: movl %eax, (%ecx)
; X86-NEXT: # kill: def $al killed $al killed $eax
-; X86-NEXT: popl %ecx
; X86-NEXT: retl
;
; X64-LABEL: extractelt_bitcast_extra_use:
; X32-AVX512VL-LABEL: test_x86_vcvtps2ph_128_m:
; X32-AVX512VL: # %bb.0: # %entry
; X32-AVX512VL-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
-; X32-AVX512VL-NEXT: vcvtps2ph $3, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x1d,0xc0,0x03]
-; X32-AVX512VL-NEXT: vmovlps %xmm0, (%eax) # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x13,0x00]
+; X32-AVX512VL-NEXT: vcvtps2ph $3, %xmm0, (%eax) # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x1d,0x00,0x03]
; X32-AVX512VL-NEXT: retl # encoding: [0xc3]
;
; X64-AVX512VL-LABEL: test_x86_vcvtps2ph_128_m:
; X64-AVX512VL: # %bb.0: # %entry
-; X64-AVX512VL-NEXT: vcvtps2ph $3, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x1d,0xc0,0x03]
-; X64-AVX512VL-NEXT: vmovlps %xmm0, (%rdi) # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x13,0x07]
+; X64-AVX512VL-NEXT: vcvtps2ph $3, %xmm0, (%rdi) # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x1d,0x07,0x03]
; X64-AVX512VL-NEXT: retq # encoding: [0xc3]
entry:
%0 = tail call <8 x i16> @llvm.x86.vcvtps2ph.128(<4 x float> %a, i32 3)
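The call above is the only IR line of this test visible here; a hypothetical reconstruction of the store form being checked (the pointer type, the shufflevector step, and the parameter names are assumptions) is:

declare <8 x i16> @llvm.x86.vcvtps2ph.128(<4 x float>, i32)

define void @test_x86_vcvtps2ph_128_m(<4 x i16>* %d, <4 x float> %a) {
entry:
  ; Convert four floats to half precision (immediate rounding control 3) and
  ; store the low 64 bits, which the single vcvtps2ph-to-memory above now does
  ; without a separate vmovlps.
  %0 = tail call <8 x i16> @llvm.x86.vcvtps2ph.128(<4 x float> %a, i32 3)
  %1 = shufflevector <8 x i16> %0, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  store <4 x i16> %1, <4 x i16>* %d, align 8
  ret void
}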
define <4 x i16> @test_sext_4i8_4i16() {
; X32-LABEL: test_sext_4i8_4i16:
; X32: # %bb.0:
-; X32-NEXT: vmovaps {{.*#+}} xmm0 = [0,4294967295,2,4294967293]
+; X32-NEXT: vmovaps {{.*#+}} xmm0 = <0,65535,2,65533,u,u,u,u>
; X32-NEXT: retl
;
; X64-LABEL: test_sext_4i8_4i16:
; X64: # %bb.0:
-; X64-NEXT: vmovaps {{.*#+}} xmm0 = [0,4294967295,2,4294967293]
+; X64-NEXT: vmovaps {{.*#+}} xmm0 = <0,65535,2,65533,u,u,u,u>
; X64-NEXT: retq
%1 = insertelement <4 x i8> undef, i8 0, i32 0
%2 = insertelement <4 x i8> %1, i8 -1, i32 1
define <4 x i16> @test_sext_4i8_4i16_undef() {
; X32-LABEL: test_sext_4i8_4i16_undef:
; X32: # %bb.0:
-; X32-NEXT: vmovaps {{.*#+}} xmm0 = <u,4294967295,u,4294967293>
+; X32-NEXT: vmovaps {{.*#+}} xmm0 = <u,65535,u,65533,u,u,u,u>
; X32-NEXT: retl
;
; X64-LABEL: test_sext_4i8_4i16_undef:
; X64: # %bb.0:
-; X64-NEXT: vmovaps {{.*#+}} xmm0 = <u,4294967295,u,4294967293>
+; X64-NEXT: vmovaps {{.*#+}} xmm0 = <u,65535,u,65533,u,u,u,u>
; X64-NEXT: retq
%1 = insertelement <4 x i8> undef, i8 undef, i32 0
%2 = insertelement <4 x i8> %1, i8 -1, i32 1
define <4 x i16> @test_zext_4i8_4i16() {
; X32-LABEL: test_zext_4i8_4i16:
; X32: # %bb.0:
-; X32-NEXT: vmovaps {{.*#+}} xmm0 = [0,255,2,253]
+; X32-NEXT: vmovaps {{.*#+}} xmm0 = <0,255,2,253,u,u,u,u>
; X32-NEXT: retl
;
; X64-LABEL: test_zext_4i8_4i16:
; X64: # %bb.0:
-; X64-NEXT: vmovaps {{.*#+}} xmm0 = [0,255,2,253]
+; X64-NEXT: vmovaps {{.*#+}} xmm0 = <0,255,2,253,u,u,u,u>
; X64-NEXT: retq
%1 = insertelement <4 x i8> undef, i8 0, i32 0
%2 = insertelement <4 x i8> %1, i8 -1, i32 1
define <4 x i16> @test_zext_4i8_4i16_undef() {
; X32-LABEL: test_zext_4i8_4i16_undef:
; X32: # %bb.0:
-; X32-NEXT: vmovaps {{.*#+}} xmm0 = [0,255,0,253]
+; X32-NEXT: vmovaps {{.*#+}} xmm0 = <0,255,0,253,u,u,u,u>
; X32-NEXT: retl
;
; X64-LABEL: test_zext_4i8_4i16_undef:
; X64: # %bb.0:
-; X64-NEXT: vmovaps {{.*#+}} xmm0 = [0,255,0,253]
+; X64-NEXT: vmovaps {{.*#+}} xmm0 = <0,255,0,253,u,u,u,u>
; X64-NEXT: retq
%1 = insertelement <4 x i8> undef, i8 undef, i32 0
%2 = insertelement <4 x i8> %1, i8 -1, i32 1
define <8 x i64> @insert_subvector_512(i32 %x0, i32 %x1, <8 x i64> %v) nounwind {
; X86_AVX256-LABEL: insert_subvector_512:
; X86_AVX256: # %bb.0:
-; X86_AVX256-NEXT: pushl %ebp
-; X86_AVX256-NEXT: movl %esp, %ebp
-; X86_AVX256-NEXT: andl $-8, %esp
-; X86_AVX256-NEXT: subl $8, %esp
-; X86_AVX256-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero
-; X86_AVX256-NEXT: vmovlps %xmm2, (%esp)
; X86_AVX256-NEXT: vextracti128 $1, %ymm0, %xmm2
-; X86_AVX256-NEXT: vpinsrd $0, (%esp), %xmm2, %xmm2
+; X86_AVX256-NEXT: vpinsrd $0, {{[0-9]+}}(%esp), %xmm2, %xmm2
; X86_AVX256-NEXT: vpinsrd $1, {{[0-9]+}}(%esp), %xmm2, %xmm2
; X86_AVX256-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
-; X86_AVX256-NEXT: movl %ebp, %esp
-; X86_AVX256-NEXT: popl %ebp
; X86_AVX256-NEXT: retl
;
; X64_AVX256-LABEL: insert_subvector_512:
define void @knownbits_zext_in_reg(i8*) nounwind {
; X32-LABEL: knownbits_zext_in_reg:
; X32: # %bb.0: # %BB
-; X32-NEXT: pushl %ebp
; X32-NEXT: pushl %ebx
-; X32-NEXT: pushl %edi
-; X32-NEXT: pushl %esi
-; X32-NEXT: subl $16, %esp
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movzbl (%eax), %ecx
; X32-NEXT: imull $101, %ecx, %eax
; X32-NEXT: shrl $14, %eax
-; X32-NEXT: imull $177, %ecx, %ecx
-; X32-NEXT: shrl $14, %ecx
-; X32-NEXT: movzbl %al, %eax
-; X32-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; X32-NEXT: vpinsrd $1, %eax, %xmm0, %xmm1
-; X32-NEXT: vbroadcastss {{.*#+}} xmm2 = [3.57331108E-43,3.57331108E-43,3.57331108E-43,3.57331108E-43]
-; X32-NEXT: vpand %xmm2, %xmm1, %xmm1
-; X32-NEXT: movzbl %cl, %eax
-; X32-NEXT: vpinsrd $1, %eax, %xmm0, %xmm0
-; X32-NEXT: vpand %xmm2, %xmm0, %xmm0
-; X32-NEXT: vpextrd $1, %xmm1, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X32-NEXT: vpextrd $1, %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X32-NEXT: xorl %ecx, %ecx
-; X32-NEXT: vmovd %xmm1, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X32-NEXT: vmovd %xmm0, (%esp) # 4-byte Folded Spill
-; X32-NEXT: vpextrd $2, %xmm1, %edi
-; X32-NEXT: vpextrd $2, %xmm0, %esi
-; X32-NEXT: vpextrd $3, %xmm1, %ebx
-; X32-NEXT: vpextrd $3, %xmm0, %ebp
+; X32-NEXT: imull $177, %ecx, %edx
+; X32-NEXT: shrl $14, %edx
+; X32-NEXT: movzbl %al, %ecx
+; X32-NEXT: xorl %ebx, %ebx
; X32-NEXT: .p2align 4, 0x90
; X32-NEXT: .LBB0_1: # %CF
; X32-NEXT: # =>This Loop Header: Depth=1
; X32-NEXT: # Child Loop BB0_2 Depth 2
-; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NEXT: xorl %edx, %edx
-; X32-NEXT: divl {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
-; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NEXT: xorl %edx, %edx
-; X32-NEXT: divl (%esp) # 4-byte Folded Reload
-; X32-NEXT: movl %edi, %eax
-; X32-NEXT: xorl %edx, %edx
-; X32-NEXT: divl %esi
-; X32-NEXT: movl %ebx, %eax
-; X32-NEXT: xorl %edx, %edx
-; X32-NEXT: divl %ebp
+; X32-NEXT: movl %ecx, %eax
+; X32-NEXT: divb %dl
; X32-NEXT: .p2align 4, 0x90
; X32-NEXT: .LBB0_2: # %CF237
; X32-NEXT: # Parent Loop BB0_1 Depth=1
; X32-NEXT: # => This Inner Loop Header: Depth=2
-; X32-NEXT: testb %cl, %cl
+; X32-NEXT: testb %bl, %bl
; X32-NEXT: jne .LBB0_2
; X32-NEXT: jmp .LBB0_1
;
; X64-LABEL: knownbits_zext_in_reg:
; X64: # %bb.0: # %BB
-; X64-NEXT: pushq %rbp
-; X64-NEXT: pushq %rbx
; X64-NEXT: movzbl (%rdi), %eax
; X64-NEXT: imull $101, %eax, %ecx
; X64-NEXT: shrl $14, %ecx
-; X64-NEXT: imull $177, %eax, %eax
-; X64-NEXT: shrl $14, %eax
+; X64-NEXT: imull $177, %eax, %edx
+; X64-NEXT: shrl $14, %edx
; X64-NEXT: movzbl %cl, %ecx
-; X64-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; X64-NEXT: vpinsrd $1, %ecx, %xmm0, %xmm1
-; X64-NEXT: vbroadcastss {{.*#+}} xmm2 = [3.57331108E-43,3.57331108E-43,3.57331108E-43,3.57331108E-43]
-; X64-NEXT: vpand %xmm2, %xmm1, %xmm1
-; X64-NEXT: movzbl %al, %eax
-; X64-NEXT: vpinsrd $1, %eax, %xmm0, %xmm0
-; X64-NEXT: vpand %xmm2, %xmm0, %xmm0
-; X64-NEXT: vpextrd $1, %xmm1, %r8d
-; X64-NEXT: vpextrd $1, %xmm0, %r9d
; X64-NEXT: xorl %esi, %esi
-; X64-NEXT: vmovd %xmm1, %r10d
-; X64-NEXT: vmovd %xmm0, %r11d
-; X64-NEXT: vpextrd $2, %xmm1, %edi
-; X64-NEXT: vpextrd $2, %xmm0, %ebx
-; X64-NEXT: vpextrd $3, %xmm1, %ecx
-; X64-NEXT: vpextrd $3, %xmm0, %ebp
; X64-NEXT: .p2align 4, 0x90
; X64-NEXT: .LBB0_1: # %CF
; X64-NEXT: # =>This Loop Header: Depth=1
; X64-NEXT: # Child Loop BB0_2 Depth 2
-; X64-NEXT: movl %r8d, %eax
-; X64-NEXT: xorl %edx, %edx
-; X64-NEXT: divl %r9d
-; X64-NEXT: movl %r10d, %eax
-; X64-NEXT: xorl %edx, %edx
-; X64-NEXT: divl %r11d
-; X64-NEXT: movl %edi, %eax
-; X64-NEXT: xorl %edx, %edx
-; X64-NEXT: divl %ebx
; X64-NEXT: movl %ecx, %eax
-; X64-NEXT: xorl %edx, %edx
-; X64-NEXT: divl %ebp
+; X64-NEXT: divb %dl
; X64-NEXT: .p2align 4, 0x90
; X64-NEXT: .LBB0_2: # %CF237
; X64-NEXT: # Parent Loop BB0_1 Depth=1
; SSE2: # %bb.0:
; SSE2-NEXT: movzwl {{.*}}(%rip), %eax
; SSE2-NEXT: movd %eax, %xmm0
-; SSE2-NEXT: movdqa %xmm0, %xmm1
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,0,3]
-; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; SSE2-NEXT: movl $2, %eax
-; SSE2-NEXT: movd %eax, %xmm1
-; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[3,0]
-; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,2]
-; SSE2-NEXT: andps {{.*}}(%rip), %xmm0
-; SSE2-NEXT: packuswb %xmm0, %xmm0
-; SSE2-NEXT: packuswb %xmm0, %xmm0
+; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
+; SSE2-NEXT: por {{.*}}(%rip), %xmm0
; SSE2-NEXT: movd %xmm0, %eax
; SSE2-NEXT: retq
;
; SSSE3: # %bb.0:
; SSSE3-NEXT: movzwl {{.*}}(%rip), %eax
; SSSE3-NEXT: movd %eax, %xmm0
-; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
+; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1],zero,xmm0[3,4,5,6,7,8,9,10,11,12,13,14,15]
+; SSSE3-NEXT: por {{.*}}(%rip), %xmm0
; SSSE3-NEXT: movd %xmm0, %eax
; SSSE3-NEXT: retq
;
; SSE41: # %bb.0:
; SSE41-NEXT: movzwl {{.*}}(%rip), %eax
; SSE41-NEXT: movd %eax, %xmm0
-; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,1],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u]
; SSE41-NEXT: movl $2, %eax
-; SSE41-NEXT: pinsrd $2, %eax, %xmm0
-; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,4,8,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; SSE41-NEXT: pinsrb $2, %eax, %xmm0
; SSE41-NEXT: movd %xmm0, %eax
; SSE41-NEXT: retq
;
; AVX: # %bb.0:
; AVX-NEXT: movzwl {{.*}}(%rip), %eax
; AVX-NEXT: vmovd %eax, %xmm0
-; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,1],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u]
; AVX-NEXT: movl $2, %eax
-; AVX-NEXT: vpinsrd $2, %eax, %xmm0, %xmm0
-; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
; AVX-NEXT: vmovd %xmm0, %eax
; AVX-NEXT: retq
%1 = load <2 x i8>, <2 x i8>* bitcast (i8* @h to <2 x i8>*), align 1
define double @test1(double %A) {
; CHECK-LABEL: test1:
; CHECK: # %bb.0:
-; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1,1,3]
; CHECK-NEXT: paddd {{.*}}(%rip), %xmm0
-; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; CHECK-NEXT: retq
;
; CHECK-WIDE-LABEL: test1:
; CHECK-LABEL: test4:
; CHECK: # %bb.0:
; CHECK-NEXT: movq %rdi, %xmm0
-; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3]
; CHECK-NEXT: paddd {{.*}}(%rip), %xmm0
-; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; CHECK-NEXT: movq %xmm0, %rax
; CHECK-NEXT: retq
;
define double @test6(double %A) {
; CHECK-LABEL: test6:
; CHECK: # %bb.0:
-; CHECK-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
; CHECK-NEXT: paddw {{.*}}(%rip), %xmm0
-; CHECK-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
; CHECK-NEXT: retq
;
; CHECK-WIDE-LABEL: test6:
define double @test8(double %A) {
; CHECK-LABEL: test8:
; CHECK: # %bb.0:
-; CHECK-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; CHECK-NEXT: paddb {{.*}}(%rip), %xmm0
-; CHECK-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
; CHECK-NEXT: retq
;
; CHECK-WIDE-LABEL: test8:
;
; AVX1-LABEL: larger_mul:
; AVX1: # %bb.0:
-; AVX1-NEXT: vpmovsxwd %xmm0, %xmm2
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
-; AVX1-NEXT: vpmovsxwd %xmm0, %xmm0
-; AVX1-NEXT: vpackssdw %xmm0, %xmm2, %xmm0
-; AVX1-NEXT: vpmovsxwd %xmm1, %xmm2
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
-; AVX1-NEXT: vpmovsxwd %xmm1, %xmm1
-; AVX1-NEXT: vpackssdw %xmm1, %xmm2, %xmm1
; AVX1-NEXT: vpmaddwd %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: larger_mul:
; AVX2: # %bb.0:
-; AVX2-NEXT: vpmovsxwd %xmm0, %ymm0
-; AVX2-NEXT: vpmovsxwd %xmm1, %ymm1
-; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
-; AVX2-NEXT: vpackssdw %xmm2, %xmm1, %xmm1
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2
-; AVX2-NEXT: vpackssdw %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vpmaddwd %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa (%rdi), %xmm0
; SSE2-NEXT: movdqa (%rsi), %xmm1
-; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm1[0,2,2,3,4,5,6,7]
-; SSE2-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm0[2,1,2,3,4,5,6,7]
+; SSE2-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,5,6,7]
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
-; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm0[2,1,2,3,4,5,6,7]
-; SSE2-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,6,5,6,7]
-; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
-; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[1,0,3,2,4,5,6,7]
-; SSE2-NEXT: movdqa %xmm3, %xmm4
-; SSE2-NEXT: pmulhw %xmm2, %xmm4
-; SSE2-NEXT: pmullw %xmm2, %xmm3
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[1,0,3,2,4,5,6,7]
; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,3,2,3,4,5,6,7]
; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,7]
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm1[0,2,2,3,4,5,6,7]
+; SSE2-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,6,6,7]
+; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[3,1,2,3,4,5,6,7]
; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,5,6,7]
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,0,3,2,4,5,6,7]
-; SSE2-NEXT: movdqa %xmm0, %xmm2
-; SSE2-NEXT: pmulhw %xmm1, %xmm2
+; SSE2-NEXT: movdqa %xmm2, %xmm4
+; SSE2-NEXT: pmulhw %xmm3, %xmm4
+; SSE2-NEXT: pmullw %xmm3, %xmm2
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3]
+; SSE2-NEXT: movdqa %xmm0, %xmm3
+; SSE2-NEXT: pmulhw %xmm1, %xmm3
; SSE2-NEXT: pmullw %xmm1, %xmm0
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
-; SSE2-NEXT: paddd %xmm3, %xmm0
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3]
+; SSE2-NEXT: paddd %xmm2, %xmm0
; SSE2-NEXT: retq
;
; AVX-LABEL: pmaddwd_bad_indices:
; AVX-NEXT: vmovdqa (%rdi), %xmm0
; AVX-NEXT: vmovdqa (%rsi), %xmm1
; AVX-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[2,3,4,5,10,11,12,13,12,13,10,11,12,13,14,15]
-; AVX-NEXT: vpmovsxwd %xmm2, %xmm2
+; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,6,7,8,9,14,15,8,9,14,15,12,13,14,15]
; AVX-NEXT: vpshufb {{.*#+}} xmm3 = xmm1[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15]
+; AVX-NEXT: vpmovsxwd %xmm2, %xmm2
; AVX-NEXT: vpmovsxwd %xmm3, %xmm3
; AVX-NEXT: vpmulld %xmm3, %xmm2, %xmm2
-; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,6,7,8,9,14,15,8,9,14,15,12,13,14,15]
; AVX-NEXT: vpmovsxwd %xmm0, %xmm0
-; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15]
; AVX-NEXT: vpmovsxwd %xmm1, %xmm1
; AVX-NEXT: vpmulld %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpaddd %xmm0, %xmm2, %xmm0
define void @compressstore_v2f32_v2i32(float* %base, <2 x float> %V, <2 x i32> %trigger) {
; SSE2-LABEL: compressstore_v2f32_v2i32:
; SSE2: ## %bb.0:
-; SSE2-NEXT: pand {{.*}}(%rip), %xmm1
; SSE2-NEXT: pxor %xmm2, %xmm2
; SSE2-NEXT: pcmpeqd %xmm1, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,0,3,2]
-; SSE2-NEXT: pand %xmm2, %xmm1
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,0,1,1]
; SSE2-NEXT: movmskpd %xmm1, %eax
; SSE2-NEXT: testb $1, %al
; SSE2-NEXT: jne LBB2_1
; SSE42-LABEL: compressstore_v2f32_v2i32:
; SSE42: ## %bb.0:
; SSE42-NEXT: pxor %xmm2, %xmm2
-; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
-; SSE42-NEXT: pcmpeqq %xmm2, %xmm1
+; SSE42-NEXT: pcmpeqd %xmm1, %xmm2
+; SSE42-NEXT: pmovsxdq %xmm2, %xmm1
; SSE42-NEXT: movmskpd %xmm1, %eax
; SSE42-NEXT: testb $1, %al
; SSE42-NEXT: jne LBB2_1
; SSE42-NEXT: extractps $1, %xmm0, (%rdi)
; SSE42-NEXT: retq
;
-; AVX1-LABEL: compressstore_v2f32_v2i32:
-; AVX1: ## %bb.0:
-; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
-; AVX1-NEXT: vpcmpeqq %xmm2, %xmm1, %xmm1
-; AVX1-NEXT: vmovmskpd %xmm1, %eax
-; AVX1-NEXT: testb $1, %al
-; AVX1-NEXT: jne LBB2_1
-; AVX1-NEXT: ## %bb.2: ## %else
-; AVX1-NEXT: testb $2, %al
-; AVX1-NEXT: jne LBB2_3
-; AVX1-NEXT: LBB2_4: ## %else2
-; AVX1-NEXT: retq
-; AVX1-NEXT: LBB2_1: ## %cond.store
-; AVX1-NEXT: vmovss %xmm0, (%rdi)
-; AVX1-NEXT: addq $4, %rdi
-; AVX1-NEXT: testb $2, %al
-; AVX1-NEXT: je LBB2_4
-; AVX1-NEXT: LBB2_3: ## %cond.store1
-; AVX1-NEXT: vextractps $1, %xmm0, (%rdi)
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: compressstore_v2f32_v2i32:
-; AVX2: ## %bb.0:
-; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
-; AVX2-NEXT: vpcmpeqq %xmm2, %xmm1, %xmm1
-; AVX2-NEXT: vmovmskpd %xmm1, %eax
-; AVX2-NEXT: testb $1, %al
-; AVX2-NEXT: jne LBB2_1
-; AVX2-NEXT: ## %bb.2: ## %else
-; AVX2-NEXT: testb $2, %al
-; AVX2-NEXT: jne LBB2_3
-; AVX2-NEXT: LBB2_4: ## %else2
-; AVX2-NEXT: retq
-; AVX2-NEXT: LBB2_1: ## %cond.store
-; AVX2-NEXT: vmovss %xmm0, (%rdi)
-; AVX2-NEXT: addq $4, %rdi
-; AVX2-NEXT: testb $2, %al
-; AVX2-NEXT: je LBB2_4
-; AVX2-NEXT: LBB2_3: ## %cond.store1
-; AVX2-NEXT: vextractps $1, %xmm0, (%rdi)
-; AVX2-NEXT: retq
+; AVX1OR2-LABEL: compressstore_v2f32_v2i32:
+; AVX1OR2: ## %bb.0:
+; AVX1OR2-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX1OR2-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm1
+; AVX1OR2-NEXT: vpmovsxdq %xmm1, %xmm1
+; AVX1OR2-NEXT: vmovmskpd %xmm1, %eax
+; AVX1OR2-NEXT: testb $1, %al
+; AVX1OR2-NEXT: jne LBB2_1
+; AVX1OR2-NEXT: ## %bb.2: ## %else
+; AVX1OR2-NEXT: testb $2, %al
+; AVX1OR2-NEXT: jne LBB2_3
+; AVX1OR2-NEXT: LBB2_4: ## %else2
+; AVX1OR2-NEXT: retq
+; AVX1OR2-NEXT: LBB2_1: ## %cond.store
+; AVX1OR2-NEXT: vmovss %xmm0, (%rdi)
+; AVX1OR2-NEXT: addq $4, %rdi
+; AVX1OR2-NEXT: testb $2, %al
+; AVX1OR2-NEXT: je LBB2_4
+; AVX1OR2-NEXT: LBB2_3: ## %cond.store1
+; AVX1OR2-NEXT: vextractps $1, %xmm0, (%rdi)
+; AVX1OR2-NEXT: retq
;
; AVX512F-LABEL: compressstore_v2f32_v2i32:
; AVX512F: ## %bb.0:
+; AVX512F-NEXT: ## kill: def $xmm1 killed $xmm1 def $zmm1
; AVX512F-NEXT: ## kill: def $xmm0 killed $xmm0 def $zmm0
-; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX512F-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
-; AVX512F-NEXT: vptestnmq %zmm1, %zmm1, %k0
+; AVX512F-NEXT: vptestnmd %zmm1, %zmm1, %k0
; AVX512F-NEXT: kshiftlw $14, %k0, %k0
; AVX512F-NEXT: kshiftrw $14, %k0, %k1
; AVX512F-NEXT: vcompressps %zmm0, (%rdi) {%k1}
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
-; AVX512VL-LABEL: compressstore_v2f32_v2i32:
-; AVX512VL: ## %bb.0:
-; AVX512VL-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX512VL-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
-; AVX512VL-NEXT: vptestnmq %xmm1, %xmm1, %k1
-; AVX512VL-NEXT: vcompressps %xmm0, (%rdi) {%k1}
-; AVX512VL-NEXT: retq
+; AVX512VLDQ-LABEL: compressstore_v2f32_v2i32:
+; AVX512VLDQ: ## %bb.0:
+; AVX512VLDQ-NEXT: vptestnmd %xmm1, %xmm1, %k0
+; AVX512VLDQ-NEXT: kshiftlb $6, %k0, %k0
+; AVX512VLDQ-NEXT: kshiftrb $6, %k0, %k1
+; AVX512VLDQ-NEXT: vcompressps %xmm0, (%rdi) {%k1}
+; AVX512VLDQ-NEXT: retq
+;
+; AVX512VLBW-LABEL: compressstore_v2f32_v2i32:
+; AVX512VLBW: ## %bb.0:
+; AVX512VLBW-NEXT: vptestnmd %xmm1, %xmm1, %k0
+; AVX512VLBW-NEXT: kshiftlw $14, %k0, %k0
+; AVX512VLBW-NEXT: kshiftrw $14, %k0, %k1
+; AVX512VLBW-NEXT: vcompressps %xmm0, (%rdi) {%k1}
+; AVX512VLBW-NEXT: retq
%mask = icmp eq <2 x i32> %trigger, zeroinitializer
call void @llvm.masked.compressstore.v2f32(<2 x float> %V, float* %base, <2 x i1> %mask)
ret void
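Assembled from the fragments above, the complete compress-store test is short; only the intrinsic declaration, which follows the standard masked.compressstore signature, is added here:

declare void @llvm.masked.compressstore.v2f32(<2 x float>, float*, <2 x i1>)

define void @compressstore_v2f32_v2i32(float* %base, <2 x float> %V, <2 x i32> %trigger) {
  ; The mask is true for lanes whose trigger element is zero; compressstore
  ; packs exactly those lanes into consecutive memory starting at %base.
  %mask = icmp eq <2 x i32> %trigger, zeroinitializer
  call void @llvm.masked.compressstore.v2f32(<2 x float> %V, float* %base, <2 x i1> %mask)
  ret void
}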
define <2 x float> @expandload_v2f32_v2i1(float* %base, <2 x float> %src0, <2 x i32> %trigger) {
; SSE2-LABEL: expandload_v2f32_v2i1:
; SSE2: ## %bb.0:
-; SSE2-NEXT: pand {{.*}}(%rip), %xmm1
; SSE2-NEXT: pxor %xmm2, %xmm2
; SSE2-NEXT: pcmpeqd %xmm1, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,0,3,2]
-; SSE2-NEXT: pand %xmm2, %xmm1
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,0,1,1]
; SSE2-NEXT: movmskpd %xmm1, %eax
; SSE2-NEXT: testb $1, %al
; SSE2-NEXT: jne LBB4_1
; SSE42-LABEL: expandload_v2f32_v2i1:
; SSE42: ## %bb.0:
; SSE42-NEXT: pxor %xmm2, %xmm2
-; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
-; SSE42-NEXT: pcmpeqq %xmm2, %xmm1
+; SSE42-NEXT: pcmpeqd %xmm1, %xmm2
+; SSE42-NEXT: pmovsxdq %xmm2, %xmm1
; SSE42-NEXT: movmskpd %xmm1, %eax
; SSE42-NEXT: testb $1, %al
; SSE42-NEXT: jne LBB4_1
; SSE42-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[2,3]
; SSE42-NEXT: retq
;
-; AVX1-LABEL: expandload_v2f32_v2i1:
-; AVX1: ## %bb.0:
-; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
-; AVX1-NEXT: vpcmpeqq %xmm2, %xmm1, %xmm1
-; AVX1-NEXT: vmovmskpd %xmm1, %eax
-; AVX1-NEXT: testb $1, %al
-; AVX1-NEXT: jne LBB4_1
-; AVX1-NEXT: ## %bb.2: ## %else
-; AVX1-NEXT: testb $2, %al
-; AVX1-NEXT: jne LBB4_3
-; AVX1-NEXT: LBB4_4: ## %else2
-; AVX1-NEXT: retq
-; AVX1-NEXT: LBB4_1: ## %cond.load
-; AVX1-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; AVX1-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
-; AVX1-NEXT: addq $4, %rdi
-; AVX1-NEXT: testb $2, %al
-; AVX1-NEXT: je LBB4_4
-; AVX1-NEXT: LBB4_3: ## %cond.load1
-; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[2,3]
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: expandload_v2f32_v2i1:
-; AVX2: ## %bb.0:
-; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
-; AVX2-NEXT: vpcmpeqq %xmm2, %xmm1, %xmm1
-; AVX2-NEXT: vmovmskpd %xmm1, %eax
-; AVX2-NEXT: testb $1, %al
-; AVX2-NEXT: jne LBB4_1
-; AVX2-NEXT: ## %bb.2: ## %else
-; AVX2-NEXT: testb $2, %al
-; AVX2-NEXT: jne LBB4_3
-; AVX2-NEXT: LBB4_4: ## %else2
-; AVX2-NEXT: retq
-; AVX2-NEXT: LBB4_1: ## %cond.load
-; AVX2-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; AVX2-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
-; AVX2-NEXT: addq $4, %rdi
-; AVX2-NEXT: testb $2, %al
-; AVX2-NEXT: je LBB4_4
-; AVX2-NEXT: LBB4_3: ## %cond.load1
-; AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[2,3]
-; AVX2-NEXT: retq
+; AVX1OR2-LABEL: expandload_v2f32_v2i1:
+; AVX1OR2: ## %bb.0:
+; AVX1OR2-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX1OR2-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm1
+; AVX1OR2-NEXT: vpmovsxdq %xmm1, %xmm1
+; AVX1OR2-NEXT: vmovmskpd %xmm1, %eax
+; AVX1OR2-NEXT: testb $1, %al
+; AVX1OR2-NEXT: jne LBB4_1
+; AVX1OR2-NEXT: ## %bb.2: ## %else
+; AVX1OR2-NEXT: testb $2, %al
+; AVX1OR2-NEXT: jne LBB4_3
+; AVX1OR2-NEXT: LBB4_4: ## %else2
+; AVX1OR2-NEXT: retq
+; AVX1OR2-NEXT: LBB4_1: ## %cond.load
+; AVX1OR2-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; AVX1OR2-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
+; AVX1OR2-NEXT: addq $4, %rdi
+; AVX1OR2-NEXT: testb $2, %al
+; AVX1OR2-NEXT: je LBB4_4
+; AVX1OR2-NEXT: LBB4_3: ## %cond.load1
+; AVX1OR2-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[2,3]
+; AVX1OR2-NEXT: retq
;
; AVX512F-LABEL: expandload_v2f32_v2i1:
; AVX512F: ## %bb.0:
+; AVX512F-NEXT: ## kill: def $xmm1 killed $xmm1 def $zmm1
; AVX512F-NEXT: ## kill: def $xmm0 killed $xmm0 def $zmm0
-; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX512F-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
-; AVX512F-NEXT: vptestnmq %zmm1, %zmm1, %k0
+; AVX512F-NEXT: vptestnmd %zmm1, %zmm1, %k0
; AVX512F-NEXT: kshiftlw $14, %k0, %k0
; AVX512F-NEXT: kshiftrw $14, %k0, %k1
; AVX512F-NEXT: vexpandps (%rdi), %zmm0 {%k1}
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
-; AVX512VL-LABEL: expandload_v2f32_v2i1:
-; AVX512VL: ## %bb.0:
-; AVX512VL-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX512VL-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
-; AVX512VL-NEXT: vptestnmq %xmm1, %xmm1, %k1
-; AVX512VL-NEXT: vexpandps (%rdi), %xmm0 {%k1}
-; AVX512VL-NEXT: retq
+; AVX512VLDQ-LABEL: expandload_v2f32_v2i1:
+; AVX512VLDQ: ## %bb.0:
+; AVX512VLDQ-NEXT: vptestnmd %xmm1, %xmm1, %k0
+; AVX512VLDQ-NEXT: kshiftlb $6, %k0, %k0
+; AVX512VLDQ-NEXT: kshiftrb $6, %k0, %k1
+; AVX512VLDQ-NEXT: vexpandps (%rdi), %xmm0 {%k1}
+; AVX512VLDQ-NEXT: retq
+;
+; AVX512VLBW-LABEL: expandload_v2f32_v2i1:
+; AVX512VLBW: ## %bb.0:
+; AVX512VLBW-NEXT: vptestnmd %xmm1, %xmm1, %k0
+; AVX512VLBW-NEXT: kshiftlw $14, %k0, %k0
+; AVX512VLBW-NEXT: kshiftrw $14, %k0, %k1
+; AVX512VLBW-NEXT: vexpandps (%rdi), %xmm0 {%k1}
+; AVX512VLBW-NEXT: retq
%mask = icmp eq <2 x i32> %trigger, zeroinitializer
%res = call <2 x float> @llvm.masked.expandload.v2f32(float* %base, <2 x i1> %mask, <2 x float> %src0)
ret <2 x float> %res
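The matching expand-load test, again with only the (standard) intrinsic declaration added:

declare <2 x float> @llvm.masked.expandload.v2f32(float*, <2 x i1>, <2 x float>)

define <2 x float> @expandload_v2f32_v2i1(float* %base, <2 x float> %src0, <2 x i32> %trigger) {
  ; Consecutive elements from %base are expanded into the mask-true lanes;
  ; mask-false lanes keep their value from %src0.
  %mask = icmp eq <2 x i32> %trigger, zeroinitializer
  %res = call <2 x float> @llvm.masked.expandload.v2f32(float* %base, <2 x i1> %mask, <2 x float> %src0)
  ret <2 x float> %res
}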
; KNL_64-LABEL: test17:
; KNL_64: # %bb.0:
; KNL_64-NEXT: # kill: def $xmm2 killed $xmm2 def $zmm2
-; KNL_64-NEXT: vpsllq $32, %xmm0, %xmm0
-; KNL_64-NEXT: vpsraq $32, %zmm0, %zmm0
+; KNL_64-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
; KNL_64-NEXT: vpsllq $63, %xmm1, %xmm1
; KNL_64-NEXT: vptestmq %zmm1, %zmm1, %k0
; KNL_64-NEXT: kshiftlw $14, %k0, %k0
; KNL_64-NEXT: kshiftrw $14, %k0, %k1
-; KNL_64-NEXT: vgatherqpd (%rdi,%zmm0,8), %zmm2 {%k1}
+; KNL_64-NEXT: vgatherdpd (%rdi,%ymm0,8), %zmm2 {%k1}
; KNL_64-NEXT: vmovapd %xmm2, %xmm0
; KNL_64-NEXT: vzeroupper
; KNL_64-NEXT: retq
; KNL_32-LABEL: test17:
; KNL_32: # %bb.0:
; KNL_32-NEXT: # kill: def $xmm2 killed $xmm2 def $zmm2
-; KNL_32-NEXT: vpsllq $32, %xmm0, %xmm0
-; KNL_32-NEXT: vpsraq $32, %zmm0, %zmm0
+; KNL_32-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
; KNL_32-NEXT: vpsllq $63, %xmm1, %xmm1
; KNL_32-NEXT: vptestmq %zmm1, %zmm1, %k0
; KNL_32-NEXT: kshiftlw $14, %k0, %k0
; KNL_32-NEXT: kshiftrw $14, %k0, %k1
; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; KNL_32-NEXT: vgatherqpd (%eax,%zmm0,8), %zmm2 {%k1}
+; KNL_32-NEXT: vgatherdpd (%eax,%ymm0,8), %zmm2 {%k1}
; KNL_32-NEXT: vmovapd %xmm2, %xmm0
; KNL_32-NEXT: vzeroupper
; KNL_32-NEXT: retl
;
; SKX-LABEL: test17:
; SKX: # %bb.0:
-; SKX-NEXT: vpsllq $32, %xmm0, %xmm0
-; SKX-NEXT: vpsraq $32, %xmm0, %xmm0
; SKX-NEXT: vpsllq $63, %xmm1, %xmm1
; SKX-NEXT: vpmovq2m %xmm1, %k1
-; SKX-NEXT: vgatherqpd (%rdi,%xmm0,8), %xmm2 {%k1}
+; SKX-NEXT: vgatherdpd (%rdi,%xmm0,8), %xmm2 {%k1}
; SKX-NEXT: vmovapd %xmm2, %xmm0
; SKX-NEXT: retq
;
; SKX_32-LABEL: test17:
; SKX_32: # %bb.0:
-; SKX_32-NEXT: vpsllq $32, %xmm0, %xmm0
-; SKX_32-NEXT: vpsraq $32, %xmm0, %xmm0
; SKX_32-NEXT: vpsllq $63, %xmm1, %xmm1
; SKX_32-NEXT: vpmovq2m %xmm1, %k1
; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; SKX_32-NEXT: vgatherqpd (%eax,%xmm0,8), %xmm2 {%k1}
+; SKX_32-NEXT: vgatherdpd (%eax,%xmm0,8), %xmm2 {%k1}
; SKX_32-NEXT: vmovapd %xmm2, %xmm0
; SKX_32-NEXT: retl
;
; KNL_32-LABEL: test20:
; KNL_32: # %bb.0:
+; KNL_32-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
; KNL_32-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
-; KNL_32-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,2,2,3]
; KNL_32-NEXT: vpsllq $63, %xmm2, %xmm2
; KNL_32-NEXT: vptestmq %zmm2, %zmm2, %k0
; KNL_32-NEXT: kshiftlw $14, %k0, %k0
;
; SKX_32-LABEL: test20:
; SKX_32: # %bb.0:
-; SKX_32-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SKX_32-NEXT: vpsllq $63, %xmm2, %xmm2
; SKX_32-NEXT: vpmovq2m %xmm2, %k1
; SKX_32-NEXT: vscatterdps %xmm0, (,%xmm1) {%k1}
; KNL_64-LABEL: test21:
; KNL_64: # %bb.0:
; KNL_64-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
+; KNL_64-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
; KNL_64-NEXT: vpsllq $63, %xmm2, %xmm2
; KNL_64-NEXT: vptestmq %zmm2, %zmm2, %k0
-; KNL_64-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; KNL_64-NEXT: kshiftlw $14, %k0, %k0
; KNL_64-NEXT: kshiftrw $14, %k0, %k1
; KNL_64-NEXT: vpscatterqd %ymm0, (,%zmm1) {%k1}
;
; KNL_32-LABEL: test21:
; KNL_32: # %bb.0:
+; KNL_32-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
+; KNL_32-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
; KNL_32-NEXT: vpsllq $63, %xmm2, %xmm2
; KNL_32-NEXT: vptestmq %zmm2, %zmm2, %k0
-; KNL_32-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; KNL_32-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; KNL_32-NEXT: kshiftlw $14, %k0, %k0
; KNL_32-NEXT: kshiftrw $14, %k0, %k1
; KNL_32-NEXT: vpscatterdd %zmm0, (,%zmm1) {%k1}
; SKX: # %bb.0:
; SKX-NEXT: vpsllq $63, %xmm2, %xmm2
; SKX-NEXT: vpmovq2m %xmm2, %k1
-; SKX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SKX-NEXT: vpscatterqd %xmm0, (,%xmm1) {%k1}
; SKX-NEXT: retq
;
; SKX_32: # %bb.0:
; SKX_32-NEXT: vpsllq $63, %xmm2, %xmm2
; SKX_32-NEXT: vpmovq2m %xmm2, %k1
-; SKX_32-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; SKX_32-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SKX_32-NEXT: vpscatterdd %xmm0, (,%xmm1) {%k1}
; SKX_32-NEXT: retl
call void @llvm.masked.scatter.v2i32.v2p0i32(<2 x i32> %a1, <2 x i32*> %ptr, i32 4, <2 x i1> %mask)
; KNL_64-LABEL: test22:
; KNL_64: # %bb.0:
; KNL_64-NEXT: # kill: def $xmm2 killed $xmm2 def $zmm2
-; KNL_64-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; KNL_64-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
; KNL_64-NEXT: vpsllq $63, %xmm1, %xmm1
; KNL_64-NEXT: vptestmq %zmm1, %zmm1, %k0
; KNL_64-NEXT: kshiftlw $14, %k0, %k0
; KNL_32-LABEL: test22:
; KNL_32: # %bb.0:
; KNL_32-NEXT: # kill: def $xmm2 killed $xmm2 def $zmm2
-; KNL_32-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; KNL_32-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
; KNL_32-NEXT: vpsllq $63, %xmm1, %xmm1
; KNL_32-NEXT: vptestmq %zmm1, %zmm1, %k0
; KNL_32-NEXT: kshiftlw $14, %k0, %k0
;
; SKX-LABEL: test22:
; SKX: # %bb.0:
-; SKX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SKX-NEXT: vpsllq $63, %xmm1, %xmm1
; SKX-NEXT: vpmovq2m %xmm1, %k1
; SKX-NEXT: vgatherdps (%rdi,%xmm0,4), %xmm2 {%k1}
;
; SKX_32-LABEL: test22:
; SKX_32: # %bb.0:
-; SKX_32-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SKX_32-NEXT: vpsllq $63, %xmm1, %xmm1
; SKX_32-NEXT: vpmovq2m %xmm1, %k1
; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax
define <2 x i32> @test23(i32* %base, <2 x i32> %ind, <2 x i1> %mask, <2 x i32> %src0) {
; KNL_64-LABEL: test23:
; KNL_64: # %bb.0:
+; KNL_64-NEXT: # kill: def $xmm2 killed $xmm2 def $zmm2
+; KNL_64-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
; KNL_64-NEXT: vpsllq $63, %xmm1, %xmm1
; KNL_64-NEXT: vptestmq %zmm1, %zmm1, %k0
-; KNL_64-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; KNL_64-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[0,2,2,3]
; KNL_64-NEXT: kshiftlw $14, %k0, %k0
; KNL_64-NEXT: kshiftrw $14, %k0, %k1
-; KNL_64-NEXT: vpgatherdd (%rdi,%zmm0,4), %zmm1 {%k1}
-; KNL_64-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero
+; KNL_64-NEXT: vpgatherdd (%rdi,%zmm0,4), %zmm2 {%k1}
+; KNL_64-NEXT: vmovdqa %xmm2, %xmm0
; KNL_64-NEXT: vzeroupper
; KNL_64-NEXT: retq
;
; KNL_32-LABEL: test23:
; KNL_32: # %bb.0:
+; KNL_32-NEXT: # kill: def $xmm2 killed $xmm2 def $zmm2
+; KNL_32-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
; KNL_32-NEXT: vpsllq $63, %xmm1, %xmm1
; KNL_32-NEXT: vptestmq %zmm1, %zmm1, %k0
-; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; KNL_32-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; KNL_32-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[0,2,2,3]
; KNL_32-NEXT: kshiftlw $14, %k0, %k0
; KNL_32-NEXT: kshiftrw $14, %k0, %k1
-; KNL_32-NEXT: vpgatherdd (%eax,%zmm0,4), %zmm1 {%k1}
-; KNL_32-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero
+; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; KNL_32-NEXT: vpgatherdd (%eax,%zmm0,4), %zmm2 {%k1}
+; KNL_32-NEXT: vmovdqa %xmm2, %xmm0
; KNL_32-NEXT: vzeroupper
; KNL_32-NEXT: retl
;
; SKX: # %bb.0:
; SKX-NEXT: vpsllq $63, %xmm1, %xmm1
; SKX-NEXT: vpmovq2m %xmm1, %k1
-; SKX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; SKX-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[0,2,2,3]
-; SKX-NEXT: vpgatherdd (%rdi,%xmm0,4), %xmm1 {%k1}
-; SKX-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero
+; SKX-NEXT: vpgatherdd (%rdi,%xmm0,4), %xmm2 {%k1}
+; SKX-NEXT: vmovdqa %xmm2, %xmm0
; SKX-NEXT: retq
;
; SKX_32-LABEL: test23:
; SKX_32-NEXT: vpsllq $63, %xmm1, %xmm1
; SKX_32-NEXT: vpmovq2m %xmm1, %k1
; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; SKX_32-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; SKX_32-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[0,2,2,3]
-; SKX_32-NEXT: vpgatherdd (%eax,%xmm0,4), %xmm1 {%k1}
-; SKX_32-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero
+; SKX_32-NEXT: vpgatherdd (%eax,%xmm0,4), %xmm2 {%k1}
+; SKX_32-NEXT: vmovdqa %xmm2, %xmm0
; SKX_32-NEXT: retl
%sext_ind = sext <2 x i32> %ind to <2 x i64>
%gep.random = getelementptr i32, i32* %base, <2 x i64> %sext_ind
define <2 x i32> @test23b(i32* %base, <2 x i64> %ind, <2 x i1> %mask, <2 x i32> %src0) {
; KNL_64-LABEL: test23b:
; KNL_64: # %bb.0:
+; KNL_64-NEXT: # kill: def $xmm2 killed $xmm2 def $ymm2
; KNL_64-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
; KNL_64-NEXT: vpsllq $63, %xmm1, %xmm1
; KNL_64-NEXT: vptestmq %zmm1, %zmm1, %k0
-; KNL_64-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[0,2,2,3]
; KNL_64-NEXT: kshiftlw $14, %k0, %k0
; KNL_64-NEXT: kshiftrw $14, %k0, %k1
-; KNL_64-NEXT: vpgatherqd (%rdi,%zmm0,4), %ymm1 {%k1}
-; KNL_64-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero
+; KNL_64-NEXT: vpgatherqd (%rdi,%zmm0,4), %ymm2 {%k1}
+; KNL_64-NEXT: vmovdqa %xmm2, %xmm0
; KNL_64-NEXT: vzeroupper
; KNL_64-NEXT: retq
;
; KNL_32-LABEL: test23b:
; KNL_32: # %bb.0:
+; KNL_32-NEXT: # kill: def $xmm2 killed $xmm2 def $ymm2
; KNL_32-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
; KNL_32-NEXT: vpsllq $63, %xmm1, %xmm1
; KNL_32-NEXT: vptestmq %zmm1, %zmm1, %k0
-; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; KNL_32-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[0,2,2,3]
; KNL_32-NEXT: kshiftlw $14, %k0, %k0
; KNL_32-NEXT: kshiftrw $14, %k0, %k1
-; KNL_32-NEXT: vpgatherqd (%eax,%zmm0,4), %ymm1 {%k1}
-; KNL_32-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero
+; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; KNL_32-NEXT: vpgatherqd (%eax,%zmm0,4), %ymm2 {%k1}
+; KNL_32-NEXT: vmovdqa %xmm2, %xmm0
; KNL_32-NEXT: vzeroupper
; KNL_32-NEXT: retl
;
; SKX: # %bb.0:
; SKX-NEXT: vpsllq $63, %xmm1, %xmm1
; SKX-NEXT: vpmovq2m %xmm1, %k1
-; SKX-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[0,2,2,3]
-; SKX-NEXT: vpgatherqd (%rdi,%xmm0,4), %xmm1 {%k1}
-; SKX-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero
+; SKX-NEXT: vpgatherqd (%rdi,%xmm0,4), %xmm2 {%k1}
+; SKX-NEXT: vmovdqa %xmm2, %xmm0
; SKX-NEXT: retq
;
; SKX_32-LABEL: test23b:
; SKX_32-NEXT: vpsllq $63, %xmm1, %xmm1
; SKX_32-NEXT: vpmovq2m %xmm1, %k1
; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; SKX_32-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[0,2,2,3]
-; SKX_32-NEXT: vpgatherqd (%eax,%xmm0,4), %xmm1 {%k1}
-; SKX_32-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero
+; SKX_32-NEXT: vpgatherqd (%eax,%xmm0,4), %xmm2 {%k1}
+; SKX_32-NEXT: vmovdqa %xmm2, %xmm0
; SKX_32-NEXT: retl
%gep.random = getelementptr i32, i32* %base, <2 x i64> %ind
%res = call <2 x i32> @llvm.masked.gather.v2i32.v2p0i32(<2 x i32*> %gep.random, i32 4, <2 x i1> %mask, <2 x i32> %src0)
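For this gather test only the closing ret is missing from the excerpt; the sketch below fills it in and adds the standard masked.gather declaration (both are assumptions consistent with the function's <2 x i32> return type):

declare <2 x i32> @llvm.masked.gather.v2i32.v2p0i32(<2 x i32*>, i32, <2 x i1>, <2 x i32>)

define <2 x i32> @test23b(i32* %base, <2 x i64> %ind, <2 x i1> %mask, <2 x i32> %src0) {
  ; The vector GEP yields one pointer per lane; the gather loads through the
  ; active lanes with alignment 4 and takes %src0 for the inactive ones.
  %gep.random = getelementptr i32, i32* %base, <2 x i64> %ind
  %res = call <2 x i32> @llvm.masked.gather.v2i32.v2p0i32(<2 x i32*> %gep.random, i32 4, <2 x i1> %mask, <2 x i32> %src0)
  ret <2 x i32> %res
}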
define <2 x i32> @test24(i32* %base, <2 x i32> %ind) {
; KNL_64-LABEL: test24:
; KNL_64: # %bb.0:
-; KNL_64-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; KNL_64-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
; KNL_64-NEXT: movw $3, %ax
; KNL_64-NEXT: kmovw %eax, %k1
; KNL_64-NEXT: vpgatherdd (%rdi,%zmm0,4), %zmm1 {%k1}
-; KNL_64-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero
+; KNL_64-NEXT: vmovdqa %xmm1, %xmm0
; KNL_64-NEXT: vzeroupper
; KNL_64-NEXT: retq
;
; KNL_32-LABEL: test24:
; KNL_32: # %bb.0:
+; KNL_32-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; KNL_32-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; KNL_32-NEXT: movw $3, %cx
; KNL_32-NEXT: kmovw %ecx, %k1
; KNL_32-NEXT: vpgatherdd (%eax,%zmm0,4), %zmm1 {%k1}
-; KNL_32-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero
+; KNL_32-NEXT: vmovdqa %xmm1, %xmm0
; KNL_32-NEXT: vzeroupper
; KNL_32-NEXT: retl
;
; SKX: # %bb.0:
; SKX-NEXT: movb $3, %al
; SKX-NEXT: kmovw %eax, %k1
-; SKX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SKX-NEXT: vpgatherdd (%rdi,%xmm0,4), %xmm1 {%k1}
-; SKX-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero
+; SKX-NEXT: vmovdqa %xmm1, %xmm0
; SKX-NEXT: retq
;
; SKX_32-LABEL: test24:
; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax
; SKX_32-NEXT: movb $3, %cl
; SKX_32-NEXT: kmovw %ecx, %k1
-; SKX_32-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SKX_32-NEXT: vpgatherdd (%eax,%xmm0,4), %xmm1 {%k1}
-; SKX_32-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero
+; SKX_32-NEXT: vmovdqa %xmm1, %xmm0
; SKX_32-NEXT: retl
%sext_ind = sext <2 x i32> %ind to <2 x i64>
%gep.random = getelementptr i32, i32* %base, <2 x i64> %sext_ind
; KNL_64-LABEL: test25:
; KNL_64: # %bb.0:
; KNL_64-NEXT: # kill: def $xmm2 killed $xmm2 def $zmm2
-; KNL_64-NEXT: vpsllq $32, %xmm0, %xmm0
-; KNL_64-NEXT: vpsraq $32, %zmm0, %zmm0
+; KNL_64-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
; KNL_64-NEXT: vpsllq $63, %xmm1, %xmm1
; KNL_64-NEXT: vptestmq %zmm1, %zmm1, %k0
; KNL_64-NEXT: kshiftlw $14, %k0, %k0
; KNL_64-NEXT: kshiftrw $14, %k0, %k1
-; KNL_64-NEXT: vpgatherqq (%rdi,%zmm0,8), %zmm2 {%k1}
+; KNL_64-NEXT: vpgatherdq (%rdi,%ymm0,8), %zmm2 {%k1}
; KNL_64-NEXT: vmovdqa %xmm2, %xmm0
; KNL_64-NEXT: vzeroupper
; KNL_64-NEXT: retq
; KNL_32-LABEL: test25:
; KNL_32: # %bb.0:
; KNL_32-NEXT: # kill: def $xmm2 killed $xmm2 def $zmm2
-; KNL_32-NEXT: vpsllq $32, %xmm0, %xmm0
-; KNL_32-NEXT: vpsraq $32, %zmm0, %zmm0
+; KNL_32-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
; KNL_32-NEXT: vpsllq $63, %xmm1, %xmm1
; KNL_32-NEXT: vptestmq %zmm1, %zmm1, %k0
; KNL_32-NEXT: kshiftlw $14, %k0, %k0
; KNL_32-NEXT: kshiftrw $14, %k0, %k1
; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; KNL_32-NEXT: vpgatherqq (%eax,%zmm0,8), %zmm2 {%k1}
+; KNL_32-NEXT: vpgatherdq (%eax,%ymm0,8), %zmm2 {%k1}
; KNL_32-NEXT: vmovdqa %xmm2, %xmm0
; KNL_32-NEXT: vzeroupper
; KNL_32-NEXT: retl
;
; SKX-LABEL: test25:
; SKX: # %bb.0:
-; SKX-NEXT: vpsllq $32, %xmm0, %xmm0
-; SKX-NEXT: vpsraq $32, %xmm0, %xmm0
; SKX-NEXT: vpsllq $63, %xmm1, %xmm1
; SKX-NEXT: vpmovq2m %xmm1, %k1
-; SKX-NEXT: vpgatherqq (%rdi,%xmm0,8), %xmm2 {%k1}
+; SKX-NEXT: vpgatherdq (%rdi,%xmm0,8), %xmm2 {%k1}
; SKX-NEXT: vmovdqa %xmm2, %xmm0
; SKX-NEXT: retq
;
; SKX_32-LABEL: test25:
; SKX_32: # %bb.0:
-; SKX_32-NEXT: vpsllq $32, %xmm0, %xmm0
-; SKX_32-NEXT: vpsraq $32, %xmm0, %xmm0
; SKX_32-NEXT: vpsllq $63, %xmm1, %xmm1
; SKX_32-NEXT: vpmovq2m %xmm1, %k1
; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; SKX_32-NEXT: vpgatherqq (%eax,%xmm0,8), %xmm2 {%k1}
+; SKX_32-NEXT: vpgatherdq (%eax,%xmm0,8), %xmm2 {%k1}
; SKX_32-NEXT: vmovdqa %xmm2, %xmm0
; SKX_32-NEXT: retl
%sext_ind = sext <2 x i32> %ind to <2 x i64>
; KNL_64-LABEL: test26:
; KNL_64: # %bb.0:
; KNL_64-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
-; KNL_64-NEXT: vpsllq $32, %xmm0, %xmm0
-; KNL_64-NEXT: vpsraq $32, %zmm0, %zmm0
+; KNL_64-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
; KNL_64-NEXT: movb $3, %al
; KNL_64-NEXT: kmovw %eax, %k1
-; KNL_64-NEXT: vpgatherqq (%rdi,%zmm0,8), %zmm1 {%k1}
+; KNL_64-NEXT: vpgatherdq (%rdi,%ymm0,8), %zmm1 {%k1}
; KNL_64-NEXT: vmovdqa %xmm1, %xmm0
; KNL_64-NEXT: vzeroupper
; KNL_64-NEXT: retq
; KNL_32-LABEL: test26:
; KNL_32: # %bb.0:
; KNL_32-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
-; KNL_32-NEXT: vpsllq $32, %xmm0, %xmm0
-; KNL_32-NEXT: vpsraq $32, %zmm0, %zmm0
+; KNL_32-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
; KNL_32-NEXT: movb $3, %cl
; KNL_32-NEXT: kmovw %ecx, %k1
-; KNL_32-NEXT: vpgatherqq (%eax,%zmm0,8), %zmm1 {%k1}
+; KNL_32-NEXT: vpgatherdq (%eax,%ymm0,8), %zmm1 {%k1}
; KNL_32-NEXT: vmovdqa %xmm1, %xmm0
; KNL_32-NEXT: vzeroupper
; KNL_32-NEXT: retl
;
; SKX-LABEL: test26:
; SKX: # %bb.0:
-; SKX-NEXT: vpsllq $32, %xmm0, %xmm0
-; SKX-NEXT: vpsraq $32, %xmm0, %xmm0
; SKX-NEXT: kxnorw %k0, %k0, %k1
-; SKX-NEXT: vpgatherqq (%rdi,%xmm0,8), %xmm1 {%k1}
+; SKX-NEXT: vpgatherdq (%rdi,%xmm0,8), %xmm1 {%k1}
; SKX-NEXT: vmovdqa %xmm1, %xmm0
; SKX-NEXT: retq
;
; SKX_32-LABEL: test26:
; SKX_32: # %bb.0:
-; SKX_32-NEXT: vpsllq $32, %xmm0, %xmm0
-; SKX_32-NEXT: vpsraq $32, %xmm0, %xmm0
; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax
; SKX_32-NEXT: kxnorw %k0, %k0, %k1
-; SKX_32-NEXT: vpgatherqq (%eax,%xmm0,8), %xmm1 {%k1}
+; SKX_32-NEXT: vpgatherdq (%eax,%xmm0,8), %xmm1 {%k1}
; SKX_32-NEXT: vmovdqa %xmm1, %xmm0
; SKX_32-NEXT: retl
%sext_ind = sext <2 x i32> %ind to <2 x i64>
define <2 x float> @test27(float* %base, <2 x i32> %ind) {
; KNL_64-LABEL: test27:
; KNL_64: # %bb.0:
-; KNL_64-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[0,2,2,3]
+; KNL_64-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
; KNL_64-NEXT: movw $3, %ax
; KNL_64-NEXT: kmovw %eax, %k1
-; KNL_64-NEXT: vgatherdps (%rdi,%zmm1,4), %zmm0 {%k1}
-; KNL_64-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
+; KNL_64-NEXT: vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1}
+; KNL_64-NEXT: vmovaps %xmm1, %xmm0
; KNL_64-NEXT: vzeroupper
; KNL_64-NEXT: retq
;
; KNL_32-LABEL: test27:
; KNL_32: # %bb.0:
-; KNL_32-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[0,2,2,3]
+; KNL_32-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
; KNL_32-NEXT: movw $3, %cx
; KNL_32-NEXT: kmovw %ecx, %k1
-; KNL_32-NEXT: vgatherdps (%eax,%zmm1,4), %zmm0 {%k1}
-; KNL_32-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
+; KNL_32-NEXT: vgatherdps (%eax,%zmm0,4), %zmm1 {%k1}
+; KNL_32-NEXT: vmovaps %xmm1, %xmm0
; KNL_32-NEXT: vzeroupper
; KNL_32-NEXT: retl
;
; SKX-LABEL: test27:
; SKX: # %bb.0:
-; SKX-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[0,2,2,3]
; SKX-NEXT: movb $3, %al
; SKX-NEXT: kmovw %eax, %k1
-; SKX-NEXT: vgatherdps (%rdi,%xmm1,4), %xmm0 {%k1}
+; SKX-NEXT: vgatherdps (%rdi,%xmm0,4), %xmm1 {%k1}
+; SKX-NEXT: vmovaps %xmm1, %xmm0
; SKX-NEXT: retq
;
; SKX_32-LABEL: test27:
; SKX_32: # %bb.0:
-; SKX_32-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[0,2,2,3]
; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax
; SKX_32-NEXT: movb $3, %cl
; SKX_32-NEXT: kmovw %ecx, %k1
-; SKX_32-NEXT: vgatherdps (%eax,%xmm1,4), %xmm0 {%k1}
+; SKX_32-NEXT: vgatherdps (%eax,%xmm0,4), %xmm1 {%k1}
+; SKX_32-NEXT: vmovaps %xmm1, %xmm0
; SKX_32-NEXT: retl
%sext_ind = sext <2 x i32> %ind to <2 x i64>
%gep.random = getelementptr float, float* %base, <2 x i64> %sext_ind
; KNL_64-LABEL: test28:
; KNL_64: # %bb.0:
; KNL_64-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
-; KNL_64-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; KNL_64-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
; KNL_64-NEXT: movb $3, %al
; KNL_64-NEXT: kmovw %eax, %k1
; KNL_64-NEXT: vpscatterqd %ymm0, (,%zmm1) {%k1}
;
; KNL_32-LABEL: test28:
; KNL_32: # %bb.0:
-; KNL_32-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; KNL_32-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; KNL_32-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
+; KNL_32-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
; KNL_32-NEXT: movw $3, %ax
; KNL_32-NEXT: kmovw %eax, %k1
; KNL_32-NEXT: vpscatterdd %zmm0, (,%zmm1) {%k1}
;
; SKX-LABEL: test28:
; SKX: # %bb.0:
-; SKX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SKX-NEXT: kxnorw %k0, %k0, %k1
; SKX-NEXT: vpscatterqd %xmm0, (,%xmm1) {%k1}
; SKX-NEXT: retq
; SKX_32: # %bb.0:
; SKX_32-NEXT: movb $3, %al
; SKX_32-NEXT: kmovw %eax, %k1
-; SKX_32-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; SKX_32-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SKX_32-NEXT: vpscatterdd %xmm0, (,%xmm1) {%k1}
; SKX_32-NEXT: retl
call void @llvm.masked.scatter.v2i32.v2p0i32(<2 x i32> %a1, <2 x i32*> %ptr, i32 4, <2 x i1> <i1 true, i1 true>)
define <8 x float> @sext_v8i8_index(float* %base, <8 x i8> %ind) {
; KNL_64-LABEL: sext_v8i8_index:
; KNL_64: # %bb.0:
-; KNL_64-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; KNL_64-NEXT: vpslld $24, %ymm0, %ymm0
-; KNL_64-NEXT: vpsrad $24, %ymm0, %ymm1
+; KNL_64-NEXT: vpmovsxbd %xmm0, %ymm1
; KNL_64-NEXT: movw $255, %ax
; KNL_64-NEXT: kmovw %eax, %k1
; KNL_64-NEXT: vgatherdps (%rdi,%zmm1,4), %zmm0 {%k1}
;
; KNL_32-LABEL: sext_v8i8_index:
; KNL_32: # %bb.0:
-; KNL_32-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; KNL_32-NEXT: vpslld $24, %ymm0, %ymm0
-; KNL_32-NEXT: vpsrad $24, %ymm0, %ymm1
+; KNL_32-NEXT: vpmovsxbd %xmm0, %ymm1
; KNL_32-NEXT: movw $255, %cx
; KNL_32-NEXT: kmovw %ecx, %k1
; KNL_32-NEXT: vgatherdps (%eax,%zmm1,4), %zmm0 {%k1}
;
; SKX-LABEL: sext_v8i8_index:
; SKX: # %bb.0:
-; SKX-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; SKX-NEXT: vpmovsxbd %xmm0, %ymm1
; SKX-NEXT: kxnorw %k0, %k0, %k1
-; SKX-NEXT: vpslld $24, %ymm0, %ymm0
-; SKX-NEXT: vpsrad $24, %ymm0, %ymm1
; SKX-NEXT: vgatherdps (%rdi,%ymm1,4), %ymm0 {%k1}
; SKX-NEXT: retq
;
; SKX_32-LABEL: sext_v8i8_index:
; SKX_32: # %bb.0:
-; SKX_32-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; SKX_32-NEXT: vpmovsxbd %xmm0, %ymm1
; SKX_32-NEXT: kxnorw %k0, %k0, %k1
-; SKX_32-NEXT: vpslld $24, %ymm0, %ymm0
-; SKX_32-NEXT: vpsrad $24, %ymm0, %ymm1
; SKX_32-NEXT: vgatherdps (%eax,%ymm1,4), %ymm0 {%k1}
; SKX_32-NEXT: retl
define void @test_scatter_2i32_index(<2 x double> %a1, double* %base, <2 x i32> %ind, <2 x i1> %mask) {
; KNL_64-LABEL: test_scatter_2i32_index:
; KNL_64: # %bb.0:
+; KNL_64-NEXT: # kill: def $xmm1 killed $xmm1 def $ymm1
; KNL_64-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
-; KNL_64-NEXT: vpsllq $32, %xmm1, %xmm1
-; KNL_64-NEXT: vpsraq $32, %zmm1, %zmm1
; KNL_64-NEXT: vpsllq $63, %xmm2, %xmm2
; KNL_64-NEXT: vptestmq %zmm2, %zmm2, %k0
; KNL_64-NEXT: kshiftlw $14, %k0, %k0
; KNL_64-NEXT: kshiftrw $14, %k0, %k1
-; KNL_64-NEXT: vscatterqpd %zmm0, (%rdi,%zmm1,8) {%k1}
+; KNL_64-NEXT: vscatterdpd %zmm0, (%rdi,%ymm1,8) {%k1}
; KNL_64-NEXT: vzeroupper
; KNL_64-NEXT: retq
;
; KNL_32-LABEL: test_scatter_2i32_index:
; KNL_32: # %bb.0:
+; KNL_32-NEXT: # kill: def $xmm1 killed $xmm1 def $ymm1
; KNL_32-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
-; KNL_32-NEXT: vpsllq $32, %xmm1, %xmm1
-; KNL_32-NEXT: vpsraq $32, %zmm1, %zmm1
; KNL_32-NEXT: vpsllq $63, %xmm2, %xmm2
; KNL_32-NEXT: vptestmq %zmm2, %zmm2, %k0
; KNL_32-NEXT: kshiftlw $14, %k0, %k0
; KNL_32-NEXT: kshiftrw $14, %k0, %k1
; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; KNL_32-NEXT: vscatterqpd %zmm0, (%eax,%zmm1,8) {%k1}
+; KNL_32-NEXT: vscatterdpd %zmm0, (%eax,%ymm1,8) {%k1}
; KNL_32-NEXT: vzeroupper
; KNL_32-NEXT: retl
;
; SKX: # %bb.0:
; SKX-NEXT: vpsllq $63, %xmm2, %xmm2
; SKX-NEXT: vpmovq2m %xmm2, %k1
-; SKX-NEXT: vpsllq $32, %xmm1, %xmm1
-; SKX-NEXT: vpsraq $32, %xmm1, %xmm1
-; SKX-NEXT: vscatterqpd %xmm0, (%rdi,%xmm1,8) {%k1}
+; SKX-NEXT: vscatterdpd %xmm0, (%rdi,%xmm1,8) {%k1}
; SKX-NEXT: retq
;
; SKX_32-LABEL: test_scatter_2i32_index:
; SKX_32: # %bb.0:
; SKX_32-NEXT: vpsllq $63, %xmm2, %xmm2
; SKX_32-NEXT: vpmovq2m %xmm2, %k1
-; SKX_32-NEXT: vpsllq $32, %xmm1, %xmm1
-; SKX_32-NEXT: vpsraq $32, %xmm1, %xmm1
; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; SKX_32-NEXT: vscatterqpd %xmm0, (%eax,%xmm1,8) {%k1}
+; SKX_32-NEXT: vscatterdpd %xmm0, (%eax,%xmm1,8) {%k1}
; SKX_32-NEXT: retl
%gep = getelementptr double, double *%base, <2 x i32> %ind
call void @llvm.masked.scatter.v2f64.v2p0f64(<2 x double> %a1, <2 x double*> %gep, i32 4, <2 x i1> %mask)
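The scatter counterpart, reconstructed the same way (the standard masked.scatter declaration and the closing ret void are the only lines not visible above):

declare void @llvm.masked.scatter.v2f64.v2p0f64(<2 x double>, <2 x double*>, i32, <2 x i1>)

define void @test_scatter_2i32_index(<2 x double> %a1, double* %base, <2 x i32> %ind, <2 x i1> %mask) {
  ; The GEP sign-extends the <2 x i32> indices to pointer width. With widened
  ; legalization the indices can feed vscatterdpd directly, which is presumably
  ; why the vpsllq $32 / vpsraq $32 pair disappears in the checks above.
  %gep = getelementptr double, double* %base, <2 x i32> %ind
  call void @llvm.masked.scatter.v2f64.v2p0f64(<2 x double> %a1, <2 x double*> %gep, i32 4, <2 x i1> %mask)
  ret void
}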
;
; PROMOTE_SKX-LABEL: test_gather_v2i32_index:
; PROMOTE_SKX: # %bb.0:
-; PROMOTE_SKX-NEXT: vpsllq $32, %xmm0, %xmm0
-; PROMOTE_SKX-NEXT: vpsraq $32, %xmm0, %xmm0
; PROMOTE_SKX-NEXT: vpsllq $63, %xmm1, %xmm1
; PROMOTE_SKX-NEXT: vpmovq2m %xmm1, %k1
-; PROMOTE_SKX-NEXT: vgatherqpd (%rdi,%xmm0,8), %xmm2 {%k1}
+; PROMOTE_SKX-NEXT: vgatherdpd (%rdi,%xmm0,8), %xmm2 {%k1}
; PROMOTE_SKX-NEXT: vmovapd %xmm2, %xmm0
; PROMOTE_SKX-NEXT: retq
;
; PROMOTE_KNL-LABEL: test_gather_v2i32_index:
; PROMOTE_KNL: # %bb.0:
; PROMOTE_KNL-NEXT: # kill: def $xmm2 killed $xmm2 def $zmm2
-; PROMOTE_KNL-NEXT: vpsllq $32, %xmm0, %xmm0
-; PROMOTE_KNL-NEXT: vpsraq $32, %zmm0, %zmm0
+; PROMOTE_KNL-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
; PROMOTE_KNL-NEXT: vpsllq $63, %xmm1, %xmm1
; PROMOTE_KNL-NEXT: vptestmq %zmm1, %zmm1, %k0
; PROMOTE_KNL-NEXT: kshiftlw $14, %k0, %k0
; PROMOTE_KNL-NEXT: kshiftrw $14, %k0, %k1
-; PROMOTE_KNL-NEXT: vgatherqpd (%rdi,%zmm0,8), %zmm2 {%k1}
+; PROMOTE_KNL-NEXT: vgatherdpd (%rdi,%ymm0,8), %zmm2 {%k1}
; PROMOTE_KNL-NEXT: vmovapd %xmm2, %xmm0
; PROMOTE_KNL-NEXT: vzeroupper
; PROMOTE_KNL-NEXT: retq
;
; PROMOTE_AVX2-LABEL: test_gather_v2i32_index:
; PROMOTE_AVX2: # %bb.0:
-; PROMOTE_AVX2-NEXT: vpsllq $32, %xmm0, %xmm3
-; PROMOTE_AVX2-NEXT: vpsrad $31, %xmm3, %xmm3
-; PROMOTE_AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm3[1],xmm0[2],xmm3[3]
; PROMOTE_AVX2-NEXT: vpsllq $63, %xmm1, %xmm1
-; PROMOTE_AVX2-NEXT: vgatherqpd %xmm1, (%rdi,%xmm0,8), %xmm2
+; PROMOTE_AVX2-NEXT: vgatherdpd %xmm1, (%rdi,%xmm0,8), %xmm2
; PROMOTE_AVX2-NEXT: vmovapd %xmm2, %xmm0
; PROMOTE_AVX2-NEXT: retq
%gep.random = getelementptr double, double* %base, <2 x i32> %ind
; PROMOTE_SKX: # %bb.0:
; PROMOTE_SKX-NEXT: vpsllq $63, %xmm2, %xmm2
; PROMOTE_SKX-NEXT: vpmovq2m %xmm2, %k1
-; PROMOTE_SKX-NEXT: vpsllq $32, %xmm1, %xmm1
-; PROMOTE_SKX-NEXT: vpsraq $32, %xmm1, %xmm1
-; PROMOTE_SKX-NEXT: vscatterqpd %xmm0, (%rdi,%xmm1,8) {%k1}
+; PROMOTE_SKX-NEXT: vscatterdpd %xmm0, (%rdi,%xmm1,8) {%k1}
; PROMOTE_SKX-NEXT: retq
;
; PROMOTE_KNL-LABEL: test_scatter_v2i32_index:
; PROMOTE_KNL: # %bb.0:
+; PROMOTE_KNL-NEXT: # kill: def $xmm1 killed $xmm1 def $ymm1
; PROMOTE_KNL-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
-; PROMOTE_KNL-NEXT: vpsllq $32, %xmm1, %xmm1
-; PROMOTE_KNL-NEXT: vpsraq $32, %zmm1, %zmm1
; PROMOTE_KNL-NEXT: vpsllq $63, %xmm2, %xmm2
; PROMOTE_KNL-NEXT: vptestmq %zmm2, %zmm2, %k0
; PROMOTE_KNL-NEXT: kshiftlw $14, %k0, %k0
; PROMOTE_KNL-NEXT: kshiftrw $14, %k0, %k1
-; PROMOTE_KNL-NEXT: vscatterqpd %zmm0, (%rdi,%zmm1,8) {%k1}
+; PROMOTE_KNL-NEXT: vscatterdpd %zmm0, (%rdi,%ymm1,8) {%k1}
; PROMOTE_KNL-NEXT: vzeroupper
; PROMOTE_KNL-NEXT: retq
;
; PROMOTE_AVX2-LABEL: test_scatter_v2i32_index:
; PROMOTE_AVX2: # %bb.0:
-; PROMOTE_AVX2-NEXT: vpsllq $32, %xmm1, %xmm3
-; PROMOTE_AVX2-NEXT: vpsrad $31, %xmm3, %xmm3
-; PROMOTE_AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm3[1],xmm1[2],xmm3[3]
+; PROMOTE_AVX2-NEXT: vpmovsxdq %xmm1, %xmm1
; PROMOTE_AVX2-NEXT: vpsllq $3, %xmm1, %xmm1
; PROMOTE_AVX2-NEXT: vmovq %rdi, %xmm3
; PROMOTE_AVX2-NEXT: vpbroadcastq %xmm3, %xmm3
; PROMOTE_SKX: # %bb.0:
; PROMOTE_SKX-NEXT: vpsllq $63, %xmm1, %xmm1
; PROMOTE_SKX-NEXT: vpmovq2m %xmm1, %k1
-; PROMOTE_SKX-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[0,2,2,3]
-; PROMOTE_SKX-NEXT: vpgatherqd (,%xmm0), %xmm1 {%k1}
-; PROMOTE_SKX-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero
+; PROMOTE_SKX-NEXT: vpgatherqd (,%xmm0), %xmm2 {%k1}
+; PROMOTE_SKX-NEXT: vmovdqa %xmm2, %xmm0
; PROMOTE_SKX-NEXT: retq
;
; PROMOTE_KNL-LABEL: test_gather_v2i32_data:
; PROMOTE_KNL: # %bb.0:
+; PROMOTE_KNL-NEXT: # kill: def $xmm2 killed $xmm2 def $ymm2
; PROMOTE_KNL-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
; PROMOTE_KNL-NEXT: vpsllq $63, %xmm1, %xmm1
; PROMOTE_KNL-NEXT: vptestmq %zmm1, %zmm1, %k0
-; PROMOTE_KNL-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[0,2,2,3]
; PROMOTE_KNL-NEXT: kshiftlw $14, %k0, %k0
; PROMOTE_KNL-NEXT: kshiftrw $14, %k0, %k1
-; PROMOTE_KNL-NEXT: vpgatherqd (,%zmm0), %ymm1 {%k1}
-; PROMOTE_KNL-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero
+; PROMOTE_KNL-NEXT: vpgatherqd (,%zmm0), %ymm2 {%k1}
+; PROMOTE_KNL-NEXT: vmovdqa %xmm2, %xmm0
; PROMOTE_KNL-NEXT: vzeroupper
; PROMOTE_KNL-NEXT: retq
;
; PROMOTE_AVX2-LABEL: test_gather_v2i32_data:
; PROMOTE_AVX2: # %bb.0:
-; PROMOTE_AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
; PROMOTE_AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; PROMOTE_AVX2-NEXT: vpslld $31, %xmm1, %xmm1
; PROMOTE_AVX2-NEXT: vpgatherqd %xmm1, (,%xmm0), %xmm2
-; PROMOTE_AVX2-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm2[0],zero,xmm2[1],zero
+; PROMOTE_AVX2-NEXT: vmovdqa %xmm2, %xmm0
; PROMOTE_AVX2-NEXT: retq
%res = call <2 x i32> @llvm.masked.gather.v2i32.v2p0i32(<2 x i32*> %ptr, i32 4, <2 x i1> %mask, <2 x i32> %src0)
ret <2 x i32>%res
; PROMOTE_SKX: # %bb.0:
; PROMOTE_SKX-NEXT: vpsllq $63, %xmm2, %xmm2
; PROMOTE_SKX-NEXT: vpmovq2m %xmm2, %k1
-; PROMOTE_SKX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; PROMOTE_SKX-NEXT: vpscatterqd %xmm0, (,%xmm1) {%k1}
; PROMOTE_SKX-NEXT: retq
;
; PROMOTE_KNL-LABEL: test_scatter_v2i32_data:
; PROMOTE_KNL: # %bb.0:
; PROMOTE_KNL-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
+; PROMOTE_KNL-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
; PROMOTE_KNL-NEXT: vpsllq $63, %xmm2, %xmm2
; PROMOTE_KNL-NEXT: vptestmq %zmm2, %zmm2, %k0
-; PROMOTE_KNL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; PROMOTE_KNL-NEXT: kshiftlw $14, %k0, %k0
; PROMOTE_KNL-NEXT: kshiftrw $14, %k0, %k1
; PROMOTE_KNL-NEXT: vpscatterqd %ymm0, (,%zmm1) {%k1}
; PROMOTE_AVX2-NEXT: je .LBB3_4
; PROMOTE_AVX2-NEXT: .LBB3_3: # %cond.store1
; PROMOTE_AVX2-NEXT: vpextrq $1, %xmm1, %rax
-; PROMOTE_AVX2-NEXT: vextractps $2, %xmm0, (%rax)
+; PROMOTE_AVX2-NEXT: vextractps $1, %xmm0, (%rax)
; PROMOTE_AVX2-NEXT: retq
call void @llvm.masked.scatter.v2i32.v2p0i32(<2 x i32> %a1, <2 x i32*> %ptr, i32 4, <2 x i1> %mask)
ret void
; PROMOTE_SKX: # %bb.0:
; PROMOTE_SKX-NEXT: vpsllq $63, %xmm1, %xmm1
; PROMOTE_SKX-NEXT: vpmovq2m %xmm1, %k1
-; PROMOTE_SKX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; PROMOTE_SKX-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[0,2,2,3]
-; PROMOTE_SKX-NEXT: vpgatherdd (%rdi,%xmm0,4), %xmm1 {%k1}
-; PROMOTE_SKX-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero
+; PROMOTE_SKX-NEXT: vpgatherdd (%rdi,%xmm0,4), %xmm2 {%k1}
+; PROMOTE_SKX-NEXT: vmovdqa %xmm2, %xmm0
; PROMOTE_SKX-NEXT: retq
;
; PROMOTE_KNL-LABEL: test_gather_v2i32_data_index:
; PROMOTE_KNL: # %bb.0:
+; PROMOTE_KNL-NEXT: # kill: def $xmm2 killed $xmm2 def $zmm2
+; PROMOTE_KNL-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
; PROMOTE_KNL-NEXT: vpsllq $63, %xmm1, %xmm1
; PROMOTE_KNL-NEXT: vptestmq %zmm1, %zmm1, %k0
-; PROMOTE_KNL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; PROMOTE_KNL-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[0,2,2,3]
; PROMOTE_KNL-NEXT: kshiftlw $14, %k0, %k0
; PROMOTE_KNL-NEXT: kshiftrw $14, %k0, %k1
-; PROMOTE_KNL-NEXT: vpgatherdd (%rdi,%zmm0,4), %zmm1 {%k1}
-; PROMOTE_KNL-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero
+; PROMOTE_KNL-NEXT: vpgatherdd (%rdi,%zmm0,4), %zmm2 {%k1}
+; PROMOTE_KNL-NEXT: vmovdqa %xmm2, %xmm0
; PROMOTE_KNL-NEXT: vzeroupper
; PROMOTE_KNL-NEXT: retq
;
; PROMOTE_AVX2-LABEL: test_gather_v2i32_data_index:
; PROMOTE_AVX2: # %bb.0:
-; PROMOTE_AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; PROMOTE_AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
; PROMOTE_AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,2],zero,zero
; PROMOTE_AVX2-NEXT: vpslld $31, %xmm1, %xmm1
; PROMOTE_AVX2-NEXT: vpgatherdd %xmm1, (%rdi,%xmm0,4), %xmm2
-; PROMOTE_AVX2-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm2[0],zero,xmm2[1],zero
+; PROMOTE_AVX2-NEXT: vmovdqa %xmm2, %xmm0
; PROMOTE_AVX2-NEXT: retq
%gep.random = getelementptr i32, i32* %base, <2 x i32> %ind
%res = call <2 x i32> @llvm.masked.gather.v2i32.v2p0i32(<2 x i32*> %gep.random, i32 4, <2 x i1> %mask, <2 x i32> %src0)
; PROMOTE_SKX: # %bb.0:
; PROMOTE_SKX-NEXT: vpsllq $63, %xmm2, %xmm2
; PROMOTE_SKX-NEXT: vpmovq2m %xmm2, %k1
-; PROMOTE_SKX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; PROMOTE_SKX-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; PROMOTE_SKX-NEXT: vpscatterdd %xmm0, (%rdi,%xmm1,4) {%k1}
; PROMOTE_SKX-NEXT: retq
;
; PROMOTE_KNL-LABEL: test_scatter_v2i32_data_index:
; PROMOTE_KNL: # %bb.0:
+; PROMOTE_KNL-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
+; PROMOTE_KNL-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
; PROMOTE_KNL-NEXT: vpsllq $63, %xmm2, %xmm2
; PROMOTE_KNL-NEXT: vptestmq %zmm2, %zmm2, %k0
-; PROMOTE_KNL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; PROMOTE_KNL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; PROMOTE_KNL-NEXT: kshiftlw $14, %k0, %k0
; PROMOTE_KNL-NEXT: kshiftrw $14, %k0, %k1
; PROMOTE_KNL-NEXT: vpscatterdd %zmm0, (%rdi,%zmm1,4) {%k1}
;
; PROMOTE_AVX2-LABEL: test_scatter_v2i32_data_index:
; PROMOTE_AVX2: # %bb.0:
-; PROMOTE_AVX2-NEXT: vpsllq $32, %xmm1, %xmm3
-; PROMOTE_AVX2-NEXT: vpsrad $31, %xmm3, %xmm3
-; PROMOTE_AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm3[1],xmm1[2],xmm3[3]
+; PROMOTE_AVX2-NEXT: vpmovsxdq %xmm1, %xmm1
; PROMOTE_AVX2-NEXT: vpsllq $2, %xmm1, %xmm1
; PROMOTE_AVX2-NEXT: vmovq %rdi, %xmm3
; PROMOTE_AVX2-NEXT: vpbroadcastq %xmm3, %xmm3
; PROMOTE_AVX2-NEXT: je .LBB5_4
; PROMOTE_AVX2-NEXT: .LBB5_3: # %cond.store1
; PROMOTE_AVX2-NEXT: vpextrq $1, %xmm1, %rax
-; PROMOTE_AVX2-NEXT: vextractps $2, %xmm0, (%rax)
+; PROMOTE_AVX2-NEXT: vextractps $1, %xmm0, (%rax)
; PROMOTE_AVX2-NEXT: retq
%gep = getelementptr i32, i32 *%base, <2 x i32> %ind
call void @llvm.masked.scatter.v2i32.v2p0i32(<2 x i32> %a1, <2 x i32*> %gep, i32 4, <2 x i1> %mask)
;
; AVX1-LABEL: load_v8f64_v8i16:
; AVX1: ## %bb.0:
-; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7]
-; AVX1-NEXT: vpcmpeqd %xmm3, %xmm4, %xmm4
-; AVX1-NEXT: vpmovsxdq %xmm4, %xmm5
-; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,3,0,1]
-; AVX1-NEXT: vpmovsxdq %xmm4, %xmm4
-; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm5, %ymm4
-; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; AVX1-NEXT: vpcmpeqd %xmm3, %xmm0, %xmm0
-; AVX1-NEXT: vpmovsxdq %xmm0, %xmm3
+; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[2,3,0,1]
+; AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4
+; AVX1-NEXT: vpcmpeqw %xmm4, %xmm3, %xmm3
+; AVX1-NEXT: vpmovsxwd %xmm3, %xmm3
+; AVX1-NEXT: vpmovsxdq %xmm3, %xmm5
+; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,3,0,1]
+; AVX1-NEXT: vpmovsxdq %xmm3, %xmm3
+; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm5, %ymm3
+; AVX1-NEXT: vpcmpeqw %xmm4, %xmm0, %xmm0
+; AVX1-NEXT: vpmovsxwd %xmm0, %xmm0
+; AVX1-NEXT: vpmovsxdq %xmm0, %xmm4
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
; AVX1-NEXT: vpmovsxdq %xmm0, %xmm0
-; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm3, %ymm0
-; AVX1-NEXT: vmaskmovpd (%rdi), %ymm0, %ymm3
-; AVX1-NEXT: vblendvpd %ymm0, %ymm3, %ymm1, %ymm0
-; AVX1-NEXT: vmaskmovpd 32(%rdi), %ymm4, %ymm1
-; AVX1-NEXT: vblendvpd %ymm4, %ymm1, %ymm2, %ymm1
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm4, %ymm0
+; AVX1-NEXT: vmaskmovpd (%rdi), %ymm0, %ymm4
+; AVX1-NEXT: vblendvpd %ymm0, %ymm4, %ymm1, %ymm0
+; AVX1-NEXT: vmaskmovpd 32(%rdi), %ymm3, %ymm1
+; AVX1-NEXT: vblendvpd %ymm3, %ymm1, %ymm2, %ymm1
; AVX1-NEXT: retq
;
; AVX2-LABEL: load_v8f64_v8i16:
; AVX2: ## %bb.0:
-; AVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; AVX2-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7]
-; AVX2-NEXT: vpcmpeqd %xmm3, %xmm4, %xmm4
-; AVX2-NEXT: vpmovsxdq %xmm4, %ymm4
-; AVX2-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; AVX2-NEXT: vpcmpeqd %xmm3, %xmm0, %xmm0
+; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[2,3,0,1]
+; AVX2-NEXT: vpxor %xmm4, %xmm4, %xmm4
+; AVX2-NEXT: vpcmpeqw %xmm4, %xmm3, %xmm3
+; AVX2-NEXT: vpmovsxwd %xmm3, %xmm3
+; AVX2-NEXT: vpmovsxdq %xmm3, %ymm3
+; AVX2-NEXT: vpcmpeqw %xmm4, %xmm0, %xmm0
+; AVX2-NEXT: vpmovsxwd %xmm0, %xmm0
; AVX2-NEXT: vpmovsxdq %xmm0, %ymm0
-; AVX2-NEXT: vmaskmovpd (%rdi), %ymm0, %ymm3
-; AVX2-NEXT: vblendvpd %ymm0, %ymm3, %ymm1, %ymm0
-; AVX2-NEXT: vmaskmovpd 32(%rdi), %ymm4, %ymm1
-; AVX2-NEXT: vblendvpd %ymm4, %ymm1, %ymm2, %ymm1
+; AVX2-NEXT: vmaskmovpd (%rdi), %ymm0, %ymm4
+; AVX2-NEXT: vblendvpd %ymm0, %ymm4, %ymm1, %ymm0
+; AVX2-NEXT: vmaskmovpd 32(%rdi), %ymm3, %ymm1
+; AVX2-NEXT: vblendvpd %ymm3, %ymm1, %ymm2, %ymm1
; AVX2-NEXT: retq
;
; AVX512F-LABEL: load_v8f64_v8i16:
define <2 x float> @load_v2f32_v2i32(<2 x i32> %trigger, <2 x float>* %addr, <2 x float> %dst) {
; SSE2-LABEL: load_v2f32_v2i32:
; SSE2: ## %bb.0:
-; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
; SSE2-NEXT: pxor %xmm2, %xmm2
; SSE2-NEXT: pcmpeqd %xmm0, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,0,3,2]
-; SSE2-NEXT: pand %xmm2, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,0,1,1]
; SSE2-NEXT: movmskpd %xmm0, %eax
; SSE2-NEXT: testb $1, %al
; SSE2-NEXT: jne LBB7_1
; SSE42-LABEL: load_v2f32_v2i32:
; SSE42: ## %bb.0:
; SSE42-NEXT: pxor %xmm2, %xmm2
-; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
-; SSE42-NEXT: pcmpeqq %xmm2, %xmm0
+; SSE42-NEXT: pcmpeqd %xmm0, %xmm2
+; SSE42-NEXT: pmovsxdq %xmm2, %xmm0
; SSE42-NEXT: movmskpd %xmm0, %eax
; SSE42-NEXT: testb $1, %al
; SSE42-NEXT: jne LBB7_1
; SSE42-NEXT: movaps %xmm1, %xmm0
; SSE42-NEXT: retq
;
-; AVX1-LABEL: load_v2f32_v2i32:
-; AVX1: ## %bb.0:
-; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
-; AVX1-NEXT: vpcmpeqq %xmm2, %xmm0, %xmm0
-; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero
-; AVX1-NEXT: vmaskmovps (%rdi), %xmm0, %xmm2
-; AVX1-NEXT: vblendvps %xmm0, %xmm2, %xmm1, %xmm0
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: load_v2f32_v2i32:
-; AVX2: ## %bb.0:
-; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
-; AVX2-NEXT: vpcmpeqq %xmm2, %xmm0, %xmm0
-; AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero
-; AVX2-NEXT: vmaskmovps (%rdi), %xmm0, %xmm2
-; AVX2-NEXT: vblendvps %xmm0, %xmm2, %xmm1, %xmm0
-; AVX2-NEXT: retq
+; AVX1OR2-LABEL: load_v2f32_v2i32:
+; AVX1OR2: ## %bb.0:
+; AVX1OR2-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX1OR2-NEXT: vpcmpeqd %xmm2, %xmm0, %xmm0
+; AVX1OR2-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
+; AVX1OR2-NEXT: vmaskmovps (%rdi), %xmm0, %xmm2
+; AVX1OR2-NEXT: vblendvps %xmm0, %xmm2, %xmm1, %xmm0
+; AVX1OR2-NEXT: retq
;
; AVX512F-LABEL: load_v2f32_v2i32:
; AVX512F: ## %bb.0:
; AVX512F-NEXT: ## kill: def $xmm1 killed $xmm1 def $zmm1
-; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX512F-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
-; AVX512F-NEXT: vptestnmq %zmm0, %zmm0, %k0
+; AVX512F-NEXT: ## kill: def $xmm0 killed $xmm0 def $zmm0
+; AVX512F-NEXT: vptestnmd %zmm0, %zmm0, %k0
; AVX512F-NEXT: kshiftlw $14, %k0, %k0
; AVX512F-NEXT: kshiftrw $14, %k0, %k1
; AVX512F-NEXT: vblendmps (%rdi), %zmm1, %zmm0 {%k1}
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
-; AVX512VL-LABEL: load_v2f32_v2i32:
-; AVX512VL: ## %bb.0:
-; AVX512VL-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
-; AVX512VL-NEXT: vptestnmq %xmm0, %xmm0, %k1
-; AVX512VL-NEXT: vblendmps (%rdi), %xmm1, %xmm0 {%k1}
-; AVX512VL-NEXT: retq
+; AVX512VLDQ-LABEL: load_v2f32_v2i32:
+; AVX512VLDQ: ## %bb.0:
+; AVX512VLDQ-NEXT: vptestnmd %xmm0, %xmm0, %k0
+; AVX512VLDQ-NEXT: kshiftlb $6, %k0, %k0
+; AVX512VLDQ-NEXT: kshiftrb $6, %k0, %k1
+; AVX512VLDQ-NEXT: vblendmps (%rdi), %xmm1, %xmm0 {%k1}
+; AVX512VLDQ-NEXT: retq
+;
+; AVX512VLBW-LABEL: load_v2f32_v2i32:
+; AVX512VLBW: ## %bb.0:
+; AVX512VLBW-NEXT: vptestnmd %xmm0, %xmm0, %k0
+; AVX512VLBW-NEXT: kshiftlw $14, %k0, %k0
+; AVX512VLBW-NEXT: kshiftrw $14, %k0, %k1
+; AVX512VLBW-NEXT: vblendmps (%rdi), %xmm1, %xmm0 {%k1}
+; AVX512VLBW-NEXT: retq
%mask = icmp eq <2 x i32> %trigger, zeroinitializer
%res = call <2 x float> @llvm.masked.load.v2f32.p0v2f32(<2 x float>* %addr, i32 4, <2 x i1> %mask, <2 x float> %dst)
ret <2 x float> %res
define <2 x float> @load_v2f32_v2i32_undef(<2 x i32> %trigger, <2 x float>* %addr) {
; SSE2-LABEL: load_v2f32_v2i32_undef:
; SSE2: ## %bb.0:
-; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
; SSE2-NEXT: pxor %xmm1, %xmm1
; SSE2-NEXT: pcmpeqd %xmm0, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,0,3,2]
-; SSE2-NEXT: pand %xmm1, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,0,1,1]
; SSE2-NEXT: movmskpd %xmm0, %eax
; SSE2-NEXT: testb $1, %al
; SSE2-NEXT: ## implicit-def: $xmm0
; SSE42-LABEL: load_v2f32_v2i32_undef:
; SSE42: ## %bb.0:
; SSE42-NEXT: pxor %xmm1, %xmm1
-; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
-; SSE42-NEXT: pcmpeqq %xmm1, %xmm0
+; SSE42-NEXT: pcmpeqd %xmm0, %xmm1
+; SSE42-NEXT: pmovsxdq %xmm1, %xmm0
; SSE42-NEXT: movmskpd %xmm0, %eax
; SSE42-NEXT: testb $1, %al
; SSE42-NEXT: ## implicit-def: $xmm0
; SSE42-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[2,3]
; SSE42-NEXT: retq
;
-; AVX1-LABEL: load_v2f32_v2i32_undef:
-; AVX1: ## %bb.0:
-; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
-; AVX1-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero
-; AVX1-NEXT: vmaskmovps (%rdi), %xmm0, %xmm0
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: load_v2f32_v2i32_undef:
-; AVX2: ## %bb.0:
-; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
-; AVX2-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero
-; AVX2-NEXT: vmaskmovps (%rdi), %xmm0, %xmm0
-; AVX2-NEXT: retq
+; AVX1OR2-LABEL: load_v2f32_v2i32_undef:
+; AVX1OR2: ## %bb.0:
+; AVX1OR2-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX1OR2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
+; AVX1OR2-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
+; AVX1OR2-NEXT: vmaskmovps (%rdi), %xmm0, %xmm0
+; AVX1OR2-NEXT: retq
;
; AVX512F-LABEL: load_v2f32_v2i32_undef:
; AVX512F: ## %bb.0:
-; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX512F-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
-; AVX512F-NEXT: vptestnmq %zmm0, %zmm0, %k0
+; AVX512F-NEXT: ## kill: def $xmm0 killed $xmm0 def $zmm0
+; AVX512F-NEXT: vptestnmd %zmm0, %zmm0, %k0
; AVX512F-NEXT: kshiftlw $14, %k0, %k0
; AVX512F-NEXT: kshiftrw $14, %k0, %k1
; AVX512F-NEXT: vmovups (%rdi), %zmm0 {%k1} {z}
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
-; AVX512VL-LABEL: load_v2f32_v2i32_undef:
-; AVX512VL: ## %bb.0:
-; AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
-; AVX512VL-NEXT: vptestnmq %xmm0, %xmm0, %k1
-; AVX512VL-NEXT: vmovups (%rdi), %xmm0 {%k1} {z}
-; AVX512VL-NEXT: retq
+; AVX512VLDQ-LABEL: load_v2f32_v2i32_undef:
+; AVX512VLDQ: ## %bb.0:
+; AVX512VLDQ-NEXT: vptestnmd %xmm0, %xmm0, %k0
+; AVX512VLDQ-NEXT: kshiftlb $6, %k0, %k0
+; AVX512VLDQ-NEXT: kshiftrb $6, %k0, %k1
+; AVX512VLDQ-NEXT: vmovups (%rdi), %xmm0 {%k1} {z}
+; AVX512VLDQ-NEXT: retq
+;
+; AVX512VLBW-LABEL: load_v2f32_v2i32_undef:
+; AVX512VLBW: ## %bb.0:
+; AVX512VLBW-NEXT: vptestnmd %xmm0, %xmm0, %k0
+; AVX512VLBW-NEXT: kshiftlw $14, %k0, %k0
+; AVX512VLBW-NEXT: kshiftrw $14, %k0, %k1
+; AVX512VLBW-NEXT: vmovups (%rdi), %xmm0 {%k1} {z}
+; AVX512VLBW-NEXT: retq
%mask = icmp eq <2 x i32> %trigger, zeroinitializer
%res = call <2 x float> @llvm.masked.load.v2f32.p0v2f32(<2 x float>* %addr, i32 4, <2 x i1> %mask, <2 x float>undef)
ret <2 x float> %res
;
; AVX1-LABEL: load_v8i64_v8i16:
; AVX1: ## %bb.0:
-; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7]
-; AVX1-NEXT: vpcmpeqd %xmm3, %xmm4, %xmm4
-; AVX1-NEXT: vpmovsxdq %xmm4, %xmm5
-; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,3,0,1]
-; AVX1-NEXT: vpmovsxdq %xmm4, %xmm4
-; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm5, %ymm4
-; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; AVX1-NEXT: vpcmpeqd %xmm3, %xmm0, %xmm0
-; AVX1-NEXT: vpmovsxdq %xmm0, %xmm3
+; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[2,3,0,1]
+; AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4
+; AVX1-NEXT: vpcmpeqw %xmm4, %xmm3, %xmm3
+; AVX1-NEXT: vpmovsxwd %xmm3, %xmm3
+; AVX1-NEXT: vpmovsxdq %xmm3, %xmm5
+; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,3,0,1]
+; AVX1-NEXT: vpmovsxdq %xmm3, %xmm3
+; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm5, %ymm3
+; AVX1-NEXT: vpcmpeqw %xmm4, %xmm0, %xmm0
+; AVX1-NEXT: vpmovsxwd %xmm0, %xmm0
+; AVX1-NEXT: vpmovsxdq %xmm0, %xmm4
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
; AVX1-NEXT: vpmovsxdq %xmm0, %xmm0
-; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm3, %ymm0
-; AVX1-NEXT: vmaskmovpd (%rdi), %ymm0, %ymm3
-; AVX1-NEXT: vblendvpd %ymm0, %ymm3, %ymm1, %ymm0
-; AVX1-NEXT: vmaskmovpd 32(%rdi), %ymm4, %ymm1
-; AVX1-NEXT: vblendvpd %ymm4, %ymm1, %ymm2, %ymm1
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm4, %ymm0
+; AVX1-NEXT: vmaskmovpd (%rdi), %ymm0, %ymm4
+; AVX1-NEXT: vblendvpd %ymm0, %ymm4, %ymm1, %ymm0
+; AVX1-NEXT: vmaskmovpd 32(%rdi), %ymm3, %ymm1
+; AVX1-NEXT: vblendvpd %ymm3, %ymm1, %ymm2, %ymm1
; AVX1-NEXT: retq
;
; AVX2-LABEL: load_v8i64_v8i16:
; AVX2: ## %bb.0:
-; AVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; AVX2-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7]
-; AVX2-NEXT: vpcmpeqd %xmm3, %xmm4, %xmm4
-; AVX2-NEXT: vpmovsxdq %xmm4, %ymm4
-; AVX2-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; AVX2-NEXT: vpcmpeqd %xmm3, %xmm0, %xmm0
+; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[2,3,0,1]
+; AVX2-NEXT: vpxor %xmm4, %xmm4, %xmm4
+; AVX2-NEXT: vpcmpeqw %xmm4, %xmm3, %xmm3
+; AVX2-NEXT: vpmovsxwd %xmm3, %xmm3
+; AVX2-NEXT: vpmovsxdq %xmm3, %ymm3
+; AVX2-NEXT: vpcmpeqw %xmm4, %xmm0, %xmm0
+; AVX2-NEXT: vpmovsxwd %xmm0, %xmm0
; AVX2-NEXT: vpmovsxdq %xmm0, %ymm0
-; AVX2-NEXT: vpmaskmovq (%rdi), %ymm0, %ymm3
-; AVX2-NEXT: vblendvpd %ymm0, %ymm3, %ymm1, %ymm0
-; AVX2-NEXT: vpmaskmovq 32(%rdi), %ymm4, %ymm1
-; AVX2-NEXT: vblendvpd %ymm4, %ymm1, %ymm2, %ymm1
+; AVX2-NEXT: vpmaskmovq (%rdi), %ymm0, %ymm4
+; AVX2-NEXT: vblendvpd %ymm0, %ymm4, %ymm1, %ymm0
+; AVX2-NEXT: vpmaskmovq 32(%rdi), %ymm3, %ymm1
+; AVX2-NEXT: vblendvpd %ymm3, %ymm1, %ymm2, %ymm1
; AVX2-NEXT: retq
;
; AVX512F-LABEL: load_v8i64_v8i16:
define <2 x i32> @load_v2i32_v2i32(<2 x i32> %trigger, <2 x i32>* %addr, <2 x i32> %dst) {
; SSE2-LABEL: load_v2i32_v2i32:
; SSE2: ## %bb.0:
-; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
; SSE2-NEXT: pxor %xmm2, %xmm2
; SSE2-NEXT: pcmpeqd %xmm0, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,0,3,2]
-; SSE2-NEXT: pand %xmm2, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,0,1,1]
; SSE2-NEXT: movmskpd %xmm0, %eax
; SSE2-NEXT: testb $1, %al
; SSE2-NEXT: jne LBB17_1
; SSE2-NEXT: testb $2, %al
; SSE2-NEXT: jne LBB17_3
; SSE2-NEXT: LBB17_4: ## %else2
-; SSE2-NEXT: movapd %xmm1, %xmm0
+; SSE2-NEXT: movaps %xmm1, %xmm0
; SSE2-NEXT: retq
; SSE2-NEXT: LBB17_1: ## %cond.load
-; SSE2-NEXT: movl (%rdi), %ecx
-; SSE2-NEXT: movq %rcx, %xmm0
-; SSE2-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
+; SSE2-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE2-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
; SSE2-NEXT: testb $2, %al
; SSE2-NEXT: je LBB17_4
; SSE2-NEXT: LBB17_3: ## %cond.load1
-; SSE2-NEXT: movl 4(%rdi), %eax
-; SSE2-NEXT: movq %rax, %xmm0
-; SSE2-NEXT: unpcklpd {{.*#+}} xmm1 = xmm1[0],xmm0[0]
-; SSE2-NEXT: movapd %xmm1, %xmm0
+; SSE2-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[0,0]
+; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3]
+; SSE2-NEXT: movaps %xmm0, %xmm1
+; SSE2-NEXT: movaps %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE42-LABEL: load_v2i32_v2i32:
; SSE42: ## %bb.0:
; SSE42-NEXT: pxor %xmm2, %xmm2
-; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
-; SSE42-NEXT: pcmpeqq %xmm2, %xmm0
+; SSE42-NEXT: pcmpeqd %xmm0, %xmm2
+; SSE42-NEXT: pmovsxdq %xmm2, %xmm0
; SSE42-NEXT: movmskpd %xmm0, %eax
; SSE42-NEXT: testb $1, %al
; SSE42-NEXT: jne LBB17_1
; SSE42-NEXT: movdqa %xmm1, %xmm0
; SSE42-NEXT: retq
; SSE42-NEXT: LBB17_1: ## %cond.load
-; SSE42-NEXT: movl (%rdi), %ecx
-; SSE42-NEXT: pinsrq $0, %rcx, %xmm1
+; SSE42-NEXT: pinsrd $0, (%rdi), %xmm1
; SSE42-NEXT: testb $2, %al
; SSE42-NEXT: je LBB17_4
; SSE42-NEXT: LBB17_3: ## %cond.load1
-; SSE42-NEXT: movl 4(%rdi), %eax
-; SSE42-NEXT: pinsrq $1, %rax, %xmm1
+; SSE42-NEXT: pinsrd $1, 4(%rdi), %xmm1
; SSE42-NEXT: movdqa %xmm1, %xmm0
; SSE42-NEXT: retq
;
; AVX1-LABEL: load_v2i32_v2i32:
; AVX1: ## %bb.0:
; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
-; AVX1-NEXT: vpcmpeqq %xmm2, %xmm0, %xmm0
-; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero
+; AVX1-NEXT: vpcmpeqd %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
; AVX1-NEXT: vmaskmovps (%rdi), %xmm0, %xmm2
-; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,2,2,3]
; AVX1-NEXT: vblendvps %xmm0, %xmm2, %xmm1, %xmm0
-; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
; AVX1-NEXT: retq
;
; AVX2-LABEL: load_v2i32_v2i32:
; AVX2: ## %bb.0:
; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
-; AVX2-NEXT: vpcmpeqq %xmm2, %xmm0, %xmm0
-; AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero
+; AVX2-NEXT: vpcmpeqd %xmm2, %xmm0, %xmm0
+; AVX2-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
; AVX2-NEXT: vpmaskmovd (%rdi), %xmm0, %xmm2
-; AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,2,2,3]
; AVX2-NEXT: vblendvps %xmm0, %xmm2, %xmm1, %xmm0
-; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
; AVX2-NEXT: retq
;
; AVX512F-LABEL: load_v2i32_v2i32:
; AVX512F: ## %bb.0:
-; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX512F-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
-; AVX512F-NEXT: vptestnmq %zmm0, %zmm0, %k0
-; AVX512F-NEXT: vpshufd {{.*#+}} xmm0 = xmm1[0,2,2,3]
+; AVX512F-NEXT: ## kill: def $xmm1 killed $xmm1 def $zmm1
+; AVX512F-NEXT: ## kill: def $xmm0 killed $xmm0 def $zmm0
+; AVX512F-NEXT: vptestnmd %zmm0, %zmm0, %k0
; AVX512F-NEXT: kshiftlw $14, %k0, %k0
; AVX512F-NEXT: kshiftrw $14, %k0, %k1
-; AVX512F-NEXT: vmovdqu32 (%rdi), %zmm0 {%k1}
-; AVX512F-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
+; AVX512F-NEXT: vpblendmd (%rdi), %zmm1, %zmm0 {%k1}
+; AVX512F-NEXT: ## kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
-; AVX512VL-LABEL: load_v2i32_v2i32:
-; AVX512VL: ## %bb.0:
-; AVX512VL-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
-; AVX512VL-NEXT: vptestnmq %xmm0, %xmm0, %k1
-; AVX512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm1[0,2,2,3]
-; AVX512VL-NEXT: vmovdqu32 (%rdi), %xmm0 {%k1}
-; AVX512VL-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
-; AVX512VL-NEXT: retq
+; AVX512VLDQ-LABEL: load_v2i32_v2i32:
+; AVX512VLDQ: ## %bb.0:
+; AVX512VLDQ-NEXT: vptestnmd %xmm0, %xmm0, %k0
+; AVX512VLDQ-NEXT: kshiftlb $6, %k0, %k0
+; AVX512VLDQ-NEXT: kshiftrb $6, %k0, %k1
+; AVX512VLDQ-NEXT: vpblendmd (%rdi), %xmm1, %xmm0 {%k1}
+; AVX512VLDQ-NEXT: retq
+;
+; AVX512VLBW-LABEL: load_v2i32_v2i32:
+; AVX512VLBW: ## %bb.0:
+; AVX512VLBW-NEXT: vptestnmd %xmm0, %xmm0, %k0
+; AVX512VLBW-NEXT: kshiftlw $14, %k0, %k0
+; AVX512VLBW-NEXT: kshiftrw $14, %k0, %k1
+; AVX512VLBW-NEXT: vpblendmd (%rdi), %xmm1, %xmm0 {%k1}
+; AVX512VLBW-NEXT: retq
%mask = icmp eq <2 x i32> %trigger, zeroinitializer
%res = call <2 x i32> @llvm.masked.load.v2i32.p0v2i32(<2 x i32>* %addr, i32 4, <2 x i1> %mask, <2 x i32> %dst)
ret <2 x i32> %res
define void @store_v2f32_v2i32(<2 x i32> %trigger, <2 x float>* %addr, <2 x float> %val) {
; SSE2-LABEL: store_v2f32_v2i32:
; SSE2: ## %bb.0:
-; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
; SSE2-NEXT: pxor %xmm2, %xmm2
; SSE2-NEXT: pcmpeqd %xmm0, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,0,3,2]
-; SSE2-NEXT: pand %xmm2, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,0,1,1]
; SSE2-NEXT: movmskpd %xmm0, %eax
; SSE2-NEXT: testb $1, %al
; SSE2-NEXT: jne LBB3_1
; SSE4-LABEL: store_v2f32_v2i32:
; SSE4: ## %bb.0:
; SSE4-NEXT: pxor %xmm2, %xmm2
-; SSE4-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
-; SSE4-NEXT: pcmpeqq %xmm2, %xmm0
+; SSE4-NEXT: pcmpeqd %xmm0, %xmm2
+; SSE4-NEXT: pmovsxdq %xmm2, %xmm0
; SSE4-NEXT: movmskpd %xmm0, %eax
; SSE4-NEXT: testb $1, %al
; SSE4-NEXT: jne LBB3_1
; SSE4-NEXT: extractps $1, %xmm1, 4(%rdi)
; SSE4-NEXT: retq
;
-; AVX1-LABEL: store_v2f32_v2i32:
-; AVX1: ## %bb.0:
-; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
-; AVX1-NEXT: vpcmpeqq %xmm2, %xmm0, %xmm0
-; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero
-; AVX1-NEXT: vmaskmovps %xmm1, %xmm0, (%rdi)
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: store_v2f32_v2i32:
-; AVX2: ## %bb.0:
-; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
-; AVX2-NEXT: vpcmpeqq %xmm2, %xmm0, %xmm0
-; AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero
-; AVX2-NEXT: vmaskmovps %xmm1, %xmm0, (%rdi)
-; AVX2-NEXT: retq
+; AVX1OR2-LABEL: store_v2f32_v2i32:
+; AVX1OR2: ## %bb.0:
+; AVX1OR2-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX1OR2-NEXT: vpcmpeqd %xmm2, %xmm0, %xmm0
+; AVX1OR2-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
+; AVX1OR2-NEXT: vmaskmovps %xmm1, %xmm0, (%rdi)
+; AVX1OR2-NEXT: retq
;
; AVX512F-LABEL: store_v2f32_v2i32:
; AVX512F: ## %bb.0:
; AVX512F-NEXT: ## kill: def $xmm1 killed $xmm1 def $zmm1
-; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX512F-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
-; AVX512F-NEXT: vptestnmq %zmm0, %zmm0, %k0
+; AVX512F-NEXT: ## kill: def $xmm0 killed $xmm0 def $zmm0
+; AVX512F-NEXT: vptestnmd %zmm0, %zmm0, %k0
; AVX512F-NEXT: kshiftlw $14, %k0, %k0
; AVX512F-NEXT: kshiftrw $14, %k0, %k1
; AVX512F-NEXT: vmovups %zmm1, (%rdi) {%k1}
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
-; AVX512VL-LABEL: store_v2f32_v2i32:
-; AVX512VL: ## %bb.0:
-; AVX512VL-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
-; AVX512VL-NEXT: vptestnmq %xmm0, %xmm0, %k1
-; AVX512VL-NEXT: vmovups %xmm1, (%rdi) {%k1}
-; AVX512VL-NEXT: retq
+; AVX512VLDQ-LABEL: store_v2f32_v2i32:
+; AVX512VLDQ: ## %bb.0:
+; AVX512VLDQ-NEXT: vptestnmd %xmm0, %xmm0, %k0
+; AVX512VLDQ-NEXT: kshiftlb $6, %k0, %k0
+; AVX512VLDQ-NEXT: kshiftrb $6, %k0, %k1
+; AVX512VLDQ-NEXT: vmovups %xmm1, (%rdi) {%k1}
+; AVX512VLDQ-NEXT: retq
+;
+; AVX512VLBW-LABEL: store_v2f32_v2i32:
+; AVX512VLBW: ## %bb.0:
+; AVX512VLBW-NEXT: vptestnmd %xmm0, %xmm0, %k0
+; AVX512VLBW-NEXT: kshiftlw $14, %k0, %k0
+; AVX512VLBW-NEXT: kshiftrw $14, %k0, %k1
+; AVX512VLBW-NEXT: vmovups %xmm1, (%rdi) {%k1}
+; AVX512VLBW-NEXT: retq
%mask = icmp eq <2 x i32> %trigger, zeroinitializer
call void @llvm.masked.store.v2f32.p0v2f32(<2 x float> %val, <2 x float>* %addr, i32 4, <2 x i1> %mask)
ret void
define void @store_v2i32_v2i32(<2 x i32> %trigger, <2 x i32>* %addr, <2 x i32> %val) {
; SSE2-LABEL: store_v2i32_v2i32:
; SSE2: ## %bb.0:
-; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
; SSE2-NEXT: pxor %xmm2, %xmm2
; SSE2-NEXT: pcmpeqd %xmm0, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,0,3,2]
-; SSE2-NEXT: pand %xmm2, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,0,1,1]
; SSE2-NEXT: movmskpd %xmm0, %eax
; SSE2-NEXT: testb $1, %al
; SSE2-NEXT: jne LBB10_1
; SSE2-NEXT: testb $2, %al
; SSE2-NEXT: je LBB10_4
; SSE2-NEXT: LBB10_3: ## %cond.store1
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
; SSE2-NEXT: movd %xmm0, 4(%rdi)
; SSE2-NEXT: retq
;
; SSE4-LABEL: store_v2i32_v2i32:
; SSE4: ## %bb.0:
; SSE4-NEXT: pxor %xmm2, %xmm2
-; SSE4-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
-; SSE4-NEXT: pcmpeqq %xmm2, %xmm0
+; SSE4-NEXT: pcmpeqd %xmm0, %xmm2
+; SSE4-NEXT: pmovsxdq %xmm2, %xmm0
; SSE4-NEXT: movmskpd %xmm0, %eax
; SSE4-NEXT: testb $1, %al
; SSE4-NEXT: jne LBB10_1
; SSE4-NEXT: testb $2, %al
; SSE4-NEXT: je LBB10_4
; SSE4-NEXT: LBB10_3: ## %cond.store1
-; SSE4-NEXT: extractps $2, %xmm1, 4(%rdi)
+; SSE4-NEXT: extractps $1, %xmm1, 4(%rdi)
; SSE4-NEXT: retq
;
; AVX1-LABEL: store_v2i32_v2i32:
; AVX1: ## %bb.0:
; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
-; AVX1-NEXT: vpcmpeqq %xmm2, %xmm0, %xmm0
-; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero
-; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; AVX1-NEXT: vpcmpeqd %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
; AVX1-NEXT: vmaskmovps %xmm1, %xmm0, (%rdi)
; AVX1-NEXT: retq
;
; AVX2-LABEL: store_v2i32_v2i32:
; AVX2: ## %bb.0:
; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
-; AVX2-NEXT: vpcmpeqq %xmm2, %xmm0, %xmm0
-; AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; AVX2-NEXT: vpcmpeqd %xmm2, %xmm0, %xmm0
+; AVX2-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
; AVX2-NEXT: vpmaskmovd %xmm1, %xmm0, (%rdi)
; AVX2-NEXT: retq
;
; AVX512F-LABEL: store_v2i32_v2i32:
; AVX512F: ## %bb.0:
-; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX512F-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
-; AVX512F-NEXT: vptestnmq %zmm0, %zmm0, %k0
-; AVX512F-NEXT: vpshufd {{.*#+}} xmm0 = xmm1[0,2,2,3]
+; AVX512F-NEXT: ## kill: def $xmm1 killed $xmm1 def $zmm1
+; AVX512F-NEXT: ## kill: def $xmm0 killed $xmm0 def $zmm0
+; AVX512F-NEXT: vptestnmd %zmm0, %zmm0, %k0
; AVX512F-NEXT: kshiftlw $14, %k0, %k0
; AVX512F-NEXT: kshiftrw $14, %k0, %k1
-; AVX512F-NEXT: vmovdqu32 %zmm0, (%rdi) {%k1}
+; AVX512F-NEXT: vmovdqu32 %zmm1, (%rdi) {%k1}
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
-; AVX512VL-LABEL: store_v2i32_v2i32:
-; AVX512VL: ## %bb.0:
-; AVX512VL-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
-; AVX512VL-NEXT: vptestnmq %xmm0, %xmm0, %k1
-; AVX512VL-NEXT: vpmovqd %xmm1, (%rdi) {%k1}
-; AVX512VL-NEXT: retq
+; AVX512VLDQ-LABEL: store_v2i32_v2i32:
+; AVX512VLDQ: ## %bb.0:
+; AVX512VLDQ-NEXT: vptestnmd %xmm0, %xmm0, %k0
+; AVX512VLDQ-NEXT: kshiftlb $6, %k0, %k0
+; AVX512VLDQ-NEXT: kshiftrb $6, %k0, %k1
+; AVX512VLDQ-NEXT: vmovdqu32 %xmm1, (%rdi) {%k1}
+; AVX512VLDQ-NEXT: retq
+;
+; AVX512VLBW-LABEL: store_v2i32_v2i32:
+; AVX512VLBW: ## %bb.0:
+; AVX512VLBW-NEXT: vptestnmd %xmm0, %xmm0, %k0
+; AVX512VLBW-NEXT: kshiftlw $14, %k0, %k0
+; AVX512VLBW-NEXT: kshiftrw $14, %k0, %k1
+; AVX512VLBW-NEXT: vmovdqu32 %xmm1, (%rdi) {%k1}
+; AVX512VLBW-NEXT: retq
%mask = icmp eq <2 x i32> %trigger, zeroinitializer
call void @llvm.masked.store.v2i32.p0v2i32(<2 x i32> %val, <2 x i32>* %addr, i32 4, <2 x i1> %mask)
ret void
; SSE2-LABEL: truncstore_v8i64_v8i8:
; SSE2: # %bb.0:
; SSE2-NEXT: pxor %xmm6, %xmm6
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; SSE2-NEXT: pshuflw {{.*#+}} xmm7 = xmm0[0,2,2,3,4,5,6,7]
-; SSE2-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm1[0],xmm7[1],xmm1[1]
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3]
-; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[0,1,0,2,4,5,6,7]
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
-; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7]
-; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm7[0],xmm0[1]
+; SSE2-NEXT: movdqa {{.*#+}} xmm7 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
+; SSE2-NEXT: pand %xmm7, %xmm3
+; SSE2-NEXT: pand %xmm7, %xmm2
+; SSE2-NEXT: packuswb %xmm3, %xmm2
+; SSE2-NEXT: pand %xmm7, %xmm1
+; SSE2-NEXT: pand %xmm7, %xmm0
+; SSE2-NEXT: packuswb %xmm1, %xmm0
+; SSE2-NEXT: packuswb %xmm2, %xmm0
+; SSE2-NEXT: packuswb %xmm0, %xmm0
; SSE2-NEXT: pcmpeqd %xmm6, %xmm5
; SSE2-NEXT: pcmpeqd %xmm1, %xmm1
; SSE2-NEXT: pxor %xmm1, %xmm5
; SSE2-NEXT: jne .LBB2_5
; SSE2-NEXT: .LBB2_6: # %else4
; SSE2-NEXT: testb $8, %al
-; SSE2-NEXT: jne .LBB2_7
+; SSE2-NEXT: je .LBB2_8
+; SSE2-NEXT: .LBB2_7: # %cond.store5
+; SSE2-NEXT: shrl $24, %ecx
+; SSE2-NEXT: movb %cl, 3(%rdi)
; SSE2-NEXT: .LBB2_8: # %else6
; SSE2-NEXT: testb $16, %al
-; SSE2-NEXT: jne .LBB2_9
+; SSE2-NEXT: pextrw $2, %xmm0, %ecx
+; SSE2-NEXT: je .LBB2_10
+; SSE2-NEXT: # %bb.9: # %cond.store7
+; SSE2-NEXT: movb %cl, 4(%rdi)
; SSE2-NEXT: .LBB2_10: # %else8
; SSE2-NEXT: testb $32, %al
-; SSE2-NEXT: jne .LBB2_11
+; SSE2-NEXT: je .LBB2_12
+; SSE2-NEXT: # %bb.11: # %cond.store9
+; SSE2-NEXT: movb %ch, 5(%rdi)
; SSE2-NEXT: .LBB2_12: # %else10
; SSE2-NEXT: testb $64, %al
+; SSE2-NEXT: pextrw $3, %xmm0, %ecx
; SSE2-NEXT: jne .LBB2_13
-; SSE2-NEXT: .LBB2_14: # %else12
+; SSE2-NEXT: # %bb.14: # %else12
; SSE2-NEXT: testb $-128, %al
; SSE2-NEXT: jne .LBB2_15
; SSE2-NEXT: .LBB2_16: # %else14
; SSE2-NEXT: testb $2, %al
; SSE2-NEXT: je .LBB2_4
; SSE2-NEXT: .LBB2_3: # %cond.store1
-; SSE2-NEXT: shrl $16, %ecx
-; SSE2-NEXT: movb %cl, 1(%rdi)
+; SSE2-NEXT: movb %ch, 1(%rdi)
; SSE2-NEXT: testb $4, %al
; SSE2-NEXT: je .LBB2_6
; SSE2-NEXT: .LBB2_5: # %cond.store3
-; SSE2-NEXT: pextrw $2, %xmm0, %ecx
-; SSE2-NEXT: movb %cl, 2(%rdi)
+; SSE2-NEXT: movl %ecx, %edx
+; SSE2-NEXT: shrl $16, %edx
+; SSE2-NEXT: movb %dl, 2(%rdi)
; SSE2-NEXT: testb $8, %al
-; SSE2-NEXT: je .LBB2_8
-; SSE2-NEXT: .LBB2_7: # %cond.store5
-; SSE2-NEXT: pextrw $3, %xmm0, %ecx
-; SSE2-NEXT: movb %cl, 3(%rdi)
-; SSE2-NEXT: testb $16, %al
-; SSE2-NEXT: je .LBB2_10
-; SSE2-NEXT: .LBB2_9: # %cond.store7
-; SSE2-NEXT: pextrw $4, %xmm0, %ecx
-; SSE2-NEXT: movb %cl, 4(%rdi)
-; SSE2-NEXT: testb $32, %al
-; SSE2-NEXT: je .LBB2_12
-; SSE2-NEXT: .LBB2_11: # %cond.store9
-; SSE2-NEXT: pextrw $5, %xmm0, %ecx
-; SSE2-NEXT: movb %cl, 5(%rdi)
-; SSE2-NEXT: testb $64, %al
-; SSE2-NEXT: je .LBB2_14
+; SSE2-NEXT: jne .LBB2_7
+; SSE2-NEXT: jmp .LBB2_8
; SSE2-NEXT: .LBB2_13: # %cond.store11
-; SSE2-NEXT: pextrw $6, %xmm0, %ecx
; SSE2-NEXT: movb %cl, 6(%rdi)
; SSE2-NEXT: testb $-128, %al
; SSE2-NEXT: je .LBB2_16
; SSE2-NEXT: .LBB2_15: # %cond.store13
-; SSE2-NEXT: pextrw $7, %xmm0, %eax
-; SSE2-NEXT: movb %al, 7(%rdi)
+; SSE2-NEXT: movb %ch, 7(%rdi)
; SSE2-NEXT: retq
;
; SSE4-LABEL: truncstore_v8i64_v8i8:
; SSE4: # %bb.0:
; SSE4-NEXT: pxor %xmm6, %xmm6
-; SSE4-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0],xmm6[1,2,3],xmm3[4],xmm6[5,6,7]
-; SSE4-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0],xmm6[1,2,3],xmm2[4],xmm6[5,6,7]
+; SSE4-NEXT: movdqa {{.*#+}} xmm7 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
+; SSE4-NEXT: pand %xmm7, %xmm3
+; SSE4-NEXT: pand %xmm7, %xmm2
; SSE4-NEXT: packusdw %xmm3, %xmm2
-; SSE4-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0],xmm6[1,2,3],xmm1[4],xmm6[5,6,7]
-; SSE4-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],xmm6[1,2,3],xmm0[4],xmm6[5,6,7]
+; SSE4-NEXT: pand %xmm7, %xmm1
+; SSE4-NEXT: pand %xmm7, %xmm0
; SSE4-NEXT: packusdw %xmm1, %xmm0
; SSE4-NEXT: packusdw %xmm2, %xmm0
+; SSE4-NEXT: packuswb %xmm0, %xmm0
; SSE4-NEXT: pcmpeqd %xmm6, %xmm5
; SSE4-NEXT: pcmpeqd %xmm1, %xmm1
; SSE4-NEXT: pxor %xmm1, %xmm5
; SSE4-NEXT: testb $2, %al
; SSE4-NEXT: je .LBB2_4
; SSE4-NEXT: .LBB2_3: # %cond.store1
-; SSE4-NEXT: pextrb $2, %xmm0, 1(%rdi)
+; SSE4-NEXT: pextrb $1, %xmm0, 1(%rdi)
; SSE4-NEXT: testb $4, %al
; SSE4-NEXT: je .LBB2_6
; SSE4-NEXT: .LBB2_5: # %cond.store3
-; SSE4-NEXT: pextrb $4, %xmm0, 2(%rdi)
+; SSE4-NEXT: pextrb $2, %xmm0, 2(%rdi)
; SSE4-NEXT: testb $8, %al
; SSE4-NEXT: je .LBB2_8
; SSE4-NEXT: .LBB2_7: # %cond.store5
-; SSE4-NEXT: pextrb $6, %xmm0, 3(%rdi)
+; SSE4-NEXT: pextrb $3, %xmm0, 3(%rdi)
; SSE4-NEXT: testb $16, %al
; SSE4-NEXT: je .LBB2_10
; SSE4-NEXT: .LBB2_9: # %cond.store7
-; SSE4-NEXT: pextrb $8, %xmm0, 4(%rdi)
+; SSE4-NEXT: pextrb $4, %xmm0, 4(%rdi)
; SSE4-NEXT: testb $32, %al
; SSE4-NEXT: je .LBB2_12
; SSE4-NEXT: .LBB2_11: # %cond.store9
-; SSE4-NEXT: pextrb $10, %xmm0, 5(%rdi)
+; SSE4-NEXT: pextrb $5, %xmm0, 5(%rdi)
; SSE4-NEXT: testb $64, %al
; SSE4-NEXT: je .LBB2_14
; SSE4-NEXT: .LBB2_13: # %cond.store11
-; SSE4-NEXT: pextrb $12, %xmm0, 6(%rdi)
+; SSE4-NEXT: pextrb $6, %xmm0, 6(%rdi)
; SSE4-NEXT: testb $-128, %al
; SSE4-NEXT: je .LBB2_16
; SSE4-NEXT: .LBB2_15: # %cond.store13
-; SSE4-NEXT: pextrb $14, %xmm0, 7(%rdi)
+; SSE4-NEXT: pextrb $7, %xmm0, 7(%rdi)
; SSE4-NEXT: retq
;
; AVX1-LABEL: truncstore_v8i64_v8i8:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovaps {{.*#+}} ymm3 = [65535,65535,65535,65535]
+; AVX1-NEXT: vmovaps {{.*#+}} ymm3 = [255,255,255,255]
; AVX1-NEXT: vandps %ymm3, %ymm1, %ymm1
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4
; AVX1-NEXT: vpackusdw %xmm4, %xmm1, %xmm1
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT: vpackusdw %xmm3, %xmm0, %xmm0
; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpackuswb %xmm0, %xmm0, %xmm0
; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm1
; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
; AVX1-NEXT: vpcmpeqd %xmm3, %xmm1, %xmm1
; AVX1-NEXT: testb $2, %al
; AVX1-NEXT: je .LBB2_4
; AVX1-NEXT: .LBB2_3: # %cond.store1
-; AVX1-NEXT: vpextrb $2, %xmm0, 1(%rdi)
+; AVX1-NEXT: vpextrb $1, %xmm0, 1(%rdi)
; AVX1-NEXT: testb $4, %al
; AVX1-NEXT: je .LBB2_6
; AVX1-NEXT: .LBB2_5: # %cond.store3
-; AVX1-NEXT: vpextrb $4, %xmm0, 2(%rdi)
+; AVX1-NEXT: vpextrb $2, %xmm0, 2(%rdi)
; AVX1-NEXT: testb $8, %al
; AVX1-NEXT: je .LBB2_8
; AVX1-NEXT: .LBB2_7: # %cond.store5
-; AVX1-NEXT: vpextrb $6, %xmm0, 3(%rdi)
+; AVX1-NEXT: vpextrb $3, %xmm0, 3(%rdi)
; AVX1-NEXT: testb $16, %al
; AVX1-NEXT: je .LBB2_10
; AVX1-NEXT: .LBB2_9: # %cond.store7
-; AVX1-NEXT: vpextrb $8, %xmm0, 4(%rdi)
+; AVX1-NEXT: vpextrb $4, %xmm0, 4(%rdi)
; AVX1-NEXT: testb $32, %al
; AVX1-NEXT: je .LBB2_12
; AVX1-NEXT: .LBB2_11: # %cond.store9
-; AVX1-NEXT: vpextrb $10, %xmm0, 5(%rdi)
+; AVX1-NEXT: vpextrb $5, %xmm0, 5(%rdi)
; AVX1-NEXT: testb $64, %al
; AVX1-NEXT: je .LBB2_14
; AVX1-NEXT: .LBB2_13: # %cond.store11
-; AVX1-NEXT: vpextrb $12, %xmm0, 6(%rdi)
+; AVX1-NEXT: vpextrb $6, %xmm0, 6(%rdi)
; AVX1-NEXT: testb $-128, %al
; AVX1-NEXT: je .LBB2_16
; AVX1-NEXT: .LBB2_15: # %cond.store13
-; AVX1-NEXT: vpextrb $14, %xmm0, 7(%rdi)
+; AVX1-NEXT: vpextrb $7, %xmm0, 7(%rdi)
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: truncstore_v8i64_v8i8:
; AVX2: # %bb.0:
; AVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; AVX2-NEXT: vextractf128 $1, %ymm1, %xmm4
-; AVX2-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm4[0,2]
-; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm4
-; AVX2-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm4[0,2]
-; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
-; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm4
+; AVX2-NEXT: vmovdqa {{.*#+}} xmm5 = <u,u,0,8,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-NEXT: vpshufb %xmm5, %xmm4, %xmm4
+; AVX2-NEXT: vpshufb %xmm5, %xmm1, %xmm1
+; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3]
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm4
+; AVX2-NEXT: vmovdqa {{.*#+}} xmm5 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-NEXT: vpshufb %xmm5, %xmm4, %xmm4
+; AVX2-NEXT: vpshufb %xmm5, %xmm0, %xmm0
+; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3]
+; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
; AVX2-NEXT: vpcmpeqd %ymm3, %ymm2, %ymm1
; AVX2-NEXT: vmovmskps %ymm1, %eax
; AVX2-NEXT: notl %eax
; AVX2-NEXT: testb $2, %al
; AVX2-NEXT: je .LBB2_4
; AVX2-NEXT: .LBB2_3: # %cond.store1
-; AVX2-NEXT: vpextrb $2, %xmm0, 1(%rdi)
+; AVX2-NEXT: vpextrb $1, %xmm0, 1(%rdi)
; AVX2-NEXT: testb $4, %al
; AVX2-NEXT: je .LBB2_6
; AVX2-NEXT: .LBB2_5: # %cond.store3
-; AVX2-NEXT: vpextrb $4, %xmm0, 2(%rdi)
+; AVX2-NEXT: vpextrb $2, %xmm0, 2(%rdi)
; AVX2-NEXT: testb $8, %al
; AVX2-NEXT: je .LBB2_8
; AVX2-NEXT: .LBB2_7: # %cond.store5
-; AVX2-NEXT: vpextrb $6, %xmm0, 3(%rdi)
+; AVX2-NEXT: vpextrb $3, %xmm0, 3(%rdi)
; AVX2-NEXT: testb $16, %al
; AVX2-NEXT: je .LBB2_10
; AVX2-NEXT: .LBB2_9: # %cond.store7
-; AVX2-NEXT: vpextrb $8, %xmm0, 4(%rdi)
+; AVX2-NEXT: vpextrb $4, %xmm0, 4(%rdi)
; AVX2-NEXT: testb $32, %al
; AVX2-NEXT: je .LBB2_12
; AVX2-NEXT: .LBB2_11: # %cond.store9
-; AVX2-NEXT: vpextrb $10, %xmm0, 5(%rdi)
+; AVX2-NEXT: vpextrb $5, %xmm0, 5(%rdi)
; AVX2-NEXT: testb $64, %al
; AVX2-NEXT: je .LBB2_14
; AVX2-NEXT: .LBB2_13: # %cond.store11
-; AVX2-NEXT: vpextrb $12, %xmm0, 6(%rdi)
+; AVX2-NEXT: vpextrb $6, %xmm0, 6(%rdi)
; AVX2-NEXT: testb $-128, %al
; AVX2-NEXT: je .LBB2_16
; AVX2-NEXT: .LBB2_15: # %cond.store13
-; AVX2-NEXT: vpextrb $14, %xmm0, 7(%rdi)
+; AVX2-NEXT: vpextrb $7, %xmm0, 7(%rdi)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512F: # %bb.0:
; AVX512F-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1
; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k0
-; AVX512F-NEXT: vpmovqw %zmm0, %xmm0
+; AVX512F-NEXT: vpmovqb %zmm0, %xmm0
; AVX512F-NEXT: kmovw %k0, %eax
; AVX512F-NEXT: testb $1, %al
; AVX512F-NEXT: jne .LBB2_1
; AVX512F-NEXT: testb $2, %al
; AVX512F-NEXT: je .LBB2_4
; AVX512F-NEXT: .LBB2_3: # %cond.store1
-; AVX512F-NEXT: vpextrb $2, %xmm0, 1(%rdi)
+; AVX512F-NEXT: vpextrb $1, %xmm0, 1(%rdi)
; AVX512F-NEXT: testb $4, %al
; AVX512F-NEXT: je .LBB2_6
; AVX512F-NEXT: .LBB2_5: # %cond.store3
-; AVX512F-NEXT: vpextrb $4, %xmm0, 2(%rdi)
+; AVX512F-NEXT: vpextrb $2, %xmm0, 2(%rdi)
; AVX512F-NEXT: testb $8, %al
; AVX512F-NEXT: je .LBB2_8
; AVX512F-NEXT: .LBB2_7: # %cond.store5
-; AVX512F-NEXT: vpextrb $6, %xmm0, 3(%rdi)
+; AVX512F-NEXT: vpextrb $3, %xmm0, 3(%rdi)
; AVX512F-NEXT: testb $16, %al
; AVX512F-NEXT: je .LBB2_10
; AVX512F-NEXT: .LBB2_9: # %cond.store7
-; AVX512F-NEXT: vpextrb $8, %xmm0, 4(%rdi)
+; AVX512F-NEXT: vpextrb $4, %xmm0, 4(%rdi)
; AVX512F-NEXT: testb $32, %al
; AVX512F-NEXT: je .LBB2_12
; AVX512F-NEXT: .LBB2_11: # %cond.store9
-; AVX512F-NEXT: vpextrb $10, %xmm0, 5(%rdi)
+; AVX512F-NEXT: vpextrb $5, %xmm0, 5(%rdi)
; AVX512F-NEXT: testb $64, %al
; AVX512F-NEXT: je .LBB2_14
; AVX512F-NEXT: .LBB2_13: # %cond.store11
-; AVX512F-NEXT: vpextrb $12, %xmm0, 6(%rdi)
+; AVX512F-NEXT: vpextrb $6, %xmm0, 6(%rdi)
; AVX512F-NEXT: testb $-128, %al
; AVX512F-NEXT: je .LBB2_16
; AVX512F-NEXT: .LBB2_15: # %cond.store13
-; AVX512F-NEXT: vpextrb $14, %xmm0, 7(%rdi)
+; AVX512F-NEXT: vpextrb $7, %xmm0, 7(%rdi)
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; SSE2-LABEL: truncstore_v4i64_v4i16:
; SSE2: # %bb.0:
; SSE2-NEXT: pxor %xmm3, %xmm3
-; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
+; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE2-NEXT: pcmpeqd %xmm2, %xmm3
; SSE2-NEXT: movmskps %xmm3, %eax
; SSE2-NEXT: xorl $15, %eax
; SSE2-NEXT: testb $2, %al
; SSE2-NEXT: je .LBB4_4
; SSE2-NEXT: .LBB4_3: # %cond.store1
-; SSE2-NEXT: pextrw $2, %xmm0, %ecx
+; SSE2-NEXT: pextrw $1, %xmm0, %ecx
; SSE2-NEXT: movw %cx, 2(%rdi)
; SSE2-NEXT: testb $4, %al
; SSE2-NEXT: je .LBB4_6
; SSE2-NEXT: .LBB4_5: # %cond.store3
-; SSE2-NEXT: pextrw $4, %xmm0, %ecx
+; SSE2-NEXT: pextrw $2, %xmm0, %ecx
; SSE2-NEXT: movw %cx, 4(%rdi)
; SSE2-NEXT: testb $8, %al
; SSE2-NEXT: je .LBB4_8
; SSE2-NEXT: .LBB4_7: # %cond.store5
-; SSE2-NEXT: pextrw $6, %xmm0, %eax
+; SSE2-NEXT: pextrw $3, %xmm0, %eax
; SSE2-NEXT: movw %ax, 6(%rdi)
; SSE2-NEXT: retq
;
; SSE4-LABEL: truncstore_v4i64_v4i16:
; SSE4: # %bb.0:
; SSE4-NEXT: pxor %xmm3, %xmm3
-; SSE4-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
+; SSE4-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; SSE4-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
+; SSE4-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; SSE4-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
+; SSE4-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE4-NEXT: pcmpeqd %xmm2, %xmm3
; SSE4-NEXT: movmskps %xmm3, %eax
; SSE4-NEXT: xorl $15, %eax
; SSE4-NEXT: testb $2, %al
; SSE4-NEXT: je .LBB4_4
; SSE4-NEXT: .LBB4_3: # %cond.store1
-; SSE4-NEXT: pextrw $2, %xmm0, 2(%rdi)
+; SSE4-NEXT: pextrw $1, %xmm0, 2(%rdi)
; SSE4-NEXT: testb $4, %al
; SSE4-NEXT: je .LBB4_6
; SSE4-NEXT: .LBB4_5: # %cond.store3
-; SSE4-NEXT: pextrw $4, %xmm0, 4(%rdi)
+; SSE4-NEXT: pextrw $2, %xmm0, 4(%rdi)
; SSE4-NEXT: testb $8, %al
; SSE4-NEXT: je .LBB4_8
; SSE4-NEXT: .LBB4_7: # %cond.store5
-; SSE4-NEXT: pextrw $6, %xmm0, 6(%rdi)
+; SSE4-NEXT: pextrw $3, %xmm0, 6(%rdi)
; SSE4-NEXT: retq
;
-; AVX-LABEL: truncstore_v4i64_v4i16:
-; AVX: # %bb.0:
-; AVX-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX-NEXT: vextractf128 $1, %ymm0, %xmm3
-; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm3[0,2]
-; AVX-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm1
-; AVX-NEXT: vmovmskps %xmm1, %eax
-; AVX-NEXT: xorl $15, %eax
-; AVX-NEXT: testb $1, %al
-; AVX-NEXT: jne .LBB4_1
-; AVX-NEXT: # %bb.2: # %else
-; AVX-NEXT: testb $2, %al
-; AVX-NEXT: jne .LBB4_3
-; AVX-NEXT: .LBB4_4: # %else2
-; AVX-NEXT: testb $4, %al
-; AVX-NEXT: jne .LBB4_5
-; AVX-NEXT: .LBB4_6: # %else4
-; AVX-NEXT: testb $8, %al
-; AVX-NEXT: jne .LBB4_7
-; AVX-NEXT: .LBB4_8: # %else6
-; AVX-NEXT: vzeroupper
-; AVX-NEXT: retq
-; AVX-NEXT: .LBB4_1: # %cond.store
-; AVX-NEXT: vpextrw $0, %xmm0, (%rdi)
-; AVX-NEXT: testb $2, %al
-; AVX-NEXT: je .LBB4_4
-; AVX-NEXT: .LBB4_3: # %cond.store1
-; AVX-NEXT: vpextrw $2, %xmm0, 2(%rdi)
-; AVX-NEXT: testb $4, %al
-; AVX-NEXT: je .LBB4_6
-; AVX-NEXT: .LBB4_5: # %cond.store3
-; AVX-NEXT: vpextrw $4, %xmm0, 4(%rdi)
-; AVX-NEXT: testb $8, %al
-; AVX-NEXT: je .LBB4_8
-; AVX-NEXT: .LBB4_7: # %cond.store5
-; AVX-NEXT: vpextrw $6, %xmm0, 6(%rdi)
-; AVX-NEXT: vzeroupper
-; AVX-NEXT: retq
+; AVX1-LABEL: truncstore_v4i64_v4i16:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
+; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,2,2,3,4,5,6,7]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
+; AVX1-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
+; AVX1-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vmovmskps %xmm1, %eax
+; AVX1-NEXT: xorl $15, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: jne .LBB4_1
+; AVX1-NEXT: # %bb.2: # %else
+; AVX1-NEXT: testb $2, %al
+; AVX1-NEXT: jne .LBB4_3
+; AVX1-NEXT: .LBB4_4: # %else2
+; AVX1-NEXT: testb $4, %al
+; AVX1-NEXT: jne .LBB4_5
+; AVX1-NEXT: .LBB4_6: # %else4
+; AVX1-NEXT: testb $8, %al
+; AVX1-NEXT: jne .LBB4_7
+; AVX1-NEXT: .LBB4_8: # %else6
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+; AVX1-NEXT: .LBB4_1: # %cond.store
+; AVX1-NEXT: vpextrw $0, %xmm0, (%rdi)
+; AVX1-NEXT: testb $2, %al
+; AVX1-NEXT: je .LBB4_4
+; AVX1-NEXT: .LBB4_3: # %cond.store1
+; AVX1-NEXT: vpextrw $1, %xmm0, 2(%rdi)
+; AVX1-NEXT: testb $4, %al
+; AVX1-NEXT: je .LBB4_6
+; AVX1-NEXT: .LBB4_5: # %cond.store3
+; AVX1-NEXT: vpextrw $2, %xmm0, 4(%rdi)
+; AVX1-NEXT: testb $8, %al
+; AVX1-NEXT: je .LBB4_8
+; AVX1-NEXT: .LBB4_7: # %cond.store5
+; AVX1-NEXT: vpextrw $3, %xmm0, 6(%rdi)
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: truncstore_v4i64_v4i16:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm3
+; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
+; AVX2-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,2,2,3,4,5,6,7]
+; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; AVX2-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
+; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
+; AVX2-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm1
+; AVX2-NEXT: vmovmskps %xmm1, %eax
+; AVX2-NEXT: xorl $15, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: jne .LBB4_1
+; AVX2-NEXT: # %bb.2: # %else
+; AVX2-NEXT: testb $2, %al
+; AVX2-NEXT: jne .LBB4_3
+; AVX2-NEXT: .LBB4_4: # %else2
+; AVX2-NEXT: testb $4, %al
+; AVX2-NEXT: jne .LBB4_5
+; AVX2-NEXT: .LBB4_6: # %else4
+; AVX2-NEXT: testb $8, %al
+; AVX2-NEXT: jne .LBB4_7
+; AVX2-NEXT: .LBB4_8: # %else6
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+; AVX2-NEXT: .LBB4_1: # %cond.store
+; AVX2-NEXT: vpextrw $0, %xmm0, (%rdi)
+; AVX2-NEXT: testb $2, %al
+; AVX2-NEXT: je .LBB4_4
+; AVX2-NEXT: .LBB4_3: # %cond.store1
+; AVX2-NEXT: vpextrw $1, %xmm0, 2(%rdi)
+; AVX2-NEXT: testb $4, %al
+; AVX2-NEXT: je .LBB4_6
+; AVX2-NEXT: .LBB4_5: # %cond.store3
+; AVX2-NEXT: vpextrw $2, %xmm0, 4(%rdi)
+; AVX2-NEXT: testb $8, %al
+; AVX2-NEXT: je .LBB4_8
+; AVX2-NEXT: .LBB4_7: # %cond.store5
+; AVX2-NEXT: vpextrw $3, %xmm0, 6(%rdi)
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
;
; AVX512F-LABEL: truncstore_v4i64_v4i16:
; AVX512F: # %bb.0:
; AVX512F-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k0
-; AVX512F-NEXT: vpmovqd %zmm0, %ymm0
+; AVX512F-NEXT: vpmovqw %zmm0, %xmm0
; AVX512F-NEXT: kmovw %k0, %eax
; AVX512F-NEXT: testb $1, %al
; AVX512F-NEXT: jne .LBB4_1
; AVX512F-NEXT: testb $2, %al
; AVX512F-NEXT: je .LBB4_4
; AVX512F-NEXT: .LBB4_3: # %cond.store1
-; AVX512F-NEXT: vpextrw $2, %xmm0, 2(%rdi)
+; AVX512F-NEXT: vpextrw $1, %xmm0, 2(%rdi)
; AVX512F-NEXT: testb $4, %al
; AVX512F-NEXT: je .LBB4_6
; AVX512F-NEXT: .LBB4_5: # %cond.store3
-; AVX512F-NEXT: vpextrw $4, %xmm0, 4(%rdi)
+; AVX512F-NEXT: vpextrw $2, %xmm0, 4(%rdi)
; AVX512F-NEXT: testb $8, %al
; AVX512F-NEXT: je .LBB4_8
; AVX512F-NEXT: .LBB4_7: # %cond.store5
-; AVX512F-NEXT: vpextrw $6, %xmm0, 6(%rdi)
+; AVX512F-NEXT: vpextrw $3, %xmm0, 6(%rdi)
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512BW-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512BW-NEXT: vptestmd %zmm1, %zmm1, %k0
-; AVX512BW-NEXT: vpmovqd %zmm0, %ymm0
-; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
; AVX512BW-NEXT: kshiftld $28, %k0, %k0
; AVX512BW-NEXT: kshiftrd $28, %k0, %k1
+; AVX512BW-NEXT: vpmovqw %zmm0, %xmm0
; AVX512BW-NEXT: vmovdqu16 %zmm0, (%rdi) {%k1}
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
; SSE2-LABEL: truncstore_v4i64_v4i8:
; SSE2: # %bb.0:
; SSE2-NEXT: pxor %xmm3, %xmm3
-; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
+; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
+; SSE2-NEXT: pand %xmm4, %xmm1
+; SSE2-NEXT: pand %xmm4, %xmm0
+; SSE2-NEXT: packuswb %xmm1, %xmm0
+; SSE2-NEXT: packuswb %xmm0, %xmm0
+; SSE2-NEXT: packuswb %xmm0, %xmm0
; SSE2-NEXT: pcmpeqd %xmm2, %xmm3
-; SSE2-NEXT: movmskps %xmm3, %eax
-; SSE2-NEXT: xorl $15, %eax
-; SSE2-NEXT: testb $1, %al
+; SSE2-NEXT: movmskps %xmm3, %ecx
+; SSE2-NEXT: xorl $15, %ecx
+; SSE2-NEXT: testb $1, %cl
+; SSE2-NEXT: movd %xmm0, %eax
; SSE2-NEXT: jne .LBB5_1
; SSE2-NEXT: # %bb.2: # %else
-; SSE2-NEXT: testb $2, %al
+; SSE2-NEXT: testb $2, %cl
; SSE2-NEXT: jne .LBB5_3
; SSE2-NEXT: .LBB5_4: # %else2
-; SSE2-NEXT: testb $4, %al
+; SSE2-NEXT: testb $4, %cl
; SSE2-NEXT: jne .LBB5_5
; SSE2-NEXT: .LBB5_6: # %else4
-; SSE2-NEXT: testb $8, %al
+; SSE2-NEXT: testb $8, %cl
; SSE2-NEXT: jne .LBB5_7
; SSE2-NEXT: .LBB5_8: # %else6
; SSE2-NEXT: retq
; SSE2-NEXT: .LBB5_1: # %cond.store
-; SSE2-NEXT: movd %xmm0, %ecx
-; SSE2-NEXT: movb %cl, (%rdi)
-; SSE2-NEXT: testb $2, %al
+; SSE2-NEXT: movb %al, (%rdi)
+; SSE2-NEXT: testb $2, %cl
; SSE2-NEXT: je .LBB5_4
; SSE2-NEXT: .LBB5_3: # %cond.store1
-; SSE2-NEXT: pextrw $2, %xmm0, %ecx
-; SSE2-NEXT: movb %cl, 1(%rdi)
-; SSE2-NEXT: testb $4, %al
+; SSE2-NEXT: movb %ah, 1(%rdi)
+; SSE2-NEXT: testb $4, %cl
; SSE2-NEXT: je .LBB5_6
; SSE2-NEXT: .LBB5_5: # %cond.store3
-; SSE2-NEXT: pextrw $4, %xmm0, %ecx
-; SSE2-NEXT: movb %cl, 2(%rdi)
-; SSE2-NEXT: testb $8, %al
+; SSE2-NEXT: movl %eax, %edx
+; SSE2-NEXT: shrl $16, %edx
+; SSE2-NEXT: movb %dl, 2(%rdi)
+; SSE2-NEXT: testb $8, %cl
; SSE2-NEXT: je .LBB5_8
; SSE2-NEXT: .LBB5_7: # %cond.store5
-; SSE2-NEXT: pextrw $6, %xmm0, %eax
+; SSE2-NEXT: shrl $24, %eax
; SSE2-NEXT: movb %al, 3(%rdi)
; SSE2-NEXT: retq
;
; SSE4-LABEL: truncstore_v4i64_v4i8:
; SSE4: # %bb.0:
; SSE4-NEXT: pxor %xmm3, %xmm3
-; SSE4-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
+; SSE4-NEXT: movdqa {{.*#+}} xmm4 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; SSE4-NEXT: pshufb %xmm4, %xmm1
+; SSE4-NEXT: pshufb %xmm4, %xmm0
+; SSE4-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; SSE4-NEXT: pcmpeqd %xmm2, %xmm3
; SSE4-NEXT: movmskps %xmm3, %eax
; SSE4-NEXT: xorl $15, %eax
; SSE4-NEXT: testb $2, %al
; SSE4-NEXT: je .LBB5_4
; SSE4-NEXT: .LBB5_3: # %cond.store1
-; SSE4-NEXT: pextrb $4, %xmm0, 1(%rdi)
+; SSE4-NEXT: pextrb $1, %xmm0, 1(%rdi)
; SSE4-NEXT: testb $4, %al
; SSE4-NEXT: je .LBB5_6
; SSE4-NEXT: .LBB5_5: # %cond.store3
-; SSE4-NEXT: pextrb $8, %xmm0, 2(%rdi)
+; SSE4-NEXT: pextrb $2, %xmm0, 2(%rdi)
; SSE4-NEXT: testb $8, %al
; SSE4-NEXT: je .LBB5_8
; SSE4-NEXT: .LBB5_7: # %cond.store5
-; SSE4-NEXT: pextrb $12, %xmm0, 3(%rdi)
+; SSE4-NEXT: pextrb $3, %xmm0, 3(%rdi)
; SSE4-NEXT: retq
;
-; AVX-LABEL: truncstore_v4i64_v4i8:
-; AVX: # %bb.0:
-; AVX-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX-NEXT: vextractf128 $1, %ymm0, %xmm3
-; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm3[0,2]
-; AVX-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm1
-; AVX-NEXT: vmovmskps %xmm1, %eax
-; AVX-NEXT: xorl $15, %eax
-; AVX-NEXT: testb $1, %al
-; AVX-NEXT: jne .LBB5_1
-; AVX-NEXT: # %bb.2: # %else
-; AVX-NEXT: testb $2, %al
-; AVX-NEXT: jne .LBB5_3
-; AVX-NEXT: .LBB5_4: # %else2
-; AVX-NEXT: testb $4, %al
-; AVX-NEXT: jne .LBB5_5
-; AVX-NEXT: .LBB5_6: # %else4
-; AVX-NEXT: testb $8, %al
-; AVX-NEXT: jne .LBB5_7
-; AVX-NEXT: .LBB5_8: # %else6
-; AVX-NEXT: vzeroupper
-; AVX-NEXT: retq
-; AVX-NEXT: .LBB5_1: # %cond.store
-; AVX-NEXT: vpextrb $0, %xmm0, (%rdi)
-; AVX-NEXT: testb $2, %al
-; AVX-NEXT: je .LBB5_4
-; AVX-NEXT: .LBB5_3: # %cond.store1
-; AVX-NEXT: vpextrb $4, %xmm0, 1(%rdi)
-; AVX-NEXT: testb $4, %al
-; AVX-NEXT: je .LBB5_6
-; AVX-NEXT: .LBB5_5: # %cond.store3
-; AVX-NEXT: vpextrb $8, %xmm0, 2(%rdi)
-; AVX-NEXT: testb $8, %al
-; AVX-NEXT: je .LBB5_8
-; AVX-NEXT: .LBB5_7: # %cond.store5
-; AVX-NEXT: vpextrb $12, %xmm0, 3(%rdi)
-; AVX-NEXT: vzeroupper
-; AVX-NEXT: retq
+; AVX1-LABEL: truncstore_v4i64_v4i8:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX1-NEXT: vpshufb %xmm4, %xmm3, %xmm3
+; AVX1-NEXT: vpshufb %xmm4, %xmm0, %xmm0
+; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3]
+; AVX1-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vmovmskps %xmm1, %eax
+; AVX1-NEXT: xorl $15, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: jne .LBB5_1
+; AVX1-NEXT: # %bb.2: # %else
+; AVX1-NEXT: testb $2, %al
+; AVX1-NEXT: jne .LBB5_3
+; AVX1-NEXT: .LBB5_4: # %else2
+; AVX1-NEXT: testb $4, %al
+; AVX1-NEXT: jne .LBB5_5
+; AVX1-NEXT: .LBB5_6: # %else4
+; AVX1-NEXT: testb $8, %al
+; AVX1-NEXT: jne .LBB5_7
+; AVX1-NEXT: .LBB5_8: # %else6
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+; AVX1-NEXT: .LBB5_1: # %cond.store
+; AVX1-NEXT: vpextrb $0, %xmm0, (%rdi)
+; AVX1-NEXT: testb $2, %al
+; AVX1-NEXT: je .LBB5_4
+; AVX1-NEXT: .LBB5_3: # %cond.store1
+; AVX1-NEXT: vpextrb $1, %xmm0, 1(%rdi)
+; AVX1-NEXT: testb $4, %al
+; AVX1-NEXT: je .LBB5_6
+; AVX1-NEXT: .LBB5_5: # %cond.store3
+; AVX1-NEXT: vpextrb $2, %xmm0, 2(%rdi)
+; AVX1-NEXT: testb $8, %al
+; AVX1-NEXT: je .LBB5_8
+; AVX1-NEXT: .LBB5_7: # %cond.store5
+; AVX1-NEXT: vpextrb $3, %xmm0, 3(%rdi)
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: truncstore_v4i64_v4i8:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm3
+; AVX2-NEXT: vmovdqa {{.*#+}} xmm4 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-NEXT: vpshufb %xmm4, %xmm3, %xmm3
+; AVX2-NEXT: vpshufb %xmm4, %xmm0, %xmm0
+; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3]
+; AVX2-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm1
+; AVX2-NEXT: vmovmskps %xmm1, %eax
+; AVX2-NEXT: xorl $15, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: jne .LBB5_1
+; AVX2-NEXT: # %bb.2: # %else
+; AVX2-NEXT: testb $2, %al
+; AVX2-NEXT: jne .LBB5_3
+; AVX2-NEXT: .LBB5_4: # %else2
+; AVX2-NEXT: testb $4, %al
+; AVX2-NEXT: jne .LBB5_5
+; AVX2-NEXT: .LBB5_6: # %else4
+; AVX2-NEXT: testb $8, %al
+; AVX2-NEXT: jne .LBB5_7
+; AVX2-NEXT: .LBB5_8: # %else6
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+; AVX2-NEXT: .LBB5_1: # %cond.store
+; AVX2-NEXT: vpextrb $0, %xmm0, (%rdi)
+; AVX2-NEXT: testb $2, %al
+; AVX2-NEXT: je .LBB5_4
+; AVX2-NEXT: .LBB5_3: # %cond.store1
+; AVX2-NEXT: vpextrb $1, %xmm0, 1(%rdi)
+; AVX2-NEXT: testb $4, %al
+; AVX2-NEXT: je .LBB5_6
+; AVX2-NEXT: .LBB5_5: # %cond.store3
+; AVX2-NEXT: vpextrb $2, %xmm0, 2(%rdi)
+; AVX2-NEXT: testb $8, %al
+; AVX2-NEXT: je .LBB5_8
+; AVX2-NEXT: .LBB5_7: # %cond.store5
+; AVX2-NEXT: vpextrb $3, %xmm0, 3(%rdi)
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
;
; AVX512F-LABEL: truncstore_v4i64_v4i8:
; AVX512F: # %bb.0:
; AVX512F-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k0
-; AVX512F-NEXT: vpmovqd %zmm0, %ymm0
+; AVX512F-NEXT: vpmovqb %zmm0, %xmm0
; AVX512F-NEXT: kmovw %k0, %eax
; AVX512F-NEXT: testb $1, %al
; AVX512F-NEXT: jne .LBB5_1
; AVX512F-NEXT: testb $2, %al
; AVX512F-NEXT: je .LBB5_4
; AVX512F-NEXT: .LBB5_3: # %cond.store1
-; AVX512F-NEXT: vpextrb $4, %xmm0, 1(%rdi)
+; AVX512F-NEXT: vpextrb $1, %xmm0, 1(%rdi)
; AVX512F-NEXT: testb $4, %al
; AVX512F-NEXT: je .LBB5_6
; AVX512F-NEXT: .LBB5_5: # %cond.store3
-; AVX512F-NEXT: vpextrb $8, %xmm0, 2(%rdi)
+; AVX512F-NEXT: vpextrb $2, %xmm0, 2(%rdi)
; AVX512F-NEXT: testb $8, %al
; AVX512F-NEXT: je .LBB5_8
; AVX512F-NEXT: .LBB5_7: # %cond.store5
-; AVX512F-NEXT: vpextrb $12, %xmm0, 3(%rdi)
+; AVX512F-NEXT: vpextrb $3, %xmm0, 3(%rdi)
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512BW-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512BW-NEXT: vptestmd %zmm1, %zmm1, %k0
-; AVX512BW-NEXT: vpmovqd %zmm0, %ymm0
-; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512BW-NEXT: kshiftlq $60, %k0, %k0
; AVX512BW-NEXT: kshiftrq $60, %k0, %k1
+; AVX512BW-NEXT: vpmovqb %zmm0, %xmm0
; AVX512BW-NEXT: vmovdqu8 %zmm0, (%rdi) {%k1}
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
; SSE2-LABEL: truncstore_v2i64_v2i32:
; SSE2: # %bb.0:
; SSE2-NEXT: pxor %xmm2, %xmm2
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE2-NEXT: pcmpeqd %xmm1, %xmm2
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,0,3,2]
; SSE2-NEXT: pand %xmm2, %xmm1
; SSE2-NEXT: testb $2, %al
; SSE2-NEXT: je .LBB6_4
; SSE2-NEXT: .LBB6_3: # %cond.store1
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
; SSE2-NEXT: movd %xmm0, 4(%rdi)
; SSE2-NEXT: retq
;
; SSE4-LABEL: truncstore_v2i64_v2i32:
; SSE4: # %bb.0:
; SSE4-NEXT: pxor %xmm2, %xmm2
+; SSE4-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE4-NEXT: pcmpeqq %xmm1, %xmm2
; SSE4-NEXT: movmskpd %xmm2, %eax
; SSE4-NEXT: xorl $3, %eax
; SSE4-NEXT: .LBB6_4: # %else2
; SSE4-NEXT: retq
; SSE4-NEXT: .LBB6_1: # %cond.store
-; SSE4-NEXT: movss %xmm0, (%rdi)
+; SSE4-NEXT: movd %xmm0, (%rdi)
; SSE4-NEXT: testb $2, %al
; SSE4-NEXT: je .LBB6_4
; SSE4-NEXT: .LBB6_3: # %cond.store1
-; SSE4-NEXT: extractps $2, %xmm0, 4(%rdi)
+; SSE4-NEXT: pextrd $1, %xmm0, 4(%rdi)
; SSE4-NEXT: retq
;
; AVX1-LABEL: truncstore_v2i64_v2i32:
; AVX512F: # %bb.0:
; AVX512F-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
; AVX512F-NEXT: vptestmq %zmm1, %zmm1, %k0
-; AVX512F-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; AVX512F-NEXT: kshiftlw $14, %k0, %k0
; AVX512F-NEXT: kshiftrw $14, %k0, %k1
+; AVX512F-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; AVX512F-NEXT: vmovdqu32 %zmm0, (%rdi) {%k1}
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
; AVX512BW-NEXT: vptestmq %zmm1, %zmm1, %k0
-; AVX512BW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; AVX512BW-NEXT: kshiftlw $14, %k0, %k0
; AVX512BW-NEXT: kshiftrw $14, %k0, %k1
+; AVX512BW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; AVX512BW-NEXT: vmovdqu32 %zmm0, (%rdi) {%k1}
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
; SSE2-LABEL: truncstore_v2i64_v2i16:
; SSE2: # %bb.0:
; SSE2-NEXT: pxor %xmm2, %xmm2
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
; SSE2-NEXT: pcmpeqd %xmm1, %xmm2
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,0,3,2]
; SSE2-NEXT: pand %xmm2, %xmm1
; SSE2-NEXT: testb $2, %al
; SSE2-NEXT: je .LBB7_4
; SSE2-NEXT: .LBB7_3: # %cond.store1
-; SSE2-NEXT: pextrw $4, %xmm0, %eax
+; SSE2-NEXT: pextrw $1, %xmm0, %eax
; SSE2-NEXT: movw %ax, 2(%rdi)
; SSE2-NEXT: retq
;
; SSE4-LABEL: truncstore_v2i64_v2i16:
; SSE4: # %bb.0:
; SSE4-NEXT: pxor %xmm2, %xmm2
+; SSE4-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; SSE4-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
; SSE4-NEXT: pcmpeqq %xmm1, %xmm2
; SSE4-NEXT: movmskpd %xmm2, %eax
; SSE4-NEXT: xorl $3, %eax
; SSE4-NEXT: testb $2, %al
; SSE4-NEXT: je .LBB7_4
; SSE4-NEXT: .LBB7_3: # %cond.store1
-; SSE4-NEXT: pextrw $4, %xmm0, 2(%rdi)
+; SSE4-NEXT: pextrw $1, %xmm0, 2(%rdi)
; SSE4-NEXT: retq
;
; AVX-LABEL: truncstore_v2i64_v2i16:
; AVX: # %bb.0:
; AVX-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
; AVX-NEXT: vpcmpeqq %xmm2, %xmm1, %xmm1
; AVX-NEXT: vmovmskpd %xmm1, %eax
; AVX-NEXT: xorl $3, %eax
; AVX-NEXT: testb $2, %al
; AVX-NEXT: je .LBB7_4
; AVX-NEXT: .LBB7_3: # %cond.store1
-; AVX-NEXT: vpextrw $4, %xmm0, 2(%rdi)
+; AVX-NEXT: vpextrw $1, %xmm0, 2(%rdi)
; AVX-NEXT: retq
;
; AVX512F-LABEL: truncstore_v2i64_v2i16:
; AVX512F: # %bb.0:
; AVX512F-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
; AVX512F-NEXT: vptestmq %zmm1, %zmm1, %k0
+; AVX512F-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; AVX512F-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
; AVX512F-NEXT: kmovw %k0, %eax
; AVX512F-NEXT: testb $1, %al
; AVX512F-NEXT: jne .LBB7_1
; AVX512F-NEXT: testb $2, %al
; AVX512F-NEXT: je .LBB7_4
; AVX512F-NEXT: .LBB7_3: # %cond.store1
-; AVX512F-NEXT: vpextrw $4, %xmm0, 2(%rdi)
+; AVX512F-NEXT: vpextrw $1, %xmm0, 2(%rdi)
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
; AVX512BW-NEXT: vptestmq %zmm1, %zmm1, %k0
-; AVX512BW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; AVX512BW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
; AVX512BW-NEXT: kshiftld $30, %k0, %k0
; AVX512BW-NEXT: kshiftrd $30, %k0, %k1
+; AVX512BW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; AVX512BW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
; AVX512BW-NEXT: vmovdqu16 %zmm0, (%rdi) {%k1}
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
; SSE2-LABEL: truncstore_v2i64_v2i8:
; SSE2: # %bb.0:
; SSE2-NEXT: pxor %xmm2, %xmm2
+; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
+; SSE2-NEXT: packuswb %xmm0, %xmm0
+; SSE2-NEXT: packuswb %xmm0, %xmm0
+; SSE2-NEXT: packuswb %xmm0, %xmm0
; SSE2-NEXT: pcmpeqd %xmm1, %xmm2
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,0,3,2]
; SSE2-NEXT: pand %xmm2, %xmm1
; SSE2-NEXT: movmskpd %xmm1, %eax
; SSE2-NEXT: xorl $3, %eax
; SSE2-NEXT: testb $1, %al
+; SSE2-NEXT: movd %xmm0, %ecx
; SSE2-NEXT: jne .LBB8_1
; SSE2-NEXT: # %bb.2: # %else
; SSE2-NEXT: testb $2, %al
; SSE2-NEXT: .LBB8_4: # %else2
; SSE2-NEXT: retq
; SSE2-NEXT: .LBB8_1: # %cond.store
-; SSE2-NEXT: movd %xmm0, %ecx
; SSE2-NEXT: movb %cl, (%rdi)
; SSE2-NEXT: testb $2, %al
; SSE2-NEXT: je .LBB8_4
; SSE2-NEXT: .LBB8_3: # %cond.store1
-; SSE2-NEXT: pextrw $4, %xmm0, %eax
-; SSE2-NEXT: movb %al, 1(%rdi)
+; SSE2-NEXT: movb %ch, 1(%rdi)
; SSE2-NEXT: retq
;
; SSE4-LABEL: truncstore_v2i64_v2i8:
; SSE4: # %bb.0:
; SSE4-NEXT: pxor %xmm2, %xmm2
+; SSE4-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
; SSE4-NEXT: pcmpeqq %xmm1, %xmm2
; SSE4-NEXT: movmskpd %xmm2, %eax
; SSE4-NEXT: xorl $3, %eax
; SSE4-NEXT: testb $2, %al
; SSE4-NEXT: je .LBB8_4
; SSE4-NEXT: .LBB8_3: # %cond.store1
-; SSE4-NEXT: pextrb $8, %xmm0, 1(%rdi)
+; SSE4-NEXT: pextrb $1, %xmm0, 1(%rdi)
; SSE4-NEXT: retq
;
; AVX-LABEL: truncstore_v2i64_v2i8:
; AVX: # %bb.0:
; AVX-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX-NEXT: vpcmpeqq %xmm2, %xmm1, %xmm1
; AVX-NEXT: vmovmskpd %xmm1, %eax
; AVX-NEXT: xorl $3, %eax
; AVX-NEXT: testb $2, %al
; AVX-NEXT: je .LBB8_4
; AVX-NEXT: .LBB8_3: # %cond.store1
-; AVX-NEXT: vpextrb $8, %xmm0, 1(%rdi)
+; AVX-NEXT: vpextrb $1, %xmm0, 1(%rdi)
; AVX-NEXT: retq
;
; AVX512F-LABEL: truncstore_v2i64_v2i8:
; AVX512F: # %bb.0:
; AVX512F-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
; AVX512F-NEXT: vptestmq %zmm1, %zmm1, %k0
+; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512F-NEXT: kmovw %k0, %eax
; AVX512F-NEXT: testb $1, %al
; AVX512F-NEXT: jne .LBB8_1
; AVX512F-NEXT: testb $2, %al
; AVX512F-NEXT: je .LBB8_4
; AVX512F-NEXT: .LBB8_3: # %cond.store1
-; AVX512F-NEXT: vpextrb $8, %xmm0, 1(%rdi)
+; AVX512F-NEXT: vpextrb $1, %xmm0, 1(%rdi)
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
; AVX512BW-NEXT: vptestmq %zmm1, %zmm1, %k0
-; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512BW-NEXT: kshiftlq $62, %k0, %k0
; AVX512BW-NEXT: kshiftrq $62, %k0, %k1
+; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512BW-NEXT: vmovdqu8 %zmm0, (%rdi) {%k1}
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
; SSE2-LABEL: truncstore_v8i32_v8i8:
; SSE2: # %bb.0:
; SSE2-NEXT: pxor %xmm4, %xmm4
-; SSE2-NEXT: pslld $16, %xmm1
-; SSE2-NEXT: psrad $16, %xmm1
-; SSE2-NEXT: pslld $16, %xmm0
-; SSE2-NEXT: psrad $16, %xmm0
-; SSE2-NEXT: packssdw %xmm1, %xmm0
+; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
+; SSE2-NEXT: pand %xmm5, %xmm1
+; SSE2-NEXT: pand %xmm5, %xmm0
+; SSE2-NEXT: packuswb %xmm1, %xmm0
+; SSE2-NEXT: packuswb %xmm0, %xmm0
; SSE2-NEXT: pcmpeqd %xmm4, %xmm3
; SSE2-NEXT: pcmpeqd %xmm1, %xmm1
; SSE2-NEXT: pxor %xmm1, %xmm3
; SSE2-NEXT: jne .LBB12_5
; SSE2-NEXT: .LBB12_6: # %else4
; SSE2-NEXT: testb $8, %al
-; SSE2-NEXT: jne .LBB12_7
+; SSE2-NEXT: je .LBB12_8
+; SSE2-NEXT: .LBB12_7: # %cond.store5
+; SSE2-NEXT: shrl $24, %ecx
+; SSE2-NEXT: movb %cl, 3(%rdi)
; SSE2-NEXT: .LBB12_8: # %else6
; SSE2-NEXT: testb $16, %al
-; SSE2-NEXT: jne .LBB12_9
+; SSE2-NEXT: pextrw $2, %xmm0, %ecx
+; SSE2-NEXT: je .LBB12_10
+; SSE2-NEXT: # %bb.9: # %cond.store7
+; SSE2-NEXT: movb %cl, 4(%rdi)
; SSE2-NEXT: .LBB12_10: # %else8
; SSE2-NEXT: testb $32, %al
-; SSE2-NEXT: jne .LBB12_11
+; SSE2-NEXT: je .LBB12_12
+; SSE2-NEXT: # %bb.11: # %cond.store9
+; SSE2-NEXT: movb %ch, 5(%rdi)
; SSE2-NEXT: .LBB12_12: # %else10
; SSE2-NEXT: testb $64, %al
+; SSE2-NEXT: pextrw $3, %xmm0, %ecx
; SSE2-NEXT: jne .LBB12_13
-; SSE2-NEXT: .LBB12_14: # %else12
+; SSE2-NEXT: # %bb.14: # %else12
; SSE2-NEXT: testb $-128, %al
; SSE2-NEXT: jne .LBB12_15
; SSE2-NEXT: .LBB12_16: # %else14
; SSE2-NEXT: testb $2, %al
; SSE2-NEXT: je .LBB12_4
; SSE2-NEXT: .LBB12_3: # %cond.store1
-; SSE2-NEXT: shrl $16, %ecx
-; SSE2-NEXT: movb %cl, 1(%rdi)
+; SSE2-NEXT: movb %ch, 1(%rdi)
; SSE2-NEXT: testb $4, %al
; SSE2-NEXT: je .LBB12_6
; SSE2-NEXT: .LBB12_5: # %cond.store3
-; SSE2-NEXT: pextrw $2, %xmm0, %ecx
-; SSE2-NEXT: movb %cl, 2(%rdi)
+; SSE2-NEXT: movl %ecx, %edx
+; SSE2-NEXT: shrl $16, %edx
+; SSE2-NEXT: movb %dl, 2(%rdi)
; SSE2-NEXT: testb $8, %al
-; SSE2-NEXT: je .LBB12_8
-; SSE2-NEXT: .LBB12_7: # %cond.store5
-; SSE2-NEXT: pextrw $3, %xmm0, %ecx
-; SSE2-NEXT: movb %cl, 3(%rdi)
-; SSE2-NEXT: testb $16, %al
-; SSE2-NEXT: je .LBB12_10
-; SSE2-NEXT: .LBB12_9: # %cond.store7
-; SSE2-NEXT: pextrw $4, %xmm0, %ecx
-; SSE2-NEXT: movb %cl, 4(%rdi)
-; SSE2-NEXT: testb $32, %al
-; SSE2-NEXT: je .LBB12_12
-; SSE2-NEXT: .LBB12_11: # %cond.store9
-; SSE2-NEXT: pextrw $5, %xmm0, %ecx
-; SSE2-NEXT: movb %cl, 5(%rdi)
-; SSE2-NEXT: testb $64, %al
-; SSE2-NEXT: je .LBB12_14
+; SSE2-NEXT: jne .LBB12_7
+; SSE2-NEXT: jmp .LBB12_8
; SSE2-NEXT: .LBB12_13: # %cond.store11
-; SSE2-NEXT: pextrw $6, %xmm0, %ecx
; SSE2-NEXT: movb %cl, 6(%rdi)
; SSE2-NEXT: testb $-128, %al
; SSE2-NEXT: je .LBB12_16
; SSE2-NEXT: .LBB12_15: # %cond.store13
-; SSE2-NEXT: pextrw $7, %xmm0, %eax
-; SSE2-NEXT: movb %al, 7(%rdi)
+; SSE2-NEXT: movb %ch, 7(%rdi)
; SSE2-NEXT: retq
;
; SSE4-LABEL: truncstore_v8i32_v8i8:
; SSE4: # %bb.0:
; SSE4-NEXT: pxor %xmm4, %xmm4
-; SSE4-NEXT: movdqa {{.*#+}} xmm5 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; SSE4-NEXT: movdqa {{.*#+}} xmm5 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
; SSE4-NEXT: pshufb %xmm5, %xmm1
; SSE4-NEXT: pshufb %xmm5, %xmm0
-; SSE4-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE4-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE4-NEXT: pcmpeqd %xmm4, %xmm3
; SSE4-NEXT: pcmpeqd %xmm1, %xmm1
; SSE4-NEXT: pxor %xmm1, %xmm3
; SSE4-NEXT: testb $2, %al
; SSE4-NEXT: je .LBB12_4
; SSE4-NEXT: .LBB12_3: # %cond.store1
-; SSE4-NEXT: pextrb $2, %xmm0, 1(%rdi)
+; SSE4-NEXT: pextrb $1, %xmm0, 1(%rdi)
; SSE4-NEXT: testb $4, %al
; SSE4-NEXT: je .LBB12_6
; SSE4-NEXT: .LBB12_5: # %cond.store3
-; SSE4-NEXT: pextrb $4, %xmm0, 2(%rdi)
+; SSE4-NEXT: pextrb $2, %xmm0, 2(%rdi)
; SSE4-NEXT: testb $8, %al
; SSE4-NEXT: je .LBB12_8
; SSE4-NEXT: .LBB12_7: # %cond.store5
-; SSE4-NEXT: pextrb $6, %xmm0, 3(%rdi)
+; SSE4-NEXT: pextrb $3, %xmm0, 3(%rdi)
; SSE4-NEXT: testb $16, %al
; SSE4-NEXT: je .LBB12_10
; SSE4-NEXT: .LBB12_9: # %cond.store7
-; SSE4-NEXT: pextrb $8, %xmm0, 4(%rdi)
+; SSE4-NEXT: pextrb $4, %xmm0, 4(%rdi)
; SSE4-NEXT: testb $32, %al
; SSE4-NEXT: je .LBB12_12
; SSE4-NEXT: .LBB12_11: # %cond.store9
-; SSE4-NEXT: pextrb $10, %xmm0, 5(%rdi)
+; SSE4-NEXT: pextrb $5, %xmm0, 5(%rdi)
; SSE4-NEXT: testb $64, %al
; SSE4-NEXT: je .LBB12_14
; SSE4-NEXT: .LBB12_13: # %cond.store11
-; SSE4-NEXT: pextrb $12, %xmm0, 6(%rdi)
+; SSE4-NEXT: pextrb $6, %xmm0, 6(%rdi)
; SSE4-NEXT: testb $-128, %al
; SSE4-NEXT: je .LBB12_16
; SSE4-NEXT: .LBB12_15: # %cond.store13
-; SSE4-NEXT: pextrb $14, %xmm0, 7(%rdi)
+; SSE4-NEXT: pextrb $7, %xmm0, 7(%rdi)
; SSE4-NEXT: retq
;
; AVX1-LABEL: truncstore_v8i32_v8i8:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX1-NEXT: vpshufb %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vpshufb %xmm3, %xmm0, %xmm0
-; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
+; AVX1-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
; AVX1-NEXT: vpcmpeqd %xmm3, %xmm2, %xmm2
; AVX1-NEXT: testb $2, %al
; AVX1-NEXT: je .LBB12_4
; AVX1-NEXT: .LBB12_3: # %cond.store1
-; AVX1-NEXT: vpextrb $2, %xmm0, 1(%rdi)
+; AVX1-NEXT: vpextrb $1, %xmm0, 1(%rdi)
; AVX1-NEXT: testb $4, %al
; AVX1-NEXT: je .LBB12_6
; AVX1-NEXT: .LBB12_5: # %cond.store3
-; AVX1-NEXT: vpextrb $4, %xmm0, 2(%rdi)
+; AVX1-NEXT: vpextrb $2, %xmm0, 2(%rdi)
; AVX1-NEXT: testb $8, %al
; AVX1-NEXT: je .LBB12_8
; AVX1-NEXT: .LBB12_7: # %cond.store5
-; AVX1-NEXT: vpextrb $6, %xmm0, 3(%rdi)
+; AVX1-NEXT: vpextrb $3, %xmm0, 3(%rdi)
; AVX1-NEXT: testb $16, %al
; AVX1-NEXT: je .LBB12_10
; AVX1-NEXT: .LBB12_9: # %cond.store7
-; AVX1-NEXT: vpextrb $8, %xmm0, 4(%rdi)
+; AVX1-NEXT: vpextrb $4, %xmm0, 4(%rdi)
; AVX1-NEXT: testb $32, %al
; AVX1-NEXT: je .LBB12_12
; AVX1-NEXT: .LBB12_11: # %cond.store9
-; AVX1-NEXT: vpextrb $10, %xmm0, 5(%rdi)
+; AVX1-NEXT: vpextrb $5, %xmm0, 5(%rdi)
; AVX1-NEXT: testb $64, %al
; AVX1-NEXT: je .LBB12_14
; AVX1-NEXT: .LBB12_13: # %cond.store11
-; AVX1-NEXT: vpextrb $12, %xmm0, 6(%rdi)
+; AVX1-NEXT: vpextrb $6, %xmm0, 6(%rdi)
; AVX1-NEXT: testb $-128, %al
; AVX1-NEXT: je .LBB12_16
; AVX1-NEXT: .LBB12_15: # %cond.store13
-; AVX1-NEXT: vpextrb $14, %xmm0, 7(%rdi)
+; AVX1-NEXT: vpextrb $7, %xmm0, 7(%rdi)
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: truncstore_v8i32_v8i8:
; AVX2: # %bb.0:
; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
-; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm3
+; AVX2-NEXT: vmovdqa {{.*#+}} xmm4 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-NEXT: vpshufb %xmm4, %xmm3, %xmm3
+; AVX2-NEXT: vpshufb %xmm4, %xmm0, %xmm0
+; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
; AVX2-NEXT: vpcmpeqd %ymm2, %ymm1, %ymm1
; AVX2-NEXT: vmovmskps %ymm1, %eax
; AVX2-NEXT: notl %eax
; AVX2-NEXT: testb $2, %al
; AVX2-NEXT: je .LBB12_4
; AVX2-NEXT: .LBB12_3: # %cond.store1
-; AVX2-NEXT: vpextrb $2, %xmm0, 1(%rdi)
+; AVX2-NEXT: vpextrb $1, %xmm0, 1(%rdi)
; AVX2-NEXT: testb $4, %al
; AVX2-NEXT: je .LBB12_6
; AVX2-NEXT: .LBB12_5: # %cond.store3
-; AVX2-NEXT: vpextrb $4, %xmm0, 2(%rdi)
+; AVX2-NEXT: vpextrb $2, %xmm0, 2(%rdi)
; AVX2-NEXT: testb $8, %al
; AVX2-NEXT: je .LBB12_8
; AVX2-NEXT: .LBB12_7: # %cond.store5
-; AVX2-NEXT: vpextrb $6, %xmm0, 3(%rdi)
+; AVX2-NEXT: vpextrb $3, %xmm0, 3(%rdi)
; AVX2-NEXT: testb $16, %al
; AVX2-NEXT: je .LBB12_10
; AVX2-NEXT: .LBB12_9: # %cond.store7
-; AVX2-NEXT: vpextrb $8, %xmm0, 4(%rdi)
+; AVX2-NEXT: vpextrb $4, %xmm0, 4(%rdi)
; AVX2-NEXT: testb $32, %al
; AVX2-NEXT: je .LBB12_12
; AVX2-NEXT: .LBB12_11: # %cond.store9
-; AVX2-NEXT: vpextrb $10, %xmm0, 5(%rdi)
+; AVX2-NEXT: vpextrb $5, %xmm0, 5(%rdi)
; AVX2-NEXT: testb $64, %al
; AVX2-NEXT: je .LBB12_14
; AVX2-NEXT: .LBB12_13: # %cond.store11
-; AVX2-NEXT: vpextrb $12, %xmm0, 6(%rdi)
+; AVX2-NEXT: vpextrb $6, %xmm0, 6(%rdi)
; AVX2-NEXT: testb $-128, %al
; AVX2-NEXT: je .LBB12_16
; AVX2-NEXT: .LBB12_15: # %cond.store13
-; AVX2-NEXT: vpextrb $14, %xmm0, 7(%rdi)
+; AVX2-NEXT: vpextrb $7, %xmm0, 7(%rdi)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512F-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1
; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k0
-; AVX512F-NEXT: vpmovdw %zmm0, %ymm0
+; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
; AVX512F-NEXT: kmovw %k0, %eax
; AVX512F-NEXT: testb $1, %al
; AVX512F-NEXT: jne .LBB12_1
; AVX512F-NEXT: testb $2, %al
; AVX512F-NEXT: je .LBB12_4
; AVX512F-NEXT: .LBB12_3: # %cond.store1
-; AVX512F-NEXT: vpextrb $2, %xmm0, 1(%rdi)
+; AVX512F-NEXT: vpextrb $1, %xmm0, 1(%rdi)
; AVX512F-NEXT: testb $4, %al
; AVX512F-NEXT: je .LBB12_6
; AVX512F-NEXT: .LBB12_5: # %cond.store3
-; AVX512F-NEXT: vpextrb $4, %xmm0, 2(%rdi)
+; AVX512F-NEXT: vpextrb $2, %xmm0, 2(%rdi)
; AVX512F-NEXT: testb $8, %al
; AVX512F-NEXT: je .LBB12_8
; AVX512F-NEXT: .LBB12_7: # %cond.store5
-; AVX512F-NEXT: vpextrb $6, %xmm0, 3(%rdi)
+; AVX512F-NEXT: vpextrb $3, %xmm0, 3(%rdi)
; AVX512F-NEXT: testb $16, %al
; AVX512F-NEXT: je .LBB12_10
; AVX512F-NEXT: .LBB12_9: # %cond.store7
-; AVX512F-NEXT: vpextrb $8, %xmm0, 4(%rdi)
+; AVX512F-NEXT: vpextrb $4, %xmm0, 4(%rdi)
; AVX512F-NEXT: testb $32, %al
; AVX512F-NEXT: je .LBB12_12
; AVX512F-NEXT: .LBB12_11: # %cond.store9
-; AVX512F-NEXT: vpextrb $10, %xmm0, 5(%rdi)
+; AVX512F-NEXT: vpextrb $5, %xmm0, 5(%rdi)
; AVX512F-NEXT: testb $64, %al
; AVX512F-NEXT: je .LBB12_14
; AVX512F-NEXT: .LBB12_13: # %cond.store11
-; AVX512F-NEXT: vpextrb $12, %xmm0, 6(%rdi)
+; AVX512F-NEXT: vpextrb $6, %xmm0, 6(%rdi)
; AVX512F-NEXT: testb $-128, %al
; AVX512F-NEXT: je .LBB12_16
; AVX512F-NEXT: .LBB12_15: # %cond.store13
-; AVX512F-NEXT: vpextrb $14, %xmm0, 7(%rdi)
+; AVX512F-NEXT: vpextrb $7, %xmm0, 7(%rdi)
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512BW-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1
; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512BW-NEXT: vptestmd %zmm1, %zmm1, %k0
-; AVX512BW-NEXT: vpmovdw %zmm0, %ymm0
-; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
; AVX512BW-NEXT: kshiftlq $56, %k0, %k0
; AVX512BW-NEXT: kshiftrq $56, %k0, %k1
+; AVX512BW-NEXT: vpmovdb %zmm0, %xmm0
; AVX512BW-NEXT: vmovdqu8 %zmm0, (%rdi) {%k1}
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
; SSE2-LABEL: truncstore_v4i32_v4i16:
; SSE2: # %bb.0:
; SSE2-NEXT: pxor %xmm2, %xmm2
+; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
+; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE2-NEXT: pcmpeqd %xmm1, %xmm2
; SSE2-NEXT: movmskps %xmm2, %eax
; SSE2-NEXT: xorl $15, %eax
; SSE2-NEXT: testb $2, %al
; SSE2-NEXT: je .LBB13_4
; SSE2-NEXT: .LBB13_3: # %cond.store1
-; SSE2-NEXT: pextrw $2, %xmm0, %ecx
+; SSE2-NEXT: pextrw $1, %xmm0, %ecx
; SSE2-NEXT: movw %cx, 2(%rdi)
; SSE2-NEXT: testb $4, %al
; SSE2-NEXT: je .LBB13_6
; SSE2-NEXT: .LBB13_5: # %cond.store3
-; SSE2-NEXT: pextrw $4, %xmm0, %ecx
+; SSE2-NEXT: pextrw $2, %xmm0, %ecx
; SSE2-NEXT: movw %cx, 4(%rdi)
; SSE2-NEXT: testb $8, %al
; SSE2-NEXT: je .LBB13_8
; SSE2-NEXT: .LBB13_7: # %cond.store5
-; SSE2-NEXT: pextrw $6, %xmm0, %eax
+; SSE2-NEXT: pextrw $3, %xmm0, %eax
; SSE2-NEXT: movw %ax, 6(%rdi)
; SSE2-NEXT: retq
;
; SSE4-LABEL: truncstore_v4i32_v4i16:
; SSE4: # %bb.0:
; SSE4-NEXT: pxor %xmm2, %xmm2
+; SSE4-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
; SSE4-NEXT: pcmpeqd %xmm1, %xmm2
; SSE4-NEXT: movmskps %xmm2, %eax
; SSE4-NEXT: xorl $15, %eax
; SSE4-NEXT: testb $2, %al
; SSE4-NEXT: je .LBB13_4
; SSE4-NEXT: .LBB13_3: # %cond.store1
-; SSE4-NEXT: pextrw $2, %xmm0, 2(%rdi)
+; SSE4-NEXT: pextrw $1, %xmm0, 2(%rdi)
; SSE4-NEXT: testb $4, %al
; SSE4-NEXT: je .LBB13_6
; SSE4-NEXT: .LBB13_5: # %cond.store3
-; SSE4-NEXT: pextrw $4, %xmm0, 4(%rdi)
+; SSE4-NEXT: pextrw $2, %xmm0, 4(%rdi)
; SSE4-NEXT: testb $8, %al
; SSE4-NEXT: je .LBB13_8
; SSE4-NEXT: .LBB13_7: # %cond.store5
-; SSE4-NEXT: pextrw $6, %xmm0, 6(%rdi)
+; SSE4-NEXT: pextrw $3, %xmm0, 6(%rdi)
; SSE4-NEXT: retq
;
; AVX-LABEL: truncstore_v4i32_v4i16:
; AVX: # %bb.0:
; AVX-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
; AVX-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm1
; AVX-NEXT: vmovmskps %xmm1, %eax
; AVX-NEXT: xorl $15, %eax
; AVX-NEXT: testb $2, %al
; AVX-NEXT: je .LBB13_4
; AVX-NEXT: .LBB13_3: # %cond.store1
-; AVX-NEXT: vpextrw $2, %xmm0, 2(%rdi)
+; AVX-NEXT: vpextrw $1, %xmm0, 2(%rdi)
; AVX-NEXT: testb $4, %al
; AVX-NEXT: je .LBB13_6
; AVX-NEXT: .LBB13_5: # %cond.store3
-; AVX-NEXT: vpextrw $4, %xmm0, 4(%rdi)
+; AVX-NEXT: vpextrw $2, %xmm0, 4(%rdi)
; AVX-NEXT: testb $8, %al
; AVX-NEXT: je .LBB13_8
; AVX-NEXT: .LBB13_7: # %cond.store5
-; AVX-NEXT: vpextrw $6, %xmm0, 6(%rdi)
+; AVX-NEXT: vpextrw $3, %xmm0, 6(%rdi)
; AVX-NEXT: retq
;
; AVX512F-LABEL: truncstore_v4i32_v4i16:
; AVX512F: # %bb.0:
; AVX512F-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k0
+; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
; AVX512F-NEXT: kmovw %k0, %eax
; AVX512F-NEXT: testb $1, %al
; AVX512F-NEXT: jne .LBB13_1
; AVX512F-NEXT: testb $2, %al
; AVX512F-NEXT: je .LBB13_4
; AVX512F-NEXT: .LBB13_3: # %cond.store1
-; AVX512F-NEXT: vpextrw $2, %xmm0, 2(%rdi)
+; AVX512F-NEXT: vpextrw $1, %xmm0, 2(%rdi)
; AVX512F-NEXT: testb $4, %al
; AVX512F-NEXT: je .LBB13_6
; AVX512F-NEXT: .LBB13_5: # %cond.store3
-; AVX512F-NEXT: vpextrw $4, %xmm0, 4(%rdi)
+; AVX512F-NEXT: vpextrw $2, %xmm0, 4(%rdi)
; AVX512F-NEXT: testb $8, %al
; AVX512F-NEXT: je .LBB13_8
; AVX512F-NEXT: .LBB13_7: # %cond.store5
-; AVX512F-NEXT: vpextrw $6, %xmm0, 6(%rdi)
+; AVX512F-NEXT: vpextrw $3, %xmm0, 6(%rdi)
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
; AVX512BW-NEXT: vptestmd %zmm1, %zmm1, %k0
-; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
; AVX512BW-NEXT: kshiftld $28, %k0, %k0
; AVX512BW-NEXT: kshiftrd $28, %k0, %k1
+; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
; AVX512BW-NEXT: vmovdqu16 %zmm0, (%rdi) {%k1}
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
; SSE2-LABEL: truncstore_v4i32_v4i8:
; SSE2: # %bb.0:
; SSE2-NEXT: pxor %xmm2, %xmm2
+; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
+; SSE2-NEXT: packuswb %xmm0, %xmm0
+; SSE2-NEXT: packuswb %xmm0, %xmm0
; SSE2-NEXT: pcmpeqd %xmm1, %xmm2
-; SSE2-NEXT: movmskps %xmm2, %eax
-; SSE2-NEXT: xorl $15, %eax
-; SSE2-NEXT: testb $1, %al
+; SSE2-NEXT: movmskps %xmm2, %ecx
+; SSE2-NEXT: xorl $15, %ecx
+; SSE2-NEXT: testb $1, %cl
+; SSE2-NEXT: movd %xmm0, %eax
; SSE2-NEXT: jne .LBB14_1
; SSE2-NEXT: # %bb.2: # %else
-; SSE2-NEXT: testb $2, %al
+; SSE2-NEXT: testb $2, %cl
; SSE2-NEXT: jne .LBB14_3
; SSE2-NEXT: .LBB14_4: # %else2
-; SSE2-NEXT: testb $4, %al
+; SSE2-NEXT: testb $4, %cl
; SSE2-NEXT: jne .LBB14_5
; SSE2-NEXT: .LBB14_6: # %else4
-; SSE2-NEXT: testb $8, %al
+; SSE2-NEXT: testb $8, %cl
; SSE2-NEXT: jne .LBB14_7
; SSE2-NEXT: .LBB14_8: # %else6
; SSE2-NEXT: retq
; SSE2-NEXT: .LBB14_1: # %cond.store
-; SSE2-NEXT: movd %xmm0, %ecx
-; SSE2-NEXT: movb %cl, (%rdi)
-; SSE2-NEXT: testb $2, %al
+; SSE2-NEXT: movb %al, (%rdi)
+; SSE2-NEXT: testb $2, %cl
; SSE2-NEXT: je .LBB14_4
; SSE2-NEXT: .LBB14_3: # %cond.store1
-; SSE2-NEXT: pextrw $2, %xmm0, %ecx
-; SSE2-NEXT: movb %cl, 1(%rdi)
-; SSE2-NEXT: testb $4, %al
+; SSE2-NEXT: movb %ah, 1(%rdi)
+; SSE2-NEXT: testb $4, %cl
; SSE2-NEXT: je .LBB14_6
; SSE2-NEXT: .LBB14_5: # %cond.store3
-; SSE2-NEXT: pextrw $4, %xmm0, %ecx
-; SSE2-NEXT: movb %cl, 2(%rdi)
-; SSE2-NEXT: testb $8, %al
+; SSE2-NEXT: movl %eax, %edx
+; SSE2-NEXT: shrl $16, %edx
+; SSE2-NEXT: movb %dl, 2(%rdi)
+; SSE2-NEXT: testb $8, %cl
; SSE2-NEXT: je .LBB14_8
; SSE2-NEXT: .LBB14_7: # %cond.store5
-; SSE2-NEXT: pextrw $6, %xmm0, %eax
+; SSE2-NEXT: shrl $24, %eax
; SSE2-NEXT: movb %al, 3(%rdi)
; SSE2-NEXT: retq
;
; SSE4-LABEL: truncstore_v4i32_v4i8:
; SSE4: # %bb.0:
; SSE4-NEXT: pxor %xmm2, %xmm2
+; SSE4-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u]
; SSE4-NEXT: pcmpeqd %xmm1, %xmm2
; SSE4-NEXT: movmskps %xmm2, %eax
; SSE4-NEXT: xorl $15, %eax
; SSE4-NEXT: testb $2, %al
; SSE4-NEXT: je .LBB14_4
; SSE4-NEXT: .LBB14_3: # %cond.store1
-; SSE4-NEXT: pextrb $4, %xmm0, 1(%rdi)
+; SSE4-NEXT: pextrb $1, %xmm0, 1(%rdi)
; SSE4-NEXT: testb $4, %al
; SSE4-NEXT: je .LBB14_6
; SSE4-NEXT: .LBB14_5: # %cond.store3
-; SSE4-NEXT: pextrb $8, %xmm0, 2(%rdi)
+; SSE4-NEXT: pextrb $2, %xmm0, 2(%rdi)
; SSE4-NEXT: testb $8, %al
; SSE4-NEXT: je .LBB14_8
; SSE4-NEXT: .LBB14_7: # %cond.store5
-; SSE4-NEXT: pextrb $12, %xmm0, 3(%rdi)
+; SSE4-NEXT: pextrb $3, %xmm0, 3(%rdi)
; SSE4-NEXT: retq
;
; AVX-LABEL: truncstore_v4i32_v4i8:
; AVX: # %bb.0:
; AVX-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm1
; AVX-NEXT: vmovmskps %xmm1, %eax
; AVX-NEXT: xorl $15, %eax
; AVX-NEXT: testb $2, %al
; AVX-NEXT: je .LBB14_4
; AVX-NEXT: .LBB14_3: # %cond.store1
-; AVX-NEXT: vpextrb $4, %xmm0, 1(%rdi)
+; AVX-NEXT: vpextrb $1, %xmm0, 1(%rdi)
; AVX-NEXT: testb $4, %al
; AVX-NEXT: je .LBB14_6
; AVX-NEXT: .LBB14_5: # %cond.store3
-; AVX-NEXT: vpextrb $8, %xmm0, 2(%rdi)
+; AVX-NEXT: vpextrb $2, %xmm0, 2(%rdi)
; AVX-NEXT: testb $8, %al
; AVX-NEXT: je .LBB14_8
; AVX-NEXT: .LBB14_7: # %cond.store5
-; AVX-NEXT: vpextrb $12, %xmm0, 3(%rdi)
+; AVX-NEXT: vpextrb $3, %xmm0, 3(%rdi)
; AVX-NEXT: retq
;
; AVX512F-LABEL: truncstore_v4i32_v4i8:
; AVX512F: # %bb.0:
; AVX512F-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k0
+; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512F-NEXT: kmovw %k0, %eax
; AVX512F-NEXT: testb $1, %al
; AVX512F-NEXT: jne .LBB14_1
; AVX512F-NEXT: testb $2, %al
; AVX512F-NEXT: je .LBB14_4
; AVX512F-NEXT: .LBB14_3: # %cond.store1
-; AVX512F-NEXT: vpextrb $4, %xmm0, 1(%rdi)
+; AVX512F-NEXT: vpextrb $1, %xmm0, 1(%rdi)
; AVX512F-NEXT: testb $4, %al
; AVX512F-NEXT: je .LBB14_6
; AVX512F-NEXT: .LBB14_5: # %cond.store3
-; AVX512F-NEXT: vpextrb $8, %xmm0, 2(%rdi)
+; AVX512F-NEXT: vpextrb $2, %xmm0, 2(%rdi)
; AVX512F-NEXT: testb $8, %al
; AVX512F-NEXT: je .LBB14_8
; AVX512F-NEXT: .LBB14_7: # %cond.store5
-; AVX512F-NEXT: vpextrb $12, %xmm0, 3(%rdi)
+; AVX512F-NEXT: vpextrb $3, %xmm0, 3(%rdi)
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
; AVX512BW-NEXT: vptestmd %zmm1, %zmm1, %k0
-; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512BW-NEXT: kshiftlq $60, %k0, %k0
; AVX512BW-NEXT: kshiftrq $60, %k0, %k1
+; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512BW-NEXT: vmovdqu8 %zmm0, (%rdi) {%k1}
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
; SSE2-LABEL: truncstore_v8i16_v8i8:
; SSE2: # %bb.0:
; SSE2-NEXT: pxor %xmm2, %xmm2
+; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
+; SSE2-NEXT: packuswb %xmm0, %xmm0
; SSE2-NEXT: pcmpeqw %xmm1, %xmm2
; SSE2-NEXT: pcmpeqd %xmm1, %xmm1
; SSE2-NEXT: pxor %xmm2, %xmm1
; SSE2-NEXT: jne .LBB17_5
; SSE2-NEXT: .LBB17_6: # %else4
; SSE2-NEXT: testb $8, %al
-; SSE2-NEXT: jne .LBB17_7
+; SSE2-NEXT: je .LBB17_8
+; SSE2-NEXT: .LBB17_7: # %cond.store5
+; SSE2-NEXT: shrl $24, %ecx
+; SSE2-NEXT: movb %cl, 3(%rdi)
; SSE2-NEXT: .LBB17_8: # %else6
; SSE2-NEXT: testb $16, %al
-; SSE2-NEXT: jne .LBB17_9
+; SSE2-NEXT: pextrw $2, %xmm0, %ecx
+; SSE2-NEXT: je .LBB17_10
+; SSE2-NEXT: # %bb.9: # %cond.store7
+; SSE2-NEXT: movb %cl, 4(%rdi)
; SSE2-NEXT: .LBB17_10: # %else8
; SSE2-NEXT: testb $32, %al
-; SSE2-NEXT: jne .LBB17_11
+; SSE2-NEXT: je .LBB17_12
+; SSE2-NEXT: # %bb.11: # %cond.store9
+; SSE2-NEXT: movb %ch, 5(%rdi)
; SSE2-NEXT: .LBB17_12: # %else10
; SSE2-NEXT: testb $64, %al
+; SSE2-NEXT: pextrw $3, %xmm0, %ecx
; SSE2-NEXT: jne .LBB17_13
-; SSE2-NEXT: .LBB17_14: # %else12
+; SSE2-NEXT: # %bb.14: # %else12
; SSE2-NEXT: testb $-128, %al
; SSE2-NEXT: jne .LBB17_15
; SSE2-NEXT: .LBB17_16: # %else14
; SSE2-NEXT: testb $2, %al
; SSE2-NEXT: je .LBB17_4
; SSE2-NEXT: .LBB17_3: # %cond.store1
-; SSE2-NEXT: shrl $16, %ecx
-; SSE2-NEXT: movb %cl, 1(%rdi)
+; SSE2-NEXT: movb %ch, 1(%rdi)
; SSE2-NEXT: testb $4, %al
; SSE2-NEXT: je .LBB17_6
; SSE2-NEXT: .LBB17_5: # %cond.store3
-; SSE2-NEXT: pextrw $2, %xmm0, %ecx
-; SSE2-NEXT: movb %cl, 2(%rdi)
+; SSE2-NEXT: movl %ecx, %edx
+; SSE2-NEXT: shrl $16, %edx
+; SSE2-NEXT: movb %dl, 2(%rdi)
; SSE2-NEXT: testb $8, %al
-; SSE2-NEXT: je .LBB17_8
-; SSE2-NEXT: .LBB17_7: # %cond.store5
-; SSE2-NEXT: pextrw $3, %xmm0, %ecx
-; SSE2-NEXT: movb %cl, 3(%rdi)
-; SSE2-NEXT: testb $16, %al
-; SSE2-NEXT: je .LBB17_10
-; SSE2-NEXT: .LBB17_9: # %cond.store7
-; SSE2-NEXT: pextrw $4, %xmm0, %ecx
-; SSE2-NEXT: movb %cl, 4(%rdi)
-; SSE2-NEXT: testb $32, %al
-; SSE2-NEXT: je .LBB17_12
-; SSE2-NEXT: .LBB17_11: # %cond.store9
-; SSE2-NEXT: pextrw $5, %xmm0, %ecx
-; SSE2-NEXT: movb %cl, 5(%rdi)
-; SSE2-NEXT: testb $64, %al
-; SSE2-NEXT: je .LBB17_14
+; SSE2-NEXT: jne .LBB17_7
+; SSE2-NEXT: jmp .LBB17_8
; SSE2-NEXT: .LBB17_13: # %cond.store11
-; SSE2-NEXT: pextrw $6, %xmm0, %ecx
; SSE2-NEXT: movb %cl, 6(%rdi)
; SSE2-NEXT: testb $-128, %al
; SSE2-NEXT: je .LBB17_16
; SSE2-NEXT: .LBB17_15: # %cond.store13
-; SSE2-NEXT: pextrw $7, %xmm0, %eax
-; SSE2-NEXT: movb %al, 7(%rdi)
+; SSE2-NEXT: movb %ch, 7(%rdi)
; SSE2-NEXT: retq
;
; SSE4-LABEL: truncstore_v8i16_v8i8:
; SSE4: # %bb.0:
; SSE4-NEXT: pxor %xmm2, %xmm2
+; SSE4-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
; SSE4-NEXT: pcmpeqw %xmm1, %xmm2
; SSE4-NEXT: pcmpeqd %xmm1, %xmm1
; SSE4-NEXT: pxor %xmm2, %xmm1
; SSE4-NEXT: testb $2, %al
; SSE4-NEXT: je .LBB17_4
; SSE4-NEXT: .LBB17_3: # %cond.store1
-; SSE4-NEXT: pextrb $2, %xmm0, 1(%rdi)
+; SSE4-NEXT: pextrb $1, %xmm0, 1(%rdi)
; SSE4-NEXT: testb $4, %al
; SSE4-NEXT: je .LBB17_6
; SSE4-NEXT: .LBB17_5: # %cond.store3
-; SSE4-NEXT: pextrb $4, %xmm0, 2(%rdi)
+; SSE4-NEXT: pextrb $2, %xmm0, 2(%rdi)
; SSE4-NEXT: testb $8, %al
; SSE4-NEXT: je .LBB17_8
; SSE4-NEXT: .LBB17_7: # %cond.store5
-; SSE4-NEXT: pextrb $6, %xmm0, 3(%rdi)
+; SSE4-NEXT: pextrb $3, %xmm0, 3(%rdi)
; SSE4-NEXT: testb $16, %al
; SSE4-NEXT: je .LBB17_10
; SSE4-NEXT: .LBB17_9: # %cond.store7
-; SSE4-NEXT: pextrb $8, %xmm0, 4(%rdi)
+; SSE4-NEXT: pextrb $4, %xmm0, 4(%rdi)
; SSE4-NEXT: testb $32, %al
; SSE4-NEXT: je .LBB17_12
; SSE4-NEXT: .LBB17_11: # %cond.store9
-; SSE4-NEXT: pextrb $10, %xmm0, 5(%rdi)
+; SSE4-NEXT: pextrb $5, %xmm0, 5(%rdi)
; SSE4-NEXT: testb $64, %al
; SSE4-NEXT: je .LBB17_14
; SSE4-NEXT: .LBB17_13: # %cond.store11
-; SSE4-NEXT: pextrb $12, %xmm0, 6(%rdi)
+; SSE4-NEXT: pextrb $6, %xmm0, 6(%rdi)
; SSE4-NEXT: testb $-128, %al
; SSE4-NEXT: je .LBB17_16
; SSE4-NEXT: .LBB17_15: # %cond.store13
-; SSE4-NEXT: pextrb $14, %xmm0, 7(%rdi)
+; SSE4-NEXT: pextrb $7, %xmm0, 7(%rdi)
; SSE4-NEXT: retq
;
; AVX-LABEL: truncstore_v8i16_v8i8:
; AVX: # %bb.0:
; AVX-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
; AVX-NEXT: vpcmpeqw %xmm2, %xmm1, %xmm1
; AVX-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
; AVX-NEXT: vpxor %xmm2, %xmm1, %xmm1
; AVX-NEXT: testb $2, %al
; AVX-NEXT: je .LBB17_4
; AVX-NEXT: .LBB17_3: # %cond.store1
-; AVX-NEXT: vpextrb $2, %xmm0, 1(%rdi)
+; AVX-NEXT: vpextrb $1, %xmm0, 1(%rdi)
; AVX-NEXT: testb $4, %al
; AVX-NEXT: je .LBB17_6
; AVX-NEXT: .LBB17_5: # %cond.store3
-; AVX-NEXT: vpextrb $4, %xmm0, 2(%rdi)
+; AVX-NEXT: vpextrb $2, %xmm0, 2(%rdi)
; AVX-NEXT: testb $8, %al
; AVX-NEXT: je .LBB17_8
; AVX-NEXT: .LBB17_7: # %cond.store5
-; AVX-NEXT: vpextrb $6, %xmm0, 3(%rdi)
+; AVX-NEXT: vpextrb $3, %xmm0, 3(%rdi)
; AVX-NEXT: testb $16, %al
; AVX-NEXT: je .LBB17_10
; AVX-NEXT: .LBB17_9: # %cond.store7
-; AVX-NEXT: vpextrb $8, %xmm0, 4(%rdi)
+; AVX-NEXT: vpextrb $4, %xmm0, 4(%rdi)
; AVX-NEXT: testb $32, %al
; AVX-NEXT: je .LBB17_12
; AVX-NEXT: .LBB17_11: # %cond.store9
-; AVX-NEXT: vpextrb $10, %xmm0, 5(%rdi)
+; AVX-NEXT: vpextrb $5, %xmm0, 5(%rdi)
; AVX-NEXT: testb $64, %al
; AVX-NEXT: je .LBB17_14
; AVX-NEXT: .LBB17_13: # %cond.store11
-; AVX-NEXT: vpextrb $12, %xmm0, 6(%rdi)
+; AVX-NEXT: vpextrb $6, %xmm0, 6(%rdi)
; AVX-NEXT: testb $-128, %al
; AVX-NEXT: je .LBB17_16
; AVX-NEXT: .LBB17_15: # %cond.store13
-; AVX-NEXT: vpextrb $14, %xmm0, 7(%rdi)
+; AVX-NEXT: vpextrb $7, %xmm0, 7(%rdi)
; AVX-NEXT: retq
;
; AVX512F-LABEL: truncstore_v8i16_v8i8:
; AVX512F-NEXT: vpternlogq $15, %zmm1, %zmm1, %zmm1
; AVX512F-NEXT: vpmovsxwq %xmm1, %zmm1
; AVX512F-NEXT: vptestmq %zmm1, %zmm1, %k0
+; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
; AVX512F-NEXT: kmovw %k0, %eax
; AVX512F-NEXT: testb $1, %al
; AVX512F-NEXT: jne .LBB17_1
; AVX512F-NEXT: testb $2, %al
; AVX512F-NEXT: je .LBB17_4
; AVX512F-NEXT: .LBB17_3: # %cond.store1
-; AVX512F-NEXT: vpextrb $2, %xmm0, 1(%rdi)
+; AVX512F-NEXT: vpextrb $1, %xmm0, 1(%rdi)
; AVX512F-NEXT: testb $4, %al
; AVX512F-NEXT: je .LBB17_6
; AVX512F-NEXT: .LBB17_5: # %cond.store3
-; AVX512F-NEXT: vpextrb $4, %xmm0, 2(%rdi)
+; AVX512F-NEXT: vpextrb $2, %xmm0, 2(%rdi)
; AVX512F-NEXT: testb $8, %al
; AVX512F-NEXT: je .LBB17_8
; AVX512F-NEXT: .LBB17_7: # %cond.store5
-; AVX512F-NEXT: vpextrb $6, %xmm0, 3(%rdi)
+; AVX512F-NEXT: vpextrb $3, %xmm0, 3(%rdi)
; AVX512F-NEXT: testb $16, %al
; AVX512F-NEXT: je .LBB17_10
; AVX512F-NEXT: .LBB17_9: # %cond.store7
-; AVX512F-NEXT: vpextrb $8, %xmm0, 4(%rdi)
+; AVX512F-NEXT: vpextrb $4, %xmm0, 4(%rdi)
; AVX512F-NEXT: testb $32, %al
; AVX512F-NEXT: je .LBB17_12
; AVX512F-NEXT: .LBB17_11: # %cond.store9
-; AVX512F-NEXT: vpextrb $10, %xmm0, 5(%rdi)
+; AVX512F-NEXT: vpextrb $5, %xmm0, 5(%rdi)
; AVX512F-NEXT: testb $64, %al
; AVX512F-NEXT: je .LBB17_14
; AVX512F-NEXT: .LBB17_13: # %cond.store11
-; AVX512F-NEXT: vpextrb $12, %xmm0, 6(%rdi)
+; AVX512F-NEXT: vpextrb $6, %xmm0, 6(%rdi)
; AVX512F-NEXT: testb $-128, %al
; AVX512F-NEXT: je .LBB17_16
; AVX512F-NEXT: .LBB17_15: # %cond.store13
-; AVX512F-NEXT: vpextrb $14, %xmm0, 7(%rdi)
+; AVX512F-NEXT: vpextrb $7, %xmm0, 7(%rdi)
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
; AVX512BW-NEXT: vptestmw %zmm1, %zmm1, %k0
-; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
; AVX512BW-NEXT: kshiftlq $56, %k0, %k0
; AVX512BW-NEXT: kshiftrq $56, %k0, %k1
+; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
; AVX512BW-NEXT: vmovdqu8 %zmm0, (%rdi) {%k1}
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
; SSE2-NEXT: pxor %xmm8, %xmm8
; SSE2-NEXT: movdqa {{.*#+}} xmm9 = [127,127]
; SSE2-NEXT: movdqa {{.*#+}} xmm11 = [2147483648,2147483648]
-; SSE2-NEXT: movdqa %xmm2, %xmm6
+; SSE2-NEXT: movdqa %xmm3, %xmm6
; SSE2-NEXT: pxor %xmm11, %xmm6
; SSE2-NEXT: movdqa {{.*#+}} xmm10 = [2147483775,2147483775]
; SSE2-NEXT: movdqa %xmm10, %xmm7
; SSE2-NEXT: pand %xmm12, %xmm6
; SSE2-NEXT: pshufd {{.*#+}} xmm13 = xmm7[1,1,3,3]
; SSE2-NEXT: por %xmm6, %xmm13
-; SSE2-NEXT: pand %xmm13, %xmm2
+; SSE2-NEXT: pand %xmm13, %xmm3
; SSE2-NEXT: pandn %xmm9, %xmm13
-; SSE2-NEXT: por %xmm2, %xmm13
-; SSE2-NEXT: movdqa %xmm3, %xmm2
-; SSE2-NEXT: pxor %xmm11, %xmm2
-; SSE2-NEXT: movdqa %xmm10, %xmm6
-; SSE2-NEXT: pcmpgtd %xmm2, %xmm6
-; SSE2-NEXT: pshufd {{.*#+}} xmm12 = xmm6[0,0,2,2]
-; SSE2-NEXT: pcmpeqd %xmm10, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm2[1,1,3,3]
-; SSE2-NEXT: pand %xmm12, %xmm7
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm6[1,1,3,3]
-; SSE2-NEXT: por %xmm7, %xmm2
-; SSE2-NEXT: pand %xmm2, %xmm3
-; SSE2-NEXT: pandn %xmm9, %xmm2
-; SSE2-NEXT: por %xmm3, %xmm2
-; SSE2-NEXT: movdqa %xmm0, %xmm3
+; SSE2-NEXT: por %xmm3, %xmm13
+; SSE2-NEXT: movdqa %xmm2, %xmm3
; SSE2-NEXT: pxor %xmm11, %xmm3
; SSE2-NEXT: movdqa %xmm10, %xmm6
; SSE2-NEXT: pcmpgtd %xmm3, %xmm6
; SSE2-NEXT: pand %xmm12, %xmm7
; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm6[1,1,3,3]
; SSE2-NEXT: por %xmm7, %xmm3
-; SSE2-NEXT: pand %xmm3, %xmm0
+; SSE2-NEXT: pand %xmm3, %xmm2
; SSE2-NEXT: pandn %xmm9, %xmm3
-; SSE2-NEXT: por %xmm0, %xmm3
-; SSE2-NEXT: movdqa %xmm1, %xmm0
-; SSE2-NEXT: pxor %xmm11, %xmm0
+; SSE2-NEXT: por %xmm2, %xmm3
+; SSE2-NEXT: movdqa %xmm1, %xmm2
+; SSE2-NEXT: pxor %xmm11, %xmm2
; SSE2-NEXT: movdqa %xmm10, %xmm6
-; SSE2-NEXT: pcmpgtd %xmm0, %xmm6
+; SSE2-NEXT: pcmpgtd %xmm2, %xmm6
+; SSE2-NEXT: pshufd {{.*#+}} xmm12 = xmm6[0,0,2,2]
+; SSE2-NEXT: pcmpeqd %xmm10, %xmm2
+; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm2[1,1,3,3]
+; SSE2-NEXT: pand %xmm12, %xmm7
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm6[1,1,3,3]
+; SSE2-NEXT: por %xmm7, %xmm2
+; SSE2-NEXT: pand %xmm2, %xmm1
+; SSE2-NEXT: pandn %xmm9, %xmm2
+; SSE2-NEXT: por %xmm1, %xmm2
+; SSE2-NEXT: movdqa %xmm0, %xmm1
+; SSE2-NEXT: pxor %xmm11, %xmm1
+; SSE2-NEXT: movdqa %xmm10, %xmm6
+; SSE2-NEXT: pcmpgtd %xmm1, %xmm6
; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2]
-; SSE2-NEXT: pcmpeqd %xmm10, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE2-NEXT: pand %xmm7, %xmm0
+; SSE2-NEXT: pcmpeqd %xmm10, %xmm1
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; SSE2-NEXT: pand %xmm7, %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3]
-; SSE2-NEXT: por %xmm0, %xmm6
-; SSE2-NEXT: pand %xmm6, %xmm1
-; SSE2-NEXT: pandn %xmm9, %xmm6
; SSE2-NEXT: por %xmm1, %xmm6
+; SSE2-NEXT: pand %xmm6, %xmm0
+; SSE2-NEXT: pandn %xmm9, %xmm6
+; SSE2-NEXT: por %xmm0, %xmm6
; SSE2-NEXT: movdqa {{.*#+}} xmm9 = [18446744073709551488,18446744073709551488]
; SSE2-NEXT: movdqa %xmm6, %xmm0
; SSE2-NEXT: pxor %xmm11, %xmm0
; SSE2-NEXT: movdqa {{.*#+}} xmm10 = [18446744071562067840,18446744071562067840]
-; SSE2-NEXT: movdqa %xmm0, %xmm7
-; SSE2-NEXT: pcmpgtd %xmm10, %xmm7
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm7[0,0,2,2]
-; SSE2-NEXT: pcmpeqd %xmm10, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE2-NEXT: pand %xmm1, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm7[1,1,3,3]
-; SSE2-NEXT: por %xmm0, %xmm1
-; SSE2-NEXT: pand %xmm1, %xmm6
-; SSE2-NEXT: pandn %xmm9, %xmm1
-; SSE2-NEXT: por %xmm6, %xmm1
-; SSE2-NEXT: movdqa %xmm3, %xmm0
-; SSE2-NEXT: pxor %xmm11, %xmm0
-; SSE2-NEXT: movdqa %xmm0, %xmm6
-; SSE2-NEXT: pcmpgtd %xmm10, %xmm6
-; SSE2-NEXT: pshufd {{.*#+}} xmm12 = xmm6[0,0,2,2]
+; SSE2-NEXT: movdqa %xmm0, %xmm1
+; SSE2-NEXT: pcmpgtd %xmm10, %xmm1
+; SSE2-NEXT: pshufd {{.*#+}} xmm12 = xmm1[0,0,2,2]
; SSE2-NEXT: pcmpeqd %xmm10, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm0[1,1,3,3]
; SSE2-NEXT: pand %xmm12, %xmm7
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm6[1,1,3,3]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3]
; SSE2-NEXT: por %xmm7, %xmm0
-; SSE2-NEXT: pand %xmm0, %xmm3
+; SSE2-NEXT: pand %xmm0, %xmm6
; SSE2-NEXT: pandn %xmm9, %xmm0
-; SSE2-NEXT: por %xmm3, %xmm0
-; SSE2-NEXT: packssdw %xmm1, %xmm0
+; SSE2-NEXT: por %xmm6, %xmm0
; SSE2-NEXT: movdqa %xmm2, %xmm1
; SSE2-NEXT: pxor %xmm11, %xmm1
-; SSE2-NEXT: movdqa %xmm1, %xmm3
+; SSE2-NEXT: movdqa %xmm1, %xmm6
+; SSE2-NEXT: pcmpgtd %xmm10, %xmm6
+; SSE2-NEXT: pshufd {{.*#+}} xmm12 = xmm6[0,0,2,2]
+; SSE2-NEXT: pcmpeqd %xmm10, %xmm1
+; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm1[1,1,3,3]
+; SSE2-NEXT: pand %xmm12, %xmm7
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm6[1,1,3,3]
+; SSE2-NEXT: por %xmm7, %xmm1
+; SSE2-NEXT: pand %xmm1, %xmm2
+; SSE2-NEXT: pandn %xmm9, %xmm1
+; SSE2-NEXT: por %xmm2, %xmm1
+; SSE2-NEXT: movdqa %xmm3, %xmm2
+; SSE2-NEXT: pxor %xmm11, %xmm2
+; SSE2-NEXT: movdqa %xmm2, %xmm6
+; SSE2-NEXT: pcmpgtd %xmm10, %xmm6
+; SSE2-NEXT: pshufd {{.*#+}} xmm12 = xmm6[0,0,2,2]
+; SSE2-NEXT: pcmpeqd %xmm10, %xmm2
+; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm2[1,1,3,3]
+; SSE2-NEXT: pand %xmm12, %xmm7
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm6[1,1,3,3]
+; SSE2-NEXT: por %xmm7, %xmm2
+; SSE2-NEXT: pand %xmm2, %xmm3
+; SSE2-NEXT: pandn %xmm9, %xmm2
+; SSE2-NEXT: por %xmm3, %xmm2
+; SSE2-NEXT: pxor %xmm13, %xmm11
+; SSE2-NEXT: movdqa %xmm11, %xmm3
; SSE2-NEXT: pcmpgtd %xmm10, %xmm3
; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm3[0,0,2,2]
-; SSE2-NEXT: pcmpeqd %xmm10, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSE2-NEXT: pand %xmm6, %xmm1
+; SSE2-NEXT: pcmpeqd %xmm10, %xmm11
+; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm11[1,1,3,3]
+; SSE2-NEXT: pand %xmm6, %xmm7
; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
-; SSE2-NEXT: por %xmm1, %xmm3
-; SSE2-NEXT: pand %xmm3, %xmm2
+; SSE2-NEXT: por %xmm7, %xmm3
+; SSE2-NEXT: pand %xmm3, %xmm13
; SSE2-NEXT: pandn %xmm9, %xmm3
-; SSE2-NEXT: por %xmm2, %xmm3
-; SSE2-NEXT: pxor %xmm13, %xmm11
-; SSE2-NEXT: movdqa %xmm11, %xmm1
-; SSE2-NEXT: pcmpgtd %xmm10, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,0,2,2]
-; SSE2-NEXT: pcmpeqd %xmm10, %xmm11
-; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm11[1,1,3,3]
-; SSE2-NEXT: pand %xmm2, %xmm6
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSE2-NEXT: por %xmm6, %xmm1
-; SSE2-NEXT: pand %xmm1, %xmm13
-; SSE2-NEXT: pandn %xmm9, %xmm1
-; SSE2-NEXT: por %xmm13, %xmm1
-; SSE2-NEXT: packssdw %xmm3, %xmm1
-; SSE2-NEXT: packssdw %xmm1, %xmm0
+; SSE2-NEXT: por %xmm13, %xmm3
+; SSE2-NEXT: movdqa {{.*#+}} xmm6 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
+; SSE2-NEXT: pand %xmm6, %xmm3
+; SSE2-NEXT: pand %xmm6, %xmm2
+; SSE2-NEXT: packuswb %xmm3, %xmm2
+; SSE2-NEXT: pand %xmm6, %xmm1
+; SSE2-NEXT: pand %xmm6, %xmm0
+; SSE2-NEXT: packuswb %xmm1, %xmm0
+; SSE2-NEXT: packuswb %xmm2, %xmm0
+; SSE2-NEXT: packuswb %xmm0, %xmm0
; SSE2-NEXT: pcmpeqd %xmm8, %xmm5
; SSE2-NEXT: pcmpeqd %xmm1, %xmm1
; SSE2-NEXT: pxor %xmm1, %xmm5
; SSE2-NEXT: jne .LBB2_5
; SSE2-NEXT: .LBB2_6: # %else4
; SSE2-NEXT: testb $8, %al
-; SSE2-NEXT: jne .LBB2_7
+; SSE2-NEXT: je .LBB2_8
+; SSE2-NEXT: .LBB2_7: # %cond.store5
+; SSE2-NEXT: shrl $24, %ecx
+; SSE2-NEXT: movb %cl, 3(%rdi)
; SSE2-NEXT: .LBB2_8: # %else6
; SSE2-NEXT: testb $16, %al
-; SSE2-NEXT: jne .LBB2_9
+; SSE2-NEXT: pextrw $2, %xmm0, %ecx
+; SSE2-NEXT: je .LBB2_10
+; SSE2-NEXT: # %bb.9: # %cond.store7
+; SSE2-NEXT: movb %cl, 4(%rdi)
; SSE2-NEXT: .LBB2_10: # %else8
; SSE2-NEXT: testb $32, %al
-; SSE2-NEXT: jne .LBB2_11
+; SSE2-NEXT: je .LBB2_12
+; SSE2-NEXT: # %bb.11: # %cond.store9
+; SSE2-NEXT: movb %ch, 5(%rdi)
; SSE2-NEXT: .LBB2_12: # %else10
; SSE2-NEXT: testb $64, %al
+; SSE2-NEXT: pextrw $3, %xmm0, %ecx
; SSE2-NEXT: jne .LBB2_13
-; SSE2-NEXT: .LBB2_14: # %else12
+; SSE2-NEXT: # %bb.14: # %else12
; SSE2-NEXT: testb $-128, %al
; SSE2-NEXT: jne .LBB2_15
; SSE2-NEXT: .LBB2_16: # %else14
; SSE2-NEXT: testb $2, %al
; SSE2-NEXT: je .LBB2_4
; SSE2-NEXT: .LBB2_3: # %cond.store1
-; SSE2-NEXT: shrl $16, %ecx
-; SSE2-NEXT: movb %cl, 1(%rdi)
+; SSE2-NEXT: movb %ch, 1(%rdi)
; SSE2-NEXT: testb $4, %al
; SSE2-NEXT: je .LBB2_6
; SSE2-NEXT: .LBB2_5: # %cond.store3
-; SSE2-NEXT: pextrw $2, %xmm0, %ecx
-; SSE2-NEXT: movb %cl, 2(%rdi)
+; SSE2-NEXT: movl %ecx, %edx
+; SSE2-NEXT: shrl $16, %edx
+; SSE2-NEXT: movb %dl, 2(%rdi)
; SSE2-NEXT: testb $8, %al
-; SSE2-NEXT: je .LBB2_8
-; SSE2-NEXT: .LBB2_7: # %cond.store5
-; SSE2-NEXT: pextrw $3, %xmm0, %ecx
-; SSE2-NEXT: movb %cl, 3(%rdi)
-; SSE2-NEXT: testb $16, %al
-; SSE2-NEXT: je .LBB2_10
-; SSE2-NEXT: .LBB2_9: # %cond.store7
-; SSE2-NEXT: pextrw $4, %xmm0, %ecx
-; SSE2-NEXT: movb %cl, 4(%rdi)
-; SSE2-NEXT: testb $32, %al
-; SSE2-NEXT: je .LBB2_12
-; SSE2-NEXT: .LBB2_11: # %cond.store9
-; SSE2-NEXT: pextrw $5, %xmm0, %ecx
-; SSE2-NEXT: movb %cl, 5(%rdi)
-; SSE2-NEXT: testb $64, %al
-; SSE2-NEXT: je .LBB2_14
+; SSE2-NEXT: jne .LBB2_7
+; SSE2-NEXT: jmp .LBB2_8
; SSE2-NEXT: .LBB2_13: # %cond.store11
-; SSE2-NEXT: pextrw $6, %xmm0, %ecx
; SSE2-NEXT: movb %cl, 6(%rdi)
; SSE2-NEXT: testb $-128, %al
; SSE2-NEXT: je .LBB2_16
; SSE2-NEXT: .LBB2_15: # %cond.store13
-; SSE2-NEXT: pextrw $7, %xmm0, %eax
-; SSE2-NEXT: movb %al, 7(%rdi)
+; SSE2-NEXT: movb %ch, 7(%rdi)
; SSE2-NEXT: retq
;
; SSE4-LABEL: truncstore_v8i64_v8i8:
; SSE4-NEXT: pxor %xmm8, %xmm8
; SSE4-NEXT: movdqa {{.*#+}} xmm7 = [127,127]
; SSE4-NEXT: movdqa %xmm7, %xmm0
-; SSE4-NEXT: pcmpgtq %xmm2, %xmm0
-; SSE4-NEXT: movdqa %xmm7, %xmm10
-; SSE4-NEXT: blendvpd %xmm0, %xmm2, %xmm10
-; SSE4-NEXT: movdqa %xmm7, %xmm0
; SSE4-NEXT: pcmpgtq %xmm3, %xmm0
-; SSE4-NEXT: movdqa %xmm7, %xmm2
-; SSE4-NEXT: blendvpd %xmm0, %xmm3, %xmm2
+; SSE4-NEXT: movdqa %xmm7, %xmm10
+; SSE4-NEXT: blendvpd %xmm0, %xmm3, %xmm10
; SSE4-NEXT: movdqa %xmm7, %xmm0
-; SSE4-NEXT: pcmpgtq %xmm9, %xmm0
+; SSE4-NEXT: pcmpgtq %xmm2, %xmm0
; SSE4-NEXT: movdqa %xmm7, %xmm3
-; SSE4-NEXT: blendvpd %xmm0, %xmm9, %xmm3
+; SSE4-NEXT: blendvpd %xmm0, %xmm2, %xmm3
; SSE4-NEXT: movdqa %xmm7, %xmm0
; SSE4-NEXT: pcmpgtq %xmm1, %xmm0
-; SSE4-NEXT: blendvpd %xmm0, %xmm1, %xmm7
+; SSE4-NEXT: movdqa %xmm7, %xmm2
+; SSE4-NEXT: blendvpd %xmm0, %xmm1, %xmm2
+; SSE4-NEXT: movdqa %xmm7, %xmm0
+; SSE4-NEXT: pcmpgtq %xmm9, %xmm0
+; SSE4-NEXT: blendvpd %xmm0, %xmm9, %xmm7
; SSE4-NEXT: movdqa {{.*#+}} xmm1 = [18446744073709551488,18446744073709551488]
; SSE4-NEXT: movapd %xmm7, %xmm0
; SSE4-NEXT: pcmpgtq %xmm1, %xmm0
; SSE4-NEXT: movdqa %xmm1, %xmm6
; SSE4-NEXT: blendvpd %xmm0, %xmm7, %xmm6
-; SSE4-NEXT: movapd %xmm3, %xmm0
+; SSE4-NEXT: movapd %xmm2, %xmm0
; SSE4-NEXT: pcmpgtq %xmm1, %xmm0
; SSE4-NEXT: movdqa %xmm1, %xmm7
-; SSE4-NEXT: blendvpd %xmm0, %xmm3, %xmm7
-; SSE4-NEXT: packssdw %xmm6, %xmm7
-; SSE4-NEXT: movapd %xmm2, %xmm0
+; SSE4-NEXT: blendvpd %xmm0, %xmm2, %xmm7
+; SSE4-NEXT: movapd %xmm3, %xmm0
; SSE4-NEXT: pcmpgtq %xmm1, %xmm0
-; SSE4-NEXT: movdqa %xmm1, %xmm3
-; SSE4-NEXT: blendvpd %xmm0, %xmm2, %xmm3
+; SSE4-NEXT: movdqa %xmm1, %xmm2
+; SSE4-NEXT: blendvpd %xmm0, %xmm3, %xmm2
; SSE4-NEXT: movapd %xmm10, %xmm0
; SSE4-NEXT: pcmpgtq %xmm1, %xmm0
; SSE4-NEXT: blendvpd %xmm0, %xmm10, %xmm1
-; SSE4-NEXT: packssdw %xmm3, %xmm1
-; SSE4-NEXT: packssdw %xmm1, %xmm7
+; SSE4-NEXT: movapd {{.*#+}} xmm0 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
+; SSE4-NEXT: andpd %xmm0, %xmm1
+; SSE4-NEXT: andpd %xmm0, %xmm2
+; SSE4-NEXT: packusdw %xmm1, %xmm2
+; SSE4-NEXT: andpd %xmm0, %xmm7
+; SSE4-NEXT: andpd %xmm0, %xmm6
+; SSE4-NEXT: packusdw %xmm7, %xmm6
+; SSE4-NEXT: packusdw %xmm2, %xmm6
+; SSE4-NEXT: packuswb %xmm6, %xmm6
; SSE4-NEXT: pcmpeqd %xmm8, %xmm5
; SSE4-NEXT: pcmpeqd %xmm0, %xmm0
; SSE4-NEXT: pxor %xmm0, %xmm5
; SSE4-NEXT: .LBB2_16: # %else14
; SSE4-NEXT: retq
; SSE4-NEXT: .LBB2_1: # %cond.store
-; SSE4-NEXT: pextrb $0, %xmm7, (%rdi)
+; SSE4-NEXT: pextrb $0, %xmm6, (%rdi)
; SSE4-NEXT: testb $2, %al
; SSE4-NEXT: je .LBB2_4
; SSE4-NEXT: .LBB2_3: # %cond.store1
-; SSE4-NEXT: pextrb $2, %xmm7, 1(%rdi)
+; SSE4-NEXT: pextrb $1, %xmm6, 1(%rdi)
; SSE4-NEXT: testb $4, %al
; SSE4-NEXT: je .LBB2_6
; SSE4-NEXT: .LBB2_5: # %cond.store3
-; SSE4-NEXT: pextrb $4, %xmm7, 2(%rdi)
+; SSE4-NEXT: pextrb $2, %xmm6, 2(%rdi)
; SSE4-NEXT: testb $8, %al
; SSE4-NEXT: je .LBB2_8
; SSE4-NEXT: .LBB2_7: # %cond.store5
-; SSE4-NEXT: pextrb $6, %xmm7, 3(%rdi)
+; SSE4-NEXT: pextrb $3, %xmm6, 3(%rdi)
; SSE4-NEXT: testb $16, %al
; SSE4-NEXT: je .LBB2_10
; SSE4-NEXT: .LBB2_9: # %cond.store7
-; SSE4-NEXT: pextrb $8, %xmm7, 4(%rdi)
+; SSE4-NEXT: pextrb $4, %xmm6, 4(%rdi)
; SSE4-NEXT: testb $32, %al
; SSE4-NEXT: je .LBB2_12
; SSE4-NEXT: .LBB2_11: # %cond.store9
-; SSE4-NEXT: pextrb $10, %xmm7, 5(%rdi)
+; SSE4-NEXT: pextrb $5, %xmm6, 5(%rdi)
; SSE4-NEXT: testb $64, %al
; SSE4-NEXT: je .LBB2_14
; SSE4-NEXT: .LBB2_13: # %cond.store11
-; SSE4-NEXT: pextrb $12, %xmm7, 6(%rdi)
+; SSE4-NEXT: pextrb $6, %xmm6, 6(%rdi)
; SSE4-NEXT: testb $-128, %al
; SSE4-NEXT: je .LBB2_16
; SSE4-NEXT: .LBB2_15: # %cond.store13
-; SSE4-NEXT: pextrb $14, %xmm7, 7(%rdi)
+; SSE4-NEXT: pextrb $7, %xmm6, 7(%rdi)
; SSE4-NEXT: retq
;
; AVX1-LABEL: truncstore_v8i64_v8i8:
; AVX1: # %bb.0:
-; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [127,127]
-; AVX1-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm8
-; AVX1-NEXT: vpcmpgtq %xmm1, %xmm4, %xmm9
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm7
-; AVX1-NEXT: vpcmpgtq %xmm7, %xmm4, %xmm5
-; AVX1-NEXT: vpcmpgtq %xmm0, %xmm4, %xmm6
-; AVX1-NEXT: vblendvpd %xmm6, %xmm0, %xmm4, %xmm0
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [18446744073709551488,18446744073709551488]
-; AVX1-NEXT: vpcmpgtq %xmm6, %xmm0, %xmm10
-; AVX1-NEXT: vblendvpd %xmm5, %xmm7, %xmm4, %xmm5
-; AVX1-NEXT: vpcmpgtq %xmm6, %xmm5, %xmm11
-; AVX1-NEXT: vblendvpd %xmm9, %xmm1, %xmm4, %xmm1
-; AVX1-NEXT: vpcmpgtq %xmm6, %xmm1, %xmm7
-; AVX1-NEXT: vblendvpd %xmm8, %xmm3, %xmm4, %xmm3
-; AVX1-NEXT: vpcmpgtq %xmm6, %xmm3, %xmm4
-; AVX1-NEXT: vblendvpd %xmm4, %xmm3, %xmm6, %xmm3
-; AVX1-NEXT: vblendvpd %xmm7, %xmm1, %xmm6, %xmm1
-; AVX1-NEXT: vpackssdw %xmm3, %xmm1, %xmm1
-; AVX1-NEXT: vblendvpd %xmm11, %xmm5, %xmm6, %xmm3
-; AVX1-NEXT: vblendvpd %xmm10, %xmm0, %xmm6, %xmm0
-; AVX1-NEXT: vpackssdw %xmm3, %xmm0, %xmm0
-; AVX1-NEXT: vpackssdw %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vmovapd {{.*#+}} ymm9 = [127,127,127,127]
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm10
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [127,127]
+; AVX1-NEXT: vpcmpgtq %xmm10, %xmm5, %xmm6
+; AVX1-NEXT: vpcmpgtq %xmm1, %xmm5, %xmm7
+; AVX1-NEXT: vinsertf128 $1, %xmm6, %ymm7, %ymm8
+; AVX1-NEXT: vblendvpd %ymm8, %ymm1, %ymm9, %ymm8
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
+; AVX1-NEXT: vpcmpgtq %xmm3, %xmm5, %xmm4
+; AVX1-NEXT: vpcmpgtq %xmm0, %xmm5, %xmm11
+; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm11, %ymm12
+; AVX1-NEXT: vblendvpd %ymm12, %ymm0, %ymm9, %ymm9
+; AVX1-NEXT: vmovapd {{.*#+}} ymm12 = [18446744073709551488,18446744073709551488,18446744073709551488,18446744073709551488]
+; AVX1-NEXT: vblendvpd %xmm4, %xmm3, %xmm5, %xmm3
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [18446744073709551488,18446744073709551488]
+; AVX1-NEXT: vpcmpgtq %xmm4, %xmm3, %xmm3
+; AVX1-NEXT: vblendvpd %xmm11, %xmm0, %xmm5, %xmm0
+; AVX1-NEXT: vpcmpgtq %xmm4, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0
+; AVX1-NEXT: vblendvpd %ymm0, %ymm9, %ymm12, %ymm0
+; AVX1-NEXT: vblendvpd %xmm6, %xmm10, %xmm5, %xmm3
+; AVX1-NEXT: vpcmpgtq %xmm4, %xmm3, %xmm3
+; AVX1-NEXT: vblendvpd %xmm7, %xmm1, %xmm5, %xmm1
+; AVX1-NEXT: vpcmpgtq %xmm4, %xmm1, %xmm1
+; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1
+; AVX1-NEXT: vblendvpd %ymm1, %ymm8, %ymm12, %ymm1
+; AVX1-NEXT: vmovapd {{.*#+}} ymm3 = [255,255,255,255]
+; AVX1-NEXT: vandpd %ymm3, %ymm1, %ymm1
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4
+; AVX1-NEXT: vpackusdw %xmm4, %xmm1, %xmm1
+; AVX1-NEXT: vandpd %ymm3, %ymm0, %ymm0
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
+; AVX1-NEXT: vpackusdw %xmm3, %xmm0, %xmm0
+; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpackuswb %xmm0, %xmm0, %xmm0
; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm1
; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
; AVX1-NEXT: vpcmpeqd %xmm3, %xmm1, %xmm1
; AVX1-NEXT: testb $2, %al
; AVX1-NEXT: je .LBB2_4
; AVX1-NEXT: .LBB2_3: # %cond.store1
-; AVX1-NEXT: vpextrb $2, %xmm0, 1(%rdi)
+; AVX1-NEXT: vpextrb $1, %xmm0, 1(%rdi)
; AVX1-NEXT: testb $4, %al
; AVX1-NEXT: je .LBB2_6
; AVX1-NEXT: .LBB2_5: # %cond.store3
-; AVX1-NEXT: vpextrb $4, %xmm0, 2(%rdi)
+; AVX1-NEXT: vpextrb $2, %xmm0, 2(%rdi)
; AVX1-NEXT: testb $8, %al
; AVX1-NEXT: je .LBB2_8
; AVX1-NEXT: .LBB2_7: # %cond.store5
-; AVX1-NEXT: vpextrb $6, %xmm0, 3(%rdi)
+; AVX1-NEXT: vpextrb $3, %xmm0, 3(%rdi)
; AVX1-NEXT: testb $16, %al
; AVX1-NEXT: je .LBB2_10
; AVX1-NEXT: .LBB2_9: # %cond.store7
-; AVX1-NEXT: vpextrb $8, %xmm0, 4(%rdi)
+; AVX1-NEXT: vpextrb $4, %xmm0, 4(%rdi)
; AVX1-NEXT: testb $32, %al
; AVX1-NEXT: je .LBB2_12
; AVX1-NEXT: .LBB2_11: # %cond.store9
-; AVX1-NEXT: vpextrb $10, %xmm0, 5(%rdi)
+; AVX1-NEXT: vpextrb $5, %xmm0, 5(%rdi)
; AVX1-NEXT: testb $64, %al
; AVX1-NEXT: je .LBB2_14
; AVX1-NEXT: .LBB2_13: # %cond.store11
-; AVX1-NEXT: vpextrb $12, %xmm0, 6(%rdi)
+; AVX1-NEXT: vpextrb $6, %xmm0, 6(%rdi)
; AVX1-NEXT: testb $-128, %al
; AVX1-NEXT: je .LBB2_16
; AVX1-NEXT: .LBB2_15: # %cond.store13
-; AVX1-NEXT: vpextrb $14, %xmm0, 7(%rdi)
+; AVX1-NEXT: vpextrb $7, %xmm0, 7(%rdi)
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2: # %bb.0:
; AVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3
; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm4 = [127,127,127,127]
-; AVX2-NEXT: vpcmpgtq %ymm0, %ymm4, %ymm5
-; AVX2-NEXT: vblendvpd %ymm5, %ymm0, %ymm4, %ymm0
; AVX2-NEXT: vpcmpgtq %ymm1, %ymm4, %ymm5
; AVX2-NEXT: vblendvpd %ymm5, %ymm1, %ymm4, %ymm1
+; AVX2-NEXT: vpcmpgtq %ymm0, %ymm4, %ymm5
+; AVX2-NEXT: vblendvpd %ymm5, %ymm0, %ymm4, %ymm0
; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm4 = [18446744073709551488,18446744073709551488,18446744073709551488,18446744073709551488]
-; AVX2-NEXT: vpcmpgtq %ymm4, %ymm1, %ymm5
-; AVX2-NEXT: vblendvpd %ymm5, %ymm1, %ymm4, %ymm1
; AVX2-NEXT: vpcmpgtq %ymm4, %ymm0, %ymm5
; AVX2-NEXT: vblendvpd %ymm5, %ymm0, %ymm4, %ymm0
-; AVX2-NEXT: vpackssdw %ymm1, %ymm0, %ymm0
-; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vpackssdw %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpcmpgtq %ymm4, %ymm1, %ymm5
+; AVX2-NEXT: vblendvpd %ymm5, %ymm1, %ymm4, %ymm1
+; AVX2-NEXT: vextractf128 $1, %ymm1, %xmm4
+; AVX2-NEXT: vmovdqa {{.*#+}} xmm5 = <u,u,0,8,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-NEXT: vpshufb %xmm5, %xmm4, %xmm4
+; AVX2-NEXT: vpshufb %xmm5, %xmm1, %xmm1
+; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3]
+; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm4
+; AVX2-NEXT: vmovdqa {{.*#+}} xmm5 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-NEXT: vpshufb %xmm5, %xmm4, %xmm4
+; AVX2-NEXT: vpshufb %xmm5, %xmm0, %xmm0
+; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3]
+; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
; AVX2-NEXT: vpcmpeqd %ymm3, %ymm2, %ymm1
; AVX2-NEXT: vmovmskps %ymm1, %eax
; AVX2-NEXT: notl %eax
; AVX2-NEXT: testb $2, %al
; AVX2-NEXT: je .LBB2_4
; AVX2-NEXT: .LBB2_3: # %cond.store1
-; AVX2-NEXT: vpextrb $2, %xmm0, 1(%rdi)
+; AVX2-NEXT: vpextrb $1, %xmm0, 1(%rdi)
; AVX2-NEXT: testb $4, %al
; AVX2-NEXT: je .LBB2_6
; AVX2-NEXT: .LBB2_5: # %cond.store3
-; AVX2-NEXT: vpextrb $4, %xmm0, 2(%rdi)
+; AVX2-NEXT: vpextrb $2, %xmm0, 2(%rdi)
; AVX2-NEXT: testb $8, %al
; AVX2-NEXT: je .LBB2_8
; AVX2-NEXT: .LBB2_7: # %cond.store5
-; AVX2-NEXT: vpextrb $6, %xmm0, 3(%rdi)
+; AVX2-NEXT: vpextrb $3, %xmm0, 3(%rdi)
; AVX2-NEXT: testb $16, %al
; AVX2-NEXT: je .LBB2_10
; AVX2-NEXT: .LBB2_9: # %cond.store7
-; AVX2-NEXT: vpextrb $8, %xmm0, 4(%rdi)
+; AVX2-NEXT: vpextrb $4, %xmm0, 4(%rdi)
; AVX2-NEXT: testb $32, %al
; AVX2-NEXT: je .LBB2_12
; AVX2-NEXT: .LBB2_11: # %cond.store9
-; AVX2-NEXT: vpextrb $10, %xmm0, 5(%rdi)
+; AVX2-NEXT: vpextrb $5, %xmm0, 5(%rdi)
; AVX2-NEXT: testb $64, %al
; AVX2-NEXT: je .LBB2_14
; AVX2-NEXT: .LBB2_13: # %cond.store11
-; AVX2-NEXT: vpextrb $12, %xmm0, 6(%rdi)
+; AVX2-NEXT: vpextrb $6, %xmm0, 6(%rdi)
; AVX2-NEXT: testb $-128, %al
; AVX2-NEXT: je .LBB2_16
; AVX2-NEXT: .LBB2_15: # %cond.store13
-; AVX2-NEXT: vpextrb $14, %xmm0, 7(%rdi)
+; AVX2-NEXT: vpextrb $7, %xmm0, 7(%rdi)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k0
; AVX512F-NEXT: vpminsq {{.*}}(%rip){1to8}, %zmm0, %zmm0
; AVX512F-NEXT: vpmaxsq {{.*}}(%rip){1to8}, %zmm0, %zmm0
-; AVX512F-NEXT: vpmovqw %zmm0, %xmm0
+; AVX512F-NEXT: vpmovqb %zmm0, %xmm0
; AVX512F-NEXT: kmovw %k0, %eax
; AVX512F-NEXT: testb $1, %al
; AVX512F-NEXT: jne .LBB2_1
; AVX512F-NEXT: testb $2, %al
; AVX512F-NEXT: je .LBB2_4
; AVX512F-NEXT: .LBB2_3: # %cond.store1
-; AVX512F-NEXT: vpextrb $2, %xmm0, 1(%rdi)
+; AVX512F-NEXT: vpextrb $1, %xmm0, 1(%rdi)
; AVX512F-NEXT: testb $4, %al
; AVX512F-NEXT: je .LBB2_6
; AVX512F-NEXT: .LBB2_5: # %cond.store3
-; AVX512F-NEXT: vpextrb $4, %xmm0, 2(%rdi)
+; AVX512F-NEXT: vpextrb $2, %xmm0, 2(%rdi)
; AVX512F-NEXT: testb $8, %al
; AVX512F-NEXT: je .LBB2_8
; AVX512F-NEXT: .LBB2_7: # %cond.store5
-; AVX512F-NEXT: vpextrb $6, %xmm0, 3(%rdi)
+; AVX512F-NEXT: vpextrb $3, %xmm0, 3(%rdi)
; AVX512F-NEXT: testb $16, %al
; AVX512F-NEXT: je .LBB2_10
; AVX512F-NEXT: .LBB2_9: # %cond.store7
-; AVX512F-NEXT: vpextrb $8, %xmm0, 4(%rdi)
+; AVX512F-NEXT: vpextrb $4, %xmm0, 4(%rdi)
; AVX512F-NEXT: testb $32, %al
; AVX512F-NEXT: je .LBB2_12
; AVX512F-NEXT: .LBB2_11: # %cond.store9
-; AVX512F-NEXT: vpextrb $10, %xmm0, 5(%rdi)
+; AVX512F-NEXT: vpextrb $5, %xmm0, 5(%rdi)
; AVX512F-NEXT: testb $64, %al
; AVX512F-NEXT: je .LBB2_14
; AVX512F-NEXT: .LBB2_13: # %cond.store11
-; AVX512F-NEXT: vpextrb $12, %xmm0, 6(%rdi)
+; AVX512F-NEXT: vpextrb $6, %xmm0, 6(%rdi)
; AVX512F-NEXT: testb $-128, %al
; AVX512F-NEXT: je .LBB2_16
; AVX512F-NEXT: .LBB2_15: # %cond.store13
-; AVX512F-NEXT: vpextrb $14, %xmm0, 7(%rdi)
+; AVX512F-NEXT: vpextrb $7, %xmm0, 7(%rdi)
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; SSE2-NEXT: pxor %xmm9, %xmm9
; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [32767,32767]
; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [2147483648,2147483648]
-; SSE2-NEXT: movdqa %xmm0, %xmm5
+; SSE2-NEXT: movdqa %xmm1, %xmm5
; SSE2-NEXT: pxor %xmm4, %xmm5
; SSE2-NEXT: movdqa {{.*#+}} xmm10 = [2147516415,2147516415]
; SSE2-NEXT: movdqa %xmm10, %xmm7
; SSE2-NEXT: pand %xmm3, %xmm6
; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm7[1,1,3,3]
; SSE2-NEXT: por %xmm6, %xmm5
-; SSE2-NEXT: pand %xmm5, %xmm0
+; SSE2-NEXT: pand %xmm5, %xmm1
; SSE2-NEXT: pandn %xmm8, %xmm5
-; SSE2-NEXT: por %xmm0, %xmm5
-; SSE2-NEXT: movdqa %xmm1, %xmm0
-; SSE2-NEXT: pxor %xmm4, %xmm0
+; SSE2-NEXT: por %xmm1, %xmm5
+; SSE2-NEXT: movdqa %xmm0, %xmm1
+; SSE2-NEXT: pxor %xmm4, %xmm1
; SSE2-NEXT: movdqa %xmm10, %xmm3
-; SSE2-NEXT: pcmpgtd %xmm0, %xmm3
+; SSE2-NEXT: pcmpgtd %xmm1, %xmm3
; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm3[0,0,2,2]
-; SSE2-NEXT: pcmpeqd %xmm10, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE2-NEXT: pand %xmm6, %xmm0
+; SSE2-NEXT: pcmpeqd %xmm10, %xmm1
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; SSE2-NEXT: pand %xmm6, %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
-; SSE2-NEXT: por %xmm0, %xmm3
-; SSE2-NEXT: pand %xmm3, %xmm1
-; SSE2-NEXT: pandn %xmm8, %xmm3
; SSE2-NEXT: por %xmm1, %xmm3
+; SSE2-NEXT: pand %xmm3, %xmm0
+; SSE2-NEXT: pandn %xmm8, %xmm3
+; SSE2-NEXT: por %xmm0, %xmm3
; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [18446744073709518848,18446744073709518848]
-; SSE2-NEXT: movdqa %xmm3, %xmm0
-; SSE2-NEXT: pxor %xmm4, %xmm0
+; SSE2-NEXT: movdqa %xmm3, %xmm1
+; SSE2-NEXT: pxor %xmm4, %xmm1
; SSE2-NEXT: movdqa {{.*#+}} xmm6 = [18446744071562035200,18446744071562035200]
-; SSE2-NEXT: movdqa %xmm0, %xmm7
+; SSE2-NEXT: movdqa %xmm1, %xmm7
; SSE2-NEXT: pcmpgtd %xmm6, %xmm7
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm7[0,0,2,2]
-; SSE2-NEXT: pcmpeqd %xmm6, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE2-NEXT: pand %xmm1, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm7[1,1,3,3]
-; SSE2-NEXT: por %xmm0, %xmm1
-; SSE2-NEXT: pand %xmm1, %xmm3
-; SSE2-NEXT: pandn %xmm8, %xmm1
-; SSE2-NEXT: por %xmm3, %xmm1
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm7[0,0,2,2]
+; SSE2-NEXT: pcmpeqd %xmm6, %xmm1
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; SSE2-NEXT: pand %xmm0, %xmm1
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm7[1,1,3,3]
+; SSE2-NEXT: por %xmm1, %xmm0
+; SSE2-NEXT: pand %xmm0, %xmm3
+; SSE2-NEXT: pandn %xmm8, %xmm0
+; SSE2-NEXT: por %xmm3, %xmm0
; SSE2-NEXT: pxor %xmm5, %xmm4
-; SSE2-NEXT: movdqa %xmm4, %xmm0
-; SSE2-NEXT: pcmpgtd %xmm6, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[0,0,2,2]
+; SSE2-NEXT: movdqa %xmm4, %xmm1
+; SSE2-NEXT: pcmpgtd %xmm6, %xmm1
+; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[0,0,2,2]
; SSE2-NEXT: pcmpeqd %xmm6, %xmm4
; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
; SSE2-NEXT: pand %xmm3, %xmm4
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE2-NEXT: por %xmm4, %xmm0
-; SSE2-NEXT: pand %xmm0, %xmm5
-; SSE2-NEXT: pandn %xmm8, %xmm0
-; SSE2-NEXT: por %xmm5, %xmm0
-; SSE2-NEXT: packssdw %xmm1, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; SSE2-NEXT: por %xmm4, %xmm1
+; SSE2-NEXT: pand %xmm1, %xmm5
+; SSE2-NEXT: pandn %xmm8, %xmm1
+; SSE2-NEXT: por %xmm5, %xmm1
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
+; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE2-NEXT: pcmpeqd %xmm2, %xmm9
; SSE2-NEXT: movmskps %xmm9, %eax
; SSE2-NEXT: xorl $15, %eax
; SSE2-NEXT: testb $2, %al
; SSE2-NEXT: je .LBB4_4
; SSE2-NEXT: .LBB4_3: # %cond.store1
-; SSE2-NEXT: pextrw $2, %xmm0, %ecx
+; SSE2-NEXT: pextrw $1, %xmm0, %ecx
; SSE2-NEXT: movw %cx, 2(%rdi)
; SSE2-NEXT: testb $4, %al
; SSE2-NEXT: je .LBB4_6
; SSE2-NEXT: .LBB4_5: # %cond.store3
-; SSE2-NEXT: pextrw $4, %xmm0, %ecx
+; SSE2-NEXT: pextrw $2, %xmm0, %ecx
; SSE2-NEXT: movw %cx, 4(%rdi)
; SSE2-NEXT: testb $8, %al
; SSE2-NEXT: je .LBB4_8
; SSE2-NEXT: .LBB4_7: # %cond.store5
-; SSE2-NEXT: pextrw $6, %xmm0, %eax
+; SSE2-NEXT: pextrw $3, %xmm0, %eax
; SSE2-NEXT: movw %ax, 6(%rdi)
; SSE2-NEXT: retq
;
; SSE4-NEXT: pxor %xmm4, %xmm4
; SSE4-NEXT: movdqa {{.*#+}} xmm5 = [32767,32767]
; SSE4-NEXT: movdqa %xmm5, %xmm0
-; SSE4-NEXT: pcmpgtq %xmm3, %xmm0
+; SSE4-NEXT: pcmpgtq %xmm1, %xmm0
; SSE4-NEXT: movdqa %xmm5, %xmm6
-; SSE4-NEXT: blendvpd %xmm0, %xmm3, %xmm6
+; SSE4-NEXT: blendvpd %xmm0, %xmm1, %xmm6
; SSE4-NEXT: movdqa %xmm5, %xmm0
-; SSE4-NEXT: pcmpgtq %xmm1, %xmm0
-; SSE4-NEXT: blendvpd %xmm0, %xmm1, %xmm5
+; SSE4-NEXT: pcmpgtq %xmm3, %xmm0
+; SSE4-NEXT: blendvpd %xmm0, %xmm3, %xmm5
; SSE4-NEXT: movdqa {{.*#+}} xmm1 = [18446744073709518848,18446744073709518848]
; SSE4-NEXT: movapd %xmm5, %xmm0
; SSE4-NEXT: pcmpgtq %xmm1, %xmm0
; SSE4-NEXT: movapd %xmm6, %xmm0
; SSE4-NEXT: pcmpgtq %xmm1, %xmm0
; SSE4-NEXT: blendvpd %xmm0, %xmm6, %xmm1
-; SSE4-NEXT: packssdw %xmm3, %xmm1
+; SSE4-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3]
+; SSE4-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[0,2,2,3,4,5,6,7]
+; SSE4-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3]
+; SSE4-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
+; SSE4-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE4-NEXT: pcmpeqd %xmm2, %xmm4
; SSE4-NEXT: movmskps %xmm4, %eax
; SSE4-NEXT: xorl $15, %eax
; SSE4-NEXT: .LBB4_8: # %else6
; SSE4-NEXT: retq
; SSE4-NEXT: .LBB4_1: # %cond.store
-; SSE4-NEXT: pextrw $0, %xmm1, (%rdi)
+; SSE4-NEXT: pextrw $0, %xmm0, (%rdi)
; SSE4-NEXT: testb $2, %al
; SSE4-NEXT: je .LBB4_4
; SSE4-NEXT: .LBB4_3: # %cond.store1
-; SSE4-NEXT: pextrw $2, %xmm1, 2(%rdi)
+; SSE4-NEXT: pextrw $1, %xmm0, 2(%rdi)
; SSE4-NEXT: testb $4, %al
; SSE4-NEXT: je .LBB4_6
; SSE4-NEXT: .LBB4_5: # %cond.store3
-; SSE4-NEXT: pextrw $4, %xmm1, 4(%rdi)
+; SSE4-NEXT: pextrw $2, %xmm0, 4(%rdi)
; SSE4-NEXT: testb $8, %al
; SSE4-NEXT: je .LBB4_8
; SSE4-NEXT: .LBB4_7: # %cond.store5
-; SSE4-NEXT: pextrw $6, %xmm1, 6(%rdi)
+; SSE4-NEXT: pextrw $3, %xmm0, 6(%rdi)
; SSE4-NEXT: retq
;
; AVX1-LABEL: truncstore_v4i64_v4i16:
; AVX1-NEXT: vblendvpd %xmm5, %xmm3, %xmm4, %xmm3
; AVX1-NEXT: vpcmpgtq %xmm6, %xmm3, %xmm4
; AVX1-NEXT: vblendvpd %xmm4, %xmm3, %xmm6, %xmm3
+; AVX1-NEXT: vpermilps {{.*#+}} xmm3 = xmm3[0,2,2,3]
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,2,2,3,4,5,6,7]
; AVX1-NEXT: vblendvpd %xmm7, %xmm0, %xmm6, %xmm0
-; AVX1-NEXT: vpackssdw %xmm3, %xmm0, %xmm0
+; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
+; AVX1-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
; AVX1-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vmovmskps %xmm1, %eax
; AVX1-NEXT: xorl $15, %eax
; AVX1-NEXT: testb $2, %al
; AVX1-NEXT: je .LBB4_4
; AVX1-NEXT: .LBB4_3: # %cond.store1
-; AVX1-NEXT: vpextrw $2, %xmm0, 2(%rdi)
+; AVX1-NEXT: vpextrw $1, %xmm0, 2(%rdi)
; AVX1-NEXT: testb $4, %al
; AVX1-NEXT: je .LBB4_6
; AVX1-NEXT: .LBB4_5: # %cond.store3
-; AVX1-NEXT: vpextrw $4, %xmm0, 4(%rdi)
+; AVX1-NEXT: vpextrw $2, %xmm0, 4(%rdi)
; AVX1-NEXT: testb $8, %al
; AVX1-NEXT: je .LBB4_8
; AVX1-NEXT: .LBB4_7: # %cond.store5
-; AVX1-NEXT: vpextrw $6, %xmm0, 6(%rdi)
+; AVX1-NEXT: vpextrw $3, %xmm0, 6(%rdi)
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-NEXT: vpcmpgtq %ymm3, %ymm0, %ymm4
; AVX2-NEXT: vblendvpd %ymm4, %ymm0, %ymm3, %ymm0
; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm3
-; AVX2-NEXT: vpackssdw %xmm3, %xmm0, %xmm0
+; AVX2-NEXT: vpermilps {{.*#+}} xmm3 = xmm3[0,2,2,3]
+; AVX2-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,2,2,3,4,5,6,7]
+; AVX2-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; AVX2-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
+; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
; AVX2-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm1
; AVX2-NEXT: vmovmskps %xmm1, %eax
; AVX2-NEXT: xorl $15, %eax
; AVX2-NEXT: testb $2, %al
; AVX2-NEXT: je .LBB4_4
; AVX2-NEXT: .LBB4_3: # %cond.store1
-; AVX2-NEXT: vpextrw $2, %xmm0, 2(%rdi)
+; AVX2-NEXT: vpextrw $1, %xmm0, 2(%rdi)
; AVX2-NEXT: testb $4, %al
; AVX2-NEXT: je .LBB4_6
; AVX2-NEXT: .LBB4_5: # %cond.store3
-; AVX2-NEXT: vpextrw $4, %xmm0, 4(%rdi)
+; AVX2-NEXT: vpextrw $2, %xmm0, 4(%rdi)
; AVX2-NEXT: testb $8, %al
; AVX2-NEXT: je .LBB4_8
; AVX2-NEXT: .LBB4_7: # %cond.store5
-; AVX2-NEXT: vpextrw $6, %xmm0, 6(%rdi)
+; AVX2-NEXT: vpextrw $3, %xmm0, 6(%rdi)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512F-NEXT: vpminsq %zmm1, %zmm0, %zmm0
; AVX512F-NEXT: vpbroadcastq {{.*#+}} ymm1 = [18446744073709518848,18446744073709518848,18446744073709518848,18446744073709518848]
; AVX512F-NEXT: vpmaxsq %zmm1, %zmm0, %zmm0
-; AVX512F-NEXT: vpmovqd %zmm0, %ymm0
+; AVX512F-NEXT: vpmovqw %zmm0, %xmm0
; AVX512F-NEXT: kmovw %k0, %eax
; AVX512F-NEXT: testb $1, %al
; AVX512F-NEXT: jne .LBB4_1
; AVX512F-NEXT: testb $2, %al
; AVX512F-NEXT: je .LBB4_4
; AVX512F-NEXT: .LBB4_3: # %cond.store1
-; AVX512F-NEXT: vpextrw $2, %xmm0, 2(%rdi)
+; AVX512F-NEXT: vpextrw $1, %xmm0, 2(%rdi)
; AVX512F-NEXT: testb $4, %al
; AVX512F-NEXT: je .LBB4_6
; AVX512F-NEXT: .LBB4_5: # %cond.store3
-; AVX512F-NEXT: vpextrw $4, %xmm0, 4(%rdi)
+; AVX512F-NEXT: vpextrw $2, %xmm0, 4(%rdi)
; AVX512F-NEXT: testb $8, %al
; AVX512F-NEXT: je .LBB4_8
; AVX512F-NEXT: .LBB4_7: # %cond.store5
-; AVX512F-NEXT: vpextrw $6, %xmm0, 6(%rdi)
+; AVX512F-NEXT: vpextrw $3, %xmm0, 6(%rdi)
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512BW-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512BW-NEXT: vptestmd %zmm1, %zmm1, %k0
+; AVX512BW-NEXT: kshiftld $28, %k0, %k0
+; AVX512BW-NEXT: kshiftrd $28, %k0, %k1
; AVX512BW-NEXT: vpbroadcastq {{.*#+}} ymm1 = [32767,32767,32767,32767]
; AVX512BW-NEXT: vpminsq %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: vpbroadcastq {{.*#+}} ymm1 = [18446744073709518848,18446744073709518848,18446744073709518848,18446744073709518848]
; AVX512BW-NEXT: vpmaxsq %zmm1, %zmm0, %zmm0
-; AVX512BW-NEXT: vpmovqd %zmm0, %ymm0
-; AVX512BW-NEXT: vpackssdw %xmm0, %xmm0, %xmm0
-; AVX512BW-NEXT: kshiftld $28, %k0, %k0
-; AVX512BW-NEXT: kshiftrd $28, %k0, %k1
+; AVX512BW-NEXT: vpmovqw %zmm0, %xmm0
; AVX512BW-NEXT: vmovdqu16 %zmm0, (%rdi) {%k1}
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
; SSE2-NEXT: pxor %xmm9, %xmm9
; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [127,127]
; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [2147483648,2147483648]
-; SSE2-NEXT: movdqa %xmm0, %xmm5
+; SSE2-NEXT: movdqa %xmm1, %xmm5
; SSE2-NEXT: pxor %xmm4, %xmm5
; SSE2-NEXT: movdqa {{.*#+}} xmm10 = [2147483775,2147483775]
; SSE2-NEXT: movdqa %xmm10, %xmm7
; SSE2-NEXT: pand %xmm3, %xmm6
; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm7[1,1,3,3]
; SSE2-NEXT: por %xmm6, %xmm5
-; SSE2-NEXT: pand %xmm5, %xmm0
+; SSE2-NEXT: pand %xmm5, %xmm1
; SSE2-NEXT: pandn %xmm8, %xmm5
-; SSE2-NEXT: por %xmm0, %xmm5
-; SSE2-NEXT: movdqa %xmm1, %xmm0
-; SSE2-NEXT: pxor %xmm4, %xmm0
+; SSE2-NEXT: por %xmm1, %xmm5
+; SSE2-NEXT: movdqa %xmm0, %xmm1
+; SSE2-NEXT: pxor %xmm4, %xmm1
; SSE2-NEXT: movdqa %xmm10, %xmm3
-; SSE2-NEXT: pcmpgtd %xmm0, %xmm3
+; SSE2-NEXT: pcmpgtd %xmm1, %xmm3
; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm3[0,0,2,2]
-; SSE2-NEXT: pcmpeqd %xmm10, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE2-NEXT: pand %xmm6, %xmm0
+; SSE2-NEXT: pcmpeqd %xmm10, %xmm1
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; SSE2-NEXT: pand %xmm6, %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
-; SSE2-NEXT: por %xmm0, %xmm3
-; SSE2-NEXT: pand %xmm3, %xmm1
-; SSE2-NEXT: pandn %xmm8, %xmm3
; SSE2-NEXT: por %xmm1, %xmm3
+; SSE2-NEXT: pand %xmm3, %xmm0
+; SSE2-NEXT: pandn %xmm8, %xmm3
+; SSE2-NEXT: por %xmm0, %xmm3
; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [18446744073709551488,18446744073709551488]
; SSE2-NEXT: movdqa %xmm3, %xmm0
; SSE2-NEXT: pxor %xmm4, %xmm0
-; SSE2-NEXT: movdqa {{.*#+}} xmm6 = [18446744071562067840,18446744071562067840]
+; SSE2-NEXT: movdqa {{.*#+}} xmm10 = [18446744071562067840,18446744071562067840]
; SSE2-NEXT: movdqa %xmm0, %xmm7
-; SSE2-NEXT: pcmpgtd %xmm6, %xmm7
+; SSE2-NEXT: pcmpgtd %xmm10, %xmm7
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm7[0,0,2,2]
-; SSE2-NEXT: pcmpeqd %xmm6, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE2-NEXT: pand %xmm1, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm7[1,1,3,3]
-; SSE2-NEXT: por %xmm0, %xmm1
-; SSE2-NEXT: pand %xmm1, %xmm3
-; SSE2-NEXT: pandn %xmm8, %xmm1
-; SSE2-NEXT: por %xmm3, %xmm1
+; SSE2-NEXT: pcmpeqd %xmm10, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm0[1,1,3,3]
+; SSE2-NEXT: pand %xmm1, %xmm6
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm7[1,1,3,3]
+; SSE2-NEXT: por %xmm6, %xmm0
+; SSE2-NEXT: pand %xmm0, %xmm3
+; SSE2-NEXT: pandn %xmm8, %xmm0
+; SSE2-NEXT: por %xmm3, %xmm0
; SSE2-NEXT: pxor %xmm5, %xmm4
-; SSE2-NEXT: movdqa %xmm4, %xmm0
-; SSE2-NEXT: pcmpgtd %xmm6, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[0,0,2,2]
-; SSE2-NEXT: pcmpeqd %xmm6, %xmm4
+; SSE2-NEXT: movdqa %xmm4, %xmm1
+; SSE2-NEXT: pcmpgtd %xmm10, %xmm1
+; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[0,0,2,2]
+; SSE2-NEXT: pcmpeqd %xmm10, %xmm4
; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
; SSE2-NEXT: pand %xmm3, %xmm4
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE2-NEXT: por %xmm4, %xmm0
-; SSE2-NEXT: pand %xmm0, %xmm5
-; SSE2-NEXT: pandn %xmm8, %xmm0
-; SSE2-NEXT: por %xmm5, %xmm0
-; SSE2-NEXT: packssdw %xmm1, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; SSE2-NEXT: por %xmm4, %xmm1
+; SSE2-NEXT: pand %xmm1, %xmm5
+; SSE2-NEXT: pandn %xmm8, %xmm1
+; SSE2-NEXT: por %xmm5, %xmm1
+; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
+; SSE2-NEXT: pand %xmm3, %xmm1
+; SSE2-NEXT: pand %xmm3, %xmm0
+; SSE2-NEXT: packuswb %xmm1, %xmm0
+; SSE2-NEXT: packuswb %xmm0, %xmm0
+; SSE2-NEXT: packuswb %xmm0, %xmm0
; SSE2-NEXT: pcmpeqd %xmm2, %xmm9
-; SSE2-NEXT: movmskps %xmm9, %eax
-; SSE2-NEXT: xorl $15, %eax
-; SSE2-NEXT: testb $1, %al
+; SSE2-NEXT: movmskps %xmm9, %ecx
+; SSE2-NEXT: xorl $15, %ecx
+; SSE2-NEXT: testb $1, %cl
+; SSE2-NEXT: movd %xmm0, %eax
; SSE2-NEXT: jne .LBB5_1
; SSE2-NEXT: # %bb.2: # %else
-; SSE2-NEXT: testb $2, %al
+; SSE2-NEXT: testb $2, %cl
; SSE2-NEXT: jne .LBB5_3
; SSE2-NEXT: .LBB5_4: # %else2
-; SSE2-NEXT: testb $4, %al
+; SSE2-NEXT: testb $4, %cl
; SSE2-NEXT: jne .LBB5_5
; SSE2-NEXT: .LBB5_6: # %else4
-; SSE2-NEXT: testb $8, %al
+; SSE2-NEXT: testb $8, %cl
; SSE2-NEXT: jne .LBB5_7
; SSE2-NEXT: .LBB5_8: # %else6
; SSE2-NEXT: retq
; SSE2-NEXT: .LBB5_1: # %cond.store
-; SSE2-NEXT: movd %xmm0, %ecx
-; SSE2-NEXT: movb %cl, (%rdi)
-; SSE2-NEXT: testb $2, %al
+; SSE2-NEXT: movb %al, (%rdi)
+; SSE2-NEXT: testb $2, %cl
; SSE2-NEXT: je .LBB5_4
; SSE2-NEXT: .LBB5_3: # %cond.store1
-; SSE2-NEXT: pextrw $2, %xmm0, %ecx
-; SSE2-NEXT: movb %cl, 1(%rdi)
-; SSE2-NEXT: testb $4, %al
+; SSE2-NEXT: movb %ah, 1(%rdi)
+; SSE2-NEXT: testb $4, %cl
; SSE2-NEXT: je .LBB5_6
; SSE2-NEXT: .LBB5_5: # %cond.store3
-; SSE2-NEXT: pextrw $4, %xmm0, %ecx
-; SSE2-NEXT: movb %cl, 2(%rdi)
-; SSE2-NEXT: testb $8, %al
+; SSE2-NEXT: movl %eax, %edx
+; SSE2-NEXT: shrl $16, %edx
+; SSE2-NEXT: movb %dl, 2(%rdi)
+; SSE2-NEXT: testb $8, %cl
; SSE2-NEXT: je .LBB5_8
; SSE2-NEXT: .LBB5_7: # %cond.store5
-; SSE2-NEXT: pextrw $6, %xmm0, %eax
+; SSE2-NEXT: shrl $24, %eax
; SSE2-NEXT: movb %al, 3(%rdi)
; SSE2-NEXT: retq
;
; SSE4-NEXT: pxor %xmm4, %xmm4
; SSE4-NEXT: movdqa {{.*#+}} xmm5 = [127,127]
; SSE4-NEXT: movdqa %xmm5, %xmm0
-; SSE4-NEXT: pcmpgtq %xmm3, %xmm0
+; SSE4-NEXT: pcmpgtq %xmm1, %xmm0
; SSE4-NEXT: movdqa %xmm5, %xmm6
-; SSE4-NEXT: blendvpd %xmm0, %xmm3, %xmm6
+; SSE4-NEXT: blendvpd %xmm0, %xmm1, %xmm6
; SSE4-NEXT: movdqa %xmm5, %xmm0
-; SSE4-NEXT: pcmpgtq %xmm1, %xmm0
-; SSE4-NEXT: blendvpd %xmm0, %xmm1, %xmm5
-; SSE4-NEXT: movdqa {{.*#+}} xmm1 = [18446744073709551488,18446744073709551488]
+; SSE4-NEXT: pcmpgtq %xmm3, %xmm0
+; SSE4-NEXT: blendvpd %xmm0, %xmm3, %xmm5
+; SSE4-NEXT: movdqa {{.*#+}} xmm3 = [18446744073709551488,18446744073709551488]
; SSE4-NEXT: movapd %xmm5, %xmm0
-; SSE4-NEXT: pcmpgtq %xmm1, %xmm0
-; SSE4-NEXT: movdqa %xmm1, %xmm3
-; SSE4-NEXT: blendvpd %xmm0, %xmm5, %xmm3
+; SSE4-NEXT: pcmpgtq %xmm3, %xmm0
+; SSE4-NEXT: movdqa %xmm3, %xmm1
+; SSE4-NEXT: blendvpd %xmm0, %xmm5, %xmm1
; SSE4-NEXT: movapd %xmm6, %xmm0
-; SSE4-NEXT: pcmpgtq %xmm1, %xmm0
-; SSE4-NEXT: blendvpd %xmm0, %xmm6, %xmm1
-; SSE4-NEXT: packssdw %xmm3, %xmm1
+; SSE4-NEXT: pcmpgtq %xmm3, %xmm0
+; SSE4-NEXT: blendvpd %xmm0, %xmm6, %xmm3
+; SSE4-NEXT: movdqa {{.*#+}} xmm0 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; SSE4-NEXT: pshufb %xmm0, %xmm3
+; SSE4-NEXT: pshufb %xmm0, %xmm1
+; SSE4-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3]
; SSE4-NEXT: pcmpeqd %xmm2, %xmm4
; SSE4-NEXT: movmskps %xmm4, %eax
; SSE4-NEXT: xorl $15, %eax
; SSE4-NEXT: testb $2, %al
; SSE4-NEXT: je .LBB5_4
; SSE4-NEXT: .LBB5_3: # %cond.store1
-; SSE4-NEXT: pextrb $4, %xmm1, 1(%rdi)
+; SSE4-NEXT: pextrb $1, %xmm1, 1(%rdi)
; SSE4-NEXT: testb $4, %al
; SSE4-NEXT: je .LBB5_6
; SSE4-NEXT: .LBB5_5: # %cond.store3
-; SSE4-NEXT: pextrb $8, %xmm1, 2(%rdi)
+; SSE4-NEXT: pextrb $2, %xmm1, 2(%rdi)
; SSE4-NEXT: testb $8, %al
; SSE4-NEXT: je .LBB5_8
; SSE4-NEXT: .LBB5_7: # %cond.store5
-; SSE4-NEXT: pextrb $12, %xmm1, 3(%rdi)
+; SSE4-NEXT: pextrb $3, %xmm1, 3(%rdi)
; SSE4-NEXT: retq
;
; AVX1-LABEL: truncstore_v4i64_v4i8:
; AVX1-NEXT: vblendvpd %xmm5, %xmm3, %xmm4, %xmm3
; AVX1-NEXT: vpcmpgtq %xmm6, %xmm3, %xmm4
; AVX1-NEXT: vblendvpd %xmm4, %xmm3, %xmm6, %xmm3
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX1-NEXT: vpshufb %xmm4, %xmm3, %xmm3
; AVX1-NEXT: vblendvpd %xmm7, %xmm0, %xmm6, %xmm0
-; AVX1-NEXT: vpackssdw %xmm3, %xmm0, %xmm0
+; AVX1-NEXT: vpshufb %xmm4, %xmm0, %xmm0
+; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3]
; AVX1-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vmovmskps %xmm1, %eax
; AVX1-NEXT: xorl $15, %eax
; AVX1-NEXT: testb $2, %al
; AVX1-NEXT: je .LBB5_4
; AVX1-NEXT: .LBB5_3: # %cond.store1
-; AVX1-NEXT: vpextrb $4, %xmm0, 1(%rdi)
+; AVX1-NEXT: vpextrb $1, %xmm0, 1(%rdi)
; AVX1-NEXT: testb $4, %al
; AVX1-NEXT: je .LBB5_6
; AVX1-NEXT: .LBB5_5: # %cond.store3
-; AVX1-NEXT: vpextrb $8, %xmm0, 2(%rdi)
+; AVX1-NEXT: vpextrb $2, %xmm0, 2(%rdi)
; AVX1-NEXT: testb $8, %al
; AVX1-NEXT: je .LBB5_8
; AVX1-NEXT: .LBB5_7: # %cond.store5
-; AVX1-NEXT: vpextrb $12, %xmm0, 3(%rdi)
+; AVX1-NEXT: vpextrb $3, %xmm0, 3(%rdi)
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-NEXT: vpcmpgtq %ymm3, %ymm0, %ymm4
; AVX2-NEXT: vblendvpd %ymm4, %ymm0, %ymm3, %ymm0
; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm3
-; AVX2-NEXT: vpackssdw %xmm3, %xmm0, %xmm0
+; AVX2-NEXT: vmovdqa {{.*#+}} xmm4 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-NEXT: vpshufb %xmm4, %xmm3, %xmm3
+; AVX2-NEXT: vpshufb %xmm4, %xmm0, %xmm0
+; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3]
; AVX2-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm1
; AVX2-NEXT: vmovmskps %xmm1, %eax
; AVX2-NEXT: xorl $15, %eax
; AVX2-NEXT: testb $2, %al
; AVX2-NEXT: je .LBB5_4
; AVX2-NEXT: .LBB5_3: # %cond.store1
-; AVX2-NEXT: vpextrb $4, %xmm0, 1(%rdi)
+; AVX2-NEXT: vpextrb $1, %xmm0, 1(%rdi)
; AVX2-NEXT: testb $4, %al
; AVX2-NEXT: je .LBB5_6
; AVX2-NEXT: .LBB5_5: # %cond.store3
-; AVX2-NEXT: vpextrb $8, %xmm0, 2(%rdi)
+; AVX2-NEXT: vpextrb $2, %xmm0, 2(%rdi)
; AVX2-NEXT: testb $8, %al
; AVX2-NEXT: je .LBB5_8
; AVX2-NEXT: .LBB5_7: # %cond.store5
-; AVX2-NEXT: vpextrb $12, %xmm0, 3(%rdi)
+; AVX2-NEXT: vpextrb $3, %xmm0, 3(%rdi)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512F-NEXT: vpminsq %zmm1, %zmm0, %zmm0
; AVX512F-NEXT: vpbroadcastq {{.*#+}} ymm1 = [18446744073709551488,18446744073709551488,18446744073709551488,18446744073709551488]
; AVX512F-NEXT: vpmaxsq %zmm1, %zmm0, %zmm0
-; AVX512F-NEXT: vpmovqd %zmm0, %ymm0
+; AVX512F-NEXT: vpmovqb %zmm0, %xmm0
; AVX512F-NEXT: kmovw %k0, %eax
; AVX512F-NEXT: testb $1, %al
; AVX512F-NEXT: jne .LBB5_1
; AVX512F-NEXT: testb $2, %al
; AVX512F-NEXT: je .LBB5_4
; AVX512F-NEXT: .LBB5_3: # %cond.store1
-; AVX512F-NEXT: vpextrb $4, %xmm0, 1(%rdi)
+; AVX512F-NEXT: vpextrb $1, %xmm0, 1(%rdi)
; AVX512F-NEXT: testb $4, %al
; AVX512F-NEXT: je .LBB5_6
; AVX512F-NEXT: .LBB5_5: # %cond.store3
-; AVX512F-NEXT: vpextrb $8, %xmm0, 2(%rdi)
+; AVX512F-NEXT: vpextrb $2, %xmm0, 2(%rdi)
; AVX512F-NEXT: testb $8, %al
; AVX512F-NEXT: je .LBB5_8
; AVX512F-NEXT: .LBB5_7: # %cond.store5
-; AVX512F-NEXT: vpextrb $12, %xmm0, 3(%rdi)
+; AVX512F-NEXT: vpextrb $3, %xmm0, 3(%rdi)
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512BW-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512BW-NEXT: vptestmd %zmm1, %zmm1, %k0
+; AVX512BW-NEXT: kshiftlq $60, %k0, %k0
+; AVX512BW-NEXT: kshiftrq $60, %k0, %k1
; AVX512BW-NEXT: vpbroadcastq {{.*#+}} ymm1 = [127,127,127,127]
; AVX512BW-NEXT: vpminsq %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: vpbroadcastq {{.*#+}} ymm1 = [18446744073709551488,18446744073709551488,18446744073709551488,18446744073709551488]
; AVX512BW-NEXT: vpmaxsq %zmm1, %zmm0, %zmm0
-; AVX512BW-NEXT: vpmovqd %zmm0, %ymm0
-; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512BW-NEXT: kshiftlq $60, %k0, %k0
-; AVX512BW-NEXT: kshiftrq $60, %k0, %k1
+; AVX512BW-NEXT: vpmovqb %zmm0, %xmm0
; AVX512BW-NEXT: vmovdqu8 %zmm0, (%rdi) {%k1}
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
; SSE2-NEXT: pcmpgtd %xmm0, %xmm4
; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2]
; SSE2-NEXT: pcmpeqd %xmm0, %xmm3
-; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
-; SSE2-NEXT: pand %xmm6, %xmm3
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3]
-; SSE2-NEXT: por %xmm3, %xmm0
-; SSE2-NEXT: pand %xmm0, %xmm5
-; SSE2-NEXT: pandn {{.*}}(%rip), %xmm0
-; SSE2-NEXT: por %xmm5, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3]
+; SSE2-NEXT: pand %xmm6, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3]
+; SSE2-NEXT: por %xmm0, %xmm3
+; SSE2-NEXT: pand %xmm3, %xmm5
+; SSE2-NEXT: pandn {{.*}}(%rip), %xmm3
+; SSE2-NEXT: por %xmm5, %xmm3
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3]
; SSE2-NEXT: pcmpeqd %xmm1, %xmm2
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,0,3,2]
; SSE2-NEXT: pand %xmm2, %xmm1
; SSE2-NEXT: testb $2, %al
; SSE2-NEXT: je .LBB6_4
; SSE2-NEXT: .LBB6_3: # %cond.store1
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
; SSE2-NEXT: movd %xmm0, 4(%rdi)
; SSE2-NEXT: retq
;
; SSE4-NEXT: movapd %xmm4, %xmm0
; SSE4-NEXT: pcmpgtq %xmm2, %xmm0
; SSE4-NEXT: blendvpd %xmm0, %xmm4, %xmm2
+; SSE4-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
; SSE4-NEXT: pcmpeqq %xmm1, %xmm3
; SSE4-NEXT: movmskpd %xmm3, %eax
; SSE4-NEXT: xorl $3, %eax
; SSE4-NEXT: .LBB6_4: # %else2
; SSE4-NEXT: retq
; SSE4-NEXT: .LBB6_1: # %cond.store
-; SSE4-NEXT: movss %xmm2, (%rdi)
+; SSE4-NEXT: movd %xmm0, (%rdi)
; SSE4-NEXT: testb $2, %al
; SSE4-NEXT: je .LBB6_4
; SSE4-NEXT: .LBB6_3: # %cond.store1
-; SSE4-NEXT: extractps $2, %xmm2, 4(%rdi)
+; SSE4-NEXT: pextrd $1, %xmm0, 4(%rdi)
; SSE4-NEXT: retq
;
; AVX1-LABEL: truncstore_v2i64_v2i32:
; AVX1-NEXT: vpcmpeqq %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
; AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,2],zero,zero
; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [2147483647,2147483647]
; AVX1-NEXT: vpcmpgtq %xmm0, %xmm2, %xmm3
; AVX1-NEXT: vblendvpd %xmm3, %xmm0, %xmm2, %xmm0
; AVX1-NEXT: vpcmpgtq %xmm2, %xmm0, %xmm3
; AVX1-NEXT: vblendvpd %xmm3, %xmm0, %xmm2, %xmm0
; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,2],zero,zero
; AVX1-NEXT: vmaskmovps %xmm0, %xmm1, (%rdi)
; AVX1-NEXT: retq
;
; AVX2-NEXT: vpcmpeqq %xmm2, %xmm1, %xmm1
; AVX2-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
; AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm1
+; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,2],zero,zero
; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [2147483647,2147483647]
; AVX2-NEXT: vpcmpgtq %xmm0, %xmm2, %xmm3
; AVX2-NEXT: vblendvpd %xmm3, %xmm0, %xmm2, %xmm0
; AVX2-NEXT: vpcmpgtq %xmm2, %xmm0, %xmm3
; AVX2-NEXT: vblendvpd %xmm3, %xmm0, %xmm2, %xmm0
; AVX2-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,2],zero,zero
; AVX2-NEXT: vpmaskmovd %xmm0, %xmm1, (%rdi)
; AVX2-NEXT: retq
;
; AVX512F-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512F-NEXT: vptestmq %zmm1, %zmm1, %k0
+; AVX512F-NEXT: kshiftlw $14, %k0, %k0
+; AVX512F-NEXT: kshiftrw $14, %k0, %k1
; AVX512F-NEXT: vmovdqa {{.*#+}} xmm1 = [2147483647,2147483647]
; AVX512F-NEXT: vpminsq %zmm1, %zmm0, %zmm0
; AVX512F-NEXT: vmovdqa {{.*#+}} xmm1 = [18446744071562067968,18446744071562067968]
; AVX512F-NEXT: vpmaxsq %zmm1, %zmm0, %zmm0
; AVX512F-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; AVX512F-NEXT: kshiftlw $14, %k0, %k0
-; AVX512F-NEXT: kshiftrw $14, %k0, %k1
; AVX512F-NEXT: vmovdqu32 %zmm0, (%rdi) {%k1}
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
; AVX512BW-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512BW-NEXT: vptestmq %zmm1, %zmm1, %k0
+; AVX512BW-NEXT: kshiftlw $14, %k0, %k0
+; AVX512BW-NEXT: kshiftrw $14, %k0, %k1
; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm1 = [2147483647,2147483647]
; AVX512BW-NEXT: vpminsq %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm1 = [18446744071562067968,18446744071562067968]
; AVX512BW-NEXT: vpmaxsq %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; AVX512BW-NEXT: kshiftlw $14, %k0, %k0
-; AVX512BW-NEXT: kshiftrw $14, %k0, %k1
; AVX512BW-NEXT: vmovdqu32 %zmm0, (%rdi) {%k1}
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
; SSE2-NEXT: pcmpgtd %xmm0, %xmm4
; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2]
; SSE2-NEXT: pcmpeqd %xmm0, %xmm3
-; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
-; SSE2-NEXT: pand %xmm6, %xmm3
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3]
-; SSE2-NEXT: por %xmm3, %xmm0
-; SSE2-NEXT: pand %xmm0, %xmm5
-; SSE2-NEXT: pandn {{.*}}(%rip), %xmm0
-; SSE2-NEXT: por %xmm5, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3]
+; SSE2-NEXT: pand %xmm6, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3]
+; SSE2-NEXT: por %xmm0, %xmm3
+; SSE2-NEXT: pand %xmm3, %xmm5
+; SSE2-NEXT: pandn {{.*}}(%rip), %xmm3
+; SSE2-NEXT: por %xmm5, %xmm3
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
; SSE2-NEXT: pcmpeqd %xmm1, %xmm2
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,0,3,2]
; SSE2-NEXT: pand %xmm2, %xmm1
; SSE2-NEXT: testb $2, %al
; SSE2-NEXT: je .LBB7_4
; SSE2-NEXT: .LBB7_3: # %cond.store1
-; SSE2-NEXT: pextrw $4, %xmm0, %eax
+; SSE2-NEXT: pextrw $1, %xmm0, %eax
; SSE2-NEXT: movw %ax, 2(%rdi)
; SSE2-NEXT: retq
;
; SSE4-NEXT: movapd %xmm4, %xmm0
; SSE4-NEXT: pcmpgtq %xmm2, %xmm0
; SSE4-NEXT: blendvpd %xmm0, %xmm4, %xmm2
+; SSE4-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
+; SSE4-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
; SSE4-NEXT: pcmpeqq %xmm1, %xmm3
; SSE4-NEXT: movmskpd %xmm3, %eax
; SSE4-NEXT: xorl $3, %eax
; SSE4-NEXT: .LBB7_4: # %else2
; SSE4-NEXT: retq
; SSE4-NEXT: .LBB7_1: # %cond.store
-; SSE4-NEXT: pextrw $0, %xmm2, (%rdi)
+; SSE4-NEXT: pextrw $0, %xmm0, (%rdi)
; SSE4-NEXT: testb $2, %al
; SSE4-NEXT: je .LBB7_4
; SSE4-NEXT: .LBB7_3: # %cond.store1
-; SSE4-NEXT: pextrw $4, %xmm2, 2(%rdi)
+; SSE4-NEXT: pextrw $1, %xmm0, 2(%rdi)
; SSE4-NEXT: retq
;
; AVX-LABEL: truncstore_v2i64_v2i16:
; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [18446744073709518848,18446744073709518848]
; AVX-NEXT: vpcmpgtq %xmm3, %xmm0, %xmm4
; AVX-NEXT: vblendvpd %xmm4, %xmm0, %xmm3, %xmm0
+; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
; AVX-NEXT: vpcmpeqq %xmm2, %xmm1, %xmm1
; AVX-NEXT: vmovmskpd %xmm1, %eax
; AVX-NEXT: xorl $3, %eax
; AVX-NEXT: testb $2, %al
; AVX-NEXT: je .LBB7_4
; AVX-NEXT: .LBB7_3: # %cond.store1
-; AVX-NEXT: vpextrw $4, %xmm0, 2(%rdi)
+; AVX-NEXT: vpextrw $1, %xmm0, 2(%rdi)
; AVX-NEXT: retq
;
; AVX512F-LABEL: truncstore_v2i64_v2i16:
; AVX512F-NEXT: vpminsq %zmm1, %zmm0, %zmm0
; AVX512F-NEXT: vmovdqa {{.*#+}} xmm1 = [18446744073709518848,18446744073709518848]
; AVX512F-NEXT: vpmaxsq %zmm1, %zmm0, %zmm0
+; AVX512F-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; AVX512F-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
; AVX512F-NEXT: kmovw %k0, %eax
; AVX512F-NEXT: testb $1, %al
; AVX512F-NEXT: jne .LBB7_1
; AVX512F-NEXT: testb $2, %al
; AVX512F-NEXT: je .LBB7_4
; AVX512F-NEXT: .LBB7_3: # %cond.store1
-; AVX512F-NEXT: vpextrw $4, %xmm0, 2(%rdi)
+; AVX512F-NEXT: vpextrw $1, %xmm0, 2(%rdi)
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512BW-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512BW-NEXT: vptestmq %zmm1, %zmm1, %k0
+; AVX512BW-NEXT: kshiftld $30, %k0, %k0
+; AVX512BW-NEXT: kshiftrd $30, %k0, %k1
; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm1 = [32767,32767]
; AVX512BW-NEXT: vpminsq %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm1 = [18446744073709518848,18446744073709518848]
; AVX512BW-NEXT: vpmaxsq %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; AVX512BW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
-; AVX512BW-NEXT: kshiftld $30, %k0, %k0
-; AVX512BW-NEXT: kshiftrd $30, %k0, %k1
; AVX512BW-NEXT: vmovdqu16 %zmm0, (%rdi) {%k1}
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
; SSE2-NEXT: pcmpgtd %xmm0, %xmm4
; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2]
; SSE2-NEXT: pcmpeqd %xmm0, %xmm3
-; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
-; SSE2-NEXT: pand %xmm6, %xmm3
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3]
-; SSE2-NEXT: por %xmm3, %xmm0
-; SSE2-NEXT: pand %xmm0, %xmm5
-; SSE2-NEXT: pandn {{.*}}(%rip), %xmm0
-; SSE2-NEXT: por %xmm5, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3]
+; SSE2-NEXT: pand %xmm6, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3]
+; SSE2-NEXT: por %xmm0, %xmm3
+; SSE2-NEXT: pand %xmm3, %xmm5
+; SSE2-NEXT: pandn {{.*}}(%rip), %xmm3
+; SSE2-NEXT: por %xmm5, %xmm3
+; SSE2-NEXT: pand {{.*}}(%rip), %xmm3
+; SSE2-NEXT: packuswb %xmm3, %xmm3
+; SSE2-NEXT: packuswb %xmm3, %xmm3
+; SSE2-NEXT: packuswb %xmm3, %xmm3
; SSE2-NEXT: pcmpeqd %xmm1, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,0,3,2]
-; SSE2-NEXT: pand %xmm2, %xmm1
-; SSE2-NEXT: movmskpd %xmm1, %eax
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,0,3,2]
+; SSE2-NEXT: pand %xmm2, %xmm0
+; SSE2-NEXT: movmskpd %xmm0, %eax
; SSE2-NEXT: xorl $3, %eax
; SSE2-NEXT: testb $1, %al
+; SSE2-NEXT: movd %xmm3, %ecx
; SSE2-NEXT: jne .LBB8_1
; SSE2-NEXT: # %bb.2: # %else
; SSE2-NEXT: testb $2, %al
; SSE2-NEXT: .LBB8_4: # %else2
; SSE2-NEXT: retq
; SSE2-NEXT: .LBB8_1: # %cond.store
-; SSE2-NEXT: movd %xmm0, %ecx
; SSE2-NEXT: movb %cl, (%rdi)
; SSE2-NEXT: testb $2, %al
; SSE2-NEXT: je .LBB8_4
; SSE2-NEXT: .LBB8_3: # %cond.store1
-; SSE2-NEXT: pextrw $4, %xmm0, %eax
-; SSE2-NEXT: movb %al, 1(%rdi)
+; SSE2-NEXT: movb %ch, 1(%rdi)
; SSE2-NEXT: retq
;
; SSE4-LABEL: truncstore_v2i64_v2i8:
; SSE4-NEXT: movapd %xmm4, %xmm0
; SSE4-NEXT: pcmpgtq %xmm2, %xmm0
; SSE4-NEXT: blendvpd %xmm0, %xmm4, %xmm2
+; SSE4-NEXT: pshufb {{.*#+}} xmm2 = xmm2[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
; SSE4-NEXT: pcmpeqq %xmm1, %xmm3
; SSE4-NEXT: movmskpd %xmm3, %eax
; SSE4-NEXT: xorl $3, %eax
; SSE4-NEXT: testb $2, %al
; SSE4-NEXT: je .LBB8_4
; SSE4-NEXT: .LBB8_3: # %cond.store1
-; SSE4-NEXT: pextrb $8, %xmm2, 1(%rdi)
+; SSE4-NEXT: pextrb $1, %xmm2, 1(%rdi)
; SSE4-NEXT: retq
;
; AVX-LABEL: truncstore_v2i64_v2i8:
; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [18446744073709551488,18446744073709551488]
; AVX-NEXT: vpcmpgtq %xmm3, %xmm0, %xmm4
; AVX-NEXT: vblendvpd %xmm4, %xmm0, %xmm3, %xmm0
+; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX-NEXT: vpcmpeqq %xmm2, %xmm1, %xmm1
; AVX-NEXT: vmovmskpd %xmm1, %eax
; AVX-NEXT: xorl $3, %eax
; AVX-NEXT: testb $2, %al
; AVX-NEXT: je .LBB8_4
; AVX-NEXT: .LBB8_3: # %cond.store1
-; AVX-NEXT: vpextrb $8, %xmm0, 1(%rdi)
+; AVX-NEXT: vpextrb $1, %xmm0, 1(%rdi)
; AVX-NEXT: retq
;
; AVX512F-LABEL: truncstore_v2i64_v2i8:
; AVX512F-NEXT: vpminsq %zmm1, %zmm0, %zmm0
; AVX512F-NEXT: vmovdqa {{.*#+}} xmm1 = [18446744073709551488,18446744073709551488]
; AVX512F-NEXT: vpmaxsq %zmm1, %zmm0, %zmm0
+; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512F-NEXT: kmovw %k0, %eax
; AVX512F-NEXT: testb $1, %al
; AVX512F-NEXT: jne .LBB8_1
; AVX512F-NEXT: testb $2, %al
; AVX512F-NEXT: je .LBB8_4
; AVX512F-NEXT: .LBB8_3: # %cond.store1
-; AVX512F-NEXT: vpextrb $8, %xmm0, 1(%rdi)
+; AVX512F-NEXT: vpextrb $1, %xmm0, 1(%rdi)
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512BW-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512BW-NEXT: vptestmq %zmm1, %zmm1, %k0
+; AVX512BW-NEXT: kshiftlq $62, %k0, %k0
+; AVX512BW-NEXT: kshiftrq $62, %k0, %k1
; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm1 = [127,127]
; AVX512BW-NEXT: vpminsq %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm1 = [18446744073709551488,18446744073709551488]
; AVX512BW-NEXT: vpmaxsq %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512BW-NEXT: kshiftlq $62, %k0, %k0
-; AVX512BW-NEXT: kshiftrq $62, %k0, %k1
; AVX512BW-NEXT: vmovdqu8 %zmm0, (%rdi) {%k1}
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
; SSE2-LABEL: truncstore_v8i32_v8i8:
; SSE2: # %bb.0:
; SSE2-NEXT: pxor %xmm4, %xmm4
-; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [127,127,127,127]
-; SSE2-NEXT: movdqa %xmm5, %xmm6
-; SSE2-NEXT: pcmpgtd %xmm0, %xmm6
-; SSE2-NEXT: pand %xmm6, %xmm0
-; SSE2-NEXT: pandn %xmm5, %xmm6
-; SSE2-NEXT: por %xmm0, %xmm6
-; SSE2-NEXT: movdqa %xmm5, %xmm0
-; SSE2-NEXT: pcmpgtd %xmm1, %xmm0
-; SSE2-NEXT: pand %xmm0, %xmm1
-; SSE2-NEXT: pandn %xmm5, %xmm0
-; SSE2-NEXT: por %xmm1, %xmm0
-; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [4294967168,4294967168,4294967168,4294967168]
-; SSE2-NEXT: movdqa %xmm0, %xmm5
-; SSE2-NEXT: pcmpgtd %xmm1, %xmm5
-; SSE2-NEXT: pand %xmm5, %xmm0
-; SSE2-NEXT: pandn %xmm1, %xmm5
-; SSE2-NEXT: por %xmm0, %xmm5
-; SSE2-NEXT: movdqa %xmm6, %xmm0
-; SSE2-NEXT: pcmpgtd %xmm1, %xmm0
-; SSE2-NEXT: pand %xmm0, %xmm6
-; SSE2-NEXT: pandn %xmm1, %xmm0
-; SSE2-NEXT: por %xmm6, %xmm0
-; SSE2-NEXT: packssdw %xmm5, %xmm0
+; SSE2-NEXT: packssdw %xmm1, %xmm0
+; SSE2-NEXT: packsswb %xmm0, %xmm0
; SSE2-NEXT: pcmpeqd %xmm4, %xmm3
; SSE2-NEXT: pcmpeqd %xmm1, %xmm1
; SSE2-NEXT: pxor %xmm1, %xmm3
; SSE2-NEXT: jne .LBB12_5
; SSE2-NEXT: .LBB12_6: # %else4
; SSE2-NEXT: testb $8, %al
-; SSE2-NEXT: jne .LBB12_7
+; SSE2-NEXT: je .LBB12_8
+; SSE2-NEXT: .LBB12_7: # %cond.store5
+; SSE2-NEXT: shrl $24, %ecx
+; SSE2-NEXT: movb %cl, 3(%rdi)
; SSE2-NEXT: .LBB12_8: # %else6
; SSE2-NEXT: testb $16, %al
-; SSE2-NEXT: jne .LBB12_9
+; SSE2-NEXT: pextrw $2, %xmm0, %ecx
+; SSE2-NEXT: je .LBB12_10
+; SSE2-NEXT: # %bb.9: # %cond.store7
+; SSE2-NEXT: movb %cl, 4(%rdi)
; SSE2-NEXT: .LBB12_10: # %else8
; SSE2-NEXT: testb $32, %al
-; SSE2-NEXT: jne .LBB12_11
+; SSE2-NEXT: je .LBB12_12
+; SSE2-NEXT: # %bb.11: # %cond.store9
+; SSE2-NEXT: movb %ch, 5(%rdi)
; SSE2-NEXT: .LBB12_12: # %else10
; SSE2-NEXT: testb $64, %al
+; SSE2-NEXT: pextrw $3, %xmm0, %ecx
; SSE2-NEXT: jne .LBB12_13
-; SSE2-NEXT: .LBB12_14: # %else12
+; SSE2-NEXT: # %bb.14: # %else12
; SSE2-NEXT: testb $-128, %al
; SSE2-NEXT: jne .LBB12_15
; SSE2-NEXT: .LBB12_16: # %else14
; SSE2-NEXT: testb $2, %al
; SSE2-NEXT: je .LBB12_4
; SSE2-NEXT: .LBB12_3: # %cond.store1
-; SSE2-NEXT: shrl $16, %ecx
-; SSE2-NEXT: movb %cl, 1(%rdi)
+; SSE2-NEXT: movb %ch, 1(%rdi)
; SSE2-NEXT: testb $4, %al
; SSE2-NEXT: je .LBB12_6
; SSE2-NEXT: .LBB12_5: # %cond.store3
-; SSE2-NEXT: pextrw $2, %xmm0, %ecx
-; SSE2-NEXT: movb %cl, 2(%rdi)
+; SSE2-NEXT: movl %ecx, %edx
+; SSE2-NEXT: shrl $16, %edx
+; SSE2-NEXT: movb %dl, 2(%rdi)
; SSE2-NEXT: testb $8, %al
-; SSE2-NEXT: je .LBB12_8
-; SSE2-NEXT: .LBB12_7: # %cond.store5
-; SSE2-NEXT: pextrw $3, %xmm0, %ecx
-; SSE2-NEXT: movb %cl, 3(%rdi)
-; SSE2-NEXT: testb $16, %al
-; SSE2-NEXT: je .LBB12_10
-; SSE2-NEXT: .LBB12_9: # %cond.store7
-; SSE2-NEXT: pextrw $4, %xmm0, %ecx
-; SSE2-NEXT: movb %cl, 4(%rdi)
-; SSE2-NEXT: testb $32, %al
-; SSE2-NEXT: je .LBB12_12
-; SSE2-NEXT: .LBB12_11: # %cond.store9
-; SSE2-NEXT: pextrw $5, %xmm0, %ecx
-; SSE2-NEXT: movb %cl, 5(%rdi)
-; SSE2-NEXT: testb $64, %al
-; SSE2-NEXT: je .LBB12_14
+; SSE2-NEXT: jne .LBB12_7
+; SSE2-NEXT: jmp .LBB12_8
; SSE2-NEXT: .LBB12_13: # %cond.store11
-; SSE2-NEXT: pextrw $6, %xmm0, %ecx
; SSE2-NEXT: movb %cl, 6(%rdi)
; SSE2-NEXT: testb $-128, %al
; SSE2-NEXT: je .LBB12_16
; SSE2-NEXT: .LBB12_15: # %cond.store13
-; SSE2-NEXT: pextrw $7, %xmm0, %eax
-; SSE2-NEXT: movb %al, 7(%rdi)
+; SSE2-NEXT: movb %ch, 7(%rdi)
; SSE2-NEXT: retq
;
; SSE4-LABEL: truncstore_v8i32_v8i8:
; SSE4: # %bb.0:
; SSE4-NEXT: pxor %xmm4, %xmm4
-; SSE4-NEXT: movdqa {{.*#+}} xmm5 = [127,127,127,127]
-; SSE4-NEXT: pminsd %xmm5, %xmm0
-; SSE4-NEXT: pminsd %xmm5, %xmm1
-; SSE4-NEXT: movdqa {{.*#+}} xmm5 = [4294967168,4294967168,4294967168,4294967168]
-; SSE4-NEXT: pmaxsd %xmm5, %xmm1
-; SSE4-NEXT: pmaxsd %xmm5, %xmm0
; SSE4-NEXT: packssdw %xmm1, %xmm0
+; SSE4-NEXT: packsswb %xmm0, %xmm0
; SSE4-NEXT: pcmpeqd %xmm4, %xmm3
; SSE4-NEXT: pcmpeqd %xmm1, %xmm1
; SSE4-NEXT: pxor %xmm1, %xmm3
; SSE4-NEXT: testb $2, %al
; SSE4-NEXT: je .LBB12_4
; SSE4-NEXT: .LBB12_3: # %cond.store1
-; SSE4-NEXT: pextrb $2, %xmm0, 1(%rdi)
+; SSE4-NEXT: pextrb $1, %xmm0, 1(%rdi)
; SSE4-NEXT: testb $4, %al
; SSE4-NEXT: je .LBB12_6
; SSE4-NEXT: .LBB12_5: # %cond.store3
-; SSE4-NEXT: pextrb $4, %xmm0, 2(%rdi)
+; SSE4-NEXT: pextrb $2, %xmm0, 2(%rdi)
; SSE4-NEXT: testb $8, %al
; SSE4-NEXT: je .LBB12_8
; SSE4-NEXT: .LBB12_7: # %cond.store5
-; SSE4-NEXT: pextrb $6, %xmm0, 3(%rdi)
+; SSE4-NEXT: pextrb $3, %xmm0, 3(%rdi)
; SSE4-NEXT: testb $16, %al
; SSE4-NEXT: je .LBB12_10
; SSE4-NEXT: .LBB12_9: # %cond.store7
-; SSE4-NEXT: pextrb $8, %xmm0, 4(%rdi)
+; SSE4-NEXT: pextrb $4, %xmm0, 4(%rdi)
; SSE4-NEXT: testb $32, %al
; SSE4-NEXT: je .LBB12_12
; SSE4-NEXT: .LBB12_11: # %cond.store9
-; SSE4-NEXT: pextrb $10, %xmm0, 5(%rdi)
+; SSE4-NEXT: pextrb $5, %xmm0, 5(%rdi)
; SSE4-NEXT: testb $64, %al
; SSE4-NEXT: je .LBB12_14
; SSE4-NEXT: .LBB12_13: # %cond.store11
-; SSE4-NEXT: pextrb $12, %xmm0, 6(%rdi)
+; SSE4-NEXT: pextrb $6, %xmm0, 6(%rdi)
; SSE4-NEXT: testb $-128, %al
; SSE4-NEXT: je .LBB12_16
; SSE4-NEXT: .LBB12_15: # %cond.store13
-; SSE4-NEXT: pextrb $14, %xmm0, 7(%rdi)
+; SSE4-NEXT: pextrb $7, %xmm0, 7(%rdi)
; SSE4-NEXT: retq
;
; AVX1-LABEL: truncstore_v8i32_v8i8:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [127,127,127,127]
-; AVX1-NEXT: vpminsd %xmm2, %xmm0, %xmm3
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
-; AVX1-NEXT: vpminsd %xmm2, %xmm0, %xmm0
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [4294967168,4294967168,4294967168,4294967168]
-; AVX1-NEXT: vpmaxsd %xmm2, %xmm0, %xmm0
-; AVX1-NEXT: vpmaxsd %xmm2, %xmm3, %xmm2
-; AVX1-NEXT: vpackssdw %xmm0, %xmm2, %xmm0
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT: vpackssdw %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpacksswb %xmm0, %xmm0, %xmm0
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
; AVX1-NEXT: vpcmpeqd %xmm3, %xmm2, %xmm2
; AVX1-NEXT: testb $2, %al
; AVX1-NEXT: je .LBB12_4
; AVX1-NEXT: .LBB12_3: # %cond.store1
-; AVX1-NEXT: vpextrb $2, %xmm0, 1(%rdi)
+; AVX1-NEXT: vpextrb $1, %xmm0, 1(%rdi)
; AVX1-NEXT: testb $4, %al
; AVX1-NEXT: je .LBB12_6
; AVX1-NEXT: .LBB12_5: # %cond.store3
-; AVX1-NEXT: vpextrb $4, %xmm0, 2(%rdi)
+; AVX1-NEXT: vpextrb $2, %xmm0, 2(%rdi)
; AVX1-NEXT: testb $8, %al
; AVX1-NEXT: je .LBB12_8
; AVX1-NEXT: .LBB12_7: # %cond.store5
-; AVX1-NEXT: vpextrb $6, %xmm0, 3(%rdi)
+; AVX1-NEXT: vpextrb $3, %xmm0, 3(%rdi)
; AVX1-NEXT: testb $16, %al
; AVX1-NEXT: je .LBB12_10
; AVX1-NEXT: .LBB12_9: # %cond.store7
-; AVX1-NEXT: vpextrb $8, %xmm0, 4(%rdi)
+; AVX1-NEXT: vpextrb $4, %xmm0, 4(%rdi)
; AVX1-NEXT: testb $32, %al
; AVX1-NEXT: je .LBB12_12
; AVX1-NEXT: .LBB12_11: # %cond.store9
-; AVX1-NEXT: vpextrb $10, %xmm0, 5(%rdi)
+; AVX1-NEXT: vpextrb $5, %xmm0, 5(%rdi)
; AVX1-NEXT: testb $64, %al
; AVX1-NEXT: je .LBB12_14
; AVX1-NEXT: .LBB12_13: # %cond.store11
-; AVX1-NEXT: vpextrb $12, %xmm0, 6(%rdi)
+; AVX1-NEXT: vpextrb $6, %xmm0, 6(%rdi)
; AVX1-NEXT: testb $-128, %al
; AVX1-NEXT: je .LBB12_16
; AVX1-NEXT: .LBB12_15: # %cond.store13
-; AVX1-NEXT: vpextrb $14, %xmm0, 7(%rdi)
+; AVX1-NEXT: vpextrb $7, %xmm0, 7(%rdi)
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: truncstore_v8i32_v8i8:
; AVX2: # %bb.0:
; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm3 = [127,127,127,127,127,127,127,127]
-; AVX2-NEXT: vpminsd %ymm3, %ymm0, %ymm0
-; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm3 = [4294967168,4294967168,4294967168,4294967168,4294967168,4294967168,4294967168,4294967168]
-; AVX2-NEXT: vpmaxsd %ymm3, %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm3
; AVX2-NEXT: vpackssdw %xmm3, %xmm0, %xmm0
+; AVX2-NEXT: vpacksswb %xmm0, %xmm0, %xmm0
; AVX2-NEXT: vpcmpeqd %ymm2, %ymm1, %ymm1
; AVX2-NEXT: vmovmskps %ymm1, %eax
; AVX2-NEXT: notl %eax
; AVX2-NEXT: testb $2, %al
; AVX2-NEXT: je .LBB12_4
; AVX2-NEXT: .LBB12_3: # %cond.store1
-; AVX2-NEXT: vpextrb $2, %xmm0, 1(%rdi)
+; AVX2-NEXT: vpextrb $1, %xmm0, 1(%rdi)
; AVX2-NEXT: testb $4, %al
; AVX2-NEXT: je .LBB12_6
; AVX2-NEXT: .LBB12_5: # %cond.store3
-; AVX2-NEXT: vpextrb $4, %xmm0, 2(%rdi)
+; AVX2-NEXT: vpextrb $2, %xmm0, 2(%rdi)
; AVX2-NEXT: testb $8, %al
; AVX2-NEXT: je .LBB12_8
; AVX2-NEXT: .LBB12_7: # %cond.store5
-; AVX2-NEXT: vpextrb $6, %xmm0, 3(%rdi)
+; AVX2-NEXT: vpextrb $3, %xmm0, 3(%rdi)
; AVX2-NEXT: testb $16, %al
; AVX2-NEXT: je .LBB12_10
; AVX2-NEXT: .LBB12_9: # %cond.store7
-; AVX2-NEXT: vpextrb $8, %xmm0, 4(%rdi)
+; AVX2-NEXT: vpextrb $4, %xmm0, 4(%rdi)
; AVX2-NEXT: testb $32, %al
; AVX2-NEXT: je .LBB12_12
; AVX2-NEXT: .LBB12_11: # %cond.store9
-; AVX2-NEXT: vpextrb $10, %xmm0, 5(%rdi)
+; AVX2-NEXT: vpextrb $5, %xmm0, 5(%rdi)
; AVX2-NEXT: testb $64, %al
; AVX2-NEXT: je .LBB12_14
; AVX2-NEXT: .LBB12_13: # %cond.store11
-; AVX2-NEXT: vpextrb $12, %xmm0, 6(%rdi)
+; AVX2-NEXT: vpextrb $6, %xmm0, 6(%rdi)
; AVX2-NEXT: testb $-128, %al
; AVX2-NEXT: je .LBB12_16
; AVX2-NEXT: .LBB12_15: # %cond.store13
-; AVX2-NEXT: vpextrb $14, %xmm0, 7(%rdi)
+; AVX2-NEXT: vpextrb $7, %xmm0, 7(%rdi)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512F-NEXT: vpminsd %ymm1, %ymm0, %ymm0
; AVX512F-NEXT: vpbroadcastd {{.*#+}} ymm1 = [4294967168,4294967168,4294967168,4294967168,4294967168,4294967168,4294967168,4294967168]
; AVX512F-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0
-; AVX512F-NEXT: vpmovdw %zmm0, %ymm0
+; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
; AVX512F-NEXT: kmovw %k0, %eax
; AVX512F-NEXT: testb $1, %al
; AVX512F-NEXT: jne .LBB12_1
; AVX512F-NEXT: testb $2, %al
; AVX512F-NEXT: je .LBB12_4
; AVX512F-NEXT: .LBB12_3: # %cond.store1
-; AVX512F-NEXT: vpextrb $2, %xmm0, 1(%rdi)
+; AVX512F-NEXT: vpextrb $1, %xmm0, 1(%rdi)
; AVX512F-NEXT: testb $4, %al
; AVX512F-NEXT: je .LBB12_6
; AVX512F-NEXT: .LBB12_5: # %cond.store3
-; AVX512F-NEXT: vpextrb $4, %xmm0, 2(%rdi)
+; AVX512F-NEXT: vpextrb $2, %xmm0, 2(%rdi)
; AVX512F-NEXT: testb $8, %al
; AVX512F-NEXT: je .LBB12_8
; AVX512F-NEXT: .LBB12_7: # %cond.store5
-; AVX512F-NEXT: vpextrb $6, %xmm0, 3(%rdi)
+; AVX512F-NEXT: vpextrb $3, %xmm0, 3(%rdi)
; AVX512F-NEXT: testb $16, %al
; AVX512F-NEXT: je .LBB12_10
; AVX512F-NEXT: .LBB12_9: # %cond.store7
-; AVX512F-NEXT: vpextrb $8, %xmm0, 4(%rdi)
+; AVX512F-NEXT: vpextrb $4, %xmm0, 4(%rdi)
; AVX512F-NEXT: testb $32, %al
; AVX512F-NEXT: je .LBB12_12
; AVX512F-NEXT: .LBB12_11: # %cond.store9
-; AVX512F-NEXT: vpextrb $10, %xmm0, 5(%rdi)
+; AVX512F-NEXT: vpextrb $5, %xmm0, 5(%rdi)
; AVX512F-NEXT: testb $64, %al
; AVX512F-NEXT: je .LBB12_14
; AVX512F-NEXT: .LBB12_13: # %cond.store11
-; AVX512F-NEXT: vpextrb $12, %xmm0, 6(%rdi)
+; AVX512F-NEXT: vpextrb $6, %xmm0, 6(%rdi)
; AVX512F-NEXT: testb $-128, %al
; AVX512F-NEXT: je .LBB12_16
; AVX512F-NEXT: .LBB12_15: # %cond.store13
-; AVX512F-NEXT: vpextrb $14, %xmm0, 7(%rdi)
+; AVX512F-NEXT: vpextrb $7, %xmm0, 7(%rdi)
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1
; AVX512BW-NEXT: vptestmd %zmm1, %zmm1, %k0
+; AVX512BW-NEXT: kshiftlq $56, %k0, %k0
+; AVX512BW-NEXT: kshiftrq $56, %k0, %k1
; AVX512BW-NEXT: vpbroadcastd {{.*#+}} ymm1 = [127,127,127,127,127,127,127,127]
; AVX512BW-NEXT: vpminsd %ymm1, %ymm0, %ymm0
; AVX512BW-NEXT: vpbroadcastd {{.*#+}} ymm1 = [4294967168,4294967168,4294967168,4294967168,4294967168,4294967168,4294967168,4294967168]
; AVX512BW-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0
-; AVX512BW-NEXT: vpmovdw %zmm0, %ymm0
-; AVX512BW-NEXT: vpacksswb %xmm0, %xmm0, %xmm0
-; AVX512BW-NEXT: kshiftlq $56, %k0, %k0
-; AVX512BW-NEXT: kshiftrq $56, %k0, %k1
+; AVX512BW-NEXT: vpmovdb %zmm0, %xmm0
; AVX512BW-NEXT: vmovdqu8 %zmm0, (%rdi) {%k1}
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
; SSE2-LABEL: truncstore_v4i32_v4i16:
; SSE2: # %bb.0:
; SSE2-NEXT: pxor %xmm2, %xmm2
-; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [32767,32767,32767,32767]
-; SSE2-NEXT: movdqa %xmm3, %xmm4
-; SSE2-NEXT: pcmpgtd %xmm0, %xmm4
-; SSE2-NEXT: pand %xmm4, %xmm0
-; SSE2-NEXT: pandn %xmm3, %xmm4
-; SSE2-NEXT: por %xmm0, %xmm4
-; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [4294934528,4294934528,4294934528,4294934528]
-; SSE2-NEXT: movdqa %xmm4, %xmm0
-; SSE2-NEXT: pcmpgtd %xmm3, %xmm0
-; SSE2-NEXT: pand %xmm0, %xmm4
-; SSE2-NEXT: pandn %xmm3, %xmm0
-; SSE2-NEXT: por %xmm4, %xmm0
+; SSE2-NEXT: packssdw %xmm0, %xmm0
; SSE2-NEXT: pcmpeqd %xmm1, %xmm2
; SSE2-NEXT: movmskps %xmm2, %eax
; SSE2-NEXT: xorl $15, %eax
; SSE2-NEXT: testb $2, %al
; SSE2-NEXT: je .LBB13_4
; SSE2-NEXT: .LBB13_3: # %cond.store1
-; SSE2-NEXT: pextrw $2, %xmm0, %ecx
+; SSE2-NEXT: pextrw $1, %xmm0, %ecx
; SSE2-NEXT: movw %cx, 2(%rdi)
; SSE2-NEXT: testb $4, %al
; SSE2-NEXT: je .LBB13_6
; SSE2-NEXT: .LBB13_5: # %cond.store3
-; SSE2-NEXT: pextrw $4, %xmm0, %ecx
+; SSE2-NEXT: pextrw $2, %xmm0, %ecx
; SSE2-NEXT: movw %cx, 4(%rdi)
; SSE2-NEXT: testb $8, %al
; SSE2-NEXT: je .LBB13_8
; SSE2-NEXT: .LBB13_7: # %cond.store5
-; SSE2-NEXT: pextrw $6, %xmm0, %eax
+; SSE2-NEXT: pextrw $3, %xmm0, %eax
; SSE2-NEXT: movw %ax, 6(%rdi)
; SSE2-NEXT: retq
;
; SSE4-LABEL: truncstore_v4i32_v4i16:
; SSE4: # %bb.0:
; SSE4-NEXT: pxor %xmm2, %xmm2
-; SSE4-NEXT: pminsd {{.*}}(%rip), %xmm0
-; SSE4-NEXT: pmaxsd {{.*}}(%rip), %xmm0
+; SSE4-NEXT: packssdw %xmm0, %xmm0
; SSE4-NEXT: pcmpeqd %xmm1, %xmm2
; SSE4-NEXT: movmskps %xmm2, %eax
; SSE4-NEXT: xorl $15, %eax
; SSE4-NEXT: testb $2, %al
; SSE4-NEXT: je .LBB13_4
; SSE4-NEXT: .LBB13_3: # %cond.store1
-; SSE4-NEXT: pextrw $2, %xmm0, 2(%rdi)
+; SSE4-NEXT: pextrw $1, %xmm0, 2(%rdi)
; SSE4-NEXT: testb $4, %al
; SSE4-NEXT: je .LBB13_6
; SSE4-NEXT: .LBB13_5: # %cond.store3
-; SSE4-NEXT: pextrw $4, %xmm0, 4(%rdi)
+; SSE4-NEXT: pextrw $2, %xmm0, 4(%rdi)
; SSE4-NEXT: testb $8, %al
; SSE4-NEXT: je .LBB13_8
; SSE4-NEXT: .LBB13_7: # %cond.store5
-; SSE4-NEXT: pextrw $6, %xmm0, 6(%rdi)
+; SSE4-NEXT: pextrw $3, %xmm0, 6(%rdi)
; SSE4-NEXT: retq
;
-; AVX1-LABEL: truncstore_v4i32_v4i16:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX1-NEXT: vpminsd {{.*}}(%rip), %xmm0, %xmm0
-; AVX1-NEXT: vpmaxsd {{.*}}(%rip), %xmm0, %xmm0
-; AVX1-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm1
-; AVX1-NEXT: vmovmskps %xmm1, %eax
-; AVX1-NEXT: xorl $15, %eax
-; AVX1-NEXT: testb $1, %al
-; AVX1-NEXT: jne .LBB13_1
-; AVX1-NEXT: # %bb.2: # %else
-; AVX1-NEXT: testb $2, %al
-; AVX1-NEXT: jne .LBB13_3
-; AVX1-NEXT: .LBB13_4: # %else2
-; AVX1-NEXT: testb $4, %al
-; AVX1-NEXT: jne .LBB13_5
-; AVX1-NEXT: .LBB13_6: # %else4
-; AVX1-NEXT: testb $8, %al
-; AVX1-NEXT: jne .LBB13_7
-; AVX1-NEXT: .LBB13_8: # %else6
-; AVX1-NEXT: retq
-; AVX1-NEXT: .LBB13_1: # %cond.store
-; AVX1-NEXT: vpextrw $0, %xmm0, (%rdi)
-; AVX1-NEXT: testb $2, %al
-; AVX1-NEXT: je .LBB13_4
-; AVX1-NEXT: .LBB13_3: # %cond.store1
-; AVX1-NEXT: vpextrw $2, %xmm0, 2(%rdi)
-; AVX1-NEXT: testb $4, %al
-; AVX1-NEXT: je .LBB13_6
-; AVX1-NEXT: .LBB13_5: # %cond.store3
-; AVX1-NEXT: vpextrw $4, %xmm0, 4(%rdi)
-; AVX1-NEXT: testb $8, %al
-; AVX1-NEXT: je .LBB13_8
-; AVX1-NEXT: .LBB13_7: # %cond.store5
-; AVX1-NEXT: vpextrw $6, %xmm0, 6(%rdi)
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: truncstore_v4i32_v4i16:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm3 = [32767,32767,32767,32767]
-; AVX2-NEXT: vpminsd %xmm3, %xmm0, %xmm0
-; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm3 = [4294934528,4294934528,4294934528,4294934528]
-; AVX2-NEXT: vpmaxsd %xmm3, %xmm0, %xmm0
-; AVX2-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm1
-; AVX2-NEXT: vmovmskps %xmm1, %eax
-; AVX2-NEXT: xorl $15, %eax
-; AVX2-NEXT: testb $1, %al
-; AVX2-NEXT: jne .LBB13_1
-; AVX2-NEXT: # %bb.2: # %else
-; AVX2-NEXT: testb $2, %al
-; AVX2-NEXT: jne .LBB13_3
-; AVX2-NEXT: .LBB13_4: # %else2
-; AVX2-NEXT: testb $4, %al
-; AVX2-NEXT: jne .LBB13_5
-; AVX2-NEXT: .LBB13_6: # %else4
-; AVX2-NEXT: testb $8, %al
-; AVX2-NEXT: jne .LBB13_7
-; AVX2-NEXT: .LBB13_8: # %else6
-; AVX2-NEXT: retq
-; AVX2-NEXT: .LBB13_1: # %cond.store
-; AVX2-NEXT: vpextrw $0, %xmm0, (%rdi)
-; AVX2-NEXT: testb $2, %al
-; AVX2-NEXT: je .LBB13_4
-; AVX2-NEXT: .LBB13_3: # %cond.store1
-; AVX2-NEXT: vpextrw $2, %xmm0, 2(%rdi)
-; AVX2-NEXT: testb $4, %al
-; AVX2-NEXT: je .LBB13_6
-; AVX2-NEXT: .LBB13_5: # %cond.store3
-; AVX2-NEXT: vpextrw $4, %xmm0, 4(%rdi)
-; AVX2-NEXT: testb $8, %al
-; AVX2-NEXT: je .LBB13_8
-; AVX2-NEXT: .LBB13_7: # %cond.store5
-; AVX2-NEXT: vpextrw $6, %xmm0, 6(%rdi)
-; AVX2-NEXT: retq
+; AVX-LABEL: truncstore_v4i32_v4i16:
+; AVX: # %bb.0:
+; AVX-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX-NEXT: vpackssdw %xmm0, %xmm0, %xmm0
+; AVX-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm1
+; AVX-NEXT: vmovmskps %xmm1, %eax
+; AVX-NEXT: xorl $15, %eax
+; AVX-NEXT: testb $1, %al
+; AVX-NEXT: jne .LBB13_1
+; AVX-NEXT: # %bb.2: # %else
+; AVX-NEXT: testb $2, %al
+; AVX-NEXT: jne .LBB13_3
+; AVX-NEXT: .LBB13_4: # %else2
+; AVX-NEXT: testb $4, %al
+; AVX-NEXT: jne .LBB13_5
+; AVX-NEXT: .LBB13_6: # %else4
+; AVX-NEXT: testb $8, %al
+; AVX-NEXT: jne .LBB13_7
+; AVX-NEXT: .LBB13_8: # %else6
+; AVX-NEXT: retq
+; AVX-NEXT: .LBB13_1: # %cond.store
+; AVX-NEXT: vpextrw $0, %xmm0, (%rdi)
+; AVX-NEXT: testb $2, %al
+; AVX-NEXT: je .LBB13_4
+; AVX-NEXT: .LBB13_3: # %cond.store1
+; AVX-NEXT: vpextrw $1, %xmm0, 2(%rdi)
+; AVX-NEXT: testb $4, %al
+; AVX-NEXT: je .LBB13_6
+; AVX-NEXT: .LBB13_5: # %cond.store3
+; AVX-NEXT: vpextrw $2, %xmm0, 4(%rdi)
+; AVX-NEXT: testb $8, %al
+; AVX-NEXT: je .LBB13_8
+; AVX-NEXT: .LBB13_7: # %cond.store5
+; AVX-NEXT: vpextrw $3, %xmm0, 6(%rdi)
+; AVX-NEXT: retq
;
; AVX512F-LABEL: truncstore_v4i32_v4i16:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vpminsd %xmm1, %xmm0, %xmm0
; AVX512F-NEXT: vpbroadcastd {{.*#+}} xmm1 = [4294934528,4294934528,4294934528,4294934528]
; AVX512F-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
+; AVX512F-NEXT: vpackssdw %xmm0, %xmm0, %xmm0
; AVX512F-NEXT: kmovw %k0, %eax
; AVX512F-NEXT: testb $1, %al
; AVX512F-NEXT: jne .LBB13_1
; AVX512F-NEXT: testb $2, %al
; AVX512F-NEXT: je .LBB13_4
; AVX512F-NEXT: .LBB13_3: # %cond.store1
-; AVX512F-NEXT: vpextrw $2, %xmm0, 2(%rdi)
+; AVX512F-NEXT: vpextrw $1, %xmm0, 2(%rdi)
; AVX512F-NEXT: testb $4, %al
; AVX512F-NEXT: je .LBB13_6
; AVX512F-NEXT: .LBB13_5: # %cond.store3
-; AVX512F-NEXT: vpextrw $4, %xmm0, 4(%rdi)
+; AVX512F-NEXT: vpextrw $2, %xmm0, 4(%rdi)
; AVX512F-NEXT: testb $8, %al
; AVX512F-NEXT: je .LBB13_8
; AVX512F-NEXT: .LBB13_7: # %cond.store5
-; AVX512F-NEXT: vpextrw $6, %xmm0, 6(%rdi)
+; AVX512F-NEXT: vpextrw $3, %xmm0, 6(%rdi)
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
; AVX512BW-NEXT: vptestmd %zmm1, %zmm1, %k0
+; AVX512BW-NEXT: kshiftld $28, %k0, %k0
+; AVX512BW-NEXT: kshiftrd $28, %k0, %k1
; AVX512BW-NEXT: vpbroadcastd {{.*#+}} xmm1 = [32767,32767,32767,32767]
; AVX512BW-NEXT: vpminsd %xmm1, %xmm0, %xmm0
; AVX512BW-NEXT: vpbroadcastd {{.*#+}} xmm1 = [4294934528,4294934528,4294934528,4294934528]
; AVX512BW-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
; AVX512BW-NEXT: vpackssdw %xmm0, %xmm0, %xmm0
-; AVX512BW-NEXT: kshiftld $28, %k0, %k0
-; AVX512BW-NEXT: kshiftrd $28, %k0, %k1
; AVX512BW-NEXT: vmovdqu16 %zmm0, (%rdi) {%k1}
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
; SSE2-NEXT: pand %xmm4, %xmm0
; SSE2-NEXT: pandn %xmm3, %xmm4
; SSE2-NEXT: por %xmm0, %xmm4
-; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [4294967168,4294967168,4294967168,4294967168]
-; SSE2-NEXT: movdqa %xmm4, %xmm0
-; SSE2-NEXT: pcmpgtd %xmm3, %xmm0
-; SSE2-NEXT: pand %xmm0, %xmm4
-; SSE2-NEXT: pandn %xmm3, %xmm0
-; SSE2-NEXT: por %xmm4, %xmm0
+; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [4294967168,4294967168,4294967168,4294967168]
+; SSE2-NEXT: movdqa %xmm4, %xmm3
+; SSE2-NEXT: pcmpgtd %xmm0, %xmm3
+; SSE2-NEXT: pand %xmm3, %xmm4
+; SSE2-NEXT: pandn %xmm0, %xmm3
+; SSE2-NEXT: por %xmm4, %xmm3
+; SSE2-NEXT: pand {{.*}}(%rip), %xmm3
+; SSE2-NEXT: packuswb %xmm3, %xmm3
+; SSE2-NEXT: packuswb %xmm3, %xmm3
; SSE2-NEXT: pcmpeqd %xmm1, %xmm2
-; SSE2-NEXT: movmskps %xmm2, %eax
-; SSE2-NEXT: xorl $15, %eax
-; SSE2-NEXT: testb $1, %al
+; SSE2-NEXT: movmskps %xmm2, %ecx
+; SSE2-NEXT: xorl $15, %ecx
+; SSE2-NEXT: testb $1, %cl
+; SSE2-NEXT: movd %xmm3, %eax
; SSE2-NEXT: jne .LBB14_1
; SSE2-NEXT: # %bb.2: # %else
-; SSE2-NEXT: testb $2, %al
+; SSE2-NEXT: testb $2, %cl
; SSE2-NEXT: jne .LBB14_3
; SSE2-NEXT: .LBB14_4: # %else2
-; SSE2-NEXT: testb $4, %al
+; SSE2-NEXT: testb $4, %cl
; SSE2-NEXT: jne .LBB14_5
; SSE2-NEXT: .LBB14_6: # %else4
-; SSE2-NEXT: testb $8, %al
+; SSE2-NEXT: testb $8, %cl
; SSE2-NEXT: jne .LBB14_7
; SSE2-NEXT: .LBB14_8: # %else6
; SSE2-NEXT: retq
; SSE2-NEXT: .LBB14_1: # %cond.store
-; SSE2-NEXT: movd %xmm0, %ecx
-; SSE2-NEXT: movb %cl, (%rdi)
-; SSE2-NEXT: testb $2, %al
+; SSE2-NEXT: movb %al, (%rdi)
+; SSE2-NEXT: testb $2, %cl
; SSE2-NEXT: je .LBB14_4
; SSE2-NEXT: .LBB14_3: # %cond.store1
-; SSE2-NEXT: pextrw $2, %xmm0, %ecx
-; SSE2-NEXT: movb %cl, 1(%rdi)
-; SSE2-NEXT: testb $4, %al
+; SSE2-NEXT: movb %ah, 1(%rdi)
+; SSE2-NEXT: testb $4, %cl
; SSE2-NEXT: je .LBB14_6
; SSE2-NEXT: .LBB14_5: # %cond.store3
-; SSE2-NEXT: pextrw $4, %xmm0, %ecx
-; SSE2-NEXT: movb %cl, 2(%rdi)
-; SSE2-NEXT: testb $8, %al
+; SSE2-NEXT: movl %eax, %edx
+; SSE2-NEXT: shrl $16, %edx
+; SSE2-NEXT: movb %dl, 2(%rdi)
+; SSE2-NEXT: testb $8, %cl
; SSE2-NEXT: je .LBB14_8
; SSE2-NEXT: .LBB14_7: # %cond.store5
-; SSE2-NEXT: pextrw $6, %xmm0, %eax
+; SSE2-NEXT: shrl $24, %eax
; SSE2-NEXT: movb %al, 3(%rdi)
; SSE2-NEXT: retq
;
; SSE4-NEXT: pxor %xmm2, %xmm2
; SSE4-NEXT: pminsd {{.*}}(%rip), %xmm0
; SSE4-NEXT: pmaxsd {{.*}}(%rip), %xmm0
+; SSE4-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u]
; SSE4-NEXT: pcmpeqd %xmm1, %xmm2
; SSE4-NEXT: movmskps %xmm2, %eax
; SSE4-NEXT: xorl $15, %eax
; SSE4-NEXT: testb $2, %al
; SSE4-NEXT: je .LBB14_4
; SSE4-NEXT: .LBB14_3: # %cond.store1
-; SSE4-NEXT: pextrb $4, %xmm0, 1(%rdi)
+; SSE4-NEXT: pextrb $1, %xmm0, 1(%rdi)
; SSE4-NEXT: testb $4, %al
; SSE4-NEXT: je .LBB14_6
; SSE4-NEXT: .LBB14_5: # %cond.store3
-; SSE4-NEXT: pextrb $8, %xmm0, 2(%rdi)
+; SSE4-NEXT: pextrb $2, %xmm0, 2(%rdi)
; SSE4-NEXT: testb $8, %al
; SSE4-NEXT: je .LBB14_8
; SSE4-NEXT: .LBB14_7: # %cond.store5
-; SSE4-NEXT: pextrb $12, %xmm0, 3(%rdi)
+; SSE4-NEXT: pextrb $3, %xmm0, 3(%rdi)
; SSE4-NEXT: retq
;
; AVX1-LABEL: truncstore_v4i32_v4i8:
; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX1-NEXT: vpminsd {{.*}}(%rip), %xmm0, %xmm0
; AVX1-NEXT: vpmaxsd {{.*}}(%rip), %xmm0, %xmm0
+; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX1-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vmovmskps %xmm1, %eax
; AVX1-NEXT: xorl $15, %eax
; AVX1-NEXT: testb $2, %al
; AVX1-NEXT: je .LBB14_4
; AVX1-NEXT: .LBB14_3: # %cond.store1
-; AVX1-NEXT: vpextrb $4, %xmm0, 1(%rdi)
+; AVX1-NEXT: vpextrb $1, %xmm0, 1(%rdi)
; AVX1-NEXT: testb $4, %al
; AVX1-NEXT: je .LBB14_6
; AVX1-NEXT: .LBB14_5: # %cond.store3
-; AVX1-NEXT: vpextrb $8, %xmm0, 2(%rdi)
+; AVX1-NEXT: vpextrb $2, %xmm0, 2(%rdi)
; AVX1-NEXT: testb $8, %al
; AVX1-NEXT: je .LBB14_8
; AVX1-NEXT: .LBB14_7: # %cond.store5
-; AVX1-NEXT: vpextrb $12, %xmm0, 3(%rdi)
+; AVX1-NEXT: vpextrb $3, %xmm0, 3(%rdi)
; AVX1-NEXT: retq
;
; AVX2-LABEL: truncstore_v4i32_v4i8:
; AVX2-NEXT: vpminsd %xmm3, %xmm0, %xmm0
; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm3 = [4294967168,4294967168,4294967168,4294967168]
; AVX2-NEXT: vpmaxsd %xmm3, %xmm0, %xmm0
+; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX2-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm1
; AVX2-NEXT: vmovmskps %xmm1, %eax
; AVX2-NEXT: xorl $15, %eax
; AVX2-NEXT: testb $2, %al
; AVX2-NEXT: je .LBB14_4
; AVX2-NEXT: .LBB14_3: # %cond.store1
-; AVX2-NEXT: vpextrb $4, %xmm0, 1(%rdi)
+; AVX2-NEXT: vpextrb $1, %xmm0, 1(%rdi)
; AVX2-NEXT: testb $4, %al
; AVX2-NEXT: je .LBB14_6
; AVX2-NEXT: .LBB14_5: # %cond.store3
-; AVX2-NEXT: vpextrb $8, %xmm0, 2(%rdi)
+; AVX2-NEXT: vpextrb $2, %xmm0, 2(%rdi)
; AVX2-NEXT: testb $8, %al
; AVX2-NEXT: je .LBB14_8
; AVX2-NEXT: .LBB14_7: # %cond.store5
-; AVX2-NEXT: vpextrb $12, %xmm0, 3(%rdi)
+; AVX2-NEXT: vpextrb $3, %xmm0, 3(%rdi)
; AVX2-NEXT: retq
;
; AVX512F-LABEL: truncstore_v4i32_v4i8:
; AVX512F-NEXT: vpminsd %xmm1, %xmm0, %xmm0
; AVX512F-NEXT: vpbroadcastd {{.*#+}} xmm1 = [4294967168,4294967168,4294967168,4294967168]
; AVX512F-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
+; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512F-NEXT: kmovw %k0, %eax
; AVX512F-NEXT: testb $1, %al
; AVX512F-NEXT: jne .LBB14_1
; AVX512F-NEXT: testb $2, %al
; AVX512F-NEXT: je .LBB14_4
; AVX512F-NEXT: .LBB14_3: # %cond.store1
-; AVX512F-NEXT: vpextrb $4, %xmm0, 1(%rdi)
+; AVX512F-NEXT: vpextrb $1, %xmm0, 1(%rdi)
; AVX512F-NEXT: testb $4, %al
; AVX512F-NEXT: je .LBB14_6
; AVX512F-NEXT: .LBB14_5: # %cond.store3
-; AVX512F-NEXT: vpextrb $8, %xmm0, 2(%rdi)
+; AVX512F-NEXT: vpextrb $2, %xmm0, 2(%rdi)
; AVX512F-NEXT: testb $8, %al
; AVX512F-NEXT: je .LBB14_8
; AVX512F-NEXT: .LBB14_7: # %cond.store5
-; AVX512F-NEXT: vpextrb $12, %xmm0, 3(%rdi)
+; AVX512F-NEXT: vpextrb $3, %xmm0, 3(%rdi)
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
; AVX512BW-NEXT: vptestmd %zmm1, %zmm1, %k0
+; AVX512BW-NEXT: kshiftlq $60, %k0, %k0
+; AVX512BW-NEXT: kshiftrq $60, %k0, %k1
; AVX512BW-NEXT: vpbroadcastd {{.*#+}} xmm1 = [127,127,127,127]
; AVX512BW-NEXT: vpminsd %xmm1, %xmm0, %xmm0
; AVX512BW-NEXT: vpbroadcastd {{.*#+}} xmm1 = [4294967168,4294967168,4294967168,4294967168]
; AVX512BW-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512BW-NEXT: kshiftlq $60, %k0, %k0
-; AVX512BW-NEXT: kshiftrq $60, %k0, %k1
; AVX512BW-NEXT: vmovdqu8 %zmm0, (%rdi) {%k1}
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
; SSE2-LABEL: truncstore_v8i16_v8i8:
; SSE2: # %bb.0:
; SSE2-NEXT: pxor %xmm2, %xmm2
-; SSE2-NEXT: pminsw {{.*}}(%rip), %xmm0
-; SSE2-NEXT: pmaxsw {{.*}}(%rip), %xmm0
+; SSE2-NEXT: packsswb %xmm0, %xmm0
; SSE2-NEXT: pcmpeqw %xmm1, %xmm2
; SSE2-NEXT: pcmpeqd %xmm1, %xmm1
; SSE2-NEXT: pxor %xmm2, %xmm1
; SSE2-NEXT: jne .LBB17_5
; SSE2-NEXT: .LBB17_6: # %else4
; SSE2-NEXT: testb $8, %al
-; SSE2-NEXT: jne .LBB17_7
+; SSE2-NEXT: je .LBB17_8
+; SSE2-NEXT: .LBB17_7: # %cond.store5
+; SSE2-NEXT: shrl $24, %ecx
+; SSE2-NEXT: movb %cl, 3(%rdi)
; SSE2-NEXT: .LBB17_8: # %else6
; SSE2-NEXT: testb $16, %al
-; SSE2-NEXT: jne .LBB17_9
+; SSE2-NEXT: pextrw $2, %xmm0, %ecx
+; SSE2-NEXT: je .LBB17_10
+; SSE2-NEXT: # %bb.9: # %cond.store7
+; SSE2-NEXT: movb %cl, 4(%rdi)
; SSE2-NEXT: .LBB17_10: # %else8
; SSE2-NEXT: testb $32, %al
-; SSE2-NEXT: jne .LBB17_11
+; SSE2-NEXT: je .LBB17_12
+; SSE2-NEXT: # %bb.11: # %cond.store9
+; SSE2-NEXT: movb %ch, 5(%rdi)
; SSE2-NEXT: .LBB17_12: # %else10
; SSE2-NEXT: testb $64, %al
+; SSE2-NEXT: pextrw $3, %xmm0, %ecx
; SSE2-NEXT: jne .LBB17_13
-; SSE2-NEXT: .LBB17_14: # %else12
+; SSE2-NEXT: # %bb.14: # %else12
; SSE2-NEXT: testb $-128, %al
; SSE2-NEXT: jne .LBB17_15
; SSE2-NEXT: .LBB17_16: # %else14
; SSE2-NEXT: testb $2, %al
; SSE2-NEXT: je .LBB17_4
; SSE2-NEXT: .LBB17_3: # %cond.store1
-; SSE2-NEXT: shrl $16, %ecx
-; SSE2-NEXT: movb %cl, 1(%rdi)
+; SSE2-NEXT: movb %ch, 1(%rdi)
; SSE2-NEXT: testb $4, %al
; SSE2-NEXT: je .LBB17_6
; SSE2-NEXT: .LBB17_5: # %cond.store3
-; SSE2-NEXT: pextrw $2, %xmm0, %ecx
-; SSE2-NEXT: movb %cl, 2(%rdi)
+; SSE2-NEXT: movl %ecx, %edx
+; SSE2-NEXT: shrl $16, %edx
+; SSE2-NEXT: movb %dl, 2(%rdi)
; SSE2-NEXT: testb $8, %al
-; SSE2-NEXT: je .LBB17_8
-; SSE2-NEXT: .LBB17_7: # %cond.store5
-; SSE2-NEXT: pextrw $3, %xmm0, %ecx
-; SSE2-NEXT: movb %cl, 3(%rdi)
-; SSE2-NEXT: testb $16, %al
-; SSE2-NEXT: je .LBB17_10
-; SSE2-NEXT: .LBB17_9: # %cond.store7
-; SSE2-NEXT: pextrw $4, %xmm0, %ecx
-; SSE2-NEXT: movb %cl, 4(%rdi)
-; SSE2-NEXT: testb $32, %al
-; SSE2-NEXT: je .LBB17_12
-; SSE2-NEXT: .LBB17_11: # %cond.store9
-; SSE2-NEXT: pextrw $5, %xmm0, %ecx
-; SSE2-NEXT: movb %cl, 5(%rdi)
-; SSE2-NEXT: testb $64, %al
-; SSE2-NEXT: je .LBB17_14
+; SSE2-NEXT: jne .LBB17_7
+; SSE2-NEXT: jmp .LBB17_8
; SSE2-NEXT: .LBB17_13: # %cond.store11
-; SSE2-NEXT: pextrw $6, %xmm0, %ecx
; SSE2-NEXT: movb %cl, 6(%rdi)
; SSE2-NEXT: testb $-128, %al
; SSE2-NEXT: je .LBB17_16
; SSE2-NEXT: .LBB17_15: # %cond.store13
-; SSE2-NEXT: pextrw $7, %xmm0, %eax
-; SSE2-NEXT: movb %al, 7(%rdi)
+; SSE2-NEXT: movb %ch, 7(%rdi)
; SSE2-NEXT: retq
;
; SSE4-LABEL: truncstore_v8i16_v8i8:
; SSE4: # %bb.0:
; SSE4-NEXT: pxor %xmm2, %xmm2
-; SSE4-NEXT: pminsw {{.*}}(%rip), %xmm0
-; SSE4-NEXT: pmaxsw {{.*}}(%rip), %xmm0
+; SSE4-NEXT: packsswb %xmm0, %xmm0
; SSE4-NEXT: pcmpeqw %xmm1, %xmm2
; SSE4-NEXT: pcmpeqd %xmm1, %xmm1
; SSE4-NEXT: pxor %xmm2, %xmm1
; SSE4-NEXT: testb $2, %al
; SSE4-NEXT: je .LBB17_4
; SSE4-NEXT: .LBB17_3: # %cond.store1
-; SSE4-NEXT: pextrb $2, %xmm0, 1(%rdi)
+; SSE4-NEXT: pextrb $1, %xmm0, 1(%rdi)
; SSE4-NEXT: testb $4, %al
; SSE4-NEXT: je .LBB17_6
; SSE4-NEXT: .LBB17_5: # %cond.store3
-; SSE4-NEXT: pextrb $4, %xmm0, 2(%rdi)
+; SSE4-NEXT: pextrb $2, %xmm0, 2(%rdi)
; SSE4-NEXT: testb $8, %al
; SSE4-NEXT: je .LBB17_8
; SSE4-NEXT: .LBB17_7: # %cond.store5
-; SSE4-NEXT: pextrb $6, %xmm0, 3(%rdi)
+; SSE4-NEXT: pextrb $3, %xmm0, 3(%rdi)
; SSE4-NEXT: testb $16, %al
; SSE4-NEXT: je .LBB17_10
; SSE4-NEXT: .LBB17_9: # %cond.store7
-; SSE4-NEXT: pextrb $8, %xmm0, 4(%rdi)
+; SSE4-NEXT: pextrb $4, %xmm0, 4(%rdi)
; SSE4-NEXT: testb $32, %al
; SSE4-NEXT: je .LBB17_12
; SSE4-NEXT: .LBB17_11: # %cond.store9
-; SSE4-NEXT: pextrb $10, %xmm0, 5(%rdi)
+; SSE4-NEXT: pextrb $5, %xmm0, 5(%rdi)
; SSE4-NEXT: testb $64, %al
; SSE4-NEXT: je .LBB17_14
; SSE4-NEXT: .LBB17_13: # %cond.store11
-; SSE4-NEXT: pextrb $12, %xmm0, 6(%rdi)
+; SSE4-NEXT: pextrb $6, %xmm0, 6(%rdi)
; SSE4-NEXT: testb $-128, %al
; SSE4-NEXT: je .LBB17_16
; SSE4-NEXT: .LBB17_15: # %cond.store13
-; SSE4-NEXT: pextrb $14, %xmm0, 7(%rdi)
+; SSE4-NEXT: pextrb $7, %xmm0, 7(%rdi)
; SSE4-NEXT: retq
;
; AVX-LABEL: truncstore_v8i16_v8i8:
; AVX: # %bb.0:
; AVX-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX-NEXT: vpminsw {{.*}}(%rip), %xmm0, %xmm0
-; AVX-NEXT: vpmaxsw {{.*}}(%rip), %xmm0, %xmm0
+; AVX-NEXT: vpacksswb %xmm0, %xmm0, %xmm0
; AVX-NEXT: vpcmpeqw %xmm2, %xmm1, %xmm1
; AVX-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
; AVX-NEXT: vpxor %xmm2, %xmm1, %xmm1
; AVX-NEXT: testb $2, %al
; AVX-NEXT: je .LBB17_4
; AVX-NEXT: .LBB17_3: # %cond.store1
-; AVX-NEXT: vpextrb $2, %xmm0, 1(%rdi)
+; AVX-NEXT: vpextrb $1, %xmm0, 1(%rdi)
; AVX-NEXT: testb $4, %al
; AVX-NEXT: je .LBB17_6
; AVX-NEXT: .LBB17_5: # %cond.store3
-; AVX-NEXT: vpextrb $4, %xmm0, 2(%rdi)
+; AVX-NEXT: vpextrb $2, %xmm0, 2(%rdi)
; AVX-NEXT: testb $8, %al
; AVX-NEXT: je .LBB17_8
; AVX-NEXT: .LBB17_7: # %cond.store5
-; AVX-NEXT: vpextrb $6, %xmm0, 3(%rdi)
+; AVX-NEXT: vpextrb $3, %xmm0, 3(%rdi)
; AVX-NEXT: testb $16, %al
; AVX-NEXT: je .LBB17_10
; AVX-NEXT: .LBB17_9: # %cond.store7
-; AVX-NEXT: vpextrb $8, %xmm0, 4(%rdi)
+; AVX-NEXT: vpextrb $4, %xmm0, 4(%rdi)
; AVX-NEXT: testb $32, %al
; AVX-NEXT: je .LBB17_12
; AVX-NEXT: .LBB17_11: # %cond.store9
-; AVX-NEXT: vpextrb $10, %xmm0, 5(%rdi)
+; AVX-NEXT: vpextrb $5, %xmm0, 5(%rdi)
; AVX-NEXT: testb $64, %al
; AVX-NEXT: je .LBB17_14
; AVX-NEXT: .LBB17_13: # %cond.store11
-; AVX-NEXT: vpextrb $12, %xmm0, 6(%rdi)
+; AVX-NEXT: vpextrb $6, %xmm0, 6(%rdi)
; AVX-NEXT: testb $-128, %al
; AVX-NEXT: je .LBB17_16
; AVX-NEXT: .LBB17_15: # %cond.store13
-; AVX-NEXT: vpextrb $14, %xmm0, 7(%rdi)
+; AVX-NEXT: vpextrb $7, %xmm0, 7(%rdi)
; AVX-NEXT: retq
;
; AVX512F-LABEL: truncstore_v8i16_v8i8:
; AVX512F-NEXT: vptestmq %zmm1, %zmm1, %k0
; AVX512F-NEXT: vpminsw {{.*}}(%rip), %xmm0, %xmm0
; AVX512F-NEXT: vpmaxsw {{.*}}(%rip), %xmm0, %xmm0
+; AVX512F-NEXT: vpacksswb %xmm0, %xmm0, %xmm0
; AVX512F-NEXT: kmovw %k0, %eax
; AVX512F-NEXT: testb $1, %al
; AVX512F-NEXT: jne .LBB17_1
; AVX512F-NEXT: testb $2, %al
; AVX512F-NEXT: je .LBB17_4
; AVX512F-NEXT: .LBB17_3: # %cond.store1
-; AVX512F-NEXT: vpextrb $2, %xmm0, 1(%rdi)
+; AVX512F-NEXT: vpextrb $1, %xmm0, 1(%rdi)
; AVX512F-NEXT: testb $4, %al
; AVX512F-NEXT: je .LBB17_6
; AVX512F-NEXT: .LBB17_5: # %cond.store3
-; AVX512F-NEXT: vpextrb $4, %xmm0, 2(%rdi)
+; AVX512F-NEXT: vpextrb $2, %xmm0, 2(%rdi)
; AVX512F-NEXT: testb $8, %al
; AVX512F-NEXT: je .LBB17_8
; AVX512F-NEXT: .LBB17_7: # %cond.store5
-; AVX512F-NEXT: vpextrb $6, %xmm0, 3(%rdi)
+; AVX512F-NEXT: vpextrb $3, %xmm0, 3(%rdi)
; AVX512F-NEXT: testb $16, %al
; AVX512F-NEXT: je .LBB17_10
; AVX512F-NEXT: .LBB17_9: # %cond.store7
-; AVX512F-NEXT: vpextrb $8, %xmm0, 4(%rdi)
+; AVX512F-NEXT: vpextrb $4, %xmm0, 4(%rdi)
; AVX512F-NEXT: testb $32, %al
; AVX512F-NEXT: je .LBB17_12
; AVX512F-NEXT: .LBB17_11: # %cond.store9
-; AVX512F-NEXT: vpextrb $10, %xmm0, 5(%rdi)
+; AVX512F-NEXT: vpextrb $5, %xmm0, 5(%rdi)
; AVX512F-NEXT: testb $64, %al
; AVX512F-NEXT: je .LBB17_14
; AVX512F-NEXT: .LBB17_13: # %cond.store11
-; AVX512F-NEXT: vpextrb $12, %xmm0, 6(%rdi)
+; AVX512F-NEXT: vpextrb $6, %xmm0, 6(%rdi)
; AVX512F-NEXT: testb $-128, %al
; AVX512F-NEXT: je .LBB17_16
; AVX512F-NEXT: .LBB17_15: # %cond.store13
-; AVX512F-NEXT: vpextrb $14, %xmm0, 7(%rdi)
+; AVX512F-NEXT: vpextrb $7, %xmm0, 7(%rdi)
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
; AVX512BW-NEXT: vptestmw %zmm1, %zmm1, %k0
+; AVX512BW-NEXT: kshiftlq $56, %k0, %k0
+; AVX512BW-NEXT: kshiftrq $56, %k0, %k1
; AVX512BW-NEXT: vpminsw {{.*}}(%rip), %xmm0, %xmm0
; AVX512BW-NEXT: vpmaxsw {{.*}}(%rip), %xmm0, %xmm0
; AVX512BW-NEXT: vpacksswb %xmm0, %xmm0, %xmm0
-; AVX512BW-NEXT: kshiftlq $56, %k0, %k0
-; AVX512BW-NEXT: kshiftrq $56, %k0, %k1
; AVX512BW-NEXT: vmovdqu8 %zmm0, (%rdi) {%k1}
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
; SSE2-NEXT: por %xmm2, %xmm0
; SSE2-NEXT: packuswb %xmm1, %xmm0
; SSE2-NEXT: packuswb %xmm0, %xmm7
+; SSE2-NEXT: packuswb %xmm7, %xmm7
; SSE2-NEXT: pcmpeqd %xmm8, %xmm5
; SSE2-NEXT: pcmpeqd %xmm0, %xmm0
; SSE2-NEXT: pxor %xmm0, %xmm5
; SSE2-NEXT: jne .LBB2_5
; SSE2-NEXT: .LBB2_6: # %else4
; SSE2-NEXT: testb $8, %al
-; SSE2-NEXT: jne .LBB2_7
+; SSE2-NEXT: je .LBB2_8
+; SSE2-NEXT: .LBB2_7: # %cond.store5
+; SSE2-NEXT: shrl $24, %ecx
+; SSE2-NEXT: movb %cl, 3(%rdi)
; SSE2-NEXT: .LBB2_8: # %else6
; SSE2-NEXT: testb $16, %al
-; SSE2-NEXT: jne .LBB2_9
+; SSE2-NEXT: pextrw $2, %xmm7, %ecx
+; SSE2-NEXT: je .LBB2_10
+; SSE2-NEXT: # %bb.9: # %cond.store7
+; SSE2-NEXT: movb %cl, 4(%rdi)
; SSE2-NEXT: .LBB2_10: # %else8
; SSE2-NEXT: testb $32, %al
-; SSE2-NEXT: jne .LBB2_11
+; SSE2-NEXT: je .LBB2_12
+; SSE2-NEXT: # %bb.11: # %cond.store9
+; SSE2-NEXT: movb %ch, 5(%rdi)
; SSE2-NEXT: .LBB2_12: # %else10
; SSE2-NEXT: testb $64, %al
+; SSE2-NEXT: pextrw $3, %xmm7, %ecx
; SSE2-NEXT: jne .LBB2_13
-; SSE2-NEXT: .LBB2_14: # %else12
+; SSE2-NEXT: # %bb.14: # %else12
; SSE2-NEXT: testb $-128, %al
; SSE2-NEXT: jne .LBB2_15
; SSE2-NEXT: .LBB2_16: # %else14
; SSE2-NEXT: testb $2, %al
; SSE2-NEXT: je .LBB2_4
; SSE2-NEXT: .LBB2_3: # %cond.store1
-; SSE2-NEXT: shrl $16, %ecx
-; SSE2-NEXT: movb %cl, 1(%rdi)
+; SSE2-NEXT: movb %ch, 1(%rdi)
; SSE2-NEXT: testb $4, %al
; SSE2-NEXT: je .LBB2_6
; SSE2-NEXT: .LBB2_5: # %cond.store3
-; SSE2-NEXT: pextrw $2, %xmm7, %ecx
-; SSE2-NEXT: movb %cl, 2(%rdi)
+; SSE2-NEXT: movl %ecx, %edx
+; SSE2-NEXT: shrl $16, %edx
+; SSE2-NEXT: movb %dl, 2(%rdi)
; SSE2-NEXT: testb $8, %al
-; SSE2-NEXT: je .LBB2_8
-; SSE2-NEXT: .LBB2_7: # %cond.store5
-; SSE2-NEXT: pextrw $3, %xmm7, %ecx
-; SSE2-NEXT: movb %cl, 3(%rdi)
-; SSE2-NEXT: testb $16, %al
-; SSE2-NEXT: je .LBB2_10
-; SSE2-NEXT: .LBB2_9: # %cond.store7
-; SSE2-NEXT: pextrw $4, %xmm7, %ecx
-; SSE2-NEXT: movb %cl, 4(%rdi)
-; SSE2-NEXT: testb $32, %al
-; SSE2-NEXT: je .LBB2_12
-; SSE2-NEXT: .LBB2_11: # %cond.store9
-; SSE2-NEXT: pextrw $5, %xmm7, %ecx
-; SSE2-NEXT: movb %cl, 5(%rdi)
-; SSE2-NEXT: testb $64, %al
-; SSE2-NEXT: je .LBB2_14
+; SSE2-NEXT: jne .LBB2_7
+; SSE2-NEXT: jmp .LBB2_8
; SSE2-NEXT: .LBB2_13: # %cond.store11
-; SSE2-NEXT: pextrw $6, %xmm7, %ecx
; SSE2-NEXT: movb %cl, 6(%rdi)
; SSE2-NEXT: testb $-128, %al
; SSE2-NEXT: je .LBB2_16
; SSE2-NEXT: .LBB2_15: # %cond.store13
-; SSE2-NEXT: pextrw $7, %xmm7, %eax
-; SSE2-NEXT: movb %al, 7(%rdi)
+; SSE2-NEXT: movb %ch, 7(%rdi)
; SSE2-NEXT: retq
;
; SSE4-LABEL: truncstore_v8i64_v8i8:
; SSE4-NEXT: blendvpd %xmm0, %xmm2, %xmm6
; SSE4-NEXT: packusdw %xmm7, %xmm6
; SSE4-NEXT: packusdw %xmm6, %xmm1
+; SSE4-NEXT: packuswb %xmm1, %xmm1
; SSE4-NEXT: pcmpeqd %xmm8, %xmm5
; SSE4-NEXT: pcmpeqd %xmm0, %xmm0
; SSE4-NEXT: pxor %xmm0, %xmm5
; SSE4-NEXT: testb $2, %al
; SSE4-NEXT: je .LBB2_4
; SSE4-NEXT: .LBB2_3: # %cond.store1
-; SSE4-NEXT: pextrb $2, %xmm1, 1(%rdi)
+; SSE4-NEXT: pextrb $1, %xmm1, 1(%rdi)
; SSE4-NEXT: testb $4, %al
; SSE4-NEXT: je .LBB2_6
; SSE4-NEXT: .LBB2_5: # %cond.store3
-; SSE4-NEXT: pextrb $4, %xmm1, 2(%rdi)
+; SSE4-NEXT: pextrb $2, %xmm1, 2(%rdi)
; SSE4-NEXT: testb $8, %al
; SSE4-NEXT: je .LBB2_8
; SSE4-NEXT: .LBB2_7: # %cond.store5
-; SSE4-NEXT: pextrb $6, %xmm1, 3(%rdi)
+; SSE4-NEXT: pextrb $3, %xmm1, 3(%rdi)
; SSE4-NEXT: testb $16, %al
; SSE4-NEXT: je .LBB2_10
; SSE4-NEXT: .LBB2_9: # %cond.store7
-; SSE4-NEXT: pextrb $8, %xmm1, 4(%rdi)
+; SSE4-NEXT: pextrb $4, %xmm1, 4(%rdi)
; SSE4-NEXT: testb $32, %al
; SSE4-NEXT: je .LBB2_12
; SSE4-NEXT: .LBB2_11: # %cond.store9
-; SSE4-NEXT: pextrb $10, %xmm1, 5(%rdi)
+; SSE4-NEXT: pextrb $5, %xmm1, 5(%rdi)
; SSE4-NEXT: testb $64, %al
; SSE4-NEXT: je .LBB2_14
; SSE4-NEXT: .LBB2_13: # %cond.store11
-; SSE4-NEXT: pextrb $12, %xmm1, 6(%rdi)
+; SSE4-NEXT: pextrb $6, %xmm1, 6(%rdi)
; SSE4-NEXT: testb $-128, %al
; SSE4-NEXT: je .LBB2_16
; SSE4-NEXT: .LBB2_15: # %cond.store13
-; SSE4-NEXT: pextrb $14, %xmm1, 7(%rdi)
+; SSE4-NEXT: pextrb $7, %xmm1, 7(%rdi)
; SSE4-NEXT: retq
;
; AVX1-LABEL: truncstore_v8i64_v8i8:
; AVX1-NEXT: vblendvpd %xmm8, %xmm0, %xmm5, %xmm0
; AVX1-NEXT: vpackusdw %xmm3, %xmm0, %xmm0
; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpackuswb %xmm0, %xmm0, %xmm0
; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm1
; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
; AVX1-NEXT: vpcmpeqd %xmm3, %xmm1, %xmm1
; AVX1-NEXT: testb $2, %al
; AVX1-NEXT: je .LBB2_4
; AVX1-NEXT: .LBB2_3: # %cond.store1
-; AVX1-NEXT: vpextrb $2, %xmm0, 1(%rdi)
+; AVX1-NEXT: vpextrb $1, %xmm0, 1(%rdi)
; AVX1-NEXT: testb $4, %al
; AVX1-NEXT: je .LBB2_6
; AVX1-NEXT: .LBB2_5: # %cond.store3
-; AVX1-NEXT: vpextrb $4, %xmm0, 2(%rdi)
+; AVX1-NEXT: vpextrb $2, %xmm0, 2(%rdi)
; AVX1-NEXT: testb $8, %al
; AVX1-NEXT: je .LBB2_8
; AVX1-NEXT: .LBB2_7: # %cond.store5
-; AVX1-NEXT: vpextrb $6, %xmm0, 3(%rdi)
+; AVX1-NEXT: vpextrb $3, %xmm0, 3(%rdi)
; AVX1-NEXT: testb $16, %al
; AVX1-NEXT: je .LBB2_10
; AVX1-NEXT: .LBB2_9: # %cond.store7
-; AVX1-NEXT: vpextrb $8, %xmm0, 4(%rdi)
+; AVX1-NEXT: vpextrb $4, %xmm0, 4(%rdi)
; AVX1-NEXT: testb $32, %al
; AVX1-NEXT: je .LBB2_12
; AVX1-NEXT: .LBB2_11: # %cond.store9
-; AVX1-NEXT: vpextrb $10, %xmm0, 5(%rdi)
+; AVX1-NEXT: vpextrb $5, %xmm0, 5(%rdi)
; AVX1-NEXT: testb $64, %al
; AVX1-NEXT: je .LBB2_14
; AVX1-NEXT: .LBB2_13: # %cond.store11
-; AVX1-NEXT: vpextrb $12, %xmm0, 6(%rdi)
+; AVX1-NEXT: vpextrb $6, %xmm0, 6(%rdi)
; AVX1-NEXT: testb $-128, %al
; AVX1-NEXT: je .LBB2_16
; AVX1-NEXT: .LBB2_15: # %cond.store13
-; AVX1-NEXT: vpextrb $14, %xmm0, 7(%rdi)
+; AVX1-NEXT: vpextrb $7, %xmm0, 7(%rdi)
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3
; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm4 = [255,255,255,255]
; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm5 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808]
-; AVX2-NEXT: vpxor %ymm5, %ymm1, %ymm6
+; AVX2-NEXT: vpxor %ymm5, %ymm0, %ymm6
; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm7 = [9223372036854776063,9223372036854776063,9223372036854776063,9223372036854776063]
; AVX2-NEXT: vpcmpgtq %ymm6, %ymm7, %ymm6
-; AVX2-NEXT: vblendvpd %ymm6, %ymm1, %ymm4, %ymm1
-; AVX2-NEXT: vpxor %ymm5, %ymm0, %ymm5
+; AVX2-NEXT: vblendvpd %ymm6, %ymm0, %ymm4, %ymm0
+; AVX2-NEXT: vpxor %ymm5, %ymm1, %ymm5
; AVX2-NEXT: vpcmpgtq %ymm5, %ymm7, %ymm5
-; AVX2-NEXT: vblendvpd %ymm5, %ymm0, %ymm4, %ymm0
-; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0
-; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vblendvpd %ymm5, %ymm1, %ymm4, %ymm1
+; AVX2-NEXT: vextractf128 $1, %ymm1, %xmm4
+; AVX2-NEXT: vmovdqa {{.*#+}} xmm5 = <u,u,0,8,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-NEXT: vpshufb %xmm5, %xmm4, %xmm4
+; AVX2-NEXT: vpshufb %xmm5, %xmm1, %xmm1
+; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3]
+; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm4
+; AVX2-NEXT: vmovdqa {{.*#+}} xmm5 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-NEXT: vpshufb %xmm5, %xmm4, %xmm4
+; AVX2-NEXT: vpshufb %xmm5, %xmm0, %xmm0
+; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3]
+; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
; AVX2-NEXT: vpcmpeqd %ymm3, %ymm2, %ymm1
; AVX2-NEXT: vmovmskps %ymm1, %eax
; AVX2-NEXT: notl %eax
; AVX2-NEXT: testb $2, %al
; AVX2-NEXT: je .LBB2_4
; AVX2-NEXT: .LBB2_3: # %cond.store1
-; AVX2-NEXT: vpextrb $2, %xmm0, 1(%rdi)
+; AVX2-NEXT: vpextrb $1, %xmm0, 1(%rdi)
; AVX2-NEXT: testb $4, %al
; AVX2-NEXT: je .LBB2_6
; AVX2-NEXT: .LBB2_5: # %cond.store3
-; AVX2-NEXT: vpextrb $4, %xmm0, 2(%rdi)
+; AVX2-NEXT: vpextrb $2, %xmm0, 2(%rdi)
; AVX2-NEXT: testb $8, %al
; AVX2-NEXT: je .LBB2_8
; AVX2-NEXT: .LBB2_7: # %cond.store5
-; AVX2-NEXT: vpextrb $6, %xmm0, 3(%rdi)
+; AVX2-NEXT: vpextrb $3, %xmm0, 3(%rdi)
; AVX2-NEXT: testb $16, %al
; AVX2-NEXT: je .LBB2_10
; AVX2-NEXT: .LBB2_9: # %cond.store7
-; AVX2-NEXT: vpextrb $8, %xmm0, 4(%rdi)
+; AVX2-NEXT: vpextrb $4, %xmm0, 4(%rdi)
; AVX2-NEXT: testb $32, %al
; AVX2-NEXT: je .LBB2_12
; AVX2-NEXT: .LBB2_11: # %cond.store9
-; AVX2-NEXT: vpextrb $10, %xmm0, 5(%rdi)
+; AVX2-NEXT: vpextrb $5, %xmm0, 5(%rdi)
; AVX2-NEXT: testb $64, %al
; AVX2-NEXT: je .LBB2_14
; AVX2-NEXT: .LBB2_13: # %cond.store11
-; AVX2-NEXT: vpextrb $12, %xmm0, 6(%rdi)
+; AVX2-NEXT: vpextrb $6, %xmm0, 6(%rdi)
; AVX2-NEXT: testb $-128, %al
; AVX2-NEXT: je .LBB2_16
; AVX2-NEXT: .LBB2_15: # %cond.store13
-; AVX2-NEXT: vpextrb $14, %xmm0, 7(%rdi)
+; AVX2-NEXT: vpextrb $7, %xmm0, 7(%rdi)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512F-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1
; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k0
; AVX512F-NEXT: vpminuq {{.*}}(%rip){1to8}, %zmm0, %zmm0
-; AVX512F-NEXT: vpmovqw %zmm0, %xmm0
+; AVX512F-NEXT: vpmovqb %zmm0, %xmm0
; AVX512F-NEXT: kmovw %k0, %eax
; AVX512F-NEXT: testb $1, %al
; AVX512F-NEXT: jne .LBB2_1
; AVX512F-NEXT: testb $2, %al
; AVX512F-NEXT: je .LBB2_4
; AVX512F-NEXT: .LBB2_3: # %cond.store1
-; AVX512F-NEXT: vpextrb $2, %xmm0, 1(%rdi)
+; AVX512F-NEXT: vpextrb $1, %xmm0, 1(%rdi)
; AVX512F-NEXT: testb $4, %al
; AVX512F-NEXT: je .LBB2_6
; AVX512F-NEXT: .LBB2_5: # %cond.store3
-; AVX512F-NEXT: vpextrb $4, %xmm0, 2(%rdi)
+; AVX512F-NEXT: vpextrb $2, %xmm0, 2(%rdi)
; AVX512F-NEXT: testb $8, %al
; AVX512F-NEXT: je .LBB2_8
; AVX512F-NEXT: .LBB2_7: # %cond.store5
-; AVX512F-NEXT: vpextrb $6, %xmm0, 3(%rdi)
+; AVX512F-NEXT: vpextrb $3, %xmm0, 3(%rdi)
; AVX512F-NEXT: testb $16, %al
; AVX512F-NEXT: je .LBB2_10
; AVX512F-NEXT: .LBB2_9: # %cond.store7
-; AVX512F-NEXT: vpextrb $8, %xmm0, 4(%rdi)
+; AVX512F-NEXT: vpextrb $4, %xmm0, 4(%rdi)
; AVX512F-NEXT: testb $32, %al
; AVX512F-NEXT: je .LBB2_12
; AVX512F-NEXT: .LBB2_11: # %cond.store9
-; AVX512F-NEXT: vpextrb $10, %xmm0, 5(%rdi)
+; AVX512F-NEXT: vpextrb $5, %xmm0, 5(%rdi)
; AVX512F-NEXT: testb $64, %al
; AVX512F-NEXT: je .LBB2_14
; AVX512F-NEXT: .LBB2_13: # %cond.store11
-; AVX512F-NEXT: vpextrb $12, %xmm0, 6(%rdi)
+; AVX512F-NEXT: vpextrb $6, %xmm0, 6(%rdi)
; AVX512F-NEXT: testb $-128, %al
; AVX512F-NEXT: je .LBB2_16
; AVX512F-NEXT: .LBB2_15: # %cond.store13
-; AVX512F-NEXT: vpextrb $14, %xmm0, 7(%rdi)
+; AVX512F-NEXT: vpextrb $7, %xmm0, 7(%rdi)
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; SSE2-NEXT: pxor %xmm3, %xmm3
; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [65535,65535]
; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [9223372039002259456,9223372039002259456]
-; SSE2-NEXT: movdqa %xmm1, %xmm6
+; SSE2-NEXT: movdqa %xmm0, %xmm6
; SSE2-NEXT: pxor %xmm5, %xmm6
; SSE2-NEXT: movdqa {{.*#+}} xmm9 = [9223372039002324991,9223372039002324991]
; SSE2-NEXT: movdqa %xmm9, %xmm7
; SSE2-NEXT: pand %xmm4, %xmm6
; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm7[1,1,3,3]
; SSE2-NEXT: por %xmm6, %xmm4
-; SSE2-NEXT: pand %xmm4, %xmm1
+; SSE2-NEXT: pand %xmm4, %xmm0
; SSE2-NEXT: pandn %xmm8, %xmm4
-; SSE2-NEXT: por %xmm1, %xmm4
-; SSE2-NEXT: pxor %xmm0, %xmm5
-; SSE2-NEXT: movdqa %xmm9, %xmm1
-; SSE2-NEXT: pcmpgtd %xmm5, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm1[0,0,2,2]
+; SSE2-NEXT: por %xmm0, %xmm4
+; SSE2-NEXT: pxor %xmm1, %xmm5
+; SSE2-NEXT: movdqa %xmm9, %xmm0
+; SSE2-NEXT: pcmpgtd %xmm5, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm0[0,0,2,2]
; SSE2-NEXT: pcmpeqd %xmm9, %xmm5
; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
; SSE2-NEXT: pand %xmm6, %xmm5
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSE2-NEXT: por %xmm5, %xmm1
-; SSE2-NEXT: pand %xmm1, %xmm0
-; SSE2-NEXT: pandn %xmm8, %xmm1
-; SSE2-NEXT: por %xmm0, %xmm1
-; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm4[0,2]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE2-NEXT: por %xmm5, %xmm0
+; SSE2-NEXT: pand %xmm0, %xmm1
+; SSE2-NEXT: pandn %xmm8, %xmm0
+; SSE2-NEXT: por %xmm1, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[0,2,2,3,4,5,6,7]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,2,2,3]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
+; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE2-NEXT: pcmpeqd %xmm2, %xmm3
; SSE2-NEXT: movmskps %xmm3, %eax
; SSE2-NEXT: xorl $15, %eax
; SSE2-NEXT: .LBB4_8: # %else6
; SSE2-NEXT: retq
; SSE2-NEXT: .LBB4_1: # %cond.store
-; SSE2-NEXT: movd %xmm1, %ecx
+; SSE2-NEXT: movd %xmm0, %ecx
; SSE2-NEXT: movw %cx, (%rdi)
; SSE2-NEXT: testb $2, %al
; SSE2-NEXT: je .LBB4_4
; SSE2-NEXT: .LBB4_3: # %cond.store1
-; SSE2-NEXT: pextrw $2, %xmm1, %ecx
+; SSE2-NEXT: pextrw $1, %xmm0, %ecx
; SSE2-NEXT: movw %cx, 2(%rdi)
; SSE2-NEXT: testb $4, %al
; SSE2-NEXT: je .LBB4_6
; SSE2-NEXT: .LBB4_5: # %cond.store3
-; SSE2-NEXT: pextrw $4, %xmm1, %ecx
+; SSE2-NEXT: pextrw $2, %xmm0, %ecx
; SSE2-NEXT: movw %cx, 4(%rdi)
; SSE2-NEXT: testb $8, %al
; SSE2-NEXT: je .LBB4_8
; SSE2-NEXT: .LBB4_7: # %cond.store5
-; SSE2-NEXT: pextrw $6, %xmm1, %eax
+; SSE2-NEXT: pextrw $3, %xmm0, %eax
; SSE2-NEXT: movw %ax, 6(%rdi)
; SSE2-NEXT: retq
;
; SSE4-LABEL: truncstore_v4i64_v4i16:
; SSE4: # %bb.0:
-; SSE4-NEXT: movdqa %xmm0, %xmm8
-; SSE4-NEXT: pxor %xmm6, %xmm6
-; SSE4-NEXT: movapd {{.*#+}} xmm5 = [65535,65535]
+; SSE4-NEXT: movdqa %xmm0, %xmm5
+; SSE4-NEXT: pxor %xmm8, %xmm8
+; SSE4-NEXT: movapd {{.*#+}} xmm6 = [65535,65535]
; SSE4-NEXT: movdqa {{.*#+}} xmm7 = [9223372036854775808,9223372036854775808]
-; SSE4-NEXT: movdqa %xmm1, %xmm3
+; SSE4-NEXT: movdqa %xmm0, %xmm3
; SSE4-NEXT: pxor %xmm7, %xmm3
; SSE4-NEXT: movdqa {{.*#+}} xmm4 = [9223372036854841343,9223372036854841343]
; SSE4-NEXT: movdqa %xmm4, %xmm0
; SSE4-NEXT: pcmpgtq %xmm3, %xmm0
-; SSE4-NEXT: movapd %xmm5, %xmm3
-; SSE4-NEXT: blendvpd %xmm0, %xmm1, %xmm3
-; SSE4-NEXT: pxor %xmm8, %xmm7
+; SSE4-NEXT: movapd %xmm6, %xmm3
+; SSE4-NEXT: blendvpd %xmm0, %xmm5, %xmm3
+; SSE4-NEXT: pxor %xmm1, %xmm7
; SSE4-NEXT: pcmpgtq %xmm7, %xmm4
; SSE4-NEXT: movdqa %xmm4, %xmm0
-; SSE4-NEXT: blendvpd %xmm0, %xmm8, %xmm5
-; SSE4-NEXT: packusdw %xmm3, %xmm5
-; SSE4-NEXT: pcmpeqd %xmm2, %xmm6
-; SSE4-NEXT: movmskps %xmm6, %eax
+; SSE4-NEXT: blendvpd %xmm0, %xmm1, %xmm6
+; SSE4-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,2,2,3]
+; SSE4-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[0,2,2,3,4,5,6,7]
+; SSE4-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3]
+; SSE4-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
+; SSE4-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE4-NEXT: pcmpeqd %xmm2, %xmm8
+; SSE4-NEXT: movmskps %xmm8, %eax
; SSE4-NEXT: xorl $15, %eax
; SSE4-NEXT: testb $1, %al
; SSE4-NEXT: jne .LBB4_1
; SSE4-NEXT: .LBB4_8: # %else6
; SSE4-NEXT: retq
; SSE4-NEXT: .LBB4_1: # %cond.store
-; SSE4-NEXT: pextrw $0, %xmm5, (%rdi)
+; SSE4-NEXT: pextrw $0, %xmm0, (%rdi)
; SSE4-NEXT: testb $2, %al
; SSE4-NEXT: je .LBB4_4
; SSE4-NEXT: .LBB4_3: # %cond.store1
-; SSE4-NEXT: pextrw $2, %xmm5, 2(%rdi)
+; SSE4-NEXT: pextrw $1, %xmm0, 2(%rdi)
; SSE4-NEXT: testb $4, %al
; SSE4-NEXT: je .LBB4_6
; SSE4-NEXT: .LBB4_5: # %cond.store3
-; SSE4-NEXT: pextrw $4, %xmm5, 4(%rdi)
+; SSE4-NEXT: pextrw $2, %xmm0, 4(%rdi)
; SSE4-NEXT: testb $8, %al
; SSE4-NEXT: je .LBB4_8
; SSE4-NEXT: .LBB4_7: # %cond.store5
-; SSE4-NEXT: pextrw $6, %xmm5, 6(%rdi)
+; SSE4-NEXT: pextrw $3, %xmm0, 6(%rdi)
; SSE4-NEXT: retq
;
; AVX1-LABEL: truncstore_v4i64_v4i16:
; AVX1-NEXT: vpcmpgtq %xmm3, %xmm5, %xmm3
; AVX1-NEXT: vmovapd {{.*#+}} xmm5 = [65535,65535]
; AVX1-NEXT: vblendvpd %xmm3, %xmm6, %xmm5, %xmm3
+; AVX1-NEXT: vpermilps {{.*#+}} xmm3 = xmm3[0,2,2,3]
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,2,2,3,4,5,6,7]
; AVX1-NEXT: vblendvpd %xmm4, %xmm0, %xmm5, %xmm0
-; AVX1-NEXT: vpackusdw %xmm3, %xmm0, %xmm0
+; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
+; AVX1-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
; AVX1-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vmovmskps %xmm1, %eax
; AVX1-NEXT: xorl $15, %eax
; AVX1-NEXT: testb $2, %al
; AVX1-NEXT: je .LBB4_4
; AVX1-NEXT: .LBB4_3: # %cond.store1
-; AVX1-NEXT: vpextrw $2, %xmm0, 2(%rdi)
+; AVX1-NEXT: vpextrw $1, %xmm0, 2(%rdi)
; AVX1-NEXT: testb $4, %al
; AVX1-NEXT: je .LBB4_6
; AVX1-NEXT: .LBB4_5: # %cond.store3
-; AVX1-NEXT: vpextrw $4, %xmm0, 4(%rdi)
+; AVX1-NEXT: vpextrw $2, %xmm0, 4(%rdi)
; AVX1-NEXT: testb $8, %al
; AVX1-NEXT: je .LBB4_8
; AVX1-NEXT: .LBB4_7: # %cond.store5
-; AVX1-NEXT: vpextrw $6, %xmm0, 6(%rdi)
+; AVX1-NEXT: vpextrw $3, %xmm0, 6(%rdi)
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-NEXT: vpcmpgtq %ymm4, %ymm5, %ymm4
; AVX2-NEXT: vblendvpd %ymm4, %ymm0, %ymm3, %ymm0
; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm3
-; AVX2-NEXT: vpackusdw %xmm3, %xmm0, %xmm0
+; AVX2-NEXT: vpermilps {{.*#+}} xmm3 = xmm3[0,2,2,3]
+; AVX2-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,2,2,3,4,5,6,7]
+; AVX2-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; AVX2-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
+; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
; AVX2-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm1
; AVX2-NEXT: vmovmskps %xmm1, %eax
; AVX2-NEXT: xorl $15, %eax
; AVX2-NEXT: testb $2, %al
; AVX2-NEXT: je .LBB4_4
; AVX2-NEXT: .LBB4_3: # %cond.store1
-; AVX2-NEXT: vpextrw $2, %xmm0, 2(%rdi)
+; AVX2-NEXT: vpextrw $1, %xmm0, 2(%rdi)
; AVX2-NEXT: testb $4, %al
; AVX2-NEXT: je .LBB4_6
; AVX2-NEXT: .LBB4_5: # %cond.store3
-; AVX2-NEXT: vpextrw $4, %xmm0, 4(%rdi)
+; AVX2-NEXT: vpextrw $2, %xmm0, 4(%rdi)
; AVX2-NEXT: testb $8, %al
; AVX2-NEXT: je .LBB4_8
; AVX2-NEXT: .LBB4_7: # %cond.store5
-; AVX2-NEXT: vpextrw $6, %xmm0, 6(%rdi)
+; AVX2-NEXT: vpextrw $3, %xmm0, 6(%rdi)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k0
; AVX512F-NEXT: vpbroadcastq {{.*#+}} ymm1 = [65535,65535,65535,65535]
; AVX512F-NEXT: vpminuq %zmm1, %zmm0, %zmm0
-; AVX512F-NEXT: vpmovqd %zmm0, %ymm0
+; AVX512F-NEXT: vpmovqw %zmm0, %xmm0
; AVX512F-NEXT: kmovw %k0, %eax
; AVX512F-NEXT: testb $1, %al
; AVX512F-NEXT: jne .LBB4_1
; AVX512F-NEXT: testb $2, %al
; AVX512F-NEXT: je .LBB4_4
; AVX512F-NEXT: .LBB4_3: # %cond.store1
-; AVX512F-NEXT: vpextrw $2, %xmm0, 2(%rdi)
+; AVX512F-NEXT: vpextrw $1, %xmm0, 2(%rdi)
; AVX512F-NEXT: testb $4, %al
; AVX512F-NEXT: je .LBB4_6
; AVX512F-NEXT: .LBB4_5: # %cond.store3
-; AVX512F-NEXT: vpextrw $4, %xmm0, 4(%rdi)
+; AVX512F-NEXT: vpextrw $2, %xmm0, 4(%rdi)
; AVX512F-NEXT: testb $8, %al
; AVX512F-NEXT: je .LBB4_8
; AVX512F-NEXT: .LBB4_7: # %cond.store5
-; AVX512F-NEXT: vpextrw $6, %xmm0, 6(%rdi)
+; AVX512F-NEXT: vpextrw $3, %xmm0, 6(%rdi)
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512BW-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512BW-NEXT: vptestmd %zmm1, %zmm1, %k0
-; AVX512BW-NEXT: vpbroadcastq {{.*#+}} ymm1 = [65535,65535,65535,65535]
-; AVX512BW-NEXT: vpminuq %zmm1, %zmm0, %zmm0
-; AVX512BW-NEXT: vpmovqd %zmm0, %ymm0
-; AVX512BW-NEXT: vpackusdw %xmm0, %xmm0, %xmm0
; AVX512BW-NEXT: kshiftld $28, %k0, %k0
; AVX512BW-NEXT: kshiftrd $28, %k0, %k1
+; AVX512BW-NEXT: vpbroadcastq {{.*#+}} ymm1 = [65535,65535,65535,65535]
+; AVX512BW-NEXT: vpminuq %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT: vpmovqw %zmm0, %xmm0
; AVX512BW-NEXT: vmovdqu16 %zmm0, (%rdi) {%k1}
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
define void @truncstore_v4i64_v4i8(<4 x i64> %x, <4 x i8>* %p, <4 x i32> %mask) {
; SSE2-LABEL: truncstore_v4i64_v4i8:
; SSE2: # %bb.0:
-; SSE2-NEXT: pxor %xmm3, %xmm3
+; SSE2-NEXT: pxor %xmm9, %xmm9
; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [255,255]
-; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [9223372039002259456,9223372039002259456]
-; SSE2-NEXT: movdqa %xmm1, %xmm6
-; SSE2-NEXT: pxor %xmm5, %xmm6
-; SSE2-NEXT: movdqa {{.*#+}} xmm9 = [9223372039002259711,9223372039002259711]
-; SSE2-NEXT: movdqa %xmm9, %xmm7
-; SSE2-NEXT: pcmpgtd %xmm6, %xmm7
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm7[0,0,2,2]
-; SSE2-NEXT: pcmpeqd %xmm9, %xmm6
-; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3]
-; SSE2-NEXT: pand %xmm4, %xmm6
+; SSE2-NEXT: movdqa {{.*#+}} xmm6 = [9223372039002259456,9223372039002259456]
+; SSE2-NEXT: movdqa %xmm0, %xmm4
+; SSE2-NEXT: pxor %xmm6, %xmm4
+; SSE2-NEXT: movdqa {{.*#+}} xmm10 = [9223372039002259711,9223372039002259711]
+; SSE2-NEXT: movdqa %xmm10, %xmm7
+; SSE2-NEXT: pcmpgtd %xmm4, %xmm7
+; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm7[0,0,2,2]
+; SSE2-NEXT: pcmpeqd %xmm10, %xmm4
+; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[1,1,3,3]
+; SSE2-NEXT: pand %xmm3, %xmm5
; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm7[1,1,3,3]
-; SSE2-NEXT: por %xmm6, %xmm4
-; SSE2-NEXT: pand %xmm4, %xmm1
+; SSE2-NEXT: por %xmm5, %xmm4
+; SSE2-NEXT: pand %xmm4, %xmm0
; SSE2-NEXT: pandn %xmm8, %xmm4
-; SSE2-NEXT: por %xmm1, %xmm4
-; SSE2-NEXT: pxor %xmm0, %xmm5
-; SSE2-NEXT: movdqa %xmm9, %xmm1
-; SSE2-NEXT: pcmpgtd %xmm5, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm1[0,0,2,2]
-; SSE2-NEXT: pcmpeqd %xmm9, %xmm5
-; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
-; SSE2-NEXT: pand %xmm6, %xmm5
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSE2-NEXT: por %xmm5, %xmm1
-; SSE2-NEXT: pand %xmm1, %xmm0
-; SSE2-NEXT: pandn %xmm8, %xmm1
-; SSE2-NEXT: por %xmm0, %xmm1
-; SSE2-NEXT: packuswb %xmm4, %xmm1
-; SSE2-NEXT: pcmpeqd %xmm2, %xmm3
-; SSE2-NEXT: movmskps %xmm3, %eax
-; SSE2-NEXT: xorl $15, %eax
-; SSE2-NEXT: testb $1, %al
+; SSE2-NEXT: por %xmm0, %xmm4
+; SSE2-NEXT: pxor %xmm1, %xmm6
+; SSE2-NEXT: movdqa %xmm10, %xmm0
+; SSE2-NEXT: pcmpgtd %xmm6, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[0,0,2,2]
+; SSE2-NEXT: pcmpeqd %xmm10, %xmm6
+; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm6[1,1,3,3]
+; SSE2-NEXT: pand %xmm3, %xmm5
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE2-NEXT: por %xmm5, %xmm0
+; SSE2-NEXT: pand %xmm0, %xmm1
+; SSE2-NEXT: pandn %xmm8, %xmm0
+; SSE2-NEXT: por %xmm1, %xmm0
+; SSE2-NEXT: pand %xmm8, %xmm0
+; SSE2-NEXT: pand %xmm8, %xmm4
+; SSE2-NEXT: packuswb %xmm0, %xmm4
+; SSE2-NEXT: packuswb %xmm4, %xmm4
+; SSE2-NEXT: packuswb %xmm4, %xmm4
+; SSE2-NEXT: pcmpeqd %xmm2, %xmm9
+; SSE2-NEXT: movmskps %xmm9, %ecx
+; SSE2-NEXT: xorl $15, %ecx
+; SSE2-NEXT: testb $1, %cl
+; SSE2-NEXT: movd %xmm4, %eax
; SSE2-NEXT: jne .LBB5_1
; SSE2-NEXT: # %bb.2: # %else
-; SSE2-NEXT: testb $2, %al
+; SSE2-NEXT: testb $2, %cl
; SSE2-NEXT: jne .LBB5_3
; SSE2-NEXT: .LBB5_4: # %else2
-; SSE2-NEXT: testb $4, %al
+; SSE2-NEXT: testb $4, %cl
; SSE2-NEXT: jne .LBB5_5
; SSE2-NEXT: .LBB5_6: # %else4
-; SSE2-NEXT: testb $8, %al
+; SSE2-NEXT: testb $8, %cl
; SSE2-NEXT: jne .LBB5_7
; SSE2-NEXT: .LBB5_8: # %else6
; SSE2-NEXT: retq
; SSE2-NEXT: .LBB5_1: # %cond.store
-; SSE2-NEXT: movd %xmm1, %ecx
-; SSE2-NEXT: movb %cl, (%rdi)
-; SSE2-NEXT: testb $2, %al
+; SSE2-NEXT: movb %al, (%rdi)
+; SSE2-NEXT: testb $2, %cl
; SSE2-NEXT: je .LBB5_4
; SSE2-NEXT: .LBB5_3: # %cond.store1
-; SSE2-NEXT: pextrw $2, %xmm1, %ecx
-; SSE2-NEXT: movb %cl, 1(%rdi)
-; SSE2-NEXT: testb $4, %al
+; SSE2-NEXT: movb %ah, 1(%rdi)
+; SSE2-NEXT: testb $4, %cl
; SSE2-NEXT: je .LBB5_6
; SSE2-NEXT: .LBB5_5: # %cond.store3
-; SSE2-NEXT: pextrw $4, %xmm1, %ecx
-; SSE2-NEXT: movb %cl, 2(%rdi)
-; SSE2-NEXT: testb $8, %al
+; SSE2-NEXT: movl %eax, %edx
+; SSE2-NEXT: shrl $16, %edx
+; SSE2-NEXT: movb %dl, 2(%rdi)
+; SSE2-NEXT: testb $8, %cl
; SSE2-NEXT: je .LBB5_8
; SSE2-NEXT: .LBB5_7: # %cond.store5
-; SSE2-NEXT: pextrw $6, %xmm1, %eax
+; SSE2-NEXT: shrl $24, %eax
; SSE2-NEXT: movb %al, 3(%rdi)
; SSE2-NEXT: retq
;
; SSE4-LABEL: truncstore_v4i64_v4i8:
; SSE4: # %bb.0:
-; SSE4-NEXT: movdqa %xmm0, %xmm8
-; SSE4-NEXT: pxor %xmm6, %xmm6
-; SSE4-NEXT: movapd {{.*#+}} xmm5 = [255,255]
-; SSE4-NEXT: movdqa {{.*#+}} xmm7 = [9223372036854775808,9223372036854775808]
-; SSE4-NEXT: movdqa %xmm1, %xmm3
-; SSE4-NEXT: pxor %xmm7, %xmm3
+; SSE4-NEXT: movdqa %xmm0, %xmm3
+; SSE4-NEXT: pxor %xmm8, %xmm8
+; SSE4-NEXT: movapd {{.*#+}} xmm7 = [255,255]
+; SSE4-NEXT: movdqa {{.*#+}} xmm6 = [9223372036854775808,9223372036854775808]
+; SSE4-NEXT: movdqa %xmm0, %xmm5
+; SSE4-NEXT: pxor %xmm6, %xmm5
; SSE4-NEXT: movdqa {{.*#+}} xmm4 = [9223372036854776063,9223372036854776063]
; SSE4-NEXT: movdqa %xmm4, %xmm0
-; SSE4-NEXT: pcmpgtq %xmm3, %xmm0
-; SSE4-NEXT: movapd %xmm5, %xmm3
-; SSE4-NEXT: blendvpd %xmm0, %xmm1, %xmm3
-; SSE4-NEXT: pxor %xmm8, %xmm7
-; SSE4-NEXT: pcmpgtq %xmm7, %xmm4
+; SSE4-NEXT: pcmpgtq %xmm5, %xmm0
+; SSE4-NEXT: movapd %xmm7, %xmm5
+; SSE4-NEXT: blendvpd %xmm0, %xmm3, %xmm5
+; SSE4-NEXT: pxor %xmm1, %xmm6
+; SSE4-NEXT: pcmpgtq %xmm6, %xmm4
; SSE4-NEXT: movdqa %xmm4, %xmm0
-; SSE4-NEXT: blendvpd %xmm0, %xmm8, %xmm5
-; SSE4-NEXT: packusdw %xmm3, %xmm5
-; SSE4-NEXT: pcmpeqd %xmm2, %xmm6
-; SSE4-NEXT: movmskps %xmm6, %eax
+; SSE4-NEXT: blendvpd %xmm0, %xmm1, %xmm7
+; SSE4-NEXT: movdqa {{.*#+}} xmm0 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; SSE4-NEXT: pshufb %xmm0, %xmm7
+; SSE4-NEXT: pshufb %xmm0, %xmm5
+; SSE4-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm7[0],xmm5[1],xmm7[1],xmm5[2],xmm7[2],xmm5[3],xmm7[3]
+; SSE4-NEXT: pcmpeqd %xmm2, %xmm8
+; SSE4-NEXT: movmskps %xmm8, %eax
; SSE4-NEXT: xorl $15, %eax
; SSE4-NEXT: testb $1, %al
; SSE4-NEXT: jne .LBB5_1
; SSE4-NEXT: testb $2, %al
; SSE4-NEXT: je .LBB5_4
; SSE4-NEXT: .LBB5_3: # %cond.store1
-; SSE4-NEXT: pextrb $4, %xmm5, 1(%rdi)
+; SSE4-NEXT: pextrb $1, %xmm5, 1(%rdi)
; SSE4-NEXT: testb $4, %al
; SSE4-NEXT: je .LBB5_6
; SSE4-NEXT: .LBB5_5: # %cond.store3
-; SSE4-NEXT: pextrb $8, %xmm5, 2(%rdi)
+; SSE4-NEXT: pextrb $2, %xmm5, 2(%rdi)
; SSE4-NEXT: testb $8, %al
; SSE4-NEXT: je .LBB5_8
; SSE4-NEXT: .LBB5_7: # %cond.store5
-; SSE4-NEXT: pextrb $12, %xmm5, 3(%rdi)
+; SSE4-NEXT: pextrb $3, %xmm5, 3(%rdi)
; SSE4-NEXT: retq
;
; AVX1-LABEL: truncstore_v4i64_v4i8:
; AVX1-NEXT: vpcmpgtq %xmm3, %xmm5, %xmm3
; AVX1-NEXT: vmovapd {{.*#+}} xmm5 = [255,255]
; AVX1-NEXT: vblendvpd %xmm3, %xmm6, %xmm5, %xmm3
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX1-NEXT: vpshufb %xmm6, %xmm3, %xmm3
; AVX1-NEXT: vblendvpd %xmm4, %xmm0, %xmm5, %xmm0
-; AVX1-NEXT: vpackusdw %xmm3, %xmm0, %xmm0
+; AVX1-NEXT: vpshufb %xmm6, %xmm0, %xmm0
+; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3]
; AVX1-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vmovmskps %xmm1, %eax
; AVX1-NEXT: xorl $15, %eax
; AVX1-NEXT: testb $2, %al
; AVX1-NEXT: je .LBB5_4
; AVX1-NEXT: .LBB5_3: # %cond.store1
-; AVX1-NEXT: vpextrb $4, %xmm0, 1(%rdi)
+; AVX1-NEXT: vpextrb $1, %xmm0, 1(%rdi)
; AVX1-NEXT: testb $4, %al
; AVX1-NEXT: je .LBB5_6
; AVX1-NEXT: .LBB5_5: # %cond.store3
-; AVX1-NEXT: vpextrb $8, %xmm0, 2(%rdi)
+; AVX1-NEXT: vpextrb $2, %xmm0, 2(%rdi)
; AVX1-NEXT: testb $8, %al
; AVX1-NEXT: je .LBB5_8
; AVX1-NEXT: .LBB5_7: # %cond.store5
-; AVX1-NEXT: vpextrb $12, %xmm0, 3(%rdi)
+; AVX1-NEXT: vpextrb $3, %xmm0, 3(%rdi)
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-NEXT: vpcmpgtq %ymm4, %ymm5, %ymm4
; AVX2-NEXT: vblendvpd %ymm4, %ymm0, %ymm3, %ymm0
; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm3
-; AVX2-NEXT: vpackusdw %xmm3, %xmm0, %xmm0
+; AVX2-NEXT: vmovdqa {{.*#+}} xmm4 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-NEXT: vpshufb %xmm4, %xmm3, %xmm3
+; AVX2-NEXT: vpshufb %xmm4, %xmm0, %xmm0
+; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3]
; AVX2-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm1
; AVX2-NEXT: vmovmskps %xmm1, %eax
; AVX2-NEXT: xorl $15, %eax
; AVX2-NEXT: testb $2, %al
; AVX2-NEXT: je .LBB5_4
; AVX2-NEXT: .LBB5_3: # %cond.store1
-; AVX2-NEXT: vpextrb $4, %xmm0, 1(%rdi)
+; AVX2-NEXT: vpextrb $1, %xmm0, 1(%rdi)
; AVX2-NEXT: testb $4, %al
; AVX2-NEXT: je .LBB5_6
; AVX2-NEXT: .LBB5_5: # %cond.store3
-; AVX2-NEXT: vpextrb $8, %xmm0, 2(%rdi)
+; AVX2-NEXT: vpextrb $2, %xmm0, 2(%rdi)
; AVX2-NEXT: testb $8, %al
; AVX2-NEXT: je .LBB5_8
; AVX2-NEXT: .LBB5_7: # %cond.store5
-; AVX2-NEXT: vpextrb $12, %xmm0, 3(%rdi)
+; AVX2-NEXT: vpextrb $3, %xmm0, 3(%rdi)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k0
; AVX512F-NEXT: vpbroadcastq {{.*#+}} ymm1 = [255,255,255,255]
; AVX512F-NEXT: vpminuq %zmm1, %zmm0, %zmm0
-; AVX512F-NEXT: vpmovqd %zmm0, %ymm0
+; AVX512F-NEXT: vpmovqb %zmm0, %xmm0
; AVX512F-NEXT: kmovw %k0, %eax
; AVX512F-NEXT: testb $1, %al
; AVX512F-NEXT: jne .LBB5_1
; AVX512F-NEXT: testb $2, %al
; AVX512F-NEXT: je .LBB5_4
; AVX512F-NEXT: .LBB5_3: # %cond.store1
-; AVX512F-NEXT: vpextrb $4, %xmm0, 1(%rdi)
+; AVX512F-NEXT: vpextrb $1, %xmm0, 1(%rdi)
; AVX512F-NEXT: testb $4, %al
; AVX512F-NEXT: je .LBB5_6
; AVX512F-NEXT: .LBB5_5: # %cond.store3
-; AVX512F-NEXT: vpextrb $8, %xmm0, 2(%rdi)
+; AVX512F-NEXT: vpextrb $2, %xmm0, 2(%rdi)
; AVX512F-NEXT: testb $8, %al
; AVX512F-NEXT: je .LBB5_8
; AVX512F-NEXT: .LBB5_7: # %cond.store5
-; AVX512F-NEXT: vpextrb $12, %xmm0, 3(%rdi)
+; AVX512F-NEXT: vpextrb $3, %xmm0, 3(%rdi)
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512BW-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512BW-NEXT: vptestmd %zmm1, %zmm1, %k0
-; AVX512BW-NEXT: vpbroadcastq {{.*#+}} ymm1 = [255,255,255,255]
-; AVX512BW-NEXT: vpminuq %zmm1, %zmm0, %zmm0
-; AVX512BW-NEXT: vpmovqd %zmm0, %ymm0
-; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512BW-NEXT: kshiftlq $60, %k0, %k0
; AVX512BW-NEXT: kshiftrq $60, %k0, %k1
+; AVX512BW-NEXT: vpbroadcastq {{.*#+}} ymm1 = [255,255,255,255]
+; AVX512BW-NEXT: vpminuq %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT: vpmovqb %zmm0, %xmm0
; AVX512BW-NEXT: vmovdqu8 %zmm0, (%rdi) {%k1}
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
define void @truncstore_v2i64_v2i32(<2 x i64> %x, <2 x i32>* %p, <2 x i64> %mask) {
; SSE2-LABEL: truncstore_v2i64_v2i32:
; SSE2: # %bb.0:
-; SSE2-NEXT: pxor %xmm3, %xmm3
-; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [9223372039002259456,9223372039002259456]
-; SSE2-NEXT: pxor %xmm0, %xmm2
+; SSE2-NEXT: pxor %xmm2, %xmm2
+; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [9223372039002259456,9223372039002259456]
+; SSE2-NEXT: pxor %xmm0, %xmm3
; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [9223372039002259455,9223372039002259455]
; SSE2-NEXT: movdqa %xmm4, %xmm5
-; SSE2-NEXT: pcmpgtd %xmm2, %xmm5
+; SSE2-NEXT: pcmpgtd %xmm3, %xmm5
; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2]
-; SSE2-NEXT: pcmpeqd %xmm4, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm2[1,1,3,3]
-; SSE2-NEXT: pand %xmm6, %xmm4
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm5[1,1,3,3]
-; SSE2-NEXT: por %xmm4, %xmm2
-; SSE2-NEXT: pand %xmm2, %xmm0
-; SSE2-NEXT: pandn {{.*}}(%rip), %xmm2
-; SSE2-NEXT: por %xmm0, %xmm2
-; SSE2-NEXT: pcmpeqd %xmm1, %xmm3
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,0,3,2]
-; SSE2-NEXT: pand %xmm3, %xmm0
-; SSE2-NEXT: movmskpd %xmm0, %eax
+; SSE2-NEXT: pcmpeqd %xmm4, %xmm3
+; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
+; SSE2-NEXT: pand %xmm6, %xmm3
+; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm5[1,1,3,3]
+; SSE2-NEXT: por %xmm3, %xmm4
+; SSE2-NEXT: pand %xmm4, %xmm0
+; SSE2-NEXT: pandn {{.*}}(%rip), %xmm4
+; SSE2-NEXT: por %xmm0, %xmm4
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,2,2,3]
+; SSE2-NEXT: pcmpeqd %xmm1, %xmm2
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,0,3,2]
+; SSE2-NEXT: pand %xmm2, %xmm1
+; SSE2-NEXT: movmskpd %xmm1, %eax
; SSE2-NEXT: xorl $3, %eax
; SSE2-NEXT: testb $1, %al
; SSE2-NEXT: jne .LBB6_1
; SSE2-NEXT: .LBB6_4: # %else2
; SSE2-NEXT: retq
; SSE2-NEXT: .LBB6_1: # %cond.store
-; SSE2-NEXT: movd %xmm2, (%rdi)
+; SSE2-NEXT: movd %xmm0, (%rdi)
; SSE2-NEXT: testb $2, %al
; SSE2-NEXT: je .LBB6_4
; SSE2-NEXT: .LBB6_3: # %cond.store1
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
; SSE2-NEXT: movd %xmm0, 4(%rdi)
; SSE2-NEXT: retq
;
; SSE4-LABEL: truncstore_v2i64_v2i32:
; SSE4: # %bb.0:
; SSE4-NEXT: movdqa %xmm0, %xmm2
-; SSE4-NEXT: pxor %xmm4, %xmm4
-; SSE4-NEXT: movapd {{.*#+}} xmm3 = [4294967295,4294967295]
+; SSE4-NEXT: pxor %xmm3, %xmm3
+; SSE4-NEXT: movapd {{.*#+}} xmm4 = [4294967295,4294967295]
; SSE4-NEXT: movdqa {{.*#+}} xmm5 = [9223372036854775808,9223372036854775808]
; SSE4-NEXT: pxor %xmm0, %xmm5
; SSE4-NEXT: movdqa {{.*#+}} xmm0 = [9223372041149743103,9223372041149743103]
; SSE4-NEXT: pcmpgtq %xmm5, %xmm0
-; SSE4-NEXT: blendvpd %xmm0, %xmm2, %xmm3
-; SSE4-NEXT: pcmpeqq %xmm1, %xmm4
-; SSE4-NEXT: movmskpd %xmm4, %eax
+; SSE4-NEXT: blendvpd %xmm0, %xmm2, %xmm4
+; SSE4-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,2,2,3]
+; SSE4-NEXT: pcmpeqq %xmm1, %xmm3
+; SSE4-NEXT: movmskpd %xmm3, %eax
; SSE4-NEXT: xorl $3, %eax
; SSE4-NEXT: testb $1, %al
; SSE4-NEXT: jne .LBB6_1
; SSE4-NEXT: .LBB6_4: # %else2
; SSE4-NEXT: retq
; SSE4-NEXT: .LBB6_1: # %cond.store
-; SSE4-NEXT: movss %xmm3, (%rdi)
+; SSE4-NEXT: movd %xmm0, (%rdi)
; SSE4-NEXT: testb $2, %al
; SSE4-NEXT: je .LBB6_4
; SSE4-NEXT: .LBB6_3: # %cond.store1
-; SSE4-NEXT: extractps $2, %xmm3, 4(%rdi)
+; SSE4-NEXT: pextrd $1, %xmm0, 4(%rdi)
; SSE4-NEXT: retq
;
; AVX1-LABEL: truncstore_v2i64_v2i32:
; AVX1-NEXT: vpcmpeqq %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
; AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,2],zero,zero
; AVX1-NEXT: vmovapd {{.*#+}} xmm2 = [4294967295,4294967295]
; AVX1-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm3
; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [9223372041149743103,9223372041149743103]
; AVX1-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm3
; AVX1-NEXT: vblendvpd %xmm3, %xmm0, %xmm2, %xmm0
-; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,2],zero,zero
; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,2,3]
; AVX1-NEXT: vmaskmovps %xmm0, %xmm1, (%rdi)
; AVX1-NEXT: retq
; AVX2-NEXT: vpcmpeqq %xmm2, %xmm1, %xmm1
; AVX2-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
; AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm1
+; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,2],zero,zero
; AVX2-NEXT: vmovapd {{.*#+}} xmm2 = [4294967295,4294967295]
; AVX2-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm3
; AVX2-NEXT: vmovdqa {{.*#+}} xmm4 = [9223372041149743103,9223372041149743103]
; AVX2-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm3
; AVX2-NEXT: vblendvpd %xmm3, %xmm0, %xmm2, %xmm0
-; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,2],zero,zero
; AVX2-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,2,3]
; AVX2-NEXT: vpmaskmovd %xmm0, %xmm1, (%rdi)
; AVX2-NEXT: retq
; AVX512F-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512F-NEXT: vptestmq %zmm1, %zmm1, %k0
+; AVX512F-NEXT: kshiftlw $14, %k0, %k0
+; AVX512F-NEXT: kshiftrw $14, %k0, %k1
; AVX512F-NEXT: vmovdqa {{.*#+}} xmm1 = [4294967295,4294967295]
; AVX512F-NEXT: vpminuq %zmm1, %zmm0, %zmm0
; AVX512F-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; AVX512F-NEXT: kshiftlw $14, %k0, %k0
-; AVX512F-NEXT: kshiftrw $14, %k0, %k1
; AVX512F-NEXT: vmovdqu32 %zmm0, (%rdi) {%k1}
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
; AVX512BW-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512BW-NEXT: vptestmq %zmm1, %zmm1, %k0
+; AVX512BW-NEXT: kshiftlw $14, %k0, %k0
+; AVX512BW-NEXT: kshiftrw $14, %k0, %k1
; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm1 = [4294967295,4294967295]
; AVX512BW-NEXT: vpminuq %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; AVX512BW-NEXT: kshiftlw $14, %k0, %k0
-; AVX512BW-NEXT: kshiftrw $14, %k0, %k1
; AVX512BW-NEXT: vmovdqu32 %zmm0, (%rdi) {%k1}
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
define void @truncstore_v2i64_v2i16(<2 x i64> %x, <2 x i16>* %p, <2 x i64> %mask) {
; SSE2-LABEL: truncstore_v2i64_v2i16:
; SSE2: # %bb.0:
-; SSE2-NEXT: pxor %xmm3, %xmm3
-; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [9223372039002259456,9223372039002259456]
-; SSE2-NEXT: pxor %xmm0, %xmm2
+; SSE2-NEXT: pxor %xmm2, %xmm2
+; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [9223372039002259456,9223372039002259456]
+; SSE2-NEXT: pxor %xmm0, %xmm3
; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [9223372039002324991,9223372039002324991]
; SSE2-NEXT: movdqa %xmm4, %xmm5
-; SSE2-NEXT: pcmpgtd %xmm2, %xmm5
+; SSE2-NEXT: pcmpgtd %xmm3, %xmm5
; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2]
-; SSE2-NEXT: pcmpeqd %xmm4, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm2[1,1,3,3]
-; SSE2-NEXT: pand %xmm6, %xmm4
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm5[1,1,3,3]
-; SSE2-NEXT: por %xmm4, %xmm2
-; SSE2-NEXT: pand %xmm2, %xmm0
-; SSE2-NEXT: pandn {{.*}}(%rip), %xmm2
-; SSE2-NEXT: por %xmm0, %xmm2
-; SSE2-NEXT: pcmpeqd %xmm1, %xmm3
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,0,3,2]
-; SSE2-NEXT: pand %xmm3, %xmm0
-; SSE2-NEXT: movmskpd %xmm0, %eax
+; SSE2-NEXT: pcmpeqd %xmm4, %xmm3
+; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
+; SSE2-NEXT: pand %xmm6, %xmm3
+; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm5[1,1,3,3]
+; SSE2-NEXT: por %xmm3, %xmm4
+; SSE2-NEXT: pand %xmm4, %xmm0
+; SSE2-NEXT: pandn {{.*}}(%rip), %xmm4
+; SSE2-NEXT: por %xmm0, %xmm4
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,2,2,3]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
+; SSE2-NEXT: pcmpeqd %xmm1, %xmm2
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,0,3,2]
+; SSE2-NEXT: pand %xmm2, %xmm1
+; SSE2-NEXT: movmskpd %xmm1, %eax
; SSE2-NEXT: xorl $3, %eax
; SSE2-NEXT: testb $1, %al
; SSE2-NEXT: jne .LBB7_1
; SSE2-NEXT: .LBB7_4: # %else2
; SSE2-NEXT: retq
; SSE2-NEXT: .LBB7_1: # %cond.store
-; SSE2-NEXT: movd %xmm2, %ecx
+; SSE2-NEXT: movd %xmm0, %ecx
; SSE2-NEXT: movw %cx, (%rdi)
; SSE2-NEXT: testb $2, %al
; SSE2-NEXT: je .LBB7_4
; SSE2-NEXT: .LBB7_3: # %cond.store1
-; SSE2-NEXT: pextrw $4, %xmm2, %eax
+; SSE2-NEXT: pextrw $1, %xmm0, %eax
; SSE2-NEXT: movw %ax, 2(%rdi)
; SSE2-NEXT: retq
;
; SSE4-LABEL: truncstore_v2i64_v2i16:
; SSE4: # %bb.0:
; SSE4-NEXT: movdqa %xmm0, %xmm2
-; SSE4-NEXT: pxor %xmm4, %xmm4
-; SSE4-NEXT: movapd {{.*#+}} xmm3 = [65535,65535]
+; SSE4-NEXT: pxor %xmm3, %xmm3
+; SSE4-NEXT: movapd {{.*#+}} xmm4 = [65535,65535]
; SSE4-NEXT: movdqa {{.*#+}} xmm5 = [9223372036854775808,9223372036854775808]
; SSE4-NEXT: pxor %xmm0, %xmm5
; SSE4-NEXT: movdqa {{.*#+}} xmm0 = [9223372036854841343,9223372036854841343]
; SSE4-NEXT: pcmpgtq %xmm5, %xmm0
-; SSE4-NEXT: blendvpd %xmm0, %xmm2, %xmm3
-; SSE4-NEXT: pcmpeqq %xmm1, %xmm4
-; SSE4-NEXT: movmskpd %xmm4, %eax
+; SSE4-NEXT: blendvpd %xmm0, %xmm2, %xmm4
+; SSE4-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,2,2,3]
+; SSE4-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
+; SSE4-NEXT: pcmpeqq %xmm1, %xmm3
+; SSE4-NEXT: movmskpd %xmm3, %eax
; SSE4-NEXT: xorl $3, %eax
; SSE4-NEXT: testb $1, %al
; SSE4-NEXT: jne .LBB7_1
; SSE4-NEXT: .LBB7_4: # %else2
; SSE4-NEXT: retq
; SSE4-NEXT: .LBB7_1: # %cond.store
-; SSE4-NEXT: pextrw $0, %xmm3, (%rdi)
+; SSE4-NEXT: pextrw $0, %xmm0, (%rdi)
; SSE4-NEXT: testb $2, %al
; SSE4-NEXT: je .LBB7_4
; SSE4-NEXT: .LBB7_3: # %cond.store1
-; SSE4-NEXT: pextrw $4, %xmm3, 2(%rdi)
+; SSE4-NEXT: pextrw $1, %xmm0, 2(%rdi)
; SSE4-NEXT: retq
;
; AVX-LABEL: truncstore_v2i64_v2i16:
; AVX-NEXT: vmovdqa {{.*#+}} xmm5 = [9223372036854841343,9223372036854841343]
; AVX-NEXT: vpcmpgtq %xmm4, %xmm5, %xmm4
; AVX-NEXT: vblendvpd %xmm4, %xmm0, %xmm3, %xmm0
+; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
; AVX-NEXT: vpcmpeqq %xmm2, %xmm1, %xmm1
; AVX-NEXT: vmovmskpd %xmm1, %eax
; AVX-NEXT: xorl $3, %eax
; AVX-NEXT: testb $2, %al
; AVX-NEXT: je .LBB7_4
; AVX-NEXT: .LBB7_3: # %cond.store1
-; AVX-NEXT: vpextrw $4, %xmm0, 2(%rdi)
+; AVX-NEXT: vpextrw $1, %xmm0, 2(%rdi)
; AVX-NEXT: retq
;
; AVX512F-LABEL: truncstore_v2i64_v2i16:
; AVX512F-NEXT: vptestmq %zmm1, %zmm1, %k0
; AVX512F-NEXT: vmovdqa {{.*#+}} xmm1 = [65535,65535]
; AVX512F-NEXT: vpminuq %zmm1, %zmm0, %zmm0
+; AVX512F-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; AVX512F-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
; AVX512F-NEXT: kmovw %k0, %eax
; AVX512F-NEXT: testb $1, %al
; AVX512F-NEXT: jne .LBB7_1
; AVX512F-NEXT: testb $2, %al
; AVX512F-NEXT: je .LBB7_4
; AVX512F-NEXT: .LBB7_3: # %cond.store1
-; AVX512F-NEXT: vpextrw $4, %xmm0, 2(%rdi)
+; AVX512F-NEXT: vpextrw $1, %xmm0, 2(%rdi)
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512BW-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512BW-NEXT: vptestmq %zmm1, %zmm1, %k0
+; AVX512BW-NEXT: kshiftld $30, %k0, %k0
+; AVX512BW-NEXT: kshiftrd $30, %k0, %k1
; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm1 = [65535,65535]
; AVX512BW-NEXT: vpminuq %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; AVX512BW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
-; AVX512BW-NEXT: kshiftld $30, %k0, %k0
-; AVX512BW-NEXT: kshiftrd $30, %k0, %k1
; AVX512BW-NEXT: vmovdqu16 %zmm0, (%rdi) {%k1}
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
define void @truncstore_v2i64_v2i8(<2 x i64> %x, <2 x i8>* %p, <2 x i64> %mask) {
; SSE2-LABEL: truncstore_v2i64_v2i8:
; SSE2: # %bb.0:
-; SSE2-NEXT: pxor %xmm3, %xmm3
-; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [9223372039002259456,9223372039002259456]
-; SSE2-NEXT: pxor %xmm0, %xmm2
+; SSE2-NEXT: pxor %xmm2, %xmm2
+; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [9223372039002259456,9223372039002259456]
+; SSE2-NEXT: pxor %xmm0, %xmm3
; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [9223372039002259711,9223372039002259711]
; SSE2-NEXT: movdqa %xmm4, %xmm5
-; SSE2-NEXT: pcmpgtd %xmm2, %xmm5
+; SSE2-NEXT: pcmpgtd %xmm3, %xmm5
; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2]
-; SSE2-NEXT: pcmpeqd %xmm4, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm2[1,1,3,3]
-; SSE2-NEXT: pand %xmm6, %xmm4
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm5[1,1,3,3]
-; SSE2-NEXT: por %xmm4, %xmm2
+; SSE2-NEXT: pcmpeqd %xmm4, %xmm3
+; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
+; SSE2-NEXT: pand %xmm6, %xmm3
+; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm5[1,1,3,3]
+; SSE2-NEXT: por %xmm3, %xmm4
+; SSE2-NEXT: pand %xmm4, %xmm0
+; SSE2-NEXT: pandn {{.*}}(%rip), %xmm4
+; SSE2-NEXT: por %xmm0, %xmm4
+; SSE2-NEXT: pand {{.*}}(%rip), %xmm4
+; SSE2-NEXT: packuswb %xmm4, %xmm4
+; SSE2-NEXT: packuswb %xmm4, %xmm4
+; SSE2-NEXT: packuswb %xmm4, %xmm4
+; SSE2-NEXT: pcmpeqd %xmm1, %xmm2
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,0,3,2]
; SSE2-NEXT: pand %xmm2, %xmm0
-; SSE2-NEXT: pandn {{.*}}(%rip), %xmm2
-; SSE2-NEXT: por %xmm0, %xmm2
-; SSE2-NEXT: pcmpeqd %xmm1, %xmm3
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,0,3,2]
-; SSE2-NEXT: pand %xmm3, %xmm0
; SSE2-NEXT: movmskpd %xmm0, %eax
; SSE2-NEXT: xorl $3, %eax
; SSE2-NEXT: testb $1, %al
+; SSE2-NEXT: movd %xmm4, %ecx
; SSE2-NEXT: jne .LBB8_1
; SSE2-NEXT: # %bb.2: # %else
; SSE2-NEXT: testb $2, %al
; SSE2-NEXT: .LBB8_4: # %else2
; SSE2-NEXT: retq
; SSE2-NEXT: .LBB8_1: # %cond.store
-; SSE2-NEXT: movd %xmm2, %ecx
; SSE2-NEXT: movb %cl, (%rdi)
; SSE2-NEXT: testb $2, %al
; SSE2-NEXT: je .LBB8_4
; SSE2-NEXT: .LBB8_3: # %cond.store1
-; SSE2-NEXT: pextrw $4, %xmm2, %eax
-; SSE2-NEXT: movb %al, 1(%rdi)
+; SSE2-NEXT: movb %ch, 1(%rdi)
; SSE2-NEXT: retq
;
; SSE4-LABEL: truncstore_v2i64_v2i8:
; SSE4-NEXT: movdqa {{.*#+}} xmm0 = [9223372036854776063,9223372036854776063]
; SSE4-NEXT: pcmpgtq %xmm5, %xmm0
; SSE4-NEXT: blendvpd %xmm0, %xmm2, %xmm3
+; SSE4-NEXT: pshufb {{.*#+}} xmm3 = xmm3[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
; SSE4-NEXT: pcmpeqq %xmm1, %xmm4
; SSE4-NEXT: movmskpd %xmm4, %eax
; SSE4-NEXT: xorl $3, %eax
; SSE4-NEXT: testb $2, %al
; SSE4-NEXT: je .LBB8_4
; SSE4-NEXT: .LBB8_3: # %cond.store1
-; SSE4-NEXT: pextrb $8, %xmm3, 1(%rdi)
+; SSE4-NEXT: pextrb $1, %xmm3, 1(%rdi)
; SSE4-NEXT: retq
;
; AVX-LABEL: truncstore_v2i64_v2i8:
; AVX-NEXT: vmovdqa {{.*#+}} xmm5 = [9223372036854776063,9223372036854776063]
; AVX-NEXT: vpcmpgtq %xmm4, %xmm5, %xmm4
; AVX-NEXT: vblendvpd %xmm4, %xmm0, %xmm3, %xmm0
+; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX-NEXT: vpcmpeqq %xmm2, %xmm1, %xmm1
; AVX-NEXT: vmovmskpd %xmm1, %eax
; AVX-NEXT: xorl $3, %eax
; AVX-NEXT: testb $2, %al
; AVX-NEXT: je .LBB8_4
; AVX-NEXT: .LBB8_3: # %cond.store1
-; AVX-NEXT: vpextrb $8, %xmm0, 1(%rdi)
+; AVX-NEXT: vpextrb $1, %xmm0, 1(%rdi)
; AVX-NEXT: retq
;
; AVX512F-LABEL: truncstore_v2i64_v2i8:
; AVX512F-NEXT: vptestmq %zmm1, %zmm1, %k0
; AVX512F-NEXT: vmovdqa {{.*#+}} xmm1 = [255,255]
; AVX512F-NEXT: vpminuq %zmm1, %zmm0, %zmm0
+; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512F-NEXT: kmovw %k0, %eax
; AVX512F-NEXT: testb $1, %al
; AVX512F-NEXT: jne .LBB8_1
; AVX512F-NEXT: testb $2, %al
; AVX512F-NEXT: je .LBB8_4
; AVX512F-NEXT: .LBB8_3: # %cond.store1
-; AVX512F-NEXT: vpextrb $8, %xmm0, 1(%rdi)
+; AVX512F-NEXT: vpextrb $1, %xmm0, 1(%rdi)
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512BW-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512BW-NEXT: vptestmq %zmm1, %zmm1, %k0
+; AVX512BW-NEXT: kshiftlq $62, %k0, %k0
+; AVX512BW-NEXT: kshiftrq $62, %k0, %k1
; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm1 = [255,255]
; AVX512BW-NEXT: vpminuq %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512BW-NEXT: kshiftlq $62, %k0, %k0
-; AVX512BW-NEXT: kshiftrq $62, %k0, %k1
; AVX512BW-NEXT: vmovdqu8 %zmm0, (%rdi) {%k1}
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
; SSE2-NEXT: pandn %xmm9, %xmm6
; SSE2-NEXT: por %xmm0, %xmm6
; SSE2-NEXT: packuswb %xmm4, %xmm6
+; SSE2-NEXT: packuswb %xmm6, %xmm6
; SSE2-NEXT: pcmpeqd %xmm8, %xmm3
; SSE2-NEXT: pcmpeqd %xmm0, %xmm0
; SSE2-NEXT: pxor %xmm0, %xmm3
; SSE2-NEXT: jne .LBB12_5
; SSE2-NEXT: .LBB12_6: # %else4
; SSE2-NEXT: testb $8, %al
-; SSE2-NEXT: jne .LBB12_7
+; SSE2-NEXT: je .LBB12_8
+; SSE2-NEXT: .LBB12_7: # %cond.store5
+; SSE2-NEXT: shrl $24, %ecx
+; SSE2-NEXT: movb %cl, 3(%rdi)
; SSE2-NEXT: .LBB12_8: # %else6
; SSE2-NEXT: testb $16, %al
-; SSE2-NEXT: jne .LBB12_9
+; SSE2-NEXT: pextrw $2, %xmm6, %ecx
+; SSE2-NEXT: je .LBB12_10
+; SSE2-NEXT: # %bb.9: # %cond.store7
+; SSE2-NEXT: movb %cl, 4(%rdi)
; SSE2-NEXT: .LBB12_10: # %else8
; SSE2-NEXT: testb $32, %al
-; SSE2-NEXT: jne .LBB12_11
+; SSE2-NEXT: je .LBB12_12
+; SSE2-NEXT: # %bb.11: # %cond.store9
+; SSE2-NEXT: movb %ch, 5(%rdi)
; SSE2-NEXT: .LBB12_12: # %else10
; SSE2-NEXT: testb $64, %al
+; SSE2-NEXT: pextrw $3, %xmm6, %ecx
; SSE2-NEXT: jne .LBB12_13
-; SSE2-NEXT: .LBB12_14: # %else12
+; SSE2-NEXT: # %bb.14: # %else12
; SSE2-NEXT: testb $-128, %al
; SSE2-NEXT: jne .LBB12_15
; SSE2-NEXT: .LBB12_16: # %else14
; SSE2-NEXT: testb $2, %al
; SSE2-NEXT: je .LBB12_4
; SSE2-NEXT: .LBB12_3: # %cond.store1
-; SSE2-NEXT: shrl $16, %ecx
-; SSE2-NEXT: movb %cl, 1(%rdi)
+; SSE2-NEXT: movb %ch, 1(%rdi)
; SSE2-NEXT: testb $4, %al
; SSE2-NEXT: je .LBB12_6
; SSE2-NEXT: .LBB12_5: # %cond.store3
-; SSE2-NEXT: pextrw $2, %xmm6, %ecx
-; SSE2-NEXT: movb %cl, 2(%rdi)
+; SSE2-NEXT: movl %ecx, %edx
+; SSE2-NEXT: shrl $16, %edx
+; SSE2-NEXT: movb %dl, 2(%rdi)
; SSE2-NEXT: testb $8, %al
-; SSE2-NEXT: je .LBB12_8
-; SSE2-NEXT: .LBB12_7: # %cond.store5
-; SSE2-NEXT: pextrw $3, %xmm6, %ecx
-; SSE2-NEXT: movb %cl, 3(%rdi)
-; SSE2-NEXT: testb $16, %al
-; SSE2-NEXT: je .LBB12_10
-; SSE2-NEXT: .LBB12_9: # %cond.store7
-; SSE2-NEXT: pextrw $4, %xmm6, %ecx
-; SSE2-NEXT: movb %cl, 4(%rdi)
-; SSE2-NEXT: testb $32, %al
-; SSE2-NEXT: je .LBB12_12
-; SSE2-NEXT: .LBB12_11: # %cond.store9
-; SSE2-NEXT: pextrw $5, %xmm6, %ecx
-; SSE2-NEXT: movb %cl, 5(%rdi)
-; SSE2-NEXT: testb $64, %al
-; SSE2-NEXT: je .LBB12_14
+; SSE2-NEXT: jne .LBB12_7
+; SSE2-NEXT: jmp .LBB12_8
; SSE2-NEXT: .LBB12_13: # %cond.store11
-; SSE2-NEXT: pextrw $6, %xmm6, %ecx
; SSE2-NEXT: movb %cl, 6(%rdi)
; SSE2-NEXT: testb $-128, %al
; SSE2-NEXT: je .LBB12_16
; SSE2-NEXT: .LBB12_15: # %cond.store13
-; SSE2-NEXT: pextrw $7, %xmm6, %eax
-; SSE2-NEXT: movb %al, 7(%rdi)
+; SSE2-NEXT: movb %ch, 7(%rdi)
; SSE2-NEXT: retq
;
; SSE4-LABEL: truncstore_v8i32_v8i8:
; SSE4: # %bb.0:
; SSE4-NEXT: pxor %xmm4, %xmm4
; SSE4-NEXT: movdqa {{.*#+}} xmm5 = [255,255,255,255]
-; SSE4-NEXT: pminud %xmm5, %xmm1
; SSE4-NEXT: pminud %xmm5, %xmm0
-; SSE4-NEXT: packusdw %xmm1, %xmm0
+; SSE4-NEXT: pminud %xmm5, %xmm1
+; SSE4-NEXT: movdqa {{.*#+}} xmm5 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
+; SSE4-NEXT: pshufb %xmm5, %xmm1
+; SSE4-NEXT: pshufb %xmm5, %xmm0
+; SSE4-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE4-NEXT: pcmpeqd %xmm4, %xmm3
; SSE4-NEXT: pcmpeqd %xmm1, %xmm1
; SSE4-NEXT: pxor %xmm1, %xmm3
; SSE4-NEXT: testb $2, %al
; SSE4-NEXT: je .LBB12_4
; SSE4-NEXT: .LBB12_3: # %cond.store1
-; SSE4-NEXT: pextrb $2, %xmm0, 1(%rdi)
+; SSE4-NEXT: pextrb $1, %xmm0, 1(%rdi)
; SSE4-NEXT: testb $4, %al
; SSE4-NEXT: je .LBB12_6
; SSE4-NEXT: .LBB12_5: # %cond.store3
-; SSE4-NEXT: pextrb $4, %xmm0, 2(%rdi)
+; SSE4-NEXT: pextrb $2, %xmm0, 2(%rdi)
; SSE4-NEXT: testb $8, %al
; SSE4-NEXT: je .LBB12_8
; SSE4-NEXT: .LBB12_7: # %cond.store5
-; SSE4-NEXT: pextrb $6, %xmm0, 3(%rdi)
+; SSE4-NEXT: pextrb $3, %xmm0, 3(%rdi)
; SSE4-NEXT: testb $16, %al
; SSE4-NEXT: je .LBB12_10
; SSE4-NEXT: .LBB12_9: # %cond.store7
-; SSE4-NEXT: pextrb $8, %xmm0, 4(%rdi)
+; SSE4-NEXT: pextrb $4, %xmm0, 4(%rdi)
; SSE4-NEXT: testb $32, %al
; SSE4-NEXT: je .LBB12_12
; SSE4-NEXT: .LBB12_11: # %cond.store9
-; SSE4-NEXT: pextrb $10, %xmm0, 5(%rdi)
+; SSE4-NEXT: pextrb $5, %xmm0, 5(%rdi)
; SSE4-NEXT: testb $64, %al
; SSE4-NEXT: je .LBB12_14
; SSE4-NEXT: .LBB12_13: # %cond.store11
-; SSE4-NEXT: pextrb $12, %xmm0, 6(%rdi)
+; SSE4-NEXT: pextrb $6, %xmm0, 6(%rdi)
; SSE4-NEXT: testb $-128, %al
; SSE4-NEXT: je .LBB12_16
; SSE4-NEXT: .LBB12_15: # %cond.store13
-; SSE4-NEXT: pextrb $14, %xmm0, 7(%rdi)
+; SSE4-NEXT: pextrb $7, %xmm0, 7(%rdi)
; SSE4-NEXT: retq
;
; AVX1-LABEL: truncstore_v8i32_v8i8:
; AVX1: # %bb.0:
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [255,255,255,255]
-; AVX1-NEXT: vpminud %xmm3, %xmm2, %xmm2
-; AVX1-NEXT: vpminud %xmm3, %xmm0, %xmm0
-; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [255,255,255,255]
+; AVX1-NEXT: vpminud %xmm2, %xmm0, %xmm3
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT: vpminud %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
+; AVX1-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
; AVX1-NEXT: vpcmpeqd %xmm3, %xmm2, %xmm2
; AVX1-NEXT: testb $2, %al
; AVX1-NEXT: je .LBB12_4
; AVX1-NEXT: .LBB12_3: # %cond.store1
-; AVX1-NEXT: vpextrb $2, %xmm0, 1(%rdi)
+; AVX1-NEXT: vpextrb $1, %xmm0, 1(%rdi)
; AVX1-NEXT: testb $4, %al
; AVX1-NEXT: je .LBB12_6
; AVX1-NEXT: .LBB12_5: # %cond.store3
-; AVX1-NEXT: vpextrb $4, %xmm0, 2(%rdi)
+; AVX1-NEXT: vpextrb $2, %xmm0, 2(%rdi)
; AVX1-NEXT: testb $8, %al
; AVX1-NEXT: je .LBB12_8
; AVX1-NEXT: .LBB12_7: # %cond.store5
-; AVX1-NEXT: vpextrb $6, %xmm0, 3(%rdi)
+; AVX1-NEXT: vpextrb $3, %xmm0, 3(%rdi)
; AVX1-NEXT: testb $16, %al
; AVX1-NEXT: je .LBB12_10
; AVX1-NEXT: .LBB12_9: # %cond.store7
-; AVX1-NEXT: vpextrb $8, %xmm0, 4(%rdi)
+; AVX1-NEXT: vpextrb $4, %xmm0, 4(%rdi)
; AVX1-NEXT: testb $32, %al
; AVX1-NEXT: je .LBB12_12
; AVX1-NEXT: .LBB12_11: # %cond.store9
-; AVX1-NEXT: vpextrb $10, %xmm0, 5(%rdi)
+; AVX1-NEXT: vpextrb $5, %xmm0, 5(%rdi)
; AVX1-NEXT: testb $64, %al
; AVX1-NEXT: je .LBB12_14
; AVX1-NEXT: .LBB12_13: # %cond.store11
-; AVX1-NEXT: vpextrb $12, %xmm0, 6(%rdi)
+; AVX1-NEXT: vpextrb $6, %xmm0, 6(%rdi)
; AVX1-NEXT: testb $-128, %al
; AVX1-NEXT: je .LBB12_16
; AVX1-NEXT: .LBB12_15: # %cond.store13
-; AVX1-NEXT: vpextrb $14, %xmm0, 7(%rdi)
+; AVX1-NEXT: vpextrb $7, %xmm0, 7(%rdi)
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255]
; AVX2-NEXT: vpminud %ymm3, %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm3
-; AVX2-NEXT: vpackusdw %xmm3, %xmm0, %xmm0
+; AVX2-NEXT: vmovdqa {{.*#+}} xmm4 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-NEXT: vpshufb %xmm4, %xmm3, %xmm3
+; AVX2-NEXT: vpshufb %xmm4, %xmm0, %xmm0
+; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
; AVX2-NEXT: vpcmpeqd %ymm2, %ymm1, %ymm1
; AVX2-NEXT: vmovmskps %ymm1, %eax
; AVX2-NEXT: notl %eax
; AVX2-NEXT: testb $2, %al
; AVX2-NEXT: je .LBB12_4
; AVX2-NEXT: .LBB12_3: # %cond.store1
-; AVX2-NEXT: vpextrb $2, %xmm0, 1(%rdi)
+; AVX2-NEXT: vpextrb $1, %xmm0, 1(%rdi)
; AVX2-NEXT: testb $4, %al
; AVX2-NEXT: je .LBB12_6
; AVX2-NEXT: .LBB12_5: # %cond.store3
-; AVX2-NEXT: vpextrb $4, %xmm0, 2(%rdi)
+; AVX2-NEXT: vpextrb $2, %xmm0, 2(%rdi)
; AVX2-NEXT: testb $8, %al
; AVX2-NEXT: je .LBB12_8
; AVX2-NEXT: .LBB12_7: # %cond.store5
-; AVX2-NEXT: vpextrb $6, %xmm0, 3(%rdi)
+; AVX2-NEXT: vpextrb $3, %xmm0, 3(%rdi)
; AVX2-NEXT: testb $16, %al
; AVX2-NEXT: je .LBB12_10
; AVX2-NEXT: .LBB12_9: # %cond.store7
-; AVX2-NEXT: vpextrb $8, %xmm0, 4(%rdi)
+; AVX2-NEXT: vpextrb $4, %xmm0, 4(%rdi)
; AVX2-NEXT: testb $32, %al
; AVX2-NEXT: je .LBB12_12
; AVX2-NEXT: .LBB12_11: # %cond.store9
-; AVX2-NEXT: vpextrb $10, %xmm0, 5(%rdi)
+; AVX2-NEXT: vpextrb $5, %xmm0, 5(%rdi)
; AVX2-NEXT: testb $64, %al
; AVX2-NEXT: je .LBB12_14
; AVX2-NEXT: .LBB12_13: # %cond.store11
-; AVX2-NEXT: vpextrb $12, %xmm0, 6(%rdi)
+; AVX2-NEXT: vpextrb $6, %xmm0, 6(%rdi)
; AVX2-NEXT: testb $-128, %al
; AVX2-NEXT: je .LBB12_16
; AVX2-NEXT: .LBB12_15: # %cond.store13
-; AVX2-NEXT: vpextrb $14, %xmm0, 7(%rdi)
+; AVX2-NEXT: vpextrb $7, %xmm0, 7(%rdi)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k0
; AVX512F-NEXT: vpbroadcastd {{.*#+}} ymm1 = [255,255,255,255,255,255,255,255]
; AVX512F-NEXT: vpminud %ymm1, %ymm0, %ymm0
-; AVX512F-NEXT: vpmovdw %zmm0, %ymm0
+; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
; AVX512F-NEXT: kmovw %k0, %eax
; AVX512F-NEXT: testb $1, %al
; AVX512F-NEXT: jne .LBB12_1
; AVX512F-NEXT: testb $2, %al
; AVX512F-NEXT: je .LBB12_4
; AVX512F-NEXT: .LBB12_3: # %cond.store1
-; AVX512F-NEXT: vpextrb $2, %xmm0, 1(%rdi)
+; AVX512F-NEXT: vpextrb $1, %xmm0, 1(%rdi)
; AVX512F-NEXT: testb $4, %al
; AVX512F-NEXT: je .LBB12_6
; AVX512F-NEXT: .LBB12_5: # %cond.store3
-; AVX512F-NEXT: vpextrb $4, %xmm0, 2(%rdi)
+; AVX512F-NEXT: vpextrb $2, %xmm0, 2(%rdi)
; AVX512F-NEXT: testb $8, %al
; AVX512F-NEXT: je .LBB12_8
; AVX512F-NEXT: .LBB12_7: # %cond.store5
-; AVX512F-NEXT: vpextrb $6, %xmm0, 3(%rdi)
+; AVX512F-NEXT: vpextrb $3, %xmm0, 3(%rdi)
; AVX512F-NEXT: testb $16, %al
; AVX512F-NEXT: je .LBB12_10
; AVX512F-NEXT: .LBB12_9: # %cond.store7
-; AVX512F-NEXT: vpextrb $8, %xmm0, 4(%rdi)
+; AVX512F-NEXT: vpextrb $4, %xmm0, 4(%rdi)
; AVX512F-NEXT: testb $32, %al
; AVX512F-NEXT: je .LBB12_12
; AVX512F-NEXT: .LBB12_11: # %cond.store9
-; AVX512F-NEXT: vpextrb $10, %xmm0, 5(%rdi)
+; AVX512F-NEXT: vpextrb $5, %xmm0, 5(%rdi)
; AVX512F-NEXT: testb $64, %al
; AVX512F-NEXT: je .LBB12_14
; AVX512F-NEXT: .LBB12_13: # %cond.store11
-; AVX512F-NEXT: vpextrb $12, %xmm0, 6(%rdi)
+; AVX512F-NEXT: vpextrb $6, %xmm0, 6(%rdi)
; AVX512F-NEXT: testb $-128, %al
; AVX512F-NEXT: je .LBB12_16
; AVX512F-NEXT: .LBB12_15: # %cond.store13
-; AVX512F-NEXT: vpextrb $14, %xmm0, 7(%rdi)
+; AVX512F-NEXT: vpextrb $7, %xmm0, 7(%rdi)
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1
; AVX512BW-NEXT: vptestmd %zmm1, %zmm1, %k0
-; AVX512BW-NEXT: vpbroadcastd {{.*#+}} ymm1 = [255,255,255,255,255,255,255,255]
-; AVX512BW-NEXT: vpminud %ymm1, %ymm0, %ymm0
-; AVX512BW-NEXT: vpmovdw %zmm0, %ymm0
-; AVX512BW-NEXT: vpackuswb %xmm0, %xmm0, %xmm0
; AVX512BW-NEXT: kshiftlq $56, %k0, %k0
; AVX512BW-NEXT: kshiftrq $56, %k0, %k1
+; AVX512BW-NEXT: vpbroadcastd {{.*#+}} ymm1 = [255,255,255,255,255,255,255,255]
+; AVX512BW-NEXT: vpminud %ymm1, %ymm0, %ymm0
+; AVX512BW-NEXT: vpmovdb %zmm0, %xmm0
; AVX512BW-NEXT: vmovdqu8 %zmm0, (%rdi) {%k1}
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
define void @truncstore_v4i32_v4i16(<4 x i32> %x, <4 x i16>* %p, <4 x i32> %mask) {
; SSE2-LABEL: truncstore_v4i32_v4i16:
; SSE2: # %bb.0:
-; SSE2-NEXT: pxor %xmm3, %xmm3
-; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [2147483648,2147483648,2147483648,2147483648]
-; SSE2-NEXT: pxor %xmm0, %xmm4
-; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147549183,2147549183,2147549183,2147549183]
-; SSE2-NEXT: pcmpgtd %xmm4, %xmm2
-; SSE2-NEXT: pand %xmm2, %xmm0
-; SSE2-NEXT: pandn {{.*}}(%rip), %xmm2
-; SSE2-NEXT: por %xmm0, %xmm2
-; SSE2-NEXT: pcmpeqd %xmm1, %xmm3
-; SSE2-NEXT: movmskps %xmm3, %eax
+; SSE2-NEXT: pxor %xmm2, %xmm2
+; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [2147483648,2147483648,2147483648,2147483648]
+; SSE2-NEXT: pxor %xmm0, %xmm3
+; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [2147549183,2147549183,2147549183,2147549183]
+; SSE2-NEXT: pcmpgtd %xmm3, %xmm4
+; SSE2-NEXT: pand %xmm4, %xmm0
+; SSE2-NEXT: pandn {{.*}}(%rip), %xmm4
+; SSE2-NEXT: por %xmm0, %xmm4
+; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm4[0,2,2,3,4,5,6,7]
+; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; SSE2-NEXT: pcmpeqd %xmm1, %xmm2
+; SSE2-NEXT: movmskps %xmm2, %eax
; SSE2-NEXT: xorl $15, %eax
; SSE2-NEXT: testb $1, %al
; SSE2-NEXT: jne .LBB13_1
; SSE2-NEXT: .LBB13_8: # %else6
; SSE2-NEXT: retq
; SSE2-NEXT: .LBB13_1: # %cond.store
-; SSE2-NEXT: movd %xmm2, %ecx
+; SSE2-NEXT: movd %xmm0, %ecx
; SSE2-NEXT: movw %cx, (%rdi)
; SSE2-NEXT: testb $2, %al
; SSE2-NEXT: je .LBB13_4
; SSE2-NEXT: .LBB13_3: # %cond.store1
-; SSE2-NEXT: pextrw $2, %xmm2, %ecx
+; SSE2-NEXT: pextrw $1, %xmm0, %ecx
; SSE2-NEXT: movw %cx, 2(%rdi)
; SSE2-NEXT: testb $4, %al
; SSE2-NEXT: je .LBB13_6
; SSE2-NEXT: .LBB13_5: # %cond.store3
-; SSE2-NEXT: pextrw $4, %xmm2, %ecx
+; SSE2-NEXT: pextrw $2, %xmm0, %ecx
; SSE2-NEXT: movw %cx, 4(%rdi)
; SSE2-NEXT: testb $8, %al
; SSE2-NEXT: je .LBB13_8
; SSE2-NEXT: .LBB13_7: # %cond.store5
-; SSE2-NEXT: pextrw $6, %xmm2, %eax
+; SSE2-NEXT: pextrw $3, %xmm0, %eax
; SSE2-NEXT: movw %ax, 6(%rdi)
; SSE2-NEXT: retq
;
; SSE4: # %bb.0:
; SSE4-NEXT: pxor %xmm2, %xmm2
; SSE4-NEXT: pminud {{.*}}(%rip), %xmm0
+; SSE4-NEXT: packusdw %xmm0, %xmm0
; SSE4-NEXT: pcmpeqd %xmm1, %xmm2
; SSE4-NEXT: movmskps %xmm2, %eax
; SSE4-NEXT: xorl $15, %eax
; SSE4-NEXT: testb $2, %al
; SSE4-NEXT: je .LBB13_4
; SSE4-NEXT: .LBB13_3: # %cond.store1
-; SSE4-NEXT: pextrw $2, %xmm0, 2(%rdi)
+; SSE4-NEXT: pextrw $1, %xmm0, 2(%rdi)
; SSE4-NEXT: testb $4, %al
; SSE4-NEXT: je .LBB13_6
; SSE4-NEXT: .LBB13_5: # %cond.store3
-; SSE4-NEXT: pextrw $4, %xmm0, 4(%rdi)
+; SSE4-NEXT: pextrw $2, %xmm0, 4(%rdi)
; SSE4-NEXT: testb $8, %al
; SSE4-NEXT: je .LBB13_8
; SSE4-NEXT: .LBB13_7: # %cond.store5
-; SSE4-NEXT: pextrw $6, %xmm0, 6(%rdi)
+; SSE4-NEXT: pextrw $3, %xmm0, 6(%rdi)
; SSE4-NEXT: retq
;
; AVX1-LABEL: truncstore_v4i32_v4i16:
; AVX1: # %bb.0:
; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX1-NEXT: vpminud {{.*}}(%rip), %xmm0, %xmm0
+; AVX1-NEXT: vpackusdw %xmm0, %xmm0, %xmm0
; AVX1-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vmovmskps %xmm1, %eax
; AVX1-NEXT: xorl $15, %eax
; AVX1-NEXT: testb $2, %al
; AVX1-NEXT: je .LBB13_4
; AVX1-NEXT: .LBB13_3: # %cond.store1
-; AVX1-NEXT: vpextrw $2, %xmm0, 2(%rdi)
+; AVX1-NEXT: vpextrw $1, %xmm0, 2(%rdi)
; AVX1-NEXT: testb $4, %al
; AVX1-NEXT: je .LBB13_6
; AVX1-NEXT: .LBB13_5: # %cond.store3
-; AVX1-NEXT: vpextrw $4, %xmm0, 4(%rdi)
+; AVX1-NEXT: vpextrw $2, %xmm0, 4(%rdi)
; AVX1-NEXT: testb $8, %al
; AVX1-NEXT: je .LBB13_8
; AVX1-NEXT: .LBB13_7: # %cond.store5
-; AVX1-NEXT: vpextrw $6, %xmm0, 6(%rdi)
+; AVX1-NEXT: vpextrw $3, %xmm0, 6(%rdi)
; AVX1-NEXT: retq
;
; AVX2-LABEL: truncstore_v4i32_v4i16:
; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm3 = [65535,65535,65535,65535]
; AVX2-NEXT: vpminud %xmm3, %xmm0, %xmm0
+; AVX2-NEXT: vpackusdw %xmm0, %xmm0, %xmm0
; AVX2-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm1
; AVX2-NEXT: vmovmskps %xmm1, %eax
; AVX2-NEXT: xorl $15, %eax
; AVX2-NEXT: testb $2, %al
; AVX2-NEXT: je .LBB13_4
; AVX2-NEXT: .LBB13_3: # %cond.store1
-; AVX2-NEXT: vpextrw $2, %xmm0, 2(%rdi)
+; AVX2-NEXT: vpextrw $1, %xmm0, 2(%rdi)
; AVX2-NEXT: testb $4, %al
; AVX2-NEXT: je .LBB13_6
; AVX2-NEXT: .LBB13_5: # %cond.store3
-; AVX2-NEXT: vpextrw $4, %xmm0, 4(%rdi)
+; AVX2-NEXT: vpextrw $2, %xmm0, 4(%rdi)
; AVX2-NEXT: testb $8, %al
; AVX2-NEXT: je .LBB13_8
; AVX2-NEXT: .LBB13_7: # %cond.store5
-; AVX2-NEXT: vpextrw $6, %xmm0, 6(%rdi)
+; AVX2-NEXT: vpextrw $3, %xmm0, 6(%rdi)
; AVX2-NEXT: retq
;
; AVX512F-LABEL: truncstore_v4i32_v4i16:
; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k0
; AVX512F-NEXT: vpbroadcastd {{.*#+}} xmm1 = [65535,65535,65535,65535]
; AVX512F-NEXT: vpminud %xmm1, %xmm0, %xmm0
+; AVX512F-NEXT: vpackusdw %xmm0, %xmm0, %xmm0
; AVX512F-NEXT: kmovw %k0, %eax
; AVX512F-NEXT: testb $1, %al
; AVX512F-NEXT: jne .LBB13_1
; AVX512F-NEXT: testb $2, %al
; AVX512F-NEXT: je .LBB13_4
; AVX512F-NEXT: .LBB13_3: # %cond.store1
-; AVX512F-NEXT: vpextrw $2, %xmm0, 2(%rdi)
+; AVX512F-NEXT: vpextrw $1, %xmm0, 2(%rdi)
; AVX512F-NEXT: testb $4, %al
; AVX512F-NEXT: je .LBB13_6
; AVX512F-NEXT: .LBB13_5: # %cond.store3
-; AVX512F-NEXT: vpextrw $4, %xmm0, 4(%rdi)
+; AVX512F-NEXT: vpextrw $2, %xmm0, 4(%rdi)
; AVX512F-NEXT: testb $8, %al
; AVX512F-NEXT: je .LBB13_8
; AVX512F-NEXT: .LBB13_7: # %cond.store5
-; AVX512F-NEXT: vpextrw $6, %xmm0, 6(%rdi)
+; AVX512F-NEXT: vpextrw $3, %xmm0, 6(%rdi)
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
; AVX512BW-NEXT: vptestmd %zmm1, %zmm1, %k0
+; AVX512BW-NEXT: kshiftld $28, %k0, %k0
+; AVX512BW-NEXT: kshiftrd $28, %k0, %k1
; AVX512BW-NEXT: vpbroadcastd {{.*#+}} xmm1 = [65535,65535,65535,65535]
; AVX512BW-NEXT: vpminud %xmm1, %xmm0, %xmm0
; AVX512BW-NEXT: vpackusdw %xmm0, %xmm0, %xmm0
-; AVX512BW-NEXT: kshiftld $28, %k0, %k0
-; AVX512BW-NEXT: kshiftrd $28, %k0, %k1
; AVX512BW-NEXT: vmovdqu16 %zmm0, (%rdi) {%k1}
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
define void @truncstore_v4i32_v4i8(<4 x i32> %x, <4 x i8>* %p, <4 x i32> %mask) {
; SSE2-LABEL: truncstore_v4i32_v4i8:
; SSE2: # %bb.0:
-; SSE2-NEXT: pxor %xmm3, %xmm3
-; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [2147483648,2147483648,2147483648,2147483648]
-; SSE2-NEXT: pxor %xmm0, %xmm4
-; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483903,2147483903,2147483903,2147483903]
-; SSE2-NEXT: pcmpgtd %xmm4, %xmm2
-; SSE2-NEXT: pand %xmm2, %xmm0
-; SSE2-NEXT: pandn {{.*}}(%rip), %xmm2
-; SSE2-NEXT: por %xmm0, %xmm2
-; SSE2-NEXT: pcmpeqd %xmm1, %xmm3
-; SSE2-NEXT: movmskps %xmm3, %eax
-; SSE2-NEXT: xorl $15, %eax
-; SSE2-NEXT: testb $1, %al
+; SSE2-NEXT: pxor %xmm2, %xmm2
+; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [2147483648,2147483648,2147483648,2147483648]
+; SSE2-NEXT: pxor %xmm0, %xmm3
+; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [2147483903,2147483903,2147483903,2147483903]
+; SSE2-NEXT: pcmpgtd %xmm3, %xmm4
+; SSE2-NEXT: pand %xmm4, %xmm0
+; SSE2-NEXT: pandn {{.*}}(%rip), %xmm4
+; SSE2-NEXT: por %xmm0, %xmm4
+; SSE2-NEXT: pand {{.*}}(%rip), %xmm4
+; SSE2-NEXT: packuswb %xmm4, %xmm4
+; SSE2-NEXT: packuswb %xmm4, %xmm4
+; SSE2-NEXT: pcmpeqd %xmm1, %xmm2
+; SSE2-NEXT: movmskps %xmm2, %ecx
+; SSE2-NEXT: xorl $15, %ecx
+; SSE2-NEXT: testb $1, %cl
+; SSE2-NEXT: movd %xmm4, %eax
; SSE2-NEXT: jne .LBB14_1
; SSE2-NEXT: # %bb.2: # %else
-; SSE2-NEXT: testb $2, %al
+; SSE2-NEXT: testb $2, %cl
; SSE2-NEXT: jne .LBB14_3
; SSE2-NEXT: .LBB14_4: # %else2
-; SSE2-NEXT: testb $4, %al
+; SSE2-NEXT: testb $4, %cl
; SSE2-NEXT: jne .LBB14_5
; SSE2-NEXT: .LBB14_6: # %else4
-; SSE2-NEXT: testb $8, %al
+; SSE2-NEXT: testb $8, %cl
; SSE2-NEXT: jne .LBB14_7
; SSE2-NEXT: .LBB14_8: # %else6
; SSE2-NEXT: retq
; SSE2-NEXT: .LBB14_1: # %cond.store
-; SSE2-NEXT: movd %xmm2, %ecx
-; SSE2-NEXT: movb %cl, (%rdi)
-; SSE2-NEXT: testb $2, %al
+; SSE2-NEXT: movb %al, (%rdi)
+; SSE2-NEXT: testb $2, %cl
; SSE2-NEXT: je .LBB14_4
; SSE2-NEXT: .LBB14_3: # %cond.store1
-; SSE2-NEXT: pextrw $2, %xmm2, %ecx
-; SSE2-NEXT: movb %cl, 1(%rdi)
-; SSE2-NEXT: testb $4, %al
+; SSE2-NEXT: movb %ah, 1(%rdi)
+; SSE2-NEXT: testb $4, %cl
; SSE2-NEXT: je .LBB14_6
; SSE2-NEXT: .LBB14_5: # %cond.store3
-; SSE2-NEXT: pextrw $4, %xmm2, %ecx
-; SSE2-NEXT: movb %cl, 2(%rdi)
-; SSE2-NEXT: testb $8, %al
+; SSE2-NEXT: movl %eax, %edx
+; SSE2-NEXT: shrl $16, %edx
+; SSE2-NEXT: movb %dl, 2(%rdi)
+; SSE2-NEXT: testb $8, %cl
; SSE2-NEXT: je .LBB14_8
; SSE2-NEXT: .LBB14_7: # %cond.store5
-; SSE2-NEXT: pextrw $6, %xmm2, %eax
+; SSE2-NEXT: shrl $24, %eax
; SSE2-NEXT: movb %al, 3(%rdi)
; SSE2-NEXT: retq
;
; SSE4: # %bb.0:
; SSE4-NEXT: pxor %xmm2, %xmm2
; SSE4-NEXT: pminud {{.*}}(%rip), %xmm0
+; SSE4-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u]
; SSE4-NEXT: pcmpeqd %xmm1, %xmm2
; SSE4-NEXT: movmskps %xmm2, %eax
; SSE4-NEXT: xorl $15, %eax
; SSE4-NEXT: testb $2, %al
; SSE4-NEXT: je .LBB14_4
; SSE4-NEXT: .LBB14_3: # %cond.store1
-; SSE4-NEXT: pextrb $4, %xmm0, 1(%rdi)
+; SSE4-NEXT: pextrb $1, %xmm0, 1(%rdi)
; SSE4-NEXT: testb $4, %al
; SSE4-NEXT: je .LBB14_6
; SSE4-NEXT: .LBB14_5: # %cond.store3
-; SSE4-NEXT: pextrb $8, %xmm0, 2(%rdi)
+; SSE4-NEXT: pextrb $2, %xmm0, 2(%rdi)
; SSE4-NEXT: testb $8, %al
; SSE4-NEXT: je .LBB14_8
; SSE4-NEXT: .LBB14_7: # %cond.store5
-; SSE4-NEXT: pextrb $12, %xmm0, 3(%rdi)
+; SSE4-NEXT: pextrb $3, %xmm0, 3(%rdi)
; SSE4-NEXT: retq
;
; AVX1-LABEL: truncstore_v4i32_v4i8:
; AVX1: # %bb.0:
; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX1-NEXT: vpminud {{.*}}(%rip), %xmm0, %xmm0
+; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX1-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vmovmskps %xmm1, %eax
; AVX1-NEXT: xorl $15, %eax
; AVX1-NEXT: testb $2, %al
; AVX1-NEXT: je .LBB14_4
; AVX1-NEXT: .LBB14_3: # %cond.store1
-; AVX1-NEXT: vpextrb $4, %xmm0, 1(%rdi)
+; AVX1-NEXT: vpextrb $1, %xmm0, 1(%rdi)
; AVX1-NEXT: testb $4, %al
; AVX1-NEXT: je .LBB14_6
; AVX1-NEXT: .LBB14_5: # %cond.store3
-; AVX1-NEXT: vpextrb $8, %xmm0, 2(%rdi)
+; AVX1-NEXT: vpextrb $2, %xmm0, 2(%rdi)
; AVX1-NEXT: testb $8, %al
; AVX1-NEXT: je .LBB14_8
; AVX1-NEXT: .LBB14_7: # %cond.store5
-; AVX1-NEXT: vpextrb $12, %xmm0, 3(%rdi)
+; AVX1-NEXT: vpextrb $3, %xmm0, 3(%rdi)
; AVX1-NEXT: retq
;
; AVX2-LABEL: truncstore_v4i32_v4i8:
; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm3 = [255,255,255,255]
; AVX2-NEXT: vpminud %xmm3, %xmm0, %xmm0
+; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX2-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm1
; AVX2-NEXT: vmovmskps %xmm1, %eax
; AVX2-NEXT: xorl $15, %eax
; AVX2-NEXT: testb $2, %al
; AVX2-NEXT: je .LBB14_4
; AVX2-NEXT: .LBB14_3: # %cond.store1
-; AVX2-NEXT: vpextrb $4, %xmm0, 1(%rdi)
+; AVX2-NEXT: vpextrb $1, %xmm0, 1(%rdi)
; AVX2-NEXT: testb $4, %al
; AVX2-NEXT: je .LBB14_6
; AVX2-NEXT: .LBB14_5: # %cond.store3
-; AVX2-NEXT: vpextrb $8, %xmm0, 2(%rdi)
+; AVX2-NEXT: vpextrb $2, %xmm0, 2(%rdi)
; AVX2-NEXT: testb $8, %al
; AVX2-NEXT: je .LBB14_8
; AVX2-NEXT: .LBB14_7: # %cond.store5
-; AVX2-NEXT: vpextrb $12, %xmm0, 3(%rdi)
+; AVX2-NEXT: vpextrb $3, %xmm0, 3(%rdi)
; AVX2-NEXT: retq
;
; AVX512F-LABEL: truncstore_v4i32_v4i8:
; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k0
; AVX512F-NEXT: vpbroadcastd {{.*#+}} xmm1 = [255,255,255,255]
; AVX512F-NEXT: vpminud %xmm1, %xmm0, %xmm0
+; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512F-NEXT: kmovw %k0, %eax
; AVX512F-NEXT: testb $1, %al
; AVX512F-NEXT: jne .LBB14_1
; AVX512F-NEXT: testb $2, %al
; AVX512F-NEXT: je .LBB14_4
; AVX512F-NEXT: .LBB14_3: # %cond.store1
-; AVX512F-NEXT: vpextrb $4, %xmm0, 1(%rdi)
+; AVX512F-NEXT: vpextrb $1, %xmm0, 1(%rdi)
; AVX512F-NEXT: testb $4, %al
; AVX512F-NEXT: je .LBB14_6
; AVX512F-NEXT: .LBB14_5: # %cond.store3
-; AVX512F-NEXT: vpextrb $8, %xmm0, 2(%rdi)
+; AVX512F-NEXT: vpextrb $2, %xmm0, 2(%rdi)
; AVX512F-NEXT: testb $8, %al
; AVX512F-NEXT: je .LBB14_8
; AVX512F-NEXT: .LBB14_7: # %cond.store5
-; AVX512F-NEXT: vpextrb $12, %xmm0, 3(%rdi)
+; AVX512F-NEXT: vpextrb $3, %xmm0, 3(%rdi)
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
; AVX512BW-NEXT: vptestmd %zmm1, %zmm1, %k0
+; AVX512BW-NEXT: kshiftlq $60, %k0, %k0
+; AVX512BW-NEXT: kshiftrq $60, %k0, %k1
; AVX512BW-NEXT: vpbroadcastd {{.*#+}} xmm1 = [255,255,255,255]
; AVX512BW-NEXT: vpminud %xmm1, %xmm0, %xmm0
; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512BW-NEXT: kshiftlq $60, %k0, %k0
-; AVX512BW-NEXT: kshiftrq $60, %k0, %k1
; AVX512BW-NEXT: vmovdqu8 %zmm0, (%rdi) {%k1}
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
; SSE2-LABEL: truncstore_v8i16_v8i8:
; SSE2: # %bb.0:
; SSE2-NEXT: pxor %xmm2, %xmm2
-; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [32768,32768,32768,32768,32768,32768,32768,32768]
-; SSE2-NEXT: pxor %xmm3, %xmm0
+; SSE2-NEXT: pxor {{.*}}(%rip), %xmm0
; SSE2-NEXT: pminsw {{.*}}(%rip), %xmm0
-; SSE2-NEXT: pxor %xmm3, %xmm0
+; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
+; SSE2-NEXT: packuswb %xmm0, %xmm0
; SSE2-NEXT: pcmpeqw %xmm1, %xmm2
; SSE2-NEXT: pcmpeqd %xmm1, %xmm1
; SSE2-NEXT: pxor %xmm2, %xmm1
; SSE2-NEXT: jne .LBB17_5
; SSE2-NEXT: .LBB17_6: # %else4
; SSE2-NEXT: testb $8, %al
-; SSE2-NEXT: jne .LBB17_7
+; SSE2-NEXT: je .LBB17_8
+; SSE2-NEXT: .LBB17_7: # %cond.store5
+; SSE2-NEXT: shrl $24, %ecx
+; SSE2-NEXT: movb %cl, 3(%rdi)
; SSE2-NEXT: .LBB17_8: # %else6
; SSE2-NEXT: testb $16, %al
-; SSE2-NEXT: jne .LBB17_9
+; SSE2-NEXT: pextrw $2, %xmm0, %ecx
+; SSE2-NEXT: je .LBB17_10
+; SSE2-NEXT: # %bb.9: # %cond.store7
+; SSE2-NEXT: movb %cl, 4(%rdi)
; SSE2-NEXT: .LBB17_10: # %else8
; SSE2-NEXT: testb $32, %al
-; SSE2-NEXT: jne .LBB17_11
+; SSE2-NEXT: je .LBB17_12
+; SSE2-NEXT: # %bb.11: # %cond.store9
+; SSE2-NEXT: movb %ch, 5(%rdi)
; SSE2-NEXT: .LBB17_12: # %else10
; SSE2-NEXT: testb $64, %al
+; SSE2-NEXT: pextrw $3, %xmm0, %ecx
; SSE2-NEXT: jne .LBB17_13
-; SSE2-NEXT: .LBB17_14: # %else12
+; SSE2-NEXT: # %bb.14: # %else12
; SSE2-NEXT: testb $-128, %al
; SSE2-NEXT: jne .LBB17_15
; SSE2-NEXT: .LBB17_16: # %else14
; SSE2-NEXT: testb $2, %al
; SSE2-NEXT: je .LBB17_4
; SSE2-NEXT: .LBB17_3: # %cond.store1
-; SSE2-NEXT: shrl $16, %ecx
-; SSE2-NEXT: movb %cl, 1(%rdi)
+; SSE2-NEXT: movb %ch, 1(%rdi)
; SSE2-NEXT: testb $4, %al
; SSE2-NEXT: je .LBB17_6
; SSE2-NEXT: .LBB17_5: # %cond.store3
-; SSE2-NEXT: pextrw $2, %xmm0, %ecx
-; SSE2-NEXT: movb %cl, 2(%rdi)
+; SSE2-NEXT: movl %ecx, %edx
+; SSE2-NEXT: shrl $16, %edx
+; SSE2-NEXT: movb %dl, 2(%rdi)
; SSE2-NEXT: testb $8, %al
-; SSE2-NEXT: je .LBB17_8
-; SSE2-NEXT: .LBB17_7: # %cond.store5
-; SSE2-NEXT: pextrw $3, %xmm0, %ecx
-; SSE2-NEXT: movb %cl, 3(%rdi)
-; SSE2-NEXT: testb $16, %al
-; SSE2-NEXT: je .LBB17_10
-; SSE2-NEXT: .LBB17_9: # %cond.store7
-; SSE2-NEXT: pextrw $4, %xmm0, %ecx
-; SSE2-NEXT: movb %cl, 4(%rdi)
-; SSE2-NEXT: testb $32, %al
-; SSE2-NEXT: je .LBB17_12
-; SSE2-NEXT: .LBB17_11: # %cond.store9
-; SSE2-NEXT: pextrw $5, %xmm0, %ecx
-; SSE2-NEXT: movb %cl, 5(%rdi)
-; SSE2-NEXT: testb $64, %al
-; SSE2-NEXT: je .LBB17_14
+; SSE2-NEXT: jne .LBB17_7
+; SSE2-NEXT: jmp .LBB17_8
; SSE2-NEXT: .LBB17_13: # %cond.store11
-; SSE2-NEXT: pextrw $6, %xmm0, %ecx
; SSE2-NEXT: movb %cl, 6(%rdi)
; SSE2-NEXT: testb $-128, %al
; SSE2-NEXT: je .LBB17_16
; SSE2-NEXT: .LBB17_15: # %cond.store13
-; SSE2-NEXT: pextrw $7, %xmm0, %eax
-; SSE2-NEXT: movb %al, 7(%rdi)
+; SSE2-NEXT: movb %ch, 7(%rdi)
; SSE2-NEXT: retq
;
; SSE4-LABEL: truncstore_v8i16_v8i8:
; SSE4: # %bb.0:
; SSE4-NEXT: pxor %xmm2, %xmm2
; SSE4-NEXT: pminuw {{.*}}(%rip), %xmm0
+; SSE4-NEXT: packuswb %xmm0, %xmm0
; SSE4-NEXT: pcmpeqw %xmm1, %xmm2
; SSE4-NEXT: pcmpeqd %xmm1, %xmm1
; SSE4-NEXT: pxor %xmm2, %xmm1
; SSE4-NEXT: testb $2, %al
; SSE4-NEXT: je .LBB17_4
; SSE4-NEXT: .LBB17_3: # %cond.store1
-; SSE4-NEXT: pextrb $2, %xmm0, 1(%rdi)
+; SSE4-NEXT: pextrb $1, %xmm0, 1(%rdi)
; SSE4-NEXT: testb $4, %al
; SSE4-NEXT: je .LBB17_6
; SSE4-NEXT: .LBB17_5: # %cond.store3
-; SSE4-NEXT: pextrb $4, %xmm0, 2(%rdi)
+; SSE4-NEXT: pextrb $2, %xmm0, 2(%rdi)
; SSE4-NEXT: testb $8, %al
; SSE4-NEXT: je .LBB17_8
; SSE4-NEXT: .LBB17_7: # %cond.store5
-; SSE4-NEXT: pextrb $6, %xmm0, 3(%rdi)
+; SSE4-NEXT: pextrb $3, %xmm0, 3(%rdi)
; SSE4-NEXT: testb $16, %al
; SSE4-NEXT: je .LBB17_10
; SSE4-NEXT: .LBB17_9: # %cond.store7
-; SSE4-NEXT: pextrb $8, %xmm0, 4(%rdi)
+; SSE4-NEXT: pextrb $4, %xmm0, 4(%rdi)
; SSE4-NEXT: testb $32, %al
; SSE4-NEXT: je .LBB17_12
; SSE4-NEXT: .LBB17_11: # %cond.store9
-; SSE4-NEXT: pextrb $10, %xmm0, 5(%rdi)
+; SSE4-NEXT: pextrb $5, %xmm0, 5(%rdi)
; SSE4-NEXT: testb $64, %al
; SSE4-NEXT: je .LBB17_14
; SSE4-NEXT: .LBB17_13: # %cond.store11
-; SSE4-NEXT: pextrb $12, %xmm0, 6(%rdi)
+; SSE4-NEXT: pextrb $6, %xmm0, 6(%rdi)
; SSE4-NEXT: testb $-128, %al
; SSE4-NEXT: je .LBB17_16
; SSE4-NEXT: .LBB17_15: # %cond.store13
-; SSE4-NEXT: pextrb $14, %xmm0, 7(%rdi)
+; SSE4-NEXT: pextrb $7, %xmm0, 7(%rdi)
; SSE4-NEXT: retq
;
; AVX-LABEL: truncstore_v8i16_v8i8:
; AVX: # %bb.0:
; AVX-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX-NEXT: vpminuw {{.*}}(%rip), %xmm0, %xmm0
+; AVX-NEXT: vpackuswb %xmm0, %xmm0, %xmm0
; AVX-NEXT: vpcmpeqw %xmm2, %xmm1, %xmm1
; AVX-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
; AVX-NEXT: vpxor %xmm2, %xmm1, %xmm1
; AVX-NEXT: testb $2, %al
; AVX-NEXT: je .LBB17_4
; AVX-NEXT: .LBB17_3: # %cond.store1
-; AVX-NEXT: vpextrb $2, %xmm0, 1(%rdi)
+; AVX-NEXT: vpextrb $1, %xmm0, 1(%rdi)
; AVX-NEXT: testb $4, %al
; AVX-NEXT: je .LBB17_6
; AVX-NEXT: .LBB17_5: # %cond.store3
-; AVX-NEXT: vpextrb $4, %xmm0, 2(%rdi)
+; AVX-NEXT: vpextrb $2, %xmm0, 2(%rdi)
; AVX-NEXT: testb $8, %al
; AVX-NEXT: je .LBB17_8
; AVX-NEXT: .LBB17_7: # %cond.store5
-; AVX-NEXT: vpextrb $6, %xmm0, 3(%rdi)
+; AVX-NEXT: vpextrb $3, %xmm0, 3(%rdi)
; AVX-NEXT: testb $16, %al
; AVX-NEXT: je .LBB17_10
; AVX-NEXT: .LBB17_9: # %cond.store7
-; AVX-NEXT: vpextrb $8, %xmm0, 4(%rdi)
+; AVX-NEXT: vpextrb $4, %xmm0, 4(%rdi)
; AVX-NEXT: testb $32, %al
; AVX-NEXT: je .LBB17_12
; AVX-NEXT: .LBB17_11: # %cond.store9
-; AVX-NEXT: vpextrb $10, %xmm0, 5(%rdi)
+; AVX-NEXT: vpextrb $5, %xmm0, 5(%rdi)
; AVX-NEXT: testb $64, %al
; AVX-NEXT: je .LBB17_14
; AVX-NEXT: .LBB17_13: # %cond.store11
-; AVX-NEXT: vpextrb $12, %xmm0, 6(%rdi)
+; AVX-NEXT: vpextrb $6, %xmm0, 6(%rdi)
; AVX-NEXT: testb $-128, %al
; AVX-NEXT: je .LBB17_16
; AVX-NEXT: .LBB17_15: # %cond.store13
-; AVX-NEXT: vpextrb $14, %xmm0, 7(%rdi)
+; AVX-NEXT: vpextrb $7, %xmm0, 7(%rdi)
; AVX-NEXT: retq
;
; AVX512F-LABEL: truncstore_v8i16_v8i8:
; AVX512F-NEXT: vpmovsxwq %xmm1, %zmm1
; AVX512F-NEXT: vptestmq %zmm1, %zmm1, %k0
; AVX512F-NEXT: vpminuw {{.*}}(%rip), %xmm0, %xmm0
+; AVX512F-NEXT: vpackuswb %xmm0, %xmm0, %xmm0
; AVX512F-NEXT: kmovw %k0, %eax
; AVX512F-NEXT: testb $1, %al
; AVX512F-NEXT: jne .LBB17_1
; AVX512F-NEXT: testb $2, %al
; AVX512F-NEXT: je .LBB17_4
; AVX512F-NEXT: .LBB17_3: # %cond.store1
-; AVX512F-NEXT: vpextrb $2, %xmm0, 1(%rdi)
+; AVX512F-NEXT: vpextrb $1, %xmm0, 1(%rdi)
; AVX512F-NEXT: testb $4, %al
; AVX512F-NEXT: je .LBB17_6
; AVX512F-NEXT: .LBB17_5: # %cond.store3
-; AVX512F-NEXT: vpextrb $4, %xmm0, 2(%rdi)
+; AVX512F-NEXT: vpextrb $2, %xmm0, 2(%rdi)
; AVX512F-NEXT: testb $8, %al
; AVX512F-NEXT: je .LBB17_8
; AVX512F-NEXT: .LBB17_7: # %cond.store5
-; AVX512F-NEXT: vpextrb $6, %xmm0, 3(%rdi)
+; AVX512F-NEXT: vpextrb $3, %xmm0, 3(%rdi)
; AVX512F-NEXT: testb $16, %al
; AVX512F-NEXT: je .LBB17_10
; AVX512F-NEXT: .LBB17_9: # %cond.store7
-; AVX512F-NEXT: vpextrb $8, %xmm0, 4(%rdi)
+; AVX512F-NEXT: vpextrb $4, %xmm0, 4(%rdi)
; AVX512F-NEXT: testb $32, %al
; AVX512F-NEXT: je .LBB17_12
; AVX512F-NEXT: .LBB17_11: # %cond.store9
-; AVX512F-NEXT: vpextrb $10, %xmm0, 5(%rdi)
+; AVX512F-NEXT: vpextrb $5, %xmm0, 5(%rdi)
; AVX512F-NEXT: testb $64, %al
; AVX512F-NEXT: je .LBB17_14
; AVX512F-NEXT: .LBB17_13: # %cond.store11
-; AVX512F-NEXT: vpextrb $12, %xmm0, 6(%rdi)
+; AVX512F-NEXT: vpextrb $6, %xmm0, 6(%rdi)
; AVX512F-NEXT: testb $-128, %al
; AVX512F-NEXT: je .LBB17_16
; AVX512F-NEXT: .LBB17_15: # %cond.store13
-; AVX512F-NEXT: vpextrb $14, %xmm0, 7(%rdi)
+; AVX512F-NEXT: vpextrb $7, %xmm0, 7(%rdi)
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
; AVX512BW-NEXT: vptestmw %zmm1, %zmm1, %k0
-; AVX512BW-NEXT: vpminuw {{.*}}(%rip), %xmm0, %xmm0
-; AVX512BW-NEXT: vpackuswb %xmm0, %xmm0, %xmm0
; AVX512BW-NEXT: kshiftlq $56, %k0, %k0
; AVX512BW-NEXT: kshiftrq $56, %k0, %k1
+; AVX512BW-NEXT: vpminuw {{.*}}(%rip), %xmm0, %xmm0
+; AVX512BW-NEXT: vpackuswb %xmm0, %xmm0, %xmm0
; AVX512BW-NEXT: vmovdqu8 %zmm0, (%rdi) {%k1}
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
define <2 x i8> @PR42846(<2 x i8>* %j, <2 x i8> %k) {
; AVX-LABEL: PR42846:
; AVX: # %bb.0:
-; AVX-NEXT: vmovdqa {{.*}}(%rip), %ymm1
-; AVX-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
-; AVX-NEXT: vpextrw $0, %xmm1, (%rdi)
+; AVX-NEXT: vmovdqa {{.*}}(%rip), %ymm0
+; AVX-NEXT: vpextrw $0, %xmm0, (%rdi)
+; AVX-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
;
; X32-AVX-LABEL: PR42846:
; X32-AVX: # %bb.0:
; X32-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-AVX-NEXT: vmovdqa l, %ymm1
-; X32-AVX-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
-; X32-AVX-NEXT: vpextrw $0, %xmm1, (%eax)
+; X32-AVX-NEXT: vmovdqa l, %ymm0
+; X32-AVX-NEXT: vpextrw $0, %xmm0, (%eax)
+; X32-AVX-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; X32-AVX-NEXT: vzeroupper
; X32-AVX-NEXT: retl
%t0 = load volatile <32 x i8>, <32 x i8>* @l, align 32
define void @t4(x86_mmx %v1, x86_mmx %v2) nounwind {
; X86-64-LABEL: t4:
; X86-64: ## %bb.0:
-; X86-64-NEXT: movdq2q %xmm1, %mm0
-; X86-64-NEXT: movq %mm0, -{{[0-9]+}}(%rsp)
; X86-64-NEXT: movdq2q %xmm0, %mm0
; X86-64-NEXT: movq %mm0, -{{[0-9]+}}(%rsp)
-; X86-64-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
-; X86-64-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
-; X86-64-NEXT: paddb %xmm1, %xmm0
+; X86-64-NEXT: movdq2q %xmm1, %mm0
+; X86-64-NEXT: movq %mm0, -{{[0-9]+}}(%rsp)
+; X86-64-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm0
+; X86-64-NEXT: paddb -{{[0-9]+}}(%rsp), %xmm0
; X86-64-NEXT: movb $1, %al
; X86-64-NEXT: jmp _pass_v8qi ## TAILCALL
%v1a = bitcast x86_mmx %v1 to <8 x i8>
; X32-NEXT: .cfi_offset %ebp, -8
; X32-NEXT: movl %esp, %ebp
; X32-NEXT: .cfi_def_cfa_register %ebp
-; X32-NEXT: andl $-8, %esp
-; X32-NEXT: subl $16, %esp
+; X32-NEXT: andl $-16, %esp
+; X32-NEXT: subl $48, %esp
; X32-NEXT: movl 12(%ebp), %ecx
; X32-NEXT: movl 8(%ebp), %eax
; X32-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
; X32-NEXT: movq %mm0, (%eax)
; X32-NEXT: paddusb (%ecx), %mm0
; X32-NEXT: movq %mm0, {{[0-9]+}}(%esp)
-; X32-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
+; X32-NEXT: movdqa {{[0-9]+}}(%esp), %xmm0
; X32-NEXT: movq %mm0, (%eax)
; X32-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
; X32-NEXT: psubb %xmm1, %xmm0
; X32-NEXT: movq %mm0, (%eax)
; X32-NEXT: psubusb (%ecx), %mm0
; X32-NEXT: movq %mm0, (%esp)
-; X32-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
-; X32-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; X32-NEXT: movdqa (%esp), %xmm0
; X32-NEXT: movq %mm0, (%eax)
; X32-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
; X32-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
-; X32-NEXT: pmullw %xmm0, %xmm1
-; X32-NEXT: movdqa {{.*#+}} xmm0 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
-; X32-NEXT: movdqa %xmm1, %xmm2
-; X32-NEXT: pand %xmm0, %xmm2
-; X32-NEXT: packuswb %xmm0, %xmm2
-; X32-NEXT: movq %xmm2, (%eax)
-; X32-NEXT: movq {{.*#+}} xmm2 = mem[0],zero
-; X32-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
-; X32-NEXT: pand %xmm1, %xmm2
-; X32-NEXT: movdqa %xmm2, %xmm1
+; X32-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; X32-NEXT: pmullw %xmm1, %xmm0
+; X32-NEXT: pand {{\.LCPI.*}}, %xmm0
+; X32-NEXT: packuswb %xmm0, %xmm0
+; X32-NEXT: movq %xmm0, (%eax)
+; X32-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
; X32-NEXT: pand %xmm0, %xmm1
-; X32-NEXT: packuswb %xmm0, %xmm1
; X32-NEXT: movq %xmm1, (%eax)
+; X32-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
+; X32-NEXT: por %xmm1, %xmm0
+; X32-NEXT: movq %xmm0, (%eax)
; X32-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
-; X32-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
-; X32-NEXT: por %xmm2, %xmm1
-; X32-NEXT: movdqa %xmm1, %xmm2
-; X32-NEXT: pand %xmm0, %xmm2
-; X32-NEXT: packuswb %xmm0, %xmm2
-; X32-NEXT: movq %xmm2, (%eax)
-; X32-NEXT: movq {{.*#+}} xmm2 = mem[0],zero
-; X32-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
-; X32-NEXT: pxor %xmm1, %xmm2
-; X32-NEXT: pand %xmm0, %xmm2
-; X32-NEXT: packuswb %xmm0, %xmm2
-; X32-NEXT: movq %xmm2, (%eax)
+; X32-NEXT: pxor %xmm0, %xmm1
+; X32-NEXT: movq %xmm1, (%eax)
; X32-NEXT: emms
; X32-NEXT: movl %ebp, %esp
; X32-NEXT: popl %ebp
; X64-NEXT: movq %mm0, (%rdi)
; X64-NEXT: paddusb (%rsi), %mm0
; X64-NEXT: movq %mm0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
+; X64-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm0
; X64-NEXT: movq %mm0, (%rdi)
; X64-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
; X64-NEXT: psubb %xmm1, %xmm0
; X64-NEXT: movq %mm0, (%rdi)
; X64-NEXT: psubusb (%rsi), %mm0
; X64-NEXT: movq %mm0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
-; X64-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; X64-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm0
; X64-NEXT: movq %mm0, (%rdi)
; X64-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
; X64-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
-; X64-NEXT: pmullw %xmm0, %xmm1
-; X64-NEXT: movdqa {{.*#+}} xmm0 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
-; X64-NEXT: movdqa %xmm1, %xmm2
-; X64-NEXT: pand %xmm0, %xmm2
-; X64-NEXT: packuswb %xmm0, %xmm2
-; X64-NEXT: movq %xmm2, (%rdi)
-; X64-NEXT: movq {{.*#+}} xmm2 = mem[0],zero
-; X64-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
-; X64-NEXT: pand %xmm1, %xmm2
-; X64-NEXT: movdqa %xmm2, %xmm1
+; X64-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; X64-NEXT: pmullw %xmm1, %xmm0
+; X64-NEXT: pand {{.*}}(%rip), %xmm0
+; X64-NEXT: packuswb %xmm0, %xmm0
+; X64-NEXT: movq %xmm0, (%rdi)
+; X64-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
; X64-NEXT: pand %xmm0, %xmm1
-; X64-NEXT: packuswb %xmm0, %xmm1
; X64-NEXT: movq %xmm1, (%rdi)
+; X64-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
+; X64-NEXT: por %xmm1, %xmm0
+; X64-NEXT: movq %xmm0, (%rdi)
; X64-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
-; X64-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
-; X64-NEXT: por %xmm2, %xmm1
-; X64-NEXT: movdqa %xmm1, %xmm2
-; X64-NEXT: pand %xmm0, %xmm2
-; X64-NEXT: packuswb %xmm0, %xmm2
-; X64-NEXT: movq %xmm2, (%rdi)
-; X64-NEXT: movq {{.*#+}} xmm2 = mem[0],zero
-; X64-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
-; X64-NEXT: pxor %xmm1, %xmm2
-; X64-NEXT: pand %xmm0, %xmm2
-; X64-NEXT: packuswb %xmm0, %xmm2
-; X64-NEXT: movq %xmm2, (%rdi)
+; X64-NEXT: pxor %xmm0, %xmm1
+; X64-NEXT: movq %xmm1, (%rdi)
; X64-NEXT: emms
; X64-NEXT: retq
entry:
define void @test1(x86_mmx* %A, x86_mmx* %B) {
; X32-LABEL: test1:
; X32: # %bb.0: # %entry
-; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
-; X32-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1,1,3]
-; X32-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero
-; X32-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1,1,3]
-; X32-NEXT: paddq %xmm0, %xmm1
-; X32-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3]
-; X32-NEXT: movq %xmm0, (%eax)
-; X32-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
-; X32-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1,1,3]
-; X32-NEXT: pmuludq %xmm1, %xmm0
-; X32-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,2,2,3]
-; X32-NEXT: movq %xmm1, (%eax)
-; X32-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero
-; X32-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1,1,3]
-; X32-NEXT: andps %xmm0, %xmm1
-; X32-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3]
-; X32-NEXT: movq %xmm0, (%eax)
-; X32-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
-; X32-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1,1,3]
-; X32-NEXT: orps %xmm1, %xmm0
-; X32-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,2,2,3]
-; X32-NEXT: movq %xmm1, (%eax)
-; X32-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero
-; X32-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1,1,3]
-; X32-NEXT: xorps %xmm0, %xmm1
-; X32-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3]
-; X32-NEXT: movq %xmm0, (%eax)
+; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
+; X32-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
+; X32-NEXT: paddd %xmm0, %xmm1
+; X32-NEXT: movq %xmm1, (%ecx)
+; X32-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
+; X32-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
+; X32-NEXT: pmuludq %xmm0, %xmm1
+; X32-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,2,3]
+; X32-NEXT: pmuludq %xmm0, %xmm2
+; X32-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
+; X32-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; X32-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; X32-NEXT: movq %xmm1, (%ecx)
+; X32-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
+; X32-NEXT: pand %xmm1, %xmm0
+; X32-NEXT: movq %xmm0, (%ecx)
+; X32-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
+; X32-NEXT: por %xmm0, %xmm1
+; X32-NEXT: movq %xmm1, (%ecx)
+; X32-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
+; X32-NEXT: pxor %xmm1, %xmm0
+; X32-NEXT: movq %xmm0, (%ecx)
; X32-NEXT: emms
; X32-NEXT: retl
;
; X64-LABEL: test1:
; X64: # %bb.0: # %entry
; X64-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
-; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3]
; X64-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
-; X64-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,1,3]
-; X64-NEXT: paddq %xmm0, %xmm1
-; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3]
-; X64-NEXT: movq %xmm0, (%rdi)
-; X64-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
-; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3]
-; X64-NEXT: pmuludq %xmm1, %xmm0
-; X64-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,2,2,3]
+; X64-NEXT: paddd %xmm0, %xmm1
; X64-NEXT: movq %xmm1, (%rdi)
-; X64-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
-; X64-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,1,3]
-; X64-NEXT: pand %xmm0, %xmm1
-; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3]
-; X64-NEXT: movq %xmm0, (%rdi)
; X64-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
-; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3]
-; X64-NEXT: por %xmm1, %xmm0
-; X64-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,2,2,3]
+; X64-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
+; X64-NEXT: pmuludq %xmm0, %xmm1
+; X64-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; X64-NEXT: pmuludq %xmm2, %xmm0
+; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; X64-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; X64-NEXT: movq %xmm1, (%rdi)
+; X64-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
+; X64-NEXT: pand %xmm1, %xmm0
+; X64-NEXT: movq %xmm0, (%rdi)
; X64-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
-; X64-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,1,3]
-; X64-NEXT: pxor %xmm0, %xmm1
-; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3]
+; X64-NEXT: por %xmm0, %xmm1
+; X64-NEXT: movq %xmm1, (%rdi)
+; X64-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
+; X64-NEXT: pxor %xmm1, %xmm0
; X64-NEXT: movq %xmm0, (%rdi)
; X64-NEXT: emms
; X64-NEXT: retq
; X32-NEXT: .cfi_offset %ebp, -8
; X32-NEXT: movl %esp, %ebp
; X32-NEXT: .cfi_def_cfa_register %ebp
-; X32-NEXT: andl $-8, %esp
-; X32-NEXT: subl $24, %esp
+; X32-NEXT: andl $-16, %esp
+; X32-NEXT: subl $64, %esp
; X32-NEXT: movl 12(%ebp), %ecx
; X32-NEXT: movl 8(%ebp), %eax
; X32-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
; X32-NEXT: movq %mm0, (%eax)
; X32-NEXT: paddusw (%ecx), %mm0
; X32-NEXT: movq %mm0, {{[0-9]+}}(%esp)
-; X32-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
+; X32-NEXT: movdqa {{[0-9]+}}(%esp), %xmm0
; X32-NEXT: movq %mm0, (%eax)
; X32-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
; X32-NEXT: psubw %xmm1, %xmm0
; X32-NEXT: movq %mm0, (%eax)
; X32-NEXT: psubusw (%ecx), %mm0
; X32-NEXT: movq %mm0, {{[0-9]+}}(%esp)
-; X32-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
; X32-NEXT: movq %mm0, (%eax)
-; X32-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
-; X32-NEXT: pmullw %xmm0, %xmm1
-; X32-NEXT: movdq2q %xmm1, %mm0
-; X32-NEXT: movq %xmm1, (%eax)
+; X32-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
+; X32-NEXT: pmullw {{[0-9]+}}(%esp), %xmm0
+; X32-NEXT: movdq2q %xmm0, %mm0
+; X32-NEXT: movq %xmm0, (%eax)
; X32-NEXT: pmulhw (%ecx), %mm0
; X32-NEXT: movq %mm0, (%eax)
; X32-NEXT: pmaddwd (%ecx), %mm0
; X32-NEXT: movq %mm0, (%esp)
-; X32-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
-; X32-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
; X32-NEXT: movq %mm0, (%eax)
-; X32-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
-; X32-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; X32-NEXT: pand %xmm0, %xmm1
-; X32-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[0,2,2,3,4,5,6,7]
-; X32-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
-; X32-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; X32-NEXT: movq %xmm0, (%eax)
-; X32-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
-; X32-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
-; X32-NEXT: por %xmm1, %xmm0
-; X32-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[0,2,2,3,4,5,6,7]
-; X32-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7]
-; X32-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; X32-NEXT: movq %xmm1, (%eax)
-; X32-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
-; X32-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; X32-NEXT: pxor %xmm0, %xmm1
-; X32-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[0,2,2,3,4,5,6,7]
-; X32-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
-; X32-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; X32-NEXT: movq %xmm0, (%eax)
+; X32-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
+; X32-NEXT: andps (%esp), %xmm0
+; X32-NEXT: movlps %xmm0, (%eax)
+; X32-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero
+; X32-NEXT: orps %xmm0, %xmm1
+; X32-NEXT: movlps %xmm1, (%eax)
+; X32-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
+; X32-NEXT: xorps %xmm1, %xmm0
+; X32-NEXT: movlps %xmm0, (%eax)
; X32-NEXT: emms
; X32-NEXT: movl %ebp, %esp
; X32-NEXT: popl %ebp
; X64-NEXT: movq %mm0, (%rdi)
; X64-NEXT: paddusw (%rsi), %mm0
; X64-NEXT: movq %mm0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
+; X64-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm0
; X64-NEXT: movq %mm0, (%rdi)
; X64-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
; X64-NEXT: psubw %xmm1, %xmm0
; X64-NEXT: movq %mm0, (%rdi)
; X64-NEXT: psubusw (%rsi), %mm0
; X64-NEXT: movq %mm0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
; X64-NEXT: movq %mm0, (%rdi)
-; X64-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
-; X64-NEXT: pmullw %xmm0, %xmm1
-; X64-NEXT: movdq2q %xmm1, %mm0
-; X64-NEXT: movq %xmm1, (%rdi)
+; X64-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
+; X64-NEXT: pmullw -{{[0-9]+}}(%rsp), %xmm0
+; X64-NEXT: movdq2q %xmm0, %mm0
+; X64-NEXT: movq %xmm0, (%rdi)
; X64-NEXT: pmulhw (%rsi), %mm0
; X64-NEXT: movq %mm0, (%rdi)
; X64-NEXT: pmaddwd (%rsi), %mm0
; X64-NEXT: movq %mm0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
-; X64-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
; X64-NEXT: movq %mm0, (%rdi)
-; X64-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
-; X64-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; X64-NEXT: pand %xmm0, %xmm1
-; X64-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[0,2,2,3,4,5,6,7]
-; X64-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
-; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; X64-NEXT: movq %xmm0, (%rdi)
-; X64-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
-; X64-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
-; X64-NEXT: por %xmm1, %xmm0
-; X64-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[0,2,2,3,4,5,6,7]
-; X64-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7]
-; X64-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; X64-NEXT: movq %xmm1, (%rdi)
-; X64-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
-; X64-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; X64-NEXT: pxor %xmm0, %xmm1
-; X64-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[0,2,2,3,4,5,6,7]
-; X64-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
-; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; X64-NEXT: movq %xmm0, (%rdi)
+; X64-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
+; X64-NEXT: andps -{{[0-9]+}}(%rsp), %xmm0
+; X64-NEXT: movlps %xmm0, (%rdi)
+; X64-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero
+; X64-NEXT: orps %xmm0, %xmm1
+; X64-NEXT: movlps %xmm1, (%rdi)
+; X64-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
+; X64-NEXT: xorps %xmm1, %xmm0
+; X64-NEXT: movlps %xmm0, (%rdi)
; X64-NEXT: emms
; X64-NEXT: retq
entry:
; X32-LABEL: test3:
; X32: # %bb.0: # %entry
; X32-NEXT: pushl %ebp
-; X32-NEXT: movl %esp, %ebp
; X32-NEXT: pushl %ebx
; X32-NEXT: pushl %edi
; X32-NEXT: pushl %esi
-; X32-NEXT: andl $-8, %esp
-; X32-NEXT: subl $16, %esp
-; X32-NEXT: cmpl $0, 16(%ebp)
+; X32-NEXT: cmpl $0, {{[0-9]+}}(%esp)
; X32-NEXT: je .LBB3_1
; X32-NEXT: # %bb.2: # %bb26.preheader
+; X32-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X32-NEXT: movl {{[0-9]+}}(%esp), %edi
; X32-NEXT: xorl %ebx, %ebx
; X32-NEXT: xorl %eax, %eax
; X32-NEXT: xorl %edx, %edx
; X32-NEXT: .p2align 4, 0x90
; X32-NEXT: .LBB3_3: # %bb26
; X32-NEXT: # =>This Inner Loop Header: Depth=1
-; X32-NEXT: movl 8(%ebp), %ecx
-; X32-NEXT: movl %ecx, %esi
-; X32-NEXT: movl (%ecx,%ebx,8), %ecx
-; X32-NEXT: movl 4(%esi,%ebx,8), %esi
-; X32-NEXT: movl 12(%ebp), %edi
-; X32-NEXT: addl (%edi,%ebx,8), %ecx
-; X32-NEXT: adcl 4(%edi,%ebx,8), %esi
-; X32-NEXT: addl %eax, %ecx
-; X32-NEXT: movl %ecx, (%esp)
-; X32-NEXT: adcl %edx, %esi
-; X32-NEXT: movl %esi, {{[0-9]+}}(%esp)
-; X32-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
-; X32-NEXT: movd %xmm0, %eax
-; X32-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,0,1]
-; X32-NEXT: movd %xmm0, %edx
+; X32-NEXT: movl (%edi,%ebx,8), %ebp
+; X32-NEXT: movl 4(%edi,%ebx,8), %ecx
+; X32-NEXT: addl (%esi,%ebx,8), %ebp
+; X32-NEXT: adcl 4(%esi,%ebx,8), %ecx
+; X32-NEXT: addl %ebp, %eax
+; X32-NEXT: adcl %ecx, %edx
; X32-NEXT: incl %ebx
-; X32-NEXT: cmpl 16(%ebp), %ebx
+; X32-NEXT: cmpl {{[0-9]+}}(%esp), %ebx
; X32-NEXT: jb .LBB3_3
; X32-NEXT: jmp .LBB3_4
; X32-NEXT: .LBB3_1:
; X32-NEXT: xorl %eax, %eax
; X32-NEXT: xorl %edx, %edx
; X32-NEXT: .LBB3_4: # %bb31
-; X32-NEXT: leal -12(%ebp), %esp
; X32-NEXT: popl %esi
; X32-NEXT: popl %edi
; X32-NEXT: popl %ebx
; X86: # %bb.0:
; X86-NEXT: pushl %ebp
; X86-NEXT: movl %esp, %ebp
-; X86-NEXT: andl $-8, %esp
-; X86-NEXT: subl $8, %esp
+; X86-NEXT: andl $-16, %esp
+; X86-NEXT: subl $32, %esp
; X86-NEXT: movl 8(%ebp), %eax
; X86-NEXT: movq (%eax), %mm0
; X86-NEXT: paddd %mm0, %mm0
; PR6399
define <2 x i32> @_mul2xi32a(<2 x i32>, <2 x i32>) {
-; SSE-LABEL: _mul2xi32a:
-; SSE: # %bb.0:
-; SSE-NEXT: pmuludq %xmm1, %xmm0
-; SSE-NEXT: retq
+; SSE2-LABEL: _mul2xi32a:
+; SSE2: # %bb.0:
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
+; SSE2-NEXT: pmuludq %xmm1, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; SSE2-NEXT: pmuludq %xmm2, %xmm1
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE2-NEXT: retq
+;
+; SSE42-LABEL: _mul2xi32a:
+; SSE42: # %bb.0:
+; SSE42-NEXT: pmulld %xmm1, %xmm0
+; SSE42-NEXT: retq
;
; AVX-LABEL: _mul2xi32a:
; AVX: # %bb.0:
-; AVX-NEXT: vpmuludq %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vpmulld %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
%r = mul <2 x i32> %0, %1
ret <2 x i32> %r
}
define <2 x i32> @_mul2xi32b(<2 x i32>, <2 x i32>) {
-; SSE2-LABEL: _mul2xi32b:
-; SSE2: # %bb.0:
-; SSE2-NEXT: pmuludq %xmm1, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3]
-; SSE2-NEXT: retq
-;
-; SSE42-LABEL: _mul2xi32b:
-; SSE42: # %bb.0:
-; SSE42-NEXT: pmuludq %xmm1, %xmm0
-; SSE42-NEXT: pmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
-; SSE42-NEXT: retq
+; SSE-LABEL: _mul2xi32b:
+; SSE: # %bb.0:
+; SSE-NEXT: pmuludq %xmm1, %xmm0
+; SSE-NEXT: retq
;
; AVX-LABEL: _mul2xi32b:
; AVX: # %bb.0:
; AVX-NEXT: vpmuludq %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
; AVX-NEXT: retq
%factor0 = shufflevector <2 x i32> %0, <2 x i32> undef, <4 x i32> <i32 0, i32 undef, i32 2, i32 undef>
%factor1 = shufflevector <2 x i32> %1, <2 x i32> undef, <4 x i32> <i32 0, i32 undef, i32 2, i32 undef>
;
; AVX1-LABEL: _mul4xi32toi64a:
; AVX1: # %bb.0:
-; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[2,2,3,3]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[2,2,3,3]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[2,1,3,3]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[2,1,3,3]
; AVX1-NEXT: vpmuludq %xmm2, %xmm3, %xmm2
; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
define void @v3i32(<2 x i32> %a, <2 x i32> %b, <3 x i32>* %p) nounwind {
; SSE2-LABEL: v3i32:
; SSE2: # %bb.0:
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,2,3]
; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE2-NEXT: movd %xmm2, 8(%rdi)
; SSE2-NEXT: movq %xmm0, (%rdi)
;
; SSE42-LABEL: v3i32:
; SSE42: # %bb.0:
-; SSE42-NEXT: extractps $2, %xmm0, 8(%rdi)
+; SSE42-NEXT: extractps $1, %xmm0, 8(%rdi)
; SSE42-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE42-NEXT: movlps %xmm0, (%rdi)
; SSE42-NEXT: retq
; AVX-LABEL: v3i32:
; AVX: # %bb.0:
; AVX-NEXT: vunpcklps {{.*#+}} xmm1 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; AVX-NEXT: vextractps $2, %xmm0, 8(%rdi)
+; AVX-NEXT: vextractps $1, %xmm0, 8(%rdi)
; AVX-NEXT: vmovlps %xmm1, (%rdi)
; AVX-NEXT: retq
;
; XOP-LABEL: v3i32:
; XOP: # %bb.0:
; XOP-NEXT: vunpcklps {{.*#+}} xmm1 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; XOP-NEXT: vextractps $2, %xmm0, 8(%rdi)
+; XOP-NEXT: vextractps $1, %xmm0, 8(%rdi)
; XOP-NEXT: vmovlps %xmm1, (%rdi)
; XOP-NEXT: retq
%r = shufflevector <2 x i32> %a, <2 x i32> %b, <3 x i32> <i32 0, i32 2, i32 1>
define void @v5i16(<4 x i16> %a, <4 x i16> %b, <5 x i16>* %p) nounwind {
; SSE2-LABEL: v5i16:
; SSE2: # %bb.0:
-; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm0[0,2,2,3,4,5,6,7]
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,2,3]
-; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,0,2,3,4,5,6,7]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
-; SSE2-NEXT: pextrw $6, %xmm0, %eax
+; SSE2-NEXT: psrlq $16, %xmm1
+; SSE2-NEXT: pextrw $3, %xmm0, %eax
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; SSE2-NEXT: movw %ax, 8(%rdi)
-; SSE2-NEXT: movq %xmm2, (%rdi)
+; SSE2-NEXT: movq %xmm0, (%rdi)
; SSE2-NEXT: retq
;
; SSE42-LABEL: v5i16:
; SSE42: # %bb.0:
-; SSE42-NEXT: pshuflw {{.*#+}} xmm2 = xmm0[0,2,2,3,4,5,6,7]
-; SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,2,3]
-; SSE42-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,0,2,3,4,5,6,7]
-; SSE42-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
-; SSE42-NEXT: pextrw $6, %xmm0, 8(%rdi)
-; SSE42-NEXT: movq %xmm2, (%rdi)
+; SSE42-NEXT: psrlq $16, %xmm1
+; SSE42-NEXT: pextrw $3, %xmm0, 8(%rdi)
+; SSE42-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; SSE42-NEXT: movq %xmm0, (%rdi)
; SSE42-NEXT: retq
;
-; AVX1-LABEL: v5i16:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vpshuflw {{.*#+}} xmm2 = xmm0[0,2,2,3,4,5,6,7]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,1,2,3]
-; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[2,0,2,3,4,5,6,7]
-; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
-; AVX1-NEXT: vpextrw $6, %xmm0, 8(%rdi)
-; AVX1-NEXT: vmovq %xmm1, (%rdi)
-; AVX1-NEXT: retq
-;
-; AVX2-SLOW-LABEL: v5i16:
-; AVX2-SLOW: # %bb.0:
-; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm0[0,2,2,3,4,5,6,7]
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,1,2,3]
-; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[2,0,2,3,4,5,6,7]
-; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
-; AVX2-SLOW-NEXT: vpextrw $6, %xmm0, 8(%rdi)
-; AVX2-SLOW-NEXT: vmovq %xmm1, (%rdi)
-; AVX2-SLOW-NEXT: retq
-;
-; AVX2-FAST-LABEL: v5i16:
-; AVX2-FAST: # %bb.0:
-; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[4,5,8,9,4,5,6,7,8,9,10,11,12,13,14,15]
-; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm2 = xmm0[0,2,2,3,4,5,6,7]
-; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
-; AVX2-FAST-NEXT: vpextrw $6, %xmm0, 8(%rdi)
-; AVX2-FAST-NEXT: vmovq %xmm1, (%rdi)
-; AVX2-FAST-NEXT: retq
+; AVX-LABEL: v5i16:
+; AVX: # %bb.0:
+; AVX-NEXT: vpsrlq $16, %xmm1, %xmm1
+; AVX-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX-NEXT: vpextrw $3, %xmm0, 8(%rdi)
+; AVX-NEXT: vmovq %xmm1, (%rdi)
+; AVX-NEXT: retq
;
; XOP-LABEL: v5i16:
; XOP: # %bb.0:
-; XOP-NEXT: vpperm {{.*#+}} xmm1 = xmm0[0,1],xmm1[4,5],xmm0[4,5],xmm1[8,9],xmm0[4,5],xmm1[4,5],xmm0[6,7],xmm1[6,7]
-; XOP-NEXT: vpextrw $6, %xmm0, 8(%rdi)
+; XOP-NEXT: vpsrlq $16, %xmm1, %xmm1
+; XOP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; XOP-NEXT: vpextrw $3, %xmm0, 8(%rdi)
; XOP-NEXT: vmovq %xmm1, (%rdi)
; XOP-NEXT: retq
%r = shufflevector <4 x i16> %a, <4 x i16> %b, <5 x i32> <i32 0, i32 5, i32 1, i32 6, i32 3>
define void @v7i8(<4 x i8> %a, <4 x i8> %b, <7 x i8>* %p) nounwind {
; SSE2-LABEL: v7i8:
; SSE2: # %bb.0:
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,1,3]
-; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [0,65535,0,65535,0,65535,65535,65535]
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,0,3]
-; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,0,2,0,4,5,6,7]
-; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,4,7]
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,3,1,3,4,5,6,7]
+; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [0,255,0,255,0,255,255,255,255,255,255,255,255,255,255,255]
+; SSE2-NEXT: movdqa %xmm1, -{{[0-9]+}}(%rsp)
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,2,3,0,4,5,6,7]
; SSE2-NEXT: pand %xmm2, %xmm1
; SSE2-NEXT: pandn %xmm0, %xmm2
; SSE2-NEXT: por %xmm1, %xmm2
-; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
-; SSE2-NEXT: pand %xmm2, %xmm0
-; SSE2-NEXT: packuswb %xmm0, %xmm0
-; SSE2-NEXT: movdqa %xmm2, -{{[0-9]+}}(%rsp)
; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
; SSE2-NEXT: movb %al, 6(%rdi)
-; SSE2-NEXT: movd %xmm0, (%rdi)
-; SSE2-NEXT: pextrw $2, %xmm0, %eax
+; SSE2-NEXT: movd %xmm2, (%rdi)
+; SSE2-NEXT: pextrw $2, %xmm2, %eax
; SSE2-NEXT: movw %ax, 4(%rdi)
; SSE2-NEXT: retq
;
; SSE42-LABEL: v7i8:
; SSE42: # %bb.0:
-; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,1,3]
; SSE42-NEXT: pextrb $0, %xmm1, 6(%rdi)
-; SSE42-NEXT: pshufb {{.*#+}} xmm1 = xmm1[8,9,8,9,4,5,8,9,0,1,12,13,0,1,14,15]
-; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5,6,7]
-; SSE42-NEXT: pshufb {{.*#+}} xmm1 = xmm1[0,2,4,6,8,10,12,u,u,u,u,u,u,u,u,u]
+; SSE42-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; SSE42-NEXT: pshufb {{.*#+}} xmm1 = xmm1[1,4,7,4,3,6,0,u,u,u,u,u,u,u,u,u]
; SSE42-NEXT: pextrw $2, %xmm1, 4(%rdi)
; SSE42-NEXT: movd %xmm1, (%rdi)
; SSE42-NEXT: retq
;
; AVX-LABEL: v7i8:
; AVX: # %bb.0:
-; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,3,1,3]
-; AVX-NEXT: vpshufb {{.*#+}} xmm2 = xmm1[8,9,8,9,4,5,8,9,0,1,12,13,0,1,14,15]
-; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4],xmm2[5,6,7]
-; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,u,u,u,u,u,u,u,u,u]
+; AVX-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[1,4,7,4,3,6,0,u,u,u,u,u,u,u,u,u]
; AVX-NEXT: vpextrb $0, %xmm1, 6(%rdi)
; AVX-NEXT: vpextrw $2, %xmm0, 4(%rdi)
; AVX-NEXT: vmovd %xmm0, (%rdi)
;
; XOP-LABEL: v7i8:
; XOP: # %bb.0:
-; XOP-NEXT: vpperm {{.*#+}} xmm0 = xmm0[0],xmm1[8],xmm0[12],xmm1[8],xmm0[4],xmm1[12,0,u,u,u,u,u,u,u,u,u]
+; XOP-NEXT: vpperm {{.*#+}} xmm0 = xmm0[0],xmm1[2],xmm0[3],xmm1[2],xmm0[1],xmm1[3,0,u,u,u,u,u,u,u,u,u]
; XOP-NEXT: vpextrb $0, %xmm1, 6(%rdi)
; XOP-NEXT: vpextrw $2, %xmm0, 4(%rdi)
; XOP-NEXT: vmovd %xmm0, (%rdi)
define void @v7i16(<4 x i16> %a, <4 x i16> %b, <7 x i16>* %p) nounwind {
; SSE2-LABEL: v7i16:
; SSE2: # %bb.0:
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,1,3]
-; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [0,65535,0,65535,0,65535,65535,65535]
-; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,1,0,3]
-; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,0,2,0,4,5,6,7]
-; SSE2-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,6,4,7]
-; SSE2-NEXT: pand %xmm2, %xmm3
-; SSE2-NEXT: pandn %xmm0, %xmm2
-; SSE2-NEXT: por %xmm3, %xmm2
; SSE2-NEXT: movd %xmm1, %eax
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[0,1,0,3,4,5,6,7]
+; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,7]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,2,3,2,4,5,6,7]
+; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,5,6,4,7]
; SSE2-NEXT: movw %ax, 12(%rdi)
-; SSE2-NEXT: movq %xmm2, (%rdi)
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
+; SSE2-NEXT: movq %xmm0, (%rdi)
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
; SSE2-NEXT: movd %xmm0, 8(%rdi)
; SSE2-NEXT: retq
;
; SSE42-LABEL: v7i16:
; SSE42: # %bb.0:
-; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,1,3]
; SSE42-NEXT: pextrw $0, %xmm1, 12(%rdi)
-; SSE42-NEXT: pshufb {{.*#+}} xmm1 = xmm1[8,9,8,9,4,5,8,9,0,1,12,13,0,1,14,15]
-; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5,6,7]
+; SSE42-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; SSE42-NEXT: pshufb {{.*#+}} xmm1 = xmm1[2,3,8,9,14,15,8,9,6,7,12,13,0,1,14,15]
; SSE42-NEXT: pextrd $2, %xmm1, 8(%rdi)
; SSE42-NEXT: movq %xmm1, (%rdi)
; SSE42-NEXT: retq
;
; AVX-LABEL: v7i16:
; AVX: # %bb.0:
-; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,3,1,3]
-; AVX-NEXT: vpshufb {{.*#+}} xmm2 = xmm1[8,9,8,9,4,5,8,9,0,1,12,13,0,1,14,15]
-; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4],xmm2[5,6,7]
+; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,3,8,9,14,15,8,9,6,7,12,13,0,1,14,15]
; AVX-NEXT: vpextrw $0, %xmm1, 12(%rdi)
; AVX-NEXT: vpextrd $2, %xmm0, 8(%rdi)
; AVX-NEXT: vmovq %xmm0, (%rdi)
;
; XOP-LABEL: v7i16:
; XOP: # %bb.0:
-; XOP-NEXT: vpperm {{.*#+}} xmm0 = xmm0[0,1],xmm1[8,9],xmm0[12,13],xmm1[8,9],xmm0[4,5],xmm1[12,13,0,1,14,15]
+; XOP-NEXT: vpperm {{.*#+}} xmm0 = xmm0[0,1],xmm1[4,5],xmm0[6,7],xmm1[4,5],xmm0[2,3],xmm1[6,7,0,1],xmm0[6,7]
; XOP-NEXT: vpextrw $0, %xmm1, 12(%rdi)
; XOP-NEXT: vpextrd $2, %xmm0, 8(%rdi)
; XOP-NEXT: vmovq %xmm0, (%rdi)
define void @v12i8(<8 x i8> %a, <8 x i8> %b, <12 x i8>* %p) nounwind {
; SSE2-LABEL: v12i8:
; SSE2: # %bb.0:
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,0,2,3,4,5,6,7]
-; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7]
-; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [255,255,0,255,255,0,255,255,0,255,255,0,255,255,255,255]
-; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[3,1,2,3]
-; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,3,1,3,4,5,6,7]
+; SSE2-NEXT: pxor %xmm2, %xmm2
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[3,1,2,3]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,3,1,3,4,5,6,7]
; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,4,7]
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,2,1]
; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,1,4,5,6,7]
; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,6,4]
-; SSE2-NEXT: packuswb %xmm3, %xmm0
+; SSE2-NEXT: packuswb %xmm2, %xmm0
+; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [255,255,0,255,255,0,255,255,0,255,255,0,255,255,255,255]
; SSE2-NEXT: pand %xmm2, %xmm0
+; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,0,1,1,4,5,6,7]
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,1,3]
; SSE2-NEXT: pandn %xmm1, %xmm2
; SSE2-NEXT: por %xmm0, %xmm2
; SSE2-NEXT: movq %xmm2, (%rdi)
;
; SSE42-LABEL: v12i8:
; SSE42: # %bb.0:
-; SSE42-NEXT: pshufb {{.*#+}} xmm1 = zero,zero,xmm1[0],zero,zero,xmm1[2],zero,zero,xmm1[4],zero,zero,xmm1[6,u,u,u,u]
-; SSE42-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,8],zero,xmm0[2,10],zero,xmm0[4,12],zero,xmm0[6,14],zero,xmm0[u,u,u,u]
-; SSE42-NEXT: por %xmm1, %xmm0
+; SSE42-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; SSE42-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,8,1,2,10,3,4,12,5,6,14,7,u,u,u,u]
; SSE42-NEXT: pextrd $2, %xmm0, 8(%rdi)
; SSE42-NEXT: movq %xmm0, (%rdi)
; SSE42-NEXT: retq
;
; AVX-LABEL: v12i8:
; AVX: # %bb.0:
-; AVX-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,xmm1[0],zero,zero,xmm1[2],zero,zero,xmm1[4],zero,zero,xmm1[6,u,u,u,u]
-; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,8],zero,xmm0[2,10],zero,xmm0[4,12],zero,xmm0[6,14],zero,xmm0[u,u,u,u]
-; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,8,1,2,10,3,4,12,5,6,14,7,u,u,u,u]
; AVX-NEXT: vpextrd $2, %xmm0, 8(%rdi)
; AVX-NEXT: vmovq %xmm0, (%rdi)
; AVX-NEXT: retq
;
; XOP-LABEL: v12i8:
; XOP: # %bb.0:
-; XOP-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,xmm1[0],zero,zero,xmm1[2],zero,zero,xmm1[4],zero,zero,xmm1[6,u,u,u,u]
-; XOP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,8],zero,xmm0[2,10],zero,xmm0[4,12],zero,xmm0[6,14],zero,xmm0[u,u,u,u]
-; XOP-NEXT: vpor %xmm1, %xmm0, %xmm0
+; XOP-NEXT: vpperm {{.*#+}} xmm0 = xmm0[0,4],xmm1[0],xmm0[1,5],xmm1[1],xmm0[2,6],xmm1[2],xmm0[3,7],xmm1[3],xmm0[u,u,u,u]
; XOP-NEXT: vpextrd $2, %xmm0, 8(%rdi)
; XOP-NEXT: vmovq %xmm0, (%rdi)
; XOP-NEXT: retq
define void @pr29025(<4 x i8> %a, <4 x i8> %b, <4 x i8> %c, <12 x i8> *%p) nounwind {
; SSE2-LABEL: pr29025:
; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
-; SSE2-NEXT: pand %xmm3, %xmm1
-; SSE2-NEXT: pand %xmm3, %xmm0
-; SSE2-NEXT: packuswb %xmm1, %xmm0
-; SSE2-NEXT: packuswb %xmm0, %xmm0
+; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE2-NEXT: pxor %xmm1, %xmm1
; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[3,1,2,3]
; SSE2-NEXT: packuswb %xmm1, %xmm0
; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [255,255,0,255,255,0,255,255,0,255,255,0,255,255,255,255]
; SSE2-NEXT: pand %xmm1, %xmm0
-; SSE2-NEXT: pand %xmm3, %xmm2
-; SSE2-NEXT: packuswb %xmm2, %xmm2
-; SSE2-NEXT: packuswb %xmm2, %xmm2
; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,0,1,1,4,5,6,7]
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,1,3]
; SSE2-NEXT: pandn %xmm2, %xmm1
;
; SSE42-LABEL: pr29025:
; SSE42: # %bb.0:
-; SSE42-NEXT: movdqa {{.*#+}} xmm3 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
-; SSE42-NEXT: pshufb %xmm3, %xmm1
-; SSE42-NEXT: pshufb %xmm3, %xmm0
; SSE42-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; SSE42-NEXT: pshufb %xmm3, %xmm2
; SSE42-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
; SSE42-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,8,1,2,10,3,4,12,5,6,14,7,u,u,u,u]
; SSE42-NEXT: pextrd $2, %xmm0, 8(%rdi)
;
; AVX-LABEL: pr29025:
; AVX: # %bb.0:
-; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX-NEXT: vpshufb %xmm3, %xmm1, %xmm1
-; AVX-NEXT: vpshufb %xmm3, %xmm0, %xmm0
; AVX-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; AVX-NEXT: vpshufb %xmm3, %xmm2, %xmm1
-; AVX-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; AVX-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,8,1,2,10,3,4,12,5,6,14,7,u,u,u,u]
; AVX-NEXT: vpextrd $2, %xmm0, 8(%rdi)
; AVX-NEXT: vmovq %xmm0, (%rdi)
;
; XOP-LABEL: pr29025:
; XOP: # %bb.0:
-; XOP-NEXT: vpperm {{.*#+}} xmm0 = xmm0[0,4,8,12],xmm1[0,4,8,12],xmm0[u,u,u,u,u,u,u,u]
-; XOP-NEXT: vpperm {{.*#+}} xmm0 = xmm0[0,4],xmm2[0],xmm0[1,5],xmm2[4],xmm0[2,6],xmm2[8],xmm0[3,7],xmm2[12],xmm0[u,u,u,u]
+; XOP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; XOP-NEXT: vpperm {{.*#+}} xmm0 = xmm0[0,4],xmm2[0],xmm0[1,5],xmm2[1],xmm0[2,6],xmm2[2],xmm0[3,7],xmm2[3],xmm0[u,u,u,u]
; XOP-NEXT: vpextrd $2, %xmm0, 8(%rdi)
; XOP-NEXT: vmovq %xmm0, (%rdi)
; XOP-NEXT: retq
; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,3,2,1,4,5,6,7]
; SSE2-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,6,5,4,7]
; SSE2-NEXT: packuswb %xmm0, %xmm4
-; SSE2-NEXT: movq %xmm4, (%rsi)
-; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [0,255,255,0,255,255,0,255,255,255,255,255,255,255,255,255]
-; SSE2-NEXT: movdqa %xmm0, %xmm5
-; SSE2-NEXT: pand %xmm4, %xmm5
-; SSE2-NEXT: pandn %xmm1, %xmm4
-; SSE2-NEXT: por %xmm5, %xmm4
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3],xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7]
-; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [65535,65535,0,65535,65535,0,65535,65535]
-; SSE2-NEXT: pand %xmm5, %xmm4
-; SSE2-NEXT: pandn %xmm3, %xmm5
-; SSE2-NEXT: por %xmm4, %xmm5
-; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm5[2,1,0,3,4,5,6,7]
-; SSE2-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,6,5,4,7]
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,3,2,1]
-; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[1,2,3,0,4,5,6,7]
-; SSE2-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,6,7,4]
-; SSE2-NEXT: packuswb %xmm0, %xmm4
-; SSE2-NEXT: movq %xmm4, (%rdx)
-; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [255,0,255,255,0,255,255,0,255,255,255,255,255,255,255,255]
-; SSE2-NEXT: pand %xmm4, %xmm0
-; SSE2-NEXT: pandn %xmm1, %xmm4
-; SSE2-NEXT: por %xmm0, %xmm4
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3],xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7]
+; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [0,255,255,0,255,255,0,255,255,255,255,255,255,255,255,255]
+; SSE2-NEXT: movdqa %xmm0, %xmm6
+; SSE2-NEXT: pand %xmm5, %xmm6
+; SSE2-NEXT: pandn %xmm1, %xmm5
+; SSE2-NEXT: por %xmm6, %xmm5
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3],xmm5[4],xmm2[4],xmm5[5],xmm2[5],xmm5[6],xmm2[6],xmm5[7],xmm2[7]
+; SSE2-NEXT: movdqa {{.*#+}} xmm6 = [65535,65535,0,65535,65535,0,65535,65535]
+; SSE2-NEXT: pand %xmm6, %xmm5
+; SSE2-NEXT: pandn %xmm3, %xmm6
+; SSE2-NEXT: por %xmm5, %xmm6
+; SSE2-NEXT: pshuflw {{.*#+}} xmm5 = xmm6[2,1,0,3,4,5,6,7]
+; SSE2-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,6,5,4,7]
+; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,3,2,1]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[1,2,3,0,4,5,6,7]
+; SSE2-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,5,6,7,4]
+; SSE2-NEXT: packuswb %xmm0, %xmm5
+; SSE2-NEXT: movdqa {{.*#+}} xmm6 = [255,0,255,255,0,255,255,0,255,255,255,255,255,255,255,255]
+; SSE2-NEXT: pand %xmm6, %xmm0
+; SSE2-NEXT: pandn %xmm1, %xmm6
+; SSE2-NEXT: por %xmm0, %xmm6
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm2[0],xmm6[1],xmm2[1],xmm6[2],xmm2[2],xmm6[3],xmm2[3],xmm6[4],xmm2[4],xmm6[5],xmm2[5],xmm6[6],xmm2[6],xmm6[7],xmm2[7]
; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [0,65535,65535,0,65535,65535,0,65535]
-; SSE2-NEXT: pand %xmm0, %xmm4
+; SSE2-NEXT: pand %xmm0, %xmm6
; SSE2-NEXT: pandn %xmm3, %xmm0
-; SSE2-NEXT: por %xmm4, %xmm0
+; SSE2-NEXT: por %xmm6, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,0]
; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,5]
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,0]
; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,1,0,3,4,5,6,7]
; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,4,7]
; SSE2-NEXT: packuswb %xmm0, %xmm0
+; SSE2-NEXT: movq %xmm4, (%rsi)
+; SSE2-NEXT: movq %xmm5, (%rdx)
; SSE2-NEXT: movq %xmm0, (%rcx)
; SSE2-NEXT: retq
;
; SSE42-NEXT: movdqa %xmm0, %xmm3
; SSE42-NEXT: pshufb {{.*#+}} xmm3 = xmm3[0,3,6,9,12,15],zero,zero,xmm3[u,u,u,u,u,u,u,u]
; SSE42-NEXT: por %xmm2, %xmm3
-; SSE42-NEXT: movq %xmm3, (%rsi)
; SSE42-NEXT: movdqa %xmm1, %xmm2
; SSE42-NEXT: pshufb {{.*#+}} xmm2 = zero,zero,zero,zero,zero,xmm2[0,3,6,u,u,u,u,u,u,u,u]
-; SSE42-NEXT: movdqa %xmm0, %xmm3
-; SSE42-NEXT: pshufb {{.*#+}} xmm3 = xmm3[1,4,7,10,13],zero,zero,zero,xmm3[u,u,u,u,u,u,u,u]
-; SSE42-NEXT: por %xmm2, %xmm3
-; SSE42-NEXT: movq %xmm3, (%rdx)
+; SSE42-NEXT: movdqa %xmm0, %xmm4
+; SSE42-NEXT: pshufb {{.*#+}} xmm4 = xmm4[1,4,7,10,13],zero,zero,zero,xmm4[u,u,u,u,u,u,u,u]
+; SSE42-NEXT: por %xmm2, %xmm4
; SSE42-NEXT: pshufb {{.*#+}} xmm1 = zero,zero,zero,zero,zero,xmm1[1,4,7,u,u,u,u,u,u,u,u]
; SSE42-NEXT: pshufb {{.*#+}} xmm0 = xmm0[2,5,8,11,14],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u]
; SSE42-NEXT: por %xmm1, %xmm0
+; SSE42-NEXT: movq %xmm3, (%rsi)
+; SSE42-NEXT: movq %xmm4, (%rdx)
; SSE42-NEXT: movq %xmm0, (%rcx)
; SSE42-NEXT: retq
;
; AVX-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,xmm1[2,5,u,u,u,u,u,u,u,u]
; AVX-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[0,3,6,9,12,15],zero,zero,xmm0[u,u,u,u,u,u,u,u]
; AVX-NEXT: vpor %xmm2, %xmm3, %xmm2
-; AVX-NEXT: vmovq %xmm2, (%rsi)
-; AVX-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,zero,zero,xmm1[0,3,6,u,u,u,u,u,u,u,u]
-; AVX-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[1,4,7,10,13],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u]
-; AVX-NEXT: vpor %xmm2, %xmm3, %xmm2
-; AVX-NEXT: vmovq %xmm2, (%rdx)
+; AVX-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,zero,zero,xmm1[0,3,6,u,u,u,u,u,u,u,u]
+; AVX-NEXT: vpshufb {{.*#+}} xmm4 = xmm0[1,4,7,10,13],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u]
+; AVX-NEXT: vpor %xmm3, %xmm4, %xmm3
; AVX-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,zero,zero,xmm1[1,4,7,u,u,u,u,u,u,u,u]
; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,5,8,11,14],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u]
; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vmovq %xmm2, (%rsi)
+; AVX-NEXT: vmovq %xmm3, (%rdx)
; AVX-NEXT: vmovq %xmm0, (%rcx)
; AVX-NEXT: retq
;
; XOP-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,xmm1[2,5,u,u,u,u,u,u,u,u]
; XOP-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[0,3,6,9,12,15],zero,zero,xmm0[u,u,u,u,u,u,u,u]
; XOP-NEXT: vpor %xmm2, %xmm3, %xmm2
-; XOP-NEXT: vmovq %xmm2, (%rsi)
-; XOP-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,zero,zero,xmm1[0,3,6,u,u,u,u,u,u,u,u]
-; XOP-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[1,4,7,10,13],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u]
-; XOP-NEXT: vpor %xmm2, %xmm3, %xmm2
-; XOP-NEXT: vmovq %xmm2, (%rdx)
+; XOP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,zero,zero,xmm1[0,3,6,u,u,u,u,u,u,u,u]
+; XOP-NEXT: vpshufb {{.*#+}} xmm4 = xmm0[1,4,7,10,13],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u]
+; XOP-NEXT: vpor %xmm3, %xmm4, %xmm3
; XOP-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,zero,zero,xmm1[1,4,7,u,u,u,u,u,u,u,u]
; XOP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,5,8,11,14],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u]
; XOP-NEXT: vpor %xmm1, %xmm0, %xmm0
+; XOP-NEXT: vmovq %xmm2, (%rsi)
+; XOP-NEXT: vmovq %xmm3, (%rdx)
; XOP-NEXT: vmovq %xmm0, (%rcx)
; XOP-NEXT: retq
%wide.vec = load <24 x i8>, <24 x i8>* %p, align 4
; SSE2-LABEL: insert_v7i8_v2i16_2:
; SSE2: # %bb.0:
; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
-; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,3]
-; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0,1,3]
-; SSE2-NEXT: movaps {{.*#+}} xmm1 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
-; SSE2-NEXT: andps %xmm0, %xmm1
-; SSE2-NEXT: packuswb %xmm1, %xmm1
-; SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
-; SSE2-NEXT: movb %al, 6(%rdi)
+; SSE2-NEXT: pextrw $3, %xmm1, %eax
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; SSE2-NEXT: movd %xmm1, (%rdi)
-; SSE2-NEXT: pextrw $2, %xmm1, %eax
+; SSE2-NEXT: movb %al, 6(%rdi)
+; SSE2-NEXT: pextrw $1, %xmm0, %eax
; SSE2-NEXT: movw %ax, 4(%rdi)
; SSE2-NEXT: retq
;
; SSE42: # %bb.0:
; SSE42-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE42-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
-; SSE42-NEXT: pmovzxbw {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
-; SSE42-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[3],zero,zero,zero
-; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3,4,5],xmm2[6,7]
-; SSE42-NEXT: packuswb %xmm0, %xmm0
; SSE42-NEXT: pextrb $6, %xmm1, 6(%rdi)
-; SSE42-NEXT: pextrw $2, %xmm0, 4(%rdi)
-; SSE42-NEXT: movd %xmm0, (%rdi)
+; SSE42-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; SSE42-NEXT: pextrw $1, %xmm0, 4(%rdi)
+; SSE42-NEXT: movd %xmm1, (%rdi)
; SSE42-NEXT: retq
;
; AVX1-LABEL: insert_v7i8_v2i16_2:
; AVX1: # %bb.0:
; AVX1-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; AVX1-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
-; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
-; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[3],zero,zero,zero
-; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3,4,5],xmm2[6,7]
-; AVX1-NEXT: vpackuswb %xmm0, %xmm0, %xmm0
+; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; AVX1-NEXT: vpextrb $6, %xmm1, 6(%rdi)
-; AVX1-NEXT: vpextrw $2, %xmm0, 4(%rdi)
-; AVX1-NEXT: vmovd %xmm0, (%rdi)
+; AVX1-NEXT: vpextrw $1, %xmm0, 4(%rdi)
+; AVX1-NEXT: vmovd %xmm2, (%rdi)
; AVX1-NEXT: retq
;
; AVX2-LABEL: insert_v7i8_v2i16_2:
; AVX2: # %bb.0:
; AVX2-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; AVX2-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
-; AVX2-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
-; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[3],zero,zero,zero
-; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0],xmm0[1,2],xmm2[3]
-; AVX2-NEXT: vpackuswb %xmm0, %xmm0, %xmm0
+; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; AVX2-NEXT: vpextrb $6, %xmm1, 6(%rdi)
-; AVX2-NEXT: vpextrw $2, %xmm0, 4(%rdi)
-; AVX2-NEXT: vmovd %xmm0, (%rdi)
+; AVX2-NEXT: vpextrw $1, %xmm0, 4(%rdi)
+; AVX2-NEXT: vmovd %xmm2, (%rdi)
; AVX2-NEXT: retq
;
; AVX512-LABEL: insert_v7i8_v2i16_2:
; AVX512: # %bb.0:
; AVX512-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; AVX512-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
-; AVX512-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
-; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[3],zero,zero,zero
-; AVX512-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0],xmm0[1,2],xmm2[3]
-; AVX512-NEXT: vpackuswb %xmm0, %xmm0, %xmm0
+; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; AVX512-NEXT: vpextrb $6, %xmm1, 6(%rdi)
-; AVX512-NEXT: vpextrw $2, %xmm0, 4(%rdi)
-; AVX512-NEXT: vmovd %xmm0, (%rdi)
+; AVX512-NEXT: vpextrw $1, %xmm0, 4(%rdi)
+; AVX512-NEXT: vmovd %xmm2, (%rdi)
; AVX512-NEXT: retq
;
; XOP-LABEL: insert_v7i8_v2i16_2:
; AVX1-NEXT: vmovdqa (%rsi), %xmm1
; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm3
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = <1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u>
+; AVX1-NEXT: vpshufb %xmm4, %xmm0, %xmm0
+; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm2
+; AVX1-NEXT: vpshufb %xmm4, %xmm1, %xmm1
; AVX1-NEXT: vpmovsxbd %xmm3, %xmm4
; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,1,2,3]
; AVX1-NEXT: vpmovsxbd %xmm3, %xmm3
-; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm2
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm5 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero
; AVX1-NEXT: vpmulld %xmm5, %xmm4, %xmm4
; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,2,3]
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero
; AVX1-NEXT: vpmulld %xmm2, %xmm3, %xmm2
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = <1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u>
-; AVX1-NEXT: vpshufb %xmm3, %xmm0, %xmm0
-; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm5 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm3 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
-; AVX1-NEXT: vpshufb %xmm3, %xmm1, %xmm1
-; AVX1-NEXT: vpmovsxbd %xmm1, %xmm3
-; AVX1-NEXT: vpmulld %xmm3, %xmm5, %xmm3
+; AVX1-NEXT: vpmovsxbd %xmm1, %xmm5
+; AVX1-NEXT: vpmulld %xmm5, %xmm3, %xmm3
; AVX1-NEXT: vpaddd %xmm3, %xmm4, %xmm3
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,2,3]
; AVX1-NEXT: vpmovsxbd %xmm1, %xmm1
; AVX2-NEXT: vmovdqa (%rsi), %xmm1
; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm3
-; AVX2-NEXT: vpmovsxbd %xmm3, %ymm3
+; AVX2-NEXT: vmovdqa {{.*#+}} xmm4 = <1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u>
+; AVX2-NEXT: vpshufb %xmm4, %xmm0, %xmm0
; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm2
+; AVX2-NEXT: vpshufb %xmm4, %xmm1, %xmm1
+; AVX2-NEXT: vpmovsxbd %xmm3, %ymm3
; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero,xmm2[4],zero,zero,zero,xmm2[5],zero,zero,zero,xmm2[6],zero,zero,zero,xmm2[7],zero,zero,zero
; AVX2-NEXT: vpmulld %ymm2, %ymm3, %ymm2
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = <1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u>
-; AVX2-NEXT: vpshufb %xmm3, %xmm0, %xmm0
; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
-; AVX2-NEXT: vpshufb %xmm3, %xmm1, %xmm1
; AVX2-NEXT: vpmovsxbd %xmm1, %ymm1
; AVX2-NEXT: vpmulld %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpaddd %ymm0, %ymm2, %ymm0
; AVX512-NEXT: vmovdqa (%rsi), %xmm1
; AVX512-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
; AVX512-NEXT: vpshufb %xmm2, %xmm0, %xmm3
-; AVX512-NEXT: vpmovsxbd %xmm3, %ymm3
+; AVX512-NEXT: vmovdqa {{.*#+}} xmm4 = <1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u>
+; AVX512-NEXT: vpshufb %xmm4, %xmm0, %xmm0
; AVX512-NEXT: vpshufb %xmm2, %xmm1, %xmm2
+; AVX512-NEXT: vpshufb %xmm4, %xmm1, %xmm1
+; AVX512-NEXT: vpmovsxbd %xmm3, %ymm3
; AVX512-NEXT: vpmovzxbd {{.*#+}} ymm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero,xmm2[4],zero,zero,zero,xmm2[5],zero,zero,zero,xmm2[6],zero,zero,zero,xmm2[7],zero,zero,zero
; AVX512-NEXT: vpmulld %ymm2, %ymm3, %ymm2
-; AVX512-NEXT: vmovdqa {{.*#+}} xmm3 = <1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u>
-; AVX512-NEXT: vpshufb %xmm3, %xmm0, %xmm0
; AVX512-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
-; AVX512-NEXT: vpshufb %xmm3, %xmm1, %xmm1
; AVX512-NEXT: vpmovsxbd %xmm1, %ymm1
; AVX512-NEXT: vpmulld %ymm1, %ymm0, %ymm0
; AVX512-NEXT: vpaddd %ymm0, %ymm2, %ymm0
; AVX1-NEXT: vmovdqa (%rdi), %xmm0
; AVX1-NEXT: vmovdqa (%rsi), %xmm1
; AVX1-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[1,2,5,6,9,10,13,14,u,u,u,u,u,u,u,u]
-; AVX1-NEXT: vpmovsxbd %xmm2, %xmm3
+; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,3,4,7,8,11,12,15,u,u,u,u,u,u,u,u]
+; AVX1-NEXT: vpshufb {{.*#+}} xmm3 = xmm1[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
+; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u]
+; AVX1-NEXT: vpmovsxbd %xmm2, %xmm4
; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,2,3]
; AVX1-NEXT: vpmovsxbd %xmm2, %xmm2
-; AVX1-NEXT: vpshufb {{.*#+}} xmm4 = xmm1[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
-; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm5 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero
-; AVX1-NEXT: vpmulld %xmm5, %xmm3, %xmm3
-; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[1,1,2,3]
-; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero
-; AVX1-NEXT: vpmulld %xmm4, %xmm2, %xmm2
-; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,3,4,7,8,11,12,15,u,u,u,u,u,u,u,u]
-; AVX1-NEXT: vpmovsxbd %xmm0, %xmm4
+; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm5 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero
+; AVX1-NEXT: vpmulld %xmm5, %xmm4, %xmm4
+; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,1,2,3]
+; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero
+; AVX1-NEXT: vpmulld %xmm3, %xmm2, %xmm2
+; AVX1-NEXT: vpmovsxbd %xmm0, %xmm3
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
; AVX1-NEXT: vpmovsxbd %xmm0, %xmm0
-; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u]
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm5 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
-; AVX1-NEXT: vpmulld %xmm5, %xmm4, %xmm4
-; AVX1-NEXT: vpaddd %xmm4, %xmm3, %xmm3
+; AVX1-NEXT: vpmulld %xmm5, %xmm3, %xmm3
+; AVX1-NEXT: vpaddd %xmm3, %xmm4, %xmm3
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,2,3]
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
; AVX1-NEXT: vpmulld %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vmovdqa (%rdi), %xmm0
; AVX2-NEXT: vmovdqa (%rsi), %xmm1
; AVX2-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[1,2,5,6,9,10,13,14,u,u,u,u,u,u,u,u]
-; AVX2-NEXT: vpmovsxbd %xmm2, %ymm2
+; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,3,4,7,8,11,12,15,u,u,u,u,u,u,u,u]
; AVX2-NEXT: vpshufb {{.*#+}} xmm3 = xmm1[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
+; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u]
+; AVX2-NEXT: vpmovsxbd %xmm2, %ymm2
; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero,xmm3[4],zero,zero,zero,xmm3[5],zero,zero,zero,xmm3[6],zero,zero,zero,xmm3[7],zero,zero,zero
; AVX2-NEXT: vpmulld %ymm3, %ymm2, %ymm2
-; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,3,4,7,8,11,12,15,u,u,u,u,u,u,u,u]
; AVX2-NEXT: vpmovsxbd %xmm0, %ymm0
-; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u]
; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero
; AVX2-NEXT: vpmulld %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpaddd %ymm0, %ymm2, %ymm0
; AVX512-NEXT: vmovdqa (%rdi), %xmm0
; AVX512-NEXT: vmovdqa (%rsi), %xmm1
; AVX512-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[1,2,5,6,9,10,13,14,u,u,u,u,u,u,u,u]
-; AVX512-NEXT: vpmovsxbd %xmm2, %ymm2
+; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,3,4,7,8,11,12,15,u,u,u,u,u,u,u,u]
; AVX512-NEXT: vpshufb {{.*#+}} xmm3 = xmm1[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
+; AVX512-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u]
+; AVX512-NEXT: vpmovsxbd %xmm2, %ymm2
; AVX512-NEXT: vpmovzxbd {{.*#+}} ymm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero,xmm3[4],zero,zero,zero,xmm3[5],zero,zero,zero,xmm3[6],zero,zero,zero,xmm3[7],zero,zero,zero
; AVX512-NEXT: vpmulld %ymm3, %ymm2, %ymm2
-; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,3,4,7,8,11,12,15,u,u,u,u,u,u,u,u]
; AVX512-NEXT: vpmovsxbd %xmm0, %ymm0
-; AVX512-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u]
; AVX512-NEXT: vpmovzxbd {{.*#+}} ymm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero
; AVX512-NEXT: vpmulld %ymm1, %ymm0, %ymm0
; AVX512-NEXT: vpaddd %ymm0, %ymm2, %ymm0
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw | FileCheck %s --check-prefix=AVX --check-prefix=AVX512 --check-prefix=AVX512BW
define <4 x i16> @mulhuw_v4i16(<4 x i16> %a, <4 x i16> %b) {
-; SSE2-PROMOTE-LABEL: mulhuw_v4i16:
-; SSE2-PROMOTE: # %bb.0:
-; SSE2-PROMOTE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
-; SSE2-PROMOTE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7]
-; SSE2-PROMOTE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; SSE2-PROMOTE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
-; SSE2-PROMOTE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
-; SSE2-PROMOTE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; SSE2-PROMOTE-NEXT: pmulhuw %xmm1, %xmm0
-; SSE2-PROMOTE-NEXT: pxor %xmm1, %xmm1
-; SSE2-PROMOTE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; SSE2-PROMOTE-NEXT: retq
-;
-; SSE2-WIDEN-LABEL: mulhuw_v4i16:
-; SSE2-WIDEN: # %bb.0:
-; SSE2-WIDEN-NEXT: pmulhuw %xmm1, %xmm0
-; SSE2-WIDEN-NEXT: retq
-;
-; SSE41-PROMOTE-LABEL: mulhuw_v4i16:
-; SSE41-PROMOTE: # %bb.0:
-; SSE41-PROMOTE-NEXT: pxor %xmm2, %xmm2
-; SSE41-PROMOTE-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4],xmm2[5],xmm0[6],xmm2[7]
-; SSE41-PROMOTE-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4],xmm2[5],xmm1[6],xmm2[7]
-; SSE41-PROMOTE-NEXT: pmulld %xmm1, %xmm0
-; SSE41-PROMOTE-NEXT: psrld $16, %xmm0
-; SSE41-PROMOTE-NEXT: retq
-;
-; SSE41-WIDEN-LABEL: mulhuw_v4i16:
-; SSE41-WIDEN: # %bb.0:
-; SSE41-WIDEN-NEXT: pmulhuw %xmm1, %xmm0
-; SSE41-WIDEN-NEXT: retq
+; SSE-LABEL: mulhuw_v4i16:
+; SSE: # %bb.0:
+; SSE-NEXT: pmulhuw %xmm1, %xmm0
+; SSE-NEXT: retq
;
; AVX-LABEL: mulhuw_v4i16:
; AVX: # %bb.0:
-; AVX-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4],xmm2[5],xmm0[6],xmm2[7]
-; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4],xmm2[5],xmm1[6],xmm2[7]
-; AVX-NEXT: vpmulld %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vpsrld $16, %xmm0, %xmm0
+; AVX-NEXT: vpmulhuw %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
%a1 = zext <4 x i16> %a to <4 x i32>
%b1 = zext <4 x i16> %b to <4 x i32>
}
define <4 x i16> @mulhw_v4i16(<4 x i16> %a, <4 x i16> %b) {
-; SSE2-PROMOTE-LABEL: mulhw_v4i16:
-; SSE2-PROMOTE: # %bb.0:
-; SSE2-PROMOTE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
-; SSE2-PROMOTE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7]
-; SSE2-PROMOTE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; SSE2-PROMOTE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
-; SSE2-PROMOTE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
-; SSE2-PROMOTE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; SSE2-PROMOTE-NEXT: pmulhw %xmm1, %xmm0
-; SSE2-PROMOTE-NEXT: pxor %xmm1, %xmm1
-; SSE2-PROMOTE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; SSE2-PROMOTE-NEXT: retq
-;
-; SSE2-WIDEN-LABEL: mulhw_v4i16:
-; SSE2-WIDEN: # %bb.0:
-; SSE2-WIDEN-NEXT: pmulhw %xmm1, %xmm0
-; SSE2-WIDEN-NEXT: retq
-;
-; SSE41-PROMOTE-LABEL: mulhw_v4i16:
-; SSE41-PROMOTE: # %bb.0:
-; SSE41-PROMOTE-NEXT: pslld $16, %xmm0
-; SSE41-PROMOTE-NEXT: psrad $16, %xmm0
-; SSE41-PROMOTE-NEXT: pslld $16, %xmm1
-; SSE41-PROMOTE-NEXT: psrad $16, %xmm1
-; SSE41-PROMOTE-NEXT: pmulld %xmm1, %xmm0
-; SSE41-PROMOTE-NEXT: psrld $16, %xmm0
-; SSE41-PROMOTE-NEXT: retq
-;
-; SSE41-WIDEN-LABEL: mulhw_v4i16:
-; SSE41-WIDEN: # %bb.0:
-; SSE41-WIDEN-NEXT: pmulhw %xmm1, %xmm0
-; SSE41-WIDEN-NEXT: retq
+; SSE-LABEL: mulhw_v4i16:
+; SSE: # %bb.0:
+; SSE-NEXT: pmulhw %xmm1, %xmm0
+; SSE-NEXT: retq
;
; AVX-LABEL: mulhw_v4i16:
; AVX: # %bb.0:
-; AVX-NEXT: vpslld $16, %xmm0, %xmm0
-; AVX-NEXT: vpsrad $16, %xmm0, %xmm0
-; AVX-NEXT: vpslld $16, %xmm1, %xmm1
-; AVX-NEXT: vpsrad $16, %xmm1, %xmm1
-; AVX-NEXT: vpmulld %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vpsrld $16, %xmm0, %xmm0
+; AVX-NEXT: vpmulhw %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
%a1 = sext <4 x i16> %a to <4 x i32>
%b1 = sext <4 x i16> %b to <4 x i32>
; CHECK-LABEL: BITCAST1:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
-; CHECK-NEXT: pmovzxdq {{.*#+}} xmm0 = mem[0],zero,mem[1],zero
+; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
; CHECK-NEXT: retl
entry:
%G = load <2 x i8*>, <2 x i8*>* %p
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: movdqa (%rdi), %xmm0
; CHECK-NEXT: pminud {{.*}}(%rip), %xmm0
-; CHECK-NEXT: pmovzxwq %xmm0, %xmm0
; CHECK-NEXT: retq
entry:
%2 = load <4 x i32>, <4 x i32>* %0, align 16
; CHECK-NEXT: movdqa (%rdi), %xmm0
; CHECK-NEXT: pminud {{.*}}(%rip), %xmm0
; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
-; CHECK-NEXT: pmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
; CHECK-NEXT: retq
entry:
%2 = load <4 x i32>, <4 x i32>* %0, align 16
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=skx | FileCheck %s --check-prefixes=X64,X64-SKX
define void @fetch_r16g16_snorm_unorm8(<4 x i8>*, i8*, i32, i32, { [2048 x i32], [128 x i64] }*) nounwind {
-; X86-SKYLAKE-LABEL: fetch_r16g16_snorm_unorm8:
-; X86-SKYLAKE: # %bb.0: # %entry
-; X86-SKYLAKE-NEXT: subl $8, %esp
-; X86-SKYLAKE-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-SKYLAKE-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-SKYLAKE-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-SKYLAKE-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,2,1,4,5,6,7]
-; X86-SKYLAKE-NEXT: vpsrad $16, %xmm0, %xmm0
-; X86-SKYLAKE-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; X86-SKYLAKE-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
-; X86-SKYLAKE-NEXT: vpsrld $7, %xmm0, %xmm0
-; X86-SKYLAKE-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4],zero,xmm0[u,u,u,u,u,u,u,u,u,u,u,u,u]
-; X86-SKYLAKE-NEXT: vmovd %xmm0, %ecx
-; X86-SKYLAKE-NEXT: orl $-16777216, %ecx # imm = 0xFF000000
-; X86-SKYLAKE-NEXT: movl %ecx, (%eax)
-; X86-SKYLAKE-NEXT: addl $8, %esp
-; X86-SKYLAKE-NEXT: retl
+; X86-LABEL: fetch_r16g16_snorm_unorm8:
+; X86: # %bb.0: # %entry
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X86-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; X86-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0
+; X86-NEXT: vpsrlw $7, %xmm0, %xmm0
+; X86-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2],zero,xmm0[u,u,u,u,u,u,u,u,u,u,u,u,u]
+; X86-NEXT: vmovd %xmm0, %ecx
+; X86-NEXT: orl $-16777216, %ecx # imm = 0xFF000000
+; X86-NEXT: movl %ecx, (%eax)
+; X86-NEXT: retl
;
-; X86-SKX-LABEL: fetch_r16g16_snorm_unorm8:
-; X86-SKX: # %bb.0: # %entry
-; X86-SKX-NEXT: subl $8, %esp
-; X86-SKX-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-SKX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-SKX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,0,1,u,u,2,3,u,u,u,u,u,u,u,u]
-; X86-SKX-NEXT: vpsrad $16, %xmm0, %xmm0
-; X86-SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; X86-SKX-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
-; X86-SKX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7]
-; X86-SKX-NEXT: vpsrld $7, %xmm0, %xmm0
-; X86-SKX-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
-; X86-SKX-NEXT: vpmovqw %xmm0, {{[0-9]+}}(%esp)
-; X86-SKX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-SKX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
-; X86-SKX-NEXT: vpmovdb %xmm0, (%esp)
-; X86-SKX-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-SKX-NEXT: movzwl (%esp), %ecx
-; X86-SKX-NEXT: orl $-16777216, %ecx # imm = 0xFF000000
-; X86-SKX-NEXT: movl %ecx, (%eax)
-; X86-SKX-NEXT: addl $8, %esp
-; X86-SKX-NEXT: retl
-;
-; X64-SKYLAKE-LABEL: fetch_r16g16_snorm_unorm8:
-; X64-SKYLAKE: # %bb.0: # %entry
-; X64-SKYLAKE-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X64-SKYLAKE-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,2,1,4,5,6,7]
-; X64-SKYLAKE-NEXT: vpsrad $16, %xmm0, %xmm0
-; X64-SKYLAKE-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; X64-SKYLAKE-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
-; X64-SKYLAKE-NEXT: vpsrld $7, %xmm0, %xmm0
-; X64-SKYLAKE-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4],zero,xmm0[u,u,u,u,u,u,u,u,u,u,u,u,u]
-; X64-SKYLAKE-NEXT: vmovd %xmm0, %eax
-; X64-SKYLAKE-NEXT: orl $-16777216, %eax # imm = 0xFF000000
-; X64-SKYLAKE-NEXT: movl %eax, (%rdi)
-; X64-SKYLAKE-NEXT: retq
-;
-; X64-SKX-LABEL: fetch_r16g16_snorm_unorm8:
-; X64-SKX: # %bb.0: # %entry
-; X64-SKX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X64-SKX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,0,1,u,u,2,3,u,u,u,u,u,u,u,u]
-; X64-SKX-NEXT: vpsrad $16, %xmm0, %xmm0
-; X64-SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; X64-SKX-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
-; X64-SKX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7]
-; X64-SKX-NEXT: vpsrld $7, %xmm0, %xmm0
-; X64-SKX-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
-; X64-SKX-NEXT: vpmovqw %xmm0, -{{[0-9]+}}(%rsp)
-; X64-SKX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X64-SKX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
-; X64-SKX-NEXT: vpmovdb %xmm0, -{{[0-9]+}}(%rsp)
-; X64-SKX-NEXT: movzwl -{{[0-9]+}}(%rsp), %eax
-; X64-SKX-NEXT: orl $-16777216, %eax # imm = 0xFF000000
-; X64-SKX-NEXT: movl %eax, (%rdi)
-; X64-SKX-NEXT: retq
+; X64-LABEL: fetch_r16g16_snorm_unorm8:
+; X64: # %bb.0: # %entry
+; X64-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X64-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; X64-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0
+; X64-NEXT: vpsrlw $7, %xmm0, %xmm0
+; X64-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2],zero,xmm0[u,u,u,u,u,u,u,u,u,u,u,u,u]
+; X64-NEXT: vmovd %xmm0, %eax
+; X64-NEXT: orl $-16777216, %eax # imm = 0xFF000000
+; X64-NEXT: movl %eax, (%rdi)
+; X64-NEXT: retq
entry:
%5 = bitcast i8* %1 to <2 x i16>*
%6 = load <2 x i16>, <2 x i16>* %5, align 2
; CHECK-NEXT: pextrb $10, %xmm0, -{{[0-9]+}}(%rsp)
; CHECK-NEXT: pextrb $12, %xmm0, -{{[0-9]+}}(%rsp)
; CHECK-NEXT: pextrb $14, %xmm0, -{{[0-9]+}}(%rsp)
-; CHECK-NEXT: pmovzxbw {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
+; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
; CHECK-NEXT: retq
%v = alloca i8, i32 8, align 16
call void @llvm.masked.compressstore.v16i8(<16 x i8> %a, i8* %v, <16 x i1> <i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false>)
define <3 x i16> @zext_i8(<3 x i8>) {
; SSE3-LABEL: zext_i8:
; SSE3: # %bb.0:
-; SSE3-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; SSE3-NEXT: movzbl {{[0-9]+}}(%esp), %ecx
; SSE3-NEXT: movzbl {{[0-9]+}}(%esp), %edx
+; SSE3-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; SSE3-NEXT: movd %eax, %xmm0
-; SSE3-NEXT: pextrw $0, %xmm0, %eax
+; SSE3-NEXT: pinsrw $1, %edx, %xmm0
+; SSE3-NEXT: pinsrw $2, %ecx, %xmm0
+; SSE3-NEXT: movd %xmm0, %eax
; SSE3-NEXT: # kill: def $ax killed $ax killed $eax
; SSE3-NEXT: # kill: def $dx killed $dx killed $edx
; SSE3-NEXT: # kill: def $cx killed $cx killed $ecx
;
; SSE41-LABEL: zext_i8:
; SSE41: # %bb.0:
-; SSE41-NEXT: pxor %xmm0, %xmm0
-; SSE41-NEXT: pinsrb $0, {{[0-9]+}}(%esp), %xmm0
-; SSE41-NEXT: pinsrb $4, {{[0-9]+}}(%esp), %xmm0
-; SSE41-NEXT: pextrw $2, %xmm0, %edx
-; SSE41-NEXT: pinsrb $8, {{[0-9]+}}(%esp), %xmm0
+; SSE41-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE41-NEXT: pinsrb $1, {{[0-9]+}}(%esp), %xmm0
+; SSE41-NEXT: pinsrb $2, {{[0-9]+}}(%esp), %xmm0
+; SSE41-NEXT: pmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; SSE41-NEXT: movd %xmm0, %eax
-; SSE41-NEXT: pextrw $4, %xmm0, %ecx
+; SSE41-NEXT: pextrw $1, %xmm0, %edx
+; SSE41-NEXT: pextrw $2, %xmm0, %ecx
; SSE41-NEXT: # kill: def $ax killed $ax killed $eax
; SSE41-NEXT: # kill: def $dx killed $dx killed $edx
; SSE41-NEXT: # kill: def $cx killed $cx killed $ecx
;
; AVX-32-LABEL: zext_i8:
; AVX-32: # %bb.0:
-; AVX-32-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; AVX-32-NEXT: vpinsrb $0, {{[0-9]+}}(%esp), %xmm0, %xmm0
-; AVX-32-NEXT: vpinsrb $4, {{[0-9]+}}(%esp), %xmm0, %xmm0
-; AVX-32-NEXT: vpinsrb $8, {{[0-9]+}}(%esp), %xmm0, %xmm1
-; AVX-32-NEXT: vpextrw $2, %xmm0, %edx
-; AVX-32-NEXT: vmovd %xmm1, %eax
-; AVX-32-NEXT: vpextrw $4, %xmm1, %ecx
+; AVX-32-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; AVX-32-NEXT: vpinsrb $1, {{[0-9]+}}(%esp), %xmm0, %xmm0
+; AVX-32-NEXT: vpinsrb $2, {{[0-9]+}}(%esp), %xmm0, %xmm0
+; AVX-32-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX-32-NEXT: vmovd %xmm0, %eax
+; AVX-32-NEXT: vpextrw $1, %xmm0, %edx
+; AVX-32-NEXT: vpextrw $2, %xmm0, %ecx
; AVX-32-NEXT: # kill: def $ax killed $ax killed $eax
; AVX-32-NEXT: # kill: def $dx killed $dx killed $edx
; AVX-32-NEXT: # kill: def $cx killed $cx killed $ecx
; AVX-64-LABEL: zext_i8:
; AVX-64: # %bb.0:
; AVX-64-NEXT: vmovd %edi, %xmm0
-; AVX-64-NEXT: vpinsrd $1, %esi, %xmm0, %xmm0
-; AVX-64-NEXT: vpinsrd $2, %edx, %xmm0, %xmm0
-; AVX-64-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
+; AVX-64-NEXT: vpinsrb $1, %esi, %xmm0, %xmm0
+; AVX-64-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
+; AVX-64-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX-64-NEXT: vmovd %xmm0, %eax
-; AVX-64-NEXT: vpextrw $2, %xmm0, %edx
-; AVX-64-NEXT: vpextrw $4, %xmm0, %ecx
+; AVX-64-NEXT: vpextrw $1, %xmm0, %edx
+; AVX-64-NEXT: vpextrw $2, %xmm0, %ecx
; AVX-64-NEXT: # kill: def $ax killed $ax killed $eax
; AVX-64-NEXT: # kill: def $dx killed $dx killed $edx
; AVX-64-NEXT: # kill: def $cx killed $cx killed $ecx
; SSE3-LABEL: sext_i8:
; SSE3: # %bb.0:
; SSE3-NEXT: movl {{[0-9]+}}(%esp), %eax
-; SSE3-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE3-NEXT: shll $8, %eax
+; SSE3-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; SSE3-NEXT: shll $8, %ecx
+; SSE3-NEXT: movd %ecx, %xmm0
; SSE3-NEXT: pinsrw $1, %eax, %xmm0
; SSE3-NEXT: movl {{[0-9]+}}(%esp), %eax
+; SSE3-NEXT: shll $8, %eax
; SSE3-NEXT: pinsrw $2, %eax, %xmm0
-; SSE3-NEXT: psllw $8, %xmm0
; SSE3-NEXT: psraw $8, %xmm0
-; SSE3-NEXT: pextrw $0, %xmm0, %eax
+; SSE3-NEXT: movd %xmm0, %eax
; SSE3-NEXT: pextrw $1, %xmm0, %edx
; SSE3-NEXT: pextrw $2, %xmm0, %ecx
; SSE3-NEXT: # kill: def $ax killed $ax killed $eax
; SSE41-LABEL: sext_i8:
; SSE41: # %bb.0:
; SSE41-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; SSE41-NEXT: pinsrb $4, {{[0-9]+}}(%esp), %xmm0
-; SSE41-NEXT: pinsrb $8, {{[0-9]+}}(%esp), %xmm0
-; SSE41-NEXT: pslld $24, %xmm0
-; SSE41-NEXT: psrad $24, %xmm0
+; SSE41-NEXT: pinsrb $1, {{[0-9]+}}(%esp), %xmm0
+; SSE41-NEXT: pinsrb $2, {{[0-9]+}}(%esp), %xmm0
+; SSE41-NEXT: pmovsxbw %xmm0, %xmm0
; SSE41-NEXT: movd %xmm0, %eax
-; SSE41-NEXT: pextrw $2, %xmm0, %edx
-; SSE41-NEXT: pextrw $4, %xmm0, %ecx
+; SSE41-NEXT: pextrw $1, %xmm0, %edx
+; SSE41-NEXT: pextrw $2, %xmm0, %ecx
; SSE41-NEXT: # kill: def $ax killed $ax killed $eax
; SSE41-NEXT: # kill: def $dx killed $dx killed $edx
; SSE41-NEXT: # kill: def $cx killed $cx killed $ecx
; AVX-32-LABEL: sext_i8:
; AVX-32: # %bb.0:
; AVX-32-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; AVX-32-NEXT: vpinsrb $4, {{[0-9]+}}(%esp), %xmm0, %xmm0
-; AVX-32-NEXT: vpinsrb $8, {{[0-9]+}}(%esp), %xmm0, %xmm0
-; AVX-32-NEXT: vpslld $24, %xmm0, %xmm0
-; AVX-32-NEXT: vpsrad $24, %xmm0, %xmm0
+; AVX-32-NEXT: vpinsrb $1, {{[0-9]+}}(%esp), %xmm0, %xmm0
+; AVX-32-NEXT: vpinsrb $2, {{[0-9]+}}(%esp), %xmm0, %xmm0
+; AVX-32-NEXT: vpmovsxbw %xmm0, %xmm0
; AVX-32-NEXT: vmovd %xmm0, %eax
-; AVX-32-NEXT: vpextrw $2, %xmm0, %edx
-; AVX-32-NEXT: vpextrw $4, %xmm0, %ecx
+; AVX-32-NEXT: vpextrw $1, %xmm0, %edx
+; AVX-32-NEXT: vpextrw $2, %xmm0, %ecx
; AVX-32-NEXT: # kill: def $ax killed $ax killed $eax
; AVX-32-NEXT: # kill: def $dx killed $dx killed $edx
; AVX-32-NEXT: # kill: def $cx killed $cx killed $ecx
; AVX-64-LABEL: sext_i8:
; AVX-64: # %bb.0:
; AVX-64-NEXT: vmovd %edi, %xmm0
-; AVX-64-NEXT: vpinsrd $1, %esi, %xmm0, %xmm0
-; AVX-64-NEXT: vpinsrd $2, %edx, %xmm0, %xmm0
-; AVX-64-NEXT: vpslld $24, %xmm0, %xmm0
-; AVX-64-NEXT: vpsrad $24, %xmm0, %xmm0
+; AVX-64-NEXT: vpinsrb $1, %esi, %xmm0, %xmm0
+; AVX-64-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
+; AVX-64-NEXT: vpmovsxbw %xmm0, %xmm0
; AVX-64-NEXT: vmovd %xmm0, %eax
-; AVX-64-NEXT: vpextrw $2, %xmm0, %edx
-; AVX-64-NEXT: vpextrw $4, %xmm0, %ecx
+; AVX-64-NEXT: vpextrw $1, %xmm0, %edx
+; AVX-64-NEXT: vpextrw $2, %xmm0, %ecx
; AVX-64-NEXT: # kill: def $ax killed $ax killed $eax
; AVX-64-NEXT: # kill: def $dx killed $dx killed $edx
; AVX-64-NEXT: # kill: def $cx killed $cx killed $ecx
; X86-LABEL: mul_f:
; X86: # %bb.0: # %entry
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: pmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; X86-NEXT: pmaddwd %xmm0, %xmm0
-; X86-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u]
+; X86-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X86-NEXT: pmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; X86-NEXT: pmullw %xmm0, %xmm0
+; X86-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,u,u,u,u,u,u,u,u,u,u,u,u]
; X86-NEXT: movd %xmm0, (%eax)
; X86-NEXT: xorl %eax, %eax
; X86-NEXT: retl
;
; X64-LABEL: mul_f:
; X64: # %bb.0: # %entry
-; X64-NEXT: pmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; X64-NEXT: pmaddwd %xmm0, %xmm0
-; X64-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u]
+; X64-NEXT: pmovzxbw {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
+; X64-NEXT: pmullw %xmm0, %xmm0
+; X64-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,u,u,u,u,u,u,u,u,u,u,u,u]
; X64-NEXT: movd %xmm0, (%rax)
; X64-NEXT: xorl %eax, %eax
; X64-NEXT: retq
; X86-LABEL: shuff_f:
; X86: # %bb.0: # %entry
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: pmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; X86-NEXT: paddd %xmm0, %xmm0
-; X86-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u]
+; X86-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X86-NEXT: paddb %xmm0, %xmm0
; X86-NEXT: movd %xmm0, (%eax)
; X86-NEXT: xorl %eax, %eax
; X86-NEXT: retl
;
; X64-LABEL: shuff_f:
; X64: # %bb.0: # %entry
-; X64-NEXT: pmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; X64-NEXT: paddd %xmm0, %xmm0
-; X64-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u]
+; X64-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
+; X64-NEXT: paddb %xmm0, %xmm0
; X64-NEXT: movd %xmm0, (%rax)
; X64-NEXT: xorl %eax, %eax
; X64-NEXT: retq
define void @subus_v4i8(<4 x i8>* %p1, <4 x i8>* %p2) {
; SSE-LABEL: subus_v4i8:
; SSE: # %bb.0:
-; SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; SSE-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; SSE-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
+; SSE-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
; SSE-NEXT: psubusb %xmm1, %xmm0
; SSE-NEXT: movd %xmm0, (%rdi)
; SSE-NEXT: retq
;
; AVX-LABEL: subus_v4i8:
; AVX: # %bb.0:
-; AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; AVX-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; AVX-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
+; AVX-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
; AVX-NEXT: vpsubusb %xmm1, %xmm0, %xmm0
; AVX-NEXT: vmovd %xmm0, (%rdi)
; AVX-NEXT: retq
define void @subus_v2i8(<2 x i8>* %p1, <2 x i8>* %p2) {
; SSE2-LABEL: subus_v2i8:
; SSE2: # %bb.0:
-; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; SSE2-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
+; SSE2-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
; SSE2-NEXT: psubusb %xmm1, %xmm0
; SSE2-NEXT: movd %xmm0, %eax
; SSE2-NEXT: movw %ax, (%rdi)
;
; SSSE3-LABEL: subus_v2i8:
; SSSE3: # %bb.0:
-; SSSE3-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; SSSE3-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; SSSE3-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
+; SSSE3-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
; SSSE3-NEXT: psubusb %xmm1, %xmm0
; SSSE3-NEXT: movd %xmm0, %eax
; SSSE3-NEXT: movw %ax, (%rdi)
;
; SSE41-LABEL: subus_v2i8:
; SSE41: # %bb.0:
-; SSE41-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; SSE41-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; SSE41-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
+; SSE41-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
; SSE41-NEXT: psubusb %xmm1, %xmm0
; SSE41-NEXT: pextrw $0, %xmm0, (%rdi)
; SSE41-NEXT: retq
;
; AVX-LABEL: subus_v2i8:
; AVX: # %bb.0:
-; AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; AVX-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; AVX-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
+; AVX-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
; AVX-NEXT: vpsubusb %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpextrw $0, %xmm0, (%rdi)
; AVX-NEXT: retq
define void @subus_v2i16(<2 x i16>* %p1, <2 x i16>* %p2) {
; SSE-LABEL: subus_v2i16:
; SSE: # %bb.0:
-; SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; SSE-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; SSE-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
+; SSE-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
; SSE-NEXT: psubusw %xmm1, %xmm0
; SSE-NEXT: movd %xmm0, (%rdi)
; SSE-NEXT: retq
;
; AVX-LABEL: subus_v2i16:
; AVX: # %bb.0:
-; AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; AVX-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; AVX-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
+; AVX-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
; AVX-NEXT: vpsubusw %xmm1, %xmm0, %xmm0
; AVX-NEXT: vmovd %xmm0, (%rdi)
; AVX-NEXT: retq
; CHECK-LABEL: t3:
; CHECK: ## %bb.0:
; CHECK-NEXT: movl $1, %eax
-; CHECK-NEXT: movq %rax, %xmm0
+; CHECK-NEXT: movd %eax, %xmm0
; CHECK-NEXT: retq
ret <2 x i32> <i32 1, i32 0>
}
; SSE2-NEXT: pand %xmm1, %xmm3
; SSE2-NEXT: pand %xmm1, %xmm2
; SSE2-NEXT: psadbw %xmm3, %xmm2
-; SSE2-NEXT: paddq %xmm2, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
+; SSE2-NEXT: paddd %xmm2, %xmm0
; SSE2-NEXT: addq $4, %rax
; SSE2-NEXT: jne .LBB3_1
; SSE2-NEXT: # %bb.2: # %middle.block
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; SSE2-NEXT: paddq %xmm0, %xmm1
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; SSE2-NEXT: paddd %xmm0, %xmm1
; SSE2-NEXT: movd %xmm1, %eax
; SSE2-NEXT: retq
;
; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm0[1,2,3,4,5,6,7]
; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm0[1,2,3,4,5,6,7]
; AVX-NEXT: vpsadbw %xmm3, %xmm2, %xmm2
-; AVX-NEXT: vpaddq %xmm1, %xmm2, %xmm1
+; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
+; AVX-NEXT: vpaddd %xmm1, %xmm2, %xmm1
; AVX-NEXT: addq $4, %rax
; AVX-NEXT: jne .LBB3_1
; AVX-NEXT: # %bb.2: # %middle.block
-; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
-; AVX-NEXT: vpaddq %xmm0, %xmm1, %xmm0
+; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; AVX-NEXT: vpaddd %xmm0, %xmm1, %xmm0
; AVX-NEXT: vmovd %xmm0, %eax
; AVX-NEXT: retq
entry:
define <2 x i32> @v2i32(<2 x i32> %x, <2 x i32> %y) nounwind {
; SSE2-LABEL: v2i32:
; SSE2: # %bb.0:
-; SSE2-NEXT: psllq $32, %xmm1
-; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648]
-; SSE2-NEXT: psllq $32, %xmm0
-; SSE2-NEXT: movdqa %xmm0, %xmm3
-; SSE2-NEXT: paddq %xmm1, %xmm0
-; SSE2-NEXT: por %xmm2, %xmm1
-; SSE2-NEXT: movdqa %xmm2, %xmm4
+; SSE2-NEXT: pxor %xmm3, %xmm3
+; SSE2-NEXT: pxor %xmm4, %xmm4
; SSE2-NEXT: pcmpgtd %xmm1, %xmm4
-; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
-; SSE2-NEXT: pcmpeqd %xmm2, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSE2-NEXT: pand %xmm5, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
-; SSE2-NEXT: por %xmm1, %xmm4
-; SSE2-NEXT: pcmpeqd %xmm1, %xmm1
-; SSE2-NEXT: pxor %xmm1, %xmm4
-; SSE2-NEXT: por %xmm2, %xmm3
-; SSE2-NEXT: movdqa %xmm2, %xmm5
-; SSE2-NEXT: pcmpgtd %xmm3, %xmm5
-; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2]
-; SSE2-NEXT: pcmpeqd %xmm2, %xmm3
-; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
-; SSE2-NEXT: pand %xmm6, %xmm3
-; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
-; SSE2-NEXT: por %xmm3, %xmm5
-; SSE2-NEXT: pxor %xmm1, %xmm5
+; SSE2-NEXT: pcmpeqd %xmm2, %xmm2
+; SSE2-NEXT: pxor %xmm2, %xmm4
+; SSE2-NEXT: pxor %xmm5, %xmm5
+; SSE2-NEXT: pcmpgtd %xmm0, %xmm5
+; SSE2-NEXT: pxor %xmm2, %xmm5
; SSE2-NEXT: pcmpeqd %xmm5, %xmm4
-; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,0,3,2]
-; SSE2-NEXT: pand %xmm4, %xmm3
-; SSE2-NEXT: movdqa %xmm0, %xmm4
-; SSE2-NEXT: por %xmm2, %xmm4
-; SSE2-NEXT: movdqa %xmm2, %xmm6
-; SSE2-NEXT: pcmpgtd %xmm4, %xmm6
-; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2]
-; SSE2-NEXT: pcmpeqd %xmm2, %xmm4
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,1,3,3]
-; SSE2-NEXT: pand %xmm7, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm6[1,1,3,3]
-; SSE2-NEXT: por %xmm2, %xmm4
-; SSE2-NEXT: pxor %xmm4, %xmm1
-; SSE2-NEXT: pcmpeqd %xmm5, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,0,3,2]
-; SSE2-NEXT: pand %xmm1, %xmm2
-; SSE2-NEXT: pandn %xmm3, %xmm2
-; SSE2-NEXT: movdqa %xmm4, %xmm1
+; SSE2-NEXT: paddd %xmm1, %xmm0
+; SSE2-NEXT: pcmpgtd %xmm0, %xmm3
+; SSE2-NEXT: pxor %xmm3, %xmm2
+; SSE2-NEXT: pcmpeqd %xmm5, %xmm2
+; SSE2-NEXT: pandn %xmm4, %xmm2
+; SSE2-NEXT: movdqa %xmm3, %xmm1
; SSE2-NEXT: pandn {{.*}}(%rip), %xmm1
-; SSE2-NEXT: pand {{.*}}(%rip), %xmm4
-; SSE2-NEXT: por %xmm1, %xmm4
-; SSE2-NEXT: pand %xmm2, %xmm4
+; SSE2-NEXT: psrld $1, %xmm3
+; SSE2-NEXT: por %xmm1, %xmm3
+; SSE2-NEXT: pand %xmm2, %xmm3
; SSE2-NEXT: pandn %xmm0, %xmm2
-; SSE2-NEXT: por %xmm4, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,3,2,3]
-; SSE2-NEXT: psrad $31, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,3,2,3]
-; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE2-NEXT: por %xmm3, %xmm2
+; SSE2-NEXT: movdqa %xmm2, %xmm0
; SSE2-NEXT: retq
;
; SSSE3-LABEL: v2i32:
; SSSE3: # %bb.0:
-; SSSE3-NEXT: psllq $32, %xmm1
-; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648]
-; SSSE3-NEXT: psllq $32, %xmm0
-; SSSE3-NEXT: movdqa %xmm0, %xmm3
-; SSSE3-NEXT: paddq %xmm1, %xmm0
-; SSSE3-NEXT: por %xmm2, %xmm1
-; SSSE3-NEXT: movdqa %xmm2, %xmm4
+; SSSE3-NEXT: pxor %xmm3, %xmm3
+; SSSE3-NEXT: pxor %xmm4, %xmm4
; SSSE3-NEXT: pcmpgtd %xmm1, %xmm4
-; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
-; SSSE3-NEXT: pcmpeqd %xmm2, %xmm1
-; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSSE3-NEXT: pand %xmm5, %xmm1
-; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
-; SSSE3-NEXT: por %xmm1, %xmm4
-; SSSE3-NEXT: pcmpeqd %xmm1, %xmm1
-; SSSE3-NEXT: pxor %xmm1, %xmm4
-; SSSE3-NEXT: por %xmm2, %xmm3
-; SSSE3-NEXT: movdqa %xmm2, %xmm5
-; SSSE3-NEXT: pcmpgtd %xmm3, %xmm5
-; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2]
-; SSSE3-NEXT: pcmpeqd %xmm2, %xmm3
-; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
-; SSSE3-NEXT: pand %xmm6, %xmm3
-; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
-; SSSE3-NEXT: por %xmm3, %xmm5
-; SSSE3-NEXT: pxor %xmm1, %xmm5
+; SSSE3-NEXT: pcmpeqd %xmm2, %xmm2
+; SSSE3-NEXT: pxor %xmm2, %xmm4
+; SSSE3-NEXT: pxor %xmm5, %xmm5
+; SSSE3-NEXT: pcmpgtd %xmm0, %xmm5
+; SSSE3-NEXT: pxor %xmm2, %xmm5
; SSSE3-NEXT: pcmpeqd %xmm5, %xmm4
-; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,0,3,2]
-; SSSE3-NEXT: pand %xmm4, %xmm3
-; SSSE3-NEXT: movdqa %xmm0, %xmm4
-; SSSE3-NEXT: por %xmm2, %xmm4
-; SSSE3-NEXT: movdqa %xmm2, %xmm6
-; SSSE3-NEXT: pcmpgtd %xmm4, %xmm6
-; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2]
-; SSSE3-NEXT: pcmpeqd %xmm2, %xmm4
-; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,1,3,3]
-; SSSE3-NEXT: pand %xmm7, %xmm2
-; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm6[1,1,3,3]
-; SSSE3-NEXT: por %xmm2, %xmm4
-; SSSE3-NEXT: pxor %xmm4, %xmm1
-; SSSE3-NEXT: pcmpeqd %xmm5, %xmm1
-; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,0,3,2]
-; SSSE3-NEXT: pand %xmm1, %xmm2
-; SSSE3-NEXT: pandn %xmm3, %xmm2
-; SSSE3-NEXT: movdqa %xmm4, %xmm1
+; SSSE3-NEXT: paddd %xmm1, %xmm0
+; SSSE3-NEXT: pcmpgtd %xmm0, %xmm3
+; SSSE3-NEXT: pxor %xmm3, %xmm2
+; SSSE3-NEXT: pcmpeqd %xmm5, %xmm2
+; SSSE3-NEXT: pandn %xmm4, %xmm2
+; SSSE3-NEXT: movdqa %xmm3, %xmm1
; SSSE3-NEXT: pandn {{.*}}(%rip), %xmm1
-; SSSE3-NEXT: pand {{.*}}(%rip), %xmm4
-; SSSE3-NEXT: por %xmm1, %xmm4
-; SSSE3-NEXT: pand %xmm2, %xmm4
+; SSSE3-NEXT: psrld $1, %xmm3
+; SSSE3-NEXT: por %xmm1, %xmm3
+; SSSE3-NEXT: pand %xmm2, %xmm3
; SSSE3-NEXT: pandn %xmm0, %xmm2
-; SSSE3-NEXT: por %xmm4, %xmm2
-; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,3,2,3]
-; SSSE3-NEXT: psrad $31, %xmm2
-; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,3,2,3]
-; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSSE3-NEXT: por %xmm3, %xmm2
+; SSSE3-NEXT: movdqa %xmm2, %xmm0
; SSSE3-NEXT: retq
;
; SSE41-LABEL: v2i32:
; SSE41: # %bb.0:
; SSE41-NEXT: movdqa %xmm0, %xmm2
-; SSE41-NEXT: psllq $32, %xmm1
-; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [2147483648,2147483648]
-; SSE41-NEXT: psllq $32, %xmm2
-; SSE41-NEXT: movdqa %xmm2, %xmm3
-; SSE41-NEXT: paddq %xmm1, %xmm2
-; SSE41-NEXT: por %xmm0, %xmm1
-; SSE41-NEXT: movdqa %xmm0, %xmm4
+; SSE41-NEXT: pxor %xmm0, %xmm0
+; SSE41-NEXT: pxor %xmm4, %xmm4
; SSE41-NEXT: pcmpgtd %xmm1, %xmm4
-; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
-; SSE41-NEXT: pcmpeqd %xmm0, %xmm1
-; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSE41-NEXT: pand %xmm5, %xmm1
-; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
-; SSE41-NEXT: por %xmm1, %xmm4
-; SSE41-NEXT: pcmpeqd %xmm1, %xmm1
-; SSE41-NEXT: pxor %xmm1, %xmm4
-; SSE41-NEXT: por %xmm0, %xmm3
-; SSE41-NEXT: movdqa %xmm0, %xmm5
-; SSE41-NEXT: pcmpgtd %xmm3, %xmm5
-; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2]
-; SSE41-NEXT: pcmpeqd %xmm0, %xmm3
-; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
-; SSE41-NEXT: pand %xmm6, %xmm3
-; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
-; SSE41-NEXT: por %xmm3, %xmm5
-; SSE41-NEXT: pxor %xmm1, %xmm5
-; SSE41-NEXT: pcmpeqq %xmm5, %xmm4
-; SSE41-NEXT: movdqa %xmm2, %xmm3
-; SSE41-NEXT: por %xmm0, %xmm3
-; SSE41-NEXT: movdqa %xmm0, %xmm6
-; SSE41-NEXT: pcmpgtd %xmm3, %xmm6
-; SSE41-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2]
-; SSE41-NEXT: pcmpeqd %xmm0, %xmm3
-; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
-; SSE41-NEXT: pand %xmm7, %xmm3
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[1,1,3,3]
-; SSE41-NEXT: por %xmm3, %xmm0
-; SSE41-NEXT: pxor %xmm0, %xmm1
-; SSE41-NEXT: pcmpeqq %xmm5, %xmm1
-; SSE41-NEXT: pandn %xmm4, %xmm1
-; SSE41-NEXT: movapd {{.*#+}} xmm3 = [9223372036854775808,9223372036854775808]
-; SSE41-NEXT: blendvpd %xmm0, {{.*}}(%rip), %xmm3
-; SSE41-NEXT: movdqa %xmm1, %xmm0
-; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm2
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3]
-; SSE41-NEXT: psrad $31, %xmm2
-; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
-; SSE41-NEXT: movdqa %xmm2, %xmm0
+; SSE41-NEXT: pcmpeqd %xmm3, %xmm3
+; SSE41-NEXT: pxor %xmm3, %xmm4
+; SSE41-NEXT: pxor %xmm5, %xmm5
+; SSE41-NEXT: pcmpgtd %xmm2, %xmm5
+; SSE41-NEXT: pxor %xmm3, %xmm5
+; SSE41-NEXT: pcmpeqd %xmm5, %xmm4
+; SSE41-NEXT: paddd %xmm1, %xmm2
+; SSE41-NEXT: pcmpgtd %xmm2, %xmm0
+; SSE41-NEXT: pxor %xmm0, %xmm3
+; SSE41-NEXT: pcmpeqd %xmm5, %xmm3
+; SSE41-NEXT: pandn %xmm4, %xmm3
+; SSE41-NEXT: movaps {{.*#+}} xmm1 = [2147483648,2147483648,2147483648,2147483648]
+; SSE41-NEXT: blendvps %xmm0, {{.*}}(%rip), %xmm1
+; SSE41-NEXT: movdqa %xmm3, %xmm0
+; SSE41-NEXT: blendvps %xmm0, %xmm1, %xmm2
+; SSE41-NEXT: movaps %xmm2, %xmm0
; SSE41-NEXT: retq
;
; AVX1-LABEL: v2i32:
; AVX1: # %bb.0:
-; AVX1-NEXT: vpsllq $32, %xmm1, %xmm1
; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX1-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm3
+; AVX1-NEXT: vpcmpgtd %xmm1, %xmm2, %xmm3
; AVX1-NEXT: vpcmpeqd %xmm4, %xmm4, %xmm4
; AVX1-NEXT: vpxor %xmm4, %xmm3, %xmm3
-; AVX1-NEXT: vpsllq $32, %xmm0, %xmm0
-; AVX1-NEXT: vpcmpgtq %xmm0, %xmm2, %xmm5
+; AVX1-NEXT: vpcmpgtd %xmm0, %xmm2, %xmm5
; AVX1-NEXT: vpxor %xmm4, %xmm5, %xmm5
-; AVX1-NEXT: vpcmpeqq %xmm3, %xmm5, %xmm3
-; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpcmpgtq %xmm0, %xmm2, %xmm1
+; AVX1-NEXT: vpcmpeqd %xmm3, %xmm5, %xmm3
+; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpcmpgtd %xmm0, %xmm2, %xmm1
; AVX1-NEXT: vpxor %xmm4, %xmm1, %xmm2
-; AVX1-NEXT: vpcmpeqq %xmm2, %xmm5, %xmm2
+; AVX1-NEXT: vpcmpeqd %xmm2, %xmm5, %xmm2
; AVX1-NEXT: vpandn %xmm3, %xmm2, %xmm2
-; AVX1-NEXT: vmovapd {{.*#+}} xmm3 = [9223372036854775808,9223372036854775808]
-; AVX1-NEXT: vblendvpd %xmm1, {{.*}}(%rip), %xmm3, %xmm1
-; AVX1-NEXT: vblendvpd %xmm2, %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpsrad $31, %xmm0, %xmm1
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
+; AVX1-NEXT: vmovaps {{.*#+}} xmm3 = [2147483648,2147483648,2147483648,2147483648]
+; AVX1-NEXT: vblendvps %xmm1, {{.*}}(%rip), %xmm3, %xmm1
+; AVX1-NEXT: vblendvps %xmm2, %xmm1, %xmm0, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: v2i32:
; AVX2: # %bb.0:
-; AVX2-NEXT: vpsllq $32, %xmm1, %xmm1
; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX2-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm3
+; AVX2-NEXT: vpcmpgtd %xmm1, %xmm2, %xmm3
; AVX2-NEXT: vpcmpeqd %xmm4, %xmm4, %xmm4
; AVX2-NEXT: vpxor %xmm4, %xmm3, %xmm3
-; AVX2-NEXT: vpsllq $32, %xmm0, %xmm0
-; AVX2-NEXT: vpcmpgtq %xmm0, %xmm2, %xmm5
+; AVX2-NEXT: vpcmpgtd %xmm0, %xmm2, %xmm5
; AVX2-NEXT: vpxor %xmm4, %xmm5, %xmm5
-; AVX2-NEXT: vpcmpeqq %xmm3, %xmm5, %xmm3
-; AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpcmpgtq %xmm0, %xmm2, %xmm1
+; AVX2-NEXT: vpcmpeqd %xmm3, %xmm5, %xmm3
+; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpcmpgtd %xmm0, %xmm2, %xmm1
; AVX2-NEXT: vpxor %xmm4, %xmm1, %xmm2
-; AVX2-NEXT: vpcmpeqq %xmm2, %xmm5, %xmm2
+; AVX2-NEXT: vpcmpeqd %xmm2, %xmm5, %xmm2
; AVX2-NEXT: vpandn %xmm3, %xmm2, %xmm2
-; AVX2-NEXT: vmovapd {{.*#+}} xmm3 = [9223372036854775808,9223372036854775808]
-; AVX2-NEXT: vblendvpd %xmm1, {{.*}}(%rip), %xmm3, %xmm1
-; AVX2-NEXT: vblendvpd %xmm2, %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpsrad $31, %xmm0, %xmm1
-; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
+; AVX2-NEXT: vbroadcastss {{.*#+}} xmm3 = [2147483647,2147483647,2147483647,2147483647]
+; AVX2-NEXT: vbroadcastss {{.*#+}} xmm4 = [2147483648,2147483648,2147483648,2147483648]
+; AVX2-NEXT: vblendvps %xmm1, %xmm3, %xmm4, %xmm1
+; AVX2-NEXT: vblendvps %xmm2, %xmm1, %xmm0, %xmm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: v2i32:
; AVX512: # %bb.0:
-; AVX512-NEXT: vpsllq $32, %xmm1, %xmm1
; AVX512-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX512-NEXT: vpcmpnltq %xmm2, %xmm1, %k0
-; AVX512-NEXT: vpsllq $32, %xmm0, %xmm0
-; AVX512-NEXT: vpcmpnltq %xmm2, %xmm0, %k1
+; AVX512-NEXT: vpcmpnltd %xmm2, %xmm1, %k0
+; AVX512-NEXT: vpcmpnltd %xmm2, %xmm0, %k1
; AVX512-NEXT: kxorw %k0, %k1, %k0
-; AVX512-NEXT: vpaddq %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpcmpnltq %xmm2, %xmm0, %k2
+; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0
+; AVX512-NEXT: vpcmpnltd %xmm2, %xmm0, %k2
; AVX512-NEXT: kxorw %k2, %k1, %k1
; AVX512-NEXT: kandnw %k1, %k0, %k1
-; AVX512-NEXT: vpcmpgtq %xmm0, %xmm2, %k2
-; AVX512-NEXT: vmovdqa {{.*#+}} xmm1 = [9223372036854775808,9223372036854775808]
-; AVX512-NEXT: vmovdqa64 {{.*}}(%rip), %xmm1 {%k2}
-; AVX512-NEXT: vmovdqa64 %xmm1, %xmm0 {%k1}
-; AVX512-NEXT: vpsraq $32, %xmm0, %xmm0
+; AVX512-NEXT: vpcmpgtd %xmm0, %xmm2, %k2
+; AVX512-NEXT: vpbroadcastd {{.*#+}} xmm1 = [2147483648,2147483648,2147483648,2147483648]
+; AVX512-NEXT: vpbroadcastd {{.*}}(%rip), %xmm1 {%k2}
+; AVX512-NEXT: vmovdqa32 %xmm1, %xmm0 {%k1}
; AVX512-NEXT: retq
%z = call <2 x i32> @llvm.sadd.sat.v2i32(<2 x i32> %x, <2 x i32> %y)
ret <2 x i32> %z
define <4 x i16> @test_ushort_div(<4 x i16> %num, <4 x i16> %div) {
; CHECK-LABEL: test_ushort_div:
; CHECK: # %bb.0:
-; CHECK-NEXT: pxor %xmm2, %xmm2
-; CHECK-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4],xmm2[5],xmm1[6],xmm2[7]
-; CHECK-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4],xmm2[5],xmm0[6],xmm2[7]
-; CHECK-NEXT: pextrd $1, %xmm0, %eax
-; CHECK-NEXT: pextrd $1, %xmm1, %ecx
+; CHECK-NEXT: pextrw $1, %xmm0, %eax
+; CHECK-NEXT: pextrw $1, %xmm1, %ecx
+; CHECK-NEXT: # kill: def $ax killed $ax killed $eax
; CHECK-NEXT: xorl %edx, %edx
-; CHECK-NEXT: divl %ecx
+; CHECK-NEXT: divw %cx
; CHECK-NEXT: movl %eax, %ecx
; CHECK-NEXT: movd %xmm0, %eax
; CHECK-NEXT: movd %xmm1, %esi
+; CHECK-NEXT: # kill: def $ax killed $ax killed $eax
; CHECK-NEXT: xorl %edx, %edx
-; CHECK-NEXT: divl %esi
+; CHECK-NEXT: divw %si
+; CHECK-NEXT: # kill: def $ax killed $ax def $eax
; CHECK-NEXT: movd %eax, %xmm2
-; CHECK-NEXT: pinsrd $1, %ecx, %xmm2
-; CHECK-NEXT: pextrd $2, %xmm0, %eax
-; CHECK-NEXT: pextrd $2, %xmm1, %ecx
+; CHECK-NEXT: pinsrw $1, %ecx, %xmm2
+; CHECK-NEXT: pextrw $2, %xmm0, %eax
+; CHECK-NEXT: pextrw $2, %xmm1, %ecx
+; CHECK-NEXT: # kill: def $ax killed $ax killed $eax
; CHECK-NEXT: xorl %edx, %edx
-; CHECK-NEXT: divl %ecx
-; CHECK-NEXT: pinsrd $2, %eax, %xmm2
-; CHECK-NEXT: pextrd $3, %xmm0, %eax
-; CHECK-NEXT: pextrd $3, %xmm1, %ecx
+; CHECK-NEXT: divw %cx
+; CHECK-NEXT: # kill: def $ax killed $ax def $eax
+; CHECK-NEXT: pinsrw $2, %eax, %xmm2
+; CHECK-NEXT: pextrw $3, %xmm0, %eax
+; CHECK-NEXT: pextrw $3, %xmm1, %ecx
+; CHECK-NEXT: # kill: def $ax killed $ax killed $eax
; CHECK-NEXT: xorl %edx, %edx
-; CHECK-NEXT: divl %ecx
-; CHECK-NEXT: pinsrd $3, %eax, %xmm2
+; CHECK-NEXT: divw %cx
+; CHECK-NEXT: # kill: def $ax killed $ax def $eax
+; CHECK-NEXT: pinsrw $3, %eax, %xmm2
; CHECK-NEXT: movdqa %xmm2, %xmm0
; CHECK-NEXT: retq
%div.r = udiv <4 x i16> %num, %div
define <4 x i8> @test_char_rem(<4 x i8> %num, <4 x i8> %rem) {
; CHECK-LABEL: test_char_rem:
; CHECK: # %bb.0:
-; CHECK-NEXT: pslld $24, %xmm1
-; CHECK-NEXT: psrad $24, %xmm1
-; CHECK-NEXT: pslld $24, %xmm0
-; CHECK-NEXT: psrad $24, %xmm0
-; CHECK-NEXT: pextrd $1, %xmm0, %eax
-; CHECK-NEXT: pextrd $1, %xmm1, %ecx
-; CHECK-NEXT: cltd
-; CHECK-NEXT: idivl %ecx
-; CHECK-NEXT: movl %edx, %ecx
-; CHECK-NEXT: movd %xmm0, %eax
-; CHECK-NEXT: movd %xmm1, %esi
-; CHECK-NEXT: cltd
-; CHECK-NEXT: idivl %esi
-; CHECK-NEXT: movd %edx, %xmm2
-; CHECK-NEXT: pinsrd $1, %ecx, %xmm2
-; CHECK-NEXT: pextrd $2, %xmm0, %eax
-; CHECK-NEXT: pextrd $2, %xmm1, %ecx
-; CHECK-NEXT: cltd
-; CHECK-NEXT: idivl %ecx
-; CHECK-NEXT: pinsrd $2, %edx, %xmm2
-; CHECK-NEXT: pextrd $3, %xmm0, %eax
-; CHECK-NEXT: pextrd $3, %xmm1, %ecx
-; CHECK-NEXT: cltd
-; CHECK-NEXT: idivl %ecx
-; CHECK-NEXT: pinsrd $3, %edx, %xmm2
+; CHECK-NEXT: pextrb $1, %xmm0, %eax
+; CHECK-NEXT: # kill: def $al killed $al killed $eax
+; CHECK-NEXT: cbtw
+; CHECK-NEXT: pextrb $1, %xmm1, %ecx
+; CHECK-NEXT: idivb %cl
+; CHECK-NEXT: movsbl %ah, %ecx
+; CHECK-NEXT: pextrb $0, %xmm0, %eax
+; CHECK-NEXT: # kill: def $al killed $al killed $eax
+; CHECK-NEXT: cbtw
+; CHECK-NEXT: pextrb $0, %xmm1, %edx
+; CHECK-NEXT: idivb %dl
+; CHECK-NEXT: movsbl %ah, %eax
+; CHECK-NEXT: movd %eax, %xmm2
+; CHECK-NEXT: pextrb $2, %xmm0, %eax
+; CHECK-NEXT: # kill: def $al killed $al killed $eax
+; CHECK-NEXT: cbtw
+; CHECK-NEXT: pinsrb $1, %ecx, %xmm2
+; CHECK-NEXT: pextrb $2, %xmm1, %ecx
+; CHECK-NEXT: idivb %cl
+; CHECK-NEXT: movsbl %ah, %ecx
+; CHECK-NEXT: pextrb $3, %xmm0, %eax
+; CHECK-NEXT: # kill: def $al killed $al killed $eax
+; CHECK-NEXT: cbtw
+; CHECK-NEXT: pinsrb $2, %ecx, %xmm2
+; CHECK-NEXT: pextrb $3, %xmm1, %ecx
+; CHECK-NEXT: idivb %cl
+; CHECK-NEXT: movsbl %ah, %eax
+; CHECK-NEXT: pinsrb $3, %eax, %xmm2
; CHECK-NEXT: movdqa %xmm2, %xmm0
; CHECK-NEXT: retq
%rem.r = srem <4 x i8> %num, %rem
}
define void @test5(i1 %c, <2 x i16> %a, <2 x i16> %b, <2 x i16>* %p) nounwind {
-; CHECK-LABEL: test5:
-; CHECK: ## %bb.0:
-; CHECK-NEXT: testb $1, %dil
-; CHECK-NEXT: jne LBB4_2
-; CHECK-NEXT: ## %bb.1:
-; CHECK-NEXT: movdqa %xmm1, %xmm0
-; CHECK-NEXT: LBB4_2:
-; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; CHECK-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
-; CHECK-NEXT: movd %xmm0, (%rsi)
-; CHECK-NEXT: retq
+; GENERIC-LABEL: test5:
+; GENERIC: ## %bb.0:
+; GENERIC-NEXT: testb $1, %dil
+; GENERIC-NEXT: jne LBB4_2
+; GENERIC-NEXT: ## %bb.1:
+; GENERIC-NEXT: movaps %xmm1, %xmm0
+; GENERIC-NEXT: LBB4_2:
+; GENERIC-NEXT: movss %xmm0, (%rsi)
+; GENERIC-NEXT: retq
+;
+; ATOM-LABEL: test5:
+; ATOM: ## %bb.0:
+; ATOM-NEXT: testb $1, %dil
+; ATOM-NEXT: jne LBB4_2
+; ATOM-NEXT: ## %bb.1:
+; ATOM-NEXT: movaps %xmm1, %xmm0
+; ATOM-NEXT: LBB4_2:
+; ATOM-NEXT: movss %xmm0, (%rsi)
+; ATOM-NEXT: nop
+; ATOM-NEXT: nop
+; ATOM-NEXT: retq
;
; ATHLON-LABEL: test5:
; ATHLON: ## %bb.0:
;
; X64-LABEL: ashr_add_shl_v4i8:
; X64: # %bb.0:
+; X64-NEXT: pand {{.*}}(%rip), %xmm0
+; X64-NEXT: packuswb %xmm0, %xmm0
+; X64-NEXT: packuswb %xmm0, %xmm0
; X64-NEXT: pcmpeqd %xmm1, %xmm1
-; X64-NEXT: psubd %xmm1, %xmm0
-; X64-NEXT: pslld $24, %xmm0
+; X64-NEXT: psubb %xmm1, %xmm0
+; X64-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; X64-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
; X64-NEXT: psrad $24, %xmm0
; X64-NEXT: retq
%conv = shl <4 x i32> %r, <i32 24, i32 24, i32 24, i32 24>
; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-AVX-NEXT: movl c, %esi
-; X86-AVX-NEXT: vpmovzxbq {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero
-; X86-AVX-NEXT: vpmovzxbq {{.*#+}} xmm1 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero
-; X86-AVX-NEXT: vpmuludq %xmm0, %xmm1, %xmm0
-; X86-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; X86-AVX-NEXT: movzwl (%edx,%ecx), %edx
+; X86-AVX-NEXT: vmovd %edx, %xmm0
+; X86-AVX-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; X86-AVX-NEXT: movzwl (%eax,%ecx), %eax
+; X86-AVX-NEXT: vmovd %eax, %xmm1
+; X86-AVX-NEXT: vpmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
+; X86-AVX-NEXT: vpmaddwd %xmm0, %xmm1, %xmm0
; X86-AVX-NEXT: vmovq %xmm0, (%esi,%ecx,4)
; X86-AVX-NEXT: popl %esi
; X86-AVX-NEXT: retl
; X64-AVX-LABEL: mul_2xi8:
; X64-AVX: # %bb.0: # %entry
; X64-AVX-NEXT: movq {{.*}}(%rip), %rax
-; X64-AVX-NEXT: vpmovzxbq {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero
-; X64-AVX-NEXT: vpmovzxbq {{.*#+}} xmm1 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero
-; X64-AVX-NEXT: vpmuludq %xmm0, %xmm1, %xmm0
-; X64-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; X64-AVX-NEXT: movzwl (%rdi,%rdx), %ecx
+; X64-AVX-NEXT: vmovd %ecx, %xmm0
+; X64-AVX-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; X64-AVX-NEXT: movzwl (%rsi,%rdx), %ecx
+; X64-AVX-NEXT: vmovd %ecx, %xmm1
+; X64-AVX-NEXT: vpmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
+; X64-AVX-NEXT: vpmaddwd %xmm0, %xmm1, %xmm0
; X64-AVX-NEXT: vmovq %xmm0, (%rax,%rdx,4)
; X64-AVX-NEXT: retq
entry:
; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-AVX-NEXT: movl c, %esi
-; X86-AVX-NEXT: vpmovzxwq {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero
-; X86-AVX-NEXT: vpmovzxwq {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero
-; X86-AVX-NEXT: vpmuludq %xmm0, %xmm1, %xmm0
-; X86-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; X86-AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X86-AVX-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; X86-AVX-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X86-AVX-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
+; X86-AVX-NEXT: vpmulld %xmm0, %xmm1, %xmm0
; X86-AVX-NEXT: vmovq %xmm0, (%esi,%ecx,4)
; X86-AVX-NEXT: popl %esi
; X86-AVX-NEXT: retl
; X64-AVX-LABEL: mul_2xi16:
; X64-AVX: # %bb.0: # %entry
; X64-AVX-NEXT: movq {{.*}}(%rip), %rax
-; X64-AVX-NEXT: vpmovzxwq {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero
-; X64-AVX-NEXT: vpmovzxwq {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero
-; X64-AVX-NEXT: vpmuludq %xmm0, %xmm1, %xmm0
-; X64-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; X64-AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X64-AVX-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; X64-AVX-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X64-AVX-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
+; X64-AVX-NEXT: vpmulld %xmm0, %xmm1, %xmm0
; X64-AVX-NEXT: vmovq %xmm0, (%rax,%rdx,4)
; X64-AVX-NEXT: retq
entry:
; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-AVX-NEXT: movl c, %esi
-; X86-AVX-NEXT: vpmovsxbq (%edx,%ecx), %xmm0
-; X86-AVX-NEXT: vpmovsxbq (%eax,%ecx), %xmm1
-; X86-AVX-NEXT: vpmuludq %xmm0, %xmm1, %xmm0
-; X86-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; X86-AVX-NEXT: movzwl (%edx,%ecx), %edx
+; X86-AVX-NEXT: vmovd %edx, %xmm0
+; X86-AVX-NEXT: vpmovsxbd %xmm0, %xmm0
+; X86-AVX-NEXT: movzwl (%eax,%ecx), %eax
+; X86-AVX-NEXT: vmovd %eax, %xmm1
+; X86-AVX-NEXT: vpmovsxbd %xmm1, %xmm1
+; X86-AVX-NEXT: vpmulld %xmm0, %xmm1, %xmm0
; X86-AVX-NEXT: vmovq %xmm0, (%esi,%ecx,4)
; X86-AVX-NEXT: popl %esi
; X86-AVX-NEXT: retl
; X64-AVX-LABEL: mul_2xi8_sext:
; X64-AVX: # %bb.0: # %entry
; X64-AVX-NEXT: movq {{.*}}(%rip), %rax
-; X64-AVX-NEXT: vpmovsxbq (%rdi,%rdx), %xmm0
-; X64-AVX-NEXT: vpmovsxbq (%rsi,%rdx), %xmm1
-; X64-AVX-NEXT: vpmuludq %xmm0, %xmm1, %xmm0
-; X64-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; X64-AVX-NEXT: movzwl (%rdi,%rdx), %ecx
+; X64-AVX-NEXT: vmovd %ecx, %xmm0
+; X64-AVX-NEXT: vpmovsxbd %xmm0, %xmm0
+; X64-AVX-NEXT: movzwl (%rsi,%rdx), %ecx
+; X64-AVX-NEXT: vmovd %ecx, %xmm1
+; X64-AVX-NEXT: vpmovsxbd %xmm1, %xmm1
+; X64-AVX-NEXT: vpmulld %xmm0, %xmm1, %xmm0
; X64-AVX-NEXT: vmovq %xmm0, (%rax,%rdx,4)
; X64-AVX-NEXT: retq
entry:
; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-AVX-NEXT: movl c, %esi
-; X86-AVX-NEXT: vpmovsxbq (%edx,%ecx), %xmm0
-; X86-AVX-NEXT: vpmovzxbq {{.*#+}} xmm1 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero
-; X86-AVX-NEXT: vpmuludq %xmm0, %xmm1, %xmm0
-; X86-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; X86-AVX-NEXT: movzwl (%edx,%ecx), %edx
+; X86-AVX-NEXT: vmovd %edx, %xmm0
+; X86-AVX-NEXT: vpmovsxbd %xmm0, %xmm0
+; X86-AVX-NEXT: movzwl (%eax,%ecx), %eax
+; X86-AVX-NEXT: vmovd %eax, %xmm1
+; X86-AVX-NEXT: vpmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
+; X86-AVX-NEXT: vpmulld %xmm0, %xmm1, %xmm0
; X86-AVX-NEXT: vmovq %xmm0, (%esi,%ecx,4)
; X86-AVX-NEXT: popl %esi
; X86-AVX-NEXT: retl
; X64-AVX-LABEL: mul_2xi8_sext_zext:
; X64-AVX: # %bb.0: # %entry
; X64-AVX-NEXT: movq {{.*}}(%rip), %rax
-; X64-AVX-NEXT: vpmovsxbq (%rdi,%rdx), %xmm0
-; X64-AVX-NEXT: vpmovzxbq {{.*#+}} xmm1 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero
-; X64-AVX-NEXT: vpmuludq %xmm0, %xmm1, %xmm0
-; X64-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; X64-AVX-NEXT: movzwl (%rdi,%rdx), %ecx
+; X64-AVX-NEXT: vmovd %ecx, %xmm0
+; X64-AVX-NEXT: vpmovsxbd %xmm0, %xmm0
+; X64-AVX-NEXT: movzwl (%rsi,%rdx), %ecx
+; X64-AVX-NEXT: vmovd %ecx, %xmm1
+; X64-AVX-NEXT: vpmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
+; X64-AVX-NEXT: vpmulld %xmm0, %xmm1, %xmm0
; X64-AVX-NEXT: vmovq %xmm0, (%rax,%rdx,4)
; X64-AVX-NEXT: retq
entry:
; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-AVX-NEXT: movl c, %esi
-; X86-AVX-NEXT: vpmovsxwq (%edx,%ecx), %xmm0
-; X86-AVX-NEXT: vpmovsxwq (%eax,%ecx), %xmm1
-; X86-AVX-NEXT: vpmuludq %xmm0, %xmm1, %xmm0
-; X86-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; X86-AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X86-AVX-NEXT: vpmovsxwd %xmm0, %xmm0
+; X86-AVX-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X86-AVX-NEXT: vpmovsxwd %xmm1, %xmm1
+; X86-AVX-NEXT: vpmulld %xmm0, %xmm1, %xmm0
; X86-AVX-NEXT: vmovq %xmm0, (%esi,%ecx,4)
; X86-AVX-NEXT: popl %esi
; X86-AVX-NEXT: retl
; X64-AVX-LABEL: mul_2xi16_sext:
; X64-AVX: # %bb.0: # %entry
; X64-AVX-NEXT: movq {{.*}}(%rip), %rax
-; X64-AVX-NEXT: vpmovsxwq (%rdi,%rdx), %xmm0
-; X64-AVX-NEXT: vpmovsxwq (%rsi,%rdx), %xmm1
-; X64-AVX-NEXT: vpmuludq %xmm0, %xmm1, %xmm0
-; X64-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; X64-AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X64-AVX-NEXT: vpmovsxwd %xmm0, %xmm0
+; X64-AVX-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X64-AVX-NEXT: vpmovsxwd %xmm1, %xmm1
+; X64-AVX-NEXT: vpmulld %xmm0, %xmm1, %xmm0
; X64-AVX-NEXT: vmovq %xmm0, (%rax,%rdx,4)
; X64-AVX-NEXT: retq
entry:
; X86-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X86-SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,1,4,5,6,7]
; X86-SSE-NEXT: psrad $16, %xmm0
-; X86-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3]
; X86-SSE-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X86-SSE-NEXT: pxor %xmm2, %xmm2
; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
-; X86-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,1,3]
+; X86-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
; X86-SSE-NEXT: pmuludq %xmm0, %xmm1
-; X86-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3]
-; X86-SSE-NEXT: movq %xmm0, (%esi,%ecx,4)
+; X86-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; X86-SSE-NEXT: pmuludq %xmm2, %xmm0
+; X86-SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; X86-SSE-NEXT: movq %xmm1, (%esi,%ecx,4)
; X86-SSE-NEXT: popl %esi
; X86-SSE-NEXT: retl
;
; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-AVX-NEXT: movl c, %esi
-; X86-AVX-NEXT: vpmovsxwq (%edx,%ecx), %xmm0
-; X86-AVX-NEXT: vpmovzxwq {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero
-; X86-AVX-NEXT: vpmuludq %xmm0, %xmm1, %xmm0
-; X86-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; X86-AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X86-AVX-NEXT: vpmovsxwd %xmm0, %xmm0
+; X86-AVX-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X86-AVX-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
+; X86-AVX-NEXT: vpmulld %xmm0, %xmm1, %xmm0
; X86-AVX-NEXT: vmovq %xmm0, (%esi,%ecx,4)
; X86-AVX-NEXT: popl %esi
; X86-AVX-NEXT: retl
; X64-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X64-SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,1,4,5,6,7]
; X64-SSE-NEXT: psrad $16, %xmm0
-; X64-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3]
; X64-SSE-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X64-SSE-NEXT: pxor %xmm2, %xmm2
; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
-; X64-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,1,3]
+; X64-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
; X64-SSE-NEXT: pmuludq %xmm0, %xmm1
-; X64-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3]
-; X64-SSE-NEXT: movq %xmm0, (%rax,%rdx,4)
+; X64-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; X64-SSE-NEXT: pmuludq %xmm2, %xmm0
+; X64-SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; X64-SSE-NEXT: movq %xmm1, (%rax,%rdx,4)
; X64-SSE-NEXT: retq
;
; X64-AVX-LABEL: mul_2xi16_sext_zext:
; X64-AVX: # %bb.0: # %entry
; X64-AVX-NEXT: movq {{.*}}(%rip), %rax
-; X64-AVX-NEXT: vpmovsxwq (%rdi,%rdx), %xmm0
-; X64-AVX-NEXT: vpmovzxwq {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero
-; X64-AVX-NEXT: vpmuludq %xmm0, %xmm1, %xmm0
-; X64-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; X64-AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X64-AVX-NEXT: vpmovsxwd %xmm0, %xmm0
+; X64-AVX-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X64-AVX-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
+; X64-AVX-NEXT: vpmulld %xmm0, %xmm1, %xmm0
; X64-AVX-NEXT: vmovq %xmm0, (%rax,%rdx,4)
; X64-AVX-NEXT: retq
entry:
; X86-SSE-NEXT: movd %ecx, %xmm0
; X86-SSE-NEXT: pxor %xmm1, %xmm1
; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
-; X86-SSE-NEXT: pmullw {{\.LCPI.*}}, %xmm0
; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; X86-SSE-NEXT: pmaddwd {{\.LCPI.*}}, %xmm0
; X86-SSE-NEXT: movq %xmm0, (%edx,%eax,4)
; X86-SSE-NEXT: retl
;
; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-AVX-NEXT: movl c, %edx
-; X86-AVX-NEXT: vpmovzxbq {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero
-; X86-AVX-NEXT: vpmuludq {{\.LCPI.*}}, %xmm0, %xmm0
-; X86-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; X86-AVX-NEXT: movzwl (%ecx,%eax), %ecx
+; X86-AVX-NEXT: vmovd %ecx, %xmm0
+; X86-AVX-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; X86-AVX-NEXT: vpmaddwd {{\.LCPI.*}}, %xmm0, %xmm0
; X86-AVX-NEXT: vmovq %xmm0, (%edx,%eax,4)
; X86-AVX-NEXT: retl
;
; X64-SSE-NEXT: movd %ecx, %xmm0
; X64-SSE-NEXT: pxor %xmm1, %xmm1
; X64-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
-; X64-SSE-NEXT: pmullw {{.*}}(%rip), %xmm0
; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; X64-SSE-NEXT: pmaddwd {{.*}}(%rip), %xmm0
; X64-SSE-NEXT: movq %xmm0, (%rax,%rsi,4)
; X64-SSE-NEXT: retq
;
; X64-AVX-LABEL: mul_2xi8_varconst1:
; X64-AVX: # %bb.0: # %entry
; X64-AVX-NEXT: movq {{.*}}(%rip), %rax
-; X64-AVX-NEXT: vpmovzxbq {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero
-; X64-AVX-NEXT: vpmuludq {{.*}}(%rip), %xmm0, %xmm0
-; X64-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; X64-AVX-NEXT: movzwl (%rdi,%rsi), %ecx
+; X64-AVX-NEXT: vmovd %ecx, %xmm0
+; X64-AVX-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; X64-AVX-NEXT: vpmaddwd {{.*}}(%rip), %xmm0, %xmm0
; X64-AVX-NEXT: vmovq %xmm0, (%rax,%rsi,4)
; X64-AVX-NEXT: retq
entry:
; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-AVX-NEXT: movl c, %edx
-; X86-AVX-NEXT: vpmovsxbq (%ecx,%eax), %xmm0
-; X86-AVX-NEXT: vpmuludq {{\.LCPI.*}}, %xmm0, %xmm0
-; X86-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; X86-AVX-NEXT: movzwl (%ecx,%eax), %ecx
+; X86-AVX-NEXT: vmovd %ecx, %xmm0
+; X86-AVX-NEXT: vpmovsxbd %xmm0, %xmm0
+; X86-AVX-NEXT: vpmulld {{\.LCPI.*}}, %xmm0, %xmm0
; X86-AVX-NEXT: vmovq %xmm0, (%edx,%eax,4)
; X86-AVX-NEXT: retl
;
; X64-AVX-LABEL: mul_2xi8_varconst2:
; X64-AVX: # %bb.0: # %entry
; X64-AVX-NEXT: movq {{.*}}(%rip), %rax
-; X64-AVX-NEXT: vpmovsxbq (%rdi,%rsi), %xmm0
-; X64-AVX-NEXT: vpmuludq {{.*}}(%rip), %xmm0, %xmm0
-; X64-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; X64-AVX-NEXT: movzwl (%rdi,%rsi), %ecx
+; X64-AVX-NEXT: vmovd %ecx, %xmm0
+; X64-AVX-NEXT: vpmovsxbd %xmm0, %xmm0
+; X64-AVX-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0
; X64-AVX-NEXT: vmovq %xmm0, (%rax,%rsi,4)
; X64-AVX-NEXT: retq
entry:
; X86-SSE-NEXT: movd %ecx, %xmm0
; X86-SSE-NEXT: pxor %xmm1, %xmm1
; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
-; X86-SSE-NEXT: movdqa {{.*#+}} xmm1 = <0,256,u,u,u,u,u,u>
-; X86-SSE-NEXT: movdqa %xmm0, %xmm2
-; X86-SSE-NEXT: pmulhw %xmm1, %xmm2
-; X86-SSE-NEXT: pmullw %xmm1, %xmm0
-; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; X86-SSE-NEXT: pmaddwd {{\.LCPI.*}}, %xmm0
; X86-SSE-NEXT: movq %xmm0, (%edx,%eax,4)
; X86-SSE-NEXT: retl
;
; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-AVX-NEXT: movl c, %edx
-; X86-AVX-NEXT: vpmovzxbq {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero
-; X86-AVX-NEXT: vpmuludq {{\.LCPI.*}}, %xmm0, %xmm0
-; X86-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; X86-AVX-NEXT: movzwl (%ecx,%eax), %ecx
+; X86-AVX-NEXT: vmovd %ecx, %xmm0
+; X86-AVX-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; X86-AVX-NEXT: vpmaddwd {{\.LCPI.*}}, %xmm0, %xmm0
; X86-AVX-NEXT: vmovq %xmm0, (%edx,%eax,4)
; X86-AVX-NEXT: retl
;
; X64-SSE-NEXT: movd %ecx, %xmm0
; X64-SSE-NEXT: pxor %xmm1, %xmm1
; X64-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
-; X64-SSE-NEXT: movdqa {{.*#+}} xmm1 = <0,256,u,u,u,u,u,u>
-; X64-SSE-NEXT: movdqa %xmm0, %xmm2
-; X64-SSE-NEXT: pmulhw %xmm1, %xmm2
-; X64-SSE-NEXT: pmullw %xmm1, %xmm0
-; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; X64-SSE-NEXT: pmaddwd {{.*}}(%rip), %xmm0
; X64-SSE-NEXT: movq %xmm0, (%rax,%rsi,4)
; X64-SSE-NEXT: retq
;
; X64-AVX-LABEL: mul_2xi8_varconst3:
; X64-AVX: # %bb.0: # %entry
; X64-AVX-NEXT: movq {{.*}}(%rip), %rax
-; X64-AVX-NEXT: vpmovzxbq {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero
-; X64-AVX-NEXT: vpmuludq {{.*}}(%rip), %xmm0, %xmm0
-; X64-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; X64-AVX-NEXT: movzwl (%rdi,%rsi), %ecx
+; X64-AVX-NEXT: vmovd %ecx, %xmm0
+; X64-AVX-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; X64-AVX-NEXT: vpmaddwd {{.*}}(%rip), %xmm0, %xmm0
; X64-AVX-NEXT: vmovq %xmm0, (%rax,%rsi,4)
; X64-AVX-NEXT: retq
entry:
; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-AVX-NEXT: movl c, %edx
-; X86-AVX-NEXT: vpmovzxbq {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero
-; X86-AVX-NEXT: vpmuludq {{\.LCPI.*}}, %xmm0, %xmm0
-; X86-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; X86-AVX-NEXT: movzwl (%ecx,%eax), %ecx
+; X86-AVX-NEXT: vmovd %ecx, %xmm0
+; X86-AVX-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; X86-AVX-NEXT: vpmulld {{\.LCPI.*}}, %xmm0, %xmm0
; X86-AVX-NEXT: vmovq %xmm0, (%edx,%eax,4)
; X86-AVX-NEXT: retl
;
; X64-AVX-LABEL: mul_2xi8_varconst4:
; X64-AVX: # %bb.0: # %entry
; X64-AVX-NEXT: movq {{.*}}(%rip), %rax
-; X64-AVX-NEXT: vpmovzxbq {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero
-; X64-AVX-NEXT: vpmuludq {{.*}}(%rip), %xmm0, %xmm0
-; X64-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; X64-AVX-NEXT: movzwl (%rdi,%rsi), %ecx
+; X64-AVX-NEXT: vmovd %ecx, %xmm0
+; X64-AVX-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; X64-AVX-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0
; X64-AVX-NEXT: vmovq %xmm0, (%rax,%rsi,4)
; X64-AVX-NEXT: retq
entry:
; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-AVX-NEXT: movl c, %edx
-; X86-AVX-NEXT: vpmovsxbq (%ecx,%eax), %xmm0
-; X86-AVX-NEXT: vpmuludq {{\.LCPI.*}}, %xmm0, %xmm0
-; X86-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; X86-AVX-NEXT: movzwl (%ecx,%eax), %ecx
+; X86-AVX-NEXT: vmovd %ecx, %xmm0
+; X86-AVX-NEXT: vpmovsxbd %xmm0, %xmm0
+; X86-AVX-NEXT: vpmulld {{\.LCPI.*}}, %xmm0, %xmm0
; X86-AVX-NEXT: vmovq %xmm0, (%edx,%eax,4)
; X86-AVX-NEXT: retl
;
; X64-AVX-LABEL: mul_2xi8_varconst5:
; X64-AVX: # %bb.0: # %entry
; X64-AVX-NEXT: movq {{.*}}(%rip), %rax
-; X64-AVX-NEXT: vpmovsxbq (%rdi,%rsi), %xmm0
-; X64-AVX-NEXT: vpmuludq {{.*}}(%rip), %xmm0, %xmm0
-; X64-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; X64-AVX-NEXT: movzwl (%rdi,%rsi), %ecx
+; X64-AVX-NEXT: vmovd %ecx, %xmm0
+; X64-AVX-NEXT: vpmovsxbd %xmm0, %xmm0
+; X64-AVX-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0
; X64-AVX-NEXT: vmovq %xmm0, (%rax,%rsi,4)
; X64-AVX-NEXT: retq
entry:
; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-AVX-NEXT: movl c, %edx
-; X86-AVX-NEXT: vpmovsxbq (%ecx,%eax), %xmm0
-; X86-AVX-NEXT: vpmuludq {{\.LCPI.*}}, %xmm0, %xmm0
-; X86-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; X86-AVX-NEXT: movzwl (%ecx,%eax), %ecx
+; X86-AVX-NEXT: vmovd %ecx, %xmm0
+; X86-AVX-NEXT: vpmovsxbd %xmm0, %xmm0
+; X86-AVX-NEXT: vpmulld {{\.LCPI.*}}, %xmm0, %xmm0
; X86-AVX-NEXT: vmovq %xmm0, (%edx,%eax,4)
; X86-AVX-NEXT: retl
;
; X64-AVX-LABEL: mul_2xi8_varconst6:
; X64-AVX: # %bb.0: # %entry
; X64-AVX-NEXT: movq {{.*}}(%rip), %rax
-; X64-AVX-NEXT: vpmovsxbq (%rdi,%rsi), %xmm0
-; X64-AVX-NEXT: vpmuludq {{.*}}(%rip), %xmm0, %xmm0
-; X64-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; X64-AVX-NEXT: movzwl (%rdi,%rsi), %ecx
+; X64-AVX-NEXT: vmovd %ecx, %xmm0
+; X64-AVX-NEXT: vpmovsxbd %xmm0, %xmm0
+; X64-AVX-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0
; X64-AVX-NEXT: vmovq %xmm0, (%rax,%rsi,4)
; X64-AVX-NEXT: retq
entry:
; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-AVX-NEXT: movl c, %edx
-; X86-AVX-NEXT: vpmovzxwq {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero
-; X86-AVX-NEXT: vpmuludq {{\.LCPI.*}}, %xmm0, %xmm0
-; X86-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; X86-AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X86-AVX-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; X86-AVX-NEXT: vpmulld {{\.LCPI.*}}, %xmm0, %xmm0
; X86-AVX-NEXT: vmovq %xmm0, (%edx,%eax,4)
; X86-AVX-NEXT: retl
;
; X64-AVX-LABEL: mul_2xi16_varconst1:
; X64-AVX: # %bb.0: # %entry
; X64-AVX-NEXT: movq {{.*}}(%rip), %rax
-; X64-AVX-NEXT: vpmovzxwq {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero
-; X64-AVX-NEXT: vpmuludq {{.*}}(%rip), %xmm0, %xmm0
-; X64-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; X64-AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X64-AVX-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; X64-AVX-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0
; X64-AVX-NEXT: vmovq %xmm0, (%rax,%rsi,4)
; X64-AVX-NEXT: retq
entry:
; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-AVX-NEXT: movl c, %edx
-; X86-AVX-NEXT: vpmovsxwq (%ecx,%eax), %xmm0
-; X86-AVX-NEXT: vpmuludq {{\.LCPI.*}}, %xmm0, %xmm0
-; X86-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; X86-AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X86-AVX-NEXT: vpmovsxwd %xmm0, %xmm0
+; X86-AVX-NEXT: vpmulld {{\.LCPI.*}}, %xmm0, %xmm0
; X86-AVX-NEXT: vmovq %xmm0, (%edx,%eax,4)
; X86-AVX-NEXT: retl
;
; X64-AVX-LABEL: mul_2xi16_varconst2:
; X64-AVX: # %bb.0: # %entry
; X64-AVX-NEXT: movq {{.*}}(%rip), %rax
-; X64-AVX-NEXT: vpmovsxwq (%rdi,%rsi), %xmm0
-; X64-AVX-NEXT: vpmuludq {{.*}}(%rip), %xmm0, %xmm0
-; X64-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; X64-AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X64-AVX-NEXT: vpmovsxwd %xmm0, %xmm0
+; X64-AVX-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0
; X64-AVX-NEXT: vmovq %xmm0, (%rax,%rsi,4)
; X64-AVX-NEXT: retq
entry:
; X86-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X86-SSE-NEXT: pxor %xmm1, %xmm1
; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; X86-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3]
-; X86-SSE-NEXT: pmuludq {{\.LCPI.*}}, %xmm0
-; X86-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; X86-SSE-NEXT: movdqa {{.*#+}} xmm1 = <0,65536,u,u>
+; X86-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
+; X86-SSE-NEXT: pmuludq %xmm1, %xmm0
+; X86-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; X86-SSE-NEXT: pmuludq %xmm2, %xmm1
+; X86-SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; X86-SSE-NEXT: movq %xmm0, (%edx,%eax,4)
; X86-SSE-NEXT: retl
;
; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-AVX-NEXT: movl c, %edx
-; X86-AVX-NEXT: vpmovzxwq {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero
-; X86-AVX-NEXT: vpmuludq {{\.LCPI.*}}, %xmm0, %xmm0
-; X86-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; X86-AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X86-AVX-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; X86-AVX-NEXT: vpmulld {{\.LCPI.*}}, %xmm0, %xmm0
; X86-AVX-NEXT: vmovq %xmm0, (%edx,%eax,4)
; X86-AVX-NEXT: retl
;
; X64-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X64-SSE-NEXT: pxor %xmm1, %xmm1
; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; X64-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3]
-; X64-SSE-NEXT: pmuludq {{.*}}(%rip), %xmm0
-; X64-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; X64-SSE-NEXT: movdqa {{.*#+}} xmm1 = <0,65536,u,u>
+; X64-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
+; X64-SSE-NEXT: pmuludq %xmm1, %xmm0
+; X64-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; X64-SSE-NEXT: pmuludq %xmm2, %xmm1
+; X64-SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; X64-SSE-NEXT: movq %xmm0, (%rax,%rsi,4)
; X64-SSE-NEXT: retq
;
; X64-AVX-LABEL: mul_2xi16_varconst3:
; X64-AVX: # %bb.0: # %entry
; X64-AVX-NEXT: movq {{.*}}(%rip), %rax
-; X64-AVX-NEXT: vpmovzxwq {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero
-; X64-AVX-NEXT: vpmuludq {{.*}}(%rip), %xmm0, %xmm0
-; X64-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; X64-AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X64-AVX-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; X64-AVX-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0
; X64-AVX-NEXT: vmovq %xmm0, (%rax,%rsi,4)
; X64-AVX-NEXT: retq
entry:
; X86-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X86-SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,1,4,5,6,7]
; X86-SSE-NEXT: psrad $16, %xmm0
-; X86-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3]
-; X86-SSE-NEXT: pmuludq {{\.LCPI.*}}, %xmm0
-; X86-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; X86-SSE-NEXT: movdqa {{.*#+}} xmm1 = <0,32768,u,u>
+; X86-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
+; X86-SSE-NEXT: pmuludq %xmm1, %xmm0
+; X86-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; X86-SSE-NEXT: pmuludq %xmm2, %xmm1
+; X86-SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; X86-SSE-NEXT: movq %xmm0, (%edx,%eax,4)
; X86-SSE-NEXT: retl
;
; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-AVX-NEXT: movl c, %edx
-; X86-AVX-NEXT: vpmovsxwq (%ecx,%eax), %xmm0
-; X86-AVX-NEXT: vpmuludq {{\.LCPI.*}}, %xmm0, %xmm0
-; X86-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; X86-AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X86-AVX-NEXT: vpmovsxwd %xmm0, %xmm0
+; X86-AVX-NEXT: vpmulld {{\.LCPI.*}}, %xmm0, %xmm0
; X86-AVX-NEXT: vmovq %xmm0, (%edx,%eax,4)
; X86-AVX-NEXT: retl
;
; X64-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X64-SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,1,4,5,6,7]
; X64-SSE-NEXT: psrad $16, %xmm0
-; X64-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3]
-; X64-SSE-NEXT: pmuludq {{.*}}(%rip), %xmm0
-; X64-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; X64-SSE-NEXT: movdqa {{.*#+}} xmm1 = <0,32768,u,u>
+; X64-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
+; X64-SSE-NEXT: pmuludq %xmm1, %xmm0
+; X64-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; X64-SSE-NEXT: pmuludq %xmm2, %xmm1
+; X64-SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; X64-SSE-NEXT: movq %xmm0, (%rax,%rsi,4)
; X64-SSE-NEXT: retq
;
; X64-AVX-LABEL: mul_2xi16_varconst4:
; X64-AVX: # %bb.0: # %entry
; X64-AVX-NEXT: movq {{.*}}(%rip), %rax
-; X64-AVX-NEXT: vpmovsxwq (%rdi,%rsi), %xmm0
-; X64-AVX-NEXT: vpmuludq {{.*}}(%rip), %xmm0, %xmm0
-; X64-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; X64-AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X64-AVX-NEXT: vpmovsxwd %xmm0, %xmm0
+; X64-AVX-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0
; X64-AVX-NEXT: vmovq %xmm0, (%rax,%rsi,4)
; X64-AVX-NEXT: retq
entry:
; AVX-NEXT: vmovq %xmm0, (%rsi)
; AVX-NEXT: retq
;
-; AVX512F-LABEL: shuffle_v16i8_to_v8i8_1:
-; AVX512F: # %bb.0:
-; AVX512F-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u]
-; AVX512F-NEXT: vmovq %xmm0, (%rsi)
-; AVX512F-NEXT: retq
-;
-; AVX512VL-LABEL: shuffle_v16i8_to_v8i8_1:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u]
-; AVX512VL-NEXT: vmovq %xmm0, (%rsi)
-; AVX512VL-NEXT: retq
-;
-; AVX512BW-LABEL: shuffle_v16i8_to_v8i8_1:
-; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u]
-; AVX512BW-NEXT: vmovq %xmm0, (%rsi)
-; AVX512BW-NEXT: retq
-;
-; AVX512BWVL-LABEL: shuffle_v16i8_to_v8i8_1:
-; AVX512BWVL: # %bb.0:
-; AVX512BWVL-NEXT: vpsrlw $8, (%rdi), %xmm0
-; AVX512BWVL-NEXT: vpmovwb %xmm0, (%rsi)
-; AVX512BWVL-NEXT: retq
+; AVX512-LABEL: shuffle_v16i8_to_v8i8_1:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vmovdqa (%rdi), %xmm0
+; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u]
+; AVX512-NEXT: vmovq %xmm0, (%rsi)
+; AVX512-NEXT: retq
%vec = load <16 x i8>, <16 x i8>* %L
%strided.vec = shufflevector <16 x i8> %vec, <16 x i8> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
store <8 x i8> %strided.vec, <8 x i8>* %S
; AVX-NEXT: vmovq %xmm0, (%rsi)
; AVX-NEXT: retq
;
-; AVX512F-LABEL: shuffle_v8i16_to_v4i16_1:
-; AVX512F: # %bb.0:
-; AVX512F-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15]
-; AVX512F-NEXT: vmovq %xmm0, (%rsi)
-; AVX512F-NEXT: retq
-;
-; AVX512VL-LABEL: shuffle_v8i16_to_v4i16_1:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpsrld $16, (%rdi), %xmm0
-; AVX512VL-NEXT: vpmovdw %xmm0, (%rsi)
-; AVX512VL-NEXT: retq
-;
-; AVX512BW-LABEL: shuffle_v8i16_to_v4i16_1:
-; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15]
-; AVX512BW-NEXT: vmovq %xmm0, (%rsi)
-; AVX512BW-NEXT: retq
-;
-; AVX512BWVL-LABEL: shuffle_v8i16_to_v4i16_1:
-; AVX512BWVL: # %bb.0:
-; AVX512BWVL-NEXT: vpsrld $16, (%rdi), %xmm0
-; AVX512BWVL-NEXT: vpmovdw %xmm0, (%rsi)
-; AVX512BWVL-NEXT: retq
+; AVX512-LABEL: shuffle_v8i16_to_v4i16_1:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vmovdqa (%rdi), %xmm0
+; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15]
+; AVX512-NEXT: vmovq %xmm0, (%rsi)
+; AVX512-NEXT: retq
%vec = load <8 x i16>, <8 x i16>* %L
%strided.vec = shufflevector <8 x i16> %vec, <8 x i16> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
store <4 x i16> %strided.vec, <4 x i16>* %S
; AVX-NEXT: vmovlps %xmm0, (%rsi)
; AVX-NEXT: retq
;
-; AVX512F-LABEL: shuffle_v4i32_to_v2i32_1:
-; AVX512F: # %bb.0:
-; AVX512F-NEXT: vpermilps {{.*#+}} xmm0 = mem[1,3,2,3]
-; AVX512F-NEXT: vmovlps %xmm0, (%rsi)
-; AVX512F-NEXT: retq
-;
-; AVX512VL-LABEL: shuffle_v4i32_to_v2i32_1:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpshufd {{.*#+}} xmm0 = mem[1,1,3,3]
-; AVX512VL-NEXT: vpmovqd %xmm0, (%rsi)
-; AVX512VL-NEXT: retq
-;
-; AVX512BW-LABEL: shuffle_v4i32_to_v2i32_1:
-; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vpermilps {{.*#+}} xmm0 = mem[1,3,2,3]
-; AVX512BW-NEXT: vmovlps %xmm0, (%rsi)
-; AVX512BW-NEXT: retq
-;
-; AVX512BWVL-LABEL: shuffle_v4i32_to_v2i32_1:
-; AVX512BWVL: # %bb.0:
-; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm0 = mem[1,1,3,3]
-; AVX512BWVL-NEXT: vpmovqd %xmm0, (%rsi)
-; AVX512BWVL-NEXT: retq
+; AVX512-LABEL: shuffle_v4i32_to_v2i32_1:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vpermilps {{.*#+}} xmm0 = mem[1,3,2,3]
+; AVX512-NEXT: vmovlps %xmm0, (%rsi)
+; AVX512-NEXT: retq
%vec = load <4 x i32>, <4 x i32>* %L
%strided.vec = shufflevector <4 x i32> %vec, <4 x i32> undef, <2 x i32> <i32 1, i32 3>
store <2 x i32> %strided.vec, <2 x i32>* %S
; AVX-NEXT: vmovd %xmm0, (%rsi)
; AVX-NEXT: retq
;
-; AVX512F-LABEL: shuffle_v16i8_to_v4i8_1:
-; AVX512F: # %bb.0:
-; AVX512F-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512F-NEXT: vmovd %xmm0, (%rsi)
-; AVX512F-NEXT: retq
-;
-; AVX512VL-LABEL: shuffle_v16i8_to_v4i8_1:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512VL-NEXT: vpsrlw $8, %xmm0, %xmm0
-; AVX512VL-NEXT: vpmovdb %xmm0, (%rsi)
-; AVX512VL-NEXT: retq
-;
-; AVX512BW-LABEL: shuffle_v16i8_to_v4i8_1:
-; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512BW-NEXT: vmovd %xmm0, (%rsi)
-; AVX512BW-NEXT: retq
-;
-; AVX512BWVL-LABEL: shuffle_v16i8_to_v4i8_1:
-; AVX512BWVL: # %bb.0:
-; AVX512BWVL-NEXT: vpsrlw $8, (%rdi), %xmm0
-; AVX512BWVL-NEXT: vpmovdb %xmm0, (%rsi)
-; AVX512BWVL-NEXT: retq
+; AVX512-LABEL: shuffle_v16i8_to_v4i8_1:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vmovdqa (%rdi), %xmm0
+; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512-NEXT: vmovd %xmm0, (%rsi)
+; AVX512-NEXT: retq
%vec = load <16 x i8>, <16 x i8>* %L
%strided.vec = shufflevector <16 x i8> %vec, <16 x i8> undef, <4 x i32> <i32 1, i32 5, i32 9, i32 13>
store <4 x i8> %strided.vec, <4 x i8>* %S
; AVX-NEXT: vmovd %xmm0, (%rsi)
; AVX-NEXT: retq
;
-; AVX512F-LABEL: shuffle_v16i8_to_v4i8_2:
-; AVX512F: # %bb.0:
-; AVX512F-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512F-NEXT: vmovd %xmm0, (%rsi)
-; AVX512F-NEXT: retq
-;
-; AVX512VL-LABEL: shuffle_v16i8_to_v4i8_2:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpsrld $16, (%rdi), %xmm0
-; AVX512VL-NEXT: vpmovdb %xmm0, (%rsi)
-; AVX512VL-NEXT: retq
-;
-; AVX512BW-LABEL: shuffle_v16i8_to_v4i8_2:
-; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512BW-NEXT: vmovd %xmm0, (%rsi)
-; AVX512BW-NEXT: retq
-;
-; AVX512BWVL-LABEL: shuffle_v16i8_to_v4i8_2:
-; AVX512BWVL: # %bb.0:
-; AVX512BWVL-NEXT: vpsrld $16, (%rdi), %xmm0
-; AVX512BWVL-NEXT: vpmovdb %xmm0, (%rsi)
-; AVX512BWVL-NEXT: retq
+; AVX512-LABEL: shuffle_v16i8_to_v4i8_2:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vmovdqa (%rdi), %xmm0
+; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512-NEXT: vmovd %xmm0, (%rsi)
+; AVX512-NEXT: retq
%vec = load <16 x i8>, <16 x i8>* %L
%strided.vec = shufflevector <16 x i8> %vec, <16 x i8> undef, <4 x i32> <i32 2, i32 6, i32 10, i32 14>
store <4 x i8> %strided.vec, <4 x i8>* %S
; AVX-NEXT: vmovd %xmm0, (%rsi)
; AVX-NEXT: retq
;
-; AVX512F-LABEL: shuffle_v16i8_to_v4i8_3:
-; AVX512F: # %bb.0:
-; AVX512F-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512F-NEXT: vmovd %xmm0, (%rsi)
-; AVX512F-NEXT: retq
-;
-; AVX512VL-LABEL: shuffle_v16i8_to_v4i8_3:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpsrld $24, (%rdi), %xmm0
-; AVX512VL-NEXT: vpmovdb %xmm0, (%rsi)
-; AVX512VL-NEXT: retq
-;
-; AVX512BW-LABEL: shuffle_v16i8_to_v4i8_3:
-; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512BW-NEXT: vmovd %xmm0, (%rsi)
-; AVX512BW-NEXT: retq
-;
-; AVX512BWVL-LABEL: shuffle_v16i8_to_v4i8_3:
-; AVX512BWVL: # %bb.0:
-; AVX512BWVL-NEXT: vpsrld $24, (%rdi), %xmm0
-; AVX512BWVL-NEXT: vpmovdb %xmm0, (%rsi)
-; AVX512BWVL-NEXT: retq
+; AVX512-LABEL: shuffle_v16i8_to_v4i8_3:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vmovdqa (%rdi), %xmm0
+; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512-NEXT: vmovd %xmm0, (%rsi)
+; AVX512-NEXT: retq
%vec = load <16 x i8>, <16 x i8>* %L
%strided.vec = shufflevector <16 x i8> %vec, <16 x i8> undef, <4 x i32> <i32 3, i32 7, i32 11, i32 15>
store <4 x i8> %strided.vec, <4 x i8>* %S
;
; AVX512VL-LABEL: shuffle_v8i16_to_v2i16_1:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpsrld $16, (%rdi), %xmm0
-; AVX512VL-NEXT: vpmovqw %xmm0, (%rsi)
+; AVX512VL-NEXT: vmovdqa (%rdi), %xmm0
+; AVX512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,3,10,11,8,9,10,11,8,9,10,11,12,13,14,15]
+; AVX512VL-NEXT: vmovd %xmm0, (%rsi)
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: shuffle_v8i16_to_v2i16_1:
;
; AVX512BWVL-LABEL: shuffle_v8i16_to_v2i16_1:
; AVX512BWVL: # %bb.0:
-; AVX512BWVL-NEXT: vpsrld $16, (%rdi), %xmm0
-; AVX512BWVL-NEXT: vpmovqw %xmm0, (%rsi)
+; AVX512BWVL-NEXT: vmovdqa (%rdi), %xmm0
+; AVX512BWVL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,3,10,11,8,9,10,11,8,9,10,11,12,13,14,15]
+; AVX512BWVL-NEXT: vmovd %xmm0, (%rsi)
; AVX512BWVL-NEXT: retq
%vec = load <8 x i16>, <8 x i16>* %L
%strided.vec = shufflevector <8 x i16> %vec, <8 x i16> undef, <2 x i32> <i32 1, i32 5>
;
; AVX512VL-LABEL: shuffle_v8i16_to_v2i16_2:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpshufd {{.*#+}} xmm0 = mem[1,1,3,3]
-; AVX512VL-NEXT: vpmovqw %xmm0, (%rsi)
+; AVX512VL-NEXT: vmovdqa (%rdi), %xmm0
+; AVX512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,5,12,13,4,5,6,7,8,9,10,11,12,13,14,15]
+; AVX512VL-NEXT: vmovd %xmm0, (%rsi)
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: shuffle_v8i16_to_v2i16_2:
;
; AVX512BWVL-LABEL: shuffle_v8i16_to_v2i16_2:
; AVX512BWVL: # %bb.0:
-; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm0 = mem[1,1,3,3]
-; AVX512BWVL-NEXT: vpmovqw %xmm0, (%rsi)
+; AVX512BWVL-NEXT: vmovdqa (%rdi), %xmm0
+; AVX512BWVL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,5,12,13,4,5,6,7,8,9,10,11,12,13,14,15]
+; AVX512BWVL-NEXT: vmovd %xmm0, (%rsi)
; AVX512BWVL-NEXT: retq
%vec = load <8 x i16>, <8 x i16>* %L
%strided.vec = shufflevector <8 x i16> %vec, <8 x i16> undef, <2 x i32> <i32 2, i32 6>
;
; AVX512VL-LABEL: shuffle_v8i16_to_v2i16_3:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpsrlq $48, (%rdi), %xmm0
-; AVX512VL-NEXT: vpmovqw %xmm0, (%rsi)
+; AVX512VL-NEXT: vmovdqa (%rdi), %xmm0
+; AVX512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[6,7,14,15,4,5,6,7,8,9,10,11,12,13,14,15]
+; AVX512VL-NEXT: vmovd %xmm0, (%rsi)
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: shuffle_v8i16_to_v2i16_3:
;
; AVX512BWVL-LABEL: shuffle_v8i16_to_v2i16_3:
; AVX512BWVL: # %bb.0:
-; AVX512BWVL-NEXT: vpsrlq $48, (%rdi), %xmm0
-; AVX512BWVL-NEXT: vpmovqw %xmm0, (%rsi)
+; AVX512BWVL-NEXT: vmovdqa (%rdi), %xmm0
+; AVX512BWVL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[6,7,14,15,4,5,6,7,8,9,10,11,12,13,14,15]
+; AVX512BWVL-NEXT: vmovd %xmm0, (%rsi)
; AVX512BWVL-NEXT: retq
%vec = load <8 x i16>, <8 x i16>* %L
%strided.vec = shufflevector <8 x i16> %vec, <8 x i16> undef, <2 x i32> <i32 3, i32 7>
; AVX-NEXT: vpextrw $0, %xmm0, (%rsi)
; AVX-NEXT: retq
;
-; AVX512F-LABEL: shuffle_v16i8_to_v2i8_1:
-; AVX512F: # %bb.0:
-; AVX512F-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[1,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512F-NEXT: vpextrw $0, %xmm0, (%rsi)
-; AVX512F-NEXT: retq
-;
-; AVX512VL-LABEL: shuffle_v16i8_to_v2i8_1:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512VL-NEXT: vpsrlw $8, %xmm0, %xmm0
-; AVX512VL-NEXT: vpmovqb %xmm0, (%rsi)
-; AVX512VL-NEXT: retq
-;
-; AVX512BW-LABEL: shuffle_v16i8_to_v2i8_1:
-; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[1,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512BW-NEXT: vpextrw $0, %xmm0, (%rsi)
-; AVX512BW-NEXT: retq
-;
-; AVX512BWVL-LABEL: shuffle_v16i8_to_v2i8_1:
-; AVX512BWVL: # %bb.0:
-; AVX512BWVL-NEXT: vpsrlw $8, (%rdi), %xmm0
-; AVX512BWVL-NEXT: vpmovqb %xmm0, (%rsi)
-; AVX512BWVL-NEXT: retq
+; AVX512-LABEL: shuffle_v16i8_to_v2i8_1:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vmovdqa (%rdi), %xmm0
+; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[1,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512-NEXT: vpextrw $0, %xmm0, (%rsi)
+; AVX512-NEXT: retq
%vec = load <16 x i8>, <16 x i8>* %L
%strided.vec = shufflevector <16 x i8> %vec, <16 x i8> undef, <2 x i32> <i32 1, i32 9>
store <2 x i8> %strided.vec, <2 x i8>* %S
; AVX-NEXT: vpextrw $0, %xmm0, (%rsi)
; AVX-NEXT: retq
;
-; AVX512F-LABEL: shuffle_v16i8_to_v2i8_2:
-; AVX512F: # %bb.0:
-; AVX512F-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512F-NEXT: vpextrw $0, %xmm0, (%rsi)
-; AVX512F-NEXT: retq
-;
-; AVX512VL-LABEL: shuffle_v16i8_to_v2i8_2:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpsrld $16, (%rdi), %xmm0
-; AVX512VL-NEXT: vpmovqb %xmm0, (%rsi)
-; AVX512VL-NEXT: retq
-;
-; AVX512BW-LABEL: shuffle_v16i8_to_v2i8_2:
-; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512BW-NEXT: vpextrw $0, %xmm0, (%rsi)
-; AVX512BW-NEXT: retq
-;
-; AVX512BWVL-LABEL: shuffle_v16i8_to_v2i8_2:
-; AVX512BWVL: # %bb.0:
-; AVX512BWVL-NEXT: vpsrld $16, (%rdi), %xmm0
-; AVX512BWVL-NEXT: vpmovqb %xmm0, (%rsi)
-; AVX512BWVL-NEXT: retq
+; AVX512-LABEL: shuffle_v16i8_to_v2i8_2:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vmovdqa (%rdi), %xmm0
+; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512-NEXT: vpextrw $0, %xmm0, (%rsi)
+; AVX512-NEXT: retq
%vec = load <16 x i8>, <16 x i8>* %L
%strided.vec = shufflevector <16 x i8> %vec, <16 x i8> undef, <2 x i32> <i32 2, i32 10>
store <2 x i8> %strided.vec, <2 x i8>* %S
; AVX-NEXT: vpextrw $0, %xmm0, (%rsi)
; AVX-NEXT: retq
;
-; AVX512F-LABEL: shuffle_v16i8_to_v2i8_3:
-; AVX512F: # %bb.0:
-; AVX512F-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[3,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512F-NEXT: vpextrw $0, %xmm0, (%rsi)
-; AVX512F-NEXT: retq
-;
-; AVX512VL-LABEL: shuffle_v16i8_to_v2i8_3:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpsrld $24, (%rdi), %xmm0
-; AVX512VL-NEXT: vpmovqb %xmm0, (%rsi)
-; AVX512VL-NEXT: retq
-;
-; AVX512BW-LABEL: shuffle_v16i8_to_v2i8_3:
-; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[3,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512BW-NEXT: vpextrw $0, %xmm0, (%rsi)
-; AVX512BW-NEXT: retq
-;
-; AVX512BWVL-LABEL: shuffle_v16i8_to_v2i8_3:
-; AVX512BWVL: # %bb.0:
-; AVX512BWVL-NEXT: vpsrld $24, (%rdi), %xmm0
-; AVX512BWVL-NEXT: vpmovqb %xmm0, (%rsi)
-; AVX512BWVL-NEXT: retq
+; AVX512-LABEL: shuffle_v16i8_to_v2i8_3:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vmovdqa (%rdi), %xmm0
+; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[3,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512-NEXT: vpextrw $0, %xmm0, (%rsi)
+; AVX512-NEXT: retq
%vec = load <16 x i8>, <16 x i8>* %L
%strided.vec = shufflevector <16 x i8> %vec, <16 x i8> undef, <2 x i32> <i32 3, i32 11>
store <2 x i8> %strided.vec, <2 x i8>* %S
; AVX-NEXT: vpextrw $0, %xmm0, (%rsi)
; AVX-NEXT: retq
;
-; AVX512F-LABEL: shuffle_v16i8_to_v2i8_4:
-; AVX512F: # %bb.0:
-; AVX512F-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512F-NEXT: vpextrw $0, %xmm0, (%rsi)
-; AVX512F-NEXT: retq
-;
-; AVX512VL-LABEL: shuffle_v16i8_to_v2i8_4:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpshufd {{.*#+}} xmm0 = mem[1,1,3,3]
-; AVX512VL-NEXT: vpmovqb %xmm0, (%rsi)
-; AVX512VL-NEXT: retq
-;
-; AVX512BW-LABEL: shuffle_v16i8_to_v2i8_4:
-; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512BW-NEXT: vpextrw $0, %xmm0, (%rsi)
-; AVX512BW-NEXT: retq
-;
-; AVX512BWVL-LABEL: shuffle_v16i8_to_v2i8_4:
-; AVX512BWVL: # %bb.0:
-; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm0 = mem[1,1,3,3]
-; AVX512BWVL-NEXT: vpmovqb %xmm0, (%rsi)
-; AVX512BWVL-NEXT: retq
+; AVX512-LABEL: shuffle_v16i8_to_v2i8_4:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vmovdqa (%rdi), %xmm0
+; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512-NEXT: vpextrw $0, %xmm0, (%rsi)
+; AVX512-NEXT: retq
%vec = load <16 x i8>, <16 x i8>* %L
%strided.vec = shufflevector <16 x i8> %vec, <16 x i8> undef, <2 x i32> <i32 4, i32 12>
store <2 x i8> %strided.vec, <2 x i8>* %S
; AVX-NEXT: vpextrw $0, %xmm0, (%rsi)
; AVX-NEXT: retq
;
-; AVX512F-LABEL: shuffle_v16i8_to_v2i8_5:
-; AVX512F: # %bb.0:
-; AVX512F-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[5,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512F-NEXT: vpextrw $0, %xmm0, (%rsi)
-; AVX512F-NEXT: retq
-;
-; AVX512VL-LABEL: shuffle_v16i8_to_v2i8_5:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpsrlq $40, (%rdi), %xmm0
-; AVX512VL-NEXT: vpmovqb %xmm0, (%rsi)
-; AVX512VL-NEXT: retq
-;
-; AVX512BW-LABEL: shuffle_v16i8_to_v2i8_5:
-; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[5,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512BW-NEXT: vpextrw $0, %xmm0, (%rsi)
-; AVX512BW-NEXT: retq
-;
-; AVX512BWVL-LABEL: shuffle_v16i8_to_v2i8_5:
-; AVX512BWVL: # %bb.0:
-; AVX512BWVL-NEXT: vpsrlq $40, (%rdi), %xmm0
-; AVX512BWVL-NEXT: vpmovqb %xmm0, (%rsi)
-; AVX512BWVL-NEXT: retq
+; AVX512-LABEL: shuffle_v16i8_to_v2i8_5:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vmovdqa (%rdi), %xmm0
+; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[5,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512-NEXT: vpextrw $0, %xmm0, (%rsi)
+; AVX512-NEXT: retq
%vec = load <16 x i8>, <16 x i8>* %L
%strided.vec = shufflevector <16 x i8> %vec, <16 x i8> undef, <2 x i32> <i32 5, i32 13>
store <2 x i8> %strided.vec, <2 x i8>* %S
; AVX-NEXT: vpextrw $0, %xmm0, (%rsi)
; AVX-NEXT: retq
;
-; AVX512F-LABEL: shuffle_v16i8_to_v2i8_6:
-; AVX512F: # %bb.0:
-; AVX512F-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[6,14,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512F-NEXT: vpextrw $0, %xmm0, (%rsi)
-; AVX512F-NEXT: retq
-;
-; AVX512VL-LABEL: shuffle_v16i8_to_v2i8_6:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpsrlq $48, (%rdi), %xmm0
-; AVX512VL-NEXT: vpmovqb %xmm0, (%rsi)
-; AVX512VL-NEXT: retq
-;
-; AVX512BW-LABEL: shuffle_v16i8_to_v2i8_6:
-; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[6,14,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512BW-NEXT: vpextrw $0, %xmm0, (%rsi)
-; AVX512BW-NEXT: retq
-;
-; AVX512BWVL-LABEL: shuffle_v16i8_to_v2i8_6:
-; AVX512BWVL: # %bb.0:
-; AVX512BWVL-NEXT: vpsrlq $48, (%rdi), %xmm0
-; AVX512BWVL-NEXT: vpmovqb %xmm0, (%rsi)
-; AVX512BWVL-NEXT: retq
+; AVX512-LABEL: shuffle_v16i8_to_v2i8_6:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vmovdqa (%rdi), %xmm0
+; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[6,14,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512-NEXT: vpextrw $0, %xmm0, (%rsi)
+; AVX512-NEXT: retq
%vec = load <16 x i8>, <16 x i8>* %L
%strided.vec = shufflevector <16 x i8> %vec, <16 x i8> undef, <2 x i32> <i32 6, i32 14>
store <2 x i8> %strided.vec, <2 x i8>* %S
; AVX-NEXT: vpextrw $0, %xmm0, (%rsi)
; AVX-NEXT: retq
;
-; AVX512F-LABEL: shuffle_v16i8_to_v2i8_7:
-; AVX512F: # %bb.0:
-; AVX512F-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[7,15,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512F-NEXT: vpextrw $0, %xmm0, (%rsi)
-; AVX512F-NEXT: retq
-;
-; AVX512VL-LABEL: shuffle_v16i8_to_v2i8_7:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpsrlq $56, (%rdi), %xmm0
-; AVX512VL-NEXT: vpmovqb %xmm0, (%rsi)
-; AVX512VL-NEXT: retq
-;
-; AVX512BW-LABEL: shuffle_v16i8_to_v2i8_7:
-; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[7,15,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512BW-NEXT: vpextrw $0, %xmm0, (%rsi)
-; AVX512BW-NEXT: retq
-;
-; AVX512BWVL-LABEL: shuffle_v16i8_to_v2i8_7:
-; AVX512BWVL: # %bb.0:
-; AVX512BWVL-NEXT: vpsrlq $56, (%rdi), %xmm0
-; AVX512BWVL-NEXT: vpmovqb %xmm0, (%rsi)
-; AVX512BWVL-NEXT: retq
+; AVX512-LABEL: shuffle_v16i8_to_v2i8_7:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vmovdqa (%rdi), %xmm0
+; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[7,15,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512-NEXT: vpextrw $0, %xmm0, (%rsi)
+; AVX512-NEXT: retq
%vec = load <16 x i8>, <16 x i8>* %L
%strided.vec = shufflevector <16 x i8> %vec, <16 x i8> undef, <2 x i32> <i32 7, i32 15>
store <2 x i8> %strided.vec, <2 x i8>* %S
; AVX-NEXT: vmovq %xmm0, (%rsi)
; AVX-NEXT: retq
;
-; AVX512F-LABEL: shuffle_v32i8_to_v8i8_1:
-; AVX512F: # %bb.0:
-; AVX512F-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512F-NEXT: vmovdqa 16(%rdi), %xmm1
-; AVX512F-NEXT: vmovdqa {{.*#+}} xmm2 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX512F-NEXT: vpshufb %xmm2, %xmm1, %xmm1
-; AVX512F-NEXT: vpshufb %xmm2, %xmm0, %xmm0
-; AVX512F-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; AVX512F-NEXT: vmovq %xmm0, (%rsi)
-; AVX512F-NEXT: retq
-;
-; AVX512VL-LABEL: shuffle_v32i8_to_v8i8_1:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512VL-NEXT: vmovdqa 16(%rdi), %xmm1
-; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm2 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX512VL-NEXT: vpshufb %xmm2, %xmm1, %xmm1
-; AVX512VL-NEXT: vpshufb %xmm2, %xmm0, %xmm0
-; AVX512VL-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; AVX512VL-NEXT: vmovq %xmm0, (%rsi)
-; AVX512VL-NEXT: retq
-;
-; AVX512BW-LABEL: shuffle_v32i8_to_v8i8_1:
-; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512BW-NEXT: vmovdqa 16(%rdi), %xmm1
-; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX512BW-NEXT: vpshufb %xmm2, %xmm1, %xmm1
-; AVX512BW-NEXT: vpshufb %xmm2, %xmm0, %xmm0
-; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; AVX512BW-NEXT: vmovq %xmm0, (%rsi)
-; AVX512BW-NEXT: retq
-;
-; AVX512BWVL-LABEL: shuffle_v32i8_to_v8i8_1:
-; AVX512BWVL: # %bb.0:
-; AVX512BWVL-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512BWVL-NEXT: vmovdqa 16(%rdi), %xmm1
-; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm2 = [1,1,5,5,9,9,13,13,13,13,5,5,12,12,13,13]
-; AVX512BWVL-NEXT: vpshufb %xmm2, %xmm1, %xmm1
-; AVX512BWVL-NEXT: vpshufb %xmm2, %xmm0, %xmm0
-; AVX512BWVL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; AVX512BWVL-NEXT: vpmovwb %xmm0, (%rsi)
-; AVX512BWVL-NEXT: retq
+; AVX512-LABEL: shuffle_v32i8_to_v8i8_1:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vmovdqa (%rdi), %xmm0
+; AVX512-NEXT: vmovdqa 16(%rdi), %xmm1
+; AVX512-NEXT: vmovdqa {{.*#+}} xmm2 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX512-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX512-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; AVX512-NEXT: vmovq %xmm0, (%rsi)
+; AVX512-NEXT: retq
%vec = load <32 x i8>, <32 x i8>* %L
%strided.vec = shufflevector <32 x i8> %vec, <32 x i8> undef, <8 x i32> <i32 1, i32 5, i32 9, i32 13, i32 17, i32 21, i32 25, i32 29>
store <8 x i8> %strided.vec, <8 x i8>* %S
; AVX-NEXT: vmovq %xmm0, (%rsi)
; AVX-NEXT: retq
;
-; AVX512F-LABEL: shuffle_v32i8_to_v8i8_2:
-; AVX512F: # %bb.0:
-; AVX512F-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512F-NEXT: vmovdqa 16(%rdi), %xmm1
-; AVX512F-NEXT: vmovdqa {{.*#+}} xmm2 = <2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX512F-NEXT: vpshufb %xmm2, %xmm1, %xmm1
-; AVX512F-NEXT: vpshufb %xmm2, %xmm0, %xmm0
-; AVX512F-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; AVX512F-NEXT: vmovq %xmm0, (%rsi)
-; AVX512F-NEXT: retq
-;
-; AVX512VL-LABEL: shuffle_v32i8_to_v8i8_2:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512VL-NEXT: vmovdqa 16(%rdi), %xmm1
-; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm2 = <2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX512VL-NEXT: vpshufb %xmm2, %xmm1, %xmm1
-; AVX512VL-NEXT: vpshufb %xmm2, %xmm0, %xmm0
-; AVX512VL-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; AVX512VL-NEXT: vmovq %xmm0, (%rsi)
-; AVX512VL-NEXT: retq
-;
-; AVX512BW-LABEL: shuffle_v32i8_to_v8i8_2:
-; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512BW-NEXT: vmovdqa 16(%rdi), %xmm1
-; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = <2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX512BW-NEXT: vpshufb %xmm2, %xmm1, %xmm1
-; AVX512BW-NEXT: vpshufb %xmm2, %xmm0, %xmm0
-; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; AVX512BW-NEXT: vmovq %xmm0, (%rsi)
-; AVX512BW-NEXT: retq
-;
-; AVX512BWVL-LABEL: shuffle_v32i8_to_v8i8_2:
-; AVX512BWVL: # %bb.0:
-; AVX512BWVL-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm1 = [1,3,5,7,9,11,13,15]
-; AVX512BWVL-NEXT: vpermi2w 16(%rdi), %xmm0, %xmm1
-; AVX512BWVL-NEXT: vpmovwb %xmm1, (%rsi)
-; AVX512BWVL-NEXT: retq
+; AVX512-LABEL: shuffle_v32i8_to_v8i8_2:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vmovdqa (%rdi), %xmm0
+; AVX512-NEXT: vmovdqa 16(%rdi), %xmm1
+; AVX512-NEXT: vmovdqa {{.*#+}} xmm2 = <2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX512-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX512-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; AVX512-NEXT: vmovq %xmm0, (%rsi)
+; AVX512-NEXT: retq
%vec = load <32 x i8>, <32 x i8>* %L
%strided.vec = shufflevector <32 x i8> %vec, <32 x i8> undef, <8 x i32> <i32 2, i32 6, i32 10, i32 14, i32 18, i32 22, i32 26, i32 30>
store <8 x i8> %strided.vec, <8 x i8>* %S
; AVX-NEXT: vmovq %xmm0, (%rsi)
; AVX-NEXT: retq
;
-; AVX512F-LABEL: shuffle_v32i8_to_v8i8_3:
-; AVX512F: # %bb.0:
-; AVX512F-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512F-NEXT: vmovdqa 16(%rdi), %xmm1
-; AVX512F-NEXT: vmovdqa {{.*#+}} xmm2 = <3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX512F-NEXT: vpshufb %xmm2, %xmm1, %xmm1
-; AVX512F-NEXT: vpshufb %xmm2, %xmm0, %xmm0
-; AVX512F-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; AVX512F-NEXT: vmovq %xmm0, (%rsi)
-; AVX512F-NEXT: retq
-;
-; AVX512VL-LABEL: shuffle_v32i8_to_v8i8_3:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512VL-NEXT: vmovdqa 16(%rdi), %xmm1
-; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm2 = <3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX512VL-NEXT: vpshufb %xmm2, %xmm1, %xmm1
-; AVX512VL-NEXT: vpshufb %xmm2, %xmm0, %xmm0
-; AVX512VL-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; AVX512VL-NEXT: vmovq %xmm0, (%rsi)
-; AVX512VL-NEXT: retq
-;
-; AVX512BW-LABEL: shuffle_v32i8_to_v8i8_3:
-; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512BW-NEXT: vmovdqa 16(%rdi), %xmm1
-; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = <3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX512BW-NEXT: vpshufb %xmm2, %xmm1, %xmm1
-; AVX512BW-NEXT: vpshufb %xmm2, %xmm0, %xmm0
-; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; AVX512BW-NEXT: vmovq %xmm0, (%rsi)
-; AVX512BW-NEXT: retq
-;
-; AVX512BWVL-LABEL: shuffle_v32i8_to_v8i8_3:
-; AVX512BWVL: # %bb.0:
-; AVX512BWVL-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512BWVL-NEXT: vmovdqa 16(%rdi), %xmm1
-; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm2 = [3,3,7,7,11,11,15,15,7,7,15,15,6,6,7,7]
-; AVX512BWVL-NEXT: vpshufb %xmm2, %xmm1, %xmm1
-; AVX512BWVL-NEXT: vpshufb %xmm2, %xmm0, %xmm0
-; AVX512BWVL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; AVX512BWVL-NEXT: vpmovwb %xmm0, (%rsi)
-; AVX512BWVL-NEXT: retq
+; AVX512-LABEL: shuffle_v32i8_to_v8i8_3:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vmovdqa (%rdi), %xmm0
+; AVX512-NEXT: vmovdqa 16(%rdi), %xmm1
+; AVX512-NEXT: vmovdqa {{.*#+}} xmm2 = <3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX512-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX512-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; AVX512-NEXT: vmovq %xmm0, (%rsi)
+; AVX512-NEXT: retq
%vec = load <32 x i8>, <32 x i8>* %L
%strided.vec = shufflevector <32 x i8> %vec, <32 x i8> undef, <8 x i32> <i32 3, i32 7, i32 11, i32 15, i32 19, i32 23, i32 27, i32 31>
store <8 x i8> %strided.vec, <8 x i8>* %S
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vmovdqa (%rdi), %xmm0
; AVX512VL-NEXT: vmovdqa 16(%rdi), %xmm1
-; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm2 = [2,3,2,3,10,11,10,11,8,9,10,11,12,13,14,15]
+; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm2 = [2,3,10,11,8,9,10,11,8,9,10,11,12,13,14,15]
; AVX512VL-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX512VL-NEXT: vpshufb %xmm2, %xmm0, %xmm0
-; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; AVX512VL-NEXT: vpmovdw %xmm0, (%rsi)
+; AVX512VL-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; AVX512VL-NEXT: vmovq %xmm0, (%rsi)
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: shuffle_v16i16_to_v4i16_1:
; AVX512BWVL-LABEL: shuffle_v16i16_to_v4i16_1:
; AVX512BWVL: # %bb.0:
; AVX512BWVL-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm1 = [1,1,5,5,9,9,13,13]
+; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm1 = [1,5,9,13,4,5,12,13]
; AVX512BWVL-NEXT: vpermi2w 16(%rdi), %xmm0, %xmm1
-; AVX512BWVL-NEXT: vpmovdw %xmm1, (%rsi)
+; AVX512BWVL-NEXT: vmovq %xmm1, (%rsi)
; AVX512BWVL-NEXT: retq
%vec = load <16 x i16>, <16 x i16>* %L
%strided.vec = shufflevector <16 x i16> %vec, <16 x i16> undef, <4 x i32> <i32 1, i32 5, i32 9, i32 13>
;
; AVX512VL-LABEL: shuffle_v16i16_to_v4i16_2:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vmovaps (%rdi), %xmm0
-; AVX512VL-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3],mem[1,3]
-; AVX512VL-NEXT: vpmovdw %xmm0, (%rsi)
+; AVX512VL-NEXT: vmovdqa (%rdi), %xmm0
+; AVX512VL-NEXT: vmovdqa 16(%rdi), %xmm1
+; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm2 = [4,5,12,13,4,5,6,7,8,9,10,11,12,13,14,15]
+; AVX512VL-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX512VL-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX512VL-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; AVX512VL-NEXT: vmovq %xmm0, (%rsi)
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: shuffle_v16i16_to_v4i16_2:
;
; AVX512BWVL-LABEL: shuffle_v16i16_to_v4i16_2:
; AVX512BWVL: # %bb.0:
-; AVX512BWVL-NEXT: vmovaps (%rdi), %xmm0
-; AVX512BWVL-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3],mem[1,3]
-; AVX512BWVL-NEXT: vpmovdw %xmm0, (%rsi)
+; AVX512BWVL-NEXT: vmovdqa (%rdi), %xmm0
+; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm1 = [2,6,10,14,2,3,10,11]
+; AVX512BWVL-NEXT: vpermi2w 16(%rdi), %xmm0, %xmm1
+; AVX512BWVL-NEXT: vmovq %xmm1, (%rsi)
; AVX512BWVL-NEXT: retq
%vec = load <16 x i16>, <16 x i16>* %L
%strided.vec = shufflevector <16 x i16> %vec, <16 x i16> undef, <4 x i32> <i32 2, i32 6, i32 10, i32 14>
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vmovdqa (%rdi), %xmm0
; AVX512VL-NEXT: vmovdqa 16(%rdi), %xmm1
-; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm2 = [6,7,14,15,14,15,6,7,8,9,10,11,12,13,14,15]
+; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm2 = [6,7,14,15,4,5,6,7,8,9,10,11,12,13,14,15]
; AVX512VL-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX512VL-NEXT: vpshufb %xmm2, %xmm0, %xmm0
-; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; AVX512VL-NEXT: vpmovdw %xmm0, (%rsi)
+; AVX512VL-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; AVX512VL-NEXT: vmovq %xmm0, (%rsi)
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: shuffle_v16i16_to_v4i16_3:
; AVX512BWVL-LABEL: shuffle_v16i16_to_v4i16_3:
; AVX512BWVL: # %bb.0:
; AVX512BWVL-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm1 = [3,7,7,3,11,15,15,11]
+; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm1 = [3,7,11,15,2,3,10,11]
; AVX512BWVL-NEXT: vpermi2w 16(%rdi), %xmm0, %xmm1
-; AVX512BWVL-NEXT: vpmovdw %xmm1, (%rsi)
+; AVX512BWVL-NEXT: vmovq %xmm1, (%rsi)
; AVX512BWVL-NEXT: retq
%vec = load <16 x i16>, <16 x i16>* %L
%strided.vec = shufflevector <16 x i16> %vec, <16 x i16> undef, <4 x i32> <i32 3, i32 7, i32 11, i32 15>
; AVX-NEXT: vmovd %xmm0, (%rsi)
; AVX-NEXT: retq
;
-; AVX512F-LABEL: shuffle_v32i8_to_v4i8_1:
-; AVX512F: # %bb.0:
-; AVX512F-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512F-NEXT: vmovdqa 16(%rdi), %xmm1
-; AVX512F-NEXT: vmovdqa {{.*#+}} xmm2 = <1,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX512F-NEXT: vpshufb %xmm2, %xmm1, %xmm1
-; AVX512F-NEXT: vpshufb %xmm2, %xmm0, %xmm0
-; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; AVX512F-NEXT: vmovd %xmm0, (%rsi)
-; AVX512F-NEXT: retq
-;
-; AVX512VL-LABEL: shuffle_v32i8_to_v4i8_1:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512VL-NEXT: vmovdqa 16(%rdi), %xmm1
-; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm2 = [1,1,255,255,9,9,255,255,9,9,255,255,11,11,255,255]
-; AVX512VL-NEXT: vpshufb %xmm2, %xmm1, %xmm1
-; AVX512VL-NEXT: vpshufb %xmm2, %xmm0, %xmm0
-; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; AVX512VL-NEXT: vpmovdb %xmm0, (%rsi)
-; AVX512VL-NEXT: retq
-;
-; AVX512BW-LABEL: shuffle_v32i8_to_v4i8_1:
-; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512BW-NEXT: vmovdqa 16(%rdi), %xmm1
-; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = <1,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX512BW-NEXT: vpshufb %xmm2, %xmm1, %xmm1
-; AVX512BW-NEXT: vpshufb %xmm2, %xmm0, %xmm0
-; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; AVX512BW-NEXT: vmovd %xmm0, (%rsi)
-; AVX512BW-NEXT: retq
-;
-; AVX512BWVL-LABEL: shuffle_v32i8_to_v4i8_1:
-; AVX512BWVL: # %bb.0:
-; AVX512BWVL-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512BWVL-NEXT: vmovdqa 16(%rdi), %xmm1
-; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm2 = [1,1,255,255,9,9,255,255,9,9,255,255,11,11,255,255]
-; AVX512BWVL-NEXT: vpshufb %xmm2, %xmm1, %xmm1
-; AVX512BWVL-NEXT: vpshufb %xmm2, %xmm0, %xmm0
-; AVX512BWVL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; AVX512BWVL-NEXT: vpmovdb %xmm0, (%rsi)
-; AVX512BWVL-NEXT: retq
+; AVX512-LABEL: shuffle_v32i8_to_v4i8_1:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vmovdqa (%rdi), %xmm0
+; AVX512-NEXT: vmovdqa 16(%rdi), %xmm1
+; AVX512-NEXT: vmovdqa {{.*#+}} xmm2 = <1,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX512-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX512-NEXT: vmovd %xmm0, (%rsi)
+; AVX512-NEXT: retq
%vec = load <32 x i8>, <32 x i8>* %L
%strided.vec = shufflevector <32 x i8> %vec, <32 x i8> undef, <4 x i32> <i32 1, i32 9, i32 17, i32 25>
store <4 x i8> %strided.vec, <4 x i8>* %S
; AVX-NEXT: vmovd %xmm0, (%rsi)
; AVX-NEXT: retq
;
-; AVX512F-LABEL: shuffle_v32i8_to_v4i8_2:
-; AVX512F: # %bb.0:
-; AVX512F-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512F-NEXT: vmovdqa 16(%rdi), %xmm1
-; AVX512F-NEXT: vmovdqa {{.*#+}} xmm2 = <2,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX512F-NEXT: vpshufb %xmm2, %xmm1, %xmm1
-; AVX512F-NEXT: vpshufb %xmm2, %xmm0, %xmm0
-; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; AVX512F-NEXT: vmovd %xmm0, (%rsi)
-; AVX512F-NEXT: retq
-;
-; AVX512VL-LABEL: shuffle_v32i8_to_v4i8_2:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512VL-NEXT: vmovdqa 16(%rdi), %xmm1
-; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm2 = [2,3,2,3,10,11,10,11,8,9,10,11,12,13,14,15]
-; AVX512VL-NEXT: vpshufb %xmm2, %xmm1, %xmm1
-; AVX512VL-NEXT: vpshufb %xmm2, %xmm0, %xmm0
-; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; AVX512VL-NEXT: vpmovdb %xmm0, (%rsi)
-; AVX512VL-NEXT: retq
-;
-; AVX512BW-LABEL: shuffle_v32i8_to_v4i8_2:
-; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512BW-NEXT: vmovdqa 16(%rdi), %xmm1
-; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = <2,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX512BW-NEXT: vpshufb %xmm2, %xmm1, %xmm1
-; AVX512BW-NEXT: vpshufb %xmm2, %xmm0, %xmm0
-; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; AVX512BW-NEXT: vmovd %xmm0, (%rsi)
-; AVX512BW-NEXT: retq
-;
-; AVX512BWVL-LABEL: shuffle_v32i8_to_v4i8_2:
-; AVX512BWVL: # %bb.0:
-; AVX512BWVL-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm1 = [1,1,5,5,9,9,13,13]
-; AVX512BWVL-NEXT: vpermi2w 16(%rdi), %xmm0, %xmm1
-; AVX512BWVL-NEXT: vpmovdb %xmm1, (%rsi)
-; AVX512BWVL-NEXT: retq
+; AVX512-LABEL: shuffle_v32i8_to_v4i8_2:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vmovdqa (%rdi), %xmm0
+; AVX512-NEXT: vmovdqa 16(%rdi), %xmm1
+; AVX512-NEXT: vmovdqa {{.*#+}} xmm2 = <2,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX512-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX512-NEXT: vmovd %xmm0, (%rsi)
+; AVX512-NEXT: retq
%vec = load <32 x i8>, <32 x i8>* %L
%strided.vec = shufflevector <32 x i8> %vec, <32 x i8> undef, <4 x i32> <i32 2, i32 10, i32 18, i32 26>
store <4 x i8> %strided.vec, <4 x i8>* %S
; AVX-NEXT: vmovd %xmm0, (%rsi)
; AVX-NEXT: retq
;
-; AVX512F-LABEL: shuffle_v32i8_to_v4i8_3:
-; AVX512F: # %bb.0:
-; AVX512F-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512F-NEXT: vmovdqa 16(%rdi), %xmm1
-; AVX512F-NEXT: vmovdqa {{.*#+}} xmm2 = <3,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX512F-NEXT: vpshufb %xmm2, %xmm1, %xmm1
-; AVX512F-NEXT: vpshufb %xmm2, %xmm0, %xmm0
-; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; AVX512F-NEXT: vmovd %xmm0, (%rsi)
-; AVX512F-NEXT: retq
-;
-; AVX512VL-LABEL: shuffle_v32i8_to_v4i8_3:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512VL-NEXT: vmovdqa 16(%rdi), %xmm1
-; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm2 = [3,3,11,11,11,11,3,3,8,8,9,9,10,10,11,11]
-; AVX512VL-NEXT: vpshufb %xmm2, %xmm1, %xmm1
-; AVX512VL-NEXT: vpshufb %xmm2, %xmm0, %xmm0
-; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; AVX512VL-NEXT: vpmovdb %xmm0, (%rsi)
-; AVX512VL-NEXT: retq
-;
-; AVX512BW-LABEL: shuffle_v32i8_to_v4i8_3:
-; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512BW-NEXT: vmovdqa 16(%rdi), %xmm1
-; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = <3,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX512BW-NEXT: vpshufb %xmm2, %xmm1, %xmm1
-; AVX512BW-NEXT: vpshufb %xmm2, %xmm0, %xmm0
-; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; AVX512BW-NEXT: vmovd %xmm0, (%rsi)
-; AVX512BW-NEXT: retq
-;
-; AVX512BWVL-LABEL: shuffle_v32i8_to_v4i8_3:
-; AVX512BWVL: # %bb.0:
-; AVX512BWVL-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512BWVL-NEXT: vmovdqa 16(%rdi), %xmm1
-; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm2 = [3,3,11,11,11,11,3,3,8,8,9,9,10,10,11,11]
-; AVX512BWVL-NEXT: vpshufb %xmm2, %xmm1, %xmm1
-; AVX512BWVL-NEXT: vpshufb %xmm2, %xmm0, %xmm0
-; AVX512BWVL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; AVX512BWVL-NEXT: vpmovdb %xmm0, (%rsi)
-; AVX512BWVL-NEXT: retq
+; AVX512-LABEL: shuffle_v32i8_to_v4i8_3:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vmovdqa (%rdi), %xmm0
+; AVX512-NEXT: vmovdqa 16(%rdi), %xmm1
+; AVX512-NEXT: vmovdqa {{.*#+}} xmm2 = <3,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX512-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX512-NEXT: vmovd %xmm0, (%rsi)
+; AVX512-NEXT: retq
%vec = load <32 x i8>, <32 x i8>* %L
%strided.vec = shufflevector <32 x i8> %vec, <32 x i8> undef, <4 x i32> <i32 3, i32 11, i32 19, i32 27>
store <4 x i8> %strided.vec, <4 x i8>* %S
; AVX-NEXT: vmovd %xmm0, (%rsi)
; AVX-NEXT: retq
;
-; AVX512F-LABEL: shuffle_v32i8_to_v4i8_4:
-; AVX512F: # %bb.0:
-; AVX512F-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512F-NEXT: vmovdqa 16(%rdi), %xmm1
-; AVX512F-NEXT: vmovdqa {{.*#+}} xmm2 = <4,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX512F-NEXT: vpshufb %xmm2, %xmm1, %xmm1
-; AVX512F-NEXT: vpshufb %xmm2, %xmm0, %xmm0
-; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; AVX512F-NEXT: vmovd %xmm0, (%rsi)
-; AVX512F-NEXT: retq
-;
-; AVX512VL-LABEL: shuffle_v32i8_to_v4i8_4:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vmovaps (%rdi), %xmm0
-; AVX512VL-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3],mem[1,3]
-; AVX512VL-NEXT: vpmovdb %xmm0, (%rsi)
-; AVX512VL-NEXT: retq
-;
-; AVX512BW-LABEL: shuffle_v32i8_to_v4i8_4:
-; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512BW-NEXT: vmovdqa 16(%rdi), %xmm1
-; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = <4,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX512BW-NEXT: vpshufb %xmm2, %xmm1, %xmm1
-; AVX512BW-NEXT: vpshufb %xmm2, %xmm0, %xmm0
-; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; AVX512BW-NEXT: vmovd %xmm0, (%rsi)
-; AVX512BW-NEXT: retq
-;
-; AVX512BWVL-LABEL: shuffle_v32i8_to_v4i8_4:
-; AVX512BWVL: # %bb.0:
-; AVX512BWVL-NEXT: vmovaps (%rdi), %xmm0
-; AVX512BWVL-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3],mem[1,3]
-; AVX512BWVL-NEXT: vpmovdb %xmm0, (%rsi)
-; AVX512BWVL-NEXT: retq
+; AVX512-LABEL: shuffle_v32i8_to_v4i8_4:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vmovdqa (%rdi), %xmm0
+; AVX512-NEXT: vmovdqa 16(%rdi), %xmm1
+; AVX512-NEXT: vmovdqa {{.*#+}} xmm2 = <4,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX512-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX512-NEXT: vmovd %xmm0, (%rsi)
+; AVX512-NEXT: retq
%vec = load <32 x i8>, <32 x i8>* %L
%strided.vec = shufflevector <32 x i8> %vec, <32 x i8> undef, <4 x i32> <i32 4, i32 12, i32 20, i32 28>
store <4 x i8> %strided.vec, <4 x i8>* %S
; AVX-NEXT: vmovd %xmm0, (%rsi)
; AVX-NEXT: retq
;
-; AVX512F-LABEL: shuffle_v32i8_to_v4i8_5:
-; AVX512F: # %bb.0:
-; AVX512F-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512F-NEXT: vmovdqa 16(%rdi), %xmm1
-; AVX512F-NEXT: vmovdqa {{.*#+}} xmm2 = <5,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX512F-NEXT: vpshufb %xmm2, %xmm1, %xmm1
-; AVX512F-NEXT: vpshufb %xmm2, %xmm0, %xmm0
-; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; AVX512F-NEXT: vmovd %xmm0, (%rsi)
-; AVX512F-NEXT: retq
-;
-; AVX512VL-LABEL: shuffle_v32i8_to_v4i8_5:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512VL-NEXT: vmovdqa 16(%rdi), %xmm1
-; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm2 = [5,5,13,13,13,13,5,5,4,4,5,5,6,6,7,7]
-; AVX512VL-NEXT: vpshufb %xmm2, %xmm1, %xmm1
-; AVX512VL-NEXT: vpshufb %xmm2, %xmm0, %xmm0
-; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; AVX512VL-NEXT: vpmovdb %xmm0, (%rsi)
-; AVX512VL-NEXT: retq
-;
-; AVX512BW-LABEL: shuffle_v32i8_to_v4i8_5:
-; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512BW-NEXT: vmovdqa 16(%rdi), %xmm1
-; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = <5,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX512BW-NEXT: vpshufb %xmm2, %xmm1, %xmm1
-; AVX512BW-NEXT: vpshufb %xmm2, %xmm0, %xmm0
-; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; AVX512BW-NEXT: vmovd %xmm0, (%rsi)
-; AVX512BW-NEXT: retq
-;
-; AVX512BWVL-LABEL: shuffle_v32i8_to_v4i8_5:
-; AVX512BWVL: # %bb.0:
-; AVX512BWVL-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512BWVL-NEXT: vmovdqa 16(%rdi), %xmm1
-; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm2 = [5,5,13,13,13,13,5,5,4,4,5,5,6,6,7,7]
-; AVX512BWVL-NEXT: vpshufb %xmm2, %xmm1, %xmm1
-; AVX512BWVL-NEXT: vpshufb %xmm2, %xmm0, %xmm0
-; AVX512BWVL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; AVX512BWVL-NEXT: vpmovdb %xmm0, (%rsi)
-; AVX512BWVL-NEXT: retq
+; AVX512-LABEL: shuffle_v32i8_to_v4i8_5:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vmovdqa (%rdi), %xmm0
+; AVX512-NEXT: vmovdqa 16(%rdi), %xmm1
+; AVX512-NEXT: vmovdqa {{.*#+}} xmm2 = <5,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX512-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX512-NEXT: vmovd %xmm0, (%rsi)
+; AVX512-NEXT: retq
%vec = load <32 x i8>, <32 x i8>* %L
%strided.vec = shufflevector <32 x i8> %vec, <32 x i8> undef, <4 x i32> <i32 5, i32 13, i32 21, i32 29>
store <4 x i8> %strided.vec, <4 x i8>* %S
; AVX-NEXT: vmovd %xmm0, (%rsi)
; AVX-NEXT: retq
;
-; AVX512F-LABEL: shuffle_v32i8_to_v4i8_6:
-; AVX512F: # %bb.0:
-; AVX512F-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512F-NEXT: vmovdqa 16(%rdi), %xmm1
-; AVX512F-NEXT: vmovdqa {{.*#+}} xmm2 = <6,14,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX512F-NEXT: vpshufb %xmm2, %xmm1, %xmm1
-; AVX512F-NEXT: vpshufb %xmm2, %xmm0, %xmm0
-; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; AVX512F-NEXT: vmovd %xmm0, (%rsi)
-; AVX512F-NEXT: retq
-;
-; AVX512VL-LABEL: shuffle_v32i8_to_v4i8_6:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512VL-NEXT: vmovdqa 16(%rdi), %xmm1
-; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm2 = [6,7,14,15,14,15,6,7,8,9,10,11,12,13,14,15]
-; AVX512VL-NEXT: vpshufb %xmm2, %xmm1, %xmm1
-; AVX512VL-NEXT: vpshufb %xmm2, %xmm0, %xmm0
-; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; AVX512VL-NEXT: vpmovdb %xmm0, (%rsi)
-; AVX512VL-NEXT: retq
-;
-; AVX512BW-LABEL: shuffle_v32i8_to_v4i8_6:
-; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512BW-NEXT: vmovdqa 16(%rdi), %xmm1
-; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = <6,14,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX512BW-NEXT: vpshufb %xmm2, %xmm1, %xmm1
-; AVX512BW-NEXT: vpshufb %xmm2, %xmm0, %xmm0
-; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; AVX512BW-NEXT: vmovd %xmm0, (%rsi)
-; AVX512BW-NEXT: retq
-;
-; AVX512BWVL-LABEL: shuffle_v32i8_to_v4i8_6:
-; AVX512BWVL: # %bb.0:
-; AVX512BWVL-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm1 = [3,7,7,3,11,15,15,11]
-; AVX512BWVL-NEXT: vpermi2w 16(%rdi), %xmm0, %xmm1
-; AVX512BWVL-NEXT: vpmovdb %xmm1, (%rsi)
-; AVX512BWVL-NEXT: retq
+; AVX512-LABEL: shuffle_v32i8_to_v4i8_6:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vmovdqa (%rdi), %xmm0
+; AVX512-NEXT: vmovdqa 16(%rdi), %xmm1
+; AVX512-NEXT: vmovdqa {{.*#+}} xmm2 = <6,14,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX512-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX512-NEXT: vmovd %xmm0, (%rsi)
+; AVX512-NEXT: retq
%vec = load <32 x i8>, <32 x i8>* %L
%strided.vec = shufflevector <32 x i8> %vec, <32 x i8> undef, <4 x i32> <i32 6, i32 14, i32 22, i32 30>
store <4 x i8> %strided.vec, <4 x i8>* %S
; AVX-NEXT: vmovd %xmm0, (%rsi)
; AVX-NEXT: retq
;
-; AVX512F-LABEL: shuffle_v32i8_to_v4i8_7:
-; AVX512F: # %bb.0:
-; AVX512F-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512F-NEXT: vmovdqa 16(%rdi), %xmm1
-; AVX512F-NEXT: vmovdqa {{.*#+}} xmm2 = <7,15,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX512F-NEXT: vpshufb %xmm2, %xmm1, %xmm1
-; AVX512F-NEXT: vpshufb %xmm2, %xmm0, %xmm0
-; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; AVX512F-NEXT: vmovd %xmm0, (%rsi)
-; AVX512F-NEXT: retq
-;
-; AVX512VL-LABEL: shuffle_v32i8_to_v4i8_7:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512VL-NEXT: vmovdqa 16(%rdi), %xmm1
-; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm2 = [7,7,14,14,15,15,14,14,15,15,4,4,5,5,6,6]
-; AVX512VL-NEXT: vpshufb %xmm2, %xmm1, %xmm1
-; AVX512VL-NEXT: vpshufb %xmm2, %xmm0, %xmm0
-; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; AVX512VL-NEXT: vpmovdb %xmm0, (%rsi)
-; AVX512VL-NEXT: retq
-;
-; AVX512BW-LABEL: shuffle_v32i8_to_v4i8_7:
-; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512BW-NEXT: vmovdqa 16(%rdi), %xmm1
-; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = <7,15,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX512BW-NEXT: vpshufb %xmm2, %xmm1, %xmm1
-; AVX512BW-NEXT: vpshufb %xmm2, %xmm0, %xmm0
-; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; AVX512BW-NEXT: vmovd %xmm0, (%rsi)
-; AVX512BW-NEXT: retq
-;
-; AVX512BWVL-LABEL: shuffle_v32i8_to_v4i8_7:
-; AVX512BWVL: # %bb.0:
-; AVX512BWVL-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512BWVL-NEXT: vmovdqa 16(%rdi), %xmm1
-; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm2 = [7,7,14,14,15,15,14,14,15,15,4,4,5,5,6,6]
-; AVX512BWVL-NEXT: vpshufb %xmm2, %xmm1, %xmm1
-; AVX512BWVL-NEXT: vpshufb %xmm2, %xmm0, %xmm0
-; AVX512BWVL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; AVX512BWVL-NEXT: vpmovdb %xmm0, (%rsi)
-; AVX512BWVL-NEXT: retq
+; AVX512-LABEL: shuffle_v32i8_to_v4i8_7:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vmovdqa (%rdi), %xmm0
+; AVX512-NEXT: vmovdqa 16(%rdi), %xmm1
+; AVX512-NEXT: vmovdqa {{.*#+}} xmm2 = <7,15,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX512-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX512-NEXT: vmovd %xmm0, (%rsi)
+; AVX512-NEXT: retq
%vec = load <32 x i8>, <32 x i8>* %L
%strided.vec = shufflevector <32 x i8> %vec, <32 x i8> undef, <4 x i32> <i32 7, i32 15, i32 23, i32 31>
store <4 x i8> %strided.vec, <4 x i8>* %S
}
define void @shuffle_v64i8_to_v8i8_1(<64 x i8>* %L, <8 x i8>* %S) nounwind {
-; AVX512F-LABEL: shuffle_v64i8_to_v8i8_1:
-; AVX512F: # %bb.0:
-; AVX512F-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512F-NEXT: vmovdqa 16(%rdi), %xmm1
-; AVX512F-NEXT: vmovdqa 32(%rdi), %xmm2
-; AVX512F-NEXT: vmovdqa 48(%rdi), %xmm3
-; AVX512F-NEXT: vmovdqa {{.*#+}} xmm4 = <u,u,1,9,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX512F-NEXT: vpshufb %xmm4, %xmm3, %xmm3
-; AVX512F-NEXT: vpshufb %xmm4, %xmm2, %xmm2
-; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
-; AVX512F-NEXT: vmovdqa {{.*#+}} xmm3 = <1,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX512F-NEXT: vpshufb %xmm3, %xmm1, %xmm1
-; AVX512F-NEXT: vpshufb %xmm3, %xmm0, %xmm0
-; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; AVX512F-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3]
-; AVX512F-NEXT: vmovq %xmm0, (%rsi)
-; AVX512F-NEXT: retq
-;
-; AVX512VL-LABEL: shuffle_v64i8_to_v8i8_1:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512VL-NEXT: vmovdqa 16(%rdi), %xmm1
-; AVX512VL-NEXT: vmovdqa 32(%rdi), %xmm2
-; AVX512VL-NEXT: vmovdqa 48(%rdi), %xmm3
-; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm4 = <u,u,1,9,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX512VL-NEXT: vpshufb %xmm4, %xmm3, %xmm3
-; AVX512VL-NEXT: vpshufb %xmm4, %xmm2, %xmm2
-; AVX512VL-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
-; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm3 = <1,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX512VL-NEXT: vpshufb %xmm3, %xmm1, %xmm1
-; AVX512VL-NEXT: vpshufb %xmm3, %xmm0, %xmm0
-; AVX512VL-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; AVX512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3]
-; AVX512VL-NEXT: vmovq %xmm0, (%rsi)
-; AVX512VL-NEXT: retq
-;
-; AVX512BW-LABEL: shuffle_v64i8_to_v8i8_1:
-; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512BW-NEXT: vmovdqa 16(%rdi), %xmm1
-; AVX512BW-NEXT: vmovdqa 32(%rdi), %xmm2
-; AVX512BW-NEXT: vmovdqa 48(%rdi), %xmm3
-; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm4 = <u,u,1,9,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX512BW-NEXT: vpshufb %xmm4, %xmm3, %xmm3
-; AVX512BW-NEXT: vpshufb %xmm4, %xmm2, %xmm2
-; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
-; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = <1,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX512BW-NEXT: vpshufb %xmm3, %xmm1, %xmm1
-; AVX512BW-NEXT: vpshufb %xmm3, %xmm0, %xmm0
-; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; AVX512BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3]
-; AVX512BW-NEXT: vmovq %xmm0, (%rsi)
-; AVX512BW-NEXT: retq
-;
-; AVX512BWVL-LABEL: shuffle_v64i8_to_v8i8_1:
-; AVX512BWVL: # %bb.0:
-; AVX512BWVL-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512BWVL-NEXT: vmovdqa 16(%rdi), %xmm1
-; AVX512BWVL-NEXT: vmovdqa 32(%rdi), %xmm2
-; AVX512BWVL-NEXT: vmovdqa 48(%rdi), %xmm3
-; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm4 = [0,0,1,1,1,1,9,9,8,8,9,9,10,10,11,11]
-; AVX512BWVL-NEXT: vpshufb %xmm4, %xmm3, %xmm3
-; AVX512BWVL-NEXT: vpshufb %xmm4, %xmm2, %xmm2
-; AVX512BWVL-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
-; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm3 = [1,1,9,9,8,8,9,9,8,8,9,9,10,10,11,11]
-; AVX512BWVL-NEXT: vpshufb %xmm3, %xmm1, %xmm1
-; AVX512BWVL-NEXT: vpshufb %xmm3, %xmm0, %xmm0
-; AVX512BWVL-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; AVX512BWVL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3]
-; AVX512BWVL-NEXT: vpmovwb %xmm0, (%rsi)
-; AVX512BWVL-NEXT: retq
+; AVX512-LABEL: shuffle_v64i8_to_v8i8_1:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vmovdqa (%rdi), %xmm0
+; AVX512-NEXT: vmovdqa 16(%rdi), %xmm1
+; AVX512-NEXT: vmovdqa 32(%rdi), %xmm2
+; AVX512-NEXT: vmovdqa 48(%rdi), %xmm3
+; AVX512-NEXT: vmovdqa {{.*#+}} xmm4 = <u,u,1,9,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512-NEXT: vpshufb %xmm4, %xmm3, %xmm3
+; AVX512-NEXT: vpshufb %xmm4, %xmm2, %xmm2
+; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
+; AVX512-NEXT: vmovdqa {{.*#+}} xmm3 = <1,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512-NEXT: vpshufb %xmm3, %xmm1, %xmm1
+; AVX512-NEXT: vpshufb %xmm3, %xmm0, %xmm0
+; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX512-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3]
+; AVX512-NEXT: vmovq %xmm0, (%rsi)
+; AVX512-NEXT: retq
%vec = load <64 x i8>, <64 x i8>* %L
%strided.vec = shufflevector <64 x i8> %vec, <64 x i8> undef, <8 x i32> <i32 1, i32 9, i32 17, i32 25, i32 33, i32 41, i32 49, i32 57>
store <8 x i8> %strided.vec, <8 x i8>* %S
}
define void @shuffle_v64i8_to_v8i8_2(<64 x i8>* %L, <8 x i8>* %S) nounwind {
-; AVX512F-LABEL: shuffle_v64i8_to_v8i8_2:
-; AVX512F: # %bb.0:
-; AVX512F-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512F-NEXT: vmovdqa 16(%rdi), %xmm1
-; AVX512F-NEXT: vmovdqa 32(%rdi), %xmm2
-; AVX512F-NEXT: vmovdqa 48(%rdi), %xmm3
-; AVX512F-NEXT: vmovdqa {{.*#+}} xmm4 = <u,u,2,10,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX512F-NEXT: vpshufb %xmm4, %xmm3, %xmm3
-; AVX512F-NEXT: vpshufb %xmm4, %xmm2, %xmm2
-; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
-; AVX512F-NEXT: vmovdqa {{.*#+}} xmm3 = <2,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX512F-NEXT: vpshufb %xmm3, %xmm1, %xmm1
-; AVX512F-NEXT: vpshufb %xmm3, %xmm0, %xmm0
-; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; AVX512F-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3]
-; AVX512F-NEXT: vmovq %xmm0, (%rsi)
-; AVX512F-NEXT: retq
-;
-; AVX512VL-LABEL: shuffle_v64i8_to_v8i8_2:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512VL-NEXT: vmovdqa 16(%rdi), %xmm1
-; AVX512VL-NEXT: vmovdqa 32(%rdi), %xmm2
-; AVX512VL-NEXT: vmovdqa 48(%rdi), %xmm3
-; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm4 = <u,u,2,10,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX512VL-NEXT: vpshufb %xmm4, %xmm3, %xmm3
-; AVX512VL-NEXT: vpshufb %xmm4, %xmm2, %xmm2
-; AVX512VL-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
-; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm3 = <2,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX512VL-NEXT: vpshufb %xmm3, %xmm1, %xmm1
-; AVX512VL-NEXT: vpshufb %xmm3, %xmm0, %xmm0
-; AVX512VL-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; AVX512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3]
-; AVX512VL-NEXT: vmovq %xmm0, (%rsi)
-; AVX512VL-NEXT: retq
-;
-; AVX512BW-LABEL: shuffle_v64i8_to_v8i8_2:
-; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512BW-NEXT: vmovdqa 16(%rdi), %xmm1
-; AVX512BW-NEXT: vmovdqa 32(%rdi), %xmm2
-; AVX512BW-NEXT: vmovdqa 48(%rdi), %xmm3
-; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm4 = <u,u,2,10,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX512BW-NEXT: vpshufb %xmm4, %xmm3, %xmm3
-; AVX512BW-NEXT: vpshufb %xmm4, %xmm2, %xmm2
-; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
-; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = <2,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX512BW-NEXT: vpshufb %xmm3, %xmm1, %xmm1
-; AVX512BW-NEXT: vpshufb %xmm3, %xmm0, %xmm0
-; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; AVX512BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3]
-; AVX512BW-NEXT: vmovq %xmm0, (%rsi)
-; AVX512BW-NEXT: retq
-;
-; AVX512BWVL-LABEL: shuffle_v64i8_to_v8i8_2:
-; AVX512BWVL: # %bb.0:
-; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm0 = [1,5,9,13,17,21,25,29]
-; AVX512BWVL-NEXT: vmovdqa (%rdi), %ymm1
-; AVX512BWVL-NEXT: vpermt2w 32(%rdi), %ymm0, %ymm1
-; AVX512BWVL-NEXT: vpmovwb %xmm1, (%rsi)
-; AVX512BWVL-NEXT: vzeroupper
-; AVX512BWVL-NEXT: retq
+; AVX512-LABEL: shuffle_v64i8_to_v8i8_2:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vmovdqa (%rdi), %xmm0
+; AVX512-NEXT: vmovdqa 16(%rdi), %xmm1
+; AVX512-NEXT: vmovdqa 32(%rdi), %xmm2
+; AVX512-NEXT: vmovdqa 48(%rdi), %xmm3
+; AVX512-NEXT: vmovdqa {{.*#+}} xmm4 = <u,u,2,10,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512-NEXT: vpshufb %xmm4, %xmm3, %xmm3
+; AVX512-NEXT: vpshufb %xmm4, %xmm2, %xmm2
+; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
+; AVX512-NEXT: vmovdqa {{.*#+}} xmm3 = <2,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512-NEXT: vpshufb %xmm3, %xmm1, %xmm1
+; AVX512-NEXT: vpshufb %xmm3, %xmm0, %xmm0
+; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX512-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3]
+; AVX512-NEXT: vmovq %xmm0, (%rsi)
+; AVX512-NEXT: retq
%vec = load <64 x i8>, <64 x i8>* %L
%strided.vec = shufflevector <64 x i8> %vec, <64 x i8> undef, <8 x i32> <i32 2, i32 10, i32 18, i32 26, i32 34, i32 42, i32 50, i32 58>
store <8 x i8> %strided.vec, <8 x i8>* %S
}
define void @shuffle_v64i8_to_v8i8_3(<64 x i8>* %L, <8 x i8>* %S) nounwind {
-; AVX512F-LABEL: shuffle_v64i8_to_v8i8_3:
-; AVX512F: # %bb.0:
-; AVX512F-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512F-NEXT: vmovdqa 16(%rdi), %xmm1
-; AVX512F-NEXT: vmovdqa 32(%rdi), %xmm2
-; AVX512F-NEXT: vmovdqa 48(%rdi), %xmm3
-; AVX512F-NEXT: vmovdqa {{.*#+}} xmm4 = <u,u,3,11,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX512F-NEXT: vpshufb %xmm4, %xmm3, %xmm3
-; AVX512F-NEXT: vpshufb %xmm4, %xmm2, %xmm2
-; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
-; AVX512F-NEXT: vmovdqa {{.*#+}} xmm3 = <3,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX512F-NEXT: vpshufb %xmm3, %xmm1, %xmm1
-; AVX512F-NEXT: vpshufb %xmm3, %xmm0, %xmm0
-; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; AVX512F-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3]
-; AVX512F-NEXT: vmovq %xmm0, (%rsi)
-; AVX512F-NEXT: retq
-;
-; AVX512VL-LABEL: shuffle_v64i8_to_v8i8_3:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512VL-NEXT: vmovdqa 16(%rdi), %xmm1
-; AVX512VL-NEXT: vmovdqa 32(%rdi), %xmm2
-; AVX512VL-NEXT: vmovdqa 48(%rdi), %xmm3
-; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm4 = <u,u,3,11,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX512VL-NEXT: vpshufb %xmm4, %xmm3, %xmm3
-; AVX512VL-NEXT: vpshufb %xmm4, %xmm2, %xmm2
-; AVX512VL-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
-; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm3 = <3,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX512VL-NEXT: vpshufb %xmm3, %xmm1, %xmm1
-; AVX512VL-NEXT: vpshufb %xmm3, %xmm0, %xmm0
-; AVX512VL-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; AVX512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3]
-; AVX512VL-NEXT: vmovq %xmm0, (%rsi)
-; AVX512VL-NEXT: retq
-;
-; AVX512BW-LABEL: shuffle_v64i8_to_v8i8_3:
-; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512BW-NEXT: vmovdqa 16(%rdi), %xmm1
-; AVX512BW-NEXT: vmovdqa 32(%rdi), %xmm2
-; AVX512BW-NEXT: vmovdqa 48(%rdi), %xmm3
-; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm4 = <u,u,3,11,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX512BW-NEXT: vpshufb %xmm4, %xmm3, %xmm3
-; AVX512BW-NEXT: vpshufb %xmm4, %xmm2, %xmm2
-; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
-; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = <3,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX512BW-NEXT: vpshufb %xmm3, %xmm1, %xmm1
-; AVX512BW-NEXT: vpshufb %xmm3, %xmm0, %xmm0
-; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; AVX512BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3]
-; AVX512BW-NEXT: vmovq %xmm0, (%rsi)
-; AVX512BW-NEXT: retq
-;
-; AVX512BWVL-LABEL: shuffle_v64i8_to_v8i8_3:
-; AVX512BWVL: # %bb.0:
-; AVX512BWVL-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512BWVL-NEXT: vmovdqa 16(%rdi), %xmm1
-; AVX512BWVL-NEXT: vmovdqa 32(%rdi), %xmm2
-; AVX512BWVL-NEXT: vmovdqa 48(%rdi), %xmm3
-; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm4 = [10,10,11,11,3,3,11,11,8,8,9,9,10,10,11,11]
-; AVX512BWVL-NEXT: vpshufb %xmm4, %xmm3, %xmm3
-; AVX512BWVL-NEXT: vpshufb %xmm4, %xmm2, %xmm2
-; AVX512BWVL-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
-; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm3 = [3,3,11,11,2,2,3,3,8,8,9,9,10,10,11,11]
-; AVX512BWVL-NEXT: vpshufb %xmm3, %xmm1, %xmm1
-; AVX512BWVL-NEXT: vpshufb %xmm3, %xmm0, %xmm0
-; AVX512BWVL-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; AVX512BWVL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3]
-; AVX512BWVL-NEXT: vpmovwb %xmm0, (%rsi)
-; AVX512BWVL-NEXT: retq
+; AVX512-LABEL: shuffle_v64i8_to_v8i8_3:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vmovdqa (%rdi), %xmm0
+; AVX512-NEXT: vmovdqa 16(%rdi), %xmm1
+; AVX512-NEXT: vmovdqa 32(%rdi), %xmm2
+; AVX512-NEXT: vmovdqa 48(%rdi), %xmm3
+; AVX512-NEXT: vmovdqa {{.*#+}} xmm4 = <u,u,3,11,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512-NEXT: vpshufb %xmm4, %xmm3, %xmm3
+; AVX512-NEXT: vpshufb %xmm4, %xmm2, %xmm2
+; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
+; AVX512-NEXT: vmovdqa {{.*#+}} xmm3 = <3,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512-NEXT: vpshufb %xmm3, %xmm1, %xmm1
+; AVX512-NEXT: vpshufb %xmm3, %xmm0, %xmm0
+; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX512-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3]
+; AVX512-NEXT: vmovq %xmm0, (%rsi)
+; AVX512-NEXT: retq
%vec = load <64 x i8>, <64 x i8>* %L
%strided.vec = shufflevector <64 x i8> %vec, <64 x i8> undef, <8 x i32> <i32 3, i32 11, i32 19, i32 27, i32 35, i32 43, i32 51, i32 59>
store <8 x i8> %strided.vec, <8 x i8>* %S
}
define void @shuffle_v64i8_to_v8i8_4(<64 x i8>* %L, <8 x i8>* %S) nounwind {
-; AVX512F-LABEL: shuffle_v64i8_to_v8i8_4:
-; AVX512F: # %bb.0:
-; AVX512F-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512F-NEXT: vmovdqa 16(%rdi), %xmm1
-; AVX512F-NEXT: vmovdqa 32(%rdi), %xmm2
-; AVX512F-NEXT: vmovdqa 48(%rdi), %xmm3
-; AVX512F-NEXT: vmovdqa {{.*#+}} xmm4 = <u,u,4,12,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX512F-NEXT: vpshufb %xmm4, %xmm3, %xmm3
-; AVX512F-NEXT: vpshufb %xmm4, %xmm2, %xmm2
-; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
-; AVX512F-NEXT: vmovdqa {{.*#+}} xmm3 = <4,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX512F-NEXT: vpshufb %xmm3, %xmm1, %xmm1
-; AVX512F-NEXT: vpshufb %xmm3, %xmm0, %xmm0
-; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; AVX512F-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3]
-; AVX512F-NEXT: vmovq %xmm0, (%rsi)
-; AVX512F-NEXT: retq
-;
-; AVX512VL-LABEL: shuffle_v64i8_to_v8i8_4:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512VL-NEXT: vmovdqa 16(%rdi), %xmm1
-; AVX512VL-NEXT: vmovdqa 32(%rdi), %xmm2
-; AVX512VL-NEXT: vmovdqa 48(%rdi), %xmm3
-; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm4 = <u,u,4,12,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX512VL-NEXT: vpshufb %xmm4, %xmm3, %xmm3
-; AVX512VL-NEXT: vpshufb %xmm4, %xmm2, %xmm2
-; AVX512VL-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
-; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm3 = <4,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX512VL-NEXT: vpshufb %xmm3, %xmm1, %xmm1
-; AVX512VL-NEXT: vpshufb %xmm3, %xmm0, %xmm0
-; AVX512VL-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; AVX512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3]
-; AVX512VL-NEXT: vmovq %xmm0, (%rsi)
-; AVX512VL-NEXT: retq
-;
-; AVX512BW-LABEL: shuffle_v64i8_to_v8i8_4:
-; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512BW-NEXT: vmovdqa 16(%rdi), %xmm1
-; AVX512BW-NEXT: vmovdqa 32(%rdi), %xmm2
-; AVX512BW-NEXT: vmovdqa 48(%rdi), %xmm3
-; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm4 = <u,u,4,12,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX512BW-NEXT: vpshufb %xmm4, %xmm3, %xmm3
-; AVX512BW-NEXT: vpshufb %xmm4, %xmm2, %xmm2
-; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
-; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = <4,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX512BW-NEXT: vpshufb %xmm3, %xmm1, %xmm1
-; AVX512BW-NEXT: vpshufb %xmm3, %xmm0, %xmm0
-; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; AVX512BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3]
-; AVX512BW-NEXT: vmovq %xmm0, (%rsi)
-; AVX512BW-NEXT: retq
-;
-; AVX512BWVL-LABEL: shuffle_v64i8_to_v8i8_4:
-; AVX512BWVL: # %bb.0:
-; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm0 = [2,6,10,14,18,22,26,30]
-; AVX512BWVL-NEXT: vmovdqa (%rdi), %ymm1
-; AVX512BWVL-NEXT: vpermt2w 32(%rdi), %ymm0, %ymm1
-; AVX512BWVL-NEXT: vpmovwb %xmm1, (%rsi)
-; AVX512BWVL-NEXT: vzeroupper
-; AVX512BWVL-NEXT: retq
+; AVX512-LABEL: shuffle_v64i8_to_v8i8_4:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vmovdqa (%rdi), %xmm0
+; AVX512-NEXT: vmovdqa 16(%rdi), %xmm1
+; AVX512-NEXT: vmovdqa 32(%rdi), %xmm2
+; AVX512-NEXT: vmovdqa 48(%rdi), %xmm3
+; AVX512-NEXT: vmovdqa {{.*#+}} xmm4 = <u,u,4,12,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512-NEXT: vpshufb %xmm4, %xmm3, %xmm3
+; AVX512-NEXT: vpshufb %xmm4, %xmm2, %xmm2
+; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
+; AVX512-NEXT: vmovdqa {{.*#+}} xmm3 = <4,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512-NEXT: vpshufb %xmm3, %xmm1, %xmm1
+; AVX512-NEXT: vpshufb %xmm3, %xmm0, %xmm0
+; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX512-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3]
+; AVX512-NEXT: vmovq %xmm0, (%rsi)
+; AVX512-NEXT: retq
%vec = load <64 x i8>, <64 x i8>* %L
%strided.vec = shufflevector <64 x i8> %vec, <64 x i8> undef, <8 x i32> <i32 4, i32 12, i32 20, i32 28, i32 36, i32 44, i32 52, i32 60>
store <8 x i8> %strided.vec, <8 x i8>* %S
}
define void @shuffle_v64i8_to_v8i8_5(<64 x i8>* %L, <8 x i8>* %S) nounwind {
-; AVX512F-LABEL: shuffle_v64i8_to_v8i8_5:
-; AVX512F: # %bb.0:
-; AVX512F-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512F-NEXT: vmovdqa 16(%rdi), %xmm1
-; AVX512F-NEXT: vmovdqa 32(%rdi), %xmm2
-; AVX512F-NEXT: vmovdqa 48(%rdi), %xmm3
-; AVX512F-NEXT: vmovdqa {{.*#+}} xmm4 = <u,u,5,13,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX512F-NEXT: vpshufb %xmm4, %xmm3, %xmm3
-; AVX512F-NEXT: vpshufb %xmm4, %xmm2, %xmm2
-; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
-; AVX512F-NEXT: vmovdqa {{.*#+}} xmm3 = <5,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX512F-NEXT: vpshufb %xmm3, %xmm1, %xmm1
-; AVX512F-NEXT: vpshufb %xmm3, %xmm0, %xmm0
-; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; AVX512F-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3]
-; AVX512F-NEXT: vmovq %xmm0, (%rsi)
-; AVX512F-NEXT: retq
-;
-; AVX512VL-LABEL: shuffle_v64i8_to_v8i8_5:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512VL-NEXT: vmovdqa 16(%rdi), %xmm1
-; AVX512VL-NEXT: vmovdqa 32(%rdi), %xmm2
-; AVX512VL-NEXT: vmovdqa 48(%rdi), %xmm3
-; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm4 = <u,u,5,13,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX512VL-NEXT: vpshufb %xmm4, %xmm3, %xmm3
-; AVX512VL-NEXT: vpshufb %xmm4, %xmm2, %xmm2
-; AVX512VL-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
-; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm3 = <5,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX512VL-NEXT: vpshufb %xmm3, %xmm1, %xmm1
-; AVX512VL-NEXT: vpshufb %xmm3, %xmm0, %xmm0
-; AVX512VL-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; AVX512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3]
-; AVX512VL-NEXT: vmovq %xmm0, (%rsi)
-; AVX512VL-NEXT: retq
-;
-; AVX512BW-LABEL: shuffle_v64i8_to_v8i8_5:
-; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512BW-NEXT: vmovdqa 16(%rdi), %xmm1
-; AVX512BW-NEXT: vmovdqa 32(%rdi), %xmm2
-; AVX512BW-NEXT: vmovdqa 48(%rdi), %xmm3
-; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm4 = <u,u,5,13,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX512BW-NEXT: vpshufb %xmm4, %xmm3, %xmm3
-; AVX512BW-NEXT: vpshufb %xmm4, %xmm2, %xmm2
-; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
-; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = <5,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX512BW-NEXT: vpshufb %xmm3, %xmm1, %xmm1
-; AVX512BW-NEXT: vpshufb %xmm3, %xmm0, %xmm0
-; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; AVX512BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3]
-; AVX512BW-NEXT: vmovq %xmm0, (%rsi)
-; AVX512BW-NEXT: retq
-;
-; AVX512BWVL-LABEL: shuffle_v64i8_to_v8i8_5:
-; AVX512BWVL: # %bb.0:
-; AVX512BWVL-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512BWVL-NEXT: vmovdqa 16(%rdi), %xmm1
-; AVX512BWVL-NEXT: vmovdqa 32(%rdi), %xmm2
-; AVX512BWVL-NEXT: vmovdqa 48(%rdi), %xmm3
-; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm4 = [12,12,13,13,5,5,13,13,4,4,5,5,6,6,7,7]
-; AVX512BWVL-NEXT: vpshufb %xmm4, %xmm3, %xmm3
-; AVX512BWVL-NEXT: vpshufb %xmm4, %xmm2, %xmm2
-; AVX512BWVL-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
-; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm3 = [5,5,13,13,4,4,5,5,4,4,5,5,6,6,7,7]
-; AVX512BWVL-NEXT: vpshufb %xmm3, %xmm1, %xmm1
-; AVX512BWVL-NEXT: vpshufb %xmm3, %xmm0, %xmm0
-; AVX512BWVL-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; AVX512BWVL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3]
-; AVX512BWVL-NEXT: vpmovwb %xmm0, (%rsi)
-; AVX512BWVL-NEXT: retq
+; AVX512-LABEL: shuffle_v64i8_to_v8i8_5:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vmovdqa (%rdi), %xmm0
+; AVX512-NEXT: vmovdqa 16(%rdi), %xmm1
+; AVX512-NEXT: vmovdqa 32(%rdi), %xmm2
+; AVX512-NEXT: vmovdqa 48(%rdi), %xmm3
+; AVX512-NEXT: vmovdqa {{.*#+}} xmm4 = <u,u,5,13,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512-NEXT: vpshufb %xmm4, %xmm3, %xmm3
+; AVX512-NEXT: vpshufb %xmm4, %xmm2, %xmm2
+; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
+; AVX512-NEXT: vmovdqa {{.*#+}} xmm3 = <5,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512-NEXT: vpshufb %xmm3, %xmm1, %xmm1
+; AVX512-NEXT: vpshufb %xmm3, %xmm0, %xmm0
+; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX512-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3]
+; AVX512-NEXT: vmovq %xmm0, (%rsi)
+; AVX512-NEXT: retq
%vec = load <64 x i8>, <64 x i8>* %L
%strided.vec = shufflevector <64 x i8> %vec, <64 x i8> undef, <8 x i32> <i32 5, i32 13, i32 21, i32 29, i32 37, i32 45, i32 53, i32 61>
store <8 x i8> %strided.vec, <8 x i8>* %S
}
define void @shuffle_v64i8_to_v8i8_6(<64 x i8>* %L, <8 x i8>* %S) nounwind {
-; AVX512F-LABEL: shuffle_v64i8_to_v8i8_6:
-; AVX512F: # %bb.0:
-; AVX512F-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512F-NEXT: vmovdqa 16(%rdi), %xmm1
-; AVX512F-NEXT: vmovdqa 32(%rdi), %xmm2
-; AVX512F-NEXT: vmovdqa 48(%rdi), %xmm3
-; AVX512F-NEXT: vmovdqa {{.*#+}} xmm4 = <u,u,6,14,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX512F-NEXT: vpshufb %xmm4, %xmm3, %xmm3
-; AVX512F-NEXT: vpshufb %xmm4, %xmm2, %xmm2
-; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
-; AVX512F-NEXT: vmovdqa {{.*#+}} xmm3 = <6,14,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX512F-NEXT: vpshufb %xmm3, %xmm1, %xmm1
-; AVX512F-NEXT: vpshufb %xmm3, %xmm0, %xmm0
-; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; AVX512F-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3]
-; AVX512F-NEXT: vmovq %xmm0, (%rsi)
-; AVX512F-NEXT: retq
-;
-; AVX512VL-LABEL: shuffle_v64i8_to_v8i8_6:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512VL-NEXT: vmovdqa 16(%rdi), %xmm1
-; AVX512VL-NEXT: vmovdqa 32(%rdi), %xmm2
-; AVX512VL-NEXT: vmovdqa 48(%rdi), %xmm3
-; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm4 = <u,u,6,14,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX512VL-NEXT: vpshufb %xmm4, %xmm3, %xmm3
-; AVX512VL-NEXT: vpshufb %xmm4, %xmm2, %xmm2
-; AVX512VL-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
-; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm3 = <6,14,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX512VL-NEXT: vpshufb %xmm3, %xmm1, %xmm1
-; AVX512VL-NEXT: vpshufb %xmm3, %xmm0, %xmm0
-; AVX512VL-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; AVX512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3]
-; AVX512VL-NEXT: vmovq %xmm0, (%rsi)
-; AVX512VL-NEXT: retq
-;
-; AVX512BW-LABEL: shuffle_v64i8_to_v8i8_6:
-; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512BW-NEXT: vmovdqa 16(%rdi), %xmm1
-; AVX512BW-NEXT: vmovdqa 32(%rdi), %xmm2
-; AVX512BW-NEXT: vmovdqa 48(%rdi), %xmm3
-; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm4 = <u,u,6,14,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX512BW-NEXT: vpshufb %xmm4, %xmm3, %xmm3
-; AVX512BW-NEXT: vpshufb %xmm4, %xmm2, %xmm2
-; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
-; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = <6,14,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX512BW-NEXT: vpshufb %xmm3, %xmm1, %xmm1
-; AVX512BW-NEXT: vpshufb %xmm3, %xmm0, %xmm0
-; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; AVX512BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3]
-; AVX512BW-NEXT: vmovq %xmm0, (%rsi)
-; AVX512BW-NEXT: retq
-;
-; AVX512BWVL-LABEL: shuffle_v64i8_to_v8i8_6:
-; AVX512BWVL: # %bb.0:
-; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm0 = [3,7,11,15,19,23,27,31]
-; AVX512BWVL-NEXT: vmovdqa (%rdi), %ymm1
-; AVX512BWVL-NEXT: vpermt2w 32(%rdi), %ymm0, %ymm1
-; AVX512BWVL-NEXT: vpmovwb %xmm1, (%rsi)
-; AVX512BWVL-NEXT: vzeroupper
-; AVX512BWVL-NEXT: retq
+; AVX512-LABEL: shuffle_v64i8_to_v8i8_6:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vmovdqa (%rdi), %xmm0
+; AVX512-NEXT: vmovdqa 16(%rdi), %xmm1
+; AVX512-NEXT: vmovdqa 32(%rdi), %xmm2
+; AVX512-NEXT: vmovdqa 48(%rdi), %xmm3
+; AVX512-NEXT: vmovdqa {{.*#+}} xmm4 = <u,u,6,14,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512-NEXT: vpshufb %xmm4, %xmm3, %xmm3
+; AVX512-NEXT: vpshufb %xmm4, %xmm2, %xmm2
+; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
+; AVX512-NEXT: vmovdqa {{.*#+}} xmm3 = <6,14,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512-NEXT: vpshufb %xmm3, %xmm1, %xmm1
+; AVX512-NEXT: vpshufb %xmm3, %xmm0, %xmm0
+; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX512-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3]
+; AVX512-NEXT: vmovq %xmm0, (%rsi)
+; AVX512-NEXT: retq
%vec = load <64 x i8>, <64 x i8>* %L
%strided.vec = shufflevector <64 x i8> %vec, <64 x i8> undef, <8 x i32> <i32 6, i32 14, i32 22, i32 30, i32 38, i32 46, i32 54, i32 62>
store <8 x i8> %strided.vec, <8 x i8>* %S
}
define void @shuffle_v64i8_to_v8i8_7(<64 x i8>* %L, <8 x i8>* %S) nounwind {
-; AVX512F-LABEL: shuffle_v64i8_to_v8i8_7:
-; AVX512F: # %bb.0:
-; AVX512F-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512F-NEXT: vmovdqa 16(%rdi), %xmm1
-; AVX512F-NEXT: vmovdqa 32(%rdi), %xmm2
-; AVX512F-NEXT: vmovdqa 48(%rdi), %xmm3
-; AVX512F-NEXT: vmovdqa {{.*#+}} xmm4 = <u,u,7,15,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX512F-NEXT: vpshufb %xmm4, %xmm3, %xmm3
-; AVX512F-NEXT: vpshufb %xmm4, %xmm2, %xmm2
-; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
-; AVX512F-NEXT: vmovdqa {{.*#+}} xmm3 = <7,15,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX512F-NEXT: vpshufb %xmm3, %xmm1, %xmm1
-; AVX512F-NEXT: vpshufb %xmm3, %xmm0, %xmm0
-; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; AVX512F-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3]
-; AVX512F-NEXT: vmovq %xmm0, (%rsi)
-; AVX512F-NEXT: retq
-;
-; AVX512VL-LABEL: shuffle_v64i8_to_v8i8_7:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512VL-NEXT: vmovdqa 16(%rdi), %xmm1
-; AVX512VL-NEXT: vmovdqa 32(%rdi), %xmm2
-; AVX512VL-NEXT: vmovdqa 48(%rdi), %xmm3
-; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm4 = <u,u,7,15,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX512VL-NEXT: vpshufb %xmm4, %xmm3, %xmm3
-; AVX512VL-NEXT: vpshufb %xmm4, %xmm2, %xmm2
-; AVX512VL-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
-; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm3 = <7,15,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX512VL-NEXT: vpshufb %xmm3, %xmm1, %xmm1
-; AVX512VL-NEXT: vpshufb %xmm3, %xmm0, %xmm0
-; AVX512VL-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; AVX512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3]
-; AVX512VL-NEXT: vmovq %xmm0, (%rsi)
-; AVX512VL-NEXT: retq
-;
-; AVX512BW-LABEL: shuffle_v64i8_to_v8i8_7:
-; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512BW-NEXT: vmovdqa 16(%rdi), %xmm1
-; AVX512BW-NEXT: vmovdqa 32(%rdi), %xmm2
-; AVX512BW-NEXT: vmovdqa 48(%rdi), %xmm3
-; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm4 = <u,u,7,15,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX512BW-NEXT: vpshufb %xmm4, %xmm3, %xmm3
-; AVX512BW-NEXT: vpshufb %xmm4, %xmm2, %xmm2
-; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
-; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = <7,15,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX512BW-NEXT: vpshufb %xmm3, %xmm1, %xmm1
-; AVX512BW-NEXT: vpshufb %xmm3, %xmm0, %xmm0
-; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; AVX512BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3]
-; AVX512BW-NEXT: vmovq %xmm0, (%rsi)
-; AVX512BW-NEXT: retq
-;
-; AVX512BWVL-LABEL: shuffle_v64i8_to_v8i8_7:
-; AVX512BWVL: # %bb.0:
-; AVX512BWVL-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512BWVL-NEXT: vmovdqa 16(%rdi), %xmm1
-; AVX512BWVL-NEXT: vmovdqa 32(%rdi), %xmm2
-; AVX512BWVL-NEXT: vmovdqa 48(%rdi), %xmm3
-; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm4 = [14,14,15,15,7,7,15,15,4,4,5,5,6,6,7,7]
-; AVX512BWVL-NEXT: vpshufb %xmm4, %xmm3, %xmm3
-; AVX512BWVL-NEXT: vpshufb %xmm4, %xmm2, %xmm2
-; AVX512BWVL-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
-; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm3 = [7,7,15,15,6,6,7,7,4,4,5,5,6,6,7,7]
-; AVX512BWVL-NEXT: vpshufb %xmm3, %xmm1, %xmm1
-; AVX512BWVL-NEXT: vpshufb %xmm3, %xmm0, %xmm0
-; AVX512BWVL-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; AVX512BWVL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3]
-; AVX512BWVL-NEXT: vpmovwb %xmm0, (%rsi)
-; AVX512BWVL-NEXT: retq
+; AVX512-LABEL: shuffle_v64i8_to_v8i8_7:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vmovdqa (%rdi), %xmm0
+; AVX512-NEXT: vmovdqa 16(%rdi), %xmm1
+; AVX512-NEXT: vmovdqa 32(%rdi), %xmm2
+; AVX512-NEXT: vmovdqa 48(%rdi), %xmm3
+; AVX512-NEXT: vmovdqa {{.*#+}} xmm4 = <u,u,7,15,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512-NEXT: vpshufb %xmm4, %xmm3, %xmm3
+; AVX512-NEXT: vpshufb %xmm4, %xmm2, %xmm2
+; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
+; AVX512-NEXT: vmovdqa {{.*#+}} xmm3 = <7,15,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512-NEXT: vpshufb %xmm3, %xmm1, %xmm1
+; AVX512-NEXT: vpshufb %xmm3, %xmm0, %xmm0
+; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX512-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3]
+; AVX512-NEXT: vmovq %xmm0, (%rsi)
+; AVX512-NEXT: retq
%vec = load <64 x i8>, <64 x i8>* %L
%strided.vec = shufflevector <64 x i8> %vec, <64 x i8> undef, <8 x i32> <i32 7, i32 15, i32 23, i32 31, i32 39, i32 47, i32 55, i32 63>
store <8 x i8> %strided.vec, <8 x i8>* %S
; AVX-NEXT: vmovq %xmm0, (%rsi)
; AVX-NEXT: retq
;
-; AVX512F-LABEL: shuffle_v16i8_to_v8i8:
-; AVX512F: # %bb.0:
-; AVX512F-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
-; AVX512F-NEXT: vmovq %xmm0, (%rsi)
-; AVX512F-NEXT: retq
-;
-; AVX512VL-LABEL: shuffle_v16i8_to_v8i8:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
-; AVX512VL-NEXT: vmovq %xmm0, (%rsi)
-; AVX512VL-NEXT: retq
-;
-; AVX512BW-LABEL: shuffle_v16i8_to_v8i8:
-; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
-; AVX512BW-NEXT: vmovq %xmm0, (%rsi)
-; AVX512BW-NEXT: retq
-;
-; AVX512BWVL-LABEL: shuffle_v16i8_to_v8i8:
-; AVX512BWVL: # %bb.0:
-; AVX512BWVL-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512BWVL-NEXT: vpmovwb %xmm0, (%rsi)
-; AVX512BWVL-NEXT: retq
+; AVX512-LABEL: shuffle_v16i8_to_v8i8:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vmovdqa (%rdi), %xmm0
+; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
+; AVX512-NEXT: vmovq %xmm0, (%rsi)
+; AVX512-NEXT: retq
%vec = load <16 x i8>, <16 x i8>* %L
%strided.vec = shufflevector <16 x i8> %vec, <16 x i8> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
store <8 x i8> %strided.vec, <8 x i8>* %S
; AVX-NEXT: vmovq %xmm0, (%rsi)
; AVX-NEXT: retq
;
-; AVX512F-LABEL: shuffle_v8i16_to_v4i16:
-; AVX512F: # %bb.0:
-; AVX512F-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
-; AVX512F-NEXT: vmovq %xmm0, (%rsi)
-; AVX512F-NEXT: retq
-;
-; AVX512VL-LABEL: shuffle_v8i16_to_v4i16:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512VL-NEXT: vpmovdw %xmm0, (%rsi)
-; AVX512VL-NEXT: retq
-;
-; AVX512BW-LABEL: shuffle_v8i16_to_v4i16:
-; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
-; AVX512BW-NEXT: vmovq %xmm0, (%rsi)
-; AVX512BW-NEXT: retq
-;
-; AVX512BWVL-LABEL: shuffle_v8i16_to_v4i16:
-; AVX512BWVL: # %bb.0:
-; AVX512BWVL-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512BWVL-NEXT: vpmovdw %xmm0, (%rsi)
-; AVX512BWVL-NEXT: retq
+; AVX512-LABEL: shuffle_v8i16_to_v4i16:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vmovdqa (%rdi), %xmm0
+; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; AVX512-NEXT: vmovq %xmm0, (%rsi)
+; AVX512-NEXT: retq
%vec = load <8 x i16>, <8 x i16>* %L
%strided.vec = shufflevector <8 x i16> %vec, <8 x i16> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
store <4 x i16> %strided.vec, <4 x i16>* %S
; AVX-NEXT: vmovlps %xmm0, (%rsi)
; AVX-NEXT: retq
;
-; AVX512F-LABEL: shuffle_v4i32_to_v2i32:
-; AVX512F: # %bb.0:
-; AVX512F-NEXT: vpermilps {{.*#+}} xmm0 = mem[0,2,2,3]
-; AVX512F-NEXT: vmovlps %xmm0, (%rsi)
-; AVX512F-NEXT: retq
-;
-; AVX512VL-LABEL: shuffle_v4i32_to_v2i32:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512VL-NEXT: vpmovqd %xmm0, (%rsi)
-; AVX512VL-NEXT: retq
-;
-; AVX512BW-LABEL: shuffle_v4i32_to_v2i32:
-; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vpermilps {{.*#+}} xmm0 = mem[0,2,2,3]
-; AVX512BW-NEXT: vmovlps %xmm0, (%rsi)
-; AVX512BW-NEXT: retq
-;
-; AVX512BWVL-LABEL: shuffle_v4i32_to_v2i32:
-; AVX512BWVL: # %bb.0:
-; AVX512BWVL-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512BWVL-NEXT: vpmovqd %xmm0, (%rsi)
-; AVX512BWVL-NEXT: retq
+; AVX512-LABEL: shuffle_v4i32_to_v2i32:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vpermilps {{.*#+}} xmm0 = mem[0,2,2,3]
+; AVX512-NEXT: vmovlps %xmm0, (%rsi)
+; AVX512-NEXT: retq
%vec = load <4 x i32>, <4 x i32>* %L
%strided.vec = shufflevector <4 x i32> %vec, <4 x i32> undef, <2 x i32> <i32 0, i32 2>
store <2 x i32> %strided.vec, <2 x i32>* %S
; AVX-NEXT: vmovd %xmm0, (%rsi)
; AVX-NEXT: retq
;
-; AVX512F-LABEL: shuffle_v16i8_to_v4i8:
-; AVX512F: # %bb.0:
-; AVX512F-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512F-NEXT: vmovd %xmm0, (%rsi)
-; AVX512F-NEXT: retq
-;
-; AVX512VL-LABEL: shuffle_v16i8_to_v4i8:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512VL-NEXT: vpmovdb %xmm0, (%rsi)
-; AVX512VL-NEXT: retq
-;
-; AVX512BW-LABEL: shuffle_v16i8_to_v4i8:
-; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512BW-NEXT: vmovd %xmm0, (%rsi)
-; AVX512BW-NEXT: retq
-;
-; AVX512BWVL-LABEL: shuffle_v16i8_to_v4i8:
-; AVX512BWVL: # %bb.0:
-; AVX512BWVL-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512BWVL-NEXT: vpmovdb %xmm0, (%rsi)
-; AVX512BWVL-NEXT: retq
+; AVX512-LABEL: shuffle_v16i8_to_v4i8:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vmovdqa (%rdi), %xmm0
+; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512-NEXT: vmovd %xmm0, (%rsi)
+; AVX512-NEXT: retq
%vec = load <16 x i8>, <16 x i8>* %L
%strided.vec = shufflevector <16 x i8> %vec, <16 x i8> undef, <4 x i32> <i32 0, i32 4, i32 8, i32 12>
store <4 x i8> %strided.vec, <4 x i8>* %S
; AVX512VL-LABEL: shuffle_v8i16_to_v2i16:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512VL-NEXT: vpmovqw %xmm0, (%rsi)
+; AVX512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,8,9,8,9,10,11,8,9,10,11,12,13,14,15]
+; AVX512VL-NEXT: vmovd %xmm0, (%rsi)
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: shuffle_v8i16_to_v2i16:
; AVX512BWVL-LABEL: shuffle_v8i16_to_v2i16:
; AVX512BWVL: # %bb.0:
; AVX512BWVL-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512BWVL-NEXT: vpmovqw %xmm0, (%rsi)
+; AVX512BWVL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,8,9,8,9,10,11,8,9,10,11,12,13,14,15]
+; AVX512BWVL-NEXT: vmovd %xmm0, (%rsi)
; AVX512BWVL-NEXT: retq
%vec = load <8 x i16>, <8 x i16>* %L
%strided.vec = shufflevector <8 x i16> %vec, <8 x i16> undef, <2 x i32> <i32 0, i32 4>
; AVX-NEXT: vpextrw $0, %xmm0, (%rsi)
; AVX-NEXT: retq
;
-; AVX512F-LABEL: shuffle_v16i8_to_v2i8:
-; AVX512F: # %bb.0:
-; AVX512F-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512F-NEXT: vpextrw $0, %xmm0, (%rsi)
-; AVX512F-NEXT: retq
-;
-; AVX512VL-LABEL: shuffle_v16i8_to_v2i8:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512VL-NEXT: vpmovqb %xmm0, (%rsi)
-; AVX512VL-NEXT: retq
-;
-; AVX512BW-LABEL: shuffle_v16i8_to_v2i8:
-; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512BW-NEXT: vpextrw $0, %xmm0, (%rsi)
-; AVX512BW-NEXT: retq
-;
-; AVX512BWVL-LABEL: shuffle_v16i8_to_v2i8:
-; AVX512BWVL: # %bb.0:
-; AVX512BWVL-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512BWVL-NEXT: vpmovqb %xmm0, (%rsi)
-; AVX512BWVL-NEXT: retq
+; AVX512-LABEL: shuffle_v16i8_to_v2i8:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vmovdqa (%rdi), %xmm0
+; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512-NEXT: vpextrw $0, %xmm0, (%rsi)
+; AVX512-NEXT: retq
%vec = load <16 x i8>, <16 x i8>* %L
%strided.vec = shufflevector <16 x i8> %vec, <16 x i8> undef, <2 x i32> <i32 0, i32 8>
store <2 x i8> %strided.vec, <2 x i8>* %S
; AVX512BWVL-LABEL: shuffle_v32i8_to_v8i8:
; AVX512BWVL: # %bb.0:
; AVX512BWVL-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm1 = [0,2,4,6,8,10,12,14]
-; AVX512BWVL-NEXT: vpermi2w 16(%rdi), %xmm0, %xmm1
-; AVX512BWVL-NEXT: vpmovwb %xmm1, (%rsi)
+; AVX512BWVL-NEXT: vmovdqa 16(%rdi), %xmm1
+; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm2 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512BWVL-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX512BWVL-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX512BWVL-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; AVX512BWVL-NEXT: vmovq %xmm0, (%rsi)
; AVX512BWVL-NEXT: retq
;
; AVX512VBMIVL-LABEL: shuffle_v32i8_to_v8i8:
; AVX512VBMIVL: # %bb.0:
; AVX512VBMIVL-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512VBMIVL-NEXT: vmovdqa {{.*#+}} xmm1 = [0,2,4,6,8,10,12,14]
-; AVX512VBMIVL-NEXT: vpermi2w 16(%rdi), %xmm0, %xmm1
-; AVX512VBMIVL-NEXT: vpmovwb %xmm1, (%rsi)
+; AVX512VBMIVL-NEXT: vpbroadcastq {{.*#+}} xmm1 = [2024390091656922112,2024390091656922112]
+; AVX512VBMIVL-NEXT: vpermi2b 16(%rdi), %xmm0, %xmm1
+; AVX512VBMIVL-NEXT: vmovq %xmm1, (%rsi)
; AVX512VBMIVL-NEXT: retq
%vec = load <32 x i8>, <32 x i8>* %L
%strided.vec = shufflevector <32 x i8> %vec, <32 x i8> undef, <8 x i32> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 24, i32 28>
}
define void @trunc_v8i32_to_v8i8(<32 x i8>* %L, <8 x i8>* %S) nounwind {
-; AVX1-LABEL: trunc_v8i32_to_v8i8:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa (%rdi), %xmm0
-; AVX1-NEXT: vmovdqa 16(%rdi), %xmm1
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
-; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
-; AVX1-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; AVX1-NEXT: vmovq %xmm0, (%rsi)
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: trunc_v8i32_to_v8i8:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa (%rdi), %ymm0
-; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
-; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
-; AVX2-NEXT: vmovq %xmm0, (%rsi)
-; AVX2-NEXT: vzeroupper
-; AVX2-NEXT: retq
+; AVX-LABEL: trunc_v8i32_to_v8i8:
+; AVX: # %bb.0:
+; AVX-NEXT: vmovdqa (%rdi), %xmm0
+; AVX-NEXT: vmovdqa 16(%rdi), %xmm1
+; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; AVX-NEXT: vmovq %xmm0, (%rsi)
+; AVX-NEXT: retq
;
; AVX512F-LABEL: trunc_v8i32_to_v8i8:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
-; AVX512F-NEXT: vpmovdw %zmm0, %ymm0
-; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
+; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
; AVX512F-NEXT: vmovq %xmm0, (%rsi)
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
; AVX512BW-LABEL: trunc_v8i32_to_v8i8:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0
-; AVX512BW-NEXT: vpmovdw %zmm0, %ymm0
-; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
+; AVX512BW-NEXT: vpmovdb %zmm0, %xmm0
; AVX512BW-NEXT: vmovq %xmm0, (%rsi)
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
; AVX1-LABEL: trunc_v8i32_to_v8i8_return_v2i64:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
-; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14],zero,zero,zero,zero,zero,zero,zero,zero
+; AVX1-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; AVX1-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc_v8i32_to_v8i8_return_v2i64:
; AVX2: # %bb.0:
-; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
-; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14],zero,zero,zero,zero,zero,zero,zero,zero
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; AVX2-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512F-LABEL: trunc_v8i32_to_v8i8_return_v2i64:
; AVX512F: # %bb.0:
; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
-; AVX512F-NEXT: vpmovdw %zmm0, %ymm0
-; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14],zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512F-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: trunc_v8i32_to_v8i8_return_v2i64:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpmovdb %ymm0, %xmm0
+; AVX512VL-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: trunc_v8i32_to_v8i8_return_v2i64:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
-; AVX512BW-NEXT: vpmovdw %zmm0, %ymm0
-; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14],zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512BW-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512BW-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: trunc_v8i32_to_v8i8_return_v2i64:
; AVX512BWVL: # %bb.0:
; AVX512BWVL-NEXT: vpmovdb %ymm0, %xmm0
+; AVX512BWVL-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
;
; AVX512VBMIVL-LABEL: trunc_v8i32_to_v8i8_return_v2i64:
; AVX512VBMIVL: # %bb.0:
; AVX512VBMIVL-NEXT: vpmovdb %ymm0, %xmm0
+; AVX512VBMIVL-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
; AVX512VBMIVL-NEXT: vzeroupper
; AVX512VBMIVL-NEXT: retq
%truncated.vec = trunc <8 x i32> %vec to <8 x i8>
; AVX1-LABEL: trunc_v8i32_to_v8i8_return_v16i8:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
-; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14],zero,zero,zero,zero,zero,zero,zero,zero
+; AVX1-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; AVX1-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc_v8i32_to_v8i8_return_v16i8:
; AVX2: # %bb.0:
-; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
-; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14],zero,zero,zero,zero,zero,zero,zero,zero
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; AVX2-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512F-LABEL: trunc_v8i32_to_v8i8_return_v16i8:
; AVX512F: # %bb.0:
; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
-; AVX512F-NEXT: vpmovdw %zmm0, %ymm0
-; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14],zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512F-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: trunc_v8i32_to_v8i8_return_v16i8:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpmovdb %ymm0, %xmm0
+; AVX512VL-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: trunc_v8i32_to_v8i8_return_v16i8:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
-; AVX512BW-NEXT: vpmovdw %zmm0, %ymm0
-; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14],zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512BW-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512BW-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: trunc_v8i32_to_v8i8_return_v16i8:
; AVX512BWVL: # %bb.0:
; AVX512BWVL-NEXT: vpmovdb %ymm0, %xmm0
+; AVX512BWVL-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
;
; AVX512VBMIVL-LABEL: trunc_v8i32_to_v8i8_return_v16i8:
; AVX512VBMIVL: # %bb.0:
; AVX512VBMIVL-NEXT: vpmovdb %ymm0, %xmm0
+; AVX512VBMIVL-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
; AVX512VBMIVL-NEXT: vzeroupper
; AVX512VBMIVL-NEXT: retq
%truncated = trunc <8 x i32> %vec to <8 x i8>
; AVX1-LABEL: trunc_v4i64_to_v4i16_return_v2i64:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
-; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
+; AVX1-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; AVX1-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-SLOW-LABEL: trunc_v4i64_to_v4i16_return_v2i64:
; AVX2-SLOW: # %bb.0:
-; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
-; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero
+; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
+; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; AVX2-SLOW-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
; AVX2-SLOW-NEXT: vzeroupper
; AVX2-SLOW-NEXT: retq
;
; AVX2-FAST-LABEL: trunc_v4i64_to_v4i16_return_v2i64:
; AVX2-FAST: # %bb.0:
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,2,4,6,4,6,6,7]
-; AVX2-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0
-; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero
+; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,8,9,8,9,10,11,8,9,10,11,12,13,14,15]
+; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; AVX2-FAST-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
; AVX2-FAST-NEXT: vzeroupper
; AVX2-FAST-NEXT: retq
;
; AVX512F-LABEL: trunc_v4i64_to_v4i16_return_v2i64:
; AVX512F: # %bb.0:
; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
-; AVX512F-NEXT: vpmovqd %zmm0, %ymm0
-; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512F-NEXT: vpmovqw %zmm0, %xmm0
+; AVX512F-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: trunc_v4i64_to_v4i16_return_v2i64:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpmovqw %ymm0, %xmm0
+; AVX512VL-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: trunc_v4i64_to_v4i16_return_v2i64:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
-; AVX512BW-NEXT: vpmovqd %zmm0, %ymm0
-; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512BW-NEXT: vpmovqw %zmm0, %xmm0
+; AVX512BW-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: trunc_v4i64_to_v4i16_return_v2i64:
; AVX512BWVL: # %bb.0:
; AVX512BWVL-NEXT: vpmovqw %ymm0, %xmm0
+; AVX512BWVL-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
;
; AVX512VBMIVL-LABEL: trunc_v4i64_to_v4i16_return_v2i64:
; AVX512VBMIVL: # %bb.0:
; AVX512VBMIVL-NEXT: vpmovqw %ymm0, %xmm0
+; AVX512VBMIVL-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
; AVX512VBMIVL-NEXT: vzeroupper
; AVX512VBMIVL-NEXT: retq
%truncated = trunc <4 x i64> %vec to <4 x i16>
; AVX1-LABEL: trunc_v4i64_to_v4i16_return_v8i16:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
-; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
+; AVX1-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; AVX1-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-SLOW-LABEL: trunc_v4i64_to_v4i16_return_v8i16:
; AVX2-SLOW: # %bb.0:
-; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
-; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero
+; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
+; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; AVX2-SLOW-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
; AVX2-SLOW-NEXT: vzeroupper
; AVX2-SLOW-NEXT: retq
;
; AVX2-FAST-LABEL: trunc_v4i64_to_v4i16_return_v8i16:
; AVX2-FAST: # %bb.0:
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,2,4,6,4,6,6,7]
-; AVX2-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0
-; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero
+; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,8,9,8,9,10,11,8,9,10,11,12,13,14,15]
+; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; AVX2-FAST-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
; AVX2-FAST-NEXT: vzeroupper
; AVX2-FAST-NEXT: retq
;
; AVX512F-LABEL: trunc_v4i64_to_v4i16_return_v8i16:
; AVX512F: # %bb.0:
; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
-; AVX512F-NEXT: vpmovqd %zmm0, %ymm0
-; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512F-NEXT: vpmovqw %zmm0, %xmm0
+; AVX512F-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: trunc_v4i64_to_v4i16_return_v8i16:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpmovqw %ymm0, %xmm0
+; AVX512VL-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: trunc_v4i64_to_v4i16_return_v8i16:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
-; AVX512BW-NEXT: vpmovqd %zmm0, %ymm0
-; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512BW-NEXT: vpmovqw %zmm0, %xmm0
+; AVX512BW-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: trunc_v4i64_to_v4i16_return_v8i16:
; AVX512BWVL: # %bb.0:
; AVX512BWVL-NEXT: vpmovqw %ymm0, %xmm0
+; AVX512BWVL-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
;
; AVX512VBMIVL-LABEL: trunc_v4i64_to_v4i16_return_v8i16:
; AVX512VBMIVL: # %bb.0:
; AVX512VBMIVL-NEXT: vpmovqw %ymm0, %xmm0
+; AVX512VBMIVL-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
; AVX512VBMIVL-NEXT: vzeroupper
; AVX512VBMIVL-NEXT: retq
%truncated = trunc <4 x i64> %vec to <4 x i16>
; AVX1-LABEL: trunc_v4i64_to_v4i8_return_v16i8:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
-; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[u],zero
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7]
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
-; AVX2-SLOW-LABEL: trunc_v4i64_to_v4i8_return_v16i8:
-; AVX2-SLOW: # %bb.0:
-; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
-; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[u],zero
-; AVX2-SLOW-NEXT: vzeroupper
-; AVX2-SLOW-NEXT: retq
-;
-; AVX2-FAST-LABEL: trunc_v4i64_to_v4i8_return_v16i8:
-; AVX2-FAST: # %bb.0:
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,2,4,6,4,6,6,7]
-; AVX2-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0
-; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[u],zero
-; AVX2-FAST-NEXT: vzeroupper
-; AVX2-FAST-NEXT: retq
+; AVX2-LABEL: trunc_v4i64_to_v4i8_return_v16i8:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7]
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
;
; AVX512F-LABEL: trunc_v4i64_to_v4i8_return_v16i8:
; AVX512F: # %bb.0:
; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
-; AVX512F-NEXT: vpmovqd %zmm0, %ymm0
-; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[u],zero
+; AVX512F-NEXT: vpmovqb %zmm0, %xmm0
+; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX512F-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7]
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: trunc_v4i64_to_v4i8_return_v16i8:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpmovqb %ymm0, %xmm0
+; AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX512VL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7]
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: trunc_v4i64_to_v4i8_return_v16i8:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
-; AVX512BW-NEXT: vpmovqd %zmm0, %ymm0
-; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[u],zero
+; AVX512BW-NEXT: vpmovqb %zmm0, %xmm0
+; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX512BW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7]
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: trunc_v4i64_to_v4i8_return_v16i8:
; AVX512BWVL: # %bb.0:
; AVX512BWVL-NEXT: vpmovqb %ymm0, %xmm0
+; AVX512BWVL-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX512BWVL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7]
; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
;
; AVX512VBMIVL-LABEL: trunc_v4i64_to_v4i8_return_v16i8:
; AVX512VBMIVL: # %bb.0:
; AVX512VBMIVL-NEXT: vpmovqb %ymm0, %xmm0
+; AVX512VBMIVL-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX512VBMIVL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7]
; AVX512VBMIVL-NEXT: vzeroupper
; AVX512VBMIVL-NEXT: retq
%truncated = trunc <4 x i64> %vec to <4 x i8>
;
; AVX512VL-LABEL: shuffle_v16i16_to_v4i16:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vmovaps (%rdi), %xmm0
-; AVX512VL-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],mem[0,2]
-; AVX512VL-NEXT: vpmovdw %xmm0, (%rsi)
+; AVX512VL-NEXT: vmovdqa (%rdi), %xmm0
+; AVX512VL-NEXT: vmovdqa 16(%rdi), %xmm1
+; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,8,9,8,9,10,11,8,9,10,11,12,13,14,15]
+; AVX512VL-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX512VL-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX512VL-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; AVX512VL-NEXT: vmovq %xmm0, (%rsi)
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: shuffle_v16i16_to_v4i16:
;
; AVX512BWVL-LABEL: shuffle_v16i16_to_v4i16:
; AVX512BWVL: # %bb.0:
-; AVX512BWVL-NEXT: vmovaps (%rdi), %xmm0
-; AVX512BWVL-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],mem[0,2]
-; AVX512BWVL-NEXT: vpmovdw %xmm0, (%rsi)
+; AVX512BWVL-NEXT: vmovdqa (%rdi), %xmm0
+; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm1 = [0,4,8,12,4,5,12,13]
+; AVX512BWVL-NEXT: vpermi2w 16(%rdi), %xmm0, %xmm1
+; AVX512BWVL-NEXT: vmovq %xmm1, (%rsi)
; AVX512BWVL-NEXT: retq
;
; AVX512VBMIVL-LABEL: shuffle_v16i16_to_v4i16:
; AVX512VBMIVL: # %bb.0:
-; AVX512VBMIVL-NEXT: vmovaps (%rdi), %xmm0
-; AVX512VBMIVL-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],mem[0,2]
-; AVX512VBMIVL-NEXT: vpmovdw %xmm0, (%rsi)
+; AVX512VBMIVL-NEXT: vmovdqa (%rdi), %xmm0
+; AVX512VBMIVL-NEXT: vmovdqa {{.*#+}} xmm1 = [0,4,8,12,4,5,12,13]
+; AVX512VBMIVL-NEXT: vpermi2w 16(%rdi), %xmm0, %xmm1
+; AVX512VBMIVL-NEXT: vmovq %xmm1, (%rsi)
; AVX512VBMIVL-NEXT: retq
%vec = load <16 x i16>, <16 x i16>* %L
%strided.vec = shufflevector <16 x i16> %vec, <16 x i16> undef, <4 x i32> <i32 0, i32 4, i32 8, i32 12>
define void @trunc_v4i64_to_v4i16(<16 x i16>* %L, <4 x i16>* %S) nounwind {
; AVX1-LABEL: trunc_v4i64_to_v4i16:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovaps (%rdi), %xmm0
-; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],mem[0,2]
-; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = mem[0,2,2,3]
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = mem[0,2,2,3]
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
+; AVX1-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; AVX1-NEXT: vmovq %xmm0, (%rsi)
; AVX1-NEXT: retq
;
; AVX2-SLOW-LABEL: trunc_v4i64_to_v4i16:
; AVX2-SLOW: # %bb.0:
-; AVX2-SLOW-NEXT: vmovaps (%rdi), %xmm0
-; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],mem[0,2]
-; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = mem[0,2,2,3]
+; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = mem[0,2,2,3]
+; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
+; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; AVX2-SLOW-NEXT: vmovq %xmm0, (%rsi)
; AVX2-SLOW-NEXT: retq
;
; AVX2-FAST-LABEL: trunc_v4i64_to_v4i16:
; AVX2-FAST: # %bb.0:
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [0,2,4,6,4,6,6,7]
-; AVX2-FAST-NEXT: vpermd (%rdi), %ymm0, %ymm0
-; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm0
+; AVX2-FAST-NEXT: vmovdqa 16(%rdi), %xmm1
+; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,8,9,8,9,10,11,8,9,10,11,12,13,14,15]
+; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX2-FAST-NEXT: vmovq %xmm0, (%rsi)
-; AVX2-FAST-NEXT: vzeroupper
; AVX2-FAST-NEXT: retq
;
; AVX512F-LABEL: trunc_v4i64_to_v4i16:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
-; AVX512F-NEXT: vpmovqd %zmm0, %ymm0
-; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; AVX512F-NEXT: vpmovqw %zmm0, %xmm0
; AVX512F-NEXT: vmovq %xmm0, (%rsi)
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
; AVX512BW-LABEL: trunc_v4i64_to_v4i16:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0
-; AVX512BW-NEXT: vpmovqd %zmm0, %ymm0
-; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; AVX512BW-NEXT: vpmovqw %zmm0, %xmm0
; AVX512BW-NEXT: vmovq %xmm0, (%rsi)
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v32i8_to_v4i8:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vmovaps (%rdi), %xmm0
-; AVX512VL-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],mem[0,2]
-; AVX512VL-NEXT: vpmovdb %xmm0, (%rsi)
+; AVX512VL-NEXT: vmovdqa (%rdi), %xmm0
+; AVX512VL-NEXT: vmovdqa 16(%rdi), %xmm1
+; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm2 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512VL-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX512VL-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX512VL-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX512VL-NEXT: vmovd %xmm0, (%rsi)
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: shuffle_v32i8_to_v4i8:
;
; AVX512BWVL-LABEL: shuffle_v32i8_to_v4i8:
; AVX512BWVL: # %bb.0:
-; AVX512BWVL-NEXT: vmovaps (%rdi), %xmm0
-; AVX512BWVL-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],mem[0,2]
-; AVX512BWVL-NEXT: vpmovdb %xmm0, (%rsi)
+; AVX512BWVL-NEXT: vmovdqa (%rdi), %xmm0
+; AVX512BWVL-NEXT: vmovdqa 16(%rdi), %xmm1
+; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm2 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512BWVL-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX512BWVL-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX512BWVL-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX512BWVL-NEXT: vmovd %xmm0, (%rsi)
; AVX512BWVL-NEXT: retq
;
; AVX512VBMIVL-LABEL: shuffle_v32i8_to_v4i8:
; AVX512VBMIVL: # %bb.0:
-; AVX512VBMIVL-NEXT: vmovaps (%rdi), %xmm0
-; AVX512VBMIVL-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],mem[0,2]
-; AVX512VBMIVL-NEXT: vpmovdb %xmm0, (%rsi)
+; AVX512VBMIVL-NEXT: vmovdqa (%rdi), %xmm0
+; AVX512VBMIVL-NEXT: vpbroadcastd {{.*#+}} xmm1 = [403703808,403703808,403703808,403703808]
+; AVX512VBMIVL-NEXT: vpermi2b 16(%rdi), %xmm0, %xmm1
+; AVX512VBMIVL-NEXT: vmovd %xmm1, (%rsi)
; AVX512VBMIVL-NEXT: retq
%vec = load <32 x i8>, <32 x i8>* %L
%strided.vec = shufflevector <32 x i8> %vec, <32 x i8> undef, <4 x i32> <i32 0, i32 8, i32 16, i32 24>
}
define void @trunc_v4i64_to_v4i8(<32 x i8>* %L, <4 x i8>* %S) nounwind {
-; AVX1-LABEL: trunc_v4i64_to_v4i8:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vmovaps (%rdi), %xmm0
-; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],mem[0,2]
-; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX1-NEXT: vmovd %xmm0, (%rsi)
-; AVX1-NEXT: retq
-;
-; AVX2-SLOW-LABEL: trunc_v4i64_to_v4i8:
-; AVX2-SLOW: # %bb.0:
-; AVX2-SLOW-NEXT: vmovaps (%rdi), %xmm0
-; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],mem[0,2]
-; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX2-SLOW-NEXT: vmovd %xmm0, (%rsi)
-; AVX2-SLOW-NEXT: retq
-;
-; AVX2-FAST-LABEL: trunc_v4i64_to_v4i8:
-; AVX2-FAST: # %bb.0:
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [0,2,4,6,4,6,6,7]
-; AVX2-FAST-NEXT: vpermd (%rdi), %ymm0, %ymm0
-; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX2-FAST-NEXT: vmovd %xmm0, (%rsi)
-; AVX2-FAST-NEXT: vzeroupper
-; AVX2-FAST-NEXT: retq
+; AVX-LABEL: trunc_v4i64_to_v4i8:
+; AVX: # %bb.0:
+; AVX-NEXT: vmovdqa (%rdi), %xmm0
+; AVX-NEXT: vmovdqa 16(%rdi), %xmm1
+; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX-NEXT: vmovd %xmm0, (%rsi)
+; AVX-NEXT: retq
;
; AVX512F-LABEL: trunc_v4i64_to_v4i8:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
-; AVX512F-NEXT: vpmovqd %zmm0, %ymm0
-; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512F-NEXT: vpmovqb %zmm0, %xmm0
; AVX512F-NEXT: vmovd %xmm0, (%rsi)
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
; AVX512BW-LABEL: trunc_v4i64_to_v4i8:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0
-; AVX512BW-NEXT: vpmovqd %zmm0, %ymm0
-; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512BW-NEXT: vpmovqb %zmm0, %xmm0
; AVX512BW-NEXT: vmovd %xmm0, (%rsi)
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: shuffle_v64i8_to_v8i8:
; AVX512BWVL: # %bb.0:
-; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm0 = [0,4,8,12,16,20,24,28]
-; AVX512BWVL-NEXT: vmovdqa (%rdi), %ymm1
-; AVX512BWVL-NEXT: vpermt2w 32(%rdi), %ymm0, %ymm1
-; AVX512BWVL-NEXT: vpmovwb %xmm1, (%rsi)
-; AVX512BWVL-NEXT: vzeroupper
+; AVX512BWVL-NEXT: vmovdqa (%rdi), %xmm0
+; AVX512BWVL-NEXT: vmovdqa 16(%rdi), %xmm1
+; AVX512BWVL-NEXT: vmovdqa 32(%rdi), %xmm2
+; AVX512BWVL-NEXT: vmovdqa 48(%rdi), %xmm3
+; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm4 = <u,u,0,8,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512BWVL-NEXT: vpshufb %xmm4, %xmm3, %xmm3
+; AVX512BWVL-NEXT: vpshufb %xmm4, %xmm2, %xmm2
+; AVX512BWVL-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
+; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm3 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512BWVL-NEXT: vpshufb %xmm3, %xmm1, %xmm1
+; AVX512BWVL-NEXT: vpshufb %xmm3, %xmm0, %xmm0
+; AVX512BWVL-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX512BWVL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3]
+; AVX512BWVL-NEXT: vmovq %xmm0, (%rsi)
; AVX512BWVL-NEXT: retq
;
; AVX512VBMI-LABEL: shuffle_v64i8_to_v8i8:
;
; AVX512VBMIVL-LABEL: shuffle_v64i8_to_v8i8:
; AVX512VBMIVL: # %bb.0:
-; AVX512VBMIVL-NEXT: vmovdqa {{.*#+}} xmm0 = [0,4,8,12,16,20,24,28]
-; AVX512VBMIVL-NEXT: vmovdqa (%rdi), %ymm1
-; AVX512VBMIVL-NEXT: vpermt2w 32(%rdi), %ymm0, %ymm1
-; AVX512VBMIVL-NEXT: vpmovwb %xmm1, (%rsi)
+; AVX512VBMIVL-NEXT: vmovdqa (%rdi), %ymm0
+; AVX512VBMIVL-NEXT: vpbroadcastq {{.*#+}} ymm1 = [4048780183313844224,4048780183313844224,4048780183313844224,4048780183313844224]
+; AVX512VBMIVL-NEXT: vpermi2b 32(%rdi), %ymm0, %ymm1
+; AVX512VBMIVL-NEXT: vmovq %xmm1, (%rsi)
; AVX512VBMIVL-NEXT: vzeroupper
; AVX512VBMIVL-NEXT: retq
%vec = load <64 x i8>, <64 x i8>* %L
; AVX512-LABEL: trunc_v8i64_to_v8i8_return_v16i8:
; AVX512: # %bb.0:
; AVX512-NEXT: vpmovqb %zmm0, %xmm0
+; AVX512-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
%truncated = trunc <8 x i64> %vec to <8 x i8>
define <4 x i32> @test_mul_v4i32_v4i8(<4 x i8> %A) {
; CHECK32-LABEL: test_mul_v4i32_v4i8:
; CHECK32: # %bb.0:
-; CHECK32-NEXT: pand {{\.LCPI.*}}, %xmm0
+; CHECK32-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; CHECK32-NEXT: pmaddwd {{\.LCPI.*}}, %xmm0
; CHECK32-NEXT: retl
;
; CHECK64-LABEL: test_mul_v4i32_v4i8:
; CHECK64: # %bb.0:
-; CHECK64-NEXT: pand {{.*}}(%rip), %xmm0
+; CHECK64-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; CHECK64-NEXT: pmaddwd {{.*}}(%rip), %xmm0
; CHECK64-NEXT: retq
;
; SSE4-32-LABEL: test_mul_v4i32_v4i8:
; SSE4-32: # %bb.0:
-; SSE4-32-NEXT: pand {{\.LCPI.*}}, %xmm0
+; SSE4-32-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; SSE4-32-NEXT: pmaddwd {{\.LCPI.*}}, %xmm0
; SSE4-32-NEXT: retl
;
; SSE4-64-LABEL: test_mul_v4i32_v4i8:
; SSE4-64: # %bb.0:
-; SSE4-64-NEXT: pand {{.*}}(%rip), %xmm0
+; SSE4-64-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; SSE4-64-NEXT: pmaddwd {{.*}}(%rip), %xmm0
; SSE4-64-NEXT: retq
;
; AVX2-32-LABEL: test_mul_v4i32_v4i8:
; AVX2-32: # %bb.0:
-; AVX2-32-NEXT: vpand {{\.LCPI.*}}, %xmm0, %xmm0
+; AVX2-32-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; AVX2-32-NEXT: vpmaddwd {{\.LCPI.*}}, %xmm0, %xmm0
; AVX2-32-NEXT: retl
;
; AVX2-64-LABEL: test_mul_v4i32_v4i8:
; AVX2-64: # %bb.0:
-; AVX2-64-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
+; AVX2-64-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; AVX2-64-NEXT: vpmaddwd {{.*}}(%rip), %xmm0, %xmm0
; AVX2-64-NEXT: retq
;
; AVX512DQ-32-LABEL: test_mul_v4i32_v4i8:
; AVX512DQ-32: # %bb.0:
-; AVX512DQ-32-NEXT: vpand {{\.LCPI.*}}, %xmm0, %xmm0
+; AVX512DQ-32-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; AVX512DQ-32-NEXT: vpmaddwd {{\.LCPI.*}}, %xmm0, %xmm0
; AVX512DQ-32-NEXT: retl
;
; AVX512DQ-64-LABEL: test_mul_v4i32_v4i8:
; AVX512DQ-64: # %bb.0:
-; AVX512DQ-64-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
+; AVX512DQ-64-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; AVX512DQ-64-NEXT: vpmaddwd {{.*}}(%rip), %xmm0, %xmm0
; AVX512DQ-64-NEXT: retq
;
; AVX512BW-32-LABEL: test_mul_v4i32_v4i8:
; AVX512BW-32: # %bb.0:
-; AVX512BW-32-NEXT: vpand {{\.LCPI.*}}, %xmm0, %xmm0
+; AVX512BW-32-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; AVX512BW-32-NEXT: vpmaddwd {{\.LCPI.*}}, %xmm0, %xmm0
; AVX512BW-32-NEXT: retl
;
; AVX512BW-64-LABEL: test_mul_v4i32_v4i8:
; AVX512BW-64: # %bb.0:
-; AVX512BW-64-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
+; AVX512BW-64-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; AVX512BW-64-NEXT: vpmaddwd {{.*}}(%rip), %xmm0, %xmm0
; AVX512BW-64-NEXT: retq
;
; KNL-32-LABEL: test_mul_v4i32_v4i8:
; KNL-32: # %bb.0:
-; KNL-32-NEXT: vpand {{\.LCPI.*}}, %xmm0, %xmm0
+; KNL-32-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; KNL-32-NEXT: vpbroadcastd {{.*#+}} xmm1 = [18778,18778,18778,18778]
; KNL-32-NEXT: vpmulld %xmm1, %xmm0, %xmm0
; KNL-32-NEXT: retl
;
; KNL-64-LABEL: test_mul_v4i32_v4i8:
; KNL-64: # %bb.0:
-; KNL-64-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
+; KNL-64-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; KNL-64-NEXT: vpbroadcastd {{.*#+}} xmm1 = [18778,18778,18778,18778]
; KNL-64-NEXT: vpmulld %xmm1, %xmm0, %xmm0
; KNL-64-NEXT: retq
define <8 x i32> @test_mul_v8i32_v8i8(<8 x i8> %A) {
; SLM32-LABEL: test_mul_v8i32_v8i8:
; SLM32: # %bb.0:
-; SLM32-NEXT: movdqa %xmm0, %xmm1
-; SLM32-NEXT: pand {{\.LCPI.*}}, %xmm1
+; SLM32-NEXT: pmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; SLM32-NEXT: movdqa {{.*#+}} xmm0 = [18778,18778,18778,18778,18778,18778,18778,18778]
; SLM32-NEXT: movdqa %xmm1, %xmm2
; SLM32-NEXT: pmullw %xmm0, %xmm1
; SLM32-NEXT: pmulhw %xmm0, %xmm2
; SLM32-NEXT: movdqa %xmm1, %xmm0
-; SLM32-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; SLM32-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
+; SLM32-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; SLM32-NEXT: retl
;
; SLM64-LABEL: test_mul_v8i32_v8i8:
; SLM64: # %bb.0:
-; SLM64-NEXT: movdqa %xmm0, %xmm1
-; SLM64-NEXT: pand {{.*}}(%rip), %xmm1
+; SLM64-NEXT: pmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; SLM64-NEXT: movdqa {{.*#+}} xmm0 = [18778,18778,18778,18778,18778,18778,18778,18778]
; SLM64-NEXT: movdqa %xmm1, %xmm2
; SLM64-NEXT: pmullw %xmm0, %xmm1
; SLM64-NEXT: pmulhw %xmm0, %xmm2
; SLM64-NEXT: movdqa %xmm1, %xmm0
-; SLM64-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; SLM64-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
+; SLM64-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; SLM64-NEXT: retq
;
; SLOW32-LABEL: test_mul_v8i32_v8i8:
; SLOW32: # %bb.0:
-; SLOW32-NEXT: movdqa %xmm0, %xmm1
-; SLOW32-NEXT: pand {{\.LCPI.*}}, %xmm1
+; SLOW32-NEXT: pmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; SLOW32-NEXT: movdqa {{.*#+}} xmm0 = [18778,18778,18778,18778,18778,18778,18778,18778]
; SLOW32-NEXT: movdqa %xmm1, %xmm2
; SLOW32-NEXT: pmulhw %xmm0, %xmm2
;
; SLOW64-LABEL: test_mul_v8i32_v8i8:
; SLOW64: # %bb.0:
-; SLOW64-NEXT: movdqa %xmm0, %xmm1
-; SLOW64-NEXT: pand {{.*}}(%rip), %xmm1
+; SLOW64-NEXT: pmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; SLOW64-NEXT: movdqa {{.*#+}} xmm0 = [18778,18778,18778,18778,18778,18778,18778,18778]
; SLOW64-NEXT: movdqa %xmm1, %xmm2
; SLOW64-NEXT: pmulhw %xmm0, %xmm2
;
; SSE4-32-LABEL: test_mul_v8i32_v8i8:
; SSE4-32: # %bb.0:
-; SSE4-32-NEXT: pand {{\.LCPI.*}}, %xmm0
-; SSE4-32-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; SSE4-32-NEXT: pmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
-; SSE4-32-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; SSE4-32-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; SSE4-32-NEXT: pmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
+; SSE4-32-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; SSE4-32-NEXT: movdqa {{.*#+}} xmm2 = [18778,18778,18778,18778]
; SSE4-32-NEXT: pmaddwd %xmm2, %xmm0
; SSE4-32-NEXT: pmaddwd %xmm2, %xmm1
;
; SSE4-64-LABEL: test_mul_v8i32_v8i8:
; SSE4-64: # %bb.0:
-; SSE4-64-NEXT: pand {{.*}}(%rip), %xmm0
-; SSE4-64-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; SSE4-64-NEXT: pmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
-; SSE4-64-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; SSE4-64-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; SSE4-64-NEXT: pmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
+; SSE4-64-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; SSE4-64-NEXT: movdqa {{.*#+}} xmm2 = [18778,18778,18778,18778]
; SSE4-64-NEXT: pmaddwd %xmm2, %xmm0
; SSE4-64-NEXT: pmaddwd %xmm2, %xmm1
;
; AVX2-32-LABEL: test_mul_v8i32_v8i8:
; AVX2-32: # %bb.0:
-; AVX2-32-NEXT: vpand {{\.LCPI.*}}, %xmm0, %xmm0
-; AVX2-32-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX2-32-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
; AVX2-32-NEXT: vpmaddwd {{\.LCPI.*}}, %ymm0, %ymm0
; AVX2-32-NEXT: retl
;
; AVX2-64-LABEL: test_mul_v8i32_v8i8:
; AVX2-64: # %bb.0:
-; AVX2-64-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
-; AVX2-64-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX2-64-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
; AVX2-64-NEXT: vpmaddwd {{.*}}(%rip), %ymm0, %ymm0
; AVX2-64-NEXT: retq
;
; AVX512DQ-32-LABEL: test_mul_v8i32_v8i8:
; AVX512DQ-32: # %bb.0:
-; AVX512DQ-32-NEXT: vpand {{\.LCPI.*}}, %xmm0, %xmm0
-; AVX512DQ-32-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX512DQ-32-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
; AVX512DQ-32-NEXT: vpmaddwd {{\.LCPI.*}}, %ymm0, %ymm0
; AVX512DQ-32-NEXT: retl
;
; AVX512DQ-64-LABEL: test_mul_v8i32_v8i8:
; AVX512DQ-64: # %bb.0:
-; AVX512DQ-64-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
-; AVX512DQ-64-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX512DQ-64-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
; AVX512DQ-64-NEXT: vpmaddwd {{.*}}(%rip), %ymm0, %ymm0
; AVX512DQ-64-NEXT: retq
;
; AVX512BW-32-LABEL: test_mul_v8i32_v8i8:
; AVX512BW-32: # %bb.0:
-; AVX512BW-32-NEXT: vpand {{\.LCPI.*}}, %xmm0, %xmm0
-; AVX512BW-32-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX512BW-32-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
; AVX512BW-32-NEXT: vpmaddwd {{\.LCPI.*}}, %ymm0, %ymm0
; AVX512BW-32-NEXT: retl
;
; AVX512BW-64-LABEL: test_mul_v8i32_v8i8:
; AVX512BW-64: # %bb.0:
-; AVX512BW-64-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
-; AVX512BW-64-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX512BW-64-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
; AVX512BW-64-NEXT: vpmaddwd {{.*}}(%rip), %ymm0, %ymm0
; AVX512BW-64-NEXT: retq
;
; KNL-32-LABEL: test_mul_v8i32_v8i8:
; KNL-32: # %bb.0:
-; KNL-32-NEXT: vpand {{\.LCPI.*}}, %xmm0, %xmm0
-; KNL-32-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; KNL-32-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
; KNL-32-NEXT: vpbroadcastd {{.*#+}} ymm1 = [18778,18778,18778,18778,18778,18778,18778,18778]
; KNL-32-NEXT: vpmulld %ymm1, %ymm0, %ymm0
; KNL-32-NEXT: retl
;
; KNL-64-LABEL: test_mul_v8i32_v8i8:
; KNL-64: # %bb.0:
-; KNL-64-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
-; KNL-64-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; KNL-64-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
; KNL-64-NEXT: vpbroadcastd {{.*#+}} ymm1 = [18778,18778,18778,18778,18778,18778,18778,18778]
; KNL-64-NEXT: vpmulld %ymm1, %ymm0, %ymm0
; KNL-64-NEXT: retq
}
define <4 x i32> @test_mul_v4i32_v4i16(<4 x i16> %A) {
-; SLM32-LABEL: test_mul_v4i32_v4i16:
-; SLM32: # %bb.0:
-; SLM32-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
-; SLM32-NEXT: movdqa {{.*#+}} xmm1 = <18778,18778,18778,18778,u,u,u,u>
-; SLM32-NEXT: movdqa %xmm0, %xmm2
-; SLM32-NEXT: pmullw %xmm1, %xmm0
-; SLM32-NEXT: pmulhuw %xmm1, %xmm2
-; SLM32-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
-; SLM32-NEXT: retl
-;
-; SLM64-LABEL: test_mul_v4i32_v4i16:
-; SLM64: # %bb.0:
-; SLM64-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
-; SLM64-NEXT: movdqa {{.*#+}} xmm1 = <18778,18778,18778,18778,u,u,u,u>
-; SLM64-NEXT: movdqa %xmm0, %xmm2
-; SLM64-NEXT: pmullw %xmm1, %xmm0
-; SLM64-NEXT: pmulhuw %xmm1, %xmm2
-; SLM64-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
-; SLM64-NEXT: retq
-;
-; SLOW32-LABEL: test_mul_v4i32_v4i16:
-; SLOW32: # %bb.0:
-; SLOW32-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
-; SLOW32-NEXT: movdqa {{.*#+}} xmm1 = <18778,18778,18778,18778,u,u,u,u>
-; SLOW32-NEXT: movdqa %xmm0, %xmm2
-; SLOW32-NEXT: pmulhuw %xmm1, %xmm2
-; SLOW32-NEXT: pmullw %xmm1, %xmm0
-; SLOW32-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
-; SLOW32-NEXT: retl
+; CHECK32-LABEL: test_mul_v4i32_v4i16:
+; CHECK32: # %bb.0:
+; CHECK32-NEXT: movdqa {{.*#+}} xmm1 = <18778,18778,18778,18778,u,u,u,u>
+; CHECK32-NEXT: movdqa %xmm0, %xmm2
+; CHECK32-NEXT: pmulhuw %xmm1, %xmm2
+; CHECK32-NEXT: pmullw %xmm1, %xmm0
+; CHECK32-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; CHECK32-NEXT: retl
;
-; SLOW64-LABEL: test_mul_v4i32_v4i16:
-; SLOW64: # %bb.0:
-; SLOW64-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
-; SLOW64-NEXT: movdqa {{.*#+}} xmm1 = <18778,18778,18778,18778,u,u,u,u>
-; SLOW64-NEXT: movdqa %xmm0, %xmm2
-; SLOW64-NEXT: pmulhuw %xmm1, %xmm2
-; SLOW64-NEXT: pmullw %xmm1, %xmm0
-; SLOW64-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
-; SLOW64-NEXT: retq
+; CHECK64-LABEL: test_mul_v4i32_v4i16:
+; CHECK64: # %bb.0:
+; CHECK64-NEXT: movdqa {{.*#+}} xmm1 = <18778,18778,18778,18778,u,u,u,u>
+; CHECK64-NEXT: movdqa %xmm0, %xmm2
+; CHECK64-NEXT: pmulhuw %xmm1, %xmm2
+; CHECK64-NEXT: pmullw %xmm1, %xmm0
+; CHECK64-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; CHECK64-NEXT: retq
;
; SSE4-32-LABEL: test_mul_v4i32_v4i16:
; SSE4-32: # %bb.0:
-; SSE4-32-NEXT: pxor %xmm1, %xmm1
-; SSE4-32-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7]
+; SSE4-32-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; SSE4-32-NEXT: pmulld {{\.LCPI.*}}, %xmm0
; SSE4-32-NEXT: retl
;
; SSE4-64-LABEL: test_mul_v4i32_v4i16:
; SSE4-64: # %bb.0:
-; SSE4-64-NEXT: pxor %xmm1, %xmm1
-; SSE4-64-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7]
+; SSE4-64-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; SSE4-64-NEXT: pmulld {{.*}}(%rip), %xmm0
; SSE4-64-NEXT: retq
;
; AVX-32-LABEL: test_mul_v4i32_v4i16:
; AVX-32: # %bb.0:
-; AVX-32-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX-32-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7]
+; AVX-32-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; AVX-32-NEXT: vpbroadcastd {{.*#+}} xmm1 = [18778,18778,18778,18778]
; AVX-32-NEXT: vpmulld %xmm1, %xmm0, %xmm0
; AVX-32-NEXT: retl
;
; AVX-64-LABEL: test_mul_v4i32_v4i16:
; AVX-64: # %bb.0:
-; AVX-64-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX-64-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7]
+; AVX-64-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; AVX-64-NEXT: vpbroadcastd {{.*#+}} xmm1 = [18778,18778,18778,18778]
; AVX-64-NEXT: vpmulld %xmm1, %xmm0, %xmm0
; AVX-64-NEXT: retq
define <4 x i32> @test_mul_v4i32_v4i8_minsize(<4 x i8> %A) minsize {
; CHECK32-LABEL: test_mul_v4i32_v4i8_minsize:
; CHECK32: # %bb.0:
-; CHECK32-NEXT: pand {{\.LCPI.*}}, %xmm0
+; CHECK32-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; CHECK32-NEXT: pmaddwd {{\.LCPI.*}}, %xmm0
; CHECK32-NEXT: retl
;
; CHECK64-LABEL: test_mul_v4i32_v4i8_minsize:
; CHECK64: # %bb.0:
-; CHECK64-NEXT: pand {{.*}}(%rip), %xmm0
+; CHECK64-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; CHECK64-NEXT: pmaddwd {{.*}}(%rip), %xmm0
; CHECK64-NEXT: retq
;
; SSE4-32-LABEL: test_mul_v4i32_v4i8_minsize:
; SSE4-32: # %bb.0:
-; SSE4-32-NEXT: pand {{\.LCPI.*}}, %xmm0
+; SSE4-32-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; SSE4-32-NEXT: pmaddwd {{\.LCPI.*}}, %xmm0
; SSE4-32-NEXT: retl
;
; SSE4-64-LABEL: test_mul_v4i32_v4i8_minsize:
; SSE4-64: # %bb.0:
-; SSE4-64-NEXT: pand {{.*}}(%rip), %xmm0
+; SSE4-64-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; SSE4-64-NEXT: pmaddwd {{.*}}(%rip), %xmm0
; SSE4-64-NEXT: retq
;
; AVX2-32-LABEL: test_mul_v4i32_v4i8_minsize:
; AVX2-32: # %bb.0:
-; AVX2-32-NEXT: vpand {{\.LCPI.*}}, %xmm0, %xmm0
+; AVX2-32-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; AVX2-32-NEXT: vpmaddwd {{\.LCPI.*}}, %xmm0, %xmm0
; AVX2-32-NEXT: retl
;
; AVX2-64-LABEL: test_mul_v4i32_v4i8_minsize:
; AVX2-64: # %bb.0:
-; AVX2-64-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
+; AVX2-64-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; AVX2-64-NEXT: vpmaddwd {{.*}}(%rip), %xmm0, %xmm0
; AVX2-64-NEXT: retq
;
; AVX512DQ-32-LABEL: test_mul_v4i32_v4i8_minsize:
; AVX512DQ-32: # %bb.0:
-; AVX512DQ-32-NEXT: vpand {{\.LCPI.*}}, %xmm0, %xmm0
+; AVX512DQ-32-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; AVX512DQ-32-NEXT: vpmaddwd {{\.LCPI.*}}, %xmm0, %xmm0
; AVX512DQ-32-NEXT: retl
;
; AVX512DQ-64-LABEL: test_mul_v4i32_v4i8_minsize:
; AVX512DQ-64: # %bb.0:
-; AVX512DQ-64-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
+; AVX512DQ-64-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; AVX512DQ-64-NEXT: vpmaddwd {{.*}}(%rip), %xmm0, %xmm0
; AVX512DQ-64-NEXT: retq
;
; AVX512BW-32-LABEL: test_mul_v4i32_v4i8_minsize:
; AVX512BW-32: # %bb.0:
-; AVX512BW-32-NEXT: vpand {{\.LCPI.*}}, %xmm0, %xmm0
+; AVX512BW-32-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; AVX512BW-32-NEXT: vpmaddwd {{\.LCPI.*}}, %xmm0, %xmm0
; AVX512BW-32-NEXT: retl
;
; AVX512BW-64-LABEL: test_mul_v4i32_v4i8_minsize:
; AVX512BW-64: # %bb.0:
-; AVX512BW-64-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
+; AVX512BW-64-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; AVX512BW-64-NEXT: vpmaddwd {{.*}}(%rip), %xmm0, %xmm0
; AVX512BW-64-NEXT: retq
;
; KNL-32-LABEL: test_mul_v4i32_v4i8_minsize:
; KNL-32: # %bb.0:
-; KNL-32-NEXT: vpand {{\.LCPI.*}}, %xmm0, %xmm0
+; KNL-32-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; KNL-32-NEXT: vpbroadcastd {{.*#+}} xmm1 = [18778,18778,18778,18778]
; KNL-32-NEXT: vpmulld %xmm1, %xmm0, %xmm0
; KNL-32-NEXT: retl
;
; KNL-64-LABEL: test_mul_v4i32_v4i8_minsize:
; KNL-64: # %bb.0:
-; KNL-64-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
+; KNL-64-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; KNL-64-NEXT: vpbroadcastd {{.*#+}} xmm1 = [18778,18778,18778,18778]
; KNL-64-NEXT: vpmulld %xmm1, %xmm0, %xmm0
; KNL-64-NEXT: retq
define <8 x i32> @test_mul_v8i32_v8i8_minsize(<8 x i8> %A) minsize {
; SLM32-LABEL: test_mul_v8i32_v8i8_minsize:
; SLM32: # %bb.0:
-; SLM32-NEXT: pand {{\.LCPI.*}}, %xmm0
; SLM32-NEXT: movdqa {{.*#+}} xmm2 = [18778,18778,18778,18778]
-; SLM32-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; SLM32-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; SLM32-NEXT: pmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
+; SLM32-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; SLM32-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; SLM32-NEXT: pmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
; SLM32-NEXT: pmaddwd %xmm2, %xmm0
; SLM32-NEXT: pmaddwd %xmm2, %xmm1
; SLM32-NEXT: retl
;
; SLM64-LABEL: test_mul_v8i32_v8i8_minsize:
; SLM64: # %bb.0:
-; SLM64-NEXT: pand {{.*}}(%rip), %xmm0
; SLM64-NEXT: movdqa {{.*#+}} xmm2 = [18778,18778,18778,18778]
-; SLM64-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; SLM64-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; SLM64-NEXT: pmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
+; SLM64-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; SLM64-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; SLM64-NEXT: pmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
; SLM64-NEXT: pmaddwd %xmm2, %xmm0
; SLM64-NEXT: pmaddwd %xmm2, %xmm1
; SLM64-NEXT: retq
;
; SLOW32-LABEL: test_mul_v8i32_v8i8_minsize:
; SLOW32: # %bb.0:
-; SLOW32-NEXT: pand {{\.LCPI.*}}, %xmm0
-; SLOW32-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; SLOW32-NEXT: pmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
-; SLOW32-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; SLOW32-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; SLOW32-NEXT: pmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
+; SLOW32-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; SLOW32-NEXT: movdqa {{.*#+}} xmm2 = [18778,18778,18778,18778]
; SLOW32-NEXT: pmaddwd %xmm2, %xmm0
; SLOW32-NEXT: pmaddwd %xmm2, %xmm1
;
; SLOW64-LABEL: test_mul_v8i32_v8i8_minsize:
; SLOW64: # %bb.0:
-; SLOW64-NEXT: pand {{.*}}(%rip), %xmm0
-; SLOW64-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; SLOW64-NEXT: pmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
-; SLOW64-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; SLOW64-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; SLOW64-NEXT: pmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
+; SLOW64-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; SLOW64-NEXT: movdqa {{.*#+}} xmm2 = [18778,18778,18778,18778]
; SLOW64-NEXT: pmaddwd %xmm2, %xmm0
; SLOW64-NEXT: pmaddwd %xmm2, %xmm1
;
; SSE4-32-LABEL: test_mul_v8i32_v8i8_minsize:
; SSE4-32: # %bb.0:
-; SSE4-32-NEXT: pand {{\.LCPI.*}}, %xmm0
-; SSE4-32-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; SSE4-32-NEXT: pmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
-; SSE4-32-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; SSE4-32-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; SSE4-32-NEXT: pmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
+; SSE4-32-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; SSE4-32-NEXT: movdqa {{.*#+}} xmm2 = [18778,18778,18778,18778]
; SSE4-32-NEXT: pmaddwd %xmm2, %xmm0
; SSE4-32-NEXT: pmaddwd %xmm2, %xmm1
;
; SSE4-64-LABEL: test_mul_v8i32_v8i8_minsize:
; SSE4-64: # %bb.0:
-; SSE4-64-NEXT: pand {{.*}}(%rip), %xmm0
-; SSE4-64-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; SSE4-64-NEXT: pmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
-; SSE4-64-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; SSE4-64-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; SSE4-64-NEXT: pmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
+; SSE4-64-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; SSE4-64-NEXT: movdqa {{.*#+}} xmm2 = [18778,18778,18778,18778]
; SSE4-64-NEXT: pmaddwd %xmm2, %xmm0
; SSE4-64-NEXT: pmaddwd %xmm2, %xmm1
;
; AVX2-32-LABEL: test_mul_v8i32_v8i8_minsize:
; AVX2-32: # %bb.0:
-; AVX2-32-NEXT: vpand {{\.LCPI.*}}, %xmm0, %xmm0
-; AVX2-32-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX2-32-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
; AVX2-32-NEXT: vpmaddwd {{\.LCPI.*}}, %ymm0, %ymm0
; AVX2-32-NEXT: retl
;
; AVX2-64-LABEL: test_mul_v8i32_v8i8_minsize:
; AVX2-64: # %bb.0:
-; AVX2-64-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
-; AVX2-64-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX2-64-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
; AVX2-64-NEXT: vpmaddwd {{.*}}(%rip), %ymm0, %ymm0
; AVX2-64-NEXT: retq
;
; AVX512DQ-32-LABEL: test_mul_v8i32_v8i8_minsize:
; AVX512DQ-32: # %bb.0:
-; AVX512DQ-32-NEXT: vpand {{\.LCPI.*}}, %xmm0, %xmm0
-; AVX512DQ-32-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX512DQ-32-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
; AVX512DQ-32-NEXT: vpmaddwd {{\.LCPI.*}}, %ymm0, %ymm0
; AVX512DQ-32-NEXT: retl
;
; AVX512DQ-64-LABEL: test_mul_v8i32_v8i8_minsize:
; AVX512DQ-64: # %bb.0:
-; AVX512DQ-64-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
-; AVX512DQ-64-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX512DQ-64-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
; AVX512DQ-64-NEXT: vpmaddwd {{.*}}(%rip), %ymm0, %ymm0
; AVX512DQ-64-NEXT: retq
;
; AVX512BW-32-LABEL: test_mul_v8i32_v8i8_minsize:
; AVX512BW-32: # %bb.0:
-; AVX512BW-32-NEXT: vpand {{\.LCPI.*}}, %xmm0, %xmm0
-; AVX512BW-32-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX512BW-32-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
; AVX512BW-32-NEXT: vpmaddwd {{\.LCPI.*}}, %ymm0, %ymm0
; AVX512BW-32-NEXT: retl
;
; AVX512BW-64-LABEL: test_mul_v8i32_v8i8_minsize:
; AVX512BW-64: # %bb.0:
-; AVX512BW-64-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
-; AVX512BW-64-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX512BW-64-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
; AVX512BW-64-NEXT: vpmaddwd {{.*}}(%rip), %ymm0, %ymm0
; AVX512BW-64-NEXT: retq
;
; KNL-32-LABEL: test_mul_v8i32_v8i8_minsize:
; KNL-32: # %bb.0:
-; KNL-32-NEXT: vpand {{\.LCPI.*}}, %xmm0, %xmm0
-; KNL-32-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; KNL-32-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
; KNL-32-NEXT: vpbroadcastd {{.*#+}} ymm1 = [18778,18778,18778,18778,18778,18778,18778,18778]
; KNL-32-NEXT: vpmulld %ymm1, %ymm0, %ymm0
; KNL-32-NEXT: retl
;
; KNL-64-LABEL: test_mul_v8i32_v8i8_minsize:
; KNL-64: # %bb.0:
-; KNL-64-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
-; KNL-64-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; KNL-64-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
; KNL-64-NEXT: vpbroadcastd {{.*#+}} ymm1 = [18778,18778,18778,18778,18778,18778,18778,18778]
; KNL-64-NEXT: vpmulld %ymm1, %ymm0, %ymm0
; KNL-64-NEXT: retq
define <4 x i32> @test_mul_v4i32_v4i16_minsize(<4 x i16> %A) minsize {
; CHECK32-LABEL: test_mul_v4i32_v4i16_minsize:
; CHECK32: # %bb.0:
-; CHECK32-NEXT: pxor %xmm1, %xmm1
-; CHECK32-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7]
+; CHECK32-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; CHECK32-NEXT: pmulld {{\.LCPI.*}}, %xmm0
; CHECK32-NEXT: retl
;
; CHECK64-LABEL: test_mul_v4i32_v4i16_minsize:
; CHECK64: # %bb.0:
-; CHECK64-NEXT: pxor %xmm1, %xmm1
-; CHECK64-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7]
+; CHECK64-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; CHECK64-NEXT: pmulld {{.*}}(%rip), %xmm0
; CHECK64-NEXT: retq
;
; SSE4-32-LABEL: test_mul_v4i32_v4i16_minsize:
; SSE4-32: # %bb.0:
-; SSE4-32-NEXT: pxor %xmm1, %xmm1
-; SSE4-32-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7]
+; SSE4-32-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; SSE4-32-NEXT: pmulld {{\.LCPI.*}}, %xmm0
; SSE4-32-NEXT: retl
;
; SSE4-64-LABEL: test_mul_v4i32_v4i16_minsize:
; SSE4-64: # %bb.0:
-; SSE4-64-NEXT: pxor %xmm1, %xmm1
-; SSE4-64-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7]
+; SSE4-64-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; SSE4-64-NEXT: pmulld {{.*}}(%rip), %xmm0
; SSE4-64-NEXT: retq
;
; AVX-32-LABEL: test_mul_v4i32_v4i16_minsize:
; AVX-32: # %bb.0:
-; AVX-32-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX-32-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7]
+; AVX-32-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; AVX-32-NEXT: vpbroadcastd {{.*#+}} xmm1 = [18778,18778,18778,18778]
; AVX-32-NEXT: vpmulld %xmm1, %xmm0, %xmm0
; AVX-32-NEXT: retl
;
; AVX-64-LABEL: test_mul_v4i32_v4i16_minsize:
; AVX-64: # %bb.0:
-; AVX-64-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX-64-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7]
+; AVX-64-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; AVX-64-NEXT: vpbroadcastd {{.*#+}} xmm1 = [18778,18778,18778,18778]
; AVX-64-NEXT: vpmulld %xmm1, %xmm0, %xmm0
; AVX-64-NEXT: retq
define <8 x i8> @test_x86_sse2_paddus_b_64(<8 x i8> %a0, <8 x i8> %a1) {
; SSE-LABEL: test_x86_sse2_paddus_b_64:
; SSE: ## %bb.0:
-; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
-; SSE-NEXT: ## encoding: [0x66,0x0f,0x6f,0x15,A,A,A,A]
-; SSE-NEXT: ## fixup A - offset: 4, value: LCPI4_0, kind: FK_Data_4
-; SSE-NEXT: pand %xmm2, %xmm1 ## encoding: [0x66,0x0f,0xdb,0xca]
-; SSE-NEXT: packuswb %xmm1, %xmm1 ## encoding: [0x66,0x0f,0x67,0xc9]
-; SSE-NEXT: pand %xmm2, %xmm0 ## encoding: [0x66,0x0f,0xdb,0xc2]
-; SSE-NEXT: packuswb %xmm0, %xmm0 ## encoding: [0x66,0x0f,0x67,0xc0]
; SSE-NEXT: paddusb %xmm1, %xmm0 ## encoding: [0x66,0x0f,0xdc,0xc1]
-; SSE-NEXT: punpcklbw %xmm0, %xmm0 ## encoding: [0x66,0x0f,0x60,0xc0]
-; SSE-NEXT: ## xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE-NEXT: retl ## encoding: [0xc3]
;
; AVX2-LABEL: test_x86_sse2_paddus_b_64:
; AVX2: ## %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
-; AVX2-NEXT: ## encoding: [0xc5,0xf9,0x6f,0x15,A,A,A,A]
-; AVX2-NEXT: ## fixup A - offset: 4, value: LCPI4_0, kind: FK_Data_4
-; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1 ## encoding: [0xc4,0xe2,0x71,0x00,0xca]
-; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0 ## encoding: [0xc4,0xe2,0x79,0x00,0xc2]
; AVX2-NEXT: vpaddusb %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xdc,0xc1]
-; AVX2-NEXT: vpmovzxbw %xmm0, %xmm0 ## encoding: [0xc4,0xe2,0x79,0x30,0xc0]
-; AVX2-NEXT: ## xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX2-NEXT: retl ## encoding: [0xc3]
;
; SKX-LABEL: test_x86_sse2_paddus_b_64:
; SKX: ## %bb.0:
-; SKX-NEXT: vmovdqa LCPI4_0, %xmm2 ## EVEX TO VEX Compression xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
-; SKX-NEXT: ## encoding: [0xc5,0xf9,0x6f,0x15,A,A,A,A]
-; SKX-NEXT: ## fixup A - offset: 4, value: LCPI4_0, kind: FK_Data_4
-; SKX-NEXT: vpshufb %xmm2, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x71,0x00,0xca]
-; SKX-NEXT: vpshufb %xmm2, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x00,0xc2]
; SKX-NEXT: vpaddusb %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xdc,0xc1]
-; SKX-NEXT: vpmovzxbw %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x30,0xc0]
-; SKX-NEXT: ## xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; SKX-NEXT: retl ## encoding: [0xc3]
%1 = add <8 x i8> %a0, %a1
%2 = icmp ugt <8 x i8> %a0, %1
define <4 x i16> @test_x86_sse2_paddus_w_64(<4 x i16> %a0, <4 x i16> %a1) {
; SSE-LABEL: test_x86_sse2_paddus_w_64:
; SSE: ## %bb.0:
-; SSE-NEXT: pshuflw $232, %xmm1, %xmm1 ## encoding: [0xf2,0x0f,0x70,0xc9,0xe8]
-; SSE-NEXT: ## xmm1 = xmm1[0,2,2,3,4,5,6,7]
-; SSE-NEXT: pshufhw $232, %xmm1, %xmm1 ## encoding: [0xf3,0x0f,0x70,0xc9,0xe8]
-; SSE-NEXT: ## xmm1 = xmm1[0,1,2,3,4,6,6,7]
-; SSE-NEXT: pshufd $232, %xmm1, %xmm1 ## encoding: [0x66,0x0f,0x70,0xc9,0xe8]
-; SSE-NEXT: ## xmm1 = xmm1[0,2,2,3]
-; SSE-NEXT: pshuflw $232, %xmm0, %xmm0 ## encoding: [0xf2,0x0f,0x70,0xc0,0xe8]
-; SSE-NEXT: ## xmm0 = xmm0[0,2,2,3,4,5,6,7]
-; SSE-NEXT: pshufhw $232, %xmm0, %xmm0 ## encoding: [0xf3,0x0f,0x70,0xc0,0xe8]
-; SSE-NEXT: ## xmm0 = xmm0[0,1,2,3,4,6,6,7]
-; SSE-NEXT: pshufd $232, %xmm0, %xmm0 ## encoding: [0x66,0x0f,0x70,0xc0,0xe8]
-; SSE-NEXT: ## xmm0 = xmm0[0,2,2,3]
; SSE-NEXT: paddusw %xmm1, %xmm0 ## encoding: [0x66,0x0f,0xdd,0xc1]
-; SSE-NEXT: punpcklwd %xmm0, %xmm0 ## encoding: [0x66,0x0f,0x61,0xc0]
-; SSE-NEXT: ## xmm0 = xmm0[0,0,1,1,2,2,3,3]
; SSE-NEXT: retl ## encoding: [0xc3]
;
; AVX2-LABEL: test_x86_sse2_paddus_w_64:
; AVX2: ## %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
-; AVX2-NEXT: ## encoding: [0xc5,0xf9,0x6f,0x15,A,A,A,A]
-; AVX2-NEXT: ## fixup A - offset: 4, value: LCPI5_0, kind: FK_Data_4
-; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1 ## encoding: [0xc4,0xe2,0x71,0x00,0xca]
-; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0 ## encoding: [0xc4,0xe2,0x79,0x00,0xc2]
; AVX2-NEXT: vpaddusw %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xdd,0xc1]
-; AVX2-NEXT: vpmovzxwd %xmm0, %xmm0 ## encoding: [0xc4,0xe2,0x79,0x33,0xc0]
-; AVX2-NEXT: ## xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; AVX2-NEXT: retl ## encoding: [0xc3]
;
; SKX-LABEL: test_x86_sse2_paddus_w_64:
; SKX: ## %bb.0:
-; SKX-NEXT: vmovdqa LCPI5_0, %xmm2 ## EVEX TO VEX Compression xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
-; SKX-NEXT: ## encoding: [0xc5,0xf9,0x6f,0x15,A,A,A,A]
-; SKX-NEXT: ## fixup A - offset: 4, value: LCPI5_0, kind: FK_Data_4
-; SKX-NEXT: vpshufb %xmm2, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x71,0x00,0xca]
-; SKX-NEXT: vpshufb %xmm2, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x00,0xc2]
; SKX-NEXT: vpaddusw %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xdd,0xc1]
-; SKX-NEXT: vpmovzxwd %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x33,0xc0]
-; SKX-NEXT: ## xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; SKX-NEXT: retl ## encoding: [0xc3]
%1 = add <4 x i16> %a0, %a1
%2 = icmp ugt <4 x i16> %a0, %1
define <8 x i8> @test_x86_sse2_psubus_b_64(<8 x i8> %a0, <8 x i8> %a1) {
; SSE-LABEL: test_x86_sse2_psubus_b_64:
; SSE: ## %bb.0:
-; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
-; SSE-NEXT: ## encoding: [0x66,0x0f,0x6f,0x15,A,A,A,A]
-; SSE-NEXT: ## fixup A - offset: 4, value: LCPI6_0, kind: FK_Data_4
-; SSE-NEXT: movdqa %xmm1, %xmm3 ## encoding: [0x66,0x0f,0x6f,0xd9]
-; SSE-NEXT: pand %xmm2, %xmm3 ## encoding: [0x66,0x0f,0xdb,0xda]
-; SSE-NEXT: pand %xmm2, %xmm0 ## encoding: [0x66,0x0f,0xdb,0xc2]
-; SSE-NEXT: pmaxsw %xmm3, %xmm0 ## encoding: [0x66,0x0f,0xee,0xc3]
-; SSE-NEXT: psubw %xmm1, %xmm0 ## encoding: [0x66,0x0f,0xf9,0xc1]
+; SSE-NEXT: psubusb %xmm1, %xmm0 ## encoding: [0x66,0x0f,0xd8,0xc1]
; SSE-NEXT: retl ## encoding: [0xc3]
;
; AVX2-LABEL: test_x86_sse2_psubus_b_64:
; AVX2: ## %bb.0:
-; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
-; AVX2-NEXT: ## encoding: [0xc4,0xe2,0x79,0x79,0x15,A,A,A,A]
-; AVX2-NEXT: ## fixup A - offset: 5, value: LCPI6_0, kind: FK_Data_4
-; AVX2-NEXT: vpand %xmm2, %xmm1, %xmm3 ## encoding: [0xc5,0xf1,0xdb,0xda]
-; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xdb,0xc2]
-; AVX2-NEXT: vpmaxuw %xmm3, %xmm0, %xmm0 ## encoding: [0xc4,0xe2,0x79,0x3e,0xc3]
-; AVX2-NEXT: vpsubw %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xf9,0xc1]
+; AVX2-NEXT: vpsubusb %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xd8,0xc1]
; AVX2-NEXT: retl ## encoding: [0xc3]
;
; SKX-LABEL: test_x86_sse2_psubus_b_64:
; SKX: ## %bb.0:
-; SKX-NEXT: vpbroadcastw LCPI6_0, %xmm2 ## EVEX TO VEX Compression xmm2 = [255,255,255,255,255,255,255,255]
-; SKX-NEXT: ## encoding: [0xc4,0xe2,0x79,0x79,0x15,A,A,A,A]
-; SKX-NEXT: ## fixup A - offset: 5, value: LCPI6_0, kind: FK_Data_4
-; SKX-NEXT: vpand %xmm2, %xmm1, %xmm3 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xdb,0xda]
-; SKX-NEXT: vpand %xmm2, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xdb,0xc2]
-; SKX-NEXT: vpmaxuw %xmm3, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x3e,0xc3]
-; SKX-NEXT: vpsubw %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xf9,0xc1]
+; SKX-NEXT: vpsubusb %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xd8,0xc1]
; SKX-NEXT: retl ## encoding: [0xc3]
%cmp = icmp ugt <8 x i8> %a0, %a1
%sel = select <8 x i1> %cmp, <8 x i8> %a0, <8 x i8> %a1
define <4 x i16> @test_x86_sse2_psubus_w_64(<4 x i16> %a0, <4 x i16> %a1) {
; SSE-LABEL: test_x86_sse2_psubus_w_64:
; SSE: ## %bb.0:
-; SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,0,65535,0,65535,0,65535,0]
-; SSE-NEXT: ## encoding: [0x66,0x0f,0x6f,0x15,A,A,A,A]
-; SSE-NEXT: ## fixup A - offset: 4, value: LCPI7_0, kind: FK_Data_4
-; SSE-NEXT: movdqa %xmm1, %xmm3 ## encoding: [0x66,0x0f,0x6f,0xd9]
-; SSE-NEXT: pand %xmm2, %xmm3 ## encoding: [0x66,0x0f,0xdb,0xda]
-; SSE-NEXT: pand %xmm2, %xmm0 ## encoding: [0x66,0x0f,0xdb,0xc2]
-; SSE-NEXT: movdqa %xmm0, %xmm2 ## encoding: [0x66,0x0f,0x6f,0xd0]
-; SSE-NEXT: pcmpgtd %xmm3, %xmm2 ## encoding: [0x66,0x0f,0x66,0xd3]
-; SSE-NEXT: pand %xmm2, %xmm0 ## encoding: [0x66,0x0f,0xdb,0xc2]
-; SSE-NEXT: pandn %xmm3, %xmm2 ## encoding: [0x66,0x0f,0xdf,0xd3]
-; SSE-NEXT: por %xmm0, %xmm2 ## encoding: [0x66,0x0f,0xeb,0xd0]
-; SSE-NEXT: psubd %xmm1, %xmm2 ## encoding: [0x66,0x0f,0xfa,0xd1]
-; SSE-NEXT: movdqa %xmm2, %xmm0 ## encoding: [0x66,0x0f,0x6f,0xc2]
+; SSE-NEXT: psubusw %xmm1, %xmm0 ## encoding: [0x66,0x0f,0xd9,0xc1]
; SSE-NEXT: retl ## encoding: [0xc3]
;
; AVX2-LABEL: test_x86_sse2_psubus_w_64:
; AVX2: ## %bb.0:
-; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 ## encoding: [0xc5,0xe9,0xef,0xd2]
-; AVX2-NEXT: vpblendw $170, %xmm2, %xmm1, %xmm3 ## encoding: [0xc4,0xe3,0x71,0x0e,0xda,0xaa]
-; AVX2-NEXT: ## xmm3 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4],xmm2[5],xmm1[6],xmm2[7]
-; AVX2-NEXT: vpblendw $170, %xmm2, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0e,0xc2,0xaa]
-; AVX2-NEXT: ## xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4],xmm2[5],xmm0[6],xmm2[7]
-; AVX2-NEXT: vpmaxud %xmm3, %xmm0, %xmm0 ## encoding: [0xc4,0xe2,0x79,0x3f,0xc3]
-; AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xfa,0xc1]
+; AVX2-NEXT: vpsubusw %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xd9,0xc1]
; AVX2-NEXT: retl ## encoding: [0xc3]
;
; SKX-LABEL: test_x86_sse2_psubus_w_64:
; SKX: ## %bb.0:
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xef,0xd2]
-; SKX-NEXT: vpblendw $170, %xmm2, %xmm1, %xmm3 ## encoding: [0xc4,0xe3,0x71,0x0e,0xda,0xaa]
-; SKX-NEXT: ## xmm3 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4],xmm2[5],xmm1[6],xmm2[7]
-; SKX-NEXT: vpblendw $170, %xmm2, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0e,0xc2,0xaa]
-; SKX-NEXT: ## xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4],xmm2[5],xmm0[6],xmm2[7]
-; SKX-NEXT: vpmaxud %xmm3, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x3f,0xc3]
-; SKX-NEXT: vpsubd %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfa,0xc1]
+; SKX-NEXT: vpsubusw %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xd9,0xc1]
; SKX-NEXT: retl ## encoding: [0xc3]
%cmp = icmp ugt <4 x i16> %a0, %a1
%sel = select <4 x i1> %cmp, <4 x i16> %a0, <4 x i16> %a1
define <4 x i32> @shl_zext_srl_v4i32(<4 x i16> %x) nounwind {
; CHECK-LABEL: shl_zext_srl_v4i32:
; CHECK: # %bb.0:
-; CHECK-NEXT: andps {{.*}}(%rip), %xmm0
-; CHECK-NEXT: andps {{.*}}(%rip), %xmm0
+; CHECK-NEXT: pand {{.*}}(%rip), %xmm0
+; CHECK-NEXT: pxor %xmm1, %xmm1
+; CHECK-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; CHECK-NEXT: retq
%srl = lshr <4 x i16> %x, <i16 2, i16 2, i16 2, i16 2>
%zext = zext <4 x i16> %srl to <4 x i32>
; CHECK-LABEL: sra_trunc_srl_v4i32:
; CHECK: # %bb.0:
; CHECK-NEXT: psrad $19, %xmm0
+; CHECK-NEXT: packssdw %xmm0, %xmm0
; CHECK-NEXT: retq
%srl = lshr <4 x i32> %x, <i32 16, i32 16, i32 16, i32 16>
%trunc = trunc <4 x i32> %srl to <4 x i16>
define <4 x i32> @shl_zext_shl_v4i32(<4 x i16> %x) nounwind {
; CHECK-LABEL: shl_zext_shl_v4i32:
; CHECK: # %bb.0:
+; CHECK-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
; CHECK-NEXT: pslld $19, %xmm0
; CHECK-NEXT: retq
%shl0 = shl <4 x i16> %x, <i16 2, i16 2, i16 2, i16 2>
define <2 x i32> @v2i32(<2 x i32> %x, <2 x i32> %y) nounwind {
; SSE2-LABEL: v2i32:
; SSE2: # %bb.0:
-; SSE2-NEXT: psllq $32, %xmm1
-; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648]
-; SSE2-NEXT: psllq $32, %xmm0
-; SSE2-NEXT: movdqa %xmm0, %xmm3
-; SSE2-NEXT: psubq %xmm1, %xmm0
-; SSE2-NEXT: por %xmm2, %xmm1
-; SSE2-NEXT: movdqa %xmm2, %xmm4
-; SSE2-NEXT: pcmpgtd %xmm1, %xmm4
-; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
-; SSE2-NEXT: pcmpeqd %xmm2, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSE2-NEXT: pand %xmm5, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
-; SSE2-NEXT: por %xmm1, %xmm4
-; SSE2-NEXT: pcmpeqd %xmm1, %xmm1
-; SSE2-NEXT: pxor %xmm1, %xmm4
-; SSE2-NEXT: por %xmm2, %xmm3
-; SSE2-NEXT: movdqa %xmm2, %xmm5
-; SSE2-NEXT: pcmpgtd %xmm3, %xmm5
-; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2]
-; SSE2-NEXT: pcmpeqd %xmm2, %xmm3
-; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
-; SSE2-NEXT: pand %xmm6, %xmm3
-; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
-; SSE2-NEXT: por %xmm3, %xmm5
-; SSE2-NEXT: pxor %xmm1, %xmm5
-; SSE2-NEXT: pcmpeqd %xmm5, %xmm4
-; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,0,3,2]
-; SSE2-NEXT: pand %xmm4, %xmm3
-; SSE2-NEXT: movdqa %xmm0, %xmm4
-; SSE2-NEXT: por %xmm2, %xmm4
-; SSE2-NEXT: movdqa %xmm2, %xmm6
-; SSE2-NEXT: pcmpgtd %xmm4, %xmm6
-; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2]
-; SSE2-NEXT: pcmpeqd %xmm2, %xmm4
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,1,3,3]
-; SSE2-NEXT: pand %xmm7, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm6[1,1,3,3]
-; SSE2-NEXT: por %xmm2, %xmm4
-; SSE2-NEXT: movdqa %xmm4, %xmm2
-; SSE2-NEXT: pxor %xmm1, %xmm2
-; SSE2-NEXT: pcmpeqd %xmm5, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm2[1,0,3,2]
-; SSE2-NEXT: pand %xmm2, %xmm5
-; SSE2-NEXT: pxor %xmm1, %xmm5
-; SSE2-NEXT: pandn %xmm5, %xmm3
-; SSE2-NEXT: movdqa %xmm4, %xmm1
+; SSE2-NEXT: movdqa %xmm0, %xmm2
+; SSE2-NEXT: pxor %xmm3, %xmm3
+; SSE2-NEXT: pxor %xmm0, %xmm0
+; SSE2-NEXT: pcmpgtd %xmm1, %xmm0
+; SSE2-NEXT: pcmpeqd %xmm4, %xmm4
+; SSE2-NEXT: pxor %xmm4, %xmm0
+; SSE2-NEXT: pxor %xmm5, %xmm5
+; SSE2-NEXT: pcmpgtd %xmm2, %xmm5
+; SSE2-NEXT: pxor %xmm4, %xmm5
+; SSE2-NEXT: pcmpeqd %xmm5, %xmm0
+; SSE2-NEXT: psubd %xmm1, %xmm2
+; SSE2-NEXT: pcmpgtd %xmm2, %xmm3
+; SSE2-NEXT: movdqa %xmm3, %xmm1
+; SSE2-NEXT: pxor %xmm4, %xmm1
+; SSE2-NEXT: pcmpeqd %xmm5, %xmm1
+; SSE2-NEXT: pxor %xmm4, %xmm1
+; SSE2-NEXT: pandn %xmm1, %xmm0
+; SSE2-NEXT: movdqa %xmm3, %xmm1
; SSE2-NEXT: pandn {{.*}}(%rip), %xmm1
-; SSE2-NEXT: pand {{.*}}(%rip), %xmm4
-; SSE2-NEXT: por %xmm1, %xmm4
-; SSE2-NEXT: pand %xmm3, %xmm4
-; SSE2-NEXT: pandn %xmm0, %xmm3
-; SSE2-NEXT: por %xmm4, %xmm3
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,3,2,3]
-; SSE2-NEXT: psrad $31, %xmm3
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,3,2,3]
-; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE2-NEXT: psrld $1, %xmm3
+; SSE2-NEXT: por %xmm1, %xmm3
+; SSE2-NEXT: pand %xmm0, %xmm3
+; SSE2-NEXT: pandn %xmm2, %xmm0
+; SSE2-NEXT: por %xmm3, %xmm0
; SSE2-NEXT: retq
;
; SSSE3-LABEL: v2i32:
; SSSE3: # %bb.0:
-; SSSE3-NEXT: psllq $32, %xmm1
-; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648]
-; SSSE3-NEXT: psllq $32, %xmm0
-; SSSE3-NEXT: movdqa %xmm0, %xmm3
-; SSSE3-NEXT: psubq %xmm1, %xmm0
-; SSSE3-NEXT: por %xmm2, %xmm1
-; SSSE3-NEXT: movdqa %xmm2, %xmm4
-; SSSE3-NEXT: pcmpgtd %xmm1, %xmm4
-; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
-; SSSE3-NEXT: pcmpeqd %xmm2, %xmm1
-; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSSE3-NEXT: pand %xmm5, %xmm1
-; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
-; SSSE3-NEXT: por %xmm1, %xmm4
-; SSSE3-NEXT: pcmpeqd %xmm1, %xmm1
-; SSSE3-NEXT: pxor %xmm1, %xmm4
-; SSSE3-NEXT: por %xmm2, %xmm3
-; SSSE3-NEXT: movdqa %xmm2, %xmm5
-; SSSE3-NEXT: pcmpgtd %xmm3, %xmm5
-; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2]
-; SSSE3-NEXT: pcmpeqd %xmm2, %xmm3
-; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
-; SSSE3-NEXT: pand %xmm6, %xmm3
-; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
-; SSSE3-NEXT: por %xmm3, %xmm5
-; SSSE3-NEXT: pxor %xmm1, %xmm5
-; SSSE3-NEXT: pcmpeqd %xmm5, %xmm4
-; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,0,3,2]
-; SSSE3-NEXT: pand %xmm4, %xmm3
-; SSSE3-NEXT: movdqa %xmm0, %xmm4
-; SSSE3-NEXT: por %xmm2, %xmm4
-; SSSE3-NEXT: movdqa %xmm2, %xmm6
-; SSSE3-NEXT: pcmpgtd %xmm4, %xmm6
-; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2]
-; SSSE3-NEXT: pcmpeqd %xmm2, %xmm4
-; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,1,3,3]
-; SSSE3-NEXT: pand %xmm7, %xmm2
-; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm6[1,1,3,3]
-; SSSE3-NEXT: por %xmm2, %xmm4
-; SSSE3-NEXT: movdqa %xmm4, %xmm2
-; SSSE3-NEXT: pxor %xmm1, %xmm2
-; SSSE3-NEXT: pcmpeqd %xmm5, %xmm2
-; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm2[1,0,3,2]
-; SSSE3-NEXT: pand %xmm2, %xmm5
-; SSSE3-NEXT: pxor %xmm1, %xmm5
-; SSSE3-NEXT: pandn %xmm5, %xmm3
-; SSSE3-NEXT: movdqa %xmm4, %xmm1
+; SSSE3-NEXT: movdqa %xmm0, %xmm2
+; SSSE3-NEXT: pxor %xmm3, %xmm3
+; SSSE3-NEXT: pxor %xmm0, %xmm0
+; SSSE3-NEXT: pcmpgtd %xmm1, %xmm0
+; SSSE3-NEXT: pcmpeqd %xmm4, %xmm4
+; SSSE3-NEXT: pxor %xmm4, %xmm0
+; SSSE3-NEXT: pxor %xmm5, %xmm5
+; SSSE3-NEXT: pcmpgtd %xmm2, %xmm5
+; SSSE3-NEXT: pxor %xmm4, %xmm5
+; SSSE3-NEXT: pcmpeqd %xmm5, %xmm0
+; SSSE3-NEXT: psubd %xmm1, %xmm2
+; SSSE3-NEXT: pcmpgtd %xmm2, %xmm3
+; SSSE3-NEXT: movdqa %xmm3, %xmm1
+; SSSE3-NEXT: pxor %xmm4, %xmm1
+; SSSE3-NEXT: pcmpeqd %xmm5, %xmm1
+; SSSE3-NEXT: pxor %xmm4, %xmm1
+; SSSE3-NEXT: pandn %xmm1, %xmm0
+; SSSE3-NEXT: movdqa %xmm3, %xmm1
; SSSE3-NEXT: pandn {{.*}}(%rip), %xmm1
-; SSSE3-NEXT: pand {{.*}}(%rip), %xmm4
-; SSSE3-NEXT: por %xmm1, %xmm4
-; SSSE3-NEXT: pand %xmm3, %xmm4
-; SSSE3-NEXT: pandn %xmm0, %xmm3
-; SSSE3-NEXT: por %xmm4, %xmm3
-; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,3,2,3]
-; SSSE3-NEXT: psrad $31, %xmm3
-; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,3,2,3]
-; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSSE3-NEXT: psrld $1, %xmm3
+; SSSE3-NEXT: por %xmm1, %xmm3
+; SSSE3-NEXT: pand %xmm0, %xmm3
+; SSSE3-NEXT: pandn %xmm2, %xmm0
+; SSSE3-NEXT: por %xmm3, %xmm0
; SSSE3-NEXT: retq
;
; SSE41-LABEL: v2i32:
; SSE41: # %bb.0:
; SSE41-NEXT: movdqa %xmm0, %xmm2
-; SSE41-NEXT: psllq $32, %xmm1
-; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [2147483648,2147483648]
-; SSE41-NEXT: psllq $32, %xmm2
-; SSE41-NEXT: movdqa %xmm2, %xmm3
-; SSE41-NEXT: psubq %xmm1, %xmm2
-; SSE41-NEXT: por %xmm0, %xmm1
-; SSE41-NEXT: movdqa %xmm0, %xmm4
-; SSE41-NEXT: pcmpgtd %xmm1, %xmm4
-; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
-; SSE41-NEXT: pcmpeqd %xmm0, %xmm1
-; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm1[1,1,3,3]
-; SSE41-NEXT: pand %xmm5, %xmm6
-; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm4[1,1,3,3]
-; SSE41-NEXT: por %xmm6, %xmm1
+; SSE41-NEXT: pxor %xmm0, %xmm0
+; SSE41-NEXT: pxor %xmm3, %xmm3
+; SSE41-NEXT: pcmpgtd %xmm1, %xmm3
; SSE41-NEXT: pcmpeqd %xmm4, %xmm4
-; SSE41-NEXT: pxor %xmm4, %xmm1
-; SSE41-NEXT: por %xmm0, %xmm3
-; SSE41-NEXT: movdqa %xmm0, %xmm5
-; SSE41-NEXT: pcmpgtd %xmm3, %xmm5
-; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2]
-; SSE41-NEXT: pcmpeqd %xmm0, %xmm3
-; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
-; SSE41-NEXT: pand %xmm6, %xmm3
-; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
-; SSE41-NEXT: por %xmm3, %xmm5
-; SSE41-NEXT: pxor %xmm4, %xmm5
-; SSE41-NEXT: pcmpeqq %xmm5, %xmm1
-; SSE41-NEXT: movdqa %xmm2, %xmm3
-; SSE41-NEXT: por %xmm0, %xmm3
-; SSE41-NEXT: movdqa %xmm0, %xmm6
-; SSE41-NEXT: pcmpgtd %xmm3, %xmm6
-; SSE41-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2]
-; SSE41-NEXT: pcmpeqd %xmm0, %xmm3
-; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
-; SSE41-NEXT: pand %xmm7, %xmm3
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[1,1,3,3]
-; SSE41-NEXT: por %xmm3, %xmm0
-; SSE41-NEXT: movdqa %xmm0, %xmm3
; SSE41-NEXT: pxor %xmm4, %xmm3
-; SSE41-NEXT: pcmpeqq %xmm5, %xmm3
-; SSE41-NEXT: pxor %xmm4, %xmm3
-; SSE41-NEXT: pandn %xmm3, %xmm1
-; SSE41-NEXT: movapd {{.*#+}} xmm3 = [9223372036854775808,9223372036854775808]
-; SSE41-NEXT: blendvpd %xmm0, {{.*}}(%rip), %xmm3
-; SSE41-NEXT: movdqa %xmm1, %xmm0
-; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm2
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3]
-; SSE41-NEXT: psrad $31, %xmm2
-; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
-; SSE41-NEXT: movdqa %xmm2, %xmm0
+; SSE41-NEXT: pxor %xmm5, %xmm5
+; SSE41-NEXT: pcmpgtd %xmm2, %xmm5
+; SSE41-NEXT: pxor %xmm4, %xmm5
+; SSE41-NEXT: pcmpeqd %xmm5, %xmm3
+; SSE41-NEXT: psubd %xmm1, %xmm2
+; SSE41-NEXT: pcmpgtd %xmm2, %xmm0
+; SSE41-NEXT: movdqa %xmm0, %xmm1
+; SSE41-NEXT: pxor %xmm4, %xmm1
+; SSE41-NEXT: pcmpeqd %xmm5, %xmm1
+; SSE41-NEXT: pxor %xmm4, %xmm1
+; SSE41-NEXT: pandn %xmm1, %xmm3
+; SSE41-NEXT: movaps {{.*#+}} xmm1 = [2147483648,2147483648,2147483648,2147483648]
+; SSE41-NEXT: blendvps %xmm0, {{.*}}(%rip), %xmm1
+; SSE41-NEXT: movdqa %xmm3, %xmm0
+; SSE41-NEXT: blendvps %xmm0, %xmm1, %xmm2
+; SSE41-NEXT: movaps %xmm2, %xmm0
; SSE41-NEXT: retq
;
; AVX1-LABEL: v2i32:
; AVX1: # %bb.0:
-; AVX1-NEXT: vpsllq $32, %xmm1, %xmm1
; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX1-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm3
+; AVX1-NEXT: vpcmpgtd %xmm1, %xmm2, %xmm3
; AVX1-NEXT: vpcmpeqd %xmm4, %xmm4, %xmm4
; AVX1-NEXT: vpxor %xmm4, %xmm3, %xmm3
-; AVX1-NEXT: vpsllq $32, %xmm0, %xmm0
-; AVX1-NEXT: vpcmpgtq %xmm0, %xmm2, %xmm5
+; AVX1-NEXT: vpcmpgtd %xmm0, %xmm2, %xmm5
; AVX1-NEXT: vpxor %xmm4, %xmm5, %xmm5
-; AVX1-NEXT: vpcmpeqq %xmm3, %xmm5, %xmm3
-; AVX1-NEXT: vpsubq %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpcmpgtq %xmm0, %xmm2, %xmm1
+; AVX1-NEXT: vpcmpeqd %xmm3, %xmm5, %xmm3
+; AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpcmpgtd %xmm0, %xmm2, %xmm1
; AVX1-NEXT: vpxor %xmm4, %xmm1, %xmm2
-; AVX1-NEXT: vpcmpeqq %xmm2, %xmm5, %xmm2
+; AVX1-NEXT: vpcmpeqd %xmm2, %xmm5, %xmm2
; AVX1-NEXT: vpxor %xmm4, %xmm2, %xmm2
; AVX1-NEXT: vpandn %xmm2, %xmm3, %xmm2
-; AVX1-NEXT: vmovapd {{.*#+}} xmm3 = [9223372036854775808,9223372036854775808]
-; AVX1-NEXT: vblendvpd %xmm1, {{.*}}(%rip), %xmm3, %xmm1
-; AVX1-NEXT: vblendvpd %xmm2, %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpsrad $31, %xmm0, %xmm1
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
+; AVX1-NEXT: vmovaps {{.*#+}} xmm3 = [2147483648,2147483648,2147483648,2147483648]
+; AVX1-NEXT: vblendvps %xmm1, {{.*}}(%rip), %xmm3, %xmm1
+; AVX1-NEXT: vblendvps %xmm2, %xmm1, %xmm0, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: v2i32:
; AVX2: # %bb.0:
-; AVX2-NEXT: vpsllq $32, %xmm1, %xmm1
; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX2-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm3
+; AVX2-NEXT: vpcmpgtd %xmm1, %xmm2, %xmm3
; AVX2-NEXT: vpcmpeqd %xmm4, %xmm4, %xmm4
; AVX2-NEXT: vpxor %xmm4, %xmm3, %xmm3
-; AVX2-NEXT: vpsllq $32, %xmm0, %xmm0
-; AVX2-NEXT: vpcmpgtq %xmm0, %xmm2, %xmm5
+; AVX2-NEXT: vpcmpgtd %xmm0, %xmm2, %xmm5
; AVX2-NEXT: vpxor %xmm4, %xmm5, %xmm5
-; AVX2-NEXT: vpcmpeqq %xmm3, %xmm5, %xmm3
-; AVX2-NEXT: vpsubq %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpcmpgtq %xmm0, %xmm2, %xmm1
+; AVX2-NEXT: vpcmpeqd %xmm3, %xmm5, %xmm3
+; AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpcmpgtd %xmm0, %xmm2, %xmm1
; AVX2-NEXT: vpxor %xmm4, %xmm1, %xmm2
-; AVX2-NEXT: vpcmpeqq %xmm2, %xmm5, %xmm2
+; AVX2-NEXT: vpcmpeqd %xmm2, %xmm5, %xmm2
; AVX2-NEXT: vpxor %xmm4, %xmm2, %xmm2
; AVX2-NEXT: vpandn %xmm2, %xmm3, %xmm2
-; AVX2-NEXT: vmovapd {{.*#+}} xmm3 = [9223372036854775808,9223372036854775808]
-; AVX2-NEXT: vblendvpd %xmm1, {{.*}}(%rip), %xmm3, %xmm1
-; AVX2-NEXT: vblendvpd %xmm2, %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpsrad $31, %xmm0, %xmm1
-; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
+; AVX2-NEXT: vbroadcastss {{.*#+}} xmm3 = [2147483647,2147483647,2147483647,2147483647]
+; AVX2-NEXT: vbroadcastss {{.*#+}} xmm4 = [2147483648,2147483648,2147483648,2147483648]
+; AVX2-NEXT: vblendvps %xmm1, %xmm3, %xmm4, %xmm1
+; AVX2-NEXT: vblendvps %xmm2, %xmm1, %xmm0, %xmm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: v2i32:
; AVX512: # %bb.0:
-; AVX512-NEXT: vpsllq $32, %xmm1, %xmm1
; AVX512-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX512-NEXT: vpcmpnltq %xmm2, %xmm1, %k0
-; AVX512-NEXT: vpsllq $32, %xmm0, %xmm0
-; AVX512-NEXT: vpcmpnltq %xmm2, %xmm0, %k1
+; AVX512-NEXT: vpcmpnltd %xmm2, %xmm1, %k0
+; AVX512-NEXT: vpcmpnltd %xmm2, %xmm0, %k1
; AVX512-NEXT: kxorw %k0, %k1, %k0
-; AVX512-NEXT: vpsubq %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpcmpnltq %xmm2, %xmm0, %k2
+; AVX512-NEXT: vpsubd %xmm1, %xmm0, %xmm0
+; AVX512-NEXT: vpcmpnltd %xmm2, %xmm0, %k2
; AVX512-NEXT: kxorw %k2, %k1, %k1
; AVX512-NEXT: kandw %k1, %k0, %k1
-; AVX512-NEXT: vpcmpgtq %xmm0, %xmm2, %k2
-; AVX512-NEXT: vmovdqa {{.*#+}} xmm1 = [9223372036854775808,9223372036854775808]
-; AVX512-NEXT: vmovdqa64 {{.*}}(%rip), %xmm1 {%k2}
-; AVX512-NEXT: vmovdqa64 %xmm1, %xmm0 {%k1}
-; AVX512-NEXT: vpsraq $32, %xmm0, %xmm0
+; AVX512-NEXT: vpcmpgtd %xmm0, %xmm2, %k2
+; AVX512-NEXT: vpbroadcastd {{.*#+}} xmm1 = [2147483648,2147483648,2147483648,2147483648]
+; AVX512-NEXT: vpbroadcastd {{.*}}(%rip), %xmm1 {%k2}
+; AVX512-NEXT: vmovdqa32 %xmm1, %xmm0 {%k1}
; AVX512-NEXT: retq
%z = call <2 x i32> @llvm.ssub.sat.v2i32(<2 x i32> %x, <2 x i32> %y)
ret <2 x i32> %z
; CHECK-X64-NEXT: testl $263, %edi # imm = 0x107
; CHECK-X64-NEXT: je .LBB1_3
; CHECK-X64-NEXT: # %bb.1:
-; CHECK-X64-NEXT: pand {{.*}}(%rip), %xmm0
-; CHECK-X64-NEXT: pcmpeqd {{.*}}(%rip), %xmm0
-; CHECK-X64-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,0,3,2]
-; CHECK-X64-NEXT: pand %xmm0, %xmm1
-; CHECK-X64-NEXT: pextrw $4, %xmm1, %eax
+; CHECK-X64-NEXT: pcmpeqb {{.*}}(%rip), %xmm0
+; CHECK-X64-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; CHECK-X64-NEXT: pextrw $1, %xmm0, %eax
; CHECK-X64-NEXT: testb $1, %al
; CHECK-X64-NEXT: jne .LBB1_3
; CHECK-X64-NEXT: # %bb.2: # %no
; SSE2: # %bb.0:
; SSE2-NEXT: movzwl (%rdi), %eax
; SSE2-NEXT: movd %eax, %xmm0
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3]
-; SSE2-NEXT: paddq {{.*}}(%rip), %xmm0
-; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
-; SSE2-NEXT: packuswb %xmm0, %xmm0
-; SSE2-NEXT: packuswb %xmm0, %xmm0
-; SSE2-NEXT: packuswb %xmm0, %xmm0
+; SSE2-NEXT: paddb {{.*}}(%rip), %xmm0
; SSE2-NEXT: movd %xmm0, %eax
; SSE2-NEXT: movw %ax, (%rdi)
; SSE2-NEXT: retq
;
; SSE41-LABEL: load_2_i8:
; SSE41: # %bb.0:
-; SSE41-NEXT: pmovzxbq {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero
-; SSE41-NEXT: paddq {{.*}}(%rip), %xmm0
-; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; SSE41-NEXT: movzwl (%rdi), %eax
+; SSE41-NEXT: movd %eax, %xmm0
+; SSE41-NEXT: paddb {{.*}}(%rip), %xmm0
; SSE41-NEXT: pextrw $0, %xmm0, (%rdi)
; SSE41-NEXT: retq
%T = load <2 x i8>, <2 x i8>* %A
; Read 32 bits
define void @load_2_i16(<2 x i16>* %A) {
-; SSE2-LABEL: load_2_i16:
-; SSE2: # %bb.0:
-; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3]
-; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,6,7]
-; SSE2-NEXT: paddq {{.*}}(%rip), %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
-; SSE2-NEXT: movd %xmm0, (%rdi)
-; SSE2-NEXT: retq
-;
-; SSE41-LABEL: load_2_i16:
-; SSE41: # %bb.0:
-; SSE41-NEXT: pmovzxwq {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero
-; SSE41-NEXT: paddq {{.*}}(%rip), %xmm0
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; SSE41-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
-; SSE41-NEXT: movd %xmm0, (%rdi)
-; SSE41-NEXT: retq
+; CHECK-LABEL: load_2_i16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT: paddw {{.*}}(%rip), %xmm0
+; CHECK-NEXT: movd %xmm0, (%rdi)
+; CHECK-NEXT: retq
%T = load <2 x i16>, <2 x i16>* %A
%G = add <2 x i16> %T, <i16 9, i16 7>
store <2 x i16> %G, <2 x i16>* %A
}
define void @load_2_i32(<2 x i32>* %A) {
-; SSE2-LABEL: load_2_i32:
-; SSE2: # %bb.0:
-; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3]
-; SSE2-NEXT: paddd {{.*}}(%rip), %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; SSE2-NEXT: movq %xmm0, (%rdi)
-; SSE2-NEXT: retq
-;
-; SSE41-LABEL: load_2_i32:
-; SSE41: # %bb.0:
-; SSE41-NEXT: pmovzxdq {{.*#+}} xmm0 = mem[0],zero,mem[1],zero
-; SSE41-NEXT: paddd {{.*}}(%rip), %xmm0
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; SSE41-NEXT: movq %xmm0, (%rdi)
-; SSE41-NEXT: retq
+; CHECK-LABEL: load_2_i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT: paddd {{.*}}(%rip), %xmm0
+; CHECK-NEXT: movq %xmm0, (%rdi)
+; CHECK-NEXT: retq
%T = load <2 x i32>, <2 x i32>* %A
%G = add <2 x i32> %T, <i32 9, i32 7>
store <2 x i32> %G, <2 x i32>* %A
}
define void @load_4_i8(<4 x i8>* %A) {
-; SSE2-LABEL: load_4_i8:
-; SSE2: # %bb.0:
-; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
-; SSE2-NEXT: paddd {{.*}}(%rip), %xmm0
-; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
-; SSE2-NEXT: packuswb %xmm0, %xmm0
-; SSE2-NEXT: packuswb %xmm0, %xmm0
-; SSE2-NEXT: movd %xmm0, (%rdi)
-; SSE2-NEXT: retq
-;
-; SSE41-LABEL: load_4_i8:
-; SSE41: # %bb.0:
-; SSE41-NEXT: pmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; SSE41-NEXT: paddd {{.*}}(%rip), %xmm0
-; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u]
-; SSE41-NEXT: movd %xmm0, (%rdi)
-; SSE41-NEXT: retq
+; CHECK-LABEL: load_4_i8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT: paddb {{.*}}(%rip), %xmm0
+; CHECK-NEXT: movd %xmm0, (%rdi)
+; CHECK-NEXT: retq
%T = load <4 x i8>, <4 x i8>* %A
%G = add <4 x i8> %T, <i8 1, i8 4, i8 9, i8 7>
store <4 x i8> %G, <4 x i8>* %A
}
define void @load_4_i16(<4 x i16>* %A) {
-; SSE2-LABEL: load_4_i16:
-; SSE2: # %bb.0:
-; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
-; SSE2-NEXT: paddw {{.*}}(%rip), %xmm0
-; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
-; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; SSE2-NEXT: movq %xmm0, (%rdi)
-; SSE2-NEXT: retq
-;
-; SSE41-LABEL: load_4_i16:
-; SSE41: # %bb.0:
-; SSE41-NEXT: pmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
-; SSE41-NEXT: paddw {{.*}}(%rip), %xmm0
-; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
-; SSE41-NEXT: movq %xmm0, (%rdi)
-; SSE41-NEXT: retq
+; CHECK-LABEL: load_4_i16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT: paddw {{.*}}(%rip), %xmm0
+; CHECK-NEXT: movq %xmm0, (%rdi)
+; CHECK-NEXT: retq
%T = load <4 x i16>, <4 x i16>* %A
%G = add <4 x i16> %T, <i16 1, i16 4, i16 9, i16 7>
store <4 x i16> %G, <4 x i16>* %A
define <2 x i32> @test3(<8 x i32> %v) {
; SSE2-LABEL: test3:
; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa %xmm1, %xmm0
-; SSE2-NEXT: pxor %xmm1, %xmm1
-; SSE2-NEXT: pcmpgtd %xmm0, %xmm1
-; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE2-NEXT: movaps %xmm1, %xmm0
; SSE2-NEXT: retq
;
-; AVX2-LABEL: test3:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
-; AVX2-NEXT: vpmovsxdq %xmm0, %xmm0
-; AVX2-NEXT: vzeroupper
-; AVX2-NEXT: retq
-;
-; AVX512-LABEL: test3:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vpmovsxdq %ymm0, %zmm0
-; AVX512-NEXT: vextracti32x4 $2, %zmm0, %xmm0
-; AVX512-NEXT: vzeroupper
-; AVX512-NEXT: retq
+; AVX-LABEL: test3:
+; AVX: # %bb.0:
+; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX-NEXT: vzeroupper
+; AVX-NEXT: retq
%x = sext <8 x i32> %v to <8 x i64>
%s = shufflevector <8 x i64> %x, <8 x i64> undef, <2 x i32> <i32 4, i32 5>
%t = trunc <2 x i64> %s to <2 x i32>
define <2 x i32> @test4(<8 x i32> %v) {
; SSE2-LABEL: test4:
; SSE2: # %bb.0:
-; SSE2-NEXT: pxor %xmm1, %xmm1
-; SSE2-NEXT: pcmpgtd %xmm0, %xmm1
-; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE2-NEXT: retq
;
-; AVX2-LABEL: test4:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpmovsxdq %xmm0, %xmm0
-; AVX2-NEXT: vzeroupper
-; AVX2-NEXT: retq
-;
-; AVX512-LABEL: test4:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vpmovsxdq %ymm0, %zmm0
-; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
-; AVX512-NEXT: vzeroupper
-; AVX512-NEXT: retq
+; AVX-LABEL: test4:
+; AVX: # %bb.0:
+; AVX-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX-NEXT: vzeroupper
+; AVX-NEXT: retq
%x = sext <8 x i32> %v to <8 x i64>
%s = shufflevector <8 x i64> %x, <8 x i64> undef, <2 x i32> <i32 0, i32 1>
%t = trunc <2 x i64> %s to <2 x i32>
define <2 x i32> @test5(<8 x i32> %v) {
; SSE2-LABEL: test5:
; SSE2: # %bb.0:
-; SSE2-NEXT: pxor %xmm2, %xmm2
-; SSE2-NEXT: pxor %xmm3, %xmm3
-; SSE2-NEXT: pcmpgtd %xmm1, %xmm3
-; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
-; SSE2-NEXT: pcmpgtd %xmm0, %xmm2
-; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
-; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,3],xmm1[0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,3,2,2]
+; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE2-NEXT: retq
;
; AVX2-LABEL: test5:
; AVX2: # %bb.0:
-; AVX2-NEXT: vpmovsxdq %xmm0, %ymm1
+; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
-; AVX2-NEXT: vpmovsxdq %xmm0, %xmm0
-; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm1
-; AVX2-NEXT: vpalignr {{.*#+}} xmm0 = xmm1[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7]
+; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
+; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm1 = [6,6,6,6]
+; AVX2-NEXT: vpermd %ymm0, %ymm1, %ymm0
+; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: test5:
; AVX512: # %bb.0:
; AVX512-NEXT: vpmovsxdq %ymm0, %zmm0
-; AVX512-NEXT: vextracti32x4 $2, %zmm0, %xmm1
-; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm0
-; AVX512-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7]
+; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX512-NEXT: vpextrq $1, %xmm1, %rax
+; AVX512-NEXT: vextracti32x4 $2, %zmm0, %xmm0
+; AVX512-NEXT: vmovq %xmm0, %rcx
+; AVX512-NEXT: vmovd %eax, %xmm0
+; AVX512-NEXT: vpinsrd $1, %ecx, %xmm0, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
%x = sext <8 x i32> %v to <8 x i64>
; SSE2-LABEL: test8:
; SSE2: # %bb.0:
; SSE2-NEXT: movaps %xmm1, %xmm0
-; SSE2-NEXT: xorps %xmm1, %xmm1
-; SSE2-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE2-NEXT: retq
;
-; AVX2-LABEL: test8:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
-; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
-; AVX2-NEXT: vzeroupper
-; AVX2-NEXT: retq
-;
-; AVX512-LABEL: test8:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vpmovzxdq {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero
-; AVX512-NEXT: vextracti32x4 $2, %zmm0, %xmm0
-; AVX512-NEXT: vzeroupper
-; AVX512-NEXT: retq
+; AVX-LABEL: test8:
+; AVX: # %bb.0:
+; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX-NEXT: vzeroupper
+; AVX-NEXT: retq
%x = zext <8 x i32> %v to <8 x i64>
%s = shufflevector <8 x i64> %x, <8 x i64> undef, <2 x i32> <i32 4, i32 5>
%t = trunc <2 x i64> %s to <2 x i32>
define <2 x i32> @test9(<8 x i32> %v) {
; SSE2-LABEL: test9:
; SSE2: # %bb.0:
-; SSE2-NEXT: xorps %xmm1, %xmm1
-; SSE2-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE2-NEXT: retq
;
-; AVX2-LABEL: test9:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
-; AVX2-NEXT: vzeroupper
-; AVX2-NEXT: retq
-;
-; AVX512-LABEL: test9:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vpmovzxdq {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero
-; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
-; AVX512-NEXT: vzeroupper
-; AVX512-NEXT: retq
+; AVX-LABEL: test9:
+; AVX: # %bb.0:
+; AVX-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX-NEXT: vzeroupper
+; AVX-NEXT: retq
%x = zext <8 x i32> %v to <8 x i64>
%s = shufflevector <8 x i64> %x, <8 x i64> undef, <2 x i32> <i32 0, i32 1>
%t = trunc <2 x i64> %s to <2 x i32>
define <2 x i32> @test10(<8 x i32> %v) {
; SSE2-LABEL: test10:
; SSE2: # %bb.0:
-; SSE2-NEXT: xorps %xmm2, %xmm2
-; SSE2-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
-; SSE2-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3]
-; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,3],xmm1[0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,3,2,2]
+; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE2-NEXT: retq
;
; AVX2-LABEL: test10:
; AVX2: # %bb.0:
; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
-; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
-; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm1
-; AVX2-NEXT: vpalignr {{.*#+}} xmm0 = xmm1[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7]
+; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
+; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm1 = [6,6,6,6]
+; AVX2-NEXT: vpermd %ymm0, %ymm1, %ymm0
+; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: test10:
; AVX512: # %bb.0:
; AVX512-NEXT: vpmovzxdq {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero
-; AVX512-NEXT: vextracti32x4 $2, %zmm0, %xmm1
-; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm0
-; AVX512-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7]
+; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX512-NEXT: vpextrq $1, %xmm1, %rax
+; AVX512-NEXT: vextracti32x4 $2, %zmm0, %xmm0
+; AVX512-NEXT: vmovq %xmm0, %rcx
+; AVX512-NEXT: vmovd %eax, %xmm0
+; AVX512-NEXT: vpinsrd $1, %ecx, %xmm0, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
%x = zext <8 x i32> %v to <8 x i64>
define <2 x i32> @v2i32(<2 x i32> %x, <2 x i32> %y) nounwind {
; SSE2-LABEL: v2i32:
; SSE2: # %bb.0:
-; SSE2-NEXT: psllq $32, %xmm0
-; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [9223372039002259456,9223372039002259456]
-; SSE2-NEXT: psllq $32, %xmm1
-; SSE2-NEXT: paddq %xmm0, %xmm1
+; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
+; SSE2-NEXT: paddd %xmm0, %xmm1
; SSE2-NEXT: pxor %xmm2, %xmm0
; SSE2-NEXT: pxor %xmm1, %xmm2
-; SSE2-NEXT: movdqa %xmm0, %xmm3
-; SSE2-NEXT: pcmpeqd %xmm2, %xmm3
; SSE2-NEXT: pcmpgtd %xmm2, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,0,2,2]
-; SSE2-NEXT: pand %xmm2, %xmm3
; SSE2-NEXT: por %xmm1, %xmm0
-; SSE2-NEXT: por %xmm3, %xmm0
-; SSE2-NEXT: psrlq $32, %xmm0
; SSE2-NEXT: retq
;
; SSSE3-LABEL: v2i32:
; SSSE3: # %bb.0:
-; SSSE3-NEXT: psllq $32, %xmm0
-; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [9223372039002259456,9223372039002259456]
-; SSSE3-NEXT: psllq $32, %xmm1
-; SSSE3-NEXT: paddq %xmm0, %xmm1
+; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
+; SSSE3-NEXT: paddd %xmm0, %xmm1
; SSSE3-NEXT: pxor %xmm2, %xmm0
; SSSE3-NEXT: pxor %xmm1, %xmm2
-; SSSE3-NEXT: movdqa %xmm0, %xmm3
-; SSSE3-NEXT: pcmpeqd %xmm2, %xmm3
; SSSE3-NEXT: pcmpgtd %xmm2, %xmm0
-; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,0,2,2]
-; SSSE3-NEXT: pand %xmm2, %xmm3
; SSSE3-NEXT: por %xmm1, %xmm0
-; SSSE3-NEXT: por %xmm3, %xmm0
-; SSSE3-NEXT: psrlq $32, %xmm0
; SSSE3-NEXT: retq
;
; SSE41-LABEL: v2i32:
; SSE41: # %bb.0:
-; SSE41-NEXT: psllq $32, %xmm0
-; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [9223372039002259456,9223372039002259456]
-; SSE41-NEXT: psllq $32, %xmm1
-; SSE41-NEXT: paddq %xmm0, %xmm1
-; SSE41-NEXT: movdqa %xmm0, %xmm3
-; SSE41-NEXT: pxor %xmm2, %xmm3
-; SSE41-NEXT: pxor %xmm1, %xmm2
-; SSE41-NEXT: movdqa %xmm3, %xmm0
-; SSE41-NEXT: pcmpeqd %xmm2, %xmm0
-; SSE41-NEXT: pcmpgtd %xmm2, %xmm3
-; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,0,2,2]
-; SSE41-NEXT: pand %xmm2, %xmm0
-; SSE41-NEXT: por %xmm3, %xmm0
; SSE41-NEXT: pcmpeqd %xmm2, %xmm2
-; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm1
-; SSE41-NEXT: psrlq $32, %xmm1
-; SSE41-NEXT: movdqa %xmm1, %xmm0
+; SSE41-NEXT: pxor %xmm1, %xmm2
+; SSE41-NEXT: pminud %xmm2, %xmm0
+; SSE41-NEXT: paddd %xmm1, %xmm0
; SSE41-NEXT: retq
;
; AVX1-LABEL: v2i32:
; AVX1: # %bb.0:
-; AVX1-NEXT: vpsllq $32, %xmm0, %xmm0
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
-; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm3
-; AVX1-NEXT: vpsllq $32, %xmm1, %xmm1
-; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm1
-; AVX1-NEXT: vpcmpgtq %xmm1, %xmm3, %xmm1
-; AVX1-NEXT: vpor %xmm0, %xmm1, %xmm0
-; AVX1-NEXT: vpsrlq $32, %xmm0, %xmm0
+; AVX1-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
+; AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm2
+; AVX1-NEXT: vpminud %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: v2i32:
; AVX2: # %bb.0:
-; AVX2-NEXT: vpsllq $32, %xmm0, %xmm0
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
-; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm3
-; AVX2-NEXT: vpsllq $32, %xmm1, %xmm1
-; AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm1
-; AVX2-NEXT: vpcmpgtq %xmm1, %xmm3, %xmm1
-; AVX2-NEXT: vpor %xmm0, %xmm1, %xmm0
-; AVX2-NEXT: vpsrlq $32, %xmm0, %xmm0
+; AVX2-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
+; AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm2
+; AVX2-NEXT: vpminud %xmm2, %xmm0, %xmm0
+; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: v2i32:
; AVX512: # %bb.0:
-; AVX512-NEXT: vpsllq $32, %xmm0, %xmm0
-; AVX512-NEXT: vpsllq $32, %xmm1, %xmm1
; AVX512-NEXT: vmovdqa %xmm1, %xmm2
; AVX512-NEXT: vpternlogq $15, %xmm1, %xmm1, %xmm2
-; AVX512-NEXT: vpminuq %xmm2, %xmm0, %xmm0
-; AVX512-NEXT: vpaddq %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpsrlq $32, %xmm0, %xmm0
+; AVX512-NEXT: vpminud %xmm2, %xmm0, %xmm0
+; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: retq
%z = call <2 x i32> @llvm.uadd.sat.v2i32(<2 x i32> %x, <2 x i32> %y)
ret <2 x i32> %z
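The new SSE41/AVX sequence above (pcmpeqd/pxor to form ~y, pminud, paddd) is the standard branchless expansion of unsigned saturating add, now emitted on the legal v4i32 type instead of shifted v2i64 lanes: x is clamped to ~y first so the add can never wrap. A scalar C++ sketch of the same identity (function name is illustrative):

    #include <algorithm>
    #include <cstdint>

    // uadd.sat(x, y) == min(x, ~y) + y: if x <= ~y the sum fits, otherwise the
    // clamped value ~y + y == UINT32_MAX is returned.
    static uint32_t uadd_sat_u32(uint32_t x, uint32_t y) {
      return std::min(x, ~y) + y;
    }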
; CHECK-SSE2-LABEL: out_v2i8:
; CHECK-SSE2: # %bb.0:
; CHECK-SSE2-NEXT: andps %xmm2, %xmm0
-; CHECK-SSE2-NEXT: xorps {{.*}}(%rip), %xmm2
-; CHECK-SSE2-NEXT: andps %xmm1, %xmm2
+; CHECK-SSE2-NEXT: andnps %xmm1, %xmm2
; CHECK-SSE2-NEXT: orps %xmm2, %xmm0
; CHECK-SSE2-NEXT: retq
;
; CHECK-XOP-LABEL: out_v2i8:
; CHECK-XOP: # %bb.0:
-; CHECK-XOP-NEXT: vandps %xmm2, %xmm0, %xmm0
-; CHECK-XOP-NEXT: vxorps {{.*}}(%rip), %xmm2, %xmm2
-; CHECK-XOP-NEXT: vandps %xmm2, %xmm1, %xmm1
-; CHECK-XOP-NEXT: vorps %xmm1, %xmm0, %xmm0
+; CHECK-XOP-NEXT: vpcmov %xmm2, %xmm1, %xmm0, %xmm0
; CHECK-XOP-NEXT: retq
%mx = and <2 x i8> %x, %mask
%notmask = xor <2 x i8> %mask, <i8 -1, i8 -1>
; CHECK-SSE2-LABEL: out_v4i8:
; CHECK-SSE2: # %bb.0:
; CHECK-SSE2-NEXT: andps %xmm2, %xmm0
-; CHECK-SSE2-NEXT: xorps {{.*}}(%rip), %xmm2
-; CHECK-SSE2-NEXT: andps %xmm1, %xmm2
+; CHECK-SSE2-NEXT: andnps %xmm1, %xmm2
; CHECK-SSE2-NEXT: orps %xmm2, %xmm0
; CHECK-SSE2-NEXT: retq
;
; CHECK-XOP-LABEL: out_v4i8:
; CHECK-XOP: # %bb.0:
-; CHECK-XOP-NEXT: vandps %xmm2, %xmm0, %xmm0
-; CHECK-XOP-NEXT: vxorps {{.*}}(%rip), %xmm2, %xmm2
-; CHECK-XOP-NEXT: vandps %xmm2, %xmm1, %xmm1
-; CHECK-XOP-NEXT: vorps %xmm1, %xmm0, %xmm0
+; CHECK-XOP-NEXT: vpcmov %xmm2, %xmm1, %xmm0, %xmm0
; CHECK-XOP-NEXT: retq
%mx = and <4 x i8> %x, %mask
%notmask = xor <4 x i8> %mask, <i8 -1, i8 -1, i8 -1, i8 -1>
; CHECK-SSE2-LABEL: out_v4i8_undef:
; CHECK-SSE2: # %bb.0:
; CHECK-SSE2-NEXT: andps %xmm2, %xmm0
-; CHECK-SSE2-NEXT: xorps {{.*}}(%rip), %xmm2
-; CHECK-SSE2-NEXT: andps %xmm1, %xmm2
+; CHECK-SSE2-NEXT: andnps %xmm1, %xmm2
; CHECK-SSE2-NEXT: orps %xmm2, %xmm0
; CHECK-SSE2-NEXT: retq
;
; CHECK-XOP-LABEL: out_v4i8_undef:
; CHECK-XOP: # %bb.0:
-; CHECK-XOP-NEXT: vandps %xmm2, %xmm0, %xmm0
-; CHECK-XOP-NEXT: vxorps {{.*}}(%rip), %xmm2, %xmm2
-; CHECK-XOP-NEXT: vandps %xmm2, %xmm1, %xmm1
-; CHECK-XOP-NEXT: vorps %xmm1, %xmm0, %xmm0
+; CHECK-XOP-NEXT: vpcmov %xmm2, %xmm1, %xmm0, %xmm0
; CHECK-XOP-NEXT: retq
%mx = and <4 x i8> %x, %mask
%notmask = xor <4 x i8> %mask, <i8 -1, i8 -1, i8 undef, i8 -1>
; CHECK-SSE2-LABEL: out_v2i16:
; CHECK-SSE2: # %bb.0:
; CHECK-SSE2-NEXT: andps %xmm2, %xmm0
-; CHECK-SSE2-NEXT: xorps {{.*}}(%rip), %xmm2
-; CHECK-SSE2-NEXT: andps %xmm1, %xmm2
+; CHECK-SSE2-NEXT: andnps %xmm1, %xmm2
; CHECK-SSE2-NEXT: orps %xmm2, %xmm0
; CHECK-SSE2-NEXT: retq
;
; CHECK-XOP-LABEL: out_v2i16:
; CHECK-XOP: # %bb.0:
-; CHECK-XOP-NEXT: vandps %xmm2, %xmm0, %xmm0
-; CHECK-XOP-NEXT: vxorps {{.*}}(%rip), %xmm2, %xmm2
-; CHECK-XOP-NEXT: vandps %xmm2, %xmm1, %xmm1
-; CHECK-XOP-NEXT: vorps %xmm1, %xmm0, %xmm0
+; CHECK-XOP-NEXT: vpcmov %xmm2, %xmm1, %xmm0, %xmm0
; CHECK-XOP-NEXT: retq
%mx = and <2 x i16> %x, %mask
%notmask = xor <2 x i16> %mask, <i16 -1, i16 -1>
; CHECK-SSE2-LABEL: out_v8i8:
; CHECK-SSE2: # %bb.0:
; CHECK-SSE2-NEXT: andps %xmm2, %xmm0
-; CHECK-SSE2-NEXT: xorps {{.*}}(%rip), %xmm2
-; CHECK-SSE2-NEXT: andps %xmm1, %xmm2
+; CHECK-SSE2-NEXT: andnps %xmm1, %xmm2
; CHECK-SSE2-NEXT: orps %xmm2, %xmm0
; CHECK-SSE2-NEXT: retq
;
; CHECK-XOP-LABEL: out_v8i8:
; CHECK-XOP: # %bb.0:
-; CHECK-XOP-NEXT: vandps %xmm2, %xmm0, %xmm0
-; CHECK-XOP-NEXT: vxorps {{.*}}(%rip), %xmm2, %xmm2
-; CHECK-XOP-NEXT: vandps %xmm2, %xmm1, %xmm1
-; CHECK-XOP-NEXT: vorps %xmm1, %xmm0, %xmm0
+; CHECK-XOP-NEXT: vpcmov %xmm2, %xmm1, %xmm0, %xmm0
; CHECK-XOP-NEXT: retq
%mx = and <8 x i8> %x, %mask
%notmask = xor <8 x i8> %mask, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
; CHECK-SSE2-LABEL: out_v4i16:
; CHECK-SSE2: # %bb.0:
; CHECK-SSE2-NEXT: andps %xmm2, %xmm0
-; CHECK-SSE2-NEXT: xorps {{.*}}(%rip), %xmm2
-; CHECK-SSE2-NEXT: andps %xmm1, %xmm2
+; CHECK-SSE2-NEXT: andnps %xmm1, %xmm2
; CHECK-SSE2-NEXT: orps %xmm2, %xmm0
; CHECK-SSE2-NEXT: retq
;
; CHECK-XOP-LABEL: out_v4i16:
; CHECK-XOP: # %bb.0:
-; CHECK-XOP-NEXT: vandps %xmm2, %xmm0, %xmm0
-; CHECK-XOP-NEXT: vxorps {{.*}}(%rip), %xmm2, %xmm2
-; CHECK-XOP-NEXT: vandps %xmm2, %xmm1, %xmm1
-; CHECK-XOP-NEXT: vorps %xmm1, %xmm0, %xmm0
+; CHECK-XOP-NEXT: vpcmov %xmm2, %xmm1, %xmm0, %xmm0
; CHECK-XOP-NEXT: retq
%mx = and <4 x i16> %x, %mask
%notmask = xor <4 x i16> %mask, <i16 -1, i16 -1, i16 -1, i16 -1>
; CHECK-SSE2-LABEL: out_v4i16_undef:
; CHECK-SSE2: # %bb.0:
; CHECK-SSE2-NEXT: andps %xmm2, %xmm0
-; CHECK-SSE2-NEXT: xorps {{.*}}(%rip), %xmm2
-; CHECK-SSE2-NEXT: andps %xmm1, %xmm2
+; CHECK-SSE2-NEXT: andnps %xmm1, %xmm2
; CHECK-SSE2-NEXT: orps %xmm2, %xmm0
; CHECK-SSE2-NEXT: retq
;
; CHECK-XOP-LABEL: out_v4i16_undef:
; CHECK-XOP: # %bb.0:
-; CHECK-XOP-NEXT: vandps %xmm2, %xmm0, %xmm0
-; CHECK-XOP-NEXT: vxorps {{.*}}(%rip), %xmm2, %xmm2
-; CHECK-XOP-NEXT: vandps %xmm2, %xmm1, %xmm1
-; CHECK-XOP-NEXT: vorps %xmm1, %xmm0, %xmm0
+; CHECK-XOP-NEXT: vpcmov %xmm2, %xmm1, %xmm0, %xmm0
; CHECK-XOP-NEXT: retq
%mx = and <4 x i16> %x, %mask
%notmask = xor <4 x i16> %mask, <i16 -1, i16 -1, i16 undef, i16 -1>
; CHECK-SSE2-LABEL: out_v2i32:
; CHECK-SSE2: # %bb.0:
; CHECK-SSE2-NEXT: andps %xmm2, %xmm0
-; CHECK-SSE2-NEXT: xorps {{.*}}(%rip), %xmm2
-; CHECK-SSE2-NEXT: andps %xmm1, %xmm2
+; CHECK-SSE2-NEXT: andnps %xmm1, %xmm2
; CHECK-SSE2-NEXT: orps %xmm2, %xmm0
; CHECK-SSE2-NEXT: retq
;
; CHECK-XOP-LABEL: out_v2i32:
; CHECK-XOP: # %bb.0:
-; CHECK-XOP-NEXT: vandps %xmm2, %xmm0, %xmm0
-; CHECK-XOP-NEXT: vxorps {{.*}}(%rip), %xmm2, %xmm2
-; CHECK-XOP-NEXT: vandps %xmm2, %xmm1, %xmm1
-; CHECK-XOP-NEXT: vorps %xmm1, %xmm0, %xmm0
+; CHECK-XOP-NEXT: vpcmov %xmm2, %xmm1, %xmm0, %xmm0
; CHECK-XOP-NEXT: retq
%mx = and <2 x i32> %x, %mask
%notmask = xor <2 x i32> %mask, <i32 -1, i32 -1>
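All of the out_v* hunks exercise the same masked merge, (x & m) | (y & ~m). Under promotion the mask inversion was a constant-pool xorps that could not be folded away, since the xor constant only covered the demanded lanes; with the operands widened, inverting the whole register is fine, so the not folds into andnps (or a single vpcmov on XOP). The identity being tested, as a scalar sketch (names illustrative):

    #include <cstdint>

    // Select bits of x where the mask bit is 1 and bits of y where it is 0.
    static uint32_t masked_merge(uint32_t x, uint32_t y, uint32_t m) {
      return (x & m) | (y & ~m);   // andps + andnps + orps in the SSE2 output
    }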
define <2 x i32> @v2i32(<2 x i32> %x, <2 x i32> %y) nounwind {
; SSE2-LABEL: v2i32:
; SSE2: # %bb.0:
-; SSE2-NEXT: psllq $32, %xmm1
-; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [9223372039002259456,9223372039002259456]
+; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
; SSE2-NEXT: movdqa %xmm1, %xmm3
; SSE2-NEXT: pxor %xmm2, %xmm3
-; SSE2-NEXT: psllq $32, %xmm0
; SSE2-NEXT: pxor %xmm0, %xmm2
-; SSE2-NEXT: movdqa %xmm2, %xmm4
-; SSE2-NEXT: pcmpeqd %xmm3, %xmm4
; SSE2-NEXT: pcmpgtd %xmm3, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,0,2,2]
-; SSE2-NEXT: pand %xmm4, %xmm3
-; SSE2-NEXT: por %xmm2, %xmm3
-; SSE2-NEXT: psubq %xmm1, %xmm0
-; SSE2-NEXT: pand %xmm3, %xmm0
-; SSE2-NEXT: psrlq $32, %xmm0
+; SSE2-NEXT: psubd %xmm1, %xmm0
+; SSE2-NEXT: pand %xmm2, %xmm0
; SSE2-NEXT: retq
;
; SSSE3-LABEL: v2i32:
; SSSE3: # %bb.0:
-; SSSE3-NEXT: psllq $32, %xmm1
-; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [9223372039002259456,9223372039002259456]
+; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
; SSSE3-NEXT: movdqa %xmm1, %xmm3
; SSSE3-NEXT: pxor %xmm2, %xmm3
-; SSSE3-NEXT: psllq $32, %xmm0
; SSSE3-NEXT: pxor %xmm0, %xmm2
-; SSSE3-NEXT: movdqa %xmm2, %xmm4
-; SSSE3-NEXT: pcmpeqd %xmm3, %xmm4
; SSSE3-NEXT: pcmpgtd %xmm3, %xmm2
-; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,0,2,2]
-; SSSE3-NEXT: pand %xmm4, %xmm3
-; SSSE3-NEXT: por %xmm2, %xmm3
-; SSSE3-NEXT: psubq %xmm1, %xmm0
-; SSSE3-NEXT: pand %xmm3, %xmm0
-; SSSE3-NEXT: psrlq $32, %xmm0
+; SSSE3-NEXT: psubd %xmm1, %xmm0
+; SSSE3-NEXT: pand %xmm2, %xmm0
; SSSE3-NEXT: retq
;
; SSE41-LABEL: v2i32:
; SSE41: # %bb.0:
-; SSE41-NEXT: movdqa %xmm0, %xmm2
-; SSE41-NEXT: psllq $32, %xmm1
-; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [9223372039002259456,9223372039002259456]
-; SSE41-NEXT: movdqa %xmm1, %xmm0
-; SSE41-NEXT: pxor %xmm3, %xmm0
-; SSE41-NEXT: psllq $32, %xmm2
-; SSE41-NEXT: pxor %xmm2, %xmm3
-; SSE41-NEXT: movdqa %xmm3, %xmm4
-; SSE41-NEXT: pcmpeqd %xmm0, %xmm4
-; SSE41-NEXT: pcmpgtd %xmm0, %xmm3
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,0,2,2]
-; SSE41-NEXT: pand %xmm4, %xmm0
-; SSE41-NEXT: por %xmm3, %xmm0
-; SSE41-NEXT: psubq %xmm1, %xmm2
-; SSE41-NEXT: pxor %xmm1, %xmm1
-; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm1
-; SSE41-NEXT: psrlq $32, %xmm1
-; SSE41-NEXT: movdqa %xmm1, %xmm0
+; SSE41-NEXT: pmaxud %xmm1, %xmm0
+; SSE41-NEXT: psubd %xmm1, %xmm0
; SSE41-NEXT: retq
;
-; AVX1-LABEL: v2i32:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vpsllq $32, %xmm1, %xmm1
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
-; AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm3
-; AVX1-NEXT: vpsllq $32, %xmm0, %xmm0
-; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm2
-; AVX1-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm2
-; AVX1-NEXT: vpsubq %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpand %xmm0, %xmm2, %xmm0
-; AVX1-NEXT: vpsrlq $32, %xmm0, %xmm0
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: v2i32:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpsllq $32, %xmm1, %xmm1
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
-; AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm3
-; AVX2-NEXT: vpsllq $32, %xmm0, %xmm0
-; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm2
-; AVX2-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm2
-; AVX2-NEXT: vpsubq %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpand %xmm0, %xmm2, %xmm0
-; AVX2-NEXT: vpsrlq $32, %xmm0, %xmm0
-; AVX2-NEXT: retq
-;
-; AVX512-LABEL: v2i32:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vpsllq $32, %xmm1, %xmm1
-; AVX512-NEXT: vpsllq $32, %xmm0, %xmm0
-; AVX512-NEXT: vpmaxuq %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpsubq %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpsrlq $32, %xmm0, %xmm0
-; AVX512-NEXT: retq
+; AVX-LABEL: v2i32:
+; AVX: # %bb.0:
+; AVX-NEXT: vpmaxud %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vpsubd %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retq
%z = call <2 x i32> @llvm.usub.sat.v2i32(<2 x i32> %x, <2 x i32> %y)
ret <2 x i32> %z
}
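Likewise, the usub.sat expansion now works directly on v4i32: pmaxud clamps the minuend up to the subtrahend so psubd can never underflow (the SSE2/SSSE3 paths keep the compare-and-mask form because they have no pmaxud). Scalar form of the same identity (name illustrative):

    #include <algorithm>
    #include <cstdint>

    // usub.sat(x, y) == max(x, y) - y: the result is x - y when x >= y, else 0.
    static uint32_t usub_sat_u32(uint32_t x, uint32_t y) {
      return std::max(x, y) - y;
    }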
define <8 x float> @cvt_v8i8_v8f32(<8 x i8> %src) {
; CHECK-LABEL: cvt_v8i8_v8f32:
; CHECK: ## %bb.0:
-; CHECK-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm0[4,4,5,5,6,6,7,7]
-; CHECK-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; CHECK-NEXT: vpslld $24, %xmm0, %xmm0
-; CHECK-NEXT: vpsrad $24, %xmm0, %xmm0
-; CHECK-NEXT: vpslld $24, %xmm1, %xmm1
-; CHECK-NEXT: vpsrad $24, %xmm1, %xmm1
-; CHECK-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; CHECK-NEXT: vpmovsxbd %xmm0, %xmm1
+; CHECK-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
+; CHECK-NEXT: vpmovsxbd %xmm0, %xmm0
+; CHECK-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; CHECK-NEXT: vcvtdq2ps %ymm0, %ymm0
; CHECK-NEXT: retl
;
define <4 x float> @cvt_v4i8_v4f32(<4 x i8> %src) {
; CHECK-LABEL: cvt_v4i8_v4f32:
; CHECK: ## %bb.0:
-; CHECK-NEXT: vpslld $24, %xmm0, %xmm0
-; CHECK-NEXT: vpsrad $24, %xmm0, %xmm0
+; CHECK-NEXT: vpmovsxbd %xmm0, %xmm0
; CHECK-NEXT: vcvtdq2ps %xmm0, %xmm0
; CHECK-NEXT: retl
;
define <4 x float> @cvt_v4i16_v4f32(<4 x i16> %src) {
; CHECK-LABEL: cvt_v4i16_v4f32:
; CHECK: ## %bb.0:
-; CHECK-NEXT: vpslld $16, %xmm0, %xmm0
-; CHECK-NEXT: vpsrad $16, %xmm0, %xmm0
+; CHECK-NEXT: vpmovsxwd %xmm0, %xmm0
; CHECK-NEXT: vcvtdq2ps %xmm0, %xmm0
; CHECK-NEXT: retl
;
define <8 x float> @cvt_v8u8_v8f32(<8 x i8> %src) {
; CHECK-LABEL: cvt_v8u8_v8f32:
; CHECK: ## %bb.0:
-; CHECK-NEXT: vpand LCPI4_0, %xmm0, %xmm0
-; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
-; CHECK-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; CHECK-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; CHECK-NEXT: vpmovzxbd {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; CHECK-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
+; CHECK-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; CHECK-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; CHECK-NEXT: vcvtdq2ps %ymm0, %ymm0
; CHECK-NEXT: retl
;
define <4 x float> @cvt_v4u8_v4f32(<4 x i8> %src) {
; CHECK-LABEL: cvt_v4u8_v4f32:
; CHECK: ## %bb.0:
-; CHECK-NEXT: vandps LCPI6_0, %xmm0, %xmm0
+; CHECK-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; CHECK-NEXT: vcvtdq2ps %xmm0, %xmm0
; CHECK-NEXT: retl
;
define <4 x float> @cvt_v4u16_v4f32(<4 x i16> %src) {
; CHECK-LABEL: cvt_v4u16_v4f32:
; CHECK: ## %bb.0:
-; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7]
+; CHECK-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; CHECK-NEXT: vcvtdq2ps %xmm0, %xmm0
; CHECK-NEXT: retl
;
; CHECK-NEXT: vcvttps2dq %ymm0, %ymm0
; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm1
; CHECK-NEXT: vpackssdw %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: vpacksswb %xmm0, %xmm0, %xmm0
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retl
;
; CHECK-LABEL: cvt_v4f32_v4i8:
; CHECK: ## %bb.0:
; CHECK-NEXT: vcvttps2dq %xmm0, %xmm0
+; CHECK-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u]
; CHECK-NEXT: retl
;
; CHECK-WIDE-LABEL: cvt_v4f32_v4i8:
; CHECK-LABEL: cvt_v4f32_v4i16:
; CHECK: ## %bb.0:
; CHECK-NEXT: vcvttps2dq %xmm0, %xmm0
+; CHECK-NEXT: vpackssdw %xmm0, %xmm0, %xmm0
; CHECK-NEXT: retl
;
; CHECK-WIDE-LABEL: cvt_v4f32_v4i16:
; CHECK: ## %bb.0:
; CHECK-NEXT: vcvttps2dq %ymm0, %ymm0
; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm1
-; CHECK-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: vpackssdw %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: vpackuswb %xmm0, %xmm0, %xmm0
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retl
;
; CHECK-LABEL: cvt_v4f32_v4u8:
; CHECK: ## %bb.0:
; CHECK-NEXT: vcvttps2dq %xmm0, %xmm0
+; CHECK-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u]
; CHECK-NEXT: retl
;
; CHECK-WIDE-LABEL: cvt_v4f32_v4u8:
; CHECK-LABEL: cvt_v4f32_v4u16:
; CHECK: ## %bb.0:
; CHECK-NEXT: vcvttps2dq %xmm0, %xmm0
+; CHECK-NEXT: vpackusdw %xmm0, %xmm0, %xmm0
; CHECK-NEXT: retl
;
; CHECK-WIDE-LABEL: cvt_v4f32_v4u16:
define <2 x float> @cvt_v2i8_v2f32(<2 x i8> %src) {
; CHECK-LABEL: cvt_v2i8_v2f32:
; CHECK: ## %bb.0:
-; CHECK-NEXT: vpsllq $56, %xmm0, %xmm0
-; CHECK-NEXT: vpsrad $24, %xmm0, %xmm0
-; CHECK-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
+; CHECK-NEXT: vpmovsxbd %xmm0, %xmm0
; CHECK-NEXT: vcvtdq2ps %xmm0, %xmm0
; CHECK-NEXT: retl
;
define <2 x float> @cvt_v2i16_v2f32(<2 x i16> %src) {
; CHECK-LABEL: cvt_v2i16_v2f32:
; CHECK: ## %bb.0:
-; CHECK-NEXT: vpsllq $48, %xmm0, %xmm0
-; CHECK-NEXT: vpsrad $16, %xmm0, %xmm0
-; CHECK-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
+; CHECK-NEXT: vpmovsxwd %xmm0, %xmm0
; CHECK-NEXT: vcvtdq2ps %xmm0, %xmm0
; CHECK-NEXT: retl
;
define <2 x float> @cvt_v2i32_v2f32(<2 x i32> %src) {
; CHECK-LABEL: cvt_v2i32_v2f32:
; CHECK: ## %bb.0:
-; CHECK-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,2,3]
; CHECK-NEXT: vcvtdq2ps %xmm0, %xmm0
; CHECK-NEXT: retl
;
define <2 x float> @cvt_v2u8_v2f32(<2 x i8> %src) {
; CHECK-LABEL: cvt_v2u8_v2f32:
; CHECK: ## %bb.0:
-; CHECK-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u]
+; CHECK-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; CHECK-NEXT: vcvtdq2ps %xmm0, %xmm0
; CHECK-NEXT: retl
;
define <2 x float> @cvt_v2u16_v2f32(<2 x i16> %src) {
; CHECK-LABEL: cvt_v2u16_v2f32:
; CHECK: ## %bb.0:
-; CHECK-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1],zero,zero,xmm0[8,9],zero,zero,xmm0[8,9],zero,zero,xmm0[10,11],zero,zero
+; CHECK-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; CHECK-NEXT: vcvtdq2ps %xmm0, %xmm0
; CHECK-NEXT: retl
;
define <2 x float> @cvt_v2u32_v2f32(<2 x i32> %src) {
; CHECK-LABEL: cvt_v2u32_v2f32:
; CHECK: ## %bb.0:
-; CHECK-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; CHECK-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
-; CHECK-NEXT: vmovaps {{.*#+}} xmm1 = [4.503599627370496E+15,4.503599627370496E+15]
-; CHECK-NEXT: vorps %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
+; CHECK-NEXT: vmovdqa {{.*#+}} xmm1 = [4.503599627370496E+15,4.503599627370496E+15]
+; CHECK-NEXT: vpor %xmm1, %xmm0, %xmm0
; CHECK-NEXT: vsubpd %xmm1, %xmm0, %xmm0
; CHECK-NEXT: vcvtpd2ps %xmm0, %xmm0
; CHECK-NEXT: retl
; CHECK-LABEL: cvt_v2f32_v2i8:
; CHECK: ## %bb.0:
; CHECK-NEXT: vcvttps2dq %xmm0, %xmm0
-; CHECK-NEXT: vpmovsxdq %xmm0, %xmm0
+; CHECK-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
; CHECK-NEXT: retl
;
; CHECK-WIDE-LABEL: cvt_v2f32_v2i8:
; CHECK-LABEL: cvt_v2f32_v2i16:
; CHECK: ## %bb.0:
; CHECK-NEXT: vcvttps2dq %xmm0, %xmm0
-; CHECK-NEXT: vpmovsxdq %xmm0, %xmm0
+; CHECK-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
; CHECK-NEXT: retl
;
; CHECK-WIDE-LABEL: cvt_v2f32_v2i16:
; CHECK-LABEL: cvt_v2f32_v2i32:
; CHECK: ## %bb.0:
; CHECK-NEXT: vcvttps2dq %xmm0, %xmm0
-; CHECK-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
; CHECK-NEXT: retl
;
; CHECK-WIDE-LABEL: cvt_v2f32_v2i32:
; CHECK-LABEL: cvt_v2f32_v2u8:
; CHECK: ## %bb.0:
; CHECK-NEXT: vcvttps2dq %xmm0, %xmm0
-; CHECK-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
+; CHECK-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
; CHECK-NEXT: retl
;
; CHECK-WIDE-LABEL: cvt_v2f32_v2u8:
; CHECK-LABEL: cvt_v2f32_v2u16:
; CHECK: ## %bb.0:
; CHECK-NEXT: vcvttps2dq %xmm0, %xmm0
-; CHECK-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
+; CHECK-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
; CHECK-NEXT: retl
;
; CHECK-WIDE-LABEL: cvt_v2f32_v2u16:
define <2 x i32> @cvt_v2f32_v2u32(<2 x float> %src) {
; CHECK-LABEL: cvt_v2f32_v2u32:
; CHECK: ## %bb.0:
-; CHECK-NEXT: subl $36, %esp
-; CHECK-NEXT: .cfi_def_cfa_offset 40
-; CHECK-NEXT: vmovshdup {{.*#+}} xmm2 = xmm0[1,1,3,3]
-; CHECK-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; CHECK-NEXT: vucomiss %xmm1, %xmm2
-; CHECK-NEXT: jb LBB11_2
-; CHECK-NEXT: ## %bb.1:
-; CHECK-NEXT: vsubss %xmm1, %xmm2, %xmm2
-; CHECK-NEXT: LBB11_2:
-; CHECK-NEXT: vmovss %xmm2, (%esp)
-; CHECK-NEXT: flds (%esp)
-; CHECK-NEXT: fisttpll (%esp)
-; CHECK-NEXT: setae %al
-; CHECK-NEXT: movzbl %al, %eax
-; CHECK-NEXT: shll $31, %eax
-; CHECK-NEXT: xorl {{[0-9]+}}(%esp), %eax
-; CHECK-NEXT: vucomiss %xmm1, %xmm0
-; CHECK-NEXT: jb LBB11_4
-; CHECK-NEXT: ## %bb.3:
-; CHECK-NEXT: vsubss %xmm1, %xmm0, %xmm0
-; CHECK-NEXT: LBB11_4:
-; CHECK-NEXT: vmovss %xmm0, {{[0-9]+}}(%esp)
-; CHECK-NEXT: flds {{[0-9]+}}(%esp)
-; CHECK-NEXT: fisttpll {{[0-9]+}}(%esp)
-; CHECK-NEXT: setae %cl
-; CHECK-NEXT: movzbl %cl, %ecx
-; CHECK-NEXT: shll $31, %ecx
-; CHECK-NEXT: xorl {{[0-9]+}}(%esp), %ecx
-; CHECK-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; CHECK-NEXT: vpinsrd $1, %ecx, %xmm0, %xmm0
-; CHECK-NEXT: vpinsrd $2, (%esp), %xmm0, %xmm0
-; CHECK-NEXT: vpinsrd $3, %eax, %xmm0, %xmm0
-; CHECK-NEXT: addl $36, %esp
+; CHECK-NEXT: vmovaps {{.*#+}} xmm1 = [2.14748365E+9,2.14748365E+9,2.14748365E+9,2.14748365E+9]
+; CHECK-NEXT: vcmpltps %xmm1, %xmm0, %xmm2
+; CHECK-NEXT: vsubps %xmm1, %xmm0, %xmm1
+; CHECK-NEXT: vcvttps2dq %xmm1, %xmm1
+; CHECK-NEXT: vxorps LCPI11_1, %xmm1, %xmm1
+; CHECK-NEXT: vcvttps2dq %xmm0, %xmm0
+; CHECK-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0
; CHECK-NEXT: retl
;
; CHECK-WIDE-LABEL: cvt_v2f32_v2u32:
define <2 x i32> @promtz(<2 x i32> %a) nounwind {
; CHECK-LABEL: promtz:
; CHECK: # %bb.0:
-; CHECK-NEXT: por {{.*}}(%rip), %xmm0
; CHECK-NEXT: pcmpeqd %xmm1, %xmm1
-; CHECK-NEXT: paddq %xmm0, %xmm1
+; CHECK-NEXT: paddd %xmm0, %xmm1
; CHECK-NEXT: pandn %xmm1, %xmm0
; CHECK-NEXT: movdqa %xmm0, %xmm1
; CHECK-NEXT: psrlw $1, %xmm1
; CHECK-NEXT: paddb %xmm0, %xmm1
; CHECK-NEXT: pand {{.*}}(%rip), %xmm1
; CHECK-NEXT: pxor %xmm0, %xmm0
+; CHECK-NEXT: movdqa %xmm1, %xmm2
+; CHECK-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3]
+; CHECK-NEXT: psadbw %xmm0, %xmm2
+; CHECK-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; CHECK-NEXT: psadbw %xmm0, %xmm1
+; CHECK-NEXT: packuswb %xmm2, %xmm1
; CHECK-NEXT: movdqa %xmm1, %xmm0
; CHECK-NEXT: retq
%c = call <2 x i32> @llvm.cttz.v2i32(<2 x i32> %a, i1 false)
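promtz now stays in 32-bit lanes: paddd with all-ones computes a - 1, pandn gives ~a & (a - 1), and the remaining instructions are the byte-wise population count (see the ctpop sketch after prompop below). The underlying identity, in scalar form (name illustrative; __builtin_popcount stands in for the psadbw-based reduction):

    #include <cstdint>

    // cttz(a) == popcount(~a & (a - 1)); for a == 0 this yields popcount(~0) == 32.
    static uint32_t cttz_u32(uint32_t a) {
      return __builtin_popcount(~a & (a - 1));
    }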
define <2 x i32> @promlz(<2 x i32> %a) nounwind {
; CHECK-LABEL: promlz:
; CHECK: # %bb.0:
-; CHECK-NEXT: pand {{.*}}(%rip), %xmm0
-; CHECK-NEXT: pxor %xmm1, %xmm1
-; CHECK-NEXT: movdqa %xmm0, %xmm2
-; CHECK-NEXT: psrlq $1, %xmm2
-; CHECK-NEXT: por %xmm0, %xmm2
-; CHECK-NEXT: movdqa %xmm2, %xmm0
-; CHECK-NEXT: psrlq $2, %xmm0
-; CHECK-NEXT: por %xmm2, %xmm0
-; CHECK-NEXT: movdqa %xmm0, %xmm2
-; CHECK-NEXT: psrlq $4, %xmm2
-; CHECK-NEXT: por %xmm0, %xmm2
-; CHECK-NEXT: movdqa %xmm2, %xmm0
-; CHECK-NEXT: psrlq $8, %xmm0
-; CHECK-NEXT: por %xmm2, %xmm0
-; CHECK-NEXT: movdqa %xmm0, %xmm2
-; CHECK-NEXT: psrlq $16, %xmm2
-; CHECK-NEXT: por %xmm0, %xmm2
-; CHECK-NEXT: movdqa %xmm2, %xmm0
-; CHECK-NEXT: psrlq $32, %xmm0
-; CHECK-NEXT: por %xmm2, %xmm0
+; CHECK-NEXT: movdqa %xmm0, %xmm1
+; CHECK-NEXT: psrld $1, %xmm1
+; CHECK-NEXT: por %xmm0, %xmm1
+; CHECK-NEXT: movdqa %xmm1, %xmm0
+; CHECK-NEXT: psrld $2, %xmm0
+; CHECK-NEXT: por %xmm1, %xmm0
+; CHECK-NEXT: movdqa %xmm0, %xmm1
+; CHECK-NEXT: psrld $4, %xmm1
+; CHECK-NEXT: por %xmm0, %xmm1
+; CHECK-NEXT: movdqa %xmm1, %xmm0
+; CHECK-NEXT: psrld $8, %xmm0
+; CHECK-NEXT: por %xmm1, %xmm0
+; CHECK-NEXT: movdqa %xmm0, %xmm1
+; CHECK-NEXT: psrld $16, %xmm1
+; CHECK-NEXT: por %xmm0, %xmm1
; CHECK-NEXT: pcmpeqd %xmm2, %xmm2
-; CHECK-NEXT: pxor %xmm0, %xmm2
+; CHECK-NEXT: pxor %xmm1, %xmm2
; CHECK-NEXT: movdqa %xmm2, %xmm0
; CHECK-NEXT: psrlw $1, %xmm0
; CHECK-NEXT: pand {{.*}}(%rip), %xmm0
; CHECK-NEXT: psubb %xmm0, %xmm2
; CHECK-NEXT: movdqa {{.*#+}} xmm0 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
-; CHECK-NEXT: movdqa %xmm2, %xmm3
-; CHECK-NEXT: pand %xmm0, %xmm3
+; CHECK-NEXT: movdqa %xmm2, %xmm1
+; CHECK-NEXT: pand %xmm0, %xmm1
; CHECK-NEXT: psrlw $2, %xmm2
; CHECK-NEXT: pand %xmm0, %xmm2
-; CHECK-NEXT: paddb %xmm3, %xmm2
+; CHECK-NEXT: paddb %xmm1, %xmm2
; CHECK-NEXT: movdqa %xmm2, %xmm0
; CHECK-NEXT: psrlw $4, %xmm0
; CHECK-NEXT: paddb %xmm2, %xmm0
; CHECK-NEXT: pand {{.*}}(%rip), %xmm0
+; CHECK-NEXT: pxor %xmm1, %xmm1
+; CHECK-NEXT: movdqa %xmm0, %xmm2
+; CHECK-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; CHECK-NEXT: psadbw %xmm1, %xmm2
+; CHECK-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; CHECK-NEXT: psadbw %xmm1, %xmm0
-; CHECK-NEXT: psubq {{.*}}(%rip), %xmm0
+; CHECK-NEXT: packuswb %xmm2, %xmm0
; CHECK-NEXT: retq
%c = call <2 x i32> @llvm.ctlz.v2i32(<2 x i32> %a, i1 false)
ret <2 x i32> %c
define <2 x i32> @prompop(<2 x i32> %a) nounwind {
; CHECK-LABEL: prompop:
; CHECK: # %bb.0:
-; CHECK-NEXT: pand {{.*}}(%rip), %xmm0
-; CHECK-NEXT: pxor %xmm2, %xmm2
; CHECK-NEXT: movdqa %xmm0, %xmm1
; CHECK-NEXT: psrlw $1, %xmm1
; CHECK-NEXT: pand {{.*}}(%rip), %xmm1
; CHECK-NEXT: psubb %xmm1, %xmm0
; CHECK-NEXT: movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
-; CHECK-NEXT: movdqa %xmm0, %xmm3
-; CHECK-NEXT: pand %xmm1, %xmm3
+; CHECK-NEXT: movdqa %xmm0, %xmm2
+; CHECK-NEXT: pand %xmm1, %xmm2
; CHECK-NEXT: psrlw $2, %xmm0
; CHECK-NEXT: pand %xmm1, %xmm0
-; CHECK-NEXT: paddb %xmm3, %xmm0
+; CHECK-NEXT: paddb %xmm2, %xmm0
; CHECK-NEXT: movdqa %xmm0, %xmm1
; CHECK-NEXT: psrlw $4, %xmm1
; CHECK-NEXT: paddb %xmm0, %xmm1
; CHECK-NEXT: pand {{.*}}(%rip), %xmm1
-; CHECK-NEXT: psadbw %xmm2, %xmm1
+; CHECK-NEXT: pxor %xmm0, %xmm0
+; CHECK-NEXT: movdqa %xmm1, %xmm2
+; CHECK-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3]
+; CHECK-NEXT: psadbw %xmm0, %xmm2
+; CHECK-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; CHECK-NEXT: psadbw %xmm0, %xmm1
+; CHECK-NEXT: packuswb %xmm2, %xmm1
; CHECK-NEXT: movdqa %xmm1, %xmm0
; CHECK-NEXT: retq
%c = call <2 x i32> @llvm.ctpop.v2i32(<2 x i32> %a)
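All three prom* tests end in the same v16i8 population count: the masking steps with 0x55/0x33 (the repeated 51s above) and 0x0F reduce each byte to its bit count, and psadbw against zero sums the bytes. Because psadbw sums within 64-bit halves while the result lanes are now i32, the dwords are first spread out with punpckldq/punpckhdq against zero and the two partial sums packed back together. A scalar equivalent of the byte-count portion (name illustrative; the final multiply-and-shift plays the role of the psadbw sum):

    #include <cstdint>

    // Classic SWAR popcount, matching the psrlw/pand/psubb/paddb steps above.
    static uint32_t popcount_u32(uint32_t v) {
      v = v - ((v >> 1) & 0x55555555u);
      v = (v & 0x33333333u) + ((v >> 2) & 0x33333333u);
      v = (v + (v >> 4)) & 0x0F0F0F0Fu;
      return (v * 0x01010101u) >> 24;
    }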
; X32: # %bb.0:
; X32-NEXT: pushl %ebp
; X32-NEXT: movl %esp, %ebp
-; X32-NEXT: andl $-8, %esp
-; X32-NEXT: subl $8, %esp
+; X32-NEXT: andl $-16, %esp
+; X32-NEXT: subl $32, %esp
; X32-NEXT: movq %mm0, (%esp)
-; X32-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
-; X32-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,0,1]
-; X32-NEXT: movd %xmm0, %eax
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movl %ebp, %esp
; X32-NEXT: popl %ebp
; X32-NEXT: retl
; X64-LABEL: test4:
; X64: # %bb.0:
; X64-NEXT: movq %mm0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
-; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,0,1]
-; X64-NEXT: movd %xmm0, %eax
+; X64-NEXT: movl -{{[0-9]+}}(%rsp), %eax
; X64-NEXT: retq
%tmp0 = bitcast x86_mmx %a to <2 x i32>
%tmp1 = extractelement <2 x i32> %tmp0, i32 1
; SSE-LABEL: fptosi_2f64_to_2i32:
; SSE: # %bb.0:
; SSE-NEXT: cvttpd2dq %xmm0, %xmm0
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3]
; SSE-NEXT: retq
;
; AVX-LABEL: fptosi_2f64_to_2i32:
; AVX: # %bb.0:
; AVX-NEXT: vcvttpd2dq %xmm0, %xmm0
-; AVX-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
; AVX-NEXT: retq
%cvt = fptosi <2 x double> %a to <2 x i32>
ret <2 x i32> %cvt
define <4 x i32> @fptoui_2f64_to_4i32(<2 x double> %a) {
; SSE-LABEL: fptoui_2f64_to_4i32:
; SSE: # %bb.0:
-; SSE-NEXT: movsd {{.*#+}} xmm2 = mem[0],zero
-; SSE-NEXT: movapd %xmm0, %xmm1
-; SSE-NEXT: subsd %xmm2, %xmm1
-; SSE-NEXT: cvttsd2si %xmm1, %rax
-; SSE-NEXT: movabsq $-9223372036854775808, %rcx # imm = 0x8000000000000000
-; SSE-NEXT: xorq %rcx, %rax
-; SSE-NEXT: cvttsd2si %xmm0, %rdx
-; SSE-NEXT: ucomisd %xmm2, %xmm0
-; SSE-NEXT: cmovaeq %rax, %rdx
-; SSE-NEXT: movq %rdx, %xmm1
-; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1,1]
-; SSE-NEXT: movapd %xmm0, %xmm3
-; SSE-NEXT: subsd %xmm2, %xmm3
-; SSE-NEXT: cvttsd2si %xmm3, %rax
-; SSE-NEXT: xorq %rcx, %rax
+; SSE-NEXT: cvttsd2si %xmm0, %rax
+; SSE-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1]
; SSE-NEXT: cvttsd2si %xmm0, %rcx
-; SSE-NEXT: ucomisd %xmm2, %xmm0
-; SSE-NEXT: cmovaeq %rax, %rcx
-; SSE-NEXT: movq %rcx, %xmm0
-; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
-; SSE-NEXT: pxor %xmm0, %xmm0
-; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[2,3]
-; SSE-NEXT: movaps %xmm1, %xmm0
+; SSE-NEXT: movd %eax, %xmm0
+; SSE-NEXT: movd %ecx, %xmm1
+; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE-NEXT: movq {{.*#+}} xmm0 = xmm0[0],zero
; SSE-NEXT: retq
;
-; VEX-LABEL: fptoui_2f64_to_4i32:
-; VEX: # %bb.0:
-; VEX-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
-; VEX-NEXT: vsubsd %xmm1, %xmm0, %xmm2
-; VEX-NEXT: vcvttsd2si %xmm2, %rax
-; VEX-NEXT: movabsq $-9223372036854775808, %rcx # imm = 0x8000000000000000
-; VEX-NEXT: xorq %rcx, %rax
-; VEX-NEXT: vcvttsd2si %xmm0, %rdx
-; VEX-NEXT: vucomisd %xmm1, %xmm0
-; VEX-NEXT: cmovaeq %rax, %rdx
-; VEX-NEXT: vmovq %rdx, %xmm2
-; VEX-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
-; VEX-NEXT: vsubsd %xmm1, %xmm0, %xmm3
-; VEX-NEXT: vcvttsd2si %xmm3, %rax
-; VEX-NEXT: xorq %rcx, %rax
-; VEX-NEXT: vcvttsd2si %xmm0, %rcx
-; VEX-NEXT: vucomisd %xmm1, %xmm0
-; VEX-NEXT: cmovaeq %rax, %rcx
-; VEX-NEXT: vmovq %rcx, %xmm0
-; VEX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm0[0]
-; VEX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero
-; VEX-NEXT: retq
+; AVX1-LABEL: fptoui_2f64_to_4i32:
+; AVX1: # %bb.0:
+; AVX1-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
+; AVX1-NEXT: vmovapd {{.*#+}} ymm1 = [2.147483648E+9,2.147483648E+9,2.147483648E+9,2.147483648E+9]
+; AVX1-NEXT: vcmpltpd %ymm1, %ymm0, %ymm2
+; AVX1-NEXT: vpackssdw %xmm0, %xmm2, %xmm2
+; AVX1-NEXT: vcvttpd2dq %ymm0, %xmm3
+; AVX1-NEXT: vsubpd %ymm1, %ymm0, %ymm0
+; AVX1-NEXT: vcvttpd2dq %ymm0, %xmm0
+; AVX1-NEXT: vxorpd {{.*}}(%rip), %xmm0, %xmm0
+; AVX1-NEXT: vblendvps %xmm2, %xmm3, %xmm0, %xmm0
+; AVX1-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: fptoui_2f64_to_4i32:
+; AVX2: # %bb.0:
+; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
+; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm1 = [2.147483648E+9,2.147483648E+9,2.147483648E+9,2.147483648E+9]
+; AVX2-NEXT: vcmpltpd %ymm1, %ymm0, %ymm2
+; AVX2-NEXT: vpackssdw %xmm0, %xmm2, %xmm2
+; AVX2-NEXT: vsubpd %ymm1, %ymm0, %ymm1
+; AVX2-NEXT: vcvttpd2dq %ymm1, %xmm1
+; AVX2-NEXT: vbroadcastss {{.*#+}} xmm3 = [2147483648,2147483648,2147483648,2147483648]
+; AVX2-NEXT: vxorpd %xmm3, %xmm1, %xmm1
+; AVX2-NEXT: vcvttpd2dq %ymm0, %xmm0
+; AVX2-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0
+; AVX2-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
;
; AVX512F-LABEL: fptoui_2f64_to_4i32:
; AVX512F: # %bb.0:
define <4 x i32> @fptoui_2f64_to_2i32(<2 x double> %a) {
; SSE-LABEL: fptoui_2f64_to_2i32:
; SSE: # %bb.0:
-; SSE-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero
-; SSE-NEXT: movapd %xmm0, %xmm2
-; SSE-NEXT: subsd %xmm1, %xmm2
-; SSE-NEXT: cvttsd2si %xmm2, %rax
-; SSE-NEXT: movabsq $-9223372036854775808, %rcx # imm = 0x8000000000000000
-; SSE-NEXT: xorq %rcx, %rax
-; SSE-NEXT: cvttsd2si %xmm0, %rdx
-; SSE-NEXT: ucomisd %xmm1, %xmm0
-; SSE-NEXT: cmovaeq %rax, %rdx
-; SSE-NEXT: movq %rdx, %xmm2
-; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1,1]
-; SSE-NEXT: movapd %xmm0, %xmm3
-; SSE-NEXT: subsd %xmm1, %xmm3
-; SSE-NEXT: cvttsd2si %xmm3, %rax
-; SSE-NEXT: xorq %rcx, %rax
-; SSE-NEXT: cvttsd2si %xmm0, %rcx
-; SSE-NEXT: ucomisd %xmm1, %xmm0
-; SSE-NEXT: cmovaeq %rax, %rcx
-; SSE-NEXT: movq %rcx, %xmm0
-; SSE-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm0[0]
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
+; SSE-NEXT: cvttsd2si %xmm0, %rax
+; SSE-NEXT: movd %eax, %xmm1
+; SSE-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1]
+; SSE-NEXT: cvttsd2si %xmm0, %rax
+; SSE-NEXT: movd %eax, %xmm0
+; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; SSE-NEXT: movdqa %xmm1, %xmm0
; SSE-NEXT: retq
;
-; VEX-LABEL: fptoui_2f64_to_2i32:
-; VEX: # %bb.0:
-; VEX-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
-; VEX-NEXT: vsubsd %xmm1, %xmm0, %xmm2
-; VEX-NEXT: vcvttsd2si %xmm2, %rax
-; VEX-NEXT: movabsq $-9223372036854775808, %rcx # imm = 0x8000000000000000
-; VEX-NEXT: xorq %rcx, %rax
-; VEX-NEXT: vcvttsd2si %xmm0, %rdx
-; VEX-NEXT: vucomisd %xmm1, %xmm0
-; VEX-NEXT: cmovaeq %rax, %rdx
-; VEX-NEXT: vmovq %rdx, %xmm2
-; VEX-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
-; VEX-NEXT: vsubsd %xmm1, %xmm0, %xmm3
-; VEX-NEXT: vcvttsd2si %xmm3, %rax
-; VEX-NEXT: xorq %rcx, %rax
-; VEX-NEXT: vcvttsd2si %xmm0, %rcx
-; VEX-NEXT: vucomisd %xmm1, %xmm0
-; VEX-NEXT: cmovaeq %rax, %rcx
-; VEX-NEXT: vmovq %rcx, %xmm0
-; VEX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm0[0]
-; VEX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; VEX-NEXT: retq
+; AVX1-LABEL: fptoui_2f64_to_2i32:
+; AVX1: # %bb.0:
+; AVX1-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
+; AVX1-NEXT: vmovapd {{.*#+}} ymm1 = [2.147483648E+9,2.147483648E+9,2.147483648E+9,2.147483648E+9]
+; AVX1-NEXT: vcmpltpd %ymm1, %ymm0, %ymm2
+; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm3
+; AVX1-NEXT: vpackssdw %xmm3, %xmm2, %xmm2
+; AVX1-NEXT: vcvttpd2dq %ymm0, %xmm3
+; AVX1-NEXT: vsubpd %ymm1, %ymm0, %ymm0
+; AVX1-NEXT: vcvttpd2dq %ymm0, %xmm0
+; AVX1-NEXT: vxorpd {{.*}}(%rip), %xmm0, %xmm0
+; AVX1-NEXT: vblendvps %xmm2, %xmm3, %xmm0, %xmm0
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: fptoui_2f64_to_2i32:
+; AVX2: # %bb.0:
+; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
+; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm1 = [2.147483648E+9,2.147483648E+9,2.147483648E+9,2.147483648E+9]
+; AVX2-NEXT: vcmpltpd %ymm1, %ymm0, %ymm2
+; AVX2-NEXT: vextractf128 $1, %ymm2, %xmm3
+; AVX2-NEXT: vpackssdw %xmm3, %xmm2, %xmm2
+; AVX2-NEXT: vsubpd %ymm1, %ymm0, %ymm1
+; AVX2-NEXT: vcvttpd2dq %ymm1, %xmm1
+; AVX2-NEXT: vbroadcastss {{.*#+}} xmm3 = [2147483648,2147483648,2147483648,2147483648]
+; AVX2-NEXT: vxorpd %xmm3, %xmm1, %xmm1
+; AVX2-NEXT: vcvttpd2dq %ymm0, %xmm0
+; AVX2-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
;
; AVX512F-LABEL: fptoui_2f64_to_2i32:
; AVX512F: # %bb.0:
define <4 x i32> @fptoui_4f64_to_2i32(<2 x double> %a) {
; SSE-LABEL: fptoui_4f64_to_2i32:
; SSE: # %bb.0:
-; SSE-NEXT: movsd {{.*#+}} xmm2 = mem[0],zero
-; SSE-NEXT: movapd %xmm0, %xmm1
-; SSE-NEXT: subsd %xmm2, %xmm1
-; SSE-NEXT: cvttsd2si %xmm1, %rax
-; SSE-NEXT: movabsq $-9223372036854775808, %rcx # imm = 0x8000000000000000
-; SSE-NEXT: xorq %rcx, %rax
-; SSE-NEXT: cvttsd2si %xmm0, %rdx
-; SSE-NEXT: ucomisd %xmm2, %xmm0
-; SSE-NEXT: cmovaeq %rax, %rdx
-; SSE-NEXT: movq %rdx, %xmm1
-; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1,1]
-; SSE-NEXT: movapd %xmm0, %xmm3
-; SSE-NEXT: subsd %xmm2, %xmm3
-; SSE-NEXT: cvttsd2si %xmm3, %rax
-; SSE-NEXT: xorq %rcx, %rax
-; SSE-NEXT: cvttsd2si %xmm0, %rcx
-; SSE-NEXT: ucomisd %xmm2, %xmm0
-; SSE-NEXT: cmovaeq %rax, %rcx
-; SSE-NEXT: movq %rcx, %xmm0
-; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
-; SSE-NEXT: pxor %xmm0, %xmm0
-; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[2,3]
-; SSE-NEXT: movaps %xmm1, %xmm0
+; SSE-NEXT: cvttsd2si %xmm0, %rax
+; SSE-NEXT: movd %eax, %xmm1
+; SSE-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1]
+; SSE-NEXT: cvttsd2si %xmm0, %rax
+; SSE-NEXT: movd %eax, %xmm0
+; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; SSE-NEXT: movq {{.*#+}} xmm0 = xmm1[0],zero
; SSE-NEXT: retq
;
; AVX1-LABEL: fptoui_4f64_to_2i32:
define <4 x i32> @fptoui_4f64_to_4i32(<4 x double> %a) {
; SSE-LABEL: fptoui_4f64_to_4i32:
; SSE: # %bb.0:
-; SSE-NEXT: movsd {{.*#+}} xmm2 = mem[0],zero
-; SSE-NEXT: movapd %xmm1, %xmm3
-; SSE-NEXT: subsd %xmm2, %xmm3
-; SSE-NEXT: cvttsd2si %xmm3, %rcx
-; SSE-NEXT: movabsq $-9223372036854775808, %rax # imm = 0x8000000000000000
-; SSE-NEXT: xorq %rax, %rcx
-; SSE-NEXT: cvttsd2si %xmm1, %rdx
-; SSE-NEXT: ucomisd %xmm2, %xmm1
-; SSE-NEXT: cmovaeq %rcx, %rdx
-; SSE-NEXT: movq %rdx, %xmm3
-; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1,1]
-; SSE-NEXT: movapd %xmm1, %xmm4
-; SSE-NEXT: subsd %xmm2, %xmm4
-; SSE-NEXT: cvttsd2si %xmm4, %rcx
-; SSE-NEXT: xorq %rax, %rcx
-; SSE-NEXT: cvttsd2si %xmm1, %rdx
-; SSE-NEXT: ucomisd %xmm2, %xmm1
-; SSE-NEXT: cmovaeq %rcx, %rdx
-; SSE-NEXT: movq %rdx, %xmm1
-; SSE-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm1[0]
-; SSE-NEXT: movapd %xmm0, %xmm1
-; SSE-NEXT: subsd %xmm2, %xmm1
-; SSE-NEXT: cvttsd2si %xmm1, %rcx
-; SSE-NEXT: xorq %rax, %rcx
-; SSE-NEXT: cvttsd2si %xmm0, %rdx
-; SSE-NEXT: ucomisd %xmm2, %xmm0
-; SSE-NEXT: cmovaeq %rcx, %rdx
-; SSE-NEXT: movq %rdx, %xmm1
-; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1,1]
-; SSE-NEXT: movapd %xmm0, %xmm4
-; SSE-NEXT: subsd %xmm2, %xmm4
-; SSE-NEXT: cvttsd2si %xmm4, %rcx
-; SSE-NEXT: xorq %rax, %rcx
+; SSE-NEXT: cvttsd2si %xmm1, %rax
+; SSE-NEXT: movd %eax, %xmm2
+; SSE-NEXT: movhlps {{.*#+}} xmm1 = xmm1[1,1]
+; SSE-NEXT: cvttsd2si %xmm1, %rax
+; SSE-NEXT: movd %eax, %xmm1
+; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
; SSE-NEXT: cvttsd2si %xmm0, %rax
-; SSE-NEXT: ucomisd %xmm2, %xmm0
-; SSE-NEXT: cmovaeq %rcx, %rax
-; SSE-NEXT: movq %rax, %xmm0
-; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
-; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm3[0,2]
-; SSE-NEXT: movaps %xmm1, %xmm0
+; SSE-NEXT: movd %eax, %xmm1
+; SSE-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1]
+; SSE-NEXT: cvttsd2si %xmm0, %rax
+; SSE-NEXT: movd %eax, %xmm0
+; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; SSE-NEXT: movdqa %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: fptoui_4f64_to_4i32:
; SSE-LABEL: fptosi_2f32_to_2i32:
; SSE: # %bb.0:
; SSE-NEXT: cvttps2dq %xmm0, %xmm0
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3]
; SSE-NEXT: retq
;
; AVX-LABEL: fptosi_2f32_to_2i32:
; AVX: # %bb.0:
; AVX-NEXT: vcvttps2dq %xmm0, %xmm0
-; AVX-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
; AVX-NEXT: retq
%cvt = fptosi <2 x float> %a to <2 x i32>
ret <2 x i32> %cvt
define <2 x i32> @fptoui_2f32_to_2i32(<2 x float> %a) {
; SSE-LABEL: fptoui_2f32_to_2i32:
; SSE: # %bb.0:
-; SSE-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; SSE-NEXT: movaps {{.*#+}} xmm2 = [2.14748365E+9,2.14748365E+9,2.14748365E+9,2.14748365E+9]
; SSE-NEXT: movaps %xmm0, %xmm1
-; SSE-NEXT: subss %xmm2, %xmm1
-; SSE-NEXT: cvttss2si %xmm1, %rax
-; SSE-NEXT: movabsq $-9223372036854775808, %rcx # imm = 0x8000000000000000
-; SSE-NEXT: xorq %rcx, %rax
-; SSE-NEXT: cvttss2si %xmm0, %rdx
-; SSE-NEXT: ucomiss %xmm2, %xmm0
-; SSE-NEXT: cmovaeq %rax, %rdx
-; SSE-NEXT: movq %rdx, %xmm1
-; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,2,3]
-; SSE-NEXT: movaps %xmm0, %xmm3
-; SSE-NEXT: subss %xmm2, %xmm3
-; SSE-NEXT: cvttss2si %xmm3, %rax
-; SSE-NEXT: xorq %rcx, %rax
-; SSE-NEXT: cvttss2si %xmm0, %rcx
-; SSE-NEXT: ucomiss %xmm2, %xmm0
-; SSE-NEXT: cmovaeq %rax, %rcx
-; SSE-NEXT: movq %rcx, %xmm0
-; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
-; SSE-NEXT: movdqa %xmm1, %xmm0
+; SSE-NEXT: cmpltps %xmm2, %xmm1
+; SSE-NEXT: cvttps2dq %xmm0, %xmm3
+; SSE-NEXT: subps %xmm2, %xmm0
+; SSE-NEXT: cvttps2dq %xmm0, %xmm0
+; SSE-NEXT: xorps {{.*}}(%rip), %xmm0
+; SSE-NEXT: andps %xmm1, %xmm3
+; SSE-NEXT: andnps %xmm0, %xmm1
+; SSE-NEXT: orps %xmm3, %xmm1
+; SSE-NEXT: movaps %xmm1, %xmm0
; SSE-NEXT: retq
;
-; VEX-LABEL: fptoui_2f32_to_2i32:
-; VEX: # %bb.0:
-; VEX-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; VEX-NEXT: vsubss %xmm1, %xmm0, %xmm2
-; VEX-NEXT: vcvttss2si %xmm2, %rax
-; VEX-NEXT: movabsq $-9223372036854775808, %rcx # imm = 0x8000000000000000
-; VEX-NEXT: xorq %rcx, %rax
-; VEX-NEXT: vcvttss2si %xmm0, %rdx
-; VEX-NEXT: vucomiss %xmm1, %xmm0
-; VEX-NEXT: cmovaeq %rax, %rdx
-; VEX-NEXT: vmovq %rdx, %xmm2
-; VEX-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; VEX-NEXT: vsubss %xmm1, %xmm0, %xmm3
-; VEX-NEXT: vcvttss2si %xmm3, %rax
-; VEX-NEXT: xorq %rcx, %rax
-; VEX-NEXT: vcvttss2si %xmm0, %rcx
-; VEX-NEXT: vucomiss %xmm1, %xmm0
-; VEX-NEXT: cmovaeq %rax, %rcx
-; VEX-NEXT: vmovq %rcx, %xmm0
-; VEX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm0[0]
-; VEX-NEXT: retq
+; AVX1-LABEL: fptoui_2f32_to_2i32:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vmovaps {{.*#+}} xmm1 = [2.14748365E+9,2.14748365E+9,2.14748365E+9,2.14748365E+9]
+; AVX1-NEXT: vcmpltps %xmm1, %xmm0, %xmm2
+; AVX1-NEXT: vsubps %xmm1, %xmm0, %xmm1
+; AVX1-NEXT: vcvttps2dq %xmm1, %xmm1
+; AVX1-NEXT: vxorps {{.*}}(%rip), %xmm1, %xmm1
+; AVX1-NEXT: vcvttps2dq %xmm0, %xmm0
+; AVX1-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: fptoui_2f32_to_2i32:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vbroadcastss {{.*#+}} xmm1 = [2.14748365E+9,2.14748365E+9,2.14748365E+9,2.14748365E+9]
+; AVX2-NEXT: vcmpltps %xmm1, %xmm0, %xmm2
+; AVX2-NEXT: vsubps %xmm1, %xmm0, %xmm1
+; AVX2-NEXT: vcvttps2dq %xmm1, %xmm1
+; AVX2-NEXT: vbroadcastss {{.*#+}} xmm3 = [2147483648,2147483648,2147483648,2147483648]
+; AVX2-NEXT: vxorps %xmm3, %xmm1, %xmm1
+; AVX2-NEXT: vcvttps2dq %xmm0, %xmm0
+; AVX2-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0
+; AVX2-NEXT: retq
;
; AVX512F-LABEL: fptoui_2f32_to_2i32:
; AVX512F: # %bb.0:
; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512F-NEXT: vcvttps2udq %zmm0, %zmm0
-; AVX512F-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
+; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: fptoui_2f32_to_2i32:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vcvttps2udq %xmm0, %xmm0
-; AVX512VL-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
; AVX512VL-NEXT: retq
;
; AVX512DQ-LABEL: fptoui_2f32_to_2i32:
; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512DQ-NEXT: vcvttps2udq %zmm0, %zmm0
-; AVX512DQ-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
+; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
; AVX512VLDQ-LABEL: fptoui_2f32_to_2i32:
; AVX512VLDQ: # %bb.0:
; AVX512VLDQ-NEXT: vcvttps2udq %xmm0, %xmm0
-; AVX512VLDQ-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
; AVX512VLDQ-NEXT: retq
%cvt = fptoui <2 x float> %a to <2 x i32>
ret <2 x i32> %cvt
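Without AVX512 there is no unsigned vector convert, so fptoui_2f32_to_2i32 (like the fptoui_2f64 tests above) uses the two-range trick visible in the CHECK lines: convert the input directly for values below 2^31, convert input - 2^31 and flip the sign bit for values at or above it, then blend on the cmpltps mask. A scalar sketch of the same logic (name illustrative; it assumes the input is in range for fptoui, as the IR does):

    #include <cstdint>

    static uint32_t f32_to_u32(float x) {
      const float two31 = 2147483648.0f;             // 2.14748365E+9 in the asm
      if (x < two31)                                 // cmpltps mask
        return (uint32_t)(int32_t)x;                 // cvttps2dq on the low range
      return (uint32_t)(int32_t)(x - two31) ^ 0x80000000u;  // convert, then set bit 31
    }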
; SSE-LABEL: fptosi_2f16_to_4i32:
; SSE: # %bb.0:
; SSE-NEXT: pushq %rax
-; SSE-NEXT: movss %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; SSE-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; SSE-NEXT: movaps %xmm1, %xmm0
; SSE-NEXT: callq __gnu_f2h_ieee
; SSE-NEXT: movzwl %ax, %edi
; SSE-NEXT: callq __gnu_h2f_ieee
; SSE-NEXT: callq __gnu_f2h_ieee
; SSE-NEXT: movzwl %ax, %edi
; SSE-NEXT: callq __gnu_h2f_ieee
-; SSE-NEXT: cvttss2si %xmm0, %rax
-; SSE-NEXT: movq %rax, %xmm1
-; SSE-NEXT: cvttss2si (%rsp), %rax # 4-byte Folded Reload
-; SSE-NEXT: movq %rax, %xmm0
-; SSE-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; SSE-NEXT: pxor %xmm1, %xmm1
-; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[2,3]
+; SSE-NEXT: cvttss2si %xmm0, %eax
+; SSE-NEXT: cvttss2si (%rsp), %ecx # 4-byte Folded Reload
+; SSE-NEXT: movd %ecx, %xmm0
+; SSE-NEXT: movd %eax, %xmm1
+; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; SSE-NEXT: movq {{.*#+}} xmm0 = xmm1[0],zero
; SSE-NEXT: popq %rax
; SSE-NEXT: retq
;
; VEX-LABEL: fptosi_2f16_to_4i32:
; VEX: # %bb.0:
; VEX-NEXT: pushq %rax
-; VEX-NEXT: vmovss %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; VEX-NEXT: vmovss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; VEX-NEXT: vmovaps %xmm1, %xmm0
; VEX-NEXT: callq __gnu_f2h_ieee
; VEX-NEXT: movzwl %ax, %edi
; VEX-NEXT: callq __gnu_h2f_ieee
; VEX-NEXT: callq __gnu_f2h_ieee
; VEX-NEXT: movzwl %ax, %edi
; VEX-NEXT: callq __gnu_h2f_ieee
-; VEX-NEXT: vcvttss2si %xmm0, %rax
-; VEX-NEXT: vmovq %rax, %xmm0
-; VEX-NEXT: vcvttss2si (%rsp), %rax # 4-byte Folded Reload
-; VEX-NEXT: vmovq %rax, %xmm1
-; VEX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
-; VEX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero
+; VEX-NEXT: vcvttss2si %xmm0, %eax
+; VEX-NEXT: vcvttss2si (%rsp), %ecx # 4-byte Folded Reload
+; VEX-NEXT: vmovd %ecx, %xmm0
+; VEX-NEXT: vmovd %eax, %xmm1
+; VEX-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; VEX-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
; VEX-NEXT: popq %rax
; VEX-NEXT: retq
;
; AVX512-LABEL: fptosi_2f16_to_4i32:
; AVX512: # %bb.0:
-; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0
-; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0
; AVX512-NEXT: vcvtps2ph $4, %xmm1, %xmm1
; AVX512-NEXT: vcvtph2ps %xmm1, %xmm1
-; AVX512-NEXT: vcvttss2si %xmm1, %rax
-; AVX512-NEXT: vmovq %rax, %xmm1
-; AVX512-NEXT: vcvttss2si %xmm0, %rax
-; AVX512-NEXT: vmovq %rax, %xmm0
-; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; AVX512-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero
+; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0
+; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0
+; AVX512-NEXT: vcvttss2si %xmm0, %eax
+; AVX512-NEXT: vcvttss2si %xmm1, %ecx
+; AVX512-NEXT: vmovd %ecx, %xmm0
+; AVX512-NEXT: vmovd %eax, %xmm1
+; AVX512-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; AVX512-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
; AVX512-NEXT: retq
%cvt = fptosi <2 x half> %a to <2 x i32>
%ext = shufflevector <2 x i32> %cvt, <2 x i32> zeroinitializer, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
; SSE-NEXT: orl $3072, %eax # imm = 0xC00
; SSE-NEXT: movw %ax, -{{[0-9]+}}(%rsp)
; SSE-NEXT: fldcw -{{[0-9]+}}(%rsp)
-; SSE-NEXT: fistpll -{{[0-9]+}}(%rsp)
+; SSE-NEXT: fistpl -{{[0-9]+}}(%rsp)
; SSE-NEXT: fldcw -{{[0-9]+}}(%rsp)
; SSE-NEXT: fnstcw -{{[0-9]+}}(%rsp)
; SSE-NEXT: movzwl -{{[0-9]+}}(%rsp), %eax
; SSE-NEXT: orl $3072, %eax # imm = 0xC00
; SSE-NEXT: movw %ax, -{{[0-9]+}}(%rsp)
; SSE-NEXT: fldcw -{{[0-9]+}}(%rsp)
-; SSE-NEXT: fistpll -{{[0-9]+}}(%rsp)
+; SSE-NEXT: fistpl -{{[0-9]+}}(%rsp)
; SSE-NEXT: fldcw -{{[0-9]+}}(%rsp)
-; SSE-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero
-; SSE-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
-; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; SSE-NEXT: xorps %xmm1, %xmm1
-; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[2,3]
+; SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; SSE-NEXT: movq {{.*#+}} xmm0 = xmm1[0],zero
; SSE-NEXT: retq
;
; AVX-LABEL: fptosi_2f80_to_4i32:
; AVX: # %bb.0:
; AVX-NEXT: fldt {{[0-9]+}}(%rsp)
; AVX-NEXT: fldt {{[0-9]+}}(%rsp)
-; AVX-NEXT: fisttpll -{{[0-9]+}}(%rsp)
-; AVX-NEXT: fisttpll -{{[0-9]+}}(%rsp)
-; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
-; AVX-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
-; AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
-; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero
+; AVX-NEXT: fisttpl -{{[0-9]+}}(%rsp)
+; AVX-NEXT: fisttpl -{{[0-9]+}}(%rsp)
+; AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; AVX-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; AVX-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; AVX-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
; AVX-NEXT: retq
%cvt = fptosi <2 x x86_fp80> %a to <2 x i32>
%ext = shufflevector <2 x i32> %cvt, <2 x i32> zeroinitializer, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
define <4 x i32> @fptosi_2f128_to_4i32(<2 x fp128> %a) nounwind {
; SSE-LABEL: fptosi_2f128_to_4i32:
; SSE: # %bb.0:
+; SSE-NEXT: pushq %rbp
; SSE-NEXT: pushq %r14
; SSE-NEXT: pushq %rbx
-; SSE-NEXT: subq $24, %rsp
-; SSE-NEXT: movq %rsi, %r14
-; SSE-NEXT: movq %rdi, %rbx
-; SSE-NEXT: movq %rdx, %rdi
-; SSE-NEXT: movq %rcx, %rsi
-; SSE-NEXT: callq __fixtfdi
-; SSE-NEXT: movq %rax, %xmm0
-; SSE-NEXT: movdqa %xmm0, (%rsp) # 16-byte Spill
+; SSE-NEXT: movq %rcx, %r14
+; SSE-NEXT: movq %rdx, %rbx
+; SSE-NEXT: callq __fixtfsi
+; SSE-NEXT: movl %eax, %ebp
; SSE-NEXT: movq %rbx, %rdi
; SSE-NEXT: movq %r14, %rsi
-; SSE-NEXT: callq __fixtfdi
-; SSE-NEXT: movq %rax, %xmm0
-; SSE-NEXT: punpcklqdq (%rsp), %xmm0 # 16-byte Folded Reload
-; SSE-NEXT: # xmm0 = xmm0[0],mem[0]
-; SSE-NEXT: xorps %xmm1, %xmm1
-; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[2,3]
-; SSE-NEXT: addq $24, %rsp
+; SSE-NEXT: callq __fixtfsi
+; SSE-NEXT: movd %eax, %xmm0
+; SSE-NEXT: movd %ebp, %xmm1
+; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; SSE-NEXT: movq {{.*#+}} xmm0 = xmm1[0],zero
; SSE-NEXT: popq %rbx
; SSE-NEXT: popq %r14
+; SSE-NEXT: popq %rbp
; SSE-NEXT: retq
;
; AVX-LABEL: fptosi_2f128_to_4i32:
; AVX: # %bb.0:
+; AVX-NEXT: pushq %rbp
; AVX-NEXT: pushq %r14
; AVX-NEXT: pushq %rbx
-; AVX-NEXT: subq $24, %rsp
-; AVX-NEXT: movq %rsi, %r14
-; AVX-NEXT: movq %rdi, %rbx
-; AVX-NEXT: movq %rdx, %rdi
-; AVX-NEXT: movq %rcx, %rsi
-; AVX-NEXT: callq __fixtfdi
-; AVX-NEXT: vmovq %rax, %xmm0
-; AVX-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill
+; AVX-NEXT: movq %rcx, %r14
+; AVX-NEXT: movq %rdx, %rbx
+; AVX-NEXT: callq __fixtfsi
+; AVX-NEXT: movl %eax, %ebp
; AVX-NEXT: movq %rbx, %rdi
; AVX-NEXT: movq %r14, %rsi
-; AVX-NEXT: callq __fixtfdi
-; AVX-NEXT: vmovq %rax, %xmm0
-; AVX-NEXT: vpunpcklqdq (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
-; AVX-NEXT: # xmm0 = xmm0[0],mem[0]
-; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero
-; AVX-NEXT: addq $24, %rsp
+; AVX-NEXT: callq __fixtfsi
+; AVX-NEXT: vmovd %eax, %xmm0
+; AVX-NEXT: vmovd %ebp, %xmm1
+; AVX-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; AVX-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
; AVX-NEXT: popq %rbx
; AVX-NEXT: popq %r14
+; AVX-NEXT: popq %rbp
; AVX-NEXT: retq
%cvt = fptosi <2 x fp128> %a to <2 x i32>
%ext = shufflevector <2 x i32> %cvt, <2 x i32> zeroinitializer, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
; SSE-LABEL: fptosi_2f32_to_2i8:
; SSE: # %bb.0:
; SSE-NEXT: cvttps2dq %xmm0, %xmm0
-; SSE-NEXT: pxor %xmm1, %xmm1
-; SSE-NEXT: pcmpgtd %xmm0, %xmm1
-; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE-NEXT: pand {{.*}}(%rip), %xmm0
+; SSE-NEXT: packuswb %xmm0, %xmm0
+; SSE-NEXT: packuswb %xmm0, %xmm0
; SSE-NEXT: retq
;
-; VEX-LABEL: fptosi_2f32_to_2i8:
-; VEX: # %bb.0:
-; VEX-NEXT: vcvttps2dq %xmm0, %xmm0
-; VEX-NEXT: vpmovsxdq %xmm0, %xmm0
-; VEX-NEXT: retq
-;
-; AVX512F-LABEL: fptosi_2f32_to_2i8:
-; AVX512F: # %bb.0:
-; AVX512F-NEXT: vcvttps2dq %xmm0, %xmm0
-; AVX512F-NEXT: vpmovsxdq %xmm0, %xmm0
-; AVX512F-NEXT: retq
-;
-; AVX512VL-LABEL: fptosi_2f32_to_2i8:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vcvttps2dq %xmm0, %xmm0
-; AVX512VL-NEXT: vpmovsxdq %xmm0, %xmm0
-; AVX512VL-NEXT: retq
-;
-; AVX512DQ-LABEL: fptosi_2f32_to_2i8:
-; AVX512DQ: # %bb.0:
-; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
-; AVX512DQ-NEXT: vcvttps2qq %ymm0, %zmm0
-; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
-; AVX512DQ-NEXT: vzeroupper
-; AVX512DQ-NEXT: retq
-;
-; AVX512VLDQ-LABEL: fptosi_2f32_to_2i8:
-; AVX512VLDQ: # %bb.0:
-; AVX512VLDQ-NEXT: vcvttps2qq %xmm0, %xmm0
-; AVX512VLDQ-NEXT: retq
+; AVX-LABEL: fptosi_2f32_to_2i8:
+; AVX: # %bb.0:
+; AVX-NEXT: vcvttps2dq %xmm0, %xmm0
+; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX-NEXT: retq
%cvt = fptosi <2 x float> %a to <2 x i8>
ret <2 x i8> %cvt
}
; SSE-LABEL: fptosi_2f32_to_2i16:
; SSE: # %bb.0:
; SSE-NEXT: cvttps2dq %xmm0, %xmm0
-; SSE-NEXT: pxor %xmm1, %xmm1
-; SSE-NEXT: pcmpgtd %xmm0, %xmm1
-; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
; SSE-NEXT: retq
;
-; VEX-LABEL: fptosi_2f32_to_2i16:
-; VEX: # %bb.0:
-; VEX-NEXT: vcvttps2dq %xmm0, %xmm0
-; VEX-NEXT: vpmovsxdq %xmm0, %xmm0
-; VEX-NEXT: retq
-;
-; AVX512F-LABEL: fptosi_2f32_to_2i16:
-; AVX512F: # %bb.0:
-; AVX512F-NEXT: vcvttps2dq %xmm0, %xmm0
-; AVX512F-NEXT: vpmovsxdq %xmm0, %xmm0
-; AVX512F-NEXT: retq
-;
-; AVX512VL-LABEL: fptosi_2f32_to_2i16:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vcvttps2dq %xmm0, %xmm0
-; AVX512VL-NEXT: vpmovsxdq %xmm0, %xmm0
-; AVX512VL-NEXT: retq
-;
-; AVX512DQ-LABEL: fptosi_2f32_to_2i16:
-; AVX512DQ: # %bb.0:
-; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
-; AVX512DQ-NEXT: vcvttps2qq %ymm0, %zmm0
-; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
-; AVX512DQ-NEXT: vzeroupper
-; AVX512DQ-NEXT: retq
-;
-; AVX512VLDQ-LABEL: fptosi_2f32_to_2i16:
-; AVX512VLDQ: # %bb.0:
-; AVX512VLDQ-NEXT: vcvttps2qq %xmm0, %xmm0
-; AVX512VLDQ-NEXT: retq
+; AVX-LABEL: fptosi_2f32_to_2i16:
+; AVX: # %bb.0:
+; AVX-NEXT: vcvttps2dq %xmm0, %xmm0
+; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
+; AVX-NEXT: retq
%cvt = fptosi <2 x float> %a to <2 x i16>
ret <2 x i16> %cvt
}
; SSE-LABEL: fptoui_2f32_to_2i8:
; SSE: # %bb.0:
; SSE-NEXT: cvttps2dq %xmm0, %xmm0
-; SSE-NEXT: xorps %xmm1, %xmm1
-; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE-NEXT: pand {{.*}}(%rip), %xmm0
+; SSE-NEXT: packuswb %xmm0, %xmm0
+; SSE-NEXT: packuswb %xmm0, %xmm0
; SSE-NEXT: retq
;
-; VEX-LABEL: fptoui_2f32_to_2i8:
-; VEX: # %bb.0:
-; VEX-NEXT: vcvttps2dq %xmm0, %xmm0
-; VEX-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
-; VEX-NEXT: retq
-;
-; AVX512F-LABEL: fptoui_2f32_to_2i8:
-; AVX512F: # %bb.0:
-; AVX512F-NEXT: vcvttps2dq %xmm0, %xmm0
-; AVX512F-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
-; AVX512F-NEXT: retq
-;
-; AVX512VL-LABEL: fptoui_2f32_to_2i8:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vcvttps2dq %xmm0, %xmm0
-; AVX512VL-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
-; AVX512VL-NEXT: retq
-;
-; AVX512DQ-LABEL: fptoui_2f32_to_2i8:
-; AVX512DQ: # %bb.0:
-; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
-; AVX512DQ-NEXT: vcvttps2uqq %ymm0, %zmm0
-; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
-; AVX512DQ-NEXT: vzeroupper
-; AVX512DQ-NEXT: retq
-;
-; AVX512VLDQ-LABEL: fptoui_2f32_to_2i8:
-; AVX512VLDQ: # %bb.0:
-; AVX512VLDQ-NEXT: vcvttps2uqq %xmm0, %xmm0
-; AVX512VLDQ-NEXT: retq
+; AVX-LABEL: fptoui_2f32_to_2i8:
+; AVX: # %bb.0:
+; AVX-NEXT: vcvttps2dq %xmm0, %xmm0
+; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX-NEXT: retq
%cvt = fptoui <2 x float> %a to <2 x i8>
ret <2 x i8> %cvt
}
; SSE-LABEL: fptoui_2f32_to_2i16:
; SSE: # %bb.0:
; SSE-NEXT: cvttps2dq %xmm0, %xmm0
-; SSE-NEXT: xorps %xmm1, %xmm1
-; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
; SSE-NEXT: retq
;
-; VEX-LABEL: fptoui_2f32_to_2i16:
-; VEX: # %bb.0:
-; VEX-NEXT: vcvttps2dq %xmm0, %xmm0
-; VEX-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
-; VEX-NEXT: retq
-;
-; AVX512F-LABEL: fptoui_2f32_to_2i16:
-; AVX512F: # %bb.0:
-; AVX512F-NEXT: vcvttps2dq %xmm0, %xmm0
-; AVX512F-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
-; AVX512F-NEXT: retq
-;
-; AVX512VL-LABEL: fptoui_2f32_to_2i16:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vcvttps2dq %xmm0, %xmm0
-; AVX512VL-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
-; AVX512VL-NEXT: retq
-;
-; AVX512DQ-LABEL: fptoui_2f32_to_2i16:
-; AVX512DQ: # %bb.0:
-; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
-; AVX512DQ-NEXT: vcvttps2uqq %ymm0, %zmm0
-; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
-; AVX512DQ-NEXT: vzeroupper
-; AVX512DQ-NEXT: retq
-;
-; AVX512VLDQ-LABEL: fptoui_2f32_to_2i16:
-; AVX512VLDQ: # %bb.0:
-; AVX512VLDQ-NEXT: vcvttps2uqq %xmm0, %xmm0
-; AVX512VLDQ-NEXT: retq
+; AVX-LABEL: fptoui_2f32_to_2i16:
+; AVX: # %bb.0:
+; AVX-NEXT: vcvttps2dq %xmm0, %xmm0
+; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
+; AVX-NEXT: retq
%cvt = fptoui <2 x float> %a to <2 x i16>
ret <2 x i16> %cvt
}
; SSE-LABEL: fptosi_2f64_to_2i8:
; SSE: # %bb.0:
; SSE-NEXT: cvttpd2dq %xmm0, %xmm0
-; SSE-NEXT: pxor %xmm1, %xmm1
-; SSE-NEXT: pcmpgtd %xmm0, %xmm1
-; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE-NEXT: andpd {{.*}}(%rip), %xmm0
+; SSE-NEXT: packuswb %xmm0, %xmm0
+; SSE-NEXT: packuswb %xmm0, %xmm0
; SSE-NEXT: retq
;
-; VEX-LABEL: fptosi_2f64_to_2i8:
-; VEX: # %bb.0:
-; VEX-NEXT: vcvttpd2dq %xmm0, %xmm0
-; VEX-NEXT: vpmovsxdq %xmm0, %xmm0
-; VEX-NEXT: retq
-;
-; AVX512F-LABEL: fptosi_2f64_to_2i8:
-; AVX512F: # %bb.0:
-; AVX512F-NEXT: vcvttpd2dq %xmm0, %xmm0
-; AVX512F-NEXT: vpmovsxdq %xmm0, %xmm0
-; AVX512F-NEXT: retq
-;
-; AVX512VL-LABEL: fptosi_2f64_to_2i8:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vcvttpd2dq %xmm0, %xmm0
-; AVX512VL-NEXT: vpmovsxdq %xmm0, %xmm0
-; AVX512VL-NEXT: retq
-;
-; AVX512DQ-LABEL: fptosi_2f64_to_2i8:
-; AVX512DQ: # %bb.0:
-; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
-; AVX512DQ-NEXT: vcvttpd2qq %zmm0, %zmm0
-; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
-; AVX512DQ-NEXT: vzeroupper
-; AVX512DQ-NEXT: retq
-;
-; AVX512VLDQ-LABEL: fptosi_2f64_to_2i8:
-; AVX512VLDQ: # %bb.0:
-; AVX512VLDQ-NEXT: vcvttpd2qq %xmm0, %xmm0
-; AVX512VLDQ-NEXT: retq
+; AVX-LABEL: fptosi_2f64_to_2i8:
+; AVX: # %bb.0:
+; AVX-NEXT: vcvttpd2dq %xmm0, %xmm0
+; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX-NEXT: retq
%cvt = fptosi <2 x double> %a to <2 x i8>
ret <2 x i8> %cvt
}
; SSE-LABEL: fptosi_2f64_to_2i16:
; SSE: # %bb.0:
; SSE-NEXT: cvttpd2dq %xmm0, %xmm0
-; SSE-NEXT: pxor %xmm1, %xmm1
-; SSE-NEXT: pcmpgtd %xmm0, %xmm1
-; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
; SSE-NEXT: retq
;
-; VEX-LABEL: fptosi_2f64_to_2i16:
-; VEX: # %bb.0:
-; VEX-NEXT: vcvttpd2dq %xmm0, %xmm0
-; VEX-NEXT: vpmovsxdq %xmm0, %xmm0
-; VEX-NEXT: retq
-;
-; AVX512F-LABEL: fptosi_2f64_to_2i16:
-; AVX512F: # %bb.0:
-; AVX512F-NEXT: vcvttpd2dq %xmm0, %xmm0
-; AVX512F-NEXT: vpmovsxdq %xmm0, %xmm0
-; AVX512F-NEXT: retq
-;
-; AVX512VL-LABEL: fptosi_2f64_to_2i16:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vcvttpd2dq %xmm0, %xmm0
-; AVX512VL-NEXT: vpmovsxdq %xmm0, %xmm0
-; AVX512VL-NEXT: retq
-;
-; AVX512DQ-LABEL: fptosi_2f64_to_2i16:
-; AVX512DQ: # %bb.0:
-; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
-; AVX512DQ-NEXT: vcvttpd2qq %zmm0, %zmm0
-; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
-; AVX512DQ-NEXT: vzeroupper
-; AVX512DQ-NEXT: retq
-;
-; AVX512VLDQ-LABEL: fptosi_2f64_to_2i16:
-; AVX512VLDQ: # %bb.0:
-; AVX512VLDQ-NEXT: vcvttpd2qq %xmm0, %xmm0
-; AVX512VLDQ-NEXT: retq
+; AVX-LABEL: fptosi_2f64_to_2i16:
+; AVX: # %bb.0:
+; AVX-NEXT: vcvttpd2dq %xmm0, %xmm0
+; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
+; AVX-NEXT: retq
%cvt = fptosi <2 x double> %a to <2 x i16>
ret <2 x i16> %cvt
}
; SSE-LABEL: fptoui_2f64_to_2i8:
; SSE: # %bb.0:
; SSE-NEXT: cvttpd2dq %xmm0, %xmm0
-; SSE-NEXT: xorpd %xmm1, %xmm1
-; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE-NEXT: andpd {{.*}}(%rip), %xmm0
+; SSE-NEXT: packuswb %xmm0, %xmm0
+; SSE-NEXT: packuswb %xmm0, %xmm0
; SSE-NEXT: retq
;
-; VEX-LABEL: fptoui_2f64_to_2i8:
-; VEX: # %bb.0:
-; VEX-NEXT: vcvttpd2dq %xmm0, %xmm0
-; VEX-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
-; VEX-NEXT: retq
-;
-; AVX512F-LABEL: fptoui_2f64_to_2i8:
-; AVX512F: # %bb.0:
-; AVX512F-NEXT: vcvttpd2dq %xmm0, %xmm0
-; AVX512F-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
-; AVX512F-NEXT: retq
-;
-; AVX512VL-LABEL: fptoui_2f64_to_2i8:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vcvttpd2dq %xmm0, %xmm0
-; AVX512VL-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
-; AVX512VL-NEXT: retq
-;
-; AVX512DQ-LABEL: fptoui_2f64_to_2i8:
-; AVX512DQ: # %bb.0:
-; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
-; AVX512DQ-NEXT: vcvttpd2uqq %zmm0, %zmm0
-; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
-; AVX512DQ-NEXT: vzeroupper
-; AVX512DQ-NEXT: retq
-;
-; AVX512VLDQ-LABEL: fptoui_2f64_to_2i8:
-; AVX512VLDQ: # %bb.0:
-; AVX512VLDQ-NEXT: vcvttpd2uqq %xmm0, %xmm0
-; AVX512VLDQ-NEXT: retq
+; AVX-LABEL: fptoui_2f64_to_2i8:
+; AVX: # %bb.0:
+; AVX-NEXT: vcvttpd2dq %xmm0, %xmm0
+; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX-NEXT: retq
%cvt = fptoui <2 x double> %a to <2 x i8>
ret <2 x i8> %cvt
}
; SSE-LABEL: fptoui_2f64_to_2i16:
; SSE: # %bb.0:
; SSE-NEXT: cvttpd2dq %xmm0, %xmm0
-; SSE-NEXT: xorpd %xmm1, %xmm1
-; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
; SSE-NEXT: retq
;
-; VEX-LABEL: fptoui_2f64_to_2i16:
-; VEX: # %bb.0:
-; VEX-NEXT: vcvttpd2dq %xmm0, %xmm0
-; VEX-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
-; VEX-NEXT: retq
-;
-; AVX512F-LABEL: fptoui_2f64_to_2i16:
-; AVX512F: # %bb.0:
-; AVX512F-NEXT: vcvttpd2dq %xmm0, %xmm0
-; AVX512F-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
-; AVX512F-NEXT: retq
-;
-; AVX512VL-LABEL: fptoui_2f64_to_2i16:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vcvttpd2dq %xmm0, %xmm0
-; AVX512VL-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
-; AVX512VL-NEXT: retq
-;
-; AVX512DQ-LABEL: fptoui_2f64_to_2i16:
-; AVX512DQ: # %bb.0:
-; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
-; AVX512DQ-NEXT: vcvttpd2uqq %zmm0, %zmm0
-; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
-; AVX512DQ-NEXT: vzeroupper
-; AVX512DQ-NEXT: retq
-;
-; AVX512VLDQ-LABEL: fptoui_2f64_to_2i16:
-; AVX512VLDQ: # %bb.0:
-; AVX512VLDQ-NEXT: vcvttpd2uqq %xmm0, %xmm0
-; AVX512VLDQ-NEXT: retq
+; AVX-LABEL: fptoui_2f64_to_2i16:
+; AVX: # %bb.0:
+; AVX-NEXT: vcvttpd2dq %xmm0, %xmm0
+; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
+; AVX-NEXT: retq
%cvt = fptoui <2 x double> %a to <2 x i16>
ret <2 x i16> %cvt
}
; AVX1: # %bb.0:
; AVX1-NEXT: vcvttps2dq %ymm1, %ymm1
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
-; AVX1-NEXT: vpackusdw %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpackssdw %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vcvttps2dq %ymm0, %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
-; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpackssdw %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
; AVX2: # %bb.0:
; AVX2-NEXT: vcvttps2dq %ymm1, %ymm1
; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
-; AVX2-NEXT: vpackusdw %xmm2, %xmm1, %xmm1
+; AVX2-NEXT: vpackssdw %xmm2, %xmm1, %xmm1
; AVX2-NEXT: vcvttps2dq %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2
-; AVX2-NEXT: vpackusdw %xmm2, %xmm0, %xmm0
+; AVX2-NEXT: vpackssdw %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
; X64: # %bb.0:
; X64-NEXT: shll $12, %edi
; X64-NEXT: movd %edi, %xmm0
-; X64-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7]
-; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,0,1,1]
; X64-NEXT: movq %xmm0, (%rsi)
; X64-NEXT: retq
%tmp12 = shl i32 %a, 12
define x86_mmx @mmx_movzl(x86_mmx %x) nounwind {
; X32-LABEL: mmx_movzl:
; X32: ## %bb.0:
-; X32-NEXT: subl $12, %esp
+; X32-NEXT: subl $28, %esp
+; X32-NEXT: movq %mm0, (%esp)
+; X32-NEXT: movdqa (%esp), %xmm0
; X32-NEXT: movl $32, %eax
-; X32-NEXT: movd %eax, %xmm0
-; X32-NEXT: movdq2q %xmm0, %mm0
-; X32-NEXT: addl $12, %esp
+; X32-NEXT: pinsrd $0, %eax, %xmm0
+; X32-NEXT: pxor %xmm1, %xmm1
+; X32-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7]
+; X32-NEXT: movdq2q %xmm1, %mm0
+; X32-NEXT: addl $28, %esp
; X32-NEXT: retl
;
; X64-LABEL: mmx_movzl:
; X32-NEXT: movl L_g0$non_lazy_ptr, %eax
; X32-NEXT: movl L_g1$non_lazy_ptr, %ecx
; X32-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
-; X32-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
-; X32-NEXT: movzwl (%eax), %eax
-; X32-NEXT: movd %eax, %xmm1
-; X32-NEXT: movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
-; X32-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
-; X32-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
-; X32-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; X32-NEXT: pinsrw $0, (%eax), %xmm0
; X32-NEXT: movq %xmm0, (%ecx)
; X32-NEXT: retl
;
; X64: ## %bb.0:
; X64-NEXT: movq _g0@{{.*}}(%rip), %rax
; X64-NEXT: movq _g1@{{.*}}(%rip), %rcx
-; X64-NEXT: pmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
-; X64-NEXT: movzwl (%rax), %eax
-; X64-NEXT: pinsrd $0, %eax, %xmm0
-; X64-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; X64-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
+; X64-NEXT: pinsrw $0, (%rax), %xmm0
; X64-NEXT: movq %xmm0, (%rcx)
; X64-NEXT: retq
load i16, i16* @g0
;
; SSE41-LABEL: sitofp_load_2i16_to_2f64:
; SSE41: # %bb.0:
-; SSE41-NEXT: pmovsxwq (%rdi), %xmm0
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; SSE41-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE41-NEXT: pmovsxwd %xmm0, %xmm0
; SSE41-NEXT: cvtdq2pd %xmm0, %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: sitofp_load_2i16_to_2f64:
; AVX: # %bb.0:
-; AVX-NEXT: vpmovsxwq (%rdi), %xmm0
-; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; AVX-NEXT: vpmovsxwd %xmm0, %xmm0
; AVX-NEXT: vcvtdq2pd %xmm0, %xmm0
; AVX-NEXT: retq
%ld = load <2 x i16>, <2 x i16> *%a
;
; SSE41-LABEL: sitofp_load_2i8_to_2f64:
; SSE41: # %bb.0:
-; SSE41-NEXT: pmovsxbq (%rdi), %xmm0
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; SSE41-NEXT: movzwl (%rdi), %eax
+; SSE41-NEXT: movd %eax, %xmm0
+; SSE41-NEXT: pmovsxbd %xmm0, %xmm0
; SSE41-NEXT: cvtdq2pd %xmm0, %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: sitofp_load_2i8_to_2f64:
; AVX: # %bb.0:
-; AVX-NEXT: vpmovsxbq (%rdi), %xmm0
-; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; AVX-NEXT: movzwl (%rdi), %eax
+; AVX-NEXT: vmovd %eax, %xmm0
+; AVX-NEXT: vpmovsxbd %xmm0, %xmm0
; AVX-NEXT: vcvtdq2pd %xmm0, %xmm0
; AVX-NEXT: retq
%ld = load <2 x i8>, <2 x i8> *%a
;
; SSE41-LABEL: uitofp_load_2i8_to_2f64:
; SSE41: # %bb.0:
-; SSE41-NEXT: pmovzxbq {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; SSE41-NEXT: movzwl (%rdi), %eax
+; SSE41-NEXT: movd %eax, %xmm0
+; SSE41-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; SSE41-NEXT: cvtdq2pd %xmm0, %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: uitofp_load_2i8_to_2f64:
; AVX: # %bb.0:
-; AVX-NEXT: vpmovzxbq {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero
-; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; AVX-NEXT: movzwl (%rdi), %eax
+; AVX-NEXT: vmovd %eax, %xmm0
+; AVX-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; AVX-NEXT: vcvtdq2pd %xmm0, %xmm0
; AVX-NEXT: retq
%ld = load <2 x i8>, <2 x i8> *%a
; SSE41-LABEL: aggregate_sitofp_8i16_to_8f32:
; SSE41: # %bb.0:
; SSE41-NEXT: movq 24(%rdi), %rax
-; SSE41-NEXT: movdqu 8(%rdi), %xmm0
-; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; SSE41-NEXT: pmovsxwd %xmm1, %xmm1
+; SSE41-NEXT: pmovsxwd 16(%rdi), %xmm0
+; SSE41-NEXT: pmovsxwd 8(%rdi), %xmm1
; SSE41-NEXT: cvtdq2ps %xmm1, %xmm1
-; SSE41-NEXT: pmovsxwd %xmm0, %xmm0
; SSE41-NEXT: cvtdq2ps %xmm0, %xmm0
-; SSE41-NEXT: movaps %xmm0, (%rax)
-; SSE41-NEXT: movaps %xmm1, 16(%rax)
+; SSE41-NEXT: movaps %xmm0, 16(%rax)
+; SSE41-NEXT: movaps %xmm1, (%rax)
; SSE41-NEXT: retq
;
; AVX1-LABEL: aggregate_sitofp_8i16_to_8f32:
}
define <2 x i32> @saddo_v2i32(<2 x i32> %a0, <2 x i32> %a1, <2 x i32>* %p2) nounwind {
-; SSE2-LABEL: saddo_v2i32:
-; SSE2: # %bb.0:
-; SSE2-NEXT: psllq $32, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,3,2,3]
-; SSE2-NEXT: psrad $31, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3]
-; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
-; SSE2-NEXT: psllq $32, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,3,2,3]
-; SSE2-NEXT: psrad $31, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
-; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; SSE2-NEXT: paddq %xmm2, %xmm1
-; SSE2-NEXT: movdqa %xmm1, %xmm0
-; SSE2-NEXT: psllq $32, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,3,2,3]
-; SSE2-NEXT: psrad $31, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
-; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
-; SSE2-NEXT: pcmpeqd %xmm1, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,0,3,2]
-; SSE2-NEXT: pand %xmm2, %xmm3
-; SSE2-NEXT: pcmpeqd %xmm0, %xmm0
-; SSE2-NEXT: pxor %xmm3, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; SSE2-NEXT: movq %xmm1, (%rdi)
-; SSE2-NEXT: retq
-;
-; SSSE3-LABEL: saddo_v2i32:
-; SSSE3: # %bb.0:
-; SSSE3-NEXT: psllq $32, %xmm1
-; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,3,2,3]
-; SSSE3-NEXT: psrad $31, %xmm1
-; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3]
-; SSSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
-; SSSE3-NEXT: psllq $32, %xmm0
-; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,3,2,3]
-; SSSE3-NEXT: psrad $31, %xmm0
-; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
-; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; SSSE3-NEXT: paddq %xmm2, %xmm1
-; SSSE3-NEXT: movdqa %xmm1, %xmm0
-; SSSE3-NEXT: psllq $32, %xmm0
-; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,3,2,3]
-; SSSE3-NEXT: psrad $31, %xmm0
-; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
-; SSSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
-; SSSE3-NEXT: pcmpeqd %xmm1, %xmm2
-; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,0,3,2]
-; SSSE3-NEXT: pand %xmm2, %xmm3
-; SSSE3-NEXT: pcmpeqd %xmm0, %xmm0
-; SSSE3-NEXT: pxor %xmm3, %xmm0
-; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; SSSE3-NEXT: movq %xmm1, (%rdi)
-; SSSE3-NEXT: retq
-;
-; SSE41-LABEL: saddo_v2i32:
-; SSE41: # %bb.0:
-; SSE41-NEXT: movdqa %xmm1, %xmm2
-; SSE41-NEXT: psllq $32, %xmm2
-; SSE41-NEXT: psrad $31, %xmm2
-; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
-; SSE41-NEXT: movdqa %xmm0, %xmm1
-; SSE41-NEXT: psllq $32, %xmm1
-; SSE41-NEXT: psrad $31, %xmm1
-; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
-; SSE41-NEXT: paddq %xmm2, %xmm1
-; SSE41-NEXT: movdqa %xmm1, %xmm0
-; SSE41-NEXT: psllq $32, %xmm0
-; SSE41-NEXT: psrad $31, %xmm0
-; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3],xmm1[4,5],xmm0[6,7]
-; SSE41-NEXT: pcmpeqq %xmm1, %xmm0
-; SSE41-NEXT: pcmpeqd %xmm2, %xmm2
-; SSE41-NEXT: pxor %xmm2, %xmm0
-; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; SSE41-NEXT: movq %xmm1, (%rdi)
-; SSE41-NEXT: retq
+; SSE-LABEL: saddo_v2i32:
+; SSE: # %bb.0:
+; SSE-NEXT: pxor %xmm2, %xmm2
+; SSE-NEXT: pxor %xmm3, %xmm3
+; SSE-NEXT: pcmpgtd %xmm1, %xmm3
+; SSE-NEXT: pcmpeqd %xmm4, %xmm4
+; SSE-NEXT: pxor %xmm4, %xmm3
+; SSE-NEXT: pxor %xmm5, %xmm5
+; SSE-NEXT: pcmpgtd %xmm0, %xmm5
+; SSE-NEXT: pxor %xmm4, %xmm5
+; SSE-NEXT: pcmpeqd %xmm5, %xmm3
+; SSE-NEXT: paddd %xmm1, %xmm0
+; SSE-NEXT: pcmpgtd %xmm0, %xmm2
+; SSE-NEXT: pxor %xmm4, %xmm2
+; SSE-NEXT: pcmpeqd %xmm5, %xmm2
+; SSE-NEXT: pandn %xmm3, %xmm2
+; SSE-NEXT: movq %xmm0, (%rdi)
+; SSE-NEXT: movdqa %xmm2, %xmm0
+; SSE-NEXT: retq
;
; AVX1-LABEL: saddo_v2i32:
; AVX1: # %bb.0:
-; AVX1-NEXT: vpsllq $32, %xmm1, %xmm2
-; AVX1-NEXT: vpsrad $31, %xmm2, %xmm2
-; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
-; AVX1-NEXT: vpsllq $32, %xmm0, %xmm2
-; AVX1-NEXT: vpsrad $31, %xmm2, %xmm2
-; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
-; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm1
-; AVX1-NEXT: vpsllq $32, %xmm1, %xmm0
-; AVX1-NEXT: vpsrad $31, %xmm0, %xmm0
-; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3],xmm1[4,5],xmm0[6,7]
-; AVX1-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
-; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm0
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX1-NEXT: vpcmpgtd %xmm1, %xmm2, %xmm3
+; AVX1-NEXT: vpcmpeqd %xmm4, %xmm4, %xmm4
+; AVX1-NEXT: vpxor %xmm4, %xmm3, %xmm3
+; AVX1-NEXT: vpcmpgtd %xmm0, %xmm2, %xmm5
+; AVX1-NEXT: vpxor %xmm4, %xmm5, %xmm5
+; AVX1-NEXT: vpcmpeqd %xmm3, %xmm5, %xmm3
+; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm1
+; AVX1-NEXT: vpcmpgtd %xmm1, %xmm2, %xmm0
+; AVX1-NEXT: vpxor %xmm4, %xmm0, %xmm0
+; AVX1-NEXT: vpcmpeqd %xmm0, %xmm5, %xmm0
+; AVX1-NEXT: vpandn %xmm3, %xmm0, %xmm0
; AVX1-NEXT: vmovq %xmm1, (%rdi)
; AVX1-NEXT: retq
;
; AVX2-LABEL: saddo_v2i32:
; AVX2: # %bb.0:
-; AVX2-NEXT: vpsllq $32, %xmm1, %xmm2
-; AVX2-NEXT: vpsrad $31, %xmm2, %xmm2
-; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
-; AVX2-NEXT: vpsllq $32, %xmm0, %xmm2
-; AVX2-NEXT: vpsrad $31, %xmm2, %xmm2
-; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
-; AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm1
-; AVX2-NEXT: vpsllq $32, %xmm1, %xmm0
-; AVX2-NEXT: vpsrad $31, %xmm0, %xmm0
-; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2],xmm0[3]
-; AVX2-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
-; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm0
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX2-NEXT: vpcmpgtd %xmm1, %xmm2, %xmm3
+; AVX2-NEXT: vpcmpeqd %xmm4, %xmm4, %xmm4
+; AVX2-NEXT: vpxor %xmm4, %xmm3, %xmm3
+; AVX2-NEXT: vpcmpgtd %xmm0, %xmm2, %xmm5
+; AVX2-NEXT: vpxor %xmm4, %xmm5, %xmm5
+; AVX2-NEXT: vpcmpeqd %xmm3, %xmm5, %xmm3
+; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm1
+; AVX2-NEXT: vpcmpgtd %xmm1, %xmm2, %xmm0
+; AVX2-NEXT: vpxor %xmm4, %xmm0, %xmm0
+; AVX2-NEXT: vpcmpeqd %xmm0, %xmm5, %xmm0
+; AVX2-NEXT: vpandn %xmm3, %xmm0, %xmm0
; AVX2-NEXT: vmovq %xmm1, (%rdi)
; AVX2-NEXT: retq
;
; AVX512-LABEL: saddo_v2i32:
; AVX512: # %bb.0:
-; AVX512-NEXT: vpsllq $32, %xmm1, %xmm1
-; AVX512-NEXT: vpsraq $32, %xmm1, %xmm1
-; AVX512-NEXT: vpsllq $32, %xmm0, %xmm0
-; AVX512-NEXT: vpsraq $32, %xmm0, %xmm0
-; AVX512-NEXT: vpaddq %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpsllq $32, %xmm0, %xmm1
-; AVX512-NEXT: vpsraq $32, %xmm1, %xmm1
-; AVX512-NEXT: vpmovqd %xmm0, (%rdi)
-; AVX512-NEXT: vpcmpeqq %xmm0, %xmm1, %xmm0
-; AVX512-NEXT: vpternlogq $15, %xmm0, %xmm0, %xmm0
+; AVX512-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX512-NEXT: vpcmpnltd %xmm2, %xmm1, %k0
+; AVX512-NEXT: vpcmpnltd %xmm2, %xmm0, %k1
+; AVX512-NEXT: kxorw %k0, %k1, %k0
+; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm1
+; AVX512-NEXT: vpcmpnltd %xmm2, %xmm1, %k2
+; AVX512-NEXT: kxorw %k2, %k1, %k1
+; AVX512-NEXT: kandnw %k1, %k0, %k1
+; AVX512-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
+; AVX512-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z}
+; AVX512-NEXT: vmovq %xmm1, (%rdi)
; AVX512-NEXT: retq
%t = call {<2 x i32>, <2 x i1>} @llvm.sadd.with.overflow.v2i32(<2 x i32> %a0, <2 x i32> %a1)
%val = extractvalue {<2 x i32>, <2 x i1>} %t, 0
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,0,3,2]
; SSE2-NEXT: pand %xmm2, %xmm0
; SSE2-NEXT: pandn %xmm3, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE2-NEXT: retq
;
; SSSE3-LABEL: saddo_v2i64:
; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,0,3,2]
; SSSE3-NEXT: pand %xmm2, %xmm0
; SSSE3-NEXT: pandn %xmm3, %xmm0
+; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSSE3-NEXT: retq
;
; SSE41-LABEL: saddo_v2i64:
; SSE41-NEXT: pcmpgtd %xmm0, %xmm3
; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm3[0,0,2,2]
; SSE41-NEXT: pcmpeqd %xmm2, %xmm0
-; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
-; SSE41-NEXT: pand %xmm6, %xmm2
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3]
-; SSE41-NEXT: por %xmm2, %xmm0
-; SSE41-NEXT: pxor %xmm1, %xmm0
-; SSE41-NEXT: pcmpeqq %xmm5, %xmm0
-; SSE41-NEXT: pandn %xmm4, %xmm0
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE41-NEXT: pand %xmm6, %xmm0
+; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3]
+; SSE41-NEXT: por %xmm0, %xmm2
+; SSE41-NEXT: pxor %xmm1, %xmm2
+; SSE41-NEXT: pcmpeqq %xmm5, %xmm2
+; SSE41-NEXT: pandn %xmm4, %xmm2
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
; SSE41-NEXT: retq
;
; AVX1-LABEL: saddo_v2i64:
; AVX1-NEXT: vpxor %xmm4, %xmm0, %xmm0
; AVX1-NEXT: vpcmpeqq %xmm0, %xmm5, %xmm0
; AVX1-NEXT: vpandn %xmm3, %xmm0, %xmm0
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; AVX1-NEXT: vmovdqa %xmm1, (%rdi)
; AVX1-NEXT: retq
;
; AVX2-NEXT: vpxor %xmm4, %xmm0, %xmm0
; AVX2-NEXT: vpcmpeqq %xmm0, %xmm5, %xmm0
; AVX2-NEXT: vpandn %xmm3, %xmm0, %xmm0
+; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; AVX2-NEXT: vmovdqa %xmm1, (%rdi)
; AVX2-NEXT: retq
;
; AVX512-NEXT: vpcmpnltq %xmm2, %xmm1, %k0
; AVX512-NEXT: vpcmpnltq %xmm2, %xmm0, %k1
; AVX512-NEXT: kxorw %k0, %k1, %k0
-; AVX512-NEXT: vpaddq %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpcmpnltq %xmm2, %xmm0, %k2
+; AVX512-NEXT: vpaddq %xmm1, %xmm0, %xmm1
+; AVX512-NEXT: vpcmpnltq %xmm2, %xmm1, %k2
; AVX512-NEXT: kxorw %k2, %k1, %k1
; AVX512-NEXT: kandnw %k1, %k0, %k1
-; AVX512-NEXT: vmovdqa %xmm0, (%rdi)
; AVX512-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
-; AVX512-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z}
+; AVX512-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z}
+; AVX512-NEXT: vmovdqa %xmm1, (%rdi)
; AVX512-NEXT: retq
%t = call {<2 x i64>, <2 x i1>} @llvm.sadd.with.overflow.v2i64(<2 x i64> %a0, <2 x i64> %a1)
%val = extractvalue {<2 x i64>, <2 x i1>} %t, 0
; SSE2: # %bb.0:
; SSE2-NEXT: pushq %rbp
; SSE2-NEXT: pushq %rbx
-; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %r10
; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %r11
+; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %r10
+; SSE2-NEXT: testq %r9, %r9
+; SSE2-NEXT: setns %al
+; SSE2-NEXT: testq %rsi, %rsi
+; SSE2-NEXT: setns %bl
+; SSE2-NEXT: cmpb %al, %bl
+; SSE2-NEXT: sete %bpl
+; SSE2-NEXT: addq %r8, %rdi
+; SSE2-NEXT: adcq %r9, %rsi
+; SSE2-NEXT: setns %al
+; SSE2-NEXT: cmpb %al, %bl
+; SSE2-NEXT: setne %al
+; SSE2-NEXT: andb %bpl, %al
; SSE2-NEXT: addq {{[0-9]+}}(%rsp), %rdx
-; SSE2-NEXT: movq %rcx, %rax
-; SSE2-NEXT: adcq %r11, %rax
+; SSE2-NEXT: movq %rcx, %rbp
+; SSE2-NEXT: adcq %r10, %rbp
; SSE2-NEXT: setns %bl
; SSE2-NEXT: testq %rcx, %rcx
; SSE2-NEXT: setns %cl
; SSE2-NEXT: cmpb %bl, %cl
-; SSE2-NEXT: setne %bpl
-; SSE2-NEXT: testq %r11, %r11
+; SSE2-NEXT: setne %r8b
+; SSE2-NEXT: testq %r10, %r10
; SSE2-NEXT: setns %bl
; SSE2-NEXT: cmpb %bl, %cl
; SSE2-NEXT: sete %cl
-; SSE2-NEXT: andb %bpl, %cl
-; SSE2-NEXT: movzbl %cl, %ebp
-; SSE2-NEXT: testq %r9, %r9
-; SSE2-NEXT: setns %bl
-; SSE2-NEXT: testq %rsi, %rsi
-; SSE2-NEXT: setns %cl
-; SSE2-NEXT: cmpb %bl, %cl
-; SSE2-NEXT: sete %r11b
-; SSE2-NEXT: addq %r8, %rdi
-; SSE2-NEXT: adcq %r9, %rsi
-; SSE2-NEXT: setns %bl
-; SSE2-NEXT: cmpb %bl, %cl
-; SSE2-NEXT: setne %cl
-; SSE2-NEXT: andb %r11b, %cl
+; SSE2-NEXT: andb %r8b, %cl
; SSE2-NEXT: movzbl %cl, %ecx
-; SSE2-NEXT: movd %ecx, %xmm0
-; SSE2-NEXT: pinsrw $4, %ebp, %xmm0
-; SSE2-NEXT: movq %rdx, 16(%r10)
-; SSE2-NEXT: movq %rdi, (%r10)
-; SSE2-NEXT: movq %rax, 24(%r10)
-; SSE2-NEXT: movq %rsi, 8(%r10)
-; SSE2-NEXT: psllq $63, %xmm0
-; SSE2-NEXT: psrad $31, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE2-NEXT: negl %ecx
+; SSE2-NEXT: movd %ecx, %xmm1
+; SSE2-NEXT: movzbl %al, %eax
+; SSE2-NEXT: negl %eax
+; SSE2-NEXT: movd %eax, %xmm0
+; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE2-NEXT: movq %rdx, 16(%r11)
+; SSE2-NEXT: movq %rdi, (%r11)
+; SSE2-NEXT: movq %rbp, 24(%r11)
+; SSE2-NEXT: movq %rsi, 8(%r11)
; SSE2-NEXT: popq %rbx
; SSE2-NEXT: popq %rbp
; SSE2-NEXT: retq
; SSSE3: # %bb.0:
; SSSE3-NEXT: pushq %rbp
; SSSE3-NEXT: pushq %rbx
-; SSSE3-NEXT: movq {{[0-9]+}}(%rsp), %r10
; SSSE3-NEXT: movq {{[0-9]+}}(%rsp), %r11
+; SSSE3-NEXT: movq {{[0-9]+}}(%rsp), %r10
+; SSSE3-NEXT: testq %r9, %r9
+; SSSE3-NEXT: setns %al
+; SSSE3-NEXT: testq %rsi, %rsi
+; SSSE3-NEXT: setns %bl
+; SSSE3-NEXT: cmpb %al, %bl
+; SSSE3-NEXT: sete %bpl
+; SSSE3-NEXT: addq %r8, %rdi
+; SSSE3-NEXT: adcq %r9, %rsi
+; SSSE3-NEXT: setns %al
+; SSSE3-NEXT: cmpb %al, %bl
+; SSSE3-NEXT: setne %al
+; SSSE3-NEXT: andb %bpl, %al
; SSSE3-NEXT: addq {{[0-9]+}}(%rsp), %rdx
-; SSSE3-NEXT: movq %rcx, %rax
-; SSSE3-NEXT: adcq %r11, %rax
+; SSSE3-NEXT: movq %rcx, %rbp
+; SSSE3-NEXT: adcq %r10, %rbp
; SSSE3-NEXT: setns %bl
; SSSE3-NEXT: testq %rcx, %rcx
; SSSE3-NEXT: setns %cl
; SSSE3-NEXT: cmpb %bl, %cl
-; SSSE3-NEXT: setne %bpl
-; SSSE3-NEXT: testq %r11, %r11
+; SSSE3-NEXT: setne %r8b
+; SSSE3-NEXT: testq %r10, %r10
; SSSE3-NEXT: setns %bl
; SSSE3-NEXT: cmpb %bl, %cl
; SSSE3-NEXT: sete %cl
-; SSSE3-NEXT: andb %bpl, %cl
-; SSSE3-NEXT: movzbl %cl, %ebp
-; SSSE3-NEXT: testq %r9, %r9
-; SSSE3-NEXT: setns %bl
-; SSSE3-NEXT: testq %rsi, %rsi
-; SSSE3-NEXT: setns %cl
-; SSSE3-NEXT: cmpb %bl, %cl
-; SSSE3-NEXT: sete %r11b
-; SSSE3-NEXT: addq %r8, %rdi
-; SSSE3-NEXT: adcq %r9, %rsi
-; SSSE3-NEXT: setns %bl
-; SSSE3-NEXT: cmpb %bl, %cl
-; SSSE3-NEXT: setne %cl
-; SSSE3-NEXT: andb %r11b, %cl
+; SSSE3-NEXT: andb %r8b, %cl
; SSSE3-NEXT: movzbl %cl, %ecx
-; SSSE3-NEXT: movd %ecx, %xmm0
-; SSSE3-NEXT: pinsrw $4, %ebp, %xmm0
-; SSSE3-NEXT: movq %rdx, 16(%r10)
-; SSSE3-NEXT: movq %rdi, (%r10)
-; SSSE3-NEXT: movq %rax, 24(%r10)
-; SSSE3-NEXT: movq %rsi, 8(%r10)
-; SSSE3-NEXT: psllq $63, %xmm0
-; SSSE3-NEXT: psrad $31, %xmm0
-; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSSE3-NEXT: negl %ecx
+; SSSE3-NEXT: movd %ecx, %xmm1
+; SSSE3-NEXT: movzbl %al, %eax
+; SSSE3-NEXT: negl %eax
+; SSSE3-NEXT: movd %eax, %xmm0
+; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSSE3-NEXT: movq %rdx, 16(%r11)
+; SSSE3-NEXT: movq %rdi, (%r11)
+; SSSE3-NEXT: movq %rbp, 24(%r11)
+; SSSE3-NEXT: movq %rsi, 8(%r11)
; SSSE3-NEXT: popq %rbx
; SSSE3-NEXT: popq %rbp
; SSSE3-NEXT: retq
; SSE41: # %bb.0:
; SSE41-NEXT: pushq %rbp
; SSE41-NEXT: pushq %rbx
-; SSE41-NEXT: movq {{[0-9]+}}(%rsp), %r10
; SSE41-NEXT: movq {{[0-9]+}}(%rsp), %r11
+; SSE41-NEXT: movq {{[0-9]+}}(%rsp), %r10
+; SSE41-NEXT: testq %r9, %r9
+; SSE41-NEXT: setns %al
+; SSE41-NEXT: testq %rsi, %rsi
+; SSE41-NEXT: setns %bl
+; SSE41-NEXT: cmpb %al, %bl
+; SSE41-NEXT: sete %bpl
+; SSE41-NEXT: addq %r8, %rdi
+; SSE41-NEXT: adcq %r9, %rsi
+; SSE41-NEXT: setns %al
+; SSE41-NEXT: cmpb %al, %bl
+; SSE41-NEXT: setne %al
+; SSE41-NEXT: andb %bpl, %al
; SSE41-NEXT: addq {{[0-9]+}}(%rsp), %rdx
-; SSE41-NEXT: movq %rcx, %rax
-; SSE41-NEXT: adcq %r11, %rax
+; SSE41-NEXT: movq %rcx, %rbp
+; SSE41-NEXT: adcq %r10, %rbp
; SSE41-NEXT: setns %bl
; SSE41-NEXT: testq %rcx, %rcx
; SSE41-NEXT: setns %cl
; SSE41-NEXT: cmpb %bl, %cl
-; SSE41-NEXT: setne %bpl
-; SSE41-NEXT: testq %r11, %r11
+; SSE41-NEXT: setne %r8b
+; SSE41-NEXT: testq %r10, %r10
; SSE41-NEXT: setns %bl
; SSE41-NEXT: cmpb %bl, %cl
; SSE41-NEXT: sete %cl
-; SSE41-NEXT: andb %bpl, %cl
-; SSE41-NEXT: movzbl %cl, %ebp
-; SSE41-NEXT: testq %r9, %r9
-; SSE41-NEXT: setns %bl
-; SSE41-NEXT: testq %rsi, %rsi
-; SSE41-NEXT: setns %cl
-; SSE41-NEXT: cmpb %bl, %cl
-; SSE41-NEXT: sete %r11b
-; SSE41-NEXT: addq %r8, %rdi
-; SSE41-NEXT: adcq %r9, %rsi
-; SSE41-NEXT: setns %bl
-; SSE41-NEXT: cmpb %bl, %cl
-; SSE41-NEXT: setne %cl
-; SSE41-NEXT: andb %r11b, %cl
+; SSE41-NEXT: andb %r8b, %cl
; SSE41-NEXT: movzbl %cl, %ecx
-; SSE41-NEXT: movd %ecx, %xmm0
-; SSE41-NEXT: pinsrb $8, %ebp, %xmm0
-; SSE41-NEXT: movq %rdx, 16(%r10)
-; SSE41-NEXT: movq %rdi, (%r10)
-; SSE41-NEXT: movq %rax, 24(%r10)
-; SSE41-NEXT: movq %rsi, 8(%r10)
-; SSE41-NEXT: psllq $63, %xmm0
-; SSE41-NEXT: psrad $31, %xmm0
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE41-NEXT: negl %ecx
+; SSE41-NEXT: movzbl %al, %eax
+; SSE41-NEXT: negl %eax
+; SSE41-NEXT: movd %eax, %xmm0
+; SSE41-NEXT: pinsrd $1, %ecx, %xmm0
+; SSE41-NEXT: movq %rdx, 16(%r11)
+; SSE41-NEXT: movq %rdi, (%r11)
+; SSE41-NEXT: movq %rbp, 24(%r11)
+; SSE41-NEXT: movq %rsi, 8(%r11)
; SSE41-NEXT: popq %rbx
; SSE41-NEXT: popq %rbp
; SSE41-NEXT: retq
; AVX1: # %bb.0:
; AVX1-NEXT: pushq %rbp
; AVX1-NEXT: pushq %rbx
-; AVX1-NEXT: movq {{[0-9]+}}(%rsp), %r10
; AVX1-NEXT: movq {{[0-9]+}}(%rsp), %r11
+; AVX1-NEXT: movq {{[0-9]+}}(%rsp), %r10
+; AVX1-NEXT: testq %r9, %r9
+; AVX1-NEXT: setns %al
+; AVX1-NEXT: testq %rsi, %rsi
+; AVX1-NEXT: setns %bl
+; AVX1-NEXT: cmpb %al, %bl
+; AVX1-NEXT: sete %bpl
+; AVX1-NEXT: addq %r8, %rdi
+; AVX1-NEXT: adcq %r9, %rsi
+; AVX1-NEXT: setns %al
+; AVX1-NEXT: cmpb %al, %bl
+; AVX1-NEXT: setne %al
+; AVX1-NEXT: andb %bpl, %al
; AVX1-NEXT: addq {{[0-9]+}}(%rsp), %rdx
-; AVX1-NEXT: movq %rcx, %rax
-; AVX1-NEXT: adcq %r11, %rax
+; AVX1-NEXT: movq %rcx, %rbp
+; AVX1-NEXT: adcq %r10, %rbp
; AVX1-NEXT: setns %bl
; AVX1-NEXT: testq %rcx, %rcx
; AVX1-NEXT: setns %cl
; AVX1-NEXT: cmpb %bl, %cl
-; AVX1-NEXT: setne %bpl
-; AVX1-NEXT: testq %r11, %r11
+; AVX1-NEXT: setne %r8b
+; AVX1-NEXT: testq %r10, %r10
; AVX1-NEXT: setns %bl
; AVX1-NEXT: cmpb %bl, %cl
; AVX1-NEXT: sete %cl
-; AVX1-NEXT: andb %bpl, %cl
-; AVX1-NEXT: movzbl %cl, %ebp
-; AVX1-NEXT: testq %r9, %r9
-; AVX1-NEXT: setns %bl
-; AVX1-NEXT: testq %rsi, %rsi
-; AVX1-NEXT: setns %cl
-; AVX1-NEXT: cmpb %bl, %cl
-; AVX1-NEXT: sete %r11b
-; AVX1-NEXT: addq %r8, %rdi
-; AVX1-NEXT: adcq %r9, %rsi
-; AVX1-NEXT: setns %bl
-; AVX1-NEXT: cmpb %bl, %cl
-; AVX1-NEXT: setne %cl
-; AVX1-NEXT: andb %r11b, %cl
+; AVX1-NEXT: andb %r8b, %cl
; AVX1-NEXT: movzbl %cl, %ecx
-; AVX1-NEXT: vmovd %ecx, %xmm0
-; AVX1-NEXT: vpinsrb $8, %ebp, %xmm0, %xmm0
-; AVX1-NEXT: movq %rdx, 16(%r10)
-; AVX1-NEXT: movq %rdi, (%r10)
-; AVX1-NEXT: movq %rax, 24(%r10)
-; AVX1-NEXT: movq %rsi, 8(%r10)
-; AVX1-NEXT: vpsllq $63, %xmm0, %xmm0
-; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
+; AVX1-NEXT: negl %ecx
+; AVX1-NEXT: movzbl %al, %eax
+; AVX1-NEXT: negl %eax
+; AVX1-NEXT: vmovd %eax, %xmm0
+; AVX1-NEXT: vpinsrd $1, %ecx, %xmm0, %xmm0
+; AVX1-NEXT: movq %rdx, 16(%r11)
+; AVX1-NEXT: movq %rdi, (%r11)
+; AVX1-NEXT: movq %rbp, 24(%r11)
+; AVX1-NEXT: movq %rsi, 8(%r11)
; AVX1-NEXT: popq %rbx
; AVX1-NEXT: popq %rbp
; AVX1-NEXT: retq
; AVX2: # %bb.0:
; AVX2-NEXT: pushq %rbp
; AVX2-NEXT: pushq %rbx
-; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %r10
; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %r11
+; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %r10
+; AVX2-NEXT: testq %r9, %r9
+; AVX2-NEXT: setns %al
+; AVX2-NEXT: testq %rsi, %rsi
+; AVX2-NEXT: setns %bl
+; AVX2-NEXT: cmpb %al, %bl
+; AVX2-NEXT: sete %bpl
+; AVX2-NEXT: addq %r8, %rdi
+; AVX2-NEXT: adcq %r9, %rsi
+; AVX2-NEXT: setns %al
+; AVX2-NEXT: cmpb %al, %bl
+; AVX2-NEXT: setne %al
+; AVX2-NEXT: andb %bpl, %al
; AVX2-NEXT: addq {{[0-9]+}}(%rsp), %rdx
-; AVX2-NEXT: movq %rcx, %rax
-; AVX2-NEXT: adcq %r11, %rax
+; AVX2-NEXT: movq %rcx, %rbp
+; AVX2-NEXT: adcq %r10, %rbp
; AVX2-NEXT: setns %bl
; AVX2-NEXT: testq %rcx, %rcx
; AVX2-NEXT: setns %cl
; AVX2-NEXT: cmpb %bl, %cl
-; AVX2-NEXT: setne %bpl
-; AVX2-NEXT: testq %r11, %r11
+; AVX2-NEXT: setne %r8b
+; AVX2-NEXT: testq %r10, %r10
; AVX2-NEXT: setns %bl
; AVX2-NEXT: cmpb %bl, %cl
; AVX2-NEXT: sete %cl
-; AVX2-NEXT: andb %bpl, %cl
-; AVX2-NEXT: movzbl %cl, %ebp
-; AVX2-NEXT: testq %r9, %r9
-; AVX2-NEXT: setns %bl
-; AVX2-NEXT: testq %rsi, %rsi
-; AVX2-NEXT: setns %cl
-; AVX2-NEXT: cmpb %bl, %cl
-; AVX2-NEXT: sete %r11b
-; AVX2-NEXT: addq %r8, %rdi
-; AVX2-NEXT: adcq %r9, %rsi
-; AVX2-NEXT: setns %bl
-; AVX2-NEXT: cmpb %bl, %cl
-; AVX2-NEXT: setne %cl
-; AVX2-NEXT: andb %r11b, %cl
+; AVX2-NEXT: andb %r8b, %cl
; AVX2-NEXT: movzbl %cl, %ecx
-; AVX2-NEXT: vmovd %ecx, %xmm0
-; AVX2-NEXT: vpinsrb $8, %ebp, %xmm0, %xmm0
-; AVX2-NEXT: movq %rdx, 16(%r10)
-; AVX2-NEXT: movq %rdi, (%r10)
-; AVX2-NEXT: movq %rax, 24(%r10)
-; AVX2-NEXT: movq %rsi, 8(%r10)
-; AVX2-NEXT: vpsllq $63, %xmm0, %xmm0
-; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
+; AVX2-NEXT: negl %ecx
+; AVX2-NEXT: movzbl %al, %eax
+; AVX2-NEXT: negl %eax
+; AVX2-NEXT: vmovd %eax, %xmm0
+; AVX2-NEXT: vpinsrd $1, %ecx, %xmm0, %xmm0
+; AVX2-NEXT: movq %rdx, 16(%r11)
+; AVX2-NEXT: movq %rdi, (%r11)
+; AVX2-NEXT: movq %rbp, 24(%r11)
+; AVX2-NEXT: movq %rsi, 8(%r11)
; AVX2-NEXT: popq %rbx
; AVX2-NEXT: popq %rbp
; AVX2-NEXT: retq
; AVX512-NEXT: andl $1, %ecx
; AVX512-NEXT: kmovw %ecx, %k1
; AVX512-NEXT: korw %k0, %k1, %k1
+; AVX512-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
+; AVX512-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z}
; AVX512-NEXT: movq %rdx, 16(%r10)
; AVX512-NEXT: movq %rdi, (%r10)
; AVX512-NEXT: movq %r14, 24(%r10)
; AVX512-NEXT: movq %rsi, 8(%r10)
-; AVX512-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
-; AVX512-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z}
; AVX512-NEXT: popq %rbx
; AVX512-NEXT: popq %r14
; AVX512-NEXT: retq
define <2 x i32> @smulo_v2i32(<2 x i32> %a0, <2 x i32> %a1, <2 x i32>* %p2) nounwind {
; SSE2-LABEL: smulo_v2i32:
; SSE2: # %bb.0:
-; SSE2-NEXT: psllq $32, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,3,2,3]
-; SSE2-NEXT: psrad $31, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3]
-; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[2,3,0,1]
-; SSE2-NEXT: movq %xmm1, %r8
-; SSE2-NEXT: psllq $32, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,3,2,3]
-; SSE2-NEXT: psrad $31, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
-; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
-; SSE2-NEXT: movq %xmm0, %rcx
-; SSE2-NEXT: movq %xmm2, %rdx
-; SSE2-NEXT: movq %xmm1, %rsi
-; SSE2-NEXT: xorl %eax, %eax
-; SSE2-NEXT: imulq %rdx, %rsi
-; SSE2-NEXT: movq $-1, %r9
-; SSE2-NEXT: movl $0, %edx
-; SSE2-NEXT: cmovoq %r9, %rdx
-; SSE2-NEXT: movq %rsi, %xmm1
-; SSE2-NEXT: imulq %r8, %rcx
-; SSE2-NEXT: movq %rcx, %xmm0
-; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
-; SSE2-NEXT: movdqa %xmm1, %xmm0
-; SSE2-NEXT: psllq $32, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,3,2,3]
-; SSE2-NEXT: psrad $31, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
-; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
-; SSE2-NEXT: pcmpeqd %xmm1, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,0,3,2]
-; SSE2-NEXT: pand %xmm2, %xmm0
-; SSE2-NEXT: pcmpeqd %xmm2, %xmm2
-; SSE2-NEXT: pxor %xmm0, %xmm2
-; SSE2-NEXT: movq %rdx, %xmm0
-; SSE2-NEXT: cmovoq %r9, %rax
-; SSE2-NEXT: movq %rax, %xmm3
-; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm3[0]
-; SSE2-NEXT: por %xmm2, %xmm0
+; SSE2-NEXT: pxor %xmm2, %xmm2
+; SSE2-NEXT: pxor %xmm3, %xmm3
+; SSE2-NEXT: pcmpgtd %xmm1, %xmm3
+; SSE2-NEXT: pand %xmm0, %xmm3
+; SSE2-NEXT: pcmpgtd %xmm0, %xmm2
+; SSE2-NEXT: pand %xmm1, %xmm2
+; SSE2-NEXT: paddd %xmm3, %xmm2
+; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
+; SSE2-NEXT: pmuludq %xmm1, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,3,2,3]
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; SSE2-NEXT: pmuludq %xmm3, %xmm1
+; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,3,2,3]
+; SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
+; SSE2-NEXT: psubd %xmm2, %xmm4
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; SSE2-NEXT: movq %xmm1, (%rdi)
+; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE2-NEXT: movq %xmm0, (%rdi)
+; SSE2-NEXT: movdqa %xmm0, %xmm1
+; SSE2-NEXT: psrad $31, %xmm1
+; SSE2-NEXT: pcmpeqd %xmm4, %xmm1
+; SSE2-NEXT: pcmpeqd %xmm0, %xmm0
+; SSE2-NEXT: pxor %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSSE3-LABEL: smulo_v2i32:
; SSSE3: # %bb.0:
-; SSSE3-NEXT: psllq $32, %xmm1
-; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,3,2,3]
-; SSSE3-NEXT: psrad $31, %xmm1
-; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3]
-; SSSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
-; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm2[2,3,0,1]
-; SSSE3-NEXT: movq %xmm1, %r8
-; SSSE3-NEXT: psllq $32, %xmm0
-; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,3,2,3]
-; SSSE3-NEXT: psrad $31, %xmm0
-; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
-; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
-; SSSE3-NEXT: movq %xmm0, %rcx
-; SSSE3-NEXT: movq %xmm2, %rdx
-; SSSE3-NEXT: movq %xmm1, %rsi
-; SSSE3-NEXT: xorl %eax, %eax
-; SSSE3-NEXT: imulq %rdx, %rsi
-; SSSE3-NEXT: movq $-1, %r9
-; SSSE3-NEXT: movl $0, %edx
-; SSSE3-NEXT: cmovoq %r9, %rdx
-; SSSE3-NEXT: movq %rsi, %xmm1
-; SSSE3-NEXT: imulq %r8, %rcx
-; SSSE3-NEXT: movq %rcx, %xmm0
-; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
-; SSSE3-NEXT: movdqa %xmm1, %xmm0
-; SSSE3-NEXT: psllq $32, %xmm0
-; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,3,2,3]
-; SSSE3-NEXT: psrad $31, %xmm0
-; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
-; SSSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
-; SSSE3-NEXT: pcmpeqd %xmm1, %xmm2
-; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,0,3,2]
-; SSSE3-NEXT: pand %xmm2, %xmm0
-; SSSE3-NEXT: pcmpeqd %xmm2, %xmm2
-; SSSE3-NEXT: pxor %xmm0, %xmm2
-; SSSE3-NEXT: movq %rdx, %xmm0
-; SSSE3-NEXT: cmovoq %r9, %rax
-; SSSE3-NEXT: movq %rax, %xmm3
-; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm3[0]
-; SSSE3-NEXT: por %xmm2, %xmm0
+; SSSE3-NEXT: pxor %xmm2, %xmm2
+; SSSE3-NEXT: pxor %xmm3, %xmm3
+; SSSE3-NEXT: pcmpgtd %xmm1, %xmm3
+; SSSE3-NEXT: pand %xmm0, %xmm3
+; SSSE3-NEXT: pcmpgtd %xmm0, %xmm2
+; SSSE3-NEXT: pand %xmm1, %xmm2
+; SSSE3-NEXT: paddd %xmm3, %xmm2
+; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
+; SSSE3-NEXT: pmuludq %xmm1, %xmm0
+; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,3,2,3]
+; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; SSSE3-NEXT: pmuludq %xmm3, %xmm1
+; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,3,2,3]
+; SSSE3-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
+; SSSE3-NEXT: psubd %xmm2, %xmm4
+; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; SSSE3-NEXT: movq %xmm1, (%rdi)
+; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSSE3-NEXT: movq %xmm0, (%rdi)
+; SSSE3-NEXT: movdqa %xmm0, %xmm1
+; SSSE3-NEXT: psrad $31, %xmm1
+; SSSE3-NEXT: pcmpeqd %xmm4, %xmm1
+; SSSE3-NEXT: pcmpeqd %xmm0, %xmm0
+; SSSE3-NEXT: pxor %xmm1, %xmm0
; SSSE3-NEXT: retq
;
; SSE41-LABEL: smulo_v2i32:
; SSE41: # %bb.0:
-; SSE41-NEXT: movdqa %xmm1, %xmm2
-; SSE41-NEXT: psllq $32, %xmm2
-; SSE41-NEXT: psrad $31, %xmm2
-; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
-; SSE41-NEXT: movq %xmm2, %r8
-; SSE41-NEXT: movdqa %xmm0, %xmm1
-; SSE41-NEXT: psllq $32, %xmm1
-; SSE41-NEXT: psrad $31, %xmm1
-; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
-; SSE41-NEXT: movq %xmm1, %rcx
-; SSE41-NEXT: pextrq $1, %xmm2, %rdx
-; SSE41-NEXT: pextrq $1, %xmm1, %rsi
-; SSE41-NEXT: xorl %eax, %eax
-; SSE41-NEXT: imulq %rdx, %rsi
-; SSE41-NEXT: movq $-1, %r9
-; SSE41-NEXT: movl $0, %edx
-; SSE41-NEXT: cmovoq %r9, %rdx
-; SSE41-NEXT: movq %rsi, %xmm0
-; SSE41-NEXT: imulq %r8, %rcx
-; SSE41-NEXT: movq %rcx, %xmm1
-; SSE41-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
-; SSE41-NEXT: movdqa %xmm1, %xmm0
-; SSE41-NEXT: psllq $32, %xmm0
+; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
+; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
+; SSE41-NEXT: pmuldq %xmm2, %xmm3
+; SSE41-NEXT: movdqa %xmm0, %xmm2
+; SSE41-NEXT: pmuldq %xmm1, %xmm2
+; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
+; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3],xmm2[4,5],xmm3[6,7]
+; SSE41-NEXT: pmulld %xmm1, %xmm0
+; SSE41-NEXT: movq %xmm0, (%rdi)
; SSE41-NEXT: psrad $31, %xmm0
-; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3],xmm1[4,5],xmm0[6,7]
-; SSE41-NEXT: pcmpeqq %xmm1, %xmm0
-; SSE41-NEXT: pcmpeqd %xmm2, %xmm2
-; SSE41-NEXT: pxor %xmm0, %xmm2
-; SSE41-NEXT: movq %rdx, %xmm3
-; SSE41-NEXT: cmovoq %r9, %rax
-; SSE41-NEXT: movq %rax, %xmm0
-; SSE41-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm3[0]
-; SSE41-NEXT: por %xmm2, %xmm0
-; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; SSE41-NEXT: movq %xmm1, (%rdi)
+; SSE41-NEXT: pcmpeqd %xmm2, %xmm0
+; SSE41-NEXT: pcmpeqd %xmm1, %xmm1
+; SSE41-NEXT: pxor %xmm1, %xmm0
; SSE41-NEXT: retq
;
; AVX1-LABEL: smulo_v2i32:
; AVX1: # %bb.0:
-; AVX1-NEXT: vpsllq $32, %xmm1, %xmm2
-; AVX1-NEXT: vpsrad $31, %xmm2, %xmm2
-; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
-; AVX1-NEXT: vmovq %xmm1, %r8
-; AVX1-NEXT: vpsllq $32, %xmm0, %xmm2
-; AVX1-NEXT: vpsrad $31, %xmm2, %xmm2
-; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
-; AVX1-NEXT: vmovq %xmm0, %rcx
-; AVX1-NEXT: vpextrq $1, %xmm1, %rdx
-; AVX1-NEXT: vpextrq $1, %xmm0, %rsi
-; AVX1-NEXT: xorl %eax, %eax
-; AVX1-NEXT: imulq %rdx, %rsi
-; AVX1-NEXT: movq $-1, %r9
-; AVX1-NEXT: movl $0, %edx
-; AVX1-NEXT: cmovoq %r9, %rdx
-; AVX1-NEXT: vmovq %rsi, %xmm0
-; AVX1-NEXT: imulq %r8, %rcx
-; AVX1-NEXT: vmovq %rcx, %xmm1
-; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
-; AVX1-NEXT: vpsllq $32, %xmm1, %xmm0
-; AVX1-NEXT: vpsrad $31, %xmm0, %xmm0
-; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3],xmm1[4,5],xmm0[6,7]
-; AVX1-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
+; AVX1-NEXT: vpmuldq %xmm2, %xmm3, %xmm2
+; AVX1-NEXT: vpmuldq %xmm1, %xmm0, %xmm3
+; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3],xmm3[4,5],xmm2[6,7]
+; AVX1-NEXT: vpmulld %xmm1, %xmm0, %xmm1
+; AVX1-NEXT: vpsrad $31, %xmm1, %xmm0
+; AVX1-NEXT: vpcmpeqd %xmm0, %xmm2, %xmm0
; AVX1-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm0
-; AVX1-NEXT: vmovq %rdx, %xmm2
-; AVX1-NEXT: cmovoq %r9, %rax
-; AVX1-NEXT: vmovq %rax, %xmm3
-; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0]
-; AVX1-NEXT: vpor %xmm2, %xmm0, %xmm0
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; AVX1-NEXT: vmovq %xmm1, (%rdi)
; AVX1-NEXT: retq
;
; AVX2-LABEL: smulo_v2i32:
; AVX2: # %bb.0:
-; AVX2-NEXT: vpsllq $32, %xmm1, %xmm2
-; AVX2-NEXT: vpsrad $31, %xmm2, %xmm2
-; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
-; AVX2-NEXT: vmovq %xmm1, %r8
-; AVX2-NEXT: vpsllq $32, %xmm0, %xmm2
-; AVX2-NEXT: vpsrad $31, %xmm2, %xmm2
-; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
-; AVX2-NEXT: vmovq %xmm0, %rcx
-; AVX2-NEXT: vpextrq $1, %xmm1, %rdx
-; AVX2-NEXT: vpextrq $1, %xmm0, %rsi
-; AVX2-NEXT: xorl %eax, %eax
-; AVX2-NEXT: imulq %rdx, %rsi
-; AVX2-NEXT: movq $-1, %r9
-; AVX2-NEXT: movl $0, %edx
-; AVX2-NEXT: cmovoq %r9, %rdx
-; AVX2-NEXT: vmovq %rsi, %xmm0
-; AVX2-NEXT: imulq %r8, %rcx
-; AVX2-NEXT: vmovq %rcx, %xmm1
-; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
-; AVX2-NEXT: vpsllq $32, %xmm1, %xmm0
-; AVX2-NEXT: vpsrad $31, %xmm0, %xmm0
-; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2],xmm0[3]
-; AVX2-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
+; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
+; AVX2-NEXT: vpmuldq %xmm2, %xmm3, %xmm2
+; AVX2-NEXT: vpmuldq %xmm1, %xmm0, %xmm3
+; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
+; AVX2-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0],xmm2[1],xmm3[2],xmm2[3]
+; AVX2-NEXT: vpmulld %xmm1, %xmm0, %xmm1
+; AVX2-NEXT: vpsrad $31, %xmm1, %xmm0
+; AVX2-NEXT: vpcmpeqd %xmm0, %xmm2, %xmm0
; AVX2-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm0
-; AVX2-NEXT: vmovq %rdx, %xmm2
-; AVX2-NEXT: cmovoq %r9, %rax
-; AVX2-NEXT: vmovq %rax, %xmm3
-; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0]
-; AVX2-NEXT: vpor %xmm2, %xmm0, %xmm0
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; AVX2-NEXT: vmovq %xmm1, (%rdi)
; AVX2-NEXT: retq
;
; AVX512-LABEL: smulo_v2i32:
; AVX512: # %bb.0:
-; AVX512-NEXT: vpsllq $32, %xmm1, %xmm1
-; AVX512-NEXT: vpsraq $32, %xmm1, %xmm1
-; AVX512-NEXT: vmovq %xmm1, %rax
-; AVX512-NEXT: vpsllq $32, %xmm0, %xmm0
-; AVX512-NEXT: vpsraq $32, %xmm0, %xmm0
-; AVX512-NEXT: vmovq %xmm0, %rcx
-; AVX512-NEXT: vpextrq $1, %xmm1, %rdx
-; AVX512-NEXT: vpextrq $1, %xmm0, %rsi
-; AVX512-NEXT: imulq %rdx, %rsi
-; AVX512-NEXT: seto %dl
-; AVX512-NEXT: vmovq %rsi, %xmm0
-; AVX512-NEXT: imulq %rax, %rcx
-; AVX512-NEXT: vmovq %rcx, %xmm1
-; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
-; AVX512-NEXT: vpsllq $32, %xmm0, %xmm1
-; AVX512-NEXT: vpsraq $32, %xmm1, %xmm1
-; AVX512-NEXT: vpcmpneqq %xmm0, %xmm1, %k0
-; AVX512-NEXT: kmovd %edx, %k1
-; AVX512-NEXT: kshiftlw $1, %k1, %k1
-; AVX512-NEXT: seto %al
-; AVX512-NEXT: andl $1, %eax
-; AVX512-NEXT: kmovw %eax, %k2
-; AVX512-NEXT: korw %k1, %k2, %k1
-; AVX512-NEXT: korw %k1, %k0, %k1
-; AVX512-NEXT: vpmovqd %xmm0, (%rdi)
+; AVX512-NEXT: vpmuldq %xmm1, %xmm0, %xmm2
+; AVX512-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[1,1,3,3]
+; AVX512-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[1,1,3,3]
+; AVX512-NEXT: vpmuldq %xmm3, %xmm4, %xmm3
+; AVX512-NEXT: vmovdqa {{.*#+}} xmm4 = [1,5,3,7]
+; AVX512-NEXT: vpermi2d %xmm3, %xmm2, %xmm4
+; AVX512-NEXT: vpmulld %xmm1, %xmm0, %xmm1
+; AVX512-NEXT: vpsrad $31, %xmm1, %xmm0
+; AVX512-NEXT: vpcmpneqd %xmm0, %xmm4, %k1
; AVX512-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
-; AVX512-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z}
+; AVX512-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z}
+; AVX512-NEXT: vmovq %xmm1, (%rdi)
; AVX512-NEXT: retq
%t = call {<2 x i32>, <2 x i1>} @llvm.smul.with.overflow.v2i32(<2 x i32> %a0, <2 x i32> %a1)
%val = extractvalue {<2 x i32>, <2 x i1>} %t, 0
; SSE2-NEXT: movq $-1, %r9
; SSE2-NEXT: movl $0, %edx
; SSE2-NEXT: cmovoq %r9, %rdx
-; SSE2-NEXT: movq %rdx, %xmm0
+; SSE2-NEXT: movq %rsi, %xmm1
; SSE2-NEXT: imulq %r8, %rcx
+; SSE2-NEXT: movq %rcx, %xmm0
+; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
+; SSE2-NEXT: movq %rdx, %xmm0
; SSE2-NEXT: cmovoq %r9, %rax
-; SSE2-NEXT: movq %rax, %xmm1
-; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; SSE2-NEXT: movq %rsi, %xmm1
-; SSE2-NEXT: movq %rcx, %xmm2
-; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; SSE2-NEXT: movq %rax, %xmm2
+; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE2-NEXT: movdqa %xmm1, (%rdi)
; SSE2-NEXT: retq
;
; SSSE3-NEXT: movq $-1, %r9
; SSSE3-NEXT: movl $0, %edx
; SSSE3-NEXT: cmovoq %r9, %rdx
-; SSSE3-NEXT: movq %rdx, %xmm0
+; SSSE3-NEXT: movq %rsi, %xmm1
; SSSE3-NEXT: imulq %r8, %rcx
+; SSSE3-NEXT: movq %rcx, %xmm0
+; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
+; SSSE3-NEXT: movq %rdx, %xmm0
; SSSE3-NEXT: cmovoq %r9, %rax
-; SSSE3-NEXT: movq %rax, %xmm1
-; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; SSSE3-NEXT: movq %rsi, %xmm1
-; SSSE3-NEXT: movq %rcx, %xmm2
-; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; SSSE3-NEXT: movq %rax, %xmm2
+; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
+; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSSE3-NEXT: movdqa %xmm1, (%rdi)
; SSSE3-NEXT: retq
;
; SSE41-NEXT: movq $-1, %r9
; SSE41-NEXT: movl $0, %edx
; SSE41-NEXT: cmovoq %r9, %rdx
-; SSE41-NEXT: movq %rdx, %xmm1
+; SSE41-NEXT: movq %rsi, %xmm0
; SSE41-NEXT: imulq %r8, %rcx
+; SSE41-NEXT: movq %rcx, %xmm1
+; SSE41-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
+; SSE41-NEXT: movq %rdx, %xmm0
; SSE41-NEXT: cmovoq %r9, %rax
-; SSE41-NEXT: movq %rax, %xmm0
-; SSE41-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; SSE41-NEXT: movq %rsi, %xmm1
-; SSE41-NEXT: movq %rcx, %xmm2
-; SSE41-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm1[0]
-; SSE41-NEXT: movdqa %xmm2, (%rdi)
+; SSE41-NEXT: movq %rax, %xmm2
+; SSE41-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm0[0]
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
+; SSE41-NEXT: movdqa %xmm1, (%rdi)
; SSE41-NEXT: retq
;
; AVX1-LABEL: smulo_v2i64:
; AVX1-NEXT: movq $-1, %r9
; AVX1-NEXT: movl $0, %edx
; AVX1-NEXT: cmovoq %r9, %rdx
-; AVX1-NEXT: vmovq %rdx, %xmm0
+; AVX1-NEXT: vmovq %rsi, %xmm0
; AVX1-NEXT: imulq %r8, %rcx
+; AVX1-NEXT: vmovq %rcx, %xmm1
+; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
+; AVX1-NEXT: vmovq %rdx, %xmm0
; AVX1-NEXT: cmovoq %r9, %rax
-; AVX1-NEXT: vmovq %rax, %xmm1
-; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
-; AVX1-NEXT: vmovq %rsi, %xmm1
-; AVX1-NEXT: vmovq %rcx, %xmm2
-; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
+; AVX1-NEXT: vmovq %rax, %xmm2
+; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm0[0]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; AVX1-NEXT: vmovdqa %xmm1, (%rdi)
; AVX1-NEXT: retq
;
; AVX2-NEXT: movq $-1, %r9
; AVX2-NEXT: movl $0, %edx
; AVX2-NEXT: cmovoq %r9, %rdx
-; AVX2-NEXT: vmovq %rdx, %xmm0
+; AVX2-NEXT: vmovq %rsi, %xmm0
; AVX2-NEXT: imulq %r8, %rcx
+; AVX2-NEXT: vmovq %rcx, %xmm1
+; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
+; AVX2-NEXT: vmovq %rdx, %xmm0
; AVX2-NEXT: cmovoq %r9, %rax
-; AVX2-NEXT: vmovq %rax, %xmm1
-; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
-; AVX2-NEXT: vmovq %rsi, %xmm1
-; AVX2-NEXT: vmovq %rcx, %xmm2
-; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
+; AVX2-NEXT: vmovq %rax, %xmm2
+; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm0[0]
+; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; AVX2-NEXT: vmovdqa %xmm1, (%rdi)
; AVX2-NEXT: retq
;
; AVX512-LABEL: smulo_v2i64:
; AVX512: # %bb.0:
-; AVX512-NEXT: vmovq %xmm1, %rax
-; AVX512-NEXT: vmovq %xmm0, %rcx
-; AVX512-NEXT: vpextrq $1, %xmm1, %rdx
-; AVX512-NEXT: vpextrq $1, %xmm0, %rsi
+; AVX512-NEXT: vpextrq $1, %xmm1, %rax
+; AVX512-NEXT: vpextrq $1, %xmm0, %rcx
+; AVX512-NEXT: vmovq %xmm1, %rdx
+; AVX512-NEXT: vmovq %xmm0, %rsi
; AVX512-NEXT: imulq %rdx, %rsi
; AVX512-NEXT: seto %dl
-; AVX512-NEXT: kmovd %edx, %k0
-; AVX512-NEXT: kshiftlw $1, %k0, %k0
; AVX512-NEXT: imulq %rax, %rcx
+; AVX512-NEXT: vmovq %rcx, %xmm0
+; AVX512-NEXT: vmovq %rsi, %xmm1
+; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; AVX512-NEXT: seto %al
-; AVX512-NEXT: andl $1, %eax
-; AVX512-NEXT: kmovw %eax, %k1
-; AVX512-NEXT: korw %k0, %k1, %k1
-; AVX512-NEXT: vmovq %rsi, %xmm0
-; AVX512-NEXT: vmovq %rcx, %xmm1
-; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
-; AVX512-NEXT: vmovdqa %xmm0, (%rdi)
+; AVX512-NEXT: kmovd %eax, %k0
+; AVX512-NEXT: kmovd %edx, %k1
+; AVX512-NEXT: kshiftrw $1, %k1, %k2
+; AVX512-NEXT: kxorw %k0, %k2, %k0
+; AVX512-NEXT: kshiftlw $15, %k0, %k0
+; AVX512-NEXT: kshiftrw $14, %k0, %k0
+; AVX512-NEXT: kxorw %k0, %k1, %k1
; AVX512-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
-; AVX512-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z}
+; AVX512-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z}
+; AVX512-NEXT: vmovdqa %xmm1, (%rdi)
; AVX512-NEXT: retq
%t = call {<2 x i64>, <2 x i1>} @llvm.smul.with.overflow.v2i64(<2 x i64> %a0, <2 x i64> %a1)
%val = extractvalue {<2 x i64>, <2 x i1>} %t, 0
; SSE2-NEXT: movq %r12, %rcx
; SSE2-NEXT: callq __muloti4
; SSE2-NEXT: xorl %ecx, %ecx
-; SSE2-NEXT: cmpq $0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: setne %cl
-; SSE2-NEXT: xorl %esi, %esi
-; SSE2-NEXT: cmpq $0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: setne %sil
-; SSE2-NEXT: movd %esi, %xmm0
-; SSE2-NEXT: pinsrw $4, %ecx, %xmm0
+; SSE2-NEXT: cmpq {{[0-9]+}}(%rsp), %rcx
+; SSE2-NEXT: sbbl %esi, %esi
+; SSE2-NEXT: movd %esi, %xmm1
+; SSE2-NEXT: cmpq {{[0-9]+}}(%rsp), %rcx
+; SSE2-NEXT: sbbl %ecx, %ecx
+; SSE2-NEXT: movd %ecx, %xmm0
+; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE2-NEXT: movq %rdx, 24(%r15)
; SSE2-NEXT: movq %rax, 16(%r15)
; SSE2-NEXT: movq %rbp, 8(%r15)
; SSE2-NEXT: movq %r13, (%r15)
-; SSE2-NEXT: psllq $63, %xmm0
-; SSE2-NEXT: psrad $31, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
; SSE2-NEXT: addq $24, %rsp
; SSE2-NEXT: popq %rbx
; SSE2-NEXT: popq %r12
; SSSE3-NEXT: movq %r12, %rcx
; SSSE3-NEXT: callq __muloti4
; SSSE3-NEXT: xorl %ecx, %ecx
-; SSSE3-NEXT: cmpq $0, {{[0-9]+}}(%rsp)
-; SSSE3-NEXT: setne %cl
-; SSSE3-NEXT: xorl %esi, %esi
-; SSSE3-NEXT: cmpq $0, {{[0-9]+}}(%rsp)
-; SSSE3-NEXT: setne %sil
-; SSSE3-NEXT: movd %esi, %xmm0
-; SSSE3-NEXT: pinsrw $4, %ecx, %xmm0
+; SSSE3-NEXT: cmpq {{[0-9]+}}(%rsp), %rcx
+; SSSE3-NEXT: sbbl %esi, %esi
+; SSSE3-NEXT: movd %esi, %xmm1
+; SSSE3-NEXT: cmpq {{[0-9]+}}(%rsp), %rcx
+; SSSE3-NEXT: sbbl %ecx, %ecx
+; SSSE3-NEXT: movd %ecx, %xmm0
+; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSSE3-NEXT: movq %rdx, 24(%r15)
; SSSE3-NEXT: movq %rax, 16(%r15)
; SSSE3-NEXT: movq %rbp, 8(%r15)
; SSSE3-NEXT: movq %r13, (%r15)
-; SSSE3-NEXT: psllq $63, %xmm0
-; SSSE3-NEXT: psrad $31, %xmm0
-; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
; SSSE3-NEXT: addq $24, %rsp
; SSSE3-NEXT: popq %rbx
; SSSE3-NEXT: popq %r12
; SSE41-NEXT: movq %r12, %rcx
; SSE41-NEXT: callq __muloti4
; SSE41-NEXT: xorl %ecx, %ecx
-; SSE41-NEXT: cmpq $0, {{[0-9]+}}(%rsp)
-; SSE41-NEXT: setne %cl
-; SSE41-NEXT: xorl %esi, %esi
-; SSE41-NEXT: cmpq $0, {{[0-9]+}}(%rsp)
-; SSE41-NEXT: setne %sil
-; SSE41-NEXT: movd %esi, %xmm0
-; SSE41-NEXT: pinsrb $8, %ecx, %xmm0
+; SSE41-NEXT: cmpq {{[0-9]+}}(%rsp), %rcx
+; SSE41-NEXT: sbbl %esi, %esi
+; SSE41-NEXT: cmpq {{[0-9]+}}(%rsp), %rcx
+; SSE41-NEXT: sbbl %ecx, %ecx
+; SSE41-NEXT: movd %ecx, %xmm0
+; SSE41-NEXT: pinsrd $1, %esi, %xmm0
; SSE41-NEXT: movq %rdx, 24(%r15)
; SSE41-NEXT: movq %rax, 16(%r15)
; SSE41-NEXT: movq %rbp, 8(%r15)
; SSE41-NEXT: movq %r13, (%r15)
-; SSE41-NEXT: psllq $63, %xmm0
-; SSE41-NEXT: psrad $31, %xmm0
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
; SSE41-NEXT: addq $24, %rsp
; SSE41-NEXT: popq %rbx
; SSE41-NEXT: popq %r12
; AVX1-NEXT: movq %r12, %rcx
; AVX1-NEXT: callq __muloti4
; AVX1-NEXT: xorl %ecx, %ecx
-; AVX1-NEXT: cmpq $0, {{[0-9]+}}(%rsp)
-; AVX1-NEXT: setne %cl
-; AVX1-NEXT: xorl %esi, %esi
-; AVX1-NEXT: cmpq $0, {{[0-9]+}}(%rsp)
-; AVX1-NEXT: setne %sil
-; AVX1-NEXT: vmovd %esi, %xmm0
-; AVX1-NEXT: vpinsrb $8, %ecx, %xmm0, %xmm0
+; AVX1-NEXT: cmpq {{[0-9]+}}(%rsp), %rcx
+; AVX1-NEXT: sbbl %esi, %esi
+; AVX1-NEXT: cmpq {{[0-9]+}}(%rsp), %rcx
+; AVX1-NEXT: sbbl %ecx, %ecx
+; AVX1-NEXT: vmovd %ecx, %xmm0
+; AVX1-NEXT: vpinsrd $1, %esi, %xmm0, %xmm0
; AVX1-NEXT: movq %rdx, 24(%r15)
; AVX1-NEXT: movq %rax, 16(%r15)
; AVX1-NEXT: movq %rbp, 8(%r15)
; AVX1-NEXT: movq %r13, (%r15)
-; AVX1-NEXT: vpsllq $63, %xmm0, %xmm0
-; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; AVX1-NEXT: addq $24, %rsp
; AVX1-NEXT: popq %rbx
; AVX1-NEXT: popq %r12
; AVX2-NEXT: movq %r12, %rcx
; AVX2-NEXT: callq __muloti4
; AVX2-NEXT: xorl %ecx, %ecx
-; AVX2-NEXT: cmpq $0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: setne %cl
-; AVX2-NEXT: xorl %esi, %esi
-; AVX2-NEXT: cmpq $0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: setne %sil
-; AVX2-NEXT: vmovd %esi, %xmm0
-; AVX2-NEXT: vpinsrb $8, %ecx, %xmm0, %xmm0
+; AVX2-NEXT: cmpq {{[0-9]+}}(%rsp), %rcx
+; AVX2-NEXT: sbbl %esi, %esi
+; AVX2-NEXT: cmpq {{[0-9]+}}(%rsp), %rcx
+; AVX2-NEXT: sbbl %ecx, %ecx
+; AVX2-NEXT: vmovd %ecx, %xmm0
+; AVX2-NEXT: vpinsrd $1, %esi, %xmm0, %xmm0
; AVX2-NEXT: movq %rdx, 24(%r15)
; AVX2-NEXT: movq %rax, 16(%r15)
; AVX2-NEXT: movq %rbp, 8(%r15)
; AVX2-NEXT: movq %r13, (%r15)
-; AVX2-NEXT: vpsllq $63, %xmm0, %xmm0
-; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; AVX2-NEXT: addq $24, %rsp
; AVX2-NEXT: popq %rbx
; AVX2-NEXT: popq %r12
; AVX512-NEXT: pushq %rbx
; AVX512-NEXT: subq $24, %rsp
; AVX512-NEXT: movq %r8, %rax
-; AVX512-NEXT: movq %rcx, %r14
+; AVX512-NEXT: movq %rcx, %r15
; AVX512-NEXT: movq %rdx, %rbx
-; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r15
; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r12
+; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r13
; AVX512-NEXT: movq $0, {{[0-9]+}}(%rsp)
; AVX512-NEXT: leaq {{[0-9]+}}(%rsp), %r8
; AVX512-NEXT: movq %rax, %rdx
; AVX512-NEXT: movq %r9, %rcx
; AVX512-NEXT: callq __muloti4
-; AVX512-NEXT: movq %rax, %r13
+; AVX512-NEXT: movq %rax, %r14
; AVX512-NEXT: movq %rdx, %rbp
; AVX512-NEXT: movq $0, {{[0-9]+}}(%rsp)
; AVX512-NEXT: leaq {{[0-9]+}}(%rsp), %r8
; AVX512-NEXT: movq %rbx, %rdi
-; AVX512-NEXT: movq %r14, %rsi
+; AVX512-NEXT: movq %r15, %rsi
; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rdx
-; AVX512-NEXT: movq %r12, %rcx
+; AVX512-NEXT: movq %r13, %rcx
; AVX512-NEXT: callq __muloti4
; AVX512-NEXT: cmpq $0, {{[0-9]+}}(%rsp)
; AVX512-NEXT: setne %cl
; AVX512-NEXT: andl $1, %ecx
; AVX512-NEXT: kmovw %ecx, %k1
; AVX512-NEXT: korw %k0, %k1, %k1
-; AVX512-NEXT: movq %rdx, 24(%r15)
-; AVX512-NEXT: movq %rax, 16(%r15)
-; AVX512-NEXT: movq %rbp, 8(%r15)
-; AVX512-NEXT: movq %r13, (%r15)
; AVX512-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
-; AVX512-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z}
+; AVX512-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z}
+; AVX512-NEXT: movq %rdx, 24(%r12)
+; AVX512-NEXT: movq %rax, 16(%r12)
+; AVX512-NEXT: movq %rbp, 8(%r12)
+; AVX512-NEXT: movq %r14, (%r12)
; AVX512-NEXT: addq $24, %rsp
; AVX512-NEXT: popq %rbx
; AVX512-NEXT: popq %r12
}
define <2 x i32> @ssubo_v2i32(<2 x i32> %a0, <2 x i32> %a1, <2 x i32>* %p2) nounwind {
-; SSE2-LABEL: ssubo_v2i32:
-; SSE2: # %bb.0:
-; SSE2-NEXT: psllq $32, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,3,2,3]
-; SSE2-NEXT: psrad $31, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3]
-; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
-; SSE2-NEXT: psllq $32, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,3,2,3]
-; SSE2-NEXT: psrad $31, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
-; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; SSE2-NEXT: psubq %xmm2, %xmm1
-; SSE2-NEXT: movdqa %xmm1, %xmm0
-; SSE2-NEXT: psllq $32, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,3,2,3]
-; SSE2-NEXT: psrad $31, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
-; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
-; SSE2-NEXT: pcmpeqd %xmm1, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,0,3,2]
-; SSE2-NEXT: pand %xmm2, %xmm3
-; SSE2-NEXT: pcmpeqd %xmm0, %xmm0
-; SSE2-NEXT: pxor %xmm3, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; SSE2-NEXT: movq %xmm1, (%rdi)
-; SSE2-NEXT: retq
-;
-; SSSE3-LABEL: ssubo_v2i32:
-; SSSE3: # %bb.0:
-; SSSE3-NEXT: psllq $32, %xmm1
-; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,3,2,3]
-; SSSE3-NEXT: psrad $31, %xmm1
-; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3]
-; SSSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
-; SSSE3-NEXT: psllq $32, %xmm0
-; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,3,2,3]
-; SSSE3-NEXT: psrad $31, %xmm0
-; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
-; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; SSSE3-NEXT: psubq %xmm2, %xmm1
-; SSSE3-NEXT: movdqa %xmm1, %xmm0
-; SSSE3-NEXT: psllq $32, %xmm0
-; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,3,2,3]
-; SSSE3-NEXT: psrad $31, %xmm0
-; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
-; SSSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
-; SSSE3-NEXT: pcmpeqd %xmm1, %xmm2
-; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,0,3,2]
-; SSSE3-NEXT: pand %xmm2, %xmm3
-; SSSE3-NEXT: pcmpeqd %xmm0, %xmm0
-; SSSE3-NEXT: pxor %xmm3, %xmm0
-; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; SSSE3-NEXT: movq %xmm1, (%rdi)
-; SSSE3-NEXT: retq
-;
-; SSE41-LABEL: ssubo_v2i32:
-; SSE41: # %bb.0:
-; SSE41-NEXT: movdqa %xmm1, %xmm2
-; SSE41-NEXT: psllq $32, %xmm2
-; SSE41-NEXT: psrad $31, %xmm2
-; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
-; SSE41-NEXT: movdqa %xmm0, %xmm1
-; SSE41-NEXT: psllq $32, %xmm1
-; SSE41-NEXT: psrad $31, %xmm1
-; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
-; SSE41-NEXT: psubq %xmm2, %xmm1
-; SSE41-NEXT: movdqa %xmm1, %xmm0
-; SSE41-NEXT: psllq $32, %xmm0
-; SSE41-NEXT: psrad $31, %xmm0
-; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3],xmm1[4,5],xmm0[6,7]
-; SSE41-NEXT: pcmpeqq %xmm1, %xmm0
-; SSE41-NEXT: pcmpeqd %xmm2, %xmm2
-; SSE41-NEXT: pxor %xmm2, %xmm0
-; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; SSE41-NEXT: movq %xmm1, (%rdi)
-; SSE41-NEXT: retq
+; SSE-LABEL: ssubo_v2i32:
+; SSE: # %bb.0:
+; SSE-NEXT: pxor %xmm3, %xmm3
+; SSE-NEXT: pxor %xmm2, %xmm2
+; SSE-NEXT: pcmpgtd %xmm1, %xmm2
+; SSE-NEXT: pcmpeqd %xmm4, %xmm4
+; SSE-NEXT: pxor %xmm4, %xmm2
+; SSE-NEXT: pxor %xmm5, %xmm5
+; SSE-NEXT: pcmpgtd %xmm0, %xmm5
+; SSE-NEXT: pxor %xmm4, %xmm5
+; SSE-NEXT: pcmpeqd %xmm5, %xmm2
+; SSE-NEXT: psubd %xmm1, %xmm0
+; SSE-NEXT: pcmpgtd %xmm0, %xmm3
+; SSE-NEXT: pxor %xmm4, %xmm3
+; SSE-NEXT: pcmpeqd %xmm5, %xmm3
+; SSE-NEXT: pxor %xmm4, %xmm3
+; SSE-NEXT: pandn %xmm3, %xmm2
+; SSE-NEXT: movq %xmm0, (%rdi)
+; SSE-NEXT: movdqa %xmm2, %xmm0
+; SSE-NEXT: retq
;
; AVX1-LABEL: ssubo_v2i32:
; AVX1: # %bb.0:
-; AVX1-NEXT: vpsllq $32, %xmm1, %xmm2
-; AVX1-NEXT: vpsrad $31, %xmm2, %xmm2
-; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
-; AVX1-NEXT: vpsllq $32, %xmm0, %xmm2
-; AVX1-NEXT: vpsrad $31, %xmm2, %xmm2
-; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
-; AVX1-NEXT: vpsubq %xmm1, %xmm0, %xmm1
-; AVX1-NEXT: vpsllq $32, %xmm1, %xmm0
-; AVX1-NEXT: vpsrad $31, %xmm0, %xmm0
-; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3],xmm1[4,5],xmm0[6,7]
-; AVX1-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
-; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm0
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX1-NEXT: vpcmpgtd %xmm1, %xmm2, %xmm3
+; AVX1-NEXT: vpcmpeqd %xmm4, %xmm4, %xmm4
+; AVX1-NEXT: vpxor %xmm4, %xmm3, %xmm3
+; AVX1-NEXT: vpcmpgtd %xmm0, %xmm2, %xmm5
+; AVX1-NEXT: vpxor %xmm4, %xmm5, %xmm5
+; AVX1-NEXT: vpcmpeqd %xmm3, %xmm5, %xmm3
+; AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm1
+; AVX1-NEXT: vpcmpgtd %xmm1, %xmm2, %xmm0
+; AVX1-NEXT: vpxor %xmm4, %xmm0, %xmm0
+; AVX1-NEXT: vpcmpeqd %xmm0, %xmm5, %xmm0
+; AVX1-NEXT: vpxor %xmm4, %xmm0, %xmm0
+; AVX1-NEXT: vpandn %xmm0, %xmm3, %xmm0
; AVX1-NEXT: vmovq %xmm1, (%rdi)
; AVX1-NEXT: retq
;
; AVX2-LABEL: ssubo_v2i32:
; AVX2: # %bb.0:
-; AVX2-NEXT: vpsllq $32, %xmm1, %xmm2
-; AVX2-NEXT: vpsrad $31, %xmm2, %xmm2
-; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
-; AVX2-NEXT: vpsllq $32, %xmm0, %xmm2
-; AVX2-NEXT: vpsrad $31, %xmm2, %xmm2
-; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
-; AVX2-NEXT: vpsubq %xmm1, %xmm0, %xmm1
-; AVX2-NEXT: vpsllq $32, %xmm1, %xmm0
-; AVX2-NEXT: vpsrad $31, %xmm0, %xmm0
-; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2],xmm0[3]
-; AVX2-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
-; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm0
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX2-NEXT: vpcmpgtd %xmm1, %xmm2, %xmm3
+; AVX2-NEXT: vpcmpeqd %xmm4, %xmm4, %xmm4
+; AVX2-NEXT: vpxor %xmm4, %xmm3, %xmm3
+; AVX2-NEXT: vpcmpgtd %xmm0, %xmm2, %xmm5
+; AVX2-NEXT: vpxor %xmm4, %xmm5, %xmm5
+; AVX2-NEXT: vpcmpeqd %xmm3, %xmm5, %xmm3
+; AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm1
+; AVX2-NEXT: vpcmpgtd %xmm1, %xmm2, %xmm0
+; AVX2-NEXT: vpxor %xmm4, %xmm0, %xmm0
+; AVX2-NEXT: vpcmpeqd %xmm0, %xmm5, %xmm0
+; AVX2-NEXT: vpxor %xmm4, %xmm0, %xmm0
+; AVX2-NEXT: vpandn %xmm0, %xmm3, %xmm0
; AVX2-NEXT: vmovq %xmm1, (%rdi)
; AVX2-NEXT: retq
;
; AVX512-LABEL: ssubo_v2i32:
; AVX512: # %bb.0:
-; AVX512-NEXT: vpsllq $32, %xmm1, %xmm1
-; AVX512-NEXT: vpsraq $32, %xmm1, %xmm1
-; AVX512-NEXT: vpsllq $32, %xmm0, %xmm0
-; AVX512-NEXT: vpsraq $32, %xmm0, %xmm0
-; AVX512-NEXT: vpsubq %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpsllq $32, %xmm0, %xmm1
-; AVX512-NEXT: vpsraq $32, %xmm1, %xmm1
-; AVX512-NEXT: vpmovqd %xmm0, (%rdi)
-; AVX512-NEXT: vpcmpeqq %xmm0, %xmm1, %xmm0
-; AVX512-NEXT: vpternlogq $15, %xmm0, %xmm0, %xmm0
+; AVX512-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX512-NEXT: vpcmpnltd %xmm2, %xmm1, %k0
+; AVX512-NEXT: vpcmpnltd %xmm2, %xmm0, %k1
+; AVX512-NEXT: kxorw %k0, %k1, %k0
+; AVX512-NEXT: vpsubd %xmm1, %xmm0, %xmm1
+; AVX512-NEXT: vpcmpnltd %xmm2, %xmm1, %k2
+; AVX512-NEXT: kxorw %k2, %k1, %k1
+; AVX512-NEXT: kandw %k1, %k0, %k1
+; AVX512-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
+; AVX512-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z}
+; AVX512-NEXT: vmovq %xmm1, (%rdi)
; AVX512-NEXT: retq
%t = call {<2 x i32>, <2 x i1>} @llvm.ssub.with.overflow.v2i32(<2 x i32> %a0, <2 x i32> %a1)
%val = extractvalue {<2 x i32>, <2 x i1>} %t, 0
; SSE2-NEXT: pand %xmm5, %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
; SSE2-NEXT: por %xmm1, %xmm4
-; SSE2-NEXT: pcmpeqd %xmm5, %xmm5
-; SSE2-NEXT: pxor %xmm5, %xmm4
+; SSE2-NEXT: pcmpeqd %xmm1, %xmm1
+; SSE2-NEXT: pxor %xmm1, %xmm4
; SSE2-NEXT: pxor %xmm2, %xmm3
-; SSE2-NEXT: movdqa %xmm2, %xmm1
-; SSE2-NEXT: pcmpgtd %xmm3, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm1[0,0,2,2]
+; SSE2-NEXT: movdqa %xmm2, %xmm5
+; SSE2-NEXT: pcmpgtd %xmm3, %xmm5
+; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2]
; SSE2-NEXT: pcmpeqd %xmm2, %xmm3
; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
; SSE2-NEXT: pand %xmm6, %xmm3
-; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm1[1,1,3,3]
-; SSE2-NEXT: por %xmm3, %xmm6
-; SSE2-NEXT: pxor %xmm5, %xmm6
-; SSE2-NEXT: pcmpeqd %xmm6, %xmm4
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm4[1,0,3,2]
-; SSE2-NEXT: pand %xmm4, %xmm1
+; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
+; SSE2-NEXT: por %xmm3, %xmm5
+; SSE2-NEXT: pxor %xmm1, %xmm5
+; SSE2-NEXT: pcmpeqd %xmm5, %xmm4
+; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,0,3,2]
+; SSE2-NEXT: pand %xmm4, %xmm3
; SSE2-NEXT: movdqa %xmm0, (%rdi)
; SSE2-NEXT: pxor %xmm2, %xmm0
-; SSE2-NEXT: movdqa %xmm2, %xmm3
-; SSE2-NEXT: pcmpgtd %xmm0, %xmm3
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2]
+; SSE2-NEXT: movdqa %xmm2, %xmm4
+; SSE2-NEXT: pcmpgtd %xmm0, %xmm4
+; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2]
; SSE2-NEXT: pcmpeqd %xmm2, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE2-NEXT: pand %xmm4, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3]
+; SSE2-NEXT: pand %xmm6, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,1,3,3]
; SSE2-NEXT: por %xmm0, %xmm2
-; SSE2-NEXT: pxor %xmm5, %xmm2
-; SSE2-NEXT: pcmpeqd %xmm6, %xmm2
+; SSE2-NEXT: pxor %xmm1, %xmm2
+; SSE2-NEXT: pcmpeqd %xmm5, %xmm2
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,0,3,2]
; SSE2-NEXT: pand %xmm2, %xmm0
-; SSE2-NEXT: pxor %xmm5, %xmm0
-; SSE2-NEXT: pandn %xmm0, %xmm1
-; SSE2-NEXT: movdqa %xmm1, %xmm0
+; SSE2-NEXT: pxor %xmm1, %xmm0
+; SSE2-NEXT: pandn %xmm0, %xmm3
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3]
; SSE2-NEXT: retq
;
; SSSE3-LABEL: ssubo_v2i64:
; SSSE3-NEXT: pand %xmm5, %xmm1
; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
; SSSE3-NEXT: por %xmm1, %xmm4
-; SSSE3-NEXT: pcmpeqd %xmm5, %xmm5
-; SSSE3-NEXT: pxor %xmm5, %xmm4
+; SSSE3-NEXT: pcmpeqd %xmm1, %xmm1
+; SSSE3-NEXT: pxor %xmm1, %xmm4
; SSSE3-NEXT: pxor %xmm2, %xmm3
-; SSSE3-NEXT: movdqa %xmm2, %xmm1
-; SSSE3-NEXT: pcmpgtd %xmm3, %xmm1
-; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm1[0,0,2,2]
+; SSSE3-NEXT: movdqa %xmm2, %xmm5
+; SSSE3-NEXT: pcmpgtd %xmm3, %xmm5
+; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2]
; SSSE3-NEXT: pcmpeqd %xmm2, %xmm3
; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
; SSSE3-NEXT: pand %xmm6, %xmm3
-; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm1[1,1,3,3]
-; SSSE3-NEXT: por %xmm3, %xmm6
-; SSSE3-NEXT: pxor %xmm5, %xmm6
-; SSSE3-NEXT: pcmpeqd %xmm6, %xmm4
-; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm4[1,0,3,2]
-; SSSE3-NEXT: pand %xmm4, %xmm1
+; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
+; SSSE3-NEXT: por %xmm3, %xmm5
+; SSSE3-NEXT: pxor %xmm1, %xmm5
+; SSSE3-NEXT: pcmpeqd %xmm5, %xmm4
+; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,0,3,2]
+; SSSE3-NEXT: pand %xmm4, %xmm3
; SSSE3-NEXT: movdqa %xmm0, (%rdi)
; SSSE3-NEXT: pxor %xmm2, %xmm0
-; SSSE3-NEXT: movdqa %xmm2, %xmm3
-; SSSE3-NEXT: pcmpgtd %xmm0, %xmm3
-; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2]
+; SSSE3-NEXT: movdqa %xmm2, %xmm4
+; SSSE3-NEXT: pcmpgtd %xmm0, %xmm4
+; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2]
; SSSE3-NEXT: pcmpeqd %xmm2, %xmm0
; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSSE3-NEXT: pand %xmm4, %xmm0
-; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3]
+; SSSE3-NEXT: pand %xmm6, %xmm0
+; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,1,3,3]
; SSSE3-NEXT: por %xmm0, %xmm2
-; SSSE3-NEXT: pxor %xmm5, %xmm2
-; SSSE3-NEXT: pcmpeqd %xmm6, %xmm2
+; SSSE3-NEXT: pxor %xmm1, %xmm2
+; SSSE3-NEXT: pcmpeqd %xmm5, %xmm2
; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,0,3,2]
; SSSE3-NEXT: pand %xmm2, %xmm0
-; SSSE3-NEXT: pxor %xmm5, %xmm0
-; SSSE3-NEXT: pandn %xmm0, %xmm1
-; SSSE3-NEXT: movdqa %xmm1, %xmm0
+; SSSE3-NEXT: pxor %xmm1, %xmm0
+; SSSE3-NEXT: pandn %xmm0, %xmm3
+; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3]
; SSSE3-NEXT: retq
;
; SSE41-LABEL: ssubo_v2i64:
; SSE41-NEXT: pcmpgtd %xmm1, %xmm4
; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
; SSE41-NEXT: pcmpeqd %xmm2, %xmm1
-; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm1[1,1,3,3]
-; SSE41-NEXT: pand %xmm5, %xmm6
-; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm4[1,1,3,3]
-; SSE41-NEXT: por %xmm6, %xmm1
-; SSE41-NEXT: pcmpeqd %xmm4, %xmm4
-; SSE41-NEXT: pxor %xmm4, %xmm1
+; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; SSE41-NEXT: pand %xmm5, %xmm1
+; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
+; SSE41-NEXT: por %xmm1, %xmm4
+; SSE41-NEXT: pcmpeqd %xmm1, %xmm1
+; SSE41-NEXT: pxor %xmm1, %xmm4
; SSE41-NEXT: pxor %xmm2, %xmm3
; SSE41-NEXT: movdqa %xmm2, %xmm5
; SSE41-NEXT: pcmpgtd %xmm3, %xmm5
; SSE41-NEXT: pand %xmm6, %xmm3
; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
; SSE41-NEXT: por %xmm3, %xmm5
-; SSE41-NEXT: pxor %xmm4, %xmm5
-; SSE41-NEXT: pcmpeqq %xmm5, %xmm1
+; SSE41-NEXT: pxor %xmm1, %xmm5
+; SSE41-NEXT: pcmpeqq %xmm5, %xmm4
; SSE41-NEXT: movdqa %xmm0, (%rdi)
; SSE41-NEXT: pxor %xmm2, %xmm0
; SSE41-NEXT: movdqa %xmm2, %xmm3
; SSE41-NEXT: pand %xmm6, %xmm0
; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3]
; SSE41-NEXT: por %xmm0, %xmm2
-; SSE41-NEXT: pxor %xmm4, %xmm2
+; SSE41-NEXT: pxor %xmm1, %xmm2
; SSE41-NEXT: pcmpeqq %xmm5, %xmm2
-; SSE41-NEXT: pxor %xmm4, %xmm2
-; SSE41-NEXT: pandn %xmm2, %xmm1
-; SSE41-NEXT: movdqa %xmm1, %xmm0
+; SSE41-NEXT: pxor %xmm1, %xmm2
+; SSE41-NEXT: pandn %xmm2, %xmm4
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,2,2,3]
; SSE41-NEXT: retq
;
; AVX1-LABEL: ssubo_v2i64:
; AVX1-NEXT: vpcmpeqq %xmm0, %xmm5, %xmm0
; AVX1-NEXT: vpxor %xmm4, %xmm0, %xmm0
; AVX1-NEXT: vpandn %xmm0, %xmm3, %xmm0
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; AVX1-NEXT: vmovdqa %xmm1, (%rdi)
; AVX1-NEXT: retq
;
; AVX2-NEXT: vpcmpeqq %xmm0, %xmm5, %xmm0
; AVX2-NEXT: vpxor %xmm4, %xmm0, %xmm0
; AVX2-NEXT: vpandn %xmm0, %xmm3, %xmm0
+; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; AVX2-NEXT: vmovdqa %xmm1, (%rdi)
; AVX2-NEXT: retq
;
; AVX512-NEXT: vpcmpnltq %xmm2, %xmm1, %k0
; AVX512-NEXT: vpcmpnltq %xmm2, %xmm0, %k1
; AVX512-NEXT: kxorw %k0, %k1, %k0
-; AVX512-NEXT: vpsubq %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpcmpnltq %xmm2, %xmm0, %k2
+; AVX512-NEXT: vpsubq %xmm1, %xmm0, %xmm1
+; AVX512-NEXT: vpcmpnltq %xmm2, %xmm1, %k2
; AVX512-NEXT: kxorw %k2, %k1, %k1
; AVX512-NEXT: kandw %k1, %k0, %k1
-; AVX512-NEXT: vmovdqa %xmm0, (%rdi)
; AVX512-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
-; AVX512-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z}
+; AVX512-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z}
+; AVX512-NEXT: vmovdqa %xmm1, (%rdi)
; AVX512-NEXT: retq
%t = call {<2 x i64>, <2 x i1>} @llvm.ssub.with.overflow.v2i64(<2 x i64> %a0, <2 x i64> %a1)
%val = extractvalue {<2 x i64>, <2 x i1>} %t, 0
; SSE2: # %bb.0:
; SSE2-NEXT: pushq %rbp
; SSE2-NEXT: pushq %rbx
-; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %r10
; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %r11
-; SSE2-NEXT: subq {{[0-9]+}}(%rsp), %rdx
-; SSE2-NEXT: movq %rcx, %rax
-; SSE2-NEXT: sbbq %r11, %rax
+; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %r10
+; SSE2-NEXT: testq %r9, %r9
+; SSE2-NEXT: setns %al
+; SSE2-NEXT: testq %rsi, %rsi
; SSE2-NEXT: setns %bl
-; SSE2-NEXT: testq %rcx, %rcx
-; SSE2-NEXT: setns %cl
-; SSE2-NEXT: cmpb %bl, %cl
+; SSE2-NEXT: cmpb %al, %bl
; SSE2-NEXT: setne %bpl
-; SSE2-NEXT: testq %r11, %r11
-; SSE2-NEXT: setns %bl
-; SSE2-NEXT: cmpb %bl, %cl
-; SSE2-NEXT: setne %cl
-; SSE2-NEXT: andb %bpl, %cl
-; SSE2-NEXT: movzbl %cl, %ebp
-; SSE2-NEXT: testq %r9, %r9
+; SSE2-NEXT: subq %r8, %rdi
+; SSE2-NEXT: sbbq %r9, %rsi
+; SSE2-NEXT: setns %al
+; SSE2-NEXT: cmpb %al, %bl
+; SSE2-NEXT: setne %al
+; SSE2-NEXT: andb %bpl, %al
+; SSE2-NEXT: subq {{[0-9]+}}(%rsp), %rdx
+; SSE2-NEXT: movq %rcx, %rbp
+; SSE2-NEXT: sbbq %r10, %rbp
; SSE2-NEXT: setns %bl
-; SSE2-NEXT: testq %rsi, %rsi
+; SSE2-NEXT: testq %rcx, %rcx
; SSE2-NEXT: setns %cl
; SSE2-NEXT: cmpb %bl, %cl
-; SSE2-NEXT: setne %r11b
-; SSE2-NEXT: subq %r8, %rdi
-; SSE2-NEXT: sbbq %r9, %rsi
+; SSE2-NEXT: setne %r8b
+; SSE2-NEXT: testq %r10, %r10
; SSE2-NEXT: setns %bl
; SSE2-NEXT: cmpb %bl, %cl
; SSE2-NEXT: setne %cl
-; SSE2-NEXT: andb %r11b, %cl
+; SSE2-NEXT: andb %r8b, %cl
; SSE2-NEXT: movzbl %cl, %ecx
-; SSE2-NEXT: movd %ecx, %xmm0
-; SSE2-NEXT: pinsrw $4, %ebp, %xmm0
-; SSE2-NEXT: movq %rdx, 16(%r10)
-; SSE2-NEXT: movq %rdi, (%r10)
-; SSE2-NEXT: movq %rax, 24(%r10)
-; SSE2-NEXT: movq %rsi, 8(%r10)
-; SSE2-NEXT: psllq $63, %xmm0
-; SSE2-NEXT: psrad $31, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE2-NEXT: negl %ecx
+; SSE2-NEXT: movd %ecx, %xmm1
+; SSE2-NEXT: movzbl %al, %eax
+; SSE2-NEXT: negl %eax
+; SSE2-NEXT: movd %eax, %xmm0
+; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE2-NEXT: movq %rdx, 16(%r11)
+; SSE2-NEXT: movq %rdi, (%r11)
+; SSE2-NEXT: movq %rbp, 24(%r11)
+; SSE2-NEXT: movq %rsi, 8(%r11)
; SSE2-NEXT: popq %rbx
; SSE2-NEXT: popq %rbp
; SSE2-NEXT: retq
; SSSE3: # %bb.0:
; SSSE3-NEXT: pushq %rbp
; SSSE3-NEXT: pushq %rbx
-; SSSE3-NEXT: movq {{[0-9]+}}(%rsp), %r10
; SSSE3-NEXT: movq {{[0-9]+}}(%rsp), %r11
-; SSSE3-NEXT: subq {{[0-9]+}}(%rsp), %rdx
-; SSSE3-NEXT: movq %rcx, %rax
-; SSSE3-NEXT: sbbq %r11, %rax
+; SSSE3-NEXT: movq {{[0-9]+}}(%rsp), %r10
+; SSSE3-NEXT: testq %r9, %r9
+; SSSE3-NEXT: setns %al
+; SSSE3-NEXT: testq %rsi, %rsi
; SSSE3-NEXT: setns %bl
-; SSSE3-NEXT: testq %rcx, %rcx
-; SSSE3-NEXT: setns %cl
-; SSSE3-NEXT: cmpb %bl, %cl
+; SSSE3-NEXT: cmpb %al, %bl
; SSSE3-NEXT: setne %bpl
-; SSSE3-NEXT: testq %r11, %r11
-; SSSE3-NEXT: setns %bl
-; SSSE3-NEXT: cmpb %bl, %cl
-; SSSE3-NEXT: setne %cl
-; SSSE3-NEXT: andb %bpl, %cl
-; SSSE3-NEXT: movzbl %cl, %ebp
-; SSSE3-NEXT: testq %r9, %r9
+; SSSE3-NEXT: subq %r8, %rdi
+; SSSE3-NEXT: sbbq %r9, %rsi
+; SSSE3-NEXT: setns %al
+; SSSE3-NEXT: cmpb %al, %bl
+; SSSE3-NEXT: setne %al
+; SSSE3-NEXT: andb %bpl, %al
+; SSSE3-NEXT: subq {{[0-9]+}}(%rsp), %rdx
+; SSSE3-NEXT: movq %rcx, %rbp
+; SSSE3-NEXT: sbbq %r10, %rbp
; SSSE3-NEXT: setns %bl
-; SSSE3-NEXT: testq %rsi, %rsi
+; SSSE3-NEXT: testq %rcx, %rcx
; SSSE3-NEXT: setns %cl
; SSSE3-NEXT: cmpb %bl, %cl
-; SSSE3-NEXT: setne %r11b
-; SSSE3-NEXT: subq %r8, %rdi
-; SSSE3-NEXT: sbbq %r9, %rsi
+; SSSE3-NEXT: setne %r8b
+; SSSE3-NEXT: testq %r10, %r10
; SSSE3-NEXT: setns %bl
; SSSE3-NEXT: cmpb %bl, %cl
; SSSE3-NEXT: setne %cl
-; SSSE3-NEXT: andb %r11b, %cl
+; SSSE3-NEXT: andb %r8b, %cl
; SSSE3-NEXT: movzbl %cl, %ecx
-; SSSE3-NEXT: movd %ecx, %xmm0
-; SSSE3-NEXT: pinsrw $4, %ebp, %xmm0
-; SSSE3-NEXT: movq %rdx, 16(%r10)
-; SSSE3-NEXT: movq %rdi, (%r10)
-; SSSE3-NEXT: movq %rax, 24(%r10)
-; SSSE3-NEXT: movq %rsi, 8(%r10)
-; SSSE3-NEXT: psllq $63, %xmm0
-; SSSE3-NEXT: psrad $31, %xmm0
-; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSSE3-NEXT: negl %ecx
+; SSSE3-NEXT: movd %ecx, %xmm1
+; SSSE3-NEXT: movzbl %al, %eax
+; SSSE3-NEXT: negl %eax
+; SSSE3-NEXT: movd %eax, %xmm0
+; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSSE3-NEXT: movq %rdx, 16(%r11)
+; SSSE3-NEXT: movq %rdi, (%r11)
+; SSSE3-NEXT: movq %rbp, 24(%r11)
+; SSSE3-NEXT: movq %rsi, 8(%r11)
; SSSE3-NEXT: popq %rbx
; SSSE3-NEXT: popq %rbp
; SSSE3-NEXT: retq
; SSE41: # %bb.0:
; SSE41-NEXT: pushq %rbp
; SSE41-NEXT: pushq %rbx
-; SSE41-NEXT: movq {{[0-9]+}}(%rsp), %r10
; SSE41-NEXT: movq {{[0-9]+}}(%rsp), %r11
-; SSE41-NEXT: subq {{[0-9]+}}(%rsp), %rdx
-; SSE41-NEXT: movq %rcx, %rax
-; SSE41-NEXT: sbbq %r11, %rax
+; SSE41-NEXT: movq {{[0-9]+}}(%rsp), %r10
+; SSE41-NEXT: testq %r9, %r9
+; SSE41-NEXT: setns %al
+; SSE41-NEXT: testq %rsi, %rsi
; SSE41-NEXT: setns %bl
-; SSE41-NEXT: testq %rcx, %rcx
-; SSE41-NEXT: setns %cl
-; SSE41-NEXT: cmpb %bl, %cl
+; SSE41-NEXT: cmpb %al, %bl
; SSE41-NEXT: setne %bpl
-; SSE41-NEXT: testq %r11, %r11
-; SSE41-NEXT: setns %bl
-; SSE41-NEXT: cmpb %bl, %cl
-; SSE41-NEXT: setne %cl
-; SSE41-NEXT: andb %bpl, %cl
-; SSE41-NEXT: movzbl %cl, %ebp
-; SSE41-NEXT: testq %r9, %r9
+; SSE41-NEXT: subq %r8, %rdi
+; SSE41-NEXT: sbbq %r9, %rsi
+; SSE41-NEXT: setns %al
+; SSE41-NEXT: cmpb %al, %bl
+; SSE41-NEXT: setne %al
+; SSE41-NEXT: andb %bpl, %al
+; SSE41-NEXT: subq {{[0-9]+}}(%rsp), %rdx
+; SSE41-NEXT: movq %rcx, %rbp
+; SSE41-NEXT: sbbq %r10, %rbp
; SSE41-NEXT: setns %bl
-; SSE41-NEXT: testq %rsi, %rsi
+; SSE41-NEXT: testq %rcx, %rcx
; SSE41-NEXT: setns %cl
; SSE41-NEXT: cmpb %bl, %cl
-; SSE41-NEXT: setne %r11b
-; SSE41-NEXT: subq %r8, %rdi
-; SSE41-NEXT: sbbq %r9, %rsi
+; SSE41-NEXT: setne %r8b
+; SSE41-NEXT: testq %r10, %r10
; SSE41-NEXT: setns %bl
; SSE41-NEXT: cmpb %bl, %cl
; SSE41-NEXT: setne %cl
-; SSE41-NEXT: andb %r11b, %cl
+; SSE41-NEXT: andb %r8b, %cl
; SSE41-NEXT: movzbl %cl, %ecx
-; SSE41-NEXT: movd %ecx, %xmm0
-; SSE41-NEXT: pinsrb $8, %ebp, %xmm0
-; SSE41-NEXT: movq %rdx, 16(%r10)
-; SSE41-NEXT: movq %rdi, (%r10)
-; SSE41-NEXT: movq %rax, 24(%r10)
-; SSE41-NEXT: movq %rsi, 8(%r10)
-; SSE41-NEXT: psllq $63, %xmm0
-; SSE41-NEXT: psrad $31, %xmm0
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE41-NEXT: negl %ecx
+; SSE41-NEXT: movzbl %al, %eax
+; SSE41-NEXT: negl %eax
+; SSE41-NEXT: movd %eax, %xmm0
+; SSE41-NEXT: pinsrd $1, %ecx, %xmm0
+; SSE41-NEXT: movq %rdx, 16(%r11)
+; SSE41-NEXT: movq %rdi, (%r11)
+; SSE41-NEXT: movq %rbp, 24(%r11)
+; SSE41-NEXT: movq %rsi, 8(%r11)
; SSE41-NEXT: popq %rbx
; SSE41-NEXT: popq %rbp
; SSE41-NEXT: retq
; AVX1: # %bb.0:
; AVX1-NEXT: pushq %rbp
; AVX1-NEXT: pushq %rbx
-; AVX1-NEXT: movq {{[0-9]+}}(%rsp), %r10
; AVX1-NEXT: movq {{[0-9]+}}(%rsp), %r11
-; AVX1-NEXT: subq {{[0-9]+}}(%rsp), %rdx
-; AVX1-NEXT: movq %rcx, %rax
-; AVX1-NEXT: sbbq %r11, %rax
+; AVX1-NEXT: movq {{[0-9]+}}(%rsp), %r10
+; AVX1-NEXT: testq %r9, %r9
+; AVX1-NEXT: setns %al
+; AVX1-NEXT: testq %rsi, %rsi
; AVX1-NEXT: setns %bl
-; AVX1-NEXT: testq %rcx, %rcx
-; AVX1-NEXT: setns %cl
-; AVX1-NEXT: cmpb %bl, %cl
+; AVX1-NEXT: cmpb %al, %bl
; AVX1-NEXT: setne %bpl
-; AVX1-NEXT: testq %r11, %r11
-; AVX1-NEXT: setns %bl
-; AVX1-NEXT: cmpb %bl, %cl
-; AVX1-NEXT: setne %cl
-; AVX1-NEXT: andb %bpl, %cl
-; AVX1-NEXT: movzbl %cl, %ebp
-; AVX1-NEXT: testq %r9, %r9
+; AVX1-NEXT: subq %r8, %rdi
+; AVX1-NEXT: sbbq %r9, %rsi
+; AVX1-NEXT: setns %al
+; AVX1-NEXT: cmpb %al, %bl
+; AVX1-NEXT: setne %al
+; AVX1-NEXT: andb %bpl, %al
+; AVX1-NEXT: subq {{[0-9]+}}(%rsp), %rdx
+; AVX1-NEXT: movq %rcx, %rbp
+; AVX1-NEXT: sbbq %r10, %rbp
; AVX1-NEXT: setns %bl
-; AVX1-NEXT: testq %rsi, %rsi
+; AVX1-NEXT: testq %rcx, %rcx
; AVX1-NEXT: setns %cl
; AVX1-NEXT: cmpb %bl, %cl
-; AVX1-NEXT: setne %r11b
-; AVX1-NEXT: subq %r8, %rdi
-; AVX1-NEXT: sbbq %r9, %rsi
+; AVX1-NEXT: setne %r8b
+; AVX1-NEXT: testq %r10, %r10
; AVX1-NEXT: setns %bl
; AVX1-NEXT: cmpb %bl, %cl
; AVX1-NEXT: setne %cl
-; AVX1-NEXT: andb %r11b, %cl
+; AVX1-NEXT: andb %r8b, %cl
; AVX1-NEXT: movzbl %cl, %ecx
-; AVX1-NEXT: vmovd %ecx, %xmm0
-; AVX1-NEXT: vpinsrb $8, %ebp, %xmm0, %xmm0
-; AVX1-NEXT: movq %rdx, 16(%r10)
-; AVX1-NEXT: movq %rdi, (%r10)
-; AVX1-NEXT: movq %rax, 24(%r10)
-; AVX1-NEXT: movq %rsi, 8(%r10)
-; AVX1-NEXT: vpsllq $63, %xmm0, %xmm0
-; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
+; AVX1-NEXT: negl %ecx
+; AVX1-NEXT: movzbl %al, %eax
+; AVX1-NEXT: negl %eax
+; AVX1-NEXT: vmovd %eax, %xmm0
+; AVX1-NEXT: vpinsrd $1, %ecx, %xmm0, %xmm0
+; AVX1-NEXT: movq %rdx, 16(%r11)
+; AVX1-NEXT: movq %rdi, (%r11)
+; AVX1-NEXT: movq %rbp, 24(%r11)
+; AVX1-NEXT: movq %rsi, 8(%r11)
; AVX1-NEXT: popq %rbx
; AVX1-NEXT: popq %rbp
; AVX1-NEXT: retq
; AVX2: # %bb.0:
; AVX2-NEXT: pushq %rbp
; AVX2-NEXT: pushq %rbx
-; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %r10
; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %r11
-; AVX2-NEXT: subq {{[0-9]+}}(%rsp), %rdx
-; AVX2-NEXT: movq %rcx, %rax
-; AVX2-NEXT: sbbq %r11, %rax
+; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %r10
+; AVX2-NEXT: testq %r9, %r9
+; AVX2-NEXT: setns %al
+; AVX2-NEXT: testq %rsi, %rsi
; AVX2-NEXT: setns %bl
-; AVX2-NEXT: testq %rcx, %rcx
-; AVX2-NEXT: setns %cl
-; AVX2-NEXT: cmpb %bl, %cl
+; AVX2-NEXT: cmpb %al, %bl
; AVX2-NEXT: setne %bpl
-; AVX2-NEXT: testq %r11, %r11
-; AVX2-NEXT: setns %bl
-; AVX2-NEXT: cmpb %bl, %cl
-; AVX2-NEXT: setne %cl
-; AVX2-NEXT: andb %bpl, %cl
-; AVX2-NEXT: movzbl %cl, %ebp
-; AVX2-NEXT: testq %r9, %r9
+; AVX2-NEXT: subq %r8, %rdi
+; AVX2-NEXT: sbbq %r9, %rsi
+; AVX2-NEXT: setns %al
+; AVX2-NEXT: cmpb %al, %bl
+; AVX2-NEXT: setne %al
+; AVX2-NEXT: andb %bpl, %al
+; AVX2-NEXT: subq {{[0-9]+}}(%rsp), %rdx
+; AVX2-NEXT: movq %rcx, %rbp
+; AVX2-NEXT: sbbq %r10, %rbp
; AVX2-NEXT: setns %bl
-; AVX2-NEXT: testq %rsi, %rsi
+; AVX2-NEXT: testq %rcx, %rcx
; AVX2-NEXT: setns %cl
; AVX2-NEXT: cmpb %bl, %cl
-; AVX2-NEXT: setne %r11b
-; AVX2-NEXT: subq %r8, %rdi
-; AVX2-NEXT: sbbq %r9, %rsi
+; AVX2-NEXT: setne %r8b
+; AVX2-NEXT: testq %r10, %r10
; AVX2-NEXT: setns %bl
; AVX2-NEXT: cmpb %bl, %cl
; AVX2-NEXT: setne %cl
-; AVX2-NEXT: andb %r11b, %cl
+; AVX2-NEXT: andb %r8b, %cl
; AVX2-NEXT: movzbl %cl, %ecx
-; AVX2-NEXT: vmovd %ecx, %xmm0
-; AVX2-NEXT: vpinsrb $8, %ebp, %xmm0, %xmm0
-; AVX2-NEXT: movq %rdx, 16(%r10)
-; AVX2-NEXT: movq %rdi, (%r10)
-; AVX2-NEXT: movq %rax, 24(%r10)
-; AVX2-NEXT: movq %rsi, 8(%r10)
-; AVX2-NEXT: vpsllq $63, %xmm0, %xmm0
-; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
+; AVX2-NEXT: negl %ecx
+; AVX2-NEXT: movzbl %al, %eax
+; AVX2-NEXT: negl %eax
+; AVX2-NEXT: vmovd %eax, %xmm0
+; AVX2-NEXT: vpinsrd $1, %ecx, %xmm0, %xmm0
+; AVX2-NEXT: movq %rdx, 16(%r11)
+; AVX2-NEXT: movq %rdi, (%r11)
+; AVX2-NEXT: movq %rbp, 24(%r11)
+; AVX2-NEXT: movq %rsi, 8(%r11)
; AVX2-NEXT: popq %rbx
; AVX2-NEXT: popq %rbp
; AVX2-NEXT: retq
; AVX512-NEXT: andl $1, %ecx
; AVX512-NEXT: kmovw %ecx, %k1
; AVX512-NEXT: korw %k0, %k1, %k1
+; AVX512-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
+; AVX512-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z}
; AVX512-NEXT: movq %rdx, 16(%r10)
; AVX512-NEXT: movq %rdi, (%r10)
; AVX512-NEXT: movq %r14, 24(%r10)
; AVX512-NEXT: movq %rsi, 8(%r10)
-; AVX512-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
-; AVX512-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z}
; AVX512-NEXT: popq %rbx
; AVX512-NEXT: popq %r14
; AVX512-NEXT: retq
define <2 x i32> @uaddo_v2i32(<2 x i32> %a0, <2 x i32> %a1, <2 x i32>* %p2) nounwind {
; SSE2-LABEL: uaddo_v2i32:
; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [4294967295,0,4294967295,0]
-; SSE2-NEXT: pand %xmm2, %xmm1
-; SSE2-NEXT: pand %xmm2, %xmm0
-; SSE2-NEXT: paddq %xmm1, %xmm0
-; SSE2-NEXT: pand %xmm0, %xmm2
-; SSE2-NEXT: pcmpeqd %xmm0, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,0,3,2]
-; SSE2-NEXT: pand %xmm2, %xmm3
-; SSE2-NEXT: pcmpeqd %xmm1, %xmm1
-; SSE2-NEXT: pxor %xmm3, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; SSE2-NEXT: movq %xmm0, (%rdi)
-; SSE2-NEXT: movdqa %xmm1, %xmm0
+; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
+; SSE2-NEXT: paddd %xmm0, %xmm1
+; SSE2-NEXT: pxor %xmm2, %xmm0
+; SSE2-NEXT: pxor %xmm1, %xmm2
+; SSE2-NEXT: pcmpgtd %xmm2, %xmm0
+; SSE2-NEXT: movq %xmm1, (%rdi)
; SSE2-NEXT: retq
;
; SSSE3-LABEL: uaddo_v2i32:
; SSSE3: # %bb.0:
-; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [4294967295,0,4294967295,0]
-; SSSE3-NEXT: pand %xmm2, %xmm1
-; SSSE3-NEXT: pand %xmm2, %xmm0
-; SSSE3-NEXT: paddq %xmm1, %xmm0
-; SSSE3-NEXT: pand %xmm0, %xmm2
-; SSSE3-NEXT: pcmpeqd %xmm0, %xmm2
-; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,0,3,2]
-; SSSE3-NEXT: pand %xmm2, %xmm3
-; SSSE3-NEXT: pcmpeqd %xmm1, %xmm1
-; SSSE3-NEXT: pxor %xmm3, %xmm1
-; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; SSSE3-NEXT: movq %xmm0, (%rdi)
-; SSSE3-NEXT: movdqa %xmm1, %xmm0
+; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
+; SSSE3-NEXT: paddd %xmm0, %xmm1
+; SSSE3-NEXT: pxor %xmm2, %xmm0
+; SSSE3-NEXT: pxor %xmm1, %xmm2
+; SSSE3-NEXT: pcmpgtd %xmm2, %xmm0
+; SSSE3-NEXT: movq %xmm1, (%rdi)
; SSSE3-NEXT: retq
;
; SSE41-LABEL: uaddo_v2i32:
; SSE41: # %bb.0:
-; SSE41-NEXT: pxor %xmm2, %xmm2
-; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
-; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
-; SSE41-NEXT: paddq %xmm1, %xmm0
-; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
-; SSE41-NEXT: pcmpeqq %xmm0, %xmm2
-; SSE41-NEXT: pcmpeqd %xmm1, %xmm1
-; SSE41-NEXT: pxor %xmm2, %xmm1
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; SSE41-NEXT: movq %xmm0, (%rdi)
-; SSE41-NEXT: movdqa %xmm1, %xmm0
+; SSE41-NEXT: paddd %xmm0, %xmm1
+; SSE41-NEXT: pmaxud %xmm1, %xmm0
+; SSE41-NEXT: pcmpeqd %xmm1, %xmm0
+; SSE41-NEXT: pcmpeqd %xmm2, %xmm2
+; SSE41-NEXT: pxor %xmm2, %xmm0
+; SSE41-NEXT: movq %xmm1, (%rdi)
; SSE41-NEXT: retq
;
; AVX1-LABEL: uaddo_v2i32:
; AVX1: # %bb.0:
-; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
-; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm1
-; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
-; AVX1-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm1
+; AVX1-NEXT: vpmaxud %xmm0, %xmm1, %xmm0
+; AVX1-NEXT: vpcmpeqd %xmm0, %xmm1, %xmm0
; AVX1-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm0
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; AVX1-NEXT: vmovq %xmm1, (%rdi)
; AVX1-NEXT: retq
;
; AVX2-LABEL: uaddo_v2i32:
; AVX2: # %bb.0:
-; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
-; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
-; AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm1
-; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
-; AVX2-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm1
+; AVX2-NEXT: vpmaxud %xmm0, %xmm1, %xmm0
+; AVX2-NEXT: vpcmpeqd %xmm0, %xmm1, %xmm0
; AVX2-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm0
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; AVX2-NEXT: vmovq %xmm1, (%rdi)
; AVX2-NEXT: retq
;
; AVX512-LABEL: uaddo_v2i32:
; AVX512: # %bb.0:
-; AVX512-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX512-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
-; AVX512-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
-; AVX512-NEXT: vpaddq %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpblendd {{.*#+}} xmm1 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
-; AVX512-NEXT: vpmovqd %xmm0, (%rdi)
-; AVX512-NEXT: vpcmpeqq %xmm0, %xmm1, %xmm0
-; AVX512-NEXT: vpternlogq $15, %xmm0, %xmm0, %xmm0
+; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm1
+; AVX512-NEXT: vpcmpltud %xmm0, %xmm1, %k1
+; AVX512-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
+; AVX512-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z}
+; AVX512-NEXT: vmovq %xmm1, (%rdi)
; AVX512-NEXT: retq
%t = call {<2 x i32>, <2 x i1>} @llvm.uadd.with.overflow.v2i32(<2 x i32> %a0, <2 x i32> %a1)
%val = extractvalue {<2 x i32>, <2 x i1>} %t, 0
; SSE-NEXT: pxor %xmm1, %xmm2
; SSE-NEXT: movdqa %xmm0, %xmm3
; SSE-NEXT: pcmpgtd %xmm2, %xmm3
-; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2]
; SSE-NEXT: pcmpeqd %xmm0, %xmm2
-; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; SSE-NEXT: pand %xmm4, %xmm2
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3]
-; SSE-NEXT: por %xmm2, %xmm0
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3]
+; SSE-NEXT: pand %xmm3, %xmm0
+; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3]
+; SSE-NEXT: por %xmm0, %xmm2
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
; SSE-NEXT: movdqa %xmm1, (%rdi)
; SSE-NEXT: retq
;
; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm1
; AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm0
; AVX1-NEXT: vpcmpgtq %xmm0, %xmm3, %xmm0
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; AVX1-NEXT: vmovdqa %xmm1, (%rdi)
; AVX1-NEXT: retq
;
; AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm1
; AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm0
; AVX2-NEXT: vpcmpgtq %xmm0, %xmm3, %xmm0
+; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; AVX2-NEXT: vmovdqa %xmm1, (%rdi)
; AVX2-NEXT: retq
;
; AVX512: # %bb.0:
; AVX512-NEXT: vpaddq %xmm1, %xmm0, %xmm1
; AVX512-NEXT: vpcmpltuq %xmm0, %xmm1, %k1
-; AVX512-NEXT: vmovdqa %xmm1, (%rdi)
; AVX512-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
-; AVX512-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z}
+; AVX512-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z}
+; AVX512-NEXT: vmovdqa %xmm1, (%rdi)
; AVX512-NEXT: retq
%t = call {<2 x i64>, <2 x i1>} @llvm.uadd.with.overflow.v2i64(<2 x i64> %a0, <2 x i64> %a1)
%val = extractvalue {<2 x i64>, <2 x i1>} %t, 0
; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %r10
; SSE2-NEXT: addq {{[0-9]+}}(%rsp), %rdx
; SSE2-NEXT: adcq {{[0-9]+}}(%rsp), %rcx
-; SSE2-NEXT: setb %al
-; SSE2-NEXT: movzbl %al, %r11d
+; SSE2-NEXT: sbbl %eax, %eax
; SSE2-NEXT: addq %r8, %rdi
; SSE2-NEXT: adcq %r9, %rsi
-; SSE2-NEXT: setb %al
-; SSE2-NEXT: movzbl %al, %eax
+; SSE2-NEXT: movd %eax, %xmm1
+; SSE2-NEXT: sbbl %eax, %eax
; SSE2-NEXT: movd %eax, %xmm0
-; SSE2-NEXT: pinsrw $4, %r11d, %xmm0
+; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE2-NEXT: movq %rdx, 16(%r10)
; SSE2-NEXT: movq %rdi, (%r10)
; SSE2-NEXT: movq %rcx, 24(%r10)
; SSE2-NEXT: movq %rsi, 8(%r10)
-; SSE2-NEXT: psllq $63, %xmm0
-; SSE2-NEXT: psrad $31, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
; SSE2-NEXT: retq
;
; SSSE3-LABEL: uaddo_v2i128:
; SSSE3-NEXT: movq {{[0-9]+}}(%rsp), %r10
; SSSE3-NEXT: addq {{[0-9]+}}(%rsp), %rdx
; SSSE3-NEXT: adcq {{[0-9]+}}(%rsp), %rcx
-; SSSE3-NEXT: setb %al
-; SSSE3-NEXT: movzbl %al, %r11d
+; SSSE3-NEXT: sbbl %eax, %eax
; SSSE3-NEXT: addq %r8, %rdi
; SSSE3-NEXT: adcq %r9, %rsi
-; SSSE3-NEXT: setb %al
-; SSSE3-NEXT: movzbl %al, %eax
+; SSSE3-NEXT: movd %eax, %xmm1
+; SSSE3-NEXT: sbbl %eax, %eax
; SSSE3-NEXT: movd %eax, %xmm0
-; SSSE3-NEXT: pinsrw $4, %r11d, %xmm0
+; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSSE3-NEXT: movq %rdx, 16(%r10)
; SSSE3-NEXT: movq %rdi, (%r10)
; SSSE3-NEXT: movq %rcx, 24(%r10)
; SSSE3-NEXT: movq %rsi, 8(%r10)
-; SSSE3-NEXT: psllq $63, %xmm0
-; SSSE3-NEXT: psrad $31, %xmm0
-; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
; SSSE3-NEXT: retq
;
; SSE41-LABEL: uaddo_v2i128:
; SSE41-NEXT: movq {{[0-9]+}}(%rsp), %r10
; SSE41-NEXT: addq {{[0-9]+}}(%rsp), %rdx
; SSE41-NEXT: adcq {{[0-9]+}}(%rsp), %rcx
-; SSE41-NEXT: setb %al
-; SSE41-NEXT: movzbl %al, %r11d
+; SSE41-NEXT: sbbl %r11d, %r11d
; SSE41-NEXT: addq %r8, %rdi
; SSE41-NEXT: adcq %r9, %rsi
-; SSE41-NEXT: setb %al
-; SSE41-NEXT: movzbl %al, %eax
+; SSE41-NEXT: sbbl %eax, %eax
; SSE41-NEXT: movd %eax, %xmm0
-; SSE41-NEXT: pinsrb $8, %r11d, %xmm0
+; SSE41-NEXT: pinsrd $1, %r11d, %xmm0
; SSE41-NEXT: movq %rdx, 16(%r10)
; SSE41-NEXT: movq %rdi, (%r10)
; SSE41-NEXT: movq %rcx, 24(%r10)
; SSE41-NEXT: movq %rsi, 8(%r10)
-; SSE41-NEXT: psllq $63, %xmm0
-; SSE41-NEXT: psrad $31, %xmm0
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
; SSE41-NEXT: retq
;
; AVX1-LABEL: uaddo_v2i128:
; AVX1-NEXT: movq {{[0-9]+}}(%rsp), %r10
; AVX1-NEXT: addq {{[0-9]+}}(%rsp), %rdx
; AVX1-NEXT: adcq {{[0-9]+}}(%rsp), %rcx
-; AVX1-NEXT: setb %al
-; AVX1-NEXT: movzbl %al, %r11d
+; AVX1-NEXT: sbbl %r11d, %r11d
; AVX1-NEXT: addq %r8, %rdi
; AVX1-NEXT: adcq %r9, %rsi
-; AVX1-NEXT: setb %al
-; AVX1-NEXT: movzbl %al, %eax
+; AVX1-NEXT: sbbl %eax, %eax
; AVX1-NEXT: vmovd %eax, %xmm0
-; AVX1-NEXT: vpinsrb $8, %r11d, %xmm0, %xmm0
+; AVX1-NEXT: vpinsrd $1, %r11d, %xmm0, %xmm0
; AVX1-NEXT: movq %rdx, 16(%r10)
; AVX1-NEXT: movq %rdi, (%r10)
; AVX1-NEXT: movq %rcx, 24(%r10)
; AVX1-NEXT: movq %rsi, 8(%r10)
-; AVX1-NEXT: vpsllq $63, %xmm0, %xmm0
-; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: uaddo_v2i128:
; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %r10
; AVX2-NEXT: addq {{[0-9]+}}(%rsp), %rdx
; AVX2-NEXT: adcq {{[0-9]+}}(%rsp), %rcx
-; AVX2-NEXT: setb %al
-; AVX2-NEXT: movzbl %al, %r11d
+; AVX2-NEXT: sbbl %r11d, %r11d
; AVX2-NEXT: addq %r8, %rdi
; AVX2-NEXT: adcq %r9, %rsi
-; AVX2-NEXT: setb %al
-; AVX2-NEXT: movzbl %al, %eax
+; AVX2-NEXT: sbbl %eax, %eax
; AVX2-NEXT: vmovd %eax, %xmm0
-; AVX2-NEXT: vpinsrb $8, %r11d, %xmm0, %xmm0
+; AVX2-NEXT: vpinsrd $1, %r11d, %xmm0, %xmm0
; AVX2-NEXT: movq %rdx, 16(%r10)
; AVX2-NEXT: movq %rdi, (%r10)
; AVX2-NEXT: movq %rcx, 24(%r10)
; AVX2-NEXT: movq %rsi, 8(%r10)
-; AVX2-NEXT: vpsllq $63, %xmm0, %xmm0
-; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: uaddo_v2i128:
; AVX512-NEXT: andl $1, %eax
; AVX512-NEXT: kmovw %eax, %k1
; AVX512-NEXT: korw %k0, %k1, %k1
+; AVX512-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
+; AVX512-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z}
; AVX512-NEXT: movq %rdx, 16(%r10)
; AVX512-NEXT: movq %rdi, (%r10)
; AVX512-NEXT: movq %rcx, 24(%r10)
; AVX512-NEXT: movq %rsi, 8(%r10)
-; AVX512-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
-; AVX512-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z}
; AVX512-NEXT: retq
%t = call {<2 x i128>, <2 x i1>} @llvm.uadd.with.overflow.v2i128(<2 x i128> %a0, <2 x i128> %a1)
%val = extractvalue {<2 x i128>, <2 x i1>} %t, 0
define <2 x i32> @umulo_v2i32(<2 x i32> %a0, <2 x i32> %a1, <2 x i32>* %p2) nounwind {
; SSE2-LABEL: umulo_v2i32:
; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [4294967295,0,4294967295,0]
-; SSE2-NEXT: pand %xmm2, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,0,1]
-; SSE2-NEXT: movq %xmm3, %r8
-; SSE2-NEXT: pand %xmm2, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1]
-; SSE2-NEXT: movq %xmm2, %r10
-; SSE2-NEXT: movq %xmm0, %rax
-; SSE2-NEXT: movq %xmm1, %rdx
-; SSE2-NEXT: xorl %esi, %esi
-; SSE2-NEXT: mulq %rdx
-; SSE2-NEXT: movq $-1, %r9
-; SSE2-NEXT: movl $0, %ecx
-; SSE2-NEXT: cmovoq %r9, %rcx
-; SSE2-NEXT: movq %rax, %xmm0
-; SSE2-NEXT: movq %r8, %rax
-; SSE2-NEXT: mulq %r10
-; SSE2-NEXT: movq %rax, %xmm1
-; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,2,2,3]
-; SSE2-NEXT: psrlq $32, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
+; SSE2-NEXT: pmuludq %xmm1, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,3,2,3]
+; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm1[1,1,3,3]
+; SSE2-NEXT: pmuludq %xmm2, %xmm4
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm4[1,3,2,3]
+; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
; SSE2-NEXT: pxor %xmm2, %xmm2
-; SSE2-NEXT: pcmpeqd %xmm0, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,0,3,2]
-; SSE2-NEXT: pand %xmm2, %xmm0
-; SSE2-NEXT: pcmpeqd %xmm2, %xmm2
-; SSE2-NEXT: pxor %xmm0, %xmm2
-; SSE2-NEXT: movq %rcx, %xmm0
-; SSE2-NEXT: cmovoq %r9, %rsi
-; SSE2-NEXT: movq %rsi, %xmm3
-; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm3[0]
-; SSE2-NEXT: por %xmm2, %xmm0
-; SSE2-NEXT: movq %xmm1, (%rdi)
+; SSE2-NEXT: pcmpeqd %xmm3, %xmm2
+; SSE2-NEXT: pcmpeqd %xmm1, %xmm1
+; SSE2-NEXT: pxor %xmm2, %xmm1
+; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1]
+; SSE2-NEXT: movq %xmm0, (%rdi)
+; SSE2-NEXT: movdqa %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSSE3-LABEL: umulo_v2i32:
; SSSE3: # %bb.0:
-; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [4294967295,0,4294967295,0]
-; SSSE3-NEXT: pand %xmm2, %xmm0
-; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,0,1]
-; SSSE3-NEXT: movq %xmm3, %r8
-; SSSE3-NEXT: pand %xmm2, %xmm1
-; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1]
-; SSSE3-NEXT: movq %xmm2, %r10
-; SSSE3-NEXT: movq %xmm0, %rax
-; SSSE3-NEXT: movq %xmm1, %rdx
-; SSSE3-NEXT: xorl %esi, %esi
-; SSSE3-NEXT: mulq %rdx
-; SSSE3-NEXT: movq $-1, %r9
-; SSSE3-NEXT: movl $0, %ecx
-; SSSE3-NEXT: cmovoq %r9, %rcx
-; SSSE3-NEXT: movq %rax, %xmm0
-; SSSE3-NEXT: movq %r8, %rax
-; SSSE3-NEXT: mulq %r10
-; SSSE3-NEXT: movq %rax, %xmm1
-; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,2,2,3]
-; SSSE3-NEXT: psrlq $32, %xmm0
+; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
+; SSSE3-NEXT: pmuludq %xmm1, %xmm0
+; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,3,2,3]
+; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm1[1,1,3,3]
+; SSSE3-NEXT: pmuludq %xmm2, %xmm4
+; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm4[1,3,2,3]
+; SSSE3-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
; SSSE3-NEXT: pxor %xmm2, %xmm2
-; SSSE3-NEXT: pcmpeqd %xmm0, %xmm2
-; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,0,3,2]
-; SSSE3-NEXT: pand %xmm2, %xmm0
-; SSSE3-NEXT: pcmpeqd %xmm2, %xmm2
-; SSSE3-NEXT: pxor %xmm0, %xmm2
-; SSSE3-NEXT: movq %rcx, %xmm0
-; SSSE3-NEXT: cmovoq %r9, %rsi
-; SSSE3-NEXT: movq %rsi, %xmm3
-; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm3[0]
-; SSSE3-NEXT: por %xmm2, %xmm0
-; SSSE3-NEXT: movq %xmm1, (%rdi)
+; SSSE3-NEXT: pcmpeqd %xmm3, %xmm2
+; SSSE3-NEXT: pcmpeqd %xmm1, %xmm1
+; SSSE3-NEXT: pxor %xmm2, %xmm1
+; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1]
+; SSSE3-NEXT: movq %xmm0, (%rdi)
+; SSSE3-NEXT: movdqa %xmm1, %xmm0
; SSSE3-NEXT: retq
;
; SSE41-LABEL: umulo_v2i32:
; SSE41: # %bb.0:
-; SSE41-NEXT: pxor %xmm2, %xmm2
-; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
-; SSE41-NEXT: movq %xmm0, %r8
-; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
-; SSE41-NEXT: movq %xmm1, %rcx
-; SSE41-NEXT: pextrq $1, %xmm0, %rax
-; SSE41-NEXT: pextrq $1, %xmm1, %rdx
-; SSE41-NEXT: xorl %esi, %esi
-; SSE41-NEXT: mulq %rdx
-; SSE41-NEXT: movq %rax, %r9
-; SSE41-NEXT: movq $-1, %r10
-; SSE41-NEXT: movl $0, %eax
-; SSE41-NEXT: cmovoq %r10, %rax
-; SSE41-NEXT: movq %rax, %xmm0
-; SSE41-NEXT: movq %r8, %rax
-; SSE41-NEXT: mulq %rcx
-; SSE41-NEXT: cmovoq %r10, %rsi
-; SSE41-NEXT: movq %rsi, %xmm1
-; SSE41-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
-; SSE41-NEXT: movq %r9, %xmm0
-; SSE41-NEXT: movq %rax, %xmm3
-; SSE41-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm0[0]
-; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,2,2,3]
-; SSE41-NEXT: psrlq $32, %xmm3
-; SSE41-NEXT: pcmpeqq %xmm2, %xmm3
-; SSE41-NEXT: pcmpeqd %xmm0, %xmm0
-; SSE41-NEXT: pxor %xmm3, %xmm0
-; SSE41-NEXT: por %xmm1, %xmm0
-; SSE41-NEXT: movq %xmm4, (%rdi)
+; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
+; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
+; SSE41-NEXT: pmuludq %xmm2, %xmm3
+; SSE41-NEXT: movdqa %xmm0, %xmm2
+; SSE41-NEXT: pmuludq %xmm1, %xmm2
+; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
+; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3],xmm2[4,5],xmm3[6,7]
+; SSE41-NEXT: pxor %xmm3, %xmm3
+; SSE41-NEXT: pcmpeqd %xmm2, %xmm3
+; SSE41-NEXT: pcmpeqd %xmm2, %xmm2
+; SSE41-NEXT: pxor %xmm3, %xmm2
+; SSE41-NEXT: pmulld %xmm1, %xmm0
+; SSE41-NEXT: movq %xmm0, (%rdi)
+; SSE41-NEXT: movdqa %xmm2, %xmm0
; SSE41-NEXT: retq
;
; AVX1-LABEL: umulo_v2i32:
; AVX1: # %bb.0:
-; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
-; AVX1-NEXT: vmovq %xmm0, %r8
-; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
-; AVX1-NEXT: vmovq %xmm1, %rcx
-; AVX1-NEXT: vpextrq $1, %xmm0, %rax
-; AVX1-NEXT: vpextrq $1, %xmm1, %rdx
-; AVX1-NEXT: xorl %esi, %esi
-; AVX1-NEXT: mulq %rdx
-; AVX1-NEXT: movq %rax, %r9
-; AVX1-NEXT: movq $-1, %r10
-; AVX1-NEXT: movl $0, %eax
-; AVX1-NEXT: cmovoq %r10, %rax
-; AVX1-NEXT: vmovq %rax, %xmm0
-; AVX1-NEXT: movq %r8, %rax
-; AVX1-NEXT: mulq %rcx
-; AVX1-NEXT: cmovoq %r10, %rsi
-; AVX1-NEXT: vmovq %rsi, %xmm1
-; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
-; AVX1-NEXT: vmovq %r9, %xmm1
-; AVX1-NEXT: vmovq %rax, %xmm3
-; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm3[0],xmm1[0]
-; AVX1-NEXT: vpsrlq $32, %xmm1, %xmm3
-; AVX1-NEXT: vpcmpeqq %xmm2, %xmm3, %xmm2
+; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
+; AVX1-NEXT: vpmuludq %xmm2, %xmm3, %xmm2
+; AVX1-NEXT: vpmuludq %xmm1, %xmm0, %xmm3
+; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3],xmm3[4,5],xmm2[6,7]
+; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; AVX1-NEXT: vpcmpeqd %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3
; AVX1-NEXT: vpxor %xmm3, %xmm2, %xmm2
-; AVX1-NEXT: vpor %xmm0, %xmm2, %xmm0
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; AVX1-NEXT: vmovq %xmm1, (%rdi)
+; AVX1-NEXT: vpmulld %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vmovq %xmm0, (%rdi)
+; AVX1-NEXT: vmovdqa %xmm2, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: umulo_v2i32:
; AVX2: # %bb.0:
-; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
-; AVX2-NEXT: vmovq %xmm0, %r8
-; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
-; AVX2-NEXT: vmovq %xmm1, %rcx
-; AVX2-NEXT: vpextrq $1, %xmm0, %rax
-; AVX2-NEXT: vpextrq $1, %xmm1, %rdx
-; AVX2-NEXT: xorl %esi, %esi
-; AVX2-NEXT: mulq %rdx
-; AVX2-NEXT: movq %rax, %r9
-; AVX2-NEXT: movq $-1, %r10
-; AVX2-NEXT: movl $0, %eax
-; AVX2-NEXT: cmovoq %r10, %rax
-; AVX2-NEXT: vmovq %rax, %xmm0
-; AVX2-NEXT: movq %r8, %rax
-; AVX2-NEXT: mulq %rcx
-; AVX2-NEXT: cmovoq %r10, %rsi
-; AVX2-NEXT: vmovq %rsi, %xmm1
-; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
-; AVX2-NEXT: vmovq %r9, %xmm1
-; AVX2-NEXT: vmovq %rax, %xmm3
-; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm3[0],xmm1[0]
-; AVX2-NEXT: vpsrlq $32, %xmm1, %xmm3
-; AVX2-NEXT: vpcmpeqq %xmm2, %xmm3, %xmm2
+; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
+; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
+; AVX2-NEXT: vpmuludq %xmm2, %xmm3, %xmm2
+; AVX2-NEXT: vpmuludq %xmm1, %xmm0, %xmm3
+; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
+; AVX2-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0],xmm2[1],xmm3[2],xmm2[3]
+; AVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; AVX2-NEXT: vpcmpeqd %xmm3, %xmm2, %xmm2
; AVX2-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3
; AVX2-NEXT: vpxor %xmm3, %xmm2, %xmm2
-; AVX2-NEXT: vpor %xmm0, %xmm2, %xmm0
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; AVX2-NEXT: vmovq %xmm1, (%rdi)
+; AVX2-NEXT: vpmulld %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vmovq %xmm0, (%rdi)
+; AVX2-NEXT: vmovdqa %xmm2, %xmm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: umulo_v2i32:
; AVX512: # %bb.0:
-; AVX512-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX512-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
-; AVX512-NEXT: vmovq %xmm0, %rcx
-; AVX512-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
-; AVX512-NEXT: vmovq %xmm1, %rsi
-; AVX512-NEXT: vpextrq $1, %xmm0, %rax
-; AVX512-NEXT: vpextrq $1, %xmm1, %rdx
-; AVX512-NEXT: mulq %rdx
-; AVX512-NEXT: seto %r8b
-; AVX512-NEXT: vmovq %rax, %xmm0
-; AVX512-NEXT: movq %rcx, %rax
-; AVX512-NEXT: mulq %rsi
-; AVX512-NEXT: vmovq %rax, %xmm1
-; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
-; AVX512-NEXT: vpsrlq $32, %xmm0, %xmm1
-; AVX512-NEXT: vptestmq %xmm1, %xmm1, %k0
-; AVX512-NEXT: kmovd %r8d, %k1
-; AVX512-NEXT: kshiftlw $1, %k1, %k1
-; AVX512-NEXT: seto %al
-; AVX512-NEXT: andl $1, %eax
-; AVX512-NEXT: kmovw %eax, %k2
-; AVX512-NEXT: korw %k1, %k2, %k1
-; AVX512-NEXT: korw %k1, %k0, %k1
-; AVX512-NEXT: vpmovqd %xmm0, (%rdi)
+; AVX512-NEXT: vpmuludq %xmm1, %xmm0, %xmm2
+; AVX512-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[1,1,3,3]
+; AVX512-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[1,1,3,3]
+; AVX512-NEXT: vpmuludq %xmm3, %xmm4, %xmm3
+; AVX512-NEXT: vmovdqa {{.*#+}} xmm4 = [1,5,3,7]
+; AVX512-NEXT: vpermi2d %xmm3, %xmm2, %xmm4
+; AVX512-NEXT: vptestmd %xmm4, %xmm4, %k1
+; AVX512-NEXT: vpmulld %xmm1, %xmm0, %xmm1
; AVX512-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
-; AVX512-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z}
+; AVX512-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z}
+; AVX512-NEXT: vmovq %xmm1, (%rdi)
; AVX512-NEXT: retq
%t = call {<2 x i32>, <2 x i1>} @llvm.umul.with.overflow.v2i32(<2 x i32> %a0, <2 x i32> %a1)
%val = extractvalue {<2 x i32>, <2 x i1>} %t, 0
; SSE2-LABEL: umulo_v2i64:
; SSE2: # %bb.0:
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
-; SSE2-NEXT: movq %xmm2, %r9
+; SSE2-NEXT: movq %xmm2, %r8
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1]
-; SSE2-NEXT: movq %xmm2, %rsi
+; SSE2-NEXT: movq %xmm2, %r10
; SSE2-NEXT: movq %xmm0, %rax
; SSE2-NEXT: movq %xmm1, %rdx
; SSE2-NEXT: xorl %ecx, %ecx
; SSE2-NEXT: mulq %rdx
-; SSE2-NEXT: movq %rax, %r8
-; SSE2-NEXT: movq $-1, %r10
-; SSE2-NEXT: movl $0, %eax
-; SSE2-NEXT: cmovoq %r10, %rax
+; SSE2-NEXT: movq $-1, %r9
+; SSE2-NEXT: movl $0, %esi
+; SSE2-NEXT: cmovoq %r9, %rsi
+; SSE2-NEXT: movq %rax, %xmm1
+; SSE2-NEXT: movq %r8, %rax
+; SSE2-NEXT: mulq %r10
; SSE2-NEXT: movq %rax, %xmm0
-; SSE2-NEXT: movq %r9, %rax
-; SSE2-NEXT: mulq %rsi
-; SSE2-NEXT: cmovoq %r10, %rcx
-; SSE2-NEXT: movq %rcx, %xmm1
-; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; SSE2-NEXT: movq %r8, %xmm1
-; SSE2-NEXT: movq %rax, %xmm2
-; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
+; SSE2-NEXT: movq %rsi, %xmm0
+; SSE2-NEXT: cmovoq %r9, %rcx
+; SSE2-NEXT: movq %rcx, %xmm2
+; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE2-NEXT: movdqa %xmm1, (%rdi)
; SSE2-NEXT: retq
;
; SSSE3-LABEL: umulo_v2i64:
; SSSE3: # %bb.0:
; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
-; SSSE3-NEXT: movq %xmm2, %r9
+; SSSE3-NEXT: movq %xmm2, %r8
; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1]
-; SSSE3-NEXT: movq %xmm2, %rsi
+; SSSE3-NEXT: movq %xmm2, %r10
; SSSE3-NEXT: movq %xmm0, %rax
; SSSE3-NEXT: movq %xmm1, %rdx
; SSSE3-NEXT: xorl %ecx, %ecx
; SSSE3-NEXT: mulq %rdx
-; SSSE3-NEXT: movq %rax, %r8
-; SSSE3-NEXT: movq $-1, %r10
-; SSSE3-NEXT: movl $0, %eax
-; SSSE3-NEXT: cmovoq %r10, %rax
+; SSSE3-NEXT: movq $-1, %r9
+; SSSE3-NEXT: movl $0, %esi
+; SSSE3-NEXT: cmovoq %r9, %rsi
+; SSSE3-NEXT: movq %rax, %xmm1
+; SSSE3-NEXT: movq %r8, %rax
+; SSSE3-NEXT: mulq %r10
; SSSE3-NEXT: movq %rax, %xmm0
-; SSSE3-NEXT: movq %r9, %rax
-; SSSE3-NEXT: mulq %rsi
-; SSSE3-NEXT: cmovoq %r10, %rcx
-; SSSE3-NEXT: movq %rcx, %xmm1
-; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; SSSE3-NEXT: movq %r8, %xmm1
-; SSSE3-NEXT: movq %rax, %xmm2
-; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
+; SSSE3-NEXT: movq %rsi, %xmm0
+; SSSE3-NEXT: cmovoq %r9, %rcx
+; SSSE3-NEXT: movq %rcx, %xmm2
+; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
+; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSSE3-NEXT: movdqa %xmm1, (%rdi)
; SSSE3-NEXT: retq
;
; SSE41-LABEL: umulo_v2i64:
; SSE41: # %bb.0:
-; SSE41-NEXT: movq %xmm0, %rcx
-; SSE41-NEXT: movq %xmm1, %r9
+; SSE41-NEXT: movq %xmm0, %r10
+; SSE41-NEXT: movq %xmm1, %r8
; SSE41-NEXT: pextrq $1, %xmm0, %rax
; SSE41-NEXT: pextrq $1, %xmm1, %rdx
; SSE41-NEXT: xorl %esi, %esi
; SSE41-NEXT: mulq %rdx
-; SSE41-NEXT: movq %rax, %r8
-; SSE41-NEXT: movq $-1, %r10
-; SSE41-NEXT: movl $0, %eax
-; SSE41-NEXT: cmovoq %r10, %rax
+; SSE41-NEXT: movq $-1, %r9
+; SSE41-NEXT: movl $0, %ecx
+; SSE41-NEXT: cmovoq %r9, %rcx
+; SSE41-NEXT: movq %rax, %xmm0
+; SSE41-NEXT: movq %r10, %rax
+; SSE41-NEXT: mulq %r8
; SSE41-NEXT: movq %rax, %xmm1
-; SSE41-NEXT: movq %rcx, %rax
-; SSE41-NEXT: mulq %r9
-; SSE41-NEXT: cmovoq %r10, %rsi
-; SSE41-NEXT: movq %rsi, %xmm0
-; SSE41-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; SSE41-NEXT: movq %r8, %xmm1
-; SSE41-NEXT: movq %rax, %xmm2
-; SSE41-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm1[0]
-; SSE41-NEXT: movdqa %xmm2, (%rdi)
+; SSE41-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
+; SSE41-NEXT: movq %rcx, %xmm0
+; SSE41-NEXT: cmovoq %r9, %rsi
+; SSE41-NEXT: movq %rsi, %xmm2
+; SSE41-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm0[0]
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
+; SSE41-NEXT: movdqa %xmm1, (%rdi)
; SSE41-NEXT: retq
;
; AVX1-LABEL: umulo_v2i64:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovq %xmm0, %rcx
-; AVX1-NEXT: vmovq %xmm1, %r9
+; AVX1-NEXT: vmovq %xmm0, %r10
+; AVX1-NEXT: vmovq %xmm1, %r8
; AVX1-NEXT: vpextrq $1, %xmm0, %rax
; AVX1-NEXT: vpextrq $1, %xmm1, %rdx
; AVX1-NEXT: xorl %esi, %esi
; AVX1-NEXT: mulq %rdx
-; AVX1-NEXT: movq %rax, %r8
-; AVX1-NEXT: movq $-1, %r10
-; AVX1-NEXT: movl $0, %eax
-; AVX1-NEXT: cmovoq %r10, %rax
+; AVX1-NEXT: movq $-1, %r9
+; AVX1-NEXT: movl $0, %ecx
+; AVX1-NEXT: cmovoq %r9, %rcx
; AVX1-NEXT: vmovq %rax, %xmm0
-; AVX1-NEXT: movq %rcx, %rax
-; AVX1-NEXT: mulq %r9
-; AVX1-NEXT: cmovoq %r10, %rsi
-; AVX1-NEXT: vmovq %rsi, %xmm1
-; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
-; AVX1-NEXT: vmovq %r8, %xmm1
-; AVX1-NEXT: vmovq %rax, %xmm2
-; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
+; AVX1-NEXT: movq %r10, %rax
+; AVX1-NEXT: mulq %r8
+; AVX1-NEXT: vmovq %rax, %xmm1
+; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
+; AVX1-NEXT: vmovq %rcx, %xmm0
+; AVX1-NEXT: cmovoq %r9, %rsi
+; AVX1-NEXT: vmovq %rsi, %xmm2
+; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm0[0]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; AVX1-NEXT: vmovdqa %xmm1, (%rdi)
; AVX1-NEXT: retq
;
; AVX2-LABEL: umulo_v2i64:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovq %xmm0, %rcx
-; AVX2-NEXT: vmovq %xmm1, %r9
+; AVX2-NEXT: vmovq %xmm0, %r10
+; AVX2-NEXT: vmovq %xmm1, %r8
; AVX2-NEXT: vpextrq $1, %xmm0, %rax
; AVX2-NEXT: vpextrq $1, %xmm1, %rdx
; AVX2-NEXT: xorl %esi, %esi
; AVX2-NEXT: mulq %rdx
-; AVX2-NEXT: movq %rax, %r8
-; AVX2-NEXT: movq $-1, %r10
-; AVX2-NEXT: movl $0, %eax
-; AVX2-NEXT: cmovoq %r10, %rax
+; AVX2-NEXT: movq $-1, %r9
+; AVX2-NEXT: movl $0, %ecx
+; AVX2-NEXT: cmovoq %r9, %rcx
; AVX2-NEXT: vmovq %rax, %xmm0
-; AVX2-NEXT: movq %rcx, %rax
-; AVX2-NEXT: mulq %r9
-; AVX2-NEXT: cmovoq %r10, %rsi
-; AVX2-NEXT: vmovq %rsi, %xmm1
-; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
-; AVX2-NEXT: vmovq %r8, %xmm1
-; AVX2-NEXT: vmovq %rax, %xmm2
-; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
+; AVX2-NEXT: movq %r10, %rax
+; AVX2-NEXT: mulq %r8
+; AVX2-NEXT: vmovq %rax, %xmm1
+; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
+; AVX2-NEXT: vmovq %rcx, %xmm0
+; AVX2-NEXT: cmovoq %r9, %rsi
+; AVX2-NEXT: vmovq %rsi, %xmm2
+; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm0[0]
+; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; AVX2-NEXT: vmovdqa %xmm1, (%rdi)
; AVX2-NEXT: retq
;
; AVX512-LABEL: umulo_v2i64:
; AVX512: # %bb.0:
-; AVX512-NEXT: vmovq %xmm0, %rcx
-; AVX512-NEXT: vmovq %xmm1, %rsi
-; AVX512-NEXT: vpextrq $1, %xmm0, %rax
-; AVX512-NEXT: vpextrq $1, %xmm1, %rdx
+; AVX512-NEXT: vpextrq $1, %xmm0, %rcx
+; AVX512-NEXT: vpextrq $1, %xmm1, %r8
+; AVX512-NEXT: vmovq %xmm0, %rax
+; AVX512-NEXT: vmovq %xmm1, %rdx
; AVX512-NEXT: mulq %rdx
-; AVX512-NEXT: movq %rax, %r8
+; AVX512-NEXT: movq %rax, %rsi
+; AVX512-NEXT: seto %r9b
+; AVX512-NEXT: movq %rcx, %rax
+; AVX512-NEXT: mulq %r8
+; AVX512-NEXT: vmovq %rax, %xmm0
+; AVX512-NEXT: vmovq %rsi, %xmm1
+; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; AVX512-NEXT: seto %al
; AVX512-NEXT: kmovd %eax, %k0
-; AVX512-NEXT: kshiftlw $1, %k0, %k0
-; AVX512-NEXT: movq %rcx, %rax
-; AVX512-NEXT: mulq %rsi
-; AVX512-NEXT: seto %cl
-; AVX512-NEXT: andl $1, %ecx
-; AVX512-NEXT: kmovw %ecx, %k1
-; AVX512-NEXT: korw %k0, %k1, %k1
-; AVX512-NEXT: vmovq %r8, %xmm0
-; AVX512-NEXT: vmovq %rax, %xmm1
-; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
-; AVX512-NEXT: vmovdqa %xmm0, (%rdi)
+; AVX512-NEXT: kmovd %r9d, %k1
+; AVX512-NEXT: kshiftrw $1, %k1, %k2
+; AVX512-NEXT: kxorw %k0, %k2, %k0
+; AVX512-NEXT: kshiftlw $15, %k0, %k0
+; AVX512-NEXT: kshiftrw $14, %k0, %k0
+; AVX512-NEXT: kxorw %k0, %k1, %k1
; AVX512-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
-; AVX512-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z}
+; AVX512-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z}
+; AVX512-NEXT: vmovdqa %xmm1, (%rdi)
; AVX512-NEXT: retq
%t = call {<2 x i64>, <2 x i1>} @llvm.umul.with.overflow.v2i64(<2 x i64> %a0, <2 x i64> %a1)
%val = extractvalue {<2 x i64>, <2 x i1>} %t, 0
; SSE2-NEXT: pushq %r13
; SSE2-NEXT: pushq %r12
; SSE2-NEXT: pushq %rbx
-; SSE2-NEXT: movq %rcx, %rax
-; SSE2-NEXT: movq %rdx, %r12
-; SSE2-NEXT: movq %rdi, %r11
+; SSE2-NEXT: movq %r9, %r10
+; SSE2-NEXT: movq %rcx, %r12
+; SSE2-NEXT: movq %rdx, %r11
+; SSE2-NEXT: movq %rsi, %rax
; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %r14
; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %r15
-; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %r10
+; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %r9
; SSE2-NEXT: testq %r10, %r10
-; SSE2-NEXT: setne %dl
-; SSE2-NEXT: testq %rcx, %rcx
-; SSE2-NEXT: setne %r13b
-; SSE2-NEXT: andb %dl, %r13b
-; SSE2-NEXT: mulq %r15
-; SSE2-NEXT: movq %rax, %rdi
-; SSE2-NEXT: seto %bpl
-; SSE2-NEXT: movq %r10, %rax
-; SSE2-NEXT: mulq %r12
-; SSE2-NEXT: movq %rax, %rbx
-; SSE2-NEXT: seto %cl
-; SSE2-NEXT: orb %bpl, %cl
-; SSE2-NEXT: addq %rdi, %rbx
-; SSE2-NEXT: movq %r12, %rax
-; SSE2-NEXT: mulq %r15
-; SSE2-NEXT: movq %rax, %r10
-; SSE2-NEXT: movq %rdx, %r15
-; SSE2-NEXT: addq %rbx, %r15
-; SSE2-NEXT: setb %al
-; SSE2-NEXT: orb %cl, %al
-; SSE2-NEXT: orb %r13b, %al
-; SSE2-NEXT: movzbl %al, %ebp
-; SSE2-NEXT: testq %r9, %r9
-; SSE2-NEXT: setne %al
+; SSE2-NEXT: setne %cl
; SSE2-NEXT: testq %rsi, %rsi
; SSE2-NEXT: setne %r13b
-; SSE2-NEXT: andb %al, %r13b
-; SSE2-NEXT: movq %rsi, %rax
+; SSE2-NEXT: andb %cl, %r13b
; SSE2-NEXT: mulq %r8
; SSE2-NEXT: movq %rax, %rsi
-; SSE2-NEXT: seto %r12b
-; SSE2-NEXT: movq %r9, %rax
-; SSE2-NEXT: mulq %r11
-; SSE2-NEXT: movq %rax, %rdi
+; SSE2-NEXT: seto %bpl
+; SSE2-NEXT: movq %r10, %rax
+; SSE2-NEXT: mulq %rdi
+; SSE2-NEXT: movq %rax, %rcx
; SSE2-NEXT: seto %bl
-; SSE2-NEXT: orb %r12b, %bl
-; SSE2-NEXT: addq %rsi, %rdi
-; SSE2-NEXT: movq %r11, %rax
+; SSE2-NEXT: orb %bpl, %bl
+; SSE2-NEXT: addq %rsi, %rcx
+; SSE2-NEXT: movq %rdi, %rax
; SSE2-NEXT: mulq %r8
-; SSE2-NEXT: addq %rdi, %rdx
+; SSE2-NEXT: movq %rax, %rdi
+; SSE2-NEXT: movq %rdx, %rsi
+; SSE2-NEXT: addq %rcx, %rsi
; SSE2-NEXT: setb %cl
; SSE2-NEXT: orb %bl, %cl
; SSE2-NEXT: orb %r13b, %cl
+; SSE2-NEXT: testq %r9, %r9
+; SSE2-NEXT: setne %al
+; SSE2-NEXT: testq %r12, %r12
+; SSE2-NEXT: setne %r8b
+; SSE2-NEXT: andb %al, %r8b
+; SSE2-NEXT: movq %r12, %rax
+; SSE2-NEXT: mulq %r15
+; SSE2-NEXT: movq %rax, %rbp
+; SSE2-NEXT: seto %r10b
+; SSE2-NEXT: movq %r9, %rax
+; SSE2-NEXT: mulq %r11
+; SSE2-NEXT: movq %rax, %rbx
+; SSE2-NEXT: seto %r9b
+; SSE2-NEXT: orb %r10b, %r9b
+; SSE2-NEXT: addq %rbp, %rbx
+; SSE2-NEXT: movq %r11, %rax
+; SSE2-NEXT: mulq %r15
+; SSE2-NEXT: addq %rbx, %rdx
+; SSE2-NEXT: setb %bl
+; SSE2-NEXT: orb %r9b, %bl
+; SSE2-NEXT: orb %r8b, %bl
+; SSE2-NEXT: movzbl %bl, %ebp
+; SSE2-NEXT: negl %ebp
+; SSE2-NEXT: movd %ebp, %xmm1
; SSE2-NEXT: movzbl %cl, %ecx
+; SSE2-NEXT: negl %ecx
; SSE2-NEXT: movd %ecx, %xmm0
-; SSE2-NEXT: pinsrw $4, %ebp, %xmm0
-; SSE2-NEXT: movq %r10, 16(%r14)
-; SSE2-NEXT: movq %rax, (%r14)
-; SSE2-NEXT: movq %r15, 24(%r14)
-; SSE2-NEXT: movq %rdx, 8(%r14)
-; SSE2-NEXT: psllq $63, %xmm0
-; SSE2-NEXT: psrad $31, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE2-NEXT: movq %rax, 16(%r14)
+; SSE2-NEXT: movq %rdi, (%r14)
+; SSE2-NEXT: movq %rdx, 24(%r14)
+; SSE2-NEXT: movq %rsi, 8(%r14)
; SSE2-NEXT: popq %rbx
; SSE2-NEXT: popq %r12
; SSE2-NEXT: popq %r13
; SSSE3-NEXT: pushq %r13
; SSSE3-NEXT: pushq %r12
; SSSE3-NEXT: pushq %rbx
-; SSSE3-NEXT: movq %rcx, %rax
-; SSSE3-NEXT: movq %rdx, %r12
-; SSSE3-NEXT: movq %rdi, %r11
+; SSSE3-NEXT: movq %r9, %r10
+; SSSE3-NEXT: movq %rcx, %r12
+; SSSE3-NEXT: movq %rdx, %r11
+; SSSE3-NEXT: movq %rsi, %rax
; SSSE3-NEXT: movq {{[0-9]+}}(%rsp), %r14
; SSSE3-NEXT: movq {{[0-9]+}}(%rsp), %r15
-; SSSE3-NEXT: movq {{[0-9]+}}(%rsp), %r10
+; SSSE3-NEXT: movq {{[0-9]+}}(%rsp), %r9
; SSSE3-NEXT: testq %r10, %r10
-; SSSE3-NEXT: setne %dl
-; SSSE3-NEXT: testq %rcx, %rcx
-; SSSE3-NEXT: setne %r13b
-; SSSE3-NEXT: andb %dl, %r13b
-; SSSE3-NEXT: mulq %r15
-; SSSE3-NEXT: movq %rax, %rdi
-; SSSE3-NEXT: seto %bpl
-; SSSE3-NEXT: movq %r10, %rax
-; SSSE3-NEXT: mulq %r12
-; SSSE3-NEXT: movq %rax, %rbx
-; SSSE3-NEXT: seto %cl
-; SSSE3-NEXT: orb %bpl, %cl
-; SSSE3-NEXT: addq %rdi, %rbx
-; SSSE3-NEXT: movq %r12, %rax
-; SSSE3-NEXT: mulq %r15
-; SSSE3-NEXT: movq %rax, %r10
-; SSSE3-NEXT: movq %rdx, %r15
-; SSSE3-NEXT: addq %rbx, %r15
-; SSSE3-NEXT: setb %al
-; SSSE3-NEXT: orb %cl, %al
-; SSSE3-NEXT: orb %r13b, %al
-; SSSE3-NEXT: movzbl %al, %ebp
-; SSSE3-NEXT: testq %r9, %r9
-; SSSE3-NEXT: setne %al
+; SSSE3-NEXT: setne %cl
; SSSE3-NEXT: testq %rsi, %rsi
; SSSE3-NEXT: setne %r13b
-; SSSE3-NEXT: andb %al, %r13b
-; SSSE3-NEXT: movq %rsi, %rax
+; SSSE3-NEXT: andb %cl, %r13b
; SSSE3-NEXT: mulq %r8
; SSSE3-NEXT: movq %rax, %rsi
-; SSSE3-NEXT: seto %r12b
-; SSSE3-NEXT: movq %r9, %rax
-; SSSE3-NEXT: mulq %r11
-; SSSE3-NEXT: movq %rax, %rdi
+; SSSE3-NEXT: seto %bpl
+; SSSE3-NEXT: movq %r10, %rax
+; SSSE3-NEXT: mulq %rdi
+; SSSE3-NEXT: movq %rax, %rcx
; SSSE3-NEXT: seto %bl
-; SSSE3-NEXT: orb %r12b, %bl
-; SSSE3-NEXT: addq %rsi, %rdi
-; SSSE3-NEXT: movq %r11, %rax
+; SSSE3-NEXT: orb %bpl, %bl
+; SSSE3-NEXT: addq %rsi, %rcx
+; SSSE3-NEXT: movq %rdi, %rax
; SSSE3-NEXT: mulq %r8
-; SSSE3-NEXT: addq %rdi, %rdx
+; SSSE3-NEXT: movq %rax, %rdi
+; SSSE3-NEXT: movq %rdx, %rsi
+; SSSE3-NEXT: addq %rcx, %rsi
; SSSE3-NEXT: setb %cl
; SSSE3-NEXT: orb %bl, %cl
; SSSE3-NEXT: orb %r13b, %cl
+; SSSE3-NEXT: testq %r9, %r9
+; SSSE3-NEXT: setne %al
+; SSSE3-NEXT: testq %r12, %r12
+; SSSE3-NEXT: setne %r8b
+; SSSE3-NEXT: andb %al, %r8b
+; SSSE3-NEXT: movq %r12, %rax
+; SSSE3-NEXT: mulq %r15
+; SSSE3-NEXT: movq %rax, %rbp
+; SSSE3-NEXT: seto %r10b
+; SSSE3-NEXT: movq %r9, %rax
+; SSSE3-NEXT: mulq %r11
+; SSSE3-NEXT: movq %rax, %rbx
+; SSSE3-NEXT: seto %r9b
+; SSSE3-NEXT: orb %r10b, %r9b
+; SSSE3-NEXT: addq %rbp, %rbx
+; SSSE3-NEXT: movq %r11, %rax
+; SSSE3-NEXT: mulq %r15
+; SSSE3-NEXT: addq %rbx, %rdx
+; SSSE3-NEXT: setb %bl
+; SSSE3-NEXT: orb %r9b, %bl
+; SSSE3-NEXT: orb %r8b, %bl
+; SSSE3-NEXT: movzbl %bl, %ebp
+; SSSE3-NEXT: negl %ebp
+; SSSE3-NEXT: movd %ebp, %xmm1
; SSSE3-NEXT: movzbl %cl, %ecx
+; SSSE3-NEXT: negl %ecx
; SSSE3-NEXT: movd %ecx, %xmm0
-; SSSE3-NEXT: pinsrw $4, %ebp, %xmm0
-; SSSE3-NEXT: movq %r10, 16(%r14)
-; SSSE3-NEXT: movq %rax, (%r14)
-; SSSE3-NEXT: movq %r15, 24(%r14)
-; SSSE3-NEXT: movq %rdx, 8(%r14)
-; SSSE3-NEXT: psllq $63, %xmm0
-; SSSE3-NEXT: psrad $31, %xmm0
-; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSSE3-NEXT: movq %rax, 16(%r14)
+; SSSE3-NEXT: movq %rdi, (%r14)
+; SSSE3-NEXT: movq %rdx, 24(%r14)
+; SSSE3-NEXT: movq %rsi, 8(%r14)
; SSSE3-NEXT: popq %rbx
; SSSE3-NEXT: popq %r12
; SSSE3-NEXT: popq %r13
; SSE41-NEXT: pushq %r13
; SSE41-NEXT: pushq %r12
; SSE41-NEXT: pushq %rbx
-; SSE41-NEXT: movq %rcx, %rax
-; SSE41-NEXT: movq %rdx, %r12
-; SSE41-NEXT: movq %rdi, %r11
+; SSE41-NEXT: movq %r9, %r10
+; SSE41-NEXT: movq %rcx, %r12
+; SSE41-NEXT: movq %rdx, %r11
+; SSE41-NEXT: movq %rsi, %rax
; SSE41-NEXT: movq {{[0-9]+}}(%rsp), %r14
; SSE41-NEXT: movq {{[0-9]+}}(%rsp), %r15
-; SSE41-NEXT: movq {{[0-9]+}}(%rsp), %r10
+; SSE41-NEXT: movq {{[0-9]+}}(%rsp), %r9
; SSE41-NEXT: testq %r10, %r10
-; SSE41-NEXT: setne %dl
-; SSE41-NEXT: testq %rcx, %rcx
-; SSE41-NEXT: setne %r13b
-; SSE41-NEXT: andb %dl, %r13b
-; SSE41-NEXT: mulq %r15
-; SSE41-NEXT: movq %rax, %rdi
-; SSE41-NEXT: seto %bpl
-; SSE41-NEXT: movq %r10, %rax
-; SSE41-NEXT: mulq %r12
-; SSE41-NEXT: movq %rax, %rbx
-; SSE41-NEXT: seto %cl
-; SSE41-NEXT: orb %bpl, %cl
-; SSE41-NEXT: addq %rdi, %rbx
-; SSE41-NEXT: movq %r12, %rax
-; SSE41-NEXT: mulq %r15
-; SSE41-NEXT: movq %rax, %r10
-; SSE41-NEXT: movq %rdx, %r15
-; SSE41-NEXT: addq %rbx, %r15
-; SSE41-NEXT: setb %al
-; SSE41-NEXT: orb %cl, %al
-; SSE41-NEXT: orb %r13b, %al
-; SSE41-NEXT: movzbl %al, %ebp
-; SSE41-NEXT: testq %r9, %r9
-; SSE41-NEXT: setne %al
+; SSE41-NEXT: setne %cl
; SSE41-NEXT: testq %rsi, %rsi
; SSE41-NEXT: setne %r13b
-; SSE41-NEXT: andb %al, %r13b
-; SSE41-NEXT: movq %rsi, %rax
+; SSE41-NEXT: andb %cl, %r13b
; SSE41-NEXT: mulq %r8
; SSE41-NEXT: movq %rax, %rsi
-; SSE41-NEXT: seto %r12b
-; SSE41-NEXT: movq %r9, %rax
-; SSE41-NEXT: mulq %r11
-; SSE41-NEXT: movq %rax, %rdi
+; SSE41-NEXT: seto %bpl
+; SSE41-NEXT: movq %r10, %rax
+; SSE41-NEXT: mulq %rdi
+; SSE41-NEXT: movq %rax, %rcx
; SSE41-NEXT: seto %bl
-; SSE41-NEXT: orb %r12b, %bl
-; SSE41-NEXT: addq %rsi, %rdi
-; SSE41-NEXT: movq %r11, %rax
+; SSE41-NEXT: orb %bpl, %bl
+; SSE41-NEXT: addq %rsi, %rcx
+; SSE41-NEXT: movq %rdi, %rax
; SSE41-NEXT: mulq %r8
-; SSE41-NEXT: addq %rdi, %rdx
+; SSE41-NEXT: movq %rax, %rdi
+; SSE41-NEXT: movq %rdx, %rsi
+; SSE41-NEXT: addq %rcx, %rsi
; SSE41-NEXT: setb %cl
; SSE41-NEXT: orb %bl, %cl
; SSE41-NEXT: orb %r13b, %cl
+; SSE41-NEXT: testq %r9, %r9
+; SSE41-NEXT: setne %al
+; SSE41-NEXT: testq %r12, %r12
+; SSE41-NEXT: setne %r8b
+; SSE41-NEXT: andb %al, %r8b
+; SSE41-NEXT: movq %r12, %rax
+; SSE41-NEXT: mulq %r15
+; SSE41-NEXT: movq %rax, %rbp
+; SSE41-NEXT: seto %r10b
+; SSE41-NEXT: movq %r9, %rax
+; SSE41-NEXT: mulq %r11
+; SSE41-NEXT: movq %rax, %rbx
+; SSE41-NEXT: seto %r9b
+; SSE41-NEXT: orb %r10b, %r9b
+; SSE41-NEXT: addq %rbp, %rbx
+; SSE41-NEXT: movq %r11, %rax
+; SSE41-NEXT: mulq %r15
+; SSE41-NEXT: addq %rbx, %rdx
+; SSE41-NEXT: setb %bl
+; SSE41-NEXT: orb %r9b, %bl
+; SSE41-NEXT: orb %r8b, %bl
+; SSE41-NEXT: movzbl %bl, %ebp
+; SSE41-NEXT: negl %ebp
; SSE41-NEXT: movzbl %cl, %ecx
+; SSE41-NEXT: negl %ecx
; SSE41-NEXT: movd %ecx, %xmm0
-; SSE41-NEXT: pinsrb $8, %ebp, %xmm0
-; SSE41-NEXT: movq %r10, 16(%r14)
-; SSE41-NEXT: movq %rax, (%r14)
-; SSE41-NEXT: movq %r15, 24(%r14)
-; SSE41-NEXT: movq %rdx, 8(%r14)
-; SSE41-NEXT: psllq $63, %xmm0
-; SSE41-NEXT: psrad $31, %xmm0
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE41-NEXT: pinsrd $1, %ebp, %xmm0
+; SSE41-NEXT: movq %rax, 16(%r14)
+; SSE41-NEXT: movq %rdi, (%r14)
+; SSE41-NEXT: movq %rdx, 24(%r14)
+; SSE41-NEXT: movq %rsi, 8(%r14)
; SSE41-NEXT: popq %rbx
; SSE41-NEXT: popq %r12
; SSE41-NEXT: popq %r13
; AVX1-NEXT: pushq %r13
; AVX1-NEXT: pushq %r12
; AVX1-NEXT: pushq %rbx
-; AVX1-NEXT: movq %rcx, %rax
-; AVX1-NEXT: movq %rdx, %r12
-; AVX1-NEXT: movq %rdi, %r11
+; AVX1-NEXT: movq %r9, %r10
+; AVX1-NEXT: movq %rcx, %r12
+; AVX1-NEXT: movq %rdx, %r11
+; AVX1-NEXT: movq %rsi, %rax
; AVX1-NEXT: movq {{[0-9]+}}(%rsp), %r14
; AVX1-NEXT: movq {{[0-9]+}}(%rsp), %r15
-; AVX1-NEXT: movq {{[0-9]+}}(%rsp), %r10
+; AVX1-NEXT: movq {{[0-9]+}}(%rsp), %r9
; AVX1-NEXT: testq %r10, %r10
-; AVX1-NEXT: setne %dl
-; AVX1-NEXT: testq %rcx, %rcx
+; AVX1-NEXT: setne %cl
+; AVX1-NEXT: testq %rsi, %rsi
; AVX1-NEXT: setne %r13b
-; AVX1-NEXT: andb %dl, %r13b
-; AVX1-NEXT: mulq %r15
-; AVX1-NEXT: movq %rax, %rdi
+; AVX1-NEXT: andb %cl, %r13b
+; AVX1-NEXT: mulq %r8
+; AVX1-NEXT: movq %rax, %rsi
; AVX1-NEXT: seto %bpl
; AVX1-NEXT: movq %r10, %rax
-; AVX1-NEXT: mulq %r12
-; AVX1-NEXT: movq %rax, %rbx
-; AVX1-NEXT: seto %cl
-; AVX1-NEXT: orb %bpl, %cl
-; AVX1-NEXT: addq %rdi, %rbx
-; AVX1-NEXT: movq %r12, %rax
-; AVX1-NEXT: mulq %r15
-; AVX1-NEXT: movq %rax, %r10
-; AVX1-NEXT: movq %rdx, %r15
-; AVX1-NEXT: addq %rbx, %r15
-; AVX1-NEXT: setb %al
-; AVX1-NEXT: orb %cl, %al
-; AVX1-NEXT: orb %r13b, %al
-; AVX1-NEXT: movzbl %al, %ebp
+; AVX1-NEXT: mulq %rdi
+; AVX1-NEXT: movq %rax, %rcx
+; AVX1-NEXT: seto %bl
+; AVX1-NEXT: orb %bpl, %bl
+; AVX1-NEXT: addq %rsi, %rcx
+; AVX1-NEXT: movq %rdi, %rax
+; AVX1-NEXT: mulq %r8
+; AVX1-NEXT: movq %rax, %rdi
+; AVX1-NEXT: movq %rdx, %rsi
+; AVX1-NEXT: addq %rcx, %rsi
+; AVX1-NEXT: setb %cl
+; AVX1-NEXT: orb %bl, %cl
+; AVX1-NEXT: orb %r13b, %cl
; AVX1-NEXT: testq %r9, %r9
; AVX1-NEXT: setne %al
-; AVX1-NEXT: testq %rsi, %rsi
-; AVX1-NEXT: setne %r13b
-; AVX1-NEXT: andb %al, %r13b
-; AVX1-NEXT: movq %rsi, %rax
-; AVX1-NEXT: mulq %r8
-; AVX1-NEXT: movq %rax, %rsi
-; AVX1-NEXT: seto %r12b
+; AVX1-NEXT: testq %r12, %r12
+; AVX1-NEXT: setne %r8b
+; AVX1-NEXT: andb %al, %r8b
+; AVX1-NEXT: movq %r12, %rax
+; AVX1-NEXT: mulq %r15
+; AVX1-NEXT: movq %rax, %rbp
+; AVX1-NEXT: seto %r10b
; AVX1-NEXT: movq %r9, %rax
; AVX1-NEXT: mulq %r11
-; AVX1-NEXT: movq %rax, %rdi
-; AVX1-NEXT: seto %cl
-; AVX1-NEXT: orb %r12b, %cl
-; AVX1-NEXT: addq %rsi, %rdi
+; AVX1-NEXT: movq %rax, %rbx
+; AVX1-NEXT: seto %r9b
+; AVX1-NEXT: orb %r10b, %r9b
+; AVX1-NEXT: addq %rbp, %rbx
; AVX1-NEXT: movq %r11, %rax
-; AVX1-NEXT: mulq %r8
-; AVX1-NEXT: addq %rdi, %rdx
+; AVX1-NEXT: mulq %r15
+; AVX1-NEXT: addq %rbx, %rdx
; AVX1-NEXT: setb %bl
-; AVX1-NEXT: orb %cl, %bl
-; AVX1-NEXT: orb %r13b, %bl
-; AVX1-NEXT: movzbl %bl, %ecx
+; AVX1-NEXT: orb %r9b, %bl
+; AVX1-NEXT: orb %r8b, %bl
+; AVX1-NEXT: movzbl %bl, %ebp
+; AVX1-NEXT: negl %ebp
+; AVX1-NEXT: movzbl %cl, %ecx
+; AVX1-NEXT: negl %ecx
; AVX1-NEXT: vmovd %ecx, %xmm0
-; AVX1-NEXT: vpinsrb $8, %ebp, %xmm0, %xmm0
-; AVX1-NEXT: movq %r10, 16(%r14)
-; AVX1-NEXT: movq %rax, (%r14)
-; AVX1-NEXT: movq %r15, 24(%r14)
-; AVX1-NEXT: movq %rdx, 8(%r14)
-; AVX1-NEXT: vpsllq $63, %xmm0, %xmm0
-; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
+; AVX1-NEXT: vpinsrd $1, %ebp, %xmm0, %xmm0
+; AVX1-NEXT: movq %rax, 16(%r14)
+; AVX1-NEXT: movq %rdi, (%r14)
+; AVX1-NEXT: movq %rdx, 24(%r14)
+; AVX1-NEXT: movq %rsi, 8(%r14)
; AVX1-NEXT: popq %rbx
; AVX1-NEXT: popq %r12
; AVX1-NEXT: popq %r13
; AVX2-NEXT: pushq %r13
; AVX2-NEXT: pushq %r12
; AVX2-NEXT: pushq %rbx
-; AVX2-NEXT: movq %rcx, %rax
-; AVX2-NEXT: movq %rdx, %r12
-; AVX2-NEXT: movq %rdi, %r11
+; AVX2-NEXT: movq %r9, %r10
+; AVX2-NEXT: movq %rcx, %r12
+; AVX2-NEXT: movq %rdx, %r11
+; AVX2-NEXT: movq %rsi, %rax
; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %r14
; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %r15
-; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %r10
+; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %r9
; AVX2-NEXT: testq %r10, %r10
-; AVX2-NEXT: setne %dl
-; AVX2-NEXT: testq %rcx, %rcx
+; AVX2-NEXT: setne %cl
+; AVX2-NEXT: testq %rsi, %rsi
; AVX2-NEXT: setne %r13b
-; AVX2-NEXT: andb %dl, %r13b
-; AVX2-NEXT: mulq %r15
-; AVX2-NEXT: movq %rax, %rdi
+; AVX2-NEXT: andb %cl, %r13b
+; AVX2-NEXT: mulq %r8
+; AVX2-NEXT: movq %rax, %rsi
; AVX2-NEXT: seto %bpl
; AVX2-NEXT: movq %r10, %rax
-; AVX2-NEXT: mulq %r12
-; AVX2-NEXT: movq %rax, %rbx
-; AVX2-NEXT: seto %cl
-; AVX2-NEXT: orb %bpl, %cl
-; AVX2-NEXT: addq %rdi, %rbx
-; AVX2-NEXT: movq %r12, %rax
-; AVX2-NEXT: mulq %r15
-; AVX2-NEXT: movq %rax, %r10
-; AVX2-NEXT: movq %rdx, %r15
-; AVX2-NEXT: addq %rbx, %r15
-; AVX2-NEXT: setb %al
-; AVX2-NEXT: orb %cl, %al
-; AVX2-NEXT: orb %r13b, %al
-; AVX2-NEXT: movzbl %al, %ebp
+; AVX2-NEXT: mulq %rdi
+; AVX2-NEXT: movq %rax, %rcx
+; AVX2-NEXT: seto %bl
+; AVX2-NEXT: orb %bpl, %bl
+; AVX2-NEXT: addq %rsi, %rcx
+; AVX2-NEXT: movq %rdi, %rax
+; AVX2-NEXT: mulq %r8
+; AVX2-NEXT: movq %rax, %rdi
+; AVX2-NEXT: movq %rdx, %rsi
+; AVX2-NEXT: addq %rcx, %rsi
+; AVX2-NEXT: setb %cl
+; AVX2-NEXT: orb %bl, %cl
+; AVX2-NEXT: orb %r13b, %cl
; AVX2-NEXT: testq %r9, %r9
; AVX2-NEXT: setne %al
-; AVX2-NEXT: testq %rsi, %rsi
-; AVX2-NEXT: setne %r13b
-; AVX2-NEXT: andb %al, %r13b
-; AVX2-NEXT: movq %rsi, %rax
-; AVX2-NEXT: mulq %r8
-; AVX2-NEXT: movq %rax, %rsi
-; AVX2-NEXT: seto %r12b
+; AVX2-NEXT: testq %r12, %r12
+; AVX2-NEXT: setne %r8b
+; AVX2-NEXT: andb %al, %r8b
+; AVX2-NEXT: movq %r12, %rax
+; AVX2-NEXT: mulq %r15
+; AVX2-NEXT: movq %rax, %rbp
+; AVX2-NEXT: seto %r10b
; AVX2-NEXT: movq %r9, %rax
; AVX2-NEXT: mulq %r11
-; AVX2-NEXT: movq %rax, %rdi
-; AVX2-NEXT: seto %cl
-; AVX2-NEXT: orb %r12b, %cl
-; AVX2-NEXT: addq %rsi, %rdi
+; AVX2-NEXT: movq %rax, %rbx
+; AVX2-NEXT: seto %r9b
+; AVX2-NEXT: orb %r10b, %r9b
+; AVX2-NEXT: addq %rbp, %rbx
; AVX2-NEXT: movq %r11, %rax
-; AVX2-NEXT: mulq %r8
-; AVX2-NEXT: addq %rdi, %rdx
+; AVX2-NEXT: mulq %r15
+; AVX2-NEXT: addq %rbx, %rdx
; AVX2-NEXT: setb %bl
-; AVX2-NEXT: orb %cl, %bl
-; AVX2-NEXT: orb %r13b, %bl
-; AVX2-NEXT: movzbl %bl, %ecx
+; AVX2-NEXT: orb %r9b, %bl
+; AVX2-NEXT: orb %r8b, %bl
+; AVX2-NEXT: movzbl %bl, %ebp
+; AVX2-NEXT: negl %ebp
+; AVX2-NEXT: movzbl %cl, %ecx
+; AVX2-NEXT: negl %ecx
; AVX2-NEXT: vmovd %ecx, %xmm0
-; AVX2-NEXT: vpinsrb $8, %ebp, %xmm0, %xmm0
-; AVX2-NEXT: movq %r10, 16(%r14)
-; AVX2-NEXT: movq %rax, (%r14)
-; AVX2-NEXT: movq %r15, 24(%r14)
-; AVX2-NEXT: movq %rdx, 8(%r14)
-; AVX2-NEXT: vpsllq $63, %xmm0, %xmm0
-; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
+; AVX2-NEXT: vpinsrd $1, %ebp, %xmm0, %xmm0
+; AVX2-NEXT: movq %rax, 16(%r14)
+; AVX2-NEXT: movq %rdi, (%r14)
+; AVX2-NEXT: movq %rdx, 24(%r14)
+; AVX2-NEXT: movq %rsi, 8(%r14)
; AVX2-NEXT: popq %rbx
; AVX2-NEXT: popq %r12
; AVX2-NEXT: popq %r13
; AVX512-NEXT: andl $1, %esi
; AVX512-NEXT: kmovw %esi, %k1
; AVX512-NEXT: korw %k0, %k1, %k1
+; AVX512-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
+; AVX512-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z}
; AVX512-NEXT: movq %r10, 16(%r14)
; AVX512-NEXT: movq %rax, (%r14)
; AVX512-NEXT: movq %r15, 24(%r14)
; AVX512-NEXT: movq %rdx, 8(%r14)
-; AVX512-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
-; AVX512-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z}
; AVX512-NEXT: popq %rbx
; AVX512-NEXT: popq %r12
; AVX512-NEXT: popq %r13
define <2 x i32> @usubo_v2i32(<2 x i32> %a0, <2 x i32> %a1, <2 x i32>* %p2) nounwind {
; SSE2-LABEL: usubo_v2i32:
; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [4294967295,0,4294967295,0]
-; SSE2-NEXT: pand %xmm2, %xmm1
-; SSE2-NEXT: pand %xmm2, %xmm0
-; SSE2-NEXT: psubq %xmm1, %xmm0
-; SSE2-NEXT: pand %xmm0, %xmm2
-; SSE2-NEXT: pcmpeqd %xmm0, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,0,3,2]
-; SSE2-NEXT: pand %xmm2, %xmm3
-; SSE2-NEXT: pcmpeqd %xmm1, %xmm1
-; SSE2-NEXT: pxor %xmm3, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
+; SSE2-NEXT: movdqa %xmm0, %xmm3
+; SSE2-NEXT: pxor %xmm2, %xmm3
+; SSE2-NEXT: psubd %xmm1, %xmm0
+; SSE2-NEXT: pxor %xmm0, %xmm2
+; SSE2-NEXT: pcmpgtd %xmm3, %xmm2
; SSE2-NEXT: movq %xmm0, (%rdi)
-; SSE2-NEXT: movdqa %xmm1, %xmm0
+; SSE2-NEXT: movdqa %xmm2, %xmm0
; SSE2-NEXT: retq
;
; SSSE3-LABEL: usubo_v2i32:
; SSSE3: # %bb.0:
-; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [4294967295,0,4294967295,0]
-; SSSE3-NEXT: pand %xmm2, %xmm1
-; SSSE3-NEXT: pand %xmm2, %xmm0
-; SSSE3-NEXT: psubq %xmm1, %xmm0
-; SSSE3-NEXT: pand %xmm0, %xmm2
-; SSSE3-NEXT: pcmpeqd %xmm0, %xmm2
-; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,0,3,2]
-; SSSE3-NEXT: pand %xmm2, %xmm3
-; SSSE3-NEXT: pcmpeqd %xmm1, %xmm1
-; SSSE3-NEXT: pxor %xmm3, %xmm1
-; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
+; SSSE3-NEXT: movdqa %xmm0, %xmm3
+; SSSE3-NEXT: pxor %xmm2, %xmm3
+; SSSE3-NEXT: psubd %xmm1, %xmm0
+; SSSE3-NEXT: pxor %xmm0, %xmm2
+; SSSE3-NEXT: pcmpgtd %xmm3, %xmm2
; SSSE3-NEXT: movq %xmm0, (%rdi)
-; SSSE3-NEXT: movdqa %xmm1, %xmm0
+; SSSE3-NEXT: movdqa %xmm2, %xmm0
; SSSE3-NEXT: retq
;
; SSE41-LABEL: usubo_v2i32:
; SSE41: # %bb.0:
-; SSE41-NEXT: pxor %xmm2, %xmm2
-; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
-; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
-; SSE41-NEXT: psubq %xmm1, %xmm0
-; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
-; SSE41-NEXT: pcmpeqq %xmm0, %xmm2
+; SSE41-NEXT: movdqa %xmm0, %xmm2
+; SSE41-NEXT: psubd %xmm1, %xmm2
+; SSE41-NEXT: pminud %xmm2, %xmm0
+; SSE41-NEXT: pcmpeqd %xmm2, %xmm0
; SSE41-NEXT: pcmpeqd %xmm1, %xmm1
-; SSE41-NEXT: pxor %xmm2, %xmm1
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; SSE41-NEXT: movq %xmm0, (%rdi)
-; SSE41-NEXT: movdqa %xmm1, %xmm0
+; SSE41-NEXT: pxor %xmm1, %xmm0
+; SSE41-NEXT: movq %xmm2, (%rdi)
; SSE41-NEXT: retq
;
; AVX1-LABEL: usubo_v2i32:
; AVX1: # %bb.0:
-; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
-; AVX1-NEXT: vpsubq %xmm1, %xmm0, %xmm1
-; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
-; AVX1-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm1
+; AVX1-NEXT: vpminud %xmm0, %xmm1, %xmm0
+; AVX1-NEXT: vpcmpeqd %xmm0, %xmm1, %xmm0
; AVX1-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm0
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; AVX1-NEXT: vmovq %xmm1, (%rdi)
; AVX1-NEXT: retq
;
; AVX2-LABEL: usubo_v2i32:
; AVX2: # %bb.0:
-; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
-; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
-; AVX2-NEXT: vpsubq %xmm1, %xmm0, %xmm1
-; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
-; AVX2-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm1
+; AVX2-NEXT: vpminud %xmm0, %xmm1, %xmm0
+; AVX2-NEXT: vpcmpeqd %xmm0, %xmm1, %xmm0
; AVX2-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm0
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; AVX2-NEXT: vmovq %xmm1, (%rdi)
; AVX2-NEXT: retq
;
; AVX512-LABEL: usubo_v2i32:
; AVX512: # %bb.0:
-; AVX512-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX512-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
-; AVX512-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
-; AVX512-NEXT: vpsubq %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpblendd {{.*#+}} xmm1 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
-; AVX512-NEXT: vpmovqd %xmm0, (%rdi)
-; AVX512-NEXT: vpcmpeqq %xmm0, %xmm1, %xmm0
-; AVX512-NEXT: vpternlogq $15, %xmm0, %xmm0, %xmm0
+; AVX512-NEXT: vpsubd %xmm1, %xmm0, %xmm1
+; AVX512-NEXT: vpcmpnleud %xmm0, %xmm1, %k1
+; AVX512-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
+; AVX512-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z}
+; AVX512-NEXT: vmovq %xmm1, (%rdi)
; AVX512-NEXT: retq
%t = call {<2 x i32>, <2 x i1>} @llvm.usub.with.overflow.v2i32(<2 x i32> %a0, <2 x i32> %a1)
%val = extractvalue {<2 x i32>, <2 x i1>} %t, 0
; SSE-NEXT: pxor %xmm0, %xmm2
; SSE-NEXT: movdqa %xmm2, %xmm1
; SSE-NEXT: pcmpgtd %xmm3, %xmm1
-; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm1[0,0,2,2]
; SSE-NEXT: pcmpeqd %xmm3, %xmm2
; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; SSE-NEXT: pand %xmm4, %xmm2
+; SSE-NEXT: pand %xmm1, %xmm2
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; SSE-NEXT: por %xmm2, %xmm1
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SSE-NEXT: movdqa %xmm0, (%rdi)
; SSE-NEXT: movdqa %xmm1, %xmm0
; SSE-NEXT: retq
; AVX1-NEXT: vpsubq %xmm1, %xmm0, %xmm1
; AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm0
; AVX1-NEXT: vpcmpgtq %xmm3, %xmm0, %xmm0
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; AVX1-NEXT: vmovdqa %xmm1, (%rdi)
; AVX1-NEXT: retq
;
; AVX2-NEXT: vpsubq %xmm1, %xmm0, %xmm1
; AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm0
; AVX2-NEXT: vpcmpgtq %xmm3, %xmm0, %xmm0
+; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; AVX2-NEXT: vmovdqa %xmm1, (%rdi)
; AVX2-NEXT: retq
;
; AVX512: # %bb.0:
; AVX512-NEXT: vpsubq %xmm1, %xmm0, %xmm1
; AVX512-NEXT: vpcmpnleuq %xmm0, %xmm1, %k1
-; AVX512-NEXT: vmovdqa %xmm1, (%rdi)
; AVX512-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
-; AVX512-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z}
+; AVX512-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z}
+; AVX512-NEXT: vmovdqa %xmm1, (%rdi)
; AVX512-NEXT: retq
%t = call {<2 x i64>, <2 x i1>} @llvm.usub.with.overflow.v2i64(<2 x i64> %a0, <2 x i64> %a1)
%val = extractvalue {<2 x i64>, <2 x i1>} %t, 0
; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %r10
; SSE2-NEXT: subq {{[0-9]+}}(%rsp), %rdx
; SSE2-NEXT: sbbq {{[0-9]+}}(%rsp), %rcx
-; SSE2-NEXT: setb %al
-; SSE2-NEXT: movzbl %al, %r11d
+; SSE2-NEXT: sbbl %eax, %eax
; SSE2-NEXT: subq %r8, %rdi
; SSE2-NEXT: sbbq %r9, %rsi
-; SSE2-NEXT: setb %al
-; SSE2-NEXT: movzbl %al, %eax
+; SSE2-NEXT: movd %eax, %xmm1
+; SSE2-NEXT: sbbl %eax, %eax
; SSE2-NEXT: movd %eax, %xmm0
-; SSE2-NEXT: pinsrw $4, %r11d, %xmm0
+; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE2-NEXT: movq %rdx, 16(%r10)
; SSE2-NEXT: movq %rdi, (%r10)
; SSE2-NEXT: movq %rcx, 24(%r10)
; SSE2-NEXT: movq %rsi, 8(%r10)
-; SSE2-NEXT: psllq $63, %xmm0
-; SSE2-NEXT: psrad $31, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
; SSE2-NEXT: retq
;
; SSSE3-LABEL: usubo_v2i128:
; SSSE3-NEXT: movq {{[0-9]+}}(%rsp), %r10
; SSSE3-NEXT: subq {{[0-9]+}}(%rsp), %rdx
; SSSE3-NEXT: sbbq {{[0-9]+}}(%rsp), %rcx
-; SSSE3-NEXT: setb %al
-; SSSE3-NEXT: movzbl %al, %r11d
+; SSSE3-NEXT: sbbl %eax, %eax
; SSSE3-NEXT: subq %r8, %rdi
; SSSE3-NEXT: sbbq %r9, %rsi
-; SSSE3-NEXT: setb %al
-; SSSE3-NEXT: movzbl %al, %eax
+; SSSE3-NEXT: movd %eax, %xmm1
+; SSSE3-NEXT: sbbl %eax, %eax
; SSSE3-NEXT: movd %eax, %xmm0
-; SSSE3-NEXT: pinsrw $4, %r11d, %xmm0
+; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSSE3-NEXT: movq %rdx, 16(%r10)
; SSSE3-NEXT: movq %rdi, (%r10)
; SSSE3-NEXT: movq %rcx, 24(%r10)
; SSSE3-NEXT: movq %rsi, 8(%r10)
-; SSSE3-NEXT: psllq $63, %xmm0
-; SSSE3-NEXT: psrad $31, %xmm0
-; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
; SSSE3-NEXT: retq
;
; SSE41-LABEL: usubo_v2i128:
; SSE41-NEXT: movq {{[0-9]+}}(%rsp), %r10
; SSE41-NEXT: subq {{[0-9]+}}(%rsp), %rdx
; SSE41-NEXT: sbbq {{[0-9]+}}(%rsp), %rcx
-; SSE41-NEXT: setb %al
-; SSE41-NEXT: movzbl %al, %r11d
+; SSE41-NEXT: sbbl %r11d, %r11d
; SSE41-NEXT: subq %r8, %rdi
; SSE41-NEXT: sbbq %r9, %rsi
-; SSE41-NEXT: setb %al
-; SSE41-NEXT: movzbl %al, %eax
+; SSE41-NEXT: sbbl %eax, %eax
; SSE41-NEXT: movd %eax, %xmm0
-; SSE41-NEXT: pinsrb $8, %r11d, %xmm0
+; SSE41-NEXT: pinsrd $1, %r11d, %xmm0
; SSE41-NEXT: movq %rdx, 16(%r10)
; SSE41-NEXT: movq %rdi, (%r10)
; SSE41-NEXT: movq %rcx, 24(%r10)
; SSE41-NEXT: movq %rsi, 8(%r10)
-; SSE41-NEXT: psllq $63, %xmm0
-; SSE41-NEXT: psrad $31, %xmm0
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
; SSE41-NEXT: retq
;
; AVX1-LABEL: usubo_v2i128:
; AVX1-NEXT: movq {{[0-9]+}}(%rsp), %r10
; AVX1-NEXT: subq {{[0-9]+}}(%rsp), %rdx
; AVX1-NEXT: sbbq {{[0-9]+}}(%rsp), %rcx
-; AVX1-NEXT: setb %al
-; AVX1-NEXT: movzbl %al, %r11d
+; AVX1-NEXT: sbbl %r11d, %r11d
; AVX1-NEXT: subq %r8, %rdi
; AVX1-NEXT: sbbq %r9, %rsi
-; AVX1-NEXT: setb %al
-; AVX1-NEXT: movzbl %al, %eax
+; AVX1-NEXT: sbbl %eax, %eax
; AVX1-NEXT: vmovd %eax, %xmm0
-; AVX1-NEXT: vpinsrb $8, %r11d, %xmm0, %xmm0
+; AVX1-NEXT: vpinsrd $1, %r11d, %xmm0, %xmm0
; AVX1-NEXT: movq %rdx, 16(%r10)
; AVX1-NEXT: movq %rdi, (%r10)
; AVX1-NEXT: movq %rcx, 24(%r10)
; AVX1-NEXT: movq %rsi, 8(%r10)
-; AVX1-NEXT: vpsllq $63, %xmm0, %xmm0
-; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: usubo_v2i128:
; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %r10
; AVX2-NEXT: subq {{[0-9]+}}(%rsp), %rdx
; AVX2-NEXT: sbbq {{[0-9]+}}(%rsp), %rcx
-; AVX2-NEXT: setb %al
-; AVX2-NEXT: movzbl %al, %r11d
+; AVX2-NEXT: sbbl %r11d, %r11d
; AVX2-NEXT: subq %r8, %rdi
; AVX2-NEXT: sbbq %r9, %rsi
-; AVX2-NEXT: setb %al
-; AVX2-NEXT: movzbl %al, %eax
+; AVX2-NEXT: sbbl %eax, %eax
; AVX2-NEXT: vmovd %eax, %xmm0
-; AVX2-NEXT: vpinsrb $8, %r11d, %xmm0, %xmm0
+; AVX2-NEXT: vpinsrd $1, %r11d, %xmm0, %xmm0
; AVX2-NEXT: movq %rdx, 16(%r10)
; AVX2-NEXT: movq %rdi, (%r10)
; AVX2-NEXT: movq %rcx, 24(%r10)
; AVX2-NEXT: movq %rsi, 8(%r10)
-; AVX2-NEXT: vpsllq $63, %xmm0, %xmm0
-; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: usubo_v2i128:
; AVX512-NEXT: andl $1, %eax
; AVX512-NEXT: kmovw %eax, %k1
; AVX512-NEXT: korw %k0, %k1, %k1
+; AVX512-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
+; AVX512-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z}
; AVX512-NEXT: movq %rdx, 16(%r10)
; AVX512-NEXT: movq %rdi, (%r10)
; AVX512-NEXT: movq %rcx, 24(%r10)
; AVX512-NEXT: movq %rsi, 8(%r10)
-; AVX512-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
-; AVX512-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z}
; AVX512-NEXT: retq
%t = call {<2 x i128>, <2 x i1>} @llvm.usub.with.overflow.v2i128(<2 x i128> %a0, <2 x i128> %a1)
%val = extractvalue {<2 x i128>, <2 x i1>} %t, 0
define <4 x i8> @vsel_4xi8(<4 x i8> %v1, <4 x i8> %v2) {
; SSE2-LABEL: vsel_4xi8:
; SSE2: # %bb.0: # %entry
-; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[3,0]
-; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,2]
+; SSE2-NEXT: movaps {{.*#+}} xmm2 = [255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; SSE2-NEXT: andps %xmm2, %xmm0
+; SSE2-NEXT: andnps %xmm1, %xmm2
+; SSE2-NEXT: orps %xmm2, %xmm0
; SSE2-NEXT: retq
;
; SSSE3-LABEL: vsel_4xi8:
; SSSE3: # %bb.0: # %entry
-; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[3,0]
-; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,2]
+; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,2,5,6,u,u,u,u,u,u,u,u,u,u,u,u]
; SSSE3-NEXT: retq
;
; SSE41-LABEL: vsel_4xi8:
; SSE41: # %bb.0: # %entry
-; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3]
+; SSE41-NEXT: movdqa %xmm0, %xmm2
+; SSE41-NEXT: movaps {{.*#+}} xmm0 = <255,255,0,255,u,u,u,u,u,u,u,u,u,u,u,u>
+; SSE41-NEXT: pblendvb %xmm0, %xmm2, %xmm1
+; SSE41-NEXT: movdqa %xmm1, %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: vsel_4xi8:
; AVX: # %bb.0: # %entry
-; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3]
+; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = <255,255,0,255,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX-NEXT: vpblendvb %xmm2, %xmm0, %xmm1, %xmm0
; AVX-NEXT: retq
entry:
%vsel = select <4 x i1> <i1 true, i1 true, i1 false, i1 true>, <4 x i8> %v1, <4 x i8> %v2
define <4 x i16> @vsel_4xi16(<4 x i16> %v1, <4 x i16> %v2) {
; SSE2-LABEL: vsel_4xi16:
; SSE2: # %bb.0: # %entry
-; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,0],xmm0[0,0]
-; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[2,3]
-; SSE2-NEXT: movaps %xmm1, %xmm0
+; SSE2-NEXT: movaps {{.*#+}} xmm2 = [65535,0,65535,65535,65535,65535,65535,65535]
+; SSE2-NEXT: andps %xmm2, %xmm0
+; SSE2-NEXT: andnps %xmm1, %xmm2
+; SSE2-NEXT: orps %xmm2, %xmm0
; SSE2-NEXT: retq
;
; SSSE3-LABEL: vsel_4xi16:
; SSSE3: # %bb.0: # %entry
-; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,0],xmm0[0,0]
-; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[2,3]
-; SSSE3-NEXT: movaps %xmm1, %xmm0
+; SSSE3-NEXT: movaps {{.*#+}} xmm2 = [65535,0,65535,65535,65535,65535,65535,65535]
+; SSSE3-NEXT: andps %xmm2, %xmm0
+; SSSE3-NEXT: andnps %xmm1, %xmm2
+; SSSE3-NEXT: orps %xmm2, %xmm0
; SSSE3-NEXT: retq
;
; SSE41-LABEL: vsel_4xi16:
; SSE41: # %bb.0: # %entry
-; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
+; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3,4,5,6,7]
; SSE41-NEXT: retq
;
; AVX-LABEL: vsel_4xi16:
; AVX: # %bb.0: # %entry
-; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
+; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3,4,5,6,7]
; AVX-NEXT: retq
entry:
%vsel = select <4 x i1> <i1 true, i1 false, i1 true, i1 true>, <4 x i16> %v1, <4 x i16> %v2
define <8 x i16> @zext_and_v8i16(<8 x i8> %x, <8 x i8> %y) {
; SSE2-LABEL: zext_and_v8i16:
; SSE2: # %bb.0:
-; SSE2-NEXT: andps %xmm1, %xmm0
-; SSE2-NEXT: andps {{.*}}(%rip), %xmm0
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE2-NEXT: pxor %xmm2, %xmm2
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
+; SSE2-NEXT: pand %xmm1, %xmm0
; SSE2-NEXT: retq
;
; AVX2-LABEL: zext_and_v8i16:
; AVX2: # %bb.0:
-; AVX2-NEXT: vandps %xmm0, %xmm1, %xmm0
-; AVX2-NEXT: vandps {{.*}}(%rip), %xmm0, %xmm0
+; AVX2-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX2-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
+; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX2-NEXT: retq
%xz = zext <8 x i8> %x to <8 x i16>
%yz = zext <8 x i8> %y to <8 x i16>
define <8 x i16> @zext_or_v8i16(<8 x i8> %x, <8 x i8> %y) {
; SSE2-LABEL: zext_or_v8i16:
; SSE2: # %bb.0:
-; SSE2-NEXT: orps %xmm1, %xmm0
-; SSE2-NEXT: andps {{.*}}(%rip), %xmm0
+; SSE2-NEXT: pxor %xmm2, %xmm2
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
+; SSE2-NEXT: por %xmm1, %xmm0
; SSE2-NEXT: retq
;
; AVX2-LABEL: zext_or_v8i16:
; AVX2: # %bb.0:
-; AVX2-NEXT: vorps %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vandps {{.*}}(%rip), %xmm0, %xmm0
+; AVX2-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX2-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
+; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX2-NEXT: retq
%xz = zext <8 x i8> %x to <8 x i16>
%yz = zext <8 x i8> %y to <8 x i16>
define <8 x i16> @zext_xor_v8i16(<8 x i8> %x, <8 x i8> %y) {
; SSE2-LABEL: zext_xor_v8i16:
; SSE2: # %bb.0:
-; SSE2-NEXT: xorps %xmm1, %xmm0
-; SSE2-NEXT: andps {{.*}}(%rip), %xmm0
+; SSE2-NEXT: pxor %xmm2, %xmm2
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
+; SSE2-NEXT: pxor %xmm1, %xmm0
; SSE2-NEXT: retq
;
; AVX2-LABEL: zext_xor_v8i16:
; AVX2: # %bb.0:
-; AVX2-NEXT: vxorps %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vandps {{.*}}(%rip), %xmm0, %xmm0
+; AVX2-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX2-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
+; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0
; AVX2-NEXT: retq
%xz = zext <8 x i8> %x to <8 x i16>
%yz = zext <8 x i8> %y to <8 x i16>
define <8 x i16> @sext_and_v8i16(<8 x i8> %x, <8 x i8> %y) {
; SSE2-LABEL: sext_and_v8i16:
; SSE2: # %bb.0:
-; SSE2-NEXT: psllw $8, %xmm0
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
+; SSE2-NEXT: psraw $8, %xmm2
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; SSE2-NEXT: psraw $8, %xmm0
-; SSE2-NEXT: psllw $8, %xmm1
-; SSE2-NEXT: psraw $8, %xmm1
-; SSE2-NEXT: pand %xmm1, %xmm0
+; SSE2-NEXT: pand %xmm2, %xmm0
; SSE2-NEXT: retq
;
; AVX2-LABEL: sext_and_v8i16:
; AVX2: # %bb.0:
-; AVX2-NEXT: vpsllw $8, %xmm0, %xmm0
-; AVX2-NEXT: vpsraw $8, %xmm0, %xmm0
-; AVX2-NEXT: vpsllw $8, %xmm1, %xmm1
-; AVX2-NEXT: vpsraw $8, %xmm1, %xmm1
+; AVX2-NEXT: vpmovsxbw %xmm0, %xmm0
+; AVX2-NEXT: vpmovsxbw %xmm1, %xmm1
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX2-NEXT: retq
%xs = sext <8 x i8> %x to <8 x i16>
define <8 x i16> @sext_or_v8i16(<8 x i8> %x, <8 x i8> %y) {
; SSE2-LABEL: sext_or_v8i16:
; SSE2: # %bb.0:
-; SSE2-NEXT: psllw $8, %xmm0
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
+; SSE2-NEXT: psraw $8, %xmm2
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; SSE2-NEXT: psraw $8, %xmm0
-; SSE2-NEXT: psllw $8, %xmm1
-; SSE2-NEXT: psraw $8, %xmm1
-; SSE2-NEXT: por %xmm1, %xmm0
+; SSE2-NEXT: por %xmm2, %xmm0
; SSE2-NEXT: retq
;
; AVX2-LABEL: sext_or_v8i16:
; AVX2: # %bb.0:
-; AVX2-NEXT: vpsllw $8, %xmm0, %xmm0
-; AVX2-NEXT: vpsraw $8, %xmm0, %xmm0
-; AVX2-NEXT: vpsllw $8, %xmm1, %xmm1
-; AVX2-NEXT: vpsraw $8, %xmm1, %xmm1
+; AVX2-NEXT: vpmovsxbw %xmm0, %xmm0
+; AVX2-NEXT: vpmovsxbw %xmm1, %xmm1
; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX2-NEXT: retq
%xs = sext <8 x i8> %x to <8 x i16>
define <8 x i16> @sext_xor_v8i16(<8 x i8> %x, <8 x i8> %y) {
; SSE2-LABEL: sext_xor_v8i16:
; SSE2: # %bb.0:
-; SSE2-NEXT: psllw $8, %xmm0
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
+; SSE2-NEXT: psraw $8, %xmm2
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; SSE2-NEXT: psraw $8, %xmm0
-; SSE2-NEXT: psllw $8, %xmm1
-; SSE2-NEXT: psraw $8, %xmm1
-; SSE2-NEXT: pxor %xmm1, %xmm0
+; SSE2-NEXT: pxor %xmm2, %xmm0
; SSE2-NEXT: retq
;
; AVX2-LABEL: sext_xor_v8i16:
; AVX2: # %bb.0:
-; AVX2-NEXT: vpsllw $8, %xmm0, %xmm0
-; AVX2-NEXT: vpsraw $8, %xmm0, %xmm0
-; AVX2-NEXT: vpsllw $8, %xmm1, %xmm1
-; AVX2-NEXT: vpsraw $8, %xmm1, %xmm1
+; AVX2-NEXT: vpmovsxbw %xmm0, %xmm0
+; AVX2-NEXT: vpmovsxbw %xmm1, %xmm1
; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0
; AVX2-NEXT: retq
%xs = sext <8 x i8> %x to <8 x i16>
define <8 x i32> @bool_zext_or(<8 x i1> %x, <8 x i1> %y) {
; SSE2-LABEL: bool_zext_or:
; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [1,1,1,1,1,1,1,1]
-; SSE2-NEXT: pand %xmm2, %xmm0
-; SSE2-NEXT: pxor %xmm3, %xmm3
-; SSE2-NEXT: movdqa %xmm0, %xmm4
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3]
-; SSE2-NEXT: pand %xmm2, %xmm1
-; SSE2-NEXT: movdqa %xmm1, %xmm2
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7]
-; SSE2-NEXT: por %xmm4, %xmm2
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3]
-; SSE2-NEXT: por %xmm1, %xmm0
+; SSE2-NEXT: movdqa %xmm0, %xmm2
+; SSE2-NEXT: por %xmm1, %xmm2
+; SSE2-NEXT: pand {{.*}}(%rip), %xmm2
+; SSE2-NEXT: pxor %xmm1, %xmm1
+; SSE2-NEXT: movdqa %xmm2, %xmm0
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
; SSE2-NEXT: movdqa %xmm2, %xmm1
; SSE2-NEXT: retq
;
define <8 x i32> @bool_zext_xor(<8 x i1> %x, <8 x i1> %y) {
; SSE2-LABEL: bool_zext_xor:
; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [1,1,1,1,1,1,1,1]
-; SSE2-NEXT: pand %xmm2, %xmm0
-; SSE2-NEXT: pxor %xmm3, %xmm3
-; SSE2-NEXT: movdqa %xmm0, %xmm4
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3]
-; SSE2-NEXT: pand %xmm2, %xmm1
-; SSE2-NEXT: movdqa %xmm1, %xmm2
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7]
-; SSE2-NEXT: pxor %xmm4, %xmm2
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3]
-; SSE2-NEXT: pxor %xmm1, %xmm0
+; SSE2-NEXT: movdqa %xmm0, %xmm2
+; SSE2-NEXT: pxor %xmm1, %xmm2
+; SSE2-NEXT: pand {{.*}}(%rip), %xmm2
+; SSE2-NEXT: pxor %xmm1, %xmm1
+; SSE2-NEXT: movdqa %xmm2, %xmm0
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
; SSE2-NEXT: movdqa %xmm2, %xmm1
; SSE2-NEXT: retq
;
define <4 x i8*> @AGEP5(<4 x i8*> %param, <4 x i8> %off) nounwind {
; CHECK-LABEL: AGEP5:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpslld $24, %xmm1, %xmm1
-; CHECK-NEXT: vpsrad $24, %xmm1, %xmm1
+; CHECK-NEXT: vpmovsxbd %xmm1, %xmm1
; CHECK-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; CHECK-NEXT: retl
%A = getelementptr i8, <4 x i8*> %param, <4 x i8> %off
define <4 x float> @cvt_4i16_to_4f32(<4 x i16> %a0) nounwind {
; ALL-LABEL: cvt_4i16_to_4f32:
; ALL: # %bb.0:
-; ALL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
; ALL-NEXT: vmovq %xmm0, %rax
; ALL-NEXT: movq %rax, %rcx
; ALL-NEXT: movq %rax, %rdx
}
define <2 x double> @cvt_2i16_to_2f64(<2 x i16> %a0) nounwind {
-; AVX1-LABEL: cvt_2i16_to_2f64:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
-; AVX1-NEXT: vmovd %xmm0, %eax
-; AVX1-NEXT: movswl %ax, %ecx
-; AVX1-NEXT: shrl $16, %eax
-; AVX1-NEXT: cwtl
-; AVX1-NEXT: vmovd %eax, %xmm0
-; AVX1-NEXT: vcvtph2ps %xmm0, %xmm0
-; AVX1-NEXT: vmovd %ecx, %xmm1
-; AVX1-NEXT: vcvtph2ps %xmm1, %xmm1
-; AVX1-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1
-; AVX1-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0
-; AVX1-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
-; AVX1-NEXT: retq
-;
-; AVX2-SLOW-LABEL: cvt_2i16_to_2f64:
-; AVX2-SLOW: # %bb.0:
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
-; AVX2-SLOW-NEXT: vmovd %xmm0, %eax
-; AVX2-SLOW-NEXT: movswl %ax, %ecx
-; AVX2-SLOW-NEXT: shrl $16, %eax
-; AVX2-SLOW-NEXT: cwtl
-; AVX2-SLOW-NEXT: vmovd %eax, %xmm0
-; AVX2-SLOW-NEXT: vcvtph2ps %xmm0, %xmm0
-; AVX2-SLOW-NEXT: vmovd %ecx, %xmm1
-; AVX2-SLOW-NEXT: vcvtph2ps %xmm1, %xmm1
-; AVX2-SLOW-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1
-; AVX2-SLOW-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0
-; AVX2-SLOW-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
-; AVX2-SLOW-NEXT: retq
-;
-; AVX2-FAST-LABEL: cvt_2i16_to_2f64:
-; AVX2-FAST: # %bb.0:
-; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,8,9,8,9,10,11,8,9,10,11,12,13,14,15]
-; AVX2-FAST-NEXT: vmovd %xmm0, %eax
-; AVX2-FAST-NEXT: movswl %ax, %ecx
-; AVX2-FAST-NEXT: shrl $16, %eax
-; AVX2-FAST-NEXT: cwtl
-; AVX2-FAST-NEXT: vmovd %eax, %xmm0
-; AVX2-FAST-NEXT: vcvtph2ps %xmm0, %xmm0
-; AVX2-FAST-NEXT: vmovd %ecx, %xmm1
-; AVX2-FAST-NEXT: vcvtph2ps %xmm1, %xmm1
-; AVX2-FAST-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1
-; AVX2-FAST-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0
-; AVX2-FAST-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
-; AVX2-FAST-NEXT: retq
-;
-; AVX512F-LABEL: cvt_2i16_to_2f64:
-; AVX512F: # %bb.0:
-; AVX512F-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; AVX512F-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
-; AVX512F-NEXT: vmovd %xmm0, %eax
-; AVX512F-NEXT: movswl %ax, %ecx
-; AVX512F-NEXT: shrl $16, %eax
-; AVX512F-NEXT: cwtl
-; AVX512F-NEXT: vmovd %eax, %xmm0
-; AVX512F-NEXT: vcvtph2ps %xmm0, %xmm0
-; AVX512F-NEXT: vmovd %ecx, %xmm1
-; AVX512F-NEXT: vcvtph2ps %xmm1, %xmm1
-; AVX512F-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1
-; AVX512F-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0
-; AVX512F-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
-; AVX512F-NEXT: retq
-;
-; AVX512VL-LABEL: cvt_2i16_to_2f64:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpmovqw %xmm0, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT: movl -{{[0-9]+}}(%rsp), %eax
-; AVX512VL-NEXT: movswl %ax, %ecx
-; AVX512VL-NEXT: shrl $16, %eax
-; AVX512VL-NEXT: cwtl
-; AVX512VL-NEXT: vmovd %eax, %xmm0
-; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0
-; AVX512VL-NEXT: vmovd %ecx, %xmm1
-; AVX512VL-NEXT: vcvtph2ps %xmm1, %xmm1
-; AVX512VL-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1
-; AVX512VL-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0
-; AVX512VL-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
-; AVX512VL-NEXT: retq
+; ALL-LABEL: cvt_2i16_to_2f64:
+; ALL: # %bb.0:
+; ALL-NEXT: vmovd %xmm0, %eax
+; ALL-NEXT: movswl %ax, %ecx
+; ALL-NEXT: shrl $16, %eax
+; ALL-NEXT: cwtl
+; ALL-NEXT: vmovd %eax, %xmm0
+; ALL-NEXT: vcvtph2ps %xmm0, %xmm0
+; ALL-NEXT: vmovd %ecx, %xmm1
+; ALL-NEXT: vcvtph2ps %xmm1, %xmm1
+; ALL-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1
+; ALL-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0
+; ALL-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; ALL-NEXT: retq
%1 = bitcast <2 x i16> %a0 to <2 x half>
%2 = fpext <2 x half> %1 to <2 x double>
ret <2 x double> %2
define <4 x double> @cvt_4i16_to_4f64(<4 x i16> %a0) nounwind {
; ALL-LABEL: cvt_4i16_to_4f64:
; ALL: # %bb.0:
-; ALL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
; ALL-NEXT: vmovq %xmm0, %rax
; ALL-NEXT: movq %rax, %rcx
; ALL-NEXT: movl %eax, %edx
; ALL-NEXT: vcvtps2ph $4, %xmm0, %xmm0
; ALL-NEXT: vmovd %xmm0, %eax
; ALL-NEXT: movw %ax, -{{[0-9]+}}(%rsp)
-; ALL-NEXT: vpmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
+; ALL-NEXT: vmovaps -{{[0-9]+}}(%rsp), %xmm0
; ALL-NEXT: retq
%1 = fptrunc <4 x float> %a0 to <4 x half>
%2 = bitcast <4 x half> %1 to <4 x i16>
; ALL-NEXT: vcvtps2ph $4, %xmm0, %xmm0
; ALL-NEXT: vmovd %xmm0, %eax
; ALL-NEXT: movw %ax, -{{[0-9]+}}(%rsp)
-; ALL-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
+; ALL-NEXT: vmovaps -{{[0-9]+}}(%rsp), %xmm0
; ALL-NEXT: retq
%1 = fptrunc <4 x float> %a0 to <4 x half>
%2 = bitcast <4 x half> %1 to <4 x i16>
; ALL-NEXT: vcvtps2ph $4, %xmm0, %xmm0
; ALL-NEXT: vmovd %xmm0, %eax
; ALL-NEXT: movw %ax, -{{[0-9]+}}(%rsp)
-; ALL-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
+; ALL-NEXT: vmovaps -{{[0-9]+}}(%rsp), %xmm0
; ALL-NEXT: vmovaps %xmm0, (%rdi)
; ALL-NEXT: retq
%1 = fptrunc <4 x float> %a0 to <4 x half>
; ALL-NEXT: subq $40, %rsp
; ALL-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; ALL-NEXT: callq __truncdfhf2
-; ALL-NEXT: movw %ax, {{[0-9]+}}(%rsp)
+; ALL-NEXT: movw %ax, (%rsp)
; ALL-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; ALL-NEXT: # xmm0 = mem[1,0]
; ALL-NEXT: callq __truncdfhf2
; ALL-NEXT: movw %ax, {{[0-9]+}}(%rsp)
-; ALL-NEXT: vpmovzxwq {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero
+; ALL-NEXT: vmovaps (%rsp), %xmm0
; ALL-NEXT: addq $40, %rsp
; ALL-NEXT: retq
%1 = fptrunc <2 x double> %a0 to <2 x half>
; ALL-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; ALL-NEXT: vzeroupper
; ALL-NEXT: callq __truncdfhf2
-; ALL-NEXT: movw %ax, {{[0-9]+}}(%rsp)
+; ALL-NEXT: movw %ax, (%rsp)
; ALL-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; ALL-NEXT: # xmm0 = mem[1,0]
; ALL-NEXT: callq __truncdfhf2
; ALL-NEXT: # xmm0 = mem[1,0]
; ALL-NEXT: callq __truncdfhf2
; ALL-NEXT: movw %ax, {{[0-9]+}}(%rsp)
-; ALL-NEXT: vpmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
+; ALL-NEXT: vmovaps (%rsp), %xmm0
; ALL-NEXT: addq $88, %rsp
; ALL-NEXT: retq
%1 = fptrunc <4 x double> %a0 to <4 x half>
; ALL-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; ALL-NEXT: vzeroupper
; ALL-NEXT: callq __truncdfhf2
-; ALL-NEXT: movw %ax, {{[0-9]+}}(%rsp)
+; ALL-NEXT: movw %ax, (%rsp)
; ALL-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; ALL-NEXT: # xmm0 = mem[1,0]
; ALL-NEXT: callq __truncdfhf2
; ALL-NEXT: # xmm0 = mem[1,0]
; ALL-NEXT: callq __truncdfhf2
; ALL-NEXT: movw %ax, {{[0-9]+}}(%rsp)
-; ALL-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
+; ALL-NEXT: vmovaps (%rsp), %xmm0
; ALL-NEXT: addq $88, %rsp
; ALL-NEXT: retq
%1 = fptrunc <4 x double> %a0 to <4 x half>
; ALL-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; ALL-NEXT: vzeroupper
; ALL-NEXT: callq __truncdfhf2
-; ALL-NEXT: movw %ax, {{[0-9]+}}(%rsp)
+; ALL-NEXT: movw %ax, (%rsp)
; ALL-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; ALL-NEXT: # xmm0 = mem[1,0]
; ALL-NEXT: callq __truncdfhf2
; ALL-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; ALL-NEXT: vzeroupper
; ALL-NEXT: callq __truncdfhf2
-; ALL-NEXT: movw %ax, {{[0-9]+}}(%rsp)
+; ALL-NEXT: movw %ax, (%rsp)
; ALL-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; ALL-NEXT: # xmm0 = mem[1,0]
; ALL-NEXT: callq __truncdfhf2
; ALL-NEXT: # xmm0 = mem[1,0]
; ALL-NEXT: callq __truncdfhf2
; ALL-NEXT: movw %ax, {{[0-9]+}}(%rsp)
-; ALL-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
+; ALL-NEXT: vmovaps (%rsp), %xmm0
; ALL-NEXT: vmovaps %xmm0, (%rbx)
; ALL-NEXT: addq $80, %rsp
; ALL-NEXT: popq %rbx
; ALL-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; ALL-NEXT: vzeroupper
; ALL-NEXT: callq __truncdfhf2
-; ALL-NEXT: movw %ax, {{[0-9]+}}(%rsp)
+; ALL-NEXT: movw %ax, (%rsp)
; ALL-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; ALL-NEXT: # xmm0 = mem[1,0]
; ALL-NEXT: callq __truncdfhf2
; X64-LABEL: test_udiv7_v2i32:
; X64: # %bb.0:
; X64-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
-; X64-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,3,0,1]
-; X64-NEXT: movd %xmm1, %eax
-; X64-NEXT: imulq $613566757, %rax, %rcx # imm = 0x24924925
-; X64-NEXT: shrq $32, %rcx
-; X64-NEXT: subl %ecx, %eax
-; X64-NEXT: shrl %eax
-; X64-NEXT: addl %ecx, %eax
-; X64-NEXT: shrl $2, %eax
-; X64-NEXT: movd %xmm0, %ecx
-; X64-NEXT: imulq $613566757, %rcx, %rdx # imm = 0x24924925
-; X64-NEXT: shrq $32, %rdx
-; X64-NEXT: subl %edx, %ecx
-; X64-NEXT: shrl %ecx
-; X64-NEXT: addl %edx, %ecx
-; X64-NEXT: shrl $2, %ecx
-; X64-NEXT: movd %ecx, %xmm0
-; X64-NEXT: movd %eax, %xmm1
-; X64-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; X64-NEXT: movdqa {{.*#+}} xmm1 = [613566757,613566757,613566757,613566757]
+; X64-NEXT: movdqa %xmm0, %xmm2
+; X64-NEXT: pmuludq %xmm1, %xmm2
+; X64-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3]
+; X64-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
+; X64-NEXT: pmuludq %xmm1, %xmm3
+; X64-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,3,2,3]
+; X64-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
+; X64-NEXT: psubd %xmm2, %xmm0
+; X64-NEXT: psrld $1, %xmm0
+; X64-NEXT: paddd %xmm2, %xmm0
+; X64-NEXT: psrld $2, %xmm0
; X64-NEXT: movq %xmm0, (%rsi)
; X64-NEXT: retq
;
; X86-LABEL: test_udiv7_v2i32:
; X86: # %bb.0:
-; X86-NEXT: pushl %ebx
-; X86-NEXT: pushl %edi
-; X86-NEXT: pushl %esi
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
-; X86-NEXT: movd %xmm0, %ecx
-; X86-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,0,1]
-; X86-NEXT: movd %xmm0, %esi
-; X86-NEXT: movl $613566757, %ebx # imm = 0x24924925
-; X86-NEXT: movl %esi, %eax
-; X86-NEXT: mull %ebx
-; X86-NEXT: subl %edx, %esi
-; X86-NEXT: shrl %esi
-; X86-NEXT: addl %edx, %esi
-; X86-NEXT: shrl $2, %esi
-; X86-NEXT: movl %ecx, %eax
-; X86-NEXT: mull %ebx
-; X86-NEXT: subl %edx, %ecx
-; X86-NEXT: shrl %ecx
-; X86-NEXT: addl %edx, %ecx
-; X86-NEXT: shrl $2, %ecx
-; X86-NEXT: movd %ecx, %xmm0
-; X86-NEXT: movd %esi, %xmm1
-; X86-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; X86-NEXT: movq %xmm0, (%edi)
-; X86-NEXT: popl %esi
-; X86-NEXT: popl %edi
-; X86-NEXT: popl %ebx
+; X86-NEXT: movdqa {{.*#+}} xmm1 = [613566757,613566757,613566757,613566757]
+; X86-NEXT: movdqa %xmm0, %xmm2
+; X86-NEXT: pmuludq %xmm1, %xmm2
+; X86-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3]
+; X86-NEXT: movdqa %xmm0, %xmm3
+; X86-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1],xmm0[2,3]
+; X86-NEXT: pmuludq %xmm1, %xmm3
+; X86-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,3,2,3]
+; X86-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
+; X86-NEXT: psubd %xmm2, %xmm0
+; X86-NEXT: psrld $1, %xmm0
+; X86-NEXT: paddd %xmm2, %xmm0
+; X86-NEXT: psrld $2, %xmm0
+; X86-NEXT: movq %xmm0, (%eax)
; X86-NEXT: retl
;
; X64_WIDEN-LABEL: test_udiv7_v2i32:
; X64-LABEL: test_urem7_v2i32:
; X64: # %bb.0:
; X64-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
-; X64-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,3,0,1]
-; X64-NEXT: movd %xmm1, %ecx
-; X64-NEXT: imulq $613566757, %rcx, %rdx # imm = 0x24924925
-; X64-NEXT: shrq $32, %rdx
-; X64-NEXT: movl %ecx, %eax
-; X64-NEXT: subl %edx, %eax
-; X64-NEXT: shrl %eax
-; X64-NEXT: addl %edx, %eax
-; X64-NEXT: shrl $2, %eax
-; X64-NEXT: leal (,%rax,8), %edx
-; X64-NEXT: subl %edx, %eax
-; X64-NEXT: addl %ecx, %eax
-; X64-NEXT: movd %xmm0, %ecx
-; X64-NEXT: imulq $613566757, %rcx, %rdx # imm = 0x24924925
-; X64-NEXT: shrq $32, %rdx
-; X64-NEXT: movl %ecx, %edi
-; X64-NEXT: subl %edx, %edi
-; X64-NEXT: shrl %edi
-; X64-NEXT: addl %edx, %edi
-; X64-NEXT: shrl $2, %edi
-; X64-NEXT: leal (,%rdi,8), %edx
-; X64-NEXT: subl %edx, %edi
-; X64-NEXT: addl %ecx, %edi
-; X64-NEXT: movd %edi, %xmm0
-; X64-NEXT: movd %eax, %xmm1
-; X64-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; X64-NEXT: movq %xmm0, (%rsi)
+; X64-NEXT: movdqa {{.*#+}} xmm1 = [613566757,613566757,613566757,613566757]
+; X64-NEXT: movdqa %xmm0, %xmm2
+; X64-NEXT: pmuludq %xmm1, %xmm2
+; X64-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3]
+; X64-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
+; X64-NEXT: pmuludq %xmm1, %xmm3
+; X64-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,3,2,3]
+; X64-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
+; X64-NEXT: movdqa %xmm0, %xmm1
+; X64-NEXT: psubd %xmm2, %xmm1
+; X64-NEXT: psrld $1, %xmm1
+; X64-NEXT: paddd %xmm2, %xmm1
+; X64-NEXT: psrld $2, %xmm1
+; X64-NEXT: movdqa %xmm1, %xmm2
+; X64-NEXT: pslld $3, %xmm2
+; X64-NEXT: psubd %xmm2, %xmm1
+; X64-NEXT: paddd %xmm0, %xmm1
+; X64-NEXT: movq %xmm1, (%rsi)
; X64-NEXT: retq
;
; X86-LABEL: test_urem7_v2i32:
; X86: # %bb.0:
-; X86-NEXT: pushl %ebp
-; X86-NEXT: pushl %ebx
-; X86-NEXT: pushl %edi
-; X86-NEXT: pushl %esi
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
-; X86-NEXT: movd %xmm0, %ecx
-; X86-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,0,1]
-; X86-NEXT: movd %xmm0, %esi
-; X86-NEXT: movl $613566757, %edi # imm = 0x24924925
-; X86-NEXT: movl %esi, %eax
-; X86-NEXT: mull %edi
-; X86-NEXT: movl %esi, %ebx
-; X86-NEXT: subl %edx, %ebx
-; X86-NEXT: shrl %ebx
-; X86-NEXT: addl %edx, %ebx
-; X86-NEXT: shrl $2, %ebx
-; X86-NEXT: leal (,%ebx,8), %eax
-; X86-NEXT: subl %eax, %ebx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp
-; X86-NEXT: addl %esi, %ebx
-; X86-NEXT: movl %ecx, %eax
-; X86-NEXT: mull %edi
-; X86-NEXT: movl %ecx, %eax
-; X86-NEXT: subl %edx, %eax
-; X86-NEXT: shrl %eax
-; X86-NEXT: addl %edx, %eax
-; X86-NEXT: shrl $2, %eax
-; X86-NEXT: leal (,%eax,8), %edx
-; X86-NEXT: subl %edx, %eax
-; X86-NEXT: addl %ecx, %eax
-; X86-NEXT: movd %eax, %xmm0
-; X86-NEXT: movd %ebx, %xmm1
-; X86-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; X86-NEXT: movq %xmm0, (%ebp)
-; X86-NEXT: popl %esi
-; X86-NEXT: popl %edi
-; X86-NEXT: popl %ebx
-; X86-NEXT: popl %ebp
+; X86-NEXT: movdqa {{.*#+}} xmm1 = [613566757,613566757,613566757,613566757]
+; X86-NEXT: movdqa %xmm0, %xmm2
+; X86-NEXT: pmuludq %xmm1, %xmm2
+; X86-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3]
+; X86-NEXT: movdqa %xmm0, %xmm3
+; X86-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1],xmm0[2,3]
+; X86-NEXT: pmuludq %xmm1, %xmm3
+; X86-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,3,2,3]
+; X86-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
+; X86-NEXT: movdqa %xmm0, %xmm1
+; X86-NEXT: psubd %xmm2, %xmm1
+; X86-NEXT: psrld $1, %xmm1
+; X86-NEXT: paddd %xmm2, %xmm1
+; X86-NEXT: psrld $2, %xmm1
+; X86-NEXT: movdqa %xmm1, %xmm2
+; X86-NEXT: pslld $3, %xmm2
+; X86-NEXT: psubd %xmm2, %xmm1
+; X86-NEXT: paddd %xmm0, %xmm1
+; X86-NEXT: movq %xmm1, (%eax)
; X86-NEXT: retl
;
; X64_WIDEN-LABEL: test_urem7_v2i32:
; X64-LABEL: test_sdiv7_v2i32:
; X64: # %bb.0:
; X64-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
-; X64-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,3,0,1]
-; X64-NEXT: movd %xmm1, %eax
-; X64-NEXT: cltq
-; X64-NEXT: imulq $-1840700269, %rax, %rcx # imm = 0x92492493
-; X64-NEXT: shrq $32, %rcx
-; X64-NEXT: addl %ecx, %eax
-; X64-NEXT: movl %eax, %ecx
-; X64-NEXT: shrl $31, %ecx
-; X64-NEXT: sarl $2, %eax
-; X64-NEXT: addl %ecx, %eax
-; X64-NEXT: movd %xmm0, %ecx
-; X64-NEXT: movslq %ecx, %rcx
-; X64-NEXT: imulq $-1840700269, %rcx, %rdx # imm = 0x92492493
-; X64-NEXT: shrq $32, %rdx
-; X64-NEXT: addl %edx, %ecx
-; X64-NEXT: movl %ecx, %edx
-; X64-NEXT: shrl $31, %edx
-; X64-NEXT: sarl $2, %ecx
-; X64-NEXT: addl %edx, %ecx
-; X64-NEXT: movd %ecx, %xmm0
-; X64-NEXT: movd %eax, %xmm1
-; X64-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; X64-NEXT: movq %xmm0, (%rsi)
+; X64-NEXT: movdqa {{.*#+}} xmm1 = [2454267027,2454267027,2454267027,2454267027]
+; X64-NEXT: movdqa %xmm0, %xmm2
+; X64-NEXT: pmuludq %xmm1, %xmm2
+; X64-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3]
+; X64-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
+; X64-NEXT: pmuludq %xmm1, %xmm3
+; X64-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,3,2,3]
+; X64-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
+; X64-NEXT: pxor %xmm3, %xmm3
+; X64-NEXT: pcmpgtd %xmm0, %xmm3
+; X64-NEXT: pand %xmm1, %xmm3
+; X64-NEXT: paddd %xmm0, %xmm3
+; X64-NEXT: psubd %xmm3, %xmm2
+; X64-NEXT: paddd %xmm0, %xmm2
+; X64-NEXT: movdqa %xmm2, %xmm0
+; X64-NEXT: psrld $31, %xmm0
+; X64-NEXT: psrad $2, %xmm2
+; X64-NEXT: paddd %xmm0, %xmm2
+; X64-NEXT: movq %xmm2, (%rsi)
; X64-NEXT: retq
;
; X86-LABEL: test_sdiv7_v2i32:
; X86: # %bb.0:
-; X86-NEXT: pushl %ebp
-; X86-NEXT: pushl %ebx
-; X86-NEXT: pushl %edi
-; X86-NEXT: pushl %esi
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
-; X86-NEXT: movd %xmm0, %ecx
-; X86-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,0,1]
-; X86-NEXT: movd %xmm0, %esi
-; X86-NEXT: movl $-1840700269, %ebp # imm = 0x92492493
-; X86-NEXT: movl %esi, %eax
-; X86-NEXT: imull %ebp
-; X86-NEXT: movl %edx, %edi
-; X86-NEXT: addl %esi, %edi
-; X86-NEXT: movl %edi, %eax
-; X86-NEXT: shrl $31, %eax
-; X86-NEXT: sarl $2, %edi
-; X86-NEXT: addl %eax, %edi
-; X86-NEXT: movl %ecx, %eax
-; X86-NEXT: imull %ebp
-; X86-NEXT: addl %ecx, %edx
-; X86-NEXT: movl %edx, %eax
-; X86-NEXT: shrl $31, %eax
-; X86-NEXT: sarl $2, %edx
-; X86-NEXT: addl %eax, %edx
-; X86-NEXT: movd %edx, %xmm0
-; X86-NEXT: movd %edi, %xmm1
-; X86-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; X86-NEXT: movq %xmm0, (%ebx)
-; X86-NEXT: popl %esi
-; X86-NEXT: popl %edi
-; X86-NEXT: popl %ebx
-; X86-NEXT: popl %ebp
+; X86-NEXT: movdqa {{.*#+}} xmm1 = [2454267027,2454267027,2454267027,2454267027]
+; X86-NEXT: movdqa %xmm0, %xmm2
+; X86-NEXT: pmuludq %xmm1, %xmm2
+; X86-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3]
+; X86-NEXT: movdqa %xmm0, %xmm3
+; X86-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1],xmm0[2,3]
+; X86-NEXT: pmuludq %xmm1, %xmm3
+; X86-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,3,2,3]
+; X86-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
+; X86-NEXT: pxor %xmm3, %xmm3
+; X86-NEXT: pcmpgtd %xmm0, %xmm3
+; X86-NEXT: pand %xmm1, %xmm3
+; X86-NEXT: paddd %xmm0, %xmm3
+; X86-NEXT: psubd %xmm3, %xmm2
+; X86-NEXT: paddd %xmm0, %xmm2
+; X86-NEXT: movdqa %xmm2, %xmm0
+; X86-NEXT: psrld $31, %xmm0
+; X86-NEXT: psrad $2, %xmm2
+; X86-NEXT: paddd %xmm0, %xmm2
+; X86-NEXT: movq %xmm2, (%eax)
; X86-NEXT: retl
;
; X64_WIDEN-LABEL: test_sdiv7_v2i32:
; X64-LABEL: test_srem7_v2i32:
; X64: # %bb.0:
; X64-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
-; X64-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,3,0,1]
-; X64-NEXT: movd %xmm1, %eax
-; X64-NEXT: movslq %eax, %rcx
-; X64-NEXT: imulq $-1840700269, %rcx, %rax # imm = 0x92492493
-; X64-NEXT: shrq $32, %rax
-; X64-NEXT: addl %ecx, %eax
-; X64-NEXT: movl %eax, %edx
-; X64-NEXT: shrl $31, %edx
-; X64-NEXT: sarl $2, %eax
-; X64-NEXT: addl %edx, %eax
-; X64-NEXT: leal (,%rax,8), %edx
-; X64-NEXT: subl %edx, %eax
-; X64-NEXT: addl %ecx, %eax
-; X64-NEXT: movd %xmm0, %ecx
-; X64-NEXT: movslq %ecx, %rcx
-; X64-NEXT: imulq $-1840700269, %rcx, %rdx # imm = 0x92492493
-; X64-NEXT: shrq $32, %rdx
-; X64-NEXT: addl %ecx, %edx
-; X64-NEXT: movl %edx, %edi
-; X64-NEXT: shrl $31, %edi
-; X64-NEXT: sarl $2, %edx
-; X64-NEXT: addl %edi, %edx
-; X64-NEXT: leal (,%rdx,8), %edi
-; X64-NEXT: subl %edi, %edx
-; X64-NEXT: addl %ecx, %edx
-; X64-NEXT: movd %edx, %xmm0
-; X64-NEXT: movd %eax, %xmm1
-; X64-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; X64-NEXT: movq %xmm0, (%rsi)
+; X64-NEXT: movdqa {{.*#+}} xmm1 = [2454267027,2454267027,2454267027,2454267027]
+; X64-NEXT: movdqa %xmm0, %xmm2
+; X64-NEXT: pmuludq %xmm1, %xmm2
+; X64-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3]
+; X64-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
+; X64-NEXT: pmuludq %xmm1, %xmm3
+; X64-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,3,2,3]
+; X64-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
+; X64-NEXT: pxor %xmm3, %xmm3
+; X64-NEXT: pcmpgtd %xmm0, %xmm3
+; X64-NEXT: pand %xmm1, %xmm3
+; X64-NEXT: paddd %xmm0, %xmm3
+; X64-NEXT: psubd %xmm3, %xmm2
+; X64-NEXT: paddd %xmm0, %xmm2
+; X64-NEXT: movdqa %xmm2, %xmm1
+; X64-NEXT: psrld $31, %xmm1
+; X64-NEXT: psrad $2, %xmm2
+; X64-NEXT: paddd %xmm1, %xmm2
+; X64-NEXT: movdqa %xmm2, %xmm1
+; X64-NEXT: pslld $3, %xmm1
+; X64-NEXT: psubd %xmm1, %xmm2
+; X64-NEXT: paddd %xmm0, %xmm2
+; X64-NEXT: movq %xmm2, (%rsi)
; X64-NEXT: retq
;
; X86-LABEL: test_srem7_v2i32:
; X86: # %bb.0:
-; X86-NEXT: pushl %ebp
-; X86-NEXT: pushl %ebx
-; X86-NEXT: pushl %edi
-; X86-NEXT: pushl %esi
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
-; X86-NEXT: movd %xmm0, %ecx
-; X86-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,0,1]
-; X86-NEXT: movd %xmm0, %esi
-; X86-NEXT: movl $-1840700269, %ebx # imm = 0x92492493
-; X86-NEXT: movl %esi, %eax
-; X86-NEXT: imull %ebx
-; X86-NEXT: movl %edx, %edi
-; X86-NEXT: addl %esi, %edi
-; X86-NEXT: movl %edi, %eax
-; X86-NEXT: shrl $31, %eax
-; X86-NEXT: sarl $2, %edi
-; X86-NEXT: addl %eax, %edi
-; X86-NEXT: leal (,%edi,8), %eax
-; X86-NEXT: subl %eax, %edi
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp
-; X86-NEXT: addl %esi, %edi
-; X86-NEXT: movl %ecx, %eax
-; X86-NEXT: imull %ebx
-; X86-NEXT: addl %ecx, %edx
-; X86-NEXT: movl %edx, %eax
-; X86-NEXT: shrl $31, %eax
-; X86-NEXT: sarl $2, %edx
-; X86-NEXT: addl %eax, %edx
-; X86-NEXT: leal (,%edx,8), %eax
-; X86-NEXT: subl %eax, %edx
-; X86-NEXT: addl %ecx, %edx
-; X86-NEXT: movd %edx, %xmm0
-; X86-NEXT: movd %edi, %xmm1
-; X86-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; X86-NEXT: movq %xmm0, (%ebp)
-; X86-NEXT: popl %esi
-; X86-NEXT: popl %edi
-; X86-NEXT: popl %ebx
-; X86-NEXT: popl %ebp
+; X86-NEXT: movdqa {{.*#+}} xmm1 = [2454267027,2454267027,2454267027,2454267027]
+; X86-NEXT: movdqa %xmm0, %xmm2
+; X86-NEXT: pmuludq %xmm1, %xmm2
+; X86-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3]
+; X86-NEXT: movdqa %xmm0, %xmm3
+; X86-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1],xmm0[2,3]
+; X86-NEXT: pmuludq %xmm1, %xmm3
+; X86-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,3,2,3]
+; X86-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
+; X86-NEXT: pxor %xmm3, %xmm3
+; X86-NEXT: pcmpgtd %xmm0, %xmm3
+; X86-NEXT: pand %xmm1, %xmm3
+; X86-NEXT: paddd %xmm0, %xmm3
+; X86-NEXT: psubd %xmm3, %xmm2
+; X86-NEXT: paddd %xmm0, %xmm2
+; X86-NEXT: movdqa %xmm2, %xmm1
+; X86-NEXT: psrld $31, %xmm1
+; X86-NEXT: psrad $2, %xmm2
+; X86-NEXT: paddd %xmm1, %xmm2
+; X86-NEXT: movdqa %xmm2, %xmm1
+; X86-NEXT: pslld $3, %xmm1
+; X86-NEXT: psubd %xmm1, %xmm2
+; X86-NEXT: paddd %xmm0, %xmm2
+; X86-NEXT: movq %xmm2, (%eax)
; X86-NEXT: retl
;
; X64_WIDEN-LABEL: test_srem7_v2i32:
; X64-LABEL: test_udiv_pow2_v2i32:
; X64: # %bb.0:
; X64-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
-; X64-NEXT: pxor %xmm1, %xmm1
-; X64-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; X64-NEXT: psrlq $3, %xmm0
-; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; X64-NEXT: psrld $3, %xmm0
; X64-NEXT: movq %xmm0, (%rsi)
; X64-NEXT: retq
;
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
-; X86-NEXT: pxor %xmm1, %xmm1
-; X86-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; X86-NEXT: psrlq $3, %xmm0
-; X86-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; X86-NEXT: psrld $3, %xmm0
; X86-NEXT: movq %xmm0, (%eax)
; X86-NEXT: retl
;
define void @test_urem_pow2_v2i32(<2 x i32>* %x, <2 x i32>* %y) nounwind {
; X64-LABEL: test_urem_pow2_v2i32:
; X64: # %bb.0:
-; X64-NEXT: movl (%rdi), %eax
-; X64-NEXT: movl 4(%rdi), %ecx
-; X64-NEXT: movq %rcx, %xmm0
-; X64-NEXT: movq %rax, %xmm1
-; X64-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
-; X64-NEXT: pand {{.*}}(%rip), %xmm1
-; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3]
-; X64-NEXT: movq %xmm0, (%rsi)
+; X64-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
+; X64-NEXT: andps {{.*}}(%rip), %xmm0
+; X64-NEXT: movlps %xmm0, (%rsi)
; X64-NEXT: retq
;
; X86-LABEL: test_urem_pow2_v2i32:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
-; X86-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1,1,3]
; X86-NEXT: andps {{\.LCPI.*}}, %xmm0
-; X86-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; X86-NEXT: movq %xmm0, (%eax)
+; X86-NEXT: movlps %xmm0, (%eax)
; X86-NEXT: retl
;
; X64_WIDEN-LABEL: test_urem_pow2_v2i32:
; X64-LABEL: test_sdiv_pow2_v2i32:
; X64: # %bb.0:
; X64-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
-; X64-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,2,1]
+; X64-NEXT: movdqa %xmm0, %xmm1
; X64-NEXT: psrad $31, %xmm1
-; X64-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3]
-; X64-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,1,1,3]
-; X64-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; X64-NEXT: psrlq $31, %xmm0
-; X64-NEXT: pand {{.*}}(%rip), %xmm0
-; X64-NEXT: psrlq $29, %xmm0
-; X64-NEXT: paddq %xmm2, %xmm0
-; X64-NEXT: psllq $32, %xmm0
-; X64-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,3,2,3]
-; X64-NEXT: psrad $31, %xmm0
-; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
-; X64-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; X64-NEXT: psrlq $3, %xmm1
-; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3]
-; X64-NEXT: movq %xmm0, (%rsi)
+; X64-NEXT: psrld $29, %xmm1
+; X64-NEXT: paddd %xmm0, %xmm1
+; X64-NEXT: psrad $3, %xmm1
+; X64-NEXT: movq %xmm1, (%rsi)
; X64-NEXT: retq
;
; X86-LABEL: test_sdiv_pow2_v2i32:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
-; X86-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; X86-NEXT: movdqa %xmm0, %xmm1
; X86-NEXT: psrad $31, %xmm1
-; X86-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3]
-; X86-NEXT: movdqa %xmm0, %xmm2
-; X86-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
-; X86-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1,1,3]
-; X86-NEXT: psrlq $31, %xmm2
-; X86-NEXT: movsd {{.*#+}} xmm2 = xmm2[0,1]
-; X86-NEXT: movapd {{.*#+}} xmm1 = [2.1219957909652723E-314,2.1219957909652723E-314]
-; X86-NEXT: xorpd %xmm1, %xmm2
-; X86-NEXT: psubq %xmm1, %xmm2
-; X86-NEXT: pand {{\.LCPI.*}}, %xmm2
-; X86-NEXT: psrlq $29, %xmm2
-; X86-NEXT: paddq %xmm0, %xmm2
-; X86-NEXT: psllq $32, %xmm2
-; X86-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,3,2,3]
-; X86-NEXT: psrad $31, %xmm2
-; X86-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,3,2,3]
-; X86-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; X86-NEXT: psrlq $3, %xmm0
-; X86-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; X86-NEXT: movq %xmm0, (%eax)
+; X86-NEXT: psrld $29, %xmm1
+; X86-NEXT: paddd %xmm0, %xmm1
+; X86-NEXT: psrad $3, %xmm1
+; X86-NEXT: movq %xmm1, (%eax)
; X86-NEXT: retl
;
; X64_WIDEN-LABEL: test_sdiv_pow2_v2i32:
; X64-LABEL: test_srem_pow2_v2i32:
; X64: # %bb.0:
; X64-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
-; X64-NEXT: pxor %xmm1, %xmm1
-; X64-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; X64-NEXT: psrlq $3, %xmm0
-; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; X64-NEXT: psrld $3, %xmm0
; X64-NEXT: movq %xmm0, (%rsi)
; X64-NEXT: retq
;
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
-; X86-NEXT: pxor %xmm1, %xmm1
-; X86-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; X86-NEXT: psrlq $3, %xmm0
-; X86-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; X86-NEXT: psrld $3, %xmm0
; X86-NEXT: movq %xmm0, (%eax)
; X86-NEXT: retl
;
; X64-NEXT: movq %rdx, %rcx
; X64-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
; X64-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
-; X64-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,3,0,1]
-; X64-NEXT: movd %xmm2, %eax
-; X64-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,3,0,1]
-; X64-NEXT: movd %xmm2, %esi
+; X64-NEXT: movd %xmm0, %eax
+; X64-NEXT: movd %xmm1, %esi
; X64-NEXT: xorl %edx, %edx
; X64-NEXT: divl %esi
-; X64-NEXT: movl %eax, %esi
+; X64-NEXT: movd %eax, %xmm2
+; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
; X64-NEXT: movd %xmm0, %eax
-; X64-NEXT: movd %xmm1, %edi
+; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X64-NEXT: movd %xmm0, %esi
; X64-NEXT: xorl %edx, %edx
-; X64-NEXT: divl %edi
+; X64-NEXT: divl %esi
; X64-NEXT: movd %eax, %xmm0
-; X64-NEXT: movd %esi, %xmm1
-; X64-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; X64-NEXT: movq %xmm0, (%rcx)
+; X64-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
+; X64-NEXT: movq %xmm2, (%rcx)
; X64-NEXT: retq
;
; X86-LABEL: test_udiv_v2i32:
; X86: # %bb.0:
-; X86-NEXT: pushl %ebx
-; X86-NEXT: pushl %edi
; X86-NEXT: pushl %esi
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
; X86-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
-; X86-NEXT: movd %xmm0, %ecx
-; X86-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,0,1]
; X86-NEXT: movd %xmm0, %eax
-; X86-NEXT: movd %xmm1, %ebx
-; X86-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1,0,1]
; X86-NEXT: movd %xmm1, %esi
; X86-NEXT: xorl %edx, %edx
; X86-NEXT: divl %esi
-; X86-NEXT: movl %eax, %esi
-; X86-NEXT: movl %ecx, %eax
+; X86-NEXT: movd %eax, %xmm2
+; X86-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,2,3]
+; X86-NEXT: movd %xmm0, %eax
+; X86-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[2,3]
+; X86-NEXT: movd %xmm1, %esi
; X86-NEXT: xorl %edx, %edx
-; X86-NEXT: divl %ebx
+; X86-NEXT: divl %esi
; X86-NEXT: movd %eax, %xmm0
-; X86-NEXT: movd %esi, %xmm1
-; X86-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; X86-NEXT: movq %xmm0, (%edi)
+; X86-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
+; X86-NEXT: movq %xmm2, (%ecx)
; X86-NEXT: popl %esi
-; X86-NEXT: popl %edi
-; X86-NEXT: popl %ebx
; X86-NEXT: retl
;
; X64_WIDEN-LABEL: test_udiv_v2i32:
; X64-NEXT: movq %rdx, %rcx
; X64-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
; X64-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
-; X64-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,3,0,1]
-; X64-NEXT: movd %xmm2, %eax
-; X64-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,3,0,1]
-; X64-NEXT: movd %xmm2, %esi
+; X64-NEXT: movd %xmm0, %eax
+; X64-NEXT: movd %xmm1, %esi
; X64-NEXT: xorl %edx, %edx
; X64-NEXT: divl %esi
-; X64-NEXT: movl %edx, %esi
+; X64-NEXT: movd %edx, %xmm2
+; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
; X64-NEXT: movd %xmm0, %eax
-; X64-NEXT: movd %xmm1, %edi
+; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X64-NEXT: movd %xmm0, %esi
; X64-NEXT: xorl %edx, %edx
-; X64-NEXT: divl %edi
+; X64-NEXT: divl %esi
; X64-NEXT: movd %edx, %xmm0
-; X64-NEXT: movd %esi, %xmm1
-; X64-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; X64-NEXT: movq %xmm0, (%rcx)
+; X64-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
+; X64-NEXT: movq %xmm2, (%rcx)
; X64-NEXT: retq
;
; X86-LABEL: test_urem_v2i32:
; X86: # %bb.0:
-; X86-NEXT: pushl %ebx
-; X86-NEXT: pushl %edi
; X86-NEXT: pushl %esi
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
; X86-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
-; X86-NEXT: movd %xmm0, %ecx
-; X86-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,0,1]
; X86-NEXT: movd %xmm0, %eax
-; X86-NEXT: movd %xmm1, %ebx
-; X86-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1,0,1]
; X86-NEXT: movd %xmm1, %esi
; X86-NEXT: xorl %edx, %edx
; X86-NEXT: divl %esi
-; X86-NEXT: movl %edx, %esi
-; X86-NEXT: movl %ecx, %eax
+; X86-NEXT: movd %edx, %xmm2
+; X86-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,2,3]
+; X86-NEXT: movd %xmm0, %eax
+; X86-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[2,3]
+; X86-NEXT: movd %xmm1, %esi
; X86-NEXT: xorl %edx, %edx
-; X86-NEXT: divl %ebx
+; X86-NEXT: divl %esi
; X86-NEXT: movd %edx, %xmm0
-; X86-NEXT: movd %esi, %xmm1
-; X86-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; X86-NEXT: movq %xmm0, (%edi)
+; X86-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
+; X86-NEXT: movq %xmm2, (%ecx)
; X86-NEXT: popl %esi
-; X86-NEXT: popl %edi
-; X86-NEXT: popl %ebx
; X86-NEXT: retl
;
; X64_WIDEN-LABEL: test_urem_v2i32:
; X64-NEXT: movq %rdx, %rcx
; X64-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
; X64-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
-; X64-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,3,0,1]
-; X64-NEXT: movd %xmm2, %eax
-; X64-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,3,0,1]
-; X64-NEXT: movd %xmm2, %esi
+; X64-NEXT: movd %xmm0, %eax
+; X64-NEXT: movd %xmm1, %esi
; X64-NEXT: cltd
; X64-NEXT: idivl %esi
-; X64-NEXT: movl %eax, %esi
+; X64-NEXT: movd %eax, %xmm2
+; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
; X64-NEXT: movd %xmm0, %eax
-; X64-NEXT: movd %xmm1, %edi
+; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X64-NEXT: movd %xmm0, %esi
; X64-NEXT: cltd
-; X64-NEXT: idivl %edi
+; X64-NEXT: idivl %esi
; X64-NEXT: movd %eax, %xmm0
-; X64-NEXT: movd %esi, %xmm1
-; X64-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; X64-NEXT: movq %xmm0, (%rcx)
+; X64-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
+; X64-NEXT: movq %xmm2, (%rcx)
; X64-NEXT: retq
;
; X86-LABEL: test_sdiv_v2i32:
; X86-NEXT: pushl %ebx
; X86-NEXT: pushl %edi
; X86-NEXT: pushl %esi
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
; X86-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
; X86-NEXT: movd %xmm0, %ecx
-; X86-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,0,1]
+; X86-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,2,3]
; X86-NEXT: movd %xmm0, %eax
+; X86-NEXT: movd %xmm1, %edi
+; X86-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[2,3]
; X86-NEXT: movd %xmm1, %ebx
-; X86-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1,0,1]
-; X86-NEXT: movd %xmm1, %esi
-; X86-NEXT: cltd
-; X86-NEXT: idivl %esi
-; X86-NEXT: movl %eax, %esi
-; X86-NEXT: movl %ecx, %eax
; X86-NEXT: cltd
; X86-NEXT: idivl %ebx
; X86-NEXT: movd %eax, %xmm0
-; X86-NEXT: movd %esi, %xmm1
-; X86-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; X86-NEXT: movq %xmm0, (%edi)
+; X86-NEXT: movl %ecx, %eax
+; X86-NEXT: cltd
+; X86-NEXT: idivl %edi
+; X86-NEXT: movd %eax, %xmm1
+; X86-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; X86-NEXT: movq %xmm1, (%esi)
; X86-NEXT: popl %esi
; X86-NEXT: popl %edi
; X86-NEXT: popl %ebx
; X64-NEXT: movq %rdx, %rcx
; X64-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
; X64-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
-; X64-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,3,0,1]
-; X64-NEXT: movd %xmm2, %eax
-; X64-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,3,0,1]
-; X64-NEXT: movd %xmm2, %esi
+; X64-NEXT: movd %xmm0, %eax
+; X64-NEXT: movd %xmm1, %esi
; X64-NEXT: cltd
; X64-NEXT: idivl %esi
-; X64-NEXT: movl %eax, %esi
+; X64-NEXT: movd %eax, %xmm2
+; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
; X64-NEXT: movd %xmm0, %eax
-; X64-NEXT: movd %xmm1, %edi
+; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X64-NEXT: movd %xmm0, %esi
; X64-NEXT: cltd
-; X64-NEXT: idivl %edi
+; X64-NEXT: idivl %esi
; X64-NEXT: movd %eax, %xmm0
-; X64-NEXT: movd %esi, %xmm1
-; X64-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; X64-NEXT: movq %xmm0, (%rcx)
+; X64-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
+; X64-NEXT: movq %xmm2, (%rcx)
; X64-NEXT: retq
;
; X86-LABEL: test_srem_v2i32:
; X86-NEXT: pushl %ebx
; X86-NEXT: pushl %edi
; X86-NEXT: pushl %esi
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
; X86-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
; X86-NEXT: movd %xmm0, %ecx
-; X86-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,0,1]
+; X86-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,2,3]
; X86-NEXT: movd %xmm0, %eax
+; X86-NEXT: movd %xmm1, %edi
+; X86-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[2,3]
; X86-NEXT: movd %xmm1, %ebx
-; X86-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1,0,1]
-; X86-NEXT: movd %xmm1, %esi
-; X86-NEXT: cltd
-; X86-NEXT: idivl %esi
-; X86-NEXT: movl %eax, %esi
-; X86-NEXT: movl %ecx, %eax
; X86-NEXT: cltd
; X86-NEXT: idivl %ebx
; X86-NEXT: movd %eax, %xmm0
-; X86-NEXT: movd %esi, %xmm1
-; X86-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; X86-NEXT: movq %xmm0, (%edi)
+; X86-NEXT: movl %ecx, %eax
+; X86-NEXT: cltd
+; X86-NEXT: idivl %edi
+; X86-NEXT: movd %eax, %xmm1
+; X86-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; X86-NEXT: movq %xmm1, (%esi)
; X86-NEXT: popl %esi
; X86-NEXT: popl %edi
; X86-NEXT: popl %ebx
; SSE: # %bb.0:
; SSE-NEXT: pxor %xmm2, %xmm2
; SSE-NEXT: psubd %xmm0, %xmm2
+; SSE-NEXT: psrld $16, %xmm2
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
-; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,1],xmm1[2,3]
-; SSE-NEXT: movaps %xmm2, %xmm0
+; SSE-NEXT: movdqa %xmm2, %xmm0
; SSE-NEXT: retq
;
-; AVX1-LABEL: PR39893:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX1-NEXT: vpsubd %xmm0, %xmm2, %xmm0
-; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2],zero,xmm0[3],zero,xmm0[2],zero,xmm0[3],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero
-; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: PR39893:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX2-NEXT: vpsubd %xmm0, %xmm2, %xmm0
-; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2],zero,xmm0[3],zero,xmm0[2],zero,xmm0[3],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero
-; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
-; AVX2-NEXT: retq
-;
-; AVX512-LABEL: PR39893:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX512-NEXT: vpsubd %xmm0, %xmm2, %xmm0
-; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2],zero,xmm0[3],zero,xmm0[2],zero,xmm0[3],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero
-; AVX512-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
-; AVX512-NEXT: retq
+; AVX-LABEL: PR39893:
+; AVX: # %bb.0:
+; AVX-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX-NEXT: vpsubd %xmm0, %xmm2, %xmm0
+; AVX-NEXT: vpsrld $16, %xmm0, %xmm0
+; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,2,3]
+; AVX-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; AVX-NEXT: retq
%sub = sub <2 x i32> <i32 0, i32 undef>, %x
%bc = bitcast <2 x i32> %sub to <8 x i8>
%shuffle = shufflevector <8 x i8> %y, <8 x i8> %bc, <2 x i32> <i32 10, i32 4>
; SSE: # %bb.0:
; SSE-NEXT: xorps %xmm1, %xmm1
; SSE-NEXT: subps %xmm0, %xmm1
-; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
-; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,1,1,3]
+; SSE-NEXT: movaps %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: PR39893_2:
; AVX: # %bb.0:
; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
; AVX-NEXT: vsubps %xmm0, %xmm1, %xmm0
-; AVX-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
; AVX-NEXT: retq
%fsub = fsub <2 x float> zeroinitializer, %x
%bc = bitcast <2 x float> %fsub to <8 x i8>
define i32 @test_v2i32(<2 x i32> %a0) {
; SSE-LABEL: test_v2i32:
; SSE: # %bb.0:
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; SSE-NEXT: paddq %xmm0, %xmm1
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; SSE-NEXT: paddd %xmm0, %xmm1
; SSE-NEXT: movd %xmm1, %eax
; SSE-NEXT: retq
;
-; AVX-LABEL: test_v2i32:
-; AVX: # %bb.0:
-; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX-NEXT: vpaddq %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vmovd %xmm0, %eax
-; AVX-NEXT: retq
+; AVX1-SLOW-LABEL: test_v2i32:
+; AVX1-SLOW: # %bb.0:
+; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX1-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0
+; AVX1-SLOW-NEXT: vmovd %xmm0, %eax
+; AVX1-SLOW-NEXT: retq
+;
+; AVX1-FAST-LABEL: test_v2i32:
+; AVX1-FAST: # %bb.0:
+; AVX1-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0
+; AVX1-FAST-NEXT: vmovd %xmm0, %eax
+; AVX1-FAST-NEXT: retq
+;
+; AVX2-LABEL: test_v2i32:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vmovd %xmm0, %eax
+; AVX2-NEXT: retq
;
; AVX512-LABEL: test_v2i32:
; AVX512: # %bb.0:
-; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX512-NEXT: vpaddq %xmm1, %xmm0, %xmm0
+; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vmovd %xmm0, %eax
; AVX512-NEXT: retq
%1 = call i32 @llvm.experimental.vector.reduce.add.v2i32(<2 x i32> %a0)
define i16 @test_v2i16(<2 x i16> %a0) {
; SSE-LABEL: test_v2i16:
; SSE: # %bb.0:
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; SSE-NEXT: paddq %xmm0, %xmm1
+; SSE-NEXT: movdqa %xmm0, %xmm1
+; SSE-NEXT: psrld $16, %xmm1
+; SSE-NEXT: paddw %xmm0, %xmm1
; SSE-NEXT: movd %xmm1, %eax
; SSE-NEXT: # kill: def $ax killed $ax killed $eax
; SSE-NEXT: retq
;
-; AVX-LABEL: test_v2i16:
-; AVX: # %bb.0:
-; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX-NEXT: vpaddq %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vmovd %xmm0, %eax
-; AVX-NEXT: # kill: def $ax killed $ax killed $eax
-; AVX-NEXT: retq
+; AVX1-SLOW-LABEL: test_v2i16:
+; AVX1-SLOW: # %bb.0:
+; AVX1-SLOW-NEXT: vpsrld $16, %xmm0, %xmm1
+; AVX1-SLOW-NEXT: vpaddw %xmm1, %xmm0, %xmm0
+; AVX1-SLOW-NEXT: vmovd %xmm0, %eax
+; AVX1-SLOW-NEXT: # kill: def $ax killed $ax killed $eax
+; AVX1-SLOW-NEXT: retq
+;
+; AVX1-FAST-LABEL: test_v2i16:
+; AVX1-FAST: # %bb.0:
+; AVX1-FAST-NEXT: vphaddw %xmm0, %xmm0, %xmm0
+; AVX1-FAST-NEXT: vmovd %xmm0, %eax
+; AVX1-FAST-NEXT: # kill: def $ax killed $ax killed $eax
+; AVX1-FAST-NEXT: retq
+;
+; AVX2-LABEL: test_v2i16:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1
+; AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vmovd %xmm0, %eax
+; AVX2-NEXT: # kill: def $ax killed $ax killed $eax
+; AVX2-NEXT: retq
;
; AVX512-LABEL: test_v2i16:
; AVX512: # %bb.0:
-; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX512-NEXT: vpaddq %xmm1, %xmm0, %xmm0
+; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1
+; AVX512-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vmovd %xmm0, %eax
; AVX512-NEXT: # kill: def $ax killed $ax killed $eax
; AVX512-NEXT: retq
define i16 @test_v4i16(<4 x i16> %a0) {
; SSE-LABEL: test_v4i16:
; SSE: # %bb.0:
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; SSE-NEXT: paddd %xmm0, %xmm1
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
-; SSE-NEXT: paddd %xmm1, %xmm0
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; SSE-NEXT: paddw %xmm0, %xmm1
+; SSE-NEXT: movdqa %xmm1, %xmm0
+; SSE-NEXT: psrld $16, %xmm0
+; SSE-NEXT: paddw %xmm1, %xmm0
; SSE-NEXT: movd %xmm0, %eax
; SSE-NEXT: # kill: def $ax killed $ax killed $eax
; SSE-NEXT: retq
;
; AVX1-SLOW-LABEL: test_v4i16:
; AVX1-SLOW: # %bb.0:
-; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX1-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX1-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0
+; AVX1-SLOW-NEXT: vpaddw %xmm1, %xmm0, %xmm0
+; AVX1-SLOW-NEXT: vpsrld $16, %xmm0, %xmm1
+; AVX1-SLOW-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT: vmovd %xmm0, %eax
; AVX1-SLOW-NEXT: # kill: def $ax killed $ax killed $eax
; AVX1-SLOW-NEXT: retq
;
; AVX1-FAST-LABEL: test_v4i16:
; AVX1-FAST: # %bb.0:
-; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX1-FAST-NEXT: vpaddd %xmm1, %xmm0, %xmm0
-; AVX1-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0
+; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX1-FAST-NEXT: vpaddw %xmm1, %xmm0, %xmm0
+; AVX1-FAST-NEXT: vphaddw %xmm0, %xmm0, %xmm0
; AVX1-FAST-NEXT: vmovd %xmm0, %eax
; AVX1-FAST-NEXT: # kill: def $ax killed $ax killed $eax
; AVX1-FAST-NEXT: retq
;
; AVX2-LABEL: test_v4i16:
; AVX2: # %bb.0:
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1
+; AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vmovd %xmm0, %eax
; AVX2-NEXT: # kill: def $ax killed $ax killed $eax
; AVX2-NEXT: retq
;
; AVX512-LABEL: test_v4i16:
; AVX512: # %bb.0:
-; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0
+; AVX512-NEXT: vpaddw %xmm1, %xmm0, %xmm0
+; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1
+; AVX512-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vmovd %xmm0, %eax
; AVX512-NEXT: # kill: def $ax killed $ax killed $eax
; AVX512-NEXT: retq
define i8 @test_v2i8(<2 x i8> %a0) {
; SSE2-LABEL: test_v2i8:
; SSE2: # %bb.0:
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; SSE2-NEXT: paddq %xmm0, %xmm1
+; SSE2-NEXT: movdqa %xmm0, %xmm1
+; SSE2-NEXT: psrlw $8, %xmm1
+; SSE2-NEXT: paddb %xmm0, %xmm1
; SSE2-NEXT: movd %xmm1, %eax
; SSE2-NEXT: # kill: def $al killed $al killed $eax
; SSE2-NEXT: retq
;
; SSE41-LABEL: test_v2i8:
; SSE41: # %bb.0:
-; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; SSE41-NEXT: paddq %xmm0, %xmm1
+; SSE41-NEXT: movdqa %xmm0, %xmm1
+; SSE41-NEXT: psrlw $8, %xmm1
+; SSE41-NEXT: paddb %xmm0, %xmm1
; SSE41-NEXT: pextrb $0, %xmm1, %eax
; SSE41-NEXT: # kill: def $al killed $al killed $eax
; SSE41-NEXT: retq
;
; AVX-LABEL: test_v2i8:
; AVX: # %bb.0:
-; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX-NEXT: vpaddq %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vpsrlw $8, %xmm0, %xmm1
+; AVX-NEXT: vpaddb %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpextrb $0, %xmm0, %eax
; AVX-NEXT: # kill: def $al killed $al killed $eax
; AVX-NEXT: retq
;
; AVX512-LABEL: test_v2i8:
; AVX512: # %bb.0:
-; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX512-NEXT: vpaddq %xmm1, %xmm0, %xmm0
+; AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1
+; AVX512-NEXT: vpaddb %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vpextrb $0, %xmm0, %eax
; AVX512-NEXT: # kill: def $al killed $al killed $eax
; AVX512-NEXT: retq
define i8 @test_v4i8(<4 x i8> %a0) {
; SSE2-LABEL: test_v4i8:
; SSE2: # %bb.0:
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; SSE2-NEXT: paddd %xmm0, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
-; SSE2-NEXT: paddd %xmm1, %xmm0
+; SSE2-NEXT: movdqa %xmm0, %xmm1
+; SSE2-NEXT: psrld $16, %xmm1
+; SSE2-NEXT: paddb %xmm0, %xmm1
+; SSE2-NEXT: movdqa %xmm1, %xmm0
+; SSE2-NEXT: psrlw $8, %xmm0
+; SSE2-NEXT: paddb %xmm1, %xmm0
; SSE2-NEXT: movd %xmm0, %eax
; SSE2-NEXT: # kill: def $al killed $al killed $eax
; SSE2-NEXT: retq
;
; SSE41-LABEL: test_v4i8:
; SSE41: # %bb.0:
-; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; SSE41-NEXT: paddd %xmm0, %xmm1
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
-; SSE41-NEXT: paddd %xmm1, %xmm0
+; SSE41-NEXT: movdqa %xmm0, %xmm1
+; SSE41-NEXT: psrld $16, %xmm1
+; SSE41-NEXT: paddb %xmm0, %xmm1
+; SSE41-NEXT: movdqa %xmm1, %xmm0
+; SSE41-NEXT: psrlw $8, %xmm0
+; SSE41-NEXT: paddb %xmm1, %xmm0
; SSE41-NEXT: pextrb $0, %xmm0, %eax
; SSE41-NEXT: # kill: def $al killed $al killed $eax
; SSE41-NEXT: retq
;
-; AVX1-SLOW-LABEL: test_v4i8:
-; AVX1-SLOW: # %bb.0:
-; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX1-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0
-; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX1-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0
-; AVX1-SLOW-NEXT: vpextrb $0, %xmm0, %eax
-; AVX1-SLOW-NEXT: # kill: def $al killed $al killed $eax
-; AVX1-SLOW-NEXT: retq
-;
-; AVX1-FAST-LABEL: test_v4i8:
-; AVX1-FAST: # %bb.0:
-; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX1-FAST-NEXT: vpaddd %xmm1, %xmm0, %xmm0
-; AVX1-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0
-; AVX1-FAST-NEXT: vpextrb $0, %xmm0, %eax
-; AVX1-FAST-NEXT: # kill: def $al killed $al killed $eax
-; AVX1-FAST-NEXT: retq
-;
-; AVX2-LABEL: test_v4i8:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpextrb $0, %xmm0, %eax
-; AVX2-NEXT: # kill: def $al killed $al killed $eax
-; AVX2-NEXT: retq
+; AVX-LABEL: test_v4i8:
+; AVX: # %bb.0:
+; AVX-NEXT: vpsrld $16, %xmm0, %xmm1
+; AVX-NEXT: vpaddb %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vpsrlw $8, %xmm0, %xmm1
+; AVX-NEXT: vpaddb %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vpextrb $0, %xmm0, %eax
+; AVX-NEXT: # kill: def $al killed $al killed $eax
+; AVX-NEXT: retq
;
; AVX512-LABEL: test_v4i8:
; AVX512: # %bb.0:
-; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0
+; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1
+; AVX512-NEXT: vpaddb %xmm1, %xmm0, %xmm0
+; AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1
+; AVX512-NEXT: vpaddb %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vpextrb $0, %xmm0, %eax
; AVX512-NEXT: # kill: def $al killed $al killed $eax
; AVX512-NEXT: retq
define i8 @test_v8i8(<8 x i8> %a0) {
; SSE2-LABEL: test_v8i8:
; SSE2: # %bb.0:
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; SSE2-NEXT: paddw %xmm0, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
-; SSE2-NEXT: paddw %xmm1, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; SSE2-NEXT: paddb %xmm0, %xmm1
+; SSE2-NEXT: movdqa %xmm1, %xmm0
+; SSE2-NEXT: psrld $16, %xmm0
+; SSE2-NEXT: paddb %xmm1, %xmm0
; SSE2-NEXT: movdqa %xmm0, %xmm1
-; SSE2-NEXT: psrld $16, %xmm1
-; SSE2-NEXT: paddw %xmm0, %xmm1
+; SSE2-NEXT: psrlw $8, %xmm1
+; SSE2-NEXT: paddb %xmm0, %xmm1
; SSE2-NEXT: movd %xmm1, %eax
; SSE2-NEXT: # kill: def $al killed $al killed $eax
; SSE2-NEXT: retq
;
; SSE41-LABEL: test_v8i8:
; SSE41: # %bb.0:
-; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; SSE41-NEXT: paddw %xmm0, %xmm1
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
-; SSE41-NEXT: paddw %xmm1, %xmm0
+; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; SSE41-NEXT: paddb %xmm0, %xmm1
+; SSE41-NEXT: movdqa %xmm1, %xmm0
+; SSE41-NEXT: psrld $16, %xmm0
+; SSE41-NEXT: paddb %xmm1, %xmm0
; SSE41-NEXT: movdqa %xmm0, %xmm1
-; SSE41-NEXT: psrld $16, %xmm1
-; SSE41-NEXT: paddw %xmm0, %xmm1
+; SSE41-NEXT: psrlw $8, %xmm1
+; SSE41-NEXT: paddb %xmm0, %xmm1
; SSE41-NEXT: pextrb $0, %xmm1, %eax
; SSE41-NEXT: # kill: def $al killed $al killed $eax
; SSE41-NEXT: retq
;
-; AVX1-SLOW-LABEL: test_v8i8:
-; AVX1-SLOW: # %bb.0:
-; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX1-SLOW-NEXT: vpaddw %xmm1, %xmm0, %xmm0
-; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX1-SLOW-NEXT: vpaddw %xmm1, %xmm0, %xmm0
-; AVX1-SLOW-NEXT: vpsrld $16, %xmm0, %xmm1
-; AVX1-SLOW-NEXT: vpaddw %xmm1, %xmm0, %xmm0
-; AVX1-SLOW-NEXT: vpextrb $0, %xmm0, %eax
-; AVX1-SLOW-NEXT: # kill: def $al killed $al killed $eax
-; AVX1-SLOW-NEXT: retq
-;
-; AVX1-FAST-LABEL: test_v8i8:
-; AVX1-FAST: # %bb.0:
-; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX1-FAST-NEXT: vpaddw %xmm1, %xmm0, %xmm0
-; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX1-FAST-NEXT: vpaddw %xmm1, %xmm0, %xmm0
-; AVX1-FAST-NEXT: vphaddw %xmm0, %xmm0, %xmm0
-; AVX1-FAST-NEXT: vpextrb $0, %xmm0, %eax
-; AVX1-FAST-NEXT: # kill: def $al killed $al killed $eax
-; AVX1-FAST-NEXT: retq
-;
-; AVX2-LABEL: test_v8i8:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1
-; AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpextrb $0, %xmm0, %eax
-; AVX2-NEXT: # kill: def $al killed $al killed $eax
-; AVX2-NEXT: retq
+; AVX-LABEL: test_v8i8:
+; AVX: # %bb.0:
+; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX-NEXT: vpaddb %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vpsrld $16, %xmm0, %xmm1
+; AVX-NEXT: vpaddb %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vpsrlw $8, %xmm0, %xmm1
+; AVX-NEXT: vpaddb %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vpextrb $0, %xmm0, %eax
+; AVX-NEXT: # kill: def $al killed $al killed $eax
+; AVX-NEXT: retq
;
; AVX512-LABEL: test_v8i8:
; AVX512: # %bb.0:
-; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX512-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX512-NEXT: vpaddw %xmm1, %xmm0, %xmm0
+; AVX512-NEXT: vpaddb %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1
-; AVX512-NEXT: vpaddw %xmm1, %xmm0, %xmm0
+; AVX512-NEXT: vpaddb %xmm1, %xmm0, %xmm0
+; AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1
+; AVX512-NEXT: vpaddb %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vpextrb $0, %xmm0, %eax
; AVX512-NEXT: # kill: def $al killed $al killed $eax
; AVX512-NEXT: retq
}
define i1 @trunc_v8i16_v8i1(<8 x i8>) {
-; SSE-LABEL: trunc_v8i16_v8i1:
-; SSE: # %bb.0:
-; SSE-NEXT: psllw $15, %xmm0
-; SSE-NEXT: packsswb %xmm0, %xmm0
-; SSE-NEXT: pmovmskb %xmm0, %eax
-; SSE-NEXT: cmpb $-1, %al
-; SSE-NEXT: sete %al
-; SSE-NEXT: retq
+; SSE2-LABEL: trunc_v8i16_v8i1:
+; SSE2: # %bb.0:
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE2-NEXT: psllw $15, %xmm0
+; SSE2-NEXT: packsswb %xmm0, %xmm0
+; SSE2-NEXT: pmovmskb %xmm0, %eax
+; SSE2-NEXT: cmpb $-1, %al
+; SSE2-NEXT: sete %al
+; SSE2-NEXT: retq
+;
+; SSE41-LABEL: trunc_v8i16_v8i1:
+; SSE41: # %bb.0:
+; SSE41-NEXT: pmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; SSE41-NEXT: psllw $15, %xmm0
+; SSE41-NEXT: packsswb %xmm0, %xmm0
+; SSE41-NEXT: pmovmskb %xmm0, %eax
+; SSE41-NEXT: cmpb $-1, %al
+; SSE41-NEXT: sete %al
+; SSE41-NEXT: retq
;
; AVX-LABEL: trunc_v8i16_v8i1:
; AVX: # %bb.0:
+; AVX-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX-NEXT: vpsllw $15, %xmm0, %xmm0
; AVX-NEXT: vpacksswb %xmm0, %xmm0, %xmm0
; AVX-NEXT: vpmovmskb %xmm0, %eax
;
; AVX512F-LABEL: trunc_v8i16_v8i1:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vpmovsxwq %xmm0, %zmm0
-; AVX512F-NEXT: vpsllq $63, %zmm0, %zmm0
-; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k0
+; AVX512F-NEXT: vpmovsxbd %xmm0, %zmm0
+; AVX512F-NEXT: vpslld $31, %zmm0, %zmm0
+; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k0
; AVX512F-NEXT: kmovw %k0, %eax
; AVX512F-NEXT: cmpb $-1, %al
; AVX512F-NEXT: sete %al
;
; AVX512BW-LABEL: trunc_v8i16_v8i1:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vpsllw $15, %xmm0, %xmm0
-; AVX512BW-NEXT: vpmovw2m %zmm0, %k0
+; AVX512BW-NEXT: vpsllw $7, %xmm0, %xmm0
+; AVX512BW-NEXT: vpmovb2m %zmm0, %k0
; AVX512BW-NEXT: kmovd %k0, %eax
; AVX512BW-NEXT: cmpb $-1, %al
; AVX512BW-NEXT: sete %al
;
; AVX512VL-LABEL: trunc_v8i16_v8i1:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpsllw $15, %xmm0, %xmm0
-; AVX512VL-NEXT: vpmovw2m %xmm0, %k0
+; AVX512VL-NEXT: vpsllw $7, %xmm0, %xmm0
+; AVX512VL-NEXT: vpmovb2m %xmm0, %k0
; AVX512VL-NEXT: kmovd %k0, %eax
; AVX512VL-NEXT: cmpb $-1, %al
; AVX512VL-NEXT: sete %al
}
define i1 @icmp_v8i16_v8i1(<8 x i8>) {
-; SSE-LABEL: icmp_v8i16_v8i1:
-; SSE: # %bb.0:
-; SSE-NEXT: pand {{.*}}(%rip), %xmm0
-; SSE-NEXT: pxor %xmm1, %xmm1
-; SSE-NEXT: pcmpeqw %xmm0, %xmm1
-; SSE-NEXT: packsswb %xmm0, %xmm1
-; SSE-NEXT: pmovmskb %xmm1, %eax
-; SSE-NEXT: cmpb $-1, %al
-; SSE-NEXT: sete %al
-; SSE-NEXT: retq
+; SSE2-LABEL: icmp_v8i16_v8i1:
+; SSE2: # %bb.0:
+; SSE2-NEXT: pxor %xmm1, %xmm1
+; SSE2-NEXT: pcmpeqb %xmm0, %xmm1
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; SSE2-NEXT: packsswb %xmm0, %xmm0
+; SSE2-NEXT: pmovmskb %xmm0, %eax
+; SSE2-NEXT: cmpb $-1, %al
+; SSE2-NEXT: sete %al
+; SSE2-NEXT: retq
+;
+; SSE41-LABEL: icmp_v8i16_v8i1:
+; SSE41: # %bb.0:
+; SSE41-NEXT: pxor %xmm1, %xmm1
+; SSE41-NEXT: pcmpeqb %xmm0, %xmm1
+; SSE41-NEXT: pmovsxbw %xmm1, %xmm0
+; SSE41-NEXT: packsswb %xmm0, %xmm0
+; SSE41-NEXT: pmovmskb %xmm0, %eax
+; SSE41-NEXT: cmpb $-1, %al
+; SSE41-NEXT: sete %al
+; SSE41-NEXT: retq
;
; AVX-LABEL: icmp_v8i16_v8i1:
; AVX: # %bb.0:
-; AVX-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vpmovsxbw %xmm0, %xmm0
; AVX-NEXT: vpacksswb %xmm0, %xmm0, %xmm0
; AVX-NEXT: vpmovmskb %xmm0, %eax
; AVX-NEXT: cmpb $-1, %al
;
; AVX512F-LABEL: icmp_v8i16_v8i1:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX512F-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm0
-; AVX512F-NEXT: vpmovsxwq %xmm0, %zmm0
-; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k0
+; AVX512F-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm0
+; AVX512F-NEXT: vpmovsxbd %xmm0, %zmm0
+; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k0
; AVX512F-NEXT: kmovw %k0, %eax
; AVX512F-NEXT: cmpb $-1, %al
; AVX512F-NEXT: sete %al
; AVX512BW-LABEL: icmp_v8i16_v8i1:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
-; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm1 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
-; AVX512BW-NEXT: vptestnmw %zmm1, %zmm0, %k0
+; AVX512BW-NEXT: vptestnmb %zmm0, %zmm0, %k0
; AVX512BW-NEXT: kmovd %k0, %eax
; AVX512BW-NEXT: cmpb $-1, %al
; AVX512BW-NEXT: sete %al
;
; AVX512VL-LABEL: icmp_v8i16_v8i1:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vptestnmw {{.*}}(%rip), %xmm0, %k0
+; AVX512VL-NEXT: vptestnmb %xmm0, %xmm0, %k0
; AVX512VL-NEXT: kmovd %k0, %eax
; AVX512VL-NEXT: cmpb $-1, %al
; AVX512VL-NEXT: sete %al
define i32 @test_v2i32(<2 x i32> %a0) {
; SSE-LABEL: test_v2i32:
; SSE: # %bb.0:
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; SSE-NEXT: pand %xmm0, %xmm1
; SSE-NEXT: movd %xmm1, %eax
; SSE-NEXT: retq
;
; AVX-LABEL: test_v2i32:
; AVX: # %bb.0:
-; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX-NEXT: vmovd %xmm0, %eax
; AVX-NEXT: retq
define i16 @test_v2i16(<2 x i16> %a0) {
; SSE-LABEL: test_v2i16:
; SSE: # %bb.0:
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE-NEXT: movdqa %xmm0, %xmm1
+; SSE-NEXT: psrld $16, %xmm1
; SSE-NEXT: pand %xmm0, %xmm1
; SSE-NEXT: movd %xmm1, %eax
; SSE-NEXT: # kill: def $ax killed $ax killed $eax
;
; AVX-LABEL: test_v2i16:
; AVX: # %bb.0:
-; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX-NEXT: vpsrld $16, %xmm0, %xmm1
; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX-NEXT: vmovd %xmm0, %eax
; AVX-NEXT: # kill: def $ax killed $ax killed $eax
define i16 @test_v4i16(<4 x i16> %a0) {
; SSE-LABEL: test_v4i16:
; SSE: # %bb.0:
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; SSE-NEXT: pand %xmm0, %xmm1
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; SSE-NEXT: movdqa %xmm1, %xmm0
+; SSE-NEXT: psrld $16, %xmm0
; SSE-NEXT: pand %xmm1, %xmm0
; SSE-NEXT: movd %xmm0, %eax
; SSE-NEXT: # kill: def $ax killed $ax killed $eax
;
; AVX-LABEL: test_v4i16:
; AVX: # %bb.0:
-; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vpsrld $16, %xmm0, %xmm1
+; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX-NEXT: vmovd %xmm0, %eax
; AVX-NEXT: # kill: def $ax killed $ax killed $eax
; AVX-NEXT: retq
define i8 @test_v2i8(<2 x i8> %a0) {
; SSE2-LABEL: test_v2i8:
; SSE2: # %bb.0:
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE2-NEXT: movdqa %xmm0, %xmm1
+; SSE2-NEXT: psrlw $8, %xmm1
; SSE2-NEXT: pand %xmm0, %xmm1
; SSE2-NEXT: movd %xmm1, %eax
; SSE2-NEXT: # kill: def $al killed $al killed $eax
;
; SSE41-LABEL: test_v2i8:
; SSE41: # %bb.0:
-; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE41-NEXT: movdqa %xmm0, %xmm1
+; SSE41-NEXT: psrlw $8, %xmm1
; SSE41-NEXT: pand %xmm0, %xmm1
; SSE41-NEXT: pextrb $0, %xmm1, %eax
; SSE41-NEXT: # kill: def $al killed $al killed $eax
;
; AVX-LABEL: test_v2i8:
; AVX: # %bb.0:
-; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX-NEXT: vpsrlw $8, %xmm0, %xmm1
; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpextrb $0, %xmm0, %eax
; AVX-NEXT: # kill: def $al killed $al killed $eax
define i8 @test_v4i8(<4 x i8> %a0) {
; SSE2-LABEL: test_v4i8:
; SSE2: # %bb.0:
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE2-NEXT: movdqa %xmm0, %xmm1
+; SSE2-NEXT: psrld $16, %xmm1
; SSE2-NEXT: pand %xmm0, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; SSE2-NEXT: movdqa %xmm1, %xmm0
+; SSE2-NEXT: psrlw $8, %xmm0
; SSE2-NEXT: pand %xmm1, %xmm0
; SSE2-NEXT: movd %xmm0, %eax
; SSE2-NEXT: # kill: def $al killed $al killed $eax
;
; SSE41-LABEL: test_v4i8:
; SSE41: # %bb.0:
-; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE41-NEXT: movdqa %xmm0, %xmm1
+; SSE41-NEXT: psrld $16, %xmm1
; SSE41-NEXT: pand %xmm0, %xmm1
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; SSE41-NEXT: movdqa %xmm1, %xmm0
+; SSE41-NEXT: psrlw $8, %xmm0
; SSE41-NEXT: pand %xmm1, %xmm0
; SSE41-NEXT: pextrb $0, %xmm0, %eax
; SSE41-NEXT: # kill: def $al killed $al killed $eax
;
; AVX-LABEL: test_v4i8:
; AVX: # %bb.0:
-; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX-NEXT: vpsrld $16, %xmm0, %xmm1
; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX-NEXT: vpsrlw $8, %xmm0, %xmm1
; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpextrb $0, %xmm0, %eax
; AVX-NEXT: # kill: def $al killed $al killed $eax
define i8 @test_v8i8(<8 x i8> %a0) {
; SSE2-LABEL: test_v8i8:
; SSE2: # %bb.0:
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; SSE2-NEXT: pand %xmm0, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; SSE2-NEXT: movdqa %xmm1, %xmm0
+; SSE2-NEXT: psrld $16, %xmm0
; SSE2-NEXT: pand %xmm1, %xmm0
; SSE2-NEXT: movdqa %xmm0, %xmm1
-; SSE2-NEXT: psrld $16, %xmm1
+; SSE2-NEXT: psrlw $8, %xmm1
; SSE2-NEXT: pand %xmm0, %xmm1
; SSE2-NEXT: movd %xmm1, %eax
; SSE2-NEXT: # kill: def $al killed $al killed $eax
;
; SSE41-LABEL: test_v8i8:
; SSE41: # %bb.0:
-; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; SSE41-NEXT: pand %xmm0, %xmm1
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; SSE41-NEXT: movdqa %xmm1, %xmm0
+; SSE41-NEXT: psrld $16, %xmm0
; SSE41-NEXT: pand %xmm1, %xmm0
; SSE41-NEXT: movdqa %xmm0, %xmm1
-; SSE41-NEXT: psrld $16, %xmm1
+; SSE41-NEXT: psrlw $8, %xmm1
; SSE41-NEXT: pand %xmm0, %xmm1
; SSE41-NEXT: pextrb $0, %xmm1, %eax
; SSE41-NEXT: # kill: def $al killed $al killed $eax
;
; AVX-LABEL: test_v8i8:
; AVX: # %bb.0:
-; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpsrld $16, %xmm0, %xmm1
; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vpsrlw $8, %xmm0, %xmm1
+; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpextrb $0, %xmm0, %eax
; AVX-NEXT: # kill: def $al killed $al killed $eax
; AVX-NEXT: retq
;
define i32 @test_v2i32(<2 x i32> %a0) {
-; SSE-LABEL: test_v2i32:
-; SSE: # %bb.0:
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; SSE-NEXT: pmuludq %xmm0, %xmm1
-; SSE-NEXT: movd %xmm1, %eax
-; SSE-NEXT: retq
+; SSE2-LABEL: test_v2i32:
+; SSE2: # %bb.0:
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; SSE2-NEXT: pmuludq %xmm0, %xmm1
+; SSE2-NEXT: movd %xmm1, %eax
+; SSE2-NEXT: retq
+;
+; SSE41-LABEL: test_v2i32:
+; SSE41: # %bb.0:
+; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; SSE41-NEXT: pmulld %xmm0, %xmm1
+; SSE41-NEXT: movd %xmm1, %eax
+; SSE41-NEXT: retq
;
; AVX-LABEL: test_v2i32:
; AVX: # %bb.0:
-; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX-NEXT: vpmuludq %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX-NEXT: vpmulld %xmm1, %xmm0, %xmm0
; AVX-NEXT: vmovd %xmm0, %eax
; AVX-NEXT: retq
;
; AVX512-LABEL: test_v2i32:
; AVX512: # %bb.0:
-; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX512-NEXT: vpmuludq %xmm1, %xmm0, %xmm0
+; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX512-NEXT: vpmulld %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vmovd %xmm0, %eax
; AVX512-NEXT: retq
%1 = call i32 @llvm.experimental.vector.reduce.mul.v2i32(<2 x i32> %a0)
define i16 @test_v2i16(<2 x i16> %a0) {
; SSE-LABEL: test_v2i16:
; SSE: # %bb.0:
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; SSE-NEXT: pmuludq %xmm0, %xmm1
+; SSE-NEXT: movdqa %xmm0, %xmm1
+; SSE-NEXT: psrld $16, %xmm1
+; SSE-NEXT: pmullw %xmm0, %xmm1
; SSE-NEXT: movd %xmm1, %eax
; SSE-NEXT: # kill: def $ax killed $ax killed $eax
; SSE-NEXT: retq
;
; AVX-LABEL: test_v2i16:
; AVX: # %bb.0:
-; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX-NEXT: vpmuludq %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vpsrld $16, %xmm0, %xmm1
+; AVX-NEXT: vpmullw %xmm1, %xmm0, %xmm0
; AVX-NEXT: vmovd %xmm0, %eax
; AVX-NEXT: # kill: def $ax killed $ax killed $eax
; AVX-NEXT: retq
;
; AVX512-LABEL: test_v2i16:
; AVX512: # %bb.0:
-; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX512-NEXT: vpmuludq %xmm1, %xmm0, %xmm0
+; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1
+; AVX512-NEXT: vpmullw %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vmovd %xmm0, %eax
; AVX512-NEXT: # kill: def $ax killed $ax killed $eax
; AVX512-NEXT: retq
}
define i16 @test_v4i16(<4 x i16> %a0) {
-; SSE2-LABEL: test_v4i16:
-; SSE2: # %bb.0:
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[3,3,1,1]
-; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
-; SSE2-NEXT: pmuludq %xmm2, %xmm3
-; SSE2-NEXT: pmuludq %xmm0, %xmm1
-; SSE2-NEXT: pmuludq %xmm3, %xmm1
-; SSE2-NEXT: pextrw $0, %xmm1, %eax
-; SSE2-NEXT: # kill: def $ax killed $ax killed $eax
-; SSE2-NEXT: retq
-;
-; SSE41-LABEL: test_v4i16:
-; SSE41: # %bb.0:
-; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; SSE41-NEXT: pmulld %xmm0, %xmm1
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
-; SSE41-NEXT: pmulld %xmm1, %xmm0
-; SSE41-NEXT: movd %xmm0, %eax
-; SSE41-NEXT: # kill: def $ax killed $ax killed $eax
-; SSE41-NEXT: retq
+; SSE-LABEL: test_v4i16:
+; SSE: # %bb.0:
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; SSE-NEXT: pmullw %xmm0, %xmm1
+; SSE-NEXT: movdqa %xmm1, %xmm0
+; SSE-NEXT: psrld $16, %xmm0
+; SSE-NEXT: pmullw %xmm1, %xmm0
+; SSE-NEXT: movd %xmm0, %eax
+; SSE-NEXT: # kill: def $ax killed $ax killed $eax
+; SSE-NEXT: retq
;
; AVX-LABEL: test_v4i16:
; AVX: # %bb.0:
-; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX-NEXT: vpmulld %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX-NEXT: vpmulld %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vpmullw %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vpsrld $16, %xmm0, %xmm1
+; AVX-NEXT: vpmullw %xmm1, %xmm0, %xmm0
; AVX-NEXT: vmovd %xmm0, %eax
; AVX-NEXT: # kill: def $ax killed $ax killed $eax
; AVX-NEXT: retq
;
; AVX512-LABEL: test_v4i16:
; AVX512: # %bb.0:
-; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX512-NEXT: vpmulld %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX512-NEXT: vpmulld %xmm1, %xmm0, %xmm0
+; AVX512-NEXT: vpmullw %xmm1, %xmm0, %xmm0
+; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1
+; AVX512-NEXT: vpmullw %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vmovd %xmm0, %eax
; AVX512-NEXT: # kill: def $ax killed $ax killed $eax
; AVX512-NEXT: retq
define i8 @test_v2i8(<2 x i8> %a0) {
; SSE2-LABEL: test_v2i8:
; SSE2: # %bb.0:
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; SSE2-NEXT: pmuludq %xmm0, %xmm1
+; SSE2-NEXT: movdqa %xmm0, %xmm1
+; SSE2-NEXT: psrlw $8, %xmm1
+; SSE2-NEXT: pmullw %xmm0, %xmm1
; SSE2-NEXT: movd %xmm1, %eax
; SSE2-NEXT: # kill: def $al killed $al killed $eax
; SSE2-NEXT: retq
;
; SSE41-LABEL: test_v2i8:
; SSE41: # %bb.0:
-; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; SSE41-NEXT: pmuludq %xmm0, %xmm1
+; SSE41-NEXT: movdqa %xmm0, %xmm1
+; SSE41-NEXT: psrlw $8, %xmm1
+; SSE41-NEXT: pmullw %xmm0, %xmm1
; SSE41-NEXT: pextrb $0, %xmm1, %eax
; SSE41-NEXT: # kill: def $al killed $al killed $eax
; SSE41-NEXT: retq
;
; AVX-LABEL: test_v2i8:
; AVX: # %bb.0:
-; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX-NEXT: vpmuludq %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vpsrlw $8, %xmm0, %xmm1
+; AVX-NEXT: vpmullw %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpextrb $0, %xmm0, %eax
; AVX-NEXT: # kill: def $al killed $al killed $eax
; AVX-NEXT: retq
;
; AVX512-LABEL: test_v2i8:
; AVX512: # %bb.0:
-; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX512-NEXT: vpmuludq %xmm1, %xmm0, %xmm0
+; AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1
+; AVX512-NEXT: vpmullw %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vpextrb $0, %xmm0, %eax
; AVX512-NEXT: # kill: def $al killed $al killed $eax
; AVX512-NEXT: retq
define i8 @test_v4i8(<4 x i8> %a0) {
; SSE2-LABEL: test_v4i8:
; SSE2: # %bb.0:
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[3,3,1,1]
-; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
-; SSE2-NEXT: pmuludq %xmm2, %xmm3
-; SSE2-NEXT: pmuludq %xmm0, %xmm1
-; SSE2-NEXT: pmuludq %xmm3, %xmm1
+; SSE2-NEXT: movdqa %xmm0, %xmm1
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; SSE2-NEXT: psrldq {{.*#+}} xmm0 = xmm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE2-NEXT: pmullw %xmm1, %xmm0
+; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
+; SSE2-NEXT: packuswb %xmm0, %xmm0
+; SSE2-NEXT: movdqa %xmm0, %xmm1
+; SSE2-NEXT: psrlw $8, %xmm1
+; SSE2-NEXT: pmullw %xmm0, %xmm1
; SSE2-NEXT: movd %xmm1, %eax
; SSE2-NEXT: # kill: def $al killed $al killed $eax
; SSE2-NEXT: retq
;
; SSE41-LABEL: test_v4i8:
; SSE41: # %bb.0:
-; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; SSE41-NEXT: pmulld %xmm0, %xmm1
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
-; SSE41-NEXT: pmulld %xmm1, %xmm0
-; SSE41-NEXT: pextrb $0, %xmm0, %eax
+; SSE41-NEXT: pmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; SSE41-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
+; SSE41-NEXT: pmullw %xmm1, %xmm0
+; SSE41-NEXT: movdqa %xmm0, %xmm1
+; SSE41-NEXT: pshufb {{.*#+}} xmm1 = xmm1[2],zero,xmm1[6],zero,xmm1[u],zero,xmm1[u],zero,xmm1[u],zero,xmm1[u],zero,xmm1[u],zero,xmm1[u],zero
+; SSE41-NEXT: pmullw %xmm0, %xmm1
+; SSE41-NEXT: pextrb $0, %xmm1, %eax
; SSE41-NEXT: # kill: def $al killed $al killed $eax
; SSE41-NEXT: retq
;
; AVX-LABEL: test_v4i8:
; AVX: # %bb.0:
-; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX-NEXT: vpmulld %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX-NEXT: vpmulld %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
+; AVX-NEXT: vpmullw %xmm0, %xmm1, %xmm0
+; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[2],zero,xmm0[6],zero,xmm0[u],zero,xmm0[u],zero,xmm0[u],zero,xmm0[u],zero,xmm0[u],zero,xmm0[u],zero
+; AVX-NEXT: vpmullw %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpextrb $0, %xmm0, %eax
; AVX-NEXT: # kill: def $al killed $al killed $eax
; AVX-NEXT: retq
;
; AVX512-LABEL: test_v4i8:
; AVX512: # %bb.0:
-; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX512-NEXT: vpmulld %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX512-NEXT: vpmulld %xmm1, %xmm0, %xmm0
+; AVX512-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
+; AVX512-NEXT: vpmullw %xmm0, %xmm1, %xmm0
+; AVX512-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[2],zero,xmm0[6],zero,xmm0[u],zero,xmm0[u],zero,xmm0[u],zero,xmm0[u],zero,xmm0[u],zero,xmm0[u],zero
+; AVX512-NEXT: vpmullw %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vpextrb $0, %xmm0, %eax
; AVX512-NEXT: # kill: def $al killed $al killed $eax
; AVX512-NEXT: retq
define i8 @test_v8i8(<8 x i8> %a0) {
; SSE2-LABEL: test_v8i8:
; SSE2: # %bb.0:
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,2,3,3]
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
; SSE2-NEXT: pmullw %xmm0, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,2,3,0]
; SSE2-NEXT: pmullw %xmm1, %xmm0
+; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
+; SSE2-NEXT: packuswb %xmm0, %xmm0
; SSE2-NEXT: movdqa %xmm0, %xmm1
-; SSE2-NEXT: psrld $16, %xmm1
+; SSE2-NEXT: psrlw $8, %xmm1
; SSE2-NEXT: pmullw %xmm0, %xmm1
; SSE2-NEXT: movd %xmm1, %eax
; SSE2-NEXT: # kill: def $al killed $al killed $eax
;
; SSE41-LABEL: test_v8i8:
; SSE41: # %bb.0:
-; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; SSE41-NEXT: pmullw %xmm0, %xmm1
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; SSE41-NEXT: pmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; SSE41-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; SSE41-NEXT: pmullw %xmm1, %xmm0
+; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
+; SSE41-NEXT: pmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; SSE41-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
; SSE41-NEXT: pmullw %xmm1, %xmm0
; SSE41-NEXT: movdqa %xmm0, %xmm1
-; SSE41-NEXT: psrld $16, %xmm1
+; SSE41-NEXT: pshufb {{.*#+}} xmm1 = xmm1[2],zero,xmm1[6],zero,xmm1[10],zero,xmm1[14],zero,xmm1[u],zero,xmm1[u],zero,xmm1[u],zero,xmm1[u],zero
; SSE41-NEXT: pmullw %xmm0, %xmm1
; SSE41-NEXT: pextrb $0, %xmm1, %eax
; SSE41-NEXT: # kill: def $al killed $al killed $eax
;
; AVX-LABEL: test_v8i8:
; AVX: # %bb.0:
-; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX-NEXT: vpmullw %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; AVX-NEXT: vpmullw %xmm0, %xmm1, %xmm0
; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX-NEXT: vpmullw %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vpsrld $16, %xmm0, %xmm1
+; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[2],zero,xmm0[6],zero,xmm0[10],zero,xmm0[14],zero,xmm0[u],zero,xmm0[u],zero,xmm0[u],zero,xmm0[u],zero
; AVX-NEXT: vpmullw %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpextrb $0, %xmm0, %eax
; AVX-NEXT: # kill: def $al killed $al killed $eax
; AVX-NEXT: retq
;
-; AVX512-LABEL: test_v8i8:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX512-NEXT: vpmullw %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX512-NEXT: vpmullw %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1
-; AVX512-NEXT: vpmullw %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpextrb $0, %xmm0, %eax
-; AVX512-NEXT: # kill: def $al killed $al killed $eax
-; AVX512-NEXT: retq
+; AVX512BW-LABEL: test_v8i8:
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; AVX512BW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; AVX512BW-NEXT: vpmullw %xmm0, %xmm1, %xmm0
+; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX512BW-NEXT: vpmullw %xmm1, %xmm0, %xmm0
+; AVX512BW-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[2],zero,xmm0[6],zero,xmm0[10],zero,xmm0[14],zero,xmm0[u],zero,xmm0[u],zero,xmm0[u],zero,xmm0[u],zero
+; AVX512BW-NEXT: vpmullw %xmm1, %xmm0, %xmm0
+; AVX512BW-NEXT: vpextrb $0, %xmm0, %eax
+; AVX512BW-NEXT: # kill: def $al killed $al killed $eax
+; AVX512BW-NEXT: retq
+;
+; AVX512VL-LABEL: test_v8i8:
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX512VL-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; AVX512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; AVX512VL-NEXT: vpmullw %xmm0, %xmm1, %xmm0
+; AVX512VL-NEXT: vpalignr {{.*#+}} xmm1 = xmm0[4,5,6,7,8,9,10,11,12,13,14,15,0,1,2,3]
+; AVX512VL-NEXT: vpmullw %xmm1, %xmm0, %xmm0
+; AVX512VL-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[2],zero,xmm0[6],zero,xmm0[10],zero,xmm0[14],zero,xmm0[u],zero,xmm0[u],zero,xmm0[u],zero,xmm0[u],zero
+; AVX512VL-NEXT: vpmullw %xmm1, %xmm0, %xmm0
+; AVX512VL-NEXT: vpextrb $0, %xmm0, %eax
+; AVX512VL-NEXT: # kill: def $al killed $al killed $eax
+; AVX512VL-NEXT: retq
+;
+; AVX512DQ-LABEL: test_v8i8:
+; AVX512DQ: # %bb.0:
+; AVX512DQ-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; AVX512DQ-NEXT: vpmullw %xmm0, %xmm1, %xmm0
+; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX512DQ-NEXT: vpmullw %xmm1, %xmm0, %xmm0
+; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[2],zero,xmm0[6],zero,xmm0[10],zero,xmm0[14],zero,xmm0[u],zero,xmm0[u],zero,xmm0[u],zero,xmm0[u],zero
+; AVX512DQ-NEXT: vpmullw %xmm1, %xmm0, %xmm0
+; AVX512DQ-NEXT: vpextrb $0, %xmm0, %eax
+; AVX512DQ-NEXT: # kill: def $al killed $al killed $eax
+; AVX512DQ-NEXT: retq
%1 = call i8 @llvm.experimental.vector.reduce.mul.v8i8(<8 x i8> %a0)
ret i8 %1
}
}
define i1 @trunc_v8i16_v8i1(<8 x i8>) {
-; SSE-LABEL: trunc_v8i16_v8i1:
-; SSE: # %bb.0:
-; SSE-NEXT: psllw $15, %xmm0
-; SSE-NEXT: packsswb %xmm0, %xmm0
-; SSE-NEXT: pmovmskb %xmm0, %eax
-; SSE-NEXT: testb %al, %al
-; SSE-NEXT: setne %al
-; SSE-NEXT: retq
+; SSE2-LABEL: trunc_v8i16_v8i1:
+; SSE2: # %bb.0:
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE2-NEXT: psllw $15, %xmm0
+; SSE2-NEXT: packsswb %xmm0, %xmm0
+; SSE2-NEXT: pmovmskb %xmm0, %eax
+; SSE2-NEXT: testb %al, %al
+; SSE2-NEXT: setne %al
+; SSE2-NEXT: retq
+;
+; SSE41-LABEL: trunc_v8i16_v8i1:
+; SSE41: # %bb.0:
+; SSE41-NEXT: pmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; SSE41-NEXT: psllw $15, %xmm0
+; SSE41-NEXT: packsswb %xmm0, %xmm0
+; SSE41-NEXT: pmovmskb %xmm0, %eax
+; SSE41-NEXT: testb %al, %al
+; SSE41-NEXT: setne %al
+; SSE41-NEXT: retq
;
; AVX-LABEL: trunc_v8i16_v8i1:
; AVX: # %bb.0:
+; AVX-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX-NEXT: vpsllw $15, %xmm0, %xmm0
; AVX-NEXT: vpacksswb %xmm0, %xmm0, %xmm0
; AVX-NEXT: vpmovmskb %xmm0, %eax
;
; AVX512F-LABEL: trunc_v8i16_v8i1:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vpmovsxwq %xmm0, %zmm0
-; AVX512F-NEXT: vpsllq $63, %zmm0, %zmm0
-; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k0
+; AVX512F-NEXT: vpmovsxbd %xmm0, %zmm0
+; AVX512F-NEXT: vpslld $31, %zmm0, %zmm0
+; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k0
; AVX512F-NEXT: kmovw %k0, %eax
; AVX512F-NEXT: testb %al, %al
; AVX512F-NEXT: setne %al
;
; AVX512BW-LABEL: trunc_v8i16_v8i1:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vpsllw $15, %xmm0, %xmm0
-; AVX512BW-NEXT: vpmovw2m %zmm0, %k0
+; AVX512BW-NEXT: vpsllw $7, %xmm0, %xmm0
+; AVX512BW-NEXT: vpmovb2m %zmm0, %k0
; AVX512BW-NEXT: kmovd %k0, %eax
; AVX512BW-NEXT: testb %al, %al
; AVX512BW-NEXT: setne %al
;
; AVX512VL-LABEL: trunc_v8i16_v8i1:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpsllw $15, %xmm0, %xmm0
-; AVX512VL-NEXT: vpmovw2m %xmm0, %k0
+; AVX512VL-NEXT: vpsllw $7, %xmm0, %xmm0
+; AVX512VL-NEXT: vpmovb2m %xmm0, %k0
; AVX512VL-NEXT: kmovd %k0, %eax
; AVX512VL-NEXT: testb %al, %al
; AVX512VL-NEXT: setne %al
}
define i1 @icmp_v8i16_v8i1(<8 x i8>) {
-; SSE-LABEL: icmp_v8i16_v8i1:
-; SSE: # %bb.0:
-; SSE-NEXT: pand {{.*}}(%rip), %xmm0
-; SSE-NEXT: pxor %xmm1, %xmm1
-; SSE-NEXT: pcmpeqw %xmm0, %xmm1
-; SSE-NEXT: packsswb %xmm0, %xmm1
-; SSE-NEXT: pmovmskb %xmm1, %eax
-; SSE-NEXT: testb %al, %al
-; SSE-NEXT: setne %al
-; SSE-NEXT: retq
+; SSE2-LABEL: icmp_v8i16_v8i1:
+; SSE2: # %bb.0:
+; SSE2-NEXT: pxor %xmm1, %xmm1
+; SSE2-NEXT: pcmpeqb %xmm0, %xmm1
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; SSE2-NEXT: packsswb %xmm0, %xmm0
+; SSE2-NEXT: pmovmskb %xmm0, %eax
+; SSE2-NEXT: testb %al, %al
+; SSE2-NEXT: setne %al
+; SSE2-NEXT: retq
+;
+; SSE41-LABEL: icmp_v8i16_v8i1:
+; SSE41: # %bb.0:
+; SSE41-NEXT: pxor %xmm1, %xmm1
+; SSE41-NEXT: pcmpeqb %xmm0, %xmm1
+; SSE41-NEXT: pmovsxbw %xmm1, %xmm0
+; SSE41-NEXT: packsswb %xmm0, %xmm0
+; SSE41-NEXT: pmovmskb %xmm0, %eax
+; SSE41-NEXT: testb %al, %al
+; SSE41-NEXT: setne %al
+; SSE41-NEXT: retq
;
; AVX-LABEL: icmp_v8i16_v8i1:
; AVX: # %bb.0:
-; AVX-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vpmovsxbw %xmm0, %xmm0
; AVX-NEXT: vpacksswb %xmm0, %xmm0, %xmm0
; AVX-NEXT: vpmovmskb %xmm0, %eax
; AVX-NEXT: testb %al, %al
;
; AVX512F-LABEL: icmp_v8i16_v8i1:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX512F-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm0
-; AVX512F-NEXT: vpmovsxwq %xmm0, %zmm0
-; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k0
+; AVX512F-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm0
+; AVX512F-NEXT: vpmovsxbd %xmm0, %zmm0
+; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k0
; AVX512F-NEXT: kmovw %k0, %eax
; AVX512F-NEXT: testb %al, %al
; AVX512F-NEXT: setne %al
; AVX512BW-LABEL: icmp_v8i16_v8i1:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
-; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm1 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
-; AVX512BW-NEXT: vptestnmw %zmm1, %zmm0, %k0
+; AVX512BW-NEXT: vptestnmb %zmm0, %zmm0, %k0
; AVX512BW-NEXT: kmovd %k0, %eax
; AVX512BW-NEXT: testb %al, %al
; AVX512BW-NEXT: setne %al
;
; AVX512VL-LABEL: icmp_v8i16_v8i1:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vptestnmw {{.*}}(%rip), %xmm0, %k0
+; AVX512VL-NEXT: vptestnmb %xmm0, %xmm0, %k0
; AVX512VL-NEXT: kmovd %k0, %eax
; AVX512VL-NEXT: testb %al, %al
; AVX512VL-NEXT: setne %al
define i32 @test_v2i32(<2 x i32> %a0) {
; SSE-LABEL: test_v2i32:
; SSE: # %bb.0:
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; SSE-NEXT: por %xmm0, %xmm1
; SSE-NEXT: movd %xmm1, %eax
; SSE-NEXT: retq
;
; AVX-LABEL: test_v2i32:
; AVX: # %bb.0:
-; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX-NEXT: vmovd %xmm0, %eax
; AVX-NEXT: retq
define i16 @test_v2i16(<2 x i16> %a0) {
; SSE-LABEL: test_v2i16:
; SSE: # %bb.0:
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE-NEXT: movdqa %xmm0, %xmm1
+; SSE-NEXT: psrld $16, %xmm1
; SSE-NEXT: por %xmm0, %xmm1
; SSE-NEXT: movd %xmm1, %eax
; SSE-NEXT: # kill: def $ax killed $ax killed $eax
;
; AVX-LABEL: test_v2i16:
; AVX: # %bb.0:
-; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX-NEXT: vpsrld $16, %xmm0, %xmm1
; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX-NEXT: vmovd %xmm0, %eax
; AVX-NEXT: # kill: def $ax killed $ax killed $eax
define i16 @test_v4i16(<4 x i16> %a0) {
; SSE-LABEL: test_v4i16:
; SSE: # %bb.0:
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; SSE-NEXT: por %xmm0, %xmm1
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; SSE-NEXT: movdqa %xmm1, %xmm0
+; SSE-NEXT: psrld $16, %xmm0
; SSE-NEXT: por %xmm1, %xmm0
; SSE-NEXT: movd %xmm0, %eax
; SSE-NEXT: # kill: def $ax killed $ax killed $eax
;
; AVX-LABEL: test_v4i16:
; AVX: # %bb.0:
-; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vpsrld $16, %xmm0, %xmm1
+; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX-NEXT: vmovd %xmm0, %eax
; AVX-NEXT: # kill: def $ax killed $ax killed $eax
; AVX-NEXT: retq
define i8 @test_v2i8(<2 x i8> %a0) {
; SSE2-LABEL: test_v2i8:
; SSE2: # %bb.0:
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE2-NEXT: movdqa %xmm0, %xmm1
+; SSE2-NEXT: psrlw $8, %xmm1
; SSE2-NEXT: por %xmm0, %xmm1
; SSE2-NEXT: movd %xmm1, %eax
; SSE2-NEXT: # kill: def $al killed $al killed $eax
;
; SSE41-LABEL: test_v2i8:
; SSE41: # %bb.0:
-; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE41-NEXT: movdqa %xmm0, %xmm1
+; SSE41-NEXT: psrlw $8, %xmm1
; SSE41-NEXT: por %xmm0, %xmm1
; SSE41-NEXT: pextrb $0, %xmm1, %eax
; SSE41-NEXT: # kill: def $al killed $al killed $eax
;
; AVX-LABEL: test_v2i8:
; AVX: # %bb.0:
-; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX-NEXT: vpsrlw $8, %xmm0, %xmm1
; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpextrb $0, %xmm0, %eax
; AVX-NEXT: # kill: def $al killed $al killed $eax
define i8 @test_v4i8(<4 x i8> %a0) {
; SSE2-LABEL: test_v4i8:
; SSE2: # %bb.0:
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE2-NEXT: movdqa %xmm0, %xmm1
+; SSE2-NEXT: psrld $16, %xmm1
; SSE2-NEXT: por %xmm0, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; SSE2-NEXT: movdqa %xmm1, %xmm0
+; SSE2-NEXT: psrlw $8, %xmm0
; SSE2-NEXT: por %xmm1, %xmm0
; SSE2-NEXT: movd %xmm0, %eax
; SSE2-NEXT: # kill: def $al killed $al killed $eax
;
; SSE41-LABEL: test_v4i8:
; SSE41: # %bb.0:
-; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE41-NEXT: movdqa %xmm0, %xmm1
+; SSE41-NEXT: psrld $16, %xmm1
; SSE41-NEXT: por %xmm0, %xmm1
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; SSE41-NEXT: movdqa %xmm1, %xmm0
+; SSE41-NEXT: psrlw $8, %xmm0
; SSE41-NEXT: por %xmm1, %xmm0
; SSE41-NEXT: pextrb $0, %xmm0, %eax
; SSE41-NEXT: # kill: def $al killed $al killed $eax
;
; AVX-LABEL: test_v4i8:
; AVX: # %bb.0:
-; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX-NEXT: vpsrld $16, %xmm0, %xmm1
; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX-NEXT: vpsrlw $8, %xmm0, %xmm1
; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpextrb $0, %xmm0, %eax
; AVX-NEXT: # kill: def $al killed $al killed $eax
define i8 @test_v8i8(<8 x i8> %a0) {
; SSE2-LABEL: test_v8i8:
; SSE2: # %bb.0:
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; SSE2-NEXT: por %xmm0, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; SSE2-NEXT: movdqa %xmm1, %xmm0
+; SSE2-NEXT: psrld $16, %xmm0
; SSE2-NEXT: por %xmm1, %xmm0
; SSE2-NEXT: movdqa %xmm0, %xmm1
-; SSE2-NEXT: psrld $16, %xmm1
+; SSE2-NEXT: psrlw $8, %xmm1
; SSE2-NEXT: por %xmm0, %xmm1
; SSE2-NEXT: movd %xmm1, %eax
; SSE2-NEXT: # kill: def $al killed $al killed $eax
;
; SSE41-LABEL: test_v8i8:
; SSE41: # %bb.0:
-; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; SSE41-NEXT: por %xmm0, %xmm1
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; SSE41-NEXT: movdqa %xmm1, %xmm0
+; SSE41-NEXT: psrld $16, %xmm0
; SSE41-NEXT: por %xmm1, %xmm0
; SSE41-NEXT: movdqa %xmm0, %xmm1
-; SSE41-NEXT: psrld $16, %xmm1
+; SSE41-NEXT: psrlw $8, %xmm1
; SSE41-NEXT: por %xmm0, %xmm1
; SSE41-NEXT: pextrb $0, %xmm1, %eax
; SSE41-NEXT: # kill: def $al killed $al killed $eax
;
; AVX-LABEL: test_v8i8:
; AVX: # %bb.0:
-; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpsrld $16, %xmm0, %xmm1
; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vpsrlw $8, %xmm0, %xmm1
+; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpextrb $0, %xmm0, %eax
; AVX-NEXT: # kill: def $al killed $al killed $eax
; AVX-NEXT: retq
define i32 @test_v2i32(<2 x i32> %a0) {
; SSE2-LABEL: test_v2i32:
; SSE2: # %bb.0:
-; SSE2-NEXT: pxor %xmm1, %xmm1
-; SSE2-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; SSE2-NEXT: psllq $32, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,3,2,3]
-; SSE2-NEXT: psrad $31, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
-; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,3,2,3]
-; SSE2-NEXT: psrad $31, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3]
-; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [2147483648,2147483648]
-; SSE2-NEXT: movdqa %xmm0, %xmm3
-; SSE2-NEXT: pxor %xmm1, %xmm3
-; SSE2-NEXT: pxor %xmm2, %xmm1
-; SSE2-NEXT: movdqa %xmm1, %xmm4
-; SSE2-NEXT: pcmpgtd %xmm3, %xmm4
-; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
-; SSE2-NEXT: pcmpeqd %xmm3, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSE2-NEXT: pand %xmm5, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3]
-; SSE2-NEXT: por %xmm1, %xmm3
-; SSE2-NEXT: pand %xmm3, %xmm2
-; SSE2-NEXT: pandn %xmm0, %xmm3
-; SSE2-NEXT: por %xmm2, %xmm3
-; SSE2-NEXT: movd %xmm3, %eax
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; SSE2-NEXT: movdqa %xmm0, %xmm2
+; SSE2-NEXT: pcmpgtd %xmm1, %xmm2
+; SSE2-NEXT: pand %xmm2, %xmm0
+; SSE2-NEXT: pandn %xmm1, %xmm2
+; SSE2-NEXT: por %xmm0, %xmm2
+; SSE2-NEXT: movd %xmm2, %eax
; SSE2-NEXT: retq
;
; SSE41-LABEL: test_v2i32:
; SSE41: # %bb.0:
-; SSE41-NEXT: movdqa %xmm0, %xmm1
-; SSE41-NEXT: psllq $32, %xmm1
-; SSE41-NEXT: psrad $31, %xmm1
-; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
-; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,2,3,3]
-; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[0,2,2,3]
-; SSE41-NEXT: psrad $31, %xmm3
-; SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm2[0,1],xmm3[2,3],xmm2[4,5],xmm3[6,7]
-; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648]
-; SSE41-NEXT: movdqa %xmm3, %xmm0
-; SSE41-NEXT: pxor %xmm2, %xmm0
-; SSE41-NEXT: pxor %xmm1, %xmm2
-; SSE41-NEXT: movdqa %xmm2, %xmm4
-; SSE41-NEXT: pcmpeqd %xmm0, %xmm4
-; SSE41-NEXT: pcmpgtd %xmm0, %xmm2
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,0,2,2]
-; SSE41-NEXT: pand %xmm4, %xmm0
-; SSE41-NEXT: por %xmm2, %xmm0
-; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm3
-; SSE41-NEXT: movd %xmm3, %eax
+; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; SSE41-NEXT: pmaxsd %xmm0, %xmm1
+; SSE41-NEXT: movd %xmm1, %eax
; SSE41-NEXT: retq
;
-; AVX1-LABEL: test_v2i32:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vpsllq $32, %xmm0, %xmm1
-; AVX1-NEXT: vpsrad $31, %xmm1, %xmm1
-; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[2,2,3,3]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; AVX1-NEXT: vpsrad $31, %xmm0, %xmm0
-; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3],xmm2[4,5],xmm0[6,7]
-; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2
-; AVX1-NEXT: vblendvpd %xmm2, %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vmovd %xmm0, %eax
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: test_v2i32:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpsllq $32, %xmm0, %xmm1
-; AVX2-NEXT: vpsrad $31, %xmm1, %xmm1
-; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
-; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[2,2,3,3]
-; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; AVX2-NEXT: vpsrad $31, %xmm0, %xmm0
-; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2],xmm0[3]
-; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2
-; AVX2-NEXT: vblendvpd %xmm2, %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vmovd %xmm0, %eax
-; AVX2-NEXT: retq
-;
-; AVX512BW-LABEL: test_v2i32:
-; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vpsllq $32, %xmm0, %xmm1
-; AVX512BW-NEXT: vpsraq $32, %zmm1, %zmm1
-; AVX512BW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; AVX512BW-NEXT: vpsraq $32, %zmm0, %zmm0
-; AVX512BW-NEXT: vpmaxsq %zmm0, %zmm1, %zmm0
-; AVX512BW-NEXT: vmovd %xmm0, %eax
-; AVX512BW-NEXT: vzeroupper
-; AVX512BW-NEXT: retq
+; AVX-LABEL: test_v2i32:
+; AVX: # %bb.0:
+; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vmovd %xmm0, %eax
+; AVX-NEXT: retq
;
-; AVX512VL-LABEL: test_v2i32:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpsllq $32, %xmm0, %xmm1
-; AVX512VL-NEXT: vpsraq $32, %xmm1, %xmm1
-; AVX512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
-; AVX512VL-NEXT: vpsllq $32, %xmm0, %xmm0
-; AVX512VL-NEXT: vpsraq $32, %xmm0, %xmm0
-; AVX512VL-NEXT: vpmaxsq %xmm0, %xmm1, %xmm0
-; AVX512VL-NEXT: vmovd %xmm0, %eax
-; AVX512VL-NEXT: retq
+; AVX512-LABEL: test_v2i32:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX512-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
+; AVX512-NEXT: vmovd %xmm0, %eax
+; AVX512-NEXT: retq
%1 = call i32 @llvm.experimental.vector.reduce.smax.v2i32(<2 x i32> %a0)
ret i32 %1
}
;
define i16 @test_v2i16(<2 x i16> %a0) {
-; SSE2-LABEL: test_v2i16:
-; SSE2: # %bb.0:
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; SSE2-NEXT: psllq $48, %xmm0
-; SSE2-NEXT: movdqa %xmm0, %xmm2
-; SSE2-NEXT: psrad $31, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3]
-; SSE2-NEXT: psrad $16, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
-; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
-; SSE2-NEXT: psllq $48, %xmm1
-; SSE2-NEXT: movdqa %xmm1, %xmm2
-; SSE2-NEXT: psrad $31, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3]
-; SSE2-NEXT: psrad $16, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3]
-; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
-; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648]
-; SSE2-NEXT: movdqa %xmm0, %xmm3
-; SSE2-NEXT: pxor %xmm2, %xmm3
-; SSE2-NEXT: pxor %xmm1, %xmm2
-; SSE2-NEXT: movdqa %xmm3, %xmm4
-; SSE2-NEXT: pcmpgtd %xmm2, %xmm4
-; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
-; SSE2-NEXT: pcmpeqd %xmm3, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; SSE2-NEXT: pand %xmm5, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3]
-; SSE2-NEXT: por %xmm2, %xmm3
-; SSE2-NEXT: pand %xmm3, %xmm0
-; SSE2-NEXT: pandn %xmm1, %xmm3
-; SSE2-NEXT: por %xmm0, %xmm3
-; SSE2-NEXT: movd %xmm3, %eax
-; SSE2-NEXT: # kill: def $ax killed $ax killed $eax
-; SSE2-NEXT: retq
-;
-; SSE41-LABEL: test_v2i16:
-; SSE41: # %bb.0:
-; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; SSE41-NEXT: psllq $48, %xmm0
-; SSE41-NEXT: movdqa %xmm0, %xmm2
-; SSE41-NEXT: psrad $31, %xmm2
-; SSE41-NEXT: psrad $16, %xmm0
-; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
-; SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1],xmm2[2,3],xmm3[4,5],xmm2[6,7]
-; SSE41-NEXT: psllq $48, %xmm1
-; SSE41-NEXT: movdqa %xmm1, %xmm0
-; SSE41-NEXT: psrad $31, %xmm0
-; SSE41-NEXT: psrad $16, %xmm1
-; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3],xmm1[4,5],xmm0[6,7]
-; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [2147483648,2147483648]
-; SSE41-NEXT: movdqa %xmm3, %xmm2
-; SSE41-NEXT: pxor %xmm0, %xmm2
-; SSE41-NEXT: pxor %xmm1, %xmm0
-; SSE41-NEXT: movdqa %xmm2, %xmm4
-; SSE41-NEXT: pcmpeqd %xmm0, %xmm4
-; SSE41-NEXT: pcmpgtd %xmm0, %xmm2
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,0,2,2]
-; SSE41-NEXT: pand %xmm4, %xmm0
-; SSE41-NEXT: por %xmm2, %xmm0
-; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm1
-; SSE41-NEXT: movd %xmm1, %eax
-; SSE41-NEXT: # kill: def $ax killed $ax killed $eax
-; SSE41-NEXT: retq
-;
-; AVX1-LABEL: test_v2i16:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vpsllq $48, %xmm0, %xmm1
-; AVX1-NEXT: vpsrad $31, %xmm1, %xmm2
-; AVX1-NEXT: vpsrad $16, %xmm1, %xmm1
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
-; AVX1-NEXT: vpsllq $48, %xmm0, %xmm0
-; AVX1-NEXT: vpsrad $31, %xmm0, %xmm2
-; AVX1-NEXT: vpsrad $16, %xmm0, %xmm0
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
-; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2
-; AVX1-NEXT: vblendvpd %xmm2, %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vmovd %xmm0, %eax
-; AVX1-NEXT: # kill: def $ax killed $ax killed $eax
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: test_v2i16:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpsllq $48, %xmm0, %xmm1
-; AVX2-NEXT: vpsrad $31, %xmm1, %xmm2
-; AVX2-NEXT: vpsrad $16, %xmm1, %xmm1
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
-; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
-; AVX2-NEXT: vpsllq $48, %xmm0, %xmm0
-; AVX2-NEXT: vpsrad $31, %xmm0, %xmm2
-; AVX2-NEXT: vpsrad $16, %xmm0, %xmm0
-; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
-; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2
-; AVX2-NEXT: vblendvpd %xmm2, %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vmovd %xmm0, %eax
-; AVX2-NEXT: # kill: def $ax killed $ax killed $eax
-; AVX2-NEXT: retq
-;
-; AVX512BW-LABEL: test_v2i16:
-; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vpsllq $48, %xmm0, %xmm1
-; AVX512BW-NEXT: vpsraq $48, %zmm1, %zmm1
-; AVX512BW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
-; AVX512BW-NEXT: vpsllq $48, %xmm0, %xmm0
-; AVX512BW-NEXT: vpsraq $48, %zmm0, %zmm0
-; AVX512BW-NEXT: vpmaxsq %zmm0, %zmm1, %zmm0
-; AVX512BW-NEXT: vmovd %xmm0, %eax
-; AVX512BW-NEXT: # kill: def $ax killed $ax killed $eax
-; AVX512BW-NEXT: vzeroupper
-; AVX512BW-NEXT: retq
+; SSE-LABEL: test_v2i16:
+; SSE: # %bb.0:
+; SSE-NEXT: movdqa %xmm0, %xmm1
+; SSE-NEXT: psrld $16, %xmm1
+; SSE-NEXT: pmaxsw %xmm0, %xmm1
+; SSE-NEXT: movd %xmm1, %eax
+; SSE-NEXT: # kill: def $ax killed $ax killed $eax
+; SSE-NEXT: retq
+;
+; AVX-LABEL: test_v2i16:
+; AVX: # %bb.0:
+; AVX-NEXT: vpsrld $16, %xmm0, %xmm1
+; AVX-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vmovd %xmm0, %eax
+; AVX-NEXT: # kill: def $ax killed $ax killed $eax
+; AVX-NEXT: retq
;
-; AVX512VL-LABEL: test_v2i16:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpsllq $48, %xmm0, %xmm1
-; AVX512VL-NEXT: vpsraq $48, %xmm1, %xmm1
-; AVX512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
-; AVX512VL-NEXT: vpsllq $48, %xmm0, %xmm0
-; AVX512VL-NEXT: vpsraq $48, %xmm0, %xmm0
-; AVX512VL-NEXT: vpmaxsq %xmm0, %xmm1, %xmm0
-; AVX512VL-NEXT: vmovd %xmm0, %eax
-; AVX512VL-NEXT: # kill: def $ax killed $ax killed $eax
-; AVX512VL-NEXT: retq
+; AVX512-LABEL: test_v2i16:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1
+; AVX512-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0
+; AVX512-NEXT: vmovd %xmm0, %eax
+; AVX512-NEXT: # kill: def $ax killed $ax killed $eax
+; AVX512-NEXT: retq
%1 = call i16 @llvm.experimental.vector.reduce.smax.v2i16(<2 x i16> %a0)
ret i16 %1
}
define i16 @test_v4i16(<4 x i16> %a0) {
-; SSE2-LABEL: test_v4i16:
-; SSE2: # %bb.0:
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; SSE2-NEXT: pslld $16, %xmm0
-; SSE2-NEXT: psrad $16, %xmm0
-; SSE2-NEXT: pslld $16, %xmm1
-; SSE2-NEXT: psrad $16, %xmm1
-; SSE2-NEXT: movdqa %xmm0, %xmm2
-; SSE2-NEXT: pcmpgtd %xmm1, %xmm2
-; SSE2-NEXT: pand %xmm2, %xmm0
-; SSE2-NEXT: pandn %xmm1, %xmm2
-; SSE2-NEXT: por %xmm0, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,2,3]
-; SSE2-NEXT: movdqa %xmm2, %xmm1
-; SSE2-NEXT: pcmpgtd %xmm0, %xmm1
-; SSE2-NEXT: pand %xmm1, %xmm2
-; SSE2-NEXT: pandn %xmm0, %xmm1
-; SSE2-NEXT: por %xmm2, %xmm1
-; SSE2-NEXT: movd %xmm1, %eax
-; SSE2-NEXT: # kill: def $ax killed $ax killed $eax
-; SSE2-NEXT: retq
-;
-; SSE41-LABEL: test_v4i16:
-; SSE41: # %bb.0:
-; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; SSE41-NEXT: pslld $16, %xmm0
-; SSE41-NEXT: psrad $16, %xmm0
-; SSE41-NEXT: pslld $16, %xmm1
-; SSE41-NEXT: psrad $16, %xmm1
-; SSE41-NEXT: pmaxsd %xmm0, %xmm1
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
-; SSE41-NEXT: pmaxsd %xmm1, %xmm0
-; SSE41-NEXT: movd %xmm0, %eax
-; SSE41-NEXT: # kill: def $ax killed $ax killed $eax
-; SSE41-NEXT: retq
+; SSE-LABEL: test_v4i16:
+; SSE: # %bb.0:
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; SSE-NEXT: pmaxsw %xmm0, %xmm1
+; SSE-NEXT: movdqa %xmm1, %xmm0
+; SSE-NEXT: psrld $16, %xmm0
+; SSE-NEXT: pmaxsw %xmm1, %xmm0
+; SSE-NEXT: movd %xmm0, %eax
+; SSE-NEXT: # kill: def $ax killed $ax killed $eax
+; SSE-NEXT: retq
;
; AVX-LABEL: test_v4i16:
; AVX: # %bb.0:
-; AVX-NEXT: vpslld $16, %xmm0, %xmm1
-; AVX-NEXT: vpsrad $16, %xmm1, %xmm1
-; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
-; AVX-NEXT: vpslld $16, %xmm0, %xmm0
-; AVX-NEXT: vpsrad $16, %xmm0, %xmm0
-; AVX-NEXT: vpmaxsd %xmm0, %xmm1, %xmm0
; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vpsrld $16, %xmm0, %xmm1
+; AVX-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0
; AVX-NEXT: vmovd %xmm0, %eax
; AVX-NEXT: # kill: def $ax killed $ax killed $eax
; AVX-NEXT: retq
;
; AVX512-LABEL: test_v4i16:
; AVX512: # %bb.0:
-; AVX512-NEXT: vpslld $16, %xmm0, %xmm1
-; AVX512-NEXT: vpsrad $16, %xmm1, %xmm1
-; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
-; AVX512-NEXT: vpslld $16, %xmm0, %xmm0
-; AVX512-NEXT: vpsrad $16, %xmm0, %xmm0
-; AVX512-NEXT: vpmaxsd %xmm0, %xmm1, %xmm0
; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX512-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
+; AVX512-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0
+; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1
+; AVX512-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vmovd %xmm0, %eax
; AVX512-NEXT: # kill: def $ax killed $ax killed $eax
; AVX512-NEXT: retq
define i8 @test_v2i8(<2 x i8> %a0) {
; SSE2-LABEL: test_v2i8:
; SSE2: # %bb.0:
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; SSE2-NEXT: psllq $56, %xmm0
+; SSE2-NEXT: movdqa %xmm0, %xmm1
+; SSE2-NEXT: psrlw $8, %xmm1
; SSE2-NEXT: movdqa %xmm0, %xmm2
-; SSE2-NEXT: psrad $31, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3]
-; SSE2-NEXT: psrad $24, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
-; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
-; SSE2-NEXT: psllq $56, %xmm1
-; SSE2-NEXT: movdqa %xmm1, %xmm2
-; SSE2-NEXT: psrad $31, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3]
-; SSE2-NEXT: psrad $24, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3]
-; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
-; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648]
-; SSE2-NEXT: movdqa %xmm0, %xmm3
-; SSE2-NEXT: pxor %xmm2, %xmm3
-; SSE2-NEXT: pxor %xmm1, %xmm2
-; SSE2-NEXT: movdqa %xmm3, %xmm4
-; SSE2-NEXT: pcmpgtd %xmm2, %xmm4
-; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
-; SSE2-NEXT: pcmpeqd %xmm3, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; SSE2-NEXT: pand %xmm5, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3]
-; SSE2-NEXT: por %xmm2, %xmm3
-; SSE2-NEXT: pand %xmm3, %xmm0
-; SSE2-NEXT: pandn %xmm1, %xmm3
-; SSE2-NEXT: por %xmm0, %xmm3
-; SSE2-NEXT: movd %xmm3, %eax
+; SSE2-NEXT: pcmpgtb %xmm1, %xmm2
+; SSE2-NEXT: pand %xmm2, %xmm0
+; SSE2-NEXT: pandn %xmm1, %xmm2
+; SSE2-NEXT: por %xmm0, %xmm2
+; SSE2-NEXT: movd %xmm2, %eax
; SSE2-NEXT: # kill: def $al killed $al killed $eax
; SSE2-NEXT: retq
;
; SSE41-LABEL: test_v2i8:
; SSE41: # %bb.0:
-; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; SSE41-NEXT: psllq $56, %xmm0
-; SSE41-NEXT: movdqa %xmm0, %xmm2
-; SSE41-NEXT: psrad $31, %xmm2
-; SSE41-NEXT: psrad $24, %xmm0
-; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
-; SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1],xmm2[2,3],xmm3[4,5],xmm2[6,7]
-; SSE41-NEXT: psllq $56, %xmm1
-; SSE41-NEXT: movdqa %xmm1, %xmm0
-; SSE41-NEXT: psrad $31, %xmm0
-; SSE41-NEXT: psrad $24, %xmm1
-; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3],xmm1[4,5],xmm0[6,7]
-; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [2147483648,2147483648]
-; SSE41-NEXT: movdqa %xmm3, %xmm2
-; SSE41-NEXT: pxor %xmm0, %xmm2
-; SSE41-NEXT: pxor %xmm1, %xmm0
-; SSE41-NEXT: movdqa %xmm2, %xmm4
-; SSE41-NEXT: pcmpeqd %xmm0, %xmm4
-; SSE41-NEXT: pcmpgtd %xmm0, %xmm2
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,0,2,2]
-; SSE41-NEXT: pand %xmm4, %xmm0
-; SSE41-NEXT: por %xmm2, %xmm0
-; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm1
+; SSE41-NEXT: movdqa %xmm0, %xmm1
+; SSE41-NEXT: psrlw $8, %xmm1
+; SSE41-NEXT: pmaxsb %xmm0, %xmm1
; SSE41-NEXT: pextrb $0, %xmm1, %eax
; SSE41-NEXT: # kill: def $al killed $al killed $eax
; SSE41-NEXT: retq
;
-; AVX1-LABEL: test_v2i8:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vpsllq $56, %xmm0, %xmm1
-; AVX1-NEXT: vpsrad $31, %xmm1, %xmm2
-; AVX1-NEXT: vpsrad $24, %xmm1, %xmm1
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
-; AVX1-NEXT: vpsllq $56, %xmm0, %xmm0
-; AVX1-NEXT: vpsrad $31, %xmm0, %xmm2
-; AVX1-NEXT: vpsrad $24, %xmm0, %xmm0
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
-; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2
-; AVX1-NEXT: vblendvpd %xmm2, %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpextrb $0, %xmm0, %eax
-; AVX1-NEXT: # kill: def $al killed $al killed $eax
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: test_v2i8:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpsllq $56, %xmm0, %xmm1
-; AVX2-NEXT: vpsrad $31, %xmm1, %xmm2
-; AVX2-NEXT: vpsrad $24, %xmm1, %xmm1
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
-; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
-; AVX2-NEXT: vpsllq $56, %xmm0, %xmm0
-; AVX2-NEXT: vpsrad $31, %xmm0, %xmm2
-; AVX2-NEXT: vpsrad $24, %xmm0, %xmm0
-; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
-; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2
-; AVX2-NEXT: vblendvpd %xmm2, %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpextrb $0, %xmm0, %eax
-; AVX2-NEXT: # kill: def $al killed $al killed $eax
-; AVX2-NEXT: retq
-;
-; AVX512BW-LABEL: test_v2i8:
-; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vpsllq $56, %xmm0, %xmm1
-; AVX512BW-NEXT: vpsraq $56, %zmm1, %zmm1
-; AVX512BW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
-; AVX512BW-NEXT: vpsllq $56, %xmm0, %xmm0
-; AVX512BW-NEXT: vpsraq $56, %zmm0, %zmm0
-; AVX512BW-NEXT: vpmaxsq %zmm0, %zmm1, %zmm0
-; AVX512BW-NEXT: vpextrb $0, %xmm0, %eax
-; AVX512BW-NEXT: # kill: def $al killed $al killed $eax
-; AVX512BW-NEXT: vzeroupper
-; AVX512BW-NEXT: retq
+; AVX-LABEL: test_v2i8:
+; AVX: # %bb.0:
+; AVX-NEXT: vpsrlw $8, %xmm0, %xmm1
+; AVX-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vpextrb $0, %xmm0, %eax
+; AVX-NEXT: # kill: def $al killed $al killed $eax
+; AVX-NEXT: retq
;
-; AVX512VL-LABEL: test_v2i8:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpsllq $56, %xmm0, %xmm1
-; AVX512VL-NEXT: vpsraq $56, %xmm1, %xmm1
-; AVX512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
-; AVX512VL-NEXT: vpsllq $56, %xmm0, %xmm0
-; AVX512VL-NEXT: vpsraq $56, %xmm0, %xmm0
-; AVX512VL-NEXT: vpmaxsq %xmm0, %xmm1, %xmm0
-; AVX512VL-NEXT: vpextrb $0, %xmm0, %eax
-; AVX512VL-NEXT: # kill: def $al killed $al killed $eax
-; AVX512VL-NEXT: retq
+; AVX512-LABEL: test_v2i8:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1
+; AVX512-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0
+; AVX512-NEXT: vpextrb $0, %xmm0, %eax
+; AVX512-NEXT: # kill: def $al killed $al killed $eax
+; AVX512-NEXT: retq
%1 = call i8 @llvm.experimental.vector.reduce.smax.v2i8(<2 x i8> %a0)
ret i8 %1
}
define i8 @test_v4i8(<4 x i8> %a0) {
; SSE2-LABEL: test_v4i8:
; SSE2: # %bb.0:
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; SSE2-NEXT: pslld $24, %xmm0
-; SSE2-NEXT: psrad $24, %xmm0
-; SSE2-NEXT: pslld $24, %xmm1
-; SSE2-NEXT: psrad $24, %xmm1
+; SSE2-NEXT: movdqa %xmm0, %xmm1
+; SSE2-NEXT: psrld $16, %xmm1
; SSE2-NEXT: movdqa %xmm0, %xmm2
-; SSE2-NEXT: pcmpgtd %xmm1, %xmm2
+; SSE2-NEXT: pcmpgtb %xmm1, %xmm2
; SSE2-NEXT: pand %xmm2, %xmm0
; SSE2-NEXT: pandn %xmm1, %xmm2
; SSE2-NEXT: por %xmm0, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,2,3]
+; SSE2-NEXT: movdqa %xmm2, %xmm0
+; SSE2-NEXT: psrlw $8, %xmm0
; SSE2-NEXT: movdqa %xmm2, %xmm1
-; SSE2-NEXT: pcmpgtd %xmm0, %xmm1
+; SSE2-NEXT: pcmpgtb %xmm0, %xmm1
; SSE2-NEXT: pand %xmm1, %xmm2
; SSE2-NEXT: pandn %xmm0, %xmm1
; SSE2-NEXT: por %xmm2, %xmm1
;
; SSE41-LABEL: test_v4i8:
; SSE41: # %bb.0:
-; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; SSE41-NEXT: pslld $24, %xmm0
-; SSE41-NEXT: psrad $24, %xmm0
-; SSE41-NEXT: pslld $24, %xmm1
-; SSE41-NEXT: psrad $24, %xmm1
-; SSE41-NEXT: pmaxsd %xmm0, %xmm1
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
-; SSE41-NEXT: pmaxsd %xmm1, %xmm0
+; SSE41-NEXT: movdqa %xmm0, %xmm1
+; SSE41-NEXT: psrld $16, %xmm1
+; SSE41-NEXT: pmaxsb %xmm0, %xmm1
+; SSE41-NEXT: movdqa %xmm1, %xmm0
+; SSE41-NEXT: psrlw $8, %xmm0
+; SSE41-NEXT: pmaxsb %xmm1, %xmm0
; SSE41-NEXT: pextrb $0, %xmm0, %eax
; SSE41-NEXT: # kill: def $al killed $al killed $eax
; SSE41-NEXT: retq
;
; AVX-LABEL: test_v4i8:
; AVX: # %bb.0:
-; AVX-NEXT: vpslld $24, %xmm0, %xmm1
-; AVX-NEXT: vpsrad $24, %xmm1, %xmm1
-; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
-; AVX-NEXT: vpslld $24, %xmm0, %xmm0
-; AVX-NEXT: vpsrad $24, %xmm0, %xmm0
-; AVX-NEXT: vpmaxsd %xmm0, %xmm1, %xmm0
-; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vpsrld $16, %xmm0, %xmm1
+; AVX-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vpsrlw $8, %xmm0, %xmm1
+; AVX-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpextrb $0, %xmm0, %eax
; AVX-NEXT: # kill: def $al killed $al killed $eax
; AVX-NEXT: retq
;
; AVX512-LABEL: test_v4i8:
; AVX512: # %bb.0:
-; AVX512-NEXT: vpslld $24, %xmm0, %xmm1
-; AVX512-NEXT: vpsrad $24, %xmm1, %xmm1
-; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
-; AVX512-NEXT: vpslld $24, %xmm0, %xmm0
-; AVX512-NEXT: vpsrad $24, %xmm0, %xmm0
-; AVX512-NEXT: vpmaxsd %xmm0, %xmm1, %xmm0
-; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX512-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
+; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1
+; AVX512-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0
+; AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1
+; AVX512-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vpextrb $0, %xmm0, %eax
; AVX512-NEXT: # kill: def $al killed $al killed $eax
; AVX512-NEXT: retq
define i8 @test_v8i8(<8 x i8> %a0) {
; SSE2-LABEL: test_v8i8:
; SSE2: # %bb.0:
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; SSE2-NEXT: psllw $8, %xmm0
-; SSE2-NEXT: psraw $8, %xmm0
-; SSE2-NEXT: psllw $8, %xmm1
-; SSE2-NEXT: psraw $8, %xmm1
-; SSE2-NEXT: pmaxsw %xmm0, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
-; SSE2-NEXT: psllw $8, %xmm0
-; SSE2-NEXT: psraw $8, %xmm0
-; SSE2-NEXT: pmaxsw %xmm1, %xmm0
-; SSE2-NEXT: movdqa %xmm0, %xmm1
-; SSE2-NEXT: psrld $16, %xmm1
-; SSE2-NEXT: psllw $8, %xmm1
-; SSE2-NEXT: psraw $8, %xmm1
-; SSE2-NEXT: pmaxsw %xmm0, %xmm1
-; SSE2-NEXT: movd %xmm1, %eax
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; SSE2-NEXT: movdqa %xmm0, %xmm2
+; SSE2-NEXT: pcmpgtb %xmm1, %xmm2
+; SSE2-NEXT: pand %xmm2, %xmm0
+; SSE2-NEXT: pandn %xmm1, %xmm2
+; SSE2-NEXT: por %xmm0, %xmm2
+; SSE2-NEXT: movdqa %xmm2, %xmm0
+; SSE2-NEXT: psrld $16, %xmm0
+; SSE2-NEXT: movdqa %xmm2, %xmm1
+; SSE2-NEXT: pcmpgtb %xmm0, %xmm1
+; SSE2-NEXT: pand %xmm1, %xmm2
+; SSE2-NEXT: pandn %xmm0, %xmm1
+; SSE2-NEXT: por %xmm2, %xmm1
+; SSE2-NEXT: movdqa %xmm1, %xmm0
+; SSE2-NEXT: psrlw $8, %xmm0
+; SSE2-NEXT: movdqa %xmm1, %xmm2
+; SSE2-NEXT: pcmpgtb %xmm0, %xmm2
+; SSE2-NEXT: pand %xmm2, %xmm1
+; SSE2-NEXT: pandn %xmm0, %xmm2
+; SSE2-NEXT: por %xmm1, %xmm2
+; SSE2-NEXT: movd %xmm2, %eax
; SSE2-NEXT: # kill: def $al killed $al killed $eax
; SSE2-NEXT: retq
;
; SSE41-LABEL: test_v8i8:
; SSE41: # %bb.0:
-; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; SSE41-NEXT: psllw $8, %xmm0
-; SSE41-NEXT: psraw $8, %xmm0
-; SSE41-NEXT: psllw $8, %xmm1
-; SSE41-NEXT: psraw $8, %xmm1
-; SSE41-NEXT: pmaxsw %xmm0, %xmm1
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
-; SSE41-NEXT: psllw $8, %xmm0
-; SSE41-NEXT: psraw $8, %xmm0
-; SSE41-NEXT: pmaxsw %xmm1, %xmm0
+; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; SSE41-NEXT: pmaxsb %xmm0, %xmm1
+; SSE41-NEXT: movdqa %xmm1, %xmm0
+; SSE41-NEXT: psrld $16, %xmm0
+; SSE41-NEXT: pmaxsb %xmm1, %xmm0
; SSE41-NEXT: movdqa %xmm0, %xmm1
-; SSE41-NEXT: psrld $16, %xmm1
-; SSE41-NEXT: psllw $8, %xmm1
-; SSE41-NEXT: psraw $8, %xmm1
-; SSE41-NEXT: pmaxsw %xmm0, %xmm1
+; SSE41-NEXT: psrlw $8, %xmm1
+; SSE41-NEXT: pmaxsb %xmm0, %xmm1
; SSE41-NEXT: pextrb $0, %xmm1, %eax
; SSE41-NEXT: # kill: def $al killed $al killed $eax
; SSE41-NEXT: retq
;
; AVX-LABEL: test_v8i8:
; AVX: # %bb.0:
-; AVX-NEXT: vpsllw $8, %xmm0, %xmm1
-; AVX-NEXT: vpsraw $8, %xmm1, %xmm1
-; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
-; AVX-NEXT: vpsllw $8, %xmm0, %xmm0
-; AVX-NEXT: vpsraw $8, %xmm0, %xmm0
-; AVX-NEXT: vpmaxsw %xmm0, %xmm1, %xmm0
; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX-NEXT: vpsllw $8, %xmm1, %xmm1
-; AVX-NEXT: vpsraw $8, %xmm1, %xmm1
-; AVX-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpsrld $16, %xmm0, %xmm1
-; AVX-NEXT: vpsllw $8, %xmm1, %xmm1
-; AVX-NEXT: vpsraw $8, %xmm1, %xmm1
-; AVX-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vpsrlw $8, %xmm0, %xmm1
+; AVX-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpextrb $0, %xmm0, %eax
; AVX-NEXT: # kill: def $al killed $al killed $eax
; AVX-NEXT: retq
;
; AVX512-LABEL: test_v8i8:
; AVX512: # %bb.0:
-; AVX512-NEXT: vpsllw $8, %xmm0, %xmm1
-; AVX512-NEXT: vpsraw $8, %xmm1, %xmm1
-; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
-; AVX512-NEXT: vpsllw $8, %xmm0, %xmm0
-; AVX512-NEXT: vpsraw $8, %xmm0, %xmm0
-; AVX512-NEXT: vpmaxsw %xmm0, %xmm1, %xmm0
; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX512-NEXT: vpsllw $8, %xmm1, %xmm1
-; AVX512-NEXT: vpsraw $8, %xmm1, %xmm1
-; AVX512-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0
+; AVX512-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1
-; AVX512-NEXT: vpsllw $8, %xmm1, %xmm1
-; AVX512-NEXT: vpsraw $8, %xmm1, %xmm1
-; AVX512-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0
+; AVX512-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0
+; AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1
+; AVX512-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vpextrb $0, %xmm0, %eax
; AVX512-NEXT: # kill: def $al killed $al killed $eax
; AVX512-NEXT: retq
define i32 @test_v2i32(<2 x i32> %a0) {
; SSE2-LABEL: test_v2i32:
; SSE2: # %bb.0:
-; SSE2-NEXT: pxor %xmm1, %xmm1
-; SSE2-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; SSE2-NEXT: psllq $32, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,3,2,3]
-; SSE2-NEXT: psrad $31, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
-; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,3,2,3]
-; SSE2-NEXT: psrad $31, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3]
-; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [2147483648,2147483648]
-; SSE2-NEXT: movdqa %xmm0, %xmm3
-; SSE2-NEXT: pxor %xmm1, %xmm3
-; SSE2-NEXT: pxor %xmm2, %xmm1
-; SSE2-NEXT: movdqa %xmm3, %xmm4
-; SSE2-NEXT: pcmpgtd %xmm1, %xmm4
-; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
-; SSE2-NEXT: pcmpeqd %xmm3, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSE2-NEXT: pand %xmm5, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3]
-; SSE2-NEXT: por %xmm1, %xmm3
-; SSE2-NEXT: pand %xmm3, %xmm2
-; SSE2-NEXT: pandn %xmm0, %xmm3
-; SSE2-NEXT: por %xmm2, %xmm3
-; SSE2-NEXT: movd %xmm3, %eax
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; SSE2-NEXT: movdqa %xmm1, %xmm2
+; SSE2-NEXT: pcmpgtd %xmm0, %xmm2
+; SSE2-NEXT: pand %xmm2, %xmm0
+; SSE2-NEXT: pandn %xmm1, %xmm2
+; SSE2-NEXT: por %xmm0, %xmm2
+; SSE2-NEXT: movd %xmm2, %eax
; SSE2-NEXT: retq
;
; SSE41-LABEL: test_v2i32:
; SSE41: # %bb.0:
-; SSE41-NEXT: movdqa %xmm0, %xmm1
-; SSE41-NEXT: psllq $32, %xmm1
-; SSE41-NEXT: psrad $31, %xmm1
-; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
-; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,2,3,3]
-; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[0,2,2,3]
-; SSE41-NEXT: psrad $31, %xmm3
-; SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm2[0,1],xmm3[2,3],xmm2[4,5],xmm3[6,7]
-; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648]
-; SSE41-NEXT: movdqa %xmm1, %xmm0
-; SSE41-NEXT: pxor %xmm2, %xmm0
-; SSE41-NEXT: pxor %xmm3, %xmm2
-; SSE41-NEXT: movdqa %xmm2, %xmm4
-; SSE41-NEXT: pcmpeqd %xmm0, %xmm4
-; SSE41-NEXT: pcmpgtd %xmm0, %xmm2
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,0,2,2]
-; SSE41-NEXT: pand %xmm4, %xmm0
-; SSE41-NEXT: por %xmm2, %xmm0
-; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm3
-; SSE41-NEXT: movd %xmm3, %eax
+; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; SSE41-NEXT: pminsd %xmm0, %xmm1
+; SSE41-NEXT: movd %xmm1, %eax
; SSE41-NEXT: retq
;
-; AVX1-LABEL: test_v2i32:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vpsllq $32, %xmm0, %xmm1
-; AVX1-NEXT: vpsrad $31, %xmm1, %xmm1
-; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[2,2,3,3]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; AVX1-NEXT: vpsrad $31, %xmm0, %xmm0
-; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3],xmm2[4,5],xmm0[6,7]
-; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2
-; AVX1-NEXT: vblendvpd %xmm2, %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vmovd %xmm0, %eax
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: test_v2i32:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpsllq $32, %xmm0, %xmm1
-; AVX2-NEXT: vpsrad $31, %xmm1, %xmm1
-; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
-; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[2,2,3,3]
-; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; AVX2-NEXT: vpsrad $31, %xmm0, %xmm0
-; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2],xmm0[3]
-; AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2
-; AVX2-NEXT: vblendvpd %xmm2, %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vmovd %xmm0, %eax
-; AVX2-NEXT: retq
-;
-; AVX512BW-LABEL: test_v2i32:
-; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vpsllq $32, %xmm0, %xmm1
-; AVX512BW-NEXT: vpsraq $32, %zmm1, %zmm1
-; AVX512BW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; AVX512BW-NEXT: vpsraq $32, %zmm0, %zmm0
-; AVX512BW-NEXT: vpminsq %zmm0, %zmm1, %zmm0
-; AVX512BW-NEXT: vmovd %xmm0, %eax
-; AVX512BW-NEXT: vzeroupper
-; AVX512BW-NEXT: retq
+; AVX-LABEL: test_v2i32:
+; AVX: # %bb.0:
+; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX-NEXT: vpminsd %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vmovd %xmm0, %eax
+; AVX-NEXT: retq
;
-; AVX512VL-LABEL: test_v2i32:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpsllq $32, %xmm0, %xmm1
-; AVX512VL-NEXT: vpsraq $32, %xmm1, %xmm1
-; AVX512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
-; AVX512VL-NEXT: vpsllq $32, %xmm0, %xmm0
-; AVX512VL-NEXT: vpsraq $32, %xmm0, %xmm0
-; AVX512VL-NEXT: vpminsq %xmm0, %xmm1, %xmm0
-; AVX512VL-NEXT: vmovd %xmm0, %eax
-; AVX512VL-NEXT: retq
+; AVX512-LABEL: test_v2i32:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX512-NEXT: vpminsd %xmm1, %xmm0, %xmm0
+; AVX512-NEXT: vmovd %xmm0, %eax
+; AVX512-NEXT: retq
%1 = call i32 @llvm.experimental.vector.reduce.smin.v2i32(<2 x i32> %a0)
ret i32 %1
}
;
define i16 @test_v2i16(<2 x i16> %a0) {
-; SSE2-LABEL: test_v2i16:
-; SSE2: # %bb.0:
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; SSE2-NEXT: psllq $48, %xmm0
-; SSE2-NEXT: movdqa %xmm0, %xmm2
-; SSE2-NEXT: psrad $31, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3]
-; SSE2-NEXT: psrad $16, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
-; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
-; SSE2-NEXT: psllq $48, %xmm1
-; SSE2-NEXT: movdqa %xmm1, %xmm2
-; SSE2-NEXT: psrad $31, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3]
-; SSE2-NEXT: psrad $16, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3]
-; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
-; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648]
-; SSE2-NEXT: movdqa %xmm0, %xmm3
-; SSE2-NEXT: pxor %xmm2, %xmm3
-; SSE2-NEXT: pxor %xmm1, %xmm2
-; SSE2-NEXT: movdqa %xmm2, %xmm4
-; SSE2-NEXT: pcmpgtd %xmm3, %xmm4
-; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
-; SSE2-NEXT: pcmpeqd %xmm3, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; SSE2-NEXT: pand %xmm5, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3]
-; SSE2-NEXT: por %xmm2, %xmm3
-; SSE2-NEXT: pand %xmm3, %xmm0
-; SSE2-NEXT: pandn %xmm1, %xmm3
-; SSE2-NEXT: por %xmm0, %xmm3
-; SSE2-NEXT: movd %xmm3, %eax
-; SSE2-NEXT: # kill: def $ax killed $ax killed $eax
-; SSE2-NEXT: retq
-;
-; SSE41-LABEL: test_v2i16:
-; SSE41: # %bb.0:
-; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; SSE41-NEXT: psllq $48, %xmm0
-; SSE41-NEXT: movdqa %xmm0, %xmm2
-; SSE41-NEXT: psrad $31, %xmm2
-; SSE41-NEXT: psrad $16, %xmm0
-; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
-; SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1],xmm2[2,3],xmm3[4,5],xmm2[6,7]
-; SSE41-NEXT: psllq $48, %xmm1
-; SSE41-NEXT: movdqa %xmm1, %xmm0
-; SSE41-NEXT: psrad $31, %xmm0
-; SSE41-NEXT: psrad $16, %xmm1
-; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3],xmm1[4,5],xmm0[6,7]
-; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648]
-; SSE41-NEXT: movdqa %xmm3, %xmm0
-; SSE41-NEXT: pxor %xmm2, %xmm0
-; SSE41-NEXT: pxor %xmm1, %xmm2
-; SSE41-NEXT: movdqa %xmm2, %xmm4
-; SSE41-NEXT: pcmpeqd %xmm0, %xmm4
-; SSE41-NEXT: pcmpgtd %xmm0, %xmm2
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,0,2,2]
-; SSE41-NEXT: pand %xmm4, %xmm0
-; SSE41-NEXT: por %xmm2, %xmm0
-; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm1
-; SSE41-NEXT: movd %xmm1, %eax
-; SSE41-NEXT: # kill: def $ax killed $ax killed $eax
-; SSE41-NEXT: retq
-;
-; AVX1-LABEL: test_v2i16:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vpsllq $48, %xmm0, %xmm1
-; AVX1-NEXT: vpsrad $31, %xmm1, %xmm2
-; AVX1-NEXT: vpsrad $16, %xmm1, %xmm1
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
-; AVX1-NEXT: vpsllq $48, %xmm0, %xmm0
-; AVX1-NEXT: vpsrad $31, %xmm0, %xmm2
-; AVX1-NEXT: vpsrad $16, %xmm0, %xmm0
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
-; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2
-; AVX1-NEXT: vblendvpd %xmm2, %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vmovd %xmm0, %eax
-; AVX1-NEXT: # kill: def $ax killed $ax killed $eax
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: test_v2i16:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpsllq $48, %xmm0, %xmm1
-; AVX2-NEXT: vpsrad $31, %xmm1, %xmm2
-; AVX2-NEXT: vpsrad $16, %xmm1, %xmm1
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
-; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
-; AVX2-NEXT: vpsllq $48, %xmm0, %xmm0
-; AVX2-NEXT: vpsrad $31, %xmm0, %xmm2
-; AVX2-NEXT: vpsrad $16, %xmm0, %xmm0
-; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
-; AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2
-; AVX2-NEXT: vblendvpd %xmm2, %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vmovd %xmm0, %eax
-; AVX2-NEXT: # kill: def $ax killed $ax killed $eax
-; AVX2-NEXT: retq
-;
-; AVX512BW-LABEL: test_v2i16:
-; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vpsllq $48, %xmm0, %xmm1
-; AVX512BW-NEXT: vpsraq $48, %zmm1, %zmm1
-; AVX512BW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
-; AVX512BW-NEXT: vpsllq $48, %xmm0, %xmm0
-; AVX512BW-NEXT: vpsraq $48, %zmm0, %zmm0
-; AVX512BW-NEXT: vpminsq %zmm0, %zmm1, %zmm0
-; AVX512BW-NEXT: vmovd %xmm0, %eax
-; AVX512BW-NEXT: # kill: def $ax killed $ax killed $eax
-; AVX512BW-NEXT: vzeroupper
-; AVX512BW-NEXT: retq
+; SSE-LABEL: test_v2i16:
+; SSE: # %bb.0:
+; SSE-NEXT: movdqa %xmm0, %xmm1
+; SSE-NEXT: psrld $16, %xmm1
+; SSE-NEXT: pminsw %xmm0, %xmm1
+; SSE-NEXT: movd %xmm1, %eax
+; SSE-NEXT: # kill: def $ax killed $ax killed $eax
+; SSE-NEXT: retq
+;
+; AVX-LABEL: test_v2i16:
+; AVX: # %bb.0:
+; AVX-NEXT: vpsrld $16, %xmm0, %xmm1
+; AVX-NEXT: vpminsw %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vmovd %xmm0, %eax
+; AVX-NEXT: # kill: def $ax killed $ax killed $eax
+; AVX-NEXT: retq
;
-; AVX512VL-LABEL: test_v2i16:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpsllq $48, %xmm0, %xmm1
-; AVX512VL-NEXT: vpsraq $48, %xmm1, %xmm1
-; AVX512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
-; AVX512VL-NEXT: vpsllq $48, %xmm0, %xmm0
-; AVX512VL-NEXT: vpsraq $48, %xmm0, %xmm0
-; AVX512VL-NEXT: vpminsq %xmm0, %xmm1, %xmm0
-; AVX512VL-NEXT: vmovd %xmm0, %eax
-; AVX512VL-NEXT: # kill: def $ax killed $ax killed $eax
-; AVX512VL-NEXT: retq
+; AVX512-LABEL: test_v2i16:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1
+; AVX512-NEXT: vpminsw %xmm1, %xmm0, %xmm0
+; AVX512-NEXT: vmovd %xmm0, %eax
+; AVX512-NEXT: # kill: def $ax killed $ax killed $eax
+; AVX512-NEXT: retq
%1 = call i16 @llvm.experimental.vector.reduce.smin.v2i16(<2 x i16> %a0)
ret i16 %1
}
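; Widening legalization treats the <2 x i16> operand as the low lanes of a
; v8i16, so the smin reduction becomes a psrld $16 (move element 1 into the
; bottom 16 bits) followed by pminsw; the widened upper lanes hold garbage,
; but only element 0 is extracted. The removed checks instead sign-extended
; each lane to i64 and emulated the min with a 64-bit compare-and-select.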
define i16 @test_v4i16(<4 x i16> %a0) {
-; SSE2-LABEL: test_v4i16:
-; SSE2: # %bb.0:
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; SSE2-NEXT: pslld $16, %xmm0
-; SSE2-NEXT: psrad $16, %xmm0
-; SSE2-NEXT: pslld $16, %xmm1
-; SSE2-NEXT: psrad $16, %xmm1
-; SSE2-NEXT: movdqa %xmm1, %xmm2
-; SSE2-NEXT: pcmpgtd %xmm0, %xmm2
-; SSE2-NEXT: pand %xmm2, %xmm0
-; SSE2-NEXT: pandn %xmm1, %xmm2
-; SSE2-NEXT: por %xmm0, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,2,3]
-; SSE2-NEXT: movdqa %xmm0, %xmm1
-; SSE2-NEXT: pcmpgtd %xmm2, %xmm1
-; SSE2-NEXT: pand %xmm1, %xmm2
-; SSE2-NEXT: pandn %xmm0, %xmm1
-; SSE2-NEXT: por %xmm2, %xmm1
-; SSE2-NEXT: movd %xmm1, %eax
-; SSE2-NEXT: # kill: def $ax killed $ax killed $eax
-; SSE2-NEXT: retq
-;
-; SSE41-LABEL: test_v4i16:
-; SSE41: # %bb.0:
-; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; SSE41-NEXT: pslld $16, %xmm0
-; SSE41-NEXT: psrad $16, %xmm0
-; SSE41-NEXT: pslld $16, %xmm1
-; SSE41-NEXT: psrad $16, %xmm1
-; SSE41-NEXT: pminsd %xmm0, %xmm1
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
-; SSE41-NEXT: pminsd %xmm1, %xmm0
-; SSE41-NEXT: movd %xmm0, %eax
-; SSE41-NEXT: # kill: def $ax killed $ax killed $eax
-; SSE41-NEXT: retq
+; SSE-LABEL: test_v4i16:
+; SSE: # %bb.0:
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; SSE-NEXT: pminsw %xmm0, %xmm1
+; SSE-NEXT: movdqa %xmm1, %xmm0
+; SSE-NEXT: psrld $16, %xmm0
+; SSE-NEXT: pminsw %xmm1, %xmm0
+; SSE-NEXT: movd %xmm0, %eax
+; SSE-NEXT: # kill: def $ax killed $ax killed $eax
+; SSE-NEXT: retq
;
; AVX-LABEL: test_v4i16:
; AVX: # %bb.0:
-; AVX-NEXT: vpslld $16, %xmm0, %xmm1
-; AVX-NEXT: vpsrad $16, %xmm1, %xmm1
-; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
-; AVX-NEXT: vpslld $16, %xmm0, %xmm0
-; AVX-NEXT: vpsrad $16, %xmm0, %xmm0
-; AVX-NEXT: vpminsd %xmm0, %xmm1, %xmm0
; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX-NEXT: vpminsd %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vpminsw %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vpsrld $16, %xmm0, %xmm1
+; AVX-NEXT: vpminsw %xmm1, %xmm0, %xmm0
; AVX-NEXT: vmovd %xmm0, %eax
; AVX-NEXT: # kill: def $ax killed $ax killed $eax
; AVX-NEXT: retq
;
; AVX512-LABEL: test_v4i16:
; AVX512: # %bb.0:
-; AVX512-NEXT: vpslld $16, %xmm0, %xmm1
-; AVX512-NEXT: vpsrad $16, %xmm1, %xmm1
-; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
-; AVX512-NEXT: vpslld $16, %xmm0, %xmm0
-; AVX512-NEXT: vpsrad $16, %xmm0, %xmm0
-; AVX512-NEXT: vpminsd %xmm0, %xmm1, %xmm0
; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX512-NEXT: vpminsd %xmm1, %xmm0, %xmm0
+; AVX512-NEXT: vpminsw %xmm1, %xmm0, %xmm0
+; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1
+; AVX512-NEXT: vpminsw %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vmovd %xmm0, %eax
; AVX512-NEXT: # kill: def $ax killed $ax killed $eax
; AVX512-NEXT: retq
define i8 @test_v2i8(<2 x i8> %a0) {
; SSE2-LABEL: test_v2i8:
; SSE2: # %bb.0:
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; SSE2-NEXT: psllq $56, %xmm0
-; SSE2-NEXT: movdqa %xmm0, %xmm2
-; SSE2-NEXT: psrad $31, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3]
-; SSE2-NEXT: psrad $24, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
-; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
-; SSE2-NEXT: psllq $56, %xmm1
+; SSE2-NEXT: movdqa %xmm0, %xmm1
+; SSE2-NEXT: psrlw $8, %xmm1
; SSE2-NEXT: movdqa %xmm1, %xmm2
-; SSE2-NEXT: psrad $31, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3]
-; SSE2-NEXT: psrad $24, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3]
-; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
-; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648]
-; SSE2-NEXT: movdqa %xmm0, %xmm3
-; SSE2-NEXT: pxor %xmm2, %xmm3
-; SSE2-NEXT: pxor %xmm1, %xmm2
-; SSE2-NEXT: movdqa %xmm2, %xmm4
-; SSE2-NEXT: pcmpgtd %xmm3, %xmm4
-; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
-; SSE2-NEXT: pcmpeqd %xmm3, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; SSE2-NEXT: pand %xmm5, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3]
-; SSE2-NEXT: por %xmm2, %xmm3
-; SSE2-NEXT: pand %xmm3, %xmm0
-; SSE2-NEXT: pandn %xmm1, %xmm3
-; SSE2-NEXT: por %xmm0, %xmm3
-; SSE2-NEXT: movd %xmm3, %eax
+; SSE2-NEXT: pcmpgtb %xmm0, %xmm2
+; SSE2-NEXT: pand %xmm2, %xmm0
+; SSE2-NEXT: pandn %xmm1, %xmm2
+; SSE2-NEXT: por %xmm0, %xmm2
+; SSE2-NEXT: movd %xmm2, %eax
; SSE2-NEXT: # kill: def $al killed $al killed $eax
; SSE2-NEXT: retq
;
; SSE41-LABEL: test_v2i8:
; SSE41: # %bb.0:
-; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; SSE41-NEXT: psllq $56, %xmm0
-; SSE41-NEXT: movdqa %xmm0, %xmm2
-; SSE41-NEXT: psrad $31, %xmm2
-; SSE41-NEXT: psrad $24, %xmm0
-; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
-; SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1],xmm2[2,3],xmm3[4,5],xmm2[6,7]
-; SSE41-NEXT: psllq $56, %xmm1
-; SSE41-NEXT: movdqa %xmm1, %xmm0
-; SSE41-NEXT: psrad $31, %xmm0
-; SSE41-NEXT: psrad $24, %xmm1
-; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3],xmm1[4,5],xmm0[6,7]
-; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648]
-; SSE41-NEXT: movdqa %xmm3, %xmm0
-; SSE41-NEXT: pxor %xmm2, %xmm0
-; SSE41-NEXT: pxor %xmm1, %xmm2
-; SSE41-NEXT: movdqa %xmm2, %xmm4
-; SSE41-NEXT: pcmpeqd %xmm0, %xmm4
-; SSE41-NEXT: pcmpgtd %xmm0, %xmm2
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,0,2,2]
-; SSE41-NEXT: pand %xmm4, %xmm0
-; SSE41-NEXT: por %xmm2, %xmm0
-; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm1
+; SSE41-NEXT: movdqa %xmm0, %xmm1
+; SSE41-NEXT: psrlw $8, %xmm1
+; SSE41-NEXT: pminsb %xmm0, %xmm1
; SSE41-NEXT: pextrb $0, %xmm1, %eax
; SSE41-NEXT: # kill: def $al killed $al killed $eax
; SSE41-NEXT: retq
;
-; AVX1-LABEL: test_v2i8:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vpsllq $56, %xmm0, %xmm1
-; AVX1-NEXT: vpsrad $31, %xmm1, %xmm2
-; AVX1-NEXT: vpsrad $24, %xmm1, %xmm1
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
-; AVX1-NEXT: vpsllq $56, %xmm0, %xmm0
-; AVX1-NEXT: vpsrad $31, %xmm0, %xmm2
-; AVX1-NEXT: vpsrad $24, %xmm0, %xmm0
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
-; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2
-; AVX1-NEXT: vblendvpd %xmm2, %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpextrb $0, %xmm0, %eax
-; AVX1-NEXT: # kill: def $al killed $al killed $eax
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: test_v2i8:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpsllq $56, %xmm0, %xmm1
-; AVX2-NEXT: vpsrad $31, %xmm1, %xmm2
-; AVX2-NEXT: vpsrad $24, %xmm1, %xmm1
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
-; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
-; AVX2-NEXT: vpsllq $56, %xmm0, %xmm0
-; AVX2-NEXT: vpsrad $31, %xmm0, %xmm2
-; AVX2-NEXT: vpsrad $24, %xmm0, %xmm0
-; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
-; AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2
-; AVX2-NEXT: vblendvpd %xmm2, %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpextrb $0, %xmm0, %eax
-; AVX2-NEXT: # kill: def $al killed $al killed $eax
-; AVX2-NEXT: retq
-;
-; AVX512BW-LABEL: test_v2i8:
-; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vpsllq $56, %xmm0, %xmm1
-; AVX512BW-NEXT: vpsraq $56, %zmm1, %zmm1
-; AVX512BW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
-; AVX512BW-NEXT: vpsllq $56, %xmm0, %xmm0
-; AVX512BW-NEXT: vpsraq $56, %zmm0, %zmm0
-; AVX512BW-NEXT: vpminsq %zmm0, %zmm1, %zmm0
-; AVX512BW-NEXT: vpextrb $0, %xmm0, %eax
-; AVX512BW-NEXT: # kill: def $al killed $al killed $eax
-; AVX512BW-NEXT: vzeroupper
-; AVX512BW-NEXT: retq
+; AVX-LABEL: test_v2i8:
+; AVX: # %bb.0:
+; AVX-NEXT: vpsrlw $8, %xmm0, %xmm1
+; AVX-NEXT: vpminsb %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vpextrb $0, %xmm0, %eax
+; AVX-NEXT: # kill: def $al killed $al killed $eax
+; AVX-NEXT: retq
;
-; AVX512VL-LABEL: test_v2i8:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpsllq $56, %xmm0, %xmm1
-; AVX512VL-NEXT: vpsraq $56, %xmm1, %xmm1
-; AVX512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
-; AVX512VL-NEXT: vpsllq $56, %xmm0, %xmm0
-; AVX512VL-NEXT: vpsraq $56, %xmm0, %xmm0
-; AVX512VL-NEXT: vpminsq %xmm0, %xmm1, %xmm0
-; AVX512VL-NEXT: vpextrb $0, %xmm0, %eax
-; AVX512VL-NEXT: # kill: def $al killed $al killed $eax
-; AVX512VL-NEXT: retq
+; AVX512-LABEL: test_v2i8:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1
+; AVX512-NEXT: vpminsb %xmm1, %xmm0, %xmm0
+; AVX512-NEXT: vpextrb $0, %xmm0, %eax
+; AVX512-NEXT: # kill: def $al killed $al killed $eax
+; AVX512-NEXT: retq
%1 = call i8 @llvm.experimental.vector.reduce.smin.v2i8(<2 x i8> %a0)
ret i8 %1
}
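; With the operand widened to v16i8, SSE4.1 and AVX can use pminsb directly
; after a psrlw $8 lines up the two bytes. SSE2 has no signed byte min, so it
; keeps the pcmpgtb / pand / pandn / por select sequence instead.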
define i8 @test_v4i8(<4 x i8> %a0) {
; SSE2-LABEL: test_v4i8:
; SSE2: # %bb.0:
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; SSE2-NEXT: pslld $24, %xmm0
-; SSE2-NEXT: psrad $24, %xmm0
-; SSE2-NEXT: pslld $24, %xmm1
-; SSE2-NEXT: psrad $24, %xmm1
+; SSE2-NEXT: movdqa %xmm0, %xmm1
+; SSE2-NEXT: psrld $16, %xmm1
; SSE2-NEXT: movdqa %xmm1, %xmm2
-; SSE2-NEXT: pcmpgtd %xmm0, %xmm2
+; SSE2-NEXT: pcmpgtb %xmm0, %xmm2
; SSE2-NEXT: pand %xmm2, %xmm0
; SSE2-NEXT: pandn %xmm1, %xmm2
; SSE2-NEXT: por %xmm0, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,2,3]
+; SSE2-NEXT: movdqa %xmm2, %xmm0
+; SSE2-NEXT: psrlw $8, %xmm0
; SSE2-NEXT: movdqa %xmm0, %xmm1
-; SSE2-NEXT: pcmpgtd %xmm2, %xmm1
+; SSE2-NEXT: pcmpgtb %xmm2, %xmm1
; SSE2-NEXT: pand %xmm1, %xmm2
; SSE2-NEXT: pandn %xmm0, %xmm1
; SSE2-NEXT: por %xmm2, %xmm1
;
; SSE41-LABEL: test_v4i8:
; SSE41: # %bb.0:
-; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; SSE41-NEXT: pslld $24, %xmm0
-; SSE41-NEXT: psrad $24, %xmm0
-; SSE41-NEXT: pslld $24, %xmm1
-; SSE41-NEXT: psrad $24, %xmm1
-; SSE41-NEXT: pminsd %xmm0, %xmm1
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
-; SSE41-NEXT: pminsd %xmm1, %xmm0
+; SSE41-NEXT: movdqa %xmm0, %xmm1
+; SSE41-NEXT: psrld $16, %xmm1
+; SSE41-NEXT: pminsb %xmm0, %xmm1
+; SSE41-NEXT: movdqa %xmm1, %xmm0
+; SSE41-NEXT: psrlw $8, %xmm0
+; SSE41-NEXT: pminsb %xmm1, %xmm0
; SSE41-NEXT: pextrb $0, %xmm0, %eax
; SSE41-NEXT: # kill: def $al killed $al killed $eax
; SSE41-NEXT: retq
;
; AVX-LABEL: test_v4i8:
; AVX: # %bb.0:
-; AVX-NEXT: vpslld $24, %xmm0, %xmm1
-; AVX-NEXT: vpsrad $24, %xmm1, %xmm1
-; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
-; AVX-NEXT: vpslld $24, %xmm0, %xmm0
-; AVX-NEXT: vpsrad $24, %xmm0, %xmm0
-; AVX-NEXT: vpminsd %xmm0, %xmm1, %xmm0
-; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX-NEXT: vpminsd %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vpsrld $16, %xmm0, %xmm1
+; AVX-NEXT: vpminsb %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vpsrlw $8, %xmm0, %xmm1
+; AVX-NEXT: vpminsb %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpextrb $0, %xmm0, %eax
; AVX-NEXT: # kill: def $al killed $al killed $eax
; AVX-NEXT: retq
;
; AVX512-LABEL: test_v4i8:
; AVX512: # %bb.0:
-; AVX512-NEXT: vpslld $24, %xmm0, %xmm1
-; AVX512-NEXT: vpsrad $24, %xmm1, %xmm1
-; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
-; AVX512-NEXT: vpslld $24, %xmm0, %xmm0
-; AVX512-NEXT: vpsrad $24, %xmm0, %xmm0
-; AVX512-NEXT: vpminsd %xmm0, %xmm1, %xmm0
-; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX512-NEXT: vpminsd %xmm1, %xmm0, %xmm0
+; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1
+; AVX512-NEXT: vpminsb %xmm1, %xmm0, %xmm0
+; AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1
+; AVX512-NEXT: vpminsb %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vpextrb $0, %xmm0, %eax
; AVX512-NEXT: # kill: def $al killed $al killed $eax
; AVX512-NEXT: retq
define i8 @test_v8i8(<8 x i8> %a0) {
; SSE2-LABEL: test_v8i8:
; SSE2: # %bb.0:
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; SSE2-NEXT: psllw $8, %xmm0
-; SSE2-NEXT: psraw $8, %xmm0
-; SSE2-NEXT: psllw $8, %xmm1
-; SSE2-NEXT: psraw $8, %xmm1
-; SSE2-NEXT: pminsw %xmm0, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
-; SSE2-NEXT: psllw $8, %xmm0
-; SSE2-NEXT: psraw $8, %xmm0
-; SSE2-NEXT: pminsw %xmm1, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; SSE2-NEXT: movdqa %xmm1, %xmm2
+; SSE2-NEXT: pcmpgtb %xmm0, %xmm2
+; SSE2-NEXT: pand %xmm2, %xmm0
+; SSE2-NEXT: pandn %xmm1, %xmm2
+; SSE2-NEXT: por %xmm0, %xmm2
+; SSE2-NEXT: movdqa %xmm2, %xmm0
+; SSE2-NEXT: psrld $16, %xmm0
; SSE2-NEXT: movdqa %xmm0, %xmm1
-; SSE2-NEXT: psrld $16, %xmm1
-; SSE2-NEXT: psllw $8, %xmm1
-; SSE2-NEXT: psraw $8, %xmm1
-; SSE2-NEXT: pminsw %xmm0, %xmm1
-; SSE2-NEXT: movd %xmm1, %eax
+; SSE2-NEXT: pcmpgtb %xmm2, %xmm1
+; SSE2-NEXT: pand %xmm1, %xmm2
+; SSE2-NEXT: pandn %xmm0, %xmm1
+; SSE2-NEXT: por %xmm2, %xmm1
+; SSE2-NEXT: movdqa %xmm1, %xmm0
+; SSE2-NEXT: psrlw $8, %xmm0
+; SSE2-NEXT: movdqa %xmm0, %xmm2
+; SSE2-NEXT: pcmpgtb %xmm1, %xmm2
+; SSE2-NEXT: pand %xmm2, %xmm1
+; SSE2-NEXT: pandn %xmm0, %xmm2
+; SSE2-NEXT: por %xmm1, %xmm2
+; SSE2-NEXT: movd %xmm2, %eax
; SSE2-NEXT: # kill: def $al killed $al killed $eax
; SSE2-NEXT: retq
;
; SSE41-LABEL: test_v8i8:
; SSE41: # %bb.0:
-; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; SSE41-NEXT: psllw $8, %xmm0
-; SSE41-NEXT: psraw $8, %xmm0
-; SSE41-NEXT: psllw $8, %xmm1
-; SSE41-NEXT: psraw $8, %xmm1
-; SSE41-NEXT: pminsw %xmm0, %xmm1
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
-; SSE41-NEXT: psllw $8, %xmm0
-; SSE41-NEXT: psraw $8, %xmm0
-; SSE41-NEXT: pminsw %xmm1, %xmm0
+; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; SSE41-NEXT: pminsb %xmm0, %xmm1
+; SSE41-NEXT: movdqa %xmm1, %xmm0
+; SSE41-NEXT: psrld $16, %xmm0
+; SSE41-NEXT: pminsb %xmm1, %xmm0
; SSE41-NEXT: movdqa %xmm0, %xmm1
-; SSE41-NEXT: psrld $16, %xmm1
-; SSE41-NEXT: psllw $8, %xmm1
-; SSE41-NEXT: psraw $8, %xmm1
-; SSE41-NEXT: pminsw %xmm0, %xmm1
+; SSE41-NEXT: psrlw $8, %xmm1
+; SSE41-NEXT: pminsb %xmm0, %xmm1
; SSE41-NEXT: pextrb $0, %xmm1, %eax
; SSE41-NEXT: # kill: def $al killed $al killed $eax
; SSE41-NEXT: retq
;
; AVX-LABEL: test_v8i8:
; AVX: # %bb.0:
-; AVX-NEXT: vpsllw $8, %xmm0, %xmm1
-; AVX-NEXT: vpsraw $8, %xmm1, %xmm1
-; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
-; AVX-NEXT: vpsllw $8, %xmm0, %xmm0
-; AVX-NEXT: vpsraw $8, %xmm0, %xmm0
-; AVX-NEXT: vpminsw %xmm0, %xmm1, %xmm0
; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX-NEXT: vpsllw $8, %xmm1, %xmm1
-; AVX-NEXT: vpsraw $8, %xmm1, %xmm1
-; AVX-NEXT: vpminsw %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vpminsb %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpsrld $16, %xmm0, %xmm1
-; AVX-NEXT: vpsllw $8, %xmm1, %xmm1
-; AVX-NEXT: vpsraw $8, %xmm1, %xmm1
-; AVX-NEXT: vpminsw %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vpminsb %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vpsrlw $8, %xmm0, %xmm1
+; AVX-NEXT: vpminsb %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpextrb $0, %xmm0, %eax
; AVX-NEXT: # kill: def $al killed $al killed $eax
; AVX-NEXT: retq
;
; AVX512-LABEL: test_v8i8:
; AVX512: # %bb.0:
-; AVX512-NEXT: vpsllw $8, %xmm0, %xmm1
-; AVX512-NEXT: vpsraw $8, %xmm1, %xmm1
-; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
-; AVX512-NEXT: vpsllw $8, %xmm0, %xmm0
-; AVX512-NEXT: vpsraw $8, %xmm0, %xmm0
-; AVX512-NEXT: vpminsw %xmm0, %xmm1, %xmm0
; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX512-NEXT: vpsllw $8, %xmm1, %xmm1
-; AVX512-NEXT: vpsraw $8, %xmm1, %xmm1
-; AVX512-NEXT: vpminsw %xmm1, %xmm0, %xmm0
+; AVX512-NEXT: vpminsb %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1
-; AVX512-NEXT: vpsllw $8, %xmm1, %xmm1
-; AVX512-NEXT: vpsraw $8, %xmm1, %xmm1
-; AVX512-NEXT: vpminsw %xmm1, %xmm0, %xmm0
+; AVX512-NEXT: vpminsb %xmm1, %xmm0, %xmm0
+; AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1
+; AVX512-NEXT: vpminsb %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vpextrb $0, %xmm0, %eax
; AVX512-NEXT: # kill: def $al killed $al killed $eax
; AVX512-NEXT: retq
define i32 @test_v2i32(<2 x i32> %a0) {
; SSE2-LABEL: test_v2i32:
; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [4294967295,0,4294967295,0]
-; SSE2-NEXT: pand %xmm0, %xmm1
-; SSE2-NEXT: pxor %xmm2, %xmm2
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
; SSE2-NEXT: movdqa %xmm0, %xmm3
-; SSE2-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm2[2],xmm3[3],xmm2[3]
-; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648]
-; SSE2-NEXT: movdqa %xmm3, %xmm4
-; SSE2-NEXT: pxor %xmm2, %xmm4
-; SSE2-NEXT: pxor %xmm2, %xmm1
-; SSE2-NEXT: movdqa %xmm1, %xmm2
-; SSE2-NEXT: pcmpgtd %xmm4, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm2[0,0,2,2]
-; SSE2-NEXT: pcmpeqd %xmm4, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSE2-NEXT: pand %xmm5, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; SSE2-NEXT: por %xmm1, %xmm2
-; SSE2-NEXT: pand %xmm2, %xmm0
-; SSE2-NEXT: pandn %xmm3, %xmm2
-; SSE2-NEXT: por %xmm0, %xmm2
-; SSE2-NEXT: movd %xmm2, %eax
+; SSE2-NEXT: pxor %xmm2, %xmm3
+; SSE2-NEXT: pxor %xmm1, %xmm2
+; SSE2-NEXT: pcmpgtd %xmm2, %xmm3
+; SSE2-NEXT: pand %xmm3, %xmm0
+; SSE2-NEXT: pandn %xmm1, %xmm3
+; SSE2-NEXT: por %xmm0, %xmm3
+; SSE2-NEXT: movd %xmm3, %eax
; SSE2-NEXT: retq
;
; SSE41-LABEL: test_v2i32:
; SSE41: # %bb.0:
-; SSE41-NEXT: pxor %xmm1, %xmm1
-; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
-; SSE41-NEXT: pmovzxdq {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero
-; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [2147483648,2147483648]
-; SSE41-NEXT: movdqa %xmm1, %xmm3
-; SSE41-NEXT: pxor %xmm0, %xmm3
-; SSE41-NEXT: pxor %xmm2, %xmm0
-; SSE41-NEXT: movdqa %xmm3, %xmm4
-; SSE41-NEXT: pcmpeqd %xmm0, %xmm4
-; SSE41-NEXT: pcmpgtd %xmm0, %xmm3
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,0,2,2]
-; SSE41-NEXT: pand %xmm4, %xmm0
-; SSE41-NEXT: por %xmm3, %xmm0
-; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm2
-; SSE41-NEXT: movd %xmm2, %eax
+; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; SSE41-NEXT: pmaxud %xmm0, %xmm1
+; SSE41-NEXT: movd %xmm1, %eax
; SSE41-NEXT: retq
;
-; AVX1-LABEL: test_v2i32:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
-; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; AVX1-NEXT: vpcmpgtq %xmm0, %xmm2, %xmm1
-; AVX1-NEXT: vblendvpd %xmm1, %xmm2, %xmm0, %xmm0
-; AVX1-NEXT: vmovd %xmm0, %eax
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: test_v2i32:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX2-NEXT: vpblendd {{.*#+}} xmm2 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
-; AVX2-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; AVX2-NEXT: vpcmpgtq %xmm0, %xmm2, %xmm1
-; AVX2-NEXT: vblendvpd %xmm1, %xmm2, %xmm0, %xmm0
-; AVX2-NEXT: vmovd %xmm0, %eax
-; AVX2-NEXT: retq
-;
-; AVX512BW-LABEL: test_v2i32:
-; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX512BW-NEXT: vpblendd {{.*#+}} xmm2 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
-; AVX512BW-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; AVX512BW-NEXT: vpmaxuq %zmm0, %zmm2, %zmm0
-; AVX512BW-NEXT: vmovd %xmm0, %eax
-; AVX512BW-NEXT: vzeroupper
-; AVX512BW-NEXT: retq
+; AVX-LABEL: test_v2i32:
+; AVX: # %bb.0:
+; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX-NEXT: vpmaxud %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vmovd %xmm0, %eax
+; AVX-NEXT: retq
;
-; AVX512VL-LABEL: test_v2i32:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX512VL-NEXT: vpblendd {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
-; AVX512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
-; AVX512VL-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
-; AVX512VL-NEXT: vpmaxuq %xmm0, %xmm1, %xmm0
-; AVX512VL-NEXT: vmovd %xmm0, %eax
-; AVX512VL-NEXT: retq
+; AVX512-LABEL: test_v2i32:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX512-NEXT: vpmaxud %xmm1, %xmm0, %xmm0
+; AVX512-NEXT: vmovd %xmm0, %eax
+; AVX512-NEXT: retq
%1 = call i32 @llvm.experimental.vector.reduce.umax.v2i32(<2 x i32> %a0)
ret i32 %1
}
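; v2i32 is widened to v4i32 here. SSE4.1 and later use pmaxud on the shuffled
; halves; SSE2 has no unsigned dword max, so it flips the sign bits with a
; pxor of 0x80000000 and performs a signed pcmpgtd compare-and-select.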
define i16 @test_v2i16(<2 x i16> %a0) {
; SSE2-LABEL: test_v2i16:
; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [65535,0,0,0,65535,0,0,0]
-; SSE2-NEXT: pand %xmm0, %xmm1
-; SSE2-NEXT: pxor %xmm2, %xmm2
-; SSE2-NEXT: movdqa %xmm0, %xmm3
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
-; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
-; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648]
-; SSE2-NEXT: por %xmm2, %xmm1
-; SSE2-NEXT: por %xmm3, %xmm2
-; SSE2-NEXT: movdqa %xmm1, %xmm4
-; SSE2-NEXT: pcmpgtd %xmm2, %xmm4
-; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
-; SSE2-NEXT: pcmpeqd %xmm1, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3]
-; SSE2-NEXT: pand %xmm5, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,1,3,3]
-; SSE2-NEXT: por %xmm1, %xmm2
-; SSE2-NEXT: pand %xmm2, %xmm0
-; SSE2-NEXT: pandn %xmm3, %xmm2
-; SSE2-NEXT: por %xmm0, %xmm2
-; SSE2-NEXT: movd %xmm2, %eax
+; SSE2-NEXT: movdqa %xmm0, %xmm1
+; SSE2-NEXT: psrld $16, %xmm1
+; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [32768,32768,32768,32768,32768,32768,32768,32768]
+; SSE2-NEXT: pxor %xmm2, %xmm0
+; SSE2-NEXT: pxor %xmm2, %xmm1
+; SSE2-NEXT: pmaxsw %xmm0, %xmm1
+; SSE2-NEXT: movd %xmm1, %eax
+; SSE2-NEXT: xorl $32768, %eax # imm = 0x8000
; SSE2-NEXT: # kill: def $ax killed $ax killed $eax
; SSE2-NEXT: retq
;
; SSE41-LABEL: test_v2i16:
; SSE41: # %bb.0:
-; SSE41-NEXT: pxor %xmm1, %xmm1
-; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3],xmm0[4],xmm1[5,6,7]
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
-; SSE41-NEXT: pmovzxwq {{.*#+}} xmm2 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
-; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [2147483648,2147483648]
-; SSE41-NEXT: movdqa %xmm1, %xmm3
-; SSE41-NEXT: por %xmm0, %xmm3
-; SSE41-NEXT: por %xmm2, %xmm0
-; SSE41-NEXT: movdqa %xmm3, %xmm4
-; SSE41-NEXT: pcmpeqd %xmm0, %xmm4
-; SSE41-NEXT: pcmpgtd %xmm0, %xmm3
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,0,2,2]
-; SSE41-NEXT: pand %xmm4, %xmm0
-; SSE41-NEXT: por %xmm3, %xmm0
-; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm2
-; SSE41-NEXT: movd %xmm2, %eax
+; SSE41-NEXT: movdqa %xmm0, %xmm1
+; SSE41-NEXT: psrld $16, %xmm1
+; SSE41-NEXT: pmaxuw %xmm0, %xmm1
+; SSE41-NEXT: movd %xmm1, %eax
; SSE41-NEXT: # kill: def $ax killed $ax killed $eax
; SSE41-NEXT: retq
;
; AVX-LABEL: test_v2i16:
; AVX: # %bb.0:
-; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3],xmm0[4],xmm1[5,6,7]
-; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
-; AVX-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
-; AVX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2
-; AVX-NEXT: vblendvpd %xmm2, %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vpsrld $16, %xmm0, %xmm1
+; AVX-NEXT: vpmaxuw %xmm1, %xmm0, %xmm0
; AVX-NEXT: vmovd %xmm0, %eax
; AVX-NEXT: # kill: def $ax killed $ax killed $eax
; AVX-NEXT: retq
;
-; AVX512BW-LABEL: test_v2i16:
-; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX512BW-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3],xmm0[4],xmm1[5,6,7]
-; AVX512BW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
-; AVX512BW-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
-; AVX512BW-NEXT: vpmaxuq %zmm0, %zmm1, %zmm0
-; AVX512BW-NEXT: vmovd %xmm0, %eax
-; AVX512BW-NEXT: # kill: def $ax killed $ax killed $eax
-; AVX512BW-NEXT: vzeroupper
-; AVX512BW-NEXT: retq
-;
-; AVX512VL-LABEL: test_v2i16:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX512VL-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3],xmm0[4],xmm1[5,6,7]
-; AVX512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
-; AVX512VL-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
-; AVX512VL-NEXT: vpmaxuq %xmm0, %xmm1, %xmm0
-; AVX512VL-NEXT: vmovd %xmm0, %eax
-; AVX512VL-NEXT: # kill: def $ax killed $ax killed $eax
-; AVX512VL-NEXT: retq
+; AVX512-LABEL: test_v2i16:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1
+; AVX512-NEXT: vpmaxuw %xmm1, %xmm0, %xmm0
+; AVX512-NEXT: vmovd %xmm0, %eax
+; AVX512-NEXT: # kill: def $ax killed $ax killed $eax
+; AVX512-NEXT: retq
%1 = call i16 @llvm.experimental.vector.reduce.umax.v2i16(<2 x i16> %a0)
ret i16 %1
}
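; SSE4.1 has pmaxuw, so the widened v8i16 reduction is just psrld $16 plus
; pmaxuw. SSE2 only has a signed word max, so both inputs are biased by
; 0x8000 (pxor), pmaxsw is applied, and the bias is undone on the scalar
; result by the xorl $32768.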
define i16 @test_v4i16(<4 x i16> %a0) {
; SSE2-LABEL: test_v4i16:
; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [65535,0,65535,0,65535,0,65535,0]
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
-; SSE2-NEXT: pand %xmm1, %xmm0
-; SSE2-NEXT: pand {{.*}}(%rip), %xmm2
-; SSE2-NEXT: movdqa %xmm0, %xmm3
-; SSE2-NEXT: pcmpgtd %xmm2, %xmm3
-; SSE2-NEXT: pand %xmm3, %xmm0
-; SSE2-NEXT: pandn %xmm2, %xmm3
-; SSE2-NEXT: por %xmm0, %xmm3
-; SSE2-NEXT: pand %xmm3, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,2,3]
-; SSE2-NEXT: pxor %xmm2, %xmm2
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
-; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
-; SSE2-NEXT: pcmpgtd %xmm0, %xmm1
-; SSE2-NEXT: pand %xmm1, %xmm3
-; SSE2-NEXT: pandn %xmm0, %xmm1
-; SSE2-NEXT: por %xmm3, %xmm1
-; SSE2-NEXT: movd %xmm1, %eax
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [32768,32768,32768,32768,32768,32768,32768,32768]
+; SSE2-NEXT: pxor %xmm2, %xmm0
+; SSE2-NEXT: pxor %xmm2, %xmm1
+; SSE2-NEXT: pmaxsw %xmm0, %xmm1
+; SSE2-NEXT: movdqa %xmm1, %xmm0
+; SSE2-NEXT: pxor %xmm2, %xmm0
+; SSE2-NEXT: psrld $16, %xmm0
+; SSE2-NEXT: pxor %xmm2, %xmm0
+; SSE2-NEXT: pmaxsw %xmm1, %xmm0
+; SSE2-NEXT: movd %xmm0, %eax
+; SSE2-NEXT: xorl $32768, %eax # imm = 0x8000
; SSE2-NEXT: # kill: def $ax killed $ax killed $eax
; SSE2-NEXT: retq
;
; SSE41-LABEL: test_v4i16:
; SSE41: # %bb.0:
-; SSE41-NEXT: pxor %xmm1, %xmm1
-; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
-; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7]
-; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0],xmm1[1],xmm2[2],xmm1[3,4,5,6,7]
-; SSE41-NEXT: pmaxud %xmm0, %xmm2
-; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3],xmm2[4],xmm1[5],xmm2[6],xmm1[7]
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,2,3]
-; SSE41-NEXT: pmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
-; SSE41-NEXT: pmaxud %xmm1, %xmm0
+; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; SSE41-NEXT: pmaxuw %xmm0, %xmm1
+; SSE41-NEXT: movdqa %xmm1, %xmm0
+; SSE41-NEXT: psrld $16, %xmm0
+; SSE41-NEXT: pmaxuw %xmm1, %xmm0
; SSE41-NEXT: movd %xmm0, %eax
; SSE41-NEXT: # kill: def $ax killed $ax killed $eax
; SSE41-NEXT: retq
;
; AVX-LABEL: test_v4i16:
; AVX: # %bb.0:
-; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7]
-; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
-; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3,4,5,6,7]
-; AVX-NEXT: vpmaxud %xmm0, %xmm2, %xmm0
-; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7]
-; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
-; AVX-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
-; AVX-NEXT: vpmaxud %xmm0, %xmm1, %xmm0
+; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX-NEXT: vpmaxuw %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vpsrld $16, %xmm0, %xmm1
+; AVX-NEXT: vpmaxuw %xmm1, %xmm0, %xmm0
; AVX-NEXT: vmovd %xmm0, %eax
; AVX-NEXT: # kill: def $ax killed $ax killed $eax
; AVX-NEXT: retq
;
; AVX512-LABEL: test_v4i16:
; AVX512: # %bb.0:
-; AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX512-NEXT: vpblendw {{.*#+}} xmm2 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7]
-; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
-; AVX512-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3,4,5,6,7]
-; AVX512-NEXT: vpmaxud %xmm0, %xmm2, %xmm0
-; AVX512-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7]
-; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
-; AVX512-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
-; AVX512-NEXT: vpmaxud %xmm0, %xmm1, %xmm0
+; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX512-NEXT: vpmaxuw %xmm1, %xmm0, %xmm0
+; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1
+; AVX512-NEXT: vpmaxuw %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vmovd %xmm0, %eax
; AVX512-NEXT: # kill: def $ax killed $ax killed $eax
; AVX512-NEXT: retq
define i8 @test_v2i8(<2 x i8> %a0) {
; SSE2-LABEL: test_v2i8:
; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
-; SSE2-NEXT: pand %xmm0, %xmm1
-; SSE2-NEXT: pxor %xmm2, %xmm2
-; SSE2-NEXT: movdqa %xmm0, %xmm3
-; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
-; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
-; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648]
-; SSE2-NEXT: por %xmm2, %xmm1
-; SSE2-NEXT: por %xmm3, %xmm2
-; SSE2-NEXT: movdqa %xmm1, %xmm4
-; SSE2-NEXT: pcmpgtd %xmm2, %xmm4
-; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
-; SSE2-NEXT: pcmpeqd %xmm1, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3]
-; SSE2-NEXT: pand %xmm5, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,1,3,3]
-; SSE2-NEXT: por %xmm1, %xmm2
-; SSE2-NEXT: pand %xmm2, %xmm0
-; SSE2-NEXT: pandn %xmm3, %xmm2
-; SSE2-NEXT: por %xmm0, %xmm2
-; SSE2-NEXT: movd %xmm2, %eax
+; SSE2-NEXT: movdqa %xmm0, %xmm1
+; SSE2-NEXT: psrlw $8, %xmm1
+; SSE2-NEXT: pmaxub %xmm0, %xmm1
+; SSE2-NEXT: movd %xmm1, %eax
; SSE2-NEXT: # kill: def $al killed $al killed $eax
; SSE2-NEXT: retq
;
; SSE41-LABEL: test_v2i8:
; SSE41: # %bb.0:
; SSE41-NEXT: movdqa %xmm0, %xmm1
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
-; SSE41-NEXT: pand {{.*}}(%rip), %xmm1
-; SSE41-NEXT: pmovzxbq {{.*#+}} xmm2 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
-; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [2147483648,2147483648]
-; SSE41-NEXT: movdqa %xmm1, %xmm4
-; SSE41-NEXT: por %xmm3, %xmm4
-; SSE41-NEXT: por %xmm2, %xmm3
-; SSE41-NEXT: movdqa %xmm4, %xmm0
-; SSE41-NEXT: pcmpeqd %xmm3, %xmm0
-; SSE41-NEXT: pcmpgtd %xmm3, %xmm4
-; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm4[0,0,2,2]
-; SSE41-NEXT: pand %xmm3, %xmm0
-; SSE41-NEXT: por %xmm4, %xmm0
-; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm2
-; SSE41-NEXT: pextrb $0, %xmm2, %eax
+; SSE41-NEXT: psrlw $8, %xmm1
+; SSE41-NEXT: pmaxub %xmm0, %xmm1
+; SSE41-NEXT: pextrb $0, %xmm1, %eax
; SSE41-NEXT: # kill: def $al killed $al killed $eax
; SSE41-NEXT: retq
;
; AVX-LABEL: test_v2i8:
; AVX: # %bb.0:
-; AVX-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm1
-; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
-; AVX-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
-; AVX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2
-; AVX-NEXT: vblendvpd %xmm2, %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vpsrlw $8, %xmm0, %xmm1
+; AVX-NEXT: vpmaxub %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpextrb $0, %xmm0, %eax
; AVX-NEXT: # kill: def $al killed $al killed $eax
; AVX-NEXT: retq
;
-; AVX512BW-LABEL: test_v2i8:
-; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm1
-; AVX512BW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
-; AVX512BW-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
-; AVX512BW-NEXT: vpmaxuq %zmm0, %zmm1, %zmm0
-; AVX512BW-NEXT: vpextrb $0, %xmm0, %eax
-; AVX512BW-NEXT: # kill: def $al killed $al killed $eax
-; AVX512BW-NEXT: vzeroupper
-; AVX512BW-NEXT: retq
-;
-; AVX512VL-LABEL: test_v2i8:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm1
-; AVX512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
-; AVX512VL-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
-; AVX512VL-NEXT: vpmaxuq %xmm0, %xmm1, %xmm0
-; AVX512VL-NEXT: vpextrb $0, %xmm0, %eax
-; AVX512VL-NEXT: # kill: def $al killed $al killed $eax
-; AVX512VL-NEXT: retq
+; AVX512-LABEL: test_v2i8:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1
+; AVX512-NEXT: vpmaxub %xmm1, %xmm0, %xmm0
+; AVX512-NEXT: vpextrb $0, %xmm0, %eax
+; AVX512-NEXT: # kill: def $al killed $al killed $eax
+; AVX512-NEXT: retq
%1 = call i8 @llvm.experimental.vector.reduce.umax.v2i8(<2 x i8> %a0)
ret i8 %1
}
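; pmaxub on xmm registers has been available since SSE2, so every target
; reduces the widened v16i8 the same way: psrlw $8 to move byte 1 down, then
; an unsigned byte max.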
define i8 @test_v4i8(<4 x i8> %a0) {
; SSE2-LABEL: test_v4i8:
; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
-; SSE2-NEXT: pand %xmm1, %xmm0
-; SSE2-NEXT: pand {{.*}}(%rip), %xmm2
-; SSE2-NEXT: movdqa %xmm0, %xmm3
-; SSE2-NEXT: pcmpgtd %xmm2, %xmm3
-; SSE2-NEXT: pand %xmm3, %xmm0
-; SSE2-NEXT: pandn %xmm2, %xmm3
-; SSE2-NEXT: por %xmm0, %xmm3
-; SSE2-NEXT: pand %xmm3, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,2,3]
-; SSE2-NEXT: pxor %xmm2, %xmm2
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
-; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
-; SSE2-NEXT: pcmpgtd %xmm0, %xmm1
-; SSE2-NEXT: pand %xmm1, %xmm3
-; SSE2-NEXT: pandn %xmm0, %xmm1
-; SSE2-NEXT: por %xmm3, %xmm1
-; SSE2-NEXT: movd %xmm1, %eax
+; SSE2-NEXT: movdqa %xmm0, %xmm1
+; SSE2-NEXT: psrld $16, %xmm1
+; SSE2-NEXT: pmaxub %xmm0, %xmm1
+; SSE2-NEXT: movdqa %xmm1, %xmm0
+; SSE2-NEXT: psrlw $8, %xmm0
+; SSE2-NEXT: pmaxub %xmm1, %xmm0
+; SSE2-NEXT: movd %xmm0, %eax
; SSE2-NEXT: # kill: def $al killed $al killed $eax
; SSE2-NEXT: retq
;
; SSE41-LABEL: test_v4i8:
; SSE41: # %bb.0:
-; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
-; SSE41-NEXT: movdqa %xmm0, %xmm2
-; SSE41-NEXT: pand %xmm1, %xmm2
-; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[8],zero,zero,zero,xmm0[12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; SSE41-NEXT: pmaxud %xmm2, %xmm0
-; SSE41-NEXT: pand %xmm0, %xmm1
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
-; SSE41-NEXT: pmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
-; SSE41-NEXT: pmaxud %xmm1, %xmm0
+; SSE41-NEXT: movdqa %xmm0, %xmm1
+; SSE41-NEXT: psrld $16, %xmm1
+; SSE41-NEXT: pmaxub %xmm0, %xmm1
+; SSE41-NEXT: movdqa %xmm1, %xmm0
+; SSE41-NEXT: psrlw $8, %xmm0
+; SSE41-NEXT: pmaxub %xmm1, %xmm0
; SSE41-NEXT: pextrb $0, %xmm0, %eax
; SSE41-NEXT: # kill: def $al killed $al killed $eax
; SSE41-NEXT: retq
;
-; AVX1-LABEL: test_v4i8:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [3.57331108E-43,3.57331108E-43,3.57331108E-43,3.57331108E-43]
-; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm2
-; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8],zero,zero,zero,xmm0[12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX1-NEXT: vpmaxud %xmm0, %xmm2, %xmm0
-; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm1
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
-; AVX1-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
-; AVX1-NEXT: vpmaxud %xmm0, %xmm1, %xmm0
-; AVX1-NEXT: vpextrb $0, %xmm0, %eax
-; AVX1-NEXT: # kill: def $al killed $al killed $eax
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: test_v4i8:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [255,255,255,255]
-; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2
-; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8],zero,zero,zero,xmm0[12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX2-NEXT: vpmaxud %xmm0, %xmm2, %xmm0
-; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm1
-; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
-; AVX2-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
-; AVX2-NEXT: vpmaxud %xmm0, %xmm1, %xmm0
-; AVX2-NEXT: vpextrb $0, %xmm0, %eax
-; AVX2-NEXT: # kill: def $al killed $al killed $eax
-; AVX2-NEXT: retq
+; AVX-LABEL: test_v4i8:
+; AVX: # %bb.0:
+; AVX-NEXT: vpsrld $16, %xmm0, %xmm1
+; AVX-NEXT: vpmaxub %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vpsrlw $8, %xmm0, %xmm1
+; AVX-NEXT: vpmaxub %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vpextrb $0, %xmm0, %eax
+; AVX-NEXT: # kill: def $al killed $al killed $eax
+; AVX-NEXT: retq
;
; AVX512-LABEL: test_v4i8:
; AVX512: # %bb.0:
-; AVX512-NEXT: vpbroadcastd {{.*#+}} xmm1 = [255,255,255,255]
-; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm2
-; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8],zero,zero,zero,xmm0[12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX512-NEXT: vpmaxud %xmm0, %xmm2, %xmm0
-; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm1
-; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
-; AVX512-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
-; AVX512-NEXT: vpmaxud %xmm0, %xmm1, %xmm0
+; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1
+; AVX512-NEXT: vpmaxub %xmm1, %xmm0, %xmm0
+; AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1
+; AVX512-NEXT: vpmaxub %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vpextrb $0, %xmm0, %eax
; AVX512-NEXT: # kill: def $al killed $al killed $eax
; AVX512-NEXT: retq
define i8 @test_v8i8(<8 x i8> %a0) {
; SSE2-LABEL: test_v8i8:
; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
-; SSE2-NEXT: pand %xmm1, %xmm0
-; SSE2-NEXT: pand {{.*}}(%rip), %xmm2
-; SSE2-NEXT: pmaxsw %xmm0, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,2,3]
-; SSE2-NEXT: pand %xmm1, %xmm2
-; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
-; SSE2-NEXT: pmaxsw %xmm2, %xmm0
-; SSE2-NEXT: pand %xmm0, %xmm1
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; SSE2-NEXT: pmaxub %xmm0, %xmm1
+; SSE2-NEXT: movdqa %xmm1, %xmm0
; SSE2-NEXT: psrld $16, %xmm0
-; SSE2-NEXT: pxor %xmm2, %xmm2
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
-; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
-; SSE2-NEXT: pmaxsw %xmm1, %xmm0
-; SSE2-NEXT: movd %xmm0, %eax
+; SSE2-NEXT: pmaxub %xmm1, %xmm0
+; SSE2-NEXT: movdqa %xmm0, %xmm1
+; SSE2-NEXT: psrlw $8, %xmm1
+; SSE2-NEXT: pmaxub %xmm0, %xmm1
+; SSE2-NEXT: movd %xmm1, %eax
; SSE2-NEXT: # kill: def $al killed $al killed $eax
; SSE2-NEXT: retq
;
; SSE41-LABEL: test_v8i8:
; SSE41: # %bb.0:
-; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
-; SSE41-NEXT: movdqa %xmm0, %xmm2
-; SSE41-NEXT: pand %xmm1, %xmm2
-; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[8],zero,xmm0[10],zero,xmm0[12],zero,xmm0[14],zero,zero,zero,zero,zero,zero,zero,zero,zero
-; SSE41-NEXT: pmaxuw %xmm2, %xmm0
-; SSE41-NEXT: movdqa %xmm0, %xmm2
-; SSE41-NEXT: pand %xmm1, %xmm2
-; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[4],zero,xmm0[6],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; SSE41-NEXT: pmaxuw %xmm2, %xmm0
-; SSE41-NEXT: pand %xmm0, %xmm1
+; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; SSE41-NEXT: pmaxub %xmm0, %xmm1
+; SSE41-NEXT: movdqa %xmm1, %xmm0
; SSE41-NEXT: psrld $16, %xmm0
-; SSE41-NEXT: pmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
-; SSE41-NEXT: pmaxuw %xmm1, %xmm0
-; SSE41-NEXT: pextrb $0, %xmm0, %eax
+; SSE41-NEXT: pmaxub %xmm1, %xmm0
+; SSE41-NEXT: movdqa %xmm0, %xmm1
+; SSE41-NEXT: psrlw $8, %xmm1
+; SSE41-NEXT: pmaxub %xmm0, %xmm1
+; SSE41-NEXT: pextrb $0, %xmm1, %eax
; SSE41-NEXT: # kill: def $al killed $al killed $eax
; SSE41-NEXT: retq
;
-; AVX1-LABEL: test_v8i8:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
-; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
-; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8],zero,xmm0[10],zero,xmm0[12],zero,xmm0[14],zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX1-NEXT: vpmaxuw %xmm0, %xmm2, %xmm0
-; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
-; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4],zero,xmm0[6],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX1-NEXT: vpmaxuw %xmm0, %xmm2, %xmm0
-; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm1
-; AVX1-NEXT: vpsrld $16, %xmm0, %xmm0
-; AVX1-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
-; AVX1-NEXT: vpmaxuw %xmm0, %xmm1, %xmm0
-; AVX1-NEXT: vpextrb $0, %xmm0, %eax
-; AVX1-NEXT: # kill: def $al killed $al killed $eax
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: test_v8i8:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm1 = [255,255,255,255,255,255,255,255]
-; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2
-; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8],zero,xmm0[10],zero,xmm0[12],zero,xmm0[14],zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX2-NEXT: vpmaxuw %xmm0, %xmm2, %xmm0
-; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2
-; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4],zero,xmm0[6],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX2-NEXT: vpmaxuw %xmm0, %xmm2, %xmm0
-; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm1
-; AVX2-NEXT: vpsrld $16, %xmm0, %xmm0
-; AVX2-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
-; AVX2-NEXT: vpmaxuw %xmm0, %xmm1, %xmm0
-; AVX2-NEXT: vpextrb $0, %xmm0, %eax
-; AVX2-NEXT: # kill: def $al killed $al killed $eax
-; AVX2-NEXT: retq
+; AVX-LABEL: test_v8i8:
+; AVX: # %bb.0:
+; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX-NEXT: vpmaxub %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vpsrld $16, %xmm0, %xmm1
+; AVX-NEXT: vpmaxub %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vpsrlw $8, %xmm0, %xmm1
+; AVX-NEXT: vpmaxub %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vpextrb $0, %xmm0, %eax
+; AVX-NEXT: # kill: def $al killed $al killed $eax
+; AVX-NEXT: retq
;
; AVX512-LABEL: test_v8i8:
; AVX512: # %bb.0:
-; AVX512-NEXT: vpbroadcastw {{.*#+}} xmm1 = [255,255,255,255,255,255,255,255]
-; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm2
-; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8],zero,xmm0[10],zero,xmm0[12],zero,xmm0[14],zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX512-NEXT: vpmaxuw %xmm0, %xmm2, %xmm0
-; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm2
-; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4],zero,xmm0[6],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX512-NEXT: vpmaxuw %xmm0, %xmm2, %xmm0
-; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm1
-; AVX512-NEXT: vpsrld $16, %xmm0, %xmm0
-; AVX512-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
-; AVX512-NEXT: vpmaxuw %xmm0, %xmm1, %xmm0
+; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX512-NEXT: vpmaxub %xmm1, %xmm0, %xmm0
+; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1
+; AVX512-NEXT: vpmaxub %xmm1, %xmm0, %xmm0
+; AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1
+; AVX512-NEXT: vpmaxub %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vpextrb $0, %xmm0, %eax
; AVX512-NEXT: # kill: def $al killed $al killed $eax
; AVX512-NEXT: retq
define i32 @test_v2i32(<2 x i32> %a0) {
; SSE2-LABEL: test_v2i32:
; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [4294967295,0,4294967295,0]
-; SSE2-NEXT: pand %xmm0, %xmm1
-; SSE2-NEXT: pxor %xmm2, %xmm2
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
; SSE2-NEXT: movdqa %xmm0, %xmm3
-; SSE2-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm2[2],xmm3[3],xmm2[3]
-; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648]
-; SSE2-NEXT: pxor %xmm2, %xmm1
-; SSE2-NEXT: pxor %xmm3, %xmm2
-; SSE2-NEXT: movdqa %xmm2, %xmm4
-; SSE2-NEXT: pcmpgtd %xmm1, %xmm4
-; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
-; SSE2-NEXT: pcmpeqd %xmm1, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3]
-; SSE2-NEXT: pand %xmm5, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,1,3,3]
-; SSE2-NEXT: por %xmm1, %xmm2
+; SSE2-NEXT: pxor %xmm2, %xmm3
+; SSE2-NEXT: pxor %xmm1, %xmm2
+; SSE2-NEXT: pcmpgtd %xmm3, %xmm2
; SSE2-NEXT: pand %xmm2, %xmm0
-; SSE2-NEXT: pandn %xmm3, %xmm2
+; SSE2-NEXT: pandn %xmm1, %xmm2
; SSE2-NEXT: por %xmm0, %xmm2
; SSE2-NEXT: movd %xmm2, %eax
; SSE2-NEXT: retq
;
; SSE41-LABEL: test_v2i32:
; SSE41: # %bb.0:
-; SSE41-NEXT: pxor %xmm1, %xmm1
-; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
-; SSE41-NEXT: pmovzxdq {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero
-; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [2147483648,2147483648]
-; SSE41-NEXT: movdqa %xmm1, %xmm0
-; SSE41-NEXT: pxor %xmm3, %xmm0
-; SSE41-NEXT: pxor %xmm2, %xmm3
-; SSE41-NEXT: movdqa %xmm3, %xmm4
-; SSE41-NEXT: pcmpeqd %xmm0, %xmm4
-; SSE41-NEXT: pcmpgtd %xmm0, %xmm3
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,0,2,2]
-; SSE41-NEXT: pand %xmm4, %xmm0
-; SSE41-NEXT: por %xmm3, %xmm0
-; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm2
-; SSE41-NEXT: movd %xmm2, %eax
+; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; SSE41-NEXT: pminud %xmm0, %xmm1
+; SSE41-NEXT: movd %xmm1, %eax
; SSE41-NEXT: retq
;
-; AVX1-LABEL: test_v2i32:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
-; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; AVX1-NEXT: vpcmpgtq %xmm2, %xmm0, %xmm1
-; AVX1-NEXT: vblendvpd %xmm1, %xmm2, %xmm0, %xmm0
-; AVX1-NEXT: vmovd %xmm0, %eax
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: test_v2i32:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX2-NEXT: vpblendd {{.*#+}} xmm2 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
-; AVX2-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; AVX2-NEXT: vpcmpgtq %xmm2, %xmm0, %xmm1
-; AVX2-NEXT: vblendvpd %xmm1, %xmm2, %xmm0, %xmm0
-; AVX2-NEXT: vmovd %xmm0, %eax
-; AVX2-NEXT: retq
-;
-; AVX512BW-LABEL: test_v2i32:
-; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX512BW-NEXT: vpblendd {{.*#+}} xmm2 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
-; AVX512BW-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; AVX512BW-NEXT: vpminuq %zmm0, %zmm2, %zmm0
-; AVX512BW-NEXT: vmovd %xmm0, %eax
-; AVX512BW-NEXT: vzeroupper
-; AVX512BW-NEXT: retq
+; AVX-LABEL: test_v2i32:
+; AVX: # %bb.0:
+; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX-NEXT: vpminud %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vmovd %xmm0, %eax
+; AVX-NEXT: retq
;
-; AVX512VL-LABEL: test_v2i32:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX512VL-NEXT: vpblendd {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
-; AVX512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
-; AVX512VL-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
-; AVX512VL-NEXT: vpminuq %xmm0, %xmm1, %xmm0
-; AVX512VL-NEXT: vmovd %xmm0, %eax
-; AVX512VL-NEXT: retq
+; AVX512-LABEL: test_v2i32:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX512-NEXT: vpminud %xmm1, %xmm0, %xmm0
+; AVX512-NEXT: vmovd %xmm0, %eax
+; AVX512-NEXT: retq
%1 = call i32 @llvm.experimental.vector.reduce.umin.v2i32(<2 x i32> %a0)
ret i32 %1
}
define i16 @test_v2i16(<2 x i16> %a0) {
; SSE2-LABEL: test_v2i16:
; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [65535,0,0,0,65535,0,0,0]
-; SSE2-NEXT: pand %xmm0, %xmm1
-; SSE2-NEXT: pxor %xmm2, %xmm2
-; SSE2-NEXT: movdqa %xmm0, %xmm3
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
-; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
-; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648]
-; SSE2-NEXT: por %xmm2, %xmm1
-; SSE2-NEXT: por %xmm3, %xmm2
-; SSE2-NEXT: movdqa %xmm2, %xmm4
-; SSE2-NEXT: pcmpgtd %xmm1, %xmm4
-; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
-; SSE2-NEXT: pcmpeqd %xmm1, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3]
-; SSE2-NEXT: pand %xmm5, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,1,3,3]
-; SSE2-NEXT: por %xmm1, %xmm2
-; SSE2-NEXT: pand %xmm2, %xmm0
-; SSE2-NEXT: pandn %xmm3, %xmm2
-; SSE2-NEXT: por %xmm0, %xmm2
-; SSE2-NEXT: movd %xmm2, %eax
+; SSE2-NEXT: movdqa %xmm0, %xmm1
+; SSE2-NEXT: psrld $16, %xmm1
+; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [32768,32768,32768,32768,32768,32768,32768,32768]
+; SSE2-NEXT: pxor %xmm2, %xmm0
+; SSE2-NEXT: pxor %xmm2, %xmm1
+; SSE2-NEXT: pminsw %xmm0, %xmm1
+; SSE2-NEXT: movd %xmm1, %eax
+; SSE2-NEXT: xorl $32768, %eax # imm = 0x8000
; SSE2-NEXT: # kill: def $ax killed $ax killed $eax
; SSE2-NEXT: retq
;
; SSE41-LABEL: test_v2i16:
; SSE41: # %bb.0:
-; SSE41-NEXT: pxor %xmm1, %xmm1
-; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3],xmm0[4],xmm1[5,6,7]
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
-; SSE41-NEXT: pmovzxwq {{.*#+}} xmm2 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
-; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [2147483648,2147483648]
-; SSE41-NEXT: movdqa %xmm1, %xmm0
-; SSE41-NEXT: por %xmm3, %xmm0
-; SSE41-NEXT: por %xmm2, %xmm3
-; SSE41-NEXT: movdqa %xmm3, %xmm4
-; SSE41-NEXT: pcmpeqd %xmm0, %xmm4
-; SSE41-NEXT: pcmpgtd %xmm0, %xmm3
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,0,2,2]
-; SSE41-NEXT: pand %xmm4, %xmm0
-; SSE41-NEXT: por %xmm3, %xmm0
-; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm2
-; SSE41-NEXT: movd %xmm2, %eax
+; SSE41-NEXT: movdqa %xmm0, %xmm1
+; SSE41-NEXT: psrld $16, %xmm1
+; SSE41-NEXT: pminuw %xmm0, %xmm1
+; SSE41-NEXT: movd %xmm1, %eax
; SSE41-NEXT: # kill: def $ax killed $ax killed $eax
; SSE41-NEXT: retq
;
; AVX-LABEL: test_v2i16:
; AVX: # %bb.0:
-; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3],xmm0[4],xmm1[5,6,7]
-; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
-; AVX-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
-; AVX-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2
-; AVX-NEXT: vblendvpd %xmm2, %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vpsrld $16, %xmm0, %xmm1
+; AVX-NEXT: vpminuw %xmm1, %xmm0, %xmm0
; AVX-NEXT: vmovd %xmm0, %eax
; AVX-NEXT: # kill: def $ax killed $ax killed $eax
; AVX-NEXT: retq
;
-; AVX512BW-LABEL: test_v2i16:
-; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX512BW-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3],xmm0[4],xmm1[5,6,7]
-; AVX512BW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
-; AVX512BW-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
-; AVX512BW-NEXT: vpminuq %zmm0, %zmm1, %zmm0
-; AVX512BW-NEXT: vmovd %xmm0, %eax
-; AVX512BW-NEXT: # kill: def $ax killed $ax killed $eax
-; AVX512BW-NEXT: vzeroupper
-; AVX512BW-NEXT: retq
-;
-; AVX512VL-LABEL: test_v2i16:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX512VL-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3],xmm0[4],xmm1[5,6,7]
-; AVX512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
-; AVX512VL-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
-; AVX512VL-NEXT: vpminuq %xmm0, %xmm1, %xmm0
-; AVX512VL-NEXT: vmovd %xmm0, %eax
-; AVX512VL-NEXT: # kill: def $ax killed $ax killed $eax
-; AVX512VL-NEXT: retq
+; AVX512-LABEL: test_v2i16:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1
+; AVX512-NEXT: vpminuw %xmm1, %xmm0, %xmm0
+; AVX512-NEXT: vmovd %xmm0, %eax
+; AVX512-NEXT: # kill: def $ax killed $ax killed $eax
+; AVX512-NEXT: retq
%1 = call i16 @llvm.experimental.vector.reduce.umin.v2i16(<2 x i16> %a0)
ret i16 %1
}
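; As with umax, SSE2 lacks an unsigned word min, so it biases both values by
; 0x8000, uses pminsw, and undoes the bias on the scalar result; SSE4.1 and
; AVX use pminuw directly on the widened v8i16.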
define i16 @test_v4i16(<4 x i16> %a0) {
; SSE2-LABEL: test_v4i16:
; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [65535,0,65535,0,65535,0,65535,0]
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
-; SSE2-NEXT: pand %xmm1, %xmm0
-; SSE2-NEXT: pand {{.*}}(%rip), %xmm2
-; SSE2-NEXT: movdqa %xmm2, %xmm3
-; SSE2-NEXT: pcmpgtd %xmm0, %xmm3
-; SSE2-NEXT: pand %xmm3, %xmm0
-; SSE2-NEXT: pandn %xmm2, %xmm3
-; SSE2-NEXT: por %xmm0, %xmm3
-; SSE2-NEXT: pand %xmm3, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,2,3]
-; SSE2-NEXT: pxor %xmm2, %xmm2
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
-; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
-; SSE2-NEXT: movdqa %xmm0, %xmm2
-; SSE2-NEXT: pcmpgtd %xmm1, %xmm2
-; SSE2-NEXT: pand %xmm2, %xmm3
-; SSE2-NEXT: pandn %xmm0, %xmm2
-; SSE2-NEXT: por %xmm3, %xmm2
-; SSE2-NEXT: movd %xmm2, %eax
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [32768,32768,32768,32768,32768,32768,32768,32768]
+; SSE2-NEXT: pxor %xmm2, %xmm0
+; SSE2-NEXT: pxor %xmm2, %xmm1
+; SSE2-NEXT: pminsw %xmm0, %xmm1
+; SSE2-NEXT: movdqa %xmm1, %xmm0
+; SSE2-NEXT: pxor %xmm2, %xmm0
+; SSE2-NEXT: psrld $16, %xmm0
+; SSE2-NEXT: pxor %xmm2, %xmm0
+; SSE2-NEXT: pminsw %xmm1, %xmm0
+; SSE2-NEXT: movd %xmm0, %eax
+; SSE2-NEXT: xorl $32768, %eax # imm = 0x8000
; SSE2-NEXT: # kill: def $ax killed $ax killed $eax
; SSE2-NEXT: retq
;
; SSE41-LABEL: test_v4i16:
; SSE41: # %bb.0:
-; SSE41-NEXT: pxor %xmm1, %xmm1
-; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
-; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7]
-; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0],xmm1[1],xmm2[2],xmm1[3,4,5,6,7]
-; SSE41-NEXT: pminud %xmm0, %xmm2
-; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3],xmm2[4],xmm1[5],xmm2[6],xmm1[7]
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,2,3]
-; SSE41-NEXT: pmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
-; SSE41-NEXT: pminud %xmm1, %xmm0
+; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; SSE41-NEXT: pminuw %xmm0, %xmm1
+; SSE41-NEXT: movdqa %xmm1, %xmm0
+; SSE41-NEXT: psrld $16, %xmm0
+; SSE41-NEXT: pminuw %xmm1, %xmm0
; SSE41-NEXT: movd %xmm0, %eax
; SSE41-NEXT: # kill: def $ax killed $ax killed $eax
; SSE41-NEXT: retq
;
; AVX-LABEL: test_v4i16:
; AVX: # %bb.0:
-; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7]
-; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
-; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3,4,5,6,7]
-; AVX-NEXT: vpminud %xmm0, %xmm2, %xmm0
-; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7]
-; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
-; AVX-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
-; AVX-NEXT: vpminud %xmm0, %xmm1, %xmm0
+; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX-NEXT: vpminuw %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vpsrld $16, %xmm0, %xmm1
+; AVX-NEXT: vpminuw %xmm1, %xmm0, %xmm0
; AVX-NEXT: vmovd %xmm0, %eax
; AVX-NEXT: # kill: def $ax killed $ax killed $eax
; AVX-NEXT: retq
;
; AVX512-LABEL: test_v4i16:
; AVX512: # %bb.0:
-; AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX512-NEXT: vpblendw {{.*#+}} xmm2 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7]
-; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
-; AVX512-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3,4,5,6,7]
-; AVX512-NEXT: vpminud %xmm0, %xmm2, %xmm0
-; AVX512-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7]
-; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
-; AVX512-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
-; AVX512-NEXT: vpminud %xmm0, %xmm1, %xmm0
+; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX512-NEXT: vpminuw %xmm1, %xmm0, %xmm0
+; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1
+; AVX512-NEXT: vpminuw %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vmovd %xmm0, %eax
; AVX512-NEXT: # kill: def $ax killed $ax killed $eax
; AVX512-NEXT: retq
define i8 @test_v2i8(<2 x i8> %a0) {
; SSE2-LABEL: test_v2i8:
; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
-; SSE2-NEXT: pand %xmm0, %xmm1
-; SSE2-NEXT: pxor %xmm2, %xmm2
-; SSE2-NEXT: movdqa %xmm0, %xmm3
-; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
-; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
-; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648]
-; SSE2-NEXT: por %xmm2, %xmm1
-; SSE2-NEXT: por %xmm3, %xmm2
-; SSE2-NEXT: movdqa %xmm2, %xmm4
-; SSE2-NEXT: pcmpgtd %xmm1, %xmm4
-; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
-; SSE2-NEXT: pcmpeqd %xmm1, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3]
-; SSE2-NEXT: pand %xmm5, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,1,3,3]
-; SSE2-NEXT: por %xmm1, %xmm2
-; SSE2-NEXT: pand %xmm2, %xmm0
-; SSE2-NEXT: pandn %xmm3, %xmm2
-; SSE2-NEXT: por %xmm0, %xmm2
-; SSE2-NEXT: movd %xmm2, %eax
+; SSE2-NEXT: movdqa %xmm0, %xmm1
+; SSE2-NEXT: psrlw $8, %xmm1
+; SSE2-NEXT: pminub %xmm0, %xmm1
+; SSE2-NEXT: movd %xmm1, %eax
; SSE2-NEXT: # kill: def $al killed $al killed $eax
; SSE2-NEXT: retq
;
; SSE41-LABEL: test_v2i8:
; SSE41: # %bb.0:
; SSE41-NEXT: movdqa %xmm0, %xmm1
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
-; SSE41-NEXT: pand {{.*}}(%rip), %xmm1
-; SSE41-NEXT: pmovzxbq {{.*#+}} xmm2 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
-; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [2147483648,2147483648]
-; SSE41-NEXT: movdqa %xmm1, %xmm0
-; SSE41-NEXT: por %xmm3, %xmm0
-; SSE41-NEXT: por %xmm2, %xmm3
-; SSE41-NEXT: movdqa %xmm3, %xmm4
-; SSE41-NEXT: pcmpeqd %xmm0, %xmm4
-; SSE41-NEXT: pcmpgtd %xmm0, %xmm3
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,0,2,2]
-; SSE41-NEXT: pand %xmm4, %xmm0
-; SSE41-NEXT: por %xmm3, %xmm0
-; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm2
-; SSE41-NEXT: pextrb $0, %xmm2, %eax
+; SSE41-NEXT: psrlw $8, %xmm1
+; SSE41-NEXT: pminub %xmm0, %xmm1
+; SSE41-NEXT: pextrb $0, %xmm1, %eax
; SSE41-NEXT: # kill: def $al killed $al killed $eax
; SSE41-NEXT: retq
;
; AVX-LABEL: test_v2i8:
; AVX: # %bb.0:
-; AVX-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm1
-; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
-; AVX-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
-; AVX-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2
-; AVX-NEXT: vblendvpd %xmm2, %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vpsrlw $8, %xmm0, %xmm1
+; AVX-NEXT: vpminub %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpextrb $0, %xmm0, %eax
; AVX-NEXT: # kill: def $al killed $al killed $eax
; AVX-NEXT: retq
;
-; AVX512BW-LABEL: test_v2i8:
-; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm1
-; AVX512BW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
-; AVX512BW-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
-; AVX512BW-NEXT: vpminuq %zmm0, %zmm1, %zmm0
-; AVX512BW-NEXT: vpextrb $0, %xmm0, %eax
-; AVX512BW-NEXT: # kill: def $al killed $al killed $eax
-; AVX512BW-NEXT: vzeroupper
-; AVX512BW-NEXT: retq
-;
-; AVX512VL-LABEL: test_v2i8:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm1
-; AVX512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
-; AVX512VL-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
-; AVX512VL-NEXT: vpminuq %xmm0, %xmm1, %xmm0
-; AVX512VL-NEXT: vpextrb $0, %xmm0, %eax
-; AVX512VL-NEXT: # kill: def $al killed $al killed $eax
-; AVX512VL-NEXT: retq
+; AVX512-LABEL: test_v2i8:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1
+; AVX512-NEXT: vpminub %xmm1, %xmm0, %xmm0
+; AVX512-NEXT: vpextrb $0, %xmm0, %eax
+; AVX512-NEXT: # kill: def $al killed $al killed $eax
+; AVX512-NEXT: retq
%1 = call i8 @llvm.experimental.vector.reduce.umin.v2i8(<2 x i8> %a0)
ret i8 %1
}
define i8 @test_v4i8(<4 x i8> %a0) {
; SSE2-LABEL: test_v4i8:
; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
-; SSE2-NEXT: pand %xmm1, %xmm0
-; SSE2-NEXT: pand {{.*}}(%rip), %xmm2
-; SSE2-NEXT: movdqa %xmm2, %xmm3
-; SSE2-NEXT: pcmpgtd %xmm0, %xmm3
-; SSE2-NEXT: pand %xmm3, %xmm0
-; SSE2-NEXT: pandn %xmm2, %xmm3
-; SSE2-NEXT: por %xmm0, %xmm3
-; SSE2-NEXT: pand %xmm3, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,2,3]
-; SSE2-NEXT: pxor %xmm2, %xmm2
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
-; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
-; SSE2-NEXT: movdqa %xmm0, %xmm2
-; SSE2-NEXT: pcmpgtd %xmm1, %xmm2
-; SSE2-NEXT: pand %xmm2, %xmm3
-; SSE2-NEXT: pandn %xmm0, %xmm2
-; SSE2-NEXT: por %xmm3, %xmm2
-; SSE2-NEXT: movd %xmm2, %eax
+; SSE2-NEXT: movdqa %xmm0, %xmm1
+; SSE2-NEXT: psrld $16, %xmm1
+; SSE2-NEXT: pminub %xmm0, %xmm1
+; SSE2-NEXT: movdqa %xmm1, %xmm0
+; SSE2-NEXT: psrlw $8, %xmm0
+; SSE2-NEXT: pminub %xmm1, %xmm0
+; SSE2-NEXT: movd %xmm0, %eax
; SSE2-NEXT: # kill: def $al killed $al killed $eax
; SSE2-NEXT: retq
;
; SSE41-LABEL: test_v4i8:
; SSE41: # %bb.0:
-; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
-; SSE41-NEXT: movdqa %xmm0, %xmm2
-; SSE41-NEXT: pand %xmm1, %xmm2
-; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[8],zero,zero,zero,xmm0[12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; SSE41-NEXT: pminud %xmm2, %xmm0
-; SSE41-NEXT: pand %xmm0, %xmm1
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
-; SSE41-NEXT: pmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
-; SSE41-NEXT: pminud %xmm1, %xmm0
+; SSE41-NEXT: movdqa %xmm0, %xmm1
+; SSE41-NEXT: psrld $16, %xmm1
+; SSE41-NEXT: pminub %xmm0, %xmm1
+; SSE41-NEXT: movdqa %xmm1, %xmm0
+; SSE41-NEXT: psrlw $8, %xmm0
+; SSE41-NEXT: pminub %xmm1, %xmm0
; SSE41-NEXT: pextrb $0, %xmm0, %eax
; SSE41-NEXT: # kill: def $al killed $al killed $eax
; SSE41-NEXT: retq
;
-; AVX1-LABEL: test_v4i8:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [3.57331108E-43,3.57331108E-43,3.57331108E-43,3.57331108E-43]
-; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm2
-; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8],zero,zero,zero,xmm0[12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX1-NEXT: vpminud %xmm0, %xmm2, %xmm0
-; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm1
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
-; AVX1-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
-; AVX1-NEXT: vpminud %xmm0, %xmm1, %xmm0
-; AVX1-NEXT: vpextrb $0, %xmm0, %eax
-; AVX1-NEXT: # kill: def $al killed $al killed $eax
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: test_v4i8:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [255,255,255,255]
-; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2
-; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8],zero,zero,zero,xmm0[12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX2-NEXT: vpminud %xmm0, %xmm2, %xmm0
-; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm1
-; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
-; AVX2-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
-; AVX2-NEXT: vpminud %xmm0, %xmm1, %xmm0
-; AVX2-NEXT: vpextrb $0, %xmm0, %eax
-; AVX2-NEXT: # kill: def $al killed $al killed $eax
-; AVX2-NEXT: retq
+; AVX-LABEL: test_v4i8:
+; AVX: # %bb.0:
+; AVX-NEXT: vpsrld $16, %xmm0, %xmm1
+; AVX-NEXT: vpminub %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vpsrlw $8, %xmm0, %xmm1
+; AVX-NEXT: vpminub %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vpextrb $0, %xmm0, %eax
+; AVX-NEXT: # kill: def $al killed $al killed $eax
+; AVX-NEXT: retq
;
; AVX512-LABEL: test_v4i8:
; AVX512: # %bb.0:
-; AVX512-NEXT: vpbroadcastd {{.*#+}} xmm1 = [255,255,255,255]
-; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm2
-; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8],zero,zero,zero,xmm0[12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX512-NEXT: vpminud %xmm0, %xmm2, %xmm0
-; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm1
-; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
-; AVX512-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
-; AVX512-NEXT: vpminud %xmm0, %xmm1, %xmm0
+; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1
+; AVX512-NEXT: vpminub %xmm1, %xmm0, %xmm0
+; AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1
+; AVX512-NEXT: vpminub %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vpextrb $0, %xmm0, %eax
; AVX512-NEXT: # kill: def $al killed $al killed $eax
; AVX512-NEXT: retq
define i8 @test_v8i8(<8 x i8> %a0) {
; SSE2-LABEL: test_v8i8:
; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
-; SSE2-NEXT: pand %xmm1, %xmm0
-; SSE2-NEXT: pand {{.*}}(%rip), %xmm2
-; SSE2-NEXT: pminsw %xmm0, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,2,3]
-; SSE2-NEXT: pand %xmm1, %xmm2
-; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
-; SSE2-NEXT: pminsw %xmm2, %xmm0
-; SSE2-NEXT: pand %xmm0, %xmm1
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; SSE2-NEXT: pminub %xmm0, %xmm1
+; SSE2-NEXT: movdqa %xmm1, %xmm0
; SSE2-NEXT: psrld $16, %xmm0
-; SSE2-NEXT: pxor %xmm2, %xmm2
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
-; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
-; SSE2-NEXT: pminsw %xmm1, %xmm0
-; SSE2-NEXT: movd %xmm0, %eax
+; SSE2-NEXT: pminub %xmm1, %xmm0
+; SSE2-NEXT: movdqa %xmm0, %xmm1
+; SSE2-NEXT: psrlw $8, %xmm1
+; SSE2-NEXT: pminub %xmm0, %xmm1
+; SSE2-NEXT: movd %xmm1, %eax
; SSE2-NEXT: # kill: def $al killed $al killed $eax
; SSE2-NEXT: retq
;
; SSE41-LABEL: test_v8i8:
; SSE41: # %bb.0:
-; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
-; SSE41-NEXT: movdqa %xmm0, %xmm2
-; SSE41-NEXT: pand %xmm1, %xmm2
-; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[8],zero,xmm0[10],zero,xmm0[12],zero,xmm0[14],zero,zero,zero,zero,zero,zero,zero,zero,zero
-; SSE41-NEXT: pminuw %xmm2, %xmm0
-; SSE41-NEXT: movdqa %xmm0, %xmm2
-; SSE41-NEXT: pand %xmm1, %xmm2
-; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[4],zero,xmm0[6],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; SSE41-NEXT: pminuw %xmm2, %xmm0
-; SSE41-NEXT: pand %xmm0, %xmm1
+; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; SSE41-NEXT: pminub %xmm0, %xmm1
+; SSE41-NEXT: movdqa %xmm1, %xmm0
; SSE41-NEXT: psrld $16, %xmm0
-; SSE41-NEXT: pmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
-; SSE41-NEXT: pminuw %xmm1, %xmm0
-; SSE41-NEXT: pextrb $0, %xmm0, %eax
+; SSE41-NEXT: pminub %xmm1, %xmm0
+; SSE41-NEXT: movdqa %xmm0, %xmm1
+; SSE41-NEXT: psrlw $8, %xmm1
+; SSE41-NEXT: pminub %xmm0, %xmm1
+; SSE41-NEXT: pextrb $0, %xmm1, %eax
; SSE41-NEXT: # kill: def $al killed $al killed $eax
; SSE41-NEXT: retq
;
-; AVX1-LABEL: test_v8i8:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
-; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
-; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8],zero,xmm0[10],zero,xmm0[12],zero,xmm0[14],zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX1-NEXT: vpminuw %xmm0, %xmm2, %xmm0
-; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
-; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4],zero,xmm0[6],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX1-NEXT: vpminuw %xmm0, %xmm2, %xmm0
-; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm1
-; AVX1-NEXT: vpsrld $16, %xmm0, %xmm0
-; AVX1-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
-; AVX1-NEXT: vpminuw %xmm0, %xmm1, %xmm0
-; AVX1-NEXT: vpextrb $0, %xmm0, %eax
-; AVX1-NEXT: # kill: def $al killed $al killed $eax
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: test_v8i8:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm1 = [255,255,255,255,255,255,255,255]
-; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2
-; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8],zero,xmm0[10],zero,xmm0[12],zero,xmm0[14],zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX2-NEXT: vpminuw %xmm0, %xmm2, %xmm0
-; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2
-; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4],zero,xmm0[6],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX2-NEXT: vpminuw %xmm0, %xmm2, %xmm0
-; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm1
-; AVX2-NEXT: vpsrld $16, %xmm0, %xmm0
-; AVX2-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
-; AVX2-NEXT: vpminuw %xmm0, %xmm1, %xmm0
-; AVX2-NEXT: vpextrb $0, %xmm0, %eax
-; AVX2-NEXT: # kill: def $al killed $al killed $eax
-; AVX2-NEXT: retq
+; AVX-LABEL: test_v8i8:
+; AVX: # %bb.0:
+; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX-NEXT: vpminub %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vpsrld $16, %xmm0, %xmm1
+; AVX-NEXT: vpminub %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vpsrlw $8, %xmm0, %xmm1
+; AVX-NEXT: vpminub %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vpextrb $0, %xmm0, %eax
+; AVX-NEXT: # kill: def $al killed $al killed $eax
+; AVX-NEXT: retq
;
; AVX512-LABEL: test_v8i8:
; AVX512: # %bb.0:
-; AVX512-NEXT: vpbroadcastw {{.*#+}} xmm1 = [255,255,255,255,255,255,255,255]
-; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm2
-; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8],zero,xmm0[10],zero,xmm0[12],zero,xmm0[14],zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX512-NEXT: vpminuw %xmm0, %xmm2, %xmm0
-; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm2
-; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4],zero,xmm0[6],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX512-NEXT: vpminuw %xmm0, %xmm2, %xmm0
-; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm1
-; AVX512-NEXT: vpsrld $16, %xmm0, %xmm0
-; AVX512-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
-; AVX512-NEXT: vpminuw %xmm0, %xmm1, %xmm0
+; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX512-NEXT: vpminub %xmm1, %xmm0, %xmm0
+; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1
+; AVX512-NEXT: vpminub %xmm1, %xmm0, %xmm0
+; AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1
+; AVX512-NEXT: vpminub %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vpextrb $0, %xmm0, %eax
; AVX512-NEXT: # kill: def $al killed $al killed $eax
; AVX512-NEXT: retq
}
define i1 @trunc_v8i16_v8i1(<8 x i8>) {
-; SSE-LABEL: trunc_v8i16_v8i1:
-; SSE: # %bb.0:
-; SSE-NEXT: psllw $15, %xmm0
-; SSE-NEXT: packsswb %xmm0, %xmm0
-; SSE-NEXT: pmovmskb %xmm0, %eax
-; SSE-NEXT: xorb $0, %al
-; SSE-NEXT: setnp %al
-; SSE-NEXT: retq
+; SSE2-LABEL: trunc_v8i16_v8i1:
+; SSE2: # %bb.0:
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE2-NEXT: psllw $15, %xmm0
+; SSE2-NEXT: packsswb %xmm0, %xmm0
+; SSE2-NEXT: pmovmskb %xmm0, %eax
+; SSE2-NEXT: xorb $0, %al
+; SSE2-NEXT: setnp %al
+; SSE2-NEXT: retq
+;
+; SSE41-LABEL: trunc_v8i16_v8i1:
+; SSE41: # %bb.0:
+; SSE41-NEXT: pmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; SSE41-NEXT: psllw $15, %xmm0
+; SSE41-NEXT: packsswb %xmm0, %xmm0
+; SSE41-NEXT: pmovmskb %xmm0, %eax
+; SSE41-NEXT: xorb $0, %al
+; SSE41-NEXT: setnp %al
+; SSE41-NEXT: retq
;
; AVX-LABEL: trunc_v8i16_v8i1:
; AVX: # %bb.0:
+; AVX-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX-NEXT: vpsllw $15, %xmm0, %xmm0
; AVX-NEXT: vpacksswb %xmm0, %xmm0, %xmm0
; AVX-NEXT: vpmovmskb %xmm0, %eax
;
; AVX512F-LABEL: trunc_v8i16_v8i1:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vpmovsxwq %xmm0, %zmm0
-; AVX512F-NEXT: vpsllq $63, %zmm0, %zmm0
-; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k0
+; AVX512F-NEXT: vpmovsxbd %xmm0, %zmm0
+; AVX512F-NEXT: vpslld $31, %zmm0, %zmm0
+; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k0
; AVX512F-NEXT: kmovw %k0, %eax
; AVX512F-NEXT: xorb $0, %al
; AVX512F-NEXT: setnp %al
;
; AVX512BW-LABEL: trunc_v8i16_v8i1:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vpsllw $15, %xmm0, %xmm0
-; AVX512BW-NEXT: vpmovw2m %zmm0, %k0
+; AVX512BW-NEXT: vpsllw $7, %xmm0, %xmm0
+; AVX512BW-NEXT: vpmovb2m %zmm0, %k0
; AVX512BW-NEXT: kmovd %k0, %eax
; AVX512BW-NEXT: xorb $0, %al
; AVX512BW-NEXT: setnp %al
;
; AVX512VL-LABEL: trunc_v8i16_v8i1:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpsllw $15, %xmm0, %xmm0
-; AVX512VL-NEXT: vpmovw2m %xmm0, %k0
+; AVX512VL-NEXT: vpsllw $7, %xmm0, %xmm0
+; AVX512VL-NEXT: vpmovb2m %xmm0, %k0
; AVX512VL-NEXT: kmovd %k0, %eax
; AVX512VL-NEXT: xorb $0, %al
; AVX512VL-NEXT: setnp %al
}
define i1 @icmp_v8i16_v8i1(<8 x i8>) {
-; SSE-LABEL: icmp_v8i16_v8i1:
-; SSE: # %bb.0:
-; SSE-NEXT: pand {{.*}}(%rip), %xmm0
-; SSE-NEXT: pxor %xmm1, %xmm1
-; SSE-NEXT: pcmpeqw %xmm0, %xmm1
-; SSE-NEXT: packsswb %xmm0, %xmm1
-; SSE-NEXT: pmovmskb %xmm1, %eax
-; SSE-NEXT: xorb $0, %al
-; SSE-NEXT: setnp %al
-; SSE-NEXT: retq
+; SSE2-LABEL: icmp_v8i16_v8i1:
+; SSE2: # %bb.0:
+; SSE2-NEXT: pxor %xmm1, %xmm1
+; SSE2-NEXT: pcmpeqb %xmm0, %xmm1
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; SSE2-NEXT: packsswb %xmm0, %xmm0
+; SSE2-NEXT: pmovmskb %xmm0, %eax
+; SSE2-NEXT: xorb $0, %al
+; SSE2-NEXT: setnp %al
+; SSE2-NEXT: retq
+;
+; SSE41-LABEL: icmp_v8i16_v8i1:
+; SSE41: # %bb.0:
+; SSE41-NEXT: pxor %xmm1, %xmm1
+; SSE41-NEXT: pcmpeqb %xmm0, %xmm1
+; SSE41-NEXT: pmovsxbw %xmm1, %xmm0
+; SSE41-NEXT: packsswb %xmm0, %xmm0
+; SSE41-NEXT: pmovmskb %xmm0, %eax
+; SSE41-NEXT: xorb $0, %al
+; SSE41-NEXT: setnp %al
+; SSE41-NEXT: retq
;
; AVX-LABEL: icmp_v8i16_v8i1:
; AVX: # %bb.0:
-; AVX-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vpmovsxbw %xmm0, %xmm0
; AVX-NEXT: vpacksswb %xmm0, %xmm0, %xmm0
; AVX-NEXT: vpmovmskb %xmm0, %eax
; AVX-NEXT: xorb $0, %al
;
; AVX512F-LABEL: icmp_v8i16_v8i1:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX512F-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm0
-; AVX512F-NEXT: vpmovsxwq %xmm0, %zmm0
-; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k0
+; AVX512F-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm0
+; AVX512F-NEXT: vpmovsxbd %xmm0, %zmm0
+; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k0
; AVX512F-NEXT: kmovw %k0, %eax
; AVX512F-NEXT: xorb $0, %al
; AVX512F-NEXT: setnp %al
; AVX512BW-LABEL: icmp_v8i16_v8i1:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
-; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm1 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
-; AVX512BW-NEXT: vptestnmw %zmm1, %zmm0, %k0
+; AVX512BW-NEXT: vptestnmb %zmm0, %zmm0, %k0
; AVX512BW-NEXT: kmovd %k0, %eax
; AVX512BW-NEXT: xorb $0, %al
; AVX512BW-NEXT: setnp %al
;
; AVX512VL-LABEL: icmp_v8i16_v8i1:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vptestnmw {{.*}}(%rip), %xmm0, %k0
+; AVX512VL-NEXT: vptestnmb %xmm0, %xmm0, %k0
; AVX512VL-NEXT: kmovd %k0, %eax
; AVX512VL-NEXT: xorb $0, %al
; AVX512VL-NEXT: setnp %al
define i32 @test_v2i32(<2 x i32> %a0) {
; SSE-LABEL: test_v2i32:
; SSE: # %bb.0:
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; SSE-NEXT: pxor %xmm0, %xmm1
; SSE-NEXT: movd %xmm1, %eax
; SSE-NEXT: retq
;
; AVX-LABEL: test_v2i32:
; AVX: # %bb.0:
-; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0
; AVX-NEXT: vmovd %xmm0, %eax
; AVX-NEXT: retq
define i16 @test_v2i16(<2 x i16> %a0) {
; SSE-LABEL: test_v2i16:
; SSE: # %bb.0:
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE-NEXT: movdqa %xmm0, %xmm1
+; SSE-NEXT: psrld $16, %xmm1
; SSE-NEXT: pxor %xmm0, %xmm1
; SSE-NEXT: movd %xmm1, %eax
; SSE-NEXT: # kill: def $ax killed $ax killed $eax
;
; AVX-LABEL: test_v2i16:
; AVX: # %bb.0:
-; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX-NEXT: vpsrld $16, %xmm0, %xmm1
; AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0
; AVX-NEXT: vmovd %xmm0, %eax
; AVX-NEXT: # kill: def $ax killed $ax killed $eax
define i16 @test_v4i16(<4 x i16> %a0) {
; SSE-LABEL: test_v4i16:
; SSE: # %bb.0:
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; SSE-NEXT: pxor %xmm0, %xmm1
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; SSE-NEXT: movdqa %xmm1, %xmm0
+; SSE-NEXT: psrld $16, %xmm0
; SSE-NEXT: pxor %xmm1, %xmm0
; SSE-NEXT: movd %xmm0, %eax
; SSE-NEXT: # kill: def $ax killed $ax killed $eax
;
; AVX-LABEL: test_v4i16:
; AVX: # %bb.0:
-; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vpsrld $16, %xmm0, %xmm1
+; AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0
; AVX-NEXT: vmovd %xmm0, %eax
; AVX-NEXT: # kill: def $ax killed $ax killed $eax
; AVX-NEXT: retq
define i8 @test_v2i8(<2 x i8> %a0) {
; SSE2-LABEL: test_v2i8:
; SSE2: # %bb.0:
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE2-NEXT: movdqa %xmm0, %xmm1
+; SSE2-NEXT: psrlw $8, %xmm1
; SSE2-NEXT: pxor %xmm0, %xmm1
; SSE2-NEXT: movd %xmm1, %eax
; SSE2-NEXT: # kill: def $al killed $al killed $eax
;
; SSE41-LABEL: test_v2i8:
; SSE41: # %bb.0:
-; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE41-NEXT: movdqa %xmm0, %xmm1
+; SSE41-NEXT: psrlw $8, %xmm1
; SSE41-NEXT: pxor %xmm0, %xmm1
; SSE41-NEXT: pextrb $0, %xmm1, %eax
; SSE41-NEXT: # kill: def $al killed $al killed $eax
;
; AVX-LABEL: test_v2i8:
; AVX: # %bb.0:
-; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX-NEXT: vpsrlw $8, %xmm0, %xmm1
; AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpextrb $0, %xmm0, %eax
; AVX-NEXT: # kill: def $al killed $al killed $eax
define i8 @test_v4i8(<4 x i8> %a0) {
; SSE2-LABEL: test_v4i8:
; SSE2: # %bb.0:
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE2-NEXT: movdqa %xmm0, %xmm1
+; SSE2-NEXT: psrld $16, %xmm1
; SSE2-NEXT: pxor %xmm0, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; SSE2-NEXT: movdqa %xmm1, %xmm0
+; SSE2-NEXT: psrlw $8, %xmm0
; SSE2-NEXT: pxor %xmm1, %xmm0
; SSE2-NEXT: movd %xmm0, %eax
; SSE2-NEXT: # kill: def $al killed $al killed $eax
;
; SSE41-LABEL: test_v4i8:
; SSE41: # %bb.0:
-; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE41-NEXT: movdqa %xmm0, %xmm1
+; SSE41-NEXT: psrld $16, %xmm1
; SSE41-NEXT: pxor %xmm0, %xmm1
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; SSE41-NEXT: movdqa %xmm1, %xmm0
+; SSE41-NEXT: psrlw $8, %xmm0
; SSE41-NEXT: pxor %xmm1, %xmm0
; SSE41-NEXT: pextrb $0, %xmm0, %eax
; SSE41-NEXT: # kill: def $al killed $al killed $eax
;
; AVX-LABEL: test_v4i8:
; AVX: # %bb.0:
-; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX-NEXT: vpsrld $16, %xmm0, %xmm1
; AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX-NEXT: vpsrlw $8, %xmm0, %xmm1
; AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpextrb $0, %xmm0, %eax
; AVX-NEXT: # kill: def $al killed $al killed $eax
define i8 @test_v8i8(<8 x i8> %a0) {
; SSE2-LABEL: test_v8i8:
; SSE2: # %bb.0:
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; SSE2-NEXT: pxor %xmm0, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; SSE2-NEXT: movdqa %xmm1, %xmm0
+; SSE2-NEXT: psrld $16, %xmm0
; SSE2-NEXT: pxor %xmm1, %xmm0
; SSE2-NEXT: movdqa %xmm0, %xmm1
-; SSE2-NEXT: psrld $16, %xmm1
+; SSE2-NEXT: psrlw $8, %xmm1
; SSE2-NEXT: pxor %xmm0, %xmm1
; SSE2-NEXT: movd %xmm1, %eax
; SSE2-NEXT: # kill: def $al killed $al killed $eax
;
; SSE41-LABEL: test_v8i8:
; SSE41: # %bb.0:
-; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; SSE41-NEXT: pxor %xmm0, %xmm1
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; SSE41-NEXT: movdqa %xmm1, %xmm0
+; SSE41-NEXT: psrld $16, %xmm0
; SSE41-NEXT: pxor %xmm1, %xmm0
; SSE41-NEXT: movdqa %xmm0, %xmm1
-; SSE41-NEXT: psrld $16, %xmm1
+; SSE41-NEXT: psrlw $8, %xmm1
; SSE41-NEXT: pxor %xmm0, %xmm1
; SSE41-NEXT: pextrb $0, %xmm1, %eax
; SSE41-NEXT: # kill: def $al killed $al killed $eax
;
; AVX-LABEL: test_v8i8:
; AVX: # %bb.0:
-; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpsrld $16, %xmm0, %xmm1
; AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vpsrlw $8, %xmm0, %xmm1
+; AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpextrb $0, %xmm0, %eax
; AVX-NEXT: # kill: def $al killed $al killed $eax
; AVX-NEXT: retq
;
; X32-SSE2-LABEL: sext_2i8_to_i32:
; X32-SSE2: # %bb.0: # %entry
-; X32-SSE2-NEXT: pushl %eax
-; X32-SSE2-NEXT: .cfi_def_cfa_offset 8
; X32-SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; X32-SSE2-NEXT: psraw $8, %xmm0
; X32-SSE2-NEXT: movd %xmm0, %eax
-; X32-SSE2-NEXT: popl %ecx
-; X32-SSE2-NEXT: .cfi_def_cfa_offset 4
; X32-SSE2-NEXT: retl
;
; X32-SSE41-LABEL: sext_2i8_to_i32:
; X32-SSE41: # %bb.0: # %entry
-; X32-SSE41-NEXT: pushl %eax
-; X32-SSE41-NEXT: .cfi_def_cfa_offset 8
; X32-SSE41-NEXT: pmovsxbw %xmm0, %xmm0
; X32-SSE41-NEXT: movd %xmm0, %eax
-; X32-SSE41-NEXT: popl %ecx
-; X32-SSE41-NEXT: .cfi_def_cfa_offset 4
; X32-SSE41-NEXT: retl
entry:
%Shuf = shufflevector <16 x i8> %A, <16 x i8> undef, <2 x i32> <i32 0, i32 1>
define <4 x i64> @sext_4i8_to_4i64(<4 x i8> %mask) {
; SSE2-LABEL: sext_4i8_to_4i64:
; SSE2: # %bb.0:
-; SSE2-NEXT: pslld $24, %xmm0
-; SSE2-NEXT: psrad $24, %xmm0
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; SSE2-NEXT: psrad $24, %xmm1
; SSE2-NEXT: pxor %xmm2, %xmm2
-; SSE2-NEXT: pxor %xmm3, %xmm3
-; SSE2-NEXT: pcmpgtd %xmm0, %xmm3
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
; SSE2-NEXT: pcmpgtd %xmm1, %xmm2
-; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
+; SSE2-NEXT: movdqa %xmm1, %xmm0
+; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; SSE2-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3]
; SSE2-NEXT: retq
;
; SSSE3-LABEL: sext_4i8_to_4i64:
; SSSE3: # %bb.0:
-; SSSE3-NEXT: pslld $24, %xmm0
-; SSSE3-NEXT: psrad $24, %xmm0
+; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; SSSE3-NEXT: psrad $24, %xmm1
; SSSE3-NEXT: pxor %xmm2, %xmm2
-; SSSE3-NEXT: pxor %xmm3, %xmm3
-; SSSE3-NEXT: pcmpgtd %xmm0, %xmm3
-; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
; SSSE3-NEXT: pcmpgtd %xmm1, %xmm2
-; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
+; SSSE3-NEXT: movdqa %xmm1, %xmm0
+; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; SSSE3-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3]
; SSSE3-NEXT: retq
;
; SSE41-LABEL: sext_4i8_to_4i64:
; SSE41: # %bb.0:
-; SSE41-NEXT: pslld $24, %xmm0
-; SSE41-NEXT: psrad $24, %xmm0
-; SSE41-NEXT: pmovsxdq %xmm0, %xmm2
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
-; SSE41-NEXT: pmovsxdq %xmm0, %xmm1
+; SSE41-NEXT: pmovsxbq %xmm0, %xmm2
+; SSE41-NEXT: psrld $16, %xmm0
+; SSE41-NEXT: pmovsxbq %xmm0, %xmm1
; SSE41-NEXT: movdqa %xmm2, %xmm0
; SSE41-NEXT: retq
;
; AVX1-LABEL: sext_4i8_to_4i64:
; AVX1: # %bb.0:
-; AVX1-NEXT: vpslld $24, %xmm0, %xmm0
-; AVX1-NEXT: vpsrad $24, %xmm0, %xmm0
-; AVX1-NEXT: vpmovsxdq %xmm0, %xmm1
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
-; AVX1-NEXT: vpmovsxdq %xmm0, %xmm0
+; AVX1-NEXT: vpmovsxbq %xmm0, %xmm1
+; AVX1-NEXT: vpsrld $16, %xmm0, %xmm0
+; AVX1-NEXT: vpmovsxbq %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: sext_4i8_to_4i64:
; AVX2: # %bb.0:
-; AVX2-NEXT: vpslld $24, %xmm0, %xmm0
-; AVX2-NEXT: vpsrad $24, %xmm0, %xmm0
-; AVX2-NEXT: vpmovsxdq %xmm0, %ymm0
+; AVX2-NEXT: vpmovsxbq %xmm0, %ymm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: sext_4i8_to_4i64:
; AVX512: # %bb.0:
-; AVX512-NEXT: vpslld $24, %xmm0, %xmm0
-; AVX512-NEXT: vpsrad $24, %xmm0, %xmm0
-; AVX512-NEXT: vpmovsxdq %xmm0, %ymm0
+; AVX512-NEXT: vpmovsxbq %xmm0, %ymm0
; AVX512-NEXT: retq
;
; X32-SSE2-LABEL: sext_4i8_to_4i64:
; X32-SSE2: # %bb.0:
-; X32-SSE2-NEXT: pslld $24, %xmm0
-; X32-SSE2-NEXT: psrad $24, %xmm0
+; X32-SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; X32-SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; X32-SSE2-NEXT: psrad $24, %xmm1
; X32-SSE2-NEXT: pxor %xmm2, %xmm2
-; X32-SSE2-NEXT: pxor %xmm3, %xmm3
-; X32-SSE2-NEXT: pcmpgtd %xmm0, %xmm3
-; X32-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X32-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
; X32-SSE2-NEXT: pcmpgtd %xmm1, %xmm2
-; X32-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
+; X32-SSE2-NEXT: movdqa %xmm1, %xmm0
+; X32-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; X32-SSE2-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3]
; X32-SSE2-NEXT: retl
;
; X32-SSE41-LABEL: sext_4i8_to_4i64:
; X32-SSE41: # %bb.0:
-; X32-SSE41-NEXT: pslld $24, %xmm0
-; X32-SSE41-NEXT: psrad $24, %xmm0
-; X32-SSE41-NEXT: pmovsxdq %xmm0, %xmm2
-; X32-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
-; X32-SSE41-NEXT: pmovsxdq %xmm0, %xmm1
+; X32-SSE41-NEXT: pmovsxbq %xmm0, %xmm2
+; X32-SSE41-NEXT: psrld $16, %xmm0
+; X32-SSE41-NEXT: pmovsxbq %xmm0, %xmm1
; X32-SSE41-NEXT: movdqa %xmm2, %xmm0
; X32-SSE41-NEXT: retl
%extmask = sext <4 x i8> %mask to <4 x i64>
; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
; SSE2-NEXT: psrad $24, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3]
-; SSE2-NEXT: paddq %xmm0, %xmm0
+; SSE2-NEXT: paddd %xmm0, %xmm0
; SSE2-NEXT: retq
;
; SSSE3-LABEL: sext_2i8_to_2i32:
; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
; SSSE3-NEXT: psrad $24, %xmm0
-; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3]
-; SSSE3-NEXT: paddq %xmm0, %xmm0
+; SSSE3-NEXT: paddd %xmm0, %xmm0
; SSSE3-NEXT: retq
;
; SSE41-LABEL: sext_2i8_to_2i32:
; SSE41: # %bb.0:
-; SSE41-NEXT: pmovsxbq (%rdi), %xmm0
-; SSE41-NEXT: paddq %xmm0, %xmm0
+; SSE41-NEXT: movzwl (%rdi), %eax
+; SSE41-NEXT: movd %eax, %xmm0
+; SSE41-NEXT: pmovsxbd %xmm0, %xmm0
+; SSE41-NEXT: paddd %xmm0, %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: sext_2i8_to_2i32:
; AVX: # %bb.0:
-; AVX-NEXT: vpmovsxbq (%rdi), %xmm0
-; AVX-NEXT: vpaddq %xmm0, %xmm0, %xmm0
+; AVX-NEXT: movzwl (%rdi), %eax
+; AVX-NEXT: vmovd %eax, %xmm0
+; AVX-NEXT: vpmovsxbd %xmm0, %xmm0
+; AVX-NEXT: vpaddd %xmm0, %xmm0, %xmm0
; AVX-NEXT: retq
;
; X32-SSE2-LABEL: sext_2i8_to_2i32:
; X32-SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; X32-SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
; X32-SSE2-NEXT: psrad $24, %xmm0
-; X32-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3]
-; X32-SSE2-NEXT: paddq %xmm0, %xmm0
+; X32-SSE2-NEXT: paddd %xmm0, %xmm0
; X32-SSE2-NEXT: retl
;
; X32-SSE41-LABEL: sext_2i8_to_2i32:
; X32-SSE41: # %bb.0:
; X32-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-SSE41-NEXT: pmovsxbq (%eax), %xmm0
-; X32-SSE41-NEXT: paddq %xmm0, %xmm0
+; X32-SSE41-NEXT: movzwl (%eax), %eax
+; X32-SSE41-NEXT: movd %eax, %xmm0
+; X32-SSE41-NEXT: pmovsxbd %xmm0, %xmm0
+; X32-SSE41-NEXT: paddd %xmm0, %xmm0
; X32-SSE41-NEXT: retl
%x = load <2 x i8>, <2 x i8>* %addr, align 1
%y = sext <2 x i8> %x to <2 x i32>
define <8 x i32> @zext_negate_sext(<8 x i8> %x) {
; SSE2-LABEL: zext_negate_sext:
; SSE2: # %bb.0:
-; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
; SSE2-NEXT: pxor %xmm1, %xmm1
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; SSE2-NEXT: psubw %xmm0, %xmm1
; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; SSE2-NEXT: psrad $16, %xmm0
;
; SSSE3-LABEL: zext_negate_sext:
; SSSE3: # %bb.0:
-; SSSE3-NEXT: pand {{.*}}(%rip), %xmm0
; SSSE3-NEXT: pxor %xmm1, %xmm1
+; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; SSSE3-NEXT: psubw %xmm0, %xmm1
; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; SSSE3-NEXT: psrad $16, %xmm0
;
; SSE41-LABEL: zext_negate_sext:
; SSE41: # %bb.0:
-; SSE41-NEXT: pand {{.*}}(%rip), %xmm0
+; SSE41-NEXT: pmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; SSE41-NEXT: pxor %xmm1, %xmm1
; SSE41-NEXT: psubw %xmm0, %xmm1
; SSE41-NEXT: pmovsxwd %xmm1, %xmm0
;
; AVX1-LABEL: zext_negate_sext:
; AVX1: # %bb.0:
-; AVX1-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
-; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
-; AVX1-NEXT: vpsubd %xmm2, %xmm1, %xmm2
-; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; AVX1-NEXT: vpsubd %xmm0, %xmm1, %xmm0
-; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
+; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX1-NEXT: vpsubd %xmm1, %xmm2, %xmm1
+; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; AVX1-NEXT: vpsubd %xmm0, %xmm2, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: zext_negate_sext:
; AVX2: # %bb.0:
-; AVX2-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
-; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX2-NEXT: vpsubd %ymm0, %ymm1, %ymm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: zext_negate_sext:
; AVX512: # %bb.0:
-; AVX512-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
-; AVX512-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX512-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
; AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512-NEXT: vpsubd %ymm0, %ymm1, %ymm0
; AVX512-NEXT: retq
;
; X32-SSE2-LABEL: zext_negate_sext:
; X32-SSE2: # %bb.0:
-; X32-SSE2-NEXT: pand {{\.LCPI.*}}, %xmm0
; X32-SSE2-NEXT: pxor %xmm1, %xmm1
+; X32-SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; X32-SSE2-NEXT: psubw %xmm0, %xmm1
; X32-SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; X32-SSE2-NEXT: psrad $16, %xmm0
;
; X32-SSE41-LABEL: zext_negate_sext:
; X32-SSE41: # %bb.0:
-; X32-SSE41-NEXT: pand {{\.LCPI.*}}, %xmm0
+; X32-SSE41-NEXT: pmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; X32-SSE41-NEXT: pxor %xmm1, %xmm1
; X32-SSE41-NEXT: psubw %xmm0, %xmm1
; X32-SSE41-NEXT: pmovsxwd %xmm1, %xmm0
define <8 x i32> @zext_decremenet_sext(<8 x i8> %x) {
; SSE2-LABEL: zext_decremenet_sext:
; SSE2: # %bb.0:
-; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
+; SSE2-NEXT: pxor %xmm1, %xmm1
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; SSE2-NEXT: pcmpeqd %xmm1, %xmm1
; SSE2-NEXT: paddw %xmm0, %xmm1
; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
;
; SSSE3-LABEL: zext_decremenet_sext:
; SSSE3: # %bb.0:
-; SSSE3-NEXT: pand {{.*}}(%rip), %xmm0
+; SSSE3-NEXT: pxor %xmm1, %xmm1
+; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; SSSE3-NEXT: pcmpeqd %xmm1, %xmm1
; SSSE3-NEXT: paddw %xmm0, %xmm1
; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
;
; SSE41-LABEL: zext_decremenet_sext:
; SSE41: # %bb.0:
-; SSE41-NEXT: pand {{.*}}(%rip), %xmm0
+; SSE41-NEXT: pmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; SSE41-NEXT: pcmpeqd %xmm1, %xmm1
; SSE41-NEXT: paddw %xmm0, %xmm1
; SSE41-NEXT: pmovsxwd %xmm1, %xmm0
;
; AVX1-LABEL: zext_decremenet_sext:
; AVX1: # %bb.0:
-; AVX1-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
-; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
; AVX1-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
; AVX1-NEXT: vpaddd %xmm2, %xmm1, %xmm1
-; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; AVX1-NEXT: vpaddd %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: zext_decremenet_sext:
; AVX2: # %bb.0:
-; AVX2-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
-; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
; AVX2-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: zext_decremenet_sext:
; AVX512: # %bb.0:
-; AVX512-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
-; AVX512-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX512-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
; AVX512-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
; AVX512-NEXT: vpaddd %ymm1, %ymm0, %ymm0
; AVX512-NEXT: retq
;
; X32-SSE2-LABEL: zext_decremenet_sext:
; X32-SSE2: # %bb.0:
-; X32-SSE2-NEXT: pand {{\.LCPI.*}}, %xmm0
+; X32-SSE2-NEXT: pxor %xmm1, %xmm1
+; X32-SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; X32-SSE2-NEXT: pcmpeqd %xmm1, %xmm1
; X32-SSE2-NEXT: paddw %xmm0, %xmm1
; X32-SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
;
; X32-SSE41-LABEL: zext_decremenet_sext:
; X32-SSE41: # %bb.0:
-; X32-SSE41-NEXT: pand {{\.LCPI.*}}, %xmm0
+; X32-SSE41-NEXT: pmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; X32-SSE41-NEXT: pcmpeqd %xmm1, %xmm1
; X32-SSE41-NEXT: paddw %xmm0, %xmm1
; X32-SSE41-NEXT: pmovsxwd %xmm1, %xmm0
define <2 x i32> @var_shift_v2i32(<2 x i32> %a, <2 x i32> %b) nounwind {
; SSE2-LABEL: var_shift_v2i32:
; SSE2: # %bb.0:
-; SSE2-NEXT: psllq $32, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,3,2,3]
-; SSE2-NEXT: psrad $31, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
-; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
-; SSE2-NEXT: pand {{.*}}(%rip), %xmm1
-; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [9223372036854775808,9223372036854775808]
-; SSE2-NEXT: movdqa %xmm0, %xmm3
-; SSE2-NEXT: psrlq %xmm1, %xmm3
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm1[2,3,0,1]
-; SSE2-NEXT: psrlq %xmm4, %xmm0
-; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm3[0],xmm0[1]
-; SSE2-NEXT: movdqa %xmm2, %xmm3
-; SSE2-NEXT: psrlq %xmm1, %xmm3
-; SSE2-NEXT: psrlq %xmm4, %xmm2
-; SSE2-NEXT: movsd {{.*#+}} xmm2 = xmm3[0],xmm2[1]
-; SSE2-NEXT: xorpd %xmm0, %xmm2
-; SSE2-NEXT: psubq %xmm0, %xmm2
-; SSE2-NEXT: movdqa %xmm2, %xmm0
-; SSE2-NEXT: retq
-;
-; SSE41-LABEL: var_shift_v2i32:
-; SSE41: # %bb.0:
-; SSE41-NEXT: movdqa %xmm0, %xmm2
-; SSE41-NEXT: psllq $32, %xmm2
-; SSE41-NEXT: psrad $31, %xmm2
-; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
-; SSE41-NEXT: pxor %xmm0, %xmm0
-; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3],xmm1[4,5],xmm0[6,7]
-; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; SSE41-NEXT: movdqa %xmm2, %xmm3
-; SSE41-NEXT: psrlq %xmm1, %xmm3
-; SSE41-NEXT: psrlq %xmm0, %xmm2
-; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm3[4,5,6,7]
-; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [9223372036854775808,9223372036854775808]
-; SSE41-NEXT: movdqa %xmm3, %xmm4
-; SSE41-NEXT: psrlq %xmm0, %xmm4
-; SSE41-NEXT: psrlq %xmm1, %xmm3
-; SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm4[0,1,2,3],xmm3[4,5,6,7]
-; SSE41-NEXT: pxor %xmm3, %xmm2
-; SSE41-NEXT: psubq %xmm3, %xmm2
-; SSE41-NEXT: movdqa %xmm2, %xmm0
-; SSE41-NEXT: retq
-;
-; AVX1-LABEL: var_shift_v2i32:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vpsllq $32, %xmm0, %xmm2
-; AVX1-NEXT: vpsrad $31, %xmm2, %xmm2
-; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
-; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[2,3,0,1]
-; AVX1-NEXT: vpsrlq %xmm2, %xmm0, %xmm3
-; AVX1-NEXT: vpsrlq %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm3[4,5,6,7]
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [9223372036854775808,9223372036854775808]
-; AVX1-NEXT: vpsrlq %xmm1, %xmm3, %xmm1
-; AVX1-NEXT: vpsrlq %xmm2, %xmm3, %xmm2
-; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4,5,6,7]
-; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpsubq %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: var_shift_v2i32:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
-; AVX2-NEXT: vpsllq $32, %xmm0, %xmm2
-; AVX2-NEXT: vpsrad $31, %xmm2, %xmm2
-; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
-; AVX2-NEXT: vpsrlvq %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
-; AVX2-NEXT: vpsrlvq %xmm1, %xmm2, %xmm1
-; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpsubq %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: retq
-;
-; XOPAVX1-LABEL: var_shift_v2i32:
-; XOPAVX1: # %bb.0:
-; XOPAVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; XOPAVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
-; XOPAVX1-NEXT: vpsllq $32, %xmm0, %xmm0
-; XOPAVX1-NEXT: vpshaq {{.*}}(%rip), %xmm0, %xmm0
-; XOPAVX1-NEXT: vpsubq %xmm1, %xmm2, %xmm1
-; XOPAVX1-NEXT: vpshaq %xmm1, %xmm0, %xmm0
-; XOPAVX1-NEXT: retq
-;
-; XOPAVX2-LABEL: var_shift_v2i32:
-; XOPAVX2: # %bb.0:
-; XOPAVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; XOPAVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
-; XOPAVX2-NEXT: vpsllq $32, %xmm0, %xmm0
-; XOPAVX2-NEXT: vpshaq {{.*}}(%rip), %xmm0, %xmm0
-; XOPAVX2-NEXT: vpsubq %xmm1, %xmm2, %xmm1
-; XOPAVX2-NEXT: vpshaq %xmm1, %xmm0, %xmm0
-; XOPAVX2-NEXT: retq
-;
-; AVX512-LABEL: var_shift_v2i32:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vpsllq $32, %xmm0, %xmm0
-; AVX512-NEXT: vpsraq $32, %zmm0, %zmm0
-; AVX512-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX512-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
-; AVX512-NEXT: vpsravq %zmm1, %zmm0, %zmm0
-; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
-; AVX512-NEXT: vzeroupper
-; AVX512-NEXT: retq
-;
-; AVX512VL-LABEL: var_shift_v2i32:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX512VL-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
-; AVX512VL-NEXT: vpsllq $32, %xmm0, %xmm0
-; AVX512VL-NEXT: vpsraq $32, %xmm0, %xmm0
-; AVX512VL-NEXT: vpsravq %xmm1, %xmm0, %xmm0
-; AVX512VL-NEXT: retq
-;
-; X32-SSE-LABEL: var_shift_v2i32:
-; X32-SSE: # %bb.0:
-; X32-SSE-NEXT: psllq $32, %xmm0
-; X32-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,3,2,3]
-; X32-SSE-NEXT: psrad $31, %xmm0
-; X32-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
-; X32-SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
-; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm1
-; X32-SSE-NEXT: movdqa {{.*#+}} xmm0 = [0,2147483648,0,2147483648]
-; X32-SSE-NEXT: movdqa %xmm0, %xmm3
-; X32-SSE-NEXT: psrlq %xmm1, %xmm3
-; X32-SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm1[2,3,0,1]
-; X32-SSE-NEXT: xorps %xmm5, %xmm5
-; X32-SSE-NEXT: movss {{.*#+}} xmm5 = xmm4[0],xmm5[1,2,3]
-; X32-SSE-NEXT: psrlq %xmm5, %xmm0
-; X32-SSE-NEXT: movsd {{.*#+}} xmm0 = xmm3[0],xmm0[1]
-; X32-SSE-NEXT: movdqa %xmm2, %xmm3
-; X32-SSE-NEXT: psrlq %xmm5, %xmm3
-; X32-SSE-NEXT: psrlq %xmm1, %xmm2
-; X32-SSE-NEXT: shufpd {{.*#+}} xmm2 = xmm2[0],xmm3[1]
-; X32-SSE-NEXT: xorpd %xmm0, %xmm2
-; X32-SSE-NEXT: psubq %xmm0, %xmm2
-; X32-SSE-NEXT: movdqa %xmm2, %xmm0
-; X32-SSE-NEXT: retl
- %shift = ashr <2 x i32> %a, %b
- ret <2 x i32> %shift
-}
-
-define <4 x i16> @var_shift_v4i16(<4 x i16> %a, <4 x i16> %b) nounwind {
-; SSE2-LABEL: var_shift_v4i16:
-; SSE2: # %bb.0:
-; SSE2-NEXT: pslld $16, %xmm0
-; SSE2-NEXT: psrad $16, %xmm0
-; SSE2-NEXT: pand {{.*}}(%rip), %xmm1
; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm1[2,3,3,3,4,5,6,7]
; SSE2-NEXT: movdqa %xmm0, %xmm3
; SSE2-NEXT: psrad %xmm2, %xmm3
; SSE2-NEXT: movaps %xmm2, %xmm0
; SSE2-NEXT: retq
;
-; SSE41-LABEL: var_shift_v4i16:
+; SSE41-LABEL: var_shift_v2i32:
; SSE41: # %bb.0:
-; SSE41-NEXT: pxor %xmm2, %xmm2
-; SSE41-NEXT: movdqa %xmm1, %xmm3
-; SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0],xmm2[1],xmm3[2],xmm2[3],xmm3[4],xmm2[5],xmm3[6],xmm2[7]
-; SSE41-NEXT: pslld $16, %xmm0
-; SSE41-NEXT: psrad $16, %xmm0
-; SSE41-NEXT: pshuflw {{.*#+}} xmm4 = xmm3[2,3,3,3,4,5,6,7]
+; SSE41-NEXT: pshuflw {{.*#+}} xmm2 = xmm1[2,3,3,3,4,5,6,7]
+; SSE41-NEXT: movdqa %xmm0, %xmm3
+; SSE41-NEXT: psrad %xmm2, %xmm3
+; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1]
+; SSE41-NEXT: pshuflw {{.*#+}} xmm4 = xmm2[2,3,3,3,4,5,6,7]
; SSE41-NEXT: movdqa %xmm0, %xmm5
; SSE41-NEXT: psrad %xmm4, %xmm5
-; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,3,0,1]
-; SSE41-NEXT: pshuflw {{.*#+}} xmm4 = xmm3[2,3,3,3,4,5,6,7]
-; SSE41-NEXT: movdqa %xmm0, %xmm6
-; SSE41-NEXT: psrad %xmm4, %xmm6
-; SSE41-NEXT: pblendw {{.*#+}} xmm6 = xmm5[0,1,2,3],xmm6[4,5,6,7]
-; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3],xmm1[4],xmm2[5],xmm1[6],xmm2[7]
-; SSE41-NEXT: movdqa %xmm0, %xmm2
-; SSE41-NEXT: psrad %xmm1, %xmm2
-; SSE41-NEXT: pshuflw {{.*#+}} xmm1 = xmm3[0,1,1,1,4,5,6,7]
+; SSE41-NEXT: pblendw {{.*#+}} xmm5 = xmm3[0,1,2,3],xmm5[4,5,6,7]
+; SSE41-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,1,1,4,5,6,7]
+; SSE41-NEXT: movdqa %xmm0, %xmm3
+; SSE41-NEXT: psrad %xmm1, %xmm3
+; SSE41-NEXT: pshuflw {{.*#+}} xmm1 = xmm2[0,1,1,1,4,5,6,7]
; SSE41-NEXT: psrad %xmm1, %xmm0
-; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm2[0,1,2,3],xmm0[4,5,6,7]
-; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm6[2,3],xmm0[4,5],xmm6[6,7]
+; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm3[0,1,2,3],xmm0[4,5,6,7]
+; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm5[2,3],xmm0[4,5],xmm5[6,7]
; SSE41-NEXT: retq
;
-; AVX1-LABEL: var_shift_v4i16:
+; AVX1-LABEL: var_shift_v2i32:
; AVX1: # %bb.0:
-; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4],xmm2[5],xmm1[6],xmm2[7]
-; AVX1-NEXT: vpslld $16, %xmm0, %xmm0
-; AVX1-NEXT: vpsrad $16, %xmm0, %xmm0
-; AVX1-NEXT: vpsrldq {{.*#+}} xmm3 = xmm1[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX1-NEXT: vpsrad %xmm3, %xmm0, %xmm3
-; AVX1-NEXT: vpsrlq $32, %xmm1, %xmm4
-; AVX1-NEXT: vpsrad %xmm4, %xmm0, %xmm4
-; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2,3],xmm3[4,5,6,7]
-; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm1[2],xmm2[2],xmm1[3],xmm2[3]
+; AVX1-NEXT: vpsrldq {{.*#+}} xmm2 = xmm1[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; AVX1-NEXT: vpsrad %xmm2, %xmm0, %xmm2
+; AVX1-NEXT: vpsrlq $32, %xmm1, %xmm3
+; AVX1-NEXT: vpsrad %xmm3, %xmm0, %xmm3
+; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7]
+; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm1[2],xmm3[2],xmm1[3],xmm3[3]
+; AVX1-NEXT: vpsrad %xmm3, %xmm0, %xmm3
; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
; AVX1-NEXT: vpsrad %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4,5,6,7]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm3[2,3],xmm0[4,5],xmm3[6,7]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm3[4,5,6,7]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
; AVX1-NEXT: retq
;
-; AVX2-LABEL: var_shift_v4i16:
+; AVX2-LABEL: var_shift_v2i32:
; AVX2: # %bb.0:
-; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4],xmm2[5],xmm1[6],xmm2[7]
-; AVX2-NEXT: vpslld $16, %xmm0, %xmm0
-; AVX2-NEXT: vpsrad $16, %xmm0, %xmm0
; AVX2-NEXT: vpsravd %xmm1, %xmm0, %xmm0
; AVX2-NEXT: retq
;
-; XOPAVX1-LABEL: var_shift_v4i16:
+; XOPAVX1-LABEL: var_shift_v2i32:
; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; XOPAVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4],xmm2[5],xmm1[6],xmm2[7]
-; XOPAVX1-NEXT: vpslld $16, %xmm0, %xmm0
-; XOPAVX1-NEXT: vpsrad $16, %xmm0, %xmm0
; XOPAVX1-NEXT: vpsubd %xmm1, %xmm2, %xmm1
; XOPAVX1-NEXT: vpshad %xmm1, %xmm0, %xmm0
; XOPAVX1-NEXT: retq
;
-; XOPAVX2-LABEL: var_shift_v4i16:
+; XOPAVX2-LABEL: var_shift_v2i32:
; XOPAVX2: # %bb.0:
-; XOPAVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; XOPAVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4],xmm2[5],xmm1[6],xmm2[7]
-; XOPAVX2-NEXT: vpslld $16, %xmm0, %xmm0
-; XOPAVX2-NEXT: vpsrad $16, %xmm0, %xmm0
; XOPAVX2-NEXT: vpsravd %xmm1, %xmm0, %xmm0
; XOPAVX2-NEXT: retq
;
-; AVX512-LABEL: var_shift_v4i16:
+; AVX512-LABEL: var_shift_v2i32:
; AVX512: # %bb.0:
-; AVX512-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX512-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4],xmm2[5],xmm1[6],xmm2[7]
-; AVX512-NEXT: vpslld $16, %xmm0, %xmm0
-; AVX512-NEXT: vpsrad $16, %xmm0, %xmm0
; AVX512-NEXT: vpsravd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: retq
;
-; AVX512VL-LABEL: var_shift_v4i16:
+; AVX512VL-LABEL: var_shift_v2i32:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX512VL-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4],xmm2[5],xmm1[6],xmm2[7]
-; AVX512VL-NEXT: vpslld $16, %xmm0, %xmm0
-; AVX512VL-NEXT: vpsrad $16, %xmm0, %xmm0
; AVX512VL-NEXT: vpsravd %xmm1, %xmm0, %xmm0
; AVX512VL-NEXT: retq
;
-; X32-SSE-LABEL: var_shift_v4i16:
+; X32-SSE-LABEL: var_shift_v2i32:
; X32-SSE: # %bb.0:
-; X32-SSE-NEXT: pslld $16, %xmm0
-; X32-SSE-NEXT: psrad $16, %xmm0
-; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm1
; X32-SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm1[2,3,3,3,4,5,6,7]
; X32-SSE-NEXT: movdqa %xmm0, %xmm3
; X32-SSE-NEXT: psrad %xmm2, %xmm3
; X32-SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,3],xmm0[0,3]
; X32-SSE-NEXT: movaps %xmm2, %xmm0
; X32-SSE-NEXT: retl
- %shift = ashr <4 x i16> %a, %b
- ret <4 x i16> %shift
-}
-
-define <2 x i16> @var_shift_v2i16(<2 x i16> %a, <2 x i16> %b) nounwind {
-; SSE2-LABEL: var_shift_v2i16:
-; SSE2: # %bb.0:
-; SSE2-NEXT: psllq $48, %xmm0
-; SSE2-NEXT: movdqa %xmm0, %xmm2
-; SSE2-NEXT: psrad $31, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3]
-; SSE2-NEXT: psrad $16, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
-; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
-; SSE2-NEXT: pand {{.*}}(%rip), %xmm1
-; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
-; SSE2-NEXT: movdqa %xmm2, %xmm3
-; SSE2-NEXT: psrlq %xmm1, %xmm3
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm1[2,3,0,1]
-; SSE2-NEXT: psrlq %xmm4, %xmm2
-; SSE2-NEXT: movsd {{.*#+}} xmm2 = xmm3[0],xmm2[1]
-; SSE2-NEXT: movdqa %xmm0, %xmm3
-; SSE2-NEXT: psrlq %xmm1, %xmm3
-; SSE2-NEXT: psrlq %xmm4, %xmm0
-; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm3[0],xmm0[1]
-; SSE2-NEXT: xorpd %xmm2, %xmm0
-; SSE2-NEXT: psubq %xmm2, %xmm0
-; SSE2-NEXT: retq
-;
-; SSE41-LABEL: var_shift_v2i16:
-; SSE41: # %bb.0:
-; SSE41-NEXT: psllq $48, %xmm0
-; SSE41-NEXT: movdqa %xmm0, %xmm2
-; SSE41-NEXT: psrad $31, %xmm2
-; SSE41-NEXT: psrad $16, %xmm0
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
-; SSE41-NEXT: pxor %xmm2, %xmm2
-; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3],xmm1[4],xmm2[5,6,7]
-; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm2[2,3,0,1]
-; SSE41-NEXT: movdqa %xmm0, %xmm3
-; SSE41-NEXT: psrlq %xmm1, %xmm3
-; SSE41-NEXT: psrlq %xmm2, %xmm0
-; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm3[4,5,6,7]
-; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [9223372036854775808,9223372036854775808]
-; SSE41-NEXT: movdqa %xmm3, %xmm4
-; SSE41-NEXT: psrlq %xmm2, %xmm4
-; SSE41-NEXT: psrlq %xmm1, %xmm3
-; SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm4[0,1,2,3],xmm3[4,5,6,7]
-; SSE41-NEXT: pxor %xmm3, %xmm0
-; SSE41-NEXT: psubq %xmm3, %xmm0
-; SSE41-NEXT: retq
-;
-; AVX1-LABEL: var_shift_v2i16:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vpsllq $48, %xmm0, %xmm0
-; AVX1-NEXT: vpsrad $31, %xmm0, %xmm2
-; AVX1-NEXT: vpsrad $16, %xmm0, %xmm0
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
-; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3],xmm1[4],xmm2[5,6,7]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[2,3,0,1]
-; AVX1-NEXT: vpsrlq %xmm2, %xmm0, %xmm3
-; AVX1-NEXT: vpsrlq %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm3[4,5,6,7]
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [9223372036854775808,9223372036854775808]
-; AVX1-NEXT: vpsrlq %xmm1, %xmm3, %xmm1
-; AVX1-NEXT: vpsrlq %xmm2, %xmm3, %xmm2
-; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4,5,6,7]
-; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpsubq %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: var_shift_v2i16:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3],xmm1[4],xmm2[5,6,7]
-; AVX2-NEXT: vpsllq $48, %xmm0, %xmm0
-; AVX2-NEXT: vpsrad $31, %xmm0, %xmm2
-; AVX2-NEXT: vpsrad $16, %xmm0, %xmm0
-; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
-; AVX2-NEXT: vpsrlvq %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
-; AVX2-NEXT: vpsrlvq %xmm1, %xmm2, %xmm1
-; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpsubq %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: retq
-;
-; XOP-LABEL: var_shift_v2i16:
-; XOP: # %bb.0:
-; XOP-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; XOP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3],xmm1[4],xmm2[5,6,7]
-; XOP-NEXT: vpsllq $48, %xmm0, %xmm0
-; XOP-NEXT: vpshaq {{.*}}(%rip), %xmm0, %xmm0
-; XOP-NEXT: vpsubq %xmm1, %xmm2, %xmm1
-; XOP-NEXT: vpshaq %xmm1, %xmm0, %xmm0
-; XOP-NEXT: retq
-;
-; AVX512-LABEL: var_shift_v2i16:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vpsllq $48, %xmm0, %xmm0
-; AVX512-NEXT: vpsraq $48, %zmm0, %zmm0
-; AVX512-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX512-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3],xmm1[4],xmm2[5,6,7]
-; AVX512-NEXT: vpsravq %zmm1, %zmm0, %zmm0
-; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
-; AVX512-NEXT: vzeroupper
-; AVX512-NEXT: retq
-;
-; AVX512VL-LABEL: var_shift_v2i16:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX512VL-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3],xmm1[4],xmm2[5,6,7]
-; AVX512VL-NEXT: vpsllq $48, %xmm0, %xmm0
-; AVX512VL-NEXT: vpsraq $48, %xmm0, %xmm0
-; AVX512VL-NEXT: vpsravq %xmm1, %xmm0, %xmm0
-; AVX512VL-NEXT: retq
-;
-; X32-SSE-LABEL: var_shift_v2i16:
-; X32-SSE: # %bb.0:
-; X32-SSE-NEXT: psllq $48, %xmm0
-; X32-SSE-NEXT: movdqa %xmm0, %xmm2
-; X32-SSE-NEXT: psrad $31, %xmm2
-; X32-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3]
-; X32-SSE-NEXT: psrad $16, %xmm0
-; X32-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
-; X32-SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
-; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm1
-; X32-SSE-NEXT: movdqa {{.*#+}} xmm2 = [0,2147483648,0,2147483648]
-; X32-SSE-NEXT: movdqa %xmm2, %xmm3
-; X32-SSE-NEXT: psrlq %xmm1, %xmm3
-; X32-SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm1[2,3,0,1]
-; X32-SSE-NEXT: psrlq %xmm4, %xmm2
-; X32-SSE-NEXT: movsd {{.*#+}} xmm2 = xmm3[0],xmm2[1]
-; X32-SSE-NEXT: movdqa %xmm0, %xmm3
-; X32-SSE-NEXT: psrlq %xmm1, %xmm3
-; X32-SSE-NEXT: psrlq %xmm4, %xmm0
-; X32-SSE-NEXT: movsd {{.*#+}} xmm0 = xmm3[0],xmm0[1]
-; X32-SSE-NEXT: xorpd %xmm2, %xmm0
-; X32-SSE-NEXT: psubq %xmm2, %xmm0
-; X32-SSE-NEXT: retl
- %shift = ashr <2 x i16> %a, %b
- ret <2 x i16> %shift
+ %shift = ashr <2 x i32> %a, %b
+ ret <2 x i32> %shift
}
-define <8 x i8> @var_shift_v8i8(<8 x i8> %a, <8 x i8> %b) nounwind {
-; SSE2-LABEL: var_shift_v8i8:
+define <4 x i16> @var_shift_v4i16(<4 x i16> %a, <4 x i16> %b) nounwind {
+; SSE2-LABEL: var_shift_v4i16:
; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa %xmm0, %xmm2
-; SSE2-NEXT: psllw $8, %xmm2
-; SSE2-NEXT: movdqa %xmm2, %xmm3
-; SSE2-NEXT: psraw $8, %xmm3
; SSE2-NEXT: psllw $12, %xmm1
-; SSE2-NEXT: movdqa %xmm1, %xmm0
-; SSE2-NEXT: psraw $15, %xmm0
+; SSE2-NEXT: movdqa %xmm1, %xmm2
; SSE2-NEXT: psraw $15, %xmm2
-; SSE2-NEXT: pand %xmm0, %xmm2
-; SSE2-NEXT: pandn %xmm3, %xmm0
-; SSE2-NEXT: por %xmm2, %xmm0
+; SSE2-NEXT: movdqa %xmm2, %xmm3
+; SSE2-NEXT: pandn %xmm0, %xmm3
+; SSE2-NEXT: psraw $8, %xmm0
+; SSE2-NEXT: pand %xmm2, %xmm0
+; SSE2-NEXT: por %xmm3, %xmm0
; SSE2-NEXT: paddw %xmm1, %xmm1
; SSE2-NEXT: movdqa %xmm1, %xmm2
; SSE2-NEXT: psraw $15, %xmm2
; SSE2-NEXT: por %xmm2, %xmm0
; SSE2-NEXT: retq
;
-; SSE41-LABEL: var_shift_v8i8:
+; SSE41-LABEL: var_shift_v4i16:
; SSE41: # %bb.0:
; SSE41-NEXT: movdqa %xmm1, %xmm2
-; SSE41-NEXT: movdqa %xmm0, %xmm3
-; SSE41-NEXT: psllw $8, %xmm3
-; SSE41-NEXT: movdqa %xmm3, %xmm1
-; SSE41-NEXT: psraw $8, %xmm1
-; SSE41-NEXT: pand {{.*}}(%rip), %xmm2
+; SSE41-NEXT: movdqa %xmm0, %xmm1
; SSE41-NEXT: movdqa %xmm2, %xmm0
; SSE41-NEXT: psllw $12, %xmm0
; SSE41-NEXT: psllw $4, %xmm2
; SSE41-NEXT: por %xmm0, %xmm2
-; SSE41-NEXT: movdqa %xmm2, %xmm4
-; SSE41-NEXT: paddw %xmm2, %xmm4
-; SSE41-NEXT: psraw $15, %xmm3
+; SSE41-NEXT: movdqa %xmm2, %xmm3
+; SSE41-NEXT: paddw %xmm2, %xmm3
+; SSE41-NEXT: movdqa %xmm1, %xmm4
+; SSE41-NEXT: psraw $8, %xmm4
; SSE41-NEXT: movdqa %xmm2, %xmm0
-; SSE41-NEXT: pblendvb %xmm0, %xmm3, %xmm1
+; SSE41-NEXT: pblendvb %xmm0, %xmm4, %xmm1
; SSE41-NEXT: movdqa %xmm1, %xmm2
; SSE41-NEXT: psraw $4, %xmm2
-; SSE41-NEXT: movdqa %xmm4, %xmm0
+; SSE41-NEXT: movdqa %xmm3, %xmm0
; SSE41-NEXT: pblendvb %xmm0, %xmm2, %xmm1
; SSE41-NEXT: movdqa %xmm1, %xmm2
; SSE41-NEXT: psraw $2, %xmm2
-; SSE41-NEXT: paddw %xmm4, %xmm4
-; SSE41-NEXT: movdqa %xmm4, %xmm0
+; SSE41-NEXT: paddw %xmm3, %xmm3
+; SSE41-NEXT: movdqa %xmm3, %xmm0
; SSE41-NEXT: pblendvb %xmm0, %xmm2, %xmm1
; SSE41-NEXT: movdqa %xmm1, %xmm2
; SSE41-NEXT: psraw $1, %xmm2
-; SSE41-NEXT: paddw %xmm4, %xmm4
-; SSE41-NEXT: movdqa %xmm4, %xmm0
+; SSE41-NEXT: paddw %xmm3, %xmm3
+; SSE41-NEXT: movdqa %xmm3, %xmm0
; SSE41-NEXT: pblendvb %xmm0, %xmm2, %xmm1
; SSE41-NEXT: movdqa %xmm1, %xmm0
; SSE41-NEXT: retq
;
-; AVX1-LABEL: var_shift_v8i8:
+; AVX1-LABEL: var_shift_v4i16:
; AVX1: # %bb.0:
-; AVX1-NEXT: vpsllw $8, %xmm0, %xmm0
-; AVX1-NEXT: vpsraw $8, %xmm0, %xmm2
-; AVX1-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
-; AVX1-NEXT: vpsllw $12, %xmm1, %xmm3
+; AVX1-NEXT: vpsllw $12, %xmm1, %xmm2
; AVX1-NEXT: vpsllw $4, %xmm1, %xmm1
-; AVX1-NEXT: vpor %xmm3, %xmm1, %xmm1
-; AVX1-NEXT: vpaddw %xmm1, %xmm1, %xmm3
-; AVX1-NEXT: vpsraw $15, %xmm0, %xmm0
-; AVX1-NEXT: vpblendvb %xmm1, %xmm0, %xmm2, %xmm0
+; AVX1-NEXT: vpor %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpaddw %xmm1, %xmm1, %xmm2
+; AVX1-NEXT: vpsraw $8, %xmm0, %xmm3
+; AVX1-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0
; AVX1-NEXT: vpsraw $4, %xmm0, %xmm1
-; AVX1-NEXT: vpblendvb %xmm3, %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpsraw $2, %xmm0, %xmm1
-; AVX1-NEXT: vpaddw %xmm3, %xmm3, %xmm2
+; AVX1-NEXT: vpaddw %xmm2, %xmm2, %xmm2
; AVX1-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpsraw $1, %xmm0, %xmm1
; AVX1-NEXT: vpaddw %xmm2, %xmm2, %xmm2
; AVX1-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
; AVX1-NEXT: retq
;
-; AVX2-LABEL: var_shift_v8i8:
+; AVX2-LABEL: var_shift_v4i16:
; AVX2: # %bb.0:
-; AVX2-NEXT: vpsllw $8, %xmm0, %xmm0
-; AVX2-NEXT: vpsraw $8, %xmm0, %xmm0
-; AVX2-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
; AVX2-NEXT: vpmovsxwd %xmm0, %ymm0
; AVX2-NEXT: vpsravd %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
-; XOP-LABEL: var_shift_v8i8:
+; XOP-LABEL: var_shift_v4i16:
; XOP: # %bb.0:
-; XOP-NEXT: vpsllw $8, %xmm0, %xmm0
-; XOP-NEXT: vpsraw $8, %xmm0, %xmm0
-; XOP-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
; XOP-NEXT: vpxor %xmm2, %xmm2, %xmm2
; XOP-NEXT: vpsubw %xmm1, %xmm2, %xmm1
; XOP-NEXT: vpshaw %xmm1, %xmm0, %xmm0
; XOP-NEXT: retq
;
-; AVX512DQ-LABEL: var_shift_v8i8:
+; AVX512DQ-LABEL: var_shift_v4i16:
; AVX512DQ: # %bb.0:
-; AVX512DQ-NEXT: vpsllw $8, %xmm0, %xmm0
-; AVX512DQ-NEXT: vpsraw $8, %xmm0, %xmm0
-; AVX512DQ-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
; AVX512DQ-NEXT: vpmovsxwd %xmm0, %ymm0
; AVX512DQ-NEXT: vpsravd %ymm1, %ymm0, %ymm0
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
-; AVX512BW-LABEL: var_shift_v8i8:
+; AVX512BW-LABEL: var_shift_v4i16:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vpsllw $8, %xmm0, %xmm0
-; AVX512BW-NEXT: vpsraw $8, %xmm0, %xmm0
-; AVX512BW-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
+; AVX512BW-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
+; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512BW-NEXT: vpsravw %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
-; AVX512DQVL-LABEL: var_shift_v8i8:
+; AVX512DQVL-LABEL: var_shift_v4i16:
; AVX512DQVL: # %bb.0:
-; AVX512DQVL-NEXT: vpsllw $8, %xmm0, %xmm0
-; AVX512DQVL-NEXT: vpsraw $8, %xmm0, %xmm0
-; AVX512DQVL-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
; AVX512DQVL-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
; AVX512DQVL-NEXT: vpmovsxwd %xmm0, %ymm0
; AVX512DQVL-NEXT: vpsravd %ymm1, %ymm0, %ymm0
; AVX512DQVL-NEXT: vzeroupper
; AVX512DQVL-NEXT: retq
;
-; AVX512BWVL-LABEL: var_shift_v8i8:
+; AVX512BWVL-LABEL: var_shift_v4i16:
; AVX512BWVL: # %bb.0:
-; AVX512BWVL-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
-; AVX512BWVL-NEXT: vpsllw $8, %xmm0, %xmm0
-; AVX512BWVL-NEXT: vpsraw $8, %xmm0, %xmm0
; AVX512BWVL-NEXT: vpsravw %xmm1, %xmm0, %xmm0
; AVX512BWVL-NEXT: retq
;
-; X32-SSE-LABEL: var_shift_v8i8:
+; X32-SSE-LABEL: var_shift_v4i16:
; X32-SSE: # %bb.0:
-; X32-SSE-NEXT: movdqa %xmm0, %xmm2
-; X32-SSE-NEXT: psllw $8, %xmm2
-; X32-SSE-NEXT: movdqa %xmm2, %xmm3
-; X32-SSE-NEXT: psraw $8, %xmm3
; X32-SSE-NEXT: psllw $12, %xmm1
-; X32-SSE-NEXT: movdqa %xmm1, %xmm0
-; X32-SSE-NEXT: psraw $15, %xmm0
+; X32-SSE-NEXT: movdqa %xmm1, %xmm2
; X32-SSE-NEXT: psraw $15, %xmm2
-; X32-SSE-NEXT: pand %xmm0, %xmm2
-; X32-SSE-NEXT: pandn %xmm3, %xmm0
-; X32-SSE-NEXT: por %xmm2, %xmm0
+; X32-SSE-NEXT: movdqa %xmm2, %xmm3
+; X32-SSE-NEXT: pandn %xmm0, %xmm3
+; X32-SSE-NEXT: psraw $8, %xmm0
+; X32-SSE-NEXT: pand %xmm2, %xmm0
+; X32-SSE-NEXT: por %xmm3, %xmm0
; X32-SSE-NEXT: paddw %xmm1, %xmm1
; X32-SSE-NEXT: movdqa %xmm1, %xmm2
; X32-SSE-NEXT: psraw $15, %xmm2
; X32-SSE-NEXT: pand %xmm1, %xmm0
; X32-SSE-NEXT: por %xmm2, %xmm0
; X32-SSE-NEXT: retl
- %shift = ashr <8 x i8> %a, %b
- ret <8 x i8> %shift
+ %shift = ashr <4 x i16> %a, %b
+ ret <4 x i16> %shift
}
-define <4 x i8> @var_shift_v4i8(<4 x i8> %a, <4 x i8> %b) nounwind {
-; SSE2-LABEL: var_shift_v4i8:
+define <2 x i16> @var_shift_v2i16(<2 x i16> %a, <2 x i16> %b) nounwind {
+; SSE2-LABEL: var_shift_v2i16:
; SSE2: # %bb.0:
-; SSE2-NEXT: pslld $24, %xmm0
-; SSE2-NEXT: psrad $24, %xmm0
-; SSE2-NEXT: pand {{.*}}(%rip), %xmm1
-; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm1[2,3,3,3,4,5,6,7]
-; SSE2-NEXT: movdqa %xmm0, %xmm3
-; SSE2-NEXT: psrad %xmm2, %xmm3
-; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm1[0,1,1,1,4,5,6,7]
-; SSE2-NEXT: movdqa %xmm0, %xmm2
-; SSE2-NEXT: psrad %xmm4, %xmm2
-; SSE2-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
-; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm1[2,3,3,3,4,5,6,7]
-; SSE2-NEXT: movdqa %xmm0, %xmm4
-; SSE2-NEXT: psrad %xmm3, %xmm4
-; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,1,1,4,5,6,7]
-; SSE2-NEXT: psrad %xmm1, %xmm0
-; SSE2-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm4[1]
-; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,3],xmm0[0,3]
-; SSE2-NEXT: movaps %xmm2, %xmm0
+; SSE2-NEXT: psllw $12, %xmm1
+; SSE2-NEXT: movdqa %xmm1, %xmm2
+; SSE2-NEXT: psraw $15, %xmm2
+; SSE2-NEXT: movdqa %xmm2, %xmm3
+; SSE2-NEXT: pandn %xmm0, %xmm3
+; SSE2-NEXT: psraw $8, %xmm0
+; SSE2-NEXT: pand %xmm2, %xmm0
+; SSE2-NEXT: por %xmm3, %xmm0
+; SSE2-NEXT: paddw %xmm1, %xmm1
+; SSE2-NEXT: movdqa %xmm1, %xmm2
+; SSE2-NEXT: psraw $15, %xmm2
+; SSE2-NEXT: movdqa %xmm2, %xmm3
+; SSE2-NEXT: pandn %xmm0, %xmm3
+; SSE2-NEXT: psraw $4, %xmm0
+; SSE2-NEXT: pand %xmm2, %xmm0
+; SSE2-NEXT: por %xmm3, %xmm0
+; SSE2-NEXT: paddw %xmm1, %xmm1
+; SSE2-NEXT: movdqa %xmm1, %xmm2
+; SSE2-NEXT: psraw $15, %xmm2
+; SSE2-NEXT: movdqa %xmm2, %xmm3
+; SSE2-NEXT: pandn %xmm0, %xmm3
+; SSE2-NEXT: psraw $2, %xmm0
+; SSE2-NEXT: pand %xmm2, %xmm0
+; SSE2-NEXT: por %xmm3, %xmm0
+; SSE2-NEXT: paddw %xmm1, %xmm1
+; SSE2-NEXT: psraw $15, %xmm1
+; SSE2-NEXT: movdqa %xmm1, %xmm2
+; SSE2-NEXT: pandn %xmm0, %xmm2
+; SSE2-NEXT: psraw $1, %xmm0
+; SSE2-NEXT: pand %xmm1, %xmm0
+; SSE2-NEXT: por %xmm2, %xmm0
; SSE2-NEXT: retq
;
-; SSE41-LABEL: var_shift_v4i8:
+; SSE41-LABEL: var_shift_v2i16:
; SSE41: # %bb.0:
-; SSE41-NEXT: pslld $24, %xmm0
-; SSE41-NEXT: psrad $24, %xmm0
-; SSE41-NEXT: pand {{.*}}(%rip), %xmm1
-; SSE41-NEXT: pshuflw {{.*#+}} xmm2 = xmm1[2,3,3,3,4,5,6,7]
-; SSE41-NEXT: movdqa %xmm0, %xmm3
-; SSE41-NEXT: psrad %xmm2, %xmm3
-; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1]
-; SSE41-NEXT: pshuflw {{.*#+}} xmm4 = xmm2[2,3,3,3,4,5,6,7]
-; SSE41-NEXT: movdqa %xmm0, %xmm5
-; SSE41-NEXT: psrad %xmm4, %xmm5
-; SSE41-NEXT: pblendw {{.*#+}} xmm5 = xmm3[0,1,2,3],xmm5[4,5,6,7]
-; SSE41-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,1,1,4,5,6,7]
-; SSE41-NEXT: movdqa %xmm0, %xmm3
-; SSE41-NEXT: psrad %xmm1, %xmm3
-; SSE41-NEXT: pshuflw {{.*#+}} xmm1 = xmm2[0,1,1,1,4,5,6,7]
-; SSE41-NEXT: psrad %xmm1, %xmm0
-; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm3[0,1,2,3],xmm0[4,5,6,7]
-; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm5[2,3],xmm0[4,5],xmm5[6,7]
-; SSE41-NEXT: retq
-;
-; AVX1-LABEL: var_shift_v4i8:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vpslld $24, %xmm0, %xmm0
-; AVX1-NEXT: vpsrad $24, %xmm0, %xmm0
-; AVX1-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
-; AVX1-NEXT: vpsrldq {{.*#+}} xmm2 = xmm1[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX1-NEXT: vpsrad %xmm2, %xmm0, %xmm2
-; AVX1-NEXT: vpsrlq $32, %xmm1, %xmm3
-; AVX1-NEXT: vpsrad %xmm3, %xmm0, %xmm3
-; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7]
-; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm1[2],xmm3[2],xmm1[3],xmm3[3]
-; AVX1-NEXT: vpsrad %xmm3, %xmm0, %xmm3
-; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
-; AVX1-NEXT: vpsrad %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm3[4,5,6,7]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
+; SSE41-NEXT: movdqa %xmm1, %xmm2
+; SSE41-NEXT: movdqa %xmm0, %xmm1
+; SSE41-NEXT: movdqa %xmm2, %xmm0
+; SSE41-NEXT: psllw $12, %xmm0
+; SSE41-NEXT: psllw $4, %xmm2
+; SSE41-NEXT: por %xmm0, %xmm2
+; SSE41-NEXT: movdqa %xmm2, %xmm3
+; SSE41-NEXT: paddw %xmm2, %xmm3
+; SSE41-NEXT: movdqa %xmm1, %xmm4
+; SSE41-NEXT: psraw $8, %xmm4
+; SSE41-NEXT: movdqa %xmm2, %xmm0
+; SSE41-NEXT: pblendvb %xmm0, %xmm4, %xmm1
+; SSE41-NEXT: movdqa %xmm1, %xmm2
+; SSE41-NEXT: psraw $4, %xmm2
+; SSE41-NEXT: movdqa %xmm3, %xmm0
+; SSE41-NEXT: pblendvb %xmm0, %xmm2, %xmm1
+; SSE41-NEXT: movdqa %xmm1, %xmm2
+; SSE41-NEXT: psraw $2, %xmm2
+; SSE41-NEXT: paddw %xmm3, %xmm3
+; SSE41-NEXT: movdqa %xmm3, %xmm0
+; SSE41-NEXT: pblendvb %xmm0, %xmm2, %xmm1
+; SSE41-NEXT: movdqa %xmm1, %xmm2
+; SSE41-NEXT: psraw $1, %xmm2
+; SSE41-NEXT: paddw %xmm3, %xmm3
+; SSE41-NEXT: movdqa %xmm3, %xmm0
+; SSE41-NEXT: pblendvb %xmm0, %xmm2, %xmm1
+; SSE41-NEXT: movdqa %xmm1, %xmm0
+; SSE41-NEXT: retq
+;
+; AVX1-LABEL: var_shift_v2i16:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vpsllw $12, %xmm1, %xmm2
+; AVX1-NEXT: vpsllw $4, %xmm1, %xmm1
+; AVX1-NEXT: vpor %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpaddw %xmm1, %xmm1, %xmm2
+; AVX1-NEXT: vpsraw $8, %xmm0, %xmm3
+; AVX1-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0
+; AVX1-NEXT: vpsraw $4, %xmm0, %xmm1
+; AVX1-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpsraw $2, %xmm0, %xmm1
+; AVX1-NEXT: vpaddw %xmm2, %xmm2, %xmm2
+; AVX1-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpsraw $1, %xmm0, %xmm1
+; AVX1-NEXT: vpaddw %xmm2, %xmm2, %xmm2
+; AVX1-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
; AVX1-NEXT: retq
;
-; AVX2-LABEL: var_shift_v4i8:
+; AVX2-LABEL: var_shift_v2i16:
; AVX2: # %bb.0:
-; AVX2-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
-; AVX2-NEXT: vpslld $24, %xmm0, %xmm0
-; AVX2-NEXT: vpsrad $24, %xmm0, %xmm0
-; AVX2-NEXT: vpsravd %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
+; AVX2-NEXT: vpmovsxwd %xmm0, %ymm0
+; AVX2-NEXT: vpsravd %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT: vpackssdw %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
-; XOPAVX1-LABEL: var_shift_v4i8:
-; XOPAVX1: # %bb.0:
-; XOPAVX1-NEXT: vpslld $24, %xmm0, %xmm0
-; XOPAVX1-NEXT: vpsrad $24, %xmm0, %xmm0
-; XOPAVX1-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
-; XOPAVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; XOPAVX1-NEXT: vpsubd %xmm1, %xmm2, %xmm1
-; XOPAVX1-NEXT: vpshad %xmm1, %xmm0, %xmm0
-; XOPAVX1-NEXT: retq
+; XOP-LABEL: var_shift_v2i16:
+; XOP: # %bb.0:
+; XOP-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; XOP-NEXT: vpsubw %xmm1, %xmm2, %xmm1
+; XOP-NEXT: vpshaw %xmm1, %xmm0, %xmm0
+; XOP-NEXT: retq
;
-; XOPAVX2-LABEL: var_shift_v4i8:
-; XOPAVX2: # %bb.0:
-; XOPAVX2-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
-; XOPAVX2-NEXT: vpslld $24, %xmm0, %xmm0
-; XOPAVX2-NEXT: vpsrad $24, %xmm0, %xmm0
-; XOPAVX2-NEXT: vpsravd %xmm1, %xmm0, %xmm0
-; XOPAVX2-NEXT: retq
+; AVX512DQ-LABEL: var_shift_v2i16:
+; AVX512DQ: # %bb.0:
+; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
+; AVX512DQ-NEXT: vpmovsxwd %xmm0, %ymm0
+; AVX512DQ-NEXT: vpsravd %ymm1, %ymm0, %ymm0
+; AVX512DQ-NEXT: vpmovdw %zmm0, %ymm0
+; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX512DQ-NEXT: vzeroupper
+; AVX512DQ-NEXT: retq
;
-; AVX512-LABEL: var_shift_v4i8:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
-; AVX512-NEXT: vpslld $24, %xmm0, %xmm0
-; AVX512-NEXT: vpsrad $24, %xmm0, %xmm0
-; AVX512-NEXT: vpsravd %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: retq
+; AVX512BW-LABEL: var_shift_v2i16:
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
+; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
+; AVX512BW-NEXT: vpsravw %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
+; AVX512BW-NEXT: vzeroupper
+; AVX512BW-NEXT: retq
;
-; AVX512VL-LABEL: var_shift_v4i8:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
-; AVX512VL-NEXT: vpslld $24, %xmm0, %xmm0
-; AVX512VL-NEXT: vpsrad $24, %xmm0, %xmm0
-; AVX512VL-NEXT: vpsravd %xmm1, %xmm0, %xmm0
-; AVX512VL-NEXT: retq
+; AVX512DQVL-LABEL: var_shift_v2i16:
+; AVX512DQVL: # %bb.0:
+; AVX512DQVL-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
+; AVX512DQVL-NEXT: vpmovsxwd %xmm0, %ymm0
+; AVX512DQVL-NEXT: vpsravd %ymm1, %ymm0, %ymm0
+; AVX512DQVL-NEXT: vpmovdw %ymm0, %xmm0
+; AVX512DQVL-NEXT: vzeroupper
+; AVX512DQVL-NEXT: retq
+;
+; AVX512BWVL-LABEL: var_shift_v2i16:
+; AVX512BWVL: # %bb.0:
+; AVX512BWVL-NEXT: vpsravw %xmm1, %xmm0, %xmm0
+; AVX512BWVL-NEXT: retq
+;
+; X32-SSE-LABEL: var_shift_v2i16:
+; X32-SSE: # %bb.0:
+; X32-SSE-NEXT: psllw $12, %xmm1
+; X32-SSE-NEXT: movdqa %xmm1, %xmm2
+; X32-SSE-NEXT: psraw $15, %xmm2
+; X32-SSE-NEXT: movdqa %xmm2, %xmm3
+; X32-SSE-NEXT: pandn %xmm0, %xmm3
+; X32-SSE-NEXT: psraw $8, %xmm0
+; X32-SSE-NEXT: pand %xmm2, %xmm0
+; X32-SSE-NEXT: por %xmm3, %xmm0
+; X32-SSE-NEXT: paddw %xmm1, %xmm1
+; X32-SSE-NEXT: movdqa %xmm1, %xmm2
+; X32-SSE-NEXT: psraw $15, %xmm2
+; X32-SSE-NEXT: movdqa %xmm2, %xmm3
+; X32-SSE-NEXT: pandn %xmm0, %xmm3
+; X32-SSE-NEXT: psraw $4, %xmm0
+; X32-SSE-NEXT: pand %xmm2, %xmm0
+; X32-SSE-NEXT: por %xmm3, %xmm0
+; X32-SSE-NEXT: paddw %xmm1, %xmm1
+; X32-SSE-NEXT: movdqa %xmm1, %xmm2
+; X32-SSE-NEXT: psraw $15, %xmm2
+; X32-SSE-NEXT: movdqa %xmm2, %xmm3
+; X32-SSE-NEXT: pandn %xmm0, %xmm3
+; X32-SSE-NEXT: psraw $2, %xmm0
+; X32-SSE-NEXT: pand %xmm2, %xmm0
+; X32-SSE-NEXT: por %xmm3, %xmm0
+; X32-SSE-NEXT: paddw %xmm1, %xmm1
+; X32-SSE-NEXT: psraw $15, %xmm1
+; X32-SSE-NEXT: movdqa %xmm1, %xmm2
+; X32-SSE-NEXT: pandn %xmm0, %xmm2
+; X32-SSE-NEXT: psraw $1, %xmm0
+; X32-SSE-NEXT: pand %xmm1, %xmm0
+; X32-SSE-NEXT: por %xmm2, %xmm0
+; X32-SSE-NEXT: retl
+ %shift = ashr <2 x i16> %a, %b
+ ret <2 x i16> %shift
+}
+
+define <8 x i8> @var_shift_v8i8(<8 x i8> %a, <8 x i8> %b) nounwind {
+; SSE2-LABEL: var_shift_v8i8:
+; SSE2: # %bb.0:
+; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15]
+; SSE2-NEXT: psllw $5, %xmm1
+; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm1[8],xmm4[9],xmm1[9],xmm4[10],xmm1[10],xmm4[11],xmm1[11],xmm4[12],xmm1[12],xmm4[13],xmm1[13],xmm4[14],xmm1[14],xmm4[15],xmm1[15]
+; SSE2-NEXT: pxor %xmm3, %xmm3
+; SSE2-NEXT: pxor %xmm5, %xmm5
+; SSE2-NEXT: pcmpgtw %xmm4, %xmm5
+; SSE2-NEXT: movdqa %xmm5, %xmm6
+; SSE2-NEXT: pandn %xmm2, %xmm6
+; SSE2-NEXT: psraw $4, %xmm2
+; SSE2-NEXT: pand %xmm5, %xmm2
+; SSE2-NEXT: por %xmm6, %xmm2
+; SSE2-NEXT: paddw %xmm4, %xmm4
+; SSE2-NEXT: pxor %xmm5, %xmm5
+; SSE2-NEXT: pcmpgtw %xmm4, %xmm5
+; SSE2-NEXT: movdqa %xmm5, %xmm6
+; SSE2-NEXT: pandn %xmm2, %xmm6
+; SSE2-NEXT: psraw $2, %xmm2
+; SSE2-NEXT: pand %xmm5, %xmm2
+; SSE2-NEXT: por %xmm6, %xmm2
+; SSE2-NEXT: paddw %xmm4, %xmm4
+; SSE2-NEXT: pxor %xmm5, %xmm5
+; SSE2-NEXT: pcmpgtw %xmm4, %xmm5
+; SSE2-NEXT: movdqa %xmm5, %xmm4
+; SSE2-NEXT: pandn %xmm2, %xmm4
+; SSE2-NEXT: psraw $1, %xmm2
+; SSE2-NEXT: pand %xmm5, %xmm2
+; SSE2-NEXT: por %xmm4, %xmm2
+; SSE2-NEXT: psrlw $8, %xmm2
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE2-NEXT: pxor %xmm4, %xmm4
+; SSE2-NEXT: pcmpgtw %xmm1, %xmm4
+; SSE2-NEXT: movdqa %xmm4, %xmm5
+; SSE2-NEXT: pandn %xmm0, %xmm5
+; SSE2-NEXT: psraw $4, %xmm0
+; SSE2-NEXT: pand %xmm4, %xmm0
+; SSE2-NEXT: por %xmm5, %xmm0
+; SSE2-NEXT: paddw %xmm1, %xmm1
+; SSE2-NEXT: pxor %xmm4, %xmm4
+; SSE2-NEXT: pcmpgtw %xmm1, %xmm4
+; SSE2-NEXT: movdqa %xmm4, %xmm5
+; SSE2-NEXT: pandn %xmm0, %xmm5
+; SSE2-NEXT: psraw $2, %xmm0
+; SSE2-NEXT: pand %xmm4, %xmm0
+; SSE2-NEXT: por %xmm5, %xmm0
+; SSE2-NEXT: paddw %xmm1, %xmm1
+; SSE2-NEXT: pcmpgtw %xmm1, %xmm3
+; SSE2-NEXT: movdqa %xmm3, %xmm1
+; SSE2-NEXT: pandn %xmm0, %xmm1
+; SSE2-NEXT: psraw $1, %xmm0
+; SSE2-NEXT: pand %xmm3, %xmm0
+; SSE2-NEXT: por %xmm1, %xmm0
+; SSE2-NEXT: psrlw $8, %xmm0
+; SSE2-NEXT: packuswb %xmm2, %xmm0
+; SSE2-NEXT: retq
+;
+; SSE41-LABEL: var_shift_v8i8:
+; SSE41: # %bb.0:
+; SSE41-NEXT: movdqa %xmm0, %xmm2
+; SSE41-NEXT: psllw $5, %xmm1
+; SSE41-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
+; SSE41-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15]
+; SSE41-NEXT: movdqa %xmm3, %xmm4
+; SSE41-NEXT: psraw $4, %xmm4
+; SSE41-NEXT: pblendvb %xmm0, %xmm4, %xmm3
+; SSE41-NEXT: movdqa %xmm3, %xmm4
+; SSE41-NEXT: psraw $2, %xmm4
+; SSE41-NEXT: paddw %xmm0, %xmm0
+; SSE41-NEXT: pblendvb %xmm0, %xmm4, %xmm3
+; SSE41-NEXT: movdqa %xmm3, %xmm4
+; SSE41-NEXT: psraw $1, %xmm4
+; SSE41-NEXT: paddw %xmm0, %xmm0
+; SSE41-NEXT: pblendvb %xmm0, %xmm4, %xmm3
+; SSE41-NEXT: psrlw $8, %xmm3
+; SSE41-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; SSE41-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
+; SSE41-NEXT: movdqa %xmm1, %xmm2
+; SSE41-NEXT: psraw $4, %xmm2
+; SSE41-NEXT: pblendvb %xmm0, %xmm2, %xmm1
+; SSE41-NEXT: movdqa %xmm1, %xmm2
+; SSE41-NEXT: psraw $2, %xmm2
+; SSE41-NEXT: paddw %xmm0, %xmm0
+; SSE41-NEXT: pblendvb %xmm0, %xmm2, %xmm1
+; SSE41-NEXT: movdqa %xmm1, %xmm2
+; SSE41-NEXT: psraw $1, %xmm2
+; SSE41-NEXT: paddw %xmm0, %xmm0
+; SSE41-NEXT: pblendvb %xmm0, %xmm2, %xmm1
+; SSE41-NEXT: psrlw $8, %xmm1
+; SSE41-NEXT: packuswb %xmm3, %xmm1
+; SSE41-NEXT: movdqa %xmm1, %xmm0
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: var_shift_v8i8:
+; AVX: # %bb.0:
+; AVX-NEXT: vpsllw $5, %xmm1, %xmm1
+; AVX-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
+; AVX-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; AVX-NEXT: vpsraw $4, %xmm3, %xmm4
+; AVX-NEXT: vpblendvb %xmm2, %xmm4, %xmm3, %xmm3
+; AVX-NEXT: vpsraw $2, %xmm3, %xmm4
+; AVX-NEXT: vpaddw %xmm2, %xmm2, %xmm2
+; AVX-NEXT: vpblendvb %xmm2, %xmm4, %xmm3, %xmm3
+; AVX-NEXT: vpsraw $1, %xmm3, %xmm4
+; AVX-NEXT: vpaddw %xmm2, %xmm2, %xmm2
+; AVX-NEXT: vpblendvb %xmm2, %xmm4, %xmm3, %xmm2
+; AVX-NEXT: vpsrlw $8, %xmm2, %xmm2
+; AVX-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; AVX-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; AVX-NEXT: vpsraw $4, %xmm0, %xmm3
+; AVX-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0
+; AVX-NEXT: vpsraw $2, %xmm0, %xmm3
+; AVX-NEXT: vpaddw %xmm1, %xmm1, %xmm1
+; AVX-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0
+; AVX-NEXT: vpsraw $1, %xmm0, %xmm3
+; AVX-NEXT: vpaddw %xmm1, %xmm1, %xmm1
+; AVX-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0
+; AVX-NEXT: vpsrlw $8, %xmm0, %xmm0
+; AVX-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
+; AVX-NEXT: retq
+;
+; XOP-LABEL: var_shift_v8i8:
+; XOP: # %bb.0:
+; XOP-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; XOP-NEXT: vpsubb %xmm1, %xmm2, %xmm1
+; XOP-NEXT: vpshab %xmm1, %xmm0, %xmm0
+; XOP-NEXT: retq
+;
+; AVX512DQ-LABEL: var_shift_v8i8:
+; AVX512DQ: # %bb.0:
+; AVX512DQ-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
+; AVX512DQ-NEXT: vpmovsxbd %xmm0, %zmm0
+; AVX512DQ-NEXT: vpsravd %zmm1, %zmm0, %zmm0
+; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512DQ-NEXT: vzeroupper
+; AVX512DQ-NEXT: retq
+;
+; AVX512BW-LABEL: var_shift_v8i8:
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
+; AVX512BW-NEXT: vpmovsxbw %xmm0, %ymm0
+; AVX512BW-NEXT: vpsravw %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
+; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX512BW-NEXT: vzeroupper
+; AVX512BW-NEXT: retq
+;
+; AVX512DQVL-LABEL: var_shift_v8i8:
+; AVX512DQVL: # %bb.0:
+; AVX512DQVL-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
+; AVX512DQVL-NEXT: vpmovsxbd %xmm0, %zmm0
+; AVX512DQVL-NEXT: vpsravd %zmm1, %zmm0, %zmm0
+; AVX512DQVL-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512DQVL-NEXT: vzeroupper
+; AVX512DQVL-NEXT: retq
+;
+; AVX512BWVL-LABEL: var_shift_v8i8:
+; AVX512BWVL: # %bb.0:
+; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
+; AVX512BWVL-NEXT: vpmovsxbw %xmm0, %ymm0
+; AVX512BWVL-NEXT: vpsravw %ymm1, %ymm0, %ymm0
+; AVX512BWVL-NEXT: vpmovwb %ymm0, %xmm0
+; AVX512BWVL-NEXT: vzeroupper
+; AVX512BWVL-NEXT: retq
+;
+; X32-SSE-LABEL: var_shift_v8i8:
+; X32-SSE: # %bb.0:
+; X32-SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15]
+; X32-SSE-NEXT: psllw $5, %xmm1
+; X32-SSE-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm1[8],xmm4[9],xmm1[9],xmm4[10],xmm1[10],xmm4[11],xmm1[11],xmm4[12],xmm1[12],xmm4[13],xmm1[13],xmm4[14],xmm1[14],xmm4[15],xmm1[15]
+; X32-SSE-NEXT: pxor %xmm3, %xmm3
+; X32-SSE-NEXT: pxor %xmm5, %xmm5
+; X32-SSE-NEXT: pcmpgtw %xmm4, %xmm5
+; X32-SSE-NEXT: movdqa %xmm5, %xmm6
+; X32-SSE-NEXT: pandn %xmm2, %xmm6
+; X32-SSE-NEXT: psraw $4, %xmm2
+; X32-SSE-NEXT: pand %xmm5, %xmm2
+; X32-SSE-NEXT: por %xmm6, %xmm2
+; X32-SSE-NEXT: paddw %xmm4, %xmm4
+; X32-SSE-NEXT: pxor %xmm5, %xmm5
+; X32-SSE-NEXT: pcmpgtw %xmm4, %xmm5
+; X32-SSE-NEXT: movdqa %xmm5, %xmm6
+; X32-SSE-NEXT: pandn %xmm2, %xmm6
+; X32-SSE-NEXT: psraw $2, %xmm2
+; X32-SSE-NEXT: pand %xmm5, %xmm2
+; X32-SSE-NEXT: por %xmm6, %xmm2
+; X32-SSE-NEXT: paddw %xmm4, %xmm4
+; X32-SSE-NEXT: pxor %xmm5, %xmm5
+; X32-SSE-NEXT: pcmpgtw %xmm4, %xmm5
+; X32-SSE-NEXT: movdqa %xmm5, %xmm4
+; X32-SSE-NEXT: pandn %xmm2, %xmm4
+; X32-SSE-NEXT: psraw $1, %xmm2
+; X32-SSE-NEXT: pand %xmm5, %xmm2
+; X32-SSE-NEXT: por %xmm4, %xmm2
+; X32-SSE-NEXT: psrlw $8, %xmm2
+; X32-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; X32-SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; X32-SSE-NEXT: pxor %xmm4, %xmm4
+; X32-SSE-NEXT: pcmpgtw %xmm1, %xmm4
+; X32-SSE-NEXT: movdqa %xmm4, %xmm5
+; X32-SSE-NEXT: pandn %xmm0, %xmm5
+; X32-SSE-NEXT: psraw $4, %xmm0
+; X32-SSE-NEXT: pand %xmm4, %xmm0
+; X32-SSE-NEXT: por %xmm5, %xmm0
+; X32-SSE-NEXT: paddw %xmm1, %xmm1
+; X32-SSE-NEXT: pxor %xmm4, %xmm4
+; X32-SSE-NEXT: pcmpgtw %xmm1, %xmm4
+; X32-SSE-NEXT: movdqa %xmm4, %xmm5
+; X32-SSE-NEXT: pandn %xmm0, %xmm5
+; X32-SSE-NEXT: psraw $2, %xmm0
+; X32-SSE-NEXT: pand %xmm4, %xmm0
+; X32-SSE-NEXT: por %xmm5, %xmm0
+; X32-SSE-NEXT: paddw %xmm1, %xmm1
+; X32-SSE-NEXT: pcmpgtw %xmm1, %xmm3
+; X32-SSE-NEXT: movdqa %xmm3, %xmm1
+; X32-SSE-NEXT: pandn %xmm0, %xmm1
+; X32-SSE-NEXT: psraw $1, %xmm0
+; X32-SSE-NEXT: pand %xmm3, %xmm0
+; X32-SSE-NEXT: por %xmm1, %xmm0
+; X32-SSE-NEXT: psrlw $8, %xmm0
+; X32-SSE-NEXT: packuswb %xmm2, %xmm0
+; X32-SSE-NEXT: retl
+ %shift = ashr <8 x i8> %a, %b
+ ret <8 x i8> %shift
+}
+
+define <4 x i8> @var_shift_v4i8(<4 x i8> %a, <4 x i8> %b) nounwind {
+; SSE2-LABEL: var_shift_v4i8:
+; SSE2: # %bb.0:
+; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15]
+; SSE2-NEXT: psllw $5, %xmm1
+; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm1[8],xmm4[9],xmm1[9],xmm4[10],xmm1[10],xmm4[11],xmm1[11],xmm4[12],xmm1[12],xmm4[13],xmm1[13],xmm4[14],xmm1[14],xmm4[15],xmm1[15]
+; SSE2-NEXT: pxor %xmm3, %xmm3
+; SSE2-NEXT: pxor %xmm5, %xmm5
+; SSE2-NEXT: pcmpgtw %xmm4, %xmm5
+; SSE2-NEXT: movdqa %xmm5, %xmm6
+; SSE2-NEXT: pandn %xmm2, %xmm6
+; SSE2-NEXT: psraw $4, %xmm2
+; SSE2-NEXT: pand %xmm5, %xmm2
+; SSE2-NEXT: por %xmm6, %xmm2
+; SSE2-NEXT: paddw %xmm4, %xmm4
+; SSE2-NEXT: pxor %xmm5, %xmm5
+; SSE2-NEXT: pcmpgtw %xmm4, %xmm5
+; SSE2-NEXT: movdqa %xmm5, %xmm6
+; SSE2-NEXT: pandn %xmm2, %xmm6
+; SSE2-NEXT: psraw $2, %xmm2
+; SSE2-NEXT: pand %xmm5, %xmm2
+; SSE2-NEXT: por %xmm6, %xmm2
+; SSE2-NEXT: paddw %xmm4, %xmm4
+; SSE2-NEXT: pxor %xmm5, %xmm5
+; SSE2-NEXT: pcmpgtw %xmm4, %xmm5
+; SSE2-NEXT: movdqa %xmm5, %xmm4
+; SSE2-NEXT: pandn %xmm2, %xmm4
+; SSE2-NEXT: psraw $1, %xmm2
+; SSE2-NEXT: pand %xmm5, %xmm2
+; SSE2-NEXT: por %xmm4, %xmm2
+; SSE2-NEXT: psrlw $8, %xmm2
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE2-NEXT: pxor %xmm4, %xmm4
+; SSE2-NEXT: pcmpgtw %xmm1, %xmm4
+; SSE2-NEXT: movdqa %xmm4, %xmm5
+; SSE2-NEXT: pandn %xmm0, %xmm5
+; SSE2-NEXT: psraw $4, %xmm0
+; SSE2-NEXT: pand %xmm4, %xmm0
+; SSE2-NEXT: por %xmm5, %xmm0
+; SSE2-NEXT: paddw %xmm1, %xmm1
+; SSE2-NEXT: pxor %xmm4, %xmm4
+; SSE2-NEXT: pcmpgtw %xmm1, %xmm4
+; SSE2-NEXT: movdqa %xmm4, %xmm5
+; SSE2-NEXT: pandn %xmm0, %xmm5
+; SSE2-NEXT: psraw $2, %xmm0
+; SSE2-NEXT: pand %xmm4, %xmm0
+; SSE2-NEXT: por %xmm5, %xmm0
+; SSE2-NEXT: paddw %xmm1, %xmm1
+; SSE2-NEXT: pcmpgtw %xmm1, %xmm3
+; SSE2-NEXT: movdqa %xmm3, %xmm1
+; SSE2-NEXT: pandn %xmm0, %xmm1
+; SSE2-NEXT: psraw $1, %xmm0
+; SSE2-NEXT: pand %xmm3, %xmm0
+; SSE2-NEXT: por %xmm1, %xmm0
+; SSE2-NEXT: psrlw $8, %xmm0
+; SSE2-NEXT: packuswb %xmm2, %xmm0
+; SSE2-NEXT: retq
+;
+; SSE41-LABEL: var_shift_v4i8:
+; SSE41: # %bb.0:
+; SSE41-NEXT: movdqa %xmm0, %xmm2
+; SSE41-NEXT: psllw $5, %xmm1
+; SSE41-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
+; SSE41-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15]
+; SSE41-NEXT: movdqa %xmm3, %xmm4
+; SSE41-NEXT: psraw $4, %xmm4
+; SSE41-NEXT: pblendvb %xmm0, %xmm4, %xmm3
+; SSE41-NEXT: movdqa %xmm3, %xmm4
+; SSE41-NEXT: psraw $2, %xmm4
+; SSE41-NEXT: paddw %xmm0, %xmm0
+; SSE41-NEXT: pblendvb %xmm0, %xmm4, %xmm3
+; SSE41-NEXT: movdqa %xmm3, %xmm4
+; SSE41-NEXT: psraw $1, %xmm4
+; SSE41-NEXT: paddw %xmm0, %xmm0
+; SSE41-NEXT: pblendvb %xmm0, %xmm4, %xmm3
+; SSE41-NEXT: psrlw $8, %xmm3
+; SSE41-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; SSE41-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
+; SSE41-NEXT: movdqa %xmm1, %xmm2
+; SSE41-NEXT: psraw $4, %xmm2
+; SSE41-NEXT: pblendvb %xmm0, %xmm2, %xmm1
+; SSE41-NEXT: movdqa %xmm1, %xmm2
+; SSE41-NEXT: psraw $2, %xmm2
+; SSE41-NEXT: paddw %xmm0, %xmm0
+; SSE41-NEXT: pblendvb %xmm0, %xmm2, %xmm1
+; SSE41-NEXT: movdqa %xmm1, %xmm2
+; SSE41-NEXT: psraw $1, %xmm2
+; SSE41-NEXT: paddw %xmm0, %xmm0
+; SSE41-NEXT: pblendvb %xmm0, %xmm2, %xmm1
+; SSE41-NEXT: psrlw $8, %xmm1
+; SSE41-NEXT: packuswb %xmm3, %xmm1
+; SSE41-NEXT: movdqa %xmm1, %xmm0
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: var_shift_v4i8:
+; AVX: # %bb.0:
+; AVX-NEXT: vpsllw $5, %xmm1, %xmm1
+; AVX-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
+; AVX-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; AVX-NEXT: vpsraw $4, %xmm3, %xmm4
+; AVX-NEXT: vpblendvb %xmm2, %xmm4, %xmm3, %xmm3
+; AVX-NEXT: vpsraw $2, %xmm3, %xmm4
+; AVX-NEXT: vpaddw %xmm2, %xmm2, %xmm2
+; AVX-NEXT: vpblendvb %xmm2, %xmm4, %xmm3, %xmm3
+; AVX-NEXT: vpsraw $1, %xmm3, %xmm4
+; AVX-NEXT: vpaddw %xmm2, %xmm2, %xmm2
+; AVX-NEXT: vpblendvb %xmm2, %xmm4, %xmm3, %xmm2
+; AVX-NEXT: vpsrlw $8, %xmm2, %xmm2
+; AVX-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; AVX-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; AVX-NEXT: vpsraw $4, %xmm0, %xmm3
+; AVX-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0
+; AVX-NEXT: vpsraw $2, %xmm0, %xmm3
+; AVX-NEXT: vpaddw %xmm1, %xmm1, %xmm1
+; AVX-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0
+; AVX-NEXT: vpsraw $1, %xmm0, %xmm3
+; AVX-NEXT: vpaddw %xmm1, %xmm1, %xmm1
+; AVX-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0
+; AVX-NEXT: vpsrlw $8, %xmm0, %xmm0
+; AVX-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
+; AVX-NEXT: retq
+;
+; XOP-LABEL: var_shift_v4i8:
+; XOP: # %bb.0:
+; XOP-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; XOP-NEXT: vpsubb %xmm1, %xmm2, %xmm1
+; XOP-NEXT: vpshab %xmm1, %xmm0, %xmm0
+; XOP-NEXT: retq
+;
+; AVX512DQ-LABEL: var_shift_v4i8:
+; AVX512DQ: # %bb.0:
+; AVX512DQ-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
+; AVX512DQ-NEXT: vpmovsxbd %xmm0, %zmm0
+; AVX512DQ-NEXT: vpsravd %zmm1, %zmm0, %zmm0
+; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512DQ-NEXT: vzeroupper
+; AVX512DQ-NEXT: retq
+;
+; AVX512BW-LABEL: var_shift_v4i8:
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
+; AVX512BW-NEXT: vpmovsxbw %xmm0, %ymm0
+; AVX512BW-NEXT: vpsravw %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
+; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX512BW-NEXT: vzeroupper
+; AVX512BW-NEXT: retq
+;
+; AVX512DQVL-LABEL: var_shift_v4i8:
+; AVX512DQVL: # %bb.0:
+; AVX512DQVL-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
+; AVX512DQVL-NEXT: vpmovsxbd %xmm0, %zmm0
+; AVX512DQVL-NEXT: vpsravd %zmm1, %zmm0, %zmm0
+; AVX512DQVL-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512DQVL-NEXT: vzeroupper
+; AVX512DQVL-NEXT: retq
+;
+; AVX512BWVL-LABEL: var_shift_v4i8:
+; AVX512BWVL: # %bb.0:
+; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
+; AVX512BWVL-NEXT: vpmovsxbw %xmm0, %ymm0
+; AVX512BWVL-NEXT: vpsravw %ymm1, %ymm0, %ymm0
+; AVX512BWVL-NEXT: vpmovwb %ymm0, %xmm0
+; AVX512BWVL-NEXT: vzeroupper
+; AVX512BWVL-NEXT: retq
;
; X32-SSE-LABEL: var_shift_v4i8:
; X32-SSE: # %bb.0:
-; X32-SSE-NEXT: pslld $24, %xmm0
-; X32-SSE-NEXT: psrad $24, %xmm0
-; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm1
-; X32-SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm1[2,3,3,3,4,5,6,7]
-; X32-SSE-NEXT: movdqa %xmm0, %xmm3
-; X32-SSE-NEXT: psrad %xmm2, %xmm3
-; X32-SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm1[0,1,1,1,4,5,6,7]
-; X32-SSE-NEXT: movdqa %xmm0, %xmm2
-; X32-SSE-NEXT: psrad %xmm4, %xmm2
-; X32-SSE-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
-; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
-; X32-SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm1[2,3,3,3,4,5,6,7]
-; X32-SSE-NEXT: movdqa %xmm0, %xmm4
-; X32-SSE-NEXT: psrad %xmm3, %xmm4
-; X32-SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,1,1,4,5,6,7]
-; X32-SSE-NEXT: psrad %xmm1, %xmm0
-; X32-SSE-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm4[1]
-; X32-SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,3],xmm0[0,3]
-; X32-SSE-NEXT: movaps %xmm2, %xmm0
+; X32-SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15]
+; X32-SSE-NEXT: psllw $5, %xmm1
+; X32-SSE-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm1[8],xmm4[9],xmm1[9],xmm4[10],xmm1[10],xmm4[11],xmm1[11],xmm4[12],xmm1[12],xmm4[13],xmm1[13],xmm4[14],xmm1[14],xmm4[15],xmm1[15]
+; X32-SSE-NEXT: pxor %xmm3, %xmm3
+; X32-SSE-NEXT: pxor %xmm5, %xmm5
+; X32-SSE-NEXT: pcmpgtw %xmm4, %xmm5
+; X32-SSE-NEXT: movdqa %xmm5, %xmm6
+; X32-SSE-NEXT: pandn %xmm2, %xmm6
+; X32-SSE-NEXT: psraw $4, %xmm2
+; X32-SSE-NEXT: pand %xmm5, %xmm2
+; X32-SSE-NEXT: por %xmm6, %xmm2
+; X32-SSE-NEXT: paddw %xmm4, %xmm4
+; X32-SSE-NEXT: pxor %xmm5, %xmm5
+; X32-SSE-NEXT: pcmpgtw %xmm4, %xmm5
+; X32-SSE-NEXT: movdqa %xmm5, %xmm6
+; X32-SSE-NEXT: pandn %xmm2, %xmm6
+; X32-SSE-NEXT: psraw $2, %xmm2
+; X32-SSE-NEXT: pand %xmm5, %xmm2
+; X32-SSE-NEXT: por %xmm6, %xmm2
+; X32-SSE-NEXT: paddw %xmm4, %xmm4
+; X32-SSE-NEXT: pxor %xmm5, %xmm5
+; X32-SSE-NEXT: pcmpgtw %xmm4, %xmm5
+; X32-SSE-NEXT: movdqa %xmm5, %xmm4
+; X32-SSE-NEXT: pandn %xmm2, %xmm4
+; X32-SSE-NEXT: psraw $1, %xmm2
+; X32-SSE-NEXT: pand %xmm5, %xmm2
+; X32-SSE-NEXT: por %xmm4, %xmm2
+; X32-SSE-NEXT: psrlw $8, %xmm2
+; X32-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; X32-SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; X32-SSE-NEXT: pxor %xmm4, %xmm4
+; X32-SSE-NEXT: pcmpgtw %xmm1, %xmm4
+; X32-SSE-NEXT: movdqa %xmm4, %xmm5
+; X32-SSE-NEXT: pandn %xmm0, %xmm5
+; X32-SSE-NEXT: psraw $4, %xmm0
+; X32-SSE-NEXT: pand %xmm4, %xmm0
+; X32-SSE-NEXT: por %xmm5, %xmm0
+; X32-SSE-NEXT: paddw %xmm1, %xmm1
+; X32-SSE-NEXT: pxor %xmm4, %xmm4
+; X32-SSE-NEXT: pcmpgtw %xmm1, %xmm4
+; X32-SSE-NEXT: movdqa %xmm4, %xmm5
+; X32-SSE-NEXT: pandn %xmm0, %xmm5
+; X32-SSE-NEXT: psraw $2, %xmm0
+; X32-SSE-NEXT: pand %xmm4, %xmm0
+; X32-SSE-NEXT: por %xmm5, %xmm0
+; X32-SSE-NEXT: paddw %xmm1, %xmm1
+; X32-SSE-NEXT: pcmpgtw %xmm1, %xmm3
+; X32-SSE-NEXT: movdqa %xmm3, %xmm1
+; X32-SSE-NEXT: pandn %xmm0, %xmm1
+; X32-SSE-NEXT: psraw $1, %xmm0
+; X32-SSE-NEXT: pand %xmm3, %xmm0
+; X32-SSE-NEXT: por %xmm1, %xmm0
+; X32-SSE-NEXT: psrlw $8, %xmm0
+; X32-SSE-NEXT: packuswb %xmm2, %xmm0
; X32-SSE-NEXT: retl
%shift = ashr <4 x i8> %a, %b
ret <4 x i8> %shift
define <2 x i8> @var_shift_v2i8(<2 x i8> %a, <2 x i8> %b) nounwind {
; SSE2-LABEL: var_shift_v2i8:
; SSE2: # %bb.0:
-; SSE2-NEXT: psllq $56, %xmm0
-; SSE2-NEXT: movdqa %xmm0, %xmm2
-; SSE2-NEXT: psrad $31, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3]
-; SSE2-NEXT: psrad $24, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
-; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
-; SSE2-NEXT: pand {{.*}}(%rip), %xmm1
-; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
-; SSE2-NEXT: movdqa %xmm2, %xmm3
-; SSE2-NEXT: psrlq %xmm1, %xmm3
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm1[2,3,0,1]
-; SSE2-NEXT: psrlq %xmm4, %xmm2
-; SSE2-NEXT: movsd {{.*#+}} xmm2 = xmm3[0],xmm2[1]
-; SSE2-NEXT: movdqa %xmm0, %xmm3
-; SSE2-NEXT: psrlq %xmm1, %xmm3
-; SSE2-NEXT: psrlq %xmm4, %xmm0
-; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm3[0],xmm0[1]
-; SSE2-NEXT: xorpd %xmm2, %xmm0
-; SSE2-NEXT: psubq %xmm2, %xmm0
+; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15]
+; SSE2-NEXT: psllw $5, %xmm1
+; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm1[8],xmm4[9],xmm1[9],xmm4[10],xmm1[10],xmm4[11],xmm1[11],xmm4[12],xmm1[12],xmm4[13],xmm1[13],xmm4[14],xmm1[14],xmm4[15],xmm1[15]
+; SSE2-NEXT: pxor %xmm3, %xmm3
+; SSE2-NEXT: pxor %xmm5, %xmm5
+; SSE2-NEXT: pcmpgtw %xmm4, %xmm5
+; SSE2-NEXT: movdqa %xmm5, %xmm6
+; SSE2-NEXT: pandn %xmm2, %xmm6
+; SSE2-NEXT: psraw $4, %xmm2
+; SSE2-NEXT: pand %xmm5, %xmm2
+; SSE2-NEXT: por %xmm6, %xmm2
+; SSE2-NEXT: paddw %xmm4, %xmm4
+; SSE2-NEXT: pxor %xmm5, %xmm5
+; SSE2-NEXT: pcmpgtw %xmm4, %xmm5
+; SSE2-NEXT: movdqa %xmm5, %xmm6
+; SSE2-NEXT: pandn %xmm2, %xmm6
+; SSE2-NEXT: psraw $2, %xmm2
+; SSE2-NEXT: pand %xmm5, %xmm2
+; SSE2-NEXT: por %xmm6, %xmm2
+; SSE2-NEXT: paddw %xmm4, %xmm4
+; SSE2-NEXT: pxor %xmm5, %xmm5
+; SSE2-NEXT: pcmpgtw %xmm4, %xmm5
+; SSE2-NEXT: movdqa %xmm5, %xmm4
+; SSE2-NEXT: pandn %xmm2, %xmm4
+; SSE2-NEXT: psraw $1, %xmm2
+; SSE2-NEXT: pand %xmm5, %xmm2
+; SSE2-NEXT: por %xmm4, %xmm2
+; SSE2-NEXT: psrlw $8, %xmm2
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE2-NEXT: pxor %xmm4, %xmm4
+; SSE2-NEXT: pcmpgtw %xmm1, %xmm4
+; SSE2-NEXT: movdqa %xmm4, %xmm5
+; SSE2-NEXT: pandn %xmm0, %xmm5
+; SSE2-NEXT: psraw $4, %xmm0
+; SSE2-NEXT: pand %xmm4, %xmm0
+; SSE2-NEXT: por %xmm5, %xmm0
+; SSE2-NEXT: paddw %xmm1, %xmm1
+; SSE2-NEXT: pxor %xmm4, %xmm4
+; SSE2-NEXT: pcmpgtw %xmm1, %xmm4
+; SSE2-NEXT: movdqa %xmm4, %xmm5
+; SSE2-NEXT: pandn %xmm0, %xmm5
+; SSE2-NEXT: psraw $2, %xmm0
+; SSE2-NEXT: pand %xmm4, %xmm0
+; SSE2-NEXT: por %xmm5, %xmm0
+; SSE2-NEXT: paddw %xmm1, %xmm1
+; SSE2-NEXT: pcmpgtw %xmm1, %xmm3
+; SSE2-NEXT: movdqa %xmm3, %xmm1
+; SSE2-NEXT: pandn %xmm0, %xmm1
+; SSE2-NEXT: psraw $1, %xmm0
+; SSE2-NEXT: pand %xmm3, %xmm0
+; SSE2-NEXT: por %xmm1, %xmm0
+; SSE2-NEXT: psrlw $8, %xmm0
+; SSE2-NEXT: packuswb %xmm2, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: var_shift_v2i8:
; SSE41: # %bb.0:
-; SSE41-NEXT: psllq $56, %xmm0
; SSE41-NEXT: movdqa %xmm0, %xmm2
-; SSE41-NEXT: psrad $31, %xmm2
-; SSE41-NEXT: psrad $24, %xmm0
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
-; SSE41-NEXT: pand {{.*}}(%rip), %xmm1
-; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
-; SSE41-NEXT: movdqa %xmm2, %xmm3
-; SSE41-NEXT: psrlq %xmm1, %xmm3
-; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm1[2,3,0,1]
-; SSE41-NEXT: psrlq %xmm4, %xmm2
-; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7]
-; SSE41-NEXT: movdqa %xmm0, %xmm3
-; SSE41-NEXT: psrlq %xmm4, %xmm3
-; SSE41-NEXT: psrlq %xmm1, %xmm0
-; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm3[4,5,6,7]
-; SSE41-NEXT: pxor %xmm2, %xmm0
-; SSE41-NEXT: psubq %xmm2, %xmm0
+; SSE41-NEXT: psllw $5, %xmm1
+; SSE41-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
+; SSE41-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15]
+; SSE41-NEXT: movdqa %xmm3, %xmm4
+; SSE41-NEXT: psraw $4, %xmm4
+; SSE41-NEXT: pblendvb %xmm0, %xmm4, %xmm3
+; SSE41-NEXT: movdqa %xmm3, %xmm4
+; SSE41-NEXT: psraw $2, %xmm4
+; SSE41-NEXT: paddw %xmm0, %xmm0
+; SSE41-NEXT: pblendvb %xmm0, %xmm4, %xmm3
+; SSE41-NEXT: movdqa %xmm3, %xmm4
+; SSE41-NEXT: psraw $1, %xmm4
+; SSE41-NEXT: paddw %xmm0, %xmm0
+; SSE41-NEXT: pblendvb %xmm0, %xmm4, %xmm3
+; SSE41-NEXT: psrlw $8, %xmm3
+; SSE41-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; SSE41-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
+; SSE41-NEXT: movdqa %xmm1, %xmm2
+; SSE41-NEXT: psraw $4, %xmm2
+; SSE41-NEXT: pblendvb %xmm0, %xmm2, %xmm1
+; SSE41-NEXT: movdqa %xmm1, %xmm2
+; SSE41-NEXT: psraw $2, %xmm2
+; SSE41-NEXT: paddw %xmm0, %xmm0
+; SSE41-NEXT: pblendvb %xmm0, %xmm2, %xmm1
+; SSE41-NEXT: movdqa %xmm1, %xmm2
+; SSE41-NEXT: psraw $1, %xmm2
+; SSE41-NEXT: paddw %xmm0, %xmm0
+; SSE41-NEXT: pblendvb %xmm0, %xmm2, %xmm1
+; SSE41-NEXT: psrlw $8, %xmm1
+; SSE41-NEXT: packuswb %xmm3, %xmm1
+; SSE41-NEXT: movdqa %xmm1, %xmm0
; SSE41-NEXT: retq
;
-; AVX1-LABEL: var_shift_v2i8:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vpsllq $56, %xmm0, %xmm0
-; AVX1-NEXT: vpsrad $31, %xmm0, %xmm2
-; AVX1-NEXT: vpsrad $24, %xmm0, %xmm0
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
-; AVX1-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
-; AVX1-NEXT: vpsrlq %xmm1, %xmm2, %xmm3
-; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm1[2,3,0,1]
-; AVX1-NEXT: vpsrlq %xmm4, %xmm2, %xmm2
-; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7]
-; AVX1-NEXT: vpsrlq %xmm4, %xmm0, %xmm3
-; AVX1-NEXT: vpsrlq %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm3[4,5,6,7]
-; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm0
-; AVX1-NEXT: vpsubq %xmm2, %xmm0, %xmm0
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: var_shift_v2i8:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpsllq $56, %xmm0, %xmm0
-; AVX2-NEXT: vpsrad $31, %xmm0, %xmm2
-; AVX2-NEXT: vpsrad $24, %xmm0, %xmm0
-; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
-; AVX2-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
-; AVX2-NEXT: vpsrlvq %xmm1, %xmm2, %xmm2
-; AVX2-NEXT: vpsrlvq %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm0
-; AVX2-NEXT: vpsubq %xmm2, %xmm0, %xmm0
-; AVX2-NEXT: retq
+; AVX-LABEL: var_shift_v2i8:
+; AVX: # %bb.0:
+; AVX-NEXT: vpsllw $5, %xmm1, %xmm1
+; AVX-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
+; AVX-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; AVX-NEXT: vpsraw $4, %xmm3, %xmm4
+; AVX-NEXT: vpblendvb %xmm2, %xmm4, %xmm3, %xmm3
+; AVX-NEXT: vpsraw $2, %xmm3, %xmm4
+; AVX-NEXT: vpaddw %xmm2, %xmm2, %xmm2
+; AVX-NEXT: vpblendvb %xmm2, %xmm4, %xmm3, %xmm3
+; AVX-NEXT: vpsraw $1, %xmm3, %xmm4
+; AVX-NEXT: vpaddw %xmm2, %xmm2, %xmm2
+; AVX-NEXT: vpblendvb %xmm2, %xmm4, %xmm3, %xmm2
+; AVX-NEXT: vpsrlw $8, %xmm2, %xmm2
+; AVX-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; AVX-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; AVX-NEXT: vpsraw $4, %xmm0, %xmm3
+; AVX-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0
+; AVX-NEXT: vpsraw $2, %xmm0, %xmm3
+; AVX-NEXT: vpaddw %xmm1, %xmm1, %xmm1
+; AVX-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0
+; AVX-NEXT: vpsraw $1, %xmm0, %xmm3
+; AVX-NEXT: vpaddw %xmm1, %xmm1, %xmm1
+; AVX-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0
+; AVX-NEXT: vpsrlw $8, %xmm0, %xmm0
+; AVX-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
+; AVX-NEXT: retq
;
; XOP-LABEL: var_shift_v2i8:
; XOP: # %bb.0:
-; XOP-NEXT: vpsllq $56, %xmm0, %xmm0
-; XOP-NEXT: vpshaq {{.*}}(%rip), %xmm0, %xmm0
-; XOP-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
; XOP-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; XOP-NEXT: vpsubq %xmm1, %xmm2, %xmm1
-; XOP-NEXT: vpshaq %xmm1, %xmm0, %xmm0
+; XOP-NEXT: vpsubb %xmm1, %xmm2, %xmm1
+; XOP-NEXT: vpshab %xmm1, %xmm0, %xmm0
; XOP-NEXT: retq
;
-; AVX512-LABEL: var_shift_v2i8:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vpsllq $56, %xmm0, %xmm0
-; AVX512-NEXT: vpsraq $56, %zmm0, %zmm0
-; AVX512-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
-; AVX512-NEXT: vpsravq %zmm1, %zmm0, %zmm0
-; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
-; AVX512-NEXT: vzeroupper
-; AVX512-NEXT: retq
+; AVX512DQ-LABEL: var_shift_v2i8:
+; AVX512DQ: # %bb.0:
+; AVX512DQ-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
+; AVX512DQ-NEXT: vpmovsxbd %xmm0, %zmm0
+; AVX512DQ-NEXT: vpsravd %zmm1, %zmm0, %zmm0
+; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512DQ-NEXT: vzeroupper
+; AVX512DQ-NEXT: retq
;
-; AVX512VL-LABEL: var_shift_v2i8:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
-; AVX512VL-NEXT: vpsllq $56, %xmm0, %xmm0
-; AVX512VL-NEXT: vpsraq $56, %xmm0, %xmm0
-; AVX512VL-NEXT: vpsravq %xmm1, %xmm0, %xmm0
-; AVX512VL-NEXT: retq
+; AVX512BW-LABEL: var_shift_v2i8:
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
+; AVX512BW-NEXT: vpmovsxbw %xmm0, %ymm0
+; AVX512BW-NEXT: vpsravw %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
+; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX512BW-NEXT: vzeroupper
+; AVX512BW-NEXT: retq
+;
+; AVX512DQVL-LABEL: var_shift_v2i8:
+; AVX512DQVL: # %bb.0:
+; AVX512DQVL-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
+; AVX512DQVL-NEXT: vpmovsxbd %xmm0, %zmm0
+; AVX512DQVL-NEXT: vpsravd %zmm1, %zmm0, %zmm0
+; AVX512DQVL-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512DQVL-NEXT: vzeroupper
+; AVX512DQVL-NEXT: retq
+;
+; AVX512BWVL-LABEL: var_shift_v2i8:
+; AVX512BWVL: # %bb.0:
+; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
+; AVX512BWVL-NEXT: vpmovsxbw %xmm0, %ymm0
+; AVX512BWVL-NEXT: vpsravw %ymm1, %ymm0, %ymm0
+; AVX512BWVL-NEXT: vpmovwb %ymm0, %xmm0
+; AVX512BWVL-NEXT: vzeroupper
+; AVX512BWVL-NEXT: retq
;
; X32-SSE-LABEL: var_shift_v2i8:
; X32-SSE: # %bb.0:
-; X32-SSE-NEXT: psllq $56, %xmm0
-; X32-SSE-NEXT: movdqa %xmm0, %xmm2
-; X32-SSE-NEXT: psrad $31, %xmm2
-; X32-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3]
-; X32-SSE-NEXT: psrad $24, %xmm0
-; X32-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
-; X32-SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
-; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm1
-; X32-SSE-NEXT: movdqa {{.*#+}} xmm2 = [0,2147483648,0,2147483648]
-; X32-SSE-NEXT: movdqa %xmm2, %xmm3
-; X32-SSE-NEXT: psrlq %xmm1, %xmm3
-; X32-SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm1[2,3,0,1]
-; X32-SSE-NEXT: psrlq %xmm4, %xmm2
-; X32-SSE-NEXT: movsd {{.*#+}} xmm2 = xmm3[0],xmm2[1]
-; X32-SSE-NEXT: movdqa %xmm0, %xmm3
-; X32-SSE-NEXT: psrlq %xmm1, %xmm3
-; X32-SSE-NEXT: psrlq %xmm4, %xmm0
-; X32-SSE-NEXT: movsd {{.*#+}} xmm0 = xmm3[0],xmm0[1]
-; X32-SSE-NEXT: xorpd %xmm2, %xmm0
-; X32-SSE-NEXT: psubq %xmm2, %xmm0
+; X32-SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15]
+; X32-SSE-NEXT: psllw $5, %xmm1
+; X32-SSE-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm1[8],xmm4[9],xmm1[9],xmm4[10],xmm1[10],xmm4[11],xmm1[11],xmm4[12],xmm1[12],xmm4[13],xmm1[13],xmm4[14],xmm1[14],xmm4[15],xmm1[15]
+; X32-SSE-NEXT: pxor %xmm3, %xmm3
+; X32-SSE-NEXT: pxor %xmm5, %xmm5
+; X32-SSE-NEXT: pcmpgtw %xmm4, %xmm5
+; X32-SSE-NEXT: movdqa %xmm5, %xmm6
+; X32-SSE-NEXT: pandn %xmm2, %xmm6
+; X32-SSE-NEXT: psraw $4, %xmm2
+; X32-SSE-NEXT: pand %xmm5, %xmm2
+; X32-SSE-NEXT: por %xmm6, %xmm2
+; X32-SSE-NEXT: paddw %xmm4, %xmm4
+; X32-SSE-NEXT: pxor %xmm5, %xmm5
+; X32-SSE-NEXT: pcmpgtw %xmm4, %xmm5
+; X32-SSE-NEXT: movdqa %xmm5, %xmm6
+; X32-SSE-NEXT: pandn %xmm2, %xmm6
+; X32-SSE-NEXT: psraw $2, %xmm2
+; X32-SSE-NEXT: pand %xmm5, %xmm2
+; X32-SSE-NEXT: por %xmm6, %xmm2
+; X32-SSE-NEXT: paddw %xmm4, %xmm4
+; X32-SSE-NEXT: pxor %xmm5, %xmm5
+; X32-SSE-NEXT: pcmpgtw %xmm4, %xmm5
+; X32-SSE-NEXT: movdqa %xmm5, %xmm4
+; X32-SSE-NEXT: pandn %xmm2, %xmm4
+; X32-SSE-NEXT: psraw $1, %xmm2
+; X32-SSE-NEXT: pand %xmm5, %xmm2
+; X32-SSE-NEXT: por %xmm4, %xmm2
+; X32-SSE-NEXT: psrlw $8, %xmm2
+; X32-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; X32-SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; X32-SSE-NEXT: pxor %xmm4, %xmm4
+; X32-SSE-NEXT: pcmpgtw %xmm1, %xmm4
+; X32-SSE-NEXT: movdqa %xmm4, %xmm5
+; X32-SSE-NEXT: pandn %xmm0, %xmm5
+; X32-SSE-NEXT: psraw $4, %xmm0
+; X32-SSE-NEXT: pand %xmm4, %xmm0
+; X32-SSE-NEXT: por %xmm5, %xmm0
+; X32-SSE-NEXT: paddw %xmm1, %xmm1
+; X32-SSE-NEXT: pxor %xmm4, %xmm4
+; X32-SSE-NEXT: pcmpgtw %xmm1, %xmm4
+; X32-SSE-NEXT: movdqa %xmm4, %xmm5
+; X32-SSE-NEXT: pandn %xmm0, %xmm5
+; X32-SSE-NEXT: psraw $2, %xmm0
+; X32-SSE-NEXT: pand %xmm4, %xmm0
+; X32-SSE-NEXT: por %xmm5, %xmm0
+; X32-SSE-NEXT: paddw %xmm1, %xmm1
+; X32-SSE-NEXT: pcmpgtw %xmm1, %xmm3
+; X32-SSE-NEXT: movdqa %xmm3, %xmm1
+; X32-SSE-NEXT: pandn %xmm0, %xmm1
+; X32-SSE-NEXT: psraw $1, %xmm0
+; X32-SSE-NEXT: pand %xmm3, %xmm0
+; X32-SSE-NEXT: por %xmm1, %xmm0
+; X32-SSE-NEXT: psrlw $8, %xmm0
+; X32-SSE-NEXT: packuswb %xmm2, %xmm0
; X32-SSE-NEXT: retl
%shift = ashr <2 x i8> %a, %b
  ret <2 x i8> %shift
}
define <2 x i32> @splatvar_shift_v2i32(<2 x i32> %a, <2 x i32> %b) nounwind {
; SSE2-LABEL: splatvar_shift_v2i32:
; SSE2: # %bb.0:
-; SSE2-NEXT: psllq $32, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,3,2,3]
-; SSE2-NEXT: psrad $31, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
-; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,1,0,1]
-; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
-; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [9223372036854775808,9223372036854775808]
-; SSE2-NEXT: movdqa %xmm1, %xmm3
-; SSE2-NEXT: psrlq %xmm0, %xmm3
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[2,3,0,1]
-; SSE2-NEXT: psrlq %xmm4, %xmm1
-; SSE2-NEXT: movsd {{.*#+}} xmm1 = xmm3[0],xmm1[1]
-; SSE2-NEXT: movdqa %xmm2, %xmm3
-; SSE2-NEXT: psrlq %xmm0, %xmm3
-; SSE2-NEXT: psrlq %xmm4, %xmm2
-; SSE2-NEXT: movsd {{.*#+}} xmm2 = xmm3[0],xmm2[1]
-; SSE2-NEXT: xorpd %xmm1, %xmm2
-; SSE2-NEXT: psubq %xmm1, %xmm2
-; SSE2-NEXT: movdqa %xmm2, %xmm0
+; SSE2-NEXT: xorps %xmm2, %xmm2
+; SSE2-NEXT: movss {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3]
+; SSE2-NEXT: psrad %xmm2, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: splatvar_shift_v2i32:
; SSE41: # %bb.0:
-; SSE41-NEXT: movdqa %xmm0, %xmm2
-; SSE41-NEXT: psllq $32, %xmm2
-; SSE41-NEXT: psrad $31, %xmm2
-; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,1,0,1]
-; SSE41-NEXT: pxor %xmm1, %xmm1
-; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
-; SSE41-NEXT: movdqa %xmm2, %xmm3
-; SSE41-NEXT: psrlq %xmm0, %xmm3
-; SSE41-NEXT: psrlq %xmm1, %xmm2
-; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm3[4,5,6,7]
-; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [9223372036854775808,9223372036854775808]
-; SSE41-NEXT: movdqa %xmm3, %xmm4
-; SSE41-NEXT: psrlq %xmm1, %xmm4
-; SSE41-NEXT: psrlq %xmm0, %xmm3
-; SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm4[0,1,2,3],xmm3[4,5,6,7]
-; SSE41-NEXT: pxor %xmm3, %xmm2
-; SSE41-NEXT: psubq %xmm3, %xmm2
-; SSE41-NEXT: movdqa %xmm2, %xmm0
+; SSE41-NEXT: pmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
+; SSE41-NEXT: psrad %xmm1, %xmm0
; SSE41-NEXT: retq
;
-; AVX1-LABEL: splatvar_shift_v2i32:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vpsllq $32, %xmm0, %xmm2
-; AVX1-NEXT: vpsrad $31, %xmm2, %xmm2
-; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
-; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[2,3,0,1]
-; AVX1-NEXT: vpsrlq %xmm2, %xmm0, %xmm3
-; AVX1-NEXT: vpsrlq %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm3[4,5,6,7]
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [9223372036854775808,9223372036854775808]
-; AVX1-NEXT: vpsrlq %xmm1, %xmm3, %xmm1
-; AVX1-NEXT: vpsrlq %xmm2, %xmm3, %xmm2
-; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4,5,6,7]
-; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpsubq %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: splatvar_shift_v2i32:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpsllq $32, %xmm0, %xmm2
-; AVX2-NEXT: vpsrad $31, %xmm2, %xmm2
-; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
-; AVX2-NEXT: vpbroadcastq %xmm1, %xmm1
-; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
-; AVX2-NEXT: vpsrlvq %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
-; AVX2-NEXT: vpsrlvq %xmm1, %xmm2, %xmm1
-; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpsubq %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: retq
-;
-; XOPAVX1-LABEL: splatvar_shift_v2i32:
-; XOPAVX1: # %bb.0:
-; XOPAVX1-NEXT: vpsllq $32, %xmm0, %xmm0
-; XOPAVX1-NEXT: vpshaq {{.*}}(%rip), %xmm0, %xmm0
-; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
-; XOPAVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; XOPAVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
-; XOPAVX1-NEXT: vpsubq %xmm1, %xmm2, %xmm1
-; XOPAVX1-NEXT: vpshaq %xmm1, %xmm0, %xmm0
-; XOPAVX1-NEXT: retq
+; AVX-LABEL: splatvar_shift_v2i32:
+; AVX: # %bb.0:
+; AVX-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
+; AVX-NEXT: vpsrad %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retq
;
-; XOPAVX2-LABEL: splatvar_shift_v2i32:
-; XOPAVX2: # %bb.0:
-; XOPAVX2-NEXT: vpsllq $32, %xmm0, %xmm0
-; XOPAVX2-NEXT: vpshaq {{.*}}(%rip), %xmm0, %xmm0
-; XOPAVX2-NEXT: vpbroadcastq %xmm1, %xmm1
-; XOPAVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; XOPAVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
-; XOPAVX2-NEXT: vpsubq %xmm1, %xmm2, %xmm1
-; XOPAVX2-NEXT: vpshaq %xmm1, %xmm0, %xmm0
-; XOPAVX2-NEXT: retq
+; XOP-LABEL: splatvar_shift_v2i32:
+; XOP: # %bb.0:
+; XOP-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
+; XOP-NEXT: vpsrad %xmm1, %xmm0, %xmm0
+; XOP-NEXT: retq
;
; AVX512-LABEL: splatvar_shift_v2i32:
; AVX512: # %bb.0:
-; AVX512-NEXT: vpsllq $32, %xmm0, %xmm0
-; AVX512-NEXT: vpsraq $32, %zmm0, %zmm0
-; AVX512-NEXT: vpbroadcastq %xmm1, %xmm1
-; AVX512-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX512-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
-; AVX512-NEXT: vpsravq %zmm1, %zmm0, %zmm0
-; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
-; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
+; AVX512-NEXT: vpsrad %xmm1, %xmm0, %xmm0
; AVX512-NEXT: retq
;
; AVX512VL-LABEL: splatvar_shift_v2i32:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpsllq $32, %xmm0, %xmm0
-; AVX512VL-NEXT: vpsraq $32, %xmm0, %xmm0
-; AVX512VL-NEXT: vpbroadcastq %xmm1, %xmm1
-; AVX512VL-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX512VL-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
-; AVX512VL-NEXT: vpsravq %xmm1, %xmm0, %xmm0
+; AVX512VL-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
+; AVX512VL-NEXT: vpsrad %xmm1, %xmm0, %xmm0
; AVX512VL-NEXT: retq
;
; X32-SSE-LABEL: splatvar_shift_v2i32:
; X32-SSE: # %bb.0:
-; X32-SSE-NEXT: psllq $32, %xmm0
-; X32-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,3,2,3]
-; X32-SSE-NEXT: psrad $31, %xmm0
-; X32-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
-; X32-SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
-; X32-SSE-NEXT: movdqa {{.*#+}} xmm0 = [4294967295,0,4294967295,0]
-; X32-SSE-NEXT: pand %xmm1, %xmm0
-; X32-SSE-NEXT: movdqa {{.*#+}} xmm3 = [0,2147483648,0,2147483648]
-; X32-SSE-NEXT: movdqa %xmm3, %xmm4
-; X32-SSE-NEXT: psrlq %xmm0, %xmm4
-; X32-SSE-NEXT: xorps %xmm5, %xmm5
-; X32-SSE-NEXT: movss {{.*#+}} xmm5 = xmm1[0],xmm5[1,2,3]
-; X32-SSE-NEXT: psrlq %xmm5, %xmm3
-; X32-SSE-NEXT: movsd {{.*#+}} xmm3 = xmm4[0],xmm3[1]
-; X32-SSE-NEXT: movdqa %xmm2, %xmm1
-; X32-SSE-NEXT: psrlq %xmm5, %xmm1
-; X32-SSE-NEXT: psrlq %xmm0, %xmm2
-; X32-SSE-NEXT: shufpd {{.*#+}} xmm2 = xmm2[0],xmm1[1]
-; X32-SSE-NEXT: xorpd %xmm3, %xmm2
-; X32-SSE-NEXT: psubq %xmm3, %xmm2
-; X32-SSE-NEXT: movdqa %xmm2, %xmm0
+; X32-SSE-NEXT: xorps %xmm2, %xmm2
+; X32-SSE-NEXT: movss {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3]
+; X32-SSE-NEXT: psrad %xmm2, %xmm0
; X32-SSE-NEXT: retl
%splat = shufflevector <2 x i32> %b, <2 x i32> undef, <2 x i32> zeroinitializer
  %shift = ashr <2 x i32> %a, %splat
  ret <2 x i32> %shift
}
define <4 x i16> @splatvar_shift_v4i16(<4 x i16> %a, <4 x i16> %b) nounwind {
; SSE2-LABEL: splatvar_shift_v4i16:
; SSE2: # %bb.0:
-; SSE2-NEXT: pslld $16, %xmm0
-; SSE2-NEXT: psrad $16, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,0,0,0]
-; SSE2-NEXT: pand {{.*}}(%rip), %xmm2
-; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm2[2,3,3,3,4,5,6,7]
-; SSE2-NEXT: movdqa %xmm0, %xmm3
-; SSE2-NEXT: psrad %xmm1, %xmm3
-; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm2[0,1,1,1,4,5,6,7]
-; SSE2-NEXT: movdqa %xmm0, %xmm1
-; SSE2-NEXT: psrad %xmm4, %xmm1
-; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0]
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
-; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm2[2,3,3,3,4,5,6,7]
-; SSE2-NEXT: movdqa %xmm0, %xmm4
-; SSE2-NEXT: psrad %xmm3, %xmm4
-; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,1,1,4,5,6,7]
-; SSE2-NEXT: psrad %xmm2, %xmm0
-; SSE2-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm4[1]
-; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm0[0,3]
-; SSE2-NEXT: movaps %xmm1, %xmm0
+; SSE2-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1]
+; SSE2-NEXT: psrldq {{.*#+}} xmm1 = xmm1[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; SSE2-NEXT: psraw %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: splatvar_shift_v4i16:
; SSE41: # %bb.0:
-; SSE41-NEXT: pslld $16, %xmm0
-; SSE41-NEXT: psrad $16, %xmm0
-; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
-; SSE41-NEXT: pxor %xmm2, %xmm2
-; SSE41-NEXT: movdqa %xmm1, %xmm3
-; SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0],xmm2[1],xmm3[2],xmm2[3],xmm3[4],xmm2[5],xmm3[6],xmm2[7]
-; SSE41-NEXT: pshuflw {{.*#+}} xmm4 = xmm3[2,3,3,3,4,5,6,7]
-; SSE41-NEXT: movdqa %xmm0, %xmm5
-; SSE41-NEXT: psrad %xmm4, %xmm5
-; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,3,0,1]
-; SSE41-NEXT: pshuflw {{.*#+}} xmm4 = xmm3[2,3,3,3,4,5,6,7]
-; SSE41-NEXT: movdqa %xmm0, %xmm6
-; SSE41-NEXT: psrad %xmm4, %xmm6
-; SSE41-NEXT: pblendw {{.*#+}} xmm6 = xmm5[0,1,2,3],xmm6[4,5,6,7]
-; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3],xmm1[4],xmm2[5],xmm1[6],xmm2[7]
-; SSE41-NEXT: movdqa %xmm0, %xmm2
-; SSE41-NEXT: psrad %xmm1, %xmm2
-; SSE41-NEXT: pshuflw {{.*#+}} xmm1 = xmm3[0,1,1,1,4,5,6,7]
-; SSE41-NEXT: psrad %xmm1, %xmm0
-; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm2[0,1,2,3],xmm0[4,5,6,7]
-; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm6[2,3],xmm0[4,5],xmm6[6,7]
+; SSE41-NEXT: pmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
+; SSE41-NEXT: psraw %xmm1, %xmm0
; SSE41-NEXT: retq
;
-; AVX1-LABEL: splatvar_shift_v4i16:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vpslld $16, %xmm0, %xmm0
-; AVX1-NEXT: vpsrad $16, %xmm0, %xmm0
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
-; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4],xmm2[5],xmm1[6],xmm2[7]
-; AVX1-NEXT: vpsrldq {{.*#+}} xmm3 = xmm1[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX1-NEXT: vpsrad %xmm3, %xmm0, %xmm3
-; AVX1-NEXT: vpsrlq $32, %xmm1, %xmm4
-; AVX1-NEXT: vpsrad %xmm4, %xmm0, %xmm4
-; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2,3],xmm3[4,5,6,7]
-; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm1[2],xmm2[2],xmm1[3],xmm2[3]
-; AVX1-NEXT: vpsrad %xmm2, %xmm0, %xmm2
-; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
-; AVX1-NEXT: vpsrad %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4,5,6,7]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm3[2,3],xmm0[4,5],xmm3[6,7]
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: splatvar_shift_v4i16:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpslld $16, %xmm0, %xmm0
-; AVX2-NEXT: vpsrad $16, %xmm0, %xmm0
-; AVX2-NEXT: vpbroadcastd %xmm1, %xmm1
-; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4],xmm2[5],xmm1[6],xmm2[7]
-; AVX2-NEXT: vpsravd %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: retq
-;
-; XOPAVX1-LABEL: splatvar_shift_v4i16:
-; XOPAVX1: # %bb.0:
-; XOPAVX1-NEXT: vpslld $16, %xmm0, %xmm0
-; XOPAVX1-NEXT: vpsrad $16, %xmm0, %xmm0
-; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
-; XOPAVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; XOPAVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4],xmm2[5],xmm1[6],xmm2[7]
-; XOPAVX1-NEXT: vpsubd %xmm1, %xmm2, %xmm1
-; XOPAVX1-NEXT: vpshad %xmm1, %xmm0, %xmm0
-; XOPAVX1-NEXT: retq
+; AVX-LABEL: splatvar_shift_v4i16:
+; AVX: # %bb.0:
+; AVX-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
+; AVX-NEXT: vpsraw %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retq
;
-; XOPAVX2-LABEL: splatvar_shift_v4i16:
-; XOPAVX2: # %bb.0:
-; XOPAVX2-NEXT: vpslld $16, %xmm0, %xmm0
-; XOPAVX2-NEXT: vpsrad $16, %xmm0, %xmm0
-; XOPAVX2-NEXT: vpbroadcastd %xmm1, %xmm1
-; XOPAVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; XOPAVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4],xmm2[5],xmm1[6],xmm2[7]
-; XOPAVX2-NEXT: vpsravd %xmm1, %xmm0, %xmm0
-; XOPAVX2-NEXT: retq
+; XOP-LABEL: splatvar_shift_v4i16:
+; XOP: # %bb.0:
+; XOP-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
+; XOP-NEXT: vpsraw %xmm1, %xmm0, %xmm0
+; XOP-NEXT: retq
;
; AVX512-LABEL: splatvar_shift_v4i16:
; AVX512: # %bb.0:
-; AVX512-NEXT: vpslld $16, %xmm0, %xmm0
-; AVX512-NEXT: vpsrad $16, %xmm0, %xmm0
-; AVX512-NEXT: vpbroadcastd %xmm1, %xmm1
-; AVX512-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX512-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4],xmm2[5],xmm1[6],xmm2[7]
-; AVX512-NEXT: vpsravd %xmm1, %xmm0, %xmm0
+; AVX512-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
+; AVX512-NEXT: vpsraw %xmm1, %xmm0, %xmm0
; AVX512-NEXT: retq
;
; AVX512VL-LABEL: splatvar_shift_v4i16:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpslld $16, %xmm0, %xmm0
-; AVX512VL-NEXT: vpsrad $16, %xmm0, %xmm0
-; AVX512VL-NEXT: vpbroadcastd %xmm1, %xmm1
-; AVX512VL-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX512VL-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4],xmm2[5],xmm1[6],xmm2[7]
-; AVX512VL-NEXT: vpsravd %xmm1, %xmm0, %xmm0
+; AVX512VL-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
+; AVX512VL-NEXT: vpsraw %xmm1, %xmm0, %xmm0
; AVX512VL-NEXT: retq
;
; X32-SSE-LABEL: splatvar_shift_v4i16:
; X32-SSE: # %bb.0:
-; X32-SSE-NEXT: pslld $16, %xmm0
-; X32-SSE-NEXT: psrad $16, %xmm0
-; X32-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,0,0,0]
-; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm2
-; X32-SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm2[2,3,3,3,4,5,6,7]
-; X32-SSE-NEXT: movdqa %xmm0, %xmm3
-; X32-SSE-NEXT: psrad %xmm1, %xmm3
-; X32-SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm2[0,1,1,1,4,5,6,7]
-; X32-SSE-NEXT: movdqa %xmm0, %xmm1
-; X32-SSE-NEXT: psrad %xmm4, %xmm1
-; X32-SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0]
-; X32-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
-; X32-SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm2[2,3,3,3,4,5,6,7]
-; X32-SSE-NEXT: movdqa %xmm0, %xmm4
-; X32-SSE-NEXT: psrad %xmm3, %xmm4
-; X32-SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,1,1,4,5,6,7]
-; X32-SSE-NEXT: psrad %xmm2, %xmm0
-; X32-SSE-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm4[1]
-; X32-SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm0[0,3]
-; X32-SSE-NEXT: movaps %xmm1, %xmm0
+; X32-SSE-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1]
+; X32-SSE-NEXT: psrldq {{.*#+}} xmm1 = xmm1[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; X32-SSE-NEXT: psraw %xmm1, %xmm0
; X32-SSE-NEXT: retl
%splat = shufflevector <4 x i16> %b, <4 x i16> undef, <4 x i32> zeroinitializer
  %shift = ashr <4 x i16> %a, %splat
  ret <4 x i16> %shift
}
define <2 x i16> @splatvar_shift_v2i16(<2 x i16> %a, <2 x i16> %b) nounwind {
; SSE2-LABEL: splatvar_shift_v2i16:
; SSE2: # %bb.0:
-; SSE2-NEXT: psllq $48, %xmm0
-; SSE2-NEXT: movdqa %xmm0, %xmm2
-; SSE2-NEXT: psrad $31, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3]
-; SSE2-NEXT: psrad $16, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
-; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
-; SSE2-NEXT: pand {{.*}}(%rip), %xmm1
-; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
-; SSE2-NEXT: movdqa %xmm2, %xmm3
-; SSE2-NEXT: psrlq %xmm1, %xmm3
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm1[2,3,0,1]
-; SSE2-NEXT: psrlq %xmm4, %xmm2
-; SSE2-NEXT: movsd {{.*#+}} xmm2 = xmm3[0],xmm2[1]
-; SSE2-NEXT: movdqa %xmm0, %xmm3
-; SSE2-NEXT: psrlq %xmm1, %xmm3
-; SSE2-NEXT: psrlq %xmm4, %xmm0
-; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm3[0],xmm0[1]
-; SSE2-NEXT: xorpd %xmm2, %xmm0
-; SSE2-NEXT: psubq %xmm2, %xmm0
+; SSE2-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1]
+; SSE2-NEXT: psrldq {{.*#+}} xmm1 = xmm1[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; SSE2-NEXT: psraw %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: splatvar_shift_v2i16:
; SSE41: # %bb.0:
-; SSE41-NEXT: psllq $48, %xmm0
-; SSE41-NEXT: movdqa %xmm0, %xmm2
-; SSE41-NEXT: psrad $31, %xmm2
-; SSE41-NEXT: psrad $16, %xmm0
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
-; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
-; SSE41-NEXT: pxor %xmm2, %xmm2
-; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3],xmm1[4],xmm2[5,6,7]
-; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm2[2,3,0,1]
-; SSE41-NEXT: movdqa %xmm0, %xmm3
-; SSE41-NEXT: psrlq %xmm1, %xmm3
-; SSE41-NEXT: psrlq %xmm2, %xmm0
-; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm3[4,5,6,7]
-; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [9223372036854775808,9223372036854775808]
-; SSE41-NEXT: movdqa %xmm3, %xmm4
-; SSE41-NEXT: psrlq %xmm2, %xmm4
-; SSE41-NEXT: psrlq %xmm1, %xmm3
-; SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm4[0,1,2,3],xmm3[4,5,6,7]
-; SSE41-NEXT: pxor %xmm3, %xmm0
-; SSE41-NEXT: psubq %xmm3, %xmm0
+; SSE41-NEXT: pmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
+; SSE41-NEXT: psraw %xmm1, %xmm0
; SSE41-NEXT: retq
;
-; AVX1-LABEL: splatvar_shift_v2i16:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vpsllq $48, %xmm0, %xmm0
-; AVX1-NEXT: vpsrad $31, %xmm0, %xmm2
-; AVX1-NEXT: vpsrad $16, %xmm0, %xmm0
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
-; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3],xmm1[4],xmm2[5,6,7]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[2,3,0,1]
-; AVX1-NEXT: vpsrlq %xmm2, %xmm0, %xmm3
-; AVX1-NEXT: vpsrlq %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm3[4,5,6,7]
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [9223372036854775808,9223372036854775808]
-; AVX1-NEXT: vpsrlq %xmm1, %xmm3, %xmm1
-; AVX1-NEXT: vpsrlq %xmm2, %xmm3, %xmm2
-; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4,5,6,7]
-; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpsubq %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: splatvar_shift_v2i16:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpsllq $48, %xmm0, %xmm0
-; AVX2-NEXT: vpsrad $31, %xmm0, %xmm2
-; AVX2-NEXT: vpsrad $16, %xmm0, %xmm0
-; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
-; AVX2-NEXT: vpbroadcastq %xmm1, %xmm1
-; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3],xmm1[4],xmm2[5,6,7]
-; AVX2-NEXT: vpsrlvq %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
-; AVX2-NEXT: vpsrlvq %xmm1, %xmm2, %xmm1
-; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpsubq %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: retq
-;
-; XOPAVX1-LABEL: splatvar_shift_v2i16:
-; XOPAVX1: # %bb.0:
-; XOPAVX1-NEXT: vpsllq $48, %xmm0, %xmm0
-; XOPAVX1-NEXT: vpshaq {{.*}}(%rip), %xmm0, %xmm0
-; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
-; XOPAVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; XOPAVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3],xmm1[4],xmm2[5,6,7]
-; XOPAVX1-NEXT: vpsubq %xmm1, %xmm2, %xmm1
-; XOPAVX1-NEXT: vpshaq %xmm1, %xmm0, %xmm0
-; XOPAVX1-NEXT: retq
+; AVX-LABEL: splatvar_shift_v2i16:
+; AVX: # %bb.0:
+; AVX-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
+; AVX-NEXT: vpsraw %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retq
;
-; XOPAVX2-LABEL: splatvar_shift_v2i16:
-; XOPAVX2: # %bb.0:
-; XOPAVX2-NEXT: vpsllq $48, %xmm0, %xmm0
-; XOPAVX2-NEXT: vpshaq {{.*}}(%rip), %xmm0, %xmm0
-; XOPAVX2-NEXT: vpbroadcastq %xmm1, %xmm1
-; XOPAVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; XOPAVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3],xmm1[4],xmm2[5,6,7]
-; XOPAVX2-NEXT: vpsubq %xmm1, %xmm2, %xmm1
-; XOPAVX2-NEXT: vpshaq %xmm1, %xmm0, %xmm0
-; XOPAVX2-NEXT: retq
+; XOP-LABEL: splatvar_shift_v2i16:
+; XOP: # %bb.0:
+; XOP-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
+; XOP-NEXT: vpsraw %xmm1, %xmm0, %xmm0
+; XOP-NEXT: retq
;
; AVX512-LABEL: splatvar_shift_v2i16:
; AVX512: # %bb.0:
-; AVX512-NEXT: vpsllq $48, %xmm0, %xmm0
-; AVX512-NEXT: vpsraq $48, %zmm0, %zmm0
-; AVX512-NEXT: vpbroadcastq %xmm1, %xmm1
-; AVX512-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX512-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3],xmm1[4],xmm2[5,6,7]
-; AVX512-NEXT: vpsravq %zmm1, %zmm0, %zmm0
-; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
-; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
+; AVX512-NEXT: vpsraw %xmm1, %xmm0, %xmm0
; AVX512-NEXT: retq
;
; AVX512VL-LABEL: splatvar_shift_v2i16:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpsllq $48, %xmm0, %xmm0
-; AVX512VL-NEXT: vpsraq $48, %xmm0, %xmm0
-; AVX512VL-NEXT: vpbroadcastq %xmm1, %xmm1
-; AVX512VL-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX512VL-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3],xmm1[4],xmm2[5,6,7]
-; AVX512VL-NEXT: vpsravq %xmm1, %xmm0, %xmm0
+; AVX512VL-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
+; AVX512VL-NEXT: vpsraw %xmm1, %xmm0, %xmm0
; AVX512VL-NEXT: retq
;
; X32-SSE-LABEL: splatvar_shift_v2i16:
; X32-SSE: # %bb.0:
-; X32-SSE-NEXT: psllq $48, %xmm0
-; X32-SSE-NEXT: movdqa %xmm0, %xmm2
-; X32-SSE-NEXT: psrad $31, %xmm2
-; X32-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3]
-; X32-SSE-NEXT: psrad $16, %xmm0
-; X32-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
-; X32-SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
-; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
-; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm1
-; X32-SSE-NEXT: movdqa {{.*#+}} xmm2 = [0,2147483648,0,2147483648]
-; X32-SSE-NEXT: movdqa %xmm2, %xmm3
-; X32-SSE-NEXT: psrlq %xmm1, %xmm3
-; X32-SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm1[2,3,0,1]
-; X32-SSE-NEXT: psrlq %xmm4, %xmm2
-; X32-SSE-NEXT: movsd {{.*#+}} xmm2 = xmm3[0],xmm2[1]
-; X32-SSE-NEXT: movdqa %xmm0, %xmm3
-; X32-SSE-NEXT: psrlq %xmm1, %xmm3
-; X32-SSE-NEXT: psrlq %xmm4, %xmm0
-; X32-SSE-NEXT: movsd {{.*#+}} xmm0 = xmm3[0],xmm0[1]
-; X32-SSE-NEXT: xorpd %xmm2, %xmm0
-; X32-SSE-NEXT: psubq %xmm2, %xmm0
+; X32-SSE-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1]
+; X32-SSE-NEXT: psrldq {{.*#+}} xmm1 = xmm1[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; X32-SSE-NEXT: psraw %xmm1, %xmm0
; X32-SSE-NEXT: retl
%splat = shufflevector <2 x i16> %b, <2 x i16> undef, <2 x i32> zeroinitializer
%shift = ashr <2 x i16> %a, %splat
ret <2 x i16> %shift
}
-define <8 x i8> @splatvar_shift_v8i8(<8 x i8> %a, <8 x i8> %b) nounwind {
-; SSE2-LABEL: splatvar_shift_v8i8:
-; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa %xmm0, %xmm2
-; SSE2-NEXT: psllw $8, %xmm2
-; SSE2-NEXT: movdqa %xmm2, %xmm3
-; SSE2-NEXT: psraw $8, %xmm3
-; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[0,0,2,3,4,5,6,7]
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,0,0]
-; SSE2-NEXT: psllw $12, %xmm1
-; SSE2-NEXT: movdqa %xmm1, %xmm0
-; SSE2-NEXT: psraw $15, %xmm0
-; SSE2-NEXT: psraw $15, %xmm2
-; SSE2-NEXT: pand %xmm0, %xmm2
-; SSE2-NEXT: pandn %xmm3, %xmm0
-; SSE2-NEXT: por %xmm2, %xmm0
-; SSE2-NEXT: paddw %xmm1, %xmm1
-; SSE2-NEXT: movdqa %xmm1, %xmm2
-; SSE2-NEXT: psraw $15, %xmm2
-; SSE2-NEXT: movdqa %xmm2, %xmm3
-; SSE2-NEXT: pandn %xmm0, %xmm3
-; SSE2-NEXT: psraw $4, %xmm0
-; SSE2-NEXT: pand %xmm2, %xmm0
-; SSE2-NEXT: por %xmm3, %xmm0
-; SSE2-NEXT: paddw %xmm1, %xmm1
-; SSE2-NEXT: movdqa %xmm1, %xmm2
-; SSE2-NEXT: psraw $15, %xmm2
-; SSE2-NEXT: movdqa %xmm2, %xmm3
-; SSE2-NEXT: pandn %xmm0, %xmm3
-; SSE2-NEXT: psraw $2, %xmm0
+define <8 x i8> @splatvar_shift_v8i8(<8 x i8> %a, <8 x i8> %b) nounwind {
+; SSE2-LABEL: splatvar_shift_v8i8:
+; SSE2: # %bb.0:
+; SSE2-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0]
+; SSE2-NEXT: psrldq {{.*#+}} xmm1 = xmm1[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; SSE2-NEXT: psrlw %xmm1, %xmm0
+; SSE2-NEXT: pcmpeqd %xmm2, %xmm2
+; SSE2-NEXT: psrlw %xmm1, %xmm2
+; SSE2-NEXT: psrlw $8, %xmm2
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,0,2,3,4,5,6,7]
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0]
; SSE2-NEXT: pand %xmm2, %xmm0
-; SSE2-NEXT: por %xmm3, %xmm0
-; SSE2-NEXT: paddw %xmm1, %xmm1
-; SSE2-NEXT: psraw $15, %xmm1
-; SSE2-NEXT: movdqa %xmm1, %xmm2
-; SSE2-NEXT: pandn %xmm0, %xmm2
-; SSE2-NEXT: psraw $1, %xmm0
-; SSE2-NEXT: pand %xmm1, %xmm0
-; SSE2-NEXT: por %xmm2, %xmm0
+; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [32896,32896,32896,32896,32896,32896,32896,32896]
+; SSE2-NEXT: psrlw %xmm1, %xmm2
+; SSE2-NEXT: pxor %xmm2, %xmm0
+; SSE2-NEXT: psubb %xmm2, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: splatvar_shift_v8i8:
; SSE41: # %bb.0:
-; SSE41-NEXT: movdqa %xmm1, %xmm2
-; SSE41-NEXT: movdqa %xmm0, %xmm3
-; SSE41-NEXT: psllw $8, %xmm3
-; SSE41-NEXT: movdqa %xmm3, %xmm1
-; SSE41-NEXT: psraw $8, %xmm1
-; SSE41-NEXT: pshufb {{.*#+}} xmm2 = xmm2[0],zero,xmm2[0],zero,xmm2[0],zero,xmm2[0],zero,xmm2[0],zero,xmm2[0],zero,xmm2[0],zero,xmm2[0],zero
-; SSE41-NEXT: movdqa %xmm2, %xmm0
-; SSE41-NEXT: psllw $12, %xmm0
-; SSE41-NEXT: psllw $4, %xmm2
-; SSE41-NEXT: por %xmm0, %xmm2
-; SSE41-NEXT: movdqa %xmm2, %xmm4
-; SSE41-NEXT: paddw %xmm2, %xmm4
-; SSE41-NEXT: psraw $15, %xmm3
-; SSE41-NEXT: movdqa %xmm2, %xmm0
-; SSE41-NEXT: pblendvb %xmm0, %xmm3, %xmm1
-; SSE41-NEXT: movdqa %xmm1, %xmm2
-; SSE41-NEXT: psraw $4, %xmm2
-; SSE41-NEXT: movdqa %xmm4, %xmm0
-; SSE41-NEXT: pblendvb %xmm0, %xmm2, %xmm1
-; SSE41-NEXT: movdqa %xmm1, %xmm2
-; SSE41-NEXT: psraw $2, %xmm2
-; SSE41-NEXT: paddw %xmm4, %xmm4
-; SSE41-NEXT: movdqa %xmm4, %xmm0
-; SSE41-NEXT: pblendvb %xmm0, %xmm2, %xmm1
-; SSE41-NEXT: movdqa %xmm1, %xmm2
-; SSE41-NEXT: psraw $1, %xmm2
-; SSE41-NEXT: paddw %xmm4, %xmm4
-; SSE41-NEXT: movdqa %xmm4, %xmm0
-; SSE41-NEXT: pblendvb %xmm0, %xmm2, %xmm1
-; SSE41-NEXT: movdqa %xmm1, %xmm0
+; SSE41-NEXT: pmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
+; SSE41-NEXT: psrlw %xmm1, %xmm0
+; SSE41-NEXT: pcmpeqd %xmm2, %xmm2
+; SSE41-NEXT: psrlw %xmm1, %xmm2
+; SSE41-NEXT: pshufb {{.*#+}} xmm2 = xmm2[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; SSE41-NEXT: pand %xmm2, %xmm0
+; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [32896,32896,32896,32896,32896,32896,32896,32896]
+; SSE41-NEXT: psrlw %xmm1, %xmm2
+; SSE41-NEXT: pxor %xmm2, %xmm0
+; SSE41-NEXT: psubb %xmm2, %xmm0
; SSE41-NEXT: retq
;
; AVX1-LABEL: splatvar_shift_v8i8:
; AVX1: # %bb.0:
-; AVX1-NEXT: vpsllw $8, %xmm0, %xmm0
-; AVX1-NEXT: vpsraw $8, %xmm0, %xmm2
-; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero
-; AVX1-NEXT: vpsllw $12, %xmm1, %xmm3
-; AVX1-NEXT: vpsllw $4, %xmm1, %xmm1
-; AVX1-NEXT: vpor %xmm3, %xmm1, %xmm1
-; AVX1-NEXT: vpaddw %xmm1, %xmm1, %xmm3
-; AVX1-NEXT: vpsraw $15, %xmm0, %xmm0
-; AVX1-NEXT: vpblendvb %xmm1, %xmm0, %xmm2, %xmm0
-; AVX1-NEXT: vpsraw $4, %xmm0, %xmm1
-; AVX1-NEXT: vpblendvb %xmm3, %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpsraw $2, %xmm0, %xmm1
-; AVX1-NEXT: vpaddw %xmm3, %xmm3, %xmm2
-; AVX1-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpsraw $1, %xmm0, %xmm1
-; AVX1-NEXT: vpaddw %xmm2, %xmm2, %xmm2
-; AVX1-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
+; AVX1-NEXT: vpsrlw %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
+; AVX1-NEXT: vpsrlw %xmm1, %xmm2, %xmm2
+; AVX1-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [32896,32896,32896,32896,32896,32896,32896,32896]
+; AVX1-NEXT: vpsrlw %xmm1, %xmm2, %xmm1
+; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpsubb %xmm1, %xmm0, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: splatvar_shift_v8i8:
; AVX2: # %bb.0:
-; AVX2-NEXT: vpsllw $8, %xmm0, %xmm0
-; AVX2-NEXT: vpsraw $8, %xmm0, %xmm0
-; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero
-; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
-; AVX2-NEXT: vpmovsxwd %xmm0, %ymm0
-; AVX2-NEXT: vpsravd %ymm1, %ymm0, %ymm0
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vpackssdw %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
+; AVX2-NEXT: vpsrlw %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
+; AVX2-NEXT: vpsrlw %xmm1, %xmm2, %xmm2
+; AVX2-NEXT: vpsrlw $8, %xmm2, %xmm2
+; AVX2-NEXT: vpbroadcastb %xmm2, %xmm2
+; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0
+; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [32896,32896,32896,32896,32896,32896,32896,32896]
+; AVX2-NEXT: vpsrlw %xmm1, %xmm2, %xmm1
+; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpsubb %xmm1, %xmm0, %xmm0
; AVX2-NEXT: retq
;
-; XOP-LABEL: splatvar_shift_v8i8:
-; XOP: # %bb.0:
-; XOP-NEXT: vpsllw $8, %xmm0, %xmm0
-; XOP-NEXT: vpsraw $8, %xmm0, %xmm0
-; XOP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero
-; XOP-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; XOP-NEXT: vpsubw %xmm1, %xmm2, %xmm1
-; XOP-NEXT: vpshaw %xmm1, %xmm0, %xmm0
-; XOP-NEXT: retq
+; XOPAVX1-LABEL: splatvar_shift_v8i8:
+; XOPAVX1: # %bb.0:
+; XOPAVX1-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; XOPAVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7]
+; XOPAVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; XOPAVX1-NEXT: vpsubb %xmm1, %xmm2, %xmm1
+; XOPAVX1-NEXT: vpshab %xmm1, %xmm0, %xmm0
+; XOPAVX1-NEXT: retq
+;
+; XOPAVX2-LABEL: splatvar_shift_v8i8:
+; XOPAVX2: # %bb.0:
+; XOPAVX2-NEXT: vpbroadcastb %xmm1, %xmm1
+; XOPAVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; XOPAVX2-NEXT: vpsubb %xmm1, %xmm2, %xmm1
+; XOPAVX2-NEXT: vpshab %xmm1, %xmm0, %xmm0
+; XOPAVX2-NEXT: retq
;
; AVX512DQ-LABEL: splatvar_shift_v8i8:
; AVX512DQ: # %bb.0:
-; AVX512DQ-NEXT: vpsllw $8, %xmm0, %xmm0
-; AVX512DQ-NEXT: vpsraw $8, %xmm0, %xmm0
-; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero
-; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
-; AVX512DQ-NEXT: vpmovsxwd %xmm0, %ymm0
-; AVX512DQ-NEXT: vpsravd %ymm1, %ymm0, %ymm0
-; AVX512DQ-NEXT: vpmovdw %zmm0, %ymm0
-; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX512DQ-NEXT: vpbroadcastb %xmm1, %xmm1
+; AVX512DQ-NEXT: vpmovsxbd %xmm0, %zmm0
+; AVX512DQ-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
+; AVX512DQ-NEXT: vpsravd %zmm1, %zmm0, %zmm0
+; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
; AVX512BW-LABEL: splatvar_shift_v8i8:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vpsllw $8, %xmm0, %xmm0
-; AVX512BW-NEXT: vpsraw $8, %xmm0, %xmm0
-; AVX512BW-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero
+; AVX512BW-NEXT: vpbroadcastb %xmm1, %xmm1
+; AVX512BW-NEXT: vpmovsxbw %xmm0, %ymm0
+; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
; AVX512BW-NEXT: vpsravw %zmm1, %zmm0, %zmm0
-; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
+; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
+; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512DQVL-LABEL: splatvar_shift_v8i8:
; AVX512DQVL: # %bb.0:
-; AVX512DQVL-NEXT: vpsllw $8, %xmm0, %xmm0
-; AVX512DQVL-NEXT: vpsraw $8, %xmm0, %xmm0
-; AVX512DQVL-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero
-; AVX512DQVL-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
-; AVX512DQVL-NEXT: vpmovsxwd %xmm0, %ymm0
-; AVX512DQVL-NEXT: vpsravd %ymm1, %ymm0, %ymm0
-; AVX512DQVL-NEXT: vpmovdw %ymm0, %xmm0
+; AVX512DQVL-NEXT: vpbroadcastb %xmm1, %xmm1
+; AVX512DQVL-NEXT: vpmovsxbd %xmm0, %zmm0
+; AVX512DQVL-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
+; AVX512DQVL-NEXT: vpsravd %zmm1, %zmm0, %zmm0
+; AVX512DQVL-NEXT: vpmovdb %zmm0, %xmm0
; AVX512DQVL-NEXT: vzeroupper
; AVX512DQVL-NEXT: retq
;
; AVX512BWVL-LABEL: splatvar_shift_v8i8:
; AVX512BWVL: # %bb.0:
-; AVX512BWVL-NEXT: vpsllw $8, %xmm0, %xmm0
-; AVX512BWVL-NEXT: vpsraw $8, %xmm0, %xmm0
-; AVX512BWVL-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero
-; AVX512BWVL-NEXT: vpsravw %xmm1, %xmm0, %xmm0
+; AVX512BWVL-NEXT: vpbroadcastb %xmm1, %xmm1
+; AVX512BWVL-NEXT: vpmovsxbw %xmm0, %ymm0
+; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
+; AVX512BWVL-NEXT: vpsravw %ymm1, %ymm0, %ymm0
+; AVX512BWVL-NEXT: vpmovwb %ymm0, %xmm0
+; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
;
; X32-SSE-LABEL: splatvar_shift_v8i8:
; X32-SSE: # %bb.0:
-; X32-SSE-NEXT: movdqa %xmm0, %xmm2
-; X32-SSE-NEXT: psllw $8, %xmm2
-; X32-SSE-NEXT: movdqa %xmm2, %xmm3
-; X32-SSE-NEXT: psraw $8, %xmm3
-; X32-SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[0,0,2,3,4,5,6,7]
-; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,0,0]
-; X32-SSE-NEXT: psllw $12, %xmm1
-; X32-SSE-NEXT: movdqa %xmm1, %xmm0
-; X32-SSE-NEXT: psraw $15, %xmm0
-; X32-SSE-NEXT: psraw $15, %xmm2
-; X32-SSE-NEXT: pand %xmm0, %xmm2
-; X32-SSE-NEXT: pandn %xmm3, %xmm0
-; X32-SSE-NEXT: por %xmm2, %xmm0
-; X32-SSE-NEXT: paddw %xmm1, %xmm1
-; X32-SSE-NEXT: movdqa %xmm1, %xmm2
-; X32-SSE-NEXT: psraw $15, %xmm2
-; X32-SSE-NEXT: movdqa %xmm2, %xmm3
-; X32-SSE-NEXT: pandn %xmm0, %xmm3
-; X32-SSE-NEXT: psraw $4, %xmm0
-; X32-SSE-NEXT: pand %xmm2, %xmm0
-; X32-SSE-NEXT: por %xmm3, %xmm0
-; X32-SSE-NEXT: paddw %xmm1, %xmm1
-; X32-SSE-NEXT: movdqa %xmm1, %xmm2
-; X32-SSE-NEXT: psraw $15, %xmm2
-; X32-SSE-NEXT: movdqa %xmm2, %xmm3
-; X32-SSE-NEXT: pandn %xmm0, %xmm3
-; X32-SSE-NEXT: psraw $2, %xmm0
+; X32-SSE-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0]
+; X32-SSE-NEXT: psrldq {{.*#+}} xmm1 = xmm1[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; X32-SSE-NEXT: psrlw %xmm1, %xmm0
+; X32-SSE-NEXT: pcmpeqd %xmm2, %xmm2
+; X32-SSE-NEXT: psrlw %xmm1, %xmm2
+; X32-SSE-NEXT: psrlw $8, %xmm2
+; X32-SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; X32-SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,0,2,3,4,5,6,7]
+; X32-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0]
; X32-SSE-NEXT: pand %xmm2, %xmm0
-; X32-SSE-NEXT: por %xmm3, %xmm0
-; X32-SSE-NEXT: paddw %xmm1, %xmm1
-; X32-SSE-NEXT: psraw $15, %xmm1
-; X32-SSE-NEXT: movdqa %xmm1, %xmm2
-; X32-SSE-NEXT: pandn %xmm0, %xmm2
-; X32-SSE-NEXT: psraw $1, %xmm0
-; X32-SSE-NEXT: pand %xmm1, %xmm0
-; X32-SSE-NEXT: por %xmm2, %xmm0
+; X32-SSE-NEXT: movdqa {{.*#+}} xmm2 = [32896,32896,32896,32896,32896,32896,32896,32896]
+; X32-SSE-NEXT: psrlw %xmm1, %xmm2
+; X32-SSE-NEXT: pxor %xmm2, %xmm0
+; X32-SSE-NEXT: psubb %xmm2, %xmm0
; X32-SSE-NEXT: retl
%splat = shufflevector <8 x i8> %b, <8 x i8> undef, <8 x i32> zeroinitializer
  %shift = ashr <8 x i8> %a, %splat
  ret <8 x i8> %shift
}
define <4 x i8> @splatvar_shift_v4i8(<4 x i8> %a, <4 x i8> %b) nounwind {
; SSE2-LABEL: splatvar_shift_v4i8:
; SSE2: # %bb.0:
-; SSE2-NEXT: pslld $24, %xmm0
-; SSE2-NEXT: psrad $24, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,0,0,0]
-; SSE2-NEXT: pand {{.*}}(%rip), %xmm2
-; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm2[2,3,3,3,4,5,6,7]
-; SSE2-NEXT: movdqa %xmm0, %xmm3
-; SSE2-NEXT: psrad %xmm1, %xmm3
-; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm2[0,1,1,1,4,5,6,7]
-; SSE2-NEXT: movdqa %xmm0, %xmm1
-; SSE2-NEXT: psrad %xmm4, %xmm1
-; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0]
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
-; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm2[2,3,3,3,4,5,6,7]
-; SSE2-NEXT: movdqa %xmm0, %xmm4
-; SSE2-NEXT: psrad %xmm3, %xmm4
-; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,1,1,4,5,6,7]
-; SSE2-NEXT: psrad %xmm2, %xmm0
-; SSE2-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm4[1]
-; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm0[0,3]
-; SSE2-NEXT: movaps %xmm1, %xmm0
+; SSE2-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0]
+; SSE2-NEXT: psrldq {{.*#+}} xmm1 = xmm1[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; SSE2-NEXT: psrlw %xmm1, %xmm0
+; SSE2-NEXT: pcmpeqd %xmm2, %xmm2
+; SSE2-NEXT: psrlw %xmm1, %xmm2
+; SSE2-NEXT: psrlw $8, %xmm2
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,0,2,3,4,5,6,7]
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0]
+; SSE2-NEXT: pand %xmm2, %xmm0
+; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [32896,32896,32896,32896,32896,32896,32896,32896]
+; SSE2-NEXT: psrlw %xmm1, %xmm2
+; SSE2-NEXT: pxor %xmm2, %xmm0
+; SSE2-NEXT: psubb %xmm2, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: splatvar_shift_v4i8:
; SSE41: # %bb.0:
-; SSE41-NEXT: pslld $24, %xmm0
-; SSE41-NEXT: psrad $24, %xmm0
-; SSE41-NEXT: pshufb {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[0],zero,zero,zero,xmm1[0],zero,zero,zero,xmm1[0],zero,zero,zero
-; SSE41-NEXT: pshuflw {{.*#+}} xmm2 = xmm1[2,3,3,3,4,5,6,7]
-; SSE41-NEXT: movdqa %xmm0, %xmm3
-; SSE41-NEXT: psrad %xmm2, %xmm3
-; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1]
-; SSE41-NEXT: pshuflw {{.*#+}} xmm4 = xmm2[2,3,3,3,4,5,6,7]
-; SSE41-NEXT: movdqa %xmm0, %xmm5
-; SSE41-NEXT: psrad %xmm4, %xmm5
-; SSE41-NEXT: pblendw {{.*#+}} xmm5 = xmm3[0,1,2,3],xmm5[4,5,6,7]
-; SSE41-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,1,1,4,5,6,7]
-; SSE41-NEXT: movdqa %xmm0, %xmm3
-; SSE41-NEXT: psrad %xmm1, %xmm3
-; SSE41-NEXT: pshuflw {{.*#+}} xmm1 = xmm2[0,1,1,1,4,5,6,7]
-; SSE41-NEXT: psrad %xmm1, %xmm0
-; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm3[0,1,2,3],xmm0[4,5,6,7]
-; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm5[2,3],xmm0[4,5],xmm5[6,7]
+; SSE41-NEXT: pmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
+; SSE41-NEXT: psrlw %xmm1, %xmm0
+; SSE41-NEXT: pcmpeqd %xmm2, %xmm2
+; SSE41-NEXT: psrlw %xmm1, %xmm2
+; SSE41-NEXT: pshufb {{.*#+}} xmm2 = xmm2[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; SSE41-NEXT: pand %xmm2, %xmm0
+; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [32896,32896,32896,32896,32896,32896,32896,32896]
+; SSE41-NEXT: psrlw %xmm1, %xmm2
+; SSE41-NEXT: pxor %xmm2, %xmm0
+; SSE41-NEXT: psubb %xmm2, %xmm0
; SSE41-NEXT: retq
;
; AVX1-LABEL: splatvar_shift_v4i8:
; AVX1: # %bb.0:
-; AVX1-NEXT: vpslld $24, %xmm0, %xmm0
-; AVX1-NEXT: vpsrad $24, %xmm0, %xmm0
-; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[0],zero,zero,zero,xmm1[0],zero,zero,zero,xmm1[0],zero,zero,zero
-; AVX1-NEXT: vpsrldq {{.*#+}} xmm2 = xmm1[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX1-NEXT: vpsrad %xmm2, %xmm0, %xmm2
-; AVX1-NEXT: vpsrlq $32, %xmm1, %xmm3
-; AVX1-NEXT: vpsrad %xmm3, %xmm0, %xmm3
-; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7]
-; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm1[2],xmm3[2],xmm1[3],xmm3[3]
-; AVX1-NEXT: vpsrad %xmm3, %xmm0, %xmm3
-; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
-; AVX1-NEXT: vpsrad %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm3[4,5,6,7]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
+; AVX1-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
+; AVX1-NEXT: vpsrlw %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
+; AVX1-NEXT: vpsrlw %xmm1, %xmm2, %xmm2
+; AVX1-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [32896,32896,32896,32896,32896,32896,32896,32896]
+; AVX1-NEXT: vpsrlw %xmm1, %xmm2, %xmm1
+; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpsubb %xmm1, %xmm0, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: splatvar_shift_v4i8:
; AVX2: # %bb.0:
-; AVX2-NEXT: vpslld $24, %xmm0, %xmm0
-; AVX2-NEXT: vpsrad $24, %xmm0, %xmm0
-; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[0],zero,zero,zero,xmm1[0],zero,zero,zero,xmm1[0],zero,zero,zero
-; AVX2-NEXT: vpsravd %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
+; AVX2-NEXT: vpsrlw %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
+; AVX2-NEXT: vpsrlw %xmm1, %xmm2, %xmm2
+; AVX2-NEXT: vpsrlw $8, %xmm2, %xmm2
+; AVX2-NEXT: vpbroadcastb %xmm2, %xmm2
+; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0
+; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [32896,32896,32896,32896,32896,32896,32896,32896]
+; AVX2-NEXT: vpsrlw %xmm1, %xmm2, %xmm1
+; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpsubb %xmm1, %xmm0, %xmm0
; AVX2-NEXT: retq
;
; XOPAVX1-LABEL: splatvar_shift_v4i8:
; XOPAVX1: # %bb.0:
-; XOPAVX1-NEXT: vpslld $24, %xmm0, %xmm0
-; XOPAVX1-NEXT: vpsrad $24, %xmm0, %xmm0
-; XOPAVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[0],zero,zero,zero,xmm1[0],zero,zero,zero,xmm1[0],zero,zero,zero
+; XOPAVX1-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; XOPAVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,2,3,4,5,6,7]
; XOPAVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; XOPAVX1-NEXT: vpsubd %xmm1, %xmm2, %xmm1
-; XOPAVX1-NEXT: vpshad %xmm1, %xmm0, %xmm0
+; XOPAVX1-NEXT: vpsubb %xmm1, %xmm2, %xmm1
+; XOPAVX1-NEXT: vpshab %xmm1, %xmm0, %xmm0
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: splatvar_shift_v4i8:
; XOPAVX2: # %bb.0:
-; XOPAVX2-NEXT: vpslld $24, %xmm0, %xmm0
-; XOPAVX2-NEXT: vpsrad $24, %xmm0, %xmm0
-; XOPAVX2-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[0],zero,zero,zero,xmm1[0],zero,zero,zero,xmm1[0],zero,zero,zero
-; XOPAVX2-NEXT: vpsravd %xmm1, %xmm0, %xmm0
+; XOPAVX2-NEXT: vpbroadcastb %xmm1, %xmm1
+; XOPAVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; XOPAVX2-NEXT: vpsubb %xmm1, %xmm2, %xmm1
+; XOPAVX2-NEXT: vpshab %xmm1, %xmm0, %xmm0
; XOPAVX2-NEXT: retq
;
-; AVX512-LABEL: splatvar_shift_v4i8:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vpslld $24, %xmm0, %xmm0
-; AVX512-NEXT: vpsrad $24, %xmm0, %xmm0
-; AVX512-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[0],zero,zero,zero,xmm1[0],zero,zero,zero,xmm1[0],zero,zero,zero
-; AVX512-NEXT: vpsravd %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: retq
+; AVX512DQ-LABEL: splatvar_shift_v4i8:
+; AVX512DQ: # %bb.0:
+; AVX512DQ-NEXT: vpbroadcastb %xmm1, %xmm1
+; AVX512DQ-NEXT: vpmovsxbd %xmm0, %zmm0
+; AVX512DQ-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
+; AVX512DQ-NEXT: vpsravd %zmm1, %zmm0, %zmm0
+; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512DQ-NEXT: vzeroupper
+; AVX512DQ-NEXT: retq
;
-; AVX512VL-LABEL: splatvar_shift_v4i8:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpslld $24, %xmm0, %xmm0
-; AVX512VL-NEXT: vpsrad $24, %xmm0, %xmm0
-; AVX512VL-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[0],zero,zero,zero,xmm1[0],zero,zero,zero,xmm1[0],zero,zero,zero
-; AVX512VL-NEXT: vpsravd %xmm1, %xmm0, %xmm0
-; AVX512VL-NEXT: retq
+; AVX512BW-LABEL: splatvar_shift_v4i8:
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: vpbroadcastb %xmm1, %xmm1
+; AVX512BW-NEXT: vpmovsxbw %xmm0, %ymm0
+; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
+; AVX512BW-NEXT: vpsravw %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
+; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX512BW-NEXT: vzeroupper
+; AVX512BW-NEXT: retq
+;
+; AVX512DQVL-LABEL: splatvar_shift_v4i8:
+; AVX512DQVL: # %bb.0:
+; AVX512DQVL-NEXT: vpbroadcastb %xmm1, %xmm1
+; AVX512DQVL-NEXT: vpmovsxbd %xmm0, %zmm0
+; AVX512DQVL-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
+; AVX512DQVL-NEXT: vpsravd %zmm1, %zmm0, %zmm0
+; AVX512DQVL-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512DQVL-NEXT: vzeroupper
+; AVX512DQVL-NEXT: retq
+;
+; AVX512BWVL-LABEL: splatvar_shift_v4i8:
+; AVX512BWVL: # %bb.0:
+; AVX512BWVL-NEXT: vpbroadcastb %xmm1, %xmm1
+; AVX512BWVL-NEXT: vpmovsxbw %xmm0, %ymm0
+; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
+; AVX512BWVL-NEXT: vpsravw %ymm1, %ymm0, %ymm0
+; AVX512BWVL-NEXT: vpmovwb %ymm0, %xmm0
+; AVX512BWVL-NEXT: vzeroupper
+; AVX512BWVL-NEXT: retq
;
; X32-SSE-LABEL: splatvar_shift_v4i8:
; X32-SSE: # %bb.0:
-; X32-SSE-NEXT: pslld $24, %xmm0
-; X32-SSE-NEXT: psrad $24, %xmm0
-; X32-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,0,0,0]
-; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm2
-; X32-SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm2[2,3,3,3,4,5,6,7]
-; X32-SSE-NEXT: movdqa %xmm0, %xmm3
-; X32-SSE-NEXT: psrad %xmm1, %xmm3
-; X32-SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm2[0,1,1,1,4,5,6,7]
-; X32-SSE-NEXT: movdqa %xmm0, %xmm1
-; X32-SSE-NEXT: psrad %xmm4, %xmm1
-; X32-SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0]
-; X32-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
-; X32-SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm2[2,3,3,3,4,5,6,7]
-; X32-SSE-NEXT: movdqa %xmm0, %xmm4
-; X32-SSE-NEXT: psrad %xmm3, %xmm4
-; X32-SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,1,1,4,5,6,7]
-; X32-SSE-NEXT: psrad %xmm2, %xmm0
-; X32-SSE-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm4[1]
-; X32-SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm0[0,3]
-; X32-SSE-NEXT: movaps %xmm1, %xmm0
+; X32-SSE-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0]
+; X32-SSE-NEXT: psrldq {{.*#+}} xmm1 = xmm1[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; X32-SSE-NEXT: psrlw %xmm1, %xmm0
+; X32-SSE-NEXT: pcmpeqd %xmm2, %xmm2
+; X32-SSE-NEXT: psrlw %xmm1, %xmm2
+; X32-SSE-NEXT: psrlw $8, %xmm2
+; X32-SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; X32-SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,0,2,3,4,5,6,7]
+; X32-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0]
+; X32-SSE-NEXT: pand %xmm2, %xmm0
+; X32-SSE-NEXT: movdqa {{.*#+}} xmm2 = [32896,32896,32896,32896,32896,32896,32896,32896]
+; X32-SSE-NEXT: psrlw %xmm1, %xmm2
+; X32-SSE-NEXT: pxor %xmm2, %xmm0
+; X32-SSE-NEXT: psubb %xmm2, %xmm0
; X32-SSE-NEXT: retl
%splat = shufflevector <4 x i8> %b, <4 x i8> undef, <4 x i32> zeroinitializer
  %shift = ashr <4 x i8> %a, %splat
  ret <4 x i8> %shift
}
define <2 x i8> @splatvar_shift_v2i8(<2 x i8> %a, <2 x i8> %b) nounwind {
; SSE2-LABEL: splatvar_shift_v2i8:
; SSE2: # %bb.0:
-; SSE2-NEXT: psllq $56, %xmm0
-; SSE2-NEXT: movdqa %xmm0, %xmm2
-; SSE2-NEXT: psrad $31, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3]
-; SSE2-NEXT: psrad $24, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
-; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
-; SSE2-NEXT: pand {{.*}}(%rip), %xmm1
-; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
-; SSE2-NEXT: movdqa %xmm2, %xmm3
-; SSE2-NEXT: psrlq %xmm1, %xmm3
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm1[2,3,0,1]
-; SSE2-NEXT: psrlq %xmm4, %xmm2
-; SSE2-NEXT: movsd {{.*#+}} xmm2 = xmm3[0],xmm2[1]
-; SSE2-NEXT: movdqa %xmm0, %xmm3
-; SSE2-NEXT: psrlq %xmm1, %xmm3
-; SSE2-NEXT: psrlq %xmm4, %xmm0
-; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm3[0],xmm0[1]
-; SSE2-NEXT: xorpd %xmm2, %xmm0
-; SSE2-NEXT: psubq %xmm2, %xmm0
+; SSE2-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0]
+; SSE2-NEXT: psrldq {{.*#+}} xmm1 = xmm1[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; SSE2-NEXT: psrlw %xmm1, %xmm0
+; SSE2-NEXT: pcmpeqd %xmm2, %xmm2
+; SSE2-NEXT: psrlw %xmm1, %xmm2
+; SSE2-NEXT: psrlw $8, %xmm2
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,0,2,3,4,5,6,7]
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0]
+; SSE2-NEXT: pand %xmm2, %xmm0
+; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [32896,32896,32896,32896,32896,32896,32896,32896]
+; SSE2-NEXT: psrlw %xmm1, %xmm2
+; SSE2-NEXT: pxor %xmm2, %xmm0
+; SSE2-NEXT: psubb %xmm2, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: splatvar_shift_v2i8:
; SSE41: # %bb.0:
-; SSE41-NEXT: psllq $56, %xmm0
-; SSE41-NEXT: movdqa %xmm0, %xmm2
-; SSE41-NEXT: psrad $31, %xmm2
-; SSE41-NEXT: psrad $24, %xmm0
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
-; SSE41-NEXT: pshufb {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[0],zero,zero,zero,zero,zero,zero,zero
-; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
-; SSE41-NEXT: movdqa %xmm2, %xmm3
-; SSE41-NEXT: psrlq %xmm1, %xmm3
-; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm1[2,3,0,1]
-; SSE41-NEXT: psrlq %xmm4, %xmm2
-; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7]
-; SSE41-NEXT: movdqa %xmm0, %xmm3
-; SSE41-NEXT: psrlq %xmm4, %xmm3
-; SSE41-NEXT: psrlq %xmm1, %xmm0
-; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm3[4,5,6,7]
+; SSE41-NEXT: pmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
+; SSE41-NEXT: psrlw %xmm1, %xmm0
+; SSE41-NEXT: pcmpeqd %xmm2, %xmm2
+; SSE41-NEXT: psrlw %xmm1, %xmm2
+; SSE41-NEXT: pshufb {{.*#+}} xmm2 = xmm2[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; SSE41-NEXT: pand %xmm2, %xmm0
+; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [32896,32896,32896,32896,32896,32896,32896,32896]
+; SSE41-NEXT: psrlw %xmm1, %xmm2
; SSE41-NEXT: pxor %xmm2, %xmm0
-; SSE41-NEXT: psubq %xmm2, %xmm0
+; SSE41-NEXT: psubb %xmm2, %xmm0
; SSE41-NEXT: retq
;
; AVX1-LABEL: splatvar_shift_v2i8:
; AVX1: # %bb.0:
-; AVX1-NEXT: vpsllq $56, %xmm0, %xmm0
-; AVX1-NEXT: vpsrad $31, %xmm0, %xmm2
-; AVX1-NEXT: vpsrad $24, %xmm0, %xmm0
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
-; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[0],zero,zero,zero,zero,zero,zero,zero
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
-; AVX1-NEXT: vpsrlq %xmm1, %xmm2, %xmm3
-; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm1[2,3,0,1]
-; AVX1-NEXT: vpsrlq %xmm4, %xmm2, %xmm2
-; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7]
-; AVX1-NEXT: vpsrlq %xmm4, %xmm0, %xmm3
-; AVX1-NEXT: vpsrlq %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm3[4,5,6,7]
-; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm0
-; AVX1-NEXT: vpsubq %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
+; AVX1-NEXT: vpsrlw %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
+; AVX1-NEXT: vpsrlw %xmm1, %xmm2, %xmm2
+; AVX1-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [32896,32896,32896,32896,32896,32896,32896,32896]
+; AVX1-NEXT: vpsrlw %xmm1, %xmm2, %xmm1
+; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpsubb %xmm1, %xmm0, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: splatvar_shift_v2i8:
; AVX2: # %bb.0:
-; AVX2-NEXT: vpsllq $56, %xmm0, %xmm0
-; AVX2-NEXT: vpsrad $31, %xmm0, %xmm2
-; AVX2-NEXT: vpsrad $24, %xmm0, %xmm0
-; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
-; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[0],zero,zero,zero,zero,zero,zero,zero
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
-; AVX2-NEXT: vpsrlvq %xmm1, %xmm2, %xmm2
-; AVX2-NEXT: vpsrlvq %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm0
-; AVX2-NEXT: vpsubq %xmm2, %xmm0, %xmm0
+; AVX2-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
+; AVX2-NEXT: vpsrlw %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
+; AVX2-NEXT: vpsrlw %xmm1, %xmm2, %xmm2
+; AVX2-NEXT: vpsrlw $8, %xmm2, %xmm2
+; AVX2-NEXT: vpbroadcastb %xmm2, %xmm2
+; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0
+; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [32896,32896,32896,32896,32896,32896,32896,32896]
+; AVX2-NEXT: vpsrlw %xmm1, %xmm2, %xmm1
+; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpsubb %xmm1, %xmm0, %xmm0
; AVX2-NEXT: retq
;
; XOP-LABEL: splatvar_shift_v2i8:
; XOP: # %bb.0:
-; XOP-NEXT: vpsllq $56, %xmm0, %xmm0
-; XOP-NEXT: vpshaq {{.*}}(%rip), %xmm0, %xmm0
+; XOP-NEXT: insertq {{.*#+}} xmm1 = xmm1[0,0,2,3,4,5,6,7,u,u,u,u,u,u,u,u]
; XOP-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; XOP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[0],zero,zero,zero,zero,zero,zero,zero
-; XOP-NEXT: vpsubq %xmm1, %xmm2, %xmm1
-; XOP-NEXT: vpshaq %xmm1, %xmm0, %xmm0
+; XOP-NEXT: vpsubb %xmm1, %xmm2, %xmm1
+; XOP-NEXT: vpshab %xmm1, %xmm0, %xmm0
; XOP-NEXT: retq
;
-; AVX512-LABEL: splatvar_shift_v2i8:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vpsllq $56, %xmm0, %xmm0
-; AVX512-NEXT: vpsraq $56, %zmm0, %zmm0
-; AVX512-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[0],zero,zero,zero,zero,zero,zero,zero
-; AVX512-NEXT: vpsravq %zmm1, %zmm0, %zmm0
-; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
-; AVX512-NEXT: vzeroupper
-; AVX512-NEXT: retq
+; AVX512DQ-LABEL: splatvar_shift_v2i8:
+; AVX512DQ: # %bb.0:
+; AVX512DQ-NEXT: vpbroadcastb %xmm1, %xmm1
+; AVX512DQ-NEXT: vpmovsxbd %xmm0, %zmm0
+; AVX512DQ-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
+; AVX512DQ-NEXT: vpsravd %zmm1, %zmm0, %zmm0
+; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512DQ-NEXT: vzeroupper
+; AVX512DQ-NEXT: retq
;
-; AVX512VL-LABEL: splatvar_shift_v2i8:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpsllq $56, %xmm0, %xmm0
-; AVX512VL-NEXT: vpsraq $56, %xmm0, %xmm0
-; AVX512VL-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[0],zero,zero,zero,zero,zero,zero,zero
-; AVX512VL-NEXT: vpsravq %xmm1, %xmm0, %xmm0
-; AVX512VL-NEXT: retq
+; AVX512BW-LABEL: splatvar_shift_v2i8:
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: vpbroadcastb %xmm1, %xmm1
+; AVX512BW-NEXT: vpmovsxbw %xmm0, %ymm0
+; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
+; AVX512BW-NEXT: vpsravw %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
+; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX512BW-NEXT: vzeroupper
+; AVX512BW-NEXT: retq
+;
+; AVX512DQVL-LABEL: splatvar_shift_v2i8:
+; AVX512DQVL: # %bb.0:
+; AVX512DQVL-NEXT: vpbroadcastb %xmm1, %xmm1
+; AVX512DQVL-NEXT: vpmovsxbd %xmm0, %zmm0
+; AVX512DQVL-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
+; AVX512DQVL-NEXT: vpsravd %zmm1, %zmm0, %zmm0
+; AVX512DQVL-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512DQVL-NEXT: vzeroupper
+; AVX512DQVL-NEXT: retq
+;
+; AVX512BWVL-LABEL: splatvar_shift_v2i8:
+; AVX512BWVL: # %bb.0:
+; AVX512BWVL-NEXT: vpbroadcastb %xmm1, %xmm1
+; AVX512BWVL-NEXT: vpmovsxbw %xmm0, %ymm0
+; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
+; AVX512BWVL-NEXT: vpsravw %ymm1, %ymm0, %ymm0
+; AVX512BWVL-NEXT: vpmovwb %ymm0, %xmm0
+; AVX512BWVL-NEXT: vzeroupper
+; AVX512BWVL-NEXT: retq
;
; X32-SSE-LABEL: splatvar_shift_v2i8:
; X32-SSE: # %bb.0:
-; X32-SSE-NEXT: psllq $56, %xmm0
-; X32-SSE-NEXT: movdqa %xmm0, %xmm2
-; X32-SSE-NEXT: psrad $31, %xmm2
-; X32-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3]
-; X32-SSE-NEXT: psrad $24, %xmm0
-; X32-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
-; X32-SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
-; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
-; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm1
-; X32-SSE-NEXT: movdqa {{.*#+}} xmm2 = [0,2147483648,0,2147483648]
-; X32-SSE-NEXT: movdqa %xmm2, %xmm3
-; X32-SSE-NEXT: psrlq %xmm1, %xmm3
-; X32-SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm1[2,3,0,1]
-; X32-SSE-NEXT: psrlq %xmm4, %xmm2
-; X32-SSE-NEXT: movsd {{.*#+}} xmm2 = xmm3[0],xmm2[1]
-; X32-SSE-NEXT: movdqa %xmm0, %xmm3
-; X32-SSE-NEXT: psrlq %xmm1, %xmm3
-; X32-SSE-NEXT: psrlq %xmm4, %xmm0
-; X32-SSE-NEXT: movsd {{.*#+}} xmm0 = xmm3[0],xmm0[1]
-; X32-SSE-NEXT: xorpd %xmm2, %xmm0
-; X32-SSE-NEXT: psubq %xmm2, %xmm0
+; X32-SSE-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0]
+; X32-SSE-NEXT: psrldq {{.*#+}} xmm1 = xmm1[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; X32-SSE-NEXT: psrlw %xmm1, %xmm0
+; X32-SSE-NEXT: pcmpeqd %xmm2, %xmm2
+; X32-SSE-NEXT: psrlw %xmm1, %xmm2
+; X32-SSE-NEXT: psrlw $8, %xmm2
+; X32-SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; X32-SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,0,2,3,4,5,6,7]
+; X32-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0]
+; X32-SSE-NEXT: pand %xmm2, %xmm0
+; X32-SSE-NEXT: movdqa {{.*#+}} xmm2 = [32896,32896,32896,32896,32896,32896,32896,32896]
+; X32-SSE-NEXT: psrlw %xmm1, %xmm2
+; X32-SSE-NEXT: pxor %xmm2, %xmm0
+; X32-SSE-NEXT: psubb %xmm2, %xmm0
; X32-SSE-NEXT: retl
%splat = shufflevector <2 x i8> %b, <2 x i8> undef, <2 x i32> zeroinitializer
%shift = ashr <2 x i8> %a, %splat
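The widened lowering above replaces the old promote-to-v2i64 splat shift: the splatted count is zero-extended, the data is shifted with a word-granularity psrlw, the bits dragged across byte boundaries are masked off, and the sign is restored with the xor/sub fixup (32896 is 0x8080, i.e. the per-byte sign bit 0x80, shifted by the same count); XOP simply negates the count and uses vpshab. A minimal scalar sketch of that fixup identity, written here for illustration only and not part of the test file (the function name is made up):

define i8 @ashr_i8_var_sketch(i8 %x, i8 %a) {
  %l = lshr i8 %x, %a
  %m = lshr i8 -128, %a      ; 0x80 >> a: the sign bit's position after the shift
  %t = xor i8 %l, %m
  %r = sub i8 %t, %m         ; equals ashr i8 %x, %a for a in 0..7
  ret i8 %r
}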
define <2 x i32> @constant_shift_v2i32(<2 x i32> %a) nounwind {
; SSE2-LABEL: constant_shift_v2i32:
; SSE2: # %bb.0:
-; SSE2-NEXT: psllq $32, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,3,2,3]
-; SSE2-NEXT: psrad $31, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
+; SSE2-NEXT: movdqa %xmm0, %xmm1
+; SSE2-NEXT: psrad $4, %xmm1
+; SSE2-NEXT: psrad $5, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; SSE2-NEXT: movdqa %xmm1, %xmm0
-; SSE2-NEXT: psrlq $4, %xmm0
-; SSE2-NEXT: psrlq $5, %xmm1
-; SSE2-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
-; SSE2-NEXT: movapd {{.*#+}} xmm0 = [576460752303423488,288230376151711744]
-; SSE2-NEXT: xorpd %xmm0, %xmm1
-; SSE2-NEXT: psubq %xmm0, %xmm1
-; SSE2-NEXT: movdqa %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: constant_shift_v2i32:
; SSE41: # %bb.0:
; SSE41-NEXT: movdqa %xmm0, %xmm1
-; SSE41-NEXT: psllq $32, %xmm1
-; SSE41-NEXT: psrad $31, %xmm1
-; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
-; SSE41-NEXT: movdqa %xmm1, %xmm0
-; SSE41-NEXT: psrlq $5, %xmm0
-; SSE41-NEXT: psrlq $4, %xmm1
-; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm0[4,5,6,7]
-; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [576460752303423488,288230376151711744]
-; SSE41-NEXT: pxor %xmm0, %xmm1
-; SSE41-NEXT: psubq %xmm0, %xmm1
-; SSE41-NEXT: movdqa %xmm1, %xmm0
+; SSE41-NEXT: psrad $5, %xmm1
+; SSE41-NEXT: psrad $4, %xmm0
+; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7]
; SSE41-NEXT: retq
;
; AVX1-LABEL: constant_shift_v2i32:
; AVX1: # %bb.0:
-; AVX1-NEXT: vpsllq $32, %xmm0, %xmm1
-; AVX1-NEXT: vpsrad $31, %xmm1, %xmm1
-; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
-; AVX1-NEXT: vpsrlq $5, %xmm0, %xmm1
-; AVX1-NEXT: vpsrlq $4, %xmm0, %xmm0
-; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [576460752303423488,288230376151711744]
-; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpsubq %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpsrad $5, %xmm0, %xmm1
+; AVX1-NEXT: vpsrad $4, %xmm0, %xmm0
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7]
; AVX1-NEXT: retq
;
; AVX2-LABEL: constant_shift_v2i32:
; AVX2: # %bb.0:
-; AVX2-NEXT: vpsllq $32, %xmm0, %xmm1
-; AVX2-NEXT: vpsrad $31, %xmm1, %xmm1
-; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
-; AVX2-NEXT: vpsrlvq {{.*}}(%rip), %xmm0, %xmm0
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [576460752303423488,288230376151711744]
-; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpsubq %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpsravd {{.*}}(%rip), %xmm0, %xmm0
; AVX2-NEXT: retq
;
-; XOP-LABEL: constant_shift_v2i32:
-; XOP: # %bb.0:
-; XOP-NEXT: vpsllq $32, %xmm0, %xmm0
-; XOP-NEXT: vpshaq {{.*}}(%rip), %xmm0, %xmm0
-; XOP-NEXT: vpshaq {{.*}}(%rip), %xmm0, %xmm0
-; XOP-NEXT: retq
+; XOPAVX1-LABEL: constant_shift_v2i32:
+; XOPAVX1: # %bb.0:
+; XOPAVX1-NEXT: vpshad {{.*}}(%rip), %xmm0, %xmm0
+; XOPAVX1-NEXT: retq
+;
+; XOPAVX2-LABEL: constant_shift_v2i32:
+; XOPAVX2: # %bb.0:
+; XOPAVX2-NEXT: vpsravd {{.*}}(%rip), %xmm0, %xmm0
+; XOPAVX2-NEXT: retq
;
; AVX512-LABEL: constant_shift_v2i32:
; AVX512: # %bb.0:
-; AVX512-NEXT: vpsllq $32, %xmm0, %xmm0
-; AVX512-NEXT: vpsraq $32, %zmm0, %zmm0
-; AVX512-NEXT: vmovdqa {{.*#+}} xmm1 = [4,5]
-; AVX512-NEXT: vpsravq %zmm1, %zmm0, %zmm0
-; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
-; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: vpsravd {{.*}}(%rip), %xmm0, %xmm0
; AVX512-NEXT: retq
;
; AVX512VL-LABEL: constant_shift_v2i32:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpsllq $32, %xmm0, %xmm0
-; AVX512VL-NEXT: vpsraq $32, %xmm0, %xmm0
-; AVX512VL-NEXT: vpsravq {{.*}}(%rip), %xmm0, %xmm0
+; AVX512VL-NEXT: vpsravd {{.*}}(%rip), %xmm0, %xmm0
; AVX512VL-NEXT: retq
;
; X32-SSE-LABEL: constant_shift_v2i32:
; X32-SSE: # %bb.0:
-; X32-SSE-NEXT: psllq $32, %xmm0
-; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,3,2,3]
-; X32-SSE-NEXT: psrad $31, %xmm0
-; X32-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
+; X32-SSE-NEXT: movdqa %xmm0, %xmm1
+; X32-SSE-NEXT: psrad $4, %xmm1
+; X32-SSE-NEXT: psrad $5, %xmm0
+; X32-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
; X32-SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; X32-SSE-NEXT: movdqa %xmm1, %xmm0
-; X32-SSE-NEXT: psrlq $4, %xmm0
-; X32-SSE-NEXT: psrlq $5, %xmm1
-; X32-SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
-; X32-SSE-NEXT: movapd {{.*#+}} xmm0 = [3.7857669957336791E-270,2.0522684006491881E-289]
-; X32-SSE-NEXT: xorpd %xmm0, %xmm1
-; X32-SSE-NEXT: psubq %xmm0, %xmm1
-; X32-SSE-NEXT: movdqa %xmm1, %xmm0
; X32-SSE-NEXT: retl
%shift = ashr <2 x i32> %a, <i32 4, i32 5>
ret <2 x i32> %shift
define <4 x i16> @constant_shift_v4i16(<4 x i16> %a) nounwind {
; SSE2-LABEL: constant_shift_v4i16:
; SSE2: # %bb.0:
-; SSE2-NEXT: pslld $16, %xmm0
; SSE2-NEXT: movdqa %xmm0, %xmm1
-; SSE2-NEXT: psrad $16, %xmm1
-; SSE2-NEXT: movdqa %xmm0, %xmm2
-; SSE2-NEXT: psrad $19, %xmm2
-; SSE2-NEXT: movdqa %xmm0, %xmm3
-; SSE2-NEXT: psrad $18, %xmm3
-; SSE2-NEXT: punpckhqdq {{.*#+}} xmm3 = xmm3[1],xmm2[1]
-; SSE2-NEXT: psrad $17, %xmm0
-; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
-; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm3[0,3]
+; SSE2-NEXT: psraw $2, %xmm1
+; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,0],xmm0[0,0]
+; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[2,3]
+; SSE2-NEXT: movaps {{.*#+}} xmm2 = [65535,0,65535,0,65535,65535,65535,65535]
; SSE2-NEXT: movaps %xmm1, %xmm0
+; SSE2-NEXT: andps %xmm2, %xmm0
+; SSE2-NEXT: psraw $1, %xmm1
+; SSE2-NEXT: andnps %xmm1, %xmm2
+; SSE2-NEXT: orps %xmm2, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: constant_shift_v4i16:
; SSE41: # %bb.0:
-; SSE41-NEXT: pslld $16, %xmm0
-; SSE41-NEXT: movdqa %xmm0, %xmm1
-; SSE41-NEXT: psrad $16, %xmm1
-; SSE41-NEXT: movdqa %xmm0, %xmm2
-; SSE41-NEXT: psrad $19, %xmm2
-; SSE41-NEXT: movdqa %xmm0, %xmm3
-; SSE41-NEXT: psrad $17, %xmm3
-; SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm2[4,5,6,7]
-; SSE41-NEXT: psrad $18, %xmm0
-; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
-; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm3[2,3],xmm0[4,5],xmm3[6,7]
+; SSE41-NEXT: movdqa {{.*#+}} xmm1 = <u,32768,16384,8192,u,u,u,u>
+; SSE41-NEXT: pmulhw %xmm0, %xmm1
+; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3,4,5,6,7]
+; SSE41-NEXT: psraw $1, %xmm0
+; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3,4,5,6,7]
; SSE41-NEXT: retq
;
-; AVX1-LABEL: constant_shift_v4i16:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vpslld $16, %xmm0, %xmm0
-; AVX1-NEXT: vpsrad $16, %xmm0, %xmm1
-; AVX1-NEXT: vpsrad $19, %xmm0, %xmm2
-; AVX1-NEXT: vpsrad $17, %xmm0, %xmm3
-; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7]
-; AVX1-NEXT: vpsrad $18, %xmm0, %xmm0
-; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
-; AVX1-NEXT: retq
+; AVX-LABEL: constant_shift_v4i16:
+; AVX: # %bb.0:
+; AVX-NEXT: vpmulhw {{.*}}(%rip), %xmm0, %xmm1
+; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3,4,5,6,7]
+; AVX-NEXT: vpsraw $1, %xmm0, %xmm0
+; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3,4,5,6,7]
+; AVX-NEXT: retq
;
-; AVX2-LABEL: constant_shift_v4i16:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpslld $16, %xmm0, %xmm0
-; AVX2-NEXT: vpsrad $16, %xmm0, %xmm0
-; AVX2-NEXT: vpsravd {{.*}}(%rip), %xmm0, %xmm0
-; AVX2-NEXT: retq
+; XOP-LABEL: constant_shift_v4i16:
+; XOP: # %bb.0:
+; XOP-NEXT: vpshaw {{.*}}(%rip), %xmm0, %xmm0
+; XOP-NEXT: retq
;
-; XOPAVX1-LABEL: constant_shift_v4i16:
-; XOPAVX1: # %bb.0:
-; XOPAVX1-NEXT: vpslld $16, %xmm0, %xmm0
-; XOPAVX1-NEXT: vpsrad $16, %xmm0, %xmm0
-; XOPAVX1-NEXT: vpshad {{.*}}(%rip), %xmm0, %xmm0
-; XOPAVX1-NEXT: retq
+; AVX512DQ-LABEL: constant_shift_v4i16:
+; AVX512DQ: # %bb.0:
+; AVX512DQ-NEXT: vpmovsxwd %xmm0, %ymm0
+; AVX512DQ-NEXT: vpsravd {{.*}}(%rip), %ymm0, %ymm0
+; AVX512DQ-NEXT: vpmovdw %zmm0, %ymm0
+; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX512DQ-NEXT: vzeroupper
+; AVX512DQ-NEXT: retq
;
-; XOPAVX2-LABEL: constant_shift_v4i16:
-; XOPAVX2: # %bb.0:
-; XOPAVX2-NEXT: vpslld $16, %xmm0, %xmm0
-; XOPAVX2-NEXT: vpsrad $16, %xmm0, %xmm0
-; XOPAVX2-NEXT: vpsravd {{.*}}(%rip), %xmm0, %xmm0
-; XOPAVX2-NEXT: retq
+; AVX512BW-LABEL: constant_shift_v4i16:
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
+; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm1 = <0,1,2,3,u,u,u,u>
+; AVX512BW-NEXT: vpsravw %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
+; AVX512BW-NEXT: vzeroupper
+; AVX512BW-NEXT: retq
;
-; AVX512-LABEL: constant_shift_v4i16:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vpslld $16, %xmm0, %xmm0
-; AVX512-NEXT: vpsrad $16, %xmm0, %xmm0
-; AVX512-NEXT: vpsravd {{.*}}(%rip), %xmm0, %xmm0
-; AVX512-NEXT: retq
+; AVX512DQVL-LABEL: constant_shift_v4i16:
+; AVX512DQVL: # %bb.0:
+; AVX512DQVL-NEXT: vpmovsxwd %xmm0, %ymm0
+; AVX512DQVL-NEXT: vpsravd {{.*}}(%rip), %ymm0, %ymm0
+; AVX512DQVL-NEXT: vpmovdw %ymm0, %xmm0
+; AVX512DQVL-NEXT: vzeroupper
+; AVX512DQVL-NEXT: retq
;
-; AVX512VL-LABEL: constant_shift_v4i16:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpslld $16, %xmm0, %xmm0
-; AVX512VL-NEXT: vpsrad $16, %xmm0, %xmm0
-; AVX512VL-NEXT: vpsravd {{.*}}(%rip), %xmm0, %xmm0
-; AVX512VL-NEXT: retq
+; AVX512BWVL-LABEL: constant_shift_v4i16:
+; AVX512BWVL: # %bb.0:
+; AVX512BWVL-NEXT: vpsravw {{.*}}(%rip), %xmm0, %xmm0
+; AVX512BWVL-NEXT: retq
;
; X32-SSE-LABEL: constant_shift_v4i16:
; X32-SSE: # %bb.0:
-; X32-SSE-NEXT: pslld $16, %xmm0
; X32-SSE-NEXT: movdqa %xmm0, %xmm1
-; X32-SSE-NEXT: psrad $16, %xmm1
-; X32-SSE-NEXT: movdqa %xmm0, %xmm2
-; X32-SSE-NEXT: psrad $19, %xmm2
-; X32-SSE-NEXT: movdqa %xmm0, %xmm3
-; X32-SSE-NEXT: psrad $18, %xmm3
-; X32-SSE-NEXT: punpckhqdq {{.*#+}} xmm3 = xmm3[1],xmm2[1]
-; X32-SSE-NEXT: psrad $17, %xmm0
-; X32-SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
-; X32-SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm3[0,3]
+; X32-SSE-NEXT: psraw $2, %xmm1
+; X32-SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,0],xmm0[0,0]
+; X32-SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[2,3]
+; X32-SSE-NEXT: movaps {{.*#+}} xmm2 = [65535,0,65535,0,65535,65535,65535,65535]
; X32-SSE-NEXT: movaps %xmm1, %xmm0
+; X32-SSE-NEXT: andps %xmm2, %xmm0
+; X32-SSE-NEXT: psraw $1, %xmm1
+; X32-SSE-NEXT: andnps %xmm1, %xmm2
+; X32-SSE-NEXT: orps %xmm2, %xmm0
; X32-SSE-NEXT: retl
%shift = ashr <4 x i16> %a, <i16 0, i16 1, i16 2, i16 3>
ret <4 x i16> %shift
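With <4 x i16> now widened to v8i16, the non-uniform constant shift stays in the halfword domain: lanes shifted by two or more go through a signed multiply-high by 2^(16-k) (the 16384 and 8192 entries of the pmulhw constant), the shift-by-0 lane keeps the source, and the shift-by-1 lane falls back to psraw $1 because 2^15 does not fit a positive signed 16-bit multiplier. A scalar sketch of the multiply-high identity for k = 2, purely illustrative and not part of the test (hypothetical function name):

define i16 @ashr_i16_by_2_sketch(i16 %x) {
  %xe = sext i16 %x to i32
  %p  = mul i32 %xe, 16384      ; 16384 = 2^(16-2)
  %hi = lshr i32 %p, 16
  %r  = trunc i32 %hi to i16    ; equals ashr i16 %x, 2
  ret i16 %r
}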
define <2 x i16> @constant_shift_v2i16(<2 x i16> %a) nounwind {
; SSE2-LABEL: constant_shift_v2i16:
; SSE2: # %bb.0:
-; SSE2-NEXT: psllq $48, %xmm0
-; SSE2-NEXT: movdqa %xmm0, %xmm1
-; SSE2-NEXT: psrad $31, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3]
-; SSE2-NEXT: psrad $16, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
-; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE2-NEXT: movdqa %xmm0, %xmm1
-; SSE2-NEXT: psrlq $2, %xmm1
-; SSE2-NEXT: psrlq $3, %xmm0
-; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
-; SSE2-NEXT: movapd {{.*#+}} xmm1 = [2305843009213693952,1152921504606846976]
-; SSE2-NEXT: xorpd %xmm1, %xmm0
-; SSE2-NEXT: psubq %xmm1, %xmm0
+; SSE2-NEXT: psraw $3, %xmm1
+; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [65535,0,65535,65535,65535,65535,65535,65535]
+; SSE2-NEXT: psraw $2, %xmm0
+; SSE2-NEXT: pand %xmm2, %xmm0
+; SSE2-NEXT: pandn %xmm1, %xmm2
+; SSE2-NEXT: por %xmm2, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: constant_shift_v2i16:
; SSE41: # %bb.0:
-; SSE41-NEXT: psllq $48, %xmm0
-; SSE41-NEXT: movdqa %xmm0, %xmm1
-; SSE41-NEXT: psrad $31, %xmm1
-; SSE41-NEXT: psrad $16, %xmm0
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
; SSE41-NEXT: movdqa %xmm0, %xmm1
-; SSE41-NEXT: psrlq $3, %xmm1
-; SSE41-NEXT: psrlq $2, %xmm0
-; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
-; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [2305843009213693952,1152921504606846976]
-; SSE41-NEXT: pxor %xmm1, %xmm0
-; SSE41-NEXT: psubq %xmm1, %xmm0
+; SSE41-NEXT: psraw $3, %xmm1
+; SSE41-NEXT: psraw $2, %xmm0
+; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3,4,5,6,7]
; SSE41-NEXT: retq
;
-; AVX1-LABEL: constant_shift_v2i16:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vpsllq $48, %xmm0, %xmm0
-; AVX1-NEXT: vpsrad $31, %xmm0, %xmm1
-; AVX1-NEXT: vpsrad $16, %xmm0, %xmm0
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
-; AVX1-NEXT: vpsrlq $3, %xmm0, %xmm1
-; AVX1-NEXT: vpsrlq $2, %xmm0, %xmm0
-; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [2305843009213693952,1152921504606846976]
-; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpsubq %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: constant_shift_v2i16:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpsllq $48, %xmm0, %xmm0
-; AVX2-NEXT: vpsrad $31, %xmm0, %xmm1
-; AVX2-NEXT: vpsrad $16, %xmm0, %xmm0
-; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
-; AVX2-NEXT: vpsrlvq {{.*}}(%rip), %xmm0, %xmm0
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [2305843009213693952,1152921504606846976]
-; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpsubq %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: retq
+; AVX-LABEL: constant_shift_v2i16:
+; AVX: # %bb.0:
+; AVX-NEXT: vpsraw $3, %xmm0, %xmm1
+; AVX-NEXT: vpsraw $2, %xmm0, %xmm0
+; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3,4,5,6,7]
+; AVX-NEXT: retq
;
; XOP-LABEL: constant_shift_v2i16:
; XOP: # %bb.0:
-; XOP-NEXT: vpsllq $48, %xmm0, %xmm0
-; XOP-NEXT: vpshaq {{.*}}(%rip), %xmm0, %xmm0
-; XOP-NEXT: vpshaq {{.*}}(%rip), %xmm0, %xmm0
+; XOP-NEXT: vpshaw {{.*}}(%rip), %xmm0, %xmm0
; XOP-NEXT: retq
;
-; AVX512-LABEL: constant_shift_v2i16:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vpsllq $48, %xmm0, %xmm0
-; AVX512-NEXT: vpsraq $48, %zmm0, %zmm0
-; AVX512-NEXT: vmovdqa {{.*#+}} xmm1 = [2,3]
-; AVX512-NEXT: vpsravq %zmm1, %zmm0, %zmm0
-; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
-; AVX512-NEXT: vzeroupper
-; AVX512-NEXT: retq
+; AVX512DQ-LABEL: constant_shift_v2i16:
+; AVX512DQ: # %bb.0:
+; AVX512DQ-NEXT: vpsraw $3, %xmm0, %xmm1
+; AVX512DQ-NEXT: vpsraw $2, %xmm0, %xmm0
+; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3,4,5,6,7]
+; AVX512DQ-NEXT: retq
;
-; AVX512VL-LABEL: constant_shift_v2i16:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpsllq $48, %xmm0, %xmm0
-; AVX512VL-NEXT: vpsraq $48, %xmm0, %xmm0
-; AVX512VL-NEXT: vpsravq {{.*}}(%rip), %xmm0, %xmm0
-; AVX512VL-NEXT: retq
+; AVX512BW-LABEL: constant_shift_v2i16:
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
+; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm1 = <2,3,u,u,u,u,u,u>
+; AVX512BW-NEXT: vpsravw %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
+; AVX512BW-NEXT: vzeroupper
+; AVX512BW-NEXT: retq
+;
+; AVX512DQVL-LABEL: constant_shift_v2i16:
+; AVX512DQVL: # %bb.0:
+; AVX512DQVL-NEXT: vpsraw $3, %xmm0, %xmm1
+; AVX512DQVL-NEXT: vpsraw $2, %xmm0, %xmm0
+; AVX512DQVL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3,4,5,6,7]
+; AVX512DQVL-NEXT: retq
+;
+; AVX512BWVL-LABEL: constant_shift_v2i16:
+; AVX512BWVL: # %bb.0:
+; AVX512BWVL-NEXT: vpsravw {{.*}}(%rip), %xmm0, %xmm0
+; AVX512BWVL-NEXT: retq
;
; X32-SSE-LABEL: constant_shift_v2i16:
; X32-SSE: # %bb.0:
-; X32-SSE-NEXT: psllq $48, %xmm0
; X32-SSE-NEXT: movdqa %xmm0, %xmm1
-; X32-SSE-NEXT: psrad $31, %xmm1
-; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3]
-; X32-SSE-NEXT: psrad $16, %xmm0
-; X32-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
-; X32-SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; X32-SSE-NEXT: movdqa %xmm0, %xmm1
-; X32-SSE-NEXT: psrlq $2, %xmm1
-; X32-SSE-NEXT: psrlq $3, %xmm0
-; X32-SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
-; X32-SSE-NEXT: movapd {{.*#+}} xmm1 = [1.4916681462400413E-154,1.2882297539194267E-231]
-; X32-SSE-NEXT: xorpd %xmm1, %xmm0
-; X32-SSE-NEXT: psubq %xmm1, %xmm0
+; X32-SSE-NEXT: psraw $3, %xmm1
+; X32-SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,0,65535,65535,65535,65535,65535,65535]
+; X32-SSE-NEXT: psraw $2, %xmm0
+; X32-SSE-NEXT: pand %xmm2, %xmm0
+; X32-SSE-NEXT: pandn %xmm1, %xmm2
+; X32-SSE-NEXT: por %xmm2, %xmm0
; X32-SSE-NEXT: retl
%shift = ashr <2 x i16> %a, <i16 2, i16 3>
ret <2 x i16> %shift
}
define <8 x i8> @constant_shift_v8i8(<8 x i8> %a) nounwind {
-; SSE2-LABEL: constant_shift_v8i8:
-; SSE2: # %bb.0:
-; SSE2-NEXT: psllw $8, %xmm0
-; SSE2-NEXT: movdqa %xmm0, %xmm1
-; SSE2-NEXT: psraw $8, %xmm1
-; SSE2-NEXT: psraw $12, %xmm0
-; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
-; SSE2-NEXT: movapd %xmm0, %xmm1
-; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[2,3]
-; SSE2-NEXT: psraw $2, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
-; SSE2-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; SSE2-NEXT: movaps {{.*#+}} xmm2 = [65535,0,65535,0,65535,0,65535,0]
-; SSE2-NEXT: movaps %xmm1, %xmm0
-; SSE2-NEXT: andps %xmm2, %xmm0
-; SSE2-NEXT: psraw $1, %xmm1
-; SSE2-NEXT: andnps %xmm1, %xmm2
-; SSE2-NEXT: orps %xmm2, %xmm0
-; SSE2-NEXT: retq
+; SSE-LABEL: constant_shift_v8i8:
+; SSE: # %bb.0:
+; SSE-NEXT: pxor %xmm1, %xmm1
+; SSE-NEXT: movdqa %xmm0, %xmm2
+; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
+; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE-NEXT: psraw $8, %xmm0
+; SSE-NEXT: pmullw {{.*}}(%rip), %xmm0
+; SSE-NEXT: psrlw $8, %xmm0
+; SSE-NEXT: packuswb %xmm2, %xmm0
+; SSE-NEXT: retq
;
-; SSE41-LABEL: constant_shift_v8i8:
-; SSE41: # %bb.0:
-; SSE41-NEXT: psllw $8, %xmm0
-; SSE41-NEXT: movdqa %xmm0, %xmm1
-; SSE41-NEXT: psraw $8, %xmm1
-; SSE41-NEXT: movdqa {{.*#+}} xmm2 = <u,32768,16384,8192,4096,2048,1024,512>
-; SSE41-NEXT: pmulhw %xmm1, %xmm2
-; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3,4,5,6,7]
-; SSE41-NEXT: psraw $9, %xmm0
-; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3,4,5,6,7]
-; SSE41-NEXT: retq
+; AVX1-LABEL: constant_shift_v8i8:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
+; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; AVX1-NEXT: vpsraw $8, %xmm0, %xmm0
+; AVX1-NEXT: vpmullw {{.*}}(%rip), %xmm0, %xmm0
+; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm0
+; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: retq
;
-; AVX-LABEL: constant_shift_v8i8:
-; AVX: # %bb.0:
-; AVX-NEXT: vpsllw $8, %xmm0, %xmm0
-; AVX-NEXT: vpsraw $8, %xmm0, %xmm1
-; AVX-NEXT: vpmulhw {{.*}}(%rip), %xmm1, %xmm2
-; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3,4,5,6,7]
-; AVX-NEXT: vpsraw $9, %xmm0, %xmm0
-; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3,4,5,6,7]
-; AVX-NEXT: retq
+; AVX2-LABEL: constant_shift_v8i8:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpmovsxbw %xmm0, %ymm0
+; AVX2-NEXT: vpmullw {{.*}}(%rip), %ymm0, %ymm0
+; AVX2-NEXT: vpsrlw $8, %ymm0, %ymm0
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
;
; XOP-LABEL: constant_shift_v8i8:
; XOP: # %bb.0:
-; XOP-NEXT: vpsllw $8, %xmm0, %xmm0
-; XOP-NEXT: vpsraw $8, %xmm0, %xmm0
-; XOP-NEXT: vpshaw {{.*}}(%rip), %xmm0, %xmm0
+; XOP-NEXT: vpshab {{.*}}(%rip), %xmm0, %xmm0
; XOP-NEXT: retq
;
; AVX512DQ-LABEL: constant_shift_v8i8:
; AVX512DQ: # %bb.0:
-; AVX512DQ-NEXT: vpsllw $8, %xmm0, %xmm0
-; AVX512DQ-NEXT: vpsraw $8, %xmm0, %xmm0
-; AVX512DQ-NEXT: vpmovsxwd %xmm0, %ymm0
-; AVX512DQ-NEXT: vpsravd {{.*}}(%rip), %ymm0, %ymm0
-; AVX512DQ-NEXT: vpmovdw %zmm0, %ymm0
-; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX512DQ-NEXT: vpmovsxbd %xmm0, %zmm0
+; AVX512DQ-NEXT: vpsravd {{.*}}(%rip), %zmm0, %zmm0
+; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
; AVX512BW-LABEL: constant_shift_v8i8:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vpsllw $8, %xmm0, %xmm0
-; AVX512BW-NEXT: vpsraw $8, %xmm0, %xmm0
-; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,2,3,4,5,6,7]
+; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,2,3,4,5,6,7,0,0,0,0,0,0,0,0]
+; AVX512BW-NEXT: vpmovsxbw %xmm0, %ymm0
; AVX512BW-NEXT: vpsravw %zmm1, %zmm0, %zmm0
-; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
+; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
+; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512DQVL-LABEL: constant_shift_v8i8:
; AVX512DQVL: # %bb.0:
-; AVX512DQVL-NEXT: vpsllw $8, %xmm0, %xmm0
-; AVX512DQVL-NEXT: vpsraw $8, %xmm0, %xmm0
-; AVX512DQVL-NEXT: vpmovsxwd %xmm0, %ymm0
-; AVX512DQVL-NEXT: vpsravd {{.*}}(%rip), %ymm0, %ymm0
-; AVX512DQVL-NEXT: vpmovdw %ymm0, %xmm0
+; AVX512DQVL-NEXT: vpmovsxbd %xmm0, %zmm0
+; AVX512DQVL-NEXT: vpsravd {{.*}}(%rip), %zmm0, %zmm0
+; AVX512DQVL-NEXT: vpmovdb %zmm0, %xmm0
; AVX512DQVL-NEXT: vzeroupper
; AVX512DQVL-NEXT: retq
;
; AVX512BWVL-LABEL: constant_shift_v8i8:
; AVX512BWVL: # %bb.0:
-; AVX512BWVL-NEXT: vpsllw $8, %xmm0, %xmm0
-; AVX512BWVL-NEXT: vpsraw $8, %xmm0, %xmm0
-; AVX512BWVL-NEXT: vpsravw {{.*}}(%rip), %xmm0, %xmm0
+; AVX512BWVL-NEXT: vpmovsxbw %xmm0, %ymm0
+; AVX512BWVL-NEXT: vpsravw {{.*}}(%rip), %ymm0, %ymm0
+; AVX512BWVL-NEXT: vpmovwb %ymm0, %xmm0
+; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
;
; X32-SSE-LABEL: constant_shift_v8i8:
; X32-SSE: # %bb.0:
-; X32-SSE-NEXT: psllw $8, %xmm0
-; X32-SSE-NEXT: movdqa %xmm0, %xmm1
-; X32-SSE-NEXT: psraw $8, %xmm1
-; X32-SSE-NEXT: psraw $12, %xmm0
-; X32-SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
-; X32-SSE-NEXT: movapd %xmm0, %xmm1
-; X32-SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[2,3]
-; X32-SSE-NEXT: psraw $2, %xmm0
-; X32-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
-; X32-SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; X32-SSE-NEXT: movaps {{.*#+}} xmm2 = [65535,0,65535,0,65535,0,65535,0]
-; X32-SSE-NEXT: movaps %xmm1, %xmm0
-; X32-SSE-NEXT: andps %xmm2, %xmm0
-; X32-SSE-NEXT: psraw $1, %xmm1
-; X32-SSE-NEXT: andnps %xmm1, %xmm2
-; X32-SSE-NEXT: orps %xmm2, %xmm0
+; X32-SSE-NEXT: pxor %xmm1, %xmm1
+; X32-SSE-NEXT: movdqa %xmm0, %xmm2
+; X32-SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
+; X32-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; X32-SSE-NEXT: psraw $8, %xmm0
+; X32-SSE-NEXT: pmullw {{\.LCPI.*}}, %xmm0
+; X32-SSE-NEXT: psrlw $8, %xmm0
+; X32-SSE-NEXT: packuswb %xmm2, %xmm0
; X32-SSE-NEXT: retl
%shift = ashr <8 x i8> %a, <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7>
ret <8 x i8> %shift
}
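The per-byte constant shifts above follow the widened v16i8 pattern: each live byte is sign-extended to 16 bits (punpcklbw + psraw $8, or vpmovsxbw), multiplied by 2^(8-k) with pmullw, and the high byte of the product, extracted by psrlw $8 and packuswb, is the shifted value. A scalar sketch of that identity for k = 3, for illustration only and not part of the test (hypothetical name):

define i8 @ashr_i8_by_3_mul_sketch(i8 %x) {
  %xe = sext i8 %x to i16
  %p  = mul i16 %xe, 32         ; 32 = 2^(8-3)
  %hi = lshr i16 %p, 8
  %r  = trunc i16 %hi to i8     ; equals ashr i8 %x, 3
  ret i8 %r
}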
define <4 x i8> @constant_shift_v4i8(<4 x i8> %a) nounwind {
-; SSE2-LABEL: constant_shift_v4i8:
-; SSE2: # %bb.0:
-; SSE2-NEXT: pslld $24, %xmm0
-; SSE2-NEXT: movdqa %xmm0, %xmm1
-; SSE2-NEXT: psrad $24, %xmm1
-; SSE2-NEXT: movdqa %xmm0, %xmm2
-; SSE2-NEXT: psrad $27, %xmm2
-; SSE2-NEXT: movdqa %xmm0, %xmm3
-; SSE2-NEXT: psrad $26, %xmm3
-; SSE2-NEXT: punpckhqdq {{.*#+}} xmm3 = xmm3[1],xmm2[1]
-; SSE2-NEXT: psrad $25, %xmm0
-; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
-; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm3[0,3]
-; SSE2-NEXT: movaps %xmm1, %xmm0
-; SSE2-NEXT: retq
-;
-; SSE41-LABEL: constant_shift_v4i8:
-; SSE41: # %bb.0:
-; SSE41-NEXT: pslld $24, %xmm0
-; SSE41-NEXT: movdqa %xmm0, %xmm1
-; SSE41-NEXT: psrad $24, %xmm1
-; SSE41-NEXT: movdqa %xmm0, %xmm2
-; SSE41-NEXT: psrad $27, %xmm2
-; SSE41-NEXT: movdqa %xmm0, %xmm3
-; SSE41-NEXT: psrad $25, %xmm3
-; SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm2[4,5,6,7]
-; SSE41-NEXT: psrad $26, %xmm0
-; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
-; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm3[2,3],xmm0[4,5],xmm3[6,7]
-; SSE41-NEXT: retq
+; SSE-LABEL: constant_shift_v4i8:
+; SSE: # %bb.0:
+; SSE-NEXT: pxor %xmm1, %xmm1
+; SSE-NEXT: movdqa %xmm0, %xmm2
+; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
+; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE-NEXT: psraw $8, %xmm0
+; SSE-NEXT: pmullw {{.*}}(%rip), %xmm0
+; SSE-NEXT: psrlw $8, %xmm0
+; SSE-NEXT: packuswb %xmm2, %xmm0
+; SSE-NEXT: retq
;
; AVX1-LABEL: constant_shift_v4i8:
; AVX1: # %bb.0:
-; AVX1-NEXT: vpslld $24, %xmm0, %xmm0
-; AVX1-NEXT: vpsrad $24, %xmm0, %xmm1
-; AVX1-NEXT: vpsrad $27, %xmm0, %xmm2
-; AVX1-NEXT: vpsrad $25, %xmm0, %xmm3
-; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7]
-; AVX1-NEXT: vpsrad $26, %xmm0, %xmm0
-; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
+; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
+; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; AVX1-NEXT: vpsraw $8, %xmm0, %xmm0
+; AVX1-NEXT: vpmullw {{.*}}(%rip), %xmm0, %xmm0
+; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm0
+; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: constant_shift_v4i8:
; AVX2: # %bb.0:
-; AVX2-NEXT: vpslld $24, %xmm0, %xmm0
-; AVX2-NEXT: vpsrad $24, %xmm0, %xmm0
-; AVX2-NEXT: vpsravd {{.*}}(%rip), %xmm0, %xmm0
+; AVX2-NEXT: vpmovsxbw %xmm0, %ymm0
+; AVX2-NEXT: vpmullw {{.*}}(%rip), %ymm0, %ymm0
+; AVX2-NEXT: vpsrlw $8, %ymm0, %ymm0
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
-; XOPAVX1-LABEL: constant_shift_v4i8:
-; XOPAVX1: # %bb.0:
-; XOPAVX1-NEXT: vpslld $24, %xmm0, %xmm0
-; XOPAVX1-NEXT: vpsrad $24, %xmm0, %xmm0
-; XOPAVX1-NEXT: vpshad {{.*}}(%rip), %xmm0, %xmm0
-; XOPAVX1-NEXT: retq
+; XOP-LABEL: constant_shift_v4i8:
+; XOP: # %bb.0:
+; XOP-NEXT: vpshab {{.*}}(%rip), %xmm0, %xmm0
+; XOP-NEXT: retq
;
-; XOPAVX2-LABEL: constant_shift_v4i8:
-; XOPAVX2: # %bb.0:
-; XOPAVX2-NEXT: vpslld $24, %xmm0, %xmm0
-; XOPAVX2-NEXT: vpsrad $24, %xmm0, %xmm0
-; XOPAVX2-NEXT: vpsravd {{.*}}(%rip), %xmm0, %xmm0
-; XOPAVX2-NEXT: retq
+; AVX512DQ-LABEL: constant_shift_v4i8:
+; AVX512DQ: # %bb.0:
+; AVX512DQ-NEXT: vpmovsxbd %xmm0, %zmm0
+; AVX512DQ-NEXT: vpsravd {{.*}}(%rip), %zmm0, %zmm0
+; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512DQ-NEXT: vzeroupper
+; AVX512DQ-NEXT: retq
;
-; AVX512-LABEL: constant_shift_v4i8:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vpslld $24, %xmm0, %xmm0
-; AVX512-NEXT: vpsrad $24, %xmm0, %xmm0
-; AVX512-NEXT: vpsravd {{.*}}(%rip), %xmm0, %xmm0
-; AVX512-NEXT: retq
+; AVX512BW-LABEL: constant_shift_v4i8:
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,2,3,0,0,0,0,0,0,0,0,0,0,0,0]
+; AVX512BW-NEXT: vpmovsxbw %xmm0, %ymm0
+; AVX512BW-NEXT: vpsravw %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
+; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX512BW-NEXT: vzeroupper
+; AVX512BW-NEXT: retq
;
-; AVX512VL-LABEL: constant_shift_v4i8:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpslld $24, %xmm0, %xmm0
-; AVX512VL-NEXT: vpsrad $24, %xmm0, %xmm0
-; AVX512VL-NEXT: vpsravd {{.*}}(%rip), %xmm0, %xmm0
-; AVX512VL-NEXT: retq
+; AVX512DQVL-LABEL: constant_shift_v4i8:
+; AVX512DQVL: # %bb.0:
+; AVX512DQVL-NEXT: vpmovsxbd %xmm0, %zmm0
+; AVX512DQVL-NEXT: vpsravd {{.*}}(%rip), %zmm0, %zmm0
+; AVX512DQVL-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512DQVL-NEXT: vzeroupper
+; AVX512DQVL-NEXT: retq
+;
+; AVX512BWVL-LABEL: constant_shift_v4i8:
+; AVX512BWVL: # %bb.0:
+; AVX512BWVL-NEXT: vpmovsxbw %xmm0, %ymm0
+; AVX512BWVL-NEXT: vpsravw {{.*}}(%rip), %ymm0, %ymm0
+; AVX512BWVL-NEXT: vpmovwb %ymm0, %xmm0
+; AVX512BWVL-NEXT: vzeroupper
+; AVX512BWVL-NEXT: retq
;
; X32-SSE-LABEL: constant_shift_v4i8:
; X32-SSE: # %bb.0:
-; X32-SSE-NEXT: pslld $24, %xmm0
-; X32-SSE-NEXT: movdqa %xmm0, %xmm1
-; X32-SSE-NEXT: psrad $24, %xmm1
+; X32-SSE-NEXT: pxor %xmm1, %xmm1
; X32-SSE-NEXT: movdqa %xmm0, %xmm2
-; X32-SSE-NEXT: psrad $27, %xmm2
-; X32-SSE-NEXT: movdqa %xmm0, %xmm3
-; X32-SSE-NEXT: psrad $26, %xmm3
-; X32-SSE-NEXT: punpckhqdq {{.*#+}} xmm3 = xmm3[1],xmm2[1]
-; X32-SSE-NEXT: psrad $25, %xmm0
-; X32-SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
-; X32-SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm3[0,3]
-; X32-SSE-NEXT: movaps %xmm1, %xmm0
+; X32-SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
+; X32-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; X32-SSE-NEXT: psraw $8, %xmm0
+; X32-SSE-NEXT: pmullw {{\.LCPI.*}}, %xmm0
+; X32-SSE-NEXT: psrlw $8, %xmm0
+; X32-SSE-NEXT: packuswb %xmm2, %xmm0
; X32-SSE-NEXT: retl
%shift = ashr <4 x i8> %a, <i8 0, i8 1, i8 2, i8 3>
ret <4 x i8> %shift
}
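The AVX512 variants take the extend/shift/truncate route instead: DQ targets sign-extend the bytes to dwords and use vpsravd, BW targets sign-extend to words and use vpsravw, then truncate back with vpmovdb/vpmovwb. A scalar sketch of why that is equivalent, illustrative only and not part of the test (hypothetical name):

define i8 @ashr_i8_via_wider_sketch(i8 %x) {
  %w = sext i8 %x to i16
  %s = ashr i16 %w, 3
  %r = trunc i16 %s to i8       ; sign-extend, shift wide, truncate = ashr i8 %x, 3
  ret i8 %r
}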
define <2 x i8> @constant_shift_v2i8(<2 x i8> %a) nounwind {
-; SSE2-LABEL: constant_shift_v2i8:
-; SSE2: # %bb.0:
-; SSE2-NEXT: psllq $56, %xmm0
-; SSE2-NEXT: movdqa %xmm0, %xmm1
-; SSE2-NEXT: psrad $31, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3]
-; SSE2-NEXT: psrad $24, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
-; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; SSE2-NEXT: movdqa %xmm0, %xmm1
-; SSE2-NEXT: psrlq $2, %xmm1
-; SSE2-NEXT: psrlq $3, %xmm0
-; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
-; SSE2-NEXT: movapd {{.*#+}} xmm1 = [2305843009213693952,1152921504606846976]
-; SSE2-NEXT: xorpd %xmm1, %xmm0
-; SSE2-NEXT: psubq %xmm1, %xmm0
-; SSE2-NEXT: retq
-;
-; SSE41-LABEL: constant_shift_v2i8:
-; SSE41: # %bb.0:
-; SSE41-NEXT: psllq $56, %xmm0
-; SSE41-NEXT: movdqa %xmm0, %xmm1
-; SSE41-NEXT: psrad $31, %xmm1
-; SSE41-NEXT: psrad $24, %xmm0
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
-; SSE41-NEXT: movdqa %xmm0, %xmm1
-; SSE41-NEXT: psrlq $3, %xmm1
-; SSE41-NEXT: psrlq $2, %xmm0
-; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
-; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [2305843009213693952,1152921504606846976]
-; SSE41-NEXT: pxor %xmm1, %xmm0
-; SSE41-NEXT: psubq %xmm1, %xmm0
-; SSE41-NEXT: retq
+; SSE-LABEL: constant_shift_v2i8:
+; SSE: # %bb.0:
+; SSE-NEXT: pxor %xmm1, %xmm1
+; SSE-NEXT: movdqa %xmm0, %xmm2
+; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
+; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE-NEXT: psraw $8, %xmm0
+; SSE-NEXT: pmullw {{.*}}(%rip), %xmm0
+; SSE-NEXT: psrlw $8, %xmm0
+; SSE-NEXT: packuswb %xmm2, %xmm0
+; SSE-NEXT: retq
;
; AVX1-LABEL: constant_shift_v2i8:
; AVX1: # %bb.0:
-; AVX1-NEXT: vpsllq $56, %xmm0, %xmm0
-; AVX1-NEXT: vpsrad $31, %xmm0, %xmm1
-; AVX1-NEXT: vpsrad $24, %xmm0, %xmm0
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
-; AVX1-NEXT: vpsrlq $3, %xmm0, %xmm1
-; AVX1-NEXT: vpsrlq $2, %xmm0, %xmm0
-; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [2305843009213693952,1152921504606846976]
-; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpsubq %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
+; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; AVX1-NEXT: vpsraw $8, %xmm0, %xmm0
+; AVX1-NEXT: vpmullw {{.*}}(%rip), %xmm0, %xmm0
+; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm0
+; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: constant_shift_v2i8:
; AVX2: # %bb.0:
-; AVX2-NEXT: vpsllq $56, %xmm0, %xmm0
-; AVX2-NEXT: vpsrad $31, %xmm0, %xmm1
-; AVX2-NEXT: vpsrad $24, %xmm0, %xmm0
-; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
-; AVX2-NEXT: vpsrlvq {{.*}}(%rip), %xmm0, %xmm0
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [2305843009213693952,1152921504606846976]
-; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpsubq %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpmovsxbw %xmm0, %ymm0
+; AVX2-NEXT: vpmullw {{.*}}(%rip), %ymm0, %ymm0
+; AVX2-NEXT: vpsrlw $8, %ymm0, %ymm0
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; XOP-LABEL: constant_shift_v2i8:
; XOP: # %bb.0:
-; XOP-NEXT: vpsllq $56, %xmm0, %xmm0
-; XOP-NEXT: vpshaq {{.*}}(%rip), %xmm0, %xmm0
-; XOP-NEXT: vpshaq {{.*}}(%rip), %xmm0, %xmm0
+; XOP-NEXT: vpshab {{.*}}(%rip), %xmm0, %xmm0
; XOP-NEXT: retq
;
-; AVX512-LABEL: constant_shift_v2i8:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vpsllq $56, %xmm0, %xmm0
-; AVX512-NEXT: vpsraq $56, %zmm0, %zmm0
-; AVX512-NEXT: vmovdqa {{.*#+}} xmm1 = [2,3]
-; AVX512-NEXT: vpsravq %zmm1, %zmm0, %zmm0
-; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
-; AVX512-NEXT: vzeroupper
-; AVX512-NEXT: retq
+; AVX512DQ-LABEL: constant_shift_v2i8:
+; AVX512DQ: # %bb.0:
+; AVX512DQ-NEXT: vpmovsxbd %xmm0, %zmm0
+; AVX512DQ-NEXT: vpsravd {{.*}}(%rip), %zmm0, %zmm0
+; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512DQ-NEXT: vzeroupper
+; AVX512DQ-NEXT: retq
;
-; AVX512VL-LABEL: constant_shift_v2i8:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpsllq $56, %xmm0, %xmm0
-; AVX512VL-NEXT: vpsraq $56, %xmm0, %xmm0
-; AVX512VL-NEXT: vpsravq {{.*}}(%rip), %xmm0, %xmm0
-; AVX512VL-NEXT: retq
+; AVX512BW-LABEL: constant_shift_v2i8:
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm1 = [2,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
+; AVX512BW-NEXT: vpmovsxbw %xmm0, %ymm0
+; AVX512BW-NEXT: vpsravw %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
+; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX512BW-NEXT: vzeroupper
+; AVX512BW-NEXT: retq
+;
+; AVX512DQVL-LABEL: constant_shift_v2i8:
+; AVX512DQVL: # %bb.0:
+; AVX512DQVL-NEXT: vpmovsxbd %xmm0, %zmm0
+; AVX512DQVL-NEXT: vpsravd {{.*}}(%rip), %zmm0, %zmm0
+; AVX512DQVL-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512DQVL-NEXT: vzeroupper
+; AVX512DQVL-NEXT: retq
+;
+; AVX512BWVL-LABEL: constant_shift_v2i8:
+; AVX512BWVL: # %bb.0:
+; AVX512BWVL-NEXT: vpmovsxbw %xmm0, %ymm0
+; AVX512BWVL-NEXT: vpsravw {{.*}}(%rip), %ymm0, %ymm0
+; AVX512BWVL-NEXT: vpmovwb %ymm0, %xmm0
+; AVX512BWVL-NEXT: vzeroupper
+; AVX512BWVL-NEXT: retq
;
; X32-SSE-LABEL: constant_shift_v2i8:
; X32-SSE: # %bb.0:
-; X32-SSE-NEXT: psllq $56, %xmm0
-; X32-SSE-NEXT: movdqa %xmm0, %xmm1
-; X32-SSE-NEXT: psrad $31, %xmm1
-; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3]
-; X32-SSE-NEXT: psrad $24, %xmm0
-; X32-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
-; X32-SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; X32-SSE-NEXT: movdqa %xmm0, %xmm1
-; X32-SSE-NEXT: psrlq $2, %xmm1
-; X32-SSE-NEXT: psrlq $3, %xmm0
-; X32-SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
-; X32-SSE-NEXT: movapd {{.*#+}} xmm1 = [1.4916681462400413E-154,1.2882297539194267E-231]
-; X32-SSE-NEXT: xorpd %xmm1, %xmm0
-; X32-SSE-NEXT: psubq %xmm1, %xmm0
+; X32-SSE-NEXT: pxor %xmm1, %xmm1
+; X32-SSE-NEXT: movdqa %xmm0, %xmm2
+; X32-SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
+; X32-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; X32-SSE-NEXT: psraw $8, %xmm0
+; X32-SSE-NEXT: pmullw {{\.LCPI.*}}, %xmm0
+; X32-SSE-NEXT: psrlw $8, %xmm0
+; X32-SSE-NEXT: packuswb %xmm2, %xmm0
; X32-SSE-NEXT: retl
%shift = ashr <2 x i8> %a, <i8 2, i8 3>
ret <2 x i8> %shift
;
define <2 x i32> @splatconstant_shift_v2i32(<2 x i32> %a) nounwind {
-; SSE2-LABEL: splatconstant_shift_v2i32:
-; SSE2: # %bb.0:
-; SSE2-NEXT: psllq $32, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,3,2,3]
-; SSE2-NEXT: psrad $31, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
-; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; SSE2-NEXT: movdqa %xmm1, %xmm0
-; SSE2-NEXT: psrad $5, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,3,2,3]
-; SSE2-NEXT: psrlq $5, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3]
-; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
-; SSE2-NEXT: retq
-;
-; SSE41-LABEL: splatconstant_shift_v2i32:
-; SSE41: # %bb.0:
-; SSE41-NEXT: movdqa %xmm0, %xmm1
-; SSE41-NEXT: psllq $32, %xmm1
-; SSE41-NEXT: psrad $31, %xmm1
-; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
-; SSE41-NEXT: movdqa %xmm1, %xmm0
-; SSE41-NEXT: psrad $5, %xmm0
-; SSE41-NEXT: psrlq $5, %xmm1
-; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3],xmm1[4,5],xmm0[6,7]
-; SSE41-NEXT: movdqa %xmm1, %xmm0
-; SSE41-NEXT: retq
-;
-; AVX1-LABEL: splatconstant_shift_v2i32:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vpsllq $32, %xmm0, %xmm1
-; AVX1-NEXT: vpsrad $31, %xmm1, %xmm1
-; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
-; AVX1-NEXT: vpsrad $5, %xmm0, %xmm1
-; AVX1-NEXT: vpsrlq $5, %xmm0, %xmm0
-; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
-; AVX1-NEXT: retq
+; SSE-LABEL: splatconstant_shift_v2i32:
+; SSE: # %bb.0:
+; SSE-NEXT: psrad $5, %xmm0
+; SSE-NEXT: retq
;
-; AVX2-LABEL: splatconstant_shift_v2i32:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpsllq $32, %xmm0, %xmm1
-; AVX2-NEXT: vpsrad $31, %xmm1, %xmm1
-; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
-; AVX2-NEXT: vpsrad $5, %xmm0, %xmm1
-; AVX2-NEXT: vpsrlq $5, %xmm0, %xmm0
-; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
-; AVX2-NEXT: retq
+; AVX-LABEL: splatconstant_shift_v2i32:
+; AVX: # %bb.0:
+; AVX-NEXT: vpsrad $5, %xmm0, %xmm0
+; AVX-NEXT: retq
;
; XOP-LABEL: splatconstant_shift_v2i32:
; XOP: # %bb.0:
-; XOP-NEXT: vpsllq $32, %xmm0, %xmm0
-; XOP-NEXT: vpshaq {{.*}}(%rip), %xmm0, %xmm0
-; XOP-NEXT: vpshaq {{.*}}(%rip), %xmm0, %xmm0
+; XOP-NEXT: vpsrad $5, %xmm0, %xmm0
; XOP-NEXT: retq
;
; AVX512-LABEL: splatconstant_shift_v2i32:
; AVX512: # %bb.0:
-; AVX512-NEXT: vpsllq $32, %xmm0, %xmm0
-; AVX512-NEXT: vpsraq $37, %zmm0, %zmm0
-; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
-; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: vpsrad $5, %xmm0, %xmm0
; AVX512-NEXT: retq
;
; AVX512VL-LABEL: splatconstant_shift_v2i32:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpsllq $32, %xmm0, %xmm0
-; AVX512VL-NEXT: vpsraq $37, %xmm0, %xmm0
+; AVX512VL-NEXT: vpsrad $5, %xmm0, %xmm0
; AVX512VL-NEXT: retq
;
; X32-SSE-LABEL: splatconstant_shift_v2i32:
; X32-SSE: # %bb.0:
-; X32-SSE-NEXT: psllq $32, %xmm0
-; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,3,2,3]
-; X32-SSE-NEXT: psrad $31, %xmm0
-; X32-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
-; X32-SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; X32-SSE-NEXT: movdqa %xmm1, %xmm0
; X32-SSE-NEXT: psrad $5, %xmm0
-; X32-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,3,2,3]
-; X32-SSE-NEXT: psrlq $5, %xmm1
-; X32-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3]
-; X32-SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; X32-SSE-NEXT: retl
%shift = ashr <2 x i32> %a, <i32 5, i32 5>
ret <2 x i32> %shift
define <4 x i16> @splatconstant_shift_v4i16(<4 x i16> %a) nounwind {
; SSE-LABEL: splatconstant_shift_v4i16:
; SSE: # %bb.0:
-; SSE-NEXT: pslld $16, %xmm0
-; SSE-NEXT: psrad $19, %xmm0
+; SSE-NEXT: psraw $3, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: splatconstant_shift_v4i16:
; AVX: # %bb.0:
-; AVX-NEXT: vpslld $16, %xmm0, %xmm0
-; AVX-NEXT: vpsrad $19, %xmm0, %xmm0
+; AVX-NEXT: vpsraw $3, %xmm0, %xmm0
; AVX-NEXT: retq
;
; XOP-LABEL: splatconstant_shift_v4i16:
; XOP: # %bb.0:
-; XOP-NEXT: vpslld $16, %xmm0, %xmm0
-; XOP-NEXT: vpsrad $19, %xmm0, %xmm0
+; XOP-NEXT: vpsraw $3, %xmm0, %xmm0
; XOP-NEXT: retq
;
; AVX512-LABEL: splatconstant_shift_v4i16:
; AVX512: # %bb.0:
-; AVX512-NEXT: vpslld $16, %xmm0, %xmm0
-; AVX512-NEXT: vpsrad $19, %xmm0, %xmm0
+; AVX512-NEXT: vpsraw $3, %xmm0, %xmm0
; AVX512-NEXT: retq
;
; AVX512VL-LABEL: splatconstant_shift_v4i16:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpslld $16, %xmm0, %xmm0
-; AVX512VL-NEXT: vpsrad $19, %xmm0, %xmm0
+; AVX512VL-NEXT: vpsraw $3, %xmm0, %xmm0
; AVX512VL-NEXT: retq
;
; X32-SSE-LABEL: splatconstant_shift_v4i16:
; X32-SSE: # %bb.0:
-; X32-SSE-NEXT: pslld $16, %xmm0
-; X32-SSE-NEXT: psrad $19, %xmm0
+; X32-SSE-NEXT: psraw $3, %xmm0
; X32-SSE-NEXT: retl
%shift = ashr <4 x i16> %a, <i16 3, i16 3, i16 3, i16 3>
ret <4 x i16> %shift
}
define <2 x i16> @splatconstant_shift_v2i16(<2 x i16> %a) nounwind {
-; SSE2-LABEL: splatconstant_shift_v2i16:
-; SSE2: # %bb.0:
-; SSE2-NEXT: psllq $48, %xmm0
-; SSE2-NEXT: movdqa %xmm0, %xmm1
-; SSE2-NEXT: psrad $31, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3]
-; SSE2-NEXT: psrad $16, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
-; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; SSE2-NEXT: movdqa %xmm0, %xmm1
-; SSE2-NEXT: psrad $3, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3]
-; SSE2-NEXT: psrlq $3, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; SSE2-NEXT: retq
-;
-; SSE41-LABEL: splatconstant_shift_v2i16:
-; SSE41: # %bb.0:
-; SSE41-NEXT: psllq $48, %xmm0
-; SSE41-NEXT: movdqa %xmm0, %xmm1
-; SSE41-NEXT: psrad $31, %xmm1
-; SSE41-NEXT: psrad $16, %xmm0
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
-; SSE41-NEXT: movdqa %xmm0, %xmm1
-; SSE41-NEXT: psrad $3, %xmm1
-; SSE41-NEXT: psrlq $3, %xmm0
-; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
-; SSE41-NEXT: retq
-;
-; AVX1-LABEL: splatconstant_shift_v2i16:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vpsllq $48, %xmm0, %xmm0
-; AVX1-NEXT: vpsrad $31, %xmm0, %xmm1
-; AVX1-NEXT: vpsrad $16, %xmm0, %xmm0
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
-; AVX1-NEXT: vpsrad $3, %xmm0, %xmm1
-; AVX1-NEXT: vpsrlq $3, %xmm0, %xmm0
-; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
-; AVX1-NEXT: retq
+; SSE-LABEL: splatconstant_shift_v2i16:
+; SSE: # %bb.0:
+; SSE-NEXT: psraw $3, %xmm0
+; SSE-NEXT: retq
;
-; AVX2-LABEL: splatconstant_shift_v2i16:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpsllq $48, %xmm0, %xmm0
-; AVX2-NEXT: vpsrad $31, %xmm0, %xmm1
-; AVX2-NEXT: vpsrad $16, %xmm0, %xmm0
-; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
-; AVX2-NEXT: vpsrad $3, %xmm0, %xmm1
-; AVX2-NEXT: vpsrlq $3, %xmm0, %xmm0
-; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
-; AVX2-NEXT: retq
+; AVX-LABEL: splatconstant_shift_v2i16:
+; AVX: # %bb.0:
+; AVX-NEXT: vpsraw $3, %xmm0, %xmm0
+; AVX-NEXT: retq
;
; XOP-LABEL: splatconstant_shift_v2i16:
; XOP: # %bb.0:
-; XOP-NEXT: vpsllq $48, %xmm0, %xmm0
-; XOP-NEXT: vpshaq {{.*}}(%rip), %xmm0, %xmm0
-; XOP-NEXT: vpshaq {{.*}}(%rip), %xmm0, %xmm0
+; XOP-NEXT: vpsraw $3, %xmm0, %xmm0
; XOP-NEXT: retq
;
; AVX512-LABEL: splatconstant_shift_v2i16:
; AVX512: # %bb.0:
-; AVX512-NEXT: vpsllq $48, %xmm0, %xmm0
-; AVX512-NEXT: vpsraq $51, %zmm0, %zmm0
-; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
-; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: vpsraw $3, %xmm0, %xmm0
; AVX512-NEXT: retq
;
; AVX512VL-LABEL: splatconstant_shift_v2i16:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpsllq $48, %xmm0, %xmm0
-; AVX512VL-NEXT: vpsraq $51, %xmm0, %xmm0
+; AVX512VL-NEXT: vpsraw $3, %xmm0, %xmm0
; AVX512VL-NEXT: retq
;
; X32-SSE-LABEL: splatconstant_shift_v2i16:
; X32-SSE: # %bb.0:
-; X32-SSE-NEXT: psllq $48, %xmm0
-; X32-SSE-NEXT: movdqa %xmm0, %xmm1
-; X32-SSE-NEXT: psrad $31, %xmm1
-; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3]
-; X32-SSE-NEXT: psrad $16, %xmm0
-; X32-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
-; X32-SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; X32-SSE-NEXT: movdqa %xmm0, %xmm1
-; X32-SSE-NEXT: psrad $3, %xmm1
-; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3]
-; X32-SSE-NEXT: psrlq $3, %xmm0
-; X32-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; X32-SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; X32-SSE-NEXT: psraw $3, %xmm0
; X32-SSE-NEXT: retl
%shift = ashr <2 x i16> %a, <i16 3, i16 3>
ret <2 x i16> %shift
define <8 x i8> @splatconstant_shift_v8i8(<8 x i8> %a) nounwind {
; SSE-LABEL: splatconstant_shift_v8i8:
; SSE: # %bb.0:
-; SSE-NEXT: psllw $8, %xmm0
-; SSE-NEXT: psraw $11, %xmm0
+; SSE-NEXT: psrlw $3, %xmm0
+; SSE-NEXT: pand {{.*}}(%rip), %xmm0
+; SSE-NEXT: movdqa {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; SSE-NEXT: pxor %xmm1, %xmm0
+; SSE-NEXT: psubb %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: splatconstant_shift_v8i8:
; AVX: # %bb.0:
-; AVX-NEXT: vpsllw $8, %xmm0, %xmm0
-; AVX-NEXT: vpsraw $11, %xmm0, %xmm0
+; AVX-NEXT: vpsrlw $3, %xmm0, %xmm0
+; AVX-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
+; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vpsubb %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
;
; XOP-LABEL: splatconstant_shift_v8i8:
; XOP: # %bb.0:
-; XOP-NEXT: vpsllw $8, %xmm0, %xmm0
-; XOP-NEXT: vpsraw $11, %xmm0, %xmm0
+; XOP-NEXT: vpshab {{.*}}(%rip), %xmm0, %xmm0
; XOP-NEXT: retq
;
; AVX512-LABEL: splatconstant_shift_v8i8:
; AVX512: # %bb.0:
-; AVX512-NEXT: vpsllw $8, %xmm0, %xmm0
-; AVX512-NEXT: vpsraw $11, %xmm0, %xmm0
+; AVX512-NEXT: vpsrlw $3, %xmm0, %xmm0
+; AVX512-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
+; AVX512-NEXT: vmovdqa {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; AVX512-NEXT: vpsubb %xmm1, %xmm0, %xmm0
; AVX512-NEXT: retq
;
; AVX512VL-LABEL: splatconstant_shift_v8i8:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpsllw $8, %xmm0, %xmm0
-; AVX512VL-NEXT: vpsraw $11, %xmm0, %xmm0
+; AVX512VL-NEXT: vpsrlw $3, %xmm0, %xmm0
+; AVX512VL-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
+; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; AVX512VL-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; AVX512VL-NEXT: vpsubb %xmm1, %xmm0, %xmm0
; AVX512VL-NEXT: retq
;
; X32-SSE-LABEL: splatconstant_shift_v8i8:
; X32-SSE: # %bb.0:
-; X32-SSE-NEXT: psllw $8, %xmm0
-; X32-SSE-NEXT: psraw $11, %xmm0
+; X32-SSE-NEXT: psrlw $3, %xmm0
+; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm0
+; X32-SSE-NEXT: movdqa {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; X32-SSE-NEXT: pxor %xmm1, %xmm0
+; X32-SSE-NEXT: psubb %xmm1, %xmm0
; X32-SSE-NEXT: retl
%shift = ashr <8 x i8> %a, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>
ret <8 x i8> %shift
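For a uniform count the widened byte shift is cheaper still: a word psrlw $3, a pand with what should be a 0xFF >> 3 byte mask to drop the bits pulled across byte boundaries, and the xor/psubb fixup with 16 = 0x80 >> 3 (the [16,16,...] constant above) to restore the sign. A scalar sketch of the fixup identity, illustrative only and not part of the test (hypothetical name):

define i8 @ashr_i8_by_3_fixup_sketch(i8 %x) {
  %l = lshr i8 %x, 3
  %t = xor i8 %l, 16            ; 16 = 0x80 >> 3
  %r = sub i8 %t, 16            ; equals ashr i8 %x, 3
  ret i8 %r
}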
define <4 x i8> @splatconstant_shift_v4i8(<4 x i8> %a) nounwind {
; SSE-LABEL: splatconstant_shift_v4i8:
; SSE: # %bb.0:
-; SSE-NEXT: pslld $24, %xmm0
-; SSE-NEXT: psrad $27, %xmm0
+; SSE-NEXT: psrlw $3, %xmm0
+; SSE-NEXT: pand {{.*}}(%rip), %xmm0
+; SSE-NEXT: movdqa {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; SSE-NEXT: pxor %xmm1, %xmm0
+; SSE-NEXT: psubb %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: splatconstant_shift_v4i8:
; AVX: # %bb.0:
-; AVX-NEXT: vpslld $24, %xmm0, %xmm0
-; AVX-NEXT: vpsrad $27, %xmm0, %xmm0
+; AVX-NEXT: vpsrlw $3, %xmm0, %xmm0
+; AVX-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
+; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vpsubb %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
;
; XOP-LABEL: splatconstant_shift_v4i8:
; XOP: # %bb.0:
-; XOP-NEXT: vpslld $24, %xmm0, %xmm0
-; XOP-NEXT: vpsrad $27, %xmm0, %xmm0
+; XOP-NEXT: vpshab {{.*}}(%rip), %xmm0, %xmm0
; XOP-NEXT: retq
;
; AVX512-LABEL: splatconstant_shift_v4i8:
; AVX512: # %bb.0:
-; AVX512-NEXT: vpslld $24, %xmm0, %xmm0
-; AVX512-NEXT: vpsrad $27, %xmm0, %xmm0
+; AVX512-NEXT: vpsrlw $3, %xmm0, %xmm0
+; AVX512-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
+; AVX512-NEXT: vmovdqa {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; AVX512-NEXT: vpsubb %xmm1, %xmm0, %xmm0
; AVX512-NEXT: retq
;
; AVX512VL-LABEL: splatconstant_shift_v4i8:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpslld $24, %xmm0, %xmm0
-; AVX512VL-NEXT: vpsrad $27, %xmm0, %xmm0
+; AVX512VL-NEXT: vpsrlw $3, %xmm0, %xmm0
+; AVX512VL-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
+; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; AVX512VL-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; AVX512VL-NEXT: vpsubb %xmm1, %xmm0, %xmm0
; AVX512VL-NEXT: retq
;
; X32-SSE-LABEL: splatconstant_shift_v4i8:
; X32-SSE: # %bb.0:
-; X32-SSE-NEXT: pslld $24, %xmm0
-; X32-SSE-NEXT: psrad $27, %xmm0
+; X32-SSE-NEXT: psrlw $3, %xmm0
+; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm0
+; X32-SSE-NEXT: movdqa {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; X32-SSE-NEXT: pxor %xmm1, %xmm0
+; X32-SSE-NEXT: psubb %xmm1, %xmm0
; X32-SSE-NEXT: retl
%shift = ashr <4 x i8> %a, <i8 3, i8 3, i8 3, i8 3>
ret <4 x i8> %shift
}
define <2 x i8> @splatconstant_shift_v2i8(<2 x i8> %a) nounwind {
-; SSE2-LABEL: splatconstant_shift_v2i8:
-; SSE2: # %bb.0:
-; SSE2-NEXT: psllq $56, %xmm0
-; SSE2-NEXT: movdqa %xmm0, %xmm1
-; SSE2-NEXT: psrad $31, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3]
-; SSE2-NEXT: psrad $24, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
-; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; SSE2-NEXT: movdqa %xmm0, %xmm1
-; SSE2-NEXT: psrad $3, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3]
-; SSE2-NEXT: psrlq $3, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; SSE2-NEXT: retq
-;
-; SSE41-LABEL: splatconstant_shift_v2i8:
-; SSE41: # %bb.0:
-; SSE41-NEXT: psllq $56, %xmm0
-; SSE41-NEXT: movdqa %xmm0, %xmm1
-; SSE41-NEXT: psrad $31, %xmm1
-; SSE41-NEXT: psrad $24, %xmm0
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
-; SSE41-NEXT: movdqa %xmm0, %xmm1
-; SSE41-NEXT: psrad $3, %xmm1
-; SSE41-NEXT: psrlq $3, %xmm0
-; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
-; SSE41-NEXT: retq
-;
-; AVX1-LABEL: splatconstant_shift_v2i8:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vpsllq $56, %xmm0, %xmm0
-; AVX1-NEXT: vpsrad $31, %xmm0, %xmm1
-; AVX1-NEXT: vpsrad $24, %xmm0, %xmm0
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
-; AVX1-NEXT: vpsrad $3, %xmm0, %xmm1
-; AVX1-NEXT: vpsrlq $3, %xmm0, %xmm0
-; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
-; AVX1-NEXT: retq
+; SSE-LABEL: splatconstant_shift_v2i8:
+; SSE: # %bb.0:
+; SSE-NEXT: psrlw $3, %xmm0
+; SSE-NEXT: pand {{.*}}(%rip), %xmm0
+; SSE-NEXT: movdqa {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; SSE-NEXT: pxor %xmm1, %xmm0
+; SSE-NEXT: psubb %xmm1, %xmm0
+; SSE-NEXT: retq
;
-; AVX2-LABEL: splatconstant_shift_v2i8:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpsllq $56, %xmm0, %xmm0
-; AVX2-NEXT: vpsrad $31, %xmm0, %xmm1
-; AVX2-NEXT: vpsrad $24, %xmm0, %xmm0
-; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
-; AVX2-NEXT: vpsrad $3, %xmm0, %xmm1
-; AVX2-NEXT: vpsrlq $3, %xmm0, %xmm0
-; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
-; AVX2-NEXT: retq
+; AVX-LABEL: splatconstant_shift_v2i8:
+; AVX: # %bb.0:
+; AVX-NEXT: vpsrlw $3, %xmm0, %xmm0
+; AVX-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
+; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vpsubb %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retq
;
; XOP-LABEL: splatconstant_shift_v2i8:
; XOP: # %bb.0:
-; XOP-NEXT: vpsllq $56, %xmm0, %xmm0
-; XOP-NEXT: vpshaq {{.*}}(%rip), %xmm0, %xmm0
-; XOP-NEXT: vpshaq {{.*}}(%rip), %xmm0, %xmm0
+; XOP-NEXT: vpshab {{.*}}(%rip), %xmm0, %xmm0
; XOP-NEXT: retq
;
; AVX512-LABEL: splatconstant_shift_v2i8:
; AVX512: # %bb.0:
-; AVX512-NEXT: vpsllq $56, %xmm0, %xmm0
-; AVX512-NEXT: vpsraq $59, %zmm0, %zmm0
-; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
-; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: vpsrlw $3, %xmm0, %xmm0
+; AVX512-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
+; AVX512-NEXT: vmovdqa {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; AVX512-NEXT: vpsubb %xmm1, %xmm0, %xmm0
; AVX512-NEXT: retq
;
; AVX512VL-LABEL: splatconstant_shift_v2i8:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpsllq $56, %xmm0, %xmm0
-; AVX512VL-NEXT: vpsraq $59, %xmm0, %xmm0
+; AVX512VL-NEXT: vpsrlw $3, %xmm0, %xmm0
+; AVX512VL-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
+; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; AVX512VL-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; AVX512VL-NEXT: vpsubb %xmm1, %xmm0, %xmm0
; AVX512VL-NEXT: retq
;
; X32-SSE-LABEL: splatconstant_shift_v2i8:
; X32-SSE: # %bb.0:
-; X32-SSE-NEXT: psllq $56, %xmm0
-; X32-SSE-NEXT: movdqa %xmm0, %xmm1
-; X32-SSE-NEXT: psrad $31, %xmm1
-; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3]
-; X32-SSE-NEXT: psrad $24, %xmm0
-; X32-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
-; X32-SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; X32-SSE-NEXT: movdqa %xmm0, %xmm1
-; X32-SSE-NEXT: psrad $3, %xmm1
-; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3]
-; X32-SSE-NEXT: psrlq $3, %xmm0
-; X32-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; X32-SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; X32-SSE-NEXT: psrlw $3, %xmm0
+; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm0
+; X32-SSE-NEXT: movdqa {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; X32-SSE-NEXT: pxor %xmm1, %xmm0
+; X32-SSE-NEXT: psubb %xmm1, %xmm0
; X32-SSE-NEXT: retl
%shift = ashr <2 x i8> %a, <i8 3, i8 3>
ret <2 x i8> %shift
; SSE-NEXT: movd %r9d, %xmm0
; SSE-NEXT: movd %r8d, %xmm1
; SSE-NEXT: xorl %ecx, %ecx
-; SSE-NEXT: pmovzxdq {{.*#+}} xmm13 = xmm1[0],zero,xmm1[1],zero
-; SSE-NEXT: pmovzxdq {{.*#+}} xmm14 = xmm0[0],zero,xmm0[1],zero
+; SSE-NEXT: pmovzxdq {{.*#+}} xmm14 = xmm1[0],zero,xmm1[1],zero
+; SSE-NEXT: pmovzxdq {{.*#+}} xmm15 = xmm0[0],zero,xmm0[1],zero
; SSE-NEXT: .p2align 4, 0x90
; SSE-NEXT: .LBB0_4: # %vector.body
; SSE-NEXT: # =>This Inner Loop Header: Depth=1
-; SSE-NEXT: pmovzxbw {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
-; SSE-NEXT: pmovzxbw {{.*#+}} xmm3 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
-; SSE-NEXT: pmovzxbw {{.*#+}} xmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
-; SSE-NEXT: pmovzxbw {{.*#+}} xmm15 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
+; SSE-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
+; SSE-NEXT: movq {{.*#+}} xmm3 = mem[0],zero
+; SSE-NEXT: movq {{.*#+}} xmm4 = mem[0],zero
+; SSE-NEXT: movq {{.*#+}} xmm5 = mem[0],zero
; SSE-NEXT: pxor %xmm1, %xmm1
-; SSE-NEXT: pcmpeqw %xmm1, %xmm0
-; SSE-NEXT: pmovzxwd {{.*#+}} xmm12 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; SSE-NEXT: pslld $24, %xmm12
-; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7]
-; SSE-NEXT: pslld $24, %xmm0
-; SSE-NEXT: pcmpeqw %xmm1, %xmm3
-; SSE-NEXT: pmovzxwd {{.*#+}} xmm11 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero
-; SSE-NEXT: pslld $24, %xmm11
-; SSE-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
-; SSE-NEXT: pslld $24, %xmm3
-; SSE-NEXT: pcmpeqw %xmm1, %xmm2
-; SSE-NEXT: pmovzxwd {{.*#+}} xmm9 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
-; SSE-NEXT: pslld $24, %xmm9
-; SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
-; SSE-NEXT: pslld $24, %xmm2
-; SSE-NEXT: pcmpeqw %xmm1, %xmm15
-; SSE-NEXT: pmovzxwd {{.*#+}} xmm8 = xmm15[0],zero,xmm15[1],zero,xmm15[2],zero,xmm15[3],zero
-; SSE-NEXT: pslld $24, %xmm8
-; SSE-NEXT: punpckhwd {{.*#+}} xmm15 = xmm15[4],xmm0[4],xmm15[5],xmm0[5],xmm15[6],xmm0[6],xmm15[7],xmm0[7]
-; SSE-NEXT: pslld $24, %xmm15
-; SSE-NEXT: movdqu 16(%rdi,%rcx,4), %xmm6
-; SSE-NEXT: movdqa %xmm6, %xmm4
-; SSE-NEXT: pslld %xmm14, %xmm4
-; SSE-NEXT: pslld %xmm13, %xmm6
-; SSE-NEXT: blendvps %xmm0, %xmm4, %xmm6
+; SSE-NEXT: pcmpeqb %xmm1, %xmm0
+; SSE-NEXT: pmovsxbd %xmm0, %xmm7
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
+; SSE-NEXT: pmovsxbd %xmm0, %xmm0
+; SSE-NEXT: pcmpeqb %xmm1, %xmm3
+; SSE-NEXT: pmovsxbd %xmm3, %xmm13
+; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,2,3]
+; SSE-NEXT: pmovsxbd %xmm3, %xmm6
+; SSE-NEXT: pcmpeqb %xmm1, %xmm4
+; SSE-NEXT: pmovsxbd %xmm4, %xmm11
+; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,2,3]
+; SSE-NEXT: pmovsxbd %xmm3, %xmm2
+; SSE-NEXT: pcmpeqb %xmm1, %xmm5
+; SSE-NEXT: pmovsxbd %xmm5, %xmm8
+; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm5[1,1,2,3]
+; SSE-NEXT: pmovsxbd %xmm3, %xmm9
+; SSE-NEXT: movdqu 16(%rdi,%rcx,4), %xmm3
+; SSE-NEXT: movdqa %xmm3, %xmm4
+; SSE-NEXT: pslld %xmm15, %xmm4
+; SSE-NEXT: pslld %xmm14, %xmm3
+; SSE-NEXT: blendvps %xmm0, %xmm4, %xmm3
; SSE-NEXT: movdqu (%rdi,%rcx,4), %xmm10
-; SSE-NEXT: movdqa %xmm10, %xmm4
-; SSE-NEXT: pslld %xmm14, %xmm4
-; SSE-NEXT: pslld %xmm13, %xmm10
-; SSE-NEXT: movdqa %xmm12, %xmm0
-; SSE-NEXT: blendvps %xmm0, %xmm4, %xmm10
+; SSE-NEXT: movdqa %xmm10, %xmm5
+; SSE-NEXT: pslld %xmm15, %xmm5
+; SSE-NEXT: pslld %xmm14, %xmm10
+; SSE-NEXT: movdqa %xmm7, %xmm0
+; SSE-NEXT: blendvps %xmm0, %xmm5, %xmm10
; SSE-NEXT: movdqu 48(%rdi,%rcx,4), %xmm12
; SSE-NEXT: movdqa %xmm12, %xmm5
-; SSE-NEXT: pslld %xmm14, %xmm5
-; SSE-NEXT: pslld %xmm13, %xmm12
-; SSE-NEXT: movdqa %xmm3, %xmm0
+; SSE-NEXT: pslld %xmm15, %xmm5
+; SSE-NEXT: pslld %xmm14, %xmm12
+; SSE-NEXT: movdqa %xmm6, %xmm0
; SSE-NEXT: blendvps %xmm0, %xmm5, %xmm12
-; SSE-NEXT: movdqu 32(%rdi,%rcx,4), %xmm3
-; SSE-NEXT: movdqa %xmm3, %xmm5
-; SSE-NEXT: pslld %xmm14, %xmm5
-; SSE-NEXT: pslld %xmm13, %xmm3
-; SSE-NEXT: movdqa %xmm11, %xmm0
-; SSE-NEXT: blendvps %xmm0, %xmm5, %xmm3
+; SSE-NEXT: movdqu 32(%rdi,%rcx,4), %xmm6
+; SSE-NEXT: movdqa %xmm6, %xmm5
+; SSE-NEXT: pslld %xmm15, %xmm5
+; SSE-NEXT: pslld %xmm14, %xmm6
+; SSE-NEXT: movdqa %xmm13, %xmm0
+; SSE-NEXT: blendvps %xmm0, %xmm5, %xmm6
; SSE-NEXT: movdqu 80(%rdi,%rcx,4), %xmm1
; SSE-NEXT: movdqa %xmm1, %xmm5
-; SSE-NEXT: pslld %xmm14, %xmm5
-; SSE-NEXT: pslld %xmm13, %xmm1
+; SSE-NEXT: pslld %xmm15, %xmm5
+; SSE-NEXT: pslld %xmm14, %xmm1
; SSE-NEXT: movdqa %xmm2, %xmm0
; SSE-NEXT: blendvps %xmm0, %xmm5, %xmm1
-; SSE-NEXT: movdqu 64(%rdi,%rcx,4), %xmm2
-; SSE-NEXT: movdqa %xmm2, %xmm5
+; SSE-NEXT: movdqu 64(%rdi,%rcx,4), %xmm5
+; SSE-NEXT: movdqa %xmm5, %xmm2
+; SSE-NEXT: pslld %xmm15, %xmm2
; SSE-NEXT: pslld %xmm14, %xmm5
-; SSE-NEXT: pslld %xmm13, %xmm2
+; SSE-NEXT: movdqa %xmm11, %xmm0
+; SSE-NEXT: blendvps %xmm0, %xmm2, %xmm5
+; SSE-NEXT: movdqu 112(%rdi,%rcx,4), %xmm2
+; SSE-NEXT: movdqa %xmm2, %xmm4
+; SSE-NEXT: pslld %xmm15, %xmm4
+; SSE-NEXT: pslld %xmm14, %xmm2
; SSE-NEXT: movdqa %xmm9, %xmm0
-; SSE-NEXT: blendvps %xmm0, %xmm5, %xmm2
-; SSE-NEXT: movdqu 112(%rdi,%rcx,4), %xmm5
-; SSE-NEXT: movdqa %xmm5, %xmm7
-; SSE-NEXT: pslld %xmm14, %xmm7
-; SSE-NEXT: pslld %xmm13, %xmm5
-; SSE-NEXT: movdqa %xmm15, %xmm0
-; SSE-NEXT: blendvps %xmm0, %xmm7, %xmm5
-; SSE-NEXT: movdqu 96(%rdi,%rcx,4), %xmm7
-; SSE-NEXT: movdqa %xmm7, %xmm4
+; SSE-NEXT: blendvps %xmm0, %xmm4, %xmm2
+; SSE-NEXT: movdqu 96(%rdi,%rcx,4), %xmm4
+; SSE-NEXT: movdqa %xmm4, %xmm7
+; SSE-NEXT: pslld %xmm15, %xmm7
; SSE-NEXT: pslld %xmm14, %xmm4
-; SSE-NEXT: pslld %xmm13, %xmm7
; SSE-NEXT: movdqa %xmm8, %xmm0
-; SSE-NEXT: blendvps %xmm0, %xmm4, %xmm7
+; SSE-NEXT: blendvps %xmm0, %xmm7, %xmm4
; SSE-NEXT: movups %xmm10, (%rdi,%rcx,4)
-; SSE-NEXT: movups %xmm6, 16(%rdi,%rcx,4)
-; SSE-NEXT: movups %xmm3, 32(%rdi,%rcx,4)
+; SSE-NEXT: movups %xmm3, 16(%rdi,%rcx,4)
+; SSE-NEXT: movups %xmm6, 32(%rdi,%rcx,4)
; SSE-NEXT: movups %xmm12, 48(%rdi,%rcx,4)
-; SSE-NEXT: movups %xmm2, 64(%rdi,%rcx,4)
+; SSE-NEXT: movups %xmm5, 64(%rdi,%rcx,4)
; SSE-NEXT: movups %xmm1, 80(%rdi,%rcx,4)
-; SSE-NEXT: movups %xmm7, 96(%rdi,%rcx,4)
-; SSE-NEXT: movups %xmm5, 112(%rdi,%rcx,4)
+; SSE-NEXT: movups %xmm4, 96(%rdi,%rcx,4)
+; SSE-NEXT: movups %xmm2, 112(%rdi,%rcx,4)
; SSE-NEXT: addq $32, %rcx
; SSE-NEXT: cmpq %rcx, %rdx
; SSE-NEXT: jne .LBB0_4
; AVX1-NEXT: # xmm1 = mem[0],zero,mem[1],zero
; AVX1-NEXT: vpmovzxdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
; AVX1-NEXT: # xmm2 = mem[0],zero,mem[1],zero
-; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm3 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
-; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
-; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm5 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
-; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm6 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
-; AVX1-NEXT: vpcmpeqw %xmm11, %xmm3, %xmm3
-; AVX1-NEXT: vpmovsxwd %xmm3, %xmm7
-; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,3,0,1]
-; AVX1-NEXT: vpmovsxwd %xmm3, %xmm3
-; AVX1-NEXT: vpcmpeqw %xmm11, %xmm4, %xmm4
-; AVX1-NEXT: vpmovsxwd %xmm4, %xmm8
-; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,3,0,1]
-; AVX1-NEXT: vpmovsxwd %xmm4, %xmm4
-; AVX1-NEXT: vpcmpeqw %xmm11, %xmm5, %xmm5
+; AVX1-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero
+; AVX1-NEXT: vmovq {{.*#+}} xmm4 = mem[0],zero
+; AVX1-NEXT: vmovq {{.*#+}} xmm5 = mem[0],zero
+; AVX1-NEXT: vmovq {{.*#+}} xmm6 = mem[0],zero
+; AVX1-NEXT: vpcmpeqb %xmm11, %xmm3, %xmm3
+; AVX1-NEXT: vpmovsxbd %xmm3, %xmm7
+; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,1,2,3]
+; AVX1-NEXT: vpmovsxbd %xmm3, %xmm3
+; AVX1-NEXT: vpcmpeqb %xmm11, %xmm4, %xmm4
+; AVX1-NEXT: vpmovsxbd %xmm4, %xmm8
+; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[1,1,2,3]
+; AVX1-NEXT: vpmovsxbd %xmm4, %xmm4
+; AVX1-NEXT: vpcmpeqb %xmm11, %xmm5, %xmm5
; AVX1-NEXT: vmovdqu (%rdi,%rcx,4), %xmm9
; AVX1-NEXT: vpslld %xmm2, %xmm9, %xmm10
; AVX1-NEXT: vpslld %xmm1, %xmm9, %xmm0
; AVX1-NEXT: vblendvps %xmm7, %xmm10, %xmm0, %xmm9
-; AVX1-NEXT: vpmovsxwd %xmm5, %xmm7
-; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[2,3,0,1]
-; AVX1-NEXT: vpmovsxwd %xmm5, %xmm5
-; AVX1-NEXT: vpcmpeqw %xmm11, %xmm6, %xmm6
+; AVX1-NEXT: vpmovsxbd %xmm5, %xmm7
+; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[1,1,2,3]
+; AVX1-NEXT: vpmovsxbd %xmm5, %xmm5
+; AVX1-NEXT: vpcmpeqb %xmm11, %xmm6, %xmm6
; AVX1-NEXT: vmovdqu 16(%rdi,%rcx,4), %xmm0
; AVX1-NEXT: vpslld %xmm2, %xmm0, %xmm2
; AVX1-NEXT: vpslld %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpmovsxwd %xmm6, %xmm1
-; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[2,3,0,1]
-; AVX1-NEXT: vpmovsxwd %xmm6, %xmm6
+; AVX1-NEXT: vpmovsxbd %xmm6, %xmm1
+; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[1,1,2,3]
+; AVX1-NEXT: vpmovsxbd %xmm6, %xmm6
; AVX1-NEXT: vblendvps %xmm3, %xmm2, %xmm0, %xmm10
; AVX1-NEXT: vmovdqu 32(%rdi,%rcx,4), %xmm2
; AVX1-NEXT: vpslld %xmm15, %xmm2, %xmm3
define <2 x i32> @var_shift_v2i32(<2 x i32> %a, <2 x i32> %b) nounwind {
; SSE2-LABEL: var_shift_v2i32:
; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [4294967295,0,4294967295,0]
-; SSE2-NEXT: pand %xmm2, %xmm1
-; SSE2-NEXT: pand %xmm2, %xmm0
-; SSE2-NEXT: movdqa %xmm0, %xmm2
-; SSE2-NEXT: psrlq %xmm1, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
-; SSE2-NEXT: psrlq %xmm1, %xmm0
-; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1]
-; SSE2-NEXT: retq
-;
-; SSE41-LABEL: var_shift_v2i32:
-; SSE41: # %bb.0:
-; SSE41-NEXT: pxor %xmm2, %xmm2
-; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
-; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
-; SSE41-NEXT: movdqa %xmm0, %xmm2
-; SSE41-NEXT: psrlq %xmm1, %xmm2
-; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
-; SSE41-NEXT: psrlq %xmm1, %xmm0
-; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm2[0,1,2,3],xmm0[4,5,6,7]
-; SSE41-NEXT: retq
-;
-; AVX1-LABEL: var_shift_v2i32:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
-; AVX1-NEXT: vpsrlq %xmm1, %xmm0, %xmm2
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
-; AVX1-NEXT: vpsrlq %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1,2,3],xmm0[4,5,6,7]
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: var_shift_v2i32:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
-; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
-; AVX2-NEXT: vpsrlvq %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: retq
-;
-; XOPAVX1-LABEL: var_shift_v2i32:
-; XOPAVX1: # %bb.0:
-; XOPAVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; XOPAVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
-; XOPAVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
-; XOPAVX1-NEXT: vpsubq %xmm1, %xmm2, %xmm1
-; XOPAVX1-NEXT: vpshlq %xmm1, %xmm0, %xmm0
-; XOPAVX1-NEXT: retq
-;
-; XOPAVX2-LABEL: var_shift_v2i32:
-; XOPAVX2: # %bb.0:
-; XOPAVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; XOPAVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
-; XOPAVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
-; XOPAVX2-NEXT: vpsrlvq %xmm1, %xmm0, %xmm0
-; XOPAVX2-NEXT: retq
-;
-; AVX512-LABEL: var_shift_v2i32:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX512-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
-; AVX512-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
-; AVX512-NEXT: vpsrlvq %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: retq
-;
-; AVX512VL-LABEL: var_shift_v2i32:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX512VL-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
-; AVX512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
-; AVX512VL-NEXT: vpsrlvq %xmm1, %xmm0, %xmm0
-; AVX512VL-NEXT: retq
-;
-; X32-SSE-LABEL: var_shift_v2i32:
-; X32-SSE: # %bb.0:
-; X32-SSE-NEXT: movdqa {{.*#+}} xmm2 = [4294967295,0,4294967295,0]
-; X32-SSE-NEXT: pand %xmm2, %xmm1
-; X32-SSE-NEXT: pand %xmm2, %xmm0
-; X32-SSE-NEXT: movdqa %xmm0, %xmm2
-; X32-SSE-NEXT: psrlq %xmm1, %xmm2
-; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
-; X32-SSE-NEXT: xorps %xmm3, %xmm3
-; X32-SSE-NEXT: movss {{.*#+}} xmm3 = xmm1[0],xmm3[1,2,3]
-; X32-SSE-NEXT: psrlq %xmm3, %xmm0
-; X32-SSE-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1]
-; X32-SSE-NEXT: retl
- %shift = lshr <2 x i32> %a, %b
- ret <2 x i32> %shift
-}
-
-define <4 x i16> @var_shift_v4i16(<4 x i16> %a, <4 x i16> %b) nounwind {
-; SSE2-LABEL: var_shift_v4i16:
-; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [65535,0,65535,0,65535,0,65535,0]
-; SSE2-NEXT: pand %xmm2, %xmm0
-; SSE2-NEXT: pand %xmm2, %xmm1
; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm1[2,3,3,3,4,5,6,7]
; SSE2-NEXT: movdqa %xmm0, %xmm3
; SSE2-NEXT: psrld %xmm2, %xmm3
; SSE2-NEXT: movaps %xmm2, %xmm0
; SSE2-NEXT: retq
;
-; SSE41-LABEL: var_shift_v4i16:
+; SSE41-LABEL: var_shift_v2i32:
; SSE41: # %bb.0:
-; SSE41-NEXT: pxor %xmm2, %xmm2
-; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4],xmm2[5],xmm0[6],xmm2[7]
-; SSE41-NEXT: movdqa %xmm1, %xmm3
-; SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0],xmm2[1],xmm3[2],xmm2[3],xmm3[4],xmm2[5],xmm3[6],xmm2[7]
-; SSE41-NEXT: pshuflw {{.*#+}} xmm4 = xmm3[2,3,3,3,4,5,6,7]
+; SSE41-NEXT: pshuflw {{.*#+}} xmm2 = xmm1[2,3,3,3,4,5,6,7]
+; SSE41-NEXT: movdqa %xmm0, %xmm3
+; SSE41-NEXT: psrld %xmm2, %xmm3
+; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1]
+; SSE41-NEXT: pshuflw {{.*#+}} xmm4 = xmm2[2,3,3,3,4,5,6,7]
; SSE41-NEXT: movdqa %xmm0, %xmm5
; SSE41-NEXT: psrld %xmm4, %xmm5
-; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,3,0,1]
-; SSE41-NEXT: pshuflw {{.*#+}} xmm4 = xmm3[2,3,3,3,4,5,6,7]
-; SSE41-NEXT: movdqa %xmm0, %xmm6
-; SSE41-NEXT: psrld %xmm4, %xmm6
-; SSE41-NEXT: pblendw {{.*#+}} xmm6 = xmm5[0,1,2,3],xmm6[4,5,6,7]
-; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3],xmm1[4],xmm2[5],xmm1[6],xmm2[7]
-; SSE41-NEXT: movdqa %xmm0, %xmm2
-; SSE41-NEXT: psrld %xmm1, %xmm2
-; SSE41-NEXT: pshuflw {{.*#+}} xmm1 = xmm3[0,1,1,1,4,5,6,7]
+; SSE41-NEXT: pblendw {{.*#+}} xmm5 = xmm3[0,1,2,3],xmm5[4,5,6,7]
+; SSE41-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,1,1,4,5,6,7]
+; SSE41-NEXT: movdqa %xmm0, %xmm3
+; SSE41-NEXT: psrld %xmm1, %xmm3
+; SSE41-NEXT: pshuflw {{.*#+}} xmm1 = xmm2[0,1,1,1,4,5,6,7]
; SSE41-NEXT: psrld %xmm1, %xmm0
-; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm2[0,1,2,3],xmm0[4,5,6,7]
-; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm6[2,3],xmm0[4,5],xmm6[6,7]
+; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm3[0,1,2,3],xmm0[4,5,6,7]
+; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm5[2,3],xmm0[4,5],xmm5[6,7]
; SSE41-NEXT: retq
;
-; AVX1-LABEL: var_shift_v4i16:
+; AVX1-LABEL: var_shift_v2i32:
; AVX1: # %bb.0:
-; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4],xmm2[5],xmm0[6],xmm2[7]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4],xmm2[5],xmm1[6],xmm2[7]
-; AVX1-NEXT: vpsrldq {{.*#+}} xmm3 = xmm1[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX1-NEXT: vpsrld %xmm3, %xmm0, %xmm3
-; AVX1-NEXT: vpsrlq $32, %xmm1, %xmm4
-; AVX1-NEXT: vpsrld %xmm4, %xmm0, %xmm4
-; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2,3],xmm3[4,5,6,7]
-; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm1[2],xmm2[2],xmm1[3],xmm2[3]
+; AVX1-NEXT: vpsrldq {{.*#+}} xmm2 = xmm1[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; AVX1-NEXT: vpsrld %xmm2, %xmm0, %xmm2
+; AVX1-NEXT: vpsrlq $32, %xmm1, %xmm3
+; AVX1-NEXT: vpsrld %xmm3, %xmm0, %xmm3
+; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7]
+; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm1[2],xmm3[2],xmm1[3],xmm3[3]
+; AVX1-NEXT: vpsrld %xmm3, %xmm0, %xmm3
; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
; AVX1-NEXT: vpsrld %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4,5,6,7]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm3[2,3],xmm0[4,5],xmm3[6,7]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm3[4,5,6,7]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
; AVX1-NEXT: retq
;
-; AVX2-LABEL: var_shift_v4i16:
+; AVX2-LABEL: var_shift_v2i32:
; AVX2: # %bb.0:
-; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4],xmm2[5],xmm1[6],xmm2[7]
-; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4],xmm2[5],xmm0[6],xmm2[7]
; AVX2-NEXT: vpsrlvd %xmm1, %xmm0, %xmm0
; AVX2-NEXT: retq
;
-; XOPAVX1-LABEL: var_shift_v4i16:
+; XOPAVX1-LABEL: var_shift_v2i32:
; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; XOPAVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4],xmm2[5],xmm0[6],xmm2[7]
-; XOPAVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4],xmm2[5],xmm1[6],xmm2[7]
; XOPAVX1-NEXT: vpsubd %xmm1, %xmm2, %xmm1
; XOPAVX1-NEXT: vpshld %xmm1, %xmm0, %xmm0
; XOPAVX1-NEXT: retq
;
-; XOPAVX2-LABEL: var_shift_v4i16:
+; XOPAVX2-LABEL: var_shift_v2i32:
; XOPAVX2: # %bb.0:
-; XOPAVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; XOPAVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4],xmm2[5],xmm1[6],xmm2[7]
-; XOPAVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4],xmm2[5],xmm0[6],xmm2[7]
; XOPAVX2-NEXT: vpsrlvd %xmm1, %xmm0, %xmm0
; XOPAVX2-NEXT: retq
;
-; AVX512-LABEL: var_shift_v4i16:
+; AVX512-LABEL: var_shift_v2i32:
; AVX512: # %bb.0:
-; AVX512-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX512-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4],xmm2[5],xmm1[6],xmm2[7]
-; AVX512-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4],xmm2[5],xmm0[6],xmm2[7]
; AVX512-NEXT: vpsrlvd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: retq
;
-; AVX512VL-LABEL: var_shift_v4i16:
+; AVX512VL-LABEL: var_shift_v2i32:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX512VL-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4],xmm2[5],xmm1[6],xmm2[7]
-; AVX512VL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4],xmm2[5],xmm0[6],xmm2[7]
; AVX512VL-NEXT: vpsrlvd %xmm1, %xmm0, %xmm0
; AVX512VL-NEXT: retq
;
-; X32-SSE-LABEL: var_shift_v4i16:
+; X32-SSE-LABEL: var_shift_v2i32:
; X32-SSE: # %bb.0:
-; X32-SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,0,65535,0,65535,0,65535,0]
-; X32-SSE-NEXT: pand %xmm2, %xmm0
-; X32-SSE-NEXT: pand %xmm2, %xmm1
; X32-SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm1[2,3,3,3,4,5,6,7]
; X32-SSE-NEXT: movdqa %xmm0, %xmm3
; X32-SSE-NEXT: psrld %xmm2, %xmm3
; X32-SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,3],xmm0[0,3]
; X32-SSE-NEXT: movaps %xmm2, %xmm0
; X32-SSE-NEXT: retl
- %shift = lshr <4 x i16> %a, %b
- ret <4 x i16> %shift
-}
-
-define <2 x i16> @var_shift_v2i16(<2 x i16> %a, <2 x i16> %b) nounwind {
-; SSE2-LABEL: var_shift_v2i16:
-; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [65535,0,0,0,65535,0,0,0]
-; SSE2-NEXT: pand %xmm2, %xmm1
-; SSE2-NEXT: pand %xmm2, %xmm0
-; SSE2-NEXT: movdqa %xmm0, %xmm2
-; SSE2-NEXT: psrlq %xmm1, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
-; SSE2-NEXT: psrlq %xmm1, %xmm0
-; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1]
-; SSE2-NEXT: retq
-;
-; SSE41-LABEL: var_shift_v2i16:
-; SSE41: # %bb.0:
-; SSE41-NEXT: pxor %xmm2, %xmm2
-; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3],xmm1[4],xmm2[5,6,7]
-; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3],xmm0[4],xmm2[5,6,7]
-; SSE41-NEXT: movdqa %xmm0, %xmm2
-; SSE41-NEXT: psrlq %xmm1, %xmm2
-; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
-; SSE41-NEXT: psrlq %xmm1, %xmm0
-; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm2[0,1,2,3],xmm0[4,5,6,7]
-; SSE41-NEXT: retq
-;
-; AVX1-LABEL: var_shift_v2i16:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3],xmm1[4],xmm2[5,6,7]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3],xmm0[4],xmm2[5,6,7]
-; AVX1-NEXT: vpsrlq %xmm1, %xmm0, %xmm2
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
-; AVX1-NEXT: vpsrlq %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1,2,3],xmm0[4,5,6,7]
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: var_shift_v2i16:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3],xmm1[4],xmm2[5,6,7]
-; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3],xmm0[4],xmm2[5,6,7]
-; AVX2-NEXT: vpsrlvq %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: retq
-;
-; XOPAVX1-LABEL: var_shift_v2i16:
-; XOPAVX1: # %bb.0:
-; XOPAVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; XOPAVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3],xmm0[4],xmm2[5,6,7]
-; XOPAVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3],xmm1[4],xmm2[5,6,7]
-; XOPAVX1-NEXT: vpsubq %xmm1, %xmm2, %xmm1
-; XOPAVX1-NEXT: vpshlq %xmm1, %xmm0, %xmm0
-; XOPAVX1-NEXT: retq
-;
-; XOPAVX2-LABEL: var_shift_v2i16:
-; XOPAVX2: # %bb.0:
-; XOPAVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; XOPAVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3],xmm1[4],xmm2[5,6,7]
-; XOPAVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3],xmm0[4],xmm2[5,6,7]
-; XOPAVX2-NEXT: vpsrlvq %xmm1, %xmm0, %xmm0
-; XOPAVX2-NEXT: retq
-;
-; AVX512-LABEL: var_shift_v2i16:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX512-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3],xmm1[4],xmm2[5,6,7]
-; AVX512-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3],xmm0[4],xmm2[5,6,7]
-; AVX512-NEXT: vpsrlvq %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: retq
-;
-; AVX512VL-LABEL: var_shift_v2i16:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX512VL-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3],xmm1[4],xmm2[5,6,7]
-; AVX512VL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3],xmm0[4],xmm2[5,6,7]
-; AVX512VL-NEXT: vpsrlvq %xmm1, %xmm0, %xmm0
-; AVX512VL-NEXT: retq
-;
-; X32-SSE-LABEL: var_shift_v2i16:
-; X32-SSE: # %bb.0:
-; X32-SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,0,0,0,65535,0,0,0]
-; X32-SSE-NEXT: pand %xmm2, %xmm1
-; X32-SSE-NEXT: pand %xmm2, %xmm0
-; X32-SSE-NEXT: movdqa %xmm0, %xmm2
-; X32-SSE-NEXT: psrlq %xmm1, %xmm2
-; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
-; X32-SSE-NEXT: psrlq %xmm1, %xmm0
-; X32-SSE-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1]
-; X32-SSE-NEXT: retl
- %shift = lshr <2 x i16> %a, %b
- ret <2 x i16> %shift
+ %shift = lshr <2 x i32> %a, %b
+ ret <2 x i32> %shift
}
-define <8 x i8> @var_shift_v8i8(<8 x i8> %a, <8 x i8> %b) nounwind {
-; SSE2-LABEL: var_shift_v8i8:
+define <4 x i16> @var_shift_v4i16(<4 x i16> %a, <4 x i16> %b) nounwind {
+; SSE2-LABEL: var_shift_v4i16:
; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa %xmm0, %xmm2
-; SSE2-NEXT: pand {{.*}}(%rip), %xmm2
; SSE2-NEXT: psllw $12, %xmm1
-; SSE2-NEXT: movdqa %xmm1, %xmm0
-; SSE2-NEXT: psraw $15, %xmm0
-; SSE2-NEXT: pandn %xmm2, %xmm0
+; SSE2-NEXT: movdqa %xmm1, %xmm2
+; SSE2-NEXT: psraw $15, %xmm2
+; SSE2-NEXT: movdqa %xmm2, %xmm3
+; SSE2-NEXT: pandn %xmm0, %xmm3
+; SSE2-NEXT: psrlw $8, %xmm0
+; SSE2-NEXT: pand %xmm2, %xmm0
+; SSE2-NEXT: por %xmm3, %xmm0
; SSE2-NEXT: paddw %xmm1, %xmm1
; SSE2-NEXT: movdqa %xmm1, %xmm2
; SSE2-NEXT: psraw $15, %xmm2
; SSE2-NEXT: por %xmm2, %xmm0
; SSE2-NEXT: retq
;
-; SSE41-LABEL: var_shift_v8i8:
+; SSE41-LABEL: var_shift_v4i16:
; SSE41: # %bb.0:
; SSE41-NEXT: movdqa %xmm1, %xmm2
; SSE41-NEXT: movdqa %xmm0, %xmm1
-; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
-; SSE41-NEXT: pand %xmm0, %xmm1
-; SSE41-NEXT: pand %xmm0, %xmm2
; SSE41-NEXT: movdqa %xmm2, %xmm0
; SSE41-NEXT: psllw $12, %xmm0
; SSE41-NEXT: psllw $4, %xmm2
; SSE41-NEXT: por %xmm0, %xmm2
; SSE41-NEXT: movdqa %xmm2, %xmm3
; SSE41-NEXT: paddw %xmm2, %xmm3
-; SSE41-NEXT: pxor %xmm4, %xmm4
+; SSE41-NEXT: movdqa %xmm1, %xmm4
+; SSE41-NEXT: psrlw $8, %xmm4
; SSE41-NEXT: movdqa %xmm2, %xmm0
; SSE41-NEXT: pblendvb %xmm0, %xmm4, %xmm1
; SSE41-NEXT: movdqa %xmm1, %xmm2
; SSE41-NEXT: movdqa %xmm1, %xmm0
; SSE41-NEXT: retq
;
-; AVX1-LABEL: var_shift_v8i8:
+; AVX1-LABEL: var_shift_v4i16:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
-; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0
-; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpsllw $12, %xmm1, %xmm2
; AVX1-NEXT: vpsllw $4, %xmm1, %xmm1
; AVX1-NEXT: vpor %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpaddw %xmm1, %xmm1, %xmm2
-; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm3
; AVX1-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0
; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm1
; AVX1-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
; AVX1-NEXT: retq
;
-; AVX2-LABEL: var_shift_v8i8:
+; AVX2-LABEL: var_shift_v4i16:
; AVX2: # %bb.0:
-; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
-; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0
-; AVX2-NEXT: vpand %xmm2, %xmm1, %xmm1
; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX2-NEXT: vpsrlvd %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
-; XOPAVX1-LABEL: var_shift_v8i8:
-; XOPAVX1: # %bb.0:
-; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
-; XOPAVX1-NEXT: vpand %xmm2, %xmm0, %xmm0
-; XOPAVX1-NEXT: vpand %xmm2, %xmm1, %xmm1
-; XOPAVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; XOPAVX1-NEXT: vpsubw %xmm1, %xmm2, %xmm1
-; XOPAVX1-NEXT: vpshlw %xmm1, %xmm0, %xmm0
-; XOPAVX1-NEXT: retq
-;
-; XOPAVX2-LABEL: var_shift_v8i8:
-; XOPAVX2: # %bb.0:
-; XOPAVX2-NEXT: vpbroadcastw {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
-; XOPAVX2-NEXT: vpand %xmm2, %xmm0, %xmm0
-; XOPAVX2-NEXT: vpand %xmm2, %xmm1, %xmm1
-; XOPAVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; XOPAVX2-NEXT: vpsubw %xmm1, %xmm2, %xmm1
-; XOPAVX2-NEXT: vpshlw %xmm1, %xmm0, %xmm0
-; XOPAVX2-NEXT: retq
+; XOP-LABEL: var_shift_v4i16:
+; XOP: # %bb.0:
+; XOP-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; XOP-NEXT: vpsubw %xmm1, %xmm2, %xmm1
+; XOP-NEXT: vpshlw %xmm1, %xmm0, %xmm0
+; XOP-NEXT: retq
;
-; AVX512DQ-LABEL: var_shift_v8i8:
+; AVX512DQ-LABEL: var_shift_v4i16:
; AVX512DQ: # %bb.0:
-; AVX512DQ-NEXT: vpbroadcastw {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
-; AVX512DQ-NEXT: vpand %xmm2, %xmm0, %xmm0
-; AVX512DQ-NEXT: vpand %xmm2, %xmm1, %xmm1
; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX512DQ-NEXT: vpsrlvd %ymm1, %ymm0, %ymm0
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
-; AVX512BW-LABEL: var_shift_v8i8:
+; AVX512BW-LABEL: var_shift_v4i16:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vpbroadcastw {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
-; AVX512BW-NEXT: vpand %xmm2, %xmm0, %xmm0
-; AVX512BW-NEXT: vpand %xmm2, %xmm1, %xmm1
+; AVX512BW-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
+; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512BW-NEXT: vpsrlvw %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
-; AVX512DQVL-LABEL: var_shift_v8i8:
+; AVX512DQVL-LABEL: var_shift_v4i16:
; AVX512DQVL: # %bb.0:
-; AVX512DQVL-NEXT: vpbroadcastw {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
-; AVX512DQVL-NEXT: vpand %xmm2, %xmm0, %xmm0
-; AVX512DQVL-NEXT: vpand %xmm2, %xmm1, %xmm1
; AVX512DQVL-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
; AVX512DQVL-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX512DQVL-NEXT: vpsrlvd %ymm1, %ymm0, %ymm0
; AVX512DQVL-NEXT: vzeroupper
; AVX512DQVL-NEXT: retq
;
-; AVX512BWVL-LABEL: var_shift_v8i8:
+; AVX512BWVL-LABEL: var_shift_v4i16:
; AVX512BWVL: # %bb.0:
-; AVX512BWVL-NEXT: vpbroadcastw {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
-; AVX512BWVL-NEXT: vpand %xmm2, %xmm1, %xmm1
-; AVX512BWVL-NEXT: vpand %xmm2, %xmm0, %xmm0
; AVX512BWVL-NEXT: vpsrlvw %xmm1, %xmm0, %xmm0
; AVX512BWVL-NEXT: retq
;
-; X32-SSE-LABEL: var_shift_v8i8:
+; X32-SSE-LABEL: var_shift_v4i16:
; X32-SSE: # %bb.0:
-; X32-SSE-NEXT: movdqa %xmm0, %xmm2
-; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm2
; X32-SSE-NEXT: psllw $12, %xmm1
-; X32-SSE-NEXT: movdqa %xmm1, %xmm0
-; X32-SSE-NEXT: psraw $15, %xmm0
-; X32-SSE-NEXT: pandn %xmm2, %xmm0
+; X32-SSE-NEXT: movdqa %xmm1, %xmm2
+; X32-SSE-NEXT: psraw $15, %xmm2
+; X32-SSE-NEXT: movdqa %xmm2, %xmm3
+; X32-SSE-NEXT: pandn %xmm0, %xmm3
+; X32-SSE-NEXT: psrlw $8, %xmm0
+; X32-SSE-NEXT: pand %xmm2, %xmm0
+; X32-SSE-NEXT: por %xmm3, %xmm0
; X32-SSE-NEXT: paddw %xmm1, %xmm1
; X32-SSE-NEXT: movdqa %xmm1, %xmm2
; X32-SSE-NEXT: psraw $15, %xmm2
; X32-SSE-NEXT: pand %xmm1, %xmm0
; X32-SSE-NEXT: por %xmm2, %xmm0
; X32-SSE-NEXT: retl
- %shift = lshr <8 x i8> %a, %b
- ret <8 x i8> %shift
+ %shift = lshr <4 x i16> %a, %b
+ ret <4 x i16> %shift
}
-define <4 x i8> @var_shift_v4i8(<4 x i8> %a, <4 x i8> %b) nounwind {
-; SSE2-LABEL: var_shift_v4i8:
+define <2 x i16> @var_shift_v2i16(<2 x i16> %a, <2 x i16> %b) nounwind {
+; SSE2-LABEL: var_shift_v2i16:
; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
+; SSE2-NEXT: psllw $12, %xmm1
+; SSE2-NEXT: movdqa %xmm1, %xmm2
+; SSE2-NEXT: psraw $15, %xmm2
+; SSE2-NEXT: movdqa %xmm2, %xmm3
+; SSE2-NEXT: pandn %xmm0, %xmm3
+; SSE2-NEXT: psrlw $8, %xmm0
; SSE2-NEXT: pand %xmm2, %xmm0
-; SSE2-NEXT: pand %xmm2, %xmm1
-; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm1[2,3,3,3,4,5,6,7]
-; SSE2-NEXT: movdqa %xmm0, %xmm3
-; SSE2-NEXT: psrld %xmm2, %xmm3
-; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm1[0,1,1,1,4,5,6,7]
-; SSE2-NEXT: movdqa %xmm0, %xmm2
-; SSE2-NEXT: psrld %xmm4, %xmm2
-; SSE2-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
-; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm1[2,3,3,3,4,5,6,7]
-; SSE2-NEXT: movdqa %xmm0, %xmm4
-; SSE2-NEXT: psrld %xmm3, %xmm4
-; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,1,1,4,5,6,7]
-; SSE2-NEXT: psrld %xmm1, %xmm0
-; SSE2-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm4[1]
-; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,3],xmm0[0,3]
-; SSE2-NEXT: movaps %xmm2, %xmm0
+; SSE2-NEXT: por %xmm3, %xmm0
+; SSE2-NEXT: paddw %xmm1, %xmm1
+; SSE2-NEXT: movdqa %xmm1, %xmm2
+; SSE2-NEXT: psraw $15, %xmm2
+; SSE2-NEXT: movdqa %xmm2, %xmm3
+; SSE2-NEXT: pandn %xmm0, %xmm3
+; SSE2-NEXT: psrlw $4, %xmm0
+; SSE2-NEXT: pand %xmm2, %xmm0
+; SSE2-NEXT: por %xmm3, %xmm0
+; SSE2-NEXT: paddw %xmm1, %xmm1
+; SSE2-NEXT: movdqa %xmm1, %xmm2
+; SSE2-NEXT: psraw $15, %xmm2
+; SSE2-NEXT: movdqa %xmm2, %xmm3
+; SSE2-NEXT: pandn %xmm0, %xmm3
+; SSE2-NEXT: psrlw $2, %xmm0
+; SSE2-NEXT: pand %xmm2, %xmm0
+; SSE2-NEXT: por %xmm3, %xmm0
+; SSE2-NEXT: paddw %xmm1, %xmm1
+; SSE2-NEXT: psraw $15, %xmm1
+; SSE2-NEXT: movdqa %xmm1, %xmm2
+; SSE2-NEXT: pandn %xmm0, %xmm2
+; SSE2-NEXT: psrlw $1, %xmm0
+; SSE2-NEXT: pand %xmm1, %xmm0
+; SSE2-NEXT: por %xmm2, %xmm0
; SSE2-NEXT: retq
;
-; SSE41-LABEL: var_shift_v4i8:
+; SSE41-LABEL: var_shift_v2i16:
; SSE41: # %bb.0:
-; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
-; SSE41-NEXT: pand %xmm2, %xmm0
-; SSE41-NEXT: pand %xmm2, %xmm1
-; SSE41-NEXT: pshuflw {{.*#+}} xmm2 = xmm1[2,3,3,3,4,5,6,7]
-; SSE41-NEXT: movdqa %xmm0, %xmm3
-; SSE41-NEXT: psrld %xmm2, %xmm3
-; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1]
-; SSE41-NEXT: pshuflw {{.*#+}} xmm4 = xmm2[2,3,3,3,4,5,6,7]
-; SSE41-NEXT: movdqa %xmm0, %xmm5
-; SSE41-NEXT: psrld %xmm4, %xmm5
-; SSE41-NEXT: pblendw {{.*#+}} xmm5 = xmm3[0,1,2,3],xmm5[4,5,6,7]
-; SSE41-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,1,1,4,5,6,7]
-; SSE41-NEXT: movdqa %xmm0, %xmm3
-; SSE41-NEXT: psrld %xmm1, %xmm3
-; SSE41-NEXT: pshuflw {{.*#+}} xmm1 = xmm2[0,1,1,1,4,5,6,7]
-; SSE41-NEXT: psrld %xmm1, %xmm0
-; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm3[0,1,2,3],xmm0[4,5,6,7]
-; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm5[2,3],xmm0[4,5],xmm5[6,7]
-; SSE41-NEXT: retq
-;
-; AVX1-LABEL: var_shift_v4i8:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [3.57331108E-43,3.57331108E-43,3.57331108E-43,3.57331108E-43]
-; AVX1-NEXT: vandps %xmm2, %xmm0, %xmm0
-; AVX1-NEXT: vandps %xmm2, %xmm1, %xmm1
-; AVX1-NEXT: vpsrldq {{.*#+}} xmm2 = xmm1[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX1-NEXT: vpsrld %xmm2, %xmm0, %xmm2
-; AVX1-NEXT: vpsrlq $32, %xmm1, %xmm3
-; AVX1-NEXT: vpsrld %xmm3, %xmm0, %xmm3
-; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7]
-; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm1[2],xmm3[2],xmm1[3],xmm3[3]
-; AVX1-NEXT: vpsrld %xmm3, %xmm0, %xmm3
-; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
-; AVX1-NEXT: vpsrld %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm3[4,5,6,7]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
+; SSE41-NEXT: movdqa %xmm1, %xmm2
+; SSE41-NEXT: movdqa %xmm0, %xmm1
+; SSE41-NEXT: movdqa %xmm2, %xmm0
+; SSE41-NEXT: psllw $12, %xmm0
+; SSE41-NEXT: psllw $4, %xmm2
+; SSE41-NEXT: por %xmm0, %xmm2
+; SSE41-NEXT: movdqa %xmm2, %xmm3
+; SSE41-NEXT: paddw %xmm2, %xmm3
+; SSE41-NEXT: movdqa %xmm1, %xmm4
+; SSE41-NEXT: psrlw $8, %xmm4
+; SSE41-NEXT: movdqa %xmm2, %xmm0
+; SSE41-NEXT: pblendvb %xmm0, %xmm4, %xmm1
+; SSE41-NEXT: movdqa %xmm1, %xmm2
+; SSE41-NEXT: psrlw $4, %xmm2
+; SSE41-NEXT: movdqa %xmm3, %xmm0
+; SSE41-NEXT: pblendvb %xmm0, %xmm2, %xmm1
+; SSE41-NEXT: movdqa %xmm1, %xmm2
+; SSE41-NEXT: psrlw $2, %xmm2
+; SSE41-NEXT: paddw %xmm3, %xmm3
+; SSE41-NEXT: movdqa %xmm3, %xmm0
+; SSE41-NEXT: pblendvb %xmm0, %xmm2, %xmm1
+; SSE41-NEXT: movdqa %xmm1, %xmm2
+; SSE41-NEXT: psrlw $1, %xmm2
+; SSE41-NEXT: paddw %xmm3, %xmm3
+; SSE41-NEXT: movdqa %xmm3, %xmm0
+; SSE41-NEXT: pblendvb %xmm0, %xmm2, %xmm1
+; SSE41-NEXT: movdqa %xmm1, %xmm0
+; SSE41-NEXT: retq
+;
+; AVX1-LABEL: var_shift_v2i16:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vpsllw $12, %xmm1, %xmm2
+; AVX1-NEXT: vpsllw $4, %xmm1, %xmm1
+; AVX1-NEXT: vpor %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpaddw %xmm1, %xmm1, %xmm2
+; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm3
+; AVX1-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0
+; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm1
+; AVX1-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpsrlw $2, %xmm0, %xmm1
+; AVX1-NEXT: vpaddw %xmm2, %xmm2, %xmm2
+; AVX1-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpsrlw $1, %xmm0, %xmm1
+; AVX1-NEXT: vpaddw %xmm2, %xmm2, %xmm2
+; AVX1-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
; AVX1-NEXT: retq
;
-; AVX2-LABEL: var_shift_v4i8:
+; AVX2-LABEL: var_shift_v2i16:
; AVX2: # %bb.0:
-; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [255,255,255,255]
-; AVX2-NEXT: vpand %xmm2, %xmm1, %xmm1
-; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0
-; AVX2-NEXT: vpsrlvd %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
+; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX2-NEXT: vpsrlvd %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
-; XOPAVX1-LABEL: var_shift_v4i8:
-; XOPAVX1: # %bb.0:
-; XOPAVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [3.57331108E-43,3.57331108E-43,3.57331108E-43,3.57331108E-43]
-; XOPAVX1-NEXT: vandps %xmm2, %xmm0, %xmm0
-; XOPAVX1-NEXT: vandps %xmm2, %xmm1, %xmm1
-; XOPAVX1-NEXT: vxorps %xmm2, %xmm2, %xmm2
-; XOPAVX1-NEXT: vpsubd %xmm1, %xmm2, %xmm1
-; XOPAVX1-NEXT: vpshld %xmm1, %xmm0, %xmm0
-; XOPAVX1-NEXT: retq
+; XOP-LABEL: var_shift_v2i16:
+; XOP: # %bb.0:
+; XOP-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; XOP-NEXT: vpsubw %xmm1, %xmm2, %xmm1
+; XOP-NEXT: vpshlw %xmm1, %xmm0, %xmm0
+; XOP-NEXT: retq
;
-; XOPAVX2-LABEL: var_shift_v4i8:
-; XOPAVX2: # %bb.0:
-; XOPAVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [255,255,255,255]
-; XOPAVX2-NEXT: vpand %xmm2, %xmm1, %xmm1
-; XOPAVX2-NEXT: vpand %xmm2, %xmm0, %xmm0
-; XOPAVX2-NEXT: vpsrlvd %xmm1, %xmm0, %xmm0
-; XOPAVX2-NEXT: retq
+; AVX512DQ-LABEL: var_shift_v2i16:
+; AVX512DQ: # %bb.0:
+; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
+; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX512DQ-NEXT: vpsrlvd %ymm1, %ymm0, %ymm0
+; AVX512DQ-NEXT: vpmovdw %zmm0, %ymm0
+; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX512DQ-NEXT: vzeroupper
+; AVX512DQ-NEXT: retq
;
-; AVX512-LABEL: var_shift_v4i8:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vpbroadcastd {{.*#+}} xmm2 = [255,255,255,255]
-; AVX512-NEXT: vpand %xmm2, %xmm1, %xmm1
-; AVX512-NEXT: vpand %xmm2, %xmm0, %xmm0
-; AVX512-NEXT: vpsrlvd %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: retq
+; AVX512BW-LABEL: var_shift_v2i16:
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
+; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
+; AVX512BW-NEXT: vpsrlvw %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
+; AVX512BW-NEXT: vzeroupper
+; AVX512BW-NEXT: retq
;
-; AVX512VL-LABEL: var_shift_v4i8:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpbroadcastd {{.*#+}} xmm2 = [255,255,255,255]
-; AVX512VL-NEXT: vpand %xmm2, %xmm1, %xmm1
-; AVX512VL-NEXT: vpand %xmm2, %xmm0, %xmm0
-; AVX512VL-NEXT: vpsrlvd %xmm1, %xmm0, %xmm0
-; AVX512VL-NEXT: retq
+; AVX512DQVL-LABEL: var_shift_v2i16:
+; AVX512DQVL: # %bb.0:
+; AVX512DQVL-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
+; AVX512DQVL-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX512DQVL-NEXT: vpsrlvd %ymm1, %ymm0, %ymm0
+; AVX512DQVL-NEXT: vpmovdw %ymm0, %xmm0
+; AVX512DQVL-NEXT: vzeroupper
+; AVX512DQVL-NEXT: retq
+;
+; AVX512BWVL-LABEL: var_shift_v2i16:
+; AVX512BWVL: # %bb.0:
+; AVX512BWVL-NEXT: vpsrlvw %xmm1, %xmm0, %xmm0
+; AVX512BWVL-NEXT: retq
+;
+; X32-SSE-LABEL: var_shift_v2i16:
+; X32-SSE: # %bb.0:
+; X32-SSE-NEXT: psllw $12, %xmm1
+; X32-SSE-NEXT: movdqa %xmm1, %xmm2
+; X32-SSE-NEXT: psraw $15, %xmm2
+; X32-SSE-NEXT: movdqa %xmm2, %xmm3
+; X32-SSE-NEXT: pandn %xmm0, %xmm3
+; X32-SSE-NEXT: psrlw $8, %xmm0
+; X32-SSE-NEXT: pand %xmm2, %xmm0
+; X32-SSE-NEXT: por %xmm3, %xmm0
+; X32-SSE-NEXT: paddw %xmm1, %xmm1
+; X32-SSE-NEXT: movdqa %xmm1, %xmm2
+; X32-SSE-NEXT: psraw $15, %xmm2
+; X32-SSE-NEXT: movdqa %xmm2, %xmm3
+; X32-SSE-NEXT: pandn %xmm0, %xmm3
+; X32-SSE-NEXT: psrlw $4, %xmm0
+; X32-SSE-NEXT: pand %xmm2, %xmm0
+; X32-SSE-NEXT: por %xmm3, %xmm0
+; X32-SSE-NEXT: paddw %xmm1, %xmm1
+; X32-SSE-NEXT: movdqa %xmm1, %xmm2
+; X32-SSE-NEXT: psraw $15, %xmm2
+; X32-SSE-NEXT: movdqa %xmm2, %xmm3
+; X32-SSE-NEXT: pandn %xmm0, %xmm3
+; X32-SSE-NEXT: psrlw $2, %xmm0
+; X32-SSE-NEXT: pand %xmm2, %xmm0
+; X32-SSE-NEXT: por %xmm3, %xmm0
+; X32-SSE-NEXT: paddw %xmm1, %xmm1
+; X32-SSE-NEXT: psraw $15, %xmm1
+; X32-SSE-NEXT: movdqa %xmm1, %xmm2
+; X32-SSE-NEXT: pandn %xmm0, %xmm2
+; X32-SSE-NEXT: psrlw $1, %xmm0
+; X32-SSE-NEXT: pand %xmm1, %xmm0
+; X32-SSE-NEXT: por %xmm2, %xmm0
+; X32-SSE-NEXT: retl
+ %shift = lshr <2 x i16> %a, %b
+ ret <2 x i16> %shift
+}
+
+define <8 x i8> @var_shift_v8i8(<8 x i8> %a, <8 x i8> %b) nounwind {
+; SSE2-LABEL: var_shift_v8i8:
+; SSE2: # %bb.0:
+; SSE2-NEXT: psllw $5, %xmm1
+; SSE2-NEXT: pxor %xmm2, %xmm2
+; SSE2-NEXT: pxor %xmm3, %xmm3
+; SSE2-NEXT: pcmpgtb %xmm1, %xmm3
+; SSE2-NEXT: movdqa %xmm3, %xmm4
+; SSE2-NEXT: pandn %xmm0, %xmm4
+; SSE2-NEXT: psrlw $4, %xmm0
+; SSE2-NEXT: pand %xmm3, %xmm0
+; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
+; SSE2-NEXT: por %xmm4, %xmm0
+; SSE2-NEXT: paddb %xmm1, %xmm1
+; SSE2-NEXT: pxor %xmm3, %xmm3
+; SSE2-NEXT: pcmpgtb %xmm1, %xmm3
+; SSE2-NEXT: movdqa %xmm3, %xmm4
+; SSE2-NEXT: pandn %xmm0, %xmm4
+; SSE2-NEXT: psrlw $2, %xmm0
+; SSE2-NEXT: pand %xmm3, %xmm0
+; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
+; SSE2-NEXT: por %xmm4, %xmm0
+; SSE2-NEXT: paddb %xmm1, %xmm1
+; SSE2-NEXT: pcmpgtb %xmm1, %xmm2
+; SSE2-NEXT: movdqa %xmm2, %xmm1
+; SSE2-NEXT: pandn %xmm0, %xmm1
+; SSE2-NEXT: psrlw $1, %xmm0
+; SSE2-NEXT: pand %xmm2, %xmm0
+; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
+; SSE2-NEXT: por %xmm1, %xmm0
+; SSE2-NEXT: retq
+;
+; SSE41-LABEL: var_shift_v8i8:
+; SSE41: # %bb.0:
+; SSE41-NEXT: movdqa %xmm0, %xmm2
+; SSE41-NEXT: psllw $5, %xmm1
+; SSE41-NEXT: movdqa %xmm0, %xmm3
+; SSE41-NEXT: psrlw $4, %xmm3
+; SSE41-NEXT: pand {{.*}}(%rip), %xmm3
+; SSE41-NEXT: movdqa %xmm1, %xmm0
+; SSE41-NEXT: pblendvb %xmm0, %xmm3, %xmm2
+; SSE41-NEXT: movdqa %xmm2, %xmm3
+; SSE41-NEXT: psrlw $2, %xmm3
+; SSE41-NEXT: pand {{.*}}(%rip), %xmm3
+; SSE41-NEXT: paddb %xmm1, %xmm1
+; SSE41-NEXT: movdqa %xmm1, %xmm0
+; SSE41-NEXT: pblendvb %xmm0, %xmm3, %xmm2
+; SSE41-NEXT: movdqa %xmm2, %xmm3
+; SSE41-NEXT: psrlw $1, %xmm3
+; SSE41-NEXT: pand {{.*}}(%rip), %xmm3
+; SSE41-NEXT: paddb %xmm1, %xmm1
+; SSE41-NEXT: movdqa %xmm1, %xmm0
+; SSE41-NEXT: pblendvb %xmm0, %xmm3, %xmm2
+; SSE41-NEXT: movdqa %xmm2, %xmm0
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: var_shift_v8i8:
+; AVX: # %bb.0:
+; AVX-NEXT: vpsllw $5, %xmm1, %xmm1
+; AVX-NEXT: vpsrlw $4, %xmm0, %xmm2
+; AVX-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
+; AVX-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
+; AVX-NEXT: vpsrlw $2, %xmm0, %xmm2
+; AVX-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
+; AVX-NEXT: vpaddb %xmm1, %xmm1, %xmm1
+; AVX-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
+; AVX-NEXT: vpsrlw $1, %xmm0, %xmm2
+; AVX-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
+; AVX-NEXT: vpaddb %xmm1, %xmm1, %xmm1
+; AVX-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
+; AVX-NEXT: retq
+;
+; XOP-LABEL: var_shift_v8i8:
+; XOP: # %bb.0:
+; XOP-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; XOP-NEXT: vpsubb %xmm1, %xmm2, %xmm1
+; XOP-NEXT: vpshlb %xmm1, %xmm0, %xmm0
+; XOP-NEXT: retq
+;
+; AVX512DQ-LABEL: var_shift_v8i8:
+; AVX512DQ: # %bb.0:
+; AVX512DQ-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
+; AVX512DQ-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
+; AVX512DQ-NEXT: vpsrlvd %zmm1, %zmm0, %zmm0
+; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512DQ-NEXT: vzeroupper
+; AVX512DQ-NEXT: retq
+;
+; AVX512BW-LABEL: var_shift_v8i8:
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
+; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; AVX512BW-NEXT: vpsrlvw %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
+; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX512BW-NEXT: vzeroupper
+; AVX512BW-NEXT: retq
+;
+; AVX512DQVL-LABEL: var_shift_v8i8:
+; AVX512DQVL: # %bb.0:
+; AVX512DQVL-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
+; AVX512DQVL-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
+; AVX512DQVL-NEXT: vpsrlvd %zmm1, %zmm0, %zmm0
+; AVX512DQVL-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512DQVL-NEXT: vzeroupper
+; AVX512DQVL-NEXT: retq
+;
+; AVX512BWVL-LABEL: var_shift_v8i8:
+; AVX512BWVL: # %bb.0:
+; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
+; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; AVX512BWVL-NEXT: vpsrlvw %ymm1, %ymm0, %ymm0
+; AVX512BWVL-NEXT: vpmovwb %ymm0, %xmm0
+; AVX512BWVL-NEXT: vzeroupper
+; AVX512BWVL-NEXT: retq
+;
+; X32-SSE-LABEL: var_shift_v8i8:
+; X32-SSE: # %bb.0:
+; X32-SSE-NEXT: psllw $5, %xmm1
+; X32-SSE-NEXT: pxor %xmm2, %xmm2
+; X32-SSE-NEXT: pxor %xmm3, %xmm3
+; X32-SSE-NEXT: pcmpgtb %xmm1, %xmm3
+; X32-SSE-NEXT: movdqa %xmm3, %xmm4
+; X32-SSE-NEXT: pandn %xmm0, %xmm4
+; X32-SSE-NEXT: psrlw $4, %xmm0
+; X32-SSE-NEXT: pand %xmm3, %xmm0
+; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm0
+; X32-SSE-NEXT: por %xmm4, %xmm0
+; X32-SSE-NEXT: paddb %xmm1, %xmm1
+; X32-SSE-NEXT: pxor %xmm3, %xmm3
+; X32-SSE-NEXT: pcmpgtb %xmm1, %xmm3
+; X32-SSE-NEXT: movdqa %xmm3, %xmm4
+; X32-SSE-NEXT: pandn %xmm0, %xmm4
+; X32-SSE-NEXT: psrlw $2, %xmm0
+; X32-SSE-NEXT: pand %xmm3, %xmm0
+; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm0
+; X32-SSE-NEXT: por %xmm4, %xmm0
+; X32-SSE-NEXT: paddb %xmm1, %xmm1
+; X32-SSE-NEXT: pcmpgtb %xmm1, %xmm2
+; X32-SSE-NEXT: movdqa %xmm2, %xmm1
+; X32-SSE-NEXT: pandn %xmm0, %xmm1
+; X32-SSE-NEXT: psrlw $1, %xmm0
+; X32-SSE-NEXT: pand %xmm2, %xmm0
+; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm0
+; X32-SSE-NEXT: por %xmm1, %xmm0
+; X32-SSE-NEXT: retl
+ %shift = lshr <8 x i8> %a, %b
+ ret <8 x i8> %shift
+}
+
+define <4 x i8> @var_shift_v4i8(<4 x i8> %a, <4 x i8> %b) nounwind {
+; SSE2-LABEL: var_shift_v4i8:
+; SSE2: # %bb.0:
+; SSE2-NEXT: psllw $5, %xmm1
+; SSE2-NEXT: pxor %xmm2, %xmm2
+; SSE2-NEXT: pxor %xmm3, %xmm3
+; SSE2-NEXT: pcmpgtb %xmm1, %xmm3
+; SSE2-NEXT: movdqa %xmm3, %xmm4
+; SSE2-NEXT: pandn %xmm0, %xmm4
+; SSE2-NEXT: psrlw $4, %xmm0
+; SSE2-NEXT: pand %xmm3, %xmm0
+; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
+; SSE2-NEXT: por %xmm4, %xmm0
+; SSE2-NEXT: paddb %xmm1, %xmm1
+; SSE2-NEXT: pxor %xmm3, %xmm3
+; SSE2-NEXT: pcmpgtb %xmm1, %xmm3
+; SSE2-NEXT: movdqa %xmm3, %xmm4
+; SSE2-NEXT: pandn %xmm0, %xmm4
+; SSE2-NEXT: psrlw $2, %xmm0
+; SSE2-NEXT: pand %xmm3, %xmm0
+; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
+; SSE2-NEXT: por %xmm4, %xmm0
+; SSE2-NEXT: paddb %xmm1, %xmm1
+; SSE2-NEXT: pcmpgtb %xmm1, %xmm2
+; SSE2-NEXT: movdqa %xmm2, %xmm1
+; SSE2-NEXT: pandn %xmm0, %xmm1
+; SSE2-NEXT: psrlw $1, %xmm0
+; SSE2-NEXT: pand %xmm2, %xmm0
+; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
+; SSE2-NEXT: por %xmm1, %xmm0
+; SSE2-NEXT: retq
+;
+; SSE41-LABEL: var_shift_v4i8:
+; SSE41: # %bb.0:
+; SSE41-NEXT: movdqa %xmm0, %xmm2
+; SSE41-NEXT: psllw $5, %xmm1
+; SSE41-NEXT: movdqa %xmm0, %xmm3
+; SSE41-NEXT: psrlw $4, %xmm3
+; SSE41-NEXT: pand {{.*}}(%rip), %xmm3
+; SSE41-NEXT: movdqa %xmm1, %xmm0
+; SSE41-NEXT: pblendvb %xmm0, %xmm3, %xmm2
+; SSE41-NEXT: movdqa %xmm2, %xmm3
+; SSE41-NEXT: psrlw $2, %xmm3
+; SSE41-NEXT: pand {{.*}}(%rip), %xmm3
+; SSE41-NEXT: paddb %xmm1, %xmm1
+; SSE41-NEXT: movdqa %xmm1, %xmm0
+; SSE41-NEXT: pblendvb %xmm0, %xmm3, %xmm2
+; SSE41-NEXT: movdqa %xmm2, %xmm3
+; SSE41-NEXT: psrlw $1, %xmm3
+; SSE41-NEXT: pand {{.*}}(%rip), %xmm3
+; SSE41-NEXT: paddb %xmm1, %xmm1
+; SSE41-NEXT: movdqa %xmm1, %xmm0
+; SSE41-NEXT: pblendvb %xmm0, %xmm3, %xmm2
+; SSE41-NEXT: movdqa %xmm2, %xmm0
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: var_shift_v4i8:
+; AVX: # %bb.0:
+; AVX-NEXT: vpsllw $5, %xmm1, %xmm1
+; AVX-NEXT: vpsrlw $4, %xmm0, %xmm2
+; AVX-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
+; AVX-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
+; AVX-NEXT: vpsrlw $2, %xmm0, %xmm2
+; AVX-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
+; AVX-NEXT: vpaddb %xmm1, %xmm1, %xmm1
+; AVX-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
+; AVX-NEXT: vpsrlw $1, %xmm0, %xmm2
+; AVX-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
+; AVX-NEXT: vpaddb %xmm1, %xmm1, %xmm1
+; AVX-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
+; AVX-NEXT: retq
+;
+; XOP-LABEL: var_shift_v4i8:
+; XOP: # %bb.0:
+; XOP-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; XOP-NEXT: vpsubb %xmm1, %xmm2, %xmm1
+; XOP-NEXT: vpshlb %xmm1, %xmm0, %xmm0
+; XOP-NEXT: retq
+;
+; AVX512DQ-LABEL: var_shift_v4i8:
+; AVX512DQ: # %bb.0:
+; AVX512DQ-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
+; AVX512DQ-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
+; AVX512DQ-NEXT: vpsrlvd %zmm1, %zmm0, %zmm0
+; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512DQ-NEXT: vzeroupper
+; AVX512DQ-NEXT: retq
+;
+; AVX512BW-LABEL: var_shift_v4i8:
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
+; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; AVX512BW-NEXT: vpsrlvw %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
+; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX512BW-NEXT: vzeroupper
+; AVX512BW-NEXT: retq
+;
+; AVX512DQVL-LABEL: var_shift_v4i8:
+; AVX512DQVL: # %bb.0:
+; AVX512DQVL-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
+; AVX512DQVL-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
+; AVX512DQVL-NEXT: vpsrlvd %zmm1, %zmm0, %zmm0
+; AVX512DQVL-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512DQVL-NEXT: vzeroupper
+; AVX512DQVL-NEXT: retq
+;
+; AVX512BWVL-LABEL: var_shift_v4i8:
+; AVX512BWVL: # %bb.0:
+; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
+; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; AVX512BWVL-NEXT: vpsrlvw %ymm1, %ymm0, %ymm0
+; AVX512BWVL-NEXT: vpmovwb %ymm0, %xmm0
+; AVX512BWVL-NEXT: vzeroupper
+; AVX512BWVL-NEXT: retq
;
; X32-SSE-LABEL: var_shift_v4i8:
; X32-SSE: # %bb.0:
-; X32-SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
+; X32-SSE-NEXT: psllw $5, %xmm1
+; X32-SSE-NEXT: pxor %xmm2, %xmm2
+; X32-SSE-NEXT: pxor %xmm3, %xmm3
+; X32-SSE-NEXT: pcmpgtb %xmm1, %xmm3
+; X32-SSE-NEXT: movdqa %xmm3, %xmm4
+; X32-SSE-NEXT: pandn %xmm0, %xmm4
+; X32-SSE-NEXT: psrlw $4, %xmm0
+; X32-SSE-NEXT: pand %xmm3, %xmm0
+; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm0
+; X32-SSE-NEXT: por %xmm4, %xmm0
+; X32-SSE-NEXT: paddb %xmm1, %xmm1
+; X32-SSE-NEXT: pxor %xmm3, %xmm3
+; X32-SSE-NEXT: pcmpgtb %xmm1, %xmm3
+; X32-SSE-NEXT: movdqa %xmm3, %xmm4
+; X32-SSE-NEXT: pandn %xmm0, %xmm4
+; X32-SSE-NEXT: psrlw $2, %xmm0
+; X32-SSE-NEXT: pand %xmm3, %xmm0
+; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm0
+; X32-SSE-NEXT: por %xmm4, %xmm0
+; X32-SSE-NEXT: paddb %xmm1, %xmm1
+; X32-SSE-NEXT: pcmpgtb %xmm1, %xmm2
+; X32-SSE-NEXT: movdqa %xmm2, %xmm1
+; X32-SSE-NEXT: pandn %xmm0, %xmm1
+; X32-SSE-NEXT: psrlw $1, %xmm0
; X32-SSE-NEXT: pand %xmm2, %xmm0
-; X32-SSE-NEXT: pand %xmm2, %xmm1
-; X32-SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm1[2,3,3,3,4,5,6,7]
-; X32-SSE-NEXT: movdqa %xmm0, %xmm3
-; X32-SSE-NEXT: psrld %xmm2, %xmm3
-; X32-SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm1[0,1,1,1,4,5,6,7]
-; X32-SSE-NEXT: movdqa %xmm0, %xmm2
-; X32-SSE-NEXT: psrld %xmm4, %xmm2
-; X32-SSE-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
-; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
-; X32-SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm1[2,3,3,3,4,5,6,7]
-; X32-SSE-NEXT: movdqa %xmm0, %xmm4
-; X32-SSE-NEXT: psrld %xmm3, %xmm4
-; X32-SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,1,1,4,5,6,7]
-; X32-SSE-NEXT: psrld %xmm1, %xmm0
-; X32-SSE-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm4[1]
-; X32-SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,3],xmm0[0,3]
-; X32-SSE-NEXT: movaps %xmm2, %xmm0
+; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm0
+; X32-SSE-NEXT: por %xmm1, %xmm0
; X32-SSE-NEXT: retl
%shift = lshr <4 x i8> %a, %b
  ret <4 x i8> %shift
}

define <2 x i8> @var_shift_v2i8(<2 x i8> %a, <2 x i8> %b) nounwind {
; SSE2-LABEL: var_shift_v2i8:
; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
-; SSE2-NEXT: pand %xmm2, %xmm1
+; SSE2-NEXT: psllw $5, %xmm1
+; SSE2-NEXT: pxor %xmm2, %xmm2
+; SSE2-NEXT: pxor %xmm3, %xmm3
+; SSE2-NEXT: pcmpgtb %xmm1, %xmm3
+; SSE2-NEXT: movdqa %xmm3, %xmm4
+; SSE2-NEXT: pandn %xmm0, %xmm4
+; SSE2-NEXT: psrlw $4, %xmm0
+; SSE2-NEXT: pand %xmm3, %xmm0
+; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
+; SSE2-NEXT: por %xmm4, %xmm0
+; SSE2-NEXT: paddb %xmm1, %xmm1
+; SSE2-NEXT: pxor %xmm3, %xmm3
+; SSE2-NEXT: pcmpgtb %xmm1, %xmm3
+; SSE2-NEXT: movdqa %xmm3, %xmm4
+; SSE2-NEXT: pandn %xmm0, %xmm4
+; SSE2-NEXT: psrlw $2, %xmm0
+; SSE2-NEXT: pand %xmm3, %xmm0
+; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
+; SSE2-NEXT: por %xmm4, %xmm0
+; SSE2-NEXT: paddb %xmm1, %xmm1
+; SSE2-NEXT: pcmpgtb %xmm1, %xmm2
+; SSE2-NEXT: movdqa %xmm2, %xmm1
+; SSE2-NEXT: pandn %xmm0, %xmm1
+; SSE2-NEXT: psrlw $1, %xmm0
; SSE2-NEXT: pand %xmm2, %xmm0
-; SSE2-NEXT: movdqa %xmm0, %xmm2
-; SSE2-NEXT: psrlq %xmm1, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
-; SSE2-NEXT: psrlq %xmm1, %xmm0
-; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1]
+; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
+; SSE2-NEXT: por %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: var_shift_v2i8:
; SSE41: # %bb.0:
-; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
-; SSE41-NEXT: pand %xmm2, %xmm1
-; SSE41-NEXT: pand %xmm2, %xmm0
; SSE41-NEXT: movdqa %xmm0, %xmm2
-; SSE41-NEXT: psrlq %xmm1, %xmm2
-; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
-; SSE41-NEXT: psrlq %xmm1, %xmm0
-; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm2[0,1,2,3],xmm0[4,5,6,7]
+; SSE41-NEXT: psllw $5, %xmm1
+; SSE41-NEXT: movdqa %xmm0, %xmm3
+; SSE41-NEXT: psrlw $4, %xmm3
+; SSE41-NEXT: pand {{.*}}(%rip), %xmm3
+; SSE41-NEXT: movdqa %xmm1, %xmm0
+; SSE41-NEXT: pblendvb %xmm0, %xmm3, %xmm2
+; SSE41-NEXT: movdqa %xmm2, %xmm3
+; SSE41-NEXT: psrlw $2, %xmm3
+; SSE41-NEXT: pand {{.*}}(%rip), %xmm3
+; SSE41-NEXT: paddb %xmm1, %xmm1
+; SSE41-NEXT: movdqa %xmm1, %xmm0
+; SSE41-NEXT: pblendvb %xmm0, %xmm3, %xmm2
+; SSE41-NEXT: movdqa %xmm2, %xmm3
+; SSE41-NEXT: psrlw $1, %xmm3
+; SSE41-NEXT: pand {{.*}}(%rip), %xmm3
+; SSE41-NEXT: paddb %xmm1, %xmm1
+; SSE41-NEXT: movdqa %xmm1, %xmm0
+; SSE41-NEXT: pblendvb %xmm0, %xmm3, %xmm2
+; SSE41-NEXT: movdqa %xmm2, %xmm0
; SSE41-NEXT: retq
;
-; AVX1-LABEL: var_shift_v2i8:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vmovddup {{.*#+}} xmm2 = [1.2598673968951787E-321,1.2598673968951787E-321]
-; AVX1-NEXT: # xmm2 = mem[0,0]
-; AVX1-NEXT: vandps %xmm2, %xmm1, %xmm1
-; AVX1-NEXT: vandps %xmm2, %xmm0, %xmm0
-; AVX1-NEXT: vpsrlq %xmm1, %xmm0, %xmm2
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
-; AVX1-NEXT: vpsrlq %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1,2,3],xmm0[4,5,6,7]
-; AVX1-NEXT: retq
+; AVX-LABEL: var_shift_v2i8:
+; AVX: # %bb.0:
+; AVX-NEXT: vpsllw $5, %xmm1, %xmm1
+; AVX-NEXT: vpsrlw $4, %xmm0, %xmm2
+; AVX-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
+; AVX-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
+; AVX-NEXT: vpsrlw $2, %xmm0, %xmm2
+; AVX-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
+; AVX-NEXT: vpaddb %xmm1, %xmm1, %xmm1
+; AVX-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
+; AVX-NEXT: vpsrlw $1, %xmm0, %xmm2
+; AVX-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
+; AVX-NEXT: vpaddb %xmm1, %xmm1, %xmm1
+; AVX-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
+; AVX-NEXT: retq
;
-; AVX2-LABEL: var_shift_v2i8:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm2 = [255,255]
-; AVX2-NEXT: vpand %xmm2, %xmm1, %xmm1
-; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0
-; AVX2-NEXT: vpsrlvq %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: retq
+; XOP-LABEL: var_shift_v2i8:
+; XOP: # %bb.0:
+; XOP-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; XOP-NEXT: vpsubb %xmm1, %xmm2, %xmm1
+; XOP-NEXT: vpshlb %xmm1, %xmm0, %xmm0
+; XOP-NEXT: retq
;
-; XOPAVX1-LABEL: var_shift_v2i8:
-; XOPAVX1: # %bb.0:
-; XOPAVX1-NEXT: vmovddup {{.*#+}} xmm2 = [1.2598673968951787E-321,1.2598673968951787E-321]
-; XOPAVX1-NEXT: # xmm2 = mem[0,0]
-; XOPAVX1-NEXT: vandps %xmm2, %xmm0, %xmm0
-; XOPAVX1-NEXT: vandps %xmm2, %xmm1, %xmm1
-; XOPAVX1-NEXT: vxorps %xmm2, %xmm2, %xmm2
-; XOPAVX1-NEXT: vpsubq %xmm1, %xmm2, %xmm1
-; XOPAVX1-NEXT: vpshlq %xmm1, %xmm0, %xmm0
-; XOPAVX1-NEXT: retq
+; AVX512DQ-LABEL: var_shift_v2i8:
+; AVX512DQ: # %bb.0:
+; AVX512DQ-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
+; AVX512DQ-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
+; AVX512DQ-NEXT: vpsrlvd %zmm1, %zmm0, %zmm0
+; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512DQ-NEXT: vzeroupper
+; AVX512DQ-NEXT: retq
;
-; XOPAVX2-LABEL: var_shift_v2i8:
-; XOPAVX2: # %bb.0:
-; XOPAVX2-NEXT: vpbroadcastq {{.*#+}} xmm2 = [255,255]
-; XOPAVX2-NEXT: vpand %xmm2, %xmm1, %xmm1
-; XOPAVX2-NEXT: vpand %xmm2, %xmm0, %xmm0
-; XOPAVX2-NEXT: vpsrlvq %xmm1, %xmm0, %xmm0
-; XOPAVX2-NEXT: retq
+; AVX512BW-LABEL: var_shift_v2i8:
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
+; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; AVX512BW-NEXT: vpsrlvw %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
+; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX512BW-NEXT: vzeroupper
+; AVX512BW-NEXT: retq
;
-; AVX512-LABEL: var_shift_v2i8:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vpbroadcastq {{.*#+}} xmm2 = [255,255]
-; AVX512-NEXT: vpand %xmm2, %xmm1, %xmm1
-; AVX512-NEXT: vpand %xmm2, %xmm0, %xmm0
-; AVX512-NEXT: vpsrlvq %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: retq
+; AVX512DQVL-LABEL: var_shift_v2i8:
+; AVX512DQVL: # %bb.0:
+; AVX512DQVL-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
+; AVX512DQVL-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
+; AVX512DQVL-NEXT: vpsrlvd %zmm1, %zmm0, %zmm0
+; AVX512DQVL-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512DQVL-NEXT: vzeroupper
+; AVX512DQVL-NEXT: retq
;
-; AVX512VL-LABEL: var_shift_v2i8:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpbroadcastq {{.*#+}} xmm2 = [255,255]
-; AVX512VL-NEXT: vpand %xmm2, %xmm1, %xmm1
-; AVX512VL-NEXT: vpand %xmm2, %xmm0, %xmm0
-; AVX512VL-NEXT: vpsrlvq %xmm1, %xmm0, %xmm0
-; AVX512VL-NEXT: retq
+; AVX512BWVL-LABEL: var_shift_v2i8:
+; AVX512BWVL: # %bb.0:
+; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
+; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; AVX512BWVL-NEXT: vpsrlvw %ymm1, %ymm0, %ymm0
+; AVX512BWVL-NEXT: vpmovwb %ymm0, %xmm0
+; AVX512BWVL-NEXT: vzeroupper
+; AVX512BWVL-NEXT: retq
;
; X32-SSE-LABEL: var_shift_v2i8:
; X32-SSE: # %bb.0:
-; X32-SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
-; X32-SSE-NEXT: pand %xmm2, %xmm1
+; X32-SSE-NEXT: psllw $5, %xmm1
+; X32-SSE-NEXT: pxor %xmm2, %xmm2
+; X32-SSE-NEXT: pxor %xmm3, %xmm3
+; X32-SSE-NEXT: pcmpgtb %xmm1, %xmm3
+; X32-SSE-NEXT: movdqa %xmm3, %xmm4
+; X32-SSE-NEXT: pandn %xmm0, %xmm4
+; X32-SSE-NEXT: psrlw $4, %xmm0
+; X32-SSE-NEXT: pand %xmm3, %xmm0
+; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm0
+; X32-SSE-NEXT: por %xmm4, %xmm0
+; X32-SSE-NEXT: paddb %xmm1, %xmm1
+; X32-SSE-NEXT: pxor %xmm3, %xmm3
+; X32-SSE-NEXT: pcmpgtb %xmm1, %xmm3
+; X32-SSE-NEXT: movdqa %xmm3, %xmm4
+; X32-SSE-NEXT: pandn %xmm0, %xmm4
+; X32-SSE-NEXT: psrlw $2, %xmm0
+; X32-SSE-NEXT: pand %xmm3, %xmm0
+; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm0
+; X32-SSE-NEXT: por %xmm4, %xmm0
+; X32-SSE-NEXT: paddb %xmm1, %xmm1
+; X32-SSE-NEXT: pcmpgtb %xmm1, %xmm2
+; X32-SSE-NEXT: movdqa %xmm2, %xmm1
+; X32-SSE-NEXT: pandn %xmm0, %xmm1
+; X32-SSE-NEXT: psrlw $1, %xmm0
; X32-SSE-NEXT: pand %xmm2, %xmm0
-; X32-SSE-NEXT: movdqa %xmm0, %xmm2
-; X32-SSE-NEXT: psrlq %xmm1, %xmm2
-; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
-; X32-SSE-NEXT: psrlq %xmm1, %xmm0
-; X32-SSE-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1]
+; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm0
+; X32-SSE-NEXT: por %xmm1, %xmm0
; X32-SSE-NEXT: retl
%shift = lshr <2 x i8> %a, %b
  ret <2 x i8> %shift
}

define <2 x i32> @splatvar_shift_v2i32(<2 x i32> %a, <2 x i32> %b) nounwind {
; SSE2-LABEL: splatvar_shift_v2i32:
; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [4294967295,0,4294967295,0]
-; SSE2-NEXT: pand %xmm2, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
-; SSE2-NEXT: pand %xmm2, %xmm1
-; SSE2-NEXT: movdqa %xmm0, %xmm2
-; SSE2-NEXT: psrlq %xmm1, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
-; SSE2-NEXT: psrlq %xmm1, %xmm0
-; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1]
+; SSE2-NEXT: xorps %xmm2, %xmm2
+; SSE2-NEXT: movss {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3]
+; SSE2-NEXT: psrld %xmm2, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: splatvar_shift_v2i32:
; SSE41: # %bb.0:
-; SSE41-NEXT: pxor %xmm2, %xmm2
-; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
-; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
-; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
-; SSE41-NEXT: movdqa %xmm0, %xmm2
-; SSE41-NEXT: psrlq %xmm1, %xmm2
-; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
-; SSE41-NEXT: psrlq %xmm1, %xmm0
-; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm2[0,1,2,3],xmm0[4,5,6,7]
+; SSE41-NEXT: pmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
+; SSE41-NEXT: psrld %xmm1, %xmm0
; SSE41-NEXT: retq
;
-; AVX1-LABEL: splatvar_shift_v2i32:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
-; AVX1-NEXT: vpsrlq %xmm1, %xmm0, %xmm2
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
-; AVX1-NEXT: vpsrlq %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1,2,3],xmm0[4,5,6,7]
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: splatvar_shift_v2i32:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
-; AVX2-NEXT: vpbroadcastq %xmm1, %xmm1
-; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
-; AVX2-NEXT: vpsrlvq %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: retq
-;
-; XOPAVX1-LABEL: splatvar_shift_v2i32:
-; XOPAVX1: # %bb.0:
-; XOPAVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; XOPAVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
-; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
-; XOPAVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
-; XOPAVX1-NEXT: vpsubq %xmm1, %xmm2, %xmm1
-; XOPAVX1-NEXT: vpshlq %xmm1, %xmm0, %xmm0
-; XOPAVX1-NEXT: retq
+; AVX-LABEL: splatvar_shift_v2i32:
+; AVX: # %bb.0:
+; AVX-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
+; AVX-NEXT: vpsrld %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retq
;
-; XOPAVX2-LABEL: splatvar_shift_v2i32:
-; XOPAVX2: # %bb.0:
-; XOPAVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; XOPAVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
-; XOPAVX2-NEXT: vpbroadcastq %xmm1, %xmm1
-; XOPAVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
-; XOPAVX2-NEXT: vpsrlvq %xmm1, %xmm0, %xmm0
-; XOPAVX2-NEXT: retq
+; XOP-LABEL: splatvar_shift_v2i32:
+; XOP: # %bb.0:
+; XOP-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
+; XOP-NEXT: vpsrld %xmm1, %xmm0, %xmm0
+; XOP-NEXT: retq
;
; AVX512-LABEL: splatvar_shift_v2i32:
; AVX512: # %bb.0:
-; AVX512-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX512-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
-; AVX512-NEXT: vpbroadcastq %xmm1, %xmm1
-; AVX512-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
-; AVX512-NEXT: vpsrlvq %xmm1, %xmm0, %xmm0
+; AVX512-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
+; AVX512-NEXT: vpsrld %xmm1, %xmm0, %xmm0
; AVX512-NEXT: retq
;
; AVX512VL-LABEL: splatvar_shift_v2i32:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
-; AVX512VL-NEXT: vpbroadcastq %xmm1, %xmm1
-; AVX512VL-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
-; AVX512VL-NEXT: vpsrlvq %xmm1, %xmm0, %xmm0
+; AVX512VL-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
+; AVX512VL-NEXT: vpsrld %xmm1, %xmm0, %xmm0
; AVX512VL-NEXT: retq
;
; X32-SSE-LABEL: splatvar_shift_v2i32:
; X32-SSE: # %bb.0:
-; X32-SSE-NEXT: movdqa {{.*#+}} xmm2 = [4294967295,0,4294967295,0]
-; X32-SSE-NEXT: pand %xmm2, %xmm0
-; X32-SSE-NEXT: pand %xmm1, %xmm2
-; X32-SSE-NEXT: movdqa %xmm0, %xmm3
-; X32-SSE-NEXT: psrlq %xmm2, %xmm3
-; X32-SSE-NEXT: pxor %xmm2, %xmm2
+; X32-SSE-NEXT: xorps %xmm2, %xmm2
; X32-SSE-NEXT: movss {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3]
-; X32-SSE-NEXT: psrlq %xmm2, %xmm0
-; X32-SSE-NEXT: movsd {{.*#+}} xmm0 = xmm3[0],xmm0[1]
+; X32-SSE-NEXT: psrld %xmm2, %xmm0
; X32-SSE-NEXT: retl
%splat = shufflevector <2 x i32> %b, <2 x i32> undef, <2 x i32> zeroinitializer
  %shift = lshr <2 x i32> %a, %splat
  ret <2 x i32> %shift
}

define <4 x i16> @splatvar_shift_v4i16(<4 x i16> %a, <4 x i16> %b) nounwind {
; SSE2-LABEL: splatvar_shift_v4i16:
; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [65535,0,65535,0,65535,0,65535,0]
-; SSE2-NEXT: pand %xmm2, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[0,0,0,0]
-; SSE2-NEXT: pand %xmm2, %xmm3
-; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm3[2,3,3,3,4,5,6,7]
-; SSE2-NEXT: movdqa %xmm0, %xmm2
-; SSE2-NEXT: psrld %xmm1, %xmm2
-; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm3[0,1,1,1,4,5,6,7]
-; SSE2-NEXT: movdqa %xmm0, %xmm1
-; SSE2-NEXT: psrld %xmm4, %xmm1
-; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[2,3,0,1]
-; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm2[2,3,3,3,4,5,6,7]
-; SSE2-NEXT: movdqa %xmm0, %xmm4
-; SSE2-NEXT: psrld %xmm3, %xmm4
-; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,1,1,4,5,6,7]
-; SSE2-NEXT: psrld %xmm2, %xmm0
-; SSE2-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm4[1]
-; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm0[0,3]
-; SSE2-NEXT: movaps %xmm1, %xmm0
+; SSE2-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1]
+; SSE2-NEXT: psrldq {{.*#+}} xmm1 = xmm1[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; SSE2-NEXT: psrlw %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: splatvar_shift_v4i16:
; SSE41: # %bb.0:
-; SSE41-NEXT: pxor %xmm2, %xmm2
-; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4],xmm2[5],xmm0[6],xmm2[7]
-; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
-; SSE41-NEXT: movdqa %xmm1, %xmm3
-; SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0],xmm2[1],xmm3[2],xmm2[3],xmm3[4],xmm2[5],xmm3[6],xmm2[7]
-; SSE41-NEXT: pshuflw {{.*#+}} xmm4 = xmm3[2,3,3,3,4,5,6,7]
-; SSE41-NEXT: movdqa %xmm0, %xmm5
-; SSE41-NEXT: psrld %xmm4, %xmm5
-; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,3,0,1]
-; SSE41-NEXT: pshuflw {{.*#+}} xmm4 = xmm3[2,3,3,3,4,5,6,7]
-; SSE41-NEXT: movdqa %xmm0, %xmm6
-; SSE41-NEXT: psrld %xmm4, %xmm6
-; SSE41-NEXT: pblendw {{.*#+}} xmm6 = xmm5[0,1,2,3],xmm6[4,5,6,7]
-; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3],xmm1[4],xmm2[5],xmm1[6],xmm2[7]
-; SSE41-NEXT: movdqa %xmm0, %xmm2
-; SSE41-NEXT: psrld %xmm1, %xmm2
-; SSE41-NEXT: pshuflw {{.*#+}} xmm1 = xmm3[0,1,1,1,4,5,6,7]
-; SSE41-NEXT: psrld %xmm1, %xmm0
-; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm2[0,1,2,3],xmm0[4,5,6,7]
-; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm6[2,3],xmm0[4,5],xmm6[6,7]
+; SSE41-NEXT: pmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
+; SSE41-NEXT: psrlw %xmm1, %xmm0
; SSE41-NEXT: retq
;
-; AVX1-LABEL: splatvar_shift_v4i16:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4],xmm2[5],xmm0[6],xmm2[7]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4],xmm2[5],xmm1[6],xmm2[7]
-; AVX1-NEXT: vpsrldq {{.*#+}} xmm3 = xmm1[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX1-NEXT: vpsrld %xmm3, %xmm0, %xmm3
-; AVX1-NEXT: vpsrlq $32, %xmm1, %xmm4
-; AVX1-NEXT: vpsrld %xmm4, %xmm0, %xmm4
-; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2,3],xmm3[4,5,6,7]
-; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm1[2],xmm2[2],xmm1[3],xmm2[3]
-; AVX1-NEXT: vpsrld %xmm2, %xmm0, %xmm2
-; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
-; AVX1-NEXT: vpsrld %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4,5,6,7]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm3[2,3],xmm0[4,5],xmm3[6,7]
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: splatvar_shift_v4i16:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4],xmm2[5],xmm0[6],xmm2[7]
-; AVX2-NEXT: vpbroadcastd %xmm1, %xmm1
-; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4],xmm2[5],xmm1[6],xmm2[7]
-; AVX2-NEXT: vpsrlvd %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: retq
-;
-; XOPAVX1-LABEL: splatvar_shift_v4i16:
-; XOPAVX1: # %bb.0:
-; XOPAVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; XOPAVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4],xmm2[5],xmm0[6],xmm2[7]
-; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
-; XOPAVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4],xmm2[5],xmm1[6],xmm2[7]
-; XOPAVX1-NEXT: vpsubd %xmm1, %xmm2, %xmm1
-; XOPAVX1-NEXT: vpshld %xmm1, %xmm0, %xmm0
-; XOPAVX1-NEXT: retq
+; AVX-LABEL: splatvar_shift_v4i16:
+; AVX: # %bb.0:
+; AVX-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
+; AVX-NEXT: vpsrlw %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retq
;
-; XOPAVX2-LABEL: splatvar_shift_v4i16:
-; XOPAVX2: # %bb.0:
-; XOPAVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; XOPAVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4],xmm2[5],xmm0[6],xmm2[7]
-; XOPAVX2-NEXT: vpbroadcastd %xmm1, %xmm1
-; XOPAVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4],xmm2[5],xmm1[6],xmm2[7]
-; XOPAVX2-NEXT: vpsrlvd %xmm1, %xmm0, %xmm0
-; XOPAVX2-NEXT: retq
+; XOP-LABEL: splatvar_shift_v4i16:
+; XOP: # %bb.0:
+; XOP-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
+; XOP-NEXT: vpsrlw %xmm1, %xmm0, %xmm0
+; XOP-NEXT: retq
;
; AVX512-LABEL: splatvar_shift_v4i16:
; AVX512: # %bb.0:
-; AVX512-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX512-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4],xmm2[5],xmm0[6],xmm2[7]
-; AVX512-NEXT: vpbroadcastd %xmm1, %xmm1
-; AVX512-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4],xmm2[5],xmm1[6],xmm2[7]
-; AVX512-NEXT: vpsrlvd %xmm1, %xmm0, %xmm0
+; AVX512-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
+; AVX512-NEXT: vpsrlw %xmm1, %xmm0, %xmm0
; AVX512-NEXT: retq
;
; AVX512VL-LABEL: splatvar_shift_v4i16:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX512VL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4],xmm2[5],xmm0[6],xmm2[7]
-; AVX512VL-NEXT: vpbroadcastd %xmm1, %xmm1
-; AVX512VL-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4],xmm2[5],xmm1[6],xmm2[7]
-; AVX512VL-NEXT: vpsrlvd %xmm1, %xmm0, %xmm0
+; AVX512VL-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
+; AVX512VL-NEXT: vpsrlw %xmm1, %xmm0, %xmm0
; AVX512VL-NEXT: retq
;
; X32-SSE-LABEL: splatvar_shift_v4i16:
; X32-SSE: # %bb.0:
-; X32-SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,0,65535,0,65535,0,65535,0]
-; X32-SSE-NEXT: pand %xmm2, %xmm0
-; X32-SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[0,0,0,0]
-; X32-SSE-NEXT: pand %xmm2, %xmm3
-; X32-SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm3[2,3,3,3,4,5,6,7]
-; X32-SSE-NEXT: movdqa %xmm0, %xmm2
-; X32-SSE-NEXT: psrld %xmm1, %xmm2
-; X32-SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm3[0,1,1,1,4,5,6,7]
-; X32-SSE-NEXT: movdqa %xmm0, %xmm1
-; X32-SSE-NEXT: psrld %xmm4, %xmm1
-; X32-SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
-; X32-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[2,3,0,1]
-; X32-SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm2[2,3,3,3,4,5,6,7]
-; X32-SSE-NEXT: movdqa %xmm0, %xmm4
-; X32-SSE-NEXT: psrld %xmm3, %xmm4
-; X32-SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,1,1,4,5,6,7]
-; X32-SSE-NEXT: psrld %xmm2, %xmm0
-; X32-SSE-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm4[1]
-; X32-SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm0[0,3]
-; X32-SSE-NEXT: movaps %xmm1, %xmm0
+; X32-SSE-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1]
+; X32-SSE-NEXT: psrldq {{.*#+}} xmm1 = xmm1[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; X32-SSE-NEXT: psrlw %xmm1, %xmm0
; X32-SSE-NEXT: retl
%splat = shufflevector <4 x i16> %b, <4 x i16> undef, <4 x i32> zeroinitializer
  %shift = lshr <4 x i16> %a, %splat
  ret <4 x i16> %shift
}

define <2 x i16> @splatvar_shift_v2i16(<2 x i16> %a, <2 x i16> %b) nounwind {
; SSE2-LABEL: splatvar_shift_v2i16:
; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [65535,0,0,0,65535,0,0,0]
-; SSE2-NEXT: pand %xmm2, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
-; SSE2-NEXT: pand %xmm2, %xmm1
-; SSE2-NEXT: movdqa %xmm0, %xmm2
-; SSE2-NEXT: psrlq %xmm1, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
-; SSE2-NEXT: psrlq %xmm1, %xmm0
-; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1]
+; SSE2-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1]
+; SSE2-NEXT: psrldq {{.*#+}} xmm1 = xmm1[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; SSE2-NEXT: psrlw %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: splatvar_shift_v2i16:
; SSE41: # %bb.0:
-; SSE41-NEXT: pxor %xmm2, %xmm2
-; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3],xmm0[4],xmm2[5,6,7]
-; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
-; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3],xmm1[4],xmm2[5,6,7]
-; SSE41-NEXT: movdqa %xmm0, %xmm2
-; SSE41-NEXT: psrlq %xmm1, %xmm2
-; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
-; SSE41-NEXT: psrlq %xmm1, %xmm0
-; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm2[0,1,2,3],xmm0[4,5,6,7]
+; SSE41-NEXT: pmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
+; SSE41-NEXT: psrlw %xmm1, %xmm0
; SSE41-NEXT: retq
;
-; AVX1-LABEL: splatvar_shift_v2i16:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3],xmm0[4],xmm2[5,6,7]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3],xmm1[4],xmm2[5,6,7]
-; AVX1-NEXT: vpsrlq %xmm1, %xmm0, %xmm2
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
-; AVX1-NEXT: vpsrlq %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1,2,3],xmm0[4,5,6,7]
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: splatvar_shift_v2i16:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3],xmm0[4],xmm2[5,6,7]
-; AVX2-NEXT: vpbroadcastq %xmm1, %xmm1
-; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3],xmm1[4],xmm2[5,6,7]
-; AVX2-NEXT: vpsrlvq %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: retq
-;
-; XOPAVX1-LABEL: splatvar_shift_v2i16:
-; XOPAVX1: # %bb.0:
-; XOPAVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; XOPAVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3],xmm0[4],xmm2[5,6,7]
-; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
-; XOPAVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3],xmm1[4],xmm2[5,6,7]
-; XOPAVX1-NEXT: vpsubq %xmm1, %xmm2, %xmm1
-; XOPAVX1-NEXT: vpshlq %xmm1, %xmm0, %xmm0
-; XOPAVX1-NEXT: retq
+; AVX-LABEL: splatvar_shift_v2i16:
+; AVX: # %bb.0:
+; AVX-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
+; AVX-NEXT: vpsrlw %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retq
;
-; XOPAVX2-LABEL: splatvar_shift_v2i16:
-; XOPAVX2: # %bb.0:
-; XOPAVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; XOPAVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3],xmm0[4],xmm2[5,6,7]
-; XOPAVX2-NEXT: vpbroadcastq %xmm1, %xmm1
-; XOPAVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3],xmm1[4],xmm2[5,6,7]
-; XOPAVX2-NEXT: vpsrlvq %xmm1, %xmm0, %xmm0
-; XOPAVX2-NEXT: retq
+; XOP-LABEL: splatvar_shift_v2i16:
+; XOP: # %bb.0:
+; XOP-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
+; XOP-NEXT: vpsrlw %xmm1, %xmm0, %xmm0
+; XOP-NEXT: retq
;
; AVX512-LABEL: splatvar_shift_v2i16:
; AVX512: # %bb.0:
-; AVX512-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX512-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3],xmm0[4],xmm2[5,6,7]
-; AVX512-NEXT: vpbroadcastq %xmm1, %xmm1
-; AVX512-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3],xmm1[4],xmm2[5,6,7]
-; AVX512-NEXT: vpsrlvq %xmm1, %xmm0, %xmm0
+; AVX512-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
+; AVX512-NEXT: vpsrlw %xmm1, %xmm0, %xmm0
; AVX512-NEXT: retq
;
; AVX512VL-LABEL: splatvar_shift_v2i16:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX512VL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3],xmm0[4],xmm2[5,6,7]
-; AVX512VL-NEXT: vpbroadcastq %xmm1, %xmm1
-; AVX512VL-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3],xmm1[4],xmm2[5,6,7]
-; AVX512VL-NEXT: vpsrlvq %xmm1, %xmm0, %xmm0
+; AVX512VL-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
+; AVX512VL-NEXT: vpsrlw %xmm1, %xmm0, %xmm0
; AVX512VL-NEXT: retq
;
-; X32-SSE-LABEL: splatvar_shift_v2i16:
-; X32-SSE: # %bb.0:
-; X32-SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,0,0,0,65535,0,0,0]
-; X32-SSE-NEXT: pand %xmm2, %xmm0
-; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
-; X32-SSE-NEXT: pand %xmm2, %xmm1
-; X32-SSE-NEXT: movdqa %xmm0, %xmm2
-; X32-SSE-NEXT: psrlq %xmm1, %xmm2
-; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
-; X32-SSE-NEXT: psrlq %xmm1, %xmm0
-; X32-SSE-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1]
-; X32-SSE-NEXT: retl
- %splat = shufflevector <2 x i16> %b, <2 x i16> undef, <2 x i32> zeroinitializer
- %shift = lshr <2 x i16> %a, %splat
- ret <2 x i16> %shift
-}
-
-define <8 x i8> @splatvar_shift_v8i8(<8 x i8> %a, <8 x i8> %b) nounwind {
-; SSE2-LABEL: splatvar_shift_v8i8:
-; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa %xmm0, %xmm2
-; SSE2-NEXT: pand {{.*}}(%rip), %xmm2
-; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[0,0,2,3,4,5,6,7]
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,0,0]
-; SSE2-NEXT: psllw $12, %xmm1
-; SSE2-NEXT: movdqa %xmm1, %xmm0
-; SSE2-NEXT: psraw $15, %xmm0
-; SSE2-NEXT: pandn %xmm2, %xmm0
-; SSE2-NEXT: paddw %xmm1, %xmm1
-; SSE2-NEXT: movdqa %xmm1, %xmm2
-; SSE2-NEXT: psraw $15, %xmm2
-; SSE2-NEXT: movdqa %xmm2, %xmm3
-; SSE2-NEXT: pandn %xmm0, %xmm3
-; SSE2-NEXT: psrlw $4, %xmm0
-; SSE2-NEXT: pand %xmm2, %xmm0
-; SSE2-NEXT: por %xmm3, %xmm0
-; SSE2-NEXT: paddw %xmm1, %xmm1
-; SSE2-NEXT: movdqa %xmm1, %xmm2
-; SSE2-NEXT: psraw $15, %xmm2
-; SSE2-NEXT: movdqa %xmm2, %xmm3
-; SSE2-NEXT: pandn %xmm0, %xmm3
-; SSE2-NEXT: psrlw $2, %xmm0
-; SSE2-NEXT: pand %xmm2, %xmm0
-; SSE2-NEXT: por %xmm3, %xmm0
-; SSE2-NEXT: paddw %xmm1, %xmm1
-; SSE2-NEXT: psraw $15, %xmm1
-; SSE2-NEXT: movdqa %xmm1, %xmm2
-; SSE2-NEXT: pandn %xmm0, %xmm2
-; SSE2-NEXT: psrlw $1, %xmm0
+; X32-SSE-LABEL: splatvar_shift_v2i16:
+; X32-SSE: # %bb.0:
+; X32-SSE-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1]
+; X32-SSE-NEXT: psrldq {{.*#+}} xmm1 = xmm1[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; X32-SSE-NEXT: psrlw %xmm1, %xmm0
+; X32-SSE-NEXT: retl
+ %splat = shufflevector <2 x i16> %b, <2 x i16> undef, <2 x i32> zeroinitializer
+ %shift = lshr <2 x i16> %a, %splat
+ ret <2 x i16> %shift
+}
+
+define <8 x i8> @splatvar_shift_v8i8(<8 x i8> %a, <8 x i8> %b) nounwind {
+; SSE2-LABEL: splatvar_shift_v8i8:
+; SSE2: # %bb.0:
+; SSE2-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0]
+; SSE2-NEXT: psrldq {{.*#+}} xmm1 = xmm1[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; SSE2-NEXT: psrlw %xmm1, %xmm0
+; SSE2-NEXT: pcmpeqd %xmm2, %xmm2
+; SSE2-NEXT: psrlw %xmm1, %xmm2
+; SSE2-NEXT: psrlw $8, %xmm2
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm2[0,0,2,3,4,5,6,7]
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
; SSE2-NEXT: pand %xmm1, %xmm0
-; SSE2-NEXT: por %xmm2, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: splatvar_shift_v8i8:
; SSE41: # %bb.0:
-; SSE41-NEXT: movdqa %xmm1, %xmm2
-; SSE41-NEXT: movdqa %xmm0, %xmm1
-; SSE41-NEXT: pand {{.*}}(%rip), %xmm1
-; SSE41-NEXT: pshufb {{.*#+}} xmm2 = xmm2[0],zero,xmm2[0],zero,xmm2[0],zero,xmm2[0],zero,xmm2[0],zero,xmm2[0],zero,xmm2[0],zero,xmm2[0],zero
-; SSE41-NEXT: movdqa %xmm2, %xmm0
-; SSE41-NEXT: psllw $12, %xmm0
-; SSE41-NEXT: psllw $4, %xmm2
-; SSE41-NEXT: por %xmm0, %xmm2
-; SSE41-NEXT: movdqa %xmm2, %xmm3
-; SSE41-NEXT: paddw %xmm2, %xmm3
-; SSE41-NEXT: pxor %xmm4, %xmm4
-; SSE41-NEXT: movdqa %xmm2, %xmm0
-; SSE41-NEXT: pblendvb %xmm0, %xmm4, %xmm1
-; SSE41-NEXT: movdqa %xmm1, %xmm2
-; SSE41-NEXT: psrlw $4, %xmm2
-; SSE41-NEXT: movdqa %xmm3, %xmm0
-; SSE41-NEXT: pblendvb %xmm0, %xmm2, %xmm1
-; SSE41-NEXT: movdqa %xmm1, %xmm2
-; SSE41-NEXT: psrlw $2, %xmm2
-; SSE41-NEXT: paddw %xmm3, %xmm3
-; SSE41-NEXT: movdqa %xmm3, %xmm0
-; SSE41-NEXT: pblendvb %xmm0, %xmm2, %xmm1
-; SSE41-NEXT: movdqa %xmm1, %xmm2
-; SSE41-NEXT: psrlw $1, %xmm2
-; SSE41-NEXT: paddw %xmm3, %xmm3
-; SSE41-NEXT: movdqa %xmm3, %xmm0
-; SSE41-NEXT: pblendvb %xmm0, %xmm2, %xmm1
-; SSE41-NEXT: movdqa %xmm1, %xmm0
+; SSE41-NEXT: pmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
+; SSE41-NEXT: psrlw %xmm1, %xmm0
+; SSE41-NEXT: pcmpeqd %xmm2, %xmm2
+; SSE41-NEXT: psrlw %xmm1, %xmm2
+; SSE41-NEXT: pshufb {{.*#+}} xmm2 = xmm2[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; SSE41-NEXT: pand %xmm2, %xmm0
; SSE41-NEXT: retq
;
; AVX1-LABEL: splatvar_shift_v8i8:
; AVX1: # %bb.0:
-; AVX1-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
-; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero
-; AVX1-NEXT: vpsllw $12, %xmm1, %xmm2
-; AVX1-NEXT: vpsllw $4, %xmm1, %xmm1
-; AVX1-NEXT: vpor %xmm2, %xmm1, %xmm1
-; AVX1-NEXT: vpaddw %xmm1, %xmm1, %xmm2
-; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; AVX1-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0
-; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm1
-; AVX1-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpsrlw $2, %xmm0, %xmm1
-; AVX1-NEXT: vpaddw %xmm2, %xmm2, %xmm2
-; AVX1-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpsrlw $1, %xmm0, %xmm1
-; AVX1-NEXT: vpaddw %xmm2, %xmm2, %xmm2
-; AVX1-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
+; AVX1-NEXT: vpsrlw %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
+; AVX1-NEXT: vpsrlw %xmm1, %xmm2, %xmm1
+; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: splatvar_shift_v8i8:
; AVX2: # %bb.0:
-; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
-; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0
-; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero
-; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
-; AVX2-NEXT: vpsrlvd %ymm1, %ymm0, %ymm0
-; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
-; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
-; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
+; AVX2-NEXT: vpsrlw %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
+; AVX2-NEXT: vpsrlw %xmm1, %xmm2, %xmm1
+; AVX2-NEXT: vpsrlw $8, %xmm1, %xmm1
+; AVX2-NEXT: vpbroadcastb %xmm1, %xmm1
+; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX2-NEXT: retq
;
; XOPAVX1-LABEL: splatvar_shift_v8i8:
; XOPAVX1: # %bb.0:
-; XOPAVX1-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
-; XOPAVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero
+; XOPAVX1-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; XOPAVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7]
; XOPAVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; XOPAVX1-NEXT: vpsubw %xmm1, %xmm2, %xmm1
-; XOPAVX1-NEXT: vpshlw %xmm1, %xmm0, %xmm0
+; XOPAVX1-NEXT: vpsubb %xmm1, %xmm2, %xmm1
+; XOPAVX1-NEXT: vpshlb %xmm1, %xmm0, %xmm0
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: splatvar_shift_v8i8:
; XOPAVX2: # %bb.0:
-; XOPAVX2-NEXT: vpbroadcastw {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
-; XOPAVX2-NEXT: vpand %xmm2, %xmm0, %xmm0
-; XOPAVX2-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero
+; XOPAVX2-NEXT: vpbroadcastb %xmm1, %xmm1
; XOPAVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; XOPAVX2-NEXT: vpsubw %xmm1, %xmm2, %xmm1
-; XOPAVX2-NEXT: vpshlw %xmm1, %xmm0, %xmm0
+; XOPAVX2-NEXT: vpsubb %xmm1, %xmm2, %xmm1
+; XOPAVX2-NEXT: vpshlb %xmm1, %xmm0, %xmm0
; XOPAVX2-NEXT: retq
;
; AVX512DQ-LABEL: splatvar_shift_v8i8:
; AVX512DQ: # %bb.0:
-; AVX512DQ-NEXT: vpbroadcastw {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
-; AVX512DQ-NEXT: vpand %xmm2, %xmm0, %xmm0
-; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero
-; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
-; AVX512DQ-NEXT: vpsrlvd %ymm1, %ymm0, %ymm0
-; AVX512DQ-NEXT: vpmovdw %zmm0, %ymm0
-; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX512DQ-NEXT: vpbroadcastb %xmm1, %xmm1
+; AVX512DQ-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
+; AVX512DQ-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
+; AVX512DQ-NEXT: vpsrlvd %zmm1, %zmm0, %zmm0
+; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
; AVX512BW-LABEL: splatvar_shift_v8i8:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vpbroadcastw {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
-; AVX512BW-NEXT: vpand %xmm2, %xmm0, %xmm0
-; AVX512BW-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero
+; AVX512BW-NEXT: vpbroadcastb %xmm1, %xmm1
+; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
; AVX512BW-NEXT: vpsrlvw %zmm1, %zmm0, %zmm0
-; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
+; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
+; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512DQVL-LABEL: splatvar_shift_v8i8:
; AVX512DQVL: # %bb.0:
-; AVX512DQVL-NEXT: vpbroadcastw {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
-; AVX512DQVL-NEXT: vpand %xmm2, %xmm0, %xmm0
-; AVX512DQVL-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero
-; AVX512DQVL-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; AVX512DQVL-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
-; AVX512DQVL-NEXT: vpsrlvd %ymm1, %ymm0, %ymm0
-; AVX512DQVL-NEXT: vpmovdw %ymm0, %xmm0
+; AVX512DQVL-NEXT: vpbroadcastb %xmm1, %xmm1
+; AVX512DQVL-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
+; AVX512DQVL-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
+; AVX512DQVL-NEXT: vpsrlvd %zmm1, %zmm0, %zmm0
+; AVX512DQVL-NEXT: vpmovdb %zmm0, %xmm0
; AVX512DQVL-NEXT: vzeroupper
; AVX512DQVL-NEXT: retq
;
; AVX512BWVL-LABEL: splatvar_shift_v8i8:
; AVX512BWVL: # %bb.0:
-; AVX512BWVL-NEXT: vpbroadcastw {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
-; AVX512BWVL-NEXT: vpand %xmm2, %xmm0, %xmm0
-; AVX512BWVL-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero
-; AVX512BWVL-NEXT: vpsrlvw %xmm1, %xmm0, %xmm0
+; AVX512BWVL-NEXT: vpbroadcastb %xmm1, %xmm1
+; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
+; AVX512BWVL-NEXT: vpsrlvw %ymm1, %ymm0, %ymm0
+; AVX512BWVL-NEXT: vpmovwb %ymm0, %xmm0
+; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
;
; X32-SSE-LABEL: splatvar_shift_v8i8:
; X32-SSE: # %bb.0:
-; X32-SSE-NEXT: movdqa %xmm0, %xmm2
-; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm2
-; X32-SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[0,0,2,3,4,5,6,7]
-; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,0,0]
-; X32-SSE-NEXT: psllw $12, %xmm1
-; X32-SSE-NEXT: movdqa %xmm1, %xmm0
-; X32-SSE-NEXT: psraw $15, %xmm0
-; X32-SSE-NEXT: pandn %xmm2, %xmm0
-; X32-SSE-NEXT: paddw %xmm1, %xmm1
-; X32-SSE-NEXT: movdqa %xmm1, %xmm2
-; X32-SSE-NEXT: psraw $15, %xmm2
-; X32-SSE-NEXT: movdqa %xmm2, %xmm3
-; X32-SSE-NEXT: pandn %xmm0, %xmm3
-; X32-SSE-NEXT: psrlw $4, %xmm0
-; X32-SSE-NEXT: pand %xmm2, %xmm0
-; X32-SSE-NEXT: por %xmm3, %xmm0
-; X32-SSE-NEXT: paddw %xmm1, %xmm1
-; X32-SSE-NEXT: movdqa %xmm1, %xmm2
-; X32-SSE-NEXT: psraw $15, %xmm2
-; X32-SSE-NEXT: movdqa %xmm2, %xmm3
-; X32-SSE-NEXT: pandn %xmm0, %xmm3
-; X32-SSE-NEXT: psrlw $2, %xmm0
-; X32-SSE-NEXT: pand %xmm2, %xmm0
-; X32-SSE-NEXT: por %xmm3, %xmm0
-; X32-SSE-NEXT: paddw %xmm1, %xmm1
-; X32-SSE-NEXT: psraw $15, %xmm1
-; X32-SSE-NEXT: movdqa %xmm1, %xmm2
-; X32-SSE-NEXT: pandn %xmm0, %xmm2
-; X32-SSE-NEXT: psrlw $1, %xmm0
+; X32-SSE-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0]
+; X32-SSE-NEXT: psrldq {{.*#+}} xmm1 = xmm1[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; X32-SSE-NEXT: psrlw %xmm1, %xmm0
+; X32-SSE-NEXT: pcmpeqd %xmm2, %xmm2
+; X32-SSE-NEXT: psrlw %xmm1, %xmm2
+; X32-SSE-NEXT: psrlw $8, %xmm2
+; X32-SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; X32-SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm2[0,0,2,3,4,5,6,7]
+; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
; X32-SSE-NEXT: pand %xmm1, %xmm0
-; X32-SSE-NEXT: por %xmm2, %xmm0
; X32-SSE-NEXT: retl
%splat = shufflevector <8 x i8> %b, <8 x i8> undef, <8 x i32> zeroinitializer
  %shift = lshr <8 x i8> %a, %splat
  ret <8 x i8> %shift
}

define <4 x i8> @splatvar_shift_v4i8(<4 x i8> %a, <4 x i8> %b) nounwind {
; SSE2-LABEL: splatvar_shift_v4i8:
; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
-; SSE2-NEXT: pand %xmm2, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[0,0,0,0]
-; SSE2-NEXT: pand %xmm2, %xmm3
-; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm3[2,3,3,3,4,5,6,7]
-; SSE2-NEXT: movdqa %xmm0, %xmm2
-; SSE2-NEXT: psrld %xmm1, %xmm2
-; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm3[0,1,1,1,4,5,6,7]
-; SSE2-NEXT: movdqa %xmm0, %xmm1
-; SSE2-NEXT: psrld %xmm4, %xmm1
-; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[2,3,0,1]
-; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm2[2,3,3,3,4,5,6,7]
-; SSE2-NEXT: movdqa %xmm0, %xmm4
-; SSE2-NEXT: psrld %xmm3, %xmm4
-; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,1,1,4,5,6,7]
-; SSE2-NEXT: psrld %xmm2, %xmm0
-; SSE2-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm4[1]
-; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm0[0,3]
-; SSE2-NEXT: movaps %xmm1, %xmm0
+; SSE2-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0]
+; SSE2-NEXT: psrldq {{.*#+}} xmm1 = xmm1[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; SSE2-NEXT: psrlw %xmm1, %xmm0
+; SSE2-NEXT: pcmpeqd %xmm2, %xmm2
+; SSE2-NEXT: psrlw %xmm1, %xmm2
+; SSE2-NEXT: psrlw $8, %xmm2
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm2[0,0,2,3,4,5,6,7]
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
+; SSE2-NEXT: pand %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: splatvar_shift_v4i8:
; SSE41: # %bb.0:
-; SSE41-NEXT: pand {{.*}}(%rip), %xmm0
-; SSE41-NEXT: pshufb {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[0],zero,zero,zero,xmm1[0],zero,zero,zero,xmm1[0],zero,zero,zero
-; SSE41-NEXT: pshuflw {{.*#+}} xmm2 = xmm1[2,3,3,3,4,5,6,7]
-; SSE41-NEXT: movdqa %xmm0, %xmm3
-; SSE41-NEXT: psrld %xmm2, %xmm3
-; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1]
-; SSE41-NEXT: pshuflw {{.*#+}} xmm4 = xmm2[2,3,3,3,4,5,6,7]
-; SSE41-NEXT: movdqa %xmm0, %xmm5
-; SSE41-NEXT: psrld %xmm4, %xmm5
-; SSE41-NEXT: pblendw {{.*#+}} xmm5 = xmm3[0,1,2,3],xmm5[4,5,6,7]
-; SSE41-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,1,1,4,5,6,7]
-; SSE41-NEXT: movdqa %xmm0, %xmm3
-; SSE41-NEXT: psrld %xmm1, %xmm3
-; SSE41-NEXT: pshuflw {{.*#+}} xmm1 = xmm2[0,1,1,1,4,5,6,7]
-; SSE41-NEXT: psrld %xmm1, %xmm0
-; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm3[0,1,2,3],xmm0[4,5,6,7]
-; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm5[2,3],xmm0[4,5],xmm5[6,7]
+; SSE41-NEXT: pmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
+; SSE41-NEXT: psrlw %xmm1, %xmm0
+; SSE41-NEXT: pcmpeqd %xmm2, %xmm2
+; SSE41-NEXT: psrlw %xmm1, %xmm2
+; SSE41-NEXT: pshufb {{.*#+}} xmm2 = xmm2[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; SSE41-NEXT: pand %xmm2, %xmm0
; SSE41-NEXT: retq
;
; AVX1-LABEL: splatvar_shift_v4i8:
; AVX1: # %bb.0:
-; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [3.57331108E-43,3.57331108E-43,3.57331108E-43,3.57331108E-43]
-; AVX1-NEXT: vandps %xmm2, %xmm0, %xmm0
-; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[0],zero,zero,zero,xmm1[0],zero,zero,zero,xmm1[0],zero,zero,zero
-; AVX1-NEXT: vpsrldq {{.*#+}} xmm2 = xmm1[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX1-NEXT: vpsrld %xmm2, %xmm0, %xmm2
-; AVX1-NEXT: vpsrlq $32, %xmm1, %xmm3
-; AVX1-NEXT: vpsrld %xmm3, %xmm0, %xmm3
-; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7]
-; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm1[2],xmm3[2],xmm1[3],xmm3[3]
-; AVX1-NEXT: vpsrld %xmm3, %xmm0, %xmm3
-; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
-; AVX1-NEXT: vpsrld %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm3[4,5,6,7]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
+; AVX1-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
+; AVX1-NEXT: vpsrlw %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
+; AVX1-NEXT: vpsrlw %xmm1, %xmm2, %xmm1
+; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: splatvar_shift_v4i8:
; AVX2: # %bb.0:
-; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [255,255,255,255]
-; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0
-; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[0],zero,zero,zero,xmm1[0],zero,zero,zero,xmm1[0],zero,zero,zero
-; AVX2-NEXT: vpsrlvd %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
+; AVX2-NEXT: vpsrlw %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
+; AVX2-NEXT: vpsrlw %xmm1, %xmm2, %xmm1
+; AVX2-NEXT: vpsrlw $8, %xmm1, %xmm1
+; AVX2-NEXT: vpbroadcastb %xmm1, %xmm1
+; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX2-NEXT: retq
;
; XOPAVX1-LABEL: splatvar_shift_v4i8:
; XOPAVX1: # %bb.0:
-; XOPAVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [3.57331108E-43,3.57331108E-43,3.57331108E-43,3.57331108E-43]
-; XOPAVX1-NEXT: vandps %xmm2, %xmm0, %xmm0
-; XOPAVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[0],zero,zero,zero,xmm1[0],zero,zero,zero,xmm1[0],zero,zero,zero
-; XOPAVX1-NEXT: vxorps %xmm2, %xmm2, %xmm2
-; XOPAVX1-NEXT: vpsubd %xmm1, %xmm2, %xmm1
-; XOPAVX1-NEXT: vpshld %xmm1, %xmm0, %xmm0
+; XOPAVX1-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; XOPAVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,2,3,4,5,6,7]
+; XOPAVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; XOPAVX1-NEXT: vpsubb %xmm1, %xmm2, %xmm1
+; XOPAVX1-NEXT: vpshlb %xmm1, %xmm0, %xmm0
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: splatvar_shift_v4i8:
; XOPAVX2: # %bb.0:
-; XOPAVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [255,255,255,255]
-; XOPAVX2-NEXT: vpand %xmm2, %xmm0, %xmm0
-; XOPAVX2-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[0],zero,zero,zero,xmm1[0],zero,zero,zero,xmm1[0],zero,zero,zero
-; XOPAVX2-NEXT: vpsrlvd %xmm1, %xmm0, %xmm0
+; XOPAVX2-NEXT: vpbroadcastb %xmm1, %xmm1
+; XOPAVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; XOPAVX2-NEXT: vpsubb %xmm1, %xmm2, %xmm1
+; XOPAVX2-NEXT: vpshlb %xmm1, %xmm0, %xmm0
; XOPAVX2-NEXT: retq
;
-; AVX512-LABEL: splatvar_shift_v4i8:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vpbroadcastd {{.*#+}} xmm2 = [255,255,255,255]
-; AVX512-NEXT: vpand %xmm2, %xmm0, %xmm0
-; AVX512-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[0],zero,zero,zero,xmm1[0],zero,zero,zero,xmm1[0],zero,zero,zero
-; AVX512-NEXT: vpsrlvd %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: retq
+; AVX512DQ-LABEL: splatvar_shift_v4i8:
+; AVX512DQ: # %bb.0:
+; AVX512DQ-NEXT: vpbroadcastb %xmm1, %xmm1
+; AVX512DQ-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
+; AVX512DQ-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
+; AVX512DQ-NEXT: vpsrlvd %zmm1, %zmm0, %zmm0
+; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512DQ-NEXT: vzeroupper
+; AVX512DQ-NEXT: retq
;
-; AVX512VL-LABEL: splatvar_shift_v4i8:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpbroadcastd {{.*#+}} xmm2 = [255,255,255,255]
-; AVX512VL-NEXT: vpand %xmm2, %xmm0, %xmm0
-; AVX512VL-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[0],zero,zero,zero,xmm1[0],zero,zero,zero,xmm1[0],zero,zero,zero
-; AVX512VL-NEXT: vpsrlvd %xmm1, %xmm0, %xmm0
-; AVX512VL-NEXT: retq
+; AVX512BW-LABEL: splatvar_shift_v4i8:
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: vpbroadcastb %xmm1, %xmm1
+; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
+; AVX512BW-NEXT: vpsrlvw %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
+; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX512BW-NEXT: vzeroupper
+; AVX512BW-NEXT: retq
+;
+; AVX512DQVL-LABEL: splatvar_shift_v4i8:
+; AVX512DQVL: # %bb.0:
+; AVX512DQVL-NEXT: vpbroadcastb %xmm1, %xmm1
+; AVX512DQVL-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
+; AVX512DQVL-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
+; AVX512DQVL-NEXT: vpsrlvd %zmm1, %zmm0, %zmm0
+; AVX512DQVL-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512DQVL-NEXT: vzeroupper
+; AVX512DQVL-NEXT: retq
+;
+; AVX512BWVL-LABEL: splatvar_shift_v4i8:
+; AVX512BWVL: # %bb.0:
+; AVX512BWVL-NEXT: vpbroadcastb %xmm1, %xmm1
+; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
+; AVX512BWVL-NEXT: vpsrlvw %ymm1, %ymm0, %ymm0
+; AVX512BWVL-NEXT: vpmovwb %ymm0, %xmm0
+; AVX512BWVL-NEXT: vzeroupper
+; AVX512BWVL-NEXT: retq
;
; X32-SSE-LABEL: splatvar_shift_v4i8:
; X32-SSE: # %bb.0:
-; X32-SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
-; X32-SSE-NEXT: pand %xmm2, %xmm0
-; X32-SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[0,0,0,0]
-; X32-SSE-NEXT: pand %xmm2, %xmm3
-; X32-SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm3[2,3,3,3,4,5,6,7]
-; X32-SSE-NEXT: movdqa %xmm0, %xmm2
-; X32-SSE-NEXT: psrld %xmm1, %xmm2
-; X32-SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm3[0,1,1,1,4,5,6,7]
-; X32-SSE-NEXT: movdqa %xmm0, %xmm1
-; X32-SSE-NEXT: psrld %xmm4, %xmm1
-; X32-SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
-; X32-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[2,3,0,1]
-; X32-SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm2[2,3,3,3,4,5,6,7]
-; X32-SSE-NEXT: movdqa %xmm0, %xmm4
-; X32-SSE-NEXT: psrld %xmm3, %xmm4
-; X32-SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,1,1,4,5,6,7]
-; X32-SSE-NEXT: psrld %xmm2, %xmm0
-; X32-SSE-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm4[1]
-; X32-SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm0[0,3]
-; X32-SSE-NEXT: movaps %xmm1, %xmm0
+; X32-SSE-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0]
+; X32-SSE-NEXT: psrldq {{.*#+}} xmm1 = xmm1[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; X32-SSE-NEXT: psrlw %xmm1, %xmm0
+; X32-SSE-NEXT: pcmpeqd %xmm2, %xmm2
+; X32-SSE-NEXT: psrlw %xmm1, %xmm2
+; X32-SSE-NEXT: psrlw $8, %xmm2
+; X32-SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; X32-SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm2[0,0,2,3,4,5,6,7]
+; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
+; X32-SSE-NEXT: pand %xmm1, %xmm0
; X32-SSE-NEXT: retl
%splat = shufflevector <4 x i8> %b, <4 x i8> undef, <4 x i32> zeroinitializer
%shift = lshr <4 x i8> %a, %splat
define <2 x i8> @splatvar_shift_v2i8(<2 x i8> %a, <2 x i8> %b) nounwind {
; SSE2-LABEL: splatvar_shift_v2i8:
; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
-; SSE2-NEXT: pand %xmm2, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
-; SSE2-NEXT: pand %xmm2, %xmm1
-; SSE2-NEXT: movdqa %xmm0, %xmm2
-; SSE2-NEXT: psrlq %xmm1, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
-; SSE2-NEXT: psrlq %xmm1, %xmm0
-; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1]
+; SSE2-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0]
+; SSE2-NEXT: psrldq {{.*#+}} xmm1 = xmm1[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; SSE2-NEXT: psrlw %xmm1, %xmm0
+; SSE2-NEXT: pcmpeqd %xmm2, %xmm2
+; SSE2-NEXT: psrlw %xmm1, %xmm2
+; SSE2-NEXT: psrlw $8, %xmm2
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm2[0,0,2,3,4,5,6,7]
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
+; SSE2-NEXT: pand %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: splatvar_shift_v2i8:
; SSE41: # %bb.0:
-; SSE41-NEXT: pand {{.*}}(%rip), %xmm0
-; SSE41-NEXT: pshufb {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[0],zero,zero,zero,zero,zero,zero,zero
-; SSE41-NEXT: movdqa %xmm0, %xmm2
-; SSE41-NEXT: psrlq %xmm1, %xmm2
-; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
-; SSE41-NEXT: psrlq %xmm1, %xmm0
-; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm2[0,1,2,3],xmm0[4,5,6,7]
+; SSE41-NEXT: pmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
+; SSE41-NEXT: psrlw %xmm1, %xmm0
+; SSE41-NEXT: pcmpeqd %xmm2, %xmm2
+; SSE41-NEXT: psrlw %xmm1, %xmm2
+; SSE41-NEXT: pshufb {{.*#+}} xmm2 = xmm2[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; SSE41-NEXT: pand %xmm2, %xmm0
; SSE41-NEXT: retq
;
; AVX1-LABEL: splatvar_shift_v2i8:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovddup {{.*#+}} xmm2 = [1.2598673968951787E-321,1.2598673968951787E-321]
-; AVX1-NEXT: # xmm2 = mem[0,0]
-; AVX1-NEXT: vandps %xmm2, %xmm0, %xmm0
-; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[0],zero,zero,zero,zero,zero,zero,zero
-; AVX1-NEXT: vpsrlq %xmm1, %xmm0, %xmm2
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
-; AVX1-NEXT: vpsrlq %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1,2,3],xmm0[4,5,6,7]
+; AVX1-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
+; AVX1-NEXT: vpsrlw %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
+; AVX1-NEXT: vpsrlw %xmm1, %xmm2, %xmm1
+; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: splatvar_shift_v2i8:
; AVX2: # %bb.0:
-; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm2 = [255,255]
-; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0
-; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[0],zero,zero,zero,zero,zero,zero,zero
-; AVX2-NEXT: vpsrlvq %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
+; AVX2-NEXT: vpsrlw %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
+; AVX2-NEXT: vpsrlw %xmm1, %xmm2, %xmm1
+; AVX2-NEXT: vpsrlw $8, %xmm1, %xmm1
+; AVX2-NEXT: vpbroadcastb %xmm1, %xmm1
+; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX2-NEXT: retq
;
-; XOPAVX1-LABEL: splatvar_shift_v2i8:
-; XOPAVX1: # %bb.0:
-; XOPAVX1-NEXT: vmovddup {{.*#+}} xmm2 = [1.2598673968951787E-321,1.2598673968951787E-321]
-; XOPAVX1-NEXT: # xmm2 = mem[0,0]
-; XOPAVX1-NEXT: vandps %xmm2, %xmm0, %xmm0
-; XOPAVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[0],zero,zero,zero,zero,zero,zero,zero
-; XOPAVX1-NEXT: vxorps %xmm2, %xmm2, %xmm2
-; XOPAVX1-NEXT: vpsubq %xmm1, %xmm2, %xmm1
-; XOPAVX1-NEXT: vpshlq %xmm1, %xmm0, %xmm0
-; XOPAVX1-NEXT: retq
+; XOP-LABEL: splatvar_shift_v2i8:
+; XOP: # %bb.0:
+; XOP-NEXT: insertq {{.*#+}} xmm1 = xmm1[0,0,2,3,4,5,6,7,u,u,u,u,u,u,u,u]
+; XOP-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; XOP-NEXT: vpsubb %xmm1, %xmm2, %xmm1
+; XOP-NEXT: vpshlb %xmm1, %xmm0, %xmm0
+; XOP-NEXT: retq
;
-; XOPAVX2-LABEL: splatvar_shift_v2i8:
-; XOPAVX2: # %bb.0:
-; XOPAVX2-NEXT: vpbroadcastq {{.*#+}} xmm2 = [255,255]
-; XOPAVX2-NEXT: vpand %xmm2, %xmm0, %xmm0
-; XOPAVX2-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[0],zero,zero,zero,zero,zero,zero,zero
-; XOPAVX2-NEXT: vpsrlvq %xmm1, %xmm0, %xmm0
-; XOPAVX2-NEXT: retq
+; AVX512DQ-LABEL: splatvar_shift_v2i8:
+; AVX512DQ: # %bb.0:
+; AVX512DQ-NEXT: vpbroadcastb %xmm1, %xmm1
+; AVX512DQ-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
+; AVX512DQ-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
+; AVX512DQ-NEXT: vpsrlvd %zmm1, %zmm0, %zmm0
+; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512DQ-NEXT: vzeroupper
+; AVX512DQ-NEXT: retq
;
-; AVX512-LABEL: splatvar_shift_v2i8:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vpbroadcastq {{.*#+}} xmm2 = [255,255]
-; AVX512-NEXT: vpand %xmm2, %xmm0, %xmm0
-; AVX512-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[0],zero,zero,zero,zero,zero,zero,zero
-; AVX512-NEXT: vpsrlvq %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: retq
+; AVX512BW-LABEL: splatvar_shift_v2i8:
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: vpbroadcastb %xmm1, %xmm1
+; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
+; AVX512BW-NEXT: vpsrlvw %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
+; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX512BW-NEXT: vzeroupper
+; AVX512BW-NEXT: retq
;
-; AVX512VL-LABEL: splatvar_shift_v2i8:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpbroadcastq {{.*#+}} xmm2 = [255,255]
-; AVX512VL-NEXT: vpand %xmm2, %xmm0, %xmm0
-; AVX512VL-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[0],zero,zero,zero,zero,zero,zero,zero
-; AVX512VL-NEXT: vpsrlvq %xmm1, %xmm0, %xmm0
-; AVX512VL-NEXT: retq
+; AVX512DQVL-LABEL: splatvar_shift_v2i8:
+; AVX512DQVL: # %bb.0:
+; AVX512DQVL-NEXT: vpbroadcastb %xmm1, %xmm1
+; AVX512DQVL-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
+; AVX512DQVL-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
+; AVX512DQVL-NEXT: vpsrlvd %zmm1, %zmm0, %zmm0
+; AVX512DQVL-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512DQVL-NEXT: vzeroupper
+; AVX512DQVL-NEXT: retq
+;
+; AVX512BWVL-LABEL: splatvar_shift_v2i8:
+; AVX512BWVL: # %bb.0:
+; AVX512BWVL-NEXT: vpbroadcastb %xmm1, %xmm1
+; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
+; AVX512BWVL-NEXT: vpsrlvw %ymm1, %ymm0, %ymm0
+; AVX512BWVL-NEXT: vpmovwb %ymm0, %xmm0
+; AVX512BWVL-NEXT: vzeroupper
+; AVX512BWVL-NEXT: retq
;
; X32-SSE-LABEL: splatvar_shift_v2i8:
; X32-SSE: # %bb.0:
-; X32-SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
-; X32-SSE-NEXT: pand %xmm2, %xmm0
-; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
-; X32-SSE-NEXT: pand %xmm2, %xmm1
-; X32-SSE-NEXT: movdqa %xmm0, %xmm2
-; X32-SSE-NEXT: psrlq %xmm1, %xmm2
-; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
-; X32-SSE-NEXT: psrlq %xmm1, %xmm0
-; X32-SSE-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1]
+; X32-SSE-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0]
+; X32-SSE-NEXT: psrldq {{.*#+}} xmm1 = xmm1[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; X32-SSE-NEXT: psrlw %xmm1, %xmm0
+; X32-SSE-NEXT: pcmpeqd %xmm2, %xmm2
+; X32-SSE-NEXT: psrlw %xmm1, %xmm2
+; X32-SSE-NEXT: psrlw $8, %xmm2
+; X32-SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; X32-SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm2[0,0,2,3,4,5,6,7]
+; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
+; X32-SSE-NEXT: pand %xmm1, %xmm0
; X32-SSE-NEXT: retl
%splat = shufflevector <2 x i8> %b, <2 x i8> undef, <2 x i32> zeroinitializer
%shift = lshr <2 x i8> %a, %splat
define <2 x i32> @constant_shift_v2i32(<2 x i32> %a) nounwind {
; SSE2-LABEL: constant_shift_v2i32:
; SSE2: # %bb.0:
-; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
; SSE2-NEXT: movdqa %xmm0, %xmm1
-; SSE2-NEXT: psrlq $4, %xmm1
-; SSE2-NEXT: psrlq $5, %xmm0
-; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; SSE2-NEXT: psrld $4, %xmm1
+; SSE2-NEXT: psrld $5, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
+; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; SSE2-NEXT: movdqa %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: constant_shift_v2i32:
; SSE41: # %bb.0:
-; SSE41-NEXT: pxor %xmm1, %xmm1
-; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
-; SSE41-NEXT: movdqa %xmm1, %xmm0
-; SSE41-NEXT: psrlq $5, %xmm0
-; SSE41-NEXT: psrlq $4, %xmm1
-; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm0[4,5,6,7]
-; SSE41-NEXT: movdqa %xmm1, %xmm0
+; SSE41-NEXT: movdqa %xmm0, %xmm1
+; SSE41-NEXT: psrld $5, %xmm1
+; SSE41-NEXT: psrld $4, %xmm0
+; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7]
; SSE41-NEXT: retq
;
; AVX1-LABEL: constant_shift_v2i32:
; AVX1: # %bb.0:
-; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
-; AVX1-NEXT: vpsrlq $5, %xmm0, %xmm1
-; AVX1-NEXT: vpsrlq $4, %xmm0, %xmm0
-; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
+; AVX1-NEXT: vpsrld $5, %xmm0, %xmm1
+; AVX1-NEXT: vpsrld $4, %xmm0, %xmm0
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7]
; AVX1-NEXT: retq
;
; AVX2-LABEL: constant_shift_v2i32:
; AVX2: # %bb.0:
-; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
-; AVX2-NEXT: vpsrlvq {{.*}}(%rip), %xmm0, %xmm0
+; AVX2-NEXT: vpsrlvd {{.*}}(%rip), %xmm0, %xmm0
; AVX2-NEXT: retq
;
; XOPAVX1-LABEL: constant_shift_v2i32:
; XOPAVX1: # %bb.0:
-; XOPAVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; XOPAVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
-; XOPAVX1-NEXT: vpshlq {{.*}}(%rip), %xmm0, %xmm0
+; XOPAVX1-NEXT: vpshld {{.*}}(%rip), %xmm0, %xmm0
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: constant_shift_v2i32:
; XOPAVX2: # %bb.0:
-; XOPAVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; XOPAVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
-; XOPAVX2-NEXT: vpsrlvq {{.*}}(%rip), %xmm0, %xmm0
+; XOPAVX2-NEXT: vpsrlvd {{.*}}(%rip), %xmm0, %xmm0
; XOPAVX2-NEXT: retq
;
; AVX512-LABEL: constant_shift_v2i32:
; AVX512: # %bb.0:
-; AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX512-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
-; AVX512-NEXT: vpsrlvq {{.*}}(%rip), %xmm0, %xmm0
+; AVX512-NEXT: vpsrlvd {{.*}}(%rip), %xmm0, %xmm0
; AVX512-NEXT: retq
;
; AVX512VL-LABEL: constant_shift_v2i32:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
-; AVX512VL-NEXT: vpsrlvq {{.*}}(%rip), %xmm0, %xmm0
+; AVX512VL-NEXT: vpsrlvd {{.*}}(%rip), %xmm0, %xmm0
; AVX512VL-NEXT: retq
;
; X32-SSE-LABEL: constant_shift_v2i32:
; X32-SSE: # %bb.0:
-; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm0
; X32-SSE-NEXT: movdqa %xmm0, %xmm1
-; X32-SSE-NEXT: psrlq $4, %xmm1
-; X32-SSE-NEXT: psrlq $5, %xmm0
-; X32-SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; X32-SSE-NEXT: psrld $4, %xmm1
+; X32-SSE-NEXT: psrld $5, %xmm0
+; X32-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
+; X32-SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; X32-SSE-NEXT: movdqa %xmm1, %xmm0
; X32-SSE-NEXT: retl
%shift = lshr <2 x i32> %a, <i32 4, i32 5>
ret <2 x i32> %shift
define <4 x i16> @constant_shift_v4i16(<4 x i16> %a) nounwind {
; SSE2-LABEL: constant_shift_v4i16:
; SSE2: # %bb.0:
-; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
-; SSE2-NEXT: movdqa %xmm0, %xmm1
-; SSE2-NEXT: psrld $3, %xmm1
-; SSE2-NEXT: movdqa %xmm0, %xmm2
-; SSE2-NEXT: psrld $2, %xmm2
-; SSE2-NEXT: punpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm1[1]
-; SSE2-NEXT: movdqa %xmm0, %xmm1
-; SSE2-NEXT: psrld $1, %xmm1
-; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,3],xmm2[0,3]
+; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [0,65535,65535,65535,65535,65535,65535,65535]
+; SSE2-NEXT: movdqa %xmm1, %xmm2
+; SSE2-NEXT: pandn %xmm0, %xmm2
+; SSE2-NEXT: pmulhuw {{.*}}(%rip), %xmm0
+; SSE2-NEXT: pand %xmm1, %xmm0
+; SSE2-NEXT: por %xmm2, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: constant_shift_v4i16:
; SSE41: # %bb.0:
-; SSE41-NEXT: pxor %xmm1, %xmm1
-; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7]
-; SSE41-NEXT: movdqa %xmm1, %xmm2
-; SSE41-NEXT: movdqa %xmm1, %xmm0
-; SSE41-NEXT: psrld $2, %xmm0
-; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
-; SSE41-NEXT: psrld $3, %xmm1
-; SSE41-NEXT: psrld $1, %xmm2
-; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm1[4,5,6,7]
-; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
+; SSE41-NEXT: movdqa {{.*#+}} xmm1 = <u,32768,16384,8192,u,u,u,u>
+; SSE41-NEXT: pmulhuw %xmm0, %xmm1
+; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4,5,6,7]
; SSE41-NEXT: retq
;
-; AVX1-LABEL: constant_shift_v4i16:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7]
-; AVX1-NEXT: vpsrld $3, %xmm0, %xmm1
-; AVX1-NEXT: vpsrld $1, %xmm0, %xmm2
-; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm1[4,5,6,7]
-; AVX1-NEXT: vpsrld $2, %xmm0, %xmm2
-; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4,5,6,7]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: constant_shift_v4i16:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7]
-; AVX2-NEXT: vpsrlvd {{.*}}(%rip), %xmm0, %xmm0
-; AVX2-NEXT: retq
+; AVX-LABEL: constant_shift_v4i16:
+; AVX: # %bb.0:
+; AVX-NEXT: vpmulhuw {{.*}}(%rip), %xmm0, %xmm1
+; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4,5,6,7]
+; AVX-NEXT: retq
;
-; XOPAVX1-LABEL: constant_shift_v4i16:
-; XOPAVX1: # %bb.0:
-; XOPAVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; XOPAVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7]
-; XOPAVX1-NEXT: vpshld {{.*}}(%rip), %xmm0, %xmm0
-; XOPAVX1-NEXT: retq
+; XOP-LABEL: constant_shift_v4i16:
+; XOP: # %bb.0:
+; XOP-NEXT: vpshlw {{.*}}(%rip), %xmm0, %xmm0
+; XOP-NEXT: retq
;
-; XOPAVX2-LABEL: constant_shift_v4i16:
-; XOPAVX2: # %bb.0:
-; XOPAVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; XOPAVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7]
-; XOPAVX2-NEXT: vpsrlvd {{.*}}(%rip), %xmm0, %xmm0
-; XOPAVX2-NEXT: retq
+; AVX512DQ-LABEL: constant_shift_v4i16:
+; AVX512DQ: # %bb.0:
+; AVX512DQ-NEXT: vpmulhuw {{.*}}(%rip), %xmm0, %xmm1
+; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4,5,6,7]
+; AVX512DQ-NEXT: retq
;
-; AVX512-LABEL: constant_shift_v4i16:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX512-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7]
-; AVX512-NEXT: vpsrlvd {{.*}}(%rip), %xmm0, %xmm0
-; AVX512-NEXT: retq
+; AVX512BW-LABEL: constant_shift_v4i16:
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
+; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm1 = <0,1,2,3,u,u,u,u>
+; AVX512BW-NEXT: vpsrlvw %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
+; AVX512BW-NEXT: vzeroupper
+; AVX512BW-NEXT: retq
;
-; AVX512VL-LABEL: constant_shift_v4i16:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX512VL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7]
-; AVX512VL-NEXT: vpsrlvd {{.*}}(%rip), %xmm0, %xmm0
-; AVX512VL-NEXT: retq
+; AVX512DQVL-LABEL: constant_shift_v4i16:
+; AVX512DQVL: # %bb.0:
+; AVX512DQVL-NEXT: vpmulhuw {{.*}}(%rip), %xmm0, %xmm1
+; AVX512DQVL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4,5,6,7]
+; AVX512DQVL-NEXT: retq
+;
+; AVX512BWVL-LABEL: constant_shift_v4i16:
+; AVX512BWVL: # %bb.0:
+; AVX512BWVL-NEXT: vpsrlvw {{.*}}(%rip), %xmm0, %xmm0
+; AVX512BWVL-NEXT: retq
;
; X32-SSE-LABEL: constant_shift_v4i16:
; X32-SSE: # %bb.0:
-; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm0
-; X32-SSE-NEXT: movdqa %xmm0, %xmm1
-; X32-SSE-NEXT: psrld $3, %xmm1
-; X32-SSE-NEXT: movdqa %xmm0, %xmm2
-; X32-SSE-NEXT: psrld $2, %xmm2
-; X32-SSE-NEXT: punpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm1[1]
-; X32-SSE-NEXT: movdqa %xmm0, %xmm1
-; X32-SSE-NEXT: psrld $1, %xmm1
-; X32-SSE-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; X32-SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,3],xmm2[0,3]
+; X32-SSE-NEXT: movdqa {{.*#+}} xmm1 = [0,65535,65535,65535,65535,65535,65535,65535]
+; X32-SSE-NEXT: movdqa %xmm1, %xmm2
+; X32-SSE-NEXT: pandn %xmm0, %xmm2
+; X32-SSE-NEXT: pmulhuw {{\.LCPI.*}}, %xmm0
+; X32-SSE-NEXT: pand %xmm1, %xmm0
+; X32-SSE-NEXT: por %xmm2, %xmm0
; X32-SSE-NEXT: retl
%shift = lshr <4 x i16> %a, <i16 0, i16 1, i16 2, i16 3>
ret <4 x i16> %shift
define <2 x i16> @constant_shift_v2i16(<2 x i16> %a) nounwind {
; SSE2-LABEL: constant_shift_v2i16:
; SSE2: # %bb.0:
-; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
; SSE2-NEXT: movdqa %xmm0, %xmm1
-; SSE2-NEXT: psrlq $2, %xmm1
-; SSE2-NEXT: psrlq $3, %xmm0
-; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; SSE2-NEXT: psrlw $3, %xmm1
+; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [65535,0,65535,65535,65535,65535,65535,65535]
+; SSE2-NEXT: psrlw $2, %xmm0
+; SSE2-NEXT: pand %xmm2, %xmm0
+; SSE2-NEXT: pandn %xmm1, %xmm2
+; SSE2-NEXT: por %xmm2, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: constant_shift_v2i16:
; SSE41: # %bb.0:
-; SSE41-NEXT: pxor %xmm1, %xmm1
-; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3],xmm0[4],xmm1[5,6,7]
-; SSE41-NEXT: movdqa %xmm1, %xmm0
-; SSE41-NEXT: psrlq $3, %xmm0
-; SSE41-NEXT: psrlq $2, %xmm1
-; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm0[4,5,6,7]
-; SSE41-NEXT: movdqa %xmm1, %xmm0
+; SSE41-NEXT: movdqa %xmm0, %xmm1
+; SSE41-NEXT: psrlw $3, %xmm1
+; SSE41-NEXT: psrlw $2, %xmm0
+; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3,4,5,6,7]
; SSE41-NEXT: retq
;
-; AVX1-LABEL: constant_shift_v2i16:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3],xmm0[4],xmm1[5,6,7]
-; AVX1-NEXT: vpsrlq $3, %xmm0, %xmm1
-; AVX1-NEXT: vpsrlq $2, %xmm0, %xmm0
-; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
-; AVX1-NEXT: retq
+; AVX-LABEL: constant_shift_v2i16:
+; AVX: # %bb.0:
+; AVX-NEXT: vpsrlw $3, %xmm0, %xmm1
+; AVX-NEXT: vpsrlw $2, %xmm0, %xmm0
+; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3,4,5,6,7]
+; AVX-NEXT: retq
;
-; AVX2-LABEL: constant_shift_v2i16:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3],xmm0[4],xmm1[5,6,7]
-; AVX2-NEXT: vpsrlvq {{.*}}(%rip), %xmm0, %xmm0
-; AVX2-NEXT: retq
+; XOP-LABEL: constant_shift_v2i16:
+; XOP: # %bb.0:
+; XOP-NEXT: vpshlw {{.*}}(%rip), %xmm0, %xmm0
+; XOP-NEXT: retq
;
-; XOPAVX1-LABEL: constant_shift_v2i16:
-; XOPAVX1: # %bb.0:
-; XOPAVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; XOPAVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3],xmm0[4],xmm1[5,6,7]
-; XOPAVX1-NEXT: vpshlq {{.*}}(%rip), %xmm0, %xmm0
-; XOPAVX1-NEXT: retq
+; AVX512DQ-LABEL: constant_shift_v2i16:
+; AVX512DQ: # %bb.0:
+; AVX512DQ-NEXT: vpsrlw $3, %xmm0, %xmm1
+; AVX512DQ-NEXT: vpsrlw $2, %xmm0, %xmm0
+; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3,4,5,6,7]
+; AVX512DQ-NEXT: retq
;
-; XOPAVX2-LABEL: constant_shift_v2i16:
-; XOPAVX2: # %bb.0:
-; XOPAVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; XOPAVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3],xmm0[4],xmm1[5,6,7]
-; XOPAVX2-NEXT: vpsrlvq {{.*}}(%rip), %xmm0, %xmm0
-; XOPAVX2-NEXT: retq
+; AVX512BW-LABEL: constant_shift_v2i16:
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
+; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm1 = <2,3,u,u,u,u,u,u>
+; AVX512BW-NEXT: vpsrlvw %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
+; AVX512BW-NEXT: vzeroupper
+; AVX512BW-NEXT: retq
;
-; AVX512-LABEL: constant_shift_v2i16:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX512-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3],xmm0[4],xmm1[5,6,7]
-; AVX512-NEXT: vpsrlvq {{.*}}(%rip), %xmm0, %xmm0
-; AVX512-NEXT: retq
+; AVX512DQVL-LABEL: constant_shift_v2i16:
+; AVX512DQVL: # %bb.0:
+; AVX512DQVL-NEXT: vpsrlw $3, %xmm0, %xmm1
+; AVX512DQVL-NEXT: vpsrlw $2, %xmm0, %xmm0
+; AVX512DQVL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3,4,5,6,7]
+; AVX512DQVL-NEXT: retq
;
-; AVX512VL-LABEL: constant_shift_v2i16:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX512VL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3],xmm0[4],xmm1[5,6,7]
-; AVX512VL-NEXT: vpsrlvq {{.*}}(%rip), %xmm0, %xmm0
-; AVX512VL-NEXT: retq
+; AVX512BWVL-LABEL: constant_shift_v2i16:
+; AVX512BWVL: # %bb.0:
+; AVX512BWVL-NEXT: vpsrlvw {{.*}}(%rip), %xmm0, %xmm0
+; AVX512BWVL-NEXT: retq
;
; X32-SSE-LABEL: constant_shift_v2i16:
; X32-SSE: # %bb.0:
-; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm0
; X32-SSE-NEXT: movdqa %xmm0, %xmm1
-; X32-SSE-NEXT: psrlq $2, %xmm1
-; X32-SSE-NEXT: psrlq $3, %xmm0
-; X32-SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; X32-SSE-NEXT: psrlw $3, %xmm1
+; X32-SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,0,65535,65535,65535,65535,65535,65535]
+; X32-SSE-NEXT: psrlw $2, %xmm0
+; X32-SSE-NEXT: pand %xmm2, %xmm0
+; X32-SSE-NEXT: pandn %xmm1, %xmm2
+; X32-SSE-NEXT: por %xmm2, %xmm0
; X32-SSE-NEXT: retl
%shift = lshr <2 x i16> %a, <i16 2, i16 3>
ret <2 x i16> %shift
define <8 x i8> @constant_shift_v8i8(<8 x i8> %a) nounwind {
; SSE2-LABEL: constant_shift_v8i8:
; SSE2: # %bb.0:
-; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
-; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [0,65535,65535,65535,65535,65535,65535,65535]
-; SSE2-NEXT: movdqa %xmm1, %xmm2
-; SSE2-NEXT: pandn %xmm0, %xmm2
-; SSE2-NEXT: pmulhuw {{.*}}(%rip), %xmm0
-; SSE2-NEXT: pand %xmm1, %xmm0
-; SSE2-NEXT: por %xmm2, %xmm0
+; SSE2-NEXT: pxor %xmm1, %xmm1
+; SSE2-NEXT: movdqa %xmm0, %xmm2
+; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; SSE2-NEXT: pmullw {{.*}}(%rip), %xmm0
+; SSE2-NEXT: psrlw $8, %xmm0
+; SSE2-NEXT: packuswb %xmm2, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: constant_shift_v8i8:
; SSE41: # %bb.0:
-; SSE41-NEXT: pand {{.*}}(%rip), %xmm0
-; SSE41-NEXT: movdqa {{.*#+}} xmm1 = <u,32768,16384,8192,4096,2048,1024,512>
-; SSE41-NEXT: pmulhuw %xmm0, %xmm1
-; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4,5,6,7]
+; SSE41-NEXT: pxor %xmm2, %xmm2
+; SSE41-NEXT: pmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; SSE41-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15]
+; SSE41-NEXT: pmullw {{.*}}(%rip), %xmm1
+; SSE41-NEXT: psrlw $8, %xmm1
+; SSE41-NEXT: packuswb %xmm0, %xmm1
+; SSE41-NEXT: movdqa %xmm1, %xmm0
; SSE41-NEXT: retq
;
-; AVX-LABEL: constant_shift_v8i8:
-; AVX: # %bb.0:
-; AVX-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
-; AVX-NEXT: vpmulhuw {{.*}}(%rip), %xmm0, %xmm1
-; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4,5,6,7]
-; AVX-NEXT: retq
+; AVX1-LABEL: constant_shift_v8i8:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
+; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX1-NEXT: vpmullw {{.*}}(%rip), %xmm0, %xmm0
+; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm0
+; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: constant_shift_v8i8:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; AVX2-NEXT: vpmullw {{.*}}(%rip), %ymm0, %ymm0
+; AVX2-NEXT: vpsrlw $8, %ymm0, %ymm0
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
;
; XOP-LABEL: constant_shift_v8i8:
; XOP: # %bb.0:
-; XOP-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
-; XOP-NEXT: vpshlw {{.*}}(%rip), %xmm0, %xmm0
+; XOP-NEXT: vpshlb {{.*}}(%rip), %xmm0, %xmm0
; XOP-NEXT: retq
;
; AVX512DQ-LABEL: constant_shift_v8i8:
; AVX512DQ: # %bb.0:
-; AVX512DQ-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
-; AVX512DQ-NEXT: vpmulhuw {{.*}}(%rip), %xmm0, %xmm1
-; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4,5,6,7]
+; AVX512DQ-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
+; AVX512DQ-NEXT: vpsrlvd {{.*}}(%rip), %zmm0, %zmm0
+; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
; AVX512BW-LABEL: constant_shift_v8i8:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
-; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,2,3,4,5,6,7]
+; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,2,3,4,5,6,7,0,0,0,0,0,0,0,0]
+; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
; AVX512BW-NEXT: vpsrlvw %zmm1, %zmm0, %zmm0
-; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
+; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
+; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512DQVL-LABEL: constant_shift_v8i8:
; AVX512DQVL: # %bb.0:
-; AVX512DQVL-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
-; AVX512DQVL-NEXT: vpmulhuw {{.*}}(%rip), %xmm0, %xmm1
-; AVX512DQVL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4,5,6,7]
+; AVX512DQVL-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
+; AVX512DQVL-NEXT: vpsrlvd {{.*}}(%rip), %zmm0, %zmm0
+; AVX512DQVL-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512DQVL-NEXT: vzeroupper
; AVX512DQVL-NEXT: retq
;
; AVX512BWVL-LABEL: constant_shift_v8i8:
; AVX512BWVL: # %bb.0:
-; AVX512BWVL-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
-; AVX512BWVL-NEXT: vpsrlvw {{.*}}(%rip), %xmm0, %xmm0
+; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; AVX512BWVL-NEXT: vpsrlvw {{.*}}(%rip), %ymm0, %ymm0
+; AVX512BWVL-NEXT: vpmovwb %ymm0, %xmm0
+; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
;
; X32-SSE-LABEL: constant_shift_v8i8:
; X32-SSE: # %bb.0:
-; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm0
-; X32-SSE-NEXT: movdqa {{.*#+}} xmm1 = [0,65535,65535,65535,65535,65535,65535,65535]
-; X32-SSE-NEXT: movdqa %xmm1, %xmm2
-; X32-SSE-NEXT: pandn %xmm0, %xmm2
-; X32-SSE-NEXT: pmulhuw {{\.LCPI.*}}, %xmm0
-; X32-SSE-NEXT: pand %xmm1, %xmm0
-; X32-SSE-NEXT: por %xmm2, %xmm0
+; X32-SSE-NEXT: pxor %xmm1, %xmm1
+; X32-SSE-NEXT: movdqa %xmm0, %xmm2
+; X32-SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
+; X32-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; X32-SSE-NEXT: pmullw {{\.LCPI.*}}, %xmm0
+; X32-SSE-NEXT: psrlw $8, %xmm0
+; X32-SSE-NEXT: packuswb %xmm2, %xmm0
; X32-SSE-NEXT: retl
%shift = lshr <8 x i8> %a, <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7>
ret <8 x i8> %shift
define <4 x i8> @constant_shift_v4i8(<4 x i8> %a) nounwind {
; SSE2-LABEL: constant_shift_v4i8:
; SSE2: # %bb.0:
-; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
-; SSE2-NEXT: movdqa %xmm0, %xmm1
-; SSE2-NEXT: psrld $3, %xmm1
+; SSE2-NEXT: pxor %xmm1, %xmm1
; SSE2-NEXT: movdqa %xmm0, %xmm2
-; SSE2-NEXT: psrld $2, %xmm2
-; SSE2-NEXT: punpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm1[1]
-; SSE2-NEXT: movdqa %xmm0, %xmm1
-; SSE2-NEXT: psrld $1, %xmm1
-; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,3],xmm2[0,3]
+; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; SSE2-NEXT: pmullw {{.*}}(%rip), %xmm0
+; SSE2-NEXT: psrlw $8, %xmm0
+; SSE2-NEXT: packuswb %xmm2, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: constant_shift_v4i8:
; SSE41: # %bb.0:
-; SSE41-NEXT: pand {{.*}}(%rip), %xmm0
-; SSE41-NEXT: movdqa %xmm0, %xmm2
-; SSE41-NEXT: movdqa %xmm0, %xmm1
-; SSE41-NEXT: psrld $2, %xmm1
-; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1,2,3],xmm1[4,5,6,7]
-; SSE41-NEXT: psrld $3, %xmm0
-; SSE41-NEXT: psrld $1, %xmm2
-; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm0[4,5,6,7]
-; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
+; SSE41-NEXT: pxor %xmm2, %xmm2
+; SSE41-NEXT: pmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; SSE41-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15]
+; SSE41-NEXT: pmullw {{.*}}(%rip), %xmm1
+; SSE41-NEXT: psrlw $8, %xmm1
+; SSE41-NEXT: packuswb %xmm0, %xmm1
; SSE41-NEXT: movdqa %xmm1, %xmm0
; SSE41-NEXT: retq
;
; AVX1-LABEL: constant_shift_v4i8:
; AVX1: # %bb.0:
-; AVX1-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
-; AVX1-NEXT: vpsrld $3, %xmm0, %xmm1
-; AVX1-NEXT: vpsrld $1, %xmm0, %xmm2
-; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm1[4,5,6,7]
-; AVX1-NEXT: vpsrld $2, %xmm0, %xmm2
-; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4,5,6,7]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
+; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
+; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX1-NEXT: vpmullw {{.*}}(%rip), %xmm0, %xmm0
+; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm0
+; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: constant_shift_v4i8:
; AVX2: # %bb.0:
-; AVX2-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
-; AVX2-NEXT: vpsrlvd {{.*}}(%rip), %xmm0, %xmm0
+; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; AVX2-NEXT: vpmullw {{.*}}(%rip), %ymm0, %ymm0
+; AVX2-NEXT: vpsrlw $8, %ymm0, %ymm0
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
-; XOPAVX1-LABEL: constant_shift_v4i8:
-; XOPAVX1: # %bb.0:
-; XOPAVX1-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
-; XOPAVX1-NEXT: vpshld {{.*}}(%rip), %xmm0, %xmm0
-; XOPAVX1-NEXT: retq
+; XOP-LABEL: constant_shift_v4i8:
+; XOP: # %bb.0:
+; XOP-NEXT: vpshlb {{.*}}(%rip), %xmm0, %xmm0
+; XOP-NEXT: retq
;
-; XOPAVX2-LABEL: constant_shift_v4i8:
-; XOPAVX2: # %bb.0:
-; XOPAVX2-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
-; XOPAVX2-NEXT: vpsrlvd {{.*}}(%rip), %xmm0, %xmm0
-; XOPAVX2-NEXT: retq
+; AVX512DQ-LABEL: constant_shift_v4i8:
+; AVX512DQ: # %bb.0:
+; AVX512DQ-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
+; AVX512DQ-NEXT: vpsrlvd {{.*}}(%rip), %zmm0, %zmm0
+; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512DQ-NEXT: vzeroupper
+; AVX512DQ-NEXT: retq
;
-; AVX512-LABEL: constant_shift_v4i8:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
-; AVX512-NEXT: vpsrlvd {{.*}}(%rip), %xmm0, %xmm0
-; AVX512-NEXT: retq
+; AVX512BW-LABEL: constant_shift_v4i8:
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,2,3,0,0,0,0,0,0,0,0,0,0,0,0]
+; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; AVX512BW-NEXT: vpsrlvw %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
+; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX512BW-NEXT: vzeroupper
+; AVX512BW-NEXT: retq
;
-; AVX512VL-LABEL: constant_shift_v4i8:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
-; AVX512VL-NEXT: vpsrlvd {{.*}}(%rip), %xmm0, %xmm0
-; AVX512VL-NEXT: retq
+; AVX512DQVL-LABEL: constant_shift_v4i8:
+; AVX512DQVL: # %bb.0:
+; AVX512DQVL-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
+; AVX512DQVL-NEXT: vpsrlvd {{.*}}(%rip), %zmm0, %zmm0
+; AVX512DQVL-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512DQVL-NEXT: vzeroupper
+; AVX512DQVL-NEXT: retq
+;
+; AVX512BWVL-LABEL: constant_shift_v4i8:
+; AVX512BWVL: # %bb.0:
+; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; AVX512BWVL-NEXT: vpsrlvw {{.*}}(%rip), %ymm0, %ymm0
+; AVX512BWVL-NEXT: vpmovwb %ymm0, %xmm0
+; AVX512BWVL-NEXT: vzeroupper
+; AVX512BWVL-NEXT: retq
;
; X32-SSE-LABEL: constant_shift_v4i8:
; X32-SSE: # %bb.0:
-; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm0
-; X32-SSE-NEXT: movdqa %xmm0, %xmm1
-; X32-SSE-NEXT: psrld $3, %xmm1
+; X32-SSE-NEXT: pxor %xmm1, %xmm1
; X32-SSE-NEXT: movdqa %xmm0, %xmm2
-; X32-SSE-NEXT: psrld $2, %xmm2
-; X32-SSE-NEXT: punpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm1[1]
-; X32-SSE-NEXT: movdqa %xmm0, %xmm1
-; X32-SSE-NEXT: psrld $1, %xmm1
-; X32-SSE-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; X32-SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,3],xmm2[0,3]
+; X32-SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
+; X32-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; X32-SSE-NEXT: pmullw {{\.LCPI.*}}, %xmm0
+; X32-SSE-NEXT: psrlw $8, %xmm0
+; X32-SSE-NEXT: packuswb %xmm2, %xmm0
; X32-SSE-NEXT: retl
%shift = lshr <4 x i8> %a, <i8 0, i8 1, i8 2, i8 3>
ret <4 x i8> %shift
define <2 x i8> @constant_shift_v2i8(<2 x i8> %a) nounwind {
; SSE2-LABEL: constant_shift_v2i8:
; SSE2: # %bb.0:
-; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
-; SSE2-NEXT: movdqa %xmm0, %xmm1
-; SSE2-NEXT: psrlq $2, %xmm1
-; SSE2-NEXT: psrlq $3, %xmm0
-; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; SSE2-NEXT: pxor %xmm1, %xmm1
+; SSE2-NEXT: movdqa %xmm0, %xmm2
+; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; SSE2-NEXT: pmullw {{.*}}(%rip), %xmm0
+; SSE2-NEXT: psrlw $8, %xmm0
+; SSE2-NEXT: packuswb %xmm2, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: constant_shift_v2i8:
; SSE41: # %bb.0:
-; SSE41-NEXT: pand {{.*}}(%rip), %xmm0
-; SSE41-NEXT: movdqa %xmm0, %xmm1
-; SSE41-NEXT: psrlq $3, %xmm1
-; SSE41-NEXT: psrlq $2, %xmm0
-; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
+; SSE41-NEXT: pxor %xmm2, %xmm2
+; SSE41-NEXT: pmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; SSE41-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15]
+; SSE41-NEXT: pmullw {{.*}}(%rip), %xmm1
+; SSE41-NEXT: psrlw $8, %xmm1
+; SSE41-NEXT: packuswb %xmm0, %xmm1
+; SSE41-NEXT: movdqa %xmm1, %xmm0
; SSE41-NEXT: retq
;
; AVX1-LABEL: constant_shift_v2i8:
; AVX1: # %bb.0:
-; AVX1-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
-; AVX1-NEXT: vpsrlq $3, %xmm0, %xmm1
-; AVX1-NEXT: vpsrlq $2, %xmm0, %xmm0
-; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
+; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
+; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX1-NEXT: vpmullw {{.*}}(%rip), %xmm0, %xmm0
+; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm0
+; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: constant_shift_v2i8:
; AVX2: # %bb.0:
-; AVX2-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
-; AVX2-NEXT: vpsrlvq {{.*}}(%rip), %xmm0, %xmm0
+; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; AVX2-NEXT: vpmullw {{.*}}(%rip), %ymm0, %ymm0
+; AVX2-NEXT: vpsrlw $8, %ymm0, %ymm0
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
-; XOPAVX1-LABEL: constant_shift_v2i8:
-; XOPAVX1: # %bb.0:
-; XOPAVX1-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
-; XOPAVX1-NEXT: vpshlq {{.*}}(%rip), %xmm0, %xmm0
-; XOPAVX1-NEXT: retq
+; XOP-LABEL: constant_shift_v2i8:
+; XOP: # %bb.0:
+; XOP-NEXT: vpshlb {{.*}}(%rip), %xmm0, %xmm0
+; XOP-NEXT: retq
;
-; XOPAVX2-LABEL: constant_shift_v2i8:
-; XOPAVX2: # %bb.0:
-; XOPAVX2-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
-; XOPAVX2-NEXT: vpsrlvq {{.*}}(%rip), %xmm0, %xmm0
-; XOPAVX2-NEXT: retq
+; AVX512DQ-LABEL: constant_shift_v2i8:
+; AVX512DQ: # %bb.0:
+; AVX512DQ-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
+; AVX512DQ-NEXT: vpsrlvd {{.*}}(%rip), %zmm0, %zmm0
+; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512DQ-NEXT: vzeroupper
+; AVX512DQ-NEXT: retq
;
-; AVX512-LABEL: constant_shift_v2i8:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
-; AVX512-NEXT: vpsrlvq {{.*}}(%rip), %xmm0, %xmm0
-; AVX512-NEXT: retq
+; AVX512BW-LABEL: constant_shift_v2i8:
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm1 = [2,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
+; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; AVX512BW-NEXT: vpsrlvw %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
+; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX512BW-NEXT: vzeroupper
+; AVX512BW-NEXT: retq
;
-; AVX512VL-LABEL: constant_shift_v2i8:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
-; AVX512VL-NEXT: vpsrlvq {{.*}}(%rip), %xmm0, %xmm0
-; AVX512VL-NEXT: retq
+; AVX512DQVL-LABEL: constant_shift_v2i8:
+; AVX512DQVL: # %bb.0:
+; AVX512DQVL-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
+; AVX512DQVL-NEXT: vpsrlvd {{.*}}(%rip), %zmm0, %zmm0
+; AVX512DQVL-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512DQVL-NEXT: vzeroupper
+; AVX512DQVL-NEXT: retq
+;
+; AVX512BWVL-LABEL: constant_shift_v2i8:
+; AVX512BWVL: # %bb.0:
+; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; AVX512BWVL-NEXT: vpsrlvw {{.*}}(%rip), %ymm0, %ymm0
+; AVX512BWVL-NEXT: vpmovwb %ymm0, %xmm0
+; AVX512BWVL-NEXT: vzeroupper
+; AVX512BWVL-NEXT: retq
;
; X32-SSE-LABEL: constant_shift_v2i8:
; X32-SSE: # %bb.0:
-; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm0
-; X32-SSE-NEXT: movdqa %xmm0, %xmm1
-; X32-SSE-NEXT: psrlq $2, %xmm1
-; X32-SSE-NEXT: psrlq $3, %xmm0
-; X32-SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; X32-SSE-NEXT: pxor %xmm1, %xmm1
+; X32-SSE-NEXT: movdqa %xmm0, %xmm2
+; X32-SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
+; X32-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; X32-SSE-NEXT: pmullw {{\.LCPI.*}}, %xmm0
+; X32-SSE-NEXT: psrlw $8, %xmm0
+; X32-SSE-NEXT: packuswb %xmm2, %xmm0
; X32-SSE-NEXT: retl
%shift = lshr <2 x i8> %a, <i8 2, i8 3>
ret <2 x i8> %shift
;
define <2 x i32> @splatconstant_shift_v2i32(<2 x i32> %a) nounwind {
-; SSE2-LABEL: splatconstant_shift_v2i32:
-; SSE2: # %bb.0:
-; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
-; SSE2-NEXT: psrlq $5, %xmm0
-; SSE2-NEXT: retq
-;
-; SSE41-LABEL: splatconstant_shift_v2i32:
-; SSE41: # %bb.0:
-; SSE41-NEXT: pxor %xmm1, %xmm1
-; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
-; SSE41-NEXT: psrlq $5, %xmm0
-; SSE41-NEXT: retq
-;
-; AVX1-LABEL: splatconstant_shift_v2i32:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
-; AVX1-NEXT: vpsrlq $5, %xmm0, %xmm0
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: splatconstant_shift_v2i32:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
-; AVX2-NEXT: vpsrlq $5, %xmm0, %xmm0
-; AVX2-NEXT: retq
+; SSE-LABEL: splatconstant_shift_v2i32:
+; SSE: # %bb.0:
+; SSE-NEXT: psrld $5, %xmm0
+; SSE-NEXT: retq
;
-; XOPAVX1-LABEL: splatconstant_shift_v2i32:
-; XOPAVX1: # %bb.0:
-; XOPAVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; XOPAVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
-; XOPAVX1-NEXT: vpsrlq $5, %xmm0, %xmm0
-; XOPAVX1-NEXT: retq
+; AVX-LABEL: splatconstant_shift_v2i32:
+; AVX: # %bb.0:
+; AVX-NEXT: vpsrld $5, %xmm0, %xmm0
+; AVX-NEXT: retq
;
-; XOPAVX2-LABEL: splatconstant_shift_v2i32:
-; XOPAVX2: # %bb.0:
-; XOPAVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; XOPAVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
-; XOPAVX2-NEXT: vpsrlq $5, %xmm0, %xmm0
-; XOPAVX2-NEXT: retq
+; XOP-LABEL: splatconstant_shift_v2i32:
+; XOP: # %bb.0:
+; XOP-NEXT: vpsrld $5, %xmm0, %xmm0
+; XOP-NEXT: retq
;
; AVX512-LABEL: splatconstant_shift_v2i32:
; AVX512: # %bb.0:
-; AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX512-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
-; AVX512-NEXT: vpsrlq $5, %xmm0, %xmm0
+; AVX512-NEXT: vpsrld $5, %xmm0, %xmm0
; AVX512-NEXT: retq
;
; AVX512VL-LABEL: splatconstant_shift_v2i32:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
-; AVX512VL-NEXT: vpsrlq $5, %xmm0, %xmm0
+; AVX512VL-NEXT: vpsrld $5, %xmm0, %xmm0
; AVX512VL-NEXT: retq
;
; X32-SSE-LABEL: splatconstant_shift_v2i32:
; X32-SSE: # %bb.0:
-; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm0
-; X32-SSE-NEXT: psrlq $5, %xmm0
+; X32-SSE-NEXT: psrld $5, %xmm0
; X32-SSE-NEXT: retl
%shift = lshr <2 x i32> %a, <i32 5, i32 5>
ret <2 x i32> %shift
}
define <4 x i16> @splatconstant_shift_v4i16(<4 x i16> %a) nounwind {
-; SSE2-LABEL: splatconstant_shift_v4i16:
-; SSE2: # %bb.0:
-; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
-; SSE2-NEXT: psrld $3, %xmm0
-; SSE2-NEXT: retq
-;
-; SSE41-LABEL: splatconstant_shift_v4i16:
-; SSE41: # %bb.0:
-; SSE41-NEXT: pxor %xmm1, %xmm1
-; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7]
-; SSE41-NEXT: psrld $3, %xmm0
-; SSE41-NEXT: retq
+; SSE-LABEL: splatconstant_shift_v4i16:
+; SSE: # %bb.0:
+; SSE-NEXT: psrlw $3, %xmm0
+; SSE-NEXT: retq
;
; AVX-LABEL: splatconstant_shift_v4i16:
; AVX: # %bb.0:
-; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7]
-; AVX-NEXT: vpsrld $3, %xmm0, %xmm0
+; AVX-NEXT: vpsrlw $3, %xmm0, %xmm0
; AVX-NEXT: retq
;
; XOP-LABEL: splatconstant_shift_v4i16:
; XOP: # %bb.0:
-; XOP-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; XOP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7]
-; XOP-NEXT: vpsrld $3, %xmm0, %xmm0
+; XOP-NEXT: vpsrlw $3, %xmm0, %xmm0
; XOP-NEXT: retq
;
; AVX512-LABEL: splatconstant_shift_v4i16:
; AVX512: # %bb.0:
-; AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX512-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7]
-; AVX512-NEXT: vpsrld $3, %xmm0, %xmm0
+; AVX512-NEXT: vpsrlw $3, %xmm0, %xmm0
; AVX512-NEXT: retq
;
; AVX512VL-LABEL: splatconstant_shift_v4i16:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX512VL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7]
-; AVX512VL-NEXT: vpsrld $3, %xmm0, %xmm0
+; AVX512VL-NEXT: vpsrlw $3, %xmm0, %xmm0
; AVX512VL-NEXT: retq
;
; X32-SSE-LABEL: splatconstant_shift_v4i16:
; X32-SSE: # %bb.0:
-; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm0
-; X32-SSE-NEXT: psrld $3, %xmm0
+; X32-SSE-NEXT: psrlw $3, %xmm0
; X32-SSE-NEXT: retl
%shift = lshr <4 x i16> %a, <i16 3, i16 3, i16 3, i16 3>
ret <4 x i16> %shift
}
define <2 x i16> @splatconstant_shift_v2i16(<2 x i16> %a) nounwind {
-; SSE2-LABEL: splatconstant_shift_v2i16:
-; SSE2: # %bb.0:
-; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
-; SSE2-NEXT: psrlq $3, %xmm0
-; SSE2-NEXT: retq
-;
-; SSE41-LABEL: splatconstant_shift_v2i16:
-; SSE41: # %bb.0:
-; SSE41-NEXT: pxor %xmm1, %xmm1
-; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3],xmm0[4],xmm1[5,6,7]
-; SSE41-NEXT: psrlq $3, %xmm0
-; SSE41-NEXT: retq
+; SSE-LABEL: splatconstant_shift_v2i16:
+; SSE: # %bb.0:
+; SSE-NEXT: psrlw $3, %xmm0
+; SSE-NEXT: retq
;
; AVX-LABEL: splatconstant_shift_v2i16:
; AVX: # %bb.0:
-; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3],xmm0[4],xmm1[5,6,7]
-; AVX-NEXT: vpsrlq $3, %xmm0, %xmm0
+; AVX-NEXT: vpsrlw $3, %xmm0, %xmm0
; AVX-NEXT: retq
;
; XOP-LABEL: splatconstant_shift_v2i16:
; XOP: # %bb.0:
-; XOP-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; XOP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3],xmm0[4],xmm1[5,6,7]
-; XOP-NEXT: vpsrlq $3, %xmm0, %xmm0
+; XOP-NEXT: vpsrlw $3, %xmm0, %xmm0
; XOP-NEXT: retq
;
; AVX512-LABEL: splatconstant_shift_v2i16:
; AVX512: # %bb.0:
-; AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX512-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3],xmm0[4],xmm1[5,6,7]
-; AVX512-NEXT: vpsrlq $3, %xmm0, %xmm0
+; AVX512-NEXT: vpsrlw $3, %xmm0, %xmm0
; AVX512-NEXT: retq
;
; AVX512VL-LABEL: splatconstant_shift_v2i16:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX512VL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3],xmm0[4],xmm1[5,6,7]
-; AVX512VL-NEXT: vpsrlq $3, %xmm0, %xmm0
+; AVX512VL-NEXT: vpsrlw $3, %xmm0, %xmm0
; AVX512VL-NEXT: retq
;
; X32-SSE-LABEL: splatconstant_shift_v2i16:
; X32-SSE: # %bb.0:
-; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm0
-; X32-SSE-NEXT: psrlq $3, %xmm0
+; X32-SSE-NEXT: psrlw $3, %xmm0
; X32-SSE-NEXT: retl
%shift = lshr <2 x i16> %a, <i16 3, i16 3>
ret <2 x i16> %shift
define <8 x i8> @splatconstant_shift_v8i8(<8 x i8> %a) nounwind {
; SSE-LABEL: splatconstant_shift_v8i8:
; SSE: # %bb.0:
-; SSE-NEXT: pand {{.*}}(%rip), %xmm0
; SSE-NEXT: psrlw $3, %xmm0
+; SSE-NEXT: pand {{.*}}(%rip), %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: splatconstant_shift_v8i8:
; AVX: # %bb.0:
-; AVX-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
; AVX-NEXT: vpsrlw $3, %xmm0, %xmm0
+; AVX-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
; AVX-NEXT: retq
;
; XOP-LABEL: splatconstant_shift_v8i8:
; XOP: # %bb.0:
-; XOP-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
-; XOP-NEXT: vpsrlw $3, %xmm0, %xmm0
+; XOP-NEXT: vpshlb {{.*}}(%rip), %xmm0, %xmm0
; XOP-NEXT: retq
;
; AVX512-LABEL: splatconstant_shift_v8i8:
; AVX512: # %bb.0:
-; AVX512-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
; AVX512-NEXT: vpsrlw $3, %xmm0, %xmm0
+; AVX512-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
; AVX512-NEXT: retq
;
; AVX512VL-LABEL: splatconstant_shift_v8i8:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
; AVX512VL-NEXT: vpsrlw $3, %xmm0, %xmm0
+; AVX512VL-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
; AVX512VL-NEXT: retq
;
; X32-SSE-LABEL: splatconstant_shift_v8i8:
; X32-SSE: # %bb.0:
-; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm0
; X32-SSE-NEXT: psrlw $3, %xmm0
+; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm0
; X32-SSE-NEXT: retl
%shift = lshr <8 x i8> %a, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>
ret <8 x i8> %shift
define <4 x i8> @splatconstant_shift_v4i8(<4 x i8> %a) nounwind {
; SSE-LABEL: splatconstant_shift_v4i8:
; SSE: # %bb.0:
+; SSE-NEXT: psrlw $3, %xmm0
; SSE-NEXT: pand {{.*}}(%rip), %xmm0
-; SSE-NEXT: psrld $3, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: splatconstant_shift_v4i8:
; AVX: # %bb.0:
+; AVX-NEXT: vpsrlw $3, %xmm0, %xmm0
; AVX-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
-; AVX-NEXT: vpsrld $3, %xmm0, %xmm0
; AVX-NEXT: retq
;
; XOP-LABEL: splatconstant_shift_v4i8:
; XOP: # %bb.0:
-; XOP-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
-; XOP-NEXT: vpsrld $3, %xmm0, %xmm0
+; XOP-NEXT: vpshlb {{.*}}(%rip), %xmm0, %xmm0
; XOP-NEXT: retq
;
; AVX512-LABEL: splatconstant_shift_v4i8:
; AVX512: # %bb.0:
+; AVX512-NEXT: vpsrlw $3, %xmm0, %xmm0
; AVX512-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
-; AVX512-NEXT: vpsrld $3, %xmm0, %xmm0
; AVX512-NEXT: retq
;
; AVX512VL-LABEL: splatconstant_shift_v4i8:
; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vpsrlw $3, %xmm0, %xmm0
; AVX512VL-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
-; AVX512VL-NEXT: vpsrld $3, %xmm0, %xmm0
; AVX512VL-NEXT: retq
;
; X32-SSE-LABEL: splatconstant_shift_v4i8:
; X32-SSE: # %bb.0:
+; X32-SSE-NEXT: psrlw $3, %xmm0
; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm0
-; X32-SSE-NEXT: psrld $3, %xmm0
; X32-SSE-NEXT: retl
%shift = lshr <4 x i8> %a, <i8 3, i8 3, i8 3, i8 3>
ret <4 x i8> %shift
define <2 x i8> @splatconstant_shift_v2i8(<2 x i8> %a) nounwind {
; SSE-LABEL: splatconstant_shift_v2i8:
; SSE: # %bb.0:
+; SSE-NEXT: psrlw $3, %xmm0
; SSE-NEXT: pand {{.*}}(%rip), %xmm0
-; SSE-NEXT: psrlq $3, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: splatconstant_shift_v2i8:
; AVX: # %bb.0:
+; AVX-NEXT: vpsrlw $3, %xmm0, %xmm0
; AVX-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
-; AVX-NEXT: vpsrlq $3, %xmm0, %xmm0
; AVX-NEXT: retq
;
; XOP-LABEL: splatconstant_shift_v2i8:
; XOP: # %bb.0:
-; XOP-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
-; XOP-NEXT: vpsrlq $3, %xmm0, %xmm0
+; XOP-NEXT: vpshlb {{.*}}(%rip), %xmm0, %xmm0
; XOP-NEXT: retq
;
; AVX512-LABEL: splatconstant_shift_v2i8:
; AVX512: # %bb.0:
+; AVX512-NEXT: vpsrlw $3, %xmm0, %xmm0
; AVX512-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
-; AVX512-NEXT: vpsrlq $3, %xmm0, %xmm0
; AVX512-NEXT: retq
;
; AVX512VL-LABEL: splatconstant_shift_v2i8:
; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vpsrlw $3, %xmm0, %xmm0
; AVX512VL-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
-; AVX512VL-NEXT: vpsrlq $3, %xmm0, %xmm0
; AVX512VL-NEXT: retq
;
; X32-SSE-LABEL: splatconstant_shift_v2i8:
; X32-SSE: # %bb.0:
+; X32-SSE-NEXT: psrlw $3, %xmm0
; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm0
-; X32-SSE-NEXT: psrlq $3, %xmm0
; X32-SSE-NEXT: retl
%shift = lshr <2 x i8> %a, <i8 3, i8 3>
ret <2 x i8> %shift
define <2 x i32> @var_shift_v2i32(<2 x i32> %a, <2 x i32> %b) nounwind {
; SSE2-LABEL: var_shift_v2i32:
; SSE2: # %bb.0:
-; SSE2-NEXT: pand {{.*}}(%rip), %xmm1
-; SSE2-NEXT: movdqa %xmm0, %xmm2
-; SSE2-NEXT: psllq %xmm1, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
-; SSE2-NEXT: psllq %xmm1, %xmm0
-; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1]
-; SSE2-NEXT: retq
-;
-; SSE41-LABEL: var_shift_v2i32:
-; SSE41: # %bb.0:
-; SSE41-NEXT: pxor %xmm2, %xmm2
-; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
-; SSE41-NEXT: movdqa %xmm0, %xmm1
-; SSE41-NEXT: psllq %xmm2, %xmm1
-; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
-; SSE41-NEXT: psllq %xmm2, %xmm0
-; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
-; SSE41-NEXT: retq
-;
-; AVX1-LABEL: var_shift_v2i32:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
-; AVX1-NEXT: vpsllq %xmm1, %xmm0, %xmm2
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
-; AVX1-NEXT: vpsllq %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1,2,3],xmm0[4,5,6,7]
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: var_shift_v2i32:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
-; AVX2-NEXT: vpsllvq %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: retq
-;
-; XOPAVX1-LABEL: var_shift_v2i32:
-; XOPAVX1: # %bb.0:
-; XOPAVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; XOPAVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
-; XOPAVX1-NEXT: vpshlq %xmm1, %xmm0, %xmm0
-; XOPAVX1-NEXT: retq
-;
-; XOPAVX2-LABEL: var_shift_v2i32:
-; XOPAVX2: # %bb.0:
-; XOPAVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; XOPAVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
-; XOPAVX2-NEXT: vpsllvq %xmm1, %xmm0, %xmm0
-; XOPAVX2-NEXT: retq
-;
-; AVX512-LABEL: var_shift_v2i32:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX512-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
-; AVX512-NEXT: vpsllvq %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: retq
-;
-; AVX512VL-LABEL: var_shift_v2i32:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX512VL-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
-; AVX512VL-NEXT: vpsllvq %xmm1, %xmm0, %xmm0
-; AVX512VL-NEXT: retq
-;
-; X32-SSE-LABEL: var_shift_v2i32:
-; X32-SSE: # %bb.0:
-; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm1
-; X32-SSE-NEXT: movdqa %xmm0, %xmm2
-; X32-SSE-NEXT: psllq %xmm1, %xmm2
-; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
-; X32-SSE-NEXT: xorps %xmm3, %xmm3
-; X32-SSE-NEXT: movss {{.*#+}} xmm3 = xmm1[0],xmm3[1,2,3]
-; X32-SSE-NEXT: psllq %xmm3, %xmm0
-; X32-SSE-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1]
-; X32-SSE-NEXT: retl
- %shift = shl <2 x i32> %a, %b
- ret <2 x i32> %shift
-}
-
-define <4 x i16> @var_shift_v4i16(<4 x i16> %a, <4 x i16> %b) nounwind {
-; SSE2-LABEL: var_shift_v4i16:
-; SSE2: # %bb.0:
; SSE2-NEXT: pslld $23, %xmm1
; SSE2-NEXT: paddd {{.*}}(%rip), %xmm1
; SSE2-NEXT: cvttps2dq %xmm1, %xmm1
; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE2-NEXT: retq
;
-; SSE41-LABEL: var_shift_v4i16:
+; SSE41-LABEL: var_shift_v2i32:
; SSE41: # %bb.0:
; SSE41-NEXT: pslld $23, %xmm1
; SSE41-NEXT: paddd {{.*}}(%rip), %xmm1
; SSE41-NEXT: pmulld %xmm1, %xmm0
; SSE41-NEXT: retq
;
-; AVX1-LABEL: var_shift_v4i16:
+; AVX1-LABEL: var_shift_v2i32:
; AVX1: # %bb.0:
; AVX1-NEXT: vpslld $23, %xmm1, %xmm1
; AVX1-NEXT: vpaddd {{.*}}(%rip), %xmm1, %xmm1
; AVX1-NEXT: vpmulld %xmm1, %xmm0, %xmm0
; AVX1-NEXT: retq
;
-; AVX2-LABEL: var_shift_v4i16:
+; AVX2-LABEL: var_shift_v2i32:
; AVX2: # %bb.0:
-; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4],xmm2[5],xmm1[6],xmm2[7]
; AVX2-NEXT: vpsllvd %xmm1, %xmm0, %xmm0
; AVX2-NEXT: retq
;
-; XOPAVX1-LABEL: var_shift_v4i16:
+; XOPAVX1-LABEL: var_shift_v2i32:
; XOPAVX1: # %bb.0:
-; XOPAVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; XOPAVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4],xmm2[5],xmm1[6],xmm2[7]
; XOPAVX1-NEXT: vpshld %xmm1, %xmm0, %xmm0
; XOPAVX1-NEXT: retq
;
-; XOPAVX2-LABEL: var_shift_v4i16:
+; XOPAVX2-LABEL: var_shift_v2i32:
; XOPAVX2: # %bb.0:
-; XOPAVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; XOPAVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4],xmm2[5],xmm1[6],xmm2[7]
; XOPAVX2-NEXT: vpsllvd %xmm1, %xmm0, %xmm0
; XOPAVX2-NEXT: retq
;
-; AVX512-LABEL: var_shift_v4i16:
+; AVX512-LABEL: var_shift_v2i32:
; AVX512: # %bb.0:
-; AVX512-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX512-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4],xmm2[5],xmm1[6],xmm2[7]
; AVX512-NEXT: vpsllvd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: retq
;
-; AVX512VL-LABEL: var_shift_v4i16:
+; AVX512VL-LABEL: var_shift_v2i32:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX512VL-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4],xmm2[5],xmm1[6],xmm2[7]
; AVX512VL-NEXT: vpsllvd %xmm1, %xmm0, %xmm0
; AVX512VL-NEXT: retq
;
-; X32-SSE-LABEL: var_shift_v4i16:
+; X32-SSE-LABEL: var_shift_v2i32:
; X32-SSE: # %bb.0:
; X32-SSE-NEXT: pslld $23, %xmm1
; X32-SSE-NEXT: paddd {{\.LCPI.*}}, %xmm1
; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; X32-SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; X32-SSE-NEXT: retl
- %shift = shl <4 x i16> %a, %b
- ret <4 x i16> %shift
+ %shift = shl <2 x i32> %a, %b
+ ret <2 x i32> %shift
}
-define <2 x i16> @var_shift_v2i16(<2 x i16> %a, <2 x i16> %b) nounwind {
-; SSE2-LABEL: var_shift_v2i16:
+define <4 x i16> @var_shift_v4i16(<4 x i16> %a, <4 x i16> %b) nounwind {
+; SSE2-LABEL: var_shift_v4i16:
; SSE2: # %bb.0:
-; SSE2-NEXT: pand {{.*}}(%rip), %xmm1
-; SSE2-NEXT: movdqa %xmm0, %xmm2
-; SSE2-NEXT: psllq %xmm1, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
-; SSE2-NEXT: psllq %xmm1, %xmm0
-; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1]
+; SSE2-NEXT: movdqa %xmm1, %xmm2
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
+; SSE2-NEXT: pslld $23, %xmm2
+; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [1065353216,1065353216,1065353216,1065353216]
+; SSE2-NEXT: paddd %xmm3, %xmm2
+; SSE2-NEXT: cvttps2dq %xmm2, %xmm2
+; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,2,2,3,4,5,6,7]
+; SSE2-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7]
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; SSE2-NEXT: pslld $23, %xmm1
+; SSE2-NEXT: paddd %xmm3, %xmm1
+; SSE2-NEXT: cvttps2dq %xmm1, %xmm1
+; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
+; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7]
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; SSE2-NEXT: pmullw %xmm1, %xmm0
; SSE2-NEXT: retq
;
-; SSE41-LABEL: var_shift_v2i16:
+; SSE41-LABEL: var_shift_v4i16:
; SSE41: # %bb.0:
-; SSE41-NEXT: pxor %xmm2, %xmm2
-; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3],xmm1[4],xmm2[5,6,7]
-; SSE41-NEXT: movdqa %xmm0, %xmm1
-; SSE41-NEXT: psllq %xmm2, %xmm1
-; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
-; SSE41-NEXT: psllq %xmm2, %xmm0
-; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
+; SSE41-NEXT: pmovzxwd {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
+; SSE41-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; SSE41-NEXT: pslld $23, %xmm1
+; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [1065353216,1065353216,1065353216,1065353216]
+; SSE41-NEXT: paddd %xmm3, %xmm1
+; SSE41-NEXT: cvttps2dq %xmm1, %xmm1
+; SSE41-NEXT: pslld $23, %xmm2
+; SSE41-NEXT: paddd %xmm3, %xmm2
+; SSE41-NEXT: cvttps2dq %xmm2, %xmm2
+; SSE41-NEXT: packusdw %xmm1, %xmm2
+; SSE41-NEXT: pmullw %xmm2, %xmm0
; SSE41-NEXT: retq
;
-; AVX1-LABEL: var_shift_v2i16:
+; AVX1-LABEL: var_shift_v4i16:
; AVX1: # %bb.0:
-; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3],xmm1[4],xmm2[5,6,7]
-; AVX1-NEXT: vpsllq %xmm1, %xmm0, %xmm2
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
-; AVX1-NEXT: vpsllq %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1,2,3],xmm0[4,5,6,7]
+; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; AVX1-NEXT: vpslld $23, %xmm2, %xmm2
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [1065353216,1065353216,1065353216,1065353216]
+; AVX1-NEXT: vpaddd %xmm3, %xmm2, %xmm2
+; AVX1-NEXT: vcvttps2dq %xmm2, %xmm2
+; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
+; AVX1-NEXT: vpslld $23, %xmm1, %xmm1
+; AVX1-NEXT: vpaddd %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vcvttps2dq %xmm1, %xmm1
+; AVX1-NEXT: vpackusdw %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpmullw %xmm1, %xmm0, %xmm0
; AVX1-NEXT: retq
;
-; AVX2-LABEL: var_shift_v2i16:
+; AVX2-LABEL: var_shift_v4i16:
; AVX2: # %bb.0:
-; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3],xmm1[4],xmm2[5,6,7]
-; AVX2-NEXT: vpsllvq %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
+; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX2-NEXT: vpsllvd %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
-; XOPAVX1-LABEL: var_shift_v2i16:
-; XOPAVX1: # %bb.0:
-; XOPAVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; XOPAVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3],xmm1[4],xmm2[5,6,7]
-; XOPAVX1-NEXT: vpshlq %xmm1, %xmm0, %xmm0
-; XOPAVX1-NEXT: retq
+; XOP-LABEL: var_shift_v4i16:
+; XOP: # %bb.0:
+; XOP-NEXT: vpshlw %xmm1, %xmm0, %xmm0
+; XOP-NEXT: retq
;
-; XOPAVX2-LABEL: var_shift_v2i16:
-; XOPAVX2: # %bb.0:
-; XOPAVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; XOPAVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3],xmm1[4],xmm2[5,6,7]
-; XOPAVX2-NEXT: vpsllvq %xmm1, %xmm0, %xmm0
-; XOPAVX2-NEXT: retq
+; AVX512DQ-LABEL: var_shift_v4i16:
+; AVX512DQ: # %bb.0:
+; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
+; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX512DQ-NEXT: vpsllvd %ymm1, %ymm0, %ymm0
+; AVX512DQ-NEXT: vpmovdw %zmm0, %ymm0
+; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX512DQ-NEXT: vzeroupper
+; AVX512DQ-NEXT: retq
;
-; AVX512-LABEL: var_shift_v2i16:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX512-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3],xmm1[4],xmm2[5,6,7]
-; AVX512-NEXT: vpsllvq %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: retq
+; AVX512BW-LABEL: var_shift_v4i16:
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
+; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
+; AVX512BW-NEXT: vpsllvw %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
+; AVX512BW-NEXT: vzeroupper
+; AVX512BW-NEXT: retq
;
-; AVX512VL-LABEL: var_shift_v2i16:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX512VL-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3],xmm1[4],xmm2[5,6,7]
-; AVX512VL-NEXT: vpsllvq %xmm1, %xmm0, %xmm0
-; AVX512VL-NEXT: retq
+; AVX512DQVL-LABEL: var_shift_v4i16:
+; AVX512DQVL: # %bb.0:
+; AVX512DQVL-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
+; AVX512DQVL-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX512DQVL-NEXT: vpsllvd %ymm1, %ymm0, %ymm0
+; AVX512DQVL-NEXT: vpmovdw %ymm0, %xmm0
+; AVX512DQVL-NEXT: vzeroupper
+; AVX512DQVL-NEXT: retq
;
-; X32-SSE-LABEL: var_shift_v2i16:
+; AVX512BWVL-LABEL: var_shift_v4i16:
+; AVX512BWVL: # %bb.0:
+; AVX512BWVL-NEXT: vpsllvw %xmm1, %xmm0, %xmm0
+; AVX512BWVL-NEXT: retq
+;
+; X32-SSE-LABEL: var_shift_v4i16:
; X32-SSE: # %bb.0:
-; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm1
-; X32-SSE-NEXT: movdqa %xmm0, %xmm2
-; X32-SSE-NEXT: psllq %xmm1, %xmm2
-; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
-; X32-SSE-NEXT: psllq %xmm1, %xmm0
-; X32-SSE-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1]
+; X32-SSE-NEXT: movdqa %xmm1, %xmm2
+; X32-SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
+; X32-SSE-NEXT: pslld $23, %xmm2
+; X32-SSE-NEXT: movdqa {{.*#+}} xmm3 = [1065353216,1065353216,1065353216,1065353216]
+; X32-SSE-NEXT: paddd %xmm3, %xmm2
+; X32-SSE-NEXT: cvttps2dq %xmm2, %xmm2
+; X32-SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,2,2,3,4,5,6,7]
+; X32-SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7]
+; X32-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
+; X32-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; X32-SSE-NEXT: pslld $23, %xmm1
+; X32-SSE-NEXT: paddd %xmm3, %xmm1
+; X32-SSE-NEXT: cvttps2dq %xmm1, %xmm1
+; X32-SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
+; X32-SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7]
+; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; X32-SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; X32-SSE-NEXT: pmullw %xmm1, %xmm0
; X32-SSE-NEXT: retl
- %shift = shl <2 x i16> %a, %b
- ret <2 x i16> %shift
+ %shift = shl <4 x i16> %a, %b
+ ret <4 x i16> %shift
}
-define <8 x i8> @var_shift_v8i8(<8 x i8> %a, <8 x i8> %b) nounwind {
-; SSE2-LABEL: var_shift_v8i8:
+define <2 x i16> @var_shift_v2i16(<2 x i16> %a, <2 x i16> %b) nounwind {
+; SSE2-LABEL: var_shift_v2i16:
; SSE2: # %bb.0:
-; SSE2-NEXT: pxor %xmm2, %xmm2
-; SSE2-NEXT: movdqa %xmm1, %xmm3
-; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15]
-; SSE2-NEXT: pslld $23, %xmm3
-; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [1065353216,1065353216,1065353216,1065353216]
-; SSE2-NEXT: paddd %xmm4, %xmm3
-; SSE2-NEXT: cvttps2dq %xmm3, %xmm3
-; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,2,2,3,4,5,6,7]
-; SSE2-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,6,6,7]
-; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
+; SSE2-NEXT: movdqa %xmm1, %xmm2
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
+; SSE2-NEXT: pslld $23, %xmm2
+; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [1065353216,1065353216,1065353216,1065353216]
+; SSE2-NEXT: paddd %xmm3, %xmm2
+; SSE2-NEXT: cvttps2dq %xmm2, %xmm2
+; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,2,2,3,4,5,6,7]
+; SSE2-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7]
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; SSE2-NEXT: pslld $23, %xmm1
-; SSE2-NEXT: paddd %xmm4, %xmm1
+; SSE2-NEXT: paddd %xmm3, %xmm1
; SSE2-NEXT: cvttps2dq %xmm1, %xmm1
; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7]
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0]
+; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
; SSE2-NEXT: pmullw %xmm1, %xmm0
; SSE2-NEXT: retq
;
-; SSE41-LABEL: var_shift_v8i8:
+; SSE41-LABEL: var_shift_v2i16:
; SSE41: # %bb.0:
-; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
-; SSE41-NEXT: pand %xmm1, %xmm2
-; SSE41-NEXT: pxor %xmm3, %xmm3
-; SSE41-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm3[8],xmm1[9],xmm3[9],xmm1[10],xmm3[10],xmm1[11],xmm3[11],xmm1[12],xmm3[12],xmm1[13],xmm3[13],xmm1[14],xmm3[14],xmm1[15],xmm3[15]
+; SSE41-NEXT: pmovzxwd {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
+; SSE41-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
; SSE41-NEXT: pslld $23, %xmm1
; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [1065353216,1065353216,1065353216,1065353216]
; SSE41-NEXT: paddd %xmm3, %xmm1
; SSE41-NEXT: cvttps2dq %xmm1, %xmm1
-; SSE41-NEXT: pmovzxwd {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
; SSE41-NEXT: pslld $23, %xmm2
; SSE41-NEXT: paddd %xmm3, %xmm2
; SSE41-NEXT: cvttps2dq %xmm2, %xmm2
; SSE41-NEXT: pmullw %xmm2, %xmm0
; SSE41-NEXT: retq
;
-; AVX1-LABEL: var_shift_v8i8:
+; AVX1-LABEL: var_shift_v2i16:
; AVX1: # %bb.0:
-; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15]
+; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
; AVX1-NEXT: vpslld $23, %xmm2, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [1065353216,1065353216,1065353216,1065353216]
; AVX1-NEXT: vpaddd %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vcvttps2dq %xmm2, %xmm2
-; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
+; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
; AVX1-NEXT: vpslld $23, %xmm1, %xmm1
; AVX1-NEXT: vpaddd %xmm3, %xmm1, %xmm1
; AVX1-NEXT: vcvttps2dq %xmm1, %xmm1
; AVX1-NEXT: vpmullw %xmm1, %xmm0, %xmm0
; AVX1-NEXT: retq
;
-; AVX2-LABEL: var_shift_v8i8:
+; AVX2-LABEL: var_shift_v2i16:
; AVX2: # %bb.0:
-; AVX2-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
-; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
+; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX2-NEXT: vpsllvd %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
-; XOP-LABEL: var_shift_v8i8:
+; XOP-LABEL: var_shift_v2i16:
; XOP: # %bb.0:
-; XOP-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
; XOP-NEXT: vpshlw %xmm1, %xmm0, %xmm0
; XOP-NEXT: retq
;
-; AVX512DQ-LABEL: var_shift_v8i8:
+; AVX512DQ-LABEL: var_shift_v2i16:
; AVX512DQ: # %bb.0:
-; AVX512DQ-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
-; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
+; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX512DQ-NEXT: vpsllvd %ymm1, %ymm0, %ymm0
; AVX512DQ-NEXT: vpmovdw %zmm0, %ymm0
; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
-; AVX512BW-LABEL: var_shift_v8i8:
+; AVX512BW-LABEL: var_shift_v2i16:
; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
-; AVX512BW-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
; AVX512BW-NEXT: vpsllvw %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
-; AVX512DQVL-LABEL: var_shift_v8i8:
+; AVX512DQVL-LABEL: var_shift_v2i16:
; AVX512DQVL: # %bb.0:
-; AVX512DQVL-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
-; AVX512DQVL-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX512DQVL-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
+; AVX512DQVL-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX512DQVL-NEXT: vpsllvd %ymm1, %ymm0, %ymm0
; AVX512DQVL-NEXT: vpmovdw %ymm0, %xmm0
; AVX512DQVL-NEXT: vzeroupper
; AVX512DQVL-NEXT: retq
;
-; AVX512BWVL-LABEL: var_shift_v8i8:
+; AVX512BWVL-LABEL: var_shift_v2i16:
; AVX512BWVL: # %bb.0:
-; AVX512BWVL-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
; AVX512BWVL-NEXT: vpsllvw %xmm1, %xmm0, %xmm0
; AVX512BWVL-NEXT: retq
;
-; X32-SSE-LABEL: var_shift_v8i8:
+; X32-SSE-LABEL: var_shift_v2i16:
; X32-SSE: # %bb.0:
-; X32-SSE-NEXT: pxor %xmm2, %xmm2
-; X32-SSE-NEXT: movdqa %xmm1, %xmm3
-; X32-SSE-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15]
-; X32-SSE-NEXT: pslld $23, %xmm3
-; X32-SSE-NEXT: movdqa {{.*#+}} xmm4 = [1065353216,1065353216,1065353216,1065353216]
-; X32-SSE-NEXT: paddd %xmm4, %xmm3
-; X32-SSE-NEXT: cvttps2dq %xmm3, %xmm3
-; X32-SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,2,2,3,4,5,6,7]
-; X32-SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,6,6,7]
-; X32-SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
-; X32-SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
+; X32-SSE-NEXT: movdqa %xmm1, %xmm2
+; X32-SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
+; X32-SSE-NEXT: pslld $23, %xmm2
+; X32-SSE-NEXT: movdqa {{.*#+}} xmm3 = [1065353216,1065353216,1065353216,1065353216]
+; X32-SSE-NEXT: paddd %xmm3, %xmm2
+; X32-SSE-NEXT: cvttps2dq %xmm2, %xmm2
+; X32-SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,2,2,3,4,5,6,7]
+; X32-SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7]
+; X32-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
+; X32-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; X32-SSE-NEXT: pslld $23, %xmm1
-; X32-SSE-NEXT: paddd %xmm4, %xmm1
+; X32-SSE-NEXT: paddd %xmm3, %xmm1
; X32-SSE-NEXT: cvttps2dq %xmm1, %xmm1
; X32-SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
; X32-SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7]
; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; X32-SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0]
+; X32-SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
; X32-SSE-NEXT: pmullw %xmm1, %xmm0
+; X32-SSE-NEXT: retl
+ %shift = shl <2 x i16> %a, %b
+ ret <2 x i16> %shift
+}
+
+define <8 x i8> @var_shift_v8i8(<8 x i8> %a, <8 x i8> %b) nounwind {
+; SSE2-LABEL: var_shift_v8i8:
+; SSE2: # %bb.0:
+; SSE2-NEXT: psllw $5, %xmm1
+; SSE2-NEXT: pxor %xmm2, %xmm2
+; SSE2-NEXT: pxor %xmm3, %xmm3
+; SSE2-NEXT: pcmpgtb %xmm1, %xmm3
+; SSE2-NEXT: movdqa %xmm3, %xmm4
+; SSE2-NEXT: pandn %xmm0, %xmm4
+; SSE2-NEXT: psllw $4, %xmm0
+; SSE2-NEXT: pand %xmm3, %xmm0
+; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
+; SSE2-NEXT: por %xmm4, %xmm0
+; SSE2-NEXT: paddb %xmm1, %xmm1
+; SSE2-NEXT: pxor %xmm3, %xmm3
+; SSE2-NEXT: pcmpgtb %xmm1, %xmm3
+; SSE2-NEXT: movdqa %xmm3, %xmm4
+; SSE2-NEXT: pandn %xmm0, %xmm4
+; SSE2-NEXT: psllw $2, %xmm0
+; SSE2-NEXT: pand %xmm3, %xmm0
+; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
+; SSE2-NEXT: por %xmm4, %xmm0
+; SSE2-NEXT: paddb %xmm1, %xmm1
+; SSE2-NEXT: pcmpgtb %xmm1, %xmm2
+; SSE2-NEXT: movdqa %xmm2, %xmm1
+; SSE2-NEXT: pandn %xmm0, %xmm1
+; SSE2-NEXT: paddb %xmm0, %xmm0
+; SSE2-NEXT: pand %xmm2, %xmm0
+; SSE2-NEXT: por %xmm1, %xmm0
+; SSE2-NEXT: retq
+;
+; SSE41-LABEL: var_shift_v8i8:
+; SSE41: # %bb.0:
+; SSE41-NEXT: movdqa %xmm0, %xmm2
+; SSE41-NEXT: psllw $5, %xmm1
+; SSE41-NEXT: movdqa %xmm0, %xmm3
+; SSE41-NEXT: psllw $4, %xmm3
+; SSE41-NEXT: pand {{.*}}(%rip), %xmm3
+; SSE41-NEXT: movdqa %xmm1, %xmm0
+; SSE41-NEXT: pblendvb %xmm0, %xmm3, %xmm2
+; SSE41-NEXT: movdqa %xmm2, %xmm3
+; SSE41-NEXT: psllw $2, %xmm3
+; SSE41-NEXT: pand {{.*}}(%rip), %xmm3
+; SSE41-NEXT: paddb %xmm1, %xmm1
+; SSE41-NEXT: movdqa %xmm1, %xmm0
+; SSE41-NEXT: pblendvb %xmm0, %xmm3, %xmm2
+; SSE41-NEXT: movdqa %xmm2, %xmm3
+; SSE41-NEXT: paddb %xmm2, %xmm3
+; SSE41-NEXT: paddb %xmm1, %xmm1
+; SSE41-NEXT: movdqa %xmm1, %xmm0
+; SSE41-NEXT: pblendvb %xmm0, %xmm3, %xmm2
+; SSE41-NEXT: movdqa %xmm2, %xmm0
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: var_shift_v8i8:
+; AVX: # %bb.0:
+; AVX-NEXT: vpsllw $5, %xmm1, %xmm1
+; AVX-NEXT: vpsllw $4, %xmm0, %xmm2
+; AVX-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
+; AVX-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
+; AVX-NEXT: vpsllw $2, %xmm0, %xmm2
+; AVX-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
+; AVX-NEXT: vpaddb %xmm1, %xmm1, %xmm1
+; AVX-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
+; AVX-NEXT: vpaddb %xmm0, %xmm0, %xmm2
+; AVX-NEXT: vpaddb %xmm1, %xmm1, %xmm1
+; AVX-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
+; AVX-NEXT: retq
+;
+; XOP-LABEL: var_shift_v8i8:
+; XOP: # %bb.0:
+; XOP-NEXT: vpshlb %xmm1, %xmm0, %xmm0
+; XOP-NEXT: retq
+;
+; AVX512DQ-LABEL: var_shift_v8i8:
+; AVX512DQ: # %bb.0:
+; AVX512DQ-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
+; AVX512DQ-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
+; AVX512DQ-NEXT: vpsllvd %zmm1, %zmm0, %zmm0
+; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512DQ-NEXT: vzeroupper
+; AVX512DQ-NEXT: retq
+;
+; AVX512BW-LABEL: var_shift_v8i8:
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
+; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; AVX512BW-NEXT: vpsllvw %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
+; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX512BW-NEXT: vzeroupper
+; AVX512BW-NEXT: retq
+;
+; AVX512DQVL-LABEL: var_shift_v8i8:
+; AVX512DQVL: # %bb.0:
+; AVX512DQVL-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
+; AVX512DQVL-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
+; AVX512DQVL-NEXT: vpsllvd %zmm1, %zmm0, %zmm0
+; AVX512DQVL-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512DQVL-NEXT: vzeroupper
+; AVX512DQVL-NEXT: retq
+;
+; AVX512BWVL-LABEL: var_shift_v8i8:
+; AVX512BWVL: # %bb.0:
+; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
+; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; AVX512BWVL-NEXT: vpsllvw %ymm1, %ymm0, %ymm0
+; AVX512BWVL-NEXT: vpmovwb %ymm0, %xmm0
+; AVX512BWVL-NEXT: vzeroupper
+; AVX512BWVL-NEXT: retq
+;
+; X32-SSE-LABEL: var_shift_v8i8:
+; X32-SSE: # %bb.0:
+; X32-SSE-NEXT: psllw $5, %xmm1
+; X32-SSE-NEXT: pxor %xmm2, %xmm2
+; X32-SSE-NEXT: pxor %xmm3, %xmm3
+; X32-SSE-NEXT: pcmpgtb %xmm1, %xmm3
+; X32-SSE-NEXT: movdqa %xmm3, %xmm4
+; X32-SSE-NEXT: pandn %xmm0, %xmm4
+; X32-SSE-NEXT: psllw $4, %xmm0
+; X32-SSE-NEXT: pand %xmm3, %xmm0
+; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm0
+; X32-SSE-NEXT: por %xmm4, %xmm0
+; X32-SSE-NEXT: paddb %xmm1, %xmm1
+; X32-SSE-NEXT: pxor %xmm3, %xmm3
+; X32-SSE-NEXT: pcmpgtb %xmm1, %xmm3
+; X32-SSE-NEXT: movdqa %xmm3, %xmm4
+; X32-SSE-NEXT: pandn %xmm0, %xmm4
+; X32-SSE-NEXT: psllw $2, %xmm0
+; X32-SSE-NEXT: pand %xmm3, %xmm0
+; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm0
+; X32-SSE-NEXT: por %xmm4, %xmm0
+; X32-SSE-NEXT: paddb %xmm1, %xmm1
+; X32-SSE-NEXT: pcmpgtb %xmm1, %xmm2
+; X32-SSE-NEXT: movdqa %xmm2, %xmm1
+; X32-SSE-NEXT: pandn %xmm0, %xmm1
+; X32-SSE-NEXT: paddb %xmm0, %xmm0
+; X32-SSE-NEXT: pand %xmm2, %xmm0
+; X32-SSE-NEXT: por %xmm1, %xmm0
; X32-SSE-NEXT: retl
%shift = shl <8 x i8> %a, %b
ret <8 x i8> %shift
define <4 x i8> @var_shift_v4i8(<4 x i8> %a, <4 x i8> %b) nounwind {
; SSE2-LABEL: var_shift_v4i8:
; SSE2: # %bb.0:
-; SSE2-NEXT: pand {{.*}}(%rip), %xmm1
-; SSE2-NEXT: pslld $23, %xmm1
-; SSE2-NEXT: paddd {{.*}}(%rip), %xmm1
-; SSE2-NEXT: cvttps2dq %xmm1, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
-; SSE2-NEXT: pmuludq %xmm1, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSE2-NEXT: pmuludq %xmm2, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE2-NEXT: psllw $5, %xmm1
+; SSE2-NEXT: pxor %xmm2, %xmm2
+; SSE2-NEXT: pxor %xmm3, %xmm3
+; SSE2-NEXT: pcmpgtb %xmm1, %xmm3
+; SSE2-NEXT: movdqa %xmm3, %xmm4
+; SSE2-NEXT: pandn %xmm0, %xmm4
+; SSE2-NEXT: psllw $4, %xmm0
+; SSE2-NEXT: pand %xmm3, %xmm0
+; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
+; SSE2-NEXT: por %xmm4, %xmm0
+; SSE2-NEXT: paddb %xmm1, %xmm1
+; SSE2-NEXT: pxor %xmm3, %xmm3
+; SSE2-NEXT: pcmpgtb %xmm1, %xmm3
+; SSE2-NEXT: movdqa %xmm3, %xmm4
+; SSE2-NEXT: pandn %xmm0, %xmm4
+; SSE2-NEXT: psllw $2, %xmm0
+; SSE2-NEXT: pand %xmm3, %xmm0
+; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
+; SSE2-NEXT: por %xmm4, %xmm0
+; SSE2-NEXT: paddb %xmm1, %xmm1
+; SSE2-NEXT: pcmpgtb %xmm1, %xmm2
+; SSE2-NEXT: movdqa %xmm2, %xmm1
+; SSE2-NEXT: pandn %xmm0, %xmm1
+; SSE2-NEXT: paddb %xmm0, %xmm0
+; SSE2-NEXT: pand %xmm2, %xmm0
+; SSE2-NEXT: por %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: var_shift_v4i8:
; SSE41: # %bb.0:
-; SSE41-NEXT: pand {{.*}}(%rip), %xmm1
-; SSE41-NEXT: pslld $23, %xmm1
-; SSE41-NEXT: paddd {{.*}}(%rip), %xmm1
-; SSE41-NEXT: cvttps2dq %xmm1, %xmm1
-; SSE41-NEXT: pmulld %xmm1, %xmm0
+; SSE41-NEXT: movdqa %xmm0, %xmm2
+; SSE41-NEXT: psllw $5, %xmm1
+; SSE41-NEXT: movdqa %xmm0, %xmm3
+; SSE41-NEXT: psllw $4, %xmm3
+; SSE41-NEXT: pand {{.*}}(%rip), %xmm3
+; SSE41-NEXT: movdqa %xmm1, %xmm0
+; SSE41-NEXT: pblendvb %xmm0, %xmm3, %xmm2
+; SSE41-NEXT: movdqa %xmm2, %xmm3
+; SSE41-NEXT: psllw $2, %xmm3
+; SSE41-NEXT: pand {{.*}}(%rip), %xmm3
+; SSE41-NEXT: paddb %xmm1, %xmm1
+; SSE41-NEXT: movdqa %xmm1, %xmm0
+; SSE41-NEXT: pblendvb %xmm0, %xmm3, %xmm2
+; SSE41-NEXT: movdqa %xmm2, %xmm3
+; SSE41-NEXT: paddb %xmm2, %xmm3
+; SSE41-NEXT: paddb %xmm1, %xmm1
+; SSE41-NEXT: movdqa %xmm1, %xmm0
+; SSE41-NEXT: pblendvb %xmm0, %xmm3, %xmm2
+; SSE41-NEXT: movdqa %xmm2, %xmm0
; SSE41-NEXT: retq
;
-; AVX1-LABEL: var_shift_v4i8:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
-; AVX1-NEXT: vpslld $23, %xmm1, %xmm1
-; AVX1-NEXT: vpaddd {{.*}}(%rip), %xmm1, %xmm1
-; AVX1-NEXT: vcvttps2dq %xmm1, %xmm1
-; AVX1-NEXT: vpmulld %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: retq
+; AVX-LABEL: var_shift_v4i8:
+; AVX: # %bb.0:
+; AVX-NEXT: vpsllw $5, %xmm1, %xmm1
+; AVX-NEXT: vpsllw $4, %xmm0, %xmm2
+; AVX-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
+; AVX-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
+; AVX-NEXT: vpsllw $2, %xmm0, %xmm2
+; AVX-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
+; AVX-NEXT: vpaddb %xmm1, %xmm1, %xmm1
+; AVX-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
+; AVX-NEXT: vpaddb %xmm0, %xmm0, %xmm2
+; AVX-NEXT: vpaddb %xmm1, %xmm1, %xmm1
+; AVX-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
+; AVX-NEXT: retq
;
-; AVX2-LABEL: var_shift_v4i8:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
-; AVX2-NEXT: vpsllvd %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: retq
+; XOP-LABEL: var_shift_v4i8:
+; XOP: # %bb.0:
+; XOP-NEXT: vpshlb %xmm1, %xmm0, %xmm0
+; XOP-NEXT: retq
;
-; XOPAVX1-LABEL: var_shift_v4i8:
-; XOPAVX1: # %bb.0:
-; XOPAVX1-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
-; XOPAVX1-NEXT: vpshld %xmm1, %xmm0, %xmm0
-; XOPAVX1-NEXT: retq
+; AVX512DQ-LABEL: var_shift_v4i8:
+; AVX512DQ: # %bb.0:
+; AVX512DQ-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
+; AVX512DQ-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
+; AVX512DQ-NEXT: vpsllvd %zmm1, %zmm0, %zmm0
+; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512DQ-NEXT: vzeroupper
+; AVX512DQ-NEXT: retq
;
-; XOPAVX2-LABEL: var_shift_v4i8:
-; XOPAVX2: # %bb.0:
-; XOPAVX2-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
-; XOPAVX2-NEXT: vpsllvd %xmm1, %xmm0, %xmm0
-; XOPAVX2-NEXT: retq
+; AVX512BW-LABEL: var_shift_v4i8:
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
+; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; AVX512BW-NEXT: vpsllvw %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
+; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX512BW-NEXT: vzeroupper
+; AVX512BW-NEXT: retq
;
-; AVX512-LABEL: var_shift_v4i8:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
-; AVX512-NEXT: vpsllvd %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: retq
+; AVX512DQVL-LABEL: var_shift_v4i8:
+; AVX512DQVL: # %bb.0:
+; AVX512DQVL-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
+; AVX512DQVL-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
+; AVX512DQVL-NEXT: vpsllvd %zmm1, %zmm0, %zmm0
+; AVX512DQVL-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512DQVL-NEXT: vzeroupper
+; AVX512DQVL-NEXT: retq
;
-; AVX512VL-LABEL: var_shift_v4i8:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
-; AVX512VL-NEXT: vpsllvd %xmm1, %xmm0, %xmm0
-; AVX512VL-NEXT: retq
+; AVX512BWVL-LABEL: var_shift_v4i8:
+; AVX512BWVL: # %bb.0:
+; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
+; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; AVX512BWVL-NEXT: vpsllvw %ymm1, %ymm0, %ymm0
+; AVX512BWVL-NEXT: vpmovwb %ymm0, %xmm0
+; AVX512BWVL-NEXT: vzeroupper
+; AVX512BWVL-NEXT: retq
;
; X32-SSE-LABEL: var_shift_v4i8:
; X32-SSE: # %bb.0:
-; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm1
-; X32-SSE-NEXT: pslld $23, %xmm1
-; X32-SSE-NEXT: paddd {{\.LCPI.*}}, %xmm1
-; X32-SSE-NEXT: cvttps2dq %xmm1, %xmm1
-; X32-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
-; X32-SSE-NEXT: pmuludq %xmm1, %xmm0
-; X32-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; X32-SSE-NEXT: pmuludq %xmm2, %xmm1
-; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; X32-SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; X32-SSE-NEXT: psllw $5, %xmm1
+; X32-SSE-NEXT: pxor %xmm2, %xmm2
+; X32-SSE-NEXT: pxor %xmm3, %xmm3
+; X32-SSE-NEXT: pcmpgtb %xmm1, %xmm3
+; X32-SSE-NEXT: movdqa %xmm3, %xmm4
+; X32-SSE-NEXT: pandn %xmm0, %xmm4
+; X32-SSE-NEXT: psllw $4, %xmm0
+; X32-SSE-NEXT: pand %xmm3, %xmm0
+; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm0
+; X32-SSE-NEXT: por %xmm4, %xmm0
+; X32-SSE-NEXT: paddb %xmm1, %xmm1
+; X32-SSE-NEXT: pxor %xmm3, %xmm3
+; X32-SSE-NEXT: pcmpgtb %xmm1, %xmm3
+; X32-SSE-NEXT: movdqa %xmm3, %xmm4
+; X32-SSE-NEXT: pandn %xmm0, %xmm4
+; X32-SSE-NEXT: psllw $2, %xmm0
+; X32-SSE-NEXT: pand %xmm3, %xmm0
+; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm0
+; X32-SSE-NEXT: por %xmm4, %xmm0
+; X32-SSE-NEXT: paddb %xmm1, %xmm1
+; X32-SSE-NEXT: pcmpgtb %xmm1, %xmm2
+; X32-SSE-NEXT: movdqa %xmm2, %xmm1
+; X32-SSE-NEXT: pandn %xmm0, %xmm1
+; X32-SSE-NEXT: paddb %xmm0, %xmm0
+; X32-SSE-NEXT: pand %xmm2, %xmm0
+; X32-SSE-NEXT: por %xmm1, %xmm0
; X32-SSE-NEXT: retl
%shift = shl <4 x i8> %a, %b
ret <4 x i8> %shift
define <2 x i8> @var_shift_v2i8(<2 x i8> %a, <2 x i8> %b) nounwind {
; SSE2-LABEL: var_shift_v2i8:
; SSE2: # %bb.0:
-; SSE2-NEXT: pand {{.*}}(%rip), %xmm1
-; SSE2-NEXT: movdqa %xmm0, %xmm2
-; SSE2-NEXT: psllq %xmm1, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
-; SSE2-NEXT: psllq %xmm1, %xmm0
-; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1]
+; SSE2-NEXT: psllw $5, %xmm1
+; SSE2-NEXT: pxor %xmm2, %xmm2
+; SSE2-NEXT: pxor %xmm3, %xmm3
+; SSE2-NEXT: pcmpgtb %xmm1, %xmm3
+; SSE2-NEXT: movdqa %xmm3, %xmm4
+; SSE2-NEXT: pandn %xmm0, %xmm4
+; SSE2-NEXT: psllw $4, %xmm0
+; SSE2-NEXT: pand %xmm3, %xmm0
+; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
+; SSE2-NEXT: por %xmm4, %xmm0
+; SSE2-NEXT: paddb %xmm1, %xmm1
+; SSE2-NEXT: pxor %xmm3, %xmm3
+; SSE2-NEXT: pcmpgtb %xmm1, %xmm3
+; SSE2-NEXT: movdqa %xmm3, %xmm4
+; SSE2-NEXT: pandn %xmm0, %xmm4
+; SSE2-NEXT: psllw $2, %xmm0
+; SSE2-NEXT: pand %xmm3, %xmm0
+; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
+; SSE2-NEXT: por %xmm4, %xmm0
+; SSE2-NEXT: paddb %xmm1, %xmm1
+; SSE2-NEXT: pcmpgtb %xmm1, %xmm2
+; SSE2-NEXT: movdqa %xmm2, %xmm1
+; SSE2-NEXT: pandn %xmm0, %xmm1
+; SSE2-NEXT: paddb %xmm0, %xmm0
+; SSE2-NEXT: pand %xmm2, %xmm0
+; SSE2-NEXT: por %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: var_shift_v2i8:
; SSE41: # %bb.0:
-; SSE41-NEXT: pand {{.*}}(%rip), %xmm1
; SSE41-NEXT: movdqa %xmm0, %xmm2
-; SSE41-NEXT: psllq %xmm1, %xmm2
-; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
-; SSE41-NEXT: psllq %xmm1, %xmm0
-; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm2[0,1,2,3],xmm0[4,5,6,7]
+; SSE41-NEXT: psllw $5, %xmm1
+; SSE41-NEXT: movdqa %xmm0, %xmm3
+; SSE41-NEXT: psllw $4, %xmm3
+; SSE41-NEXT: pand {{.*}}(%rip), %xmm3
+; SSE41-NEXT: movdqa %xmm1, %xmm0
+; SSE41-NEXT: pblendvb %xmm0, %xmm3, %xmm2
+; SSE41-NEXT: movdqa %xmm2, %xmm3
+; SSE41-NEXT: psllw $2, %xmm3
+; SSE41-NEXT: pand {{.*}}(%rip), %xmm3
+; SSE41-NEXT: paddb %xmm1, %xmm1
+; SSE41-NEXT: movdqa %xmm1, %xmm0
+; SSE41-NEXT: pblendvb %xmm0, %xmm3, %xmm2
+; SSE41-NEXT: movdqa %xmm2, %xmm3
+; SSE41-NEXT: paddb %xmm2, %xmm3
+; SSE41-NEXT: paddb %xmm1, %xmm1
+; SSE41-NEXT: movdqa %xmm1, %xmm0
+; SSE41-NEXT: pblendvb %xmm0, %xmm3, %xmm2
+; SSE41-NEXT: movdqa %xmm2, %xmm0
; SSE41-NEXT: retq
;
-; AVX1-LABEL: var_shift_v2i8:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
-; AVX1-NEXT: vpsllq %xmm1, %xmm0, %xmm2
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
-; AVX1-NEXT: vpsllq %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1,2,3],xmm0[4,5,6,7]
-; AVX1-NEXT: retq
+; AVX-LABEL: var_shift_v2i8:
+; AVX: # %bb.0:
+; AVX-NEXT: vpsllw $5, %xmm1, %xmm1
+; AVX-NEXT: vpsllw $4, %xmm0, %xmm2
+; AVX-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
+; AVX-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
+; AVX-NEXT: vpsllw $2, %xmm0, %xmm2
+; AVX-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
+; AVX-NEXT: vpaddb %xmm1, %xmm1, %xmm1
+; AVX-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
+; AVX-NEXT: vpaddb %xmm0, %xmm0, %xmm2
+; AVX-NEXT: vpaddb %xmm1, %xmm1, %xmm1
+; AVX-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
+; AVX-NEXT: retq
;
-; AVX2-LABEL: var_shift_v2i8:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
-; AVX2-NEXT: vpsllvq %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: retq
+; XOP-LABEL: var_shift_v2i8:
+; XOP: # %bb.0:
+; XOP-NEXT: vpshlb %xmm1, %xmm0, %xmm0
+; XOP-NEXT: retq
;
-; XOPAVX1-LABEL: var_shift_v2i8:
-; XOPAVX1: # %bb.0:
-; XOPAVX1-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
-; XOPAVX1-NEXT: vpshlq %xmm1, %xmm0, %xmm0
-; XOPAVX1-NEXT: retq
+; AVX512DQ-LABEL: var_shift_v2i8:
+; AVX512DQ: # %bb.0:
+; AVX512DQ-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
+; AVX512DQ-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
+; AVX512DQ-NEXT: vpsllvd %zmm1, %zmm0, %zmm0
+; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512DQ-NEXT: vzeroupper
+; AVX512DQ-NEXT: retq
;
-; XOPAVX2-LABEL: var_shift_v2i8:
-; XOPAVX2: # %bb.0:
-; XOPAVX2-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
-; XOPAVX2-NEXT: vpsllvq %xmm1, %xmm0, %xmm0
-; XOPAVX2-NEXT: retq
+; AVX512BW-LABEL: var_shift_v2i8:
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
+; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; AVX512BW-NEXT: vpsllvw %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
+; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX512BW-NEXT: vzeroupper
+; AVX512BW-NEXT: retq
;
-; AVX512-LABEL: var_shift_v2i8:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
-; AVX512-NEXT: vpsllvq %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: retq
+; AVX512DQVL-LABEL: var_shift_v2i8:
+; AVX512DQVL: # %bb.0:
+; AVX512DQVL-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
+; AVX512DQVL-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
+; AVX512DQVL-NEXT: vpsllvd %zmm1, %zmm0, %zmm0
+; AVX512DQVL-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512DQVL-NEXT: vzeroupper
+; AVX512DQVL-NEXT: retq
;
-; AVX512VL-LABEL: var_shift_v2i8:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
-; AVX512VL-NEXT: vpsllvq %xmm1, %xmm0, %xmm0
-; AVX512VL-NEXT: retq
+; AVX512BWVL-LABEL: var_shift_v2i8:
+; AVX512BWVL: # %bb.0:
+; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
+; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; AVX512BWVL-NEXT: vpsllvw %ymm1, %ymm0, %ymm0
+; AVX512BWVL-NEXT: vpmovwb %ymm0, %xmm0
+; AVX512BWVL-NEXT: vzeroupper
+; AVX512BWVL-NEXT: retq
;
; X32-SSE-LABEL: var_shift_v2i8:
; X32-SSE: # %bb.0:
-; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm1
-; X32-SSE-NEXT: movdqa %xmm0, %xmm2
-; X32-SSE-NEXT: psllq %xmm1, %xmm2
-; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
-; X32-SSE-NEXT: psllq %xmm1, %xmm0
-; X32-SSE-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1]
+; X32-SSE-NEXT: psllw $5, %xmm1
+; X32-SSE-NEXT: pxor %xmm2, %xmm2
+; X32-SSE-NEXT: pxor %xmm3, %xmm3
+; X32-SSE-NEXT: pcmpgtb %xmm1, %xmm3
+; X32-SSE-NEXT: movdqa %xmm3, %xmm4
+; X32-SSE-NEXT: pandn %xmm0, %xmm4
+; X32-SSE-NEXT: psllw $4, %xmm0
+; X32-SSE-NEXT: pand %xmm3, %xmm0
+; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm0
+; X32-SSE-NEXT: por %xmm4, %xmm0
+; X32-SSE-NEXT: paddb %xmm1, %xmm1
+; X32-SSE-NEXT: pxor %xmm3, %xmm3
+; X32-SSE-NEXT: pcmpgtb %xmm1, %xmm3
+; X32-SSE-NEXT: movdqa %xmm3, %xmm4
+; X32-SSE-NEXT: pandn %xmm0, %xmm4
+; X32-SSE-NEXT: psllw $2, %xmm0
+; X32-SSE-NEXT: pand %xmm3, %xmm0
+; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm0
+; X32-SSE-NEXT: por %xmm4, %xmm0
+; X32-SSE-NEXT: paddb %xmm1, %xmm1
+; X32-SSE-NEXT: pcmpgtb %xmm1, %xmm2
+; X32-SSE-NEXT: movdqa %xmm2, %xmm1
+; X32-SSE-NEXT: pandn %xmm0, %xmm1
+; X32-SSE-NEXT: paddb %xmm0, %xmm0
+; X32-SSE-NEXT: pand %xmm2, %xmm0
+; X32-SSE-NEXT: por %xmm1, %xmm0
; X32-SSE-NEXT: retl
%shift = shl <2 x i8> %a, %b
ret <2 x i8> %shift
define <2 x i32> @splatvar_shift_v2i32(<2 x i32> %a, <2 x i32> %b) nounwind {
; SSE2-LABEL: splatvar_shift_v2i32:
; SSE2: # %bb.0:
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
-; SSE2-NEXT: pand {{.*}}(%rip), %xmm1
-; SSE2-NEXT: movdqa %xmm0, %xmm2
-; SSE2-NEXT: psllq %xmm1, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
-; SSE2-NEXT: psllq %xmm1, %xmm0
-; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1]
+; SSE2-NEXT: xorps %xmm2, %xmm2
+; SSE2-NEXT: movss {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3]
+; SSE2-NEXT: pslld %xmm2, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: splatvar_shift_v2i32:
; SSE41: # %bb.0:
-; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
-; SSE41-NEXT: pxor %xmm2, %xmm2
-; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
-; SSE41-NEXT: movdqa %xmm0, %xmm1
-; SSE41-NEXT: psllq %xmm2, %xmm1
-; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
-; SSE41-NEXT: psllq %xmm2, %xmm0
-; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
+; SSE41-NEXT: pmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
+; SSE41-NEXT: pslld %xmm1, %xmm0
; SSE41-NEXT: retq
;
-; AVX1-LABEL: splatvar_shift_v2i32:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
-; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
-; AVX1-NEXT: vpsllq %xmm1, %xmm0, %xmm2
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
-; AVX1-NEXT: vpsllq %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1,2,3],xmm0[4,5,6,7]
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: splatvar_shift_v2i32:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpbroadcastq %xmm1, %xmm1
-; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
-; AVX2-NEXT: vpsllvq %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: retq
-;
-; XOPAVX1-LABEL: splatvar_shift_v2i32:
-; XOPAVX1: # %bb.0:
-; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
-; XOPAVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; XOPAVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
-; XOPAVX1-NEXT: vpshlq %xmm1, %xmm0, %xmm0
-; XOPAVX1-NEXT: retq
+; AVX-LABEL: splatvar_shift_v2i32:
+; AVX: # %bb.0:
+; AVX-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
+; AVX-NEXT: vpslld %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retq
;
-; XOPAVX2-LABEL: splatvar_shift_v2i32:
-; XOPAVX2: # %bb.0:
-; XOPAVX2-NEXT: vpbroadcastq %xmm1, %xmm1
-; XOPAVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; XOPAVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
-; XOPAVX2-NEXT: vpsllvq %xmm1, %xmm0, %xmm0
-; XOPAVX2-NEXT: retq
+; XOP-LABEL: splatvar_shift_v2i32:
+; XOP: # %bb.0:
+; XOP-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
+; XOP-NEXT: vpslld %xmm1, %xmm0, %xmm0
+; XOP-NEXT: retq
;
; AVX512-LABEL: splatvar_shift_v2i32:
; AVX512: # %bb.0:
-; AVX512-NEXT: vpbroadcastq %xmm1, %xmm1
-; AVX512-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX512-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
-; AVX512-NEXT: vpsllvq %xmm1, %xmm0, %xmm0
+; AVX512-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
+; AVX512-NEXT: vpslld %xmm1, %xmm0, %xmm0
; AVX512-NEXT: retq
;
; AVX512VL-LABEL: splatvar_shift_v2i32:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpbroadcastq %xmm1, %xmm1
-; AVX512VL-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX512VL-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
-; AVX512VL-NEXT: vpsllvq %xmm1, %xmm0, %xmm0
+; AVX512VL-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
+; AVX512VL-NEXT: vpslld %xmm1, %xmm0, %xmm0
; AVX512VL-NEXT: retq
;
; X32-SSE-LABEL: splatvar_shift_v2i32:
; X32-SSE: # %bb.0:
-; X32-SSE-NEXT: movdqa {{.*#+}} xmm2 = [4294967295,0,4294967295,0]
-; X32-SSE-NEXT: pand %xmm1, %xmm2
-; X32-SSE-NEXT: movdqa %xmm0, %xmm3
-; X32-SSE-NEXT: psllq %xmm2, %xmm3
-; X32-SSE-NEXT: pxor %xmm2, %xmm2
+; X32-SSE-NEXT: xorps %xmm2, %xmm2
; X32-SSE-NEXT: movss {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3]
-; X32-SSE-NEXT: psllq %xmm2, %xmm0
-; X32-SSE-NEXT: movsd {{.*#+}} xmm0 = xmm3[0],xmm0[1]
+; X32-SSE-NEXT: pslld %xmm2, %xmm0
; X32-SSE-NEXT: retl
%splat = shufflevector <2 x i32> %b, <2 x i32> undef, <2 x i32> zeroinitializer
%shift = shl <2 x i32> %a, %splat
define <4 x i16> @splatvar_shift_v4i16(<4 x i16> %a, <4 x i16> %b) nounwind {
; SSE2-LABEL: splatvar_shift_v4i16:
; SSE2: # %bb.0:
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
-; SSE2-NEXT: pslld $23, %xmm1
-; SSE2-NEXT: paddd {{.*}}(%rip), %xmm1
-; SSE2-NEXT: cvttps2dq %xmm1, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
-; SSE2-NEXT: pmuludq %xmm1, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSE2-NEXT: pmuludq %xmm2, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE2-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1]
+; SSE2-NEXT: psrldq {{.*#+}} xmm1 = xmm1[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; SSE2-NEXT: psllw %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: splatvar_shift_v4i16:
; SSE41: # %bb.0:
-; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
-; SSE41-NEXT: pslld $23, %xmm1
-; SSE41-NEXT: paddd {{.*}}(%rip), %xmm1
-; SSE41-NEXT: cvttps2dq %xmm1, %xmm1
-; SSE41-NEXT: pmulld %xmm1, %xmm0
+; SSE41-NEXT: pmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
+; SSE41-NEXT: psllw %xmm1, %xmm0
; SSE41-NEXT: retq
;
-; AVX1-LABEL: splatvar_shift_v4i16:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
-; AVX1-NEXT: vpslld $23, %xmm1, %xmm1
-; AVX1-NEXT: vpaddd {{.*}}(%rip), %xmm1, %xmm1
-; AVX1-NEXT: vcvttps2dq %xmm1, %xmm1
-; AVX1-NEXT: vpmulld %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: splatvar_shift_v4i16:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpbroadcastd %xmm1, %xmm1
-; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4],xmm2[5],xmm1[6],xmm2[7]
-; AVX2-NEXT: vpsllvd %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: retq
-;
-; XOPAVX1-LABEL: splatvar_shift_v4i16:
-; XOPAVX1: # %bb.0:
-; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
-; XOPAVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; XOPAVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4],xmm2[5],xmm1[6],xmm2[7]
-; XOPAVX1-NEXT: vpshld %xmm1, %xmm0, %xmm0
-; XOPAVX1-NEXT: retq
+; AVX-LABEL: splatvar_shift_v4i16:
+; AVX: # %bb.0:
+; AVX-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
+; AVX-NEXT: vpsllw %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retq
;
-; XOPAVX2-LABEL: splatvar_shift_v4i16:
-; XOPAVX2: # %bb.0:
-; XOPAVX2-NEXT: vpbroadcastd %xmm1, %xmm1
-; XOPAVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; XOPAVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4],xmm2[5],xmm1[6],xmm2[7]
-; XOPAVX2-NEXT: vpsllvd %xmm1, %xmm0, %xmm0
-; XOPAVX2-NEXT: retq
+; XOP-LABEL: splatvar_shift_v4i16:
+; XOP: # %bb.0:
+; XOP-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
+; XOP-NEXT: vpsllw %xmm1, %xmm0, %xmm0
+; XOP-NEXT: retq
;
; AVX512-LABEL: splatvar_shift_v4i16:
; AVX512: # %bb.0:
-; AVX512-NEXT: vpbroadcastd %xmm1, %xmm1
-; AVX512-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX512-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4],xmm2[5],xmm1[6],xmm2[7]
-; AVX512-NEXT: vpsllvd %xmm1, %xmm0, %xmm0
+; AVX512-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
+; AVX512-NEXT: vpsllw %xmm1, %xmm0, %xmm0
; AVX512-NEXT: retq
;
; AVX512VL-LABEL: splatvar_shift_v4i16:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpbroadcastd %xmm1, %xmm1
-; AVX512VL-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX512VL-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4],xmm2[5],xmm1[6],xmm2[7]
-; AVX512VL-NEXT: vpsllvd %xmm1, %xmm0, %xmm0
+; AVX512VL-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
+; AVX512VL-NEXT: vpsllw %xmm1, %xmm0, %xmm0
; AVX512VL-NEXT: retq
;
; X32-SSE-LABEL: splatvar_shift_v4i16:
; X32-SSE: # %bb.0:
-; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
-; X32-SSE-NEXT: pslld $23, %xmm1
-; X32-SSE-NEXT: paddd {{\.LCPI.*}}, %xmm1
-; X32-SSE-NEXT: cvttps2dq %xmm1, %xmm1
-; X32-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
-; X32-SSE-NEXT: pmuludq %xmm1, %xmm0
-; X32-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; X32-SSE-NEXT: pmuludq %xmm2, %xmm1
-; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; X32-SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; X32-SSE-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1]
+; X32-SSE-NEXT: psrldq {{.*#+}} xmm1 = xmm1[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; X32-SSE-NEXT: psllw %xmm1, %xmm0
; X32-SSE-NEXT: retl
%splat = shufflevector <4 x i16> %b, <4 x i16> undef, <4 x i32> zeroinitializer
%shift = shl <4 x i16> %a, %splat
define <2 x i16> @splatvar_shift_v2i16(<2 x i16> %a, <2 x i16> %b) nounwind {
; SSE2-LABEL: splatvar_shift_v2i16:
; SSE2: # %bb.0:
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
-; SSE2-NEXT: pand {{.*}}(%rip), %xmm1
-; SSE2-NEXT: movdqa %xmm0, %xmm2
-; SSE2-NEXT: psllq %xmm1, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
-; SSE2-NEXT: psllq %xmm1, %xmm0
-; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1]
+; SSE2-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1]
+; SSE2-NEXT: psrldq {{.*#+}} xmm1 = xmm1[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; SSE2-NEXT: psllw %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: splatvar_shift_v2i16:
; SSE41: # %bb.0:
-; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
-; SSE41-NEXT: pxor %xmm2, %xmm2
-; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3],xmm1[4],xmm2[5,6,7]
-; SSE41-NEXT: movdqa %xmm0, %xmm1
-; SSE41-NEXT: psllq %xmm2, %xmm1
-; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
-; SSE41-NEXT: psllq %xmm2, %xmm0
-; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
+; SSE41-NEXT: pmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
+; SSE41-NEXT: psllw %xmm1, %xmm0
; SSE41-NEXT: retq
;
-; AVX1-LABEL: splatvar_shift_v2i16:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
-; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3],xmm1[4],xmm2[5,6,7]
-; AVX1-NEXT: vpsllq %xmm1, %xmm0, %xmm2
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
-; AVX1-NEXT: vpsllq %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1,2,3],xmm0[4,5,6,7]
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: splatvar_shift_v2i16:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpbroadcastq %xmm1, %xmm1
-; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3],xmm1[4],xmm2[5,6,7]
-; AVX2-NEXT: vpsllvq %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: retq
-;
-; XOPAVX1-LABEL: splatvar_shift_v2i16:
-; XOPAVX1: # %bb.0:
-; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
-; XOPAVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; XOPAVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3],xmm1[4],xmm2[5,6,7]
-; XOPAVX1-NEXT: vpshlq %xmm1, %xmm0, %xmm0
-; XOPAVX1-NEXT: retq
+; AVX-LABEL: splatvar_shift_v2i16:
+; AVX: # %bb.0:
+; AVX-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
+; AVX-NEXT: vpsllw %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retq
;
-; XOPAVX2-LABEL: splatvar_shift_v2i16:
-; XOPAVX2: # %bb.0:
-; XOPAVX2-NEXT: vpbroadcastq %xmm1, %xmm1
-; XOPAVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; XOPAVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3],xmm1[4],xmm2[5,6,7]
-; XOPAVX2-NEXT: vpsllvq %xmm1, %xmm0, %xmm0
-; XOPAVX2-NEXT: retq
+; XOP-LABEL: splatvar_shift_v2i16:
+; XOP: # %bb.0:
+; XOP-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
+; XOP-NEXT: vpsllw %xmm1, %xmm0, %xmm0
+; XOP-NEXT: retq
;
; AVX512-LABEL: splatvar_shift_v2i16:
; AVX512: # %bb.0:
-; AVX512-NEXT: vpbroadcastq %xmm1, %xmm1
-; AVX512-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX512-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3],xmm1[4],xmm2[5,6,7]
-; AVX512-NEXT: vpsllvq %xmm1, %xmm0, %xmm0
+; AVX512-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
+; AVX512-NEXT: vpsllw %xmm1, %xmm0, %xmm0
; AVX512-NEXT: retq
;
; AVX512VL-LABEL: splatvar_shift_v2i16:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpbroadcastq %xmm1, %xmm1
-; AVX512VL-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX512VL-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3],xmm1[4],xmm2[5,6,7]
-; AVX512VL-NEXT: vpsllvq %xmm1, %xmm0, %xmm0
+; AVX512VL-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
+; AVX512VL-NEXT: vpsllw %xmm1, %xmm0, %xmm0
; AVX512VL-NEXT: retq
;
; X32-SSE-LABEL: splatvar_shift_v2i16:
; X32-SSE: # %bb.0:
-; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
-; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm1
-; X32-SSE-NEXT: movdqa %xmm0, %xmm2
-; X32-SSE-NEXT: psllq %xmm1, %xmm2
-; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
-; X32-SSE-NEXT: psllq %xmm1, %xmm0
-; X32-SSE-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1]
+; X32-SSE-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1]
+; X32-SSE-NEXT: psrldq {{.*#+}} xmm1 = xmm1[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; X32-SSE-NEXT: psllw %xmm1, %xmm0
; X32-SSE-NEXT: retl
%splat = shufflevector <2 x i16> %b, <2 x i16> undef, <2 x i32> zeroinitializer
%shift = shl <2 x i16> %a, %splat
define <8 x i8> @splatvar_shift_v8i8(<8 x i8> %a, <8 x i8> %b) nounwind {
; SSE2-LABEL: splatvar_shift_v8i8:
; SSE2: # %bb.0:
-; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,0,2,3,4,5,6,7]
+; SSE2-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0]
+; SSE2-NEXT: psrldq {{.*#+}} xmm1 = xmm1[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; SSE2-NEXT: psllw %xmm1, %xmm0
+; SSE2-NEXT: pcmpeqd %xmm2, %xmm2
+; SSE2-NEXT: psllw %xmm1, %xmm2
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm2[0,0,2,3,4,5,6,7]
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
-; SSE2-NEXT: pxor %xmm2, %xmm2
-; SSE2-NEXT: movdqa %xmm1, %xmm3
-; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15]
-; SSE2-NEXT: pslld $23, %xmm3
-; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [1065353216,1065353216,1065353216,1065353216]
-; SSE2-NEXT: paddd %xmm4, %xmm3
-; SSE2-NEXT: cvttps2dq %xmm3, %xmm3
-; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,2,2,3,4,5,6,7]
-; SSE2-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,6,6,7]
-; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
-; SSE2-NEXT: pslld $23, %xmm1
-; SSE2-NEXT: paddd %xmm4, %xmm1
-; SSE2-NEXT: cvttps2dq %xmm1, %xmm1
-; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
-; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7]
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0]
-; SSE2-NEXT: pmullw %xmm1, %xmm0
+; SSE2-NEXT: pand %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: splatvar_shift_v8i8:
; SSE41: # %bb.0:
-; SSE41-NEXT: pshufb {{.*#+}} xmm1 = xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero
-; SSE41-NEXT: pmovzxwd {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
-; SSE41-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
-; SSE41-NEXT: pslld $23, %xmm1
-; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [1065353216,1065353216,1065353216,1065353216]
-; SSE41-NEXT: paddd %xmm3, %xmm1
-; SSE41-NEXT: cvttps2dq %xmm1, %xmm1
-; SSE41-NEXT: pslld $23, %xmm2
-; SSE41-NEXT: paddd %xmm3, %xmm2
-; SSE41-NEXT: cvttps2dq %xmm2, %xmm2
-; SSE41-NEXT: packusdw %xmm1, %xmm2
-; SSE41-NEXT: pmullw %xmm2, %xmm0
+; SSE41-NEXT: pmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
+; SSE41-NEXT: psllw %xmm1, %xmm0
+; SSE41-NEXT: pcmpeqd %xmm2, %xmm2
+; SSE41-NEXT: psllw %xmm1, %xmm2
+; SSE41-NEXT: pxor %xmm1, %xmm1
+; SSE41-NEXT: pshufb %xmm1, %xmm2
+; SSE41-NEXT: pand %xmm2, %xmm0
; SSE41-NEXT: retq
;
; AVX1-LABEL: splatvar_shift_v8i8:
; AVX1: # %bb.0:
-; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero
-; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
-; AVX1-NEXT: vpslld $23, %xmm2, %xmm2
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [1065353216,1065353216,1065353216,1065353216]
-; AVX1-NEXT: vpaddd %xmm3, %xmm2, %xmm2
-; AVX1-NEXT: vcvttps2dq %xmm2, %xmm2
-; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
-; AVX1-NEXT: vpslld $23, %xmm1, %xmm1
-; AVX1-NEXT: vpaddd %xmm3, %xmm1, %xmm1
-; AVX1-NEXT: vcvttps2dq %xmm1, %xmm1
-; AVX1-NEXT: vpackusdw %xmm2, %xmm1, %xmm1
-; AVX1-NEXT: vpmullw %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
+; AVX1-NEXT: vpsllw %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
+; AVX1-NEXT: vpsllw %xmm1, %xmm2, %xmm1
+; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: splatvar_shift_v8i8:
; AVX2: # %bb.0:
-; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero
-; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
-; AVX2-NEXT: vpsllvd %ymm1, %ymm0, %ymm0
-; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
-; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
-; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
+; AVX2-NEXT: vpsllw %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
+; AVX2-NEXT: vpsllw %xmm1, %xmm2, %xmm1
+; AVX2-NEXT: vpbroadcastb %xmm1, %xmm1
+; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX2-NEXT: retq
;
-; XOP-LABEL: splatvar_shift_v8i8:
-; XOP: # %bb.0:
-; XOP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero
-; XOP-NEXT: vpshlw %xmm1, %xmm0, %xmm0
-; XOP-NEXT: retq
+; XOPAVX1-LABEL: splatvar_shift_v8i8:
+; XOPAVX1: # %bb.0:
+; XOPAVX1-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; XOPAVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7]
+; XOPAVX1-NEXT: vpshlb %xmm1, %xmm0, %xmm0
+; XOPAVX1-NEXT: retq
+;
+; XOPAVX2-LABEL: splatvar_shift_v8i8:
+; XOPAVX2: # %bb.0:
+; XOPAVX2-NEXT: vpbroadcastb %xmm1, %xmm1
+; XOPAVX2-NEXT: vpshlb %xmm1, %xmm0, %xmm0
+; XOPAVX2-NEXT: retq
;
; AVX512DQ-LABEL: splatvar_shift_v8i8:
; AVX512DQ: # %bb.0:
-; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero
-; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
-; AVX512DQ-NEXT: vpsllvd %ymm1, %ymm0, %ymm0
-; AVX512DQ-NEXT: vpmovdw %zmm0, %ymm0
-; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX512DQ-NEXT: vpbroadcastb %xmm1, %xmm1
+; AVX512DQ-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
+; AVX512DQ-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
+; AVX512DQ-NEXT: vpsllvd %zmm1, %zmm0, %zmm0
+; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
; AVX512BW-LABEL: splatvar_shift_v8i8:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
-; AVX512BW-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero
+; AVX512BW-NEXT: vpbroadcastb %xmm1, %xmm1
+; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
; AVX512BW-NEXT: vpsllvw %zmm1, %zmm0, %zmm0
-; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
+; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
+; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512DQVL-LABEL: splatvar_shift_v8i8:
; AVX512DQVL: # %bb.0:
-; AVX512DQVL-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero
-; AVX512DQVL-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; AVX512DQVL-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
-; AVX512DQVL-NEXT: vpsllvd %ymm1, %ymm0, %ymm0
-; AVX512DQVL-NEXT: vpmovdw %ymm0, %xmm0
+; AVX512DQVL-NEXT: vpbroadcastb %xmm1, %xmm1
+; AVX512DQVL-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
+; AVX512DQVL-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
+; AVX512DQVL-NEXT: vpsllvd %zmm1, %zmm0, %zmm0
+; AVX512DQVL-NEXT: vpmovdb %zmm0, %xmm0
; AVX512DQVL-NEXT: vzeroupper
; AVX512DQVL-NEXT: retq
;
; AVX512BWVL-LABEL: splatvar_shift_v8i8:
; AVX512BWVL: # %bb.0:
-; AVX512BWVL-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero
-; AVX512BWVL-NEXT: vpsllvw %xmm1, %xmm0, %xmm0
+; AVX512BWVL-NEXT: vpbroadcastb %xmm1, %xmm1
+; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
+; AVX512BWVL-NEXT: vpsllvw %ymm1, %ymm0, %ymm0
+; AVX512BWVL-NEXT: vpmovwb %ymm0, %xmm0
+; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
;
; X32-SSE-LABEL: splatvar_shift_v8i8:
; X32-SSE: # %bb.0:
-; X32-SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,0,2,3,4,5,6,7]
+; X32-SSE-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0]
+; X32-SSE-NEXT: psrldq {{.*#+}} xmm1 = xmm1[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; X32-SSE-NEXT: psllw %xmm1, %xmm0
+; X32-SSE-NEXT: pcmpeqd %xmm2, %xmm2
+; X32-SSE-NEXT: psllw %xmm1, %xmm2
+; X32-SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; X32-SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm2[0,0,2,3,4,5,6,7]
; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
-; X32-SSE-NEXT: pxor %xmm2, %xmm2
-; X32-SSE-NEXT: movdqa %xmm1, %xmm3
-; X32-SSE-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15]
-; X32-SSE-NEXT: pslld $23, %xmm3
-; X32-SSE-NEXT: movdqa {{.*#+}} xmm4 = [1065353216,1065353216,1065353216,1065353216]
-; X32-SSE-NEXT: paddd %xmm4, %xmm3
-; X32-SSE-NEXT: cvttps2dq %xmm3, %xmm3
-; X32-SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,2,2,3,4,5,6,7]
-; X32-SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,6,6,7]
-; X32-SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
-; X32-SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
-; X32-SSE-NEXT: pslld $23, %xmm1
-; X32-SSE-NEXT: paddd %xmm4, %xmm1
-; X32-SSE-NEXT: cvttps2dq %xmm1, %xmm1
-; X32-SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
-; X32-SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7]
-; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; X32-SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0]
-; X32-SSE-NEXT: pmullw %xmm1, %xmm0
+; X32-SSE-NEXT: pand %xmm1, %xmm0
; X32-SSE-NEXT: retl
%splat = shufflevector <8 x i8> %b, <8 x i8> undef, <8 x i32> zeroinitializer
%shift = shl <8 x i8> %a, %splat
define <4 x i8> @splatvar_shift_v4i8(<4 x i8> %a, <4 x i8> %b) nounwind {
; SSE2-LABEL: splatvar_shift_v4i8:
; SSE2: # %bb.0:
+; SSE2-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0]
+; SSE2-NEXT: psrldq {{.*#+}} xmm1 = xmm1[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; SSE2-NEXT: psllw %xmm1, %xmm0
+; SSE2-NEXT: pcmpeqd %xmm2, %xmm2
+; SSE2-NEXT: psllw %xmm1, %xmm2
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm2[0,0,2,3,4,5,6,7]
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
-; SSE2-NEXT: pand {{.*}}(%rip), %xmm1
-; SSE2-NEXT: pslld $23, %xmm1
-; SSE2-NEXT: paddd {{.*}}(%rip), %xmm1
-; SSE2-NEXT: cvttps2dq %xmm1, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
-; SSE2-NEXT: pmuludq %xmm1, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSE2-NEXT: pmuludq %xmm2, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE2-NEXT: pand %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: splatvar_shift_v4i8:
; SSE41: # %bb.0:
-; SSE41-NEXT: pshufb {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[0],zero,zero,zero,xmm1[0],zero,zero,zero,xmm1[0],zero,zero,zero
-; SSE41-NEXT: pslld $23, %xmm1
-; SSE41-NEXT: paddd {{.*}}(%rip), %xmm1
-; SSE41-NEXT: cvttps2dq %xmm1, %xmm1
-; SSE41-NEXT: pmulld %xmm1, %xmm0
+; SSE41-NEXT: pmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
+; SSE41-NEXT: psllw %xmm1, %xmm0
+; SSE41-NEXT: pcmpeqd %xmm2, %xmm2
+; SSE41-NEXT: psllw %xmm1, %xmm2
+; SSE41-NEXT: pxor %xmm1, %xmm1
+; SSE41-NEXT: pshufb %xmm1, %xmm2
+; SSE41-NEXT: pand %xmm2, %xmm0
; SSE41-NEXT: retq
;
; AVX1-LABEL: splatvar_shift_v4i8:
; AVX1: # %bb.0:
-; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[0],zero,zero,zero,xmm1[0],zero,zero,zero,xmm1[0],zero,zero,zero
-; AVX1-NEXT: vpslld $23, %xmm1, %xmm1
-; AVX1-NEXT: vpaddd {{.*}}(%rip), %xmm1, %xmm1
-; AVX1-NEXT: vcvttps2dq %xmm1, %xmm1
-; AVX1-NEXT: vpmulld %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
+; AVX1-NEXT: vpsllw %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
+; AVX1-NEXT: vpsllw %xmm1, %xmm2, %xmm1
+; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: splatvar_shift_v4i8:
; AVX2: # %bb.0:
-; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[0],zero,zero,zero,xmm1[0],zero,zero,zero,xmm1[0],zero,zero,zero
-; AVX2-NEXT: vpsllvd %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
+; AVX2-NEXT: vpsllw %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
+; AVX2-NEXT: vpsllw %xmm1, %xmm2, %xmm1
+; AVX2-NEXT: vpbroadcastb %xmm1, %xmm1
+; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX2-NEXT: retq
;
; XOPAVX1-LABEL: splatvar_shift_v4i8:
; XOPAVX1: # %bb.0:
-; XOPAVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[0],zero,zero,zero,xmm1[0],zero,zero,zero,xmm1[0],zero,zero,zero
-; XOPAVX1-NEXT: vpshld %xmm1, %xmm0, %xmm0
+; XOPAVX1-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; XOPAVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,2,3,4,5,6,7]
+; XOPAVX1-NEXT: vpshlb %xmm1, %xmm0, %xmm0
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: splatvar_shift_v4i8:
; XOPAVX2: # %bb.0:
-; XOPAVX2-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[0],zero,zero,zero,xmm1[0],zero,zero,zero,xmm1[0],zero,zero,zero
-; XOPAVX2-NEXT: vpsllvd %xmm1, %xmm0, %xmm0
+; XOPAVX2-NEXT: vpbroadcastb %xmm1, %xmm1
+; XOPAVX2-NEXT: vpshlb %xmm1, %xmm0, %xmm0
; XOPAVX2-NEXT: retq
;
-; AVX512-LABEL: splatvar_shift_v4i8:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[0],zero,zero,zero,xmm1[0],zero,zero,zero,xmm1[0],zero,zero,zero
-; AVX512-NEXT: vpsllvd %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: retq
+; AVX512DQ-LABEL: splatvar_shift_v4i8:
+; AVX512DQ: # %bb.0:
+; AVX512DQ-NEXT: vpbroadcastb %xmm1, %xmm1
+; AVX512DQ-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
+; AVX512DQ-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
+; AVX512DQ-NEXT: vpsllvd %zmm1, %zmm0, %zmm0
+; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512DQ-NEXT: vzeroupper
+; AVX512DQ-NEXT: retq
;
-; AVX512VL-LABEL: splatvar_shift_v4i8:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[0],zero,zero,zero,xmm1[0],zero,zero,zero,xmm1[0],zero,zero,zero
-; AVX512VL-NEXT: vpsllvd %xmm1, %xmm0, %xmm0
-; AVX512VL-NEXT: retq
+; AVX512BW-LABEL: splatvar_shift_v4i8:
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: vpbroadcastb %xmm1, %xmm1
+; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
+; AVX512BW-NEXT: vpsllvw %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
+; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX512BW-NEXT: vzeroupper
+; AVX512BW-NEXT: retq
+;
+; AVX512DQVL-LABEL: splatvar_shift_v4i8:
+; AVX512DQVL: # %bb.0:
+; AVX512DQVL-NEXT: vpbroadcastb %xmm1, %xmm1
+; AVX512DQVL-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
+; AVX512DQVL-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
+; AVX512DQVL-NEXT: vpsllvd %zmm1, %zmm0, %zmm0
+; AVX512DQVL-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512DQVL-NEXT: vzeroupper
+; AVX512DQVL-NEXT: retq
+;
+; AVX512BWVL-LABEL: splatvar_shift_v4i8:
+; AVX512BWVL: # %bb.0:
+; AVX512BWVL-NEXT: vpbroadcastb %xmm1, %xmm1
+; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
+; AVX512BWVL-NEXT: vpsllvw %ymm1, %ymm0, %ymm0
+; AVX512BWVL-NEXT: vpmovwb %ymm0, %xmm0
+; AVX512BWVL-NEXT: vzeroupper
+; AVX512BWVL-NEXT: retq
;
; X32-SSE-LABEL: splatvar_shift_v4i8:
; X32-SSE: # %bb.0:
+; X32-SSE-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0]
+; X32-SSE-NEXT: psrldq {{.*#+}} xmm1 = xmm1[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; X32-SSE-NEXT: psllw %xmm1, %xmm0
+; X32-SSE-NEXT: pcmpeqd %xmm2, %xmm2
+; X32-SSE-NEXT: psllw %xmm1, %xmm2
+; X32-SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; X32-SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm2[0,0,2,3,4,5,6,7]
; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
-; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm1
-; X32-SSE-NEXT: pslld $23, %xmm1
-; X32-SSE-NEXT: paddd {{\.LCPI.*}}, %xmm1
-; X32-SSE-NEXT: cvttps2dq %xmm1, %xmm1
-; X32-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
-; X32-SSE-NEXT: pmuludq %xmm1, %xmm0
-; X32-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; X32-SSE-NEXT: pmuludq %xmm2, %xmm1
-; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; X32-SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; X32-SSE-NEXT: pand %xmm1, %xmm0
; X32-SSE-NEXT: retl
%splat = shufflevector <4 x i8> %b, <4 x i8> undef, <4 x i32> zeroinitializer
%shift = shl <4 x i8> %a, %splat
define <2 x i8> @splatvar_shift_v2i8(<2 x i8> %a, <2 x i8> %b) nounwind {
; SSE2-LABEL: splatvar_shift_v2i8:
; SSE2: # %bb.0:
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
-; SSE2-NEXT: pand {{.*}}(%rip), %xmm1
-; SSE2-NEXT: movdqa %xmm0, %xmm2
-; SSE2-NEXT: psllq %xmm1, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
-; SSE2-NEXT: psllq %xmm1, %xmm0
-; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1]
+; SSE2-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0]
+; SSE2-NEXT: psrldq {{.*#+}} xmm1 = xmm1[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; SSE2-NEXT: psllw %xmm1, %xmm0
+; SSE2-NEXT: pcmpeqd %xmm2, %xmm2
+; SSE2-NEXT: psllw %xmm1, %xmm2
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm2[0,0,2,3,4,5,6,7]
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
+; SSE2-NEXT: pand %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: splatvar_shift_v2i8:
; SSE41: # %bb.0:
-; SSE41-NEXT: pshufb {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[0],zero,zero,zero,zero,zero,zero,zero
-; SSE41-NEXT: movdqa %xmm0, %xmm2
-; SSE41-NEXT: psllq %xmm1, %xmm2
-; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
-; SSE41-NEXT: psllq %xmm1, %xmm0
-; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm2[0,1,2,3],xmm0[4,5,6,7]
+; SSE41-NEXT: pmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
+; SSE41-NEXT: psllw %xmm1, %xmm0
+; SSE41-NEXT: pcmpeqd %xmm2, %xmm2
+; SSE41-NEXT: psllw %xmm1, %xmm2
+; SSE41-NEXT: pxor %xmm1, %xmm1
+; SSE41-NEXT: pshufb %xmm1, %xmm2
+; SSE41-NEXT: pand %xmm2, %xmm0
; SSE41-NEXT: retq
;
; AVX1-LABEL: splatvar_shift_v2i8:
; AVX1: # %bb.0:
-; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[0],zero,zero,zero,zero,zero,zero,zero
-; AVX1-NEXT: vpsllq %xmm1, %xmm0, %xmm2
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
-; AVX1-NEXT: vpsllq %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1,2,3],xmm0[4,5,6,7]
+; AVX1-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
+; AVX1-NEXT: vpsllw %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
+; AVX1-NEXT: vpsllw %xmm1, %xmm2, %xmm1
+; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: splatvar_shift_v2i8:
; AVX2: # %bb.0:
-; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[0],zero,zero,zero,zero,zero,zero,zero
-; AVX2-NEXT: vpsllvq %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
+; AVX2-NEXT: vpsllw %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
+; AVX2-NEXT: vpsllw %xmm1, %xmm2, %xmm1
+; AVX2-NEXT: vpbroadcastb %xmm1, %xmm1
+; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX2-NEXT: retq
;
-; XOPAVX1-LABEL: splatvar_shift_v2i8:
-; XOPAVX1: # %bb.0:
-; XOPAVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[0],zero,zero,zero,zero,zero,zero,zero
-; XOPAVX1-NEXT: vpshlq %xmm1, %xmm0, %xmm0
-; XOPAVX1-NEXT: retq
+; XOP-LABEL: splatvar_shift_v2i8:
+; XOP: # %bb.0:
+; XOP-NEXT: insertq {{.*#+}} xmm1 = xmm1[0,0,2,3,4,5,6,7,u,u,u,u,u,u,u,u]
+; XOP-NEXT: vpshlb %xmm1, %xmm0, %xmm0
+; XOP-NEXT: retq
;
-; XOPAVX2-LABEL: splatvar_shift_v2i8:
-; XOPAVX2: # %bb.0:
-; XOPAVX2-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[0],zero,zero,zero,zero,zero,zero,zero
-; XOPAVX2-NEXT: vpsllvq %xmm1, %xmm0, %xmm0
-; XOPAVX2-NEXT: retq
+; AVX512DQ-LABEL: splatvar_shift_v2i8:
+; AVX512DQ: # %bb.0:
+; AVX512DQ-NEXT: vpbroadcastb %xmm1, %xmm1
+; AVX512DQ-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
+; AVX512DQ-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
+; AVX512DQ-NEXT: vpsllvd %zmm1, %zmm0, %zmm0
+; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512DQ-NEXT: vzeroupper
+; AVX512DQ-NEXT: retq
;
-; AVX512-LABEL: splatvar_shift_v2i8:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[0],zero,zero,zero,zero,zero,zero,zero
-; AVX512-NEXT: vpsllvq %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: retq
+; AVX512BW-LABEL: splatvar_shift_v2i8:
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: vpbroadcastb %xmm1, %xmm1
+; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
+; AVX512BW-NEXT: vpsllvw %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
+; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX512BW-NEXT: vzeroupper
+; AVX512BW-NEXT: retq
;
-; AVX512VL-LABEL: splatvar_shift_v2i8:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[0],zero,zero,zero,zero,zero,zero,zero
-; AVX512VL-NEXT: vpsllvq %xmm1, %xmm0, %xmm0
-; AVX512VL-NEXT: retq
+; AVX512DQVL-LABEL: splatvar_shift_v2i8:
+; AVX512DQVL: # %bb.0:
+; AVX512DQVL-NEXT: vpbroadcastb %xmm1, %xmm1
+; AVX512DQVL-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
+; AVX512DQVL-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
+; AVX512DQVL-NEXT: vpsllvd %zmm1, %zmm0, %zmm0
+; AVX512DQVL-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512DQVL-NEXT: vzeroupper
+; AVX512DQVL-NEXT: retq
+;
+; AVX512BWVL-LABEL: splatvar_shift_v2i8:
+; AVX512BWVL: # %bb.0:
+; AVX512BWVL-NEXT: vpbroadcastb %xmm1, %xmm1
+; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
+; AVX512BWVL-NEXT: vpsllvw %ymm1, %ymm0, %ymm0
+; AVX512BWVL-NEXT: vpmovwb %ymm0, %xmm0
+; AVX512BWVL-NEXT: vzeroupper
+; AVX512BWVL-NEXT: retq
;
; X32-SSE-LABEL: splatvar_shift_v2i8:
; X32-SSE: # %bb.0:
-; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
-; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm1
-; X32-SSE-NEXT: movdqa %xmm0, %xmm2
-; X32-SSE-NEXT: psllq %xmm1, %xmm2
-; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
-; X32-SSE-NEXT: psllq %xmm1, %xmm0
-; X32-SSE-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1]
+; X32-SSE-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0]
+; X32-SSE-NEXT: psrldq {{.*#+}} xmm1 = xmm1[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; X32-SSE-NEXT: psllw %xmm1, %xmm0
+; X32-SSE-NEXT: pcmpeqd %xmm2, %xmm2
+; X32-SSE-NEXT: psllw %xmm1, %xmm2
+; X32-SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; X32-SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm2[0,0,2,3,4,5,6,7]
+; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
+; X32-SSE-NEXT: pand %xmm1, %xmm0
; X32-SSE-NEXT: retl
%splat = shufflevector <2 x i8> %b, <2 x i8> undef, <2 x i32> zeroinitializer
%shift = shl <2 x i8> %a, %splat
; SSE2-LABEL: constant_shift_v2i32:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa %xmm0, %xmm1
-; SSE2-NEXT: psllq $4, %xmm1
-; SSE2-NEXT: psllq $5, %xmm0
-; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; SSE2-NEXT: pslld $4, %xmm1
+; SSE2-NEXT: pslld $5, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
+; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; SSE2-NEXT: movdqa %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: constant_shift_v2i32:
; SSE41: # %bb.0:
; SSE41-NEXT: movdqa %xmm0, %xmm1
-; SSE41-NEXT: psllq $5, %xmm1
-; SSE41-NEXT: psllq $4, %xmm0
-; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
+; SSE41-NEXT: pslld $5, %xmm1
+; SSE41-NEXT: pslld $4, %xmm0
+; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7]
; SSE41-NEXT: retq
;
; AVX1-LABEL: constant_shift_v2i32:
; AVX1: # %bb.0:
-; AVX1-NEXT: vpsllq $5, %xmm0, %xmm1
-; AVX1-NEXT: vpsllq $4, %xmm0, %xmm0
-; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
+; AVX1-NEXT: vpslld $5, %xmm0, %xmm1
+; AVX1-NEXT: vpslld $4, %xmm0, %xmm0
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7]
; AVX1-NEXT: retq
;
; AVX2-LABEL: constant_shift_v2i32:
; AVX2: # %bb.0:
-; AVX2-NEXT: vpsllvq {{.*}}(%rip), %xmm0, %xmm0
+; AVX2-NEXT: vpsllvd {{.*}}(%rip), %xmm0, %xmm0
; AVX2-NEXT: retq
;
; XOPAVX1-LABEL: constant_shift_v2i32:
; XOPAVX1: # %bb.0:
-; XOPAVX1-NEXT: vpshlq {{.*}}(%rip), %xmm0, %xmm0
+; XOPAVX1-NEXT: vpshld {{.*}}(%rip), %xmm0, %xmm0
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: constant_shift_v2i32:
; XOPAVX2: # %bb.0:
-; XOPAVX2-NEXT: vpsllvq {{.*}}(%rip), %xmm0, %xmm0
+; XOPAVX2-NEXT: vpsllvd {{.*}}(%rip), %xmm0, %xmm0
; XOPAVX2-NEXT: retq
;
; AVX512-LABEL: constant_shift_v2i32:
; AVX512: # %bb.0:
-; AVX512-NEXT: vpsllvq {{.*}}(%rip), %xmm0, %xmm0
+; AVX512-NEXT: vpsllvd {{.*}}(%rip), %xmm0, %xmm0
; AVX512-NEXT: retq
;
; AVX512VL-LABEL: constant_shift_v2i32:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpsllvq {{.*}}(%rip), %xmm0, %xmm0
+; AVX512VL-NEXT: vpsllvd {{.*}}(%rip), %xmm0, %xmm0
; AVX512VL-NEXT: retq
;
; X32-SSE-LABEL: constant_shift_v2i32:
; X32-SSE: # %bb.0:
; X32-SSE-NEXT: movdqa %xmm0, %xmm1
-; X32-SSE-NEXT: psllq $4, %xmm1
-; X32-SSE-NEXT: psllq $5, %xmm0
-; X32-SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; X32-SSE-NEXT: pslld $4, %xmm1
+; X32-SSE-NEXT: pslld $5, %xmm0
+; X32-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
+; X32-SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; X32-SSE-NEXT: movdqa %xmm1, %xmm0
; X32-SSE-NEXT: retl
%shift = shl <2 x i32> %a, <i32 4, i32 5>
ret <2 x i32> %shift
}
define <4 x i16> @constant_shift_v4i16(<4 x i16> %a) nounwind {
-; SSE2-LABEL: constant_shift_v4i16:
-; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [1,2,4,8]
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
-; SSE2-NEXT: pmuludq %xmm1, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSE2-NEXT: pmuludq %xmm2, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; SSE2-NEXT: retq
-;
-; SSE41-LABEL: constant_shift_v4i16:
-; SSE41: # %bb.0:
-; SSE41-NEXT: pmulld {{.*}}(%rip), %xmm0
-; SSE41-NEXT: retq
+; SSE-LABEL: constant_shift_v4i16:
+; SSE: # %bb.0:
+; SSE-NEXT: pmullw {{.*}}(%rip), %xmm0
+; SSE-NEXT: retq
;
-; AVX1-LABEL: constant_shift_v4i16:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0
-; AVX1-NEXT: retq
+; AVX-LABEL: constant_shift_v4i16:
+; AVX: # %bb.0:
+; AVX-NEXT: vpmullw {{.*}}(%rip), %xmm0, %xmm0
+; AVX-NEXT: retq
;
-; AVX2-LABEL: constant_shift_v4i16:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpsllvd {{.*}}(%rip), %xmm0, %xmm0
-; AVX2-NEXT: retq
+; XOP-LABEL: constant_shift_v4i16:
+; XOP: # %bb.0:
+; XOP-NEXT: vpshlw {{.*}}(%rip), %xmm0, %xmm0
+; XOP-NEXT: retq
;
-; XOPAVX1-LABEL: constant_shift_v4i16:
-; XOPAVX1: # %bb.0:
-; XOPAVX1-NEXT: vpshld {{.*}}(%rip), %xmm0, %xmm0
-; XOPAVX1-NEXT: retq
+; AVX512DQ-LABEL: constant_shift_v4i16:
+; AVX512DQ: # %bb.0:
+; AVX512DQ-NEXT: vpmullw {{.*}}(%rip), %xmm0, %xmm0
+; AVX512DQ-NEXT: retq
;
-; XOPAVX2-LABEL: constant_shift_v4i16:
-; XOPAVX2: # %bb.0:
-; XOPAVX2-NEXT: vpsllvd {{.*}}(%rip), %xmm0, %xmm0
-; XOPAVX2-NEXT: retq
+; AVX512BW-LABEL: constant_shift_v4i16:
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
+; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm1 = <0,1,2,3,u,u,u,u>
+; AVX512BW-NEXT: vpsllvw %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
+; AVX512BW-NEXT: vzeroupper
+; AVX512BW-NEXT: retq
;
-; AVX512-LABEL: constant_shift_v4i16:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vpsllvd {{.*}}(%rip), %xmm0, %xmm0
-; AVX512-NEXT: retq
+; AVX512DQVL-LABEL: constant_shift_v4i16:
+; AVX512DQVL: # %bb.0:
+; AVX512DQVL-NEXT: vpmullw {{.*}}(%rip), %xmm0, %xmm0
+; AVX512DQVL-NEXT: retq
;
-; AVX512VL-LABEL: constant_shift_v4i16:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpsllvd {{.*}}(%rip), %xmm0, %xmm0
-; AVX512VL-NEXT: retq
+; AVX512BWVL-LABEL: constant_shift_v4i16:
+; AVX512BWVL: # %bb.0:
+; AVX512BWVL-NEXT: vpsllvw {{.*}}(%rip), %xmm0, %xmm0
+; AVX512BWVL-NEXT: retq
;
; X32-SSE-LABEL: constant_shift_v4i16:
; X32-SSE: # %bb.0:
-; X32-SSE-NEXT: movdqa {{.*#+}} xmm1 = [1,2,4,8]
-; X32-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
-; X32-SSE-NEXT: pmuludq %xmm1, %xmm0
-; X32-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; X32-SSE-NEXT: pmuludq %xmm2, %xmm1
-; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; X32-SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; X32-SSE-NEXT: pmullw {{\.LCPI.*}}, %xmm0
; X32-SSE-NEXT: retl
%shift = shl <4 x i16> %a, <i16 0, i16 1, i16 2, i16 3>
ret <4 x i16> %shift
define <2 x i16> @constant_shift_v2i16(<2 x i16> %a) nounwind {
; SSE2-LABEL: constant_shift_v2i16:
; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa %xmm0, %xmm1
-; SSE2-NEXT: psllq $2, %xmm1
-; SSE2-NEXT: psllq $3, %xmm0
-; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; SSE2-NEXT: pmullw {{.*}}(%rip), %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: constant_shift_v2i16:
; SSE41: # %bb.0:
; SSE41-NEXT: movdqa %xmm0, %xmm1
-; SSE41-NEXT: psllq $3, %xmm1
-; SSE41-NEXT: psllq $2, %xmm0
-; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
+; SSE41-NEXT: psllw $3, %xmm1
+; SSE41-NEXT: psllw $2, %xmm0
+; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3,4,5,6,7]
; SSE41-NEXT: retq
;
-; AVX1-LABEL: constant_shift_v2i16:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vpsllq $3, %xmm0, %xmm1
-; AVX1-NEXT: vpsllq $2, %xmm0, %xmm0
-; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
-; AVX1-NEXT: retq
+; AVX-LABEL: constant_shift_v2i16:
+; AVX: # %bb.0:
+; AVX-NEXT: vpsllw $3, %xmm0, %xmm1
+; AVX-NEXT: vpsllw $2, %xmm0, %xmm0
+; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3,4,5,6,7]
+; AVX-NEXT: retq
;
-; AVX2-LABEL: constant_shift_v2i16:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpsllvq {{.*}}(%rip), %xmm0, %xmm0
-; AVX2-NEXT: retq
+; XOP-LABEL: constant_shift_v2i16:
+; XOP: # %bb.0:
+; XOP-NEXT: vpshlw {{.*}}(%rip), %xmm0, %xmm0
+; XOP-NEXT: retq
;
-; XOPAVX1-LABEL: constant_shift_v2i16:
-; XOPAVX1: # %bb.0:
-; XOPAVX1-NEXT: vpshlq {{.*}}(%rip), %xmm0, %xmm0
-; XOPAVX1-NEXT: retq
+; AVX512DQ-LABEL: constant_shift_v2i16:
+; AVX512DQ: # %bb.0:
+; AVX512DQ-NEXT: vpsllw $3, %xmm0, %xmm1
+; AVX512DQ-NEXT: vpsllw $2, %xmm0, %xmm0
+; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3,4,5,6,7]
+; AVX512DQ-NEXT: retq
;
-; XOPAVX2-LABEL: constant_shift_v2i16:
-; XOPAVX2: # %bb.0:
-; XOPAVX2-NEXT: vpsllvq {{.*}}(%rip), %xmm0, %xmm0
-; XOPAVX2-NEXT: retq
+; AVX512BW-LABEL: constant_shift_v2i16:
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
+; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm1 = <2,3,u,u,u,u,u,u>
+; AVX512BW-NEXT: vpsllvw %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
+; AVX512BW-NEXT: vzeroupper
+; AVX512BW-NEXT: retq
;
-; AVX512-LABEL: constant_shift_v2i16:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vpsllvq {{.*}}(%rip), %xmm0, %xmm0
-; AVX512-NEXT: retq
+; AVX512DQVL-LABEL: constant_shift_v2i16:
+; AVX512DQVL: # %bb.0:
+; AVX512DQVL-NEXT: vpsllw $3, %xmm0, %xmm1
+; AVX512DQVL-NEXT: vpsllw $2, %xmm0, %xmm0
+; AVX512DQVL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3,4,5,6,7]
+; AVX512DQVL-NEXT: retq
;
-; AVX512VL-LABEL: constant_shift_v2i16:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpsllvq {{.*}}(%rip), %xmm0, %xmm0
-; AVX512VL-NEXT: retq
+; AVX512BWVL-LABEL: constant_shift_v2i16:
+; AVX512BWVL: # %bb.0:
+; AVX512BWVL-NEXT: vpsllvw {{.*}}(%rip), %xmm0, %xmm0
+; AVX512BWVL-NEXT: retq
;
; X32-SSE-LABEL: constant_shift_v2i16:
; X32-SSE: # %bb.0:
-; X32-SSE-NEXT: movdqa %xmm0, %xmm1
-; X32-SSE-NEXT: psllq $2, %xmm1
-; X32-SSE-NEXT: psllq $3, %xmm0
-; X32-SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; X32-SSE-NEXT: pmullw {{\.LCPI.*}}, %xmm0
; X32-SSE-NEXT: retl
%shift = shl <2 x i16> %a, <i16 2, i16 3>
ret <2 x i16> %shift
}
define <8 x i8> @constant_shift_v8i8(<8 x i8> %a) nounwind {
-; SSE-LABEL: constant_shift_v8i8:
-; SSE: # %bb.0:
-; SSE-NEXT: pmullw {{.*}}(%rip), %xmm0
-; SSE-NEXT: retq
+; SSE2-LABEL: constant_shift_v8i8:
+; SSE2: # %bb.0:
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE2-NEXT: pmullw {{.*}}(%rip), %xmm0
+; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
+; SSE2-NEXT: pxor %xmm1, %xmm1
+; SSE2-NEXT: packuswb %xmm1, %xmm0
+; SSE2-NEXT: retq
;
-; AVX-LABEL: constant_shift_v8i8:
-; AVX: # %bb.0:
-; AVX-NEXT: vpmullw {{.*}}(%rip), %xmm0, %xmm0
-; AVX-NEXT: retq
+; SSE41-LABEL: constant_shift_v8i8:
+; SSE41: # %bb.0:
+; SSE41-NEXT: pmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; SSE41-NEXT: pmullw {{.*}}(%rip), %xmm0
+; SSE41-NEXT: pand {{.*}}(%rip), %xmm0
+; SSE41-NEXT: pxor %xmm1, %xmm1
+; SSE41-NEXT: packuswb %xmm1, %xmm0
+; SSE41-NEXT: retq
+;
+; AVX1-LABEL: constant_shift_v8i8:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX1-NEXT: vpmullw {{.*}}(%rip), %xmm0, %xmm0
+; AVX1-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
+; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: constant_shift_v8i8:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; AVX2-NEXT: vpmullw {{.*}}(%rip), %ymm0, %ymm0
+; AVX2-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
;
; XOP-LABEL: constant_shift_v8i8:
; XOP: # %bb.0:
-; XOP-NEXT: vpshlw {{.*}}(%rip), %xmm0, %xmm0
+; XOP-NEXT: vpshlb {{.*}}(%rip), %xmm0, %xmm0
; XOP-NEXT: retq
;
; AVX512DQ-LABEL: constant_shift_v8i8:
; AVX512DQ: # %bb.0:
-; AVX512DQ-NEXT: vpmullw {{.*}}(%rip), %xmm0, %xmm0
+; AVX512DQ-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
+; AVX512DQ-NEXT: vpsllvd {{.*}}(%rip), %zmm0, %zmm0
+; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
; AVX512BW-LABEL: constant_shift_v8i8:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
-; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,2,3,4,5,6,7]
+; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,2,3,4,5,6,7,0,0,0,0,0,0,0,0]
+; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
; AVX512BW-NEXT: vpsllvw %zmm1, %zmm0, %zmm0
-; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
+; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
+; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512DQVL-LABEL: constant_shift_v8i8:
; AVX512DQVL: # %bb.0:
-; AVX512DQVL-NEXT: vpmullw {{.*}}(%rip), %xmm0, %xmm0
+; AVX512DQVL-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
+; AVX512DQVL-NEXT: vpsllvd {{.*}}(%rip), %zmm0, %zmm0
+; AVX512DQVL-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512DQVL-NEXT: vzeroupper
; AVX512DQVL-NEXT: retq
;
; AVX512BWVL-LABEL: constant_shift_v8i8:
; AVX512BWVL: # %bb.0:
-; AVX512BWVL-NEXT: vpsllvw {{.*}}(%rip), %xmm0, %xmm0
+; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; AVX512BWVL-NEXT: vpsllvw {{.*}}(%rip), %ymm0, %ymm0
+; AVX512BWVL-NEXT: vpmovwb %ymm0, %xmm0
+; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
;
; X32-SSE-LABEL: constant_shift_v8i8:
; X32-SSE: # %bb.0:
+; X32-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; X32-SSE-NEXT: pmullw {{\.LCPI.*}}, %xmm0
+; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm0
+; X32-SSE-NEXT: pxor %xmm1, %xmm1
+; X32-SSE-NEXT: packuswb %xmm1, %xmm0
; X32-SSE-NEXT: retl
%shift = shl <8 x i8> %a, <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7>
ret <8 x i8> %shift
define <4 x i8> @constant_shift_v4i8(<4 x i8> %a) nounwind {
; SSE2-LABEL: constant_shift_v4i8:
; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [1,2,4,8]
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
-; SSE2-NEXT: pmuludq %xmm1, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSE2-NEXT: pmuludq %xmm2, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE2-NEXT: pmullw {{.*}}(%rip), %xmm0
+; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
+; SSE2-NEXT: pxor %xmm1, %xmm1
+; SSE2-NEXT: packuswb %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: constant_shift_v4i8:
; SSE41: # %bb.0:
-; SSE41-NEXT: pmulld {{.*}}(%rip), %xmm0
+; SSE41-NEXT: pmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; SSE41-NEXT: pmullw {{.*}}(%rip), %xmm0
+; SSE41-NEXT: pand {{.*}}(%rip), %xmm0
+; SSE41-NEXT: pxor %xmm1, %xmm1
+; SSE41-NEXT: packuswb %xmm1, %xmm0
; SSE41-NEXT: retq
;
; AVX1-LABEL: constant_shift_v4i8:
; AVX1: # %bb.0:
-; AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0
+; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX1-NEXT: vpmullw {{.*}}(%rip), %xmm0, %xmm0
+; AVX1-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
+; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: constant_shift_v4i8:
; AVX2: # %bb.0:
-; AVX2-NEXT: vpsllvd {{.*}}(%rip), %xmm0, %xmm0
+; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; AVX2-NEXT: vpmullw {{.*}}(%rip), %ymm0, %ymm0
+; AVX2-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
-; XOPAVX1-LABEL: constant_shift_v4i8:
-; XOPAVX1: # %bb.0:
-; XOPAVX1-NEXT: vpshld {{.*}}(%rip), %xmm0, %xmm0
-; XOPAVX1-NEXT: retq
+; XOP-LABEL: constant_shift_v4i8:
+; XOP: # %bb.0:
+; XOP-NEXT: vpshlb {{.*}}(%rip), %xmm0, %xmm0
+; XOP-NEXT: retq
;
-; XOPAVX2-LABEL: constant_shift_v4i8:
-; XOPAVX2: # %bb.0:
-; XOPAVX2-NEXT: vpsllvd {{.*}}(%rip), %xmm0, %xmm0
-; XOPAVX2-NEXT: retq
+; AVX512DQ-LABEL: constant_shift_v4i8:
+; AVX512DQ: # %bb.0:
+; AVX512DQ-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
+; AVX512DQ-NEXT: vpsllvd {{.*}}(%rip), %zmm0, %zmm0
+; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512DQ-NEXT: vzeroupper
+; AVX512DQ-NEXT: retq
;
-; AVX512-LABEL: constant_shift_v4i8:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vpsllvd {{.*}}(%rip), %xmm0, %xmm0
-; AVX512-NEXT: retq
+; AVX512BW-LABEL: constant_shift_v4i8:
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,2,3,0,0,0,0,0,0,0,0,0,0,0,0]
+; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; AVX512BW-NEXT: vpsllvw %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
+; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX512BW-NEXT: vzeroupper
+; AVX512BW-NEXT: retq
;
-; AVX512VL-LABEL: constant_shift_v4i8:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpsllvd {{.*}}(%rip), %xmm0, %xmm0
-; AVX512VL-NEXT: retq
+; AVX512DQVL-LABEL: constant_shift_v4i8:
+; AVX512DQVL: # %bb.0:
+; AVX512DQVL-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
+; AVX512DQVL-NEXT: vpsllvd {{.*}}(%rip), %zmm0, %zmm0
+; AVX512DQVL-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512DQVL-NEXT: vzeroupper
+; AVX512DQVL-NEXT: retq
+;
+; AVX512BWVL-LABEL: constant_shift_v4i8:
+; AVX512BWVL: # %bb.0:
+; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; AVX512BWVL-NEXT: vpsllvw {{.*}}(%rip), %ymm0, %ymm0
+; AVX512BWVL-NEXT: vpmovwb %ymm0, %xmm0
+; AVX512BWVL-NEXT: vzeroupper
+; AVX512BWVL-NEXT: retq
;
; X32-SSE-LABEL: constant_shift_v4i8:
; X32-SSE: # %bb.0:
-; X32-SSE-NEXT: movdqa {{.*#+}} xmm1 = [1,2,4,8]
-; X32-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
-; X32-SSE-NEXT: pmuludq %xmm1, %xmm0
-; X32-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; X32-SSE-NEXT: pmuludq %xmm2, %xmm1
-; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; X32-SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; X32-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; X32-SSE-NEXT: pmullw {{\.LCPI.*}}, %xmm0
+; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm0
+; X32-SSE-NEXT: pxor %xmm1, %xmm1
+; X32-SSE-NEXT: packuswb %xmm1, %xmm0
; X32-SSE-NEXT: retl
%shift = shl <4 x i8> %a, <i8 0, i8 1, i8 2, i8 3>
ret <4 x i8> %shift
define <2 x i8> @constant_shift_v2i8(<2 x i8> %a) nounwind {
; SSE2-LABEL: constant_shift_v2i8:
; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa %xmm0, %xmm1
-; SSE2-NEXT: psllq $2, %xmm1
-; SSE2-NEXT: psllq $3, %xmm0
-; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE2-NEXT: pmullw {{.*}}(%rip), %xmm0
+; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
+; SSE2-NEXT: pxor %xmm1, %xmm1
+; SSE2-NEXT: packuswb %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: constant_shift_v2i8:
; SSE41: # %bb.0:
-; SSE41-NEXT: movdqa %xmm0, %xmm1
-; SSE41-NEXT: psllq $3, %xmm1
-; SSE41-NEXT: psllq $2, %xmm0
-; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
+; SSE41-NEXT: pmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; SSE41-NEXT: pmullw {{.*}}(%rip), %xmm0
+; SSE41-NEXT: pand {{.*}}(%rip), %xmm0
+; SSE41-NEXT: pxor %xmm1, %xmm1
+; SSE41-NEXT: packuswb %xmm1, %xmm0
; SSE41-NEXT: retq
;
; AVX1-LABEL: constant_shift_v2i8:
; AVX1: # %bb.0:
-; AVX1-NEXT: vpsllq $3, %xmm0, %xmm1
-; AVX1-NEXT: vpsllq $2, %xmm0, %xmm0
-; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
+; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX1-NEXT: vpmullw {{.*}}(%rip), %xmm0, %xmm0
+; AVX1-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
+; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: constant_shift_v2i8:
; AVX2: # %bb.0:
-; AVX2-NEXT: vpsllvq {{.*}}(%rip), %xmm0, %xmm0
+; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; AVX2-NEXT: vpmullw {{.*}}(%rip), %ymm0, %ymm0
+; AVX2-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
-; XOPAVX1-LABEL: constant_shift_v2i8:
-; XOPAVX1: # %bb.0:
-; XOPAVX1-NEXT: vpshlq {{.*}}(%rip), %xmm0, %xmm0
-; XOPAVX1-NEXT: retq
+; XOP-LABEL: constant_shift_v2i8:
+; XOP: # %bb.0:
+; XOP-NEXT: vpshlb {{.*}}(%rip), %xmm0, %xmm0
+; XOP-NEXT: retq
;
-; XOPAVX2-LABEL: constant_shift_v2i8:
-; XOPAVX2: # %bb.0:
-; XOPAVX2-NEXT: vpsllvq {{.*}}(%rip), %xmm0, %xmm0
-; XOPAVX2-NEXT: retq
+; AVX512DQ-LABEL: constant_shift_v2i8:
+; AVX512DQ: # %bb.0:
+; AVX512DQ-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
+; AVX512DQ-NEXT: vpsllvd {{.*}}(%rip), %zmm0, %zmm0
+; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512DQ-NEXT: vzeroupper
+; AVX512DQ-NEXT: retq
;
-; AVX512-LABEL: constant_shift_v2i8:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vpsllvq {{.*}}(%rip), %xmm0, %xmm0
-; AVX512-NEXT: retq
+; AVX512BW-LABEL: constant_shift_v2i8:
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm1 = [2,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
+; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; AVX512BW-NEXT: vpsllvw %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
+; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX512BW-NEXT: vzeroupper
+; AVX512BW-NEXT: retq
;
-; AVX512VL-LABEL: constant_shift_v2i8:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpsllvq {{.*}}(%rip), %xmm0, %xmm0
-; AVX512VL-NEXT: retq
+; AVX512DQVL-LABEL: constant_shift_v2i8:
+; AVX512DQVL: # %bb.0:
+; AVX512DQVL-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
+; AVX512DQVL-NEXT: vpsllvd {{.*}}(%rip), %zmm0, %zmm0
+; AVX512DQVL-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512DQVL-NEXT: vzeroupper
+; AVX512DQVL-NEXT: retq
+;
+; AVX512BWVL-LABEL: constant_shift_v2i8:
+; AVX512BWVL: # %bb.0:
+; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; AVX512BWVL-NEXT: vpsllvw {{.*}}(%rip), %ymm0, %ymm0
+; AVX512BWVL-NEXT: vpmovwb %ymm0, %xmm0
+; AVX512BWVL-NEXT: vzeroupper
+; AVX512BWVL-NEXT: retq
;
; X32-SSE-LABEL: constant_shift_v2i8:
; X32-SSE: # %bb.0:
-; X32-SSE-NEXT: movdqa %xmm0, %xmm1
-; X32-SSE-NEXT: psllq $2, %xmm1
-; X32-SSE-NEXT: psllq $3, %xmm0
-; X32-SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; X32-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; X32-SSE-NEXT: pmullw {{\.LCPI.*}}, %xmm0
+; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm0
+; X32-SSE-NEXT: pxor %xmm1, %xmm1
+; X32-SSE-NEXT: packuswb %xmm1, %xmm0
; X32-SSE-NEXT: retl
%shift = shl <2 x i8> %a, <i8 2, i8 3>
ret <2 x i8> %shift
define <2 x i32> @splatconstant_shift_v2i32(<2 x i32> %a) nounwind {
; SSE-LABEL: splatconstant_shift_v2i32:
; SSE: # %bb.0:
-; SSE-NEXT: psllq $5, %xmm0
+; SSE-NEXT: pslld $5, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: splatconstant_shift_v2i32:
; AVX: # %bb.0:
-; AVX-NEXT: vpsllq $5, %xmm0, %xmm0
+; AVX-NEXT: vpslld $5, %xmm0, %xmm0
; AVX-NEXT: retq
;
; XOP-LABEL: splatconstant_shift_v2i32:
; XOP: # %bb.0:
-; XOP-NEXT: vpsllq $5, %xmm0, %xmm0
+; XOP-NEXT: vpslld $5, %xmm0, %xmm0
; XOP-NEXT: retq
;
; AVX512-LABEL: splatconstant_shift_v2i32:
; AVX512: # %bb.0:
-; AVX512-NEXT: vpsllq $5, %xmm0, %xmm0
+; AVX512-NEXT: vpslld $5, %xmm0, %xmm0
; AVX512-NEXT: retq
;
; AVX512VL-LABEL: splatconstant_shift_v2i32:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpsllq $5, %xmm0, %xmm0
+; AVX512VL-NEXT: vpslld $5, %xmm0, %xmm0
; AVX512VL-NEXT: retq
;
; X32-SSE-LABEL: splatconstant_shift_v2i32:
; X32-SSE: # %bb.0:
-; X32-SSE-NEXT: psllq $5, %xmm0
+; X32-SSE-NEXT: pslld $5, %xmm0
; X32-SSE-NEXT: retl
%shift = shl <2 x i32> %a, <i32 5, i32 5>
ret <2 x i32> %shift
define <4 x i16> @splatconstant_shift_v4i16(<4 x i16> %a) nounwind {
; SSE-LABEL: splatconstant_shift_v4i16:
; SSE: # %bb.0:
-; SSE-NEXT: pslld $3, %xmm0
+; SSE-NEXT: psllw $3, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: splatconstant_shift_v4i16:
; AVX: # %bb.0:
-; AVX-NEXT: vpslld $3, %xmm0, %xmm0
+; AVX-NEXT: vpsllw $3, %xmm0, %xmm0
; AVX-NEXT: retq
;
; XOP-LABEL: splatconstant_shift_v4i16:
; XOP: # %bb.0:
-; XOP-NEXT: vpslld $3, %xmm0, %xmm0
+; XOP-NEXT: vpsllw $3, %xmm0, %xmm0
; XOP-NEXT: retq
;
; AVX512-LABEL: splatconstant_shift_v4i16:
; AVX512: # %bb.0:
-; AVX512-NEXT: vpslld $3, %xmm0, %xmm0
+; AVX512-NEXT: vpsllw $3, %xmm0, %xmm0
; AVX512-NEXT: retq
;
; AVX512VL-LABEL: splatconstant_shift_v4i16:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpslld $3, %xmm0, %xmm0
+; AVX512VL-NEXT: vpsllw $3, %xmm0, %xmm0
; AVX512VL-NEXT: retq
;
; X32-SSE-LABEL: splatconstant_shift_v4i16:
; X32-SSE: # %bb.0:
-; X32-SSE-NEXT: pslld $3, %xmm0
+; X32-SSE-NEXT: psllw $3, %xmm0
; X32-SSE-NEXT: retl
%shift = shl <4 x i16> %a, <i16 3, i16 3, i16 3, i16 3>
ret <4 x i16> %shift
define <2 x i16> @splatconstant_shift_v2i16(<2 x i16> %a) nounwind {
; SSE-LABEL: splatconstant_shift_v2i16:
; SSE: # %bb.0:
-; SSE-NEXT: psllq $3, %xmm0
+; SSE-NEXT: psllw $3, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: splatconstant_shift_v2i16:
; AVX: # %bb.0:
-; AVX-NEXT: vpsllq $3, %xmm0, %xmm0
+; AVX-NEXT: vpsllw $3, %xmm0, %xmm0
; AVX-NEXT: retq
;
; XOP-LABEL: splatconstant_shift_v2i16:
; XOP: # %bb.0:
-; XOP-NEXT: vpsllq $3, %xmm0, %xmm0
+; XOP-NEXT: vpsllw $3, %xmm0, %xmm0
; XOP-NEXT: retq
;
; AVX512-LABEL: splatconstant_shift_v2i16:
; AVX512: # %bb.0:
-; AVX512-NEXT: vpsllq $3, %xmm0, %xmm0
+; AVX512-NEXT: vpsllw $3, %xmm0, %xmm0
; AVX512-NEXT: retq
;
; AVX512VL-LABEL: splatconstant_shift_v2i16:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpsllq $3, %xmm0, %xmm0
+; AVX512VL-NEXT: vpsllw $3, %xmm0, %xmm0
; AVX512VL-NEXT: retq
;
; X32-SSE-LABEL: splatconstant_shift_v2i16:
; X32-SSE: # %bb.0:
-; X32-SSE-NEXT: psllq $3, %xmm0
+; X32-SSE-NEXT: psllw $3, %xmm0
; X32-SSE-NEXT: retl
%shift = shl <2 x i16> %a, <i16 3, i16 3>
ret <2 x i16> %shift
; SSE-LABEL: splatconstant_shift_v8i8:
; SSE: # %bb.0:
; SSE-NEXT: psllw $3, %xmm0
+; SSE-NEXT: pand {{.*}}(%rip), %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: splatconstant_shift_v8i8:
; AVX: # %bb.0:
; AVX-NEXT: vpsllw $3, %xmm0, %xmm0
+; AVX-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
; AVX-NEXT: retq
;
; XOP-LABEL: splatconstant_shift_v8i8:
; XOP: # %bb.0:
-; XOP-NEXT: vpsllw $3, %xmm0, %xmm0
+; XOP-NEXT: vpshlb {{.*}}(%rip), %xmm0, %xmm0
; XOP-NEXT: retq
;
; AVX512-LABEL: splatconstant_shift_v8i8:
; AVX512: # %bb.0:
; AVX512-NEXT: vpsllw $3, %xmm0, %xmm0
+; AVX512-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
; AVX512-NEXT: retq
;
; AVX512VL-LABEL: splatconstant_shift_v8i8:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpsllw $3, %xmm0, %xmm0
+; AVX512VL-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
; AVX512VL-NEXT: retq
;
; X32-SSE-LABEL: splatconstant_shift_v8i8:
; X32-SSE: # %bb.0:
; X32-SSE-NEXT: psllw $3, %xmm0
+; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm0
; X32-SSE-NEXT: retl
%shift = shl <8 x i8> %a, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>
ret <8 x i8> %shift
define <4 x i8> @splatconstant_shift_v4i8(<4 x i8> %a) nounwind {
; SSE-LABEL: splatconstant_shift_v4i8:
; SSE: # %bb.0:
-; SSE-NEXT: pslld $3, %xmm0
+; SSE-NEXT: psllw $3, %xmm0
+; SSE-NEXT: pand {{.*}}(%rip), %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: splatconstant_shift_v4i8:
; AVX: # %bb.0:
-; AVX-NEXT: vpslld $3, %xmm0, %xmm0
+; AVX-NEXT: vpsllw $3, %xmm0, %xmm0
+; AVX-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
; AVX-NEXT: retq
;
; XOP-LABEL: splatconstant_shift_v4i8:
; XOP: # %bb.0:
-; XOP-NEXT: vpslld $3, %xmm0, %xmm0
+; XOP-NEXT: vpshlb {{.*}}(%rip), %xmm0, %xmm0
; XOP-NEXT: retq
;
; AVX512-LABEL: splatconstant_shift_v4i8:
; AVX512: # %bb.0:
-; AVX512-NEXT: vpslld $3, %xmm0, %xmm0
+; AVX512-NEXT: vpsllw $3, %xmm0, %xmm0
+; AVX512-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
; AVX512-NEXT: retq
;
; AVX512VL-LABEL: splatconstant_shift_v4i8:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpslld $3, %xmm0, %xmm0
+; AVX512VL-NEXT: vpsllw $3, %xmm0, %xmm0
+; AVX512VL-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
; AVX512VL-NEXT: retq
;
; X32-SSE-LABEL: splatconstant_shift_v4i8:
; X32-SSE: # %bb.0:
-; X32-SSE-NEXT: pslld $3, %xmm0
+; X32-SSE-NEXT: psllw $3, %xmm0
+; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm0
; X32-SSE-NEXT: retl
%shift = shl <4 x i8> %a, <i8 3, i8 3, i8 3, i8 3>
ret <4 x i8> %shift
define <2 x i8> @splatconstant_shift_v2i8(<2 x i8> %a) nounwind {
; SSE-LABEL: splatconstant_shift_v2i8:
; SSE: # %bb.0:
-; SSE-NEXT: psllq $3, %xmm0
+; SSE-NEXT: psllw $3, %xmm0
+; SSE-NEXT: pand {{.*}}(%rip), %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: splatconstant_shift_v2i8:
; AVX: # %bb.0:
-; AVX-NEXT: vpsllq $3, %xmm0, %xmm0
+; AVX-NEXT: vpsllw $3, %xmm0, %xmm0
+; AVX-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
; AVX-NEXT: retq
;
; XOP-LABEL: splatconstant_shift_v2i8:
; XOP: # %bb.0:
-; XOP-NEXT: vpsllq $3, %xmm0, %xmm0
+; XOP-NEXT: vpshlb {{.*}}(%rip), %xmm0, %xmm0
; XOP-NEXT: retq
;
; AVX512-LABEL: splatconstant_shift_v2i8:
; AVX512: # %bb.0:
-; AVX512-NEXT: vpsllq $3, %xmm0, %xmm0
+; AVX512-NEXT: vpsllw $3, %xmm0, %xmm0
+; AVX512-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
; AVX512-NEXT: retq
;
; AVX512VL-LABEL: splatconstant_shift_v2i8:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpsllq $3, %xmm0, %xmm0
+; AVX512VL-NEXT: vpsllw $3, %xmm0, %xmm0
+; AVX512VL-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
; AVX512VL-NEXT: retq
;
; X32-SSE-LABEL: splatconstant_shift_v2i8:
; X32-SSE: # %bb.0:
-; X32-SSE-NEXT: psllq $3, %xmm0
+; X32-SSE-NEXT: psllw $3, %xmm0
+; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm0
; X32-SSE-NEXT: retl
%shift = shl <2 x i8> %a, <i8 3, i8 3>
ret <2 x i8> %shift
}
define <16 x i8> @PR20540(<8 x i8> %a) {
-; SSE2-LABEL: PR20540:
-; SSE2: # %bb.0:
-; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
-; SSE2-NEXT: packuswb %xmm0, %xmm0
-; SSE2-NEXT: movq {{.*#+}} xmm0 = xmm0[0],zero
-; SSE2-NEXT: retq
-;
-; SSSE3-LABEL: PR20540:
-; SSSE3: # %bb.0:
-; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14],zero,zero,zero,zero,zero,zero,zero,zero
-; SSSE3-NEXT: retq
-;
-; SSE41-LABEL: PR20540:
-; SSE41: # %bb.0:
-; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14],zero,zero,zero,zero,zero,zero,zero,zero
-; SSE41-NEXT: retq
+; SSE-LABEL: PR20540:
+; SSE: # %bb.0:
+; SSE-NEXT: movq {{.*#+}} xmm0 = xmm0[0],zero
+; SSE-NEXT: retq
;
; AVX-LABEL: PR20540:
; AVX: # %bb.0:
-; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14],zero,zero,zero,zero,zero,zero,zero,zero
+; AVX-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
; AVX-NEXT: retq
%shuffle = shufflevector <8 x i8> %a, <8 x i8> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8>
ret <16 x i8> %shuffle
; SSE2-LABEL: combine_test1c:
; SSE2: # %bb.0:
; SSE2-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
-; SSE2-NEXT: movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
+; SSE2-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; SSE2-NEXT: movaps {{.*#+}} xmm0 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; SSE2-NEXT: andps %xmm0, %xmm2
+; SSE2-NEXT: andnps %xmm1, %xmm0
+; SSE2-NEXT: orps %xmm2, %xmm0
; SSE2-NEXT: retq
;
; SSSE3-LABEL: combine_test1c:
; SSSE3: # %bb.0:
-; SSSE3-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; SSSE3-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; SSSE3-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
-; SSSE3-NEXT: movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
+; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[1,2,4,6,u,u,u,u,u,u,u,u,u,u,u,u]
; SSSE3-NEXT: retq
;
; SSE41-LABEL: combine_test1c:
; SSE41: # %bb.0:
-; SSE41-NEXT: pmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; SSE41-NEXT: pmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3,4,5,6,7]
+; SSE41-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; SSE41-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; SSE41-NEXT: movaps {{.*#+}} xmm0 = <0,255,255,255,u,u,u,u,u,u,u,u,u,u,u,u>
+; SSE41-NEXT: pblendvb %xmm0, %xmm2, %xmm1
+; SSE41-NEXT: movdqa %xmm1, %xmm0
; SSE41-NEXT: retq
;
-; AVX1-LABEL: combine_test1c:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7]
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: combine_test1c:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; AVX2-NEXT: vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
-; AVX2-NEXT: retq
+; AVX-LABEL: combine_test1c:
+; AVX: # %bb.0:
+; AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; AVX-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = <0,255,255,255,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retq
%A = load <4 x i8>, <4 x i8>* %a
%B = load <4 x i8>, <4 x i8>* %b
%1 = shufflevector <4 x i8> %A, <4 x i8> %B, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
}
define <4 x i8> @combine_test2c(<4 x i8>* %a, <4 x i8>* %b) {
-; SSE2-LABEL: combine_test2c:
-; SSE2: # %bb.0:
-; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
-; SSE2-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; SSE2-NEXT: retq
-;
-; SSSE3-LABEL: combine_test2c:
-; SSSE3: # %bb.0:
-; SSSE3-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
-; SSSE3-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
-; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; SSSE3-NEXT: retq
-;
-; SSE41-LABEL: combine_test2c:
-; SSE41: # %bb.0:
-; SSE41-NEXT: pmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; SSE41-NEXT: pmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; SSE41-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; SSE41-NEXT: retq
+; SSE-LABEL: combine_test2c:
+; SSE: # %bb.0:
+; SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; SSE-NEXT: retq
;
; AVX-LABEL: combine_test2c:
; AVX: # %bb.0:
-; AVX-NEXT: vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; AVX-NEXT: vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; AVX-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX-NEXT: retq
%A = load <4 x i8>, <4 x i8>* %a
%B = load <4 x i8>, <4 x i8>* %b
}
define <4 x i8> @combine_test3c(<4 x i8>* %a, <4 x i8>* %b) {
-; SSE2-LABEL: combine_test3c:
-; SSE2: # %bb.0:
-; SSE2-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
-; SSE2-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1]
-; SSE2-NEXT: retq
-;
-; SSSE3-LABEL: combine_test3c:
-; SSSE3: # %bb.0:
-; SSSE3-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
-; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; SSSE3-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
-; SSSE3-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1]
-; SSSE3-NEXT: retq
-;
-; SSE41-LABEL: combine_test3c:
-; SSE41: # %bb.0:
-; SSE41-NEXT: pmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; SSE41-NEXT: pmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; SSE41-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1]
-; SSE41-NEXT: retq
+; SSE-LABEL: combine_test3c:
+; SSE: # %bb.0:
+; SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; SSE-NEXT: retq
;
; AVX-LABEL: combine_test3c:
; AVX: # %bb.0:
-; AVX-NEXT: vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; AVX-NEXT: vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; AVX-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm1[1],xmm0[1]
+; AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; AVX-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
; AVX-NEXT: retq
%A = load <4 x i8>, <4 x i8>* %a
%B = load <4 x i8>, <4 x i8>* %b
define <4 x i8> @combine_test4c(<4 x i8>* %a, <4 x i8>* %b) {
; SSE2-LABEL: combine_test4c:
; SSE2: # %bb.0:
-; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
; SSE2-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm1[0,0]
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3]
+; SSE2-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; SSE2-NEXT: movaps {{.*#+}} xmm0 = [255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; SSE2-NEXT: andps %xmm0, %xmm2
+; SSE2-NEXT: andnps %xmm1, %xmm0
+; SSE2-NEXT: orps %xmm2, %xmm0
; SSE2-NEXT: retq
;
; SSSE3-LABEL: combine_test4c:
; SSSE3: # %bb.0:
+; SSSE3-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; SSSE3-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
-; SSSE3-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm1[0,0]
-; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
-; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3]
+; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,3,4,6,u,u,u,u,u,u,u,u,u,u,u,u]
; SSSE3-NEXT: retq
;
; SSE41-LABEL: combine_test4c:
; SSE41: # %bb.0:
-; SSE41-NEXT: pmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; SSE41-NEXT: pmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7]
+; SSE41-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; SSE41-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; SSE41-NEXT: movaps {{.*#+}} xmm0 = <255,0,255,255,u,u,u,u,u,u,u,u,u,u,u,u>
+; SSE41-NEXT: pblendvb %xmm0, %xmm2, %xmm1
+; SSE41-NEXT: movdqa %xmm1, %xmm0
; SSE41-NEXT: retq
;
-; AVX1-LABEL: combine_test4c:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3],xmm1[4,5,6,7]
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: combine_test4c:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; AVX2-NEXT: vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
-; AVX2-NEXT: retq
+; AVX-LABEL: combine_test4c:
+; AVX: # %bb.0:
+; AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; AVX-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = <255,0,255,255,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retq
%A = load <4 x i8>, <4 x i8>* %a
%B = load <4 x i8>, <4 x i8>* %b
%1 = shufflevector <4 x i8> %A, <4 x i8> %B, <4 x i32> <i32 4, i32 1, i32 6, i32 3>
; SSE2-NEXT: pand %xmm5, %xmm2
; SSE2-NEXT: packuswb %xmm3, %xmm2
; SSE2-NEXT: packuswb %xmm2, %xmm0
+; SSE2-NEXT: packuswb %xmm0, %xmm0
; SSE2-NEXT: retq
;
; SSSE3-LABEL: trunc_packus_v8i64_v8i8:
; SSSE3-NEXT: pand %xmm5, %xmm2
; SSSE3-NEXT: packuswb %xmm3, %xmm2
; SSSE3-NEXT: packuswb %xmm2, %xmm0
+; SSSE3-NEXT: packuswb %xmm0, %xmm0
; SSSE3-NEXT: retq
;
; SSE41-LABEL: trunc_packus_v8i64_v8i8:
; SSE41-NEXT: blendvpd %xmm0, %xmm8, %xmm3
; SSE41-NEXT: packusdw %xmm4, %xmm3
; SSE41-NEXT: packusdw %xmm3, %xmm1
+; SSE41-NEXT: packuswb %xmm1, %xmm1
; SSE41-NEXT: movdqa %xmm1, %xmm0
; SSE41-NEXT: retq
;
; AVX1-NEXT: vpand %xmm0, %xmm9, %xmm0
; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpackuswb %xmm0, %xmm0, %xmm0
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc_packus_v8i64_v8i8:
; AVX2: # %bb.0:
; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [255,255,255,255]
-; AVX2-NEXT: vpcmpgtq %ymm0, %ymm2, %ymm3
-; AVX2-NEXT: vblendvpd %ymm3, %ymm0, %ymm2, %ymm0
; AVX2-NEXT: vpcmpgtq %ymm1, %ymm2, %ymm3
; AVX2-NEXT: vblendvpd %ymm3, %ymm1, %ymm2, %ymm1
+; AVX2-NEXT: vpcmpgtq %ymm0, %ymm2, %ymm3
+; AVX2-NEXT: vblendvpd %ymm3, %ymm0, %ymm2, %ymm0
; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX2-NEXT: vpcmpgtq %ymm2, %ymm1, %ymm3
-; AVX2-NEXT: vpand %ymm1, %ymm3, %ymm1
-; AVX2-NEXT: vpcmpgtq %ymm2, %ymm0, %ymm2
-; AVX2-NEXT: vpand %ymm0, %ymm2, %ymm0
-; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0
-; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpcmpgtq %ymm2, %ymm0, %ymm3
+; AVX2-NEXT: vpand %ymm0, %ymm3, %ymm0
+; AVX2-NEXT: vpcmpgtq %ymm2, %ymm1, %ymm2
+; AVX2-NEXT: vpand %ymm1, %ymm2, %ymm1
+; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = <u,u,0,8,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-NEXT: vpshufb %xmm3, %xmm2, %xmm2
+; AVX2-NEXT: vpshufb %xmm3, %xmm1, %xmm1
+; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2
+; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-NEXT: vpshufb %xmm3, %xmm2, %xmm2
+; AVX2-NEXT: vpshufb %xmm3, %xmm0, %xmm0
+; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-NEXT: vpminsq {{.*}}(%rip){1to8}, %zmm0, %zmm0
; AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512-NEXT: vpmaxsq %zmm1, %zmm0, %zmm0
-; AVX512-NEXT: vpmovqw %zmm0, %xmm0
+; AVX512-NEXT: vpmovqb %zmm0, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
%1 = icmp slt <8 x i64> %a0, <i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255>
; AVX2-NEXT: vpcmpgtq %ymm2, %ymm1, %ymm2
; AVX2-NEXT: vpand %ymm1, %ymm2, %ymm1
; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
-; AVX2-NEXT: vpackusdw %xmm2, %xmm1, %xmm1
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
-; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm3
-; AVX2-NEXT: vpackusdw %xmm3, %xmm0, %xmm0
-; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0
-; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
+; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = <u,u,0,8,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-NEXT: vpshufb %xmm3, %xmm2, %xmm2
+; AVX2-NEXT: vpshufb %xmm3, %xmm1, %xmm1
+; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2
+; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-NEXT: vpshufb %xmm3, %xmm2, %xmm2
+; AVX2-NEXT: vpshufb %xmm3, %xmm0, %xmm0
+; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
; AVX2-NEXT: vmovq %xmm0, (%rdi)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
}
define <8 x i8> @trunc_packus_v8i32_v8i8(<8 x i32> %a0) {
-; SSE2-LABEL: trunc_packus_v8i32_v8i8:
-; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255]
-; SSE2-NEXT: movdqa %xmm2, %xmm3
-; SSE2-NEXT: pcmpgtd %xmm0, %xmm3
-; SSE2-NEXT: pand %xmm3, %xmm0
-; SSE2-NEXT: pandn %xmm2, %xmm3
-; SSE2-NEXT: por %xmm0, %xmm3
-; SSE2-NEXT: movdqa %xmm2, %xmm0
-; SSE2-NEXT: pcmpgtd %xmm1, %xmm0
-; SSE2-NEXT: pand %xmm0, %xmm1
-; SSE2-NEXT: pandn %xmm2, %xmm0
-; SSE2-NEXT: por %xmm1, %xmm0
-; SSE2-NEXT: pxor %xmm1, %xmm1
-; SSE2-NEXT: movdqa %xmm0, %xmm2
-; SSE2-NEXT: pcmpgtd %xmm1, %xmm2
-; SSE2-NEXT: pand %xmm0, %xmm2
-; SSE2-NEXT: movdqa %xmm3, %xmm0
-; SSE2-NEXT: pcmpgtd %xmm1, %xmm0
-; SSE2-NEXT: pand %xmm3, %xmm0
-; SSE2-NEXT: packuswb %xmm2, %xmm0
-; SSE2-NEXT: retq
-;
-; SSSE3-LABEL: trunc_packus_v8i32_v8i8:
-; SSSE3: # %bb.0:
-; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255]
-; SSSE3-NEXT: movdqa %xmm2, %xmm3
-; SSSE3-NEXT: pcmpgtd %xmm0, %xmm3
-; SSSE3-NEXT: pand %xmm3, %xmm0
-; SSSE3-NEXT: pandn %xmm2, %xmm3
-; SSSE3-NEXT: por %xmm0, %xmm3
-; SSSE3-NEXT: movdqa %xmm2, %xmm0
-; SSSE3-NEXT: pcmpgtd %xmm1, %xmm0
-; SSSE3-NEXT: pand %xmm0, %xmm1
-; SSSE3-NEXT: pandn %xmm2, %xmm0
-; SSSE3-NEXT: por %xmm1, %xmm0
-; SSSE3-NEXT: pxor %xmm1, %xmm1
-; SSSE3-NEXT: movdqa %xmm0, %xmm2
-; SSSE3-NEXT: pcmpgtd %xmm1, %xmm2
-; SSSE3-NEXT: pand %xmm0, %xmm2
-; SSSE3-NEXT: movdqa %xmm3, %xmm0
-; SSSE3-NEXT: pcmpgtd %xmm1, %xmm0
-; SSSE3-NEXT: pand %xmm3, %xmm0
-; SSSE3-NEXT: packuswb %xmm2, %xmm0
-; SSSE3-NEXT: retq
-;
-; SSE41-LABEL: trunc_packus_v8i32_v8i8:
-; SSE41: # %bb.0:
-; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255]
-; SSE41-NEXT: pminsd %xmm2, %xmm0
-; SSE41-NEXT: pminsd %xmm2, %xmm1
-; SSE41-NEXT: pxor %xmm2, %xmm2
-; SSE41-NEXT: pmaxsd %xmm2, %xmm1
-; SSE41-NEXT: pmaxsd %xmm2, %xmm0
-; SSE41-NEXT: packusdw %xmm1, %xmm0
-; SSE41-NEXT: retq
+; SSE-LABEL: trunc_packus_v8i32_v8i8:
+; SSE: # %bb.0:
+; SSE-NEXT: packssdw %xmm1, %xmm0
+; SSE-NEXT: packuswb %xmm0, %xmm0
+; SSE-NEXT: retq
;
; AVX1-LABEL: trunc_packus_v8i32_v8i8:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [255,255,255,255]
-; AVX1-NEXT: vpminsd %xmm1, %xmm0, %xmm2
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
-; AVX1-NEXT: vpminsd %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX1-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpmaxsd %xmm1, %xmm2, %xmm1
-; AVX1-NEXT: vpackusdw %xmm0, %xmm1, %xmm0
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT: vpackssdw %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpackuswb %xmm0, %xmm0, %xmm0
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc_packus_v8i32_v8i8:
; AVX2: # %bb.0:
-; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm1 = [255,255,255,255,255,255,255,255]
-; AVX2-NEXT: vpminsd %ymm1, %ymm0, %ymm0
-; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX2-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpackssdw %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpackuswb %xmm0, %xmm0, %xmm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512F-NEXT: vpminsd %ymm1, %ymm0, %ymm0
; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512F-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0
-; AVX512F-NEXT: vpmovdw %zmm0, %ymm0
-; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-NEXT: vpminsd {{.*}}(%rip){1to8}, %ymm0, %ymm0
; AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512VL-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0
-; AVX512VL-NEXT: vpmovdw %ymm0, %xmm0
+; AVX512VL-NEXT: vpmovdb %ymm0, %xmm0
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512BW-NEXT: vpminsd %ymm1, %ymm0, %ymm0
; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512BW-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0
-; AVX512BW-NEXT: vpmovdw %zmm0, %ymm0
-; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX512BW-NEXT: vpmovdb %zmm0, %xmm0
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512BWVL-NEXT: vpminsd {{.*}}(%rip){1to8}, %ymm0, %ymm0
; AVX512BWVL-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512BWVL-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0
-; AVX512BWVL-NEXT: vpmovdw %ymm0, %xmm0
+; AVX512BWVL-NEXT: vpmovdb %ymm0, %xmm0
; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
%1 = icmp slt <8 x i32> %a0, <i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255>
; AVX512F-NEXT: vpminsd %ymm1, %ymm0, %ymm0
; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512F-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0
-; AVX512F-NEXT: vpmovdw %zmm0, %ymm0
-; AVX512F-NEXT: vpackuswb %xmm0, %xmm0, %xmm0
+; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
; AVX512F-NEXT: vmovq %xmm0, (%rdi)
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
; AVX512BW-NEXT: vpminsd %ymm1, %ymm0, %ymm0
; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512BW-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0
-; AVX512BW-NEXT: vpmovdw %zmm0, %ymm0
-; AVX512BW-NEXT: vpackuswb %xmm0, %xmm0, %xmm0
+; AVX512BW-NEXT: vpmovdb %zmm0, %xmm0
; AVX512BW-NEXT: vmovq %xmm0, (%rdi)
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [127,127]
; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [2147483648,2147483648]
-; SSE2-NEXT: movdqa %xmm2, %xmm5
+; SSE2-NEXT: movdqa %xmm3, %xmm5
; SSE2-NEXT: pxor %xmm4, %xmm5
; SSE2-NEXT: movdqa {{.*#+}} xmm9 = [2147483775,2147483775]
; SSE2-NEXT: movdqa %xmm9, %xmm7
; SSE2-NEXT: pand %xmm10, %xmm6
; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm7[1,1,3,3]
; SSE2-NEXT: por %xmm6, %xmm5
-; SSE2-NEXT: pand %xmm5, %xmm2
+; SSE2-NEXT: pand %xmm5, %xmm3
; SSE2-NEXT: pandn %xmm8, %xmm5
-; SSE2-NEXT: por %xmm2, %xmm5
-; SSE2-NEXT: movdqa %xmm3, %xmm2
-; SSE2-NEXT: pxor %xmm4, %xmm2
-; SSE2-NEXT: movdqa %xmm9, %xmm6
-; SSE2-NEXT: pcmpgtd %xmm2, %xmm6
-; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm6[0,0,2,2]
-; SSE2-NEXT: pcmpeqd %xmm9, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm2[1,1,3,3]
-; SSE2-NEXT: pand %xmm10, %xmm7
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm6[1,1,3,3]
-; SSE2-NEXT: por %xmm7, %xmm2
-; SSE2-NEXT: pand %xmm2, %xmm3
-; SSE2-NEXT: pandn %xmm8, %xmm2
-; SSE2-NEXT: por %xmm3, %xmm2
-; SSE2-NEXT: movdqa %xmm0, %xmm3
+; SSE2-NEXT: por %xmm3, %xmm5
+; SSE2-NEXT: movdqa %xmm2, %xmm3
; SSE2-NEXT: pxor %xmm4, %xmm3
; SSE2-NEXT: movdqa %xmm9, %xmm6
; SSE2-NEXT: pcmpgtd %xmm3, %xmm6
; SSE2-NEXT: pand %xmm10, %xmm7
; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm6[1,1,3,3]
; SSE2-NEXT: por %xmm7, %xmm3
-; SSE2-NEXT: pand %xmm3, %xmm0
+; SSE2-NEXT: pand %xmm3, %xmm2
; SSE2-NEXT: pandn %xmm8, %xmm3
-; SSE2-NEXT: por %xmm0, %xmm3
-; SSE2-NEXT: movdqa %xmm1, %xmm0
-; SSE2-NEXT: pxor %xmm4, %xmm0
+; SSE2-NEXT: por %xmm2, %xmm3
+; SSE2-NEXT: movdqa %xmm1, %xmm2
+; SSE2-NEXT: pxor %xmm4, %xmm2
; SSE2-NEXT: movdqa %xmm9, %xmm6
-; SSE2-NEXT: pcmpgtd %xmm0, %xmm6
+; SSE2-NEXT: pcmpgtd %xmm2, %xmm6
+; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm6[0,0,2,2]
+; SSE2-NEXT: pcmpeqd %xmm9, %xmm2
+; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm2[1,1,3,3]
+; SSE2-NEXT: pand %xmm10, %xmm7
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm6[1,1,3,3]
+; SSE2-NEXT: por %xmm7, %xmm2
+; SSE2-NEXT: pand %xmm2, %xmm1
+; SSE2-NEXT: pandn %xmm8, %xmm2
+; SSE2-NEXT: por %xmm1, %xmm2
+; SSE2-NEXT: movdqa %xmm0, %xmm1
+; SSE2-NEXT: pxor %xmm4, %xmm1
+; SSE2-NEXT: movdqa %xmm9, %xmm6
+; SSE2-NEXT: pcmpgtd %xmm1, %xmm6
; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2]
-; SSE2-NEXT: pcmpeqd %xmm9, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE2-NEXT: pand %xmm7, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm6[1,1,3,3]
-; SSE2-NEXT: por %xmm0, %xmm7
+; SSE2-NEXT: pcmpeqd %xmm9, %xmm1
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; SSE2-NEXT: pand %xmm7, %xmm1
-; SSE2-NEXT: pandn %xmm8, %xmm7
+; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm6[1,1,3,3]
; SSE2-NEXT: por %xmm1, %xmm7
+; SSE2-NEXT: pand %xmm7, %xmm0
+; SSE2-NEXT: pandn %xmm8, %xmm7
+; SSE2-NEXT: por %xmm0, %xmm7
; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [18446744073709551488,18446744073709551488]
; SSE2-NEXT: movdqa %xmm7, %xmm0
; SSE2-NEXT: pxor %xmm4, %xmm0
; SSE2-NEXT: movdqa {{.*#+}} xmm9 = [18446744071562067840,18446744071562067840]
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: pcmpgtd %xmm9, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm1[0,0,2,2]
-; SSE2-NEXT: pcmpeqd %xmm9, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE2-NEXT: pand %xmm6, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSE2-NEXT: por %xmm0, %xmm1
-; SSE2-NEXT: pand %xmm1, %xmm7
-; SSE2-NEXT: pandn %xmm8, %xmm1
-; SSE2-NEXT: por %xmm7, %xmm1
-; SSE2-NEXT: movdqa %xmm3, %xmm0
-; SSE2-NEXT: pxor %xmm4, %xmm0
-; SSE2-NEXT: movdqa %xmm0, %xmm6
-; SSE2-NEXT: pcmpgtd %xmm9, %xmm6
-; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm6[0,0,2,2]
+; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm1[0,0,2,2]
; SSE2-NEXT: pcmpeqd %xmm9, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm0[1,1,3,3]
-; SSE2-NEXT: pand %xmm10, %xmm7
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm6[1,1,3,3]
-; SSE2-NEXT: por %xmm7, %xmm0
-; SSE2-NEXT: pand %xmm0, %xmm3
+; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm0[1,1,3,3]
+; SSE2-NEXT: pand %xmm10, %xmm6
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3]
+; SSE2-NEXT: por %xmm6, %xmm0
+; SSE2-NEXT: pand %xmm0, %xmm7
; SSE2-NEXT: pandn %xmm8, %xmm0
-; SSE2-NEXT: por %xmm3, %xmm0
-; SSE2-NEXT: packssdw %xmm1, %xmm0
+; SSE2-NEXT: por %xmm7, %xmm0
; SSE2-NEXT: movdqa %xmm2, %xmm1
; SSE2-NEXT: pxor %xmm4, %xmm1
-; SSE2-NEXT: movdqa %xmm1, %xmm3
-; SSE2-NEXT: pcmpgtd %xmm9, %xmm3
-; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm3[0,0,2,2]
+; SSE2-NEXT: movdqa %xmm1, %xmm6
+; SSE2-NEXT: pcmpgtd %xmm9, %xmm6
+; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2]
+; SSE2-NEXT: pcmpeqd %xmm9, %xmm1
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; SSE2-NEXT: pand %xmm7, %xmm1
+; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm6[1,1,3,3]
+; SSE2-NEXT: por %xmm1, %xmm7
+; SSE2-NEXT: pand %xmm7, %xmm2
+; SSE2-NEXT: pandn %xmm8, %xmm7
+; SSE2-NEXT: por %xmm2, %xmm7
+; SSE2-NEXT: movdqa %xmm3, %xmm1
+; SSE2-NEXT: pxor %xmm4, %xmm1
+; SSE2-NEXT: movdqa %xmm1, %xmm2
+; SSE2-NEXT: pcmpgtd %xmm9, %xmm2
+; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm2[0,0,2,2]
; SSE2-NEXT: pcmpeqd %xmm9, %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; SSE2-NEXT: pand %xmm6, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
-; SSE2-NEXT: por %xmm1, %xmm3
-; SSE2-NEXT: pand %xmm3, %xmm2
-; SSE2-NEXT: pandn %xmm8, %xmm3
-; SSE2-NEXT: por %xmm2, %xmm3
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
+; SSE2-NEXT: por %xmm1, %xmm2
+; SSE2-NEXT: pand %xmm2, %xmm3
+; SSE2-NEXT: pandn %xmm8, %xmm2
+; SSE2-NEXT: por %xmm3, %xmm2
; SSE2-NEXT: pxor %xmm5, %xmm4
; SSE2-NEXT: movdqa %xmm4, %xmm1
; SSE2-NEXT: pcmpgtd %xmm9, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,0,2,2]
+; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[0,0,2,2]
; SSE2-NEXT: pcmpeqd %xmm9, %xmm4
; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
-; SSE2-NEXT: pand %xmm2, %xmm4
+; SSE2-NEXT: pand %xmm3, %xmm4
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; SSE2-NEXT: por %xmm4, %xmm1
; SSE2-NEXT: pand %xmm1, %xmm5
; SSE2-NEXT: pandn %xmm8, %xmm1
; SSE2-NEXT: por %xmm5, %xmm1
-; SSE2-NEXT: packssdw %xmm3, %xmm1
-; SSE2-NEXT: packssdw %xmm1, %xmm0
+; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
+; SSE2-NEXT: pand %xmm3, %xmm1
+; SSE2-NEXT: pand %xmm3, %xmm2
+; SSE2-NEXT: packuswb %xmm1, %xmm2
+; SSE2-NEXT: pand %xmm3, %xmm7
+; SSE2-NEXT: pand %xmm3, %xmm0
+; SSE2-NEXT: packuswb %xmm7, %xmm0
+; SSE2-NEXT: packuswb %xmm2, %xmm0
+; SSE2-NEXT: packuswb %xmm0, %xmm0
; SSE2-NEXT: retq
;
; SSSE3-LABEL: trunc_ssat_v8i64_v8i8:
; SSSE3: # %bb.0:
; SSSE3-NEXT: movdqa {{.*#+}} xmm8 = [127,127]
; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [2147483648,2147483648]
-; SSSE3-NEXT: movdqa %xmm2, %xmm5
+; SSSE3-NEXT: movdqa %xmm3, %xmm5
; SSSE3-NEXT: pxor %xmm4, %xmm5
; SSSE3-NEXT: movdqa {{.*#+}} xmm9 = [2147483775,2147483775]
; SSSE3-NEXT: movdqa %xmm9, %xmm7
; SSSE3-NEXT: pand %xmm10, %xmm6
; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm7[1,1,3,3]
; SSSE3-NEXT: por %xmm6, %xmm5
-; SSSE3-NEXT: pand %xmm5, %xmm2
+; SSSE3-NEXT: pand %xmm5, %xmm3
; SSSE3-NEXT: pandn %xmm8, %xmm5
-; SSSE3-NEXT: por %xmm2, %xmm5
-; SSSE3-NEXT: movdqa %xmm3, %xmm2
-; SSSE3-NEXT: pxor %xmm4, %xmm2
-; SSSE3-NEXT: movdqa %xmm9, %xmm6
-; SSSE3-NEXT: pcmpgtd %xmm2, %xmm6
-; SSSE3-NEXT: pshufd {{.*#+}} xmm10 = xmm6[0,0,2,2]
-; SSSE3-NEXT: pcmpeqd %xmm9, %xmm2
-; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm2[1,1,3,3]
-; SSSE3-NEXT: pand %xmm10, %xmm7
-; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm6[1,1,3,3]
-; SSSE3-NEXT: por %xmm7, %xmm2
-; SSSE3-NEXT: pand %xmm2, %xmm3
-; SSSE3-NEXT: pandn %xmm8, %xmm2
-; SSSE3-NEXT: por %xmm3, %xmm2
-; SSSE3-NEXT: movdqa %xmm0, %xmm3
+; SSSE3-NEXT: por %xmm3, %xmm5
+; SSSE3-NEXT: movdqa %xmm2, %xmm3
; SSSE3-NEXT: pxor %xmm4, %xmm3
; SSSE3-NEXT: movdqa %xmm9, %xmm6
; SSSE3-NEXT: pcmpgtd %xmm3, %xmm6
; SSSE3-NEXT: pand %xmm10, %xmm7
; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm6[1,1,3,3]
; SSSE3-NEXT: por %xmm7, %xmm3
-; SSSE3-NEXT: pand %xmm3, %xmm0
+; SSSE3-NEXT: pand %xmm3, %xmm2
; SSSE3-NEXT: pandn %xmm8, %xmm3
-; SSSE3-NEXT: por %xmm0, %xmm3
-; SSSE3-NEXT: movdqa %xmm1, %xmm0
-; SSSE3-NEXT: pxor %xmm4, %xmm0
+; SSSE3-NEXT: por %xmm2, %xmm3
+; SSSE3-NEXT: movdqa %xmm1, %xmm2
+; SSSE3-NEXT: pxor %xmm4, %xmm2
; SSSE3-NEXT: movdqa %xmm9, %xmm6
-; SSSE3-NEXT: pcmpgtd %xmm0, %xmm6
+; SSSE3-NEXT: pcmpgtd %xmm2, %xmm6
+; SSSE3-NEXT: pshufd {{.*#+}} xmm10 = xmm6[0,0,2,2]
+; SSSE3-NEXT: pcmpeqd %xmm9, %xmm2
+; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm2[1,1,3,3]
+; SSSE3-NEXT: pand %xmm10, %xmm7
+; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm6[1,1,3,3]
+; SSSE3-NEXT: por %xmm7, %xmm2
+; SSSE3-NEXT: pand %xmm2, %xmm1
+; SSSE3-NEXT: pandn %xmm8, %xmm2
+; SSSE3-NEXT: por %xmm1, %xmm2
+; SSSE3-NEXT: movdqa %xmm0, %xmm1
+; SSSE3-NEXT: pxor %xmm4, %xmm1
+; SSSE3-NEXT: movdqa %xmm9, %xmm6
+; SSSE3-NEXT: pcmpgtd %xmm1, %xmm6
; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2]
-; SSSE3-NEXT: pcmpeqd %xmm9, %xmm0
-; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSSE3-NEXT: pand %xmm7, %xmm0
-; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm6[1,1,3,3]
-; SSSE3-NEXT: por %xmm0, %xmm7
+; SSSE3-NEXT: pcmpeqd %xmm9, %xmm1
+; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; SSSE3-NEXT: pand %xmm7, %xmm1
-; SSSE3-NEXT: pandn %xmm8, %xmm7
+; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm6[1,1,3,3]
; SSSE3-NEXT: por %xmm1, %xmm7
+; SSSE3-NEXT: pand %xmm7, %xmm0
+; SSSE3-NEXT: pandn %xmm8, %xmm7
+; SSSE3-NEXT: por %xmm0, %xmm7
; SSSE3-NEXT: movdqa {{.*#+}} xmm8 = [18446744073709551488,18446744073709551488]
; SSSE3-NEXT: movdqa %xmm7, %xmm0
; SSSE3-NEXT: pxor %xmm4, %xmm0
; SSSE3-NEXT: movdqa {{.*#+}} xmm9 = [18446744071562067840,18446744071562067840]
; SSSE3-NEXT: movdqa %xmm0, %xmm1
; SSSE3-NEXT: pcmpgtd %xmm9, %xmm1
-; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm1[0,0,2,2]
-; SSSE3-NEXT: pcmpeqd %xmm9, %xmm0
-; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSSE3-NEXT: pand %xmm6, %xmm0
-; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSSE3-NEXT: por %xmm0, %xmm1
-; SSSE3-NEXT: pand %xmm1, %xmm7
-; SSSE3-NEXT: pandn %xmm8, %xmm1
-; SSSE3-NEXT: por %xmm7, %xmm1
-; SSSE3-NEXT: movdqa %xmm3, %xmm0
-; SSSE3-NEXT: pxor %xmm4, %xmm0
-; SSSE3-NEXT: movdqa %xmm0, %xmm6
-; SSSE3-NEXT: pcmpgtd %xmm9, %xmm6
-; SSSE3-NEXT: pshufd {{.*#+}} xmm10 = xmm6[0,0,2,2]
+; SSSE3-NEXT: pshufd {{.*#+}} xmm10 = xmm1[0,0,2,2]
; SSSE3-NEXT: pcmpeqd %xmm9, %xmm0
-; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm0[1,1,3,3]
-; SSSE3-NEXT: pand %xmm10, %xmm7
-; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm6[1,1,3,3]
-; SSSE3-NEXT: por %xmm7, %xmm0
-; SSSE3-NEXT: pand %xmm0, %xmm3
+; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm0[1,1,3,3]
+; SSSE3-NEXT: pand %xmm10, %xmm6
+; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3]
+; SSSE3-NEXT: por %xmm6, %xmm0
+; SSSE3-NEXT: pand %xmm0, %xmm7
; SSSE3-NEXT: pandn %xmm8, %xmm0
-; SSSE3-NEXT: por %xmm3, %xmm0
-; SSSE3-NEXT: packssdw %xmm1, %xmm0
+; SSSE3-NEXT: por %xmm7, %xmm0
; SSSE3-NEXT: movdqa %xmm2, %xmm1
; SSSE3-NEXT: pxor %xmm4, %xmm1
-; SSSE3-NEXT: movdqa %xmm1, %xmm3
-; SSSE3-NEXT: pcmpgtd %xmm9, %xmm3
-; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm3[0,0,2,2]
+; SSSE3-NEXT: movdqa %xmm1, %xmm6
+; SSSE3-NEXT: pcmpgtd %xmm9, %xmm6
+; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2]
+; SSSE3-NEXT: pcmpeqd %xmm9, %xmm1
+; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; SSSE3-NEXT: pand %xmm7, %xmm1
+; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm6[1,1,3,3]
+; SSSE3-NEXT: por %xmm1, %xmm7
+; SSSE3-NEXT: pand %xmm7, %xmm2
+; SSSE3-NEXT: pandn %xmm8, %xmm7
+; SSSE3-NEXT: por %xmm2, %xmm7
+; SSSE3-NEXT: movdqa %xmm3, %xmm1
+; SSSE3-NEXT: pxor %xmm4, %xmm1
+; SSSE3-NEXT: movdqa %xmm1, %xmm2
+; SSSE3-NEXT: pcmpgtd %xmm9, %xmm2
+; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm2[0,0,2,2]
; SSSE3-NEXT: pcmpeqd %xmm9, %xmm1
; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; SSSE3-NEXT: pand %xmm6, %xmm1
-; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
-; SSSE3-NEXT: por %xmm1, %xmm3
-; SSSE3-NEXT: pand %xmm3, %xmm2
-; SSSE3-NEXT: pandn %xmm8, %xmm3
-; SSSE3-NEXT: por %xmm2, %xmm3
+; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
+; SSSE3-NEXT: por %xmm1, %xmm2
+; SSSE3-NEXT: pand %xmm2, %xmm3
+; SSSE3-NEXT: pandn %xmm8, %xmm2
+; SSSE3-NEXT: por %xmm3, %xmm2
; SSSE3-NEXT: pxor %xmm5, %xmm4
; SSSE3-NEXT: movdqa %xmm4, %xmm1
; SSSE3-NEXT: pcmpgtd %xmm9, %xmm1
-; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,0,2,2]
+; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm1[0,0,2,2]
; SSSE3-NEXT: pcmpeqd %xmm9, %xmm4
; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
-; SSSE3-NEXT: pand %xmm2, %xmm4
+; SSSE3-NEXT: pand %xmm3, %xmm4
; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; SSSE3-NEXT: por %xmm4, %xmm1
; SSSE3-NEXT: pand %xmm1, %xmm5
; SSSE3-NEXT: pandn %xmm8, %xmm1
; SSSE3-NEXT: por %xmm5, %xmm1
-; SSSE3-NEXT: packssdw %xmm3, %xmm1
-; SSSE3-NEXT: packssdw %xmm1, %xmm0
+; SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
+; SSSE3-NEXT: pand %xmm3, %xmm1
+; SSSE3-NEXT: pand %xmm3, %xmm2
+; SSSE3-NEXT: packuswb %xmm1, %xmm2
+; SSSE3-NEXT: pand %xmm3, %xmm7
+; SSSE3-NEXT: pand %xmm3, %xmm0
+; SSSE3-NEXT: packuswb %xmm7, %xmm0
+; SSSE3-NEXT: packuswb %xmm2, %xmm0
+; SSSE3-NEXT: packuswb %xmm0, %xmm0
; SSSE3-NEXT: retq
;
; SSE41-LABEL: trunc_ssat_v8i64_v8i8:
; SSE41: # %bb.0:
-; SSE41-NEXT: movdqa %xmm0, %xmm10
+; SSE41-NEXT: movdqa %xmm0, %xmm8
; SSE41-NEXT: movapd {{.*#+}} xmm11 = [127,127]
; SSE41-NEXT: movdqa {{.*#+}} xmm5 = [2147483648,2147483648]
-; SSE41-NEXT: movdqa %xmm2, %xmm0
+; SSE41-NEXT: movdqa %xmm3, %xmm0
; SSE41-NEXT: pxor %xmm5, %xmm0
-; SSE41-NEXT: movdqa {{.*#+}} xmm4 = [2147483775,2147483775]
-; SSE41-NEXT: movdqa %xmm4, %xmm7
+; SSE41-NEXT: movdqa {{.*#+}} xmm6 = [2147483775,2147483775]
+; SSE41-NEXT: movdqa %xmm6, %xmm7
; SSE41-NEXT: pcmpeqd %xmm0, %xmm7
-; SSE41-NEXT: movdqa %xmm4, %xmm6
-; SSE41-NEXT: pcmpgtd %xmm0, %xmm6
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2]
+; SSE41-NEXT: movdqa %xmm6, %xmm4
+; SSE41-NEXT: pcmpgtd %xmm0, %xmm4
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,0,2,2]
; SSE41-NEXT: pand %xmm7, %xmm0
-; SSE41-NEXT: por %xmm6, %xmm0
-; SSE41-NEXT: movapd %xmm11, %xmm8
-; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm8
-; SSE41-NEXT: movdqa %xmm3, %xmm0
-; SSE41-NEXT: pxor %xmm5, %xmm0
-; SSE41-NEXT: movdqa %xmm4, %xmm2
-; SSE41-NEXT: pcmpeqd %xmm0, %xmm2
-; SSE41-NEXT: movdqa %xmm4, %xmm6
-; SSE41-NEXT: pcmpgtd %xmm0, %xmm6
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2]
-; SSE41-NEXT: pand %xmm2, %xmm0
-; SSE41-NEXT: por %xmm6, %xmm0
+; SSE41-NEXT: por %xmm4, %xmm0
; SSE41-NEXT: movapd %xmm11, %xmm9
; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm9
-; SSE41-NEXT: movdqa %xmm10, %xmm0
+; SSE41-NEXT: movdqa %xmm2, %xmm0
; SSE41-NEXT: pxor %xmm5, %xmm0
-; SSE41-NEXT: movdqa %xmm4, %xmm2
+; SSE41-NEXT: movdqa %xmm6, %xmm3
+; SSE41-NEXT: pcmpeqd %xmm0, %xmm3
+; SSE41-NEXT: movdqa %xmm6, %xmm4
+; SSE41-NEXT: pcmpgtd %xmm0, %xmm4
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,0,2,2]
+; SSE41-NEXT: pand %xmm3, %xmm0
+; SSE41-NEXT: por %xmm4, %xmm0
+; SSE41-NEXT: movapd %xmm11, %xmm10
+; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm10
+; SSE41-NEXT: movdqa %xmm1, %xmm0
+; SSE41-NEXT: pxor %xmm5, %xmm0
+; SSE41-NEXT: movdqa %xmm6, %xmm2
; SSE41-NEXT: pcmpeqd %xmm0, %xmm2
-; SSE41-NEXT: movdqa %xmm4, %xmm3
+; SSE41-NEXT: movdqa %xmm6, %xmm3
; SSE41-NEXT: pcmpgtd %xmm0, %xmm3
; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,0,2,2]
; SSE41-NEXT: pand %xmm2, %xmm0
; SSE41-NEXT: por %xmm3, %xmm0
-; SSE41-NEXT: movapd %xmm11, %xmm2
-; SSE41-NEXT: blendvpd %xmm0, %xmm10, %xmm2
-; SSE41-NEXT: movdqa %xmm1, %xmm0
+; SSE41-NEXT: movapd %xmm11, %xmm3
+; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm3
+; SSE41-NEXT: movdqa %xmm8, %xmm0
; SSE41-NEXT: pxor %xmm5, %xmm0
-; SSE41-NEXT: movdqa %xmm4, %xmm3
-; SSE41-NEXT: pcmpeqd %xmm0, %xmm3
-; SSE41-NEXT: pcmpgtd %xmm0, %xmm4
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,0,2,2]
-; SSE41-NEXT: pand %xmm3, %xmm0
-; SSE41-NEXT: por %xmm4, %xmm0
-; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm11
-; SSE41-NEXT: movapd {{.*#+}} xmm3 = [18446744073709551488,18446744073709551488]
+; SSE41-NEXT: movdqa %xmm6, %xmm1
+; SSE41-NEXT: pcmpeqd %xmm0, %xmm1
+; SSE41-NEXT: pcmpgtd %xmm0, %xmm6
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2]
+; SSE41-NEXT: pand %xmm1, %xmm0
+; SSE41-NEXT: por %xmm6, %xmm0
+; SSE41-NEXT: blendvpd %xmm0, %xmm8, %xmm11
+; SSE41-NEXT: movapd {{.*#+}} xmm2 = [18446744073709551488,18446744073709551488]
; SSE41-NEXT: movapd %xmm11, %xmm1
; SSE41-NEXT: xorpd %xmm5, %xmm1
; SSE41-NEXT: movdqa {{.*#+}} xmm4 = [18446744071562067840,18446744071562067840]
; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,0,2,2]
; SSE41-NEXT: pand %xmm6, %xmm0
; SSE41-NEXT: por %xmm1, %xmm0
-; SSE41-NEXT: movapd %xmm3, %xmm6
-; SSE41-NEXT: blendvpd %xmm0, %xmm11, %xmm6
; SSE41-NEXT: movapd %xmm2, %xmm1
-; SSE41-NEXT: xorpd %xmm5, %xmm1
-; SSE41-NEXT: movapd %xmm1, %xmm7
+; SSE41-NEXT: blendvpd %xmm0, %xmm11, %xmm1
+; SSE41-NEXT: movapd %xmm3, %xmm6
+; SSE41-NEXT: xorpd %xmm5, %xmm6
+; SSE41-NEXT: movapd %xmm6, %xmm7
; SSE41-NEXT: pcmpeqd %xmm4, %xmm7
-; SSE41-NEXT: pcmpgtd %xmm4, %xmm1
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,0,2,2]
+; SSE41-NEXT: pcmpgtd %xmm4, %xmm6
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2]
; SSE41-NEXT: pand %xmm7, %xmm0
-; SSE41-NEXT: por %xmm1, %xmm0
-; SSE41-NEXT: movapd %xmm3, %xmm1
-; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm1
-; SSE41-NEXT: packssdw %xmm6, %xmm1
-; SSE41-NEXT: movapd %xmm9, %xmm2
-; SSE41-NEXT: xorpd %xmm5, %xmm2
-; SSE41-NEXT: movapd %xmm2, %xmm6
+; SSE41-NEXT: por %xmm6, %xmm0
+; SSE41-NEXT: movapd %xmm2, %xmm7
+; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm7
+; SSE41-NEXT: movapd %xmm10, %xmm3
+; SSE41-NEXT: xorpd %xmm5, %xmm3
+; SSE41-NEXT: movapd %xmm3, %xmm6
; SSE41-NEXT: pcmpeqd %xmm4, %xmm6
-; SSE41-NEXT: pcmpgtd %xmm4, %xmm2
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,0,2,2]
+; SSE41-NEXT: pcmpgtd %xmm4, %xmm3
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,0,2,2]
; SSE41-NEXT: pand %xmm6, %xmm0
-; SSE41-NEXT: por %xmm2, %xmm0
-; SSE41-NEXT: movapd %xmm3, %xmm2
-; SSE41-NEXT: blendvpd %xmm0, %xmm9, %xmm2
-; SSE41-NEXT: xorpd %xmm8, %xmm5
+; SSE41-NEXT: por %xmm3, %xmm0
+; SSE41-NEXT: movapd %xmm2, %xmm3
+; SSE41-NEXT: blendvpd %xmm0, %xmm10, %xmm3
+; SSE41-NEXT: xorpd %xmm9, %xmm5
; SSE41-NEXT: movapd %xmm5, %xmm6
; SSE41-NEXT: pcmpeqd %xmm4, %xmm6
; SSE41-NEXT: pcmpgtd %xmm4, %xmm5
; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm5[0,0,2,2]
; SSE41-NEXT: pand %xmm6, %xmm0
; SSE41-NEXT: por %xmm5, %xmm0
-; SSE41-NEXT: blendvpd %xmm0, %xmm8, %xmm3
-; SSE41-NEXT: packssdw %xmm2, %xmm3
-; SSE41-NEXT: packssdw %xmm3, %xmm1
+; SSE41-NEXT: blendvpd %xmm0, %xmm9, %xmm2
+; SSE41-NEXT: movapd {{.*#+}} xmm0 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
+; SSE41-NEXT: andpd %xmm0, %xmm2
+; SSE41-NEXT: andpd %xmm0, %xmm3
+; SSE41-NEXT: packusdw %xmm2, %xmm3
+; SSE41-NEXT: andpd %xmm0, %xmm7
+; SSE41-NEXT: andpd %xmm0, %xmm1
+; SSE41-NEXT: packusdw %xmm7, %xmm1
+; SSE41-NEXT: packusdw %xmm3, %xmm1
+; SSE41-NEXT: packuswb %xmm1, %xmm1
; SSE41-NEXT: movdqa %xmm1, %xmm0
; SSE41-NEXT: retq
;
; AVX1-LABEL: trunc_ssat_v8i64_v8i8:
; AVX1: # %bb.0:
-; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [127,127]
-; AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm8
-; AVX1-NEXT: vpcmpgtq %xmm1, %xmm3, %xmm5
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm6
-; AVX1-NEXT: vpcmpgtq %xmm6, %xmm3, %xmm7
-; AVX1-NEXT: vpcmpgtq %xmm0, %xmm3, %xmm4
-; AVX1-NEXT: vblendvpd %xmm4, %xmm0, %xmm3, %xmm0
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [18446744073709551488,18446744073709551488]
-; AVX1-NEXT: vpcmpgtq %xmm4, %xmm0, %xmm9
-; AVX1-NEXT: vblendvpd %xmm7, %xmm6, %xmm3, %xmm6
-; AVX1-NEXT: vpcmpgtq %xmm4, %xmm6, %xmm7
-; AVX1-NEXT: vblendvpd %xmm5, %xmm1, %xmm3, %xmm1
-; AVX1-NEXT: vpcmpgtq %xmm4, %xmm1, %xmm5
-; AVX1-NEXT: vblendvpd %xmm8, %xmm2, %xmm3, %xmm2
-; AVX1-NEXT: vpcmpgtq %xmm4, %xmm2, %xmm3
-; AVX1-NEXT: vblendvpd %xmm3, %xmm2, %xmm4, %xmm2
-; AVX1-NEXT: vblendvpd %xmm5, %xmm1, %xmm4, %xmm1
-; AVX1-NEXT: vpackssdw %xmm2, %xmm1, %xmm1
-; AVX1-NEXT: vblendvpd %xmm7, %xmm6, %xmm4, %xmm2
-; AVX1-NEXT: vblendvpd %xmm9, %xmm0, %xmm4, %xmm0
-; AVX1-NEXT: vpackssdw %xmm2, %xmm0, %xmm0
-; AVX1-NEXT: vpackssdw %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vmovapd {{.*#+}} ymm8 = [127,127,127,127]
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [127,127]
+; AVX1-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm5
+; AVX1-NEXT: vpcmpgtq %xmm1, %xmm4, %xmm6
+; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm6, %ymm7
+; AVX1-NEXT: vblendvpd %ymm7, %ymm1, %ymm8, %ymm9
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT: vpcmpgtq %xmm2, %xmm4, %xmm7
+; AVX1-NEXT: vpcmpgtq %xmm0, %xmm4, %xmm10
+; AVX1-NEXT: vinsertf128 $1, %xmm7, %ymm10, %ymm11
+; AVX1-NEXT: vblendvpd %ymm11, %ymm0, %ymm8, %ymm8
+; AVX1-NEXT: vmovapd {{.*#+}} ymm11 = [18446744073709551488,18446744073709551488,18446744073709551488,18446744073709551488]
+; AVX1-NEXT: vblendvpd %xmm7, %xmm2, %xmm4, %xmm2
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm7 = [18446744073709551488,18446744073709551488]
+; AVX1-NEXT: vpcmpgtq %xmm7, %xmm2, %xmm2
+; AVX1-NEXT: vblendvpd %xmm10, %xmm0, %xmm4, %xmm0
+; AVX1-NEXT: vpcmpgtq %xmm7, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT: vblendvpd %ymm0, %ymm8, %ymm11, %ymm0
+; AVX1-NEXT: vblendvpd %xmm5, %xmm3, %xmm4, %xmm2
+; AVX1-NEXT: vpcmpgtq %xmm7, %xmm2, %xmm2
+; AVX1-NEXT: vblendvpd %xmm6, %xmm1, %xmm4, %xmm1
+; AVX1-NEXT: vpcmpgtq %xmm7, %xmm1, %xmm1
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
+; AVX1-NEXT: vblendvpd %ymm1, %ymm9, %ymm11, %ymm1
+; AVX1-NEXT: vmovapd {{.*#+}} ymm2 = [255,255,255,255]
+; AVX1-NEXT: vandpd %ymm2, %ymm1, %ymm1
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
+; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vandpd %ymm2, %ymm0, %ymm0
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpackuswb %xmm0, %xmm0, %xmm0
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc_ssat_v8i64_v8i8:
; AVX2: # %bb.0:
; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [127,127,127,127]
-; AVX2-NEXT: vpcmpgtq %ymm0, %ymm2, %ymm3
-; AVX2-NEXT: vblendvpd %ymm3, %ymm0, %ymm2, %ymm0
; AVX2-NEXT: vpcmpgtq %ymm1, %ymm2, %ymm3
; AVX2-NEXT: vblendvpd %ymm3, %ymm1, %ymm2, %ymm1
+; AVX2-NEXT: vpcmpgtq %ymm0, %ymm2, %ymm3
+; AVX2-NEXT: vblendvpd %ymm3, %ymm0, %ymm2, %ymm0
; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [18446744073709551488,18446744073709551488,18446744073709551488,18446744073709551488]
-; AVX2-NEXT: vpcmpgtq %ymm2, %ymm1, %ymm3
-; AVX2-NEXT: vblendvpd %ymm3, %ymm1, %ymm2, %ymm1
; AVX2-NEXT: vpcmpgtq %ymm2, %ymm0, %ymm3
; AVX2-NEXT: vblendvpd %ymm3, %ymm0, %ymm2, %ymm0
-; AVX2-NEXT: vpackssdw %ymm1, %ymm0, %ymm0
-; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vpackssdw %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpcmpgtq %ymm2, %ymm1, %ymm3
+; AVX2-NEXT: vblendvpd %ymm3, %ymm1, %ymm2, %ymm1
+; AVX2-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = <u,u,0,8,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-NEXT: vpshufb %xmm3, %xmm2, %xmm2
+; AVX2-NEXT: vpshufb %xmm3, %xmm1, %xmm1
+; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
+; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-NEXT: vpshufb %xmm3, %xmm2, %xmm2
+; AVX2-NEXT: vpshufb %xmm3, %xmm0, %xmm0
+; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512: # %bb.0:
; AVX512-NEXT: vpminsq {{.*}}(%rip){1to8}, %zmm0, %zmm0
; AVX512-NEXT: vpmaxsq {{.*}}(%rip){1to8}, %zmm0, %zmm0
-; AVX512-NEXT: vpmovqw %zmm0, %xmm0
+; AVX512-NEXT: vpmovqb %zmm0, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
%1 = icmp slt <8 x i64> %a0, <i64 127, i64 127, i64 127, i64 127, i64 127, i64 127, i64 127, i64 127>
; AVX2-NEXT: vpcmpgtq %ymm2, %ymm1, %ymm3
; AVX2-NEXT: vblendvpd %ymm3, %ymm1, %ymm2, %ymm1
; AVX2-NEXT: vextractf128 $1, %ymm1, %xmm2
-; AVX2-NEXT: vpackssdw %xmm2, %xmm1, %xmm1
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
-; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1
-; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm3
-; AVX2-NEXT: vpackssdw %xmm3, %xmm0, %xmm0
-; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0
-; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
+; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = <u,u,0,8,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-NEXT: vpshufb %xmm3, %xmm2, %xmm2
+; AVX2-NEXT: vpshufb %xmm3, %xmm1, %xmm1
+; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
+; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-NEXT: vpshufb %xmm3, %xmm2, %xmm2
+; AVX2-NEXT: vpshufb %xmm3, %xmm0, %xmm0
+; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
; AVX2-NEXT: vmovq %xmm0, (%rdi)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
}
define <8 x i8> @trunc_ssat_v8i32_v8i8(<8 x i32> %a0) {
-; SSE2-LABEL: trunc_ssat_v8i32_v8i8:
-; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [127,127,127,127]
-; SSE2-NEXT: movdqa %xmm2, %xmm3
-; SSE2-NEXT: pcmpgtd %xmm0, %xmm3
-; SSE2-NEXT: pand %xmm3, %xmm0
-; SSE2-NEXT: pandn %xmm2, %xmm3
-; SSE2-NEXT: por %xmm0, %xmm3
-; SSE2-NEXT: movdqa %xmm2, %xmm0
-; SSE2-NEXT: pcmpgtd %xmm1, %xmm0
-; SSE2-NEXT: pand %xmm0, %xmm1
-; SSE2-NEXT: pandn %xmm2, %xmm0
-; SSE2-NEXT: por %xmm1, %xmm0
-; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [4294967168,4294967168,4294967168,4294967168]
-; SSE2-NEXT: movdqa %xmm0, %xmm2
-; SSE2-NEXT: pcmpgtd %xmm1, %xmm2
-; SSE2-NEXT: pand %xmm2, %xmm0
-; SSE2-NEXT: pandn %xmm1, %xmm2
-; SSE2-NEXT: por %xmm0, %xmm2
-; SSE2-NEXT: movdqa %xmm3, %xmm0
-; SSE2-NEXT: pcmpgtd %xmm1, %xmm0
-; SSE2-NEXT: pand %xmm0, %xmm3
-; SSE2-NEXT: pandn %xmm1, %xmm0
-; SSE2-NEXT: por %xmm3, %xmm0
-; SSE2-NEXT: packssdw %xmm2, %xmm0
-; SSE2-NEXT: retq
-;
-; SSSE3-LABEL: trunc_ssat_v8i32_v8i8:
-; SSSE3: # %bb.0:
-; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [127,127,127,127]
-; SSSE3-NEXT: movdqa %xmm2, %xmm3
-; SSSE3-NEXT: pcmpgtd %xmm0, %xmm3
-; SSSE3-NEXT: pand %xmm3, %xmm0
-; SSSE3-NEXT: pandn %xmm2, %xmm3
-; SSSE3-NEXT: por %xmm0, %xmm3
-; SSSE3-NEXT: movdqa %xmm2, %xmm0
-; SSSE3-NEXT: pcmpgtd %xmm1, %xmm0
-; SSSE3-NEXT: pand %xmm0, %xmm1
-; SSSE3-NEXT: pandn %xmm2, %xmm0
-; SSSE3-NEXT: por %xmm1, %xmm0
-; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [4294967168,4294967168,4294967168,4294967168]
-; SSSE3-NEXT: movdqa %xmm0, %xmm2
-; SSSE3-NEXT: pcmpgtd %xmm1, %xmm2
-; SSSE3-NEXT: pand %xmm2, %xmm0
-; SSSE3-NEXT: pandn %xmm1, %xmm2
-; SSSE3-NEXT: por %xmm0, %xmm2
-; SSSE3-NEXT: movdqa %xmm3, %xmm0
-; SSSE3-NEXT: pcmpgtd %xmm1, %xmm0
-; SSSE3-NEXT: pand %xmm0, %xmm3
-; SSSE3-NEXT: pandn %xmm1, %xmm0
-; SSSE3-NEXT: por %xmm3, %xmm0
-; SSSE3-NEXT: packssdw %xmm2, %xmm0
-; SSSE3-NEXT: retq
-;
-; SSE41-LABEL: trunc_ssat_v8i32_v8i8:
-; SSE41: # %bb.0:
-; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [127,127,127,127]
-; SSE41-NEXT: pminsd %xmm2, %xmm0
-; SSE41-NEXT: pminsd %xmm2, %xmm1
-; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [4294967168,4294967168,4294967168,4294967168]
-; SSE41-NEXT: pmaxsd %xmm2, %xmm1
-; SSE41-NEXT: pmaxsd %xmm2, %xmm0
-; SSE41-NEXT: packssdw %xmm1, %xmm0
-; SSE41-NEXT: retq
+; SSE-LABEL: trunc_ssat_v8i32_v8i8:
+; SSE: # %bb.0:
+; SSE-NEXT: packssdw %xmm1, %xmm0
+; SSE-NEXT: packsswb %xmm0, %xmm0
+; SSE-NEXT: retq
;
; AVX1-LABEL: trunc_ssat_v8i32_v8i8:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [127,127,127,127]
-; AVX1-NEXT: vpminsd %xmm1, %xmm0, %xmm2
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
-; AVX1-NEXT: vpminsd %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [4294967168,4294967168,4294967168,4294967168]
-; AVX1-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpmaxsd %xmm1, %xmm2, %xmm1
-; AVX1-NEXT: vpackssdw %xmm0, %xmm1, %xmm0
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT: vpackssdw %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpacksswb %xmm0, %xmm0, %xmm0
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc_ssat_v8i32_v8i8:
; AVX2: # %bb.0:
-; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm1 = [127,127,127,127,127,127,127,127]
-; AVX2-NEXT: vpminsd %ymm1, %ymm0, %ymm0
-; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm1 = [4294967168,4294967168,4294967168,4294967168,4294967168,4294967168,4294967168,4294967168]
-; AVX2-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpackssdw %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpacksswb %xmm0, %xmm0, %xmm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512F-NEXT: vpminsd %ymm1, %ymm0, %ymm0
; AVX512F-NEXT: vpbroadcastd {{.*#+}} ymm1 = [4294967168,4294967168,4294967168,4294967168,4294967168,4294967168,4294967168,4294967168]
; AVX512F-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0
-; AVX512F-NEXT: vpmovdw %zmm0, %ymm0
-; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpminsd {{.*}}(%rip){1to8}, %ymm0, %ymm0
; AVX512VL-NEXT: vpmaxsd {{.*}}(%rip){1to8}, %ymm0, %ymm0
-; AVX512VL-NEXT: vpmovdw %ymm0, %xmm0
+; AVX512VL-NEXT: vpmovdb %ymm0, %xmm0
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512BW-NEXT: vpminsd %ymm1, %ymm0, %ymm0
; AVX512BW-NEXT: vpbroadcastd {{.*#+}} ymm1 = [4294967168,4294967168,4294967168,4294967168,4294967168,4294967168,4294967168,4294967168]
; AVX512BW-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0
-; AVX512BW-NEXT: vpmovdw %zmm0, %ymm0
-; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX512BW-NEXT: vpmovdb %zmm0, %xmm0
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512BWVL: # %bb.0:
; AVX512BWVL-NEXT: vpminsd {{.*}}(%rip){1to8}, %ymm0, %ymm0
; AVX512BWVL-NEXT: vpmaxsd {{.*}}(%rip){1to8}, %ymm0, %ymm0
-; AVX512BWVL-NEXT: vpmovdw %ymm0, %xmm0
+; AVX512BWVL-NEXT: vpmovdb %ymm0, %xmm0
; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
%1 = icmp slt <8 x i32> %a0, <i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127>
; AVX512F-NEXT: vpminsd %ymm1, %ymm0, %ymm0
; AVX512F-NEXT: vpbroadcastd {{.*#+}} ymm1 = [4294967168,4294967168,4294967168,4294967168,4294967168,4294967168,4294967168,4294967168]
; AVX512F-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0
-; AVX512F-NEXT: vpmovdw %zmm0, %ymm0
-; AVX512F-NEXT: vpacksswb %xmm0, %xmm0, %xmm0
+; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
; AVX512F-NEXT: vmovq %xmm0, (%rdi)
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
; AVX512BW-NEXT: vpminsd %ymm1, %ymm0, %ymm0
; AVX512BW-NEXT: vpbroadcastd {{.*#+}} ymm1 = [4294967168,4294967168,4294967168,4294967168,4294967168,4294967168,4294967168,4294967168]
; AVX512BW-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0
-; AVX512BW-NEXT: vpmovdw %zmm0, %ymm0
-; AVX512BW-NEXT: vpacksswb %xmm0, %xmm0, %xmm0
+; AVX512BW-NEXT: vpmovdb %zmm0, %xmm0
; AVX512BW-NEXT: vmovq %xmm0, (%rdi)
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
define <8 x i8> @trunc_usat_v8i64_v8i8(<8 x i64> %a0) {
; SSE2-LABEL: trunc_usat_v8i64_v8i8:
; SSE2: # %bb.0:
+; SSE2-NEXT: movdqa %xmm0, %xmm4
; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [255,255]
-; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [9223372039002259456,9223372039002259456]
-; SSE2-NEXT: movdqa %xmm1, %xmm7
-; SSE2-NEXT: pxor %xmm5, %xmm7
+; SSE2-NEXT: movdqa {{.*#+}} xmm6 = [9223372039002259456,9223372039002259456]
+; SSE2-NEXT: movdqa %xmm1, %xmm0
+; SSE2-NEXT: pxor %xmm6, %xmm0
; SSE2-NEXT: movdqa {{.*#+}} xmm9 = [9223372039002259711,9223372039002259711]
-; SSE2-NEXT: movdqa %xmm9, %xmm6
-; SSE2-NEXT: pcmpgtd %xmm7, %xmm6
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm6[0,0,2,2]
-; SSE2-NEXT: pcmpeqd %xmm9, %xmm7
-; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm7[1,1,3,3]
-; SSE2-NEXT: pand %xmm4, %xmm7
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm6[1,1,3,3]
-; SSE2-NEXT: por %xmm7, %xmm4
-; SSE2-NEXT: pand %xmm4, %xmm1
-; SSE2-NEXT: pandn %xmm8, %xmm4
-; SSE2-NEXT: por %xmm1, %xmm4
-; SSE2-NEXT: movdqa %xmm0, %xmm1
-; SSE2-NEXT: pxor %xmm5, %xmm1
-; SSE2-NEXT: movdqa %xmm9, %xmm6
-; SSE2-NEXT: pcmpgtd %xmm1, %xmm6
-; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2]
-; SSE2-NEXT: pcmpeqd %xmm9, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSE2-NEXT: pand %xmm7, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3]
-; SSE2-NEXT: por %xmm1, %xmm6
-; SSE2-NEXT: pand %xmm6, %xmm0
-; SSE2-NEXT: pandn %xmm8, %xmm6
-; SSE2-NEXT: por %xmm6, %xmm0
-; SSE2-NEXT: packuswb %xmm4, %xmm0
+; SSE2-NEXT: movdqa %xmm9, %xmm7
+; SSE2-NEXT: pcmpgtd %xmm0, %xmm7
+; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm7[0,0,2,2]
+; SSE2-NEXT: pcmpeqd %xmm9, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE2-NEXT: pand %xmm5, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm7[1,1,3,3]
+; SSE2-NEXT: por %xmm0, %xmm5
+; SSE2-NEXT: pand %xmm5, %xmm1
+; SSE2-NEXT: pandn %xmm8, %xmm5
+; SSE2-NEXT: por %xmm1, %xmm5
+; SSE2-NEXT: movdqa %xmm4, %xmm0
+; SSE2-NEXT: pxor %xmm6, %xmm0
+; SSE2-NEXT: movdqa %xmm9, %xmm1
+; SSE2-NEXT: pcmpgtd %xmm0, %xmm1
+; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm1[0,0,2,2]
+; SSE2-NEXT: pcmpeqd %xmm9, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm0[1,1,3,3]
+; SSE2-NEXT: pand %xmm10, %xmm7
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3]
+; SSE2-NEXT: por %xmm7, %xmm0
+; SSE2-NEXT: pand %xmm0, %xmm4
+; SSE2-NEXT: pandn %xmm8, %xmm0
+; SSE2-NEXT: por %xmm4, %xmm0
+; SSE2-NEXT: packuswb %xmm5, %xmm0
; SSE2-NEXT: movdqa %xmm3, %xmm1
-; SSE2-NEXT: pxor %xmm5, %xmm1
+; SSE2-NEXT: pxor %xmm6, %xmm1
; SSE2-NEXT: movdqa %xmm9, %xmm4
; SSE2-NEXT: pcmpgtd %xmm1, %xmm4
-; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2]
+; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
; SSE2-NEXT: pcmpeqd %xmm9, %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSE2-NEXT: pand %xmm6, %xmm1
+; SSE2-NEXT: pand %xmm5, %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
; SSE2-NEXT: por %xmm1, %xmm4
; SSE2-NEXT: pand %xmm4, %xmm3
; SSE2-NEXT: pandn %xmm8, %xmm4
; SSE2-NEXT: por %xmm3, %xmm4
-; SSE2-NEXT: pxor %xmm2, %xmm5
+; SSE2-NEXT: pxor %xmm2, %xmm6
; SSE2-NEXT: movdqa %xmm9, %xmm1
-; SSE2-NEXT: pcmpgtd %xmm5, %xmm1
+; SSE2-NEXT: pcmpgtd %xmm6, %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[0,0,2,2]
-; SSE2-NEXT: pcmpeqd %xmm9, %xmm5
-; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
+; SSE2-NEXT: pcmpeqd %xmm9, %xmm6
+; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm6[1,1,3,3]
; SSE2-NEXT: pand %xmm3, %xmm5
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; SSE2-NEXT: por %xmm5, %xmm1
; SSE2-NEXT: por %xmm2, %xmm1
; SSE2-NEXT: packuswb %xmm4, %xmm1
; SSE2-NEXT: packuswb %xmm1, %xmm0
+; SSE2-NEXT: packuswb %xmm0, %xmm0
; SSE2-NEXT: retq
;
; SSSE3-LABEL: trunc_usat_v8i64_v8i8:
; SSSE3: # %bb.0:
+; SSSE3-NEXT: movdqa %xmm0, %xmm4
; SSSE3-NEXT: movdqa {{.*#+}} xmm8 = [255,255]
-; SSSE3-NEXT: movdqa {{.*#+}} xmm5 = [9223372039002259456,9223372039002259456]
-; SSSE3-NEXT: movdqa %xmm1, %xmm7
-; SSSE3-NEXT: pxor %xmm5, %xmm7
+; SSSE3-NEXT: movdqa {{.*#+}} xmm6 = [9223372039002259456,9223372039002259456]
+; SSSE3-NEXT: movdqa %xmm1, %xmm0
+; SSSE3-NEXT: pxor %xmm6, %xmm0
; SSSE3-NEXT: movdqa {{.*#+}} xmm9 = [9223372039002259711,9223372039002259711]
-; SSSE3-NEXT: movdqa %xmm9, %xmm6
-; SSSE3-NEXT: pcmpgtd %xmm7, %xmm6
-; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm6[0,0,2,2]
-; SSSE3-NEXT: pcmpeqd %xmm9, %xmm7
-; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm7[1,1,3,3]
-; SSSE3-NEXT: pand %xmm4, %xmm7
-; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm6[1,1,3,3]
-; SSSE3-NEXT: por %xmm7, %xmm4
-; SSSE3-NEXT: pand %xmm4, %xmm1
-; SSSE3-NEXT: pandn %xmm8, %xmm4
-; SSSE3-NEXT: por %xmm1, %xmm4
-; SSSE3-NEXT: movdqa %xmm0, %xmm1
-; SSSE3-NEXT: pxor %xmm5, %xmm1
-; SSSE3-NEXT: movdqa %xmm9, %xmm6
-; SSSE3-NEXT: pcmpgtd %xmm1, %xmm6
-; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2]
-; SSSE3-NEXT: pcmpeqd %xmm9, %xmm1
-; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSSE3-NEXT: pand %xmm7, %xmm1
-; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3]
-; SSSE3-NEXT: por %xmm1, %xmm6
-; SSSE3-NEXT: pand %xmm6, %xmm0
-; SSSE3-NEXT: pandn %xmm8, %xmm6
-; SSSE3-NEXT: por %xmm6, %xmm0
-; SSSE3-NEXT: packuswb %xmm4, %xmm0
+; SSSE3-NEXT: movdqa %xmm9, %xmm7
+; SSSE3-NEXT: pcmpgtd %xmm0, %xmm7
+; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm7[0,0,2,2]
+; SSSE3-NEXT: pcmpeqd %xmm9, %xmm0
+; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSSE3-NEXT: pand %xmm5, %xmm0
+; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm7[1,1,3,3]
+; SSSE3-NEXT: por %xmm0, %xmm5
+; SSSE3-NEXT: pand %xmm5, %xmm1
+; SSSE3-NEXT: pandn %xmm8, %xmm5
+; SSSE3-NEXT: por %xmm1, %xmm5
+; SSSE3-NEXT: movdqa %xmm4, %xmm0
+; SSSE3-NEXT: pxor %xmm6, %xmm0
+; SSSE3-NEXT: movdqa %xmm9, %xmm1
+; SSSE3-NEXT: pcmpgtd %xmm0, %xmm1
+; SSSE3-NEXT: pshufd {{.*#+}} xmm10 = xmm1[0,0,2,2]
+; SSSE3-NEXT: pcmpeqd %xmm9, %xmm0
+; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm0[1,1,3,3]
+; SSSE3-NEXT: pand %xmm10, %xmm7
+; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3]
+; SSSE3-NEXT: por %xmm7, %xmm0
+; SSSE3-NEXT: pand %xmm0, %xmm4
+; SSSE3-NEXT: pandn %xmm8, %xmm0
+; SSSE3-NEXT: por %xmm4, %xmm0
+; SSSE3-NEXT: packuswb %xmm5, %xmm0
; SSSE3-NEXT: movdqa %xmm3, %xmm1
-; SSSE3-NEXT: pxor %xmm5, %xmm1
+; SSSE3-NEXT: pxor %xmm6, %xmm1
; SSSE3-NEXT: movdqa %xmm9, %xmm4
; SSSE3-NEXT: pcmpgtd %xmm1, %xmm4
-; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2]
+; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
; SSSE3-NEXT: pcmpeqd %xmm9, %xmm1
; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSSE3-NEXT: pand %xmm6, %xmm1
+; SSSE3-NEXT: pand %xmm5, %xmm1
; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
; SSSE3-NEXT: por %xmm1, %xmm4
; SSSE3-NEXT: pand %xmm4, %xmm3
; SSSE3-NEXT: pandn %xmm8, %xmm4
; SSSE3-NEXT: por %xmm3, %xmm4
-; SSSE3-NEXT: pxor %xmm2, %xmm5
+; SSSE3-NEXT: pxor %xmm2, %xmm6
; SSSE3-NEXT: movdqa %xmm9, %xmm1
-; SSSE3-NEXT: pcmpgtd %xmm5, %xmm1
+; SSSE3-NEXT: pcmpgtd %xmm6, %xmm1
; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm1[0,0,2,2]
-; SSSE3-NEXT: pcmpeqd %xmm9, %xmm5
-; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
+; SSSE3-NEXT: pcmpeqd %xmm9, %xmm6
+; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm6[1,1,3,3]
; SSSE3-NEXT: pand %xmm3, %xmm5
; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; SSSE3-NEXT: por %xmm5, %xmm1
; SSSE3-NEXT: por %xmm2, %xmm1
; SSSE3-NEXT: packuswb %xmm4, %xmm1
; SSSE3-NEXT: packuswb %xmm1, %xmm0
+; SSSE3-NEXT: packuswb %xmm0, %xmm0
; SSSE3-NEXT: retq
;
; SSE41-LABEL: trunc_usat_v8i64_v8i8:
; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm9
; SSE41-NEXT: packusdw %xmm5, %xmm9
; SSE41-NEXT: packusdw %xmm9, %xmm1
+; SSE41-NEXT: packuswb %xmm1, %xmm1
; SSE41-NEXT: movdqa %xmm1, %xmm0
; SSE41-NEXT: retq
;
; AVX1-NEXT: vblendvpd %xmm8, %xmm0, %xmm4, %xmm0
; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpackuswb %xmm0, %xmm0, %xmm0
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2: # %bb.0:
; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm2 = [255,255,255,255]
; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm3 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808]
-; AVX2-NEXT: vpxor %ymm3, %ymm1, %ymm4
+; AVX2-NEXT: vpxor %ymm3, %ymm0, %ymm4
; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm5 = [9223372036854776063,9223372036854776063,9223372036854776063,9223372036854776063]
; AVX2-NEXT: vpcmpgtq %ymm4, %ymm5, %ymm4
-; AVX2-NEXT: vblendvpd %ymm4, %ymm1, %ymm2, %ymm1
-; AVX2-NEXT: vpxor %ymm3, %ymm0, %ymm3
+; AVX2-NEXT: vblendvpd %ymm4, %ymm0, %ymm2, %ymm0
+; AVX2-NEXT: vpxor %ymm3, %ymm1, %ymm3
; AVX2-NEXT: vpcmpgtq %ymm3, %ymm5, %ymm3
-; AVX2-NEXT: vblendvpd %ymm3, %ymm0, %ymm2, %ymm0
-; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0
-; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vblendvpd %ymm3, %ymm1, %ymm2, %ymm1
+; AVX2-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = <u,u,0,8,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-NEXT: vpshufb %xmm3, %xmm2, %xmm2
+; AVX2-NEXT: vpshufb %xmm3, %xmm1, %xmm1
+; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
+; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-NEXT: vpshufb %xmm3, %xmm2, %xmm2
+; AVX2-NEXT: vpshufb %xmm3, %xmm0, %xmm0
+; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: trunc_usat_v8i64_v8i8:
; AVX512: # %bb.0:
; AVX512-NEXT: vpminuq {{.*}}(%rip){1to8}, %zmm0, %zmm0
-; AVX512-NEXT: vpmovqw %zmm0, %xmm0
+; AVX512-NEXT: vpmovqb %zmm0, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
%1 = icmp ult <8 x i64> %a0, <i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255>
; AVX2-NEXT: vpcmpgtq %ymm3, %ymm5, %ymm3
; AVX2-NEXT: vblendvpd %ymm3, %ymm1, %ymm2, %ymm1
; AVX2-NEXT: vextractf128 $1, %ymm1, %xmm2
-; AVX2-NEXT: vpackusdw %xmm2, %xmm1, %xmm1
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
-; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1
-; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm3
-; AVX2-NEXT: vpackusdw %xmm3, %xmm0, %xmm0
-; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0
-; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
+; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = <u,u,0,8,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-NEXT: vpshufb %xmm3, %xmm2, %xmm2
+; AVX2-NEXT: vpshufb %xmm3, %xmm1, %xmm1
+; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
+; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-NEXT: vpshufb %xmm3, %xmm2, %xmm2
+; AVX2-NEXT: vpshufb %xmm3, %xmm0, %xmm0
+; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
; AVX2-NEXT: vmovq %xmm0, (%rdi)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
define <8 x i8> @trunc_usat_v8i32_v8i8(<8 x i32> %a0) {
; SSE2-LABEL: trunc_usat_v8i32_v8i8:
; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255]
-; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [2147483648,2147483648,2147483648,2147483648]
-; SSE2-NEXT: movdqa %xmm1, %xmm4
-; SSE2-NEXT: pxor %xmm3, %xmm4
-; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [2147483903,2147483903,2147483903,2147483903]
-; SSE2-NEXT: movdqa %xmm5, %xmm6
-; SSE2-NEXT: pcmpgtd %xmm4, %xmm6
+; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [255,255,255,255]
+; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [2147483648,2147483648,2147483648,2147483648]
+; SSE2-NEXT: movdqa %xmm1, %xmm5
+; SSE2-NEXT: pxor %xmm4, %xmm5
+; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483903,2147483903,2147483903,2147483903]
+; SSE2-NEXT: movdqa %xmm2, %xmm6
+; SSE2-NEXT: pcmpgtd %xmm5, %xmm6
; SSE2-NEXT: pand %xmm6, %xmm1
-; SSE2-NEXT: pandn %xmm2, %xmm6
+; SSE2-NEXT: pandn %xmm3, %xmm6
; SSE2-NEXT: por %xmm1, %xmm6
-; SSE2-NEXT: pxor %xmm0, %xmm3
-; SSE2-NEXT: pcmpgtd %xmm3, %xmm5
-; SSE2-NEXT: pand %xmm5, %xmm0
-; SSE2-NEXT: pandn %xmm2, %xmm5
-; SSE2-NEXT: por %xmm5, %xmm0
-; SSE2-NEXT: packuswb %xmm6, %xmm0
+; SSE2-NEXT: pxor %xmm0, %xmm4
+; SSE2-NEXT: pcmpgtd %xmm4, %xmm2
+; SSE2-NEXT: pand %xmm2, %xmm0
+; SSE2-NEXT: pandn %xmm3, %xmm2
+; SSE2-NEXT: por %xmm0, %xmm2
+; SSE2-NEXT: packuswb %xmm6, %xmm2
+; SSE2-NEXT: packuswb %xmm2, %xmm2
+; SSE2-NEXT: movdqa %xmm2, %xmm0
; SSE2-NEXT: retq
;
; SSSE3-LABEL: trunc_usat_v8i32_v8i8:
; SSSE3: # %bb.0:
; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255]
; SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [2147483648,2147483648,2147483648,2147483648]
-; SSSE3-NEXT: movdqa %xmm1, %xmm4
+; SSSE3-NEXT: movdqa %xmm0, %xmm4
; SSSE3-NEXT: pxor %xmm3, %xmm4
; SSSE3-NEXT: movdqa {{.*#+}} xmm5 = [2147483903,2147483903,2147483903,2147483903]
; SSSE3-NEXT: movdqa %xmm5, %xmm6
; SSSE3-NEXT: pcmpgtd %xmm4, %xmm6
-; SSSE3-NEXT: pand %xmm6, %xmm1
+; SSSE3-NEXT: pand %xmm6, %xmm0
; SSSE3-NEXT: pandn %xmm2, %xmm6
-; SSSE3-NEXT: por %xmm1, %xmm6
-; SSSE3-NEXT: pxor %xmm0, %xmm3
+; SSSE3-NEXT: por %xmm6, %xmm0
+; SSSE3-NEXT: pxor %xmm1, %xmm3
; SSSE3-NEXT: pcmpgtd %xmm3, %xmm5
-; SSSE3-NEXT: pand %xmm5, %xmm0
+; SSSE3-NEXT: pand %xmm5, %xmm1
; SSSE3-NEXT: pandn %xmm2, %xmm5
-; SSSE3-NEXT: por %xmm5, %xmm0
-; SSSE3-NEXT: packuswb %xmm6, %xmm0
+; SSSE3-NEXT: por %xmm1, %xmm5
+; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
+; SSSE3-NEXT: pshufb %xmm1, %xmm5
+; SSSE3-NEXT: pshufb %xmm1, %xmm0
+; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1]
; SSSE3-NEXT: retq
;
; SSE41-LABEL: trunc_usat_v8i32_v8i8:
; SSE41: # %bb.0:
; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255]
-; SSE41-NEXT: pminud %xmm2, %xmm1
; SSE41-NEXT: pminud %xmm2, %xmm0
-; SSE41-NEXT: packusdw %xmm1, %xmm0
+; SSE41-NEXT: pminud %xmm2, %xmm1
+; SSE41-NEXT: movdqa {{.*#+}} xmm2 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
+; SSE41-NEXT: pshufb %xmm2, %xmm1
+; SSE41-NEXT: pshufb %xmm2, %xmm0
+; SSE41-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE41-NEXT: retq
;
; AVX1-LABEL: trunc_usat_v8i32_v8i8:
; AVX1: # %bb.0:
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [255,255,255,255]
-; AVX1-NEXT: vpminud %xmm2, %xmm1, %xmm1
-; AVX1-NEXT: vpminud %xmm2, %xmm0, %xmm0
-; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [255,255,255,255]
+; AVX1-NEXT: vpminud %xmm1, %xmm0, %xmm2
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT: vpminud %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpshufb %xmm1, %xmm2, %xmm1
+; AVX1-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm1 = [255,255,255,255,255,255,255,255]
; AVX2-NEXT: vpminud %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512F: # %bb.0:
; AVX512F-NEXT: vpbroadcastd {{.*#+}} ymm1 = [255,255,255,255,255,255,255,255]
; AVX512F-NEXT: vpminud %ymm1, %ymm0, %ymm0
-; AVX512F-NEXT: vpmovdw %zmm0, %ymm0
-; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: trunc_usat_v8i32_v8i8:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpminud {{.*}}(%rip){1to8}, %ymm0, %ymm0
-; AVX512VL-NEXT: vpmovdw %ymm0, %xmm0
+; AVX512VL-NEXT: vpmovdb %ymm0, %xmm0
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpbroadcastd {{.*#+}} ymm1 = [255,255,255,255,255,255,255,255]
; AVX512BW-NEXT: vpminud %ymm1, %ymm0, %ymm0
-; AVX512BW-NEXT: vpmovdw %zmm0, %ymm0
-; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX512BW-NEXT: vpmovdb %zmm0, %xmm0
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: trunc_usat_v8i32_v8i8:
; AVX512BWVL: # %bb.0:
; AVX512BWVL-NEXT: vpminud {{.*}}(%rip){1to8}, %ymm0, %ymm0
-; AVX512BWVL-NEXT: vpmovdw %ymm0, %xmm0
+; AVX512BWVL-NEXT: vpmovdb %ymm0, %xmm0
; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
%1 = icmp ult <8 x i32> %a0, <i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255>
; SSSE3-NEXT: pand %xmm5, %xmm1
; SSSE3-NEXT: pandn %xmm2, %xmm5
; SSSE3-NEXT: por %xmm1, %xmm5
-; SSSE3-NEXT: movdqa {{.*#+}} xmm0 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; SSSE3-NEXT: movdqa {{.*#+}} xmm0 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
; SSSE3-NEXT: pshufb %xmm0, %xmm5
; SSSE3-NEXT: pshufb %xmm0, %xmm6
-; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm6 = xmm6[0],xmm5[0]
-; SSSE3-NEXT: pshufb {{.*#+}} xmm6 = xmm6[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
+; SSSE3-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm5[0],xmm6[1],xmm5[1]
; SSSE3-NEXT: movq %xmm6, (%rdi)
; SSSE3-NEXT: retq
;
; SSE41-LABEL: trunc_usat_v8i32_v8i8_store:
; SSE41: # %bb.0:
; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255]
-; SSE41-NEXT: pminud %xmm2, %xmm1
; SSE41-NEXT: pminud %xmm2, %xmm0
-; SSE41-NEXT: packusdw %xmm1, %xmm0
-; SSE41-NEXT: packuswb %xmm0, %xmm0
+; SSE41-NEXT: pminud %xmm2, %xmm1
+; SSE41-NEXT: movdqa {{.*#+}} xmm2 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
+; SSE41-NEXT: pshufb %xmm2, %xmm1
+; SSE41-NEXT: pshufb %xmm2, %xmm0
+; SSE41-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE41-NEXT: movq %xmm0, (%rdi)
; SSE41-NEXT: retq
;
; AVX1-LABEL: trunc_usat_v8i32_v8i8_store:
; AVX1: # %bb.0:
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [255,255,255,255]
-; AVX1-NEXT: vpminud %xmm2, %xmm1, %xmm1
-; AVX1-NEXT: vpminud %xmm2, %xmm0, %xmm0
-; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpackuswb %xmm0, %xmm0, %xmm0
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [255,255,255,255]
+; AVX1-NEXT: vpminud %xmm1, %xmm0, %xmm2
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT: vpminud %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpshufb %xmm1, %xmm2, %xmm1
+; AVX1-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; AVX1-NEXT: vmovq %xmm0, (%rdi)
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm1 = [255,255,255,255,255,255,255,255]
; AVX2-NEXT: vpminud %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpackuswb %xmm0, %xmm0, %xmm0
+; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX2-NEXT: vmovq %xmm0, (%rdi)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
; AVX512F: # %bb.0:
; AVX512F-NEXT: vpbroadcastd {{.*#+}} ymm1 = [255,255,255,255,255,255,255,255]
; AVX512F-NEXT: vpminud %ymm1, %ymm0, %ymm0
-; AVX512F-NEXT: vpmovdw %zmm0, %ymm0
-; AVX512F-NEXT: vpackuswb %xmm0, %xmm0, %xmm0
+; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
; AVX512F-NEXT: vmovq %xmm0, (%rdi)
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpbroadcastd {{.*#+}} ymm1 = [255,255,255,255,255,255,255,255]
; AVX512BW-NEXT: vpminud %ymm1, %ymm0, %ymm0
-; AVX512BW-NEXT: vpmovdw %zmm0, %ymm0
-; AVX512BW-NEXT: vpackuswb %xmm0, %xmm0, %xmm0
+; AVX512BW-NEXT: vpmovdb %zmm0, %xmm0
; AVX512BW-NEXT: vmovq %xmm0, (%rdi)
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
-; AVX2-SLOW-LABEL: trunc8i64_8i8:
-; AVX2-SLOW: # %bb.0: # %entry
-; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm2
-; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2]
-; AVX2-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm2
-; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2]
-; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm2 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm1, %xmm1
-; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm0, %xmm0
-; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; AVX2-SLOW-NEXT: vmovq %xmm0, (%rax)
-; AVX2-SLOW-NEXT: vzeroupper
-; AVX2-SLOW-NEXT: retq
-;
-; AVX2-FAST-LABEL: trunc8i64_8i8:
-; AVX2-FAST: # %bb.0: # %entry
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,4,6,4,6,6,7]
-; AVX2-FAST-NEXT: vpermd %ymm0, %ymm2, %ymm0
-; AVX2-FAST-NEXT: vpermd %ymm1, %ymm2, %ymm1
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm1, %xmm1
-; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm0, %xmm0
-; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; AVX2-FAST-NEXT: vmovq %xmm0, (%rax)
-; AVX2-FAST-NEXT: vzeroupper
-; AVX2-FAST-NEXT: retq
+; AVX2-LABEL: trunc8i64_8i8:
+; AVX2: # %bb.0: # %entry
+; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = <u,u,0,8,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-NEXT: vpshufb %xmm3, %xmm2, %xmm2
+; AVX2-NEXT: vpshufb %xmm3, %xmm1, %xmm1
+; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2
+; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-NEXT: vpshufb %xmm3, %xmm2, %xmm2
+; AVX2-NEXT: vpshufb %xmm3, %xmm0, %xmm0
+; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
+; AVX2-NEXT: vmovq %xmm0, (%rax)
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
;
; AVX512-LABEL: trunc8i64_8i8:
; AVX512: # %bb.0: # %entry
;
; AVX2-LABEL: trunc8i32_8i8:
; AVX2: # %bb.0: # %entry
-; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
-; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX2-NEXT: vmovq %xmm0, (%rax)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
; AVX512F-LABEL: trunc8i32_8i8:
; AVX512F: # %bb.0: # %entry
; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
-; AVX512F-NEXT: vpmovdw %zmm0, %ymm0
-; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
+; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
; AVX512F-NEXT: vmovq %xmm0, (%rax)
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
; AVX512BW-LABEL: trunc8i32_8i8:
; AVX512BW: # %bb.0: # %entry
; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
-; AVX512BW-NEXT: vpmovdw %zmm0, %ymm0
-; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
+; AVX512BW-NEXT: vpmovdb %zmm0, %xmm0
; AVX512BW-NEXT: vmovq %xmm0, (%rax)
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX1-LABEL: trunc2x4i64_8i16:
; AVX1: # %bb.0: # %entry
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
-; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2]
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
-; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2]
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
-; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
-; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
-; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,0,2,4,5,6,7]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,0,2,4,5,6,7]
+; AVX1-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,2,2,3,4,5,6,7]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
+; AVX1-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-SLOW-LABEL: trunc2x4i64_8i16:
; AVX2-SLOW: # %bb.0: # %entry
-; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm2
-; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2]
-; AVX2-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm2
-; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2]
-; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
-; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm1, %xmm1
-; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm0, %xmm0
-; AVX2-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
+; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,0,2,4,5,6,7]
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,0,2,4,5,6,7]
+; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
+; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm2
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
+; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,2,2,3,4,5,6,7]
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
+; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
; AVX2-SLOW-NEXT: vzeroupper
; AVX2-SLOW-NEXT: retq
;
; AVX2-FAST-LABEL: trunc2x4i64_8i16:
; AVX2-FAST: # %bb.0: # %entry
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,4,6,4,6,6,7]
-; AVX2-FAST-NEXT: vpermd %ymm0, %ymm2, %ymm0
-; AVX2-FAST-NEXT: vpermd %ymm1, %ymm2, %ymm1
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
-; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm1, %xmm1
-; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm0, %xmm0
-; AVX2-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX2-FAST-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,2,3,0,1,8,9,8,9,10,11,12,13,14,15]
+; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm2, %xmm2
+; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm1, %xmm1
+; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
+; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm2
+; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,8,9,8,9,10,11,8,9,10,11,12,13,14,15]
+; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm2, %xmm2
+; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm0, %xmm0
+; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
; AVX2-FAST-NEXT: vzeroupper
; AVX2-FAST-NEXT: retq
;
; AVX512F: # %bb.0: # %entry
; AVX512F-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1
; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
-; AVX512F-NEXT: vpmovqd %zmm0, %ymm0
-; AVX512F-NEXT: vpmovqd %zmm1, %ymm1
-; AVX512F-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
-; AVX512F-NEXT: vpshufb %xmm2, %xmm1, %xmm1
-; AVX512F-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX512F-NEXT: vpmovqw %zmm0, %xmm0
+; AVX512F-NEXT: vpmovqw %zmm1, %xmm1
; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: trunc2x4i64_8i16:
; AVX512VL: # %bb.0: # %entry
-; AVX512VL-NEXT: vpmovqd %ymm0, %xmm0
-; AVX512VL-NEXT: vpmovqd %ymm1, %xmm1
-; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
-; AVX512VL-NEXT: vpshufb %xmm2, %xmm1, %xmm1
-; AVX512VL-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX512VL-NEXT: vpmovqw %ymm0, %xmm0
+; AVX512VL-NEXT: vpmovqw %ymm1, %xmm1
; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
; AVX512BW: # %bb.0: # %entry
; AVX512BW-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1
; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
-; AVX512BW-NEXT: vpmovqd %zmm0, %ymm0
-; AVX512BW-NEXT: vpmovqd %zmm1, %ymm1
-; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
-; AVX512BW-NEXT: vpshufb %xmm2, %xmm1, %xmm1
-; AVX512BW-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX512BW-NEXT: vpmovqw %zmm0, %xmm0
+; AVX512BW-NEXT: vpmovqw %zmm1, %xmm1
; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: trunc2x4i64_8i16:
; AVX512BWVL: # %bb.0: # %entry
-; AVX512BWVL-NEXT: vpmovqd %ymm0, %xmm2
-; AVX512BWVL-NEXT: vpmovqd %ymm1, %xmm1
-; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm0 = [0,2,4,6,8,10,12,14]
-; AVX512BWVL-NEXT: vpermi2w %xmm1, %xmm2, %xmm0
+; AVX512BWVL-NEXT: vpmovqw %ymm0, %xmm0
+; AVX512BWVL-NEXT: vpmovqw %ymm1, %xmm1
+; AVX512BWVL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
entry:
; NOTE: This operation could be collapsed into a single truncate. Once that is done,
; this test will have to be adjusted.
-; CHECK: PUNPCKLBWrr
-; CHECK: PUNPCKLWDrr
; CHECK: PANDrm
; CHECK: PACKUSWBrr
-; CHECK: PACKUSWBrr
-; CHECK: PACKUSWBrr
; CHECK: MOVPDI2DIrr
define void @test(double %vec.coerce) local_unnamed_addr {
define <8 x i64> @zext_16i8_to_8i64(<16 x i8> %A) nounwind uwtable readnone ssp {
; SSE2-LABEL: zext_16i8_to_8i64:
; SSE2: # %bb.0: # %entry
-; SSE2-NEXT: movdqa %xmm0, %xmm1
+; SSE2-NEXT: movdqa %xmm0, %xmm3
; SSE2-NEXT: pxor %xmm4, %xmm4
-; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,2,3]
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3],xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7]
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3],xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
+; SSE2-NEXT: movdqa %xmm3, %xmm1
; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3]
; SSE2-NEXT: movdqa %xmm1, %xmm0
; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1]
; SSE2-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm4[2],xmm1[3],xmm4[3]
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3],xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
; SSE2-NEXT: movdqa %xmm3, %xmm2
; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1]
; SSE2-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm4[2],xmm3[3],xmm4[3]
;
; SSSE3-LABEL: zext_16i8_to_8i64:
; SSSE3: # %bb.0: # %entry
-; SSSE3-NEXT: movdqa %xmm0, %xmm1
-; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [0,128,128,128,128,128,128,128,1,128,128,128,128,128,128,128]
-; SSSE3-NEXT: pshufb %xmm4, %xmm0
-; SSSE3-NEXT: movdqa {{.*#+}} xmm5 = [2,128,128,128,128,128,128,128,3,128,128,128,128,128,128,128]
-; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,1,2,3]
-; SSSE3-NEXT: pshufb %xmm5, %xmm1
+; SSSE3-NEXT: movdqa %xmm0, %xmm3
+; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
+; SSSE3-NEXT: movdqa %xmm3, %xmm1
+; SSSE3-NEXT: pshufb {{.*#+}} xmm1 = xmm1[2],zero,zero,zero,zero,zero,zero,zero,xmm1[3],zero,zero,zero,zero,zero,zero,zero
; SSSE3-NEXT: movdqa %xmm3, %xmm2
-; SSSE3-NEXT: pshufb %xmm4, %xmm2
-; SSSE3-NEXT: pshufb %xmm5, %xmm3
+; SSSE3-NEXT: pshufb {{.*#+}} xmm2 = xmm2[4],zero,zero,zero,zero,zero,zero,zero,xmm2[5],zero,zero,zero,zero,zero,zero,zero
+; SSSE3-NEXT: pshufb {{.*#+}} xmm3 = xmm3[6],zero,zero,zero,zero,zero,zero,zero,xmm3[7],zero,zero,zero,zero,zero,zero,zero
; SSSE3-NEXT: retq
;
; SSE41-LABEL: zext_16i8_to_8i64:
define <8 x i64> @load_zext_8i8_to_8i64(<8 x i8> *%ptr) {
; SSE2-LABEL: load_zext_8i8_to_8i64:
; SSE2: # %bb.0: # %entry
-; SSE2-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
+; SSE2-NEXT: movq {{.*#+}} xmm3 = mem[0],zero
; SSE2-NEXT: pxor %xmm4, %xmm4
-; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,1,2,3]
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3],xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7]
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3],xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
+; SSE2-NEXT: movdqa %xmm3, %xmm1
; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3]
; SSE2-NEXT: movdqa %xmm1, %xmm0
; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1]
; SSE2-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm4[2],xmm1[3],xmm4[3]
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3],xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
; SSE2-NEXT: movdqa %xmm3, %xmm2
; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1]
; SSE2-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm4[2],xmm3[3],xmm4[3]
;
; SSSE3-LABEL: load_zext_8i8_to_8i64:
; SSSE3: # %bb.0: # %entry
-; SSSE3-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
-; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [0,128,128,128,128,128,128,128,1,128,128,128,128,128,128,128]
-; SSSE3-NEXT: movdqa %xmm1, %xmm0
-; SSSE3-NEXT: pshufb %xmm4, %xmm0
-; SSSE3-NEXT: movdqa {{.*#+}} xmm5 = [2,128,128,128,128,128,128,128,3,128,128,128,128,128,128,128]
-; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,1,2,3]
-; SSSE3-NEXT: pshufb %xmm5, %xmm1
+; SSSE3-NEXT: movq {{.*#+}} xmm3 = mem[0],zero
+; SSSE3-NEXT: movdqa %xmm3, %xmm0
+; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
+; SSSE3-NEXT: movdqa %xmm3, %xmm1
+; SSSE3-NEXT: pshufb {{.*#+}} xmm1 = xmm1[2],zero,zero,zero,zero,zero,zero,zero,xmm1[3],zero,zero,zero,zero,zero,zero,zero
; SSSE3-NEXT: movdqa %xmm3, %xmm2
-; SSSE3-NEXT: pshufb %xmm4, %xmm2
-; SSSE3-NEXT: pshufb %xmm5, %xmm3
+; SSSE3-NEXT: pshufb {{.*#+}} xmm2 = xmm2[4],zero,zero,zero,zero,zero,zero,zero,xmm2[5],zero,zero,zero,zero,zero,zero,zero
+; SSSE3-NEXT: pshufb {{.*#+}} xmm3 = xmm3[6],zero,zero,zero,zero,zero,zero,zero,xmm3[7],zero,zero,zero,zero,zero,zero,zero
; SSSE3-NEXT: retq
;
; SSE41-LABEL: load_zext_8i8_to_8i64:
; SSE2-LABEL: zext_8i8_to_8i32:
; SSE2: # %bb.0: # %entry
; SSE2-NEXT: movdqa %xmm0, %xmm1
-; SSE2-NEXT: pand {{.*}}(%rip), %xmm1
; SSE2-NEXT: pxor %xmm2, %xmm2
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
; SSE2-NEXT: movdqa %xmm1, %xmm0
; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
; SSSE3-LABEL: zext_8i8_to_8i32:
; SSSE3: # %bb.0: # %entry
; SSSE3-NEXT: movdqa %xmm0, %xmm1
-; SSSE3-NEXT: pand {{.*}}(%rip), %xmm1
; SSSE3-NEXT: pxor %xmm2, %xmm2
+; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
; SSSE3-NEXT: movdqa %xmm1, %xmm0
; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; SSSE3-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
;
; SSE41-LABEL: zext_8i8_to_8i32:
; SSE41: # %bb.0: # %entry
-; SSE41-NEXT: pand {{.*}}(%rip), %xmm0
-; SSE41-NEXT: pmovzxwd {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
-; SSE41-NEXT: pmovzxwd {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; SSE41-NEXT: pmovzxbd {{.*#+}} xmm2 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
+; SSE41-NEXT: pmovzxbd {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; SSE41-NEXT: movdqa %xmm2, %xmm0
; SSE41-NEXT: retq
;
; AVX1-LABEL: zext_8i8_to_8i32:
; AVX1: # %bb.0: # %entry
-; AVX1-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
-; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
-; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
+; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: zext_8i8_to_8i32:
; AVX2: # %bb.0: # %entry
-; AVX2-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
-; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
; AVX2-NEXT: retq
;
; AVX512-LABEL: zext_8i8_to_8i32:
; AVX512: # %bb.0: # %entry
-; AVX512-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
-; AVX512-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX512-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
; AVX512-NEXT: retq
entry:
%t = zext <8 x i8> %z to <8 x i32>
; SSE2-LABEL: shuf_zext_8i8_to_8i32:
; SSE2: # %bb.0: # %entry
; SSE2-NEXT: movdqa %xmm0, %xmm1
-; SSE2-NEXT: pand {{.*}}(%rip), %xmm1
; SSE2-NEXT: pxor %xmm2, %xmm2
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
; SSE2-NEXT: movdqa %xmm1, %xmm0
; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
; SSSE3-LABEL: shuf_zext_8i8_to_8i32:
; SSSE3: # %bb.0: # %entry
; SSSE3-NEXT: movdqa %xmm0, %xmm1
-; SSSE3-NEXT: pand {{.*}}(%rip), %xmm1
; SSSE3-NEXT: pxor %xmm2, %xmm2
+; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
; SSSE3-NEXT: movdqa %xmm1, %xmm0
; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; SSSE3-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
;
; SSE41-LABEL: shuf_zext_8i8_to_8i32:
; SSE41: # %bb.0: # %entry
-; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
; SSE41-NEXT: pmovzxbd {{.*#+}} xmm2 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
; SSE41-NEXT: pmovzxbd {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
;
; AVX1-LABEL: shuf_zext_8i8_to_8i32:
; AVX1: # %bb.0: # %entry
-; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
;
; AVX2-LABEL: shuf_zext_8i8_to_8i32:
; AVX2: # %bb.0: # %entry
-; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
; AVX2-NEXT: retq
;
; AVX512-LABEL: shuf_zext_8i8_to_8i32:
; AVX512: # %bb.0: # %entry
-; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
; AVX512-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
; AVX512-NEXT: retq
entry:
; AVX2-LABEL: zext_32i8_to_32i32:
; AVX2: # %bb.0:
; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm4 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm2 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero
-; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[3,1,2,3]
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm3
+; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm2 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero,xmm3[4],zero,zero,zero,xmm3[5],zero,zero,zero,xmm3[6],zero,zero,zero,xmm3[7],zero,zero,zero
+; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
+; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm3[2,3,0,1]
; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm3 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
; AVX2-NEXT: vmovdqa %ymm4, %ymm0
; AVX2-NEXT: retq
; SSE2-NEXT: pxor %xmm1, %xmm1
; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3]
-; SSE2-NEXT: paddq %xmm0, %xmm0
+; SSE2-NEXT: paddd %xmm0, %xmm0
; SSE2-NEXT: retq
;
; SSSE3-LABEL: zext_2i8_to_2i32:
; SSSE3: # %bb.0:
; SSSE3-NEXT: movzwl (%rdi), %eax
; SSSE3-NEXT: movd %eax, %xmm0
-; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[3],zero,zero,zero
-; SSSE3-NEXT: paddq %xmm0, %xmm0
+; SSSE3-NEXT: pxor %xmm1, %xmm1
+; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; SSSE3-NEXT: paddd %xmm0, %xmm0
; SSSE3-NEXT: retq
;
; SSE41-LABEL: zext_2i8_to_2i32:
; SSE41: # %bb.0:
-; SSE41-NEXT: pmovzxbq {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero
-; SSE41-NEXT: paddq %xmm0, %xmm0
+; SSE41-NEXT: movzwl (%rdi), %eax
+; SSE41-NEXT: movd %eax, %xmm0
+; SSE41-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; SSE41-NEXT: paddd %xmm0, %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: zext_2i8_to_2i32:
; AVX: # %bb.0:
-; AVX-NEXT: vpmovzxbq {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero
-; AVX-NEXT: vpaddq %xmm0, %xmm0, %xmm0
+; AVX-NEXT: movzwl (%rdi), %eax
+; AVX-NEXT: vmovd %eax, %xmm0
+; AVX-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; AVX-NEXT: vpaddd %xmm0, %xmm0, %xmm0
; AVX-NEXT: retq
%x = load <2 x i8>, <2 x i8>* %addr, align 1
%y = zext <2 x i8> %x to <2 x i32>
define <8 x i32> @eq_zero(<8 x i8>* %p, <8 x i32> %x, <8 x i32> %y) {
; AVX1-LABEL: eq_zero:
; AVX1: # %bb.0:
-; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
+; AVX1-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; AVX1-NEXT: vpcmpeqw %xmm3, %xmm2, %xmm2
-; AVX1-NEXT: vpmovsxwd %xmm2, %xmm3
-; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
-; AVX1-NEXT: vpmovsxwd %xmm2, %xmm2
+; AVX1-NEXT: vpcmpeqb %xmm3, %xmm2, %xmm2
+; AVX1-NEXT: vpmovsxbd %xmm2, %xmm3
+; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,2,3]
+; AVX1-NEXT: vpmovsxbd %xmm2, %xmm2
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2
; AVX1-NEXT: vblendvps %ymm2, %ymm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
; AVX512-LABEL: eq_zero:
; AVX512: # %bb.0:
-; AVX512-NEXT: vpmovzxbw {{.*#+}} xmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
-; AVX512-NEXT: vptestnmw %xmm2, %xmm2, %k1
+; AVX512-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
+; AVX512-NEXT: vptestnmb %xmm2, %xmm2, %k1
; AVX512-NEXT: vpblendmd %ymm0, %ymm1, %ymm0 {%k1}
; AVX512-NEXT: retq
%load = load <8 x i8>, <8 x i8>* %p
define <4 x i64> @ne_zero(<4 x i16>* %p, <4 x i64> %x, <4 x i64> %y) {
; AVX1-LABEL: ne_zero:
; AVX1: # %bb.0:
-; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
+; AVX1-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; AVX1-NEXT: vpcmpeqd %xmm3, %xmm2, %xmm2
+; AVX1-NEXT: vpcmpeqw %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3
; AVX1-NEXT: vpxor %xmm3, %xmm2, %xmm2
-; AVX1-NEXT: vpmovsxdq %xmm2, %xmm3
-; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
-; AVX1-NEXT: vpmovsxdq %xmm2, %xmm2
+; AVX1-NEXT: vpmovsxwq %xmm2, %xmm3
+; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,2,3]
+; AVX1-NEXT: vpmovsxwq %xmm2, %xmm2
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2
; AVX1-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
; AVX512-LABEL: ne_zero:
; AVX512: # %bb.0:
-; AVX512-NEXT: vpmovzxwd {{.*#+}} xmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
-; AVX512-NEXT: vptestmd %xmm2, %xmm2, %k1
+; AVX512-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
+; AVX512-NEXT: vptestmw %xmm2, %xmm2, %k1
; AVX512-NEXT: vpblendmq %ymm0, %ymm1, %ymm0 {%k1}
; AVX512-NEXT: retq
%load = load <4 x i16>, <4 x i16>* %p
define <8 x i32> @slt_zero(<8 x i8>* %p, <8 x i32> %x, <8 x i32> %y) {
; AVX1-LABEL: slt_zero:
; AVX1: # %bb.0:
-; AVX1-NEXT: vpmovsxbw (%rdi), %xmm2
-; AVX1-NEXT: vpmovsxwd %xmm2, %xmm3
-; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
-; AVX1-NEXT: vpmovsxwd %xmm2, %xmm2
+; AVX1-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
+; AVX1-NEXT: vpmovsxbd %xmm2, %xmm3
+; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,2,3]
+; AVX1-NEXT: vpmovsxbd %xmm2, %xmm2
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2
; AVX1-NEXT: vblendvps %ymm2, %ymm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
; AVX512-LABEL: slt_zero:
; AVX512: # %bb.0:
-; AVX512-NEXT: vpmovsxbw (%rdi), %xmm2
-; AVX512-NEXT: vpmovw2m %xmm2, %k1
+; AVX512-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
+; AVX512-NEXT: vpmovb2m %xmm2, %k1
; AVX512-NEXT: vpblendmd %ymm0, %ymm1, %ymm0 {%k1}
; AVX512-NEXT: retq
%load = load <8 x i8>, <8 x i8>* %p
define <4 x double> @eq_zero_fp_select(<4 x i8>* %p, <4 x double> %x, <4 x double> %y) {
; AVX1-LABEL: eq_zero_fp_select:
; AVX1: # %bb.0:
-; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
+; AVX1-NEXT: vmovd {{.*#+}} xmm2 = mem[0],zero,zero,zero
; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; AVX1-NEXT: vpcmpeqd %xmm3, %xmm2, %xmm2
-; AVX1-NEXT: vpmovsxdq %xmm2, %xmm3
-; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
-; AVX1-NEXT: vpmovsxdq %xmm2, %xmm2
+; AVX1-NEXT: vpcmpeqb %xmm3, %xmm2, %xmm2
+; AVX1-NEXT: vpmovsxbq %xmm2, %xmm3
+; AVX1-NEXT: vpsrld $16, %xmm2, %xmm2
+; AVX1-NEXT: vpmovsxbq %xmm2, %xmm2
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2
; AVX1-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
; AVX512-LABEL: eq_zero_fp_select:
; AVX512: # %bb.0:
-; AVX512-NEXT: vpmovzxbd {{.*#+}} xmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; AVX512-NEXT: vptestnmd %xmm2, %xmm2, %k1
+; AVX512-NEXT: vmovd {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; AVX512-NEXT: vptestnmb %xmm2, %xmm2, %k1
; AVX512-NEXT: vblendmpd %ymm0, %ymm1, %ymm0 {%k1}
; AVX512-NEXT: retq
%load = load <4 x i8>, <4 x i8>* %p
define <8 x float> @ne_zero_fp_select(<8 x i8>* %p, <8 x float> %x, <8 x float> %y) {
; AVX1-LABEL: ne_zero_fp_select:
; AVX1: # %bb.0:
-; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
+; AVX1-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; AVX1-NEXT: vpcmpeqw %xmm3, %xmm2, %xmm2
+; AVX1-NEXT: vpcmpeqb %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3
; AVX1-NEXT: vpxor %xmm3, %xmm2, %xmm2
-; AVX1-NEXT: vpmovsxwd %xmm2, %xmm3
-; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
-; AVX1-NEXT: vpmovsxwd %xmm2, %xmm2
+; AVX1-NEXT: vpmovsxbd %xmm2, %xmm3
+; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,2,3]
+; AVX1-NEXT: vpmovsxbd %xmm2, %xmm2
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2
; AVX1-NEXT: vblendvps %ymm2, %ymm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
; AVX512-LABEL: ne_zero_fp_select:
; AVX512: # %bb.0:
-; AVX512-NEXT: vpmovzxbw {{.*#+}} xmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
-; AVX512-NEXT: vptestmw %xmm2, %xmm2, %k1
+; AVX512-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
+; AVX512-NEXT: vptestmb %xmm2, %xmm2, %k1
; AVX512-NEXT: vblendmps %ymm0, %ymm1, %ymm0 {%k1}
; AVX512-NEXT: retq
%load = load <8 x i8>, <8 x i8>* %p
define <4 x double> @sgt_zero_fp_select(<4 x i8>* %p, <4 x double> %x, <4 x double> %y) {
; AVX1-LABEL: sgt_zero_fp_select:
; AVX1: # %bb.0:
-; AVX1-NEXT: vpmovsxbd (%rdi), %xmm2
+; AVX1-NEXT: vmovd {{.*#+}} xmm2 = mem[0],zero,zero,zero
; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; AVX1-NEXT: vpcmpgtd %xmm3, %xmm2, %xmm2
-; AVX1-NEXT: vpmovsxdq %xmm2, %xmm3
-; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
-; AVX1-NEXT: vpmovsxdq %xmm2, %xmm2
+; AVX1-NEXT: vpcmpgtb %xmm3, %xmm2, %xmm2
+; AVX1-NEXT: vpmovsxbq %xmm2, %xmm3
+; AVX1-NEXT: vpsrld $16, %xmm2, %xmm2
+; AVX1-NEXT: vpmovsxbq %xmm2, %xmm2
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2
; AVX1-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
; AVX512-LABEL: sgt_zero_fp_select:
; AVX512: # %bb.0:
-; AVX512-NEXT: vpmovsxbd (%rdi), %xmm2
+; AVX512-NEXT: vmovd {{.*#+}} xmm2 = mem[0],zero,zero,zero
; AVX512-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; AVX512-NEXT: vpcmpgtd %xmm3, %xmm2, %k1
+; AVX512-NEXT: vpcmpgtb %xmm3, %xmm2, %k1
; AVX512-NEXT: vblendmpd %ymm0, %ymm1, %ymm0 {%k1}
; AVX512-NEXT: retq
%load = load <4 x i8>, <4 x i8>* %p
; AVX1-NEXT: vpsubd %xmm3, %xmm0, %xmm0
; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
; AVX1-NEXT: vpcmpeqd %xmm3, %xmm0, %xmm0
-; AVX1-NEXT: vblendvps %xmm0, %xmm1, %xmm2, %xmm1
; AVX1-NEXT: vpackssdw %xmm0, %xmm0, %xmm0
+; AVX1-NEXT: vpblendvb %xmm0, %xmm1, %xmm2, %xmm1
+; AVX1-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
+; AVX1-NEXT: vpblendvb %xmm0, %xmm2, %xmm3, %xmm0
; AVX1-NEXT: vmovq %xmm0, (%rdi)
-; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm1[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
-; AVX1-NEXT: vmovq %xmm0, (%rsi)
+; AVX1-NEXT: vmovq %xmm1, (%rsi)
; AVX1-NEXT: retq
;
; AVX2-LABEL: test3:
; AVX2-NEXT: vpsubd %xmm3, %xmm0, %xmm0
; AVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3
; AVX2-NEXT: vpcmpeqd %xmm3, %xmm0, %xmm0
-; AVX2-NEXT: vblendvps %xmm0, %xmm1, %xmm2, %xmm1
; AVX2-NEXT: vpackssdw %xmm0, %xmm0, %xmm0
+; AVX2-NEXT: vpblendvb %xmm0, %xmm1, %xmm2, %xmm1
+; AVX2-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
+; AVX2-NEXT: vpblendvb %xmm0, %xmm2, %xmm3, %xmm0
; AVX2-NEXT: vmovq %xmm0, (%rdi)
-; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm1[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
-; AVX2-NEXT: vmovq %xmm0, (%rsi)
+; AVX2-NEXT: vmovq %xmm1, (%rsi)
; AVX2-NEXT: retq
%tmp6 = srem <4 x i32> %induction30, <i32 3, i32 3, i32 3, i32 3>
%tmp7 = icmp eq <4 x i32> %tmp6, zeroinitializer
define <2 x i32> @simplify_select(i32 %x, <2 x i1> %z) {
; SSE2-LABEL: simplify_select:
; SSE2: # %bb.0:
-; SSE2-NEXT: psllq $63, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; SSE2-NEXT: pslld $31, %xmm0
; SSE2-NEXT: psrad $31, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
; SSE2-NEXT: movd %edi, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,1,0,1]
-; SSE2-NEXT: movdqa %xmm2, %xmm3
-; SSE2-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm1[0]
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,0,1,1]
+; SSE2-NEXT: por %xmm1, %xmm2
+; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm2[1,1]
+; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm2[2,3]
; SSE2-NEXT: pand %xmm0, %xmm2
-; SSE2-NEXT: pandn %xmm3, %xmm0
+; SSE2-NEXT: pandn %xmm1, %xmm0
; SSE2-NEXT: por %xmm2, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: simplify_select:
; SSE41: # %bb.0:
-; SSE41-NEXT: movd %edi, %xmm0
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; SSE41-NEXT: pslld $31, %xmm0
+; SSE41-NEXT: movd %edi, %xmm1
+; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,0,1,1]
+; SSE41-NEXT: por %xmm1, %xmm2
+; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,2,3]
+; SSE41-NEXT: pinsrd $1, %edi, %xmm1
+; SSE41-NEXT: blendvps %xmm0, %xmm2, %xmm1
+; SSE41-NEXT: movaps %xmm1, %xmm0
; SSE41-NEXT: retq
;
-; AVX1-LABEL: simplify_select:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vmovd %edi, %xmm0
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: simplify_select:
-; AVX2: # %bb.0:
-; AVX2-NEXT: # kill: def $edi killed $edi def $rdi
-; AVX2-NEXT: vmovq %rdi, %xmm0
-; AVX2-NEXT: vpbroadcastq %xmm0, %xmm0
-; AVX2-NEXT: retq
+; AVX-LABEL: simplify_select:
+; AVX: # %bb.0:
+; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; AVX-NEXT: vpslld $31, %xmm0, %xmm0
+; AVX-NEXT: vmovd %edi, %xmm1
+; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,0,1,1]
+; AVX-NEXT: vpor %xmm1, %xmm2, %xmm1
+; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,2,3]
+; AVX-NEXT: vpinsrd $1, %edi, %xmm2, %xmm2
+; AVX-NEXT: vblendvps %xmm0, %xmm1, %xmm2, %xmm0
+; AVX-NEXT: retq
%a = insertelement <2 x i32> <i32 0, i32 undef>, i32 %x, i32 1
%b = insertelement <2 x i32> <i32 undef, i32 0>, i32 %x, i32 0
%y = or <2 x i32> %a, %b
; SSE2-LABEL: vselect_any_extend_vector_inreg_crash:
; SSE2: # %bb.0:
; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
-; SSE2-NEXT: pxor %xmm1, %xmm1
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
-; SSE2-NEXT: pcmpeqw {{.*}}(%rip), %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3]
-; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,6,7]
-; SSE2-NEXT: psllq $56, %xmm0
+; SSE2-NEXT: pcmpeqb {{.*}}(%rip), %xmm0
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
; SSE2-NEXT: psrad $24, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
; SSE2-NEXT: movq %xmm0, %rax
; SSE2-NEXT: andl $32768, %eax # imm = 0x8000
; SSE2-NEXT: retq
;
; SSE41-LABEL: vselect_any_extend_vector_inreg_crash:
; SSE41: # %bb.0:
-; SSE41-NEXT: pmovzxbw {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
-; SSE41-NEXT: pcmpeqw {{.*}}(%rip), %xmm0
-; SSE41-NEXT: psllq $56, %xmm0
-; SSE41-NEXT: movl $32768, %eax # imm = 0x8000
-; SSE41-NEXT: movq %rax, %xmm1
-; SSE41-NEXT: xorpd %xmm2, %xmm2
-; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm2
-; SSE41-NEXT: movq %xmm2, %rax
+; SSE41-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
+; SSE41-NEXT: pcmpeqb {{.*}}(%rip), %xmm0
+; SSE41-NEXT: pmovsxbq %xmm0, %xmm0
+; SSE41-NEXT: movq %xmm0, %rax
+; SSE41-NEXT: andl $32768, %eax # imm = 0x8000
; SSE41-NEXT: retq
;
; AVX-LABEL: vselect_any_extend_vector_inreg_crash:
; AVX: # %bb.0:
-; AVX-NEXT: vpmovzxbw {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
-; AVX-NEXT: vpcmpeqw {{.*}}(%rip), %xmm0, %xmm0
+; AVX-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
+; AVX-NEXT: vpcmpeqb {{.*}}(%rip), %xmm0, %xmm0
+; AVX-NEXT: vpmovsxbq %xmm0, %xmm0
; AVX-NEXT: vmovq %xmm0, %rax
; AVX-NEXT: andl $32768, %eax # imm = 0x8000
; AVX-NEXT: retq
; X32-LABEL: shift2a:
; X32: # %bb.0: # %entry
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; X32-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,2,3]
; X32-NEXT: xorps %xmm2, %xmm2
; X32-NEXT: movss {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3]
; X32-NEXT: pslld %xmm2, %xmm0
;
; X64-LABEL: shift2a:
; X64: # %bb.0: # %entry
-; X64-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; X64-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,2,3]
; X64-NEXT: xorps %xmm2, %xmm2
; X64-NEXT: movss {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3]
; X64-NEXT: pslld %xmm2, %xmm0
; X32-LABEL: shift2b:
; X32: # %bb.0: # %entry
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; X32-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,2,3]
; X32-NEXT: xorps %xmm2, %xmm2
; X32-NEXT: movss {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3]
; X32-NEXT: pslld %xmm2, %xmm0
;
; X64-LABEL: shift2b:
; X64: # %bb.0: # %entry
-; X64-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; X64-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,2,3]
; X64-NEXT: xorps %xmm2, %xmm2
; X64-NEXT: movss {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3]
; X64-NEXT: pslld %xmm2, %xmm0
; X32-LABEL: shift2c:
; X32: # %bb.0: # %entry
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; X32-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,2,3]
; X32-NEXT: xorps %xmm2, %xmm2
; X32-NEXT: movss {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3]
; X32-NEXT: pslld %xmm2, %xmm0
;
; X64-LABEL: shift2c:
; X64: # %bb.0: # %entry
-; X64-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; X64-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,2,3]
; X64-NEXT: xorps %xmm2, %xmm2
; X64-NEXT: movss {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3]
; X64-NEXT: pslld %xmm2, %xmm0
define void @update(<3 x i8>* %dst, <3 x i8>* %src, i32 %n) nounwind {
; CHECK-LABEL: update:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: subl $12, %esp
+; CHECK-NEXT: pushl %eax
; CHECK-NEXT: movl $0, (%esp)
; CHECK-NEXT: pcmpeqd %xmm0, %xmm0
-; CHECK-NEXT: movdqa {{.*#+}} xmm1 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
; CHECK-NEXT: .p2align 4, 0x90
; CHECK-NEXT: .LBB0_1: # %forcond
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
; CHECK-NEXT: movl (%esp), %eax
; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx
; CHECK-NEXT: movl {{[0-9]+}}(%esp), %edx
-; CHECK-NEXT: pmovzxbd {{.*#+}} xmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; CHECK-NEXT: psubd %xmm0, %xmm2
-; CHECK-NEXT: pextrb $8, %xmm2, 2(%ecx,%eax,4)
-; CHECK-NEXT: pshufb %xmm1, %xmm2
-; CHECK-NEXT: pextrw $0, %xmm2, (%ecx,%eax,4)
+; CHECK-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; CHECK-NEXT: psubb %xmm0, %xmm1
+; CHECK-NEXT: pextrb $2, %xmm1, 2(%ecx,%eax,4)
+; CHECK-NEXT: pextrw $0, %xmm1, (%ecx,%eax,4)
; CHECK-NEXT: incl (%esp)
; CHECK-NEXT: jmp .LBB0_1
; CHECK-NEXT: .LBB0_3: # %afterfor
-; CHECK-NEXT: addl $12, %esp
+; CHECK-NEXT: popl %eax
; CHECK-NEXT: retl
entry:
%dst.addr = alloca <3 x i8>*
; CHECK-NEXT: subl $12, %esp
; CHECK-NEXT: movl $0, (%esp)
; CHECK-NEXT: pcmpeqd %xmm0, %xmm0
-; CHECK-NEXT: movdqa {{.*#+}} xmm1 = [4,4,4,4,4,4,4,4]
+; CHECK-NEXT: movdqa {{.*#+}} xmm1 = <4,4,4,4,4,4,4,4,u,u,u,u,u,u,u,u>
; CHECK-NEXT: .p2align 4, 0x90
; CHECK-NEXT: .LBB0_1: # %forcond
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
; CHECK-NEXT: movl %edx, {{[0-9]+}}(%esp)
; CHECK-NEXT: addl {{[0-9]+}}(%esp), %ecx
; CHECK-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; CHECK-NEXT: pmovzxbw {{.*#+}} xmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
-; CHECK-NEXT: psubw %xmm0, %xmm2
+; CHECK-NEXT: movq {{.*#+}} xmm2 = mem[0],zero
+; CHECK-NEXT: psubb %xmm0, %xmm2
; CHECK-NEXT: pand %xmm1, %xmm2
-; CHECK-NEXT: packuswb %xmm0, %xmm2
; CHECK-NEXT: movq %xmm2, (%edx,%eax,8)
; CHECK-NEXT: incl (%esp)
; CHECK-NEXT: jmp .LBB0_1
; CHECK-NEXT: pushl %ebp
; CHECK-NEXT: movl %esp, %ebp
; CHECK-NEXT: andl $-8, %esp
-; CHECK-NEXT: subl $32, %esp
-; CHECK-NEXT: movdqa {{.*#+}} xmm1 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; CHECK-NEXT: subl $16, %esp
; CHECK-NEXT: pcmpeqd %xmm0, %xmm0
; CHECK-NEXT: movw $1, {{[0-9]+}}(%esp)
; CHECK-NEXT: movl $65537, {{[0-9]+}}(%esp) # imm = 0x10001
; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
; CHECK-NEXT: movl 12(%ebp), %edx
; CHECK-NEXT: movl 8(%ebp), %ecx
-; CHECK-NEXT: pmovzxwd {{.*#+}} xmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
-; CHECK-NEXT: psubd %xmm0, %xmm2
-; CHECK-NEXT: pextrw $4, %xmm2, 4(%ecx,%eax,8)
-; CHECK-NEXT: pshufb %xmm1, %xmm2
-; CHECK-NEXT: movd %xmm2, (%ecx,%eax,8)
+; CHECK-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; CHECK-NEXT: pinsrw $2, 4(%edx,%eax,8), %xmm1
+; CHECK-NEXT: psubw %xmm0, %xmm1
+; CHECK-NEXT: pextrw $2, %xmm1, 4(%ecx,%eax,8)
+; CHECK-NEXT: movd %xmm1, (%ecx,%eax,8)
; CHECK-NEXT: incl {{[0-9]+}}(%esp)
; CHECK-NEXT: jmp .LBB0_1
; CHECK-NEXT: .LBB0_3: # %afterfor
; X32-SSE-LABEL: and_v3i8_as_i24:
; X32-SSE: # %bb.0:
; X32-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X32-SSE-NEXT: pinsrb $4, {{[0-9]+}}(%esp), %xmm0
-; X32-SSE-NEXT: pinsrb $8, {{[0-9]+}}(%esp), %xmm0
+; X32-SSE-NEXT: pinsrb $1, {{[0-9]+}}(%esp), %xmm0
+; X32-SSE-NEXT: pinsrb $2, {{[0-9]+}}(%esp), %xmm0
; X32-SSE-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; X32-SSE-NEXT: pinsrb $4, {{[0-9]+}}(%esp), %xmm1
-; X32-SSE-NEXT: pinsrb $8, {{[0-9]+}}(%esp), %xmm1
+; X32-SSE-NEXT: pinsrb $1, {{[0-9]+}}(%esp), %xmm1
+; X32-SSE-NEXT: pinsrb $2, {{[0-9]+}}(%esp), %xmm1
; X32-SSE-NEXT: pand %xmm0, %xmm1
; X32-SSE-NEXT: pextrb $0, %xmm1, %eax
-; X32-SSE-NEXT: pextrb $4, %xmm1, %edx
-; X32-SSE-NEXT: pextrb $8, %xmm1, %ecx
+; X32-SSE-NEXT: pextrb $1, %xmm1, %edx
+; X32-SSE-NEXT: pextrb $2, %xmm1, %ecx
; X32-SSE-NEXT: # kill: def $al killed $al killed $eax
; X32-SSE-NEXT: # kill: def $dl killed $dl killed $edx
; X32-SSE-NEXT: # kill: def $cl killed $cl killed $ecx
; X64-SSE-LABEL: and_v3i8_as_i24:
; X64-SSE: # %bb.0:
; X64-SSE-NEXT: movd %ecx, %xmm0
-; X64-SSE-NEXT: pinsrd $1, %r8d, %xmm0
-; X64-SSE-NEXT: pinsrd $2, %r9d, %xmm0
+; X64-SSE-NEXT: pinsrb $1, %r8d, %xmm0
+; X64-SSE-NEXT: pinsrb $2, %r9d, %xmm0
; X64-SSE-NEXT: movd %edi, %xmm1
-; X64-SSE-NEXT: pinsrd $1, %esi, %xmm1
-; X64-SSE-NEXT: pinsrd $2, %edx, %xmm1
+; X64-SSE-NEXT: pinsrb $1, %esi, %xmm1
+; X64-SSE-NEXT: pinsrb $2, %edx, %xmm1
; X64-SSE-NEXT: pand %xmm0, %xmm1
; X64-SSE-NEXT: pextrb $0, %xmm1, %eax
-; X64-SSE-NEXT: pextrb $4, %xmm1, %edx
-; X64-SSE-NEXT: pextrb $8, %xmm1, %ecx
+; X64-SSE-NEXT: pextrb $1, %xmm1, %edx
+; X64-SSE-NEXT: pextrb $2, %xmm1, %ecx
; X64-SSE-NEXT: # kill: def $al killed $al killed $eax
; X64-SSE-NEXT: # kill: def $dl killed $dl killed $edx
; X64-SSE-NEXT: # kill: def $cl killed $cl killed $ecx
; X32-SSE-LABEL: xor_v3i8_as_i24:
; X32-SSE: # %bb.0:
; X32-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X32-SSE-NEXT: pinsrb $4, {{[0-9]+}}(%esp), %xmm0
-; X32-SSE-NEXT: pinsrb $8, {{[0-9]+}}(%esp), %xmm0
+; X32-SSE-NEXT: pinsrb $1, {{[0-9]+}}(%esp), %xmm0
+; X32-SSE-NEXT: pinsrb $2, {{[0-9]+}}(%esp), %xmm0
; X32-SSE-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; X32-SSE-NEXT: pinsrb $4, {{[0-9]+}}(%esp), %xmm1
-; X32-SSE-NEXT: pinsrb $8, {{[0-9]+}}(%esp), %xmm1
+; X32-SSE-NEXT: pinsrb $1, {{[0-9]+}}(%esp), %xmm1
+; X32-SSE-NEXT: pinsrb $2, {{[0-9]+}}(%esp), %xmm1
; X32-SSE-NEXT: pxor %xmm0, %xmm1
; X32-SSE-NEXT: pextrb $0, %xmm1, %eax
-; X32-SSE-NEXT: pextrb $4, %xmm1, %edx
-; X32-SSE-NEXT: pextrb $8, %xmm1, %ecx
+; X32-SSE-NEXT: pextrb $1, %xmm1, %edx
+; X32-SSE-NEXT: pextrb $2, %xmm1, %ecx
; X32-SSE-NEXT: # kill: def $al killed $al killed $eax
; X32-SSE-NEXT: # kill: def $dl killed $dl killed $edx
; X32-SSE-NEXT: # kill: def $cl killed $cl killed $ecx
; X64-SSE-LABEL: xor_v3i8_as_i24:
; X64-SSE: # %bb.0:
; X64-SSE-NEXT: movd %ecx, %xmm0
-; X64-SSE-NEXT: pinsrd $1, %r8d, %xmm0
-; X64-SSE-NEXT: pinsrd $2, %r9d, %xmm0
+; X64-SSE-NEXT: pinsrb $1, %r8d, %xmm0
+; X64-SSE-NEXT: pinsrb $2, %r9d, %xmm0
; X64-SSE-NEXT: movd %edi, %xmm1
-; X64-SSE-NEXT: pinsrd $1, %esi, %xmm1
-; X64-SSE-NEXT: pinsrd $2, %edx, %xmm1
+; X64-SSE-NEXT: pinsrb $1, %esi, %xmm1
+; X64-SSE-NEXT: pinsrb $2, %edx, %xmm1
; X64-SSE-NEXT: pxor %xmm0, %xmm1
; X64-SSE-NEXT: pextrb $0, %xmm1, %eax
-; X64-SSE-NEXT: pextrb $4, %xmm1, %edx
-; X64-SSE-NEXT: pextrb $8, %xmm1, %ecx
+; X64-SSE-NEXT: pextrb $1, %xmm1, %edx
+; X64-SSE-NEXT: pextrb $2, %xmm1, %ecx
; X64-SSE-NEXT: # kill: def $al killed $al killed $eax
; X64-SSE-NEXT: # kill: def $dl killed $dl killed $edx
; X64-SSE-NEXT: # kill: def $cl killed $cl killed $ecx
; X32-SSE-LABEL: or_v3i8_as_i24:
; X32-SSE: # %bb.0:
; X32-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X32-SSE-NEXT: pinsrb $4, {{[0-9]+}}(%esp), %xmm0
-; X32-SSE-NEXT: pinsrb $8, {{[0-9]+}}(%esp), %xmm0
+; X32-SSE-NEXT: pinsrb $1, {{[0-9]+}}(%esp), %xmm0
+; X32-SSE-NEXT: pinsrb $2, {{[0-9]+}}(%esp), %xmm0
; X32-SSE-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; X32-SSE-NEXT: pinsrb $4, {{[0-9]+}}(%esp), %xmm1
-; X32-SSE-NEXT: pinsrb $8, {{[0-9]+}}(%esp), %xmm1
+; X32-SSE-NEXT: pinsrb $1, {{[0-9]+}}(%esp), %xmm1
+; X32-SSE-NEXT: pinsrb $2, {{[0-9]+}}(%esp), %xmm1
; X32-SSE-NEXT: por %xmm0, %xmm1
; X32-SSE-NEXT: pextrb $0, %xmm1, %eax
-; X32-SSE-NEXT: pextrb $4, %xmm1, %edx
-; X32-SSE-NEXT: pextrb $8, %xmm1, %ecx
+; X32-SSE-NEXT: pextrb $1, %xmm1, %edx
+; X32-SSE-NEXT: pextrb $2, %xmm1, %ecx
; X32-SSE-NEXT: # kill: def $al killed $al killed $eax
; X32-SSE-NEXT: # kill: def $dl killed $dl killed $edx
; X32-SSE-NEXT: # kill: def $cl killed $cl killed $ecx
; X64-SSE-LABEL: or_v3i8_as_i24:
; X64-SSE: # %bb.0:
; X64-SSE-NEXT: movd %ecx, %xmm0
-; X64-SSE-NEXT: pinsrd $1, %r8d, %xmm0
-; X64-SSE-NEXT: pinsrd $2, %r9d, %xmm0
+; X64-SSE-NEXT: pinsrb $1, %r8d, %xmm0
+; X64-SSE-NEXT: pinsrb $2, %r9d, %xmm0
; X64-SSE-NEXT: movd %edi, %xmm1
-; X64-SSE-NEXT: pinsrd $1, %esi, %xmm1
-; X64-SSE-NEXT: pinsrd $2, %edx, %xmm1
+; X64-SSE-NEXT: pinsrb $1, %esi, %xmm1
+; X64-SSE-NEXT: pinsrb $2, %edx, %xmm1
; X64-SSE-NEXT: por %xmm0, %xmm1
; X64-SSE-NEXT: pextrb $0, %xmm1, %eax
-; X64-SSE-NEXT: pextrb $4, %xmm1, %edx
-; X64-SSE-NEXT: pextrb $8, %xmm1, %ecx
+; X64-SSE-NEXT: pextrb $1, %xmm1, %edx
+; X64-SSE-NEXT: pextrb $2, %xmm1, %ecx
; X64-SSE-NEXT: # kill: def $al killed $al killed $eax
; X64-SSE-NEXT: # kill: def $dl killed $dl killed $edx
; X64-SSE-NEXT: # kill: def $cl killed $cl killed $ecx
; CHECK-NEXT: pushl %eax
; CHECK-NEXT: movl $0, (%esp)
; CHECK-NEXT: pcmpeqd %xmm0, %xmm0
-; CHECK-NEXT: movdqa {{.*#+}} xmm1 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
; CHECK-NEXT: cmpl $3, (%esp)
; CHECK-NEXT: jg .LBB0_3
; CHECK-NEXT: .p2align 4, 0x90
; CHECK-NEXT: movl (%esp), %eax
; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx
; CHECK-NEXT: movl {{[0-9]+}}(%esp), %edx
-; CHECK-NEXT: pmovzxwd {{.*#+}} xmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
-; CHECK-NEXT: psubw %xmm0, %xmm2
-; CHECK-NEXT: pshufb %xmm1, %xmm2
-; CHECK-NEXT: movq %xmm2, (%ecx,%eax,8)
+; CHECK-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
+; CHECK-NEXT: psubw %xmm0, %xmm1
+; CHECK-NEXT: movq %xmm1, (%ecx,%eax,8)
; CHECK-NEXT: incl (%esp)
; CHECK-NEXT: cmpl $3, (%esp)
; CHECK-NEXT: jle .LBB0_2
; ATOM: # %bb.0: # %entry
; ATOM-NEXT: pushl %eax
; ATOM-NEXT: pcmpeqd %xmm0, %xmm0
-; ATOM-NEXT: movdqa {{.*#+}} xmm1 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
; ATOM-NEXT: movl $0, (%esp)
; ATOM-NEXT: cmpl $3, (%esp)
; ATOM-NEXT: jg .LBB0_3
; ATOM-NEXT: # =>This Inner Loop Header: Depth=1
; ATOM-NEXT: movl (%esp), %eax
; ATOM-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; ATOM-NEXT: movq {{.*#+}} xmm2 = mem[0],zero
+; ATOM-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
; ATOM-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; ATOM-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
-; ATOM-NEXT: psubw %xmm0, %xmm2
-; ATOM-NEXT: pshufb %xmm1, %xmm2
-; ATOM-NEXT: movq %xmm2, (%ecx,%eax,8)
+; ATOM-NEXT: psubw %xmm0, %xmm1
+; ATOM-NEXT: movq %xmm1, (%ecx,%eax,8)
; ATOM-NEXT: incl (%esp)
; ATOM-NEXT: cmpl $3, (%esp)
; ATOM-NEXT: jle .LBB0_2
; CHECK-NEXT: movdqa 16(%edx,%eax), %xmm2
; CHECK-NEXT: psubw %xmm0, %xmm1
; CHECK-NEXT: psubw %xmm0, %xmm2
+; CHECK-NEXT: movd %xmm2, 16(%ecx,%eax)
+; CHECK-NEXT: pextrd $1, %xmm2, 20(%ecx,%eax)
; CHECK-NEXT: pextrd $2, %xmm2, 24(%ecx,%eax)
-; CHECK-NEXT: movq %xmm2, 16(%ecx,%eax)
; CHECK-NEXT: movdqa %xmm1, (%ecx,%eax)
; CHECK-NEXT: incl (%esp)
; CHECK-NEXT: cmpl $3, (%esp)
; X86-NEXT: pcmpeqd %xmm1, %xmm1
; X86-NEXT: psubd %xmm1, %xmm0
; X86-NEXT: pextrd $2, %xmm0, 8(%eax)
-; X86-NEXT: movq %xmm0, (%eax)
+; X86-NEXT: pextrd $1, %xmm0, 4(%eax)
+; X86-NEXT: movd %xmm0, (%eax)
; X86-NEXT: retl
;
; X64-LABEL: convert:
; NARROW-NEXT: subl $12, %esp
; NARROW-NEXT: movl $0, (%esp)
; NARROW-NEXT: pcmpeqd %xmm0, %xmm0
-; NARROW-NEXT: movdqa {{.*#+}} xmm1 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
+; NARROW-NEXT: movdqa {{.*#+}} xmm1 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63]
+; NARROW-NEXT: movdqa {{.*#+}} xmm2 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
; NARROW-NEXT: .p2align 4, 0x90
; NARROW-NEXT: .LBB0_1: # %forcond
; NARROW-NEXT: # =>This Inner Loop Header: Depth=1
; NARROW-NEXT: movl %edx, {{[0-9]+}}(%esp)
; NARROW-NEXT: addl {{[0-9]+}}(%esp), %ecx
; NARROW-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; NARROW-NEXT: pmovzxbw {{.*#+}} xmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
-; NARROW-NEXT: psubw %xmm0, %xmm2
-; NARROW-NEXT: psllw $8, %xmm2
-; NARROW-NEXT: psraw $8, %xmm2
-; NARROW-NEXT: psrlw $2, %xmm2
-; NARROW-NEXT: pshufb %xmm1, %xmm2
-; NARROW-NEXT: movq %xmm2, (%edx,%eax,8)
+; NARROW-NEXT: movq {{.*#+}} xmm3 = mem[0],zero
+; NARROW-NEXT: psubb %xmm0, %xmm3
+; NARROW-NEXT: psrlw $2, %xmm3
+; NARROW-NEXT: pand %xmm1, %xmm3
+; NARROW-NEXT: pxor %xmm2, %xmm3
+; NARROW-NEXT: psubb %xmm2, %xmm3
+; NARROW-NEXT: movq %xmm3, (%edx,%eax,8)
; NARROW-NEXT: incl (%esp)
; NARROW-NEXT: jmp .LBB0_1
; NARROW-NEXT: .LBB0_3: # %afterfor
; X86-LABEL: convert:
; X86: ## %bb.0: ## %entry
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: pmovzxdq {{.*#+}} xmm0 = mem[0],zero,mem[1],zero
-; X86-NEXT: pxor LCPI0_0, %xmm0
-; X86-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; X86-NEXT: movq %xmm0, (%eax)
+; X86-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
+; X86-NEXT: xorps LCPI0_0, %xmm0
+; X86-NEXT: movlps %xmm0, (%eax)
; X86-NEXT: retl
;
; X64-LABEL: convert:
; X64: ## %bb.0: ## %entry
; X64-NEXT: movq %rsi, %xmm0
-; X64-NEXT: pmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
; X64-NEXT: pxor {{.*}}(%rip), %xmm0
-; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; X64-NEXT: movq %xmm0, (%rdi)
; X64-NEXT: retq
entry:
define i32 @return_v2hi() nounwind {
; X86-LABEL: return_v2hi:
; X86: ## %bb.0: ## %entry
-; X86-NEXT: pushl %eax
; X86-NEXT: xorl %eax, %eax
-; X86-NEXT: popl %ecx
; X86-NEXT: retl
;
; X64-LABEL: return_v2hi:
define <2 x i16> @compare_v2i64_to_v2i16_unary(<2 x i16>* %src) nounwind {
; X86-LABEL: compare_v2i64_to_v2i16_unary:
; X86: # %bb.0:
-; X86-NEXT: movaps {{.*#+}} xmm0 = [65535,0,65535,0]
+; X86-NEXT: pcmpeqd %xmm0, %xmm0
; X86-NEXT: retl
;
; X64-LABEL: compare_v2i64_to_v2i16_unary:
; X64: # %bb.0:
-; X64-NEXT: movaps {{.*#+}} xmm0 = [65535,65535]
+; X64-NEXT: pcmpeqd %xmm0, %xmm0
; X64-NEXT: retq
%val = load <2 x i16>, <2 x i16>* %src, align 4
%cmp = icmp uge <2 x i16> %val, %val
; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: pmovzxwq {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero
-; X86-NEXT: pmovzxwq {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero
-; X86-NEXT: pcmpgtq %xmm0, %xmm1
-; X86-NEXT: pcmpeqd %xmm0, %xmm0
-; X86-NEXT: pxor %xmm1, %xmm0
+; X86-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X86-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X86-NEXT: pmaxuw %xmm1, %xmm0
+; X86-NEXT: pcmpeqw %xmm1, %xmm0
; X86-NEXT: retl
;
; X64-LABEL: compare_v2i64_to_v2i16_binary:
; X64: # %bb.0:
-; X64-NEXT: pmovzxwq {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero
-; X64-NEXT: pmovzxwq {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero
-; X64-NEXT: pcmpgtq %xmm0, %xmm1
-; X64-NEXT: pcmpeqd %xmm0, %xmm0
-; X64-NEXT: pxor %xmm1, %xmm0
+; X64-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X64-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X64-NEXT: pmaxuw %xmm1, %xmm0
+; X64-NEXT: pcmpeqw %xmm1, %xmm0
; X64-NEXT: retq
%val0 = load <2 x i16>, <2 x i16>* %src0, align 4
%val1 = load <2 x i16>, <2 x i16>* %src1, align 4
; X86-LABEL: convert_v2i64_to_v2i32:
; X86: # %bb.0: # %entry
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; X86-NEXT: pcmpeqd %xmm1, %xmm1
; X86-NEXT: psubd %xmm1, %xmm0
-; X86-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; X86-NEXT: movq %xmm0, (%eax)
; X86-NEXT: retl
;
; X64-LABEL: convert_v2i64_to_v2i32:
; X64: # %bb.0: # %entry
-; X64-NEXT: paddd {{.*}}(%rip), %xmm0
; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; X64-NEXT: pcmpeqd %xmm1, %xmm1
+; X64-NEXT: psubd %xmm1, %xmm0
; X64-NEXT: movq %xmm0, (%rdi)
; X64-NEXT: retq
entry:
define void @convert_v3i32_to_v3i8(<3 x i8>* %dst.addr, <3 x i32>* %src.addr) nounwind {
; X86-LABEL: convert_v3i32_to_v3i8:
; X86: # %bb.0: # %entry
-; X86-NEXT: pushl %eax
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: movdqa (%ecx), %xmm0
-; X86-NEXT: pcmpeqd %xmm1, %xmm1
-; X86-NEXT: psubd %xmm1, %xmm0
-; X86-NEXT: pextrb $8, %xmm0, 2(%eax)
; X86-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u]
+; X86-NEXT: pcmpeqd %xmm1, %xmm1
+; X86-NEXT: psubb %xmm1, %xmm0
+; X86-NEXT: pextrb $2, %xmm0, 2(%eax)
; X86-NEXT: pextrw $0, %xmm0, (%eax)
-; X86-NEXT: popl %eax
; X86-NEXT: retl
;
; X64-LABEL: convert_v3i32_to_v3i8:
; X64: # %bb.0: # %entry
; X64-NEXT: movdqa (%rsi), %xmm0
-; X64-NEXT: pcmpeqd %xmm1, %xmm1
-; X64-NEXT: psubd %xmm1, %xmm0
-; X64-NEXT: pextrb $8, %xmm0, 2(%rdi)
; X64-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u]
+; X64-NEXT: pcmpeqd %xmm1, %xmm1
+; X64-NEXT: psubb %xmm1, %xmm0
+; X64-NEXT: pextrb $2, %xmm0, 2(%rdi)
; X64-NEXT: pextrw $0, %xmm0, (%rdi)
; X64-NEXT: retq
entry:
define void @convert_v5i16_to_v5i8(<5 x i8>* %dst.addr, <5 x i16>* %src.addr) nounwind {
; X86-LABEL: convert_v5i16_to_v5i8:
; X86: # %bb.0: # %entry
-; X86-NEXT: pushl %ebp
-; X86-NEXT: movl %esp, %ebp
-; X86-NEXT: andl $-8, %esp
-; X86-NEXT: subl $8, %esp
-; X86-NEXT: movl 8(%ebp), %eax
-; X86-NEXT: movl 12(%ebp), %ecx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: movdqa (%ecx), %xmm0
-; X86-NEXT: pcmpeqd %xmm1, %xmm1
-; X86-NEXT: psubw %xmm1, %xmm0
-; X86-NEXT: pextrb $8, %xmm0, 4(%eax)
; X86-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
+; X86-NEXT: pcmpeqd %xmm1, %xmm1
+; X86-NEXT: psubb %xmm1, %xmm0
+; X86-NEXT: pextrb $4, %xmm0, 4(%eax)
; X86-NEXT: movd %xmm0, (%eax)
-; X86-NEXT: movl %ebp, %esp
-; X86-NEXT: popl %ebp
; X86-NEXT: retl
;
; X64-LABEL: convert_v5i16_to_v5i8:
; X64: # %bb.0: # %entry
; X64-NEXT: movdqa (%rsi), %xmm0
-; X64-NEXT: pcmpeqd %xmm1, %xmm1
-; X64-NEXT: psubw %xmm1, %xmm0
-; X64-NEXT: pextrb $8, %xmm0, 4(%rdi)
; X64-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
+; X64-NEXT: pcmpeqd %xmm1, %xmm1
+; X64-NEXT: psubb %xmm1, %xmm0
+; X64-NEXT: pextrb $4, %xmm0, 4(%rdi)
; X64-NEXT: movd %xmm0, (%rdi)
; X64-NEXT: retq
entry:
; X86-LABEL: convert_v2i16_v2i32:
; X86: # %bb.0: # %entry
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: psllq $48, %xmm0
-; X86-NEXT: psrad $16, %xmm0
-; X86-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
+; X86-NEXT: pmovsxwd %xmm0, %xmm0
; X86-NEXT: movq %xmm0, (%eax)
; X86-NEXT: retl
;
; X64-LABEL: convert_v2i16_v2i32:
; X64: # %bb.0: # %entry
-; X64-NEXT: psllq $48, %xmm0
-; X64-NEXT: psrad $16, %xmm0
-; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
+; X64-NEXT: pmovsxwd %xmm0, %xmm0
; X64-NEXT: movq %xmm0, (%rdi)
; X64-NEXT: retq
entry:
; sign to float v2i16 to v2f32
define void @convert_v2i16_to_v2f32(<2 x float>* %dst.addr, <2 x i16> %src) nounwind {
-; X86-LABEL: convert_v2i16_to_v2f32:
-; X86: # %bb.0: # %entry
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: psllq $48, %xmm0
-; X86-NEXT: psrad $16, %xmm0
-; X86-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
-; X86-NEXT: cvtdq2ps %xmm0, %xmm0
-; X86-NEXT: movlps %xmm0, (%eax)
-; X86-NEXT: retl
+; X86-SSE2-LABEL: convert_v2i16_to_v2f32:
+; X86-SSE2: # %bb.0: # %entry
+; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
+; X86-SSE2-NEXT: psrad $16, %xmm0
+; X86-SSE2-NEXT: cvtdq2ps %xmm0, %xmm0
+; X86-SSE2-NEXT: movlps %xmm0, (%eax)
+; X86-SSE2-NEXT: retl
;
-; X64-LABEL: convert_v2i16_to_v2f32:
-; X64: # %bb.0: # %entry
-; X64-NEXT: psllq $48, %xmm0
-; X64-NEXT: psrad $16, %xmm0
-; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
-; X64-NEXT: cvtdq2ps %xmm0, %xmm0
-; X64-NEXT: movlps %xmm0, (%rdi)
-; X64-NEXT: retq
+; X86-SSE42-LABEL: convert_v2i16_to_v2f32:
+; X86-SSE42: # %bb.0: # %entry
+; X86-SSE42-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-SSE42-NEXT: pmovsxwd %xmm0, %xmm0
+; X86-SSE42-NEXT: cvtdq2ps %xmm0, %xmm0
+; X86-SSE42-NEXT: movlps %xmm0, (%eax)
+; X86-SSE42-NEXT: retl
+;
+; X64-SSE2-LABEL: convert_v2i16_to_v2f32:
+; X64-SSE2: # %bb.0: # %entry
+; X64-SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
+; X64-SSE2-NEXT: psrad $16, %xmm0
+; X64-SSE2-NEXT: cvtdq2ps %xmm0, %xmm0
+; X64-SSE2-NEXT: movlps %xmm0, (%rdi)
+; X64-SSE2-NEXT: retq
+;
+; X64-SSE42-LABEL: convert_v2i16_to_v2f32:
+; X64-SSE42: # %bb.0: # %entry
+; X64-SSE42-NEXT: pmovsxwd %xmm0, %xmm0
+; X64-SSE42-NEXT: cvtdq2ps %xmm0, %xmm0
+; X64-SSE42-NEXT: movlps %xmm0, (%rdi)
+; X64-SSE42-NEXT: retq
entry:
%val = sitofp <2 x i16> %src to <2 x float>
store <2 x float> %val, <2 x float>* %dst.addr, align 4
define void @convert_v3i8_to_v3f32(<3 x float>* %dst.addr, <3 x i8>* %src.addr) nounwind {
; X86-SSE2-LABEL: convert_v3i8_to_v3f32:
; X86-SSE2: # %bb.0: # %entry
-; X86-SSE2-NEXT: pushl %ebp
-; X86-SSE2-NEXT: movl %esp, %ebp
-; X86-SSE2-NEXT: pushl %esi
-; X86-SSE2-NEXT: andl $-16, %esp
-; X86-SSE2-NEXT: subl $32, %esp
-; X86-SSE2-NEXT: movl 8(%ebp), %eax
-; X86-SSE2-NEXT: movl 12(%ebp), %ecx
+; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-SSE2-NEXT: movzwl (%ecx), %edx
; X86-SSE2-NEXT: movd %edx, %xmm0
-; X86-SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; X86-SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
-; X86-SSE2-NEXT: movdqa %xmm0, (%esp)
-; X86-SSE2-NEXT: movl (%esp), %edx
-; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X86-SSE2-NEXT: shll $8, %edx
-; X86-SSE2-NEXT: pxor %xmm0, %xmm0
-; X86-SSE2-NEXT: pinsrw $1, %edx, %xmm0
-; X86-SSE2-NEXT: shll $8, %esi
-; X86-SSE2-NEXT: pinsrw $3, %esi, %xmm0
+; X86-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; X86-SSE2-NEXT: pand %xmm1, %xmm0
; X86-SSE2-NEXT: movzbl 2(%ecx), %ecx
-; X86-SSE2-NEXT: shll $8, %ecx
-; X86-SSE2-NEXT: pinsrw $5, %ecx, %xmm0
+; X86-SSE2-NEXT: movd %ecx, %xmm2
+; X86-SSE2-NEXT: pslld $16, %xmm2
+; X86-SSE2-NEXT: pandn %xmm2, %xmm1
+; X86-SSE2-NEXT: por %xmm0, %xmm1
+; X86-SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; X86-SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; X86-SSE2-NEXT: psrad $24, %xmm0
; X86-SSE2-NEXT: cvtdq2ps %xmm0, %xmm0
; X86-SSE2-NEXT: movss %xmm0, (%eax)
; X86-SSE2-NEXT: movss %xmm1, 8(%eax)
; X86-SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,2,3]
; X86-SSE2-NEXT: movss %xmm0, 4(%eax)
-; X86-SSE2-NEXT: leal -4(%ebp), %esp
-; X86-SSE2-NEXT: popl %esi
-; X86-SSE2-NEXT: popl %ebp
; X86-SSE2-NEXT: retl
;
; X86-SSE42-LABEL: convert_v3i8_to_v3f32:
; X86-SSE42: # %bb.0: # %entry
-; X86-SSE42-NEXT: pushl %eax
; X86-SSE42-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-SSE42-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-SSE42-NEXT: movzbl 2(%ecx), %edx
-; X86-SSE42-NEXT: movzwl (%ecx), %ecx
-; X86-SSE42-NEXT: movd %ecx, %xmm0
-; X86-SSE42-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
-; X86-SSE42-NEXT: pinsrd $2, %edx, %xmm0
-; X86-SSE42-NEXT: pslld $24, %xmm0
-; X86-SSE42-NEXT: psrad $24, %xmm0
+; X86-SSE42-NEXT: movzwl (%ecx), %edx
+; X86-SSE42-NEXT: movd %edx, %xmm0
+; X86-SSE42-NEXT: pinsrb $2, 2(%ecx), %xmm0
+; X86-SSE42-NEXT: pmovsxbd %xmm0, %xmm0
; X86-SSE42-NEXT: cvtdq2ps %xmm0, %xmm0
; X86-SSE42-NEXT: extractps $2, %xmm0, 8(%eax)
; X86-SSE42-NEXT: extractps $1, %xmm0, 4(%eax)
; X86-SSE42-NEXT: movss %xmm0, (%eax)
-; X86-SSE42-NEXT: popl %eax
; X86-SSE42-NEXT: retl
;
; X64-SSE2-LABEL: convert_v3i8_to_v3f32:
; X64-SSE2: # %bb.0: # %entry
; X64-SSE2-NEXT: movzwl (%rsi), %eax
; X64-SSE2-NEXT: movd %eax, %xmm0
-; X64-SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; X64-SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
-; X64-SSE2-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp)
-; X64-SSE2-NEXT: movl -{{[0-9]+}}(%rsp), %eax
-; X64-SSE2-NEXT: movl -{{[0-9]+}}(%rsp), %ecx
-; X64-SSE2-NEXT: shll $8, %eax
-; X64-SSE2-NEXT: pxor %xmm0, %xmm0
-; X64-SSE2-NEXT: pinsrw $1, %eax, %xmm0
-; X64-SSE2-NEXT: shll $8, %ecx
-; X64-SSE2-NEXT: pinsrw $3, %ecx, %xmm0
+; X64-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; X64-SSE2-NEXT: pand %xmm1, %xmm0
; X64-SSE2-NEXT: movzbl 2(%rsi), %eax
-; X64-SSE2-NEXT: shll $8, %eax
-; X64-SSE2-NEXT: pinsrw $5, %eax, %xmm0
+; X64-SSE2-NEXT: movd %eax, %xmm2
+; X64-SSE2-NEXT: pslld $16, %xmm2
+; X64-SSE2-NEXT: pandn %xmm2, %xmm1
+; X64-SSE2-NEXT: por %xmm0, %xmm1
+; X64-SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; X64-SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; X64-SSE2-NEXT: psrad $24, %xmm0
; X64-SSE2-NEXT: cvtdq2ps %xmm0, %xmm0
; X64-SSE2-NEXT: movlps %xmm0, (%rdi)
;
; X64-SSE42-LABEL: convert_v3i8_to_v3f32:
; X64-SSE42: # %bb.0: # %entry
-; X64-SSE42-NEXT: movzbl 2(%rsi), %eax
-; X64-SSE42-NEXT: movzwl (%rsi), %ecx
-; X64-SSE42-NEXT: movd %ecx, %xmm0
-; X64-SSE42-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
-; X64-SSE42-NEXT: pinsrd $2, %eax, %xmm0
-; X64-SSE42-NEXT: pslld $24, %xmm0
-; X64-SSE42-NEXT: psrad $24, %xmm0
+; X64-SSE42-NEXT: movzwl (%rsi), %eax
+; X64-SSE42-NEXT: movd %eax, %xmm0
+; X64-SSE42-NEXT: pinsrb $2, 2(%rsi), %xmm0
+; X64-SSE42-NEXT: pmovsxbd %xmm0, %xmm0
; X64-SSE42-NEXT: cvtdq2ps %xmm0, %xmm0
; X64-SSE42-NEXT: extractps $2, %xmm0, 8(%rdi)
; X64-SSE42-NEXT: movlps %xmm0, (%rdi)
; X86-SSE42-LABEL: convert_v7i16_v7f32:
; X86-SSE42: # %bb.0: # %entry
; X86-SSE42-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-SSE42-NEXT: pxor %xmm1, %xmm1
-; X86-SSE42-NEXT: pmovzxwd {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; X86-SSE42-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; X86-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-SSE42-NEXT: pmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
+; X86-SSE42-NEXT: cvtdq2ps %xmm1, %xmm1
+; X86-SSE42-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; X86-SSE42-NEXT: cvtdq2ps %xmm0, %xmm0
-; X86-SSE42-NEXT: cvtdq2ps %xmm2, %xmm1
-; X86-SSE42-NEXT: extractps $2, %xmm0, 24(%eax)
-; X86-SSE42-NEXT: extractps $1, %xmm0, 20(%eax)
-; X86-SSE42-NEXT: movups %xmm1, (%eax)
-; X86-SSE42-NEXT: movss %xmm0, 16(%eax)
+; X86-SSE42-NEXT: movups %xmm0, (%eax)
+; X86-SSE42-NEXT: extractps $2, %xmm1, 24(%eax)
+; X86-SSE42-NEXT: extractps $1, %xmm1, 20(%eax)
+; X86-SSE42-NEXT: movss %xmm1, 16(%eax)
; X86-SSE42-NEXT: retl
;
; X64-SSE2-LABEL: convert_v7i16_v7f32:
;
; X64-SSE42-LABEL: convert_v7i16_v7f32:
; X64-SSE42: # %bb.0: # %entry
-; X64-SSE42-NEXT: pxor %xmm1, %xmm1
-; X64-SSE42-NEXT: pmovzxwd {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; X64-SSE42-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; X64-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-SSE42-NEXT: pmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
+; X64-SSE42-NEXT: cvtdq2ps %xmm1, %xmm1
+; X64-SSE42-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; X64-SSE42-NEXT: cvtdq2ps %xmm0, %xmm0
-; X64-SSE42-NEXT: cvtdq2ps %xmm2, %xmm1
-; X64-SSE42-NEXT: extractps $2, %xmm0, 24(%rdi)
-; X64-SSE42-NEXT: movlps %xmm0, 16(%rdi)
-; X64-SSE42-NEXT: movups %xmm1, (%rdi)
+; X64-SSE42-NEXT: movups %xmm0, (%rdi)
+; X64-SSE42-NEXT: extractps $2, %xmm1, 24(%rdi)
+; X64-SSE42-NEXT: movlps %xmm1, 16(%rdi)
; X64-SSE42-NEXT: retq
entry:
%val = uitofp <7 x i16> %src to <7 x float>
define void @convert_v3i8_to_v3f32(<3 x float>* %dst.addr, <3 x i8>* %src.addr) nounwind {
; X86-SSE2-LABEL: convert_v3i8_to_v3f32:
; X86-SSE2: # %bb.0: # %entry
-; X86-SSE2-NEXT: pushl %ebp
-; X86-SSE2-NEXT: movl %esp, %ebp
-; X86-SSE2-NEXT: andl $-16, %esp
-; X86-SSE2-NEXT: subl $32, %esp
-; X86-SSE2-NEXT: movl 8(%ebp), %eax
-; X86-SSE2-NEXT: movl 12(%ebp), %ecx
+; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-SSE2-NEXT: movzwl (%ecx), %edx
; X86-SSE2-NEXT: movd %edx, %xmm0
-; X86-SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; X86-SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
+; X86-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; X86-SSE2-NEXT: pand %xmm1, %xmm0
; X86-SSE2-NEXT: movzbl 2(%ecx), %ecx
-; X86-SSE2-NEXT: movdqa %xmm0, (%esp)
-; X86-SSE2-NEXT: movzbl (%esp), %edx
-; X86-SSE2-NEXT: movd %edx, %xmm0
-; X86-SSE2-NEXT: movzbl {{[0-9]+}}(%esp), %edx
-; X86-SSE2-NEXT: pinsrw $2, %edx, %xmm0
-; X86-SSE2-NEXT: pinsrw $4, %ecx, %xmm0
-; X86-SSE2-NEXT: cvtdq2ps %xmm0, %xmm0
+; X86-SSE2-NEXT: movd %ecx, %xmm2
+; X86-SSE2-NEXT: pslld $16, %xmm2
+; X86-SSE2-NEXT: pandn %xmm2, %xmm1
+; X86-SSE2-NEXT: por %xmm0, %xmm1
+; X86-SSE2-NEXT: pxor %xmm0, %xmm0
+; X86-SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; X86-SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; X86-SSE2-NEXT: cvtdq2ps %xmm1, %xmm0
; X86-SSE2-NEXT: movss %xmm0, (%eax)
; X86-SSE2-NEXT: movaps %xmm0, %xmm1
; X86-SSE2-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; X86-SSE2-NEXT: movss %xmm1, 8(%eax)
; X86-SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,2,3]
; X86-SSE2-NEXT: movss %xmm0, 4(%eax)
-; X86-SSE2-NEXT: movl %ebp, %esp
-; X86-SSE2-NEXT: popl %ebp
; X86-SSE2-NEXT: retl
;
; X86-SSE42-LABEL: convert_v3i8_to_v3f32:
; X86-SSE42: # %bb.0: # %entry
-; X86-SSE42-NEXT: pushl %eax
; X86-SSE42-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-SSE42-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-SSE42-NEXT: movzbl 2(%ecx), %edx
-; X86-SSE42-NEXT: movzwl (%ecx), %ecx
-; X86-SSE42-NEXT: movd %ecx, %xmm0
+; X86-SSE42-NEXT: movzwl (%ecx), %edx
+; X86-SSE42-NEXT: movd %edx, %xmm0
+; X86-SSE42-NEXT: pinsrb $2, 2(%ecx), %xmm0
; X86-SSE42-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
-; X86-SSE42-NEXT: pinsrd $2, %edx, %xmm0
-; X86-SSE42-NEXT: pand {{\.LCPI.*}}, %xmm0
; X86-SSE42-NEXT: cvtdq2ps %xmm0, %xmm0
; X86-SSE42-NEXT: extractps $2, %xmm0, 8(%eax)
; X86-SSE42-NEXT: extractps $1, %xmm0, 4(%eax)
; X86-SSE42-NEXT: movss %xmm0, (%eax)
-; X86-SSE42-NEXT: popl %eax
; X86-SSE42-NEXT: retl
;
; X64-SSE2-LABEL: convert_v3i8_to_v3f32:
; X64-SSE2: # %bb.0: # %entry
; X64-SSE2-NEXT: movzwl (%rsi), %eax
; X64-SSE2-NEXT: movd %eax, %xmm0
-; X64-SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; X64-SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
+; X64-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; X64-SSE2-NEXT: pand %xmm1, %xmm0
; X64-SSE2-NEXT: movzbl 2(%rsi), %eax
-; X64-SSE2-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp)
-; X64-SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %ecx
-; X64-SSE2-NEXT: movd %ecx, %xmm0
-; X64-SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %ecx
-; X64-SSE2-NEXT: pinsrw $2, %ecx, %xmm0
-; X64-SSE2-NEXT: pinsrw $4, %eax, %xmm0
-; X64-SSE2-NEXT: cvtdq2ps %xmm0, %xmm0
+; X64-SSE2-NEXT: movd %eax, %xmm2
+; X64-SSE2-NEXT: pslld $16, %xmm2
+; X64-SSE2-NEXT: pandn %xmm2, %xmm1
+; X64-SSE2-NEXT: por %xmm0, %xmm1
+; X64-SSE2-NEXT: pxor %xmm0, %xmm0
+; X64-SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; X64-SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; X64-SSE2-NEXT: cvtdq2ps %xmm1, %xmm0
; X64-SSE2-NEXT: movlps %xmm0, (%rdi)
; X64-SSE2-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1]
; X64-SSE2-NEXT: movss %xmm0, 8(%rdi)
;
; X64-SSE42-LABEL: convert_v3i8_to_v3f32:
; X64-SSE42: # %bb.0: # %entry
-; X64-SSE42-NEXT: movzbl 2(%rsi), %eax
-; X64-SSE42-NEXT: movzwl (%rsi), %ecx
-; X64-SSE42-NEXT: movd %ecx, %xmm0
+; X64-SSE42-NEXT: movzwl (%rsi), %eax
+; X64-SSE42-NEXT: movd %eax, %xmm0
+; X64-SSE42-NEXT: pinsrb $2, 2(%rsi), %xmm0
; X64-SSE42-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
-; X64-SSE42-NEXT: pinsrd $2, %eax, %xmm0
-; X64-SSE42-NEXT: pand {{.*}}(%rip), %xmm0
; X64-SSE42-NEXT: cvtdq2ps %xmm0, %xmm0
; X64-SSE42-NEXT: extractps $2, %xmm0, 8(%rdi)
; X64-SSE42-NEXT: movlps %xmm0, (%rdi)
; X86-NEXT: movdqa (%edx), %xmm0
; X86-NEXT: paddd (%ecx), %xmm0
; X86-NEXT: pextrd $2, %xmm0, 8(%eax)
-; X86-NEXT: movq %xmm0, (%eax)
+; X86-NEXT: pextrd $1, %xmm0, 4(%eax)
+; X86-NEXT: movd %xmm0, (%eax)
; X86-NEXT: retl $4
;
; X64-LABEL: add3i32:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
+; X86-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X86-NEXT: pinsrd $1, 4(%edx), %xmm0
; X86-NEXT: pinsrd $2, 8(%edx), %xmm0
-; X86-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
+; X86-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X86-NEXT: pinsrd $1, 4(%ecx), %xmm1
; X86-NEXT: pinsrd $2, 8(%ecx), %xmm1
; X86-NEXT: paddd %xmm0, %xmm1
-; X86-NEXT: movq %xmm1, (%eax)
+; X86-NEXT: pextrd $1, %xmm1, 4(%eax)
; X86-NEXT: pextrd $2, %xmm1, 8(%eax)
+; X86-NEXT: movd %xmm1, (%eax)
; X86-NEXT: retl $4
;
; X64-LABEL: add3i32_2:
; X86-NEXT: movdqa 16(%edx), %xmm1
; X86-NEXT: paddd (%ecx), %xmm0
; X86-NEXT: paddd 16(%ecx), %xmm1
+; X86-NEXT: movd %xmm1, 16(%eax)
+; X86-NEXT: pextrd $1, %xmm1, 20(%eax)
; X86-NEXT: pextrd $2, %xmm1, 24(%eax)
-; X86-NEXT: movq %xmm1, 16(%eax)
; X86-NEXT: movdqa %xmm0, (%eax)
; X86-NEXT: retl $4
;
define void @add3i16(%i16vec3* nocapture sret %ret, %i16vec3* %ap, %i16vec3* %bp) nounwind {
; X86-LABEL: add3i16:
; X86: # %bb.0:
-; X86-NEXT: pushl %ebp
-; X86-NEXT: movl %esp, %ebp
-; X86-NEXT: andl $-8, %esp
-; X86-NEXT: subl $8, %esp
-; X86-NEXT: movl 8(%ebp), %eax
-; X86-NEXT: movl 16(%ebp), %ecx
-; X86-NEXT: movl 12(%ebp), %edx
-; X86-NEXT: pmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
-; X86-NEXT: pmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
-; X86-NEXT: paddd %xmm0, %xmm1
-; X86-NEXT: pextrw $4, %xmm1, 4(%eax)
-; X86-NEXT: pshufb {{.*#+}} xmm1 = xmm1[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X86-NEXT: pinsrw $2, 4(%edx), %xmm0
+; X86-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X86-NEXT: pinsrw $2, 4(%ecx), %xmm1
+; X86-NEXT: paddw %xmm0, %xmm1
+; X86-NEXT: pextrw $2, %xmm1, 4(%eax)
; X86-NEXT: movd %xmm1, (%eax)
-; X86-NEXT: movl %ebp, %esp
-; X86-NEXT: popl %ebp
; X86-NEXT: retl $4
;
; X64-LABEL: add3i16:
; X64: # %bb.0:
; X64-NEXT: movq %rdi, %rax
-; X64-NEXT: pmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
-; X64-NEXT: pmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
-; X64-NEXT: paddd %xmm0, %xmm1
-; X64-NEXT: pextrw $4, %xmm1, 4(%rdi)
-; X64-NEXT: pshufb {{.*#+}} xmm1 = xmm1[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; X64-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
+; X64-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
+; X64-NEXT: paddw %xmm0, %xmm1
+; X64-NEXT: pextrw $2, %xmm1, 4(%rdi)
; X64-NEXT: movd %xmm1, (%rdi)
; X64-NEXT: retq
%a = load %i16vec3, %i16vec3* %ap, align 16
; X86-NEXT: movdqa 16(%edx), %xmm1
; X86-NEXT: paddw (%ecx), %xmm0
; X86-NEXT: paddw 16(%ecx), %xmm1
-; X86-NEXT: movq %xmm1, 16(%eax)
+; X86-NEXT: movd %xmm1, 16(%eax)
+; X86-NEXT: pextrd $1, %xmm1, 20(%eax)
; X86-NEXT: movdqa %xmm0, (%eax)
; X86-NEXT: retl $4
;
define void @add3i8(%i8vec3* nocapture sret %ret, %i8vec3* %ap, %i8vec3* %bp) nounwind {
; X86-LABEL: add3i8:
; X86: # %bb.0:
-; X86-NEXT: subl $12, %esp
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT: pmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; X86-NEXT: pmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; X86-NEXT: paddd %xmm0, %xmm1
-; X86-NEXT: pextrb $8, %xmm1, 2(%eax)
-; X86-NEXT: pshufb {{.*#+}} xmm1 = xmm1[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u]
+; X86-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X86-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X86-NEXT: paddb %xmm0, %xmm1
+; X86-NEXT: pextrb $2, %xmm1, 2(%eax)
; X86-NEXT: pextrw $0, %xmm1, (%eax)
-; X86-NEXT: addl $12, %esp
; X86-NEXT: retl $4
;
; X64-LABEL: add3i8:
; X64: # %bb.0:
; X64-NEXT: movq %rdi, %rax
-; X64-NEXT: pmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; X64-NEXT: pmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; X64-NEXT: paddd %xmm0, %xmm1
-; X64-NEXT: pextrb $8, %xmm1, 2(%rdi)
-; X64-NEXT: pshufb {{.*#+}} xmm1 = xmm1[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u]
+; X64-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X64-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X64-NEXT: paddb %xmm0, %xmm1
+; X64-NEXT: pextrb $2, %xmm1, 2(%rdi)
; X64-NEXT: pextrw $0, %xmm1, (%rdi)
; X64-NEXT: retq
%a = load %i8vec3, %i8vec3* %ap, align 16
; X86-NEXT: movdqa 16(%edx), %xmm1
; X86-NEXT: paddb (%ecx), %xmm0
; X86-NEXT: paddb 16(%ecx), %xmm1
+; X86-NEXT: movd %xmm1, 16(%eax)
+; X86-NEXT: pextrd $1, %xmm1, 20(%eax)
; X86-NEXT: pextrd $2, %xmm1, 24(%eax)
; X86-NEXT: pextrw $6, %xmm1, 28(%eax)
; X86-NEXT: pextrb $14, %xmm1, 30(%eax)
-; X86-NEXT: movq %xmm1, 16(%eax)
; X86-NEXT: movdqa %xmm0, (%eax)
; X86-NEXT: retl $4
;
define void @rot(%i8vec3pack* nocapture sret %result, %i8vec3pack* %X, %i8vec3pack* %rot) nounwind {
; X86-LABEL: rot:
; X86: # %bb.0: # %entry
-; X86-NEXT: subl $16, %esp
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-NEXT: movw $-24930, (%edx) # imm = 0x9E9E
; X86-NEXT: movb $1, 2(%ecx)
; X86-NEXT: movw $257, (%ecx) # imm = 0x101
-; X86-NEXT: pmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; X86-NEXT: psrld $1, %xmm0
-; X86-NEXT: pextrb $8, %xmm0, 2(%eax)
-; X86-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u]
+; X86-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X86-NEXT: psrlw $1, %xmm0
+; X86-NEXT: pand {{\.LCPI.*}}, %xmm0
+; X86-NEXT: pextrb $2, %xmm0, 2(%eax)
; X86-NEXT: pextrw $0, %xmm0, (%eax)
-; X86-NEXT: addl $16, %esp
; X86-NEXT: retl $4
;
; X64-LABEL: rot:
; X64-NEXT: movw $-24930, (%rsi) # imm = 0x9E9E
; X64-NEXT: movb $1, 2(%rdx)
; X64-NEXT: movw $257, (%rdx) # imm = 0x101
-; X64-NEXT: pmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; X64-NEXT: psrld $1, %xmm0
-; X64-NEXT: pextrb $8, %xmm0, 2(%rdi)
-; X64-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u]
+; X64-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X64-NEXT: psrlw $1, %xmm0
+; X64-NEXT: pand {{.*}}(%rip), %xmm0
+; X64-NEXT: pextrb $2, %xmm0, 2(%rdi)
; X64-NEXT: pextrw $0, %xmm0, (%rdi)
; X64-NEXT: retq
entry:
define <8 x i8> @shuf4(<4 x i8> %a, <4 x i8> %b) nounwind readnone {
; X86-LABEL: shuf4:
; X86: # %bb.0:
-; X86-NEXT: movdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
-; X86-NEXT: pshufb %xmm2, %xmm1
-; X86-NEXT: pshufb %xmm2, %xmm0
-; X86-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; X86-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; X86-NEXT: retl
;
; X64-LABEL: shuf4:
; X64: # %bb.0:
-; X64-NEXT: movdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
-; X64-NEXT: pshufb %xmm2, %xmm1
-; X64-NEXT: pshufb %xmm2, %xmm0
-; X64-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; X64-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; X64-NEXT: retq
%vshuf = shufflevector <4 x i8> %a, <4 x i8> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
ret <8 x i8> %vshuf
define <8 x i8> @interleaved_load_vf8_i8_stride4(<32 x i8>* %ptr) {
; AVX-LABEL: interleaved_load_vf8_i8_stride4:
; AVX: # %bb.0:
-; AVX-NEXT: vmovdqa {{.*#+}} xmm0 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; AVX-NEXT: vmovdqa {{.*#+}} xmm0 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX-NEXT: vmovdqa (%rdi), %xmm1
; AVX-NEXT: vmovdqa 16(%rdi), %xmm2
; AVX-NEXT: vpshufb %xmm0, %xmm2, %xmm3
; AVX-NEXT: vpshufb %xmm0, %xmm1, %xmm0
-; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm0[0],xmm3[0]
-; AVX-NEXT: vmovdqa {{.*#+}} xmm5 = [1,1,3,3,5,5,7,7,7,7,3,3,6,6,7,7]
-; AVX-NEXT: vpshufb %xmm5, %xmm3, %xmm3
-; AVX-NEXT: vpshufb %xmm5, %xmm0, %xmm0
-; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm3[0]
-; AVX-NEXT: vpaddw %xmm0, %xmm4, %xmm0
-; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [6,7,2,3,14,15,10,11,14,15,10,11,12,13,14,15]
-; AVX-NEXT: vpshufb %xmm3, %xmm2, %xmm2
-; AVX-NEXT: vpshuflw {{.*#+}} xmm4 = xmm2[1,0,3,2,4,5,6,7]
-; AVX-NEXT: vpshufb %xmm3, %xmm1, %xmm1
-; AVX-NEXT: vpshuflw {{.*#+}} xmm3 = xmm1[1,0,3,2,4,5,6,7]
-; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm4[0]
-; AVX-NEXT: vmovdqa {{.*#+}} xmm4 = [3,3,1,1,7,7,5,5,1,1,5,5,0,0,1,1]
+; AVX-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
+; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX-NEXT: vpshufb %xmm3, %xmm2, %xmm4
+; AVX-NEXT: vpshufb %xmm3, %xmm1, %xmm3
+; AVX-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
+; AVX-NEXT: vpaddb %xmm3, %xmm0, %xmm0
+; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = <2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX-NEXT: vpshufb %xmm3, %xmm2, %xmm4
+; AVX-NEXT: vpshufb %xmm3, %xmm1, %xmm3
+; AVX-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
+; AVX-NEXT: vmovdqa {{.*#+}} xmm4 = <3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX-NEXT: vpshufb %xmm4, %xmm2, %xmm2
; AVX-NEXT: vpshufb %xmm4, %xmm1, %xmm1
-; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
-; AVX-NEXT: vpaddw %xmm3, %xmm1, %xmm1
+; AVX-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
+; AVX-NEXT: vpaddb %xmm3, %xmm1, %xmm1
+; AVX-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
+; AVX-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX-NEXT: vpmullw %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
; AVX-NEXT: retq
%wide.vec = load <32 x i8>, <32 x i8>* %ptr, align 16
%v1 = shufflevector <32 x i8> %wide.vec, <32 x i8> undef, <8 x i32> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 24, i32 28>
define void @interleaved_store_vf8_i8_stride4(<8 x i8> %x1, <8 x i8> %x2, <8 x i8> %x3, <8 x i8> %x4, <32 x i8>* %p) {
; AVX-LABEL: interleaved_store_vf8_i8_stride4:
; AVX: # %bb.0:
-; AVX-NEXT: vmovdqa {{.*#+}} xmm4 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
-; AVX-NEXT: vpshufb %xmm4, %xmm1, %xmm1
-; AVX-NEXT: vpshufb %xmm4, %xmm0, %xmm0
; AVX-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
-; AVX-NEXT: vpshufb %xmm4, %xmm3, %xmm1
-; AVX-NEXT: vpshufb %xmm4, %xmm2, %xmm2
-; AVX-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
+; AVX-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3],xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7]
; AVX-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; AVX-NEXT: vmovdqa %xmm0, 16(%rdi)
; AVX: # %bb.0:
; AVX-NEXT: vmovdqa (%rdi), %xmm0
; AVX-NEXT: vmovdqa 16(%rdi), %xmm1
-; AVX-NEXT: vpshufb {{.*#+}} xmm2 = zero,xmm1[u],zero,xmm1[u],zero,xmm1[u],zero,xmm1[u],zero,xmm1[u],zero,xmm1[u,2,u,5,u]
-; AVX-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[0,u,3,u,6,u,9,u,12,u,15,u],zero,xmm0[u],zero,xmm0[u]
+; AVX-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,xmm1[2,5,u,u,u,u,u,u,u,u]
+; AVX-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[0,3,6,9,12,15],zero,zero,xmm0[u,u,u,u,u,u,u,u]
; AVX-NEXT: vpor %xmm2, %xmm3, %xmm2
-; AVX-NEXT: vpshufb {{.*#+}} xmm3 = zero,xmm1[u],zero,xmm1[u],zero,xmm1[u],zero,xmm1[u],zero,xmm1[u,0,u,3,u,6,u]
-; AVX-NEXT: vpshufb {{.*#+}} xmm4 = xmm0[1,u,4,u,7,u,10,u,13,u],zero,xmm0[u],zero,xmm0[u],zero,xmm0[u]
+; AVX-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,zero,zero,xmm1[0,3,6,u,u,u,u,u,u,u,u]
+; AVX-NEXT: vpshufb {{.*#+}} xmm4 = xmm0[1,4,7,10,13],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u]
; AVX-NEXT: vpor %xmm3, %xmm4, %xmm3
-; AVX-NEXT: vpshufb {{.*#+}} xmm1 = zero,xmm1[u],zero,xmm1[u],zero,xmm1[u],zero,xmm1[u],zero,xmm1[u,1,u,4,u,7,u]
-; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,u,5,u,8,u,11,u,14,u],zero,xmm0[u],zero,xmm0[u],zero,xmm0[u]
+; AVX-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,zero,zero,xmm1[1,4,7,u,u,u,u,u,u,u,u]
+; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,5,8,11,14],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u]
; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vpaddw %xmm0, %xmm3, %xmm0
-; AVX-NEXT: vpaddw %xmm0, %xmm2, %xmm0
+; AVX-NEXT: vpaddb %xmm0, %xmm3, %xmm0
+; AVX-NEXT: vpaddb %xmm0, %xmm2, %xmm0
; AVX-NEXT: retq
%wide.vec = load <24 x i8>, <24 x i8>* %ptr
%v1 = shufflevector <24 x i8> %wide.vec, <24 x i8> undef,<8 x i32> <i32 0,i32 3,i32 6,i32 9,i32 12,i32 15,i32 18,i32 21>
define void @interleaved_store_vf8_i8_stride3(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c, <24 x i8>* %p) {
; AVX-LABEL: interleaved_store_vf8_i8_stride3:
; AVX: # %bb.0:
-; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
-; AVX-NEXT: vpshufb %xmm3, %xmm1, %xmm1
-; AVX-NEXT: vpshufb %xmm3, %xmm0, %xmm0
; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; AVX-NEXT: vpshufb %xmm3, %xmm2, %xmm1
-; AVX-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[0,8],zero,xmm0[1,9],zero,xmm0[2,10],zero,xmm0[3,11],zero,xmm0[4,12],zero,xmm0[5]
-; AVX-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm1[0],zero,zero,xmm1[1],zero,zero,xmm1[2],zero,zero,xmm1[3],zero,zero,xmm1[4],zero
-; AVX-NEXT: vpor %xmm3, %xmm2, %xmm2
+; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0,8],zero,xmm0[1,9],zero,xmm0[2,10],zero,xmm0[3,11],zero,xmm0[4,12],zero,xmm0[5]
+; AVX-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm2[0],zero,zero,xmm2[1],zero,zero,xmm2[2],zero,zero,xmm2[3],zero,zero,xmm2[4],zero
+; AVX-NEXT: vpor %xmm3, %xmm1, %xmm1
; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[13],zero,xmm0[6,14],zero,xmm0[7,15],zero,xmm0[u,u,u,u,u,u,u,u]
-; AVX-NEXT: vpshufb {{.*#+}} xmm1 = zero,xmm1[5],zero,zero,xmm1[6],zero,zero,xmm1[7,u,u,u,u,u,u,u,u]
-; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vpshufb {{.*#+}} xmm2 = zero,xmm2[5],zero,zero,xmm2[6],zero,zero,xmm2[7,u,u,u,u,u,u,u,u]
+; AVX-NEXT: vpor %xmm2, %xmm0, %xmm0
; AVX-NEXT: vmovq %xmm0, 16(%rdi)
-; AVX-NEXT: vmovdqu %xmm2, (%rdi)
+; AVX-NEXT: vmovdqu %xmm1, (%rdi)
; AVX-NEXT: retq
%1 = shufflevector <8 x i8> %a, <8 x i8> %b, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
%2 = shufflevector <8 x i8> %c, <8 x i8> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
; X32-LABEL: shl2_other:
; X32: # %bb.0: # %entry
; X32-NEXT: movdqa %xmm0, %xmm1
-; X32-NEXT: psllq $2, %xmm1
-; X32-NEXT: psllq $9, %xmm0
+; X32-NEXT: pslld $2, %xmm1
+; X32-NEXT: pslld $9, %xmm0
; X32-NEXT: pxor %xmm1, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: shl2_other:
; X64: # %bb.0: # %entry
; X64-NEXT: movdqa %xmm0, %xmm1
-; X64-NEXT: psllq $2, %xmm1
-; X64-NEXT: psllq $9, %xmm0
+; X64-NEXT: pslld $2, %xmm1
+; X64-NEXT: pslld $9, %xmm0
; X64-NEXT: pxor %xmm1, %xmm0
; X64-NEXT: retq
entry:
define <2 x i32> @shr2_other(<2 x i32> %A) nounwind {
; X32-LABEL: shr2_other:
; X32: # %bb.0: # %entry
-; X32-NEXT: pand {{\.LCPI.*}}, %xmm0
; X32-NEXT: movdqa %xmm0, %xmm1
-; X32-NEXT: psrlq $8, %xmm1
-; X32-NEXT: psrlq $1, %xmm0
+; X32-NEXT: psrld $8, %xmm1
+; X32-NEXT: psrld $1, %xmm0
; X32-NEXT: pxor %xmm1, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: shr2_other:
; X64: # %bb.0: # %entry
-; X64-NEXT: pand {{.*}}(%rip), %xmm0
; X64-NEXT: movdqa %xmm0, %xmm1
-; X64-NEXT: psrlq $8, %xmm1
-; X64-NEXT: psrlq $1, %xmm0
+; X64-NEXT: psrld $8, %xmm1
+; X64-NEXT: psrld $1, %xmm0
; X64-NEXT: pxor %xmm1, %xmm0
; X64-NEXT: retq
entry:
define <4 x i8> @h(<4 x i8> %x, <4 x i8> %y) {
; CHECK-LABEL: @h(
-; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i8> [[X:%.*]], <4 x i8> [[Y:%.*]], <4 x i32> <i32 0, i32 3, i32 5, i32 6>
-; CHECK-NEXT: [[TMP2:%.*]] = mul <4 x i8> [[TMP1]], [[TMP1]]
-; CHECK-NEXT: ret <4 x i8> [[TMP2]]
+; CHECK-NEXT: [[X0:%.*]] = extractelement <4 x i8> [[X:%.*]], i32 0
+; CHECK-NEXT: [[X3:%.*]] = extractelement <4 x i8> [[X]], i32 3
+; CHECK-NEXT: [[Y1:%.*]] = extractelement <4 x i8> [[Y:%.*]], i32 1
+; CHECK-NEXT: [[Y2:%.*]] = extractelement <4 x i8> [[Y]], i32 2
+; CHECK-NEXT: [[X0X0:%.*]] = mul i8 [[X0]], [[X0]]
+; CHECK-NEXT: [[X3X3:%.*]] = mul i8 [[X3]], [[X3]]
+; CHECK-NEXT: [[Y1Y1:%.*]] = mul i8 [[Y1]], [[Y1]]
+; CHECK-NEXT: [[Y2Y2:%.*]] = mul i8 [[Y2]], [[Y2]]
+; CHECK-NEXT: [[INS1:%.*]] = insertelement <4 x i8> undef, i8 [[X0X0]], i32 0
+; CHECK-NEXT: [[INS2:%.*]] = insertelement <4 x i8> [[INS1]], i8 [[X3X3]], i32 1
+; CHECK-NEXT: [[INS3:%.*]] = insertelement <4 x i8> [[INS2]], i8 [[Y1Y1]], i32 2
+; CHECK-NEXT: [[INS4:%.*]] = insertelement <4 x i8> [[INS3]], i8 [[Y2Y2]], i32 3
+; CHECK-NEXT: ret <4 x i8> [[INS4]]
;
%x0 = extractelement <4 x i8> %x, i32 0
%x3 = extractelement <4 x i8> %x, i32 3
define <4 x i8> @h_undef(<4 x i8> %x, <4 x i8> %y) {
; CHECK-LABEL: @h_undef(
-; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i8> [[X:%.*]], <4 x i8> [[Y:%.*]], <4 x i32> <i32 undef, i32 3, i32 5, i32 6>
-; CHECK-NEXT: [[TMP2:%.*]] = mul <4 x i8> [[TMP1]], [[TMP1]]
-; CHECK-NEXT: ret <4 x i8> [[TMP2]]
+; CHECK-NEXT: [[X3:%.*]] = extractelement <4 x i8> [[X:%.*]], i32 3
+; CHECK-NEXT: [[Y1:%.*]] = extractelement <4 x i8> [[Y:%.*]], i32 1
+; CHECK-NEXT: [[Y2:%.*]] = extractelement <4 x i8> [[Y]], i32 2
+; CHECK-NEXT: [[X3X3:%.*]] = mul i8 [[X3]], [[X3]]
+; CHECK-NEXT: [[Y1Y1:%.*]] = mul i8 [[Y1]], [[Y1]]
+; CHECK-NEXT: [[Y2Y2:%.*]] = mul i8 [[Y2]], [[Y2]]
+; CHECK-NEXT: [[INS2:%.*]] = insertelement <4 x i8> undef, i8 [[X3X3]], i32 1
+; CHECK-NEXT: [[INS3:%.*]] = insertelement <4 x i8> [[INS2]], i8 [[Y1Y1]], i32 2
+; CHECK-NEXT: [[INS4:%.*]] = insertelement <4 x i8> [[INS3]], i8 [[Y2Y2]], i32 3
+; CHECK-NEXT: ret <4 x i8> [[INS4]]
;
%x0 = extractelement <4 x i8> undef, i32 0
%x3 = extractelement <4 x i8> %x, i32 3
define i8 @i(<4 x i8> %x, <4 x i8> %y) {
; CHECK-LABEL: @i(
-; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i8> [[X:%.*]], <4 x i8> [[Y:%.*]], <4 x i32> <i32 0, i32 3, i32 5, i32 6>
-; CHECK-NEXT: [[TMP2:%.*]] = mul <4 x i8> [[TMP1]], [[TMP1]]
-; CHECK-NEXT: [[RDX_SHUF:%.*]] = shufflevector <4 x i8> [[TMP2]], <4 x i8> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
-; CHECK-NEXT: [[BIN_RDX:%.*]] = add <4 x i8> [[TMP2]], [[RDX_SHUF]]
-; CHECK-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <4 x i8> [[BIN_RDX]], <4 x i8> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
-; CHECK-NEXT: [[BIN_RDX2:%.*]] = add <4 x i8> [[BIN_RDX]], [[RDX_SHUF1]]
-; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x i8> [[BIN_RDX2]], i32 0
+; CHECK-NEXT: [[X0:%.*]] = extractelement <4 x i8> [[X:%.*]], i32 0
+; CHECK-NEXT: [[X3:%.*]] = extractelement <4 x i8> [[X]], i32 3
+; CHECK-NEXT: [[Y1:%.*]] = extractelement <4 x i8> [[Y:%.*]], i32 1
+; CHECK-NEXT: [[Y2:%.*]] = extractelement <4 x i8> [[Y]], i32 2
+; CHECK-NEXT: [[X0X0:%.*]] = mul i8 [[X0]], [[X0]]
+; CHECK-NEXT: [[X3X3:%.*]] = mul i8 [[X3]], [[X3]]
+; CHECK-NEXT: [[Y1Y1:%.*]] = mul i8 [[Y1]], [[Y1]]
+; CHECK-NEXT: [[Y2Y2:%.*]] = mul i8 [[Y2]], [[Y2]]
+; CHECK-NEXT: [[TMP1:%.*]] = add i8 [[X0X0]], [[X3X3]]
+; CHECK-NEXT: [[TMP2:%.*]] = add i8 [[Y1Y1]], [[Y2Y2]]
+; CHECK-NEXT: [[TMP3:%.*]] = add i8 [[TMP1]], [[TMP2]]
; CHECK-NEXT: ret i8 [[TMP3]]
;
%x0 = extractelement <4 x i8> %x, i32 0
}
define void @fptosi_8f64_8i16() #0 {
-; SSE-LABEL: @fptosi_8f64_8i16(
-; SSE-NEXT: [[A0:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 0), align 8
-; SSE-NEXT: [[A1:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 1), align 8
-; SSE-NEXT: [[A2:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2), align 8
-; SSE-NEXT: [[A3:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 3), align 8
-; SSE-NEXT: [[A4:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4), align 8
-; SSE-NEXT: [[A5:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 5), align 8
-; SSE-NEXT: [[A6:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 6), align 8
-; SSE-NEXT: [[A7:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 7), align 8
-; SSE-NEXT: [[CVT0:%.*]] = fptosi double [[A0]] to i16
-; SSE-NEXT: [[CVT1:%.*]] = fptosi double [[A1]] to i16
-; SSE-NEXT: [[CVT2:%.*]] = fptosi double [[A2]] to i16
-; SSE-NEXT: [[CVT3:%.*]] = fptosi double [[A3]] to i16
-; SSE-NEXT: [[CVT4:%.*]] = fptosi double [[A4]] to i16
-; SSE-NEXT: [[CVT5:%.*]] = fptosi double [[A5]] to i16
-; SSE-NEXT: [[CVT6:%.*]] = fptosi double [[A6]] to i16
-; SSE-NEXT: [[CVT7:%.*]] = fptosi double [[A7]] to i16
-; SSE-NEXT: store i16 [[CVT0]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @dst16, i32 0, i64 0), align 2
-; SSE-NEXT: store i16 [[CVT1]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @dst16, i32 0, i64 1), align 2
-; SSE-NEXT: store i16 [[CVT2]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @dst16, i32 0, i64 2), align 2
-; SSE-NEXT: store i16 [[CVT3]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @dst16, i32 0, i64 3), align 2
-; SSE-NEXT: store i16 [[CVT4]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @dst16, i32 0, i64 4), align 2
-; SSE-NEXT: store i16 [[CVT5]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @dst16, i32 0, i64 5), align 2
-; SSE-NEXT: store i16 [[CVT6]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @dst16, i32 0, i64 6), align 2
-; SSE-NEXT: store i16 [[CVT7]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @dst16, i32 0, i64 7), align 2
-; SSE-NEXT: ret void
-;
-; AVX-LABEL: @fptosi_8f64_8i16(
-; AVX-NEXT: [[TMP1:%.*]] = load <8 x double>, <8 x double>* bitcast ([8 x double]* @src64 to <8 x double>*), align 8
-; AVX-NEXT: [[TMP2:%.*]] = fptosi <8 x double> [[TMP1]] to <8 x i16>
-; AVX-NEXT: store <8 x i16> [[TMP2]], <8 x i16>* bitcast ([32 x i16]* @dst16 to <8 x i16>*), align 2
-; AVX-NEXT: ret void
+; CHECK-LABEL: @fptosi_8f64_8i16(
+; CHECK-NEXT: [[TMP1:%.*]] = load <8 x double>, <8 x double>* bitcast ([8 x double]* @src64 to <8 x double>*), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = fptosi <8 x double> [[TMP1]] to <8 x i16>
+; CHECK-NEXT: store <8 x i16> [[TMP2]], <8 x i16>* bitcast ([32 x i16]* @dst16 to <8 x i16>*), align 2
+; CHECK-NEXT: ret void
;
%a0 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 0), align 8
%a1 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 1), align 8
; SSE-NEXT: store i16 [[CVT7]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @dst16, i32 0, i64 7), align 2
; SSE-NEXT: ret void
;
-; AVX256NODQ-LABEL: @fptoui_8f64_8i16(
-; AVX256NODQ-NEXT: [[A0:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 0), align 8
-; AVX256NODQ-NEXT: [[A1:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 1), align 8
-; AVX256NODQ-NEXT: [[A2:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2), align 8
-; AVX256NODQ-NEXT: [[A3:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 3), align 8
-; AVX256NODQ-NEXT: [[A4:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4), align 8
-; AVX256NODQ-NEXT: [[A5:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 5), align 8
-; AVX256NODQ-NEXT: [[A6:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 6), align 8
-; AVX256NODQ-NEXT: [[A7:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 7), align 8
-; AVX256NODQ-NEXT: [[CVT0:%.*]] = fptoui double [[A0]] to i16
-; AVX256NODQ-NEXT: [[CVT1:%.*]] = fptoui double [[A1]] to i16
-; AVX256NODQ-NEXT: [[CVT2:%.*]] = fptoui double [[A2]] to i16
-; AVX256NODQ-NEXT: [[CVT3:%.*]] = fptoui double [[A3]] to i16
-; AVX256NODQ-NEXT: [[CVT4:%.*]] = fptoui double [[A4]] to i16
-; AVX256NODQ-NEXT: [[CVT5:%.*]] = fptoui double [[A5]] to i16
-; AVX256NODQ-NEXT: [[CVT6:%.*]] = fptoui double [[A6]] to i16
-; AVX256NODQ-NEXT: [[CVT7:%.*]] = fptoui double [[A7]] to i16
-; AVX256NODQ-NEXT: store i16 [[CVT0]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @dst16, i32 0, i64 0), align 2
-; AVX256NODQ-NEXT: store i16 [[CVT1]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @dst16, i32 0, i64 1), align 2
-; AVX256NODQ-NEXT: store i16 [[CVT2]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @dst16, i32 0, i64 2), align 2
-; AVX256NODQ-NEXT: store i16 [[CVT3]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @dst16, i32 0, i64 3), align 2
-; AVX256NODQ-NEXT: store i16 [[CVT4]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @dst16, i32 0, i64 4), align 2
-; AVX256NODQ-NEXT: store i16 [[CVT5]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @dst16, i32 0, i64 5), align 2
-; AVX256NODQ-NEXT: store i16 [[CVT6]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @dst16, i32 0, i64 6), align 2
-; AVX256NODQ-NEXT: store i16 [[CVT7]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @dst16, i32 0, i64 7), align 2
-; AVX256NODQ-NEXT: ret void
-;
-; AVX512-LABEL: @fptoui_8f64_8i16(
-; AVX512-NEXT: [[TMP1:%.*]] = load <8 x double>, <8 x double>* bitcast ([8 x double]* @src64 to <8 x double>*), align 8
-; AVX512-NEXT: [[TMP2:%.*]] = fptoui <8 x double> [[TMP1]] to <8 x i16>
-; AVX512-NEXT: store <8 x i16> [[TMP2]], <8 x i16>* bitcast ([32 x i16]* @dst16 to <8 x i16>*), align 2
-; AVX512-NEXT: ret void
-;
-; AVX256DQ-LABEL: @fptoui_8f64_8i16(
-; AVX256DQ-NEXT: [[TMP1:%.*]] = load <8 x double>, <8 x double>* bitcast ([8 x double]* @src64 to <8 x double>*), align 8
-; AVX256DQ-NEXT: [[TMP2:%.*]] = fptoui <8 x double> [[TMP1]] to <8 x i16>
-; AVX256DQ-NEXT: store <8 x i16> [[TMP2]], <8 x i16>* bitcast ([32 x i16]* @dst16 to <8 x i16>*), align 2
-; AVX256DQ-NEXT: ret void
+; AVX-LABEL: @fptoui_8f64_8i16(
+; AVX-NEXT: [[TMP1:%.*]] = load <8 x double>, <8 x double>* bitcast ([8 x double]* @src64 to <8 x double>*), align 8
+; AVX-NEXT: [[TMP2:%.*]] = fptoui <8 x double> [[TMP1]] to <8 x i16>
+; AVX-NEXT: store <8 x i16> [[TMP2]], <8 x i16>* bitcast ([32 x i16]* @dst16 to <8 x i16>*), align 2
+; AVX-NEXT: ret void
;
%a0 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 0), align 8
%a1 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 1), align 8
; ZEROTHRESH-NEXT: [[B3:%.*]] = extractelement <4 x float> [[B]], i32 3
; ZEROTHRESH-NEXT: [[CMP0:%.*]] = icmp ne i32 [[C0]], 0
; ZEROTHRESH-NEXT: [[CMP1:%.*]] = icmp ne i32 [[C1]], 0
-; ZEROTHRESH-NEXT: [[CMP2:%.*]] = icmp ne i32 [[C2]], 0
-; ZEROTHRESH-NEXT: [[CMP3:%.*]] = icmp ne i32 [[C3]], 0
+; ZEROTHRESH-NEXT: [[TMP1:%.*]] = insertelement <2 x i32> undef, i32 [[C2]], i32 0
+; ZEROTHRESH-NEXT: [[TMP2:%.*]] = insertelement <2 x i32> [[TMP1]], i32 [[C3]], i32 1
+; ZEROTHRESH-NEXT: [[TMP3:%.*]] = icmp ne <2 x i32> [[TMP2]], zeroinitializer
; ZEROTHRESH-NEXT: [[S0:%.*]] = select i1 [[CMP0]], float [[A0]], float [[B0]]
; ZEROTHRESH-NEXT: [[S1:%.*]] = select i1 [[CMP1]], float [[A1]], float [[B1]]
-; ZEROTHRESH-NEXT: [[S2:%.*]] = select i1 [[CMP2]], float [[A2]], float [[B2]]
-; ZEROTHRESH-NEXT: [[S3:%.*]] = select i1 [[CMP3]], float [[A3]], float [[B3]]
+; ZEROTHRESH-NEXT: [[TMP4:%.*]] = insertelement <2 x float> undef, float [[A2]], i32 0
+; ZEROTHRESH-NEXT: [[TMP5:%.*]] = insertelement <2 x float> [[TMP4]], float [[A3]], i32 1
+; ZEROTHRESH-NEXT: [[TMP6:%.*]] = insertelement <2 x float> undef, float [[B2]], i32 0
+; ZEROTHRESH-NEXT: [[TMP7:%.*]] = insertelement <2 x float> [[TMP6]], float [[B3]], i32 1
+; ZEROTHRESH-NEXT: [[TMP8:%.*]] = select <2 x i1> [[TMP3]], <2 x float> [[TMP5]], <2 x float> [[TMP7]]
; ZEROTHRESH-NEXT: [[RA:%.*]] = insertelement <4 x float> undef, float [[S0]], i32 0
; ZEROTHRESH-NEXT: [[RB:%.*]] = insertelement <4 x float> [[RA]], float [[S1]], i32 1
-; ZEROTHRESH-NEXT: [[RC:%.*]] = insertelement <4 x float> undef, float [[S2]], i32 2
-; ZEROTHRESH-NEXT: [[RD:%.*]] = insertelement <4 x float> [[RC]], float [[S3]], i32 3
+; ZEROTHRESH-NEXT: [[TMP9:%.*]] = extractelement <2 x float> [[TMP8]], i32 0
+; ZEROTHRESH-NEXT: [[RC:%.*]] = insertelement <4 x float> undef, float [[TMP9]], i32 2
+; ZEROTHRESH-NEXT: [[TMP10:%.*]] = extractelement <2 x float> [[TMP8]], i32 1
+; ZEROTHRESH-NEXT: [[RD:%.*]] = insertelement <4 x float> [[RC]], float [[TMP10]], i32 3
; ZEROTHRESH-NEXT: ret <4 x float> [[RD]]
;
%c0 = extractelement <4 x i32> %c, i32 0
; CHECK-NEXT: ret <2 x float> [[RB]]
;
; ZEROTHRESH-LABEL: @simple_select_v2(
-; ZEROTHRESH-NEXT: [[C0:%.*]] = extractelement <2 x i32> [[C:%.*]], i32 0
-; ZEROTHRESH-NEXT: [[C1:%.*]] = extractelement <2 x i32> [[C]], i32 1
-; ZEROTHRESH-NEXT: [[A0:%.*]] = extractelement <2 x float> [[A:%.*]], i32 0
-; ZEROTHRESH-NEXT: [[A1:%.*]] = extractelement <2 x float> [[A]], i32 1
-; ZEROTHRESH-NEXT: [[B0:%.*]] = extractelement <2 x float> [[B:%.*]], i32 0
-; ZEROTHRESH-NEXT: [[B1:%.*]] = extractelement <2 x float> [[B]], i32 1
-; ZEROTHRESH-NEXT: [[CMP0:%.*]] = icmp ne i32 [[C0]], 0
-; ZEROTHRESH-NEXT: [[CMP1:%.*]] = icmp ne i32 [[C1]], 0
-; ZEROTHRESH-NEXT: [[S0:%.*]] = select i1 [[CMP0]], float [[A0]], float [[B0]]
-; ZEROTHRESH-NEXT: [[S1:%.*]] = select i1 [[CMP1]], float [[A1]], float [[B1]]
-; ZEROTHRESH-NEXT: [[RA:%.*]] = insertelement <2 x float> undef, float [[S0]], i32 0
-; ZEROTHRESH-NEXT: [[RB:%.*]] = insertelement <2 x float> [[RA]], float [[S1]], i32 1
+; ZEROTHRESH-NEXT: [[TMP1:%.*]] = icmp ne <2 x i32> [[C:%.*]], zeroinitializer
+; ZEROTHRESH-NEXT: [[TMP2:%.*]] = select <2 x i1> [[TMP1]], <2 x float> [[A:%.*]], <2 x float> [[B:%.*]]
+; ZEROTHRESH-NEXT: [[TMP3:%.*]] = extractelement <2 x float> [[TMP2]], i32 0
+; ZEROTHRESH-NEXT: [[RA:%.*]] = insertelement <2 x float> undef, float [[TMP3]], i32 0
+; ZEROTHRESH-NEXT: [[TMP4:%.*]] = extractelement <2 x float> [[TMP2]], i32 1
+; ZEROTHRESH-NEXT: [[RB:%.*]] = insertelement <2 x float> [[RA]], float [[TMP4]], i32 1
; ZEROTHRESH-NEXT: ret <2 x float> [[RB]]
;
%c0 = extractelement <2 x i32> %c, i32 0
}
define void @sitofp_4i16_4f32() #0 {
-; CHECK-LABEL: @sitofp_4i16_4f32(
-; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i16>, <4 x i16>* bitcast ([32 x i16]* @src16 to <4 x i16>*), align 64
-; CHECK-NEXT: [[TMP2:%.*]] = sitofp <4 x i16> [[TMP1]] to <4 x float>
-; CHECK-NEXT: store <4 x float> [[TMP2]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 64
-; CHECK-NEXT: ret void
+; SSE-LABEL: @sitofp_4i16_4f32(
+; SSE-NEXT: [[LD0:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 0), align 64
+; SSE-NEXT: [[LD1:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 1), align 2
+; SSE-NEXT: [[LD2:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 2), align 4
+; SSE-NEXT: [[LD3:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 3), align 2
+; SSE-NEXT: [[CVT0:%.*]] = sitofp i16 [[LD0]] to float
+; SSE-NEXT: [[CVT1:%.*]] = sitofp i16 [[LD1]] to float
+; SSE-NEXT: [[CVT2:%.*]] = sitofp i16 [[LD2]] to float
+; SSE-NEXT: [[CVT3:%.*]] = sitofp i16 [[LD3]] to float
+; SSE-NEXT: store float [[CVT0]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 64
+; SSE-NEXT: store float [[CVT1]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4
+; SSE-NEXT: store float [[CVT2]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 2), align 8
+; SSE-NEXT: store float [[CVT3]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3), align 4
+; SSE-NEXT: ret void
+;
+; AVX-LABEL: @sitofp_4i16_4f32(
+; AVX-NEXT: [[TMP1:%.*]] = load <4 x i16>, <4 x i16>* bitcast ([32 x i16]* @src16 to <4 x i16>*), align 64
+; AVX-NEXT: [[TMP2:%.*]] = sitofp <4 x i16> [[TMP1]] to <4 x float>
+; AVX-NEXT: store <4 x float> [[TMP2]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 64
+; AVX-NEXT: ret void
;
%ld0 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 0), align 64
%ld1 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 1), align 2
define void @sitofp_8i16_8f32() #0 {
; SSE-LABEL: @sitofp_8i16_8f32(
-; SSE-NEXT: [[TMP1:%.*]] = load <4 x i16>, <4 x i16>* bitcast ([32 x i16]* @src16 to <4 x i16>*), align 64
-; SSE-NEXT: [[TMP2:%.*]] = load <4 x i16>, <4 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 4) to <4 x i16>*), align 8
-; SSE-NEXT: [[TMP3:%.*]] = sitofp <4 x i16> [[TMP1]] to <4 x float>
-; SSE-NEXT: [[TMP4:%.*]] = sitofp <4 x i16> [[TMP2]] to <4 x float>
-; SSE-NEXT: store <4 x float> [[TMP3]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 64
-; SSE-NEXT: store <4 x float> [[TMP4]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4) to <4 x float>*), align 16
+; SSE-NEXT: [[LD0:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 0), align 64
+; SSE-NEXT: [[LD1:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 1), align 2
+; SSE-NEXT: [[LD2:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 2), align 4
+; SSE-NEXT: [[LD3:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 3), align 2
+; SSE-NEXT: [[LD4:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 4), align 8
+; SSE-NEXT: [[LD5:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 5), align 2
+; SSE-NEXT: [[LD6:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 6), align 4
+; SSE-NEXT: [[LD7:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 7), align 2
+; SSE-NEXT: [[CVT0:%.*]] = sitofp i16 [[LD0]] to float
+; SSE-NEXT: [[CVT1:%.*]] = sitofp i16 [[LD1]] to float
+; SSE-NEXT: [[CVT2:%.*]] = sitofp i16 [[LD2]] to float
+; SSE-NEXT: [[CVT3:%.*]] = sitofp i16 [[LD3]] to float
+; SSE-NEXT: [[CVT4:%.*]] = sitofp i16 [[LD4]] to float
+; SSE-NEXT: [[CVT5:%.*]] = sitofp i16 [[LD5]] to float
+; SSE-NEXT: [[CVT6:%.*]] = sitofp i16 [[LD6]] to float
+; SSE-NEXT: [[CVT7:%.*]] = sitofp i16 [[LD7]] to float
+; SSE-NEXT: store float [[CVT0]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 64
+; SSE-NEXT: store float [[CVT1]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4
+; SSE-NEXT: store float [[CVT2]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 2), align 8
+; SSE-NEXT: store float [[CVT3]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3), align 4
+; SSE-NEXT: store float [[CVT4]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4), align 16
+; SSE-NEXT: store float [[CVT5]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 5), align 4
+; SSE-NEXT: store float [[CVT6]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 6), align 8
+; SSE-NEXT: store float [[CVT7]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 7), align 4
; SSE-NEXT: ret void
;
; AVX-LABEL: @sitofp_8i16_8f32(
define void @sitofp_16i16_16f32() #0 {
; SSE-LABEL: @sitofp_16i16_16f32(
-; SSE-NEXT: [[TMP1:%.*]] = load <4 x i16>, <4 x i16>* bitcast ([32 x i16]* @src16 to <4 x i16>*), align 64
-; SSE-NEXT: [[TMP2:%.*]] = load <4 x i16>, <4 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 4) to <4 x i16>*), align 8
-; SSE-NEXT: [[TMP3:%.*]] = load <4 x i16>, <4 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 8) to <4 x i16>*), align 16
-; SSE-NEXT: [[TMP4:%.*]] = load <4 x i16>, <4 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 12) to <4 x i16>*), align 8
-; SSE-NEXT: [[TMP5:%.*]] = sitofp <4 x i16> [[TMP1]] to <4 x float>
-; SSE-NEXT: [[TMP6:%.*]] = sitofp <4 x i16> [[TMP2]] to <4 x float>
-; SSE-NEXT: [[TMP7:%.*]] = sitofp <4 x i16> [[TMP3]] to <4 x float>
-; SSE-NEXT: [[TMP8:%.*]] = sitofp <4 x i16> [[TMP4]] to <4 x float>
-; SSE-NEXT: store <4 x float> [[TMP5]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 64
-; SSE-NEXT: store <4 x float> [[TMP6]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4) to <4 x float>*), align 16
-; SSE-NEXT: store <4 x float> [[TMP7]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8) to <4 x float>*), align 32
-; SSE-NEXT: store <4 x float> [[TMP8]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 12) to <4 x float>*), align 16
+; SSE-NEXT: [[LD0:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 0), align 64
+; SSE-NEXT: [[LD1:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 1), align 2
+; SSE-NEXT: [[LD2:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 2), align 4
+; SSE-NEXT: [[LD3:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 3), align 2
+; SSE-NEXT: [[LD4:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 4), align 8
+; SSE-NEXT: [[LD5:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 5), align 2
+; SSE-NEXT: [[LD6:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 6), align 4
+; SSE-NEXT: [[LD7:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 7), align 2
+; SSE-NEXT: [[LD8:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 8), align 16
+; SSE-NEXT: [[LD9:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 9), align 2
+; SSE-NEXT: [[LD10:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 10), align 4
+; SSE-NEXT: [[LD11:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 11), align 2
+; SSE-NEXT: [[LD12:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 12), align 8
+; SSE-NEXT: [[LD13:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 13), align 2
+; SSE-NEXT: [[LD14:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 14), align 4
+; SSE-NEXT: [[LD15:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 15), align 2
+; SSE-NEXT: [[CVT0:%.*]] = sitofp i16 [[LD0]] to float
+; SSE-NEXT: [[CVT1:%.*]] = sitofp i16 [[LD1]] to float
+; SSE-NEXT: [[CVT2:%.*]] = sitofp i16 [[LD2]] to float
+; SSE-NEXT: [[CVT3:%.*]] = sitofp i16 [[LD3]] to float
+; SSE-NEXT: [[CVT4:%.*]] = sitofp i16 [[LD4]] to float
+; SSE-NEXT: [[CVT5:%.*]] = sitofp i16 [[LD5]] to float
+; SSE-NEXT: [[CVT6:%.*]] = sitofp i16 [[LD6]] to float
+; SSE-NEXT: [[CVT7:%.*]] = sitofp i16 [[LD7]] to float
+; SSE-NEXT: [[CVT8:%.*]] = sitofp i16 [[LD8]] to float
+; SSE-NEXT: [[CVT9:%.*]] = sitofp i16 [[LD9]] to float
+; SSE-NEXT: [[CVT10:%.*]] = sitofp i16 [[LD10]] to float
+; SSE-NEXT: [[CVT11:%.*]] = sitofp i16 [[LD11]] to float
+; SSE-NEXT: [[CVT12:%.*]] = sitofp i16 [[LD12]] to float
+; SSE-NEXT: [[CVT13:%.*]] = sitofp i16 [[LD13]] to float
+; SSE-NEXT: [[CVT14:%.*]] = sitofp i16 [[LD14]] to float
+; SSE-NEXT: [[CVT15:%.*]] = sitofp i16 [[LD15]] to float
+; SSE-NEXT: store float [[CVT0]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 64
+; SSE-NEXT: store float [[CVT1]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4
+; SSE-NEXT: store float [[CVT2]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 2), align 8
+; SSE-NEXT: store float [[CVT3]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3), align 4
+; SSE-NEXT: store float [[CVT4]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4), align 16
+; SSE-NEXT: store float [[CVT5]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 5), align 4
+; SSE-NEXT: store float [[CVT6]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 6), align 8
+; SSE-NEXT: store float [[CVT7]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 7), align 4
+; SSE-NEXT: store float [[CVT8]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8), align 32
+; SSE-NEXT: store float [[CVT9]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 9), align 4
+; SSE-NEXT: store float [[CVT10]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 10), align 8
+; SSE-NEXT: store float [[CVT11]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 11), align 4
+; SSE-NEXT: store float [[CVT12]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 12), align 16
+; SSE-NEXT: store float [[CVT13]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 13), align 4
+; SSE-NEXT: store float [[CVT14]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 14), align 8
+; SSE-NEXT: store float [[CVT15]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 15), align 4
; SSE-NEXT: ret void
;
; AVX256-LABEL: @sitofp_16i16_16f32(
}
define void @uitofp_4i16_4f32() #0 {
-; CHECK-LABEL: @uitofp_4i16_4f32(
-; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i16>, <4 x i16>* bitcast ([32 x i16]* @src16 to <4 x i16>*), align 64
-; CHECK-NEXT: [[TMP2:%.*]] = uitofp <4 x i16> [[TMP1]] to <4 x float>
-; CHECK-NEXT: store <4 x float> [[TMP2]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 64
-; CHECK-NEXT: ret void
+; SSE-LABEL: @uitofp_4i16_4f32(
+; SSE-NEXT: [[LD0:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 0), align 64
+; SSE-NEXT: [[LD1:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 1), align 2
+; SSE-NEXT: [[LD2:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 2), align 4
+; SSE-NEXT: [[LD3:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 3), align 2
+; SSE-NEXT: [[CVT0:%.*]] = uitofp i16 [[LD0]] to float
+; SSE-NEXT: [[CVT1:%.*]] = uitofp i16 [[LD1]] to float
+; SSE-NEXT: [[CVT2:%.*]] = uitofp i16 [[LD2]] to float
+; SSE-NEXT: [[CVT3:%.*]] = uitofp i16 [[LD3]] to float
+; SSE-NEXT: store float [[CVT0]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 64
+; SSE-NEXT: store float [[CVT1]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4
+; SSE-NEXT: store float [[CVT2]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 2), align 8
+; SSE-NEXT: store float [[CVT3]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3), align 4
+; SSE-NEXT: ret void
+;
+; AVX-LABEL: @uitofp_4i16_4f32(
+; AVX-NEXT: [[TMP1:%.*]] = load <4 x i16>, <4 x i16>* bitcast ([32 x i16]* @src16 to <4 x i16>*), align 64
+; AVX-NEXT: [[TMP2:%.*]] = uitofp <4 x i16> [[TMP1]] to <4 x float>
+; AVX-NEXT: store <4 x float> [[TMP2]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 64
+; AVX-NEXT: ret void
;
%ld0 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 0), align 64
%ld1 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 1), align 2
define void @uitofp_8i16_8f32() #0 {
; SSE-LABEL: @uitofp_8i16_8f32(
-; SSE-NEXT: [[TMP1:%.*]] = load <4 x i16>, <4 x i16>* bitcast ([32 x i16]* @src16 to <4 x i16>*), align 64
-; SSE-NEXT: [[TMP2:%.*]] = load <4 x i16>, <4 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 4) to <4 x i16>*), align 8
-; SSE-NEXT: [[TMP3:%.*]] = uitofp <4 x i16> [[TMP1]] to <4 x float>
-; SSE-NEXT: [[TMP4:%.*]] = uitofp <4 x i16> [[TMP2]] to <4 x float>
-; SSE-NEXT: store <4 x float> [[TMP3]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 64
-; SSE-NEXT: store <4 x float> [[TMP4]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4) to <4 x float>*), align 16
+; SSE-NEXT: [[LD0:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 0), align 64
+; SSE-NEXT: [[LD1:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 1), align 2
+; SSE-NEXT: [[LD2:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 2), align 4
+; SSE-NEXT: [[LD3:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 3), align 2
+; SSE-NEXT: [[LD4:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 4), align 8
+; SSE-NEXT: [[LD5:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 5), align 2
+; SSE-NEXT: [[LD6:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 6), align 4
+; SSE-NEXT: [[LD7:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 7), align 2
+; SSE-NEXT: [[CVT0:%.*]] = uitofp i16 [[LD0]] to float
+; SSE-NEXT: [[CVT1:%.*]] = uitofp i16 [[LD1]] to float
+; SSE-NEXT: [[CVT2:%.*]] = uitofp i16 [[LD2]] to float
+; SSE-NEXT: [[CVT3:%.*]] = uitofp i16 [[LD3]] to float
+; SSE-NEXT: [[CVT4:%.*]] = uitofp i16 [[LD4]] to float
+; SSE-NEXT: [[CVT5:%.*]] = uitofp i16 [[LD5]] to float
+; SSE-NEXT: [[CVT6:%.*]] = uitofp i16 [[LD6]] to float
+; SSE-NEXT: [[CVT7:%.*]] = uitofp i16 [[LD7]] to float
+; SSE-NEXT: store float [[CVT0]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 64
+; SSE-NEXT: store float [[CVT1]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4
+; SSE-NEXT: store float [[CVT2]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 2), align 8
+; SSE-NEXT: store float [[CVT3]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3), align 4
+; SSE-NEXT: store float [[CVT4]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4), align 16
+; SSE-NEXT: store float [[CVT5]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 5), align 4
+; SSE-NEXT: store float [[CVT6]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 6), align 8
+; SSE-NEXT: store float [[CVT7]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 7), align 4
; SSE-NEXT: ret void
;
; AVX-LABEL: @uitofp_8i16_8f32(
define void @uitofp_16i16_16f32() #0 {
; SSE-LABEL: @uitofp_16i16_16f32(
-; SSE-NEXT: [[TMP1:%.*]] = load <4 x i16>, <4 x i16>* bitcast ([32 x i16]* @src16 to <4 x i16>*), align 64
-; SSE-NEXT: [[TMP2:%.*]] = load <4 x i16>, <4 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 4) to <4 x i16>*), align 8
-; SSE-NEXT: [[TMP3:%.*]] = load <4 x i16>, <4 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 8) to <4 x i16>*), align 16
-; SSE-NEXT: [[TMP4:%.*]] = load <4 x i16>, <4 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 12) to <4 x i16>*), align 8
-; SSE-NEXT: [[TMP5:%.*]] = uitofp <4 x i16> [[TMP1]] to <4 x float>
-; SSE-NEXT: [[TMP6:%.*]] = uitofp <4 x i16> [[TMP2]] to <4 x float>
-; SSE-NEXT: [[TMP7:%.*]] = uitofp <4 x i16> [[TMP3]] to <4 x float>
-; SSE-NEXT: [[TMP8:%.*]] = uitofp <4 x i16> [[TMP4]] to <4 x float>
-; SSE-NEXT: store <4 x float> [[TMP5]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 64
-; SSE-NEXT: store <4 x float> [[TMP6]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4) to <4 x float>*), align 16
-; SSE-NEXT: store <4 x float> [[TMP7]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8) to <4 x float>*), align 32
-; SSE-NEXT: store <4 x float> [[TMP8]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 12) to <4 x float>*), align 16
+; SSE-NEXT: [[LD0:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 0), align 64
+; SSE-NEXT: [[LD1:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 1), align 2
+; SSE-NEXT: [[LD2:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 2), align 4
+; SSE-NEXT: [[LD3:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 3), align 2
+; SSE-NEXT: [[LD4:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 4), align 8
+; SSE-NEXT: [[LD5:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 5), align 2
+; SSE-NEXT: [[LD6:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 6), align 4
+; SSE-NEXT: [[LD7:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 7), align 2
+; SSE-NEXT: [[LD8:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 8), align 16
+; SSE-NEXT: [[LD9:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 9), align 2
+; SSE-NEXT: [[LD10:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 10), align 4
+; SSE-NEXT: [[LD11:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 11), align 2
+; SSE-NEXT: [[LD12:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 12), align 8
+; SSE-NEXT: [[LD13:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 13), align 2
+; SSE-NEXT: [[LD14:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 14), align 4
+; SSE-NEXT: [[LD15:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 15), align 2
+; SSE-NEXT: [[CVT0:%.*]] = uitofp i16 [[LD0]] to float
+; SSE-NEXT: [[CVT1:%.*]] = uitofp i16 [[LD1]] to float
+; SSE-NEXT: [[CVT2:%.*]] = uitofp i16 [[LD2]] to float
+; SSE-NEXT: [[CVT3:%.*]] = uitofp i16 [[LD3]] to float
+; SSE-NEXT: [[CVT4:%.*]] = uitofp i16 [[LD4]] to float
+; SSE-NEXT: [[CVT5:%.*]] = uitofp i16 [[LD5]] to float
+; SSE-NEXT: [[CVT6:%.*]] = uitofp i16 [[LD6]] to float
+; SSE-NEXT: [[CVT7:%.*]] = uitofp i16 [[LD7]] to float
+; SSE-NEXT: [[CVT8:%.*]] = uitofp i16 [[LD8]] to float
+; SSE-NEXT: [[CVT9:%.*]] = uitofp i16 [[LD9]] to float
+; SSE-NEXT: [[CVT10:%.*]] = uitofp i16 [[LD10]] to float
+; SSE-NEXT: [[CVT11:%.*]] = uitofp i16 [[LD11]] to float
+; SSE-NEXT: [[CVT12:%.*]] = uitofp i16 [[LD12]] to float
+; SSE-NEXT: [[CVT13:%.*]] = uitofp i16 [[LD13]] to float
+; SSE-NEXT: [[CVT14:%.*]] = uitofp i16 [[LD14]] to float
+; SSE-NEXT: [[CVT15:%.*]] = uitofp i16 [[LD15]] to float
+; SSE-NEXT: store float [[CVT0]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 64
+; SSE-NEXT: store float [[CVT1]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4
+; SSE-NEXT: store float [[CVT2]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 2), align 8
+; SSE-NEXT: store float [[CVT3]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3), align 4
+; SSE-NEXT: store float [[CVT4]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4), align 16
+; SSE-NEXT: store float [[CVT5]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 5), align 4
+; SSE-NEXT: store float [[CVT6]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 6), align 8
+; SSE-NEXT: store float [[CVT7]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 7), align 4
+; SSE-NEXT: store float [[CVT8]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8), align 32
+; SSE-NEXT: store float [[CVT9]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 9), align 4
+; SSE-NEXT: store float [[CVT10]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 10), align 8
+; SSE-NEXT: store float [[CVT11]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 11), align 4
+; SSE-NEXT: store float [[CVT12]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 12), align 16
+; SSE-NEXT: store float [[CVT13]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 13), align 4
+; SSE-NEXT: store float [[CVT14]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 14), align 8
+; SSE-NEXT: store float [[CVT15]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 15), align 4
; SSE-NEXT: ret void
;
; AVX256-LABEL: @uitofp_16i16_16f32(