From: Nikita Popov
Date: Tue, 18 Dec 2018 13:22:53 +0000 (+0000)
Subject: [SelectionDAG][X86] Fix [US](ADD|SUB)SAT vector legalization, add tests
X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=7c859dd19bcdcba0bfa73ee76ae0a7955398835a;p=llvm

[SelectionDAG][X86] Fix [US](ADD|SUB)SAT vector legalization, add tests

Integer result promotion needs to use the scalar size, and we need
support for result widening. This is in preparation for D55787.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@349480 91177308-0d34-0410-b5e6-96231b3b80d8
---

diff --git a/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp b/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
index 96d1c3d75e4..25fa2a0a4af 100644
--- a/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
+++ b/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
@@ -590,7 +590,7 @@ SDValue DAGTypeLegalizer::PromoteIntRes_ADDSUBSAT(SDNode *N) {
   SDLoc dl(N);
   SDValue Op1 = N->getOperand(0);
   SDValue Op2 = N->getOperand(1);
-  unsigned OldBits = Op1.getValueSizeInBits();
+  unsigned OldBits = Op1.getScalarValueSizeInBits();
 
   unsigned Opcode = N->getOpcode();
   unsigned ShiftOp;
@@ -612,7 +612,7 @@ SDValue DAGTypeLegalizer::PromoteIntRes_ADDSUBSAT(SDNode *N) {
   SDValue Op2Promoted = GetPromotedInteger(Op2);
 
   EVT PromotedType = Op1Promoted.getValueType();
-  unsigned NewBits = Op1Promoted.getValueSizeInBits();
+  unsigned NewBits = PromotedType.getScalarSizeInBits();
   unsigned SHLAmount = NewBits - OldBits;
   EVT SHVT = TLI.getShiftAmountTy(PromotedType, DAG.getDataLayout());
   SDValue ShiftAmount = DAG.getConstant(SHLAmount, dl, SHVT);
diff --git a/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp b/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
index a40618b0ed2..f367e935857 100644
--- a/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
+++ b/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
@@ -2425,6 +2425,10 @@ void DAGTypeLegalizer::WidenVectorResult(SDNode *N, unsigned ResNo) {
   case ISD::SMAX:
   case ISD::UMIN:
   case ISD::UMAX:
+  case ISD::UADDSAT:
+  case ISD::SADDSAT:
+  case ISD::USUBSAT:
+  case ISD::SSUBSAT:
     Res = WidenVecRes_Binary(N);
     break;
 
diff --git a/test/CodeGen/X86/sadd_sat_vec.ll b/test/CodeGen/X86/sadd_sat_vec.ll
new file mode 100644
index 00000000000..33547f78fec
--- /dev/null
+++ b/test/CodeGen/X86/sadd_sat_vec.ll
@@ -0,0 +1,20167 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=SSE,SSE2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+ssse3 | FileCheck %s --check-prefixes=SSE,SSSE3
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=SSE,SSE41
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX,AVX512
+
+declare <1 x i8> @llvm.sadd.sat.v1i8(<1 x i8>, <1 x i8>)
+declare <2 x i8> @llvm.sadd.sat.v2i8(<2 x i8>, <2 x i8>)
+declare <4 x i8> @llvm.sadd.sat.v4i8(<4 x i8>, <4 x i8>)
+declare <8 x i8> @llvm.sadd.sat.v8i8(<8 x i8>, <8 x i8>)
+declare <12 x i8> @llvm.sadd.sat.v12i8(<12 x i8>, <12 x i8>)
+declare <16 x i8> @llvm.sadd.sat.v16i8(<16 x i8>, <16 x i8>)
+declare <32 x i8> @llvm.sadd.sat.v32i8(<32 x i8>, <32 x i8>)
+declare <64 x i8> @llvm.sadd.sat.v64i8(<64 x i8>, <64 x i8>)
+
+declare <1 x i16> @llvm.sadd.sat.v1i16(<1 x i16>, <1 x i16>) +declare <2 x i16> @llvm.sadd.sat.v2i16(<2 x i16>, <2 x i16>) +declare <4 x i16> @llvm.sadd.sat.v4i16(<4 x i16>, <4 x i16>) +declare <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16>, <8 x i16>) +declare <12 x i16> @llvm.sadd.sat.v12i16(<12 x i16>, <12 x i16>) +declare <16 x i16> @llvm.sadd.sat.v16i16(<16 x i16>, <16 x i16>) +declare <32 x i16> @llvm.sadd.sat.v32i16(<32 x i16>, <32 x i16>) + +declare <16 x i1> @llvm.sadd.sat.v16i1(<16 x i1>, <16 x i1>) +declare <16 x i4> @llvm.sadd.sat.v16i4(<16 x i4>, <16 x i4>) + +declare <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32>, <4 x i32>) +declare <2 x i32> @llvm.sadd.sat.v2i32(<2 x i32>, <2 x i32>) +declare <4 x i24> @llvm.sadd.sat.v4i24(<4 x i24>, <4 x i24>) +declare <2 x i128> @llvm.sadd.sat.v2i128(<2 x i128>, <2 x i128>) + +; Legal types, depending on architecture. + +define <16 x i8> @v16i8(<16 x i8> %x, <16 x i8> %y) nounwind { +; SSE2-LABEL: v16i8: +; SSE2: # %bb.0: +; SSE2-NEXT: pushq %rbp +; SSE2-NEXT: pushq %r15 +; SSE2-NEXT: pushq %r14 +; SSE2-NEXT: pushq %r13 +; SSE2-NEXT: pushq %r12 +; SSE2-NEXT: pushq %rbx +; SSE2-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) +; SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %r9b +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %sil +; SSE2-NEXT: movl %r9d, %eax +; SSE2-NEXT: addb %cl, %al +; SSE2-NEXT: setns %al +; SSE2-NEXT: addb %cl, %r9b +; SSE2-NEXT: jno .LBB0_2 +; SSE2-NEXT: # %bb.1: +; SSE2-NEXT: addb $127, %al +; SSE2-NEXT: movl %eax, %r9d +; SSE2-NEXT: .LBB0_2: +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSE2-NEXT: movl %esi, %eax +; SSE2-NEXT: addb %cl, %al +; SSE2-NEXT: setns %al +; SSE2-NEXT: addb %cl, %sil +; SSE2-NEXT: jno .LBB0_4 +; SSE2-NEXT: # %bb.3: +; SSE2-NEXT: addb $127, %al +; SSE2-NEXT: movl %eax, %esi +; SSE2-NEXT: .LBB0_4: +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %bl +; SSE2-NEXT: movl %ebx, %eax +; SSE2-NEXT: addb %cl, %al +; SSE2-NEXT: setns %al +; SSE2-NEXT: addb %cl, %bl +; SSE2-NEXT: jno .LBB0_6 +; SSE2-NEXT: # %bb.5: +; SSE2-NEXT: addb $127, %al +; SSE2-NEXT: movl %eax, %ebx +; SSE2-NEXT: .LBB0_6: +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %dl +; SSE2-NEXT: movl %edx, %eax +; SSE2-NEXT: addb %cl, %al +; SSE2-NEXT: setns %al +; SSE2-NEXT: addb %cl, %dl +; SSE2-NEXT: jno .LBB0_8 +; SSE2-NEXT: # %bb.7: +; SSE2-NEXT: addb $127, %al +; SSE2-NEXT: movl %eax, %edx +; SSE2-NEXT: .LBB0_8: +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %r10b +; SSE2-NEXT: movl %r10d, %eax +; SSE2-NEXT: addb %cl, %al +; SSE2-NEXT: setns %al +; SSE2-NEXT: addb %cl, %r10b +; SSE2-NEXT: jno .LBB0_10 +; SSE2-NEXT: # %bb.9: +; SSE2-NEXT: addb $127, %al +; SSE2-NEXT: movl %eax, %r10d +; SSE2-NEXT: .LBB0_10: +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %r11b +; SSE2-NEXT: movl %r11d, %eax +; SSE2-NEXT: addb %cl, %al +; SSE2-NEXT: setns %al +; SSE2-NEXT: addb %cl, %r11b +; SSE2-NEXT: jno .LBB0_12 +; SSE2-NEXT: # %bb.11: +; SSE2-NEXT: addb $127, %al +; SSE2-NEXT: movl %eax, %r11d +; SSE2-NEXT: .LBB0_12: +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %bpl +; SSE2-NEXT: movl %ebp, %eax +; SSE2-NEXT: addb %cl, %al +; SSE2-NEXT: setns %al +; SSE2-NEXT: addb %cl, %bpl +; SSE2-NEXT: jno .LBB0_14 +; SSE2-NEXT: # %bb.13: +; SSE2-NEXT: addb $127, %al +; SSE2-NEXT: movl %eax, %ebp +; SSE2-NEXT: .LBB0_14: +; SSE2-NEXT: movb 
-{{[0-9]+}}(%rsp), %cl +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %r14b +; SSE2-NEXT: movl %r14d, %eax +; SSE2-NEXT: addb %cl, %al +; SSE2-NEXT: setns %al +; SSE2-NEXT: addb %cl, %r14b +; SSE2-NEXT: jno .LBB0_16 +; SSE2-NEXT: # %bb.15: +; SSE2-NEXT: addb $127, %al +; SSE2-NEXT: movl %eax, %r14d +; SSE2-NEXT: .LBB0_16: +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %r15b +; SSE2-NEXT: movl %r15d, %eax +; SSE2-NEXT: addb %cl, %al +; SSE2-NEXT: setns %al +; SSE2-NEXT: addb %cl, %r15b +; SSE2-NEXT: jno .LBB0_18 +; SSE2-NEXT: # %bb.17: +; SSE2-NEXT: addb $127, %al +; SSE2-NEXT: movl %eax, %r15d +; SSE2-NEXT: .LBB0_18: +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %r12b +; SSE2-NEXT: movl %r12d, %eax +; SSE2-NEXT: addb %cl, %al +; SSE2-NEXT: setns %al +; SSE2-NEXT: addb %cl, %r12b +; SSE2-NEXT: jno .LBB0_20 +; SSE2-NEXT: # %bb.19: +; SSE2-NEXT: addb $127, %al +; SSE2-NEXT: movl %eax, %r12d +; SSE2-NEXT: .LBB0_20: +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %r13b +; SSE2-NEXT: movl %r13d, %eax +; SSE2-NEXT: addb %cl, %al +; SSE2-NEXT: setns %al +; SSE2-NEXT: addb %cl, %r13b +; SSE2-NEXT: movb %bl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSE2-NEXT: movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSE2-NEXT: jno .LBB0_22 +; SSE2-NEXT: # %bb.21: +; SSE2-NEXT: addb $127, %al +; SSE2-NEXT: movl %eax, %r13d +; SSE2-NEXT: .LBB0_22: +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %dil +; SSE2-NEXT: movl %edi, %eax +; SSE2-NEXT: addb %cl, %al +; SSE2-NEXT: setns %al +; SSE2-NEXT: addb %cl, %dil +; SSE2-NEXT: jno .LBB0_24 +; SSE2-NEXT: # %bb.23: +; SSE2-NEXT: addb $127, %al +; SSE2-NEXT: movl %eax, %edi +; SSE2-NEXT: .LBB0_24: +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %r8b +; SSE2-NEXT: movl %r8d, %eax +; SSE2-NEXT: addb %cl, %al +; SSE2-NEXT: setns %al +; SSE2-NEXT: addb %cl, %r8b +; SSE2-NEXT: jno .LBB0_26 +; SSE2-NEXT: # %bb.25: +; SSE2-NEXT: addb $127, %al +; SSE2-NEXT: movl %eax, %r8d +; SSE2-NEXT: .LBB0_26: +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %bl +; SSE2-NEXT: movl %ebx, %eax +; SSE2-NEXT: addb %cl, %al +; SSE2-NEXT: setns %al +; SSE2-NEXT: addb %cl, %bl +; SSE2-NEXT: jno .LBB0_28 +; SSE2-NEXT: # %bb.27: +; SSE2-NEXT: addb $127, %al +; SSE2-NEXT: movl %eax, %ebx +; SSE2-NEXT: .LBB0_28: +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %dl +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSE2-NEXT: movl %eax, %ecx +; SSE2-NEXT: addb %dl, %cl +; SSE2-NEXT: setns %cl +; SSE2-NEXT: addb %dl, %al +; SSE2-NEXT: movb %sil, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSE2-NEXT: jno .LBB0_30 +; SSE2-NEXT: # %bb.29: +; SSE2-NEXT: addb $127, %cl +; SSE2-NEXT: movl %ecx, %eax +; SSE2-NEXT: .LBB0_30: +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %dl +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %sil +; SSE2-NEXT: movl %esi, %ecx +; SSE2-NEXT: addb %dl, %cl +; SSE2-NEXT: setns %cl +; SSE2-NEXT: addb %dl, %sil +; SSE2-NEXT: jno .LBB0_32 +; SSE2-NEXT: # %bb.31: +; SSE2-NEXT: addb $127, %cl +; SSE2-NEXT: movl %ecx, %esi +; SSE2-NEXT: .LBB0_32: +; SSE2-NEXT: movzbl %sil, %ecx +; SSE2-NEXT: movd %ecx, %xmm0 +; SSE2-NEXT: movzbl %al, %eax +; SSE2-NEXT: movd %eax, %xmm1 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; SSE2-NEXT: movzbl %bl, %eax +; SSE2-NEXT: movd %eax, %xmm0 +; SSE2-NEXT: movzbl %r8b, %eax +; 
SSE2-NEXT: movd %eax, %xmm2 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; SSE2-NEXT: movzbl %dil, %eax +; SSE2-NEXT: movd %eax, %xmm0 +; SSE2-NEXT: movzbl %r13b, %eax +; SSE2-NEXT: movd %eax, %xmm3 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] +; SSE2-NEXT: movzbl %r12b, %eax +; SSE2-NEXT: movd %eax, %xmm0 +; SSE2-NEXT: movzbl %r15b, %eax +; SSE2-NEXT: movd %eax, %xmm1 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; SSE2-NEXT: movzbl %r14b, %eax +; SSE2-NEXT: movd %eax, %xmm0 +; SSE2-NEXT: movzbl %bpl, %eax +; SSE2-NEXT: movd %eax, %xmm2 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] +; SSE2-NEXT: movzbl %r11b, %eax +; SSE2-NEXT: movd %eax, %xmm0 +; SSE2-NEXT: movzbl %r10b, %eax +; SSE2-NEXT: movd %eax, %xmm3 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; SSE2-NEXT: movd %eax, %xmm0 +; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; SSE2-NEXT: movd %eax, %xmm2 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] +; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; SSE2-NEXT: movd %eax, %xmm4 +; SSE2-NEXT: movzbl %r9b, %eax +; SSE2-NEXT: movd %eax, %xmm0 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSE2-NEXT: popq %rbx +; SSE2-NEXT: popq %r12 +; SSE2-NEXT: popq %r13 +; SSE2-NEXT: popq %r14 +; SSE2-NEXT: popq %r15 +; SSE2-NEXT: popq %rbp +; SSE2-NEXT: retq +; +; SSSE3-LABEL: v16i8: +; SSSE3: # %bb.0: +; SSSE3-NEXT: pushq %rbp +; SSSE3-NEXT: pushq %r15 +; SSSE3-NEXT: pushq %r14 +; SSSE3-NEXT: pushq %r13 +; SSSE3-NEXT: pushq %r12 +; SSSE3-NEXT: pushq %rbx +; SSSE3-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) +; SSSE3-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %r9b +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %sil +; SSSE3-NEXT: movl %r9d, %eax +; SSSE3-NEXT: addb %cl, %al +; SSSE3-NEXT: setns %al +; SSSE3-NEXT: addb %cl, %r9b +; SSSE3-NEXT: jno .LBB0_2 +; SSSE3-NEXT: # %bb.1: +; SSSE3-NEXT: addb $127, %al +; SSSE3-NEXT: movl %eax, %r9d +; SSSE3-NEXT: .LBB0_2: +; 
SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSSE3-NEXT: movl %esi, %eax +; SSSE3-NEXT: addb %cl, %al +; SSSE3-NEXT: setns %al +; SSSE3-NEXT: addb %cl, %sil +; SSSE3-NEXT: jno .LBB0_4 +; SSSE3-NEXT: # %bb.3: +; SSSE3-NEXT: addb $127, %al +; SSSE3-NEXT: movl %eax, %esi +; SSSE3-NEXT: .LBB0_4: +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %bl +; SSSE3-NEXT: movl %ebx, %eax +; SSSE3-NEXT: addb %cl, %al +; SSSE3-NEXT: setns %al +; SSSE3-NEXT: addb %cl, %bl +; SSSE3-NEXT: jno .LBB0_6 +; SSSE3-NEXT: # %bb.5: +; SSSE3-NEXT: addb $127, %al +; SSSE3-NEXT: movl %eax, %ebx +; SSSE3-NEXT: .LBB0_6: +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %dl +; SSSE3-NEXT: movl %edx, %eax +; SSSE3-NEXT: addb %cl, %al +; SSSE3-NEXT: setns %al +; SSSE3-NEXT: addb %cl, %dl +; SSSE3-NEXT: jno .LBB0_8 +; SSSE3-NEXT: # %bb.7: +; SSSE3-NEXT: addb $127, %al +; SSSE3-NEXT: movl %eax, %edx +; SSSE3-NEXT: .LBB0_8: +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %r10b +; SSSE3-NEXT: movl %r10d, %eax +; SSSE3-NEXT: addb %cl, %al +; SSSE3-NEXT: setns %al +; SSSE3-NEXT: addb %cl, %r10b +; SSSE3-NEXT: jno .LBB0_10 +; SSSE3-NEXT: # %bb.9: +; SSSE3-NEXT: addb $127, %al +; SSSE3-NEXT: movl %eax, %r10d +; SSSE3-NEXT: .LBB0_10: +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %r11b +; SSSE3-NEXT: movl %r11d, %eax +; SSSE3-NEXT: addb %cl, %al +; SSSE3-NEXT: setns %al +; SSSE3-NEXT: addb %cl, %r11b +; SSSE3-NEXT: jno .LBB0_12 +; SSSE3-NEXT: # %bb.11: +; SSSE3-NEXT: addb $127, %al +; SSSE3-NEXT: movl %eax, %r11d +; SSSE3-NEXT: .LBB0_12: +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %bpl +; SSSE3-NEXT: movl %ebp, %eax +; SSSE3-NEXT: addb %cl, %al +; SSSE3-NEXT: setns %al +; SSSE3-NEXT: addb %cl, %bpl +; SSSE3-NEXT: jno .LBB0_14 +; SSSE3-NEXT: # %bb.13: +; SSSE3-NEXT: addb $127, %al +; SSSE3-NEXT: movl %eax, %ebp +; SSSE3-NEXT: .LBB0_14: +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %r14b +; SSSE3-NEXT: movl %r14d, %eax +; SSSE3-NEXT: addb %cl, %al +; SSSE3-NEXT: setns %al +; SSSE3-NEXT: addb %cl, %r14b +; SSSE3-NEXT: jno .LBB0_16 +; SSSE3-NEXT: # %bb.15: +; SSSE3-NEXT: addb $127, %al +; SSSE3-NEXT: movl %eax, %r14d +; SSSE3-NEXT: .LBB0_16: +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %r15b +; SSSE3-NEXT: movl %r15d, %eax +; SSSE3-NEXT: addb %cl, %al +; SSSE3-NEXT: setns %al +; SSSE3-NEXT: addb %cl, %r15b +; SSSE3-NEXT: jno .LBB0_18 +; SSSE3-NEXT: # %bb.17: +; SSSE3-NEXT: addb $127, %al +; SSSE3-NEXT: movl %eax, %r15d +; SSSE3-NEXT: .LBB0_18: +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %r12b +; SSSE3-NEXT: movl %r12d, %eax +; SSSE3-NEXT: addb %cl, %al +; SSSE3-NEXT: setns %al +; SSSE3-NEXT: addb %cl, %r12b +; SSSE3-NEXT: jno .LBB0_20 +; SSSE3-NEXT: # %bb.19: +; SSSE3-NEXT: addb $127, %al +; SSSE3-NEXT: movl %eax, %r12d +; SSSE3-NEXT: .LBB0_20: +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %r13b +; SSSE3-NEXT: movl %r13d, %eax +; SSSE3-NEXT: addb %cl, %al +; SSSE3-NEXT: setns %al +; SSSE3-NEXT: addb %cl, %r13b +; SSSE3-NEXT: movb %bl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSSE3-NEXT: movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSSE3-NEXT: jno .LBB0_22 +; SSSE3-NEXT: # %bb.21: +; SSSE3-NEXT: addb $127, %al +; SSSE3-NEXT: movl %eax, %r13d +; SSSE3-NEXT: .LBB0_22: +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; 
SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %dil +; SSSE3-NEXT: movl %edi, %eax +; SSSE3-NEXT: addb %cl, %al +; SSSE3-NEXT: setns %al +; SSSE3-NEXT: addb %cl, %dil +; SSSE3-NEXT: jno .LBB0_24 +; SSSE3-NEXT: # %bb.23: +; SSSE3-NEXT: addb $127, %al +; SSSE3-NEXT: movl %eax, %edi +; SSSE3-NEXT: .LBB0_24: +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %r8b +; SSSE3-NEXT: movl %r8d, %eax +; SSSE3-NEXT: addb %cl, %al +; SSSE3-NEXT: setns %al +; SSSE3-NEXT: addb %cl, %r8b +; SSSE3-NEXT: jno .LBB0_26 +; SSSE3-NEXT: # %bb.25: +; SSSE3-NEXT: addb $127, %al +; SSSE3-NEXT: movl %eax, %r8d +; SSSE3-NEXT: .LBB0_26: +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %bl +; SSSE3-NEXT: movl %ebx, %eax +; SSSE3-NEXT: addb %cl, %al +; SSSE3-NEXT: setns %al +; SSSE3-NEXT: addb %cl, %bl +; SSSE3-NEXT: jno .LBB0_28 +; SSSE3-NEXT: # %bb.27: +; SSSE3-NEXT: addb $127, %al +; SSSE3-NEXT: movl %eax, %ebx +; SSSE3-NEXT: .LBB0_28: +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %dl +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSSE3-NEXT: movl %eax, %ecx +; SSSE3-NEXT: addb %dl, %cl +; SSSE3-NEXT: setns %cl +; SSSE3-NEXT: addb %dl, %al +; SSSE3-NEXT: movb %sil, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSSE3-NEXT: jno .LBB0_30 +; SSSE3-NEXT: # %bb.29: +; SSSE3-NEXT: addb $127, %cl +; SSSE3-NEXT: movl %ecx, %eax +; SSSE3-NEXT: .LBB0_30: +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %dl +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %sil +; SSSE3-NEXT: movl %esi, %ecx +; SSSE3-NEXT: addb %dl, %cl +; SSSE3-NEXT: setns %cl +; SSSE3-NEXT: addb %dl, %sil +; SSSE3-NEXT: jno .LBB0_32 +; SSSE3-NEXT: # %bb.31: +; SSSE3-NEXT: addb $127, %cl +; SSSE3-NEXT: movl %ecx, %esi +; SSSE3-NEXT: .LBB0_32: +; SSSE3-NEXT: movzbl %sil, %ecx +; SSSE3-NEXT: movd %ecx, %xmm0 +; SSSE3-NEXT: movzbl %al, %eax +; SSSE3-NEXT: movd %eax, %xmm1 +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; SSSE3-NEXT: movzbl %bl, %eax +; SSSE3-NEXT: movd %eax, %xmm0 +; SSSE3-NEXT: movzbl %r8b, %eax +; SSSE3-NEXT: movd %eax, %xmm2 +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; SSSE3-NEXT: movzbl %dil, %eax +; SSSE3-NEXT: movd %eax, %xmm0 +; SSSE3-NEXT: movzbl %r13b, %eax +; SSSE3-NEXT: movd %eax, %xmm3 +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] +; SSSE3-NEXT: movzbl %r12b, %eax +; SSSE3-NEXT: movd %eax, %xmm0 +; SSSE3-NEXT: movzbl %r15b, %eax +; SSSE3-NEXT: movd %eax, %xmm1 +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3] +; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; SSSE3-NEXT: movzbl %r14b, %eax +; SSSE3-NEXT: movd %eax, %xmm0 +; SSSE3-NEXT: movzbl %bpl, %eax +; SSSE3-NEXT: movd %eax, %xmm2 +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] +; SSSE3-NEXT: movzbl %r11b, %eax +; SSSE3-NEXT: movd 
%eax, %xmm0 +; SSSE3-NEXT: movzbl %r10b, %eax +; SSSE3-NEXT: movd %eax, %xmm3 +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; SSSE3-NEXT: movd %eax, %xmm0 +; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; SSSE3-NEXT: movd %eax, %xmm2 +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] +; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; SSSE3-NEXT: movd %eax, %xmm4 +; SSSE3-NEXT: movzbl %r9b, %eax +; SSSE3-NEXT: movd %eax, %xmm0 +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7] +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] +; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSSE3-NEXT: popq %rbx +; SSSE3-NEXT: popq %r12 +; SSSE3-NEXT: popq %r13 +; SSSE3-NEXT: popq %r14 +; SSSE3-NEXT: popq %r15 +; SSSE3-NEXT: popq %rbp +; SSSE3-NEXT: retq +; +; SSE41-LABEL: v16i8: +; SSE41: # %bb.0: +; SSE41-NEXT: pextrb $15, %xmm1, %ecx +; SSE41-NEXT: pextrb $15, %xmm0, %edx +; SSE41-NEXT: movl %edx, %eax +; SSE41-NEXT: addb %cl, %al +; SSE41-NEXT: setns %al +; SSE41-NEXT: addb %cl, %dl +; SSE41-NEXT: jno .LBB0_2 +; SSE41-NEXT: # %bb.1: +; SSE41-NEXT: addb $127, %al +; SSE41-NEXT: movl %eax, %edx +; SSE41-NEXT: .LBB0_2: +; SSE41-NEXT: pextrb $14, %xmm1, %ecx +; SSE41-NEXT: pextrb $14, %xmm0, %r11d +; SSE41-NEXT: movl %r11d, %eax +; SSE41-NEXT: addb %cl, %al +; SSE41-NEXT: setns %al +; SSE41-NEXT: addb %cl, %r11b +; SSE41-NEXT: jno .LBB0_4 +; SSE41-NEXT: # %bb.3: +; SSE41-NEXT: addb $127, %al +; SSE41-NEXT: movl %eax, %r11d +; SSE41-NEXT: .LBB0_4: +; SSE41-NEXT: pextrb $13, %xmm1, %ecx +; SSE41-NEXT: pextrb $13, %xmm0, %edi +; SSE41-NEXT: movl %edi, %eax +; SSE41-NEXT: addb %cl, %al +; SSE41-NEXT: setns %al +; SSE41-NEXT: addb %cl, %dil +; SSE41-NEXT: jno .LBB0_6 +; SSE41-NEXT: # %bb.5: +; SSE41-NEXT: addb $127, %al +; SSE41-NEXT: movl %eax, %edi +; SSE41-NEXT: .LBB0_6: +; SSE41-NEXT: pushq %rbp +; SSE41-NEXT: pushq %r15 +; SSE41-NEXT: pushq %r14 +; SSE41-NEXT: pushq %r13 +; SSE41-NEXT: pushq %r12 +; SSE41-NEXT: pushq %rbx +; SSE41-NEXT: pextrb $12, %xmm1, %ecx +; SSE41-NEXT: pextrb $12, %xmm0, %r14d +; SSE41-NEXT: movl %r14d, %eax +; SSE41-NEXT: addb %cl, %al +; SSE41-NEXT: setns %al +; SSE41-NEXT: addb %cl, %r14b +; SSE41-NEXT: jno .LBB0_8 +; SSE41-NEXT: # %bb.7: +; SSE41-NEXT: addb $127, %al +; SSE41-NEXT: movl %eax, %r14d +; SSE41-NEXT: .LBB0_8: +; SSE41-NEXT: pextrb $11, %xmm1, %ecx +; SSE41-NEXT: pextrb $11, %xmm0, %ebp +; SSE41-NEXT: movl %ebp, %eax +; SSE41-NEXT: addb %cl, %al +; SSE41-NEXT: setns %al +; SSE41-NEXT: addb %cl, %bpl +; SSE41-NEXT: jno .LBB0_10 +; SSE41-NEXT: # %bb.9: +; SSE41-NEXT: addb $127, %al +; SSE41-NEXT: movl %eax, %ebp +; SSE41-NEXT: .LBB0_10: +; SSE41-NEXT: pextrb $10, %xmm1, %ecx +; SSE41-NEXT: pextrb $10, %xmm0, %r15d +; SSE41-NEXT: movl %r15d, %eax +; SSE41-NEXT: addb %cl, %al +; SSE41-NEXT: setns %al +; SSE41-NEXT: addb %cl, %r15b +; SSE41-NEXT: jno 
.LBB0_12 +; SSE41-NEXT: # %bb.11: +; SSE41-NEXT: addb $127, %al +; SSE41-NEXT: movl %eax, %r15d +; SSE41-NEXT: .LBB0_12: +; SSE41-NEXT: pextrb $9, %xmm1, %ecx +; SSE41-NEXT: pextrb $9, %xmm0, %r12d +; SSE41-NEXT: movl %r12d, %eax +; SSE41-NEXT: addb %cl, %al +; SSE41-NEXT: setns %al +; SSE41-NEXT: addb %cl, %r12b +; SSE41-NEXT: jno .LBB0_14 +; SSE41-NEXT: # %bb.13: +; SSE41-NEXT: addb $127, %al +; SSE41-NEXT: movl %eax, %r12d +; SSE41-NEXT: .LBB0_14: +; SSE41-NEXT: pextrb $8, %xmm1, %ecx +; SSE41-NEXT: pextrb $8, %xmm0, %r13d +; SSE41-NEXT: movl %r13d, %eax +; SSE41-NEXT: addb %cl, %al +; SSE41-NEXT: setns %al +; SSE41-NEXT: addb %cl, %r13b +; SSE41-NEXT: jno .LBB0_16 +; SSE41-NEXT: # %bb.15: +; SSE41-NEXT: addb $127, %al +; SSE41-NEXT: movl %eax, %r13d +; SSE41-NEXT: .LBB0_16: +; SSE41-NEXT: pextrb $7, %xmm1, %ecx +; SSE41-NEXT: pextrb $7, %xmm0, %r10d +; SSE41-NEXT: movl %r10d, %eax +; SSE41-NEXT: addb %cl, %al +; SSE41-NEXT: setns %al +; SSE41-NEXT: addb %cl, %r10b +; SSE41-NEXT: movl %ebp, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE41-NEXT: jno .LBB0_18 +; SSE41-NEXT: # %bb.17: +; SSE41-NEXT: addb $127, %al +; SSE41-NEXT: movl %eax, %r10d +; SSE41-NEXT: .LBB0_18: +; SSE41-NEXT: pextrb $6, %xmm1, %ecx +; SSE41-NEXT: pextrb $6, %xmm0, %r9d +; SSE41-NEXT: movl %r9d, %eax +; SSE41-NEXT: addb %cl, %al +; SSE41-NEXT: setns %al +; SSE41-NEXT: addb %cl, %r9b +; SSE41-NEXT: jno .LBB0_20 +; SSE41-NEXT: # %bb.19: +; SSE41-NEXT: addb $127, %al +; SSE41-NEXT: movl %eax, %r9d +; SSE41-NEXT: .LBB0_20: +; SSE41-NEXT: pextrb $5, %xmm1, %ecx +; SSE41-NEXT: pextrb $5, %xmm0, %ebp +; SSE41-NEXT: movl %ebp, %eax +; SSE41-NEXT: addb %cl, %al +; SSE41-NEXT: setns %al +; SSE41-NEXT: addb %cl, %bpl +; SSE41-NEXT: movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE41-NEXT: jno .LBB0_22 +; SSE41-NEXT: # %bb.21: +; SSE41-NEXT: addb $127, %al +; SSE41-NEXT: movl %eax, %ebp +; SSE41-NEXT: .LBB0_22: +; SSE41-NEXT: pextrb $4, %xmm1, %ecx +; SSE41-NEXT: pextrb $4, %xmm0, %edi +; SSE41-NEXT: movl %edi, %eax +; SSE41-NEXT: addb %cl, %al +; SSE41-NEXT: setns %al +; SSE41-NEXT: addb %cl, %dil +; SSE41-NEXT: movl %edx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE41-NEXT: jno .LBB0_24 +; SSE41-NEXT: # %bb.23: +; SSE41-NEXT: addb $127, %al +; SSE41-NEXT: movl %eax, %edi +; SSE41-NEXT: .LBB0_24: +; SSE41-NEXT: pextrb $3, %xmm1, %edx +; SSE41-NEXT: pextrb $3, %xmm0, %eax +; SSE41-NEXT: movl %eax, %ecx +; SSE41-NEXT: addb %dl, %cl +; SSE41-NEXT: setns %cl +; SSE41-NEXT: addb %dl, %al +; SSE41-NEXT: jno .LBB0_26 +; SSE41-NEXT: # %bb.25: +; SSE41-NEXT: addb $127, %cl +; SSE41-NEXT: movl %ecx, %eax +; SSE41-NEXT: .LBB0_26: +; SSE41-NEXT: pextrb $2, %xmm1, %ebx +; SSE41-NEXT: pextrb $2, %xmm0, %ecx +; SSE41-NEXT: movl %ecx, %edx +; SSE41-NEXT: addb %bl, %dl +; SSE41-NEXT: setns %dl +; SSE41-NEXT: addb %bl, %cl +; SSE41-NEXT: jno .LBB0_28 +; SSE41-NEXT: # %bb.27: +; SSE41-NEXT: addb $127, %dl +; SSE41-NEXT: movl %edx, %ecx +; SSE41-NEXT: .LBB0_28: +; SSE41-NEXT: pextrb $0, %xmm1, %esi +; SSE41-NEXT: pextrb $0, %xmm0, %edx +; SSE41-NEXT: movl %edx, %ebx +; SSE41-NEXT: addb %sil, %bl +; SSE41-NEXT: setns %bl +; SSE41-NEXT: addb %sil, %dl +; SSE41-NEXT: jno .LBB0_30 +; SSE41-NEXT: # %bb.29: +; SSE41-NEXT: addb $127, %bl +; SSE41-NEXT: movl %ebx, %edx +; SSE41-NEXT: .LBB0_30: +; SSE41-NEXT: pextrb $1, %xmm1, %esi +; SSE41-NEXT: pextrb $1, %xmm0, %r8d +; SSE41-NEXT: movl %r8d, %ebx +; SSE41-NEXT: addb %sil, %bl +; SSE41-NEXT: setns %bl +; SSE41-NEXT: addb %sil, %r8b +; SSE41-NEXT: jno .LBB0_32 +; SSE41-NEXT: # %bb.31: +; 
SSE41-NEXT: addb $127, %bl +; SSE41-NEXT: movl %ebx, %r8d +; SSE41-NEXT: .LBB0_32: +; SSE41-NEXT: movzbl %dl, %edx +; SSE41-NEXT: movd %edx, %xmm0 +; SSE41-NEXT: movzbl %r8b, %edx +; SSE41-NEXT: pinsrb $1, %edx, %xmm0 +; SSE41-NEXT: movzbl %cl, %ecx +; SSE41-NEXT: pinsrb $2, %ecx, %xmm0 +; SSE41-NEXT: movzbl %al, %eax +; SSE41-NEXT: pinsrb $3, %eax, %xmm0 +; SSE41-NEXT: movzbl %dil, %eax +; SSE41-NEXT: pinsrb $4, %eax, %xmm0 +; SSE41-NEXT: movzbl %bpl, %eax +; SSE41-NEXT: pinsrb $5, %eax, %xmm0 +; SSE41-NEXT: movzbl %r9b, %eax +; SSE41-NEXT: pinsrb $6, %eax, %xmm0 +; SSE41-NEXT: movzbl %r10b, %eax +; SSE41-NEXT: pinsrb $7, %eax, %xmm0 +; SSE41-NEXT: movzbl %r13b, %eax +; SSE41-NEXT: pinsrb $8, %eax, %xmm0 +; SSE41-NEXT: movzbl %r12b, %eax +; SSE41-NEXT: pinsrb $9, %eax, %xmm0 +; SSE41-NEXT: movzbl %r15b, %eax +; SSE41-NEXT: pinsrb $10, %eax, %xmm0 +; SSE41-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; SSE41-NEXT: pinsrb $11, %eax, %xmm0 +; SSE41-NEXT: movzbl %r14b, %eax +; SSE41-NEXT: pinsrb $12, %eax, %xmm0 +; SSE41-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; SSE41-NEXT: pinsrb $13, %eax, %xmm0 +; SSE41-NEXT: movzbl %r11b, %eax +; SSE41-NEXT: pinsrb $14, %eax, %xmm0 +; SSE41-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; SSE41-NEXT: pinsrb $15, %eax, %xmm0 +; SSE41-NEXT: popq %rbx +; SSE41-NEXT: popq %r12 +; SSE41-NEXT: popq %r13 +; SSE41-NEXT: popq %r14 +; SSE41-NEXT: popq %r15 +; SSE41-NEXT: popq %rbp +; SSE41-NEXT: retq +; +; AVX-LABEL: v16i8: +; AVX: # %bb.0: +; AVX-NEXT: vpextrb $15, %xmm1, %ecx +; AVX-NEXT: vpextrb $15, %xmm0, %edx +; AVX-NEXT: movl %edx, %eax +; AVX-NEXT: addb %cl, %al +; AVX-NEXT: setns %al +; AVX-NEXT: addb %cl, %dl +; AVX-NEXT: jno .LBB0_2 +; AVX-NEXT: # %bb.1: +; AVX-NEXT: addb $127, %al +; AVX-NEXT: movl %eax, %edx +; AVX-NEXT: .LBB0_2: +; AVX-NEXT: vpextrb $14, %xmm1, %ecx +; AVX-NEXT: vpextrb $14, %xmm0, %r11d +; AVX-NEXT: movl %r11d, %eax +; AVX-NEXT: addb %cl, %al +; AVX-NEXT: setns %al +; AVX-NEXT: addb %cl, %r11b +; AVX-NEXT: jno .LBB0_4 +; AVX-NEXT: # %bb.3: +; AVX-NEXT: addb $127, %al +; AVX-NEXT: movl %eax, %r11d +; AVX-NEXT: .LBB0_4: +; AVX-NEXT: vpextrb $13, %xmm1, %ecx +; AVX-NEXT: vpextrb $13, %xmm0, %edi +; AVX-NEXT: movl %edi, %eax +; AVX-NEXT: addb %cl, %al +; AVX-NEXT: setns %al +; AVX-NEXT: addb %cl, %dil +; AVX-NEXT: jno .LBB0_6 +; AVX-NEXT: # %bb.5: +; AVX-NEXT: addb $127, %al +; AVX-NEXT: movl %eax, %edi +; AVX-NEXT: .LBB0_6: +; AVX-NEXT: pushq %rbp +; AVX-NEXT: pushq %r15 +; AVX-NEXT: pushq %r14 +; AVX-NEXT: pushq %r13 +; AVX-NEXT: pushq %r12 +; AVX-NEXT: pushq %rbx +; AVX-NEXT: vpextrb $12, %xmm1, %ecx +; AVX-NEXT: vpextrb $12, %xmm0, %r14d +; AVX-NEXT: movl %r14d, %eax +; AVX-NEXT: addb %cl, %al +; AVX-NEXT: setns %al +; AVX-NEXT: addb %cl, %r14b +; AVX-NEXT: jno .LBB0_8 +; AVX-NEXT: # %bb.7: +; AVX-NEXT: addb $127, %al +; AVX-NEXT: movl %eax, %r14d +; AVX-NEXT: .LBB0_8: +; AVX-NEXT: vpextrb $11, %xmm1, %ecx +; AVX-NEXT: vpextrb $11, %xmm0, %ebp +; AVX-NEXT: movl %ebp, %eax +; AVX-NEXT: addb %cl, %al +; AVX-NEXT: setns %al +; AVX-NEXT: addb %cl, %bpl +; AVX-NEXT: jno .LBB0_10 +; AVX-NEXT: # %bb.9: +; AVX-NEXT: addb $127, %al +; AVX-NEXT: movl %eax, %ebp +; AVX-NEXT: .LBB0_10: +; AVX-NEXT: vpextrb $10, %xmm1, %ecx +; AVX-NEXT: vpextrb $10, %xmm0, %r15d +; AVX-NEXT: movl %r15d, %eax +; AVX-NEXT: addb %cl, %al +; AVX-NEXT: setns %al +; AVX-NEXT: addb %cl, %r15b +; AVX-NEXT: jno .LBB0_12 +; AVX-NEXT: # %bb.11: +; AVX-NEXT: addb $127, %al +; AVX-NEXT: movl %eax, %r15d 
+; AVX-NEXT: .LBB0_12: +; AVX-NEXT: vpextrb $9, %xmm1, %ecx +; AVX-NEXT: vpextrb $9, %xmm0, %r12d +; AVX-NEXT: movl %r12d, %eax +; AVX-NEXT: addb %cl, %al +; AVX-NEXT: setns %al +; AVX-NEXT: addb %cl, %r12b +; AVX-NEXT: jno .LBB0_14 +; AVX-NEXT: # %bb.13: +; AVX-NEXT: addb $127, %al +; AVX-NEXT: movl %eax, %r12d +; AVX-NEXT: .LBB0_14: +; AVX-NEXT: vpextrb $8, %xmm1, %ecx +; AVX-NEXT: vpextrb $8, %xmm0, %r13d +; AVX-NEXT: movl %r13d, %eax +; AVX-NEXT: addb %cl, %al +; AVX-NEXT: setns %al +; AVX-NEXT: addb %cl, %r13b +; AVX-NEXT: jno .LBB0_16 +; AVX-NEXT: # %bb.15: +; AVX-NEXT: addb $127, %al +; AVX-NEXT: movl %eax, %r13d +; AVX-NEXT: .LBB0_16: +; AVX-NEXT: vpextrb $7, %xmm1, %ecx +; AVX-NEXT: vpextrb $7, %xmm0, %r10d +; AVX-NEXT: movl %r10d, %eax +; AVX-NEXT: addb %cl, %al +; AVX-NEXT: setns %al +; AVX-NEXT: addb %cl, %r10b +; AVX-NEXT: movl %ebp, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX-NEXT: jno .LBB0_18 +; AVX-NEXT: # %bb.17: +; AVX-NEXT: addb $127, %al +; AVX-NEXT: movl %eax, %r10d +; AVX-NEXT: .LBB0_18: +; AVX-NEXT: vpextrb $6, %xmm1, %ecx +; AVX-NEXT: vpextrb $6, %xmm0, %r9d +; AVX-NEXT: movl %r9d, %eax +; AVX-NEXT: addb %cl, %al +; AVX-NEXT: setns %al +; AVX-NEXT: addb %cl, %r9b +; AVX-NEXT: jno .LBB0_20 +; AVX-NEXT: # %bb.19: +; AVX-NEXT: addb $127, %al +; AVX-NEXT: movl %eax, %r9d +; AVX-NEXT: .LBB0_20: +; AVX-NEXT: vpextrb $5, %xmm1, %ecx +; AVX-NEXT: vpextrb $5, %xmm0, %ebp +; AVX-NEXT: movl %ebp, %eax +; AVX-NEXT: addb %cl, %al +; AVX-NEXT: setns %al +; AVX-NEXT: addb %cl, %bpl +; AVX-NEXT: movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX-NEXT: jno .LBB0_22 +; AVX-NEXT: # %bb.21: +; AVX-NEXT: addb $127, %al +; AVX-NEXT: movl %eax, %ebp +; AVX-NEXT: .LBB0_22: +; AVX-NEXT: vpextrb $4, %xmm1, %ecx +; AVX-NEXT: vpextrb $4, %xmm0, %edi +; AVX-NEXT: movl %edi, %eax +; AVX-NEXT: addb %cl, %al +; AVX-NEXT: setns %al +; AVX-NEXT: addb %cl, %dil +; AVX-NEXT: movl %edx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX-NEXT: jno .LBB0_24 +; AVX-NEXT: # %bb.23: +; AVX-NEXT: addb $127, %al +; AVX-NEXT: movl %eax, %edi +; AVX-NEXT: .LBB0_24: +; AVX-NEXT: vpextrb $3, %xmm1, %edx +; AVX-NEXT: vpextrb $3, %xmm0, %eax +; AVX-NEXT: movl %eax, %ecx +; AVX-NEXT: addb %dl, %cl +; AVX-NEXT: setns %cl +; AVX-NEXT: addb %dl, %al +; AVX-NEXT: jno .LBB0_26 +; AVX-NEXT: # %bb.25: +; AVX-NEXT: addb $127, %cl +; AVX-NEXT: movl %ecx, %eax +; AVX-NEXT: .LBB0_26: +; AVX-NEXT: vpextrb $2, %xmm1, %ebx +; AVX-NEXT: vpextrb $2, %xmm0, %ecx +; AVX-NEXT: movl %ecx, %edx +; AVX-NEXT: addb %bl, %dl +; AVX-NEXT: setns %dl +; AVX-NEXT: addb %bl, %cl +; AVX-NEXT: jno .LBB0_28 +; AVX-NEXT: # %bb.27: +; AVX-NEXT: addb $127, %dl +; AVX-NEXT: movl %edx, %ecx +; AVX-NEXT: .LBB0_28: +; AVX-NEXT: vpextrb $0, %xmm1, %esi +; AVX-NEXT: vpextrb $0, %xmm0, %edx +; AVX-NEXT: movl %edx, %ebx +; AVX-NEXT: addb %sil, %bl +; AVX-NEXT: setns %bl +; AVX-NEXT: addb %sil, %dl +; AVX-NEXT: jno .LBB0_30 +; AVX-NEXT: # %bb.29: +; AVX-NEXT: addb $127, %bl +; AVX-NEXT: movl %ebx, %edx +; AVX-NEXT: .LBB0_30: +; AVX-NEXT: vpextrb $1, %xmm1, %esi +; AVX-NEXT: vpextrb $1, %xmm0, %r8d +; AVX-NEXT: movl %r8d, %ebx +; AVX-NEXT: addb %sil, %bl +; AVX-NEXT: setns %bl +; AVX-NEXT: addb %sil, %r8b +; AVX-NEXT: jno .LBB0_32 +; AVX-NEXT: # %bb.31: +; AVX-NEXT: addb $127, %bl +; AVX-NEXT: movl %ebx, %r8d +; AVX-NEXT: .LBB0_32: +; AVX-NEXT: movzbl %dl, %edx +; AVX-NEXT: vmovd %edx, %xmm0 +; AVX-NEXT: movzbl %r8b, %edx +; AVX-NEXT: vpinsrb $1, %edx, %xmm0, %xmm0 +; AVX-NEXT: movzbl %cl, %ecx +; AVX-NEXT: vpinsrb $2, %ecx, %xmm0, %xmm0 +; AVX-NEXT: 
movzbl %al, %eax +; AVX-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0 +; AVX-NEXT: movzbl %dil, %eax +; AVX-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 +; AVX-NEXT: movzbl %bpl, %eax +; AVX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; AVX-NEXT: movzbl %r9b, %eax +; AVX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 +; AVX-NEXT: movzbl %r10b, %eax +; AVX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; AVX-NEXT: movzbl %r13b, %eax +; AVX-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0 +; AVX-NEXT: movzbl %r12b, %eax +; AVX-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0 +; AVX-NEXT: movzbl %r15b, %eax +; AVX-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0 +; AVX-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; AVX-NEXT: movzbl %r14b, %eax +; AVX-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0 +; AVX-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0 +; AVX-NEXT: movzbl %r11b, %eax +; AVX-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0 +; AVX-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; AVX-NEXT: popq %rbx +; AVX-NEXT: popq %r12 +; AVX-NEXT: popq %r13 +; AVX-NEXT: popq %r14 +; AVX-NEXT: popq %r15 +; AVX-NEXT: popq %rbp +; AVX-NEXT: retq + %z = call <16 x i8> @llvm.sadd.sat.v16i8(<16 x i8> %x, <16 x i8> %y) + ret <16 x i8> %z +} + +define <32 x i8> @v32i8(<32 x i8> %x, <32 x i8> %y) nounwind { +; SSE2-LABEL: v32i8: +; SSE2: # %bb.0: +; SSE2-NEXT: pushq %rbp +; SSE2-NEXT: pushq %r15 +; SSE2-NEXT: pushq %r14 +; SSE2-NEXT: pushq %r13 +; SSE2-NEXT: pushq %r12 +; SSE2-NEXT: pushq %rbx +; SSE2-NEXT: pushq %rax +; SSE2-NEXT: movaps %xmm3, -{{[0-9]+}}(%rsp) +; SSE2-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %r8b +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %r11b +; SSE2-NEXT: movl %r8d, %eax +; SSE2-NEXT: addb %cl, %al +; SSE2-NEXT: setns %al +; SSE2-NEXT: addb %cl, %r8b +; SSE2-NEXT: jno .LBB1_2 +; SSE2-NEXT: # %bb.1: +; SSE2-NEXT: addb $127, %al +; SSE2-NEXT: movl %eax, %r8d +; SSE2-NEXT: .LBB1_2: +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSE2-NEXT: movl %r11d, %eax +; SSE2-NEXT: addb %cl, %al +; SSE2-NEXT: setns %al +; SSE2-NEXT: addb %cl, %r11b +; SSE2-NEXT: jno .LBB1_4 +; SSE2-NEXT: # %bb.3: +; SSE2-NEXT: addb $127, %al +; SSE2-NEXT: movl %eax, %r11d +; SSE2-NEXT: .LBB1_4: +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %bl +; SSE2-NEXT: movl %ebx, %eax +; SSE2-NEXT: addb %cl, %al +; SSE2-NEXT: setns %al +; SSE2-NEXT: addb %cl, %bl +; SSE2-NEXT: jno .LBB1_6 +; SSE2-NEXT: # %bb.5: +; SSE2-NEXT: addb $127, %al +; SSE2-NEXT: movl %eax, %ebx +; SSE2-NEXT: .LBB1_6: +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %sil +; SSE2-NEXT: movl %esi, %eax +; SSE2-NEXT: addb %cl, %al +; SSE2-NEXT: setns %al +; SSE2-NEXT: addb %cl, %sil +; SSE2-NEXT: jno .LBB1_8 +; SSE2-NEXT: # %bb.7: +; SSE2-NEXT: addb $127, %al +; SSE2-NEXT: movl %eax, %esi +; SSE2-NEXT: .LBB1_8: +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %dl +; SSE2-NEXT: movl %edx, %eax +; SSE2-NEXT: addb %cl, %al +; SSE2-NEXT: setns %al +; SSE2-NEXT: addb %cl, %dl +; SSE2-NEXT: jo .LBB1_9 +; SSE2-NEXT: # %bb.10: +; SSE2-NEXT: movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSE2-NEXT: jmp .LBB1_11 +; SSE2-NEXT: .LBB1_9: +; SSE2-NEXT: addb $127, %al +; SSE2-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSE2-NEXT: .LBB1_11: +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; 
SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %dil +; SSE2-NEXT: movl %edi, %eax +; SSE2-NEXT: addb %cl, %al +; SSE2-NEXT: setns %al +; SSE2-NEXT: addb %cl, %dil +; SSE2-NEXT: movb %r8b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSE2-NEXT: movb %bl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSE2-NEXT: jno .LBB1_13 +; SSE2-NEXT: # %bb.12: +; SSE2-NEXT: addb $127, %al +; SSE2-NEXT: movl %eax, %edi +; SSE2-NEXT: .LBB1_13: +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %r9b +; SSE2-NEXT: movl %r9d, %eax +; SSE2-NEXT: addb %cl, %al +; SSE2-NEXT: setns %al +; SSE2-NEXT: addb %cl, %r9b +; SSE2-NEXT: jno .LBB1_15 +; SSE2-NEXT: # %bb.14: +; SSE2-NEXT: addb $127, %al +; SSE2-NEXT: movl %eax, %r9d +; SSE2-NEXT: .LBB1_15: +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %r10b +; SSE2-NEXT: movl %r10d, %eax +; SSE2-NEXT: addb %cl, %al +; SSE2-NEXT: setns %al +; SSE2-NEXT: addb %cl, %r10b +; SSE2-NEXT: movb %r11b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSE2-NEXT: jno .LBB1_17 +; SSE2-NEXT: # %bb.16: +; SSE2-NEXT: addb $127, %al +; SSE2-NEXT: movl %eax, %r10d +; SSE2-NEXT: .LBB1_17: +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %bpl +; SSE2-NEXT: movl %ebp, %eax +; SSE2-NEXT: addb %cl, %al +; SSE2-NEXT: setns %al +; SSE2-NEXT: addb %cl, %bpl +; SSE2-NEXT: jno .LBB1_19 +; SSE2-NEXT: # %bb.18: +; SSE2-NEXT: addb $127, %al +; SSE2-NEXT: movl %eax, %ebp +; SSE2-NEXT: .LBB1_19: +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %r14b +; SSE2-NEXT: movl %r14d, %eax +; SSE2-NEXT: addb %cl, %al +; SSE2-NEXT: setns %al +; SSE2-NEXT: addb %cl, %r14b +; SSE2-NEXT: jno .LBB1_21 +; SSE2-NEXT: # %bb.20: +; SSE2-NEXT: addb $127, %al +; SSE2-NEXT: movl %eax, %r14d +; SSE2-NEXT: .LBB1_21: +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %r11b +; SSE2-NEXT: movl %r11d, %eax +; SSE2-NEXT: addb %cl, %al +; SSE2-NEXT: setns %al +; SSE2-NEXT: addb %cl, %r11b +; SSE2-NEXT: jno .LBB1_23 +; SSE2-NEXT: # %bb.22: +; SSE2-NEXT: addb $127, %al +; SSE2-NEXT: movl %eax, %r11d +; SSE2-NEXT: .LBB1_23: +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %r8b +; SSE2-NEXT: movl %r8d, %eax +; SSE2-NEXT: addb %cl, %al +; SSE2-NEXT: setns %al +; SSE2-NEXT: addb %cl, %r8b +; SSE2-NEXT: jno .LBB1_25 +; SSE2-NEXT: # %bb.24: +; SSE2-NEXT: addb $127, %al +; SSE2-NEXT: movl %eax, %r8d +; SSE2-NEXT: .LBB1_25: +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %r15b +; SSE2-NEXT: movl %r15d, %eax +; SSE2-NEXT: addb %cl, %al +; SSE2-NEXT: setns %al +; SSE2-NEXT: addb %cl, %r15b +; SSE2-NEXT: jno .LBB1_27 +; SSE2-NEXT: # %bb.26: +; SSE2-NEXT: addb $127, %al +; SSE2-NEXT: movl %eax, %r15d +; SSE2-NEXT: .LBB1_27: +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %r12b +; SSE2-NEXT: movl %r12d, %eax +; SSE2-NEXT: addb %cl, %al +; SSE2-NEXT: setns %al +; SSE2-NEXT: addb %cl, %r12b +; SSE2-NEXT: jno .LBB1_29 +; SSE2-NEXT: # %bb.28: +; SSE2-NEXT: addb $127, %al +; SSE2-NEXT: movl %eax, %r12d +; SSE2-NEXT: .LBB1_29: +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %r13b +; SSE2-NEXT: movl %r13d, %eax +; SSE2-NEXT: addb %cl, %al +; SSE2-NEXT: setns %al +; SSE2-NEXT: addb %cl, %r13b +; SSE2-NEXT: jno .LBB1_31 +; SSE2-NEXT: # %bb.30: +; SSE2-NEXT: addb $127, %al +; SSE2-NEXT: movl %eax, %r13d +; SSE2-NEXT: .LBB1_31: +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), 
%dl +; SSE2-NEXT: movl %edx, %eax +; SSE2-NEXT: addb %cl, %al +; SSE2-NEXT: setns %al +; SSE2-NEXT: addb %cl, %dl +; SSE2-NEXT: movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSE2-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) +; SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; SSE2-NEXT: movb %sil, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSE2-NEXT: jno .LBB1_33 +; SSE2-NEXT: # %bb.32: +; SSE2-NEXT: addb $127, %al +; SSE2-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSE2-NEXT: .LBB1_33: +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %sil +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %dl +; SSE2-NEXT: movl %esi, %eax +; SSE2-NEXT: addb %cl, %al +; SSE2-NEXT: setns %al +; SSE2-NEXT: addb %cl, %sil +; SSE2-NEXT: jno .LBB1_35 +; SSE2-NEXT: # %bb.34: +; SSE2-NEXT: addb $127, %al +; SSE2-NEXT: movl %eax, %esi +; SSE2-NEXT: .LBB1_35: +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSE2-NEXT: movl %edx, %eax +; SSE2-NEXT: addb %cl, %al +; SSE2-NEXT: setns %al +; SSE2-NEXT: addb %cl, %dl +; SSE2-NEXT: movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSE2-NEXT: jno .LBB1_37 +; SSE2-NEXT: # %bb.36: +; SSE2-NEXT: addb $127, %al +; SSE2-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSE2-NEXT: .LBB1_37: +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %dl +; SSE2-NEXT: movl %edx, %eax +; SSE2-NEXT: addb %cl, %al +; SSE2-NEXT: setns %al +; SSE2-NEXT: addb %cl, %dl +; SSE2-NEXT: jno .LBB1_39 +; SSE2-NEXT: # %bb.38: +; SSE2-NEXT: addb $127, %al +; SSE2-NEXT: movl %eax, %edx +; SSE2-NEXT: .LBB1_39: +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %bl +; SSE2-NEXT: movl %ebx, %eax +; SSE2-NEXT: addb %cl, %al +; SSE2-NEXT: setns %al +; SSE2-NEXT: addb %cl, %bl +; SSE2-NEXT: movb %bl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSE2-NEXT: jno .LBB1_41 +; SSE2-NEXT: # %bb.40: +; SSE2-NEXT: addb $127, %al +; SSE2-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSE2-NEXT: .LBB1_41: +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %bl +; SSE2-NEXT: movl %ebx, %eax +; SSE2-NEXT: addb %cl, %al +; SSE2-NEXT: setns %al +; SSE2-NEXT: addb %cl, %bl +; SSE2-NEXT: movb %bpl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSE2-NEXT: movb %r14b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSE2-NEXT: movb %r11b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSE2-NEXT: jno .LBB1_43 +; SSE2-NEXT: # %bb.42: +; SSE2-NEXT: addb $127, %al +; SSE2-NEXT: movl %eax, %ebx +; SSE2-NEXT: .LBB1_43: +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %bpl +; SSE2-NEXT: movl %ebp, %eax +; SSE2-NEXT: addb %cl, %al +; SSE2-NEXT: setns %al +; SSE2-NEXT: addb %cl, %bpl +; SSE2-NEXT: movb %r8b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSE2-NEXT: jno .LBB1_45 +; SSE2-NEXT: # %bb.44: +; SSE2-NEXT: addb $127, %al +; SSE2-NEXT: movl %eax, %ebp +; SSE2-NEXT: .LBB1_45: +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %r14b +; SSE2-NEXT: movl %r14d, %eax +; SSE2-NEXT: addb %cl, %al +; SSE2-NEXT: setns %al +; SSE2-NEXT: addb %cl, %r14b +; SSE2-NEXT: movb %r15b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSE2-NEXT: jno .LBB1_47 +; SSE2-NEXT: # %bb.46: +; SSE2-NEXT: addb $127, %al +; SSE2-NEXT: movl %eax, %r14d +; SSE2-NEXT: .LBB1_47: +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %r15b +; SSE2-NEXT: movl %r15d, %eax +; SSE2-NEXT: addb %cl, %al +; SSE2-NEXT: setns %al +; SSE2-NEXT: addb %cl, %r15b +; SSE2-NEXT: movb 
%r12b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSE2-NEXT: jno .LBB1_49 +; SSE2-NEXT: # %bb.48: +; SSE2-NEXT: addb $127, %al +; SSE2-NEXT: movl %eax, %r15d +; SSE2-NEXT: .LBB1_49: +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %r12b +; SSE2-NEXT: movl %r12d, %eax +; SSE2-NEXT: addb %cl, %al +; SSE2-NEXT: setns %al +; SSE2-NEXT: addb %cl, %r12b +; SSE2-NEXT: movb %r13b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSE2-NEXT: jno .LBB1_51 +; SSE2-NEXT: # %bb.50: +; SSE2-NEXT: addb $127, %al +; SSE2-NEXT: movl %eax, %r12d +; SSE2-NEXT: .LBB1_51: +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %r13b +; SSE2-NEXT: movl %r13d, %eax +; SSE2-NEXT: addb %cl, %al +; SSE2-NEXT: setns %al +; SSE2-NEXT: addb %cl, %r13b +; SSE2-NEXT: movb %dil, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSE2-NEXT: jno .LBB1_53 +; SSE2-NEXT: # %bb.52: +; SSE2-NEXT: addb $127, %al +; SSE2-NEXT: movl %eax, %r13d +; SSE2-NEXT: .LBB1_53: +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %r8b +; SSE2-NEXT: movl %r8d, %eax +; SSE2-NEXT: addb %cl, %al +; SSE2-NEXT: setns %al +; SSE2-NEXT: addb %cl, %r8b +; SSE2-NEXT: movb %r9b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSE2-NEXT: jno .LBB1_55 +; SSE2-NEXT: # %bb.54: +; SSE2-NEXT: addb $127, %al +; SSE2-NEXT: movl %eax, %r8d +; SSE2-NEXT: .LBB1_55: +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %r9b +; SSE2-NEXT: movl %r9d, %eax +; SSE2-NEXT: addb %cl, %al +; SSE2-NEXT: setns %al +; SSE2-NEXT: addb %cl, %r9b +; SSE2-NEXT: movb %r10b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSE2-NEXT: jno .LBB1_57 +; SSE2-NEXT: # %bb.56: +; SSE2-NEXT: addb $127, %al +; SSE2-NEXT: movl %eax, %r9d +; SSE2-NEXT: .LBB1_57: +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %r10b +; SSE2-NEXT: movl %r10d, %eax +; SSE2-NEXT: addb %cl, %al +; SSE2-NEXT: setns %al +; SSE2-NEXT: addb %cl, %r10b +; SSE2-NEXT: movb %sil, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSE2-NEXT: movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSE2-NEXT: jno .LBB1_59 +; SSE2-NEXT: # %bb.58: +; SSE2-NEXT: addb $127, %al +; SSE2-NEXT: movl %eax, %r10d +; SSE2-NEXT: .LBB1_59: +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %dl +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %r11b +; SSE2-NEXT: movl %r11d, %ecx +; SSE2-NEXT: addb %dl, %cl +; SSE2-NEXT: setns %cl +; SSE2-NEXT: addb %dl, %r11b +; SSE2-NEXT: movb %bl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSE2-NEXT: jno .LBB1_61 +; SSE2-NEXT: # %bb.60: +; SSE2-NEXT: addb $127, %cl +; SSE2-NEXT: movl %ecx, %r11d +; SSE2-NEXT: .LBB1_61: +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %bl +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSE2-NEXT: movl %ecx, %edx +; SSE2-NEXT: addb %bl, %dl +; SSE2-NEXT: setns %dl +; SSE2-NEXT: addb %bl, %cl +; SSE2-NEXT: jno .LBB1_63 +; SSE2-NEXT: # %bb.62: +; SSE2-NEXT: addb $127, %dl +; SSE2-NEXT: movl %edx, %ecx +; SSE2-NEXT: .LBB1_63: +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %bl +; SSE2-NEXT: movl %ebx, %edx +; SSE2-NEXT: addb %al, %dl +; SSE2-NEXT: setns %dl +; SSE2-NEXT: addb %al, %bl +; SSE2-NEXT: jno .LBB1_65 +; SSE2-NEXT: # %bb.64: +; SSE2-NEXT: addb $127, %dl +; SSE2-NEXT: movl %edx, %ebx +; SSE2-NEXT: .LBB1_65: +; SSE2-NEXT: movzbl %bl, %esi +; SSE2-NEXT: movzbl %cl, %edi +; SSE2-NEXT: movzbl %r11b, %r11d +; SSE2-NEXT: movzbl %r10b, %r10d +; SSE2-NEXT: movzbl %r9b, %r9d +; SSE2-NEXT: movzbl %r8b, %r8d +; SSE2-NEXT: movzbl %r13b, %r13d +; SSE2-NEXT: movzbl %r12b, %eax +; 
SSE2-NEXT: movzbl %r15b, %ebx +; SSE2-NEXT: movzbl %r14b, %edx +; SSE2-NEXT: movzbl %bpl, %ebp +; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload +; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r12d # 1-byte Folded Reload +; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r15d # 1-byte Folded Reload +; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r14d # 1-byte Folded Reload +; SSE2-NEXT: movd %esi, %xmm12 +; SSE2-NEXT: movd %edi, %xmm6 +; SSE2-NEXT: movd %r11d, %xmm11 +; SSE2-NEXT: movd %r10d, %xmm2 +; SSE2-NEXT: movd %r9d, %xmm10 +; SSE2-NEXT: movd %r8d, %xmm5 +; SSE2-NEXT: movd %r13d, %xmm9 +; SSE2-NEXT: movd %eax, %xmm1 +; SSE2-NEXT: movd %ebx, %xmm8 +; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r13d # 1-byte Folded Reload +; SSE2-NEXT: movd %edx, %xmm14 +; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r11d # 1-byte Folded Reload +; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r8d # 1-byte Folded Reload +; SSE2-NEXT: movd %ebp, %xmm13 +; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r9d # 1-byte Folded Reload +; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r10d # 1-byte Folded Reload +; SSE2-NEXT: movd %ecx, %xmm7 +; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ebx # 1-byte Folded Reload +; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; SSE2-NEXT: movd %r12d, %xmm0 +; SSE2-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r12d # 1-byte Folded Reload +; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 1-byte Folded Reload +; SSE2-NEXT: movd %r15d, %xmm4 +; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r15d # 1-byte Folded Reload +; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ebp # 1-byte Folded Reload +; SSE2-NEXT: movd %r14d, %xmm15 +; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r14d # 1-byte Folded Reload +; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %edx # 1-byte Folded Reload +; SSE2-NEXT: movd %r13d, %xmm0 +; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r13d # 1-byte Folded Reload +; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 1-byte Folded Reload +; SSE2-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm12[0],xmm6[1],xmm12[1],xmm6[2],xmm12[2],xmm6[3],xmm12[3],xmm6[4],xmm12[4],xmm6[5],xmm12[5],xmm6[6],xmm12[6],xmm6[7],xmm12[7] +; SSE2-NEXT: movd %r11d, %xmm3 +; SSE2-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r11d # 1-byte Folded Reload +; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload +; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm11[0],xmm2[1],xmm11[1],xmm2[2],xmm11[2],xmm2[3],xmm11[3],xmm2[4],xmm11[4],xmm2[5],xmm11[5],xmm2[6],xmm11[6],xmm2[7],xmm11[7] +; SSE2-NEXT: movd %r8d, %xmm11 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1],xmm2[2],xmm6[2],xmm2[3],xmm6[3] +; SSE2-NEXT: movd %r9d, %xmm12 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm10[0],xmm5[1],xmm10[1],xmm5[2],xmm10[2],xmm5[3],xmm10[3],xmm5[4],xmm10[4],xmm5[5],xmm10[5],xmm5[6],xmm10[6],xmm5[7],xmm10[7] +; SSE2-NEXT: movd %r10d, %xmm10 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm9[0],xmm1[1],xmm9[1],xmm1[2],xmm9[2],xmm1[3],xmm9[3],xmm1[4],xmm9[4],xmm1[5],xmm9[5],xmm1[6],xmm9[6],xmm1[7],xmm9[7] +; SSE2-NEXT: movd %ebx, %xmm9 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1],xmm1[2],xmm5[2],xmm1[3],xmm5[3] +; SSE2-NEXT: movd %eax, %xmm5 +; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; SSE2-NEXT: movd %r12d, %xmm6 +; SSE2-NEXT: punpcklbw 
{{.*#+}} xmm14 = xmm14[0],xmm8[0],xmm14[1],xmm8[1],xmm14[2],xmm8[2],xmm14[3],xmm8[3],xmm14[4],xmm8[4],xmm14[5],xmm8[5],xmm14[6],xmm8[6],xmm14[7],xmm8[7] +; SSE2-NEXT: movd %esi, %xmm8 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm13[0],xmm7[1],xmm13[1],xmm7[2],xmm13[2],xmm7[3],xmm13[3],xmm7[4],xmm13[4],xmm7[5],xmm13[5],xmm7[6],xmm13[6],xmm7[7],xmm13[7] +; SSE2-NEXT: movd %r15d, %xmm13 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm14[0],xmm7[1],xmm14[1],xmm7[2],xmm14[2],xmm7[3],xmm14[3] +; SSE2-NEXT: movd %ebp, %xmm3 +; SSE2-NEXT: punpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE2-NEXT: # xmm4 = xmm4[0],mem[0],xmm4[1],mem[1],xmm4[2],mem[2],xmm4[3],mem[3],xmm4[4],mem[4],xmm4[5],mem[5],xmm4[6],mem[6],xmm4[7],mem[7] +; SSE2-NEXT: movd %r14d, %xmm14 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm15[0],xmm0[1],xmm15[1],xmm0[2],xmm15[2],xmm0[3],xmm15[3],xmm0[4],xmm15[4],xmm0[5],xmm15[5],xmm0[6],xmm15[6],xmm0[7],xmm15[7] +; SSE2-NEXT: movd %edx, %xmm15 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3] +; SSE2-NEXT: movd %r13d, %xmm4 +; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1] +; SSE2-NEXT: movd %edi, %xmm7 +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSE2-NEXT: movd %r11d, %xmm2 +; SSE2-NEXT: punpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload +; SSE2-NEXT: # xmm11 = xmm11[0],mem[0],xmm11[1],mem[1],xmm11[2],mem[2],xmm11[3],mem[3],xmm11[4],mem[4],xmm11[5],mem[5],xmm11[6],mem[6],xmm11[7],mem[7] +; SSE2-NEXT: movd %ecx, %xmm1 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm10 = xmm10[0],xmm12[0],xmm10[1],xmm12[1],xmm10[2],xmm12[2],xmm10[3],xmm12[3],xmm10[4],xmm12[4],xmm10[5],xmm12[5],xmm10[6],xmm12[6],xmm10[7],xmm12[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm9[0],xmm5[1],xmm9[1],xmm5[2],xmm9[2],xmm5[3],xmm9[3],xmm5[4],xmm9[4],xmm5[5],xmm9[5],xmm5[6],xmm9[6],xmm5[7],xmm9[7] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm6[0],xmm8[1],xmm6[1],xmm8[2],xmm6[2],xmm8[3],xmm6[3],xmm8[4],xmm6[4],xmm8[5],xmm6[5],xmm8[6],xmm6[6],xmm8[7],xmm6[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm5[0],xmm8[1],xmm5[1],xmm8[2],xmm5[2],xmm8[3],xmm5[3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm10[0],xmm8[1],xmm10[1] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm13[0],xmm3[1],xmm13[1],xmm3[2],xmm13[2],xmm3[3],xmm13[3],xmm3[4],xmm13[4],xmm3[5],xmm13[5],xmm3[6],xmm13[6],xmm3[7],xmm13[7] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm15 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3],xmm15[4],xmm14[4],xmm15[5],xmm14[5],xmm15[6],xmm14[6],xmm15[7],xmm14[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm15 = xmm15[0],xmm3[0],xmm15[1],xmm3[1],xmm15[2],xmm3[2],xmm15[3],xmm3[3] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm4[0],xmm7[1],xmm4[1],xmm7[2],xmm4[2],xmm7[3],xmm4[3],xmm7[4],xmm4[4],xmm7[5],xmm4[5],xmm7[6],xmm4[6],xmm7[7],xmm4[7] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm7[0],xmm1[1],xmm7[1],xmm1[2],xmm7[2],xmm1[3],xmm7[3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm15[0],xmm1[1],xmm15[1] +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm8[0] +; SSE2-NEXT: addq $8, %rsp +; SSE2-NEXT: popq %rbx +; SSE2-NEXT: popq %r12 +; SSE2-NEXT: popq %r13 +; SSE2-NEXT: 
popq %r14 +; SSE2-NEXT: popq %r15 +; SSE2-NEXT: popq %rbp +; SSE2-NEXT: retq +; +; SSSE3-LABEL: v32i8: +; SSSE3: # %bb.0: +; SSSE3-NEXT: pushq %rbp +; SSSE3-NEXT: pushq %r15 +; SSSE3-NEXT: pushq %r14 +; SSSE3-NEXT: pushq %r13 +; SSSE3-NEXT: pushq %r12 +; SSSE3-NEXT: pushq %rbx +; SSSE3-NEXT: pushq %rax +; SSSE3-NEXT: movaps %xmm3, -{{[0-9]+}}(%rsp) +; SSSE3-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %r8b +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %r11b +; SSSE3-NEXT: movl %r8d, %eax +; SSSE3-NEXT: addb %cl, %al +; SSSE3-NEXT: setns %al +; SSSE3-NEXT: addb %cl, %r8b +; SSSE3-NEXT: jno .LBB1_2 +; SSSE3-NEXT: # %bb.1: +; SSSE3-NEXT: addb $127, %al +; SSSE3-NEXT: movl %eax, %r8d +; SSSE3-NEXT: .LBB1_2: +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSSE3-NEXT: movl %r11d, %eax +; SSSE3-NEXT: addb %cl, %al +; SSSE3-NEXT: setns %al +; SSSE3-NEXT: addb %cl, %r11b +; SSSE3-NEXT: jno .LBB1_4 +; SSSE3-NEXT: # %bb.3: +; SSSE3-NEXT: addb $127, %al +; SSSE3-NEXT: movl %eax, %r11d +; SSSE3-NEXT: .LBB1_4: +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %bl +; SSSE3-NEXT: movl %ebx, %eax +; SSSE3-NEXT: addb %cl, %al +; SSSE3-NEXT: setns %al +; SSSE3-NEXT: addb %cl, %bl +; SSSE3-NEXT: jno .LBB1_6 +; SSSE3-NEXT: # %bb.5: +; SSSE3-NEXT: addb $127, %al +; SSSE3-NEXT: movl %eax, %ebx +; SSSE3-NEXT: .LBB1_6: +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %sil +; SSSE3-NEXT: movl %esi, %eax +; SSSE3-NEXT: addb %cl, %al +; SSSE3-NEXT: setns %al +; SSSE3-NEXT: addb %cl, %sil +; SSSE3-NEXT: jno .LBB1_8 +; SSSE3-NEXT: # %bb.7: +; SSSE3-NEXT: addb $127, %al +; SSSE3-NEXT: movl %eax, %esi +; SSSE3-NEXT: .LBB1_8: +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %dl +; SSSE3-NEXT: movl %edx, %eax +; SSSE3-NEXT: addb %cl, %al +; SSSE3-NEXT: setns %al +; SSSE3-NEXT: addb %cl, %dl +; SSSE3-NEXT: jo .LBB1_9 +; SSSE3-NEXT: # %bb.10: +; SSSE3-NEXT: movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSSE3-NEXT: jmp .LBB1_11 +; SSSE3-NEXT: .LBB1_9: +; SSSE3-NEXT: addb $127, %al +; SSSE3-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSSE3-NEXT: .LBB1_11: +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %dil +; SSSE3-NEXT: movl %edi, %eax +; SSSE3-NEXT: addb %cl, %al +; SSSE3-NEXT: setns %al +; SSSE3-NEXT: addb %cl, %dil +; SSSE3-NEXT: movb %r8b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSSE3-NEXT: movb %bl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSSE3-NEXT: jno .LBB1_13 +; SSSE3-NEXT: # %bb.12: +; SSSE3-NEXT: addb $127, %al +; SSSE3-NEXT: movl %eax, %edi +; SSSE3-NEXT: .LBB1_13: +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %r9b +; SSSE3-NEXT: movl %r9d, %eax +; SSSE3-NEXT: addb %cl, %al +; SSSE3-NEXT: setns %al +; SSSE3-NEXT: addb %cl, %r9b +; SSSE3-NEXT: jno .LBB1_15 +; SSSE3-NEXT: # %bb.14: +; SSSE3-NEXT: addb $127, %al +; SSSE3-NEXT: movl %eax, %r9d +; SSSE3-NEXT: .LBB1_15: +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %r10b +; SSSE3-NEXT: movl %r10d, %eax +; SSSE3-NEXT: addb %cl, %al +; SSSE3-NEXT: setns %al +; SSSE3-NEXT: addb %cl, %r10b +; SSSE3-NEXT: movb %r11b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSSE3-NEXT: jno .LBB1_17 +; SSSE3-NEXT: # %bb.16: +; SSSE3-NEXT: addb $127, %al +; SSSE3-NEXT: movl %eax, %r10d +; SSSE3-NEXT: .LBB1_17: +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %bpl +; 
SSSE3-NEXT: movl %ebp, %eax +; SSSE3-NEXT: addb %cl, %al +; SSSE3-NEXT: setns %al +; SSSE3-NEXT: addb %cl, %bpl +; SSSE3-NEXT: jno .LBB1_19 +; SSSE3-NEXT: # %bb.18: +; SSSE3-NEXT: addb $127, %al +; SSSE3-NEXT: movl %eax, %ebp +; SSSE3-NEXT: .LBB1_19: +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %r14b +; SSSE3-NEXT: movl %r14d, %eax +; SSSE3-NEXT: addb %cl, %al +; SSSE3-NEXT: setns %al +; SSSE3-NEXT: addb %cl, %r14b +; SSSE3-NEXT: jno .LBB1_21 +; SSSE3-NEXT: # %bb.20: +; SSSE3-NEXT: addb $127, %al +; SSSE3-NEXT: movl %eax, %r14d +; SSSE3-NEXT: .LBB1_21: +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %r11b +; SSSE3-NEXT: movl %r11d, %eax +; SSSE3-NEXT: addb %cl, %al +; SSSE3-NEXT: setns %al +; SSSE3-NEXT: addb %cl, %r11b +; SSSE3-NEXT: jno .LBB1_23 +; SSSE3-NEXT: # %bb.22: +; SSSE3-NEXT: addb $127, %al +; SSSE3-NEXT: movl %eax, %r11d +; SSSE3-NEXT: .LBB1_23: +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %r8b +; SSSE3-NEXT: movl %r8d, %eax +; SSSE3-NEXT: addb %cl, %al +; SSSE3-NEXT: setns %al +; SSSE3-NEXT: addb %cl, %r8b +; SSSE3-NEXT: jno .LBB1_25 +; SSSE3-NEXT: # %bb.24: +; SSSE3-NEXT: addb $127, %al +; SSSE3-NEXT: movl %eax, %r8d +; SSSE3-NEXT: .LBB1_25: +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %r15b +; SSSE3-NEXT: movl %r15d, %eax +; SSSE3-NEXT: addb %cl, %al +; SSSE3-NEXT: setns %al +; SSSE3-NEXT: addb %cl, %r15b +; SSSE3-NEXT: jno .LBB1_27 +; SSSE3-NEXT: # %bb.26: +; SSSE3-NEXT: addb $127, %al +; SSSE3-NEXT: movl %eax, %r15d +; SSSE3-NEXT: .LBB1_27: +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %r12b +; SSSE3-NEXT: movl %r12d, %eax +; SSSE3-NEXT: addb %cl, %al +; SSSE3-NEXT: setns %al +; SSSE3-NEXT: addb %cl, %r12b +; SSSE3-NEXT: jno .LBB1_29 +; SSSE3-NEXT: # %bb.28: +; SSSE3-NEXT: addb $127, %al +; SSSE3-NEXT: movl %eax, %r12d +; SSSE3-NEXT: .LBB1_29: +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %r13b +; SSSE3-NEXT: movl %r13d, %eax +; SSSE3-NEXT: addb %cl, %al +; SSSE3-NEXT: setns %al +; SSSE3-NEXT: addb %cl, %r13b +; SSSE3-NEXT: jno .LBB1_31 +; SSSE3-NEXT: # %bb.30: +; SSSE3-NEXT: addb $127, %al +; SSSE3-NEXT: movl %eax, %r13d +; SSSE3-NEXT: .LBB1_31: +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %dl +; SSSE3-NEXT: movl %edx, %eax +; SSSE3-NEXT: addb %cl, %al +; SSSE3-NEXT: setns %al +; SSSE3-NEXT: addb %cl, %dl +; SSSE3-NEXT: movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSSE3-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) +; SSSE3-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; SSSE3-NEXT: movb %sil, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSSE3-NEXT: jno .LBB1_33 +; SSSE3-NEXT: # %bb.32: +; SSSE3-NEXT: addb $127, %al +; SSSE3-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSSE3-NEXT: .LBB1_33: +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %sil +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %dl +; SSSE3-NEXT: movl %esi, %eax +; SSSE3-NEXT: addb %cl, %al +; SSSE3-NEXT: setns %al +; SSSE3-NEXT: addb %cl, %sil +; SSSE3-NEXT: jno .LBB1_35 +; SSSE3-NEXT: # %bb.34: +; SSSE3-NEXT: addb $127, %al +; SSSE3-NEXT: movl %eax, %esi +; SSSE3-NEXT: .LBB1_35: +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSSE3-NEXT: movl %edx, %eax +; SSSE3-NEXT: addb %cl, %al +; SSSE3-NEXT: setns %al +; SSSE3-NEXT: addb %cl, %dl +; SSSE3-NEXT: movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSSE3-NEXT: jno .LBB1_37 +; 
SSSE3-NEXT: # %bb.36: +; SSSE3-NEXT: addb $127, %al +; SSSE3-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSSE3-NEXT: .LBB1_37: +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %dl +; SSSE3-NEXT: movl %edx, %eax +; SSSE3-NEXT: addb %cl, %al +; SSSE3-NEXT: setns %al +; SSSE3-NEXT: addb %cl, %dl +; SSSE3-NEXT: jno .LBB1_39 +; SSSE3-NEXT: # %bb.38: +; SSSE3-NEXT: addb $127, %al +; SSSE3-NEXT: movl %eax, %edx +; SSSE3-NEXT: .LBB1_39: +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %bl +; SSSE3-NEXT: movl %ebx, %eax +; SSSE3-NEXT: addb %cl, %al +; SSSE3-NEXT: setns %al +; SSSE3-NEXT: addb %cl, %bl +; SSSE3-NEXT: movb %bl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSSE3-NEXT: jno .LBB1_41 +; SSSE3-NEXT: # %bb.40: +; SSSE3-NEXT: addb $127, %al +; SSSE3-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSSE3-NEXT: .LBB1_41: +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %bl +; SSSE3-NEXT: movl %ebx, %eax +; SSSE3-NEXT: addb %cl, %al +; SSSE3-NEXT: setns %al +; SSSE3-NEXT: addb %cl, %bl +; SSSE3-NEXT: movb %bpl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSSE3-NEXT: movb %r14b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSSE3-NEXT: movb %r11b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSSE3-NEXT: jno .LBB1_43 +; SSSE3-NEXT: # %bb.42: +; SSSE3-NEXT: addb $127, %al +; SSSE3-NEXT: movl %eax, %ebx +; SSSE3-NEXT: .LBB1_43: +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %bpl +; SSSE3-NEXT: movl %ebp, %eax +; SSSE3-NEXT: addb %cl, %al +; SSSE3-NEXT: setns %al +; SSSE3-NEXT: addb %cl, %bpl +; SSSE3-NEXT: movb %r8b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSSE3-NEXT: jno .LBB1_45 +; SSSE3-NEXT: # %bb.44: +; SSSE3-NEXT: addb $127, %al +; SSSE3-NEXT: movl %eax, %ebp +; SSSE3-NEXT: .LBB1_45: +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %r14b +; SSSE3-NEXT: movl %r14d, %eax +; SSSE3-NEXT: addb %cl, %al +; SSSE3-NEXT: setns %al +; SSSE3-NEXT: addb %cl, %r14b +; SSSE3-NEXT: movb %r15b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSSE3-NEXT: jno .LBB1_47 +; SSSE3-NEXT: # %bb.46: +; SSSE3-NEXT: addb $127, %al +; SSSE3-NEXT: movl %eax, %r14d +; SSSE3-NEXT: .LBB1_47: +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %r15b +; SSSE3-NEXT: movl %r15d, %eax +; SSSE3-NEXT: addb %cl, %al +; SSSE3-NEXT: setns %al +; SSSE3-NEXT: addb %cl, %r15b +; SSSE3-NEXT: movb %r12b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSSE3-NEXT: jno .LBB1_49 +; SSSE3-NEXT: # %bb.48: +; SSSE3-NEXT: addb $127, %al +; SSSE3-NEXT: movl %eax, %r15d +; SSSE3-NEXT: .LBB1_49: +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %r12b +; SSSE3-NEXT: movl %r12d, %eax +; SSSE3-NEXT: addb %cl, %al +; SSSE3-NEXT: setns %al +; SSSE3-NEXT: addb %cl, %r12b +; SSSE3-NEXT: movb %r13b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSSE3-NEXT: jno .LBB1_51 +; SSSE3-NEXT: # %bb.50: +; SSSE3-NEXT: addb $127, %al +; SSSE3-NEXT: movl %eax, %r12d +; SSSE3-NEXT: .LBB1_51: +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %r13b +; SSSE3-NEXT: movl %r13d, %eax +; SSSE3-NEXT: addb %cl, %al +; SSSE3-NEXT: setns %al +; SSSE3-NEXT: addb %cl, %r13b +; SSSE3-NEXT: movb %dil, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSSE3-NEXT: jno .LBB1_53 +; SSSE3-NEXT: # %bb.52: +; SSSE3-NEXT: addb $127, %al +; SSSE3-NEXT: movl %eax, %r13d +; SSSE3-NEXT: .LBB1_53: +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl 
+; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %r8b +; SSSE3-NEXT: movl %r8d, %eax +; SSSE3-NEXT: addb %cl, %al +; SSSE3-NEXT: setns %al +; SSSE3-NEXT: addb %cl, %r8b +; SSSE3-NEXT: movb %r9b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSSE3-NEXT: jno .LBB1_55 +; SSSE3-NEXT: # %bb.54: +; SSSE3-NEXT: addb $127, %al +; SSSE3-NEXT: movl %eax, %r8d +; SSSE3-NEXT: .LBB1_55: +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %r9b +; SSSE3-NEXT: movl %r9d, %eax +; SSSE3-NEXT: addb %cl, %al +; SSSE3-NEXT: setns %al +; SSSE3-NEXT: addb %cl, %r9b +; SSSE3-NEXT: movb %r10b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSSE3-NEXT: jno .LBB1_57 +; SSSE3-NEXT: # %bb.56: +; SSSE3-NEXT: addb $127, %al +; SSSE3-NEXT: movl %eax, %r9d +; SSSE3-NEXT: .LBB1_57: +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %r10b +; SSSE3-NEXT: movl %r10d, %eax +; SSSE3-NEXT: addb %cl, %al +; SSSE3-NEXT: setns %al +; SSSE3-NEXT: addb %cl, %r10b +; SSSE3-NEXT: movb %sil, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSSE3-NEXT: movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSSE3-NEXT: jno .LBB1_59 +; SSSE3-NEXT: # %bb.58: +; SSSE3-NEXT: addb $127, %al +; SSSE3-NEXT: movl %eax, %r10d +; SSSE3-NEXT: .LBB1_59: +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %dl +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %r11b +; SSSE3-NEXT: movl %r11d, %ecx +; SSSE3-NEXT: addb %dl, %cl +; SSSE3-NEXT: setns %cl +; SSSE3-NEXT: addb %dl, %r11b +; SSSE3-NEXT: movb %bl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSSE3-NEXT: jno .LBB1_61 +; SSSE3-NEXT: # %bb.60: +; SSSE3-NEXT: addb $127, %cl +; SSSE3-NEXT: movl %ecx, %r11d +; SSSE3-NEXT: .LBB1_61: +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %bl +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSSE3-NEXT: movl %ecx, %edx +; SSSE3-NEXT: addb %bl, %dl +; SSSE3-NEXT: setns %dl +; SSSE3-NEXT: addb %bl, %cl +; SSSE3-NEXT: jno .LBB1_63 +; SSSE3-NEXT: # %bb.62: +; SSSE3-NEXT: addb $127, %dl +; SSSE3-NEXT: movl %edx, %ecx +; SSSE3-NEXT: .LBB1_63: +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %bl +; SSSE3-NEXT: movl %ebx, %edx +; SSSE3-NEXT: addb %al, %dl +; SSSE3-NEXT: setns %dl +; SSSE3-NEXT: addb %al, %bl +; SSSE3-NEXT: jno .LBB1_65 +; SSSE3-NEXT: # %bb.64: +; SSSE3-NEXT: addb $127, %dl +; SSSE3-NEXT: movl %edx, %ebx +; SSSE3-NEXT: .LBB1_65: +; SSSE3-NEXT: movzbl %bl, %esi +; SSSE3-NEXT: movzbl %cl, %edi +; SSSE3-NEXT: movzbl %r11b, %r11d +; SSSE3-NEXT: movzbl %r10b, %r10d +; SSSE3-NEXT: movzbl %r9b, %r9d +; SSSE3-NEXT: movzbl %r8b, %r8d +; SSSE3-NEXT: movzbl %r13b, %r13d +; SSSE3-NEXT: movzbl %r12b, %eax +; SSSE3-NEXT: movzbl %r15b, %ebx +; SSSE3-NEXT: movzbl %r14b, %edx +; SSSE3-NEXT: movzbl %bpl, %ebp +; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload +; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r12d # 1-byte Folded Reload +; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r15d # 1-byte Folded Reload +; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r14d # 1-byte Folded Reload +; SSSE3-NEXT: movd %esi, %xmm12 +; SSSE3-NEXT: movd %edi, %xmm6 +; SSSE3-NEXT: movd %r11d, %xmm11 +; SSSE3-NEXT: movd %r10d, %xmm2 +; SSSE3-NEXT: movd %r9d, %xmm10 +; SSSE3-NEXT: movd %r8d, %xmm5 +; SSSE3-NEXT: movd %r13d, %xmm9 +; SSSE3-NEXT: movd %eax, %xmm1 +; SSSE3-NEXT: movd %ebx, %xmm8 +; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r13d # 1-byte Folded Reload +; SSSE3-NEXT: movd %edx, %xmm14 +; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r11d # 1-byte Folded Reload +; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r8d # 
1-byte Folded Reload +; SSSE3-NEXT: movd %ebp, %xmm13 +; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r9d # 1-byte Folded Reload +; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r10d # 1-byte Folded Reload +; SSSE3-NEXT: movd %ecx, %xmm7 +; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ebx # 1-byte Folded Reload +; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; SSSE3-NEXT: movd %r12d, %xmm0 +; SSSE3-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r12d # 1-byte Folded Reload +; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 1-byte Folded Reload +; SSSE3-NEXT: movd %r15d, %xmm4 +; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r15d # 1-byte Folded Reload +; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ebp # 1-byte Folded Reload +; SSSE3-NEXT: movd %r14d, %xmm15 +; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r14d # 1-byte Folded Reload +; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %edx # 1-byte Folded Reload +; SSSE3-NEXT: movd %r13d, %xmm0 +; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r13d # 1-byte Folded Reload +; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 1-byte Folded Reload +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm12[0],xmm6[1],xmm12[1],xmm6[2],xmm12[2],xmm6[3],xmm12[3],xmm6[4],xmm12[4],xmm6[5],xmm12[5],xmm6[6],xmm12[6],xmm6[7],xmm12[7] +; SSSE3-NEXT: movd %r11d, %xmm3 +; SSSE3-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r11d # 1-byte Folded Reload +; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm11[0],xmm2[1],xmm11[1],xmm2[2],xmm11[2],xmm2[3],xmm11[3],xmm2[4],xmm11[4],xmm2[5],xmm11[5],xmm2[6],xmm11[6],xmm2[7],xmm11[7] +; SSSE3-NEXT: movd %r8d, %xmm11 +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1],xmm2[2],xmm6[2],xmm2[3],xmm6[3] +; SSSE3-NEXT: movd %r9d, %xmm12 +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm10[0],xmm5[1],xmm10[1],xmm5[2],xmm10[2],xmm5[3],xmm10[3],xmm5[4],xmm10[4],xmm5[5],xmm10[5],xmm5[6],xmm10[6],xmm5[7],xmm10[7] +; SSSE3-NEXT: movd %r10d, %xmm10 +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm9[0],xmm1[1],xmm9[1],xmm1[2],xmm9[2],xmm1[3],xmm9[3],xmm1[4],xmm9[4],xmm1[5],xmm9[5],xmm1[6],xmm9[6],xmm1[7],xmm9[7] +; SSSE3-NEXT: movd %ebx, %xmm9 +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1],xmm1[2],xmm5[2],xmm1[3],xmm5[3] +; SSSE3-NEXT: movd %eax, %xmm5 +; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; SSSE3-NEXT: movd %r12d, %xmm6 +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm14 = xmm14[0],xmm8[0],xmm14[1],xmm8[1],xmm14[2],xmm8[2],xmm14[3],xmm8[3],xmm14[4],xmm8[4],xmm14[5],xmm8[5],xmm14[6],xmm8[6],xmm14[7],xmm8[7] +; SSSE3-NEXT: movd %esi, %xmm8 +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm13[0],xmm7[1],xmm13[1],xmm7[2],xmm13[2],xmm7[3],xmm13[3],xmm7[4],xmm13[4],xmm7[5],xmm13[5],xmm7[6],xmm13[6],xmm7[7],xmm13[7] +; SSSE3-NEXT: movd %r15d, %xmm13 +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm14[0],xmm7[1],xmm14[1],xmm7[2],xmm14[2],xmm7[3],xmm14[3] +; SSSE3-NEXT: movd %ebp, %xmm3 +; SSSE3-NEXT: punpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSSE3-NEXT: # xmm4 = xmm4[0],mem[0],xmm4[1],mem[1],xmm4[2],mem[2],xmm4[3],mem[3],xmm4[4],mem[4],xmm4[5],mem[5],xmm4[6],mem[6],xmm4[7],mem[7] +; SSSE3-NEXT: movd %r14d, %xmm14 +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = 
xmm0[0],xmm15[0],xmm0[1],xmm15[1],xmm0[2],xmm15[2],xmm0[3],xmm15[3],xmm0[4],xmm15[4],xmm0[5],xmm15[5],xmm0[6],xmm15[6],xmm0[7],xmm15[7] +; SSSE3-NEXT: movd %edx, %xmm15 +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3] +; SSSE3-NEXT: movd %r13d, %xmm4 +; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1] +; SSSE3-NEXT: movd %edi, %xmm7 +; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSSE3-NEXT: movd %r11d, %xmm2 +; SSSE3-NEXT: punpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload +; SSSE3-NEXT: # xmm11 = xmm11[0],mem[0],xmm11[1],mem[1],xmm11[2],mem[2],xmm11[3],mem[3],xmm11[4],mem[4],xmm11[5],mem[5],xmm11[6],mem[6],xmm11[7],mem[7] +; SSSE3-NEXT: movd %ecx, %xmm1 +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm10 = xmm10[0],xmm12[0],xmm10[1],xmm12[1],xmm10[2],xmm12[2],xmm10[3],xmm12[3],xmm10[4],xmm12[4],xmm10[5],xmm12[5],xmm10[6],xmm12[6],xmm10[7],xmm12[7] +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3] +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm9[0],xmm5[1],xmm9[1],xmm5[2],xmm9[2],xmm5[3],xmm9[3],xmm5[4],xmm9[4],xmm5[5],xmm9[5],xmm5[6],xmm9[6],xmm5[7],xmm9[7] +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm6[0],xmm8[1],xmm6[1],xmm8[2],xmm6[2],xmm8[3],xmm6[3],xmm8[4],xmm6[4],xmm8[5],xmm6[5],xmm8[6],xmm6[6],xmm8[7],xmm6[7] +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm5[0],xmm8[1],xmm5[1],xmm8[2],xmm5[2],xmm8[3],xmm5[3] +; SSSE3-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm10[0],xmm8[1],xmm10[1] +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm13[0],xmm3[1],xmm13[1],xmm3[2],xmm13[2],xmm3[3],xmm13[3],xmm3[4],xmm13[4],xmm3[5],xmm13[5],xmm3[6],xmm13[6],xmm3[7],xmm13[7] +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm15 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3],xmm15[4],xmm14[4],xmm15[5],xmm14[5],xmm15[6],xmm14[6],xmm15[7],xmm14[7] +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm15 = xmm15[0],xmm3[0],xmm15[1],xmm3[1],xmm15[2],xmm3[2],xmm15[3],xmm3[3] +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm4[0],xmm7[1],xmm4[1],xmm7[2],xmm4[2],xmm7[3],xmm4[3],xmm7[4],xmm4[4],xmm7[5],xmm4[5],xmm7[6],xmm4[6],xmm7[7],xmm4[7] +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm7[0],xmm1[1],xmm7[1],xmm1[2],xmm7[2],xmm1[3],xmm7[3] +; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm15[0],xmm1[1],xmm15[1] +; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm8[0] +; SSSE3-NEXT: addq $8, %rsp +; SSSE3-NEXT: popq %rbx +; SSSE3-NEXT: popq %r12 +; SSSE3-NEXT: popq %r13 +; SSSE3-NEXT: popq %r14 +; SSSE3-NEXT: popq %r15 +; SSSE3-NEXT: popq %rbp +; SSSE3-NEXT: retq +; +; SSE41-LABEL: v32i8: +; SSE41: # %bb.0: +; SSE41-NEXT: pextrb $15, %xmm3, %ecx +; SSE41-NEXT: pextrb $15, %xmm1, %edx +; SSE41-NEXT: movl %edx, %eax +; SSE41-NEXT: addb %cl, %al +; SSE41-NEXT: setns %al +; SSE41-NEXT: addb %cl, %dl +; SSE41-NEXT: jno .LBB1_2 +; SSE41-NEXT: # %bb.1: +; SSE41-NEXT: addb $127, %al +; SSE41-NEXT: movl %eax, %edx +; SSE41-NEXT: .LBB1_2: +; SSE41-NEXT: pextrb $14, %xmm3, %ecx +; SSE41-NEXT: pextrb $14, %xmm1, %esi +; SSE41-NEXT: movl %esi, %eax +; SSE41-NEXT: addb %cl, %al +; SSE41-NEXT: setns %al +; SSE41-NEXT: addb %cl, %sil +; SSE41-NEXT: jno .LBB1_4 +; SSE41-NEXT: # %bb.3: +; SSE41-NEXT: addb $127, %al +; SSE41-NEXT: movl %eax, %esi +; SSE41-NEXT: .LBB1_4: +; SSE41-NEXT: 
pushq %rbp +; SSE41-NEXT: pushq %r15 +; SSE41-NEXT: pushq %r14 +; SSE41-NEXT: pushq %r13 +; SSE41-NEXT: pushq %r12 +; SSE41-NEXT: pushq %rbx +; SSE41-NEXT: pextrb $13, %xmm3, %ecx +; SSE41-NEXT: pextrb $13, %xmm1, %edi +; SSE41-NEXT: movl %edi, %eax +; SSE41-NEXT: addb %cl, %al +; SSE41-NEXT: setns %al +; SSE41-NEXT: addb %cl, %dil +; SSE41-NEXT: jo .LBB1_5 +; SSE41-NEXT: # %bb.6: +; SSE41-NEXT: movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE41-NEXT: jmp .LBB1_7 +; SSE41-NEXT: .LBB1_5: +; SSE41-NEXT: addb $127, %al +; SSE41-NEXT: # kill: def $al killed $al def $eax +; SSE41-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE41-NEXT: .LBB1_7: +; SSE41-NEXT: pextrb $12, %xmm3, %ecx +; SSE41-NEXT: pextrb $12, %xmm1, %edi +; SSE41-NEXT: movl %edi, %eax +; SSE41-NEXT: addb %cl, %al +; SSE41-NEXT: setns %al +; SSE41-NEXT: addb %cl, %dil +; SSE41-NEXT: jno .LBB1_9 +; SSE41-NEXT: # %bb.8: +; SSE41-NEXT: addb $127, %al +; SSE41-NEXT: movl %eax, %edi +; SSE41-NEXT: .LBB1_9: +; SSE41-NEXT: pextrb $11, %xmm3, %ecx +; SSE41-NEXT: pextrb $11, %xmm1, %ebp +; SSE41-NEXT: movl %ebp, %eax +; SSE41-NEXT: addb %cl, %al +; SSE41-NEXT: setns %al +; SSE41-NEXT: addb %cl, %bpl +; SSE41-NEXT: jno .LBB1_11 +; SSE41-NEXT: # %bb.10: +; SSE41-NEXT: addb $127, %al +; SSE41-NEXT: movl %eax, %ebp +; SSE41-NEXT: .LBB1_11: +; SSE41-NEXT: pextrb $10, %xmm3, %ecx +; SSE41-NEXT: pextrb $10, %xmm1, %ebx +; SSE41-NEXT: movl %ebx, %eax +; SSE41-NEXT: addb %cl, %al +; SSE41-NEXT: setns %al +; SSE41-NEXT: addb %cl, %bl +; SSE41-NEXT: movl %ebp, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE41-NEXT: jno .LBB1_13 +; SSE41-NEXT: # %bb.12: +; SSE41-NEXT: addb $127, %al +; SSE41-NEXT: movl %eax, %ebx +; SSE41-NEXT: .LBB1_13: +; SSE41-NEXT: pextrb $9, %xmm3, %ecx +; SSE41-NEXT: pextrb $9, %xmm1, %ebp +; SSE41-NEXT: movl %ebp, %eax +; SSE41-NEXT: addb %cl, %al +; SSE41-NEXT: setns %al +; SSE41-NEXT: addb %cl, %bpl +; SSE41-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE41-NEXT: jno .LBB1_15 +; SSE41-NEXT: # %bb.14: +; SSE41-NEXT: addb $127, %al +; SSE41-NEXT: movl %eax, %ebp +; SSE41-NEXT: .LBB1_15: +; SSE41-NEXT: pextrb $8, %xmm3, %ecx +; SSE41-NEXT: pextrb $8, %xmm1, %esi +; SSE41-NEXT: movl %esi, %eax +; SSE41-NEXT: addb %cl, %al +; SSE41-NEXT: setns %al +; SSE41-NEXT: addb %cl, %sil +; SSE41-NEXT: movl %edx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE41-NEXT: jno .LBB1_17 +; SSE41-NEXT: # %bb.16: +; SSE41-NEXT: addb $127, %al +; SSE41-NEXT: movl %eax, %esi +; SSE41-NEXT: .LBB1_17: +; SSE41-NEXT: pextrb $7, %xmm3, %ecx +; SSE41-NEXT: pextrb $7, %xmm1, %edx +; SSE41-NEXT: movl %edx, %eax +; SSE41-NEXT: addb %cl, %al +; SSE41-NEXT: setns %al +; SSE41-NEXT: addb %cl, %dl +; SSE41-NEXT: movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE41-NEXT: jno .LBB1_19 +; SSE41-NEXT: # %bb.18: +; SSE41-NEXT: addb $127, %al +; SSE41-NEXT: movl %eax, %edx +; SSE41-NEXT: .LBB1_19: +; SSE41-NEXT: pextrb $6, %xmm3, %ecx +; SSE41-NEXT: pextrb $6, %xmm1, %edi +; SSE41-NEXT: movl %edi, %eax +; SSE41-NEXT: addb %cl, %al +; SSE41-NEXT: setns %al +; SSE41-NEXT: addb %cl, %dil +; SSE41-NEXT: movl %ebx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE41-NEXT: movl %ebp, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE41-NEXT: jno .LBB1_21 +; SSE41-NEXT: # %bb.20: +; SSE41-NEXT: addb $127, %al +; SSE41-NEXT: movl %eax, %edi +; SSE41-NEXT: .LBB1_21: +; SSE41-NEXT: pextrb $5, %xmm3, %ecx +; SSE41-NEXT: pextrb $5, %xmm1, %r8d +; SSE41-NEXT: movl %r8d, %eax +; SSE41-NEXT: addb %cl, %al +; SSE41-NEXT: setns %al +; SSE41-NEXT: 
addb %cl, %r8b +; SSE41-NEXT: jno .LBB1_23 +; SSE41-NEXT: # %bb.22: +; SSE41-NEXT: addb $127, %al +; SSE41-NEXT: movl %eax, %r8d +; SSE41-NEXT: .LBB1_23: +; SSE41-NEXT: pextrb $4, %xmm3, %ecx +; SSE41-NEXT: pextrb $4, %xmm1, %r11d +; SSE41-NEXT: movl %r11d, %eax +; SSE41-NEXT: addb %cl, %al +; SSE41-NEXT: setns %al +; SSE41-NEXT: addb %cl, %r11b +; SSE41-NEXT: jno .LBB1_25 +; SSE41-NEXT: # %bb.24: +; SSE41-NEXT: addb $127, %al +; SSE41-NEXT: movl %eax, %r11d +; SSE41-NEXT: .LBB1_25: +; SSE41-NEXT: pextrb $3, %xmm3, %ecx +; SSE41-NEXT: pextrb $3, %xmm1, %r10d +; SSE41-NEXT: movl %r10d, %eax +; SSE41-NEXT: addb %cl, %al +; SSE41-NEXT: setns %al +; SSE41-NEXT: addb %cl, %r10b +; SSE41-NEXT: jno .LBB1_27 +; SSE41-NEXT: # %bb.26: +; SSE41-NEXT: addb $127, %al +; SSE41-NEXT: movl %eax, %r10d +; SSE41-NEXT: .LBB1_27: +; SSE41-NEXT: pextrb $2, %xmm3, %ecx +; SSE41-NEXT: pextrb $2, %xmm1, %r14d +; SSE41-NEXT: movl %r14d, %eax +; SSE41-NEXT: addb %cl, %al +; SSE41-NEXT: setns %al +; SSE41-NEXT: addb %cl, %r14b +; SSE41-NEXT: jno .LBB1_29 +; SSE41-NEXT: # %bb.28: +; SSE41-NEXT: addb $127, %al +; SSE41-NEXT: movl %eax, %r14d +; SSE41-NEXT: .LBB1_29: +; SSE41-NEXT: pextrb $0, %xmm3, %ecx +; SSE41-NEXT: pextrb $0, %xmm1, %r9d +; SSE41-NEXT: movl %r9d, %eax +; SSE41-NEXT: addb %cl, %al +; SSE41-NEXT: setns %al +; SSE41-NEXT: addb %cl, %r9b +; SSE41-NEXT: jno .LBB1_31 +; SSE41-NEXT: # %bb.30: +; SSE41-NEXT: addb $127, %al +; SSE41-NEXT: movl %eax, %r9d +; SSE41-NEXT: .LBB1_31: +; SSE41-NEXT: pextrb $1, %xmm3, %ecx +; SSE41-NEXT: pextrb $1, %xmm1, %ebp +; SSE41-NEXT: movl %ebp, %eax +; SSE41-NEXT: addb %cl, %al +; SSE41-NEXT: setns %al +; SSE41-NEXT: addb %cl, %bpl +; SSE41-NEXT: jno .LBB1_33 +; SSE41-NEXT: # %bb.32: +; SSE41-NEXT: addb $127, %al +; SSE41-NEXT: movl %eax, %ebp +; SSE41-NEXT: .LBB1_33: +; SSE41-NEXT: pextrb $15, %xmm2, %ecx +; SSE41-NEXT: pextrb $15, %xmm0, %ebx +; SSE41-NEXT: movl %ebx, %eax +; SSE41-NEXT: addb %cl, %al +; SSE41-NEXT: setns %al +; SSE41-NEXT: movl %esi, %r15d +; SSE41-NEXT: addb %cl, %bl +; SSE41-NEXT: jno .LBB1_35 +; SSE41-NEXT: # %bb.34: +; SSE41-NEXT: addb $127, %al +; SSE41-NEXT: movl %eax, %ebx +; SSE41-NEXT: .LBB1_35: +; SSE41-NEXT: movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE41-NEXT: pextrb $14, %xmm2, %ecx +; SSE41-NEXT: pextrb $14, %xmm0, %esi +; SSE41-NEXT: movl %esi, %eax +; SSE41-NEXT: addb %cl, %al +; SSE41-NEXT: setns %al +; SSE41-NEXT: movl %edx, %edi +; SSE41-NEXT: addb %cl, %sil +; SSE41-NEXT: jno .LBB1_37 +; SSE41-NEXT: # %bb.36: +; SSE41-NEXT: addb $127, %al +; SSE41-NEXT: movl %eax, %esi +; SSE41-NEXT: .LBB1_37: +; SSE41-NEXT: pextrb $13, %xmm2, %ecx +; SSE41-NEXT: pextrb $13, %xmm0, %edx +; SSE41-NEXT: movl %edx, %eax +; SSE41-NEXT: addb %cl, %al +; SSE41-NEXT: setns %al +; SSE41-NEXT: addb %cl, %dl +; SSE41-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE41-NEXT: jo .LBB1_38 +; SSE41-NEXT: # %bb.39: +; SSE41-NEXT: movl %edx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE41-NEXT: jmp .LBB1_40 +; SSE41-NEXT: .LBB1_38: +; SSE41-NEXT: addb $127, %al +; SSE41-NEXT: # kill: def $al killed $al def $eax +; SSE41-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE41-NEXT: .LBB1_40: +; SSE41-NEXT: movl %edi, %edx +; SSE41-NEXT: pextrb $12, %xmm2, %ecx +; SSE41-NEXT: pextrb $12, %xmm0, %edi +; SSE41-NEXT: movl %edi, %eax +; SSE41-NEXT: addb %cl, %al +; SSE41-NEXT: setns %al +; SSE41-NEXT: addb %cl, %dil +; SSE41-NEXT: movl %r15d, %esi +; SSE41-NEXT: jno .LBB1_42 +; SSE41-NEXT: # %bb.41: +; SSE41-NEXT: addb $127, %al +; 
SSE41-NEXT: movl %eax, %edi +; SSE41-NEXT: .LBB1_42: +; SSE41-NEXT: pextrb $11, %xmm2, %ecx +; SSE41-NEXT: pextrb $11, %xmm0, %r15d +; SSE41-NEXT: movl %r15d, %eax +; SSE41-NEXT: addb %cl, %al +; SSE41-NEXT: setns %al +; SSE41-NEXT: addb %cl, %r15b +; SSE41-NEXT: jno .LBB1_44 +; SSE41-NEXT: # %bb.43: +; SSE41-NEXT: addb $127, %al +; SSE41-NEXT: movl %eax, %r15d +; SSE41-NEXT: .LBB1_44: +; SSE41-NEXT: pextrb $10, %xmm2, %ecx +; SSE41-NEXT: pextrb $10, %xmm0, %r12d +; SSE41-NEXT: movl %r12d, %eax +; SSE41-NEXT: addb %cl, %al +; SSE41-NEXT: setns %al +; SSE41-NEXT: addb %cl, %r12b +; SSE41-NEXT: jno .LBB1_46 +; SSE41-NEXT: # %bb.45: +; SSE41-NEXT: addb $127, %al +; SSE41-NEXT: movl %eax, %r12d +; SSE41-NEXT: .LBB1_46: +; SSE41-NEXT: pextrb $9, %xmm2, %ecx +; SSE41-NEXT: pextrb $9, %xmm0, %r13d +; SSE41-NEXT: movl %r13d, %eax +; SSE41-NEXT: addb %cl, %al +; SSE41-NEXT: setns %al +; SSE41-NEXT: addb %cl, %r13b +; SSE41-NEXT: movl %r11d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE41-NEXT: movl %r10d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE41-NEXT: movl %r14d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE41-NEXT: movl %r9d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE41-NEXT: movl %ebp, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE41-NEXT: movl %ebx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE41-NEXT: jno .LBB1_48 +; SSE41-NEXT: # %bb.47: +; SSE41-NEXT: addb $127, %al +; SSE41-NEXT: movl %eax, %r13d +; SSE41-NEXT: .LBB1_48: +; SSE41-NEXT: pextrb $8, %xmm2, %ecx +; SSE41-NEXT: pextrb $8, %xmm0, %r11d +; SSE41-NEXT: movl %r11d, %eax +; SSE41-NEXT: addb %cl, %al +; SSE41-NEXT: setns %al +; SSE41-NEXT: addb %cl, %r11b +; SSE41-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE41-NEXT: movl %edx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE41-NEXT: movl %r8d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE41-NEXT: jno .LBB1_50 +; SSE41-NEXT: # %bb.49: +; SSE41-NEXT: addb $127, %al +; SSE41-NEXT: movl %eax, %r11d +; SSE41-NEXT: .LBB1_50: +; SSE41-NEXT: pextrb $7, %xmm2, %ecx +; SSE41-NEXT: pextrb $7, %xmm0, %r10d +; SSE41-NEXT: movl %r10d, %eax +; SSE41-NEXT: addb %cl, %al +; SSE41-NEXT: setns %al +; SSE41-NEXT: addb %cl, %r10b +; SSE41-NEXT: jno .LBB1_52 +; SSE41-NEXT: # %bb.51: +; SSE41-NEXT: addb $127, %al +; SSE41-NEXT: movl %eax, %r10d +; SSE41-NEXT: .LBB1_52: +; SSE41-NEXT: pextrb $6, %xmm2, %ecx +; SSE41-NEXT: pextrb $6, %xmm0, %ebp +; SSE41-NEXT: movl %ebp, %eax +; SSE41-NEXT: addb %cl, %al +; SSE41-NEXT: setns %al +; SSE41-NEXT: movl %edi, %r14d +; SSE41-NEXT: addb %cl, %bpl +; SSE41-NEXT: jno .LBB1_54 +; SSE41-NEXT: # %bb.53: +; SSE41-NEXT: addb $127, %al +; SSE41-NEXT: movl %eax, %ebp +; SSE41-NEXT: .LBB1_54: +; SSE41-NEXT: pextrb $5, %xmm2, %ecx +; SSE41-NEXT: pextrb $5, %xmm0, %edi +; SSE41-NEXT: movl %edi, %eax +; SSE41-NEXT: addb %cl, %al +; SSE41-NEXT: setns %al +; SSE41-NEXT: addb %cl, %dil +; SSE41-NEXT: jno .LBB1_56 +; SSE41-NEXT: # %bb.55: +; SSE41-NEXT: addb $127, %al +; SSE41-NEXT: movl %eax, %edi +; SSE41-NEXT: .LBB1_56: +; SSE41-NEXT: pextrb $4, %xmm2, %edx +; SSE41-NEXT: pextrb $4, %xmm0, %eax +; SSE41-NEXT: movl %eax, %ecx +; SSE41-NEXT: addb %dl, %cl +; SSE41-NEXT: setns %cl +; SSE41-NEXT: addb %dl, %al +; SSE41-NEXT: jno .LBB1_58 +; SSE41-NEXT: # %bb.57: +; SSE41-NEXT: addb $127, %cl +; SSE41-NEXT: movl %ecx, %eax +; SSE41-NEXT: .LBB1_58: +; SSE41-NEXT: pextrb $3, %xmm2, %ebx +; SSE41-NEXT: pextrb $3, %xmm0, %ecx +; SSE41-NEXT: movl %ecx, %edx +; SSE41-NEXT: addb %bl, %dl +; SSE41-NEXT: setns %dl +; SSE41-NEXT: addb %bl, %cl +; SSE41-NEXT: jno 
.LBB1_60 +; SSE41-NEXT: # %bb.59: +; SSE41-NEXT: addb $127, %dl +; SSE41-NEXT: movl %edx, %ecx +; SSE41-NEXT: .LBB1_60: +; SSE41-NEXT: pextrb $2, %xmm2, %esi +; SSE41-NEXT: pextrb $2, %xmm0, %edx +; SSE41-NEXT: movl %edx, %ebx +; SSE41-NEXT: addb %sil, %bl +; SSE41-NEXT: setns %bl +; SSE41-NEXT: addb %sil, %dl +; SSE41-NEXT: jno .LBB1_62 +; SSE41-NEXT: # %bb.61: +; SSE41-NEXT: addb $127, %bl +; SSE41-NEXT: movl %ebx, %edx +; SSE41-NEXT: .LBB1_62: +; SSE41-NEXT: pextrb $0, %xmm2, %esi +; SSE41-NEXT: pextrb $0, %xmm0, %r8d +; SSE41-NEXT: movl %r8d, %ebx +; SSE41-NEXT: addb %sil, %bl +; SSE41-NEXT: setns %bl +; SSE41-NEXT: addb %sil, %r8b +; SSE41-NEXT: jno .LBB1_64 +; SSE41-NEXT: # %bb.63: +; SSE41-NEXT: addb $127, %bl +; SSE41-NEXT: movl %ebx, %r8d +; SSE41-NEXT: .LBB1_64: +; SSE41-NEXT: pextrb $1, %xmm2, %esi +; SSE41-NEXT: pextrb $1, %xmm0, %r9d +; SSE41-NEXT: movl %r9d, %ebx +; SSE41-NEXT: addb %sil, %bl +; SSE41-NEXT: setns %bl +; SSE41-NEXT: addb %sil, %r9b +; SSE41-NEXT: jno .LBB1_66 +; SSE41-NEXT: # %bb.65: +; SSE41-NEXT: addb $127, %bl +; SSE41-NEXT: movl %ebx, %r9d +; SSE41-NEXT: .LBB1_66: +; SSE41-NEXT: movzbl %r8b, %esi +; SSE41-NEXT: movd %esi, %xmm0 +; SSE41-NEXT: movzbl %r9b, %esi +; SSE41-NEXT: pinsrb $1, %esi, %xmm0 +; SSE41-NEXT: movzbl %dl, %edx +; SSE41-NEXT: pinsrb $2, %edx, %xmm0 +; SSE41-NEXT: movzbl %cl, %ecx +; SSE41-NEXT: pinsrb $3, %ecx, %xmm0 +; SSE41-NEXT: movzbl %al, %eax +; SSE41-NEXT: pinsrb $4, %eax, %xmm0 +; SSE41-NEXT: movzbl %dil, %eax +; SSE41-NEXT: pinsrb $5, %eax, %xmm0 +; SSE41-NEXT: movzbl %bpl, %eax +; SSE41-NEXT: pinsrb $6, %eax, %xmm0 +; SSE41-NEXT: movzbl %r10b, %eax +; SSE41-NEXT: pinsrb $7, %eax, %xmm0 +; SSE41-NEXT: movzbl %r11b, %eax +; SSE41-NEXT: pinsrb $8, %eax, %xmm0 +; SSE41-NEXT: movzbl %r13b, %eax +; SSE41-NEXT: pinsrb $9, %eax, %xmm0 +; SSE41-NEXT: movzbl %r12b, %eax +; SSE41-NEXT: pinsrb $10, %eax, %xmm0 +; SSE41-NEXT: movzbl %r15b, %eax +; SSE41-NEXT: pinsrb $11, %eax, %xmm0 +; SSE41-NEXT: movzbl %r14b, %eax +; SSE41-NEXT: pinsrb $12, %eax, %xmm0 +; SSE41-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; SSE41-NEXT: pinsrb $13, %eax, %xmm0 +; SSE41-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; SSE41-NEXT: pinsrb $14, %eax, %xmm0 +; SSE41-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; SSE41-NEXT: pinsrb $15, %eax, %xmm0 +; SSE41-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; SSE41-NEXT: movd %eax, %xmm1 +; SSE41-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; SSE41-NEXT: pinsrb $1, %eax, %xmm1 +; SSE41-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; SSE41-NEXT: pinsrb $2, %eax, %xmm1 +; SSE41-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; SSE41-NEXT: pinsrb $3, %eax, %xmm1 +; SSE41-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; SSE41-NEXT: pinsrb $4, %eax, %xmm1 +; SSE41-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; SSE41-NEXT: pinsrb $5, %eax, %xmm1 +; SSE41-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; SSE41-NEXT: pinsrb $6, %eax, %xmm1 +; SSE41-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; SSE41-NEXT: pinsrb $7, %eax, %xmm1 +; SSE41-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; SSE41-NEXT: pinsrb $8, %eax, %xmm1 +; SSE41-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; SSE41-NEXT: pinsrb $9, %eax, %xmm1 +; SSE41-NEXT: movzbl 
{{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; SSE41-NEXT: pinsrb $10, %eax, %xmm1 +; SSE41-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; SSE41-NEXT: pinsrb $11, %eax, %xmm1 +; SSE41-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; SSE41-NEXT: pinsrb $12, %eax, %xmm1 +; SSE41-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; SSE41-NEXT: pinsrb $13, %eax, %xmm1 +; SSE41-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; SSE41-NEXT: pinsrb $14, %eax, %xmm1 +; SSE41-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; SSE41-NEXT: pinsrb $15, %eax, %xmm1 +; SSE41-NEXT: popq %rbx +; SSE41-NEXT: popq %r12 +; SSE41-NEXT: popq %r13 +; SSE41-NEXT: popq %r14 +; SSE41-NEXT: popq %r15 +; SSE41-NEXT: popq %rbp +; SSE41-NEXT: retq +; +; AVX1-LABEL: v32i8: +; AVX1: # %bb.0: +; AVX1-NEXT: pushq %rbp +; AVX1-NEXT: pushq %r15 +; AVX1-NEXT: pushq %r14 +; AVX1-NEXT: pushq %r13 +; AVX1-NEXT: pushq %r12 +; AVX1-NEXT: pushq %rbx +; AVX1-NEXT: vpextrb $15, %xmm1, %ecx +; AVX1-NEXT: vpextrb $15, %xmm0, %edx +; AVX1-NEXT: movl %edx, %eax +; AVX1-NEXT: addb %cl, %al +; AVX1-NEXT: setns %al +; AVX1-NEXT: addb %cl, %dl +; AVX1-NEXT: jo .LBB1_1 +; AVX1-NEXT: # %bb.2: +; AVX1-NEXT: movl %edx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX1-NEXT: jmp .LBB1_3 +; AVX1-NEXT: .LBB1_1: +; AVX1-NEXT: addb $127, %al +; AVX1-NEXT: # kill: def $al killed $al def $eax +; AVX1-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX1-NEXT: .LBB1_3: +; AVX1-NEXT: vpextrb $14, %xmm1, %ecx +; AVX1-NEXT: vpextrb $14, %xmm0, %edx +; AVX1-NEXT: movl %edx, %eax +; AVX1-NEXT: addb %cl, %al +; AVX1-NEXT: setns %al +; AVX1-NEXT: addb %cl, %dl +; AVX1-NEXT: jno .LBB1_5 +; AVX1-NEXT: # %bb.4: +; AVX1-NEXT: addb $127, %al +; AVX1-NEXT: movl %eax, %edx +; AVX1-NEXT: .LBB1_5: +; AVX1-NEXT: vpextrb $13, %xmm1, %ecx +; AVX1-NEXT: vpextrb $13, %xmm0, %esi +; AVX1-NEXT: movl %esi, %eax +; AVX1-NEXT: addb %cl, %al +; AVX1-NEXT: setns %al +; AVX1-NEXT: addb %cl, %sil +; AVX1-NEXT: jo .LBB1_6 +; AVX1-NEXT: # %bb.7: +; AVX1-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX1-NEXT: jmp .LBB1_8 +; AVX1-NEXT: .LBB1_6: +; AVX1-NEXT: addb $127, %al +; AVX1-NEXT: # kill: def $al killed $al def $eax +; AVX1-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX1-NEXT: .LBB1_8: +; AVX1-NEXT: vpextrb $12, %xmm1, %ecx +; AVX1-NEXT: vpextrb $12, %xmm0, %esi +; AVX1-NEXT: movl %esi, %eax +; AVX1-NEXT: addb %cl, %al +; AVX1-NEXT: setns %al +; AVX1-NEXT: addb %cl, %sil +; AVX1-NEXT: jno .LBB1_10 +; AVX1-NEXT: # %bb.9: +; AVX1-NEXT: addb $127, %al +; AVX1-NEXT: movl %eax, %esi +; AVX1-NEXT: .LBB1_10: +; AVX1-NEXT: vpextrb $11, %xmm1, %ecx +; AVX1-NEXT: vpextrb $11, %xmm0, %edi +; AVX1-NEXT: movl %edi, %eax +; AVX1-NEXT: addb %cl, %al +; AVX1-NEXT: setns %al +; AVX1-NEXT: addb %cl, %dil +; AVX1-NEXT: jno .LBB1_12 +; AVX1-NEXT: # %bb.11: +; AVX1-NEXT: addb $127, %al +; AVX1-NEXT: movl %eax, %edi +; AVX1-NEXT: .LBB1_12: +; AVX1-NEXT: vpextrb $10, %xmm1, %ecx +; AVX1-NEXT: vpextrb $10, %xmm0, %ebp +; AVX1-NEXT: movl %ebp, %eax +; AVX1-NEXT: addb %cl, %al +; AVX1-NEXT: setns %al +; AVX1-NEXT: addb %cl, %bpl +; AVX1-NEXT: jno .LBB1_14 +; AVX1-NEXT: # %bb.13: +; AVX1-NEXT: addb $127, %al +; AVX1-NEXT: movl %eax, %ebp +; AVX1-NEXT: .LBB1_14: +; AVX1-NEXT: vpextrb $9, %xmm1, %ecx +; AVX1-NEXT: vpextrb $9, %xmm0, %ebx +; AVX1-NEXT: movl %ebx, %eax +; AVX1-NEXT: addb %cl, %al +; AVX1-NEXT: setns %al +; AVX1-NEXT: addb %cl, %bl +; AVX1-NEXT: 
movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX1-NEXT: jno .LBB1_16 +; AVX1-NEXT: # %bb.15: +; AVX1-NEXT: addb $127, %al +; AVX1-NEXT: movl %eax, %ebx +; AVX1-NEXT: .LBB1_16: +; AVX1-NEXT: vpextrb $8, %xmm1, %ecx +; AVX1-NEXT: vpextrb $8, %xmm0, %esi +; AVX1-NEXT: movl %esi, %eax +; AVX1-NEXT: addb %cl, %al +; AVX1-NEXT: setns %al +; AVX1-NEXT: addb %cl, %sil +; AVX1-NEXT: movl %edx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX1-NEXT: movl %ebp, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX1-NEXT: movl %ebx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX1-NEXT: jno .LBB1_18 +; AVX1-NEXT: # %bb.17: +; AVX1-NEXT: addb $127, %al +; AVX1-NEXT: movl %eax, %esi +; AVX1-NEXT: .LBB1_18: +; AVX1-NEXT: vpextrb $7, %xmm1, %ecx +; AVX1-NEXT: vpextrb $7, %xmm0, %edx +; AVX1-NEXT: movl %edx, %eax +; AVX1-NEXT: addb %cl, %al +; AVX1-NEXT: setns %al +; AVX1-NEXT: addb %cl, %dl +; AVX1-NEXT: movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX1-NEXT: jno .LBB1_20 +; AVX1-NEXT: # %bb.19: +; AVX1-NEXT: addb $127, %al +; AVX1-NEXT: movl %eax, %edx +; AVX1-NEXT: .LBB1_20: +; AVX1-NEXT: vpextrb $6, %xmm1, %ecx +; AVX1-NEXT: vpextrb $6, %xmm0, %edi +; AVX1-NEXT: movl %edi, %eax +; AVX1-NEXT: addb %cl, %al +; AVX1-NEXT: setns %al +; AVX1-NEXT: addb %cl, %dil +; AVX1-NEXT: jno .LBB1_22 +; AVX1-NEXT: # %bb.21: +; AVX1-NEXT: addb $127, %al +; AVX1-NEXT: movl %eax, %edi +; AVX1-NEXT: .LBB1_22: +; AVX1-NEXT: vpextrb $5, %xmm1, %ecx +; AVX1-NEXT: vpextrb $5, %xmm0, %ebp +; AVX1-NEXT: movl %ebp, %eax +; AVX1-NEXT: addb %cl, %al +; AVX1-NEXT: setns %al +; AVX1-NEXT: addb %cl, %bpl +; AVX1-NEXT: jno .LBB1_24 +; AVX1-NEXT: # %bb.23: +; AVX1-NEXT: addb $127, %al +; AVX1-NEXT: movl %eax, %ebp +; AVX1-NEXT: .LBB1_24: +; AVX1-NEXT: vpextrb $4, %xmm1, %ecx +; AVX1-NEXT: vpextrb $4, %xmm0, %r11d +; AVX1-NEXT: movl %r11d, %eax +; AVX1-NEXT: addb %cl, %al +; AVX1-NEXT: setns %al +; AVX1-NEXT: addb %cl, %r11b +; AVX1-NEXT: jno .LBB1_26 +; AVX1-NEXT: # %bb.25: +; AVX1-NEXT: addb $127, %al +; AVX1-NEXT: movl %eax, %r11d +; AVX1-NEXT: .LBB1_26: +; AVX1-NEXT: vpextrb $3, %xmm1, %ecx +; AVX1-NEXT: vpextrb $3, %xmm0, %r14d +; AVX1-NEXT: movl %r14d, %eax +; AVX1-NEXT: addb %cl, %al +; AVX1-NEXT: setns %al +; AVX1-NEXT: addb %cl, %r14b +; AVX1-NEXT: movl %ebp, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX1-NEXT: jno .LBB1_28 +; AVX1-NEXT: # %bb.27: +; AVX1-NEXT: addb $127, %al +; AVX1-NEXT: movl %eax, %r14d +; AVX1-NEXT: .LBB1_28: +; AVX1-NEXT: vpextrb $2, %xmm1, %ecx +; AVX1-NEXT: vpextrb $2, %xmm0, %r8d +; AVX1-NEXT: movl %r8d, %eax +; AVX1-NEXT: addb %cl, %al +; AVX1-NEXT: setns %al +; AVX1-NEXT: addb %cl, %r8b +; AVX1-NEXT: jno .LBB1_30 +; AVX1-NEXT: # %bb.29: +; AVX1-NEXT: addb $127, %al +; AVX1-NEXT: movl %eax, %r8d +; AVX1-NEXT: .LBB1_30: +; AVX1-NEXT: vpextrb $0, %xmm1, %ecx +; AVX1-NEXT: vpextrb $0, %xmm0, %r10d +; AVX1-NEXT: movl %r10d, %eax +; AVX1-NEXT: addb %cl, %al +; AVX1-NEXT: setns %al +; AVX1-NEXT: addb %cl, %r10b +; AVX1-NEXT: jno .LBB1_32 +; AVX1-NEXT: # %bb.31: +; AVX1-NEXT: addb $127, %al +; AVX1-NEXT: movl %eax, %r10d +; AVX1-NEXT: .LBB1_32: +; AVX1-NEXT: vpextrb $1, %xmm1, %ecx +; AVX1-NEXT: vpextrb $1, %xmm0, %r9d +; AVX1-NEXT: movl %r9d, %eax +; AVX1-NEXT: addb %cl, %al +; AVX1-NEXT: setns %al +; AVX1-NEXT: addb %cl, %r9b +; AVX1-NEXT: jno .LBB1_34 +; AVX1-NEXT: # %bb.33: +; AVX1-NEXT: addb $127, %al +; AVX1-NEXT: movl %eax, %r9d +; AVX1-NEXT: .LBB1_34: +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 +; AVX1-NEXT: vpextrb $15, %xmm1, %ecx +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX1-NEXT: 
vpextrb $15, %xmm0, %ebx +; AVX1-NEXT: movl %ebx, %eax +; AVX1-NEXT: addb %cl, %al +; AVX1-NEXT: setns %al +; AVX1-NEXT: movl %esi, %r12d +; AVX1-NEXT: addb %cl, %bl +; AVX1-NEXT: jno .LBB1_36 +; AVX1-NEXT: # %bb.35: +; AVX1-NEXT: addb $127, %al +; AVX1-NEXT: movl %eax, %ebx +; AVX1-NEXT: .LBB1_36: +; AVX1-NEXT: vpextrb $14, %xmm1, %ecx +; AVX1-NEXT: vpextrb $14, %xmm0, %esi +; AVX1-NEXT: movl %esi, %eax +; AVX1-NEXT: addb %cl, %al +; AVX1-NEXT: setns %al +; AVX1-NEXT: movl %edx, %r13d +; AVX1-NEXT: addb %cl, %sil +; AVX1-NEXT: jno .LBB1_38 +; AVX1-NEXT: # %bb.37: +; AVX1-NEXT: addb $127, %al +; AVX1-NEXT: movl %eax, %esi +; AVX1-NEXT: .LBB1_38: +; AVX1-NEXT: vpextrb $13, %xmm1, %ecx +; AVX1-NEXT: vpextrb $13, %xmm0, %edx +; AVX1-NEXT: movl %edx, %eax +; AVX1-NEXT: addb %cl, %al +; AVX1-NEXT: setns %al +; AVX1-NEXT: movl %edi, %ebp +; AVX1-NEXT: addb %cl, %dl +; AVX1-NEXT: jno .LBB1_40 +; AVX1-NEXT: # %bb.39: +; AVX1-NEXT: addb $127, %al +; AVX1-NEXT: movl %eax, %edx +; AVX1-NEXT: .LBB1_40: +; AVX1-NEXT: vpextrb $12, %xmm1, %ecx +; AVX1-NEXT: vpextrb $12, %xmm0, %edi +; AVX1-NEXT: movl %edi, %eax +; AVX1-NEXT: addb %cl, %al +; AVX1-NEXT: setns %al +; AVX1-NEXT: addb %cl, %dil +; AVX1-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX1-NEXT: movl %edx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX1-NEXT: jo .LBB1_41 +; AVX1-NEXT: # %bb.42: +; AVX1-NEXT: movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX1-NEXT: jmp .LBB1_43 +; AVX1-NEXT: .LBB1_41: +; AVX1-NEXT: addb $127, %al +; AVX1-NEXT: # kill: def $al killed $al def $eax +; AVX1-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX1-NEXT: .LBB1_43: +; AVX1-NEXT: movl %ebp, %edi +; AVX1-NEXT: vpextrb $11, %xmm1, %ecx +; AVX1-NEXT: vpextrb $11, %xmm0, %r15d +; AVX1-NEXT: movl %r15d, %eax +; AVX1-NEXT: addb %cl, %al +; AVX1-NEXT: setns %al +; AVX1-NEXT: addb %cl, %r15b +; AVX1-NEXT: movl %r12d, %esi +; AVX1-NEXT: movl %r13d, %edx +; AVX1-NEXT: jno .LBB1_45 +; AVX1-NEXT: # %bb.44: +; AVX1-NEXT: addb $127, %al +; AVX1-NEXT: movl %eax, %r15d +; AVX1-NEXT: .LBB1_45: +; AVX1-NEXT: vpextrb $10, %xmm1, %ecx +; AVX1-NEXT: vpextrb $10, %xmm0, %r12d +; AVX1-NEXT: movl %r12d, %eax +; AVX1-NEXT: addb %cl, %al +; AVX1-NEXT: setns %al +; AVX1-NEXT: addb %cl, %r12b +; AVX1-NEXT: jno .LBB1_47 +; AVX1-NEXT: # %bb.46: +; AVX1-NEXT: addb $127, %al +; AVX1-NEXT: movl %eax, %r12d +; AVX1-NEXT: .LBB1_47: +; AVX1-NEXT: vpextrb $9, %xmm1, %ecx +; AVX1-NEXT: vpextrb $9, %xmm0, %r13d +; AVX1-NEXT: movl %r13d, %eax +; AVX1-NEXT: addb %cl, %al +; AVX1-NEXT: setns %al +; AVX1-NEXT: addb %cl, %r13b +; AVX1-NEXT: movl %r11d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX1-NEXT: movl %r14d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX1-NEXT: movl %r8d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX1-NEXT: movl %r10d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX1-NEXT: movl %r9d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX1-NEXT: movl %ebx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX1-NEXT: jno .LBB1_49 +; AVX1-NEXT: # %bb.48: +; AVX1-NEXT: addb $127, %al +; AVX1-NEXT: movl %eax, %r13d +; AVX1-NEXT: .LBB1_49: +; AVX1-NEXT: vpextrb $8, %xmm1, %ecx +; AVX1-NEXT: vpextrb $8, %xmm0, %r11d +; AVX1-NEXT: movl %r11d, %eax +; AVX1-NEXT: addb %cl, %al +; AVX1-NEXT: setns %al +; AVX1-NEXT: addb %cl, %r11b +; AVX1-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX1-NEXT: movl %edx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX1-NEXT: movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX1-NEXT: jno .LBB1_51 +; AVX1-NEXT: # 
%bb.50: +; AVX1-NEXT: addb $127, %al +; AVX1-NEXT: movl %eax, %r11d +; AVX1-NEXT: .LBB1_51: +; AVX1-NEXT: vpextrb $7, %xmm1, %ecx +; AVX1-NEXT: vpextrb $7, %xmm0, %r10d +; AVX1-NEXT: movl %r10d, %eax +; AVX1-NEXT: addb %cl, %al +; AVX1-NEXT: setns %al +; AVX1-NEXT: addb %cl, %r10b +; AVX1-NEXT: jno .LBB1_53 +; AVX1-NEXT: # %bb.52: +; AVX1-NEXT: addb $127, %al +; AVX1-NEXT: movl %eax, %r10d +; AVX1-NEXT: .LBB1_53: +; AVX1-NEXT: vpextrb $6, %xmm1, %ecx +; AVX1-NEXT: vpextrb $6, %xmm0, %ebp +; AVX1-NEXT: movl %ebp, %eax +; AVX1-NEXT: addb %cl, %al +; AVX1-NEXT: setns %al +; AVX1-NEXT: addb %cl, %bpl +; AVX1-NEXT: jno .LBB1_55 +; AVX1-NEXT: # %bb.54: +; AVX1-NEXT: addb $127, %al +; AVX1-NEXT: movl %eax, %ebp +; AVX1-NEXT: .LBB1_55: +; AVX1-NEXT: vpextrb $5, %xmm1, %ecx +; AVX1-NEXT: vpextrb $5, %xmm0, %edi +; AVX1-NEXT: movl %edi, %eax +; AVX1-NEXT: addb %cl, %al +; AVX1-NEXT: setns %al +; AVX1-NEXT: addb %cl, %dil +; AVX1-NEXT: jno .LBB1_57 +; AVX1-NEXT: # %bb.56: +; AVX1-NEXT: addb $127, %al +; AVX1-NEXT: movl %eax, %edi +; AVX1-NEXT: .LBB1_57: +; AVX1-NEXT: vpextrb $4, %xmm1, %edx +; AVX1-NEXT: vpextrb $4, %xmm0, %eax +; AVX1-NEXT: movl %eax, %ecx +; AVX1-NEXT: addb %dl, %cl +; AVX1-NEXT: setns %cl +; AVX1-NEXT: addb %dl, %al +; AVX1-NEXT: jno .LBB1_59 +; AVX1-NEXT: # %bb.58: +; AVX1-NEXT: addb $127, %cl +; AVX1-NEXT: movl %ecx, %eax +; AVX1-NEXT: .LBB1_59: +; AVX1-NEXT: vpextrb $3, %xmm1, %ebx +; AVX1-NEXT: vpextrb $3, %xmm0, %ecx +; AVX1-NEXT: movl %ecx, %edx +; AVX1-NEXT: addb %bl, %dl +; AVX1-NEXT: setns %dl +; AVX1-NEXT: addb %bl, %cl +; AVX1-NEXT: jno .LBB1_61 +; AVX1-NEXT: # %bb.60: +; AVX1-NEXT: addb $127, %dl +; AVX1-NEXT: movl %edx, %ecx +; AVX1-NEXT: .LBB1_61: +; AVX1-NEXT: vpextrb $2, %xmm1, %esi +; AVX1-NEXT: vpextrb $2, %xmm0, %edx +; AVX1-NEXT: movl %edx, %ebx +; AVX1-NEXT: addb %sil, %bl +; AVX1-NEXT: setns %bl +; AVX1-NEXT: addb %sil, %dl +; AVX1-NEXT: jno .LBB1_63 +; AVX1-NEXT: # %bb.62: +; AVX1-NEXT: addb $127, %bl +; AVX1-NEXT: movl %ebx, %edx +; AVX1-NEXT: .LBB1_63: +; AVX1-NEXT: vpextrb $0, %xmm1, %esi +; AVX1-NEXT: vpextrb $0, %xmm0, %r8d +; AVX1-NEXT: movl %r8d, %ebx +; AVX1-NEXT: addb %sil, %bl +; AVX1-NEXT: setns %bl +; AVX1-NEXT: addb %sil, %r8b +; AVX1-NEXT: jo .LBB1_64 +; AVX1-NEXT: # %bb.65: +; AVX1-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %r14d # 4-byte Reload +; AVX1-NEXT: jmp .LBB1_66 +; AVX1-NEXT: .LBB1_64: +; AVX1-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %r14d # 4-byte Reload +; AVX1-NEXT: addb $127, %bl +; AVX1-NEXT: movl %ebx, %r8d +; AVX1-NEXT: .LBB1_66: +; AVX1-NEXT: vpextrb $1, %xmm1, %esi +; AVX1-NEXT: vpextrb $1, %xmm0, %r9d +; AVX1-NEXT: movl %r9d, %ebx +; AVX1-NEXT: addb %sil, %bl +; AVX1-NEXT: setns %bl +; AVX1-NEXT: addb %sil, %r9b +; AVX1-NEXT: jno .LBB1_68 +; AVX1-NEXT: # %bb.67: +; AVX1-NEXT: addb $127, %bl +; AVX1-NEXT: movl %ebx, %r9d +; AVX1-NEXT: .LBB1_68: +; AVX1-NEXT: movzbl %r8b, %esi +; AVX1-NEXT: vmovd %esi, %xmm0 +; AVX1-NEXT: movzbl %r9b, %esi +; AVX1-NEXT: vpinsrb $1, %esi, %xmm0, %xmm0 +; AVX1-NEXT: movzbl %dl, %edx +; AVX1-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; AVX1-NEXT: movzbl %cl, %ecx +; AVX1-NEXT: vpinsrb $3, %ecx, %xmm0, %xmm0 +; AVX1-NEXT: movzbl %al, %eax +; AVX1-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 +; AVX1-NEXT: movzbl %dil, %eax +; AVX1-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; AVX1-NEXT: movzbl %bpl, %eax +; AVX1-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 +; AVX1-NEXT: movzbl %r10b, %eax +; AVX1-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; AVX1-NEXT: movzbl %r11b, %eax +; AVX1-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0 +; AVX1-NEXT: movzbl 
%r13b, %eax +; AVX1-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0 +; AVX1-NEXT: movzbl %r12b, %eax +; AVX1-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0 +; AVX1-NEXT: movzbl %r15b, %eax +; AVX1-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; AVX1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX1-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0 +; AVX1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX1-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0 +; AVX1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX1-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0 +; AVX1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX1-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; AVX1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX1-NEXT: vmovd %eax, %xmm1 +; AVX1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX1-NEXT: vpinsrb $1, %eax, %xmm1, %xmm1 +; AVX1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX1-NEXT: vpinsrb $2, %eax, %xmm1, %xmm1 +; AVX1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX1-NEXT: vpinsrb $3, %eax, %xmm1, %xmm1 +; AVX1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX1-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 +; AVX1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX1-NEXT: vpinsrb $5, %eax, %xmm1, %xmm1 +; AVX1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX1-NEXT: vpinsrb $6, %eax, %xmm1, %xmm1 +; AVX1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX1-NEXT: vpinsrb $7, %eax, %xmm1, %xmm1 +; AVX1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX1-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; AVX1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX1-NEXT: vpinsrb $9, %eax, %xmm1, %xmm1 +; AVX1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX1-NEXT: vpinsrb $10, %eax, %xmm1, %xmm1 +; AVX1-NEXT: movzbl %r14b, %eax +; AVX1-NEXT: vpinsrb $11, %eax, %xmm1, %xmm1 +; AVX1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX1-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 +; AVX1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX1-NEXT: vpinsrb $13, %eax, %xmm1, %xmm1 +; AVX1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX1-NEXT: vpinsrb $14, %eax, %xmm1, %xmm1 +; AVX1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX1-NEXT: vpinsrb $15, %eax, %xmm1, %xmm1 +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: popq %rbx +; AVX1-NEXT: popq %r12 +; AVX1-NEXT: popq %r13 +; AVX1-NEXT: popq %r14 +; AVX1-NEXT: popq %r15 +; AVX1-NEXT: popq %rbp +; AVX1-NEXT: retq +; +; AVX2-LABEL: v32i8: +; AVX2: # %bb.0: +; AVX2-NEXT: pushq %rbp +; AVX2-NEXT: pushq %r15 +; AVX2-NEXT: pushq %r14 +; AVX2-NEXT: pushq %r13 +; AVX2-NEXT: pushq %r12 +; AVX2-NEXT: pushq %rbx +; AVX2-NEXT: vpextrb $15, %xmm1, %ecx +; AVX2-NEXT: vpextrb $15, %xmm0, %edx +; AVX2-NEXT: movl %edx, %eax +; AVX2-NEXT: addb %cl, %al +; AVX2-NEXT: setns %al +; AVX2-NEXT: addb %cl, %dl +; AVX2-NEXT: jo .LBB1_1 +; AVX2-NEXT: # %bb.2: +; AVX2-NEXT: movl %edx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX2-NEXT: jmp .LBB1_3 +; AVX2-NEXT: .LBB1_1: +; AVX2-NEXT: addb $127, %al +; AVX2-NEXT: # kill: def $al killed $al def $eax +; AVX2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX2-NEXT: .LBB1_3: +; AVX2-NEXT: vpextrb $14, %xmm1, %ecx +; AVX2-NEXT: vpextrb 
$14, %xmm0, %edx +; AVX2-NEXT: movl %edx, %eax +; AVX2-NEXT: addb %cl, %al +; AVX2-NEXT: setns %al +; AVX2-NEXT: addb %cl, %dl +; AVX2-NEXT: jno .LBB1_5 +; AVX2-NEXT: # %bb.4: +; AVX2-NEXT: addb $127, %al +; AVX2-NEXT: movl %eax, %edx +; AVX2-NEXT: .LBB1_5: +; AVX2-NEXT: vpextrb $13, %xmm1, %ecx +; AVX2-NEXT: vpextrb $13, %xmm0, %esi +; AVX2-NEXT: movl %esi, %eax +; AVX2-NEXT: addb %cl, %al +; AVX2-NEXT: setns %al +; AVX2-NEXT: addb %cl, %sil +; AVX2-NEXT: jo .LBB1_6 +; AVX2-NEXT: # %bb.7: +; AVX2-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX2-NEXT: jmp .LBB1_8 +; AVX2-NEXT: .LBB1_6: +; AVX2-NEXT: addb $127, %al +; AVX2-NEXT: # kill: def $al killed $al def $eax +; AVX2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX2-NEXT: .LBB1_8: +; AVX2-NEXT: vpextrb $12, %xmm1, %ecx +; AVX2-NEXT: vpextrb $12, %xmm0, %esi +; AVX2-NEXT: movl %esi, %eax +; AVX2-NEXT: addb %cl, %al +; AVX2-NEXT: setns %al +; AVX2-NEXT: addb %cl, %sil +; AVX2-NEXT: jno .LBB1_10 +; AVX2-NEXT: # %bb.9: +; AVX2-NEXT: addb $127, %al +; AVX2-NEXT: movl %eax, %esi +; AVX2-NEXT: .LBB1_10: +; AVX2-NEXT: vpextrb $11, %xmm1, %ecx +; AVX2-NEXT: vpextrb $11, %xmm0, %edi +; AVX2-NEXT: movl %edi, %eax +; AVX2-NEXT: addb %cl, %al +; AVX2-NEXT: setns %al +; AVX2-NEXT: addb %cl, %dil +; AVX2-NEXT: jno .LBB1_12 +; AVX2-NEXT: # %bb.11: +; AVX2-NEXT: addb $127, %al +; AVX2-NEXT: movl %eax, %edi +; AVX2-NEXT: .LBB1_12: +; AVX2-NEXT: vpextrb $10, %xmm1, %ecx +; AVX2-NEXT: vpextrb $10, %xmm0, %ebp +; AVX2-NEXT: movl %ebp, %eax +; AVX2-NEXT: addb %cl, %al +; AVX2-NEXT: setns %al +; AVX2-NEXT: addb %cl, %bpl +; AVX2-NEXT: jno .LBB1_14 +; AVX2-NEXT: # %bb.13: +; AVX2-NEXT: addb $127, %al +; AVX2-NEXT: movl %eax, %ebp +; AVX2-NEXT: .LBB1_14: +; AVX2-NEXT: vpextrb $9, %xmm1, %ecx +; AVX2-NEXT: vpextrb $9, %xmm0, %ebx +; AVX2-NEXT: movl %ebx, %eax +; AVX2-NEXT: addb %cl, %al +; AVX2-NEXT: setns %al +; AVX2-NEXT: addb %cl, %bl +; AVX2-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX2-NEXT: jno .LBB1_16 +; AVX2-NEXT: # %bb.15: +; AVX2-NEXT: addb $127, %al +; AVX2-NEXT: movl %eax, %ebx +; AVX2-NEXT: .LBB1_16: +; AVX2-NEXT: vpextrb $8, %xmm1, %ecx +; AVX2-NEXT: vpextrb $8, %xmm0, %esi +; AVX2-NEXT: movl %esi, %eax +; AVX2-NEXT: addb %cl, %al +; AVX2-NEXT: setns %al +; AVX2-NEXT: addb %cl, %sil +; AVX2-NEXT: movl %edx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX2-NEXT: movl %ebp, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX2-NEXT: movl %ebx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX2-NEXT: jno .LBB1_18 +; AVX2-NEXT: # %bb.17: +; AVX2-NEXT: addb $127, %al +; AVX2-NEXT: movl %eax, %esi +; AVX2-NEXT: .LBB1_18: +; AVX2-NEXT: vpextrb $7, %xmm1, %ecx +; AVX2-NEXT: vpextrb $7, %xmm0, %edx +; AVX2-NEXT: movl %edx, %eax +; AVX2-NEXT: addb %cl, %al +; AVX2-NEXT: setns %al +; AVX2-NEXT: addb %cl, %dl +; AVX2-NEXT: movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX2-NEXT: jno .LBB1_20 +; AVX2-NEXT: # %bb.19: +; AVX2-NEXT: addb $127, %al +; AVX2-NEXT: movl %eax, %edx +; AVX2-NEXT: .LBB1_20: +; AVX2-NEXT: vpextrb $6, %xmm1, %ecx +; AVX2-NEXT: vpextrb $6, %xmm0, %edi +; AVX2-NEXT: movl %edi, %eax +; AVX2-NEXT: addb %cl, %al +; AVX2-NEXT: setns %al +; AVX2-NEXT: addb %cl, %dil +; AVX2-NEXT: jno .LBB1_22 +; AVX2-NEXT: # %bb.21: +; AVX2-NEXT: addb $127, %al +; AVX2-NEXT: movl %eax, %edi +; AVX2-NEXT: .LBB1_22: +; AVX2-NEXT: vpextrb $5, %xmm1, %ecx +; AVX2-NEXT: vpextrb $5, %xmm0, %ebp +; AVX2-NEXT: movl %ebp, %eax +; AVX2-NEXT: addb %cl, %al +; AVX2-NEXT: setns %al +; AVX2-NEXT: addb %cl, %bpl +; AVX2-NEXT: 
jno .LBB1_24 +; AVX2-NEXT: # %bb.23: +; AVX2-NEXT: addb $127, %al +; AVX2-NEXT: movl %eax, %ebp +; AVX2-NEXT: .LBB1_24: +; AVX2-NEXT: vpextrb $4, %xmm1, %ecx +; AVX2-NEXT: vpextrb $4, %xmm0, %r11d +; AVX2-NEXT: movl %r11d, %eax +; AVX2-NEXT: addb %cl, %al +; AVX2-NEXT: setns %al +; AVX2-NEXT: addb %cl, %r11b +; AVX2-NEXT: jno .LBB1_26 +; AVX2-NEXT: # %bb.25: +; AVX2-NEXT: addb $127, %al +; AVX2-NEXT: movl %eax, %r11d +; AVX2-NEXT: .LBB1_26: +; AVX2-NEXT: vpextrb $3, %xmm1, %ecx +; AVX2-NEXT: vpextrb $3, %xmm0, %r14d +; AVX2-NEXT: movl %r14d, %eax +; AVX2-NEXT: addb %cl, %al +; AVX2-NEXT: setns %al +; AVX2-NEXT: addb %cl, %r14b +; AVX2-NEXT: movl %ebp, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX2-NEXT: jno .LBB1_28 +; AVX2-NEXT: # %bb.27: +; AVX2-NEXT: addb $127, %al +; AVX2-NEXT: movl %eax, %r14d +; AVX2-NEXT: .LBB1_28: +; AVX2-NEXT: vpextrb $2, %xmm1, %ecx +; AVX2-NEXT: vpextrb $2, %xmm0, %r8d +; AVX2-NEXT: movl %r8d, %eax +; AVX2-NEXT: addb %cl, %al +; AVX2-NEXT: setns %al +; AVX2-NEXT: addb %cl, %r8b +; AVX2-NEXT: jno .LBB1_30 +; AVX2-NEXT: # %bb.29: +; AVX2-NEXT: addb $127, %al +; AVX2-NEXT: movl %eax, %r8d +; AVX2-NEXT: .LBB1_30: +; AVX2-NEXT: vpextrb $0, %xmm1, %ecx +; AVX2-NEXT: vpextrb $0, %xmm0, %r10d +; AVX2-NEXT: movl %r10d, %eax +; AVX2-NEXT: addb %cl, %al +; AVX2-NEXT: setns %al +; AVX2-NEXT: addb %cl, %r10b +; AVX2-NEXT: jno .LBB1_32 +; AVX2-NEXT: # %bb.31: +; AVX2-NEXT: addb $127, %al +; AVX2-NEXT: movl %eax, %r10d +; AVX2-NEXT: .LBB1_32: +; AVX2-NEXT: vpextrb $1, %xmm1, %ecx +; AVX2-NEXT: vpextrb $1, %xmm0, %r9d +; AVX2-NEXT: movl %r9d, %eax +; AVX2-NEXT: addb %cl, %al +; AVX2-NEXT: setns %al +; AVX2-NEXT: addb %cl, %r9b +; AVX2-NEXT: jno .LBB1_34 +; AVX2-NEXT: # %bb.33: +; AVX2-NEXT: addb $127, %al +; AVX2-NEXT: movl %eax, %r9d +; AVX2-NEXT: .LBB1_34: +; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm1 +; AVX2-NEXT: vpextrb $15, %xmm1, %ecx +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX2-NEXT: vpextrb $15, %xmm0, %ebx +; AVX2-NEXT: movl %ebx, %eax +; AVX2-NEXT: addb %cl, %al +; AVX2-NEXT: setns %al +; AVX2-NEXT: movl %esi, %r12d +; AVX2-NEXT: addb %cl, %bl +; AVX2-NEXT: jno .LBB1_36 +; AVX2-NEXT: # %bb.35: +; AVX2-NEXT: addb $127, %al +; AVX2-NEXT: movl %eax, %ebx +; AVX2-NEXT: .LBB1_36: +; AVX2-NEXT: vpextrb $14, %xmm1, %ecx +; AVX2-NEXT: vpextrb $14, %xmm0, %esi +; AVX2-NEXT: movl %esi, %eax +; AVX2-NEXT: addb %cl, %al +; AVX2-NEXT: setns %al +; AVX2-NEXT: movl %edx, %r13d +; AVX2-NEXT: addb %cl, %sil +; AVX2-NEXT: jno .LBB1_38 +; AVX2-NEXT: # %bb.37: +; AVX2-NEXT: addb $127, %al +; AVX2-NEXT: movl %eax, %esi +; AVX2-NEXT: .LBB1_38: +; AVX2-NEXT: vpextrb $13, %xmm1, %ecx +; AVX2-NEXT: vpextrb $13, %xmm0, %edx +; AVX2-NEXT: movl %edx, %eax +; AVX2-NEXT: addb %cl, %al +; AVX2-NEXT: setns %al +; AVX2-NEXT: movl %edi, %ebp +; AVX2-NEXT: addb %cl, %dl +; AVX2-NEXT: jno .LBB1_40 +; AVX2-NEXT: # %bb.39: +; AVX2-NEXT: addb $127, %al +; AVX2-NEXT: movl %eax, %edx +; AVX2-NEXT: .LBB1_40: +; AVX2-NEXT: vpextrb $12, %xmm1, %ecx +; AVX2-NEXT: vpextrb $12, %xmm0, %edi +; AVX2-NEXT: movl %edi, %eax +; AVX2-NEXT: addb %cl, %al +; AVX2-NEXT: setns %al +; AVX2-NEXT: addb %cl, %dil +; AVX2-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX2-NEXT: movl %edx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX2-NEXT: jo .LBB1_41 +; AVX2-NEXT: # %bb.42: +; AVX2-NEXT: movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX2-NEXT: jmp .LBB1_43 +; AVX2-NEXT: .LBB1_41: +; AVX2-NEXT: addb $127, %al +; AVX2-NEXT: # kill: def $al killed $al def $eax +; AVX2-NEXT: movl %eax, 
{{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX2-NEXT: .LBB1_43: +; AVX2-NEXT: movl %ebp, %edi +; AVX2-NEXT: vpextrb $11, %xmm1, %ecx +; AVX2-NEXT: vpextrb $11, %xmm0, %r15d +; AVX2-NEXT: movl %r15d, %eax +; AVX2-NEXT: addb %cl, %al +; AVX2-NEXT: setns %al +; AVX2-NEXT: addb %cl, %r15b +; AVX2-NEXT: movl %r12d, %esi +; AVX2-NEXT: movl %r13d, %edx +; AVX2-NEXT: jno .LBB1_45 +; AVX2-NEXT: # %bb.44: +; AVX2-NEXT: addb $127, %al +; AVX2-NEXT: movl %eax, %r15d +; AVX2-NEXT: .LBB1_45: +; AVX2-NEXT: vpextrb $10, %xmm1, %ecx +; AVX2-NEXT: vpextrb $10, %xmm0, %r12d +; AVX2-NEXT: movl %r12d, %eax +; AVX2-NEXT: addb %cl, %al +; AVX2-NEXT: setns %al +; AVX2-NEXT: addb %cl, %r12b +; AVX2-NEXT: jno .LBB1_47 +; AVX2-NEXT: # %bb.46: +; AVX2-NEXT: addb $127, %al +; AVX2-NEXT: movl %eax, %r12d +; AVX2-NEXT: .LBB1_47: +; AVX2-NEXT: vpextrb $9, %xmm1, %ecx +; AVX2-NEXT: vpextrb $9, %xmm0, %r13d +; AVX2-NEXT: movl %r13d, %eax +; AVX2-NEXT: addb %cl, %al +; AVX2-NEXT: setns %al +; AVX2-NEXT: addb %cl, %r13b +; AVX2-NEXT: movl %r11d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX2-NEXT: movl %r14d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX2-NEXT: movl %r8d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX2-NEXT: movl %r10d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX2-NEXT: movl %r9d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX2-NEXT: movl %ebx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX2-NEXT: jno .LBB1_49 +; AVX2-NEXT: # %bb.48: +; AVX2-NEXT: addb $127, %al +; AVX2-NEXT: movl %eax, %r13d +; AVX2-NEXT: .LBB1_49: +; AVX2-NEXT: vpextrb $8, %xmm1, %ecx +; AVX2-NEXT: vpextrb $8, %xmm0, %r11d +; AVX2-NEXT: movl %r11d, %eax +; AVX2-NEXT: addb %cl, %al +; AVX2-NEXT: setns %al +; AVX2-NEXT: addb %cl, %r11b +; AVX2-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX2-NEXT: movl %edx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX2-NEXT: movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX2-NEXT: jno .LBB1_51 +; AVX2-NEXT: # %bb.50: +; AVX2-NEXT: addb $127, %al +; AVX2-NEXT: movl %eax, %r11d +; AVX2-NEXT: .LBB1_51: +; AVX2-NEXT: vpextrb $7, %xmm1, %ecx +; AVX2-NEXT: vpextrb $7, %xmm0, %r10d +; AVX2-NEXT: movl %r10d, %eax +; AVX2-NEXT: addb %cl, %al +; AVX2-NEXT: setns %al +; AVX2-NEXT: addb %cl, %r10b +; AVX2-NEXT: jno .LBB1_53 +; AVX2-NEXT: # %bb.52: +; AVX2-NEXT: addb $127, %al +; AVX2-NEXT: movl %eax, %r10d +; AVX2-NEXT: .LBB1_53: +; AVX2-NEXT: vpextrb $6, %xmm1, %ecx +; AVX2-NEXT: vpextrb $6, %xmm0, %ebp +; AVX2-NEXT: movl %ebp, %eax +; AVX2-NEXT: addb %cl, %al +; AVX2-NEXT: setns %al +; AVX2-NEXT: addb %cl, %bpl +; AVX2-NEXT: jno .LBB1_55 +; AVX2-NEXT: # %bb.54: +; AVX2-NEXT: addb $127, %al +; AVX2-NEXT: movl %eax, %ebp +; AVX2-NEXT: .LBB1_55: +; AVX2-NEXT: vpextrb $5, %xmm1, %ecx +; AVX2-NEXT: vpextrb $5, %xmm0, %edi +; AVX2-NEXT: movl %edi, %eax +; AVX2-NEXT: addb %cl, %al +; AVX2-NEXT: setns %al +; AVX2-NEXT: addb %cl, %dil +; AVX2-NEXT: jno .LBB1_57 +; AVX2-NEXT: # %bb.56: +; AVX2-NEXT: addb $127, %al +; AVX2-NEXT: movl %eax, %edi +; AVX2-NEXT: .LBB1_57: +; AVX2-NEXT: vpextrb $4, %xmm1, %edx +; AVX2-NEXT: vpextrb $4, %xmm0, %eax +; AVX2-NEXT: movl %eax, %ecx +; AVX2-NEXT: addb %dl, %cl +; AVX2-NEXT: setns %cl +; AVX2-NEXT: addb %dl, %al +; AVX2-NEXT: jno .LBB1_59 +; AVX2-NEXT: # %bb.58: +; AVX2-NEXT: addb $127, %cl +; AVX2-NEXT: movl %ecx, %eax +; AVX2-NEXT: .LBB1_59: +; AVX2-NEXT: vpextrb $3, %xmm1, %ebx +; AVX2-NEXT: vpextrb $3, %xmm0, %ecx +; AVX2-NEXT: movl %ecx, %edx +; AVX2-NEXT: addb %bl, %dl +; AVX2-NEXT: setns %dl +; AVX2-NEXT: addb %bl, %cl +; AVX2-NEXT: jno .LBB1_61 +; 
AVX2-NEXT: # %bb.60: +; AVX2-NEXT: addb $127, %dl +; AVX2-NEXT: movl %edx, %ecx +; AVX2-NEXT: .LBB1_61: +; AVX2-NEXT: vpextrb $2, %xmm1, %esi +; AVX2-NEXT: vpextrb $2, %xmm0, %edx +; AVX2-NEXT: movl %edx, %ebx +; AVX2-NEXT: addb %sil, %bl +; AVX2-NEXT: setns %bl +; AVX2-NEXT: addb %sil, %dl +; AVX2-NEXT: jno .LBB1_63 +; AVX2-NEXT: # %bb.62: +; AVX2-NEXT: addb $127, %bl +; AVX2-NEXT: movl %ebx, %edx +; AVX2-NEXT: .LBB1_63: +; AVX2-NEXT: vpextrb $0, %xmm1, %esi +; AVX2-NEXT: vpextrb $0, %xmm0, %r8d +; AVX2-NEXT: movl %r8d, %ebx +; AVX2-NEXT: addb %sil, %bl +; AVX2-NEXT: setns %bl +; AVX2-NEXT: addb %sil, %r8b +; AVX2-NEXT: jo .LBB1_64 +; AVX2-NEXT: # %bb.65: +; AVX2-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %r14d # 4-byte Reload +; AVX2-NEXT: jmp .LBB1_66 +; AVX2-NEXT: .LBB1_64: +; AVX2-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %r14d # 4-byte Reload +; AVX2-NEXT: addb $127, %bl +; AVX2-NEXT: movl %ebx, %r8d +; AVX2-NEXT: .LBB1_66: +; AVX2-NEXT: vpextrb $1, %xmm1, %esi +; AVX2-NEXT: vpextrb $1, %xmm0, %r9d +; AVX2-NEXT: movl %r9d, %ebx +; AVX2-NEXT: addb %sil, %bl +; AVX2-NEXT: setns %bl +; AVX2-NEXT: addb %sil, %r9b +; AVX2-NEXT: jno .LBB1_68 +; AVX2-NEXT: # %bb.67: +; AVX2-NEXT: addb $127, %bl +; AVX2-NEXT: movl %ebx, %r9d +; AVX2-NEXT: .LBB1_68: +; AVX2-NEXT: movzbl %r8b, %esi +; AVX2-NEXT: vmovd %esi, %xmm0 +; AVX2-NEXT: movzbl %r9b, %esi +; AVX2-NEXT: vpinsrb $1, %esi, %xmm0, %xmm0 +; AVX2-NEXT: movzbl %dl, %edx +; AVX2-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; AVX2-NEXT: movzbl %cl, %ecx +; AVX2-NEXT: vpinsrb $3, %ecx, %xmm0, %xmm0 +; AVX2-NEXT: movzbl %al, %eax +; AVX2-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 +; AVX2-NEXT: movzbl %dil, %eax +; AVX2-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; AVX2-NEXT: movzbl %bpl, %eax +; AVX2-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 +; AVX2-NEXT: movzbl %r10b, %eax +; AVX2-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; AVX2-NEXT: movzbl %r11b, %eax +; AVX2-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0 +; AVX2-NEXT: movzbl %r13b, %eax +; AVX2-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0 +; AVX2-NEXT: movzbl %r12b, %eax +; AVX2-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0 +; AVX2-NEXT: movzbl %r15b, %eax +; AVX2-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; AVX2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX2-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0 +; AVX2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX2-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0 +; AVX2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX2-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0 +; AVX2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX2-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; AVX2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX2-NEXT: vmovd %eax, %xmm1 +; AVX2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX2-NEXT: vpinsrb $1, %eax, %xmm1, %xmm1 +; AVX2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX2-NEXT: vpinsrb $2, %eax, %xmm1, %xmm1 +; AVX2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX2-NEXT: vpinsrb $3, %eax, %xmm1, %xmm1 +; AVX2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX2-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 +; AVX2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX2-NEXT: vpinsrb $5, %eax, %xmm1, %xmm1 +; AVX2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX2-NEXT: vpinsrb $6, %eax, %xmm1, %xmm1 +; AVX2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded 
Reload +; AVX2-NEXT: vpinsrb $7, %eax, %xmm1, %xmm1 +; AVX2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX2-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; AVX2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX2-NEXT: vpinsrb $9, %eax, %xmm1, %xmm1 +; AVX2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX2-NEXT: vpinsrb $10, %eax, %xmm1, %xmm1 +; AVX2-NEXT: movzbl %r14b, %eax +; AVX2-NEXT: vpinsrb $11, %eax, %xmm1, %xmm1 +; AVX2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX2-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 +; AVX2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX2-NEXT: vpinsrb $13, %eax, %xmm1, %xmm1 +; AVX2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX2-NEXT: vpinsrb $14, %eax, %xmm1, %xmm1 +; AVX2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX2-NEXT: vpinsrb $15, %eax, %xmm1, %xmm1 +; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 +; AVX2-NEXT: popq %rbx +; AVX2-NEXT: popq %r12 +; AVX2-NEXT: popq %r13 +; AVX2-NEXT: popq %r14 +; AVX2-NEXT: popq %r15 +; AVX2-NEXT: popq %rbp +; AVX2-NEXT: retq +; +; AVX512-LABEL: v32i8: +; AVX512: # %bb.0: +; AVX512-NEXT: pushq %rbp +; AVX512-NEXT: pushq %r15 +; AVX512-NEXT: pushq %r14 +; AVX512-NEXT: pushq %r13 +; AVX512-NEXT: pushq %r12 +; AVX512-NEXT: pushq %rbx +; AVX512-NEXT: vpextrb $15, %xmm1, %ecx +; AVX512-NEXT: vpextrb $15, %xmm0, %edx +; AVX512-NEXT: movl %edx, %eax +; AVX512-NEXT: addb %cl, %al +; AVX512-NEXT: setns %al +; AVX512-NEXT: addb %cl, %dl +; AVX512-NEXT: jo .LBB1_1 +; AVX512-NEXT: # %bb.2: +; AVX512-NEXT: movl %edx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX512-NEXT: jmp .LBB1_3 +; AVX512-NEXT: .LBB1_1: +; AVX512-NEXT: addb $127, %al +; AVX512-NEXT: # kill: def $al killed $al def $eax +; AVX512-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX512-NEXT: .LBB1_3: +; AVX512-NEXT: vpextrb $14, %xmm1, %ecx +; AVX512-NEXT: vpextrb $14, %xmm0, %edx +; AVX512-NEXT: movl %edx, %eax +; AVX512-NEXT: addb %cl, %al +; AVX512-NEXT: setns %al +; AVX512-NEXT: addb %cl, %dl +; AVX512-NEXT: jno .LBB1_5 +; AVX512-NEXT: # %bb.4: +; AVX512-NEXT: addb $127, %al +; AVX512-NEXT: movl %eax, %edx +; AVX512-NEXT: .LBB1_5: +; AVX512-NEXT: vpextrb $13, %xmm1, %ecx +; AVX512-NEXT: vpextrb $13, %xmm0, %esi +; AVX512-NEXT: movl %esi, %eax +; AVX512-NEXT: addb %cl, %al +; AVX512-NEXT: setns %al +; AVX512-NEXT: addb %cl, %sil +; AVX512-NEXT: jo .LBB1_6 +; AVX512-NEXT: # %bb.7: +; AVX512-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX512-NEXT: jmp .LBB1_8 +; AVX512-NEXT: .LBB1_6: +; AVX512-NEXT: addb $127, %al +; AVX512-NEXT: # kill: def $al killed $al def $eax +; AVX512-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX512-NEXT: .LBB1_8: +; AVX512-NEXT: vpextrb $12, %xmm1, %ecx +; AVX512-NEXT: vpextrb $12, %xmm0, %esi +; AVX512-NEXT: movl %esi, %eax +; AVX512-NEXT: addb %cl, %al +; AVX512-NEXT: setns %al +; AVX512-NEXT: addb %cl, %sil +; AVX512-NEXT: jno .LBB1_10 +; AVX512-NEXT: # %bb.9: +; AVX512-NEXT: addb $127, %al +; AVX512-NEXT: movl %eax, %esi +; AVX512-NEXT: .LBB1_10: +; AVX512-NEXT: vpextrb $11, %xmm1, %ecx +; AVX512-NEXT: vpextrb $11, %xmm0, %edi +; AVX512-NEXT: movl %edi, %eax +; AVX512-NEXT: addb %cl, %al +; AVX512-NEXT: setns %al +; AVX512-NEXT: addb %cl, %dil +; AVX512-NEXT: jno .LBB1_12 +; AVX512-NEXT: # %bb.11: +; AVX512-NEXT: addb $127, %al +; AVX512-NEXT: movl %eax, %edi +; AVX512-NEXT: .LBB1_12: +; AVX512-NEXT: vpextrb $10, 
%xmm1, %ecx +; AVX512-NEXT: vpextrb $10, %xmm0, %ebp +; AVX512-NEXT: movl %ebp, %eax +; AVX512-NEXT: addb %cl, %al +; AVX512-NEXT: setns %al +; AVX512-NEXT: addb %cl, %bpl +; AVX512-NEXT: jno .LBB1_14 +; AVX512-NEXT: # %bb.13: +; AVX512-NEXT: addb $127, %al +; AVX512-NEXT: movl %eax, %ebp +; AVX512-NEXT: .LBB1_14: +; AVX512-NEXT: vpextrb $9, %xmm1, %ecx +; AVX512-NEXT: vpextrb $9, %xmm0, %ebx +; AVX512-NEXT: movl %ebx, %eax +; AVX512-NEXT: addb %cl, %al +; AVX512-NEXT: setns %al +; AVX512-NEXT: addb %cl, %bl +; AVX512-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX512-NEXT: jno .LBB1_16 +; AVX512-NEXT: # %bb.15: +; AVX512-NEXT: addb $127, %al +; AVX512-NEXT: movl %eax, %ebx +; AVX512-NEXT: .LBB1_16: +; AVX512-NEXT: vpextrb $8, %xmm1, %ecx +; AVX512-NEXT: vpextrb $8, %xmm0, %esi +; AVX512-NEXT: movl %esi, %eax +; AVX512-NEXT: addb %cl, %al +; AVX512-NEXT: setns %al +; AVX512-NEXT: addb %cl, %sil +; AVX512-NEXT: movl %edx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX512-NEXT: movl %ebp, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX512-NEXT: movl %ebx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX512-NEXT: jno .LBB1_18 +; AVX512-NEXT: # %bb.17: +; AVX512-NEXT: addb $127, %al +; AVX512-NEXT: movl %eax, %esi +; AVX512-NEXT: .LBB1_18: +; AVX512-NEXT: vpextrb $7, %xmm1, %ecx +; AVX512-NEXT: vpextrb $7, %xmm0, %edx +; AVX512-NEXT: movl %edx, %eax +; AVX512-NEXT: addb %cl, %al +; AVX512-NEXT: setns %al +; AVX512-NEXT: addb %cl, %dl +; AVX512-NEXT: movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX512-NEXT: jno .LBB1_20 +; AVX512-NEXT: # %bb.19: +; AVX512-NEXT: addb $127, %al +; AVX512-NEXT: movl %eax, %edx +; AVX512-NEXT: .LBB1_20: +; AVX512-NEXT: vpextrb $6, %xmm1, %ecx +; AVX512-NEXT: vpextrb $6, %xmm0, %edi +; AVX512-NEXT: movl %edi, %eax +; AVX512-NEXT: addb %cl, %al +; AVX512-NEXT: setns %al +; AVX512-NEXT: addb %cl, %dil +; AVX512-NEXT: jno .LBB1_22 +; AVX512-NEXT: # %bb.21: +; AVX512-NEXT: addb $127, %al +; AVX512-NEXT: movl %eax, %edi +; AVX512-NEXT: .LBB1_22: +; AVX512-NEXT: vpextrb $5, %xmm1, %ecx +; AVX512-NEXT: vpextrb $5, %xmm0, %ebp +; AVX512-NEXT: movl %ebp, %eax +; AVX512-NEXT: addb %cl, %al +; AVX512-NEXT: setns %al +; AVX512-NEXT: addb %cl, %bpl +; AVX512-NEXT: jno .LBB1_24 +; AVX512-NEXT: # %bb.23: +; AVX512-NEXT: addb $127, %al +; AVX512-NEXT: movl %eax, %ebp +; AVX512-NEXT: .LBB1_24: +; AVX512-NEXT: vpextrb $4, %xmm1, %ecx +; AVX512-NEXT: vpextrb $4, %xmm0, %r11d +; AVX512-NEXT: movl %r11d, %eax +; AVX512-NEXT: addb %cl, %al +; AVX512-NEXT: setns %al +; AVX512-NEXT: addb %cl, %r11b +; AVX512-NEXT: jno .LBB1_26 +; AVX512-NEXT: # %bb.25: +; AVX512-NEXT: addb $127, %al +; AVX512-NEXT: movl %eax, %r11d +; AVX512-NEXT: .LBB1_26: +; AVX512-NEXT: vpextrb $3, %xmm1, %ecx +; AVX512-NEXT: vpextrb $3, %xmm0, %r14d +; AVX512-NEXT: movl %r14d, %eax +; AVX512-NEXT: addb %cl, %al +; AVX512-NEXT: setns %al +; AVX512-NEXT: addb %cl, %r14b +; AVX512-NEXT: movl %ebp, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX512-NEXT: jno .LBB1_28 +; AVX512-NEXT: # %bb.27: +; AVX512-NEXT: addb $127, %al +; AVX512-NEXT: movl %eax, %r14d +; AVX512-NEXT: .LBB1_28: +; AVX512-NEXT: vpextrb $2, %xmm1, %ecx +; AVX512-NEXT: vpextrb $2, %xmm0, %r8d +; AVX512-NEXT: movl %r8d, %eax +; AVX512-NEXT: addb %cl, %al +; AVX512-NEXT: setns %al +; AVX512-NEXT: addb %cl, %r8b +; AVX512-NEXT: jno .LBB1_30 +; AVX512-NEXT: # %bb.29: +; AVX512-NEXT: addb $127, %al +; AVX512-NEXT: movl %eax, %r8d +; AVX512-NEXT: .LBB1_30: +; AVX512-NEXT: vpextrb $0, %xmm1, %ecx +; AVX512-NEXT: vpextrb $0, %xmm0, %r10d +; 
AVX512-NEXT: movl %r10d, %eax +; AVX512-NEXT: addb %cl, %al +; AVX512-NEXT: setns %al +; AVX512-NEXT: addb %cl, %r10b +; AVX512-NEXT: jno .LBB1_32 +; AVX512-NEXT: # %bb.31: +; AVX512-NEXT: addb $127, %al +; AVX512-NEXT: movl %eax, %r10d +; AVX512-NEXT: .LBB1_32: +; AVX512-NEXT: vpextrb $1, %xmm1, %ecx +; AVX512-NEXT: vpextrb $1, %xmm0, %r9d +; AVX512-NEXT: movl %r9d, %eax +; AVX512-NEXT: addb %cl, %al +; AVX512-NEXT: setns %al +; AVX512-NEXT: addb %cl, %r9b +; AVX512-NEXT: jno .LBB1_34 +; AVX512-NEXT: # %bb.33: +; AVX512-NEXT: addb $127, %al +; AVX512-NEXT: movl %eax, %r9d +; AVX512-NEXT: .LBB1_34: +; AVX512-NEXT: vextracti128 $1, %ymm1, %xmm1 +; AVX512-NEXT: vpextrb $15, %xmm1, %ecx +; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX512-NEXT: vpextrb $15, %xmm0, %ebx +; AVX512-NEXT: movl %ebx, %eax +; AVX512-NEXT: addb %cl, %al +; AVX512-NEXT: setns %al +; AVX512-NEXT: movl %esi, %r12d +; AVX512-NEXT: addb %cl, %bl +; AVX512-NEXT: jno .LBB1_36 +; AVX512-NEXT: # %bb.35: +; AVX512-NEXT: addb $127, %al +; AVX512-NEXT: movl %eax, %ebx +; AVX512-NEXT: .LBB1_36: +; AVX512-NEXT: vpextrb $14, %xmm1, %ecx +; AVX512-NEXT: vpextrb $14, %xmm0, %esi +; AVX512-NEXT: movl %esi, %eax +; AVX512-NEXT: addb %cl, %al +; AVX512-NEXT: setns %al +; AVX512-NEXT: movl %edx, %r13d +; AVX512-NEXT: addb %cl, %sil +; AVX512-NEXT: jno .LBB1_38 +; AVX512-NEXT: # %bb.37: +; AVX512-NEXT: addb $127, %al +; AVX512-NEXT: movl %eax, %esi +; AVX512-NEXT: .LBB1_38: +; AVX512-NEXT: vpextrb $13, %xmm1, %ecx +; AVX512-NEXT: vpextrb $13, %xmm0, %edx +; AVX512-NEXT: movl %edx, %eax +; AVX512-NEXT: addb %cl, %al +; AVX512-NEXT: setns %al +; AVX512-NEXT: movl %edi, %ebp +; AVX512-NEXT: addb %cl, %dl +; AVX512-NEXT: jno .LBB1_40 +; AVX512-NEXT: # %bb.39: +; AVX512-NEXT: addb $127, %al +; AVX512-NEXT: movl %eax, %edx +; AVX512-NEXT: .LBB1_40: +; AVX512-NEXT: vpextrb $12, %xmm1, %ecx +; AVX512-NEXT: vpextrb $12, %xmm0, %edi +; AVX512-NEXT: movl %edi, %eax +; AVX512-NEXT: addb %cl, %al +; AVX512-NEXT: setns %al +; AVX512-NEXT: addb %cl, %dil +; AVX512-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX512-NEXT: movl %edx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX512-NEXT: jo .LBB1_41 +; AVX512-NEXT: # %bb.42: +; AVX512-NEXT: movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX512-NEXT: jmp .LBB1_43 +; AVX512-NEXT: .LBB1_41: +; AVX512-NEXT: addb $127, %al +; AVX512-NEXT: # kill: def $al killed $al def $eax +; AVX512-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX512-NEXT: .LBB1_43: +; AVX512-NEXT: movl %ebp, %edi +; AVX512-NEXT: vpextrb $11, %xmm1, %ecx +; AVX512-NEXT: vpextrb $11, %xmm0, %r15d +; AVX512-NEXT: movl %r15d, %eax +; AVX512-NEXT: addb %cl, %al +; AVX512-NEXT: setns %al +; AVX512-NEXT: addb %cl, %r15b +; AVX512-NEXT: movl %r12d, %esi +; AVX512-NEXT: movl %r13d, %edx +; AVX512-NEXT: jno .LBB1_45 +; AVX512-NEXT: # %bb.44: +; AVX512-NEXT: addb $127, %al +; AVX512-NEXT: movl %eax, %r15d +; AVX512-NEXT: .LBB1_45: +; AVX512-NEXT: vpextrb $10, %xmm1, %ecx +; AVX512-NEXT: vpextrb $10, %xmm0, %r12d +; AVX512-NEXT: movl %r12d, %eax +; AVX512-NEXT: addb %cl, %al +; AVX512-NEXT: setns %al +; AVX512-NEXT: addb %cl, %r12b +; AVX512-NEXT: jno .LBB1_47 +; AVX512-NEXT: # %bb.46: +; AVX512-NEXT: addb $127, %al +; AVX512-NEXT: movl %eax, %r12d +; AVX512-NEXT: .LBB1_47: +; AVX512-NEXT: vpextrb $9, %xmm1, %ecx +; AVX512-NEXT: vpextrb $9, %xmm0, %r13d +; AVX512-NEXT: movl %r13d, %eax +; AVX512-NEXT: addb %cl, %al +; AVX512-NEXT: setns %al +; AVX512-NEXT: addb %cl, %r13b +; AVX512-NEXT: movl %r11d, 
{{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX512-NEXT: movl %r14d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX512-NEXT: movl %r8d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX512-NEXT: movl %r10d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX512-NEXT: movl %r9d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX512-NEXT: movl %ebx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX512-NEXT: jno .LBB1_49 +; AVX512-NEXT: # %bb.48: +; AVX512-NEXT: addb $127, %al +; AVX512-NEXT: movl %eax, %r13d +; AVX512-NEXT: .LBB1_49: +; AVX512-NEXT: vpextrb $8, %xmm1, %ecx +; AVX512-NEXT: vpextrb $8, %xmm0, %r11d +; AVX512-NEXT: movl %r11d, %eax +; AVX512-NEXT: addb %cl, %al +; AVX512-NEXT: setns %al +; AVX512-NEXT: addb %cl, %r11b +; AVX512-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX512-NEXT: movl %edx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX512-NEXT: movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX512-NEXT: jno .LBB1_51 +; AVX512-NEXT: # %bb.50: +; AVX512-NEXT: addb $127, %al +; AVX512-NEXT: movl %eax, %r11d +; AVX512-NEXT: .LBB1_51: +; AVX512-NEXT: vpextrb $7, %xmm1, %ecx +; AVX512-NEXT: vpextrb $7, %xmm0, %r10d +; AVX512-NEXT: movl %r10d, %eax +; AVX512-NEXT: addb %cl, %al +; AVX512-NEXT: setns %al +; AVX512-NEXT: addb %cl, %r10b +; AVX512-NEXT: jno .LBB1_53 +; AVX512-NEXT: # %bb.52: +; AVX512-NEXT: addb $127, %al +; AVX512-NEXT: movl %eax, %r10d +; AVX512-NEXT: .LBB1_53: +; AVX512-NEXT: vpextrb $6, %xmm1, %ecx +; AVX512-NEXT: vpextrb $6, %xmm0, %ebp +; AVX512-NEXT: movl %ebp, %eax +; AVX512-NEXT: addb %cl, %al +; AVX512-NEXT: setns %al +; AVX512-NEXT: addb %cl, %bpl +; AVX512-NEXT: jno .LBB1_55 +; AVX512-NEXT: # %bb.54: +; AVX512-NEXT: addb $127, %al +; AVX512-NEXT: movl %eax, %ebp +; AVX512-NEXT: .LBB1_55: +; AVX512-NEXT: vpextrb $5, %xmm1, %ecx +; AVX512-NEXT: vpextrb $5, %xmm0, %edi +; AVX512-NEXT: movl %edi, %eax +; AVX512-NEXT: addb %cl, %al +; AVX512-NEXT: setns %al +; AVX512-NEXT: addb %cl, %dil +; AVX512-NEXT: jno .LBB1_57 +; AVX512-NEXT: # %bb.56: +; AVX512-NEXT: addb $127, %al +; AVX512-NEXT: movl %eax, %edi +; AVX512-NEXT: .LBB1_57: +; AVX512-NEXT: vpextrb $4, %xmm1, %edx +; AVX512-NEXT: vpextrb $4, %xmm0, %eax +; AVX512-NEXT: movl %eax, %ecx +; AVX512-NEXT: addb %dl, %cl +; AVX512-NEXT: setns %cl +; AVX512-NEXT: addb %dl, %al +; AVX512-NEXT: jno .LBB1_59 +; AVX512-NEXT: # %bb.58: +; AVX512-NEXT: addb $127, %cl +; AVX512-NEXT: movl %ecx, %eax +; AVX512-NEXT: .LBB1_59: +; AVX512-NEXT: vpextrb $3, %xmm1, %ebx +; AVX512-NEXT: vpextrb $3, %xmm0, %ecx +; AVX512-NEXT: movl %ecx, %edx +; AVX512-NEXT: addb %bl, %dl +; AVX512-NEXT: setns %dl +; AVX512-NEXT: addb %bl, %cl +; AVX512-NEXT: jno .LBB1_61 +; AVX512-NEXT: # %bb.60: +; AVX512-NEXT: addb $127, %dl +; AVX512-NEXT: movl %edx, %ecx +; AVX512-NEXT: .LBB1_61: +; AVX512-NEXT: vpextrb $2, %xmm1, %esi +; AVX512-NEXT: vpextrb $2, %xmm0, %edx +; AVX512-NEXT: movl %edx, %ebx +; AVX512-NEXT: addb %sil, %bl +; AVX512-NEXT: setns %bl +; AVX512-NEXT: addb %sil, %dl +; AVX512-NEXT: jno .LBB1_63 +; AVX512-NEXT: # %bb.62: +; AVX512-NEXT: addb $127, %bl +; AVX512-NEXT: movl %ebx, %edx +; AVX512-NEXT: .LBB1_63: +; AVX512-NEXT: vpextrb $0, %xmm1, %esi +; AVX512-NEXT: vpextrb $0, %xmm0, %r8d +; AVX512-NEXT: movl %r8d, %ebx +; AVX512-NEXT: addb %sil, %bl +; AVX512-NEXT: setns %bl +; AVX512-NEXT: addb %sil, %r8b +; AVX512-NEXT: jo .LBB1_64 +; AVX512-NEXT: # %bb.65: +; AVX512-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %r14d # 4-byte Reload +; AVX512-NEXT: jmp .LBB1_66 +; AVX512-NEXT: .LBB1_64: +; AVX512-NEXT: movl 
{{[-0-9]+}}(%r{{[sb]}}p), %r14d # 4-byte Reload +; AVX512-NEXT: addb $127, %bl +; AVX512-NEXT: movl %ebx, %r8d +; AVX512-NEXT: .LBB1_66: +; AVX512-NEXT: vpextrb $1, %xmm1, %esi +; AVX512-NEXT: vpextrb $1, %xmm0, %r9d +; AVX512-NEXT: movl %r9d, %ebx +; AVX512-NEXT: addb %sil, %bl +; AVX512-NEXT: setns %bl +; AVX512-NEXT: addb %sil, %r9b +; AVX512-NEXT: jno .LBB1_68 +; AVX512-NEXT: # %bb.67: +; AVX512-NEXT: addb $127, %bl +; AVX512-NEXT: movl %ebx, %r9d +; AVX512-NEXT: .LBB1_68: +; AVX512-NEXT: movzbl %r8b, %esi +; AVX512-NEXT: vmovd %esi, %xmm0 +; AVX512-NEXT: movzbl %r9b, %esi +; AVX512-NEXT: vpinsrb $1, %esi, %xmm0, %xmm0 +; AVX512-NEXT: movzbl %dl, %edx +; AVX512-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; AVX512-NEXT: movzbl %cl, %ecx +; AVX512-NEXT: vpinsrb $3, %ecx, %xmm0, %xmm0 +; AVX512-NEXT: movzbl %al, %eax +; AVX512-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 +; AVX512-NEXT: movzbl %dil, %eax +; AVX512-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; AVX512-NEXT: movzbl %bpl, %eax +; AVX512-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 +; AVX512-NEXT: movzbl %r10b, %eax +; AVX512-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; AVX512-NEXT: movzbl %r11b, %eax +; AVX512-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0 +; AVX512-NEXT: movzbl %r13b, %eax +; AVX512-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0 +; AVX512-NEXT: movzbl %r12b, %eax +; AVX512-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0 +; AVX512-NEXT: movzbl %r15b, %eax +; AVX512-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; AVX512-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX512-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0 +; AVX512-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX512-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0 +; AVX512-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX512-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0 +; AVX512-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX512-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; AVX512-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX512-NEXT: vmovd %eax, %xmm1 +; AVX512-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX512-NEXT: vpinsrb $1, %eax, %xmm1, %xmm1 +; AVX512-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX512-NEXT: vpinsrb $2, %eax, %xmm1, %xmm1 +; AVX512-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX512-NEXT: vpinsrb $3, %eax, %xmm1, %xmm1 +; AVX512-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX512-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 +; AVX512-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX512-NEXT: vpinsrb $5, %eax, %xmm1, %xmm1 +; AVX512-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX512-NEXT: vpinsrb $6, %eax, %xmm1, %xmm1 +; AVX512-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX512-NEXT: vpinsrb $7, %eax, %xmm1, %xmm1 +; AVX512-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX512-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; AVX512-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX512-NEXT: vpinsrb $9, %eax, %xmm1, %xmm1 +; AVX512-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX512-NEXT: vpinsrb $10, %eax, %xmm1, %xmm1 +; AVX512-NEXT: movzbl %r14b, %eax +; AVX512-NEXT: vpinsrb $11, %eax, %xmm1, %xmm1 +; AVX512-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX512-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 +; AVX512-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), 
%eax # 1-byte Folded Reload +; AVX512-NEXT: vpinsrb $13, %eax, %xmm1, %xmm1 +; AVX512-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX512-NEXT: vpinsrb $14, %eax, %xmm1, %xmm1 +; AVX512-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX512-NEXT: vpinsrb $15, %eax, %xmm1, %xmm1 +; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 +; AVX512-NEXT: popq %rbx +; AVX512-NEXT: popq %r12 +; AVX512-NEXT: popq %r13 +; AVX512-NEXT: popq %r14 +; AVX512-NEXT: popq %r15 +; AVX512-NEXT: popq %rbp +; AVX512-NEXT: retq + %z = call <32 x i8> @llvm.sadd.sat.v32i8(<32 x i8> %x, <32 x i8> %y) + ret <32 x i8> %z +} + +define <64 x i8> @v64i8(<64 x i8> %x, <64 x i8> %y) nounwind { +; SSE2-LABEL: v64i8: +; SSE2: # %bb.0: +; SSE2-NEXT: pushq %rbp +; SSE2-NEXT: pushq %r15 +; SSE2-NEXT: pushq %r14 +; SSE2-NEXT: pushq %r13 +; SSE2-NEXT: pushq %r12 +; SSE2-NEXT: pushq %rbx +; SSE2-NEXT: subq $232, %rsp +; SSE2-NEXT: movaps %xmm5, (%rsp) +; SSE2-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) +; SSE2-NEXT: movb (%rsp), %cl +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %dl +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %sil +; SSE2-NEXT: movl %edx, %eax +; SSE2-NEXT: addb %cl, %al +; SSE2-NEXT: setns %al +; SSE2-NEXT: addb %cl, %dl +; SSE2-NEXT: jno .LBB2_2 +; SSE2-NEXT: # %bb.1: +; SSE2-NEXT: addb $127, %al +; SSE2-NEXT: movl %eax, %edx +; SSE2-NEXT: .LBB2_2: +; SSE2-NEXT: movb {{[0-9]+}}(%rsp), %cl +; SSE2-NEXT: movl %esi, %eax +; SSE2-NEXT: addb %cl, %al +; SSE2-NEXT: setns %al +; SSE2-NEXT: addb %cl, %sil +; SSE2-NEXT: jno .LBB2_4 +; SSE2-NEXT: # %bb.3: +; SSE2-NEXT: addb $127, %al +; SSE2-NEXT: movl %eax, %esi +; SSE2-NEXT: .LBB2_4: +; SSE2-NEXT: movb {{[0-9]+}}(%rsp), %cl +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %dil +; SSE2-NEXT: movl %edi, %eax +; SSE2-NEXT: addb %cl, %al +; SSE2-NEXT: setns %al +; SSE2-NEXT: addb %cl, %dil +; SSE2-NEXT: jno .LBB2_6 +; SSE2-NEXT: # %bb.5: +; SSE2-NEXT: addb $127, %al +; SSE2-NEXT: movl %eax, %edi +; SSE2-NEXT: .LBB2_6: +; SSE2-NEXT: movb {{[0-9]+}}(%rsp), %cl +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %r11b +; SSE2-NEXT: movl %r11d, %eax +; SSE2-NEXT: addb %cl, %al +; SSE2-NEXT: setns %al +; SSE2-NEXT: addb %cl, %r11b +; SSE2-NEXT: jno .LBB2_8 +; SSE2-NEXT: # %bb.7: +; SSE2-NEXT: addb $127, %al +; SSE2-NEXT: movl %eax, %r11d +; SSE2-NEXT: .LBB2_8: +; SSE2-NEXT: movb {{[0-9]+}}(%rsp), %cl +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %r9b +; SSE2-NEXT: movl %r9d, %eax +; SSE2-NEXT: addb %cl, %al +; SSE2-NEXT: setns %al +; SSE2-NEXT: addb %cl, %r9b +; SSE2-NEXT: jno .LBB2_10 +; SSE2-NEXT: # %bb.9: +; SSE2-NEXT: addb $127, %al +; SSE2-NEXT: movl %eax, %r9d +; SSE2-NEXT: .LBB2_10: +; SSE2-NEXT: movb {{[0-9]+}}(%rsp), %cl +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %r10b +; SSE2-NEXT: movl %r10d, %eax +; SSE2-NEXT: addb %cl, %al +; SSE2-NEXT: setns %al +; SSE2-NEXT: addb %cl, %r10b +; SSE2-NEXT: jno .LBB2_12 +; SSE2-NEXT: # %bb.11: +; SSE2-NEXT: addb $127, %al +; SSE2-NEXT: movl %eax, %r10d +; SSE2-NEXT: .LBB2_12: +; SSE2-NEXT: movb {{[0-9]+}}(%rsp), %cl +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %bl +; SSE2-NEXT: movl %ebx, %eax +; SSE2-NEXT: addb %cl, %al +; SSE2-NEXT: setns %al +; SSE2-NEXT: addb %cl, %bl +; SSE2-NEXT: movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSE2-NEXT: jno .LBB2_14 +; SSE2-NEXT: # %bb.13: +; SSE2-NEXT: addb $127, %al +; SSE2-NEXT: movl %eax, %ebx +; SSE2-NEXT: .LBB2_14: +; SSE2-NEXT: movb {{[0-9]+}}(%rsp), %cl +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %dl +; SSE2-NEXT: movl %edx, %eax +; SSE2-NEXT: addb %cl, %al +; SSE2-NEXT: setns %al +; SSE2-NEXT: addb %cl, 
%dl +; SSE2-NEXT: movb %bl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSE2-NEXT: jo .LBB2_15 +; SSE2-NEXT: # %bb.16: +; SSE2-NEXT: movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSE2-NEXT: jmp .LBB2_17 +; SSE2-NEXT: .LBB2_15: +; SSE2-NEXT: addb $127, %al +; SSE2-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSE2-NEXT: .LBB2_17: +; SSE2-NEXT: movb {{[0-9]+}}(%rsp), %cl +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %dl +; SSE2-NEXT: movl %edx, %eax +; SSE2-NEXT: addb %cl, %al +; SSE2-NEXT: setns %al +; SSE2-NEXT: addb %cl, %dl +; SSE2-NEXT: jno .LBB2_19 +; SSE2-NEXT: # %bb.18: +; SSE2-NEXT: addb $127, %al +; SSE2-NEXT: movl %eax, %edx +; SSE2-NEXT: .LBB2_19: +; SSE2-NEXT: movb {{[0-9]+}}(%rsp), %cl +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %r12b +; SSE2-NEXT: movl %r12d, %eax +; SSE2-NEXT: addb %cl, %al +; SSE2-NEXT: setns %al +; SSE2-NEXT: addb %cl, %r12b +; SSE2-NEXT: jno .LBB2_21 +; SSE2-NEXT: # %bb.20: +; SSE2-NEXT: addb $127, %al +; SSE2-NEXT: movl %eax, %r12d +; SSE2-NEXT: .LBB2_21: +; SSE2-NEXT: movb {{[0-9]+}}(%rsp), %cl +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %bl +; SSE2-NEXT: movl %ebx, %eax +; SSE2-NEXT: addb %cl, %al +; SSE2-NEXT: setns %al +; SSE2-NEXT: addb %cl, %bl +; SSE2-NEXT: movb %r9b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSE2-NEXT: jno .LBB2_23 +; SSE2-NEXT: # %bb.22: +; SSE2-NEXT: addb $127, %al +; SSE2-NEXT: movl %eax, %ebx +; SSE2-NEXT: .LBB2_23: +; SSE2-NEXT: movb %bl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSE2-NEXT: movb {{[0-9]+}}(%rsp), %cl +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %r9b +; SSE2-NEXT: movl %r9d, %eax +; SSE2-NEXT: addb %cl, %al +; SSE2-NEXT: setns %al +; SSE2-NEXT: addb %cl, %r9b +; SSE2-NEXT: jno .LBB2_25 +; SSE2-NEXT: # %bb.24: +; SSE2-NEXT: addb $127, %al +; SSE2-NEXT: movl %eax, %r9d +; SSE2-NEXT: .LBB2_25: +; SSE2-NEXT: movl %edi, %r8d +; SSE2-NEXT: movb {{[0-9]+}}(%rsp), %cl +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %bl +; SSE2-NEXT: movl %ebx, %eax +; SSE2-NEXT: addb %cl, %al +; SSE2-NEXT: setns %al +; SSE2-NEXT: addb %cl, %bl +; SSE2-NEXT: jo .LBB2_26 +; SSE2-NEXT: # %bb.27: +; SSE2-NEXT: movb %bl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSE2-NEXT: movl %esi, %edi +; SSE2-NEXT: jmp .LBB2_28 +; SSE2-NEXT: .LBB2_26: +; SSE2-NEXT: movl %esi, %edi +; SSE2-NEXT: addb $127, %al +; SSE2-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSE2-NEXT: .LBB2_28: +; SSE2-NEXT: movb {{[0-9]+}}(%rsp), %cl +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %sil +; SSE2-NEXT: movl %esi, %eax +; SSE2-NEXT: addb %cl, %al +; SSE2-NEXT: setns %al +; SSE2-NEXT: addb %cl, %sil +; SSE2-NEXT: movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSE2-NEXT: jno .LBB2_30 +; SSE2-NEXT: # %bb.29: +; SSE2-NEXT: addb $127, %al +; SSE2-NEXT: movl %eax, %esi +; SSE2-NEXT: .LBB2_30: +; SSE2-NEXT: movb {{[0-9]+}}(%rsp), %cl +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %dl +; SSE2-NEXT: movl %edx, %eax +; SSE2-NEXT: addb %cl, %al +; SSE2-NEXT: setns %al +; SSE2-NEXT: addb %cl, %dl +; SSE2-NEXT: jno .LBB2_32 +; SSE2-NEXT: # %bb.31: +; SSE2-NEXT: addb $127, %al +; SSE2-NEXT: movl %eax, %edx +; SSE2-NEXT: .LBB2_32: +; SSE2-NEXT: movb {{[0-9]+}}(%rsp), %cl +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %bl +; SSE2-NEXT: movl %ebx, %eax +; SSE2-NEXT: addb %cl, %al +; SSE2-NEXT: setns %al +; SSE2-NEXT: addb %cl, %bl +; SSE2-NEXT: movaps %xmm6, {{[0-9]+}}(%rsp) +; SSE2-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) +; SSE2-NEXT: movb %r10b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSE2-NEXT: movb %r12b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSE2-NEXT: movb %r9b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte 
Spill +; SSE2-NEXT: movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSE2-NEXT: jno .LBB2_34 +; SSE2-NEXT: # %bb.33: +; SSE2-NEXT: addb $127, %al +; SSE2-NEXT: movl %eax, %ebx +; SSE2-NEXT: .LBB2_34: +; SSE2-NEXT: movb {{[0-9]+}}(%rsp), %cl +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %dl +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %r9b +; SSE2-NEXT: movl %edx, %eax +; SSE2-NEXT: addb %cl, %al +; SSE2-NEXT: setns %al +; SSE2-NEXT: addb %cl, %dl +; SSE2-NEXT: jno .LBB2_36 +; SSE2-NEXT: # %bb.35: +; SSE2-NEXT: addb $127, %al +; SSE2-NEXT: movl %eax, %edx +; SSE2-NEXT: .LBB2_36: +; SSE2-NEXT: movb {{[0-9]+}}(%rsp), %cl +; SSE2-NEXT: movl %r9d, %eax +; SSE2-NEXT: addb %cl, %al +; SSE2-NEXT: setns %al +; SSE2-NEXT: addb %cl, %r9b +; SSE2-NEXT: movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSE2-NEXT: movb %sil, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSE2-NEXT: jno .LBB2_38 +; SSE2-NEXT: # %bb.37: +; SSE2-NEXT: addb $127, %al +; SSE2-NEXT: movl %eax, %r9d +; SSE2-NEXT: .LBB2_38: +; SSE2-NEXT: movb {{[0-9]+}}(%rsp), %cl +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %dl +; SSE2-NEXT: movl %edx, %eax +; SSE2-NEXT: addb %cl, %al +; SSE2-NEXT: setns %al +; SSE2-NEXT: addb %cl, %dl +; SSE2-NEXT: movb %dil, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSE2-NEXT: movb %r8b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSE2-NEXT: movb %bl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSE2-NEXT: jno .LBB2_40 +; SSE2-NEXT: # %bb.39: +; SSE2-NEXT: addb $127, %al +; SSE2-NEXT: movl %eax, %edx +; SSE2-NEXT: .LBB2_40: +; SSE2-NEXT: movb {{[0-9]+}}(%rsp), %cl +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %bpl +; SSE2-NEXT: movl %ebp, %eax +; SSE2-NEXT: addb %cl, %al +; SSE2-NEXT: setns %al +; SSE2-NEXT: addb %cl, %bpl +; SSE2-NEXT: movb %r11b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSE2-NEXT: jno .LBB2_42 +; SSE2-NEXT: # %bb.41: +; SSE2-NEXT: addb $127, %al +; SSE2-NEXT: movl %eax, %ebp +; SSE2-NEXT: .LBB2_42: +; SSE2-NEXT: movb {{[0-9]+}}(%rsp), %cl +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %sil +; SSE2-NEXT: movl %esi, %eax +; SSE2-NEXT: addb %cl, %al +; SSE2-NEXT: setns %al +; SSE2-NEXT: addb %cl, %sil +; SSE2-NEXT: movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSE2-NEXT: jno .LBB2_44 +; SSE2-NEXT: # %bb.43: +; SSE2-NEXT: addb $127, %al +; SSE2-NEXT: movl %eax, %esi +; SSE2-NEXT: .LBB2_44: +; SSE2-NEXT: movb {{[0-9]+}}(%rsp), %cl +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %dl +; SSE2-NEXT: movl %edx, %eax +; SSE2-NEXT: addb %cl, %al +; SSE2-NEXT: setns %al +; SSE2-NEXT: addb %cl, %dl +; SSE2-NEXT: jno .LBB2_46 +; SSE2-NEXT: # %bb.45: +; SSE2-NEXT: addb $127, %al +; SSE2-NEXT: movl %eax, %edx +; SSE2-NEXT: .LBB2_46: +; SSE2-NEXT: movb {{[0-9]+}}(%rsp), %cl +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %dil +; SSE2-NEXT: movl %edi, %eax +; SSE2-NEXT: addb %cl, %al +; SSE2-NEXT: setns %al +; SSE2-NEXT: addb %cl, %dil +; SSE2-NEXT: jno .LBB2_48 +; SSE2-NEXT: # %bb.47: +; SSE2-NEXT: addb $127, %al +; SSE2-NEXT: movl %eax, %edi +; SSE2-NEXT: .LBB2_48: +; SSE2-NEXT: movb {{[0-9]+}}(%rsp), %cl +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %bl +; SSE2-NEXT: movl %ebx, %eax +; SSE2-NEXT: addb %cl, %al +; SSE2-NEXT: setns %al +; SSE2-NEXT: addb %cl, %bl +; SSE2-NEXT: movb %bl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSE2-NEXT: movb %sil, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSE2-NEXT: movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSE2-NEXT: jno .LBB2_50 +; SSE2-NEXT: # %bb.49: +; SSE2-NEXT: addb $127, %al +; SSE2-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSE2-NEXT: .LBB2_50: +; SSE2-NEXT: movb {{[0-9]+}}(%rsp), %cl +; 
SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %sil +; SSE2-NEXT: movl %esi, %eax +; SSE2-NEXT: addb %cl, %al +; SSE2-NEXT: setns %al +; SSE2-NEXT: addb %cl, %sil +; SSE2-NEXT: movb %dil, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSE2-NEXT: jno .LBB2_52 +; SSE2-NEXT: # %bb.51: +; SSE2-NEXT: addb $127, %al +; SSE2-NEXT: movl %eax, %esi +; SSE2-NEXT: .LBB2_52: +; SSE2-NEXT: movb {{[0-9]+}}(%rsp), %cl +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %dil +; SSE2-NEXT: movl %edi, %eax +; SSE2-NEXT: addb %cl, %al +; SSE2-NEXT: setns %al +; SSE2-NEXT: addb %cl, %dil +; SSE2-NEXT: jno .LBB2_54 +; SSE2-NEXT: # %bb.53: +; SSE2-NEXT: addb $127, %al +; SSE2-NEXT: movl %eax, %edi +; SSE2-NEXT: .LBB2_54: +; SSE2-NEXT: movb {{[0-9]+}}(%rsp), %cl +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %r14b +; SSE2-NEXT: movl %r14d, %eax +; SSE2-NEXT: addb %cl, %al +; SSE2-NEXT: setns %al +; SSE2-NEXT: addb %cl, %r14b +; SSE2-NEXT: movb %r9b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSE2-NEXT: jno .LBB2_56 +; SSE2-NEXT: # %bb.55: +; SSE2-NEXT: addb $127, %al +; SSE2-NEXT: movl %eax, %r14d +; SSE2-NEXT: .LBB2_56: +; SSE2-NEXT: movb {{[0-9]+}}(%rsp), %cl +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %r9b +; SSE2-NEXT: movl %r9d, %eax +; SSE2-NEXT: addb %cl, %al +; SSE2-NEXT: setns %al +; SSE2-NEXT: addb %cl, %r9b +; SSE2-NEXT: movb %bpl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSE2-NEXT: jno .LBB2_58 +; SSE2-NEXT: # %bb.57: +; SSE2-NEXT: addb $127, %al +; SSE2-NEXT: movl %eax, %r9d +; SSE2-NEXT: .LBB2_58: +; SSE2-NEXT: movb {{[0-9]+}}(%rsp), %cl +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %dl +; SSE2-NEXT: movl %edx, %eax +; SSE2-NEXT: addb %cl, %al +; SSE2-NEXT: setns %al +; SSE2-NEXT: addb %cl, %dl +; SSE2-NEXT: movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSE2-NEXT: jno .LBB2_60 +; SSE2-NEXT: # %bb.59: +; SSE2-NEXT: addb $127, %al +; SSE2-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSE2-NEXT: .LBB2_60: +; SSE2-NEXT: movb {{[0-9]+}}(%rsp), %cl +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %dl +; SSE2-NEXT: movl %edx, %eax +; SSE2-NEXT: addb %cl, %al +; SSE2-NEXT: setns %al +; SSE2-NEXT: addb %cl, %dl +; SSE2-NEXT: jno .LBB2_62 +; SSE2-NEXT: # %bb.61: +; SSE2-NEXT: addb $127, %al +; SSE2-NEXT: movl %eax, %edx +; SSE2-NEXT: .LBB2_62: +; SSE2-NEXT: movb {{[0-9]+}}(%rsp), %cl +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %r15b +; SSE2-NEXT: movl %r15d, %eax +; SSE2-NEXT: addb %cl, %al +; SSE2-NEXT: setns %al +; SSE2-NEXT: addb %cl, %r15b +; SSE2-NEXT: jno .LBB2_64 +; SSE2-NEXT: # %bb.63: +; SSE2-NEXT: addb $127, %al +; SSE2-NEXT: movl %eax, %r15d +; SSE2-NEXT: .LBB2_64: +; SSE2-NEXT: movb {{[0-9]+}}(%rsp), %cl +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %bl +; SSE2-NEXT: movl %ebx, %eax +; SSE2-NEXT: addb %cl, %al +; SSE2-NEXT: setns %al +; SSE2-NEXT: addb %cl, %bl +; SSE2-NEXT: movaps %xmm7, {{[0-9]+}}(%rsp) +; SSE2-NEXT: movaps %xmm3, -{{[0-9]+}}(%rsp) +; SSE2-NEXT: movb %r14b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSE2-NEXT: movb %r9b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSE2-NEXT: movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSE2-NEXT: jo .LBB2_65 +; SSE2-NEXT: # %bb.66: +; SSE2-NEXT: movb %bl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSE2-NEXT: jmp .LBB2_67 +; SSE2-NEXT: .LBB2_65: +; SSE2-NEXT: addb $127, %al +; SSE2-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSE2-NEXT: .LBB2_67: +; SSE2-NEXT: movb {{[0-9]+}}(%rsp), %cl +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %r11b +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %dl +; SSE2-NEXT: movl %r11d, %eax +; SSE2-NEXT: addb %cl, %al +; SSE2-NEXT: setns %al +; SSE2-NEXT: addb %cl, 
%r11b +; SSE2-NEXT: jno .LBB2_69 +; SSE2-NEXT: # %bb.68: +; SSE2-NEXT: addb $127, %al +; SSE2-NEXT: movl %eax, %r11d +; SSE2-NEXT: .LBB2_69: +; SSE2-NEXT: movb {{[0-9]+}}(%rsp), %cl +; SSE2-NEXT: movl %edx, %eax +; SSE2-NEXT: addb %cl, %al +; SSE2-NEXT: setns %al +; SSE2-NEXT: addb %cl, %dl +; SSE2-NEXT: movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSE2-NEXT: movb %r15b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSE2-NEXT: jno .LBB2_71 +; SSE2-NEXT: # %bb.70: +; SSE2-NEXT: addb $127, %al +; SSE2-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSE2-NEXT: .LBB2_71: +; SSE2-NEXT: movb {{[0-9]+}}(%rsp), %cl +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %r12b +; SSE2-NEXT: movl %r12d, %eax +; SSE2-NEXT: addb %cl, %al +; SSE2-NEXT: setns %al +; SSE2-NEXT: addb %cl, %r12b +; SSE2-NEXT: jno .LBB2_73 +; SSE2-NEXT: # %bb.72: +; SSE2-NEXT: addb $127, %al +; SSE2-NEXT: movl %eax, %r12d +; SSE2-NEXT: .LBB2_73: +; SSE2-NEXT: movb {{[0-9]+}}(%rsp), %cl +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %r14b +; SSE2-NEXT: movl %r14d, %eax +; SSE2-NEXT: addb %cl, %al +; SSE2-NEXT: setns %al +; SSE2-NEXT: addb %cl, %r14b +; SSE2-NEXT: jno .LBB2_75 +; SSE2-NEXT: # %bb.74: +; SSE2-NEXT: addb $127, %al +; SSE2-NEXT: movl %eax, %r14d +; SSE2-NEXT: .LBB2_75: +; SSE2-NEXT: movb {{[0-9]+}}(%rsp), %cl +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %r15b +; SSE2-NEXT: movl %r15d, %eax +; SSE2-NEXT: addb %cl, %al +; SSE2-NEXT: setns %al +; SSE2-NEXT: addb %cl, %r15b +; SSE2-NEXT: jno .LBB2_77 +; SSE2-NEXT: # %bb.76: +; SSE2-NEXT: addb $127, %al +; SSE2-NEXT: movl %eax, %r15d +; SSE2-NEXT: .LBB2_77: +; SSE2-NEXT: movb {{[0-9]+}}(%rsp), %cl +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %bpl +; SSE2-NEXT: movl %ebp, %eax +; SSE2-NEXT: addb %cl, %al +; SSE2-NEXT: setns %al +; SSE2-NEXT: addb %cl, %bpl +; SSE2-NEXT: jno .LBB2_79 +; SSE2-NEXT: # %bb.78: +; SSE2-NEXT: addb $127, %al +; SSE2-NEXT: movl %eax, %ebp +; SSE2-NEXT: .LBB2_79: +; SSE2-NEXT: movb {{[0-9]+}}(%rsp), %cl +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %r13b +; SSE2-NEXT: movl %r13d, %eax +; SSE2-NEXT: addb %cl, %al +; SSE2-NEXT: setns %al +; SSE2-NEXT: addb %cl, %r13b +; SSE2-NEXT: jno .LBB2_81 +; SSE2-NEXT: # %bb.80: +; SSE2-NEXT: addb $127, %al +; SSE2-NEXT: movl %eax, %r13d +; SSE2-NEXT: .LBB2_81: +; SSE2-NEXT: movb {{[0-9]+}}(%rsp), %cl +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %dl +; SSE2-NEXT: movl %edx, %eax +; SSE2-NEXT: addb %cl, %al +; SSE2-NEXT: setns %al +; SSE2-NEXT: addb %cl, %dl +; SSE2-NEXT: movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSE2-NEXT: jno .LBB2_83 +; SSE2-NEXT: # %bb.82: +; SSE2-NEXT: addb $127, %al +; SSE2-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSE2-NEXT: .LBB2_83: +; SSE2-NEXT: movb {{[0-9]+}}(%rsp), %cl +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %dl +; SSE2-NEXT: movl %edx, %eax +; SSE2-NEXT: addb %cl, %al +; SSE2-NEXT: setns %al +; SSE2-NEXT: addb %cl, %dl +; SSE2-NEXT: jo .LBB2_84 +; SSE2-NEXT: # %bb.85: +; SSE2-NEXT: movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSE2-NEXT: jmp .LBB2_86 +; SSE2-NEXT: .LBB2_84: +; SSE2-NEXT: addb $127, %al +; SSE2-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSE2-NEXT: .LBB2_86: +; SSE2-NEXT: movb {{[0-9]+}}(%rsp), %cl +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %dl +; SSE2-NEXT: movl %edx, %eax +; SSE2-NEXT: addb %cl, %al +; SSE2-NEXT: setns %al +; SSE2-NEXT: addb %cl, %dl +; SSE2-NEXT: movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSE2-NEXT: jno .LBB2_88 +; SSE2-NEXT: # %bb.87: +; SSE2-NEXT: addb $127, %al +; SSE2-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; 
SSE2-NEXT: .LBB2_88: +; SSE2-NEXT: movb {{[0-9]+}}(%rsp), %cl +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %dl +; SSE2-NEXT: movl %edx, %eax +; SSE2-NEXT: addb %cl, %al +; SSE2-NEXT: setns %al +; SSE2-NEXT: addb %cl, %dl +; SSE2-NEXT: jo .LBB2_89 +; SSE2-NEXT: # %bb.90: +; SSE2-NEXT: movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSE2-NEXT: jmp .LBB2_91 +; SSE2-NEXT: .LBB2_89: +; SSE2-NEXT: addb $127, %al +; SSE2-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSE2-NEXT: .LBB2_91: +; SSE2-NEXT: movb {{[0-9]+}}(%rsp), %cl +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %dl +; SSE2-NEXT: movl %edx, %eax +; SSE2-NEXT: addb %cl, %al +; SSE2-NEXT: setns %al +; SSE2-NEXT: addb %cl, %dl +; SSE2-NEXT: movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSE2-NEXT: jno .LBB2_93 +; SSE2-NEXT: # %bb.92: +; SSE2-NEXT: addb $127, %al +; SSE2-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSE2-NEXT: .LBB2_93: +; SSE2-NEXT: movb {{[0-9]+}}(%rsp), %cl +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %r8b +; SSE2-NEXT: movl %r8d, %eax +; SSE2-NEXT: addb %cl, %al +; SSE2-NEXT: setns %al +; SSE2-NEXT: addb %cl, %r8b +; SSE2-NEXT: jno .LBB2_95 +; SSE2-NEXT: # %bb.94: +; SSE2-NEXT: addb $127, %al +; SSE2-NEXT: movl %eax, %r8d +; SSE2-NEXT: .LBB2_95: +; SSE2-NEXT: movb {{[0-9]+}}(%rsp), %cl +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %dl +; SSE2-NEXT: movl %edx, %eax +; SSE2-NEXT: addb %cl, %al +; SSE2-NEXT: setns %al +; SSE2-NEXT: addb %cl, %dl +; SSE2-NEXT: movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSE2-NEXT: jno .LBB2_97 +; SSE2-NEXT: # %bb.96: +; SSE2-NEXT: addb $127, %al +; SSE2-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSE2-NEXT: .LBB2_97: +; SSE2-NEXT: movb {{[0-9]+}}(%rsp), %cl +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %r9b +; SSE2-NEXT: movl %r9d, %eax +; SSE2-NEXT: addb %cl, %al +; SSE2-NEXT: setns %al +; SSE2-NEXT: addb %cl, %r9b +; SSE2-NEXT: jno .LBB2_99 +; SSE2-NEXT: # %bb.98: +; SSE2-NEXT: addb $127, %al +; SSE2-NEXT: movl %eax, %r9d +; SSE2-NEXT: .LBB2_99: +; SSE2-NEXT: movb {{[0-9]+}}(%rsp), %cl +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %dl +; SSE2-NEXT: movl %edx, %eax +; SSE2-NEXT: addb %cl, %al +; SSE2-NEXT: setns %al +; SSE2-NEXT: addb %cl, %dl +; SSE2-NEXT: movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSE2-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp) +; SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; SSE2-NEXT: movb %r12b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSE2-NEXT: jno .LBB2_101 +; SSE2-NEXT: # %bb.100: +; SSE2-NEXT: addb $127, %al +; SSE2-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSE2-NEXT: .LBB2_101: +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %r12b +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %r10b +; SSE2-NEXT: movl %r12d, %eax +; SSE2-NEXT: addb %cl, %al +; SSE2-NEXT: setns %al +; SSE2-NEXT: addb %cl, %r12b +; SSE2-NEXT: jno .LBB2_103 +; SSE2-NEXT: # %bb.102: +; SSE2-NEXT: addb $127, %al +; SSE2-NEXT: movl %eax, %r12d +; SSE2-NEXT: .LBB2_103: +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSE2-NEXT: movl %r10d, %eax +; SSE2-NEXT: addb %cl, %al +; SSE2-NEXT: setns %al +; SSE2-NEXT: addb %cl, %r10b +; SSE2-NEXT: jno .LBB2_105 +; SSE2-NEXT: # %bb.104: +; SSE2-NEXT: addb $127, %al +; SSE2-NEXT: movl %eax, %r10d +; SSE2-NEXT: .LBB2_105: +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %dl +; SSE2-NEXT: movl %edx, %eax +; SSE2-NEXT: addb %cl, %al +; SSE2-NEXT: setns %al +; SSE2-NEXT: addb %cl, %dl +; SSE2-NEXT: jno .LBB2_107 +; SSE2-NEXT: # %bb.106: +; SSE2-NEXT: addb $127, %al +; SSE2-NEXT: 
movl %eax, %edx +; SSE2-NEXT: .LBB2_107: +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %bl +; SSE2-NEXT: movl %ebx, %eax +; SSE2-NEXT: addb %cl, %al +; SSE2-NEXT: setns %al +; SSE2-NEXT: addb %cl, %bl +; SSE2-NEXT: movb %bl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSE2-NEXT: movb %r14b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSE2-NEXT: movb %r15b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSE2-NEXT: movb %bpl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSE2-NEXT: jno .LBB2_109 +; SSE2-NEXT: # %bb.108: +; SSE2-NEXT: addb $127, %al +; SSE2-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSE2-NEXT: .LBB2_109: +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %r14b +; SSE2-NEXT: movl %r14d, %eax +; SSE2-NEXT: addb %cl, %al +; SSE2-NEXT: setns %al +; SSE2-NEXT: addb %cl, %r14b +; SSE2-NEXT: movb %r13b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSE2-NEXT: jno .LBB2_111 +; SSE2-NEXT: # %bb.110: +; SSE2-NEXT: addb $127, %al +; SSE2-NEXT: movl %eax, %r14d +; SSE2-NEXT: .LBB2_111: +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %r15b +; SSE2-NEXT: movl %r15d, %eax +; SSE2-NEXT: addb %cl, %al +; SSE2-NEXT: setns %al +; SSE2-NEXT: addb %cl, %r15b +; SSE2-NEXT: movb %r12b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSE2-NEXT: movb %r10b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSE2-NEXT: movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSE2-NEXT: jno .LBB2_113 +; SSE2-NEXT: # %bb.112: +; SSE2-NEXT: addb $127, %al +; SSE2-NEXT: movl %eax, %r15d +; SSE2-NEXT: .LBB2_113: +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %r12b +; SSE2-NEXT: movl %r12d, %eax +; SSE2-NEXT: addb %cl, %al +; SSE2-NEXT: setns %al +; SSE2-NEXT: addb %cl, %r12b +; SSE2-NEXT: movl %r8d, %edx +; SSE2-NEXT: jno .LBB2_115 +; SSE2-NEXT: # %bb.114: +; SSE2-NEXT: addb $127, %al +; SSE2-NEXT: movl %eax, %r12d +; SSE2-NEXT: .LBB2_115: +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %r13b +; SSE2-NEXT: movl %r13d, %eax +; SSE2-NEXT: addb %cl, %al +; SSE2-NEXT: setns %al +; SSE2-NEXT: addb %cl, %r13b +; SSE2-NEXT: movb %sil, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSE2-NEXT: movb %dil, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSE2-NEXT: movb %r11b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSE2-NEXT: jno .LBB2_117 +; SSE2-NEXT: # %bb.116: +; SSE2-NEXT: addb $127, %al +; SSE2-NEXT: movl %eax, %r13d +; SSE2-NEXT: .LBB2_117: +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %sil +; SSE2-NEXT: movl %esi, %eax +; SSE2-NEXT: addb %cl, %al +; SSE2-NEXT: setns %al +; SSE2-NEXT: addb %cl, %sil +; SSE2-NEXT: jno .LBB2_119 +; SSE2-NEXT: # %bb.118: +; SSE2-NEXT: addb $127, %al +; SSE2-NEXT: movl %eax, %esi +; SSE2-NEXT: .LBB2_119: +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %dil +; SSE2-NEXT: movl %edi, %eax +; SSE2-NEXT: addb %cl, %al +; SSE2-NEXT: setns %al +; SSE2-NEXT: addb %cl, %dil +; SSE2-NEXT: jno .LBB2_121 +; SSE2-NEXT: # %bb.120: +; SSE2-NEXT: addb $127, %al +; SSE2-NEXT: movl %eax, %edi +; SSE2-NEXT: .LBB2_121: +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %r8b +; SSE2-NEXT: movl %r8d, %eax +; SSE2-NEXT: addb %cl, %al +; SSE2-NEXT: setns %al +; SSE2-NEXT: addb %cl, %r8b +; SSE2-NEXT: jno .LBB2_123 +; SSE2-NEXT: # %bb.122: +; SSE2-NEXT: addb $127, %al +; SSE2-NEXT: movl %eax, %r8d +; SSE2-NEXT: .LBB2_123: +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; 
SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %r10b +; SSE2-NEXT: movl %r10d, %eax +; SSE2-NEXT: addb %cl, %al +; SSE2-NEXT: setns %al +; SSE2-NEXT: addb %cl, %r10b +; SSE2-NEXT: movb {{[-0-9]+}}(%r{{[sb]}}p), %bpl # 1-byte Reload +; SSE2-NEXT: jno .LBB2_125 +; SSE2-NEXT: # %bb.124: +; SSE2-NEXT: addb $127, %al +; SSE2-NEXT: movl %eax, %r10d +; SSE2-NEXT: .LBB2_125: +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %r11b +; SSE2-NEXT: movl %r11d, %eax +; SSE2-NEXT: addb %cl, %al +; SSE2-NEXT: setns %al +; SSE2-NEXT: addb %cl, %r11b +; SSE2-NEXT: jno .LBB2_127 +; SSE2-NEXT: # %bb.126: +; SSE2-NEXT: addb $127, %al +; SSE2-NEXT: movl %eax, %r11d +; SSE2-NEXT: .LBB2_127: +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %bl +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSE2-NEXT: movl %ecx, %eax +; SSE2-NEXT: addb %bl, %al +; SSE2-NEXT: setns %al +; SSE2-NEXT: addb %bl, %cl +; SSE2-NEXT: movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSE2-NEXT: movb %r9b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSE2-NEXT: jno .LBB2_129 +; SSE2-NEXT: # %bb.128: +; SSE2-NEXT: addb $127, %al +; SSE2-NEXT: movl %eax, %ecx +; SSE2-NEXT: .LBB2_129: +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %dl +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %bl +; SSE2-NEXT: movl %ebx, %eax +; SSE2-NEXT: addb %dl, %al +; SSE2-NEXT: setns %al +; SSE2-NEXT: addb %dl, %bl +; SSE2-NEXT: jno .LBB2_131 +; SSE2-NEXT: # %bb.130: +; SSE2-NEXT: addb $127, %al +; SSE2-NEXT: movl %eax, %ebx +; SSE2-NEXT: .LBB2_131: +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %dl +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %r9b +; SSE2-NEXT: movl %r9d, %eax +; SSE2-NEXT: addb %dl, %al +; SSE2-NEXT: setns %al +; SSE2-NEXT: addb %dl, %r9b +; SSE2-NEXT: jno .LBB2_133 +; SSE2-NEXT: # %bb.132: +; SSE2-NEXT: addb $127, %al +; SSE2-NEXT: movl %eax, %r9d +; SSE2-NEXT: .LBB2_133: +; SSE2-NEXT: movl %ebp, %eax +; SSE2-NEXT: movb {{[-0-9]+}}(%r{{[sb]}}p), %dl # 1-byte Reload +; SSE2-NEXT: movzbl %r9b, %ebp +; SSE2-NEXT: movl %ebp, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE2-NEXT: movzbl %bl, %ebp +; SSE2-NEXT: movl %ebp, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE2-NEXT: movzbl %cl, %ecx +; SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE2-NEXT: movzbl %r11b, %ecx +; SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE2-NEXT: movzbl %r10b, %ecx +; SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE2-NEXT: movzbl %r8b, %ecx +; SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE2-NEXT: movzbl %dil, %ecx +; SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE2-NEXT: movzbl %sil, %ecx +; SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE2-NEXT: movzbl %r13b, %ecx +; SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE2-NEXT: movzbl %r12b, %ecx +; SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE2-NEXT: movzbl %r15b, %ecx +; SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE2-NEXT: movzbl %r14b, %ecx +; SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload +; SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload +; SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload +; SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx 
# 1-byte Folded Reload +; SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload +; SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload +; SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload +; SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload +; SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload +; SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload +; SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE2-NEXT: movzbl %dl, %ecx +; SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload +; SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload +; SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload +; SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload +; SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload +; SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload +; SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload +; SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload +; SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE2-NEXT: movzbl %al, %eax +; SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r13d # 1-byte Folded Reload +; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r12d # 1-byte Folded Reload +; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r15d # 1-byte Folded Reload +; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r14d # 1-byte Folded Reload +; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ebp # 1-byte Folded Reload +; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ebx # 1-byte Folded Reload +; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r11d # 1-byte Folded Reload +; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r9d # 1-byte Folded Reload +; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload +; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %edx # 1-byte Folded Reload +; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 1-byte Folded Reload +; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 1-byte Folded Reload +; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r8d # 1-byte Folded Reload +; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r10d # 1-byte Folded Reload +; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 4-byte Folded Reload +; SSE2-NEXT: # xmm14 = mem[0],zero,zero,zero +; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 4-byte 
Folded Reload +; SSE2-NEXT: # xmm2 = mem[0],zero,zero,zero +; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 4-byte Folded Reload +; SSE2-NEXT: # xmm12 = mem[0],zero,zero,zero +; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 4-byte Folded Reload +; SSE2-NEXT: # xmm4 = mem[0],zero,zero,zero +; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 4-byte Folded Reload +; SSE2-NEXT: # xmm8 = mem[0],zero,zero,zero +; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 4-byte Folded Reload +; SSE2-NEXT: # xmm6 = mem[0],zero,zero,zero +; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 4-byte Folded Reload +; SSE2-NEXT: # xmm10 = mem[0],zero,zero,zero +; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 4-byte Folded Reload +; SSE2-NEXT: # xmm5 = mem[0],zero,zero,zero +; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 4-byte Folded Reload +; SSE2-NEXT: # xmm13 = mem[0],zero,zero,zero +; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 4-byte Folded Reload +; SSE2-NEXT: # xmm7 = mem[0],zero,zero,zero +; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 4-byte Folded Reload +; SSE2-NEXT: # xmm11 = mem[0],zero,zero,zero +; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 4-byte Folded Reload +; SSE2-NEXT: # xmm3 = mem[0],zero,zero,zero +; SSE2-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload +; SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero +; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Folded Reload +; SSE2-NEXT: # xmm1 = mem[0],zero,zero,zero +; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 4-byte Folded Reload +; SSE2-NEXT: # xmm15 = mem[0],zero,zero,zero +; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 4-byte Folded Reload +; SSE2-NEXT: # xmm9 = mem[0],zero,zero,zero +; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm14[0],xmm2[1],xmm14[1],xmm2[2],xmm14[2],xmm2[3],xmm14[3],xmm2[4],xmm14[4],xmm2[5],xmm14[5],xmm2[6],xmm14[6],xmm2[7],xmm14[7] +; SSE2-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload +; SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero +; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE2-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm12[0],xmm4[1],xmm12[1],xmm4[2],xmm12[2],xmm4[3],xmm12[3],xmm4[4],xmm12[4],xmm4[5],xmm12[5],xmm4[6],xmm12[6],xmm4[7],xmm12[7] +; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 4-byte Folded Reload +; SSE2-NEXT: # xmm12 = mem[0],zero,zero,zero +; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3] +; SSE2-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload +; SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero +; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE2-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm8[0],xmm6[1],xmm8[1],xmm6[2],xmm8[2],xmm6[3],xmm8[3],xmm6[4],xmm8[4],xmm6[5],xmm8[5],xmm6[6],xmm8[6],xmm6[7],xmm8[7] +; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 4-byte Folded Reload +; SSE2-NEXT: # xmm8 = mem[0],zero,zero,zero +; SSE2-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm10[0],xmm5[1],xmm10[1],xmm5[2],xmm10[2],xmm5[3],xmm10[3],xmm5[4],xmm10[4],xmm5[5],xmm10[5],xmm5[6],xmm10[6],xmm5[7],xmm10[7] +; SSE2-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload +; SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero +; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE2-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3] +; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 4-byte Folded 
Reload +; SSE2-NEXT: # xmm6 = mem[0],zero,zero,zero +; SSE2-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] +; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 4-byte Folded Reload +; SSE2-NEXT: # xmm14 = mem[0],zero,zero,zero +; SSE2-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm13[0],xmm7[1],xmm13[1],xmm7[2],xmm13[2],xmm7[3],xmm13[3],xmm7[4],xmm13[4],xmm7[5],xmm13[5],xmm7[6],xmm13[6],xmm7[7],xmm13[7] +; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 4-byte Folded Reload +; SSE2-NEXT: # xmm2 = mem[0],zero,zero,zero +; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm11[0],xmm3[1],xmm11[1],xmm3[2],xmm11[2],xmm3[3],xmm11[3],xmm3[4],xmm11[4],xmm3[5],xmm11[5],xmm3[6],xmm11[6],xmm3[7],xmm11[7] +; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 4-byte Folded Reload +; SSE2-NEXT: # xmm11 = mem[0],zero,zero,zero +; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm7[0],xmm3[1],xmm7[1],xmm3[2],xmm7[2],xmm3[3],xmm7[3] +; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 4-byte Folded Reload +; SSE2-NEXT: # xmm10 = mem[0],zero,zero,zero +; SSE2-NEXT: punpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE2-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3],xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] +; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 4-byte Folded Reload +; SSE2-NEXT: # xmm13 = mem[0],zero,zero,zero +; SSE2-NEXT: punpcklbw {{.*#+}} xmm9 = xmm9[0],xmm15[0],xmm9[1],xmm15[1],xmm9[2],xmm15[2],xmm9[3],xmm15[3],xmm9[4],xmm15[4],xmm9[5],xmm15[5],xmm9[6],xmm15[6],xmm9[7],xmm15[7] +; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload +; SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero +; SSE2-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm1[0],xmm9[1],xmm1[1],xmm9[2],xmm1[2],xmm9[3],xmm1[3] +; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 4-byte Folded Reload +; SSE2-NEXT: # xmm15 = mem[0],zero,zero,zero +; SSE2-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm3[0],xmm9[1],xmm3[1] +; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 4-byte Folded Reload +; SSE2-NEXT: # xmm7 = mem[0],zero,zero,zero +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm9 = xmm9[0],xmm5[0] +; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 4-byte Folded Reload +; SSE2-NEXT: # xmm5 = mem[0],zero,zero,zero +; SSE2-NEXT: punpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload +; SSE2-NEXT: # xmm12 = xmm12[0],mem[0],xmm12[1],mem[1],xmm12[2],mem[2],xmm12[3],mem[3],xmm12[4],mem[4],xmm12[5],mem[5],xmm12[6],mem[6],xmm12[7],mem[7] +; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 4-byte Folded Reload +; SSE2-NEXT: # xmm4 = mem[0],zero,zero,zero +; SSE2-NEXT: punpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload +; SSE2-NEXT: # xmm8 = xmm8[0],mem[0],xmm8[1],mem[1],xmm8[2],mem[2],xmm8[3],mem[3],xmm8[4],mem[4],xmm8[5],mem[5],xmm8[6],mem[6],xmm8[7],mem[7] +; SSE2-NEXT: movd %r13d, %xmm1 +; SSE2-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE2-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm12[0],xmm8[1],xmm12[1],xmm8[2],xmm12[2],xmm8[3],xmm12[3] +; SSE2-NEXT: movd %r12d, %xmm1 +; SSE2-NEXT: punpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload +; SSE2-NEXT: # xmm6 = xmm6[0],mem[0],xmm6[1],mem[1],xmm6[2],mem[2],xmm6[3],mem[3],xmm6[4],mem[4],xmm6[5],mem[5],xmm6[6],mem[6],xmm6[7],mem[7] +; SSE2-NEXT: movd %r15d, %xmm3 +; SSE2-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = 
xmm2[0],xmm14[0],xmm2[1],xmm14[1],xmm2[2],xmm14[2],xmm2[3],xmm14[3],xmm2[4],xmm14[4],xmm2[5],xmm14[5],xmm2[6],xmm14[6],xmm2[7],xmm14[7] +; SSE2-NEXT: movd %r14d, %xmm14 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1],xmm2[2],xmm6[2],xmm2[3],xmm6[3] +; SSE2-NEXT: movd %ebp, %xmm3 +; SSE2-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm8[0],xmm2[1],xmm8[1] +; SSE2-NEXT: movd %ebx, %xmm12 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm10 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3],xmm10[4],xmm11[4],xmm10[5],xmm11[5],xmm10[6],xmm11[6],xmm10[7],xmm11[7] +; SSE2-NEXT: movd %r11d, %xmm8 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm13[0],xmm0[1],xmm13[1],xmm0[2],xmm13[2],xmm0[3],xmm13[3],xmm0[4],xmm13[4],xmm0[5],xmm13[5],xmm0[6],xmm13[6],xmm0[7],xmm13[7] +; SSE2-NEXT: movd %r9d, %xmm3 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm10[0],xmm0[1],xmm10[1],xmm0[2],xmm10[2],xmm0[3],xmm10[3] +; SSE2-NEXT: movd %eax, %xmm11 +; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r15d # 1-byte Folded Reload +; SSE2-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm15[0],xmm7[1],xmm15[1],xmm7[2],xmm15[2],xmm7[3],xmm15[3],xmm7[4],xmm15[4],xmm7[5],xmm15[5],xmm7[6],xmm15[6],xmm7[7],xmm15[7] +; SSE2-NEXT: movd %ecx, %xmm6 +; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r12d # 1-byte Folded Reload +; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r9d # 1-byte Folded Reload +; SSE2-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3],xmm4[4],xmm5[4],xmm4[5],xmm5[5],xmm4[6],xmm5[6],xmm4[7],xmm5[7] +; SSE2-NEXT: movd %edx, %xmm13 +; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r11d # 1-byte Folded Reload +; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r14d # 1-byte Folded Reload +; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm7[0],xmm4[1],xmm7[1],xmm4[2],xmm7[2],xmm4[3],xmm7[3] +; SSE2-NEXT: movd %edi, %xmm5 +; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 1-byte Folded Reload +; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r13d # 1-byte Folded Reload +; SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1] +; SSE2-NEXT: movd %esi, %xmm15 +; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 1-byte Folded Reload +; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %edx # 1-byte Folded Reload +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm2[0] +; SSE2-NEXT: movd %r8d, %xmm0 +; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r8d # 1-byte Folded Reload +; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ebp # 1-byte Folded Reload +; SSE2-NEXT: punpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE2-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3],xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] +; SSE2-NEXT: movd %r10d, %xmm10 +; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload +; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r10d # 1-byte Folded Reload +; SSE2-NEXT: punpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload +; SSE2-NEXT: # xmm14 = xmm14[0],mem[0],xmm14[1],mem[1],xmm14[2],mem[2],xmm14[3],mem[3],xmm14[4],mem[4],xmm14[5],mem[5],xmm14[6],mem[6],xmm14[7],mem[7] +; SSE2-NEXT: movd %r15d, %xmm2 +; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ebx # 1-byte Folded Reload +; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r15d # 1-byte Folded Reload +; SSE2-NEXT: punpcklwd {{.*#+}} xmm14 = xmm14[0],xmm1[0],xmm14[1],xmm1[1],xmm14[2],xmm1[2],xmm14[3],xmm1[3] +; SSE2-NEXT: 
movd %r12d, %xmm1 +; SSE2-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r12d # 1-byte Folded Reload +; SSE2-NEXT: punpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload +; SSE2-NEXT: # xmm12 = xmm12[0],mem[0],xmm12[1],mem[1],xmm12[2],mem[2],xmm12[3],mem[3],xmm12[4],mem[4],xmm12[5],mem[5],xmm12[6],mem[6],xmm12[7],mem[7] +; SSE2-NEXT: movd %r9d, %xmm1 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm8[0],xmm3[1],xmm8[1],xmm3[2],xmm8[2],xmm3[3],xmm8[3],xmm3[4],xmm8[4],xmm3[5],xmm8[5],xmm3[6],xmm8[6],xmm3[7],xmm8[7] +; SSE2-NEXT: movd %r11d, %xmm8 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm12[0],xmm3[1],xmm12[1],xmm3[2],xmm12[2],xmm3[3],xmm12[3] +; SSE2-NEXT: movd %r14d, %xmm12 +; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm14[0],xmm3[1],xmm14[1] +; SSE2-NEXT: movd %edi, %xmm7 +; SSE2-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE2-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm11[0],xmm6[1],xmm11[1],xmm6[2],xmm11[2],xmm6[3],xmm11[3],xmm6[4],xmm11[4],xmm6[5],xmm11[5],xmm6[6],xmm11[6],xmm6[7],xmm11[7] +; SSE2-NEXT: movd %r13d, %xmm11 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm13[0],xmm5[1],xmm13[1],xmm5[2],xmm13[2],xmm5[3],xmm13[3],xmm5[4],xmm13[4],xmm5[5],xmm13[5],xmm5[6],xmm13[6],xmm5[7],xmm13[7] +; SSE2-NEXT: movd %esi, %xmm14 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3] +; SSE2-NEXT: movd %edx, %xmm6 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm15[0],xmm0[1],xmm15[1],xmm0[2],xmm15[2],xmm0[3],xmm15[3],xmm0[4],xmm15[4],xmm0[5],xmm15[5],xmm0[6],xmm15[6],xmm0[7],xmm15[7] +; SSE2-NEXT: movd %r8d, %xmm15 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm10[0],xmm2[1],xmm10[1],xmm2[2],xmm10[2],xmm2[3],xmm10[3],xmm2[4],xmm10[4],xmm2[5],xmm10[5],xmm2[6],xmm10[6],xmm2[7],xmm10[7] +; SSE2-NEXT: movd %ebp, %xmm7 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] +; SSE2-NEXT: movd %ecx, %xmm13 +; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1] +; SSE2-NEXT: movd %r10d, %xmm5 +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0] +; SSE2-NEXT: movd %ebx, %xmm10 +; SSE2-NEXT: punpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE2-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3],xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] +; SSE2-NEXT: movd %r15d, %xmm0 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm12 = xmm12[0],xmm8[0],xmm12[1],xmm8[1],xmm12[2],xmm8[2],xmm12[3],xmm8[3],xmm12[4],xmm8[4],xmm12[5],xmm8[5],xmm12[6],xmm8[6],xmm12[7],xmm8[7] +; SSE2-NEXT: movd %eax, %xmm8 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm12 = xmm12[0],xmm1[0],xmm12[1],xmm1[1],xmm12[2],xmm1[2],xmm12[3],xmm1[3] +; SSE2-NEXT: movd %r12d, %xmm3 +; SSE2-NEXT: punpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload +; SSE2-NEXT: # xmm11 = xmm11[0],mem[0],xmm11[1],mem[1],xmm11[2],mem[2],xmm11[3],mem[3],xmm11[4],mem[4],xmm11[5],mem[5],xmm11[6],mem[6],xmm11[7],mem[7] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm14[0],xmm6[1],xmm14[1],xmm6[2],xmm14[2],xmm6[3],xmm14[3],xmm6[4],xmm14[4],xmm6[5],xmm14[5],xmm6[6],xmm14[6],xmm6[7],xmm14[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm11[0],xmm6[1],xmm11[1],xmm6[2],xmm11[2],xmm6[3],xmm11[3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm12[0],xmm6[1],xmm12[1] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm7 = 
xmm7[0],xmm15[0],xmm7[1],xmm15[1],xmm7[2],xmm15[2],xmm7[3],xmm15[3],xmm7[4],xmm15[4],xmm7[5],xmm15[5],xmm7[6],xmm15[6],xmm7[7],xmm15[7] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm13[0],xmm5[1],xmm13[1],xmm5[2],xmm13[2],xmm5[3],xmm13[3],xmm5[4],xmm13[4],xmm5[5],xmm13[5],xmm5[6],xmm13[6],xmm5[7],xmm13[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm7[0],xmm5[1],xmm7[1],xmm5[2],xmm7[2],xmm5[3],xmm7[3] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm10[0],xmm0[1],xmm10[1],xmm0[2],xmm10[2],xmm0[3],xmm10[3],xmm0[4],xmm10[4],xmm0[5],xmm10[5],xmm0[6],xmm10[6],xmm0[7],xmm10[7] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm8[0],xmm3[1],xmm8[1],xmm3[2],xmm8[2],xmm3[3],xmm8[3],xmm3[4],xmm8[4],xmm3[5],xmm8[5],xmm3[6],xmm8[6],xmm3[7],xmm8[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1] +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm6[0] +; SSE2-NEXT: movdqa %xmm9, %xmm0 +; SSE2-NEXT: movdqa %xmm4, %xmm1 +; SSE2-NEXT: addq $232, %rsp +; SSE2-NEXT: popq %rbx +; SSE2-NEXT: popq %r12 +; SSE2-NEXT: popq %r13 +; SSE2-NEXT: popq %r14 +; SSE2-NEXT: popq %r15 +; SSE2-NEXT: popq %rbp +; SSE2-NEXT: retq +; +; SSSE3-LABEL: v64i8: +; SSSE3: # %bb.0: +; SSSE3-NEXT: pushq %rbp +; SSSE3-NEXT: pushq %r15 +; SSSE3-NEXT: pushq %r14 +; SSSE3-NEXT: pushq %r13 +; SSSE3-NEXT: pushq %r12 +; SSSE3-NEXT: pushq %rbx +; SSSE3-NEXT: subq $232, %rsp +; SSSE3-NEXT: movaps %xmm5, (%rsp) +; SSSE3-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) +; SSSE3-NEXT: movb (%rsp), %cl +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %dl +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %sil +; SSSE3-NEXT: movl %edx, %eax +; SSSE3-NEXT: addb %cl, %al +; SSSE3-NEXT: setns %al +; SSSE3-NEXT: addb %cl, %dl +; SSSE3-NEXT: jno .LBB2_2 +; SSSE3-NEXT: # %bb.1: +; SSSE3-NEXT: addb $127, %al +; SSSE3-NEXT: movl %eax, %edx +; SSSE3-NEXT: .LBB2_2: +; SSSE3-NEXT: movb {{[0-9]+}}(%rsp), %cl +; SSSE3-NEXT: movl %esi, %eax +; SSSE3-NEXT: addb %cl, %al +; SSSE3-NEXT: setns %al +; SSSE3-NEXT: addb %cl, %sil +; SSSE3-NEXT: jno .LBB2_4 +; SSSE3-NEXT: # %bb.3: +; SSSE3-NEXT: addb $127, %al +; SSSE3-NEXT: movl %eax, %esi +; SSSE3-NEXT: .LBB2_4: +; SSSE3-NEXT: movb {{[0-9]+}}(%rsp), %cl +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %dil +; SSSE3-NEXT: movl %edi, %eax +; SSSE3-NEXT: addb %cl, %al +; SSSE3-NEXT: setns %al +; SSSE3-NEXT: addb %cl, %dil +; SSSE3-NEXT: jno .LBB2_6 +; SSSE3-NEXT: # %bb.5: +; SSSE3-NEXT: addb $127, %al +; SSSE3-NEXT: movl %eax, %edi +; SSSE3-NEXT: .LBB2_6: +; SSSE3-NEXT: movb {{[0-9]+}}(%rsp), %cl +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %r11b +; SSSE3-NEXT: movl %r11d, %eax +; SSSE3-NEXT: addb %cl, %al +; SSSE3-NEXT: setns %al +; SSSE3-NEXT: addb %cl, %r11b +; SSSE3-NEXT: jno .LBB2_8 +; SSSE3-NEXT: # %bb.7: +; SSSE3-NEXT: addb $127, %al +; SSSE3-NEXT: movl %eax, %r11d +; SSSE3-NEXT: .LBB2_8: +; SSSE3-NEXT: movb {{[0-9]+}}(%rsp), %cl +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %r9b +; SSSE3-NEXT: movl %r9d, %eax +; SSSE3-NEXT: addb %cl, %al +; SSSE3-NEXT: setns %al +; SSSE3-NEXT: addb %cl, %r9b +; SSSE3-NEXT: jno .LBB2_10 +; SSSE3-NEXT: # %bb.9: +; SSSE3-NEXT: addb $127, %al +; SSSE3-NEXT: movl %eax, %r9d +; SSSE3-NEXT: .LBB2_10: +; SSSE3-NEXT: movb {{[0-9]+}}(%rsp), %cl +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %r10b +; SSSE3-NEXT: movl %r10d, %eax +; SSSE3-NEXT: addb %cl, %al +; SSSE3-NEXT: setns %al +; SSSE3-NEXT: addb %cl, %r10b +; SSSE3-NEXT: jno .LBB2_12 +; SSSE3-NEXT: # %bb.11: +; SSSE3-NEXT: addb $127, %al +; SSSE3-NEXT: movl 
%eax, %r10d +; SSSE3-NEXT: .LBB2_12: +; SSSE3-NEXT: movb {{[0-9]+}}(%rsp), %cl +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %bl +; SSSE3-NEXT: movl %ebx, %eax +; SSSE3-NEXT: addb %cl, %al +; SSSE3-NEXT: setns %al +; SSSE3-NEXT: addb %cl, %bl +; SSSE3-NEXT: movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSSE3-NEXT: jno .LBB2_14 +; SSSE3-NEXT: # %bb.13: +; SSSE3-NEXT: addb $127, %al +; SSSE3-NEXT: movl %eax, %ebx +; SSSE3-NEXT: .LBB2_14: +; SSSE3-NEXT: movb {{[0-9]+}}(%rsp), %cl +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %dl +; SSSE3-NEXT: movl %edx, %eax +; SSSE3-NEXT: addb %cl, %al +; SSSE3-NEXT: setns %al +; SSSE3-NEXT: addb %cl, %dl +; SSSE3-NEXT: movb %bl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSSE3-NEXT: jo .LBB2_15 +; SSSE3-NEXT: # %bb.16: +; SSSE3-NEXT: movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSSE3-NEXT: jmp .LBB2_17 +; SSSE3-NEXT: .LBB2_15: +; SSSE3-NEXT: addb $127, %al +; SSSE3-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSSE3-NEXT: .LBB2_17: +; SSSE3-NEXT: movb {{[0-9]+}}(%rsp), %cl +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %dl +; SSSE3-NEXT: movl %edx, %eax +; SSSE3-NEXT: addb %cl, %al +; SSSE3-NEXT: setns %al +; SSSE3-NEXT: addb %cl, %dl +; SSSE3-NEXT: jno .LBB2_19 +; SSSE3-NEXT: # %bb.18: +; SSSE3-NEXT: addb $127, %al +; SSSE3-NEXT: movl %eax, %edx +; SSSE3-NEXT: .LBB2_19: +; SSSE3-NEXT: movb {{[0-9]+}}(%rsp), %cl +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %r12b +; SSSE3-NEXT: movl %r12d, %eax +; SSSE3-NEXT: addb %cl, %al +; SSSE3-NEXT: setns %al +; SSSE3-NEXT: addb %cl, %r12b +; SSSE3-NEXT: jno .LBB2_21 +; SSSE3-NEXT: # %bb.20: +; SSSE3-NEXT: addb $127, %al +; SSSE3-NEXT: movl %eax, %r12d +; SSSE3-NEXT: .LBB2_21: +; SSSE3-NEXT: movb {{[0-9]+}}(%rsp), %cl +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %bl +; SSSE3-NEXT: movl %ebx, %eax +; SSSE3-NEXT: addb %cl, %al +; SSSE3-NEXT: setns %al +; SSSE3-NEXT: addb %cl, %bl +; SSSE3-NEXT: movb %r9b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSSE3-NEXT: jno .LBB2_23 +; SSSE3-NEXT: # %bb.22: +; SSSE3-NEXT: addb $127, %al +; SSSE3-NEXT: movl %eax, %ebx +; SSSE3-NEXT: .LBB2_23: +; SSSE3-NEXT: movb %bl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSSE3-NEXT: movb {{[0-9]+}}(%rsp), %cl +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %r9b +; SSSE3-NEXT: movl %r9d, %eax +; SSSE3-NEXT: addb %cl, %al +; SSSE3-NEXT: setns %al +; SSSE3-NEXT: addb %cl, %r9b +; SSSE3-NEXT: jno .LBB2_25 +; SSSE3-NEXT: # %bb.24: +; SSSE3-NEXT: addb $127, %al +; SSSE3-NEXT: movl %eax, %r9d +; SSSE3-NEXT: .LBB2_25: +; SSSE3-NEXT: movl %edi, %r8d +; SSSE3-NEXT: movb {{[0-9]+}}(%rsp), %cl +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %bl +; SSSE3-NEXT: movl %ebx, %eax +; SSSE3-NEXT: addb %cl, %al +; SSSE3-NEXT: setns %al +; SSSE3-NEXT: addb %cl, %bl +; SSSE3-NEXT: jo .LBB2_26 +; SSSE3-NEXT: # %bb.27: +; SSSE3-NEXT: movb %bl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSSE3-NEXT: movl %esi, %edi +; SSSE3-NEXT: jmp .LBB2_28 +; SSSE3-NEXT: .LBB2_26: +; SSSE3-NEXT: movl %esi, %edi +; SSSE3-NEXT: addb $127, %al +; SSSE3-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSSE3-NEXT: .LBB2_28: +; SSSE3-NEXT: movb {{[0-9]+}}(%rsp), %cl +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %sil +; SSSE3-NEXT: movl %esi, %eax +; SSSE3-NEXT: addb %cl, %al +; SSSE3-NEXT: setns %al +; SSSE3-NEXT: addb %cl, %sil +; SSSE3-NEXT: movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSSE3-NEXT: jno .LBB2_30 +; SSSE3-NEXT: # %bb.29: +; SSSE3-NEXT: addb $127, %al +; SSSE3-NEXT: movl %eax, %esi +; SSSE3-NEXT: .LBB2_30: +; SSSE3-NEXT: movb {{[0-9]+}}(%rsp), %cl +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), 
%dl +; SSSE3-NEXT: movl %edx, %eax +; SSSE3-NEXT: addb %cl, %al +; SSSE3-NEXT: setns %al +; SSSE3-NEXT: addb %cl, %dl +; SSSE3-NEXT: jno .LBB2_32 +; SSSE3-NEXT: # %bb.31: +; SSSE3-NEXT: addb $127, %al +; SSSE3-NEXT: movl %eax, %edx +; SSSE3-NEXT: .LBB2_32: +; SSSE3-NEXT: movb {{[0-9]+}}(%rsp), %cl +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %bl +; SSSE3-NEXT: movl %ebx, %eax +; SSSE3-NEXT: addb %cl, %al +; SSSE3-NEXT: setns %al +; SSSE3-NEXT: addb %cl, %bl +; SSSE3-NEXT: movaps %xmm6, {{[0-9]+}}(%rsp) +; SSSE3-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) +; SSSE3-NEXT: movb %r10b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSSE3-NEXT: movb %r12b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSSE3-NEXT: movb %r9b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSSE3-NEXT: movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSSE3-NEXT: jno .LBB2_34 +; SSSE3-NEXT: # %bb.33: +; SSSE3-NEXT: addb $127, %al +; SSSE3-NEXT: movl %eax, %ebx +; SSSE3-NEXT: .LBB2_34: +; SSSE3-NEXT: movb {{[0-9]+}}(%rsp), %cl +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %dl +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %r9b +; SSSE3-NEXT: movl %edx, %eax +; SSSE3-NEXT: addb %cl, %al +; SSSE3-NEXT: setns %al +; SSSE3-NEXT: addb %cl, %dl +; SSSE3-NEXT: jno .LBB2_36 +; SSSE3-NEXT: # %bb.35: +; SSSE3-NEXT: addb $127, %al +; SSSE3-NEXT: movl %eax, %edx +; SSSE3-NEXT: .LBB2_36: +; SSSE3-NEXT: movb {{[0-9]+}}(%rsp), %cl +; SSSE3-NEXT: movl %r9d, %eax +; SSSE3-NEXT: addb %cl, %al +; SSSE3-NEXT: setns %al +; SSSE3-NEXT: addb %cl, %r9b +; SSSE3-NEXT: movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSSE3-NEXT: movb %sil, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSSE3-NEXT: jno .LBB2_38 +; SSSE3-NEXT: # %bb.37: +; SSSE3-NEXT: addb $127, %al +; SSSE3-NEXT: movl %eax, %r9d +; SSSE3-NEXT: .LBB2_38: +; SSSE3-NEXT: movb {{[0-9]+}}(%rsp), %cl +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %dl +; SSSE3-NEXT: movl %edx, %eax +; SSSE3-NEXT: addb %cl, %al +; SSSE3-NEXT: setns %al +; SSSE3-NEXT: addb %cl, %dl +; SSSE3-NEXT: movb %dil, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSSE3-NEXT: movb %r8b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSSE3-NEXT: movb %bl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSSE3-NEXT: jno .LBB2_40 +; SSSE3-NEXT: # %bb.39: +; SSSE3-NEXT: addb $127, %al +; SSSE3-NEXT: movl %eax, %edx +; SSSE3-NEXT: .LBB2_40: +; SSSE3-NEXT: movb {{[0-9]+}}(%rsp), %cl +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %bpl +; SSSE3-NEXT: movl %ebp, %eax +; SSSE3-NEXT: addb %cl, %al +; SSSE3-NEXT: setns %al +; SSSE3-NEXT: addb %cl, %bpl +; SSSE3-NEXT: movb %r11b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSSE3-NEXT: jno .LBB2_42 +; SSSE3-NEXT: # %bb.41: +; SSSE3-NEXT: addb $127, %al +; SSSE3-NEXT: movl %eax, %ebp +; SSSE3-NEXT: .LBB2_42: +; SSSE3-NEXT: movb {{[0-9]+}}(%rsp), %cl +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %sil +; SSSE3-NEXT: movl %esi, %eax +; SSSE3-NEXT: addb %cl, %al +; SSSE3-NEXT: setns %al +; SSSE3-NEXT: addb %cl, %sil +; SSSE3-NEXT: movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSSE3-NEXT: jno .LBB2_44 +; SSSE3-NEXT: # %bb.43: +; SSSE3-NEXT: addb $127, %al +; SSSE3-NEXT: movl %eax, %esi +; SSSE3-NEXT: .LBB2_44: +; SSSE3-NEXT: movb {{[0-9]+}}(%rsp), %cl +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %dl +; SSSE3-NEXT: movl %edx, %eax +; SSSE3-NEXT: addb %cl, %al +; SSSE3-NEXT: setns %al +; SSSE3-NEXT: addb %cl, %dl +; SSSE3-NEXT: jno .LBB2_46 +; SSSE3-NEXT: # %bb.45: +; SSSE3-NEXT: addb $127, %al +; SSSE3-NEXT: movl %eax, %edx +; SSSE3-NEXT: .LBB2_46: +; SSSE3-NEXT: movb {{[0-9]+}}(%rsp), %cl +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %dil +; SSSE3-NEXT: movl 
%edi, %eax +; SSSE3-NEXT: addb %cl, %al +; SSSE3-NEXT: setns %al +; SSSE3-NEXT: addb %cl, %dil +; SSSE3-NEXT: jno .LBB2_48 +; SSSE3-NEXT: # %bb.47: +; SSSE3-NEXT: addb $127, %al +; SSSE3-NEXT: movl %eax, %edi +; SSSE3-NEXT: .LBB2_48: +; SSSE3-NEXT: movb {{[0-9]+}}(%rsp), %cl +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %bl +; SSSE3-NEXT: movl %ebx, %eax +; SSSE3-NEXT: addb %cl, %al +; SSSE3-NEXT: setns %al +; SSSE3-NEXT: addb %cl, %bl +; SSSE3-NEXT: movb %bl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSSE3-NEXT: movb %sil, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSSE3-NEXT: movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSSE3-NEXT: jno .LBB2_50 +; SSSE3-NEXT: # %bb.49: +; SSSE3-NEXT: addb $127, %al +; SSSE3-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSSE3-NEXT: .LBB2_50: +; SSSE3-NEXT: movb {{[0-9]+}}(%rsp), %cl +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %sil +; SSSE3-NEXT: movl %esi, %eax +; SSSE3-NEXT: addb %cl, %al +; SSSE3-NEXT: setns %al +; SSSE3-NEXT: addb %cl, %sil +; SSSE3-NEXT: movb %dil, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSSE3-NEXT: jno .LBB2_52 +; SSSE3-NEXT: # %bb.51: +; SSSE3-NEXT: addb $127, %al +; SSSE3-NEXT: movl %eax, %esi +; SSSE3-NEXT: .LBB2_52: +; SSSE3-NEXT: movb {{[0-9]+}}(%rsp), %cl +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %dil +; SSSE3-NEXT: movl %edi, %eax +; SSSE3-NEXT: addb %cl, %al +; SSSE3-NEXT: setns %al +; SSSE3-NEXT: addb %cl, %dil +; SSSE3-NEXT: jno .LBB2_54 +; SSSE3-NEXT: # %bb.53: +; SSSE3-NEXT: addb $127, %al +; SSSE3-NEXT: movl %eax, %edi +; SSSE3-NEXT: .LBB2_54: +; SSSE3-NEXT: movb {{[0-9]+}}(%rsp), %cl +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %r14b +; SSSE3-NEXT: movl %r14d, %eax +; SSSE3-NEXT: addb %cl, %al +; SSSE3-NEXT: setns %al +; SSSE3-NEXT: addb %cl, %r14b +; SSSE3-NEXT: movb %r9b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSSE3-NEXT: jno .LBB2_56 +; SSSE3-NEXT: # %bb.55: +; SSSE3-NEXT: addb $127, %al +; SSSE3-NEXT: movl %eax, %r14d +; SSSE3-NEXT: .LBB2_56: +; SSSE3-NEXT: movb {{[0-9]+}}(%rsp), %cl +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %r9b +; SSSE3-NEXT: movl %r9d, %eax +; SSSE3-NEXT: addb %cl, %al +; SSSE3-NEXT: setns %al +; SSSE3-NEXT: addb %cl, %r9b +; SSSE3-NEXT: movb %bpl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSSE3-NEXT: jno .LBB2_58 +; SSSE3-NEXT: # %bb.57: +; SSSE3-NEXT: addb $127, %al +; SSSE3-NEXT: movl %eax, %r9d +; SSSE3-NEXT: .LBB2_58: +; SSSE3-NEXT: movb {{[0-9]+}}(%rsp), %cl +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %dl +; SSSE3-NEXT: movl %edx, %eax +; SSSE3-NEXT: addb %cl, %al +; SSSE3-NEXT: setns %al +; SSSE3-NEXT: addb %cl, %dl +; SSSE3-NEXT: movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSSE3-NEXT: jno .LBB2_60 +; SSSE3-NEXT: # %bb.59: +; SSSE3-NEXT: addb $127, %al +; SSSE3-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSSE3-NEXT: .LBB2_60: +; SSSE3-NEXT: movb {{[0-9]+}}(%rsp), %cl +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %dl +; SSSE3-NEXT: movl %edx, %eax +; SSSE3-NEXT: addb %cl, %al +; SSSE3-NEXT: setns %al +; SSSE3-NEXT: addb %cl, %dl +; SSSE3-NEXT: jno .LBB2_62 +; SSSE3-NEXT: # %bb.61: +; SSSE3-NEXT: addb $127, %al +; SSSE3-NEXT: movl %eax, %edx +; SSSE3-NEXT: .LBB2_62: +; SSSE3-NEXT: movb {{[0-9]+}}(%rsp), %cl +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %r15b +; SSSE3-NEXT: movl %r15d, %eax +; SSSE3-NEXT: addb %cl, %al +; SSSE3-NEXT: setns %al +; SSSE3-NEXT: addb %cl, %r15b +; SSSE3-NEXT: jno .LBB2_64 +; SSSE3-NEXT: # %bb.63: +; SSSE3-NEXT: addb $127, %al +; SSSE3-NEXT: movl %eax, %r15d +; SSSE3-NEXT: .LBB2_64: +; SSSE3-NEXT: movb {{[0-9]+}}(%rsp), %cl +; SSSE3-NEXT: movb 
-{{[0-9]+}}(%rsp), %bl +; SSSE3-NEXT: movl %ebx, %eax +; SSSE3-NEXT: addb %cl, %al +; SSSE3-NEXT: setns %al +; SSSE3-NEXT: addb %cl, %bl +; SSSE3-NEXT: movaps %xmm7, {{[0-9]+}}(%rsp) +; SSSE3-NEXT: movaps %xmm3, -{{[0-9]+}}(%rsp) +; SSSE3-NEXT: movb %r14b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSSE3-NEXT: movb %r9b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSSE3-NEXT: movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSSE3-NEXT: jo .LBB2_65 +; SSSE3-NEXT: # %bb.66: +; SSSE3-NEXT: movb %bl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSSE3-NEXT: jmp .LBB2_67 +; SSSE3-NEXT: .LBB2_65: +; SSSE3-NEXT: addb $127, %al +; SSSE3-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSSE3-NEXT: .LBB2_67: +; SSSE3-NEXT: movb {{[0-9]+}}(%rsp), %cl +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %r11b +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %dl +; SSSE3-NEXT: movl %r11d, %eax +; SSSE3-NEXT: addb %cl, %al +; SSSE3-NEXT: setns %al +; SSSE3-NEXT: addb %cl, %r11b +; SSSE3-NEXT: jno .LBB2_69 +; SSSE3-NEXT: # %bb.68: +; SSSE3-NEXT: addb $127, %al +; SSSE3-NEXT: movl %eax, %r11d +; SSSE3-NEXT: .LBB2_69: +; SSSE3-NEXT: movb {{[0-9]+}}(%rsp), %cl +; SSSE3-NEXT: movl %edx, %eax +; SSSE3-NEXT: addb %cl, %al +; SSSE3-NEXT: setns %al +; SSSE3-NEXT: addb %cl, %dl +; SSSE3-NEXT: movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSSE3-NEXT: movb %r15b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSSE3-NEXT: jno .LBB2_71 +; SSSE3-NEXT: # %bb.70: +; SSSE3-NEXT: addb $127, %al +; SSSE3-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSSE3-NEXT: .LBB2_71: +; SSSE3-NEXT: movb {{[0-9]+}}(%rsp), %cl +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %r12b +; SSSE3-NEXT: movl %r12d, %eax +; SSSE3-NEXT: addb %cl, %al +; SSSE3-NEXT: setns %al +; SSSE3-NEXT: addb %cl, %r12b +; SSSE3-NEXT: jno .LBB2_73 +; SSSE3-NEXT: # %bb.72: +; SSSE3-NEXT: addb $127, %al +; SSSE3-NEXT: movl %eax, %r12d +; SSSE3-NEXT: .LBB2_73: +; SSSE3-NEXT: movb {{[0-9]+}}(%rsp), %cl +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %r14b +; SSSE3-NEXT: movl %r14d, %eax +; SSSE3-NEXT: addb %cl, %al +; SSSE3-NEXT: setns %al +; SSSE3-NEXT: addb %cl, %r14b +; SSSE3-NEXT: jno .LBB2_75 +; SSSE3-NEXT: # %bb.74: +; SSSE3-NEXT: addb $127, %al +; SSSE3-NEXT: movl %eax, %r14d +; SSSE3-NEXT: .LBB2_75: +; SSSE3-NEXT: movb {{[0-9]+}}(%rsp), %cl +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %r15b +; SSSE3-NEXT: movl %r15d, %eax +; SSSE3-NEXT: addb %cl, %al +; SSSE3-NEXT: setns %al +; SSSE3-NEXT: addb %cl, %r15b +; SSSE3-NEXT: jno .LBB2_77 +; SSSE3-NEXT: # %bb.76: +; SSSE3-NEXT: addb $127, %al +; SSSE3-NEXT: movl %eax, %r15d +; SSSE3-NEXT: .LBB2_77: +; SSSE3-NEXT: movb {{[0-9]+}}(%rsp), %cl +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %bpl +; SSSE3-NEXT: movl %ebp, %eax +; SSSE3-NEXT: addb %cl, %al +; SSSE3-NEXT: setns %al +; SSSE3-NEXT: addb %cl, %bpl +; SSSE3-NEXT: jno .LBB2_79 +; SSSE3-NEXT: # %bb.78: +; SSSE3-NEXT: addb $127, %al +; SSSE3-NEXT: movl %eax, %ebp +; SSSE3-NEXT: .LBB2_79: +; SSSE3-NEXT: movb {{[0-9]+}}(%rsp), %cl +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %r13b +; SSSE3-NEXT: movl %r13d, %eax +; SSSE3-NEXT: addb %cl, %al +; SSSE3-NEXT: setns %al +; SSSE3-NEXT: addb %cl, %r13b +; SSSE3-NEXT: jno .LBB2_81 +; SSSE3-NEXT: # %bb.80: +; SSSE3-NEXT: addb $127, %al +; SSSE3-NEXT: movl %eax, %r13d +; SSSE3-NEXT: .LBB2_81: +; SSSE3-NEXT: movb {{[0-9]+}}(%rsp), %cl +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %dl +; SSSE3-NEXT: movl %edx, %eax +; SSSE3-NEXT: addb %cl, %al +; SSSE3-NEXT: setns %al +; SSSE3-NEXT: addb %cl, %dl +; SSSE3-NEXT: movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; 
SSSE3-NEXT: jno .LBB2_83 +; SSSE3-NEXT: # %bb.82: +; SSSE3-NEXT: addb $127, %al +; SSSE3-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSSE3-NEXT: .LBB2_83: +; SSSE3-NEXT: movb {{[0-9]+}}(%rsp), %cl +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %dl +; SSSE3-NEXT: movl %edx, %eax +; SSSE3-NEXT: addb %cl, %al +; SSSE3-NEXT: setns %al +; SSSE3-NEXT: addb %cl, %dl +; SSSE3-NEXT: jo .LBB2_84 +; SSSE3-NEXT: # %bb.85: +; SSSE3-NEXT: movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSSE3-NEXT: jmp .LBB2_86 +; SSSE3-NEXT: .LBB2_84: +; SSSE3-NEXT: addb $127, %al +; SSSE3-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSSE3-NEXT: .LBB2_86: +; SSSE3-NEXT: movb {{[0-9]+}}(%rsp), %cl +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %dl +; SSSE3-NEXT: movl %edx, %eax +; SSSE3-NEXT: addb %cl, %al +; SSSE3-NEXT: setns %al +; SSSE3-NEXT: addb %cl, %dl +; SSSE3-NEXT: movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSSE3-NEXT: jno .LBB2_88 +; SSSE3-NEXT: # %bb.87: +; SSSE3-NEXT: addb $127, %al +; SSSE3-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSSE3-NEXT: .LBB2_88: +; SSSE3-NEXT: movb {{[0-9]+}}(%rsp), %cl +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %dl +; SSSE3-NEXT: movl %edx, %eax +; SSSE3-NEXT: addb %cl, %al +; SSSE3-NEXT: setns %al +; SSSE3-NEXT: addb %cl, %dl +; SSSE3-NEXT: jo .LBB2_89 +; SSSE3-NEXT: # %bb.90: +; SSSE3-NEXT: movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSSE3-NEXT: jmp .LBB2_91 +; SSSE3-NEXT: .LBB2_89: +; SSSE3-NEXT: addb $127, %al +; SSSE3-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSSE3-NEXT: .LBB2_91: +; SSSE3-NEXT: movb {{[0-9]+}}(%rsp), %cl +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %dl +; SSSE3-NEXT: movl %edx, %eax +; SSSE3-NEXT: addb %cl, %al +; SSSE3-NEXT: setns %al +; SSSE3-NEXT: addb %cl, %dl +; SSSE3-NEXT: movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSSE3-NEXT: jno .LBB2_93 +; SSSE3-NEXT: # %bb.92: +; SSSE3-NEXT: addb $127, %al +; SSSE3-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSSE3-NEXT: .LBB2_93: +; SSSE3-NEXT: movb {{[0-9]+}}(%rsp), %cl +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %r8b +; SSSE3-NEXT: movl %r8d, %eax +; SSSE3-NEXT: addb %cl, %al +; SSSE3-NEXT: setns %al +; SSSE3-NEXT: addb %cl, %r8b +; SSSE3-NEXT: jno .LBB2_95 +; SSSE3-NEXT: # %bb.94: +; SSSE3-NEXT: addb $127, %al +; SSSE3-NEXT: movl %eax, %r8d +; SSSE3-NEXT: .LBB2_95: +; SSSE3-NEXT: movb {{[0-9]+}}(%rsp), %cl +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %dl +; SSSE3-NEXT: movl %edx, %eax +; SSSE3-NEXT: addb %cl, %al +; SSSE3-NEXT: setns %al +; SSSE3-NEXT: addb %cl, %dl +; SSSE3-NEXT: movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSSE3-NEXT: jno .LBB2_97 +; SSSE3-NEXT: # %bb.96: +; SSSE3-NEXT: addb $127, %al +; SSSE3-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSSE3-NEXT: .LBB2_97: +; SSSE3-NEXT: movb {{[0-9]+}}(%rsp), %cl +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %r9b +; SSSE3-NEXT: movl %r9d, %eax +; SSSE3-NEXT: addb %cl, %al +; SSSE3-NEXT: setns %al +; SSSE3-NEXT: addb %cl, %r9b +; SSSE3-NEXT: jno .LBB2_99 +; SSSE3-NEXT: # %bb.98: +; SSSE3-NEXT: addb $127, %al +; SSSE3-NEXT: movl %eax, %r9d +; SSSE3-NEXT: .LBB2_99: +; SSSE3-NEXT: movb {{[0-9]+}}(%rsp), %cl +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %dl +; SSSE3-NEXT: movl %edx, %eax +; SSSE3-NEXT: addb %cl, %al +; SSSE3-NEXT: setns %al +; SSSE3-NEXT: addb %cl, %dl +; SSSE3-NEXT: movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSSE3-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp) +; SSSE3-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; SSSE3-NEXT: movb %r12b, {{[-0-9]+}}(%r{{[sb]}}p) # 
1-byte Spill +; SSSE3-NEXT: jno .LBB2_101 +; SSSE3-NEXT: # %bb.100: +; SSSE3-NEXT: addb $127, %al +; SSSE3-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSSE3-NEXT: .LBB2_101: +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %r12b +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %r10b +; SSSE3-NEXT: movl %r12d, %eax +; SSSE3-NEXT: addb %cl, %al +; SSSE3-NEXT: setns %al +; SSSE3-NEXT: addb %cl, %r12b +; SSSE3-NEXT: jno .LBB2_103 +; SSSE3-NEXT: # %bb.102: +; SSSE3-NEXT: addb $127, %al +; SSSE3-NEXT: movl %eax, %r12d +; SSSE3-NEXT: .LBB2_103: +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSSE3-NEXT: movl %r10d, %eax +; SSSE3-NEXT: addb %cl, %al +; SSSE3-NEXT: setns %al +; SSSE3-NEXT: addb %cl, %r10b +; SSSE3-NEXT: jno .LBB2_105 +; SSSE3-NEXT: # %bb.104: +; SSSE3-NEXT: addb $127, %al +; SSSE3-NEXT: movl %eax, %r10d +; SSSE3-NEXT: .LBB2_105: +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %dl +; SSSE3-NEXT: movl %edx, %eax +; SSSE3-NEXT: addb %cl, %al +; SSSE3-NEXT: setns %al +; SSSE3-NEXT: addb %cl, %dl +; SSSE3-NEXT: jno .LBB2_107 +; SSSE3-NEXT: # %bb.106: +; SSSE3-NEXT: addb $127, %al +; SSSE3-NEXT: movl %eax, %edx +; SSSE3-NEXT: .LBB2_107: +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %bl +; SSSE3-NEXT: movl %ebx, %eax +; SSSE3-NEXT: addb %cl, %al +; SSSE3-NEXT: setns %al +; SSSE3-NEXT: addb %cl, %bl +; SSSE3-NEXT: movb %bl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSSE3-NEXT: movb %r14b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSSE3-NEXT: movb %r15b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSSE3-NEXT: movb %bpl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSSE3-NEXT: jno .LBB2_109 +; SSSE3-NEXT: # %bb.108: +; SSSE3-NEXT: addb $127, %al +; SSSE3-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSSE3-NEXT: .LBB2_109: +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %r14b +; SSSE3-NEXT: movl %r14d, %eax +; SSSE3-NEXT: addb %cl, %al +; SSSE3-NEXT: setns %al +; SSSE3-NEXT: addb %cl, %r14b +; SSSE3-NEXT: movb %r13b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSSE3-NEXT: jno .LBB2_111 +; SSSE3-NEXT: # %bb.110: +; SSSE3-NEXT: addb $127, %al +; SSSE3-NEXT: movl %eax, %r14d +; SSSE3-NEXT: .LBB2_111: +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %r15b +; SSSE3-NEXT: movl %r15d, %eax +; SSSE3-NEXT: addb %cl, %al +; SSSE3-NEXT: setns %al +; SSSE3-NEXT: addb %cl, %r15b +; SSSE3-NEXT: movb %r12b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSSE3-NEXT: movb %r10b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSSE3-NEXT: movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSSE3-NEXT: jno .LBB2_113 +; SSSE3-NEXT: # %bb.112: +; SSSE3-NEXT: addb $127, %al +; SSSE3-NEXT: movl %eax, %r15d +; SSSE3-NEXT: .LBB2_113: +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %r12b +; SSSE3-NEXT: movl %r12d, %eax +; SSSE3-NEXT: addb %cl, %al +; SSSE3-NEXT: setns %al +; SSSE3-NEXT: addb %cl, %r12b +; SSSE3-NEXT: movl %r8d, %edx +; SSSE3-NEXT: jno .LBB2_115 +; SSSE3-NEXT: # %bb.114: +; SSSE3-NEXT: addb $127, %al +; SSSE3-NEXT: movl %eax, %r12d +; SSSE3-NEXT: .LBB2_115: +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %r13b +; SSSE3-NEXT: movl %r13d, %eax +; SSSE3-NEXT: addb %cl, %al +; SSSE3-NEXT: setns %al +; SSSE3-NEXT: addb %cl, %r13b +; SSSE3-NEXT: movb %sil, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSSE3-NEXT: movb %dil, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; 
SSSE3-NEXT: movb %r11b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSSE3-NEXT: jno .LBB2_117 +; SSSE3-NEXT: # %bb.116: +; SSSE3-NEXT: addb $127, %al +; SSSE3-NEXT: movl %eax, %r13d +; SSSE3-NEXT: .LBB2_117: +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %sil +; SSSE3-NEXT: movl %esi, %eax +; SSSE3-NEXT: addb %cl, %al +; SSSE3-NEXT: setns %al +; SSSE3-NEXT: addb %cl, %sil +; SSSE3-NEXT: jno .LBB2_119 +; SSSE3-NEXT: # %bb.118: +; SSSE3-NEXT: addb $127, %al +; SSSE3-NEXT: movl %eax, %esi +; SSSE3-NEXT: .LBB2_119: +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %dil +; SSSE3-NEXT: movl %edi, %eax +; SSSE3-NEXT: addb %cl, %al +; SSSE3-NEXT: setns %al +; SSSE3-NEXT: addb %cl, %dil +; SSSE3-NEXT: jno .LBB2_121 +; SSSE3-NEXT: # %bb.120: +; SSSE3-NEXT: addb $127, %al +; SSSE3-NEXT: movl %eax, %edi +; SSSE3-NEXT: .LBB2_121: +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %r8b +; SSSE3-NEXT: movl %r8d, %eax +; SSSE3-NEXT: addb %cl, %al +; SSSE3-NEXT: setns %al +; SSSE3-NEXT: addb %cl, %r8b +; SSSE3-NEXT: jno .LBB2_123 +; SSSE3-NEXT: # %bb.122: +; SSSE3-NEXT: addb $127, %al +; SSSE3-NEXT: movl %eax, %r8d +; SSSE3-NEXT: .LBB2_123: +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %r10b +; SSSE3-NEXT: movl %r10d, %eax +; SSSE3-NEXT: addb %cl, %al +; SSSE3-NEXT: setns %al +; SSSE3-NEXT: addb %cl, %r10b +; SSSE3-NEXT: movb {{[-0-9]+}}(%r{{[sb]}}p), %bpl # 1-byte Reload +; SSSE3-NEXT: jno .LBB2_125 +; SSSE3-NEXT: # %bb.124: +; SSSE3-NEXT: addb $127, %al +; SSSE3-NEXT: movl %eax, %r10d +; SSSE3-NEXT: .LBB2_125: +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %r11b +; SSSE3-NEXT: movl %r11d, %eax +; SSSE3-NEXT: addb %cl, %al +; SSSE3-NEXT: setns %al +; SSSE3-NEXT: addb %cl, %r11b +; SSSE3-NEXT: jno .LBB2_127 +; SSSE3-NEXT: # %bb.126: +; SSSE3-NEXT: addb $127, %al +; SSSE3-NEXT: movl %eax, %r11d +; SSSE3-NEXT: .LBB2_127: +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %bl +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSSE3-NEXT: movl %ecx, %eax +; SSSE3-NEXT: addb %bl, %al +; SSSE3-NEXT: setns %al +; SSSE3-NEXT: addb %bl, %cl +; SSSE3-NEXT: movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSSE3-NEXT: movb %r9b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSSE3-NEXT: jno .LBB2_129 +; SSSE3-NEXT: # %bb.128: +; SSSE3-NEXT: addb $127, %al +; SSSE3-NEXT: movl %eax, %ecx +; SSSE3-NEXT: .LBB2_129: +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %dl +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %bl +; SSSE3-NEXT: movl %ebx, %eax +; SSSE3-NEXT: addb %dl, %al +; SSSE3-NEXT: setns %al +; SSSE3-NEXT: addb %dl, %bl +; SSSE3-NEXT: jno .LBB2_131 +; SSSE3-NEXT: # %bb.130: +; SSSE3-NEXT: addb $127, %al +; SSSE3-NEXT: movl %eax, %ebx +; SSSE3-NEXT: .LBB2_131: +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %dl +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %r9b +; SSSE3-NEXT: movl %r9d, %eax +; SSSE3-NEXT: addb %dl, %al +; SSSE3-NEXT: setns %al +; SSSE3-NEXT: addb %dl, %r9b +; SSSE3-NEXT: jno .LBB2_133 +; SSSE3-NEXT: # %bb.132: +; SSSE3-NEXT: addb $127, %al +; SSSE3-NEXT: movl %eax, %r9d +; SSSE3-NEXT: .LBB2_133: +; SSSE3-NEXT: movl %ebp, %eax +; SSSE3-NEXT: movb {{[-0-9]+}}(%r{{[sb]}}p), %dl # 1-byte Reload +; SSSE3-NEXT: movzbl %r9b, %ebp +; SSSE3-NEXT: movl %ebp, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSSE3-NEXT: movzbl %bl, %ebp +; SSSE3-NEXT: movl %ebp, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSSE3-NEXT: movzbl %cl, %ecx +; SSSE3-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill 
+; SSSE3-NEXT: movzbl %r11b, %ecx +; SSSE3-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSSE3-NEXT: movzbl %r10b, %ecx +; SSSE3-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSSE3-NEXT: movzbl %r8b, %ecx +; SSSE3-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSSE3-NEXT: movzbl %dil, %ecx +; SSSE3-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSSE3-NEXT: movzbl %sil, %ecx +; SSSE3-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSSE3-NEXT: movzbl %r13b, %ecx +; SSSE3-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSSE3-NEXT: movzbl %r12b, %ecx +; SSSE3-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSSE3-NEXT: movzbl %r15b, %ecx +; SSSE3-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSSE3-NEXT: movzbl %r14b, %ecx +; SSSE3-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload +; SSSE3-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload +; SSSE3-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload +; SSSE3-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload +; SSSE3-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload +; SSSE3-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload +; SSSE3-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload +; SSSE3-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload +; SSSE3-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload +; SSSE3-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload +; SSSE3-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSSE3-NEXT: movzbl %dl, %ecx +; SSSE3-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload +; SSSE3-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload +; SSSE3-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload +; SSSE3-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload +; SSSE3-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload +; SSSE3-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload +; SSSE3-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload +; SSSE3-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload +; SSSE3-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; 
SSSE3-NEXT: movzbl %al, %eax +; SSSE3-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r13d # 1-byte Folded Reload +; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r12d # 1-byte Folded Reload +; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r15d # 1-byte Folded Reload +; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r14d # 1-byte Folded Reload +; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ebp # 1-byte Folded Reload +; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ebx # 1-byte Folded Reload +; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r11d # 1-byte Folded Reload +; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r9d # 1-byte Folded Reload +; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload +; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %edx # 1-byte Folded Reload +; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 1-byte Folded Reload +; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 1-byte Folded Reload +; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r8d # 1-byte Folded Reload +; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r10d # 1-byte Folded Reload +; SSSE3-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 4-byte Folded Reload +; SSSE3-NEXT: # xmm14 = mem[0],zero,zero,zero +; SSSE3-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 4-byte Folded Reload +; SSSE3-NEXT: # xmm2 = mem[0],zero,zero,zero +; SSSE3-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 4-byte Folded Reload +; SSSE3-NEXT: # xmm12 = mem[0],zero,zero,zero +; SSSE3-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 4-byte Folded Reload +; SSSE3-NEXT: # xmm4 = mem[0],zero,zero,zero +; SSSE3-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 4-byte Folded Reload +; SSSE3-NEXT: # xmm8 = mem[0],zero,zero,zero +; SSSE3-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 4-byte Folded Reload +; SSSE3-NEXT: # xmm6 = mem[0],zero,zero,zero +; SSSE3-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 4-byte Folded Reload +; SSSE3-NEXT: # xmm10 = mem[0],zero,zero,zero +; SSSE3-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 4-byte Folded Reload +; SSSE3-NEXT: # xmm5 = mem[0],zero,zero,zero +; SSSE3-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 4-byte Folded Reload +; SSSE3-NEXT: # xmm13 = mem[0],zero,zero,zero +; SSSE3-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 4-byte Folded Reload +; SSSE3-NEXT: # xmm7 = mem[0],zero,zero,zero +; SSSE3-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 4-byte Folded Reload +; SSSE3-NEXT: # xmm11 = mem[0],zero,zero,zero +; SSSE3-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 4-byte Folded Reload +; SSSE3-NEXT: # xmm3 = mem[0],zero,zero,zero +; SSSE3-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload +; SSSE3-NEXT: # xmm0 = mem[0],zero,zero,zero +; SSSE3-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSSE3-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Folded Reload +; SSSE3-NEXT: # xmm1 = mem[0],zero,zero,zero +; SSSE3-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 4-byte Folded Reload +; SSSE3-NEXT: # xmm15 = mem[0],zero,zero,zero +; SSSE3-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 4-byte Folded Reload +; SSSE3-NEXT: # xmm9 = mem[0],zero,zero,zero +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm14[0],xmm2[1],xmm14[1],xmm2[2],xmm14[2],xmm2[3],xmm14[3],xmm2[4],xmm14[4],xmm2[5],xmm14[5],xmm2[6],xmm14[6],xmm2[7],xmm14[7] +; SSSE3-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload +; SSSE3-NEXT: # xmm0 = mem[0],zero,zero,zero +; 
SSSE3-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm12[0],xmm4[1],xmm12[1],xmm4[2],xmm12[2],xmm4[3],xmm12[3],xmm4[4],xmm12[4],xmm4[5],xmm12[5],xmm4[6],xmm12[6],xmm4[7],xmm12[7] +; SSSE3-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 4-byte Folded Reload +; SSSE3-NEXT: # xmm12 = mem[0],zero,zero,zero +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3] +; SSSE3-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload +; SSSE3-NEXT: # xmm0 = mem[0],zero,zero,zero +; SSSE3-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm8[0],xmm6[1],xmm8[1],xmm6[2],xmm8[2],xmm6[3],xmm8[3],xmm6[4],xmm8[4],xmm6[5],xmm8[5],xmm6[6],xmm8[6],xmm6[7],xmm8[7] +; SSSE3-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 4-byte Folded Reload +; SSSE3-NEXT: # xmm8 = mem[0],zero,zero,zero +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm10[0],xmm5[1],xmm10[1],xmm5[2],xmm10[2],xmm5[3],xmm10[3],xmm5[4],xmm10[4],xmm5[5],xmm10[5],xmm5[6],xmm10[6],xmm5[7],xmm10[7] +; SSSE3-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload +; SSSE3-NEXT: # xmm0 = mem[0],zero,zero,zero +; SSSE3-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3] +; SSSE3-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 4-byte Folded Reload +; SSSE3-NEXT: # xmm6 = mem[0],zero,zero,zero +; SSSE3-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] +; SSSE3-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 4-byte Folded Reload +; SSSE3-NEXT: # xmm14 = mem[0],zero,zero,zero +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm13[0],xmm7[1],xmm13[1],xmm7[2],xmm13[2],xmm7[3],xmm13[3],xmm7[4],xmm13[4],xmm7[5],xmm13[5],xmm7[6],xmm13[6],xmm7[7],xmm13[7] +; SSSE3-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 4-byte Folded Reload +; SSSE3-NEXT: # xmm2 = mem[0],zero,zero,zero +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm11[0],xmm3[1],xmm11[1],xmm3[2],xmm11[2],xmm3[3],xmm11[3],xmm3[4],xmm11[4],xmm3[5],xmm11[5],xmm3[6],xmm11[6],xmm3[7],xmm11[7] +; SSSE3-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 4-byte Folded Reload +; SSSE3-NEXT: # xmm11 = mem[0],zero,zero,zero +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm7[0],xmm3[1],xmm7[1],xmm3[2],xmm7[2],xmm3[3],xmm7[3] +; SSSE3-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 4-byte Folded Reload +; SSSE3-NEXT: # xmm10 = mem[0],zero,zero,zero +; SSSE3-NEXT: punpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSSE3-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3],xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] +; SSSE3-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 4-byte Folded Reload +; SSSE3-NEXT: # xmm13 = mem[0],zero,zero,zero +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm9 = xmm9[0],xmm15[0],xmm9[1],xmm15[1],xmm9[2],xmm15[2],xmm9[3],xmm15[3],xmm9[4],xmm15[4],xmm9[5],xmm15[5],xmm9[6],xmm15[6],xmm9[7],xmm15[7] +; SSSE3-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload +; SSSE3-NEXT: # xmm0 = mem[0],zero,zero,zero +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm1[0],xmm9[1],xmm1[1],xmm9[2],xmm1[2],xmm9[3],xmm1[3] +; SSSE3-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 4-byte Folded Reload +; SSSE3-NEXT: # xmm15 = mem[0],zero,zero,zero +; SSSE3-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm3[0],xmm9[1],xmm3[1] +; SSSE3-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), 
%xmm7 # 4-byte Folded Reload +; SSSE3-NEXT: # xmm7 = mem[0],zero,zero,zero +; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm9 = xmm9[0],xmm5[0] +; SSSE3-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 4-byte Folded Reload +; SSSE3-NEXT: # xmm5 = mem[0],zero,zero,zero +; SSSE3-NEXT: punpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload +; SSSE3-NEXT: # xmm12 = xmm12[0],mem[0],xmm12[1],mem[1],xmm12[2],mem[2],xmm12[3],mem[3],xmm12[4],mem[4],xmm12[5],mem[5],xmm12[6],mem[6],xmm12[7],mem[7] +; SSSE3-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 4-byte Folded Reload +; SSSE3-NEXT: # xmm4 = mem[0],zero,zero,zero +; SSSE3-NEXT: punpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload +; SSSE3-NEXT: # xmm8 = xmm8[0],mem[0],xmm8[1],mem[1],xmm8[2],mem[2],xmm8[3],mem[3],xmm8[4],mem[4],xmm8[5],mem[5],xmm8[6],mem[6],xmm8[7],mem[7] +; SSSE3-NEXT: movd %r13d, %xmm1 +; SSSE3-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm12[0],xmm8[1],xmm12[1],xmm8[2],xmm12[2],xmm8[3],xmm12[3] +; SSSE3-NEXT: movd %r12d, %xmm1 +; SSSE3-NEXT: punpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload +; SSSE3-NEXT: # xmm6 = xmm6[0],mem[0],xmm6[1],mem[1],xmm6[2],mem[2],xmm6[3],mem[3],xmm6[4],mem[4],xmm6[5],mem[5],xmm6[6],mem[6],xmm6[7],mem[7] +; SSSE3-NEXT: movd %r15d, %xmm3 +; SSSE3-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm14[0],xmm2[1],xmm14[1],xmm2[2],xmm14[2],xmm2[3],xmm14[3],xmm2[4],xmm14[4],xmm2[5],xmm14[5],xmm2[6],xmm14[6],xmm2[7],xmm14[7] +; SSSE3-NEXT: movd %r14d, %xmm14 +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1],xmm2[2],xmm6[2],xmm2[3],xmm6[3] +; SSSE3-NEXT: movd %ebp, %xmm3 +; SSSE3-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm8[0],xmm2[1],xmm8[1] +; SSSE3-NEXT: movd %ebx, %xmm12 +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm10 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3],xmm10[4],xmm11[4],xmm10[5],xmm11[5],xmm10[6],xmm11[6],xmm10[7],xmm11[7] +; SSSE3-NEXT: movd %r11d, %xmm8 +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm13[0],xmm0[1],xmm13[1],xmm0[2],xmm13[2],xmm0[3],xmm13[3],xmm0[4],xmm13[4],xmm0[5],xmm13[5],xmm0[6],xmm13[6],xmm0[7],xmm13[7] +; SSSE3-NEXT: movd %r9d, %xmm3 +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm10[0],xmm0[1],xmm10[1],xmm0[2],xmm10[2],xmm0[3],xmm10[3] +; SSSE3-NEXT: movd %eax, %xmm11 +; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r15d # 1-byte Folded Reload +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm15[0],xmm7[1],xmm15[1],xmm7[2],xmm15[2],xmm7[3],xmm15[3],xmm7[4],xmm15[4],xmm7[5],xmm15[5],xmm7[6],xmm15[6],xmm7[7],xmm15[7] +; SSSE3-NEXT: movd %ecx, %xmm6 +; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r12d # 1-byte Folded Reload +; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r9d # 1-byte Folded Reload +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3],xmm4[4],xmm5[4],xmm4[5],xmm5[5],xmm4[6],xmm5[6],xmm4[7],xmm5[7] +; SSSE3-NEXT: movd %edx, %xmm13 +; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r11d # 1-byte Folded Reload +; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r14d # 1-byte Folded Reload +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm7[0],xmm4[1],xmm7[1],xmm4[2],xmm7[2],xmm4[3],xmm7[3] +; SSSE3-NEXT: movd %edi, %xmm5 +; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 1-byte Folded Reload +; SSSE3-NEXT: movzbl 
{{[-0-9]+}}(%r{{[sb]}}p), %r13d # 1-byte Folded Reload +; SSSE3-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1] +; SSSE3-NEXT: movd %esi, %xmm15 +; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 1-byte Folded Reload +; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %edx # 1-byte Folded Reload +; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm2[0] +; SSSE3-NEXT: movd %r8d, %xmm0 +; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r8d # 1-byte Folded Reload +; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ebp # 1-byte Folded Reload +; SSSE3-NEXT: punpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSSE3-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3],xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] +; SSSE3-NEXT: movd %r10d, %xmm10 +; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload +; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r10d # 1-byte Folded Reload +; SSSE3-NEXT: punpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload +; SSSE3-NEXT: # xmm14 = xmm14[0],mem[0],xmm14[1],mem[1],xmm14[2],mem[2],xmm14[3],mem[3],xmm14[4],mem[4],xmm14[5],mem[5],xmm14[6],mem[6],xmm14[7],mem[7] +; SSSE3-NEXT: movd %r15d, %xmm2 +; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ebx # 1-byte Folded Reload +; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r15d # 1-byte Folded Reload +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm14 = xmm14[0],xmm1[0],xmm14[1],xmm1[1],xmm14[2],xmm1[2],xmm14[3],xmm1[3] +; SSSE3-NEXT: movd %r12d, %xmm1 +; SSSE3-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r12d # 1-byte Folded Reload +; SSSE3-NEXT: punpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload +; SSSE3-NEXT: # xmm12 = xmm12[0],mem[0],xmm12[1],mem[1],xmm12[2],mem[2],xmm12[3],mem[3],xmm12[4],mem[4],xmm12[5],mem[5],xmm12[6],mem[6],xmm12[7],mem[7] +; SSSE3-NEXT: movd %r9d, %xmm1 +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm8[0],xmm3[1],xmm8[1],xmm3[2],xmm8[2],xmm3[3],xmm8[3],xmm3[4],xmm8[4],xmm3[5],xmm8[5],xmm3[6],xmm8[6],xmm3[7],xmm8[7] +; SSSE3-NEXT: movd %r11d, %xmm8 +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm12[0],xmm3[1],xmm12[1],xmm3[2],xmm12[2],xmm3[3],xmm12[3] +; SSSE3-NEXT: movd %r14d, %xmm12 +; SSSE3-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm14[0],xmm3[1],xmm14[1] +; SSSE3-NEXT: movd %edi, %xmm7 +; SSSE3-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm11[0],xmm6[1],xmm11[1],xmm6[2],xmm11[2],xmm6[3],xmm11[3],xmm6[4],xmm11[4],xmm6[5],xmm11[5],xmm6[6],xmm11[6],xmm6[7],xmm11[7] +; SSSE3-NEXT: movd %r13d, %xmm11 +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm13[0],xmm5[1],xmm13[1],xmm5[2],xmm13[2],xmm5[3],xmm13[3],xmm5[4],xmm13[4],xmm5[5],xmm13[5],xmm5[6],xmm13[6],xmm5[7],xmm13[7] +; SSSE3-NEXT: movd %esi, %xmm14 +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3] +; SSSE3-NEXT: movd %edx, %xmm6 +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm15[0],xmm0[1],xmm15[1],xmm0[2],xmm15[2],xmm0[3],xmm15[3],xmm0[4],xmm15[4],xmm0[5],xmm15[5],xmm0[6],xmm15[6],xmm0[7],xmm15[7] +; SSSE3-NEXT: movd %r8d, %xmm15 +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm10[0],xmm2[1],xmm10[1],xmm2[2],xmm10[2],xmm2[3],xmm10[3],xmm2[4],xmm10[4],xmm2[5],xmm10[5],xmm2[6],xmm10[6],xmm2[7],xmm10[7] +; SSSE3-NEXT: movd %ebp, %xmm7 +; SSSE3-NEXT: punpcklwd {{.*#+}} 
xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] +; SSSE3-NEXT: movd %ecx, %xmm13 +; SSSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1] +; SSSE3-NEXT: movd %r10d, %xmm5 +; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0] +; SSSE3-NEXT: movd %ebx, %xmm10 +; SSSE3-NEXT: punpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSSE3-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3],xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] +; SSSE3-NEXT: movd %r15d, %xmm0 +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm12 = xmm12[0],xmm8[0],xmm12[1],xmm8[1],xmm12[2],xmm8[2],xmm12[3],xmm8[3],xmm12[4],xmm8[4],xmm12[5],xmm8[5],xmm12[6],xmm8[6],xmm12[7],xmm8[7] +; SSSE3-NEXT: movd %eax, %xmm8 +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm12 = xmm12[0],xmm1[0],xmm12[1],xmm1[1],xmm12[2],xmm1[2],xmm12[3],xmm1[3] +; SSSE3-NEXT: movd %r12d, %xmm3 +; SSSE3-NEXT: punpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload +; SSSE3-NEXT: # xmm11 = xmm11[0],mem[0],xmm11[1],mem[1],xmm11[2],mem[2],xmm11[3],mem[3],xmm11[4],mem[4],xmm11[5],mem[5],xmm11[6],mem[6],xmm11[7],mem[7] +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm14[0],xmm6[1],xmm14[1],xmm6[2],xmm14[2],xmm6[3],xmm14[3],xmm6[4],xmm14[4],xmm6[5],xmm14[5],xmm6[6],xmm14[6],xmm6[7],xmm14[7] +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm11[0],xmm6[1],xmm11[1],xmm6[2],xmm11[2],xmm6[3],xmm11[3] +; SSSE3-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm12[0],xmm6[1],xmm12[1] +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm15[0],xmm7[1],xmm15[1],xmm7[2],xmm15[2],xmm7[3],xmm15[3],xmm7[4],xmm15[4],xmm7[5],xmm15[5],xmm7[6],xmm15[6],xmm7[7],xmm15[7] +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm13[0],xmm5[1],xmm13[1],xmm5[2],xmm13[2],xmm5[3],xmm13[3],xmm5[4],xmm13[4],xmm5[5],xmm13[5],xmm5[6],xmm13[6],xmm5[7],xmm13[7] +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm7[0],xmm5[1],xmm7[1],xmm5[2],xmm7[2],xmm5[3],xmm7[3] +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm10[0],xmm0[1],xmm10[1],xmm0[2],xmm10[2],xmm0[3],xmm10[3],xmm0[4],xmm10[4],xmm0[5],xmm10[5],xmm0[6],xmm10[6],xmm0[7],xmm10[7] +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm8[0],xmm3[1],xmm8[1],xmm3[2],xmm8[2],xmm3[3],xmm8[3],xmm3[4],xmm8[4],xmm3[5],xmm8[5],xmm3[6],xmm8[6],xmm3[7],xmm8[7] +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3] +; SSSE3-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1] +; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm6[0] +; SSSE3-NEXT: movdqa %xmm9, %xmm0 +; SSSE3-NEXT: movdqa %xmm4, %xmm1 +; SSSE3-NEXT: addq $232, %rsp +; SSSE3-NEXT: popq %rbx +; SSSE3-NEXT: popq %r12 +; SSSE3-NEXT: popq %r13 +; SSSE3-NEXT: popq %r14 +; SSSE3-NEXT: popq %r15 +; SSSE3-NEXT: popq %rbp +; SSSE3-NEXT: retq +; +; SSE41-LABEL: v64i8: +; SSE41: # %bb.0: +; SSE41-NEXT: pextrb $15, %xmm5, %ecx +; SSE41-NEXT: pextrb $15, %xmm1, %r11d +; SSE41-NEXT: movl %r11d, %eax +; SSE41-NEXT: addb %cl, %al +; SSE41-NEXT: setns %al +; SSE41-NEXT: addb %cl, %r11b +; SSE41-NEXT: jno .LBB2_2 +; SSE41-NEXT: # %bb.1: +; SSE41-NEXT: addb $127, %al +; SSE41-NEXT: movl %eax, %r11d +; SSE41-NEXT: .LBB2_2: +; SSE41-NEXT: pextrb $14, %xmm5, %ecx +; SSE41-NEXT: pextrb $14, %xmm1, %esi +; SSE41-NEXT: movl %esi, %eax +; SSE41-NEXT: addb %cl, %al +; SSE41-NEXT: setns %al +; SSE41-NEXT: addb %cl, %sil +; SSE41-NEXT: jno .LBB2_4 +; SSE41-NEXT: # %bb.3: +; SSE41-NEXT: addb $127, %al +; SSE41-NEXT: movl %eax, %esi +; SSE41-NEXT: .LBB2_4: +; SSE41-NEXT: pextrb $13, 
%xmm5, %ecx +; SSE41-NEXT: pextrb $13, %xmm1, %edx +; SSE41-NEXT: movl %edx, %eax +; SSE41-NEXT: addb %cl, %al +; SSE41-NEXT: setns %al +; SSE41-NEXT: addb %cl, %dl +; SSE41-NEXT: jno .LBB2_6 +; SSE41-NEXT: # %bb.5: +; SSE41-NEXT: addb $127, %al +; SSE41-NEXT: movl %eax, %edx +; SSE41-NEXT: .LBB2_6: +; SSE41-NEXT: pextrb $12, %xmm5, %ecx +; SSE41-NEXT: pextrb $12, %xmm1, %edi +; SSE41-NEXT: movl %edi, %eax +; SSE41-NEXT: addb %cl, %al +; SSE41-NEXT: setns %al +; SSE41-NEXT: addb %cl, %dil +; SSE41-NEXT: jno .LBB2_8 +; SSE41-NEXT: # %bb.7: +; SSE41-NEXT: addb $127, %al +; SSE41-NEXT: movl %eax, %edi +; SSE41-NEXT: .LBB2_8: +; SSE41-NEXT: pushq %rbp +; SSE41-NEXT: pushq %r15 +; SSE41-NEXT: pushq %r14 +; SSE41-NEXT: pushq %r13 +; SSE41-NEXT: pushq %r12 +; SSE41-NEXT: pushq %rbx +; SSE41-NEXT: subq $76, %rsp +; SSE41-NEXT: pextrb $11, %xmm5, %ecx +; SSE41-NEXT: pextrb $11, %xmm1, %ebp +; SSE41-NEXT: movl %ebp, %eax +; SSE41-NEXT: addb %cl, %al +; SSE41-NEXT: setns %al +; SSE41-NEXT: addb %cl, %bpl +; SSE41-NEXT: jno .LBB2_10 +; SSE41-NEXT: # %bb.9: +; SSE41-NEXT: addb $127, %al +; SSE41-NEXT: movl %eax, %ebp +; SSE41-NEXT: .LBB2_10: +; SSE41-NEXT: pextrb $10, %xmm5, %ecx +; SSE41-NEXT: pextrb $10, %xmm1, %ebx +; SSE41-NEXT: movl %ebx, %eax +; SSE41-NEXT: addb %cl, %al +; SSE41-NEXT: setns %al +; SSE41-NEXT: addb %cl, %bl +; SSE41-NEXT: movl %edx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE41-NEXT: jno .LBB2_12 +; SSE41-NEXT: # %bb.11: +; SSE41-NEXT: addb $127, %al +; SSE41-NEXT: movl %eax, %ebx +; SSE41-NEXT: .LBB2_12: +; SSE41-NEXT: pextrb $9, %xmm5, %ecx +; SSE41-NEXT: pextrb $9, %xmm1, %edx +; SSE41-NEXT: movl %edx, %eax +; SSE41-NEXT: addb %cl, %al +; SSE41-NEXT: setns %al +; SSE41-NEXT: addb %cl, %dl +; SSE41-NEXT: movl %ebp, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE41-NEXT: movl %ebx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE41-NEXT: jo .LBB2_13 +; SSE41-NEXT: # %bb.14: +; SSE41-NEXT: movl %edx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE41-NEXT: jmp .LBB2_15 +; SSE41-NEXT: .LBB2_13: +; SSE41-NEXT: addb $127, %al +; SSE41-NEXT: # kill: def $al killed $al def $eax +; SSE41-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE41-NEXT: .LBB2_15: +; SSE41-NEXT: pextrb $8, %xmm5, %ecx +; SSE41-NEXT: pextrb $8, %xmm1, %edx +; SSE41-NEXT: movl %edx, %eax +; SSE41-NEXT: addb %cl, %al +; SSE41-NEXT: setns %al +; SSE41-NEXT: addb %cl, %dl +; SSE41-NEXT: jno .LBB2_17 +; SSE41-NEXT: # %bb.16: +; SSE41-NEXT: addb $127, %al +; SSE41-NEXT: movl %eax, %edx +; SSE41-NEXT: .LBB2_17: +; SSE41-NEXT: pextrb $7, %xmm5, %ecx +; SSE41-NEXT: pextrb $7, %xmm1, %ebp +; SSE41-NEXT: movl %ebp, %eax +; SSE41-NEXT: addb %cl, %al +; SSE41-NEXT: setns %al +; SSE41-NEXT: addb %cl, %bpl +; SSE41-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE41-NEXT: jno .LBB2_19 +; SSE41-NEXT: # %bb.18: +; SSE41-NEXT: addb $127, %al +; SSE41-NEXT: movl %eax, %ebp +; SSE41-NEXT: .LBB2_19: +; SSE41-NEXT: pextrb $6, %xmm5, %ecx +; SSE41-NEXT: pextrb $6, %xmm1, %esi +; SSE41-NEXT: movl %esi, %eax +; SSE41-NEXT: addb %cl, %al +; SSE41-NEXT: setns %al +; SSE41-NEXT: addb %cl, %sil +; SSE41-NEXT: movl %r11d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE41-NEXT: jno .LBB2_21 +; SSE41-NEXT: # %bb.20: +; SSE41-NEXT: addb $127, %al +; SSE41-NEXT: movl %eax, %esi +; SSE41-NEXT: .LBB2_21: +; SSE41-NEXT: pextrb $5, %xmm5, %ecx +; SSE41-NEXT: pextrb $5, %xmm1, %ebx +; SSE41-NEXT: movl %ebx, %eax +; SSE41-NEXT: addb %cl, %al +; SSE41-NEXT: setns %al +; SSE41-NEXT: addb %cl, %bl +; SSE41-NEXT: movl %edi, 
{{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE41-NEXT: movl %ebp, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE41-NEXT: jo .LBB2_22 +; SSE41-NEXT: # %bb.23: +; SSE41-NEXT: movl %ebx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE41-NEXT: jmp .LBB2_24 +; SSE41-NEXT: .LBB2_22: +; SSE41-NEXT: addb $127, %al +; SSE41-NEXT: # kill: def $al killed $al def $eax +; SSE41-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE41-NEXT: .LBB2_24: +; SSE41-NEXT: pextrb $4, %xmm5, %ecx +; SSE41-NEXT: pextrb $4, %xmm1, %r13d +; SSE41-NEXT: movl %r13d, %eax +; SSE41-NEXT: addb %cl, %al +; SSE41-NEXT: setns %al +; SSE41-NEXT: addb %cl, %r13b +; SSE41-NEXT: jno .LBB2_26 +; SSE41-NEXT: # %bb.25: +; SSE41-NEXT: addb $127, %al +; SSE41-NEXT: movl %eax, %r13d +; SSE41-NEXT: .LBB2_26: +; SSE41-NEXT: pextrb $3, %xmm5, %ecx +; SSE41-NEXT: pextrb $3, %xmm1, %ebx +; SSE41-NEXT: movl %ebx, %eax +; SSE41-NEXT: addb %cl, %al +; SSE41-NEXT: setns %al +; SSE41-NEXT: addb %cl, %bl +; SSE41-NEXT: jno .LBB2_28 +; SSE41-NEXT: # %bb.27: +; SSE41-NEXT: addb $127, %al +; SSE41-NEXT: movl %eax, %ebx +; SSE41-NEXT: .LBB2_28: +; SSE41-NEXT: pextrb $2, %xmm5, %ecx +; SSE41-NEXT: pextrb $2, %xmm1, %ebp +; SSE41-NEXT: movl %ebp, %eax +; SSE41-NEXT: addb %cl, %al +; SSE41-NEXT: setns %al +; SSE41-NEXT: addb %cl, %bpl +; SSE41-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE41-NEXT: jno .LBB2_30 +; SSE41-NEXT: # %bb.29: +; SSE41-NEXT: addb $127, %al +; SSE41-NEXT: movl %eax, %ebp +; SSE41-NEXT: .LBB2_30: +; SSE41-NEXT: pextrb $0, %xmm5, %ecx +; SSE41-NEXT: pextrb $0, %xmm1, %esi +; SSE41-NEXT: movl %esi, %eax +; SSE41-NEXT: addb %cl, %al +; SSE41-NEXT: setns %al +; SSE41-NEXT: addb %cl, %sil +; SSE41-NEXT: movl %edx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE41-NEXT: jno .LBB2_32 +; SSE41-NEXT: # %bb.31: +; SSE41-NEXT: addb $127, %al +; SSE41-NEXT: movl %eax, %esi +; SSE41-NEXT: .LBB2_32: +; SSE41-NEXT: pextrb $1, %xmm5, %ecx +; SSE41-NEXT: pextrb $1, %xmm1, %edx +; SSE41-NEXT: movl %edx, %eax +; SSE41-NEXT: addb %cl, %al +; SSE41-NEXT: setns %al +; SSE41-NEXT: addb %cl, %dl +; SSE41-NEXT: movl %ebx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE41-NEXT: jno .LBB2_34 +; SSE41-NEXT: # %bb.33: +; SSE41-NEXT: addb $127, %al +; SSE41-NEXT: movl %eax, %edx +; SSE41-NEXT: .LBB2_34: +; SSE41-NEXT: pextrb $15, %xmm6, %ecx +; SSE41-NEXT: pextrb $15, %xmm2, %r9d +; SSE41-NEXT: movl %r9d, %eax +; SSE41-NEXT: addb %cl, %al +; SSE41-NEXT: setns %al +; SSE41-NEXT: addb %cl, %r9b +; SSE41-NEXT: movl %ebp, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE41-NEXT: jno .LBB2_36 +; SSE41-NEXT: # %bb.35: +; SSE41-NEXT: addb $127, %al +; SSE41-NEXT: movl %eax, %r9d +; SSE41-NEXT: .LBB2_36: +; SSE41-NEXT: pextrb $14, %xmm6, %ecx +; SSE41-NEXT: pextrb $14, %xmm2, %ebp +; SSE41-NEXT: movl %ebp, %eax +; SSE41-NEXT: addb %cl, %al +; SSE41-NEXT: setns %al +; SSE41-NEXT: addb %cl, %bpl +; SSE41-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE41-NEXT: jno .LBB2_38 +; SSE41-NEXT: # %bb.37: +; SSE41-NEXT: addb $127, %al +; SSE41-NEXT: movl %eax, %ebp +; SSE41-NEXT: .LBB2_38: +; SSE41-NEXT: pextrb $13, %xmm6, %ecx +; SSE41-NEXT: pextrb $13, %xmm2, %r14d +; SSE41-NEXT: movl %r14d, %eax +; SSE41-NEXT: addb %cl, %al +; SSE41-NEXT: setns %al +; SSE41-NEXT: addb %cl, %r14b +; SSE41-NEXT: movl %edx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE41-NEXT: jno .LBB2_40 +; SSE41-NEXT: # %bb.39: +; SSE41-NEXT: addb $127, %al +; SSE41-NEXT: movl %eax, %r14d +; SSE41-NEXT: .LBB2_40: +; SSE41-NEXT: pextrb $12, %xmm6, %ecx +; SSE41-NEXT: 
pextrb $12, %xmm2, %r10d +; SSE41-NEXT: movl %r10d, %eax +; SSE41-NEXT: addb %cl, %al +; SSE41-NEXT: setns %al +; SSE41-NEXT: addb %cl, %r10b +; SSE41-NEXT: jno .LBB2_42 +; SSE41-NEXT: # %bb.41: +; SSE41-NEXT: addb $127, %al +; SSE41-NEXT: movl %eax, %r10d +; SSE41-NEXT: .LBB2_42: +; SSE41-NEXT: pextrb $11, %xmm6, %ecx +; SSE41-NEXT: pextrb $11, %xmm2, %ebx +; SSE41-NEXT: movl %ebx, %eax +; SSE41-NEXT: addb %cl, %al +; SSE41-NEXT: setns %al +; SSE41-NEXT: addb %cl, %bl +; SSE41-NEXT: movl %ebp, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE41-NEXT: jno .LBB2_44 +; SSE41-NEXT: # %bb.43: +; SSE41-NEXT: addb $127, %al +; SSE41-NEXT: movl %eax, %ebx +; SSE41-NEXT: .LBB2_44: +; SSE41-NEXT: pextrb $10, %xmm6, %ecx +; SSE41-NEXT: pextrb $10, %xmm2, %ebp +; SSE41-NEXT: movl %ebp, %eax +; SSE41-NEXT: addb %cl, %al +; SSE41-NEXT: setns %al +; SSE41-NEXT: addb %cl, %bpl +; SSE41-NEXT: jno .LBB2_46 +; SSE41-NEXT: # %bb.45: +; SSE41-NEXT: addb $127, %al +; SSE41-NEXT: movl %eax, %ebp +; SSE41-NEXT: .LBB2_46: +; SSE41-NEXT: pextrb $9, %xmm6, %ecx +; SSE41-NEXT: pextrb $9, %xmm2, %edi +; SSE41-NEXT: movl %edi, %eax +; SSE41-NEXT: addb %cl, %al +; SSE41-NEXT: setns %al +; SSE41-NEXT: addb %cl, %dil +; SSE41-NEXT: jno .LBB2_48 +; SSE41-NEXT: # %bb.47: +; SSE41-NEXT: addb $127, %al +; SSE41-NEXT: movl %eax, %edi +; SSE41-NEXT: .LBB2_48: +; SSE41-NEXT: pextrb $8, %xmm6, %ecx +; SSE41-NEXT: pextrb $8, %xmm2, %edx +; SSE41-NEXT: movl %edx, %eax +; SSE41-NEXT: addb %cl, %al +; SSE41-NEXT: setns %al +; SSE41-NEXT: addb %cl, %dl +; SSE41-NEXT: jno .LBB2_50 +; SSE41-NEXT: # %bb.49: +; SSE41-NEXT: addb $127, %al +; SSE41-NEXT: movl %eax, %edx +; SSE41-NEXT: .LBB2_50: +; SSE41-NEXT: pextrb $7, %xmm6, %ecx +; SSE41-NEXT: pextrb $7, %xmm2, %esi +; SSE41-NEXT: movl %esi, %eax +; SSE41-NEXT: addb %cl, %al +; SSE41-NEXT: setns %al +; SSE41-NEXT: addb %cl, %sil +; SSE41-NEXT: movl %r13d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE41-NEXT: jno .LBB2_52 +; SSE41-NEXT: # %bb.51: +; SSE41-NEXT: addb $127, %al +; SSE41-NEXT: movl %eax, %esi +; SSE41-NEXT: .LBB2_52: +; SSE41-NEXT: pextrb $6, %xmm6, %ecx +; SSE41-NEXT: pextrb $6, %xmm2, %r8d +; SSE41-NEXT: movl %r8d, %eax +; SSE41-NEXT: addb %cl, %al +; SSE41-NEXT: setns %al +; SSE41-NEXT: addb %cl, %r8b +; SSE41-NEXT: jno .LBB2_54 +; SSE41-NEXT: # %bb.53: +; SSE41-NEXT: addb $127, %al +; SSE41-NEXT: movl %eax, %r8d +; SSE41-NEXT: .LBB2_54: +; SSE41-NEXT: pextrb $5, %xmm6, %ecx +; SSE41-NEXT: pextrb $5, %xmm2, %r11d +; SSE41-NEXT: movl %r11d, %eax +; SSE41-NEXT: addb %cl, %al +; SSE41-NEXT: setns %al +; SSE41-NEXT: addb %cl, %r11b +; SSE41-NEXT: movl %ebx, (%rsp) # 4-byte Spill +; SSE41-NEXT: movl %ebp, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE41-NEXT: movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE41-NEXT: movl %edx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE41-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE41-NEXT: jno .LBB2_56 +; SSE41-NEXT: # %bb.55: +; SSE41-NEXT: addb $127, %al +; SSE41-NEXT: movl %eax, %r11d +; SSE41-NEXT: .LBB2_56: +; SSE41-NEXT: pextrb $4, %xmm6, %ecx +; SSE41-NEXT: pextrb $4, %xmm2, %edx +; SSE41-NEXT: movl %edx, %eax +; SSE41-NEXT: addb %cl, %al +; SSE41-NEXT: setns %al +; SSE41-NEXT: addb %cl, %dl +; SSE41-NEXT: movl %r14d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE41-NEXT: jno .LBB2_58 +; SSE41-NEXT: # %bb.57: +; SSE41-NEXT: addb $127, %al +; SSE41-NEXT: movl %eax, %edx +; SSE41-NEXT: .LBB2_58: +; SSE41-NEXT: pextrb $3, %xmm6, %ecx +; SSE41-NEXT: pextrb $3, %xmm2, %esi +; SSE41-NEXT: movl %esi, %eax +; 
SSE41-NEXT: addb %cl, %al +; SSE41-NEXT: setns %al +; SSE41-NEXT: addb %cl, %sil +; SSE41-NEXT: movl %r10d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE41-NEXT: movl %edx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE41-NEXT: jno .LBB2_60 +; SSE41-NEXT: # %bb.59: +; SSE41-NEXT: addb $127, %al +; SSE41-NEXT: movl %eax, %esi +; SSE41-NEXT: .LBB2_60: +; SSE41-NEXT: pextrb $2, %xmm6, %ecx +; SSE41-NEXT: pextrb $2, %xmm2, %edx +; SSE41-NEXT: movl %edx, %eax +; SSE41-NEXT: addb %cl, %al +; SSE41-NEXT: setns %al +; SSE41-NEXT: addb %cl, %dl +; SSE41-NEXT: movl %r9d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE41-NEXT: jno .LBB2_62 +; SSE41-NEXT: # %bb.61: +; SSE41-NEXT: addb $127, %al +; SSE41-NEXT: movl %eax, %edx +; SSE41-NEXT: .LBB2_62: +; SSE41-NEXT: pextrb $0, %xmm6, %ecx +; SSE41-NEXT: pextrb $0, %xmm2, %edi +; SSE41-NEXT: movl %edi, %eax +; SSE41-NEXT: addb %cl, %al +; SSE41-NEXT: setns %al +; SSE41-NEXT: addb %cl, %dil +; SSE41-NEXT: jno .LBB2_64 +; SSE41-NEXT: # %bb.63: +; SSE41-NEXT: addb $127, %al +; SSE41-NEXT: movl %eax, %edi +; SSE41-NEXT: .LBB2_64: +; SSE41-NEXT: pextrb $1, %xmm6, %ecx +; SSE41-NEXT: pextrb $1, %xmm2, %ebx +; SSE41-NEXT: movl %ebx, %eax +; SSE41-NEXT: addb %cl, %al +; SSE41-NEXT: setns %al +; SSE41-NEXT: addb %cl, %bl +; SSE41-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE41-NEXT: jno .LBB2_66 +; SSE41-NEXT: # %bb.65: +; SSE41-NEXT: addb $127, %al +; SSE41-NEXT: movl %eax, %ebx +; SSE41-NEXT: .LBB2_66: +; SSE41-NEXT: pextrb $15, %xmm7, %ecx +; SSE41-NEXT: pextrb $15, %xmm3, %esi +; SSE41-NEXT: movl %esi, %eax +; SSE41-NEXT: addb %cl, %al +; SSE41-NEXT: setns %al +; SSE41-NEXT: addb %cl, %sil +; SSE41-NEXT: movl %edx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE41-NEXT: jno .LBB2_68 +; SSE41-NEXT: # %bb.67: +; SSE41-NEXT: addb $127, %al +; SSE41-NEXT: movl %eax, %esi +; SSE41-NEXT: .LBB2_68: +; SSE41-NEXT: pextrb $14, %xmm7, %ecx +; SSE41-NEXT: pextrb $14, %xmm3, %edx +; SSE41-NEXT: movl %edx, %eax +; SSE41-NEXT: addb %cl, %al +; SSE41-NEXT: setns %al +; SSE41-NEXT: addb %cl, %dl +; SSE41-NEXT: movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE41-NEXT: jno .LBB2_70 +; SSE41-NEXT: # %bb.69: +; SSE41-NEXT: addb $127, %al +; SSE41-NEXT: movl %eax, %edx +; SSE41-NEXT: .LBB2_70: +; SSE41-NEXT: pextrb $13, %xmm7, %ecx +; SSE41-NEXT: pextrb $13, %xmm3, %edi +; SSE41-NEXT: movl %edi, %eax +; SSE41-NEXT: addb %cl, %al +; SSE41-NEXT: setns %al +; SSE41-NEXT: addb %cl, %dil +; SSE41-NEXT: movl %ebx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE41-NEXT: jno .LBB2_72 +; SSE41-NEXT: # %bb.71: +; SSE41-NEXT: addb $127, %al +; SSE41-NEXT: movl %eax, %edi +; SSE41-NEXT: .LBB2_72: +; SSE41-NEXT: pextrb $12, %xmm7, %ecx +; SSE41-NEXT: pextrb $12, %xmm3, %r15d +; SSE41-NEXT: movl %r15d, %eax +; SSE41-NEXT: addb %cl, %al +; SSE41-NEXT: setns %al +; SSE41-NEXT: addb %cl, %r15b +; SSE41-NEXT: movl %r15d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE41-NEXT: movl %edx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE41-NEXT: jno .LBB2_74 +; SSE41-NEXT: # %bb.73: +; SSE41-NEXT: addb $127, %al +; SSE41-NEXT: # kill: def $al killed $al def $eax +; SSE41-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE41-NEXT: .LBB2_74: +; SSE41-NEXT: pextrb $11, %xmm7, %ecx +; SSE41-NEXT: pextrb $11, %xmm3, %edx +; SSE41-NEXT: movl %edx, %eax +; SSE41-NEXT: addb %cl, %al +; SSE41-NEXT: setns %al +; SSE41-NEXT: addb %cl, %dl +; SSE41-NEXT: jno .LBB2_76 +; SSE41-NEXT: # %bb.75: +; SSE41-NEXT: addb $127, %al +; SSE41-NEXT: movl %eax, %edx +; SSE41-NEXT: .LBB2_76: +; 
SSE41-NEXT: pextrb $10, %xmm7, %ecx +; SSE41-NEXT: pextrb $10, %xmm3, %ebp +; SSE41-NEXT: movl %ebp, %eax +; SSE41-NEXT: addb %cl, %al +; SSE41-NEXT: setns %al +; SSE41-NEXT: addb %cl, %bpl +; SSE41-NEXT: movl %r11d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE41-NEXT: jno .LBB2_78 +; SSE41-NEXT: # %bb.77: +; SSE41-NEXT: addb $127, %al +; SSE41-NEXT: movl %eax, %ebp +; SSE41-NEXT: .LBB2_78: +; SSE41-NEXT: pextrb $9, %xmm7, %ecx +; SSE41-NEXT: pextrb $9, %xmm3, %r11d +; SSE41-NEXT: movl %r11d, %eax +; SSE41-NEXT: addb %cl, %al +; SSE41-NEXT: setns %al +; SSE41-NEXT: addb %cl, %r11b +; SSE41-NEXT: movl %r8d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE41-NEXT: jno .LBB2_80 +; SSE41-NEXT: # %bb.79: +; SSE41-NEXT: addb $127, %al +; SSE41-NEXT: movl %eax, %r11d +; SSE41-NEXT: .LBB2_80: +; SSE41-NEXT: pextrb $8, %xmm7, %ecx +; SSE41-NEXT: pextrb $8, %xmm3, %r8d +; SSE41-NEXT: movl %r8d, %eax +; SSE41-NEXT: addb %cl, %al +; SSE41-NEXT: setns %al +; SSE41-NEXT: addb %cl, %r8b +; SSE41-NEXT: movl %ebp, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE41-NEXT: jno .LBB2_82 +; SSE41-NEXT: # %bb.81: +; SSE41-NEXT: addb $127, %al +; SSE41-NEXT: movl %eax, %r8d +; SSE41-NEXT: .LBB2_82: +; SSE41-NEXT: pextrb $7, %xmm7, %ecx +; SSE41-NEXT: pextrb $7, %xmm3, %r9d +; SSE41-NEXT: movl %r9d, %eax +; SSE41-NEXT: addb %cl, %al +; SSE41-NEXT: setns %al +; SSE41-NEXT: addb %cl, %r9b +; SSE41-NEXT: jno .LBB2_84 +; SSE41-NEXT: # %bb.83: +; SSE41-NEXT: addb $127, %al +; SSE41-NEXT: movl %eax, %r9d +; SSE41-NEXT: .LBB2_84: +; SSE41-NEXT: pextrb $6, %xmm7, %ecx +; SSE41-NEXT: pextrb $6, %xmm3, %r10d +; SSE41-NEXT: movl %r10d, %eax +; SSE41-NEXT: addb %cl, %al +; SSE41-NEXT: setns %al +; SSE41-NEXT: addb %cl, %r10b +; SSE41-NEXT: jno .LBB2_86 +; SSE41-NEXT: # %bb.85: +; SSE41-NEXT: addb $127, %al +; SSE41-NEXT: movl %eax, %r10d +; SSE41-NEXT: .LBB2_86: +; SSE41-NEXT: pextrb $5, %xmm7, %ecx +; SSE41-NEXT: pextrb $5, %xmm3, %r14d +; SSE41-NEXT: movl %r14d, %eax +; SSE41-NEXT: addb %cl, %al +; SSE41-NEXT: setns %al +; SSE41-NEXT: addb %cl, %r14b +; SSE41-NEXT: jno .LBB2_88 +; SSE41-NEXT: # %bb.87: +; SSE41-NEXT: addb $127, %al +; SSE41-NEXT: movl %eax, %r14d +; SSE41-NEXT: .LBB2_88: +; SSE41-NEXT: pextrb $4, %xmm7, %ecx +; SSE41-NEXT: pextrb $4, %xmm3, %r12d +; SSE41-NEXT: movl %r12d, %eax +; SSE41-NEXT: addb %cl, %al +; SSE41-NEXT: setns %al +; SSE41-NEXT: addb %cl, %r12b +; SSE41-NEXT: movl %r12d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE41-NEXT: jno .LBB2_90 +; SSE41-NEXT: # %bb.89: +; SSE41-NEXT: addb $127, %al +; SSE41-NEXT: # kill: def $al killed $al def $eax +; SSE41-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE41-NEXT: .LBB2_90: +; SSE41-NEXT: pextrb $3, %xmm7, %ecx +; SSE41-NEXT: pextrb $3, %xmm3, %r13d +; SSE41-NEXT: movl %r13d, %eax +; SSE41-NEXT: addb %cl, %al +; SSE41-NEXT: setns %al +; SSE41-NEXT: addb %cl, %r13b +; SSE41-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE41-NEXT: jno .LBB2_92 +; SSE41-NEXT: # %bb.91: +; SSE41-NEXT: addb $127, %al +; SSE41-NEXT: movl %eax, %r13d +; SSE41-NEXT: .LBB2_92: +; SSE41-NEXT: pextrb $2, %xmm7, %ecx +; SSE41-NEXT: pextrb $2, %xmm3, %esi +; SSE41-NEXT: movl %esi, %eax +; SSE41-NEXT: addb %cl, %al +; SSE41-NEXT: setns %al +; SSE41-NEXT: addb %cl, %sil +; SSE41-NEXT: movl %edx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE41-NEXT: jno .LBB2_94 +; SSE41-NEXT: # %bb.93: +; SSE41-NEXT: addb $127, %al +; SSE41-NEXT: movl %eax, %esi +; SSE41-NEXT: .LBB2_94: +; SSE41-NEXT: pextrb $0, %xmm7, %ecx +; SSE41-NEXT: pextrb $0, %xmm3, %edx +; 
SSE41-NEXT: movl %edx, %eax +; SSE41-NEXT: addb %cl, %al +; SSE41-NEXT: setns %al +; SSE41-NEXT: movl %edi, %r15d +; SSE41-NEXT: addb %cl, %dl +; SSE41-NEXT: jno .LBB2_96 +; SSE41-NEXT: # %bb.95: +; SSE41-NEXT: addb $127, %al +; SSE41-NEXT: movl %eax, %edx +; SSE41-NEXT: .LBB2_96: +; SSE41-NEXT: pextrb $1, %xmm7, %ecx +; SSE41-NEXT: pextrb $1, %xmm3, %edi +; SSE41-NEXT: movl %edi, %eax +; SSE41-NEXT: addb %cl, %al +; SSE41-NEXT: setns %al +; SSE41-NEXT: addb %cl, %dil +; SSE41-NEXT: jno .LBB2_98 +; SSE41-NEXT: # %bb.97: +; SSE41-NEXT: addb $127, %al +; SSE41-NEXT: movl %eax, %edi +; SSE41-NEXT: .LBB2_98: +; SSE41-NEXT: pextrb $15, %xmm4, %ecx +; SSE41-NEXT: pextrb $15, %xmm0, %r12d +; SSE41-NEXT: movl %r12d, %eax +; SSE41-NEXT: addb %cl, %al +; SSE41-NEXT: setns %al +; SSE41-NEXT: addb %cl, %r12b +; SSE41-NEXT: movl %r12d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE41-NEXT: jno .LBB2_100 +; SSE41-NEXT: # %bb.99: +; SSE41-NEXT: addb $127, %al +; SSE41-NEXT: # kill: def $al killed $al def $eax +; SSE41-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE41-NEXT: .LBB2_100: +; SSE41-NEXT: pextrb $14, %xmm4, %ecx +; SSE41-NEXT: pextrb $14, %xmm0, %ebp +; SSE41-NEXT: movl %ebp, %eax +; SSE41-NEXT: addb %cl, %al +; SSE41-NEXT: setns %al +; SSE41-NEXT: addb %cl, %bpl +; SSE41-NEXT: jno .LBB2_102 +; SSE41-NEXT: # %bb.101: +; SSE41-NEXT: addb $127, %al +; SSE41-NEXT: movl %eax, %ebp +; SSE41-NEXT: .LBB2_102: +; SSE41-NEXT: pextrb $13, %xmm4, %ecx +; SSE41-NEXT: pextrb $13, %xmm0, %ebx +; SSE41-NEXT: movl %ebx, %eax +; SSE41-NEXT: addb %cl, %al +; SSE41-NEXT: setns %al +; SSE41-NEXT: addb %cl, %bl +; SSE41-NEXT: movl %r15d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE41-NEXT: movl %r13d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE41-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE41-NEXT: movl %edx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE41-NEXT: movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE41-NEXT: movl %ebp, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE41-NEXT: jo .LBB2_103 +; SSE41-NEXT: # %bb.104: +; SSE41-NEXT: movl %ebx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE41-NEXT: jmp .LBB2_105 +; SSE41-NEXT: .LBB2_103: +; SSE41-NEXT: addb $127, %al +; SSE41-NEXT: # kill: def $al killed $al def $eax +; SSE41-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE41-NEXT: .LBB2_105: +; SSE41-NEXT: pextrb $12, %xmm4, %ecx +; SSE41-NEXT: pextrb $12, %xmm0, %r12d +; SSE41-NEXT: movl %r12d, %eax +; SSE41-NEXT: addb %cl, %al +; SSE41-NEXT: setns %al +; SSE41-NEXT: addb %cl, %r12b +; SSE41-NEXT: jno .LBB2_107 +; SSE41-NEXT: # %bb.106: +; SSE41-NEXT: addb $127, %al +; SSE41-NEXT: movl %eax, %r12d +; SSE41-NEXT: .LBB2_107: +; SSE41-NEXT: pextrb $11, %xmm4, %ecx +; SSE41-NEXT: pextrb $11, %xmm0, %r13d +; SSE41-NEXT: movl %r13d, %eax +; SSE41-NEXT: addb %cl, %al +; SSE41-NEXT: setns %al +; SSE41-NEXT: addb %cl, %r13b +; SSE41-NEXT: movl %r11d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE41-NEXT: movl %r8d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE41-NEXT: movl %r9d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE41-NEXT: movl %r10d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE41-NEXT: movl %r14d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE41-NEXT: jno .LBB2_109 +; SSE41-NEXT: # %bb.108: +; SSE41-NEXT: addb $127, %al +; SSE41-NEXT: movl %eax, %r13d +; SSE41-NEXT: .LBB2_109: +; SSE41-NEXT: pextrb $10, %xmm4, %ecx +; SSE41-NEXT: pextrb $10, %xmm0, %r15d +; SSE41-NEXT: movl %r15d, %eax +; SSE41-NEXT: addb %cl, %al +; SSE41-NEXT: setns %al +; 
SSE41-NEXT: addb %cl, %r15b +; SSE41-NEXT: jno .LBB2_111 +; SSE41-NEXT: # %bb.110: +; SSE41-NEXT: addb $127, %al +; SSE41-NEXT: movl %eax, %r15d +; SSE41-NEXT: .LBB2_111: +; SSE41-NEXT: pextrb $9, %xmm4, %ecx +; SSE41-NEXT: pextrb $9, %xmm0, %r14d +; SSE41-NEXT: movl %r14d, %eax +; SSE41-NEXT: addb %cl, %al +; SSE41-NEXT: setns %al +; SSE41-NEXT: addb %cl, %r14b +; SSE41-NEXT: jno .LBB2_113 +; SSE41-NEXT: # %bb.112: +; SSE41-NEXT: addb $127, %al +; SSE41-NEXT: movl %eax, %r14d +; SSE41-NEXT: .LBB2_113: +; SSE41-NEXT: pextrb $8, %xmm4, %ecx +; SSE41-NEXT: pextrb $8, %xmm0, %ebp +; SSE41-NEXT: movl %ebp, %eax +; SSE41-NEXT: addb %cl, %al +; SSE41-NEXT: setns %al +; SSE41-NEXT: addb %cl, %bpl +; SSE41-NEXT: jno .LBB2_115 +; SSE41-NEXT: # %bb.114: +; SSE41-NEXT: addb $127, %al +; SSE41-NEXT: movl %eax, %ebp +; SSE41-NEXT: .LBB2_115: +; SSE41-NEXT: pextrb $7, %xmm4, %ecx +; SSE41-NEXT: pextrb $7, %xmm0, %edi +; SSE41-NEXT: movl %edi, %eax +; SSE41-NEXT: addb %cl, %al +; SSE41-NEXT: setns %al +; SSE41-NEXT: addb %cl, %dil +; SSE41-NEXT: jno .LBB2_117 +; SSE41-NEXT: # %bb.116: +; SSE41-NEXT: addb $127, %al +; SSE41-NEXT: movl %eax, %edi +; SSE41-NEXT: .LBB2_117: +; SSE41-NEXT: pextrb $6, %xmm4, %edx +; SSE41-NEXT: pextrb $6, %xmm0, %eax +; SSE41-NEXT: movl %eax, %ecx +; SSE41-NEXT: addb %dl, %cl +; SSE41-NEXT: setns %cl +; SSE41-NEXT: addb %dl, %al +; SSE41-NEXT: jno .LBB2_119 +; SSE41-NEXT: # %bb.118: +; SSE41-NEXT: addb $127, %cl +; SSE41-NEXT: movl %ecx, %eax +; SSE41-NEXT: .LBB2_119: +; SSE41-NEXT: pextrb $5, %xmm4, %ebx +; SSE41-NEXT: pextrb $5, %xmm0, %ecx +; SSE41-NEXT: movl %ecx, %edx +; SSE41-NEXT: addb %bl, %dl +; SSE41-NEXT: setns %dl +; SSE41-NEXT: addb %bl, %cl +; SSE41-NEXT: jno .LBB2_121 +; SSE41-NEXT: # %bb.120: +; SSE41-NEXT: addb $127, %dl +; SSE41-NEXT: movl %edx, %ecx +; SSE41-NEXT: .LBB2_121: +; SSE41-NEXT: pextrb $4, %xmm4, %esi +; SSE41-NEXT: pextrb $4, %xmm0, %edx +; SSE41-NEXT: movl %edx, %ebx +; SSE41-NEXT: addb %sil, %bl +; SSE41-NEXT: setns %bl +; SSE41-NEXT: addb %sil, %dl +; SSE41-NEXT: jno .LBB2_123 +; SSE41-NEXT: # %bb.122: +; SSE41-NEXT: addb $127, %bl +; SSE41-NEXT: movl %ebx, %edx +; SSE41-NEXT: .LBB2_123: +; SSE41-NEXT: pextrb $3, %xmm4, %esi +; SSE41-NEXT: pextrb $3, %xmm0, %r8d +; SSE41-NEXT: movl %r8d, %ebx +; SSE41-NEXT: addb %sil, %bl +; SSE41-NEXT: setns %bl +; SSE41-NEXT: addb %sil, %r8b +; SSE41-NEXT: jno .LBB2_125 +; SSE41-NEXT: # %bb.124: +; SSE41-NEXT: addb $127, %bl +; SSE41-NEXT: movl %ebx, %r8d +; SSE41-NEXT: .LBB2_125: +; SSE41-NEXT: pextrb $2, %xmm4, %esi +; SSE41-NEXT: pextrb $2, %xmm0, %r9d +; SSE41-NEXT: movl %r9d, %ebx +; SSE41-NEXT: addb %sil, %bl +; SSE41-NEXT: setns %bl +; SSE41-NEXT: addb %sil, %r9b +; SSE41-NEXT: jno .LBB2_127 +; SSE41-NEXT: # %bb.126: +; SSE41-NEXT: addb $127, %bl +; SSE41-NEXT: movl %ebx, %r9d +; SSE41-NEXT: .LBB2_127: +; SSE41-NEXT: pextrb $0, %xmm4, %esi +; SSE41-NEXT: pextrb $0, %xmm0, %r10d +; SSE41-NEXT: movl %r10d, %ebx +; SSE41-NEXT: addb %sil, %bl +; SSE41-NEXT: setns %bl +; SSE41-NEXT: addb %sil, %r10b +; SSE41-NEXT: jno .LBB2_129 +; SSE41-NEXT: # %bb.128: +; SSE41-NEXT: addb $127, %bl +; SSE41-NEXT: movl %ebx, %r10d +; SSE41-NEXT: .LBB2_129: +; SSE41-NEXT: pextrb $1, %xmm4, %esi +; SSE41-NEXT: pextrb $1, %xmm0, %r11d +; SSE41-NEXT: movl %r11d, %ebx +; SSE41-NEXT: addb %sil, %bl +; SSE41-NEXT: setns %bl +; SSE41-NEXT: addb %sil, %r11b +; SSE41-NEXT: jno .LBB2_131 +; SSE41-NEXT: # %bb.130: +; SSE41-NEXT: addb $127, %bl +; SSE41-NEXT: movl %ebx, %r11d +; SSE41-NEXT: .LBB2_131: +; SSE41-NEXT: movzbl %r10b, %esi 
+; SSE41-NEXT: movd %esi, %xmm0 +; SSE41-NEXT: movzbl %r11b, %esi +; SSE41-NEXT: pinsrb $1, %esi, %xmm0 +; SSE41-NEXT: movzbl %r9b, %esi +; SSE41-NEXT: pinsrb $2, %esi, %xmm0 +; SSE41-NEXT: movzbl %r8b, %esi +; SSE41-NEXT: pinsrb $3, %esi, %xmm0 +; SSE41-NEXT: movzbl %dl, %edx +; SSE41-NEXT: pinsrb $4, %edx, %xmm0 +; SSE41-NEXT: movzbl %cl, %ecx +; SSE41-NEXT: pinsrb $5, %ecx, %xmm0 +; SSE41-NEXT: movzbl %al, %eax +; SSE41-NEXT: pinsrb $6, %eax, %xmm0 +; SSE41-NEXT: movzbl %dil, %eax +; SSE41-NEXT: pinsrb $7, %eax, %xmm0 +; SSE41-NEXT: movzbl %bpl, %eax +; SSE41-NEXT: pinsrb $8, %eax, %xmm0 +; SSE41-NEXT: movzbl %r14b, %eax +; SSE41-NEXT: pinsrb $9, %eax, %xmm0 +; SSE41-NEXT: movzbl %r15b, %eax +; SSE41-NEXT: pinsrb $10, %eax, %xmm0 +; SSE41-NEXT: movzbl %r13b, %eax +; SSE41-NEXT: pinsrb $11, %eax, %xmm0 +; SSE41-NEXT: movzbl %r12b, %eax +; SSE41-NEXT: pinsrb $12, %eax, %xmm0 +; SSE41-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; SSE41-NEXT: pinsrb $13, %eax, %xmm0 +; SSE41-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; SSE41-NEXT: pinsrb $14, %eax, %xmm0 +; SSE41-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; SSE41-NEXT: pinsrb $15, %eax, %xmm0 +; SSE41-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; SSE41-NEXT: movd %eax, %xmm1 +; SSE41-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; SSE41-NEXT: pinsrb $1, %eax, %xmm1 +; SSE41-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; SSE41-NEXT: pinsrb $2, %eax, %xmm1 +; SSE41-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; SSE41-NEXT: pinsrb $3, %eax, %xmm1 +; SSE41-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; SSE41-NEXT: pinsrb $4, %eax, %xmm1 +; SSE41-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; SSE41-NEXT: pinsrb $5, %eax, %xmm1 +; SSE41-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; SSE41-NEXT: pinsrb $6, %eax, %xmm1 +; SSE41-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; SSE41-NEXT: pinsrb $7, %eax, %xmm1 +; SSE41-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; SSE41-NEXT: pinsrb $8, %eax, %xmm1 +; SSE41-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; SSE41-NEXT: pinsrb $9, %eax, %xmm1 +; SSE41-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; SSE41-NEXT: pinsrb $10, %eax, %xmm1 +; SSE41-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; SSE41-NEXT: pinsrb $11, %eax, %xmm1 +; SSE41-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; SSE41-NEXT: pinsrb $12, %eax, %xmm1 +; SSE41-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; SSE41-NEXT: pinsrb $13, %eax, %xmm1 +; SSE41-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; SSE41-NEXT: pinsrb $14, %eax, %xmm1 +; SSE41-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; SSE41-NEXT: pinsrb $15, %eax, %xmm1 +; SSE41-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; SSE41-NEXT: movd %eax, %xmm2 +; SSE41-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; SSE41-NEXT: pinsrb $1, %eax, %xmm2 +; SSE41-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; SSE41-NEXT: pinsrb $2, %eax, %xmm2 +; SSE41-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; SSE41-NEXT: pinsrb $3, %eax, %xmm2 +; SSE41-NEXT: movzbl 
{{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; SSE41-NEXT: pinsrb $4, %eax, %xmm2 +; SSE41-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; SSE41-NEXT: pinsrb $5, %eax, %xmm2 +; SSE41-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; SSE41-NEXT: pinsrb $6, %eax, %xmm2 +; SSE41-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; SSE41-NEXT: pinsrb $7, %eax, %xmm2 +; SSE41-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; SSE41-NEXT: pinsrb $8, %eax, %xmm2 +; SSE41-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; SSE41-NEXT: pinsrb $9, %eax, %xmm2 +; SSE41-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; SSE41-NEXT: pinsrb $10, %eax, %xmm2 +; SSE41-NEXT: movzbl (%rsp), %eax # 1-byte Folded Reload +; SSE41-NEXT: pinsrb $11, %eax, %xmm2 +; SSE41-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; SSE41-NEXT: pinsrb $12, %eax, %xmm2 +; SSE41-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; SSE41-NEXT: pinsrb $13, %eax, %xmm2 +; SSE41-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; SSE41-NEXT: pinsrb $14, %eax, %xmm2 +; SSE41-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; SSE41-NEXT: pinsrb $15, %eax, %xmm2 +; SSE41-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; SSE41-NEXT: movd %eax, %xmm3 +; SSE41-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; SSE41-NEXT: pinsrb $1, %eax, %xmm3 +; SSE41-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; SSE41-NEXT: pinsrb $2, %eax, %xmm3 +; SSE41-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; SSE41-NEXT: pinsrb $3, %eax, %xmm3 +; SSE41-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; SSE41-NEXT: pinsrb $4, %eax, %xmm3 +; SSE41-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; SSE41-NEXT: pinsrb $5, %eax, %xmm3 +; SSE41-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; SSE41-NEXT: pinsrb $6, %eax, %xmm3 +; SSE41-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; SSE41-NEXT: pinsrb $7, %eax, %xmm3 +; SSE41-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; SSE41-NEXT: pinsrb $8, %eax, %xmm3 +; SSE41-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; SSE41-NEXT: pinsrb $9, %eax, %xmm3 +; SSE41-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; SSE41-NEXT: pinsrb $10, %eax, %xmm3 +; SSE41-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; SSE41-NEXT: pinsrb $11, %eax, %xmm3 +; SSE41-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; SSE41-NEXT: pinsrb $12, %eax, %xmm3 +; SSE41-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; SSE41-NEXT: pinsrb $13, %eax, %xmm3 +; SSE41-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; SSE41-NEXT: pinsrb $14, %eax, %xmm3 +; SSE41-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; SSE41-NEXT: pinsrb $15, %eax, %xmm3 +; SSE41-NEXT: addq $76, %rsp +; SSE41-NEXT: popq %rbx +; SSE41-NEXT: popq %r12 +; SSE41-NEXT: popq %r13 +; SSE41-NEXT: popq %r14 +; SSE41-NEXT: popq %r15 +; SSE41-NEXT: popq %rbp +; SSE41-NEXT: retq +; +; AVX1-LABEL: v64i8: +; AVX1: # %bb.0: +; AVX1-NEXT: pushq %rbp +; AVX1-NEXT: pushq %r15 +; AVX1-NEXT: pushq %r14 +; AVX1-NEXT: pushq %r13 +; AVX1-NEXT: pushq %r12 +; AVX1-NEXT: pushq %rbx +; 
AVX1-NEXT: subq $76, %rsp +; AVX1-NEXT: vpextrb $15, %xmm3, %ecx +; AVX1-NEXT: vpextrb $15, %xmm1, %edx +; AVX1-NEXT: movl %edx, %eax +; AVX1-NEXT: addb %cl, %al +; AVX1-NEXT: setns %al +; AVX1-NEXT: addb %cl, %dl +; AVX1-NEXT: jo .LBB2_1 +; AVX1-NEXT: # %bb.2: +; AVX1-NEXT: movl %edx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX1-NEXT: jmp .LBB2_3 +; AVX1-NEXT: .LBB2_1: +; AVX1-NEXT: addb $127, %al +; AVX1-NEXT: # kill: def $al killed $al def $eax +; AVX1-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX1-NEXT: .LBB2_3: +; AVX1-NEXT: vpextrb $14, %xmm3, %ecx +; AVX1-NEXT: vpextrb $14, %xmm1, %edx +; AVX1-NEXT: movl %edx, %eax +; AVX1-NEXT: addb %cl, %al +; AVX1-NEXT: setns %al +; AVX1-NEXT: addb %cl, %dl +; AVX1-NEXT: jno .LBB2_5 +; AVX1-NEXT: # %bb.4: +; AVX1-NEXT: addb $127, %al +; AVX1-NEXT: movl %eax, %edx +; AVX1-NEXT: .LBB2_5: +; AVX1-NEXT: vpextrb $13, %xmm3, %ecx +; AVX1-NEXT: vpextrb $13, %xmm1, %esi +; AVX1-NEXT: movl %esi, %eax +; AVX1-NEXT: addb %cl, %al +; AVX1-NEXT: setns %al +; AVX1-NEXT: addb %cl, %sil +; AVX1-NEXT: jo .LBB2_6 +; AVX1-NEXT: # %bb.7: +; AVX1-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX1-NEXT: jmp .LBB2_8 +; AVX1-NEXT: .LBB2_6: +; AVX1-NEXT: addb $127, %al +; AVX1-NEXT: # kill: def $al killed $al def $eax +; AVX1-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX1-NEXT: .LBB2_8: +; AVX1-NEXT: vpextrb $12, %xmm3, %ecx +; AVX1-NEXT: vpextrb $12, %xmm1, %esi +; AVX1-NEXT: movl %esi, %eax +; AVX1-NEXT: addb %cl, %al +; AVX1-NEXT: setns %al +; AVX1-NEXT: addb %cl, %sil +; AVX1-NEXT: jno .LBB2_10 +; AVX1-NEXT: # %bb.9: +; AVX1-NEXT: addb $127, %al +; AVX1-NEXT: movl %eax, %esi +; AVX1-NEXT: .LBB2_10: +; AVX1-NEXT: vpextrb $11, %xmm3, %ecx +; AVX1-NEXT: vpextrb $11, %xmm1, %edi +; AVX1-NEXT: movl %edi, %eax +; AVX1-NEXT: addb %cl, %al +; AVX1-NEXT: setns %al +; AVX1-NEXT: addb %cl, %dil +; AVX1-NEXT: jno .LBB2_12 +; AVX1-NEXT: # %bb.11: +; AVX1-NEXT: addb $127, %al +; AVX1-NEXT: movl %eax, %edi +; AVX1-NEXT: .LBB2_12: +; AVX1-NEXT: vpextrb $10, %xmm3, %ecx +; AVX1-NEXT: vpextrb $10, %xmm1, %ebp +; AVX1-NEXT: movl %ebp, %eax +; AVX1-NEXT: addb %cl, %al +; AVX1-NEXT: setns %al +; AVX1-NEXT: addb %cl, %bpl +; AVX1-NEXT: jno .LBB2_14 +; AVX1-NEXT: # %bb.13: +; AVX1-NEXT: addb $127, %al +; AVX1-NEXT: movl %eax, %ebp +; AVX1-NEXT: .LBB2_14: +; AVX1-NEXT: vpextrb $9, %xmm3, %ecx +; AVX1-NEXT: vpextrb $9, %xmm1, %ebx +; AVX1-NEXT: movl %ebx, %eax +; AVX1-NEXT: addb %cl, %al +; AVX1-NEXT: setns %al +; AVX1-NEXT: addb %cl, %bl +; AVX1-NEXT: movl %ebp, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX1-NEXT: jo .LBB2_15 +; AVX1-NEXT: # %bb.16: +; AVX1-NEXT: movl %ebx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX1-NEXT: jmp .LBB2_17 +; AVX1-NEXT: .LBB2_15: +; AVX1-NEXT: addb $127, %al +; AVX1-NEXT: # kill: def $al killed $al def $eax +; AVX1-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX1-NEXT: .LBB2_17: +; AVX1-NEXT: vpextrb $8, %xmm3, %ecx +; AVX1-NEXT: vpextrb $8, %xmm1, %ebp +; AVX1-NEXT: movl %ebp, %eax +; AVX1-NEXT: addb %cl, %al +; AVX1-NEXT: setns %al +; AVX1-NEXT: addb %cl, %bpl +; AVX1-NEXT: jno .LBB2_19 +; AVX1-NEXT: # %bb.18: +; AVX1-NEXT: addb $127, %al +; AVX1-NEXT: movl %eax, %ebp +; AVX1-NEXT: .LBB2_19: +; AVX1-NEXT: vpextrb $7, %xmm3, %ecx +; AVX1-NEXT: vpextrb $7, %xmm1, %ebx +; AVX1-NEXT: movl %ebx, %eax +; AVX1-NEXT: addb %cl, %al +; AVX1-NEXT: setns %al +; AVX1-NEXT: addb %cl, %bl +; AVX1-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX1-NEXT: jno .LBB2_21 +; AVX1-NEXT: # 
%bb.20: +; AVX1-NEXT: addb $127, %al +; AVX1-NEXT: movl %eax, %ebx +; AVX1-NEXT: .LBB2_21: +; AVX1-NEXT: vpextrb $6, %xmm3, %ecx +; AVX1-NEXT: vpextrb $6, %xmm1, %esi +; AVX1-NEXT: movl %esi, %eax +; AVX1-NEXT: addb %cl, %al +; AVX1-NEXT: setns %al +; AVX1-NEXT: addb %cl, %sil +; AVX1-NEXT: movl %edx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX1-NEXT: jno .LBB2_23 +; AVX1-NEXT: # %bb.22: +; AVX1-NEXT: addb $127, %al +; AVX1-NEXT: movl %eax, %esi +; AVX1-NEXT: .LBB2_23: +; AVX1-NEXT: vpextrb $5, %xmm3, %ecx +; AVX1-NEXT: vpextrb $5, %xmm1, %r11d +; AVX1-NEXT: movl %r11d, %eax +; AVX1-NEXT: addb %cl, %al +; AVX1-NEXT: setns %al +; AVX1-NEXT: addb %cl, %r11b +; AVX1-NEXT: movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX1-NEXT: movl %ebp, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX1-NEXT: movl %ebx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX1-NEXT: jno .LBB2_25 +; AVX1-NEXT: # %bb.24: +; AVX1-NEXT: addb $127, %al +; AVX1-NEXT: movl %eax, %r11d +; AVX1-NEXT: .LBB2_25: +; AVX1-NEXT: vpextrb $4, %xmm3, %ecx +; AVX1-NEXT: vpextrb $4, %xmm1, %r13d +; AVX1-NEXT: movl %r13d, %eax +; AVX1-NEXT: addb %cl, %al +; AVX1-NEXT: setns %al +; AVX1-NEXT: addb %cl, %r13b +; AVX1-NEXT: jno .LBB2_27 +; AVX1-NEXT: # %bb.26: +; AVX1-NEXT: addb $127, %al +; AVX1-NEXT: movl %eax, %r13d +; AVX1-NEXT: .LBB2_27: +; AVX1-NEXT: vpextrb $3, %xmm3, %ecx +; AVX1-NEXT: vpextrb $3, %xmm1, %r8d +; AVX1-NEXT: movl %r8d, %eax +; AVX1-NEXT: addb %cl, %al +; AVX1-NEXT: setns %al +; AVX1-NEXT: addb %cl, %r8b +; AVX1-NEXT: jno .LBB2_29 +; AVX1-NEXT: # %bb.28: +; AVX1-NEXT: addb $127, %al +; AVX1-NEXT: movl %eax, %r8d +; AVX1-NEXT: .LBB2_29: +; AVX1-NEXT: vpextrb $2, %xmm3, %ecx +; AVX1-NEXT: vpextrb $2, %xmm1, %ebp +; AVX1-NEXT: movl %ebp, %eax +; AVX1-NEXT: addb %cl, %al +; AVX1-NEXT: setns %al +; AVX1-NEXT: addb %cl, %bpl +; AVX1-NEXT: jno .LBB2_31 +; AVX1-NEXT: # %bb.30: +; AVX1-NEXT: addb $127, %al +; AVX1-NEXT: movl %eax, %ebp +; AVX1-NEXT: .LBB2_31: +; AVX1-NEXT: vpextrb $0, %xmm3, %ecx +; AVX1-NEXT: vpextrb $0, %xmm1, %ebx +; AVX1-NEXT: movl %ebx, %eax +; AVX1-NEXT: addb %cl, %al +; AVX1-NEXT: setns %al +; AVX1-NEXT: addb %cl, %bl +; AVX1-NEXT: jno .LBB2_33 +; AVX1-NEXT: # %bb.32: +; AVX1-NEXT: addb $127, %al +; AVX1-NEXT: movl %eax, %ebx +; AVX1-NEXT: .LBB2_33: +; AVX1-NEXT: vpextrb $1, %xmm3, %ecx +; AVX1-NEXT: vpextrb $1, %xmm1, %edi +; AVX1-NEXT: movl %edi, %eax +; AVX1-NEXT: addb %cl, %al +; AVX1-NEXT: setns %al +; AVX1-NEXT: addb %cl, %dil +; AVX1-NEXT: jno .LBB2_35 +; AVX1-NEXT: # %bb.34: +; AVX1-NEXT: addb $127, %al +; AVX1-NEXT: movl %eax, %edi +; AVX1-NEXT: .LBB2_35: +; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm3 +; AVX1-NEXT: vpextrb $15, %xmm3, %ecx +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 +; AVX1-NEXT: vpextrb $15, %xmm1, %edx +; AVX1-NEXT: movl %edx, %eax +; AVX1-NEXT: addb %cl, %al +; AVX1-NEXT: setns %al +; AVX1-NEXT: addb %cl, %dl +; AVX1-NEXT: movl %ebp, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX1-NEXT: movl %ebx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX1-NEXT: movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX1-NEXT: jno .LBB2_37 +; AVX1-NEXT: # %bb.36: +; AVX1-NEXT: addb $127, %al +; AVX1-NEXT: movl %eax, %edx +; AVX1-NEXT: .LBB2_37: +; AVX1-NEXT: vpextrb $14, %xmm3, %ecx +; AVX1-NEXT: vpextrb $14, %xmm1, %edi +; AVX1-NEXT: movl %edi, %eax +; AVX1-NEXT: addb %cl, %al +; AVX1-NEXT: setns %al +; AVX1-NEXT: addb %cl, %dil +; AVX1-NEXT: movl %edx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX1-NEXT: jno .LBB2_39 +; AVX1-NEXT: # %bb.38: +; AVX1-NEXT: addb $127, %al +; 
AVX1-NEXT: movl %eax, %edi +; AVX1-NEXT: .LBB2_39: +; AVX1-NEXT: vpextrb $13, %xmm3, %ecx +; AVX1-NEXT: vpextrb $13, %xmm1, %r12d +; AVX1-NEXT: movl %r12d, %eax +; AVX1-NEXT: addb %cl, %al +; AVX1-NEXT: setns %al +; AVX1-NEXT: addb %cl, %r12b +; AVX1-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX1-NEXT: jno .LBB2_41 +; AVX1-NEXT: # %bb.40: +; AVX1-NEXT: addb $127, %al +; AVX1-NEXT: movl %eax, %r12d +; AVX1-NEXT: .LBB2_41: +; AVX1-NEXT: vpextrb $12, %xmm3, %ecx +; AVX1-NEXT: vpextrb $12, %xmm1, %r15d +; AVX1-NEXT: movl %r15d, %eax +; AVX1-NEXT: addb %cl, %al +; AVX1-NEXT: setns %al +; AVX1-NEXT: addb %cl, %r15b +; AVX1-NEXT: movl %r11d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX1-NEXT: jno .LBB2_43 +; AVX1-NEXT: # %bb.42: +; AVX1-NEXT: addb $127, %al +; AVX1-NEXT: movl %eax, %r15d +; AVX1-NEXT: .LBB2_43: +; AVX1-NEXT: vpextrb $11, %xmm3, %ecx +; AVX1-NEXT: vpextrb $11, %xmm1, %ebp +; AVX1-NEXT: movl %ebp, %eax +; AVX1-NEXT: addb %cl, %al +; AVX1-NEXT: setns %al +; AVX1-NEXT: addb %cl, %bpl +; AVX1-NEXT: movl %r13d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX1-NEXT: jno .LBB2_45 +; AVX1-NEXT: # %bb.44: +; AVX1-NEXT: addb $127, %al +; AVX1-NEXT: movl %eax, %ebp +; AVX1-NEXT: .LBB2_45: +; AVX1-NEXT: vpextrb $10, %xmm3, %ecx +; AVX1-NEXT: vpextrb $10, %xmm1, %ebx +; AVX1-NEXT: movl %ebx, %eax +; AVX1-NEXT: addb %cl, %al +; AVX1-NEXT: setns %al +; AVX1-NEXT: addb %cl, %bl +; AVX1-NEXT: jno .LBB2_47 +; AVX1-NEXT: # %bb.46: +; AVX1-NEXT: addb $127, %al +; AVX1-NEXT: movl %eax, %ebx +; AVX1-NEXT: .LBB2_47: +; AVX1-NEXT: vpextrb $9, %xmm3, %ecx +; AVX1-NEXT: vpextrb $9, %xmm1, %edx +; AVX1-NEXT: movl %edx, %eax +; AVX1-NEXT: addb %cl, %al +; AVX1-NEXT: setns %al +; AVX1-NEXT: addb %cl, %dl +; AVX1-NEXT: jno .LBB2_49 +; AVX1-NEXT: # %bb.48: +; AVX1-NEXT: addb $127, %al +; AVX1-NEXT: movl %eax, %edx +; AVX1-NEXT: .LBB2_49: +; AVX1-NEXT: vpextrb $8, %xmm3, %ecx +; AVX1-NEXT: vpextrb $8, %xmm1, %esi +; AVX1-NEXT: movl %esi, %eax +; AVX1-NEXT: addb %cl, %al +; AVX1-NEXT: setns %al +; AVX1-NEXT: addb %cl, %sil +; AVX1-NEXT: movl %ebp, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX1-NEXT: movl %ebx, (%rsp) # 4-byte Spill +; AVX1-NEXT: movl %edx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX1-NEXT: jo .LBB2_50 +; AVX1-NEXT: # %bb.51: +; AVX1-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX1-NEXT: jmp .LBB2_52 +; AVX1-NEXT: .LBB2_50: +; AVX1-NEXT: addb $127, %al +; AVX1-NEXT: # kill: def $al killed $al def $eax +; AVX1-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX1-NEXT: .LBB2_52: +; AVX1-NEXT: vpextrb $7, %xmm3, %ecx +; AVX1-NEXT: vpextrb $7, %xmm1, %r11d +; AVX1-NEXT: movl %r11d, %eax +; AVX1-NEXT: addb %cl, %al +; AVX1-NEXT: setns %al +; AVX1-NEXT: addb %cl, %r11b +; AVX1-NEXT: jno .LBB2_54 +; AVX1-NEXT: # %bb.53: +; AVX1-NEXT: addb $127, %al +; AVX1-NEXT: movl %eax, %r11d +; AVX1-NEXT: .LBB2_54: +; AVX1-NEXT: vpextrb $6, %xmm3, %ecx +; AVX1-NEXT: vpextrb $6, %xmm1, %esi +; AVX1-NEXT: movl %esi, %eax +; AVX1-NEXT: addb %cl, %al +; AVX1-NEXT: setns %al +; AVX1-NEXT: addb %cl, %sil +; AVX1-NEXT: movl %r8d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX1-NEXT: movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX1-NEXT: jno .LBB2_56 +; AVX1-NEXT: # %bb.55: +; AVX1-NEXT: addb $127, %al +; AVX1-NEXT: movl %eax, %esi +; AVX1-NEXT: .LBB2_56: +; AVX1-NEXT: vpextrb $5, %xmm3, %ecx +; AVX1-NEXT: vpextrb $5, %xmm1, %edi +; AVX1-NEXT: movl %edi, %eax +; AVX1-NEXT: addb %cl, %al +; AVX1-NEXT: setns %al +; AVX1-NEXT: addb %cl, %dil +; AVX1-NEXT: jno .LBB2_58 
+; AVX1-NEXT: # %bb.57: +; AVX1-NEXT: addb $127, %al +; AVX1-NEXT: movl %eax, %edi +; AVX1-NEXT: .LBB2_58: +; AVX1-NEXT: vpextrb $4, %xmm3, %ecx +; AVX1-NEXT: vpextrb $4, %xmm1, %r13d +; AVX1-NEXT: movl %r13d, %eax +; AVX1-NEXT: addb %cl, %al +; AVX1-NEXT: setns %al +; AVX1-NEXT: addb %cl, %r13b +; AVX1-NEXT: movl %r13d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX1-NEXT: jno .LBB2_60 +; AVX1-NEXT: # %bb.59: +; AVX1-NEXT: addb $127, %al +; AVX1-NEXT: # kill: def $al killed $al def $eax +; AVX1-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX1-NEXT: .LBB2_60: +; AVX1-NEXT: vpextrb $3, %xmm3, %ecx +; AVX1-NEXT: vpextrb $3, %xmm1, %ebp +; AVX1-NEXT: movl %ebp, %eax +; AVX1-NEXT: addb %cl, %al +; AVX1-NEXT: setns %al +; AVX1-NEXT: addb %cl, %bpl +; AVX1-NEXT: jo .LBB2_61 +; AVX1-NEXT: # %bb.62: +; AVX1-NEXT: movl %ebp, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX1-NEXT: jmp .LBB2_63 +; AVX1-NEXT: .LBB2_61: +; AVX1-NEXT: addb $127, %al +; AVX1-NEXT: # kill: def $al killed $al def $eax +; AVX1-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX1-NEXT: .LBB2_63: +; AVX1-NEXT: vpextrb $2, %xmm3, %ecx +; AVX1-NEXT: vpextrb $2, %xmm1, %ebp +; AVX1-NEXT: movl %ebp, %eax +; AVX1-NEXT: addb %cl, %al +; AVX1-NEXT: setns %al +; AVX1-NEXT: addb %cl, %bpl +; AVX1-NEXT: movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX1-NEXT: jno .LBB2_65 +; AVX1-NEXT: # %bb.64: +; AVX1-NEXT: addb $127, %al +; AVX1-NEXT: movl %eax, %ebp +; AVX1-NEXT: .LBB2_65: +; AVX1-NEXT: vpextrb $0, %xmm3, %ecx +; AVX1-NEXT: vpextrb $0, %xmm1, %edi +; AVX1-NEXT: movl %edi, %eax +; AVX1-NEXT: addb %cl, %al +; AVX1-NEXT: setns %al +; AVX1-NEXT: addb %cl, %dil +; AVX1-NEXT: movl %r15d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX1-NEXT: jno .LBB2_67 +; AVX1-NEXT: # %bb.66: +; AVX1-NEXT: addb $127, %al +; AVX1-NEXT: movl %eax, %edi +; AVX1-NEXT: .LBB2_67: +; AVX1-NEXT: vpextrb $1, %xmm3, %ecx +; AVX1-NEXT: vpextrb $1, %xmm1, %ebx +; AVX1-NEXT: movl %ebx, %eax +; AVX1-NEXT: addb %cl, %al +; AVX1-NEXT: setns %al +; AVX1-NEXT: addb %cl, %bl +; AVX1-NEXT: movl %r12d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX1-NEXT: jno .LBB2_69 +; AVX1-NEXT: # %bb.68: +; AVX1-NEXT: addb $127, %al +; AVX1-NEXT: movl %eax, %ebx +; AVX1-NEXT: .LBB2_69: +; AVX1-NEXT: vpextrb $15, %xmm2, %ecx +; AVX1-NEXT: vpextrb $15, %xmm0, %edx +; AVX1-NEXT: movl %edx, %eax +; AVX1-NEXT: addb %cl, %al +; AVX1-NEXT: setns %al +; AVX1-NEXT: addb %cl, %dl +; AVX1-NEXT: movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX1-NEXT: jno .LBB2_71 +; AVX1-NEXT: # %bb.70: +; AVX1-NEXT: addb $127, %al +; AVX1-NEXT: movl %eax, %edx +; AVX1-NEXT: .LBB2_71: +; AVX1-NEXT: vpextrb $14, %xmm2, %ecx +; AVX1-NEXT: vpextrb $14, %xmm0, %edi +; AVX1-NEXT: movl %edi, %eax +; AVX1-NEXT: addb %cl, %al +; AVX1-NEXT: setns %al +; AVX1-NEXT: addb %cl, %dil +; AVX1-NEXT: jno .LBB2_73 +; AVX1-NEXT: # %bb.72: +; AVX1-NEXT: addb $127, %al +; AVX1-NEXT: movl %eax, %edi +; AVX1-NEXT: .LBB2_73: +; AVX1-NEXT: vpextrb $13, %xmm2, %ecx +; AVX1-NEXT: vpextrb $13, %xmm0, %r10d +; AVX1-NEXT: movl %r10d, %eax +; AVX1-NEXT: addb %cl, %al +; AVX1-NEXT: setns %al +; AVX1-NEXT: addb %cl, %r10b +; AVX1-NEXT: jno .LBB2_75 +; AVX1-NEXT: # %bb.74: +; AVX1-NEXT: addb $127, %al +; AVX1-NEXT: movl %eax, %r10d +; AVX1-NEXT: .LBB2_75: +; AVX1-NEXT: vpextrb $12, %xmm2, %ecx +; AVX1-NEXT: vpextrb $12, %xmm0, %r12d +; AVX1-NEXT: movl %r12d, %eax +; AVX1-NEXT: addb %cl, %al +; AVX1-NEXT: setns %al +; AVX1-NEXT: addb %cl, %r12b +; AVX1-NEXT: movl %r12d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte 
Spill +; AVX1-NEXT: jno .LBB2_77 +; AVX1-NEXT: # %bb.76: +; AVX1-NEXT: addb $127, %al +; AVX1-NEXT: # kill: def $al killed $al def $eax +; AVX1-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX1-NEXT: .LBB2_77: +; AVX1-NEXT: vpextrb $11, %xmm2, %ecx +; AVX1-NEXT: vpextrb $11, %xmm0, %r14d +; AVX1-NEXT: movl %r14d, %eax +; AVX1-NEXT: addb %cl, %al +; AVX1-NEXT: setns %al +; AVX1-NEXT: addb %cl, %r14b +; AVX1-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX1-NEXT: jno .LBB2_79 +; AVX1-NEXT: # %bb.78: +; AVX1-NEXT: addb $127, %al +; AVX1-NEXT: movl %eax, %r14d +; AVX1-NEXT: .LBB2_79: +; AVX1-NEXT: vpextrb $10, %xmm2, %ecx +; AVX1-NEXT: vpextrb $10, %xmm0, %r13d +; AVX1-NEXT: movl %r13d, %eax +; AVX1-NEXT: addb %cl, %al +; AVX1-NEXT: setns %al +; AVX1-NEXT: addb %cl, %r13b +; AVX1-NEXT: movl %r11d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX1-NEXT: movl %ebp, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX1-NEXT: movl %ebx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX1-NEXT: movl %edx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX1-NEXT: jno .LBB2_81 +; AVX1-NEXT: # %bb.80: +; AVX1-NEXT: addb $127, %al +; AVX1-NEXT: movl %eax, %r13d +; AVX1-NEXT: .LBB2_81: +; AVX1-NEXT: vpextrb $9, %xmm2, %ecx +; AVX1-NEXT: vpextrb $9, %xmm0, %r8d +; AVX1-NEXT: movl %r8d, %eax +; AVX1-NEXT: addb %cl, %al +; AVX1-NEXT: setns %al +; AVX1-NEXT: addb %cl, %r8b +; AVX1-NEXT: jno .LBB2_83 +; AVX1-NEXT: # %bb.82: +; AVX1-NEXT: addb $127, %al +; AVX1-NEXT: movl %eax, %r8d +; AVX1-NEXT: .LBB2_83: +; AVX1-NEXT: vpextrb $8, %xmm2, %ecx +; AVX1-NEXT: vpextrb $8, %xmm0, %r15d +; AVX1-NEXT: movl %r15d, %eax +; AVX1-NEXT: addb %cl, %al +; AVX1-NEXT: setns %al +; AVX1-NEXT: addb %cl, %r15b +; AVX1-NEXT: jno .LBB2_85 +; AVX1-NEXT: # %bb.84: +; AVX1-NEXT: addb $127, %al +; AVX1-NEXT: movl %eax, %r15d +; AVX1-NEXT: .LBB2_85: +; AVX1-NEXT: vpextrb $7, %xmm2, %ecx +; AVX1-NEXT: vpextrb $7, %xmm0, %r12d +; AVX1-NEXT: movl %r12d, %eax +; AVX1-NEXT: addb %cl, %al +; AVX1-NEXT: setns %al +; AVX1-NEXT: addb %cl, %r12b +; AVX1-NEXT: movl %r12d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX1-NEXT: jno .LBB2_87 +; AVX1-NEXT: # %bb.86: +; AVX1-NEXT: addb $127, %al +; AVX1-NEXT: # kill: def $al killed $al def $eax +; AVX1-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX1-NEXT: .LBB2_87: +; AVX1-NEXT: vpextrb $6, %xmm2, %ecx +; AVX1-NEXT: vpextrb $6, %xmm0, %r12d +; AVX1-NEXT: movl %r12d, %eax +; AVX1-NEXT: addb %cl, %al +; AVX1-NEXT: setns %al +; AVX1-NEXT: addb %cl, %r12b +; AVX1-NEXT: jno .LBB2_89 +; AVX1-NEXT: # %bb.88: +; AVX1-NEXT: addb $127, %al +; AVX1-NEXT: movl %eax, %r12d +; AVX1-NEXT: .LBB2_89: +; AVX1-NEXT: vpextrb $5, %xmm2, %ecx +; AVX1-NEXT: vpextrb $5, %xmm0, %ebp +; AVX1-NEXT: movl %ebp, %eax +; AVX1-NEXT: addb %cl, %al +; AVX1-NEXT: setns %al +; AVX1-NEXT: addb %cl, %bpl +; AVX1-NEXT: jno .LBB2_91 +; AVX1-NEXT: # %bb.90: +; AVX1-NEXT: addb $127, %al +; AVX1-NEXT: movl %eax, %ebp +; AVX1-NEXT: .LBB2_91: +; AVX1-NEXT: vpextrb $4, %xmm2, %ecx +; AVX1-NEXT: vpextrb $4, %xmm0, %esi +; AVX1-NEXT: movl %esi, %eax +; AVX1-NEXT: addb %cl, %al +; AVX1-NEXT: setns %al +; AVX1-NEXT: addb %cl, %sil +; AVX1-NEXT: jno .LBB2_93 +; AVX1-NEXT: # %bb.92: +; AVX1-NEXT: addb $127, %al +; AVX1-NEXT: movl %eax, %esi +; AVX1-NEXT: .LBB2_93: +; AVX1-NEXT: vpextrb $3, %xmm2, %ecx +; AVX1-NEXT: vpextrb $3, %xmm0, %edx +; AVX1-NEXT: movl %edx, %eax +; AVX1-NEXT: addb %cl, %al +; AVX1-NEXT: setns %al +; AVX1-NEXT: addb %cl, %dl +; AVX1-NEXT: movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; 
AVX1-NEXT: jno .LBB2_95 +; AVX1-NEXT: # %bb.94: +; AVX1-NEXT: addb $127, %al +; AVX1-NEXT: movl %eax, %edx +; AVX1-NEXT: .LBB2_95: +; AVX1-NEXT: vpextrb $2, %xmm2, %ecx +; AVX1-NEXT: vpextrb $2, %xmm0, %edi +; AVX1-NEXT: movl %edi, %eax +; AVX1-NEXT: addb %cl, %al +; AVX1-NEXT: setns %al +; AVX1-NEXT: addb %cl, %dil +; AVX1-NEXT: jno .LBB2_97 +; AVX1-NEXT: # %bb.96: +; AVX1-NEXT: addb $127, %al +; AVX1-NEXT: movl %eax, %edi +; AVX1-NEXT: .LBB2_97: +; AVX1-NEXT: vpextrb $0, %xmm2, %ecx +; AVX1-NEXT: vpextrb $0, %xmm0, %ebx +; AVX1-NEXT: movl %ebx, %eax +; AVX1-NEXT: addb %cl, %al +; AVX1-NEXT: setns %al +; AVX1-NEXT: addb %cl, %bl +; AVX1-NEXT: jno .LBB2_99 +; AVX1-NEXT: # %bb.98: +; AVX1-NEXT: addb $127, %al +; AVX1-NEXT: movl %eax, %ebx +; AVX1-NEXT: .LBB2_99: +; AVX1-NEXT: vpextrb $1, %xmm2, %ecx +; AVX1-NEXT: vpextrb $1, %xmm0, %r11d +; AVX1-NEXT: movl %r11d, %eax +; AVX1-NEXT: addb %cl, %al +; AVX1-NEXT: setns %al +; AVX1-NEXT: addb %cl, %r11b +; AVX1-NEXT: movl %r11d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX1-NEXT: jno .LBB2_101 +; AVX1-NEXT: # %bb.100: +; AVX1-NEXT: addb $127, %al +; AVX1-NEXT: # kill: def $al killed $al def $eax +; AVX1-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX1-NEXT: .LBB2_101: +; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm1 +; AVX1-NEXT: vpextrb $15, %xmm1, %ecx +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX1-NEXT: vpextrb $15, %xmm0, %r11d +; AVX1-NEXT: movl %r11d, %eax +; AVX1-NEXT: addb %cl, %al +; AVX1-NEXT: setns %al +; AVX1-NEXT: addb %cl, %r11b +; AVX1-NEXT: jno .LBB2_103 +; AVX1-NEXT: # %bb.102: +; AVX1-NEXT: addb $127, %al +; AVX1-NEXT: movl %eax, %r11d +; AVX1-NEXT: .LBB2_103: +; AVX1-NEXT: vpextrb $14, %xmm1, %ecx +; AVX1-NEXT: vpextrb $14, %xmm0, %r9d +; AVX1-NEXT: movl %r9d, %eax +; AVX1-NEXT: addb %cl, %al +; AVX1-NEXT: setns %al +; AVX1-NEXT: addb %cl, %r9b +; AVX1-NEXT: movl %r9d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX1-NEXT: jno .LBB2_105 +; AVX1-NEXT: # %bb.104: +; AVX1-NEXT: addb $127, %al +; AVX1-NEXT: # kill: def $al killed $al def $eax +; AVX1-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX1-NEXT: .LBB2_105: +; AVX1-NEXT: vpextrb $13, %xmm1, %ecx +; AVX1-NEXT: vpextrb $13, %xmm0, %r9d +; AVX1-NEXT: movl %r9d, %eax +; AVX1-NEXT: addb %cl, %al +; AVX1-NEXT: setns %al +; AVX1-NEXT: addb %cl, %r9b +; AVX1-NEXT: movl %r14d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX1-NEXT: movl %r13d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX1-NEXT: movl %r8d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX1-NEXT: movl %r15d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX1-NEXT: movl %r12d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX1-NEXT: movl %ebp, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX1-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX1-NEXT: movl %edx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX1-NEXT: movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX1-NEXT: movl %ebx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX1-NEXT: jno .LBB2_107 +; AVX1-NEXT: # %bb.106: +; AVX1-NEXT: addb $127, %al +; AVX1-NEXT: movl %eax, %r9d +; AVX1-NEXT: .LBB2_107: +; AVX1-NEXT: vpextrb $12, %xmm1, %ecx +; AVX1-NEXT: vpextrb $12, %xmm0, %edx +; AVX1-NEXT: movl %edx, %eax +; AVX1-NEXT: addb %cl, %al +; AVX1-NEXT: setns %al +; AVX1-NEXT: addb %cl, %dl +; AVX1-NEXT: movl %r10d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX1-NEXT: jno .LBB2_109 +; AVX1-NEXT: # %bb.108: +; AVX1-NEXT: addb $127, %al +; AVX1-NEXT: movl %eax, %edx +; AVX1-NEXT: .LBB2_109: +; AVX1-NEXT: vpextrb $11, %xmm1, %ecx 
+; AVX1-NEXT: vpextrb $11, %xmm0, %r13d +; AVX1-NEXT: movl %r13d, %eax +; AVX1-NEXT: addb %cl, %al +; AVX1-NEXT: setns %al +; AVX1-NEXT: addb %cl, %r13b +; AVX1-NEXT: jno .LBB2_111 +; AVX1-NEXT: # %bb.110: +; AVX1-NEXT: addb $127, %al +; AVX1-NEXT: movl %eax, %r13d +; AVX1-NEXT: .LBB2_111: +; AVX1-NEXT: vpextrb $10, %xmm1, %ecx +; AVX1-NEXT: vpextrb $10, %xmm0, %r15d +; AVX1-NEXT: movl %r15d, %eax +; AVX1-NEXT: addb %cl, %al +; AVX1-NEXT: setns %al +; AVX1-NEXT: addb %cl, %r15b +; AVX1-NEXT: jno .LBB2_113 +; AVX1-NEXT: # %bb.112: +; AVX1-NEXT: addb $127, %al +; AVX1-NEXT: movl %eax, %r15d +; AVX1-NEXT: .LBB2_113: +; AVX1-NEXT: vpextrb $9, %xmm1, %ecx +; AVX1-NEXT: vpextrb $9, %xmm0, %r14d +; AVX1-NEXT: movl %r14d, %eax +; AVX1-NEXT: addb %cl, %al +; AVX1-NEXT: setns %al +; AVX1-NEXT: addb %cl, %r14b +; AVX1-NEXT: jno .LBB2_115 +; AVX1-NEXT: # %bb.114: +; AVX1-NEXT: addb $127, %al +; AVX1-NEXT: movl %eax, %r14d +; AVX1-NEXT: .LBB2_115: +; AVX1-NEXT: vpextrb $8, %xmm1, %ecx +; AVX1-NEXT: vpextrb $8, %xmm0, %ebp +; AVX1-NEXT: movl %ebp, %eax +; AVX1-NEXT: addb %cl, %al +; AVX1-NEXT: setns %al +; AVX1-NEXT: addb %cl, %bpl +; AVX1-NEXT: jno .LBB2_117 +; AVX1-NEXT: # %bb.116: +; AVX1-NEXT: addb $127, %al +; AVX1-NEXT: movl %eax, %ebp +; AVX1-NEXT: .LBB2_117: +; AVX1-NEXT: vpextrb $7, %xmm1, %ecx +; AVX1-NEXT: vpextrb $7, %xmm0, %edi +; AVX1-NEXT: movl %edi, %eax +; AVX1-NEXT: addb %cl, %al +; AVX1-NEXT: setns %al +; AVX1-NEXT: addb %cl, %dil +; AVX1-NEXT: movl %edx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX1-NEXT: jno .LBB2_119 +; AVX1-NEXT: # %bb.118: +; AVX1-NEXT: addb $127, %al +; AVX1-NEXT: movl %eax, %edi +; AVX1-NEXT: .LBB2_119: +; AVX1-NEXT: vpextrb $6, %xmm1, %edx +; AVX1-NEXT: vpextrb $6, %xmm0, %eax +; AVX1-NEXT: movl %eax, %ecx +; AVX1-NEXT: addb %dl, %cl +; AVX1-NEXT: setns %cl +; AVX1-NEXT: addb %dl, %al +; AVX1-NEXT: jno .LBB2_121 +; AVX1-NEXT: # %bb.120: +; AVX1-NEXT: addb $127, %cl +; AVX1-NEXT: movl %ecx, %eax +; AVX1-NEXT: .LBB2_121: +; AVX1-NEXT: vpextrb $5, %xmm1, %ebx +; AVX1-NEXT: vpextrb $5, %xmm0, %ecx +; AVX1-NEXT: movl %ecx, %edx +; AVX1-NEXT: addb %bl, %dl +; AVX1-NEXT: setns %dl +; AVX1-NEXT: addb %bl, %cl +; AVX1-NEXT: jno .LBB2_123 +; AVX1-NEXT: # %bb.122: +; AVX1-NEXT: addb $127, %dl +; AVX1-NEXT: movl %edx, %ecx +; AVX1-NEXT: .LBB2_123: +; AVX1-NEXT: vpextrb $4, %xmm1, %esi +; AVX1-NEXT: vpextrb $4, %xmm0, %edx +; AVX1-NEXT: movl %edx, %ebx +; AVX1-NEXT: addb %sil, %bl +; AVX1-NEXT: setns %bl +; AVX1-NEXT: addb %sil, %dl +; AVX1-NEXT: jno .LBB2_125 +; AVX1-NEXT: # %bb.124: +; AVX1-NEXT: addb $127, %bl +; AVX1-NEXT: movl %ebx, %edx +; AVX1-NEXT: .LBB2_125: +; AVX1-NEXT: vpextrb $3, %xmm1, %esi +; AVX1-NEXT: vpextrb $3, %xmm0, %r8d +; AVX1-NEXT: movl %r8d, %ebx +; AVX1-NEXT: addb %sil, %bl +; AVX1-NEXT: setns %bl +; AVX1-NEXT: addb %sil, %r8b +; AVX1-NEXT: movl %r9d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX1-NEXT: jno .LBB2_127 +; AVX1-NEXT: # %bb.126: +; AVX1-NEXT: addb $127, %bl +; AVX1-NEXT: movl %ebx, %r8d +; AVX1-NEXT: .LBB2_127: +; AVX1-NEXT: vpextrb $2, %xmm1, %esi +; AVX1-NEXT: vpextrb $2, %xmm0, %r9d +; AVX1-NEXT: movl %r9d, %ebx +; AVX1-NEXT: addb %sil, %bl +; AVX1-NEXT: setns %bl +; AVX1-NEXT: addb %sil, %r9b +; AVX1-NEXT: jno .LBB2_129 +; AVX1-NEXT: # %bb.128: +; AVX1-NEXT: addb $127, %bl +; AVX1-NEXT: movl %ebx, %r9d +; AVX1-NEXT: .LBB2_129: +; AVX1-NEXT: vpextrb $0, %xmm1, %esi +; AVX1-NEXT: vpextrb $0, %xmm0, %r10d +; AVX1-NEXT: movl %r10d, %ebx +; AVX1-NEXT: addb %sil, %bl +; AVX1-NEXT: setns %bl +; AVX1-NEXT: movl %r11d, %r12d +; 
AVX1-NEXT: addb %sil, %r10b +; AVX1-NEXT: jno .LBB2_131 +; AVX1-NEXT: # %bb.130: +; AVX1-NEXT: addb $127, %bl +; AVX1-NEXT: movl %ebx, %r10d +; AVX1-NEXT: .LBB2_131: +; AVX1-NEXT: vpextrb $1, %xmm1, %esi +; AVX1-NEXT: vpextrb $1, %xmm0, %r11d +; AVX1-NEXT: movl %r11d, %ebx +; AVX1-NEXT: addb %sil, %bl +; AVX1-NEXT: setns %bl +; AVX1-NEXT: addb %sil, %r11b +; AVX1-NEXT: jno .LBB2_133 +; AVX1-NEXT: # %bb.132: +; AVX1-NEXT: addb $127, %bl +; AVX1-NEXT: movl %ebx, %r11d +; AVX1-NEXT: .LBB2_133: +; AVX1-NEXT: movzbl %r10b, %esi +; AVX1-NEXT: vmovd %esi, %xmm0 +; AVX1-NEXT: movzbl %r11b, %esi +; AVX1-NEXT: vpinsrb $1, %esi, %xmm0, %xmm0 +; AVX1-NEXT: movzbl %r9b, %esi +; AVX1-NEXT: vpinsrb $2, %esi, %xmm0, %xmm0 +; AVX1-NEXT: movzbl %r8b, %esi +; AVX1-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; AVX1-NEXT: movzbl %dl, %edx +; AVX1-NEXT: vpinsrb $4, %edx, %xmm0, %xmm0 +; AVX1-NEXT: movzbl %cl, %ecx +; AVX1-NEXT: vpinsrb $5, %ecx, %xmm0, %xmm0 +; AVX1-NEXT: movzbl %al, %eax +; AVX1-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 +; AVX1-NEXT: movzbl %dil, %eax +; AVX1-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; AVX1-NEXT: movzbl %bpl, %eax +; AVX1-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0 +; AVX1-NEXT: movzbl %r14b, %eax +; AVX1-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0 +; AVX1-NEXT: movzbl %r15b, %eax +; AVX1-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0 +; AVX1-NEXT: movzbl %r13b, %eax +; AVX1-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; AVX1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX1-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0 +; AVX1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX1-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0 +; AVX1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX1-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0 +; AVX1-NEXT: movzbl %r12b, %eax +; AVX1-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; AVX1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX1-NEXT: vmovd %eax, %xmm1 +; AVX1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX1-NEXT: vpinsrb $1, %eax, %xmm1, %xmm1 +; AVX1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX1-NEXT: vpinsrb $2, %eax, %xmm1, %xmm1 +; AVX1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX1-NEXT: vpinsrb $3, %eax, %xmm1, %xmm1 +; AVX1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX1-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 +; AVX1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX1-NEXT: vpinsrb $5, %eax, %xmm1, %xmm1 +; AVX1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX1-NEXT: vpinsrb $6, %eax, %xmm1, %xmm1 +; AVX1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX1-NEXT: vpinsrb $7, %eax, %xmm1, %xmm1 +; AVX1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX1-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; AVX1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX1-NEXT: vpinsrb $9, %eax, %xmm1, %xmm1 +; AVX1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX1-NEXT: vpinsrb $10, %eax, %xmm1, %xmm1 +; AVX1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX1-NEXT: vpinsrb $11, %eax, %xmm1, %xmm1 +; AVX1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX1-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 +; AVX1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX1-NEXT: vpinsrb $13, %eax, %xmm1, %xmm1 +; AVX1-NEXT: movzbl 
{{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX1-NEXT: vpinsrb $14, %eax, %xmm1, %xmm1 +; AVX1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX1-NEXT: vpinsrb $15, %eax, %xmm1, %xmm1 +; AVX1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX1-NEXT: vmovd %eax, %xmm2 +; AVX1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX1-NEXT: vpinsrb $1, %eax, %xmm2, %xmm2 +; AVX1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX1-NEXT: vpinsrb $2, %eax, %xmm2, %xmm2 +; AVX1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX1-NEXT: vpinsrb $3, %eax, %xmm2, %xmm2 +; AVX1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX1-NEXT: vpinsrb $4, %eax, %xmm2, %xmm2 +; AVX1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX1-NEXT: vpinsrb $5, %eax, %xmm2, %xmm2 +; AVX1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX1-NEXT: vpinsrb $6, %eax, %xmm2, %xmm2 +; AVX1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX1-NEXT: vpinsrb $7, %eax, %xmm2, %xmm2 +; AVX1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX1-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2 +; AVX1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX1-NEXT: vpinsrb $9, %eax, %xmm2, %xmm2 +; AVX1-NEXT: movzbl (%rsp), %eax # 1-byte Folded Reload +; AVX1-NEXT: vpinsrb $10, %eax, %xmm2, %xmm2 +; AVX1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX1-NEXT: vpinsrb $11, %eax, %xmm2, %xmm2 +; AVX1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX1-NEXT: vpinsrb $12, %eax, %xmm2, %xmm2 +; AVX1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX1-NEXT: vpinsrb $13, %eax, %xmm2, %xmm2 +; AVX1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX1-NEXT: vpinsrb $14, %eax, %xmm2, %xmm2 +; AVX1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX1-NEXT: vpinsrb $15, %eax, %xmm2, %xmm2 +; AVX1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX1-NEXT: vmovd %eax, %xmm3 +; AVX1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX1-NEXT: vpinsrb $1, %eax, %xmm3, %xmm3 +; AVX1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX1-NEXT: vpinsrb $2, %eax, %xmm3, %xmm3 +; AVX1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX1-NEXT: vpinsrb $3, %eax, %xmm3, %xmm3 +; AVX1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX1-NEXT: vpinsrb $4, %eax, %xmm3, %xmm3 +; AVX1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX1-NEXT: vpinsrb $5, %eax, %xmm3, %xmm3 +; AVX1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX1-NEXT: vpinsrb $6, %eax, %xmm3, %xmm3 +; AVX1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX1-NEXT: vpinsrb $7, %eax, %xmm3, %xmm3 +; AVX1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX1-NEXT: vpinsrb $8, %eax, %xmm3, %xmm3 +; AVX1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX1-NEXT: vpinsrb $9, %eax, %xmm3, %xmm3 +; AVX1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX1-NEXT: vpinsrb $10, %eax, %xmm3, %xmm3 +; AVX1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX1-NEXT: vpinsrb $11, %eax, %xmm3, %xmm3 +; 
AVX1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX1-NEXT: vpinsrb $12, %eax, %xmm3, %xmm3 +; AVX1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX1-NEXT: vpinsrb $13, %eax, %xmm3, %xmm3 +; AVX1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX1-NEXT: vpinsrb $14, %eax, %xmm3, %xmm3 +; AVX1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX1-NEXT: vpinsrb $15, %eax, %xmm3, %xmm3 +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm1 +; AVX1-NEXT: addq $76, %rsp +; AVX1-NEXT: popq %rbx +; AVX1-NEXT: popq %r12 +; AVX1-NEXT: popq %r13 +; AVX1-NEXT: popq %r14 +; AVX1-NEXT: popq %r15 +; AVX1-NEXT: popq %rbp +; AVX1-NEXT: retq +; +; AVX2-LABEL: v64i8: +; AVX2: # %bb.0: +; AVX2-NEXT: pushq %rbp +; AVX2-NEXT: pushq %r15 +; AVX2-NEXT: pushq %r14 +; AVX2-NEXT: pushq %r13 +; AVX2-NEXT: pushq %r12 +; AVX2-NEXT: pushq %rbx +; AVX2-NEXT: subq $76, %rsp +; AVX2-NEXT: vpextrb $15, %xmm3, %ecx +; AVX2-NEXT: vpextrb $15, %xmm1, %edx +; AVX2-NEXT: movl %edx, %eax +; AVX2-NEXT: addb %cl, %al +; AVX2-NEXT: setns %al +; AVX2-NEXT: addb %cl, %dl +; AVX2-NEXT: jo .LBB2_1 +; AVX2-NEXT: # %bb.2: +; AVX2-NEXT: movl %edx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX2-NEXT: jmp .LBB2_3 +; AVX2-NEXT: .LBB2_1: +; AVX2-NEXT: addb $127, %al +; AVX2-NEXT: # kill: def $al killed $al def $eax +; AVX2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX2-NEXT: .LBB2_3: +; AVX2-NEXT: vpextrb $14, %xmm3, %ecx +; AVX2-NEXT: vpextrb $14, %xmm1, %edx +; AVX2-NEXT: movl %edx, %eax +; AVX2-NEXT: addb %cl, %al +; AVX2-NEXT: setns %al +; AVX2-NEXT: addb %cl, %dl +; AVX2-NEXT: jno .LBB2_5 +; AVX2-NEXT: # %bb.4: +; AVX2-NEXT: addb $127, %al +; AVX2-NEXT: movl %eax, %edx +; AVX2-NEXT: .LBB2_5: +; AVX2-NEXT: vpextrb $13, %xmm3, %ecx +; AVX2-NEXT: vpextrb $13, %xmm1, %esi +; AVX2-NEXT: movl %esi, %eax +; AVX2-NEXT: addb %cl, %al +; AVX2-NEXT: setns %al +; AVX2-NEXT: addb %cl, %sil +; AVX2-NEXT: jo .LBB2_6 +; AVX2-NEXT: # %bb.7: +; AVX2-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX2-NEXT: jmp .LBB2_8 +; AVX2-NEXT: .LBB2_6: +; AVX2-NEXT: addb $127, %al +; AVX2-NEXT: # kill: def $al killed $al def $eax +; AVX2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX2-NEXT: .LBB2_8: +; AVX2-NEXT: vpextrb $12, %xmm3, %ecx +; AVX2-NEXT: vpextrb $12, %xmm1, %esi +; AVX2-NEXT: movl %esi, %eax +; AVX2-NEXT: addb %cl, %al +; AVX2-NEXT: setns %al +; AVX2-NEXT: addb %cl, %sil +; AVX2-NEXT: jno .LBB2_10 +; AVX2-NEXT: # %bb.9: +; AVX2-NEXT: addb $127, %al +; AVX2-NEXT: movl %eax, %esi +; AVX2-NEXT: .LBB2_10: +; AVX2-NEXT: vpextrb $11, %xmm3, %ecx +; AVX2-NEXT: vpextrb $11, %xmm1, %edi +; AVX2-NEXT: movl %edi, %eax +; AVX2-NEXT: addb %cl, %al +; AVX2-NEXT: setns %al +; AVX2-NEXT: addb %cl, %dil +; AVX2-NEXT: jno .LBB2_12 +; AVX2-NEXT: # %bb.11: +; AVX2-NEXT: addb $127, %al +; AVX2-NEXT: movl %eax, %edi +; AVX2-NEXT: .LBB2_12: +; AVX2-NEXT: vpextrb $10, %xmm3, %ecx +; AVX2-NEXT: vpextrb $10, %xmm1, %ebp +; AVX2-NEXT: movl %ebp, %eax +; AVX2-NEXT: addb %cl, %al +; AVX2-NEXT: setns %al +; AVX2-NEXT: addb %cl, %bpl +; AVX2-NEXT: jno .LBB2_14 +; AVX2-NEXT: # %bb.13: +; AVX2-NEXT: addb $127, %al +; AVX2-NEXT: movl %eax, %ebp +; AVX2-NEXT: .LBB2_14: +; AVX2-NEXT: vpextrb $9, %xmm3, %ecx +; AVX2-NEXT: vpextrb $9, %xmm1, %ebx +; AVX2-NEXT: movl %ebx, %eax +; AVX2-NEXT: addb %cl, %al +; AVX2-NEXT: setns %al +; AVX2-NEXT: addb %cl, %bl +; AVX2-NEXT: movl %ebp, {{[-0-9]+}}(%r{{[sb]}}p) # 
4-byte Spill +; AVX2-NEXT: jo .LBB2_15 +; AVX2-NEXT: # %bb.16: +; AVX2-NEXT: movl %ebx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX2-NEXT: jmp .LBB2_17 +; AVX2-NEXT: .LBB2_15: +; AVX2-NEXT: addb $127, %al +; AVX2-NEXT: # kill: def $al killed $al def $eax +; AVX2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX2-NEXT: .LBB2_17: +; AVX2-NEXT: vpextrb $8, %xmm3, %ecx +; AVX2-NEXT: vpextrb $8, %xmm1, %ebp +; AVX2-NEXT: movl %ebp, %eax +; AVX2-NEXT: addb %cl, %al +; AVX2-NEXT: setns %al +; AVX2-NEXT: addb %cl, %bpl +; AVX2-NEXT: jno .LBB2_19 +; AVX2-NEXT: # %bb.18: +; AVX2-NEXT: addb $127, %al +; AVX2-NEXT: movl %eax, %ebp +; AVX2-NEXT: .LBB2_19: +; AVX2-NEXT: vpextrb $7, %xmm3, %ecx +; AVX2-NEXT: vpextrb $7, %xmm1, %ebx +; AVX2-NEXT: movl %ebx, %eax +; AVX2-NEXT: addb %cl, %al +; AVX2-NEXT: setns %al +; AVX2-NEXT: addb %cl, %bl +; AVX2-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX2-NEXT: jno .LBB2_21 +; AVX2-NEXT: # %bb.20: +; AVX2-NEXT: addb $127, %al +; AVX2-NEXT: movl %eax, %ebx +; AVX2-NEXT: .LBB2_21: +; AVX2-NEXT: vpextrb $6, %xmm3, %ecx +; AVX2-NEXT: vpextrb $6, %xmm1, %esi +; AVX2-NEXT: movl %esi, %eax +; AVX2-NEXT: addb %cl, %al +; AVX2-NEXT: setns %al +; AVX2-NEXT: addb %cl, %sil +; AVX2-NEXT: movl %edx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX2-NEXT: jno .LBB2_23 +; AVX2-NEXT: # %bb.22: +; AVX2-NEXT: addb $127, %al +; AVX2-NEXT: movl %eax, %esi +; AVX2-NEXT: .LBB2_23: +; AVX2-NEXT: vpextrb $5, %xmm3, %ecx +; AVX2-NEXT: vpextrb $5, %xmm1, %r11d +; AVX2-NEXT: movl %r11d, %eax +; AVX2-NEXT: addb %cl, %al +; AVX2-NEXT: setns %al +; AVX2-NEXT: addb %cl, %r11b +; AVX2-NEXT: movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX2-NEXT: movl %ebp, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX2-NEXT: movl %ebx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX2-NEXT: jno .LBB2_25 +; AVX2-NEXT: # %bb.24: +; AVX2-NEXT: addb $127, %al +; AVX2-NEXT: movl %eax, %r11d +; AVX2-NEXT: .LBB2_25: +; AVX2-NEXT: vpextrb $4, %xmm3, %ecx +; AVX2-NEXT: vpextrb $4, %xmm1, %r13d +; AVX2-NEXT: movl %r13d, %eax +; AVX2-NEXT: addb %cl, %al +; AVX2-NEXT: setns %al +; AVX2-NEXT: addb %cl, %r13b +; AVX2-NEXT: jno .LBB2_27 +; AVX2-NEXT: # %bb.26: +; AVX2-NEXT: addb $127, %al +; AVX2-NEXT: movl %eax, %r13d +; AVX2-NEXT: .LBB2_27: +; AVX2-NEXT: vpextrb $3, %xmm3, %ecx +; AVX2-NEXT: vpextrb $3, %xmm1, %r8d +; AVX2-NEXT: movl %r8d, %eax +; AVX2-NEXT: addb %cl, %al +; AVX2-NEXT: setns %al +; AVX2-NEXT: addb %cl, %r8b +; AVX2-NEXT: jno .LBB2_29 +; AVX2-NEXT: # %bb.28: +; AVX2-NEXT: addb $127, %al +; AVX2-NEXT: movl %eax, %r8d +; AVX2-NEXT: .LBB2_29: +; AVX2-NEXT: vpextrb $2, %xmm3, %ecx +; AVX2-NEXT: vpextrb $2, %xmm1, %ebp +; AVX2-NEXT: movl %ebp, %eax +; AVX2-NEXT: addb %cl, %al +; AVX2-NEXT: setns %al +; AVX2-NEXT: addb %cl, %bpl +; AVX2-NEXT: jno .LBB2_31 +; AVX2-NEXT: # %bb.30: +; AVX2-NEXT: addb $127, %al +; AVX2-NEXT: movl %eax, %ebp +; AVX2-NEXT: .LBB2_31: +; AVX2-NEXT: vpextrb $0, %xmm3, %ecx +; AVX2-NEXT: vpextrb $0, %xmm1, %ebx +; AVX2-NEXT: movl %ebx, %eax +; AVX2-NEXT: addb %cl, %al +; AVX2-NEXT: setns %al +; AVX2-NEXT: addb %cl, %bl +; AVX2-NEXT: jno .LBB2_33 +; AVX2-NEXT: # %bb.32: +; AVX2-NEXT: addb $127, %al +; AVX2-NEXT: movl %eax, %ebx +; AVX2-NEXT: .LBB2_33: +; AVX2-NEXT: vpextrb $1, %xmm3, %ecx +; AVX2-NEXT: vpextrb $1, %xmm1, %edi +; AVX2-NEXT: movl %edi, %eax +; AVX2-NEXT: addb %cl, %al +; AVX2-NEXT: setns %al +; AVX2-NEXT: addb %cl, %dil +; AVX2-NEXT: jno .LBB2_35 +; AVX2-NEXT: # %bb.34: +; AVX2-NEXT: addb $127, %al +; AVX2-NEXT: movl %eax, %edi +; 
AVX2-NEXT: .LBB2_35: +; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm3 +; AVX2-NEXT: vpextrb $15, %xmm3, %ecx +; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm1 +; AVX2-NEXT: vpextrb $15, %xmm1, %edx +; AVX2-NEXT: movl %edx, %eax +; AVX2-NEXT: addb %cl, %al +; AVX2-NEXT: setns %al +; AVX2-NEXT: addb %cl, %dl +; AVX2-NEXT: movl %ebp, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX2-NEXT: movl %ebx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX2-NEXT: movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX2-NEXT: jno .LBB2_37 +; AVX2-NEXT: # %bb.36: +; AVX2-NEXT: addb $127, %al +; AVX2-NEXT: movl %eax, %edx +; AVX2-NEXT: .LBB2_37: +; AVX2-NEXT: vpextrb $14, %xmm3, %ecx +; AVX2-NEXT: vpextrb $14, %xmm1, %edi +; AVX2-NEXT: movl %edi, %eax +; AVX2-NEXT: addb %cl, %al +; AVX2-NEXT: setns %al +; AVX2-NEXT: addb %cl, %dil +; AVX2-NEXT: movl %edx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX2-NEXT: jno .LBB2_39 +; AVX2-NEXT: # %bb.38: +; AVX2-NEXT: addb $127, %al +; AVX2-NEXT: movl %eax, %edi +; AVX2-NEXT: .LBB2_39: +; AVX2-NEXT: vpextrb $13, %xmm3, %ecx +; AVX2-NEXT: vpextrb $13, %xmm1, %r12d +; AVX2-NEXT: movl %r12d, %eax +; AVX2-NEXT: addb %cl, %al +; AVX2-NEXT: setns %al +; AVX2-NEXT: addb %cl, %r12b +; AVX2-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX2-NEXT: jno .LBB2_41 +; AVX2-NEXT: # %bb.40: +; AVX2-NEXT: addb $127, %al +; AVX2-NEXT: movl %eax, %r12d +; AVX2-NEXT: .LBB2_41: +; AVX2-NEXT: vpextrb $12, %xmm3, %ecx +; AVX2-NEXT: vpextrb $12, %xmm1, %r15d +; AVX2-NEXT: movl %r15d, %eax +; AVX2-NEXT: addb %cl, %al +; AVX2-NEXT: setns %al +; AVX2-NEXT: addb %cl, %r15b +; AVX2-NEXT: movl %r11d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX2-NEXT: jno .LBB2_43 +; AVX2-NEXT: # %bb.42: +; AVX2-NEXT: addb $127, %al +; AVX2-NEXT: movl %eax, %r15d +; AVX2-NEXT: .LBB2_43: +; AVX2-NEXT: vpextrb $11, %xmm3, %ecx +; AVX2-NEXT: vpextrb $11, %xmm1, %ebp +; AVX2-NEXT: movl %ebp, %eax +; AVX2-NEXT: addb %cl, %al +; AVX2-NEXT: setns %al +; AVX2-NEXT: addb %cl, %bpl +; AVX2-NEXT: movl %r13d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX2-NEXT: jno .LBB2_45 +; AVX2-NEXT: # %bb.44: +; AVX2-NEXT: addb $127, %al +; AVX2-NEXT: movl %eax, %ebp +; AVX2-NEXT: .LBB2_45: +; AVX2-NEXT: vpextrb $10, %xmm3, %ecx +; AVX2-NEXT: vpextrb $10, %xmm1, %ebx +; AVX2-NEXT: movl %ebx, %eax +; AVX2-NEXT: addb %cl, %al +; AVX2-NEXT: setns %al +; AVX2-NEXT: addb %cl, %bl +; AVX2-NEXT: jno .LBB2_47 +; AVX2-NEXT: # %bb.46: +; AVX2-NEXT: addb $127, %al +; AVX2-NEXT: movl %eax, %ebx +; AVX2-NEXT: .LBB2_47: +; AVX2-NEXT: vpextrb $9, %xmm3, %ecx +; AVX2-NEXT: vpextrb $9, %xmm1, %edx +; AVX2-NEXT: movl %edx, %eax +; AVX2-NEXT: addb %cl, %al +; AVX2-NEXT: setns %al +; AVX2-NEXT: addb %cl, %dl +; AVX2-NEXT: jno .LBB2_49 +; AVX2-NEXT: # %bb.48: +; AVX2-NEXT: addb $127, %al +; AVX2-NEXT: movl %eax, %edx +; AVX2-NEXT: .LBB2_49: +; AVX2-NEXT: vpextrb $8, %xmm3, %ecx +; AVX2-NEXT: vpextrb $8, %xmm1, %esi +; AVX2-NEXT: movl %esi, %eax +; AVX2-NEXT: addb %cl, %al +; AVX2-NEXT: setns %al +; AVX2-NEXT: addb %cl, %sil +; AVX2-NEXT: movl %ebp, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX2-NEXT: movl %ebx, (%rsp) # 4-byte Spill +; AVX2-NEXT: movl %edx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX2-NEXT: jo .LBB2_50 +; AVX2-NEXT: # %bb.51: +; AVX2-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX2-NEXT: jmp .LBB2_52 +; AVX2-NEXT: .LBB2_50: +; AVX2-NEXT: addb $127, %al +; AVX2-NEXT: # kill: def $al killed $al def $eax +; AVX2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX2-NEXT: .LBB2_52: +; AVX2-NEXT: vpextrb $7, 
%xmm3, %ecx +; AVX2-NEXT: vpextrb $7, %xmm1, %r11d +; AVX2-NEXT: movl %r11d, %eax +; AVX2-NEXT: addb %cl, %al +; AVX2-NEXT: setns %al +; AVX2-NEXT: addb %cl, %r11b +; AVX2-NEXT: jno .LBB2_54 +; AVX2-NEXT: # %bb.53: +; AVX2-NEXT: addb $127, %al +; AVX2-NEXT: movl %eax, %r11d +; AVX2-NEXT: .LBB2_54: +; AVX2-NEXT: vpextrb $6, %xmm3, %ecx +; AVX2-NEXT: vpextrb $6, %xmm1, %esi +; AVX2-NEXT: movl %esi, %eax +; AVX2-NEXT: addb %cl, %al +; AVX2-NEXT: setns %al +; AVX2-NEXT: addb %cl, %sil +; AVX2-NEXT: movl %r8d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX2-NEXT: movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX2-NEXT: jno .LBB2_56 +; AVX2-NEXT: # %bb.55: +; AVX2-NEXT: addb $127, %al +; AVX2-NEXT: movl %eax, %esi +; AVX2-NEXT: .LBB2_56: +; AVX2-NEXT: vpextrb $5, %xmm3, %ecx +; AVX2-NEXT: vpextrb $5, %xmm1, %edi +; AVX2-NEXT: movl %edi, %eax +; AVX2-NEXT: addb %cl, %al +; AVX2-NEXT: setns %al +; AVX2-NEXT: addb %cl, %dil +; AVX2-NEXT: jno .LBB2_58 +; AVX2-NEXT: # %bb.57: +; AVX2-NEXT: addb $127, %al +; AVX2-NEXT: movl %eax, %edi +; AVX2-NEXT: .LBB2_58: +; AVX2-NEXT: vpextrb $4, %xmm3, %ecx +; AVX2-NEXT: vpextrb $4, %xmm1, %r13d +; AVX2-NEXT: movl %r13d, %eax +; AVX2-NEXT: addb %cl, %al +; AVX2-NEXT: setns %al +; AVX2-NEXT: addb %cl, %r13b +; AVX2-NEXT: movl %r13d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX2-NEXT: jno .LBB2_60 +; AVX2-NEXT: # %bb.59: +; AVX2-NEXT: addb $127, %al +; AVX2-NEXT: # kill: def $al killed $al def $eax +; AVX2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX2-NEXT: .LBB2_60: +; AVX2-NEXT: vpextrb $3, %xmm3, %ecx +; AVX2-NEXT: vpextrb $3, %xmm1, %ebp +; AVX2-NEXT: movl %ebp, %eax +; AVX2-NEXT: addb %cl, %al +; AVX2-NEXT: setns %al +; AVX2-NEXT: addb %cl, %bpl +; AVX2-NEXT: jo .LBB2_61 +; AVX2-NEXT: # %bb.62: +; AVX2-NEXT: movl %ebp, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX2-NEXT: jmp .LBB2_63 +; AVX2-NEXT: .LBB2_61: +; AVX2-NEXT: addb $127, %al +; AVX2-NEXT: # kill: def $al killed $al def $eax +; AVX2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX2-NEXT: .LBB2_63: +; AVX2-NEXT: vpextrb $2, %xmm3, %ecx +; AVX2-NEXT: vpextrb $2, %xmm1, %ebp +; AVX2-NEXT: movl %ebp, %eax +; AVX2-NEXT: addb %cl, %al +; AVX2-NEXT: setns %al +; AVX2-NEXT: addb %cl, %bpl +; AVX2-NEXT: movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX2-NEXT: jno .LBB2_65 +; AVX2-NEXT: # %bb.64: +; AVX2-NEXT: addb $127, %al +; AVX2-NEXT: movl %eax, %ebp +; AVX2-NEXT: .LBB2_65: +; AVX2-NEXT: vpextrb $0, %xmm3, %ecx +; AVX2-NEXT: vpextrb $0, %xmm1, %edi +; AVX2-NEXT: movl %edi, %eax +; AVX2-NEXT: addb %cl, %al +; AVX2-NEXT: setns %al +; AVX2-NEXT: addb %cl, %dil +; AVX2-NEXT: movl %r15d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX2-NEXT: jno .LBB2_67 +; AVX2-NEXT: # %bb.66: +; AVX2-NEXT: addb $127, %al +; AVX2-NEXT: movl %eax, %edi +; AVX2-NEXT: .LBB2_67: +; AVX2-NEXT: vpextrb $1, %xmm3, %ecx +; AVX2-NEXT: vpextrb $1, %xmm1, %ebx +; AVX2-NEXT: movl %ebx, %eax +; AVX2-NEXT: addb %cl, %al +; AVX2-NEXT: setns %al +; AVX2-NEXT: addb %cl, %bl +; AVX2-NEXT: movl %r12d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX2-NEXT: jno .LBB2_69 +; AVX2-NEXT: # %bb.68: +; AVX2-NEXT: addb $127, %al +; AVX2-NEXT: movl %eax, %ebx +; AVX2-NEXT: .LBB2_69: +; AVX2-NEXT: vpextrb $15, %xmm2, %ecx +; AVX2-NEXT: vpextrb $15, %xmm0, %edx +; AVX2-NEXT: movl %edx, %eax +; AVX2-NEXT: addb %cl, %al +; AVX2-NEXT: setns %al +; AVX2-NEXT: addb %cl, %dl +; AVX2-NEXT: movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX2-NEXT: jno .LBB2_71 +; AVX2-NEXT: # %bb.70: +; AVX2-NEXT: addb $127, 
%al +; AVX2-NEXT: movl %eax, %edx +; AVX2-NEXT: .LBB2_71: +; AVX2-NEXT: vpextrb $14, %xmm2, %ecx +; AVX2-NEXT: vpextrb $14, %xmm0, %edi +; AVX2-NEXT: movl %edi, %eax +; AVX2-NEXT: addb %cl, %al +; AVX2-NEXT: setns %al +; AVX2-NEXT: addb %cl, %dil +; AVX2-NEXT: jno .LBB2_73 +; AVX2-NEXT: # %bb.72: +; AVX2-NEXT: addb $127, %al +; AVX2-NEXT: movl %eax, %edi +; AVX2-NEXT: .LBB2_73: +; AVX2-NEXT: vpextrb $13, %xmm2, %ecx +; AVX2-NEXT: vpextrb $13, %xmm0, %r10d +; AVX2-NEXT: movl %r10d, %eax +; AVX2-NEXT: addb %cl, %al +; AVX2-NEXT: setns %al +; AVX2-NEXT: addb %cl, %r10b +; AVX2-NEXT: jno .LBB2_75 +; AVX2-NEXT: # %bb.74: +; AVX2-NEXT: addb $127, %al +; AVX2-NEXT: movl %eax, %r10d +; AVX2-NEXT: .LBB2_75: +; AVX2-NEXT: vpextrb $12, %xmm2, %ecx +; AVX2-NEXT: vpextrb $12, %xmm0, %r12d +; AVX2-NEXT: movl %r12d, %eax +; AVX2-NEXT: addb %cl, %al +; AVX2-NEXT: setns %al +; AVX2-NEXT: addb %cl, %r12b +; AVX2-NEXT: movl %r12d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX2-NEXT: jno .LBB2_77 +; AVX2-NEXT: # %bb.76: +; AVX2-NEXT: addb $127, %al +; AVX2-NEXT: # kill: def $al killed $al def $eax +; AVX2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX2-NEXT: .LBB2_77: +; AVX2-NEXT: vpextrb $11, %xmm2, %ecx +; AVX2-NEXT: vpextrb $11, %xmm0, %r14d +; AVX2-NEXT: movl %r14d, %eax +; AVX2-NEXT: addb %cl, %al +; AVX2-NEXT: setns %al +; AVX2-NEXT: addb %cl, %r14b +; AVX2-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX2-NEXT: jno .LBB2_79 +; AVX2-NEXT: # %bb.78: +; AVX2-NEXT: addb $127, %al +; AVX2-NEXT: movl %eax, %r14d +; AVX2-NEXT: .LBB2_79: +; AVX2-NEXT: vpextrb $10, %xmm2, %ecx +; AVX2-NEXT: vpextrb $10, %xmm0, %r13d +; AVX2-NEXT: movl %r13d, %eax +; AVX2-NEXT: addb %cl, %al +; AVX2-NEXT: setns %al +; AVX2-NEXT: addb %cl, %r13b +; AVX2-NEXT: movl %r11d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX2-NEXT: movl %ebp, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX2-NEXT: movl %ebx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX2-NEXT: movl %edx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX2-NEXT: jno .LBB2_81 +; AVX2-NEXT: # %bb.80: +; AVX2-NEXT: addb $127, %al +; AVX2-NEXT: movl %eax, %r13d +; AVX2-NEXT: .LBB2_81: +; AVX2-NEXT: vpextrb $9, %xmm2, %ecx +; AVX2-NEXT: vpextrb $9, %xmm0, %r8d +; AVX2-NEXT: movl %r8d, %eax +; AVX2-NEXT: addb %cl, %al +; AVX2-NEXT: setns %al +; AVX2-NEXT: addb %cl, %r8b +; AVX2-NEXT: jno .LBB2_83 +; AVX2-NEXT: # %bb.82: +; AVX2-NEXT: addb $127, %al +; AVX2-NEXT: movl %eax, %r8d +; AVX2-NEXT: .LBB2_83: +; AVX2-NEXT: vpextrb $8, %xmm2, %ecx +; AVX2-NEXT: vpextrb $8, %xmm0, %r15d +; AVX2-NEXT: movl %r15d, %eax +; AVX2-NEXT: addb %cl, %al +; AVX2-NEXT: setns %al +; AVX2-NEXT: addb %cl, %r15b +; AVX2-NEXT: jno .LBB2_85 +; AVX2-NEXT: # %bb.84: +; AVX2-NEXT: addb $127, %al +; AVX2-NEXT: movl %eax, %r15d +; AVX2-NEXT: .LBB2_85: +; AVX2-NEXT: vpextrb $7, %xmm2, %ecx +; AVX2-NEXT: vpextrb $7, %xmm0, %r12d +; AVX2-NEXT: movl %r12d, %eax +; AVX2-NEXT: addb %cl, %al +; AVX2-NEXT: setns %al +; AVX2-NEXT: addb %cl, %r12b +; AVX2-NEXT: movl %r12d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX2-NEXT: jno .LBB2_87 +; AVX2-NEXT: # %bb.86: +; AVX2-NEXT: addb $127, %al +; AVX2-NEXT: # kill: def $al killed $al def $eax +; AVX2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX2-NEXT: .LBB2_87: +; AVX2-NEXT: vpextrb $6, %xmm2, %ecx +; AVX2-NEXT: vpextrb $6, %xmm0, %r12d +; AVX2-NEXT: movl %r12d, %eax +; AVX2-NEXT: addb %cl, %al +; AVX2-NEXT: setns %al +; AVX2-NEXT: addb %cl, %r12b +; AVX2-NEXT: jno .LBB2_89 +; AVX2-NEXT: # %bb.88: +; AVX2-NEXT: addb $127, 
%al +; AVX2-NEXT: movl %eax, %r12d +; AVX2-NEXT: .LBB2_89: +; AVX2-NEXT: vpextrb $5, %xmm2, %ecx +; AVX2-NEXT: vpextrb $5, %xmm0, %ebp +; AVX2-NEXT: movl %ebp, %eax +; AVX2-NEXT: addb %cl, %al +; AVX2-NEXT: setns %al +; AVX2-NEXT: addb %cl, %bpl +; AVX2-NEXT: jno .LBB2_91 +; AVX2-NEXT: # %bb.90: +; AVX2-NEXT: addb $127, %al +; AVX2-NEXT: movl %eax, %ebp +; AVX2-NEXT: .LBB2_91: +; AVX2-NEXT: vpextrb $4, %xmm2, %ecx +; AVX2-NEXT: vpextrb $4, %xmm0, %esi +; AVX2-NEXT: movl %esi, %eax +; AVX2-NEXT: addb %cl, %al +; AVX2-NEXT: setns %al +; AVX2-NEXT: addb %cl, %sil +; AVX2-NEXT: jno .LBB2_93 +; AVX2-NEXT: # %bb.92: +; AVX2-NEXT: addb $127, %al +; AVX2-NEXT: movl %eax, %esi +; AVX2-NEXT: .LBB2_93: +; AVX2-NEXT: vpextrb $3, %xmm2, %ecx +; AVX2-NEXT: vpextrb $3, %xmm0, %edx +; AVX2-NEXT: movl %edx, %eax +; AVX2-NEXT: addb %cl, %al +; AVX2-NEXT: setns %al +; AVX2-NEXT: addb %cl, %dl +; AVX2-NEXT: movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX2-NEXT: jno .LBB2_95 +; AVX2-NEXT: # %bb.94: +; AVX2-NEXT: addb $127, %al +; AVX2-NEXT: movl %eax, %edx +; AVX2-NEXT: .LBB2_95: +; AVX2-NEXT: vpextrb $2, %xmm2, %ecx +; AVX2-NEXT: vpextrb $2, %xmm0, %edi +; AVX2-NEXT: movl %edi, %eax +; AVX2-NEXT: addb %cl, %al +; AVX2-NEXT: setns %al +; AVX2-NEXT: addb %cl, %dil +; AVX2-NEXT: jno .LBB2_97 +; AVX2-NEXT: # %bb.96: +; AVX2-NEXT: addb $127, %al +; AVX2-NEXT: movl %eax, %edi +; AVX2-NEXT: .LBB2_97: +; AVX2-NEXT: vpextrb $0, %xmm2, %ecx +; AVX2-NEXT: vpextrb $0, %xmm0, %ebx +; AVX2-NEXT: movl %ebx, %eax +; AVX2-NEXT: addb %cl, %al +; AVX2-NEXT: setns %al +; AVX2-NEXT: addb %cl, %bl +; AVX2-NEXT: jno .LBB2_99 +; AVX2-NEXT: # %bb.98: +; AVX2-NEXT: addb $127, %al +; AVX2-NEXT: movl %eax, %ebx +; AVX2-NEXT: .LBB2_99: +; AVX2-NEXT: vpextrb $1, %xmm2, %ecx +; AVX2-NEXT: vpextrb $1, %xmm0, %r11d +; AVX2-NEXT: movl %r11d, %eax +; AVX2-NEXT: addb %cl, %al +; AVX2-NEXT: setns %al +; AVX2-NEXT: addb %cl, %r11b +; AVX2-NEXT: movl %r11d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX2-NEXT: jno .LBB2_101 +; AVX2-NEXT: # %bb.100: +; AVX2-NEXT: addb $127, %al +; AVX2-NEXT: # kill: def $al killed $al def $eax +; AVX2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX2-NEXT: .LBB2_101: +; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm1 +; AVX2-NEXT: vpextrb $15, %xmm1, %ecx +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX2-NEXT: vpextrb $15, %xmm0, %r11d +; AVX2-NEXT: movl %r11d, %eax +; AVX2-NEXT: addb %cl, %al +; AVX2-NEXT: setns %al +; AVX2-NEXT: addb %cl, %r11b +; AVX2-NEXT: jno .LBB2_103 +; AVX2-NEXT: # %bb.102: +; AVX2-NEXT: addb $127, %al +; AVX2-NEXT: movl %eax, %r11d +; AVX2-NEXT: .LBB2_103: +; AVX2-NEXT: vpextrb $14, %xmm1, %ecx +; AVX2-NEXT: vpextrb $14, %xmm0, %r9d +; AVX2-NEXT: movl %r9d, %eax +; AVX2-NEXT: addb %cl, %al +; AVX2-NEXT: setns %al +; AVX2-NEXT: addb %cl, %r9b +; AVX2-NEXT: movl %r9d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX2-NEXT: jno .LBB2_105 +; AVX2-NEXT: # %bb.104: +; AVX2-NEXT: addb $127, %al +; AVX2-NEXT: # kill: def $al killed $al def $eax +; AVX2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX2-NEXT: .LBB2_105: +; AVX2-NEXT: vpextrb $13, %xmm1, %ecx +; AVX2-NEXT: vpextrb $13, %xmm0, %r9d +; AVX2-NEXT: movl %r9d, %eax +; AVX2-NEXT: addb %cl, %al +; AVX2-NEXT: setns %al +; AVX2-NEXT: addb %cl, %r9b +; AVX2-NEXT: movl %r14d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX2-NEXT: movl %r13d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX2-NEXT: movl %r8d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX2-NEXT: movl %r15d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; 
AVX2-NEXT: movl %r12d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX2-NEXT: movl %ebp, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX2-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX2-NEXT: movl %edx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX2-NEXT: movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX2-NEXT: movl %ebx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX2-NEXT: jno .LBB2_107 +; AVX2-NEXT: # %bb.106: +; AVX2-NEXT: addb $127, %al +; AVX2-NEXT: movl %eax, %r9d +; AVX2-NEXT: .LBB2_107: +; AVX2-NEXT: vpextrb $12, %xmm1, %ecx +; AVX2-NEXT: vpextrb $12, %xmm0, %edx +; AVX2-NEXT: movl %edx, %eax +; AVX2-NEXT: addb %cl, %al +; AVX2-NEXT: setns %al +; AVX2-NEXT: addb %cl, %dl +; AVX2-NEXT: movl %r10d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX2-NEXT: jno .LBB2_109 +; AVX2-NEXT: # %bb.108: +; AVX2-NEXT: addb $127, %al +; AVX2-NEXT: movl %eax, %edx +; AVX2-NEXT: .LBB2_109: +; AVX2-NEXT: vpextrb $11, %xmm1, %ecx +; AVX2-NEXT: vpextrb $11, %xmm0, %r13d +; AVX2-NEXT: movl %r13d, %eax +; AVX2-NEXT: addb %cl, %al +; AVX2-NEXT: setns %al +; AVX2-NEXT: addb %cl, %r13b +; AVX2-NEXT: jno .LBB2_111 +; AVX2-NEXT: # %bb.110: +; AVX2-NEXT: addb $127, %al +; AVX2-NEXT: movl %eax, %r13d +; AVX2-NEXT: .LBB2_111: +; AVX2-NEXT: vpextrb $10, %xmm1, %ecx +; AVX2-NEXT: vpextrb $10, %xmm0, %r15d +; AVX2-NEXT: movl %r15d, %eax +; AVX2-NEXT: addb %cl, %al +; AVX2-NEXT: setns %al +; AVX2-NEXT: addb %cl, %r15b +; AVX2-NEXT: jno .LBB2_113 +; AVX2-NEXT: # %bb.112: +; AVX2-NEXT: addb $127, %al +; AVX2-NEXT: movl %eax, %r15d +; AVX2-NEXT: .LBB2_113: +; AVX2-NEXT: vpextrb $9, %xmm1, %ecx +; AVX2-NEXT: vpextrb $9, %xmm0, %r14d +; AVX2-NEXT: movl %r14d, %eax +; AVX2-NEXT: addb %cl, %al +; AVX2-NEXT: setns %al +; AVX2-NEXT: addb %cl, %r14b +; AVX2-NEXT: jno .LBB2_115 +; AVX2-NEXT: # %bb.114: +; AVX2-NEXT: addb $127, %al +; AVX2-NEXT: movl %eax, %r14d +; AVX2-NEXT: .LBB2_115: +; AVX2-NEXT: vpextrb $8, %xmm1, %ecx +; AVX2-NEXT: vpextrb $8, %xmm0, %ebp +; AVX2-NEXT: movl %ebp, %eax +; AVX2-NEXT: addb %cl, %al +; AVX2-NEXT: setns %al +; AVX2-NEXT: addb %cl, %bpl +; AVX2-NEXT: jno .LBB2_117 +; AVX2-NEXT: # %bb.116: +; AVX2-NEXT: addb $127, %al +; AVX2-NEXT: movl %eax, %ebp +; AVX2-NEXT: .LBB2_117: +; AVX2-NEXT: vpextrb $7, %xmm1, %ecx +; AVX2-NEXT: vpextrb $7, %xmm0, %edi +; AVX2-NEXT: movl %edi, %eax +; AVX2-NEXT: addb %cl, %al +; AVX2-NEXT: setns %al +; AVX2-NEXT: addb %cl, %dil +; AVX2-NEXT: movl %edx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX2-NEXT: jno .LBB2_119 +; AVX2-NEXT: # %bb.118: +; AVX2-NEXT: addb $127, %al +; AVX2-NEXT: movl %eax, %edi +; AVX2-NEXT: .LBB2_119: +; AVX2-NEXT: vpextrb $6, %xmm1, %edx +; AVX2-NEXT: vpextrb $6, %xmm0, %eax +; AVX2-NEXT: movl %eax, %ecx +; AVX2-NEXT: addb %dl, %cl +; AVX2-NEXT: setns %cl +; AVX2-NEXT: addb %dl, %al +; AVX2-NEXT: jno .LBB2_121 +; AVX2-NEXT: # %bb.120: +; AVX2-NEXT: addb $127, %cl +; AVX2-NEXT: movl %ecx, %eax +; AVX2-NEXT: .LBB2_121: +; AVX2-NEXT: vpextrb $5, %xmm1, %ebx +; AVX2-NEXT: vpextrb $5, %xmm0, %ecx +; AVX2-NEXT: movl %ecx, %edx +; AVX2-NEXT: addb %bl, %dl +; AVX2-NEXT: setns %dl +; AVX2-NEXT: addb %bl, %cl +; AVX2-NEXT: jno .LBB2_123 +; AVX2-NEXT: # %bb.122: +; AVX2-NEXT: addb $127, %dl +; AVX2-NEXT: movl %edx, %ecx +; AVX2-NEXT: .LBB2_123: +; AVX2-NEXT: vpextrb $4, %xmm1, %esi +; AVX2-NEXT: vpextrb $4, %xmm0, %edx +; AVX2-NEXT: movl %edx, %ebx +; AVX2-NEXT: addb %sil, %bl +; AVX2-NEXT: setns %bl +; AVX2-NEXT: addb %sil, %dl +; AVX2-NEXT: jno .LBB2_125 +; AVX2-NEXT: # %bb.124: +; AVX2-NEXT: addb $127, %bl +; AVX2-NEXT: movl 
%ebx, %edx +; AVX2-NEXT: .LBB2_125: +; AVX2-NEXT: vpextrb $3, %xmm1, %esi +; AVX2-NEXT: vpextrb $3, %xmm0, %r8d +; AVX2-NEXT: movl %r8d, %ebx +; AVX2-NEXT: addb %sil, %bl +; AVX2-NEXT: setns %bl +; AVX2-NEXT: addb %sil, %r8b +; AVX2-NEXT: movl %r9d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX2-NEXT: jno .LBB2_127 +; AVX2-NEXT: # %bb.126: +; AVX2-NEXT: addb $127, %bl +; AVX2-NEXT: movl %ebx, %r8d +; AVX2-NEXT: .LBB2_127: +; AVX2-NEXT: vpextrb $2, %xmm1, %esi +; AVX2-NEXT: vpextrb $2, %xmm0, %r9d +; AVX2-NEXT: movl %r9d, %ebx +; AVX2-NEXT: addb %sil, %bl +; AVX2-NEXT: setns %bl +; AVX2-NEXT: addb %sil, %r9b +; AVX2-NEXT: jno .LBB2_129 +; AVX2-NEXT: # %bb.128: +; AVX2-NEXT: addb $127, %bl +; AVX2-NEXT: movl %ebx, %r9d +; AVX2-NEXT: .LBB2_129: +; AVX2-NEXT: vpextrb $0, %xmm1, %esi +; AVX2-NEXT: vpextrb $0, %xmm0, %r10d +; AVX2-NEXT: movl %r10d, %ebx +; AVX2-NEXT: addb %sil, %bl +; AVX2-NEXT: setns %bl +; AVX2-NEXT: movl %r11d, %r12d +; AVX2-NEXT: addb %sil, %r10b +; AVX2-NEXT: jno .LBB2_131 +; AVX2-NEXT: # %bb.130: +; AVX2-NEXT: addb $127, %bl +; AVX2-NEXT: movl %ebx, %r10d +; AVX2-NEXT: .LBB2_131: +; AVX2-NEXT: vpextrb $1, %xmm1, %esi +; AVX2-NEXT: vpextrb $1, %xmm0, %r11d +; AVX2-NEXT: movl %r11d, %ebx +; AVX2-NEXT: addb %sil, %bl +; AVX2-NEXT: setns %bl +; AVX2-NEXT: addb %sil, %r11b +; AVX2-NEXT: jno .LBB2_133 +; AVX2-NEXT: # %bb.132: +; AVX2-NEXT: addb $127, %bl +; AVX2-NEXT: movl %ebx, %r11d +; AVX2-NEXT: .LBB2_133: +; AVX2-NEXT: movzbl %r10b, %esi +; AVX2-NEXT: vmovd %esi, %xmm0 +; AVX2-NEXT: movzbl %r11b, %esi +; AVX2-NEXT: vpinsrb $1, %esi, %xmm0, %xmm0 +; AVX2-NEXT: movzbl %r9b, %esi +; AVX2-NEXT: vpinsrb $2, %esi, %xmm0, %xmm0 +; AVX2-NEXT: movzbl %r8b, %esi +; AVX2-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; AVX2-NEXT: movzbl %dl, %edx +; AVX2-NEXT: vpinsrb $4, %edx, %xmm0, %xmm0 +; AVX2-NEXT: movzbl %cl, %ecx +; AVX2-NEXT: vpinsrb $5, %ecx, %xmm0, %xmm0 +; AVX2-NEXT: movzbl %al, %eax +; AVX2-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 +; AVX2-NEXT: movzbl %dil, %eax +; AVX2-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; AVX2-NEXT: movzbl %bpl, %eax +; AVX2-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0 +; AVX2-NEXT: movzbl %r14b, %eax +; AVX2-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0 +; AVX2-NEXT: movzbl %r15b, %eax +; AVX2-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0 +; AVX2-NEXT: movzbl %r13b, %eax +; AVX2-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; AVX2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX2-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0 +; AVX2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX2-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0 +; AVX2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX2-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0 +; AVX2-NEXT: movzbl %r12b, %eax +; AVX2-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; AVX2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX2-NEXT: vmovd %eax, %xmm1 +; AVX2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX2-NEXT: vpinsrb $1, %eax, %xmm1, %xmm1 +; AVX2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX2-NEXT: vpinsrb $2, %eax, %xmm1, %xmm1 +; AVX2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX2-NEXT: vpinsrb $3, %eax, %xmm1, %xmm1 +; AVX2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX2-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 +; AVX2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX2-NEXT: vpinsrb $5, %eax, %xmm1, %xmm1 +; AVX2-NEXT: movzbl 
{{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX2-NEXT: vpinsrb $6, %eax, %xmm1, %xmm1 +; AVX2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX2-NEXT: vpinsrb $7, %eax, %xmm1, %xmm1 +; AVX2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX2-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; AVX2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX2-NEXT: vpinsrb $9, %eax, %xmm1, %xmm1 +; AVX2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX2-NEXT: vpinsrb $10, %eax, %xmm1, %xmm1 +; AVX2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX2-NEXT: vpinsrb $11, %eax, %xmm1, %xmm1 +; AVX2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX2-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 +; AVX2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX2-NEXT: vpinsrb $13, %eax, %xmm1, %xmm1 +; AVX2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX2-NEXT: vpinsrb $14, %eax, %xmm1, %xmm1 +; AVX2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX2-NEXT: vpinsrb $15, %eax, %xmm1, %xmm1 +; AVX2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX2-NEXT: vmovd %eax, %xmm2 +; AVX2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX2-NEXT: vpinsrb $1, %eax, %xmm2, %xmm2 +; AVX2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX2-NEXT: vpinsrb $2, %eax, %xmm2, %xmm2 +; AVX2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX2-NEXT: vpinsrb $3, %eax, %xmm2, %xmm2 +; AVX2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX2-NEXT: vpinsrb $4, %eax, %xmm2, %xmm2 +; AVX2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX2-NEXT: vpinsrb $5, %eax, %xmm2, %xmm2 +; AVX2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX2-NEXT: vpinsrb $6, %eax, %xmm2, %xmm2 +; AVX2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX2-NEXT: vpinsrb $7, %eax, %xmm2, %xmm2 +; AVX2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX2-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2 +; AVX2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX2-NEXT: vpinsrb $9, %eax, %xmm2, %xmm2 +; AVX2-NEXT: movzbl (%rsp), %eax # 1-byte Folded Reload +; AVX2-NEXT: vpinsrb $10, %eax, %xmm2, %xmm2 +; AVX2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX2-NEXT: vpinsrb $11, %eax, %xmm2, %xmm2 +; AVX2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX2-NEXT: vpinsrb $12, %eax, %xmm2, %xmm2 +; AVX2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX2-NEXT: vpinsrb $13, %eax, %xmm2, %xmm2 +; AVX2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX2-NEXT: vpinsrb $14, %eax, %xmm2, %xmm2 +; AVX2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX2-NEXT: vpinsrb $15, %eax, %xmm2, %xmm2 +; AVX2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX2-NEXT: vmovd %eax, %xmm3 +; AVX2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX2-NEXT: vpinsrb $1, %eax, %xmm3, %xmm3 +; AVX2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX2-NEXT: vpinsrb $2, %eax, %xmm3, %xmm3 +; AVX2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX2-NEXT: vpinsrb $3, %eax, %xmm3, %xmm3 +; 
AVX2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX2-NEXT: vpinsrb $4, %eax, %xmm3, %xmm3 +; AVX2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX2-NEXT: vpinsrb $5, %eax, %xmm3, %xmm3 +; AVX2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX2-NEXT: vpinsrb $6, %eax, %xmm3, %xmm3 +; AVX2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX2-NEXT: vpinsrb $7, %eax, %xmm3, %xmm3 +; AVX2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX2-NEXT: vpinsrb $8, %eax, %xmm3, %xmm3 +; AVX2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX2-NEXT: vpinsrb $9, %eax, %xmm3, %xmm3 +; AVX2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX2-NEXT: vpinsrb $10, %eax, %xmm3, %xmm3 +; AVX2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX2-NEXT: vpinsrb $11, %eax, %xmm3, %xmm3 +; AVX2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX2-NEXT: vpinsrb $12, %eax, %xmm3, %xmm3 +; AVX2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX2-NEXT: vpinsrb $13, %eax, %xmm3, %xmm3 +; AVX2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX2-NEXT: vpinsrb $14, %eax, %xmm3, %xmm3 +; AVX2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX2-NEXT: vpinsrb $15, %eax, %xmm3, %xmm3 +; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 +; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm3, %ymm1 +; AVX2-NEXT: addq $76, %rsp +; AVX2-NEXT: popq %rbx +; AVX2-NEXT: popq %r12 +; AVX2-NEXT: popq %r13 +; AVX2-NEXT: popq %r14 +; AVX2-NEXT: popq %r15 +; AVX2-NEXT: popq %rbp +; AVX2-NEXT: retq +; +; AVX512-LABEL: v64i8: +; AVX512: # %bb.0: +; AVX512-NEXT: pushq %rbp +; AVX512-NEXT: pushq %r15 +; AVX512-NEXT: pushq %r14 +; AVX512-NEXT: pushq %r13 +; AVX512-NEXT: pushq %r12 +; AVX512-NEXT: pushq %rbx +; AVX512-NEXT: subq $76, %rsp +; AVX512-NEXT: vpextrb $15, %xmm1, %ecx +; AVX512-NEXT: vpextrb $15, %xmm0, %edx +; AVX512-NEXT: movl %edx, %eax +; AVX512-NEXT: addb %cl, %al +; AVX512-NEXT: setns %al +; AVX512-NEXT: addb %cl, %dl +; AVX512-NEXT: jo .LBB2_1 +; AVX512-NEXT: # %bb.2: +; AVX512-NEXT: movl %edx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX512-NEXT: jmp .LBB2_3 +; AVX512-NEXT: .LBB2_1: +; AVX512-NEXT: addb $127, %al +; AVX512-NEXT: # kill: def $al killed $al def $eax +; AVX512-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX512-NEXT: .LBB2_3: +; AVX512-NEXT: vpextrb $14, %xmm1, %ecx +; AVX512-NEXT: vpextrb $14, %xmm0, %edx +; AVX512-NEXT: movl %edx, %eax +; AVX512-NEXT: addb %cl, %al +; AVX512-NEXT: setns %al +; AVX512-NEXT: addb %cl, %dl +; AVX512-NEXT: jno .LBB2_5 +; AVX512-NEXT: # %bb.4: +; AVX512-NEXT: addb $127, %al +; AVX512-NEXT: movl %eax, %edx +; AVX512-NEXT: .LBB2_5: +; AVX512-NEXT: vpextrb $13, %xmm1, %ecx +; AVX512-NEXT: vpextrb $13, %xmm0, %esi +; AVX512-NEXT: movl %esi, %eax +; AVX512-NEXT: addb %cl, %al +; AVX512-NEXT: setns %al +; AVX512-NEXT: addb %cl, %sil +; AVX512-NEXT: jo .LBB2_6 +; AVX512-NEXT: # %bb.7: +; AVX512-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX512-NEXT: jmp .LBB2_8 +; AVX512-NEXT: .LBB2_6: +; AVX512-NEXT: addb $127, %al +; AVX512-NEXT: # kill: def $al killed $al def $eax +; AVX512-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX512-NEXT: .LBB2_8: +; AVX512-NEXT: vpextrb $12, %xmm1, %ecx +; AVX512-NEXT: vpextrb $12, %xmm0, %esi +; AVX512-NEXT: movl %esi, %eax +; AVX512-NEXT: addb %cl, %al 
+; AVX512-NEXT: setns %al +; AVX512-NEXT: addb %cl, %sil +; AVX512-NEXT: jno .LBB2_10 +; AVX512-NEXT: # %bb.9: +; AVX512-NEXT: addb $127, %al +; AVX512-NEXT: movl %eax, %esi +; AVX512-NEXT: .LBB2_10: +; AVX512-NEXT: vpextrb $11, %xmm1, %ecx +; AVX512-NEXT: vpextrb $11, %xmm0, %edi +; AVX512-NEXT: movl %edi, %eax +; AVX512-NEXT: addb %cl, %al +; AVX512-NEXT: setns %al +; AVX512-NEXT: addb %cl, %dil +; AVX512-NEXT: jno .LBB2_12 +; AVX512-NEXT: # %bb.11: +; AVX512-NEXT: addb $127, %al +; AVX512-NEXT: movl %eax, %edi +; AVX512-NEXT: .LBB2_12: +; AVX512-NEXT: vpextrb $10, %xmm1, %ecx +; AVX512-NEXT: vpextrb $10, %xmm0, %ebp +; AVX512-NEXT: movl %ebp, %eax +; AVX512-NEXT: addb %cl, %al +; AVX512-NEXT: setns %al +; AVX512-NEXT: addb %cl, %bpl +; AVX512-NEXT: jno .LBB2_14 +; AVX512-NEXT: # %bb.13: +; AVX512-NEXT: addb $127, %al +; AVX512-NEXT: movl %eax, %ebp +; AVX512-NEXT: .LBB2_14: +; AVX512-NEXT: vpextrb $9, %xmm1, %ecx +; AVX512-NEXT: vpextrb $9, %xmm0, %ebx +; AVX512-NEXT: movl %ebx, %eax +; AVX512-NEXT: addb %cl, %al +; AVX512-NEXT: setns %al +; AVX512-NEXT: addb %cl, %bl +; AVX512-NEXT: movl %ebp, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX512-NEXT: jo .LBB2_15 +; AVX512-NEXT: # %bb.16: +; AVX512-NEXT: movl %ebx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX512-NEXT: jmp .LBB2_17 +; AVX512-NEXT: .LBB2_15: +; AVX512-NEXT: addb $127, %al +; AVX512-NEXT: # kill: def $al killed $al def $eax +; AVX512-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX512-NEXT: .LBB2_17: +; AVX512-NEXT: vpextrb $8, %xmm1, %ecx +; AVX512-NEXT: vpextrb $8, %xmm0, %ebp +; AVX512-NEXT: movl %ebp, %eax +; AVX512-NEXT: addb %cl, %al +; AVX512-NEXT: setns %al +; AVX512-NEXT: addb %cl, %bpl +; AVX512-NEXT: jno .LBB2_19 +; AVX512-NEXT: # %bb.18: +; AVX512-NEXT: addb $127, %al +; AVX512-NEXT: movl %eax, %ebp +; AVX512-NEXT: .LBB2_19: +; AVX512-NEXT: vpextrb $7, %xmm1, %ecx +; AVX512-NEXT: vpextrb $7, %xmm0, %ebx +; AVX512-NEXT: movl %ebx, %eax +; AVX512-NEXT: addb %cl, %al +; AVX512-NEXT: setns %al +; AVX512-NEXT: addb %cl, %bl +; AVX512-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX512-NEXT: jno .LBB2_21 +; AVX512-NEXT: # %bb.20: +; AVX512-NEXT: addb $127, %al +; AVX512-NEXT: movl %eax, %ebx +; AVX512-NEXT: .LBB2_21: +; AVX512-NEXT: vpextrb $6, %xmm1, %ecx +; AVX512-NEXT: vpextrb $6, %xmm0, %esi +; AVX512-NEXT: movl %esi, %eax +; AVX512-NEXT: addb %cl, %al +; AVX512-NEXT: setns %al +; AVX512-NEXT: addb %cl, %sil +; AVX512-NEXT: movl %edx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX512-NEXT: jno .LBB2_23 +; AVX512-NEXT: # %bb.22: +; AVX512-NEXT: addb $127, %al +; AVX512-NEXT: movl %eax, %esi +; AVX512-NEXT: .LBB2_23: +; AVX512-NEXT: vpextrb $5, %xmm1, %ecx +; AVX512-NEXT: vpextrb $5, %xmm0, %r11d +; AVX512-NEXT: movl %r11d, %eax +; AVX512-NEXT: addb %cl, %al +; AVX512-NEXT: setns %al +; AVX512-NEXT: addb %cl, %r11b +; AVX512-NEXT: movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX512-NEXT: movl %ebp, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX512-NEXT: movl %ebx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX512-NEXT: jno .LBB2_25 +; AVX512-NEXT: # %bb.24: +; AVX512-NEXT: addb $127, %al +; AVX512-NEXT: movl %eax, %r11d +; AVX512-NEXT: .LBB2_25: +; AVX512-NEXT: vpextrb $4, %xmm1, %ecx +; AVX512-NEXT: vpextrb $4, %xmm0, %r13d +; AVX512-NEXT: movl %r13d, %eax +; AVX512-NEXT: addb %cl, %al +; AVX512-NEXT: setns %al +; AVX512-NEXT: addb %cl, %r13b +; AVX512-NEXT: jno .LBB2_27 +; AVX512-NEXT: # %bb.26: +; AVX512-NEXT: addb $127, %al +; AVX512-NEXT: movl %eax, %r13d +; AVX512-NEXT: 
.LBB2_27: +; AVX512-NEXT: vpextrb $3, %xmm1, %ecx +; AVX512-NEXT: vpextrb $3, %xmm0, %r8d +; AVX512-NEXT: movl %r8d, %eax +; AVX512-NEXT: addb %cl, %al +; AVX512-NEXT: setns %al +; AVX512-NEXT: addb %cl, %r8b +; AVX512-NEXT: jno .LBB2_29 +; AVX512-NEXT: # %bb.28: +; AVX512-NEXT: addb $127, %al +; AVX512-NEXT: movl %eax, %r8d +; AVX512-NEXT: .LBB2_29: +; AVX512-NEXT: vpextrb $2, %xmm1, %ecx +; AVX512-NEXT: vpextrb $2, %xmm0, %ebp +; AVX512-NEXT: movl %ebp, %eax +; AVX512-NEXT: addb %cl, %al +; AVX512-NEXT: setns %al +; AVX512-NEXT: addb %cl, %bpl +; AVX512-NEXT: jno .LBB2_31 +; AVX512-NEXT: # %bb.30: +; AVX512-NEXT: addb $127, %al +; AVX512-NEXT: movl %eax, %ebp +; AVX512-NEXT: .LBB2_31: +; AVX512-NEXT: vpextrb $0, %xmm1, %ecx +; AVX512-NEXT: vpextrb $0, %xmm0, %ebx +; AVX512-NEXT: movl %ebx, %eax +; AVX512-NEXT: addb %cl, %al +; AVX512-NEXT: setns %al +; AVX512-NEXT: addb %cl, %bl +; AVX512-NEXT: jno .LBB2_33 +; AVX512-NEXT: # %bb.32: +; AVX512-NEXT: addb $127, %al +; AVX512-NEXT: movl %eax, %ebx +; AVX512-NEXT: .LBB2_33: +; AVX512-NEXT: vpextrb $1, %xmm1, %ecx +; AVX512-NEXT: vpextrb $1, %xmm0, %edi +; AVX512-NEXT: movl %edi, %eax +; AVX512-NEXT: addb %cl, %al +; AVX512-NEXT: setns %al +; AVX512-NEXT: addb %cl, %dil +; AVX512-NEXT: jno .LBB2_35 +; AVX512-NEXT: # %bb.34: +; AVX512-NEXT: addb $127, %al +; AVX512-NEXT: movl %eax, %edi +; AVX512-NEXT: .LBB2_35: +; AVX512-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX512-NEXT: vpextrb $15, %xmm2, %ecx +; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm3 +; AVX512-NEXT: vpextrb $15, %xmm3, %edx +; AVX512-NEXT: movl %edx, %eax +; AVX512-NEXT: addb %cl, %al +; AVX512-NEXT: setns %al +; AVX512-NEXT: addb %cl, %dl +; AVX512-NEXT: movl %ebp, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX512-NEXT: movl %ebx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX512-NEXT: movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX512-NEXT: jno .LBB2_37 +; AVX512-NEXT: # %bb.36: +; AVX512-NEXT: addb $127, %al +; AVX512-NEXT: movl %eax, %edx +; AVX512-NEXT: .LBB2_37: +; AVX512-NEXT: vpextrb $14, %xmm2, %ecx +; AVX512-NEXT: vpextrb $14, %xmm3, %edi +; AVX512-NEXT: movl %edi, %eax +; AVX512-NEXT: addb %cl, %al +; AVX512-NEXT: setns %al +; AVX512-NEXT: addb %cl, %dil +; AVX512-NEXT: movl %edx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX512-NEXT: jno .LBB2_39 +; AVX512-NEXT: # %bb.38: +; AVX512-NEXT: addb $127, %al +; AVX512-NEXT: movl %eax, %edi +; AVX512-NEXT: .LBB2_39: +; AVX512-NEXT: vpextrb $13, %xmm2, %ecx +; AVX512-NEXT: vpextrb $13, %xmm3, %r12d +; AVX512-NEXT: movl %r12d, %eax +; AVX512-NEXT: addb %cl, %al +; AVX512-NEXT: setns %al +; AVX512-NEXT: addb %cl, %r12b +; AVX512-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX512-NEXT: jno .LBB2_41 +; AVX512-NEXT: # %bb.40: +; AVX512-NEXT: addb $127, %al +; AVX512-NEXT: movl %eax, %r12d +; AVX512-NEXT: .LBB2_41: +; AVX512-NEXT: vpextrb $12, %xmm2, %ecx +; AVX512-NEXT: vpextrb $12, %xmm3, %r15d +; AVX512-NEXT: movl %r15d, %eax +; AVX512-NEXT: addb %cl, %al +; AVX512-NEXT: setns %al +; AVX512-NEXT: addb %cl, %r15b +; AVX512-NEXT: movl %r11d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX512-NEXT: jno .LBB2_43 +; AVX512-NEXT: # %bb.42: +; AVX512-NEXT: addb $127, %al +; AVX512-NEXT: movl %eax, %r15d +; AVX512-NEXT: .LBB2_43: +; AVX512-NEXT: vpextrb $11, %xmm2, %ecx +; AVX512-NEXT: vpextrb $11, %xmm3, %ebp +; AVX512-NEXT: movl %ebp, %eax +; AVX512-NEXT: addb %cl, %al +; AVX512-NEXT: setns %al +; AVX512-NEXT: addb %cl, %bpl +; AVX512-NEXT: movl %r13d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX512-NEXT: jno 
.LBB2_45 +; AVX512-NEXT: # %bb.44: +; AVX512-NEXT: addb $127, %al +; AVX512-NEXT: movl %eax, %ebp +; AVX512-NEXT: .LBB2_45: +; AVX512-NEXT: vpextrb $10, %xmm2, %ecx +; AVX512-NEXT: vpextrb $10, %xmm3, %ebx +; AVX512-NEXT: movl %ebx, %eax +; AVX512-NEXT: addb %cl, %al +; AVX512-NEXT: setns %al +; AVX512-NEXT: addb %cl, %bl +; AVX512-NEXT: jno .LBB2_47 +; AVX512-NEXT: # %bb.46: +; AVX512-NEXT: addb $127, %al +; AVX512-NEXT: movl %eax, %ebx +; AVX512-NEXT: .LBB2_47: +; AVX512-NEXT: vpextrb $9, %xmm2, %ecx +; AVX512-NEXT: vpextrb $9, %xmm3, %edx +; AVX512-NEXT: movl %edx, %eax +; AVX512-NEXT: addb %cl, %al +; AVX512-NEXT: setns %al +; AVX512-NEXT: addb %cl, %dl +; AVX512-NEXT: jno .LBB2_49 +; AVX512-NEXT: # %bb.48: +; AVX512-NEXT: addb $127, %al +; AVX512-NEXT: movl %eax, %edx +; AVX512-NEXT: .LBB2_49: +; AVX512-NEXT: vpextrb $8, %xmm2, %ecx +; AVX512-NEXT: vpextrb $8, %xmm3, %esi +; AVX512-NEXT: movl %esi, %eax +; AVX512-NEXT: addb %cl, %al +; AVX512-NEXT: setns %al +; AVX512-NEXT: addb %cl, %sil +; AVX512-NEXT: movl %ebp, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX512-NEXT: movl %ebx, (%rsp) # 4-byte Spill +; AVX512-NEXT: movl %edx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX512-NEXT: jo .LBB2_50 +; AVX512-NEXT: # %bb.51: +; AVX512-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX512-NEXT: jmp .LBB2_52 +; AVX512-NEXT: .LBB2_50: +; AVX512-NEXT: addb $127, %al +; AVX512-NEXT: # kill: def $al killed $al def $eax +; AVX512-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX512-NEXT: .LBB2_52: +; AVX512-NEXT: vpextrb $7, %xmm2, %ecx +; AVX512-NEXT: vpextrb $7, %xmm3, %r11d +; AVX512-NEXT: movl %r11d, %eax +; AVX512-NEXT: addb %cl, %al +; AVX512-NEXT: setns %al +; AVX512-NEXT: addb %cl, %r11b +; AVX512-NEXT: jno .LBB2_54 +; AVX512-NEXT: # %bb.53: +; AVX512-NEXT: addb $127, %al +; AVX512-NEXT: movl %eax, %r11d +; AVX512-NEXT: .LBB2_54: +; AVX512-NEXT: vpextrb $6, %xmm2, %ecx +; AVX512-NEXT: vpextrb $6, %xmm3, %esi +; AVX512-NEXT: movl %esi, %eax +; AVX512-NEXT: addb %cl, %al +; AVX512-NEXT: setns %al +; AVX512-NEXT: addb %cl, %sil +; AVX512-NEXT: movl %r8d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX512-NEXT: movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX512-NEXT: jno .LBB2_56 +; AVX512-NEXT: # %bb.55: +; AVX512-NEXT: addb $127, %al +; AVX512-NEXT: movl %eax, %esi +; AVX512-NEXT: .LBB2_56: +; AVX512-NEXT: vpextrb $5, %xmm2, %ecx +; AVX512-NEXT: vpextrb $5, %xmm3, %edi +; AVX512-NEXT: movl %edi, %eax +; AVX512-NEXT: addb %cl, %al +; AVX512-NEXT: setns %al +; AVX512-NEXT: addb %cl, %dil +; AVX512-NEXT: jno .LBB2_58 +; AVX512-NEXT: # %bb.57: +; AVX512-NEXT: addb $127, %al +; AVX512-NEXT: movl %eax, %edi +; AVX512-NEXT: .LBB2_58: +; AVX512-NEXT: vpextrb $4, %xmm2, %ecx +; AVX512-NEXT: vpextrb $4, %xmm3, %r13d +; AVX512-NEXT: movl %r13d, %eax +; AVX512-NEXT: addb %cl, %al +; AVX512-NEXT: setns %al +; AVX512-NEXT: addb %cl, %r13b +; AVX512-NEXT: movl %r13d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX512-NEXT: jno .LBB2_60 +; AVX512-NEXT: # %bb.59: +; AVX512-NEXT: addb $127, %al +; AVX512-NEXT: # kill: def $al killed $al def $eax +; AVX512-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX512-NEXT: .LBB2_60: +; AVX512-NEXT: vpextrb $3, %xmm2, %ecx +; AVX512-NEXT: vpextrb $3, %xmm3, %ebp +; AVX512-NEXT: movl %ebp, %eax +; AVX512-NEXT: addb %cl, %al +; AVX512-NEXT: setns %al +; AVX512-NEXT: addb %cl, %bpl +; AVX512-NEXT: jo .LBB2_61 +; AVX512-NEXT: # %bb.62: +; AVX512-NEXT: movl %ebp, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX512-NEXT: jmp 
.LBB2_63 +; AVX512-NEXT: .LBB2_61: +; AVX512-NEXT: addb $127, %al +; AVX512-NEXT: # kill: def $al killed $al def $eax +; AVX512-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX512-NEXT: .LBB2_63: +; AVX512-NEXT: vpextrb $2, %xmm2, %ecx +; AVX512-NEXT: vpextrb $2, %xmm3, %ebp +; AVX512-NEXT: movl %ebp, %eax +; AVX512-NEXT: addb %cl, %al +; AVX512-NEXT: setns %al +; AVX512-NEXT: addb %cl, %bpl +; AVX512-NEXT: movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX512-NEXT: jno .LBB2_65 +; AVX512-NEXT: # %bb.64: +; AVX512-NEXT: addb $127, %al +; AVX512-NEXT: movl %eax, %ebp +; AVX512-NEXT: .LBB2_65: +; AVX512-NEXT: vpextrb $0, %xmm2, %ecx +; AVX512-NEXT: vpextrb $0, %xmm3, %edi +; AVX512-NEXT: movl %edi, %eax +; AVX512-NEXT: addb %cl, %al +; AVX512-NEXT: setns %al +; AVX512-NEXT: addb %cl, %dil +; AVX512-NEXT: movl %r15d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX512-NEXT: jno .LBB2_67 +; AVX512-NEXT: # %bb.66: +; AVX512-NEXT: addb $127, %al +; AVX512-NEXT: movl %eax, %edi +; AVX512-NEXT: .LBB2_67: +; AVX512-NEXT: vpextrb $1, %xmm2, %ecx +; AVX512-NEXT: vpextrb $1, %xmm3, %ebx +; AVX512-NEXT: movl %ebx, %eax +; AVX512-NEXT: addb %cl, %al +; AVX512-NEXT: setns %al +; AVX512-NEXT: addb %cl, %bl +; AVX512-NEXT: movl %r12d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX512-NEXT: jno .LBB2_69 +; AVX512-NEXT: # %bb.68: +; AVX512-NEXT: addb $127, %al +; AVX512-NEXT: movl %eax, %ebx +; AVX512-NEXT: .LBB2_69: +; AVX512-NEXT: vextracti32x4 $2, %zmm1, %xmm2 +; AVX512-NEXT: vpextrb $15, %xmm2, %ecx +; AVX512-NEXT: vextracti32x4 $2, %zmm0, %xmm3 +; AVX512-NEXT: vpextrb $15, %xmm3, %edx +; AVX512-NEXT: movl %edx, %eax +; AVX512-NEXT: addb %cl, %al +; AVX512-NEXT: setns %al +; AVX512-NEXT: addb %cl, %dl +; AVX512-NEXT: movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX512-NEXT: jno .LBB2_71 +; AVX512-NEXT: # %bb.70: +; AVX512-NEXT: addb $127, %al +; AVX512-NEXT: movl %eax, %edx +; AVX512-NEXT: .LBB2_71: +; AVX512-NEXT: vpextrb $14, %xmm2, %ecx +; AVX512-NEXT: vpextrb $14, %xmm3, %edi +; AVX512-NEXT: movl %edi, %eax +; AVX512-NEXT: addb %cl, %al +; AVX512-NEXT: setns %al +; AVX512-NEXT: addb %cl, %dil +; AVX512-NEXT: jno .LBB2_73 +; AVX512-NEXT: # %bb.72: +; AVX512-NEXT: addb $127, %al +; AVX512-NEXT: movl %eax, %edi +; AVX512-NEXT: .LBB2_73: +; AVX512-NEXT: vpextrb $13, %xmm2, %ecx +; AVX512-NEXT: vpextrb $13, %xmm3, %r10d +; AVX512-NEXT: movl %r10d, %eax +; AVX512-NEXT: addb %cl, %al +; AVX512-NEXT: setns %al +; AVX512-NEXT: addb %cl, %r10b +; AVX512-NEXT: jno .LBB2_75 +; AVX512-NEXT: # %bb.74: +; AVX512-NEXT: addb $127, %al +; AVX512-NEXT: movl %eax, %r10d +; AVX512-NEXT: .LBB2_75: +; AVX512-NEXT: vpextrb $12, %xmm2, %ecx +; AVX512-NEXT: vpextrb $12, %xmm3, %r12d +; AVX512-NEXT: movl %r12d, %eax +; AVX512-NEXT: addb %cl, %al +; AVX512-NEXT: setns %al +; AVX512-NEXT: addb %cl, %r12b +; AVX512-NEXT: movl %r12d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX512-NEXT: jno .LBB2_77 +; AVX512-NEXT: # %bb.76: +; AVX512-NEXT: addb $127, %al +; AVX512-NEXT: # kill: def $al killed $al def $eax +; AVX512-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX512-NEXT: .LBB2_77: +; AVX512-NEXT: vpextrb $11, %xmm2, %ecx +; AVX512-NEXT: vpextrb $11, %xmm3, %r14d +; AVX512-NEXT: movl %r14d, %eax +; AVX512-NEXT: addb %cl, %al +; AVX512-NEXT: setns %al +; AVX512-NEXT: addb %cl, %r14b +; AVX512-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX512-NEXT: jno .LBB2_79 +; AVX512-NEXT: # %bb.78: +; AVX512-NEXT: addb $127, %al +; AVX512-NEXT: movl %eax, %r14d +; AVX512-NEXT: 
.LBB2_79: +; AVX512-NEXT: vpextrb $10, %xmm2, %ecx +; AVX512-NEXT: vpextrb $10, %xmm3, %r13d +; AVX512-NEXT: movl %r13d, %eax +; AVX512-NEXT: addb %cl, %al +; AVX512-NEXT: setns %al +; AVX512-NEXT: addb %cl, %r13b +; AVX512-NEXT: movl %r11d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX512-NEXT: movl %ebp, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX512-NEXT: movl %ebx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX512-NEXT: movl %edx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX512-NEXT: jno .LBB2_81 +; AVX512-NEXT: # %bb.80: +; AVX512-NEXT: addb $127, %al +; AVX512-NEXT: movl %eax, %r13d +; AVX512-NEXT: .LBB2_81: +; AVX512-NEXT: vpextrb $9, %xmm2, %ecx +; AVX512-NEXT: vpextrb $9, %xmm3, %r8d +; AVX512-NEXT: movl %r8d, %eax +; AVX512-NEXT: addb %cl, %al +; AVX512-NEXT: setns %al +; AVX512-NEXT: addb %cl, %r8b +; AVX512-NEXT: jno .LBB2_83 +; AVX512-NEXT: # %bb.82: +; AVX512-NEXT: addb $127, %al +; AVX512-NEXT: movl %eax, %r8d +; AVX512-NEXT: .LBB2_83: +; AVX512-NEXT: vpextrb $8, %xmm2, %ecx +; AVX512-NEXT: vpextrb $8, %xmm3, %r15d +; AVX512-NEXT: movl %r15d, %eax +; AVX512-NEXT: addb %cl, %al +; AVX512-NEXT: setns %al +; AVX512-NEXT: addb %cl, %r15b +; AVX512-NEXT: jno .LBB2_85 +; AVX512-NEXT: # %bb.84: +; AVX512-NEXT: addb $127, %al +; AVX512-NEXT: movl %eax, %r15d +; AVX512-NEXT: .LBB2_85: +; AVX512-NEXT: vpextrb $7, %xmm2, %ecx +; AVX512-NEXT: vpextrb $7, %xmm3, %r12d +; AVX512-NEXT: movl %r12d, %eax +; AVX512-NEXT: addb %cl, %al +; AVX512-NEXT: setns %al +; AVX512-NEXT: addb %cl, %r12b +; AVX512-NEXT: movl %r12d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX512-NEXT: jno .LBB2_87 +; AVX512-NEXT: # %bb.86: +; AVX512-NEXT: addb $127, %al +; AVX512-NEXT: # kill: def $al killed $al def $eax +; AVX512-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX512-NEXT: .LBB2_87: +; AVX512-NEXT: vpextrb $6, %xmm2, %ecx +; AVX512-NEXT: vpextrb $6, %xmm3, %r12d +; AVX512-NEXT: movl %r12d, %eax +; AVX512-NEXT: addb %cl, %al +; AVX512-NEXT: setns %al +; AVX512-NEXT: addb %cl, %r12b +; AVX512-NEXT: jno .LBB2_89 +; AVX512-NEXT: # %bb.88: +; AVX512-NEXT: addb $127, %al +; AVX512-NEXT: movl %eax, %r12d +; AVX512-NEXT: .LBB2_89: +; AVX512-NEXT: vpextrb $5, %xmm2, %ecx +; AVX512-NEXT: vpextrb $5, %xmm3, %ebp +; AVX512-NEXT: movl %ebp, %eax +; AVX512-NEXT: addb %cl, %al +; AVX512-NEXT: setns %al +; AVX512-NEXT: addb %cl, %bpl +; AVX512-NEXT: jno .LBB2_91 +; AVX512-NEXT: # %bb.90: +; AVX512-NEXT: addb $127, %al +; AVX512-NEXT: movl %eax, %ebp +; AVX512-NEXT: .LBB2_91: +; AVX512-NEXT: vpextrb $4, %xmm2, %ecx +; AVX512-NEXT: vpextrb $4, %xmm3, %esi +; AVX512-NEXT: movl %esi, %eax +; AVX512-NEXT: addb %cl, %al +; AVX512-NEXT: setns %al +; AVX512-NEXT: addb %cl, %sil +; AVX512-NEXT: jno .LBB2_93 +; AVX512-NEXT: # %bb.92: +; AVX512-NEXT: addb $127, %al +; AVX512-NEXT: movl %eax, %esi +; AVX512-NEXT: .LBB2_93: +; AVX512-NEXT: vpextrb $3, %xmm2, %ecx +; AVX512-NEXT: vpextrb $3, %xmm3, %edx +; AVX512-NEXT: movl %edx, %eax +; AVX512-NEXT: addb %cl, %al +; AVX512-NEXT: setns %al +; AVX512-NEXT: addb %cl, %dl +; AVX512-NEXT: movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX512-NEXT: jno .LBB2_95 +; AVX512-NEXT: # %bb.94: +; AVX512-NEXT: addb $127, %al +; AVX512-NEXT: movl %eax, %edx +; AVX512-NEXT: .LBB2_95: +; AVX512-NEXT: vpextrb $2, %xmm2, %ecx +; AVX512-NEXT: vpextrb $2, %xmm3, %edi +; AVX512-NEXT: movl %edi, %eax +; AVX512-NEXT: addb %cl, %al +; AVX512-NEXT: setns %al +; AVX512-NEXT: addb %cl, %dil +; AVX512-NEXT: jno .LBB2_97 +; AVX512-NEXT: # %bb.96: +; AVX512-NEXT: addb $127, %al +; 
AVX512-NEXT: movl %eax, %edi +; AVX512-NEXT: .LBB2_97: +; AVX512-NEXT: vpextrb $0, %xmm2, %ecx +; AVX512-NEXT: vpextrb $0, %xmm3, %ebx +; AVX512-NEXT: movl %ebx, %eax +; AVX512-NEXT: addb %cl, %al +; AVX512-NEXT: setns %al +; AVX512-NEXT: addb %cl, %bl +; AVX512-NEXT: jno .LBB2_99 +; AVX512-NEXT: # %bb.98: +; AVX512-NEXT: addb $127, %al +; AVX512-NEXT: movl %eax, %ebx +; AVX512-NEXT: .LBB2_99: +; AVX512-NEXT: vpextrb $1, %xmm2, %ecx +; AVX512-NEXT: vpextrb $1, %xmm3, %r11d +; AVX512-NEXT: movl %r11d, %eax +; AVX512-NEXT: addb %cl, %al +; AVX512-NEXT: setns %al +; AVX512-NEXT: addb %cl, %r11b +; AVX512-NEXT: movl %r11d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX512-NEXT: jno .LBB2_101 +; AVX512-NEXT: # %bb.100: +; AVX512-NEXT: addb $127, %al +; AVX512-NEXT: # kill: def $al killed $al def $eax +; AVX512-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX512-NEXT: .LBB2_101: +; AVX512-NEXT: vextracti32x4 $3, %zmm1, %xmm1 +; AVX512-NEXT: vpextrb $15, %xmm1, %ecx +; AVX512-NEXT: vextracti32x4 $3, %zmm0, %xmm0 +; AVX512-NEXT: vpextrb $15, %xmm0, %r11d +; AVX512-NEXT: movl %r11d, %eax +; AVX512-NEXT: addb %cl, %al +; AVX512-NEXT: setns %al +; AVX512-NEXT: addb %cl, %r11b +; AVX512-NEXT: jno .LBB2_103 +; AVX512-NEXT: # %bb.102: +; AVX512-NEXT: addb $127, %al +; AVX512-NEXT: movl %eax, %r11d +; AVX512-NEXT: .LBB2_103: +; AVX512-NEXT: vpextrb $14, %xmm1, %ecx +; AVX512-NEXT: vpextrb $14, %xmm0, %r9d +; AVX512-NEXT: movl %r9d, %eax +; AVX512-NEXT: addb %cl, %al +; AVX512-NEXT: setns %al +; AVX512-NEXT: addb %cl, %r9b +; AVX512-NEXT: movl %r9d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX512-NEXT: jno .LBB2_105 +; AVX512-NEXT: # %bb.104: +; AVX512-NEXT: addb $127, %al +; AVX512-NEXT: # kill: def $al killed $al def $eax +; AVX512-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX512-NEXT: .LBB2_105: +; AVX512-NEXT: vpextrb $13, %xmm1, %ecx +; AVX512-NEXT: vpextrb $13, %xmm0, %r9d +; AVX512-NEXT: movl %r9d, %eax +; AVX512-NEXT: addb %cl, %al +; AVX512-NEXT: setns %al +; AVX512-NEXT: addb %cl, %r9b +; AVX512-NEXT: movl %r14d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX512-NEXT: movl %r13d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX512-NEXT: movl %r8d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX512-NEXT: movl %r15d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX512-NEXT: movl %r12d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX512-NEXT: movl %ebp, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX512-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX512-NEXT: movl %edx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX512-NEXT: movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX512-NEXT: movl %ebx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX512-NEXT: jno .LBB2_107 +; AVX512-NEXT: # %bb.106: +; AVX512-NEXT: addb $127, %al +; AVX512-NEXT: movl %eax, %r9d +; AVX512-NEXT: .LBB2_107: +; AVX512-NEXT: vpextrb $12, %xmm1, %ecx +; AVX512-NEXT: vpextrb $12, %xmm0, %edx +; AVX512-NEXT: movl %edx, %eax +; AVX512-NEXT: addb %cl, %al +; AVX512-NEXT: setns %al +; AVX512-NEXT: addb %cl, %dl +; AVX512-NEXT: movl %r10d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX512-NEXT: jno .LBB2_109 +; AVX512-NEXT: # %bb.108: +; AVX512-NEXT: addb $127, %al +; AVX512-NEXT: movl %eax, %edx +; AVX512-NEXT: .LBB2_109: +; AVX512-NEXT: vpextrb $11, %xmm1, %ecx +; AVX512-NEXT: vpextrb $11, %xmm0, %r13d +; AVX512-NEXT: movl %r13d, %eax +; AVX512-NEXT: addb %cl, %al +; AVX512-NEXT: setns %al +; AVX512-NEXT: addb %cl, %r13b +; AVX512-NEXT: jno .LBB2_111 +; AVX512-NEXT: # %bb.110: +; 
AVX512-NEXT: addb $127, %al +; AVX512-NEXT: movl %eax, %r13d +; AVX512-NEXT: .LBB2_111: +; AVX512-NEXT: vpextrb $10, %xmm1, %ecx +; AVX512-NEXT: vpextrb $10, %xmm0, %r15d +; AVX512-NEXT: movl %r15d, %eax +; AVX512-NEXT: addb %cl, %al +; AVX512-NEXT: setns %al +; AVX512-NEXT: addb %cl, %r15b +; AVX512-NEXT: jno .LBB2_113 +; AVX512-NEXT: # %bb.112: +; AVX512-NEXT: addb $127, %al +; AVX512-NEXT: movl %eax, %r15d +; AVX512-NEXT: .LBB2_113: +; AVX512-NEXT: vpextrb $9, %xmm1, %ecx +; AVX512-NEXT: vpextrb $9, %xmm0, %r14d +; AVX512-NEXT: movl %r14d, %eax +; AVX512-NEXT: addb %cl, %al +; AVX512-NEXT: setns %al +; AVX512-NEXT: addb %cl, %r14b +; AVX512-NEXT: jno .LBB2_115 +; AVX512-NEXT: # %bb.114: +; AVX512-NEXT: addb $127, %al +; AVX512-NEXT: movl %eax, %r14d +; AVX512-NEXT: .LBB2_115: +; AVX512-NEXT: vpextrb $8, %xmm1, %ecx +; AVX512-NEXT: vpextrb $8, %xmm0, %ebp +; AVX512-NEXT: movl %ebp, %eax +; AVX512-NEXT: addb %cl, %al +; AVX512-NEXT: setns %al +; AVX512-NEXT: addb %cl, %bpl +; AVX512-NEXT: jno .LBB2_117 +; AVX512-NEXT: # %bb.116: +; AVX512-NEXT: addb $127, %al +; AVX512-NEXT: movl %eax, %ebp +; AVX512-NEXT: .LBB2_117: +; AVX512-NEXT: vpextrb $7, %xmm1, %ecx +; AVX512-NEXT: vpextrb $7, %xmm0, %edi +; AVX512-NEXT: movl %edi, %eax +; AVX512-NEXT: addb %cl, %al +; AVX512-NEXT: setns %al +; AVX512-NEXT: addb %cl, %dil +; AVX512-NEXT: movl %edx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX512-NEXT: jno .LBB2_119 +; AVX512-NEXT: # %bb.118: +; AVX512-NEXT: addb $127, %al +; AVX512-NEXT: movl %eax, %edi +; AVX512-NEXT: .LBB2_119: +; AVX512-NEXT: vpextrb $6, %xmm1, %edx +; AVX512-NEXT: vpextrb $6, %xmm0, %eax +; AVX512-NEXT: movl %eax, %ecx +; AVX512-NEXT: addb %dl, %cl +; AVX512-NEXT: setns %cl +; AVX512-NEXT: addb %dl, %al +; AVX512-NEXT: jno .LBB2_121 +; AVX512-NEXT: # %bb.120: +; AVX512-NEXT: addb $127, %cl +; AVX512-NEXT: movl %ecx, %eax +; AVX512-NEXT: .LBB2_121: +; AVX512-NEXT: vpextrb $5, %xmm1, %ebx +; AVX512-NEXT: vpextrb $5, %xmm0, %ecx +; AVX512-NEXT: movl %ecx, %edx +; AVX512-NEXT: addb %bl, %dl +; AVX512-NEXT: setns %dl +; AVX512-NEXT: addb %bl, %cl +; AVX512-NEXT: jno .LBB2_123 +; AVX512-NEXT: # %bb.122: +; AVX512-NEXT: addb $127, %dl +; AVX512-NEXT: movl %edx, %ecx +; AVX512-NEXT: .LBB2_123: +; AVX512-NEXT: vpextrb $4, %xmm1, %esi +; AVX512-NEXT: vpextrb $4, %xmm0, %edx +; AVX512-NEXT: movl %edx, %ebx +; AVX512-NEXT: addb %sil, %bl +; AVX512-NEXT: setns %bl +; AVX512-NEXT: addb %sil, %dl +; AVX512-NEXT: jno .LBB2_125 +; AVX512-NEXT: # %bb.124: +; AVX512-NEXT: addb $127, %bl +; AVX512-NEXT: movl %ebx, %edx +; AVX512-NEXT: .LBB2_125: +; AVX512-NEXT: vpextrb $3, %xmm1, %esi +; AVX512-NEXT: vpextrb $3, %xmm0, %r8d +; AVX512-NEXT: movl %r8d, %ebx +; AVX512-NEXT: addb %sil, %bl +; AVX512-NEXT: setns %bl +; AVX512-NEXT: addb %sil, %r8b +; AVX512-NEXT: movl %r9d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX512-NEXT: jno .LBB2_127 +; AVX512-NEXT: # %bb.126: +; AVX512-NEXT: addb $127, %bl +; AVX512-NEXT: movl %ebx, %r8d +; AVX512-NEXT: .LBB2_127: +; AVX512-NEXT: vpextrb $2, %xmm1, %esi +; AVX512-NEXT: vpextrb $2, %xmm0, %r9d +; AVX512-NEXT: movl %r9d, %ebx +; AVX512-NEXT: addb %sil, %bl +; AVX512-NEXT: setns %bl +; AVX512-NEXT: addb %sil, %r9b +; AVX512-NEXT: jno .LBB2_129 +; AVX512-NEXT: # %bb.128: +; AVX512-NEXT: addb $127, %bl +; AVX512-NEXT: movl %ebx, %r9d +; AVX512-NEXT: .LBB2_129: +; AVX512-NEXT: vpextrb $0, %xmm1, %esi +; AVX512-NEXT: vpextrb $0, %xmm0, %r10d +; AVX512-NEXT: movl %r10d, %ebx +; AVX512-NEXT: addb %sil, %bl +; AVX512-NEXT: setns %bl +; AVX512-NEXT: movl %r11d, %r12d 
+; AVX512-NEXT: addb %sil, %r10b +; AVX512-NEXT: jno .LBB2_131 +; AVX512-NEXT: # %bb.130: +; AVX512-NEXT: addb $127, %bl +; AVX512-NEXT: movl %ebx, %r10d +; AVX512-NEXT: .LBB2_131: +; AVX512-NEXT: vpextrb $1, %xmm1, %esi +; AVX512-NEXT: vpextrb $1, %xmm0, %r11d +; AVX512-NEXT: movl %r11d, %ebx +; AVX512-NEXT: addb %sil, %bl +; AVX512-NEXT: setns %bl +; AVX512-NEXT: addb %sil, %r11b +; AVX512-NEXT: jno .LBB2_133 +; AVX512-NEXT: # %bb.132: +; AVX512-NEXT: addb $127, %bl +; AVX512-NEXT: movl %ebx, %r11d +; AVX512-NEXT: .LBB2_133: +; AVX512-NEXT: movzbl %r10b, %esi +; AVX512-NEXT: vmovd %esi, %xmm0 +; AVX512-NEXT: movzbl %r11b, %esi +; AVX512-NEXT: vpinsrb $1, %esi, %xmm0, %xmm0 +; AVX512-NEXT: movzbl %r9b, %esi +; AVX512-NEXT: vpinsrb $2, %esi, %xmm0, %xmm0 +; AVX512-NEXT: movzbl %r8b, %esi +; AVX512-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; AVX512-NEXT: movzbl %dl, %edx +; AVX512-NEXT: vpinsrb $4, %edx, %xmm0, %xmm0 +; AVX512-NEXT: movzbl %cl, %ecx +; AVX512-NEXT: vpinsrb $5, %ecx, %xmm0, %xmm0 +; AVX512-NEXT: movzbl %al, %eax +; AVX512-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 +; AVX512-NEXT: movzbl %dil, %eax +; AVX512-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; AVX512-NEXT: movzbl %bpl, %eax +; AVX512-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0 +; AVX512-NEXT: movzbl %r14b, %eax +; AVX512-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0 +; AVX512-NEXT: movzbl %r15b, %eax +; AVX512-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0 +; AVX512-NEXT: movzbl %r13b, %eax +; AVX512-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; AVX512-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX512-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0 +; AVX512-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX512-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0 +; AVX512-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX512-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0 +; AVX512-NEXT: movzbl %r12b, %eax +; AVX512-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; AVX512-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX512-NEXT: vmovd %eax, %xmm1 +; AVX512-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX512-NEXT: vpinsrb $1, %eax, %xmm1, %xmm1 +; AVX512-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX512-NEXT: vpinsrb $2, %eax, %xmm1, %xmm1 +; AVX512-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX512-NEXT: vpinsrb $3, %eax, %xmm1, %xmm1 +; AVX512-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX512-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 +; AVX512-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX512-NEXT: vpinsrb $5, %eax, %xmm1, %xmm1 +; AVX512-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX512-NEXT: vpinsrb $6, %eax, %xmm1, %xmm1 +; AVX512-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX512-NEXT: vpinsrb $7, %eax, %xmm1, %xmm1 +; AVX512-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX512-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; AVX512-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX512-NEXT: vpinsrb $9, %eax, %xmm1, %xmm1 +; AVX512-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX512-NEXT: vpinsrb $10, %eax, %xmm1, %xmm1 +; AVX512-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX512-NEXT: vpinsrb $11, %eax, %xmm1, %xmm1 +; AVX512-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX512-NEXT: vpinsrb $12, %eax, %xmm1, 
%xmm1 +; AVX512-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX512-NEXT: vpinsrb $13, %eax, %xmm1, %xmm1 +; AVX512-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX512-NEXT: vpinsrb $14, %eax, %xmm1, %xmm1 +; AVX512-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX512-NEXT: vpinsrb $15, %eax, %xmm1, %xmm1 +; AVX512-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX512-NEXT: vmovd %eax, %xmm2 +; AVX512-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX512-NEXT: vpinsrb $1, %eax, %xmm2, %xmm2 +; AVX512-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX512-NEXT: vpinsrb $2, %eax, %xmm2, %xmm2 +; AVX512-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX512-NEXT: vpinsrb $3, %eax, %xmm2, %xmm2 +; AVX512-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX512-NEXT: vpinsrb $4, %eax, %xmm2, %xmm2 +; AVX512-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX512-NEXT: vpinsrb $5, %eax, %xmm2, %xmm2 +; AVX512-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX512-NEXT: vpinsrb $6, %eax, %xmm2, %xmm2 +; AVX512-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX512-NEXT: vpinsrb $7, %eax, %xmm2, %xmm2 +; AVX512-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX512-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2 +; AVX512-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX512-NEXT: vpinsrb $9, %eax, %xmm2, %xmm2 +; AVX512-NEXT: movzbl (%rsp), %eax # 1-byte Folded Reload +; AVX512-NEXT: vpinsrb $10, %eax, %xmm2, %xmm2 +; AVX512-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX512-NEXT: vpinsrb $11, %eax, %xmm2, %xmm2 +; AVX512-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX512-NEXT: vpinsrb $12, %eax, %xmm2, %xmm2 +; AVX512-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX512-NEXT: vpinsrb $13, %eax, %xmm2, %xmm2 +; AVX512-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX512-NEXT: vpinsrb $14, %eax, %xmm2, %xmm2 +; AVX512-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX512-NEXT: vpinsrb $15, %eax, %xmm2, %xmm2 +; AVX512-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX512-NEXT: vmovd %eax, %xmm3 +; AVX512-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX512-NEXT: vpinsrb $1, %eax, %xmm3, %xmm3 +; AVX512-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX512-NEXT: vpinsrb $2, %eax, %xmm3, %xmm3 +; AVX512-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX512-NEXT: vpinsrb $3, %eax, %xmm3, %xmm3 +; AVX512-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX512-NEXT: vpinsrb $4, %eax, %xmm3, %xmm3 +; AVX512-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX512-NEXT: vpinsrb $5, %eax, %xmm3, %xmm3 +; AVX512-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX512-NEXT: vpinsrb $6, %eax, %xmm3, %xmm3 +; AVX512-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX512-NEXT: vpinsrb $7, %eax, %xmm3, %xmm3 +; AVX512-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX512-NEXT: vpinsrb $8, %eax, %xmm3, %xmm3 +; AVX512-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX512-NEXT: vpinsrb $9, %eax, 
%xmm3, %xmm3 +; AVX512-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX512-NEXT: vpinsrb $10, %eax, %xmm3, %xmm3 +; AVX512-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX512-NEXT: vpinsrb $11, %eax, %xmm3, %xmm3 +; AVX512-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX512-NEXT: vpinsrb $12, %eax, %xmm3, %xmm3 +; AVX512-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX512-NEXT: vpinsrb $13, %eax, %xmm3, %xmm3 +; AVX512-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX512-NEXT: vpinsrb $14, %eax, %xmm3, %xmm3 +; AVX512-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX512-NEXT: vpinsrb $15, %eax, %xmm3, %xmm3 +; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 +; AVX512-NEXT: vinserti128 $1, %xmm2, %ymm3, %ymm1 +; AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; AVX512-NEXT: addq $76, %rsp +; AVX512-NEXT: popq %rbx +; AVX512-NEXT: popq %r12 +; AVX512-NEXT: popq %r13 +; AVX512-NEXT: popq %r14 +; AVX512-NEXT: popq %r15 +; AVX512-NEXT: popq %rbp +; AVX512-NEXT: retq + %z = call <64 x i8> @llvm.sadd.sat.v64i8(<64 x i8> %x, <64 x i8> %y) + ret <64 x i8> %z +} + +define <8 x i16> @v8i16(<8 x i16> %x, <8 x i16> %y) nounwind { +; SSE2-LABEL: v8i16: +; SSE2: # %bb.0: +; SSE2-NEXT: pushq %rbp +; SSE2-NEXT: pushq %rbx +; SSE2-NEXT: movd %xmm1, %eax +; SSE2-NEXT: movd %xmm0, %r8d +; SSE2-NEXT: xorl %ecx, %ecx +; SSE2-NEXT: movl %r8d, %edx +; SSE2-NEXT: addw %ax, %dx +; SSE2-NEXT: setns %cl +; SSE2-NEXT: addl $32767, %ecx # imm = 0x7FFF +; SSE2-NEXT: addw %ax, %r8w +; SSE2-NEXT: cmovol %ecx, %r8d +; SSE2-NEXT: pextrw $1, %xmm1, %eax +; SSE2-NEXT: pextrw $1, %xmm0, %r9d +; SSE2-NEXT: xorl %ecx, %ecx +; SSE2-NEXT: movl %r9d, %edx +; SSE2-NEXT: addw %ax, %dx +; SSE2-NEXT: setns %cl +; SSE2-NEXT: addl $32767, %ecx # imm = 0x7FFF +; SSE2-NEXT: addw %ax, %r9w +; SSE2-NEXT: cmovol %ecx, %r9d +; SSE2-NEXT: pextrw $2, %xmm1, %eax +; SSE2-NEXT: pextrw $2, %xmm0, %r10d +; SSE2-NEXT: xorl %ecx, %ecx +; SSE2-NEXT: movl %r10d, %esi +; SSE2-NEXT: addw %ax, %si +; SSE2-NEXT: setns %cl +; SSE2-NEXT: addl $32767, %ecx # imm = 0x7FFF +; SSE2-NEXT: addw %ax, %r10w +; SSE2-NEXT: cmovol %ecx, %r10d +; SSE2-NEXT: pextrw $3, %xmm1, %eax +; SSE2-NEXT: pextrw $3, %xmm0, %r11d +; SSE2-NEXT: xorl %ecx, %ecx +; SSE2-NEXT: movl %r11d, %edi +; SSE2-NEXT: addw %ax, %di +; SSE2-NEXT: setns %cl +; SSE2-NEXT: addl $32767, %ecx # imm = 0x7FFF +; SSE2-NEXT: addw %ax, %r11w +; SSE2-NEXT: cmovol %ecx, %r11d +; SSE2-NEXT: pextrw $4, %xmm1, %eax +; SSE2-NEXT: pextrw $4, %xmm0, %edi +; SSE2-NEXT: xorl %ecx, %ecx +; SSE2-NEXT: movl %edi, %edx +; SSE2-NEXT: addw %ax, %dx +; SSE2-NEXT: setns %cl +; SSE2-NEXT: addl $32767, %ecx # imm = 0x7FFF +; SSE2-NEXT: addw %ax, %di +; SSE2-NEXT: cmovol %ecx, %edi +; SSE2-NEXT: pextrw $5, %xmm1, %ecx +; SSE2-NEXT: pextrw $5, %xmm0, %eax +; SSE2-NEXT: xorl %edx, %edx +; SSE2-NEXT: movl %eax, %esi +; SSE2-NEXT: addw %cx, %si +; SSE2-NEXT: setns %dl +; SSE2-NEXT: addl $32767, %edx # imm = 0x7FFF +; SSE2-NEXT: addw %cx, %ax +; SSE2-NEXT: cmovol %edx, %eax +; SSE2-NEXT: pextrw $6, %xmm1, %edx +; SSE2-NEXT: pextrw $6, %xmm0, %ecx +; SSE2-NEXT: xorl %esi, %esi +; SSE2-NEXT: movl %ecx, %ebx +; SSE2-NEXT: addw %dx, %bx +; SSE2-NEXT: setns %sil +; SSE2-NEXT: addl $32767, %esi # imm = 0x7FFF +; SSE2-NEXT: addw %dx, %cx +; SSE2-NEXT: cmovol %esi, %ecx +; SSE2-NEXT: pextrw $7, %xmm1, %edx +; SSE2-NEXT: pextrw $7, %xmm0, %esi +; SSE2-NEXT: xorl %ebx, %ebx +; SSE2-NEXT: movl 
%esi, %ebp +; SSE2-NEXT: addw %dx, %bp +; SSE2-NEXT: setns %bl +; SSE2-NEXT: addl $32767, %ebx # imm = 0x7FFF +; SSE2-NEXT: addw %dx, %si +; SSE2-NEXT: cmovol %ebx, %esi +; SSE2-NEXT: movd %esi, %xmm0 +; SSE2-NEXT: movd %ecx, %xmm1 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; SSE2-NEXT: movd %eax, %xmm0 +; SSE2-NEXT: movd %edi, %xmm2 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; SSE2-NEXT: movd %r11d, %xmm0 +; SSE2-NEXT: movd %r10d, %xmm1 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; SSE2-NEXT: movd %r9d, %xmm3 +; SSE2-NEXT: movd %r8d, %xmm0 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; SSE2-NEXT: popq %rbx +; SSE2-NEXT: popq %rbp +; SSE2-NEXT: retq +; +; SSSE3-LABEL: v8i16: +; SSSE3: # %bb.0: +; SSSE3-NEXT: pushq %rbp +; SSSE3-NEXT: pushq %rbx +; SSSE3-NEXT: movd %xmm1, %eax +; SSSE3-NEXT: movd %xmm0, %r8d +; SSSE3-NEXT: xorl %ecx, %ecx +; SSSE3-NEXT: movl %r8d, %edx +; SSSE3-NEXT: addw %ax, %dx +; SSSE3-NEXT: setns %cl +; SSSE3-NEXT: addl $32767, %ecx # imm = 0x7FFF +; SSSE3-NEXT: addw %ax, %r8w +; SSSE3-NEXT: cmovol %ecx, %r8d +; SSSE3-NEXT: pextrw $1, %xmm1, %eax +; SSSE3-NEXT: pextrw $1, %xmm0, %r9d +; SSSE3-NEXT: xorl %ecx, %ecx +; SSSE3-NEXT: movl %r9d, %edx +; SSSE3-NEXT: addw %ax, %dx +; SSSE3-NEXT: setns %cl +; SSSE3-NEXT: addl $32767, %ecx # imm = 0x7FFF +; SSSE3-NEXT: addw %ax, %r9w +; SSSE3-NEXT: cmovol %ecx, %r9d +; SSSE3-NEXT: pextrw $2, %xmm1, %eax +; SSSE3-NEXT: pextrw $2, %xmm0, %r10d +; SSSE3-NEXT: xorl %ecx, %ecx +; SSSE3-NEXT: movl %r10d, %esi +; SSSE3-NEXT: addw %ax, %si +; SSSE3-NEXT: setns %cl +; SSSE3-NEXT: addl $32767, %ecx # imm = 0x7FFF +; SSSE3-NEXT: addw %ax, %r10w +; SSSE3-NEXT: cmovol %ecx, %r10d +; SSSE3-NEXT: pextrw $3, %xmm1, %eax +; SSSE3-NEXT: pextrw $3, %xmm0, %r11d +; SSSE3-NEXT: xorl %ecx, %ecx +; SSSE3-NEXT: movl %r11d, %edi +; SSSE3-NEXT: addw %ax, %di +; SSSE3-NEXT: setns %cl +; SSSE3-NEXT: addl $32767, %ecx # imm = 0x7FFF +; SSSE3-NEXT: addw %ax, %r11w +; SSSE3-NEXT: cmovol %ecx, %r11d +; SSSE3-NEXT: pextrw $4, %xmm1, %eax +; SSSE3-NEXT: pextrw $4, %xmm0, %edi +; SSSE3-NEXT: xorl %ecx, %ecx +; SSSE3-NEXT: movl %edi, %edx +; SSSE3-NEXT: addw %ax, %dx +; SSSE3-NEXT: setns %cl +; SSSE3-NEXT: addl $32767, %ecx # imm = 0x7FFF +; SSSE3-NEXT: addw %ax, %di +; SSSE3-NEXT: cmovol %ecx, %edi +; SSSE3-NEXT: pextrw $5, %xmm1, %ecx +; SSSE3-NEXT: pextrw $5, %xmm0, %eax +; SSSE3-NEXT: xorl %edx, %edx +; SSSE3-NEXT: movl %eax, %esi +; SSSE3-NEXT: addw %cx, %si +; SSSE3-NEXT: setns %dl +; SSSE3-NEXT: addl $32767, %edx # imm = 0x7FFF +; SSSE3-NEXT: addw %cx, %ax +; SSSE3-NEXT: cmovol %edx, %eax +; SSSE3-NEXT: pextrw $6, %xmm1, %edx +; SSSE3-NEXT: pextrw $6, %xmm0, %ecx +; SSSE3-NEXT: xorl %esi, %esi +; SSSE3-NEXT: movl %ecx, %ebx +; SSSE3-NEXT: addw %dx, %bx +; SSSE3-NEXT: setns %sil +; SSSE3-NEXT: addl $32767, %esi # imm = 0x7FFF +; SSSE3-NEXT: addw %dx, %cx +; SSSE3-NEXT: cmovol %esi, %ecx +; SSSE3-NEXT: pextrw $7, %xmm1, %edx +; SSSE3-NEXT: pextrw $7, %xmm0, %esi +; SSSE3-NEXT: xorl %ebx, %ebx +; SSSE3-NEXT: movl %esi, %ebp +; SSSE3-NEXT: addw %dx, %bp +; SSSE3-NEXT: setns %bl +; SSSE3-NEXT: addl $32767, %ebx # imm = 
0x7FFF +; SSSE3-NEXT: addw %dx, %si +; SSSE3-NEXT: cmovol %ebx, %esi +; SSSE3-NEXT: movd %esi, %xmm0 +; SSSE3-NEXT: movd %ecx, %xmm1 +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; SSSE3-NEXT: movd %eax, %xmm0 +; SSSE3-NEXT: movd %edi, %xmm2 +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] +; SSSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; SSSE3-NEXT: movd %r11d, %xmm0 +; SSSE3-NEXT: movd %r10d, %xmm1 +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; SSSE3-NEXT: movd %r9d, %xmm3 +; SSSE3-NEXT: movd %r8d, %xmm0 +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] +; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; SSSE3-NEXT: popq %rbx +; SSSE3-NEXT: popq %rbp +; SSSE3-NEXT: retq +; +; SSE41-LABEL: v8i16: +; SSE41: # %bb.0: +; SSE41-NEXT: pushq %rbp +; SSE41-NEXT: pushq %rbx +; SSE41-NEXT: pextrw $7, %xmm1, %eax +; SSE41-NEXT: pextrw $7, %xmm0, %r8d +; SSE41-NEXT: xorl %ecx, %ecx +; SSE41-NEXT: movl %r8d, %edx +; SSE41-NEXT: addw %ax, %dx +; SSE41-NEXT: setns %cl +; SSE41-NEXT: addl $32767, %ecx # imm = 0x7FFF +; SSE41-NEXT: addw %ax, %r8w +; SSE41-NEXT: cmovol %ecx, %r8d +; SSE41-NEXT: pextrw $6, %xmm1, %eax +; SSE41-NEXT: pextrw $6, %xmm0, %r9d +; SSE41-NEXT: xorl %edx, %edx +; SSE41-NEXT: movl %r9d, %esi +; SSE41-NEXT: addw %ax, %si +; SSE41-NEXT: setns %dl +; SSE41-NEXT: addl $32767, %edx # imm = 0x7FFF +; SSE41-NEXT: addw %ax, %r9w +; SSE41-NEXT: cmovol %edx, %r9d +; SSE41-NEXT: pextrw $5, %xmm1, %eax +; SSE41-NEXT: pextrw $5, %xmm0, %r10d +; SSE41-NEXT: xorl %esi, %esi +; SSE41-NEXT: movl %r10d, %edi +; SSE41-NEXT: addw %ax, %di +; SSE41-NEXT: setns %sil +; SSE41-NEXT: addl $32767, %esi # imm = 0x7FFF +; SSE41-NEXT: addw %ax, %r10w +; SSE41-NEXT: cmovol %esi, %r10d +; SSE41-NEXT: pextrw $4, %xmm1, %eax +; SSE41-NEXT: pextrw $4, %xmm0, %r11d +; SSE41-NEXT: xorl %edi, %edi +; SSE41-NEXT: movl %r11d, %ecx +; SSE41-NEXT: addw %ax, %cx +; SSE41-NEXT: setns %dil +; SSE41-NEXT: addl $32767, %edi # imm = 0x7FFF +; SSE41-NEXT: addw %ax, %r11w +; SSE41-NEXT: cmovol %edi, %r11d +; SSE41-NEXT: pextrw $3, %xmm1, %eax +; SSE41-NEXT: pextrw $3, %xmm0, %edi +; SSE41-NEXT: xorl %ecx, %ecx +; SSE41-NEXT: movl %edi, %edx +; SSE41-NEXT: addw %ax, %dx +; SSE41-NEXT: setns %cl +; SSE41-NEXT: addl $32767, %ecx # imm = 0x7FFF +; SSE41-NEXT: addw %ax, %di +; SSE41-NEXT: cmovol %ecx, %edi +; SSE41-NEXT: pextrw $2, %xmm1, %ecx +; SSE41-NEXT: pextrw $2, %xmm0, %eax +; SSE41-NEXT: xorl %edx, %edx +; SSE41-NEXT: movl %eax, %esi +; SSE41-NEXT: addw %cx, %si +; SSE41-NEXT: setns %dl +; SSE41-NEXT: addl $32767, %edx # imm = 0x7FFF +; SSE41-NEXT: addw %cx, %ax +; SSE41-NEXT: cmovol %edx, %eax +; SSE41-NEXT: movd %xmm1, %ecx +; SSE41-NEXT: movd %xmm0, %edx +; SSE41-NEXT: xorl %esi, %esi +; SSE41-NEXT: movl %edx, %ebx +; SSE41-NEXT: addw %cx, %bx +; SSE41-NEXT: setns %sil +; SSE41-NEXT: addl $32767, %esi # imm = 0x7FFF +; SSE41-NEXT: addw %cx, %dx +; SSE41-NEXT: cmovol %esi, %edx +; SSE41-NEXT: pextrw $1, %xmm1, %ecx +; SSE41-NEXT: pextrw $1, %xmm0, %esi +; SSE41-NEXT: xorl %ebx, %ebx +; SSE41-NEXT: movl %esi, %ebp +; SSE41-NEXT: addw %cx, %bp +; SSE41-NEXT: setns %bl +; SSE41-NEXT: addl $32767, %ebx # imm = 0x7FFF +; SSE41-NEXT: addw %cx, %si +; SSE41-NEXT: cmovol %ebx, %esi +; SSE41-NEXT: 
movd %edx, %xmm0 +; SSE41-NEXT: pinsrw $1, %esi, %xmm0 +; SSE41-NEXT: pinsrw $2, %eax, %xmm0 +; SSE41-NEXT: pinsrw $3, %edi, %xmm0 +; SSE41-NEXT: pinsrw $4, %r11d, %xmm0 +; SSE41-NEXT: pinsrw $5, %r10d, %xmm0 +; SSE41-NEXT: pinsrw $6, %r9d, %xmm0 +; SSE41-NEXT: pinsrw $7, %r8d, %xmm0 +; SSE41-NEXT: popq %rbx +; SSE41-NEXT: popq %rbp +; SSE41-NEXT: retq +; +; AVX-LABEL: v8i16: +; AVX: # %bb.0: +; AVX-NEXT: pushq %rbp +; AVX-NEXT: pushq %rbx +; AVX-NEXT: vpextrw $7, %xmm1, %eax +; AVX-NEXT: vpextrw $7, %xmm0, %r8d +; AVX-NEXT: xorl %ecx, %ecx +; AVX-NEXT: movl %r8d, %edx +; AVX-NEXT: addw %ax, %dx +; AVX-NEXT: setns %cl +; AVX-NEXT: addl $32767, %ecx # imm = 0x7FFF +; AVX-NEXT: addw %ax, %r8w +; AVX-NEXT: cmovol %ecx, %r8d +; AVX-NEXT: vpextrw $6, %xmm1, %eax +; AVX-NEXT: vpextrw $6, %xmm0, %r9d +; AVX-NEXT: xorl %edx, %edx +; AVX-NEXT: movl %r9d, %esi +; AVX-NEXT: addw %ax, %si +; AVX-NEXT: setns %dl +; AVX-NEXT: addl $32767, %edx # imm = 0x7FFF +; AVX-NEXT: addw %ax, %r9w +; AVX-NEXT: cmovol %edx, %r9d +; AVX-NEXT: vpextrw $5, %xmm1, %eax +; AVX-NEXT: vpextrw $5, %xmm0, %r10d +; AVX-NEXT: xorl %esi, %esi +; AVX-NEXT: movl %r10d, %edi +; AVX-NEXT: addw %ax, %di +; AVX-NEXT: setns %sil +; AVX-NEXT: addl $32767, %esi # imm = 0x7FFF +; AVX-NEXT: addw %ax, %r10w +; AVX-NEXT: cmovol %esi, %r10d +; AVX-NEXT: vpextrw $4, %xmm1, %eax +; AVX-NEXT: vpextrw $4, %xmm0, %r11d +; AVX-NEXT: xorl %edi, %edi +; AVX-NEXT: movl %r11d, %ecx +; AVX-NEXT: addw %ax, %cx +; AVX-NEXT: setns %dil +; AVX-NEXT: addl $32767, %edi # imm = 0x7FFF +; AVX-NEXT: addw %ax, %r11w +; AVX-NEXT: cmovol %edi, %r11d +; AVX-NEXT: vpextrw $3, %xmm1, %eax +; AVX-NEXT: vpextrw $3, %xmm0, %edi +; AVX-NEXT: xorl %ecx, %ecx +; AVX-NEXT: movl %edi, %edx +; AVX-NEXT: addw %ax, %dx +; AVX-NEXT: setns %cl +; AVX-NEXT: addl $32767, %ecx # imm = 0x7FFF +; AVX-NEXT: addw %ax, %di +; AVX-NEXT: cmovol %ecx, %edi +; AVX-NEXT: vpextrw $2, %xmm1, %ecx +; AVX-NEXT: vpextrw $2, %xmm0, %eax +; AVX-NEXT: xorl %edx, %edx +; AVX-NEXT: movl %eax, %esi +; AVX-NEXT: addw %cx, %si +; AVX-NEXT: setns %dl +; AVX-NEXT: addl $32767, %edx # imm = 0x7FFF +; AVX-NEXT: addw %cx, %ax +; AVX-NEXT: cmovol %edx, %eax +; AVX-NEXT: vmovd %xmm1, %ecx +; AVX-NEXT: vmovd %xmm0, %edx +; AVX-NEXT: xorl %esi, %esi +; AVX-NEXT: movl %edx, %ebx +; AVX-NEXT: addw %cx, %bx +; AVX-NEXT: setns %sil +; AVX-NEXT: addl $32767, %esi # imm = 0x7FFF +; AVX-NEXT: addw %cx, %dx +; AVX-NEXT: cmovol %esi, %edx +; AVX-NEXT: vpextrw $1, %xmm1, %ecx +; AVX-NEXT: vpextrw $1, %xmm0, %esi +; AVX-NEXT: xorl %ebx, %ebx +; AVX-NEXT: movl %esi, %ebp +; AVX-NEXT: addw %cx, %bp +; AVX-NEXT: setns %bl +; AVX-NEXT: addl $32767, %ebx # imm = 0x7FFF +; AVX-NEXT: addw %cx, %si +; AVX-NEXT: cmovol %ebx, %esi +; AVX-NEXT: vmovd %edx, %xmm0 +; AVX-NEXT: vpinsrw $1, %esi, %xmm0, %xmm0 +; AVX-NEXT: vpinsrw $2, %eax, %xmm0, %xmm0 +; AVX-NEXT: vpinsrw $3, %edi, %xmm0, %xmm0 +; AVX-NEXT: vpinsrw $4, %r11d, %xmm0, %xmm0 +; AVX-NEXT: vpinsrw $5, %r10d, %xmm0, %xmm0 +; AVX-NEXT: vpinsrw $6, %r9d, %xmm0, %xmm0 +; AVX-NEXT: vpinsrw $7, %r8d, %xmm0, %xmm0 +; AVX-NEXT: popq %rbx +; AVX-NEXT: popq %rbp +; AVX-NEXT: retq + %z = call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> %x, <8 x i16> %y) + ret <8 x i16> %z +} + +define <16 x i16> @v16i16(<16 x i16> %x, <16 x i16> %y) nounwind { +; SSE2-LABEL: v16i16: +; SSE2: # %bb.0: +; SSE2-NEXT: pushq %rbp +; SSE2-NEXT: pushq %r15 +; SSE2-NEXT: pushq %r14 +; SSE2-NEXT: pushq %r13 +; SSE2-NEXT: pushq %r12 +; SSE2-NEXT: pushq %rbx +; SSE2-NEXT: movd %xmm3, %eax +; SSE2-NEXT: movd %xmm1, 
%esi +; SSE2-NEXT: xorl %ecx, %ecx +; SSE2-NEXT: movl %esi, %edx +; SSE2-NEXT: addw %ax, %dx +; SSE2-NEXT: setns %cl +; SSE2-NEXT: addl $32767, %ecx # imm = 0x7FFF +; SSE2-NEXT: addw %ax, %si +; SSE2-NEXT: cmovol %ecx, %esi +; SSE2-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE2-NEXT: pextrw $1, %xmm3, %eax +; SSE2-NEXT: pextrw $1, %xmm1, %esi +; SSE2-NEXT: xorl %ecx, %ecx +; SSE2-NEXT: movl %esi, %edx +; SSE2-NEXT: addw %ax, %dx +; SSE2-NEXT: setns %cl +; SSE2-NEXT: addl $32767, %ecx # imm = 0x7FFF +; SSE2-NEXT: addw %ax, %si +; SSE2-NEXT: cmovol %ecx, %esi +; SSE2-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE2-NEXT: pextrw $2, %xmm3, %eax +; SSE2-NEXT: pextrw $2, %xmm1, %esi +; SSE2-NEXT: xorl %ecx, %ecx +; SSE2-NEXT: movl %esi, %edx +; SSE2-NEXT: addw %ax, %dx +; SSE2-NEXT: setns %cl +; SSE2-NEXT: addl $32767, %ecx # imm = 0x7FFF +; SSE2-NEXT: addw %ax, %si +; SSE2-NEXT: cmovol %ecx, %esi +; SSE2-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE2-NEXT: pextrw $3, %xmm3, %eax +; SSE2-NEXT: pextrw $3, %xmm1, %esi +; SSE2-NEXT: xorl %ecx, %ecx +; SSE2-NEXT: movl %esi, %edx +; SSE2-NEXT: addw %ax, %dx +; SSE2-NEXT: setns %cl +; SSE2-NEXT: addl $32767, %ecx # imm = 0x7FFF +; SSE2-NEXT: addw %ax, %si +; SSE2-NEXT: cmovol %ecx, %esi +; SSE2-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE2-NEXT: pextrw $4, %xmm3, %eax +; SSE2-NEXT: pextrw $4, %xmm1, %r14d +; SSE2-NEXT: xorl %ecx, %ecx +; SSE2-NEXT: movl %r14d, %edx +; SSE2-NEXT: addw %ax, %dx +; SSE2-NEXT: setns %cl +; SSE2-NEXT: addl $32767, %ecx # imm = 0x7FFF +; SSE2-NEXT: addw %ax, %r14w +; SSE2-NEXT: cmovol %ecx, %r14d +; SSE2-NEXT: pextrw $5, %xmm3, %eax +; SSE2-NEXT: pextrw $5, %xmm1, %r15d +; SSE2-NEXT: xorl %ecx, %ecx +; SSE2-NEXT: movl %r15d, %edx +; SSE2-NEXT: addw %ax, %dx +; SSE2-NEXT: setns %cl +; SSE2-NEXT: addl $32767, %ecx # imm = 0x7FFF +; SSE2-NEXT: addw %ax, %r15w +; SSE2-NEXT: cmovol %ecx, %r15d +; SSE2-NEXT: pextrw $6, %xmm3, %eax +; SSE2-NEXT: pextrw $6, %xmm1, %r12d +; SSE2-NEXT: xorl %ecx, %ecx +; SSE2-NEXT: movl %r12d, %edx +; SSE2-NEXT: addw %ax, %dx +; SSE2-NEXT: setns %cl +; SSE2-NEXT: addl $32767, %ecx # imm = 0x7FFF +; SSE2-NEXT: addw %ax, %r12w +; SSE2-NEXT: cmovol %ecx, %r12d +; SSE2-NEXT: pextrw $7, %xmm3, %eax +; SSE2-NEXT: pextrw $7, %xmm1, %r13d +; SSE2-NEXT: xorl %ecx, %ecx +; SSE2-NEXT: movl %r13d, %edx +; SSE2-NEXT: addw %ax, %dx +; SSE2-NEXT: setns %cl +; SSE2-NEXT: addl $32767, %ecx # imm = 0x7FFF +; SSE2-NEXT: addw %ax, %r13w +; SSE2-NEXT: cmovol %ecx, %r13d +; SSE2-NEXT: movd %xmm2, %eax +; SSE2-NEXT: movd %xmm0, %esi +; SSE2-NEXT: xorl %ecx, %ecx +; SSE2-NEXT: movl %esi, %edx +; SSE2-NEXT: addw %ax, %dx +; SSE2-NEXT: setns %cl +; SSE2-NEXT: addl $32767, %ecx # imm = 0x7FFF +; SSE2-NEXT: addw %ax, %si +; SSE2-NEXT: cmovol %ecx, %esi +; SSE2-NEXT: pextrw $1, %xmm2, %eax +; SSE2-NEXT: pextrw $1, %xmm0, %ebx +; SSE2-NEXT: xorl %ecx, %ecx +; SSE2-NEXT: movl %ebx, %edx +; SSE2-NEXT: addw %ax, %dx +; SSE2-NEXT: setns %cl +; SSE2-NEXT: addl $32767, %ecx # imm = 0x7FFF +; SSE2-NEXT: addw %ax, %bx +; SSE2-NEXT: cmovol %ecx, %ebx +; SSE2-NEXT: pextrw $2, %xmm2, %eax +; SSE2-NEXT: pextrw $2, %xmm0, %ebp +; SSE2-NEXT: xorl %ecx, %ecx +; SSE2-NEXT: movl %ebp, %edx +; SSE2-NEXT: addw %ax, %dx +; SSE2-NEXT: setns %cl +; SSE2-NEXT: addl $32767, %ecx # imm = 0x7FFF +; SSE2-NEXT: addw %ax, %bp +; SSE2-NEXT: cmovol %ecx, %ebp +; SSE2-NEXT: pextrw $3, %xmm2, %eax +; SSE2-NEXT: pextrw $3, %xmm0, %edi +; SSE2-NEXT: xorl %ecx, %ecx +; SSE2-NEXT: movl %edi, 
%edx +; SSE2-NEXT: addw %ax, %dx +; SSE2-NEXT: setns %cl +; SSE2-NEXT: addl $32767, %ecx # imm = 0x7FFF +; SSE2-NEXT: addw %ax, %di +; SSE2-NEXT: cmovol %ecx, %edi +; SSE2-NEXT: pextrw $4, %xmm2, %ecx +; SSE2-NEXT: pextrw $4, %xmm0, %eax +; SSE2-NEXT: xorl %edx, %edx +; SSE2-NEXT: movl %eax, %r8d +; SSE2-NEXT: addw %cx, %r8w +; SSE2-NEXT: setns %dl +; SSE2-NEXT: addl $32767, %edx # imm = 0x7FFF +; SSE2-NEXT: addw %cx, %ax +; SSE2-NEXT: cmovol %edx, %eax +; SSE2-NEXT: pextrw $5, %xmm2, %r8d +; SSE2-NEXT: pextrw $5, %xmm0, %ecx +; SSE2-NEXT: xorl %edx, %edx +; SSE2-NEXT: movl %ecx, %r9d +; SSE2-NEXT: addw %r8w, %r9w +; SSE2-NEXT: setns %dl +; SSE2-NEXT: addl $32767, %edx # imm = 0x7FFF +; SSE2-NEXT: addw %r8w, %cx +; SSE2-NEXT: cmovol %edx, %ecx +; SSE2-NEXT: pextrw $6, %xmm2, %r8d +; SSE2-NEXT: pextrw $6, %xmm0, %r9d +; SSE2-NEXT: xorl %edx, %edx +; SSE2-NEXT: movl %r9d, %r10d +; SSE2-NEXT: addw %r8w, %r10w +; SSE2-NEXT: setns %dl +; SSE2-NEXT: addl $32767, %edx # imm = 0x7FFF +; SSE2-NEXT: addw %r8w, %r9w +; SSE2-NEXT: cmovol %edx, %r9d +; SSE2-NEXT: pextrw $7, %xmm2, %r8d +; SSE2-NEXT: pextrw $7, %xmm0, %edx +; SSE2-NEXT: xorl %r10d, %r10d +; SSE2-NEXT: movl %edx, %r11d +; SSE2-NEXT: addw %r8w, %r11w +; SSE2-NEXT: setns %r10b +; SSE2-NEXT: addl $32767, %r10d # imm = 0x7FFF +; SSE2-NEXT: addw %r8w, %dx +; SSE2-NEXT: cmovol %r10d, %edx +; SSE2-NEXT: movd %edx, %xmm8 +; SSE2-NEXT: movd %r9d, %xmm3 +; SSE2-NEXT: movd %ecx, %xmm9 +; SSE2-NEXT: movd %eax, %xmm4 +; SSE2-NEXT: movd %edi, %xmm10 +; SSE2-NEXT: movd %ebp, %xmm7 +; SSE2-NEXT: movd %ebx, %xmm11 +; SSE2-NEXT: movd %esi, %xmm0 +; SSE2-NEXT: movd %r13d, %xmm12 +; SSE2-NEXT: movd %r12d, %xmm6 +; SSE2-NEXT: movd %r15d, %xmm13 +; SSE2-NEXT: movd %r14d, %xmm5 +; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 4-byte Folded Reload +; SSE2-NEXT: # xmm14 = mem[0],zero,zero,zero +; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 4-byte Folded Reload +; SSE2-NEXT: # xmm2 = mem[0],zero,zero,zero +; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 4-byte Folded Reload +; SSE2-NEXT: # xmm15 = mem[0],zero,zero,zero +; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Folded Reload +; SSE2-NEXT: # xmm1 = mem[0],zero,zero,zero +; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm8[0],xmm3[1],xmm8[1],xmm3[2],xmm8[2],xmm3[3],xmm8[3] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm9[0],xmm4[1],xmm9[1],xmm4[2],xmm9[2],xmm4[3],xmm9[3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm10[0],xmm7[1],xmm10[1],xmm7[2],xmm10[2],xmm7[3],xmm10[3] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm11[0],xmm0[1],xmm11[1],xmm0[2],xmm11[2],xmm0[3],xmm11[3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1] +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm4[0] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm12[0],xmm6[1],xmm12[1],xmm6[2],xmm12[2],xmm6[3],xmm12[3] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm13[0],xmm5[1],xmm13[1],xmm5[2],xmm13[2],xmm5[3],xmm13[3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm14[0],xmm2[1],xmm14[1],xmm2[2],xmm14[2],xmm2[3],xmm14[3] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm15[0],xmm1[1],xmm15[1],xmm1[2],xmm15[2],xmm1[3],xmm15[3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm5[0] +; SSE2-NEXT: popq %rbx +; SSE2-NEXT: popq %r12 +; SSE2-NEXT: popq %r13 +; 
SSE2-NEXT: popq %r14 +; SSE2-NEXT: popq %r15 +; SSE2-NEXT: popq %rbp +; SSE2-NEXT: retq +; +; SSSE3-LABEL: v16i16: +; SSSE3: # %bb.0: +; SSSE3-NEXT: pushq %rbp +; SSSE3-NEXT: pushq %r15 +; SSSE3-NEXT: pushq %r14 +; SSSE3-NEXT: pushq %r13 +; SSSE3-NEXT: pushq %r12 +; SSSE3-NEXT: pushq %rbx +; SSSE3-NEXT: movd %xmm3, %eax +; SSSE3-NEXT: movd %xmm1, %esi +; SSSE3-NEXT: xorl %ecx, %ecx +; SSSE3-NEXT: movl %esi, %edx +; SSSE3-NEXT: addw %ax, %dx +; SSSE3-NEXT: setns %cl +; SSSE3-NEXT: addl $32767, %ecx # imm = 0x7FFF +; SSSE3-NEXT: addw %ax, %si +; SSSE3-NEXT: cmovol %ecx, %esi +; SSSE3-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSSE3-NEXT: pextrw $1, %xmm3, %eax +; SSSE3-NEXT: pextrw $1, %xmm1, %esi +; SSSE3-NEXT: xorl %ecx, %ecx +; SSSE3-NEXT: movl %esi, %edx +; SSSE3-NEXT: addw %ax, %dx +; SSSE3-NEXT: setns %cl +; SSSE3-NEXT: addl $32767, %ecx # imm = 0x7FFF +; SSSE3-NEXT: addw %ax, %si +; SSSE3-NEXT: cmovol %ecx, %esi +; SSSE3-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSSE3-NEXT: pextrw $2, %xmm3, %eax +; SSSE3-NEXT: pextrw $2, %xmm1, %esi +; SSSE3-NEXT: xorl %ecx, %ecx +; SSSE3-NEXT: movl %esi, %edx +; SSSE3-NEXT: addw %ax, %dx +; SSSE3-NEXT: setns %cl +; SSSE3-NEXT: addl $32767, %ecx # imm = 0x7FFF +; SSSE3-NEXT: addw %ax, %si +; SSSE3-NEXT: cmovol %ecx, %esi +; SSSE3-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSSE3-NEXT: pextrw $3, %xmm3, %eax +; SSSE3-NEXT: pextrw $3, %xmm1, %esi +; SSSE3-NEXT: xorl %ecx, %ecx +; SSSE3-NEXT: movl %esi, %edx +; SSSE3-NEXT: addw %ax, %dx +; SSSE3-NEXT: setns %cl +; SSSE3-NEXT: addl $32767, %ecx # imm = 0x7FFF +; SSSE3-NEXT: addw %ax, %si +; SSSE3-NEXT: cmovol %ecx, %esi +; SSSE3-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSSE3-NEXT: pextrw $4, %xmm3, %eax +; SSSE3-NEXT: pextrw $4, %xmm1, %r14d +; SSSE3-NEXT: xorl %ecx, %ecx +; SSSE3-NEXT: movl %r14d, %edx +; SSSE3-NEXT: addw %ax, %dx +; SSSE3-NEXT: setns %cl +; SSSE3-NEXT: addl $32767, %ecx # imm = 0x7FFF +; SSSE3-NEXT: addw %ax, %r14w +; SSSE3-NEXT: cmovol %ecx, %r14d +; SSSE3-NEXT: pextrw $5, %xmm3, %eax +; SSSE3-NEXT: pextrw $5, %xmm1, %r15d +; SSSE3-NEXT: xorl %ecx, %ecx +; SSSE3-NEXT: movl %r15d, %edx +; SSSE3-NEXT: addw %ax, %dx +; SSSE3-NEXT: setns %cl +; SSSE3-NEXT: addl $32767, %ecx # imm = 0x7FFF +; SSSE3-NEXT: addw %ax, %r15w +; SSSE3-NEXT: cmovol %ecx, %r15d +; SSSE3-NEXT: pextrw $6, %xmm3, %eax +; SSSE3-NEXT: pextrw $6, %xmm1, %r12d +; SSSE3-NEXT: xorl %ecx, %ecx +; SSSE3-NEXT: movl %r12d, %edx +; SSSE3-NEXT: addw %ax, %dx +; SSSE3-NEXT: setns %cl +; SSSE3-NEXT: addl $32767, %ecx # imm = 0x7FFF +; SSSE3-NEXT: addw %ax, %r12w +; SSSE3-NEXT: cmovol %ecx, %r12d +; SSSE3-NEXT: pextrw $7, %xmm3, %eax +; SSSE3-NEXT: pextrw $7, %xmm1, %r13d +; SSSE3-NEXT: xorl %ecx, %ecx +; SSSE3-NEXT: movl %r13d, %edx +; SSSE3-NEXT: addw %ax, %dx +; SSSE3-NEXT: setns %cl +; SSSE3-NEXT: addl $32767, %ecx # imm = 0x7FFF +; SSSE3-NEXT: addw %ax, %r13w +; SSSE3-NEXT: cmovol %ecx, %r13d +; SSSE3-NEXT: movd %xmm2, %eax +; SSSE3-NEXT: movd %xmm0, %esi +; SSSE3-NEXT: xorl %ecx, %ecx +; SSSE3-NEXT: movl %esi, %edx +; SSSE3-NEXT: addw %ax, %dx +; SSSE3-NEXT: setns %cl +; SSSE3-NEXT: addl $32767, %ecx # imm = 0x7FFF +; SSSE3-NEXT: addw %ax, %si +; SSSE3-NEXT: cmovol %ecx, %esi +; SSSE3-NEXT: pextrw $1, %xmm2, %eax +; SSSE3-NEXT: pextrw $1, %xmm0, %ebx +; SSSE3-NEXT: xorl %ecx, %ecx +; SSSE3-NEXT: movl %ebx, %edx +; SSSE3-NEXT: addw %ax, %dx +; SSSE3-NEXT: setns %cl +; SSSE3-NEXT: addl $32767, %ecx # imm = 0x7FFF +; SSSE3-NEXT: addw %ax, %bx +; SSSE3-NEXT: 
cmovol %ecx, %ebx +; SSSE3-NEXT: pextrw $2, %xmm2, %eax +; SSSE3-NEXT: pextrw $2, %xmm0, %ebp +; SSSE3-NEXT: xorl %ecx, %ecx +; SSSE3-NEXT: movl %ebp, %edx +; SSSE3-NEXT: addw %ax, %dx +; SSSE3-NEXT: setns %cl +; SSSE3-NEXT: addl $32767, %ecx # imm = 0x7FFF +; SSSE3-NEXT: addw %ax, %bp +; SSSE3-NEXT: cmovol %ecx, %ebp +; SSSE3-NEXT: pextrw $3, %xmm2, %eax +; SSSE3-NEXT: pextrw $3, %xmm0, %edi +; SSSE3-NEXT: xorl %ecx, %ecx +; SSSE3-NEXT: movl %edi, %edx +; SSSE3-NEXT: addw %ax, %dx +; SSSE3-NEXT: setns %cl +; SSSE3-NEXT: addl $32767, %ecx # imm = 0x7FFF +; SSSE3-NEXT: addw %ax, %di +; SSSE3-NEXT: cmovol %ecx, %edi +; SSSE3-NEXT: pextrw $4, %xmm2, %ecx +; SSSE3-NEXT: pextrw $4, %xmm0, %eax +; SSSE3-NEXT: xorl %edx, %edx +; SSSE3-NEXT: movl %eax, %r8d +; SSSE3-NEXT: addw %cx, %r8w +; SSSE3-NEXT: setns %dl +; SSSE3-NEXT: addl $32767, %edx # imm = 0x7FFF +; SSSE3-NEXT: addw %cx, %ax +; SSSE3-NEXT: cmovol %edx, %eax +; SSSE3-NEXT: pextrw $5, %xmm2, %r8d +; SSSE3-NEXT: pextrw $5, %xmm0, %ecx +; SSSE3-NEXT: xorl %edx, %edx +; SSSE3-NEXT: movl %ecx, %r9d +; SSSE3-NEXT: addw %r8w, %r9w +; SSSE3-NEXT: setns %dl +; SSSE3-NEXT: addl $32767, %edx # imm = 0x7FFF +; SSSE3-NEXT: addw %r8w, %cx +; SSSE3-NEXT: cmovol %edx, %ecx +; SSSE3-NEXT: pextrw $6, %xmm2, %r8d +; SSSE3-NEXT: pextrw $6, %xmm0, %r9d +; SSSE3-NEXT: xorl %edx, %edx +; SSSE3-NEXT: movl %r9d, %r10d +; SSSE3-NEXT: addw %r8w, %r10w +; SSSE3-NEXT: setns %dl +; SSSE3-NEXT: addl $32767, %edx # imm = 0x7FFF +; SSSE3-NEXT: addw %r8w, %r9w +; SSSE3-NEXT: cmovol %edx, %r9d +; SSSE3-NEXT: pextrw $7, %xmm2, %r8d +; SSSE3-NEXT: pextrw $7, %xmm0, %edx +; SSSE3-NEXT: xorl %r10d, %r10d +; SSSE3-NEXT: movl %edx, %r11d +; SSSE3-NEXT: addw %r8w, %r11w +; SSSE3-NEXT: setns %r10b +; SSSE3-NEXT: addl $32767, %r10d # imm = 0x7FFF +; SSSE3-NEXT: addw %r8w, %dx +; SSSE3-NEXT: cmovol %r10d, %edx +; SSSE3-NEXT: movd %edx, %xmm8 +; SSSE3-NEXT: movd %r9d, %xmm3 +; SSSE3-NEXT: movd %ecx, %xmm9 +; SSSE3-NEXT: movd %eax, %xmm4 +; SSSE3-NEXT: movd %edi, %xmm10 +; SSSE3-NEXT: movd %ebp, %xmm7 +; SSSE3-NEXT: movd %ebx, %xmm11 +; SSSE3-NEXT: movd %esi, %xmm0 +; SSSE3-NEXT: movd %r13d, %xmm12 +; SSSE3-NEXT: movd %r12d, %xmm6 +; SSSE3-NEXT: movd %r15d, %xmm13 +; SSSE3-NEXT: movd %r14d, %xmm5 +; SSSE3-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 4-byte Folded Reload +; SSSE3-NEXT: # xmm14 = mem[0],zero,zero,zero +; SSSE3-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 4-byte Folded Reload +; SSSE3-NEXT: # xmm2 = mem[0],zero,zero,zero +; SSSE3-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 4-byte Folded Reload +; SSSE3-NEXT: # xmm15 = mem[0],zero,zero,zero +; SSSE3-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Folded Reload +; SSSE3-NEXT: # xmm1 = mem[0],zero,zero,zero +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm8[0],xmm3[1],xmm8[1],xmm3[2],xmm8[2],xmm3[3],xmm8[3] +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm9[0],xmm4[1],xmm9[1],xmm4[2],xmm9[2],xmm4[3],xmm9[3] +; SSSE3-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm10[0],xmm7[1],xmm10[1],xmm7[2],xmm10[2],xmm7[3],xmm10[3] +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm11[0],xmm0[1],xmm11[1],xmm0[2],xmm11[2],xmm0[3],xmm11[3] +; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1] +; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm4[0] +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm12[0],xmm6[1],xmm12[1],xmm6[2],xmm12[2],xmm6[3],xmm12[3] +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm5 = 
xmm5[0],xmm13[0],xmm5[1],xmm13[1],xmm5[2],xmm13[2],xmm5[3],xmm13[3] +; SSSE3-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1] +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm14[0],xmm2[1],xmm14[1],xmm2[2],xmm14[2],xmm2[3],xmm14[3] +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm15[0],xmm1[1],xmm15[1],xmm1[2],xmm15[2],xmm1[3],xmm15[3] +; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm5[0] +; SSSE3-NEXT: popq %rbx +; SSSE3-NEXT: popq %r12 +; SSSE3-NEXT: popq %r13 +; SSSE3-NEXT: popq %r14 +; SSSE3-NEXT: popq %r15 +; SSSE3-NEXT: popq %rbp +; SSSE3-NEXT: retq +; +; SSE41-LABEL: v16i16: +; SSE41: # %bb.0: +; SSE41-NEXT: pushq %rbp +; SSE41-NEXT: pushq %r15 +; SSE41-NEXT: pushq %r14 +; SSE41-NEXT: pushq %r13 +; SSE41-NEXT: pushq %r12 +; SSE41-NEXT: pushq %rbx +; SSE41-NEXT: pextrw $7, %xmm3, %eax +; SSE41-NEXT: pextrw $7, %xmm1, %esi +; SSE41-NEXT: xorl %ecx, %ecx +; SSE41-NEXT: movl %esi, %edx +; SSE41-NEXT: addw %ax, %dx +; SSE41-NEXT: setns %cl +; SSE41-NEXT: addl $32767, %ecx # imm = 0x7FFF +; SSE41-NEXT: addw %ax, %si +; SSE41-NEXT: cmovol %ecx, %esi +; SSE41-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE41-NEXT: pextrw $6, %xmm3, %eax +; SSE41-NEXT: pextrw $6, %xmm1, %esi +; SSE41-NEXT: xorl %ecx, %ecx +; SSE41-NEXT: movl %esi, %edx +; SSE41-NEXT: addw %ax, %dx +; SSE41-NEXT: setns %cl +; SSE41-NEXT: addl $32767, %ecx # imm = 0x7FFF +; SSE41-NEXT: addw %ax, %si +; SSE41-NEXT: cmovol %ecx, %esi +; SSE41-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE41-NEXT: pextrw $5, %xmm3, %eax +; SSE41-NEXT: pextrw $5, %xmm1, %esi +; SSE41-NEXT: xorl %ecx, %ecx +; SSE41-NEXT: movl %esi, %edx +; SSE41-NEXT: addw %ax, %dx +; SSE41-NEXT: setns %cl +; SSE41-NEXT: addl $32767, %ecx # imm = 0x7FFF +; SSE41-NEXT: addw %ax, %si +; SSE41-NEXT: cmovol %ecx, %esi +; SSE41-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE41-NEXT: pextrw $4, %xmm3, %eax +; SSE41-NEXT: pextrw $4, %xmm1, %esi +; SSE41-NEXT: xorl %ecx, %ecx +; SSE41-NEXT: movl %esi, %edx +; SSE41-NEXT: addw %ax, %dx +; SSE41-NEXT: setns %cl +; SSE41-NEXT: addl $32767, %ecx # imm = 0x7FFF +; SSE41-NEXT: addw %ax, %si +; SSE41-NEXT: cmovol %ecx, %esi +; SSE41-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE41-NEXT: pextrw $3, %xmm3, %eax +; SSE41-NEXT: pextrw $3, %xmm1, %r14d +; SSE41-NEXT: xorl %ecx, %ecx +; SSE41-NEXT: movl %r14d, %edx +; SSE41-NEXT: addw %ax, %dx +; SSE41-NEXT: setns %cl +; SSE41-NEXT: addl $32767, %ecx # imm = 0x7FFF +; SSE41-NEXT: addw %ax, %r14w +; SSE41-NEXT: cmovol %ecx, %r14d +; SSE41-NEXT: pextrw $2, %xmm3, %eax +; SSE41-NEXT: pextrw $2, %xmm1, %r15d +; SSE41-NEXT: xorl %ecx, %ecx +; SSE41-NEXT: movl %r15d, %edx +; SSE41-NEXT: addw %ax, %dx +; SSE41-NEXT: setns %cl +; SSE41-NEXT: addl $32767, %ecx # imm = 0x7FFF +; SSE41-NEXT: addw %ax, %r15w +; SSE41-NEXT: cmovol %ecx, %r15d +; SSE41-NEXT: movd %xmm3, %eax +; SSE41-NEXT: movd %xmm1, %r12d +; SSE41-NEXT: xorl %ecx, %ecx +; SSE41-NEXT: movl %r12d, %edx +; SSE41-NEXT: addw %ax, %dx +; SSE41-NEXT: setns %cl +; SSE41-NEXT: addl $32767, %ecx # imm = 0x7FFF +; SSE41-NEXT: addw %ax, %r12w +; SSE41-NEXT: cmovol %ecx, %r12d +; SSE41-NEXT: pextrw $1, %xmm3, %eax +; SSE41-NEXT: pextrw $1, %xmm1, %r13d +; SSE41-NEXT: xorl %ecx, %ecx +; SSE41-NEXT: movl %r13d, %esi +; SSE41-NEXT: addw %ax, %si +; SSE41-NEXT: setns %cl +; SSE41-NEXT: addl $32767, %ecx # imm = 0x7FFF +; SSE41-NEXT: addw %ax, %r13w +; SSE41-NEXT: cmovol %ecx, %r13d +; 
SSE41-NEXT: pextrw $7, %xmm2, %eax +; SSE41-NEXT: pextrw $7, %xmm0, %esi +; SSE41-NEXT: xorl %ecx, %ecx +; SSE41-NEXT: movl %esi, %edi +; SSE41-NEXT: addw %ax, %di +; SSE41-NEXT: setns %cl +; SSE41-NEXT: addl $32767, %ecx # imm = 0x7FFF +; SSE41-NEXT: addw %ax, %si +; SSE41-NEXT: cmovol %ecx, %esi +; SSE41-NEXT: pextrw $6, %xmm2, %eax +; SSE41-NEXT: pextrw $6, %xmm0, %ebx +; SSE41-NEXT: xorl %ecx, %ecx +; SSE41-NEXT: movl %ebx, %edi +; SSE41-NEXT: addw %ax, %di +; SSE41-NEXT: setns %cl +; SSE41-NEXT: addl $32767, %ecx # imm = 0x7FFF +; SSE41-NEXT: addw %ax, %bx +; SSE41-NEXT: cmovol %ecx, %ebx +; SSE41-NEXT: pextrw $5, %xmm2, %eax +; SSE41-NEXT: pextrw $5, %xmm0, %ebp +; SSE41-NEXT: xorl %ecx, %ecx +; SSE41-NEXT: movl %ebp, %edi +; SSE41-NEXT: addw %ax, %di +; SSE41-NEXT: setns %cl +; SSE41-NEXT: addl $32767, %ecx # imm = 0x7FFF +; SSE41-NEXT: addw %ax, %bp +; SSE41-NEXT: cmovol %ecx, %ebp +; SSE41-NEXT: pextrw $4, %xmm2, %eax +; SSE41-NEXT: pextrw $4, %xmm0, %edi +; SSE41-NEXT: xorl %ecx, %ecx +; SSE41-NEXT: movl %edi, %edx +; SSE41-NEXT: addw %ax, %dx +; SSE41-NEXT: setns %cl +; SSE41-NEXT: addl $32767, %ecx # imm = 0x7FFF +; SSE41-NEXT: addw %ax, %di +; SSE41-NEXT: cmovol %ecx, %edi +; SSE41-NEXT: pextrw $3, %xmm2, %ecx +; SSE41-NEXT: pextrw $3, %xmm0, %eax +; SSE41-NEXT: xorl %edx, %edx +; SSE41-NEXT: movl %eax, %r8d +; SSE41-NEXT: addw %cx, %r8w +; SSE41-NEXT: setns %dl +; SSE41-NEXT: addl $32767, %edx # imm = 0x7FFF +; SSE41-NEXT: addw %cx, %ax +; SSE41-NEXT: cmovol %edx, %eax +; SSE41-NEXT: pextrw $2, %xmm2, %r8d +; SSE41-NEXT: pextrw $2, %xmm0, %ecx +; SSE41-NEXT: xorl %edx, %edx +; SSE41-NEXT: movl %ecx, %r9d +; SSE41-NEXT: addw %r8w, %r9w +; SSE41-NEXT: setns %dl +; SSE41-NEXT: addl $32767, %edx # imm = 0x7FFF +; SSE41-NEXT: addw %r8w, %cx +; SSE41-NEXT: cmovol %edx, %ecx +; SSE41-NEXT: movd %xmm2, %r8d +; SSE41-NEXT: movd %xmm0, %r9d +; SSE41-NEXT: xorl %edx, %edx +; SSE41-NEXT: movl %r9d, %r10d +; SSE41-NEXT: addw %r8w, %r10w +; SSE41-NEXT: setns %dl +; SSE41-NEXT: addl $32767, %edx # imm = 0x7FFF +; SSE41-NEXT: addw %r8w, %r9w +; SSE41-NEXT: cmovol %edx, %r9d +; SSE41-NEXT: pextrw $1, %xmm2, %r8d +; SSE41-NEXT: pextrw $1, %xmm0, %edx +; SSE41-NEXT: xorl %r10d, %r10d +; SSE41-NEXT: movl %edx, %r11d +; SSE41-NEXT: addw %r8w, %r11w +; SSE41-NEXT: setns %r10b +; SSE41-NEXT: addl $32767, %r10d # imm = 0x7FFF +; SSE41-NEXT: addw %r8w, %dx +; SSE41-NEXT: cmovol %r10d, %edx +; SSE41-NEXT: movd %r9d, %xmm0 +; SSE41-NEXT: pinsrw $1, %edx, %xmm0 +; SSE41-NEXT: pinsrw $2, %ecx, %xmm0 +; SSE41-NEXT: pinsrw $3, %eax, %xmm0 +; SSE41-NEXT: pinsrw $4, %edi, %xmm0 +; SSE41-NEXT: pinsrw $5, %ebp, %xmm0 +; SSE41-NEXT: pinsrw $6, %ebx, %xmm0 +; SSE41-NEXT: pinsrw $7, %esi, %xmm0 +; SSE41-NEXT: movd %r12d, %xmm1 +; SSE41-NEXT: pinsrw $1, %r13d, %xmm1 +; SSE41-NEXT: pinsrw $2, %r15d, %xmm1 +; SSE41-NEXT: pinsrw $3, %r14d, %xmm1 +; SSE41-NEXT: pinsrw $4, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Folded Reload +; SSE41-NEXT: pinsrw $5, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Folded Reload +; SSE41-NEXT: pinsrw $6, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Folded Reload +; SSE41-NEXT: pinsrw $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Folded Reload +; SSE41-NEXT: popq %rbx +; SSE41-NEXT: popq %r12 +; SSE41-NEXT: popq %r13 +; SSE41-NEXT: popq %r14 +; SSE41-NEXT: popq %r15 +; SSE41-NEXT: popq %rbp +; SSE41-NEXT: retq +; +; AVX1-LABEL: v16i16: +; AVX1: # %bb.0: +; AVX1-NEXT: pushq %rbp +; AVX1-NEXT: pushq %r15 +; AVX1-NEXT: pushq %r14 +; AVX1-NEXT: pushq %r13 +; AVX1-NEXT: pushq %r12 +; AVX1-NEXT: pushq 
%rbx +; AVX1-NEXT: vpextrw $7, %xmm1, %eax +; AVX1-NEXT: vpextrw $7, %xmm0, %esi +; AVX1-NEXT: xorl %ecx, %ecx +; AVX1-NEXT: movl %esi, %edx +; AVX1-NEXT: addw %ax, %dx +; AVX1-NEXT: setns %cl +; AVX1-NEXT: addl $32767, %ecx # imm = 0x7FFF +; AVX1-NEXT: addw %ax, %si +; AVX1-NEXT: cmovol %ecx, %esi +; AVX1-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX1-NEXT: vpextrw $6, %xmm1, %eax +; AVX1-NEXT: vpextrw $6, %xmm0, %esi +; AVX1-NEXT: xorl %ecx, %ecx +; AVX1-NEXT: movl %esi, %edx +; AVX1-NEXT: addw %ax, %dx +; AVX1-NEXT: setns %cl +; AVX1-NEXT: addl $32767, %ecx # imm = 0x7FFF +; AVX1-NEXT: addw %ax, %si +; AVX1-NEXT: cmovol %ecx, %esi +; AVX1-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX1-NEXT: vpextrw $5, %xmm1, %eax +; AVX1-NEXT: vpextrw $5, %xmm0, %esi +; AVX1-NEXT: xorl %ecx, %ecx +; AVX1-NEXT: movl %esi, %edx +; AVX1-NEXT: addw %ax, %dx +; AVX1-NEXT: setns %cl +; AVX1-NEXT: addl $32767, %ecx # imm = 0x7FFF +; AVX1-NEXT: addw %ax, %si +; AVX1-NEXT: cmovol %ecx, %esi +; AVX1-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX1-NEXT: vpextrw $4, %xmm1, %eax +; AVX1-NEXT: vpextrw $4, %xmm0, %esi +; AVX1-NEXT: xorl %ecx, %ecx +; AVX1-NEXT: movl %esi, %edx +; AVX1-NEXT: addw %ax, %dx +; AVX1-NEXT: setns %cl +; AVX1-NEXT: addl $32767, %ecx # imm = 0x7FFF +; AVX1-NEXT: addw %ax, %si +; AVX1-NEXT: cmovol %ecx, %esi +; AVX1-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX1-NEXT: vpextrw $3, %xmm1, %eax +; AVX1-NEXT: vpextrw $3, %xmm0, %r14d +; AVX1-NEXT: xorl %ecx, %ecx +; AVX1-NEXT: movl %r14d, %edx +; AVX1-NEXT: addw %ax, %dx +; AVX1-NEXT: setns %cl +; AVX1-NEXT: addl $32767, %ecx # imm = 0x7FFF +; AVX1-NEXT: addw %ax, %r14w +; AVX1-NEXT: cmovol %ecx, %r14d +; AVX1-NEXT: vpextrw $2, %xmm1, %eax +; AVX1-NEXT: vpextrw $2, %xmm0, %r15d +; AVX1-NEXT: xorl %ecx, %ecx +; AVX1-NEXT: movl %r15d, %edx +; AVX1-NEXT: addw %ax, %dx +; AVX1-NEXT: setns %cl +; AVX1-NEXT: addl $32767, %ecx # imm = 0x7FFF +; AVX1-NEXT: addw %ax, %r15w +; AVX1-NEXT: cmovol %ecx, %r15d +; AVX1-NEXT: vmovd %xmm1, %eax +; AVX1-NEXT: vmovd %xmm0, %r12d +; AVX1-NEXT: xorl %ecx, %ecx +; AVX1-NEXT: movl %r12d, %edx +; AVX1-NEXT: addw %ax, %dx +; AVX1-NEXT: setns %cl +; AVX1-NEXT: addl $32767, %ecx # imm = 0x7FFF +; AVX1-NEXT: addw %ax, %r12w +; AVX1-NEXT: cmovol %ecx, %r12d +; AVX1-NEXT: vpextrw $1, %xmm1, %eax +; AVX1-NEXT: vpextrw $1, %xmm0, %r13d +; AVX1-NEXT: xorl %ecx, %ecx +; AVX1-NEXT: movl %r13d, %esi +; AVX1-NEXT: addw %ax, %si +; AVX1-NEXT: setns %cl +; AVX1-NEXT: addl $32767, %ecx # imm = 0x7FFF +; AVX1-NEXT: addw %ax, %r13w +; AVX1-NEXT: cmovol %ecx, %r13d +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 +; AVX1-NEXT: vpextrw $7, %xmm1, %eax +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX1-NEXT: vpextrw $7, %xmm0, %esi +; AVX1-NEXT: xorl %ecx, %ecx +; AVX1-NEXT: movl %esi, %edi +; AVX1-NEXT: addw %ax, %di +; AVX1-NEXT: setns %cl +; AVX1-NEXT: addl $32767, %ecx # imm = 0x7FFF +; AVX1-NEXT: addw %ax, %si +; AVX1-NEXT: cmovol %ecx, %esi +; AVX1-NEXT: vpextrw $6, %xmm1, %eax +; AVX1-NEXT: vpextrw $6, %xmm0, %ebx +; AVX1-NEXT: xorl %ecx, %ecx +; AVX1-NEXT: movl %ebx, %edi +; AVX1-NEXT: addw %ax, %di +; AVX1-NEXT: setns %cl +; AVX1-NEXT: addl $32767, %ecx # imm = 0x7FFF +; AVX1-NEXT: addw %ax, %bx +; AVX1-NEXT: cmovol %ecx, %ebx +; AVX1-NEXT: vpextrw $5, %xmm1, %eax +; AVX1-NEXT: vpextrw $5, %xmm0, %ebp +; AVX1-NEXT: xorl %ecx, %ecx +; AVX1-NEXT: movl %ebp, %edi +; AVX1-NEXT: addw %ax, %di +; AVX1-NEXT: setns %cl +; AVX1-NEXT: addl $32767, %ecx # imm = 0x7FFF +; 
AVX1-NEXT: addw %ax, %bp +; AVX1-NEXT: cmovol %ecx, %ebp +; AVX1-NEXT: vpextrw $4, %xmm1, %eax +; AVX1-NEXT: vpextrw $4, %xmm0, %edi +; AVX1-NEXT: xorl %ecx, %ecx +; AVX1-NEXT: movl %edi, %edx +; AVX1-NEXT: addw %ax, %dx +; AVX1-NEXT: setns %cl +; AVX1-NEXT: addl $32767, %ecx # imm = 0x7FFF +; AVX1-NEXT: addw %ax, %di +; AVX1-NEXT: cmovol %ecx, %edi +; AVX1-NEXT: vpextrw $3, %xmm1, %ecx +; AVX1-NEXT: vpextrw $3, %xmm0, %eax +; AVX1-NEXT: xorl %edx, %edx +; AVX1-NEXT: movl %eax, %r8d +; AVX1-NEXT: addw %cx, %r8w +; AVX1-NEXT: setns %dl +; AVX1-NEXT: addl $32767, %edx # imm = 0x7FFF +; AVX1-NEXT: addw %cx, %ax +; AVX1-NEXT: cmovol %edx, %eax +; AVX1-NEXT: vpextrw $2, %xmm1, %r8d +; AVX1-NEXT: vpextrw $2, %xmm0, %ecx +; AVX1-NEXT: xorl %edx, %edx +; AVX1-NEXT: movl %ecx, %r9d +; AVX1-NEXT: addw %r8w, %r9w +; AVX1-NEXT: setns %dl +; AVX1-NEXT: addl $32767, %edx # imm = 0x7FFF +; AVX1-NEXT: addw %r8w, %cx +; AVX1-NEXT: cmovol %edx, %ecx +; AVX1-NEXT: vmovd %xmm1, %r8d +; AVX1-NEXT: vmovd %xmm0, %r9d +; AVX1-NEXT: xorl %edx, %edx +; AVX1-NEXT: movl %r9d, %r10d +; AVX1-NEXT: addw %r8w, %r10w +; AVX1-NEXT: setns %dl +; AVX1-NEXT: addl $32767, %edx # imm = 0x7FFF +; AVX1-NEXT: addw %r8w, %r9w +; AVX1-NEXT: cmovol %edx, %r9d +; AVX1-NEXT: vpextrw $1, %xmm1, %r8d +; AVX1-NEXT: vpextrw $1, %xmm0, %edx +; AVX1-NEXT: xorl %r10d, %r10d +; AVX1-NEXT: movl %edx, %r11d +; AVX1-NEXT: addw %r8w, %r11w +; AVX1-NEXT: setns %r10b +; AVX1-NEXT: addl $32767, %r10d # imm = 0x7FFF +; AVX1-NEXT: addw %r8w, %dx +; AVX1-NEXT: cmovol %r10d, %edx +; AVX1-NEXT: vmovd %r9d, %xmm0 +; AVX1-NEXT: vpinsrw $1, %edx, %xmm0, %xmm0 +; AVX1-NEXT: vpinsrw $2, %ecx, %xmm0, %xmm0 +; AVX1-NEXT: vpinsrw $3, %eax, %xmm0, %xmm0 +; AVX1-NEXT: vpinsrw $4, %edi, %xmm0, %xmm0 +; AVX1-NEXT: vpinsrw $5, %ebp, %xmm0, %xmm0 +; AVX1-NEXT: vpinsrw $6, %ebx, %xmm0, %xmm0 +; AVX1-NEXT: vpinsrw $7, %esi, %xmm0, %xmm0 +; AVX1-NEXT: vmovd %r12d, %xmm1 +; AVX1-NEXT: vpinsrw $1, %r13d, %xmm1, %xmm1 +; AVX1-NEXT: vpinsrw $2, %r15d, %xmm1, %xmm1 +; AVX1-NEXT: vpinsrw $3, %r14d, %xmm1, %xmm1 +; AVX1-NEXT: vpinsrw $4, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 4-byte Folded Reload +; AVX1-NEXT: vpinsrw $5, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 4-byte Folded Reload +; AVX1-NEXT: vpinsrw $6, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 4-byte Folded Reload +; AVX1-NEXT: vpinsrw $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 4-byte Folded Reload +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: popq %rbx +; AVX1-NEXT: popq %r12 +; AVX1-NEXT: popq %r13 +; AVX1-NEXT: popq %r14 +; AVX1-NEXT: popq %r15 +; AVX1-NEXT: popq %rbp +; AVX1-NEXT: retq +; +; AVX2-LABEL: v16i16: +; AVX2: # %bb.0: +; AVX2-NEXT: pushq %rbp +; AVX2-NEXT: pushq %r15 +; AVX2-NEXT: pushq %r14 +; AVX2-NEXT: pushq %r13 +; AVX2-NEXT: pushq %r12 +; AVX2-NEXT: pushq %rbx +; AVX2-NEXT: vpextrw $7, %xmm1, %eax +; AVX2-NEXT: vpextrw $7, %xmm0, %esi +; AVX2-NEXT: xorl %ecx, %ecx +; AVX2-NEXT: movl %esi, %edx +; AVX2-NEXT: addw %ax, %dx +; AVX2-NEXT: setns %cl +; AVX2-NEXT: addl $32767, %ecx # imm = 0x7FFF +; AVX2-NEXT: addw %ax, %si +; AVX2-NEXT: cmovol %ecx, %esi +; AVX2-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX2-NEXT: vpextrw $6, %xmm1, %eax +; AVX2-NEXT: vpextrw $6, %xmm0, %esi +; AVX2-NEXT: xorl %ecx, %ecx +; AVX2-NEXT: movl %esi, %edx +; AVX2-NEXT: addw %ax, %dx +; AVX2-NEXT: setns %cl +; AVX2-NEXT: addl $32767, %ecx # imm = 0x7FFF +; AVX2-NEXT: addw %ax, %si +; AVX2-NEXT: cmovol %ecx, %esi +; AVX2-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX2-NEXT: 
vpextrw $5, %xmm1, %eax +; AVX2-NEXT: vpextrw $5, %xmm0, %esi +; AVX2-NEXT: xorl %ecx, %ecx +; AVX2-NEXT: movl %esi, %edx +; AVX2-NEXT: addw %ax, %dx +; AVX2-NEXT: setns %cl +; AVX2-NEXT: addl $32767, %ecx # imm = 0x7FFF +; AVX2-NEXT: addw %ax, %si +; AVX2-NEXT: cmovol %ecx, %esi +; AVX2-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX2-NEXT: vpextrw $4, %xmm1, %eax +; AVX2-NEXT: vpextrw $4, %xmm0, %esi +; AVX2-NEXT: xorl %ecx, %ecx +; AVX2-NEXT: movl %esi, %edx +; AVX2-NEXT: addw %ax, %dx +; AVX2-NEXT: setns %cl +; AVX2-NEXT: addl $32767, %ecx # imm = 0x7FFF +; AVX2-NEXT: addw %ax, %si +; AVX2-NEXT: cmovol %ecx, %esi +; AVX2-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX2-NEXT: vpextrw $3, %xmm1, %eax +; AVX2-NEXT: vpextrw $3, %xmm0, %r14d +; AVX2-NEXT: xorl %ecx, %ecx +; AVX2-NEXT: movl %r14d, %edx +; AVX2-NEXT: addw %ax, %dx +; AVX2-NEXT: setns %cl +; AVX2-NEXT: addl $32767, %ecx # imm = 0x7FFF +; AVX2-NEXT: addw %ax, %r14w +; AVX2-NEXT: cmovol %ecx, %r14d +; AVX2-NEXT: vpextrw $2, %xmm1, %eax +; AVX2-NEXT: vpextrw $2, %xmm0, %r15d +; AVX2-NEXT: xorl %ecx, %ecx +; AVX2-NEXT: movl %r15d, %edx +; AVX2-NEXT: addw %ax, %dx +; AVX2-NEXT: setns %cl +; AVX2-NEXT: addl $32767, %ecx # imm = 0x7FFF +; AVX2-NEXT: addw %ax, %r15w +; AVX2-NEXT: cmovol %ecx, %r15d +; AVX2-NEXT: vmovd %xmm1, %eax +; AVX2-NEXT: vmovd %xmm0, %r12d +; AVX2-NEXT: xorl %ecx, %ecx +; AVX2-NEXT: movl %r12d, %edx +; AVX2-NEXT: addw %ax, %dx +; AVX2-NEXT: setns %cl +; AVX2-NEXT: addl $32767, %ecx # imm = 0x7FFF +; AVX2-NEXT: addw %ax, %r12w +; AVX2-NEXT: cmovol %ecx, %r12d +; AVX2-NEXT: vpextrw $1, %xmm1, %eax +; AVX2-NEXT: vpextrw $1, %xmm0, %r13d +; AVX2-NEXT: xorl %ecx, %ecx +; AVX2-NEXT: movl %r13d, %esi +; AVX2-NEXT: addw %ax, %si +; AVX2-NEXT: setns %cl +; AVX2-NEXT: addl $32767, %ecx # imm = 0x7FFF +; AVX2-NEXT: addw %ax, %r13w +; AVX2-NEXT: cmovol %ecx, %r13d +; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm1 +; AVX2-NEXT: vpextrw $7, %xmm1, %eax +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX2-NEXT: vpextrw $7, %xmm0, %esi +; AVX2-NEXT: xorl %ecx, %ecx +; AVX2-NEXT: movl %esi, %edi +; AVX2-NEXT: addw %ax, %di +; AVX2-NEXT: setns %cl +; AVX2-NEXT: addl $32767, %ecx # imm = 0x7FFF +; AVX2-NEXT: addw %ax, %si +; AVX2-NEXT: cmovol %ecx, %esi +; AVX2-NEXT: vpextrw $6, %xmm1, %eax +; AVX2-NEXT: vpextrw $6, %xmm0, %ebx +; AVX2-NEXT: xorl %ecx, %ecx +; AVX2-NEXT: movl %ebx, %edi +; AVX2-NEXT: addw %ax, %di +; AVX2-NEXT: setns %cl +; AVX2-NEXT: addl $32767, %ecx # imm = 0x7FFF +; AVX2-NEXT: addw %ax, %bx +; AVX2-NEXT: cmovol %ecx, %ebx +; AVX2-NEXT: vpextrw $5, %xmm1, %eax +; AVX2-NEXT: vpextrw $5, %xmm0, %ebp +; AVX2-NEXT: xorl %ecx, %ecx +; AVX2-NEXT: movl %ebp, %edi +; AVX2-NEXT: addw %ax, %di +; AVX2-NEXT: setns %cl +; AVX2-NEXT: addl $32767, %ecx # imm = 0x7FFF +; AVX2-NEXT: addw %ax, %bp +; AVX2-NEXT: cmovol %ecx, %ebp +; AVX2-NEXT: vpextrw $4, %xmm1, %eax +; AVX2-NEXT: vpextrw $4, %xmm0, %edi +; AVX2-NEXT: xorl %ecx, %ecx +; AVX2-NEXT: movl %edi, %edx +; AVX2-NEXT: addw %ax, %dx +; AVX2-NEXT: setns %cl +; AVX2-NEXT: addl $32767, %ecx # imm = 0x7FFF +; AVX2-NEXT: addw %ax, %di +; AVX2-NEXT: cmovol %ecx, %edi +; AVX2-NEXT: vpextrw $3, %xmm1, %ecx +; AVX2-NEXT: vpextrw $3, %xmm0, %eax +; AVX2-NEXT: xorl %edx, %edx +; AVX2-NEXT: movl %eax, %r8d +; AVX2-NEXT: addw %cx, %r8w +; AVX2-NEXT: setns %dl +; AVX2-NEXT: addl $32767, %edx # imm = 0x7FFF +; AVX2-NEXT: addw %cx, %ax +; AVX2-NEXT: cmovol %edx, %eax +; AVX2-NEXT: vpextrw $2, %xmm1, %r8d +; AVX2-NEXT: vpextrw $2, %xmm0, %ecx +; AVX2-NEXT: xorl 
%edx, %edx +; AVX2-NEXT: movl %ecx, %r9d +; AVX2-NEXT: addw %r8w, %r9w +; AVX2-NEXT: setns %dl +; AVX2-NEXT: addl $32767, %edx # imm = 0x7FFF +; AVX2-NEXT: addw %r8w, %cx +; AVX2-NEXT: cmovol %edx, %ecx +; AVX2-NEXT: vmovd %xmm1, %r8d +; AVX2-NEXT: vmovd %xmm0, %r9d +; AVX2-NEXT: xorl %edx, %edx +; AVX2-NEXT: movl %r9d, %r10d +; AVX2-NEXT: addw %r8w, %r10w +; AVX2-NEXT: setns %dl +; AVX2-NEXT: addl $32767, %edx # imm = 0x7FFF +; AVX2-NEXT: addw %r8w, %r9w +; AVX2-NEXT: cmovol %edx, %r9d +; AVX2-NEXT: vpextrw $1, %xmm1, %r8d +; AVX2-NEXT: vpextrw $1, %xmm0, %edx +; AVX2-NEXT: xorl %r10d, %r10d +; AVX2-NEXT: movl %edx, %r11d +; AVX2-NEXT: addw %r8w, %r11w +; AVX2-NEXT: setns %r10b +; AVX2-NEXT: addl $32767, %r10d # imm = 0x7FFF +; AVX2-NEXT: addw %r8w, %dx +; AVX2-NEXT: cmovol %r10d, %edx +; AVX2-NEXT: vmovd %r9d, %xmm0 +; AVX2-NEXT: vpinsrw $1, %edx, %xmm0, %xmm0 +; AVX2-NEXT: vpinsrw $2, %ecx, %xmm0, %xmm0 +; AVX2-NEXT: vpinsrw $3, %eax, %xmm0, %xmm0 +; AVX2-NEXT: vpinsrw $4, %edi, %xmm0, %xmm0 +; AVX2-NEXT: vpinsrw $5, %ebp, %xmm0, %xmm0 +; AVX2-NEXT: vpinsrw $6, %ebx, %xmm0, %xmm0 +; AVX2-NEXT: vpinsrw $7, %esi, %xmm0, %xmm0 +; AVX2-NEXT: vmovd %r12d, %xmm1 +; AVX2-NEXT: vpinsrw $1, %r13d, %xmm1, %xmm1 +; AVX2-NEXT: vpinsrw $2, %r15d, %xmm1, %xmm1 +; AVX2-NEXT: vpinsrw $3, %r14d, %xmm1, %xmm1 +; AVX2-NEXT: vpinsrw $4, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 4-byte Folded Reload +; AVX2-NEXT: vpinsrw $5, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 4-byte Folded Reload +; AVX2-NEXT: vpinsrw $6, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 4-byte Folded Reload +; AVX2-NEXT: vpinsrw $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 4-byte Folded Reload +; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 +; AVX2-NEXT: popq %rbx +; AVX2-NEXT: popq %r12 +; AVX2-NEXT: popq %r13 +; AVX2-NEXT: popq %r14 +; AVX2-NEXT: popq %r15 +; AVX2-NEXT: popq %rbp +; AVX2-NEXT: retq +; +; AVX512-LABEL: v16i16: +; AVX512: # %bb.0: +; AVX512-NEXT: pushq %rbp +; AVX512-NEXT: pushq %r15 +; AVX512-NEXT: pushq %r14 +; AVX512-NEXT: pushq %r13 +; AVX512-NEXT: pushq %r12 +; AVX512-NEXT: pushq %rbx +; AVX512-NEXT: vpextrw $7, %xmm1, %eax +; AVX512-NEXT: vpextrw $7, %xmm0, %esi +; AVX512-NEXT: xorl %ecx, %ecx +; AVX512-NEXT: movl %esi, %edx +; AVX512-NEXT: addw %ax, %dx +; AVX512-NEXT: setns %cl +; AVX512-NEXT: addl $32767, %ecx # imm = 0x7FFF +; AVX512-NEXT: addw %ax, %si +; AVX512-NEXT: cmovol %ecx, %esi +; AVX512-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX512-NEXT: vpextrw $6, %xmm1, %eax +; AVX512-NEXT: vpextrw $6, %xmm0, %esi +; AVX512-NEXT: xorl %ecx, %ecx +; AVX512-NEXT: movl %esi, %edx +; AVX512-NEXT: addw %ax, %dx +; AVX512-NEXT: setns %cl +; AVX512-NEXT: addl $32767, %ecx # imm = 0x7FFF +; AVX512-NEXT: addw %ax, %si +; AVX512-NEXT: cmovol %ecx, %esi +; AVX512-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX512-NEXT: vpextrw $5, %xmm1, %eax +; AVX512-NEXT: vpextrw $5, %xmm0, %esi +; AVX512-NEXT: xorl %ecx, %ecx +; AVX512-NEXT: movl %esi, %edx +; AVX512-NEXT: addw %ax, %dx +; AVX512-NEXT: setns %cl +; AVX512-NEXT: addl $32767, %ecx # imm = 0x7FFF +; AVX512-NEXT: addw %ax, %si +; AVX512-NEXT: cmovol %ecx, %esi +; AVX512-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX512-NEXT: vpextrw $4, %xmm1, %eax +; AVX512-NEXT: vpextrw $4, %xmm0, %esi +; AVX512-NEXT: xorl %ecx, %ecx +; AVX512-NEXT: movl %esi, %edx +; AVX512-NEXT: addw %ax, %dx +; AVX512-NEXT: setns %cl +; AVX512-NEXT: addl $32767, %ecx # imm = 0x7FFF +; AVX512-NEXT: addw %ax, %si +; AVX512-NEXT: cmovol %ecx, %esi +; 
AVX512-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX512-NEXT: vpextrw $3, %xmm1, %eax +; AVX512-NEXT: vpextrw $3, %xmm0, %r14d +; AVX512-NEXT: xorl %ecx, %ecx +; AVX512-NEXT: movl %r14d, %edx +; AVX512-NEXT: addw %ax, %dx +; AVX512-NEXT: setns %cl +; AVX512-NEXT: addl $32767, %ecx # imm = 0x7FFF +; AVX512-NEXT: addw %ax, %r14w +; AVX512-NEXT: cmovol %ecx, %r14d +; AVX512-NEXT: vpextrw $2, %xmm1, %eax +; AVX512-NEXT: vpextrw $2, %xmm0, %r15d +; AVX512-NEXT: xorl %ecx, %ecx +; AVX512-NEXT: movl %r15d, %edx +; AVX512-NEXT: addw %ax, %dx +; AVX512-NEXT: setns %cl +; AVX512-NEXT: addl $32767, %ecx # imm = 0x7FFF +; AVX512-NEXT: addw %ax, %r15w +; AVX512-NEXT: cmovol %ecx, %r15d +; AVX512-NEXT: vmovd %xmm1, %eax +; AVX512-NEXT: vmovd %xmm0, %r12d +; AVX512-NEXT: xorl %ecx, %ecx +; AVX512-NEXT: movl %r12d, %edx +; AVX512-NEXT: addw %ax, %dx +; AVX512-NEXT: setns %cl +; AVX512-NEXT: addl $32767, %ecx # imm = 0x7FFF +; AVX512-NEXT: addw %ax, %r12w +; AVX512-NEXT: cmovol %ecx, %r12d +; AVX512-NEXT: vpextrw $1, %xmm1, %eax +; AVX512-NEXT: vpextrw $1, %xmm0, %r13d +; AVX512-NEXT: xorl %ecx, %ecx +; AVX512-NEXT: movl %r13d, %esi +; AVX512-NEXT: addw %ax, %si +; AVX512-NEXT: setns %cl +; AVX512-NEXT: addl $32767, %ecx # imm = 0x7FFF +; AVX512-NEXT: addw %ax, %r13w +; AVX512-NEXT: cmovol %ecx, %r13d +; AVX512-NEXT: vextracti128 $1, %ymm1, %xmm1 +; AVX512-NEXT: vpextrw $7, %xmm1, %eax +; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX512-NEXT: vpextrw $7, %xmm0, %esi +; AVX512-NEXT: xorl %ecx, %ecx +; AVX512-NEXT: movl %esi, %edi +; AVX512-NEXT: addw %ax, %di +; AVX512-NEXT: setns %cl +; AVX512-NEXT: addl $32767, %ecx # imm = 0x7FFF +; AVX512-NEXT: addw %ax, %si +; AVX512-NEXT: cmovol %ecx, %esi +; AVX512-NEXT: vpextrw $6, %xmm1, %eax +; AVX512-NEXT: vpextrw $6, %xmm0, %ebx +; AVX512-NEXT: xorl %ecx, %ecx +; AVX512-NEXT: movl %ebx, %edi +; AVX512-NEXT: addw %ax, %di +; AVX512-NEXT: setns %cl +; AVX512-NEXT: addl $32767, %ecx # imm = 0x7FFF +; AVX512-NEXT: addw %ax, %bx +; AVX512-NEXT: cmovol %ecx, %ebx +; AVX512-NEXT: vpextrw $5, %xmm1, %eax +; AVX512-NEXT: vpextrw $5, %xmm0, %ebp +; AVX512-NEXT: xorl %ecx, %ecx +; AVX512-NEXT: movl %ebp, %edi +; AVX512-NEXT: addw %ax, %di +; AVX512-NEXT: setns %cl +; AVX512-NEXT: addl $32767, %ecx # imm = 0x7FFF +; AVX512-NEXT: addw %ax, %bp +; AVX512-NEXT: cmovol %ecx, %ebp +; AVX512-NEXT: vpextrw $4, %xmm1, %eax +; AVX512-NEXT: vpextrw $4, %xmm0, %edi +; AVX512-NEXT: xorl %ecx, %ecx +; AVX512-NEXT: movl %edi, %edx +; AVX512-NEXT: addw %ax, %dx +; AVX512-NEXT: setns %cl +; AVX512-NEXT: addl $32767, %ecx # imm = 0x7FFF +; AVX512-NEXT: addw %ax, %di +; AVX512-NEXT: cmovol %ecx, %edi +; AVX512-NEXT: vpextrw $3, %xmm1, %ecx +; AVX512-NEXT: vpextrw $3, %xmm0, %eax +; AVX512-NEXT: xorl %edx, %edx +; AVX512-NEXT: movl %eax, %r8d +; AVX512-NEXT: addw %cx, %r8w +; AVX512-NEXT: setns %dl +; AVX512-NEXT: addl $32767, %edx # imm = 0x7FFF +; AVX512-NEXT: addw %cx, %ax +; AVX512-NEXT: cmovol %edx, %eax +; AVX512-NEXT: vpextrw $2, %xmm1, %r8d +; AVX512-NEXT: vpextrw $2, %xmm0, %ecx +; AVX512-NEXT: xorl %edx, %edx +; AVX512-NEXT: movl %ecx, %r9d +; AVX512-NEXT: addw %r8w, %r9w +; AVX512-NEXT: setns %dl +; AVX512-NEXT: addl $32767, %edx # imm = 0x7FFF +; AVX512-NEXT: addw %r8w, %cx +; AVX512-NEXT: cmovol %edx, %ecx +; AVX512-NEXT: vmovd %xmm1, %r8d +; AVX512-NEXT: vmovd %xmm0, %r9d +; AVX512-NEXT: xorl %edx, %edx +; AVX512-NEXT: movl %r9d, %r10d +; AVX512-NEXT: addw %r8w, %r10w +; AVX512-NEXT: setns %dl +; AVX512-NEXT: addl $32767, %edx # imm = 0x7FFF +; AVX512-NEXT: 
addw %r8w, %r9w +; AVX512-NEXT: cmovol %edx, %r9d +; AVX512-NEXT: vpextrw $1, %xmm1, %r8d +; AVX512-NEXT: vpextrw $1, %xmm0, %edx +; AVX512-NEXT: xorl %r10d, %r10d +; AVX512-NEXT: movl %edx, %r11d +; AVX512-NEXT: addw %r8w, %r11w +; AVX512-NEXT: setns %r10b +; AVX512-NEXT: addl $32767, %r10d # imm = 0x7FFF +; AVX512-NEXT: addw %r8w, %dx +; AVX512-NEXT: cmovol %r10d, %edx +; AVX512-NEXT: vmovd %r9d, %xmm0 +; AVX512-NEXT: vpinsrw $1, %edx, %xmm0, %xmm0 +; AVX512-NEXT: vpinsrw $2, %ecx, %xmm0, %xmm0 +; AVX512-NEXT: vpinsrw $3, %eax, %xmm0, %xmm0 +; AVX512-NEXT: vpinsrw $4, %edi, %xmm0, %xmm0 +; AVX512-NEXT: vpinsrw $5, %ebp, %xmm0, %xmm0 +; AVX512-NEXT: vpinsrw $6, %ebx, %xmm0, %xmm0 +; AVX512-NEXT: vpinsrw $7, %esi, %xmm0, %xmm0 +; AVX512-NEXT: vmovd %r12d, %xmm1 +; AVX512-NEXT: vpinsrw $1, %r13d, %xmm1, %xmm1 +; AVX512-NEXT: vpinsrw $2, %r15d, %xmm1, %xmm1 +; AVX512-NEXT: vpinsrw $3, %r14d, %xmm1, %xmm1 +; AVX512-NEXT: vpinsrw $4, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 4-byte Folded Reload +; AVX512-NEXT: vpinsrw $5, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 4-byte Folded Reload +; AVX512-NEXT: vpinsrw $6, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 4-byte Folded Reload +; AVX512-NEXT: vpinsrw $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 4-byte Folded Reload +; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 +; AVX512-NEXT: popq %rbx +; AVX512-NEXT: popq %r12 +; AVX512-NEXT: popq %r13 +; AVX512-NEXT: popq %r14 +; AVX512-NEXT: popq %r15 +; AVX512-NEXT: popq %rbp +; AVX512-NEXT: retq + %z = call <16 x i16> @llvm.sadd.sat.v16i16(<16 x i16> %x, <16 x i16> %y) + ret <16 x i16> %z +} + +define <32 x i16> @v32i16(<32 x i16> %x, <32 x i16> %y) nounwind { +; SSE2-LABEL: v32i16: +; SSE2: # %bb.0: +; SSE2-NEXT: pushq %rbp +; SSE2-NEXT: pushq %r15 +; SSE2-NEXT: pushq %r14 +; SSE2-NEXT: pushq %r13 +; SSE2-NEXT: pushq %r12 +; SSE2-NEXT: pushq %rbx +; SSE2-NEXT: pushq %rax +; SSE2-NEXT: movd %xmm5, %eax +; SSE2-NEXT: movd %xmm1, %esi +; SSE2-NEXT: xorl %ecx, %ecx +; SSE2-NEXT: movl %esi, %edx +; SSE2-NEXT: addw %ax, %dx +; SSE2-NEXT: setns %cl +; SSE2-NEXT: addl $32767, %ecx # imm = 0x7FFF +; SSE2-NEXT: addw %ax, %si +; SSE2-NEXT: cmovol %ecx, %esi +; SSE2-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE2-NEXT: pextrw $1, %xmm5, %eax +; SSE2-NEXT: pextrw $1, %xmm1, %esi +; SSE2-NEXT: xorl %ecx, %ecx +; SSE2-NEXT: movl %esi, %edx +; SSE2-NEXT: addw %ax, %dx +; SSE2-NEXT: setns %cl +; SSE2-NEXT: addl $32767, %ecx # imm = 0x7FFF +; SSE2-NEXT: addw %ax, %si +; SSE2-NEXT: cmovol %ecx, %esi +; SSE2-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE2-NEXT: pextrw $2, %xmm5, %eax +; SSE2-NEXT: pextrw $2, %xmm1, %esi +; SSE2-NEXT: xorl %ecx, %ecx +; SSE2-NEXT: movl %esi, %edx +; SSE2-NEXT: addw %ax, %dx +; SSE2-NEXT: setns %cl +; SSE2-NEXT: addl $32767, %ecx # imm = 0x7FFF +; SSE2-NEXT: addw %ax, %si +; SSE2-NEXT: cmovol %ecx, %esi +; SSE2-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE2-NEXT: pextrw $3, %xmm5, %eax +; SSE2-NEXT: pextrw $3, %xmm1, %esi +; SSE2-NEXT: xorl %ecx, %ecx +; SSE2-NEXT: movl %esi, %edx +; SSE2-NEXT: addw %ax, %dx +; SSE2-NEXT: setns %cl +; SSE2-NEXT: addl $32767, %ecx # imm = 0x7FFF +; SSE2-NEXT: addw %ax, %si +; SSE2-NEXT: cmovol %ecx, %esi +; SSE2-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE2-NEXT: pextrw $4, %xmm5, %eax +; SSE2-NEXT: pextrw $4, %xmm1, %esi +; SSE2-NEXT: xorl %ecx, %ecx +; SSE2-NEXT: movl %esi, %edx +; SSE2-NEXT: addw %ax, %dx +; SSE2-NEXT: setns %cl +; SSE2-NEXT: addl $32767, %ecx # imm = 0x7FFF +; 
SSE2-NEXT: addw %ax, %si +; SSE2-NEXT: cmovol %ecx, %esi +; SSE2-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE2-NEXT: pextrw $5, %xmm5, %eax +; SSE2-NEXT: pextrw $5, %xmm1, %esi +; SSE2-NEXT: xorl %ecx, %ecx +; SSE2-NEXT: movl %esi, %edx +; SSE2-NEXT: addw %ax, %dx +; SSE2-NEXT: setns %cl +; SSE2-NEXT: addl $32767, %ecx # imm = 0x7FFF +; SSE2-NEXT: addw %ax, %si +; SSE2-NEXT: cmovol %ecx, %esi +; SSE2-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE2-NEXT: pextrw $6, %xmm5, %eax +; SSE2-NEXT: pextrw $6, %xmm1, %esi +; SSE2-NEXT: xorl %ecx, %ecx +; SSE2-NEXT: movl %esi, %edx +; SSE2-NEXT: addw %ax, %dx +; SSE2-NEXT: setns %cl +; SSE2-NEXT: addl $32767, %ecx # imm = 0x7FFF +; SSE2-NEXT: addw %ax, %si +; SSE2-NEXT: cmovol %ecx, %esi +; SSE2-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE2-NEXT: pextrw $7, %xmm5, %eax +; SSE2-NEXT: pextrw $7, %xmm1, %esi +; SSE2-NEXT: xorl %ecx, %ecx +; SSE2-NEXT: movl %esi, %edx +; SSE2-NEXT: addw %ax, %dx +; SSE2-NEXT: setns %cl +; SSE2-NEXT: addl $32767, %ecx # imm = 0x7FFF +; SSE2-NEXT: addw %ax, %si +; SSE2-NEXT: cmovol %ecx, %esi +; SSE2-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE2-NEXT: movd %xmm6, %eax +; SSE2-NEXT: movd %xmm2, %esi +; SSE2-NEXT: xorl %ecx, %ecx +; SSE2-NEXT: movl %esi, %edx +; SSE2-NEXT: addw %ax, %dx +; SSE2-NEXT: setns %cl +; SSE2-NEXT: addl $32767, %ecx # imm = 0x7FFF +; SSE2-NEXT: addw %ax, %si +; SSE2-NEXT: cmovol %ecx, %esi +; SSE2-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE2-NEXT: pextrw $1, %xmm6, %eax +; SSE2-NEXT: pextrw $1, %xmm2, %esi +; SSE2-NEXT: xorl %ecx, %ecx +; SSE2-NEXT: movl %esi, %edx +; SSE2-NEXT: addw %ax, %dx +; SSE2-NEXT: setns %cl +; SSE2-NEXT: addl $32767, %ecx # imm = 0x7FFF +; SSE2-NEXT: addw %ax, %si +; SSE2-NEXT: cmovol %ecx, %esi +; SSE2-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE2-NEXT: pextrw $2, %xmm6, %eax +; SSE2-NEXT: pextrw $2, %xmm2, %esi +; SSE2-NEXT: xorl %ecx, %ecx +; SSE2-NEXT: movl %esi, %edx +; SSE2-NEXT: addw %ax, %dx +; SSE2-NEXT: setns %cl +; SSE2-NEXT: addl $32767, %ecx # imm = 0x7FFF +; SSE2-NEXT: addw %ax, %si +; SSE2-NEXT: cmovol %ecx, %esi +; SSE2-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE2-NEXT: pextrw $3, %xmm6, %eax +; SSE2-NEXT: pextrw $3, %xmm2, %esi +; SSE2-NEXT: xorl %ecx, %ecx +; SSE2-NEXT: movl %esi, %edx +; SSE2-NEXT: addw %ax, %dx +; SSE2-NEXT: setns %cl +; SSE2-NEXT: addl $32767, %ecx # imm = 0x7FFF +; SSE2-NEXT: addw %ax, %si +; SSE2-NEXT: cmovol %ecx, %esi +; SSE2-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE2-NEXT: pextrw $4, %xmm6, %eax +; SSE2-NEXT: pextrw $4, %xmm2, %esi +; SSE2-NEXT: xorl %ecx, %ecx +; SSE2-NEXT: movl %esi, %edx +; SSE2-NEXT: addw %ax, %dx +; SSE2-NEXT: setns %cl +; SSE2-NEXT: addl $32767, %ecx # imm = 0x7FFF +; SSE2-NEXT: addw %ax, %si +; SSE2-NEXT: cmovol %ecx, %esi +; SSE2-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE2-NEXT: pextrw $5, %xmm6, %eax +; SSE2-NEXT: pextrw $5, %xmm2, %esi +; SSE2-NEXT: xorl %ecx, %ecx +; SSE2-NEXT: movl %esi, %edx +; SSE2-NEXT: addw %ax, %dx +; SSE2-NEXT: setns %cl +; SSE2-NEXT: addl $32767, %ecx # imm = 0x7FFF +; SSE2-NEXT: addw %ax, %si +; SSE2-NEXT: cmovol %ecx, %esi +; SSE2-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE2-NEXT: pextrw $6, %xmm6, %eax +; SSE2-NEXT: pextrw $6, %xmm2, %esi +; SSE2-NEXT: xorl %ecx, %ecx +; SSE2-NEXT: movl %esi, %edx +; SSE2-NEXT: addw %ax, %dx +; SSE2-NEXT: setns %cl +; SSE2-NEXT: addl $32767, %ecx # 
imm = 0x7FFF +; SSE2-NEXT: addw %ax, %si +; SSE2-NEXT: cmovol %ecx, %esi +; SSE2-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE2-NEXT: pextrw $7, %xmm6, %eax +; SSE2-NEXT: pextrw $7, %xmm2, %esi +; SSE2-NEXT: xorl %ecx, %ecx +; SSE2-NEXT: movl %esi, %edx +; SSE2-NEXT: addw %ax, %dx +; SSE2-NEXT: setns %cl +; SSE2-NEXT: addl $32767, %ecx # imm = 0x7FFF +; SSE2-NEXT: addw %ax, %si +; SSE2-NEXT: cmovol %ecx, %esi +; SSE2-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE2-NEXT: movd %xmm7, %eax +; SSE2-NEXT: movd %xmm3, %esi +; SSE2-NEXT: xorl %ecx, %ecx +; SSE2-NEXT: movl %esi, %edx +; SSE2-NEXT: addw %ax, %dx +; SSE2-NEXT: setns %cl +; SSE2-NEXT: addl $32767, %ecx # imm = 0x7FFF +; SSE2-NEXT: addw %ax, %si +; SSE2-NEXT: cmovol %ecx, %esi +; SSE2-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE2-NEXT: pextrw $1, %xmm7, %eax +; SSE2-NEXT: pextrw $1, %xmm3, %esi +; SSE2-NEXT: xorl %ecx, %ecx +; SSE2-NEXT: movl %esi, %edx +; SSE2-NEXT: addw %ax, %dx +; SSE2-NEXT: setns %cl +; SSE2-NEXT: addl $32767, %ecx # imm = 0x7FFF +; SSE2-NEXT: addw %ax, %si +; SSE2-NEXT: cmovol %ecx, %esi +; SSE2-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE2-NEXT: pextrw $2, %xmm7, %eax +; SSE2-NEXT: pextrw $2, %xmm3, %esi +; SSE2-NEXT: xorl %ecx, %ecx +; SSE2-NEXT: movl %esi, %edx +; SSE2-NEXT: addw %ax, %dx +; SSE2-NEXT: setns %cl +; SSE2-NEXT: addl $32767, %ecx # imm = 0x7FFF +; SSE2-NEXT: addw %ax, %si +; SSE2-NEXT: cmovol %ecx, %esi +; SSE2-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE2-NEXT: pextrw $3, %xmm7, %eax +; SSE2-NEXT: pextrw $3, %xmm3, %esi +; SSE2-NEXT: xorl %ecx, %ecx +; SSE2-NEXT: movl %esi, %edx +; SSE2-NEXT: addw %ax, %dx +; SSE2-NEXT: setns %cl +; SSE2-NEXT: addl $32767, %ecx # imm = 0x7FFF +; SSE2-NEXT: addw %ax, %si +; SSE2-NEXT: cmovol %ecx, %esi +; SSE2-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE2-NEXT: pextrw $4, %xmm7, %eax +; SSE2-NEXT: pextrw $4, %xmm3, %ebp +; SSE2-NEXT: xorl %ecx, %ecx +; SSE2-NEXT: movl %ebp, %edx +; SSE2-NEXT: addw %ax, %dx +; SSE2-NEXT: setns %cl +; SSE2-NEXT: addl $32767, %ecx # imm = 0x7FFF +; SSE2-NEXT: addw %ax, %bp +; SSE2-NEXT: cmovol %ecx, %ebp +; SSE2-NEXT: pextrw $5, %xmm7, %eax +; SSE2-NEXT: pextrw $5, %xmm3, %ebx +; SSE2-NEXT: xorl %ecx, %ecx +; SSE2-NEXT: movl %ebx, %edx +; SSE2-NEXT: addw %ax, %dx +; SSE2-NEXT: setns %cl +; SSE2-NEXT: addl $32767, %ecx # imm = 0x7FFF +; SSE2-NEXT: addw %ax, %bx +; SSE2-NEXT: cmovol %ecx, %ebx +; SSE2-NEXT: pextrw $6, %xmm7, %eax +; SSE2-NEXT: pextrw $6, %xmm3, %r11d +; SSE2-NEXT: xorl %ecx, %ecx +; SSE2-NEXT: movl %r11d, %edx +; SSE2-NEXT: addw %ax, %dx +; SSE2-NEXT: setns %cl +; SSE2-NEXT: addl $32767, %ecx # imm = 0x7FFF +; SSE2-NEXT: addw %ax, %r11w +; SSE2-NEXT: cmovol %ecx, %r11d +; SSE2-NEXT: pextrw $7, %xmm7, %eax +; SSE2-NEXT: pextrw $7, %xmm3, %r10d +; SSE2-NEXT: xorl %ecx, %ecx +; SSE2-NEXT: movl %r10d, %edx +; SSE2-NEXT: addw %ax, %dx +; SSE2-NEXT: setns %cl +; SSE2-NEXT: addl $32767, %ecx # imm = 0x7FFF +; SSE2-NEXT: addw %ax, %r10w +; SSE2-NEXT: cmovol %ecx, %r10d +; SSE2-NEXT: movd %xmm4, %eax +; SSE2-NEXT: movd %xmm0, %r9d +; SSE2-NEXT: xorl %ecx, %ecx +; SSE2-NEXT: movl %r9d, %edx +; SSE2-NEXT: addw %ax, %dx +; SSE2-NEXT: setns %cl +; SSE2-NEXT: addl $32767, %ecx # imm = 0x7FFF +; SSE2-NEXT: addw %ax, %r9w +; SSE2-NEXT: cmovol %ecx, %r9d +; SSE2-NEXT: pextrw $1, %xmm4, %eax +; SSE2-NEXT: pextrw $1, %xmm0, %r8d +; SSE2-NEXT: xorl %ecx, %ecx +; SSE2-NEXT: movl %r8d, %edx +; SSE2-NEXT: addw %ax, %dx +; 
SSE2-NEXT: setns %cl +; SSE2-NEXT: addl $32767, %ecx # imm = 0x7FFF +; SSE2-NEXT: addw %ax, %r8w +; SSE2-NEXT: cmovol %ecx, %r8d +; SSE2-NEXT: pextrw $2, %xmm4, %eax +; SSE2-NEXT: pextrw $2, %xmm0, %edi +; SSE2-NEXT: xorl %ecx, %ecx +; SSE2-NEXT: movl %edi, %edx +; SSE2-NEXT: addw %ax, %dx +; SSE2-NEXT: setns %cl +; SSE2-NEXT: addl $32767, %ecx # imm = 0x7FFF +; SSE2-NEXT: addw %ax, %di +; SSE2-NEXT: cmovol %ecx, %edi +; SSE2-NEXT: pextrw $3, %xmm4, %eax +; SSE2-NEXT: pextrw $3, %xmm0, %esi +; SSE2-NEXT: xorl %ecx, %ecx +; SSE2-NEXT: movl %esi, %edx +; SSE2-NEXT: addw %ax, %dx +; SSE2-NEXT: setns %cl +; SSE2-NEXT: addl $32767, %ecx # imm = 0x7FFF +; SSE2-NEXT: addw %ax, %si +; SSE2-NEXT: cmovol %ecx, %esi +; SSE2-NEXT: pextrw $4, %xmm4, %eax +; SSE2-NEXT: pextrw $4, %xmm0, %edx +; SSE2-NEXT: xorl %ecx, %ecx +; SSE2-NEXT: movl %edx, %r13d +; SSE2-NEXT: addw %ax, %r13w +; SSE2-NEXT: setns %cl +; SSE2-NEXT: addl $32767, %ecx # imm = 0x7FFF +; SSE2-NEXT: addw %ax, %dx +; SSE2-NEXT: cmovol %ecx, %edx +; SSE2-NEXT: pextrw $5, %xmm4, %r13d +; SSE2-NEXT: pextrw $5, %xmm0, %ecx +; SSE2-NEXT: xorl %eax, %eax +; SSE2-NEXT: movl %ecx, %r12d +; SSE2-NEXT: addw %r13w, %r12w +; SSE2-NEXT: setns %al +; SSE2-NEXT: addl $32767, %eax # imm = 0x7FFF +; SSE2-NEXT: addw %r13w, %cx +; SSE2-NEXT: cmovol %eax, %ecx +; SSE2-NEXT: pextrw $6, %xmm4, %r12d +; SSE2-NEXT: pextrw $6, %xmm0, %r13d +; SSE2-NEXT: xorl %eax, %eax +; SSE2-NEXT: movl %r13d, %r15d +; SSE2-NEXT: addw %r12w, %r15w +; SSE2-NEXT: setns %al +; SSE2-NEXT: addl $32767, %eax # imm = 0x7FFF +; SSE2-NEXT: addw %r12w, %r13w +; SSE2-NEXT: cmovol %eax, %r13d +; SSE2-NEXT: pextrw $7, %xmm4, %r15d +; SSE2-NEXT: pextrw $7, %xmm0, %eax +; SSE2-NEXT: xorl %r12d, %r12d +; SSE2-NEXT: movl %eax, %r14d +; SSE2-NEXT: addw %r15w, %r14w +; SSE2-NEXT: setns %r12b +; SSE2-NEXT: addl $32767, %r12d # imm = 0x7FFF +; SSE2-NEXT: addw %r15w, %ax +; SSE2-NEXT: cmovol %r12d, %eax +; SSE2-NEXT: movd %eax, %xmm10 +; SSE2-NEXT: movd %r13d, %xmm12 +; SSE2-NEXT: movd %ecx, %xmm8 +; SSE2-NEXT: movd %edx, %xmm2 +; SSE2-NEXT: movd %esi, %xmm4 +; SSE2-NEXT: movd %edi, %xmm0 +; SSE2-NEXT: movd %r8d, %xmm13 +; SSE2-NEXT: movd %r9d, %xmm5 +; SSE2-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Reload +; SSE2-NEXT: # xmm1 = mem[0],zero,zero,zero +; SSE2-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 4-byte Folded Reload +; SSE2-NEXT: # xmm6 = mem[0],zero,zero,zero +; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 4-byte Folded Reload +; SSE2-NEXT: # xmm11 = mem[0],zero,zero,zero +; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 4-byte Folded Reload +; SSE2-NEXT: # xmm3 = mem[0],zero,zero,zero +; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 4-byte Folded Reload +; SSE2-NEXT: # xmm14 = mem[0],zero,zero,zero +; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 4-byte Folded Reload +; SSE2-NEXT: # xmm9 = mem[0],zero,zero,zero +; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 4-byte Folded Reload +; SSE2-NEXT: # xmm15 = mem[0],zero,zero,zero +; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Folded Reload +; SSE2-NEXT: # xmm1 = mem[0],zero,zero,zero +; SSE2-NEXT: punpcklwd {{.*#+}} xmm12 = xmm12[0],xmm10[0],xmm12[1],xmm10[1],xmm12[2],xmm10[2],xmm12[3],xmm10[3] +; SSE2-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 4-byte Reload +; SSE2-NEXT: # xmm7 = mem[0],zero,zero,zero +; SSE2-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = 
xmm2[0],xmm8[0],xmm2[1],xmm8[1],xmm2[2],xmm8[2],xmm2[3],xmm8[3] +; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 4-byte Folded Reload +; SSE2-NEXT: # xmm8 = mem[0],zero,zero,zero +; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm12[0],xmm2[1],xmm12[1] +; SSE2-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 4-byte Reload +; SSE2-NEXT: # xmm7 = mem[0],zero,zero,zero +; SSE2-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3] +; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 4-byte Folded Reload +; SSE2-NEXT: # xmm4 = mem[0],zero,zero,zero +; SSE2-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm13[0],xmm5[1],xmm13[1],xmm5[2],xmm13[2],xmm5[3],xmm13[3] +; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 4-byte Folded Reload +; SSE2-NEXT: # xmm13 = mem[0],zero,zero,zero +; SSE2-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1] +; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 4-byte Folded Reload +; SSE2-NEXT: # xmm7 = mem[0],zero,zero,zero +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm2[0] +; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 4-byte Folded Reload +; SSE2-NEXT: # xmm12 = mem[0],zero,zero,zero +; SSE2-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload +; SSE2-NEXT: # xmm6 = xmm6[0],mem[0],xmm6[1],mem[1],xmm6[2],mem[2],xmm6[3],mem[3] +; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 4-byte Folded Reload +; SSE2-NEXT: # xmm2 = mem[0],zero,zero,zero +; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm11[0],xmm3[1],xmm11[1],xmm3[2],xmm11[2],xmm3[3],xmm11[3] +; SSE2-NEXT: movd %r10d, %xmm11 +; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm6[0],xmm3[1],xmm6[1] +; SSE2-NEXT: movd %r11d, %xmm6 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm14[0],xmm9[1],xmm14[1],xmm9[2],xmm14[2],xmm9[3],xmm14[3] +; SSE2-NEXT: movd %ebx, %xmm14 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm15[0],xmm1[1],xmm15[1],xmm1[2],xmm15[2],xmm1[3],xmm15[3] +; SSE2-NEXT: movd %ebp, %xmm15 +; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm9[0],xmm1[1],xmm9[1] +; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 4-byte Folded Reload +; SSE2-NEXT: # xmm10 = mem[0],zero,zero,zero +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0] +; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload +; SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero +; SSE2-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload +; SSE2-NEXT: # xmm8 = xmm8[0],mem[0],xmm8[1],mem[1],xmm8[2],mem[2],xmm8[3],mem[3] +; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 4-byte Folded Reload +; SSE2-NEXT: # xmm9 = mem[0],zero,zero,zero +; SSE2-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE2-NEXT: # xmm4 = xmm4[0],mem[0],xmm4[1],mem[1],xmm4[2],mem[2],xmm4[3],mem[3] +; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 4-byte Folded Reload +; SSE2-NEXT: # xmm3 = mem[0],zero,zero,zero +; SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm8[0],xmm4[1],xmm8[1] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm13[0],xmm7[1],xmm13[1],xmm7[2],xmm13[2],xmm7[3],xmm13[3] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm12[0],xmm2[1],xmm12[1],xmm2[2],xmm12[2],xmm2[3],xmm12[3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm7[0],xmm2[1],xmm7[1] +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm4[0] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm11[0],xmm6[1],xmm11[1],xmm6[2],xmm11[2],xmm6[3],xmm11[3] +; SSE2-NEXT: punpcklwd 
{{.*#+}} xmm15 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm15 = xmm15[0],xmm6[0],xmm15[1],xmm6[1] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm10[0],xmm0[1],xmm10[1],xmm0[2],xmm10[2],xmm0[3],xmm10[3] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm9[0],xmm3[1],xmm9[1],xmm3[2],xmm9[2],xmm3[3],xmm9[3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm15[0] +; SSE2-NEXT: movdqa %xmm5, %xmm0 +; SSE2-NEXT: addq $8, %rsp +; SSE2-NEXT: popq %rbx +; SSE2-NEXT: popq %r12 +; SSE2-NEXT: popq %r13 +; SSE2-NEXT: popq %r14 +; SSE2-NEXT: popq %r15 +; SSE2-NEXT: popq %rbp +; SSE2-NEXT: retq +; +; SSSE3-LABEL: v32i16: +; SSSE3: # %bb.0: +; SSSE3-NEXT: pushq %rbp +; SSSE3-NEXT: pushq %r15 +; SSSE3-NEXT: pushq %r14 +; SSSE3-NEXT: pushq %r13 +; SSSE3-NEXT: pushq %r12 +; SSSE3-NEXT: pushq %rbx +; SSSE3-NEXT: pushq %rax +; SSSE3-NEXT: movd %xmm5, %eax +; SSSE3-NEXT: movd %xmm1, %esi +; SSSE3-NEXT: xorl %ecx, %ecx +; SSSE3-NEXT: movl %esi, %edx +; SSSE3-NEXT: addw %ax, %dx +; SSSE3-NEXT: setns %cl +; SSSE3-NEXT: addl $32767, %ecx # imm = 0x7FFF +; SSSE3-NEXT: addw %ax, %si +; SSSE3-NEXT: cmovol %ecx, %esi +; SSSE3-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSSE3-NEXT: pextrw $1, %xmm5, %eax +; SSSE3-NEXT: pextrw $1, %xmm1, %esi +; SSSE3-NEXT: xorl %ecx, %ecx +; SSSE3-NEXT: movl %esi, %edx +; SSSE3-NEXT: addw %ax, %dx +; SSSE3-NEXT: setns %cl +; SSSE3-NEXT: addl $32767, %ecx # imm = 0x7FFF +; SSSE3-NEXT: addw %ax, %si +; SSSE3-NEXT: cmovol %ecx, %esi +; SSSE3-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSSE3-NEXT: pextrw $2, %xmm5, %eax +; SSSE3-NEXT: pextrw $2, %xmm1, %esi +; SSSE3-NEXT: xorl %ecx, %ecx +; SSSE3-NEXT: movl %esi, %edx +; SSSE3-NEXT: addw %ax, %dx +; SSSE3-NEXT: setns %cl +; SSSE3-NEXT: addl $32767, %ecx # imm = 0x7FFF +; SSSE3-NEXT: addw %ax, %si +; SSSE3-NEXT: cmovol %ecx, %esi +; SSSE3-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSSE3-NEXT: pextrw $3, %xmm5, %eax +; SSSE3-NEXT: pextrw $3, %xmm1, %esi +; SSSE3-NEXT: xorl %ecx, %ecx +; SSSE3-NEXT: movl %esi, %edx +; SSSE3-NEXT: addw %ax, %dx +; SSSE3-NEXT: setns %cl +; SSSE3-NEXT: addl $32767, %ecx # imm = 0x7FFF +; SSSE3-NEXT: addw %ax, %si +; SSSE3-NEXT: cmovol %ecx, %esi +; SSSE3-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSSE3-NEXT: pextrw $4, %xmm5, %eax +; SSSE3-NEXT: pextrw $4, %xmm1, %esi +; SSSE3-NEXT: xorl %ecx, %ecx +; SSSE3-NEXT: movl %esi, %edx +; SSSE3-NEXT: addw %ax, %dx +; SSSE3-NEXT: setns %cl +; SSSE3-NEXT: addl $32767, %ecx # imm = 0x7FFF +; SSSE3-NEXT: addw %ax, %si +; SSSE3-NEXT: cmovol %ecx, %esi +; SSSE3-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSSE3-NEXT: pextrw $5, %xmm5, %eax +; SSSE3-NEXT: pextrw $5, %xmm1, %esi +; SSSE3-NEXT: xorl %ecx, %ecx +; SSSE3-NEXT: movl %esi, %edx +; SSSE3-NEXT: addw %ax, %dx +; SSSE3-NEXT: setns %cl +; SSSE3-NEXT: addl $32767, %ecx # imm = 0x7FFF +; SSSE3-NEXT: addw %ax, %si +; SSSE3-NEXT: cmovol %ecx, %esi +; SSSE3-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSSE3-NEXT: pextrw $6, %xmm5, %eax +; SSSE3-NEXT: pextrw $6, %xmm1, %esi +; SSSE3-NEXT: xorl %ecx, %ecx +; SSSE3-NEXT: movl %esi, %edx +; SSSE3-NEXT: addw %ax, %dx +; SSSE3-NEXT: setns %cl +; SSSE3-NEXT: addl $32767, %ecx # imm = 0x7FFF +; SSSE3-NEXT: addw %ax, %si +; SSSE3-NEXT: cmovol %ecx, %esi +; SSSE3-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSSE3-NEXT: pextrw $7, 
%xmm5, %eax +; SSSE3-NEXT: pextrw $7, %xmm1, %esi +; SSSE3-NEXT: xorl %ecx, %ecx +; SSSE3-NEXT: movl %esi, %edx +; SSSE3-NEXT: addw %ax, %dx +; SSSE3-NEXT: setns %cl +; SSSE3-NEXT: addl $32767, %ecx # imm = 0x7FFF +; SSSE3-NEXT: addw %ax, %si +; SSSE3-NEXT: cmovol %ecx, %esi +; SSSE3-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSSE3-NEXT: movd %xmm6, %eax +; SSSE3-NEXT: movd %xmm2, %esi +; SSSE3-NEXT: xorl %ecx, %ecx +; SSSE3-NEXT: movl %esi, %edx +; SSSE3-NEXT: addw %ax, %dx +; SSSE3-NEXT: setns %cl +; SSSE3-NEXT: addl $32767, %ecx # imm = 0x7FFF +; SSSE3-NEXT: addw %ax, %si +; SSSE3-NEXT: cmovol %ecx, %esi +; SSSE3-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSSE3-NEXT: pextrw $1, %xmm6, %eax +; SSSE3-NEXT: pextrw $1, %xmm2, %esi +; SSSE3-NEXT: xorl %ecx, %ecx +; SSSE3-NEXT: movl %esi, %edx +; SSSE3-NEXT: addw %ax, %dx +; SSSE3-NEXT: setns %cl +; SSSE3-NEXT: addl $32767, %ecx # imm = 0x7FFF +; SSSE3-NEXT: addw %ax, %si +; SSSE3-NEXT: cmovol %ecx, %esi +; SSSE3-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSSE3-NEXT: pextrw $2, %xmm6, %eax +; SSSE3-NEXT: pextrw $2, %xmm2, %esi +; SSSE3-NEXT: xorl %ecx, %ecx +; SSSE3-NEXT: movl %esi, %edx +; SSSE3-NEXT: addw %ax, %dx +; SSSE3-NEXT: setns %cl +; SSSE3-NEXT: addl $32767, %ecx # imm = 0x7FFF +; SSSE3-NEXT: addw %ax, %si +; SSSE3-NEXT: cmovol %ecx, %esi +; SSSE3-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSSE3-NEXT: pextrw $3, %xmm6, %eax +; SSSE3-NEXT: pextrw $3, %xmm2, %esi +; SSSE3-NEXT: xorl %ecx, %ecx +; SSSE3-NEXT: movl %esi, %edx +; SSSE3-NEXT: addw %ax, %dx +; SSSE3-NEXT: setns %cl +; SSSE3-NEXT: addl $32767, %ecx # imm = 0x7FFF +; SSSE3-NEXT: addw %ax, %si +; SSSE3-NEXT: cmovol %ecx, %esi +; SSSE3-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSSE3-NEXT: pextrw $4, %xmm6, %eax +; SSSE3-NEXT: pextrw $4, %xmm2, %esi +; SSSE3-NEXT: xorl %ecx, %ecx +; SSSE3-NEXT: movl %esi, %edx +; SSSE3-NEXT: addw %ax, %dx +; SSSE3-NEXT: setns %cl +; SSSE3-NEXT: addl $32767, %ecx # imm = 0x7FFF +; SSSE3-NEXT: addw %ax, %si +; SSSE3-NEXT: cmovol %ecx, %esi +; SSSE3-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSSE3-NEXT: pextrw $5, %xmm6, %eax +; SSSE3-NEXT: pextrw $5, %xmm2, %esi +; SSSE3-NEXT: xorl %ecx, %ecx +; SSSE3-NEXT: movl %esi, %edx +; SSSE3-NEXT: addw %ax, %dx +; SSSE3-NEXT: setns %cl +; SSSE3-NEXT: addl $32767, %ecx # imm = 0x7FFF +; SSSE3-NEXT: addw %ax, %si +; SSSE3-NEXT: cmovol %ecx, %esi +; SSSE3-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSSE3-NEXT: pextrw $6, %xmm6, %eax +; SSSE3-NEXT: pextrw $6, %xmm2, %esi +; SSSE3-NEXT: xorl %ecx, %ecx +; SSSE3-NEXT: movl %esi, %edx +; SSSE3-NEXT: addw %ax, %dx +; SSSE3-NEXT: setns %cl +; SSSE3-NEXT: addl $32767, %ecx # imm = 0x7FFF +; SSSE3-NEXT: addw %ax, %si +; SSSE3-NEXT: cmovol %ecx, %esi +; SSSE3-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSSE3-NEXT: pextrw $7, %xmm6, %eax +; SSSE3-NEXT: pextrw $7, %xmm2, %esi +; SSSE3-NEXT: xorl %ecx, %ecx +; SSSE3-NEXT: movl %esi, %edx +; SSSE3-NEXT: addw %ax, %dx +; SSSE3-NEXT: setns %cl +; SSSE3-NEXT: addl $32767, %ecx # imm = 0x7FFF +; SSSE3-NEXT: addw %ax, %si +; SSSE3-NEXT: cmovol %ecx, %esi +; SSSE3-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSSE3-NEXT: movd %xmm7, %eax +; SSSE3-NEXT: movd %xmm3, %esi +; SSSE3-NEXT: xorl %ecx, %ecx +; SSSE3-NEXT: movl %esi, %edx +; SSSE3-NEXT: addw %ax, %dx +; SSSE3-NEXT: setns %cl +; SSSE3-NEXT: addl $32767, %ecx # imm = 0x7FFF +; SSSE3-NEXT: addw %ax, %si +; SSSE3-NEXT: cmovol 
%ecx, %esi +; SSSE3-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSSE3-NEXT: pextrw $1, %xmm7, %eax +; SSSE3-NEXT: pextrw $1, %xmm3, %esi +; SSSE3-NEXT: xorl %ecx, %ecx +; SSSE3-NEXT: movl %esi, %edx +; SSSE3-NEXT: addw %ax, %dx +; SSSE3-NEXT: setns %cl +; SSSE3-NEXT: addl $32767, %ecx # imm = 0x7FFF +; SSSE3-NEXT: addw %ax, %si +; SSSE3-NEXT: cmovol %ecx, %esi +; SSSE3-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSSE3-NEXT: pextrw $2, %xmm7, %eax +; SSSE3-NEXT: pextrw $2, %xmm3, %esi +; SSSE3-NEXT: xorl %ecx, %ecx +; SSSE3-NEXT: movl %esi, %edx +; SSSE3-NEXT: addw %ax, %dx +; SSSE3-NEXT: setns %cl +; SSSE3-NEXT: addl $32767, %ecx # imm = 0x7FFF +; SSSE3-NEXT: addw %ax, %si +; SSSE3-NEXT: cmovol %ecx, %esi +; SSSE3-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSSE3-NEXT: pextrw $3, %xmm7, %eax +; SSSE3-NEXT: pextrw $3, %xmm3, %esi +; SSSE3-NEXT: xorl %ecx, %ecx +; SSSE3-NEXT: movl %esi, %edx +; SSSE3-NEXT: addw %ax, %dx +; SSSE3-NEXT: setns %cl +; SSSE3-NEXT: addl $32767, %ecx # imm = 0x7FFF +; SSSE3-NEXT: addw %ax, %si +; SSSE3-NEXT: cmovol %ecx, %esi +; SSSE3-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSSE3-NEXT: pextrw $4, %xmm7, %eax +; SSSE3-NEXT: pextrw $4, %xmm3, %ebp +; SSSE3-NEXT: xorl %ecx, %ecx +; SSSE3-NEXT: movl %ebp, %edx +; SSSE3-NEXT: addw %ax, %dx +; SSSE3-NEXT: setns %cl +; SSSE3-NEXT: addl $32767, %ecx # imm = 0x7FFF +; SSSE3-NEXT: addw %ax, %bp +; SSSE3-NEXT: cmovol %ecx, %ebp +; SSSE3-NEXT: pextrw $5, %xmm7, %eax +; SSSE3-NEXT: pextrw $5, %xmm3, %ebx +; SSSE3-NEXT: xorl %ecx, %ecx +; SSSE3-NEXT: movl %ebx, %edx +; SSSE3-NEXT: addw %ax, %dx +; SSSE3-NEXT: setns %cl +; SSSE3-NEXT: addl $32767, %ecx # imm = 0x7FFF +; SSSE3-NEXT: addw %ax, %bx +; SSSE3-NEXT: cmovol %ecx, %ebx +; SSSE3-NEXT: pextrw $6, %xmm7, %eax +; SSSE3-NEXT: pextrw $6, %xmm3, %r11d +; SSSE3-NEXT: xorl %ecx, %ecx +; SSSE3-NEXT: movl %r11d, %edx +; SSSE3-NEXT: addw %ax, %dx +; SSSE3-NEXT: setns %cl +; SSSE3-NEXT: addl $32767, %ecx # imm = 0x7FFF +; SSSE3-NEXT: addw %ax, %r11w +; SSSE3-NEXT: cmovol %ecx, %r11d +; SSSE3-NEXT: pextrw $7, %xmm7, %eax +; SSSE3-NEXT: pextrw $7, %xmm3, %r10d +; SSSE3-NEXT: xorl %ecx, %ecx +; SSSE3-NEXT: movl %r10d, %edx +; SSSE3-NEXT: addw %ax, %dx +; SSSE3-NEXT: setns %cl +; SSSE3-NEXT: addl $32767, %ecx # imm = 0x7FFF +; SSSE3-NEXT: addw %ax, %r10w +; SSSE3-NEXT: cmovol %ecx, %r10d +; SSSE3-NEXT: movd %xmm4, %eax +; SSSE3-NEXT: movd %xmm0, %r9d +; SSSE3-NEXT: xorl %ecx, %ecx +; SSSE3-NEXT: movl %r9d, %edx +; SSSE3-NEXT: addw %ax, %dx +; SSSE3-NEXT: setns %cl +; SSSE3-NEXT: addl $32767, %ecx # imm = 0x7FFF +; SSSE3-NEXT: addw %ax, %r9w +; SSSE3-NEXT: cmovol %ecx, %r9d +; SSSE3-NEXT: pextrw $1, %xmm4, %eax +; SSSE3-NEXT: pextrw $1, %xmm0, %r8d +; SSSE3-NEXT: xorl %ecx, %ecx +; SSSE3-NEXT: movl %r8d, %edx +; SSSE3-NEXT: addw %ax, %dx +; SSSE3-NEXT: setns %cl +; SSSE3-NEXT: addl $32767, %ecx # imm = 0x7FFF +; SSSE3-NEXT: addw %ax, %r8w +; SSSE3-NEXT: cmovol %ecx, %r8d +; SSSE3-NEXT: pextrw $2, %xmm4, %eax +; SSSE3-NEXT: pextrw $2, %xmm0, %edi +; SSSE3-NEXT: xorl %ecx, %ecx +; SSSE3-NEXT: movl %edi, %edx +; SSSE3-NEXT: addw %ax, %dx +; SSSE3-NEXT: setns %cl +; SSSE3-NEXT: addl $32767, %ecx # imm = 0x7FFF +; SSSE3-NEXT: addw %ax, %di +; SSSE3-NEXT: cmovol %ecx, %edi +; SSSE3-NEXT: pextrw $3, %xmm4, %eax +; SSSE3-NEXT: pextrw $3, %xmm0, %esi +; SSSE3-NEXT: xorl %ecx, %ecx +; SSSE3-NEXT: movl %esi, %edx +; SSSE3-NEXT: addw %ax, %dx +; SSSE3-NEXT: setns %cl +; SSSE3-NEXT: addl $32767, %ecx # imm = 0x7FFF +; SSSE3-NEXT: addw 
%ax, %si +; SSSE3-NEXT: cmovol %ecx, %esi +; SSSE3-NEXT: pextrw $4, %xmm4, %eax +; SSSE3-NEXT: pextrw $4, %xmm0, %edx +; SSSE3-NEXT: xorl %ecx, %ecx +; SSSE3-NEXT: movl %edx, %r13d +; SSSE3-NEXT: addw %ax, %r13w +; SSSE3-NEXT: setns %cl +; SSSE3-NEXT: addl $32767, %ecx # imm = 0x7FFF +; SSSE3-NEXT: addw %ax, %dx +; SSSE3-NEXT: cmovol %ecx, %edx +; SSSE3-NEXT: pextrw $5, %xmm4, %r13d +; SSSE3-NEXT: pextrw $5, %xmm0, %ecx +; SSSE3-NEXT: xorl %eax, %eax +; SSSE3-NEXT: movl %ecx, %r12d +; SSSE3-NEXT: addw %r13w, %r12w +; SSSE3-NEXT: setns %al +; SSSE3-NEXT: addl $32767, %eax # imm = 0x7FFF +; SSSE3-NEXT: addw %r13w, %cx +; SSSE3-NEXT: cmovol %eax, %ecx +; SSSE3-NEXT: pextrw $6, %xmm4, %r12d +; SSSE3-NEXT: pextrw $6, %xmm0, %r13d +; SSSE3-NEXT: xorl %eax, %eax +; SSSE3-NEXT: movl %r13d, %r15d +; SSSE3-NEXT: addw %r12w, %r15w +; SSSE3-NEXT: setns %al +; SSSE3-NEXT: addl $32767, %eax # imm = 0x7FFF +; SSSE3-NEXT: addw %r12w, %r13w +; SSSE3-NEXT: cmovol %eax, %r13d +; SSSE3-NEXT: pextrw $7, %xmm4, %r15d +; SSSE3-NEXT: pextrw $7, %xmm0, %eax +; SSSE3-NEXT: xorl %r12d, %r12d +; SSSE3-NEXT: movl %eax, %r14d +; SSSE3-NEXT: addw %r15w, %r14w +; SSSE3-NEXT: setns %r12b +; SSSE3-NEXT: addl $32767, %r12d # imm = 0x7FFF +; SSSE3-NEXT: addw %r15w, %ax +; SSSE3-NEXT: cmovol %r12d, %eax +; SSSE3-NEXT: movd %eax, %xmm10 +; SSSE3-NEXT: movd %r13d, %xmm12 +; SSSE3-NEXT: movd %ecx, %xmm8 +; SSSE3-NEXT: movd %edx, %xmm2 +; SSSE3-NEXT: movd %esi, %xmm4 +; SSSE3-NEXT: movd %edi, %xmm0 +; SSSE3-NEXT: movd %r8d, %xmm13 +; SSSE3-NEXT: movd %r9d, %xmm5 +; SSSE3-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Reload +; SSSE3-NEXT: # xmm1 = mem[0],zero,zero,zero +; SSSE3-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSSE3-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 4-byte Folded Reload +; SSSE3-NEXT: # xmm6 = mem[0],zero,zero,zero +; SSSE3-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 4-byte Folded Reload +; SSSE3-NEXT: # xmm11 = mem[0],zero,zero,zero +; SSSE3-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 4-byte Folded Reload +; SSSE3-NEXT: # xmm3 = mem[0],zero,zero,zero +; SSSE3-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 4-byte Folded Reload +; SSSE3-NEXT: # xmm14 = mem[0],zero,zero,zero +; SSSE3-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 4-byte Folded Reload +; SSSE3-NEXT: # xmm9 = mem[0],zero,zero,zero +; SSSE3-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 4-byte Folded Reload +; SSSE3-NEXT: # xmm15 = mem[0],zero,zero,zero +; SSSE3-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Folded Reload +; SSSE3-NEXT: # xmm1 = mem[0],zero,zero,zero +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm12 = xmm12[0],xmm10[0],xmm12[1],xmm10[1],xmm12[2],xmm10[2],xmm12[3],xmm10[3] +; SSSE3-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 4-byte Reload +; SSSE3-NEXT: # xmm7 = mem[0],zero,zero,zero +; SSSE3-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm8[0],xmm2[1],xmm8[1],xmm2[2],xmm8[2],xmm2[3],xmm8[3] +; SSSE3-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 4-byte Folded Reload +; SSSE3-NEXT: # xmm8 = mem[0],zero,zero,zero +; SSSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm12[0],xmm2[1],xmm12[1] +; SSSE3-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 4-byte Reload +; SSSE3-NEXT: # xmm7 = mem[0],zero,zero,zero +; SSSE3-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3] +; SSSE3-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 4-byte Folded Reload +; 
SSSE3-NEXT: # xmm4 = mem[0],zero,zero,zero +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm13[0],xmm5[1],xmm13[1],xmm5[2],xmm13[2],xmm5[3],xmm13[3] +; SSSE3-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 4-byte Folded Reload +; SSSE3-NEXT: # xmm13 = mem[0],zero,zero,zero +; SSSE3-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1] +; SSSE3-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 4-byte Folded Reload +; SSSE3-NEXT: # xmm7 = mem[0],zero,zero,zero +; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm2[0] +; SSSE3-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 4-byte Folded Reload +; SSSE3-NEXT: # xmm12 = mem[0],zero,zero,zero +; SSSE3-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload +; SSSE3-NEXT: # xmm6 = xmm6[0],mem[0],xmm6[1],mem[1],xmm6[2],mem[2],xmm6[3],mem[3] +; SSSE3-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 4-byte Folded Reload +; SSSE3-NEXT: # xmm2 = mem[0],zero,zero,zero +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm11[0],xmm3[1],xmm11[1],xmm3[2],xmm11[2],xmm3[3],xmm11[3] +; SSSE3-NEXT: movd %r10d, %xmm11 +; SSSE3-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm6[0],xmm3[1],xmm6[1] +; SSSE3-NEXT: movd %r11d, %xmm6 +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm14[0],xmm9[1],xmm14[1],xmm9[2],xmm14[2],xmm9[3],xmm14[3] +; SSSE3-NEXT: movd %ebx, %xmm14 +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm15[0],xmm1[1],xmm15[1],xmm1[2],xmm15[2],xmm1[3],xmm15[3] +; SSSE3-NEXT: movd %ebp, %xmm15 +; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm9[0],xmm1[1],xmm9[1] +; SSSE3-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 4-byte Folded Reload +; SSSE3-NEXT: # xmm10 = mem[0],zero,zero,zero +; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0] +; SSSE3-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload +; SSSE3-NEXT: # xmm0 = mem[0],zero,zero,zero +; SSSE3-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload +; SSSE3-NEXT: # xmm8 = xmm8[0],mem[0],xmm8[1],mem[1],xmm8[2],mem[2],xmm8[3],mem[3] +; SSSE3-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 4-byte Folded Reload +; SSSE3-NEXT: # xmm9 = mem[0],zero,zero,zero +; SSSE3-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSSE3-NEXT: # xmm4 = xmm4[0],mem[0],xmm4[1],mem[1],xmm4[2],mem[2],xmm4[3],mem[3] +; SSSE3-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 4-byte Folded Reload +; SSSE3-NEXT: # xmm3 = mem[0],zero,zero,zero +; SSSE3-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm8[0],xmm4[1],xmm8[1] +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm13[0],xmm7[1],xmm13[1],xmm7[2],xmm13[2],xmm7[3],xmm13[3] +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm12[0],xmm2[1],xmm12[1],xmm2[2],xmm12[2],xmm2[3],xmm12[3] +; SSSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm7[0],xmm2[1],xmm7[1] +; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm4[0] +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm11[0],xmm6[1],xmm11[1],xmm6[2],xmm11[2],xmm6[3],xmm11[3] +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm15 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3] +; SSSE3-NEXT: punpckldq {{.*#+}} xmm15 = xmm15[0],xmm6[0],xmm15[1],xmm6[1] +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm10[0],xmm0[1],xmm10[1],xmm0[2],xmm10[2],xmm0[3],xmm10[3] +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm9[0],xmm3[1],xmm9[1],xmm3[2],xmm9[2],xmm3[3],xmm9[3] +; SSSE3-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] +; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm15[0] +; SSSE3-NEXT: movdqa %xmm5, %xmm0 +; SSSE3-NEXT: addq $8, %rsp +; 
SSSE3-NEXT: popq %rbx +; SSSE3-NEXT: popq %r12 +; SSSE3-NEXT: popq %r13 +; SSSE3-NEXT: popq %r14 +; SSSE3-NEXT: popq %r15 +; SSSE3-NEXT: popq %rbp +; SSSE3-NEXT: retq +; +; SSE41-LABEL: v32i16: +; SSE41: # %bb.0: +; SSE41-NEXT: pushq %rbp +; SSE41-NEXT: pushq %r15 +; SSE41-NEXT: pushq %r14 +; SSE41-NEXT: pushq %r13 +; SSE41-NEXT: pushq %r12 +; SSE41-NEXT: pushq %rbx +; SSE41-NEXT: pextrw $7, %xmm5, %eax +; SSE41-NEXT: pextrw $7, %xmm1, %esi +; SSE41-NEXT: xorl %ecx, %ecx +; SSE41-NEXT: movl %esi, %edx +; SSE41-NEXT: addw %ax, %dx +; SSE41-NEXT: setns %cl +; SSE41-NEXT: addl $32767, %ecx # imm = 0x7FFF +; SSE41-NEXT: addw %ax, %si +; SSE41-NEXT: cmovol %ecx, %esi +; SSE41-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE41-NEXT: pextrw $6, %xmm5, %eax +; SSE41-NEXT: pextrw $6, %xmm1, %esi +; SSE41-NEXT: xorl %ecx, %ecx +; SSE41-NEXT: movl %esi, %edx +; SSE41-NEXT: addw %ax, %dx +; SSE41-NEXT: setns %cl +; SSE41-NEXT: addl $32767, %ecx # imm = 0x7FFF +; SSE41-NEXT: addw %ax, %si +; SSE41-NEXT: cmovol %ecx, %esi +; SSE41-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE41-NEXT: pextrw $5, %xmm5, %eax +; SSE41-NEXT: pextrw $5, %xmm1, %esi +; SSE41-NEXT: xorl %ecx, %ecx +; SSE41-NEXT: movl %esi, %edx +; SSE41-NEXT: addw %ax, %dx +; SSE41-NEXT: setns %cl +; SSE41-NEXT: addl $32767, %ecx # imm = 0x7FFF +; SSE41-NEXT: addw %ax, %si +; SSE41-NEXT: cmovol %ecx, %esi +; SSE41-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE41-NEXT: pextrw $4, %xmm5, %eax +; SSE41-NEXT: pextrw $4, %xmm1, %esi +; SSE41-NEXT: xorl %ecx, %ecx +; SSE41-NEXT: movl %esi, %edx +; SSE41-NEXT: addw %ax, %dx +; SSE41-NEXT: setns %cl +; SSE41-NEXT: addl $32767, %ecx # imm = 0x7FFF +; SSE41-NEXT: addw %ax, %si +; SSE41-NEXT: cmovol %ecx, %esi +; SSE41-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE41-NEXT: pextrw $3, %xmm5, %eax +; SSE41-NEXT: pextrw $3, %xmm1, %esi +; SSE41-NEXT: xorl %ecx, %ecx +; SSE41-NEXT: movl %esi, %edx +; SSE41-NEXT: addw %ax, %dx +; SSE41-NEXT: setns %cl +; SSE41-NEXT: addl $32767, %ecx # imm = 0x7FFF +; SSE41-NEXT: addw %ax, %si +; SSE41-NEXT: cmovol %ecx, %esi +; SSE41-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE41-NEXT: pextrw $2, %xmm5, %eax +; SSE41-NEXT: pextrw $2, %xmm1, %esi +; SSE41-NEXT: xorl %ecx, %ecx +; SSE41-NEXT: movl %esi, %edx +; SSE41-NEXT: addw %ax, %dx +; SSE41-NEXT: setns %cl +; SSE41-NEXT: addl $32767, %ecx # imm = 0x7FFF +; SSE41-NEXT: addw %ax, %si +; SSE41-NEXT: cmovol %ecx, %esi +; SSE41-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE41-NEXT: movd %xmm5, %eax +; SSE41-NEXT: movd %xmm1, %esi +; SSE41-NEXT: xorl %ecx, %ecx +; SSE41-NEXT: movl %esi, %edx +; SSE41-NEXT: addw %ax, %dx +; SSE41-NEXT: setns %cl +; SSE41-NEXT: addl $32767, %ecx # imm = 0x7FFF +; SSE41-NEXT: addw %ax, %si +; SSE41-NEXT: cmovol %ecx, %esi +; SSE41-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE41-NEXT: pextrw $1, %xmm5, %eax +; SSE41-NEXT: pextrw $1, %xmm1, %esi +; SSE41-NEXT: xorl %ecx, %ecx +; SSE41-NEXT: movl %esi, %edx +; SSE41-NEXT: addw %ax, %dx +; SSE41-NEXT: setns %cl +; SSE41-NEXT: addl $32767, %ecx # imm = 0x7FFF +; SSE41-NEXT: addw %ax, %si +; SSE41-NEXT: cmovol %ecx, %esi +; SSE41-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE41-NEXT: pextrw $7, %xmm6, %eax +; SSE41-NEXT: pextrw $7, %xmm2, %esi +; SSE41-NEXT: xorl %ecx, %ecx +; SSE41-NEXT: movl %esi, %edx +; SSE41-NEXT: addw %ax, %dx +; SSE41-NEXT: setns %cl +; SSE41-NEXT: addl $32767, %ecx # imm = 0x7FFF +; 
SSE41-NEXT: addw %ax, %si +; SSE41-NEXT: cmovol %ecx, %esi +; SSE41-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE41-NEXT: pextrw $6, %xmm6, %eax +; SSE41-NEXT: pextrw $6, %xmm2, %esi +; SSE41-NEXT: xorl %ecx, %ecx +; SSE41-NEXT: movl %esi, %edx +; SSE41-NEXT: addw %ax, %dx +; SSE41-NEXT: setns %cl +; SSE41-NEXT: addl $32767, %ecx # imm = 0x7FFF +; SSE41-NEXT: addw %ax, %si +; SSE41-NEXT: cmovol %ecx, %esi +; SSE41-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE41-NEXT: pextrw $5, %xmm6, %eax +; SSE41-NEXT: pextrw $5, %xmm2, %esi +; SSE41-NEXT: xorl %ecx, %ecx +; SSE41-NEXT: movl %esi, %edx +; SSE41-NEXT: addw %ax, %dx +; SSE41-NEXT: setns %cl +; SSE41-NEXT: addl $32767, %ecx # imm = 0x7FFF +; SSE41-NEXT: addw %ax, %si +; SSE41-NEXT: cmovol %ecx, %esi +; SSE41-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE41-NEXT: pextrw $4, %xmm6, %eax +; SSE41-NEXT: pextrw $4, %xmm2, %esi +; SSE41-NEXT: xorl %ecx, %ecx +; SSE41-NEXT: movl %esi, %edx +; SSE41-NEXT: addw %ax, %dx +; SSE41-NEXT: setns %cl +; SSE41-NEXT: addl $32767, %ecx # imm = 0x7FFF +; SSE41-NEXT: addw %ax, %si +; SSE41-NEXT: cmovol %ecx, %esi +; SSE41-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE41-NEXT: pextrw $3, %xmm6, %eax +; SSE41-NEXT: pextrw $3, %xmm2, %esi +; SSE41-NEXT: xorl %ecx, %ecx +; SSE41-NEXT: movl %esi, %edx +; SSE41-NEXT: addw %ax, %dx +; SSE41-NEXT: setns %cl +; SSE41-NEXT: addl $32767, %ecx # imm = 0x7FFF +; SSE41-NEXT: addw %ax, %si +; SSE41-NEXT: cmovol %ecx, %esi +; SSE41-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE41-NEXT: pextrw $2, %xmm6, %eax +; SSE41-NEXT: pextrw $2, %xmm2, %esi +; SSE41-NEXT: xorl %ecx, %ecx +; SSE41-NEXT: movl %esi, %edx +; SSE41-NEXT: addw %ax, %dx +; SSE41-NEXT: setns %cl +; SSE41-NEXT: addl $32767, %ecx # imm = 0x7FFF +; SSE41-NEXT: addw %ax, %si +; SSE41-NEXT: cmovol %ecx, %esi +; SSE41-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE41-NEXT: movd %xmm6, %eax +; SSE41-NEXT: movd %xmm2, %esi +; SSE41-NEXT: xorl %ecx, %ecx +; SSE41-NEXT: movl %esi, %edx +; SSE41-NEXT: addw %ax, %dx +; SSE41-NEXT: setns %cl +; SSE41-NEXT: addl $32767, %ecx # imm = 0x7FFF +; SSE41-NEXT: addw %ax, %si +; SSE41-NEXT: cmovol %ecx, %esi +; SSE41-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE41-NEXT: pextrw $1, %xmm6, %eax +; SSE41-NEXT: pextrw $1, %xmm2, %esi +; SSE41-NEXT: xorl %ecx, %ecx +; SSE41-NEXT: movl %esi, %edx +; SSE41-NEXT: addw %ax, %dx +; SSE41-NEXT: setns %cl +; SSE41-NEXT: addl $32767, %ecx # imm = 0x7FFF +; SSE41-NEXT: addw %ax, %si +; SSE41-NEXT: cmovol %ecx, %esi +; SSE41-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE41-NEXT: pextrw $7, %xmm7, %eax +; SSE41-NEXT: pextrw $7, %xmm3, %esi +; SSE41-NEXT: xorl %ecx, %ecx +; SSE41-NEXT: movl %esi, %edx +; SSE41-NEXT: addw %ax, %dx +; SSE41-NEXT: setns %cl +; SSE41-NEXT: addl $32767, %ecx # imm = 0x7FFF +; SSE41-NEXT: addw %ax, %si +; SSE41-NEXT: cmovol %ecx, %esi +; SSE41-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE41-NEXT: pextrw $6, %xmm7, %eax +; SSE41-NEXT: pextrw $6, %xmm3, %esi +; SSE41-NEXT: xorl %ecx, %ecx +; SSE41-NEXT: movl %esi, %edx +; SSE41-NEXT: addw %ax, %dx +; SSE41-NEXT: setns %cl +; SSE41-NEXT: addl $32767, %ecx # imm = 0x7FFF +; SSE41-NEXT: addw %ax, %si +; SSE41-NEXT: cmovol %ecx, %esi +; SSE41-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE41-NEXT: pextrw $5, %xmm7, %eax +; SSE41-NEXT: pextrw $5, %xmm3, %esi +; SSE41-NEXT: xorl %ecx, %ecx +; SSE41-NEXT: movl 
%esi, %edx +; SSE41-NEXT: addw %ax, %dx +; SSE41-NEXT: setns %cl +; SSE41-NEXT: addl $32767, %ecx # imm = 0x7FFF +; SSE41-NEXT: addw %ax, %si +; SSE41-NEXT: cmovol %ecx, %esi +; SSE41-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE41-NEXT: pextrw $4, %xmm7, %eax +; SSE41-NEXT: pextrw $4, %xmm3, %esi +; SSE41-NEXT: xorl %ecx, %ecx +; SSE41-NEXT: movl %esi, %edx +; SSE41-NEXT: addw %ax, %dx +; SSE41-NEXT: setns %cl +; SSE41-NEXT: addl $32767, %ecx # imm = 0x7FFF +; SSE41-NEXT: addw %ax, %si +; SSE41-NEXT: cmovol %ecx, %esi +; SSE41-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE41-NEXT: pextrw $3, %xmm7, %eax +; SSE41-NEXT: pextrw $3, %xmm3, %ebx +; SSE41-NEXT: xorl %ecx, %ecx +; SSE41-NEXT: movl %ebx, %edx +; SSE41-NEXT: addw %ax, %dx +; SSE41-NEXT: setns %cl +; SSE41-NEXT: addl $32767, %ecx # imm = 0x7FFF +; SSE41-NEXT: addw %ax, %bx +; SSE41-NEXT: cmovol %ecx, %ebx +; SSE41-NEXT: pextrw $2, %xmm7, %eax +; SSE41-NEXT: pextrw $2, %xmm3, %r11d +; SSE41-NEXT: xorl %ecx, %ecx +; SSE41-NEXT: movl %r11d, %edx +; SSE41-NEXT: addw %ax, %dx +; SSE41-NEXT: setns %cl +; SSE41-NEXT: addl $32767, %ecx # imm = 0x7FFF +; SSE41-NEXT: addw %ax, %r11w +; SSE41-NEXT: cmovol %ecx, %r11d +; SSE41-NEXT: movd %xmm7, %eax +; SSE41-NEXT: movd %xmm3, %r10d +; SSE41-NEXT: xorl %ecx, %ecx +; SSE41-NEXT: movl %r10d, %edx +; SSE41-NEXT: addw %ax, %dx +; SSE41-NEXT: setns %cl +; SSE41-NEXT: addl $32767, %ecx # imm = 0x7FFF +; SSE41-NEXT: addw %ax, %r10w +; SSE41-NEXT: cmovol %ecx, %r10d +; SSE41-NEXT: pextrw $1, %xmm7, %eax +; SSE41-NEXT: pextrw $1, %xmm3, %r9d +; SSE41-NEXT: xorl %ecx, %ecx +; SSE41-NEXT: movl %r9d, %edx +; SSE41-NEXT: addw %ax, %dx +; SSE41-NEXT: setns %cl +; SSE41-NEXT: addl $32767, %ecx # imm = 0x7FFF +; SSE41-NEXT: addw %ax, %r9w +; SSE41-NEXT: cmovol %ecx, %r9d +; SSE41-NEXT: pextrw $7, %xmm4, %eax +; SSE41-NEXT: pextrw $7, %xmm0, %r8d +; SSE41-NEXT: xorl %ecx, %ecx +; SSE41-NEXT: movl %r8d, %edx +; SSE41-NEXT: addw %ax, %dx +; SSE41-NEXT: setns %cl +; SSE41-NEXT: addl $32767, %ecx # imm = 0x7FFF +; SSE41-NEXT: addw %ax, %r8w +; SSE41-NEXT: cmovol %ecx, %r8d +; SSE41-NEXT: pextrw $6, %xmm4, %eax +; SSE41-NEXT: pextrw $6, %xmm0, %edi +; SSE41-NEXT: xorl %ecx, %ecx +; SSE41-NEXT: movl %edi, %edx +; SSE41-NEXT: addw %ax, %dx +; SSE41-NEXT: setns %cl +; SSE41-NEXT: addl $32767, %ecx # imm = 0x7FFF +; SSE41-NEXT: addw %ax, %di +; SSE41-NEXT: cmovol %ecx, %edi +; SSE41-NEXT: pextrw $5, %xmm4, %eax +; SSE41-NEXT: pextrw $5, %xmm0, %esi +; SSE41-NEXT: xorl %ecx, %ecx +; SSE41-NEXT: movl %esi, %edx +; SSE41-NEXT: addw %ax, %dx +; SSE41-NEXT: setns %cl +; SSE41-NEXT: addl $32767, %ecx # imm = 0x7FFF +; SSE41-NEXT: addw %ax, %si +; SSE41-NEXT: cmovol %ecx, %esi +; SSE41-NEXT: pextrw $4, %xmm4, %eax +; SSE41-NEXT: pextrw $4, %xmm0, %edx +; SSE41-NEXT: xorl %ecx, %ecx +; SSE41-NEXT: movl %edx, %r13d +; SSE41-NEXT: addw %ax, %r13w +; SSE41-NEXT: setns %cl +; SSE41-NEXT: addl $32767, %ecx # imm = 0x7FFF +; SSE41-NEXT: addw %ax, %dx +; SSE41-NEXT: cmovol %ecx, %edx +; SSE41-NEXT: pextrw $3, %xmm4, %eax +; SSE41-NEXT: pextrw $3, %xmm0, %r13d +; SSE41-NEXT: xorl %ecx, %ecx +; SSE41-NEXT: movl %r13d, %r12d +; SSE41-NEXT: addw %ax, %r12w +; SSE41-NEXT: setns %cl +; SSE41-NEXT: addl $32767, %ecx # imm = 0x7FFF +; SSE41-NEXT: addw %ax, %r13w +; SSE41-NEXT: cmovol %ecx, %r13d +; SSE41-NEXT: pextrw $2, %xmm4, %r12d +; SSE41-NEXT: pextrw $2, %xmm0, %eax +; SSE41-NEXT: xorl %ecx, %ecx +; SSE41-NEXT: movl %eax, %r15d +; SSE41-NEXT: addw %r12w, %r15w +; SSE41-NEXT: setns %cl +; SSE41-NEXT: 
addl $32767, %ecx # imm = 0x7FFF +; SSE41-NEXT: addw %r12w, %ax +; SSE41-NEXT: cmovol %ecx, %eax +; SSE41-NEXT: movd %xmm4, %r15d +; SSE41-NEXT: movd %xmm0, %r12d +; SSE41-NEXT: xorl %ecx, %ecx +; SSE41-NEXT: movl %r12d, %r14d +; SSE41-NEXT: addw %r15w, %r14w +; SSE41-NEXT: setns %cl +; SSE41-NEXT: addl $32767, %ecx # imm = 0x7FFF +; SSE41-NEXT: addw %r15w, %r12w +; SSE41-NEXT: cmovol %ecx, %r12d +; SSE41-NEXT: pextrw $1, %xmm4, %r14d +; SSE41-NEXT: pextrw $1, %xmm0, %ecx +; SSE41-NEXT: xorl %r15d, %r15d +; SSE41-NEXT: movl %ecx, %ebp +; SSE41-NEXT: addw %r14w, %bp +; SSE41-NEXT: setns %r15b +; SSE41-NEXT: addl $32767, %r15d # imm = 0x7FFF +; SSE41-NEXT: addw %r14w, %cx +; SSE41-NEXT: cmovol %r15d, %ecx +; SSE41-NEXT: movd %r12d, %xmm0 +; SSE41-NEXT: pinsrw $1, %ecx, %xmm0 +; SSE41-NEXT: pinsrw $2, %eax, %xmm0 +; SSE41-NEXT: pinsrw $3, %r13d, %xmm0 +; SSE41-NEXT: pinsrw $4, %edx, %xmm0 +; SSE41-NEXT: pinsrw $5, %esi, %xmm0 +; SSE41-NEXT: pinsrw $6, %edi, %xmm0 +; SSE41-NEXT: pinsrw $7, %r8d, %xmm0 +; SSE41-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Folded Reload +; SSE41-NEXT: # xmm1 = mem[0],zero,zero,zero +; SSE41-NEXT: pinsrw $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Folded Reload +; SSE41-NEXT: pinsrw $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Folded Reload +; SSE41-NEXT: pinsrw $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Folded Reload +; SSE41-NEXT: pinsrw $4, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Folded Reload +; SSE41-NEXT: pinsrw $5, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Folded Reload +; SSE41-NEXT: pinsrw $6, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Folded Reload +; SSE41-NEXT: pinsrw $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Folded Reload +; SSE41-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 4-byte Folded Reload +; SSE41-NEXT: # xmm2 = mem[0],zero,zero,zero +; SSE41-NEXT: pinsrw $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 4-byte Folded Reload +; SSE41-NEXT: pinsrw $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 4-byte Folded Reload +; SSE41-NEXT: pinsrw $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 4-byte Folded Reload +; SSE41-NEXT: pinsrw $4, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 4-byte Folded Reload +; SSE41-NEXT: pinsrw $5, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 4-byte Folded Reload +; SSE41-NEXT: pinsrw $6, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 4-byte Folded Reload +; SSE41-NEXT: pinsrw $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 4-byte Folded Reload +; SSE41-NEXT: movd %r10d, %xmm3 +; SSE41-NEXT: pinsrw $1, %r9d, %xmm3 +; SSE41-NEXT: pinsrw $2, %r11d, %xmm3 +; SSE41-NEXT: pinsrw $3, %ebx, %xmm3 +; SSE41-NEXT: pinsrw $4, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 4-byte Folded Reload +; SSE41-NEXT: pinsrw $5, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 4-byte Folded Reload +; SSE41-NEXT: pinsrw $6, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 4-byte Folded Reload +; SSE41-NEXT: pinsrw $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 4-byte Folded Reload +; SSE41-NEXT: popq %rbx +; SSE41-NEXT: popq %r12 +; SSE41-NEXT: popq %r13 +; SSE41-NEXT: popq %r14 +; SSE41-NEXT: popq %r15 +; SSE41-NEXT: popq %rbp +; SSE41-NEXT: retq +; +; AVX1-LABEL: v32i16: +; AVX1: # %bb.0: +; AVX1-NEXT: pushq %rbp +; AVX1-NEXT: pushq %r15 +; AVX1-NEXT: pushq %r14 +; AVX1-NEXT: pushq %r13 +; AVX1-NEXT: pushq %r12 +; AVX1-NEXT: pushq %rbx +; AVX1-NEXT: vpextrw $7, %xmm3, %eax +; AVX1-NEXT: vpextrw $7, %xmm1, %esi +; AVX1-NEXT: xorl %ecx, %ecx +; AVX1-NEXT: movl %esi, %edx +; AVX1-NEXT: addw %ax, %dx +; AVX1-NEXT: setns %cl +; AVX1-NEXT: addl $32767, %ecx # imm = 0x7FFF +; AVX1-NEXT: addw %ax, %si +; AVX1-NEXT: cmovol %ecx, %esi +; AVX1-NEXT: movl %esi, 
{{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX1-NEXT: vpextrw $6, %xmm3, %eax +; AVX1-NEXT: vpextrw $6, %xmm1, %esi +; AVX1-NEXT: xorl %ecx, %ecx +; AVX1-NEXT: movl %esi, %edx +; AVX1-NEXT: addw %ax, %dx +; AVX1-NEXT: setns %cl +; AVX1-NEXT: addl $32767, %ecx # imm = 0x7FFF +; AVX1-NEXT: addw %ax, %si +; AVX1-NEXT: cmovol %ecx, %esi +; AVX1-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX1-NEXT: vpextrw $5, %xmm3, %eax +; AVX1-NEXT: vpextrw $5, %xmm1, %esi +; AVX1-NEXT: xorl %ecx, %ecx +; AVX1-NEXT: movl %esi, %edx +; AVX1-NEXT: addw %ax, %dx +; AVX1-NEXT: setns %cl +; AVX1-NEXT: addl $32767, %ecx # imm = 0x7FFF +; AVX1-NEXT: addw %ax, %si +; AVX1-NEXT: cmovol %ecx, %esi +; AVX1-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX1-NEXT: vpextrw $4, %xmm3, %eax +; AVX1-NEXT: vpextrw $4, %xmm1, %esi +; AVX1-NEXT: xorl %ecx, %ecx +; AVX1-NEXT: movl %esi, %edx +; AVX1-NEXT: addw %ax, %dx +; AVX1-NEXT: setns %cl +; AVX1-NEXT: addl $32767, %ecx # imm = 0x7FFF +; AVX1-NEXT: addw %ax, %si +; AVX1-NEXT: cmovol %ecx, %esi +; AVX1-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX1-NEXT: vpextrw $3, %xmm3, %eax +; AVX1-NEXT: vpextrw $3, %xmm1, %esi +; AVX1-NEXT: xorl %ecx, %ecx +; AVX1-NEXT: movl %esi, %edx +; AVX1-NEXT: addw %ax, %dx +; AVX1-NEXT: setns %cl +; AVX1-NEXT: addl $32767, %ecx # imm = 0x7FFF +; AVX1-NEXT: addw %ax, %si +; AVX1-NEXT: cmovol %ecx, %esi +; AVX1-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX1-NEXT: vpextrw $2, %xmm3, %eax +; AVX1-NEXT: vpextrw $2, %xmm1, %esi +; AVX1-NEXT: xorl %ecx, %ecx +; AVX1-NEXT: movl %esi, %edx +; AVX1-NEXT: addw %ax, %dx +; AVX1-NEXT: setns %cl +; AVX1-NEXT: addl $32767, %ecx # imm = 0x7FFF +; AVX1-NEXT: addw %ax, %si +; AVX1-NEXT: cmovol %ecx, %esi +; AVX1-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX1-NEXT: vmovd %xmm3, %eax +; AVX1-NEXT: vmovd %xmm1, %esi +; AVX1-NEXT: xorl %ecx, %ecx +; AVX1-NEXT: movl %esi, %edx +; AVX1-NEXT: addw %ax, %dx +; AVX1-NEXT: setns %cl +; AVX1-NEXT: addl $32767, %ecx # imm = 0x7FFF +; AVX1-NEXT: addw %ax, %si +; AVX1-NEXT: cmovol %ecx, %esi +; AVX1-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX1-NEXT: vpextrw $1, %xmm3, %eax +; AVX1-NEXT: vpextrw $1, %xmm1, %esi +; AVX1-NEXT: xorl %ecx, %ecx +; AVX1-NEXT: movl %esi, %edx +; AVX1-NEXT: addw %ax, %dx +; AVX1-NEXT: setns %cl +; AVX1-NEXT: addl $32767, %ecx # imm = 0x7FFF +; AVX1-NEXT: addw %ax, %si +; AVX1-NEXT: cmovol %ecx, %esi +; AVX1-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm3 +; AVX1-NEXT: vpextrw $7, %xmm3, %eax +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 +; AVX1-NEXT: vpextrw $7, %xmm1, %esi +; AVX1-NEXT: xorl %ecx, %ecx +; AVX1-NEXT: movl %esi, %edx +; AVX1-NEXT: addw %ax, %dx +; AVX1-NEXT: setns %cl +; AVX1-NEXT: addl $32767, %ecx # imm = 0x7FFF +; AVX1-NEXT: addw %ax, %si +; AVX1-NEXT: cmovol %ecx, %esi +; AVX1-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX1-NEXT: vpextrw $6, %xmm3, %eax +; AVX1-NEXT: vpextrw $6, %xmm1, %esi +; AVX1-NEXT: xorl %ecx, %ecx +; AVX1-NEXT: movl %esi, %edx +; AVX1-NEXT: addw %ax, %dx +; AVX1-NEXT: setns %cl +; AVX1-NEXT: addl $32767, %ecx # imm = 0x7FFF +; AVX1-NEXT: addw %ax, %si +; AVX1-NEXT: cmovol %ecx, %esi +; AVX1-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX1-NEXT: vpextrw $5, %xmm3, %eax +; AVX1-NEXT: vpextrw $5, %xmm1, %esi +; AVX1-NEXT: xorl %ecx, %ecx +; AVX1-NEXT: movl %esi, %edx +; AVX1-NEXT: addw %ax, %dx +; AVX1-NEXT: setns %cl +; 
AVX1-NEXT: addl $32767, %ecx # imm = 0x7FFF +; AVX1-NEXT: addw %ax, %si +; AVX1-NEXT: cmovol %ecx, %esi +; AVX1-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX1-NEXT: vpextrw $4, %xmm3, %eax +; AVX1-NEXT: vpextrw $4, %xmm1, %esi +; AVX1-NEXT: xorl %ecx, %ecx +; AVX1-NEXT: movl %esi, %edx +; AVX1-NEXT: addw %ax, %dx +; AVX1-NEXT: setns %cl +; AVX1-NEXT: addl $32767, %ecx # imm = 0x7FFF +; AVX1-NEXT: addw %ax, %si +; AVX1-NEXT: cmovol %ecx, %esi +; AVX1-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX1-NEXT: vpextrw $3, %xmm3, %eax +; AVX1-NEXT: vpextrw $3, %xmm1, %esi +; AVX1-NEXT: xorl %ecx, %ecx +; AVX1-NEXT: movl %esi, %edx +; AVX1-NEXT: addw %ax, %dx +; AVX1-NEXT: setns %cl +; AVX1-NEXT: addl $32767, %ecx # imm = 0x7FFF +; AVX1-NEXT: addw %ax, %si +; AVX1-NEXT: cmovol %ecx, %esi +; AVX1-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX1-NEXT: vpextrw $2, %xmm3, %eax +; AVX1-NEXT: vpextrw $2, %xmm1, %esi +; AVX1-NEXT: xorl %ecx, %ecx +; AVX1-NEXT: movl %esi, %edx +; AVX1-NEXT: addw %ax, %dx +; AVX1-NEXT: setns %cl +; AVX1-NEXT: addl $32767, %ecx # imm = 0x7FFF +; AVX1-NEXT: addw %ax, %si +; AVX1-NEXT: cmovol %ecx, %esi +; AVX1-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX1-NEXT: vmovd %xmm3, %eax +; AVX1-NEXT: vmovd %xmm1, %esi +; AVX1-NEXT: xorl %ecx, %ecx +; AVX1-NEXT: movl %esi, %edx +; AVX1-NEXT: addw %ax, %dx +; AVX1-NEXT: setns %cl +; AVX1-NEXT: addl $32767, %ecx # imm = 0x7FFF +; AVX1-NEXT: addw %ax, %si +; AVX1-NEXT: cmovol %ecx, %esi +; AVX1-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX1-NEXT: vpextrw $1, %xmm3, %eax +; AVX1-NEXT: vpextrw $1, %xmm1, %esi +; AVX1-NEXT: xorl %ecx, %ecx +; AVX1-NEXT: movl %esi, %edx +; AVX1-NEXT: addw %ax, %dx +; AVX1-NEXT: setns %cl +; AVX1-NEXT: addl $32767, %ecx # imm = 0x7FFF +; AVX1-NEXT: addw %ax, %si +; AVX1-NEXT: cmovol %ecx, %esi +; AVX1-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX1-NEXT: vpextrw $7, %xmm2, %eax +; AVX1-NEXT: vpextrw $7, %xmm0, %esi +; AVX1-NEXT: xorl %ecx, %ecx +; AVX1-NEXT: movl %esi, %edx +; AVX1-NEXT: addw %ax, %dx +; AVX1-NEXT: setns %cl +; AVX1-NEXT: addl $32767, %ecx # imm = 0x7FFF +; AVX1-NEXT: addw %ax, %si +; AVX1-NEXT: cmovol %ecx, %esi +; AVX1-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX1-NEXT: vpextrw $6, %xmm2, %eax +; AVX1-NEXT: vpextrw $6, %xmm0, %esi +; AVX1-NEXT: xorl %ecx, %ecx +; AVX1-NEXT: movl %esi, %edx +; AVX1-NEXT: addw %ax, %dx +; AVX1-NEXT: setns %cl +; AVX1-NEXT: addl $32767, %ecx # imm = 0x7FFF +; AVX1-NEXT: addw %ax, %si +; AVX1-NEXT: cmovol %ecx, %esi +; AVX1-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX1-NEXT: vpextrw $5, %xmm2, %eax +; AVX1-NEXT: vpextrw $5, %xmm0, %esi +; AVX1-NEXT: xorl %ecx, %ecx +; AVX1-NEXT: movl %esi, %edx +; AVX1-NEXT: addw %ax, %dx +; AVX1-NEXT: setns %cl +; AVX1-NEXT: addl $32767, %ecx # imm = 0x7FFF +; AVX1-NEXT: addw %ax, %si +; AVX1-NEXT: cmovol %ecx, %esi +; AVX1-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX1-NEXT: vpextrw $4, %xmm2, %eax +; AVX1-NEXT: vpextrw $4, %xmm0, %esi +; AVX1-NEXT: xorl %ecx, %ecx +; AVX1-NEXT: movl %esi, %edx +; AVX1-NEXT: addw %ax, %dx +; AVX1-NEXT: setns %cl +; AVX1-NEXT: addl $32767, %ecx # imm = 0x7FFF +; AVX1-NEXT: addw %ax, %si +; AVX1-NEXT: cmovol %ecx, %esi +; AVX1-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX1-NEXT: vpextrw $3, %xmm2, %eax +; AVX1-NEXT: vpextrw $3, %xmm0, %ebx +; AVX1-NEXT: xorl %ecx, %ecx +; AVX1-NEXT: movl %ebx, %edx +; AVX1-NEXT: addw 
%ax, %dx +; AVX1-NEXT: setns %cl +; AVX1-NEXT: addl $32767, %ecx # imm = 0x7FFF +; AVX1-NEXT: addw %ax, %bx +; AVX1-NEXT: cmovol %ecx, %ebx +; AVX1-NEXT: vpextrw $2, %xmm2, %eax +; AVX1-NEXT: vpextrw $2, %xmm0, %r11d +; AVX1-NEXT: xorl %ecx, %ecx +; AVX1-NEXT: movl %r11d, %edx +; AVX1-NEXT: addw %ax, %dx +; AVX1-NEXT: setns %cl +; AVX1-NEXT: addl $32767, %ecx # imm = 0x7FFF +; AVX1-NEXT: addw %ax, %r11w +; AVX1-NEXT: cmovol %ecx, %r11d +; AVX1-NEXT: vmovd %xmm2, %eax +; AVX1-NEXT: vmovd %xmm0, %r10d +; AVX1-NEXT: xorl %ecx, %ecx +; AVX1-NEXT: movl %r10d, %edx +; AVX1-NEXT: addw %ax, %dx +; AVX1-NEXT: setns %cl +; AVX1-NEXT: addl $32767, %ecx # imm = 0x7FFF +; AVX1-NEXT: addw %ax, %r10w +; AVX1-NEXT: cmovol %ecx, %r10d +; AVX1-NEXT: vpextrw $1, %xmm2, %eax +; AVX1-NEXT: vpextrw $1, %xmm0, %r9d +; AVX1-NEXT: xorl %ecx, %ecx +; AVX1-NEXT: movl %r9d, %edx +; AVX1-NEXT: addw %ax, %dx +; AVX1-NEXT: setns %cl +; AVX1-NEXT: addl $32767, %ecx # imm = 0x7FFF +; AVX1-NEXT: addw %ax, %r9w +; AVX1-NEXT: cmovol %ecx, %r9d +; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm1 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX1-NEXT: vpextrw $7, %xmm1, %eax +; AVX1-NEXT: vpextrw $7, %xmm0, %r8d +; AVX1-NEXT: xorl %ecx, %ecx +; AVX1-NEXT: movl %r8d, %edx +; AVX1-NEXT: addw %ax, %dx +; AVX1-NEXT: setns %cl +; AVX1-NEXT: addl $32767, %ecx # imm = 0x7FFF +; AVX1-NEXT: addw %ax, %r8w +; AVX1-NEXT: cmovol %ecx, %r8d +; AVX1-NEXT: vpextrw $6, %xmm1, %eax +; AVX1-NEXT: vpextrw $6, %xmm0, %edi +; AVX1-NEXT: xorl %ecx, %ecx +; AVX1-NEXT: movl %edi, %edx +; AVX1-NEXT: addw %ax, %dx +; AVX1-NEXT: setns %cl +; AVX1-NEXT: addl $32767, %ecx # imm = 0x7FFF +; AVX1-NEXT: addw %ax, %di +; AVX1-NEXT: cmovol %ecx, %edi +; AVX1-NEXT: vpextrw $5, %xmm1, %eax +; AVX1-NEXT: vpextrw $5, %xmm0, %esi +; AVX1-NEXT: xorl %ecx, %ecx +; AVX1-NEXT: movl %esi, %edx +; AVX1-NEXT: addw %ax, %dx +; AVX1-NEXT: setns %cl +; AVX1-NEXT: addl $32767, %ecx # imm = 0x7FFF +; AVX1-NEXT: addw %ax, %si +; AVX1-NEXT: cmovol %ecx, %esi +; AVX1-NEXT: vpextrw $4, %xmm1, %eax +; AVX1-NEXT: vpextrw $4, %xmm0, %edx +; AVX1-NEXT: xorl %ecx, %ecx +; AVX1-NEXT: movl %edx, %r13d +; AVX1-NEXT: addw %ax, %r13w +; AVX1-NEXT: setns %cl +; AVX1-NEXT: addl $32767, %ecx # imm = 0x7FFF +; AVX1-NEXT: addw %ax, %dx +; AVX1-NEXT: cmovol %ecx, %edx +; AVX1-NEXT: vpextrw $3, %xmm1, %eax +; AVX1-NEXT: vpextrw $3, %xmm0, %r13d +; AVX1-NEXT: xorl %ecx, %ecx +; AVX1-NEXT: movl %r13d, %r12d +; AVX1-NEXT: addw %ax, %r12w +; AVX1-NEXT: setns %cl +; AVX1-NEXT: addl $32767, %ecx # imm = 0x7FFF +; AVX1-NEXT: addw %ax, %r13w +; AVX1-NEXT: cmovol %ecx, %r13d +; AVX1-NEXT: vpextrw $2, %xmm1, %r12d +; AVX1-NEXT: vpextrw $2, %xmm0, %eax +; AVX1-NEXT: xorl %ecx, %ecx +; AVX1-NEXT: movl %eax, %r15d +; AVX1-NEXT: addw %r12w, %r15w +; AVX1-NEXT: setns %cl +; AVX1-NEXT: addl $32767, %ecx # imm = 0x7FFF +; AVX1-NEXT: addw %r12w, %ax +; AVX1-NEXT: cmovol %ecx, %eax +; AVX1-NEXT: vmovd %xmm1, %r15d +; AVX1-NEXT: vmovd %xmm0, %r12d +; AVX1-NEXT: xorl %ecx, %ecx +; AVX1-NEXT: movl %r12d, %r14d +; AVX1-NEXT: addw %r15w, %r14w +; AVX1-NEXT: setns %cl +; AVX1-NEXT: addl $32767, %ecx # imm = 0x7FFF +; AVX1-NEXT: addw %r15w, %r12w +; AVX1-NEXT: cmovol %ecx, %r12d +; AVX1-NEXT: vpextrw $1, %xmm1, %r14d +; AVX1-NEXT: vpextrw $1, %xmm0, %ecx +; AVX1-NEXT: xorl %r15d, %r15d +; AVX1-NEXT: movl %ecx, %ebp +; AVX1-NEXT: addw %r14w, %bp +; AVX1-NEXT: setns %r15b +; AVX1-NEXT: addl $32767, %r15d # imm = 0x7FFF +; AVX1-NEXT: addw %r14w, %cx +; AVX1-NEXT: cmovol %r15d, %ecx +; AVX1-NEXT: vmovd %r12d, %xmm0 +; AVX1-NEXT: 
vpinsrw $1, %ecx, %xmm0, %xmm0 +; AVX1-NEXT: vpinsrw $2, %eax, %xmm0, %xmm0 +; AVX1-NEXT: vpinsrw $3, %r13d, %xmm0, %xmm0 +; AVX1-NEXT: vpinsrw $4, %edx, %xmm0, %xmm0 +; AVX1-NEXT: vpinsrw $5, %esi, %xmm0, %xmm0 +; AVX1-NEXT: vpinsrw $6, %edi, %xmm0, %xmm0 +; AVX1-NEXT: vpinsrw $7, %r8d, %xmm0, %xmm0 +; AVX1-NEXT: vmovd %r10d, %xmm1 +; AVX1-NEXT: vpinsrw $1, %r9d, %xmm1, %xmm1 +; AVX1-NEXT: vpinsrw $2, %r11d, %xmm1, %xmm1 +; AVX1-NEXT: vpinsrw $3, %ebx, %xmm1, %xmm1 +; AVX1-NEXT: vpinsrw $4, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 4-byte Folded Reload +; AVX1-NEXT: vpinsrw $5, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 4-byte Folded Reload +; AVX1-NEXT: vpinsrw $6, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 4-byte Folded Reload +; AVX1-NEXT: vpinsrw $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 4-byte Folded Reload +; AVX1-NEXT: vmovd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 4-byte Folded Reload +; AVX1-NEXT: # xmm2 = mem[0],zero,zero,zero +; AVX1-NEXT: vpinsrw $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 4-byte Folded Reload +; AVX1-NEXT: vpinsrw $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 4-byte Folded Reload +; AVX1-NEXT: vpinsrw $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 4-byte Folded Reload +; AVX1-NEXT: vpinsrw $4, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 4-byte Folded Reload +; AVX1-NEXT: vpinsrw $5, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 4-byte Folded Reload +; AVX1-NEXT: vpinsrw $6, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 4-byte Folded Reload +; AVX1-NEXT: vpinsrw $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 4-byte Folded Reload +; AVX1-NEXT: vmovd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 4-byte Folded Reload +; AVX1-NEXT: # xmm3 = mem[0],zero,zero,zero +; AVX1-NEXT: vpinsrw $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 4-byte Folded Reload +; AVX1-NEXT: vpinsrw $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 4-byte Folded Reload +; AVX1-NEXT: vpinsrw $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 4-byte Folded Reload +; AVX1-NEXT: vpinsrw $4, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 4-byte Folded Reload +; AVX1-NEXT: vpinsrw $5, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 4-byte Folded Reload +; AVX1-NEXT: vpinsrw $6, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 4-byte Folded Reload +; AVX1-NEXT: vpinsrw $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 4-byte Folded Reload +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm1 +; AVX1-NEXT: popq %rbx +; AVX1-NEXT: popq %r12 +; AVX1-NEXT: popq %r13 +; AVX1-NEXT: popq %r14 +; AVX1-NEXT: popq %r15 +; AVX1-NEXT: popq %rbp +; AVX1-NEXT: retq +; +; AVX2-LABEL: v32i16: +; AVX2: # %bb.0: +; AVX2-NEXT: pushq %rbp +; AVX2-NEXT: pushq %r15 +; AVX2-NEXT: pushq %r14 +; AVX2-NEXT: pushq %r13 +; AVX2-NEXT: pushq %r12 +; AVX2-NEXT: pushq %rbx +; AVX2-NEXT: vpextrw $7, %xmm3, %eax +; AVX2-NEXT: vpextrw $7, %xmm1, %esi +; AVX2-NEXT: xorl %ecx, %ecx +; AVX2-NEXT: movl %esi, %edx +; AVX2-NEXT: addw %ax, %dx +; AVX2-NEXT: setns %cl +; AVX2-NEXT: addl $32767, %ecx # imm = 0x7FFF +; AVX2-NEXT: addw %ax, %si +; AVX2-NEXT: cmovol %ecx, %esi +; AVX2-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX2-NEXT: vpextrw $6, %xmm3, %eax +; AVX2-NEXT: vpextrw $6, %xmm1, %esi +; AVX2-NEXT: xorl %ecx, %ecx +; AVX2-NEXT: movl %esi, %edx +; AVX2-NEXT: addw %ax, %dx +; AVX2-NEXT: setns %cl +; AVX2-NEXT: addl $32767, %ecx # imm = 0x7FFF +; AVX2-NEXT: addw %ax, %si +; AVX2-NEXT: cmovol %ecx, %esi +; AVX2-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX2-NEXT: vpextrw $5, %xmm3, %eax +; AVX2-NEXT: vpextrw $5, %xmm1, %esi +; 
AVX2-NEXT: xorl %ecx, %ecx +; AVX2-NEXT: movl %esi, %edx +; AVX2-NEXT: addw %ax, %dx +; AVX2-NEXT: setns %cl +; AVX2-NEXT: addl $32767, %ecx # imm = 0x7FFF +; AVX2-NEXT: addw %ax, %si +; AVX2-NEXT: cmovol %ecx, %esi +; AVX2-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX2-NEXT: vpextrw $4, %xmm3, %eax +; AVX2-NEXT: vpextrw $4, %xmm1, %esi +; AVX2-NEXT: xorl %ecx, %ecx +; AVX2-NEXT: movl %esi, %edx +; AVX2-NEXT: addw %ax, %dx +; AVX2-NEXT: setns %cl +; AVX2-NEXT: addl $32767, %ecx # imm = 0x7FFF +; AVX2-NEXT: addw %ax, %si +; AVX2-NEXT: cmovol %ecx, %esi +; AVX2-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX2-NEXT: vpextrw $3, %xmm3, %eax +; AVX2-NEXT: vpextrw $3, %xmm1, %esi +; AVX2-NEXT: xorl %ecx, %ecx +; AVX2-NEXT: movl %esi, %edx +; AVX2-NEXT: addw %ax, %dx +; AVX2-NEXT: setns %cl +; AVX2-NEXT: addl $32767, %ecx # imm = 0x7FFF +; AVX2-NEXT: addw %ax, %si +; AVX2-NEXT: cmovol %ecx, %esi +; AVX2-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX2-NEXT: vpextrw $2, %xmm3, %eax +; AVX2-NEXT: vpextrw $2, %xmm1, %esi +; AVX2-NEXT: xorl %ecx, %ecx +; AVX2-NEXT: movl %esi, %edx +; AVX2-NEXT: addw %ax, %dx +; AVX2-NEXT: setns %cl +; AVX2-NEXT: addl $32767, %ecx # imm = 0x7FFF +; AVX2-NEXT: addw %ax, %si +; AVX2-NEXT: cmovol %ecx, %esi +; AVX2-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX2-NEXT: vmovd %xmm3, %eax +; AVX2-NEXT: vmovd %xmm1, %esi +; AVX2-NEXT: xorl %ecx, %ecx +; AVX2-NEXT: movl %esi, %edx +; AVX2-NEXT: addw %ax, %dx +; AVX2-NEXT: setns %cl +; AVX2-NEXT: addl $32767, %ecx # imm = 0x7FFF +; AVX2-NEXT: addw %ax, %si +; AVX2-NEXT: cmovol %ecx, %esi +; AVX2-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX2-NEXT: vpextrw $1, %xmm3, %eax +; AVX2-NEXT: vpextrw $1, %xmm1, %esi +; AVX2-NEXT: xorl %ecx, %ecx +; AVX2-NEXT: movl %esi, %edx +; AVX2-NEXT: addw %ax, %dx +; AVX2-NEXT: setns %cl +; AVX2-NEXT: addl $32767, %ecx # imm = 0x7FFF +; AVX2-NEXT: addw %ax, %si +; AVX2-NEXT: cmovol %ecx, %esi +; AVX2-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm3 +; AVX2-NEXT: vpextrw $7, %xmm3, %eax +; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm1 +; AVX2-NEXT: vpextrw $7, %xmm1, %esi +; AVX2-NEXT: xorl %ecx, %ecx +; AVX2-NEXT: movl %esi, %edx +; AVX2-NEXT: addw %ax, %dx +; AVX2-NEXT: setns %cl +; AVX2-NEXT: addl $32767, %ecx # imm = 0x7FFF +; AVX2-NEXT: addw %ax, %si +; AVX2-NEXT: cmovol %ecx, %esi +; AVX2-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX2-NEXT: vpextrw $6, %xmm3, %eax +; AVX2-NEXT: vpextrw $6, %xmm1, %esi +; AVX2-NEXT: xorl %ecx, %ecx +; AVX2-NEXT: movl %esi, %edx +; AVX2-NEXT: addw %ax, %dx +; AVX2-NEXT: setns %cl +; AVX2-NEXT: addl $32767, %ecx # imm = 0x7FFF +; AVX2-NEXT: addw %ax, %si +; AVX2-NEXT: cmovol %ecx, %esi +; AVX2-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX2-NEXT: vpextrw $5, %xmm3, %eax +; AVX2-NEXT: vpextrw $5, %xmm1, %esi +; AVX2-NEXT: xorl %ecx, %ecx +; AVX2-NEXT: movl %esi, %edx +; AVX2-NEXT: addw %ax, %dx +; AVX2-NEXT: setns %cl +; AVX2-NEXT: addl $32767, %ecx # imm = 0x7FFF +; AVX2-NEXT: addw %ax, %si +; AVX2-NEXT: cmovol %ecx, %esi +; AVX2-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX2-NEXT: vpextrw $4, %xmm3, %eax +; AVX2-NEXT: vpextrw $4, %xmm1, %esi +; AVX2-NEXT: xorl %ecx, %ecx +; AVX2-NEXT: movl %esi, %edx +; AVX2-NEXT: addw %ax, %dx +; AVX2-NEXT: setns %cl +; AVX2-NEXT: addl $32767, %ecx # imm = 0x7FFF +; AVX2-NEXT: addw %ax, %si +; AVX2-NEXT: cmovol %ecx, %esi +; AVX2-NEXT: movl 
%esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX2-NEXT: vpextrw $3, %xmm3, %eax +; AVX2-NEXT: vpextrw $3, %xmm1, %esi +; AVX2-NEXT: xorl %ecx, %ecx +; AVX2-NEXT: movl %esi, %edx +; AVX2-NEXT: addw %ax, %dx +; AVX2-NEXT: setns %cl +; AVX2-NEXT: addl $32767, %ecx # imm = 0x7FFF +; AVX2-NEXT: addw %ax, %si +; AVX2-NEXT: cmovol %ecx, %esi +; AVX2-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX2-NEXT: vpextrw $2, %xmm3, %eax +; AVX2-NEXT: vpextrw $2, %xmm1, %esi +; AVX2-NEXT: xorl %ecx, %ecx +; AVX2-NEXT: movl %esi, %edx +; AVX2-NEXT: addw %ax, %dx +; AVX2-NEXT: setns %cl +; AVX2-NEXT: addl $32767, %ecx # imm = 0x7FFF +; AVX2-NEXT: addw %ax, %si +; AVX2-NEXT: cmovol %ecx, %esi +; AVX2-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX2-NEXT: vmovd %xmm3, %eax +; AVX2-NEXT: vmovd %xmm1, %esi +; AVX2-NEXT: xorl %ecx, %ecx +; AVX2-NEXT: movl %esi, %edx +; AVX2-NEXT: addw %ax, %dx +; AVX2-NEXT: setns %cl +; AVX2-NEXT: addl $32767, %ecx # imm = 0x7FFF +; AVX2-NEXT: addw %ax, %si +; AVX2-NEXT: cmovol %ecx, %esi +; AVX2-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX2-NEXT: vpextrw $1, %xmm3, %eax +; AVX2-NEXT: vpextrw $1, %xmm1, %esi +; AVX2-NEXT: xorl %ecx, %ecx +; AVX2-NEXT: movl %esi, %edx +; AVX2-NEXT: addw %ax, %dx +; AVX2-NEXT: setns %cl +; AVX2-NEXT: addl $32767, %ecx # imm = 0x7FFF +; AVX2-NEXT: addw %ax, %si +; AVX2-NEXT: cmovol %ecx, %esi +; AVX2-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX2-NEXT: vpextrw $7, %xmm2, %eax +; AVX2-NEXT: vpextrw $7, %xmm0, %esi +; AVX2-NEXT: xorl %ecx, %ecx +; AVX2-NEXT: movl %esi, %edx +; AVX2-NEXT: addw %ax, %dx +; AVX2-NEXT: setns %cl +; AVX2-NEXT: addl $32767, %ecx # imm = 0x7FFF +; AVX2-NEXT: addw %ax, %si +; AVX2-NEXT: cmovol %ecx, %esi +; AVX2-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX2-NEXT: vpextrw $6, %xmm2, %eax +; AVX2-NEXT: vpextrw $6, %xmm0, %esi +; AVX2-NEXT: xorl %ecx, %ecx +; AVX2-NEXT: movl %esi, %edx +; AVX2-NEXT: addw %ax, %dx +; AVX2-NEXT: setns %cl +; AVX2-NEXT: addl $32767, %ecx # imm = 0x7FFF +; AVX2-NEXT: addw %ax, %si +; AVX2-NEXT: cmovol %ecx, %esi +; AVX2-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX2-NEXT: vpextrw $5, %xmm2, %eax +; AVX2-NEXT: vpextrw $5, %xmm0, %esi +; AVX2-NEXT: xorl %ecx, %ecx +; AVX2-NEXT: movl %esi, %edx +; AVX2-NEXT: addw %ax, %dx +; AVX2-NEXT: setns %cl +; AVX2-NEXT: addl $32767, %ecx # imm = 0x7FFF +; AVX2-NEXT: addw %ax, %si +; AVX2-NEXT: cmovol %ecx, %esi +; AVX2-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX2-NEXT: vpextrw $4, %xmm2, %eax +; AVX2-NEXT: vpextrw $4, %xmm0, %esi +; AVX2-NEXT: xorl %ecx, %ecx +; AVX2-NEXT: movl %esi, %edx +; AVX2-NEXT: addw %ax, %dx +; AVX2-NEXT: setns %cl +; AVX2-NEXT: addl $32767, %ecx # imm = 0x7FFF +; AVX2-NEXT: addw %ax, %si +; AVX2-NEXT: cmovol %ecx, %esi +; AVX2-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX2-NEXT: vpextrw $3, %xmm2, %eax +; AVX2-NEXT: vpextrw $3, %xmm0, %ebx +; AVX2-NEXT: xorl %ecx, %ecx +; AVX2-NEXT: movl %ebx, %edx +; AVX2-NEXT: addw %ax, %dx +; AVX2-NEXT: setns %cl +; AVX2-NEXT: addl $32767, %ecx # imm = 0x7FFF +; AVX2-NEXT: addw %ax, %bx +; AVX2-NEXT: cmovol %ecx, %ebx +; AVX2-NEXT: vpextrw $2, %xmm2, %eax +; AVX2-NEXT: vpextrw $2, %xmm0, %r11d +; AVX2-NEXT: xorl %ecx, %ecx +; AVX2-NEXT: movl %r11d, %edx +; AVX2-NEXT: addw %ax, %dx +; AVX2-NEXT: setns %cl +; AVX2-NEXT: addl $32767, %ecx # imm = 0x7FFF +; AVX2-NEXT: addw %ax, %r11w +; AVX2-NEXT: cmovol %ecx, %r11d +; AVX2-NEXT: vmovd %xmm2, %eax +; AVX2-NEXT: 
vmovd %xmm0, %r10d +; AVX2-NEXT: xorl %ecx, %ecx +; AVX2-NEXT: movl %r10d, %edx +; AVX2-NEXT: addw %ax, %dx +; AVX2-NEXT: setns %cl +; AVX2-NEXT: addl $32767, %ecx # imm = 0x7FFF +; AVX2-NEXT: addw %ax, %r10w +; AVX2-NEXT: cmovol %ecx, %r10d +; AVX2-NEXT: vpextrw $1, %xmm2, %eax +; AVX2-NEXT: vpextrw $1, %xmm0, %r9d +; AVX2-NEXT: xorl %ecx, %ecx +; AVX2-NEXT: movl %r9d, %edx +; AVX2-NEXT: addw %ax, %dx +; AVX2-NEXT: setns %cl +; AVX2-NEXT: addl $32767, %ecx # imm = 0x7FFF +; AVX2-NEXT: addw %ax, %r9w +; AVX2-NEXT: cmovol %ecx, %r9d +; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm1 +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX2-NEXT: vpextrw $7, %xmm1, %eax +; AVX2-NEXT: vpextrw $7, %xmm0, %r8d +; AVX2-NEXT: xorl %ecx, %ecx +; AVX2-NEXT: movl %r8d, %edx +; AVX2-NEXT: addw %ax, %dx +; AVX2-NEXT: setns %cl +; AVX2-NEXT: addl $32767, %ecx # imm = 0x7FFF +; AVX2-NEXT: addw %ax, %r8w +; AVX2-NEXT: cmovol %ecx, %r8d +; AVX2-NEXT: vpextrw $6, %xmm1, %eax +; AVX2-NEXT: vpextrw $6, %xmm0, %edi +; AVX2-NEXT: xorl %ecx, %ecx +; AVX2-NEXT: movl %edi, %edx +; AVX2-NEXT: addw %ax, %dx +; AVX2-NEXT: setns %cl +; AVX2-NEXT: addl $32767, %ecx # imm = 0x7FFF +; AVX2-NEXT: addw %ax, %di +; AVX2-NEXT: cmovol %ecx, %edi +; AVX2-NEXT: vpextrw $5, %xmm1, %eax +; AVX2-NEXT: vpextrw $5, %xmm0, %esi +; AVX2-NEXT: xorl %ecx, %ecx +; AVX2-NEXT: movl %esi, %edx +; AVX2-NEXT: addw %ax, %dx +; AVX2-NEXT: setns %cl +; AVX2-NEXT: addl $32767, %ecx # imm = 0x7FFF +; AVX2-NEXT: addw %ax, %si +; AVX2-NEXT: cmovol %ecx, %esi +; AVX2-NEXT: vpextrw $4, %xmm1, %eax +; AVX2-NEXT: vpextrw $4, %xmm0, %edx +; AVX2-NEXT: xorl %ecx, %ecx +; AVX2-NEXT: movl %edx, %r13d +; AVX2-NEXT: addw %ax, %r13w +; AVX2-NEXT: setns %cl +; AVX2-NEXT: addl $32767, %ecx # imm = 0x7FFF +; AVX2-NEXT: addw %ax, %dx +; AVX2-NEXT: cmovol %ecx, %edx +; AVX2-NEXT: vpextrw $3, %xmm1, %eax +; AVX2-NEXT: vpextrw $3, %xmm0, %r13d +; AVX2-NEXT: xorl %ecx, %ecx +; AVX2-NEXT: movl %r13d, %r12d +; AVX2-NEXT: addw %ax, %r12w +; AVX2-NEXT: setns %cl +; AVX2-NEXT: addl $32767, %ecx # imm = 0x7FFF +; AVX2-NEXT: addw %ax, %r13w +; AVX2-NEXT: cmovol %ecx, %r13d +; AVX2-NEXT: vpextrw $2, %xmm1, %r12d +; AVX2-NEXT: vpextrw $2, %xmm0, %eax +; AVX2-NEXT: xorl %ecx, %ecx +; AVX2-NEXT: movl %eax, %r15d +; AVX2-NEXT: addw %r12w, %r15w +; AVX2-NEXT: setns %cl +; AVX2-NEXT: addl $32767, %ecx # imm = 0x7FFF +; AVX2-NEXT: addw %r12w, %ax +; AVX2-NEXT: cmovol %ecx, %eax +; AVX2-NEXT: vmovd %xmm1, %r15d +; AVX2-NEXT: vmovd %xmm0, %r12d +; AVX2-NEXT: xorl %ecx, %ecx +; AVX2-NEXT: movl %r12d, %r14d +; AVX2-NEXT: addw %r15w, %r14w +; AVX2-NEXT: setns %cl +; AVX2-NEXT: addl $32767, %ecx # imm = 0x7FFF +; AVX2-NEXT: addw %r15w, %r12w +; AVX2-NEXT: cmovol %ecx, %r12d +; AVX2-NEXT: vpextrw $1, %xmm1, %r14d +; AVX2-NEXT: vpextrw $1, %xmm0, %ecx +; AVX2-NEXT: xorl %r15d, %r15d +; AVX2-NEXT: movl %ecx, %ebp +; AVX2-NEXT: addw %r14w, %bp +; AVX2-NEXT: setns %r15b +; AVX2-NEXT: addl $32767, %r15d # imm = 0x7FFF +; AVX2-NEXT: addw %r14w, %cx +; AVX2-NEXT: cmovol %r15d, %ecx +; AVX2-NEXT: vmovd %r12d, %xmm0 +; AVX2-NEXT: vpinsrw $1, %ecx, %xmm0, %xmm0 +; AVX2-NEXT: vpinsrw $2, %eax, %xmm0, %xmm0 +; AVX2-NEXT: vpinsrw $3, %r13d, %xmm0, %xmm0 +; AVX2-NEXT: vpinsrw $4, %edx, %xmm0, %xmm0 +; AVX2-NEXT: vpinsrw $5, %esi, %xmm0, %xmm0 +; AVX2-NEXT: vpinsrw $6, %edi, %xmm0, %xmm0 +; AVX2-NEXT: vpinsrw $7, %r8d, %xmm0, %xmm0 +; AVX2-NEXT: vmovd %r10d, %xmm1 +; AVX2-NEXT: vpinsrw $1, %r9d, %xmm1, %xmm1 +; AVX2-NEXT: vpinsrw $2, %r11d, %xmm1, %xmm1 +; AVX2-NEXT: vpinsrw $3, %ebx, %xmm1, %xmm1 +; AVX2-NEXT: 
vpinsrw $4, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 4-byte Folded Reload +; AVX2-NEXT: vpinsrw $5, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 4-byte Folded Reload +; AVX2-NEXT: vpinsrw $6, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 4-byte Folded Reload +; AVX2-NEXT: vpinsrw $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 4-byte Folded Reload +; AVX2-NEXT: vmovd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 4-byte Folded Reload +; AVX2-NEXT: # xmm2 = mem[0],zero,zero,zero +; AVX2-NEXT: vpinsrw $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 4-byte Folded Reload +; AVX2-NEXT: vpinsrw $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 4-byte Folded Reload +; AVX2-NEXT: vpinsrw $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 4-byte Folded Reload +; AVX2-NEXT: vpinsrw $4, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 4-byte Folded Reload +; AVX2-NEXT: vpinsrw $5, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 4-byte Folded Reload +; AVX2-NEXT: vpinsrw $6, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 4-byte Folded Reload +; AVX2-NEXT: vpinsrw $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 4-byte Folded Reload +; AVX2-NEXT: vmovd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 4-byte Folded Reload +; AVX2-NEXT: # xmm3 = mem[0],zero,zero,zero +; AVX2-NEXT: vpinsrw $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 4-byte Folded Reload +; AVX2-NEXT: vpinsrw $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 4-byte Folded Reload +; AVX2-NEXT: vpinsrw $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 4-byte Folded Reload +; AVX2-NEXT: vpinsrw $4, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 4-byte Folded Reload +; AVX2-NEXT: vpinsrw $5, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 4-byte Folded Reload +; AVX2-NEXT: vpinsrw $6, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 4-byte Folded Reload +; AVX2-NEXT: vpinsrw $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 4-byte Folded Reload +; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 +; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm3, %ymm1 +; AVX2-NEXT: popq %rbx +; AVX2-NEXT: popq %r12 +; AVX2-NEXT: popq %r13 +; AVX2-NEXT: popq %r14 +; AVX2-NEXT: popq %r15 +; AVX2-NEXT: popq %rbp +; AVX2-NEXT: retq +; +; AVX512-LABEL: v32i16: +; AVX512: # %bb.0: +; AVX512-NEXT: pushq %rbp +; AVX512-NEXT: pushq %r15 +; AVX512-NEXT: pushq %r14 +; AVX512-NEXT: pushq %r13 +; AVX512-NEXT: pushq %r12 +; AVX512-NEXT: pushq %rbx +; AVX512-NEXT: vpextrw $7, %xmm1, %eax +; AVX512-NEXT: vpextrw $7, %xmm0, %esi +; AVX512-NEXT: xorl %ecx, %ecx +; AVX512-NEXT: movl %esi, %edx +; AVX512-NEXT: addw %ax, %dx +; AVX512-NEXT: setns %cl +; AVX512-NEXT: addl $32767, %ecx # imm = 0x7FFF +; AVX512-NEXT: addw %ax, %si +; AVX512-NEXT: cmovol %ecx, %esi +; AVX512-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX512-NEXT: vpextrw $6, %xmm1, %eax +; AVX512-NEXT: vpextrw $6, %xmm0, %esi +; AVX512-NEXT: xorl %ecx, %ecx +; AVX512-NEXT: movl %esi, %edx +; AVX512-NEXT: addw %ax, %dx +; AVX512-NEXT: setns %cl +; AVX512-NEXT: addl $32767, %ecx # imm = 0x7FFF +; AVX512-NEXT: addw %ax, %si +; AVX512-NEXT: cmovol %ecx, %esi +; AVX512-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX512-NEXT: vpextrw $5, %xmm1, %eax +; AVX512-NEXT: vpextrw $5, %xmm0, %esi +; AVX512-NEXT: xorl %ecx, %ecx +; AVX512-NEXT: movl %esi, %edx +; AVX512-NEXT: addw %ax, %dx +; AVX512-NEXT: setns %cl +; AVX512-NEXT: addl $32767, %ecx # imm = 0x7FFF +; AVX512-NEXT: addw %ax, %si +; AVX512-NEXT: cmovol %ecx, %esi +; AVX512-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX512-NEXT: vpextrw $4, %xmm1, %eax +; AVX512-NEXT: vpextrw $4, %xmm0, %esi +; AVX512-NEXT: xorl %ecx, %ecx +; AVX512-NEXT: 
movl %esi, %edx +; AVX512-NEXT: addw %ax, %dx +; AVX512-NEXT: setns %cl +; AVX512-NEXT: addl $32767, %ecx # imm = 0x7FFF +; AVX512-NEXT: addw %ax, %si +; AVX512-NEXT: cmovol %ecx, %esi +; AVX512-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX512-NEXT: vpextrw $3, %xmm1, %eax +; AVX512-NEXT: vpextrw $3, %xmm0, %esi +; AVX512-NEXT: xorl %ecx, %ecx +; AVX512-NEXT: movl %esi, %edx +; AVX512-NEXT: addw %ax, %dx +; AVX512-NEXT: setns %cl +; AVX512-NEXT: addl $32767, %ecx # imm = 0x7FFF +; AVX512-NEXT: addw %ax, %si +; AVX512-NEXT: cmovol %ecx, %esi +; AVX512-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX512-NEXT: vpextrw $2, %xmm1, %eax +; AVX512-NEXT: vpextrw $2, %xmm0, %esi +; AVX512-NEXT: xorl %ecx, %ecx +; AVX512-NEXT: movl %esi, %edx +; AVX512-NEXT: addw %ax, %dx +; AVX512-NEXT: setns %cl +; AVX512-NEXT: addl $32767, %ecx # imm = 0x7FFF +; AVX512-NEXT: addw %ax, %si +; AVX512-NEXT: cmovol %ecx, %esi +; AVX512-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX512-NEXT: vmovd %xmm1, %eax +; AVX512-NEXT: vmovd %xmm0, %esi +; AVX512-NEXT: xorl %ecx, %ecx +; AVX512-NEXT: movl %esi, %edx +; AVX512-NEXT: addw %ax, %dx +; AVX512-NEXT: setns %cl +; AVX512-NEXT: addl $32767, %ecx # imm = 0x7FFF +; AVX512-NEXT: addw %ax, %si +; AVX512-NEXT: cmovol %ecx, %esi +; AVX512-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX512-NEXT: vpextrw $1, %xmm1, %eax +; AVX512-NEXT: vpextrw $1, %xmm0, %esi +; AVX512-NEXT: xorl %ecx, %ecx +; AVX512-NEXT: movl %esi, %edx +; AVX512-NEXT: addw %ax, %dx +; AVX512-NEXT: setns %cl +; AVX512-NEXT: addl $32767, %ecx # imm = 0x7FFF +; AVX512-NEXT: addw %ax, %si +; AVX512-NEXT: cmovol %ecx, %esi +; AVX512-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX512-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX512-NEXT: vpextrw $7, %xmm2, %eax +; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm3 +; AVX512-NEXT: vpextrw $7, %xmm3, %esi +; AVX512-NEXT: xorl %ecx, %ecx +; AVX512-NEXT: movl %esi, %edx +; AVX512-NEXT: addw %ax, %dx +; AVX512-NEXT: setns %cl +; AVX512-NEXT: addl $32767, %ecx # imm = 0x7FFF +; AVX512-NEXT: addw %ax, %si +; AVX512-NEXT: cmovol %ecx, %esi +; AVX512-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX512-NEXT: vpextrw $6, %xmm2, %eax +; AVX512-NEXT: vpextrw $6, %xmm3, %esi +; AVX512-NEXT: xorl %ecx, %ecx +; AVX512-NEXT: movl %esi, %edx +; AVX512-NEXT: addw %ax, %dx +; AVX512-NEXT: setns %cl +; AVX512-NEXT: addl $32767, %ecx # imm = 0x7FFF +; AVX512-NEXT: addw %ax, %si +; AVX512-NEXT: cmovol %ecx, %esi +; AVX512-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX512-NEXT: vpextrw $5, %xmm2, %eax +; AVX512-NEXT: vpextrw $5, %xmm3, %esi +; AVX512-NEXT: xorl %ecx, %ecx +; AVX512-NEXT: movl %esi, %edx +; AVX512-NEXT: addw %ax, %dx +; AVX512-NEXT: setns %cl +; AVX512-NEXT: addl $32767, %ecx # imm = 0x7FFF +; AVX512-NEXT: addw %ax, %si +; AVX512-NEXT: cmovol %ecx, %esi +; AVX512-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX512-NEXT: vpextrw $4, %xmm2, %eax +; AVX512-NEXT: vpextrw $4, %xmm3, %esi +; AVX512-NEXT: xorl %ecx, %ecx +; AVX512-NEXT: movl %esi, %edx +; AVX512-NEXT: addw %ax, %dx +; AVX512-NEXT: setns %cl +; AVX512-NEXT: addl $32767, %ecx # imm = 0x7FFF +; AVX512-NEXT: addw %ax, %si +; AVX512-NEXT: cmovol %ecx, %esi +; AVX512-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX512-NEXT: vpextrw $3, %xmm2, %eax +; AVX512-NEXT: vpextrw $3, %xmm3, %esi +; AVX512-NEXT: xorl %ecx, %ecx +; AVX512-NEXT: movl %esi, %edx +; AVX512-NEXT: addw %ax, %dx +; 
AVX512-NEXT: setns %cl +; AVX512-NEXT: addl $32767, %ecx # imm = 0x7FFF +; AVX512-NEXT: addw %ax, %si +; AVX512-NEXT: cmovol %ecx, %esi +; AVX512-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX512-NEXT: vpextrw $2, %xmm2, %eax +; AVX512-NEXT: vpextrw $2, %xmm3, %esi +; AVX512-NEXT: xorl %ecx, %ecx +; AVX512-NEXT: movl %esi, %edx +; AVX512-NEXT: addw %ax, %dx +; AVX512-NEXT: setns %cl +; AVX512-NEXT: addl $32767, %ecx # imm = 0x7FFF +; AVX512-NEXT: addw %ax, %si +; AVX512-NEXT: cmovol %ecx, %esi +; AVX512-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX512-NEXT: vmovd %xmm2, %eax +; AVX512-NEXT: vmovd %xmm3, %esi +; AVX512-NEXT: xorl %ecx, %ecx +; AVX512-NEXT: movl %esi, %edx +; AVX512-NEXT: addw %ax, %dx +; AVX512-NEXT: setns %cl +; AVX512-NEXT: addl $32767, %ecx # imm = 0x7FFF +; AVX512-NEXT: addw %ax, %si +; AVX512-NEXT: cmovol %ecx, %esi +; AVX512-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX512-NEXT: vpextrw $1, %xmm2, %eax +; AVX512-NEXT: vpextrw $1, %xmm3, %esi +; AVX512-NEXT: xorl %ecx, %ecx +; AVX512-NEXT: movl %esi, %edx +; AVX512-NEXT: addw %ax, %dx +; AVX512-NEXT: setns %cl +; AVX512-NEXT: addl $32767, %ecx # imm = 0x7FFF +; AVX512-NEXT: addw %ax, %si +; AVX512-NEXT: cmovol %ecx, %esi +; AVX512-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX512-NEXT: vextracti32x4 $2, %zmm1, %xmm2 +; AVX512-NEXT: vextracti32x4 $2, %zmm0, %xmm3 +; AVX512-NEXT: vpextrw $7, %xmm2, %eax +; AVX512-NEXT: vpextrw $7, %xmm3, %esi +; AVX512-NEXT: xorl %ecx, %ecx +; AVX512-NEXT: movl %esi, %edx +; AVX512-NEXT: addw %ax, %dx +; AVX512-NEXT: setns %cl +; AVX512-NEXT: addl $32767, %ecx # imm = 0x7FFF +; AVX512-NEXT: addw %ax, %si +; AVX512-NEXT: cmovol %ecx, %esi +; AVX512-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX512-NEXT: vpextrw $6, %xmm2, %eax +; AVX512-NEXT: vpextrw $6, %xmm3, %esi +; AVX512-NEXT: xorl %ecx, %ecx +; AVX512-NEXT: movl %esi, %edx +; AVX512-NEXT: addw %ax, %dx +; AVX512-NEXT: setns %cl +; AVX512-NEXT: addl $32767, %ecx # imm = 0x7FFF +; AVX512-NEXT: addw %ax, %si +; AVX512-NEXT: cmovol %ecx, %esi +; AVX512-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX512-NEXT: vpextrw $5, %xmm2, %eax +; AVX512-NEXT: vpextrw $5, %xmm3, %esi +; AVX512-NEXT: xorl %ecx, %ecx +; AVX512-NEXT: movl %esi, %edx +; AVX512-NEXT: addw %ax, %dx +; AVX512-NEXT: setns %cl +; AVX512-NEXT: addl $32767, %ecx # imm = 0x7FFF +; AVX512-NEXT: addw %ax, %si +; AVX512-NEXT: cmovol %ecx, %esi +; AVX512-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX512-NEXT: vpextrw $4, %xmm2, %eax +; AVX512-NEXT: vpextrw $4, %xmm3, %esi +; AVX512-NEXT: xorl %ecx, %ecx +; AVX512-NEXT: movl %esi, %edx +; AVX512-NEXT: addw %ax, %dx +; AVX512-NEXT: setns %cl +; AVX512-NEXT: addl $32767, %ecx # imm = 0x7FFF +; AVX512-NEXT: addw %ax, %si +; AVX512-NEXT: cmovol %ecx, %esi +; AVX512-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX512-NEXT: vpextrw $3, %xmm2, %eax +; AVX512-NEXT: vpextrw $3, %xmm3, %ebx +; AVX512-NEXT: xorl %ecx, %ecx +; AVX512-NEXT: movl %ebx, %edx +; AVX512-NEXT: addw %ax, %dx +; AVX512-NEXT: setns %cl +; AVX512-NEXT: addl $32767, %ecx # imm = 0x7FFF +; AVX512-NEXT: addw %ax, %bx +; AVX512-NEXT: cmovol %ecx, %ebx +; AVX512-NEXT: vpextrw $2, %xmm2, %eax +; AVX512-NEXT: vpextrw $2, %xmm3, %r11d +; AVX512-NEXT: xorl %ecx, %ecx +; AVX512-NEXT: movl %r11d, %edx +; AVX512-NEXT: addw %ax, %dx +; AVX512-NEXT: setns %cl +; AVX512-NEXT: addl $32767, %ecx # imm = 0x7FFF +; AVX512-NEXT: addw %ax, %r11w +; 
AVX512-NEXT: cmovol %ecx, %r11d +; AVX512-NEXT: vmovd %xmm2, %eax +; AVX512-NEXT: vmovd %xmm3, %r10d +; AVX512-NEXT: xorl %ecx, %ecx +; AVX512-NEXT: movl %r10d, %edx +; AVX512-NEXT: addw %ax, %dx +; AVX512-NEXT: setns %cl +; AVX512-NEXT: addl $32767, %ecx # imm = 0x7FFF +; AVX512-NEXT: addw %ax, %r10w +; AVX512-NEXT: cmovol %ecx, %r10d +; AVX512-NEXT: vpextrw $1, %xmm2, %eax +; AVX512-NEXT: vpextrw $1, %xmm3, %r9d +; AVX512-NEXT: xorl %ecx, %ecx +; AVX512-NEXT: movl %r9d, %edx +; AVX512-NEXT: addw %ax, %dx +; AVX512-NEXT: setns %cl +; AVX512-NEXT: addl $32767, %ecx # imm = 0x7FFF +; AVX512-NEXT: addw %ax, %r9w +; AVX512-NEXT: cmovol %ecx, %r9d +; AVX512-NEXT: vextracti32x4 $3, %zmm1, %xmm1 +; AVX512-NEXT: vextracti32x4 $3, %zmm0, %xmm0 +; AVX512-NEXT: vpextrw $7, %xmm1, %eax +; AVX512-NEXT: vpextrw $7, %xmm0, %r8d +; AVX512-NEXT: xorl %ecx, %ecx +; AVX512-NEXT: movl %r8d, %edx +; AVX512-NEXT: addw %ax, %dx +; AVX512-NEXT: setns %cl +; AVX512-NEXT: addl $32767, %ecx # imm = 0x7FFF +; AVX512-NEXT: addw %ax, %r8w +; AVX512-NEXT: cmovol %ecx, %r8d +; AVX512-NEXT: vpextrw $6, %xmm1, %eax +; AVX512-NEXT: vpextrw $6, %xmm0, %edi +; AVX512-NEXT: xorl %ecx, %ecx +; AVX512-NEXT: movl %edi, %edx +; AVX512-NEXT: addw %ax, %dx +; AVX512-NEXT: setns %cl +; AVX512-NEXT: addl $32767, %ecx # imm = 0x7FFF +; AVX512-NEXT: addw %ax, %di +; AVX512-NEXT: cmovol %ecx, %edi +; AVX512-NEXT: vpextrw $5, %xmm1, %eax +; AVX512-NEXT: vpextrw $5, %xmm0, %esi +; AVX512-NEXT: xorl %ecx, %ecx +; AVX512-NEXT: movl %esi, %edx +; AVX512-NEXT: addw %ax, %dx +; AVX512-NEXT: setns %cl +; AVX512-NEXT: addl $32767, %ecx # imm = 0x7FFF +; AVX512-NEXT: addw %ax, %si +; AVX512-NEXT: cmovol %ecx, %esi +; AVX512-NEXT: vpextrw $4, %xmm1, %eax +; AVX512-NEXT: vpextrw $4, %xmm0, %edx +; AVX512-NEXT: xorl %ecx, %ecx +; AVX512-NEXT: movl %edx, %r13d +; AVX512-NEXT: addw %ax, %r13w +; AVX512-NEXT: setns %cl +; AVX512-NEXT: addl $32767, %ecx # imm = 0x7FFF +; AVX512-NEXT: addw %ax, %dx +; AVX512-NEXT: cmovol %ecx, %edx +; AVX512-NEXT: vpextrw $3, %xmm1, %eax +; AVX512-NEXT: vpextrw $3, %xmm0, %r13d +; AVX512-NEXT: xorl %ecx, %ecx +; AVX512-NEXT: movl %r13d, %r12d +; AVX512-NEXT: addw %ax, %r12w +; AVX512-NEXT: setns %cl +; AVX512-NEXT: addl $32767, %ecx # imm = 0x7FFF +; AVX512-NEXT: addw %ax, %r13w +; AVX512-NEXT: cmovol %ecx, %r13d +; AVX512-NEXT: vpextrw $2, %xmm1, %r12d +; AVX512-NEXT: vpextrw $2, %xmm0, %eax +; AVX512-NEXT: xorl %ecx, %ecx +; AVX512-NEXT: movl %eax, %r15d +; AVX512-NEXT: addw %r12w, %r15w +; AVX512-NEXT: setns %cl +; AVX512-NEXT: addl $32767, %ecx # imm = 0x7FFF +; AVX512-NEXT: addw %r12w, %ax +; AVX512-NEXT: cmovol %ecx, %eax +; AVX512-NEXT: vmovd %xmm1, %r15d +; AVX512-NEXT: vmovd %xmm0, %r12d +; AVX512-NEXT: xorl %ecx, %ecx +; AVX512-NEXT: movl %r12d, %r14d +; AVX512-NEXT: addw %r15w, %r14w +; AVX512-NEXT: setns %cl +; AVX512-NEXT: addl $32767, %ecx # imm = 0x7FFF +; AVX512-NEXT: addw %r15w, %r12w +; AVX512-NEXT: cmovol %ecx, %r12d +; AVX512-NEXT: vpextrw $1, %xmm1, %r14d +; AVX512-NEXT: vpextrw $1, %xmm0, %ecx +; AVX512-NEXT: xorl %r15d, %r15d +; AVX512-NEXT: movl %ecx, %ebp +; AVX512-NEXT: addw %r14w, %bp +; AVX512-NEXT: setns %r15b +; AVX512-NEXT: addl $32767, %r15d # imm = 0x7FFF +; AVX512-NEXT: addw %r14w, %cx +; AVX512-NEXT: cmovol %r15d, %ecx +; AVX512-NEXT: vmovd %r12d, %xmm0 +; AVX512-NEXT: vpinsrw $1, %ecx, %xmm0, %xmm0 +; AVX512-NEXT: vpinsrw $2, %eax, %xmm0, %xmm0 +; AVX512-NEXT: vpinsrw $3, %r13d, %xmm0, %xmm0 +; AVX512-NEXT: vpinsrw $4, %edx, %xmm0, %xmm0 +; AVX512-NEXT: vpinsrw $5, %esi, %xmm0, %xmm0 
+; AVX512-NEXT: vpinsrw $6, %edi, %xmm0, %xmm0 +; AVX512-NEXT: vpinsrw $7, %r8d, %xmm0, %xmm0 +; AVX512-NEXT: vmovd %r10d, %xmm1 +; AVX512-NEXT: vpinsrw $1, %r9d, %xmm1, %xmm1 +; AVX512-NEXT: vpinsrw $2, %r11d, %xmm1, %xmm1 +; AVX512-NEXT: vpinsrw $3, %ebx, %xmm1, %xmm1 +; AVX512-NEXT: vpinsrw $4, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 4-byte Folded Reload +; AVX512-NEXT: vpinsrw $5, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 4-byte Folded Reload +; AVX512-NEXT: vpinsrw $6, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 4-byte Folded Reload +; AVX512-NEXT: vpinsrw $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 4-byte Folded Reload +; AVX512-NEXT: vmovd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 4-byte Folded Reload +; AVX512-NEXT: # xmm2 = mem[0],zero,zero,zero +; AVX512-NEXT: vpinsrw $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 4-byte Folded Reload +; AVX512-NEXT: vpinsrw $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 4-byte Folded Reload +; AVX512-NEXT: vpinsrw $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 4-byte Folded Reload +; AVX512-NEXT: vpinsrw $4, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 4-byte Folded Reload +; AVX512-NEXT: vpinsrw $5, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 4-byte Folded Reload +; AVX512-NEXT: vpinsrw $6, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 4-byte Folded Reload +; AVX512-NEXT: vpinsrw $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 4-byte Folded Reload +; AVX512-NEXT: vmovd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 4-byte Folded Reload +; AVX512-NEXT: # xmm3 = mem[0],zero,zero,zero +; AVX512-NEXT: vpinsrw $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 4-byte Folded Reload +; AVX512-NEXT: vpinsrw $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 4-byte Folded Reload +; AVX512-NEXT: vpinsrw $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 4-byte Folded Reload +; AVX512-NEXT: vpinsrw $4, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 4-byte Folded Reload +; AVX512-NEXT: vpinsrw $5, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 4-byte Folded Reload +; AVX512-NEXT: vpinsrw $6, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 4-byte Folded Reload +; AVX512-NEXT: vpinsrw $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 4-byte Folded Reload +; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 +; AVX512-NEXT: vinserti128 $1, %xmm2, %ymm3, %ymm1 +; AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; AVX512-NEXT: popq %rbx +; AVX512-NEXT: popq %r12 +; AVX512-NEXT: popq %r13 +; AVX512-NEXT: popq %r14 +; AVX512-NEXT: popq %r15 +; AVX512-NEXT: popq %rbp +; AVX512-NEXT: retq + %z = call <32 x i16> @llvm.sadd.sat.v32i16(<32 x i16> %x, <32 x i16> %y) + ret <32 x i16> %z +} + +; Too narrow vectors, legalized by widening. 
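+;
+; Illustrative sketch (not part of the autogenerated checks): a type such as
+; <8 x i8> has no directly legal SSE counterpart, so the type legalizer must
+; rewrite the operation in terms of a legal type before it can be selected.
+; Two strategies are possible, and which one fires depends on the target's
+; legal types:
+;   * widen the vector with undef lanes, e.g. treat
+;       %z = call <8 x i8> @llvm.sadd.sat.v8i8(<8 x i8> %x, <8 x i8> %y)
+;     as a saturating add on a wider vector (for example <16 x i8>) and use
+;     only the low 8 lanes of the result, or
+;   * promote the element type, e.g. shift each i8 into the high bits of an
+;     i16 lane, perform the saturating add at <8 x i16>, and shift back down,
+;     which is the pattern visible in the checks below (punpcklbw against
+;     zero, 16-bit saturation against 0x7FFF, then psrlw $8 / packuswb).
+; The checks simply pin down the current codegen for each narrow width.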
+ +define void @v8i8(<8 x i8>* %px, <8 x i8>* %py, <8 x i8>* %pz) nounwind { +; SSE2-LABEL: v8i8: +; SSE2: # %bb.0: +; SSE2-NEXT: pushq %rbp +; SSE2-NEXT: pushq %r14 +; SSE2-NEXT: pushq %rbx +; SSE2-NEXT: movq {{.*#+}} xmm2 = mem[0],zero +; SSE2-NEXT: movq {{.*#+}} xmm3 = mem[0],zero +; SSE2-NEXT: pxor %xmm0, %xmm0 +; SSE2-NEXT: pxor %xmm1, %xmm1 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3],xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7] +; SSE2-NEXT: movd %xmm1, %eax +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] +; SSE2-NEXT: movd %xmm0, %r8d +; SSE2-NEXT: xorl %ecx, %ecx +; SSE2-NEXT: movl %r8d, %esi +; SSE2-NEXT: addw %ax, %si +; SSE2-NEXT: setns %cl +; SSE2-NEXT: addl $32767, %ecx # imm = 0x7FFF +; SSE2-NEXT: addw %ax, %r8w +; SSE2-NEXT: cmovol %ecx, %r8d +; SSE2-NEXT: pextrw $1, %xmm1, %eax +; SSE2-NEXT: pextrw $1, %xmm0, %r9d +; SSE2-NEXT: xorl %ecx, %ecx +; SSE2-NEXT: movl %r9d, %esi +; SSE2-NEXT: addw %ax, %si +; SSE2-NEXT: setns %cl +; SSE2-NEXT: addl $32767, %ecx # imm = 0x7FFF +; SSE2-NEXT: addw %ax, %r9w +; SSE2-NEXT: cmovol %ecx, %r9d +; SSE2-NEXT: pextrw $2, %xmm1, %eax +; SSE2-NEXT: pextrw $2, %xmm0, %r10d +; SSE2-NEXT: xorl %ecx, %ecx +; SSE2-NEXT: movl %r10d, %esi +; SSE2-NEXT: addw %ax, %si +; SSE2-NEXT: setns %cl +; SSE2-NEXT: addl $32767, %ecx # imm = 0x7FFF +; SSE2-NEXT: addw %ax, %r10w +; SSE2-NEXT: cmovol %ecx, %r10d +; SSE2-NEXT: pextrw $3, %xmm1, %eax +; SSE2-NEXT: pextrw $3, %xmm0, %r11d +; SSE2-NEXT: xorl %ecx, %ecx +; SSE2-NEXT: movl %r11d, %esi +; SSE2-NEXT: addw %ax, %si +; SSE2-NEXT: setns %cl +; SSE2-NEXT: addl $32767, %ecx # imm = 0x7FFF +; SSE2-NEXT: addw %ax, %r11w +; SSE2-NEXT: cmovol %ecx, %r11d +; SSE2-NEXT: pextrw $4, %xmm1, %ecx +; SSE2-NEXT: pextrw $4, %xmm0, %r14d +; SSE2-NEXT: xorl %esi, %esi +; SSE2-NEXT: movl %r14d, %edi +; SSE2-NEXT: addw %cx, %di +; SSE2-NEXT: setns %sil +; SSE2-NEXT: addl $32767, %esi # imm = 0x7FFF +; SSE2-NEXT: addw %cx, %r14w +; SSE2-NEXT: cmovol %esi, %r14d +; SSE2-NEXT: pextrw $5, %xmm1, %esi +; SSE2-NEXT: pextrw $5, %xmm0, %ecx +; SSE2-NEXT: xorl %edi, %edi +; SSE2-NEXT: movl %ecx, %ebx +; SSE2-NEXT: addw %si, %bx +; SSE2-NEXT: setns %dil +; SSE2-NEXT: addl $32767, %edi # imm = 0x7FFF +; SSE2-NEXT: addw %si, %cx +; SSE2-NEXT: cmovol %edi, %ecx +; SSE2-NEXT: pextrw $6, %xmm1, %edi +; SSE2-NEXT: pextrw $6, %xmm0, %esi +; SSE2-NEXT: xorl %ebx, %ebx +; SSE2-NEXT: movl %esi, %ebp +; SSE2-NEXT: addw %di, %bp +; SSE2-NEXT: setns %bl +; SSE2-NEXT: addl $32767, %ebx # imm = 0x7FFF +; SSE2-NEXT: addw %di, %si +; SSE2-NEXT: cmovol %ebx, %esi +; SSE2-NEXT: pextrw $7, %xmm1, %edi +; SSE2-NEXT: pextrw $7, %xmm0, %ebx +; SSE2-NEXT: xorl %ebp, %ebp +; SSE2-NEXT: movl %ebx, %eax +; SSE2-NEXT: addw %di, %ax +; SSE2-NEXT: setns %bpl +; SSE2-NEXT: addl $32767, %ebp # imm = 0x7FFF +; SSE2-NEXT: addw %di, %bx +; SSE2-NEXT: cmovol %ebp, %ebx +; SSE2-NEXT: movd %ebx, %xmm0 +; SSE2-NEXT: movd %esi, %xmm1 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; SSE2-NEXT: movd %ecx, %xmm0 +; SSE2-NEXT: movd %r14d, %xmm2 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; SSE2-NEXT: movd %r11d, %xmm0 +; SSE2-NEXT: movd %r10d, %xmm1 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = 
xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; SSE2-NEXT: movd %r9d, %xmm0 +; SSE2-NEXT: movd %r8d, %xmm3 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm2[0] +; SSE2-NEXT: psrlw $8, %xmm3 +; SSE2-NEXT: packuswb %xmm0, %xmm3 +; SSE2-NEXT: movq %xmm3, (%rdx) +; SSE2-NEXT: popq %rbx +; SSE2-NEXT: popq %r14 +; SSE2-NEXT: popq %rbp +; SSE2-NEXT: retq +; +; SSSE3-LABEL: v8i8: +; SSSE3: # %bb.0: +; SSSE3-NEXT: pushq %rbp +; SSSE3-NEXT: pushq %r14 +; SSSE3-NEXT: pushq %rbx +; SSSE3-NEXT: movq {{.*#+}} xmm2 = mem[0],zero +; SSSE3-NEXT: movq {{.*#+}} xmm3 = mem[0],zero +; SSSE3-NEXT: pxor %xmm0, %xmm0 +; SSSE3-NEXT: pxor %xmm1, %xmm1 +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3],xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7] +; SSSE3-NEXT: movd %xmm1, %eax +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] +; SSSE3-NEXT: movd %xmm0, %r8d +; SSSE3-NEXT: xorl %ecx, %ecx +; SSSE3-NEXT: movl %r8d, %esi +; SSSE3-NEXT: addw %ax, %si +; SSSE3-NEXT: setns %cl +; SSSE3-NEXT: addl $32767, %ecx # imm = 0x7FFF +; SSSE3-NEXT: addw %ax, %r8w +; SSSE3-NEXT: cmovol %ecx, %r8d +; SSSE3-NEXT: pextrw $1, %xmm1, %eax +; SSSE3-NEXT: pextrw $1, %xmm0, %r9d +; SSSE3-NEXT: xorl %ecx, %ecx +; SSSE3-NEXT: movl %r9d, %esi +; SSSE3-NEXT: addw %ax, %si +; SSSE3-NEXT: setns %cl +; SSSE3-NEXT: addl $32767, %ecx # imm = 0x7FFF +; SSSE3-NEXT: addw %ax, %r9w +; SSSE3-NEXT: cmovol %ecx, %r9d +; SSSE3-NEXT: pextrw $2, %xmm1, %eax +; SSSE3-NEXT: pextrw $2, %xmm0, %r10d +; SSSE3-NEXT: xorl %ecx, %ecx +; SSSE3-NEXT: movl %r10d, %esi +; SSSE3-NEXT: addw %ax, %si +; SSSE3-NEXT: setns %cl +; SSSE3-NEXT: addl $32767, %ecx # imm = 0x7FFF +; SSSE3-NEXT: addw %ax, %r10w +; SSSE3-NEXT: cmovol %ecx, %r10d +; SSSE3-NEXT: pextrw $3, %xmm1, %eax +; SSSE3-NEXT: pextrw $3, %xmm0, %r11d +; SSSE3-NEXT: xorl %ecx, %ecx +; SSSE3-NEXT: movl %r11d, %esi +; SSSE3-NEXT: addw %ax, %si +; SSSE3-NEXT: setns %cl +; SSSE3-NEXT: addl $32767, %ecx # imm = 0x7FFF +; SSSE3-NEXT: addw %ax, %r11w +; SSSE3-NEXT: cmovol %ecx, %r11d +; SSSE3-NEXT: pextrw $4, %xmm1, %ecx +; SSSE3-NEXT: pextrw $4, %xmm0, %r14d +; SSSE3-NEXT: xorl %esi, %esi +; SSSE3-NEXT: movl %r14d, %edi +; SSSE3-NEXT: addw %cx, %di +; SSSE3-NEXT: setns %sil +; SSSE3-NEXT: addl $32767, %esi # imm = 0x7FFF +; SSSE3-NEXT: addw %cx, %r14w +; SSSE3-NEXT: cmovol %esi, %r14d +; SSSE3-NEXT: pextrw $5, %xmm1, %esi +; SSSE3-NEXT: pextrw $5, %xmm0, %ecx +; SSSE3-NEXT: xorl %edi, %edi +; SSSE3-NEXT: movl %ecx, %ebx +; SSSE3-NEXT: addw %si, %bx +; SSSE3-NEXT: setns %dil +; SSSE3-NEXT: addl $32767, %edi # imm = 0x7FFF +; SSSE3-NEXT: addw %si, %cx +; SSSE3-NEXT: cmovol %edi, %ecx +; SSSE3-NEXT: pextrw $6, %xmm1, %edi +; SSSE3-NEXT: pextrw $6, %xmm0, %esi +; SSSE3-NEXT: xorl %ebx, %ebx +; SSSE3-NEXT: movl %esi, %ebp +; SSSE3-NEXT: addw %di, %bp +; SSSE3-NEXT: setns %bl +; SSSE3-NEXT: addl $32767, %ebx # imm = 0x7FFF +; SSSE3-NEXT: addw %di, %si +; SSSE3-NEXT: cmovol %ebx, %esi +; SSSE3-NEXT: pextrw $7, %xmm1, %edi +; SSSE3-NEXT: pextrw $7, %xmm0, %ebx +; SSSE3-NEXT: xorl %ebp, %ebp +; SSSE3-NEXT: movl %ebx, %eax +; SSSE3-NEXT: addw %di, %ax +; SSSE3-NEXT: setns %bpl +; SSSE3-NEXT: addl $32767, %ebp # imm = 0x7FFF +; SSSE3-NEXT: addw 
%di, %bx +; SSSE3-NEXT: cmovol %ebp, %ebx +; SSSE3-NEXT: movd %ebx, %xmm0 +; SSSE3-NEXT: movd %esi, %xmm1 +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; SSSE3-NEXT: movd %ecx, %xmm0 +; SSSE3-NEXT: movd %r14d, %xmm2 +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] +; SSSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; SSSE3-NEXT: movd %r11d, %xmm0 +; SSSE3-NEXT: movd %r10d, %xmm1 +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; SSSE3-NEXT: movd %r9d, %xmm0 +; SSSE3-NEXT: movd %r8d, %xmm3 +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3] +; SSSE3-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] +; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm2[0] +; SSSE3-NEXT: psrlw $8, %xmm3 +; SSSE3-NEXT: packuswb %xmm0, %xmm3 +; SSSE3-NEXT: movq %xmm3, (%rdx) +; SSSE3-NEXT: popq %rbx +; SSSE3-NEXT: popq %r14 +; SSSE3-NEXT: popq %rbp +; SSSE3-NEXT: retq +; +; SSE41-LABEL: v8i8: +; SSE41: # %bb.0: +; SSE41-NEXT: pushq %rbp +; SSE41-NEXT: pushq %r14 +; SSE41-NEXT: pushq %rbx +; SSE41-NEXT: movq {{.*#+}} xmm2 = mem[0],zero +; SSE41-NEXT: movq {{.*#+}} xmm3 = mem[0],zero +; SSE41-NEXT: pxor %xmm0, %xmm0 +; SSE41-NEXT: pxor %xmm1, %xmm1 +; SSE41-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3],xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7] +; SSE41-NEXT: pextrw $7, %xmm1, %eax +; SSE41-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] +; SSE41-NEXT: pextrw $7, %xmm0, %r8d +; SSE41-NEXT: xorl %ecx, %ecx +; SSE41-NEXT: movl %r8d, %esi +; SSE41-NEXT: addw %ax, %si +; SSE41-NEXT: setns %cl +; SSE41-NEXT: addl $32767, %ecx # imm = 0x7FFF +; SSE41-NEXT: addw %ax, %r8w +; SSE41-NEXT: cmovol %ecx, %r8d +; SSE41-NEXT: pextrw $6, %xmm1, %eax +; SSE41-NEXT: pextrw $6, %xmm0, %r9d +; SSE41-NEXT: xorl %ecx, %ecx +; SSE41-NEXT: movl %r9d, %esi +; SSE41-NEXT: addw %ax, %si +; SSE41-NEXT: setns %cl +; SSE41-NEXT: addl $32767, %ecx # imm = 0x7FFF +; SSE41-NEXT: addw %ax, %r9w +; SSE41-NEXT: cmovol %ecx, %r9d +; SSE41-NEXT: pextrw $5, %xmm1, %eax +; SSE41-NEXT: pextrw $5, %xmm0, %r10d +; SSE41-NEXT: xorl %ecx, %ecx +; SSE41-NEXT: movl %r10d, %edi +; SSE41-NEXT: addw %ax, %di +; SSE41-NEXT: setns %cl +; SSE41-NEXT: addl $32767, %ecx # imm = 0x7FFF +; SSE41-NEXT: addw %ax, %r10w +; SSE41-NEXT: cmovol %ecx, %r10d +; SSE41-NEXT: pextrw $4, %xmm1, %eax +; SSE41-NEXT: pextrw $4, %xmm0, %r11d +; SSE41-NEXT: xorl %ecx, %ecx +; SSE41-NEXT: movl %r11d, %esi +; SSE41-NEXT: addw %ax, %si +; SSE41-NEXT: setns %cl +; SSE41-NEXT: addl $32767, %ecx # imm = 0x7FFF +; SSE41-NEXT: addw %ax, %r11w +; SSE41-NEXT: cmovol %ecx, %r11d +; SSE41-NEXT: pextrw $3, %xmm1, %ecx +; SSE41-NEXT: pextrw $3, %xmm0, %r14d +; SSE41-NEXT: xorl %esi, %esi +; SSE41-NEXT: movl %r14d, %edi +; SSE41-NEXT: addw %cx, %di +; SSE41-NEXT: setns %sil +; SSE41-NEXT: addl $32767, %esi # imm = 0x7FFF +; SSE41-NEXT: addw %cx, %r14w +; SSE41-NEXT: cmovol %esi, %r14d +; SSE41-NEXT: pextrw $2, %xmm1, %esi +; SSE41-NEXT: pextrw $2, %xmm0, %ecx +; SSE41-NEXT: xorl %edi, %edi +; SSE41-NEXT: movl %ecx, %ebx +; SSE41-NEXT: addw %si, %bx +; SSE41-NEXT: setns %dil +; SSE41-NEXT: addl $32767, %edi # imm = 0x7FFF +; SSE41-NEXT: addw %si, %cx +; 
SSE41-NEXT: cmovol %edi, %ecx +; SSE41-NEXT: movd %xmm1, %esi +; SSE41-NEXT: movd %xmm0, %edi +; SSE41-NEXT: xorl %ebx, %ebx +; SSE41-NEXT: movl %edi, %ebp +; SSE41-NEXT: addw %si, %bp +; SSE41-NEXT: setns %bl +; SSE41-NEXT: addl $32767, %ebx # imm = 0x7FFF +; SSE41-NEXT: addw %si, %di +; SSE41-NEXT: cmovol %ebx, %edi +; SSE41-NEXT: pextrw $1, %xmm1, %esi +; SSE41-NEXT: pextrw $1, %xmm0, %ebx +; SSE41-NEXT: xorl %ebp, %ebp +; SSE41-NEXT: movl %ebx, %eax +; SSE41-NEXT: addw %si, %ax +; SSE41-NEXT: setns %bpl +; SSE41-NEXT: addl $32767, %ebp # imm = 0x7FFF +; SSE41-NEXT: addw %si, %bx +; SSE41-NEXT: cmovol %ebp, %ebx +; SSE41-NEXT: movd %edi, %xmm0 +; SSE41-NEXT: pinsrw $1, %ebx, %xmm0 +; SSE41-NEXT: pinsrw $2, %ecx, %xmm0 +; SSE41-NEXT: pinsrw $3, %r14d, %xmm0 +; SSE41-NEXT: pinsrw $4, %r11d, %xmm0 +; SSE41-NEXT: pinsrw $5, %r10d, %xmm0 +; SSE41-NEXT: pinsrw $6, %r9d, %xmm0 +; SSE41-NEXT: pinsrw $7, %r8d, %xmm0 +; SSE41-NEXT: psrlw $8, %xmm0 +; SSE41-NEXT: packuswb %xmm0, %xmm0 +; SSE41-NEXT: movq %xmm0, (%rdx) +; SSE41-NEXT: popq %rbx +; SSE41-NEXT: popq %r14 +; SSE41-NEXT: popq %rbp +; SSE41-NEXT: retq +; +; AVX1-LABEL: v8i8: +; AVX1: # %bb.0: +; AVX1-NEXT: pushq %rbp +; AVX1-NEXT: pushq %r14 +; AVX1-NEXT: pushq %rbx +; AVX1-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero +; AVX1-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] +; AVX1-NEXT: vpextrw $7, %xmm0, %eax +; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] +; AVX1-NEXT: vpextrw $7, %xmm1, %r8d +; AVX1-NEXT: xorl %ecx, %ecx +; AVX1-NEXT: movl %r8d, %esi +; AVX1-NEXT: addw %ax, %si +; AVX1-NEXT: setns %cl +; AVX1-NEXT: addl $32767, %ecx # imm = 0x7FFF +; AVX1-NEXT: addw %ax, %r8w +; AVX1-NEXT: cmovol %ecx, %r8d +; AVX1-NEXT: vpextrw $6, %xmm0, %eax +; AVX1-NEXT: vpextrw $6, %xmm1, %r9d +; AVX1-NEXT: xorl %ecx, %ecx +; AVX1-NEXT: movl %r9d, %esi +; AVX1-NEXT: addw %ax, %si +; AVX1-NEXT: setns %cl +; AVX1-NEXT: addl $32767, %ecx # imm = 0x7FFF +; AVX1-NEXT: addw %ax, %r9w +; AVX1-NEXT: cmovol %ecx, %r9d +; AVX1-NEXT: vpextrw $5, %xmm0, %eax +; AVX1-NEXT: vpextrw $5, %xmm1, %r10d +; AVX1-NEXT: xorl %ecx, %ecx +; AVX1-NEXT: movl %r10d, %edi +; AVX1-NEXT: addw %ax, %di +; AVX1-NEXT: setns %cl +; AVX1-NEXT: addl $32767, %ecx # imm = 0x7FFF +; AVX1-NEXT: addw %ax, %r10w +; AVX1-NEXT: cmovol %ecx, %r10d +; AVX1-NEXT: vpextrw $4, %xmm0, %eax +; AVX1-NEXT: vpextrw $4, %xmm1, %r11d +; AVX1-NEXT: xorl %ecx, %ecx +; AVX1-NEXT: movl %r11d, %esi +; AVX1-NEXT: addw %ax, %si +; AVX1-NEXT: setns %cl +; AVX1-NEXT: addl $32767, %ecx # imm = 0x7FFF +; AVX1-NEXT: addw %ax, %r11w +; AVX1-NEXT: cmovol %ecx, %r11d +; AVX1-NEXT: vpextrw $3, %xmm0, %ecx +; AVX1-NEXT: vpextrw $3, %xmm1, %r14d +; AVX1-NEXT: xorl %esi, %esi +; AVX1-NEXT: movl %r14d, %edi +; AVX1-NEXT: addw %cx, %di +; AVX1-NEXT: setns %sil +; AVX1-NEXT: addl $32767, %esi # imm = 0x7FFF +; AVX1-NEXT: addw %cx, %r14w +; AVX1-NEXT: cmovol %esi, %r14d +; AVX1-NEXT: vpextrw $2, %xmm0, %esi +; AVX1-NEXT: vpextrw $2, %xmm1, %ecx +; AVX1-NEXT: xorl %edi, %edi +; AVX1-NEXT: movl %ecx, %ebx +; AVX1-NEXT: addw %si, %bx +; AVX1-NEXT: setns %dil +; AVX1-NEXT: addl $32767, %edi # imm = 0x7FFF +; AVX1-NEXT: addw %si, %cx +; AVX1-NEXT: cmovol %edi, %ecx +; AVX1-NEXT: vmovd %xmm0, %esi +; AVX1-NEXT: vmovd 
%xmm1, %edi +; AVX1-NEXT: xorl %ebx, %ebx +; AVX1-NEXT: movl %edi, %ebp +; AVX1-NEXT: addw %si, %bp +; AVX1-NEXT: setns %bl +; AVX1-NEXT: addl $32767, %ebx # imm = 0x7FFF +; AVX1-NEXT: addw %si, %di +; AVX1-NEXT: cmovol %ebx, %edi +; AVX1-NEXT: vpextrw $1, %xmm0, %esi +; AVX1-NEXT: vpextrw $1, %xmm1, %ebx +; AVX1-NEXT: xorl %ebp, %ebp +; AVX1-NEXT: movl %ebx, %eax +; AVX1-NEXT: addw %si, %ax +; AVX1-NEXT: setns %bpl +; AVX1-NEXT: addl $32767, %ebp # imm = 0x7FFF +; AVX1-NEXT: addw %si, %bx +; AVX1-NEXT: cmovol %ebp, %ebx +; AVX1-NEXT: vmovd %edi, %xmm0 +; AVX1-NEXT: vpinsrw $1, %ebx, %xmm0, %xmm0 +; AVX1-NEXT: vpinsrw $2, %ecx, %xmm0, %xmm0 +; AVX1-NEXT: vpinsrw $3, %r14d, %xmm0, %xmm0 +; AVX1-NEXT: vpinsrw $4, %r11d, %xmm0, %xmm0 +; AVX1-NEXT: vpinsrw $5, %r10d, %xmm0, %xmm0 +; AVX1-NEXT: vpinsrw $6, %r9d, %xmm0, %xmm0 +; AVX1-NEXT: vpinsrw $7, %r8d, %xmm0, %xmm0 +; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm0 +; AVX1-NEXT: vpackuswb %xmm0, %xmm0, %xmm0 +; AVX1-NEXT: vmovq %xmm0, (%rdx) +; AVX1-NEXT: popq %rbx +; AVX1-NEXT: popq %r14 +; AVX1-NEXT: popq %rbp +; AVX1-NEXT: retq +; +; AVX2-LABEL: v8i8: +; AVX2: # %bb.0: +; AVX2-NEXT: pushq %rbp +; AVX2-NEXT: pushq %r14 +; AVX2-NEXT: pushq %rbx +; AVX2-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero +; AVX2-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] +; AVX2-NEXT: vpextrw $7, %xmm0, %eax +; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] +; AVX2-NEXT: vpextrw $7, %xmm1, %r8d +; AVX2-NEXT: xorl %ecx, %ecx +; AVX2-NEXT: movl %r8d, %esi +; AVX2-NEXT: addw %ax, %si +; AVX2-NEXT: setns %cl +; AVX2-NEXT: addl $32767, %ecx # imm = 0x7FFF +; AVX2-NEXT: addw %ax, %r8w +; AVX2-NEXT: cmovol %ecx, %r8d +; AVX2-NEXT: vpextrw $6, %xmm0, %eax +; AVX2-NEXT: vpextrw $6, %xmm1, %r9d +; AVX2-NEXT: xorl %ecx, %ecx +; AVX2-NEXT: movl %r9d, %esi +; AVX2-NEXT: addw %ax, %si +; AVX2-NEXT: setns %cl +; AVX2-NEXT: addl $32767, %ecx # imm = 0x7FFF +; AVX2-NEXT: addw %ax, %r9w +; AVX2-NEXT: cmovol %ecx, %r9d +; AVX2-NEXT: vpextrw $5, %xmm0, %eax +; AVX2-NEXT: vpextrw $5, %xmm1, %r10d +; AVX2-NEXT: xorl %ecx, %ecx +; AVX2-NEXT: movl %r10d, %edi +; AVX2-NEXT: addw %ax, %di +; AVX2-NEXT: setns %cl +; AVX2-NEXT: addl $32767, %ecx # imm = 0x7FFF +; AVX2-NEXT: addw %ax, %r10w +; AVX2-NEXT: cmovol %ecx, %r10d +; AVX2-NEXT: vpextrw $4, %xmm0, %eax +; AVX2-NEXT: vpextrw $4, %xmm1, %r11d +; AVX2-NEXT: xorl %ecx, %ecx +; AVX2-NEXT: movl %r11d, %esi +; AVX2-NEXT: addw %ax, %si +; AVX2-NEXT: setns %cl +; AVX2-NEXT: addl $32767, %ecx # imm = 0x7FFF +; AVX2-NEXT: addw %ax, %r11w +; AVX2-NEXT: cmovol %ecx, %r11d +; AVX2-NEXT: vpextrw $3, %xmm0, %ecx +; AVX2-NEXT: vpextrw $3, %xmm1, %r14d +; AVX2-NEXT: xorl %esi, %esi +; AVX2-NEXT: movl %r14d, %edi +; AVX2-NEXT: addw %cx, %di +; AVX2-NEXT: setns %sil +; AVX2-NEXT: addl $32767, %esi # imm = 0x7FFF +; AVX2-NEXT: addw %cx, %r14w +; AVX2-NEXT: cmovol %esi, %r14d +; AVX2-NEXT: vpextrw $2, %xmm0, %esi +; AVX2-NEXT: vpextrw $2, %xmm1, %ecx +; AVX2-NEXT: xorl %edi, %edi +; AVX2-NEXT: movl %ecx, %ebx +; AVX2-NEXT: addw %si, %bx +; AVX2-NEXT: setns %dil +; AVX2-NEXT: addl $32767, %edi # imm = 0x7FFF +; AVX2-NEXT: addw %si, %cx +; AVX2-NEXT: cmovol %edi, %ecx +; AVX2-NEXT: vmovd %xmm0, %esi +; AVX2-NEXT: vmovd %xmm1, %edi +; AVX2-NEXT: xorl %ebx, 
%ebx +; AVX2-NEXT: movl %edi, %ebp +; AVX2-NEXT: addw %si, %bp +; AVX2-NEXT: setns %bl +; AVX2-NEXT: addl $32767, %ebx # imm = 0x7FFF +; AVX2-NEXT: addw %si, %di +; AVX2-NEXT: cmovol %ebx, %edi +; AVX2-NEXT: vpextrw $1, %xmm0, %esi +; AVX2-NEXT: vpextrw $1, %xmm1, %ebx +; AVX2-NEXT: xorl %ebp, %ebp +; AVX2-NEXT: movl %ebx, %eax +; AVX2-NEXT: addw %si, %ax +; AVX2-NEXT: setns %bpl +; AVX2-NEXT: addl $32767, %ebp # imm = 0x7FFF +; AVX2-NEXT: addw %si, %bx +; AVX2-NEXT: cmovol %ebp, %ebx +; AVX2-NEXT: vmovd %edi, %xmm0 +; AVX2-NEXT: vpinsrw $1, %ebx, %xmm0, %xmm0 +; AVX2-NEXT: vpinsrw $2, %ecx, %xmm0, %xmm0 +; AVX2-NEXT: vpinsrw $3, %r14d, %xmm0, %xmm0 +; AVX2-NEXT: vpinsrw $4, %r11d, %xmm0, %xmm0 +; AVX2-NEXT: vpinsrw $5, %r10d, %xmm0, %xmm0 +; AVX2-NEXT: vpinsrw $6, %r9d, %xmm0, %xmm0 +; AVX2-NEXT: vpinsrw $7, %r8d, %xmm0, %xmm0 +; AVX2-NEXT: vpsrlw $8, %xmm0, %xmm0 +; AVX2-NEXT: vpackuswb %xmm0, %xmm0, %xmm0 +; AVX2-NEXT: vmovq %xmm0, (%rdx) +; AVX2-NEXT: popq %rbx +; AVX2-NEXT: popq %r14 +; AVX2-NEXT: popq %rbp +; AVX2-NEXT: retq +; +; AVX512-LABEL: v8i8: +; AVX512: # %bb.0: +; AVX512-NEXT: pushq %rbp +; AVX512-NEXT: pushq %r14 +; AVX512-NEXT: pushq %rbx +; AVX512-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero +; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; AVX512-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] +; AVX512-NEXT: vpextrw $7, %xmm0, %eax +; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] +; AVX512-NEXT: vpextrw $7, %xmm1, %r8d +; AVX512-NEXT: xorl %ecx, %ecx +; AVX512-NEXT: movl %r8d, %esi +; AVX512-NEXT: addw %ax, %si +; AVX512-NEXT: setns %cl +; AVX512-NEXT: addl $32767, %ecx # imm = 0x7FFF +; AVX512-NEXT: addw %ax, %r8w +; AVX512-NEXT: cmovol %ecx, %r8d +; AVX512-NEXT: vpextrw $6, %xmm0, %eax +; AVX512-NEXT: vpextrw $6, %xmm1, %r9d +; AVX512-NEXT: xorl %ecx, %ecx +; AVX512-NEXT: movl %r9d, %esi +; AVX512-NEXT: addw %ax, %si +; AVX512-NEXT: setns %cl +; AVX512-NEXT: addl $32767, %ecx # imm = 0x7FFF +; AVX512-NEXT: addw %ax, %r9w +; AVX512-NEXT: cmovol %ecx, %r9d +; AVX512-NEXT: vpextrw $5, %xmm0, %eax +; AVX512-NEXT: vpextrw $5, %xmm1, %r10d +; AVX512-NEXT: xorl %ecx, %ecx +; AVX512-NEXT: movl %r10d, %edi +; AVX512-NEXT: addw %ax, %di +; AVX512-NEXT: setns %cl +; AVX512-NEXT: addl $32767, %ecx # imm = 0x7FFF +; AVX512-NEXT: addw %ax, %r10w +; AVX512-NEXT: cmovol %ecx, %r10d +; AVX512-NEXT: vpextrw $4, %xmm0, %eax +; AVX512-NEXT: vpextrw $4, %xmm1, %r11d +; AVX512-NEXT: xorl %ecx, %ecx +; AVX512-NEXT: movl %r11d, %esi +; AVX512-NEXT: addw %ax, %si +; AVX512-NEXT: setns %cl +; AVX512-NEXT: addl $32767, %ecx # imm = 0x7FFF +; AVX512-NEXT: addw %ax, %r11w +; AVX512-NEXT: cmovol %ecx, %r11d +; AVX512-NEXT: vpextrw $3, %xmm0, %ecx +; AVX512-NEXT: vpextrw $3, %xmm1, %r14d +; AVX512-NEXT: xorl %esi, %esi +; AVX512-NEXT: movl %r14d, %edi +; AVX512-NEXT: addw %cx, %di +; AVX512-NEXT: setns %sil +; AVX512-NEXT: addl $32767, %esi # imm = 0x7FFF +; AVX512-NEXT: addw %cx, %r14w +; AVX512-NEXT: cmovol %esi, %r14d +; AVX512-NEXT: vpextrw $2, %xmm0, %esi +; AVX512-NEXT: vpextrw $2, %xmm1, %ecx +; AVX512-NEXT: xorl %edi, %edi +; AVX512-NEXT: movl %ecx, %ebx +; AVX512-NEXT: addw %si, %bx +; AVX512-NEXT: setns %dil +; AVX512-NEXT: addl $32767, %edi # imm = 0x7FFF +; AVX512-NEXT: addw %si, %cx +; AVX512-NEXT: cmovol %edi, %ecx 
+; AVX512-NEXT: vmovd %xmm0, %esi +; AVX512-NEXT: vmovd %xmm1, %edi +; AVX512-NEXT: xorl %ebx, %ebx +; AVX512-NEXT: movl %edi, %ebp +; AVX512-NEXT: addw %si, %bp +; AVX512-NEXT: setns %bl +; AVX512-NEXT: addl $32767, %ebx # imm = 0x7FFF +; AVX512-NEXT: addw %si, %di +; AVX512-NEXT: cmovol %ebx, %edi +; AVX512-NEXT: vpextrw $1, %xmm0, %esi +; AVX512-NEXT: vpextrw $1, %xmm1, %ebx +; AVX512-NEXT: xorl %ebp, %ebp +; AVX512-NEXT: movl %ebx, %eax +; AVX512-NEXT: addw %si, %ax +; AVX512-NEXT: setns %bpl +; AVX512-NEXT: addl $32767, %ebp # imm = 0x7FFF +; AVX512-NEXT: addw %si, %bx +; AVX512-NEXT: cmovol %ebp, %ebx +; AVX512-NEXT: vmovd %edi, %xmm0 +; AVX512-NEXT: vpinsrw $1, %ebx, %xmm0, %xmm0 +; AVX512-NEXT: vpinsrw $2, %ecx, %xmm0, %xmm0 +; AVX512-NEXT: vpinsrw $3, %r14d, %xmm0, %xmm0 +; AVX512-NEXT: vpinsrw $4, %r11d, %xmm0, %xmm0 +; AVX512-NEXT: vpinsrw $5, %r10d, %xmm0, %xmm0 +; AVX512-NEXT: vpinsrw $6, %r9d, %xmm0, %xmm0 +; AVX512-NEXT: vpinsrw $7, %r8d, %xmm0, %xmm0 +; AVX512-NEXT: vpsrlw $8, %xmm0, %xmm0 +; AVX512-NEXT: vpmovwb %xmm0, (%rdx) +; AVX512-NEXT: popq %rbx +; AVX512-NEXT: popq %r14 +; AVX512-NEXT: popq %rbp +; AVX512-NEXT: retq + %x = load <8 x i8>, <8 x i8>* %px + %y = load <8 x i8>, <8 x i8>* %py + %z = call <8 x i8> @llvm.sadd.sat.v8i8(<8 x i8> %x, <8 x i8> %y) + store <8 x i8> %z, <8 x i8>* %pz + ret void +} + +define void @v4i8(<4 x i8>* %px, <4 x i8>* %py, <4 x i8>* %pz) nounwind { +; SSE2-LABEL: v4i8: +; SSE2: # %bb.0: +; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] +; SSE2-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; SSE2-NEXT: pslld $24, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,2,3] +; SSE2-NEXT: movd %xmm2, %ecx +; SSE2-NEXT: pslld $24, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,2,3] +; SSE2-NEXT: movd %xmm2, %r8d +; SSE2-NEXT: xorl %esi, %esi +; SSE2-NEXT: movl %r8d, %edi +; SSE2-NEXT: addl %ecx, %edi +; SSE2-NEXT: setns %sil +; SSE2-NEXT: addl $2147483647, %esi # imm = 0x7FFFFFFF +; SSE2-NEXT: addl %ecx, %r8d +; SSE2-NEXT: cmovol %esi, %r8d +; SSE2-NEXT: movd %xmm1, %esi +; SSE2-NEXT: movd %xmm0, %r10d +; SSE2-NEXT: xorl %edi, %edi +; SSE2-NEXT: movl %r10d, %ecx +; SSE2-NEXT: addl %esi, %ecx +; SSE2-NEXT: setns %dil +; SSE2-NEXT: addl $2147483647, %edi # imm = 0x7FFFFFFF +; SSE2-NEXT: addl %esi, %r10d +; SSE2-NEXT: cmovol %edi, %r10d +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1] +; SSE2-NEXT: movd %xmm2, %r9d +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1] +; SSE2-NEXT: movd %xmm2, %ecx +; SSE2-NEXT: xorl %edi, %edi +; SSE2-NEXT: movl %ecx, %esi +; SSE2-NEXT: addl %r9d, %esi +; SSE2-NEXT: setns %dil +; SSE2-NEXT: addl $2147483647, %edi # imm = 0x7FFFFFFF +; SSE2-NEXT: addl %r9d, %ecx +; SSE2-NEXT: cmovol %edi, %ecx +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,1,2,3] +; SSE2-NEXT: movd %xmm1, %r9d +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3] +; SSE2-NEXT: movd %xmm0, %eax +; SSE2-NEXT: xorl %esi, %esi +; SSE2-NEXT: movl %eax, %edi +; SSE2-NEXT: addl %r9d, %edi +; SSE2-NEXT: setns %sil +; SSE2-NEXT: addl $2147483647, %esi # imm = 0x7FFFFFFF +; SSE2-NEXT: addl %r9d, %eax +; SSE2-NEXT: cmovol %esi, %eax +; SSE2-NEXT: 
movd %eax, %xmm0 +; SSE2-NEXT: movd %ecx, %xmm1 +; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE2-NEXT: movd %r10d, %xmm0 +; SSE2-NEXT: movd %r8d, %xmm2 +; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSE2-NEXT: psrld $24, %xmm0 +; SSE2-NEXT: packuswb %xmm0, %xmm0 +; SSE2-NEXT: packuswb %xmm0, %xmm0 +; SSE2-NEXT: movd %xmm0, (%rdx) +; SSE2-NEXT: retq +; +; SSSE3-LABEL: v4i8: +; SSSE3: # %bb.0: +; SSSE3-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; SSSE3-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,0,255,255,255,1,255,255,255,2,255,255,255,3] +; SSSE3-NEXT: pshufb %xmm2, %xmm1 +; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,1,2,3] +; SSSE3-NEXT: movd %xmm3, %ecx +; SSSE3-NEXT: pshufb %xmm2, %xmm0 +; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,2,3] +; SSSE3-NEXT: movd %xmm2, %r8d +; SSSE3-NEXT: xorl %esi, %esi +; SSSE3-NEXT: movl %r8d, %edi +; SSSE3-NEXT: addl %ecx, %edi +; SSSE3-NEXT: setns %sil +; SSSE3-NEXT: addl $2147483647, %esi # imm = 0x7FFFFFFF +; SSSE3-NEXT: addl %ecx, %r8d +; SSSE3-NEXT: cmovol %esi, %r8d +; SSSE3-NEXT: movd %xmm1, %esi +; SSSE3-NEXT: movd %xmm0, %r10d +; SSSE3-NEXT: xorl %edi, %edi +; SSSE3-NEXT: movl %r10d, %ecx +; SSSE3-NEXT: addl %esi, %ecx +; SSSE3-NEXT: setns %dil +; SSSE3-NEXT: addl $2147483647, %edi # imm = 0x7FFFFFFF +; SSSE3-NEXT: addl %esi, %r10d +; SSSE3-NEXT: cmovol %edi, %r10d +; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1] +; SSSE3-NEXT: movd %xmm2, %r9d +; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1] +; SSSE3-NEXT: movd %xmm2, %ecx +; SSSE3-NEXT: xorl %edi, %edi +; SSSE3-NEXT: movl %ecx, %esi +; SSSE3-NEXT: addl %r9d, %esi +; SSSE3-NEXT: setns %dil +; SSSE3-NEXT: addl $2147483647, %edi # imm = 0x7FFFFFFF +; SSSE3-NEXT: addl %r9d, %ecx +; SSSE3-NEXT: cmovol %edi, %ecx +; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,1,2,3] +; SSSE3-NEXT: movd %xmm1, %r9d +; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3] +; SSSE3-NEXT: movd %xmm0, %eax +; SSSE3-NEXT: xorl %esi, %esi +; SSSE3-NEXT: movl %eax, %edi +; SSSE3-NEXT: addl %r9d, %edi +; SSSE3-NEXT: setns %sil +; SSSE3-NEXT: addl $2147483647, %esi # imm = 0x7FFFFFFF +; SSSE3-NEXT: addl %r9d, %eax +; SSSE3-NEXT: cmovol %esi, %eax +; SSSE3-NEXT: movd %eax, %xmm0 +; SSSE3-NEXT: movd %ecx, %xmm1 +; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSSE3-NEXT: movd %r10d, %xmm0 +; SSSE3-NEXT: movd %r8d, %xmm2 +; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u] +; SSSE3-NEXT: movd %xmm0, (%rdx) +; SSSE3-NEXT: retq +; +; SSE41-LABEL: v4i8: +; SSE41: # %bb.0: +; SSE41-NEXT: pmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; SSE41-NEXT: pmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; SSE41-NEXT: pslld $24, %xmm1 +; SSE41-NEXT: pextrd $3, %xmm1, %ecx +; SSE41-NEXT: pslld $24, %xmm0 +; SSE41-NEXT: pextrd $3, %xmm0, %r8d +; SSE41-NEXT: xorl %esi, %esi +; SSE41-NEXT: movl %r8d, %edi +; SSE41-NEXT: addl %ecx, %edi +; SSE41-NEXT: setns %sil +; SSE41-NEXT: addl $2147483647, %esi # imm = 0x7FFFFFFF +; SSE41-NEXT: addl %ecx, %r8d +; SSE41-NEXT: cmovol %esi, %r8d +; SSE41-NEXT: pextrd $2, %xmm1, %esi +; SSE41-NEXT: pextrd $2, %xmm0, %r10d +; 
SSE41-NEXT: xorl %edi, %edi +; SSE41-NEXT: movl %r10d, %ecx +; SSE41-NEXT: addl %esi, %ecx +; SSE41-NEXT: setns %dil +; SSE41-NEXT: addl $2147483647, %edi # imm = 0x7FFFFFFF +; SSE41-NEXT: addl %esi, %r10d +; SSE41-NEXT: cmovol %edi, %r10d +; SSE41-NEXT: movd %xmm1, %r9d +; SSE41-NEXT: movd %xmm0, %ecx +; SSE41-NEXT: xorl %edi, %edi +; SSE41-NEXT: movl %ecx, %esi +; SSE41-NEXT: addl %r9d, %esi +; SSE41-NEXT: setns %dil +; SSE41-NEXT: addl $2147483647, %edi # imm = 0x7FFFFFFF +; SSE41-NEXT: addl %r9d, %ecx +; SSE41-NEXT: cmovol %edi, %ecx +; SSE41-NEXT: pextrd $1, %xmm1, %r9d +; SSE41-NEXT: pextrd $1, %xmm0, %eax +; SSE41-NEXT: xorl %esi, %esi +; SSE41-NEXT: movl %eax, %edi +; SSE41-NEXT: addl %r9d, %edi +; SSE41-NEXT: setns %sil +; SSE41-NEXT: addl $2147483647, %esi # imm = 0x7FFFFFFF +; SSE41-NEXT: addl %r9d, %eax +; SSE41-NEXT: cmovol %esi, %eax +; SSE41-NEXT: movd %ecx, %xmm0 +; SSE41-NEXT: pinsrd $1, %eax, %xmm0 +; SSE41-NEXT: pinsrd $2, %r10d, %xmm0 +; SSE41-NEXT: pinsrd $3, %r8d, %xmm0 +; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u] +; SSE41-NEXT: movd %xmm0, (%rdx) +; SSE41-NEXT: retq +; +; AVX1-LABEL: v4i8: +; AVX1: # %bb.0: +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpslld $24, %xmm0, %xmm0 +; AVX1-NEXT: vpextrd $3, %xmm0, %ecx +; AVX1-NEXT: vpslld $24, %xmm1, %xmm1 +; AVX1-NEXT: vpextrd $3, %xmm1, %r9d +; AVX1-NEXT: xorl %esi, %esi +; AVX1-NEXT: movl %r9d, %edi +; AVX1-NEXT: addl %ecx, %edi +; AVX1-NEXT: setns %sil +; AVX1-NEXT: addl $2147483647, %esi # imm = 0x7FFFFFFF +; AVX1-NEXT: addl %ecx, %r9d +; AVX1-NEXT: cmovol %esi, %r9d +; AVX1-NEXT: vpextrd $2, %xmm0, %r8d +; AVX1-NEXT: vpextrd $2, %xmm1, %r10d +; AVX1-NEXT: xorl %edi, %edi +; AVX1-NEXT: movl %r10d, %ecx +; AVX1-NEXT: addl %r8d, %ecx +; AVX1-NEXT: setns %dil +; AVX1-NEXT: addl $2147483647, %edi # imm = 0x7FFFFFFF +; AVX1-NEXT: addl %r8d, %r10d +; AVX1-NEXT: cmovol %edi, %r10d +; AVX1-NEXT: vmovd %xmm0, %r8d +; AVX1-NEXT: vmovd %xmm1, %eax +; AVX1-NEXT: xorl %ecx, %ecx +; AVX1-NEXT: movl %eax, %edi +; AVX1-NEXT: addl %r8d, %edi +; AVX1-NEXT: setns %cl +; AVX1-NEXT: addl $2147483647, %ecx # imm = 0x7FFFFFFF +; AVX1-NEXT: addl %r8d, %eax +; AVX1-NEXT: cmovol %ecx, %eax +; AVX1-NEXT: vpextrd $1, %xmm0, %r8d +; AVX1-NEXT: vpextrd $1, %xmm1, %esi +; AVX1-NEXT: xorl %ecx, %ecx +; AVX1-NEXT: movl %esi, %edi +; AVX1-NEXT: addl %r8d, %edi +; AVX1-NEXT: setns %cl +; AVX1-NEXT: addl $2147483647, %ecx # imm = 0x7FFFFFFF +; AVX1-NEXT: addl %r8d, %esi +; AVX1-NEXT: cmovol %ecx, %esi +; AVX1-NEXT: vmovd %eax, %xmm0 +; AVX1-NEXT: vpinsrd $1, %esi, %xmm0, %xmm0 +; AVX1-NEXT: vpinsrd $2, %r10d, %xmm0, %xmm0 +; AVX1-NEXT: vpinsrd $3, %r9d, %xmm0, %xmm0 +; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX1-NEXT: vmovd %xmm0, (%rdx) +; AVX1-NEXT: retq +; +; AVX2-LABEL: v4i8: +; AVX2: # %bb.0: +; AVX2-NEXT: vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX2-NEXT: vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX2-NEXT: vpslld $24, %xmm0, %xmm0 +; AVX2-NEXT: vpextrd $3, %xmm0, %ecx +; AVX2-NEXT: vpslld $24, %xmm1, %xmm1 +; AVX2-NEXT: vpextrd $3, %xmm1, %r9d +; AVX2-NEXT: xorl %esi, %esi +; AVX2-NEXT: movl %r9d, %edi +; 
AVX2-NEXT: addl %ecx, %edi +; AVX2-NEXT: setns %sil +; AVX2-NEXT: addl $2147483647, %esi # imm = 0x7FFFFFFF +; AVX2-NEXT: addl %ecx, %r9d +; AVX2-NEXT: cmovol %esi, %r9d +; AVX2-NEXT: vpextrd $2, %xmm0, %r8d +; AVX2-NEXT: vpextrd $2, %xmm1, %r10d +; AVX2-NEXT: xorl %edi, %edi +; AVX2-NEXT: movl %r10d, %ecx +; AVX2-NEXT: addl %r8d, %ecx +; AVX2-NEXT: setns %dil +; AVX2-NEXT: addl $2147483647, %edi # imm = 0x7FFFFFFF +; AVX2-NEXT: addl %r8d, %r10d +; AVX2-NEXT: cmovol %edi, %r10d +; AVX2-NEXT: vmovd %xmm0, %r8d +; AVX2-NEXT: vmovd %xmm1, %eax +; AVX2-NEXT: xorl %ecx, %ecx +; AVX2-NEXT: movl %eax, %edi +; AVX2-NEXT: addl %r8d, %edi +; AVX2-NEXT: setns %cl +; AVX2-NEXT: addl $2147483647, %ecx # imm = 0x7FFFFFFF +; AVX2-NEXT: addl %r8d, %eax +; AVX2-NEXT: cmovol %ecx, %eax +; AVX2-NEXT: vpextrd $1, %xmm0, %r8d +; AVX2-NEXT: vpextrd $1, %xmm1, %esi +; AVX2-NEXT: xorl %ecx, %ecx +; AVX2-NEXT: movl %esi, %edi +; AVX2-NEXT: addl %r8d, %edi +; AVX2-NEXT: setns %cl +; AVX2-NEXT: addl $2147483647, %ecx # imm = 0x7FFFFFFF +; AVX2-NEXT: addl %r8d, %esi +; AVX2-NEXT: cmovol %ecx, %esi +; AVX2-NEXT: vmovd %eax, %xmm0 +; AVX2-NEXT: vpinsrd $1, %esi, %xmm0, %xmm0 +; AVX2-NEXT: vpinsrd $2, %r10d, %xmm0, %xmm0 +; AVX2-NEXT: vpinsrd $3, %r9d, %xmm0, %xmm0 +; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX2-NEXT: vmovd %xmm0, (%rdx) +; AVX2-NEXT: retq +; +; AVX512-LABEL: v4i8: +; AVX512: # %bb.0: +; AVX512-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; AVX512-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX512-NEXT: vmovdqa {{.*#+}} xmm2 = [255,255,255,0,255,255,255,1,255,255,255,2,255,255,255,3] +; AVX512-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX512-NEXT: vpextrd $3, %xmm0, %ecx +; AVX512-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX512-NEXT: vpextrd $3, %xmm1, %r9d +; AVX512-NEXT: xorl %esi, %esi +; AVX512-NEXT: movl %r9d, %edi +; AVX512-NEXT: addl %ecx, %edi +; AVX512-NEXT: setns %sil +; AVX512-NEXT: addl $2147483647, %esi # imm = 0x7FFFFFFF +; AVX512-NEXT: addl %ecx, %r9d +; AVX512-NEXT: cmovol %esi, %r9d +; AVX512-NEXT: vpextrd $2, %xmm0, %r8d +; AVX512-NEXT: vpextrd $2, %xmm1, %r10d +; AVX512-NEXT: xorl %edi, %edi +; AVX512-NEXT: movl %r10d, %ecx +; AVX512-NEXT: addl %r8d, %ecx +; AVX512-NEXT: setns %dil +; AVX512-NEXT: addl $2147483647, %edi # imm = 0x7FFFFFFF +; AVX512-NEXT: addl %r8d, %r10d +; AVX512-NEXT: cmovol %edi, %r10d +; AVX512-NEXT: vmovd %xmm0, %r8d +; AVX512-NEXT: vmovd %xmm1, %eax +; AVX512-NEXT: xorl %ecx, %ecx +; AVX512-NEXT: movl %eax, %edi +; AVX512-NEXT: addl %r8d, %edi +; AVX512-NEXT: setns %cl +; AVX512-NEXT: addl $2147483647, %ecx # imm = 0x7FFFFFFF +; AVX512-NEXT: addl %r8d, %eax +; AVX512-NEXT: cmovol %ecx, %eax +; AVX512-NEXT: vpextrd $1, %xmm0, %r8d +; AVX512-NEXT: vpextrd $1, %xmm1, %esi +; AVX512-NEXT: xorl %ecx, %ecx +; AVX512-NEXT: movl %esi, %edi +; AVX512-NEXT: addl %r8d, %edi +; AVX512-NEXT: setns %cl +; AVX512-NEXT: addl $2147483647, %ecx # imm = 0x7FFFFFFF +; AVX512-NEXT: addl %r8d, %esi +; AVX512-NEXT: cmovol %ecx, %esi +; AVX512-NEXT: vmovd %eax, %xmm0 +; AVX512-NEXT: vpinsrd $1, %esi, %xmm0, %xmm0 +; AVX512-NEXT: vpinsrd $2, %r10d, %xmm0, %xmm0 +; AVX512-NEXT: vpinsrd $3, %r9d, %xmm0, %xmm0 +; AVX512-NEXT: vpsrld $24, %xmm0, %xmm0 +; AVX512-NEXT: vpmovdb %xmm0, (%rdx) +; AVX512-NEXT: retq + %x = load <4 x i8>, <4 x i8>* %px + %y = load <4 x i8>, <4 x i8>* %py + %z = call <4 x i8> @llvm.sadd.sat.v4i8(<4 x i8> %x, <4 x i8> %y) + store <4 x i8> %z, <4 x i8>* %pz + ret void +} + +define void @v2i8(<2 x i8>* %px, <2 x i8>* %py, <2 
x i8>* %pz) nounwind { +; SSE2-LABEL: v2i8: +; SSE2: # %bb.0: +; SSE2-NEXT: movzwl (%rdi), %eax +; SSE2-NEXT: movd %eax, %xmm0 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3] +; SSE2-NEXT: movzwl (%rsi), %eax +; SSE2-NEXT: movd %eax, %xmm1 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,1,3] +; SSE2-NEXT: psllq $56, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1] +; SSE2-NEXT: movq %xmm2, %rax +; SSE2-NEXT: psllq $56, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1] +; SSE2-NEXT: movq %xmm2, %rcx +; SSE2-NEXT: xorl %esi, %esi +; SSE2-NEXT: movq %rcx, %rdi +; SSE2-NEXT: addq %rax, %rdi +; SSE2-NEXT: setns %sil +; SSE2-NEXT: movabsq $9223372036854775807, %r8 # imm = 0x7FFFFFFFFFFFFFFF +; SSE2-NEXT: addq %r8, %rsi +; SSE2-NEXT: addq %rax, %rcx +; SSE2-NEXT: cmovoq %rsi, %rcx +; SSE2-NEXT: movq %xmm1, %r9 +; SSE2-NEXT: movq %xmm0, %rax +; SSE2-NEXT: xorl %edi, %edi +; SSE2-NEXT: movq %rax, %rsi +; SSE2-NEXT: addq %r9, %rsi +; SSE2-NEXT: setns %dil +; SSE2-NEXT: addq %r8, %rdi +; SSE2-NEXT: addq %r9, %rax +; SSE2-NEXT: cmovoq %rdi, %rax +; SSE2-NEXT: movq %rax, %xmm0 +; SSE2-NEXT: movq %rcx, %xmm1 +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSE2-NEXT: psrlq $56, %xmm0 +; SSE2-NEXT: packuswb %xmm0, %xmm0 +; SSE2-NEXT: packuswb %xmm0, %xmm0 +; SSE2-NEXT: packuswb %xmm0, %xmm0 +; SSE2-NEXT: movd %xmm0, %eax +; SSE2-NEXT: movw %ax, (%rdx) +; SSE2-NEXT: retq +; +; SSSE3-LABEL: v2i8: +; SSSE3: # %bb.0: +; SSSE3-NEXT: movzwl (%rdi), %eax +; SSSE3-NEXT: movd %eax, %xmm0 +; SSSE3-NEXT: movzwl (%rsi), %eax +; SSSE3-NEXT: movd %eax, %xmm1 +; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,1] +; SSSE3-NEXT: pshufb %xmm2, %xmm1 +; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,3,0,1] +; SSSE3-NEXT: movq %xmm3, %rax +; SSSE3-NEXT: pshufb %xmm2, %xmm0 +; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1] +; SSSE3-NEXT: movq %xmm2, %rcx +; SSSE3-NEXT: xorl %esi, %esi +; SSSE3-NEXT: movq %rcx, %rdi +; SSSE3-NEXT: addq %rax, %rdi +; SSSE3-NEXT: setns %sil +; SSSE3-NEXT: movabsq $9223372036854775807, %r8 # imm = 0x7FFFFFFFFFFFFFFF +; SSSE3-NEXT: addq %r8, %rsi +; SSSE3-NEXT: addq %rax, %rcx +; SSSE3-NEXT: cmovoq %rsi, %rcx +; SSSE3-NEXT: movq %xmm1, %r9 +; SSSE3-NEXT: movq %xmm0, %rax +; SSSE3-NEXT: xorl %edi, %edi +; SSSE3-NEXT: movq %rax, %rsi +; SSSE3-NEXT: addq %r9, %rsi +; SSSE3-NEXT: setns %dil +; SSSE3-NEXT: addq %r8, %rdi +; SSSE3-NEXT: addq %r9, %rax +; SSSE3-NEXT: cmovoq %rdi, %rax +; SSSE3-NEXT: movq %rax, %xmm0 +; SSSE3-NEXT: movq %rcx, %xmm1 +; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[7,15,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; SSSE3-NEXT: movd %xmm0, %eax +; SSSE3-NEXT: movw %ax, (%rdx) +; SSSE3-NEXT: retq +; +; SSE41-LABEL: v2i8: +; SSE41: # %bb.0: +; SSE41-NEXT: pmovzxbq {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero +; SSE41-NEXT: pmovzxbq {{.*#+}} xmm1 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero +; SSE41-NEXT: psllq $56, %xmm1 +; SSE41-NEXT: movq %xmm1, %rax +; SSE41-NEXT: 
psllq $56, %xmm0 +; SSE41-NEXT: movq %xmm0, %rcx +; SSE41-NEXT: xorl %esi, %esi +; SSE41-NEXT: movq %rcx, %rdi +; SSE41-NEXT: addq %rax, %rdi +; SSE41-NEXT: setns %sil +; SSE41-NEXT: movabsq $9223372036854775807, %r8 # imm = 0x7FFFFFFFFFFFFFFF +; SSE41-NEXT: addq %r8, %rsi +; SSE41-NEXT: addq %rax, %rcx +; SSE41-NEXT: cmovoq %rsi, %rcx +; SSE41-NEXT: pextrq $1, %xmm1, %r9 +; SSE41-NEXT: pextrq $1, %xmm0, %rax +; SSE41-NEXT: xorl %edi, %edi +; SSE41-NEXT: movq %rax, %rsi +; SSE41-NEXT: addq %r9, %rsi +; SSE41-NEXT: setns %dil +; SSE41-NEXT: addq %r8, %rdi +; SSE41-NEXT: addq %r9, %rax +; SSE41-NEXT: cmovoq %rdi, %rax +; SSE41-NEXT: movq %rax, %xmm0 +; SSE41-NEXT: movq %rcx, %xmm1 +; SSE41-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] +; SSE41-NEXT: pshufb {{.*#+}} xmm1 = xmm1[7,15,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; SSE41-NEXT: pextrw $0, %xmm1, (%rdx) +; SSE41-NEXT: retq +; +; AVX1-LABEL: v2i8: +; AVX1: # %bb.0: +; AVX1-NEXT: vpmovzxbq {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero +; AVX1-NEXT: vpmovzxbq {{.*#+}} xmm1 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero +; AVX1-NEXT: vpsllq $56, %xmm1, %xmm1 +; AVX1-NEXT: vmovq %xmm1, %rax +; AVX1-NEXT: vpsllq $56, %xmm0, %xmm0 +; AVX1-NEXT: vmovq %xmm0, %rcx +; AVX1-NEXT: xorl %esi, %esi +; AVX1-NEXT: movq %rcx, %rdi +; AVX1-NEXT: addq %rax, %rdi +; AVX1-NEXT: setns %sil +; AVX1-NEXT: movabsq $9223372036854775807, %r8 # imm = 0x7FFFFFFFFFFFFFFF +; AVX1-NEXT: addq %r8, %rsi +; AVX1-NEXT: addq %rax, %rcx +; AVX1-NEXT: cmovoq %rsi, %rcx +; AVX1-NEXT: vpextrq $1, %xmm1, %r9 +; AVX1-NEXT: vpextrq $1, %xmm0, %rax +; AVX1-NEXT: xorl %edi, %edi +; AVX1-NEXT: movq %rax, %rsi +; AVX1-NEXT: addq %r9, %rsi +; AVX1-NEXT: setns %dil +; AVX1-NEXT: addq %r8, %rdi +; AVX1-NEXT: addq %r9, %rax +; AVX1-NEXT: cmovoq %rdi, %rax +; AVX1-NEXT: vmovq %rax, %xmm0 +; AVX1-NEXT: vmovq %rcx, %xmm1 +; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[7,15,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX1-NEXT: vpextrw $0, %xmm0, (%rdx) +; AVX1-NEXT: retq +; +; AVX2-LABEL: v2i8: +; AVX2: # %bb.0: +; AVX2-NEXT: vpmovzxbq {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero +; AVX2-NEXT: vpmovzxbq {{.*#+}} xmm1 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero +; AVX2-NEXT: vpsllq $56, %xmm1, %xmm1 +; AVX2-NEXT: vmovq %xmm1, %rax +; AVX2-NEXT: vpsllq $56, %xmm0, %xmm0 +; AVX2-NEXT: vmovq %xmm0, %rcx +; AVX2-NEXT: xorl %esi, %esi +; AVX2-NEXT: movq %rcx, %rdi +; AVX2-NEXT: addq %rax, %rdi +; AVX2-NEXT: setns %sil +; AVX2-NEXT: movabsq $9223372036854775807, %r8 # imm = 0x7FFFFFFFFFFFFFFF +; AVX2-NEXT: addq %r8, %rsi +; AVX2-NEXT: addq %rax, %rcx +; AVX2-NEXT: cmovoq %rsi, %rcx +; AVX2-NEXT: vpextrq $1, %xmm1, %r9 +; AVX2-NEXT: vpextrq $1, %xmm0, %rax +; AVX2-NEXT: xorl %edi, %edi +; AVX2-NEXT: movq %rax, %rsi +; AVX2-NEXT: addq %r9, %rsi +; AVX2-NEXT: setns %dil +; AVX2-NEXT: addq %r8, %rdi +; AVX2-NEXT: addq %r9, %rax +; AVX2-NEXT: cmovoq %rdi, %rax +; AVX2-NEXT: vmovq %rax, %xmm0 +; AVX2-NEXT: vmovq %rcx, %xmm1 +; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[7,15,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX2-NEXT: vpextrw $0, %xmm0, (%rdx) +; AVX2-NEXT: retq +; +; AVX512-LABEL: v2i8: +; AVX512: # %bb.0: +; AVX512-NEXT: movzwl (%rdi), %eax +; AVX512-NEXT: vmovd %eax, %xmm0 +; AVX512-NEXT: movzwl (%rsi), %eax +; AVX512-NEXT: 
vmovd %eax, %xmm1 +; AVX512-NEXT: vmovdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,1] +; AVX512-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX512-NEXT: vmovq %xmm1, %rax +; AVX512-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX512-NEXT: vmovq %xmm0, %rcx +; AVX512-NEXT: xorl %esi, %esi +; AVX512-NEXT: movq %rcx, %rdi +; AVX512-NEXT: addq %rax, %rdi +; AVX512-NEXT: setns %sil +; AVX512-NEXT: movabsq $9223372036854775807, %r8 # imm = 0x7FFFFFFFFFFFFFFF +; AVX512-NEXT: addq %r8, %rsi +; AVX512-NEXT: addq %rax, %rcx +; AVX512-NEXT: cmovoq %rsi, %rcx +; AVX512-NEXT: vpextrq $1, %xmm1, %r9 +; AVX512-NEXT: vpextrq $1, %xmm0, %rax +; AVX512-NEXT: xorl %edi, %edi +; AVX512-NEXT: movq %rax, %rsi +; AVX512-NEXT: addq %r9, %rsi +; AVX512-NEXT: setns %dil +; AVX512-NEXT: addq %r8, %rdi +; AVX512-NEXT: addq %r9, %rax +; AVX512-NEXT: cmovoq %rdi, %rax +; AVX512-NEXT: vmovq %rax, %xmm0 +; AVX512-NEXT: vmovq %rcx, %xmm1 +; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; AVX512-NEXT: vpsrlq $56, %xmm0, %xmm0 +; AVX512-NEXT: vpmovqb %xmm0, (%rdx) +; AVX512-NEXT: retq + %x = load <2 x i8>, <2 x i8>* %px + %y = load <2 x i8>, <2 x i8>* %py + %z = call <2 x i8> @llvm.sadd.sat.v2i8(<2 x i8> %x, <2 x i8> %y) + store <2 x i8> %z, <2 x i8>* %pz + ret void +} + +define void @v4i16(<4 x i16>* %px, <4 x i16>* %py, <4 x i16>* %pz) nounwind { +; SSE2-LABEL: v4i16: +; SSE2: # %bb.0: +; SSE2-NEXT: movq {{.*#+}} xmm2 = mem[0],zero +; SSE2-NEXT: movq {{.*#+}} xmm3 = mem[0],zero +; SSE2-NEXT: pxor %xmm0, %xmm0 +; SSE2-NEXT: pxor %xmm1, %xmm1 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3] +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,1,2,3] +; SSE2-NEXT: movd %xmm3, %ecx +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,2,3] +; SSE2-NEXT: movd %xmm2, %r8d +; SSE2-NEXT: xorl %esi, %esi +; SSE2-NEXT: movl %r8d, %edi +; SSE2-NEXT: addl %ecx, %edi +; SSE2-NEXT: setns %sil +; SSE2-NEXT: addl $2147483647, %esi # imm = 0x7FFFFFFF +; SSE2-NEXT: addl %ecx, %r8d +; SSE2-NEXT: cmovol %esi, %r8d +; SSE2-NEXT: movd %xmm1, %esi +; SSE2-NEXT: movd %xmm0, %r10d +; SSE2-NEXT: xorl %edi, %edi +; SSE2-NEXT: movl %r10d, %ecx +; SSE2-NEXT: addl %esi, %ecx +; SSE2-NEXT: setns %dil +; SSE2-NEXT: addl $2147483647, %edi # imm = 0x7FFFFFFF +; SSE2-NEXT: addl %esi, %r10d +; SSE2-NEXT: cmovol %edi, %r10d +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1] +; SSE2-NEXT: movd %xmm2, %r9d +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1] +; SSE2-NEXT: movd %xmm2, %ecx +; SSE2-NEXT: xorl %edi, %edi +; SSE2-NEXT: movl %ecx, %esi +; SSE2-NEXT: addl %r9d, %esi +; SSE2-NEXT: setns %dil +; SSE2-NEXT: addl $2147483647, %edi # imm = 0x7FFFFFFF +; SSE2-NEXT: addl %r9d, %ecx +; SSE2-NEXT: cmovol %edi, %ecx +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,1,2,3] +; SSE2-NEXT: movd %xmm1, %r9d +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3] +; SSE2-NEXT: movd %xmm0, %eax +; SSE2-NEXT: xorl %esi, %esi +; SSE2-NEXT: movl %eax, %edi +; SSE2-NEXT: addl %r9d, %edi +; SSE2-NEXT: setns %sil +; SSE2-NEXT: addl $2147483647, %esi # imm = 0x7FFFFFFF +; SSE2-NEXT: addl %r9d, %eax +; SSE2-NEXT: cmovol %esi, %eax +; SSE2-NEXT: movd %eax, %xmm0 +; SSE2-NEXT: movd %ecx, %xmm1 +; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE2-NEXT: movd %r10d, %xmm0 +; SSE2-NEXT: movd %r8d, %xmm2 +; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; SSE2-NEXT: 
punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSE2-NEXT: psrld $16, %xmm0 +; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] +; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; SSE2-NEXT: movq %xmm0, (%rdx) +; SSE2-NEXT: retq +; +; SSSE3-LABEL: v4i16: +; SSSE3: # %bb.0: +; SSSE3-NEXT: movq {{.*#+}} xmm2 = mem[0],zero +; SSSE3-NEXT: movq {{.*#+}} xmm3 = mem[0],zero +; SSSE3-NEXT: pxor %xmm0, %xmm0 +; SSSE3-NEXT: pxor %xmm1, %xmm1 +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3] +; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,1,2,3] +; SSSE3-NEXT: movd %xmm3, %ecx +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,2,3] +; SSSE3-NEXT: movd %xmm2, %r8d +; SSSE3-NEXT: xorl %esi, %esi +; SSSE3-NEXT: movl %r8d, %edi +; SSSE3-NEXT: addl %ecx, %edi +; SSSE3-NEXT: setns %sil +; SSSE3-NEXT: addl $2147483647, %esi # imm = 0x7FFFFFFF +; SSSE3-NEXT: addl %ecx, %r8d +; SSSE3-NEXT: cmovol %esi, %r8d +; SSSE3-NEXT: movd %xmm1, %esi +; SSSE3-NEXT: movd %xmm0, %r10d +; SSSE3-NEXT: xorl %edi, %edi +; SSSE3-NEXT: movl %r10d, %ecx +; SSSE3-NEXT: addl %esi, %ecx +; SSSE3-NEXT: setns %dil +; SSSE3-NEXT: addl $2147483647, %edi # imm = 0x7FFFFFFF +; SSSE3-NEXT: addl %esi, %r10d +; SSSE3-NEXT: cmovol %edi, %r10d +; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1] +; SSSE3-NEXT: movd %xmm2, %r9d +; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1] +; SSSE3-NEXT: movd %xmm2, %ecx +; SSSE3-NEXT: xorl %edi, %edi +; SSSE3-NEXT: movl %ecx, %esi +; SSSE3-NEXT: addl %r9d, %esi +; SSSE3-NEXT: setns %dil +; SSSE3-NEXT: addl $2147483647, %edi # imm = 0x7FFFFFFF +; SSSE3-NEXT: addl %r9d, %ecx +; SSSE3-NEXT: cmovol %edi, %ecx +; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,1,2,3] +; SSSE3-NEXT: movd %xmm1, %r9d +; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3] +; SSSE3-NEXT: movd %xmm0, %eax +; SSSE3-NEXT: xorl %esi, %esi +; SSSE3-NEXT: movl %eax, %edi +; SSSE3-NEXT: addl %r9d, %edi +; SSSE3-NEXT: setns %sil +; SSSE3-NEXT: addl $2147483647, %esi # imm = 0x7FFFFFFF +; SSSE3-NEXT: addl %r9d, %eax +; SSSE3-NEXT: cmovol %esi, %eax +; SSSE3-NEXT: movd %eax, %xmm0 +; SSSE3-NEXT: movd %ecx, %xmm1 +; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSSE3-NEXT: movd %r10d, %xmm0 +; SSSE3-NEXT: movd %r8d, %xmm2 +; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[2,3,6,7,10,11,14,15,10,11,14,15,14,15],zero,zero +; SSSE3-NEXT: movq %xmm0, (%rdx) +; SSSE3-NEXT: retq +; +; SSE41-LABEL: v4i16: +; SSE41: # %bb.0: +; SSE41-NEXT: movq {{.*#+}} xmm2 = mem[0],zero +; SSE41-NEXT: movq {{.*#+}} xmm3 = mem[0],zero +; SSE41-NEXT: pxor %xmm0, %xmm0 +; SSE41-NEXT: pxor %xmm1, %xmm1 +; SSE41-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3] +; SSE41-NEXT: pextrd $3, %xmm1, %ecx +; SSE41-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; SSE41-NEXT: pextrd $3, %xmm0, %r8d +; SSE41-NEXT: xorl %esi, %esi +; SSE41-NEXT: movl %r8d, %edi +; SSE41-NEXT: addl %ecx, %edi +; SSE41-NEXT: setns %sil +; SSE41-NEXT: addl $2147483647, %esi # imm = 0x7FFFFFFF +; SSE41-NEXT: addl %ecx, %r8d +; SSE41-NEXT: cmovol %esi, %r8d +; SSE41-NEXT: pextrd $2, %xmm1, %esi +; SSE41-NEXT: pextrd $2, %xmm0, %r10d +; SSE41-NEXT: xorl 
%edi, %edi +; SSE41-NEXT: movl %r10d, %ecx +; SSE41-NEXT: addl %esi, %ecx +; SSE41-NEXT: setns %dil +; SSE41-NEXT: addl $2147483647, %edi # imm = 0x7FFFFFFF +; SSE41-NEXT: addl %esi, %r10d +; SSE41-NEXT: cmovol %edi, %r10d +; SSE41-NEXT: movd %xmm1, %r9d +; SSE41-NEXT: movd %xmm0, %ecx +; SSE41-NEXT: xorl %edi, %edi +; SSE41-NEXT: movl %ecx, %esi +; SSE41-NEXT: addl %r9d, %esi +; SSE41-NEXT: setns %dil +; SSE41-NEXT: addl $2147483647, %edi # imm = 0x7FFFFFFF +; SSE41-NEXT: addl %r9d, %ecx +; SSE41-NEXT: cmovol %edi, %ecx +; SSE41-NEXT: pextrd $1, %xmm1, %r9d +; SSE41-NEXT: pextrd $1, %xmm0, %eax +; SSE41-NEXT: xorl %esi, %esi +; SSE41-NEXT: movl %eax, %edi +; SSE41-NEXT: addl %r9d, %edi +; SSE41-NEXT: setns %sil +; SSE41-NEXT: addl $2147483647, %esi # imm = 0x7FFFFFFF +; SSE41-NEXT: addl %r9d, %eax +; SSE41-NEXT: cmovol %esi, %eax +; SSE41-NEXT: movd %ecx, %xmm0 +; SSE41-NEXT: pinsrd $1, %eax, %xmm0 +; SSE41-NEXT: pinsrd $2, %r10d, %xmm0 +; SSE41-NEXT: pinsrd $3, %r8d, %xmm0 +; SSE41-NEXT: psrld $16, %xmm0 +; SSE41-NEXT: packusdw %xmm0, %xmm0 +; SSE41-NEXT: movq %xmm0, (%rdx) +; SSE41-NEXT: retq +; +; AVX1-LABEL: v4i16: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero +; AVX1-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] +; AVX1-NEXT: vpextrd $3, %xmm0, %ecx +; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; AVX1-NEXT: vpextrd $3, %xmm1, %r9d +; AVX1-NEXT: xorl %esi, %esi +; AVX1-NEXT: movl %r9d, %edi +; AVX1-NEXT: addl %ecx, %edi +; AVX1-NEXT: setns %sil +; AVX1-NEXT: addl $2147483647, %esi # imm = 0x7FFFFFFF +; AVX1-NEXT: addl %ecx, %r9d +; AVX1-NEXT: cmovol %esi, %r9d +; AVX1-NEXT: vpextrd $2, %xmm0, %r8d +; AVX1-NEXT: vpextrd $2, %xmm1, %r10d +; AVX1-NEXT: xorl %edi, %edi +; AVX1-NEXT: movl %r10d, %ecx +; AVX1-NEXT: addl %r8d, %ecx +; AVX1-NEXT: setns %dil +; AVX1-NEXT: addl $2147483647, %edi # imm = 0x7FFFFFFF +; AVX1-NEXT: addl %r8d, %r10d +; AVX1-NEXT: cmovol %edi, %r10d +; AVX1-NEXT: vmovd %xmm0, %r8d +; AVX1-NEXT: vmovd %xmm1, %eax +; AVX1-NEXT: xorl %ecx, %ecx +; AVX1-NEXT: movl %eax, %edi +; AVX1-NEXT: addl %r8d, %edi +; AVX1-NEXT: setns %cl +; AVX1-NEXT: addl $2147483647, %ecx # imm = 0x7FFFFFFF +; AVX1-NEXT: addl %r8d, %eax +; AVX1-NEXT: cmovol %ecx, %eax +; AVX1-NEXT: vpextrd $1, %xmm0, %r8d +; AVX1-NEXT: vpextrd $1, %xmm1, %esi +; AVX1-NEXT: xorl %ecx, %ecx +; AVX1-NEXT: movl %esi, %edi +; AVX1-NEXT: addl %r8d, %edi +; AVX1-NEXT: setns %cl +; AVX1-NEXT: addl $2147483647, %ecx # imm = 0x7FFFFFFF +; AVX1-NEXT: addl %r8d, %esi +; AVX1-NEXT: cmovol %ecx, %esi +; AVX1-NEXT: vmovd %eax, %xmm0 +; AVX1-NEXT: vpinsrd $1, %esi, %xmm0, %xmm0 +; AVX1-NEXT: vpinsrd $2, %r10d, %xmm0, %xmm0 +; AVX1-NEXT: vpinsrd $3, %r9d, %xmm0, %xmm0 +; AVX1-NEXT: vpsrld $16, %xmm0, %xmm0 +; AVX1-NEXT: vpackusdw %xmm0, %xmm0, %xmm0 +; AVX1-NEXT: vmovq %xmm0, (%rdx) +; AVX1-NEXT: retq +; +; AVX2-LABEL: v4i16: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero +; AVX2-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] +; AVX2-NEXT: vpextrd $3, %xmm0, %ecx +; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; AVX2-NEXT: vpextrd $3, %xmm1, %r9d +; AVX2-NEXT: xorl %esi, %esi +; AVX2-NEXT: movl %r9d, 
%edi +; AVX2-NEXT: addl %ecx, %edi +; AVX2-NEXT: setns %sil +; AVX2-NEXT: addl $2147483647, %esi # imm = 0x7FFFFFFF +; AVX2-NEXT: addl %ecx, %r9d +; AVX2-NEXT: cmovol %esi, %r9d +; AVX2-NEXT: vpextrd $2, %xmm0, %r8d +; AVX2-NEXT: vpextrd $2, %xmm1, %r10d +; AVX2-NEXT: xorl %edi, %edi +; AVX2-NEXT: movl %r10d, %ecx +; AVX2-NEXT: addl %r8d, %ecx +; AVX2-NEXT: setns %dil +; AVX2-NEXT: addl $2147483647, %edi # imm = 0x7FFFFFFF +; AVX2-NEXT: addl %r8d, %r10d +; AVX2-NEXT: cmovol %edi, %r10d +; AVX2-NEXT: vmovd %xmm0, %r8d +; AVX2-NEXT: vmovd %xmm1, %eax +; AVX2-NEXT: xorl %ecx, %ecx +; AVX2-NEXT: movl %eax, %edi +; AVX2-NEXT: addl %r8d, %edi +; AVX2-NEXT: setns %cl +; AVX2-NEXT: addl $2147483647, %ecx # imm = 0x7FFFFFFF +; AVX2-NEXT: addl %r8d, %eax +; AVX2-NEXT: cmovol %ecx, %eax +; AVX2-NEXT: vpextrd $1, %xmm0, %r8d +; AVX2-NEXT: vpextrd $1, %xmm1, %esi +; AVX2-NEXT: xorl %ecx, %ecx +; AVX2-NEXT: movl %esi, %edi +; AVX2-NEXT: addl %r8d, %edi +; AVX2-NEXT: setns %cl +; AVX2-NEXT: addl $2147483647, %ecx # imm = 0x7FFFFFFF +; AVX2-NEXT: addl %r8d, %esi +; AVX2-NEXT: cmovol %ecx, %esi +; AVX2-NEXT: vmovd %eax, %xmm0 +; AVX2-NEXT: vpinsrd $1, %esi, %xmm0, %xmm0 +; AVX2-NEXT: vpinsrd $2, %r10d, %xmm0, %xmm0 +; AVX2-NEXT: vpinsrd $3, %r9d, %xmm0, %xmm0 +; AVX2-NEXT: vpsrld $16, %xmm0, %xmm0 +; AVX2-NEXT: vpackusdw %xmm0, %xmm0, %xmm0 +; AVX2-NEXT: vmovq %xmm0, (%rdx) +; AVX2-NEXT: retq +; +; AVX512-LABEL: v4i16: +; AVX512: # %bb.0: +; AVX512-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero +; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; AVX512-NEXT: vmovdqa {{.*#+}} xmm2 = [255,255,0,1,255,255,2,3,255,255,4,5,255,255,6,7] +; AVX512-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX512-NEXT: vpextrd $3, %xmm0, %ecx +; AVX512-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX512-NEXT: vpextrd $3, %xmm1, %r9d +; AVX512-NEXT: xorl %esi, %esi +; AVX512-NEXT: movl %r9d, %edi +; AVX512-NEXT: addl %ecx, %edi +; AVX512-NEXT: setns %sil +; AVX512-NEXT: addl $2147483647, %esi # imm = 0x7FFFFFFF +; AVX512-NEXT: addl %ecx, %r9d +; AVX512-NEXT: cmovol %esi, %r9d +; AVX512-NEXT: vpextrd $2, %xmm0, %r8d +; AVX512-NEXT: vpextrd $2, %xmm1, %r10d +; AVX512-NEXT: xorl %edi, %edi +; AVX512-NEXT: movl %r10d, %ecx +; AVX512-NEXT: addl %r8d, %ecx +; AVX512-NEXT: setns %dil +; AVX512-NEXT: addl $2147483647, %edi # imm = 0x7FFFFFFF +; AVX512-NEXT: addl %r8d, %r10d +; AVX512-NEXT: cmovol %edi, %r10d +; AVX512-NEXT: vmovd %xmm0, %r8d +; AVX512-NEXT: vmovd %xmm1, %eax +; AVX512-NEXT: xorl %ecx, %ecx +; AVX512-NEXT: movl %eax, %edi +; AVX512-NEXT: addl %r8d, %edi +; AVX512-NEXT: setns %cl +; AVX512-NEXT: addl $2147483647, %ecx # imm = 0x7FFFFFFF +; AVX512-NEXT: addl %r8d, %eax +; AVX512-NEXT: cmovol %ecx, %eax +; AVX512-NEXT: vpextrd $1, %xmm0, %r8d +; AVX512-NEXT: vpextrd $1, %xmm1, %esi +; AVX512-NEXT: xorl %ecx, %ecx +; AVX512-NEXT: movl %esi, %edi +; AVX512-NEXT: addl %r8d, %edi +; AVX512-NEXT: setns %cl +; AVX512-NEXT: addl $2147483647, %ecx # imm = 0x7FFFFFFF +; AVX512-NEXT: addl %r8d, %esi +; AVX512-NEXT: cmovol %ecx, %esi +; AVX512-NEXT: vmovd %eax, %xmm0 +; AVX512-NEXT: vpinsrd $1, %esi, %xmm0, %xmm0 +; AVX512-NEXT: vpinsrd $2, %r10d, %xmm0, %xmm0 +; AVX512-NEXT: vpinsrd $3, %r9d, %xmm0, %xmm0 +; AVX512-NEXT: vpsrld $16, %xmm0, %xmm0 +; AVX512-NEXT: vpmovdw %xmm0, (%rdx) +; AVX512-NEXT: retq + %x = load <4 x i16>, <4 x i16>* %px + %y = load <4 x i16>, <4 x i16>* %py + %z = call <4 x i16> @llvm.sadd.sat.v4i16(<4 x i16> %x, <4 x i16> %y) + store <4 x i16> %z, <4 x i16>* %pz + ret void +} + +define void @v2i16(<2 x i16>* %px, <2 x i16>* %py, <2 x 
i16>* %pz) nounwind { +; SSE2-LABEL: v2i16: +; SSE2: # %bb.0: +; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3] +; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,6,7] +; SSE2-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,3] +; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,6,7] +; SSE2-NEXT: psllq $48, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1] +; SSE2-NEXT: movq %xmm2, %rax +; SSE2-NEXT: psllq $48, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1] +; SSE2-NEXT: movq %xmm2, %rcx +; SSE2-NEXT: xorl %esi, %esi +; SSE2-NEXT: movq %rcx, %rdi +; SSE2-NEXT: addq %rax, %rdi +; SSE2-NEXT: setns %sil +; SSE2-NEXT: movabsq $9223372036854775807, %r8 # imm = 0x7FFFFFFFFFFFFFFF +; SSE2-NEXT: addq %r8, %rsi +; SSE2-NEXT: addq %rax, %rcx +; SSE2-NEXT: cmovoq %rsi, %rcx +; SSE2-NEXT: movq %xmm1, %r9 +; SSE2-NEXT: movq %xmm0, %rax +; SSE2-NEXT: xorl %edi, %edi +; SSE2-NEXT: movq %rax, %rsi +; SSE2-NEXT: addq %r9, %rsi +; SSE2-NEXT: setns %dil +; SSE2-NEXT: addq %r8, %rdi +; SSE2-NEXT: addq %r9, %rax +; SSE2-NEXT: cmovoq %rdi, %rax +; SSE2-NEXT: movq %rax, %xmm0 +; SSE2-NEXT: movq %rcx, %xmm1 +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSE2-NEXT: psrlq $48, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] +; SSE2-NEXT: movd %xmm0, (%rdx) +; SSE2-NEXT: retq +; +; SSSE3-LABEL: v2i16: +; SSSE3: # %bb.0: +; SSSE3-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; SSSE3-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,0,1,255,255,255,255,255,255,2,3] +; SSSE3-NEXT: pshufb %xmm2, %xmm1 +; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,3,0,1] +; SSSE3-NEXT: movq %xmm3, %rax +; SSSE3-NEXT: pshufb %xmm2, %xmm0 +; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1] +; SSSE3-NEXT: movq %xmm2, %rcx +; SSSE3-NEXT: xorl %esi, %esi +; SSSE3-NEXT: movq %rcx, %rdi +; SSSE3-NEXT: addq %rax, %rdi +; SSSE3-NEXT: setns %sil +; SSSE3-NEXT: movabsq $9223372036854775807, %r8 # imm = 0x7FFFFFFFFFFFFFFF +; SSSE3-NEXT: addq %r8, %rsi +; SSSE3-NEXT: addq %rax, %rcx +; SSSE3-NEXT: cmovoq %rsi, %rcx +; SSSE3-NEXT: movq %xmm1, %r9 +; SSSE3-NEXT: movq %xmm0, %rax +; SSSE3-NEXT: xorl %edi, %edi +; SSSE3-NEXT: movq %rax, %rsi +; SSSE3-NEXT: addq %r9, %rsi +; SSSE3-NEXT: setns %dil +; SSSE3-NEXT: addq %r8, %rdi +; SSSE3-NEXT: addq %r9, %rax +; SSSE3-NEXT: cmovoq %rdi, %rax +; SSSE3-NEXT: movq %rax, %xmm0 +; SSSE3-NEXT: movq %rcx, %xmm1 +; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[6,7,14,15,14,15],zero,zero,xmm0[14,15],zero,zero,zero,zero,zero,zero +; SSSE3-NEXT: movd %xmm0, (%rdx) +; SSSE3-NEXT: retq +; +; SSE41-LABEL: v2i16: +; SSE41: # %bb.0: +; SSE41-NEXT: pmovzxwq {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero +; SSE41-NEXT: pmovzxwq {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero +; SSE41-NEXT: psllq $48, %xmm1 +; SSE41-NEXT: movq %xmm1, %rax +; SSE41-NEXT: psllq $48, %xmm0 +; SSE41-NEXT: movq %xmm0, %rcx +; SSE41-NEXT: xorl %esi, %esi +; SSE41-NEXT: movq %rcx, %rdi +; SSE41-NEXT: addq %rax, %rdi +; SSE41-NEXT: setns %sil +; SSE41-NEXT: movabsq $9223372036854775807, %r8 # imm = 0x7FFFFFFFFFFFFFFF +; SSE41-NEXT: addq %r8, %rsi +; SSE41-NEXT: addq %rax, %rcx +; SSE41-NEXT: cmovoq %rsi, %rcx +; SSE41-NEXT: pextrq $1, %xmm1, %r9 +; SSE41-NEXT: pextrq $1, %xmm0, %rax +; SSE41-NEXT: xorl 
%edi, %edi +; SSE41-NEXT: movq %rax, %rsi +; SSE41-NEXT: addq %r9, %rsi +; SSE41-NEXT: setns %dil +; SSE41-NEXT: addq %r8, %rdi +; SSE41-NEXT: addq %r9, %rax +; SSE41-NEXT: cmovoq %rdi, %rax +; SSE41-NEXT: movq %rax, %xmm0 +; SSE41-NEXT: movq %rcx, %xmm1 +; SSE41-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] +; SSE41-NEXT: pshufb {{.*#+}} xmm1 = xmm1[6,7,14,15,14,15],zero,zero,xmm1[14,15],zero,zero,zero,zero,zero,zero +; SSE41-NEXT: movd %xmm1, (%rdx) +; SSE41-NEXT: retq +; +; AVX1-LABEL: v2i16: +; AVX1: # %bb.0: +; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero +; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero +; AVX1-NEXT: vpsllq $48, %xmm1, %xmm1 +; AVX1-NEXT: vmovq %xmm1, %rax +; AVX1-NEXT: vpsllq $48, %xmm0, %xmm0 +; AVX1-NEXT: vmovq %xmm0, %rcx +; AVX1-NEXT: xorl %esi, %esi +; AVX1-NEXT: movq %rcx, %rdi +; AVX1-NEXT: addq %rax, %rdi +; AVX1-NEXT: setns %sil +; AVX1-NEXT: movabsq $9223372036854775807, %r8 # imm = 0x7FFFFFFFFFFFFFFF +; AVX1-NEXT: addq %r8, %rsi +; AVX1-NEXT: addq %rax, %rcx +; AVX1-NEXT: cmovoq %rsi, %rcx +; AVX1-NEXT: vpextrq $1, %xmm1, %r9 +; AVX1-NEXT: vpextrq $1, %xmm0, %rax +; AVX1-NEXT: xorl %edi, %edi +; AVX1-NEXT: movq %rax, %rsi +; AVX1-NEXT: addq %r9, %rsi +; AVX1-NEXT: setns %dil +; AVX1-NEXT: addq %r8, %rdi +; AVX1-NEXT: addq %r9, %rax +; AVX1-NEXT: cmovoq %rdi, %rax +; AVX1-NEXT: vmovq %rax, %xmm0 +; AVX1-NEXT: vmovq %rcx, %xmm1 +; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[6,7,14,15,14,15],zero,zero,xmm0[14,15],zero,zero,zero,zero,zero,zero +; AVX1-NEXT: vmovd %xmm0, (%rdx) +; AVX1-NEXT: retq +; +; AVX2-LABEL: v2i16: +; AVX2: # %bb.0: +; AVX2-NEXT: vpmovzxwq {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero +; AVX2-NEXT: vpmovzxwq {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero +; AVX2-NEXT: vpsllq $48, %xmm1, %xmm1 +; AVX2-NEXT: vmovq %xmm1, %rax +; AVX2-NEXT: vpsllq $48, %xmm0, %xmm0 +; AVX2-NEXT: vmovq %xmm0, %rcx +; AVX2-NEXT: xorl %esi, %esi +; AVX2-NEXT: movq %rcx, %rdi +; AVX2-NEXT: addq %rax, %rdi +; AVX2-NEXT: setns %sil +; AVX2-NEXT: movabsq $9223372036854775807, %r8 # imm = 0x7FFFFFFFFFFFFFFF +; AVX2-NEXT: addq %r8, %rsi +; AVX2-NEXT: addq %rax, %rcx +; AVX2-NEXT: cmovoq %rsi, %rcx +; AVX2-NEXT: vpextrq $1, %xmm1, %r9 +; AVX2-NEXT: vpextrq $1, %xmm0, %rax +; AVX2-NEXT: xorl %edi, %edi +; AVX2-NEXT: movq %rax, %rsi +; AVX2-NEXT: addq %r9, %rsi +; AVX2-NEXT: setns %dil +; AVX2-NEXT: addq %r8, %rdi +; AVX2-NEXT: addq %r9, %rax +; AVX2-NEXT: cmovoq %rdi, %rax +; AVX2-NEXT: vmovq %rax, %xmm0 +; AVX2-NEXT: vmovq %rcx, %xmm1 +; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[6,7,14,15,14,15],zero,zero,xmm0[14,15],zero,zero,zero,zero,zero,zero +; AVX2-NEXT: vmovd %xmm0, (%rdx) +; AVX2-NEXT: retq +; +; AVX512-LABEL: v2i16: +; AVX512: # %bb.0: +; AVX512-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX512-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; AVX512-NEXT: vmovdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,0,1,255,255,255,255,255,255,2,3] +; AVX512-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX512-NEXT: vmovq %xmm1, %rax +; AVX512-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX512-NEXT: vmovq %xmm0, %rcx +; AVX512-NEXT: xorl %esi, %esi +; AVX512-NEXT: movq %rcx, %rdi +; AVX512-NEXT: addq %rax, %rdi +; AVX512-NEXT: setns %sil +; AVX512-NEXT: movabsq $9223372036854775807, %r8 # imm = 0x7FFFFFFFFFFFFFFF +; AVX512-NEXT: addq %r8, %rsi +; AVX512-NEXT: addq 
%rax, %rcx +; AVX512-NEXT: cmovoq %rsi, %rcx +; AVX512-NEXT: vpextrq $1, %xmm1, %r9 +; AVX512-NEXT: vpextrq $1, %xmm0, %rax +; AVX512-NEXT: xorl %edi, %edi +; AVX512-NEXT: movq %rax, %rsi +; AVX512-NEXT: addq %r9, %rsi +; AVX512-NEXT: setns %dil +; AVX512-NEXT: addq %r8, %rdi +; AVX512-NEXT: addq %r9, %rax +; AVX512-NEXT: cmovoq %rdi, %rax +; AVX512-NEXT: vmovq %rax, %xmm0 +; AVX512-NEXT: vmovq %rcx, %xmm1 +; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; AVX512-NEXT: vpsrlq $48, %xmm0, %xmm0 +; AVX512-NEXT: vpmovqw %xmm0, (%rdx) +; AVX512-NEXT: retq + %x = load <2 x i16>, <2 x i16>* %px + %y = load <2 x i16>, <2 x i16>* %py + %z = call <2 x i16> @llvm.sadd.sat.v2i16(<2 x i16> %x, <2 x i16> %y) + store <2 x i16> %z, <2 x i16>* %pz + ret void +} + +define <12 x i8> @v12i8(<12 x i8> %x, <12 x i8> %y) nounwind { +; SSE2-LABEL: v12i8: +; SSE2: # %bb.0: +; SSE2-NEXT: pushq %rbp +; SSE2-NEXT: pushq %r15 +; SSE2-NEXT: pushq %r14 +; SSE2-NEXT: pushq %r13 +; SSE2-NEXT: pushq %r12 +; SSE2-NEXT: pushq %rbx +; SSE2-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) +; SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %r9b +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %sil +; SSE2-NEXT: movl %r9d, %eax +; SSE2-NEXT: addb %cl, %al +; SSE2-NEXT: setns %al +; SSE2-NEXT: addb %cl, %r9b +; SSE2-NEXT: jno .LBB11_2 +; SSE2-NEXT: # %bb.1: +; SSE2-NEXT: addb $127, %al +; SSE2-NEXT: movl %eax, %r9d +; SSE2-NEXT: .LBB11_2: +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSE2-NEXT: movl %esi, %eax +; SSE2-NEXT: addb %cl, %al +; SSE2-NEXT: setns %al +; SSE2-NEXT: addb %cl, %sil +; SSE2-NEXT: jno .LBB11_4 +; SSE2-NEXT: # %bb.3: +; SSE2-NEXT: addb $127, %al +; SSE2-NEXT: movl %eax, %esi +; SSE2-NEXT: .LBB11_4: +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %bl +; SSE2-NEXT: movl %ebx, %eax +; SSE2-NEXT: addb %cl, %al +; SSE2-NEXT: setns %al +; SSE2-NEXT: addb %cl, %bl +; SSE2-NEXT: jno .LBB11_6 +; SSE2-NEXT: # %bb.5: +; SSE2-NEXT: addb $127, %al +; SSE2-NEXT: movl %eax, %ebx +; SSE2-NEXT: .LBB11_6: +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %dl +; SSE2-NEXT: movl %edx, %eax +; SSE2-NEXT: addb %cl, %al +; SSE2-NEXT: setns %al +; SSE2-NEXT: addb %cl, %dl +; SSE2-NEXT: jno .LBB11_8 +; SSE2-NEXT: # %bb.7: +; SSE2-NEXT: addb $127, %al +; SSE2-NEXT: movl %eax, %edx +; SSE2-NEXT: .LBB11_8: +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %r10b +; SSE2-NEXT: movl %r10d, %eax +; SSE2-NEXT: addb %cl, %al +; SSE2-NEXT: setns %al +; SSE2-NEXT: addb %cl, %r10b +; SSE2-NEXT: jno .LBB11_10 +; SSE2-NEXT: # %bb.9: +; SSE2-NEXT: addb $127, %al +; SSE2-NEXT: movl %eax, %r10d +; SSE2-NEXT: .LBB11_10: +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %r11b +; SSE2-NEXT: movl %r11d, %eax +; SSE2-NEXT: addb %cl, %al +; SSE2-NEXT: setns %al +; SSE2-NEXT: addb %cl, %r11b +; SSE2-NEXT: jno .LBB11_12 +; SSE2-NEXT: # %bb.11: +; SSE2-NEXT: addb $127, %al +; SSE2-NEXT: movl %eax, %r11d +; SSE2-NEXT: .LBB11_12: +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %bpl +; SSE2-NEXT: movl %ebp, %eax +; SSE2-NEXT: addb %cl, %al +; SSE2-NEXT: setns %al +; SSE2-NEXT: addb %cl, %bpl +; SSE2-NEXT: jno .LBB11_14 +; SSE2-NEXT: # %bb.13: +; SSE2-NEXT: addb $127, %al +; SSE2-NEXT: movl %eax, %ebp +; SSE2-NEXT: .LBB11_14: +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %r14b +; SSE2-NEXT: movl %r14d, %eax +; 
SSE2-NEXT: addb %cl, %al +; SSE2-NEXT: setns %al +; SSE2-NEXT: addb %cl, %r14b +; SSE2-NEXT: jno .LBB11_16 +; SSE2-NEXT: # %bb.15: +; SSE2-NEXT: addb $127, %al +; SSE2-NEXT: movl %eax, %r14d +; SSE2-NEXT: .LBB11_16: +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %r15b +; SSE2-NEXT: movl %r15d, %eax +; SSE2-NEXT: addb %cl, %al +; SSE2-NEXT: setns %al +; SSE2-NEXT: addb %cl, %r15b +; SSE2-NEXT: jno .LBB11_18 +; SSE2-NEXT: # %bb.17: +; SSE2-NEXT: addb $127, %al +; SSE2-NEXT: movl %eax, %r15d +; SSE2-NEXT: .LBB11_18: +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %r12b +; SSE2-NEXT: movl %r12d, %eax +; SSE2-NEXT: addb %cl, %al +; SSE2-NEXT: setns %al +; SSE2-NEXT: addb %cl, %r12b +; SSE2-NEXT: jno .LBB11_20 +; SSE2-NEXT: # %bb.19: +; SSE2-NEXT: addb $127, %al +; SSE2-NEXT: movl %eax, %r12d +; SSE2-NEXT: .LBB11_20: +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %r13b +; SSE2-NEXT: movl %r13d, %eax +; SSE2-NEXT: addb %cl, %al +; SSE2-NEXT: setns %al +; SSE2-NEXT: addb %cl, %r13b +; SSE2-NEXT: movb %bl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSE2-NEXT: movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSE2-NEXT: jno .LBB11_22 +; SSE2-NEXT: # %bb.21: +; SSE2-NEXT: addb $127, %al +; SSE2-NEXT: movl %eax, %r13d +; SSE2-NEXT: .LBB11_22: +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %dil +; SSE2-NEXT: movl %edi, %eax +; SSE2-NEXT: addb %cl, %al +; SSE2-NEXT: setns %al +; SSE2-NEXT: addb %cl, %dil +; SSE2-NEXT: jno .LBB11_24 +; SSE2-NEXT: # %bb.23: +; SSE2-NEXT: addb $127, %al +; SSE2-NEXT: movl %eax, %edi +; SSE2-NEXT: .LBB11_24: +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %r8b +; SSE2-NEXT: movl %r8d, %eax +; SSE2-NEXT: addb %cl, %al +; SSE2-NEXT: setns %al +; SSE2-NEXT: addb %cl, %r8b +; SSE2-NEXT: jno .LBB11_26 +; SSE2-NEXT: # %bb.25: +; SSE2-NEXT: addb $127, %al +; SSE2-NEXT: movl %eax, %r8d +; SSE2-NEXT: .LBB11_26: +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %bl +; SSE2-NEXT: movl %ebx, %eax +; SSE2-NEXT: addb %cl, %al +; SSE2-NEXT: setns %al +; SSE2-NEXT: addb %cl, %bl +; SSE2-NEXT: jno .LBB11_28 +; SSE2-NEXT: # %bb.27: +; SSE2-NEXT: addb $127, %al +; SSE2-NEXT: movl %eax, %ebx +; SSE2-NEXT: .LBB11_28: +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %dl +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSE2-NEXT: movl %eax, %ecx +; SSE2-NEXT: addb %dl, %cl +; SSE2-NEXT: setns %cl +; SSE2-NEXT: addb %dl, %al +; SSE2-NEXT: movb %sil, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSE2-NEXT: jno .LBB11_30 +; SSE2-NEXT: # %bb.29: +; SSE2-NEXT: addb $127, %cl +; SSE2-NEXT: movl %ecx, %eax +; SSE2-NEXT: .LBB11_30: +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %dl +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %sil +; SSE2-NEXT: movl %esi, %ecx +; SSE2-NEXT: addb %dl, %cl +; SSE2-NEXT: setns %cl +; SSE2-NEXT: addb %dl, %sil +; SSE2-NEXT: jno .LBB11_32 +; SSE2-NEXT: # %bb.31: +; SSE2-NEXT: addb $127, %cl +; SSE2-NEXT: movl %ecx, %esi +; SSE2-NEXT: .LBB11_32: +; SSE2-NEXT: movzbl %sil, %ecx +; SSE2-NEXT: movd %ecx, %xmm0 +; SSE2-NEXT: movzbl %al, %eax +; SSE2-NEXT: movd %eax, %xmm1 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; SSE2-NEXT: movzbl %bl, %eax +; SSE2-NEXT: movd %eax, %xmm0 +; SSE2-NEXT: movzbl %r8b, %eax +; SSE2-NEXT: movd %eax, %xmm2 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = 
xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; SSE2-NEXT: movzbl %dil, %eax +; SSE2-NEXT: movd %eax, %xmm0 +; SSE2-NEXT: movzbl %r13b, %eax +; SSE2-NEXT: movd %eax, %xmm3 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] +; SSE2-NEXT: movzbl %r12b, %eax +; SSE2-NEXT: movd %eax, %xmm0 +; SSE2-NEXT: movzbl %r15b, %eax +; SSE2-NEXT: movd %eax, %xmm1 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; SSE2-NEXT: movzbl %r14b, %eax +; SSE2-NEXT: movd %eax, %xmm0 +; SSE2-NEXT: movzbl %bpl, %eax +; SSE2-NEXT: movd %eax, %xmm2 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] +; SSE2-NEXT: movzbl %r11b, %eax +; SSE2-NEXT: movd %eax, %xmm0 +; SSE2-NEXT: movzbl %r10b, %eax +; SSE2-NEXT: movd %eax, %xmm3 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; SSE2-NEXT: movd %eax, %xmm0 +; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; SSE2-NEXT: movd %eax, %xmm2 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] +; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; SSE2-NEXT: movd %eax, %xmm4 +; SSE2-NEXT: movzbl %r9b, %eax +; SSE2-NEXT: movd %eax, %xmm0 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSE2-NEXT: popq %rbx +; SSE2-NEXT: popq %r12 +; SSE2-NEXT: popq %r13 +; SSE2-NEXT: popq %r14 +; SSE2-NEXT: popq %r15 +; SSE2-NEXT: popq %rbp +; SSE2-NEXT: retq +; +; SSSE3-LABEL: v12i8: +; SSSE3: # %bb.0: +; SSSE3-NEXT: pushq %rbp +; SSSE3-NEXT: pushq %r15 +; SSSE3-NEXT: pushq %r14 +; SSSE3-NEXT: pushq %r13 +; SSSE3-NEXT: pushq %r12 +; SSSE3-NEXT: pushq %rbx +; SSSE3-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) +; SSSE3-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %r9b +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %sil +; SSSE3-NEXT: movl %r9d, %eax +; SSSE3-NEXT: addb %cl, %al +; SSSE3-NEXT: setns %al +; SSSE3-NEXT: addb %cl, %r9b +; SSSE3-NEXT: jno .LBB11_2 +; SSSE3-NEXT: # %bb.1: +; SSSE3-NEXT: addb $127, %al +; SSSE3-NEXT: movl %eax, %r9d +; SSSE3-NEXT: .LBB11_2: +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSSE3-NEXT: movl %esi, 
%eax +; SSSE3-NEXT: addb %cl, %al +; SSSE3-NEXT: setns %al +; SSSE3-NEXT: addb %cl, %sil +; SSSE3-NEXT: jno .LBB11_4 +; SSSE3-NEXT: # %bb.3: +; SSSE3-NEXT: addb $127, %al +; SSSE3-NEXT: movl %eax, %esi +; SSSE3-NEXT: .LBB11_4: +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %bl +; SSSE3-NEXT: movl %ebx, %eax +; SSSE3-NEXT: addb %cl, %al +; SSSE3-NEXT: setns %al +; SSSE3-NEXT: addb %cl, %bl +; SSSE3-NEXT: jno .LBB11_6 +; SSSE3-NEXT: # %bb.5: +; SSSE3-NEXT: addb $127, %al +; SSSE3-NEXT: movl %eax, %ebx +; SSSE3-NEXT: .LBB11_6: +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %dl +; SSSE3-NEXT: movl %edx, %eax +; SSSE3-NEXT: addb %cl, %al +; SSSE3-NEXT: setns %al +; SSSE3-NEXT: addb %cl, %dl +; SSSE3-NEXT: jno .LBB11_8 +; SSSE3-NEXT: # %bb.7: +; SSSE3-NEXT: addb $127, %al +; SSSE3-NEXT: movl %eax, %edx +; SSSE3-NEXT: .LBB11_8: +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %r10b +; SSSE3-NEXT: movl %r10d, %eax +; SSSE3-NEXT: addb %cl, %al +; SSSE3-NEXT: setns %al +; SSSE3-NEXT: addb %cl, %r10b +; SSSE3-NEXT: jno .LBB11_10 +; SSSE3-NEXT: # %bb.9: +; SSSE3-NEXT: addb $127, %al +; SSSE3-NEXT: movl %eax, %r10d +; SSSE3-NEXT: .LBB11_10: +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %r11b +; SSSE3-NEXT: movl %r11d, %eax +; SSSE3-NEXT: addb %cl, %al +; SSSE3-NEXT: setns %al +; SSSE3-NEXT: addb %cl, %r11b +; SSSE3-NEXT: jno .LBB11_12 +; SSSE3-NEXT: # %bb.11: +; SSSE3-NEXT: addb $127, %al +; SSSE3-NEXT: movl %eax, %r11d +; SSSE3-NEXT: .LBB11_12: +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %bpl +; SSSE3-NEXT: movl %ebp, %eax +; SSSE3-NEXT: addb %cl, %al +; SSSE3-NEXT: setns %al +; SSSE3-NEXT: addb %cl, %bpl +; SSSE3-NEXT: jno .LBB11_14 +; SSSE3-NEXT: # %bb.13: +; SSSE3-NEXT: addb $127, %al +; SSSE3-NEXT: movl %eax, %ebp +; SSSE3-NEXT: .LBB11_14: +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %r14b +; SSSE3-NEXT: movl %r14d, %eax +; SSSE3-NEXT: addb %cl, %al +; SSSE3-NEXT: setns %al +; SSSE3-NEXT: addb %cl, %r14b +; SSSE3-NEXT: jno .LBB11_16 +; SSSE3-NEXT: # %bb.15: +; SSSE3-NEXT: addb $127, %al +; SSSE3-NEXT: movl %eax, %r14d +; SSSE3-NEXT: .LBB11_16: +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %r15b +; SSSE3-NEXT: movl %r15d, %eax +; SSSE3-NEXT: addb %cl, %al +; SSSE3-NEXT: setns %al +; SSSE3-NEXT: addb %cl, %r15b +; SSSE3-NEXT: jno .LBB11_18 +; SSSE3-NEXT: # %bb.17: +; SSSE3-NEXT: addb $127, %al +; SSSE3-NEXT: movl %eax, %r15d +; SSSE3-NEXT: .LBB11_18: +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %r12b +; SSSE3-NEXT: movl %r12d, %eax +; SSSE3-NEXT: addb %cl, %al +; SSSE3-NEXT: setns %al +; SSSE3-NEXT: addb %cl, %r12b +; SSSE3-NEXT: jno .LBB11_20 +; SSSE3-NEXT: # %bb.19: +; SSSE3-NEXT: addb $127, %al +; SSSE3-NEXT: movl %eax, %r12d +; SSSE3-NEXT: .LBB11_20: +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %r13b +; SSSE3-NEXT: movl %r13d, %eax +; SSSE3-NEXT: addb %cl, %al +; SSSE3-NEXT: setns %al +; SSSE3-NEXT: addb %cl, %r13b +; SSSE3-NEXT: movb %bl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSSE3-NEXT: movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSSE3-NEXT: jno .LBB11_22 +; SSSE3-NEXT: # %bb.21: +; SSSE3-NEXT: addb $127, %al +; SSSE3-NEXT: movl %eax, %r13d +; SSSE3-NEXT: .LBB11_22: +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %dil +; 
SSSE3-NEXT: movl %edi, %eax +; SSSE3-NEXT: addb %cl, %al +; SSSE3-NEXT: setns %al +; SSSE3-NEXT: addb %cl, %dil +; SSSE3-NEXT: jno .LBB11_24 +; SSSE3-NEXT: # %bb.23: +; SSSE3-NEXT: addb $127, %al +; SSSE3-NEXT: movl %eax, %edi +; SSSE3-NEXT: .LBB11_24: +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %r8b +; SSSE3-NEXT: movl %r8d, %eax +; SSSE3-NEXT: addb %cl, %al +; SSSE3-NEXT: setns %al +; SSSE3-NEXT: addb %cl, %r8b +; SSSE3-NEXT: jno .LBB11_26 +; SSSE3-NEXT: # %bb.25: +; SSSE3-NEXT: addb $127, %al +; SSSE3-NEXT: movl %eax, %r8d +; SSSE3-NEXT: .LBB11_26: +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %bl +; SSSE3-NEXT: movl %ebx, %eax +; SSSE3-NEXT: addb %cl, %al +; SSSE3-NEXT: setns %al +; SSSE3-NEXT: addb %cl, %bl +; SSSE3-NEXT: jno .LBB11_28 +; SSSE3-NEXT: # %bb.27: +; SSSE3-NEXT: addb $127, %al +; SSSE3-NEXT: movl %eax, %ebx +; SSSE3-NEXT: .LBB11_28: +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %dl +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSSE3-NEXT: movl %eax, %ecx +; SSSE3-NEXT: addb %dl, %cl +; SSSE3-NEXT: setns %cl +; SSSE3-NEXT: addb %dl, %al +; SSSE3-NEXT: movb %sil, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSSE3-NEXT: jno .LBB11_30 +; SSSE3-NEXT: # %bb.29: +; SSSE3-NEXT: addb $127, %cl +; SSSE3-NEXT: movl %ecx, %eax +; SSSE3-NEXT: .LBB11_30: +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %dl +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %sil +; SSSE3-NEXT: movl %esi, %ecx +; SSSE3-NEXT: addb %dl, %cl +; SSSE3-NEXT: setns %cl +; SSSE3-NEXT: addb %dl, %sil +; SSSE3-NEXT: jno .LBB11_32 +; SSSE3-NEXT: # %bb.31: +; SSSE3-NEXT: addb $127, %cl +; SSSE3-NEXT: movl %ecx, %esi +; SSSE3-NEXT: .LBB11_32: +; SSSE3-NEXT: movzbl %sil, %ecx +; SSSE3-NEXT: movd %ecx, %xmm0 +; SSSE3-NEXT: movzbl %al, %eax +; SSSE3-NEXT: movd %eax, %xmm1 +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; SSSE3-NEXT: movzbl %bl, %eax +; SSSE3-NEXT: movd %eax, %xmm0 +; SSSE3-NEXT: movzbl %r8b, %eax +; SSSE3-NEXT: movd %eax, %xmm2 +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; SSSE3-NEXT: movzbl %dil, %eax +; SSSE3-NEXT: movd %eax, %xmm0 +; SSSE3-NEXT: movzbl %r13b, %eax +; SSSE3-NEXT: movd %eax, %xmm3 +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] +; SSSE3-NEXT: movzbl %r12b, %eax +; SSSE3-NEXT: movd %eax, %xmm0 +; SSSE3-NEXT: movzbl %r15b, %eax +; SSSE3-NEXT: movd %eax, %xmm1 +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3] +; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; SSSE3-NEXT: movzbl %r14b, %eax +; SSSE3-NEXT: movd %eax, %xmm0 +; SSSE3-NEXT: movzbl %bpl, %eax +; SSSE3-NEXT: movd %eax, %xmm2 +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] +; SSSE3-NEXT: movzbl %r11b, %eax +; SSSE3-NEXT: movd %eax, %xmm0 +; SSSE3-NEXT: movzbl 
%r10b, %eax +; SSSE3-NEXT: movd %eax, %xmm3 +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; SSSE3-NEXT: movd %eax, %xmm0 +; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; SSSE3-NEXT: movd %eax, %xmm2 +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] +; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; SSSE3-NEXT: movd %eax, %xmm4 +; SSSE3-NEXT: movzbl %r9b, %eax +; SSSE3-NEXT: movd %eax, %xmm0 +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7] +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] +; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSSE3-NEXT: popq %rbx +; SSSE3-NEXT: popq %r12 +; SSSE3-NEXT: popq %r13 +; SSSE3-NEXT: popq %r14 +; SSSE3-NEXT: popq %r15 +; SSSE3-NEXT: popq %rbp +; SSSE3-NEXT: retq +; +; SSE41-LABEL: v12i8: +; SSE41: # %bb.0: +; SSE41-NEXT: pextrb $15, %xmm1, %ecx +; SSE41-NEXT: pextrb $15, %xmm0, %edx +; SSE41-NEXT: movl %edx, %eax +; SSE41-NEXT: addb %cl, %al +; SSE41-NEXT: setns %al +; SSE41-NEXT: addb %cl, %dl +; SSE41-NEXT: jno .LBB11_2 +; SSE41-NEXT: # %bb.1: +; SSE41-NEXT: addb $127, %al +; SSE41-NEXT: movl %eax, %edx +; SSE41-NEXT: .LBB11_2: +; SSE41-NEXT: pextrb $14, %xmm1, %ecx +; SSE41-NEXT: pextrb $14, %xmm0, %r11d +; SSE41-NEXT: movl %r11d, %eax +; SSE41-NEXT: addb %cl, %al +; SSE41-NEXT: setns %al +; SSE41-NEXT: addb %cl, %r11b +; SSE41-NEXT: jno .LBB11_4 +; SSE41-NEXT: # %bb.3: +; SSE41-NEXT: addb $127, %al +; SSE41-NEXT: movl %eax, %r11d +; SSE41-NEXT: .LBB11_4: +; SSE41-NEXT: pextrb $13, %xmm1, %ecx +; SSE41-NEXT: pextrb $13, %xmm0, %edi +; SSE41-NEXT: movl %edi, %eax +; SSE41-NEXT: addb %cl, %al +; SSE41-NEXT: setns %al +; SSE41-NEXT: addb %cl, %dil +; SSE41-NEXT: jno .LBB11_6 +; SSE41-NEXT: # %bb.5: +; SSE41-NEXT: addb $127, %al +; SSE41-NEXT: movl %eax, %edi +; SSE41-NEXT: .LBB11_6: +; SSE41-NEXT: pushq %rbp +; SSE41-NEXT: pushq %r15 +; SSE41-NEXT: pushq %r14 +; SSE41-NEXT: pushq %r13 +; SSE41-NEXT: pushq %r12 +; SSE41-NEXT: pushq %rbx +; SSE41-NEXT: pextrb $12, %xmm1, %ecx +; SSE41-NEXT: pextrb $12, %xmm0, %r14d +; SSE41-NEXT: movl %r14d, %eax +; SSE41-NEXT: addb %cl, %al +; SSE41-NEXT: setns %al +; SSE41-NEXT: addb %cl, %r14b +; SSE41-NEXT: jno .LBB11_8 +; SSE41-NEXT: # %bb.7: +; SSE41-NEXT: addb $127, %al +; SSE41-NEXT: movl %eax, %r14d +; SSE41-NEXT: .LBB11_8: +; SSE41-NEXT: pextrb $11, %xmm1, %ecx +; SSE41-NEXT: pextrb $11, %xmm0, %ebp +; SSE41-NEXT: movl %ebp, %eax +; SSE41-NEXT: addb %cl, %al +; SSE41-NEXT: setns %al +; SSE41-NEXT: addb %cl, %bpl +; SSE41-NEXT: jno .LBB11_10 +; SSE41-NEXT: # %bb.9: +; SSE41-NEXT: addb $127, %al +; SSE41-NEXT: movl %eax, %ebp +; SSE41-NEXT: .LBB11_10: +; SSE41-NEXT: pextrb $10, %xmm1, %ecx +; SSE41-NEXT: pextrb $10, %xmm0, %r15d +; SSE41-NEXT: movl %r15d, %eax +; SSE41-NEXT: addb %cl, %al +; SSE41-NEXT: setns %al +; SSE41-NEXT: addb %cl, %r15b +; SSE41-NEXT: jno .LBB11_12 +; 
SSE41-NEXT: # %bb.11: +; SSE41-NEXT: addb $127, %al +; SSE41-NEXT: movl %eax, %r15d +; SSE41-NEXT: .LBB11_12: +; SSE41-NEXT: pextrb $9, %xmm1, %ecx +; SSE41-NEXT: pextrb $9, %xmm0, %r12d +; SSE41-NEXT: movl %r12d, %eax +; SSE41-NEXT: addb %cl, %al +; SSE41-NEXT: setns %al +; SSE41-NEXT: addb %cl, %r12b +; SSE41-NEXT: jno .LBB11_14 +; SSE41-NEXT: # %bb.13: +; SSE41-NEXT: addb $127, %al +; SSE41-NEXT: movl %eax, %r12d +; SSE41-NEXT: .LBB11_14: +; SSE41-NEXT: pextrb $8, %xmm1, %ecx +; SSE41-NEXT: pextrb $8, %xmm0, %r13d +; SSE41-NEXT: movl %r13d, %eax +; SSE41-NEXT: addb %cl, %al +; SSE41-NEXT: setns %al +; SSE41-NEXT: addb %cl, %r13b +; SSE41-NEXT: jno .LBB11_16 +; SSE41-NEXT: # %bb.15: +; SSE41-NEXT: addb $127, %al +; SSE41-NEXT: movl %eax, %r13d +; SSE41-NEXT: .LBB11_16: +; SSE41-NEXT: pextrb $7, %xmm1, %ecx +; SSE41-NEXT: pextrb $7, %xmm0, %r10d +; SSE41-NEXT: movl %r10d, %eax +; SSE41-NEXT: addb %cl, %al +; SSE41-NEXT: setns %al +; SSE41-NEXT: addb %cl, %r10b +; SSE41-NEXT: movl %ebp, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE41-NEXT: jno .LBB11_18 +; SSE41-NEXT: # %bb.17: +; SSE41-NEXT: addb $127, %al +; SSE41-NEXT: movl %eax, %r10d +; SSE41-NEXT: .LBB11_18: +; SSE41-NEXT: pextrb $6, %xmm1, %ecx +; SSE41-NEXT: pextrb $6, %xmm0, %r9d +; SSE41-NEXT: movl %r9d, %eax +; SSE41-NEXT: addb %cl, %al +; SSE41-NEXT: setns %al +; SSE41-NEXT: addb %cl, %r9b +; SSE41-NEXT: jno .LBB11_20 +; SSE41-NEXT: # %bb.19: +; SSE41-NEXT: addb $127, %al +; SSE41-NEXT: movl %eax, %r9d +; SSE41-NEXT: .LBB11_20: +; SSE41-NEXT: pextrb $5, %xmm1, %ecx +; SSE41-NEXT: pextrb $5, %xmm0, %ebp +; SSE41-NEXT: movl %ebp, %eax +; SSE41-NEXT: addb %cl, %al +; SSE41-NEXT: setns %al +; SSE41-NEXT: addb %cl, %bpl +; SSE41-NEXT: movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE41-NEXT: jno .LBB11_22 +; SSE41-NEXT: # %bb.21: +; SSE41-NEXT: addb $127, %al +; SSE41-NEXT: movl %eax, %ebp +; SSE41-NEXT: .LBB11_22: +; SSE41-NEXT: pextrb $4, %xmm1, %ecx +; SSE41-NEXT: pextrb $4, %xmm0, %edi +; SSE41-NEXT: movl %edi, %eax +; SSE41-NEXT: addb %cl, %al +; SSE41-NEXT: setns %al +; SSE41-NEXT: addb %cl, %dil +; SSE41-NEXT: movl %edx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE41-NEXT: jno .LBB11_24 +; SSE41-NEXT: # %bb.23: +; SSE41-NEXT: addb $127, %al +; SSE41-NEXT: movl %eax, %edi +; SSE41-NEXT: .LBB11_24: +; SSE41-NEXT: pextrb $3, %xmm1, %edx +; SSE41-NEXT: pextrb $3, %xmm0, %eax +; SSE41-NEXT: movl %eax, %ecx +; SSE41-NEXT: addb %dl, %cl +; SSE41-NEXT: setns %cl +; SSE41-NEXT: addb %dl, %al +; SSE41-NEXT: jno .LBB11_26 +; SSE41-NEXT: # %bb.25: +; SSE41-NEXT: addb $127, %cl +; SSE41-NEXT: movl %ecx, %eax +; SSE41-NEXT: .LBB11_26: +; SSE41-NEXT: pextrb $2, %xmm1, %ebx +; SSE41-NEXT: pextrb $2, %xmm0, %ecx +; SSE41-NEXT: movl %ecx, %edx +; SSE41-NEXT: addb %bl, %dl +; SSE41-NEXT: setns %dl +; SSE41-NEXT: addb %bl, %cl +; SSE41-NEXT: jno .LBB11_28 +; SSE41-NEXT: # %bb.27: +; SSE41-NEXT: addb $127, %dl +; SSE41-NEXT: movl %edx, %ecx +; SSE41-NEXT: .LBB11_28: +; SSE41-NEXT: pextrb $0, %xmm1, %esi +; SSE41-NEXT: pextrb $0, %xmm0, %edx +; SSE41-NEXT: movl %edx, %ebx +; SSE41-NEXT: addb %sil, %bl +; SSE41-NEXT: setns %bl +; SSE41-NEXT: addb %sil, %dl +; SSE41-NEXT: jno .LBB11_30 +; SSE41-NEXT: # %bb.29: +; SSE41-NEXT: addb $127, %bl +; SSE41-NEXT: movl %ebx, %edx +; SSE41-NEXT: .LBB11_30: +; SSE41-NEXT: pextrb $1, %xmm1, %esi +; SSE41-NEXT: pextrb $1, %xmm0, %r8d +; SSE41-NEXT: movl %r8d, %ebx +; SSE41-NEXT: addb %sil, %bl +; SSE41-NEXT: setns %bl +; SSE41-NEXT: addb %sil, %r8b +; SSE41-NEXT: jno .LBB11_32 +; SSE41-NEXT: # %bb.31: 
+; SSE41-NEXT: addb $127, %bl +; SSE41-NEXT: movl %ebx, %r8d +; SSE41-NEXT: .LBB11_32: +; SSE41-NEXT: movzbl %dl, %edx +; SSE41-NEXT: movd %edx, %xmm0 +; SSE41-NEXT: movzbl %r8b, %edx +; SSE41-NEXT: pinsrb $1, %edx, %xmm0 +; SSE41-NEXT: movzbl %cl, %ecx +; SSE41-NEXT: pinsrb $2, %ecx, %xmm0 +; SSE41-NEXT: movzbl %al, %eax +; SSE41-NEXT: pinsrb $3, %eax, %xmm0 +; SSE41-NEXT: movzbl %dil, %eax +; SSE41-NEXT: pinsrb $4, %eax, %xmm0 +; SSE41-NEXT: movzbl %bpl, %eax +; SSE41-NEXT: pinsrb $5, %eax, %xmm0 +; SSE41-NEXT: movzbl %r9b, %eax +; SSE41-NEXT: pinsrb $6, %eax, %xmm0 +; SSE41-NEXT: movzbl %r10b, %eax +; SSE41-NEXT: pinsrb $7, %eax, %xmm0 +; SSE41-NEXT: movzbl %r13b, %eax +; SSE41-NEXT: pinsrb $8, %eax, %xmm0 +; SSE41-NEXT: movzbl %r12b, %eax +; SSE41-NEXT: pinsrb $9, %eax, %xmm0 +; SSE41-NEXT: movzbl %r15b, %eax +; SSE41-NEXT: pinsrb $10, %eax, %xmm0 +; SSE41-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; SSE41-NEXT: pinsrb $11, %eax, %xmm0 +; SSE41-NEXT: movzbl %r14b, %eax +; SSE41-NEXT: pinsrb $12, %eax, %xmm0 +; SSE41-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; SSE41-NEXT: pinsrb $13, %eax, %xmm0 +; SSE41-NEXT: movzbl %r11b, %eax +; SSE41-NEXT: pinsrb $14, %eax, %xmm0 +; SSE41-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; SSE41-NEXT: pinsrb $15, %eax, %xmm0 +; SSE41-NEXT: popq %rbx +; SSE41-NEXT: popq %r12 +; SSE41-NEXT: popq %r13 +; SSE41-NEXT: popq %r14 +; SSE41-NEXT: popq %r15 +; SSE41-NEXT: popq %rbp +; SSE41-NEXT: retq +; +; AVX-LABEL: v12i8: +; AVX: # %bb.0: +; AVX-NEXT: vpextrb $15, %xmm1, %ecx +; AVX-NEXT: vpextrb $15, %xmm0, %edx +; AVX-NEXT: movl %edx, %eax +; AVX-NEXT: addb %cl, %al +; AVX-NEXT: setns %al +; AVX-NEXT: addb %cl, %dl +; AVX-NEXT: jno .LBB11_2 +; AVX-NEXT: # %bb.1: +; AVX-NEXT: addb $127, %al +; AVX-NEXT: movl %eax, %edx +; AVX-NEXT: .LBB11_2: +; AVX-NEXT: vpextrb $14, %xmm1, %ecx +; AVX-NEXT: vpextrb $14, %xmm0, %r11d +; AVX-NEXT: movl %r11d, %eax +; AVX-NEXT: addb %cl, %al +; AVX-NEXT: setns %al +; AVX-NEXT: addb %cl, %r11b +; AVX-NEXT: jno .LBB11_4 +; AVX-NEXT: # %bb.3: +; AVX-NEXT: addb $127, %al +; AVX-NEXT: movl %eax, %r11d +; AVX-NEXT: .LBB11_4: +; AVX-NEXT: vpextrb $13, %xmm1, %ecx +; AVX-NEXT: vpextrb $13, %xmm0, %edi +; AVX-NEXT: movl %edi, %eax +; AVX-NEXT: addb %cl, %al +; AVX-NEXT: setns %al +; AVX-NEXT: addb %cl, %dil +; AVX-NEXT: jno .LBB11_6 +; AVX-NEXT: # %bb.5: +; AVX-NEXT: addb $127, %al +; AVX-NEXT: movl %eax, %edi +; AVX-NEXT: .LBB11_6: +; AVX-NEXT: pushq %rbp +; AVX-NEXT: pushq %r15 +; AVX-NEXT: pushq %r14 +; AVX-NEXT: pushq %r13 +; AVX-NEXT: pushq %r12 +; AVX-NEXT: pushq %rbx +; AVX-NEXT: vpextrb $12, %xmm1, %ecx +; AVX-NEXT: vpextrb $12, %xmm0, %r14d +; AVX-NEXT: movl %r14d, %eax +; AVX-NEXT: addb %cl, %al +; AVX-NEXT: setns %al +; AVX-NEXT: addb %cl, %r14b +; AVX-NEXT: jno .LBB11_8 +; AVX-NEXT: # %bb.7: +; AVX-NEXT: addb $127, %al +; AVX-NEXT: movl %eax, %r14d +; AVX-NEXT: .LBB11_8: +; AVX-NEXT: vpextrb $11, %xmm1, %ecx +; AVX-NEXT: vpextrb $11, %xmm0, %ebp +; AVX-NEXT: movl %ebp, %eax +; AVX-NEXT: addb %cl, %al +; AVX-NEXT: setns %al +; AVX-NEXT: addb %cl, %bpl +; AVX-NEXT: jno .LBB11_10 +; AVX-NEXT: # %bb.9: +; AVX-NEXT: addb $127, %al +; AVX-NEXT: movl %eax, %ebp +; AVX-NEXT: .LBB11_10: +; AVX-NEXT: vpextrb $10, %xmm1, %ecx +; AVX-NEXT: vpextrb $10, %xmm0, %r15d +; AVX-NEXT: movl %r15d, %eax +; AVX-NEXT: addb %cl, %al +; AVX-NEXT: setns %al +; AVX-NEXT: addb %cl, %r15b +; AVX-NEXT: jno .LBB11_12 +; AVX-NEXT: # %bb.11: +; AVX-NEXT: addb $127, %al +; AVX-NEXT: 
movl %eax, %r15d +; AVX-NEXT: .LBB11_12: +; AVX-NEXT: vpextrb $9, %xmm1, %ecx +; AVX-NEXT: vpextrb $9, %xmm0, %r12d +; AVX-NEXT: movl %r12d, %eax +; AVX-NEXT: addb %cl, %al +; AVX-NEXT: setns %al +; AVX-NEXT: addb %cl, %r12b +; AVX-NEXT: jno .LBB11_14 +; AVX-NEXT: # %bb.13: +; AVX-NEXT: addb $127, %al +; AVX-NEXT: movl %eax, %r12d +; AVX-NEXT: .LBB11_14: +; AVX-NEXT: vpextrb $8, %xmm1, %ecx +; AVX-NEXT: vpextrb $8, %xmm0, %r13d +; AVX-NEXT: movl %r13d, %eax +; AVX-NEXT: addb %cl, %al +; AVX-NEXT: setns %al +; AVX-NEXT: addb %cl, %r13b +; AVX-NEXT: jno .LBB11_16 +; AVX-NEXT: # %bb.15: +; AVX-NEXT: addb $127, %al +; AVX-NEXT: movl %eax, %r13d +; AVX-NEXT: .LBB11_16: +; AVX-NEXT: vpextrb $7, %xmm1, %ecx +; AVX-NEXT: vpextrb $7, %xmm0, %r10d +; AVX-NEXT: movl %r10d, %eax +; AVX-NEXT: addb %cl, %al +; AVX-NEXT: setns %al +; AVX-NEXT: addb %cl, %r10b +; AVX-NEXT: movl %ebp, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX-NEXT: jno .LBB11_18 +; AVX-NEXT: # %bb.17: +; AVX-NEXT: addb $127, %al +; AVX-NEXT: movl %eax, %r10d +; AVX-NEXT: .LBB11_18: +; AVX-NEXT: vpextrb $6, %xmm1, %ecx +; AVX-NEXT: vpextrb $6, %xmm0, %r9d +; AVX-NEXT: movl %r9d, %eax +; AVX-NEXT: addb %cl, %al +; AVX-NEXT: setns %al +; AVX-NEXT: addb %cl, %r9b +; AVX-NEXT: jno .LBB11_20 +; AVX-NEXT: # %bb.19: +; AVX-NEXT: addb $127, %al +; AVX-NEXT: movl %eax, %r9d +; AVX-NEXT: .LBB11_20: +; AVX-NEXT: vpextrb $5, %xmm1, %ecx +; AVX-NEXT: vpextrb $5, %xmm0, %ebp +; AVX-NEXT: movl %ebp, %eax +; AVX-NEXT: addb %cl, %al +; AVX-NEXT: setns %al +; AVX-NEXT: addb %cl, %bpl +; AVX-NEXT: movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX-NEXT: jno .LBB11_22 +; AVX-NEXT: # %bb.21: +; AVX-NEXT: addb $127, %al +; AVX-NEXT: movl %eax, %ebp +; AVX-NEXT: .LBB11_22: +; AVX-NEXT: vpextrb $4, %xmm1, %ecx +; AVX-NEXT: vpextrb $4, %xmm0, %edi +; AVX-NEXT: movl %edi, %eax +; AVX-NEXT: addb %cl, %al +; AVX-NEXT: setns %al +; AVX-NEXT: addb %cl, %dil +; AVX-NEXT: movl %edx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX-NEXT: jno .LBB11_24 +; AVX-NEXT: # %bb.23: +; AVX-NEXT: addb $127, %al +; AVX-NEXT: movl %eax, %edi +; AVX-NEXT: .LBB11_24: +; AVX-NEXT: vpextrb $3, %xmm1, %edx +; AVX-NEXT: vpextrb $3, %xmm0, %eax +; AVX-NEXT: movl %eax, %ecx +; AVX-NEXT: addb %dl, %cl +; AVX-NEXT: setns %cl +; AVX-NEXT: addb %dl, %al +; AVX-NEXT: jno .LBB11_26 +; AVX-NEXT: # %bb.25: +; AVX-NEXT: addb $127, %cl +; AVX-NEXT: movl %ecx, %eax +; AVX-NEXT: .LBB11_26: +; AVX-NEXT: vpextrb $2, %xmm1, %ebx +; AVX-NEXT: vpextrb $2, %xmm0, %ecx +; AVX-NEXT: movl %ecx, %edx +; AVX-NEXT: addb %bl, %dl +; AVX-NEXT: setns %dl +; AVX-NEXT: addb %bl, %cl +; AVX-NEXT: jno .LBB11_28 +; AVX-NEXT: # %bb.27: +; AVX-NEXT: addb $127, %dl +; AVX-NEXT: movl %edx, %ecx +; AVX-NEXT: .LBB11_28: +; AVX-NEXT: vpextrb $0, %xmm1, %esi +; AVX-NEXT: vpextrb $0, %xmm0, %edx +; AVX-NEXT: movl %edx, %ebx +; AVX-NEXT: addb %sil, %bl +; AVX-NEXT: setns %bl +; AVX-NEXT: addb %sil, %dl +; AVX-NEXT: jno .LBB11_30 +; AVX-NEXT: # %bb.29: +; AVX-NEXT: addb $127, %bl +; AVX-NEXT: movl %ebx, %edx +; AVX-NEXT: .LBB11_30: +; AVX-NEXT: vpextrb $1, %xmm1, %esi +; AVX-NEXT: vpextrb $1, %xmm0, %r8d +; AVX-NEXT: movl %r8d, %ebx +; AVX-NEXT: addb %sil, %bl +; AVX-NEXT: setns %bl +; AVX-NEXT: addb %sil, %r8b +; AVX-NEXT: jno .LBB11_32 +; AVX-NEXT: # %bb.31: +; AVX-NEXT: addb $127, %bl +; AVX-NEXT: movl %ebx, %r8d +; AVX-NEXT: .LBB11_32: +; AVX-NEXT: movzbl %dl, %edx +; AVX-NEXT: vmovd %edx, %xmm0 +; AVX-NEXT: movzbl %r8b, %edx +; AVX-NEXT: vpinsrb $1, %edx, %xmm0, %xmm0 +; AVX-NEXT: movzbl %cl, %ecx +; AVX-NEXT: vpinsrb 
$2, %ecx, %xmm0, %xmm0 +; AVX-NEXT: movzbl %al, %eax +; AVX-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0 +; AVX-NEXT: movzbl %dil, %eax +; AVX-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 +; AVX-NEXT: movzbl %bpl, %eax +; AVX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; AVX-NEXT: movzbl %r9b, %eax +; AVX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 +; AVX-NEXT: movzbl %r10b, %eax +; AVX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; AVX-NEXT: movzbl %r13b, %eax +; AVX-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0 +; AVX-NEXT: movzbl %r12b, %eax +; AVX-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0 +; AVX-NEXT: movzbl %r15b, %eax +; AVX-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0 +; AVX-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; AVX-NEXT: movzbl %r14b, %eax +; AVX-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0 +; AVX-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0 +; AVX-NEXT: movzbl %r11b, %eax +; AVX-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0 +; AVX-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; AVX-NEXT: popq %rbx +; AVX-NEXT: popq %r12 +; AVX-NEXT: popq %r13 +; AVX-NEXT: popq %r14 +; AVX-NEXT: popq %r15 +; AVX-NEXT: popq %rbp +; AVX-NEXT: retq + %z = call <12 x i8> @llvm.sadd.sat.v12i8(<12 x i8> %x, <12 x i8> %y) + ret <12 x i8> %z +} + +define void @v12i16(<12 x i16>* %px, <12 x i16>* %py, <12 x i16>* %pz) nounwind { +; SSE2-LABEL: v12i16: +; SSE2: # %bb.0: +; SSE2-NEXT: pushq %rbp +; SSE2-NEXT: pushq %r15 +; SSE2-NEXT: pushq %r14 +; SSE2-NEXT: pushq %r13 +; SSE2-NEXT: pushq %r12 +; SSE2-NEXT: pushq %rbx +; SSE2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; SSE2-NEXT: movdqa (%rdi), %xmm0 +; SSE2-NEXT: movdqa 16(%rdi), %xmm2 +; SSE2-NEXT: movdqa (%rsi), %xmm1 +; SSE2-NEXT: movdqa 16(%rsi), %xmm3 +; SSE2-NEXT: pextrw $3, %xmm3, %eax +; SSE2-NEXT: pextrw $3, %xmm2, %edx +; SSE2-NEXT: xorl %ecx, %ecx +; SSE2-NEXT: movl %edx, %esi +; SSE2-NEXT: addw %ax, %si +; SSE2-NEXT: setns %cl +; SSE2-NEXT: addl $32767, %ecx # imm = 0x7FFF +; SSE2-NEXT: addw %ax, %dx +; SSE2-NEXT: cmovol %ecx, %edx +; SSE2-NEXT: pextrw $2, %xmm3, %eax +; SSE2-NEXT: pextrw $2, %xmm2, %r9d +; SSE2-NEXT: xorl %ecx, %ecx +; SSE2-NEXT: movl %r9d, %esi +; SSE2-NEXT: addw %ax, %si +; SSE2-NEXT: setns %cl +; SSE2-NEXT: addl $32767, %ecx # imm = 0x7FFF +; SSE2-NEXT: addw %ax, %r9w +; SSE2-NEXT: cmovol %ecx, %r9d +; SSE2-NEXT: movd %xmm3, %eax +; SSE2-NEXT: movd %xmm2, %r10d +; SSE2-NEXT: xorl %ecx, %ecx +; SSE2-NEXT: movl %r10d, %esi +; SSE2-NEXT: addw %ax, %si +; SSE2-NEXT: setns %cl +; SSE2-NEXT: addl $32767, %ecx # imm = 0x7FFF +; SSE2-NEXT: addw %ax, %r10w +; SSE2-NEXT: cmovol %ecx, %r10d +; SSE2-NEXT: pextrw $1, %xmm3, %eax +; SSE2-NEXT: pextrw $1, %xmm2, %r11d +; SSE2-NEXT: xorl %ecx, %ecx +; SSE2-NEXT: movl %r11d, %esi +; SSE2-NEXT: addw %ax, %si +; SSE2-NEXT: setns %cl +; SSE2-NEXT: addl $32767, %ecx # imm = 0x7FFF +; SSE2-NEXT: addw %ax, %r11w +; SSE2-NEXT: cmovol %ecx, %r11d +; SSE2-NEXT: movd %xmm1, %eax +; SSE2-NEXT: movd %xmm0, %r14d +; SSE2-NEXT: xorl %ecx, %ecx +; SSE2-NEXT: movl %r14d, %esi +; SSE2-NEXT: addw %ax, %si +; SSE2-NEXT: setns %cl +; SSE2-NEXT: addl $32767, %ecx # imm = 0x7FFF +; SSE2-NEXT: addw %ax, %r14w +; SSE2-NEXT: cmovol %ecx, %r14d +; SSE2-NEXT: pextrw $1, %xmm1, %eax +; SSE2-NEXT: pextrw $1, %xmm0, %r15d +; SSE2-NEXT: xorl %esi, %esi +; SSE2-NEXT: movl %r15d, %edi +; SSE2-NEXT: addw %ax, %di +; SSE2-NEXT: setns %sil +; SSE2-NEXT: addl $32767, %esi # 
imm = 0x7FFF +; SSE2-NEXT: addw %ax, %r15w +; SSE2-NEXT: cmovol %esi, %r15d +; SSE2-NEXT: pextrw $2, %xmm1, %eax +; SSE2-NEXT: pextrw $2, %xmm0, %r12d +; SSE2-NEXT: xorl %edi, %edi +; SSE2-NEXT: movl %r12d, %ebx +; SSE2-NEXT: addw %ax, %bx +; SSE2-NEXT: setns %dil +; SSE2-NEXT: addl $32767, %edi # imm = 0x7FFF +; SSE2-NEXT: addw %ax, %r12w +; SSE2-NEXT: cmovol %edi, %r12d +; SSE2-NEXT: pextrw $3, %xmm1, %eax +; SSE2-NEXT: pextrw $3, %xmm0, %r13d +; SSE2-NEXT: xorl %ebx, %ebx +; SSE2-NEXT: movl %r13d, %ebp +; SSE2-NEXT: addw %ax, %bp +; SSE2-NEXT: setns %bl +; SSE2-NEXT: addl $32767, %ebx # imm = 0x7FFF +; SSE2-NEXT: addw %ax, %r13w +; SSE2-NEXT: cmovol %ebx, %r13d +; SSE2-NEXT: pextrw $4, %xmm1, %eax +; SSE2-NEXT: pextrw $4, %xmm0, %ebx +; SSE2-NEXT: xorl %ebp, %ebp +; SSE2-NEXT: movl %ebx, %ecx +; SSE2-NEXT: addw %ax, %cx +; SSE2-NEXT: setns %bpl +; SSE2-NEXT: addl $32767, %ebp # imm = 0x7FFF +; SSE2-NEXT: addw %ax, %bx +; SSE2-NEXT: cmovol %ebp, %ebx +; SSE2-NEXT: pextrw $5, %xmm1, %eax +; SSE2-NEXT: pextrw $5, %xmm0, %ebp +; SSE2-NEXT: xorl %ecx, %ecx +; SSE2-NEXT: movl %ebp, %esi +; SSE2-NEXT: addw %ax, %si +; SSE2-NEXT: setns %cl +; SSE2-NEXT: addl $32767, %ecx # imm = 0x7FFF +; SSE2-NEXT: addw %ax, %bp +; SSE2-NEXT: cmovol %ecx, %ebp +; SSE2-NEXT: pextrw $6, %xmm1, %ecx +; SSE2-NEXT: pextrw $6, %xmm0, %eax +; SSE2-NEXT: xorl %esi, %esi +; SSE2-NEXT: movl %eax, %edi +; SSE2-NEXT: addw %cx, %di +; SSE2-NEXT: setns %sil +; SSE2-NEXT: addl $32767, %esi # imm = 0x7FFF +; SSE2-NEXT: addw %cx, %ax +; SSE2-NEXT: cmovol %esi, %eax +; SSE2-NEXT: pextrw $7, %xmm1, %ecx +; SSE2-NEXT: pextrw $7, %xmm0, %esi +; SSE2-NEXT: xorl %edi, %edi +; SSE2-NEXT: movl %esi, %r8d +; SSE2-NEXT: addw %cx, %r8w +; SSE2-NEXT: setns %dil +; SSE2-NEXT: addl $32767, %edi # imm = 0x7FFF +; SSE2-NEXT: addw %cx, %si +; SSE2-NEXT: cmovol %edi, %esi +; SSE2-NEXT: movd %esi, %xmm8 +; SSE2-NEXT: movd %eax, %xmm1 +; SSE2-NEXT: movd %ebp, %xmm2 +; SSE2-NEXT: movd %ebx, %xmm3 +; SSE2-NEXT: movd %r13d, %xmm4 +; SSE2-NEXT: movd %r12d, %xmm5 +; SSE2-NEXT: movd %r15d, %xmm6 +; SSE2-NEXT: movd %r14d, %xmm7 +; SSE2-NEXT: movd %r10d, %xmm0 +; SSE2-NEXT: pinsrw $1, %r11d, %xmm0 +; SSE2-NEXT: pinsrw $2, %r9d, %xmm0 +; SSE2-NEXT: pinsrw $3, %edx, %xmm0 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm8[0],xmm1[1],xmm8[1],xmm1[2],xmm8[2],xmm1[3],xmm8[3] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm5[0],xmm7[1],xmm5[1] +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm7 = xmm7[0],xmm3[0] +; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload +; SSE2-NEXT: movq %xmm0, 16(%rax) +; SSE2-NEXT: movdqa %xmm7, (%rax) +; SSE2-NEXT: popq %rbx +; SSE2-NEXT: popq %r12 +; SSE2-NEXT: popq %r13 +; SSE2-NEXT: popq %r14 +; SSE2-NEXT: popq %r15 +; SSE2-NEXT: popq %rbp +; SSE2-NEXT: retq +; +; SSSE3-LABEL: v12i16: +; SSSE3: # %bb.0: +; SSSE3-NEXT: pushq %rbp +; SSSE3-NEXT: pushq %r15 +; SSSE3-NEXT: pushq %r14 +; SSSE3-NEXT: pushq %r13 +; SSSE3-NEXT: pushq %r12 +; SSSE3-NEXT: pushq %rbx +; SSSE3-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; SSSE3-NEXT: movdqa (%rdi), %xmm0 +; SSSE3-NEXT: movdqa 16(%rdi), %xmm2 +; SSSE3-NEXT: movdqa (%rsi), %xmm1 +; SSSE3-NEXT: 
movdqa 16(%rsi), %xmm3 +; SSSE3-NEXT: pextrw $3, %xmm3, %eax +; SSSE3-NEXT: pextrw $3, %xmm2, %edx +; SSSE3-NEXT: xorl %ecx, %ecx +; SSSE3-NEXT: movl %edx, %esi +; SSSE3-NEXT: addw %ax, %si +; SSSE3-NEXT: setns %cl +; SSSE3-NEXT: addl $32767, %ecx # imm = 0x7FFF +; SSSE3-NEXT: addw %ax, %dx +; SSSE3-NEXT: cmovol %ecx, %edx +; SSSE3-NEXT: pextrw $2, %xmm3, %eax +; SSSE3-NEXT: pextrw $2, %xmm2, %r9d +; SSSE3-NEXT: xorl %ecx, %ecx +; SSSE3-NEXT: movl %r9d, %esi +; SSSE3-NEXT: addw %ax, %si +; SSSE3-NEXT: setns %cl +; SSSE3-NEXT: addl $32767, %ecx # imm = 0x7FFF +; SSSE3-NEXT: addw %ax, %r9w +; SSSE3-NEXT: cmovol %ecx, %r9d +; SSSE3-NEXT: movd %xmm3, %eax +; SSSE3-NEXT: movd %xmm2, %r10d +; SSSE3-NEXT: xorl %ecx, %ecx +; SSSE3-NEXT: movl %r10d, %esi +; SSSE3-NEXT: addw %ax, %si +; SSSE3-NEXT: setns %cl +; SSSE3-NEXT: addl $32767, %ecx # imm = 0x7FFF +; SSSE3-NEXT: addw %ax, %r10w +; SSSE3-NEXT: cmovol %ecx, %r10d +; SSSE3-NEXT: pextrw $1, %xmm3, %eax +; SSSE3-NEXT: pextrw $1, %xmm2, %r11d +; SSSE3-NEXT: xorl %ecx, %ecx +; SSSE3-NEXT: movl %r11d, %esi +; SSSE3-NEXT: addw %ax, %si +; SSSE3-NEXT: setns %cl +; SSSE3-NEXT: addl $32767, %ecx # imm = 0x7FFF +; SSSE3-NEXT: addw %ax, %r11w +; SSSE3-NEXT: cmovol %ecx, %r11d +; SSSE3-NEXT: movd %xmm1, %eax +; SSSE3-NEXT: movd %xmm0, %r14d +; SSSE3-NEXT: xorl %ecx, %ecx +; SSSE3-NEXT: movl %r14d, %esi +; SSSE3-NEXT: addw %ax, %si +; SSSE3-NEXT: setns %cl +; SSSE3-NEXT: addl $32767, %ecx # imm = 0x7FFF +; SSSE3-NEXT: addw %ax, %r14w +; SSSE3-NEXT: cmovol %ecx, %r14d +; SSSE3-NEXT: pextrw $1, %xmm1, %eax +; SSSE3-NEXT: pextrw $1, %xmm0, %r15d +; SSSE3-NEXT: xorl %esi, %esi +; SSSE3-NEXT: movl %r15d, %edi +; SSSE3-NEXT: addw %ax, %di +; SSSE3-NEXT: setns %sil +; SSSE3-NEXT: addl $32767, %esi # imm = 0x7FFF +; SSSE3-NEXT: addw %ax, %r15w +; SSSE3-NEXT: cmovol %esi, %r15d +; SSSE3-NEXT: pextrw $2, %xmm1, %eax +; SSSE3-NEXT: pextrw $2, %xmm0, %r12d +; SSSE3-NEXT: xorl %edi, %edi +; SSSE3-NEXT: movl %r12d, %ebx +; SSSE3-NEXT: addw %ax, %bx +; SSSE3-NEXT: setns %dil +; SSSE3-NEXT: addl $32767, %edi # imm = 0x7FFF +; SSSE3-NEXT: addw %ax, %r12w +; SSSE3-NEXT: cmovol %edi, %r12d +; SSSE3-NEXT: pextrw $3, %xmm1, %eax +; SSSE3-NEXT: pextrw $3, %xmm0, %r13d +; SSSE3-NEXT: xorl %ebx, %ebx +; SSSE3-NEXT: movl %r13d, %ebp +; SSSE3-NEXT: addw %ax, %bp +; SSSE3-NEXT: setns %bl +; SSSE3-NEXT: addl $32767, %ebx # imm = 0x7FFF +; SSSE3-NEXT: addw %ax, %r13w +; SSSE3-NEXT: cmovol %ebx, %r13d +; SSSE3-NEXT: pextrw $4, %xmm1, %eax +; SSSE3-NEXT: pextrw $4, %xmm0, %ebx +; SSSE3-NEXT: xorl %ebp, %ebp +; SSSE3-NEXT: movl %ebx, %ecx +; SSSE3-NEXT: addw %ax, %cx +; SSSE3-NEXT: setns %bpl +; SSSE3-NEXT: addl $32767, %ebp # imm = 0x7FFF +; SSSE3-NEXT: addw %ax, %bx +; SSSE3-NEXT: cmovol %ebp, %ebx +; SSSE3-NEXT: pextrw $5, %xmm1, %eax +; SSSE3-NEXT: pextrw $5, %xmm0, %ebp +; SSSE3-NEXT: xorl %ecx, %ecx +; SSSE3-NEXT: movl %ebp, %esi +; SSSE3-NEXT: addw %ax, %si +; SSSE3-NEXT: setns %cl +; SSSE3-NEXT: addl $32767, %ecx # imm = 0x7FFF +; SSSE3-NEXT: addw %ax, %bp +; SSSE3-NEXT: cmovol %ecx, %ebp +; SSSE3-NEXT: pextrw $6, %xmm1, %ecx +; SSSE3-NEXT: pextrw $6, %xmm0, %eax +; SSSE3-NEXT: xorl %esi, %esi +; SSSE3-NEXT: movl %eax, %edi +; SSSE3-NEXT: addw %cx, %di +; SSSE3-NEXT: setns %sil +; SSSE3-NEXT: addl $32767, %esi # imm = 0x7FFF +; SSSE3-NEXT: addw %cx, %ax +; SSSE3-NEXT: cmovol %esi, %eax +; SSSE3-NEXT: pextrw $7, %xmm1, %ecx +; SSSE3-NEXT: pextrw $7, %xmm0, %esi +; SSSE3-NEXT: xorl %edi, %edi +; SSSE3-NEXT: movl %esi, %r8d +; SSSE3-NEXT: addw %cx, %r8w +; SSSE3-NEXT: setns %dil +; 
SSSE3-NEXT: addl $32767, %edi # imm = 0x7FFF +; SSSE3-NEXT: addw %cx, %si +; SSSE3-NEXT: cmovol %edi, %esi +; SSSE3-NEXT: movd %esi, %xmm8 +; SSSE3-NEXT: movd %eax, %xmm1 +; SSSE3-NEXT: movd %ebp, %xmm2 +; SSSE3-NEXT: movd %ebx, %xmm3 +; SSSE3-NEXT: movd %r13d, %xmm4 +; SSSE3-NEXT: movd %r12d, %xmm5 +; SSSE3-NEXT: movd %r15d, %xmm6 +; SSSE3-NEXT: movd %r14d, %xmm7 +; SSSE3-NEXT: movd %r10d, %xmm0 +; SSSE3-NEXT: pinsrw $1, %r11d, %xmm0 +; SSSE3-NEXT: pinsrw $2, %r9d, %xmm0 +; SSSE3-NEXT: pinsrw $3, %edx, %xmm0 +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm8[0],xmm1[1],xmm8[1],xmm1[2],xmm8[2],xmm1[3],xmm8[3] +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; SSSE3-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3] +; SSSE3-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm5[0],xmm7[1],xmm5[1] +; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm7 = xmm7[0],xmm3[0] +; SSSE3-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload +; SSSE3-NEXT: movq %xmm0, 16(%rax) +; SSSE3-NEXT: movdqa %xmm7, (%rax) +; SSSE3-NEXT: popq %rbx +; SSSE3-NEXT: popq %r12 +; SSSE3-NEXT: popq %r13 +; SSSE3-NEXT: popq %r14 +; SSSE3-NEXT: popq %r15 +; SSSE3-NEXT: popq %rbp +; SSSE3-NEXT: retq +; +; SSE41-LABEL: v12i16: +; SSE41: # %bb.0: +; SSE41-NEXT: pushq %rbp +; SSE41-NEXT: pushq %r15 +; SSE41-NEXT: pushq %r14 +; SSE41-NEXT: pushq %r13 +; SSE41-NEXT: pushq %r12 +; SSE41-NEXT: pushq %rbx +; SSE41-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; SSE41-NEXT: movdqa (%rdi), %xmm0 +; SSE41-NEXT: movdqa 16(%rdi), %xmm2 +; SSE41-NEXT: movdqa (%rsi), %xmm1 +; SSE41-NEXT: movdqa 16(%rsi), %xmm3 +; SSE41-NEXT: pextrw $3, %xmm3, %eax +; SSE41-NEXT: pextrw $3, %xmm2, %edx +; SSE41-NEXT: xorl %ecx, %ecx +; SSE41-NEXT: movl %edx, %esi +; SSE41-NEXT: addw %ax, %si +; SSE41-NEXT: setns %cl +; SSE41-NEXT: addl $32767, %ecx # imm = 0x7FFF +; SSE41-NEXT: addw %ax, %dx +; SSE41-NEXT: cmovol %ecx, %edx +; SSE41-NEXT: pextrw $2, %xmm3, %eax +; SSE41-NEXT: pextrw $2, %xmm2, %r9d +; SSE41-NEXT: xorl %ecx, %ecx +; SSE41-NEXT: movl %r9d, %esi +; SSE41-NEXT: addw %ax, %si +; SSE41-NEXT: setns %cl +; SSE41-NEXT: addl $32767, %ecx # imm = 0x7FFF +; SSE41-NEXT: addw %ax, %r9w +; SSE41-NEXT: cmovol %ecx, %r9d +; SSE41-NEXT: movd %xmm3, %eax +; SSE41-NEXT: movd %xmm2, %r10d +; SSE41-NEXT: xorl %ecx, %ecx +; SSE41-NEXT: movl %r10d, %esi +; SSE41-NEXT: addw %ax, %si +; SSE41-NEXT: setns %cl +; SSE41-NEXT: addl $32767, %ecx # imm = 0x7FFF +; SSE41-NEXT: addw %ax, %r10w +; SSE41-NEXT: cmovol %ecx, %r10d +; SSE41-NEXT: pextrw $1, %xmm3, %eax +; SSE41-NEXT: pextrw $1, %xmm2, %r11d +; SSE41-NEXT: xorl %ecx, %ecx +; SSE41-NEXT: movl %r11d, %esi +; SSE41-NEXT: addw %ax, %si +; SSE41-NEXT: setns %cl +; SSE41-NEXT: addl $32767, %ecx # imm = 0x7FFF +; SSE41-NEXT: addw %ax, %r11w +; SSE41-NEXT: cmovol %ecx, %r11d +; SSE41-NEXT: pextrw $7, %xmm1, %ecx +; SSE41-NEXT: pextrw $7, %xmm0, %r14d +; SSE41-NEXT: xorl %esi, %esi +; SSE41-NEXT: movl %r14d, %edi +; SSE41-NEXT: addw %cx, %di +; SSE41-NEXT: setns %sil +; SSE41-NEXT: addl $32767, %esi # imm = 0x7FFF +; SSE41-NEXT: addw %cx, %r14w +; SSE41-NEXT: cmovol %esi, %r14d +; SSE41-NEXT: pextrw $6, %xmm1, %esi +; SSE41-NEXT: pextrw $6, %xmm0, %r15d +; SSE41-NEXT: xorl %edi, %edi +; SSE41-NEXT: movl %r15d, %ebx +; SSE41-NEXT: addw %si, %bx +; 
SSE41-NEXT: setns %dil +; SSE41-NEXT: addl $32767, %edi # imm = 0x7FFF +; SSE41-NEXT: addw %si, %r15w +; SSE41-NEXT: cmovol %edi, %r15d +; SSE41-NEXT: pextrw $5, %xmm1, %edi +; SSE41-NEXT: pextrw $5, %xmm0, %r12d +; SSE41-NEXT: xorl %ebx, %ebx +; SSE41-NEXT: movl %r12d, %ebp +; SSE41-NEXT: addw %di, %bp +; SSE41-NEXT: setns %bl +; SSE41-NEXT: addl $32767, %ebx # imm = 0x7FFF +; SSE41-NEXT: addw %di, %r12w +; SSE41-NEXT: cmovol %ebx, %r12d +; SSE41-NEXT: pextrw $4, %xmm1, %ebx +; SSE41-NEXT: pextrw $4, %xmm0, %r13d +; SSE41-NEXT: xorl %ebp, %ebp +; SSE41-NEXT: movl %r13d, %eax +; SSE41-NEXT: addw %bx, %ax +; SSE41-NEXT: setns %bpl +; SSE41-NEXT: addl $32767, %ebp # imm = 0x7FFF +; SSE41-NEXT: addw %bx, %r13w +; SSE41-NEXT: cmovol %ebp, %r13d +; SSE41-NEXT: pextrw $3, %xmm1, %eax +; SSE41-NEXT: pextrw $3, %xmm0, %ebx +; SSE41-NEXT: xorl %ebp, %ebp +; SSE41-NEXT: movl %ebx, %ecx +; SSE41-NEXT: addw %ax, %cx +; SSE41-NEXT: setns %bpl +; SSE41-NEXT: addl $32767, %ebp # imm = 0x7FFF +; SSE41-NEXT: addw %ax, %bx +; SSE41-NEXT: cmovol %ebp, %ebx +; SSE41-NEXT: pextrw $2, %xmm1, %eax +; SSE41-NEXT: pextrw $2, %xmm0, %ebp +; SSE41-NEXT: xorl %ecx, %ecx +; SSE41-NEXT: movl %ebp, %esi +; SSE41-NEXT: addw %ax, %si +; SSE41-NEXT: setns %cl +; SSE41-NEXT: addl $32767, %ecx # imm = 0x7FFF +; SSE41-NEXT: addw %ax, %bp +; SSE41-NEXT: cmovol %ecx, %ebp +; SSE41-NEXT: movd %xmm1, %eax +; SSE41-NEXT: movd %xmm0, %ecx +; SSE41-NEXT: xorl %esi, %esi +; SSE41-NEXT: movl %ecx, %edi +; SSE41-NEXT: addw %ax, %di +; SSE41-NEXT: setns %sil +; SSE41-NEXT: addl $32767, %esi # imm = 0x7FFF +; SSE41-NEXT: addw %ax, %cx +; SSE41-NEXT: cmovol %esi, %ecx +; SSE41-NEXT: pextrw $1, %xmm1, %eax +; SSE41-NEXT: pextrw $1, %xmm0, %esi +; SSE41-NEXT: xorl %edi, %edi +; SSE41-NEXT: movl %esi, %r8d +; SSE41-NEXT: addw %ax, %r8w +; SSE41-NEXT: setns %dil +; SSE41-NEXT: addl $32767, %edi # imm = 0x7FFF +; SSE41-NEXT: addw %ax, %si +; SSE41-NEXT: cmovol %edi, %esi +; SSE41-NEXT: movd %ecx, %xmm0 +; SSE41-NEXT: pinsrw $1, %esi, %xmm0 +; SSE41-NEXT: pinsrw $2, %ebp, %xmm0 +; SSE41-NEXT: pinsrw $3, %ebx, %xmm0 +; SSE41-NEXT: pinsrw $4, %r13d, %xmm0 +; SSE41-NEXT: pinsrw $5, %r12d, %xmm0 +; SSE41-NEXT: pinsrw $6, %r15d, %xmm0 +; SSE41-NEXT: pinsrw $7, %r14d, %xmm0 +; SSE41-NEXT: movd %r10d, %xmm1 +; SSE41-NEXT: pinsrw $1, %r11d, %xmm1 +; SSE41-NEXT: pinsrw $2, %r9d, %xmm1 +; SSE41-NEXT: pinsrw $3, %edx, %xmm1 +; SSE41-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload +; SSE41-NEXT: movq %xmm1, 16(%rax) +; SSE41-NEXT: movdqa %xmm0, (%rax) +; SSE41-NEXT: popq %rbx +; SSE41-NEXT: popq %r12 +; SSE41-NEXT: popq %r13 +; SSE41-NEXT: popq %r14 +; SSE41-NEXT: popq %r15 +; SSE41-NEXT: popq %rbp +; SSE41-NEXT: retq +; +; AVX-LABEL: v12i16: +; AVX: # %bb.0: +; AVX-NEXT: pushq %rbp +; AVX-NEXT: pushq %r15 +; AVX-NEXT: pushq %r14 +; AVX-NEXT: pushq %r13 +; AVX-NEXT: pushq %r12 +; AVX-NEXT: pushq %rbx +; AVX-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX-NEXT: vmovdqa (%rsi), %xmm2 +; AVX-NEXT: vmovdqa 16(%rsi), %xmm0 +; AVX-NEXT: vmovd %xmm2, %eax +; AVX-NEXT: vmovdqa (%rdi), %xmm3 +; AVX-NEXT: vmovdqa 16(%rdi), %xmm1 +; AVX-NEXT: vmovd %xmm3, %edx +; AVX-NEXT: xorl %ecx, %ecx +; AVX-NEXT: movl %edx, %esi +; AVX-NEXT: addw %ax, %si +; AVX-NEXT: setns %cl +; AVX-NEXT: addl $32767, %ecx # imm = 0x7FFF +; AVX-NEXT: addw %ax, %dx +; AVX-NEXT: cmovol %ecx, %edx +; AVX-NEXT: movl %edx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX-NEXT: vpextrw $1, %xmm2, %eax +; AVX-NEXT: vpextrw $1, %xmm3, %edx +; AVX-NEXT: xorl %ecx, %ecx +; 
AVX-NEXT: movl %edx, %esi +; AVX-NEXT: addw %ax, %si +; AVX-NEXT: setns %cl +; AVX-NEXT: addl $32767, %ecx # imm = 0x7FFF +; AVX-NEXT: addw %ax, %dx +; AVX-NEXT: cmovol %ecx, %edx +; AVX-NEXT: movl %edx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX-NEXT: vpextrw $2, %xmm2, %eax +; AVX-NEXT: vpextrw $2, %xmm3, %edx +; AVX-NEXT: xorl %ecx, %ecx +; AVX-NEXT: movl %edx, %esi +; AVX-NEXT: addw %ax, %si +; AVX-NEXT: setns %cl +; AVX-NEXT: addl $32767, %ecx # imm = 0x7FFF +; AVX-NEXT: addw %ax, %dx +; AVX-NEXT: cmovol %ecx, %edx +; AVX-NEXT: movl %edx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX-NEXT: vpextrw $3, %xmm2, %eax +; AVX-NEXT: vpextrw $3, %xmm3, %edx +; AVX-NEXT: xorl %ecx, %ecx +; AVX-NEXT: movl %edx, %esi +; AVX-NEXT: addw %ax, %si +; AVX-NEXT: setns %cl +; AVX-NEXT: addl $32767, %ecx # imm = 0x7FFF +; AVX-NEXT: addw %ax, %dx +; AVX-NEXT: cmovol %ecx, %edx +; AVX-NEXT: movl %edx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX-NEXT: vpextrw $4, %xmm2, %eax +; AVX-NEXT: vpextrw $4, %xmm3, %r14d +; AVX-NEXT: xorl %ecx, %ecx +; AVX-NEXT: movl %r14d, %esi +; AVX-NEXT: addw %ax, %si +; AVX-NEXT: setns %cl +; AVX-NEXT: addl $32767, %ecx # imm = 0x7FFF +; AVX-NEXT: addw %ax, %r14w +; AVX-NEXT: cmovol %ecx, %r14d +; AVX-NEXT: vpextrw $5, %xmm2, %eax +; AVX-NEXT: vpextrw $5, %xmm3, %r15d +; AVX-NEXT: xorl %ecx, %ecx +; AVX-NEXT: movl %r15d, %esi +; AVX-NEXT: addw %ax, %si +; AVX-NEXT: setns %cl +; AVX-NEXT: addl $32767, %ecx # imm = 0x7FFF +; AVX-NEXT: addw %ax, %r15w +; AVX-NEXT: cmovol %ecx, %r15d +; AVX-NEXT: vpextrw $6, %xmm2, %eax +; AVX-NEXT: vpextrw $6, %xmm3, %r12d +; AVX-NEXT: xorl %ecx, %ecx +; AVX-NEXT: movl %r12d, %esi +; AVX-NEXT: addw %ax, %si +; AVX-NEXT: setns %cl +; AVX-NEXT: addl $32767, %ecx # imm = 0x7FFF +; AVX-NEXT: addw %ax, %r12w +; AVX-NEXT: cmovol %ecx, %r12d +; AVX-NEXT: vpextrw $7, %xmm2, %eax +; AVX-NEXT: vpextrw $7, %xmm3, %r13d +; AVX-NEXT: xorl %ecx, %ecx +; AVX-NEXT: movl %r13d, %esi +; AVX-NEXT: addw %ax, %si +; AVX-NEXT: setns %cl +; AVX-NEXT: addl $32767, %ecx # imm = 0x7FFF +; AVX-NEXT: addw %ax, %r13w +; AVX-NEXT: cmovol %ecx, %r13d +; AVX-NEXT: vpextrw $7, %xmm0, %eax +; AVX-NEXT: vpextrw $7, %xmm1, %ebx +; AVX-NEXT: xorl %ecx, %ecx +; AVX-NEXT: movl %ebx, %esi +; AVX-NEXT: addw %ax, %si +; AVX-NEXT: setns %cl +; AVX-NEXT: addl $32767, %ecx # imm = 0x7FFF +; AVX-NEXT: addw %ax, %bx +; AVX-NEXT: cmovol %ecx, %ebx +; AVX-NEXT: vpextrw $6, %xmm0, %eax +; AVX-NEXT: vpextrw $6, %xmm1, %ebp +; AVX-NEXT: xorl %ecx, %ecx +; AVX-NEXT: movl %ebp, %esi +; AVX-NEXT: addw %ax, %si +; AVX-NEXT: setns %cl +; AVX-NEXT: addl $32767, %ecx # imm = 0x7FFF +; AVX-NEXT: addw %ax, %bp +; AVX-NEXT: cmovol %ecx, %ebp +; AVX-NEXT: vpextrw $5, %xmm0, %ecx +; AVX-NEXT: vpextrw $5, %xmm1, %eax +; AVX-NEXT: xorl %esi, %esi +; AVX-NEXT: movl %eax, %edi +; AVX-NEXT: addw %cx, %di +; AVX-NEXT: setns %sil +; AVX-NEXT: addl $32767, %esi # imm = 0x7FFF +; AVX-NEXT: addw %cx, %ax +; AVX-NEXT: cmovol %esi, %eax +; AVX-NEXT: vpextrw $4, %xmm0, %esi +; AVX-NEXT: vpextrw $4, %xmm1, %ecx +; AVX-NEXT: xorl %edi, %edi +; AVX-NEXT: movl %ecx, %r8d +; AVX-NEXT: addw %si, %r8w +; AVX-NEXT: setns %dil +; AVX-NEXT: addl $32767, %edi # imm = 0x7FFF +; AVX-NEXT: addw %si, %cx +; AVX-NEXT: cmovol %edi, %ecx +; AVX-NEXT: vpextrw $3, %xmm0, %edi +; AVX-NEXT: vpextrw $3, %xmm1, %r8d +; AVX-NEXT: xorl %esi, %esi +; AVX-NEXT: movl %r8d, %edx +; AVX-NEXT: addw %di, %dx +; AVX-NEXT: setns %sil +; AVX-NEXT: addl $32767, %esi # imm = 0x7FFF +; AVX-NEXT: addw %di, %r8w +; AVX-NEXT: cmovol %esi, %r8d +; 
AVX-NEXT: vpextrw $2, %xmm0, %edx +; AVX-NEXT: vpextrw $2, %xmm1, %edi +; AVX-NEXT: xorl %esi, %esi +; AVX-NEXT: movl %edi, %r9d +; AVX-NEXT: addw %dx, %r9w +; AVX-NEXT: setns %sil +; AVX-NEXT: addl $32767, %esi # imm = 0x7FFF +; AVX-NEXT: addw %dx, %di +; AVX-NEXT: cmovol %esi, %edi +; AVX-NEXT: vmovd %xmm0, %r9d +; AVX-NEXT: vmovd %xmm1, %esi +; AVX-NEXT: xorl %edx, %edx +; AVX-NEXT: movl %esi, %r10d +; AVX-NEXT: addw %r9w, %r10w +; AVX-NEXT: setns %dl +; AVX-NEXT: addl $32767, %edx # imm = 0x7FFF +; AVX-NEXT: addw %r9w, %si +; AVX-NEXT: cmovol %edx, %esi +; AVX-NEXT: vpextrw $1, %xmm0, %r9d +; AVX-NEXT: vpextrw $1, %xmm1, %edx +; AVX-NEXT: xorl %r10d, %r10d +; AVX-NEXT: movl %edx, %r11d +; AVX-NEXT: addw %r9w, %r11w +; AVX-NEXT: setns %r10b +; AVX-NEXT: addl $32767, %r10d # imm = 0x7FFF +; AVX-NEXT: addw %r9w, %dx +; AVX-NEXT: cmovol %r10d, %edx +; AVX-NEXT: vmovd %esi, %xmm0 +; AVX-NEXT: vpinsrw $1, %edx, %xmm0, %xmm0 +; AVX-NEXT: vpinsrw $2, %edi, %xmm0, %xmm0 +; AVX-NEXT: vpinsrw $3, %r8d, %xmm0, %xmm0 +; AVX-NEXT: vpinsrw $4, %ecx, %xmm0, %xmm0 +; AVX-NEXT: vpinsrw $5, %eax, %xmm0, %xmm0 +; AVX-NEXT: vpinsrw $6, %ebp, %xmm0, %xmm0 +; AVX-NEXT: vpinsrw $7, %ebx, %xmm0, %xmm0 +; AVX-NEXT: vmovd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Folded Reload +; AVX-NEXT: # xmm1 = mem[0],zero,zero,zero +; AVX-NEXT: vpinsrw $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 4-byte Folded Reload +; AVX-NEXT: vpinsrw $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 4-byte Folded Reload +; AVX-NEXT: vpinsrw $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 4-byte Folded Reload +; AVX-NEXT: vpinsrw $4, %r14d, %xmm1, %xmm1 +; AVX-NEXT: vpinsrw $5, %r15d, %xmm1, %xmm1 +; AVX-NEXT: vpinsrw $6, %r12d, %xmm1, %xmm1 +; AVX-NEXT: vpinsrw $7, %r13d, %xmm1, %xmm1 +; AVX-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload +; AVX-NEXT: vmovq %xmm0, 16(%rax) +; AVX-NEXT: vmovdqa %xmm1, (%rax) +; AVX-NEXT: popq %rbx +; AVX-NEXT: popq %r12 +; AVX-NEXT: popq %r13 +; AVX-NEXT: popq %r14 +; AVX-NEXT: popq %r15 +; AVX-NEXT: popq %rbp +; AVX-NEXT: retq + %x = load <12 x i16>, <12 x i16>* %px + %y = load <12 x i16>, <12 x i16>* %py + %z = call <12 x i16> @llvm.sadd.sat.v12i16(<12 x i16> %x, <12 x i16> %y) + store <12 x i16> %z, <12 x i16>* %pz + ret void +} + +; Scalarization + +define void @v1i8(<1 x i8>* %px, <1 x i8>* %py, <1 x i8>* %pz) nounwind { +; SSE-LABEL: v1i8: +; SSE: # %bb.0: +; SSE-NEXT: movb (%rdi), %cl +; SSE-NEXT: movb (%rsi), %dil +; SSE-NEXT: movl %ecx, %eax +; SSE-NEXT: addb %dil, %al +; SSE-NEXT: setns %sil +; SSE-NEXT: addb %dil, %cl +; SSE-NEXT: jno .LBB13_2 +; SSE-NEXT: # %bb.1: +; SSE-NEXT: addb $127, %sil +; SSE-NEXT: movl %esi, %ecx +; SSE-NEXT: .LBB13_2: +; SSE-NEXT: movb %cl, (%rdx) +; SSE-NEXT: retq +; +; AVX-LABEL: v1i8: +; AVX: # %bb.0: +; AVX-NEXT: movb (%rdi), %cl +; AVX-NEXT: movb (%rsi), %dil +; AVX-NEXT: movl %ecx, %eax +; AVX-NEXT: addb %dil, %al +; AVX-NEXT: setns %sil +; AVX-NEXT: addb %dil, %cl +; AVX-NEXT: jno .LBB13_2 +; AVX-NEXT: # %bb.1: +; AVX-NEXT: addb $127, %sil +; AVX-NEXT: movl %esi, %ecx +; AVX-NEXT: .LBB13_2: +; AVX-NEXT: movb %cl, (%rdx) +; AVX-NEXT: retq + %x = load <1 x i8>, <1 x i8>* %px + %y = load <1 x i8>, <1 x i8>* %py + %z = call <1 x i8> @llvm.sadd.sat.v1i8(<1 x i8> %x, <1 x i8> %y) + store <1 x i8> %z, <1 x i8>* %pz + ret void +} + +define void @v1i16(<1 x i16>* %px, <1 x i16>* %py, <1 x i16>* %pz) nounwind { +; SSE-LABEL: v1i16: +; SSE: # %bb.0: +; SSE-NEXT: movzwl (%rdi), %eax +; SSE-NEXT: movzwl (%rsi), %ecx +; SSE-NEXT: xorl %esi, %esi +; SSE-NEXT: movl %eax, 
%edi +; SSE-NEXT: addw %cx, %di +; SSE-NEXT: setns %sil +; SSE-NEXT: addl $32767, %esi # imm = 0x7FFF +; SSE-NEXT: addw %cx, %ax +; SSE-NEXT: cmovol %esi, %eax +; SSE-NEXT: movw %ax, (%rdx) +; SSE-NEXT: retq +; +; AVX-LABEL: v1i16: +; AVX: # %bb.0: +; AVX-NEXT: movzwl (%rdi), %eax +; AVX-NEXT: movzwl (%rsi), %ecx +; AVX-NEXT: xorl %esi, %esi +; AVX-NEXT: movl %eax, %edi +; AVX-NEXT: addw %cx, %di +; AVX-NEXT: setns %sil +; AVX-NEXT: addl $32767, %esi # imm = 0x7FFF +; AVX-NEXT: addw %cx, %ax +; AVX-NEXT: cmovol %esi, %eax +; AVX-NEXT: movw %ax, (%rdx) +; AVX-NEXT: retq + %x = load <1 x i16>, <1 x i16>* %px + %y = load <1 x i16>, <1 x i16>* %py + %z = call <1 x i16> @llvm.sadd.sat.v1i16(<1 x i16> %x, <1 x i16> %y) + store <1 x i16> %z, <1 x i16>* %pz + ret void +} + +; Promotion + +define <16 x i4> @v16i4(<16 x i4> %x, <16 x i4> %y) nounwind { +; SSE2-LABEL: v16i4: +; SSE2: # %bb.0: +; SSE2-NEXT: pushq %rbp +; SSE2-NEXT: pushq %r15 +; SSE2-NEXT: pushq %r14 +; SSE2-NEXT: pushq %r13 +; SSE2-NEXT: pushq %r12 +; SSE2-NEXT: pushq %rbx +; SSE2-NEXT: psllw $4, %xmm1 +; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240] +; SSE2-NEXT: pand %xmm2, %xmm1 +; SSE2-NEXT: movdqa %xmm1, -{{[0-9]+}}(%rsp) +; SSE2-NEXT: psllw $4, %xmm0 +; SSE2-NEXT: pand %xmm2, %xmm0 +; SSE2-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp) +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %r9b +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %sil +; SSE2-NEXT: movl %r9d, %eax +; SSE2-NEXT: addb %cl, %al +; SSE2-NEXT: setns %al +; SSE2-NEXT: addb %cl, %r9b +; SSE2-NEXT: jno .LBB15_2 +; SSE2-NEXT: # %bb.1: +; SSE2-NEXT: addb $127, %al +; SSE2-NEXT: movl %eax, %r9d +; SSE2-NEXT: .LBB15_2: +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSE2-NEXT: movl %esi, %eax +; SSE2-NEXT: addb %cl, %al +; SSE2-NEXT: setns %al +; SSE2-NEXT: addb %cl, %sil +; SSE2-NEXT: jno .LBB15_4 +; SSE2-NEXT: # %bb.3: +; SSE2-NEXT: addb $127, %al +; SSE2-NEXT: movl %eax, %esi +; SSE2-NEXT: .LBB15_4: +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %bl +; SSE2-NEXT: movl %ebx, %eax +; SSE2-NEXT: addb %cl, %al +; SSE2-NEXT: setns %al +; SSE2-NEXT: addb %cl, %bl +; SSE2-NEXT: jno .LBB15_6 +; SSE2-NEXT: # %bb.5: +; SSE2-NEXT: addb $127, %al +; SSE2-NEXT: movl %eax, %ebx +; SSE2-NEXT: .LBB15_6: +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %dl +; SSE2-NEXT: movl %edx, %eax +; SSE2-NEXT: addb %cl, %al +; SSE2-NEXT: setns %al +; SSE2-NEXT: addb %cl, %dl +; SSE2-NEXT: jno .LBB15_8 +; SSE2-NEXT: # %bb.7: +; SSE2-NEXT: addb $127, %al +; SSE2-NEXT: movl %eax, %edx +; SSE2-NEXT: .LBB15_8: +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %r10b +; SSE2-NEXT: movl %r10d, %eax +; SSE2-NEXT: addb %cl, %al +; SSE2-NEXT: setns %al +; SSE2-NEXT: addb %cl, %r10b +; SSE2-NEXT: jno .LBB15_10 +; SSE2-NEXT: # %bb.9: +; SSE2-NEXT: addb $127, %al +; SSE2-NEXT: movl %eax, %r10d +; SSE2-NEXT: .LBB15_10: +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %r11b +; SSE2-NEXT: movl %r11d, %eax +; SSE2-NEXT: addb %cl, %al +; SSE2-NEXT: setns %al +; SSE2-NEXT: addb %cl, %r11b +; SSE2-NEXT: jno .LBB15_12 +; SSE2-NEXT: # %bb.11: +; SSE2-NEXT: addb $127, %al +; SSE2-NEXT: movl %eax, %r11d +; SSE2-NEXT: .LBB15_12: +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %bpl +; SSE2-NEXT: movl %ebp, %eax +; SSE2-NEXT: addb %cl, %al +; SSE2-NEXT: setns %al +; SSE2-NEXT: addb %cl, %bpl +; 
SSE2-NEXT: jno .LBB15_14 +; SSE2-NEXT: # %bb.13: +; SSE2-NEXT: addb $127, %al +; SSE2-NEXT: movl %eax, %ebp +; SSE2-NEXT: .LBB15_14: +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %r14b +; SSE2-NEXT: movl %r14d, %eax +; SSE2-NEXT: addb %cl, %al +; SSE2-NEXT: setns %al +; SSE2-NEXT: addb %cl, %r14b +; SSE2-NEXT: jno .LBB15_16 +; SSE2-NEXT: # %bb.15: +; SSE2-NEXT: addb $127, %al +; SSE2-NEXT: movl %eax, %r14d +; SSE2-NEXT: .LBB15_16: +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %r15b +; SSE2-NEXT: movl %r15d, %eax +; SSE2-NEXT: addb %cl, %al +; SSE2-NEXT: setns %al +; SSE2-NEXT: addb %cl, %r15b +; SSE2-NEXT: jno .LBB15_18 +; SSE2-NEXT: # %bb.17: +; SSE2-NEXT: addb $127, %al +; SSE2-NEXT: movl %eax, %r15d +; SSE2-NEXT: .LBB15_18: +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %r12b +; SSE2-NEXT: movl %r12d, %eax +; SSE2-NEXT: addb %cl, %al +; SSE2-NEXT: setns %al +; SSE2-NEXT: addb %cl, %r12b +; SSE2-NEXT: jno .LBB15_20 +; SSE2-NEXT: # %bb.19: +; SSE2-NEXT: addb $127, %al +; SSE2-NEXT: movl %eax, %r12d +; SSE2-NEXT: .LBB15_20: +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %r13b +; SSE2-NEXT: movl %r13d, %eax +; SSE2-NEXT: addb %cl, %al +; SSE2-NEXT: setns %al +; SSE2-NEXT: addb %cl, %r13b +; SSE2-NEXT: movb %bl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSE2-NEXT: movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSE2-NEXT: jno .LBB15_22 +; SSE2-NEXT: # %bb.21: +; SSE2-NEXT: addb $127, %al +; SSE2-NEXT: movl %eax, %r13d +; SSE2-NEXT: .LBB15_22: +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %dil +; SSE2-NEXT: movl %edi, %eax +; SSE2-NEXT: addb %cl, %al +; SSE2-NEXT: setns %al +; SSE2-NEXT: addb %cl, %dil +; SSE2-NEXT: jno .LBB15_24 +; SSE2-NEXT: # %bb.23: +; SSE2-NEXT: addb $127, %al +; SSE2-NEXT: movl %eax, %edi +; SSE2-NEXT: .LBB15_24: +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %r8b +; SSE2-NEXT: movl %r8d, %eax +; SSE2-NEXT: addb %cl, %al +; SSE2-NEXT: setns %al +; SSE2-NEXT: addb %cl, %r8b +; SSE2-NEXT: jno .LBB15_26 +; SSE2-NEXT: # %bb.25: +; SSE2-NEXT: addb $127, %al +; SSE2-NEXT: movl %eax, %r8d +; SSE2-NEXT: .LBB15_26: +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %bl +; SSE2-NEXT: movl %ebx, %eax +; SSE2-NEXT: addb %cl, %al +; SSE2-NEXT: setns %al +; SSE2-NEXT: addb %cl, %bl +; SSE2-NEXT: jno .LBB15_28 +; SSE2-NEXT: # %bb.27: +; SSE2-NEXT: addb $127, %al +; SSE2-NEXT: movl %eax, %ebx +; SSE2-NEXT: .LBB15_28: +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %dl +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSE2-NEXT: movl %eax, %ecx +; SSE2-NEXT: addb %dl, %cl +; SSE2-NEXT: setns %cl +; SSE2-NEXT: addb %dl, %al +; SSE2-NEXT: movb %sil, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSE2-NEXT: jno .LBB15_30 +; SSE2-NEXT: # %bb.29: +; SSE2-NEXT: addb $127, %cl +; SSE2-NEXT: movl %ecx, %eax +; SSE2-NEXT: .LBB15_30: +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %dl +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %sil +; SSE2-NEXT: movl %esi, %ecx +; SSE2-NEXT: addb %dl, %cl +; SSE2-NEXT: setns %cl +; SSE2-NEXT: addb %dl, %sil +; SSE2-NEXT: jno .LBB15_32 +; SSE2-NEXT: # %bb.31: +; SSE2-NEXT: addb $127, %cl +; SSE2-NEXT: movl %ecx, %esi +; SSE2-NEXT: .LBB15_32: +; SSE2-NEXT: movzbl %sil, %ecx +; SSE2-NEXT: movd %ecx, %xmm0 +; SSE2-NEXT: movzbl %al, %eax +; SSE2-NEXT: movd %eax, %xmm1 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = 
xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; SSE2-NEXT: movzbl %bl, %eax +; SSE2-NEXT: movd %eax, %xmm0 +; SSE2-NEXT: movzbl %r8b, %eax +; SSE2-NEXT: movd %eax, %xmm2 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; SSE2-NEXT: movzbl %dil, %eax +; SSE2-NEXT: movd %eax, %xmm0 +; SSE2-NEXT: movzbl %r13b, %eax +; SSE2-NEXT: movd %eax, %xmm3 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] +; SSE2-NEXT: movzbl %r12b, %eax +; SSE2-NEXT: movd %eax, %xmm0 +; SSE2-NEXT: movzbl %r15b, %eax +; SSE2-NEXT: movd %eax, %xmm1 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; SSE2-NEXT: movzbl %r14b, %eax +; SSE2-NEXT: movd %eax, %xmm0 +; SSE2-NEXT: movzbl %bpl, %eax +; SSE2-NEXT: movd %eax, %xmm2 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] +; SSE2-NEXT: movzbl %r11b, %eax +; SSE2-NEXT: movd %eax, %xmm0 +; SSE2-NEXT: movzbl %r10b, %eax +; SSE2-NEXT: movd %eax, %xmm3 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; SSE2-NEXT: movd %eax, %xmm0 +; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; SSE2-NEXT: movd %eax, %xmm2 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] +; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; SSE2-NEXT: movd %eax, %xmm4 +; SSE2-NEXT: movzbl %r9b, %eax +; SSE2-NEXT: movd %eax, %xmm0 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSE2-NEXT: psrlw $4, %xmm0 +; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 +; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] +; SSE2-NEXT: pxor %xmm1, %xmm0 +; SSE2-NEXT: psubb %xmm1, %xmm0 +; SSE2-NEXT: popq %rbx +; SSE2-NEXT: popq %r12 +; SSE2-NEXT: popq %r13 +; SSE2-NEXT: popq %r14 +; SSE2-NEXT: popq %r15 +; SSE2-NEXT: popq %rbp +; SSE2-NEXT: retq +; +; SSSE3-LABEL: v16i4: +; SSSE3: # %bb.0: +; SSSE3-NEXT: pushq %rbp +; SSSE3-NEXT: pushq %r15 +; SSSE3-NEXT: pushq %r14 +; SSSE3-NEXT: pushq %r13 +; SSSE3-NEXT: pushq %r12 +; SSSE3-NEXT: pushq %rbx +; SSSE3-NEXT: psllw $4, %xmm1 +; SSSE3-NEXT: 
movdqa {{.*#+}} xmm2 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240] +; SSSE3-NEXT: pand %xmm2, %xmm1 +; SSSE3-NEXT: movdqa %xmm1, -{{[0-9]+}}(%rsp) +; SSSE3-NEXT: psllw $4, %xmm0 +; SSSE3-NEXT: pand %xmm2, %xmm0 +; SSSE3-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp) +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %r9b +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %sil +; SSSE3-NEXT: movl %r9d, %eax +; SSSE3-NEXT: addb %cl, %al +; SSSE3-NEXT: setns %al +; SSSE3-NEXT: addb %cl, %r9b +; SSSE3-NEXT: jno .LBB15_2 +; SSSE3-NEXT: # %bb.1: +; SSSE3-NEXT: addb $127, %al +; SSSE3-NEXT: movl %eax, %r9d +; SSSE3-NEXT: .LBB15_2: +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSSE3-NEXT: movl %esi, %eax +; SSSE3-NEXT: addb %cl, %al +; SSSE3-NEXT: setns %al +; SSSE3-NEXT: addb %cl, %sil +; SSSE3-NEXT: jno .LBB15_4 +; SSSE3-NEXT: # %bb.3: +; SSSE3-NEXT: addb $127, %al +; SSSE3-NEXT: movl %eax, %esi +; SSSE3-NEXT: .LBB15_4: +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %bl +; SSSE3-NEXT: movl %ebx, %eax +; SSSE3-NEXT: addb %cl, %al +; SSSE3-NEXT: setns %al +; SSSE3-NEXT: addb %cl, %bl +; SSSE3-NEXT: jno .LBB15_6 +; SSSE3-NEXT: # %bb.5: +; SSSE3-NEXT: addb $127, %al +; SSSE3-NEXT: movl %eax, %ebx +; SSSE3-NEXT: .LBB15_6: +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %dl +; SSSE3-NEXT: movl %edx, %eax +; SSSE3-NEXT: addb %cl, %al +; SSSE3-NEXT: setns %al +; SSSE3-NEXT: addb %cl, %dl +; SSSE3-NEXT: jno .LBB15_8 +; SSSE3-NEXT: # %bb.7: +; SSSE3-NEXT: addb $127, %al +; SSSE3-NEXT: movl %eax, %edx +; SSSE3-NEXT: .LBB15_8: +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %r10b +; SSSE3-NEXT: movl %r10d, %eax +; SSSE3-NEXT: addb %cl, %al +; SSSE3-NEXT: setns %al +; SSSE3-NEXT: addb %cl, %r10b +; SSSE3-NEXT: jno .LBB15_10 +; SSSE3-NEXT: # %bb.9: +; SSSE3-NEXT: addb $127, %al +; SSSE3-NEXT: movl %eax, %r10d +; SSSE3-NEXT: .LBB15_10: +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %r11b +; SSSE3-NEXT: movl %r11d, %eax +; SSSE3-NEXT: addb %cl, %al +; SSSE3-NEXT: setns %al +; SSSE3-NEXT: addb %cl, %r11b +; SSSE3-NEXT: jno .LBB15_12 +; SSSE3-NEXT: # %bb.11: +; SSSE3-NEXT: addb $127, %al +; SSSE3-NEXT: movl %eax, %r11d +; SSSE3-NEXT: .LBB15_12: +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %bpl +; SSSE3-NEXT: movl %ebp, %eax +; SSSE3-NEXT: addb %cl, %al +; SSSE3-NEXT: setns %al +; SSSE3-NEXT: addb %cl, %bpl +; SSSE3-NEXT: jno .LBB15_14 +; SSSE3-NEXT: # %bb.13: +; SSSE3-NEXT: addb $127, %al +; SSSE3-NEXT: movl %eax, %ebp +; SSSE3-NEXT: .LBB15_14: +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %r14b +; SSSE3-NEXT: movl %r14d, %eax +; SSSE3-NEXT: addb %cl, %al +; SSSE3-NEXT: setns %al +; SSSE3-NEXT: addb %cl, %r14b +; SSSE3-NEXT: jno .LBB15_16 +; SSSE3-NEXT: # %bb.15: +; SSSE3-NEXT: addb $127, %al +; SSSE3-NEXT: movl %eax, %r14d +; SSSE3-NEXT: .LBB15_16: +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %r15b +; SSSE3-NEXT: movl %r15d, %eax +; SSSE3-NEXT: addb %cl, %al +; SSSE3-NEXT: setns %al +; SSSE3-NEXT: addb %cl, %r15b +; SSSE3-NEXT: jno .LBB15_18 +; SSSE3-NEXT: # %bb.17: +; SSSE3-NEXT: addb $127, %al +; SSSE3-NEXT: movl %eax, %r15d +; SSSE3-NEXT: .LBB15_18: +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %r12b +; SSSE3-NEXT: movl %r12d, %eax +; SSSE3-NEXT: addb %cl, %al +; SSSE3-NEXT: setns %al +; SSSE3-NEXT: 
addb %cl, %r12b +; SSSE3-NEXT: jno .LBB15_20 +; SSSE3-NEXT: # %bb.19: +; SSSE3-NEXT: addb $127, %al +; SSSE3-NEXT: movl %eax, %r12d +; SSSE3-NEXT: .LBB15_20: +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %r13b +; SSSE3-NEXT: movl %r13d, %eax +; SSSE3-NEXT: addb %cl, %al +; SSSE3-NEXT: setns %al +; SSSE3-NEXT: addb %cl, %r13b +; SSSE3-NEXT: movb %bl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSSE3-NEXT: movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSSE3-NEXT: jno .LBB15_22 +; SSSE3-NEXT: # %bb.21: +; SSSE3-NEXT: addb $127, %al +; SSSE3-NEXT: movl %eax, %r13d +; SSSE3-NEXT: .LBB15_22: +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %dil +; SSSE3-NEXT: movl %edi, %eax +; SSSE3-NEXT: addb %cl, %al +; SSSE3-NEXT: setns %al +; SSSE3-NEXT: addb %cl, %dil +; SSSE3-NEXT: jno .LBB15_24 +; SSSE3-NEXT: # %bb.23: +; SSSE3-NEXT: addb $127, %al +; SSSE3-NEXT: movl %eax, %edi +; SSSE3-NEXT: .LBB15_24: +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %r8b +; SSSE3-NEXT: movl %r8d, %eax +; SSSE3-NEXT: addb %cl, %al +; SSSE3-NEXT: setns %al +; SSSE3-NEXT: addb %cl, %r8b +; SSSE3-NEXT: jno .LBB15_26 +; SSSE3-NEXT: # %bb.25: +; SSSE3-NEXT: addb $127, %al +; SSSE3-NEXT: movl %eax, %r8d +; SSSE3-NEXT: .LBB15_26: +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %bl +; SSSE3-NEXT: movl %ebx, %eax +; SSSE3-NEXT: addb %cl, %al +; SSSE3-NEXT: setns %al +; SSSE3-NEXT: addb %cl, %bl +; SSSE3-NEXT: jno .LBB15_28 +; SSSE3-NEXT: # %bb.27: +; SSSE3-NEXT: addb $127, %al +; SSSE3-NEXT: movl %eax, %ebx +; SSSE3-NEXT: .LBB15_28: +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %dl +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSSE3-NEXT: movl %eax, %ecx +; SSSE3-NEXT: addb %dl, %cl +; SSSE3-NEXT: setns %cl +; SSSE3-NEXT: addb %dl, %al +; SSSE3-NEXT: movb %sil, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSSE3-NEXT: jno .LBB15_30 +; SSSE3-NEXT: # %bb.29: +; SSSE3-NEXT: addb $127, %cl +; SSSE3-NEXT: movl %ecx, %eax +; SSSE3-NEXT: .LBB15_30: +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %dl +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %sil +; SSSE3-NEXT: movl %esi, %ecx +; SSSE3-NEXT: addb %dl, %cl +; SSSE3-NEXT: setns %cl +; SSSE3-NEXT: addb %dl, %sil +; SSSE3-NEXT: jno .LBB15_32 +; SSSE3-NEXT: # %bb.31: +; SSSE3-NEXT: addb $127, %cl +; SSSE3-NEXT: movl %ecx, %esi +; SSSE3-NEXT: .LBB15_32: +; SSSE3-NEXT: movzbl %sil, %ecx +; SSSE3-NEXT: movd %ecx, %xmm0 +; SSSE3-NEXT: movzbl %al, %eax +; SSSE3-NEXT: movd %eax, %xmm1 +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; SSSE3-NEXT: movzbl %bl, %eax +; SSSE3-NEXT: movd %eax, %xmm0 +; SSSE3-NEXT: movzbl %r8b, %eax +; SSSE3-NEXT: movd %eax, %xmm2 +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; SSSE3-NEXT: movzbl %dil, %eax +; SSSE3-NEXT: movd %eax, %xmm0 +; SSSE3-NEXT: movzbl %r13b, %eax +; SSSE3-NEXT: movd %eax, %xmm3 +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] +; SSSE3-NEXT: movzbl %r12b, %eax +; SSSE3-NEXT: movd %eax, %xmm0 +; SSSE3-NEXT: movzbl %r15b, %eax +; SSSE3-NEXT: movd %eax, %xmm1 +; 
SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3] +; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; SSSE3-NEXT: movzbl %r14b, %eax +; SSSE3-NEXT: movd %eax, %xmm0 +; SSSE3-NEXT: movzbl %bpl, %eax +; SSSE3-NEXT: movd %eax, %xmm2 +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] +; SSSE3-NEXT: movzbl %r11b, %eax +; SSSE3-NEXT: movd %eax, %xmm0 +; SSSE3-NEXT: movzbl %r10b, %eax +; SSSE3-NEXT: movd %eax, %xmm3 +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; SSSE3-NEXT: movd %eax, %xmm0 +; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; SSSE3-NEXT: movd %eax, %xmm2 +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] +; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; SSSE3-NEXT: movd %eax, %xmm4 +; SSSE3-NEXT: movzbl %r9b, %eax +; SSSE3-NEXT: movd %eax, %xmm0 +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7] +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] +; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSSE3-NEXT: psrlw $4, %xmm0 +; SSSE3-NEXT: pand {{.*}}(%rip), %xmm0 +; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] +; SSSE3-NEXT: pxor %xmm1, %xmm0 +; SSSE3-NEXT: psubb %xmm1, %xmm0 +; SSSE3-NEXT: popq %rbx +; SSSE3-NEXT: popq %r12 +; SSSE3-NEXT: popq %r13 +; SSSE3-NEXT: popq %r14 +; SSSE3-NEXT: popq %r15 +; SSSE3-NEXT: popq %rbp +; SSSE3-NEXT: retq +; +; SSE41-LABEL: v16i4: +; SSE41: # %bb.0: +; SSE41-NEXT: psllw $4, %xmm1 +; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240] +; SSE41-NEXT: pand %xmm2, %xmm1 +; SSE41-NEXT: pextrb $15, %xmm1, %ecx +; SSE41-NEXT: psllw $4, %xmm0 +; SSE41-NEXT: pand %xmm2, %xmm0 +; SSE41-NEXT: pextrb $15, %xmm0, %edx +; SSE41-NEXT: movl %edx, %eax +; SSE41-NEXT: addb %cl, %al +; SSE41-NEXT: setns %al +; SSE41-NEXT: addb %cl, %dl +; SSE41-NEXT: jno .LBB15_2 +; SSE41-NEXT: # %bb.1: +; SSE41-NEXT: addb $127, %al +; SSE41-NEXT: movl %eax, %edx +; SSE41-NEXT: .LBB15_2: +; SSE41-NEXT: pextrb $14, %xmm1, %ecx +; SSE41-NEXT: pextrb $14, %xmm0, %r11d +; SSE41-NEXT: movl %r11d, %eax +; SSE41-NEXT: addb %cl, %al +; SSE41-NEXT: setns %al +; SSE41-NEXT: addb %cl, %r11b +; SSE41-NEXT: jno .LBB15_4 +; SSE41-NEXT: # %bb.3: +; SSE41-NEXT: addb $127, %al +; SSE41-NEXT: movl %eax, %r11d +; SSE41-NEXT: .LBB15_4: +; SSE41-NEXT: pextrb $13, %xmm1, %ecx +; SSE41-NEXT: pextrb $13, %xmm0, %edi +; SSE41-NEXT: movl %edi, %eax +; SSE41-NEXT: addb %cl, %al +; SSE41-NEXT: setns %al +; SSE41-NEXT: addb %cl, %dil +; SSE41-NEXT: jno 
.LBB15_6 +; SSE41-NEXT: # %bb.5: +; SSE41-NEXT: addb $127, %al +; SSE41-NEXT: movl %eax, %edi +; SSE41-NEXT: .LBB15_6: +; SSE41-NEXT: pushq %rbp +; SSE41-NEXT: pushq %r15 +; SSE41-NEXT: pushq %r14 +; SSE41-NEXT: pushq %r13 +; SSE41-NEXT: pushq %r12 +; SSE41-NEXT: pushq %rbx +; SSE41-NEXT: pextrb $12, %xmm1, %ecx +; SSE41-NEXT: pextrb $12, %xmm0, %r14d +; SSE41-NEXT: movl %r14d, %eax +; SSE41-NEXT: addb %cl, %al +; SSE41-NEXT: setns %al +; SSE41-NEXT: addb %cl, %r14b +; SSE41-NEXT: jno .LBB15_8 +; SSE41-NEXT: # %bb.7: +; SSE41-NEXT: addb $127, %al +; SSE41-NEXT: movl %eax, %r14d +; SSE41-NEXT: .LBB15_8: +; SSE41-NEXT: pextrb $11, %xmm1, %ecx +; SSE41-NEXT: pextrb $11, %xmm0, %ebp +; SSE41-NEXT: movl %ebp, %eax +; SSE41-NEXT: addb %cl, %al +; SSE41-NEXT: setns %al +; SSE41-NEXT: addb %cl, %bpl +; SSE41-NEXT: jno .LBB15_10 +; SSE41-NEXT: # %bb.9: +; SSE41-NEXT: addb $127, %al +; SSE41-NEXT: movl %eax, %ebp +; SSE41-NEXT: .LBB15_10: +; SSE41-NEXT: pextrb $10, %xmm1, %ecx +; SSE41-NEXT: pextrb $10, %xmm0, %r15d +; SSE41-NEXT: movl %r15d, %eax +; SSE41-NEXT: addb %cl, %al +; SSE41-NEXT: setns %al +; SSE41-NEXT: addb %cl, %r15b +; SSE41-NEXT: jno .LBB15_12 +; SSE41-NEXT: # %bb.11: +; SSE41-NEXT: addb $127, %al +; SSE41-NEXT: movl %eax, %r15d +; SSE41-NEXT: .LBB15_12: +; SSE41-NEXT: pextrb $9, %xmm1, %ecx +; SSE41-NEXT: pextrb $9, %xmm0, %r12d +; SSE41-NEXT: movl %r12d, %eax +; SSE41-NEXT: addb %cl, %al +; SSE41-NEXT: setns %al +; SSE41-NEXT: addb %cl, %r12b +; SSE41-NEXT: jno .LBB15_14 +; SSE41-NEXT: # %bb.13: +; SSE41-NEXT: addb $127, %al +; SSE41-NEXT: movl %eax, %r12d +; SSE41-NEXT: .LBB15_14: +; SSE41-NEXT: pextrb $8, %xmm1, %ecx +; SSE41-NEXT: pextrb $8, %xmm0, %r13d +; SSE41-NEXT: movl %r13d, %eax +; SSE41-NEXT: addb %cl, %al +; SSE41-NEXT: setns %al +; SSE41-NEXT: addb %cl, %r13b +; SSE41-NEXT: jno .LBB15_16 +; SSE41-NEXT: # %bb.15: +; SSE41-NEXT: addb $127, %al +; SSE41-NEXT: movl %eax, %r13d +; SSE41-NEXT: .LBB15_16: +; SSE41-NEXT: pextrb $7, %xmm1, %ecx +; SSE41-NEXT: pextrb $7, %xmm0, %r10d +; SSE41-NEXT: movl %r10d, %eax +; SSE41-NEXT: addb %cl, %al +; SSE41-NEXT: setns %al +; SSE41-NEXT: addb %cl, %r10b +; SSE41-NEXT: movl %ebp, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE41-NEXT: jno .LBB15_18 +; SSE41-NEXT: # %bb.17: +; SSE41-NEXT: addb $127, %al +; SSE41-NEXT: movl %eax, %r10d +; SSE41-NEXT: .LBB15_18: +; SSE41-NEXT: pextrb $6, %xmm1, %ecx +; SSE41-NEXT: pextrb $6, %xmm0, %r9d +; SSE41-NEXT: movl %r9d, %eax +; SSE41-NEXT: addb %cl, %al +; SSE41-NEXT: setns %al +; SSE41-NEXT: addb %cl, %r9b +; SSE41-NEXT: jno .LBB15_20 +; SSE41-NEXT: # %bb.19: +; SSE41-NEXT: addb $127, %al +; SSE41-NEXT: movl %eax, %r9d +; SSE41-NEXT: .LBB15_20: +; SSE41-NEXT: pextrb $5, %xmm1, %ecx +; SSE41-NEXT: pextrb $5, %xmm0, %ebp +; SSE41-NEXT: movl %ebp, %eax +; SSE41-NEXT: addb %cl, %al +; SSE41-NEXT: setns %al +; SSE41-NEXT: addb %cl, %bpl +; SSE41-NEXT: movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE41-NEXT: jno .LBB15_22 +; SSE41-NEXT: # %bb.21: +; SSE41-NEXT: addb $127, %al +; SSE41-NEXT: movl %eax, %ebp +; SSE41-NEXT: .LBB15_22: +; SSE41-NEXT: pextrb $4, %xmm1, %ecx +; SSE41-NEXT: pextrb $4, %xmm0, %edi +; SSE41-NEXT: movl %edi, %eax +; SSE41-NEXT: addb %cl, %al +; SSE41-NEXT: setns %al +; SSE41-NEXT: addb %cl, %dil +; SSE41-NEXT: movl %edx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE41-NEXT: jno .LBB15_24 +; SSE41-NEXT: # %bb.23: +; SSE41-NEXT: addb $127, %al +; SSE41-NEXT: movl %eax, %edi +; SSE41-NEXT: .LBB15_24: +; SSE41-NEXT: pextrb $3, %xmm1, %edx +; SSE41-NEXT: pextrb $3, %xmm0, 
%eax +; SSE41-NEXT: movl %eax, %ecx +; SSE41-NEXT: addb %dl, %cl +; SSE41-NEXT: setns %cl +; SSE41-NEXT: addb %dl, %al +; SSE41-NEXT: jno .LBB15_26 +; SSE41-NEXT: # %bb.25: +; SSE41-NEXT: addb $127, %cl +; SSE41-NEXT: movl %ecx, %eax +; SSE41-NEXT: .LBB15_26: +; SSE41-NEXT: pextrb $2, %xmm1, %ebx +; SSE41-NEXT: pextrb $2, %xmm0, %ecx +; SSE41-NEXT: movl %ecx, %edx +; SSE41-NEXT: addb %bl, %dl +; SSE41-NEXT: setns %dl +; SSE41-NEXT: addb %bl, %cl +; SSE41-NEXT: jno .LBB15_28 +; SSE41-NEXT: # %bb.27: +; SSE41-NEXT: addb $127, %dl +; SSE41-NEXT: movl %edx, %ecx +; SSE41-NEXT: .LBB15_28: +; SSE41-NEXT: pextrb $0, %xmm1, %esi +; SSE41-NEXT: pextrb $0, %xmm0, %edx +; SSE41-NEXT: movl %edx, %ebx +; SSE41-NEXT: addb %sil, %bl +; SSE41-NEXT: setns %bl +; SSE41-NEXT: addb %sil, %dl +; SSE41-NEXT: jno .LBB15_30 +; SSE41-NEXT: # %bb.29: +; SSE41-NEXT: addb $127, %bl +; SSE41-NEXT: movl %ebx, %edx +; SSE41-NEXT: .LBB15_30: +; SSE41-NEXT: pextrb $1, %xmm1, %esi +; SSE41-NEXT: pextrb $1, %xmm0, %r8d +; SSE41-NEXT: movl %r8d, %ebx +; SSE41-NEXT: addb %sil, %bl +; SSE41-NEXT: setns %bl +; SSE41-NEXT: addb %sil, %r8b +; SSE41-NEXT: jno .LBB15_32 +; SSE41-NEXT: # %bb.31: +; SSE41-NEXT: addb $127, %bl +; SSE41-NEXT: movl %ebx, %r8d +; SSE41-NEXT: .LBB15_32: +; SSE41-NEXT: movzbl %dl, %edx +; SSE41-NEXT: movd %edx, %xmm0 +; SSE41-NEXT: movzbl %r8b, %edx +; SSE41-NEXT: pinsrb $1, %edx, %xmm0 +; SSE41-NEXT: movzbl %cl, %ecx +; SSE41-NEXT: pinsrb $2, %ecx, %xmm0 +; SSE41-NEXT: movzbl %al, %eax +; SSE41-NEXT: pinsrb $3, %eax, %xmm0 +; SSE41-NEXT: movzbl %dil, %eax +; SSE41-NEXT: pinsrb $4, %eax, %xmm0 +; SSE41-NEXT: movzbl %bpl, %eax +; SSE41-NEXT: pinsrb $5, %eax, %xmm0 +; SSE41-NEXT: movzbl %r9b, %eax +; SSE41-NEXT: pinsrb $6, %eax, %xmm0 +; SSE41-NEXT: movzbl %r10b, %eax +; SSE41-NEXT: pinsrb $7, %eax, %xmm0 +; SSE41-NEXT: movzbl %r13b, %eax +; SSE41-NEXT: pinsrb $8, %eax, %xmm0 +; SSE41-NEXT: movzbl %r12b, %eax +; SSE41-NEXT: pinsrb $9, %eax, %xmm0 +; SSE41-NEXT: movzbl %r15b, %eax +; SSE41-NEXT: pinsrb $10, %eax, %xmm0 +; SSE41-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; SSE41-NEXT: pinsrb $11, %eax, %xmm0 +; SSE41-NEXT: movzbl %r14b, %eax +; SSE41-NEXT: pinsrb $12, %eax, %xmm0 +; SSE41-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; SSE41-NEXT: pinsrb $13, %eax, %xmm0 +; SSE41-NEXT: movzbl %r11b, %eax +; SSE41-NEXT: pinsrb $14, %eax, %xmm0 +; SSE41-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; SSE41-NEXT: pinsrb $15, %eax, %xmm0 +; SSE41-NEXT: psrlw $4, %xmm0 +; SSE41-NEXT: pand {{.*}}(%rip), %xmm0 +; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] +; SSE41-NEXT: pxor %xmm1, %xmm0 +; SSE41-NEXT: psubb %xmm1, %xmm0 +; SSE41-NEXT: popq %rbx +; SSE41-NEXT: popq %r12 +; SSE41-NEXT: popq %r13 +; SSE41-NEXT: popq %r14 +; SSE41-NEXT: popq %r15 +; SSE41-NEXT: popq %rbp +; SSE41-NEXT: retq +; +; AVX-LABEL: v16i4: +; AVX: # %bb.0: +; AVX-NEXT: vpsllw $4, %xmm1, %xmm1 +; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240] +; AVX-NEXT: vpand %xmm2, %xmm1, %xmm1 +; AVX-NEXT: vpextrb $15, %xmm1, %ecx +; AVX-NEXT: vpsllw $4, %xmm0, %xmm0 +; AVX-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX-NEXT: vpextrb $15, %xmm0, %edx +; AVX-NEXT: movl %edx, %eax +; AVX-NEXT: addb %cl, %al +; AVX-NEXT: setns %al +; AVX-NEXT: addb %cl, %dl +; AVX-NEXT: jno .LBB15_2 +; AVX-NEXT: # %bb.1: +; AVX-NEXT: addb $127, %al +; AVX-NEXT: movl %eax, %edx +; AVX-NEXT: .LBB15_2: +; AVX-NEXT: vpextrb $14, %xmm1, %ecx +; 
AVX-NEXT: vpextrb $14, %xmm0, %r11d +; AVX-NEXT: movl %r11d, %eax +; AVX-NEXT: addb %cl, %al +; AVX-NEXT: setns %al +; AVX-NEXT: addb %cl, %r11b +; AVX-NEXT: jno .LBB15_4 +; AVX-NEXT: # %bb.3: +; AVX-NEXT: addb $127, %al +; AVX-NEXT: movl %eax, %r11d +; AVX-NEXT: .LBB15_4: +; AVX-NEXT: vpextrb $13, %xmm1, %ecx +; AVX-NEXT: vpextrb $13, %xmm0, %edi +; AVX-NEXT: movl %edi, %eax +; AVX-NEXT: addb %cl, %al +; AVX-NEXT: setns %al +; AVX-NEXT: addb %cl, %dil +; AVX-NEXT: jno .LBB15_6 +; AVX-NEXT: # %bb.5: +; AVX-NEXT: addb $127, %al +; AVX-NEXT: movl %eax, %edi +; AVX-NEXT: .LBB15_6: +; AVX-NEXT: pushq %rbp +; AVX-NEXT: pushq %r15 +; AVX-NEXT: pushq %r14 +; AVX-NEXT: pushq %r13 +; AVX-NEXT: pushq %r12 +; AVX-NEXT: pushq %rbx +; AVX-NEXT: vpextrb $12, %xmm1, %ecx +; AVX-NEXT: vpextrb $12, %xmm0, %r14d +; AVX-NEXT: movl %r14d, %eax +; AVX-NEXT: addb %cl, %al +; AVX-NEXT: setns %al +; AVX-NEXT: addb %cl, %r14b +; AVX-NEXT: jno .LBB15_8 +; AVX-NEXT: # %bb.7: +; AVX-NEXT: addb $127, %al +; AVX-NEXT: movl %eax, %r14d +; AVX-NEXT: .LBB15_8: +; AVX-NEXT: vpextrb $11, %xmm1, %ecx +; AVX-NEXT: vpextrb $11, %xmm0, %ebp +; AVX-NEXT: movl %ebp, %eax +; AVX-NEXT: addb %cl, %al +; AVX-NEXT: setns %al +; AVX-NEXT: addb %cl, %bpl +; AVX-NEXT: jno .LBB15_10 +; AVX-NEXT: # %bb.9: +; AVX-NEXT: addb $127, %al +; AVX-NEXT: movl %eax, %ebp +; AVX-NEXT: .LBB15_10: +; AVX-NEXT: vpextrb $10, %xmm1, %ecx +; AVX-NEXT: vpextrb $10, %xmm0, %r15d +; AVX-NEXT: movl %r15d, %eax +; AVX-NEXT: addb %cl, %al +; AVX-NEXT: setns %al +; AVX-NEXT: addb %cl, %r15b +; AVX-NEXT: jno .LBB15_12 +; AVX-NEXT: # %bb.11: +; AVX-NEXT: addb $127, %al +; AVX-NEXT: movl %eax, %r15d +; AVX-NEXT: .LBB15_12: +; AVX-NEXT: vpextrb $9, %xmm1, %ecx +; AVX-NEXT: vpextrb $9, %xmm0, %r12d +; AVX-NEXT: movl %r12d, %eax +; AVX-NEXT: addb %cl, %al +; AVX-NEXT: setns %al +; AVX-NEXT: addb %cl, %r12b +; AVX-NEXT: jno .LBB15_14 +; AVX-NEXT: # %bb.13: +; AVX-NEXT: addb $127, %al +; AVX-NEXT: movl %eax, %r12d +; AVX-NEXT: .LBB15_14: +; AVX-NEXT: vpextrb $8, %xmm1, %ecx +; AVX-NEXT: vpextrb $8, %xmm0, %r13d +; AVX-NEXT: movl %r13d, %eax +; AVX-NEXT: addb %cl, %al +; AVX-NEXT: setns %al +; AVX-NEXT: addb %cl, %r13b +; AVX-NEXT: jno .LBB15_16 +; AVX-NEXT: # %bb.15: +; AVX-NEXT: addb $127, %al +; AVX-NEXT: movl %eax, %r13d +; AVX-NEXT: .LBB15_16: +; AVX-NEXT: vpextrb $7, %xmm1, %ecx +; AVX-NEXT: vpextrb $7, %xmm0, %r10d +; AVX-NEXT: movl %r10d, %eax +; AVX-NEXT: addb %cl, %al +; AVX-NEXT: setns %al +; AVX-NEXT: addb %cl, %r10b +; AVX-NEXT: movl %ebp, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX-NEXT: jno .LBB15_18 +; AVX-NEXT: # %bb.17: +; AVX-NEXT: addb $127, %al +; AVX-NEXT: movl %eax, %r10d +; AVX-NEXT: .LBB15_18: +; AVX-NEXT: vpextrb $6, %xmm1, %ecx +; AVX-NEXT: vpextrb $6, %xmm0, %r9d +; AVX-NEXT: movl %r9d, %eax +; AVX-NEXT: addb %cl, %al +; AVX-NEXT: setns %al +; AVX-NEXT: addb %cl, %r9b +; AVX-NEXT: jno .LBB15_20 +; AVX-NEXT: # %bb.19: +; AVX-NEXT: addb $127, %al +; AVX-NEXT: movl %eax, %r9d +; AVX-NEXT: .LBB15_20: +; AVX-NEXT: vpextrb $5, %xmm1, %ecx +; AVX-NEXT: vpextrb $5, %xmm0, %ebp +; AVX-NEXT: movl %ebp, %eax +; AVX-NEXT: addb %cl, %al +; AVX-NEXT: setns %al +; AVX-NEXT: addb %cl, %bpl +; AVX-NEXT: movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX-NEXT: jno .LBB15_22 +; AVX-NEXT: # %bb.21: +; AVX-NEXT: addb $127, %al +; AVX-NEXT: movl %eax, %ebp +; AVX-NEXT: .LBB15_22: +; AVX-NEXT: vpextrb $4, %xmm1, %ecx +; AVX-NEXT: vpextrb $4, %xmm0, %edi +; AVX-NEXT: movl %edi, %eax +; AVX-NEXT: addb %cl, %al +; AVX-NEXT: setns %al +; AVX-NEXT: addb %cl, 
%dil +; AVX-NEXT: movl %edx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX-NEXT: jno .LBB15_24 +; AVX-NEXT: # %bb.23: +; AVX-NEXT: addb $127, %al +; AVX-NEXT: movl %eax, %edi +; AVX-NEXT: .LBB15_24: +; AVX-NEXT: vpextrb $3, %xmm1, %edx +; AVX-NEXT: vpextrb $3, %xmm0, %eax +; AVX-NEXT: movl %eax, %ecx +; AVX-NEXT: addb %dl, %cl +; AVX-NEXT: setns %cl +; AVX-NEXT: addb %dl, %al +; AVX-NEXT: jno .LBB15_26 +; AVX-NEXT: # %bb.25: +; AVX-NEXT: addb $127, %cl +; AVX-NEXT: movl %ecx, %eax +; AVX-NEXT: .LBB15_26: +; AVX-NEXT: vpextrb $2, %xmm1, %ebx +; AVX-NEXT: vpextrb $2, %xmm0, %ecx +; AVX-NEXT: movl %ecx, %edx +; AVX-NEXT: addb %bl, %dl +; AVX-NEXT: setns %dl +; AVX-NEXT: addb %bl, %cl +; AVX-NEXT: jno .LBB15_28 +; AVX-NEXT: # %bb.27: +; AVX-NEXT: addb $127, %dl +; AVX-NEXT: movl %edx, %ecx +; AVX-NEXT: .LBB15_28: +; AVX-NEXT: vpextrb $0, %xmm1, %esi +; AVX-NEXT: vpextrb $0, %xmm0, %edx +; AVX-NEXT: movl %edx, %ebx +; AVX-NEXT: addb %sil, %bl +; AVX-NEXT: setns %bl +; AVX-NEXT: addb %sil, %dl +; AVX-NEXT: jno .LBB15_30 +; AVX-NEXT: # %bb.29: +; AVX-NEXT: addb $127, %bl +; AVX-NEXT: movl %ebx, %edx +; AVX-NEXT: .LBB15_30: +; AVX-NEXT: vpextrb $1, %xmm1, %esi +; AVX-NEXT: vpextrb $1, %xmm0, %r8d +; AVX-NEXT: movl %r8d, %ebx +; AVX-NEXT: addb %sil, %bl +; AVX-NEXT: setns %bl +; AVX-NEXT: addb %sil, %r8b +; AVX-NEXT: jno .LBB15_32 +; AVX-NEXT: # %bb.31: +; AVX-NEXT: addb $127, %bl +; AVX-NEXT: movl %ebx, %r8d +; AVX-NEXT: .LBB15_32: +; AVX-NEXT: movzbl %dl, %edx +; AVX-NEXT: vmovd %edx, %xmm0 +; AVX-NEXT: movzbl %r8b, %edx +; AVX-NEXT: vpinsrb $1, %edx, %xmm0, %xmm0 +; AVX-NEXT: movzbl %cl, %ecx +; AVX-NEXT: vpinsrb $2, %ecx, %xmm0, %xmm0 +; AVX-NEXT: movzbl %al, %eax +; AVX-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0 +; AVX-NEXT: movzbl %dil, %eax +; AVX-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 +; AVX-NEXT: movzbl %bpl, %eax +; AVX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; AVX-NEXT: movzbl %r9b, %eax +; AVX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 +; AVX-NEXT: movzbl %r10b, %eax +; AVX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; AVX-NEXT: movzbl %r13b, %eax +; AVX-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0 +; AVX-NEXT: movzbl %r12b, %eax +; AVX-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0 +; AVX-NEXT: movzbl %r15b, %eax +; AVX-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0 +; AVX-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; AVX-NEXT: movzbl %r14b, %eax +; AVX-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0 +; AVX-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0 +; AVX-NEXT: movzbl %r11b, %eax +; AVX-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0 +; AVX-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; AVX-NEXT: vpsrlw $4, %xmm0, %xmm0 +; AVX-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 +; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] +; AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpsubb %xmm1, %xmm0, %xmm0 +; AVX-NEXT: popq %rbx +; AVX-NEXT: popq %r12 +; AVX-NEXT: popq %r13 +; AVX-NEXT: popq %r14 +; AVX-NEXT: popq %r15 +; AVX-NEXT: popq %rbp +; AVX-NEXT: retq + %z = call <16 x i4> @llvm.sadd.sat.v16i4(<16 x i4> %x, <16 x i4> %y) + ret <16 x i4> %z +} + +define <16 x i1> @v16i1(<16 x i1> %x, <16 x i1> %y) nounwind { +; SSE2-LABEL: v16i1: +; SSE2: # %bb.0: +; SSE2-NEXT: pushq %rbp +; SSE2-NEXT: pushq %r15 +; SSE2-NEXT: pushq %r14 +; SSE2-NEXT: pushq %r13 +; SSE2-NEXT: pushq %r12 +; SSE2-NEXT: pushq %rbx +; SSE2-NEXT: psllw $7, %xmm1 +; 
SSE2-NEXT: movdqa {{.*#+}} xmm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128] +; SSE2-NEXT: pand %xmm2, %xmm1 +; SSE2-NEXT: movdqa %xmm1, -{{[0-9]+}}(%rsp) +; SSE2-NEXT: psllw $7, %xmm0 +; SSE2-NEXT: pand %xmm2, %xmm0 +; SSE2-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp) +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %r9b +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %sil +; SSE2-NEXT: movl %r9d, %eax +; SSE2-NEXT: addb %cl, %al +; SSE2-NEXT: setns %al +; SSE2-NEXT: addb %cl, %r9b +; SSE2-NEXT: jno .LBB16_2 +; SSE2-NEXT: # %bb.1: +; SSE2-NEXT: addb $127, %al +; SSE2-NEXT: movl %eax, %r9d +; SSE2-NEXT: .LBB16_2: +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSE2-NEXT: movl %esi, %eax +; SSE2-NEXT: addb %cl, %al +; SSE2-NEXT: setns %al +; SSE2-NEXT: addb %cl, %sil +; SSE2-NEXT: jno .LBB16_4 +; SSE2-NEXT: # %bb.3: +; SSE2-NEXT: addb $127, %al +; SSE2-NEXT: movl %eax, %esi +; SSE2-NEXT: .LBB16_4: +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %bl +; SSE2-NEXT: movl %ebx, %eax +; SSE2-NEXT: addb %cl, %al +; SSE2-NEXT: setns %al +; SSE2-NEXT: addb %cl, %bl +; SSE2-NEXT: jno .LBB16_6 +; SSE2-NEXT: # %bb.5: +; SSE2-NEXT: addb $127, %al +; SSE2-NEXT: movl %eax, %ebx +; SSE2-NEXT: .LBB16_6: +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %dl +; SSE2-NEXT: movl %edx, %eax +; SSE2-NEXT: addb %cl, %al +; SSE2-NEXT: setns %al +; SSE2-NEXT: addb %cl, %dl +; SSE2-NEXT: jno .LBB16_8 +; SSE2-NEXT: # %bb.7: +; SSE2-NEXT: addb $127, %al +; SSE2-NEXT: movl %eax, %edx +; SSE2-NEXT: .LBB16_8: +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %r10b +; SSE2-NEXT: movl %r10d, %eax +; SSE2-NEXT: addb %cl, %al +; SSE2-NEXT: setns %al +; SSE2-NEXT: addb %cl, %r10b +; SSE2-NEXT: jno .LBB16_10 +; SSE2-NEXT: # %bb.9: +; SSE2-NEXT: addb $127, %al +; SSE2-NEXT: movl %eax, %r10d +; SSE2-NEXT: .LBB16_10: +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %r11b +; SSE2-NEXT: movl %r11d, %eax +; SSE2-NEXT: addb %cl, %al +; SSE2-NEXT: setns %al +; SSE2-NEXT: addb %cl, %r11b +; SSE2-NEXT: jno .LBB16_12 +; SSE2-NEXT: # %bb.11: +; SSE2-NEXT: addb $127, %al +; SSE2-NEXT: movl %eax, %r11d +; SSE2-NEXT: .LBB16_12: +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %bpl +; SSE2-NEXT: movl %ebp, %eax +; SSE2-NEXT: addb %cl, %al +; SSE2-NEXT: setns %al +; SSE2-NEXT: addb %cl, %bpl +; SSE2-NEXT: jno .LBB16_14 +; SSE2-NEXT: # %bb.13: +; SSE2-NEXT: addb $127, %al +; SSE2-NEXT: movl %eax, %ebp +; SSE2-NEXT: .LBB16_14: +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %r14b +; SSE2-NEXT: movl %r14d, %eax +; SSE2-NEXT: addb %cl, %al +; SSE2-NEXT: setns %al +; SSE2-NEXT: addb %cl, %r14b +; SSE2-NEXT: jno .LBB16_16 +; SSE2-NEXT: # %bb.15: +; SSE2-NEXT: addb $127, %al +; SSE2-NEXT: movl %eax, %r14d +; SSE2-NEXT: .LBB16_16: +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %r15b +; SSE2-NEXT: movl %r15d, %eax +; SSE2-NEXT: addb %cl, %al +; SSE2-NEXT: setns %al +; SSE2-NEXT: addb %cl, %r15b +; SSE2-NEXT: jno .LBB16_18 +; SSE2-NEXT: # %bb.17: +; SSE2-NEXT: addb $127, %al +; SSE2-NEXT: movl %eax, %r15d +; SSE2-NEXT: .LBB16_18: +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %r12b +; SSE2-NEXT: movl %r12d, %eax +; SSE2-NEXT: addb %cl, %al +; SSE2-NEXT: setns %al +; SSE2-NEXT: addb %cl, %r12b +; SSE2-NEXT: jno .LBB16_20 +; SSE2-NEXT: # %bb.19: +; SSE2-NEXT: addb $127, %al 
+; SSE2-NEXT: movl %eax, %r12d +; SSE2-NEXT: .LBB16_20: +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %r13b +; SSE2-NEXT: movl %r13d, %eax +; SSE2-NEXT: addb %cl, %al +; SSE2-NEXT: setns %al +; SSE2-NEXT: addb %cl, %r13b +; SSE2-NEXT: movb %bl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSE2-NEXT: movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSE2-NEXT: jno .LBB16_22 +; SSE2-NEXT: # %bb.21: +; SSE2-NEXT: addb $127, %al +; SSE2-NEXT: movl %eax, %r13d +; SSE2-NEXT: .LBB16_22: +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %dil +; SSE2-NEXT: movl %edi, %eax +; SSE2-NEXT: addb %cl, %al +; SSE2-NEXT: setns %al +; SSE2-NEXT: addb %cl, %dil +; SSE2-NEXT: jno .LBB16_24 +; SSE2-NEXT: # %bb.23: +; SSE2-NEXT: addb $127, %al +; SSE2-NEXT: movl %eax, %edi +; SSE2-NEXT: .LBB16_24: +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %r8b +; SSE2-NEXT: movl %r8d, %eax +; SSE2-NEXT: addb %cl, %al +; SSE2-NEXT: setns %al +; SSE2-NEXT: addb %cl, %r8b +; SSE2-NEXT: jno .LBB16_26 +; SSE2-NEXT: # %bb.25: +; SSE2-NEXT: addb $127, %al +; SSE2-NEXT: movl %eax, %r8d +; SSE2-NEXT: .LBB16_26: +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %bl +; SSE2-NEXT: movl %ebx, %eax +; SSE2-NEXT: addb %cl, %al +; SSE2-NEXT: setns %al +; SSE2-NEXT: addb %cl, %bl +; SSE2-NEXT: jno .LBB16_28 +; SSE2-NEXT: # %bb.27: +; SSE2-NEXT: addb $127, %al +; SSE2-NEXT: movl %eax, %ebx +; SSE2-NEXT: .LBB16_28: +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %dl +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSE2-NEXT: movl %eax, %ecx +; SSE2-NEXT: addb %dl, %cl +; SSE2-NEXT: setns %cl +; SSE2-NEXT: addb %dl, %al +; SSE2-NEXT: movb %sil, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSE2-NEXT: jno .LBB16_30 +; SSE2-NEXT: # %bb.29: +; SSE2-NEXT: addb $127, %cl +; SSE2-NEXT: movl %ecx, %eax +; SSE2-NEXT: .LBB16_30: +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %dl +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %sil +; SSE2-NEXT: movl %esi, %ecx +; SSE2-NEXT: addb %dl, %cl +; SSE2-NEXT: setns %cl +; SSE2-NEXT: addb %dl, %sil +; SSE2-NEXT: jno .LBB16_32 +; SSE2-NEXT: # %bb.31: +; SSE2-NEXT: addb $127, %cl +; SSE2-NEXT: movl %ecx, %esi +; SSE2-NEXT: .LBB16_32: +; SSE2-NEXT: movzbl %sil, %ecx +; SSE2-NEXT: movd %ecx, %xmm0 +; SSE2-NEXT: movzbl %al, %eax +; SSE2-NEXT: movd %eax, %xmm1 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; SSE2-NEXT: movzbl %bl, %eax +; SSE2-NEXT: movd %eax, %xmm0 +; SSE2-NEXT: movzbl %r8b, %eax +; SSE2-NEXT: movd %eax, %xmm2 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; SSE2-NEXT: movzbl %dil, %eax +; SSE2-NEXT: movd %eax, %xmm0 +; SSE2-NEXT: movzbl %r13b, %eax +; SSE2-NEXT: movd %eax, %xmm1 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; SSE2-NEXT: movzbl %r12b, %eax +; SSE2-NEXT: movd %eax, %xmm3 +; SSE2-NEXT: movzbl %r15b, %eax +; SSE2-NEXT: movd %eax, %xmm0 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3],xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7] +; SSE2-NEXT: punpcklwd {{.*#+}} 
xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; SSE2-NEXT: movzbl %r14b, %eax +; SSE2-NEXT: movd %eax, %xmm1 +; SSE2-NEXT: movzbl %bpl, %eax +; SSE2-NEXT: movd %eax, %xmm2 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] +; SSE2-NEXT: movzbl %r11b, %eax +; SSE2-NEXT: movd %eax, %xmm1 +; SSE2-NEXT: movzbl %r10b, %eax +; SSE2-NEXT: movd %eax, %xmm3 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; SSE2-NEXT: movd %eax, %xmm1 +; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; SSE2-NEXT: movd %eax, %xmm2 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] +; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; SSE2-NEXT: movd %eax, %xmm1 +; SSE2-NEXT: movzbl %r9b, %eax +; SSE2-NEXT: movd %eax, %xmm4 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3],xmm4[4],xmm1[4],xmm4[5],xmm1[5],xmm4[6],xmm1[6],xmm4[7],xmm1[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm0[0] +; SSE2-NEXT: pxor %xmm0, %xmm0 +; SSE2-NEXT: pcmpgtb %xmm4, %xmm0 +; SSE2-NEXT: popq %rbx +; SSE2-NEXT: popq %r12 +; SSE2-NEXT: popq %r13 +; SSE2-NEXT: popq %r14 +; SSE2-NEXT: popq %r15 +; SSE2-NEXT: popq %rbp +; SSE2-NEXT: retq +; +; SSSE3-LABEL: v16i1: +; SSSE3: # %bb.0: +; SSSE3-NEXT: pushq %rbp +; SSSE3-NEXT: pushq %r15 +; SSSE3-NEXT: pushq %r14 +; SSSE3-NEXT: pushq %r13 +; SSSE3-NEXT: pushq %r12 +; SSSE3-NEXT: pushq %rbx +; SSSE3-NEXT: psllw $7, %xmm1 +; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128] +; SSSE3-NEXT: pand %xmm2, %xmm1 +; SSSE3-NEXT: movdqa %xmm1, -{{[0-9]+}}(%rsp) +; SSSE3-NEXT: psllw $7, %xmm0 +; SSSE3-NEXT: pand %xmm2, %xmm0 +; SSSE3-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp) +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %r9b +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %sil +; SSSE3-NEXT: movl %r9d, %eax +; SSSE3-NEXT: addb %cl, %al +; SSSE3-NEXT: setns %al +; SSSE3-NEXT: addb %cl, %r9b +; SSSE3-NEXT: jno .LBB16_2 +; SSSE3-NEXT: # %bb.1: +; SSSE3-NEXT: addb $127, %al +; SSSE3-NEXT: movl %eax, %r9d +; SSSE3-NEXT: .LBB16_2: +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSSE3-NEXT: movl %esi, %eax +; SSSE3-NEXT: addb %cl, %al +; SSSE3-NEXT: setns %al +; SSSE3-NEXT: addb %cl, %sil +; SSSE3-NEXT: jno .LBB16_4 +; SSSE3-NEXT: # %bb.3: +; SSSE3-NEXT: addb $127, %al +; SSSE3-NEXT: movl %eax, %esi +; SSSE3-NEXT: .LBB16_4: +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %bl +; SSSE3-NEXT: movl %ebx, %eax +; SSSE3-NEXT: addb %cl, %al +; SSSE3-NEXT: setns %al +; SSSE3-NEXT: addb %cl, %bl +; SSSE3-NEXT: jno .LBB16_6 +; SSSE3-NEXT: # %bb.5: +; SSSE3-NEXT: addb $127, %al +; SSSE3-NEXT: movl %eax, %ebx +; SSSE3-NEXT: 
.LBB16_6: +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %dl +; SSSE3-NEXT: movl %edx, %eax +; SSSE3-NEXT: addb %cl, %al +; SSSE3-NEXT: setns %al +; SSSE3-NEXT: addb %cl, %dl +; SSSE3-NEXT: jno .LBB16_8 +; SSSE3-NEXT: # %bb.7: +; SSSE3-NEXT: addb $127, %al +; SSSE3-NEXT: movl %eax, %edx +; SSSE3-NEXT: .LBB16_8: +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %r10b +; SSSE3-NEXT: movl %r10d, %eax +; SSSE3-NEXT: addb %cl, %al +; SSSE3-NEXT: setns %al +; SSSE3-NEXT: addb %cl, %r10b +; SSSE3-NEXT: jno .LBB16_10 +; SSSE3-NEXT: # %bb.9: +; SSSE3-NEXT: addb $127, %al +; SSSE3-NEXT: movl %eax, %r10d +; SSSE3-NEXT: .LBB16_10: +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %r11b +; SSSE3-NEXT: movl %r11d, %eax +; SSSE3-NEXT: addb %cl, %al +; SSSE3-NEXT: setns %al +; SSSE3-NEXT: addb %cl, %r11b +; SSSE3-NEXT: jno .LBB16_12 +; SSSE3-NEXT: # %bb.11: +; SSSE3-NEXT: addb $127, %al +; SSSE3-NEXT: movl %eax, %r11d +; SSSE3-NEXT: .LBB16_12: +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %bpl +; SSSE3-NEXT: movl %ebp, %eax +; SSSE3-NEXT: addb %cl, %al +; SSSE3-NEXT: setns %al +; SSSE3-NEXT: addb %cl, %bpl +; SSSE3-NEXT: jno .LBB16_14 +; SSSE3-NEXT: # %bb.13: +; SSSE3-NEXT: addb $127, %al +; SSSE3-NEXT: movl %eax, %ebp +; SSSE3-NEXT: .LBB16_14: +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %r14b +; SSSE3-NEXT: movl %r14d, %eax +; SSSE3-NEXT: addb %cl, %al +; SSSE3-NEXT: setns %al +; SSSE3-NEXT: addb %cl, %r14b +; SSSE3-NEXT: jno .LBB16_16 +; SSSE3-NEXT: # %bb.15: +; SSSE3-NEXT: addb $127, %al +; SSSE3-NEXT: movl %eax, %r14d +; SSSE3-NEXT: .LBB16_16: +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %r15b +; SSSE3-NEXT: movl %r15d, %eax +; SSSE3-NEXT: addb %cl, %al +; SSSE3-NEXT: setns %al +; SSSE3-NEXT: addb %cl, %r15b +; SSSE3-NEXT: jno .LBB16_18 +; SSSE3-NEXT: # %bb.17: +; SSSE3-NEXT: addb $127, %al +; SSSE3-NEXT: movl %eax, %r15d +; SSSE3-NEXT: .LBB16_18: +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %r12b +; SSSE3-NEXT: movl %r12d, %eax +; SSSE3-NEXT: addb %cl, %al +; SSSE3-NEXT: setns %al +; SSSE3-NEXT: addb %cl, %r12b +; SSSE3-NEXT: jno .LBB16_20 +; SSSE3-NEXT: # %bb.19: +; SSSE3-NEXT: addb $127, %al +; SSSE3-NEXT: movl %eax, %r12d +; SSSE3-NEXT: .LBB16_20: +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %r13b +; SSSE3-NEXT: movl %r13d, %eax +; SSSE3-NEXT: addb %cl, %al +; SSSE3-NEXT: setns %al +; SSSE3-NEXT: addb %cl, %r13b +; SSSE3-NEXT: movb %bl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSSE3-NEXT: movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSSE3-NEXT: jno .LBB16_22 +; SSSE3-NEXT: # %bb.21: +; SSSE3-NEXT: addb $127, %al +; SSSE3-NEXT: movl %eax, %r13d +; SSSE3-NEXT: .LBB16_22: +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %dil +; SSSE3-NEXT: movl %edi, %eax +; SSSE3-NEXT: addb %cl, %al +; SSSE3-NEXT: setns %al +; SSSE3-NEXT: addb %cl, %dil +; SSSE3-NEXT: jno .LBB16_24 +; SSSE3-NEXT: # %bb.23: +; SSSE3-NEXT: addb $127, %al +; SSSE3-NEXT: movl %eax, %edi +; SSSE3-NEXT: .LBB16_24: +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %r8b +; SSSE3-NEXT: movl %r8d, %eax +; SSSE3-NEXT: addb %cl, %al +; SSSE3-NEXT: setns %al +; SSSE3-NEXT: addb %cl, %r8b +; SSSE3-NEXT: jno .LBB16_26 +; SSSE3-NEXT: # %bb.25: +; SSSE3-NEXT: addb $127, %al +; SSSE3-NEXT: movl 
%eax, %r8d +; SSSE3-NEXT: .LBB16_26: +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %bl +; SSSE3-NEXT: movl %ebx, %eax +; SSSE3-NEXT: addb %cl, %al +; SSSE3-NEXT: setns %al +; SSSE3-NEXT: addb %cl, %bl +; SSSE3-NEXT: jno .LBB16_28 +; SSSE3-NEXT: # %bb.27: +; SSSE3-NEXT: addb $127, %al +; SSSE3-NEXT: movl %eax, %ebx +; SSSE3-NEXT: .LBB16_28: +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %dl +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSSE3-NEXT: movl %eax, %ecx +; SSSE3-NEXT: addb %dl, %cl +; SSSE3-NEXT: setns %cl +; SSSE3-NEXT: addb %dl, %al +; SSSE3-NEXT: movb %sil, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSSE3-NEXT: jno .LBB16_30 +; SSSE3-NEXT: # %bb.29: +; SSSE3-NEXT: addb $127, %cl +; SSSE3-NEXT: movl %ecx, %eax +; SSSE3-NEXT: .LBB16_30: +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %dl +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %sil +; SSSE3-NEXT: movl %esi, %ecx +; SSSE3-NEXT: addb %dl, %cl +; SSSE3-NEXT: setns %cl +; SSSE3-NEXT: addb %dl, %sil +; SSSE3-NEXT: jno .LBB16_32 +; SSSE3-NEXT: # %bb.31: +; SSSE3-NEXT: addb $127, %cl +; SSSE3-NEXT: movl %ecx, %esi +; SSSE3-NEXT: .LBB16_32: +; SSSE3-NEXT: movzbl %sil, %ecx +; SSSE3-NEXT: movd %ecx, %xmm0 +; SSSE3-NEXT: movzbl %al, %eax +; SSSE3-NEXT: movd %eax, %xmm1 +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; SSSE3-NEXT: movzbl %bl, %eax +; SSSE3-NEXT: movd %eax, %xmm0 +; SSSE3-NEXT: movzbl %r8b, %eax +; SSSE3-NEXT: movd %eax, %xmm2 +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; SSSE3-NEXT: movzbl %dil, %eax +; SSSE3-NEXT: movd %eax, %xmm0 +; SSSE3-NEXT: movzbl %r13b, %eax +; SSSE3-NEXT: movd %eax, %xmm1 +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; SSSE3-NEXT: movzbl %r12b, %eax +; SSSE3-NEXT: movd %eax, %xmm3 +; SSSE3-NEXT: movzbl %r15b, %eax +; SSSE3-NEXT: movd %eax, %xmm0 +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3],xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7] +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; SSSE3-NEXT: movzbl %r14b, %eax +; SSSE3-NEXT: movd %eax, %xmm1 +; SSSE3-NEXT: movzbl %bpl, %eax +; SSSE3-NEXT: movd %eax, %xmm2 +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] +; SSSE3-NEXT: movzbl %r11b, %eax +; SSSE3-NEXT: movd %eax, %xmm1 +; SSSE3-NEXT: movzbl %r10b, %eax +; SSSE3-NEXT: movd %eax, %xmm3 +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7] +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; SSSE3-NEXT: movd %eax, %xmm1 +; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; SSSE3-NEXT: movd %eax, %xmm2 +; SSSE3-NEXT: punpcklbw 
{{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] +; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; SSSE3-NEXT: movd %eax, %xmm1 +; SSSE3-NEXT: movzbl %r9b, %eax +; SSSE3-NEXT: movd %eax, %xmm4 +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3],xmm4[4],xmm1[4],xmm4[5],xmm1[5],xmm4[6],xmm1[6],xmm4[7],xmm1[7] +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3] +; SSSE3-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] +; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm0[0] +; SSSE3-NEXT: pxor %xmm0, %xmm0 +; SSSE3-NEXT: pcmpgtb %xmm4, %xmm0 +; SSSE3-NEXT: popq %rbx +; SSSE3-NEXT: popq %r12 +; SSSE3-NEXT: popq %r13 +; SSSE3-NEXT: popq %r14 +; SSSE3-NEXT: popq %r15 +; SSSE3-NEXT: popq %rbp +; SSSE3-NEXT: retq +; +; SSE41-LABEL: v16i1: +; SSE41: # %bb.0: +; SSE41-NEXT: psllw $7, %xmm1 +; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128] +; SSE41-NEXT: pand %xmm2, %xmm1 +; SSE41-NEXT: pextrb $15, %xmm1, %ecx +; SSE41-NEXT: psllw $7, %xmm0 +; SSE41-NEXT: pand %xmm2, %xmm0 +; SSE41-NEXT: pextrb $15, %xmm0, %edx +; SSE41-NEXT: movl %edx, %eax +; SSE41-NEXT: addb %cl, %al +; SSE41-NEXT: setns %al +; SSE41-NEXT: addb %cl, %dl +; SSE41-NEXT: jno .LBB16_2 +; SSE41-NEXT: # %bb.1: +; SSE41-NEXT: addb $127, %al +; SSE41-NEXT: movl %eax, %edx +; SSE41-NEXT: .LBB16_2: +; SSE41-NEXT: pextrb $14, %xmm1, %ecx +; SSE41-NEXT: pextrb $14, %xmm0, %r11d +; SSE41-NEXT: movl %r11d, %eax +; SSE41-NEXT: addb %cl, %al +; SSE41-NEXT: setns %al +; SSE41-NEXT: addb %cl, %r11b +; SSE41-NEXT: jno .LBB16_4 +; SSE41-NEXT: # %bb.3: +; SSE41-NEXT: addb $127, %al +; SSE41-NEXT: movl %eax, %r11d +; SSE41-NEXT: .LBB16_4: +; SSE41-NEXT: pextrb $13, %xmm1, %ecx +; SSE41-NEXT: pextrb $13, %xmm0, %edi +; SSE41-NEXT: movl %edi, %eax +; SSE41-NEXT: addb %cl, %al +; SSE41-NEXT: setns %al +; SSE41-NEXT: addb %cl, %dil +; SSE41-NEXT: jno .LBB16_6 +; SSE41-NEXT: # %bb.5: +; SSE41-NEXT: addb $127, %al +; SSE41-NEXT: movl %eax, %edi +; SSE41-NEXT: .LBB16_6: +; SSE41-NEXT: pushq %rbp +; SSE41-NEXT: pushq %r15 +; SSE41-NEXT: pushq %r14 +; SSE41-NEXT: pushq %r13 +; SSE41-NEXT: pushq %r12 +; SSE41-NEXT: pushq %rbx +; SSE41-NEXT: pextrb $12, %xmm1, %ecx +; SSE41-NEXT: pextrb $12, %xmm0, %r14d +; SSE41-NEXT: movl %r14d, %eax +; SSE41-NEXT: addb %cl, %al +; SSE41-NEXT: setns %al +; SSE41-NEXT: addb %cl, %r14b +; SSE41-NEXT: jno .LBB16_8 +; SSE41-NEXT: # %bb.7: +; SSE41-NEXT: addb $127, %al +; SSE41-NEXT: movl %eax, %r14d +; SSE41-NEXT: .LBB16_8: +; SSE41-NEXT: pextrb $11, %xmm1, %ecx +; SSE41-NEXT: pextrb $11, %xmm0, %ebp +; SSE41-NEXT: movl %ebp, %eax +; SSE41-NEXT: addb %cl, %al +; SSE41-NEXT: setns %al +; SSE41-NEXT: addb %cl, %bpl +; SSE41-NEXT: jno .LBB16_10 +; SSE41-NEXT: # %bb.9: +; SSE41-NEXT: addb $127, %al +; SSE41-NEXT: movl %eax, %ebp +; SSE41-NEXT: .LBB16_10: +; SSE41-NEXT: pextrb $10, %xmm1, %ecx +; SSE41-NEXT: pextrb $10, %xmm0, %r15d +; SSE41-NEXT: movl %r15d, %eax +; SSE41-NEXT: addb %cl, %al +; SSE41-NEXT: setns %al +; SSE41-NEXT: addb %cl, %r15b +; SSE41-NEXT: jno .LBB16_12 +; SSE41-NEXT: # %bb.11: +; SSE41-NEXT: addb $127, %al +; SSE41-NEXT: movl %eax, %r15d +; SSE41-NEXT: .LBB16_12: +; SSE41-NEXT: pextrb $9, %xmm1, %ecx +; SSE41-NEXT: pextrb $9, %xmm0, %r12d +; SSE41-NEXT: movl %r12d, %eax +; SSE41-NEXT: addb %cl, %al +; SSE41-NEXT: setns 
%al +; SSE41-NEXT: addb %cl, %r12b +; SSE41-NEXT: jno .LBB16_14 +; SSE41-NEXT: # %bb.13: +; SSE41-NEXT: addb $127, %al +; SSE41-NEXT: movl %eax, %r12d +; SSE41-NEXT: .LBB16_14: +; SSE41-NEXT: pextrb $8, %xmm1, %ecx +; SSE41-NEXT: pextrb $8, %xmm0, %r13d +; SSE41-NEXT: movl %r13d, %eax +; SSE41-NEXT: addb %cl, %al +; SSE41-NEXT: setns %al +; SSE41-NEXT: addb %cl, %r13b +; SSE41-NEXT: jno .LBB16_16 +; SSE41-NEXT: # %bb.15: +; SSE41-NEXT: addb $127, %al +; SSE41-NEXT: movl %eax, %r13d +; SSE41-NEXT: .LBB16_16: +; SSE41-NEXT: pextrb $7, %xmm1, %ecx +; SSE41-NEXT: pextrb $7, %xmm0, %r10d +; SSE41-NEXT: movl %r10d, %eax +; SSE41-NEXT: addb %cl, %al +; SSE41-NEXT: setns %al +; SSE41-NEXT: addb %cl, %r10b +; SSE41-NEXT: movl %ebp, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE41-NEXT: jno .LBB16_18 +; SSE41-NEXT: # %bb.17: +; SSE41-NEXT: addb $127, %al +; SSE41-NEXT: movl %eax, %r10d +; SSE41-NEXT: .LBB16_18: +; SSE41-NEXT: pextrb $6, %xmm1, %ecx +; SSE41-NEXT: pextrb $6, %xmm0, %r9d +; SSE41-NEXT: movl %r9d, %eax +; SSE41-NEXT: addb %cl, %al +; SSE41-NEXT: setns %al +; SSE41-NEXT: addb %cl, %r9b +; SSE41-NEXT: jno .LBB16_20 +; SSE41-NEXT: # %bb.19: +; SSE41-NEXT: addb $127, %al +; SSE41-NEXT: movl %eax, %r9d +; SSE41-NEXT: .LBB16_20: +; SSE41-NEXT: pextrb $5, %xmm1, %ecx +; SSE41-NEXT: pextrb $5, %xmm0, %ebp +; SSE41-NEXT: movl %ebp, %eax +; SSE41-NEXT: addb %cl, %al +; SSE41-NEXT: setns %al +; SSE41-NEXT: addb %cl, %bpl +; SSE41-NEXT: movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE41-NEXT: jno .LBB16_22 +; SSE41-NEXT: # %bb.21: +; SSE41-NEXT: addb $127, %al +; SSE41-NEXT: movl %eax, %ebp +; SSE41-NEXT: .LBB16_22: +; SSE41-NEXT: pextrb $4, %xmm1, %ecx +; SSE41-NEXT: pextrb $4, %xmm0, %edi +; SSE41-NEXT: movl %edi, %eax +; SSE41-NEXT: addb %cl, %al +; SSE41-NEXT: setns %al +; SSE41-NEXT: addb %cl, %dil +; SSE41-NEXT: movl %edx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE41-NEXT: jno .LBB16_24 +; SSE41-NEXT: # %bb.23: +; SSE41-NEXT: addb $127, %al +; SSE41-NEXT: movl %eax, %edi +; SSE41-NEXT: .LBB16_24: +; SSE41-NEXT: pextrb $3, %xmm1, %edx +; SSE41-NEXT: pextrb $3, %xmm0, %eax +; SSE41-NEXT: movl %eax, %ecx +; SSE41-NEXT: addb %dl, %cl +; SSE41-NEXT: setns %cl +; SSE41-NEXT: addb %dl, %al +; SSE41-NEXT: jno .LBB16_26 +; SSE41-NEXT: # %bb.25: +; SSE41-NEXT: addb $127, %cl +; SSE41-NEXT: movl %ecx, %eax +; SSE41-NEXT: .LBB16_26: +; SSE41-NEXT: pextrb $2, %xmm1, %ebx +; SSE41-NEXT: pextrb $2, %xmm0, %ecx +; SSE41-NEXT: movl %ecx, %edx +; SSE41-NEXT: addb %bl, %dl +; SSE41-NEXT: setns %dl +; SSE41-NEXT: addb %bl, %cl +; SSE41-NEXT: jno .LBB16_28 +; SSE41-NEXT: # %bb.27: +; SSE41-NEXT: addb $127, %dl +; SSE41-NEXT: movl %edx, %ecx +; SSE41-NEXT: .LBB16_28: +; SSE41-NEXT: pextrb $0, %xmm1, %esi +; SSE41-NEXT: pextrb $0, %xmm0, %edx +; SSE41-NEXT: movl %edx, %ebx +; SSE41-NEXT: addb %sil, %bl +; SSE41-NEXT: setns %bl +; SSE41-NEXT: addb %sil, %dl +; SSE41-NEXT: jno .LBB16_30 +; SSE41-NEXT: # %bb.29: +; SSE41-NEXT: addb $127, %bl +; SSE41-NEXT: movl %ebx, %edx +; SSE41-NEXT: .LBB16_30: +; SSE41-NEXT: pextrb $1, %xmm1, %esi +; SSE41-NEXT: pextrb $1, %xmm0, %r8d +; SSE41-NEXT: movl %r8d, %ebx +; SSE41-NEXT: addb %sil, %bl +; SSE41-NEXT: setns %bl +; SSE41-NEXT: addb %sil, %r8b +; SSE41-NEXT: jno .LBB16_32 +; SSE41-NEXT: # %bb.31: +; SSE41-NEXT: addb $127, %bl +; SSE41-NEXT: movl %ebx, %r8d +; SSE41-NEXT: .LBB16_32: +; SSE41-NEXT: movzbl %dl, %edx +; SSE41-NEXT: movd %edx, %xmm1 +; SSE41-NEXT: movzbl %r8b, %edx +; SSE41-NEXT: pinsrb $1, %edx, %xmm1 +; SSE41-NEXT: movzbl %cl, %ecx +; SSE41-NEXT: 
pinsrb $2, %ecx, %xmm1 +; SSE41-NEXT: movzbl %al, %eax +; SSE41-NEXT: pinsrb $3, %eax, %xmm1 +; SSE41-NEXT: movzbl %dil, %eax +; SSE41-NEXT: pinsrb $4, %eax, %xmm1 +; SSE41-NEXT: movzbl %bpl, %eax +; SSE41-NEXT: pinsrb $5, %eax, %xmm1 +; SSE41-NEXT: movzbl %r9b, %eax +; SSE41-NEXT: pinsrb $6, %eax, %xmm1 +; SSE41-NEXT: movzbl %r10b, %eax +; SSE41-NEXT: pinsrb $7, %eax, %xmm1 +; SSE41-NEXT: movzbl %r13b, %eax +; SSE41-NEXT: pinsrb $8, %eax, %xmm1 +; SSE41-NEXT: movzbl %r12b, %eax +; SSE41-NEXT: pinsrb $9, %eax, %xmm1 +; SSE41-NEXT: movzbl %r15b, %eax +; SSE41-NEXT: pinsrb $10, %eax, %xmm1 +; SSE41-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; SSE41-NEXT: pinsrb $11, %eax, %xmm1 +; SSE41-NEXT: movzbl %r14b, %eax +; SSE41-NEXT: pinsrb $12, %eax, %xmm1 +; SSE41-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; SSE41-NEXT: pinsrb $13, %eax, %xmm1 +; SSE41-NEXT: movzbl %r11b, %eax +; SSE41-NEXT: pinsrb $14, %eax, %xmm1 +; SSE41-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; SSE41-NEXT: pinsrb $15, %eax, %xmm1 +; SSE41-NEXT: pxor %xmm0, %xmm0 +; SSE41-NEXT: pcmpgtb %xmm1, %xmm0 +; SSE41-NEXT: popq %rbx +; SSE41-NEXT: popq %r12 +; SSE41-NEXT: popq %r13 +; SSE41-NEXT: popq %r14 +; SSE41-NEXT: popq %r15 +; SSE41-NEXT: popq %rbp +; SSE41-NEXT: retq +; +; AVX1-LABEL: v16i1: +; AVX1: # %bb.0: +; AVX1-NEXT: vpsllw $7, %xmm1, %xmm1 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128] +; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpextrb $15, %xmm1, %ecx +; AVX1-NEXT: vpsllw $7, %xmm0, %xmm0 +; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpextrb $15, %xmm0, %edx +; AVX1-NEXT: movl %edx, %eax +; AVX1-NEXT: addb %cl, %al +; AVX1-NEXT: setns %al +; AVX1-NEXT: addb %cl, %dl +; AVX1-NEXT: jno .LBB16_2 +; AVX1-NEXT: # %bb.1: +; AVX1-NEXT: addb $127, %al +; AVX1-NEXT: movl %eax, %edx +; AVX1-NEXT: .LBB16_2: +; AVX1-NEXT: vpextrb $14, %xmm1, %ecx +; AVX1-NEXT: vpextrb $14, %xmm0, %r11d +; AVX1-NEXT: movl %r11d, %eax +; AVX1-NEXT: addb %cl, %al +; AVX1-NEXT: setns %al +; AVX1-NEXT: addb %cl, %r11b +; AVX1-NEXT: jno .LBB16_4 +; AVX1-NEXT: # %bb.3: +; AVX1-NEXT: addb $127, %al +; AVX1-NEXT: movl %eax, %r11d +; AVX1-NEXT: .LBB16_4: +; AVX1-NEXT: vpextrb $13, %xmm1, %ecx +; AVX1-NEXT: vpextrb $13, %xmm0, %edi +; AVX1-NEXT: movl %edi, %eax +; AVX1-NEXT: addb %cl, %al +; AVX1-NEXT: setns %al +; AVX1-NEXT: addb %cl, %dil +; AVX1-NEXT: jno .LBB16_6 +; AVX1-NEXT: # %bb.5: +; AVX1-NEXT: addb $127, %al +; AVX1-NEXT: movl %eax, %edi +; AVX1-NEXT: .LBB16_6: +; AVX1-NEXT: pushq %rbp +; AVX1-NEXT: pushq %r15 +; AVX1-NEXT: pushq %r14 +; AVX1-NEXT: pushq %r13 +; AVX1-NEXT: pushq %r12 +; AVX1-NEXT: pushq %rbx +; AVX1-NEXT: vpextrb $12, %xmm1, %ecx +; AVX1-NEXT: vpextrb $12, %xmm0, %r14d +; AVX1-NEXT: movl %r14d, %eax +; AVX1-NEXT: addb %cl, %al +; AVX1-NEXT: setns %al +; AVX1-NEXT: addb %cl, %r14b +; AVX1-NEXT: jno .LBB16_8 +; AVX1-NEXT: # %bb.7: +; AVX1-NEXT: addb $127, %al +; AVX1-NEXT: movl %eax, %r14d +; AVX1-NEXT: .LBB16_8: +; AVX1-NEXT: vpextrb $11, %xmm1, %ecx +; AVX1-NEXT: vpextrb $11, %xmm0, %ebp +; AVX1-NEXT: movl %ebp, %eax +; AVX1-NEXT: addb %cl, %al +; AVX1-NEXT: setns %al +; AVX1-NEXT: addb %cl, %bpl +; AVX1-NEXT: jno .LBB16_10 +; AVX1-NEXT: # %bb.9: +; AVX1-NEXT: addb $127, %al +; AVX1-NEXT: movl %eax, %ebp +; AVX1-NEXT: .LBB16_10: +; AVX1-NEXT: vpextrb $10, %xmm1, %ecx +; AVX1-NEXT: vpextrb $10, %xmm0, %r15d +; AVX1-NEXT: movl %r15d, %eax +; AVX1-NEXT: addb %cl, %al +; AVX1-NEXT: setns 
%al +; AVX1-NEXT: addb %cl, %r15b +; AVX1-NEXT: jno .LBB16_12 +; AVX1-NEXT: # %bb.11: +; AVX1-NEXT: addb $127, %al +; AVX1-NEXT: movl %eax, %r15d +; AVX1-NEXT: .LBB16_12: +; AVX1-NEXT: vpextrb $9, %xmm1, %ecx +; AVX1-NEXT: vpextrb $9, %xmm0, %r12d +; AVX1-NEXT: movl %r12d, %eax +; AVX1-NEXT: addb %cl, %al +; AVX1-NEXT: setns %al +; AVX1-NEXT: addb %cl, %r12b +; AVX1-NEXT: jno .LBB16_14 +; AVX1-NEXT: # %bb.13: +; AVX1-NEXT: addb $127, %al +; AVX1-NEXT: movl %eax, %r12d +; AVX1-NEXT: .LBB16_14: +; AVX1-NEXT: vpextrb $8, %xmm1, %ecx +; AVX1-NEXT: vpextrb $8, %xmm0, %r13d +; AVX1-NEXT: movl %r13d, %eax +; AVX1-NEXT: addb %cl, %al +; AVX1-NEXT: setns %al +; AVX1-NEXT: addb %cl, %r13b +; AVX1-NEXT: jno .LBB16_16 +; AVX1-NEXT: # %bb.15: +; AVX1-NEXT: addb $127, %al +; AVX1-NEXT: movl %eax, %r13d +; AVX1-NEXT: .LBB16_16: +; AVX1-NEXT: vpextrb $7, %xmm1, %ecx +; AVX1-NEXT: vpextrb $7, %xmm0, %r10d +; AVX1-NEXT: movl %r10d, %eax +; AVX1-NEXT: addb %cl, %al +; AVX1-NEXT: setns %al +; AVX1-NEXT: addb %cl, %r10b +; AVX1-NEXT: movl %ebp, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX1-NEXT: jno .LBB16_18 +; AVX1-NEXT: # %bb.17: +; AVX1-NEXT: addb $127, %al +; AVX1-NEXT: movl %eax, %r10d +; AVX1-NEXT: .LBB16_18: +; AVX1-NEXT: vpextrb $6, %xmm1, %ecx +; AVX1-NEXT: vpextrb $6, %xmm0, %r9d +; AVX1-NEXT: movl %r9d, %eax +; AVX1-NEXT: addb %cl, %al +; AVX1-NEXT: setns %al +; AVX1-NEXT: addb %cl, %r9b +; AVX1-NEXT: jno .LBB16_20 +; AVX1-NEXT: # %bb.19: +; AVX1-NEXT: addb $127, %al +; AVX1-NEXT: movl %eax, %r9d +; AVX1-NEXT: .LBB16_20: +; AVX1-NEXT: vpextrb $5, %xmm1, %ecx +; AVX1-NEXT: vpextrb $5, %xmm0, %ebp +; AVX1-NEXT: movl %ebp, %eax +; AVX1-NEXT: addb %cl, %al +; AVX1-NEXT: setns %al +; AVX1-NEXT: addb %cl, %bpl +; AVX1-NEXT: movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX1-NEXT: jno .LBB16_22 +; AVX1-NEXT: # %bb.21: +; AVX1-NEXT: addb $127, %al +; AVX1-NEXT: movl %eax, %ebp +; AVX1-NEXT: .LBB16_22: +; AVX1-NEXT: vpextrb $4, %xmm1, %ecx +; AVX1-NEXT: vpextrb $4, %xmm0, %edi +; AVX1-NEXT: movl %edi, %eax +; AVX1-NEXT: addb %cl, %al +; AVX1-NEXT: setns %al +; AVX1-NEXT: addb %cl, %dil +; AVX1-NEXT: movl %edx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX1-NEXT: jno .LBB16_24 +; AVX1-NEXT: # %bb.23: +; AVX1-NEXT: addb $127, %al +; AVX1-NEXT: movl %eax, %edi +; AVX1-NEXT: .LBB16_24: +; AVX1-NEXT: vpextrb $3, %xmm1, %edx +; AVX1-NEXT: vpextrb $3, %xmm0, %eax +; AVX1-NEXT: movl %eax, %ecx +; AVX1-NEXT: addb %dl, %cl +; AVX1-NEXT: setns %cl +; AVX1-NEXT: addb %dl, %al +; AVX1-NEXT: jno .LBB16_26 +; AVX1-NEXT: # %bb.25: +; AVX1-NEXT: addb $127, %cl +; AVX1-NEXT: movl %ecx, %eax +; AVX1-NEXT: .LBB16_26: +; AVX1-NEXT: vpextrb $2, %xmm1, %ebx +; AVX1-NEXT: vpextrb $2, %xmm0, %ecx +; AVX1-NEXT: movl %ecx, %edx +; AVX1-NEXT: addb %bl, %dl +; AVX1-NEXT: setns %dl +; AVX1-NEXT: addb %bl, %cl +; AVX1-NEXT: jno .LBB16_28 +; AVX1-NEXT: # %bb.27: +; AVX1-NEXT: addb $127, %dl +; AVX1-NEXT: movl %edx, %ecx +; AVX1-NEXT: .LBB16_28: +; AVX1-NEXT: vpextrb $0, %xmm1, %esi +; AVX1-NEXT: vpextrb $0, %xmm0, %edx +; AVX1-NEXT: movl %edx, %ebx +; AVX1-NEXT: addb %sil, %bl +; AVX1-NEXT: setns %bl +; AVX1-NEXT: addb %sil, %dl +; AVX1-NEXT: jno .LBB16_30 +; AVX1-NEXT: # %bb.29: +; AVX1-NEXT: addb $127, %bl +; AVX1-NEXT: movl %ebx, %edx +; AVX1-NEXT: .LBB16_30: +; AVX1-NEXT: vpextrb $1, %xmm1, %esi +; AVX1-NEXT: vpextrb $1, %xmm0, %r8d +; AVX1-NEXT: movl %r8d, %ebx +; AVX1-NEXT: addb %sil, %bl +; AVX1-NEXT: setns %bl +; AVX1-NEXT: addb %sil, %r8b +; AVX1-NEXT: jno .LBB16_32 +; AVX1-NEXT: # %bb.31: +; AVX1-NEXT: addb $127, %bl 
+; AVX1-NEXT: movl %ebx, %r8d +; AVX1-NEXT: .LBB16_32: +; AVX1-NEXT: movzbl %dl, %edx +; AVX1-NEXT: vmovd %edx, %xmm0 +; AVX1-NEXT: movzbl %r8b, %edx +; AVX1-NEXT: vpinsrb $1, %edx, %xmm0, %xmm0 +; AVX1-NEXT: movzbl %cl, %ecx +; AVX1-NEXT: vpinsrb $2, %ecx, %xmm0, %xmm0 +; AVX1-NEXT: movzbl %al, %eax +; AVX1-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0 +; AVX1-NEXT: movzbl %dil, %eax +; AVX1-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 +; AVX1-NEXT: movzbl %bpl, %eax +; AVX1-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; AVX1-NEXT: movzbl %r9b, %eax +; AVX1-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 +; AVX1-NEXT: movzbl %r10b, %eax +; AVX1-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; AVX1-NEXT: movzbl %r13b, %eax +; AVX1-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0 +; AVX1-NEXT: movzbl %r12b, %eax +; AVX1-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0 +; AVX1-NEXT: movzbl %r15b, %eax +; AVX1-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0 +; AVX1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX1-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; AVX1-NEXT: movzbl %r14b, %eax +; AVX1-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0 +; AVX1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX1-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0 +; AVX1-NEXT: movzbl %r11b, %eax +; AVX1-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0 +; AVX1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX1-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: popq %rbx +; AVX1-NEXT: popq %r12 +; AVX1-NEXT: popq %r13 +; AVX1-NEXT: popq %r14 +; AVX1-NEXT: popq %r15 +; AVX1-NEXT: popq %rbp +; AVX1-NEXT: retq +; +; AVX2-LABEL: v16i1: +; AVX2: # %bb.0: +; AVX2-NEXT: vpsllw $7, %xmm1, %xmm1 +; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128] +; AVX2-NEXT: vpand %xmm2, %xmm1, %xmm1 +; AVX2-NEXT: vpextrb $15, %xmm1, %ecx +; AVX2-NEXT: vpsllw $7, %xmm0, %xmm0 +; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpextrb $15, %xmm0, %edx +; AVX2-NEXT: movl %edx, %eax +; AVX2-NEXT: addb %cl, %al +; AVX2-NEXT: setns %al +; AVX2-NEXT: addb %cl, %dl +; AVX2-NEXT: jno .LBB16_2 +; AVX2-NEXT: # %bb.1: +; AVX2-NEXT: addb $127, %al +; AVX2-NEXT: movl %eax, %edx +; AVX2-NEXT: .LBB16_2: +; AVX2-NEXT: vpextrb $14, %xmm1, %ecx +; AVX2-NEXT: vpextrb $14, %xmm0, %r11d +; AVX2-NEXT: movl %r11d, %eax +; AVX2-NEXT: addb %cl, %al +; AVX2-NEXT: setns %al +; AVX2-NEXT: addb %cl, %r11b +; AVX2-NEXT: jno .LBB16_4 +; AVX2-NEXT: # %bb.3: +; AVX2-NEXT: addb $127, %al +; AVX2-NEXT: movl %eax, %r11d +; AVX2-NEXT: .LBB16_4: +; AVX2-NEXT: vpextrb $13, %xmm1, %ecx +; AVX2-NEXT: vpextrb $13, %xmm0, %edi +; AVX2-NEXT: movl %edi, %eax +; AVX2-NEXT: addb %cl, %al +; AVX2-NEXT: setns %al +; AVX2-NEXT: addb %cl, %dil +; AVX2-NEXT: jno .LBB16_6 +; AVX2-NEXT: # %bb.5: +; AVX2-NEXT: addb $127, %al +; AVX2-NEXT: movl %eax, %edi +; AVX2-NEXT: .LBB16_6: +; AVX2-NEXT: pushq %rbp +; AVX2-NEXT: pushq %r15 +; AVX2-NEXT: pushq %r14 +; AVX2-NEXT: pushq %r13 +; AVX2-NEXT: pushq %r12 +; AVX2-NEXT: pushq %rbx +; AVX2-NEXT: vpextrb $12, %xmm1, %ecx +; AVX2-NEXT: vpextrb $12, %xmm0, %r14d +; AVX2-NEXT: movl %r14d, %eax +; AVX2-NEXT: addb %cl, %al +; AVX2-NEXT: setns %al +; AVX2-NEXT: addb %cl, %r14b +; AVX2-NEXT: jno .LBB16_8 +; AVX2-NEXT: # %bb.7: +; AVX2-NEXT: addb $127, %al +; AVX2-NEXT: movl %eax, %r14d +; AVX2-NEXT: .LBB16_8: +; AVX2-NEXT: vpextrb $11, %xmm1, %ecx +; AVX2-NEXT: vpextrb $11, %xmm0, %ebp +; AVX2-NEXT: movl %ebp, %eax +; AVX2-NEXT: addb %cl, %al +; 
AVX2-NEXT: setns %al +; AVX2-NEXT: addb %cl, %bpl +; AVX2-NEXT: jno .LBB16_10 +; AVX2-NEXT: # %bb.9: +; AVX2-NEXT: addb $127, %al +; AVX2-NEXT: movl %eax, %ebp +; AVX2-NEXT: .LBB16_10: +; AVX2-NEXT: vpextrb $10, %xmm1, %ecx +; AVX2-NEXT: vpextrb $10, %xmm0, %r15d +; AVX2-NEXT: movl %r15d, %eax +; AVX2-NEXT: addb %cl, %al +; AVX2-NEXT: setns %al +; AVX2-NEXT: addb %cl, %r15b +; AVX2-NEXT: jno .LBB16_12 +; AVX2-NEXT: # %bb.11: +; AVX2-NEXT: addb $127, %al +; AVX2-NEXT: movl %eax, %r15d +; AVX2-NEXT: .LBB16_12: +; AVX2-NEXT: vpextrb $9, %xmm1, %ecx +; AVX2-NEXT: vpextrb $9, %xmm0, %r12d +; AVX2-NEXT: movl %r12d, %eax +; AVX2-NEXT: addb %cl, %al +; AVX2-NEXT: setns %al +; AVX2-NEXT: addb %cl, %r12b +; AVX2-NEXT: jno .LBB16_14 +; AVX2-NEXT: # %bb.13: +; AVX2-NEXT: addb $127, %al +; AVX2-NEXT: movl %eax, %r12d +; AVX2-NEXT: .LBB16_14: +; AVX2-NEXT: vpextrb $8, %xmm1, %ecx +; AVX2-NEXT: vpextrb $8, %xmm0, %r13d +; AVX2-NEXT: movl %r13d, %eax +; AVX2-NEXT: addb %cl, %al +; AVX2-NEXT: setns %al +; AVX2-NEXT: addb %cl, %r13b +; AVX2-NEXT: jno .LBB16_16 +; AVX2-NEXT: # %bb.15: +; AVX2-NEXT: addb $127, %al +; AVX2-NEXT: movl %eax, %r13d +; AVX2-NEXT: .LBB16_16: +; AVX2-NEXT: vpextrb $7, %xmm1, %ecx +; AVX2-NEXT: vpextrb $7, %xmm0, %r10d +; AVX2-NEXT: movl %r10d, %eax +; AVX2-NEXT: addb %cl, %al +; AVX2-NEXT: setns %al +; AVX2-NEXT: addb %cl, %r10b +; AVX2-NEXT: movl %ebp, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX2-NEXT: jno .LBB16_18 +; AVX2-NEXT: # %bb.17: +; AVX2-NEXT: addb $127, %al +; AVX2-NEXT: movl %eax, %r10d +; AVX2-NEXT: .LBB16_18: +; AVX2-NEXT: vpextrb $6, %xmm1, %ecx +; AVX2-NEXT: vpextrb $6, %xmm0, %r9d +; AVX2-NEXT: movl %r9d, %eax +; AVX2-NEXT: addb %cl, %al +; AVX2-NEXT: setns %al +; AVX2-NEXT: addb %cl, %r9b +; AVX2-NEXT: jno .LBB16_20 +; AVX2-NEXT: # %bb.19: +; AVX2-NEXT: addb $127, %al +; AVX2-NEXT: movl %eax, %r9d +; AVX2-NEXT: .LBB16_20: +; AVX2-NEXT: vpextrb $5, %xmm1, %ecx +; AVX2-NEXT: vpextrb $5, %xmm0, %ebp +; AVX2-NEXT: movl %ebp, %eax +; AVX2-NEXT: addb %cl, %al +; AVX2-NEXT: setns %al +; AVX2-NEXT: addb %cl, %bpl +; AVX2-NEXT: movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX2-NEXT: jno .LBB16_22 +; AVX2-NEXT: # %bb.21: +; AVX2-NEXT: addb $127, %al +; AVX2-NEXT: movl %eax, %ebp +; AVX2-NEXT: .LBB16_22: +; AVX2-NEXT: vpextrb $4, %xmm1, %ecx +; AVX2-NEXT: vpextrb $4, %xmm0, %edi +; AVX2-NEXT: movl %edi, %eax +; AVX2-NEXT: addb %cl, %al +; AVX2-NEXT: setns %al +; AVX2-NEXT: addb %cl, %dil +; AVX2-NEXT: movl %edx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX2-NEXT: jno .LBB16_24 +; AVX2-NEXT: # %bb.23: +; AVX2-NEXT: addb $127, %al +; AVX2-NEXT: movl %eax, %edi +; AVX2-NEXT: .LBB16_24: +; AVX2-NEXT: vpextrb $3, %xmm1, %edx +; AVX2-NEXT: vpextrb $3, %xmm0, %eax +; AVX2-NEXT: movl %eax, %ecx +; AVX2-NEXT: addb %dl, %cl +; AVX2-NEXT: setns %cl +; AVX2-NEXT: addb %dl, %al +; AVX2-NEXT: jno .LBB16_26 +; AVX2-NEXT: # %bb.25: +; AVX2-NEXT: addb $127, %cl +; AVX2-NEXT: movl %ecx, %eax +; AVX2-NEXT: .LBB16_26: +; AVX2-NEXT: vpextrb $2, %xmm1, %ebx +; AVX2-NEXT: vpextrb $2, %xmm0, %ecx +; AVX2-NEXT: movl %ecx, %edx +; AVX2-NEXT: addb %bl, %dl +; AVX2-NEXT: setns %dl +; AVX2-NEXT: addb %bl, %cl +; AVX2-NEXT: jno .LBB16_28 +; AVX2-NEXT: # %bb.27: +; AVX2-NEXT: addb $127, %dl +; AVX2-NEXT: movl %edx, %ecx +; AVX2-NEXT: .LBB16_28: +; AVX2-NEXT: vpextrb $0, %xmm1, %esi +; AVX2-NEXT: vpextrb $0, %xmm0, %edx +; AVX2-NEXT: movl %edx, %ebx +; AVX2-NEXT: addb %sil, %bl +; AVX2-NEXT: setns %bl +; AVX2-NEXT: addb %sil, %dl +; AVX2-NEXT: jno .LBB16_30 +; AVX2-NEXT: # %bb.29: +; 
AVX2-NEXT: addb $127, %bl +; AVX2-NEXT: movl %ebx, %edx +; AVX2-NEXT: .LBB16_30: +; AVX2-NEXT: vpextrb $1, %xmm1, %esi +; AVX2-NEXT: vpextrb $1, %xmm0, %r8d +; AVX2-NEXT: movl %r8d, %ebx +; AVX2-NEXT: addb %sil, %bl +; AVX2-NEXT: setns %bl +; AVX2-NEXT: addb %sil, %r8b +; AVX2-NEXT: jno .LBB16_32 +; AVX2-NEXT: # %bb.31: +; AVX2-NEXT: addb $127, %bl +; AVX2-NEXT: movl %ebx, %r8d +; AVX2-NEXT: .LBB16_32: +; AVX2-NEXT: movzbl %dl, %edx +; AVX2-NEXT: vmovd %edx, %xmm0 +; AVX2-NEXT: movzbl %r8b, %edx +; AVX2-NEXT: vpinsrb $1, %edx, %xmm0, %xmm0 +; AVX2-NEXT: movzbl %cl, %ecx +; AVX2-NEXT: vpinsrb $2, %ecx, %xmm0, %xmm0 +; AVX2-NEXT: movzbl %al, %eax +; AVX2-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0 +; AVX2-NEXT: movzbl %dil, %eax +; AVX2-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 +; AVX2-NEXT: movzbl %bpl, %eax +; AVX2-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; AVX2-NEXT: movzbl %r9b, %eax +; AVX2-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 +; AVX2-NEXT: movzbl %r10b, %eax +; AVX2-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; AVX2-NEXT: movzbl %r13b, %eax +; AVX2-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0 +; AVX2-NEXT: movzbl %r12b, %eax +; AVX2-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0 +; AVX2-NEXT: movzbl %r15b, %eax +; AVX2-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0 +; AVX2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX2-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; AVX2-NEXT: movzbl %r14b, %eax +; AVX2-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0 +; AVX2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX2-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0 +; AVX2-NEXT: movzbl %r11b, %eax +; AVX2-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0 +; AVX2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX2-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX2-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: popq %rbx +; AVX2-NEXT: popq %r12 +; AVX2-NEXT: popq %r13 +; AVX2-NEXT: popq %r14 +; AVX2-NEXT: popq %r15 +; AVX2-NEXT: popq %rbp +; AVX2-NEXT: retq +; +; AVX512-LABEL: v16i1: +; AVX512: # %bb.0: +; AVX512-NEXT: vpsllw $7, %xmm0, %xmm0 +; AVX512-NEXT: vpmovb2m %xmm0, %k0 +; AVX512-NEXT: kshiftrw $1, %k0, %k1 +; AVX512-NEXT: kmovd %k1, %edx +; AVX512-NEXT: vpsllw $7, %xmm1, %xmm0 +; AVX512-NEXT: vpmovb2m %xmm0, %k1 +; AVX512-NEXT: kshiftrw $1, %k1, %k2 +; AVX512-NEXT: kmovd %k2, %eax +; AVX512-NEXT: shlb $7, %al +; AVX512-NEXT: shlb $7, %dl +; AVX512-NEXT: movl %edx, %ecx +; AVX512-NEXT: addb %al, %cl +; AVX512-NEXT: setns %cl +; AVX512-NEXT: addb %al, %dl +; AVX512-NEXT: kmovd %k0, %esi +; AVX512-NEXT: kmovd %k1, %eax +; AVX512-NEXT: jno .LBB16_2 +; AVX512-NEXT: # %bb.1: +; AVX512-NEXT: addb $127, %cl +; AVX512-NEXT: movl %ecx, %edx +; AVX512-NEXT: .LBB16_2: +; AVX512-NEXT: shlb $7, %al +; AVX512-NEXT: shlb $7, %sil +; AVX512-NEXT: movl %esi, %ecx +; AVX512-NEXT: addb %al, %cl +; AVX512-NEXT: setns %cl +; AVX512-NEXT: addb %al, %sil +; AVX512-NEXT: kshiftrw $2, %k0, %k2 +; AVX512-NEXT: kmovd %k2, %edi +; AVX512-NEXT: kshiftrw $2, %k1, %k2 +; AVX512-NEXT: kmovd %k2, %eax +; AVX512-NEXT: jno .LBB16_4 +; AVX512-NEXT: # %bb.3: +; AVX512-NEXT: addb $127, %cl +; AVX512-NEXT: movl %ecx, %esi +; AVX512-NEXT: .LBB16_4: +; AVX512-NEXT: shlb $7, %al +; AVX512-NEXT: shlb $7, %dil +; AVX512-NEXT: movl %edi, %ecx +; AVX512-NEXT: addb %al, %cl +; AVX512-NEXT: setns %cl +; AVX512-NEXT: addb %al, %dil +; AVX512-NEXT: kshiftrw $3, %k0, %k2 +; AVX512-NEXT: kmovd %k2, %r11d +; AVX512-NEXT: kshiftrw $3, %k1, %k2 +; AVX512-NEXT: kmovd %k2, %eax +; AVX512-NEXT: jno .LBB16_6 +; 
AVX512-NEXT: # %bb.5: +; AVX512-NEXT: addb $127, %cl +; AVX512-NEXT: movl %ecx, %edi +; AVX512-NEXT: .LBB16_6: +; AVX512-NEXT: pushq %rbp +; AVX512-NEXT: pushq %r15 +; AVX512-NEXT: pushq %r14 +; AVX512-NEXT: pushq %r13 +; AVX512-NEXT: pushq %r12 +; AVX512-NEXT: pushq %rbx +; AVX512-NEXT: shlb $7, %al +; AVX512-NEXT: shlb $7, %r11b +; AVX512-NEXT: movl %r11d, %ecx +; AVX512-NEXT: addb %al, %cl +; AVX512-NEXT: setns %cl +; AVX512-NEXT: addb %al, %r11b +; AVX512-NEXT: kshiftrw $4, %k0, %k2 +; AVX512-NEXT: kmovd %k2, %r14d +; AVX512-NEXT: kshiftrw $4, %k1, %k2 +; AVX512-NEXT: kmovd %k2, %eax +; AVX512-NEXT: jno .LBB16_8 +; AVX512-NEXT: # %bb.7: +; AVX512-NEXT: addb $127, %cl +; AVX512-NEXT: movl %ecx, %r11d +; AVX512-NEXT: .LBB16_8: +; AVX512-NEXT: shlb $7, %al +; AVX512-NEXT: shlb $7, %r14b +; AVX512-NEXT: movl %r14d, %ecx +; AVX512-NEXT: addb %al, %cl +; AVX512-NEXT: setns %cl +; AVX512-NEXT: addb %al, %r14b +; AVX512-NEXT: kshiftrw $5, %k0, %k2 +; AVX512-NEXT: kmovd %k2, %r15d +; AVX512-NEXT: kshiftrw $5, %k1, %k2 +; AVX512-NEXT: kmovd %k2, %eax +; AVX512-NEXT: jno .LBB16_10 +; AVX512-NEXT: # %bb.9: +; AVX512-NEXT: addb $127, %cl +; AVX512-NEXT: movl %ecx, %r14d +; AVX512-NEXT: .LBB16_10: +; AVX512-NEXT: shlb $7, %al +; AVX512-NEXT: shlb $7, %r15b +; AVX512-NEXT: movl %r15d, %ecx +; AVX512-NEXT: addb %al, %cl +; AVX512-NEXT: setns %cl +; AVX512-NEXT: addb %al, %r15b +; AVX512-NEXT: kshiftrw $6, %k0, %k2 +; AVX512-NEXT: kmovd %k2, %r12d +; AVX512-NEXT: kshiftrw $6, %k1, %k2 +; AVX512-NEXT: kmovd %k2, %eax +; AVX512-NEXT: jno .LBB16_12 +; AVX512-NEXT: # %bb.11: +; AVX512-NEXT: addb $127, %cl +; AVX512-NEXT: movl %ecx, %r15d +; AVX512-NEXT: .LBB16_12: +; AVX512-NEXT: shlb $7, %al +; AVX512-NEXT: shlb $7, %r12b +; AVX512-NEXT: movl %r12d, %ecx +; AVX512-NEXT: addb %al, %cl +; AVX512-NEXT: setns %cl +; AVX512-NEXT: addb %al, %r12b +; AVX512-NEXT: kshiftrw $7, %k0, %k2 +; AVX512-NEXT: kmovd %k2, %r13d +; AVX512-NEXT: kshiftrw $7, %k1, %k2 +; AVX512-NEXT: kmovd %k2, %eax +; AVX512-NEXT: jno .LBB16_14 +; AVX512-NEXT: # %bb.13: +; AVX512-NEXT: addb $127, %cl +; AVX512-NEXT: movl %ecx, %r12d +; AVX512-NEXT: .LBB16_14: +; AVX512-NEXT: shlb $7, %al +; AVX512-NEXT: shlb $7, %r13b +; AVX512-NEXT: movl %r13d, %ecx +; AVX512-NEXT: addb %al, %cl +; AVX512-NEXT: setns %cl +; AVX512-NEXT: addb %al, %r13b +; AVX512-NEXT: kshiftrw $8, %k0, %k2 +; AVX512-NEXT: kmovd %k2, %r9d +; AVX512-NEXT: kshiftrw $8, %k1, %k2 +; AVX512-NEXT: kmovd %k2, %eax +; AVX512-NEXT: jno .LBB16_16 +; AVX512-NEXT: # %bb.15: +; AVX512-NEXT: addb $127, %cl +; AVX512-NEXT: movl %ecx, %r13d +; AVX512-NEXT: .LBB16_16: +; AVX512-NEXT: shlb $7, %al +; AVX512-NEXT: shlb $7, %r9b +; AVX512-NEXT: movl %r9d, %ecx +; AVX512-NEXT: addb %al, %cl +; AVX512-NEXT: setns %cl +; AVX512-NEXT: addb %al, %r9b +; AVX512-NEXT: kshiftrw $9, %k0, %k2 +; AVX512-NEXT: kmovd %k2, %r10d +; AVX512-NEXT: kshiftrw $9, %k1, %k2 +; AVX512-NEXT: kmovd %k2, %eax +; AVX512-NEXT: jno .LBB16_18 +; AVX512-NEXT: # %bb.17: +; AVX512-NEXT: addb $127, %cl +; AVX512-NEXT: movl %ecx, %r9d +; AVX512-NEXT: .LBB16_18: +; AVX512-NEXT: shlb $7, %al +; AVX512-NEXT: shlb $7, %r10b +; AVX512-NEXT: movl %r10d, %ecx +; AVX512-NEXT: addb %al, %cl +; AVX512-NEXT: setns %cl +; AVX512-NEXT: addb %al, %r10b +; AVX512-NEXT: kshiftrw $10, %k0, %k2 +; AVX512-NEXT: kmovd %k2, %ebp +; AVX512-NEXT: kshiftrw $10, %k1, %k2 +; AVX512-NEXT: kmovd %k2, %eax +; AVX512-NEXT: movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX512-NEXT: jno .LBB16_20 +; AVX512-NEXT: # %bb.19: +; AVX512-NEXT: addb $127, %cl 
+; AVX512-NEXT: movl %ecx, %r10d +; AVX512-NEXT: .LBB16_20: +; AVX512-NEXT: shlb $7, %al +; AVX512-NEXT: shlb $7, %bpl +; AVX512-NEXT: movl %ebp, %ecx +; AVX512-NEXT: addb %al, %cl +; AVX512-NEXT: setns %cl +; AVX512-NEXT: addb %al, %bpl +; AVX512-NEXT: kshiftrw $11, %k0, %k2 +; AVX512-NEXT: kmovd %k2, %edi +; AVX512-NEXT: kshiftrw $11, %k1, %k2 +; AVX512-NEXT: kmovd %k2, %eax +; AVX512-NEXT: movl %edx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX512-NEXT: jno .LBB16_22 +; AVX512-NEXT: # %bb.21: +; AVX512-NEXT: addb $127, %cl +; AVX512-NEXT: movl %ecx, %ebp +; AVX512-NEXT: .LBB16_22: +; AVX512-NEXT: shlb $7, %al +; AVX512-NEXT: shlb $7, %dil +; AVX512-NEXT: movl %edi, %ecx +; AVX512-NEXT: addb %al, %cl +; AVX512-NEXT: setns %dl +; AVX512-NEXT: addb %al, %dil +; AVX512-NEXT: kshiftrw $12, %k0, %k2 +; AVX512-NEXT: kmovd %k2, %eax +; AVX512-NEXT: kshiftrw $12, %k1, %k2 +; AVX512-NEXT: kmovd %k2, %ecx +; AVX512-NEXT: jno .LBB16_24 +; AVX512-NEXT: # %bb.23: +; AVX512-NEXT: addb $127, %dl +; AVX512-NEXT: movl %edx, %edi +; AVX512-NEXT: .LBB16_24: +; AVX512-NEXT: shlb $7, %cl +; AVX512-NEXT: shlb $7, %al +; AVX512-NEXT: movl %eax, %edx +; AVX512-NEXT: addb %cl, %dl +; AVX512-NEXT: setns %bl +; AVX512-NEXT: addb %cl, %al +; AVX512-NEXT: kshiftrw $13, %k0, %k2 +; AVX512-NEXT: kmovd %k2, %ecx +; AVX512-NEXT: kshiftrw $13, %k1, %k2 +; AVX512-NEXT: kmovd %k2, %edx +; AVX512-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX512-NEXT: jno .LBB16_26 +; AVX512-NEXT: # %bb.25: +; AVX512-NEXT: addb $127, %bl +; AVX512-NEXT: movl %ebx, %eax +; AVX512-NEXT: .LBB16_26: +; AVX512-NEXT: shlb $7, %dl +; AVX512-NEXT: shlb $7, %cl +; AVX512-NEXT: movl %ecx, %ebx +; AVX512-NEXT: addb %dl, %bl +; AVX512-NEXT: setns %bl +; AVX512-NEXT: addb %dl, %cl +; AVX512-NEXT: kshiftrw $14, %k0, %k2 +; AVX512-NEXT: kmovd %k2, %edx +; AVX512-NEXT: kshiftrw $14, %k1, %k2 +; AVX512-NEXT: kmovd %k2, %esi +; AVX512-NEXT: jno .LBB16_28 +; AVX512-NEXT: # %bb.27: +; AVX512-NEXT: addb $127, %bl +; AVX512-NEXT: movl %ebx, %ecx +; AVX512-NEXT: .LBB16_28: +; AVX512-NEXT: shlb $7, %sil +; AVX512-NEXT: shlb $7, %dl +; AVX512-NEXT: movl %edx, %ebx +; AVX512-NEXT: addb %sil, %bl +; AVX512-NEXT: setns %bl +; AVX512-NEXT: addb %sil, %dl +; AVX512-NEXT: kshiftrw $15, %k0, %k0 +; AVX512-NEXT: kmovd %k0, %r8d +; AVX512-NEXT: kshiftrw $15, %k1, %k0 +; AVX512-NEXT: kmovd %k0, %esi +; AVX512-NEXT: jno .LBB16_30 +; AVX512-NEXT: # %bb.29: +; AVX512-NEXT: addb $127, %bl +; AVX512-NEXT: movl %ebx, %edx +; AVX512-NEXT: .LBB16_30: +; AVX512-NEXT: shlb $7, %sil +; AVX512-NEXT: shlb $7, %r8b +; AVX512-NEXT: movl %r8d, %ebx +; AVX512-NEXT: addb %sil, %bl +; AVX512-NEXT: setns %bl +; AVX512-NEXT: addb %sil, %r8b +; AVX512-NEXT: jno .LBB16_32 +; AVX512-NEXT: # %bb.31: +; AVX512-NEXT: addb $127, %bl +; AVX512-NEXT: movl %ebx, %r8d +; AVX512-NEXT: .LBB16_32: +; AVX512-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 4-byte Reload +; AVX512-NEXT: sarb $7, %sil +; AVX512-NEXT: kmovd %esi, %k1 +; AVX512-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 4-byte Reload +; AVX512-NEXT: sarb $7, %sil +; AVX512-NEXT: kmovd %esi, %k0 +; AVX512-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 4-byte Reload +; AVX512-NEXT: sarb $7, %sil +; AVX512-NEXT: kmovd %esi, %k2 +; AVX512-NEXT: sarb $7, %r11b +; AVX512-NEXT: kmovd %r11d, %k3 +; AVX512-NEXT: sarb $7, %r14b +; AVX512-NEXT: kmovd %r14d, %k4 +; AVX512-NEXT: sarb $7, %r15b +; AVX512-NEXT: kmovd %r15d, %k5 +; AVX512-NEXT: sarb $7, %r12b +; AVX512-NEXT: kmovd %r12d, %k6 +; AVX512-NEXT: kshiftrw $1, %k0, %k7 +; AVX512-NEXT: kxorw 
%k1, %k7, %k7 +; AVX512-NEXT: sarb $7, %r13b +; AVX512-NEXT: kmovd %r13d, %k1 +; AVX512-NEXT: kshiftlw $15, %k7, %k7 +; AVX512-NEXT: kshiftrw $14, %k7, %k7 +; AVX512-NEXT: kxorw %k7, %k0, %k0 +; AVX512-NEXT: kshiftrw $2, %k0, %k7 +; AVX512-NEXT: kxorw %k2, %k7, %k7 +; AVX512-NEXT: sarb $7, %r9b +; AVX512-NEXT: kmovd %r9d, %k2 +; AVX512-NEXT: kshiftlw $15, %k7, %k7 +; AVX512-NEXT: kshiftrw $13, %k7, %k7 +; AVX512-NEXT: kxorw %k7, %k0, %k0 +; AVX512-NEXT: kshiftrw $3, %k0, %k7 +; AVX512-NEXT: kxorw %k3, %k7, %k7 +; AVX512-NEXT: sarb $7, %r10b +; AVX512-NEXT: kmovd %r10d, %k3 +; AVX512-NEXT: kshiftlw $15, %k7, %k7 +; AVX512-NEXT: kshiftrw $12, %k7, %k7 +; AVX512-NEXT: kxorw %k7, %k0, %k7 +; AVX512-NEXT: kshiftrw $4, %k7, %k0 +; AVX512-NEXT: kxorw %k4, %k0, %k4 +; AVX512-NEXT: sarb $7, %bpl +; AVX512-NEXT: kmovd %ebp, %k0 +; AVX512-NEXT: kshiftlw $15, %k4, %k4 +; AVX512-NEXT: kshiftrw $11, %k4, %k4 +; AVX512-NEXT: kxorw %k4, %k7, %k7 +; AVX512-NEXT: kshiftrw $5, %k7, %k4 +; AVX512-NEXT: kxorw %k5, %k4, %k5 +; AVX512-NEXT: sarb $7, %dil +; AVX512-NEXT: kmovd %edi, %k4 +; AVX512-NEXT: kshiftlw $15, %k5, %k5 +; AVX512-NEXT: kshiftrw $10, %k5, %k5 +; AVX512-NEXT: kxorw %k5, %k7, %k7 +; AVX512-NEXT: kshiftrw $6, %k7, %k5 +; AVX512-NEXT: kxorw %k6, %k5, %k6 +; AVX512-NEXT: sarb $7, %al +; AVX512-NEXT: kmovd %eax, %k5 +; AVX512-NEXT: kshiftlw $15, %k6, %k6 +; AVX512-NEXT: kshiftrw $9, %k6, %k6 +; AVX512-NEXT: kxorw %k6, %k7, %k6 +; AVX512-NEXT: kshiftrw $7, %k6, %k7 +; AVX512-NEXT: kxorw %k1, %k7, %k7 +; AVX512-NEXT: sarb $7, %cl +; AVX512-NEXT: kmovd %ecx, %k1 +; AVX512-NEXT: kshiftlw $15, %k7, %k7 +; AVX512-NEXT: kshiftrw $8, %k7, %k7 +; AVX512-NEXT: kxorw %k7, %k6, %k6 +; AVX512-NEXT: kshiftrw $8, %k6, %k7 +; AVX512-NEXT: kxorw %k2, %k7, %k7 +; AVX512-NEXT: sarb $7, %dl +; AVX512-NEXT: kmovd %edx, %k2 +; AVX512-NEXT: kshiftlw $15, %k7, %k7 +; AVX512-NEXT: kshiftrw $7, %k7, %k7 +; AVX512-NEXT: kxorw %k7, %k6, %k6 +; AVX512-NEXT: kshiftrw $9, %k6, %k7 +; AVX512-NEXT: kxorw %k3, %k7, %k3 +; AVX512-NEXT: sarb $7, %r8b +; AVX512-NEXT: kmovd %r8d, %k7 +; AVX512-NEXT: kshiftlw $15, %k3, %k3 +; AVX512-NEXT: kshiftrw $6, %k3, %k3 +; AVX512-NEXT: kxorw %k3, %k6, %k3 +; AVX512-NEXT: kshiftrw $10, %k3, %k6 +; AVX512-NEXT: kxorw %k0, %k6, %k0 +; AVX512-NEXT: kshiftlw $15, %k0, %k0 +; AVX512-NEXT: kshiftrw $5, %k0, %k0 +; AVX512-NEXT: kxorw %k0, %k3, %k0 +; AVX512-NEXT: kshiftrw $11, %k0, %k3 +; AVX512-NEXT: kxorw %k4, %k3, %k3 +; AVX512-NEXT: kshiftlw $15, %k3, %k3 +; AVX512-NEXT: kshiftrw $4, %k3, %k3 +; AVX512-NEXT: kxorw %k3, %k0, %k0 +; AVX512-NEXT: kshiftrw $12, %k0, %k3 +; AVX512-NEXT: kxorw %k5, %k3, %k3 +; AVX512-NEXT: kshiftlw $15, %k3, %k3 +; AVX512-NEXT: kshiftrw $3, %k3, %k3 +; AVX512-NEXT: kxorw %k3, %k0, %k0 +; AVX512-NEXT: kshiftrw $13, %k0, %k3 +; AVX512-NEXT: kxorw %k1, %k3, %k1 +; AVX512-NEXT: kshiftlw $15, %k1, %k1 +; AVX512-NEXT: kshiftrw $2, %k1, %k1 +; AVX512-NEXT: kxorw %k1, %k0, %k0 +; AVX512-NEXT: kshiftrw $14, %k0, %k1 +; AVX512-NEXT: kxorw %k2, %k1, %k1 +; AVX512-NEXT: kshiftlw $15, %k1, %k1 +; AVX512-NEXT: kshiftrw $1, %k1, %k1 +; AVX512-NEXT: kxorw %k1, %k0, %k0 +; AVX512-NEXT: kshiftlw $1, %k0, %k0 +; AVX512-NEXT: kshiftrw $1, %k0, %k0 +; AVX512-NEXT: kshiftlw $15, %k7, %k1 +; AVX512-NEXT: korw %k1, %k0, %k0 +; AVX512-NEXT: vpmovm2b %k0, %xmm0 +; AVX512-NEXT: popq %rbx +; AVX512-NEXT: popq %r12 +; AVX512-NEXT: popq %r13 +; AVX512-NEXT: popq %r14 +; AVX512-NEXT: popq %r15 +; AVX512-NEXT: popq %rbp +; AVX512-NEXT: retq + %z = call <16 x i1> @llvm.sadd.sat.v16i1(<16 x i1> %x, <16 x 
i1> %y) + ret <16 x i1> %z +} + +; Expanded + +define <4 x i32> @v4i32(<4 x i32> %x, <4 x i32> %y) nounwind { +; SSE2-LABEL: v4i32: +; SSE2: # %bb.0: +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,2,3] +; SSE2-NEXT: movd %xmm2, %ecx +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,2,3] +; SSE2-NEXT: movd %xmm2, %r8d +; SSE2-NEXT: xorl %edx, %edx +; SSE2-NEXT: movl %r8d, %esi +; SSE2-NEXT: addl %ecx, %esi +; SSE2-NEXT: setns %dl +; SSE2-NEXT: addl $2147483647, %edx # imm = 0x7FFFFFFF +; SSE2-NEXT: addl %ecx, %r8d +; SSE2-NEXT: cmovol %edx, %r8d +; SSE2-NEXT: movd %xmm1, %edx +; SSE2-NEXT: movd %xmm0, %ecx +; SSE2-NEXT: xorl %esi, %esi +; SSE2-NEXT: movl %ecx, %edi +; SSE2-NEXT: addl %edx, %edi +; SSE2-NEXT: setns %sil +; SSE2-NEXT: addl $2147483647, %esi # imm = 0x7FFFFFFF +; SSE2-NEXT: addl %edx, %ecx +; SSE2-NEXT: cmovol %esi, %ecx +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1] +; SSE2-NEXT: movd %xmm2, %edx +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1] +; SSE2-NEXT: movd %xmm2, %eax +; SSE2-NEXT: xorl %edi, %edi +; SSE2-NEXT: movl %eax, %esi +; SSE2-NEXT: addl %edx, %esi +; SSE2-NEXT: setns %dil +; SSE2-NEXT: addl $2147483647, %edi # imm = 0x7FFFFFFF +; SSE2-NEXT: addl %edx, %eax +; SSE2-NEXT: cmovol %edi, %eax +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,1,2,3] +; SSE2-NEXT: movd %xmm1, %r9d +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3] +; SSE2-NEXT: movd %xmm0, %edx +; SSE2-NEXT: xorl %edi, %edi +; SSE2-NEXT: movl %edx, %esi +; SSE2-NEXT: addl %r9d, %esi +; SSE2-NEXT: setns %dil +; SSE2-NEXT: addl $2147483647, %edi # imm = 0x7FFFFFFF +; SSE2-NEXT: addl %r9d, %edx +; SSE2-NEXT: cmovol %edi, %edx +; SSE2-NEXT: movd %edx, %xmm0 +; SSE2-NEXT: movd %eax, %xmm1 +; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE2-NEXT: movd %ecx, %xmm0 +; SSE2-NEXT: movd %r8d, %xmm2 +; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSE2-NEXT: retq +; +; SSSE3-LABEL: v4i32: +; SSSE3: # %bb.0: +; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,2,3] +; SSSE3-NEXT: movd %xmm2, %ecx +; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,2,3] +; SSSE3-NEXT: movd %xmm2, %r8d +; SSSE3-NEXT: xorl %edx, %edx +; SSSE3-NEXT: movl %r8d, %esi +; SSSE3-NEXT: addl %ecx, %esi +; SSSE3-NEXT: setns %dl +; SSSE3-NEXT: addl $2147483647, %edx # imm = 0x7FFFFFFF +; SSSE3-NEXT: addl %ecx, %r8d +; SSSE3-NEXT: cmovol %edx, %r8d +; SSSE3-NEXT: movd %xmm1, %edx +; SSSE3-NEXT: movd %xmm0, %ecx +; SSSE3-NEXT: xorl %esi, %esi +; SSSE3-NEXT: movl %ecx, %edi +; SSSE3-NEXT: addl %edx, %edi +; SSSE3-NEXT: setns %sil +; SSSE3-NEXT: addl $2147483647, %esi # imm = 0x7FFFFFFF +; SSSE3-NEXT: addl %edx, %ecx +; SSSE3-NEXT: cmovol %esi, %ecx +; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1] +; SSSE3-NEXT: movd %xmm2, %edx +; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1] +; SSSE3-NEXT: movd %xmm2, %eax +; SSSE3-NEXT: xorl %edi, %edi +; SSSE3-NEXT: movl %eax, %esi +; SSSE3-NEXT: addl %edx, %esi +; SSSE3-NEXT: setns %dil +; SSSE3-NEXT: addl $2147483647, %edi # imm = 0x7FFFFFFF +; SSSE3-NEXT: addl %edx, %eax +; SSSE3-NEXT: cmovol %edi, %eax +; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,1,2,3] +; SSSE3-NEXT: movd %xmm1, %r9d +; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3] +; SSSE3-NEXT: movd %xmm0, %edx +; SSSE3-NEXT: xorl %edi, %edi +; SSSE3-NEXT: movl %edx, %esi +; SSSE3-NEXT: addl %r9d, %esi +; SSSE3-NEXT: setns %dil +; SSSE3-NEXT: addl $2147483647, %edi # imm = 0x7FFFFFFF +; SSSE3-NEXT: addl %r9d, %edx +; SSSE3-NEXT: cmovol %edi, %edx +; 
SSSE3-NEXT: movd %edx, %xmm0 +; SSSE3-NEXT: movd %eax, %xmm1 +; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSSE3-NEXT: movd %ecx, %xmm0 +; SSSE3-NEXT: movd %r8d, %xmm2 +; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSSE3-NEXT: retq +; +; SSE41-LABEL: v4i32: +; SSE41: # %bb.0: +; SSE41-NEXT: pextrd $3, %xmm1, %ecx +; SSE41-NEXT: pextrd $3, %xmm0, %r8d +; SSE41-NEXT: xorl %edx, %edx +; SSE41-NEXT: movl %r8d, %esi +; SSE41-NEXT: addl %ecx, %esi +; SSE41-NEXT: setns %dl +; SSE41-NEXT: addl $2147483647, %edx # imm = 0x7FFFFFFF +; SSE41-NEXT: addl %ecx, %r8d +; SSE41-NEXT: cmovol %edx, %r8d +; SSE41-NEXT: pextrd $2, %xmm1, %edx +; SSE41-NEXT: pextrd $2, %xmm0, %ecx +; SSE41-NEXT: xorl %esi, %esi +; SSE41-NEXT: movl %ecx, %edi +; SSE41-NEXT: addl %edx, %edi +; SSE41-NEXT: setns %sil +; SSE41-NEXT: addl $2147483647, %esi # imm = 0x7FFFFFFF +; SSE41-NEXT: addl %edx, %ecx +; SSE41-NEXT: cmovol %esi, %ecx +; SSE41-NEXT: movd %xmm1, %edx +; SSE41-NEXT: movd %xmm0, %eax +; SSE41-NEXT: xorl %edi, %edi +; SSE41-NEXT: movl %eax, %esi +; SSE41-NEXT: addl %edx, %esi +; SSE41-NEXT: setns %dil +; SSE41-NEXT: addl $2147483647, %edi # imm = 0x7FFFFFFF +; SSE41-NEXT: addl %edx, %eax +; SSE41-NEXT: cmovol %edi, %eax +; SSE41-NEXT: pextrd $1, %xmm1, %r9d +; SSE41-NEXT: pextrd $1, %xmm0, %edx +; SSE41-NEXT: xorl %edi, %edi +; SSE41-NEXT: movl %edx, %esi +; SSE41-NEXT: addl %r9d, %esi +; SSE41-NEXT: setns %dil +; SSE41-NEXT: addl $2147483647, %edi # imm = 0x7FFFFFFF +; SSE41-NEXT: addl %r9d, %edx +; SSE41-NEXT: cmovol %edi, %edx +; SSE41-NEXT: movd %eax, %xmm0 +; SSE41-NEXT: pinsrd $1, %edx, %xmm0 +; SSE41-NEXT: pinsrd $2, %ecx, %xmm0 +; SSE41-NEXT: pinsrd $3, %r8d, %xmm0 +; SSE41-NEXT: retq +; +; AVX-LABEL: v4i32: +; AVX: # %bb.0: +; AVX-NEXT: vpextrd $3, %xmm1, %ecx +; AVX-NEXT: vpextrd $3, %xmm0, %r9d +; AVX-NEXT: xorl %edx, %edx +; AVX-NEXT: movl %r9d, %esi +; AVX-NEXT: addl %ecx, %esi +; AVX-NEXT: setns %dl +; AVX-NEXT: addl $2147483647, %edx # imm = 0x7FFFFFFF +; AVX-NEXT: addl %ecx, %r9d +; AVX-NEXT: cmovol %edx, %r9d +; AVX-NEXT: vpextrd $2, %xmm1, %edx +; AVX-NEXT: vpextrd $2, %xmm0, %ecx +; AVX-NEXT: xorl %esi, %esi +; AVX-NEXT: movl %ecx, %edi +; AVX-NEXT: addl %edx, %edi +; AVX-NEXT: setns %sil +; AVX-NEXT: addl $2147483647, %esi # imm = 0x7FFFFFFF +; AVX-NEXT: addl %edx, %ecx +; AVX-NEXT: cmovol %esi, %ecx +; AVX-NEXT: vmovd %xmm1, %r8d +; AVX-NEXT: vmovd %xmm0, %edx +; AVX-NEXT: xorl %edi, %edi +; AVX-NEXT: movl %edx, %esi +; AVX-NEXT: addl %r8d, %esi +; AVX-NEXT: setns %dil +; AVX-NEXT: addl $2147483647, %edi # imm = 0x7FFFFFFF +; AVX-NEXT: addl %r8d, %edx +; AVX-NEXT: cmovol %edi, %edx +; AVX-NEXT: vpextrd $1, %xmm1, %r8d +; AVX-NEXT: vpextrd $1, %xmm0, %eax +; AVX-NEXT: xorl %esi, %esi +; AVX-NEXT: movl %eax, %edi +; AVX-NEXT: addl %r8d, %edi +; AVX-NEXT: setns %sil +; AVX-NEXT: addl $2147483647, %esi # imm = 0x7FFFFFFF +; AVX-NEXT: addl %r8d, %eax +; AVX-NEXT: cmovol %esi, %eax +; AVX-NEXT: vmovd %edx, %xmm0 +; AVX-NEXT: vpinsrd $1, %eax, %xmm0, %xmm0 +; AVX-NEXT: vpinsrd $2, %ecx, %xmm0, %xmm0 +; AVX-NEXT: vpinsrd $3, %r9d, %xmm0, %xmm0 +; AVX-NEXT: retq + %z = call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> %x, <4 x i32> %y) + ret <4 x i32> %z +} + +define <2 x i32> @v2i32(<2 x i32> %x, <2 x i32> %y) nounwind { +; SSE2-LABEL: v2i32: +; SSE2: # %bb.0: +; SSE2-NEXT: psllq $32, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1] +; SSE2-NEXT: movq %xmm2, %rax +; SSE2-NEXT: psllq 
$32, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1] +; SSE2-NEXT: movq %xmm2, %rcx +; SSE2-NEXT: xorl %edx, %edx +; SSE2-NEXT: movq %rcx, %rsi +; SSE2-NEXT: addq %rax, %rsi +; SSE2-NEXT: setns %dl +; SSE2-NEXT: movabsq $9223372036854775807, %r8 # imm = 0x7FFFFFFFFFFFFFFF +; SSE2-NEXT: addq %r8, %rdx +; SSE2-NEXT: addq %rax, %rcx +; SSE2-NEXT: cmovoq %rdx, %rcx +; SSE2-NEXT: movq %xmm1, %rax +; SSE2-NEXT: movq %xmm0, %rsi +; SSE2-NEXT: xorl %edi, %edi +; SSE2-NEXT: movq %rsi, %rdx +; SSE2-NEXT: addq %rax, %rdx +; SSE2-NEXT: setns %dil +; SSE2-NEXT: addq %r8, %rdi +; SSE2-NEXT: addq %rax, %rsi +; SSE2-NEXT: cmovoq %rdi, %rsi +; SSE2-NEXT: movq %rsi, %xmm1 +; SSE2-NEXT: movq %rcx, %xmm0 +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,3,2,3] +; SSE2-NEXT: psrad $31, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE2-NEXT: retq +; +; SSSE3-LABEL: v2i32: +; SSSE3: # %bb.0: +; SSSE3-NEXT: psllq $32, %xmm1 +; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1] +; SSSE3-NEXT: movq %xmm2, %rax +; SSSE3-NEXT: psllq $32, %xmm0 +; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1] +; SSSE3-NEXT: movq %xmm2, %rcx +; SSSE3-NEXT: xorl %edx, %edx +; SSSE3-NEXT: movq %rcx, %rsi +; SSSE3-NEXT: addq %rax, %rsi +; SSSE3-NEXT: setns %dl +; SSSE3-NEXT: movabsq $9223372036854775807, %r8 # imm = 0x7FFFFFFFFFFFFFFF +; SSSE3-NEXT: addq %r8, %rdx +; SSSE3-NEXT: addq %rax, %rcx +; SSSE3-NEXT: cmovoq %rdx, %rcx +; SSSE3-NEXT: movq %xmm1, %rax +; SSSE3-NEXT: movq %xmm0, %rsi +; SSSE3-NEXT: xorl %edi, %edi +; SSSE3-NEXT: movq %rsi, %rdx +; SSSE3-NEXT: addq %rax, %rdx +; SSSE3-NEXT: setns %dil +; SSSE3-NEXT: addq %r8, %rdi +; SSSE3-NEXT: addq %rax, %rsi +; SSSE3-NEXT: cmovoq %rdi, %rsi +; SSSE3-NEXT: movq %rsi, %xmm1 +; SSSE3-NEXT: movq %rcx, %xmm0 +; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] +; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,3,2,3] +; SSSE3-NEXT: psrad $31, %xmm1 +; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3] +; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSSE3-NEXT: retq +; +; SSE41-LABEL: v2i32: +; SSE41: # %bb.0: +; SSE41-NEXT: psllq $32, %xmm1 +; SSE41-NEXT: movq %xmm1, %rax +; SSE41-NEXT: psllq $32, %xmm0 +; SSE41-NEXT: movq %xmm0, %rcx +; SSE41-NEXT: xorl %edx, %edx +; SSE41-NEXT: movq %rcx, %rsi +; SSE41-NEXT: addq %rax, %rsi +; SSE41-NEXT: setns %dl +; SSE41-NEXT: movabsq $9223372036854775807, %r8 # imm = 0x7FFFFFFFFFFFFFFF +; SSE41-NEXT: addq %r8, %rdx +; SSE41-NEXT: addq %rax, %rcx +; SSE41-NEXT: cmovoq %rdx, %rcx +; SSE41-NEXT: pextrq $1, %xmm1, %rax +; SSE41-NEXT: pextrq $1, %xmm0, %rsi +; SSE41-NEXT: xorl %edi, %edi +; SSE41-NEXT: movq %rsi, %rdx +; SSE41-NEXT: addq %rax, %rdx +; SSE41-NEXT: setns %dil +; SSE41-NEXT: addq %r8, %rdi +; SSE41-NEXT: addq %rax, %rsi +; SSE41-NEXT: cmovoq %rdi, %rsi +; SSE41-NEXT: movq %rsi, %xmm1 +; SSE41-NEXT: movq %rcx, %xmm0 +; SSE41-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; SSE41-NEXT: psrad $31, %xmm0 +; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3],xmm1[4,5],xmm0[6,7] +; SSE41-NEXT: retq +; +; AVX1-LABEL: v2i32: +; AVX1: # %bb.0: +; AVX1-NEXT: vpsllq $32, %xmm1, %xmm1 +; AVX1-NEXT: vmovq %xmm1, %rax +; AVX1-NEXT: vpsllq $32, %xmm0, %xmm0 +; AVX1-NEXT: vmovq %xmm0, %rcx +; AVX1-NEXT: xorl %edx, %edx +; AVX1-NEXT: movq %rcx, %rsi +; AVX1-NEXT: addq %rax, %rsi +; AVX1-NEXT: setns %dl +; AVX1-NEXT: movabsq 
$9223372036854775807, %r8 # imm = 0x7FFFFFFFFFFFFFFF +; AVX1-NEXT: addq %r8, %rdx +; AVX1-NEXT: addq %rax, %rcx +; AVX1-NEXT: cmovoq %rdx, %rcx +; AVX1-NEXT: vpextrq $1, %xmm1, %rax +; AVX1-NEXT: vpextrq $1, %xmm0, %rsi +; AVX1-NEXT: xorl %edi, %edi +; AVX1-NEXT: movq %rsi, %rdx +; AVX1-NEXT: addq %rax, %rdx +; AVX1-NEXT: setns %dil +; AVX1-NEXT: addq %r8, %rdi +; AVX1-NEXT: addq %rax, %rsi +; AVX1-NEXT: cmovoq %rdi, %rsi +; AVX1-NEXT: vmovq %rsi, %xmm0 +; AVX1-NEXT: vmovq %rcx, %xmm1 +; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; AVX1-NEXT: vpsrad $31, %xmm0, %xmm1 +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] +; AVX1-NEXT: retq +; +; AVX2-LABEL: v2i32: +; AVX2: # %bb.0: +; AVX2-NEXT: vpsllq $32, %xmm1, %xmm1 +; AVX2-NEXT: vmovq %xmm1, %rax +; AVX2-NEXT: vpsllq $32, %xmm0, %xmm0 +; AVX2-NEXT: vmovq %xmm0, %rcx +; AVX2-NEXT: xorl %edx, %edx +; AVX2-NEXT: movq %rcx, %rsi +; AVX2-NEXT: addq %rax, %rsi +; AVX2-NEXT: setns %dl +; AVX2-NEXT: movabsq $9223372036854775807, %r8 # imm = 0x7FFFFFFFFFFFFFFF +; AVX2-NEXT: addq %r8, %rdx +; AVX2-NEXT: addq %rax, %rcx +; AVX2-NEXT: cmovoq %rdx, %rcx +; AVX2-NEXT: vpextrq $1, %xmm1, %rax +; AVX2-NEXT: vpextrq $1, %xmm0, %rsi +; AVX2-NEXT: xorl %edi, %edi +; AVX2-NEXT: movq %rsi, %rdx +; AVX2-NEXT: addq %rax, %rdx +; AVX2-NEXT: setns %dil +; AVX2-NEXT: addq %r8, %rdi +; AVX2-NEXT: addq %rax, %rsi +; AVX2-NEXT: cmovoq %rdi, %rsi +; AVX2-NEXT: vmovq %rsi, %xmm0 +; AVX2-NEXT: vmovq %rcx, %xmm1 +; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; AVX2-NEXT: vpsrad $31, %xmm0, %xmm1 +; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] +; AVX2-NEXT: retq +; +; AVX512-LABEL: v2i32: +; AVX512: # %bb.0: +; AVX512-NEXT: vpsllq $32, %xmm1, %xmm1 +; AVX512-NEXT: vmovq %xmm1, %rax +; AVX512-NEXT: vpsllq $32, %xmm0, %xmm0 +; AVX512-NEXT: vmovq %xmm0, %rcx +; AVX512-NEXT: xorl %edx, %edx +; AVX512-NEXT: movq %rcx, %rsi +; AVX512-NEXT: addq %rax, %rsi +; AVX512-NEXT: setns %dl +; AVX512-NEXT: movabsq $9223372036854775807, %r8 # imm = 0x7FFFFFFFFFFFFFFF +; AVX512-NEXT: addq %r8, %rdx +; AVX512-NEXT: addq %rax, %rcx +; AVX512-NEXT: cmovoq %rdx, %rcx +; AVX512-NEXT: vpextrq $1, %xmm1, %rax +; AVX512-NEXT: vpextrq $1, %xmm0, %rsi +; AVX512-NEXT: xorl %edi, %edi +; AVX512-NEXT: movq %rsi, %rdx +; AVX512-NEXT: addq %rax, %rdx +; AVX512-NEXT: setns %dil +; AVX512-NEXT: addq %r8, %rdi +; AVX512-NEXT: addq %rax, %rsi +; AVX512-NEXT: cmovoq %rdi, %rsi +; AVX512-NEXT: vmovq %rsi, %xmm0 +; AVX512-NEXT: vmovq %rcx, %xmm1 +; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; AVX512-NEXT: vpsraq $32, %xmm0, %xmm0 +; AVX512-NEXT: retq + %z = call <2 x i32> @llvm.sadd.sat.v2i32(<2 x i32> %x, <2 x i32> %y) + ret <2 x i32> %z +} + +define <4 x i24> @v4i24(<4 x i24> %x, <4 x i24> %y) nounwind { +; SSE2-LABEL: v4i24: +; SSE2: # %bb.0: +; SSE2-NEXT: pslld $8, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,2,3] +; SSE2-NEXT: movd %xmm2, %ecx +; SSE2-NEXT: pslld $8, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,2,3] +; SSE2-NEXT: movd %xmm2, %r8d +; SSE2-NEXT: xorl %edx, %edx +; SSE2-NEXT: movl %r8d, %esi +; SSE2-NEXT: addl %ecx, %esi +; SSE2-NEXT: setns %dl +; SSE2-NEXT: addl $2147483647, %edx # imm = 0x7FFFFFFF +; SSE2-NEXT: addl %ecx, %r8d +; SSE2-NEXT: cmovol %edx, %r8d +; SSE2-NEXT: movd %xmm1, %edx +; SSE2-NEXT: movd %xmm0, %ecx +; SSE2-NEXT: xorl %esi, %esi +; SSE2-NEXT: movl %ecx, %edi +; 
SSE2-NEXT: addl %edx, %edi +; SSE2-NEXT: setns %sil +; SSE2-NEXT: addl $2147483647, %esi # imm = 0x7FFFFFFF +; SSE2-NEXT: addl %edx, %ecx +; SSE2-NEXT: cmovol %esi, %ecx +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1] +; SSE2-NEXT: movd %xmm2, %edx +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1] +; SSE2-NEXT: movd %xmm2, %eax +; SSE2-NEXT: xorl %edi, %edi +; SSE2-NEXT: movl %eax, %esi +; SSE2-NEXT: addl %edx, %esi +; SSE2-NEXT: setns %dil +; SSE2-NEXT: addl $2147483647, %edi # imm = 0x7FFFFFFF +; SSE2-NEXT: addl %edx, %eax +; SSE2-NEXT: cmovol %edi, %eax +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,1,2,3] +; SSE2-NEXT: movd %xmm1, %r9d +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3] +; SSE2-NEXT: movd %xmm0, %edx +; SSE2-NEXT: xorl %edi, %edi +; SSE2-NEXT: movl %edx, %esi +; SSE2-NEXT: addl %r9d, %esi +; SSE2-NEXT: setns %dil +; SSE2-NEXT: addl $2147483647, %edi # imm = 0x7FFFFFFF +; SSE2-NEXT: addl %r9d, %edx +; SSE2-NEXT: cmovol %edi, %edx +; SSE2-NEXT: movd %edx, %xmm0 +; SSE2-NEXT: movd %eax, %xmm1 +; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE2-NEXT: movd %ecx, %xmm0 +; SSE2-NEXT: movd %r8d, %xmm2 +; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSE2-NEXT: psrad $8, %xmm0 +; SSE2-NEXT: retq +; +; SSSE3-LABEL: v4i24: +; SSSE3: # %bb.0: +; SSSE3-NEXT: pslld $8, %xmm1 +; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,2,3] +; SSSE3-NEXT: movd %xmm2, %ecx +; SSSE3-NEXT: pslld $8, %xmm0 +; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,2,3] +; SSSE3-NEXT: movd %xmm2, %r8d +; SSSE3-NEXT: xorl %edx, %edx +; SSSE3-NEXT: movl %r8d, %esi +; SSSE3-NEXT: addl %ecx, %esi +; SSSE3-NEXT: setns %dl +; SSSE3-NEXT: addl $2147483647, %edx # imm = 0x7FFFFFFF +; SSSE3-NEXT: addl %ecx, %r8d +; SSSE3-NEXT: cmovol %edx, %r8d +; SSSE3-NEXT: movd %xmm1, %edx +; SSSE3-NEXT: movd %xmm0, %ecx +; SSSE3-NEXT: xorl %esi, %esi +; SSSE3-NEXT: movl %ecx, %edi +; SSSE3-NEXT: addl %edx, %edi +; SSSE3-NEXT: setns %sil +; SSSE3-NEXT: addl $2147483647, %esi # imm = 0x7FFFFFFF +; SSSE3-NEXT: addl %edx, %ecx +; SSSE3-NEXT: cmovol %esi, %ecx +; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1] +; SSSE3-NEXT: movd %xmm2, %edx +; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1] +; SSSE3-NEXT: movd %xmm2, %eax +; SSSE3-NEXT: xorl %edi, %edi +; SSSE3-NEXT: movl %eax, %esi +; SSSE3-NEXT: addl %edx, %esi +; SSSE3-NEXT: setns %dil +; SSSE3-NEXT: addl $2147483647, %edi # imm = 0x7FFFFFFF +; SSSE3-NEXT: addl %edx, %eax +; SSSE3-NEXT: cmovol %edi, %eax +; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,1,2,3] +; SSSE3-NEXT: movd %xmm1, %r9d +; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3] +; SSSE3-NEXT: movd %xmm0, %edx +; SSSE3-NEXT: xorl %edi, %edi +; SSSE3-NEXT: movl %edx, %esi +; SSSE3-NEXT: addl %r9d, %esi +; SSSE3-NEXT: setns %dil +; SSSE3-NEXT: addl $2147483647, %edi # imm = 0x7FFFFFFF +; SSSE3-NEXT: addl %r9d, %edx +; SSSE3-NEXT: cmovol %edi, %edx +; SSSE3-NEXT: movd %edx, %xmm0 +; SSSE3-NEXT: movd %eax, %xmm1 +; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSSE3-NEXT: movd %ecx, %xmm0 +; SSSE3-NEXT: movd %r8d, %xmm2 +; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSSE3-NEXT: psrad $8, %xmm0 +; SSSE3-NEXT: retq +; +; SSE41-LABEL: v4i24: +; SSE41: # %bb.0: +; SSE41-NEXT: pslld $8, %xmm1 +; SSE41-NEXT: pextrd $3, %xmm1, %ecx +; SSE41-NEXT: pslld $8, %xmm0 +; SSE41-NEXT: pextrd $3, %xmm0, %r8d +; SSE41-NEXT: 
xorl %edx, %edx +; SSE41-NEXT: movl %r8d, %esi +; SSE41-NEXT: addl %ecx, %esi +; SSE41-NEXT: setns %dl +; SSE41-NEXT: addl $2147483647, %edx # imm = 0x7FFFFFFF +; SSE41-NEXT: addl %ecx, %r8d +; SSE41-NEXT: cmovol %edx, %r8d +; SSE41-NEXT: pextrd $2, %xmm1, %edx +; SSE41-NEXT: pextrd $2, %xmm0, %ecx +; SSE41-NEXT: xorl %esi, %esi +; SSE41-NEXT: movl %ecx, %edi +; SSE41-NEXT: addl %edx, %edi +; SSE41-NEXT: setns %sil +; SSE41-NEXT: addl $2147483647, %esi # imm = 0x7FFFFFFF +; SSE41-NEXT: addl %edx, %ecx +; SSE41-NEXT: cmovol %esi, %ecx +; SSE41-NEXT: movd %xmm1, %edx +; SSE41-NEXT: movd %xmm0, %eax +; SSE41-NEXT: xorl %edi, %edi +; SSE41-NEXT: movl %eax, %esi +; SSE41-NEXT: addl %edx, %esi +; SSE41-NEXT: setns %dil +; SSE41-NEXT: addl $2147483647, %edi # imm = 0x7FFFFFFF +; SSE41-NEXT: addl %edx, %eax +; SSE41-NEXT: cmovol %edi, %eax +; SSE41-NEXT: pextrd $1, %xmm1, %r9d +; SSE41-NEXT: pextrd $1, %xmm0, %edx +; SSE41-NEXT: xorl %edi, %edi +; SSE41-NEXT: movl %edx, %esi +; SSE41-NEXT: addl %r9d, %esi +; SSE41-NEXT: setns %dil +; SSE41-NEXT: addl $2147483647, %edi # imm = 0x7FFFFFFF +; SSE41-NEXT: addl %r9d, %edx +; SSE41-NEXT: cmovol %edi, %edx +; SSE41-NEXT: movd %eax, %xmm0 +; SSE41-NEXT: pinsrd $1, %edx, %xmm0 +; SSE41-NEXT: pinsrd $2, %ecx, %xmm0 +; SSE41-NEXT: pinsrd $3, %r8d, %xmm0 +; SSE41-NEXT: psrad $8, %xmm0 +; SSE41-NEXT: retq +; +; AVX-LABEL: v4i24: +; AVX: # %bb.0: +; AVX-NEXT: vpslld $8, %xmm1, %xmm1 +; AVX-NEXT: vpextrd $3, %xmm1, %ecx +; AVX-NEXT: vpslld $8, %xmm0, %xmm0 +; AVX-NEXT: vpextrd $3, %xmm0, %r9d +; AVX-NEXT: xorl %edx, %edx +; AVX-NEXT: movl %r9d, %esi +; AVX-NEXT: addl %ecx, %esi +; AVX-NEXT: setns %dl +; AVX-NEXT: addl $2147483647, %edx # imm = 0x7FFFFFFF +; AVX-NEXT: addl %ecx, %r9d +; AVX-NEXT: cmovol %edx, %r9d +; AVX-NEXT: vpextrd $2, %xmm1, %edx +; AVX-NEXT: vpextrd $2, %xmm0, %ecx +; AVX-NEXT: xorl %esi, %esi +; AVX-NEXT: movl %ecx, %edi +; AVX-NEXT: addl %edx, %edi +; AVX-NEXT: setns %sil +; AVX-NEXT: addl $2147483647, %esi # imm = 0x7FFFFFFF +; AVX-NEXT: addl %edx, %ecx +; AVX-NEXT: cmovol %esi, %ecx +; AVX-NEXT: vmovd %xmm1, %r8d +; AVX-NEXT: vmovd %xmm0, %edx +; AVX-NEXT: xorl %edi, %edi +; AVX-NEXT: movl %edx, %esi +; AVX-NEXT: addl %r8d, %esi +; AVX-NEXT: setns %dil +; AVX-NEXT: addl $2147483647, %edi # imm = 0x7FFFFFFF +; AVX-NEXT: addl %r8d, %edx +; AVX-NEXT: cmovol %edi, %edx +; AVX-NEXT: vpextrd $1, %xmm1, %r8d +; AVX-NEXT: vpextrd $1, %xmm0, %eax +; AVX-NEXT: xorl %esi, %esi +; AVX-NEXT: movl %eax, %edi +; AVX-NEXT: addl %r8d, %edi +; AVX-NEXT: setns %sil +; AVX-NEXT: addl $2147483647, %esi # imm = 0x7FFFFFFF +; AVX-NEXT: addl %r8d, %eax +; AVX-NEXT: cmovol %esi, %eax +; AVX-NEXT: vmovd %edx, %xmm0 +; AVX-NEXT: vpinsrd $1, %eax, %xmm0, %xmm0 +; AVX-NEXT: vpinsrd $2, %ecx, %xmm0, %xmm0 +; AVX-NEXT: vpinsrd $3, %r9d, %xmm0, %xmm0 +; AVX-NEXT: vpsrad $8, %xmm0, %xmm0 +; AVX-NEXT: retq + %z = call <4 x i24> @llvm.sadd.sat.v4i24(<4 x i24> %x, <4 x i24> %y) + ret <4 x i24> %z +} + +define <2 x i128> @v2i128(<2 x i128> %x, <2 x i128> %y) nounwind { +; SSE-LABEL: v2i128: +; SSE: # %bb.0: +; SSE-NEXT: pushq %r15 +; SSE-NEXT: pushq %r14 +; SSE-NEXT: pushq %r13 +; SSE-NEXT: pushq %r12 +; SSE-NEXT: pushq %rbx +; SSE-NEXT: movq %rdi, %rax +; SSE-NEXT: movq {{[0-9]+}}(%rsp), %r11 +; SSE-NEXT: movq {{[0-9]+}}(%rsp), %r14 +; SSE-NEXT: addq {{[0-9]+}}(%rsp), %rcx +; SSE-NEXT: movq %r8, %r13 +; SSE-NEXT: adcq %r14, %r13 +; SSE-NEXT: movq %r13, %r10 +; SSE-NEXT: sarq $63, %r10 +; SSE-NEXT: xorl %edi, %edi +; SSE-NEXT: testq %r13, %r13 +; SSE-NEXT: setns %dil +; 
SSE-NEXT: movabsq $9223372036854775807, %r12 # imm = 0x7FFFFFFFFFFFFFFF +; SSE-NEXT: leaq (%rdi,%r12), %r15 +; SSE-NEXT: testq %r8, %r8 +; SSE-NEXT: setns %r8b +; SSE-NEXT: cmpb %dil, %r8b +; SSE-NEXT: setne %dil +; SSE-NEXT: testq %r14, %r14 +; SSE-NEXT: setns %bl +; SSE-NEXT: cmpb %bl, %r8b +; SSE-NEXT: sete %bl +; SSE-NEXT: testb %dil, %bl +; SSE-NEXT: cmoveq %r13, %r15 +; SSE-NEXT: cmoveq %rcx, %r10 +; SSE-NEXT: addq %r9, %rsi +; SSE-NEXT: movq %rdx, %rdi +; SSE-NEXT: adcq %r11, %rdi +; SSE-NEXT: setns %bl +; SSE-NEXT: movzbl %bl, %ebx +; SSE-NEXT: addq %rbx, %r12 +; SSE-NEXT: movq %rdi, %rcx +; SSE-NEXT: sarq $63, %rcx +; SSE-NEXT: testq %r11, %r11 +; SSE-NEXT: setns %r8b +; SSE-NEXT: testq %rdx, %rdx +; SSE-NEXT: setns %dl +; SSE-NEXT: cmpb %r8b, %dl +; SSE-NEXT: sete %r8b +; SSE-NEXT: cmpb %bl, %dl +; SSE-NEXT: setne %dl +; SSE-NEXT: testb %dl, %r8b +; SSE-NEXT: cmoveq %rsi, %rcx +; SSE-NEXT: cmoveq %rdi, %r12 +; SSE-NEXT: movq %r15, 24(%rax) +; SSE-NEXT: movq %r10, 16(%rax) +; SSE-NEXT: movq %r12, 8(%rax) +; SSE-NEXT: movq %rcx, (%rax) +; SSE-NEXT: popq %rbx +; SSE-NEXT: popq %r12 +; SSE-NEXT: popq %r13 +; SSE-NEXT: popq %r14 +; SSE-NEXT: popq %r15 +; SSE-NEXT: retq +; +; AVX-LABEL: v2i128: +; AVX: # %bb.0: +; AVX-NEXT: pushq %r15 +; AVX-NEXT: pushq %r14 +; AVX-NEXT: pushq %r13 +; AVX-NEXT: pushq %r12 +; AVX-NEXT: pushq %rbx +; AVX-NEXT: movq %rdi, %rax +; AVX-NEXT: movq {{[0-9]+}}(%rsp), %r11 +; AVX-NEXT: movq {{[0-9]+}}(%rsp), %r14 +; AVX-NEXT: addq {{[0-9]+}}(%rsp), %rcx +; AVX-NEXT: movq %r8, %r13 +; AVX-NEXT: adcq %r14, %r13 +; AVX-NEXT: movq %r13, %r10 +; AVX-NEXT: sarq $63, %r10 +; AVX-NEXT: xorl %edi, %edi +; AVX-NEXT: testq %r13, %r13 +; AVX-NEXT: setns %dil +; AVX-NEXT: movabsq $9223372036854775807, %r12 # imm = 0x7FFFFFFFFFFFFFFF +; AVX-NEXT: leaq (%rdi,%r12), %r15 +; AVX-NEXT: testq %r8, %r8 +; AVX-NEXT: setns %r8b +; AVX-NEXT: cmpb %dil, %r8b +; AVX-NEXT: setne %dil +; AVX-NEXT: testq %r14, %r14 +; AVX-NEXT: setns %bl +; AVX-NEXT: cmpb %bl, %r8b +; AVX-NEXT: sete %bl +; AVX-NEXT: testb %dil, %bl +; AVX-NEXT: cmoveq %r13, %r15 +; AVX-NEXT: cmoveq %rcx, %r10 +; AVX-NEXT: addq %r9, %rsi +; AVX-NEXT: movq %rdx, %rdi +; AVX-NEXT: adcq %r11, %rdi +; AVX-NEXT: setns %bl +; AVX-NEXT: movzbl %bl, %ebx +; AVX-NEXT: addq %rbx, %r12 +; AVX-NEXT: movq %rdi, %rcx +; AVX-NEXT: sarq $63, %rcx +; AVX-NEXT: testq %r11, %r11 +; AVX-NEXT: setns %r8b +; AVX-NEXT: testq %rdx, %rdx +; AVX-NEXT: setns %dl +; AVX-NEXT: cmpb %r8b, %dl +; AVX-NEXT: sete %r8b +; AVX-NEXT: cmpb %bl, %dl +; AVX-NEXT: setne %dl +; AVX-NEXT: testb %dl, %r8b +; AVX-NEXT: cmoveq %rsi, %rcx +; AVX-NEXT: cmoveq %rdi, %r12 +; AVX-NEXT: movq %r15, 24(%rax) +; AVX-NEXT: movq %r10, 16(%rax) +; AVX-NEXT: movq %r12, 8(%rax) +; AVX-NEXT: movq %rcx, (%rax) +; AVX-NEXT: popq %rbx +; AVX-NEXT: popq %r12 +; AVX-NEXT: popq %r13 +; AVX-NEXT: popq %r14 +; AVX-NEXT: popq %r15 +; AVX-NEXT: retq + %z = call <2 x i128> @llvm.sadd.sat.v2i128(<2 x i128> %x, <2 x i128> %y) + ret <2 x i128> %z +} diff --git a/test/CodeGen/X86/ssub_sat_vec.ll b/test/CodeGen/X86/ssub_sat_vec.ll new file mode 100644 index 00000000000..dea002f8705 --- /dev/null +++ b/test/CodeGen/X86/ssub_sat_vec.ll @@ -0,0 +1,20167 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=SSE,SSE2 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+ssse3 | FileCheck %s --check-prefixes=SSE,SSSE3 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 
| FileCheck %s --check-prefixes=SSE,SSE41 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX,AVX512 + +declare <1 x i8> @llvm.ssub.sat.v1i8(<1 x i8>, <1 x i8>) +declare <2 x i8> @llvm.ssub.sat.v2i8(<2 x i8>, <2 x i8>) +declare <4 x i8> @llvm.ssub.sat.v4i8(<4 x i8>, <4 x i8>) +declare <8 x i8> @llvm.ssub.sat.v8i8(<8 x i8>, <8 x i8>) +declare <12 x i8> @llvm.ssub.sat.v12i8(<12 x i8>, <12 x i8>) +declare <16 x i8> @llvm.ssub.sat.v16i8(<16 x i8>, <16 x i8>) +declare <32 x i8> @llvm.ssub.sat.v32i8(<32 x i8>, <32 x i8>) +declare <64 x i8> @llvm.ssub.sat.v64i8(<64 x i8>, <64 x i8>) + +declare <1 x i16> @llvm.ssub.sat.v1i16(<1 x i16>, <1 x i16>) +declare <2 x i16> @llvm.ssub.sat.v2i16(<2 x i16>, <2 x i16>) +declare <4 x i16> @llvm.ssub.sat.v4i16(<4 x i16>, <4 x i16>) +declare <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16>, <8 x i16>) +declare <12 x i16> @llvm.ssub.sat.v12i16(<12 x i16>, <12 x i16>) +declare <16 x i16> @llvm.ssub.sat.v16i16(<16 x i16>, <16 x i16>) +declare <32 x i16> @llvm.ssub.sat.v32i16(<32 x i16>, <32 x i16>) + +declare <16 x i1> @llvm.ssub.sat.v16i1(<16 x i1>, <16 x i1>) +declare <16 x i4> @llvm.ssub.sat.v16i4(<16 x i4>, <16 x i4>) + +declare <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32>, <4 x i32>) +declare <2 x i32> @llvm.ssub.sat.v2i32(<2 x i32>, <2 x i32>) +declare <4 x i24> @llvm.ssub.sat.v4i24(<4 x i24>, <4 x i24>) +declare <2 x i128> @llvm.ssub.sat.v2i128(<2 x i128>, <2 x i128>) + +; Legal types, depending on architecture. + +define <16 x i8> @v16i8(<16 x i8> %x, <16 x i8> %y) nounwind { +; SSE2-LABEL: v16i8: +; SSE2: # %bb.0: +; SSE2-NEXT: pushq %rbp +; SSE2-NEXT: pushq %r15 +; SSE2-NEXT: pushq %r14 +; SSE2-NEXT: pushq %r13 +; SSE2-NEXT: pushq %r12 +; SSE2-NEXT: pushq %rbx +; SSE2-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) +; SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %r9b +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %sil +; SSE2-NEXT: movl %r9d, %eax +; SSE2-NEXT: subb %cl, %al +; SSE2-NEXT: setns %al +; SSE2-NEXT: subb %cl, %r9b +; SSE2-NEXT: jno .LBB0_2 +; SSE2-NEXT: # %bb.1: +; SSE2-NEXT: addb $127, %al +; SSE2-NEXT: movl %eax, %r9d +; SSE2-NEXT: .LBB0_2: +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSE2-NEXT: movl %esi, %eax +; SSE2-NEXT: subb %cl, %al +; SSE2-NEXT: setns %al +; SSE2-NEXT: subb %cl, %sil +; SSE2-NEXT: jno .LBB0_4 +; SSE2-NEXT: # %bb.3: +; SSE2-NEXT: addb $127, %al +; SSE2-NEXT: movl %eax, %esi +; SSE2-NEXT: .LBB0_4: +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %bl +; SSE2-NEXT: movl %ebx, %eax +; SSE2-NEXT: subb %cl, %al +; SSE2-NEXT: setns %al +; SSE2-NEXT: subb %cl, %bl +; SSE2-NEXT: jno .LBB0_6 +; SSE2-NEXT: # %bb.5: +; SSE2-NEXT: addb $127, %al +; SSE2-NEXT: movl %eax, %ebx +; SSE2-NEXT: .LBB0_6: +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %dl +; SSE2-NEXT: movl %edx, %eax +; SSE2-NEXT: subb %cl, %al +; SSE2-NEXT: setns %al +; SSE2-NEXT: subb %cl, %dl +; SSE2-NEXT: jno .LBB0_8 +; SSE2-NEXT: # %bb.7: +; SSE2-NEXT: addb $127, %al +; SSE2-NEXT: movl %eax, %edx +; SSE2-NEXT: .LBB0_8: +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %r10b +; SSE2-NEXT: movl %r10d, %eax +; SSE2-NEXT: subb %cl, %al +; SSE2-NEXT: 
setns %al +; SSE2-NEXT: subb %cl, %r10b +; SSE2-NEXT: jno .LBB0_10 +; SSE2-NEXT: # %bb.9: +; SSE2-NEXT: addb $127, %al +; SSE2-NEXT: movl %eax, %r10d +; SSE2-NEXT: .LBB0_10: +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %r11b +; SSE2-NEXT: movl %r11d, %eax +; SSE2-NEXT: subb %cl, %al +; SSE2-NEXT: setns %al +; SSE2-NEXT: subb %cl, %r11b +; SSE2-NEXT: jno .LBB0_12 +; SSE2-NEXT: # %bb.11: +; SSE2-NEXT: addb $127, %al +; SSE2-NEXT: movl %eax, %r11d +; SSE2-NEXT: .LBB0_12: +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %bpl +; SSE2-NEXT: movl %ebp, %eax +; SSE2-NEXT: subb %cl, %al +; SSE2-NEXT: setns %al +; SSE2-NEXT: subb %cl, %bpl +; SSE2-NEXT: jno .LBB0_14 +; SSE2-NEXT: # %bb.13: +; SSE2-NEXT: addb $127, %al +; SSE2-NEXT: movl %eax, %ebp +; SSE2-NEXT: .LBB0_14: +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %r14b +; SSE2-NEXT: movl %r14d, %eax +; SSE2-NEXT: subb %cl, %al +; SSE2-NEXT: setns %al +; SSE2-NEXT: subb %cl, %r14b +; SSE2-NEXT: jno .LBB0_16 +; SSE2-NEXT: # %bb.15: +; SSE2-NEXT: addb $127, %al +; SSE2-NEXT: movl %eax, %r14d +; SSE2-NEXT: .LBB0_16: +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %r15b +; SSE2-NEXT: movl %r15d, %eax +; SSE2-NEXT: subb %cl, %al +; SSE2-NEXT: setns %al +; SSE2-NEXT: subb %cl, %r15b +; SSE2-NEXT: jno .LBB0_18 +; SSE2-NEXT: # %bb.17: +; SSE2-NEXT: addb $127, %al +; SSE2-NEXT: movl %eax, %r15d +; SSE2-NEXT: .LBB0_18: +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %r12b +; SSE2-NEXT: movl %r12d, %eax +; SSE2-NEXT: subb %cl, %al +; SSE2-NEXT: setns %al +; SSE2-NEXT: subb %cl, %r12b +; SSE2-NEXT: jno .LBB0_20 +; SSE2-NEXT: # %bb.19: +; SSE2-NEXT: addb $127, %al +; SSE2-NEXT: movl %eax, %r12d +; SSE2-NEXT: .LBB0_20: +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %r13b +; SSE2-NEXT: movl %r13d, %eax +; SSE2-NEXT: subb %cl, %al +; SSE2-NEXT: setns %al +; SSE2-NEXT: subb %cl, %r13b +; SSE2-NEXT: movb %bl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSE2-NEXT: movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSE2-NEXT: jno .LBB0_22 +; SSE2-NEXT: # %bb.21: +; SSE2-NEXT: addb $127, %al +; SSE2-NEXT: movl %eax, %r13d +; SSE2-NEXT: .LBB0_22: +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %dil +; SSE2-NEXT: movl %edi, %eax +; SSE2-NEXT: subb %cl, %al +; SSE2-NEXT: setns %al +; SSE2-NEXT: subb %cl, %dil +; SSE2-NEXT: jno .LBB0_24 +; SSE2-NEXT: # %bb.23: +; SSE2-NEXT: addb $127, %al +; SSE2-NEXT: movl %eax, %edi +; SSE2-NEXT: .LBB0_24: +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %r8b +; SSE2-NEXT: movl %r8d, %eax +; SSE2-NEXT: subb %cl, %al +; SSE2-NEXT: setns %al +; SSE2-NEXT: subb %cl, %r8b +; SSE2-NEXT: jno .LBB0_26 +; SSE2-NEXT: # %bb.25: +; SSE2-NEXT: addb $127, %al +; SSE2-NEXT: movl %eax, %r8d +; SSE2-NEXT: .LBB0_26: +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %bl +; SSE2-NEXT: movl %ebx, %eax +; SSE2-NEXT: subb %cl, %al +; SSE2-NEXT: setns %al +; SSE2-NEXT: subb %cl, %bl +; SSE2-NEXT: jno .LBB0_28 +; SSE2-NEXT: # %bb.27: +; SSE2-NEXT: addb $127, %al +; SSE2-NEXT: movl %eax, %ebx +; SSE2-NEXT: .LBB0_28: +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %dl +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSE2-NEXT: movl %eax, %ecx +; SSE2-NEXT: subb %dl, %cl +; SSE2-NEXT: setns %cl +; SSE2-NEXT: subb %dl, %al +; SSE2-NEXT: movb %sil, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; 
SSE2-NEXT: jno .LBB0_30 +; SSE2-NEXT: # %bb.29: +; SSE2-NEXT: addb $127, %cl +; SSE2-NEXT: movl %ecx, %eax +; SSE2-NEXT: .LBB0_30: +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %dl +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %sil +; SSE2-NEXT: movl %esi, %ecx +; SSE2-NEXT: subb %dl, %cl +; SSE2-NEXT: setns %cl +; SSE2-NEXT: subb %dl, %sil +; SSE2-NEXT: jno .LBB0_32 +; SSE2-NEXT: # %bb.31: +; SSE2-NEXT: addb $127, %cl +; SSE2-NEXT: movl %ecx, %esi +; SSE2-NEXT: .LBB0_32: +; SSE2-NEXT: movzbl %sil, %ecx +; SSE2-NEXT: movd %ecx, %xmm0 +; SSE2-NEXT: movzbl %al, %eax +; SSE2-NEXT: movd %eax, %xmm1 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; SSE2-NEXT: movzbl %bl, %eax +; SSE2-NEXT: movd %eax, %xmm0 +; SSE2-NEXT: movzbl %r8b, %eax +; SSE2-NEXT: movd %eax, %xmm2 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; SSE2-NEXT: movzbl %dil, %eax +; SSE2-NEXT: movd %eax, %xmm0 +; SSE2-NEXT: movzbl %r13b, %eax +; SSE2-NEXT: movd %eax, %xmm3 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] +; SSE2-NEXT: movzbl %r12b, %eax +; SSE2-NEXT: movd %eax, %xmm0 +; SSE2-NEXT: movzbl %r15b, %eax +; SSE2-NEXT: movd %eax, %xmm1 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; SSE2-NEXT: movzbl %r14b, %eax +; SSE2-NEXT: movd %eax, %xmm0 +; SSE2-NEXT: movzbl %bpl, %eax +; SSE2-NEXT: movd %eax, %xmm2 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] +; SSE2-NEXT: movzbl %r11b, %eax +; SSE2-NEXT: movd %eax, %xmm0 +; SSE2-NEXT: movzbl %r10b, %eax +; SSE2-NEXT: movd %eax, %xmm3 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; SSE2-NEXT: movd %eax, %xmm0 +; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; SSE2-NEXT: movd %eax, %xmm2 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] +; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; SSE2-NEXT: movd %eax, %xmm4 +; SSE2-NEXT: movzbl %r9b, %eax +; SSE2-NEXT: movd %eax, %xmm0 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = 
xmm0[0],xmm1[0] +; SSE2-NEXT: popq %rbx +; SSE2-NEXT: popq %r12 +; SSE2-NEXT: popq %r13 +; SSE2-NEXT: popq %r14 +; SSE2-NEXT: popq %r15 +; SSE2-NEXT: popq %rbp +; SSE2-NEXT: retq +; +; SSSE3-LABEL: v16i8: +; SSSE3: # %bb.0: +; SSSE3-NEXT: pushq %rbp +; SSSE3-NEXT: pushq %r15 +; SSSE3-NEXT: pushq %r14 +; SSSE3-NEXT: pushq %r13 +; SSSE3-NEXT: pushq %r12 +; SSSE3-NEXT: pushq %rbx +; SSSE3-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) +; SSSE3-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %r9b +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %sil +; SSSE3-NEXT: movl %r9d, %eax +; SSSE3-NEXT: subb %cl, %al +; SSSE3-NEXT: setns %al +; SSSE3-NEXT: subb %cl, %r9b +; SSSE3-NEXT: jno .LBB0_2 +; SSSE3-NEXT: # %bb.1: +; SSSE3-NEXT: addb $127, %al +; SSSE3-NEXT: movl %eax, %r9d +; SSSE3-NEXT: .LBB0_2: +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSSE3-NEXT: movl %esi, %eax +; SSSE3-NEXT: subb %cl, %al +; SSSE3-NEXT: setns %al +; SSSE3-NEXT: subb %cl, %sil +; SSSE3-NEXT: jno .LBB0_4 +; SSSE3-NEXT: # %bb.3: +; SSSE3-NEXT: addb $127, %al +; SSSE3-NEXT: movl %eax, %esi +; SSSE3-NEXT: .LBB0_4: +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %bl +; SSSE3-NEXT: movl %ebx, %eax +; SSSE3-NEXT: subb %cl, %al +; SSSE3-NEXT: setns %al +; SSSE3-NEXT: subb %cl, %bl +; SSSE3-NEXT: jno .LBB0_6 +; SSSE3-NEXT: # %bb.5: +; SSSE3-NEXT: addb $127, %al +; SSSE3-NEXT: movl %eax, %ebx +; SSSE3-NEXT: .LBB0_6: +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %dl +; SSSE3-NEXT: movl %edx, %eax +; SSSE3-NEXT: subb %cl, %al +; SSSE3-NEXT: setns %al +; SSSE3-NEXT: subb %cl, %dl +; SSSE3-NEXT: jno .LBB0_8 +; SSSE3-NEXT: # %bb.7: +; SSSE3-NEXT: addb $127, %al +; SSSE3-NEXT: movl %eax, %edx +; SSSE3-NEXT: .LBB0_8: +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %r10b +; SSSE3-NEXT: movl %r10d, %eax +; SSSE3-NEXT: subb %cl, %al +; SSSE3-NEXT: setns %al +; SSSE3-NEXT: subb %cl, %r10b +; SSSE3-NEXT: jno .LBB0_10 +; SSSE3-NEXT: # %bb.9: +; SSSE3-NEXT: addb $127, %al +; SSSE3-NEXT: movl %eax, %r10d +; SSSE3-NEXT: .LBB0_10: +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %r11b +; SSSE3-NEXT: movl %r11d, %eax +; SSSE3-NEXT: subb %cl, %al +; SSSE3-NEXT: setns %al +; SSSE3-NEXT: subb %cl, %r11b +; SSSE3-NEXT: jno .LBB0_12 +; SSSE3-NEXT: # %bb.11: +; SSSE3-NEXT: addb $127, %al +; SSSE3-NEXT: movl %eax, %r11d +; SSSE3-NEXT: .LBB0_12: +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %bpl +; SSSE3-NEXT: movl %ebp, %eax +; SSSE3-NEXT: subb %cl, %al +; SSSE3-NEXT: setns %al +; SSSE3-NEXT: subb %cl, %bpl +; SSSE3-NEXT: jno .LBB0_14 +; SSSE3-NEXT: # %bb.13: +; SSSE3-NEXT: addb $127, %al +; SSSE3-NEXT: movl %eax, %ebp +; SSSE3-NEXT: .LBB0_14: +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %r14b +; SSSE3-NEXT: movl %r14d, %eax +; SSSE3-NEXT: subb %cl, %al +; SSSE3-NEXT: setns %al +; SSSE3-NEXT: subb %cl, %r14b +; SSSE3-NEXT: jno .LBB0_16 +; SSSE3-NEXT: # %bb.15: +; SSSE3-NEXT: addb $127, %al +; SSSE3-NEXT: movl %eax, %r14d +; SSSE3-NEXT: .LBB0_16: +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %r15b +; SSSE3-NEXT: movl %r15d, %eax +; SSSE3-NEXT: subb %cl, %al +; SSSE3-NEXT: setns %al +; SSSE3-NEXT: subb %cl, %r15b +; SSSE3-NEXT: jno .LBB0_18 +; SSSE3-NEXT: # %bb.17: +; SSSE3-NEXT: addb $127, %al +; SSSE3-NEXT: movl %eax, %r15d +; SSSE3-NEXT: .LBB0_18: +; 
SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %r12b +; SSSE3-NEXT: movl %r12d, %eax +; SSSE3-NEXT: subb %cl, %al +; SSSE3-NEXT: setns %al +; SSSE3-NEXT: subb %cl, %r12b +; SSSE3-NEXT: jno .LBB0_20 +; SSSE3-NEXT: # %bb.19: +; SSSE3-NEXT: addb $127, %al +; SSSE3-NEXT: movl %eax, %r12d +; SSSE3-NEXT: .LBB0_20: +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %r13b +; SSSE3-NEXT: movl %r13d, %eax +; SSSE3-NEXT: subb %cl, %al +; SSSE3-NEXT: setns %al +; SSSE3-NEXT: subb %cl, %r13b +; SSSE3-NEXT: movb %bl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSSE3-NEXT: movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSSE3-NEXT: jno .LBB0_22 +; SSSE3-NEXT: # %bb.21: +; SSSE3-NEXT: addb $127, %al +; SSSE3-NEXT: movl %eax, %r13d +; SSSE3-NEXT: .LBB0_22: +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %dil +; SSSE3-NEXT: movl %edi, %eax +; SSSE3-NEXT: subb %cl, %al +; SSSE3-NEXT: setns %al +; SSSE3-NEXT: subb %cl, %dil +; SSSE3-NEXT: jno .LBB0_24 +; SSSE3-NEXT: # %bb.23: +; SSSE3-NEXT: addb $127, %al +; SSSE3-NEXT: movl %eax, %edi +; SSSE3-NEXT: .LBB0_24: +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %r8b +; SSSE3-NEXT: movl %r8d, %eax +; SSSE3-NEXT: subb %cl, %al +; SSSE3-NEXT: setns %al +; SSSE3-NEXT: subb %cl, %r8b +; SSSE3-NEXT: jno .LBB0_26 +; SSSE3-NEXT: # %bb.25: +; SSSE3-NEXT: addb $127, %al +; SSSE3-NEXT: movl %eax, %r8d +; SSSE3-NEXT: .LBB0_26: +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %bl +; SSSE3-NEXT: movl %ebx, %eax +; SSSE3-NEXT: subb %cl, %al +; SSSE3-NEXT: setns %al +; SSSE3-NEXT: subb %cl, %bl +; SSSE3-NEXT: jno .LBB0_28 +; SSSE3-NEXT: # %bb.27: +; SSSE3-NEXT: addb $127, %al +; SSSE3-NEXT: movl %eax, %ebx +; SSSE3-NEXT: .LBB0_28: +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %dl +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSSE3-NEXT: movl %eax, %ecx +; SSSE3-NEXT: subb %dl, %cl +; SSSE3-NEXT: setns %cl +; SSSE3-NEXT: subb %dl, %al +; SSSE3-NEXT: movb %sil, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSSE3-NEXT: jno .LBB0_30 +; SSSE3-NEXT: # %bb.29: +; SSSE3-NEXT: addb $127, %cl +; SSSE3-NEXT: movl %ecx, %eax +; SSSE3-NEXT: .LBB0_30: +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %dl +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %sil +; SSSE3-NEXT: movl %esi, %ecx +; SSSE3-NEXT: subb %dl, %cl +; SSSE3-NEXT: setns %cl +; SSSE3-NEXT: subb %dl, %sil +; SSSE3-NEXT: jno .LBB0_32 +; SSSE3-NEXT: # %bb.31: +; SSSE3-NEXT: addb $127, %cl +; SSSE3-NEXT: movl %ecx, %esi +; SSSE3-NEXT: .LBB0_32: +; SSSE3-NEXT: movzbl %sil, %ecx +; SSSE3-NEXT: movd %ecx, %xmm0 +; SSSE3-NEXT: movzbl %al, %eax +; SSSE3-NEXT: movd %eax, %xmm1 +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; SSSE3-NEXT: movzbl %bl, %eax +; SSSE3-NEXT: movd %eax, %xmm0 +; SSSE3-NEXT: movzbl %r8b, %eax +; SSSE3-NEXT: movd %eax, %xmm2 +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; SSSE3-NEXT: movzbl %dil, %eax +; SSSE3-NEXT: movd %eax, %xmm0 +; SSSE3-NEXT: movzbl %r13b, %eax +; SSSE3-NEXT: movd %eax, %xmm3 +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = 
xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] +; SSSE3-NEXT: movzbl %r12b, %eax +; SSSE3-NEXT: movd %eax, %xmm0 +; SSSE3-NEXT: movzbl %r15b, %eax +; SSSE3-NEXT: movd %eax, %xmm1 +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3] +; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; SSSE3-NEXT: movzbl %r14b, %eax +; SSSE3-NEXT: movd %eax, %xmm0 +; SSSE3-NEXT: movzbl %bpl, %eax +; SSSE3-NEXT: movd %eax, %xmm2 +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] +; SSSE3-NEXT: movzbl %r11b, %eax +; SSSE3-NEXT: movd %eax, %xmm0 +; SSSE3-NEXT: movzbl %r10b, %eax +; SSSE3-NEXT: movd %eax, %xmm3 +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; SSSE3-NEXT: movd %eax, %xmm0 +; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; SSSE3-NEXT: movd %eax, %xmm2 +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] +; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; SSSE3-NEXT: movd %eax, %xmm4 +; SSSE3-NEXT: movzbl %r9b, %eax +; SSSE3-NEXT: movd %eax, %xmm0 +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7] +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] +; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSSE3-NEXT: popq %rbx +; SSSE3-NEXT: popq %r12 +; SSSE3-NEXT: popq %r13 +; SSSE3-NEXT: popq %r14 +; SSSE3-NEXT: popq %r15 +; SSSE3-NEXT: popq %rbp +; SSSE3-NEXT: retq +; +; SSE41-LABEL: v16i8: +; SSE41: # %bb.0: +; SSE41-NEXT: pextrb $15, %xmm1, %ecx +; SSE41-NEXT: pextrb $15, %xmm0, %edx +; SSE41-NEXT: movl %edx, %eax +; SSE41-NEXT: subb %cl, %al +; SSE41-NEXT: setns %al +; SSE41-NEXT: subb %cl, %dl +; SSE41-NEXT: jno .LBB0_2 +; SSE41-NEXT: # %bb.1: +; SSE41-NEXT: addb $127, %al +; SSE41-NEXT: movl %eax, %edx +; SSE41-NEXT: .LBB0_2: +; SSE41-NEXT: pextrb $14, %xmm1, %ecx +; SSE41-NEXT: pextrb $14, %xmm0, %r11d +; SSE41-NEXT: movl %r11d, %eax +; SSE41-NEXT: subb %cl, %al +; SSE41-NEXT: setns %al +; SSE41-NEXT: subb %cl, %r11b +; SSE41-NEXT: jno .LBB0_4 +; SSE41-NEXT: # %bb.3: +; SSE41-NEXT: addb $127, %al +; SSE41-NEXT: movl %eax, %r11d +; SSE41-NEXT: .LBB0_4: +; SSE41-NEXT: pextrb $13, %xmm1, %ecx +; SSE41-NEXT: pextrb $13, %xmm0, %edi +; SSE41-NEXT: movl %edi, %eax +; SSE41-NEXT: subb %cl, %al +; SSE41-NEXT: setns %al +; SSE41-NEXT: subb %cl, %dil +; SSE41-NEXT: jno .LBB0_6 +; SSE41-NEXT: # %bb.5: +; SSE41-NEXT: addb $127, %al +; SSE41-NEXT: movl %eax, %edi +; SSE41-NEXT: .LBB0_6: +; SSE41-NEXT: pushq %rbp +; SSE41-NEXT: pushq %r15 +; SSE41-NEXT: pushq 
%r14 +; SSE41-NEXT: pushq %r13 +; SSE41-NEXT: pushq %r12 +; SSE41-NEXT: pushq %rbx +; SSE41-NEXT: pextrb $12, %xmm1, %ecx +; SSE41-NEXT: pextrb $12, %xmm0, %r14d +; SSE41-NEXT: movl %r14d, %eax +; SSE41-NEXT: subb %cl, %al +; SSE41-NEXT: setns %al +; SSE41-NEXT: subb %cl, %r14b +; SSE41-NEXT: jno .LBB0_8 +; SSE41-NEXT: # %bb.7: +; SSE41-NEXT: addb $127, %al +; SSE41-NEXT: movl %eax, %r14d +; SSE41-NEXT: .LBB0_8: +; SSE41-NEXT: pextrb $11, %xmm1, %ecx +; SSE41-NEXT: pextrb $11, %xmm0, %ebp +; SSE41-NEXT: movl %ebp, %eax +; SSE41-NEXT: subb %cl, %al +; SSE41-NEXT: setns %al +; SSE41-NEXT: subb %cl, %bpl +; SSE41-NEXT: jno .LBB0_10 +; SSE41-NEXT: # %bb.9: +; SSE41-NEXT: addb $127, %al +; SSE41-NEXT: movl %eax, %ebp +; SSE41-NEXT: .LBB0_10: +; SSE41-NEXT: pextrb $10, %xmm1, %ecx +; SSE41-NEXT: pextrb $10, %xmm0, %r15d +; SSE41-NEXT: movl %r15d, %eax +; SSE41-NEXT: subb %cl, %al +; SSE41-NEXT: setns %al +; SSE41-NEXT: subb %cl, %r15b +; SSE41-NEXT: jno .LBB0_12 +; SSE41-NEXT: # %bb.11: +; SSE41-NEXT: addb $127, %al +; SSE41-NEXT: movl %eax, %r15d +; SSE41-NEXT: .LBB0_12: +; SSE41-NEXT: pextrb $9, %xmm1, %ecx +; SSE41-NEXT: pextrb $9, %xmm0, %r12d +; SSE41-NEXT: movl %r12d, %eax +; SSE41-NEXT: subb %cl, %al +; SSE41-NEXT: setns %al +; SSE41-NEXT: subb %cl, %r12b +; SSE41-NEXT: jno .LBB0_14 +; SSE41-NEXT: # %bb.13: +; SSE41-NEXT: addb $127, %al +; SSE41-NEXT: movl %eax, %r12d +; SSE41-NEXT: .LBB0_14: +; SSE41-NEXT: pextrb $8, %xmm1, %ecx +; SSE41-NEXT: pextrb $8, %xmm0, %r13d +; SSE41-NEXT: movl %r13d, %eax +; SSE41-NEXT: subb %cl, %al +; SSE41-NEXT: setns %al +; SSE41-NEXT: subb %cl, %r13b +; SSE41-NEXT: jno .LBB0_16 +; SSE41-NEXT: # %bb.15: +; SSE41-NEXT: addb $127, %al +; SSE41-NEXT: movl %eax, %r13d +; SSE41-NEXT: .LBB0_16: +; SSE41-NEXT: pextrb $7, %xmm1, %ecx +; SSE41-NEXT: pextrb $7, %xmm0, %r10d +; SSE41-NEXT: movl %r10d, %eax +; SSE41-NEXT: subb %cl, %al +; SSE41-NEXT: setns %al +; SSE41-NEXT: subb %cl, %r10b +; SSE41-NEXT: movl %ebp, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE41-NEXT: jno .LBB0_18 +; SSE41-NEXT: # %bb.17: +; SSE41-NEXT: addb $127, %al +; SSE41-NEXT: movl %eax, %r10d +; SSE41-NEXT: .LBB0_18: +; SSE41-NEXT: pextrb $6, %xmm1, %ecx +; SSE41-NEXT: pextrb $6, %xmm0, %r9d +; SSE41-NEXT: movl %r9d, %eax +; SSE41-NEXT: subb %cl, %al +; SSE41-NEXT: setns %al +; SSE41-NEXT: subb %cl, %r9b +; SSE41-NEXT: jno .LBB0_20 +; SSE41-NEXT: # %bb.19: +; SSE41-NEXT: addb $127, %al +; SSE41-NEXT: movl %eax, %r9d +; SSE41-NEXT: .LBB0_20: +; SSE41-NEXT: pextrb $5, %xmm1, %ecx +; SSE41-NEXT: pextrb $5, %xmm0, %ebp +; SSE41-NEXT: movl %ebp, %eax +; SSE41-NEXT: subb %cl, %al +; SSE41-NEXT: setns %al +; SSE41-NEXT: subb %cl, %bpl +; SSE41-NEXT: movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE41-NEXT: jno .LBB0_22 +; SSE41-NEXT: # %bb.21: +; SSE41-NEXT: addb $127, %al +; SSE41-NEXT: movl %eax, %ebp +; SSE41-NEXT: .LBB0_22: +; SSE41-NEXT: pextrb $4, %xmm1, %ecx +; SSE41-NEXT: pextrb $4, %xmm0, %edi +; SSE41-NEXT: movl %edi, %eax +; SSE41-NEXT: subb %cl, %al +; SSE41-NEXT: setns %al +; SSE41-NEXT: subb %cl, %dil +; SSE41-NEXT: movl %edx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE41-NEXT: jno .LBB0_24 +; SSE41-NEXT: # %bb.23: +; SSE41-NEXT: addb $127, %al +; SSE41-NEXT: movl %eax, %edi +; SSE41-NEXT: .LBB0_24: +; SSE41-NEXT: pextrb $3, %xmm1, %edx +; SSE41-NEXT: pextrb $3, %xmm0, %eax +; SSE41-NEXT: movl %eax, %ecx +; SSE41-NEXT: subb %dl, %cl +; SSE41-NEXT: setns %cl +; SSE41-NEXT: subb %dl, %al +; SSE41-NEXT: jno .LBB0_26 +; SSE41-NEXT: # %bb.25: +; SSE41-NEXT: addb $127, %cl +; 
SSE41-NEXT: movl %ecx, %eax +; SSE41-NEXT: .LBB0_26: +; SSE41-NEXT: pextrb $2, %xmm1, %ebx +; SSE41-NEXT: pextrb $2, %xmm0, %ecx +; SSE41-NEXT: movl %ecx, %edx +; SSE41-NEXT: subb %bl, %dl +; SSE41-NEXT: setns %dl +; SSE41-NEXT: subb %bl, %cl +; SSE41-NEXT: jno .LBB0_28 +; SSE41-NEXT: # %bb.27: +; SSE41-NEXT: addb $127, %dl +; SSE41-NEXT: movl %edx, %ecx +; SSE41-NEXT: .LBB0_28: +; SSE41-NEXT: pextrb $0, %xmm1, %esi +; SSE41-NEXT: pextrb $0, %xmm0, %edx +; SSE41-NEXT: movl %edx, %ebx +; SSE41-NEXT: subb %sil, %bl +; SSE41-NEXT: setns %bl +; SSE41-NEXT: subb %sil, %dl +; SSE41-NEXT: jno .LBB0_30 +; SSE41-NEXT: # %bb.29: +; SSE41-NEXT: addb $127, %bl +; SSE41-NEXT: movl %ebx, %edx +; SSE41-NEXT: .LBB0_30: +; SSE41-NEXT: pextrb $1, %xmm1, %esi +; SSE41-NEXT: pextrb $1, %xmm0, %r8d +; SSE41-NEXT: movl %r8d, %ebx +; SSE41-NEXT: subb %sil, %bl +; SSE41-NEXT: setns %bl +; SSE41-NEXT: subb %sil, %r8b +; SSE41-NEXT: jno .LBB0_32 +; SSE41-NEXT: # %bb.31: +; SSE41-NEXT: addb $127, %bl +; SSE41-NEXT: movl %ebx, %r8d +; SSE41-NEXT: .LBB0_32: +; SSE41-NEXT: movzbl %dl, %edx +; SSE41-NEXT: movd %edx, %xmm0 +; SSE41-NEXT: movzbl %r8b, %edx +; SSE41-NEXT: pinsrb $1, %edx, %xmm0 +; SSE41-NEXT: movzbl %cl, %ecx +; SSE41-NEXT: pinsrb $2, %ecx, %xmm0 +; SSE41-NEXT: movzbl %al, %eax +; SSE41-NEXT: pinsrb $3, %eax, %xmm0 +; SSE41-NEXT: movzbl %dil, %eax +; SSE41-NEXT: pinsrb $4, %eax, %xmm0 +; SSE41-NEXT: movzbl %bpl, %eax +; SSE41-NEXT: pinsrb $5, %eax, %xmm0 +; SSE41-NEXT: movzbl %r9b, %eax +; SSE41-NEXT: pinsrb $6, %eax, %xmm0 +; SSE41-NEXT: movzbl %r10b, %eax +; SSE41-NEXT: pinsrb $7, %eax, %xmm0 +; SSE41-NEXT: movzbl %r13b, %eax +; SSE41-NEXT: pinsrb $8, %eax, %xmm0 +; SSE41-NEXT: movzbl %r12b, %eax +; SSE41-NEXT: pinsrb $9, %eax, %xmm0 +; SSE41-NEXT: movzbl %r15b, %eax +; SSE41-NEXT: pinsrb $10, %eax, %xmm0 +; SSE41-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; SSE41-NEXT: pinsrb $11, %eax, %xmm0 +; SSE41-NEXT: movzbl %r14b, %eax +; SSE41-NEXT: pinsrb $12, %eax, %xmm0 +; SSE41-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; SSE41-NEXT: pinsrb $13, %eax, %xmm0 +; SSE41-NEXT: movzbl %r11b, %eax +; SSE41-NEXT: pinsrb $14, %eax, %xmm0 +; SSE41-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; SSE41-NEXT: pinsrb $15, %eax, %xmm0 +; SSE41-NEXT: popq %rbx +; SSE41-NEXT: popq %r12 +; SSE41-NEXT: popq %r13 +; SSE41-NEXT: popq %r14 +; SSE41-NEXT: popq %r15 +; SSE41-NEXT: popq %rbp +; SSE41-NEXT: retq +; +; AVX-LABEL: v16i8: +; AVX: # %bb.0: +; AVX-NEXT: vpextrb $15, %xmm1, %ecx +; AVX-NEXT: vpextrb $15, %xmm0, %edx +; AVX-NEXT: movl %edx, %eax +; AVX-NEXT: subb %cl, %al +; AVX-NEXT: setns %al +; AVX-NEXT: subb %cl, %dl +; AVX-NEXT: jno .LBB0_2 +; AVX-NEXT: # %bb.1: +; AVX-NEXT: addb $127, %al +; AVX-NEXT: movl %eax, %edx +; AVX-NEXT: .LBB0_2: +; AVX-NEXT: vpextrb $14, %xmm1, %ecx +; AVX-NEXT: vpextrb $14, %xmm0, %r11d +; AVX-NEXT: movl %r11d, %eax +; AVX-NEXT: subb %cl, %al +; AVX-NEXT: setns %al +; AVX-NEXT: subb %cl, %r11b +; AVX-NEXT: jno .LBB0_4 +; AVX-NEXT: # %bb.3: +; AVX-NEXT: addb $127, %al +; AVX-NEXT: movl %eax, %r11d +; AVX-NEXT: .LBB0_4: +; AVX-NEXT: vpextrb $13, %xmm1, %ecx +; AVX-NEXT: vpextrb $13, %xmm0, %edi +; AVX-NEXT: movl %edi, %eax +; AVX-NEXT: subb %cl, %al +; AVX-NEXT: setns %al +; AVX-NEXT: subb %cl, %dil +; AVX-NEXT: jno .LBB0_6 +; AVX-NEXT: # %bb.5: +; AVX-NEXT: addb $127, %al +; AVX-NEXT: movl %eax, %edi +; AVX-NEXT: .LBB0_6: +; AVX-NEXT: pushq %rbp +; AVX-NEXT: pushq %r15 +; AVX-NEXT: pushq %r14 +; AVX-NEXT: pushq %r13 +; 
AVX-NEXT: pushq %r12 +; AVX-NEXT: pushq %rbx +; AVX-NEXT: vpextrb $12, %xmm1, %ecx +; AVX-NEXT: vpextrb $12, %xmm0, %r14d +; AVX-NEXT: movl %r14d, %eax +; AVX-NEXT: subb %cl, %al +; AVX-NEXT: setns %al +; AVX-NEXT: subb %cl, %r14b +; AVX-NEXT: jno .LBB0_8 +; AVX-NEXT: # %bb.7: +; AVX-NEXT: addb $127, %al +; AVX-NEXT: movl %eax, %r14d +; AVX-NEXT: .LBB0_8: +; AVX-NEXT: vpextrb $11, %xmm1, %ecx +; AVX-NEXT: vpextrb $11, %xmm0, %ebp +; AVX-NEXT: movl %ebp, %eax +; AVX-NEXT: subb %cl, %al +; AVX-NEXT: setns %al +; AVX-NEXT: subb %cl, %bpl +; AVX-NEXT: jno .LBB0_10 +; AVX-NEXT: # %bb.9: +; AVX-NEXT: addb $127, %al +; AVX-NEXT: movl %eax, %ebp +; AVX-NEXT: .LBB0_10: +; AVX-NEXT: vpextrb $10, %xmm1, %ecx +; AVX-NEXT: vpextrb $10, %xmm0, %r15d +; AVX-NEXT: movl %r15d, %eax +; AVX-NEXT: subb %cl, %al +; AVX-NEXT: setns %al +; AVX-NEXT: subb %cl, %r15b +; AVX-NEXT: jno .LBB0_12 +; AVX-NEXT: # %bb.11: +; AVX-NEXT: addb $127, %al +; AVX-NEXT: movl %eax, %r15d +; AVX-NEXT: .LBB0_12: +; AVX-NEXT: vpextrb $9, %xmm1, %ecx +; AVX-NEXT: vpextrb $9, %xmm0, %r12d +; AVX-NEXT: movl %r12d, %eax +; AVX-NEXT: subb %cl, %al +; AVX-NEXT: setns %al +; AVX-NEXT: subb %cl, %r12b +; AVX-NEXT: jno .LBB0_14 +; AVX-NEXT: # %bb.13: +; AVX-NEXT: addb $127, %al +; AVX-NEXT: movl %eax, %r12d +; AVX-NEXT: .LBB0_14: +; AVX-NEXT: vpextrb $8, %xmm1, %ecx +; AVX-NEXT: vpextrb $8, %xmm0, %r13d +; AVX-NEXT: movl %r13d, %eax +; AVX-NEXT: subb %cl, %al +; AVX-NEXT: setns %al +; AVX-NEXT: subb %cl, %r13b +; AVX-NEXT: jno .LBB0_16 +; AVX-NEXT: # %bb.15: +; AVX-NEXT: addb $127, %al +; AVX-NEXT: movl %eax, %r13d +; AVX-NEXT: .LBB0_16: +; AVX-NEXT: vpextrb $7, %xmm1, %ecx +; AVX-NEXT: vpextrb $7, %xmm0, %r10d +; AVX-NEXT: movl %r10d, %eax +; AVX-NEXT: subb %cl, %al +; AVX-NEXT: setns %al +; AVX-NEXT: subb %cl, %r10b +; AVX-NEXT: movl %ebp, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX-NEXT: jno .LBB0_18 +; AVX-NEXT: # %bb.17: +; AVX-NEXT: addb $127, %al +; AVX-NEXT: movl %eax, %r10d +; AVX-NEXT: .LBB0_18: +; AVX-NEXT: vpextrb $6, %xmm1, %ecx +; AVX-NEXT: vpextrb $6, %xmm0, %r9d +; AVX-NEXT: movl %r9d, %eax +; AVX-NEXT: subb %cl, %al +; AVX-NEXT: setns %al +; AVX-NEXT: subb %cl, %r9b +; AVX-NEXT: jno .LBB0_20 +; AVX-NEXT: # %bb.19: +; AVX-NEXT: addb $127, %al +; AVX-NEXT: movl %eax, %r9d +; AVX-NEXT: .LBB0_20: +; AVX-NEXT: vpextrb $5, %xmm1, %ecx +; AVX-NEXT: vpextrb $5, %xmm0, %ebp +; AVX-NEXT: movl %ebp, %eax +; AVX-NEXT: subb %cl, %al +; AVX-NEXT: setns %al +; AVX-NEXT: subb %cl, %bpl +; AVX-NEXT: movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX-NEXT: jno .LBB0_22 +; AVX-NEXT: # %bb.21: +; AVX-NEXT: addb $127, %al +; AVX-NEXT: movl %eax, %ebp +; AVX-NEXT: .LBB0_22: +; AVX-NEXT: vpextrb $4, %xmm1, %ecx +; AVX-NEXT: vpextrb $4, %xmm0, %edi +; AVX-NEXT: movl %edi, %eax +; AVX-NEXT: subb %cl, %al +; AVX-NEXT: setns %al +; AVX-NEXT: subb %cl, %dil +; AVX-NEXT: movl %edx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX-NEXT: jno .LBB0_24 +; AVX-NEXT: # %bb.23: +; AVX-NEXT: addb $127, %al +; AVX-NEXT: movl %eax, %edi +; AVX-NEXT: .LBB0_24: +; AVX-NEXT: vpextrb $3, %xmm1, %edx +; AVX-NEXT: vpextrb $3, %xmm0, %eax +; AVX-NEXT: movl %eax, %ecx +; AVX-NEXT: subb %dl, %cl +; AVX-NEXT: setns %cl +; AVX-NEXT: subb %dl, %al +; AVX-NEXT: jno .LBB0_26 +; AVX-NEXT: # %bb.25: +; AVX-NEXT: addb $127, %cl +; AVX-NEXT: movl %ecx, %eax +; AVX-NEXT: .LBB0_26: +; AVX-NEXT: vpextrb $2, %xmm1, %ebx +; AVX-NEXT: vpextrb $2, %xmm0, %ecx +; AVX-NEXT: movl %ecx, %edx +; AVX-NEXT: subb %bl, %dl +; AVX-NEXT: setns %dl +; AVX-NEXT: subb %bl, %cl +; AVX-NEXT: jno 
.LBB0_28 +; AVX-NEXT: # %bb.27: +; AVX-NEXT: addb $127, %dl +; AVX-NEXT: movl %edx, %ecx +; AVX-NEXT: .LBB0_28: +; AVX-NEXT: vpextrb $0, %xmm1, %esi +; AVX-NEXT: vpextrb $0, %xmm0, %edx +; AVX-NEXT: movl %edx, %ebx +; AVX-NEXT: subb %sil, %bl +; AVX-NEXT: setns %bl +; AVX-NEXT: subb %sil, %dl +; AVX-NEXT: jno .LBB0_30 +; AVX-NEXT: # %bb.29: +; AVX-NEXT: addb $127, %bl +; AVX-NEXT: movl %ebx, %edx +; AVX-NEXT: .LBB0_30: +; AVX-NEXT: vpextrb $1, %xmm1, %esi +; AVX-NEXT: vpextrb $1, %xmm0, %r8d +; AVX-NEXT: movl %r8d, %ebx +; AVX-NEXT: subb %sil, %bl +; AVX-NEXT: setns %bl +; AVX-NEXT: subb %sil, %r8b +; AVX-NEXT: jno .LBB0_32 +; AVX-NEXT: # %bb.31: +; AVX-NEXT: addb $127, %bl +; AVX-NEXT: movl %ebx, %r8d +; AVX-NEXT: .LBB0_32: +; AVX-NEXT: movzbl %dl, %edx +; AVX-NEXT: vmovd %edx, %xmm0 +; AVX-NEXT: movzbl %r8b, %edx +; AVX-NEXT: vpinsrb $1, %edx, %xmm0, %xmm0 +; AVX-NEXT: movzbl %cl, %ecx +; AVX-NEXT: vpinsrb $2, %ecx, %xmm0, %xmm0 +; AVX-NEXT: movzbl %al, %eax +; AVX-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0 +; AVX-NEXT: movzbl %dil, %eax +; AVX-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 +; AVX-NEXT: movzbl %bpl, %eax +; AVX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; AVX-NEXT: movzbl %r9b, %eax +; AVX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 +; AVX-NEXT: movzbl %r10b, %eax +; AVX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; AVX-NEXT: movzbl %r13b, %eax +; AVX-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0 +; AVX-NEXT: movzbl %r12b, %eax +; AVX-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0 +; AVX-NEXT: movzbl %r15b, %eax +; AVX-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0 +; AVX-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; AVX-NEXT: movzbl %r14b, %eax +; AVX-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0 +; AVX-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0 +; AVX-NEXT: movzbl %r11b, %eax +; AVX-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0 +; AVX-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; AVX-NEXT: popq %rbx +; AVX-NEXT: popq %r12 +; AVX-NEXT: popq %r13 +; AVX-NEXT: popq %r14 +; AVX-NEXT: popq %r15 +; AVX-NEXT: popq %rbp +; AVX-NEXT: retq + %z = call <16 x i8> @llvm.ssub.sat.v16i8(<16 x i8> %x, <16 x i8> %y) + ret <16 x i8> %z +} + +define <32 x i8> @v32i8(<32 x i8> %x, <32 x i8> %y) nounwind { +; SSE2-LABEL: v32i8: +; SSE2: # %bb.0: +; SSE2-NEXT: pushq %rbp +; SSE2-NEXT: pushq %r15 +; SSE2-NEXT: pushq %r14 +; SSE2-NEXT: pushq %r13 +; SSE2-NEXT: pushq %r12 +; SSE2-NEXT: pushq %rbx +; SSE2-NEXT: pushq %rax +; SSE2-NEXT: movaps %xmm3, -{{[0-9]+}}(%rsp) +; SSE2-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %r8b +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %r11b +; SSE2-NEXT: movl %r8d, %eax +; SSE2-NEXT: subb %cl, %al +; SSE2-NEXT: setns %al +; SSE2-NEXT: subb %cl, %r8b +; SSE2-NEXT: jno .LBB1_2 +; SSE2-NEXT: # %bb.1: +; SSE2-NEXT: addb $127, %al +; SSE2-NEXT: movl %eax, %r8d +; SSE2-NEXT: .LBB1_2: +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSE2-NEXT: movl %r11d, %eax +; SSE2-NEXT: subb %cl, %al +; SSE2-NEXT: setns %al +; SSE2-NEXT: subb %cl, %r11b +; SSE2-NEXT: jno .LBB1_4 +; SSE2-NEXT: # %bb.3: +; SSE2-NEXT: addb $127, %al +; SSE2-NEXT: movl %eax, %r11d +; SSE2-NEXT: .LBB1_4: +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %bl +; SSE2-NEXT: movl %ebx, %eax +; SSE2-NEXT: subb %cl, %al +; SSE2-NEXT: setns %al +; SSE2-NEXT: subb %cl, %bl +; SSE2-NEXT: jno 
.LBB1_6 +; SSE2-NEXT: # %bb.5: +; SSE2-NEXT: addb $127, %al +; SSE2-NEXT: movl %eax, %ebx +; SSE2-NEXT: .LBB1_6: +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %sil +; SSE2-NEXT: movl %esi, %eax +; SSE2-NEXT: subb %cl, %al +; SSE2-NEXT: setns %al +; SSE2-NEXT: subb %cl, %sil +; SSE2-NEXT: jno .LBB1_8 +; SSE2-NEXT: # %bb.7: +; SSE2-NEXT: addb $127, %al +; SSE2-NEXT: movl %eax, %esi +; SSE2-NEXT: .LBB1_8: +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %dl +; SSE2-NEXT: movl %edx, %eax +; SSE2-NEXT: subb %cl, %al +; SSE2-NEXT: setns %al +; SSE2-NEXT: subb %cl, %dl +; SSE2-NEXT: jo .LBB1_9 +; SSE2-NEXT: # %bb.10: +; SSE2-NEXT: movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSE2-NEXT: jmp .LBB1_11 +; SSE2-NEXT: .LBB1_9: +; SSE2-NEXT: addb $127, %al +; SSE2-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSE2-NEXT: .LBB1_11: +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %dil +; SSE2-NEXT: movl %edi, %eax +; SSE2-NEXT: subb %cl, %al +; SSE2-NEXT: setns %al +; SSE2-NEXT: subb %cl, %dil +; SSE2-NEXT: movb %r8b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSE2-NEXT: movb %bl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSE2-NEXT: jno .LBB1_13 +; SSE2-NEXT: # %bb.12: +; SSE2-NEXT: addb $127, %al +; SSE2-NEXT: movl %eax, %edi +; SSE2-NEXT: .LBB1_13: +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %r9b +; SSE2-NEXT: movl %r9d, %eax +; SSE2-NEXT: subb %cl, %al +; SSE2-NEXT: setns %al +; SSE2-NEXT: subb %cl, %r9b +; SSE2-NEXT: jno .LBB1_15 +; SSE2-NEXT: # %bb.14: +; SSE2-NEXT: addb $127, %al +; SSE2-NEXT: movl %eax, %r9d +; SSE2-NEXT: .LBB1_15: +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %r10b +; SSE2-NEXT: movl %r10d, %eax +; SSE2-NEXT: subb %cl, %al +; SSE2-NEXT: setns %al +; SSE2-NEXT: subb %cl, %r10b +; SSE2-NEXT: movb %r11b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSE2-NEXT: jno .LBB1_17 +; SSE2-NEXT: # %bb.16: +; SSE2-NEXT: addb $127, %al +; SSE2-NEXT: movl %eax, %r10d +; SSE2-NEXT: .LBB1_17: +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %bpl +; SSE2-NEXT: movl %ebp, %eax +; SSE2-NEXT: subb %cl, %al +; SSE2-NEXT: setns %al +; SSE2-NEXT: subb %cl, %bpl +; SSE2-NEXT: jno .LBB1_19 +; SSE2-NEXT: # %bb.18: +; SSE2-NEXT: addb $127, %al +; SSE2-NEXT: movl %eax, %ebp +; SSE2-NEXT: .LBB1_19: +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %r14b +; SSE2-NEXT: movl %r14d, %eax +; SSE2-NEXT: subb %cl, %al +; SSE2-NEXT: setns %al +; SSE2-NEXT: subb %cl, %r14b +; SSE2-NEXT: jno .LBB1_21 +; SSE2-NEXT: # %bb.20: +; SSE2-NEXT: addb $127, %al +; SSE2-NEXT: movl %eax, %r14d +; SSE2-NEXT: .LBB1_21: +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %r11b +; SSE2-NEXT: movl %r11d, %eax +; SSE2-NEXT: subb %cl, %al +; SSE2-NEXT: setns %al +; SSE2-NEXT: subb %cl, %r11b +; SSE2-NEXT: jno .LBB1_23 +; SSE2-NEXT: # %bb.22: +; SSE2-NEXT: addb $127, %al +; SSE2-NEXT: movl %eax, %r11d +; SSE2-NEXT: .LBB1_23: +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %r8b +; SSE2-NEXT: movl %r8d, %eax +; SSE2-NEXT: subb %cl, %al +; SSE2-NEXT: setns %al +; SSE2-NEXT: subb %cl, %r8b +; SSE2-NEXT: jno .LBB1_25 +; SSE2-NEXT: # %bb.24: +; SSE2-NEXT: addb $127, %al +; SSE2-NEXT: movl %eax, %r8d +; SSE2-NEXT: .LBB1_25: +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %r15b +; SSE2-NEXT: movl %r15d, %eax +; 
SSE2-NEXT: subb %cl, %al +; SSE2-NEXT: setns %al +; SSE2-NEXT: subb %cl, %r15b +; SSE2-NEXT: jno .LBB1_27 +; SSE2-NEXT: # %bb.26: +; SSE2-NEXT: addb $127, %al +; SSE2-NEXT: movl %eax, %r15d +; SSE2-NEXT: .LBB1_27: +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %r12b +; SSE2-NEXT: movl %r12d, %eax +; SSE2-NEXT: subb %cl, %al +; SSE2-NEXT: setns %al +; SSE2-NEXT: subb %cl, %r12b +; SSE2-NEXT: jno .LBB1_29 +; SSE2-NEXT: # %bb.28: +; SSE2-NEXT: addb $127, %al +; SSE2-NEXT: movl %eax, %r12d +; SSE2-NEXT: .LBB1_29: +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %r13b +; SSE2-NEXT: movl %r13d, %eax +; SSE2-NEXT: subb %cl, %al +; SSE2-NEXT: setns %al +; SSE2-NEXT: subb %cl, %r13b +; SSE2-NEXT: jno .LBB1_31 +; SSE2-NEXT: # %bb.30: +; SSE2-NEXT: addb $127, %al +; SSE2-NEXT: movl %eax, %r13d +; SSE2-NEXT: .LBB1_31: +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %dl +; SSE2-NEXT: movl %edx, %eax +; SSE2-NEXT: subb %cl, %al +; SSE2-NEXT: setns %al +; SSE2-NEXT: subb %cl, %dl +; SSE2-NEXT: movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSE2-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) +; SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; SSE2-NEXT: movb %sil, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSE2-NEXT: jno .LBB1_33 +; SSE2-NEXT: # %bb.32: +; SSE2-NEXT: addb $127, %al +; SSE2-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSE2-NEXT: .LBB1_33: +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %sil +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %dl +; SSE2-NEXT: movl %esi, %eax +; SSE2-NEXT: subb %cl, %al +; SSE2-NEXT: setns %al +; SSE2-NEXT: subb %cl, %sil +; SSE2-NEXT: jno .LBB1_35 +; SSE2-NEXT: # %bb.34: +; SSE2-NEXT: addb $127, %al +; SSE2-NEXT: movl %eax, %esi +; SSE2-NEXT: .LBB1_35: +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSE2-NEXT: movl %edx, %eax +; SSE2-NEXT: subb %cl, %al +; SSE2-NEXT: setns %al +; SSE2-NEXT: subb %cl, %dl +; SSE2-NEXT: movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSE2-NEXT: jno .LBB1_37 +; SSE2-NEXT: # %bb.36: +; SSE2-NEXT: addb $127, %al +; SSE2-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSE2-NEXT: .LBB1_37: +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %dl +; SSE2-NEXT: movl %edx, %eax +; SSE2-NEXT: subb %cl, %al +; SSE2-NEXT: setns %al +; SSE2-NEXT: subb %cl, %dl +; SSE2-NEXT: jno .LBB1_39 +; SSE2-NEXT: # %bb.38: +; SSE2-NEXT: addb $127, %al +; SSE2-NEXT: movl %eax, %edx +; SSE2-NEXT: .LBB1_39: +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %bl +; SSE2-NEXT: movl %ebx, %eax +; SSE2-NEXT: subb %cl, %al +; SSE2-NEXT: setns %al +; SSE2-NEXT: subb %cl, %bl +; SSE2-NEXT: movb %bl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSE2-NEXT: jno .LBB1_41 +; SSE2-NEXT: # %bb.40: +; SSE2-NEXT: addb $127, %al +; SSE2-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSE2-NEXT: .LBB1_41: +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %bl +; SSE2-NEXT: movl %ebx, %eax +; SSE2-NEXT: subb %cl, %al +; SSE2-NEXT: setns %al +; SSE2-NEXT: subb %cl, %bl +; SSE2-NEXT: movb %bpl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSE2-NEXT: movb %r14b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSE2-NEXT: movb %r11b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSE2-NEXT: jno .LBB1_43 +; SSE2-NEXT: # %bb.42: +; SSE2-NEXT: addb $127, %al +; SSE2-NEXT: movl %eax, %ebx +; SSE2-NEXT: .LBB1_43: +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; 
SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %bpl +; SSE2-NEXT: movl %ebp, %eax +; SSE2-NEXT: subb %cl, %al +; SSE2-NEXT: setns %al +; SSE2-NEXT: subb %cl, %bpl +; SSE2-NEXT: movb %r8b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSE2-NEXT: jno .LBB1_45 +; SSE2-NEXT: # %bb.44: +; SSE2-NEXT: addb $127, %al +; SSE2-NEXT: movl %eax, %ebp +; SSE2-NEXT: .LBB1_45: +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %r14b +; SSE2-NEXT: movl %r14d, %eax +; SSE2-NEXT: subb %cl, %al +; SSE2-NEXT: setns %al +; SSE2-NEXT: subb %cl, %r14b +; SSE2-NEXT: movb %r15b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSE2-NEXT: jno .LBB1_47 +; SSE2-NEXT: # %bb.46: +; SSE2-NEXT: addb $127, %al +; SSE2-NEXT: movl %eax, %r14d +; SSE2-NEXT: .LBB1_47: +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %r15b +; SSE2-NEXT: movl %r15d, %eax +; SSE2-NEXT: subb %cl, %al +; SSE2-NEXT: setns %al +; SSE2-NEXT: subb %cl, %r15b +; SSE2-NEXT: movb %r12b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSE2-NEXT: jno .LBB1_49 +; SSE2-NEXT: # %bb.48: +; SSE2-NEXT: addb $127, %al +; SSE2-NEXT: movl %eax, %r15d +; SSE2-NEXT: .LBB1_49: +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %r12b +; SSE2-NEXT: movl %r12d, %eax +; SSE2-NEXT: subb %cl, %al +; SSE2-NEXT: setns %al +; SSE2-NEXT: subb %cl, %r12b +; SSE2-NEXT: movb %r13b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSE2-NEXT: jno .LBB1_51 +; SSE2-NEXT: # %bb.50: +; SSE2-NEXT: addb $127, %al +; SSE2-NEXT: movl %eax, %r12d +; SSE2-NEXT: .LBB1_51: +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %r13b +; SSE2-NEXT: movl %r13d, %eax +; SSE2-NEXT: subb %cl, %al +; SSE2-NEXT: setns %al +; SSE2-NEXT: subb %cl, %r13b +; SSE2-NEXT: movb %dil, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSE2-NEXT: jno .LBB1_53 +; SSE2-NEXT: # %bb.52: +; SSE2-NEXT: addb $127, %al +; SSE2-NEXT: movl %eax, %r13d +; SSE2-NEXT: .LBB1_53: +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %r8b +; SSE2-NEXT: movl %r8d, %eax +; SSE2-NEXT: subb %cl, %al +; SSE2-NEXT: setns %al +; SSE2-NEXT: subb %cl, %r8b +; SSE2-NEXT: movb %r9b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSE2-NEXT: jno .LBB1_55 +; SSE2-NEXT: # %bb.54: +; SSE2-NEXT: addb $127, %al +; SSE2-NEXT: movl %eax, %r8d +; SSE2-NEXT: .LBB1_55: +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %r9b +; SSE2-NEXT: movl %r9d, %eax +; SSE2-NEXT: subb %cl, %al +; SSE2-NEXT: setns %al +; SSE2-NEXT: subb %cl, %r9b +; SSE2-NEXT: movb %r10b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSE2-NEXT: jno .LBB1_57 +; SSE2-NEXT: # %bb.56: +; SSE2-NEXT: addb $127, %al +; SSE2-NEXT: movl %eax, %r9d +; SSE2-NEXT: .LBB1_57: +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %r10b +; SSE2-NEXT: movl %r10d, %eax +; SSE2-NEXT: subb %cl, %al +; SSE2-NEXT: setns %al +; SSE2-NEXT: subb %cl, %r10b +; SSE2-NEXT: movb %sil, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSE2-NEXT: movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSE2-NEXT: jno .LBB1_59 +; SSE2-NEXT: # %bb.58: +; SSE2-NEXT: addb $127, %al +; SSE2-NEXT: movl %eax, %r10d +; SSE2-NEXT: .LBB1_59: +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %dl +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %r11b +; SSE2-NEXT: movl %r11d, %ecx +; SSE2-NEXT: subb %dl, %cl +; SSE2-NEXT: setns %cl +; SSE2-NEXT: subb %dl, %r11b +; SSE2-NEXT: movb %bl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSE2-NEXT: jno .LBB1_61 +; SSE2-NEXT: # %bb.60: +; SSE2-NEXT: addb $127, %cl +; 
SSE2-NEXT: movl %ecx, %r11d +; SSE2-NEXT: .LBB1_61: +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %bl +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSE2-NEXT: movl %ecx, %edx +; SSE2-NEXT: subb %bl, %dl +; SSE2-NEXT: setns %dl +; SSE2-NEXT: subb %bl, %cl +; SSE2-NEXT: jno .LBB1_63 +; SSE2-NEXT: # %bb.62: +; SSE2-NEXT: addb $127, %dl +; SSE2-NEXT: movl %edx, %ecx +; SSE2-NEXT: .LBB1_63: +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %bl +; SSE2-NEXT: movl %ebx, %edx +; SSE2-NEXT: subb %al, %dl +; SSE2-NEXT: setns %dl +; SSE2-NEXT: subb %al, %bl +; SSE2-NEXT: jno .LBB1_65 +; SSE2-NEXT: # %bb.64: +; SSE2-NEXT: addb $127, %dl +; SSE2-NEXT: movl %edx, %ebx +; SSE2-NEXT: .LBB1_65: +; SSE2-NEXT: movzbl %bl, %esi +; SSE2-NEXT: movzbl %cl, %edi +; SSE2-NEXT: movzbl %r11b, %r11d +; SSE2-NEXT: movzbl %r10b, %r10d +; SSE2-NEXT: movzbl %r9b, %r9d +; SSE2-NEXT: movzbl %r8b, %r8d +; SSE2-NEXT: movzbl %r13b, %r13d +; SSE2-NEXT: movzbl %r12b, %eax +; SSE2-NEXT: movzbl %r15b, %ebx +; SSE2-NEXT: movzbl %r14b, %edx +; SSE2-NEXT: movzbl %bpl, %ebp +; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload +; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r12d # 1-byte Folded Reload +; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r15d # 1-byte Folded Reload +; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r14d # 1-byte Folded Reload +; SSE2-NEXT: movd %esi, %xmm12 +; SSE2-NEXT: movd %edi, %xmm6 +; SSE2-NEXT: movd %r11d, %xmm11 +; SSE2-NEXT: movd %r10d, %xmm2 +; SSE2-NEXT: movd %r9d, %xmm10 +; SSE2-NEXT: movd %r8d, %xmm5 +; SSE2-NEXT: movd %r13d, %xmm9 +; SSE2-NEXT: movd %eax, %xmm1 +; SSE2-NEXT: movd %ebx, %xmm8 +; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r13d # 1-byte Folded Reload +; SSE2-NEXT: movd %edx, %xmm14 +; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r11d # 1-byte Folded Reload +; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r8d # 1-byte Folded Reload +; SSE2-NEXT: movd %ebp, %xmm13 +; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r9d # 1-byte Folded Reload +; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r10d # 1-byte Folded Reload +; SSE2-NEXT: movd %ecx, %xmm7 +; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ebx # 1-byte Folded Reload +; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; SSE2-NEXT: movd %r12d, %xmm0 +; SSE2-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r12d # 1-byte Folded Reload +; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 1-byte Folded Reload +; SSE2-NEXT: movd %r15d, %xmm4 +; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r15d # 1-byte Folded Reload +; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ebp # 1-byte Folded Reload +; SSE2-NEXT: movd %r14d, %xmm15 +; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r14d # 1-byte Folded Reload +; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %edx # 1-byte Folded Reload +; SSE2-NEXT: movd %r13d, %xmm0 +; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r13d # 1-byte Folded Reload +; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 1-byte Folded Reload +; SSE2-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm12[0],xmm6[1],xmm12[1],xmm6[2],xmm12[2],xmm6[3],xmm12[3],xmm6[4],xmm12[4],xmm6[5],xmm12[5],xmm6[6],xmm12[6],xmm6[7],xmm12[7] +; SSE2-NEXT: movd %r11d, %xmm3 +; SSE2-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r11d # 1-byte Folded Reload +; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload +; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = 
xmm2[0],xmm11[0],xmm2[1],xmm11[1],xmm2[2],xmm11[2],xmm2[3],xmm11[3],xmm2[4],xmm11[4],xmm2[5],xmm11[5],xmm2[6],xmm11[6],xmm2[7],xmm11[7] +; SSE2-NEXT: movd %r8d, %xmm11 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1],xmm2[2],xmm6[2],xmm2[3],xmm6[3] +; SSE2-NEXT: movd %r9d, %xmm12 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm10[0],xmm5[1],xmm10[1],xmm5[2],xmm10[2],xmm5[3],xmm10[3],xmm5[4],xmm10[4],xmm5[5],xmm10[5],xmm5[6],xmm10[6],xmm5[7],xmm10[7] +; SSE2-NEXT: movd %r10d, %xmm10 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm9[0],xmm1[1],xmm9[1],xmm1[2],xmm9[2],xmm1[3],xmm9[3],xmm1[4],xmm9[4],xmm1[5],xmm9[5],xmm1[6],xmm9[6],xmm1[7],xmm9[7] +; SSE2-NEXT: movd %ebx, %xmm9 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1],xmm1[2],xmm5[2],xmm1[3],xmm5[3] +; SSE2-NEXT: movd %eax, %xmm5 +; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; SSE2-NEXT: movd %r12d, %xmm6 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm14 = xmm14[0],xmm8[0],xmm14[1],xmm8[1],xmm14[2],xmm8[2],xmm14[3],xmm8[3],xmm14[4],xmm8[4],xmm14[5],xmm8[5],xmm14[6],xmm8[6],xmm14[7],xmm8[7] +; SSE2-NEXT: movd %esi, %xmm8 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm13[0],xmm7[1],xmm13[1],xmm7[2],xmm13[2],xmm7[3],xmm13[3],xmm7[4],xmm13[4],xmm7[5],xmm13[5],xmm7[6],xmm13[6],xmm7[7],xmm13[7] +; SSE2-NEXT: movd %r15d, %xmm13 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm14[0],xmm7[1],xmm14[1],xmm7[2],xmm14[2],xmm7[3],xmm14[3] +; SSE2-NEXT: movd %ebp, %xmm3 +; SSE2-NEXT: punpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE2-NEXT: # xmm4 = xmm4[0],mem[0],xmm4[1],mem[1],xmm4[2],mem[2],xmm4[3],mem[3],xmm4[4],mem[4],xmm4[5],mem[5],xmm4[6],mem[6],xmm4[7],mem[7] +; SSE2-NEXT: movd %r14d, %xmm14 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm15[0],xmm0[1],xmm15[1],xmm0[2],xmm15[2],xmm0[3],xmm15[3],xmm0[4],xmm15[4],xmm0[5],xmm15[5],xmm0[6],xmm15[6],xmm0[7],xmm15[7] +; SSE2-NEXT: movd %edx, %xmm15 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3] +; SSE2-NEXT: movd %r13d, %xmm4 +; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1] +; SSE2-NEXT: movd %edi, %xmm7 +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSE2-NEXT: movd %r11d, %xmm2 +; SSE2-NEXT: punpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload +; SSE2-NEXT: # xmm11 = xmm11[0],mem[0],xmm11[1],mem[1],xmm11[2],mem[2],xmm11[3],mem[3],xmm11[4],mem[4],xmm11[5],mem[5],xmm11[6],mem[6],xmm11[7],mem[7] +; SSE2-NEXT: movd %ecx, %xmm1 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm10 = xmm10[0],xmm12[0],xmm10[1],xmm12[1],xmm10[2],xmm12[2],xmm10[3],xmm12[3],xmm10[4],xmm12[4],xmm10[5],xmm12[5],xmm10[6],xmm12[6],xmm10[7],xmm12[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm9[0],xmm5[1],xmm9[1],xmm5[2],xmm9[2],xmm5[3],xmm9[3],xmm5[4],xmm9[4],xmm5[5],xmm9[5],xmm5[6],xmm9[6],xmm5[7],xmm9[7] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm6[0],xmm8[1],xmm6[1],xmm8[2],xmm6[2],xmm8[3],xmm6[3],xmm8[4],xmm6[4],xmm8[5],xmm6[5],xmm8[6],xmm6[6],xmm8[7],xmm6[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm5[0],xmm8[1],xmm5[1],xmm8[2],xmm5[2],xmm8[3],xmm5[3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm10[0],xmm8[1],xmm10[1] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm13[0],xmm3[1],xmm13[1],xmm3[2],xmm13[2],xmm3[3],xmm13[3],xmm3[4],xmm13[4],xmm3[5],xmm13[5],xmm3[6],xmm13[6],xmm3[7],xmm13[7] +; 
SSE2-NEXT: punpcklbw {{.*#+}} xmm15 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3],xmm15[4],xmm14[4],xmm15[5],xmm14[5],xmm15[6],xmm14[6],xmm15[7],xmm14[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm15 = xmm15[0],xmm3[0],xmm15[1],xmm3[1],xmm15[2],xmm3[2],xmm15[3],xmm3[3] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm4[0],xmm7[1],xmm4[1],xmm7[2],xmm4[2],xmm7[3],xmm4[3],xmm7[4],xmm4[4],xmm7[5],xmm4[5],xmm7[6],xmm4[6],xmm7[7],xmm4[7] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm7[0],xmm1[1],xmm7[1],xmm1[2],xmm7[2],xmm1[3],xmm7[3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm15[0],xmm1[1],xmm15[1] +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm8[0] +; SSE2-NEXT: addq $8, %rsp +; SSE2-NEXT: popq %rbx +; SSE2-NEXT: popq %r12 +; SSE2-NEXT: popq %r13 +; SSE2-NEXT: popq %r14 +; SSE2-NEXT: popq %r15 +; SSE2-NEXT: popq %rbp +; SSE2-NEXT: retq +; +; SSSE3-LABEL: v32i8: +; SSSE3: # %bb.0: +; SSSE3-NEXT: pushq %rbp +; SSSE3-NEXT: pushq %r15 +; SSSE3-NEXT: pushq %r14 +; SSSE3-NEXT: pushq %r13 +; SSSE3-NEXT: pushq %r12 +; SSSE3-NEXT: pushq %rbx +; SSSE3-NEXT: pushq %rax +; SSSE3-NEXT: movaps %xmm3, -{{[0-9]+}}(%rsp) +; SSSE3-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %r8b +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %r11b +; SSSE3-NEXT: movl %r8d, %eax +; SSSE3-NEXT: subb %cl, %al +; SSSE3-NEXT: setns %al +; SSSE3-NEXT: subb %cl, %r8b +; SSSE3-NEXT: jno .LBB1_2 +; SSSE3-NEXT: # %bb.1: +; SSSE3-NEXT: addb $127, %al +; SSSE3-NEXT: movl %eax, %r8d +; SSSE3-NEXT: .LBB1_2: +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSSE3-NEXT: movl %r11d, %eax +; SSSE3-NEXT: subb %cl, %al +; SSSE3-NEXT: setns %al +; SSSE3-NEXT: subb %cl, %r11b +; SSSE3-NEXT: jno .LBB1_4 +; SSSE3-NEXT: # %bb.3: +; SSSE3-NEXT: addb $127, %al +; SSSE3-NEXT: movl %eax, %r11d +; SSSE3-NEXT: .LBB1_4: +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %bl +; SSSE3-NEXT: movl %ebx, %eax +; SSSE3-NEXT: subb %cl, %al +; SSSE3-NEXT: setns %al +; SSSE3-NEXT: subb %cl, %bl +; SSSE3-NEXT: jno .LBB1_6 +; SSSE3-NEXT: # %bb.5: +; SSSE3-NEXT: addb $127, %al +; SSSE3-NEXT: movl %eax, %ebx +; SSSE3-NEXT: .LBB1_6: +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %sil +; SSSE3-NEXT: movl %esi, %eax +; SSSE3-NEXT: subb %cl, %al +; SSSE3-NEXT: setns %al +; SSSE3-NEXT: subb %cl, %sil +; SSSE3-NEXT: jno .LBB1_8 +; SSSE3-NEXT: # %bb.7: +; SSSE3-NEXT: addb $127, %al +; SSSE3-NEXT: movl %eax, %esi +; SSSE3-NEXT: .LBB1_8: +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %dl +; SSSE3-NEXT: movl %edx, %eax +; SSSE3-NEXT: subb %cl, %al +; SSSE3-NEXT: setns %al +; SSSE3-NEXT: subb %cl, %dl +; SSSE3-NEXT: jo .LBB1_9 +; SSSE3-NEXT: # %bb.10: +; SSSE3-NEXT: movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSSE3-NEXT: jmp .LBB1_11 +; SSSE3-NEXT: .LBB1_9: +; SSSE3-NEXT: addb $127, %al +; SSSE3-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSSE3-NEXT: .LBB1_11: +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %dil +; SSSE3-NEXT: movl %edi, %eax +; SSSE3-NEXT: subb %cl, %al +; SSSE3-NEXT: setns %al +; SSSE3-NEXT: subb %cl, %dil +; SSSE3-NEXT: movb %r8b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSSE3-NEXT: movb %bl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; 
SSSE3-NEXT: jno .LBB1_13 +; SSSE3-NEXT: # %bb.12: +; SSSE3-NEXT: addb $127, %al +; SSSE3-NEXT: movl %eax, %edi +; SSSE3-NEXT: .LBB1_13: +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %r9b +; SSSE3-NEXT: movl %r9d, %eax +; SSSE3-NEXT: subb %cl, %al +; SSSE3-NEXT: setns %al +; SSSE3-NEXT: subb %cl, %r9b +; SSSE3-NEXT: jno .LBB1_15 +; SSSE3-NEXT: # %bb.14: +; SSSE3-NEXT: addb $127, %al +; SSSE3-NEXT: movl %eax, %r9d +; SSSE3-NEXT: .LBB1_15: +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %r10b +; SSSE3-NEXT: movl %r10d, %eax +; SSSE3-NEXT: subb %cl, %al +; SSSE3-NEXT: setns %al +; SSSE3-NEXT: subb %cl, %r10b +; SSSE3-NEXT: movb %r11b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSSE3-NEXT: jno .LBB1_17 +; SSSE3-NEXT: # %bb.16: +; SSSE3-NEXT: addb $127, %al +; SSSE3-NEXT: movl %eax, %r10d +; SSSE3-NEXT: .LBB1_17: +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %bpl +; SSSE3-NEXT: movl %ebp, %eax +; SSSE3-NEXT: subb %cl, %al +; SSSE3-NEXT: setns %al +; SSSE3-NEXT: subb %cl, %bpl +; SSSE3-NEXT: jno .LBB1_19 +; SSSE3-NEXT: # %bb.18: +; SSSE3-NEXT: addb $127, %al +; SSSE3-NEXT: movl %eax, %ebp +; SSSE3-NEXT: .LBB1_19: +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %r14b +; SSSE3-NEXT: movl %r14d, %eax +; SSSE3-NEXT: subb %cl, %al +; SSSE3-NEXT: setns %al +; SSSE3-NEXT: subb %cl, %r14b +; SSSE3-NEXT: jno .LBB1_21 +; SSSE3-NEXT: # %bb.20: +; SSSE3-NEXT: addb $127, %al +; SSSE3-NEXT: movl %eax, %r14d +; SSSE3-NEXT: .LBB1_21: +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %r11b +; SSSE3-NEXT: movl %r11d, %eax +; SSSE3-NEXT: subb %cl, %al +; SSSE3-NEXT: setns %al +; SSSE3-NEXT: subb %cl, %r11b +; SSSE3-NEXT: jno .LBB1_23 +; SSSE3-NEXT: # %bb.22: +; SSSE3-NEXT: addb $127, %al +; SSSE3-NEXT: movl %eax, %r11d +; SSSE3-NEXT: .LBB1_23: +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %r8b +; SSSE3-NEXT: movl %r8d, %eax +; SSSE3-NEXT: subb %cl, %al +; SSSE3-NEXT: setns %al +; SSSE3-NEXT: subb %cl, %r8b +; SSSE3-NEXT: jno .LBB1_25 +; SSSE3-NEXT: # %bb.24: +; SSSE3-NEXT: addb $127, %al +; SSSE3-NEXT: movl %eax, %r8d +; SSSE3-NEXT: .LBB1_25: +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %r15b +; SSSE3-NEXT: movl %r15d, %eax +; SSSE3-NEXT: subb %cl, %al +; SSSE3-NEXT: setns %al +; SSSE3-NEXT: subb %cl, %r15b +; SSSE3-NEXT: jno .LBB1_27 +; SSSE3-NEXT: # %bb.26: +; SSSE3-NEXT: addb $127, %al +; SSSE3-NEXT: movl %eax, %r15d +; SSSE3-NEXT: .LBB1_27: +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %r12b +; SSSE3-NEXT: movl %r12d, %eax +; SSSE3-NEXT: subb %cl, %al +; SSSE3-NEXT: setns %al +; SSSE3-NEXT: subb %cl, %r12b +; SSSE3-NEXT: jno .LBB1_29 +; SSSE3-NEXT: # %bb.28: +; SSSE3-NEXT: addb $127, %al +; SSSE3-NEXT: movl %eax, %r12d +; SSSE3-NEXT: .LBB1_29: +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %r13b +; SSSE3-NEXT: movl %r13d, %eax +; SSSE3-NEXT: subb %cl, %al +; SSSE3-NEXT: setns %al +; SSSE3-NEXT: subb %cl, %r13b +; SSSE3-NEXT: jno .LBB1_31 +; SSSE3-NEXT: # %bb.30: +; SSSE3-NEXT: addb $127, %al +; SSSE3-NEXT: movl %eax, %r13d +; SSSE3-NEXT: .LBB1_31: +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %dl +; SSSE3-NEXT: movl %edx, %eax +; SSSE3-NEXT: subb %cl, %al +; SSSE3-NEXT: setns %al +; SSSE3-NEXT: subb %cl, %dl +; SSSE3-NEXT: movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte 
Spill +; SSSE3-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) +; SSSE3-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; SSSE3-NEXT: movb %sil, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSSE3-NEXT: jno .LBB1_33 +; SSSE3-NEXT: # %bb.32: +; SSSE3-NEXT: addb $127, %al +; SSSE3-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSSE3-NEXT: .LBB1_33: +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %sil +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %dl +; SSSE3-NEXT: movl %esi, %eax +; SSSE3-NEXT: subb %cl, %al +; SSSE3-NEXT: setns %al +; SSSE3-NEXT: subb %cl, %sil +; SSSE3-NEXT: jno .LBB1_35 +; SSSE3-NEXT: # %bb.34: +; SSSE3-NEXT: addb $127, %al +; SSSE3-NEXT: movl %eax, %esi +; SSSE3-NEXT: .LBB1_35: +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSSE3-NEXT: movl %edx, %eax +; SSSE3-NEXT: subb %cl, %al +; SSSE3-NEXT: setns %al +; SSSE3-NEXT: subb %cl, %dl +; SSSE3-NEXT: movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSSE3-NEXT: jno .LBB1_37 +; SSSE3-NEXT: # %bb.36: +; SSSE3-NEXT: addb $127, %al +; SSSE3-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSSE3-NEXT: .LBB1_37: +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %dl +; SSSE3-NEXT: movl %edx, %eax +; SSSE3-NEXT: subb %cl, %al +; SSSE3-NEXT: setns %al +; SSSE3-NEXT: subb %cl, %dl +; SSSE3-NEXT: jno .LBB1_39 +; SSSE3-NEXT: # %bb.38: +; SSSE3-NEXT: addb $127, %al +; SSSE3-NEXT: movl %eax, %edx +; SSSE3-NEXT: .LBB1_39: +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %bl +; SSSE3-NEXT: movl %ebx, %eax +; SSSE3-NEXT: subb %cl, %al +; SSSE3-NEXT: setns %al +; SSSE3-NEXT: subb %cl, %bl +; SSSE3-NEXT: movb %bl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSSE3-NEXT: jno .LBB1_41 +; SSSE3-NEXT: # %bb.40: +; SSSE3-NEXT: addb $127, %al +; SSSE3-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSSE3-NEXT: .LBB1_41: +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %bl +; SSSE3-NEXT: movl %ebx, %eax +; SSSE3-NEXT: subb %cl, %al +; SSSE3-NEXT: setns %al +; SSSE3-NEXT: subb %cl, %bl +; SSSE3-NEXT: movb %bpl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSSE3-NEXT: movb %r14b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSSE3-NEXT: movb %r11b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSSE3-NEXT: jno .LBB1_43 +; SSSE3-NEXT: # %bb.42: +; SSSE3-NEXT: addb $127, %al +; SSSE3-NEXT: movl %eax, %ebx +; SSSE3-NEXT: .LBB1_43: +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %bpl +; SSSE3-NEXT: movl %ebp, %eax +; SSSE3-NEXT: subb %cl, %al +; SSSE3-NEXT: setns %al +; SSSE3-NEXT: subb %cl, %bpl +; SSSE3-NEXT: movb %r8b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSSE3-NEXT: jno .LBB1_45 +; SSSE3-NEXT: # %bb.44: +; SSSE3-NEXT: addb $127, %al +; SSSE3-NEXT: movl %eax, %ebp +; SSSE3-NEXT: .LBB1_45: +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %r14b +; SSSE3-NEXT: movl %r14d, %eax +; SSSE3-NEXT: subb %cl, %al +; SSSE3-NEXT: setns %al +; SSSE3-NEXT: subb %cl, %r14b +; SSSE3-NEXT: movb %r15b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSSE3-NEXT: jno .LBB1_47 +; SSSE3-NEXT: # %bb.46: +; SSSE3-NEXT: addb $127, %al +; SSSE3-NEXT: movl %eax, %r14d +; SSSE3-NEXT: .LBB1_47: +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %r15b +; SSSE3-NEXT: movl %r15d, %eax +; SSSE3-NEXT: subb %cl, %al +; SSSE3-NEXT: setns %al +; SSSE3-NEXT: subb %cl, %r15b +; SSSE3-NEXT: movb %r12b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSSE3-NEXT: jno .LBB1_49 
+; SSSE3-NEXT: # %bb.48: +; SSSE3-NEXT: addb $127, %al +; SSSE3-NEXT: movl %eax, %r15d +; SSSE3-NEXT: .LBB1_49: +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %r12b +; SSSE3-NEXT: movl %r12d, %eax +; SSSE3-NEXT: subb %cl, %al +; SSSE3-NEXT: setns %al +; SSSE3-NEXT: subb %cl, %r12b +; SSSE3-NEXT: movb %r13b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSSE3-NEXT: jno .LBB1_51 +; SSSE3-NEXT: # %bb.50: +; SSSE3-NEXT: addb $127, %al +; SSSE3-NEXT: movl %eax, %r12d +; SSSE3-NEXT: .LBB1_51: +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %r13b +; SSSE3-NEXT: movl %r13d, %eax +; SSSE3-NEXT: subb %cl, %al +; SSSE3-NEXT: setns %al +; SSSE3-NEXT: subb %cl, %r13b +; SSSE3-NEXT: movb %dil, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSSE3-NEXT: jno .LBB1_53 +; SSSE3-NEXT: # %bb.52: +; SSSE3-NEXT: addb $127, %al +; SSSE3-NEXT: movl %eax, %r13d +; SSSE3-NEXT: .LBB1_53: +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %r8b +; SSSE3-NEXT: movl %r8d, %eax +; SSSE3-NEXT: subb %cl, %al +; SSSE3-NEXT: setns %al +; SSSE3-NEXT: subb %cl, %r8b +; SSSE3-NEXT: movb %r9b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSSE3-NEXT: jno .LBB1_55 +; SSSE3-NEXT: # %bb.54: +; SSSE3-NEXT: addb $127, %al +; SSSE3-NEXT: movl %eax, %r8d +; SSSE3-NEXT: .LBB1_55: +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %r9b +; SSSE3-NEXT: movl %r9d, %eax +; SSSE3-NEXT: subb %cl, %al +; SSSE3-NEXT: setns %al +; SSSE3-NEXT: subb %cl, %r9b +; SSSE3-NEXT: movb %r10b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSSE3-NEXT: jno .LBB1_57 +; SSSE3-NEXT: # %bb.56: +; SSSE3-NEXT: addb $127, %al +; SSSE3-NEXT: movl %eax, %r9d +; SSSE3-NEXT: .LBB1_57: +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %r10b +; SSSE3-NEXT: movl %r10d, %eax +; SSSE3-NEXT: subb %cl, %al +; SSSE3-NEXT: setns %al +; SSSE3-NEXT: subb %cl, %r10b +; SSSE3-NEXT: movb %sil, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSSE3-NEXT: movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSSE3-NEXT: jno .LBB1_59 +; SSSE3-NEXT: # %bb.58: +; SSSE3-NEXT: addb $127, %al +; SSSE3-NEXT: movl %eax, %r10d +; SSSE3-NEXT: .LBB1_59: +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %dl +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %r11b +; SSSE3-NEXT: movl %r11d, %ecx +; SSSE3-NEXT: subb %dl, %cl +; SSSE3-NEXT: setns %cl +; SSSE3-NEXT: subb %dl, %r11b +; SSSE3-NEXT: movb %bl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSSE3-NEXT: jno .LBB1_61 +; SSSE3-NEXT: # %bb.60: +; SSSE3-NEXT: addb $127, %cl +; SSSE3-NEXT: movl %ecx, %r11d +; SSSE3-NEXT: .LBB1_61: +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %bl +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSSE3-NEXT: movl %ecx, %edx +; SSSE3-NEXT: subb %bl, %dl +; SSSE3-NEXT: setns %dl +; SSSE3-NEXT: subb %bl, %cl +; SSSE3-NEXT: jno .LBB1_63 +; SSSE3-NEXT: # %bb.62: +; SSSE3-NEXT: addb $127, %dl +; SSSE3-NEXT: movl %edx, %ecx +; SSSE3-NEXT: .LBB1_63: +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %bl +; SSSE3-NEXT: movl %ebx, %edx +; SSSE3-NEXT: subb %al, %dl +; SSSE3-NEXT: setns %dl +; SSSE3-NEXT: subb %al, %bl +; SSSE3-NEXT: jno .LBB1_65 +; SSSE3-NEXT: # %bb.64: +; SSSE3-NEXT: addb $127, %dl +; SSSE3-NEXT: movl %edx, %ebx +; SSSE3-NEXT: .LBB1_65: +; SSSE3-NEXT: movzbl %bl, %esi +; SSSE3-NEXT: movzbl %cl, %edi +; SSSE3-NEXT: movzbl %r11b, %r11d +; SSSE3-NEXT: movzbl %r10b, %r10d +; SSSE3-NEXT: movzbl %r9b, %r9d +; SSSE3-NEXT: movzbl %r8b, %r8d +; SSSE3-NEXT: movzbl %r13b, %r13d +; 
SSSE3-NEXT: movzbl %r12b, %eax +; SSSE3-NEXT: movzbl %r15b, %ebx +; SSSE3-NEXT: movzbl %r14b, %edx +; SSSE3-NEXT: movzbl %bpl, %ebp +; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload +; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r12d # 1-byte Folded Reload +; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r15d # 1-byte Folded Reload +; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r14d # 1-byte Folded Reload +; SSSE3-NEXT: movd %esi, %xmm12 +; SSSE3-NEXT: movd %edi, %xmm6 +; SSSE3-NEXT: movd %r11d, %xmm11 +; SSSE3-NEXT: movd %r10d, %xmm2 +; SSSE3-NEXT: movd %r9d, %xmm10 +; SSSE3-NEXT: movd %r8d, %xmm5 +; SSSE3-NEXT: movd %r13d, %xmm9 +; SSSE3-NEXT: movd %eax, %xmm1 +; SSSE3-NEXT: movd %ebx, %xmm8 +; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r13d # 1-byte Folded Reload +; SSSE3-NEXT: movd %edx, %xmm14 +; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r11d # 1-byte Folded Reload +; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r8d # 1-byte Folded Reload +; SSSE3-NEXT: movd %ebp, %xmm13 +; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r9d # 1-byte Folded Reload +; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r10d # 1-byte Folded Reload +; SSSE3-NEXT: movd %ecx, %xmm7 +; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ebx # 1-byte Folded Reload +; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; SSSE3-NEXT: movd %r12d, %xmm0 +; SSSE3-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r12d # 1-byte Folded Reload +; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 1-byte Folded Reload +; SSSE3-NEXT: movd %r15d, %xmm4 +; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r15d # 1-byte Folded Reload +; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ebp # 1-byte Folded Reload +; SSSE3-NEXT: movd %r14d, %xmm15 +; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r14d # 1-byte Folded Reload +; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %edx # 1-byte Folded Reload +; SSSE3-NEXT: movd %r13d, %xmm0 +; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r13d # 1-byte Folded Reload +; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 1-byte Folded Reload +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm12[0],xmm6[1],xmm12[1],xmm6[2],xmm12[2],xmm6[3],xmm12[3],xmm6[4],xmm12[4],xmm6[5],xmm12[5],xmm6[6],xmm12[6],xmm6[7],xmm12[7] +; SSSE3-NEXT: movd %r11d, %xmm3 +; SSSE3-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r11d # 1-byte Folded Reload +; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm11[0],xmm2[1],xmm11[1],xmm2[2],xmm11[2],xmm2[3],xmm11[3],xmm2[4],xmm11[4],xmm2[5],xmm11[5],xmm2[6],xmm11[6],xmm2[7],xmm11[7] +; SSSE3-NEXT: movd %r8d, %xmm11 +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1],xmm2[2],xmm6[2],xmm2[3],xmm6[3] +; SSSE3-NEXT: movd %r9d, %xmm12 +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm10[0],xmm5[1],xmm10[1],xmm5[2],xmm10[2],xmm5[3],xmm10[3],xmm5[4],xmm10[4],xmm5[5],xmm10[5],xmm5[6],xmm10[6],xmm5[7],xmm10[7] +; SSSE3-NEXT: movd %r10d, %xmm10 +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm9[0],xmm1[1],xmm9[1],xmm1[2],xmm9[2],xmm1[3],xmm9[3],xmm1[4],xmm9[4],xmm1[5],xmm9[5],xmm1[6],xmm9[6],xmm1[7],xmm9[7] +; SSSE3-NEXT: movd %ebx, %xmm9 +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1],xmm1[2],xmm5[2],xmm1[3],xmm5[3] +; SSSE3-NEXT: movd %eax, %xmm5 +; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = 
xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; SSSE3-NEXT: movd %r12d, %xmm6 +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm14 = xmm14[0],xmm8[0],xmm14[1],xmm8[1],xmm14[2],xmm8[2],xmm14[3],xmm8[3],xmm14[4],xmm8[4],xmm14[5],xmm8[5],xmm14[6],xmm8[6],xmm14[7],xmm8[7] +; SSSE3-NEXT: movd %esi, %xmm8 +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm13[0],xmm7[1],xmm13[1],xmm7[2],xmm13[2],xmm7[3],xmm13[3],xmm7[4],xmm13[4],xmm7[5],xmm13[5],xmm7[6],xmm13[6],xmm7[7],xmm13[7] +; SSSE3-NEXT: movd %r15d, %xmm13 +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm14[0],xmm7[1],xmm14[1],xmm7[2],xmm14[2],xmm7[3],xmm14[3] +; SSSE3-NEXT: movd %ebp, %xmm3 +; SSSE3-NEXT: punpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSSE3-NEXT: # xmm4 = xmm4[0],mem[0],xmm4[1],mem[1],xmm4[2],mem[2],xmm4[3],mem[3],xmm4[4],mem[4],xmm4[5],mem[5],xmm4[6],mem[6],xmm4[7],mem[7] +; SSSE3-NEXT: movd %r14d, %xmm14 +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm15[0],xmm0[1],xmm15[1],xmm0[2],xmm15[2],xmm0[3],xmm15[3],xmm0[4],xmm15[4],xmm0[5],xmm15[5],xmm0[6],xmm15[6],xmm0[7],xmm15[7] +; SSSE3-NEXT: movd %edx, %xmm15 +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3] +; SSSE3-NEXT: movd %r13d, %xmm4 +; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1] +; SSSE3-NEXT: movd %edi, %xmm7 +; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSSE3-NEXT: movd %r11d, %xmm2 +; SSSE3-NEXT: punpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload +; SSSE3-NEXT: # xmm11 = xmm11[0],mem[0],xmm11[1],mem[1],xmm11[2],mem[2],xmm11[3],mem[3],xmm11[4],mem[4],xmm11[5],mem[5],xmm11[6],mem[6],xmm11[7],mem[7] +; SSSE3-NEXT: movd %ecx, %xmm1 +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm10 = xmm10[0],xmm12[0],xmm10[1],xmm12[1],xmm10[2],xmm12[2],xmm10[3],xmm12[3],xmm10[4],xmm12[4],xmm10[5],xmm12[5],xmm10[6],xmm12[6],xmm10[7],xmm12[7] +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3] +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm9[0],xmm5[1],xmm9[1],xmm5[2],xmm9[2],xmm5[3],xmm9[3],xmm5[4],xmm9[4],xmm5[5],xmm9[5],xmm5[6],xmm9[6],xmm5[7],xmm9[7] +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm6[0],xmm8[1],xmm6[1],xmm8[2],xmm6[2],xmm8[3],xmm6[3],xmm8[4],xmm6[4],xmm8[5],xmm6[5],xmm8[6],xmm6[6],xmm8[7],xmm6[7] +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm5[0],xmm8[1],xmm5[1],xmm8[2],xmm5[2],xmm8[3],xmm5[3] +; SSSE3-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm10[0],xmm8[1],xmm10[1] +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm13[0],xmm3[1],xmm13[1],xmm3[2],xmm13[2],xmm3[3],xmm13[3],xmm3[4],xmm13[4],xmm3[5],xmm13[5],xmm3[6],xmm13[6],xmm3[7],xmm13[7] +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm15 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3],xmm15[4],xmm14[4],xmm15[5],xmm14[5],xmm15[6],xmm14[6],xmm15[7],xmm14[7] +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm15 = xmm15[0],xmm3[0],xmm15[1],xmm3[1],xmm15[2],xmm3[2],xmm15[3],xmm3[3] +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm4[0],xmm7[1],xmm4[1],xmm7[2],xmm4[2],xmm7[3],xmm4[3],xmm7[4],xmm4[4],xmm7[5],xmm4[5],xmm7[6],xmm4[6],xmm7[7],xmm4[7] +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm7[0],xmm1[1],xmm7[1],xmm1[2],xmm7[2],xmm1[3],xmm7[3] +; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm15[0],xmm1[1],xmm15[1] +; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm1 = 
xmm1[0],xmm8[0] +; SSSE3-NEXT: addq $8, %rsp +; SSSE3-NEXT: popq %rbx +; SSSE3-NEXT: popq %r12 +; SSSE3-NEXT: popq %r13 +; SSSE3-NEXT: popq %r14 +; SSSE3-NEXT: popq %r15 +; SSSE3-NEXT: popq %rbp +; SSSE3-NEXT: retq +; +; SSE41-LABEL: v32i8: +; SSE41: # %bb.0: +; SSE41-NEXT: pextrb $15, %xmm3, %ecx +; SSE41-NEXT: pextrb $15, %xmm1, %edx +; SSE41-NEXT: movl %edx, %eax +; SSE41-NEXT: subb %cl, %al +; SSE41-NEXT: setns %al +; SSE41-NEXT: subb %cl, %dl +; SSE41-NEXT: jno .LBB1_2 +; SSE41-NEXT: # %bb.1: +; SSE41-NEXT: addb $127, %al +; SSE41-NEXT: movl %eax, %edx +; SSE41-NEXT: .LBB1_2: +; SSE41-NEXT: pextrb $14, %xmm3, %ecx +; SSE41-NEXT: pextrb $14, %xmm1, %esi +; SSE41-NEXT: movl %esi, %eax +; SSE41-NEXT: subb %cl, %al +; SSE41-NEXT: setns %al +; SSE41-NEXT: subb %cl, %sil +; SSE41-NEXT: jno .LBB1_4 +; SSE41-NEXT: # %bb.3: +; SSE41-NEXT: addb $127, %al +; SSE41-NEXT: movl %eax, %esi +; SSE41-NEXT: .LBB1_4: +; SSE41-NEXT: pushq %rbp +; SSE41-NEXT: pushq %r15 +; SSE41-NEXT: pushq %r14 +; SSE41-NEXT: pushq %r13 +; SSE41-NEXT: pushq %r12 +; SSE41-NEXT: pushq %rbx +; SSE41-NEXT: pextrb $13, %xmm3, %ecx +; SSE41-NEXT: pextrb $13, %xmm1, %edi +; SSE41-NEXT: movl %edi, %eax +; SSE41-NEXT: subb %cl, %al +; SSE41-NEXT: setns %al +; SSE41-NEXT: subb %cl, %dil +; SSE41-NEXT: jo .LBB1_5 +; SSE41-NEXT: # %bb.6: +; SSE41-NEXT: movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE41-NEXT: jmp .LBB1_7 +; SSE41-NEXT: .LBB1_5: +; SSE41-NEXT: addb $127, %al +; SSE41-NEXT: # kill: def $al killed $al def $eax +; SSE41-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE41-NEXT: .LBB1_7: +; SSE41-NEXT: pextrb $12, %xmm3, %ecx +; SSE41-NEXT: pextrb $12, %xmm1, %edi +; SSE41-NEXT: movl %edi, %eax +; SSE41-NEXT: subb %cl, %al +; SSE41-NEXT: setns %al +; SSE41-NEXT: subb %cl, %dil +; SSE41-NEXT: jno .LBB1_9 +; SSE41-NEXT: # %bb.8: +; SSE41-NEXT: addb $127, %al +; SSE41-NEXT: movl %eax, %edi +; SSE41-NEXT: .LBB1_9: +; SSE41-NEXT: pextrb $11, %xmm3, %ecx +; SSE41-NEXT: pextrb $11, %xmm1, %ebp +; SSE41-NEXT: movl %ebp, %eax +; SSE41-NEXT: subb %cl, %al +; SSE41-NEXT: setns %al +; SSE41-NEXT: subb %cl, %bpl +; SSE41-NEXT: jno .LBB1_11 +; SSE41-NEXT: # %bb.10: +; SSE41-NEXT: addb $127, %al +; SSE41-NEXT: movl %eax, %ebp +; SSE41-NEXT: .LBB1_11: +; SSE41-NEXT: pextrb $10, %xmm3, %ecx +; SSE41-NEXT: pextrb $10, %xmm1, %ebx +; SSE41-NEXT: movl %ebx, %eax +; SSE41-NEXT: subb %cl, %al +; SSE41-NEXT: setns %al +; SSE41-NEXT: subb %cl, %bl +; SSE41-NEXT: movl %ebp, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE41-NEXT: jno .LBB1_13 +; SSE41-NEXT: # %bb.12: +; SSE41-NEXT: addb $127, %al +; SSE41-NEXT: movl %eax, %ebx +; SSE41-NEXT: .LBB1_13: +; SSE41-NEXT: pextrb $9, %xmm3, %ecx +; SSE41-NEXT: pextrb $9, %xmm1, %ebp +; SSE41-NEXT: movl %ebp, %eax +; SSE41-NEXT: subb %cl, %al +; SSE41-NEXT: setns %al +; SSE41-NEXT: subb %cl, %bpl +; SSE41-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE41-NEXT: jno .LBB1_15 +; SSE41-NEXT: # %bb.14: +; SSE41-NEXT: addb $127, %al +; SSE41-NEXT: movl %eax, %ebp +; SSE41-NEXT: .LBB1_15: +; SSE41-NEXT: pextrb $8, %xmm3, %ecx +; SSE41-NEXT: pextrb $8, %xmm1, %esi +; SSE41-NEXT: movl %esi, %eax +; SSE41-NEXT: subb %cl, %al +; SSE41-NEXT: setns %al +; SSE41-NEXT: subb %cl, %sil +; SSE41-NEXT: movl %edx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE41-NEXT: jno .LBB1_17 +; SSE41-NEXT: # %bb.16: +; SSE41-NEXT: addb $127, %al +; SSE41-NEXT: movl %eax, %esi +; SSE41-NEXT: .LBB1_17: +; SSE41-NEXT: pextrb $7, %xmm3, %ecx +; SSE41-NEXT: pextrb $7, %xmm1, %edx +; SSE41-NEXT: movl 
%edx, %eax +; SSE41-NEXT: subb %cl, %al +; SSE41-NEXT: setns %al +; SSE41-NEXT: subb %cl, %dl +; SSE41-NEXT: movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE41-NEXT: jno .LBB1_19 +; SSE41-NEXT: # %bb.18: +; SSE41-NEXT: addb $127, %al +; SSE41-NEXT: movl %eax, %edx +; SSE41-NEXT: .LBB1_19: +; SSE41-NEXT: pextrb $6, %xmm3, %ecx +; SSE41-NEXT: pextrb $6, %xmm1, %edi +; SSE41-NEXT: movl %edi, %eax +; SSE41-NEXT: subb %cl, %al +; SSE41-NEXT: setns %al +; SSE41-NEXT: subb %cl, %dil +; SSE41-NEXT: movl %ebx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE41-NEXT: movl %ebp, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE41-NEXT: jno .LBB1_21 +; SSE41-NEXT: # %bb.20: +; SSE41-NEXT: addb $127, %al +; SSE41-NEXT: movl %eax, %edi +; SSE41-NEXT: .LBB1_21: +; SSE41-NEXT: pextrb $5, %xmm3, %ecx +; SSE41-NEXT: pextrb $5, %xmm1, %r8d +; SSE41-NEXT: movl %r8d, %eax +; SSE41-NEXT: subb %cl, %al +; SSE41-NEXT: setns %al +; SSE41-NEXT: subb %cl, %r8b +; SSE41-NEXT: jno .LBB1_23 +; SSE41-NEXT: # %bb.22: +; SSE41-NEXT: addb $127, %al +; SSE41-NEXT: movl %eax, %r8d +; SSE41-NEXT: .LBB1_23: +; SSE41-NEXT: pextrb $4, %xmm3, %ecx +; SSE41-NEXT: pextrb $4, %xmm1, %r11d +; SSE41-NEXT: movl %r11d, %eax +; SSE41-NEXT: subb %cl, %al +; SSE41-NEXT: setns %al +; SSE41-NEXT: subb %cl, %r11b +; SSE41-NEXT: jno .LBB1_25 +; SSE41-NEXT: # %bb.24: +; SSE41-NEXT: addb $127, %al +; SSE41-NEXT: movl %eax, %r11d +; SSE41-NEXT: .LBB1_25: +; SSE41-NEXT: pextrb $3, %xmm3, %ecx +; SSE41-NEXT: pextrb $3, %xmm1, %r10d +; SSE41-NEXT: movl %r10d, %eax +; SSE41-NEXT: subb %cl, %al +; SSE41-NEXT: setns %al +; SSE41-NEXT: subb %cl, %r10b +; SSE41-NEXT: jno .LBB1_27 +; SSE41-NEXT: # %bb.26: +; SSE41-NEXT: addb $127, %al +; SSE41-NEXT: movl %eax, %r10d +; SSE41-NEXT: .LBB1_27: +; SSE41-NEXT: pextrb $2, %xmm3, %ecx +; SSE41-NEXT: pextrb $2, %xmm1, %r14d +; SSE41-NEXT: movl %r14d, %eax +; SSE41-NEXT: subb %cl, %al +; SSE41-NEXT: setns %al +; SSE41-NEXT: subb %cl, %r14b +; SSE41-NEXT: jno .LBB1_29 +; SSE41-NEXT: # %bb.28: +; SSE41-NEXT: addb $127, %al +; SSE41-NEXT: movl %eax, %r14d +; SSE41-NEXT: .LBB1_29: +; SSE41-NEXT: pextrb $0, %xmm3, %ecx +; SSE41-NEXT: pextrb $0, %xmm1, %r9d +; SSE41-NEXT: movl %r9d, %eax +; SSE41-NEXT: subb %cl, %al +; SSE41-NEXT: setns %al +; SSE41-NEXT: subb %cl, %r9b +; SSE41-NEXT: jno .LBB1_31 +; SSE41-NEXT: # %bb.30: +; SSE41-NEXT: addb $127, %al +; SSE41-NEXT: movl %eax, %r9d +; SSE41-NEXT: .LBB1_31: +; SSE41-NEXT: pextrb $1, %xmm3, %ecx +; SSE41-NEXT: pextrb $1, %xmm1, %ebp +; SSE41-NEXT: movl %ebp, %eax +; SSE41-NEXT: subb %cl, %al +; SSE41-NEXT: setns %al +; SSE41-NEXT: subb %cl, %bpl +; SSE41-NEXT: jno .LBB1_33 +; SSE41-NEXT: # %bb.32: +; SSE41-NEXT: addb $127, %al +; SSE41-NEXT: movl %eax, %ebp +; SSE41-NEXT: .LBB1_33: +; SSE41-NEXT: pextrb $15, %xmm2, %ecx +; SSE41-NEXT: pextrb $15, %xmm0, %ebx +; SSE41-NEXT: movl %ebx, %eax +; SSE41-NEXT: subb %cl, %al +; SSE41-NEXT: setns %al +; SSE41-NEXT: movl %esi, %r15d +; SSE41-NEXT: subb %cl, %bl +; SSE41-NEXT: jno .LBB1_35 +; SSE41-NEXT: # %bb.34: +; SSE41-NEXT: addb $127, %al +; SSE41-NEXT: movl %eax, %ebx +; SSE41-NEXT: .LBB1_35: +; SSE41-NEXT: movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE41-NEXT: pextrb $14, %xmm2, %ecx +; SSE41-NEXT: pextrb $14, %xmm0, %esi +; SSE41-NEXT: movl %esi, %eax +; SSE41-NEXT: subb %cl, %al +; SSE41-NEXT: setns %al +; SSE41-NEXT: movl %edx, %edi +; SSE41-NEXT: subb %cl, %sil +; SSE41-NEXT: jno .LBB1_37 +; SSE41-NEXT: # %bb.36: +; SSE41-NEXT: addb $127, %al +; SSE41-NEXT: movl %eax, %esi +; SSE41-NEXT: .LBB1_37: +; 
SSE41-NEXT: pextrb $13, %xmm2, %ecx +; SSE41-NEXT: pextrb $13, %xmm0, %edx +; SSE41-NEXT: movl %edx, %eax +; SSE41-NEXT: subb %cl, %al +; SSE41-NEXT: setns %al +; SSE41-NEXT: subb %cl, %dl +; SSE41-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE41-NEXT: jo .LBB1_38 +; SSE41-NEXT: # %bb.39: +; SSE41-NEXT: movl %edx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE41-NEXT: jmp .LBB1_40 +; SSE41-NEXT: .LBB1_38: +; SSE41-NEXT: addb $127, %al +; SSE41-NEXT: # kill: def $al killed $al def $eax +; SSE41-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE41-NEXT: .LBB1_40: +; SSE41-NEXT: movl %edi, %edx +; SSE41-NEXT: pextrb $12, %xmm2, %ecx +; SSE41-NEXT: pextrb $12, %xmm0, %edi +; SSE41-NEXT: movl %edi, %eax +; SSE41-NEXT: subb %cl, %al +; SSE41-NEXT: setns %al +; SSE41-NEXT: subb %cl, %dil +; SSE41-NEXT: movl %r15d, %esi +; SSE41-NEXT: jno .LBB1_42 +; SSE41-NEXT: # %bb.41: +; SSE41-NEXT: addb $127, %al +; SSE41-NEXT: movl %eax, %edi +; SSE41-NEXT: .LBB1_42: +; SSE41-NEXT: pextrb $11, %xmm2, %ecx +; SSE41-NEXT: pextrb $11, %xmm0, %r15d +; SSE41-NEXT: movl %r15d, %eax +; SSE41-NEXT: subb %cl, %al +; SSE41-NEXT: setns %al +; SSE41-NEXT: subb %cl, %r15b +; SSE41-NEXT: jno .LBB1_44 +; SSE41-NEXT: # %bb.43: +; SSE41-NEXT: addb $127, %al +; SSE41-NEXT: movl %eax, %r15d +; SSE41-NEXT: .LBB1_44: +; SSE41-NEXT: pextrb $10, %xmm2, %ecx +; SSE41-NEXT: pextrb $10, %xmm0, %r12d +; SSE41-NEXT: movl %r12d, %eax +; SSE41-NEXT: subb %cl, %al +; SSE41-NEXT: setns %al +; SSE41-NEXT: subb %cl, %r12b +; SSE41-NEXT: jno .LBB1_46 +; SSE41-NEXT: # %bb.45: +; SSE41-NEXT: addb $127, %al +; SSE41-NEXT: movl %eax, %r12d +; SSE41-NEXT: .LBB1_46: +; SSE41-NEXT: pextrb $9, %xmm2, %ecx +; SSE41-NEXT: pextrb $9, %xmm0, %r13d +; SSE41-NEXT: movl %r13d, %eax +; SSE41-NEXT: subb %cl, %al +; SSE41-NEXT: setns %al +; SSE41-NEXT: subb %cl, %r13b +; SSE41-NEXT: movl %r11d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE41-NEXT: movl %r10d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE41-NEXT: movl %r14d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE41-NEXT: movl %r9d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE41-NEXT: movl %ebp, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE41-NEXT: movl %ebx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE41-NEXT: jno .LBB1_48 +; SSE41-NEXT: # %bb.47: +; SSE41-NEXT: addb $127, %al +; SSE41-NEXT: movl %eax, %r13d +; SSE41-NEXT: .LBB1_48: +; SSE41-NEXT: pextrb $8, %xmm2, %ecx +; SSE41-NEXT: pextrb $8, %xmm0, %r11d +; SSE41-NEXT: movl %r11d, %eax +; SSE41-NEXT: subb %cl, %al +; SSE41-NEXT: setns %al +; SSE41-NEXT: subb %cl, %r11b +; SSE41-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE41-NEXT: movl %edx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE41-NEXT: movl %r8d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE41-NEXT: jno .LBB1_50 +; SSE41-NEXT: # %bb.49: +; SSE41-NEXT: addb $127, %al +; SSE41-NEXT: movl %eax, %r11d +; SSE41-NEXT: .LBB1_50: +; SSE41-NEXT: pextrb $7, %xmm2, %ecx +; SSE41-NEXT: pextrb $7, %xmm0, %r10d +; SSE41-NEXT: movl %r10d, %eax +; SSE41-NEXT: subb %cl, %al +; SSE41-NEXT: setns %al +; SSE41-NEXT: subb %cl, %r10b +; SSE41-NEXT: jno .LBB1_52 +; SSE41-NEXT: # %bb.51: +; SSE41-NEXT: addb $127, %al +; SSE41-NEXT: movl %eax, %r10d +; SSE41-NEXT: .LBB1_52: +; SSE41-NEXT: pextrb $6, %xmm2, %ecx +; SSE41-NEXT: pextrb $6, %xmm0, %ebp +; SSE41-NEXT: movl %ebp, %eax +; SSE41-NEXT: subb %cl, %al +; SSE41-NEXT: setns %al +; SSE41-NEXT: movl %edi, %r14d +; SSE41-NEXT: subb %cl, %bpl +; SSE41-NEXT: jno .LBB1_54 +; SSE41-NEXT: # %bb.53: +; SSE41-NEXT: 
addb $127, %al +; SSE41-NEXT: movl %eax, %ebp +; SSE41-NEXT: .LBB1_54: +; SSE41-NEXT: pextrb $5, %xmm2, %ecx +; SSE41-NEXT: pextrb $5, %xmm0, %edi +; SSE41-NEXT: movl %edi, %eax +; SSE41-NEXT: subb %cl, %al +; SSE41-NEXT: setns %al +; SSE41-NEXT: subb %cl, %dil +; SSE41-NEXT: jno .LBB1_56 +; SSE41-NEXT: # %bb.55: +; SSE41-NEXT: addb $127, %al +; SSE41-NEXT: movl %eax, %edi +; SSE41-NEXT: .LBB1_56: +; SSE41-NEXT: pextrb $4, %xmm2, %edx +; SSE41-NEXT: pextrb $4, %xmm0, %eax +; SSE41-NEXT: movl %eax, %ecx +; SSE41-NEXT: subb %dl, %cl +; SSE41-NEXT: setns %cl +; SSE41-NEXT: subb %dl, %al +; SSE41-NEXT: jno .LBB1_58 +; SSE41-NEXT: # %bb.57: +; SSE41-NEXT: addb $127, %cl +; SSE41-NEXT: movl %ecx, %eax +; SSE41-NEXT: .LBB1_58: +; SSE41-NEXT: pextrb $3, %xmm2, %ebx +; SSE41-NEXT: pextrb $3, %xmm0, %ecx +; SSE41-NEXT: movl %ecx, %edx +; SSE41-NEXT: subb %bl, %dl +; SSE41-NEXT: setns %dl +; SSE41-NEXT: subb %bl, %cl +; SSE41-NEXT: jno .LBB1_60 +; SSE41-NEXT: # %bb.59: +; SSE41-NEXT: addb $127, %dl +; SSE41-NEXT: movl %edx, %ecx +; SSE41-NEXT: .LBB1_60: +; SSE41-NEXT: pextrb $2, %xmm2, %esi +; SSE41-NEXT: pextrb $2, %xmm0, %edx +; SSE41-NEXT: movl %edx, %ebx +; SSE41-NEXT: subb %sil, %bl +; SSE41-NEXT: setns %bl +; SSE41-NEXT: subb %sil, %dl +; SSE41-NEXT: jno .LBB1_62 +; SSE41-NEXT: # %bb.61: +; SSE41-NEXT: addb $127, %bl +; SSE41-NEXT: movl %ebx, %edx +; SSE41-NEXT: .LBB1_62: +; SSE41-NEXT: pextrb $0, %xmm2, %esi +; SSE41-NEXT: pextrb $0, %xmm0, %r8d +; SSE41-NEXT: movl %r8d, %ebx +; SSE41-NEXT: subb %sil, %bl +; SSE41-NEXT: setns %bl +; SSE41-NEXT: subb %sil, %r8b +; SSE41-NEXT: jno .LBB1_64 +; SSE41-NEXT: # %bb.63: +; SSE41-NEXT: addb $127, %bl +; SSE41-NEXT: movl %ebx, %r8d +; SSE41-NEXT: .LBB1_64: +; SSE41-NEXT: pextrb $1, %xmm2, %esi +; SSE41-NEXT: pextrb $1, %xmm0, %r9d +; SSE41-NEXT: movl %r9d, %ebx +; SSE41-NEXT: subb %sil, %bl +; SSE41-NEXT: setns %bl +; SSE41-NEXT: subb %sil, %r9b +; SSE41-NEXT: jno .LBB1_66 +; SSE41-NEXT: # %bb.65: +; SSE41-NEXT: addb $127, %bl +; SSE41-NEXT: movl %ebx, %r9d +; SSE41-NEXT: .LBB1_66: +; SSE41-NEXT: movzbl %r8b, %esi +; SSE41-NEXT: movd %esi, %xmm0 +; SSE41-NEXT: movzbl %r9b, %esi +; SSE41-NEXT: pinsrb $1, %esi, %xmm0 +; SSE41-NEXT: movzbl %dl, %edx +; SSE41-NEXT: pinsrb $2, %edx, %xmm0 +; SSE41-NEXT: movzbl %cl, %ecx +; SSE41-NEXT: pinsrb $3, %ecx, %xmm0 +; SSE41-NEXT: movzbl %al, %eax +; SSE41-NEXT: pinsrb $4, %eax, %xmm0 +; SSE41-NEXT: movzbl %dil, %eax +; SSE41-NEXT: pinsrb $5, %eax, %xmm0 +; SSE41-NEXT: movzbl %bpl, %eax +; SSE41-NEXT: pinsrb $6, %eax, %xmm0 +; SSE41-NEXT: movzbl %r10b, %eax +; SSE41-NEXT: pinsrb $7, %eax, %xmm0 +; SSE41-NEXT: movzbl %r11b, %eax +; SSE41-NEXT: pinsrb $8, %eax, %xmm0 +; SSE41-NEXT: movzbl %r13b, %eax +; SSE41-NEXT: pinsrb $9, %eax, %xmm0 +; SSE41-NEXT: movzbl %r12b, %eax +; SSE41-NEXT: pinsrb $10, %eax, %xmm0 +; SSE41-NEXT: movzbl %r15b, %eax +; SSE41-NEXT: pinsrb $11, %eax, %xmm0 +; SSE41-NEXT: movzbl %r14b, %eax +; SSE41-NEXT: pinsrb $12, %eax, %xmm0 +; SSE41-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; SSE41-NEXT: pinsrb $13, %eax, %xmm0 +; SSE41-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; SSE41-NEXT: pinsrb $14, %eax, %xmm0 +; SSE41-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; SSE41-NEXT: pinsrb $15, %eax, %xmm0 +; SSE41-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; SSE41-NEXT: movd %eax, %xmm1 +; SSE41-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; SSE41-NEXT: pinsrb $1, %eax, %xmm1 +; 
SSE41-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; SSE41-NEXT: pinsrb $2, %eax, %xmm1 +; SSE41-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; SSE41-NEXT: pinsrb $3, %eax, %xmm1 +; SSE41-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; SSE41-NEXT: pinsrb $4, %eax, %xmm1 +; SSE41-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; SSE41-NEXT: pinsrb $5, %eax, %xmm1 +; SSE41-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; SSE41-NEXT: pinsrb $6, %eax, %xmm1 +; SSE41-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; SSE41-NEXT: pinsrb $7, %eax, %xmm1 +; SSE41-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; SSE41-NEXT: pinsrb $8, %eax, %xmm1 +; SSE41-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; SSE41-NEXT: pinsrb $9, %eax, %xmm1 +; SSE41-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; SSE41-NEXT: pinsrb $10, %eax, %xmm1 +; SSE41-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; SSE41-NEXT: pinsrb $11, %eax, %xmm1 +; SSE41-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; SSE41-NEXT: pinsrb $12, %eax, %xmm1 +; SSE41-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; SSE41-NEXT: pinsrb $13, %eax, %xmm1 +; SSE41-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; SSE41-NEXT: pinsrb $14, %eax, %xmm1 +; SSE41-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; SSE41-NEXT: pinsrb $15, %eax, %xmm1 +; SSE41-NEXT: popq %rbx +; SSE41-NEXT: popq %r12 +; SSE41-NEXT: popq %r13 +; SSE41-NEXT: popq %r14 +; SSE41-NEXT: popq %r15 +; SSE41-NEXT: popq %rbp +; SSE41-NEXT: retq +; +; AVX1-LABEL: v32i8: +; AVX1: # %bb.0: +; AVX1-NEXT: pushq %rbp +; AVX1-NEXT: pushq %r15 +; AVX1-NEXT: pushq %r14 +; AVX1-NEXT: pushq %r13 +; AVX1-NEXT: pushq %r12 +; AVX1-NEXT: pushq %rbx +; AVX1-NEXT: vpextrb $15, %xmm1, %ecx +; AVX1-NEXT: vpextrb $15, %xmm0, %edx +; AVX1-NEXT: movl %edx, %eax +; AVX1-NEXT: subb %cl, %al +; AVX1-NEXT: setns %al +; AVX1-NEXT: subb %cl, %dl +; AVX1-NEXT: jo .LBB1_1 +; AVX1-NEXT: # %bb.2: +; AVX1-NEXT: movl %edx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX1-NEXT: jmp .LBB1_3 +; AVX1-NEXT: .LBB1_1: +; AVX1-NEXT: addb $127, %al +; AVX1-NEXT: # kill: def $al killed $al def $eax +; AVX1-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX1-NEXT: .LBB1_3: +; AVX1-NEXT: vpextrb $14, %xmm1, %ecx +; AVX1-NEXT: vpextrb $14, %xmm0, %edx +; AVX1-NEXT: movl %edx, %eax +; AVX1-NEXT: subb %cl, %al +; AVX1-NEXT: setns %al +; AVX1-NEXT: subb %cl, %dl +; AVX1-NEXT: jno .LBB1_5 +; AVX1-NEXT: # %bb.4: +; AVX1-NEXT: addb $127, %al +; AVX1-NEXT: movl %eax, %edx +; AVX1-NEXT: .LBB1_5: +; AVX1-NEXT: vpextrb $13, %xmm1, %ecx +; AVX1-NEXT: vpextrb $13, %xmm0, %esi +; AVX1-NEXT: movl %esi, %eax +; AVX1-NEXT: subb %cl, %al +; AVX1-NEXT: setns %al +; AVX1-NEXT: subb %cl, %sil +; AVX1-NEXT: jo .LBB1_6 +; AVX1-NEXT: # %bb.7: +; AVX1-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX1-NEXT: jmp .LBB1_8 +; AVX1-NEXT: .LBB1_6: +; AVX1-NEXT: addb $127, %al +; AVX1-NEXT: # kill: def $al killed $al def $eax +; AVX1-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX1-NEXT: .LBB1_8: +; AVX1-NEXT: vpextrb $12, %xmm1, %ecx +; AVX1-NEXT: vpextrb $12, %xmm0, %esi +; AVX1-NEXT: movl %esi, %eax +; AVX1-NEXT: subb %cl, %al +; AVX1-NEXT: setns %al +; AVX1-NEXT: subb %cl, %sil +; AVX1-NEXT: jno .LBB1_10 +; AVX1-NEXT: # %bb.9: 
+; AVX1-NEXT: addb $127, %al +; AVX1-NEXT: movl %eax, %esi +; AVX1-NEXT: .LBB1_10: +; AVX1-NEXT: vpextrb $11, %xmm1, %ecx +; AVX1-NEXT: vpextrb $11, %xmm0, %edi +; AVX1-NEXT: movl %edi, %eax +; AVX1-NEXT: subb %cl, %al +; AVX1-NEXT: setns %al +; AVX1-NEXT: subb %cl, %dil +; AVX1-NEXT: jno .LBB1_12 +; AVX1-NEXT: # %bb.11: +; AVX1-NEXT: addb $127, %al +; AVX1-NEXT: movl %eax, %edi +; AVX1-NEXT: .LBB1_12: +; AVX1-NEXT: vpextrb $10, %xmm1, %ecx +; AVX1-NEXT: vpextrb $10, %xmm0, %ebp +; AVX1-NEXT: movl %ebp, %eax +; AVX1-NEXT: subb %cl, %al +; AVX1-NEXT: setns %al +; AVX1-NEXT: subb %cl, %bpl +; AVX1-NEXT: jno .LBB1_14 +; AVX1-NEXT: # %bb.13: +; AVX1-NEXT: addb $127, %al +; AVX1-NEXT: movl %eax, %ebp +; AVX1-NEXT: .LBB1_14: +; AVX1-NEXT: vpextrb $9, %xmm1, %ecx +; AVX1-NEXT: vpextrb $9, %xmm0, %ebx +; AVX1-NEXT: movl %ebx, %eax +; AVX1-NEXT: subb %cl, %al +; AVX1-NEXT: setns %al +; AVX1-NEXT: subb %cl, %bl +; AVX1-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX1-NEXT: jno .LBB1_16 +; AVX1-NEXT: # %bb.15: +; AVX1-NEXT: addb $127, %al +; AVX1-NEXT: movl %eax, %ebx +; AVX1-NEXT: .LBB1_16: +; AVX1-NEXT: vpextrb $8, %xmm1, %ecx +; AVX1-NEXT: vpextrb $8, %xmm0, %esi +; AVX1-NEXT: movl %esi, %eax +; AVX1-NEXT: subb %cl, %al +; AVX1-NEXT: setns %al +; AVX1-NEXT: subb %cl, %sil +; AVX1-NEXT: movl %edx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX1-NEXT: movl %ebp, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX1-NEXT: movl %ebx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX1-NEXT: jno .LBB1_18 +; AVX1-NEXT: # %bb.17: +; AVX1-NEXT: addb $127, %al +; AVX1-NEXT: movl %eax, %esi +; AVX1-NEXT: .LBB1_18: +; AVX1-NEXT: vpextrb $7, %xmm1, %ecx +; AVX1-NEXT: vpextrb $7, %xmm0, %edx +; AVX1-NEXT: movl %edx, %eax +; AVX1-NEXT: subb %cl, %al +; AVX1-NEXT: setns %al +; AVX1-NEXT: subb %cl, %dl +; AVX1-NEXT: movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX1-NEXT: jno .LBB1_20 +; AVX1-NEXT: # %bb.19: +; AVX1-NEXT: addb $127, %al +; AVX1-NEXT: movl %eax, %edx +; AVX1-NEXT: .LBB1_20: +; AVX1-NEXT: vpextrb $6, %xmm1, %ecx +; AVX1-NEXT: vpextrb $6, %xmm0, %edi +; AVX1-NEXT: movl %edi, %eax +; AVX1-NEXT: subb %cl, %al +; AVX1-NEXT: setns %al +; AVX1-NEXT: subb %cl, %dil +; AVX1-NEXT: jno .LBB1_22 +; AVX1-NEXT: # %bb.21: +; AVX1-NEXT: addb $127, %al +; AVX1-NEXT: movl %eax, %edi +; AVX1-NEXT: .LBB1_22: +; AVX1-NEXT: vpextrb $5, %xmm1, %ecx +; AVX1-NEXT: vpextrb $5, %xmm0, %ebp +; AVX1-NEXT: movl %ebp, %eax +; AVX1-NEXT: subb %cl, %al +; AVX1-NEXT: setns %al +; AVX1-NEXT: subb %cl, %bpl +; AVX1-NEXT: jno .LBB1_24 +; AVX1-NEXT: # %bb.23: +; AVX1-NEXT: addb $127, %al +; AVX1-NEXT: movl %eax, %ebp +; AVX1-NEXT: .LBB1_24: +; AVX1-NEXT: vpextrb $4, %xmm1, %ecx +; AVX1-NEXT: vpextrb $4, %xmm0, %r11d +; AVX1-NEXT: movl %r11d, %eax +; AVX1-NEXT: subb %cl, %al +; AVX1-NEXT: setns %al +; AVX1-NEXT: subb %cl, %r11b +; AVX1-NEXT: jno .LBB1_26 +; AVX1-NEXT: # %bb.25: +; AVX1-NEXT: addb $127, %al +; AVX1-NEXT: movl %eax, %r11d +; AVX1-NEXT: .LBB1_26: +; AVX1-NEXT: vpextrb $3, %xmm1, %ecx +; AVX1-NEXT: vpextrb $3, %xmm0, %r14d +; AVX1-NEXT: movl %r14d, %eax +; AVX1-NEXT: subb %cl, %al +; AVX1-NEXT: setns %al +; AVX1-NEXT: subb %cl, %r14b +; AVX1-NEXT: movl %ebp, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX1-NEXT: jno .LBB1_28 +; AVX1-NEXT: # %bb.27: +; AVX1-NEXT: addb $127, %al +; AVX1-NEXT: movl %eax, %r14d +; AVX1-NEXT: .LBB1_28: +; AVX1-NEXT: vpextrb $2, %xmm1, %ecx +; AVX1-NEXT: vpextrb $2, %xmm0, %r8d +; AVX1-NEXT: movl %r8d, %eax +; AVX1-NEXT: subb %cl, %al +; AVX1-NEXT: setns %al +; AVX1-NEXT: subb %cl, 
%r8b +; AVX1-NEXT: jno .LBB1_30 +; AVX1-NEXT: # %bb.29: +; AVX1-NEXT: addb $127, %al +; AVX1-NEXT: movl %eax, %r8d +; AVX1-NEXT: .LBB1_30: +; AVX1-NEXT: vpextrb $0, %xmm1, %ecx +; AVX1-NEXT: vpextrb $0, %xmm0, %r10d +; AVX1-NEXT: movl %r10d, %eax +; AVX1-NEXT: subb %cl, %al +; AVX1-NEXT: setns %al +; AVX1-NEXT: subb %cl, %r10b +; AVX1-NEXT: jno .LBB1_32 +; AVX1-NEXT: # %bb.31: +; AVX1-NEXT: addb $127, %al +; AVX1-NEXT: movl %eax, %r10d +; AVX1-NEXT: .LBB1_32: +; AVX1-NEXT: vpextrb $1, %xmm1, %ecx +; AVX1-NEXT: vpextrb $1, %xmm0, %r9d +; AVX1-NEXT: movl %r9d, %eax +; AVX1-NEXT: subb %cl, %al +; AVX1-NEXT: setns %al +; AVX1-NEXT: subb %cl, %r9b +; AVX1-NEXT: jno .LBB1_34 +; AVX1-NEXT: # %bb.33: +; AVX1-NEXT: addb $127, %al +; AVX1-NEXT: movl %eax, %r9d +; AVX1-NEXT: .LBB1_34: +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 +; AVX1-NEXT: vpextrb $15, %xmm1, %ecx +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX1-NEXT: vpextrb $15, %xmm0, %ebx +; AVX1-NEXT: movl %ebx, %eax +; AVX1-NEXT: subb %cl, %al +; AVX1-NEXT: setns %al +; AVX1-NEXT: movl %esi, %r12d +; AVX1-NEXT: subb %cl, %bl +; AVX1-NEXT: jno .LBB1_36 +; AVX1-NEXT: # %bb.35: +; AVX1-NEXT: addb $127, %al +; AVX1-NEXT: movl %eax, %ebx +; AVX1-NEXT: .LBB1_36: +; AVX1-NEXT: vpextrb $14, %xmm1, %ecx +; AVX1-NEXT: vpextrb $14, %xmm0, %esi +; AVX1-NEXT: movl %esi, %eax +; AVX1-NEXT: subb %cl, %al +; AVX1-NEXT: setns %al +; AVX1-NEXT: movl %edx, %r13d +; AVX1-NEXT: subb %cl, %sil +; AVX1-NEXT: jno .LBB1_38 +; AVX1-NEXT: # %bb.37: +; AVX1-NEXT: addb $127, %al +; AVX1-NEXT: movl %eax, %esi +; AVX1-NEXT: .LBB1_38: +; AVX1-NEXT: vpextrb $13, %xmm1, %ecx +; AVX1-NEXT: vpextrb $13, %xmm0, %edx +; AVX1-NEXT: movl %edx, %eax +; AVX1-NEXT: subb %cl, %al +; AVX1-NEXT: setns %al +; AVX1-NEXT: movl %edi, %ebp +; AVX1-NEXT: subb %cl, %dl +; AVX1-NEXT: jno .LBB1_40 +; AVX1-NEXT: # %bb.39: +; AVX1-NEXT: addb $127, %al +; AVX1-NEXT: movl %eax, %edx +; AVX1-NEXT: .LBB1_40: +; AVX1-NEXT: vpextrb $12, %xmm1, %ecx +; AVX1-NEXT: vpextrb $12, %xmm0, %edi +; AVX1-NEXT: movl %edi, %eax +; AVX1-NEXT: subb %cl, %al +; AVX1-NEXT: setns %al +; AVX1-NEXT: subb %cl, %dil +; AVX1-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX1-NEXT: movl %edx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX1-NEXT: jo .LBB1_41 +; AVX1-NEXT: # %bb.42: +; AVX1-NEXT: movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX1-NEXT: jmp .LBB1_43 +; AVX1-NEXT: .LBB1_41: +; AVX1-NEXT: addb $127, %al +; AVX1-NEXT: # kill: def $al killed $al def $eax +; AVX1-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX1-NEXT: .LBB1_43: +; AVX1-NEXT: movl %ebp, %edi +; AVX1-NEXT: vpextrb $11, %xmm1, %ecx +; AVX1-NEXT: vpextrb $11, %xmm0, %r15d +; AVX1-NEXT: movl %r15d, %eax +; AVX1-NEXT: subb %cl, %al +; AVX1-NEXT: setns %al +; AVX1-NEXT: subb %cl, %r15b +; AVX1-NEXT: movl %r12d, %esi +; AVX1-NEXT: movl %r13d, %edx +; AVX1-NEXT: jno .LBB1_45 +; AVX1-NEXT: # %bb.44: +; AVX1-NEXT: addb $127, %al +; AVX1-NEXT: movl %eax, %r15d +; AVX1-NEXT: .LBB1_45: +; AVX1-NEXT: vpextrb $10, %xmm1, %ecx +; AVX1-NEXT: vpextrb $10, %xmm0, %r12d +; AVX1-NEXT: movl %r12d, %eax +; AVX1-NEXT: subb %cl, %al +; AVX1-NEXT: setns %al +; AVX1-NEXT: subb %cl, %r12b +; AVX1-NEXT: jno .LBB1_47 +; AVX1-NEXT: # %bb.46: +; AVX1-NEXT: addb $127, %al +; AVX1-NEXT: movl %eax, %r12d +; AVX1-NEXT: .LBB1_47: +; AVX1-NEXT: vpextrb $9, %xmm1, %ecx +; AVX1-NEXT: vpextrb $9, %xmm0, %r13d +; AVX1-NEXT: movl %r13d, %eax +; AVX1-NEXT: subb %cl, %al +; AVX1-NEXT: setns %al +; AVX1-NEXT: subb %cl, %r13b +; AVX1-NEXT: movl %r11d, 
{{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX1-NEXT: movl %r14d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX1-NEXT: movl %r8d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX1-NEXT: movl %r10d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX1-NEXT: movl %r9d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX1-NEXT: movl %ebx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX1-NEXT: jno .LBB1_49 +; AVX1-NEXT: # %bb.48: +; AVX1-NEXT: addb $127, %al +; AVX1-NEXT: movl %eax, %r13d +; AVX1-NEXT: .LBB1_49: +; AVX1-NEXT: vpextrb $8, %xmm1, %ecx +; AVX1-NEXT: vpextrb $8, %xmm0, %r11d +; AVX1-NEXT: movl %r11d, %eax +; AVX1-NEXT: subb %cl, %al +; AVX1-NEXT: setns %al +; AVX1-NEXT: subb %cl, %r11b +; AVX1-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX1-NEXT: movl %edx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX1-NEXT: movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX1-NEXT: jno .LBB1_51 +; AVX1-NEXT: # %bb.50: +; AVX1-NEXT: addb $127, %al +; AVX1-NEXT: movl %eax, %r11d +; AVX1-NEXT: .LBB1_51: +; AVX1-NEXT: vpextrb $7, %xmm1, %ecx +; AVX1-NEXT: vpextrb $7, %xmm0, %r10d +; AVX1-NEXT: movl %r10d, %eax +; AVX1-NEXT: subb %cl, %al +; AVX1-NEXT: setns %al +; AVX1-NEXT: subb %cl, %r10b +; AVX1-NEXT: jno .LBB1_53 +; AVX1-NEXT: # %bb.52: +; AVX1-NEXT: addb $127, %al +; AVX1-NEXT: movl %eax, %r10d +; AVX1-NEXT: .LBB1_53: +; AVX1-NEXT: vpextrb $6, %xmm1, %ecx +; AVX1-NEXT: vpextrb $6, %xmm0, %ebp +; AVX1-NEXT: movl %ebp, %eax +; AVX1-NEXT: subb %cl, %al +; AVX1-NEXT: setns %al +; AVX1-NEXT: subb %cl, %bpl +; AVX1-NEXT: jno .LBB1_55 +; AVX1-NEXT: # %bb.54: +; AVX1-NEXT: addb $127, %al +; AVX1-NEXT: movl %eax, %ebp +; AVX1-NEXT: .LBB1_55: +; AVX1-NEXT: vpextrb $5, %xmm1, %ecx +; AVX1-NEXT: vpextrb $5, %xmm0, %edi +; AVX1-NEXT: movl %edi, %eax +; AVX1-NEXT: subb %cl, %al +; AVX1-NEXT: setns %al +; AVX1-NEXT: subb %cl, %dil +; AVX1-NEXT: jno .LBB1_57 +; AVX1-NEXT: # %bb.56: +; AVX1-NEXT: addb $127, %al +; AVX1-NEXT: movl %eax, %edi +; AVX1-NEXT: .LBB1_57: +; AVX1-NEXT: vpextrb $4, %xmm1, %edx +; AVX1-NEXT: vpextrb $4, %xmm0, %eax +; AVX1-NEXT: movl %eax, %ecx +; AVX1-NEXT: subb %dl, %cl +; AVX1-NEXT: setns %cl +; AVX1-NEXT: subb %dl, %al +; AVX1-NEXT: jno .LBB1_59 +; AVX1-NEXT: # %bb.58: +; AVX1-NEXT: addb $127, %cl +; AVX1-NEXT: movl %ecx, %eax +; AVX1-NEXT: .LBB1_59: +; AVX1-NEXT: vpextrb $3, %xmm1, %ebx +; AVX1-NEXT: vpextrb $3, %xmm0, %ecx +; AVX1-NEXT: movl %ecx, %edx +; AVX1-NEXT: subb %bl, %dl +; AVX1-NEXT: setns %dl +; AVX1-NEXT: subb %bl, %cl +; AVX1-NEXT: jno .LBB1_61 +; AVX1-NEXT: # %bb.60: +; AVX1-NEXT: addb $127, %dl +; AVX1-NEXT: movl %edx, %ecx +; AVX1-NEXT: .LBB1_61: +; AVX1-NEXT: vpextrb $2, %xmm1, %esi +; AVX1-NEXT: vpextrb $2, %xmm0, %edx +; AVX1-NEXT: movl %edx, %ebx +; AVX1-NEXT: subb %sil, %bl +; AVX1-NEXT: setns %bl +; AVX1-NEXT: subb %sil, %dl +; AVX1-NEXT: jno .LBB1_63 +; AVX1-NEXT: # %bb.62: +; AVX1-NEXT: addb $127, %bl +; AVX1-NEXT: movl %ebx, %edx +; AVX1-NEXT: .LBB1_63: +; AVX1-NEXT: vpextrb $0, %xmm1, %esi +; AVX1-NEXT: vpextrb $0, %xmm0, %r8d +; AVX1-NEXT: movl %r8d, %ebx +; AVX1-NEXT: subb %sil, %bl +; AVX1-NEXT: setns %bl +; AVX1-NEXT: subb %sil, %r8b +; AVX1-NEXT: jo .LBB1_64 +; AVX1-NEXT: # %bb.65: +; AVX1-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %r14d # 4-byte Reload +; AVX1-NEXT: jmp .LBB1_66 +; AVX1-NEXT: .LBB1_64: +; AVX1-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %r14d # 4-byte Reload +; AVX1-NEXT: addb $127, %bl +; AVX1-NEXT: movl %ebx, %r8d +; AVX1-NEXT: .LBB1_66: +; AVX1-NEXT: vpextrb $1, %xmm1, %esi +; AVX1-NEXT: vpextrb $1, %xmm0, %r9d +; AVX1-NEXT: movl 
%r9d, %ebx +; AVX1-NEXT: subb %sil, %bl +; AVX1-NEXT: setns %bl +; AVX1-NEXT: subb %sil, %r9b +; AVX1-NEXT: jno .LBB1_68 +; AVX1-NEXT: # %bb.67: +; AVX1-NEXT: addb $127, %bl +; AVX1-NEXT: movl %ebx, %r9d +; AVX1-NEXT: .LBB1_68: +; AVX1-NEXT: movzbl %r8b, %esi +; AVX1-NEXT: vmovd %esi, %xmm0 +; AVX1-NEXT: movzbl %r9b, %esi +; AVX1-NEXT: vpinsrb $1, %esi, %xmm0, %xmm0 +; AVX1-NEXT: movzbl %dl, %edx +; AVX1-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; AVX1-NEXT: movzbl %cl, %ecx +; AVX1-NEXT: vpinsrb $3, %ecx, %xmm0, %xmm0 +; AVX1-NEXT: movzbl %al, %eax +; AVX1-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 +; AVX1-NEXT: movzbl %dil, %eax +; AVX1-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; AVX1-NEXT: movzbl %bpl, %eax +; AVX1-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 +; AVX1-NEXT: movzbl %r10b, %eax +; AVX1-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; AVX1-NEXT: movzbl %r11b, %eax +; AVX1-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0 +; AVX1-NEXT: movzbl %r13b, %eax +; AVX1-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0 +; AVX1-NEXT: movzbl %r12b, %eax +; AVX1-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0 +; AVX1-NEXT: movzbl %r15b, %eax +; AVX1-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; AVX1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX1-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0 +; AVX1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX1-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0 +; AVX1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX1-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0 +; AVX1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX1-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; AVX1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX1-NEXT: vmovd %eax, %xmm1 +; AVX1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX1-NEXT: vpinsrb $1, %eax, %xmm1, %xmm1 +; AVX1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX1-NEXT: vpinsrb $2, %eax, %xmm1, %xmm1 +; AVX1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX1-NEXT: vpinsrb $3, %eax, %xmm1, %xmm1 +; AVX1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX1-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 +; AVX1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX1-NEXT: vpinsrb $5, %eax, %xmm1, %xmm1 +; AVX1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX1-NEXT: vpinsrb $6, %eax, %xmm1, %xmm1 +; AVX1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX1-NEXT: vpinsrb $7, %eax, %xmm1, %xmm1 +; AVX1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX1-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; AVX1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX1-NEXT: vpinsrb $9, %eax, %xmm1, %xmm1 +; AVX1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX1-NEXT: vpinsrb $10, %eax, %xmm1, %xmm1 +; AVX1-NEXT: movzbl %r14b, %eax +; AVX1-NEXT: vpinsrb $11, %eax, %xmm1, %xmm1 +; AVX1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX1-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 +; AVX1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX1-NEXT: vpinsrb $13, %eax, %xmm1, %xmm1 +; AVX1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX1-NEXT: vpinsrb $14, %eax, %xmm1, %xmm1 +; AVX1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX1-NEXT: vpinsrb $15, %eax, %xmm1, %xmm1 +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; 
AVX1-NEXT: popq %rbx +; AVX1-NEXT: popq %r12 +; AVX1-NEXT: popq %r13 +; AVX1-NEXT: popq %r14 +; AVX1-NEXT: popq %r15 +; AVX1-NEXT: popq %rbp +; AVX1-NEXT: retq +; +; AVX2-LABEL: v32i8: +; AVX2: # %bb.0: +; AVX2-NEXT: pushq %rbp +; AVX2-NEXT: pushq %r15 +; AVX2-NEXT: pushq %r14 +; AVX2-NEXT: pushq %r13 +; AVX2-NEXT: pushq %r12 +; AVX2-NEXT: pushq %rbx +; AVX2-NEXT: vpextrb $15, %xmm1, %ecx +; AVX2-NEXT: vpextrb $15, %xmm0, %edx +; AVX2-NEXT: movl %edx, %eax +; AVX2-NEXT: subb %cl, %al +; AVX2-NEXT: setns %al +; AVX2-NEXT: subb %cl, %dl +; AVX2-NEXT: jo .LBB1_1 +; AVX2-NEXT: # %bb.2: +; AVX2-NEXT: movl %edx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX2-NEXT: jmp .LBB1_3 +; AVX2-NEXT: .LBB1_1: +; AVX2-NEXT: addb $127, %al +; AVX2-NEXT: # kill: def $al killed $al def $eax +; AVX2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX2-NEXT: .LBB1_3: +; AVX2-NEXT: vpextrb $14, %xmm1, %ecx +; AVX2-NEXT: vpextrb $14, %xmm0, %edx +; AVX2-NEXT: movl %edx, %eax +; AVX2-NEXT: subb %cl, %al +; AVX2-NEXT: setns %al +; AVX2-NEXT: subb %cl, %dl +; AVX2-NEXT: jno .LBB1_5 +; AVX2-NEXT: # %bb.4: +; AVX2-NEXT: addb $127, %al +; AVX2-NEXT: movl %eax, %edx +; AVX2-NEXT: .LBB1_5: +; AVX2-NEXT: vpextrb $13, %xmm1, %ecx +; AVX2-NEXT: vpextrb $13, %xmm0, %esi +; AVX2-NEXT: movl %esi, %eax +; AVX2-NEXT: subb %cl, %al +; AVX2-NEXT: setns %al +; AVX2-NEXT: subb %cl, %sil +; AVX2-NEXT: jo .LBB1_6 +; AVX2-NEXT: # %bb.7: +; AVX2-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX2-NEXT: jmp .LBB1_8 +; AVX2-NEXT: .LBB1_6: +; AVX2-NEXT: addb $127, %al +; AVX2-NEXT: # kill: def $al killed $al def $eax +; AVX2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX2-NEXT: .LBB1_8: +; AVX2-NEXT: vpextrb $12, %xmm1, %ecx +; AVX2-NEXT: vpextrb $12, %xmm0, %esi +; AVX2-NEXT: movl %esi, %eax +; AVX2-NEXT: subb %cl, %al +; AVX2-NEXT: setns %al +; AVX2-NEXT: subb %cl, %sil +; AVX2-NEXT: jno .LBB1_10 +; AVX2-NEXT: # %bb.9: +; AVX2-NEXT: addb $127, %al +; AVX2-NEXT: movl %eax, %esi +; AVX2-NEXT: .LBB1_10: +; AVX2-NEXT: vpextrb $11, %xmm1, %ecx +; AVX2-NEXT: vpextrb $11, %xmm0, %edi +; AVX2-NEXT: movl %edi, %eax +; AVX2-NEXT: subb %cl, %al +; AVX2-NEXT: setns %al +; AVX2-NEXT: subb %cl, %dil +; AVX2-NEXT: jno .LBB1_12 +; AVX2-NEXT: # %bb.11: +; AVX2-NEXT: addb $127, %al +; AVX2-NEXT: movl %eax, %edi +; AVX2-NEXT: .LBB1_12: +; AVX2-NEXT: vpextrb $10, %xmm1, %ecx +; AVX2-NEXT: vpextrb $10, %xmm0, %ebp +; AVX2-NEXT: movl %ebp, %eax +; AVX2-NEXT: subb %cl, %al +; AVX2-NEXT: setns %al +; AVX2-NEXT: subb %cl, %bpl +; AVX2-NEXT: jno .LBB1_14 +; AVX2-NEXT: # %bb.13: +; AVX2-NEXT: addb $127, %al +; AVX2-NEXT: movl %eax, %ebp +; AVX2-NEXT: .LBB1_14: +; AVX2-NEXT: vpextrb $9, %xmm1, %ecx +; AVX2-NEXT: vpextrb $9, %xmm0, %ebx +; AVX2-NEXT: movl %ebx, %eax +; AVX2-NEXT: subb %cl, %al +; AVX2-NEXT: setns %al +; AVX2-NEXT: subb %cl, %bl +; AVX2-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX2-NEXT: jno .LBB1_16 +; AVX2-NEXT: # %bb.15: +; AVX2-NEXT: addb $127, %al +; AVX2-NEXT: movl %eax, %ebx +; AVX2-NEXT: .LBB1_16: +; AVX2-NEXT: vpextrb $8, %xmm1, %ecx +; AVX2-NEXT: vpextrb $8, %xmm0, %esi +; AVX2-NEXT: movl %esi, %eax +; AVX2-NEXT: subb %cl, %al +; AVX2-NEXT: setns %al +; AVX2-NEXT: subb %cl, %sil +; AVX2-NEXT: movl %edx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX2-NEXT: movl %ebp, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX2-NEXT: movl %ebx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX2-NEXT: jno .LBB1_18 +; AVX2-NEXT: # %bb.17: +; AVX2-NEXT: addb $127, %al +; AVX2-NEXT: movl %eax, %esi +; 
AVX2-NEXT: .LBB1_18: +; AVX2-NEXT: vpextrb $7, %xmm1, %ecx +; AVX2-NEXT: vpextrb $7, %xmm0, %edx +; AVX2-NEXT: movl %edx, %eax +; AVX2-NEXT: subb %cl, %al +; AVX2-NEXT: setns %al +; AVX2-NEXT: subb %cl, %dl +; AVX2-NEXT: movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX2-NEXT: jno .LBB1_20 +; AVX2-NEXT: # %bb.19: +; AVX2-NEXT: addb $127, %al +; AVX2-NEXT: movl %eax, %edx +; AVX2-NEXT: .LBB1_20: +; AVX2-NEXT: vpextrb $6, %xmm1, %ecx +; AVX2-NEXT: vpextrb $6, %xmm0, %edi +; AVX2-NEXT: movl %edi, %eax +; AVX2-NEXT: subb %cl, %al +; AVX2-NEXT: setns %al +; AVX2-NEXT: subb %cl, %dil +; AVX2-NEXT: jno .LBB1_22 +; AVX2-NEXT: # %bb.21: +; AVX2-NEXT: addb $127, %al +; AVX2-NEXT: movl %eax, %edi +; AVX2-NEXT: .LBB1_22: +; AVX2-NEXT: vpextrb $5, %xmm1, %ecx +; AVX2-NEXT: vpextrb $5, %xmm0, %ebp +; AVX2-NEXT: movl %ebp, %eax +; AVX2-NEXT: subb %cl, %al +; AVX2-NEXT: setns %al +; AVX2-NEXT: subb %cl, %bpl +; AVX2-NEXT: jno .LBB1_24 +; AVX2-NEXT: # %bb.23: +; AVX2-NEXT: addb $127, %al +; AVX2-NEXT: movl %eax, %ebp +; AVX2-NEXT: .LBB1_24: +; AVX2-NEXT: vpextrb $4, %xmm1, %ecx +; AVX2-NEXT: vpextrb $4, %xmm0, %r11d +; AVX2-NEXT: movl %r11d, %eax +; AVX2-NEXT: subb %cl, %al +; AVX2-NEXT: setns %al +; AVX2-NEXT: subb %cl, %r11b +; AVX2-NEXT: jno .LBB1_26 +; AVX2-NEXT: # %bb.25: +; AVX2-NEXT: addb $127, %al +; AVX2-NEXT: movl %eax, %r11d +; AVX2-NEXT: .LBB1_26: +; AVX2-NEXT: vpextrb $3, %xmm1, %ecx +; AVX2-NEXT: vpextrb $3, %xmm0, %r14d +; AVX2-NEXT: movl %r14d, %eax +; AVX2-NEXT: subb %cl, %al +; AVX2-NEXT: setns %al +; AVX2-NEXT: subb %cl, %r14b +; AVX2-NEXT: movl %ebp, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX2-NEXT: jno .LBB1_28 +; AVX2-NEXT: # %bb.27: +; AVX2-NEXT: addb $127, %al +; AVX2-NEXT: movl %eax, %r14d +; AVX2-NEXT: .LBB1_28: +; AVX2-NEXT: vpextrb $2, %xmm1, %ecx +; AVX2-NEXT: vpextrb $2, %xmm0, %r8d +; AVX2-NEXT: movl %r8d, %eax +; AVX2-NEXT: subb %cl, %al +; AVX2-NEXT: setns %al +; AVX2-NEXT: subb %cl, %r8b +; AVX2-NEXT: jno .LBB1_30 +; AVX2-NEXT: # %bb.29: +; AVX2-NEXT: addb $127, %al +; AVX2-NEXT: movl %eax, %r8d +; AVX2-NEXT: .LBB1_30: +; AVX2-NEXT: vpextrb $0, %xmm1, %ecx +; AVX2-NEXT: vpextrb $0, %xmm0, %r10d +; AVX2-NEXT: movl %r10d, %eax +; AVX2-NEXT: subb %cl, %al +; AVX2-NEXT: setns %al +; AVX2-NEXT: subb %cl, %r10b +; AVX2-NEXT: jno .LBB1_32 +; AVX2-NEXT: # %bb.31: +; AVX2-NEXT: addb $127, %al +; AVX2-NEXT: movl %eax, %r10d +; AVX2-NEXT: .LBB1_32: +; AVX2-NEXT: vpextrb $1, %xmm1, %ecx +; AVX2-NEXT: vpextrb $1, %xmm0, %r9d +; AVX2-NEXT: movl %r9d, %eax +; AVX2-NEXT: subb %cl, %al +; AVX2-NEXT: setns %al +; AVX2-NEXT: subb %cl, %r9b +; AVX2-NEXT: jno .LBB1_34 +; AVX2-NEXT: # %bb.33: +; AVX2-NEXT: addb $127, %al +; AVX2-NEXT: movl %eax, %r9d +; AVX2-NEXT: .LBB1_34: +; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm1 +; AVX2-NEXT: vpextrb $15, %xmm1, %ecx +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX2-NEXT: vpextrb $15, %xmm0, %ebx +; AVX2-NEXT: movl %ebx, %eax +; AVX2-NEXT: subb %cl, %al +; AVX2-NEXT: setns %al +; AVX2-NEXT: movl %esi, %r12d +; AVX2-NEXT: subb %cl, %bl +; AVX2-NEXT: jno .LBB1_36 +; AVX2-NEXT: # %bb.35: +; AVX2-NEXT: addb $127, %al +; AVX2-NEXT: movl %eax, %ebx +; AVX2-NEXT: .LBB1_36: +; AVX2-NEXT: vpextrb $14, %xmm1, %ecx +; AVX2-NEXT: vpextrb $14, %xmm0, %esi +; AVX2-NEXT: movl %esi, %eax +; AVX2-NEXT: subb %cl, %al +; AVX2-NEXT: setns %al +; AVX2-NEXT: movl %edx, %r13d +; AVX2-NEXT: subb %cl, %sil +; AVX2-NEXT: jno .LBB1_38 +; AVX2-NEXT: # %bb.37: +; AVX2-NEXT: addb $127, %al +; AVX2-NEXT: movl %eax, %esi +; AVX2-NEXT: .LBB1_38: +; AVX2-NEXT: vpextrb $13, 
%xmm1, %ecx +; AVX2-NEXT: vpextrb $13, %xmm0, %edx +; AVX2-NEXT: movl %edx, %eax +; AVX2-NEXT: subb %cl, %al +; AVX2-NEXT: setns %al +; AVX2-NEXT: movl %edi, %ebp +; AVX2-NEXT: subb %cl, %dl +; AVX2-NEXT: jno .LBB1_40 +; AVX2-NEXT: # %bb.39: +; AVX2-NEXT: addb $127, %al +; AVX2-NEXT: movl %eax, %edx +; AVX2-NEXT: .LBB1_40: +; AVX2-NEXT: vpextrb $12, %xmm1, %ecx +; AVX2-NEXT: vpextrb $12, %xmm0, %edi +; AVX2-NEXT: movl %edi, %eax +; AVX2-NEXT: subb %cl, %al +; AVX2-NEXT: setns %al +; AVX2-NEXT: subb %cl, %dil +; AVX2-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX2-NEXT: movl %edx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX2-NEXT: jo .LBB1_41 +; AVX2-NEXT: # %bb.42: +; AVX2-NEXT: movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX2-NEXT: jmp .LBB1_43 +; AVX2-NEXT: .LBB1_41: +; AVX2-NEXT: addb $127, %al +; AVX2-NEXT: # kill: def $al killed $al def $eax +; AVX2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX2-NEXT: .LBB1_43: +; AVX2-NEXT: movl %ebp, %edi +; AVX2-NEXT: vpextrb $11, %xmm1, %ecx +; AVX2-NEXT: vpextrb $11, %xmm0, %r15d +; AVX2-NEXT: movl %r15d, %eax +; AVX2-NEXT: subb %cl, %al +; AVX2-NEXT: setns %al +; AVX2-NEXT: subb %cl, %r15b +; AVX2-NEXT: movl %r12d, %esi +; AVX2-NEXT: movl %r13d, %edx +; AVX2-NEXT: jno .LBB1_45 +; AVX2-NEXT: # %bb.44: +; AVX2-NEXT: addb $127, %al +; AVX2-NEXT: movl %eax, %r15d +; AVX2-NEXT: .LBB1_45: +; AVX2-NEXT: vpextrb $10, %xmm1, %ecx +; AVX2-NEXT: vpextrb $10, %xmm0, %r12d +; AVX2-NEXT: movl %r12d, %eax +; AVX2-NEXT: subb %cl, %al +; AVX2-NEXT: setns %al +; AVX2-NEXT: subb %cl, %r12b +; AVX2-NEXT: jno .LBB1_47 +; AVX2-NEXT: # %bb.46: +; AVX2-NEXT: addb $127, %al +; AVX2-NEXT: movl %eax, %r12d +; AVX2-NEXT: .LBB1_47: +; AVX2-NEXT: vpextrb $9, %xmm1, %ecx +; AVX2-NEXT: vpextrb $9, %xmm0, %r13d +; AVX2-NEXT: movl %r13d, %eax +; AVX2-NEXT: subb %cl, %al +; AVX2-NEXT: setns %al +; AVX2-NEXT: subb %cl, %r13b +; AVX2-NEXT: movl %r11d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX2-NEXT: movl %r14d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX2-NEXT: movl %r8d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX2-NEXT: movl %r10d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX2-NEXT: movl %r9d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX2-NEXT: movl %ebx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX2-NEXT: jno .LBB1_49 +; AVX2-NEXT: # %bb.48: +; AVX2-NEXT: addb $127, %al +; AVX2-NEXT: movl %eax, %r13d +; AVX2-NEXT: .LBB1_49: +; AVX2-NEXT: vpextrb $8, %xmm1, %ecx +; AVX2-NEXT: vpextrb $8, %xmm0, %r11d +; AVX2-NEXT: movl %r11d, %eax +; AVX2-NEXT: subb %cl, %al +; AVX2-NEXT: setns %al +; AVX2-NEXT: subb %cl, %r11b +; AVX2-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX2-NEXT: movl %edx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX2-NEXT: movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX2-NEXT: jno .LBB1_51 +; AVX2-NEXT: # %bb.50: +; AVX2-NEXT: addb $127, %al +; AVX2-NEXT: movl %eax, %r11d +; AVX2-NEXT: .LBB1_51: +; AVX2-NEXT: vpextrb $7, %xmm1, %ecx +; AVX2-NEXT: vpextrb $7, %xmm0, %r10d +; AVX2-NEXT: movl %r10d, %eax +; AVX2-NEXT: subb %cl, %al +; AVX2-NEXT: setns %al +; AVX2-NEXT: subb %cl, %r10b +; AVX2-NEXT: jno .LBB1_53 +; AVX2-NEXT: # %bb.52: +; AVX2-NEXT: addb $127, %al +; AVX2-NEXT: movl %eax, %r10d +; AVX2-NEXT: .LBB1_53: +; AVX2-NEXT: vpextrb $6, %xmm1, %ecx +; AVX2-NEXT: vpextrb $6, %xmm0, %ebp +; AVX2-NEXT: movl %ebp, %eax +; AVX2-NEXT: subb %cl, %al +; AVX2-NEXT: setns %al +; AVX2-NEXT: subb %cl, %bpl +; AVX2-NEXT: jno .LBB1_55 +; AVX2-NEXT: # %bb.54: +; AVX2-NEXT: addb $127, %al 
+; AVX2-NEXT: movl %eax, %ebp +; AVX2-NEXT: .LBB1_55: +; AVX2-NEXT: vpextrb $5, %xmm1, %ecx +; AVX2-NEXT: vpextrb $5, %xmm0, %edi +; AVX2-NEXT: movl %edi, %eax +; AVX2-NEXT: subb %cl, %al +; AVX2-NEXT: setns %al +; AVX2-NEXT: subb %cl, %dil +; AVX2-NEXT: jno .LBB1_57 +; AVX2-NEXT: # %bb.56: +; AVX2-NEXT: addb $127, %al +; AVX2-NEXT: movl %eax, %edi +; AVX2-NEXT: .LBB1_57: +; AVX2-NEXT: vpextrb $4, %xmm1, %edx +; AVX2-NEXT: vpextrb $4, %xmm0, %eax +; AVX2-NEXT: movl %eax, %ecx +; AVX2-NEXT: subb %dl, %cl +; AVX2-NEXT: setns %cl +; AVX2-NEXT: subb %dl, %al +; AVX2-NEXT: jno .LBB1_59 +; AVX2-NEXT: # %bb.58: +; AVX2-NEXT: addb $127, %cl +; AVX2-NEXT: movl %ecx, %eax +; AVX2-NEXT: .LBB1_59: +; AVX2-NEXT: vpextrb $3, %xmm1, %ebx +; AVX2-NEXT: vpextrb $3, %xmm0, %ecx +; AVX2-NEXT: movl %ecx, %edx +; AVX2-NEXT: subb %bl, %dl +; AVX2-NEXT: setns %dl +; AVX2-NEXT: subb %bl, %cl +; AVX2-NEXT: jno .LBB1_61 +; AVX2-NEXT: # %bb.60: +; AVX2-NEXT: addb $127, %dl +; AVX2-NEXT: movl %edx, %ecx +; AVX2-NEXT: .LBB1_61: +; AVX2-NEXT: vpextrb $2, %xmm1, %esi +; AVX2-NEXT: vpextrb $2, %xmm0, %edx +; AVX2-NEXT: movl %edx, %ebx +; AVX2-NEXT: subb %sil, %bl +; AVX2-NEXT: setns %bl +; AVX2-NEXT: subb %sil, %dl +; AVX2-NEXT: jno .LBB1_63 +; AVX2-NEXT: # %bb.62: +; AVX2-NEXT: addb $127, %bl +; AVX2-NEXT: movl %ebx, %edx +; AVX2-NEXT: .LBB1_63: +; AVX2-NEXT: vpextrb $0, %xmm1, %esi +; AVX2-NEXT: vpextrb $0, %xmm0, %r8d +; AVX2-NEXT: movl %r8d, %ebx +; AVX2-NEXT: subb %sil, %bl +; AVX2-NEXT: setns %bl +; AVX2-NEXT: subb %sil, %r8b +; AVX2-NEXT: jo .LBB1_64 +; AVX2-NEXT: # %bb.65: +; AVX2-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %r14d # 4-byte Reload +; AVX2-NEXT: jmp .LBB1_66 +; AVX2-NEXT: .LBB1_64: +; AVX2-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %r14d # 4-byte Reload +; AVX2-NEXT: addb $127, %bl +; AVX2-NEXT: movl %ebx, %r8d +; AVX2-NEXT: .LBB1_66: +; AVX2-NEXT: vpextrb $1, %xmm1, %esi +; AVX2-NEXT: vpextrb $1, %xmm0, %r9d +; AVX2-NEXT: movl %r9d, %ebx +; AVX2-NEXT: subb %sil, %bl +; AVX2-NEXT: setns %bl +; AVX2-NEXT: subb %sil, %r9b +; AVX2-NEXT: jno .LBB1_68 +; AVX2-NEXT: # %bb.67: +; AVX2-NEXT: addb $127, %bl +; AVX2-NEXT: movl %ebx, %r9d +; AVX2-NEXT: .LBB1_68: +; AVX2-NEXT: movzbl %r8b, %esi +; AVX2-NEXT: vmovd %esi, %xmm0 +; AVX2-NEXT: movzbl %r9b, %esi +; AVX2-NEXT: vpinsrb $1, %esi, %xmm0, %xmm0 +; AVX2-NEXT: movzbl %dl, %edx +; AVX2-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; AVX2-NEXT: movzbl %cl, %ecx +; AVX2-NEXT: vpinsrb $3, %ecx, %xmm0, %xmm0 +; AVX2-NEXT: movzbl %al, %eax +; AVX2-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 +; AVX2-NEXT: movzbl %dil, %eax +; AVX2-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; AVX2-NEXT: movzbl %bpl, %eax +; AVX2-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 +; AVX2-NEXT: movzbl %r10b, %eax +; AVX2-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; AVX2-NEXT: movzbl %r11b, %eax +; AVX2-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0 +; AVX2-NEXT: movzbl %r13b, %eax +; AVX2-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0 +; AVX2-NEXT: movzbl %r12b, %eax +; AVX2-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0 +; AVX2-NEXT: movzbl %r15b, %eax +; AVX2-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; AVX2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX2-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0 +; AVX2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX2-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0 +; AVX2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX2-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0 +; AVX2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX2-NEXT: vpinsrb $15, %eax, 
%xmm0, %xmm0 +; AVX2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX2-NEXT: vmovd %eax, %xmm1 +; AVX2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX2-NEXT: vpinsrb $1, %eax, %xmm1, %xmm1 +; AVX2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX2-NEXT: vpinsrb $2, %eax, %xmm1, %xmm1 +; AVX2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX2-NEXT: vpinsrb $3, %eax, %xmm1, %xmm1 +; AVX2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX2-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 +; AVX2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX2-NEXT: vpinsrb $5, %eax, %xmm1, %xmm1 +; AVX2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX2-NEXT: vpinsrb $6, %eax, %xmm1, %xmm1 +; AVX2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX2-NEXT: vpinsrb $7, %eax, %xmm1, %xmm1 +; AVX2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX2-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; AVX2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX2-NEXT: vpinsrb $9, %eax, %xmm1, %xmm1 +; AVX2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX2-NEXT: vpinsrb $10, %eax, %xmm1, %xmm1 +; AVX2-NEXT: movzbl %r14b, %eax +; AVX2-NEXT: vpinsrb $11, %eax, %xmm1, %xmm1 +; AVX2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX2-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 +; AVX2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX2-NEXT: vpinsrb $13, %eax, %xmm1, %xmm1 +; AVX2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX2-NEXT: vpinsrb $14, %eax, %xmm1, %xmm1 +; AVX2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX2-NEXT: vpinsrb $15, %eax, %xmm1, %xmm1 +; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 +; AVX2-NEXT: popq %rbx +; AVX2-NEXT: popq %r12 +; AVX2-NEXT: popq %r13 +; AVX2-NEXT: popq %r14 +; AVX2-NEXT: popq %r15 +; AVX2-NEXT: popq %rbp +; AVX2-NEXT: retq +; +; AVX512-LABEL: v32i8: +; AVX512: # %bb.0: +; AVX512-NEXT: pushq %rbp +; AVX512-NEXT: pushq %r15 +; AVX512-NEXT: pushq %r14 +; AVX512-NEXT: pushq %r13 +; AVX512-NEXT: pushq %r12 +; AVX512-NEXT: pushq %rbx +; AVX512-NEXT: vpextrb $15, %xmm1, %ecx +; AVX512-NEXT: vpextrb $15, %xmm0, %edx +; AVX512-NEXT: movl %edx, %eax +; AVX512-NEXT: subb %cl, %al +; AVX512-NEXT: setns %al +; AVX512-NEXT: subb %cl, %dl +; AVX512-NEXT: jo .LBB1_1 +; AVX512-NEXT: # %bb.2: +; AVX512-NEXT: movl %edx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX512-NEXT: jmp .LBB1_3 +; AVX512-NEXT: .LBB1_1: +; AVX512-NEXT: addb $127, %al +; AVX512-NEXT: # kill: def $al killed $al def $eax +; AVX512-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX512-NEXT: .LBB1_3: +; AVX512-NEXT: vpextrb $14, %xmm1, %ecx +; AVX512-NEXT: vpextrb $14, %xmm0, %edx +; AVX512-NEXT: movl %edx, %eax +; AVX512-NEXT: subb %cl, %al +; AVX512-NEXT: setns %al +; AVX512-NEXT: subb %cl, %dl +; AVX512-NEXT: jno .LBB1_5 +; AVX512-NEXT: # %bb.4: +; AVX512-NEXT: addb $127, %al +; AVX512-NEXT: movl %eax, %edx +; AVX512-NEXT: .LBB1_5: +; AVX512-NEXT: vpextrb $13, %xmm1, %ecx +; AVX512-NEXT: vpextrb $13, %xmm0, %esi +; AVX512-NEXT: movl %esi, %eax +; AVX512-NEXT: subb %cl, %al +; AVX512-NEXT: setns %al +; AVX512-NEXT: subb %cl, %sil +; AVX512-NEXT: jo .LBB1_6 +; AVX512-NEXT: # %bb.7: +; AVX512-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX512-NEXT: jmp .LBB1_8 +; 
AVX512-NEXT: .LBB1_6: +; AVX512-NEXT: addb $127, %al +; AVX512-NEXT: # kill: def $al killed $al def $eax +; AVX512-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX512-NEXT: .LBB1_8: +; AVX512-NEXT: vpextrb $12, %xmm1, %ecx +; AVX512-NEXT: vpextrb $12, %xmm0, %esi +; AVX512-NEXT: movl %esi, %eax +; AVX512-NEXT: subb %cl, %al +; AVX512-NEXT: setns %al +; AVX512-NEXT: subb %cl, %sil +; AVX512-NEXT: jno .LBB1_10 +; AVX512-NEXT: # %bb.9: +; AVX512-NEXT: addb $127, %al +; AVX512-NEXT: movl %eax, %esi +; AVX512-NEXT: .LBB1_10: +; AVX512-NEXT: vpextrb $11, %xmm1, %ecx +; AVX512-NEXT: vpextrb $11, %xmm0, %edi +; AVX512-NEXT: movl %edi, %eax +; AVX512-NEXT: subb %cl, %al +; AVX512-NEXT: setns %al +; AVX512-NEXT: subb %cl, %dil +; AVX512-NEXT: jno .LBB1_12 +; AVX512-NEXT: # %bb.11: +; AVX512-NEXT: addb $127, %al +; AVX512-NEXT: movl %eax, %edi +; AVX512-NEXT: .LBB1_12: +; AVX512-NEXT: vpextrb $10, %xmm1, %ecx +; AVX512-NEXT: vpextrb $10, %xmm0, %ebp +; AVX512-NEXT: movl %ebp, %eax +; AVX512-NEXT: subb %cl, %al +; AVX512-NEXT: setns %al +; AVX512-NEXT: subb %cl, %bpl +; AVX512-NEXT: jno .LBB1_14 +; AVX512-NEXT: # %bb.13: +; AVX512-NEXT: addb $127, %al +; AVX512-NEXT: movl %eax, %ebp +; AVX512-NEXT: .LBB1_14: +; AVX512-NEXT: vpextrb $9, %xmm1, %ecx +; AVX512-NEXT: vpextrb $9, %xmm0, %ebx +; AVX512-NEXT: movl %ebx, %eax +; AVX512-NEXT: subb %cl, %al +; AVX512-NEXT: setns %al +; AVX512-NEXT: subb %cl, %bl +; AVX512-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX512-NEXT: jno .LBB1_16 +; AVX512-NEXT: # %bb.15: +; AVX512-NEXT: addb $127, %al +; AVX512-NEXT: movl %eax, %ebx +; AVX512-NEXT: .LBB1_16: +; AVX512-NEXT: vpextrb $8, %xmm1, %ecx +; AVX512-NEXT: vpextrb $8, %xmm0, %esi +; AVX512-NEXT: movl %esi, %eax +; AVX512-NEXT: subb %cl, %al +; AVX512-NEXT: setns %al +; AVX512-NEXT: subb %cl, %sil +; AVX512-NEXT: movl %edx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX512-NEXT: movl %ebp, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX512-NEXT: movl %ebx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX512-NEXT: jno .LBB1_18 +; AVX512-NEXT: # %bb.17: +; AVX512-NEXT: addb $127, %al +; AVX512-NEXT: movl %eax, %esi +; AVX512-NEXT: .LBB1_18: +; AVX512-NEXT: vpextrb $7, %xmm1, %ecx +; AVX512-NEXT: vpextrb $7, %xmm0, %edx +; AVX512-NEXT: movl %edx, %eax +; AVX512-NEXT: subb %cl, %al +; AVX512-NEXT: setns %al +; AVX512-NEXT: subb %cl, %dl +; AVX512-NEXT: movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX512-NEXT: jno .LBB1_20 +; AVX512-NEXT: # %bb.19: +; AVX512-NEXT: addb $127, %al +; AVX512-NEXT: movl %eax, %edx +; AVX512-NEXT: .LBB1_20: +; AVX512-NEXT: vpextrb $6, %xmm1, %ecx +; AVX512-NEXT: vpextrb $6, %xmm0, %edi +; AVX512-NEXT: movl %edi, %eax +; AVX512-NEXT: subb %cl, %al +; AVX512-NEXT: setns %al +; AVX512-NEXT: subb %cl, %dil +; AVX512-NEXT: jno .LBB1_22 +; AVX512-NEXT: # %bb.21: +; AVX512-NEXT: addb $127, %al +; AVX512-NEXT: movl %eax, %edi +; AVX512-NEXT: .LBB1_22: +; AVX512-NEXT: vpextrb $5, %xmm1, %ecx +; AVX512-NEXT: vpextrb $5, %xmm0, %ebp +; AVX512-NEXT: movl %ebp, %eax +; AVX512-NEXT: subb %cl, %al +; AVX512-NEXT: setns %al +; AVX512-NEXT: subb %cl, %bpl +; AVX512-NEXT: jno .LBB1_24 +; AVX512-NEXT: # %bb.23: +; AVX512-NEXT: addb $127, %al +; AVX512-NEXT: movl %eax, %ebp +; AVX512-NEXT: .LBB1_24: +; AVX512-NEXT: vpextrb $4, %xmm1, %ecx +; AVX512-NEXT: vpextrb $4, %xmm0, %r11d +; AVX512-NEXT: movl %r11d, %eax +; AVX512-NEXT: subb %cl, %al +; AVX512-NEXT: setns %al +; AVX512-NEXT: subb %cl, %r11b +; AVX512-NEXT: jno .LBB1_26 +; AVX512-NEXT: # %bb.25: +; AVX512-NEXT: 
addb $127, %al +; AVX512-NEXT: movl %eax, %r11d +; AVX512-NEXT: .LBB1_26: +; AVX512-NEXT: vpextrb $3, %xmm1, %ecx +; AVX512-NEXT: vpextrb $3, %xmm0, %r14d +; AVX512-NEXT: movl %r14d, %eax +; AVX512-NEXT: subb %cl, %al +; AVX512-NEXT: setns %al +; AVX512-NEXT: subb %cl, %r14b +; AVX512-NEXT: movl %ebp, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX512-NEXT: jno .LBB1_28 +; AVX512-NEXT: # %bb.27: +; AVX512-NEXT: addb $127, %al +; AVX512-NEXT: movl %eax, %r14d +; AVX512-NEXT: .LBB1_28: +; AVX512-NEXT: vpextrb $2, %xmm1, %ecx +; AVX512-NEXT: vpextrb $2, %xmm0, %r8d +; AVX512-NEXT: movl %r8d, %eax +; AVX512-NEXT: subb %cl, %al +; AVX512-NEXT: setns %al +; AVX512-NEXT: subb %cl, %r8b +; AVX512-NEXT: jno .LBB1_30 +; AVX512-NEXT: # %bb.29: +; AVX512-NEXT: addb $127, %al +; AVX512-NEXT: movl %eax, %r8d +; AVX512-NEXT: .LBB1_30: +; AVX512-NEXT: vpextrb $0, %xmm1, %ecx +; AVX512-NEXT: vpextrb $0, %xmm0, %r10d +; AVX512-NEXT: movl %r10d, %eax +; AVX512-NEXT: subb %cl, %al +; AVX512-NEXT: setns %al +; AVX512-NEXT: subb %cl, %r10b +; AVX512-NEXT: jno .LBB1_32 +; AVX512-NEXT: # %bb.31: +; AVX512-NEXT: addb $127, %al +; AVX512-NEXT: movl %eax, %r10d +; AVX512-NEXT: .LBB1_32: +; AVX512-NEXT: vpextrb $1, %xmm1, %ecx +; AVX512-NEXT: vpextrb $1, %xmm0, %r9d +; AVX512-NEXT: movl %r9d, %eax +; AVX512-NEXT: subb %cl, %al +; AVX512-NEXT: setns %al +; AVX512-NEXT: subb %cl, %r9b +; AVX512-NEXT: jno .LBB1_34 +; AVX512-NEXT: # %bb.33: +; AVX512-NEXT: addb $127, %al +; AVX512-NEXT: movl %eax, %r9d +; AVX512-NEXT: .LBB1_34: +; AVX512-NEXT: vextracti128 $1, %ymm1, %xmm1 +; AVX512-NEXT: vpextrb $15, %xmm1, %ecx +; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX512-NEXT: vpextrb $15, %xmm0, %ebx +; AVX512-NEXT: movl %ebx, %eax +; AVX512-NEXT: subb %cl, %al +; AVX512-NEXT: setns %al +; AVX512-NEXT: movl %esi, %r12d +; AVX512-NEXT: subb %cl, %bl +; AVX512-NEXT: jno .LBB1_36 +; AVX512-NEXT: # %bb.35: +; AVX512-NEXT: addb $127, %al +; AVX512-NEXT: movl %eax, %ebx +; AVX512-NEXT: .LBB1_36: +; AVX512-NEXT: vpextrb $14, %xmm1, %ecx +; AVX512-NEXT: vpextrb $14, %xmm0, %esi +; AVX512-NEXT: movl %esi, %eax +; AVX512-NEXT: subb %cl, %al +; AVX512-NEXT: setns %al +; AVX512-NEXT: movl %edx, %r13d +; AVX512-NEXT: subb %cl, %sil +; AVX512-NEXT: jno .LBB1_38 +; AVX512-NEXT: # %bb.37: +; AVX512-NEXT: addb $127, %al +; AVX512-NEXT: movl %eax, %esi +; AVX512-NEXT: .LBB1_38: +; AVX512-NEXT: vpextrb $13, %xmm1, %ecx +; AVX512-NEXT: vpextrb $13, %xmm0, %edx +; AVX512-NEXT: movl %edx, %eax +; AVX512-NEXT: subb %cl, %al +; AVX512-NEXT: setns %al +; AVX512-NEXT: movl %edi, %ebp +; AVX512-NEXT: subb %cl, %dl +; AVX512-NEXT: jno .LBB1_40 +; AVX512-NEXT: # %bb.39: +; AVX512-NEXT: addb $127, %al +; AVX512-NEXT: movl %eax, %edx +; AVX512-NEXT: .LBB1_40: +; AVX512-NEXT: vpextrb $12, %xmm1, %ecx +; AVX512-NEXT: vpextrb $12, %xmm0, %edi +; AVX512-NEXT: movl %edi, %eax +; AVX512-NEXT: subb %cl, %al +; AVX512-NEXT: setns %al +; AVX512-NEXT: subb %cl, %dil +; AVX512-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX512-NEXT: movl %edx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX512-NEXT: jo .LBB1_41 +; AVX512-NEXT: # %bb.42: +; AVX512-NEXT: movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX512-NEXT: jmp .LBB1_43 +; AVX512-NEXT: .LBB1_41: +; AVX512-NEXT: addb $127, %al +; AVX512-NEXT: # kill: def $al killed $al def $eax +; AVX512-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX512-NEXT: .LBB1_43: +; AVX512-NEXT: movl %ebp, %edi +; AVX512-NEXT: vpextrb $11, %xmm1, %ecx +; AVX512-NEXT: vpextrb $11, %xmm0, %r15d +; 
AVX512-NEXT: movl %r15d, %eax +; AVX512-NEXT: subb %cl, %al +; AVX512-NEXT: setns %al +; AVX512-NEXT: subb %cl, %r15b +; AVX512-NEXT: movl %r12d, %esi +; AVX512-NEXT: movl %r13d, %edx +; AVX512-NEXT: jno .LBB1_45 +; AVX512-NEXT: # %bb.44: +; AVX512-NEXT: addb $127, %al +; AVX512-NEXT: movl %eax, %r15d +; AVX512-NEXT: .LBB1_45: +; AVX512-NEXT: vpextrb $10, %xmm1, %ecx +; AVX512-NEXT: vpextrb $10, %xmm0, %r12d +; AVX512-NEXT: movl %r12d, %eax +; AVX512-NEXT: subb %cl, %al +; AVX512-NEXT: setns %al +; AVX512-NEXT: subb %cl, %r12b +; AVX512-NEXT: jno .LBB1_47 +; AVX512-NEXT: # %bb.46: +; AVX512-NEXT: addb $127, %al +; AVX512-NEXT: movl %eax, %r12d +; AVX512-NEXT: .LBB1_47: +; AVX512-NEXT: vpextrb $9, %xmm1, %ecx +; AVX512-NEXT: vpextrb $9, %xmm0, %r13d +; AVX512-NEXT: movl %r13d, %eax +; AVX512-NEXT: subb %cl, %al +; AVX512-NEXT: setns %al +; AVX512-NEXT: subb %cl, %r13b +; AVX512-NEXT: movl %r11d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX512-NEXT: movl %r14d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX512-NEXT: movl %r8d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX512-NEXT: movl %r10d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX512-NEXT: movl %r9d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX512-NEXT: movl %ebx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX512-NEXT: jno .LBB1_49 +; AVX512-NEXT: # %bb.48: +; AVX512-NEXT: addb $127, %al +; AVX512-NEXT: movl %eax, %r13d +; AVX512-NEXT: .LBB1_49: +; AVX512-NEXT: vpextrb $8, %xmm1, %ecx +; AVX512-NEXT: vpextrb $8, %xmm0, %r11d +; AVX512-NEXT: movl %r11d, %eax +; AVX512-NEXT: subb %cl, %al +; AVX512-NEXT: setns %al +; AVX512-NEXT: subb %cl, %r11b +; AVX512-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX512-NEXT: movl %edx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX512-NEXT: movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX512-NEXT: jno .LBB1_51 +; AVX512-NEXT: # %bb.50: +; AVX512-NEXT: addb $127, %al +; AVX512-NEXT: movl %eax, %r11d +; AVX512-NEXT: .LBB1_51: +; AVX512-NEXT: vpextrb $7, %xmm1, %ecx +; AVX512-NEXT: vpextrb $7, %xmm0, %r10d +; AVX512-NEXT: movl %r10d, %eax +; AVX512-NEXT: subb %cl, %al +; AVX512-NEXT: setns %al +; AVX512-NEXT: subb %cl, %r10b +; AVX512-NEXT: jno .LBB1_53 +; AVX512-NEXT: # %bb.52: +; AVX512-NEXT: addb $127, %al +; AVX512-NEXT: movl %eax, %r10d +; AVX512-NEXT: .LBB1_53: +; AVX512-NEXT: vpextrb $6, %xmm1, %ecx +; AVX512-NEXT: vpextrb $6, %xmm0, %ebp +; AVX512-NEXT: movl %ebp, %eax +; AVX512-NEXT: subb %cl, %al +; AVX512-NEXT: setns %al +; AVX512-NEXT: subb %cl, %bpl +; AVX512-NEXT: jno .LBB1_55 +; AVX512-NEXT: # %bb.54: +; AVX512-NEXT: addb $127, %al +; AVX512-NEXT: movl %eax, %ebp +; AVX512-NEXT: .LBB1_55: +; AVX512-NEXT: vpextrb $5, %xmm1, %ecx +; AVX512-NEXT: vpextrb $5, %xmm0, %edi +; AVX512-NEXT: movl %edi, %eax +; AVX512-NEXT: subb %cl, %al +; AVX512-NEXT: setns %al +; AVX512-NEXT: subb %cl, %dil +; AVX512-NEXT: jno .LBB1_57 +; AVX512-NEXT: # %bb.56: +; AVX512-NEXT: addb $127, %al +; AVX512-NEXT: movl %eax, %edi +; AVX512-NEXT: .LBB1_57: +; AVX512-NEXT: vpextrb $4, %xmm1, %edx +; AVX512-NEXT: vpextrb $4, %xmm0, %eax +; AVX512-NEXT: movl %eax, %ecx +; AVX512-NEXT: subb %dl, %cl +; AVX512-NEXT: setns %cl +; AVX512-NEXT: subb %dl, %al +; AVX512-NEXT: jno .LBB1_59 +; AVX512-NEXT: # %bb.58: +; AVX512-NEXT: addb $127, %cl +; AVX512-NEXT: movl %ecx, %eax +; AVX512-NEXT: .LBB1_59: +; AVX512-NEXT: vpextrb $3, %xmm1, %ebx +; AVX512-NEXT: vpextrb $3, %xmm0, %ecx +; AVX512-NEXT: movl %ecx, %edx +; AVX512-NEXT: subb %bl, %dl +; AVX512-NEXT: setns %dl +; AVX512-NEXT: subb %bl, %cl +; 
AVX512-NEXT: jno .LBB1_61 +; AVX512-NEXT: # %bb.60: +; AVX512-NEXT: addb $127, %dl +; AVX512-NEXT: movl %edx, %ecx +; AVX512-NEXT: .LBB1_61: +; AVX512-NEXT: vpextrb $2, %xmm1, %esi +; AVX512-NEXT: vpextrb $2, %xmm0, %edx +; AVX512-NEXT: movl %edx, %ebx +; AVX512-NEXT: subb %sil, %bl +; AVX512-NEXT: setns %bl +; AVX512-NEXT: subb %sil, %dl +; AVX512-NEXT: jno .LBB1_63 +; AVX512-NEXT: # %bb.62: +; AVX512-NEXT: addb $127, %bl +; AVX512-NEXT: movl %ebx, %edx +; AVX512-NEXT: .LBB1_63: +; AVX512-NEXT: vpextrb $0, %xmm1, %esi +; AVX512-NEXT: vpextrb $0, %xmm0, %r8d +; AVX512-NEXT: movl %r8d, %ebx +; AVX512-NEXT: subb %sil, %bl +; AVX512-NEXT: setns %bl +; AVX512-NEXT: subb %sil, %r8b +; AVX512-NEXT: jo .LBB1_64 +; AVX512-NEXT: # %bb.65: +; AVX512-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %r14d # 4-byte Reload +; AVX512-NEXT: jmp .LBB1_66 +; AVX512-NEXT: .LBB1_64: +; AVX512-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %r14d # 4-byte Reload +; AVX512-NEXT: addb $127, %bl +; AVX512-NEXT: movl %ebx, %r8d +; AVX512-NEXT: .LBB1_66: +; AVX512-NEXT: vpextrb $1, %xmm1, %esi +; AVX512-NEXT: vpextrb $1, %xmm0, %r9d +; AVX512-NEXT: movl %r9d, %ebx +; AVX512-NEXT: subb %sil, %bl +; AVX512-NEXT: setns %bl +; AVX512-NEXT: subb %sil, %r9b +; AVX512-NEXT: jno .LBB1_68 +; AVX512-NEXT: # %bb.67: +; AVX512-NEXT: addb $127, %bl +; AVX512-NEXT: movl %ebx, %r9d +; AVX512-NEXT: .LBB1_68: +; AVX512-NEXT: movzbl %r8b, %esi +; AVX512-NEXT: vmovd %esi, %xmm0 +; AVX512-NEXT: movzbl %r9b, %esi +; AVX512-NEXT: vpinsrb $1, %esi, %xmm0, %xmm0 +; AVX512-NEXT: movzbl %dl, %edx +; AVX512-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; AVX512-NEXT: movzbl %cl, %ecx +; AVX512-NEXT: vpinsrb $3, %ecx, %xmm0, %xmm0 +; AVX512-NEXT: movzbl %al, %eax +; AVX512-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 +; AVX512-NEXT: movzbl %dil, %eax +; AVX512-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; AVX512-NEXT: movzbl %bpl, %eax +; AVX512-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 +; AVX512-NEXT: movzbl %r10b, %eax +; AVX512-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; AVX512-NEXT: movzbl %r11b, %eax +; AVX512-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0 +; AVX512-NEXT: movzbl %r13b, %eax +; AVX512-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0 +; AVX512-NEXT: movzbl %r12b, %eax +; AVX512-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0 +; AVX512-NEXT: movzbl %r15b, %eax +; AVX512-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; AVX512-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX512-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0 +; AVX512-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX512-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0 +; AVX512-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX512-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0 +; AVX512-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX512-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; AVX512-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX512-NEXT: vmovd %eax, %xmm1 +; AVX512-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX512-NEXT: vpinsrb $1, %eax, %xmm1, %xmm1 +; AVX512-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX512-NEXT: vpinsrb $2, %eax, %xmm1, %xmm1 +; AVX512-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX512-NEXT: vpinsrb $3, %eax, %xmm1, %xmm1 +; AVX512-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX512-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 +; AVX512-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX512-NEXT: vpinsrb $5, %eax, %xmm1, 
%xmm1 +; AVX512-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX512-NEXT: vpinsrb $6, %eax, %xmm1, %xmm1 +; AVX512-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX512-NEXT: vpinsrb $7, %eax, %xmm1, %xmm1 +; AVX512-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX512-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; AVX512-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX512-NEXT: vpinsrb $9, %eax, %xmm1, %xmm1 +; AVX512-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX512-NEXT: vpinsrb $10, %eax, %xmm1, %xmm1 +; AVX512-NEXT: movzbl %r14b, %eax +; AVX512-NEXT: vpinsrb $11, %eax, %xmm1, %xmm1 +; AVX512-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX512-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 +; AVX512-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX512-NEXT: vpinsrb $13, %eax, %xmm1, %xmm1 +; AVX512-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX512-NEXT: vpinsrb $14, %eax, %xmm1, %xmm1 +; AVX512-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX512-NEXT: vpinsrb $15, %eax, %xmm1, %xmm1 +; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 +; AVX512-NEXT: popq %rbx +; AVX512-NEXT: popq %r12 +; AVX512-NEXT: popq %r13 +; AVX512-NEXT: popq %r14 +; AVX512-NEXT: popq %r15 +; AVX512-NEXT: popq %rbp +; AVX512-NEXT: retq + %z = call <32 x i8> @llvm.ssub.sat.v32i8(<32 x i8> %x, <32 x i8> %y) + ret <32 x i8> %z +} + +define <64 x i8> @v64i8(<64 x i8> %x, <64 x i8> %y) nounwind { +; SSE2-LABEL: v64i8: +; SSE2: # %bb.0: +; SSE2-NEXT: pushq %rbp +; SSE2-NEXT: pushq %r15 +; SSE2-NEXT: pushq %r14 +; SSE2-NEXT: pushq %r13 +; SSE2-NEXT: pushq %r12 +; SSE2-NEXT: pushq %rbx +; SSE2-NEXT: subq $232, %rsp +; SSE2-NEXT: movaps %xmm5, (%rsp) +; SSE2-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) +; SSE2-NEXT: movb (%rsp), %cl +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %dl +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %sil +; SSE2-NEXT: movl %edx, %eax +; SSE2-NEXT: subb %cl, %al +; SSE2-NEXT: setns %al +; SSE2-NEXT: subb %cl, %dl +; SSE2-NEXT: jno .LBB2_2 +; SSE2-NEXT: # %bb.1: +; SSE2-NEXT: addb $127, %al +; SSE2-NEXT: movl %eax, %edx +; SSE2-NEXT: .LBB2_2: +; SSE2-NEXT: movb {{[0-9]+}}(%rsp), %cl +; SSE2-NEXT: movl %esi, %eax +; SSE2-NEXT: subb %cl, %al +; SSE2-NEXT: setns %al +; SSE2-NEXT: subb %cl, %sil +; SSE2-NEXT: jno .LBB2_4 +; SSE2-NEXT: # %bb.3: +; SSE2-NEXT: addb $127, %al +; SSE2-NEXT: movl %eax, %esi +; SSE2-NEXT: .LBB2_4: +; SSE2-NEXT: movb {{[0-9]+}}(%rsp), %cl +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %dil +; SSE2-NEXT: movl %edi, %eax +; SSE2-NEXT: subb %cl, %al +; SSE2-NEXT: setns %al +; SSE2-NEXT: subb %cl, %dil +; SSE2-NEXT: jno .LBB2_6 +; SSE2-NEXT: # %bb.5: +; SSE2-NEXT: addb $127, %al +; SSE2-NEXT: movl %eax, %edi +; SSE2-NEXT: .LBB2_6: +; SSE2-NEXT: movb {{[0-9]+}}(%rsp), %cl +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %r11b +; SSE2-NEXT: movl %r11d, %eax +; SSE2-NEXT: subb %cl, %al +; SSE2-NEXT: setns %al +; SSE2-NEXT: subb %cl, %r11b +; SSE2-NEXT: jno .LBB2_8 +; SSE2-NEXT: # %bb.7: +; SSE2-NEXT: addb $127, %al +; SSE2-NEXT: movl %eax, %r11d +; SSE2-NEXT: .LBB2_8: +; SSE2-NEXT: movb {{[0-9]+}}(%rsp), %cl +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %r9b +; SSE2-NEXT: movl %r9d, %eax +; SSE2-NEXT: subb %cl, %al +; SSE2-NEXT: setns %al +; SSE2-NEXT: subb %cl, %r9b +; SSE2-NEXT: jno .LBB2_10 +; SSE2-NEXT: # %bb.9: +; SSE2-NEXT: addb $127, %al +; SSE2-NEXT: movl %eax, %r9d +; SSE2-NEXT: .LBB2_10: +; SSE2-NEXT: movb 
{{[0-9]+}}(%rsp), %cl +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %r10b +; SSE2-NEXT: movl %r10d, %eax +; SSE2-NEXT: subb %cl, %al +; SSE2-NEXT: setns %al +; SSE2-NEXT: subb %cl, %r10b +; SSE2-NEXT: jno .LBB2_12 +; SSE2-NEXT: # %bb.11: +; SSE2-NEXT: addb $127, %al +; SSE2-NEXT: movl %eax, %r10d +; SSE2-NEXT: .LBB2_12: +; SSE2-NEXT: movb {{[0-9]+}}(%rsp), %cl +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %bl +; SSE2-NEXT: movl %ebx, %eax +; SSE2-NEXT: subb %cl, %al +; SSE2-NEXT: setns %al +; SSE2-NEXT: subb %cl, %bl +; SSE2-NEXT: movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSE2-NEXT: jno .LBB2_14 +; SSE2-NEXT: # %bb.13: +; SSE2-NEXT: addb $127, %al +; SSE2-NEXT: movl %eax, %ebx +; SSE2-NEXT: .LBB2_14: +; SSE2-NEXT: movb {{[0-9]+}}(%rsp), %cl +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %dl +; SSE2-NEXT: movl %edx, %eax +; SSE2-NEXT: subb %cl, %al +; SSE2-NEXT: setns %al +; SSE2-NEXT: subb %cl, %dl +; SSE2-NEXT: movb %bl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSE2-NEXT: jo .LBB2_15 +; SSE2-NEXT: # %bb.16: +; SSE2-NEXT: movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSE2-NEXT: jmp .LBB2_17 +; SSE2-NEXT: .LBB2_15: +; SSE2-NEXT: addb $127, %al +; SSE2-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSE2-NEXT: .LBB2_17: +; SSE2-NEXT: movb {{[0-9]+}}(%rsp), %cl +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %dl +; SSE2-NEXT: movl %edx, %eax +; SSE2-NEXT: subb %cl, %al +; SSE2-NEXT: setns %al +; SSE2-NEXT: subb %cl, %dl +; SSE2-NEXT: jno .LBB2_19 +; SSE2-NEXT: # %bb.18: +; SSE2-NEXT: addb $127, %al +; SSE2-NEXT: movl %eax, %edx +; SSE2-NEXT: .LBB2_19: +; SSE2-NEXT: movb {{[0-9]+}}(%rsp), %cl +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %r12b +; SSE2-NEXT: movl %r12d, %eax +; SSE2-NEXT: subb %cl, %al +; SSE2-NEXT: setns %al +; SSE2-NEXT: subb %cl, %r12b +; SSE2-NEXT: jno .LBB2_21 +; SSE2-NEXT: # %bb.20: +; SSE2-NEXT: addb $127, %al +; SSE2-NEXT: movl %eax, %r12d +; SSE2-NEXT: .LBB2_21: +; SSE2-NEXT: movb {{[0-9]+}}(%rsp), %cl +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %bl +; SSE2-NEXT: movl %ebx, %eax +; SSE2-NEXT: subb %cl, %al +; SSE2-NEXT: setns %al +; SSE2-NEXT: subb %cl, %bl +; SSE2-NEXT: movb %r9b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSE2-NEXT: jno .LBB2_23 +; SSE2-NEXT: # %bb.22: +; SSE2-NEXT: addb $127, %al +; SSE2-NEXT: movl %eax, %ebx +; SSE2-NEXT: .LBB2_23: +; SSE2-NEXT: movb %bl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSE2-NEXT: movb {{[0-9]+}}(%rsp), %cl +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %r9b +; SSE2-NEXT: movl %r9d, %eax +; SSE2-NEXT: subb %cl, %al +; SSE2-NEXT: setns %al +; SSE2-NEXT: subb %cl, %r9b +; SSE2-NEXT: jno .LBB2_25 +; SSE2-NEXT: # %bb.24: +; SSE2-NEXT: addb $127, %al +; SSE2-NEXT: movl %eax, %r9d +; SSE2-NEXT: .LBB2_25: +; SSE2-NEXT: movl %edi, %r8d +; SSE2-NEXT: movb {{[0-9]+}}(%rsp), %cl +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %bl +; SSE2-NEXT: movl %ebx, %eax +; SSE2-NEXT: subb %cl, %al +; SSE2-NEXT: setns %al +; SSE2-NEXT: subb %cl, %bl +; SSE2-NEXT: jo .LBB2_26 +; SSE2-NEXT: # %bb.27: +; SSE2-NEXT: movb %bl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSE2-NEXT: movl %esi, %edi +; SSE2-NEXT: jmp .LBB2_28 +; SSE2-NEXT: .LBB2_26: +; SSE2-NEXT: movl %esi, %edi +; SSE2-NEXT: addb $127, %al +; SSE2-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSE2-NEXT: .LBB2_28: +; SSE2-NEXT: movb {{[0-9]+}}(%rsp), %cl +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %sil +; SSE2-NEXT: movl %esi, %eax +; SSE2-NEXT: subb %cl, %al +; SSE2-NEXT: setns %al +; SSE2-NEXT: subb %cl, %sil +; SSE2-NEXT: movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSE2-NEXT: jno .LBB2_30 +; SSE2-NEXT: 
# %bb.29: +; SSE2-NEXT: addb $127, %al +; SSE2-NEXT: movl %eax, %esi +; SSE2-NEXT: .LBB2_30: +; SSE2-NEXT: movb {{[0-9]+}}(%rsp), %cl +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %dl +; SSE2-NEXT: movl %edx, %eax +; SSE2-NEXT: subb %cl, %al +; SSE2-NEXT: setns %al +; SSE2-NEXT: subb %cl, %dl +; SSE2-NEXT: jno .LBB2_32 +; SSE2-NEXT: # %bb.31: +; SSE2-NEXT: addb $127, %al +; SSE2-NEXT: movl %eax, %edx +; SSE2-NEXT: .LBB2_32: +; SSE2-NEXT: movb {{[0-9]+}}(%rsp), %cl +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %bl +; SSE2-NEXT: movl %ebx, %eax +; SSE2-NEXT: subb %cl, %al +; SSE2-NEXT: setns %al +; SSE2-NEXT: subb %cl, %bl +; SSE2-NEXT: movaps %xmm6, {{[0-9]+}}(%rsp) +; SSE2-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) +; SSE2-NEXT: movb %r10b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSE2-NEXT: movb %r12b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSE2-NEXT: movb %r9b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSE2-NEXT: movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSE2-NEXT: jno .LBB2_34 +; SSE2-NEXT: # %bb.33: +; SSE2-NEXT: addb $127, %al +; SSE2-NEXT: movl %eax, %ebx +; SSE2-NEXT: .LBB2_34: +; SSE2-NEXT: movb {{[0-9]+}}(%rsp), %cl +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %dl +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %r9b +; SSE2-NEXT: movl %edx, %eax +; SSE2-NEXT: subb %cl, %al +; SSE2-NEXT: setns %al +; SSE2-NEXT: subb %cl, %dl +; SSE2-NEXT: jno .LBB2_36 +; SSE2-NEXT: # %bb.35: +; SSE2-NEXT: addb $127, %al +; SSE2-NEXT: movl %eax, %edx +; SSE2-NEXT: .LBB2_36: +; SSE2-NEXT: movb {{[0-9]+}}(%rsp), %cl +; SSE2-NEXT: movl %r9d, %eax +; SSE2-NEXT: subb %cl, %al +; SSE2-NEXT: setns %al +; SSE2-NEXT: subb %cl, %r9b +; SSE2-NEXT: movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSE2-NEXT: movb %sil, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSE2-NEXT: jno .LBB2_38 +; SSE2-NEXT: # %bb.37: +; SSE2-NEXT: addb $127, %al +; SSE2-NEXT: movl %eax, %r9d +; SSE2-NEXT: .LBB2_38: +; SSE2-NEXT: movb {{[0-9]+}}(%rsp), %cl +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %dl +; SSE2-NEXT: movl %edx, %eax +; SSE2-NEXT: subb %cl, %al +; SSE2-NEXT: setns %al +; SSE2-NEXT: subb %cl, %dl +; SSE2-NEXT: movb %dil, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSE2-NEXT: movb %r8b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSE2-NEXT: movb %bl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSE2-NEXT: jno .LBB2_40 +; SSE2-NEXT: # %bb.39: +; SSE2-NEXT: addb $127, %al +; SSE2-NEXT: movl %eax, %edx +; SSE2-NEXT: .LBB2_40: +; SSE2-NEXT: movb {{[0-9]+}}(%rsp), %cl +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %bpl +; SSE2-NEXT: movl %ebp, %eax +; SSE2-NEXT: subb %cl, %al +; SSE2-NEXT: setns %al +; SSE2-NEXT: subb %cl, %bpl +; SSE2-NEXT: movb %r11b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSE2-NEXT: jno .LBB2_42 +; SSE2-NEXT: # %bb.41: +; SSE2-NEXT: addb $127, %al +; SSE2-NEXT: movl %eax, %ebp +; SSE2-NEXT: .LBB2_42: +; SSE2-NEXT: movb {{[0-9]+}}(%rsp), %cl +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %sil +; SSE2-NEXT: movl %esi, %eax +; SSE2-NEXT: subb %cl, %al +; SSE2-NEXT: setns %al +; SSE2-NEXT: subb %cl, %sil +; SSE2-NEXT: movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSE2-NEXT: jno .LBB2_44 +; SSE2-NEXT: # %bb.43: +; SSE2-NEXT: addb $127, %al +; SSE2-NEXT: movl %eax, %esi +; SSE2-NEXT: .LBB2_44: +; SSE2-NEXT: movb {{[0-9]+}}(%rsp), %cl +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %dl +; SSE2-NEXT: movl %edx, %eax +; SSE2-NEXT: subb %cl, %al +; SSE2-NEXT: setns %al +; SSE2-NEXT: subb %cl, %dl +; SSE2-NEXT: jno .LBB2_46 +; SSE2-NEXT: # %bb.45: +; SSE2-NEXT: addb $127, %al +; SSE2-NEXT: movl %eax, %edx +; SSE2-NEXT: .LBB2_46: +; SSE2-NEXT: movb 
{{[0-9]+}}(%rsp), %cl +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %dil +; SSE2-NEXT: movl %edi, %eax +; SSE2-NEXT: subb %cl, %al +; SSE2-NEXT: setns %al +; SSE2-NEXT: subb %cl, %dil +; SSE2-NEXT: jno .LBB2_48 +; SSE2-NEXT: # %bb.47: +; SSE2-NEXT: addb $127, %al +; SSE2-NEXT: movl %eax, %edi +; SSE2-NEXT: .LBB2_48: +; SSE2-NEXT: movb {{[0-9]+}}(%rsp), %cl +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %bl +; SSE2-NEXT: movl %ebx, %eax +; SSE2-NEXT: subb %cl, %al +; SSE2-NEXT: setns %al +; SSE2-NEXT: subb %cl, %bl +; SSE2-NEXT: movb %bl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSE2-NEXT: movb %sil, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSE2-NEXT: movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSE2-NEXT: jno .LBB2_50 +; SSE2-NEXT: # %bb.49: +; SSE2-NEXT: addb $127, %al +; SSE2-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSE2-NEXT: .LBB2_50: +; SSE2-NEXT: movb {{[0-9]+}}(%rsp), %cl +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %sil +; SSE2-NEXT: movl %esi, %eax +; SSE2-NEXT: subb %cl, %al +; SSE2-NEXT: setns %al +; SSE2-NEXT: subb %cl, %sil +; SSE2-NEXT: movb %dil, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSE2-NEXT: jno .LBB2_52 +; SSE2-NEXT: # %bb.51: +; SSE2-NEXT: addb $127, %al +; SSE2-NEXT: movl %eax, %esi +; SSE2-NEXT: .LBB2_52: +; SSE2-NEXT: movb {{[0-9]+}}(%rsp), %cl +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %dil +; SSE2-NEXT: movl %edi, %eax +; SSE2-NEXT: subb %cl, %al +; SSE2-NEXT: setns %al +; SSE2-NEXT: subb %cl, %dil +; SSE2-NEXT: jno .LBB2_54 +; SSE2-NEXT: # %bb.53: +; SSE2-NEXT: addb $127, %al +; SSE2-NEXT: movl %eax, %edi +; SSE2-NEXT: .LBB2_54: +; SSE2-NEXT: movb {{[0-9]+}}(%rsp), %cl +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %r14b +; SSE2-NEXT: movl %r14d, %eax +; SSE2-NEXT: subb %cl, %al +; SSE2-NEXT: setns %al +; SSE2-NEXT: subb %cl, %r14b +; SSE2-NEXT: movb %r9b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSE2-NEXT: jno .LBB2_56 +; SSE2-NEXT: # %bb.55: +; SSE2-NEXT: addb $127, %al +; SSE2-NEXT: movl %eax, %r14d +; SSE2-NEXT: .LBB2_56: +; SSE2-NEXT: movb {{[0-9]+}}(%rsp), %cl +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %r9b +; SSE2-NEXT: movl %r9d, %eax +; SSE2-NEXT: subb %cl, %al +; SSE2-NEXT: setns %al +; SSE2-NEXT: subb %cl, %r9b +; SSE2-NEXT: movb %bpl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSE2-NEXT: jno .LBB2_58 +; SSE2-NEXT: # %bb.57: +; SSE2-NEXT: addb $127, %al +; SSE2-NEXT: movl %eax, %r9d +; SSE2-NEXT: .LBB2_58: +; SSE2-NEXT: movb {{[0-9]+}}(%rsp), %cl +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %dl +; SSE2-NEXT: movl %edx, %eax +; SSE2-NEXT: subb %cl, %al +; SSE2-NEXT: setns %al +; SSE2-NEXT: subb %cl, %dl +; SSE2-NEXT: movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSE2-NEXT: jno .LBB2_60 +; SSE2-NEXT: # %bb.59: +; SSE2-NEXT: addb $127, %al +; SSE2-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSE2-NEXT: .LBB2_60: +; SSE2-NEXT: movb {{[0-9]+}}(%rsp), %cl +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %dl +; SSE2-NEXT: movl %edx, %eax +; SSE2-NEXT: subb %cl, %al +; SSE2-NEXT: setns %al +; SSE2-NEXT: subb %cl, %dl +; SSE2-NEXT: jno .LBB2_62 +; SSE2-NEXT: # %bb.61: +; SSE2-NEXT: addb $127, %al +; SSE2-NEXT: movl %eax, %edx +; SSE2-NEXT: .LBB2_62: +; SSE2-NEXT: movb {{[0-9]+}}(%rsp), %cl +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %r15b +; SSE2-NEXT: movl %r15d, %eax +; SSE2-NEXT: subb %cl, %al +; SSE2-NEXT: setns %al +; SSE2-NEXT: subb %cl, %r15b +; SSE2-NEXT: jno .LBB2_64 +; SSE2-NEXT: # %bb.63: +; SSE2-NEXT: addb $127, %al +; SSE2-NEXT: movl %eax, %r15d +; SSE2-NEXT: .LBB2_64: +; SSE2-NEXT: movb {{[0-9]+}}(%rsp), %cl +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %bl +; 
SSE2-NEXT: movl %ebx, %eax +; SSE2-NEXT: subb %cl, %al +; SSE2-NEXT: setns %al +; SSE2-NEXT: subb %cl, %bl +; SSE2-NEXT: movaps %xmm7, {{[0-9]+}}(%rsp) +; SSE2-NEXT: movaps %xmm3, -{{[0-9]+}}(%rsp) +; SSE2-NEXT: movb %r14b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSE2-NEXT: movb %r9b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSE2-NEXT: movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSE2-NEXT: jo .LBB2_65 +; SSE2-NEXT: # %bb.66: +; SSE2-NEXT: movb %bl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSE2-NEXT: jmp .LBB2_67 +; SSE2-NEXT: .LBB2_65: +; SSE2-NEXT: addb $127, %al +; SSE2-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSE2-NEXT: .LBB2_67: +; SSE2-NEXT: movb {{[0-9]+}}(%rsp), %cl +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %r11b +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %dl +; SSE2-NEXT: movl %r11d, %eax +; SSE2-NEXT: subb %cl, %al +; SSE2-NEXT: setns %al +; SSE2-NEXT: subb %cl, %r11b +; SSE2-NEXT: jno .LBB2_69 +; SSE2-NEXT: # %bb.68: +; SSE2-NEXT: addb $127, %al +; SSE2-NEXT: movl %eax, %r11d +; SSE2-NEXT: .LBB2_69: +; SSE2-NEXT: movb {{[0-9]+}}(%rsp), %cl +; SSE2-NEXT: movl %edx, %eax +; SSE2-NEXT: subb %cl, %al +; SSE2-NEXT: setns %al +; SSE2-NEXT: subb %cl, %dl +; SSE2-NEXT: movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSE2-NEXT: movb %r15b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSE2-NEXT: jno .LBB2_71 +; SSE2-NEXT: # %bb.70: +; SSE2-NEXT: addb $127, %al +; SSE2-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSE2-NEXT: .LBB2_71: +; SSE2-NEXT: movb {{[0-9]+}}(%rsp), %cl +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %r12b +; SSE2-NEXT: movl %r12d, %eax +; SSE2-NEXT: subb %cl, %al +; SSE2-NEXT: setns %al +; SSE2-NEXT: subb %cl, %r12b +; SSE2-NEXT: jno .LBB2_73 +; SSE2-NEXT: # %bb.72: +; SSE2-NEXT: addb $127, %al +; SSE2-NEXT: movl %eax, %r12d +; SSE2-NEXT: .LBB2_73: +; SSE2-NEXT: movb {{[0-9]+}}(%rsp), %cl +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %r14b +; SSE2-NEXT: movl %r14d, %eax +; SSE2-NEXT: subb %cl, %al +; SSE2-NEXT: setns %al +; SSE2-NEXT: subb %cl, %r14b +; SSE2-NEXT: jno .LBB2_75 +; SSE2-NEXT: # %bb.74: +; SSE2-NEXT: addb $127, %al +; SSE2-NEXT: movl %eax, %r14d +; SSE2-NEXT: .LBB2_75: +; SSE2-NEXT: movb {{[0-9]+}}(%rsp), %cl +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %r15b +; SSE2-NEXT: movl %r15d, %eax +; SSE2-NEXT: subb %cl, %al +; SSE2-NEXT: setns %al +; SSE2-NEXT: subb %cl, %r15b +; SSE2-NEXT: jno .LBB2_77 +; SSE2-NEXT: # %bb.76: +; SSE2-NEXT: addb $127, %al +; SSE2-NEXT: movl %eax, %r15d +; SSE2-NEXT: .LBB2_77: +; SSE2-NEXT: movb {{[0-9]+}}(%rsp), %cl +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %bpl +; SSE2-NEXT: movl %ebp, %eax +; SSE2-NEXT: subb %cl, %al +; SSE2-NEXT: setns %al +; SSE2-NEXT: subb %cl, %bpl +; SSE2-NEXT: jno .LBB2_79 +; SSE2-NEXT: # %bb.78: +; SSE2-NEXT: addb $127, %al +; SSE2-NEXT: movl %eax, %ebp +; SSE2-NEXT: .LBB2_79: +; SSE2-NEXT: movb {{[0-9]+}}(%rsp), %cl +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %r13b +; SSE2-NEXT: movl %r13d, %eax +; SSE2-NEXT: subb %cl, %al +; SSE2-NEXT: setns %al +; SSE2-NEXT: subb %cl, %r13b +; SSE2-NEXT: jno .LBB2_81 +; SSE2-NEXT: # %bb.80: +; SSE2-NEXT: addb $127, %al +; SSE2-NEXT: movl %eax, %r13d +; SSE2-NEXT: .LBB2_81: +; SSE2-NEXT: movb {{[0-9]+}}(%rsp), %cl +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %dl +; SSE2-NEXT: movl %edx, %eax +; SSE2-NEXT: subb %cl, %al +; SSE2-NEXT: setns %al +; SSE2-NEXT: subb %cl, %dl +; SSE2-NEXT: movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSE2-NEXT: jno .LBB2_83 +; SSE2-NEXT: # %bb.82: +; SSE2-NEXT: addb $127, %al +; SSE2-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte 
Spill +; SSE2-NEXT: .LBB2_83: +; SSE2-NEXT: movb {{[0-9]+}}(%rsp), %cl +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %dl +; SSE2-NEXT: movl %edx, %eax +; SSE2-NEXT: subb %cl, %al +; SSE2-NEXT: setns %al +; SSE2-NEXT: subb %cl, %dl +; SSE2-NEXT: jo .LBB2_84 +; SSE2-NEXT: # %bb.85: +; SSE2-NEXT: movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSE2-NEXT: jmp .LBB2_86 +; SSE2-NEXT: .LBB2_84: +; SSE2-NEXT: addb $127, %al +; SSE2-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSE2-NEXT: .LBB2_86: +; SSE2-NEXT: movb {{[0-9]+}}(%rsp), %cl +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %dl +; SSE2-NEXT: movl %edx, %eax +; SSE2-NEXT: subb %cl, %al +; SSE2-NEXT: setns %al +; SSE2-NEXT: subb %cl, %dl +; SSE2-NEXT: movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSE2-NEXT: jno .LBB2_88 +; SSE2-NEXT: # %bb.87: +; SSE2-NEXT: addb $127, %al +; SSE2-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSE2-NEXT: .LBB2_88: +; SSE2-NEXT: movb {{[0-9]+}}(%rsp), %cl +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %dl +; SSE2-NEXT: movl %edx, %eax +; SSE2-NEXT: subb %cl, %al +; SSE2-NEXT: setns %al +; SSE2-NEXT: subb %cl, %dl +; SSE2-NEXT: jo .LBB2_89 +; SSE2-NEXT: # %bb.90: +; SSE2-NEXT: movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSE2-NEXT: jmp .LBB2_91 +; SSE2-NEXT: .LBB2_89: +; SSE2-NEXT: addb $127, %al +; SSE2-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSE2-NEXT: .LBB2_91: +; SSE2-NEXT: movb {{[0-9]+}}(%rsp), %cl +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %dl +; SSE2-NEXT: movl %edx, %eax +; SSE2-NEXT: subb %cl, %al +; SSE2-NEXT: setns %al +; SSE2-NEXT: subb %cl, %dl +; SSE2-NEXT: movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSE2-NEXT: jno .LBB2_93 +; SSE2-NEXT: # %bb.92: +; SSE2-NEXT: addb $127, %al +; SSE2-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSE2-NEXT: .LBB2_93: +; SSE2-NEXT: movb {{[0-9]+}}(%rsp), %cl +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %r8b +; SSE2-NEXT: movl %r8d, %eax +; SSE2-NEXT: subb %cl, %al +; SSE2-NEXT: setns %al +; SSE2-NEXT: subb %cl, %r8b +; SSE2-NEXT: jno .LBB2_95 +; SSE2-NEXT: # %bb.94: +; SSE2-NEXT: addb $127, %al +; SSE2-NEXT: movl %eax, %r8d +; SSE2-NEXT: .LBB2_95: +; SSE2-NEXT: movb {{[0-9]+}}(%rsp), %cl +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %dl +; SSE2-NEXT: movl %edx, %eax +; SSE2-NEXT: subb %cl, %al +; SSE2-NEXT: setns %al +; SSE2-NEXT: subb %cl, %dl +; SSE2-NEXT: movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSE2-NEXT: jno .LBB2_97 +; SSE2-NEXT: # %bb.96: +; SSE2-NEXT: addb $127, %al +; SSE2-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSE2-NEXT: .LBB2_97: +; SSE2-NEXT: movb {{[0-9]+}}(%rsp), %cl +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %r9b +; SSE2-NEXT: movl %r9d, %eax +; SSE2-NEXT: subb %cl, %al +; SSE2-NEXT: setns %al +; SSE2-NEXT: subb %cl, %r9b +; SSE2-NEXT: jno .LBB2_99 +; SSE2-NEXT: # %bb.98: +; SSE2-NEXT: addb $127, %al +; SSE2-NEXT: movl %eax, %r9d +; SSE2-NEXT: .LBB2_99: +; SSE2-NEXT: movb {{[0-9]+}}(%rsp), %cl +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %dl +; SSE2-NEXT: movl %edx, %eax +; SSE2-NEXT: subb %cl, %al +; SSE2-NEXT: setns %al +; SSE2-NEXT: subb %cl, %dl +; SSE2-NEXT: movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSE2-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp) +; SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; SSE2-NEXT: movb %r12b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSE2-NEXT: jno .LBB2_101 +; SSE2-NEXT: # %bb.100: +; SSE2-NEXT: addb $127, %al +; SSE2-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSE2-NEXT: .LBB2_101: +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSE2-NEXT: 
movb -{{[0-9]+}}(%rsp), %r12b +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %r10b +; SSE2-NEXT: movl %r12d, %eax +; SSE2-NEXT: subb %cl, %al +; SSE2-NEXT: setns %al +; SSE2-NEXT: subb %cl, %r12b +; SSE2-NEXT: jno .LBB2_103 +; SSE2-NEXT: # %bb.102: +; SSE2-NEXT: addb $127, %al +; SSE2-NEXT: movl %eax, %r12d +; SSE2-NEXT: .LBB2_103: +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSE2-NEXT: movl %r10d, %eax +; SSE2-NEXT: subb %cl, %al +; SSE2-NEXT: setns %al +; SSE2-NEXT: subb %cl, %r10b +; SSE2-NEXT: jno .LBB2_105 +; SSE2-NEXT: # %bb.104: +; SSE2-NEXT: addb $127, %al +; SSE2-NEXT: movl %eax, %r10d +; SSE2-NEXT: .LBB2_105: +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %dl +; SSE2-NEXT: movl %edx, %eax +; SSE2-NEXT: subb %cl, %al +; SSE2-NEXT: setns %al +; SSE2-NEXT: subb %cl, %dl +; SSE2-NEXT: jno .LBB2_107 +; SSE2-NEXT: # %bb.106: +; SSE2-NEXT: addb $127, %al +; SSE2-NEXT: movl %eax, %edx +; SSE2-NEXT: .LBB2_107: +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %bl +; SSE2-NEXT: movl %ebx, %eax +; SSE2-NEXT: subb %cl, %al +; SSE2-NEXT: setns %al +; SSE2-NEXT: subb %cl, %bl +; SSE2-NEXT: movb %bl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSE2-NEXT: movb %r14b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSE2-NEXT: movb %r15b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSE2-NEXT: movb %bpl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSE2-NEXT: jno .LBB2_109 +; SSE2-NEXT: # %bb.108: +; SSE2-NEXT: addb $127, %al +; SSE2-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSE2-NEXT: .LBB2_109: +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %r14b +; SSE2-NEXT: movl %r14d, %eax +; SSE2-NEXT: subb %cl, %al +; SSE2-NEXT: setns %al +; SSE2-NEXT: subb %cl, %r14b +; SSE2-NEXT: movb %r13b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSE2-NEXT: jno .LBB2_111 +; SSE2-NEXT: # %bb.110: +; SSE2-NEXT: addb $127, %al +; SSE2-NEXT: movl %eax, %r14d +; SSE2-NEXT: .LBB2_111: +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %r15b +; SSE2-NEXT: movl %r15d, %eax +; SSE2-NEXT: subb %cl, %al +; SSE2-NEXT: setns %al +; SSE2-NEXT: subb %cl, %r15b +; SSE2-NEXT: movb %r12b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSE2-NEXT: movb %r10b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSE2-NEXT: movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSE2-NEXT: jno .LBB2_113 +; SSE2-NEXT: # %bb.112: +; SSE2-NEXT: addb $127, %al +; SSE2-NEXT: movl %eax, %r15d +; SSE2-NEXT: .LBB2_113: +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %r12b +; SSE2-NEXT: movl %r12d, %eax +; SSE2-NEXT: subb %cl, %al +; SSE2-NEXT: setns %al +; SSE2-NEXT: subb %cl, %r12b +; SSE2-NEXT: movl %r8d, %edx +; SSE2-NEXT: jno .LBB2_115 +; SSE2-NEXT: # %bb.114: +; SSE2-NEXT: addb $127, %al +; SSE2-NEXT: movl %eax, %r12d +; SSE2-NEXT: .LBB2_115: +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %r13b +; SSE2-NEXT: movl %r13d, %eax +; SSE2-NEXT: subb %cl, %al +; SSE2-NEXT: setns %al +; SSE2-NEXT: subb %cl, %r13b +; SSE2-NEXT: movb %sil, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSE2-NEXT: movb %dil, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSE2-NEXT: movb %r11b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSE2-NEXT: jno .LBB2_117 +; SSE2-NEXT: # %bb.116: +; SSE2-NEXT: addb $127, %al +; SSE2-NEXT: movl %eax, %r13d +; SSE2-NEXT: .LBB2_117: +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %sil +; SSE2-NEXT: movl %esi, %eax +; SSE2-NEXT: subb %cl, %al 
+; SSE2-NEXT: setns %al +; SSE2-NEXT: subb %cl, %sil +; SSE2-NEXT: jno .LBB2_119 +; SSE2-NEXT: # %bb.118: +; SSE2-NEXT: addb $127, %al +; SSE2-NEXT: movl %eax, %esi +; SSE2-NEXT: .LBB2_119: +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %dil +; SSE2-NEXT: movl %edi, %eax +; SSE2-NEXT: subb %cl, %al +; SSE2-NEXT: setns %al +; SSE2-NEXT: subb %cl, %dil +; SSE2-NEXT: jno .LBB2_121 +; SSE2-NEXT: # %bb.120: +; SSE2-NEXT: addb $127, %al +; SSE2-NEXT: movl %eax, %edi +; SSE2-NEXT: .LBB2_121: +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %r8b +; SSE2-NEXT: movl %r8d, %eax +; SSE2-NEXT: subb %cl, %al +; SSE2-NEXT: setns %al +; SSE2-NEXT: subb %cl, %r8b +; SSE2-NEXT: jno .LBB2_123 +; SSE2-NEXT: # %bb.122: +; SSE2-NEXT: addb $127, %al +; SSE2-NEXT: movl %eax, %r8d +; SSE2-NEXT: .LBB2_123: +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %r10b +; SSE2-NEXT: movl %r10d, %eax +; SSE2-NEXT: subb %cl, %al +; SSE2-NEXT: setns %al +; SSE2-NEXT: subb %cl, %r10b +; SSE2-NEXT: movb {{[-0-9]+}}(%r{{[sb]}}p), %bpl # 1-byte Reload +; SSE2-NEXT: jno .LBB2_125 +; SSE2-NEXT: # %bb.124: +; SSE2-NEXT: addb $127, %al +; SSE2-NEXT: movl %eax, %r10d +; SSE2-NEXT: .LBB2_125: +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %r11b +; SSE2-NEXT: movl %r11d, %eax +; SSE2-NEXT: subb %cl, %al +; SSE2-NEXT: setns %al +; SSE2-NEXT: subb %cl, %r11b +; SSE2-NEXT: jno .LBB2_127 +; SSE2-NEXT: # %bb.126: +; SSE2-NEXT: addb $127, %al +; SSE2-NEXT: movl %eax, %r11d +; SSE2-NEXT: .LBB2_127: +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %bl +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSE2-NEXT: movl %ecx, %eax +; SSE2-NEXT: subb %bl, %al +; SSE2-NEXT: setns %al +; SSE2-NEXT: subb %bl, %cl +; SSE2-NEXT: movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSE2-NEXT: movb %r9b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSE2-NEXT: jno .LBB2_129 +; SSE2-NEXT: # %bb.128: +; SSE2-NEXT: addb $127, %al +; SSE2-NEXT: movl %eax, %ecx +; SSE2-NEXT: .LBB2_129: +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %dl +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %bl +; SSE2-NEXT: movl %ebx, %eax +; SSE2-NEXT: subb %dl, %al +; SSE2-NEXT: setns %al +; SSE2-NEXT: subb %dl, %bl +; SSE2-NEXT: jno .LBB2_131 +; SSE2-NEXT: # %bb.130: +; SSE2-NEXT: addb $127, %al +; SSE2-NEXT: movl %eax, %ebx +; SSE2-NEXT: .LBB2_131: +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %dl +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %r9b +; SSE2-NEXT: movl %r9d, %eax +; SSE2-NEXT: subb %dl, %al +; SSE2-NEXT: setns %al +; SSE2-NEXT: subb %dl, %r9b +; SSE2-NEXT: jno .LBB2_133 +; SSE2-NEXT: # %bb.132: +; SSE2-NEXT: addb $127, %al +; SSE2-NEXT: movl %eax, %r9d +; SSE2-NEXT: .LBB2_133: +; SSE2-NEXT: movl %ebp, %eax +; SSE2-NEXT: movb {{[-0-9]+}}(%r{{[sb]}}p), %dl # 1-byte Reload +; SSE2-NEXT: movzbl %r9b, %ebp +; SSE2-NEXT: movl %ebp, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE2-NEXT: movzbl %bl, %ebp +; SSE2-NEXT: movl %ebp, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE2-NEXT: movzbl %cl, %ecx +; SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE2-NEXT: movzbl %r11b, %ecx +; SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE2-NEXT: movzbl %r10b, %ecx +; SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE2-NEXT: movzbl %r8b, %ecx +; SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE2-NEXT: movzbl %dil, %ecx +; SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE2-NEXT: movzbl %sil, %ecx +; SSE2-NEXT: movl %ecx, 
{{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE2-NEXT: movzbl %r13b, %ecx +; SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE2-NEXT: movzbl %r12b, %ecx +; SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE2-NEXT: movzbl %r15b, %ecx +; SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE2-NEXT: movzbl %r14b, %ecx +; SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload +; SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload +; SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload +; SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload +; SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload +; SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload +; SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload +; SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload +; SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload +; SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload +; SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE2-NEXT: movzbl %dl, %ecx +; SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload +; SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload +; SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload +; SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload +; SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload +; SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload +; SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload +; SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload +; SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE2-NEXT: movzbl %al, %eax +; SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r13d # 1-byte Folded Reload +; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r12d # 1-byte Folded Reload +; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r15d # 1-byte Folded Reload +; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r14d # 1-byte Folded Reload +; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ebp # 1-byte Folded Reload +; SSE2-NEXT: movzbl 
{{[-0-9]+}}(%r{{[sb]}}p), %ebx # 1-byte Folded Reload +; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r11d # 1-byte Folded Reload +; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r9d # 1-byte Folded Reload +; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload +; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %edx # 1-byte Folded Reload +; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 1-byte Folded Reload +; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 1-byte Folded Reload +; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r8d # 1-byte Folded Reload +; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r10d # 1-byte Folded Reload +; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 4-byte Folded Reload +; SSE2-NEXT: # xmm14 = mem[0],zero,zero,zero +; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 4-byte Folded Reload +; SSE2-NEXT: # xmm2 = mem[0],zero,zero,zero +; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 4-byte Folded Reload +; SSE2-NEXT: # xmm12 = mem[0],zero,zero,zero +; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 4-byte Folded Reload +; SSE2-NEXT: # xmm4 = mem[0],zero,zero,zero +; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 4-byte Folded Reload +; SSE2-NEXT: # xmm8 = mem[0],zero,zero,zero +; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 4-byte Folded Reload +; SSE2-NEXT: # xmm6 = mem[0],zero,zero,zero +; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 4-byte Folded Reload +; SSE2-NEXT: # xmm10 = mem[0],zero,zero,zero +; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 4-byte Folded Reload +; SSE2-NEXT: # xmm5 = mem[0],zero,zero,zero +; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 4-byte Folded Reload +; SSE2-NEXT: # xmm13 = mem[0],zero,zero,zero +; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 4-byte Folded Reload +; SSE2-NEXT: # xmm7 = mem[0],zero,zero,zero +; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 4-byte Folded Reload +; SSE2-NEXT: # xmm11 = mem[0],zero,zero,zero +; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 4-byte Folded Reload +; SSE2-NEXT: # xmm3 = mem[0],zero,zero,zero +; SSE2-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload +; SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero +; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Folded Reload +; SSE2-NEXT: # xmm1 = mem[0],zero,zero,zero +; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 4-byte Folded Reload +; SSE2-NEXT: # xmm15 = mem[0],zero,zero,zero +; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 4-byte Folded Reload +; SSE2-NEXT: # xmm9 = mem[0],zero,zero,zero +; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm14[0],xmm2[1],xmm14[1],xmm2[2],xmm14[2],xmm2[3],xmm14[3],xmm2[4],xmm14[4],xmm2[5],xmm14[5],xmm2[6],xmm14[6],xmm2[7],xmm14[7] +; SSE2-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload +; SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero +; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE2-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm12[0],xmm4[1],xmm12[1],xmm4[2],xmm12[2],xmm4[3],xmm12[3],xmm4[4],xmm12[4],xmm4[5],xmm12[5],xmm4[6],xmm12[6],xmm4[7],xmm12[7] +; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 4-byte Folded Reload +; SSE2-NEXT: # xmm12 = mem[0],zero,zero,zero +; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3] +; SSE2-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload +; SSE2-NEXT: # 
xmm0 = mem[0],zero,zero,zero +; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE2-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm8[0],xmm6[1],xmm8[1],xmm6[2],xmm8[2],xmm6[3],xmm8[3],xmm6[4],xmm8[4],xmm6[5],xmm8[5],xmm6[6],xmm8[6],xmm6[7],xmm8[7] +; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 4-byte Folded Reload +; SSE2-NEXT: # xmm8 = mem[0],zero,zero,zero +; SSE2-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm10[0],xmm5[1],xmm10[1],xmm5[2],xmm10[2],xmm5[3],xmm10[3],xmm5[4],xmm10[4],xmm5[5],xmm10[5],xmm5[6],xmm10[6],xmm5[7],xmm10[7] +; SSE2-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload +; SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero +; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE2-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3] +; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 4-byte Folded Reload +; SSE2-NEXT: # xmm6 = mem[0],zero,zero,zero +; SSE2-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] +; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 4-byte Folded Reload +; SSE2-NEXT: # xmm14 = mem[0],zero,zero,zero +; SSE2-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm13[0],xmm7[1],xmm13[1],xmm7[2],xmm13[2],xmm7[3],xmm13[3],xmm7[4],xmm13[4],xmm7[5],xmm13[5],xmm7[6],xmm13[6],xmm7[7],xmm13[7] +; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 4-byte Folded Reload +; SSE2-NEXT: # xmm2 = mem[0],zero,zero,zero +; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm11[0],xmm3[1],xmm11[1],xmm3[2],xmm11[2],xmm3[3],xmm11[3],xmm3[4],xmm11[4],xmm3[5],xmm11[5],xmm3[6],xmm11[6],xmm3[7],xmm11[7] +; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 4-byte Folded Reload +; SSE2-NEXT: # xmm11 = mem[0],zero,zero,zero +; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm7[0],xmm3[1],xmm7[1],xmm3[2],xmm7[2],xmm3[3],xmm7[3] +; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 4-byte Folded Reload +; SSE2-NEXT: # xmm10 = mem[0],zero,zero,zero +; SSE2-NEXT: punpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE2-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3],xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] +; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 4-byte Folded Reload +; SSE2-NEXT: # xmm13 = mem[0],zero,zero,zero +; SSE2-NEXT: punpcklbw {{.*#+}} xmm9 = xmm9[0],xmm15[0],xmm9[1],xmm15[1],xmm9[2],xmm15[2],xmm9[3],xmm15[3],xmm9[4],xmm15[4],xmm9[5],xmm15[5],xmm9[6],xmm15[6],xmm9[7],xmm15[7] +; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload +; SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero +; SSE2-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm1[0],xmm9[1],xmm1[1],xmm9[2],xmm1[2],xmm9[3],xmm1[3] +; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 4-byte Folded Reload +; SSE2-NEXT: # xmm15 = mem[0],zero,zero,zero +; SSE2-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm3[0],xmm9[1],xmm3[1] +; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 4-byte Folded Reload +; SSE2-NEXT: # xmm7 = mem[0],zero,zero,zero +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm9 = xmm9[0],xmm5[0] +; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 4-byte Folded Reload +; SSE2-NEXT: # xmm5 = mem[0],zero,zero,zero +; SSE2-NEXT: punpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload +; SSE2-NEXT: # xmm12 = xmm12[0],mem[0],xmm12[1],mem[1],xmm12[2],mem[2],xmm12[3],mem[3],xmm12[4],mem[4],xmm12[5],mem[5],xmm12[6],mem[6],xmm12[7],mem[7] +; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 4-byte Folded Reload +; SSE2-NEXT: # xmm4 = 
mem[0],zero,zero,zero +; SSE2-NEXT: punpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload +; SSE2-NEXT: # xmm8 = xmm8[0],mem[0],xmm8[1],mem[1],xmm8[2],mem[2],xmm8[3],mem[3],xmm8[4],mem[4],xmm8[5],mem[5],xmm8[6],mem[6],xmm8[7],mem[7] +; SSE2-NEXT: movd %r13d, %xmm1 +; SSE2-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE2-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm12[0],xmm8[1],xmm12[1],xmm8[2],xmm12[2],xmm8[3],xmm12[3] +; SSE2-NEXT: movd %r12d, %xmm1 +; SSE2-NEXT: punpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload +; SSE2-NEXT: # xmm6 = xmm6[0],mem[0],xmm6[1],mem[1],xmm6[2],mem[2],xmm6[3],mem[3],xmm6[4],mem[4],xmm6[5],mem[5],xmm6[6],mem[6],xmm6[7],mem[7] +; SSE2-NEXT: movd %r15d, %xmm3 +; SSE2-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm14[0],xmm2[1],xmm14[1],xmm2[2],xmm14[2],xmm2[3],xmm14[3],xmm2[4],xmm14[4],xmm2[5],xmm14[5],xmm2[6],xmm14[6],xmm2[7],xmm14[7] +; SSE2-NEXT: movd %r14d, %xmm14 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1],xmm2[2],xmm6[2],xmm2[3],xmm6[3] +; SSE2-NEXT: movd %ebp, %xmm3 +; SSE2-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm8[0],xmm2[1],xmm8[1] +; SSE2-NEXT: movd %ebx, %xmm12 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm10 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3],xmm10[4],xmm11[4],xmm10[5],xmm11[5],xmm10[6],xmm11[6],xmm10[7],xmm11[7] +; SSE2-NEXT: movd %r11d, %xmm8 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm13[0],xmm0[1],xmm13[1],xmm0[2],xmm13[2],xmm0[3],xmm13[3],xmm0[4],xmm13[4],xmm0[5],xmm13[5],xmm0[6],xmm13[6],xmm0[7],xmm13[7] +; SSE2-NEXT: movd %r9d, %xmm3 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm10[0],xmm0[1],xmm10[1],xmm0[2],xmm10[2],xmm0[3],xmm10[3] +; SSE2-NEXT: movd %eax, %xmm11 +; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r15d # 1-byte Folded Reload +; SSE2-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm15[0],xmm7[1],xmm15[1],xmm7[2],xmm15[2],xmm7[3],xmm15[3],xmm7[4],xmm15[4],xmm7[5],xmm15[5],xmm7[6],xmm15[6],xmm7[7],xmm15[7] +; SSE2-NEXT: movd %ecx, %xmm6 +; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r12d # 1-byte Folded Reload +; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r9d # 1-byte Folded Reload +; SSE2-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3],xmm4[4],xmm5[4],xmm4[5],xmm5[5],xmm4[6],xmm5[6],xmm4[7],xmm5[7] +; SSE2-NEXT: movd %edx, %xmm13 +; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r11d # 1-byte Folded Reload +; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r14d # 1-byte Folded Reload +; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm7[0],xmm4[1],xmm7[1],xmm4[2],xmm7[2],xmm4[3],xmm7[3] +; SSE2-NEXT: movd %edi, %xmm5 +; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 1-byte Folded Reload +; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r13d # 1-byte Folded Reload +; SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1] +; SSE2-NEXT: movd %esi, %xmm15 +; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 1-byte Folded Reload +; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %edx # 1-byte Folded Reload +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm2[0] +; SSE2-NEXT: movd %r8d, %xmm0 +; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r8d # 1-byte Folded Reload +; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ebp # 1-byte Folded Reload +; SSE2-NEXT: punpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE2-NEXT: # 
xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3],xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] +; SSE2-NEXT: movd %r10d, %xmm10 +; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload +; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r10d # 1-byte Folded Reload +; SSE2-NEXT: punpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload +; SSE2-NEXT: # xmm14 = xmm14[0],mem[0],xmm14[1],mem[1],xmm14[2],mem[2],xmm14[3],mem[3],xmm14[4],mem[4],xmm14[5],mem[5],xmm14[6],mem[6],xmm14[7],mem[7] +; SSE2-NEXT: movd %r15d, %xmm2 +; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ebx # 1-byte Folded Reload +; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r15d # 1-byte Folded Reload +; SSE2-NEXT: punpcklwd {{.*#+}} xmm14 = xmm14[0],xmm1[0],xmm14[1],xmm1[1],xmm14[2],xmm1[2],xmm14[3],xmm1[3] +; SSE2-NEXT: movd %r12d, %xmm1 +; SSE2-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r12d # 1-byte Folded Reload +; SSE2-NEXT: punpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload +; SSE2-NEXT: # xmm12 = xmm12[0],mem[0],xmm12[1],mem[1],xmm12[2],mem[2],xmm12[3],mem[3],xmm12[4],mem[4],xmm12[5],mem[5],xmm12[6],mem[6],xmm12[7],mem[7] +; SSE2-NEXT: movd %r9d, %xmm1 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm8[0],xmm3[1],xmm8[1],xmm3[2],xmm8[2],xmm3[3],xmm8[3],xmm3[4],xmm8[4],xmm3[5],xmm8[5],xmm3[6],xmm8[6],xmm3[7],xmm8[7] +; SSE2-NEXT: movd %r11d, %xmm8 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm12[0],xmm3[1],xmm12[1],xmm3[2],xmm12[2],xmm3[3],xmm12[3] +; SSE2-NEXT: movd %r14d, %xmm12 +; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm14[0],xmm3[1],xmm14[1] +; SSE2-NEXT: movd %edi, %xmm7 +; SSE2-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE2-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm11[0],xmm6[1],xmm11[1],xmm6[2],xmm11[2],xmm6[3],xmm11[3],xmm6[4],xmm11[4],xmm6[5],xmm11[5],xmm6[6],xmm11[6],xmm6[7],xmm11[7] +; SSE2-NEXT: movd %r13d, %xmm11 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm13[0],xmm5[1],xmm13[1],xmm5[2],xmm13[2],xmm5[3],xmm13[3],xmm5[4],xmm13[4],xmm5[5],xmm13[5],xmm5[6],xmm13[6],xmm5[7],xmm13[7] +; SSE2-NEXT: movd %esi, %xmm14 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3] +; SSE2-NEXT: movd %edx, %xmm6 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm15[0],xmm0[1],xmm15[1],xmm0[2],xmm15[2],xmm0[3],xmm15[3],xmm0[4],xmm15[4],xmm0[5],xmm15[5],xmm0[6],xmm15[6],xmm0[7],xmm15[7] +; SSE2-NEXT: movd %r8d, %xmm15 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm10[0],xmm2[1],xmm10[1],xmm2[2],xmm10[2],xmm2[3],xmm10[3],xmm2[4],xmm10[4],xmm2[5],xmm10[5],xmm2[6],xmm10[6],xmm2[7],xmm10[7] +; SSE2-NEXT: movd %ebp, %xmm7 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] +; SSE2-NEXT: movd %ecx, %xmm13 +; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1] +; SSE2-NEXT: movd %r10d, %xmm5 +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0] +; SSE2-NEXT: movd %ebx, %xmm10 +; SSE2-NEXT: punpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE2-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3],xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] +; SSE2-NEXT: movd %r15d, %xmm0 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm12 = 
xmm12[0],xmm8[0],xmm12[1],xmm8[1],xmm12[2],xmm8[2],xmm12[3],xmm8[3],xmm12[4],xmm8[4],xmm12[5],xmm8[5],xmm12[6],xmm8[6],xmm12[7],xmm8[7] +; SSE2-NEXT: movd %eax, %xmm8 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm12 = xmm12[0],xmm1[0],xmm12[1],xmm1[1],xmm12[2],xmm1[2],xmm12[3],xmm1[3] +; SSE2-NEXT: movd %r12d, %xmm3 +; SSE2-NEXT: punpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload +; SSE2-NEXT: # xmm11 = xmm11[0],mem[0],xmm11[1],mem[1],xmm11[2],mem[2],xmm11[3],mem[3],xmm11[4],mem[4],xmm11[5],mem[5],xmm11[6],mem[6],xmm11[7],mem[7] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm14[0],xmm6[1],xmm14[1],xmm6[2],xmm14[2],xmm6[3],xmm14[3],xmm6[4],xmm14[4],xmm6[5],xmm14[5],xmm6[6],xmm14[6],xmm6[7],xmm14[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm11[0],xmm6[1],xmm11[1],xmm6[2],xmm11[2],xmm6[3],xmm11[3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm12[0],xmm6[1],xmm12[1] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm15[0],xmm7[1],xmm15[1],xmm7[2],xmm15[2],xmm7[3],xmm15[3],xmm7[4],xmm15[4],xmm7[5],xmm15[5],xmm7[6],xmm15[6],xmm7[7],xmm15[7] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm13[0],xmm5[1],xmm13[1],xmm5[2],xmm13[2],xmm5[3],xmm13[3],xmm5[4],xmm13[4],xmm5[5],xmm13[5],xmm5[6],xmm13[6],xmm5[7],xmm13[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm7[0],xmm5[1],xmm7[1],xmm5[2],xmm7[2],xmm5[3],xmm7[3] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm10[0],xmm0[1],xmm10[1],xmm0[2],xmm10[2],xmm0[3],xmm10[3],xmm0[4],xmm10[4],xmm0[5],xmm10[5],xmm0[6],xmm10[6],xmm0[7],xmm10[7] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm8[0],xmm3[1],xmm8[1],xmm3[2],xmm8[2],xmm3[3],xmm8[3],xmm3[4],xmm8[4],xmm3[5],xmm8[5],xmm3[6],xmm8[6],xmm3[7],xmm8[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1] +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm6[0] +; SSE2-NEXT: movdqa %xmm9, %xmm0 +; SSE2-NEXT: movdqa %xmm4, %xmm1 +; SSE2-NEXT: addq $232, %rsp +; SSE2-NEXT: popq %rbx +; SSE2-NEXT: popq %r12 +; SSE2-NEXT: popq %r13 +; SSE2-NEXT: popq %r14 +; SSE2-NEXT: popq %r15 +; SSE2-NEXT: popq %rbp +; SSE2-NEXT: retq +; +; SSSE3-LABEL: v64i8: +; SSSE3: # %bb.0: +; SSSE3-NEXT: pushq %rbp +; SSSE3-NEXT: pushq %r15 +; SSSE3-NEXT: pushq %r14 +; SSSE3-NEXT: pushq %r13 +; SSSE3-NEXT: pushq %r12 +; SSSE3-NEXT: pushq %rbx +; SSSE3-NEXT: subq $232, %rsp +; SSSE3-NEXT: movaps %xmm5, (%rsp) +; SSSE3-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) +; SSSE3-NEXT: movb (%rsp), %cl +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %dl +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %sil +; SSSE3-NEXT: movl %edx, %eax +; SSSE3-NEXT: subb %cl, %al +; SSSE3-NEXT: setns %al +; SSSE3-NEXT: subb %cl, %dl +; SSSE3-NEXT: jno .LBB2_2 +; SSSE3-NEXT: # %bb.1: +; SSSE3-NEXT: addb $127, %al +; SSSE3-NEXT: movl %eax, %edx +; SSSE3-NEXT: .LBB2_2: +; SSSE3-NEXT: movb {{[0-9]+}}(%rsp), %cl +; SSSE3-NEXT: movl %esi, %eax +; SSSE3-NEXT: subb %cl, %al +; SSSE3-NEXT: setns %al +; SSSE3-NEXT: subb %cl, %sil +; SSSE3-NEXT: jno .LBB2_4 +; SSSE3-NEXT: # %bb.3: +; SSSE3-NEXT: addb $127, %al +; SSSE3-NEXT: movl %eax, %esi +; SSSE3-NEXT: .LBB2_4: +; SSSE3-NEXT: movb {{[0-9]+}}(%rsp), %cl +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %dil +; SSSE3-NEXT: movl %edi, %eax +; SSSE3-NEXT: subb %cl, %al +; SSSE3-NEXT: setns %al +; SSSE3-NEXT: subb %cl, %dil +; SSSE3-NEXT: jno .LBB2_6 +; SSSE3-NEXT: # %bb.5: +; SSSE3-NEXT: addb $127, %al +; SSSE3-NEXT: movl %eax, %edi +; SSSE3-NEXT: .LBB2_6: +; SSSE3-NEXT: movb {{[0-9]+}}(%rsp), %cl +; 
SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %r11b +; SSSE3-NEXT: movl %r11d, %eax +; SSSE3-NEXT: subb %cl, %al +; SSSE3-NEXT: setns %al +; SSSE3-NEXT: subb %cl, %r11b +; SSSE3-NEXT: jno .LBB2_8 +; SSSE3-NEXT: # %bb.7: +; SSSE3-NEXT: addb $127, %al +; SSSE3-NEXT: movl %eax, %r11d +; SSSE3-NEXT: .LBB2_8: +; SSSE3-NEXT: movb {{[0-9]+}}(%rsp), %cl +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %r9b +; SSSE3-NEXT: movl %r9d, %eax +; SSSE3-NEXT: subb %cl, %al +; SSSE3-NEXT: setns %al +; SSSE3-NEXT: subb %cl, %r9b +; SSSE3-NEXT: jno .LBB2_10 +; SSSE3-NEXT: # %bb.9: +; SSSE3-NEXT: addb $127, %al +; SSSE3-NEXT: movl %eax, %r9d +; SSSE3-NEXT: .LBB2_10: +; SSSE3-NEXT: movb {{[0-9]+}}(%rsp), %cl +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %r10b +; SSSE3-NEXT: movl %r10d, %eax +; SSSE3-NEXT: subb %cl, %al +; SSSE3-NEXT: setns %al +; SSSE3-NEXT: subb %cl, %r10b +; SSSE3-NEXT: jno .LBB2_12 +; SSSE3-NEXT: # %bb.11: +; SSSE3-NEXT: addb $127, %al +; SSSE3-NEXT: movl %eax, %r10d +; SSSE3-NEXT: .LBB2_12: +; SSSE3-NEXT: movb {{[0-9]+}}(%rsp), %cl +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %bl +; SSSE3-NEXT: movl %ebx, %eax +; SSSE3-NEXT: subb %cl, %al +; SSSE3-NEXT: setns %al +; SSSE3-NEXT: subb %cl, %bl +; SSSE3-NEXT: movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSSE3-NEXT: jno .LBB2_14 +; SSSE3-NEXT: # %bb.13: +; SSSE3-NEXT: addb $127, %al +; SSSE3-NEXT: movl %eax, %ebx +; SSSE3-NEXT: .LBB2_14: +; SSSE3-NEXT: movb {{[0-9]+}}(%rsp), %cl +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %dl +; SSSE3-NEXT: movl %edx, %eax +; SSSE3-NEXT: subb %cl, %al +; SSSE3-NEXT: setns %al +; SSSE3-NEXT: subb %cl, %dl +; SSSE3-NEXT: movb %bl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSSE3-NEXT: jo .LBB2_15 +; SSSE3-NEXT: # %bb.16: +; SSSE3-NEXT: movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSSE3-NEXT: jmp .LBB2_17 +; SSSE3-NEXT: .LBB2_15: +; SSSE3-NEXT: addb $127, %al +; SSSE3-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSSE3-NEXT: .LBB2_17: +; SSSE3-NEXT: movb {{[0-9]+}}(%rsp), %cl +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %dl +; SSSE3-NEXT: movl %edx, %eax +; SSSE3-NEXT: subb %cl, %al +; SSSE3-NEXT: setns %al +; SSSE3-NEXT: subb %cl, %dl +; SSSE3-NEXT: jno .LBB2_19 +; SSSE3-NEXT: # %bb.18: +; SSSE3-NEXT: addb $127, %al +; SSSE3-NEXT: movl %eax, %edx +; SSSE3-NEXT: .LBB2_19: +; SSSE3-NEXT: movb {{[0-9]+}}(%rsp), %cl +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %r12b +; SSSE3-NEXT: movl %r12d, %eax +; SSSE3-NEXT: subb %cl, %al +; SSSE3-NEXT: setns %al +; SSSE3-NEXT: subb %cl, %r12b +; SSSE3-NEXT: jno .LBB2_21 +; SSSE3-NEXT: # %bb.20: +; SSSE3-NEXT: addb $127, %al +; SSSE3-NEXT: movl %eax, %r12d +; SSSE3-NEXT: .LBB2_21: +; SSSE3-NEXT: movb {{[0-9]+}}(%rsp), %cl +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %bl +; SSSE3-NEXT: movl %ebx, %eax +; SSSE3-NEXT: subb %cl, %al +; SSSE3-NEXT: setns %al +; SSSE3-NEXT: subb %cl, %bl +; SSSE3-NEXT: movb %r9b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSSE3-NEXT: jno .LBB2_23 +; SSSE3-NEXT: # %bb.22: +; SSSE3-NEXT: addb $127, %al +; SSSE3-NEXT: movl %eax, %ebx +; SSSE3-NEXT: .LBB2_23: +; SSSE3-NEXT: movb %bl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSSE3-NEXT: movb {{[0-9]+}}(%rsp), %cl +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %r9b +; SSSE3-NEXT: movl %r9d, %eax +; SSSE3-NEXT: subb %cl, %al +; SSSE3-NEXT: setns %al +; SSSE3-NEXT: subb %cl, %r9b +; SSSE3-NEXT: jno .LBB2_25 +; SSSE3-NEXT: # %bb.24: +; SSSE3-NEXT: addb $127, %al +; SSSE3-NEXT: movl %eax, %r9d +; SSSE3-NEXT: .LBB2_25: +; SSSE3-NEXT: movl %edi, %r8d +; SSSE3-NEXT: movb {{[0-9]+}}(%rsp), %cl +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %bl +; 
SSSE3-NEXT: movl %ebx, %eax +; SSSE3-NEXT: subb %cl, %al +; SSSE3-NEXT: setns %al +; SSSE3-NEXT: subb %cl, %bl +; SSSE3-NEXT: jo .LBB2_26 +; SSSE3-NEXT: # %bb.27: +; SSSE3-NEXT: movb %bl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSSE3-NEXT: movl %esi, %edi +; SSSE3-NEXT: jmp .LBB2_28 +; SSSE3-NEXT: .LBB2_26: +; SSSE3-NEXT: movl %esi, %edi +; SSSE3-NEXT: addb $127, %al +; SSSE3-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSSE3-NEXT: .LBB2_28: +; SSSE3-NEXT: movb {{[0-9]+}}(%rsp), %cl +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %sil +; SSSE3-NEXT: movl %esi, %eax +; SSSE3-NEXT: subb %cl, %al +; SSSE3-NEXT: setns %al +; SSSE3-NEXT: subb %cl, %sil +; SSSE3-NEXT: movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSSE3-NEXT: jno .LBB2_30 +; SSSE3-NEXT: # %bb.29: +; SSSE3-NEXT: addb $127, %al +; SSSE3-NEXT: movl %eax, %esi +; SSSE3-NEXT: .LBB2_30: +; SSSE3-NEXT: movb {{[0-9]+}}(%rsp), %cl +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %dl +; SSSE3-NEXT: movl %edx, %eax +; SSSE3-NEXT: subb %cl, %al +; SSSE3-NEXT: setns %al +; SSSE3-NEXT: subb %cl, %dl +; SSSE3-NEXT: jno .LBB2_32 +; SSSE3-NEXT: # %bb.31: +; SSSE3-NEXT: addb $127, %al +; SSSE3-NEXT: movl %eax, %edx +; SSSE3-NEXT: .LBB2_32: +; SSSE3-NEXT: movb {{[0-9]+}}(%rsp), %cl +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %bl +; SSSE3-NEXT: movl %ebx, %eax +; SSSE3-NEXT: subb %cl, %al +; SSSE3-NEXT: setns %al +; SSSE3-NEXT: subb %cl, %bl +; SSSE3-NEXT: movaps %xmm6, {{[0-9]+}}(%rsp) +; SSSE3-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) +; SSSE3-NEXT: movb %r10b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSSE3-NEXT: movb %r12b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSSE3-NEXT: movb %r9b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSSE3-NEXT: movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSSE3-NEXT: jno .LBB2_34 +; SSSE3-NEXT: # %bb.33: +; SSSE3-NEXT: addb $127, %al +; SSSE3-NEXT: movl %eax, %ebx +; SSSE3-NEXT: .LBB2_34: +; SSSE3-NEXT: movb {{[0-9]+}}(%rsp), %cl +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %dl +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %r9b +; SSSE3-NEXT: movl %edx, %eax +; SSSE3-NEXT: subb %cl, %al +; SSSE3-NEXT: setns %al +; SSSE3-NEXT: subb %cl, %dl +; SSSE3-NEXT: jno .LBB2_36 +; SSSE3-NEXT: # %bb.35: +; SSSE3-NEXT: addb $127, %al +; SSSE3-NEXT: movl %eax, %edx +; SSSE3-NEXT: .LBB2_36: +; SSSE3-NEXT: movb {{[0-9]+}}(%rsp), %cl +; SSSE3-NEXT: movl %r9d, %eax +; SSSE3-NEXT: subb %cl, %al +; SSSE3-NEXT: setns %al +; SSSE3-NEXT: subb %cl, %r9b +; SSSE3-NEXT: movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSSE3-NEXT: movb %sil, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSSE3-NEXT: jno .LBB2_38 +; SSSE3-NEXT: # %bb.37: +; SSSE3-NEXT: addb $127, %al +; SSSE3-NEXT: movl %eax, %r9d +; SSSE3-NEXT: .LBB2_38: +; SSSE3-NEXT: movb {{[0-9]+}}(%rsp), %cl +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %dl +; SSSE3-NEXT: movl %edx, %eax +; SSSE3-NEXT: subb %cl, %al +; SSSE3-NEXT: setns %al +; SSSE3-NEXT: subb %cl, %dl +; SSSE3-NEXT: movb %dil, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSSE3-NEXT: movb %r8b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSSE3-NEXT: movb %bl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSSE3-NEXT: jno .LBB2_40 +; SSSE3-NEXT: # %bb.39: +; SSSE3-NEXT: addb $127, %al +; SSSE3-NEXT: movl %eax, %edx +; SSSE3-NEXT: .LBB2_40: +; SSSE3-NEXT: movb {{[0-9]+}}(%rsp), %cl +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %bpl +; SSSE3-NEXT: movl %ebp, %eax +; SSSE3-NEXT: subb %cl, %al +; SSSE3-NEXT: setns %al +; SSSE3-NEXT: subb %cl, %bpl +; SSSE3-NEXT: movb %r11b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSSE3-NEXT: jno .LBB2_42 +; 
SSSE3-NEXT: # %bb.41: +; SSSE3-NEXT: addb $127, %al +; SSSE3-NEXT: movl %eax, %ebp +; SSSE3-NEXT: .LBB2_42: +; SSSE3-NEXT: movb {{[0-9]+}}(%rsp), %cl +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %sil +; SSSE3-NEXT: movl %esi, %eax +; SSSE3-NEXT: subb %cl, %al +; SSSE3-NEXT: setns %al +; SSSE3-NEXT: subb %cl, %sil +; SSSE3-NEXT: movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSSE3-NEXT: jno .LBB2_44 +; SSSE3-NEXT: # %bb.43: +; SSSE3-NEXT: addb $127, %al +; SSSE3-NEXT: movl %eax, %esi +; SSSE3-NEXT: .LBB2_44: +; SSSE3-NEXT: movb {{[0-9]+}}(%rsp), %cl +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %dl +; SSSE3-NEXT: movl %edx, %eax +; SSSE3-NEXT: subb %cl, %al +; SSSE3-NEXT: setns %al +; SSSE3-NEXT: subb %cl, %dl +; SSSE3-NEXT: jno .LBB2_46 +; SSSE3-NEXT: # %bb.45: +; SSSE3-NEXT: addb $127, %al +; SSSE3-NEXT: movl %eax, %edx +; SSSE3-NEXT: .LBB2_46: +; SSSE3-NEXT: movb {{[0-9]+}}(%rsp), %cl +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %dil +; SSSE3-NEXT: movl %edi, %eax +; SSSE3-NEXT: subb %cl, %al +; SSSE3-NEXT: setns %al +; SSSE3-NEXT: subb %cl, %dil +; SSSE3-NEXT: jno .LBB2_48 +; SSSE3-NEXT: # %bb.47: +; SSSE3-NEXT: addb $127, %al +; SSSE3-NEXT: movl %eax, %edi +; SSSE3-NEXT: .LBB2_48: +; SSSE3-NEXT: movb {{[0-9]+}}(%rsp), %cl +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %bl +; SSSE3-NEXT: movl %ebx, %eax +; SSSE3-NEXT: subb %cl, %al +; SSSE3-NEXT: setns %al +; SSSE3-NEXT: subb %cl, %bl +; SSSE3-NEXT: movb %bl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSSE3-NEXT: movb %sil, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSSE3-NEXT: movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSSE3-NEXT: jno .LBB2_50 +; SSSE3-NEXT: # %bb.49: +; SSSE3-NEXT: addb $127, %al +; SSSE3-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSSE3-NEXT: .LBB2_50: +; SSSE3-NEXT: movb {{[0-9]+}}(%rsp), %cl +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %sil +; SSSE3-NEXT: movl %esi, %eax +; SSSE3-NEXT: subb %cl, %al +; SSSE3-NEXT: setns %al +; SSSE3-NEXT: subb %cl, %sil +; SSSE3-NEXT: movb %dil, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSSE3-NEXT: jno .LBB2_52 +; SSSE3-NEXT: # %bb.51: +; SSSE3-NEXT: addb $127, %al +; SSSE3-NEXT: movl %eax, %esi +; SSSE3-NEXT: .LBB2_52: +; SSSE3-NEXT: movb {{[0-9]+}}(%rsp), %cl +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %dil +; SSSE3-NEXT: movl %edi, %eax +; SSSE3-NEXT: subb %cl, %al +; SSSE3-NEXT: setns %al +; SSSE3-NEXT: subb %cl, %dil +; SSSE3-NEXT: jno .LBB2_54 +; SSSE3-NEXT: # %bb.53: +; SSSE3-NEXT: addb $127, %al +; SSSE3-NEXT: movl %eax, %edi +; SSSE3-NEXT: .LBB2_54: +; SSSE3-NEXT: movb {{[0-9]+}}(%rsp), %cl +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %r14b +; SSSE3-NEXT: movl %r14d, %eax +; SSSE3-NEXT: subb %cl, %al +; SSSE3-NEXT: setns %al +; SSSE3-NEXT: subb %cl, %r14b +; SSSE3-NEXT: movb %r9b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSSE3-NEXT: jno .LBB2_56 +; SSSE3-NEXT: # %bb.55: +; SSSE3-NEXT: addb $127, %al +; SSSE3-NEXT: movl %eax, %r14d +; SSSE3-NEXT: .LBB2_56: +; SSSE3-NEXT: movb {{[0-9]+}}(%rsp), %cl +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %r9b +; SSSE3-NEXT: movl %r9d, %eax +; SSSE3-NEXT: subb %cl, %al +; SSSE3-NEXT: setns %al +; SSSE3-NEXT: subb %cl, %r9b +; SSSE3-NEXT: movb %bpl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSSE3-NEXT: jno .LBB2_58 +; SSSE3-NEXT: # %bb.57: +; SSSE3-NEXT: addb $127, %al +; SSSE3-NEXT: movl %eax, %r9d +; SSSE3-NEXT: .LBB2_58: +; SSSE3-NEXT: movb {{[0-9]+}}(%rsp), %cl +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %dl +; SSSE3-NEXT: movl %edx, %eax +; SSSE3-NEXT: subb %cl, %al +; SSSE3-NEXT: setns %al +; SSSE3-NEXT: subb %cl, %dl +; SSSE3-NEXT: movb %dl, 
{{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSSE3-NEXT: jno .LBB2_60 +; SSSE3-NEXT: # %bb.59: +; SSSE3-NEXT: addb $127, %al +; SSSE3-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSSE3-NEXT: .LBB2_60: +; SSSE3-NEXT: movb {{[0-9]+}}(%rsp), %cl +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %dl +; SSSE3-NEXT: movl %edx, %eax +; SSSE3-NEXT: subb %cl, %al +; SSSE3-NEXT: setns %al +; SSSE3-NEXT: subb %cl, %dl +; SSSE3-NEXT: jno .LBB2_62 +; SSSE3-NEXT: # %bb.61: +; SSSE3-NEXT: addb $127, %al +; SSSE3-NEXT: movl %eax, %edx +; SSSE3-NEXT: .LBB2_62: +; SSSE3-NEXT: movb {{[0-9]+}}(%rsp), %cl +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %r15b +; SSSE3-NEXT: movl %r15d, %eax +; SSSE3-NEXT: subb %cl, %al +; SSSE3-NEXT: setns %al +; SSSE3-NEXT: subb %cl, %r15b +; SSSE3-NEXT: jno .LBB2_64 +; SSSE3-NEXT: # %bb.63: +; SSSE3-NEXT: addb $127, %al +; SSSE3-NEXT: movl %eax, %r15d +; SSSE3-NEXT: .LBB2_64: +; SSSE3-NEXT: movb {{[0-9]+}}(%rsp), %cl +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %bl +; SSSE3-NEXT: movl %ebx, %eax +; SSSE3-NEXT: subb %cl, %al +; SSSE3-NEXT: setns %al +; SSSE3-NEXT: subb %cl, %bl +; SSSE3-NEXT: movaps %xmm7, {{[0-9]+}}(%rsp) +; SSSE3-NEXT: movaps %xmm3, -{{[0-9]+}}(%rsp) +; SSSE3-NEXT: movb %r14b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSSE3-NEXT: movb %r9b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSSE3-NEXT: movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSSE3-NEXT: jo .LBB2_65 +; SSSE3-NEXT: # %bb.66: +; SSSE3-NEXT: movb %bl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSSE3-NEXT: jmp .LBB2_67 +; SSSE3-NEXT: .LBB2_65: +; SSSE3-NEXT: addb $127, %al +; SSSE3-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSSE3-NEXT: .LBB2_67: +; SSSE3-NEXT: movb {{[0-9]+}}(%rsp), %cl +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %r11b +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %dl +; SSSE3-NEXT: movl %r11d, %eax +; SSSE3-NEXT: subb %cl, %al +; SSSE3-NEXT: setns %al +; SSSE3-NEXT: subb %cl, %r11b +; SSSE3-NEXT: jno .LBB2_69 +; SSSE3-NEXT: # %bb.68: +; SSSE3-NEXT: addb $127, %al +; SSSE3-NEXT: movl %eax, %r11d +; SSSE3-NEXT: .LBB2_69: +; SSSE3-NEXT: movb {{[0-9]+}}(%rsp), %cl +; SSSE3-NEXT: movl %edx, %eax +; SSSE3-NEXT: subb %cl, %al +; SSSE3-NEXT: setns %al +; SSSE3-NEXT: subb %cl, %dl +; SSSE3-NEXT: movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSSE3-NEXT: movb %r15b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSSE3-NEXT: jno .LBB2_71 +; SSSE3-NEXT: # %bb.70: +; SSSE3-NEXT: addb $127, %al +; SSSE3-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSSE3-NEXT: .LBB2_71: +; SSSE3-NEXT: movb {{[0-9]+}}(%rsp), %cl +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %r12b +; SSSE3-NEXT: movl %r12d, %eax +; SSSE3-NEXT: subb %cl, %al +; SSSE3-NEXT: setns %al +; SSSE3-NEXT: subb %cl, %r12b +; SSSE3-NEXT: jno .LBB2_73 +; SSSE3-NEXT: # %bb.72: +; SSSE3-NEXT: addb $127, %al +; SSSE3-NEXT: movl %eax, %r12d +; SSSE3-NEXT: .LBB2_73: +; SSSE3-NEXT: movb {{[0-9]+}}(%rsp), %cl +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %r14b +; SSSE3-NEXT: movl %r14d, %eax +; SSSE3-NEXT: subb %cl, %al +; SSSE3-NEXT: setns %al +; SSSE3-NEXT: subb %cl, %r14b +; SSSE3-NEXT: jno .LBB2_75 +; SSSE3-NEXT: # %bb.74: +; SSSE3-NEXT: addb $127, %al +; SSSE3-NEXT: movl %eax, %r14d +; SSSE3-NEXT: .LBB2_75: +; SSSE3-NEXT: movb {{[0-9]+}}(%rsp), %cl +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %r15b +; SSSE3-NEXT: movl %r15d, %eax +; SSSE3-NEXT: subb %cl, %al +; SSSE3-NEXT: setns %al +; SSSE3-NEXT: subb %cl, %r15b +; SSSE3-NEXT: jno .LBB2_77 +; SSSE3-NEXT: # %bb.76: +; SSSE3-NEXT: addb $127, %al +; SSSE3-NEXT: movl %eax, %r15d +; SSSE3-NEXT: .LBB2_77: 
+; SSSE3-NEXT: movb {{[0-9]+}}(%rsp), %cl +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %bpl +; SSSE3-NEXT: movl %ebp, %eax +; SSSE3-NEXT: subb %cl, %al +; SSSE3-NEXT: setns %al +; SSSE3-NEXT: subb %cl, %bpl +; SSSE3-NEXT: jno .LBB2_79 +; SSSE3-NEXT: # %bb.78: +; SSSE3-NEXT: addb $127, %al +; SSSE3-NEXT: movl %eax, %ebp +; SSSE3-NEXT: .LBB2_79: +; SSSE3-NEXT: movb {{[0-9]+}}(%rsp), %cl +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %r13b +; SSSE3-NEXT: movl %r13d, %eax +; SSSE3-NEXT: subb %cl, %al +; SSSE3-NEXT: setns %al +; SSSE3-NEXT: subb %cl, %r13b +; SSSE3-NEXT: jno .LBB2_81 +; SSSE3-NEXT: # %bb.80: +; SSSE3-NEXT: addb $127, %al +; SSSE3-NEXT: movl %eax, %r13d +; SSSE3-NEXT: .LBB2_81: +; SSSE3-NEXT: movb {{[0-9]+}}(%rsp), %cl +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %dl +; SSSE3-NEXT: movl %edx, %eax +; SSSE3-NEXT: subb %cl, %al +; SSSE3-NEXT: setns %al +; SSSE3-NEXT: subb %cl, %dl +; SSSE3-NEXT: movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSSE3-NEXT: jno .LBB2_83 +; SSSE3-NEXT: # %bb.82: +; SSSE3-NEXT: addb $127, %al +; SSSE3-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSSE3-NEXT: .LBB2_83: +; SSSE3-NEXT: movb {{[0-9]+}}(%rsp), %cl +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %dl +; SSSE3-NEXT: movl %edx, %eax +; SSSE3-NEXT: subb %cl, %al +; SSSE3-NEXT: setns %al +; SSSE3-NEXT: subb %cl, %dl +; SSSE3-NEXT: jo .LBB2_84 +; SSSE3-NEXT: # %bb.85: +; SSSE3-NEXT: movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSSE3-NEXT: jmp .LBB2_86 +; SSSE3-NEXT: .LBB2_84: +; SSSE3-NEXT: addb $127, %al +; SSSE3-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSSE3-NEXT: .LBB2_86: +; SSSE3-NEXT: movb {{[0-9]+}}(%rsp), %cl +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %dl +; SSSE3-NEXT: movl %edx, %eax +; SSSE3-NEXT: subb %cl, %al +; SSSE3-NEXT: setns %al +; SSSE3-NEXT: subb %cl, %dl +; SSSE3-NEXT: movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSSE3-NEXT: jno .LBB2_88 +; SSSE3-NEXT: # %bb.87: +; SSSE3-NEXT: addb $127, %al +; SSSE3-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSSE3-NEXT: .LBB2_88: +; SSSE3-NEXT: movb {{[0-9]+}}(%rsp), %cl +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %dl +; SSSE3-NEXT: movl %edx, %eax +; SSSE3-NEXT: subb %cl, %al +; SSSE3-NEXT: setns %al +; SSSE3-NEXT: subb %cl, %dl +; SSSE3-NEXT: jo .LBB2_89 +; SSSE3-NEXT: # %bb.90: +; SSSE3-NEXT: movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSSE3-NEXT: jmp .LBB2_91 +; SSSE3-NEXT: .LBB2_89: +; SSSE3-NEXT: addb $127, %al +; SSSE3-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSSE3-NEXT: .LBB2_91: +; SSSE3-NEXT: movb {{[0-9]+}}(%rsp), %cl +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %dl +; SSSE3-NEXT: movl %edx, %eax +; SSSE3-NEXT: subb %cl, %al +; SSSE3-NEXT: setns %al +; SSSE3-NEXT: subb %cl, %dl +; SSSE3-NEXT: movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSSE3-NEXT: jno .LBB2_93 +; SSSE3-NEXT: # %bb.92: +; SSSE3-NEXT: addb $127, %al +; SSSE3-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSSE3-NEXT: .LBB2_93: +; SSSE3-NEXT: movb {{[0-9]+}}(%rsp), %cl +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %r8b +; SSSE3-NEXT: movl %r8d, %eax +; SSSE3-NEXT: subb %cl, %al +; SSSE3-NEXT: setns %al +; SSSE3-NEXT: subb %cl, %r8b +; SSSE3-NEXT: jno .LBB2_95 +; SSSE3-NEXT: # %bb.94: +; SSSE3-NEXT: addb $127, %al +; SSSE3-NEXT: movl %eax, %r8d +; SSSE3-NEXT: .LBB2_95: +; SSSE3-NEXT: movb {{[0-9]+}}(%rsp), %cl +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %dl +; SSSE3-NEXT: movl %edx, %eax +; SSSE3-NEXT: subb %cl, %al +; SSSE3-NEXT: setns %al +; SSSE3-NEXT: subb %cl, %dl +; SSSE3-NEXT: movb %dl, 
{{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSSE3-NEXT: jno .LBB2_97 +; SSSE3-NEXT: # %bb.96: +; SSSE3-NEXT: addb $127, %al +; SSSE3-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSSE3-NEXT: .LBB2_97: +; SSSE3-NEXT: movb {{[0-9]+}}(%rsp), %cl +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %r9b +; SSSE3-NEXT: movl %r9d, %eax +; SSSE3-NEXT: subb %cl, %al +; SSSE3-NEXT: setns %al +; SSSE3-NEXT: subb %cl, %r9b +; SSSE3-NEXT: jno .LBB2_99 +; SSSE3-NEXT: # %bb.98: +; SSSE3-NEXT: addb $127, %al +; SSSE3-NEXT: movl %eax, %r9d +; SSSE3-NEXT: .LBB2_99: +; SSSE3-NEXT: movb {{[0-9]+}}(%rsp), %cl +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %dl +; SSSE3-NEXT: movl %edx, %eax +; SSSE3-NEXT: subb %cl, %al +; SSSE3-NEXT: setns %al +; SSSE3-NEXT: subb %cl, %dl +; SSSE3-NEXT: movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSSE3-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp) +; SSSE3-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; SSSE3-NEXT: movb %r12b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSSE3-NEXT: jno .LBB2_101 +; SSSE3-NEXT: # %bb.100: +; SSSE3-NEXT: addb $127, %al +; SSSE3-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSSE3-NEXT: .LBB2_101: +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %r12b +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %r10b +; SSSE3-NEXT: movl %r12d, %eax +; SSSE3-NEXT: subb %cl, %al +; SSSE3-NEXT: setns %al +; SSSE3-NEXT: subb %cl, %r12b +; SSSE3-NEXT: jno .LBB2_103 +; SSSE3-NEXT: # %bb.102: +; SSSE3-NEXT: addb $127, %al +; SSSE3-NEXT: movl %eax, %r12d +; SSSE3-NEXT: .LBB2_103: +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSSE3-NEXT: movl %r10d, %eax +; SSSE3-NEXT: subb %cl, %al +; SSSE3-NEXT: setns %al +; SSSE3-NEXT: subb %cl, %r10b +; SSSE3-NEXT: jno .LBB2_105 +; SSSE3-NEXT: # %bb.104: +; SSSE3-NEXT: addb $127, %al +; SSSE3-NEXT: movl %eax, %r10d +; SSSE3-NEXT: .LBB2_105: +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %dl +; SSSE3-NEXT: movl %edx, %eax +; SSSE3-NEXT: subb %cl, %al +; SSSE3-NEXT: setns %al +; SSSE3-NEXT: subb %cl, %dl +; SSSE3-NEXT: jno .LBB2_107 +; SSSE3-NEXT: # %bb.106: +; SSSE3-NEXT: addb $127, %al +; SSSE3-NEXT: movl %eax, %edx +; SSSE3-NEXT: .LBB2_107: +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %bl +; SSSE3-NEXT: movl %ebx, %eax +; SSSE3-NEXT: subb %cl, %al +; SSSE3-NEXT: setns %al +; SSSE3-NEXT: subb %cl, %bl +; SSSE3-NEXT: movb %bl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSSE3-NEXT: movb %r14b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSSE3-NEXT: movb %r15b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSSE3-NEXT: movb %bpl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSSE3-NEXT: jno .LBB2_109 +; SSSE3-NEXT: # %bb.108: +; SSSE3-NEXT: addb $127, %al +; SSSE3-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSSE3-NEXT: .LBB2_109: +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %r14b +; SSSE3-NEXT: movl %r14d, %eax +; SSSE3-NEXT: subb %cl, %al +; SSSE3-NEXT: setns %al +; SSSE3-NEXT: subb %cl, %r14b +; SSSE3-NEXT: movb %r13b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSSE3-NEXT: jno .LBB2_111 +; SSSE3-NEXT: # %bb.110: +; SSSE3-NEXT: addb $127, %al +; SSSE3-NEXT: movl %eax, %r14d +; SSSE3-NEXT: .LBB2_111: +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %r15b +; SSSE3-NEXT: movl %r15d, %eax +; SSSE3-NEXT: subb %cl, %al +; SSSE3-NEXT: setns %al +; SSSE3-NEXT: subb %cl, %r15b +; SSSE3-NEXT: movb %r12b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSSE3-NEXT: movb %r10b, 
{{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSSE3-NEXT: movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSSE3-NEXT: jno .LBB2_113 +; SSSE3-NEXT: # %bb.112: +; SSSE3-NEXT: addb $127, %al +; SSSE3-NEXT: movl %eax, %r15d +; SSSE3-NEXT: .LBB2_113: +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %r12b +; SSSE3-NEXT: movl %r12d, %eax +; SSSE3-NEXT: subb %cl, %al +; SSSE3-NEXT: setns %al +; SSSE3-NEXT: subb %cl, %r12b +; SSSE3-NEXT: movl %r8d, %edx +; SSSE3-NEXT: jno .LBB2_115 +; SSSE3-NEXT: # %bb.114: +; SSSE3-NEXT: addb $127, %al +; SSSE3-NEXT: movl %eax, %r12d +; SSSE3-NEXT: .LBB2_115: +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %r13b +; SSSE3-NEXT: movl %r13d, %eax +; SSSE3-NEXT: subb %cl, %al +; SSSE3-NEXT: setns %al +; SSSE3-NEXT: subb %cl, %r13b +; SSSE3-NEXT: movb %sil, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSSE3-NEXT: movb %dil, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSSE3-NEXT: movb %r11b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSSE3-NEXT: jno .LBB2_117 +; SSSE3-NEXT: # %bb.116: +; SSSE3-NEXT: addb $127, %al +; SSSE3-NEXT: movl %eax, %r13d +; SSSE3-NEXT: .LBB2_117: +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %sil +; SSSE3-NEXT: movl %esi, %eax +; SSSE3-NEXT: subb %cl, %al +; SSSE3-NEXT: setns %al +; SSSE3-NEXT: subb %cl, %sil +; SSSE3-NEXT: jno .LBB2_119 +; SSSE3-NEXT: # %bb.118: +; SSSE3-NEXT: addb $127, %al +; SSSE3-NEXT: movl %eax, %esi +; SSSE3-NEXT: .LBB2_119: +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %dil +; SSSE3-NEXT: movl %edi, %eax +; SSSE3-NEXT: subb %cl, %al +; SSSE3-NEXT: setns %al +; SSSE3-NEXT: subb %cl, %dil +; SSSE3-NEXT: jno .LBB2_121 +; SSSE3-NEXT: # %bb.120: +; SSSE3-NEXT: addb $127, %al +; SSSE3-NEXT: movl %eax, %edi +; SSSE3-NEXT: .LBB2_121: +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %r8b +; SSSE3-NEXT: movl %r8d, %eax +; SSSE3-NEXT: subb %cl, %al +; SSSE3-NEXT: setns %al +; SSSE3-NEXT: subb %cl, %r8b +; SSSE3-NEXT: jno .LBB2_123 +; SSSE3-NEXT: # %bb.122: +; SSSE3-NEXT: addb $127, %al +; SSSE3-NEXT: movl %eax, %r8d +; SSSE3-NEXT: .LBB2_123: +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %r10b +; SSSE3-NEXT: movl %r10d, %eax +; SSSE3-NEXT: subb %cl, %al +; SSSE3-NEXT: setns %al +; SSSE3-NEXT: subb %cl, %r10b +; SSSE3-NEXT: movb {{[-0-9]+}}(%r{{[sb]}}p), %bpl # 1-byte Reload +; SSSE3-NEXT: jno .LBB2_125 +; SSSE3-NEXT: # %bb.124: +; SSSE3-NEXT: addb $127, %al +; SSSE3-NEXT: movl %eax, %r10d +; SSSE3-NEXT: .LBB2_125: +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %r11b +; SSSE3-NEXT: movl %r11d, %eax +; SSSE3-NEXT: subb %cl, %al +; SSSE3-NEXT: setns %al +; SSSE3-NEXT: subb %cl, %r11b +; SSSE3-NEXT: jno .LBB2_127 +; SSSE3-NEXT: # %bb.126: +; SSSE3-NEXT: addb $127, %al +; SSSE3-NEXT: movl %eax, %r11d +; SSSE3-NEXT: .LBB2_127: +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %bl +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSSE3-NEXT: movl %ecx, %eax +; SSSE3-NEXT: subb %bl, %al +; SSSE3-NEXT: setns %al +; SSSE3-NEXT: subb %bl, %cl +; SSSE3-NEXT: movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSSE3-NEXT: movb %r9b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSSE3-NEXT: jno .LBB2_129 +; SSSE3-NEXT: # %bb.128: +; SSSE3-NEXT: addb $127, %al +; SSSE3-NEXT: movl %eax, %ecx +; SSSE3-NEXT: .LBB2_129: +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %dl +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %bl +; SSSE3-NEXT: movl %ebx, 
%eax +; SSSE3-NEXT: subb %dl, %al +; SSSE3-NEXT: setns %al +; SSSE3-NEXT: subb %dl, %bl +; SSSE3-NEXT: jno .LBB2_131 +; SSSE3-NEXT: # %bb.130: +; SSSE3-NEXT: addb $127, %al +; SSSE3-NEXT: movl %eax, %ebx +; SSSE3-NEXT: .LBB2_131: +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %dl +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %r9b +; SSSE3-NEXT: movl %r9d, %eax +; SSSE3-NEXT: subb %dl, %al +; SSSE3-NEXT: setns %al +; SSSE3-NEXT: subb %dl, %r9b +; SSSE3-NEXT: jno .LBB2_133 +; SSSE3-NEXT: # %bb.132: +; SSSE3-NEXT: addb $127, %al +; SSSE3-NEXT: movl %eax, %r9d +; SSSE3-NEXT: .LBB2_133: +; SSSE3-NEXT: movl %ebp, %eax +; SSSE3-NEXT: movb {{[-0-9]+}}(%r{{[sb]}}p), %dl # 1-byte Reload +; SSSE3-NEXT: movzbl %r9b, %ebp +; SSSE3-NEXT: movl %ebp, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSSE3-NEXT: movzbl %bl, %ebp +; SSSE3-NEXT: movl %ebp, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSSE3-NEXT: movzbl %cl, %ecx +; SSSE3-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSSE3-NEXT: movzbl %r11b, %ecx +; SSSE3-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSSE3-NEXT: movzbl %r10b, %ecx +; SSSE3-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSSE3-NEXT: movzbl %r8b, %ecx +; SSSE3-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSSE3-NEXT: movzbl %dil, %ecx +; SSSE3-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSSE3-NEXT: movzbl %sil, %ecx +; SSSE3-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSSE3-NEXT: movzbl %r13b, %ecx +; SSSE3-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSSE3-NEXT: movzbl %r12b, %ecx +; SSSE3-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSSE3-NEXT: movzbl %r15b, %ecx +; SSSE3-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSSE3-NEXT: movzbl %r14b, %ecx +; SSSE3-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload +; SSSE3-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload +; SSSE3-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload +; SSSE3-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload +; SSSE3-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload +; SSSE3-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload +; SSSE3-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload +; SSSE3-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload +; SSSE3-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload +; SSSE3-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload +; SSSE3-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSSE3-NEXT: movzbl %dl, %ecx +; SSSE3-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload +; SSSE3-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSSE3-NEXT: movzbl 
{{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload +; SSSE3-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload +; SSSE3-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload +; SSSE3-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload +; SSSE3-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload +; SSSE3-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload +; SSSE3-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload +; SSSE3-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSSE3-NEXT: movzbl %al, %eax +; SSSE3-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r13d # 1-byte Folded Reload +; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r12d # 1-byte Folded Reload +; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r15d # 1-byte Folded Reload +; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r14d # 1-byte Folded Reload +; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ebp # 1-byte Folded Reload +; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ebx # 1-byte Folded Reload +; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r11d # 1-byte Folded Reload +; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r9d # 1-byte Folded Reload +; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload +; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %edx # 1-byte Folded Reload +; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 1-byte Folded Reload +; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 1-byte Folded Reload +; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r8d # 1-byte Folded Reload +; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r10d # 1-byte Folded Reload +; SSSE3-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 4-byte Folded Reload +; SSSE3-NEXT: # xmm14 = mem[0],zero,zero,zero +; SSSE3-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 4-byte Folded Reload +; SSSE3-NEXT: # xmm2 = mem[0],zero,zero,zero +; SSSE3-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 4-byte Folded Reload +; SSSE3-NEXT: # xmm12 = mem[0],zero,zero,zero +; SSSE3-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 4-byte Folded Reload +; SSSE3-NEXT: # xmm4 = mem[0],zero,zero,zero +; SSSE3-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 4-byte Folded Reload +; SSSE3-NEXT: # xmm8 = mem[0],zero,zero,zero +; SSSE3-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 4-byte Folded Reload +; SSSE3-NEXT: # xmm6 = mem[0],zero,zero,zero +; SSSE3-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 4-byte Folded Reload +; SSSE3-NEXT: # xmm10 = mem[0],zero,zero,zero +; SSSE3-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 4-byte Folded Reload +; SSSE3-NEXT: # xmm5 = mem[0],zero,zero,zero +; SSSE3-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 4-byte Folded Reload +; SSSE3-NEXT: # xmm13 = mem[0],zero,zero,zero +; SSSE3-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 4-byte Folded Reload +; SSSE3-NEXT: # xmm7 = mem[0],zero,zero,zero +; SSSE3-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 4-byte Folded Reload +; SSSE3-NEXT: # xmm11 = 
mem[0],zero,zero,zero +; SSSE3-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 4-byte Folded Reload +; SSSE3-NEXT: # xmm3 = mem[0],zero,zero,zero +; SSSE3-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload +; SSSE3-NEXT: # xmm0 = mem[0],zero,zero,zero +; SSSE3-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSSE3-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Folded Reload +; SSSE3-NEXT: # xmm1 = mem[0],zero,zero,zero +; SSSE3-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 4-byte Folded Reload +; SSSE3-NEXT: # xmm15 = mem[0],zero,zero,zero +; SSSE3-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 4-byte Folded Reload +; SSSE3-NEXT: # xmm9 = mem[0],zero,zero,zero +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm14[0],xmm2[1],xmm14[1],xmm2[2],xmm14[2],xmm2[3],xmm14[3],xmm2[4],xmm14[4],xmm2[5],xmm14[5],xmm2[6],xmm14[6],xmm2[7],xmm14[7] +; SSSE3-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload +; SSSE3-NEXT: # xmm0 = mem[0],zero,zero,zero +; SSSE3-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm12[0],xmm4[1],xmm12[1],xmm4[2],xmm12[2],xmm4[3],xmm12[3],xmm4[4],xmm12[4],xmm4[5],xmm12[5],xmm4[6],xmm12[6],xmm4[7],xmm12[7] +; SSSE3-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 4-byte Folded Reload +; SSSE3-NEXT: # xmm12 = mem[0],zero,zero,zero +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3] +; SSSE3-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload +; SSSE3-NEXT: # xmm0 = mem[0],zero,zero,zero +; SSSE3-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm8[0],xmm6[1],xmm8[1],xmm6[2],xmm8[2],xmm6[3],xmm8[3],xmm6[4],xmm8[4],xmm6[5],xmm8[5],xmm6[6],xmm8[6],xmm6[7],xmm8[7] +; SSSE3-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 4-byte Folded Reload +; SSSE3-NEXT: # xmm8 = mem[0],zero,zero,zero +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm10[0],xmm5[1],xmm10[1],xmm5[2],xmm10[2],xmm5[3],xmm10[3],xmm5[4],xmm10[4],xmm5[5],xmm10[5],xmm5[6],xmm10[6],xmm5[7],xmm10[7] +; SSSE3-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload +; SSSE3-NEXT: # xmm0 = mem[0],zero,zero,zero +; SSSE3-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3] +; SSSE3-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 4-byte Folded Reload +; SSSE3-NEXT: # xmm6 = mem[0],zero,zero,zero +; SSSE3-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] +; SSSE3-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 4-byte Folded Reload +; SSSE3-NEXT: # xmm14 = mem[0],zero,zero,zero +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm13[0],xmm7[1],xmm13[1],xmm7[2],xmm13[2],xmm7[3],xmm13[3],xmm7[4],xmm13[4],xmm7[5],xmm13[5],xmm7[6],xmm13[6],xmm7[7],xmm13[7] +; SSSE3-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 4-byte Folded Reload +; SSSE3-NEXT: # xmm2 = mem[0],zero,zero,zero +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm11[0],xmm3[1],xmm11[1],xmm3[2],xmm11[2],xmm3[3],xmm11[3],xmm3[4],xmm11[4],xmm3[5],xmm11[5],xmm3[6],xmm11[6],xmm3[7],xmm11[7] +; SSSE3-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 4-byte Folded Reload +; SSSE3-NEXT: # xmm11 = mem[0],zero,zero,zero +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm7[0],xmm3[1],xmm7[1],xmm3[2],xmm7[2],xmm3[3],xmm7[3] +; SSSE3-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 4-byte Folded Reload +; SSSE3-NEXT: # xmm10 = mem[0],zero,zero,zero +; 
SSSE3-NEXT: punpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSSE3-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3],xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] +; SSSE3-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 4-byte Folded Reload +; SSSE3-NEXT: # xmm13 = mem[0],zero,zero,zero +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm9 = xmm9[0],xmm15[0],xmm9[1],xmm15[1],xmm9[2],xmm15[2],xmm9[3],xmm15[3],xmm9[4],xmm15[4],xmm9[5],xmm15[5],xmm9[6],xmm15[6],xmm9[7],xmm15[7] +; SSSE3-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload +; SSSE3-NEXT: # xmm0 = mem[0],zero,zero,zero +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm1[0],xmm9[1],xmm1[1],xmm9[2],xmm1[2],xmm9[3],xmm1[3] +; SSSE3-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 4-byte Folded Reload +; SSSE3-NEXT: # xmm15 = mem[0],zero,zero,zero +; SSSE3-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm3[0],xmm9[1],xmm3[1] +; SSSE3-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 4-byte Folded Reload +; SSSE3-NEXT: # xmm7 = mem[0],zero,zero,zero +; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm9 = xmm9[0],xmm5[0] +; SSSE3-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 4-byte Folded Reload +; SSSE3-NEXT: # xmm5 = mem[0],zero,zero,zero +; SSSE3-NEXT: punpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload +; SSSE3-NEXT: # xmm12 = xmm12[0],mem[0],xmm12[1],mem[1],xmm12[2],mem[2],xmm12[3],mem[3],xmm12[4],mem[4],xmm12[5],mem[5],xmm12[6],mem[6],xmm12[7],mem[7] +; SSSE3-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 4-byte Folded Reload +; SSSE3-NEXT: # xmm4 = mem[0],zero,zero,zero +; SSSE3-NEXT: punpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload +; SSSE3-NEXT: # xmm8 = xmm8[0],mem[0],xmm8[1],mem[1],xmm8[2],mem[2],xmm8[3],mem[3],xmm8[4],mem[4],xmm8[5],mem[5],xmm8[6],mem[6],xmm8[7],mem[7] +; SSSE3-NEXT: movd %r13d, %xmm1 +; SSSE3-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm12[0],xmm8[1],xmm12[1],xmm8[2],xmm12[2],xmm8[3],xmm12[3] +; SSSE3-NEXT: movd %r12d, %xmm1 +; SSSE3-NEXT: punpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload +; SSSE3-NEXT: # xmm6 = xmm6[0],mem[0],xmm6[1],mem[1],xmm6[2],mem[2],xmm6[3],mem[3],xmm6[4],mem[4],xmm6[5],mem[5],xmm6[6],mem[6],xmm6[7],mem[7] +; SSSE3-NEXT: movd %r15d, %xmm3 +; SSSE3-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm14[0],xmm2[1],xmm14[1],xmm2[2],xmm14[2],xmm2[3],xmm14[3],xmm2[4],xmm14[4],xmm2[5],xmm14[5],xmm2[6],xmm14[6],xmm2[7],xmm14[7] +; SSSE3-NEXT: movd %r14d, %xmm14 +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1],xmm2[2],xmm6[2],xmm2[3],xmm6[3] +; SSSE3-NEXT: movd %ebp, %xmm3 +; SSSE3-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm8[0],xmm2[1],xmm8[1] +; SSSE3-NEXT: movd %ebx, %xmm12 +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm10 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3],xmm10[4],xmm11[4],xmm10[5],xmm11[5],xmm10[6],xmm11[6],xmm10[7],xmm11[7] +; SSSE3-NEXT: movd %r11d, %xmm8 +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm13[0],xmm0[1],xmm13[1],xmm0[2],xmm13[2],xmm0[3],xmm13[3],xmm0[4],xmm13[4],xmm0[5],xmm13[5],xmm0[6],xmm13[6],xmm0[7],xmm13[7] +; SSSE3-NEXT: movd %r9d, %xmm3 +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm10[0],xmm0[1],xmm10[1],xmm0[2],xmm10[2],xmm0[3],xmm10[3] +; SSSE3-NEXT: movd %eax, %xmm11 +; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r15d # 1-byte 
Folded Reload +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm15[0],xmm7[1],xmm15[1],xmm7[2],xmm15[2],xmm7[3],xmm15[3],xmm7[4],xmm15[4],xmm7[5],xmm15[5],xmm7[6],xmm15[6],xmm7[7],xmm15[7] +; SSSE3-NEXT: movd %ecx, %xmm6 +; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r12d # 1-byte Folded Reload +; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r9d # 1-byte Folded Reload +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3],xmm4[4],xmm5[4],xmm4[5],xmm5[5],xmm4[6],xmm5[6],xmm4[7],xmm5[7] +; SSSE3-NEXT: movd %edx, %xmm13 +; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r11d # 1-byte Folded Reload +; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r14d # 1-byte Folded Reload +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm7[0],xmm4[1],xmm7[1],xmm4[2],xmm7[2],xmm4[3],xmm7[3] +; SSSE3-NEXT: movd %edi, %xmm5 +; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 1-byte Folded Reload +; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r13d # 1-byte Folded Reload +; SSSE3-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1] +; SSSE3-NEXT: movd %esi, %xmm15 +; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 1-byte Folded Reload +; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %edx # 1-byte Folded Reload +; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm2[0] +; SSSE3-NEXT: movd %r8d, %xmm0 +; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r8d # 1-byte Folded Reload +; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ebp # 1-byte Folded Reload +; SSSE3-NEXT: punpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSSE3-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3],xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] +; SSSE3-NEXT: movd %r10d, %xmm10 +; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload +; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r10d # 1-byte Folded Reload +; SSSE3-NEXT: punpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload +; SSSE3-NEXT: # xmm14 = xmm14[0],mem[0],xmm14[1],mem[1],xmm14[2],mem[2],xmm14[3],mem[3],xmm14[4],mem[4],xmm14[5],mem[5],xmm14[6],mem[6],xmm14[7],mem[7] +; SSSE3-NEXT: movd %r15d, %xmm2 +; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ebx # 1-byte Folded Reload +; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r15d # 1-byte Folded Reload +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm14 = xmm14[0],xmm1[0],xmm14[1],xmm1[1],xmm14[2],xmm1[2],xmm14[3],xmm1[3] +; SSSE3-NEXT: movd %r12d, %xmm1 +; SSSE3-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r12d # 1-byte Folded Reload +; SSSE3-NEXT: punpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload +; SSSE3-NEXT: # xmm12 = xmm12[0],mem[0],xmm12[1],mem[1],xmm12[2],mem[2],xmm12[3],mem[3],xmm12[4],mem[4],xmm12[5],mem[5],xmm12[6],mem[6],xmm12[7],mem[7] +; SSSE3-NEXT: movd %r9d, %xmm1 +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm8[0],xmm3[1],xmm8[1],xmm3[2],xmm8[2],xmm3[3],xmm8[3],xmm3[4],xmm8[4],xmm3[5],xmm8[5],xmm3[6],xmm8[6],xmm3[7],xmm8[7] +; SSSE3-NEXT: movd %r11d, %xmm8 +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm12[0],xmm3[1],xmm12[1],xmm3[2],xmm12[2],xmm3[3],xmm12[3] +; SSSE3-NEXT: movd %r14d, %xmm12 +; SSSE3-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm14[0],xmm3[1],xmm14[1] +; SSSE3-NEXT: movd %edi, %xmm7 +; SSSE3-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm6 = 
xmm6[0],xmm11[0],xmm6[1],xmm11[1],xmm6[2],xmm11[2],xmm6[3],xmm11[3],xmm6[4],xmm11[4],xmm6[5],xmm11[5],xmm6[6],xmm11[6],xmm6[7],xmm11[7] +; SSSE3-NEXT: movd %r13d, %xmm11 +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm13[0],xmm5[1],xmm13[1],xmm5[2],xmm13[2],xmm5[3],xmm13[3],xmm5[4],xmm13[4],xmm5[5],xmm13[5],xmm5[6],xmm13[6],xmm5[7],xmm13[7] +; SSSE3-NEXT: movd %esi, %xmm14 +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3] +; SSSE3-NEXT: movd %edx, %xmm6 +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm15[0],xmm0[1],xmm15[1],xmm0[2],xmm15[2],xmm0[3],xmm15[3],xmm0[4],xmm15[4],xmm0[5],xmm15[5],xmm0[6],xmm15[6],xmm0[7],xmm15[7] +; SSSE3-NEXT: movd %r8d, %xmm15 +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm10[0],xmm2[1],xmm10[1],xmm2[2],xmm10[2],xmm2[3],xmm10[3],xmm2[4],xmm10[4],xmm2[5],xmm10[5],xmm2[6],xmm10[6],xmm2[7],xmm10[7] +; SSSE3-NEXT: movd %ebp, %xmm7 +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] +; SSSE3-NEXT: movd %ecx, %xmm13 +; SSSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1] +; SSSE3-NEXT: movd %r10d, %xmm5 +; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0] +; SSSE3-NEXT: movd %ebx, %xmm10 +; SSSE3-NEXT: punpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSSE3-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3],xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] +; SSSE3-NEXT: movd %r15d, %xmm0 +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm12 = xmm12[0],xmm8[0],xmm12[1],xmm8[1],xmm12[2],xmm8[2],xmm12[3],xmm8[3],xmm12[4],xmm8[4],xmm12[5],xmm8[5],xmm12[6],xmm8[6],xmm12[7],xmm8[7] +; SSSE3-NEXT: movd %eax, %xmm8 +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm12 = xmm12[0],xmm1[0],xmm12[1],xmm1[1],xmm12[2],xmm1[2],xmm12[3],xmm1[3] +; SSSE3-NEXT: movd %r12d, %xmm3 +; SSSE3-NEXT: punpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload +; SSSE3-NEXT: # xmm11 = xmm11[0],mem[0],xmm11[1],mem[1],xmm11[2],mem[2],xmm11[3],mem[3],xmm11[4],mem[4],xmm11[5],mem[5],xmm11[6],mem[6],xmm11[7],mem[7] +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm14[0],xmm6[1],xmm14[1],xmm6[2],xmm14[2],xmm6[3],xmm14[3],xmm6[4],xmm14[4],xmm6[5],xmm14[5],xmm6[6],xmm14[6],xmm6[7],xmm14[7] +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm11[0],xmm6[1],xmm11[1],xmm6[2],xmm11[2],xmm6[3],xmm11[3] +; SSSE3-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm12[0],xmm6[1],xmm12[1] +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm15[0],xmm7[1],xmm15[1],xmm7[2],xmm15[2],xmm7[3],xmm15[3],xmm7[4],xmm15[4],xmm7[5],xmm15[5],xmm7[6],xmm15[6],xmm7[7],xmm15[7] +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm13[0],xmm5[1],xmm13[1],xmm5[2],xmm13[2],xmm5[3],xmm13[3],xmm5[4],xmm13[4],xmm5[5],xmm13[5],xmm5[6],xmm13[6],xmm5[7],xmm13[7] +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm7[0],xmm5[1],xmm7[1],xmm5[2],xmm7[2],xmm5[3],xmm7[3] +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm10[0],xmm0[1],xmm10[1],xmm0[2],xmm10[2],xmm0[3],xmm10[3],xmm0[4],xmm10[4],xmm0[5],xmm10[5],xmm0[6],xmm10[6],xmm0[7],xmm10[7] +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm8[0],xmm3[1],xmm8[1],xmm3[2],xmm8[2],xmm3[3],xmm8[3],xmm3[4],xmm8[4],xmm3[5],xmm8[5],xmm3[6],xmm8[6],xmm3[7],xmm8[7] +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3] +; SSSE3-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1] +; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm6[0] +; SSSE3-NEXT: movdqa %xmm9, %xmm0 
+; SSSE3-NEXT: movdqa %xmm4, %xmm1 +; SSSE3-NEXT: addq $232, %rsp +; SSSE3-NEXT: popq %rbx +; SSSE3-NEXT: popq %r12 +; SSSE3-NEXT: popq %r13 +; SSSE3-NEXT: popq %r14 +; SSSE3-NEXT: popq %r15 +; SSSE3-NEXT: popq %rbp +; SSSE3-NEXT: retq +; +; SSE41-LABEL: v64i8: +; SSE41: # %bb.0: +; SSE41-NEXT: pextrb $15, %xmm5, %ecx +; SSE41-NEXT: pextrb $15, %xmm1, %r11d +; SSE41-NEXT: movl %r11d, %eax +; SSE41-NEXT: subb %cl, %al +; SSE41-NEXT: setns %al +; SSE41-NEXT: subb %cl, %r11b +; SSE41-NEXT: jno .LBB2_2 +; SSE41-NEXT: # %bb.1: +; SSE41-NEXT: addb $127, %al +; SSE41-NEXT: movl %eax, %r11d +; SSE41-NEXT: .LBB2_2: +; SSE41-NEXT: pextrb $14, %xmm5, %ecx +; SSE41-NEXT: pextrb $14, %xmm1, %esi +; SSE41-NEXT: movl %esi, %eax +; SSE41-NEXT: subb %cl, %al +; SSE41-NEXT: setns %al +; SSE41-NEXT: subb %cl, %sil +; SSE41-NEXT: jno .LBB2_4 +; SSE41-NEXT: # %bb.3: +; SSE41-NEXT: addb $127, %al +; SSE41-NEXT: movl %eax, %esi +; SSE41-NEXT: .LBB2_4: +; SSE41-NEXT: pextrb $13, %xmm5, %ecx +; SSE41-NEXT: pextrb $13, %xmm1, %edx +; SSE41-NEXT: movl %edx, %eax +; SSE41-NEXT: subb %cl, %al +; SSE41-NEXT: setns %al +; SSE41-NEXT: subb %cl, %dl +; SSE41-NEXT: jno .LBB2_6 +; SSE41-NEXT: # %bb.5: +; SSE41-NEXT: addb $127, %al +; SSE41-NEXT: movl %eax, %edx +; SSE41-NEXT: .LBB2_6: +; SSE41-NEXT: pextrb $12, %xmm5, %ecx +; SSE41-NEXT: pextrb $12, %xmm1, %edi +; SSE41-NEXT: movl %edi, %eax +; SSE41-NEXT: subb %cl, %al +; SSE41-NEXT: setns %al +; SSE41-NEXT: subb %cl, %dil +; SSE41-NEXT: jno .LBB2_8 +; SSE41-NEXT: # %bb.7: +; SSE41-NEXT: addb $127, %al +; SSE41-NEXT: movl %eax, %edi +; SSE41-NEXT: .LBB2_8: +; SSE41-NEXT: pushq %rbp +; SSE41-NEXT: pushq %r15 +; SSE41-NEXT: pushq %r14 +; SSE41-NEXT: pushq %r13 +; SSE41-NEXT: pushq %r12 +; SSE41-NEXT: pushq %rbx +; SSE41-NEXT: subq $76, %rsp +; SSE41-NEXT: pextrb $11, %xmm5, %ecx +; SSE41-NEXT: pextrb $11, %xmm1, %ebp +; SSE41-NEXT: movl %ebp, %eax +; SSE41-NEXT: subb %cl, %al +; SSE41-NEXT: setns %al +; SSE41-NEXT: subb %cl, %bpl +; SSE41-NEXT: jno .LBB2_10 +; SSE41-NEXT: # %bb.9: +; SSE41-NEXT: addb $127, %al +; SSE41-NEXT: movl %eax, %ebp +; SSE41-NEXT: .LBB2_10: +; SSE41-NEXT: pextrb $10, %xmm5, %ecx +; SSE41-NEXT: pextrb $10, %xmm1, %ebx +; SSE41-NEXT: movl %ebx, %eax +; SSE41-NEXT: subb %cl, %al +; SSE41-NEXT: setns %al +; SSE41-NEXT: subb %cl, %bl +; SSE41-NEXT: movl %edx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE41-NEXT: jno .LBB2_12 +; SSE41-NEXT: # %bb.11: +; SSE41-NEXT: addb $127, %al +; SSE41-NEXT: movl %eax, %ebx +; SSE41-NEXT: .LBB2_12: +; SSE41-NEXT: pextrb $9, %xmm5, %ecx +; SSE41-NEXT: pextrb $9, %xmm1, %edx +; SSE41-NEXT: movl %edx, %eax +; SSE41-NEXT: subb %cl, %al +; SSE41-NEXT: setns %al +; SSE41-NEXT: subb %cl, %dl +; SSE41-NEXT: movl %ebp, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE41-NEXT: movl %ebx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE41-NEXT: jo .LBB2_13 +; SSE41-NEXT: # %bb.14: +; SSE41-NEXT: movl %edx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE41-NEXT: jmp .LBB2_15 +; SSE41-NEXT: .LBB2_13: +; SSE41-NEXT: addb $127, %al +; SSE41-NEXT: # kill: def $al killed $al def $eax +; SSE41-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE41-NEXT: .LBB2_15: +; SSE41-NEXT: pextrb $8, %xmm5, %ecx +; SSE41-NEXT: pextrb $8, %xmm1, %edx +; SSE41-NEXT: movl %edx, %eax +; SSE41-NEXT: subb %cl, %al +; SSE41-NEXT: setns %al +; SSE41-NEXT: subb %cl, %dl +; SSE41-NEXT: jno .LBB2_17 +; SSE41-NEXT: # %bb.16: +; SSE41-NEXT: addb $127, %al +; SSE41-NEXT: movl %eax, %edx +; SSE41-NEXT: .LBB2_17: +; SSE41-NEXT: pextrb $7, %xmm5, %ecx +; 
SSE41-NEXT: pextrb $7, %xmm1, %ebp +; SSE41-NEXT: movl %ebp, %eax +; SSE41-NEXT: subb %cl, %al +; SSE41-NEXT: setns %al +; SSE41-NEXT: subb %cl, %bpl +; SSE41-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE41-NEXT: jno .LBB2_19 +; SSE41-NEXT: # %bb.18: +; SSE41-NEXT: addb $127, %al +; SSE41-NEXT: movl %eax, %ebp +; SSE41-NEXT: .LBB2_19: +; SSE41-NEXT: pextrb $6, %xmm5, %ecx +; SSE41-NEXT: pextrb $6, %xmm1, %esi +; SSE41-NEXT: movl %esi, %eax +; SSE41-NEXT: subb %cl, %al +; SSE41-NEXT: setns %al +; SSE41-NEXT: subb %cl, %sil +; SSE41-NEXT: movl %r11d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE41-NEXT: jno .LBB2_21 +; SSE41-NEXT: # %bb.20: +; SSE41-NEXT: addb $127, %al +; SSE41-NEXT: movl %eax, %esi +; SSE41-NEXT: .LBB2_21: +; SSE41-NEXT: pextrb $5, %xmm5, %ecx +; SSE41-NEXT: pextrb $5, %xmm1, %ebx +; SSE41-NEXT: movl %ebx, %eax +; SSE41-NEXT: subb %cl, %al +; SSE41-NEXT: setns %al +; SSE41-NEXT: subb %cl, %bl +; SSE41-NEXT: movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE41-NEXT: movl %ebp, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE41-NEXT: jo .LBB2_22 +; SSE41-NEXT: # %bb.23: +; SSE41-NEXT: movl %ebx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE41-NEXT: jmp .LBB2_24 +; SSE41-NEXT: .LBB2_22: +; SSE41-NEXT: addb $127, %al +; SSE41-NEXT: # kill: def $al killed $al def $eax +; SSE41-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE41-NEXT: .LBB2_24: +; SSE41-NEXT: pextrb $4, %xmm5, %ecx +; SSE41-NEXT: pextrb $4, %xmm1, %r13d +; SSE41-NEXT: movl %r13d, %eax +; SSE41-NEXT: subb %cl, %al +; SSE41-NEXT: setns %al +; SSE41-NEXT: subb %cl, %r13b +; SSE41-NEXT: jno .LBB2_26 +; SSE41-NEXT: # %bb.25: +; SSE41-NEXT: addb $127, %al +; SSE41-NEXT: movl %eax, %r13d +; SSE41-NEXT: .LBB2_26: +; SSE41-NEXT: pextrb $3, %xmm5, %ecx +; SSE41-NEXT: pextrb $3, %xmm1, %ebx +; SSE41-NEXT: movl %ebx, %eax +; SSE41-NEXT: subb %cl, %al +; SSE41-NEXT: setns %al +; SSE41-NEXT: subb %cl, %bl +; SSE41-NEXT: jno .LBB2_28 +; SSE41-NEXT: # %bb.27: +; SSE41-NEXT: addb $127, %al +; SSE41-NEXT: movl %eax, %ebx +; SSE41-NEXT: .LBB2_28: +; SSE41-NEXT: pextrb $2, %xmm5, %ecx +; SSE41-NEXT: pextrb $2, %xmm1, %ebp +; SSE41-NEXT: movl %ebp, %eax +; SSE41-NEXT: subb %cl, %al +; SSE41-NEXT: setns %al +; SSE41-NEXT: subb %cl, %bpl +; SSE41-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE41-NEXT: jno .LBB2_30 +; SSE41-NEXT: # %bb.29: +; SSE41-NEXT: addb $127, %al +; SSE41-NEXT: movl %eax, %ebp +; SSE41-NEXT: .LBB2_30: +; SSE41-NEXT: pextrb $0, %xmm5, %ecx +; SSE41-NEXT: pextrb $0, %xmm1, %esi +; SSE41-NEXT: movl %esi, %eax +; SSE41-NEXT: subb %cl, %al +; SSE41-NEXT: setns %al +; SSE41-NEXT: subb %cl, %sil +; SSE41-NEXT: movl %edx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE41-NEXT: jno .LBB2_32 +; SSE41-NEXT: # %bb.31: +; SSE41-NEXT: addb $127, %al +; SSE41-NEXT: movl %eax, %esi +; SSE41-NEXT: .LBB2_32: +; SSE41-NEXT: pextrb $1, %xmm5, %ecx +; SSE41-NEXT: pextrb $1, %xmm1, %edx +; SSE41-NEXT: movl %edx, %eax +; SSE41-NEXT: subb %cl, %al +; SSE41-NEXT: setns %al +; SSE41-NEXT: subb %cl, %dl +; SSE41-NEXT: movl %ebx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE41-NEXT: jno .LBB2_34 +; SSE41-NEXT: # %bb.33: +; SSE41-NEXT: addb $127, %al +; SSE41-NEXT: movl %eax, %edx +; SSE41-NEXT: .LBB2_34: +; SSE41-NEXT: pextrb $15, %xmm6, %ecx +; SSE41-NEXT: pextrb $15, %xmm2, %r9d +; SSE41-NEXT: movl %r9d, %eax +; SSE41-NEXT: subb %cl, %al +; SSE41-NEXT: setns %al +; SSE41-NEXT: subb %cl, %r9b +; SSE41-NEXT: movl %ebp, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE41-NEXT: jno .LBB2_36 
+; SSE41-NEXT: # %bb.35: +; SSE41-NEXT: addb $127, %al +; SSE41-NEXT: movl %eax, %r9d +; SSE41-NEXT: .LBB2_36: +; SSE41-NEXT: pextrb $14, %xmm6, %ecx +; SSE41-NEXT: pextrb $14, %xmm2, %ebp +; SSE41-NEXT: movl %ebp, %eax +; SSE41-NEXT: subb %cl, %al +; SSE41-NEXT: setns %al +; SSE41-NEXT: subb %cl, %bpl +; SSE41-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE41-NEXT: jno .LBB2_38 +; SSE41-NEXT: # %bb.37: +; SSE41-NEXT: addb $127, %al +; SSE41-NEXT: movl %eax, %ebp +; SSE41-NEXT: .LBB2_38: +; SSE41-NEXT: pextrb $13, %xmm6, %ecx +; SSE41-NEXT: pextrb $13, %xmm2, %r14d +; SSE41-NEXT: movl %r14d, %eax +; SSE41-NEXT: subb %cl, %al +; SSE41-NEXT: setns %al +; SSE41-NEXT: subb %cl, %r14b +; SSE41-NEXT: movl %edx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE41-NEXT: jno .LBB2_40 +; SSE41-NEXT: # %bb.39: +; SSE41-NEXT: addb $127, %al +; SSE41-NEXT: movl %eax, %r14d +; SSE41-NEXT: .LBB2_40: +; SSE41-NEXT: pextrb $12, %xmm6, %ecx +; SSE41-NEXT: pextrb $12, %xmm2, %r10d +; SSE41-NEXT: movl %r10d, %eax +; SSE41-NEXT: subb %cl, %al +; SSE41-NEXT: setns %al +; SSE41-NEXT: subb %cl, %r10b +; SSE41-NEXT: jno .LBB2_42 +; SSE41-NEXT: # %bb.41: +; SSE41-NEXT: addb $127, %al +; SSE41-NEXT: movl %eax, %r10d +; SSE41-NEXT: .LBB2_42: +; SSE41-NEXT: pextrb $11, %xmm6, %ecx +; SSE41-NEXT: pextrb $11, %xmm2, %ebx +; SSE41-NEXT: movl %ebx, %eax +; SSE41-NEXT: subb %cl, %al +; SSE41-NEXT: setns %al +; SSE41-NEXT: subb %cl, %bl +; SSE41-NEXT: movl %ebp, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE41-NEXT: jno .LBB2_44 +; SSE41-NEXT: # %bb.43: +; SSE41-NEXT: addb $127, %al +; SSE41-NEXT: movl %eax, %ebx +; SSE41-NEXT: .LBB2_44: +; SSE41-NEXT: pextrb $10, %xmm6, %ecx +; SSE41-NEXT: pextrb $10, %xmm2, %ebp +; SSE41-NEXT: movl %ebp, %eax +; SSE41-NEXT: subb %cl, %al +; SSE41-NEXT: setns %al +; SSE41-NEXT: subb %cl, %bpl +; SSE41-NEXT: jno .LBB2_46 +; SSE41-NEXT: # %bb.45: +; SSE41-NEXT: addb $127, %al +; SSE41-NEXT: movl %eax, %ebp +; SSE41-NEXT: .LBB2_46: +; SSE41-NEXT: pextrb $9, %xmm6, %ecx +; SSE41-NEXT: pextrb $9, %xmm2, %edi +; SSE41-NEXT: movl %edi, %eax +; SSE41-NEXT: subb %cl, %al +; SSE41-NEXT: setns %al +; SSE41-NEXT: subb %cl, %dil +; SSE41-NEXT: jno .LBB2_48 +; SSE41-NEXT: # %bb.47: +; SSE41-NEXT: addb $127, %al +; SSE41-NEXT: movl %eax, %edi +; SSE41-NEXT: .LBB2_48: +; SSE41-NEXT: pextrb $8, %xmm6, %ecx +; SSE41-NEXT: pextrb $8, %xmm2, %edx +; SSE41-NEXT: movl %edx, %eax +; SSE41-NEXT: subb %cl, %al +; SSE41-NEXT: setns %al +; SSE41-NEXT: subb %cl, %dl +; SSE41-NEXT: jno .LBB2_50 +; SSE41-NEXT: # %bb.49: +; SSE41-NEXT: addb $127, %al +; SSE41-NEXT: movl %eax, %edx +; SSE41-NEXT: .LBB2_50: +; SSE41-NEXT: pextrb $7, %xmm6, %ecx +; SSE41-NEXT: pextrb $7, %xmm2, %esi +; SSE41-NEXT: movl %esi, %eax +; SSE41-NEXT: subb %cl, %al +; SSE41-NEXT: setns %al +; SSE41-NEXT: subb %cl, %sil +; SSE41-NEXT: movl %r13d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE41-NEXT: jno .LBB2_52 +; SSE41-NEXT: # %bb.51: +; SSE41-NEXT: addb $127, %al +; SSE41-NEXT: movl %eax, %esi +; SSE41-NEXT: .LBB2_52: +; SSE41-NEXT: pextrb $6, %xmm6, %ecx +; SSE41-NEXT: pextrb $6, %xmm2, %r8d +; SSE41-NEXT: movl %r8d, %eax +; SSE41-NEXT: subb %cl, %al +; SSE41-NEXT: setns %al +; SSE41-NEXT: subb %cl, %r8b +; SSE41-NEXT: jno .LBB2_54 +; SSE41-NEXT: # %bb.53: +; SSE41-NEXT: addb $127, %al +; SSE41-NEXT: movl %eax, %r8d +; SSE41-NEXT: .LBB2_54: +; SSE41-NEXT: pextrb $5, %xmm6, %ecx +; SSE41-NEXT: pextrb $5, %xmm2, %r11d +; SSE41-NEXT: movl %r11d, %eax +; SSE41-NEXT: subb %cl, %al +; SSE41-NEXT: setns %al +; SSE41-NEXT: subb %cl, 
%r11b +; SSE41-NEXT: movl %ebx, (%rsp) # 4-byte Spill +; SSE41-NEXT: movl %ebp, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE41-NEXT: movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE41-NEXT: movl %edx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE41-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE41-NEXT: jno .LBB2_56 +; SSE41-NEXT: # %bb.55: +; SSE41-NEXT: addb $127, %al +; SSE41-NEXT: movl %eax, %r11d +; SSE41-NEXT: .LBB2_56: +; SSE41-NEXT: pextrb $4, %xmm6, %ecx +; SSE41-NEXT: pextrb $4, %xmm2, %edx +; SSE41-NEXT: movl %edx, %eax +; SSE41-NEXT: subb %cl, %al +; SSE41-NEXT: setns %al +; SSE41-NEXT: subb %cl, %dl +; SSE41-NEXT: movl %r14d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE41-NEXT: jno .LBB2_58 +; SSE41-NEXT: # %bb.57: +; SSE41-NEXT: addb $127, %al +; SSE41-NEXT: movl %eax, %edx +; SSE41-NEXT: .LBB2_58: +; SSE41-NEXT: pextrb $3, %xmm6, %ecx +; SSE41-NEXT: pextrb $3, %xmm2, %esi +; SSE41-NEXT: movl %esi, %eax +; SSE41-NEXT: subb %cl, %al +; SSE41-NEXT: setns %al +; SSE41-NEXT: subb %cl, %sil +; SSE41-NEXT: movl %r10d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE41-NEXT: movl %edx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE41-NEXT: jno .LBB2_60 +; SSE41-NEXT: # %bb.59: +; SSE41-NEXT: addb $127, %al +; SSE41-NEXT: movl %eax, %esi +; SSE41-NEXT: .LBB2_60: +; SSE41-NEXT: pextrb $2, %xmm6, %ecx +; SSE41-NEXT: pextrb $2, %xmm2, %edx +; SSE41-NEXT: movl %edx, %eax +; SSE41-NEXT: subb %cl, %al +; SSE41-NEXT: setns %al +; SSE41-NEXT: subb %cl, %dl +; SSE41-NEXT: movl %r9d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE41-NEXT: jno .LBB2_62 +; SSE41-NEXT: # %bb.61: +; SSE41-NEXT: addb $127, %al +; SSE41-NEXT: movl %eax, %edx +; SSE41-NEXT: .LBB2_62: +; SSE41-NEXT: pextrb $0, %xmm6, %ecx +; SSE41-NEXT: pextrb $0, %xmm2, %edi +; SSE41-NEXT: movl %edi, %eax +; SSE41-NEXT: subb %cl, %al +; SSE41-NEXT: setns %al +; SSE41-NEXT: subb %cl, %dil +; SSE41-NEXT: jno .LBB2_64 +; SSE41-NEXT: # %bb.63: +; SSE41-NEXT: addb $127, %al +; SSE41-NEXT: movl %eax, %edi +; SSE41-NEXT: .LBB2_64: +; SSE41-NEXT: pextrb $1, %xmm6, %ecx +; SSE41-NEXT: pextrb $1, %xmm2, %ebx +; SSE41-NEXT: movl %ebx, %eax +; SSE41-NEXT: subb %cl, %al +; SSE41-NEXT: setns %al +; SSE41-NEXT: subb %cl, %bl +; SSE41-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE41-NEXT: jno .LBB2_66 +; SSE41-NEXT: # %bb.65: +; SSE41-NEXT: addb $127, %al +; SSE41-NEXT: movl %eax, %ebx +; SSE41-NEXT: .LBB2_66: +; SSE41-NEXT: pextrb $15, %xmm7, %ecx +; SSE41-NEXT: pextrb $15, %xmm3, %esi +; SSE41-NEXT: movl %esi, %eax +; SSE41-NEXT: subb %cl, %al +; SSE41-NEXT: setns %al +; SSE41-NEXT: subb %cl, %sil +; SSE41-NEXT: movl %edx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE41-NEXT: jno .LBB2_68 +; SSE41-NEXT: # %bb.67: +; SSE41-NEXT: addb $127, %al +; SSE41-NEXT: movl %eax, %esi +; SSE41-NEXT: .LBB2_68: +; SSE41-NEXT: pextrb $14, %xmm7, %ecx +; SSE41-NEXT: pextrb $14, %xmm3, %edx +; SSE41-NEXT: movl %edx, %eax +; SSE41-NEXT: subb %cl, %al +; SSE41-NEXT: setns %al +; SSE41-NEXT: subb %cl, %dl +; SSE41-NEXT: movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE41-NEXT: jno .LBB2_70 +; SSE41-NEXT: # %bb.69: +; SSE41-NEXT: addb $127, %al +; SSE41-NEXT: movl %eax, %edx +; SSE41-NEXT: .LBB2_70: +; SSE41-NEXT: pextrb $13, %xmm7, %ecx +; SSE41-NEXT: pextrb $13, %xmm3, %edi +; SSE41-NEXT: movl %edi, %eax +; SSE41-NEXT: subb %cl, %al +; SSE41-NEXT: setns %al +; SSE41-NEXT: subb %cl, %dil +; SSE41-NEXT: movl %ebx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE41-NEXT: jno .LBB2_72 +; SSE41-NEXT: # %bb.71: +; 
SSE41-NEXT: addb $127, %al +; SSE41-NEXT: movl %eax, %edi +; SSE41-NEXT: .LBB2_72: +; SSE41-NEXT: pextrb $12, %xmm7, %ecx +; SSE41-NEXT: pextrb $12, %xmm3, %r15d +; SSE41-NEXT: movl %r15d, %eax +; SSE41-NEXT: subb %cl, %al +; SSE41-NEXT: setns %al +; SSE41-NEXT: subb %cl, %r15b +; SSE41-NEXT: movl %r15d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE41-NEXT: movl %edx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE41-NEXT: jno .LBB2_74 +; SSE41-NEXT: # %bb.73: +; SSE41-NEXT: addb $127, %al +; SSE41-NEXT: # kill: def $al killed $al def $eax +; SSE41-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE41-NEXT: .LBB2_74: +; SSE41-NEXT: pextrb $11, %xmm7, %ecx +; SSE41-NEXT: pextrb $11, %xmm3, %edx +; SSE41-NEXT: movl %edx, %eax +; SSE41-NEXT: subb %cl, %al +; SSE41-NEXT: setns %al +; SSE41-NEXT: subb %cl, %dl +; SSE41-NEXT: jno .LBB2_76 +; SSE41-NEXT: # %bb.75: +; SSE41-NEXT: addb $127, %al +; SSE41-NEXT: movl %eax, %edx +; SSE41-NEXT: .LBB2_76: +; SSE41-NEXT: pextrb $10, %xmm7, %ecx +; SSE41-NEXT: pextrb $10, %xmm3, %ebp +; SSE41-NEXT: movl %ebp, %eax +; SSE41-NEXT: subb %cl, %al +; SSE41-NEXT: setns %al +; SSE41-NEXT: subb %cl, %bpl +; SSE41-NEXT: movl %r11d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE41-NEXT: jno .LBB2_78 +; SSE41-NEXT: # %bb.77: +; SSE41-NEXT: addb $127, %al +; SSE41-NEXT: movl %eax, %ebp +; SSE41-NEXT: .LBB2_78: +; SSE41-NEXT: pextrb $9, %xmm7, %ecx +; SSE41-NEXT: pextrb $9, %xmm3, %r11d +; SSE41-NEXT: movl %r11d, %eax +; SSE41-NEXT: subb %cl, %al +; SSE41-NEXT: setns %al +; SSE41-NEXT: subb %cl, %r11b +; SSE41-NEXT: movl %r8d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE41-NEXT: jno .LBB2_80 +; SSE41-NEXT: # %bb.79: +; SSE41-NEXT: addb $127, %al +; SSE41-NEXT: movl %eax, %r11d +; SSE41-NEXT: .LBB2_80: +; SSE41-NEXT: pextrb $8, %xmm7, %ecx +; SSE41-NEXT: pextrb $8, %xmm3, %r8d +; SSE41-NEXT: movl %r8d, %eax +; SSE41-NEXT: subb %cl, %al +; SSE41-NEXT: setns %al +; SSE41-NEXT: subb %cl, %r8b +; SSE41-NEXT: movl %ebp, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE41-NEXT: jno .LBB2_82 +; SSE41-NEXT: # %bb.81: +; SSE41-NEXT: addb $127, %al +; SSE41-NEXT: movl %eax, %r8d +; SSE41-NEXT: .LBB2_82: +; SSE41-NEXT: pextrb $7, %xmm7, %ecx +; SSE41-NEXT: pextrb $7, %xmm3, %r9d +; SSE41-NEXT: movl %r9d, %eax +; SSE41-NEXT: subb %cl, %al +; SSE41-NEXT: setns %al +; SSE41-NEXT: subb %cl, %r9b +; SSE41-NEXT: jno .LBB2_84 +; SSE41-NEXT: # %bb.83: +; SSE41-NEXT: addb $127, %al +; SSE41-NEXT: movl %eax, %r9d +; SSE41-NEXT: .LBB2_84: +; SSE41-NEXT: pextrb $6, %xmm7, %ecx +; SSE41-NEXT: pextrb $6, %xmm3, %r10d +; SSE41-NEXT: movl %r10d, %eax +; SSE41-NEXT: subb %cl, %al +; SSE41-NEXT: setns %al +; SSE41-NEXT: subb %cl, %r10b +; SSE41-NEXT: jno .LBB2_86 +; SSE41-NEXT: # %bb.85: +; SSE41-NEXT: addb $127, %al +; SSE41-NEXT: movl %eax, %r10d +; SSE41-NEXT: .LBB2_86: +; SSE41-NEXT: pextrb $5, %xmm7, %ecx +; SSE41-NEXT: pextrb $5, %xmm3, %r14d +; SSE41-NEXT: movl %r14d, %eax +; SSE41-NEXT: subb %cl, %al +; SSE41-NEXT: setns %al +; SSE41-NEXT: subb %cl, %r14b +; SSE41-NEXT: jno .LBB2_88 +; SSE41-NEXT: # %bb.87: +; SSE41-NEXT: addb $127, %al +; SSE41-NEXT: movl %eax, %r14d +; SSE41-NEXT: .LBB2_88: +; SSE41-NEXT: pextrb $4, %xmm7, %ecx +; SSE41-NEXT: pextrb $4, %xmm3, %r12d +; SSE41-NEXT: movl %r12d, %eax +; SSE41-NEXT: subb %cl, %al +; SSE41-NEXT: setns %al +; SSE41-NEXT: subb %cl, %r12b +; SSE41-NEXT: movl %r12d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE41-NEXT: jno .LBB2_90 +; SSE41-NEXT: # %bb.89: +; SSE41-NEXT: addb $127, %al +; SSE41-NEXT: # kill: def $al killed $al def 
$eax +; SSE41-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE41-NEXT: .LBB2_90: +; SSE41-NEXT: pextrb $3, %xmm7, %ecx +; SSE41-NEXT: pextrb $3, %xmm3, %r13d +; SSE41-NEXT: movl %r13d, %eax +; SSE41-NEXT: subb %cl, %al +; SSE41-NEXT: setns %al +; SSE41-NEXT: subb %cl, %r13b +; SSE41-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE41-NEXT: jno .LBB2_92 +; SSE41-NEXT: # %bb.91: +; SSE41-NEXT: addb $127, %al +; SSE41-NEXT: movl %eax, %r13d +; SSE41-NEXT: .LBB2_92: +; SSE41-NEXT: pextrb $2, %xmm7, %ecx +; SSE41-NEXT: pextrb $2, %xmm3, %esi +; SSE41-NEXT: movl %esi, %eax +; SSE41-NEXT: subb %cl, %al +; SSE41-NEXT: setns %al +; SSE41-NEXT: subb %cl, %sil +; SSE41-NEXT: movl %edx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE41-NEXT: jno .LBB2_94 +; SSE41-NEXT: # %bb.93: +; SSE41-NEXT: addb $127, %al +; SSE41-NEXT: movl %eax, %esi +; SSE41-NEXT: .LBB2_94: +; SSE41-NEXT: pextrb $0, %xmm7, %ecx +; SSE41-NEXT: pextrb $0, %xmm3, %edx +; SSE41-NEXT: movl %edx, %eax +; SSE41-NEXT: subb %cl, %al +; SSE41-NEXT: setns %al +; SSE41-NEXT: movl %edi, %r15d +; SSE41-NEXT: subb %cl, %dl +; SSE41-NEXT: jno .LBB2_96 +; SSE41-NEXT: # %bb.95: +; SSE41-NEXT: addb $127, %al +; SSE41-NEXT: movl %eax, %edx +; SSE41-NEXT: .LBB2_96: +; SSE41-NEXT: pextrb $1, %xmm7, %ecx +; SSE41-NEXT: pextrb $1, %xmm3, %edi +; SSE41-NEXT: movl %edi, %eax +; SSE41-NEXT: subb %cl, %al +; SSE41-NEXT: setns %al +; SSE41-NEXT: subb %cl, %dil +; SSE41-NEXT: jno .LBB2_98 +; SSE41-NEXT: # %bb.97: +; SSE41-NEXT: addb $127, %al +; SSE41-NEXT: movl %eax, %edi +; SSE41-NEXT: .LBB2_98: +; SSE41-NEXT: pextrb $15, %xmm4, %ecx +; SSE41-NEXT: pextrb $15, %xmm0, %r12d +; SSE41-NEXT: movl %r12d, %eax +; SSE41-NEXT: subb %cl, %al +; SSE41-NEXT: setns %al +; SSE41-NEXT: subb %cl, %r12b +; SSE41-NEXT: movl %r12d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE41-NEXT: jno .LBB2_100 +; SSE41-NEXT: # %bb.99: +; SSE41-NEXT: addb $127, %al +; SSE41-NEXT: # kill: def $al killed $al def $eax +; SSE41-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE41-NEXT: .LBB2_100: +; SSE41-NEXT: pextrb $14, %xmm4, %ecx +; SSE41-NEXT: pextrb $14, %xmm0, %ebp +; SSE41-NEXT: movl %ebp, %eax +; SSE41-NEXT: subb %cl, %al +; SSE41-NEXT: setns %al +; SSE41-NEXT: subb %cl, %bpl +; SSE41-NEXT: jno .LBB2_102 +; SSE41-NEXT: # %bb.101: +; SSE41-NEXT: addb $127, %al +; SSE41-NEXT: movl %eax, %ebp +; SSE41-NEXT: .LBB2_102: +; SSE41-NEXT: pextrb $13, %xmm4, %ecx +; SSE41-NEXT: pextrb $13, %xmm0, %ebx +; SSE41-NEXT: movl %ebx, %eax +; SSE41-NEXT: subb %cl, %al +; SSE41-NEXT: setns %al +; SSE41-NEXT: subb %cl, %bl +; SSE41-NEXT: movl %r15d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE41-NEXT: movl %r13d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE41-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE41-NEXT: movl %edx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE41-NEXT: movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE41-NEXT: movl %ebp, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE41-NEXT: jo .LBB2_103 +; SSE41-NEXT: # %bb.104: +; SSE41-NEXT: movl %ebx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE41-NEXT: jmp .LBB2_105 +; SSE41-NEXT: .LBB2_103: +; SSE41-NEXT: addb $127, %al +; SSE41-NEXT: # kill: def $al killed $al def $eax +; SSE41-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE41-NEXT: .LBB2_105: +; SSE41-NEXT: pextrb $12, %xmm4, %ecx +; SSE41-NEXT: pextrb $12, %xmm0, %r12d +; SSE41-NEXT: movl %r12d, %eax +; SSE41-NEXT: subb %cl, %al +; SSE41-NEXT: setns %al +; SSE41-NEXT: subb %cl, %r12b +; 
SSE41-NEXT: jno .LBB2_107 +; SSE41-NEXT: # %bb.106: +; SSE41-NEXT: addb $127, %al +; SSE41-NEXT: movl %eax, %r12d +; SSE41-NEXT: .LBB2_107: +; SSE41-NEXT: pextrb $11, %xmm4, %ecx +; SSE41-NEXT: pextrb $11, %xmm0, %r13d +; SSE41-NEXT: movl %r13d, %eax +; SSE41-NEXT: subb %cl, %al +; SSE41-NEXT: setns %al +; SSE41-NEXT: subb %cl, %r13b +; SSE41-NEXT: movl %r11d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE41-NEXT: movl %r8d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE41-NEXT: movl %r9d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE41-NEXT: movl %r10d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE41-NEXT: movl %r14d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE41-NEXT: jno .LBB2_109 +; SSE41-NEXT: # %bb.108: +; SSE41-NEXT: addb $127, %al +; SSE41-NEXT: movl %eax, %r13d +; SSE41-NEXT: .LBB2_109: +; SSE41-NEXT: pextrb $10, %xmm4, %ecx +; SSE41-NEXT: pextrb $10, %xmm0, %r15d +; SSE41-NEXT: movl %r15d, %eax +; SSE41-NEXT: subb %cl, %al +; SSE41-NEXT: setns %al +; SSE41-NEXT: subb %cl, %r15b +; SSE41-NEXT: jno .LBB2_111 +; SSE41-NEXT: # %bb.110: +; SSE41-NEXT: addb $127, %al +; SSE41-NEXT: movl %eax, %r15d +; SSE41-NEXT: .LBB2_111: +; SSE41-NEXT: pextrb $9, %xmm4, %ecx +; SSE41-NEXT: pextrb $9, %xmm0, %r14d +; SSE41-NEXT: movl %r14d, %eax +; SSE41-NEXT: subb %cl, %al +; SSE41-NEXT: setns %al +; SSE41-NEXT: subb %cl, %r14b +; SSE41-NEXT: jno .LBB2_113 +; SSE41-NEXT: # %bb.112: +; SSE41-NEXT: addb $127, %al +; SSE41-NEXT: movl %eax, %r14d +; SSE41-NEXT: .LBB2_113: +; SSE41-NEXT: pextrb $8, %xmm4, %ecx +; SSE41-NEXT: pextrb $8, %xmm0, %ebp +; SSE41-NEXT: movl %ebp, %eax +; SSE41-NEXT: subb %cl, %al +; SSE41-NEXT: setns %al +; SSE41-NEXT: subb %cl, %bpl +; SSE41-NEXT: jno .LBB2_115 +; SSE41-NEXT: # %bb.114: +; SSE41-NEXT: addb $127, %al +; SSE41-NEXT: movl %eax, %ebp +; SSE41-NEXT: .LBB2_115: +; SSE41-NEXT: pextrb $7, %xmm4, %ecx +; SSE41-NEXT: pextrb $7, %xmm0, %edi +; SSE41-NEXT: movl %edi, %eax +; SSE41-NEXT: subb %cl, %al +; SSE41-NEXT: setns %al +; SSE41-NEXT: subb %cl, %dil +; SSE41-NEXT: jno .LBB2_117 +; SSE41-NEXT: # %bb.116: +; SSE41-NEXT: addb $127, %al +; SSE41-NEXT: movl %eax, %edi +; SSE41-NEXT: .LBB2_117: +; SSE41-NEXT: pextrb $6, %xmm4, %edx +; SSE41-NEXT: pextrb $6, %xmm0, %eax +; SSE41-NEXT: movl %eax, %ecx +; SSE41-NEXT: subb %dl, %cl +; SSE41-NEXT: setns %cl +; SSE41-NEXT: subb %dl, %al +; SSE41-NEXT: jno .LBB2_119 +; SSE41-NEXT: # %bb.118: +; SSE41-NEXT: addb $127, %cl +; SSE41-NEXT: movl %ecx, %eax +; SSE41-NEXT: .LBB2_119: +; SSE41-NEXT: pextrb $5, %xmm4, %ebx +; SSE41-NEXT: pextrb $5, %xmm0, %ecx +; SSE41-NEXT: movl %ecx, %edx +; SSE41-NEXT: subb %bl, %dl +; SSE41-NEXT: setns %dl +; SSE41-NEXT: subb %bl, %cl +; SSE41-NEXT: jno .LBB2_121 +; SSE41-NEXT: # %bb.120: +; SSE41-NEXT: addb $127, %dl +; SSE41-NEXT: movl %edx, %ecx +; SSE41-NEXT: .LBB2_121: +; SSE41-NEXT: pextrb $4, %xmm4, %esi +; SSE41-NEXT: pextrb $4, %xmm0, %edx +; SSE41-NEXT: movl %edx, %ebx +; SSE41-NEXT: subb %sil, %bl +; SSE41-NEXT: setns %bl +; SSE41-NEXT: subb %sil, %dl +; SSE41-NEXT: jno .LBB2_123 +; SSE41-NEXT: # %bb.122: +; SSE41-NEXT: addb $127, %bl +; SSE41-NEXT: movl %ebx, %edx +; SSE41-NEXT: .LBB2_123: +; SSE41-NEXT: pextrb $3, %xmm4, %esi +; SSE41-NEXT: pextrb $3, %xmm0, %r8d +; SSE41-NEXT: movl %r8d, %ebx +; SSE41-NEXT: subb %sil, %bl +; SSE41-NEXT: setns %bl +; SSE41-NEXT: subb %sil, %r8b +; SSE41-NEXT: jno .LBB2_125 +; SSE41-NEXT: # %bb.124: +; SSE41-NEXT: addb $127, %bl +; SSE41-NEXT: movl %ebx, %r8d +; SSE41-NEXT: .LBB2_125: +; SSE41-NEXT: pextrb $2, %xmm4, %esi +; SSE41-NEXT: pextrb $2, 
%xmm0, %r9d +; SSE41-NEXT: movl %r9d, %ebx +; SSE41-NEXT: subb %sil, %bl +; SSE41-NEXT: setns %bl +; SSE41-NEXT: subb %sil, %r9b +; SSE41-NEXT: jno .LBB2_127 +; SSE41-NEXT: # %bb.126: +; SSE41-NEXT: addb $127, %bl +; SSE41-NEXT: movl %ebx, %r9d +; SSE41-NEXT: .LBB2_127: +; SSE41-NEXT: pextrb $0, %xmm4, %esi +; SSE41-NEXT: pextrb $0, %xmm0, %r10d +; SSE41-NEXT: movl %r10d, %ebx +; SSE41-NEXT: subb %sil, %bl +; SSE41-NEXT: setns %bl +; SSE41-NEXT: subb %sil, %r10b +; SSE41-NEXT: jno .LBB2_129 +; SSE41-NEXT: # %bb.128: +; SSE41-NEXT: addb $127, %bl +; SSE41-NEXT: movl %ebx, %r10d +; SSE41-NEXT: .LBB2_129: +; SSE41-NEXT: pextrb $1, %xmm4, %esi +; SSE41-NEXT: pextrb $1, %xmm0, %r11d +; SSE41-NEXT: movl %r11d, %ebx +; SSE41-NEXT: subb %sil, %bl +; SSE41-NEXT: setns %bl +; SSE41-NEXT: subb %sil, %r11b +; SSE41-NEXT: jno .LBB2_131 +; SSE41-NEXT: # %bb.130: +; SSE41-NEXT: addb $127, %bl +; SSE41-NEXT: movl %ebx, %r11d +; SSE41-NEXT: .LBB2_131: +; SSE41-NEXT: movzbl %r10b, %esi +; SSE41-NEXT: movd %esi, %xmm0 +; SSE41-NEXT: movzbl %r11b, %esi +; SSE41-NEXT: pinsrb $1, %esi, %xmm0 +; SSE41-NEXT: movzbl %r9b, %esi +; SSE41-NEXT: pinsrb $2, %esi, %xmm0 +; SSE41-NEXT: movzbl %r8b, %esi +; SSE41-NEXT: pinsrb $3, %esi, %xmm0 +; SSE41-NEXT: movzbl %dl, %edx +; SSE41-NEXT: pinsrb $4, %edx, %xmm0 +; SSE41-NEXT: movzbl %cl, %ecx +; SSE41-NEXT: pinsrb $5, %ecx, %xmm0 +; SSE41-NEXT: movzbl %al, %eax +; SSE41-NEXT: pinsrb $6, %eax, %xmm0 +; SSE41-NEXT: movzbl %dil, %eax +; SSE41-NEXT: pinsrb $7, %eax, %xmm0 +; SSE41-NEXT: movzbl %bpl, %eax +; SSE41-NEXT: pinsrb $8, %eax, %xmm0 +; SSE41-NEXT: movzbl %r14b, %eax +; SSE41-NEXT: pinsrb $9, %eax, %xmm0 +; SSE41-NEXT: movzbl %r15b, %eax +; SSE41-NEXT: pinsrb $10, %eax, %xmm0 +; SSE41-NEXT: movzbl %r13b, %eax +; SSE41-NEXT: pinsrb $11, %eax, %xmm0 +; SSE41-NEXT: movzbl %r12b, %eax +; SSE41-NEXT: pinsrb $12, %eax, %xmm0 +; SSE41-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; SSE41-NEXT: pinsrb $13, %eax, %xmm0 +; SSE41-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; SSE41-NEXT: pinsrb $14, %eax, %xmm0 +; SSE41-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; SSE41-NEXT: pinsrb $15, %eax, %xmm0 +; SSE41-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; SSE41-NEXT: movd %eax, %xmm1 +; SSE41-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; SSE41-NEXT: pinsrb $1, %eax, %xmm1 +; SSE41-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; SSE41-NEXT: pinsrb $2, %eax, %xmm1 +; SSE41-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; SSE41-NEXT: pinsrb $3, %eax, %xmm1 +; SSE41-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; SSE41-NEXT: pinsrb $4, %eax, %xmm1 +; SSE41-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; SSE41-NEXT: pinsrb $5, %eax, %xmm1 +; SSE41-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; SSE41-NEXT: pinsrb $6, %eax, %xmm1 +; SSE41-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; SSE41-NEXT: pinsrb $7, %eax, %xmm1 +; SSE41-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; SSE41-NEXT: pinsrb $8, %eax, %xmm1 +; SSE41-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; SSE41-NEXT: pinsrb $9, %eax, %xmm1 +; SSE41-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; SSE41-NEXT: pinsrb $10, %eax, %xmm1 +; SSE41-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; 
SSE41-NEXT: pinsrb $11, %eax, %xmm1 +; SSE41-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; SSE41-NEXT: pinsrb $12, %eax, %xmm1 +; SSE41-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; SSE41-NEXT: pinsrb $13, %eax, %xmm1 +; SSE41-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; SSE41-NEXT: pinsrb $14, %eax, %xmm1 +; SSE41-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; SSE41-NEXT: pinsrb $15, %eax, %xmm1 +; SSE41-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; SSE41-NEXT: movd %eax, %xmm2 +; SSE41-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; SSE41-NEXT: pinsrb $1, %eax, %xmm2 +; SSE41-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; SSE41-NEXT: pinsrb $2, %eax, %xmm2 +; SSE41-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; SSE41-NEXT: pinsrb $3, %eax, %xmm2 +; SSE41-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; SSE41-NEXT: pinsrb $4, %eax, %xmm2 +; SSE41-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; SSE41-NEXT: pinsrb $5, %eax, %xmm2 +; SSE41-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; SSE41-NEXT: pinsrb $6, %eax, %xmm2 +; SSE41-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; SSE41-NEXT: pinsrb $7, %eax, %xmm2 +; SSE41-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; SSE41-NEXT: pinsrb $8, %eax, %xmm2 +; SSE41-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; SSE41-NEXT: pinsrb $9, %eax, %xmm2 +; SSE41-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; SSE41-NEXT: pinsrb $10, %eax, %xmm2 +; SSE41-NEXT: movzbl (%rsp), %eax # 1-byte Folded Reload +; SSE41-NEXT: pinsrb $11, %eax, %xmm2 +; SSE41-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; SSE41-NEXT: pinsrb $12, %eax, %xmm2 +; SSE41-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; SSE41-NEXT: pinsrb $13, %eax, %xmm2 +; SSE41-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; SSE41-NEXT: pinsrb $14, %eax, %xmm2 +; SSE41-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; SSE41-NEXT: pinsrb $15, %eax, %xmm2 +; SSE41-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; SSE41-NEXT: movd %eax, %xmm3 +; SSE41-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; SSE41-NEXT: pinsrb $1, %eax, %xmm3 +; SSE41-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; SSE41-NEXT: pinsrb $2, %eax, %xmm3 +; SSE41-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; SSE41-NEXT: pinsrb $3, %eax, %xmm3 +; SSE41-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; SSE41-NEXT: pinsrb $4, %eax, %xmm3 +; SSE41-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; SSE41-NEXT: pinsrb $5, %eax, %xmm3 +; SSE41-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; SSE41-NEXT: pinsrb $6, %eax, %xmm3 +; SSE41-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; SSE41-NEXT: pinsrb $7, %eax, %xmm3 +; SSE41-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; SSE41-NEXT: pinsrb $8, %eax, %xmm3 +; SSE41-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; SSE41-NEXT: pinsrb $9, %eax, %xmm3 +; SSE41-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; SSE41-NEXT: pinsrb $10, %eax, %xmm3 +; 
SSE41-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; SSE41-NEXT: pinsrb $11, %eax, %xmm3 +; SSE41-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; SSE41-NEXT: pinsrb $12, %eax, %xmm3 +; SSE41-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; SSE41-NEXT: pinsrb $13, %eax, %xmm3 +; SSE41-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; SSE41-NEXT: pinsrb $14, %eax, %xmm3 +; SSE41-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; SSE41-NEXT: pinsrb $15, %eax, %xmm3 +; SSE41-NEXT: addq $76, %rsp +; SSE41-NEXT: popq %rbx +; SSE41-NEXT: popq %r12 +; SSE41-NEXT: popq %r13 +; SSE41-NEXT: popq %r14 +; SSE41-NEXT: popq %r15 +; SSE41-NEXT: popq %rbp +; SSE41-NEXT: retq +; +; AVX1-LABEL: v64i8: +; AVX1: # %bb.0: +; AVX1-NEXT: pushq %rbp +; AVX1-NEXT: pushq %r15 +; AVX1-NEXT: pushq %r14 +; AVX1-NEXT: pushq %r13 +; AVX1-NEXT: pushq %r12 +; AVX1-NEXT: pushq %rbx +; AVX1-NEXT: subq $76, %rsp +; AVX1-NEXT: vpextrb $15, %xmm3, %ecx +; AVX1-NEXT: vpextrb $15, %xmm1, %edx +; AVX1-NEXT: movl %edx, %eax +; AVX1-NEXT: subb %cl, %al +; AVX1-NEXT: setns %al +; AVX1-NEXT: subb %cl, %dl +; AVX1-NEXT: jo .LBB2_1 +; AVX1-NEXT: # %bb.2: +; AVX1-NEXT: movl %edx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX1-NEXT: jmp .LBB2_3 +; AVX1-NEXT: .LBB2_1: +; AVX1-NEXT: addb $127, %al +; AVX1-NEXT: # kill: def $al killed $al def $eax +; AVX1-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX1-NEXT: .LBB2_3: +; AVX1-NEXT: vpextrb $14, %xmm3, %ecx +; AVX1-NEXT: vpextrb $14, %xmm1, %edx +; AVX1-NEXT: movl %edx, %eax +; AVX1-NEXT: subb %cl, %al +; AVX1-NEXT: setns %al +; AVX1-NEXT: subb %cl, %dl +; AVX1-NEXT: jno .LBB2_5 +; AVX1-NEXT: # %bb.4: +; AVX1-NEXT: addb $127, %al +; AVX1-NEXT: movl %eax, %edx +; AVX1-NEXT: .LBB2_5: +; AVX1-NEXT: vpextrb $13, %xmm3, %ecx +; AVX1-NEXT: vpextrb $13, %xmm1, %esi +; AVX1-NEXT: movl %esi, %eax +; AVX1-NEXT: subb %cl, %al +; AVX1-NEXT: setns %al +; AVX1-NEXT: subb %cl, %sil +; AVX1-NEXT: jo .LBB2_6 +; AVX1-NEXT: # %bb.7: +; AVX1-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX1-NEXT: jmp .LBB2_8 +; AVX1-NEXT: .LBB2_6: +; AVX1-NEXT: addb $127, %al +; AVX1-NEXT: # kill: def $al killed $al def $eax +; AVX1-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX1-NEXT: .LBB2_8: +; AVX1-NEXT: vpextrb $12, %xmm3, %ecx +; AVX1-NEXT: vpextrb $12, %xmm1, %esi +; AVX1-NEXT: movl %esi, %eax +; AVX1-NEXT: subb %cl, %al +; AVX1-NEXT: setns %al +; AVX1-NEXT: subb %cl, %sil +; AVX1-NEXT: jno .LBB2_10 +; AVX1-NEXT: # %bb.9: +; AVX1-NEXT: addb $127, %al +; AVX1-NEXT: movl %eax, %esi +; AVX1-NEXT: .LBB2_10: +; AVX1-NEXT: vpextrb $11, %xmm3, %ecx +; AVX1-NEXT: vpextrb $11, %xmm1, %edi +; AVX1-NEXT: movl %edi, %eax +; AVX1-NEXT: subb %cl, %al +; AVX1-NEXT: setns %al +; AVX1-NEXT: subb %cl, %dil +; AVX1-NEXT: jno .LBB2_12 +; AVX1-NEXT: # %bb.11: +; AVX1-NEXT: addb $127, %al +; AVX1-NEXT: movl %eax, %edi +; AVX1-NEXT: .LBB2_12: +; AVX1-NEXT: vpextrb $10, %xmm3, %ecx +; AVX1-NEXT: vpextrb $10, %xmm1, %ebp +; AVX1-NEXT: movl %ebp, %eax +; AVX1-NEXT: subb %cl, %al +; AVX1-NEXT: setns %al +; AVX1-NEXT: subb %cl, %bpl +; AVX1-NEXT: jno .LBB2_14 +; AVX1-NEXT: # %bb.13: +; AVX1-NEXT: addb $127, %al +; AVX1-NEXT: movl %eax, %ebp +; AVX1-NEXT: .LBB2_14: +; AVX1-NEXT: vpextrb $9, %xmm3, %ecx +; AVX1-NEXT: vpextrb $9, %xmm1, %ebx +; AVX1-NEXT: movl %ebx, %eax +; AVX1-NEXT: subb %cl, %al +; AVX1-NEXT: setns %al +; AVX1-NEXT: subb %cl, %bl +; AVX1-NEXT: movl %ebp, {{[-0-9]+}}(%r{{[sb]}}p) # 
4-byte Spill +; AVX1-NEXT: jo .LBB2_15 +; AVX1-NEXT: # %bb.16: +; AVX1-NEXT: movl %ebx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX1-NEXT: jmp .LBB2_17 +; AVX1-NEXT: .LBB2_15: +; AVX1-NEXT: addb $127, %al +; AVX1-NEXT: # kill: def $al killed $al def $eax +; AVX1-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX1-NEXT: .LBB2_17: +; AVX1-NEXT: vpextrb $8, %xmm3, %ecx +; AVX1-NEXT: vpextrb $8, %xmm1, %ebp +; AVX1-NEXT: movl %ebp, %eax +; AVX1-NEXT: subb %cl, %al +; AVX1-NEXT: setns %al +; AVX1-NEXT: subb %cl, %bpl +; AVX1-NEXT: jno .LBB2_19 +; AVX1-NEXT: # %bb.18: +; AVX1-NEXT: addb $127, %al +; AVX1-NEXT: movl %eax, %ebp +; AVX1-NEXT: .LBB2_19: +; AVX1-NEXT: vpextrb $7, %xmm3, %ecx +; AVX1-NEXT: vpextrb $7, %xmm1, %ebx +; AVX1-NEXT: movl %ebx, %eax +; AVX1-NEXT: subb %cl, %al +; AVX1-NEXT: setns %al +; AVX1-NEXT: subb %cl, %bl +; AVX1-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX1-NEXT: jno .LBB2_21 +; AVX1-NEXT: # %bb.20: +; AVX1-NEXT: addb $127, %al +; AVX1-NEXT: movl %eax, %ebx +; AVX1-NEXT: .LBB2_21: +; AVX1-NEXT: vpextrb $6, %xmm3, %ecx +; AVX1-NEXT: vpextrb $6, %xmm1, %esi +; AVX1-NEXT: movl %esi, %eax +; AVX1-NEXT: subb %cl, %al +; AVX1-NEXT: setns %al +; AVX1-NEXT: subb %cl, %sil +; AVX1-NEXT: movl %edx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX1-NEXT: jno .LBB2_23 +; AVX1-NEXT: # %bb.22: +; AVX1-NEXT: addb $127, %al +; AVX1-NEXT: movl %eax, %esi +; AVX1-NEXT: .LBB2_23: +; AVX1-NEXT: vpextrb $5, %xmm3, %ecx +; AVX1-NEXT: vpextrb $5, %xmm1, %r11d +; AVX1-NEXT: movl %r11d, %eax +; AVX1-NEXT: subb %cl, %al +; AVX1-NEXT: setns %al +; AVX1-NEXT: subb %cl, %r11b +; AVX1-NEXT: movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX1-NEXT: movl %ebp, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX1-NEXT: movl %ebx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX1-NEXT: jno .LBB2_25 +; AVX1-NEXT: # %bb.24: +; AVX1-NEXT: addb $127, %al +; AVX1-NEXT: movl %eax, %r11d +; AVX1-NEXT: .LBB2_25: +; AVX1-NEXT: vpextrb $4, %xmm3, %ecx +; AVX1-NEXT: vpextrb $4, %xmm1, %r13d +; AVX1-NEXT: movl %r13d, %eax +; AVX1-NEXT: subb %cl, %al +; AVX1-NEXT: setns %al +; AVX1-NEXT: subb %cl, %r13b +; AVX1-NEXT: jno .LBB2_27 +; AVX1-NEXT: # %bb.26: +; AVX1-NEXT: addb $127, %al +; AVX1-NEXT: movl %eax, %r13d +; AVX1-NEXT: .LBB2_27: +; AVX1-NEXT: vpextrb $3, %xmm3, %ecx +; AVX1-NEXT: vpextrb $3, %xmm1, %r8d +; AVX1-NEXT: movl %r8d, %eax +; AVX1-NEXT: subb %cl, %al +; AVX1-NEXT: setns %al +; AVX1-NEXT: subb %cl, %r8b +; AVX1-NEXT: jno .LBB2_29 +; AVX1-NEXT: # %bb.28: +; AVX1-NEXT: addb $127, %al +; AVX1-NEXT: movl %eax, %r8d +; AVX1-NEXT: .LBB2_29: +; AVX1-NEXT: vpextrb $2, %xmm3, %ecx +; AVX1-NEXT: vpextrb $2, %xmm1, %ebp +; AVX1-NEXT: movl %ebp, %eax +; AVX1-NEXT: subb %cl, %al +; AVX1-NEXT: setns %al +; AVX1-NEXT: subb %cl, %bpl +; AVX1-NEXT: jno .LBB2_31 +; AVX1-NEXT: # %bb.30: +; AVX1-NEXT: addb $127, %al +; AVX1-NEXT: movl %eax, %ebp +; AVX1-NEXT: .LBB2_31: +; AVX1-NEXT: vpextrb $0, %xmm3, %ecx +; AVX1-NEXT: vpextrb $0, %xmm1, %ebx +; AVX1-NEXT: movl %ebx, %eax +; AVX1-NEXT: subb %cl, %al +; AVX1-NEXT: setns %al +; AVX1-NEXT: subb %cl, %bl +; AVX1-NEXT: jno .LBB2_33 +; AVX1-NEXT: # %bb.32: +; AVX1-NEXT: addb $127, %al +; AVX1-NEXT: movl %eax, %ebx +; AVX1-NEXT: .LBB2_33: +; AVX1-NEXT: vpextrb $1, %xmm3, %ecx +; AVX1-NEXT: vpextrb $1, %xmm1, %edi +; AVX1-NEXT: movl %edi, %eax +; AVX1-NEXT: subb %cl, %al +; AVX1-NEXT: setns %al +; AVX1-NEXT: subb %cl, %dil +; AVX1-NEXT: jno .LBB2_35 +; AVX1-NEXT: # %bb.34: +; AVX1-NEXT: addb $127, %al +; AVX1-NEXT: movl %eax, %edi +; 
AVX1-NEXT: .LBB2_35: +; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm3 +; AVX1-NEXT: vpextrb $15, %xmm3, %ecx +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 +; AVX1-NEXT: vpextrb $15, %xmm1, %edx +; AVX1-NEXT: movl %edx, %eax +; AVX1-NEXT: subb %cl, %al +; AVX1-NEXT: setns %al +; AVX1-NEXT: subb %cl, %dl +; AVX1-NEXT: movl %ebp, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX1-NEXT: movl %ebx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX1-NEXT: movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX1-NEXT: jno .LBB2_37 +; AVX1-NEXT: # %bb.36: +; AVX1-NEXT: addb $127, %al +; AVX1-NEXT: movl %eax, %edx +; AVX1-NEXT: .LBB2_37: +; AVX1-NEXT: vpextrb $14, %xmm3, %ecx +; AVX1-NEXT: vpextrb $14, %xmm1, %edi +; AVX1-NEXT: movl %edi, %eax +; AVX1-NEXT: subb %cl, %al +; AVX1-NEXT: setns %al +; AVX1-NEXT: subb %cl, %dil +; AVX1-NEXT: movl %edx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX1-NEXT: jno .LBB2_39 +; AVX1-NEXT: # %bb.38: +; AVX1-NEXT: addb $127, %al +; AVX1-NEXT: movl %eax, %edi +; AVX1-NEXT: .LBB2_39: +; AVX1-NEXT: vpextrb $13, %xmm3, %ecx +; AVX1-NEXT: vpextrb $13, %xmm1, %r12d +; AVX1-NEXT: movl %r12d, %eax +; AVX1-NEXT: subb %cl, %al +; AVX1-NEXT: setns %al +; AVX1-NEXT: subb %cl, %r12b +; AVX1-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX1-NEXT: jno .LBB2_41 +; AVX1-NEXT: # %bb.40: +; AVX1-NEXT: addb $127, %al +; AVX1-NEXT: movl %eax, %r12d +; AVX1-NEXT: .LBB2_41: +; AVX1-NEXT: vpextrb $12, %xmm3, %ecx +; AVX1-NEXT: vpextrb $12, %xmm1, %r15d +; AVX1-NEXT: movl %r15d, %eax +; AVX1-NEXT: subb %cl, %al +; AVX1-NEXT: setns %al +; AVX1-NEXT: subb %cl, %r15b +; AVX1-NEXT: movl %r11d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX1-NEXT: jno .LBB2_43 +; AVX1-NEXT: # %bb.42: +; AVX1-NEXT: addb $127, %al +; AVX1-NEXT: movl %eax, %r15d +; AVX1-NEXT: .LBB2_43: +; AVX1-NEXT: vpextrb $11, %xmm3, %ecx +; AVX1-NEXT: vpextrb $11, %xmm1, %ebp +; AVX1-NEXT: movl %ebp, %eax +; AVX1-NEXT: subb %cl, %al +; AVX1-NEXT: setns %al +; AVX1-NEXT: subb %cl, %bpl +; AVX1-NEXT: movl %r13d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX1-NEXT: jno .LBB2_45 +; AVX1-NEXT: # %bb.44: +; AVX1-NEXT: addb $127, %al +; AVX1-NEXT: movl %eax, %ebp +; AVX1-NEXT: .LBB2_45: +; AVX1-NEXT: vpextrb $10, %xmm3, %ecx +; AVX1-NEXT: vpextrb $10, %xmm1, %ebx +; AVX1-NEXT: movl %ebx, %eax +; AVX1-NEXT: subb %cl, %al +; AVX1-NEXT: setns %al +; AVX1-NEXT: subb %cl, %bl +; AVX1-NEXT: jno .LBB2_47 +; AVX1-NEXT: # %bb.46: +; AVX1-NEXT: addb $127, %al +; AVX1-NEXT: movl %eax, %ebx +; AVX1-NEXT: .LBB2_47: +; AVX1-NEXT: vpextrb $9, %xmm3, %ecx +; AVX1-NEXT: vpextrb $9, %xmm1, %edx +; AVX1-NEXT: movl %edx, %eax +; AVX1-NEXT: subb %cl, %al +; AVX1-NEXT: setns %al +; AVX1-NEXT: subb %cl, %dl +; AVX1-NEXT: jno .LBB2_49 +; AVX1-NEXT: # %bb.48: +; AVX1-NEXT: addb $127, %al +; AVX1-NEXT: movl %eax, %edx +; AVX1-NEXT: .LBB2_49: +; AVX1-NEXT: vpextrb $8, %xmm3, %ecx +; AVX1-NEXT: vpextrb $8, %xmm1, %esi +; AVX1-NEXT: movl %esi, %eax +; AVX1-NEXT: subb %cl, %al +; AVX1-NEXT: setns %al +; AVX1-NEXT: subb %cl, %sil +; AVX1-NEXT: movl %ebp, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX1-NEXT: movl %ebx, (%rsp) # 4-byte Spill +; AVX1-NEXT: movl %edx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX1-NEXT: jo .LBB2_50 +; AVX1-NEXT: # %bb.51: +; AVX1-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX1-NEXT: jmp .LBB2_52 +; AVX1-NEXT: .LBB2_50: +; AVX1-NEXT: addb $127, %al +; AVX1-NEXT: # kill: def $al killed $al def $eax +; AVX1-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX1-NEXT: .LBB2_52: +; AVX1-NEXT: vpextrb $7, 
%xmm3, %ecx +; AVX1-NEXT: vpextrb $7, %xmm1, %r11d +; AVX1-NEXT: movl %r11d, %eax +; AVX1-NEXT: subb %cl, %al +; AVX1-NEXT: setns %al +; AVX1-NEXT: subb %cl, %r11b +; AVX1-NEXT: jno .LBB2_54 +; AVX1-NEXT: # %bb.53: +; AVX1-NEXT: addb $127, %al +; AVX1-NEXT: movl %eax, %r11d +; AVX1-NEXT: .LBB2_54: +; AVX1-NEXT: vpextrb $6, %xmm3, %ecx +; AVX1-NEXT: vpextrb $6, %xmm1, %esi +; AVX1-NEXT: movl %esi, %eax +; AVX1-NEXT: subb %cl, %al +; AVX1-NEXT: setns %al +; AVX1-NEXT: subb %cl, %sil +; AVX1-NEXT: movl %r8d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX1-NEXT: movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX1-NEXT: jno .LBB2_56 +; AVX1-NEXT: # %bb.55: +; AVX1-NEXT: addb $127, %al +; AVX1-NEXT: movl %eax, %esi +; AVX1-NEXT: .LBB2_56: +; AVX1-NEXT: vpextrb $5, %xmm3, %ecx +; AVX1-NEXT: vpextrb $5, %xmm1, %edi +; AVX1-NEXT: movl %edi, %eax +; AVX1-NEXT: subb %cl, %al +; AVX1-NEXT: setns %al +; AVX1-NEXT: subb %cl, %dil +; AVX1-NEXT: jno .LBB2_58 +; AVX1-NEXT: # %bb.57: +; AVX1-NEXT: addb $127, %al +; AVX1-NEXT: movl %eax, %edi +; AVX1-NEXT: .LBB2_58: +; AVX1-NEXT: vpextrb $4, %xmm3, %ecx +; AVX1-NEXT: vpextrb $4, %xmm1, %r13d +; AVX1-NEXT: movl %r13d, %eax +; AVX1-NEXT: subb %cl, %al +; AVX1-NEXT: setns %al +; AVX1-NEXT: subb %cl, %r13b +; AVX1-NEXT: movl %r13d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX1-NEXT: jno .LBB2_60 +; AVX1-NEXT: # %bb.59: +; AVX1-NEXT: addb $127, %al +; AVX1-NEXT: # kill: def $al killed $al def $eax +; AVX1-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX1-NEXT: .LBB2_60: +; AVX1-NEXT: vpextrb $3, %xmm3, %ecx +; AVX1-NEXT: vpextrb $3, %xmm1, %ebp +; AVX1-NEXT: movl %ebp, %eax +; AVX1-NEXT: subb %cl, %al +; AVX1-NEXT: setns %al +; AVX1-NEXT: subb %cl, %bpl +; AVX1-NEXT: jo .LBB2_61 +; AVX1-NEXT: # %bb.62: +; AVX1-NEXT: movl %ebp, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX1-NEXT: jmp .LBB2_63 +; AVX1-NEXT: .LBB2_61: +; AVX1-NEXT: addb $127, %al +; AVX1-NEXT: # kill: def $al killed $al def $eax +; AVX1-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX1-NEXT: .LBB2_63: +; AVX1-NEXT: vpextrb $2, %xmm3, %ecx +; AVX1-NEXT: vpextrb $2, %xmm1, %ebp +; AVX1-NEXT: movl %ebp, %eax +; AVX1-NEXT: subb %cl, %al +; AVX1-NEXT: setns %al +; AVX1-NEXT: subb %cl, %bpl +; AVX1-NEXT: movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX1-NEXT: jno .LBB2_65 +; AVX1-NEXT: # %bb.64: +; AVX1-NEXT: addb $127, %al +; AVX1-NEXT: movl %eax, %ebp +; AVX1-NEXT: .LBB2_65: +; AVX1-NEXT: vpextrb $0, %xmm3, %ecx +; AVX1-NEXT: vpextrb $0, %xmm1, %edi +; AVX1-NEXT: movl %edi, %eax +; AVX1-NEXT: subb %cl, %al +; AVX1-NEXT: setns %al +; AVX1-NEXT: subb %cl, %dil +; AVX1-NEXT: movl %r15d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX1-NEXT: jno .LBB2_67 +; AVX1-NEXT: # %bb.66: +; AVX1-NEXT: addb $127, %al +; AVX1-NEXT: movl %eax, %edi +; AVX1-NEXT: .LBB2_67: +; AVX1-NEXT: vpextrb $1, %xmm3, %ecx +; AVX1-NEXT: vpextrb $1, %xmm1, %ebx +; AVX1-NEXT: movl %ebx, %eax +; AVX1-NEXT: subb %cl, %al +; AVX1-NEXT: setns %al +; AVX1-NEXT: subb %cl, %bl +; AVX1-NEXT: movl %r12d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX1-NEXT: jno .LBB2_69 +; AVX1-NEXT: # %bb.68: +; AVX1-NEXT: addb $127, %al +; AVX1-NEXT: movl %eax, %ebx +; AVX1-NEXT: .LBB2_69: +; AVX1-NEXT: vpextrb $15, %xmm2, %ecx +; AVX1-NEXT: vpextrb $15, %xmm0, %edx +; AVX1-NEXT: movl %edx, %eax +; AVX1-NEXT: subb %cl, %al +; AVX1-NEXT: setns %al +; AVX1-NEXT: subb %cl, %dl +; AVX1-NEXT: movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX1-NEXT: jno .LBB2_71 +; AVX1-NEXT: # %bb.70: +; AVX1-NEXT: addb $127, 
%al +; AVX1-NEXT: movl %eax, %edx +; AVX1-NEXT: .LBB2_71: +; AVX1-NEXT: vpextrb $14, %xmm2, %ecx +; AVX1-NEXT: vpextrb $14, %xmm0, %edi +; AVX1-NEXT: movl %edi, %eax +; AVX1-NEXT: subb %cl, %al +; AVX1-NEXT: setns %al +; AVX1-NEXT: subb %cl, %dil +; AVX1-NEXT: jno .LBB2_73 +; AVX1-NEXT: # %bb.72: +; AVX1-NEXT: addb $127, %al +; AVX1-NEXT: movl %eax, %edi +; AVX1-NEXT: .LBB2_73: +; AVX1-NEXT: vpextrb $13, %xmm2, %ecx +; AVX1-NEXT: vpextrb $13, %xmm0, %r10d +; AVX1-NEXT: movl %r10d, %eax +; AVX1-NEXT: subb %cl, %al +; AVX1-NEXT: setns %al +; AVX1-NEXT: subb %cl, %r10b +; AVX1-NEXT: jno .LBB2_75 +; AVX1-NEXT: # %bb.74: +; AVX1-NEXT: addb $127, %al +; AVX1-NEXT: movl %eax, %r10d +; AVX1-NEXT: .LBB2_75: +; AVX1-NEXT: vpextrb $12, %xmm2, %ecx +; AVX1-NEXT: vpextrb $12, %xmm0, %r12d +; AVX1-NEXT: movl %r12d, %eax +; AVX1-NEXT: subb %cl, %al +; AVX1-NEXT: setns %al +; AVX1-NEXT: subb %cl, %r12b +; AVX1-NEXT: movl %r12d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX1-NEXT: jno .LBB2_77 +; AVX1-NEXT: # %bb.76: +; AVX1-NEXT: addb $127, %al +; AVX1-NEXT: # kill: def $al killed $al def $eax +; AVX1-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX1-NEXT: .LBB2_77: +; AVX1-NEXT: vpextrb $11, %xmm2, %ecx +; AVX1-NEXT: vpextrb $11, %xmm0, %r14d +; AVX1-NEXT: movl %r14d, %eax +; AVX1-NEXT: subb %cl, %al +; AVX1-NEXT: setns %al +; AVX1-NEXT: subb %cl, %r14b +; AVX1-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX1-NEXT: jno .LBB2_79 +; AVX1-NEXT: # %bb.78: +; AVX1-NEXT: addb $127, %al +; AVX1-NEXT: movl %eax, %r14d +; AVX1-NEXT: .LBB2_79: +; AVX1-NEXT: vpextrb $10, %xmm2, %ecx +; AVX1-NEXT: vpextrb $10, %xmm0, %r13d +; AVX1-NEXT: movl %r13d, %eax +; AVX1-NEXT: subb %cl, %al +; AVX1-NEXT: setns %al +; AVX1-NEXT: subb %cl, %r13b +; AVX1-NEXT: movl %r11d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX1-NEXT: movl %ebp, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX1-NEXT: movl %ebx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX1-NEXT: movl %edx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX1-NEXT: jno .LBB2_81 +; AVX1-NEXT: # %bb.80: +; AVX1-NEXT: addb $127, %al +; AVX1-NEXT: movl %eax, %r13d +; AVX1-NEXT: .LBB2_81: +; AVX1-NEXT: vpextrb $9, %xmm2, %ecx +; AVX1-NEXT: vpextrb $9, %xmm0, %r8d +; AVX1-NEXT: movl %r8d, %eax +; AVX1-NEXT: subb %cl, %al +; AVX1-NEXT: setns %al +; AVX1-NEXT: subb %cl, %r8b +; AVX1-NEXT: jno .LBB2_83 +; AVX1-NEXT: # %bb.82: +; AVX1-NEXT: addb $127, %al +; AVX1-NEXT: movl %eax, %r8d +; AVX1-NEXT: .LBB2_83: +; AVX1-NEXT: vpextrb $8, %xmm2, %ecx +; AVX1-NEXT: vpextrb $8, %xmm0, %r15d +; AVX1-NEXT: movl %r15d, %eax +; AVX1-NEXT: subb %cl, %al +; AVX1-NEXT: setns %al +; AVX1-NEXT: subb %cl, %r15b +; AVX1-NEXT: jno .LBB2_85 +; AVX1-NEXT: # %bb.84: +; AVX1-NEXT: addb $127, %al +; AVX1-NEXT: movl %eax, %r15d +; AVX1-NEXT: .LBB2_85: +; AVX1-NEXT: vpextrb $7, %xmm2, %ecx +; AVX1-NEXT: vpextrb $7, %xmm0, %r12d +; AVX1-NEXT: movl %r12d, %eax +; AVX1-NEXT: subb %cl, %al +; AVX1-NEXT: setns %al +; AVX1-NEXT: subb %cl, %r12b +; AVX1-NEXT: movl %r12d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX1-NEXT: jno .LBB2_87 +; AVX1-NEXT: # %bb.86: +; AVX1-NEXT: addb $127, %al +; AVX1-NEXT: # kill: def $al killed $al def $eax +; AVX1-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX1-NEXT: .LBB2_87: +; AVX1-NEXT: vpextrb $6, %xmm2, %ecx +; AVX1-NEXT: vpextrb $6, %xmm0, %r12d +; AVX1-NEXT: movl %r12d, %eax +; AVX1-NEXT: subb %cl, %al +; AVX1-NEXT: setns %al +; AVX1-NEXT: subb %cl, %r12b +; AVX1-NEXT: jno .LBB2_89 +; AVX1-NEXT: # %bb.88: +; AVX1-NEXT: addb $127, 
%al +; AVX1-NEXT: movl %eax, %r12d +; AVX1-NEXT: .LBB2_89: +; AVX1-NEXT: vpextrb $5, %xmm2, %ecx +; AVX1-NEXT: vpextrb $5, %xmm0, %ebp +; AVX1-NEXT: movl %ebp, %eax +; AVX1-NEXT: subb %cl, %al +; AVX1-NEXT: setns %al +; AVX1-NEXT: subb %cl, %bpl +; AVX1-NEXT: jno .LBB2_91 +; AVX1-NEXT: # %bb.90: +; AVX1-NEXT: addb $127, %al +; AVX1-NEXT: movl %eax, %ebp +; AVX1-NEXT: .LBB2_91: +; AVX1-NEXT: vpextrb $4, %xmm2, %ecx +; AVX1-NEXT: vpextrb $4, %xmm0, %esi +; AVX1-NEXT: movl %esi, %eax +; AVX1-NEXT: subb %cl, %al +; AVX1-NEXT: setns %al +; AVX1-NEXT: subb %cl, %sil +; AVX1-NEXT: jno .LBB2_93 +; AVX1-NEXT: # %bb.92: +; AVX1-NEXT: addb $127, %al +; AVX1-NEXT: movl %eax, %esi +; AVX1-NEXT: .LBB2_93: +; AVX1-NEXT: vpextrb $3, %xmm2, %ecx +; AVX1-NEXT: vpextrb $3, %xmm0, %edx +; AVX1-NEXT: movl %edx, %eax +; AVX1-NEXT: subb %cl, %al +; AVX1-NEXT: setns %al +; AVX1-NEXT: subb %cl, %dl +; AVX1-NEXT: movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX1-NEXT: jno .LBB2_95 +; AVX1-NEXT: # %bb.94: +; AVX1-NEXT: addb $127, %al +; AVX1-NEXT: movl %eax, %edx +; AVX1-NEXT: .LBB2_95: +; AVX1-NEXT: vpextrb $2, %xmm2, %ecx +; AVX1-NEXT: vpextrb $2, %xmm0, %edi +; AVX1-NEXT: movl %edi, %eax +; AVX1-NEXT: subb %cl, %al +; AVX1-NEXT: setns %al +; AVX1-NEXT: subb %cl, %dil +; AVX1-NEXT: jno .LBB2_97 +; AVX1-NEXT: # %bb.96: +; AVX1-NEXT: addb $127, %al +; AVX1-NEXT: movl %eax, %edi +; AVX1-NEXT: .LBB2_97: +; AVX1-NEXT: vpextrb $0, %xmm2, %ecx +; AVX1-NEXT: vpextrb $0, %xmm0, %ebx +; AVX1-NEXT: movl %ebx, %eax +; AVX1-NEXT: subb %cl, %al +; AVX1-NEXT: setns %al +; AVX1-NEXT: subb %cl, %bl +; AVX1-NEXT: jno .LBB2_99 +; AVX1-NEXT: # %bb.98: +; AVX1-NEXT: addb $127, %al +; AVX1-NEXT: movl %eax, %ebx +; AVX1-NEXT: .LBB2_99: +; AVX1-NEXT: vpextrb $1, %xmm2, %ecx +; AVX1-NEXT: vpextrb $1, %xmm0, %r11d +; AVX1-NEXT: movl %r11d, %eax +; AVX1-NEXT: subb %cl, %al +; AVX1-NEXT: setns %al +; AVX1-NEXT: subb %cl, %r11b +; AVX1-NEXT: movl %r11d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX1-NEXT: jno .LBB2_101 +; AVX1-NEXT: # %bb.100: +; AVX1-NEXT: addb $127, %al +; AVX1-NEXT: # kill: def $al killed $al def $eax +; AVX1-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX1-NEXT: .LBB2_101: +; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm1 +; AVX1-NEXT: vpextrb $15, %xmm1, %ecx +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX1-NEXT: vpextrb $15, %xmm0, %r11d +; AVX1-NEXT: movl %r11d, %eax +; AVX1-NEXT: subb %cl, %al +; AVX1-NEXT: setns %al +; AVX1-NEXT: subb %cl, %r11b +; AVX1-NEXT: jno .LBB2_103 +; AVX1-NEXT: # %bb.102: +; AVX1-NEXT: addb $127, %al +; AVX1-NEXT: movl %eax, %r11d +; AVX1-NEXT: .LBB2_103: +; AVX1-NEXT: vpextrb $14, %xmm1, %ecx +; AVX1-NEXT: vpextrb $14, %xmm0, %r9d +; AVX1-NEXT: movl %r9d, %eax +; AVX1-NEXT: subb %cl, %al +; AVX1-NEXT: setns %al +; AVX1-NEXT: subb %cl, %r9b +; AVX1-NEXT: movl %r9d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX1-NEXT: jno .LBB2_105 +; AVX1-NEXT: # %bb.104: +; AVX1-NEXT: addb $127, %al +; AVX1-NEXT: # kill: def $al killed $al def $eax +; AVX1-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX1-NEXT: .LBB2_105: +; AVX1-NEXT: vpextrb $13, %xmm1, %ecx +; AVX1-NEXT: vpextrb $13, %xmm0, %r9d +; AVX1-NEXT: movl %r9d, %eax +; AVX1-NEXT: subb %cl, %al +; AVX1-NEXT: setns %al +; AVX1-NEXT: subb %cl, %r9b +; AVX1-NEXT: movl %r14d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX1-NEXT: movl %r13d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX1-NEXT: movl %r8d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX1-NEXT: movl %r15d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; 
AVX1-NEXT: movl %r12d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX1-NEXT: movl %ebp, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX1-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX1-NEXT: movl %edx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX1-NEXT: movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX1-NEXT: movl %ebx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX1-NEXT: jno .LBB2_107 +; AVX1-NEXT: # %bb.106: +; AVX1-NEXT: addb $127, %al +; AVX1-NEXT: movl %eax, %r9d +; AVX1-NEXT: .LBB2_107: +; AVX1-NEXT: vpextrb $12, %xmm1, %ecx +; AVX1-NEXT: vpextrb $12, %xmm0, %edx +; AVX1-NEXT: movl %edx, %eax +; AVX1-NEXT: subb %cl, %al +; AVX1-NEXT: setns %al +; AVX1-NEXT: subb %cl, %dl +; AVX1-NEXT: movl %r10d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX1-NEXT: jno .LBB2_109 +; AVX1-NEXT: # %bb.108: +; AVX1-NEXT: addb $127, %al +; AVX1-NEXT: movl %eax, %edx +; AVX1-NEXT: .LBB2_109: +; AVX1-NEXT: vpextrb $11, %xmm1, %ecx +; AVX1-NEXT: vpextrb $11, %xmm0, %r13d +; AVX1-NEXT: movl %r13d, %eax +; AVX1-NEXT: subb %cl, %al +; AVX1-NEXT: setns %al +; AVX1-NEXT: subb %cl, %r13b +; AVX1-NEXT: jno .LBB2_111 +; AVX1-NEXT: # %bb.110: +; AVX1-NEXT: addb $127, %al +; AVX1-NEXT: movl %eax, %r13d +; AVX1-NEXT: .LBB2_111: +; AVX1-NEXT: vpextrb $10, %xmm1, %ecx +; AVX1-NEXT: vpextrb $10, %xmm0, %r15d +; AVX1-NEXT: movl %r15d, %eax +; AVX1-NEXT: subb %cl, %al +; AVX1-NEXT: setns %al +; AVX1-NEXT: subb %cl, %r15b +; AVX1-NEXT: jno .LBB2_113 +; AVX1-NEXT: # %bb.112: +; AVX1-NEXT: addb $127, %al +; AVX1-NEXT: movl %eax, %r15d +; AVX1-NEXT: .LBB2_113: +; AVX1-NEXT: vpextrb $9, %xmm1, %ecx +; AVX1-NEXT: vpextrb $9, %xmm0, %r14d +; AVX1-NEXT: movl %r14d, %eax +; AVX1-NEXT: subb %cl, %al +; AVX1-NEXT: setns %al +; AVX1-NEXT: subb %cl, %r14b +; AVX1-NEXT: jno .LBB2_115 +; AVX1-NEXT: # %bb.114: +; AVX1-NEXT: addb $127, %al +; AVX1-NEXT: movl %eax, %r14d +; AVX1-NEXT: .LBB2_115: +; AVX1-NEXT: vpextrb $8, %xmm1, %ecx +; AVX1-NEXT: vpextrb $8, %xmm0, %ebp +; AVX1-NEXT: movl %ebp, %eax +; AVX1-NEXT: subb %cl, %al +; AVX1-NEXT: setns %al +; AVX1-NEXT: subb %cl, %bpl +; AVX1-NEXT: jno .LBB2_117 +; AVX1-NEXT: # %bb.116: +; AVX1-NEXT: addb $127, %al +; AVX1-NEXT: movl %eax, %ebp +; AVX1-NEXT: .LBB2_117: +; AVX1-NEXT: vpextrb $7, %xmm1, %ecx +; AVX1-NEXT: vpextrb $7, %xmm0, %edi +; AVX1-NEXT: movl %edi, %eax +; AVX1-NEXT: subb %cl, %al +; AVX1-NEXT: setns %al +; AVX1-NEXT: subb %cl, %dil +; AVX1-NEXT: movl %edx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX1-NEXT: jno .LBB2_119 +; AVX1-NEXT: # %bb.118: +; AVX1-NEXT: addb $127, %al +; AVX1-NEXT: movl %eax, %edi +; AVX1-NEXT: .LBB2_119: +; AVX1-NEXT: vpextrb $6, %xmm1, %edx +; AVX1-NEXT: vpextrb $6, %xmm0, %eax +; AVX1-NEXT: movl %eax, %ecx +; AVX1-NEXT: subb %dl, %cl +; AVX1-NEXT: setns %cl +; AVX1-NEXT: subb %dl, %al +; AVX1-NEXT: jno .LBB2_121 +; AVX1-NEXT: # %bb.120: +; AVX1-NEXT: addb $127, %cl +; AVX1-NEXT: movl %ecx, %eax +; AVX1-NEXT: .LBB2_121: +; AVX1-NEXT: vpextrb $5, %xmm1, %ebx +; AVX1-NEXT: vpextrb $5, %xmm0, %ecx +; AVX1-NEXT: movl %ecx, %edx +; AVX1-NEXT: subb %bl, %dl +; AVX1-NEXT: setns %dl +; AVX1-NEXT: subb %bl, %cl +; AVX1-NEXT: jno .LBB2_123 +; AVX1-NEXT: # %bb.122: +; AVX1-NEXT: addb $127, %dl +; AVX1-NEXT: movl %edx, %ecx +; AVX1-NEXT: .LBB2_123: +; AVX1-NEXT: vpextrb $4, %xmm1, %esi +; AVX1-NEXT: vpextrb $4, %xmm0, %edx +; AVX1-NEXT: movl %edx, %ebx +; AVX1-NEXT: subb %sil, %bl +; AVX1-NEXT: setns %bl +; AVX1-NEXT: subb %sil, %dl +; AVX1-NEXT: jno .LBB2_125 +; AVX1-NEXT: # %bb.124: +; AVX1-NEXT: addb $127, %bl +; AVX1-NEXT: movl 
%ebx, %edx +; AVX1-NEXT: .LBB2_125: +; AVX1-NEXT: vpextrb $3, %xmm1, %esi +; AVX1-NEXT: vpextrb $3, %xmm0, %r8d +; AVX1-NEXT: movl %r8d, %ebx +; AVX1-NEXT: subb %sil, %bl +; AVX1-NEXT: setns %bl +; AVX1-NEXT: subb %sil, %r8b +; AVX1-NEXT: movl %r9d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX1-NEXT: jno .LBB2_127 +; AVX1-NEXT: # %bb.126: +; AVX1-NEXT: addb $127, %bl +; AVX1-NEXT: movl %ebx, %r8d +; AVX1-NEXT: .LBB2_127: +; AVX1-NEXT: vpextrb $2, %xmm1, %esi +; AVX1-NEXT: vpextrb $2, %xmm0, %r9d +; AVX1-NEXT: movl %r9d, %ebx +; AVX1-NEXT: subb %sil, %bl +; AVX1-NEXT: setns %bl +; AVX1-NEXT: subb %sil, %r9b +; AVX1-NEXT: jno .LBB2_129 +; AVX1-NEXT: # %bb.128: +; AVX1-NEXT: addb $127, %bl +; AVX1-NEXT: movl %ebx, %r9d +; AVX1-NEXT: .LBB2_129: +; AVX1-NEXT: vpextrb $0, %xmm1, %esi +; AVX1-NEXT: vpextrb $0, %xmm0, %r10d +; AVX1-NEXT: movl %r10d, %ebx +; AVX1-NEXT: subb %sil, %bl +; AVX1-NEXT: setns %bl +; AVX1-NEXT: movl %r11d, %r12d +; AVX1-NEXT: subb %sil, %r10b +; AVX1-NEXT: jno .LBB2_131 +; AVX1-NEXT: # %bb.130: +; AVX1-NEXT: addb $127, %bl +; AVX1-NEXT: movl %ebx, %r10d +; AVX1-NEXT: .LBB2_131: +; AVX1-NEXT: vpextrb $1, %xmm1, %esi +; AVX1-NEXT: vpextrb $1, %xmm0, %r11d +; AVX1-NEXT: movl %r11d, %ebx +; AVX1-NEXT: subb %sil, %bl +; AVX1-NEXT: setns %bl +; AVX1-NEXT: subb %sil, %r11b +; AVX1-NEXT: jno .LBB2_133 +; AVX1-NEXT: # %bb.132: +; AVX1-NEXT: addb $127, %bl +; AVX1-NEXT: movl %ebx, %r11d +; AVX1-NEXT: .LBB2_133: +; AVX1-NEXT: movzbl %r10b, %esi +; AVX1-NEXT: vmovd %esi, %xmm0 +; AVX1-NEXT: movzbl %r11b, %esi +; AVX1-NEXT: vpinsrb $1, %esi, %xmm0, %xmm0 +; AVX1-NEXT: movzbl %r9b, %esi +; AVX1-NEXT: vpinsrb $2, %esi, %xmm0, %xmm0 +; AVX1-NEXT: movzbl %r8b, %esi +; AVX1-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; AVX1-NEXT: movzbl %dl, %edx +; AVX1-NEXT: vpinsrb $4, %edx, %xmm0, %xmm0 +; AVX1-NEXT: movzbl %cl, %ecx +; AVX1-NEXT: vpinsrb $5, %ecx, %xmm0, %xmm0 +; AVX1-NEXT: movzbl %al, %eax +; AVX1-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 +; AVX1-NEXT: movzbl %dil, %eax +; AVX1-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; AVX1-NEXT: movzbl %bpl, %eax +; AVX1-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0 +; AVX1-NEXT: movzbl %r14b, %eax +; AVX1-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0 +; AVX1-NEXT: movzbl %r15b, %eax +; AVX1-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0 +; AVX1-NEXT: movzbl %r13b, %eax +; AVX1-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; AVX1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX1-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0 +; AVX1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX1-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0 +; AVX1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX1-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0 +; AVX1-NEXT: movzbl %r12b, %eax +; AVX1-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; AVX1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX1-NEXT: vmovd %eax, %xmm1 +; AVX1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX1-NEXT: vpinsrb $1, %eax, %xmm1, %xmm1 +; AVX1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX1-NEXT: vpinsrb $2, %eax, %xmm1, %xmm1 +; AVX1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX1-NEXT: vpinsrb $3, %eax, %xmm1, %xmm1 +; AVX1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX1-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 +; AVX1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX1-NEXT: vpinsrb $5, %eax, %xmm1, %xmm1 +; AVX1-NEXT: movzbl 
{{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX1-NEXT: vpinsrb $6, %eax, %xmm1, %xmm1 +; AVX1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX1-NEXT: vpinsrb $7, %eax, %xmm1, %xmm1 +; AVX1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX1-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; AVX1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX1-NEXT: vpinsrb $9, %eax, %xmm1, %xmm1 +; AVX1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX1-NEXT: vpinsrb $10, %eax, %xmm1, %xmm1 +; AVX1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX1-NEXT: vpinsrb $11, %eax, %xmm1, %xmm1 +; AVX1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX1-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 +; AVX1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX1-NEXT: vpinsrb $13, %eax, %xmm1, %xmm1 +; AVX1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX1-NEXT: vpinsrb $14, %eax, %xmm1, %xmm1 +; AVX1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX1-NEXT: vpinsrb $15, %eax, %xmm1, %xmm1 +; AVX1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX1-NEXT: vmovd %eax, %xmm2 +; AVX1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX1-NEXT: vpinsrb $1, %eax, %xmm2, %xmm2 +; AVX1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX1-NEXT: vpinsrb $2, %eax, %xmm2, %xmm2 +; AVX1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX1-NEXT: vpinsrb $3, %eax, %xmm2, %xmm2 +; AVX1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX1-NEXT: vpinsrb $4, %eax, %xmm2, %xmm2 +; AVX1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX1-NEXT: vpinsrb $5, %eax, %xmm2, %xmm2 +; AVX1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX1-NEXT: vpinsrb $6, %eax, %xmm2, %xmm2 +; AVX1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX1-NEXT: vpinsrb $7, %eax, %xmm2, %xmm2 +; AVX1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX1-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2 +; AVX1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX1-NEXT: vpinsrb $9, %eax, %xmm2, %xmm2 +; AVX1-NEXT: movzbl (%rsp), %eax # 1-byte Folded Reload +; AVX1-NEXT: vpinsrb $10, %eax, %xmm2, %xmm2 +; AVX1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX1-NEXT: vpinsrb $11, %eax, %xmm2, %xmm2 +; AVX1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX1-NEXT: vpinsrb $12, %eax, %xmm2, %xmm2 +; AVX1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX1-NEXT: vpinsrb $13, %eax, %xmm2, %xmm2 +; AVX1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX1-NEXT: vpinsrb $14, %eax, %xmm2, %xmm2 +; AVX1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX1-NEXT: vpinsrb $15, %eax, %xmm2, %xmm2 +; AVX1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX1-NEXT: vmovd %eax, %xmm3 +; AVX1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX1-NEXT: vpinsrb $1, %eax, %xmm3, %xmm3 +; AVX1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX1-NEXT: vpinsrb $2, %eax, %xmm3, %xmm3 +; AVX1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX1-NEXT: vpinsrb $3, %eax, %xmm3, %xmm3 +; 
AVX1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX1-NEXT: vpinsrb $4, %eax, %xmm3, %xmm3 +; AVX1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX1-NEXT: vpinsrb $5, %eax, %xmm3, %xmm3 +; AVX1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX1-NEXT: vpinsrb $6, %eax, %xmm3, %xmm3 +; AVX1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX1-NEXT: vpinsrb $7, %eax, %xmm3, %xmm3 +; AVX1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX1-NEXT: vpinsrb $8, %eax, %xmm3, %xmm3 +; AVX1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX1-NEXT: vpinsrb $9, %eax, %xmm3, %xmm3 +; AVX1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX1-NEXT: vpinsrb $10, %eax, %xmm3, %xmm3 +; AVX1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX1-NEXT: vpinsrb $11, %eax, %xmm3, %xmm3 +; AVX1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX1-NEXT: vpinsrb $12, %eax, %xmm3, %xmm3 +; AVX1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX1-NEXT: vpinsrb $13, %eax, %xmm3, %xmm3 +; AVX1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX1-NEXT: vpinsrb $14, %eax, %xmm3, %xmm3 +; AVX1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX1-NEXT: vpinsrb $15, %eax, %xmm3, %xmm3 +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm1 +; AVX1-NEXT: addq $76, %rsp +; AVX1-NEXT: popq %rbx +; AVX1-NEXT: popq %r12 +; AVX1-NEXT: popq %r13 +; AVX1-NEXT: popq %r14 +; AVX1-NEXT: popq %r15 +; AVX1-NEXT: popq %rbp +; AVX1-NEXT: retq +; +; AVX2-LABEL: v64i8: +; AVX2: # %bb.0: +; AVX2-NEXT: pushq %rbp +; AVX2-NEXT: pushq %r15 +; AVX2-NEXT: pushq %r14 +; AVX2-NEXT: pushq %r13 +; AVX2-NEXT: pushq %r12 +; AVX2-NEXT: pushq %rbx +; AVX2-NEXT: subq $76, %rsp +; AVX2-NEXT: vpextrb $15, %xmm3, %ecx +; AVX2-NEXT: vpextrb $15, %xmm1, %edx +; AVX2-NEXT: movl %edx, %eax +; AVX2-NEXT: subb %cl, %al +; AVX2-NEXT: setns %al +; AVX2-NEXT: subb %cl, %dl +; AVX2-NEXT: jo .LBB2_1 +; AVX2-NEXT: # %bb.2: +; AVX2-NEXT: movl %edx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX2-NEXT: jmp .LBB2_3 +; AVX2-NEXT: .LBB2_1: +; AVX2-NEXT: addb $127, %al +; AVX2-NEXT: # kill: def $al killed $al def $eax +; AVX2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX2-NEXT: .LBB2_3: +; AVX2-NEXT: vpextrb $14, %xmm3, %ecx +; AVX2-NEXT: vpextrb $14, %xmm1, %edx +; AVX2-NEXT: movl %edx, %eax +; AVX2-NEXT: subb %cl, %al +; AVX2-NEXT: setns %al +; AVX2-NEXT: subb %cl, %dl +; AVX2-NEXT: jno .LBB2_5 +; AVX2-NEXT: # %bb.4: +; AVX2-NEXT: addb $127, %al +; AVX2-NEXT: movl %eax, %edx +; AVX2-NEXT: .LBB2_5: +; AVX2-NEXT: vpextrb $13, %xmm3, %ecx +; AVX2-NEXT: vpextrb $13, %xmm1, %esi +; AVX2-NEXT: movl %esi, %eax +; AVX2-NEXT: subb %cl, %al +; AVX2-NEXT: setns %al +; AVX2-NEXT: subb %cl, %sil +; AVX2-NEXT: jo .LBB2_6 +; AVX2-NEXT: # %bb.7: +; AVX2-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX2-NEXT: jmp .LBB2_8 +; AVX2-NEXT: .LBB2_6: +; AVX2-NEXT: addb $127, %al +; AVX2-NEXT: # kill: def $al killed $al def $eax +; AVX2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX2-NEXT: .LBB2_8: +; AVX2-NEXT: vpextrb $12, %xmm3, %ecx +; AVX2-NEXT: vpextrb $12, %xmm1, %esi +; AVX2-NEXT: movl %esi, %eax +; AVX2-NEXT: subb %cl, %al +; AVX2-NEXT: setns %al +; AVX2-NEXT: subb %cl, %sil +; AVX2-NEXT: jno .LBB2_10 +; AVX2-NEXT: # %bb.9: +; 
AVX2-NEXT: addb $127, %al +; AVX2-NEXT: movl %eax, %esi +; AVX2-NEXT: .LBB2_10: +; AVX2-NEXT: vpextrb $11, %xmm3, %ecx +; AVX2-NEXT: vpextrb $11, %xmm1, %edi +; AVX2-NEXT: movl %edi, %eax +; AVX2-NEXT: subb %cl, %al +; AVX2-NEXT: setns %al +; AVX2-NEXT: subb %cl, %dil +; AVX2-NEXT: jno .LBB2_12 +; AVX2-NEXT: # %bb.11: +; AVX2-NEXT: addb $127, %al +; AVX2-NEXT: movl %eax, %edi +; AVX2-NEXT: .LBB2_12: +; AVX2-NEXT: vpextrb $10, %xmm3, %ecx +; AVX2-NEXT: vpextrb $10, %xmm1, %ebp +; AVX2-NEXT: movl %ebp, %eax +; AVX2-NEXT: subb %cl, %al +; AVX2-NEXT: setns %al +; AVX2-NEXT: subb %cl, %bpl +; AVX2-NEXT: jno .LBB2_14 +; AVX2-NEXT: # %bb.13: +; AVX2-NEXT: addb $127, %al +; AVX2-NEXT: movl %eax, %ebp +; AVX2-NEXT: .LBB2_14: +; AVX2-NEXT: vpextrb $9, %xmm3, %ecx +; AVX2-NEXT: vpextrb $9, %xmm1, %ebx +; AVX2-NEXT: movl %ebx, %eax +; AVX2-NEXT: subb %cl, %al +; AVX2-NEXT: setns %al +; AVX2-NEXT: subb %cl, %bl +; AVX2-NEXT: movl %ebp, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX2-NEXT: jo .LBB2_15 +; AVX2-NEXT: # %bb.16: +; AVX2-NEXT: movl %ebx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX2-NEXT: jmp .LBB2_17 +; AVX2-NEXT: .LBB2_15: +; AVX2-NEXT: addb $127, %al +; AVX2-NEXT: # kill: def $al killed $al def $eax +; AVX2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX2-NEXT: .LBB2_17: +; AVX2-NEXT: vpextrb $8, %xmm3, %ecx +; AVX2-NEXT: vpextrb $8, %xmm1, %ebp +; AVX2-NEXT: movl %ebp, %eax +; AVX2-NEXT: subb %cl, %al +; AVX2-NEXT: setns %al +; AVX2-NEXT: subb %cl, %bpl +; AVX2-NEXT: jno .LBB2_19 +; AVX2-NEXT: # %bb.18: +; AVX2-NEXT: addb $127, %al +; AVX2-NEXT: movl %eax, %ebp +; AVX2-NEXT: .LBB2_19: +; AVX2-NEXT: vpextrb $7, %xmm3, %ecx +; AVX2-NEXT: vpextrb $7, %xmm1, %ebx +; AVX2-NEXT: movl %ebx, %eax +; AVX2-NEXT: subb %cl, %al +; AVX2-NEXT: setns %al +; AVX2-NEXT: subb %cl, %bl +; AVX2-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX2-NEXT: jno .LBB2_21 +; AVX2-NEXT: # %bb.20: +; AVX2-NEXT: addb $127, %al +; AVX2-NEXT: movl %eax, %ebx +; AVX2-NEXT: .LBB2_21: +; AVX2-NEXT: vpextrb $6, %xmm3, %ecx +; AVX2-NEXT: vpextrb $6, %xmm1, %esi +; AVX2-NEXT: movl %esi, %eax +; AVX2-NEXT: subb %cl, %al +; AVX2-NEXT: setns %al +; AVX2-NEXT: subb %cl, %sil +; AVX2-NEXT: movl %edx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX2-NEXT: jno .LBB2_23 +; AVX2-NEXT: # %bb.22: +; AVX2-NEXT: addb $127, %al +; AVX2-NEXT: movl %eax, %esi +; AVX2-NEXT: .LBB2_23: +; AVX2-NEXT: vpextrb $5, %xmm3, %ecx +; AVX2-NEXT: vpextrb $5, %xmm1, %r11d +; AVX2-NEXT: movl %r11d, %eax +; AVX2-NEXT: subb %cl, %al +; AVX2-NEXT: setns %al +; AVX2-NEXT: subb %cl, %r11b +; AVX2-NEXT: movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX2-NEXT: movl %ebp, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX2-NEXT: movl %ebx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX2-NEXT: jno .LBB2_25 +; AVX2-NEXT: # %bb.24: +; AVX2-NEXT: addb $127, %al +; AVX2-NEXT: movl %eax, %r11d +; AVX2-NEXT: .LBB2_25: +; AVX2-NEXT: vpextrb $4, %xmm3, %ecx +; AVX2-NEXT: vpextrb $4, %xmm1, %r13d +; AVX2-NEXT: movl %r13d, %eax +; AVX2-NEXT: subb %cl, %al +; AVX2-NEXT: setns %al +; AVX2-NEXT: subb %cl, %r13b +; AVX2-NEXT: jno .LBB2_27 +; AVX2-NEXT: # %bb.26: +; AVX2-NEXT: addb $127, %al +; AVX2-NEXT: movl %eax, %r13d +; AVX2-NEXT: .LBB2_27: +; AVX2-NEXT: vpextrb $3, %xmm3, %ecx +; AVX2-NEXT: vpextrb $3, %xmm1, %r8d +; AVX2-NEXT: movl %r8d, %eax +; AVX2-NEXT: subb %cl, %al +; AVX2-NEXT: setns %al +; AVX2-NEXT: subb %cl, %r8b +; AVX2-NEXT: jno .LBB2_29 +; AVX2-NEXT: # %bb.28: +; AVX2-NEXT: addb $127, %al +; AVX2-NEXT: movl %eax, %r8d +; 
AVX2-NEXT: .LBB2_29: +; AVX2-NEXT: vpextrb $2, %xmm3, %ecx +; AVX2-NEXT: vpextrb $2, %xmm1, %ebp +; AVX2-NEXT: movl %ebp, %eax +; AVX2-NEXT: subb %cl, %al +; AVX2-NEXT: setns %al +; AVX2-NEXT: subb %cl, %bpl +; AVX2-NEXT: jno .LBB2_31 +; AVX2-NEXT: # %bb.30: +; AVX2-NEXT: addb $127, %al +; AVX2-NEXT: movl %eax, %ebp +; AVX2-NEXT: .LBB2_31: +; AVX2-NEXT: vpextrb $0, %xmm3, %ecx +; AVX2-NEXT: vpextrb $0, %xmm1, %ebx +; AVX2-NEXT: movl %ebx, %eax +; AVX2-NEXT: subb %cl, %al +; AVX2-NEXT: setns %al +; AVX2-NEXT: subb %cl, %bl +; AVX2-NEXT: jno .LBB2_33 +; AVX2-NEXT: # %bb.32: +; AVX2-NEXT: addb $127, %al +; AVX2-NEXT: movl %eax, %ebx +; AVX2-NEXT: .LBB2_33: +; AVX2-NEXT: vpextrb $1, %xmm3, %ecx +; AVX2-NEXT: vpextrb $1, %xmm1, %edi +; AVX2-NEXT: movl %edi, %eax +; AVX2-NEXT: subb %cl, %al +; AVX2-NEXT: setns %al +; AVX2-NEXT: subb %cl, %dil +; AVX2-NEXT: jno .LBB2_35 +; AVX2-NEXT: # %bb.34: +; AVX2-NEXT: addb $127, %al +; AVX2-NEXT: movl %eax, %edi +; AVX2-NEXT: .LBB2_35: +; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm3 +; AVX2-NEXT: vpextrb $15, %xmm3, %ecx +; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm1 +; AVX2-NEXT: vpextrb $15, %xmm1, %edx +; AVX2-NEXT: movl %edx, %eax +; AVX2-NEXT: subb %cl, %al +; AVX2-NEXT: setns %al +; AVX2-NEXT: subb %cl, %dl +; AVX2-NEXT: movl %ebp, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX2-NEXT: movl %ebx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX2-NEXT: movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX2-NEXT: jno .LBB2_37 +; AVX2-NEXT: # %bb.36: +; AVX2-NEXT: addb $127, %al +; AVX2-NEXT: movl %eax, %edx +; AVX2-NEXT: .LBB2_37: +; AVX2-NEXT: vpextrb $14, %xmm3, %ecx +; AVX2-NEXT: vpextrb $14, %xmm1, %edi +; AVX2-NEXT: movl %edi, %eax +; AVX2-NEXT: subb %cl, %al +; AVX2-NEXT: setns %al +; AVX2-NEXT: subb %cl, %dil +; AVX2-NEXT: movl %edx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX2-NEXT: jno .LBB2_39 +; AVX2-NEXT: # %bb.38: +; AVX2-NEXT: addb $127, %al +; AVX2-NEXT: movl %eax, %edi +; AVX2-NEXT: .LBB2_39: +; AVX2-NEXT: vpextrb $13, %xmm3, %ecx +; AVX2-NEXT: vpextrb $13, %xmm1, %r12d +; AVX2-NEXT: movl %r12d, %eax +; AVX2-NEXT: subb %cl, %al +; AVX2-NEXT: setns %al +; AVX2-NEXT: subb %cl, %r12b +; AVX2-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX2-NEXT: jno .LBB2_41 +; AVX2-NEXT: # %bb.40: +; AVX2-NEXT: addb $127, %al +; AVX2-NEXT: movl %eax, %r12d +; AVX2-NEXT: .LBB2_41: +; AVX2-NEXT: vpextrb $12, %xmm3, %ecx +; AVX2-NEXT: vpextrb $12, %xmm1, %r15d +; AVX2-NEXT: movl %r15d, %eax +; AVX2-NEXT: subb %cl, %al +; AVX2-NEXT: setns %al +; AVX2-NEXT: subb %cl, %r15b +; AVX2-NEXT: movl %r11d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX2-NEXT: jno .LBB2_43 +; AVX2-NEXT: # %bb.42: +; AVX2-NEXT: addb $127, %al +; AVX2-NEXT: movl %eax, %r15d +; AVX2-NEXT: .LBB2_43: +; AVX2-NEXT: vpextrb $11, %xmm3, %ecx +; AVX2-NEXT: vpextrb $11, %xmm1, %ebp +; AVX2-NEXT: movl %ebp, %eax +; AVX2-NEXT: subb %cl, %al +; AVX2-NEXT: setns %al +; AVX2-NEXT: subb %cl, %bpl +; AVX2-NEXT: movl %r13d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX2-NEXT: jno .LBB2_45 +; AVX2-NEXT: # %bb.44: +; AVX2-NEXT: addb $127, %al +; AVX2-NEXT: movl %eax, %ebp +; AVX2-NEXT: .LBB2_45: +; AVX2-NEXT: vpextrb $10, %xmm3, %ecx +; AVX2-NEXT: vpextrb $10, %xmm1, %ebx +; AVX2-NEXT: movl %ebx, %eax +; AVX2-NEXT: subb %cl, %al +; AVX2-NEXT: setns %al +; AVX2-NEXT: subb %cl, %bl +; AVX2-NEXT: jno .LBB2_47 +; AVX2-NEXT: # %bb.46: +; AVX2-NEXT: addb $127, %al +; AVX2-NEXT: movl %eax, %ebx +; AVX2-NEXT: .LBB2_47: +; AVX2-NEXT: vpextrb $9, %xmm3, %ecx +; AVX2-NEXT: vpextrb $9, %xmm1, %edx +; 
AVX2-NEXT: movl %edx, %eax +; AVX2-NEXT: subb %cl, %al +; AVX2-NEXT: setns %al +; AVX2-NEXT: subb %cl, %dl +; AVX2-NEXT: jno .LBB2_49 +; AVX2-NEXT: # %bb.48: +; AVX2-NEXT: addb $127, %al +; AVX2-NEXT: movl %eax, %edx +; AVX2-NEXT: .LBB2_49: +; AVX2-NEXT: vpextrb $8, %xmm3, %ecx +; AVX2-NEXT: vpextrb $8, %xmm1, %esi +; AVX2-NEXT: movl %esi, %eax +; AVX2-NEXT: subb %cl, %al +; AVX2-NEXT: setns %al +; AVX2-NEXT: subb %cl, %sil +; AVX2-NEXT: movl %ebp, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX2-NEXT: movl %ebx, (%rsp) # 4-byte Spill +; AVX2-NEXT: movl %edx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX2-NEXT: jo .LBB2_50 +; AVX2-NEXT: # %bb.51: +; AVX2-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX2-NEXT: jmp .LBB2_52 +; AVX2-NEXT: .LBB2_50: +; AVX2-NEXT: addb $127, %al +; AVX2-NEXT: # kill: def $al killed $al def $eax +; AVX2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX2-NEXT: .LBB2_52: +; AVX2-NEXT: vpextrb $7, %xmm3, %ecx +; AVX2-NEXT: vpextrb $7, %xmm1, %r11d +; AVX2-NEXT: movl %r11d, %eax +; AVX2-NEXT: subb %cl, %al +; AVX2-NEXT: setns %al +; AVX2-NEXT: subb %cl, %r11b +; AVX2-NEXT: jno .LBB2_54 +; AVX2-NEXT: # %bb.53: +; AVX2-NEXT: addb $127, %al +; AVX2-NEXT: movl %eax, %r11d +; AVX2-NEXT: .LBB2_54: +; AVX2-NEXT: vpextrb $6, %xmm3, %ecx +; AVX2-NEXT: vpextrb $6, %xmm1, %esi +; AVX2-NEXT: movl %esi, %eax +; AVX2-NEXT: subb %cl, %al +; AVX2-NEXT: setns %al +; AVX2-NEXT: subb %cl, %sil +; AVX2-NEXT: movl %r8d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX2-NEXT: movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX2-NEXT: jno .LBB2_56 +; AVX2-NEXT: # %bb.55: +; AVX2-NEXT: addb $127, %al +; AVX2-NEXT: movl %eax, %esi +; AVX2-NEXT: .LBB2_56: +; AVX2-NEXT: vpextrb $5, %xmm3, %ecx +; AVX2-NEXT: vpextrb $5, %xmm1, %edi +; AVX2-NEXT: movl %edi, %eax +; AVX2-NEXT: subb %cl, %al +; AVX2-NEXT: setns %al +; AVX2-NEXT: subb %cl, %dil +; AVX2-NEXT: jno .LBB2_58 +; AVX2-NEXT: # %bb.57: +; AVX2-NEXT: addb $127, %al +; AVX2-NEXT: movl %eax, %edi +; AVX2-NEXT: .LBB2_58: +; AVX2-NEXT: vpextrb $4, %xmm3, %ecx +; AVX2-NEXT: vpextrb $4, %xmm1, %r13d +; AVX2-NEXT: movl %r13d, %eax +; AVX2-NEXT: subb %cl, %al +; AVX2-NEXT: setns %al +; AVX2-NEXT: subb %cl, %r13b +; AVX2-NEXT: movl %r13d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX2-NEXT: jno .LBB2_60 +; AVX2-NEXT: # %bb.59: +; AVX2-NEXT: addb $127, %al +; AVX2-NEXT: # kill: def $al killed $al def $eax +; AVX2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX2-NEXT: .LBB2_60: +; AVX2-NEXT: vpextrb $3, %xmm3, %ecx +; AVX2-NEXT: vpextrb $3, %xmm1, %ebp +; AVX2-NEXT: movl %ebp, %eax +; AVX2-NEXT: subb %cl, %al +; AVX2-NEXT: setns %al +; AVX2-NEXT: subb %cl, %bpl +; AVX2-NEXT: jo .LBB2_61 +; AVX2-NEXT: # %bb.62: +; AVX2-NEXT: movl %ebp, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX2-NEXT: jmp .LBB2_63 +; AVX2-NEXT: .LBB2_61: +; AVX2-NEXT: addb $127, %al +; AVX2-NEXT: # kill: def $al killed $al def $eax +; AVX2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX2-NEXT: .LBB2_63: +; AVX2-NEXT: vpextrb $2, %xmm3, %ecx +; AVX2-NEXT: vpextrb $2, %xmm1, %ebp +; AVX2-NEXT: movl %ebp, %eax +; AVX2-NEXT: subb %cl, %al +; AVX2-NEXT: setns %al +; AVX2-NEXT: subb %cl, %bpl +; AVX2-NEXT: movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX2-NEXT: jno .LBB2_65 +; AVX2-NEXT: # %bb.64: +; AVX2-NEXT: addb $127, %al +; AVX2-NEXT: movl %eax, %ebp +; AVX2-NEXT: .LBB2_65: +; AVX2-NEXT: vpextrb $0, %xmm3, %ecx +; AVX2-NEXT: vpextrb $0, %xmm1, %edi +; AVX2-NEXT: movl %edi, %eax +; AVX2-NEXT: subb %cl, %al +; 
AVX2-NEXT: setns %al +; AVX2-NEXT: subb %cl, %dil +; AVX2-NEXT: movl %r15d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX2-NEXT: jno .LBB2_67 +; AVX2-NEXT: # %bb.66: +; AVX2-NEXT: addb $127, %al +; AVX2-NEXT: movl %eax, %edi +; AVX2-NEXT: .LBB2_67: +; AVX2-NEXT: vpextrb $1, %xmm3, %ecx +; AVX2-NEXT: vpextrb $1, %xmm1, %ebx +; AVX2-NEXT: movl %ebx, %eax +; AVX2-NEXT: subb %cl, %al +; AVX2-NEXT: setns %al +; AVX2-NEXT: subb %cl, %bl +; AVX2-NEXT: movl %r12d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX2-NEXT: jno .LBB2_69 +; AVX2-NEXT: # %bb.68: +; AVX2-NEXT: addb $127, %al +; AVX2-NEXT: movl %eax, %ebx +; AVX2-NEXT: .LBB2_69: +; AVX2-NEXT: vpextrb $15, %xmm2, %ecx +; AVX2-NEXT: vpextrb $15, %xmm0, %edx +; AVX2-NEXT: movl %edx, %eax +; AVX2-NEXT: subb %cl, %al +; AVX2-NEXT: setns %al +; AVX2-NEXT: subb %cl, %dl +; AVX2-NEXT: movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX2-NEXT: jno .LBB2_71 +; AVX2-NEXT: # %bb.70: +; AVX2-NEXT: addb $127, %al +; AVX2-NEXT: movl %eax, %edx +; AVX2-NEXT: .LBB2_71: +; AVX2-NEXT: vpextrb $14, %xmm2, %ecx +; AVX2-NEXT: vpextrb $14, %xmm0, %edi +; AVX2-NEXT: movl %edi, %eax +; AVX2-NEXT: subb %cl, %al +; AVX2-NEXT: setns %al +; AVX2-NEXT: subb %cl, %dil +; AVX2-NEXT: jno .LBB2_73 +; AVX2-NEXT: # %bb.72: +; AVX2-NEXT: addb $127, %al +; AVX2-NEXT: movl %eax, %edi +; AVX2-NEXT: .LBB2_73: +; AVX2-NEXT: vpextrb $13, %xmm2, %ecx +; AVX2-NEXT: vpextrb $13, %xmm0, %r10d +; AVX2-NEXT: movl %r10d, %eax +; AVX2-NEXT: subb %cl, %al +; AVX2-NEXT: setns %al +; AVX2-NEXT: subb %cl, %r10b +; AVX2-NEXT: jno .LBB2_75 +; AVX2-NEXT: # %bb.74: +; AVX2-NEXT: addb $127, %al +; AVX2-NEXT: movl %eax, %r10d +; AVX2-NEXT: .LBB2_75: +; AVX2-NEXT: vpextrb $12, %xmm2, %ecx +; AVX2-NEXT: vpextrb $12, %xmm0, %r12d +; AVX2-NEXT: movl %r12d, %eax +; AVX2-NEXT: subb %cl, %al +; AVX2-NEXT: setns %al +; AVX2-NEXT: subb %cl, %r12b +; AVX2-NEXT: movl %r12d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX2-NEXT: jno .LBB2_77 +; AVX2-NEXT: # %bb.76: +; AVX2-NEXT: addb $127, %al +; AVX2-NEXT: # kill: def $al killed $al def $eax +; AVX2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX2-NEXT: .LBB2_77: +; AVX2-NEXT: vpextrb $11, %xmm2, %ecx +; AVX2-NEXT: vpextrb $11, %xmm0, %r14d +; AVX2-NEXT: movl %r14d, %eax +; AVX2-NEXT: subb %cl, %al +; AVX2-NEXT: setns %al +; AVX2-NEXT: subb %cl, %r14b +; AVX2-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX2-NEXT: jno .LBB2_79 +; AVX2-NEXT: # %bb.78: +; AVX2-NEXT: addb $127, %al +; AVX2-NEXT: movl %eax, %r14d +; AVX2-NEXT: .LBB2_79: +; AVX2-NEXT: vpextrb $10, %xmm2, %ecx +; AVX2-NEXT: vpextrb $10, %xmm0, %r13d +; AVX2-NEXT: movl %r13d, %eax +; AVX2-NEXT: subb %cl, %al +; AVX2-NEXT: setns %al +; AVX2-NEXT: subb %cl, %r13b +; AVX2-NEXT: movl %r11d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX2-NEXT: movl %ebp, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX2-NEXT: movl %ebx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX2-NEXT: movl %edx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX2-NEXT: jno .LBB2_81 +; AVX2-NEXT: # %bb.80: +; AVX2-NEXT: addb $127, %al +; AVX2-NEXT: movl %eax, %r13d +; AVX2-NEXT: .LBB2_81: +; AVX2-NEXT: vpextrb $9, %xmm2, %ecx +; AVX2-NEXT: vpextrb $9, %xmm0, %r8d +; AVX2-NEXT: movl %r8d, %eax +; AVX2-NEXT: subb %cl, %al +; AVX2-NEXT: setns %al +; AVX2-NEXT: subb %cl, %r8b +; AVX2-NEXT: jno .LBB2_83 +; AVX2-NEXT: # %bb.82: +; AVX2-NEXT: addb $127, %al +; AVX2-NEXT: movl %eax, %r8d +; AVX2-NEXT: .LBB2_83: +; AVX2-NEXT: vpextrb $8, %xmm2, %ecx +; AVX2-NEXT: vpextrb $8, %xmm0, %r15d +; AVX2-NEXT: movl %r15d, 
%eax +; AVX2-NEXT: subb %cl, %al +; AVX2-NEXT: setns %al +; AVX2-NEXT: subb %cl, %r15b +; AVX2-NEXT: jno .LBB2_85 +; AVX2-NEXT: # %bb.84: +; AVX2-NEXT: addb $127, %al +; AVX2-NEXT: movl %eax, %r15d +; AVX2-NEXT: .LBB2_85: +; AVX2-NEXT: vpextrb $7, %xmm2, %ecx +; AVX2-NEXT: vpextrb $7, %xmm0, %r12d +; AVX2-NEXT: movl %r12d, %eax +; AVX2-NEXT: subb %cl, %al +; AVX2-NEXT: setns %al +; AVX2-NEXT: subb %cl, %r12b +; AVX2-NEXT: movl %r12d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX2-NEXT: jno .LBB2_87 +; AVX2-NEXT: # %bb.86: +; AVX2-NEXT: addb $127, %al +; AVX2-NEXT: # kill: def $al killed $al def $eax +; AVX2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX2-NEXT: .LBB2_87: +; AVX2-NEXT: vpextrb $6, %xmm2, %ecx +; AVX2-NEXT: vpextrb $6, %xmm0, %r12d +; AVX2-NEXT: movl %r12d, %eax +; AVX2-NEXT: subb %cl, %al +; AVX2-NEXT: setns %al +; AVX2-NEXT: subb %cl, %r12b +; AVX2-NEXT: jno .LBB2_89 +; AVX2-NEXT: # %bb.88: +; AVX2-NEXT: addb $127, %al +; AVX2-NEXT: movl %eax, %r12d +; AVX2-NEXT: .LBB2_89: +; AVX2-NEXT: vpextrb $5, %xmm2, %ecx +; AVX2-NEXT: vpextrb $5, %xmm0, %ebp +; AVX2-NEXT: movl %ebp, %eax +; AVX2-NEXT: subb %cl, %al +; AVX2-NEXT: setns %al +; AVX2-NEXT: subb %cl, %bpl +; AVX2-NEXT: jno .LBB2_91 +; AVX2-NEXT: # %bb.90: +; AVX2-NEXT: addb $127, %al +; AVX2-NEXT: movl %eax, %ebp +; AVX2-NEXT: .LBB2_91: +; AVX2-NEXT: vpextrb $4, %xmm2, %ecx +; AVX2-NEXT: vpextrb $4, %xmm0, %esi +; AVX2-NEXT: movl %esi, %eax +; AVX2-NEXT: subb %cl, %al +; AVX2-NEXT: setns %al +; AVX2-NEXT: subb %cl, %sil +; AVX2-NEXT: jno .LBB2_93 +; AVX2-NEXT: # %bb.92: +; AVX2-NEXT: addb $127, %al +; AVX2-NEXT: movl %eax, %esi +; AVX2-NEXT: .LBB2_93: +; AVX2-NEXT: vpextrb $3, %xmm2, %ecx +; AVX2-NEXT: vpextrb $3, %xmm0, %edx +; AVX2-NEXT: movl %edx, %eax +; AVX2-NEXT: subb %cl, %al +; AVX2-NEXT: setns %al +; AVX2-NEXT: subb %cl, %dl +; AVX2-NEXT: movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX2-NEXT: jno .LBB2_95 +; AVX2-NEXT: # %bb.94: +; AVX2-NEXT: addb $127, %al +; AVX2-NEXT: movl %eax, %edx +; AVX2-NEXT: .LBB2_95: +; AVX2-NEXT: vpextrb $2, %xmm2, %ecx +; AVX2-NEXT: vpextrb $2, %xmm0, %edi +; AVX2-NEXT: movl %edi, %eax +; AVX2-NEXT: subb %cl, %al +; AVX2-NEXT: setns %al +; AVX2-NEXT: subb %cl, %dil +; AVX2-NEXT: jno .LBB2_97 +; AVX2-NEXT: # %bb.96: +; AVX2-NEXT: addb $127, %al +; AVX2-NEXT: movl %eax, %edi +; AVX2-NEXT: .LBB2_97: +; AVX2-NEXT: vpextrb $0, %xmm2, %ecx +; AVX2-NEXT: vpextrb $0, %xmm0, %ebx +; AVX2-NEXT: movl %ebx, %eax +; AVX2-NEXT: subb %cl, %al +; AVX2-NEXT: setns %al +; AVX2-NEXT: subb %cl, %bl +; AVX2-NEXT: jno .LBB2_99 +; AVX2-NEXT: # %bb.98: +; AVX2-NEXT: addb $127, %al +; AVX2-NEXT: movl %eax, %ebx +; AVX2-NEXT: .LBB2_99: +; AVX2-NEXT: vpextrb $1, %xmm2, %ecx +; AVX2-NEXT: vpextrb $1, %xmm0, %r11d +; AVX2-NEXT: movl %r11d, %eax +; AVX2-NEXT: subb %cl, %al +; AVX2-NEXT: setns %al +; AVX2-NEXT: subb %cl, %r11b +; AVX2-NEXT: movl %r11d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX2-NEXT: jno .LBB2_101 +; AVX2-NEXT: # %bb.100: +; AVX2-NEXT: addb $127, %al +; AVX2-NEXT: # kill: def $al killed $al def $eax +; AVX2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX2-NEXT: .LBB2_101: +; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm1 +; AVX2-NEXT: vpextrb $15, %xmm1, %ecx +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX2-NEXT: vpextrb $15, %xmm0, %r11d +; AVX2-NEXT: movl %r11d, %eax +; AVX2-NEXT: subb %cl, %al +; AVX2-NEXT: setns %al +; AVX2-NEXT: subb %cl, %r11b +; AVX2-NEXT: jno .LBB2_103 +; AVX2-NEXT: # %bb.102: +; AVX2-NEXT: addb $127, %al +; AVX2-NEXT: movl %eax, 
%r11d +; AVX2-NEXT: .LBB2_103: +; AVX2-NEXT: vpextrb $14, %xmm1, %ecx +; AVX2-NEXT: vpextrb $14, %xmm0, %r9d +; AVX2-NEXT: movl %r9d, %eax +; AVX2-NEXT: subb %cl, %al +; AVX2-NEXT: setns %al +; AVX2-NEXT: subb %cl, %r9b +; AVX2-NEXT: movl %r9d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX2-NEXT: jno .LBB2_105 +; AVX2-NEXT: # %bb.104: +; AVX2-NEXT: addb $127, %al +; AVX2-NEXT: # kill: def $al killed $al def $eax +; AVX2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX2-NEXT: .LBB2_105: +; AVX2-NEXT: vpextrb $13, %xmm1, %ecx +; AVX2-NEXT: vpextrb $13, %xmm0, %r9d +; AVX2-NEXT: movl %r9d, %eax +; AVX2-NEXT: subb %cl, %al +; AVX2-NEXT: setns %al +; AVX2-NEXT: subb %cl, %r9b +; AVX2-NEXT: movl %r14d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX2-NEXT: movl %r13d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX2-NEXT: movl %r8d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX2-NEXT: movl %r15d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX2-NEXT: movl %r12d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX2-NEXT: movl %ebp, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX2-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX2-NEXT: movl %edx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX2-NEXT: movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX2-NEXT: movl %ebx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX2-NEXT: jno .LBB2_107 +; AVX2-NEXT: # %bb.106: +; AVX2-NEXT: addb $127, %al +; AVX2-NEXT: movl %eax, %r9d +; AVX2-NEXT: .LBB2_107: +; AVX2-NEXT: vpextrb $12, %xmm1, %ecx +; AVX2-NEXT: vpextrb $12, %xmm0, %edx +; AVX2-NEXT: movl %edx, %eax +; AVX2-NEXT: subb %cl, %al +; AVX2-NEXT: setns %al +; AVX2-NEXT: subb %cl, %dl +; AVX2-NEXT: movl %r10d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX2-NEXT: jno .LBB2_109 +; AVX2-NEXT: # %bb.108: +; AVX2-NEXT: addb $127, %al +; AVX2-NEXT: movl %eax, %edx +; AVX2-NEXT: .LBB2_109: +; AVX2-NEXT: vpextrb $11, %xmm1, %ecx +; AVX2-NEXT: vpextrb $11, %xmm0, %r13d +; AVX2-NEXT: movl %r13d, %eax +; AVX2-NEXT: subb %cl, %al +; AVX2-NEXT: setns %al +; AVX2-NEXT: subb %cl, %r13b +; AVX2-NEXT: jno .LBB2_111 +; AVX2-NEXT: # %bb.110: +; AVX2-NEXT: addb $127, %al +; AVX2-NEXT: movl %eax, %r13d +; AVX2-NEXT: .LBB2_111: +; AVX2-NEXT: vpextrb $10, %xmm1, %ecx +; AVX2-NEXT: vpextrb $10, %xmm0, %r15d +; AVX2-NEXT: movl %r15d, %eax +; AVX2-NEXT: subb %cl, %al +; AVX2-NEXT: setns %al +; AVX2-NEXT: subb %cl, %r15b +; AVX2-NEXT: jno .LBB2_113 +; AVX2-NEXT: # %bb.112: +; AVX2-NEXT: addb $127, %al +; AVX2-NEXT: movl %eax, %r15d +; AVX2-NEXT: .LBB2_113: +; AVX2-NEXT: vpextrb $9, %xmm1, %ecx +; AVX2-NEXT: vpextrb $9, %xmm0, %r14d +; AVX2-NEXT: movl %r14d, %eax +; AVX2-NEXT: subb %cl, %al +; AVX2-NEXT: setns %al +; AVX2-NEXT: subb %cl, %r14b +; AVX2-NEXT: jno .LBB2_115 +; AVX2-NEXT: # %bb.114: +; AVX2-NEXT: addb $127, %al +; AVX2-NEXT: movl %eax, %r14d +; AVX2-NEXT: .LBB2_115: +; AVX2-NEXT: vpextrb $8, %xmm1, %ecx +; AVX2-NEXT: vpextrb $8, %xmm0, %ebp +; AVX2-NEXT: movl %ebp, %eax +; AVX2-NEXT: subb %cl, %al +; AVX2-NEXT: setns %al +; AVX2-NEXT: subb %cl, %bpl +; AVX2-NEXT: jno .LBB2_117 +; AVX2-NEXT: # %bb.116: +; AVX2-NEXT: addb $127, %al +; AVX2-NEXT: movl %eax, %ebp +; AVX2-NEXT: .LBB2_117: +; AVX2-NEXT: vpextrb $7, %xmm1, %ecx +; AVX2-NEXT: vpextrb $7, %xmm0, %edi +; AVX2-NEXT: movl %edi, %eax +; AVX2-NEXT: subb %cl, %al +; AVX2-NEXT: setns %al +; AVX2-NEXT: subb %cl, %dil +; AVX2-NEXT: movl %edx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX2-NEXT: jno .LBB2_119 +; AVX2-NEXT: # %bb.118: +; AVX2-NEXT: addb $127, %al +; AVX2-NEXT: movl %eax, %edi 
+; AVX2-NEXT: .LBB2_119: +; AVX2-NEXT: vpextrb $6, %xmm1, %edx +; AVX2-NEXT: vpextrb $6, %xmm0, %eax +; AVX2-NEXT: movl %eax, %ecx +; AVX2-NEXT: subb %dl, %cl +; AVX2-NEXT: setns %cl +; AVX2-NEXT: subb %dl, %al +; AVX2-NEXT: jno .LBB2_121 +; AVX2-NEXT: # %bb.120: +; AVX2-NEXT: addb $127, %cl +; AVX2-NEXT: movl %ecx, %eax +; AVX2-NEXT: .LBB2_121: +; AVX2-NEXT: vpextrb $5, %xmm1, %ebx +; AVX2-NEXT: vpextrb $5, %xmm0, %ecx +; AVX2-NEXT: movl %ecx, %edx +; AVX2-NEXT: subb %bl, %dl +; AVX2-NEXT: setns %dl +; AVX2-NEXT: subb %bl, %cl +; AVX2-NEXT: jno .LBB2_123 +; AVX2-NEXT: # %bb.122: +; AVX2-NEXT: addb $127, %dl +; AVX2-NEXT: movl %edx, %ecx +; AVX2-NEXT: .LBB2_123: +; AVX2-NEXT: vpextrb $4, %xmm1, %esi +; AVX2-NEXT: vpextrb $4, %xmm0, %edx +; AVX2-NEXT: movl %edx, %ebx +; AVX2-NEXT: subb %sil, %bl +; AVX2-NEXT: setns %bl +; AVX2-NEXT: subb %sil, %dl +; AVX2-NEXT: jno .LBB2_125 +; AVX2-NEXT: # %bb.124: +; AVX2-NEXT: addb $127, %bl +; AVX2-NEXT: movl %ebx, %edx +; AVX2-NEXT: .LBB2_125: +; AVX2-NEXT: vpextrb $3, %xmm1, %esi +; AVX2-NEXT: vpextrb $3, %xmm0, %r8d +; AVX2-NEXT: movl %r8d, %ebx +; AVX2-NEXT: subb %sil, %bl +; AVX2-NEXT: setns %bl +; AVX2-NEXT: subb %sil, %r8b +; AVX2-NEXT: movl %r9d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX2-NEXT: jno .LBB2_127 +; AVX2-NEXT: # %bb.126: +; AVX2-NEXT: addb $127, %bl +; AVX2-NEXT: movl %ebx, %r8d +; AVX2-NEXT: .LBB2_127: +; AVX2-NEXT: vpextrb $2, %xmm1, %esi +; AVX2-NEXT: vpextrb $2, %xmm0, %r9d +; AVX2-NEXT: movl %r9d, %ebx +; AVX2-NEXT: subb %sil, %bl +; AVX2-NEXT: setns %bl +; AVX2-NEXT: subb %sil, %r9b +; AVX2-NEXT: jno .LBB2_129 +; AVX2-NEXT: # %bb.128: +; AVX2-NEXT: addb $127, %bl +; AVX2-NEXT: movl %ebx, %r9d +; AVX2-NEXT: .LBB2_129: +; AVX2-NEXT: vpextrb $0, %xmm1, %esi +; AVX2-NEXT: vpextrb $0, %xmm0, %r10d +; AVX2-NEXT: movl %r10d, %ebx +; AVX2-NEXT: subb %sil, %bl +; AVX2-NEXT: setns %bl +; AVX2-NEXT: movl %r11d, %r12d +; AVX2-NEXT: subb %sil, %r10b +; AVX2-NEXT: jno .LBB2_131 +; AVX2-NEXT: # %bb.130: +; AVX2-NEXT: addb $127, %bl +; AVX2-NEXT: movl %ebx, %r10d +; AVX2-NEXT: .LBB2_131: +; AVX2-NEXT: vpextrb $1, %xmm1, %esi +; AVX2-NEXT: vpextrb $1, %xmm0, %r11d +; AVX2-NEXT: movl %r11d, %ebx +; AVX2-NEXT: subb %sil, %bl +; AVX2-NEXT: setns %bl +; AVX2-NEXT: subb %sil, %r11b +; AVX2-NEXT: jno .LBB2_133 +; AVX2-NEXT: # %bb.132: +; AVX2-NEXT: addb $127, %bl +; AVX2-NEXT: movl %ebx, %r11d +; AVX2-NEXT: .LBB2_133: +; AVX2-NEXT: movzbl %r10b, %esi +; AVX2-NEXT: vmovd %esi, %xmm0 +; AVX2-NEXT: movzbl %r11b, %esi +; AVX2-NEXT: vpinsrb $1, %esi, %xmm0, %xmm0 +; AVX2-NEXT: movzbl %r9b, %esi +; AVX2-NEXT: vpinsrb $2, %esi, %xmm0, %xmm0 +; AVX2-NEXT: movzbl %r8b, %esi +; AVX2-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; AVX2-NEXT: movzbl %dl, %edx +; AVX2-NEXT: vpinsrb $4, %edx, %xmm0, %xmm0 +; AVX2-NEXT: movzbl %cl, %ecx +; AVX2-NEXT: vpinsrb $5, %ecx, %xmm0, %xmm0 +; AVX2-NEXT: movzbl %al, %eax +; AVX2-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 +; AVX2-NEXT: movzbl %dil, %eax +; AVX2-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; AVX2-NEXT: movzbl %bpl, %eax +; AVX2-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0 +; AVX2-NEXT: movzbl %r14b, %eax +; AVX2-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0 +; AVX2-NEXT: movzbl %r15b, %eax +; AVX2-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0 +; AVX2-NEXT: movzbl %r13b, %eax +; AVX2-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; AVX2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX2-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0 +; AVX2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX2-NEXT: vpinsrb $13, %eax, 
%xmm0, %xmm0 +; AVX2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX2-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0 +; AVX2-NEXT: movzbl %r12b, %eax +; AVX2-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; AVX2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX2-NEXT: vmovd %eax, %xmm1 +; AVX2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX2-NEXT: vpinsrb $1, %eax, %xmm1, %xmm1 +; AVX2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX2-NEXT: vpinsrb $2, %eax, %xmm1, %xmm1 +; AVX2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX2-NEXT: vpinsrb $3, %eax, %xmm1, %xmm1 +; AVX2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX2-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 +; AVX2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX2-NEXT: vpinsrb $5, %eax, %xmm1, %xmm1 +; AVX2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX2-NEXT: vpinsrb $6, %eax, %xmm1, %xmm1 +; AVX2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX2-NEXT: vpinsrb $7, %eax, %xmm1, %xmm1 +; AVX2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX2-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; AVX2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX2-NEXT: vpinsrb $9, %eax, %xmm1, %xmm1 +; AVX2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX2-NEXT: vpinsrb $10, %eax, %xmm1, %xmm1 +; AVX2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX2-NEXT: vpinsrb $11, %eax, %xmm1, %xmm1 +; AVX2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX2-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 +; AVX2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX2-NEXT: vpinsrb $13, %eax, %xmm1, %xmm1 +; AVX2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX2-NEXT: vpinsrb $14, %eax, %xmm1, %xmm1 +; AVX2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX2-NEXT: vpinsrb $15, %eax, %xmm1, %xmm1 +; AVX2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX2-NEXT: vmovd %eax, %xmm2 +; AVX2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX2-NEXT: vpinsrb $1, %eax, %xmm2, %xmm2 +; AVX2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX2-NEXT: vpinsrb $2, %eax, %xmm2, %xmm2 +; AVX2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX2-NEXT: vpinsrb $3, %eax, %xmm2, %xmm2 +; AVX2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX2-NEXT: vpinsrb $4, %eax, %xmm2, %xmm2 +; AVX2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX2-NEXT: vpinsrb $5, %eax, %xmm2, %xmm2 +; AVX2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX2-NEXT: vpinsrb $6, %eax, %xmm2, %xmm2 +; AVX2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX2-NEXT: vpinsrb $7, %eax, %xmm2, %xmm2 +; AVX2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX2-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2 +; AVX2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX2-NEXT: vpinsrb $9, %eax, %xmm2, %xmm2 +; AVX2-NEXT: movzbl (%rsp), %eax # 1-byte Folded Reload +; AVX2-NEXT: vpinsrb $10, %eax, %xmm2, %xmm2 +; AVX2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX2-NEXT: vpinsrb $11, %eax, %xmm2, %xmm2 +; AVX2-NEXT: 
movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX2-NEXT: vpinsrb $12, %eax, %xmm2, %xmm2 +; AVX2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX2-NEXT: vpinsrb $13, %eax, %xmm2, %xmm2 +; AVX2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX2-NEXT: vpinsrb $14, %eax, %xmm2, %xmm2 +; AVX2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX2-NEXT: vpinsrb $15, %eax, %xmm2, %xmm2 +; AVX2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX2-NEXT: vmovd %eax, %xmm3 +; AVX2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX2-NEXT: vpinsrb $1, %eax, %xmm3, %xmm3 +; AVX2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX2-NEXT: vpinsrb $2, %eax, %xmm3, %xmm3 +; AVX2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX2-NEXT: vpinsrb $3, %eax, %xmm3, %xmm3 +; AVX2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX2-NEXT: vpinsrb $4, %eax, %xmm3, %xmm3 +; AVX2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX2-NEXT: vpinsrb $5, %eax, %xmm3, %xmm3 +; AVX2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX2-NEXT: vpinsrb $6, %eax, %xmm3, %xmm3 +; AVX2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX2-NEXT: vpinsrb $7, %eax, %xmm3, %xmm3 +; AVX2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX2-NEXT: vpinsrb $8, %eax, %xmm3, %xmm3 +; AVX2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX2-NEXT: vpinsrb $9, %eax, %xmm3, %xmm3 +; AVX2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX2-NEXT: vpinsrb $10, %eax, %xmm3, %xmm3 +; AVX2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX2-NEXT: vpinsrb $11, %eax, %xmm3, %xmm3 +; AVX2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX2-NEXT: vpinsrb $12, %eax, %xmm3, %xmm3 +; AVX2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX2-NEXT: vpinsrb $13, %eax, %xmm3, %xmm3 +; AVX2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX2-NEXT: vpinsrb $14, %eax, %xmm3, %xmm3 +; AVX2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX2-NEXT: vpinsrb $15, %eax, %xmm3, %xmm3 +; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 +; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm3, %ymm1 +; AVX2-NEXT: addq $76, %rsp +; AVX2-NEXT: popq %rbx +; AVX2-NEXT: popq %r12 +; AVX2-NEXT: popq %r13 +; AVX2-NEXT: popq %r14 +; AVX2-NEXT: popq %r15 +; AVX2-NEXT: popq %rbp +; AVX2-NEXT: retq +; +; AVX512-LABEL: v64i8: +; AVX512: # %bb.0: +; AVX512-NEXT: pushq %rbp +; AVX512-NEXT: pushq %r15 +; AVX512-NEXT: pushq %r14 +; AVX512-NEXT: pushq %r13 +; AVX512-NEXT: pushq %r12 +; AVX512-NEXT: pushq %rbx +; AVX512-NEXT: subq $76, %rsp +; AVX512-NEXT: vpextrb $15, %xmm1, %ecx +; AVX512-NEXT: vpextrb $15, %xmm0, %edx +; AVX512-NEXT: movl %edx, %eax +; AVX512-NEXT: subb %cl, %al +; AVX512-NEXT: setns %al +; AVX512-NEXT: subb %cl, %dl +; AVX512-NEXT: jo .LBB2_1 +; AVX512-NEXT: # %bb.2: +; AVX512-NEXT: movl %edx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX512-NEXT: jmp .LBB2_3 +; AVX512-NEXT: .LBB2_1: +; AVX512-NEXT: addb $127, %al +; AVX512-NEXT: # kill: def $al killed $al def $eax +; AVX512-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX512-NEXT: .LBB2_3: +; AVX512-NEXT: vpextrb $14, %xmm1, %ecx +; AVX512-NEXT: vpextrb $14, %xmm0, %edx +; 
AVX512-NEXT: movl %edx, %eax +; AVX512-NEXT: subb %cl, %al +; AVX512-NEXT: setns %al +; AVX512-NEXT: subb %cl, %dl +; AVX512-NEXT: jno .LBB2_5 +; AVX512-NEXT: # %bb.4: +; AVX512-NEXT: addb $127, %al +; AVX512-NEXT: movl %eax, %edx +; AVX512-NEXT: .LBB2_5: +; AVX512-NEXT: vpextrb $13, %xmm1, %ecx +; AVX512-NEXT: vpextrb $13, %xmm0, %esi +; AVX512-NEXT: movl %esi, %eax +; AVX512-NEXT: subb %cl, %al +; AVX512-NEXT: setns %al +; AVX512-NEXT: subb %cl, %sil +; AVX512-NEXT: jo .LBB2_6 +; AVX512-NEXT: # %bb.7: +; AVX512-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX512-NEXT: jmp .LBB2_8 +; AVX512-NEXT: .LBB2_6: +; AVX512-NEXT: addb $127, %al +; AVX512-NEXT: # kill: def $al killed $al def $eax +; AVX512-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX512-NEXT: .LBB2_8: +; AVX512-NEXT: vpextrb $12, %xmm1, %ecx +; AVX512-NEXT: vpextrb $12, %xmm0, %esi +; AVX512-NEXT: movl %esi, %eax +; AVX512-NEXT: subb %cl, %al +; AVX512-NEXT: setns %al +; AVX512-NEXT: subb %cl, %sil +; AVX512-NEXT: jno .LBB2_10 +; AVX512-NEXT: # %bb.9: +; AVX512-NEXT: addb $127, %al +; AVX512-NEXT: movl %eax, %esi +; AVX512-NEXT: .LBB2_10: +; AVX512-NEXT: vpextrb $11, %xmm1, %ecx +; AVX512-NEXT: vpextrb $11, %xmm0, %edi +; AVX512-NEXT: movl %edi, %eax +; AVX512-NEXT: subb %cl, %al +; AVX512-NEXT: setns %al +; AVX512-NEXT: subb %cl, %dil +; AVX512-NEXT: jno .LBB2_12 +; AVX512-NEXT: # %bb.11: +; AVX512-NEXT: addb $127, %al +; AVX512-NEXT: movl %eax, %edi +; AVX512-NEXT: .LBB2_12: +; AVX512-NEXT: vpextrb $10, %xmm1, %ecx +; AVX512-NEXT: vpextrb $10, %xmm0, %ebp +; AVX512-NEXT: movl %ebp, %eax +; AVX512-NEXT: subb %cl, %al +; AVX512-NEXT: setns %al +; AVX512-NEXT: subb %cl, %bpl +; AVX512-NEXT: jno .LBB2_14 +; AVX512-NEXT: # %bb.13: +; AVX512-NEXT: addb $127, %al +; AVX512-NEXT: movl %eax, %ebp +; AVX512-NEXT: .LBB2_14: +; AVX512-NEXT: vpextrb $9, %xmm1, %ecx +; AVX512-NEXT: vpextrb $9, %xmm0, %ebx +; AVX512-NEXT: movl %ebx, %eax +; AVX512-NEXT: subb %cl, %al +; AVX512-NEXT: setns %al +; AVX512-NEXT: subb %cl, %bl +; AVX512-NEXT: movl %ebp, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX512-NEXT: jo .LBB2_15 +; AVX512-NEXT: # %bb.16: +; AVX512-NEXT: movl %ebx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX512-NEXT: jmp .LBB2_17 +; AVX512-NEXT: .LBB2_15: +; AVX512-NEXT: addb $127, %al +; AVX512-NEXT: # kill: def $al killed $al def $eax +; AVX512-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX512-NEXT: .LBB2_17: +; AVX512-NEXT: vpextrb $8, %xmm1, %ecx +; AVX512-NEXT: vpextrb $8, %xmm0, %ebp +; AVX512-NEXT: movl %ebp, %eax +; AVX512-NEXT: subb %cl, %al +; AVX512-NEXT: setns %al +; AVX512-NEXT: subb %cl, %bpl +; AVX512-NEXT: jno .LBB2_19 +; AVX512-NEXT: # %bb.18: +; AVX512-NEXT: addb $127, %al +; AVX512-NEXT: movl %eax, %ebp +; AVX512-NEXT: .LBB2_19: +; AVX512-NEXT: vpextrb $7, %xmm1, %ecx +; AVX512-NEXT: vpextrb $7, %xmm0, %ebx +; AVX512-NEXT: movl %ebx, %eax +; AVX512-NEXT: subb %cl, %al +; AVX512-NEXT: setns %al +; AVX512-NEXT: subb %cl, %bl +; AVX512-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX512-NEXT: jno .LBB2_21 +; AVX512-NEXT: # %bb.20: +; AVX512-NEXT: addb $127, %al +; AVX512-NEXT: movl %eax, %ebx +; AVX512-NEXT: .LBB2_21: +; AVX512-NEXT: vpextrb $6, %xmm1, %ecx +; AVX512-NEXT: vpextrb $6, %xmm0, %esi +; AVX512-NEXT: movl %esi, %eax +; AVX512-NEXT: subb %cl, %al +; AVX512-NEXT: setns %al +; AVX512-NEXT: subb %cl, %sil +; AVX512-NEXT: movl %edx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX512-NEXT: jno .LBB2_23 +; AVX512-NEXT: # %bb.22: +; AVX512-NEXT: addb $127, 
%al +; AVX512-NEXT: movl %eax, %esi +; AVX512-NEXT: .LBB2_23: +; AVX512-NEXT: vpextrb $5, %xmm1, %ecx +; AVX512-NEXT: vpextrb $5, %xmm0, %r11d +; AVX512-NEXT: movl %r11d, %eax +; AVX512-NEXT: subb %cl, %al +; AVX512-NEXT: setns %al +; AVX512-NEXT: subb %cl, %r11b +; AVX512-NEXT: movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX512-NEXT: movl %ebp, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX512-NEXT: movl %ebx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX512-NEXT: jno .LBB2_25 +; AVX512-NEXT: # %bb.24: +; AVX512-NEXT: addb $127, %al +; AVX512-NEXT: movl %eax, %r11d +; AVX512-NEXT: .LBB2_25: +; AVX512-NEXT: vpextrb $4, %xmm1, %ecx +; AVX512-NEXT: vpextrb $4, %xmm0, %r13d +; AVX512-NEXT: movl %r13d, %eax +; AVX512-NEXT: subb %cl, %al +; AVX512-NEXT: setns %al +; AVX512-NEXT: subb %cl, %r13b +; AVX512-NEXT: jno .LBB2_27 +; AVX512-NEXT: # %bb.26: +; AVX512-NEXT: addb $127, %al +; AVX512-NEXT: movl %eax, %r13d +; AVX512-NEXT: .LBB2_27: +; AVX512-NEXT: vpextrb $3, %xmm1, %ecx +; AVX512-NEXT: vpextrb $3, %xmm0, %r8d +; AVX512-NEXT: movl %r8d, %eax +; AVX512-NEXT: subb %cl, %al +; AVX512-NEXT: setns %al +; AVX512-NEXT: subb %cl, %r8b +; AVX512-NEXT: jno .LBB2_29 +; AVX512-NEXT: # %bb.28: +; AVX512-NEXT: addb $127, %al +; AVX512-NEXT: movl %eax, %r8d +; AVX512-NEXT: .LBB2_29: +; AVX512-NEXT: vpextrb $2, %xmm1, %ecx +; AVX512-NEXT: vpextrb $2, %xmm0, %ebp +; AVX512-NEXT: movl %ebp, %eax +; AVX512-NEXT: subb %cl, %al +; AVX512-NEXT: setns %al +; AVX512-NEXT: subb %cl, %bpl +; AVX512-NEXT: jno .LBB2_31 +; AVX512-NEXT: # %bb.30: +; AVX512-NEXT: addb $127, %al +; AVX512-NEXT: movl %eax, %ebp +; AVX512-NEXT: .LBB2_31: +; AVX512-NEXT: vpextrb $0, %xmm1, %ecx +; AVX512-NEXT: vpextrb $0, %xmm0, %ebx +; AVX512-NEXT: movl %ebx, %eax +; AVX512-NEXT: subb %cl, %al +; AVX512-NEXT: setns %al +; AVX512-NEXT: subb %cl, %bl +; AVX512-NEXT: jno .LBB2_33 +; AVX512-NEXT: # %bb.32: +; AVX512-NEXT: addb $127, %al +; AVX512-NEXT: movl %eax, %ebx +; AVX512-NEXT: .LBB2_33: +; AVX512-NEXT: vpextrb $1, %xmm1, %ecx +; AVX512-NEXT: vpextrb $1, %xmm0, %edi +; AVX512-NEXT: movl %edi, %eax +; AVX512-NEXT: subb %cl, %al +; AVX512-NEXT: setns %al +; AVX512-NEXT: subb %cl, %dil +; AVX512-NEXT: jno .LBB2_35 +; AVX512-NEXT: # %bb.34: +; AVX512-NEXT: addb $127, %al +; AVX512-NEXT: movl %eax, %edi +; AVX512-NEXT: .LBB2_35: +; AVX512-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX512-NEXT: vpextrb $15, %xmm2, %ecx +; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm3 +; AVX512-NEXT: vpextrb $15, %xmm3, %edx +; AVX512-NEXT: movl %edx, %eax +; AVX512-NEXT: subb %cl, %al +; AVX512-NEXT: setns %al +; AVX512-NEXT: subb %cl, %dl +; AVX512-NEXT: movl %ebp, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX512-NEXT: movl %ebx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX512-NEXT: movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX512-NEXT: jno .LBB2_37 +; AVX512-NEXT: # %bb.36: +; AVX512-NEXT: addb $127, %al +; AVX512-NEXT: movl %eax, %edx +; AVX512-NEXT: .LBB2_37: +; AVX512-NEXT: vpextrb $14, %xmm2, %ecx +; AVX512-NEXT: vpextrb $14, %xmm3, %edi +; AVX512-NEXT: movl %edi, %eax +; AVX512-NEXT: subb %cl, %al +; AVX512-NEXT: setns %al +; AVX512-NEXT: subb %cl, %dil +; AVX512-NEXT: movl %edx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX512-NEXT: jno .LBB2_39 +; AVX512-NEXT: # %bb.38: +; AVX512-NEXT: addb $127, %al +; AVX512-NEXT: movl %eax, %edi +; AVX512-NEXT: .LBB2_39: +; AVX512-NEXT: vpextrb $13, %xmm2, %ecx +; AVX512-NEXT: vpextrb $13, %xmm3, %r12d +; AVX512-NEXT: movl %r12d, %eax +; AVX512-NEXT: subb %cl, %al +; AVX512-NEXT: setns %al +; 
AVX512-NEXT: subb %cl, %r12b +; AVX512-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX512-NEXT: jno .LBB2_41 +; AVX512-NEXT: # %bb.40: +; AVX512-NEXT: addb $127, %al +; AVX512-NEXT: movl %eax, %r12d +; AVX512-NEXT: .LBB2_41: +; AVX512-NEXT: vpextrb $12, %xmm2, %ecx +; AVX512-NEXT: vpextrb $12, %xmm3, %r15d +; AVX512-NEXT: movl %r15d, %eax +; AVX512-NEXT: subb %cl, %al +; AVX512-NEXT: setns %al +; AVX512-NEXT: subb %cl, %r15b +; AVX512-NEXT: movl %r11d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX512-NEXT: jno .LBB2_43 +; AVX512-NEXT: # %bb.42: +; AVX512-NEXT: addb $127, %al +; AVX512-NEXT: movl %eax, %r15d +; AVX512-NEXT: .LBB2_43: +; AVX512-NEXT: vpextrb $11, %xmm2, %ecx +; AVX512-NEXT: vpextrb $11, %xmm3, %ebp +; AVX512-NEXT: movl %ebp, %eax +; AVX512-NEXT: subb %cl, %al +; AVX512-NEXT: setns %al +; AVX512-NEXT: subb %cl, %bpl +; AVX512-NEXT: movl %r13d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX512-NEXT: jno .LBB2_45 +; AVX512-NEXT: # %bb.44: +; AVX512-NEXT: addb $127, %al +; AVX512-NEXT: movl %eax, %ebp +; AVX512-NEXT: .LBB2_45: +; AVX512-NEXT: vpextrb $10, %xmm2, %ecx +; AVX512-NEXT: vpextrb $10, %xmm3, %ebx +; AVX512-NEXT: movl %ebx, %eax +; AVX512-NEXT: subb %cl, %al +; AVX512-NEXT: setns %al +; AVX512-NEXT: subb %cl, %bl +; AVX512-NEXT: jno .LBB2_47 +; AVX512-NEXT: # %bb.46: +; AVX512-NEXT: addb $127, %al +; AVX512-NEXT: movl %eax, %ebx +; AVX512-NEXT: .LBB2_47: +; AVX512-NEXT: vpextrb $9, %xmm2, %ecx +; AVX512-NEXT: vpextrb $9, %xmm3, %edx +; AVX512-NEXT: movl %edx, %eax +; AVX512-NEXT: subb %cl, %al +; AVX512-NEXT: setns %al +; AVX512-NEXT: subb %cl, %dl +; AVX512-NEXT: jno .LBB2_49 +; AVX512-NEXT: # %bb.48: +; AVX512-NEXT: addb $127, %al +; AVX512-NEXT: movl %eax, %edx +; AVX512-NEXT: .LBB2_49: +; AVX512-NEXT: vpextrb $8, %xmm2, %ecx +; AVX512-NEXT: vpextrb $8, %xmm3, %esi +; AVX512-NEXT: movl %esi, %eax +; AVX512-NEXT: subb %cl, %al +; AVX512-NEXT: setns %al +; AVX512-NEXT: subb %cl, %sil +; AVX512-NEXT: movl %ebp, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX512-NEXT: movl %ebx, (%rsp) # 4-byte Spill +; AVX512-NEXT: movl %edx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX512-NEXT: jo .LBB2_50 +; AVX512-NEXT: # %bb.51: +; AVX512-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX512-NEXT: jmp .LBB2_52 +; AVX512-NEXT: .LBB2_50: +; AVX512-NEXT: addb $127, %al +; AVX512-NEXT: # kill: def $al killed $al def $eax +; AVX512-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX512-NEXT: .LBB2_52: +; AVX512-NEXT: vpextrb $7, %xmm2, %ecx +; AVX512-NEXT: vpextrb $7, %xmm3, %r11d +; AVX512-NEXT: movl %r11d, %eax +; AVX512-NEXT: subb %cl, %al +; AVX512-NEXT: setns %al +; AVX512-NEXT: subb %cl, %r11b +; AVX512-NEXT: jno .LBB2_54 +; AVX512-NEXT: # %bb.53: +; AVX512-NEXT: addb $127, %al +; AVX512-NEXT: movl %eax, %r11d +; AVX512-NEXT: .LBB2_54: +; AVX512-NEXT: vpextrb $6, %xmm2, %ecx +; AVX512-NEXT: vpextrb $6, %xmm3, %esi +; AVX512-NEXT: movl %esi, %eax +; AVX512-NEXT: subb %cl, %al +; AVX512-NEXT: setns %al +; AVX512-NEXT: subb %cl, %sil +; AVX512-NEXT: movl %r8d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX512-NEXT: movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX512-NEXT: jno .LBB2_56 +; AVX512-NEXT: # %bb.55: +; AVX512-NEXT: addb $127, %al +; AVX512-NEXT: movl %eax, %esi +; AVX512-NEXT: .LBB2_56: +; AVX512-NEXT: vpextrb $5, %xmm2, %ecx +; AVX512-NEXT: vpextrb $5, %xmm3, %edi +; AVX512-NEXT: movl %edi, %eax +; AVX512-NEXT: subb %cl, %al +; AVX512-NEXT: setns %al +; AVX512-NEXT: subb %cl, %dil +; AVX512-NEXT: jno .LBB2_58 +; 
AVX512-NEXT: # %bb.57: +; AVX512-NEXT: addb $127, %al +; AVX512-NEXT: movl %eax, %edi +; AVX512-NEXT: .LBB2_58: +; AVX512-NEXT: vpextrb $4, %xmm2, %ecx +; AVX512-NEXT: vpextrb $4, %xmm3, %r13d +; AVX512-NEXT: movl %r13d, %eax +; AVX512-NEXT: subb %cl, %al +; AVX512-NEXT: setns %al +; AVX512-NEXT: subb %cl, %r13b +; AVX512-NEXT: movl %r13d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX512-NEXT: jno .LBB2_60 +; AVX512-NEXT: # %bb.59: +; AVX512-NEXT: addb $127, %al +; AVX512-NEXT: # kill: def $al killed $al def $eax +; AVX512-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX512-NEXT: .LBB2_60: +; AVX512-NEXT: vpextrb $3, %xmm2, %ecx +; AVX512-NEXT: vpextrb $3, %xmm3, %ebp +; AVX512-NEXT: movl %ebp, %eax +; AVX512-NEXT: subb %cl, %al +; AVX512-NEXT: setns %al +; AVX512-NEXT: subb %cl, %bpl +; AVX512-NEXT: jo .LBB2_61 +; AVX512-NEXT: # %bb.62: +; AVX512-NEXT: movl %ebp, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX512-NEXT: jmp .LBB2_63 +; AVX512-NEXT: .LBB2_61: +; AVX512-NEXT: addb $127, %al +; AVX512-NEXT: # kill: def $al killed $al def $eax +; AVX512-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX512-NEXT: .LBB2_63: +; AVX512-NEXT: vpextrb $2, %xmm2, %ecx +; AVX512-NEXT: vpextrb $2, %xmm3, %ebp +; AVX512-NEXT: movl %ebp, %eax +; AVX512-NEXT: subb %cl, %al +; AVX512-NEXT: setns %al +; AVX512-NEXT: subb %cl, %bpl +; AVX512-NEXT: movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX512-NEXT: jno .LBB2_65 +; AVX512-NEXT: # %bb.64: +; AVX512-NEXT: addb $127, %al +; AVX512-NEXT: movl %eax, %ebp +; AVX512-NEXT: .LBB2_65: +; AVX512-NEXT: vpextrb $0, %xmm2, %ecx +; AVX512-NEXT: vpextrb $0, %xmm3, %edi +; AVX512-NEXT: movl %edi, %eax +; AVX512-NEXT: subb %cl, %al +; AVX512-NEXT: setns %al +; AVX512-NEXT: subb %cl, %dil +; AVX512-NEXT: movl %r15d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX512-NEXT: jno .LBB2_67 +; AVX512-NEXT: # %bb.66: +; AVX512-NEXT: addb $127, %al +; AVX512-NEXT: movl %eax, %edi +; AVX512-NEXT: .LBB2_67: +; AVX512-NEXT: vpextrb $1, %xmm2, %ecx +; AVX512-NEXT: vpextrb $1, %xmm3, %ebx +; AVX512-NEXT: movl %ebx, %eax +; AVX512-NEXT: subb %cl, %al +; AVX512-NEXT: setns %al +; AVX512-NEXT: subb %cl, %bl +; AVX512-NEXT: movl %r12d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX512-NEXT: jno .LBB2_69 +; AVX512-NEXT: # %bb.68: +; AVX512-NEXT: addb $127, %al +; AVX512-NEXT: movl %eax, %ebx +; AVX512-NEXT: .LBB2_69: +; AVX512-NEXT: vextracti32x4 $2, %zmm1, %xmm2 +; AVX512-NEXT: vpextrb $15, %xmm2, %ecx +; AVX512-NEXT: vextracti32x4 $2, %zmm0, %xmm3 +; AVX512-NEXT: vpextrb $15, %xmm3, %edx +; AVX512-NEXT: movl %edx, %eax +; AVX512-NEXT: subb %cl, %al +; AVX512-NEXT: setns %al +; AVX512-NEXT: subb %cl, %dl +; AVX512-NEXT: movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX512-NEXT: jno .LBB2_71 +; AVX512-NEXT: # %bb.70: +; AVX512-NEXT: addb $127, %al +; AVX512-NEXT: movl %eax, %edx +; AVX512-NEXT: .LBB2_71: +; AVX512-NEXT: vpextrb $14, %xmm2, %ecx +; AVX512-NEXT: vpextrb $14, %xmm3, %edi +; AVX512-NEXT: movl %edi, %eax +; AVX512-NEXT: subb %cl, %al +; AVX512-NEXT: setns %al +; AVX512-NEXT: subb %cl, %dil +; AVX512-NEXT: jno .LBB2_73 +; AVX512-NEXT: # %bb.72: +; AVX512-NEXT: addb $127, %al +; AVX512-NEXT: movl %eax, %edi +; AVX512-NEXT: .LBB2_73: +; AVX512-NEXT: vpextrb $13, %xmm2, %ecx +; AVX512-NEXT: vpextrb $13, %xmm3, %r10d +; AVX512-NEXT: movl %r10d, %eax +; AVX512-NEXT: subb %cl, %al +; AVX512-NEXT: setns %al +; AVX512-NEXT: subb %cl, %r10b +; AVX512-NEXT: jno .LBB2_75 +; AVX512-NEXT: # %bb.74: +; AVX512-NEXT: addb $127, %al +; AVX512-NEXT: 
movl %eax, %r10d +; AVX512-NEXT: .LBB2_75: +; AVX512-NEXT: vpextrb $12, %xmm2, %ecx +; AVX512-NEXT: vpextrb $12, %xmm3, %r12d +; AVX512-NEXT: movl %r12d, %eax +; AVX512-NEXT: subb %cl, %al +; AVX512-NEXT: setns %al +; AVX512-NEXT: subb %cl, %r12b +; AVX512-NEXT: movl %r12d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX512-NEXT: jno .LBB2_77 +; AVX512-NEXT: # %bb.76: +; AVX512-NEXT: addb $127, %al +; AVX512-NEXT: # kill: def $al killed $al def $eax +; AVX512-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX512-NEXT: .LBB2_77: +; AVX512-NEXT: vpextrb $11, %xmm2, %ecx +; AVX512-NEXT: vpextrb $11, %xmm3, %r14d +; AVX512-NEXT: movl %r14d, %eax +; AVX512-NEXT: subb %cl, %al +; AVX512-NEXT: setns %al +; AVX512-NEXT: subb %cl, %r14b +; AVX512-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX512-NEXT: jno .LBB2_79 +; AVX512-NEXT: # %bb.78: +; AVX512-NEXT: addb $127, %al +; AVX512-NEXT: movl %eax, %r14d +; AVX512-NEXT: .LBB2_79: +; AVX512-NEXT: vpextrb $10, %xmm2, %ecx +; AVX512-NEXT: vpextrb $10, %xmm3, %r13d +; AVX512-NEXT: movl %r13d, %eax +; AVX512-NEXT: subb %cl, %al +; AVX512-NEXT: setns %al +; AVX512-NEXT: subb %cl, %r13b +; AVX512-NEXT: movl %r11d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX512-NEXT: movl %ebp, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX512-NEXT: movl %ebx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX512-NEXT: movl %edx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX512-NEXT: jno .LBB2_81 +; AVX512-NEXT: # %bb.80: +; AVX512-NEXT: addb $127, %al +; AVX512-NEXT: movl %eax, %r13d +; AVX512-NEXT: .LBB2_81: +; AVX512-NEXT: vpextrb $9, %xmm2, %ecx +; AVX512-NEXT: vpextrb $9, %xmm3, %r8d +; AVX512-NEXT: movl %r8d, %eax +; AVX512-NEXT: subb %cl, %al +; AVX512-NEXT: setns %al +; AVX512-NEXT: subb %cl, %r8b +; AVX512-NEXT: jno .LBB2_83 +; AVX512-NEXT: # %bb.82: +; AVX512-NEXT: addb $127, %al +; AVX512-NEXT: movl %eax, %r8d +; AVX512-NEXT: .LBB2_83: +; AVX512-NEXT: vpextrb $8, %xmm2, %ecx +; AVX512-NEXT: vpextrb $8, %xmm3, %r15d +; AVX512-NEXT: movl %r15d, %eax +; AVX512-NEXT: subb %cl, %al +; AVX512-NEXT: setns %al +; AVX512-NEXT: subb %cl, %r15b +; AVX512-NEXT: jno .LBB2_85 +; AVX512-NEXT: # %bb.84: +; AVX512-NEXT: addb $127, %al +; AVX512-NEXT: movl %eax, %r15d +; AVX512-NEXT: .LBB2_85: +; AVX512-NEXT: vpextrb $7, %xmm2, %ecx +; AVX512-NEXT: vpextrb $7, %xmm3, %r12d +; AVX512-NEXT: movl %r12d, %eax +; AVX512-NEXT: subb %cl, %al +; AVX512-NEXT: setns %al +; AVX512-NEXT: subb %cl, %r12b +; AVX512-NEXT: movl %r12d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX512-NEXT: jno .LBB2_87 +; AVX512-NEXT: # %bb.86: +; AVX512-NEXT: addb $127, %al +; AVX512-NEXT: # kill: def $al killed $al def $eax +; AVX512-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX512-NEXT: .LBB2_87: +; AVX512-NEXT: vpextrb $6, %xmm2, %ecx +; AVX512-NEXT: vpextrb $6, %xmm3, %r12d +; AVX512-NEXT: movl %r12d, %eax +; AVX512-NEXT: subb %cl, %al +; AVX512-NEXT: setns %al +; AVX512-NEXT: subb %cl, %r12b +; AVX512-NEXT: jno .LBB2_89 +; AVX512-NEXT: # %bb.88: +; AVX512-NEXT: addb $127, %al +; AVX512-NEXT: movl %eax, %r12d +; AVX512-NEXT: .LBB2_89: +; AVX512-NEXT: vpextrb $5, %xmm2, %ecx +; AVX512-NEXT: vpextrb $5, %xmm3, %ebp +; AVX512-NEXT: movl %ebp, %eax +; AVX512-NEXT: subb %cl, %al +; AVX512-NEXT: setns %al +; AVX512-NEXT: subb %cl, %bpl +; AVX512-NEXT: jno .LBB2_91 +; AVX512-NEXT: # %bb.90: +; AVX512-NEXT: addb $127, %al +; AVX512-NEXT: movl %eax, %ebp +; AVX512-NEXT: .LBB2_91: +; AVX512-NEXT: vpextrb $4, %xmm2, %ecx +; AVX512-NEXT: vpextrb $4, %xmm3, %esi +; 
AVX512-NEXT: movl %esi, %eax +; AVX512-NEXT: subb %cl, %al +; AVX512-NEXT: setns %al +; AVX512-NEXT: subb %cl, %sil +; AVX512-NEXT: jno .LBB2_93 +; AVX512-NEXT: # %bb.92: +; AVX512-NEXT: addb $127, %al +; AVX512-NEXT: movl %eax, %esi +; AVX512-NEXT: .LBB2_93: +; AVX512-NEXT: vpextrb $3, %xmm2, %ecx +; AVX512-NEXT: vpextrb $3, %xmm3, %edx +; AVX512-NEXT: movl %edx, %eax +; AVX512-NEXT: subb %cl, %al +; AVX512-NEXT: setns %al +; AVX512-NEXT: subb %cl, %dl +; AVX512-NEXT: movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX512-NEXT: jno .LBB2_95 +; AVX512-NEXT: # %bb.94: +; AVX512-NEXT: addb $127, %al +; AVX512-NEXT: movl %eax, %edx +; AVX512-NEXT: .LBB2_95: +; AVX512-NEXT: vpextrb $2, %xmm2, %ecx +; AVX512-NEXT: vpextrb $2, %xmm3, %edi +; AVX512-NEXT: movl %edi, %eax +; AVX512-NEXT: subb %cl, %al +; AVX512-NEXT: setns %al +; AVX512-NEXT: subb %cl, %dil +; AVX512-NEXT: jno .LBB2_97 +; AVX512-NEXT: # %bb.96: +; AVX512-NEXT: addb $127, %al +; AVX512-NEXT: movl %eax, %edi +; AVX512-NEXT: .LBB2_97: +; AVX512-NEXT: vpextrb $0, %xmm2, %ecx +; AVX512-NEXT: vpextrb $0, %xmm3, %ebx +; AVX512-NEXT: movl %ebx, %eax +; AVX512-NEXT: subb %cl, %al +; AVX512-NEXT: setns %al +; AVX512-NEXT: subb %cl, %bl +; AVX512-NEXT: jno .LBB2_99 +; AVX512-NEXT: # %bb.98: +; AVX512-NEXT: addb $127, %al +; AVX512-NEXT: movl %eax, %ebx +; AVX512-NEXT: .LBB2_99: +; AVX512-NEXT: vpextrb $1, %xmm2, %ecx +; AVX512-NEXT: vpextrb $1, %xmm3, %r11d +; AVX512-NEXT: movl %r11d, %eax +; AVX512-NEXT: subb %cl, %al +; AVX512-NEXT: setns %al +; AVX512-NEXT: subb %cl, %r11b +; AVX512-NEXT: movl %r11d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX512-NEXT: jno .LBB2_101 +; AVX512-NEXT: # %bb.100: +; AVX512-NEXT: addb $127, %al +; AVX512-NEXT: # kill: def $al killed $al def $eax +; AVX512-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX512-NEXT: .LBB2_101: +; AVX512-NEXT: vextracti32x4 $3, %zmm1, %xmm1 +; AVX512-NEXT: vpextrb $15, %xmm1, %ecx +; AVX512-NEXT: vextracti32x4 $3, %zmm0, %xmm0 +; AVX512-NEXT: vpextrb $15, %xmm0, %r11d +; AVX512-NEXT: movl %r11d, %eax +; AVX512-NEXT: subb %cl, %al +; AVX512-NEXT: setns %al +; AVX512-NEXT: subb %cl, %r11b +; AVX512-NEXT: jno .LBB2_103 +; AVX512-NEXT: # %bb.102: +; AVX512-NEXT: addb $127, %al +; AVX512-NEXT: movl %eax, %r11d +; AVX512-NEXT: .LBB2_103: +; AVX512-NEXT: vpextrb $14, %xmm1, %ecx +; AVX512-NEXT: vpextrb $14, %xmm0, %r9d +; AVX512-NEXT: movl %r9d, %eax +; AVX512-NEXT: subb %cl, %al +; AVX512-NEXT: setns %al +; AVX512-NEXT: subb %cl, %r9b +; AVX512-NEXT: movl %r9d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX512-NEXT: jno .LBB2_105 +; AVX512-NEXT: # %bb.104: +; AVX512-NEXT: addb $127, %al +; AVX512-NEXT: # kill: def $al killed $al def $eax +; AVX512-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX512-NEXT: .LBB2_105: +; AVX512-NEXT: vpextrb $13, %xmm1, %ecx +; AVX512-NEXT: vpextrb $13, %xmm0, %r9d +; AVX512-NEXT: movl %r9d, %eax +; AVX512-NEXT: subb %cl, %al +; AVX512-NEXT: setns %al +; AVX512-NEXT: subb %cl, %r9b +; AVX512-NEXT: movl %r14d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX512-NEXT: movl %r13d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX512-NEXT: movl %r8d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX512-NEXT: movl %r15d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX512-NEXT: movl %r12d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX512-NEXT: movl %ebp, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX512-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX512-NEXT: movl %edx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; 
AVX512-NEXT: movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX512-NEXT: movl %ebx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX512-NEXT: jno .LBB2_107 +; AVX512-NEXT: # %bb.106: +; AVX512-NEXT: addb $127, %al +; AVX512-NEXT: movl %eax, %r9d +; AVX512-NEXT: .LBB2_107: +; AVX512-NEXT: vpextrb $12, %xmm1, %ecx +; AVX512-NEXT: vpextrb $12, %xmm0, %edx +; AVX512-NEXT: movl %edx, %eax +; AVX512-NEXT: subb %cl, %al +; AVX512-NEXT: setns %al +; AVX512-NEXT: subb %cl, %dl +; AVX512-NEXT: movl %r10d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX512-NEXT: jno .LBB2_109 +; AVX512-NEXT: # %bb.108: +; AVX512-NEXT: addb $127, %al +; AVX512-NEXT: movl %eax, %edx +; AVX512-NEXT: .LBB2_109: +; AVX512-NEXT: vpextrb $11, %xmm1, %ecx +; AVX512-NEXT: vpextrb $11, %xmm0, %r13d +; AVX512-NEXT: movl %r13d, %eax +; AVX512-NEXT: subb %cl, %al +; AVX512-NEXT: setns %al +; AVX512-NEXT: subb %cl, %r13b +; AVX512-NEXT: jno .LBB2_111 +; AVX512-NEXT: # %bb.110: +; AVX512-NEXT: addb $127, %al +; AVX512-NEXT: movl %eax, %r13d +; AVX512-NEXT: .LBB2_111: +; AVX512-NEXT: vpextrb $10, %xmm1, %ecx +; AVX512-NEXT: vpextrb $10, %xmm0, %r15d +; AVX512-NEXT: movl %r15d, %eax +; AVX512-NEXT: subb %cl, %al +; AVX512-NEXT: setns %al +; AVX512-NEXT: subb %cl, %r15b +; AVX512-NEXT: jno .LBB2_113 +; AVX512-NEXT: # %bb.112: +; AVX512-NEXT: addb $127, %al +; AVX512-NEXT: movl %eax, %r15d +; AVX512-NEXT: .LBB2_113: +; AVX512-NEXT: vpextrb $9, %xmm1, %ecx +; AVX512-NEXT: vpextrb $9, %xmm0, %r14d +; AVX512-NEXT: movl %r14d, %eax +; AVX512-NEXT: subb %cl, %al +; AVX512-NEXT: setns %al +; AVX512-NEXT: subb %cl, %r14b +; AVX512-NEXT: jno .LBB2_115 +; AVX512-NEXT: # %bb.114: +; AVX512-NEXT: addb $127, %al +; AVX512-NEXT: movl %eax, %r14d +; AVX512-NEXT: .LBB2_115: +; AVX512-NEXT: vpextrb $8, %xmm1, %ecx +; AVX512-NEXT: vpextrb $8, %xmm0, %ebp +; AVX512-NEXT: movl %ebp, %eax +; AVX512-NEXT: subb %cl, %al +; AVX512-NEXT: setns %al +; AVX512-NEXT: subb %cl, %bpl +; AVX512-NEXT: jno .LBB2_117 +; AVX512-NEXT: # %bb.116: +; AVX512-NEXT: addb $127, %al +; AVX512-NEXT: movl %eax, %ebp +; AVX512-NEXT: .LBB2_117: +; AVX512-NEXT: vpextrb $7, %xmm1, %ecx +; AVX512-NEXT: vpextrb $7, %xmm0, %edi +; AVX512-NEXT: movl %edi, %eax +; AVX512-NEXT: subb %cl, %al +; AVX512-NEXT: setns %al +; AVX512-NEXT: subb %cl, %dil +; AVX512-NEXT: movl %edx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX512-NEXT: jno .LBB2_119 +; AVX512-NEXT: # %bb.118: +; AVX512-NEXT: addb $127, %al +; AVX512-NEXT: movl %eax, %edi +; AVX512-NEXT: .LBB2_119: +; AVX512-NEXT: vpextrb $6, %xmm1, %edx +; AVX512-NEXT: vpextrb $6, %xmm0, %eax +; AVX512-NEXT: movl %eax, %ecx +; AVX512-NEXT: subb %dl, %cl +; AVX512-NEXT: setns %cl +; AVX512-NEXT: subb %dl, %al +; AVX512-NEXT: jno .LBB2_121 +; AVX512-NEXT: # %bb.120: +; AVX512-NEXT: addb $127, %cl +; AVX512-NEXT: movl %ecx, %eax +; AVX512-NEXT: .LBB2_121: +; AVX512-NEXT: vpextrb $5, %xmm1, %ebx +; AVX512-NEXT: vpextrb $5, %xmm0, %ecx +; AVX512-NEXT: movl %ecx, %edx +; AVX512-NEXT: subb %bl, %dl +; AVX512-NEXT: setns %dl +; AVX512-NEXT: subb %bl, %cl +; AVX512-NEXT: jno .LBB2_123 +; AVX512-NEXT: # %bb.122: +; AVX512-NEXT: addb $127, %dl +; AVX512-NEXT: movl %edx, %ecx +; AVX512-NEXT: .LBB2_123: +; AVX512-NEXT: vpextrb $4, %xmm1, %esi +; AVX512-NEXT: vpextrb $4, %xmm0, %edx +; AVX512-NEXT: movl %edx, %ebx +; AVX512-NEXT: subb %sil, %bl +; AVX512-NEXT: setns %bl +; AVX512-NEXT: subb %sil, %dl +; AVX512-NEXT: jno .LBB2_125 +; AVX512-NEXT: # %bb.124: +; AVX512-NEXT: addb $127, %bl +; AVX512-NEXT: movl %ebx, %edx +; AVX512-NEXT: .LBB2_125: +; 
AVX512-NEXT: vpextrb $3, %xmm1, %esi +; AVX512-NEXT: vpextrb $3, %xmm0, %r8d +; AVX512-NEXT: movl %r8d, %ebx +; AVX512-NEXT: subb %sil, %bl +; AVX512-NEXT: setns %bl +; AVX512-NEXT: subb %sil, %r8b +; AVX512-NEXT: movl %r9d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX512-NEXT: jno .LBB2_127 +; AVX512-NEXT: # %bb.126: +; AVX512-NEXT: addb $127, %bl +; AVX512-NEXT: movl %ebx, %r8d +; AVX512-NEXT: .LBB2_127: +; AVX512-NEXT: vpextrb $2, %xmm1, %esi +; AVX512-NEXT: vpextrb $2, %xmm0, %r9d +; AVX512-NEXT: movl %r9d, %ebx +; AVX512-NEXT: subb %sil, %bl +; AVX512-NEXT: setns %bl +; AVX512-NEXT: subb %sil, %r9b +; AVX512-NEXT: jno .LBB2_129 +; AVX512-NEXT: # %bb.128: +; AVX512-NEXT: addb $127, %bl +; AVX512-NEXT: movl %ebx, %r9d +; AVX512-NEXT: .LBB2_129: +; AVX512-NEXT: vpextrb $0, %xmm1, %esi +; AVX512-NEXT: vpextrb $0, %xmm0, %r10d +; AVX512-NEXT: movl %r10d, %ebx +; AVX512-NEXT: subb %sil, %bl +; AVX512-NEXT: setns %bl +; AVX512-NEXT: movl %r11d, %r12d +; AVX512-NEXT: subb %sil, %r10b +; AVX512-NEXT: jno .LBB2_131 +; AVX512-NEXT: # %bb.130: +; AVX512-NEXT: addb $127, %bl +; AVX512-NEXT: movl %ebx, %r10d +; AVX512-NEXT: .LBB2_131: +; AVX512-NEXT: vpextrb $1, %xmm1, %esi +; AVX512-NEXT: vpextrb $1, %xmm0, %r11d +; AVX512-NEXT: movl %r11d, %ebx +; AVX512-NEXT: subb %sil, %bl +; AVX512-NEXT: setns %bl +; AVX512-NEXT: subb %sil, %r11b +; AVX512-NEXT: jno .LBB2_133 +; AVX512-NEXT: # %bb.132: +; AVX512-NEXT: addb $127, %bl +; AVX512-NEXT: movl %ebx, %r11d +; AVX512-NEXT: .LBB2_133: +; AVX512-NEXT: movzbl %r10b, %esi +; AVX512-NEXT: vmovd %esi, %xmm0 +; AVX512-NEXT: movzbl %r11b, %esi +; AVX512-NEXT: vpinsrb $1, %esi, %xmm0, %xmm0 +; AVX512-NEXT: movzbl %r9b, %esi +; AVX512-NEXT: vpinsrb $2, %esi, %xmm0, %xmm0 +; AVX512-NEXT: movzbl %r8b, %esi +; AVX512-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; AVX512-NEXT: movzbl %dl, %edx +; AVX512-NEXT: vpinsrb $4, %edx, %xmm0, %xmm0 +; AVX512-NEXT: movzbl %cl, %ecx +; AVX512-NEXT: vpinsrb $5, %ecx, %xmm0, %xmm0 +; AVX512-NEXT: movzbl %al, %eax +; AVX512-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 +; AVX512-NEXT: movzbl %dil, %eax +; AVX512-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; AVX512-NEXT: movzbl %bpl, %eax +; AVX512-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0 +; AVX512-NEXT: movzbl %r14b, %eax +; AVX512-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0 +; AVX512-NEXT: movzbl %r15b, %eax +; AVX512-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0 +; AVX512-NEXT: movzbl %r13b, %eax +; AVX512-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; AVX512-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX512-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0 +; AVX512-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX512-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0 +; AVX512-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX512-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0 +; AVX512-NEXT: movzbl %r12b, %eax +; AVX512-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; AVX512-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX512-NEXT: vmovd %eax, %xmm1 +; AVX512-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX512-NEXT: vpinsrb $1, %eax, %xmm1, %xmm1 +; AVX512-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX512-NEXT: vpinsrb $2, %eax, %xmm1, %xmm1 +; AVX512-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX512-NEXT: vpinsrb $3, %eax, %xmm1, %xmm1 +; AVX512-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX512-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 +; AVX512-NEXT: movzbl 
{{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX512-NEXT: vpinsrb $5, %eax, %xmm1, %xmm1 +; AVX512-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX512-NEXT: vpinsrb $6, %eax, %xmm1, %xmm1 +; AVX512-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX512-NEXT: vpinsrb $7, %eax, %xmm1, %xmm1 +; AVX512-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX512-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; AVX512-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX512-NEXT: vpinsrb $9, %eax, %xmm1, %xmm1 +; AVX512-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX512-NEXT: vpinsrb $10, %eax, %xmm1, %xmm1 +; AVX512-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX512-NEXT: vpinsrb $11, %eax, %xmm1, %xmm1 +; AVX512-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX512-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 +; AVX512-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX512-NEXT: vpinsrb $13, %eax, %xmm1, %xmm1 +; AVX512-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX512-NEXT: vpinsrb $14, %eax, %xmm1, %xmm1 +; AVX512-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX512-NEXT: vpinsrb $15, %eax, %xmm1, %xmm1 +; AVX512-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX512-NEXT: vmovd %eax, %xmm2 +; AVX512-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX512-NEXT: vpinsrb $1, %eax, %xmm2, %xmm2 +; AVX512-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX512-NEXT: vpinsrb $2, %eax, %xmm2, %xmm2 +; AVX512-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX512-NEXT: vpinsrb $3, %eax, %xmm2, %xmm2 +; AVX512-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX512-NEXT: vpinsrb $4, %eax, %xmm2, %xmm2 +; AVX512-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX512-NEXT: vpinsrb $5, %eax, %xmm2, %xmm2 +; AVX512-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX512-NEXT: vpinsrb $6, %eax, %xmm2, %xmm2 +; AVX512-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX512-NEXT: vpinsrb $7, %eax, %xmm2, %xmm2 +; AVX512-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX512-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2 +; AVX512-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX512-NEXT: vpinsrb $9, %eax, %xmm2, %xmm2 +; AVX512-NEXT: movzbl (%rsp), %eax # 1-byte Folded Reload +; AVX512-NEXT: vpinsrb $10, %eax, %xmm2, %xmm2 +; AVX512-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX512-NEXT: vpinsrb $11, %eax, %xmm2, %xmm2 +; AVX512-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX512-NEXT: vpinsrb $12, %eax, %xmm2, %xmm2 +; AVX512-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX512-NEXT: vpinsrb $13, %eax, %xmm2, %xmm2 +; AVX512-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX512-NEXT: vpinsrb $14, %eax, %xmm2, %xmm2 +; AVX512-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX512-NEXT: vpinsrb $15, %eax, %xmm2, %xmm2 +; AVX512-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX512-NEXT: vmovd %eax, %xmm3 +; AVX512-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX512-NEXT: vpinsrb $1, %eax, %xmm3, %xmm3 +; 
AVX512-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX512-NEXT: vpinsrb $2, %eax, %xmm3, %xmm3 +; AVX512-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX512-NEXT: vpinsrb $3, %eax, %xmm3, %xmm3 +; AVX512-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX512-NEXT: vpinsrb $4, %eax, %xmm3, %xmm3 +; AVX512-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX512-NEXT: vpinsrb $5, %eax, %xmm3, %xmm3 +; AVX512-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX512-NEXT: vpinsrb $6, %eax, %xmm3, %xmm3 +; AVX512-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX512-NEXT: vpinsrb $7, %eax, %xmm3, %xmm3 +; AVX512-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX512-NEXT: vpinsrb $8, %eax, %xmm3, %xmm3 +; AVX512-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX512-NEXT: vpinsrb $9, %eax, %xmm3, %xmm3 +; AVX512-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX512-NEXT: vpinsrb $10, %eax, %xmm3, %xmm3 +; AVX512-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX512-NEXT: vpinsrb $11, %eax, %xmm3, %xmm3 +; AVX512-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX512-NEXT: vpinsrb $12, %eax, %xmm3, %xmm3 +; AVX512-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX512-NEXT: vpinsrb $13, %eax, %xmm3, %xmm3 +; AVX512-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX512-NEXT: vpinsrb $14, %eax, %xmm3, %xmm3 +; AVX512-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX512-NEXT: vpinsrb $15, %eax, %xmm3, %xmm3 +; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 +; AVX512-NEXT: vinserti128 $1, %xmm2, %ymm3, %ymm1 +; AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; AVX512-NEXT: addq $76, %rsp +; AVX512-NEXT: popq %rbx +; AVX512-NEXT: popq %r12 +; AVX512-NEXT: popq %r13 +; AVX512-NEXT: popq %r14 +; AVX512-NEXT: popq %r15 +; AVX512-NEXT: popq %rbp +; AVX512-NEXT: retq + %z = call <64 x i8> @llvm.ssub.sat.v64i8(<64 x i8> %x, <64 x i8> %y) + ret <64 x i8> %z +} + +define <8 x i16> @v8i16(<8 x i16> %x, <8 x i16> %y) nounwind { +; SSE2-LABEL: v8i16: +; SSE2: # %bb.0: +; SSE2-NEXT: pushq %rbp +; SSE2-NEXT: pushq %rbx +; SSE2-NEXT: movd %xmm1, %eax +; SSE2-NEXT: movd %xmm0, %r8d +; SSE2-NEXT: xorl %ecx, %ecx +; SSE2-NEXT: movl %r8d, %edx +; SSE2-NEXT: subw %ax, %dx +; SSE2-NEXT: setns %cl +; SSE2-NEXT: addl $32767, %ecx # imm = 0x7FFF +; SSE2-NEXT: subw %ax, %r8w +; SSE2-NEXT: cmovol %ecx, %r8d +; SSE2-NEXT: pextrw $1, %xmm1, %eax +; SSE2-NEXT: pextrw $1, %xmm0, %r9d +; SSE2-NEXT: xorl %ecx, %ecx +; SSE2-NEXT: movl %r9d, %edx +; SSE2-NEXT: subw %ax, %dx +; SSE2-NEXT: setns %cl +; SSE2-NEXT: addl $32767, %ecx # imm = 0x7FFF +; SSE2-NEXT: subw %ax, %r9w +; SSE2-NEXT: cmovol %ecx, %r9d +; SSE2-NEXT: pextrw $2, %xmm1, %eax +; SSE2-NEXT: pextrw $2, %xmm0, %r10d +; SSE2-NEXT: xorl %ecx, %ecx +; SSE2-NEXT: movl %r10d, %esi +; SSE2-NEXT: subw %ax, %si +; SSE2-NEXT: setns %cl +; SSE2-NEXT: addl $32767, %ecx # imm = 0x7FFF +; SSE2-NEXT: subw %ax, %r10w +; SSE2-NEXT: cmovol %ecx, %r10d +; SSE2-NEXT: pextrw $3, %xmm1, %eax +; SSE2-NEXT: pextrw $3, %xmm0, %r11d +; SSE2-NEXT: xorl %ecx, %ecx +; SSE2-NEXT: movl %r11d, %edi +; SSE2-NEXT: subw %ax, %di +; SSE2-NEXT: setns %cl +; SSE2-NEXT: addl $32767, %ecx # imm = 0x7FFF +; SSE2-NEXT: subw %ax, %r11w +; SSE2-NEXT: cmovol %ecx, %r11d +; SSE2-NEXT: pextrw $4, 
%xmm1, %eax +; SSE2-NEXT: pextrw $4, %xmm0, %edi +; SSE2-NEXT: xorl %ecx, %ecx +; SSE2-NEXT: movl %edi, %edx +; SSE2-NEXT: subw %ax, %dx +; SSE2-NEXT: setns %cl +; SSE2-NEXT: addl $32767, %ecx # imm = 0x7FFF +; SSE2-NEXT: subw %ax, %di +; SSE2-NEXT: cmovol %ecx, %edi +; SSE2-NEXT: pextrw $5, %xmm1, %ecx +; SSE2-NEXT: pextrw $5, %xmm0, %eax +; SSE2-NEXT: xorl %edx, %edx +; SSE2-NEXT: movl %eax, %esi +; SSE2-NEXT: subw %cx, %si +; SSE2-NEXT: setns %dl +; SSE2-NEXT: addl $32767, %edx # imm = 0x7FFF +; SSE2-NEXT: subw %cx, %ax +; SSE2-NEXT: cmovol %edx, %eax +; SSE2-NEXT: pextrw $6, %xmm1, %edx +; SSE2-NEXT: pextrw $6, %xmm0, %ecx +; SSE2-NEXT: xorl %esi, %esi +; SSE2-NEXT: movl %ecx, %ebx +; SSE2-NEXT: subw %dx, %bx +; SSE2-NEXT: setns %sil +; SSE2-NEXT: addl $32767, %esi # imm = 0x7FFF +; SSE2-NEXT: subw %dx, %cx +; SSE2-NEXT: cmovol %esi, %ecx +; SSE2-NEXT: pextrw $7, %xmm1, %edx +; SSE2-NEXT: pextrw $7, %xmm0, %esi +; SSE2-NEXT: xorl %ebx, %ebx +; SSE2-NEXT: movl %esi, %ebp +; SSE2-NEXT: subw %dx, %bp +; SSE2-NEXT: setns %bl +; SSE2-NEXT: addl $32767, %ebx # imm = 0x7FFF +; SSE2-NEXT: subw %dx, %si +; SSE2-NEXT: cmovol %ebx, %esi +; SSE2-NEXT: movd %esi, %xmm0 +; SSE2-NEXT: movd %ecx, %xmm1 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; SSE2-NEXT: movd %eax, %xmm0 +; SSE2-NEXT: movd %edi, %xmm2 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; SSE2-NEXT: movd %r11d, %xmm0 +; SSE2-NEXT: movd %r10d, %xmm1 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; SSE2-NEXT: movd %r9d, %xmm3 +; SSE2-NEXT: movd %r8d, %xmm0 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; SSE2-NEXT: popq %rbx +; SSE2-NEXT: popq %rbp +; SSE2-NEXT: retq +; +; SSSE3-LABEL: v8i16: +; SSSE3: # %bb.0: +; SSSE3-NEXT: pushq %rbp +; SSSE3-NEXT: pushq %rbx +; SSSE3-NEXT: movd %xmm1, %eax +; SSSE3-NEXT: movd %xmm0, %r8d +; SSSE3-NEXT: xorl %ecx, %ecx +; SSSE3-NEXT: movl %r8d, %edx +; SSSE3-NEXT: subw %ax, %dx +; SSSE3-NEXT: setns %cl +; SSSE3-NEXT: addl $32767, %ecx # imm = 0x7FFF +; SSSE3-NEXT: subw %ax, %r8w +; SSSE3-NEXT: cmovol %ecx, %r8d +; SSSE3-NEXT: pextrw $1, %xmm1, %eax +; SSSE3-NEXT: pextrw $1, %xmm0, %r9d +; SSSE3-NEXT: xorl %ecx, %ecx +; SSSE3-NEXT: movl %r9d, %edx +; SSSE3-NEXT: subw %ax, %dx +; SSSE3-NEXT: setns %cl +; SSSE3-NEXT: addl $32767, %ecx # imm = 0x7FFF +; SSSE3-NEXT: subw %ax, %r9w +; SSSE3-NEXT: cmovol %ecx, %r9d +; SSSE3-NEXT: pextrw $2, %xmm1, %eax +; SSSE3-NEXT: pextrw $2, %xmm0, %r10d +; SSSE3-NEXT: xorl %ecx, %ecx +; SSSE3-NEXT: movl %r10d, %esi +; SSSE3-NEXT: subw %ax, %si +; SSSE3-NEXT: setns %cl +; SSSE3-NEXT: addl $32767, %ecx # imm = 0x7FFF +; SSSE3-NEXT: subw %ax, %r10w +; SSSE3-NEXT: cmovol %ecx, %r10d +; SSSE3-NEXT: pextrw $3, %xmm1, %eax +; SSSE3-NEXT: pextrw $3, %xmm0, %r11d +; SSSE3-NEXT: xorl %ecx, %ecx +; SSSE3-NEXT: movl %r11d, %edi +; SSSE3-NEXT: subw %ax, %di +; SSSE3-NEXT: setns %cl +; SSSE3-NEXT: addl $32767, %ecx # imm = 0x7FFF +; SSSE3-NEXT: subw %ax, %r11w +; SSSE3-NEXT: cmovol %ecx, %r11d +; SSSE3-NEXT: pextrw $4, %xmm1, %eax +; SSSE3-NEXT: pextrw $4, %xmm0, %edi +; SSSE3-NEXT: xorl %ecx, %ecx +; SSSE3-NEXT: movl %edi, %edx +; SSSE3-NEXT: subw %ax, %dx 
+; SSSE3-NEXT: setns %cl +; SSSE3-NEXT: addl $32767, %ecx # imm = 0x7FFF +; SSSE3-NEXT: subw %ax, %di +; SSSE3-NEXT: cmovol %ecx, %edi +; SSSE3-NEXT: pextrw $5, %xmm1, %ecx +; SSSE3-NEXT: pextrw $5, %xmm0, %eax +; SSSE3-NEXT: xorl %edx, %edx +; SSSE3-NEXT: movl %eax, %esi +; SSSE3-NEXT: subw %cx, %si +; SSSE3-NEXT: setns %dl +; SSSE3-NEXT: addl $32767, %edx # imm = 0x7FFF +; SSSE3-NEXT: subw %cx, %ax +; SSSE3-NEXT: cmovol %edx, %eax +; SSSE3-NEXT: pextrw $6, %xmm1, %edx +; SSSE3-NEXT: pextrw $6, %xmm0, %ecx +; SSSE3-NEXT: xorl %esi, %esi +; SSSE3-NEXT: movl %ecx, %ebx +; SSSE3-NEXT: subw %dx, %bx +; SSSE3-NEXT: setns %sil +; SSSE3-NEXT: addl $32767, %esi # imm = 0x7FFF +; SSSE3-NEXT: subw %dx, %cx +; SSSE3-NEXT: cmovol %esi, %ecx +; SSSE3-NEXT: pextrw $7, %xmm1, %edx +; SSSE3-NEXT: pextrw $7, %xmm0, %esi +; SSSE3-NEXT: xorl %ebx, %ebx +; SSSE3-NEXT: movl %esi, %ebp +; SSSE3-NEXT: subw %dx, %bp +; SSSE3-NEXT: setns %bl +; SSSE3-NEXT: addl $32767, %ebx # imm = 0x7FFF +; SSSE3-NEXT: subw %dx, %si +; SSSE3-NEXT: cmovol %ebx, %esi +; SSSE3-NEXT: movd %esi, %xmm0 +; SSSE3-NEXT: movd %ecx, %xmm1 +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; SSSE3-NEXT: movd %eax, %xmm0 +; SSSE3-NEXT: movd %edi, %xmm2 +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] +; SSSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; SSSE3-NEXT: movd %r11d, %xmm0 +; SSSE3-NEXT: movd %r10d, %xmm1 +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; SSSE3-NEXT: movd %r9d, %xmm3 +; SSSE3-NEXT: movd %r8d, %xmm0 +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] +; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; SSSE3-NEXT: popq %rbx +; SSSE3-NEXT: popq %rbp +; SSSE3-NEXT: retq +; +; SSE41-LABEL: v8i16: +; SSE41: # %bb.0: +; SSE41-NEXT: pushq %rbp +; SSE41-NEXT: pushq %rbx +; SSE41-NEXT: pextrw $7, %xmm1, %eax +; SSE41-NEXT: pextrw $7, %xmm0, %r8d +; SSE41-NEXT: xorl %ecx, %ecx +; SSE41-NEXT: movl %r8d, %edx +; SSE41-NEXT: subw %ax, %dx +; SSE41-NEXT: setns %cl +; SSE41-NEXT: addl $32767, %ecx # imm = 0x7FFF +; SSE41-NEXT: subw %ax, %r8w +; SSE41-NEXT: cmovol %ecx, %r8d +; SSE41-NEXT: pextrw $6, %xmm1, %eax +; SSE41-NEXT: pextrw $6, %xmm0, %r9d +; SSE41-NEXT: xorl %edx, %edx +; SSE41-NEXT: movl %r9d, %esi +; SSE41-NEXT: subw %ax, %si +; SSE41-NEXT: setns %dl +; SSE41-NEXT: addl $32767, %edx # imm = 0x7FFF +; SSE41-NEXT: subw %ax, %r9w +; SSE41-NEXT: cmovol %edx, %r9d +; SSE41-NEXT: pextrw $5, %xmm1, %eax +; SSE41-NEXT: pextrw $5, %xmm0, %r10d +; SSE41-NEXT: xorl %esi, %esi +; SSE41-NEXT: movl %r10d, %edi +; SSE41-NEXT: subw %ax, %di +; SSE41-NEXT: setns %sil +; SSE41-NEXT: addl $32767, %esi # imm = 0x7FFF +; SSE41-NEXT: subw %ax, %r10w +; SSE41-NEXT: cmovol %esi, %r10d +; SSE41-NEXT: pextrw $4, %xmm1, %eax +; SSE41-NEXT: pextrw $4, %xmm0, %r11d +; SSE41-NEXT: xorl %edi, %edi +; SSE41-NEXT: movl %r11d, %ecx +; SSE41-NEXT: subw %ax, %cx +; SSE41-NEXT: setns %dil +; SSE41-NEXT: addl $32767, %edi # imm = 0x7FFF +; SSE41-NEXT: subw %ax, %r11w +; SSE41-NEXT: cmovol %edi, %r11d +; SSE41-NEXT: pextrw $3, %xmm1, %eax +; SSE41-NEXT: pextrw $3, %xmm0, %edi +; SSE41-NEXT: xorl %ecx, %ecx +; SSE41-NEXT: movl %edi, %edx +; SSE41-NEXT: subw %ax, %dx +; SSE41-NEXT: setns %cl +; SSE41-NEXT: addl $32767, %ecx # imm = 0x7FFF 
+; SSE41-NEXT: subw %ax, %di +; SSE41-NEXT: cmovol %ecx, %edi +; SSE41-NEXT: pextrw $2, %xmm1, %ecx +; SSE41-NEXT: pextrw $2, %xmm0, %eax +; SSE41-NEXT: xorl %edx, %edx +; SSE41-NEXT: movl %eax, %esi +; SSE41-NEXT: subw %cx, %si +; SSE41-NEXT: setns %dl +; SSE41-NEXT: addl $32767, %edx # imm = 0x7FFF +; SSE41-NEXT: subw %cx, %ax +; SSE41-NEXT: cmovol %edx, %eax +; SSE41-NEXT: movd %xmm1, %ecx +; SSE41-NEXT: movd %xmm0, %edx +; SSE41-NEXT: xorl %esi, %esi +; SSE41-NEXT: movl %edx, %ebx +; SSE41-NEXT: subw %cx, %bx +; SSE41-NEXT: setns %sil +; SSE41-NEXT: addl $32767, %esi # imm = 0x7FFF +; SSE41-NEXT: subw %cx, %dx +; SSE41-NEXT: cmovol %esi, %edx +; SSE41-NEXT: pextrw $1, %xmm1, %ecx +; SSE41-NEXT: pextrw $1, %xmm0, %esi +; SSE41-NEXT: xorl %ebx, %ebx +; SSE41-NEXT: movl %esi, %ebp +; SSE41-NEXT: subw %cx, %bp +; SSE41-NEXT: setns %bl +; SSE41-NEXT: addl $32767, %ebx # imm = 0x7FFF +; SSE41-NEXT: subw %cx, %si +; SSE41-NEXT: cmovol %ebx, %esi +; SSE41-NEXT: movd %edx, %xmm0 +; SSE41-NEXT: pinsrw $1, %esi, %xmm0 +; SSE41-NEXT: pinsrw $2, %eax, %xmm0 +; SSE41-NEXT: pinsrw $3, %edi, %xmm0 +; SSE41-NEXT: pinsrw $4, %r11d, %xmm0 +; SSE41-NEXT: pinsrw $5, %r10d, %xmm0 +; SSE41-NEXT: pinsrw $6, %r9d, %xmm0 +; SSE41-NEXT: pinsrw $7, %r8d, %xmm0 +; SSE41-NEXT: popq %rbx +; SSE41-NEXT: popq %rbp +; SSE41-NEXT: retq +; +; AVX-LABEL: v8i16: +; AVX: # %bb.0: +; AVX-NEXT: pushq %rbp +; AVX-NEXT: pushq %rbx +; AVX-NEXT: vpextrw $7, %xmm1, %eax +; AVX-NEXT: vpextrw $7, %xmm0, %r8d +; AVX-NEXT: xorl %ecx, %ecx +; AVX-NEXT: movl %r8d, %edx +; AVX-NEXT: subw %ax, %dx +; AVX-NEXT: setns %cl +; AVX-NEXT: addl $32767, %ecx # imm = 0x7FFF +; AVX-NEXT: subw %ax, %r8w +; AVX-NEXT: cmovol %ecx, %r8d +; AVX-NEXT: vpextrw $6, %xmm1, %eax +; AVX-NEXT: vpextrw $6, %xmm0, %r9d +; AVX-NEXT: xorl %edx, %edx +; AVX-NEXT: movl %r9d, %esi +; AVX-NEXT: subw %ax, %si +; AVX-NEXT: setns %dl +; AVX-NEXT: addl $32767, %edx # imm = 0x7FFF +; AVX-NEXT: subw %ax, %r9w +; AVX-NEXT: cmovol %edx, %r9d +; AVX-NEXT: vpextrw $5, %xmm1, %eax +; AVX-NEXT: vpextrw $5, %xmm0, %r10d +; AVX-NEXT: xorl %esi, %esi +; AVX-NEXT: movl %r10d, %edi +; AVX-NEXT: subw %ax, %di +; AVX-NEXT: setns %sil +; AVX-NEXT: addl $32767, %esi # imm = 0x7FFF +; AVX-NEXT: subw %ax, %r10w +; AVX-NEXT: cmovol %esi, %r10d +; AVX-NEXT: vpextrw $4, %xmm1, %eax +; AVX-NEXT: vpextrw $4, %xmm0, %r11d +; AVX-NEXT: xorl %edi, %edi +; AVX-NEXT: movl %r11d, %ecx +; AVX-NEXT: subw %ax, %cx +; AVX-NEXT: setns %dil +; AVX-NEXT: addl $32767, %edi # imm = 0x7FFF +; AVX-NEXT: subw %ax, %r11w +; AVX-NEXT: cmovol %edi, %r11d +; AVX-NEXT: vpextrw $3, %xmm1, %eax +; AVX-NEXT: vpextrw $3, %xmm0, %edi +; AVX-NEXT: xorl %ecx, %ecx +; AVX-NEXT: movl %edi, %edx +; AVX-NEXT: subw %ax, %dx +; AVX-NEXT: setns %cl +; AVX-NEXT: addl $32767, %ecx # imm = 0x7FFF +; AVX-NEXT: subw %ax, %di +; AVX-NEXT: cmovol %ecx, %edi +; AVX-NEXT: vpextrw $2, %xmm1, %ecx +; AVX-NEXT: vpextrw $2, %xmm0, %eax +; AVX-NEXT: xorl %edx, %edx +; AVX-NEXT: movl %eax, %esi +; AVX-NEXT: subw %cx, %si +; AVX-NEXT: setns %dl +; AVX-NEXT: addl $32767, %edx # imm = 0x7FFF +; AVX-NEXT: subw %cx, %ax +; AVX-NEXT: cmovol %edx, %eax +; AVX-NEXT: vmovd %xmm1, %ecx +; AVX-NEXT: vmovd %xmm0, %edx +; AVX-NEXT: xorl %esi, %esi +; AVX-NEXT: movl %edx, %ebx +; AVX-NEXT: subw %cx, %bx +; AVX-NEXT: setns %sil +; AVX-NEXT: addl $32767, %esi # imm = 0x7FFF +; AVX-NEXT: subw %cx, %dx +; AVX-NEXT: cmovol %esi, %edx +; AVX-NEXT: vpextrw $1, %xmm1, %ecx +; AVX-NEXT: vpextrw $1, %xmm0, %esi +; AVX-NEXT: xorl %ebx, %ebx +; AVX-NEXT: movl %esi, %ebp 
+; AVX-NEXT: subw %cx, %bp +; AVX-NEXT: setns %bl +; AVX-NEXT: addl $32767, %ebx # imm = 0x7FFF +; AVX-NEXT: subw %cx, %si +; AVX-NEXT: cmovol %ebx, %esi +; AVX-NEXT: vmovd %edx, %xmm0 +; AVX-NEXT: vpinsrw $1, %esi, %xmm0, %xmm0 +; AVX-NEXT: vpinsrw $2, %eax, %xmm0, %xmm0 +; AVX-NEXT: vpinsrw $3, %edi, %xmm0, %xmm0 +; AVX-NEXT: vpinsrw $4, %r11d, %xmm0, %xmm0 +; AVX-NEXT: vpinsrw $5, %r10d, %xmm0, %xmm0 +; AVX-NEXT: vpinsrw $6, %r9d, %xmm0, %xmm0 +; AVX-NEXT: vpinsrw $7, %r8d, %xmm0, %xmm0 +; AVX-NEXT: popq %rbx +; AVX-NEXT: popq %rbp +; AVX-NEXT: retq + %z = call <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16> %x, <8 x i16> %y) + ret <8 x i16> %z +} + +define <16 x i16> @v16i16(<16 x i16> %x, <16 x i16> %y) nounwind { +; SSE2-LABEL: v16i16: +; SSE2: # %bb.0: +; SSE2-NEXT: pushq %rbp +; SSE2-NEXT: pushq %r15 +; SSE2-NEXT: pushq %r14 +; SSE2-NEXT: pushq %r13 +; SSE2-NEXT: pushq %r12 +; SSE2-NEXT: pushq %rbx +; SSE2-NEXT: movd %xmm3, %eax +; SSE2-NEXT: movd %xmm1, %esi +; SSE2-NEXT: xorl %ecx, %ecx +; SSE2-NEXT: movl %esi, %edx +; SSE2-NEXT: subw %ax, %dx +; SSE2-NEXT: setns %cl +; SSE2-NEXT: addl $32767, %ecx # imm = 0x7FFF +; SSE2-NEXT: subw %ax, %si +; SSE2-NEXT: cmovol %ecx, %esi +; SSE2-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE2-NEXT: pextrw $1, %xmm3, %eax +; SSE2-NEXT: pextrw $1, %xmm1, %esi +; SSE2-NEXT: xorl %ecx, %ecx +; SSE2-NEXT: movl %esi, %edx +; SSE2-NEXT: subw %ax, %dx +; SSE2-NEXT: setns %cl +; SSE2-NEXT: addl $32767, %ecx # imm = 0x7FFF +; SSE2-NEXT: subw %ax, %si +; SSE2-NEXT: cmovol %ecx, %esi +; SSE2-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE2-NEXT: pextrw $2, %xmm3, %eax +; SSE2-NEXT: pextrw $2, %xmm1, %esi +; SSE2-NEXT: xorl %ecx, %ecx +; SSE2-NEXT: movl %esi, %edx +; SSE2-NEXT: subw %ax, %dx +; SSE2-NEXT: setns %cl +; SSE2-NEXT: addl $32767, %ecx # imm = 0x7FFF +; SSE2-NEXT: subw %ax, %si +; SSE2-NEXT: cmovol %ecx, %esi +; SSE2-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE2-NEXT: pextrw $3, %xmm3, %eax +; SSE2-NEXT: pextrw $3, %xmm1, %esi +; SSE2-NEXT: xorl %ecx, %ecx +; SSE2-NEXT: movl %esi, %edx +; SSE2-NEXT: subw %ax, %dx +; SSE2-NEXT: setns %cl +; SSE2-NEXT: addl $32767, %ecx # imm = 0x7FFF +; SSE2-NEXT: subw %ax, %si +; SSE2-NEXT: cmovol %ecx, %esi +; SSE2-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE2-NEXT: pextrw $4, %xmm3, %eax +; SSE2-NEXT: pextrw $4, %xmm1, %r14d +; SSE2-NEXT: xorl %ecx, %ecx +; SSE2-NEXT: movl %r14d, %edx +; SSE2-NEXT: subw %ax, %dx +; SSE2-NEXT: setns %cl +; SSE2-NEXT: addl $32767, %ecx # imm = 0x7FFF +; SSE2-NEXT: subw %ax, %r14w +; SSE2-NEXT: cmovol %ecx, %r14d +; SSE2-NEXT: pextrw $5, %xmm3, %eax +; SSE2-NEXT: pextrw $5, %xmm1, %r15d +; SSE2-NEXT: xorl %ecx, %ecx +; SSE2-NEXT: movl %r15d, %edx +; SSE2-NEXT: subw %ax, %dx +; SSE2-NEXT: setns %cl +; SSE2-NEXT: addl $32767, %ecx # imm = 0x7FFF +; SSE2-NEXT: subw %ax, %r15w +; SSE2-NEXT: cmovol %ecx, %r15d +; SSE2-NEXT: pextrw $6, %xmm3, %eax +; SSE2-NEXT: pextrw $6, %xmm1, %r12d +; SSE2-NEXT: xorl %ecx, %ecx +; SSE2-NEXT: movl %r12d, %edx +; SSE2-NEXT: subw %ax, %dx +; SSE2-NEXT: setns %cl +; SSE2-NEXT: addl $32767, %ecx # imm = 0x7FFF +; SSE2-NEXT: subw %ax, %r12w +; SSE2-NEXT: cmovol %ecx, %r12d +; SSE2-NEXT: pextrw $7, %xmm3, %eax +; SSE2-NEXT: pextrw $7, %xmm1, %r13d +; SSE2-NEXT: xorl %ecx, %ecx +; SSE2-NEXT: movl %r13d, %edx +; SSE2-NEXT: subw %ax, %dx +; SSE2-NEXT: setns %cl +; SSE2-NEXT: addl $32767, %ecx # imm = 0x7FFF +; SSE2-NEXT: subw %ax, %r13w +; SSE2-NEXT: cmovol %ecx, %r13d +; SSE2-NEXT: movd %xmm2, 
%eax +; SSE2-NEXT: movd %xmm0, %esi +; SSE2-NEXT: xorl %ecx, %ecx +; SSE2-NEXT: movl %esi, %edx +; SSE2-NEXT: subw %ax, %dx +; SSE2-NEXT: setns %cl +; SSE2-NEXT: addl $32767, %ecx # imm = 0x7FFF +; SSE2-NEXT: subw %ax, %si +; SSE2-NEXT: cmovol %ecx, %esi +; SSE2-NEXT: pextrw $1, %xmm2, %eax +; SSE2-NEXT: pextrw $1, %xmm0, %ebx +; SSE2-NEXT: xorl %ecx, %ecx +; SSE2-NEXT: movl %ebx, %edx +; SSE2-NEXT: subw %ax, %dx +; SSE2-NEXT: setns %cl +; SSE2-NEXT: addl $32767, %ecx # imm = 0x7FFF +; SSE2-NEXT: subw %ax, %bx +; SSE2-NEXT: cmovol %ecx, %ebx +; SSE2-NEXT: pextrw $2, %xmm2, %eax +; SSE2-NEXT: pextrw $2, %xmm0, %ebp +; SSE2-NEXT: xorl %ecx, %ecx +; SSE2-NEXT: movl %ebp, %edx +; SSE2-NEXT: subw %ax, %dx +; SSE2-NEXT: setns %cl +; SSE2-NEXT: addl $32767, %ecx # imm = 0x7FFF +; SSE2-NEXT: subw %ax, %bp +; SSE2-NEXT: cmovol %ecx, %ebp +; SSE2-NEXT: pextrw $3, %xmm2, %eax +; SSE2-NEXT: pextrw $3, %xmm0, %edi +; SSE2-NEXT: xorl %ecx, %ecx +; SSE2-NEXT: movl %edi, %edx +; SSE2-NEXT: subw %ax, %dx +; SSE2-NEXT: setns %cl +; SSE2-NEXT: addl $32767, %ecx # imm = 0x7FFF +; SSE2-NEXT: subw %ax, %di +; SSE2-NEXT: cmovol %ecx, %edi +; SSE2-NEXT: pextrw $4, %xmm2, %ecx +; SSE2-NEXT: pextrw $4, %xmm0, %eax +; SSE2-NEXT: xorl %edx, %edx +; SSE2-NEXT: movl %eax, %r8d +; SSE2-NEXT: subw %cx, %r8w +; SSE2-NEXT: setns %dl +; SSE2-NEXT: addl $32767, %edx # imm = 0x7FFF +; SSE2-NEXT: subw %cx, %ax +; SSE2-NEXT: cmovol %edx, %eax +; SSE2-NEXT: pextrw $5, %xmm2, %r8d +; SSE2-NEXT: pextrw $5, %xmm0, %ecx +; SSE2-NEXT: xorl %edx, %edx +; SSE2-NEXT: movl %ecx, %r9d +; SSE2-NEXT: subw %r8w, %r9w +; SSE2-NEXT: setns %dl +; SSE2-NEXT: addl $32767, %edx # imm = 0x7FFF +; SSE2-NEXT: subw %r8w, %cx +; SSE2-NEXT: cmovol %edx, %ecx +; SSE2-NEXT: pextrw $6, %xmm2, %r8d +; SSE2-NEXT: pextrw $6, %xmm0, %r9d +; SSE2-NEXT: xorl %edx, %edx +; SSE2-NEXT: movl %r9d, %r10d +; SSE2-NEXT: subw %r8w, %r10w +; SSE2-NEXT: setns %dl +; SSE2-NEXT: addl $32767, %edx # imm = 0x7FFF +; SSE2-NEXT: subw %r8w, %r9w +; SSE2-NEXT: cmovol %edx, %r9d +; SSE2-NEXT: pextrw $7, %xmm2, %r8d +; SSE2-NEXT: pextrw $7, %xmm0, %edx +; SSE2-NEXT: xorl %r10d, %r10d +; SSE2-NEXT: movl %edx, %r11d +; SSE2-NEXT: subw %r8w, %r11w +; SSE2-NEXT: setns %r10b +; SSE2-NEXT: addl $32767, %r10d # imm = 0x7FFF +; SSE2-NEXT: subw %r8w, %dx +; SSE2-NEXT: cmovol %r10d, %edx +; SSE2-NEXT: movd %edx, %xmm8 +; SSE2-NEXT: movd %r9d, %xmm3 +; SSE2-NEXT: movd %ecx, %xmm9 +; SSE2-NEXT: movd %eax, %xmm4 +; SSE2-NEXT: movd %edi, %xmm10 +; SSE2-NEXT: movd %ebp, %xmm7 +; SSE2-NEXT: movd %ebx, %xmm11 +; SSE2-NEXT: movd %esi, %xmm0 +; SSE2-NEXT: movd %r13d, %xmm12 +; SSE2-NEXT: movd %r12d, %xmm6 +; SSE2-NEXT: movd %r15d, %xmm13 +; SSE2-NEXT: movd %r14d, %xmm5 +; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 4-byte Folded Reload +; SSE2-NEXT: # xmm14 = mem[0],zero,zero,zero +; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 4-byte Folded Reload +; SSE2-NEXT: # xmm2 = mem[0],zero,zero,zero +; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 4-byte Folded Reload +; SSE2-NEXT: # xmm15 = mem[0],zero,zero,zero +; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Folded Reload +; SSE2-NEXT: # xmm1 = mem[0],zero,zero,zero +; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm8[0],xmm3[1],xmm8[1],xmm3[2],xmm8[2],xmm3[3],xmm8[3] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm9[0],xmm4[1],xmm9[1],xmm4[2],xmm9[2],xmm4[3],xmm9[3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm7 = 
xmm7[0],xmm10[0],xmm7[1],xmm10[1],xmm7[2],xmm10[2],xmm7[3],xmm10[3] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm11[0],xmm0[1],xmm11[1],xmm0[2],xmm11[2],xmm0[3],xmm11[3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1] +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm4[0] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm12[0],xmm6[1],xmm12[1],xmm6[2],xmm12[2],xmm6[3],xmm12[3] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm13[0],xmm5[1],xmm13[1],xmm5[2],xmm13[2],xmm5[3],xmm13[3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm14[0],xmm2[1],xmm14[1],xmm2[2],xmm14[2],xmm2[3],xmm14[3] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm15[0],xmm1[1],xmm15[1],xmm1[2],xmm15[2],xmm1[3],xmm15[3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm5[0] +; SSE2-NEXT: popq %rbx +; SSE2-NEXT: popq %r12 +; SSE2-NEXT: popq %r13 +; SSE2-NEXT: popq %r14 +; SSE2-NEXT: popq %r15 +; SSE2-NEXT: popq %rbp +; SSE2-NEXT: retq +; +; SSSE3-LABEL: v16i16: +; SSSE3: # %bb.0: +; SSSE3-NEXT: pushq %rbp +; SSSE3-NEXT: pushq %r15 +; SSSE3-NEXT: pushq %r14 +; SSSE3-NEXT: pushq %r13 +; SSSE3-NEXT: pushq %r12 +; SSSE3-NEXT: pushq %rbx +; SSSE3-NEXT: movd %xmm3, %eax +; SSSE3-NEXT: movd %xmm1, %esi +; SSSE3-NEXT: xorl %ecx, %ecx +; SSSE3-NEXT: movl %esi, %edx +; SSSE3-NEXT: subw %ax, %dx +; SSSE3-NEXT: setns %cl +; SSSE3-NEXT: addl $32767, %ecx # imm = 0x7FFF +; SSSE3-NEXT: subw %ax, %si +; SSSE3-NEXT: cmovol %ecx, %esi +; SSSE3-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSSE3-NEXT: pextrw $1, %xmm3, %eax +; SSSE3-NEXT: pextrw $1, %xmm1, %esi +; SSSE3-NEXT: xorl %ecx, %ecx +; SSSE3-NEXT: movl %esi, %edx +; SSSE3-NEXT: subw %ax, %dx +; SSSE3-NEXT: setns %cl +; SSSE3-NEXT: addl $32767, %ecx # imm = 0x7FFF +; SSSE3-NEXT: subw %ax, %si +; SSSE3-NEXT: cmovol %ecx, %esi +; SSSE3-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSSE3-NEXT: pextrw $2, %xmm3, %eax +; SSSE3-NEXT: pextrw $2, %xmm1, %esi +; SSSE3-NEXT: xorl %ecx, %ecx +; SSSE3-NEXT: movl %esi, %edx +; SSSE3-NEXT: subw %ax, %dx +; SSSE3-NEXT: setns %cl +; SSSE3-NEXT: addl $32767, %ecx # imm = 0x7FFF +; SSSE3-NEXT: subw %ax, %si +; SSSE3-NEXT: cmovol %ecx, %esi +; SSSE3-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSSE3-NEXT: pextrw $3, %xmm3, %eax +; SSSE3-NEXT: pextrw $3, %xmm1, %esi +; SSSE3-NEXT: xorl %ecx, %ecx +; SSSE3-NEXT: movl %esi, %edx +; SSSE3-NEXT: subw %ax, %dx +; SSSE3-NEXT: setns %cl +; SSSE3-NEXT: addl $32767, %ecx # imm = 0x7FFF +; SSSE3-NEXT: subw %ax, %si +; SSSE3-NEXT: cmovol %ecx, %esi +; SSSE3-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSSE3-NEXT: pextrw $4, %xmm3, %eax +; SSSE3-NEXT: pextrw $4, %xmm1, %r14d +; SSSE3-NEXT: xorl %ecx, %ecx +; SSSE3-NEXT: movl %r14d, %edx +; SSSE3-NEXT: subw %ax, %dx +; SSSE3-NEXT: setns %cl +; SSSE3-NEXT: addl $32767, %ecx # imm = 0x7FFF +; SSSE3-NEXT: subw %ax, %r14w +; SSSE3-NEXT: cmovol %ecx, %r14d +; SSSE3-NEXT: pextrw $5, %xmm3, %eax +; SSSE3-NEXT: pextrw $5, %xmm1, %r15d +; SSSE3-NEXT: xorl %ecx, %ecx +; SSSE3-NEXT: movl %r15d, %edx +; SSSE3-NEXT: subw %ax, %dx +; SSSE3-NEXT: setns %cl +; SSSE3-NEXT: addl $32767, %ecx # imm = 0x7FFF +; SSSE3-NEXT: subw %ax, %r15w +; SSSE3-NEXT: cmovol %ecx, %r15d +; SSSE3-NEXT: pextrw $6, %xmm3, %eax +; SSSE3-NEXT: pextrw $6, %xmm1, %r12d +; SSSE3-NEXT: xorl %ecx, %ecx +; SSSE3-NEXT: movl %r12d, %edx +; SSSE3-NEXT: subw %ax, %dx +; 
SSSE3-NEXT: setns %cl +; SSSE3-NEXT: addl $32767, %ecx # imm = 0x7FFF +; SSSE3-NEXT: subw %ax, %r12w +; SSSE3-NEXT: cmovol %ecx, %r12d +; SSSE3-NEXT: pextrw $7, %xmm3, %eax +; SSSE3-NEXT: pextrw $7, %xmm1, %r13d +; SSSE3-NEXT: xorl %ecx, %ecx +; SSSE3-NEXT: movl %r13d, %edx +; SSSE3-NEXT: subw %ax, %dx +; SSSE3-NEXT: setns %cl +; SSSE3-NEXT: addl $32767, %ecx # imm = 0x7FFF +; SSSE3-NEXT: subw %ax, %r13w +; SSSE3-NEXT: cmovol %ecx, %r13d +; SSSE3-NEXT: movd %xmm2, %eax +; SSSE3-NEXT: movd %xmm0, %esi +; SSSE3-NEXT: xorl %ecx, %ecx +; SSSE3-NEXT: movl %esi, %edx +; SSSE3-NEXT: subw %ax, %dx +; SSSE3-NEXT: setns %cl +; SSSE3-NEXT: addl $32767, %ecx # imm = 0x7FFF +; SSSE3-NEXT: subw %ax, %si +; SSSE3-NEXT: cmovol %ecx, %esi +; SSSE3-NEXT: pextrw $1, %xmm2, %eax +; SSSE3-NEXT: pextrw $1, %xmm0, %ebx +; SSSE3-NEXT: xorl %ecx, %ecx +; SSSE3-NEXT: movl %ebx, %edx +; SSSE3-NEXT: subw %ax, %dx +; SSSE3-NEXT: setns %cl +; SSSE3-NEXT: addl $32767, %ecx # imm = 0x7FFF +; SSSE3-NEXT: subw %ax, %bx +; SSSE3-NEXT: cmovol %ecx, %ebx +; SSSE3-NEXT: pextrw $2, %xmm2, %eax +; SSSE3-NEXT: pextrw $2, %xmm0, %ebp +; SSSE3-NEXT: xorl %ecx, %ecx +; SSSE3-NEXT: movl %ebp, %edx +; SSSE3-NEXT: subw %ax, %dx +; SSSE3-NEXT: setns %cl +; SSSE3-NEXT: addl $32767, %ecx # imm = 0x7FFF +; SSSE3-NEXT: subw %ax, %bp +; SSSE3-NEXT: cmovol %ecx, %ebp +; SSSE3-NEXT: pextrw $3, %xmm2, %eax +; SSSE3-NEXT: pextrw $3, %xmm0, %edi +; SSSE3-NEXT: xorl %ecx, %ecx +; SSSE3-NEXT: movl %edi, %edx +; SSSE3-NEXT: subw %ax, %dx +; SSSE3-NEXT: setns %cl +; SSSE3-NEXT: addl $32767, %ecx # imm = 0x7FFF +; SSSE3-NEXT: subw %ax, %di +; SSSE3-NEXT: cmovol %ecx, %edi +; SSSE3-NEXT: pextrw $4, %xmm2, %ecx +; SSSE3-NEXT: pextrw $4, %xmm0, %eax +; SSSE3-NEXT: xorl %edx, %edx +; SSSE3-NEXT: movl %eax, %r8d +; SSSE3-NEXT: subw %cx, %r8w +; SSSE3-NEXT: setns %dl +; SSSE3-NEXT: addl $32767, %edx # imm = 0x7FFF +; SSSE3-NEXT: subw %cx, %ax +; SSSE3-NEXT: cmovol %edx, %eax +; SSSE3-NEXT: pextrw $5, %xmm2, %r8d +; SSSE3-NEXT: pextrw $5, %xmm0, %ecx +; SSSE3-NEXT: xorl %edx, %edx +; SSSE3-NEXT: movl %ecx, %r9d +; SSSE3-NEXT: subw %r8w, %r9w +; SSSE3-NEXT: setns %dl +; SSSE3-NEXT: addl $32767, %edx # imm = 0x7FFF +; SSSE3-NEXT: subw %r8w, %cx +; SSSE3-NEXT: cmovol %edx, %ecx +; SSSE3-NEXT: pextrw $6, %xmm2, %r8d +; SSSE3-NEXT: pextrw $6, %xmm0, %r9d +; SSSE3-NEXT: xorl %edx, %edx +; SSSE3-NEXT: movl %r9d, %r10d +; SSSE3-NEXT: subw %r8w, %r10w +; SSSE3-NEXT: setns %dl +; SSSE3-NEXT: addl $32767, %edx # imm = 0x7FFF +; SSSE3-NEXT: subw %r8w, %r9w +; SSSE3-NEXT: cmovol %edx, %r9d +; SSSE3-NEXT: pextrw $7, %xmm2, %r8d +; SSSE3-NEXT: pextrw $7, %xmm0, %edx +; SSSE3-NEXT: xorl %r10d, %r10d +; SSSE3-NEXT: movl %edx, %r11d +; SSSE3-NEXT: subw %r8w, %r11w +; SSSE3-NEXT: setns %r10b +; SSSE3-NEXT: addl $32767, %r10d # imm = 0x7FFF +; SSSE3-NEXT: subw %r8w, %dx +; SSSE3-NEXT: cmovol %r10d, %edx +; SSSE3-NEXT: movd %edx, %xmm8 +; SSSE3-NEXT: movd %r9d, %xmm3 +; SSSE3-NEXT: movd %ecx, %xmm9 +; SSSE3-NEXT: movd %eax, %xmm4 +; SSSE3-NEXT: movd %edi, %xmm10 +; SSSE3-NEXT: movd %ebp, %xmm7 +; SSSE3-NEXT: movd %ebx, %xmm11 +; SSSE3-NEXT: movd %esi, %xmm0 +; SSSE3-NEXT: movd %r13d, %xmm12 +; SSSE3-NEXT: movd %r12d, %xmm6 +; SSSE3-NEXT: movd %r15d, %xmm13 +; SSSE3-NEXT: movd %r14d, %xmm5 +; SSSE3-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 4-byte Folded Reload +; SSSE3-NEXT: # xmm14 = mem[0],zero,zero,zero +; SSSE3-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 4-byte Folded Reload +; SSSE3-NEXT: # xmm2 = mem[0],zero,zero,zero +; SSSE3-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), 
%xmm15 # 4-byte Folded Reload +; SSSE3-NEXT: # xmm15 = mem[0],zero,zero,zero +; SSSE3-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Folded Reload +; SSSE3-NEXT: # xmm1 = mem[0],zero,zero,zero +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm8[0],xmm3[1],xmm8[1],xmm3[2],xmm8[2],xmm3[3],xmm8[3] +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm9[0],xmm4[1],xmm9[1],xmm4[2],xmm9[2],xmm4[3],xmm9[3] +; SSSE3-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm10[0],xmm7[1],xmm10[1],xmm7[2],xmm10[2],xmm7[3],xmm10[3] +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm11[0],xmm0[1],xmm11[1],xmm0[2],xmm11[2],xmm0[3],xmm11[3] +; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1] +; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm4[0] +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm12[0],xmm6[1],xmm12[1],xmm6[2],xmm12[2],xmm6[3],xmm12[3] +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm13[0],xmm5[1],xmm13[1],xmm5[2],xmm13[2],xmm5[3],xmm13[3] +; SSSE3-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1] +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm14[0],xmm2[1],xmm14[1],xmm2[2],xmm14[2],xmm2[3],xmm14[3] +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm15[0],xmm1[1],xmm15[1],xmm1[2],xmm15[2],xmm1[3],xmm15[3] +; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm5[0] +; SSSE3-NEXT: popq %rbx +; SSSE3-NEXT: popq %r12 +; SSSE3-NEXT: popq %r13 +; SSSE3-NEXT: popq %r14 +; SSSE3-NEXT: popq %r15 +; SSSE3-NEXT: popq %rbp +; SSSE3-NEXT: retq +; +; SSE41-LABEL: v16i16: +; SSE41: # %bb.0: +; SSE41-NEXT: pushq %rbp +; SSE41-NEXT: pushq %r15 +; SSE41-NEXT: pushq %r14 +; SSE41-NEXT: pushq %r13 +; SSE41-NEXT: pushq %r12 +; SSE41-NEXT: pushq %rbx +; SSE41-NEXT: pextrw $7, %xmm3, %eax +; SSE41-NEXT: pextrw $7, %xmm1, %esi +; SSE41-NEXT: xorl %ecx, %ecx +; SSE41-NEXT: movl %esi, %edx +; SSE41-NEXT: subw %ax, %dx +; SSE41-NEXT: setns %cl +; SSE41-NEXT: addl $32767, %ecx # imm = 0x7FFF +; SSE41-NEXT: subw %ax, %si +; SSE41-NEXT: cmovol %ecx, %esi +; SSE41-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE41-NEXT: pextrw $6, %xmm3, %eax +; SSE41-NEXT: pextrw $6, %xmm1, %esi +; SSE41-NEXT: xorl %ecx, %ecx +; SSE41-NEXT: movl %esi, %edx +; SSE41-NEXT: subw %ax, %dx +; SSE41-NEXT: setns %cl +; SSE41-NEXT: addl $32767, %ecx # imm = 0x7FFF +; SSE41-NEXT: subw %ax, %si +; SSE41-NEXT: cmovol %ecx, %esi +; SSE41-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE41-NEXT: pextrw $5, %xmm3, %eax +; SSE41-NEXT: pextrw $5, %xmm1, %esi +; SSE41-NEXT: xorl %ecx, %ecx +; SSE41-NEXT: movl %esi, %edx +; SSE41-NEXT: subw %ax, %dx +; SSE41-NEXT: setns %cl +; SSE41-NEXT: addl $32767, %ecx # imm = 0x7FFF +; SSE41-NEXT: subw %ax, %si +; SSE41-NEXT: cmovol %ecx, %esi +; SSE41-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE41-NEXT: pextrw $4, %xmm3, %eax +; SSE41-NEXT: pextrw $4, %xmm1, %esi +; SSE41-NEXT: xorl %ecx, %ecx +; SSE41-NEXT: movl %esi, %edx +; SSE41-NEXT: subw %ax, %dx +; SSE41-NEXT: setns %cl +; SSE41-NEXT: addl $32767, %ecx # imm = 0x7FFF +; SSE41-NEXT: subw %ax, %si +; SSE41-NEXT: cmovol %ecx, %esi +; SSE41-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE41-NEXT: pextrw $3, %xmm3, %eax +; SSE41-NEXT: pextrw $3, %xmm1, %r14d +; SSE41-NEXT: xorl %ecx, %ecx +; SSE41-NEXT: movl %r14d, %edx +; SSE41-NEXT: subw %ax, %dx +; SSE41-NEXT: setns %cl +; SSE41-NEXT: addl $32767, %ecx # imm = 
0x7FFF +; SSE41-NEXT: subw %ax, %r14w +; SSE41-NEXT: cmovol %ecx, %r14d +; SSE41-NEXT: pextrw $2, %xmm3, %eax +; SSE41-NEXT: pextrw $2, %xmm1, %r15d +; SSE41-NEXT: xorl %ecx, %ecx +; SSE41-NEXT: movl %r15d, %edx +; SSE41-NEXT: subw %ax, %dx +; SSE41-NEXT: setns %cl +; SSE41-NEXT: addl $32767, %ecx # imm = 0x7FFF +; SSE41-NEXT: subw %ax, %r15w +; SSE41-NEXT: cmovol %ecx, %r15d +; SSE41-NEXT: movd %xmm3, %eax +; SSE41-NEXT: movd %xmm1, %r12d +; SSE41-NEXT: xorl %ecx, %ecx +; SSE41-NEXT: movl %r12d, %edx +; SSE41-NEXT: subw %ax, %dx +; SSE41-NEXT: setns %cl +; SSE41-NEXT: addl $32767, %ecx # imm = 0x7FFF +; SSE41-NEXT: subw %ax, %r12w +; SSE41-NEXT: cmovol %ecx, %r12d +; SSE41-NEXT: pextrw $1, %xmm3, %eax +; SSE41-NEXT: pextrw $1, %xmm1, %r13d +; SSE41-NEXT: xorl %ecx, %ecx +; SSE41-NEXT: movl %r13d, %esi +; SSE41-NEXT: subw %ax, %si +; SSE41-NEXT: setns %cl +; SSE41-NEXT: addl $32767, %ecx # imm = 0x7FFF +; SSE41-NEXT: subw %ax, %r13w +; SSE41-NEXT: cmovol %ecx, %r13d +; SSE41-NEXT: pextrw $7, %xmm2, %eax +; SSE41-NEXT: pextrw $7, %xmm0, %esi +; SSE41-NEXT: xorl %ecx, %ecx +; SSE41-NEXT: movl %esi, %edi +; SSE41-NEXT: subw %ax, %di +; SSE41-NEXT: setns %cl +; SSE41-NEXT: addl $32767, %ecx # imm = 0x7FFF +; SSE41-NEXT: subw %ax, %si +; SSE41-NEXT: cmovol %ecx, %esi +; SSE41-NEXT: pextrw $6, %xmm2, %eax +; SSE41-NEXT: pextrw $6, %xmm0, %ebx +; SSE41-NEXT: xorl %ecx, %ecx +; SSE41-NEXT: movl %ebx, %edi +; SSE41-NEXT: subw %ax, %di +; SSE41-NEXT: setns %cl +; SSE41-NEXT: addl $32767, %ecx # imm = 0x7FFF +; SSE41-NEXT: subw %ax, %bx +; SSE41-NEXT: cmovol %ecx, %ebx +; SSE41-NEXT: pextrw $5, %xmm2, %eax +; SSE41-NEXT: pextrw $5, %xmm0, %ebp +; SSE41-NEXT: xorl %ecx, %ecx +; SSE41-NEXT: movl %ebp, %edi +; SSE41-NEXT: subw %ax, %di +; SSE41-NEXT: setns %cl +; SSE41-NEXT: addl $32767, %ecx # imm = 0x7FFF +; SSE41-NEXT: subw %ax, %bp +; SSE41-NEXT: cmovol %ecx, %ebp +; SSE41-NEXT: pextrw $4, %xmm2, %eax +; SSE41-NEXT: pextrw $4, %xmm0, %edi +; SSE41-NEXT: xorl %ecx, %ecx +; SSE41-NEXT: movl %edi, %edx +; SSE41-NEXT: subw %ax, %dx +; SSE41-NEXT: setns %cl +; SSE41-NEXT: addl $32767, %ecx # imm = 0x7FFF +; SSE41-NEXT: subw %ax, %di +; SSE41-NEXT: cmovol %ecx, %edi +; SSE41-NEXT: pextrw $3, %xmm2, %ecx +; SSE41-NEXT: pextrw $3, %xmm0, %eax +; SSE41-NEXT: xorl %edx, %edx +; SSE41-NEXT: movl %eax, %r8d +; SSE41-NEXT: subw %cx, %r8w +; SSE41-NEXT: setns %dl +; SSE41-NEXT: addl $32767, %edx # imm = 0x7FFF +; SSE41-NEXT: subw %cx, %ax +; SSE41-NEXT: cmovol %edx, %eax +; SSE41-NEXT: pextrw $2, %xmm2, %r8d +; SSE41-NEXT: pextrw $2, %xmm0, %ecx +; SSE41-NEXT: xorl %edx, %edx +; SSE41-NEXT: movl %ecx, %r9d +; SSE41-NEXT: subw %r8w, %r9w +; SSE41-NEXT: setns %dl +; SSE41-NEXT: addl $32767, %edx # imm = 0x7FFF +; SSE41-NEXT: subw %r8w, %cx +; SSE41-NEXT: cmovol %edx, %ecx +; SSE41-NEXT: movd %xmm2, %r8d +; SSE41-NEXT: movd %xmm0, %r9d +; SSE41-NEXT: xorl %edx, %edx +; SSE41-NEXT: movl %r9d, %r10d +; SSE41-NEXT: subw %r8w, %r10w +; SSE41-NEXT: setns %dl +; SSE41-NEXT: addl $32767, %edx # imm = 0x7FFF +; SSE41-NEXT: subw %r8w, %r9w +; SSE41-NEXT: cmovol %edx, %r9d +; SSE41-NEXT: pextrw $1, %xmm2, %r8d +; SSE41-NEXT: pextrw $1, %xmm0, %edx +; SSE41-NEXT: xorl %r10d, %r10d +; SSE41-NEXT: movl %edx, %r11d +; SSE41-NEXT: subw %r8w, %r11w +; SSE41-NEXT: setns %r10b +; SSE41-NEXT: addl $32767, %r10d # imm = 0x7FFF +; SSE41-NEXT: subw %r8w, %dx +; SSE41-NEXT: cmovol %r10d, %edx +; SSE41-NEXT: movd %r9d, %xmm0 +; SSE41-NEXT: pinsrw $1, %edx, %xmm0 +; SSE41-NEXT: pinsrw $2, %ecx, %xmm0 +; SSE41-NEXT: pinsrw $3, %eax, %xmm0 +; 
SSE41-NEXT: pinsrw $4, %edi, %xmm0 +; SSE41-NEXT: pinsrw $5, %ebp, %xmm0 +; SSE41-NEXT: pinsrw $6, %ebx, %xmm0 +; SSE41-NEXT: pinsrw $7, %esi, %xmm0 +; SSE41-NEXT: movd %r12d, %xmm1 +; SSE41-NEXT: pinsrw $1, %r13d, %xmm1 +; SSE41-NEXT: pinsrw $2, %r15d, %xmm1 +; SSE41-NEXT: pinsrw $3, %r14d, %xmm1 +; SSE41-NEXT: pinsrw $4, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Folded Reload +; SSE41-NEXT: pinsrw $5, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Folded Reload +; SSE41-NEXT: pinsrw $6, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Folded Reload +; SSE41-NEXT: pinsrw $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Folded Reload +; SSE41-NEXT: popq %rbx +; SSE41-NEXT: popq %r12 +; SSE41-NEXT: popq %r13 +; SSE41-NEXT: popq %r14 +; SSE41-NEXT: popq %r15 +; SSE41-NEXT: popq %rbp +; SSE41-NEXT: retq +; +; AVX1-LABEL: v16i16: +; AVX1: # %bb.0: +; AVX1-NEXT: pushq %rbp +; AVX1-NEXT: pushq %r15 +; AVX1-NEXT: pushq %r14 +; AVX1-NEXT: pushq %r13 +; AVX1-NEXT: pushq %r12 +; AVX1-NEXT: pushq %rbx +; AVX1-NEXT: vpextrw $7, %xmm1, %eax +; AVX1-NEXT: vpextrw $7, %xmm0, %esi +; AVX1-NEXT: xorl %ecx, %ecx +; AVX1-NEXT: movl %esi, %edx +; AVX1-NEXT: subw %ax, %dx +; AVX1-NEXT: setns %cl +; AVX1-NEXT: addl $32767, %ecx # imm = 0x7FFF +; AVX1-NEXT: subw %ax, %si +; AVX1-NEXT: cmovol %ecx, %esi +; AVX1-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX1-NEXT: vpextrw $6, %xmm1, %eax +; AVX1-NEXT: vpextrw $6, %xmm0, %esi +; AVX1-NEXT: xorl %ecx, %ecx +; AVX1-NEXT: movl %esi, %edx +; AVX1-NEXT: subw %ax, %dx +; AVX1-NEXT: setns %cl +; AVX1-NEXT: addl $32767, %ecx # imm = 0x7FFF +; AVX1-NEXT: subw %ax, %si +; AVX1-NEXT: cmovol %ecx, %esi +; AVX1-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX1-NEXT: vpextrw $5, %xmm1, %eax +; AVX1-NEXT: vpextrw $5, %xmm0, %esi +; AVX1-NEXT: xorl %ecx, %ecx +; AVX1-NEXT: movl %esi, %edx +; AVX1-NEXT: subw %ax, %dx +; AVX1-NEXT: setns %cl +; AVX1-NEXT: addl $32767, %ecx # imm = 0x7FFF +; AVX1-NEXT: subw %ax, %si +; AVX1-NEXT: cmovol %ecx, %esi +; AVX1-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX1-NEXT: vpextrw $4, %xmm1, %eax +; AVX1-NEXT: vpextrw $4, %xmm0, %esi +; AVX1-NEXT: xorl %ecx, %ecx +; AVX1-NEXT: movl %esi, %edx +; AVX1-NEXT: subw %ax, %dx +; AVX1-NEXT: setns %cl +; AVX1-NEXT: addl $32767, %ecx # imm = 0x7FFF +; AVX1-NEXT: subw %ax, %si +; AVX1-NEXT: cmovol %ecx, %esi +; AVX1-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX1-NEXT: vpextrw $3, %xmm1, %eax +; AVX1-NEXT: vpextrw $3, %xmm0, %r14d +; AVX1-NEXT: xorl %ecx, %ecx +; AVX1-NEXT: movl %r14d, %edx +; AVX1-NEXT: subw %ax, %dx +; AVX1-NEXT: setns %cl +; AVX1-NEXT: addl $32767, %ecx # imm = 0x7FFF +; AVX1-NEXT: subw %ax, %r14w +; AVX1-NEXT: cmovol %ecx, %r14d +; AVX1-NEXT: vpextrw $2, %xmm1, %eax +; AVX1-NEXT: vpextrw $2, %xmm0, %r15d +; AVX1-NEXT: xorl %ecx, %ecx +; AVX1-NEXT: movl %r15d, %edx +; AVX1-NEXT: subw %ax, %dx +; AVX1-NEXT: setns %cl +; AVX1-NEXT: addl $32767, %ecx # imm = 0x7FFF +; AVX1-NEXT: subw %ax, %r15w +; AVX1-NEXT: cmovol %ecx, %r15d +; AVX1-NEXT: vmovd %xmm1, %eax +; AVX1-NEXT: vmovd %xmm0, %r12d +; AVX1-NEXT: xorl %ecx, %ecx +; AVX1-NEXT: movl %r12d, %edx +; AVX1-NEXT: subw %ax, %dx +; AVX1-NEXT: setns %cl +; AVX1-NEXT: addl $32767, %ecx # imm = 0x7FFF +; AVX1-NEXT: subw %ax, %r12w +; AVX1-NEXT: cmovol %ecx, %r12d +; AVX1-NEXT: vpextrw $1, %xmm1, %eax +; AVX1-NEXT: vpextrw $1, %xmm0, %r13d +; AVX1-NEXT: xorl %ecx, %ecx +; AVX1-NEXT: movl %r13d, %esi +; AVX1-NEXT: subw %ax, %si +; AVX1-NEXT: setns %cl +; AVX1-NEXT: addl $32767, %ecx # imm = 0x7FFF +; 
AVX1-NEXT: subw %ax, %r13w +; AVX1-NEXT: cmovol %ecx, %r13d +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 +; AVX1-NEXT: vpextrw $7, %xmm1, %eax +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX1-NEXT: vpextrw $7, %xmm0, %esi +; AVX1-NEXT: xorl %ecx, %ecx +; AVX1-NEXT: movl %esi, %edi +; AVX1-NEXT: subw %ax, %di +; AVX1-NEXT: setns %cl +; AVX1-NEXT: addl $32767, %ecx # imm = 0x7FFF +; AVX1-NEXT: subw %ax, %si +; AVX1-NEXT: cmovol %ecx, %esi +; AVX1-NEXT: vpextrw $6, %xmm1, %eax +; AVX1-NEXT: vpextrw $6, %xmm0, %ebx +; AVX1-NEXT: xorl %ecx, %ecx +; AVX1-NEXT: movl %ebx, %edi +; AVX1-NEXT: subw %ax, %di +; AVX1-NEXT: setns %cl +; AVX1-NEXT: addl $32767, %ecx # imm = 0x7FFF +; AVX1-NEXT: subw %ax, %bx +; AVX1-NEXT: cmovol %ecx, %ebx +; AVX1-NEXT: vpextrw $5, %xmm1, %eax +; AVX1-NEXT: vpextrw $5, %xmm0, %ebp +; AVX1-NEXT: xorl %ecx, %ecx +; AVX1-NEXT: movl %ebp, %edi +; AVX1-NEXT: subw %ax, %di +; AVX1-NEXT: setns %cl +; AVX1-NEXT: addl $32767, %ecx # imm = 0x7FFF +; AVX1-NEXT: subw %ax, %bp +; AVX1-NEXT: cmovol %ecx, %ebp +; AVX1-NEXT: vpextrw $4, %xmm1, %eax +; AVX1-NEXT: vpextrw $4, %xmm0, %edi +; AVX1-NEXT: xorl %ecx, %ecx +; AVX1-NEXT: movl %edi, %edx +; AVX1-NEXT: subw %ax, %dx +; AVX1-NEXT: setns %cl +; AVX1-NEXT: addl $32767, %ecx # imm = 0x7FFF +; AVX1-NEXT: subw %ax, %di +; AVX1-NEXT: cmovol %ecx, %edi +; AVX1-NEXT: vpextrw $3, %xmm1, %ecx +; AVX1-NEXT: vpextrw $3, %xmm0, %eax +; AVX1-NEXT: xorl %edx, %edx +; AVX1-NEXT: movl %eax, %r8d +; AVX1-NEXT: subw %cx, %r8w +; AVX1-NEXT: setns %dl +; AVX1-NEXT: addl $32767, %edx # imm = 0x7FFF +; AVX1-NEXT: subw %cx, %ax +; AVX1-NEXT: cmovol %edx, %eax +; AVX1-NEXT: vpextrw $2, %xmm1, %r8d +; AVX1-NEXT: vpextrw $2, %xmm0, %ecx +; AVX1-NEXT: xorl %edx, %edx +; AVX1-NEXT: movl %ecx, %r9d +; AVX1-NEXT: subw %r8w, %r9w +; AVX1-NEXT: setns %dl +; AVX1-NEXT: addl $32767, %edx # imm = 0x7FFF +; AVX1-NEXT: subw %r8w, %cx +; AVX1-NEXT: cmovol %edx, %ecx +; AVX1-NEXT: vmovd %xmm1, %r8d +; AVX1-NEXT: vmovd %xmm0, %r9d +; AVX1-NEXT: xorl %edx, %edx +; AVX1-NEXT: movl %r9d, %r10d +; AVX1-NEXT: subw %r8w, %r10w +; AVX1-NEXT: setns %dl +; AVX1-NEXT: addl $32767, %edx # imm = 0x7FFF +; AVX1-NEXT: subw %r8w, %r9w +; AVX1-NEXT: cmovol %edx, %r9d +; AVX1-NEXT: vpextrw $1, %xmm1, %r8d +; AVX1-NEXT: vpextrw $1, %xmm0, %edx +; AVX1-NEXT: xorl %r10d, %r10d +; AVX1-NEXT: movl %edx, %r11d +; AVX1-NEXT: subw %r8w, %r11w +; AVX1-NEXT: setns %r10b +; AVX1-NEXT: addl $32767, %r10d # imm = 0x7FFF +; AVX1-NEXT: subw %r8w, %dx +; AVX1-NEXT: cmovol %r10d, %edx +; AVX1-NEXT: vmovd %r9d, %xmm0 +; AVX1-NEXT: vpinsrw $1, %edx, %xmm0, %xmm0 +; AVX1-NEXT: vpinsrw $2, %ecx, %xmm0, %xmm0 +; AVX1-NEXT: vpinsrw $3, %eax, %xmm0, %xmm0 +; AVX1-NEXT: vpinsrw $4, %edi, %xmm0, %xmm0 +; AVX1-NEXT: vpinsrw $5, %ebp, %xmm0, %xmm0 +; AVX1-NEXT: vpinsrw $6, %ebx, %xmm0, %xmm0 +; AVX1-NEXT: vpinsrw $7, %esi, %xmm0, %xmm0 +; AVX1-NEXT: vmovd %r12d, %xmm1 +; AVX1-NEXT: vpinsrw $1, %r13d, %xmm1, %xmm1 +; AVX1-NEXT: vpinsrw $2, %r15d, %xmm1, %xmm1 +; AVX1-NEXT: vpinsrw $3, %r14d, %xmm1, %xmm1 +; AVX1-NEXT: vpinsrw $4, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 4-byte Folded Reload +; AVX1-NEXT: vpinsrw $5, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 4-byte Folded Reload +; AVX1-NEXT: vpinsrw $6, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 4-byte Folded Reload +; AVX1-NEXT: vpinsrw $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 4-byte Folded Reload +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: popq %rbx +; AVX1-NEXT: popq %r12 +; AVX1-NEXT: popq %r13 +; AVX1-NEXT: popq %r14 +; AVX1-NEXT: popq 
%r15 +; AVX1-NEXT: popq %rbp +; AVX1-NEXT: retq +; +; AVX2-LABEL: v16i16: +; AVX2: # %bb.0: +; AVX2-NEXT: pushq %rbp +; AVX2-NEXT: pushq %r15 +; AVX2-NEXT: pushq %r14 +; AVX2-NEXT: pushq %r13 +; AVX2-NEXT: pushq %r12 +; AVX2-NEXT: pushq %rbx +; AVX2-NEXT: vpextrw $7, %xmm1, %eax +; AVX2-NEXT: vpextrw $7, %xmm0, %esi +; AVX2-NEXT: xorl %ecx, %ecx +; AVX2-NEXT: movl %esi, %edx +; AVX2-NEXT: subw %ax, %dx +; AVX2-NEXT: setns %cl +; AVX2-NEXT: addl $32767, %ecx # imm = 0x7FFF +; AVX2-NEXT: subw %ax, %si +; AVX2-NEXT: cmovol %ecx, %esi +; AVX2-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX2-NEXT: vpextrw $6, %xmm1, %eax +; AVX2-NEXT: vpextrw $6, %xmm0, %esi +; AVX2-NEXT: xorl %ecx, %ecx +; AVX2-NEXT: movl %esi, %edx +; AVX2-NEXT: subw %ax, %dx +; AVX2-NEXT: setns %cl +; AVX2-NEXT: addl $32767, %ecx # imm = 0x7FFF +; AVX2-NEXT: subw %ax, %si +; AVX2-NEXT: cmovol %ecx, %esi +; AVX2-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX2-NEXT: vpextrw $5, %xmm1, %eax +; AVX2-NEXT: vpextrw $5, %xmm0, %esi +; AVX2-NEXT: xorl %ecx, %ecx +; AVX2-NEXT: movl %esi, %edx +; AVX2-NEXT: subw %ax, %dx +; AVX2-NEXT: setns %cl +; AVX2-NEXT: addl $32767, %ecx # imm = 0x7FFF +; AVX2-NEXT: subw %ax, %si +; AVX2-NEXT: cmovol %ecx, %esi +; AVX2-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX2-NEXT: vpextrw $4, %xmm1, %eax +; AVX2-NEXT: vpextrw $4, %xmm0, %esi +; AVX2-NEXT: xorl %ecx, %ecx +; AVX2-NEXT: movl %esi, %edx +; AVX2-NEXT: subw %ax, %dx +; AVX2-NEXT: setns %cl +; AVX2-NEXT: addl $32767, %ecx # imm = 0x7FFF +; AVX2-NEXT: subw %ax, %si +; AVX2-NEXT: cmovol %ecx, %esi +; AVX2-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX2-NEXT: vpextrw $3, %xmm1, %eax +; AVX2-NEXT: vpextrw $3, %xmm0, %r14d +; AVX2-NEXT: xorl %ecx, %ecx +; AVX2-NEXT: movl %r14d, %edx +; AVX2-NEXT: subw %ax, %dx +; AVX2-NEXT: setns %cl +; AVX2-NEXT: addl $32767, %ecx # imm = 0x7FFF +; AVX2-NEXT: subw %ax, %r14w +; AVX2-NEXT: cmovol %ecx, %r14d +; AVX2-NEXT: vpextrw $2, %xmm1, %eax +; AVX2-NEXT: vpextrw $2, %xmm0, %r15d +; AVX2-NEXT: xorl %ecx, %ecx +; AVX2-NEXT: movl %r15d, %edx +; AVX2-NEXT: subw %ax, %dx +; AVX2-NEXT: setns %cl +; AVX2-NEXT: addl $32767, %ecx # imm = 0x7FFF +; AVX2-NEXT: subw %ax, %r15w +; AVX2-NEXT: cmovol %ecx, %r15d +; AVX2-NEXT: vmovd %xmm1, %eax +; AVX2-NEXT: vmovd %xmm0, %r12d +; AVX2-NEXT: xorl %ecx, %ecx +; AVX2-NEXT: movl %r12d, %edx +; AVX2-NEXT: subw %ax, %dx +; AVX2-NEXT: setns %cl +; AVX2-NEXT: addl $32767, %ecx # imm = 0x7FFF +; AVX2-NEXT: subw %ax, %r12w +; AVX2-NEXT: cmovol %ecx, %r12d +; AVX2-NEXT: vpextrw $1, %xmm1, %eax +; AVX2-NEXT: vpextrw $1, %xmm0, %r13d +; AVX2-NEXT: xorl %ecx, %ecx +; AVX2-NEXT: movl %r13d, %esi +; AVX2-NEXT: subw %ax, %si +; AVX2-NEXT: setns %cl +; AVX2-NEXT: addl $32767, %ecx # imm = 0x7FFF +; AVX2-NEXT: subw %ax, %r13w +; AVX2-NEXT: cmovol %ecx, %r13d +; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm1 +; AVX2-NEXT: vpextrw $7, %xmm1, %eax +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX2-NEXT: vpextrw $7, %xmm0, %esi +; AVX2-NEXT: xorl %ecx, %ecx +; AVX2-NEXT: movl %esi, %edi +; AVX2-NEXT: subw %ax, %di +; AVX2-NEXT: setns %cl +; AVX2-NEXT: addl $32767, %ecx # imm = 0x7FFF +; AVX2-NEXT: subw %ax, %si +; AVX2-NEXT: cmovol %ecx, %esi +; AVX2-NEXT: vpextrw $6, %xmm1, %eax +; AVX2-NEXT: vpextrw $6, %xmm0, %ebx +; AVX2-NEXT: xorl %ecx, %ecx +; AVX2-NEXT: movl %ebx, %edi +; AVX2-NEXT: subw %ax, %di +; AVX2-NEXT: setns %cl +; AVX2-NEXT: addl $32767, %ecx # imm = 0x7FFF +; AVX2-NEXT: subw %ax, %bx +; AVX2-NEXT: cmovol %ecx, %ebx +; 
AVX2-NEXT: vpextrw $5, %xmm1, %eax +; AVX2-NEXT: vpextrw $5, %xmm0, %ebp +; AVX2-NEXT: xorl %ecx, %ecx +; AVX2-NEXT: movl %ebp, %edi +; AVX2-NEXT: subw %ax, %di +; AVX2-NEXT: setns %cl +; AVX2-NEXT: addl $32767, %ecx # imm = 0x7FFF +; AVX2-NEXT: subw %ax, %bp +; AVX2-NEXT: cmovol %ecx, %ebp +; AVX2-NEXT: vpextrw $4, %xmm1, %eax +; AVX2-NEXT: vpextrw $4, %xmm0, %edi +; AVX2-NEXT: xorl %ecx, %ecx +; AVX2-NEXT: movl %edi, %edx +; AVX2-NEXT: subw %ax, %dx +; AVX2-NEXT: setns %cl +; AVX2-NEXT: addl $32767, %ecx # imm = 0x7FFF +; AVX2-NEXT: subw %ax, %di +; AVX2-NEXT: cmovol %ecx, %edi +; AVX2-NEXT: vpextrw $3, %xmm1, %ecx +; AVX2-NEXT: vpextrw $3, %xmm0, %eax +; AVX2-NEXT: xorl %edx, %edx +; AVX2-NEXT: movl %eax, %r8d +; AVX2-NEXT: subw %cx, %r8w +; AVX2-NEXT: setns %dl +; AVX2-NEXT: addl $32767, %edx # imm = 0x7FFF +; AVX2-NEXT: subw %cx, %ax +; AVX2-NEXT: cmovol %edx, %eax +; AVX2-NEXT: vpextrw $2, %xmm1, %r8d +; AVX2-NEXT: vpextrw $2, %xmm0, %ecx +; AVX2-NEXT: xorl %edx, %edx +; AVX2-NEXT: movl %ecx, %r9d +; AVX2-NEXT: subw %r8w, %r9w +; AVX2-NEXT: setns %dl +; AVX2-NEXT: addl $32767, %edx # imm = 0x7FFF +; AVX2-NEXT: subw %r8w, %cx +; AVX2-NEXT: cmovol %edx, %ecx +; AVX2-NEXT: vmovd %xmm1, %r8d +; AVX2-NEXT: vmovd %xmm0, %r9d +; AVX2-NEXT: xorl %edx, %edx +; AVX2-NEXT: movl %r9d, %r10d +; AVX2-NEXT: subw %r8w, %r10w +; AVX2-NEXT: setns %dl +; AVX2-NEXT: addl $32767, %edx # imm = 0x7FFF +; AVX2-NEXT: subw %r8w, %r9w +; AVX2-NEXT: cmovol %edx, %r9d +; AVX2-NEXT: vpextrw $1, %xmm1, %r8d +; AVX2-NEXT: vpextrw $1, %xmm0, %edx +; AVX2-NEXT: xorl %r10d, %r10d +; AVX2-NEXT: movl %edx, %r11d +; AVX2-NEXT: subw %r8w, %r11w +; AVX2-NEXT: setns %r10b +; AVX2-NEXT: addl $32767, %r10d # imm = 0x7FFF +; AVX2-NEXT: subw %r8w, %dx +; AVX2-NEXT: cmovol %r10d, %edx +; AVX2-NEXT: vmovd %r9d, %xmm0 +; AVX2-NEXT: vpinsrw $1, %edx, %xmm0, %xmm0 +; AVX2-NEXT: vpinsrw $2, %ecx, %xmm0, %xmm0 +; AVX2-NEXT: vpinsrw $3, %eax, %xmm0, %xmm0 +; AVX2-NEXT: vpinsrw $4, %edi, %xmm0, %xmm0 +; AVX2-NEXT: vpinsrw $5, %ebp, %xmm0, %xmm0 +; AVX2-NEXT: vpinsrw $6, %ebx, %xmm0, %xmm0 +; AVX2-NEXT: vpinsrw $7, %esi, %xmm0, %xmm0 +; AVX2-NEXT: vmovd %r12d, %xmm1 +; AVX2-NEXT: vpinsrw $1, %r13d, %xmm1, %xmm1 +; AVX2-NEXT: vpinsrw $2, %r15d, %xmm1, %xmm1 +; AVX2-NEXT: vpinsrw $3, %r14d, %xmm1, %xmm1 +; AVX2-NEXT: vpinsrw $4, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 4-byte Folded Reload +; AVX2-NEXT: vpinsrw $5, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 4-byte Folded Reload +; AVX2-NEXT: vpinsrw $6, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 4-byte Folded Reload +; AVX2-NEXT: vpinsrw $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 4-byte Folded Reload +; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 +; AVX2-NEXT: popq %rbx +; AVX2-NEXT: popq %r12 +; AVX2-NEXT: popq %r13 +; AVX2-NEXT: popq %r14 +; AVX2-NEXT: popq %r15 +; AVX2-NEXT: popq %rbp +; AVX2-NEXT: retq +; +; AVX512-LABEL: v16i16: +; AVX512: # %bb.0: +; AVX512-NEXT: pushq %rbp +; AVX512-NEXT: pushq %r15 +; AVX512-NEXT: pushq %r14 +; AVX512-NEXT: pushq %r13 +; AVX512-NEXT: pushq %r12 +; AVX512-NEXT: pushq %rbx +; AVX512-NEXT: vpextrw $7, %xmm1, %eax +; AVX512-NEXT: vpextrw $7, %xmm0, %esi +; AVX512-NEXT: xorl %ecx, %ecx +; AVX512-NEXT: movl %esi, %edx +; AVX512-NEXT: subw %ax, %dx +; AVX512-NEXT: setns %cl +; AVX512-NEXT: addl $32767, %ecx # imm = 0x7FFF +; AVX512-NEXT: subw %ax, %si +; AVX512-NEXT: cmovol %ecx, %esi +; AVX512-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX512-NEXT: vpextrw $6, %xmm1, %eax +; AVX512-NEXT: vpextrw $6, %xmm0, %esi +; AVX512-NEXT: xorl 
%ecx, %ecx +; AVX512-NEXT: movl %esi, %edx +; AVX512-NEXT: subw %ax, %dx +; AVX512-NEXT: setns %cl +; AVX512-NEXT: addl $32767, %ecx # imm = 0x7FFF +; AVX512-NEXT: subw %ax, %si +; AVX512-NEXT: cmovol %ecx, %esi +; AVX512-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX512-NEXT: vpextrw $5, %xmm1, %eax +; AVX512-NEXT: vpextrw $5, %xmm0, %esi +; AVX512-NEXT: xorl %ecx, %ecx +; AVX512-NEXT: movl %esi, %edx +; AVX512-NEXT: subw %ax, %dx +; AVX512-NEXT: setns %cl +; AVX512-NEXT: addl $32767, %ecx # imm = 0x7FFF +; AVX512-NEXT: subw %ax, %si +; AVX512-NEXT: cmovol %ecx, %esi +; AVX512-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX512-NEXT: vpextrw $4, %xmm1, %eax +; AVX512-NEXT: vpextrw $4, %xmm0, %esi +; AVX512-NEXT: xorl %ecx, %ecx +; AVX512-NEXT: movl %esi, %edx +; AVX512-NEXT: subw %ax, %dx +; AVX512-NEXT: setns %cl +; AVX512-NEXT: addl $32767, %ecx # imm = 0x7FFF +; AVX512-NEXT: subw %ax, %si +; AVX512-NEXT: cmovol %ecx, %esi +; AVX512-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX512-NEXT: vpextrw $3, %xmm1, %eax +; AVX512-NEXT: vpextrw $3, %xmm0, %r14d +; AVX512-NEXT: xorl %ecx, %ecx +; AVX512-NEXT: movl %r14d, %edx +; AVX512-NEXT: subw %ax, %dx +; AVX512-NEXT: setns %cl +; AVX512-NEXT: addl $32767, %ecx # imm = 0x7FFF +; AVX512-NEXT: subw %ax, %r14w +; AVX512-NEXT: cmovol %ecx, %r14d +; AVX512-NEXT: vpextrw $2, %xmm1, %eax +; AVX512-NEXT: vpextrw $2, %xmm0, %r15d +; AVX512-NEXT: xorl %ecx, %ecx +; AVX512-NEXT: movl %r15d, %edx +; AVX512-NEXT: subw %ax, %dx +; AVX512-NEXT: setns %cl +; AVX512-NEXT: addl $32767, %ecx # imm = 0x7FFF +; AVX512-NEXT: subw %ax, %r15w +; AVX512-NEXT: cmovol %ecx, %r15d +; AVX512-NEXT: vmovd %xmm1, %eax +; AVX512-NEXT: vmovd %xmm0, %r12d +; AVX512-NEXT: xorl %ecx, %ecx +; AVX512-NEXT: movl %r12d, %edx +; AVX512-NEXT: subw %ax, %dx +; AVX512-NEXT: setns %cl +; AVX512-NEXT: addl $32767, %ecx # imm = 0x7FFF +; AVX512-NEXT: subw %ax, %r12w +; AVX512-NEXT: cmovol %ecx, %r12d +; AVX512-NEXT: vpextrw $1, %xmm1, %eax +; AVX512-NEXT: vpextrw $1, %xmm0, %r13d +; AVX512-NEXT: xorl %ecx, %ecx +; AVX512-NEXT: movl %r13d, %esi +; AVX512-NEXT: subw %ax, %si +; AVX512-NEXT: setns %cl +; AVX512-NEXT: addl $32767, %ecx # imm = 0x7FFF +; AVX512-NEXT: subw %ax, %r13w +; AVX512-NEXT: cmovol %ecx, %r13d +; AVX512-NEXT: vextracti128 $1, %ymm1, %xmm1 +; AVX512-NEXT: vpextrw $7, %xmm1, %eax +; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX512-NEXT: vpextrw $7, %xmm0, %esi +; AVX512-NEXT: xorl %ecx, %ecx +; AVX512-NEXT: movl %esi, %edi +; AVX512-NEXT: subw %ax, %di +; AVX512-NEXT: setns %cl +; AVX512-NEXT: addl $32767, %ecx # imm = 0x7FFF +; AVX512-NEXT: subw %ax, %si +; AVX512-NEXT: cmovol %ecx, %esi +; AVX512-NEXT: vpextrw $6, %xmm1, %eax +; AVX512-NEXT: vpextrw $6, %xmm0, %ebx +; AVX512-NEXT: xorl %ecx, %ecx +; AVX512-NEXT: movl %ebx, %edi +; AVX512-NEXT: subw %ax, %di +; AVX512-NEXT: setns %cl +; AVX512-NEXT: addl $32767, %ecx # imm = 0x7FFF +; AVX512-NEXT: subw %ax, %bx +; AVX512-NEXT: cmovol %ecx, %ebx +; AVX512-NEXT: vpextrw $5, %xmm1, %eax +; AVX512-NEXT: vpextrw $5, %xmm0, %ebp +; AVX512-NEXT: xorl %ecx, %ecx +; AVX512-NEXT: movl %ebp, %edi +; AVX512-NEXT: subw %ax, %di +; AVX512-NEXT: setns %cl +; AVX512-NEXT: addl $32767, %ecx # imm = 0x7FFF +; AVX512-NEXT: subw %ax, %bp +; AVX512-NEXT: cmovol %ecx, %ebp +; AVX512-NEXT: vpextrw $4, %xmm1, %eax +; AVX512-NEXT: vpextrw $4, %xmm0, %edi +; AVX512-NEXT: xorl %ecx, %ecx +; AVX512-NEXT: movl %edi, %edx +; AVX512-NEXT: subw %ax, %dx +; AVX512-NEXT: setns %cl +; AVX512-NEXT: addl 
$32767, %ecx # imm = 0x7FFF +; AVX512-NEXT: subw %ax, %di +; AVX512-NEXT: cmovol %ecx, %edi +; AVX512-NEXT: vpextrw $3, %xmm1, %ecx +; AVX512-NEXT: vpextrw $3, %xmm0, %eax +; AVX512-NEXT: xorl %edx, %edx +; AVX512-NEXT: movl %eax, %r8d +; AVX512-NEXT: subw %cx, %r8w +; AVX512-NEXT: setns %dl +; AVX512-NEXT: addl $32767, %edx # imm = 0x7FFF +; AVX512-NEXT: subw %cx, %ax +; AVX512-NEXT: cmovol %edx, %eax +; AVX512-NEXT: vpextrw $2, %xmm1, %r8d +; AVX512-NEXT: vpextrw $2, %xmm0, %ecx +; AVX512-NEXT: xorl %edx, %edx +; AVX512-NEXT: movl %ecx, %r9d +; AVX512-NEXT: subw %r8w, %r9w +; AVX512-NEXT: setns %dl +; AVX512-NEXT: addl $32767, %edx # imm = 0x7FFF +; AVX512-NEXT: subw %r8w, %cx +; AVX512-NEXT: cmovol %edx, %ecx +; AVX512-NEXT: vmovd %xmm1, %r8d +; AVX512-NEXT: vmovd %xmm0, %r9d +; AVX512-NEXT: xorl %edx, %edx +; AVX512-NEXT: movl %r9d, %r10d +; AVX512-NEXT: subw %r8w, %r10w +; AVX512-NEXT: setns %dl +; AVX512-NEXT: addl $32767, %edx # imm = 0x7FFF +; AVX512-NEXT: subw %r8w, %r9w +; AVX512-NEXT: cmovol %edx, %r9d +; AVX512-NEXT: vpextrw $1, %xmm1, %r8d +; AVX512-NEXT: vpextrw $1, %xmm0, %edx +; AVX512-NEXT: xorl %r10d, %r10d +; AVX512-NEXT: movl %edx, %r11d +; AVX512-NEXT: subw %r8w, %r11w +; AVX512-NEXT: setns %r10b +; AVX512-NEXT: addl $32767, %r10d # imm = 0x7FFF +; AVX512-NEXT: subw %r8w, %dx +; AVX512-NEXT: cmovol %r10d, %edx +; AVX512-NEXT: vmovd %r9d, %xmm0 +; AVX512-NEXT: vpinsrw $1, %edx, %xmm0, %xmm0 +; AVX512-NEXT: vpinsrw $2, %ecx, %xmm0, %xmm0 +; AVX512-NEXT: vpinsrw $3, %eax, %xmm0, %xmm0 +; AVX512-NEXT: vpinsrw $4, %edi, %xmm0, %xmm0 +; AVX512-NEXT: vpinsrw $5, %ebp, %xmm0, %xmm0 +; AVX512-NEXT: vpinsrw $6, %ebx, %xmm0, %xmm0 +; AVX512-NEXT: vpinsrw $7, %esi, %xmm0, %xmm0 +; AVX512-NEXT: vmovd %r12d, %xmm1 +; AVX512-NEXT: vpinsrw $1, %r13d, %xmm1, %xmm1 +; AVX512-NEXT: vpinsrw $2, %r15d, %xmm1, %xmm1 +; AVX512-NEXT: vpinsrw $3, %r14d, %xmm1, %xmm1 +; AVX512-NEXT: vpinsrw $4, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 4-byte Folded Reload +; AVX512-NEXT: vpinsrw $5, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 4-byte Folded Reload +; AVX512-NEXT: vpinsrw $6, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 4-byte Folded Reload +; AVX512-NEXT: vpinsrw $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 4-byte Folded Reload +; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 +; AVX512-NEXT: popq %rbx +; AVX512-NEXT: popq %r12 +; AVX512-NEXT: popq %r13 +; AVX512-NEXT: popq %r14 +; AVX512-NEXT: popq %r15 +; AVX512-NEXT: popq %rbp +; AVX512-NEXT: retq + %z = call <16 x i16> @llvm.ssub.sat.v16i16(<16 x i16> %x, <16 x i16> %y) + ret <16 x i16> %z +} + +define <32 x i16> @v32i16(<32 x i16> %x, <32 x i16> %y) nounwind { +; SSE2-LABEL: v32i16: +; SSE2: # %bb.0: +; SSE2-NEXT: pushq %rbp +; SSE2-NEXT: pushq %r15 +; SSE2-NEXT: pushq %r14 +; SSE2-NEXT: pushq %r13 +; SSE2-NEXT: pushq %r12 +; SSE2-NEXT: pushq %rbx +; SSE2-NEXT: pushq %rax +; SSE2-NEXT: movd %xmm5, %eax +; SSE2-NEXT: movd %xmm1, %esi +; SSE2-NEXT: xorl %ecx, %ecx +; SSE2-NEXT: movl %esi, %edx +; SSE2-NEXT: subw %ax, %dx +; SSE2-NEXT: setns %cl +; SSE2-NEXT: addl $32767, %ecx # imm = 0x7FFF +; SSE2-NEXT: subw %ax, %si +; SSE2-NEXT: cmovol %ecx, %esi +; SSE2-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE2-NEXT: pextrw $1, %xmm5, %eax +; SSE2-NEXT: pextrw $1, %xmm1, %esi +; SSE2-NEXT: xorl %ecx, %ecx +; SSE2-NEXT: movl %esi, %edx +; SSE2-NEXT: subw %ax, %dx +; SSE2-NEXT: setns %cl +; SSE2-NEXT: addl $32767, %ecx # imm = 0x7FFF +; SSE2-NEXT: subw %ax, %si +; SSE2-NEXT: cmovol %ecx, %esi +; SSE2-NEXT: movl %esi, 
{{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE2-NEXT: pextrw $2, %xmm5, %eax +; SSE2-NEXT: pextrw $2, %xmm1, %esi +; SSE2-NEXT: xorl %ecx, %ecx +; SSE2-NEXT: movl %esi, %edx +; SSE2-NEXT: subw %ax, %dx +; SSE2-NEXT: setns %cl +; SSE2-NEXT: addl $32767, %ecx # imm = 0x7FFF +; SSE2-NEXT: subw %ax, %si +; SSE2-NEXT: cmovol %ecx, %esi +; SSE2-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE2-NEXT: pextrw $3, %xmm5, %eax +; SSE2-NEXT: pextrw $3, %xmm1, %esi +; SSE2-NEXT: xorl %ecx, %ecx +; SSE2-NEXT: movl %esi, %edx +; SSE2-NEXT: subw %ax, %dx +; SSE2-NEXT: setns %cl +; SSE2-NEXT: addl $32767, %ecx # imm = 0x7FFF +; SSE2-NEXT: subw %ax, %si +; SSE2-NEXT: cmovol %ecx, %esi +; SSE2-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE2-NEXT: pextrw $4, %xmm5, %eax +; SSE2-NEXT: pextrw $4, %xmm1, %esi +; SSE2-NEXT: xorl %ecx, %ecx +; SSE2-NEXT: movl %esi, %edx +; SSE2-NEXT: subw %ax, %dx +; SSE2-NEXT: setns %cl +; SSE2-NEXT: addl $32767, %ecx # imm = 0x7FFF +; SSE2-NEXT: subw %ax, %si +; SSE2-NEXT: cmovol %ecx, %esi +; SSE2-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE2-NEXT: pextrw $5, %xmm5, %eax +; SSE2-NEXT: pextrw $5, %xmm1, %esi +; SSE2-NEXT: xorl %ecx, %ecx +; SSE2-NEXT: movl %esi, %edx +; SSE2-NEXT: subw %ax, %dx +; SSE2-NEXT: setns %cl +; SSE2-NEXT: addl $32767, %ecx # imm = 0x7FFF +; SSE2-NEXT: subw %ax, %si +; SSE2-NEXT: cmovol %ecx, %esi +; SSE2-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE2-NEXT: pextrw $6, %xmm5, %eax +; SSE2-NEXT: pextrw $6, %xmm1, %esi +; SSE2-NEXT: xorl %ecx, %ecx +; SSE2-NEXT: movl %esi, %edx +; SSE2-NEXT: subw %ax, %dx +; SSE2-NEXT: setns %cl +; SSE2-NEXT: addl $32767, %ecx # imm = 0x7FFF +; SSE2-NEXT: subw %ax, %si +; SSE2-NEXT: cmovol %ecx, %esi +; SSE2-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE2-NEXT: pextrw $7, %xmm5, %eax +; SSE2-NEXT: pextrw $7, %xmm1, %esi +; SSE2-NEXT: xorl %ecx, %ecx +; SSE2-NEXT: movl %esi, %edx +; SSE2-NEXT: subw %ax, %dx +; SSE2-NEXT: setns %cl +; SSE2-NEXT: addl $32767, %ecx # imm = 0x7FFF +; SSE2-NEXT: subw %ax, %si +; SSE2-NEXT: cmovol %ecx, %esi +; SSE2-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE2-NEXT: movd %xmm6, %eax +; SSE2-NEXT: movd %xmm2, %esi +; SSE2-NEXT: xorl %ecx, %ecx +; SSE2-NEXT: movl %esi, %edx +; SSE2-NEXT: subw %ax, %dx +; SSE2-NEXT: setns %cl +; SSE2-NEXT: addl $32767, %ecx # imm = 0x7FFF +; SSE2-NEXT: subw %ax, %si +; SSE2-NEXT: cmovol %ecx, %esi +; SSE2-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE2-NEXT: pextrw $1, %xmm6, %eax +; SSE2-NEXT: pextrw $1, %xmm2, %esi +; SSE2-NEXT: xorl %ecx, %ecx +; SSE2-NEXT: movl %esi, %edx +; SSE2-NEXT: subw %ax, %dx +; SSE2-NEXT: setns %cl +; SSE2-NEXT: addl $32767, %ecx # imm = 0x7FFF +; SSE2-NEXT: subw %ax, %si +; SSE2-NEXT: cmovol %ecx, %esi +; SSE2-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE2-NEXT: pextrw $2, %xmm6, %eax +; SSE2-NEXT: pextrw $2, %xmm2, %esi +; SSE2-NEXT: xorl %ecx, %ecx +; SSE2-NEXT: movl %esi, %edx +; SSE2-NEXT: subw %ax, %dx +; SSE2-NEXT: setns %cl +; SSE2-NEXT: addl $32767, %ecx # imm = 0x7FFF +; SSE2-NEXT: subw %ax, %si +; SSE2-NEXT: cmovol %ecx, %esi +; SSE2-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE2-NEXT: pextrw $3, %xmm6, %eax +; SSE2-NEXT: pextrw $3, %xmm2, %esi +; SSE2-NEXT: xorl %ecx, %ecx +; SSE2-NEXT: movl %esi, %edx +; SSE2-NEXT: subw %ax, %dx +; SSE2-NEXT: setns %cl +; SSE2-NEXT: addl $32767, %ecx # imm = 0x7FFF +; SSE2-NEXT: subw %ax, %si +; SSE2-NEXT: cmovol %ecx, %esi +; 
SSE2-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE2-NEXT: pextrw $4, %xmm6, %eax +; SSE2-NEXT: pextrw $4, %xmm2, %esi +; SSE2-NEXT: xorl %ecx, %ecx +; SSE2-NEXT: movl %esi, %edx +; SSE2-NEXT: subw %ax, %dx +; SSE2-NEXT: setns %cl +; SSE2-NEXT: addl $32767, %ecx # imm = 0x7FFF +; SSE2-NEXT: subw %ax, %si +; SSE2-NEXT: cmovol %ecx, %esi +; SSE2-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE2-NEXT: pextrw $5, %xmm6, %eax +; SSE2-NEXT: pextrw $5, %xmm2, %esi +; SSE2-NEXT: xorl %ecx, %ecx +; SSE2-NEXT: movl %esi, %edx +; SSE2-NEXT: subw %ax, %dx +; SSE2-NEXT: setns %cl +; SSE2-NEXT: addl $32767, %ecx # imm = 0x7FFF +; SSE2-NEXT: subw %ax, %si +; SSE2-NEXT: cmovol %ecx, %esi +; SSE2-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE2-NEXT: pextrw $6, %xmm6, %eax +; SSE2-NEXT: pextrw $6, %xmm2, %esi +; SSE2-NEXT: xorl %ecx, %ecx +; SSE2-NEXT: movl %esi, %edx +; SSE2-NEXT: subw %ax, %dx +; SSE2-NEXT: setns %cl +; SSE2-NEXT: addl $32767, %ecx # imm = 0x7FFF +; SSE2-NEXT: subw %ax, %si +; SSE2-NEXT: cmovol %ecx, %esi +; SSE2-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE2-NEXT: pextrw $7, %xmm6, %eax +; SSE2-NEXT: pextrw $7, %xmm2, %esi +; SSE2-NEXT: xorl %ecx, %ecx +; SSE2-NEXT: movl %esi, %edx +; SSE2-NEXT: subw %ax, %dx +; SSE2-NEXT: setns %cl +; SSE2-NEXT: addl $32767, %ecx # imm = 0x7FFF +; SSE2-NEXT: subw %ax, %si +; SSE2-NEXT: cmovol %ecx, %esi +; SSE2-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE2-NEXT: movd %xmm7, %eax +; SSE2-NEXT: movd %xmm3, %esi +; SSE2-NEXT: xorl %ecx, %ecx +; SSE2-NEXT: movl %esi, %edx +; SSE2-NEXT: subw %ax, %dx +; SSE2-NEXT: setns %cl +; SSE2-NEXT: addl $32767, %ecx # imm = 0x7FFF +; SSE2-NEXT: subw %ax, %si +; SSE2-NEXT: cmovol %ecx, %esi +; SSE2-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE2-NEXT: pextrw $1, %xmm7, %eax +; SSE2-NEXT: pextrw $1, %xmm3, %esi +; SSE2-NEXT: xorl %ecx, %ecx +; SSE2-NEXT: movl %esi, %edx +; SSE2-NEXT: subw %ax, %dx +; SSE2-NEXT: setns %cl +; SSE2-NEXT: addl $32767, %ecx # imm = 0x7FFF +; SSE2-NEXT: subw %ax, %si +; SSE2-NEXT: cmovol %ecx, %esi +; SSE2-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE2-NEXT: pextrw $2, %xmm7, %eax +; SSE2-NEXT: pextrw $2, %xmm3, %esi +; SSE2-NEXT: xorl %ecx, %ecx +; SSE2-NEXT: movl %esi, %edx +; SSE2-NEXT: subw %ax, %dx +; SSE2-NEXT: setns %cl +; SSE2-NEXT: addl $32767, %ecx # imm = 0x7FFF +; SSE2-NEXT: subw %ax, %si +; SSE2-NEXT: cmovol %ecx, %esi +; SSE2-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE2-NEXT: pextrw $3, %xmm7, %eax +; SSE2-NEXT: pextrw $3, %xmm3, %esi +; SSE2-NEXT: xorl %ecx, %ecx +; SSE2-NEXT: movl %esi, %edx +; SSE2-NEXT: subw %ax, %dx +; SSE2-NEXT: setns %cl +; SSE2-NEXT: addl $32767, %ecx # imm = 0x7FFF +; SSE2-NEXT: subw %ax, %si +; SSE2-NEXT: cmovol %ecx, %esi +; SSE2-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE2-NEXT: pextrw $4, %xmm7, %eax +; SSE2-NEXT: pextrw $4, %xmm3, %ebp +; SSE2-NEXT: xorl %ecx, %ecx +; SSE2-NEXT: movl %ebp, %edx +; SSE2-NEXT: subw %ax, %dx +; SSE2-NEXT: setns %cl +; SSE2-NEXT: addl $32767, %ecx # imm = 0x7FFF +; SSE2-NEXT: subw %ax, %bp +; SSE2-NEXT: cmovol %ecx, %ebp +; SSE2-NEXT: pextrw $5, %xmm7, %eax +; SSE2-NEXT: pextrw $5, %xmm3, %ebx +; SSE2-NEXT: xorl %ecx, %ecx +; SSE2-NEXT: movl %ebx, %edx +; SSE2-NEXT: subw %ax, %dx +; SSE2-NEXT: setns %cl +; SSE2-NEXT: addl $32767, %ecx # imm = 0x7FFF +; SSE2-NEXT: subw %ax, %bx +; SSE2-NEXT: cmovol %ecx, %ebx +; SSE2-NEXT: pextrw $6, %xmm7, %eax +; SSE2-NEXT: 
pextrw $6, %xmm3, %r11d +; SSE2-NEXT: xorl %ecx, %ecx +; SSE2-NEXT: movl %r11d, %edx +; SSE2-NEXT: subw %ax, %dx +; SSE2-NEXT: setns %cl +; SSE2-NEXT: addl $32767, %ecx # imm = 0x7FFF +; SSE2-NEXT: subw %ax, %r11w +; SSE2-NEXT: cmovol %ecx, %r11d +; SSE2-NEXT: pextrw $7, %xmm7, %eax +; SSE2-NEXT: pextrw $7, %xmm3, %r10d +; SSE2-NEXT: xorl %ecx, %ecx +; SSE2-NEXT: movl %r10d, %edx +; SSE2-NEXT: subw %ax, %dx +; SSE2-NEXT: setns %cl +; SSE2-NEXT: addl $32767, %ecx # imm = 0x7FFF +; SSE2-NEXT: subw %ax, %r10w +; SSE2-NEXT: cmovol %ecx, %r10d +; SSE2-NEXT: movd %xmm4, %eax +; SSE2-NEXT: movd %xmm0, %r9d +; SSE2-NEXT: xorl %ecx, %ecx +; SSE2-NEXT: movl %r9d, %edx +; SSE2-NEXT: subw %ax, %dx +; SSE2-NEXT: setns %cl +; SSE2-NEXT: addl $32767, %ecx # imm = 0x7FFF +; SSE2-NEXT: subw %ax, %r9w +; SSE2-NEXT: cmovol %ecx, %r9d +; SSE2-NEXT: pextrw $1, %xmm4, %eax +; SSE2-NEXT: pextrw $1, %xmm0, %r8d +; SSE2-NEXT: xorl %ecx, %ecx +; SSE2-NEXT: movl %r8d, %edx +; SSE2-NEXT: subw %ax, %dx +; SSE2-NEXT: setns %cl +; SSE2-NEXT: addl $32767, %ecx # imm = 0x7FFF +; SSE2-NEXT: subw %ax, %r8w +; SSE2-NEXT: cmovol %ecx, %r8d +; SSE2-NEXT: pextrw $2, %xmm4, %eax +; SSE2-NEXT: pextrw $2, %xmm0, %edi +; SSE2-NEXT: xorl %ecx, %ecx +; SSE2-NEXT: movl %edi, %edx +; SSE2-NEXT: subw %ax, %dx +; SSE2-NEXT: setns %cl +; SSE2-NEXT: addl $32767, %ecx # imm = 0x7FFF +; SSE2-NEXT: subw %ax, %di +; SSE2-NEXT: cmovol %ecx, %edi +; SSE2-NEXT: pextrw $3, %xmm4, %eax +; SSE2-NEXT: pextrw $3, %xmm0, %esi +; SSE2-NEXT: xorl %ecx, %ecx +; SSE2-NEXT: movl %esi, %edx +; SSE2-NEXT: subw %ax, %dx +; SSE2-NEXT: setns %cl +; SSE2-NEXT: addl $32767, %ecx # imm = 0x7FFF +; SSE2-NEXT: subw %ax, %si +; SSE2-NEXT: cmovol %ecx, %esi +; SSE2-NEXT: pextrw $4, %xmm4, %eax +; SSE2-NEXT: pextrw $4, %xmm0, %edx +; SSE2-NEXT: xorl %ecx, %ecx +; SSE2-NEXT: movl %edx, %r13d +; SSE2-NEXT: subw %ax, %r13w +; SSE2-NEXT: setns %cl +; SSE2-NEXT: addl $32767, %ecx # imm = 0x7FFF +; SSE2-NEXT: subw %ax, %dx +; SSE2-NEXT: cmovol %ecx, %edx +; SSE2-NEXT: pextrw $5, %xmm4, %r13d +; SSE2-NEXT: pextrw $5, %xmm0, %ecx +; SSE2-NEXT: xorl %eax, %eax +; SSE2-NEXT: movl %ecx, %r12d +; SSE2-NEXT: subw %r13w, %r12w +; SSE2-NEXT: setns %al +; SSE2-NEXT: addl $32767, %eax # imm = 0x7FFF +; SSE2-NEXT: subw %r13w, %cx +; SSE2-NEXT: cmovol %eax, %ecx +; SSE2-NEXT: pextrw $6, %xmm4, %r12d +; SSE2-NEXT: pextrw $6, %xmm0, %r13d +; SSE2-NEXT: xorl %eax, %eax +; SSE2-NEXT: movl %r13d, %r15d +; SSE2-NEXT: subw %r12w, %r15w +; SSE2-NEXT: setns %al +; SSE2-NEXT: addl $32767, %eax # imm = 0x7FFF +; SSE2-NEXT: subw %r12w, %r13w +; SSE2-NEXT: cmovol %eax, %r13d +; SSE2-NEXT: pextrw $7, %xmm4, %r15d +; SSE2-NEXT: pextrw $7, %xmm0, %eax +; SSE2-NEXT: xorl %r12d, %r12d +; SSE2-NEXT: movl %eax, %r14d +; SSE2-NEXT: subw %r15w, %r14w +; SSE2-NEXT: setns %r12b +; SSE2-NEXT: addl $32767, %r12d # imm = 0x7FFF +; SSE2-NEXT: subw %r15w, %ax +; SSE2-NEXT: cmovol %r12d, %eax +; SSE2-NEXT: movd %eax, %xmm10 +; SSE2-NEXT: movd %r13d, %xmm12 +; SSE2-NEXT: movd %ecx, %xmm8 +; SSE2-NEXT: movd %edx, %xmm2 +; SSE2-NEXT: movd %esi, %xmm4 +; SSE2-NEXT: movd %edi, %xmm0 +; SSE2-NEXT: movd %r8d, %xmm13 +; SSE2-NEXT: movd %r9d, %xmm5 +; SSE2-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Reload +; SSE2-NEXT: # xmm1 = mem[0],zero,zero,zero +; SSE2-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 4-byte Folded Reload +; SSE2-NEXT: # xmm6 = mem[0],zero,zero,zero +; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 4-byte Folded Reload +; 
SSE2-NEXT: # xmm11 = mem[0],zero,zero,zero +; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 4-byte Folded Reload +; SSE2-NEXT: # xmm3 = mem[0],zero,zero,zero +; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 4-byte Folded Reload +; SSE2-NEXT: # xmm14 = mem[0],zero,zero,zero +; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 4-byte Folded Reload +; SSE2-NEXT: # xmm9 = mem[0],zero,zero,zero +; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 4-byte Folded Reload +; SSE2-NEXT: # xmm15 = mem[0],zero,zero,zero +; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Folded Reload +; SSE2-NEXT: # xmm1 = mem[0],zero,zero,zero +; SSE2-NEXT: punpcklwd {{.*#+}} xmm12 = xmm12[0],xmm10[0],xmm12[1],xmm10[1],xmm12[2],xmm10[2],xmm12[3],xmm10[3] +; SSE2-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 4-byte Reload +; SSE2-NEXT: # xmm7 = mem[0],zero,zero,zero +; SSE2-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm8[0],xmm2[1],xmm8[1],xmm2[2],xmm8[2],xmm2[3],xmm8[3] +; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 4-byte Folded Reload +; SSE2-NEXT: # xmm8 = mem[0],zero,zero,zero +; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm12[0],xmm2[1],xmm12[1] +; SSE2-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 4-byte Reload +; SSE2-NEXT: # xmm7 = mem[0],zero,zero,zero +; SSE2-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3] +; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 4-byte Folded Reload +; SSE2-NEXT: # xmm4 = mem[0],zero,zero,zero +; SSE2-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm13[0],xmm5[1],xmm13[1],xmm5[2],xmm13[2],xmm5[3],xmm13[3] +; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 4-byte Folded Reload +; SSE2-NEXT: # xmm13 = mem[0],zero,zero,zero +; SSE2-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1] +; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 4-byte Folded Reload +; SSE2-NEXT: # xmm7 = mem[0],zero,zero,zero +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm2[0] +; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 4-byte Folded Reload +; SSE2-NEXT: # xmm12 = mem[0],zero,zero,zero +; SSE2-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload +; SSE2-NEXT: # xmm6 = xmm6[0],mem[0],xmm6[1],mem[1],xmm6[2],mem[2],xmm6[3],mem[3] +; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 4-byte Folded Reload +; SSE2-NEXT: # xmm2 = mem[0],zero,zero,zero +; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm11[0],xmm3[1],xmm11[1],xmm3[2],xmm11[2],xmm3[3],xmm11[3] +; SSE2-NEXT: movd %r10d, %xmm11 +; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm6[0],xmm3[1],xmm6[1] +; SSE2-NEXT: movd %r11d, %xmm6 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm14[0],xmm9[1],xmm14[1],xmm9[2],xmm14[2],xmm9[3],xmm14[3] +; SSE2-NEXT: movd %ebx, %xmm14 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm15[0],xmm1[1],xmm15[1],xmm1[2],xmm15[2],xmm1[3],xmm15[3] +; SSE2-NEXT: movd %ebp, %xmm15 +; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm9[0],xmm1[1],xmm9[1] +; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 4-byte Folded Reload +; SSE2-NEXT: # xmm10 = mem[0],zero,zero,zero +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0] +; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload +; SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero +; SSE2-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload +; SSE2-NEXT: # xmm8 = 
xmm8[0],mem[0],xmm8[1],mem[1],xmm8[2],mem[2],xmm8[3],mem[3] +; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 4-byte Folded Reload +; SSE2-NEXT: # xmm9 = mem[0],zero,zero,zero +; SSE2-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE2-NEXT: # xmm4 = xmm4[0],mem[0],xmm4[1],mem[1],xmm4[2],mem[2],xmm4[3],mem[3] +; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 4-byte Folded Reload +; SSE2-NEXT: # xmm3 = mem[0],zero,zero,zero +; SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm8[0],xmm4[1],xmm8[1] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm13[0],xmm7[1],xmm13[1],xmm7[2],xmm13[2],xmm7[3],xmm13[3] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm12[0],xmm2[1],xmm12[1],xmm2[2],xmm12[2],xmm2[3],xmm12[3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm7[0],xmm2[1],xmm7[1] +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm4[0] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm11[0],xmm6[1],xmm11[1],xmm6[2],xmm11[2],xmm6[3],xmm11[3] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm15 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm15 = xmm15[0],xmm6[0],xmm15[1],xmm6[1] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm10[0],xmm0[1],xmm10[1],xmm0[2],xmm10[2],xmm0[3],xmm10[3] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm9[0],xmm3[1],xmm9[1],xmm3[2],xmm9[2],xmm3[3],xmm9[3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm15[0] +; SSE2-NEXT: movdqa %xmm5, %xmm0 +; SSE2-NEXT: addq $8, %rsp +; SSE2-NEXT: popq %rbx +; SSE2-NEXT: popq %r12 +; SSE2-NEXT: popq %r13 +; SSE2-NEXT: popq %r14 +; SSE2-NEXT: popq %r15 +; SSE2-NEXT: popq %rbp +; SSE2-NEXT: retq +; +; SSSE3-LABEL: v32i16: +; SSSE3: # %bb.0: +; SSSE3-NEXT: pushq %rbp +; SSSE3-NEXT: pushq %r15 +; SSSE3-NEXT: pushq %r14 +; SSSE3-NEXT: pushq %r13 +; SSSE3-NEXT: pushq %r12 +; SSSE3-NEXT: pushq %rbx +; SSSE3-NEXT: pushq %rax +; SSSE3-NEXT: movd %xmm5, %eax +; SSSE3-NEXT: movd %xmm1, %esi +; SSSE3-NEXT: xorl %ecx, %ecx +; SSSE3-NEXT: movl %esi, %edx +; SSSE3-NEXT: subw %ax, %dx +; SSSE3-NEXT: setns %cl +; SSSE3-NEXT: addl $32767, %ecx # imm = 0x7FFF +; SSSE3-NEXT: subw %ax, %si +; SSSE3-NEXT: cmovol %ecx, %esi +; SSSE3-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSSE3-NEXT: pextrw $1, %xmm5, %eax +; SSSE3-NEXT: pextrw $1, %xmm1, %esi +; SSSE3-NEXT: xorl %ecx, %ecx +; SSSE3-NEXT: movl %esi, %edx +; SSSE3-NEXT: subw %ax, %dx +; SSSE3-NEXT: setns %cl +; SSSE3-NEXT: addl $32767, %ecx # imm = 0x7FFF +; SSSE3-NEXT: subw %ax, %si +; SSSE3-NEXT: cmovol %ecx, %esi +; SSSE3-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSSE3-NEXT: pextrw $2, %xmm5, %eax +; SSSE3-NEXT: pextrw $2, %xmm1, %esi +; SSSE3-NEXT: xorl %ecx, %ecx +; SSSE3-NEXT: movl %esi, %edx +; SSSE3-NEXT: subw %ax, %dx +; SSSE3-NEXT: setns %cl +; SSSE3-NEXT: addl $32767, %ecx # imm = 0x7FFF +; SSSE3-NEXT: subw %ax, %si +; SSSE3-NEXT: cmovol %ecx, %esi +; SSSE3-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSSE3-NEXT: pextrw $3, %xmm5, %eax +; SSSE3-NEXT: pextrw $3, %xmm1, %esi +; SSSE3-NEXT: xorl %ecx, %ecx +; SSSE3-NEXT: movl %esi, %edx +; SSSE3-NEXT: subw %ax, %dx +; SSSE3-NEXT: setns %cl +; SSSE3-NEXT: addl $32767, %ecx # imm = 0x7FFF +; SSSE3-NEXT: subw %ax, %si +; SSSE3-NEXT: cmovol %ecx, %esi +; SSSE3-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSSE3-NEXT: pextrw $4, %xmm5, %eax +; SSSE3-NEXT: pextrw $4, %xmm1, %esi +; SSSE3-NEXT: xorl %ecx, %ecx +; SSSE3-NEXT: 
movl %esi, %edx +; SSSE3-NEXT: subw %ax, %dx +; SSSE3-NEXT: setns %cl +; SSSE3-NEXT: addl $32767, %ecx # imm = 0x7FFF +; SSSE3-NEXT: subw %ax, %si +; SSSE3-NEXT: cmovol %ecx, %esi +; SSSE3-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSSE3-NEXT: pextrw $5, %xmm5, %eax +; SSSE3-NEXT: pextrw $5, %xmm1, %esi +; SSSE3-NEXT: xorl %ecx, %ecx +; SSSE3-NEXT: movl %esi, %edx +; SSSE3-NEXT: subw %ax, %dx +; SSSE3-NEXT: setns %cl +; SSSE3-NEXT: addl $32767, %ecx # imm = 0x7FFF +; SSSE3-NEXT: subw %ax, %si +; SSSE3-NEXT: cmovol %ecx, %esi +; SSSE3-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSSE3-NEXT: pextrw $6, %xmm5, %eax +; SSSE3-NEXT: pextrw $6, %xmm1, %esi +; SSSE3-NEXT: xorl %ecx, %ecx +; SSSE3-NEXT: movl %esi, %edx +; SSSE3-NEXT: subw %ax, %dx +; SSSE3-NEXT: setns %cl +; SSSE3-NEXT: addl $32767, %ecx # imm = 0x7FFF +; SSSE3-NEXT: subw %ax, %si +; SSSE3-NEXT: cmovol %ecx, %esi +; SSSE3-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSSE3-NEXT: pextrw $7, %xmm5, %eax +; SSSE3-NEXT: pextrw $7, %xmm1, %esi +; SSSE3-NEXT: xorl %ecx, %ecx +; SSSE3-NEXT: movl %esi, %edx +; SSSE3-NEXT: subw %ax, %dx +; SSSE3-NEXT: setns %cl +; SSSE3-NEXT: addl $32767, %ecx # imm = 0x7FFF +; SSSE3-NEXT: subw %ax, %si +; SSSE3-NEXT: cmovol %ecx, %esi +; SSSE3-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSSE3-NEXT: movd %xmm6, %eax +; SSSE3-NEXT: movd %xmm2, %esi +; SSSE3-NEXT: xorl %ecx, %ecx +; SSSE3-NEXT: movl %esi, %edx +; SSSE3-NEXT: subw %ax, %dx +; SSSE3-NEXT: setns %cl +; SSSE3-NEXT: addl $32767, %ecx # imm = 0x7FFF +; SSSE3-NEXT: subw %ax, %si +; SSSE3-NEXT: cmovol %ecx, %esi +; SSSE3-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSSE3-NEXT: pextrw $1, %xmm6, %eax +; SSSE3-NEXT: pextrw $1, %xmm2, %esi +; SSSE3-NEXT: xorl %ecx, %ecx +; SSSE3-NEXT: movl %esi, %edx +; SSSE3-NEXT: subw %ax, %dx +; SSSE3-NEXT: setns %cl +; SSSE3-NEXT: addl $32767, %ecx # imm = 0x7FFF +; SSSE3-NEXT: subw %ax, %si +; SSSE3-NEXT: cmovol %ecx, %esi +; SSSE3-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSSE3-NEXT: pextrw $2, %xmm6, %eax +; SSSE3-NEXT: pextrw $2, %xmm2, %esi +; SSSE3-NEXT: xorl %ecx, %ecx +; SSSE3-NEXT: movl %esi, %edx +; SSSE3-NEXT: subw %ax, %dx +; SSSE3-NEXT: setns %cl +; SSSE3-NEXT: addl $32767, %ecx # imm = 0x7FFF +; SSSE3-NEXT: subw %ax, %si +; SSSE3-NEXT: cmovol %ecx, %esi +; SSSE3-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSSE3-NEXT: pextrw $3, %xmm6, %eax +; SSSE3-NEXT: pextrw $3, %xmm2, %esi +; SSSE3-NEXT: xorl %ecx, %ecx +; SSSE3-NEXT: movl %esi, %edx +; SSSE3-NEXT: subw %ax, %dx +; SSSE3-NEXT: setns %cl +; SSSE3-NEXT: addl $32767, %ecx # imm = 0x7FFF +; SSSE3-NEXT: subw %ax, %si +; SSSE3-NEXT: cmovol %ecx, %esi +; SSSE3-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSSE3-NEXT: pextrw $4, %xmm6, %eax +; SSSE3-NEXT: pextrw $4, %xmm2, %esi +; SSSE3-NEXT: xorl %ecx, %ecx +; SSSE3-NEXT: movl %esi, %edx +; SSSE3-NEXT: subw %ax, %dx +; SSSE3-NEXT: setns %cl +; SSSE3-NEXT: addl $32767, %ecx # imm = 0x7FFF +; SSSE3-NEXT: subw %ax, %si +; SSSE3-NEXT: cmovol %ecx, %esi +; SSSE3-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSSE3-NEXT: pextrw $5, %xmm6, %eax +; SSSE3-NEXT: pextrw $5, %xmm2, %esi +; SSSE3-NEXT: xorl %ecx, %ecx +; SSSE3-NEXT: movl %esi, %edx +; SSSE3-NEXT: subw %ax, %dx +; SSSE3-NEXT: setns %cl +; SSSE3-NEXT: addl $32767, %ecx # imm = 0x7FFF +; SSSE3-NEXT: subw %ax, %si +; SSSE3-NEXT: cmovol %ecx, %esi +; SSSE3-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; 
SSSE3-NEXT: pextrw $6, %xmm6, %eax +; SSSE3-NEXT: pextrw $6, %xmm2, %esi +; SSSE3-NEXT: xorl %ecx, %ecx +; SSSE3-NEXT: movl %esi, %edx +; SSSE3-NEXT: subw %ax, %dx +; SSSE3-NEXT: setns %cl +; SSSE3-NEXT: addl $32767, %ecx # imm = 0x7FFF +; SSSE3-NEXT: subw %ax, %si +; SSSE3-NEXT: cmovol %ecx, %esi +; SSSE3-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSSE3-NEXT: pextrw $7, %xmm6, %eax +; SSSE3-NEXT: pextrw $7, %xmm2, %esi +; SSSE3-NEXT: xorl %ecx, %ecx +; SSSE3-NEXT: movl %esi, %edx +; SSSE3-NEXT: subw %ax, %dx +; SSSE3-NEXT: setns %cl +; SSSE3-NEXT: addl $32767, %ecx # imm = 0x7FFF +; SSSE3-NEXT: subw %ax, %si +; SSSE3-NEXT: cmovol %ecx, %esi +; SSSE3-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSSE3-NEXT: movd %xmm7, %eax +; SSSE3-NEXT: movd %xmm3, %esi +; SSSE3-NEXT: xorl %ecx, %ecx +; SSSE3-NEXT: movl %esi, %edx +; SSSE3-NEXT: subw %ax, %dx +; SSSE3-NEXT: setns %cl +; SSSE3-NEXT: addl $32767, %ecx # imm = 0x7FFF +; SSSE3-NEXT: subw %ax, %si +; SSSE3-NEXT: cmovol %ecx, %esi +; SSSE3-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSSE3-NEXT: pextrw $1, %xmm7, %eax +; SSSE3-NEXT: pextrw $1, %xmm3, %esi +; SSSE3-NEXT: xorl %ecx, %ecx +; SSSE3-NEXT: movl %esi, %edx +; SSSE3-NEXT: subw %ax, %dx +; SSSE3-NEXT: setns %cl +; SSSE3-NEXT: addl $32767, %ecx # imm = 0x7FFF +; SSSE3-NEXT: subw %ax, %si +; SSSE3-NEXT: cmovol %ecx, %esi +; SSSE3-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSSE3-NEXT: pextrw $2, %xmm7, %eax +; SSSE3-NEXT: pextrw $2, %xmm3, %esi +; SSSE3-NEXT: xorl %ecx, %ecx +; SSSE3-NEXT: movl %esi, %edx +; SSSE3-NEXT: subw %ax, %dx +; SSSE3-NEXT: setns %cl +; SSSE3-NEXT: addl $32767, %ecx # imm = 0x7FFF +; SSSE3-NEXT: subw %ax, %si +; SSSE3-NEXT: cmovol %ecx, %esi +; SSSE3-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSSE3-NEXT: pextrw $3, %xmm7, %eax +; SSSE3-NEXT: pextrw $3, %xmm3, %esi +; SSSE3-NEXT: xorl %ecx, %ecx +; SSSE3-NEXT: movl %esi, %edx +; SSSE3-NEXT: subw %ax, %dx +; SSSE3-NEXT: setns %cl +; SSSE3-NEXT: addl $32767, %ecx # imm = 0x7FFF +; SSSE3-NEXT: subw %ax, %si +; SSSE3-NEXT: cmovol %ecx, %esi +; SSSE3-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSSE3-NEXT: pextrw $4, %xmm7, %eax +; SSSE3-NEXT: pextrw $4, %xmm3, %ebp +; SSSE3-NEXT: xorl %ecx, %ecx +; SSSE3-NEXT: movl %ebp, %edx +; SSSE3-NEXT: subw %ax, %dx +; SSSE3-NEXT: setns %cl +; SSSE3-NEXT: addl $32767, %ecx # imm = 0x7FFF +; SSSE3-NEXT: subw %ax, %bp +; SSSE3-NEXT: cmovol %ecx, %ebp +; SSSE3-NEXT: pextrw $5, %xmm7, %eax +; SSSE3-NEXT: pextrw $5, %xmm3, %ebx +; SSSE3-NEXT: xorl %ecx, %ecx +; SSSE3-NEXT: movl %ebx, %edx +; SSSE3-NEXT: subw %ax, %dx +; SSSE3-NEXT: setns %cl +; SSSE3-NEXT: addl $32767, %ecx # imm = 0x7FFF +; SSSE3-NEXT: subw %ax, %bx +; SSSE3-NEXT: cmovol %ecx, %ebx +; SSSE3-NEXT: pextrw $6, %xmm7, %eax +; SSSE3-NEXT: pextrw $6, %xmm3, %r11d +; SSSE3-NEXT: xorl %ecx, %ecx +; SSSE3-NEXT: movl %r11d, %edx +; SSSE3-NEXT: subw %ax, %dx +; SSSE3-NEXT: setns %cl +; SSSE3-NEXT: addl $32767, %ecx # imm = 0x7FFF +; SSSE3-NEXT: subw %ax, %r11w +; SSSE3-NEXT: cmovol %ecx, %r11d +; SSSE3-NEXT: pextrw $7, %xmm7, %eax +; SSSE3-NEXT: pextrw $7, %xmm3, %r10d +; SSSE3-NEXT: xorl %ecx, %ecx +; SSSE3-NEXT: movl %r10d, %edx +; SSSE3-NEXT: subw %ax, %dx +; SSSE3-NEXT: setns %cl +; SSSE3-NEXT: addl $32767, %ecx # imm = 0x7FFF +; SSSE3-NEXT: subw %ax, %r10w +; SSSE3-NEXT: cmovol %ecx, %r10d +; SSSE3-NEXT: movd %xmm4, %eax +; SSSE3-NEXT: movd %xmm0, %r9d +; SSSE3-NEXT: xorl %ecx, %ecx +; SSSE3-NEXT: movl %r9d, %edx +; SSSE3-NEXT: 
subw %ax, %dx +; SSSE3-NEXT: setns %cl +; SSSE3-NEXT: addl $32767, %ecx # imm = 0x7FFF +; SSSE3-NEXT: subw %ax, %r9w +; SSSE3-NEXT: cmovol %ecx, %r9d +; SSSE3-NEXT: pextrw $1, %xmm4, %eax +; SSSE3-NEXT: pextrw $1, %xmm0, %r8d +; SSSE3-NEXT: xorl %ecx, %ecx +; SSSE3-NEXT: movl %r8d, %edx +; SSSE3-NEXT: subw %ax, %dx +; SSSE3-NEXT: setns %cl +; SSSE3-NEXT: addl $32767, %ecx # imm = 0x7FFF +; SSSE3-NEXT: subw %ax, %r8w +; SSSE3-NEXT: cmovol %ecx, %r8d +; SSSE3-NEXT: pextrw $2, %xmm4, %eax +; SSSE3-NEXT: pextrw $2, %xmm0, %edi +; SSSE3-NEXT: xorl %ecx, %ecx +; SSSE3-NEXT: movl %edi, %edx +; SSSE3-NEXT: subw %ax, %dx +; SSSE3-NEXT: setns %cl +; SSSE3-NEXT: addl $32767, %ecx # imm = 0x7FFF +; SSSE3-NEXT: subw %ax, %di +; SSSE3-NEXT: cmovol %ecx, %edi +; SSSE3-NEXT: pextrw $3, %xmm4, %eax +; SSSE3-NEXT: pextrw $3, %xmm0, %esi +; SSSE3-NEXT: xorl %ecx, %ecx +; SSSE3-NEXT: movl %esi, %edx +; SSSE3-NEXT: subw %ax, %dx +; SSSE3-NEXT: setns %cl +; SSSE3-NEXT: addl $32767, %ecx # imm = 0x7FFF +; SSSE3-NEXT: subw %ax, %si +; SSSE3-NEXT: cmovol %ecx, %esi +; SSSE3-NEXT: pextrw $4, %xmm4, %eax +; SSSE3-NEXT: pextrw $4, %xmm0, %edx +; SSSE3-NEXT: xorl %ecx, %ecx +; SSSE3-NEXT: movl %edx, %r13d +; SSSE3-NEXT: subw %ax, %r13w +; SSSE3-NEXT: setns %cl +; SSSE3-NEXT: addl $32767, %ecx # imm = 0x7FFF +; SSSE3-NEXT: subw %ax, %dx +; SSSE3-NEXT: cmovol %ecx, %edx +; SSSE3-NEXT: pextrw $5, %xmm4, %r13d +; SSSE3-NEXT: pextrw $5, %xmm0, %ecx +; SSSE3-NEXT: xorl %eax, %eax +; SSSE3-NEXT: movl %ecx, %r12d +; SSSE3-NEXT: subw %r13w, %r12w +; SSSE3-NEXT: setns %al +; SSSE3-NEXT: addl $32767, %eax # imm = 0x7FFF +; SSSE3-NEXT: subw %r13w, %cx +; SSSE3-NEXT: cmovol %eax, %ecx +; SSSE3-NEXT: pextrw $6, %xmm4, %r12d +; SSSE3-NEXT: pextrw $6, %xmm0, %r13d +; SSSE3-NEXT: xorl %eax, %eax +; SSSE3-NEXT: movl %r13d, %r15d +; SSSE3-NEXT: subw %r12w, %r15w +; SSSE3-NEXT: setns %al +; SSSE3-NEXT: addl $32767, %eax # imm = 0x7FFF +; SSSE3-NEXT: subw %r12w, %r13w +; SSSE3-NEXT: cmovol %eax, %r13d +; SSSE3-NEXT: pextrw $7, %xmm4, %r15d +; SSSE3-NEXT: pextrw $7, %xmm0, %eax +; SSSE3-NEXT: xorl %r12d, %r12d +; SSSE3-NEXT: movl %eax, %r14d +; SSSE3-NEXT: subw %r15w, %r14w +; SSSE3-NEXT: setns %r12b +; SSSE3-NEXT: addl $32767, %r12d # imm = 0x7FFF +; SSSE3-NEXT: subw %r15w, %ax +; SSSE3-NEXT: cmovol %r12d, %eax +; SSSE3-NEXT: movd %eax, %xmm10 +; SSSE3-NEXT: movd %r13d, %xmm12 +; SSSE3-NEXT: movd %ecx, %xmm8 +; SSSE3-NEXT: movd %edx, %xmm2 +; SSSE3-NEXT: movd %esi, %xmm4 +; SSSE3-NEXT: movd %edi, %xmm0 +; SSSE3-NEXT: movd %r8d, %xmm13 +; SSSE3-NEXT: movd %r9d, %xmm5 +; SSSE3-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Reload +; SSSE3-NEXT: # xmm1 = mem[0],zero,zero,zero +; SSSE3-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSSE3-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 4-byte Folded Reload +; SSSE3-NEXT: # xmm6 = mem[0],zero,zero,zero +; SSSE3-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 4-byte Folded Reload +; SSSE3-NEXT: # xmm11 = mem[0],zero,zero,zero +; SSSE3-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 4-byte Folded Reload +; SSSE3-NEXT: # xmm3 = mem[0],zero,zero,zero +; SSSE3-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 4-byte Folded Reload +; SSSE3-NEXT: # xmm14 = mem[0],zero,zero,zero +; SSSE3-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 4-byte Folded Reload +; SSSE3-NEXT: # xmm9 = mem[0],zero,zero,zero +; SSSE3-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 4-byte Folded Reload +; SSSE3-NEXT: # xmm15 = mem[0],zero,zero,zero +; SSSE3-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Folded Reload 
+; SSSE3-NEXT: # xmm1 = mem[0],zero,zero,zero +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm12 = xmm12[0],xmm10[0],xmm12[1],xmm10[1],xmm12[2],xmm10[2],xmm12[3],xmm10[3] +; SSSE3-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 4-byte Reload +; SSSE3-NEXT: # xmm7 = mem[0],zero,zero,zero +; SSSE3-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm8[0],xmm2[1],xmm8[1],xmm2[2],xmm8[2],xmm2[3],xmm8[3] +; SSSE3-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 4-byte Folded Reload +; SSSE3-NEXT: # xmm8 = mem[0],zero,zero,zero +; SSSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm12[0],xmm2[1],xmm12[1] +; SSSE3-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 4-byte Reload +; SSSE3-NEXT: # xmm7 = mem[0],zero,zero,zero +; SSSE3-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3] +; SSSE3-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 4-byte Folded Reload +; SSSE3-NEXT: # xmm4 = mem[0],zero,zero,zero +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm13[0],xmm5[1],xmm13[1],xmm5[2],xmm13[2],xmm5[3],xmm13[3] +; SSSE3-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 4-byte Folded Reload +; SSSE3-NEXT: # xmm13 = mem[0],zero,zero,zero +; SSSE3-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1] +; SSSE3-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 4-byte Folded Reload +; SSSE3-NEXT: # xmm7 = mem[0],zero,zero,zero +; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm2[0] +; SSSE3-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 4-byte Folded Reload +; SSSE3-NEXT: # xmm12 = mem[0],zero,zero,zero +; SSSE3-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload +; SSSE3-NEXT: # xmm6 = xmm6[0],mem[0],xmm6[1],mem[1],xmm6[2],mem[2],xmm6[3],mem[3] +; SSSE3-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 4-byte Folded Reload +; SSSE3-NEXT: # xmm2 = mem[0],zero,zero,zero +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm11[0],xmm3[1],xmm11[1],xmm3[2],xmm11[2],xmm3[3],xmm11[3] +; SSSE3-NEXT: movd %r10d, %xmm11 +; SSSE3-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm6[0],xmm3[1],xmm6[1] +; SSSE3-NEXT: movd %r11d, %xmm6 +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm14[0],xmm9[1],xmm14[1],xmm9[2],xmm14[2],xmm9[3],xmm14[3] +; SSSE3-NEXT: movd %ebx, %xmm14 +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm15[0],xmm1[1],xmm15[1],xmm1[2],xmm15[2],xmm1[3],xmm15[3] +; SSSE3-NEXT: movd %ebp, %xmm15 +; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm9[0],xmm1[1],xmm9[1] +; SSSE3-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 4-byte Folded Reload +; SSSE3-NEXT: # xmm10 = mem[0],zero,zero,zero +; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0] +; SSSE3-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload +; SSSE3-NEXT: # xmm0 = mem[0],zero,zero,zero +; SSSE3-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload +; SSSE3-NEXT: # xmm8 = xmm8[0],mem[0],xmm8[1],mem[1],xmm8[2],mem[2],xmm8[3],mem[3] +; SSSE3-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 4-byte Folded Reload +; SSSE3-NEXT: # xmm9 = mem[0],zero,zero,zero +; SSSE3-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSSE3-NEXT: # xmm4 = xmm4[0],mem[0],xmm4[1],mem[1],xmm4[2],mem[2],xmm4[3],mem[3] +; SSSE3-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 4-byte Folded Reload +; SSSE3-NEXT: # xmm3 = mem[0],zero,zero,zero +; SSSE3-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm8[0],xmm4[1],xmm8[1] +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm7 = 
xmm7[0],xmm13[0],xmm7[1],xmm13[1],xmm7[2],xmm13[2],xmm7[3],xmm13[3] +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm12[0],xmm2[1],xmm12[1],xmm2[2],xmm12[2],xmm2[3],xmm12[3] +; SSSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm7[0],xmm2[1],xmm7[1] +; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm4[0] +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm11[0],xmm6[1],xmm11[1],xmm6[2],xmm11[2],xmm6[3],xmm11[3] +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm15 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3] +; SSSE3-NEXT: punpckldq {{.*#+}} xmm15 = xmm15[0],xmm6[0],xmm15[1],xmm6[1] +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm10[0],xmm0[1],xmm10[1],xmm0[2],xmm10[2],xmm0[3],xmm10[3] +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm9[0],xmm3[1],xmm9[1],xmm3[2],xmm9[2],xmm3[3],xmm9[3] +; SSSE3-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] +; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm15[0] +; SSSE3-NEXT: movdqa %xmm5, %xmm0 +; SSSE3-NEXT: addq $8, %rsp +; SSSE3-NEXT: popq %rbx +; SSSE3-NEXT: popq %r12 +; SSSE3-NEXT: popq %r13 +; SSSE3-NEXT: popq %r14 +; SSSE3-NEXT: popq %r15 +; SSSE3-NEXT: popq %rbp +; SSSE3-NEXT: retq +; +; SSE41-LABEL: v32i16: +; SSE41: # %bb.0: +; SSE41-NEXT: pushq %rbp +; SSE41-NEXT: pushq %r15 +; SSE41-NEXT: pushq %r14 +; SSE41-NEXT: pushq %r13 +; SSE41-NEXT: pushq %r12 +; SSE41-NEXT: pushq %rbx +; SSE41-NEXT: pextrw $7, %xmm5, %eax +; SSE41-NEXT: pextrw $7, %xmm1, %esi +; SSE41-NEXT: xorl %ecx, %ecx +; SSE41-NEXT: movl %esi, %edx +; SSE41-NEXT: subw %ax, %dx +; SSE41-NEXT: setns %cl +; SSE41-NEXT: addl $32767, %ecx # imm = 0x7FFF +; SSE41-NEXT: subw %ax, %si +; SSE41-NEXT: cmovol %ecx, %esi +; SSE41-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE41-NEXT: pextrw $6, %xmm5, %eax +; SSE41-NEXT: pextrw $6, %xmm1, %esi +; SSE41-NEXT: xorl %ecx, %ecx +; SSE41-NEXT: movl %esi, %edx +; SSE41-NEXT: subw %ax, %dx +; SSE41-NEXT: setns %cl +; SSE41-NEXT: addl $32767, %ecx # imm = 0x7FFF +; SSE41-NEXT: subw %ax, %si +; SSE41-NEXT: cmovol %ecx, %esi +; SSE41-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE41-NEXT: pextrw $5, %xmm5, %eax +; SSE41-NEXT: pextrw $5, %xmm1, %esi +; SSE41-NEXT: xorl %ecx, %ecx +; SSE41-NEXT: movl %esi, %edx +; SSE41-NEXT: subw %ax, %dx +; SSE41-NEXT: setns %cl +; SSE41-NEXT: addl $32767, %ecx # imm = 0x7FFF +; SSE41-NEXT: subw %ax, %si +; SSE41-NEXT: cmovol %ecx, %esi +; SSE41-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE41-NEXT: pextrw $4, %xmm5, %eax +; SSE41-NEXT: pextrw $4, %xmm1, %esi +; SSE41-NEXT: xorl %ecx, %ecx +; SSE41-NEXT: movl %esi, %edx +; SSE41-NEXT: subw %ax, %dx +; SSE41-NEXT: setns %cl +; SSE41-NEXT: addl $32767, %ecx # imm = 0x7FFF +; SSE41-NEXT: subw %ax, %si +; SSE41-NEXT: cmovol %ecx, %esi +; SSE41-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE41-NEXT: pextrw $3, %xmm5, %eax +; SSE41-NEXT: pextrw $3, %xmm1, %esi +; SSE41-NEXT: xorl %ecx, %ecx +; SSE41-NEXT: movl %esi, %edx +; SSE41-NEXT: subw %ax, %dx +; SSE41-NEXT: setns %cl +; SSE41-NEXT: addl $32767, %ecx # imm = 0x7FFF +; SSE41-NEXT: subw %ax, %si +; SSE41-NEXT: cmovol %ecx, %esi +; SSE41-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE41-NEXT: pextrw $2, %xmm5, %eax +; SSE41-NEXT: pextrw $2, %xmm1, %esi +; SSE41-NEXT: xorl %ecx, %ecx +; SSE41-NEXT: movl %esi, %edx +; SSE41-NEXT: subw %ax, %dx +; SSE41-NEXT: setns %cl +; SSE41-NEXT: addl $32767, %ecx # imm = 0x7FFF +; SSE41-NEXT: subw %ax, %si +; SSE41-NEXT: cmovol %ecx, %esi +; SSE41-NEXT: movl 
%esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE41-NEXT: movd %xmm5, %eax +; SSE41-NEXT: movd %xmm1, %esi +; SSE41-NEXT: xorl %ecx, %ecx +; SSE41-NEXT: movl %esi, %edx +; SSE41-NEXT: subw %ax, %dx +; SSE41-NEXT: setns %cl +; SSE41-NEXT: addl $32767, %ecx # imm = 0x7FFF +; SSE41-NEXT: subw %ax, %si +; SSE41-NEXT: cmovol %ecx, %esi +; SSE41-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE41-NEXT: pextrw $1, %xmm5, %eax +; SSE41-NEXT: pextrw $1, %xmm1, %esi +; SSE41-NEXT: xorl %ecx, %ecx +; SSE41-NEXT: movl %esi, %edx +; SSE41-NEXT: subw %ax, %dx +; SSE41-NEXT: setns %cl +; SSE41-NEXT: addl $32767, %ecx # imm = 0x7FFF +; SSE41-NEXT: subw %ax, %si +; SSE41-NEXT: cmovol %ecx, %esi +; SSE41-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE41-NEXT: pextrw $7, %xmm6, %eax +; SSE41-NEXT: pextrw $7, %xmm2, %esi +; SSE41-NEXT: xorl %ecx, %ecx +; SSE41-NEXT: movl %esi, %edx +; SSE41-NEXT: subw %ax, %dx +; SSE41-NEXT: setns %cl +; SSE41-NEXT: addl $32767, %ecx # imm = 0x7FFF +; SSE41-NEXT: subw %ax, %si +; SSE41-NEXT: cmovol %ecx, %esi +; SSE41-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE41-NEXT: pextrw $6, %xmm6, %eax +; SSE41-NEXT: pextrw $6, %xmm2, %esi +; SSE41-NEXT: xorl %ecx, %ecx +; SSE41-NEXT: movl %esi, %edx +; SSE41-NEXT: subw %ax, %dx +; SSE41-NEXT: setns %cl +; SSE41-NEXT: addl $32767, %ecx # imm = 0x7FFF +; SSE41-NEXT: subw %ax, %si +; SSE41-NEXT: cmovol %ecx, %esi +; SSE41-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE41-NEXT: pextrw $5, %xmm6, %eax +; SSE41-NEXT: pextrw $5, %xmm2, %esi +; SSE41-NEXT: xorl %ecx, %ecx +; SSE41-NEXT: movl %esi, %edx +; SSE41-NEXT: subw %ax, %dx +; SSE41-NEXT: setns %cl +; SSE41-NEXT: addl $32767, %ecx # imm = 0x7FFF +; SSE41-NEXT: subw %ax, %si +; SSE41-NEXT: cmovol %ecx, %esi +; SSE41-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE41-NEXT: pextrw $4, %xmm6, %eax +; SSE41-NEXT: pextrw $4, %xmm2, %esi +; SSE41-NEXT: xorl %ecx, %ecx +; SSE41-NEXT: movl %esi, %edx +; SSE41-NEXT: subw %ax, %dx +; SSE41-NEXT: setns %cl +; SSE41-NEXT: addl $32767, %ecx # imm = 0x7FFF +; SSE41-NEXT: subw %ax, %si +; SSE41-NEXT: cmovol %ecx, %esi +; SSE41-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE41-NEXT: pextrw $3, %xmm6, %eax +; SSE41-NEXT: pextrw $3, %xmm2, %esi +; SSE41-NEXT: xorl %ecx, %ecx +; SSE41-NEXT: movl %esi, %edx +; SSE41-NEXT: subw %ax, %dx +; SSE41-NEXT: setns %cl +; SSE41-NEXT: addl $32767, %ecx # imm = 0x7FFF +; SSE41-NEXT: subw %ax, %si +; SSE41-NEXT: cmovol %ecx, %esi +; SSE41-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE41-NEXT: pextrw $2, %xmm6, %eax +; SSE41-NEXT: pextrw $2, %xmm2, %esi +; SSE41-NEXT: xorl %ecx, %ecx +; SSE41-NEXT: movl %esi, %edx +; SSE41-NEXT: subw %ax, %dx +; SSE41-NEXT: setns %cl +; SSE41-NEXT: addl $32767, %ecx # imm = 0x7FFF +; SSE41-NEXT: subw %ax, %si +; SSE41-NEXT: cmovol %ecx, %esi +; SSE41-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE41-NEXT: movd %xmm6, %eax +; SSE41-NEXT: movd %xmm2, %esi +; SSE41-NEXT: xorl %ecx, %ecx +; SSE41-NEXT: movl %esi, %edx +; SSE41-NEXT: subw %ax, %dx +; SSE41-NEXT: setns %cl +; SSE41-NEXT: addl $32767, %ecx # imm = 0x7FFF +; SSE41-NEXT: subw %ax, %si +; SSE41-NEXT: cmovol %ecx, %esi +; SSE41-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE41-NEXT: pextrw $1, %xmm6, %eax +; SSE41-NEXT: pextrw $1, %xmm2, %esi +; SSE41-NEXT: xorl %ecx, %ecx +; SSE41-NEXT: movl %esi, %edx +; SSE41-NEXT: subw %ax, %dx +; SSE41-NEXT: setns %cl +; SSE41-NEXT: addl $32767, 
%ecx # imm = 0x7FFF +; SSE41-NEXT: subw %ax, %si +; SSE41-NEXT: cmovol %ecx, %esi +; SSE41-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE41-NEXT: pextrw $7, %xmm7, %eax +; SSE41-NEXT: pextrw $7, %xmm3, %esi +; SSE41-NEXT: xorl %ecx, %ecx +; SSE41-NEXT: movl %esi, %edx +; SSE41-NEXT: subw %ax, %dx +; SSE41-NEXT: setns %cl +; SSE41-NEXT: addl $32767, %ecx # imm = 0x7FFF +; SSE41-NEXT: subw %ax, %si +; SSE41-NEXT: cmovol %ecx, %esi +; SSE41-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE41-NEXT: pextrw $6, %xmm7, %eax +; SSE41-NEXT: pextrw $6, %xmm3, %esi +; SSE41-NEXT: xorl %ecx, %ecx +; SSE41-NEXT: movl %esi, %edx +; SSE41-NEXT: subw %ax, %dx +; SSE41-NEXT: setns %cl +; SSE41-NEXT: addl $32767, %ecx # imm = 0x7FFF +; SSE41-NEXT: subw %ax, %si +; SSE41-NEXT: cmovol %ecx, %esi +; SSE41-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE41-NEXT: pextrw $5, %xmm7, %eax +; SSE41-NEXT: pextrw $5, %xmm3, %esi +; SSE41-NEXT: xorl %ecx, %ecx +; SSE41-NEXT: movl %esi, %edx +; SSE41-NEXT: subw %ax, %dx +; SSE41-NEXT: setns %cl +; SSE41-NEXT: addl $32767, %ecx # imm = 0x7FFF +; SSE41-NEXT: subw %ax, %si +; SSE41-NEXT: cmovol %ecx, %esi +; SSE41-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE41-NEXT: pextrw $4, %xmm7, %eax +; SSE41-NEXT: pextrw $4, %xmm3, %esi +; SSE41-NEXT: xorl %ecx, %ecx +; SSE41-NEXT: movl %esi, %edx +; SSE41-NEXT: subw %ax, %dx +; SSE41-NEXT: setns %cl +; SSE41-NEXT: addl $32767, %ecx # imm = 0x7FFF +; SSE41-NEXT: subw %ax, %si +; SSE41-NEXT: cmovol %ecx, %esi +; SSE41-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE41-NEXT: pextrw $3, %xmm7, %eax +; SSE41-NEXT: pextrw $3, %xmm3, %ebx +; SSE41-NEXT: xorl %ecx, %ecx +; SSE41-NEXT: movl %ebx, %edx +; SSE41-NEXT: subw %ax, %dx +; SSE41-NEXT: setns %cl +; SSE41-NEXT: addl $32767, %ecx # imm = 0x7FFF +; SSE41-NEXT: subw %ax, %bx +; SSE41-NEXT: cmovol %ecx, %ebx +; SSE41-NEXT: pextrw $2, %xmm7, %eax +; SSE41-NEXT: pextrw $2, %xmm3, %r11d +; SSE41-NEXT: xorl %ecx, %ecx +; SSE41-NEXT: movl %r11d, %edx +; SSE41-NEXT: subw %ax, %dx +; SSE41-NEXT: setns %cl +; SSE41-NEXT: addl $32767, %ecx # imm = 0x7FFF +; SSE41-NEXT: subw %ax, %r11w +; SSE41-NEXT: cmovol %ecx, %r11d +; SSE41-NEXT: movd %xmm7, %eax +; SSE41-NEXT: movd %xmm3, %r10d +; SSE41-NEXT: xorl %ecx, %ecx +; SSE41-NEXT: movl %r10d, %edx +; SSE41-NEXT: subw %ax, %dx +; SSE41-NEXT: setns %cl +; SSE41-NEXT: addl $32767, %ecx # imm = 0x7FFF +; SSE41-NEXT: subw %ax, %r10w +; SSE41-NEXT: cmovol %ecx, %r10d +; SSE41-NEXT: pextrw $1, %xmm7, %eax +; SSE41-NEXT: pextrw $1, %xmm3, %r9d +; SSE41-NEXT: xorl %ecx, %ecx +; SSE41-NEXT: movl %r9d, %edx +; SSE41-NEXT: subw %ax, %dx +; SSE41-NEXT: setns %cl +; SSE41-NEXT: addl $32767, %ecx # imm = 0x7FFF +; SSE41-NEXT: subw %ax, %r9w +; SSE41-NEXT: cmovol %ecx, %r9d +; SSE41-NEXT: pextrw $7, %xmm4, %eax +; SSE41-NEXT: pextrw $7, %xmm0, %r8d +; SSE41-NEXT: xorl %ecx, %ecx +; SSE41-NEXT: movl %r8d, %edx +; SSE41-NEXT: subw %ax, %dx +; SSE41-NEXT: setns %cl +; SSE41-NEXT: addl $32767, %ecx # imm = 0x7FFF +; SSE41-NEXT: subw %ax, %r8w +; SSE41-NEXT: cmovol %ecx, %r8d +; SSE41-NEXT: pextrw $6, %xmm4, %eax +; SSE41-NEXT: pextrw $6, %xmm0, %edi +; SSE41-NEXT: xorl %ecx, %ecx +; SSE41-NEXT: movl %edi, %edx +; SSE41-NEXT: subw %ax, %dx +; SSE41-NEXT: setns %cl +; SSE41-NEXT: addl $32767, %ecx # imm = 0x7FFF +; SSE41-NEXT: subw %ax, %di +; SSE41-NEXT: cmovol %ecx, %edi +; SSE41-NEXT: pextrw $5, %xmm4, %eax +; SSE41-NEXT: pextrw $5, %xmm0, %esi +; SSE41-NEXT: xorl %ecx, %ecx +; SSE41-NEXT: 
movl %esi, %edx +; SSE41-NEXT: subw %ax, %dx +; SSE41-NEXT: setns %cl +; SSE41-NEXT: addl $32767, %ecx # imm = 0x7FFF +; SSE41-NEXT: subw %ax, %si +; SSE41-NEXT: cmovol %ecx, %esi +; SSE41-NEXT: pextrw $4, %xmm4, %eax +; SSE41-NEXT: pextrw $4, %xmm0, %edx +; SSE41-NEXT: xorl %ecx, %ecx +; SSE41-NEXT: movl %edx, %r13d +; SSE41-NEXT: subw %ax, %r13w +; SSE41-NEXT: setns %cl +; SSE41-NEXT: addl $32767, %ecx # imm = 0x7FFF +; SSE41-NEXT: subw %ax, %dx +; SSE41-NEXT: cmovol %ecx, %edx +; SSE41-NEXT: pextrw $3, %xmm4, %eax +; SSE41-NEXT: pextrw $3, %xmm0, %r13d +; SSE41-NEXT: xorl %ecx, %ecx +; SSE41-NEXT: movl %r13d, %r12d +; SSE41-NEXT: subw %ax, %r12w +; SSE41-NEXT: setns %cl +; SSE41-NEXT: addl $32767, %ecx # imm = 0x7FFF +; SSE41-NEXT: subw %ax, %r13w +; SSE41-NEXT: cmovol %ecx, %r13d +; SSE41-NEXT: pextrw $2, %xmm4, %r12d +; SSE41-NEXT: pextrw $2, %xmm0, %eax +; SSE41-NEXT: xorl %ecx, %ecx +; SSE41-NEXT: movl %eax, %r15d +; SSE41-NEXT: subw %r12w, %r15w +; SSE41-NEXT: setns %cl +; SSE41-NEXT: addl $32767, %ecx # imm = 0x7FFF +; SSE41-NEXT: subw %r12w, %ax +; SSE41-NEXT: cmovol %ecx, %eax +; SSE41-NEXT: movd %xmm4, %r15d +; SSE41-NEXT: movd %xmm0, %r12d +; SSE41-NEXT: xorl %ecx, %ecx +; SSE41-NEXT: movl %r12d, %r14d +; SSE41-NEXT: subw %r15w, %r14w +; SSE41-NEXT: setns %cl +; SSE41-NEXT: addl $32767, %ecx # imm = 0x7FFF +; SSE41-NEXT: subw %r15w, %r12w +; SSE41-NEXT: cmovol %ecx, %r12d +; SSE41-NEXT: pextrw $1, %xmm4, %r14d +; SSE41-NEXT: pextrw $1, %xmm0, %ecx +; SSE41-NEXT: xorl %r15d, %r15d +; SSE41-NEXT: movl %ecx, %ebp +; SSE41-NEXT: subw %r14w, %bp +; SSE41-NEXT: setns %r15b +; SSE41-NEXT: addl $32767, %r15d # imm = 0x7FFF +; SSE41-NEXT: subw %r14w, %cx +; SSE41-NEXT: cmovol %r15d, %ecx +; SSE41-NEXT: movd %r12d, %xmm0 +; SSE41-NEXT: pinsrw $1, %ecx, %xmm0 +; SSE41-NEXT: pinsrw $2, %eax, %xmm0 +; SSE41-NEXT: pinsrw $3, %r13d, %xmm0 +; SSE41-NEXT: pinsrw $4, %edx, %xmm0 +; SSE41-NEXT: pinsrw $5, %esi, %xmm0 +; SSE41-NEXT: pinsrw $6, %edi, %xmm0 +; SSE41-NEXT: pinsrw $7, %r8d, %xmm0 +; SSE41-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Folded Reload +; SSE41-NEXT: # xmm1 = mem[0],zero,zero,zero +; SSE41-NEXT: pinsrw $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Folded Reload +; SSE41-NEXT: pinsrw $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Folded Reload +; SSE41-NEXT: pinsrw $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Folded Reload +; SSE41-NEXT: pinsrw $4, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Folded Reload +; SSE41-NEXT: pinsrw $5, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Folded Reload +; SSE41-NEXT: pinsrw $6, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Folded Reload +; SSE41-NEXT: pinsrw $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Folded Reload +; SSE41-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 4-byte Folded Reload +; SSE41-NEXT: # xmm2 = mem[0],zero,zero,zero +; SSE41-NEXT: pinsrw $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 4-byte Folded Reload +; SSE41-NEXT: pinsrw $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 4-byte Folded Reload +; SSE41-NEXT: pinsrw $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 4-byte Folded Reload +; SSE41-NEXT: pinsrw $4, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 4-byte Folded Reload +; SSE41-NEXT: pinsrw $5, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 4-byte Folded Reload +; SSE41-NEXT: pinsrw $6, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 4-byte Folded Reload +; SSE41-NEXT: pinsrw $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 4-byte Folded Reload +; SSE41-NEXT: movd %r10d, %xmm3 +; SSE41-NEXT: pinsrw $1, %r9d, %xmm3 +; SSE41-NEXT: pinsrw $2, %r11d, %xmm3 +; SSE41-NEXT: pinsrw $3, %ebx, %xmm3 +; 
SSE41-NEXT: pinsrw $4, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 4-byte Folded Reload +; SSE41-NEXT: pinsrw $5, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 4-byte Folded Reload +; SSE41-NEXT: pinsrw $6, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 4-byte Folded Reload +; SSE41-NEXT: pinsrw $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 4-byte Folded Reload +; SSE41-NEXT: popq %rbx +; SSE41-NEXT: popq %r12 +; SSE41-NEXT: popq %r13 +; SSE41-NEXT: popq %r14 +; SSE41-NEXT: popq %r15 +; SSE41-NEXT: popq %rbp +; SSE41-NEXT: retq +; +; AVX1-LABEL: v32i16: +; AVX1: # %bb.0: +; AVX1-NEXT: pushq %rbp +; AVX1-NEXT: pushq %r15 +; AVX1-NEXT: pushq %r14 +; AVX1-NEXT: pushq %r13 +; AVX1-NEXT: pushq %r12 +; AVX1-NEXT: pushq %rbx +; AVX1-NEXT: vpextrw $7, %xmm3, %eax +; AVX1-NEXT: vpextrw $7, %xmm1, %esi +; AVX1-NEXT: xorl %ecx, %ecx +; AVX1-NEXT: movl %esi, %edx +; AVX1-NEXT: subw %ax, %dx +; AVX1-NEXT: setns %cl +; AVX1-NEXT: addl $32767, %ecx # imm = 0x7FFF +; AVX1-NEXT: subw %ax, %si +; AVX1-NEXT: cmovol %ecx, %esi +; AVX1-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX1-NEXT: vpextrw $6, %xmm3, %eax +; AVX1-NEXT: vpextrw $6, %xmm1, %esi +; AVX1-NEXT: xorl %ecx, %ecx +; AVX1-NEXT: movl %esi, %edx +; AVX1-NEXT: subw %ax, %dx +; AVX1-NEXT: setns %cl +; AVX1-NEXT: addl $32767, %ecx # imm = 0x7FFF +; AVX1-NEXT: subw %ax, %si +; AVX1-NEXT: cmovol %ecx, %esi +; AVX1-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX1-NEXT: vpextrw $5, %xmm3, %eax +; AVX1-NEXT: vpextrw $5, %xmm1, %esi +; AVX1-NEXT: xorl %ecx, %ecx +; AVX1-NEXT: movl %esi, %edx +; AVX1-NEXT: subw %ax, %dx +; AVX1-NEXT: setns %cl +; AVX1-NEXT: addl $32767, %ecx # imm = 0x7FFF +; AVX1-NEXT: subw %ax, %si +; AVX1-NEXT: cmovol %ecx, %esi +; AVX1-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX1-NEXT: vpextrw $4, %xmm3, %eax +; AVX1-NEXT: vpextrw $4, %xmm1, %esi +; AVX1-NEXT: xorl %ecx, %ecx +; AVX1-NEXT: movl %esi, %edx +; AVX1-NEXT: subw %ax, %dx +; AVX1-NEXT: setns %cl +; AVX1-NEXT: addl $32767, %ecx # imm = 0x7FFF +; AVX1-NEXT: subw %ax, %si +; AVX1-NEXT: cmovol %ecx, %esi +; AVX1-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX1-NEXT: vpextrw $3, %xmm3, %eax +; AVX1-NEXT: vpextrw $3, %xmm1, %esi +; AVX1-NEXT: xorl %ecx, %ecx +; AVX1-NEXT: movl %esi, %edx +; AVX1-NEXT: subw %ax, %dx +; AVX1-NEXT: setns %cl +; AVX1-NEXT: addl $32767, %ecx # imm = 0x7FFF +; AVX1-NEXT: subw %ax, %si +; AVX1-NEXT: cmovol %ecx, %esi +; AVX1-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX1-NEXT: vpextrw $2, %xmm3, %eax +; AVX1-NEXT: vpextrw $2, %xmm1, %esi +; AVX1-NEXT: xorl %ecx, %ecx +; AVX1-NEXT: movl %esi, %edx +; AVX1-NEXT: subw %ax, %dx +; AVX1-NEXT: setns %cl +; AVX1-NEXT: addl $32767, %ecx # imm = 0x7FFF +; AVX1-NEXT: subw %ax, %si +; AVX1-NEXT: cmovol %ecx, %esi +; AVX1-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX1-NEXT: vmovd %xmm3, %eax +; AVX1-NEXT: vmovd %xmm1, %esi +; AVX1-NEXT: xorl %ecx, %ecx +; AVX1-NEXT: movl %esi, %edx +; AVX1-NEXT: subw %ax, %dx +; AVX1-NEXT: setns %cl +; AVX1-NEXT: addl $32767, %ecx # imm = 0x7FFF +; AVX1-NEXT: subw %ax, %si +; AVX1-NEXT: cmovol %ecx, %esi +; AVX1-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX1-NEXT: vpextrw $1, %xmm3, %eax +; AVX1-NEXT: vpextrw $1, %xmm1, %esi +; AVX1-NEXT: xorl %ecx, %ecx +; AVX1-NEXT: movl %esi, %edx +; AVX1-NEXT: subw %ax, %dx +; AVX1-NEXT: setns %cl +; AVX1-NEXT: addl $32767, %ecx # imm = 0x7FFF +; AVX1-NEXT: subw %ax, %si +; AVX1-NEXT: cmovol %ecx, %esi +; AVX1-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill 
+; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm3 +; AVX1-NEXT: vpextrw $7, %xmm3, %eax +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 +; AVX1-NEXT: vpextrw $7, %xmm1, %esi +; AVX1-NEXT: xorl %ecx, %ecx +; AVX1-NEXT: movl %esi, %edx +; AVX1-NEXT: subw %ax, %dx +; AVX1-NEXT: setns %cl +; AVX1-NEXT: addl $32767, %ecx # imm = 0x7FFF +; AVX1-NEXT: subw %ax, %si +; AVX1-NEXT: cmovol %ecx, %esi +; AVX1-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX1-NEXT: vpextrw $6, %xmm3, %eax +; AVX1-NEXT: vpextrw $6, %xmm1, %esi +; AVX1-NEXT: xorl %ecx, %ecx +; AVX1-NEXT: movl %esi, %edx +; AVX1-NEXT: subw %ax, %dx +; AVX1-NEXT: setns %cl +; AVX1-NEXT: addl $32767, %ecx # imm = 0x7FFF +; AVX1-NEXT: subw %ax, %si +; AVX1-NEXT: cmovol %ecx, %esi +; AVX1-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX1-NEXT: vpextrw $5, %xmm3, %eax +; AVX1-NEXT: vpextrw $5, %xmm1, %esi +; AVX1-NEXT: xorl %ecx, %ecx +; AVX1-NEXT: movl %esi, %edx +; AVX1-NEXT: subw %ax, %dx +; AVX1-NEXT: setns %cl +; AVX1-NEXT: addl $32767, %ecx # imm = 0x7FFF +; AVX1-NEXT: subw %ax, %si +; AVX1-NEXT: cmovol %ecx, %esi +; AVX1-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX1-NEXT: vpextrw $4, %xmm3, %eax +; AVX1-NEXT: vpextrw $4, %xmm1, %esi +; AVX1-NEXT: xorl %ecx, %ecx +; AVX1-NEXT: movl %esi, %edx +; AVX1-NEXT: subw %ax, %dx +; AVX1-NEXT: setns %cl +; AVX1-NEXT: addl $32767, %ecx # imm = 0x7FFF +; AVX1-NEXT: subw %ax, %si +; AVX1-NEXT: cmovol %ecx, %esi +; AVX1-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX1-NEXT: vpextrw $3, %xmm3, %eax +; AVX1-NEXT: vpextrw $3, %xmm1, %esi +; AVX1-NEXT: xorl %ecx, %ecx +; AVX1-NEXT: movl %esi, %edx +; AVX1-NEXT: subw %ax, %dx +; AVX1-NEXT: setns %cl +; AVX1-NEXT: addl $32767, %ecx # imm = 0x7FFF +; AVX1-NEXT: subw %ax, %si +; AVX1-NEXT: cmovol %ecx, %esi +; AVX1-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX1-NEXT: vpextrw $2, %xmm3, %eax +; AVX1-NEXT: vpextrw $2, %xmm1, %esi +; AVX1-NEXT: xorl %ecx, %ecx +; AVX1-NEXT: movl %esi, %edx +; AVX1-NEXT: subw %ax, %dx +; AVX1-NEXT: setns %cl +; AVX1-NEXT: addl $32767, %ecx # imm = 0x7FFF +; AVX1-NEXT: subw %ax, %si +; AVX1-NEXT: cmovol %ecx, %esi +; AVX1-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX1-NEXT: vmovd %xmm3, %eax +; AVX1-NEXT: vmovd %xmm1, %esi +; AVX1-NEXT: xorl %ecx, %ecx +; AVX1-NEXT: movl %esi, %edx +; AVX1-NEXT: subw %ax, %dx +; AVX1-NEXT: setns %cl +; AVX1-NEXT: addl $32767, %ecx # imm = 0x7FFF +; AVX1-NEXT: subw %ax, %si +; AVX1-NEXT: cmovol %ecx, %esi +; AVX1-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX1-NEXT: vpextrw $1, %xmm3, %eax +; AVX1-NEXT: vpextrw $1, %xmm1, %esi +; AVX1-NEXT: xorl %ecx, %ecx +; AVX1-NEXT: movl %esi, %edx +; AVX1-NEXT: subw %ax, %dx +; AVX1-NEXT: setns %cl +; AVX1-NEXT: addl $32767, %ecx # imm = 0x7FFF +; AVX1-NEXT: subw %ax, %si +; AVX1-NEXT: cmovol %ecx, %esi +; AVX1-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX1-NEXT: vpextrw $7, %xmm2, %eax +; AVX1-NEXT: vpextrw $7, %xmm0, %esi +; AVX1-NEXT: xorl %ecx, %ecx +; AVX1-NEXT: movl %esi, %edx +; AVX1-NEXT: subw %ax, %dx +; AVX1-NEXT: setns %cl +; AVX1-NEXT: addl $32767, %ecx # imm = 0x7FFF +; AVX1-NEXT: subw %ax, %si +; AVX1-NEXT: cmovol %ecx, %esi +; AVX1-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX1-NEXT: vpextrw $6, %xmm2, %eax +; AVX1-NEXT: vpextrw $6, %xmm0, %esi +; AVX1-NEXT: xorl %ecx, %ecx +; AVX1-NEXT: movl %esi, %edx +; AVX1-NEXT: subw %ax, %dx +; AVX1-NEXT: setns %cl +; AVX1-NEXT: addl $32767, %ecx # imm = 0x7FFF +; 
AVX1-NEXT: subw %ax, %si +; AVX1-NEXT: cmovol %ecx, %esi +; AVX1-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX1-NEXT: vpextrw $5, %xmm2, %eax +; AVX1-NEXT: vpextrw $5, %xmm0, %esi +; AVX1-NEXT: xorl %ecx, %ecx +; AVX1-NEXT: movl %esi, %edx +; AVX1-NEXT: subw %ax, %dx +; AVX1-NEXT: setns %cl +; AVX1-NEXT: addl $32767, %ecx # imm = 0x7FFF +; AVX1-NEXT: subw %ax, %si +; AVX1-NEXT: cmovol %ecx, %esi +; AVX1-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX1-NEXT: vpextrw $4, %xmm2, %eax +; AVX1-NEXT: vpextrw $4, %xmm0, %esi +; AVX1-NEXT: xorl %ecx, %ecx +; AVX1-NEXT: movl %esi, %edx +; AVX1-NEXT: subw %ax, %dx +; AVX1-NEXT: setns %cl +; AVX1-NEXT: addl $32767, %ecx # imm = 0x7FFF +; AVX1-NEXT: subw %ax, %si +; AVX1-NEXT: cmovol %ecx, %esi +; AVX1-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX1-NEXT: vpextrw $3, %xmm2, %eax +; AVX1-NEXT: vpextrw $3, %xmm0, %ebx +; AVX1-NEXT: xorl %ecx, %ecx +; AVX1-NEXT: movl %ebx, %edx +; AVX1-NEXT: subw %ax, %dx +; AVX1-NEXT: setns %cl +; AVX1-NEXT: addl $32767, %ecx # imm = 0x7FFF +; AVX1-NEXT: subw %ax, %bx +; AVX1-NEXT: cmovol %ecx, %ebx +; AVX1-NEXT: vpextrw $2, %xmm2, %eax +; AVX1-NEXT: vpextrw $2, %xmm0, %r11d +; AVX1-NEXT: xorl %ecx, %ecx +; AVX1-NEXT: movl %r11d, %edx +; AVX1-NEXT: subw %ax, %dx +; AVX1-NEXT: setns %cl +; AVX1-NEXT: addl $32767, %ecx # imm = 0x7FFF +; AVX1-NEXT: subw %ax, %r11w +; AVX1-NEXT: cmovol %ecx, %r11d +; AVX1-NEXT: vmovd %xmm2, %eax +; AVX1-NEXT: vmovd %xmm0, %r10d +; AVX1-NEXT: xorl %ecx, %ecx +; AVX1-NEXT: movl %r10d, %edx +; AVX1-NEXT: subw %ax, %dx +; AVX1-NEXT: setns %cl +; AVX1-NEXT: addl $32767, %ecx # imm = 0x7FFF +; AVX1-NEXT: subw %ax, %r10w +; AVX1-NEXT: cmovol %ecx, %r10d +; AVX1-NEXT: vpextrw $1, %xmm2, %eax +; AVX1-NEXT: vpextrw $1, %xmm0, %r9d +; AVX1-NEXT: xorl %ecx, %ecx +; AVX1-NEXT: movl %r9d, %edx +; AVX1-NEXT: subw %ax, %dx +; AVX1-NEXT: setns %cl +; AVX1-NEXT: addl $32767, %ecx # imm = 0x7FFF +; AVX1-NEXT: subw %ax, %r9w +; AVX1-NEXT: cmovol %ecx, %r9d +; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm1 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX1-NEXT: vpextrw $7, %xmm1, %eax +; AVX1-NEXT: vpextrw $7, %xmm0, %r8d +; AVX1-NEXT: xorl %ecx, %ecx +; AVX1-NEXT: movl %r8d, %edx +; AVX1-NEXT: subw %ax, %dx +; AVX1-NEXT: setns %cl +; AVX1-NEXT: addl $32767, %ecx # imm = 0x7FFF +; AVX1-NEXT: subw %ax, %r8w +; AVX1-NEXT: cmovol %ecx, %r8d +; AVX1-NEXT: vpextrw $6, %xmm1, %eax +; AVX1-NEXT: vpextrw $6, %xmm0, %edi +; AVX1-NEXT: xorl %ecx, %ecx +; AVX1-NEXT: movl %edi, %edx +; AVX1-NEXT: subw %ax, %dx +; AVX1-NEXT: setns %cl +; AVX1-NEXT: addl $32767, %ecx # imm = 0x7FFF +; AVX1-NEXT: subw %ax, %di +; AVX1-NEXT: cmovol %ecx, %edi +; AVX1-NEXT: vpextrw $5, %xmm1, %eax +; AVX1-NEXT: vpextrw $5, %xmm0, %esi +; AVX1-NEXT: xorl %ecx, %ecx +; AVX1-NEXT: movl %esi, %edx +; AVX1-NEXT: subw %ax, %dx +; AVX1-NEXT: setns %cl +; AVX1-NEXT: addl $32767, %ecx # imm = 0x7FFF +; AVX1-NEXT: subw %ax, %si +; AVX1-NEXT: cmovol %ecx, %esi +; AVX1-NEXT: vpextrw $4, %xmm1, %eax +; AVX1-NEXT: vpextrw $4, %xmm0, %edx +; AVX1-NEXT: xorl %ecx, %ecx +; AVX1-NEXT: movl %edx, %r13d +; AVX1-NEXT: subw %ax, %r13w +; AVX1-NEXT: setns %cl +; AVX1-NEXT: addl $32767, %ecx # imm = 0x7FFF +; AVX1-NEXT: subw %ax, %dx +; AVX1-NEXT: cmovol %ecx, %edx +; AVX1-NEXT: vpextrw $3, %xmm1, %eax +; AVX1-NEXT: vpextrw $3, %xmm0, %r13d +; AVX1-NEXT: xorl %ecx, %ecx +; AVX1-NEXT: movl %r13d, %r12d +; AVX1-NEXT: subw %ax, %r12w +; AVX1-NEXT: setns %cl +; AVX1-NEXT: addl $32767, %ecx # imm = 0x7FFF +; AVX1-NEXT: subw 
%ax, %r13w +; AVX1-NEXT: cmovol %ecx, %r13d +; AVX1-NEXT: vpextrw $2, %xmm1, %r12d +; AVX1-NEXT: vpextrw $2, %xmm0, %eax +; AVX1-NEXT: xorl %ecx, %ecx +; AVX1-NEXT: movl %eax, %r15d +; AVX1-NEXT: subw %r12w, %r15w +; AVX1-NEXT: setns %cl +; AVX1-NEXT: addl $32767, %ecx # imm = 0x7FFF +; AVX1-NEXT: subw %r12w, %ax +; AVX1-NEXT: cmovol %ecx, %eax +; AVX1-NEXT: vmovd %xmm1, %r15d +; AVX1-NEXT: vmovd %xmm0, %r12d +; AVX1-NEXT: xorl %ecx, %ecx +; AVX1-NEXT: movl %r12d, %r14d +; AVX1-NEXT: subw %r15w, %r14w +; AVX1-NEXT: setns %cl +; AVX1-NEXT: addl $32767, %ecx # imm = 0x7FFF +; AVX1-NEXT: subw %r15w, %r12w +; AVX1-NEXT: cmovol %ecx, %r12d +; AVX1-NEXT: vpextrw $1, %xmm1, %r14d +; AVX1-NEXT: vpextrw $1, %xmm0, %ecx +; AVX1-NEXT: xorl %r15d, %r15d +; AVX1-NEXT: movl %ecx, %ebp +; AVX1-NEXT: subw %r14w, %bp +; AVX1-NEXT: setns %r15b +; AVX1-NEXT: addl $32767, %r15d # imm = 0x7FFF +; AVX1-NEXT: subw %r14w, %cx +; AVX1-NEXT: cmovol %r15d, %ecx +; AVX1-NEXT: vmovd %r12d, %xmm0 +; AVX1-NEXT: vpinsrw $1, %ecx, %xmm0, %xmm0 +; AVX1-NEXT: vpinsrw $2, %eax, %xmm0, %xmm0 +; AVX1-NEXT: vpinsrw $3, %r13d, %xmm0, %xmm0 +; AVX1-NEXT: vpinsrw $4, %edx, %xmm0, %xmm0 +; AVX1-NEXT: vpinsrw $5, %esi, %xmm0, %xmm0 +; AVX1-NEXT: vpinsrw $6, %edi, %xmm0, %xmm0 +; AVX1-NEXT: vpinsrw $7, %r8d, %xmm0, %xmm0 +; AVX1-NEXT: vmovd %r10d, %xmm1 +; AVX1-NEXT: vpinsrw $1, %r9d, %xmm1, %xmm1 +; AVX1-NEXT: vpinsrw $2, %r11d, %xmm1, %xmm1 +; AVX1-NEXT: vpinsrw $3, %ebx, %xmm1, %xmm1 +; AVX1-NEXT: vpinsrw $4, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 4-byte Folded Reload +; AVX1-NEXT: vpinsrw $5, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 4-byte Folded Reload +; AVX1-NEXT: vpinsrw $6, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 4-byte Folded Reload +; AVX1-NEXT: vpinsrw $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 4-byte Folded Reload +; AVX1-NEXT: vmovd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 4-byte Folded Reload +; AVX1-NEXT: # xmm2 = mem[0],zero,zero,zero +; AVX1-NEXT: vpinsrw $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 4-byte Folded Reload +; AVX1-NEXT: vpinsrw $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 4-byte Folded Reload +; AVX1-NEXT: vpinsrw $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 4-byte Folded Reload +; AVX1-NEXT: vpinsrw $4, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 4-byte Folded Reload +; AVX1-NEXT: vpinsrw $5, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 4-byte Folded Reload +; AVX1-NEXT: vpinsrw $6, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 4-byte Folded Reload +; AVX1-NEXT: vpinsrw $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 4-byte Folded Reload +; AVX1-NEXT: vmovd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 4-byte Folded Reload +; AVX1-NEXT: # xmm3 = mem[0],zero,zero,zero +; AVX1-NEXT: vpinsrw $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 4-byte Folded Reload +; AVX1-NEXT: vpinsrw $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 4-byte Folded Reload +; AVX1-NEXT: vpinsrw $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 4-byte Folded Reload +; AVX1-NEXT: vpinsrw $4, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 4-byte Folded Reload +; AVX1-NEXT: vpinsrw $5, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 4-byte Folded Reload +; AVX1-NEXT: vpinsrw $6, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 4-byte Folded Reload +; AVX1-NEXT: vpinsrw $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 4-byte Folded Reload +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm1 +; AVX1-NEXT: popq %rbx +; AVX1-NEXT: popq %r12 +; AVX1-NEXT: popq %r13 +; AVX1-NEXT: popq %r14 +; AVX1-NEXT: popq %r15 +; AVX1-NEXT: popq %rbp +; AVX1-NEXT: retq 
+; +; AVX2-LABEL: v32i16: +; AVX2: # %bb.0: +; AVX2-NEXT: pushq %rbp +; AVX2-NEXT: pushq %r15 +; AVX2-NEXT: pushq %r14 +; AVX2-NEXT: pushq %r13 +; AVX2-NEXT: pushq %r12 +; AVX2-NEXT: pushq %rbx +; AVX2-NEXT: vpextrw $7, %xmm3, %eax +; AVX2-NEXT: vpextrw $7, %xmm1, %esi +; AVX2-NEXT: xorl %ecx, %ecx +; AVX2-NEXT: movl %esi, %edx +; AVX2-NEXT: subw %ax, %dx +; AVX2-NEXT: setns %cl +; AVX2-NEXT: addl $32767, %ecx # imm = 0x7FFF +; AVX2-NEXT: subw %ax, %si +; AVX2-NEXT: cmovol %ecx, %esi +; AVX2-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX2-NEXT: vpextrw $6, %xmm3, %eax +; AVX2-NEXT: vpextrw $6, %xmm1, %esi +; AVX2-NEXT: xorl %ecx, %ecx +; AVX2-NEXT: movl %esi, %edx +; AVX2-NEXT: subw %ax, %dx +; AVX2-NEXT: setns %cl +; AVX2-NEXT: addl $32767, %ecx # imm = 0x7FFF +; AVX2-NEXT: subw %ax, %si +; AVX2-NEXT: cmovol %ecx, %esi +; AVX2-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX2-NEXT: vpextrw $5, %xmm3, %eax +; AVX2-NEXT: vpextrw $5, %xmm1, %esi +; AVX2-NEXT: xorl %ecx, %ecx +; AVX2-NEXT: movl %esi, %edx +; AVX2-NEXT: subw %ax, %dx +; AVX2-NEXT: setns %cl +; AVX2-NEXT: addl $32767, %ecx # imm = 0x7FFF +; AVX2-NEXT: subw %ax, %si +; AVX2-NEXT: cmovol %ecx, %esi +; AVX2-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX2-NEXT: vpextrw $4, %xmm3, %eax +; AVX2-NEXT: vpextrw $4, %xmm1, %esi +; AVX2-NEXT: xorl %ecx, %ecx +; AVX2-NEXT: movl %esi, %edx +; AVX2-NEXT: subw %ax, %dx +; AVX2-NEXT: setns %cl +; AVX2-NEXT: addl $32767, %ecx # imm = 0x7FFF +; AVX2-NEXT: subw %ax, %si +; AVX2-NEXT: cmovol %ecx, %esi +; AVX2-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX2-NEXT: vpextrw $3, %xmm3, %eax +; AVX2-NEXT: vpextrw $3, %xmm1, %esi +; AVX2-NEXT: xorl %ecx, %ecx +; AVX2-NEXT: movl %esi, %edx +; AVX2-NEXT: subw %ax, %dx +; AVX2-NEXT: setns %cl +; AVX2-NEXT: addl $32767, %ecx # imm = 0x7FFF +; AVX2-NEXT: subw %ax, %si +; AVX2-NEXT: cmovol %ecx, %esi +; AVX2-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX2-NEXT: vpextrw $2, %xmm3, %eax +; AVX2-NEXT: vpextrw $2, %xmm1, %esi +; AVX2-NEXT: xorl %ecx, %ecx +; AVX2-NEXT: movl %esi, %edx +; AVX2-NEXT: subw %ax, %dx +; AVX2-NEXT: setns %cl +; AVX2-NEXT: addl $32767, %ecx # imm = 0x7FFF +; AVX2-NEXT: subw %ax, %si +; AVX2-NEXT: cmovol %ecx, %esi +; AVX2-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX2-NEXT: vmovd %xmm3, %eax +; AVX2-NEXT: vmovd %xmm1, %esi +; AVX2-NEXT: xorl %ecx, %ecx +; AVX2-NEXT: movl %esi, %edx +; AVX2-NEXT: subw %ax, %dx +; AVX2-NEXT: setns %cl +; AVX2-NEXT: addl $32767, %ecx # imm = 0x7FFF +; AVX2-NEXT: subw %ax, %si +; AVX2-NEXT: cmovol %ecx, %esi +; AVX2-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX2-NEXT: vpextrw $1, %xmm3, %eax +; AVX2-NEXT: vpextrw $1, %xmm1, %esi +; AVX2-NEXT: xorl %ecx, %ecx +; AVX2-NEXT: movl %esi, %edx +; AVX2-NEXT: subw %ax, %dx +; AVX2-NEXT: setns %cl +; AVX2-NEXT: addl $32767, %ecx # imm = 0x7FFF +; AVX2-NEXT: subw %ax, %si +; AVX2-NEXT: cmovol %ecx, %esi +; AVX2-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm3 +; AVX2-NEXT: vpextrw $7, %xmm3, %eax +; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm1 +; AVX2-NEXT: vpextrw $7, %xmm1, %esi +; AVX2-NEXT: xorl %ecx, %ecx +; AVX2-NEXT: movl %esi, %edx +; AVX2-NEXT: subw %ax, %dx +; AVX2-NEXT: setns %cl +; AVX2-NEXT: addl $32767, %ecx # imm = 0x7FFF +; AVX2-NEXT: subw %ax, %si +; AVX2-NEXT: cmovol %ecx, %esi +; AVX2-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX2-NEXT: vpextrw $6, %xmm3, %eax +; 
AVX2-NEXT: vpextrw $6, %xmm1, %esi +; AVX2-NEXT: xorl %ecx, %ecx +; AVX2-NEXT: movl %esi, %edx +; AVX2-NEXT: subw %ax, %dx +; AVX2-NEXT: setns %cl +; AVX2-NEXT: addl $32767, %ecx # imm = 0x7FFF +; AVX2-NEXT: subw %ax, %si +; AVX2-NEXT: cmovol %ecx, %esi +; AVX2-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX2-NEXT: vpextrw $5, %xmm3, %eax +; AVX2-NEXT: vpextrw $5, %xmm1, %esi +; AVX2-NEXT: xorl %ecx, %ecx +; AVX2-NEXT: movl %esi, %edx +; AVX2-NEXT: subw %ax, %dx +; AVX2-NEXT: setns %cl +; AVX2-NEXT: addl $32767, %ecx # imm = 0x7FFF +; AVX2-NEXT: subw %ax, %si +; AVX2-NEXT: cmovol %ecx, %esi +; AVX2-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX2-NEXT: vpextrw $4, %xmm3, %eax +; AVX2-NEXT: vpextrw $4, %xmm1, %esi +; AVX2-NEXT: xorl %ecx, %ecx +; AVX2-NEXT: movl %esi, %edx +; AVX2-NEXT: subw %ax, %dx +; AVX2-NEXT: setns %cl +; AVX2-NEXT: addl $32767, %ecx # imm = 0x7FFF +; AVX2-NEXT: subw %ax, %si +; AVX2-NEXT: cmovol %ecx, %esi +; AVX2-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX2-NEXT: vpextrw $3, %xmm3, %eax +; AVX2-NEXT: vpextrw $3, %xmm1, %esi +; AVX2-NEXT: xorl %ecx, %ecx +; AVX2-NEXT: movl %esi, %edx +; AVX2-NEXT: subw %ax, %dx +; AVX2-NEXT: setns %cl +; AVX2-NEXT: addl $32767, %ecx # imm = 0x7FFF +; AVX2-NEXT: subw %ax, %si +; AVX2-NEXT: cmovol %ecx, %esi +; AVX2-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX2-NEXT: vpextrw $2, %xmm3, %eax +; AVX2-NEXT: vpextrw $2, %xmm1, %esi +; AVX2-NEXT: xorl %ecx, %ecx +; AVX2-NEXT: movl %esi, %edx +; AVX2-NEXT: subw %ax, %dx +; AVX2-NEXT: setns %cl +; AVX2-NEXT: addl $32767, %ecx # imm = 0x7FFF +; AVX2-NEXT: subw %ax, %si +; AVX2-NEXT: cmovol %ecx, %esi +; AVX2-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX2-NEXT: vmovd %xmm3, %eax +; AVX2-NEXT: vmovd %xmm1, %esi +; AVX2-NEXT: xorl %ecx, %ecx +; AVX2-NEXT: movl %esi, %edx +; AVX2-NEXT: subw %ax, %dx +; AVX2-NEXT: setns %cl +; AVX2-NEXT: addl $32767, %ecx # imm = 0x7FFF +; AVX2-NEXT: subw %ax, %si +; AVX2-NEXT: cmovol %ecx, %esi +; AVX2-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX2-NEXT: vpextrw $1, %xmm3, %eax +; AVX2-NEXT: vpextrw $1, %xmm1, %esi +; AVX2-NEXT: xorl %ecx, %ecx +; AVX2-NEXT: movl %esi, %edx +; AVX2-NEXT: subw %ax, %dx +; AVX2-NEXT: setns %cl +; AVX2-NEXT: addl $32767, %ecx # imm = 0x7FFF +; AVX2-NEXT: subw %ax, %si +; AVX2-NEXT: cmovol %ecx, %esi +; AVX2-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX2-NEXT: vpextrw $7, %xmm2, %eax +; AVX2-NEXT: vpextrw $7, %xmm0, %esi +; AVX2-NEXT: xorl %ecx, %ecx +; AVX2-NEXT: movl %esi, %edx +; AVX2-NEXT: subw %ax, %dx +; AVX2-NEXT: setns %cl +; AVX2-NEXT: addl $32767, %ecx # imm = 0x7FFF +; AVX2-NEXT: subw %ax, %si +; AVX2-NEXT: cmovol %ecx, %esi +; AVX2-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX2-NEXT: vpextrw $6, %xmm2, %eax +; AVX2-NEXT: vpextrw $6, %xmm0, %esi +; AVX2-NEXT: xorl %ecx, %ecx +; AVX2-NEXT: movl %esi, %edx +; AVX2-NEXT: subw %ax, %dx +; AVX2-NEXT: setns %cl +; AVX2-NEXT: addl $32767, %ecx # imm = 0x7FFF +; AVX2-NEXT: subw %ax, %si +; AVX2-NEXT: cmovol %ecx, %esi +; AVX2-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX2-NEXT: vpextrw $5, %xmm2, %eax +; AVX2-NEXT: vpextrw $5, %xmm0, %esi +; AVX2-NEXT: xorl %ecx, %ecx +; AVX2-NEXT: movl %esi, %edx +; AVX2-NEXT: subw %ax, %dx +; AVX2-NEXT: setns %cl +; AVX2-NEXT: addl $32767, %ecx # imm = 0x7FFF +; AVX2-NEXT: subw %ax, %si +; AVX2-NEXT: cmovol %ecx, %esi +; AVX2-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; 
AVX2-NEXT: vpextrw $4, %xmm2, %eax +; AVX2-NEXT: vpextrw $4, %xmm0, %esi +; AVX2-NEXT: xorl %ecx, %ecx +; AVX2-NEXT: movl %esi, %edx +; AVX2-NEXT: subw %ax, %dx +; AVX2-NEXT: setns %cl +; AVX2-NEXT: addl $32767, %ecx # imm = 0x7FFF +; AVX2-NEXT: subw %ax, %si +; AVX2-NEXT: cmovol %ecx, %esi +; AVX2-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX2-NEXT: vpextrw $3, %xmm2, %eax +; AVX2-NEXT: vpextrw $3, %xmm0, %ebx +; AVX2-NEXT: xorl %ecx, %ecx +; AVX2-NEXT: movl %ebx, %edx +; AVX2-NEXT: subw %ax, %dx +; AVX2-NEXT: setns %cl +; AVX2-NEXT: addl $32767, %ecx # imm = 0x7FFF +; AVX2-NEXT: subw %ax, %bx +; AVX2-NEXT: cmovol %ecx, %ebx +; AVX2-NEXT: vpextrw $2, %xmm2, %eax +; AVX2-NEXT: vpextrw $2, %xmm0, %r11d +; AVX2-NEXT: xorl %ecx, %ecx +; AVX2-NEXT: movl %r11d, %edx +; AVX2-NEXT: subw %ax, %dx +; AVX2-NEXT: setns %cl +; AVX2-NEXT: addl $32767, %ecx # imm = 0x7FFF +; AVX2-NEXT: subw %ax, %r11w +; AVX2-NEXT: cmovol %ecx, %r11d +; AVX2-NEXT: vmovd %xmm2, %eax +; AVX2-NEXT: vmovd %xmm0, %r10d +; AVX2-NEXT: xorl %ecx, %ecx +; AVX2-NEXT: movl %r10d, %edx +; AVX2-NEXT: subw %ax, %dx +; AVX2-NEXT: setns %cl +; AVX2-NEXT: addl $32767, %ecx # imm = 0x7FFF +; AVX2-NEXT: subw %ax, %r10w +; AVX2-NEXT: cmovol %ecx, %r10d +; AVX2-NEXT: vpextrw $1, %xmm2, %eax +; AVX2-NEXT: vpextrw $1, %xmm0, %r9d +; AVX2-NEXT: xorl %ecx, %ecx +; AVX2-NEXT: movl %r9d, %edx +; AVX2-NEXT: subw %ax, %dx +; AVX2-NEXT: setns %cl +; AVX2-NEXT: addl $32767, %ecx # imm = 0x7FFF +; AVX2-NEXT: subw %ax, %r9w +; AVX2-NEXT: cmovol %ecx, %r9d +; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm1 +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX2-NEXT: vpextrw $7, %xmm1, %eax +; AVX2-NEXT: vpextrw $7, %xmm0, %r8d +; AVX2-NEXT: xorl %ecx, %ecx +; AVX2-NEXT: movl %r8d, %edx +; AVX2-NEXT: subw %ax, %dx +; AVX2-NEXT: setns %cl +; AVX2-NEXT: addl $32767, %ecx # imm = 0x7FFF +; AVX2-NEXT: subw %ax, %r8w +; AVX2-NEXT: cmovol %ecx, %r8d +; AVX2-NEXT: vpextrw $6, %xmm1, %eax +; AVX2-NEXT: vpextrw $6, %xmm0, %edi +; AVX2-NEXT: xorl %ecx, %ecx +; AVX2-NEXT: movl %edi, %edx +; AVX2-NEXT: subw %ax, %dx +; AVX2-NEXT: setns %cl +; AVX2-NEXT: addl $32767, %ecx # imm = 0x7FFF +; AVX2-NEXT: subw %ax, %di +; AVX2-NEXT: cmovol %ecx, %edi +; AVX2-NEXT: vpextrw $5, %xmm1, %eax +; AVX2-NEXT: vpextrw $5, %xmm0, %esi +; AVX2-NEXT: xorl %ecx, %ecx +; AVX2-NEXT: movl %esi, %edx +; AVX2-NEXT: subw %ax, %dx +; AVX2-NEXT: setns %cl +; AVX2-NEXT: addl $32767, %ecx # imm = 0x7FFF +; AVX2-NEXT: subw %ax, %si +; AVX2-NEXT: cmovol %ecx, %esi +; AVX2-NEXT: vpextrw $4, %xmm1, %eax +; AVX2-NEXT: vpextrw $4, %xmm0, %edx +; AVX2-NEXT: xorl %ecx, %ecx +; AVX2-NEXT: movl %edx, %r13d +; AVX2-NEXT: subw %ax, %r13w +; AVX2-NEXT: setns %cl +; AVX2-NEXT: addl $32767, %ecx # imm = 0x7FFF +; AVX2-NEXT: subw %ax, %dx +; AVX2-NEXT: cmovol %ecx, %edx +; AVX2-NEXT: vpextrw $3, %xmm1, %eax +; AVX2-NEXT: vpextrw $3, %xmm0, %r13d +; AVX2-NEXT: xorl %ecx, %ecx +; AVX2-NEXT: movl %r13d, %r12d +; AVX2-NEXT: subw %ax, %r12w +; AVX2-NEXT: setns %cl +; AVX2-NEXT: addl $32767, %ecx # imm = 0x7FFF +; AVX2-NEXT: subw %ax, %r13w +; AVX2-NEXT: cmovol %ecx, %r13d +; AVX2-NEXT: vpextrw $2, %xmm1, %r12d +; AVX2-NEXT: vpextrw $2, %xmm0, %eax +; AVX2-NEXT: xorl %ecx, %ecx +; AVX2-NEXT: movl %eax, %r15d +; AVX2-NEXT: subw %r12w, %r15w +; AVX2-NEXT: setns %cl +; AVX2-NEXT: addl $32767, %ecx # imm = 0x7FFF +; AVX2-NEXT: subw %r12w, %ax +; AVX2-NEXT: cmovol %ecx, %eax +; AVX2-NEXT: vmovd %xmm1, %r15d +; AVX2-NEXT: vmovd %xmm0, %r12d +; AVX2-NEXT: xorl %ecx, %ecx +; AVX2-NEXT: movl %r12d, %r14d +; 
AVX2-NEXT: subw %r15w, %r14w +; AVX2-NEXT: setns %cl +; AVX2-NEXT: addl $32767, %ecx # imm = 0x7FFF +; AVX2-NEXT: subw %r15w, %r12w +; AVX2-NEXT: cmovol %ecx, %r12d +; AVX2-NEXT: vpextrw $1, %xmm1, %r14d +; AVX2-NEXT: vpextrw $1, %xmm0, %ecx +; AVX2-NEXT: xorl %r15d, %r15d +; AVX2-NEXT: movl %ecx, %ebp +; AVX2-NEXT: subw %r14w, %bp +; AVX2-NEXT: setns %r15b +; AVX2-NEXT: addl $32767, %r15d # imm = 0x7FFF +; AVX2-NEXT: subw %r14w, %cx +; AVX2-NEXT: cmovol %r15d, %ecx +; AVX2-NEXT: vmovd %r12d, %xmm0 +; AVX2-NEXT: vpinsrw $1, %ecx, %xmm0, %xmm0 +; AVX2-NEXT: vpinsrw $2, %eax, %xmm0, %xmm0 +; AVX2-NEXT: vpinsrw $3, %r13d, %xmm0, %xmm0 +; AVX2-NEXT: vpinsrw $4, %edx, %xmm0, %xmm0 +; AVX2-NEXT: vpinsrw $5, %esi, %xmm0, %xmm0 +; AVX2-NEXT: vpinsrw $6, %edi, %xmm0, %xmm0 +; AVX2-NEXT: vpinsrw $7, %r8d, %xmm0, %xmm0 +; AVX2-NEXT: vmovd %r10d, %xmm1 +; AVX2-NEXT: vpinsrw $1, %r9d, %xmm1, %xmm1 +; AVX2-NEXT: vpinsrw $2, %r11d, %xmm1, %xmm1 +; AVX2-NEXT: vpinsrw $3, %ebx, %xmm1, %xmm1 +; AVX2-NEXT: vpinsrw $4, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 4-byte Folded Reload +; AVX2-NEXT: vpinsrw $5, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 4-byte Folded Reload +; AVX2-NEXT: vpinsrw $6, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 4-byte Folded Reload +; AVX2-NEXT: vpinsrw $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 4-byte Folded Reload +; AVX2-NEXT: vmovd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 4-byte Folded Reload +; AVX2-NEXT: # xmm2 = mem[0],zero,zero,zero +; AVX2-NEXT: vpinsrw $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 4-byte Folded Reload +; AVX2-NEXT: vpinsrw $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 4-byte Folded Reload +; AVX2-NEXT: vpinsrw $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 4-byte Folded Reload +; AVX2-NEXT: vpinsrw $4, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 4-byte Folded Reload +; AVX2-NEXT: vpinsrw $5, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 4-byte Folded Reload +; AVX2-NEXT: vpinsrw $6, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 4-byte Folded Reload +; AVX2-NEXT: vpinsrw $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 4-byte Folded Reload +; AVX2-NEXT: vmovd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 4-byte Folded Reload +; AVX2-NEXT: # xmm3 = mem[0],zero,zero,zero +; AVX2-NEXT: vpinsrw $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 4-byte Folded Reload +; AVX2-NEXT: vpinsrw $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 4-byte Folded Reload +; AVX2-NEXT: vpinsrw $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 4-byte Folded Reload +; AVX2-NEXT: vpinsrw $4, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 4-byte Folded Reload +; AVX2-NEXT: vpinsrw $5, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 4-byte Folded Reload +; AVX2-NEXT: vpinsrw $6, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 4-byte Folded Reload +; AVX2-NEXT: vpinsrw $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 4-byte Folded Reload +; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 +; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm3, %ymm1 +; AVX2-NEXT: popq %rbx +; AVX2-NEXT: popq %r12 +; AVX2-NEXT: popq %r13 +; AVX2-NEXT: popq %r14 +; AVX2-NEXT: popq %r15 +; AVX2-NEXT: popq %rbp +; AVX2-NEXT: retq +; +; AVX512-LABEL: v32i16: +; AVX512: # %bb.0: +; AVX512-NEXT: pushq %rbp +; AVX512-NEXT: pushq %r15 +; AVX512-NEXT: pushq %r14 +; AVX512-NEXT: pushq %r13 +; AVX512-NEXT: pushq %r12 +; AVX512-NEXT: pushq %rbx +; AVX512-NEXT: vpextrw $7, %xmm1, %eax +; AVX512-NEXT: vpextrw $7, %xmm0, %esi +; AVX512-NEXT: xorl %ecx, %ecx +; AVX512-NEXT: movl %esi, %edx +; AVX512-NEXT: subw %ax, %dx +; AVX512-NEXT: setns %cl +; AVX512-NEXT: addl $32767, %ecx # imm = 0x7FFF +; AVX512-NEXT: 
subw %ax, %si +; AVX512-NEXT: cmovol %ecx, %esi +; AVX512-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX512-NEXT: vpextrw $6, %xmm1, %eax +; AVX512-NEXT: vpextrw $6, %xmm0, %esi +; AVX512-NEXT: xorl %ecx, %ecx +; AVX512-NEXT: movl %esi, %edx +; AVX512-NEXT: subw %ax, %dx +; AVX512-NEXT: setns %cl +; AVX512-NEXT: addl $32767, %ecx # imm = 0x7FFF +; AVX512-NEXT: subw %ax, %si +; AVX512-NEXT: cmovol %ecx, %esi +; AVX512-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX512-NEXT: vpextrw $5, %xmm1, %eax +; AVX512-NEXT: vpextrw $5, %xmm0, %esi +; AVX512-NEXT: xorl %ecx, %ecx +; AVX512-NEXT: movl %esi, %edx +; AVX512-NEXT: subw %ax, %dx +; AVX512-NEXT: setns %cl +; AVX512-NEXT: addl $32767, %ecx # imm = 0x7FFF +; AVX512-NEXT: subw %ax, %si +; AVX512-NEXT: cmovol %ecx, %esi +; AVX512-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX512-NEXT: vpextrw $4, %xmm1, %eax +; AVX512-NEXT: vpextrw $4, %xmm0, %esi +; AVX512-NEXT: xorl %ecx, %ecx +; AVX512-NEXT: movl %esi, %edx +; AVX512-NEXT: subw %ax, %dx +; AVX512-NEXT: setns %cl +; AVX512-NEXT: addl $32767, %ecx # imm = 0x7FFF +; AVX512-NEXT: subw %ax, %si +; AVX512-NEXT: cmovol %ecx, %esi +; AVX512-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX512-NEXT: vpextrw $3, %xmm1, %eax +; AVX512-NEXT: vpextrw $3, %xmm0, %esi +; AVX512-NEXT: xorl %ecx, %ecx +; AVX512-NEXT: movl %esi, %edx +; AVX512-NEXT: subw %ax, %dx +; AVX512-NEXT: setns %cl +; AVX512-NEXT: addl $32767, %ecx # imm = 0x7FFF +; AVX512-NEXT: subw %ax, %si +; AVX512-NEXT: cmovol %ecx, %esi +; AVX512-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX512-NEXT: vpextrw $2, %xmm1, %eax +; AVX512-NEXT: vpextrw $2, %xmm0, %esi +; AVX512-NEXT: xorl %ecx, %ecx +; AVX512-NEXT: movl %esi, %edx +; AVX512-NEXT: subw %ax, %dx +; AVX512-NEXT: setns %cl +; AVX512-NEXT: addl $32767, %ecx # imm = 0x7FFF +; AVX512-NEXT: subw %ax, %si +; AVX512-NEXT: cmovol %ecx, %esi +; AVX512-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX512-NEXT: vmovd %xmm1, %eax +; AVX512-NEXT: vmovd %xmm0, %esi +; AVX512-NEXT: xorl %ecx, %ecx +; AVX512-NEXT: movl %esi, %edx +; AVX512-NEXT: subw %ax, %dx +; AVX512-NEXT: setns %cl +; AVX512-NEXT: addl $32767, %ecx # imm = 0x7FFF +; AVX512-NEXT: subw %ax, %si +; AVX512-NEXT: cmovol %ecx, %esi +; AVX512-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX512-NEXT: vpextrw $1, %xmm1, %eax +; AVX512-NEXT: vpextrw $1, %xmm0, %esi +; AVX512-NEXT: xorl %ecx, %ecx +; AVX512-NEXT: movl %esi, %edx +; AVX512-NEXT: subw %ax, %dx +; AVX512-NEXT: setns %cl +; AVX512-NEXT: addl $32767, %ecx # imm = 0x7FFF +; AVX512-NEXT: subw %ax, %si +; AVX512-NEXT: cmovol %ecx, %esi +; AVX512-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX512-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX512-NEXT: vpextrw $7, %xmm2, %eax +; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm3 +; AVX512-NEXT: vpextrw $7, %xmm3, %esi +; AVX512-NEXT: xorl %ecx, %ecx +; AVX512-NEXT: movl %esi, %edx +; AVX512-NEXT: subw %ax, %dx +; AVX512-NEXT: setns %cl +; AVX512-NEXT: addl $32767, %ecx # imm = 0x7FFF +; AVX512-NEXT: subw %ax, %si +; AVX512-NEXT: cmovol %ecx, %esi +; AVX512-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX512-NEXT: vpextrw $6, %xmm2, %eax +; AVX512-NEXT: vpextrw $6, %xmm3, %esi +; AVX512-NEXT: xorl %ecx, %ecx +; AVX512-NEXT: movl %esi, %edx +; AVX512-NEXT: subw %ax, %dx +; AVX512-NEXT: setns %cl +; AVX512-NEXT: addl $32767, %ecx # imm = 0x7FFF +; AVX512-NEXT: subw %ax, %si +; AVX512-NEXT: cmovol %ecx, %esi +; 
AVX512-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX512-NEXT: vpextrw $5, %xmm2, %eax +; AVX512-NEXT: vpextrw $5, %xmm3, %esi +; AVX512-NEXT: xorl %ecx, %ecx +; AVX512-NEXT: movl %esi, %edx +; AVX512-NEXT: subw %ax, %dx +; AVX512-NEXT: setns %cl +; AVX512-NEXT: addl $32767, %ecx # imm = 0x7FFF +; AVX512-NEXT: subw %ax, %si +; AVX512-NEXT: cmovol %ecx, %esi +; AVX512-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX512-NEXT: vpextrw $4, %xmm2, %eax +; AVX512-NEXT: vpextrw $4, %xmm3, %esi +; AVX512-NEXT: xorl %ecx, %ecx +; AVX512-NEXT: movl %esi, %edx +; AVX512-NEXT: subw %ax, %dx +; AVX512-NEXT: setns %cl +; AVX512-NEXT: addl $32767, %ecx # imm = 0x7FFF +; AVX512-NEXT: subw %ax, %si +; AVX512-NEXT: cmovol %ecx, %esi +; AVX512-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX512-NEXT: vpextrw $3, %xmm2, %eax +; AVX512-NEXT: vpextrw $3, %xmm3, %esi +; AVX512-NEXT: xorl %ecx, %ecx +; AVX512-NEXT: movl %esi, %edx +; AVX512-NEXT: subw %ax, %dx +; AVX512-NEXT: setns %cl +; AVX512-NEXT: addl $32767, %ecx # imm = 0x7FFF +; AVX512-NEXT: subw %ax, %si +; AVX512-NEXT: cmovol %ecx, %esi +; AVX512-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX512-NEXT: vpextrw $2, %xmm2, %eax +; AVX512-NEXT: vpextrw $2, %xmm3, %esi +; AVX512-NEXT: xorl %ecx, %ecx +; AVX512-NEXT: movl %esi, %edx +; AVX512-NEXT: subw %ax, %dx +; AVX512-NEXT: setns %cl +; AVX512-NEXT: addl $32767, %ecx # imm = 0x7FFF +; AVX512-NEXT: subw %ax, %si +; AVX512-NEXT: cmovol %ecx, %esi +; AVX512-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX512-NEXT: vmovd %xmm2, %eax +; AVX512-NEXT: vmovd %xmm3, %esi +; AVX512-NEXT: xorl %ecx, %ecx +; AVX512-NEXT: movl %esi, %edx +; AVX512-NEXT: subw %ax, %dx +; AVX512-NEXT: setns %cl +; AVX512-NEXT: addl $32767, %ecx # imm = 0x7FFF +; AVX512-NEXT: subw %ax, %si +; AVX512-NEXT: cmovol %ecx, %esi +; AVX512-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX512-NEXT: vpextrw $1, %xmm2, %eax +; AVX512-NEXT: vpextrw $1, %xmm3, %esi +; AVX512-NEXT: xorl %ecx, %ecx +; AVX512-NEXT: movl %esi, %edx +; AVX512-NEXT: subw %ax, %dx +; AVX512-NEXT: setns %cl +; AVX512-NEXT: addl $32767, %ecx # imm = 0x7FFF +; AVX512-NEXT: subw %ax, %si +; AVX512-NEXT: cmovol %ecx, %esi +; AVX512-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX512-NEXT: vextracti32x4 $2, %zmm1, %xmm2 +; AVX512-NEXT: vextracti32x4 $2, %zmm0, %xmm3 +; AVX512-NEXT: vpextrw $7, %xmm2, %eax +; AVX512-NEXT: vpextrw $7, %xmm3, %esi +; AVX512-NEXT: xorl %ecx, %ecx +; AVX512-NEXT: movl %esi, %edx +; AVX512-NEXT: subw %ax, %dx +; AVX512-NEXT: setns %cl +; AVX512-NEXT: addl $32767, %ecx # imm = 0x7FFF +; AVX512-NEXT: subw %ax, %si +; AVX512-NEXT: cmovol %ecx, %esi +; AVX512-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX512-NEXT: vpextrw $6, %xmm2, %eax +; AVX512-NEXT: vpextrw $6, %xmm3, %esi +; AVX512-NEXT: xorl %ecx, %ecx +; AVX512-NEXT: movl %esi, %edx +; AVX512-NEXT: subw %ax, %dx +; AVX512-NEXT: setns %cl +; AVX512-NEXT: addl $32767, %ecx # imm = 0x7FFF +; AVX512-NEXT: subw %ax, %si +; AVX512-NEXT: cmovol %ecx, %esi +; AVX512-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX512-NEXT: vpextrw $5, %xmm2, %eax +; AVX512-NEXT: vpextrw $5, %xmm3, %esi +; AVX512-NEXT: xorl %ecx, %ecx +; AVX512-NEXT: movl %esi, %edx +; AVX512-NEXT: subw %ax, %dx +; AVX512-NEXT: setns %cl +; AVX512-NEXT: addl $32767, %ecx # imm = 0x7FFF +; AVX512-NEXT: subw %ax, %si +; AVX512-NEXT: cmovol %ecx, %esi +; AVX512-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 
4-byte Spill +; AVX512-NEXT: vpextrw $4, %xmm2, %eax +; AVX512-NEXT: vpextrw $4, %xmm3, %esi +; AVX512-NEXT: xorl %ecx, %ecx +; AVX512-NEXT: movl %esi, %edx +; AVX512-NEXT: subw %ax, %dx +; AVX512-NEXT: setns %cl +; AVX512-NEXT: addl $32767, %ecx # imm = 0x7FFF +; AVX512-NEXT: subw %ax, %si +; AVX512-NEXT: cmovol %ecx, %esi +; AVX512-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX512-NEXT: vpextrw $3, %xmm2, %eax +; AVX512-NEXT: vpextrw $3, %xmm3, %ebx +; AVX512-NEXT: xorl %ecx, %ecx +; AVX512-NEXT: movl %ebx, %edx +; AVX512-NEXT: subw %ax, %dx +; AVX512-NEXT: setns %cl +; AVX512-NEXT: addl $32767, %ecx # imm = 0x7FFF +; AVX512-NEXT: subw %ax, %bx +; AVX512-NEXT: cmovol %ecx, %ebx +; AVX512-NEXT: vpextrw $2, %xmm2, %eax +; AVX512-NEXT: vpextrw $2, %xmm3, %r11d +; AVX512-NEXT: xorl %ecx, %ecx +; AVX512-NEXT: movl %r11d, %edx +; AVX512-NEXT: subw %ax, %dx +; AVX512-NEXT: setns %cl +; AVX512-NEXT: addl $32767, %ecx # imm = 0x7FFF +; AVX512-NEXT: subw %ax, %r11w +; AVX512-NEXT: cmovol %ecx, %r11d +; AVX512-NEXT: vmovd %xmm2, %eax +; AVX512-NEXT: vmovd %xmm3, %r10d +; AVX512-NEXT: xorl %ecx, %ecx +; AVX512-NEXT: movl %r10d, %edx +; AVX512-NEXT: subw %ax, %dx +; AVX512-NEXT: setns %cl +; AVX512-NEXT: addl $32767, %ecx # imm = 0x7FFF +; AVX512-NEXT: subw %ax, %r10w +; AVX512-NEXT: cmovol %ecx, %r10d +; AVX512-NEXT: vpextrw $1, %xmm2, %eax +; AVX512-NEXT: vpextrw $1, %xmm3, %r9d +; AVX512-NEXT: xorl %ecx, %ecx +; AVX512-NEXT: movl %r9d, %edx +; AVX512-NEXT: subw %ax, %dx +; AVX512-NEXT: setns %cl +; AVX512-NEXT: addl $32767, %ecx # imm = 0x7FFF +; AVX512-NEXT: subw %ax, %r9w +; AVX512-NEXT: cmovol %ecx, %r9d +; AVX512-NEXT: vextracti32x4 $3, %zmm1, %xmm1 +; AVX512-NEXT: vextracti32x4 $3, %zmm0, %xmm0 +; AVX512-NEXT: vpextrw $7, %xmm1, %eax +; AVX512-NEXT: vpextrw $7, %xmm0, %r8d +; AVX512-NEXT: xorl %ecx, %ecx +; AVX512-NEXT: movl %r8d, %edx +; AVX512-NEXT: subw %ax, %dx +; AVX512-NEXT: setns %cl +; AVX512-NEXT: addl $32767, %ecx # imm = 0x7FFF +; AVX512-NEXT: subw %ax, %r8w +; AVX512-NEXT: cmovol %ecx, %r8d +; AVX512-NEXT: vpextrw $6, %xmm1, %eax +; AVX512-NEXT: vpextrw $6, %xmm0, %edi +; AVX512-NEXT: xorl %ecx, %ecx +; AVX512-NEXT: movl %edi, %edx +; AVX512-NEXT: subw %ax, %dx +; AVX512-NEXT: setns %cl +; AVX512-NEXT: addl $32767, %ecx # imm = 0x7FFF +; AVX512-NEXT: subw %ax, %di +; AVX512-NEXT: cmovol %ecx, %edi +; AVX512-NEXT: vpextrw $5, %xmm1, %eax +; AVX512-NEXT: vpextrw $5, %xmm0, %esi +; AVX512-NEXT: xorl %ecx, %ecx +; AVX512-NEXT: movl %esi, %edx +; AVX512-NEXT: subw %ax, %dx +; AVX512-NEXT: setns %cl +; AVX512-NEXT: addl $32767, %ecx # imm = 0x7FFF +; AVX512-NEXT: subw %ax, %si +; AVX512-NEXT: cmovol %ecx, %esi +; AVX512-NEXT: vpextrw $4, %xmm1, %eax +; AVX512-NEXT: vpextrw $4, %xmm0, %edx +; AVX512-NEXT: xorl %ecx, %ecx +; AVX512-NEXT: movl %edx, %r13d +; AVX512-NEXT: subw %ax, %r13w +; AVX512-NEXT: setns %cl +; AVX512-NEXT: addl $32767, %ecx # imm = 0x7FFF +; AVX512-NEXT: subw %ax, %dx +; AVX512-NEXT: cmovol %ecx, %edx +; AVX512-NEXT: vpextrw $3, %xmm1, %eax +; AVX512-NEXT: vpextrw $3, %xmm0, %r13d +; AVX512-NEXT: xorl %ecx, %ecx +; AVX512-NEXT: movl %r13d, %r12d +; AVX512-NEXT: subw %ax, %r12w +; AVX512-NEXT: setns %cl +; AVX512-NEXT: addl $32767, %ecx # imm = 0x7FFF +; AVX512-NEXT: subw %ax, %r13w +; AVX512-NEXT: cmovol %ecx, %r13d +; AVX512-NEXT: vpextrw $2, %xmm1, %r12d +; AVX512-NEXT: vpextrw $2, %xmm0, %eax +; AVX512-NEXT: xorl %ecx, %ecx +; AVX512-NEXT: movl %eax, %r15d +; AVX512-NEXT: subw %r12w, %r15w +; AVX512-NEXT: setns %cl +; AVX512-NEXT: addl $32767, %ecx 
# imm = 0x7FFF +; AVX512-NEXT: subw %r12w, %ax +; AVX512-NEXT: cmovol %ecx, %eax +; AVX512-NEXT: vmovd %xmm1, %r15d +; AVX512-NEXT: vmovd %xmm0, %r12d +; AVX512-NEXT: xorl %ecx, %ecx +; AVX512-NEXT: movl %r12d, %r14d +; AVX512-NEXT: subw %r15w, %r14w +; AVX512-NEXT: setns %cl +; AVX512-NEXT: addl $32767, %ecx # imm = 0x7FFF +; AVX512-NEXT: subw %r15w, %r12w +; AVX512-NEXT: cmovol %ecx, %r12d +; AVX512-NEXT: vpextrw $1, %xmm1, %r14d +; AVX512-NEXT: vpextrw $1, %xmm0, %ecx +; AVX512-NEXT: xorl %r15d, %r15d +; AVX512-NEXT: movl %ecx, %ebp +; AVX512-NEXT: subw %r14w, %bp +; AVX512-NEXT: setns %r15b +; AVX512-NEXT: addl $32767, %r15d # imm = 0x7FFF +; AVX512-NEXT: subw %r14w, %cx +; AVX512-NEXT: cmovol %r15d, %ecx +; AVX512-NEXT: vmovd %r12d, %xmm0 +; AVX512-NEXT: vpinsrw $1, %ecx, %xmm0, %xmm0 +; AVX512-NEXT: vpinsrw $2, %eax, %xmm0, %xmm0 +; AVX512-NEXT: vpinsrw $3, %r13d, %xmm0, %xmm0 +; AVX512-NEXT: vpinsrw $4, %edx, %xmm0, %xmm0 +; AVX512-NEXT: vpinsrw $5, %esi, %xmm0, %xmm0 +; AVX512-NEXT: vpinsrw $6, %edi, %xmm0, %xmm0 +; AVX512-NEXT: vpinsrw $7, %r8d, %xmm0, %xmm0 +; AVX512-NEXT: vmovd %r10d, %xmm1 +; AVX512-NEXT: vpinsrw $1, %r9d, %xmm1, %xmm1 +; AVX512-NEXT: vpinsrw $2, %r11d, %xmm1, %xmm1 +; AVX512-NEXT: vpinsrw $3, %ebx, %xmm1, %xmm1 +; AVX512-NEXT: vpinsrw $4, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 4-byte Folded Reload +; AVX512-NEXT: vpinsrw $5, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 4-byte Folded Reload +; AVX512-NEXT: vpinsrw $6, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 4-byte Folded Reload +; AVX512-NEXT: vpinsrw $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 4-byte Folded Reload +; AVX512-NEXT: vmovd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 4-byte Folded Reload +; AVX512-NEXT: # xmm2 = mem[0],zero,zero,zero +; AVX512-NEXT: vpinsrw $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 4-byte Folded Reload +; AVX512-NEXT: vpinsrw $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 4-byte Folded Reload +; AVX512-NEXT: vpinsrw $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 4-byte Folded Reload +; AVX512-NEXT: vpinsrw $4, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 4-byte Folded Reload +; AVX512-NEXT: vpinsrw $5, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 4-byte Folded Reload +; AVX512-NEXT: vpinsrw $6, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 4-byte Folded Reload +; AVX512-NEXT: vpinsrw $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 4-byte Folded Reload +; AVX512-NEXT: vmovd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 4-byte Folded Reload +; AVX512-NEXT: # xmm3 = mem[0],zero,zero,zero +; AVX512-NEXT: vpinsrw $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 4-byte Folded Reload +; AVX512-NEXT: vpinsrw $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 4-byte Folded Reload +; AVX512-NEXT: vpinsrw $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 4-byte Folded Reload +; AVX512-NEXT: vpinsrw $4, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 4-byte Folded Reload +; AVX512-NEXT: vpinsrw $5, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 4-byte Folded Reload +; AVX512-NEXT: vpinsrw $6, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 4-byte Folded Reload +; AVX512-NEXT: vpinsrw $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 4-byte Folded Reload +; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 +; AVX512-NEXT: vinserti128 $1, %xmm2, %ymm3, %ymm1 +; AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; AVX512-NEXT: popq %rbx +; AVX512-NEXT: popq %r12 +; AVX512-NEXT: popq %r13 +; AVX512-NEXT: popq %r14 +; AVX512-NEXT: popq %r15 +; AVX512-NEXT: popq %rbp +; AVX512-NEXT: retq + %z = call <32 x i16> @llvm.ssub.sat.v32i16(<32 x i16> %x, <32 x i16> %y) + ret <32 x i16> 
%z +} + +; Too narrow vectors, legalized by widening. + +define void @v8i8(<8 x i8>* %px, <8 x i8>* %py, <8 x i8>* %pz) nounwind { +; SSE2-LABEL: v8i8: +; SSE2: # %bb.0: +; SSE2-NEXT: pushq %rbp +; SSE2-NEXT: pushq %r14 +; SSE2-NEXT: pushq %rbx +; SSE2-NEXT: movq {{.*#+}} xmm2 = mem[0],zero +; SSE2-NEXT: movq {{.*#+}} xmm3 = mem[0],zero +; SSE2-NEXT: pxor %xmm0, %xmm0 +; SSE2-NEXT: pxor %xmm1, %xmm1 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3],xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7] +; SSE2-NEXT: movd %xmm1, %eax +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] +; SSE2-NEXT: movd %xmm0, %r8d +; SSE2-NEXT: xorl %ecx, %ecx +; SSE2-NEXT: movl %r8d, %esi +; SSE2-NEXT: subw %ax, %si +; SSE2-NEXT: setns %cl +; SSE2-NEXT: addl $32767, %ecx # imm = 0x7FFF +; SSE2-NEXT: subw %ax, %r8w +; SSE2-NEXT: cmovol %ecx, %r8d +; SSE2-NEXT: pextrw $1, %xmm1, %eax +; SSE2-NEXT: pextrw $1, %xmm0, %r9d +; SSE2-NEXT: xorl %ecx, %ecx +; SSE2-NEXT: movl %r9d, %esi +; SSE2-NEXT: subw %ax, %si +; SSE2-NEXT: setns %cl +; SSE2-NEXT: addl $32767, %ecx # imm = 0x7FFF +; SSE2-NEXT: subw %ax, %r9w +; SSE2-NEXT: cmovol %ecx, %r9d +; SSE2-NEXT: pextrw $2, %xmm1, %eax +; SSE2-NEXT: pextrw $2, %xmm0, %r10d +; SSE2-NEXT: xorl %ecx, %ecx +; SSE2-NEXT: movl %r10d, %esi +; SSE2-NEXT: subw %ax, %si +; SSE2-NEXT: setns %cl +; SSE2-NEXT: addl $32767, %ecx # imm = 0x7FFF +; SSE2-NEXT: subw %ax, %r10w +; SSE2-NEXT: cmovol %ecx, %r10d +; SSE2-NEXT: pextrw $3, %xmm1, %eax +; SSE2-NEXT: pextrw $3, %xmm0, %r11d +; SSE2-NEXT: xorl %ecx, %ecx +; SSE2-NEXT: movl %r11d, %esi +; SSE2-NEXT: subw %ax, %si +; SSE2-NEXT: setns %cl +; SSE2-NEXT: addl $32767, %ecx # imm = 0x7FFF +; SSE2-NEXT: subw %ax, %r11w +; SSE2-NEXT: cmovol %ecx, %r11d +; SSE2-NEXT: pextrw $4, %xmm1, %ecx +; SSE2-NEXT: pextrw $4, %xmm0, %r14d +; SSE2-NEXT: xorl %esi, %esi +; SSE2-NEXT: movl %r14d, %edi +; SSE2-NEXT: subw %cx, %di +; SSE2-NEXT: setns %sil +; SSE2-NEXT: addl $32767, %esi # imm = 0x7FFF +; SSE2-NEXT: subw %cx, %r14w +; SSE2-NEXT: cmovol %esi, %r14d +; SSE2-NEXT: pextrw $5, %xmm1, %esi +; SSE2-NEXT: pextrw $5, %xmm0, %ecx +; SSE2-NEXT: xorl %edi, %edi +; SSE2-NEXT: movl %ecx, %ebx +; SSE2-NEXT: subw %si, %bx +; SSE2-NEXT: setns %dil +; SSE2-NEXT: addl $32767, %edi # imm = 0x7FFF +; SSE2-NEXT: subw %si, %cx +; SSE2-NEXT: cmovol %edi, %ecx +; SSE2-NEXT: pextrw $6, %xmm1, %edi +; SSE2-NEXT: pextrw $6, %xmm0, %esi +; SSE2-NEXT: xorl %ebx, %ebx +; SSE2-NEXT: movl %esi, %ebp +; SSE2-NEXT: subw %di, %bp +; SSE2-NEXT: setns %bl +; SSE2-NEXT: addl $32767, %ebx # imm = 0x7FFF +; SSE2-NEXT: subw %di, %si +; SSE2-NEXT: cmovol %ebx, %esi +; SSE2-NEXT: pextrw $7, %xmm1, %edi +; SSE2-NEXT: pextrw $7, %xmm0, %ebx +; SSE2-NEXT: xorl %ebp, %ebp +; SSE2-NEXT: movl %ebx, %eax +; SSE2-NEXT: subw %di, %ax +; SSE2-NEXT: setns %bpl +; SSE2-NEXT: addl $32767, %ebp # imm = 0x7FFF +; SSE2-NEXT: subw %di, %bx +; SSE2-NEXT: cmovol %ebp, %ebx +; SSE2-NEXT: movd %ebx, %xmm0 +; SSE2-NEXT: movd %esi, %xmm1 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; SSE2-NEXT: movd %ecx, %xmm0 +; SSE2-NEXT: movd %r14d, %xmm2 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; SSE2-NEXT: movd %r11d, %xmm0 +; SSE2-NEXT: movd %r10d, 
%xmm1 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; SSE2-NEXT: movd %r9d, %xmm0 +; SSE2-NEXT: movd %r8d, %xmm3 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm2[0] +; SSE2-NEXT: psrlw $8, %xmm3 +; SSE2-NEXT: packuswb %xmm0, %xmm3 +; SSE2-NEXT: movq %xmm3, (%rdx) +; SSE2-NEXT: popq %rbx +; SSE2-NEXT: popq %r14 +; SSE2-NEXT: popq %rbp +; SSE2-NEXT: retq +; +; SSSE3-LABEL: v8i8: +; SSSE3: # %bb.0: +; SSSE3-NEXT: pushq %rbp +; SSSE3-NEXT: pushq %r14 +; SSSE3-NEXT: pushq %rbx +; SSSE3-NEXT: movq {{.*#+}} xmm2 = mem[0],zero +; SSSE3-NEXT: movq {{.*#+}} xmm3 = mem[0],zero +; SSSE3-NEXT: pxor %xmm0, %xmm0 +; SSSE3-NEXT: pxor %xmm1, %xmm1 +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3],xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7] +; SSSE3-NEXT: movd %xmm1, %eax +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] +; SSSE3-NEXT: movd %xmm0, %r8d +; SSSE3-NEXT: xorl %ecx, %ecx +; SSSE3-NEXT: movl %r8d, %esi +; SSSE3-NEXT: subw %ax, %si +; SSSE3-NEXT: setns %cl +; SSSE3-NEXT: addl $32767, %ecx # imm = 0x7FFF +; SSSE3-NEXT: subw %ax, %r8w +; SSSE3-NEXT: cmovol %ecx, %r8d +; SSSE3-NEXT: pextrw $1, %xmm1, %eax +; SSSE3-NEXT: pextrw $1, %xmm0, %r9d +; SSSE3-NEXT: xorl %ecx, %ecx +; SSSE3-NEXT: movl %r9d, %esi +; SSSE3-NEXT: subw %ax, %si +; SSSE3-NEXT: setns %cl +; SSSE3-NEXT: addl $32767, %ecx # imm = 0x7FFF +; SSSE3-NEXT: subw %ax, %r9w +; SSSE3-NEXT: cmovol %ecx, %r9d +; SSSE3-NEXT: pextrw $2, %xmm1, %eax +; SSSE3-NEXT: pextrw $2, %xmm0, %r10d +; SSSE3-NEXT: xorl %ecx, %ecx +; SSSE3-NEXT: movl %r10d, %esi +; SSSE3-NEXT: subw %ax, %si +; SSSE3-NEXT: setns %cl +; SSSE3-NEXT: addl $32767, %ecx # imm = 0x7FFF +; SSSE3-NEXT: subw %ax, %r10w +; SSSE3-NEXT: cmovol %ecx, %r10d +; SSSE3-NEXT: pextrw $3, %xmm1, %eax +; SSSE3-NEXT: pextrw $3, %xmm0, %r11d +; SSSE3-NEXT: xorl %ecx, %ecx +; SSSE3-NEXT: movl %r11d, %esi +; SSSE3-NEXT: subw %ax, %si +; SSSE3-NEXT: setns %cl +; SSSE3-NEXT: addl $32767, %ecx # imm = 0x7FFF +; SSSE3-NEXT: subw %ax, %r11w +; SSSE3-NEXT: cmovol %ecx, %r11d +; SSSE3-NEXT: pextrw $4, %xmm1, %ecx +; SSSE3-NEXT: pextrw $4, %xmm0, %r14d +; SSSE3-NEXT: xorl %esi, %esi +; SSSE3-NEXT: movl %r14d, %edi +; SSSE3-NEXT: subw %cx, %di +; SSSE3-NEXT: setns %sil +; SSSE3-NEXT: addl $32767, %esi # imm = 0x7FFF +; SSSE3-NEXT: subw %cx, %r14w +; SSSE3-NEXT: cmovol %esi, %r14d +; SSSE3-NEXT: pextrw $5, %xmm1, %esi +; SSSE3-NEXT: pextrw $5, %xmm0, %ecx +; SSSE3-NEXT: xorl %edi, %edi +; SSSE3-NEXT: movl %ecx, %ebx +; SSSE3-NEXT: subw %si, %bx +; SSSE3-NEXT: setns %dil +; SSSE3-NEXT: addl $32767, %edi # imm = 0x7FFF +; SSSE3-NEXT: subw %si, %cx +; SSSE3-NEXT: cmovol %edi, %ecx +; SSSE3-NEXT: pextrw $6, %xmm1, %edi +; SSSE3-NEXT: pextrw $6, %xmm0, %esi +; SSSE3-NEXT: xorl %ebx, %ebx +; SSSE3-NEXT: movl %esi, %ebp +; SSSE3-NEXT: subw %di, %bp +; SSSE3-NEXT: setns %bl +; SSSE3-NEXT: addl $32767, %ebx # imm = 0x7FFF +; SSSE3-NEXT: subw %di, %si +; SSSE3-NEXT: cmovol %ebx, %esi +; SSSE3-NEXT: pextrw $7, %xmm1, %edi +; SSSE3-NEXT: pextrw $7, %xmm0, %ebx +; SSSE3-NEXT: xorl %ebp, %ebp +; SSSE3-NEXT: movl %ebx, %eax +; SSSE3-NEXT: subw %di, %ax +; SSSE3-NEXT: setns %bpl +; SSSE3-NEXT: addl 
$32767, %ebp # imm = 0x7FFF +; SSSE3-NEXT: subw %di, %bx +; SSSE3-NEXT: cmovol %ebp, %ebx +; SSSE3-NEXT: movd %ebx, %xmm0 +; SSSE3-NEXT: movd %esi, %xmm1 +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; SSSE3-NEXT: movd %ecx, %xmm0 +; SSSE3-NEXT: movd %r14d, %xmm2 +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] +; SSSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; SSSE3-NEXT: movd %r11d, %xmm0 +; SSSE3-NEXT: movd %r10d, %xmm1 +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; SSSE3-NEXT: movd %r9d, %xmm0 +; SSSE3-NEXT: movd %r8d, %xmm3 +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3] +; SSSE3-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] +; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm2[0] +; SSSE3-NEXT: psrlw $8, %xmm3 +; SSSE3-NEXT: packuswb %xmm0, %xmm3 +; SSSE3-NEXT: movq %xmm3, (%rdx) +; SSSE3-NEXT: popq %rbx +; SSSE3-NEXT: popq %r14 +; SSSE3-NEXT: popq %rbp +; SSSE3-NEXT: retq +; +; SSE41-LABEL: v8i8: +; SSE41: # %bb.0: +; SSE41-NEXT: pushq %rbp +; SSE41-NEXT: pushq %r14 +; SSE41-NEXT: pushq %rbx +; SSE41-NEXT: movq {{.*#+}} xmm2 = mem[0],zero +; SSE41-NEXT: movq {{.*#+}} xmm3 = mem[0],zero +; SSE41-NEXT: pxor %xmm0, %xmm0 +; SSE41-NEXT: pxor %xmm1, %xmm1 +; SSE41-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3],xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7] +; SSE41-NEXT: pextrw $7, %xmm1, %eax +; SSE41-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] +; SSE41-NEXT: pextrw $7, %xmm0, %r8d +; SSE41-NEXT: xorl %ecx, %ecx +; SSE41-NEXT: movl %r8d, %esi +; SSE41-NEXT: subw %ax, %si +; SSE41-NEXT: setns %cl +; SSE41-NEXT: addl $32767, %ecx # imm = 0x7FFF +; SSE41-NEXT: subw %ax, %r8w +; SSE41-NEXT: cmovol %ecx, %r8d +; SSE41-NEXT: pextrw $6, %xmm1, %eax +; SSE41-NEXT: pextrw $6, %xmm0, %r9d +; SSE41-NEXT: xorl %ecx, %ecx +; SSE41-NEXT: movl %r9d, %esi +; SSE41-NEXT: subw %ax, %si +; SSE41-NEXT: setns %cl +; SSE41-NEXT: addl $32767, %ecx # imm = 0x7FFF +; SSE41-NEXT: subw %ax, %r9w +; SSE41-NEXT: cmovol %ecx, %r9d +; SSE41-NEXT: pextrw $5, %xmm1, %eax +; SSE41-NEXT: pextrw $5, %xmm0, %r10d +; SSE41-NEXT: xorl %ecx, %ecx +; SSE41-NEXT: movl %r10d, %edi +; SSE41-NEXT: subw %ax, %di +; SSE41-NEXT: setns %cl +; SSE41-NEXT: addl $32767, %ecx # imm = 0x7FFF +; SSE41-NEXT: subw %ax, %r10w +; SSE41-NEXT: cmovol %ecx, %r10d +; SSE41-NEXT: pextrw $4, %xmm1, %eax +; SSE41-NEXT: pextrw $4, %xmm0, %r11d +; SSE41-NEXT: xorl %ecx, %ecx +; SSE41-NEXT: movl %r11d, %esi +; SSE41-NEXT: subw %ax, %si +; SSE41-NEXT: setns %cl +; SSE41-NEXT: addl $32767, %ecx # imm = 0x7FFF +; SSE41-NEXT: subw %ax, %r11w +; SSE41-NEXT: cmovol %ecx, %r11d +; SSE41-NEXT: pextrw $3, %xmm1, %ecx +; SSE41-NEXT: pextrw $3, %xmm0, %r14d +; SSE41-NEXT: xorl %esi, %esi +; SSE41-NEXT: movl %r14d, %edi +; SSE41-NEXT: subw %cx, %di +; SSE41-NEXT: setns %sil +; SSE41-NEXT: addl $32767, %esi # imm = 0x7FFF +; SSE41-NEXT: subw %cx, %r14w +; SSE41-NEXT: cmovol %esi, %r14d +; SSE41-NEXT: pextrw $2, %xmm1, %esi +; SSE41-NEXT: pextrw $2, %xmm0, %ecx +; SSE41-NEXT: xorl %edi, %edi +; SSE41-NEXT: movl %ecx, %ebx +; SSE41-NEXT: subw %si, %bx +; SSE41-NEXT: setns %dil +; SSE41-NEXT: addl $32767, %edi # imm = 
0x7FFF +; SSE41-NEXT: subw %si, %cx +; SSE41-NEXT: cmovol %edi, %ecx +; SSE41-NEXT: movd %xmm1, %esi +; SSE41-NEXT: movd %xmm0, %edi +; SSE41-NEXT: xorl %ebx, %ebx +; SSE41-NEXT: movl %edi, %ebp +; SSE41-NEXT: subw %si, %bp +; SSE41-NEXT: setns %bl +; SSE41-NEXT: addl $32767, %ebx # imm = 0x7FFF +; SSE41-NEXT: subw %si, %di +; SSE41-NEXT: cmovol %ebx, %edi +; SSE41-NEXT: pextrw $1, %xmm1, %esi +; SSE41-NEXT: pextrw $1, %xmm0, %ebx +; SSE41-NEXT: xorl %ebp, %ebp +; SSE41-NEXT: movl %ebx, %eax +; SSE41-NEXT: subw %si, %ax +; SSE41-NEXT: setns %bpl +; SSE41-NEXT: addl $32767, %ebp # imm = 0x7FFF +; SSE41-NEXT: subw %si, %bx +; SSE41-NEXT: cmovol %ebp, %ebx +; SSE41-NEXT: movd %edi, %xmm0 +; SSE41-NEXT: pinsrw $1, %ebx, %xmm0 +; SSE41-NEXT: pinsrw $2, %ecx, %xmm0 +; SSE41-NEXT: pinsrw $3, %r14d, %xmm0 +; SSE41-NEXT: pinsrw $4, %r11d, %xmm0 +; SSE41-NEXT: pinsrw $5, %r10d, %xmm0 +; SSE41-NEXT: pinsrw $6, %r9d, %xmm0 +; SSE41-NEXT: pinsrw $7, %r8d, %xmm0 +; SSE41-NEXT: psrlw $8, %xmm0 +; SSE41-NEXT: packuswb %xmm0, %xmm0 +; SSE41-NEXT: movq %xmm0, (%rdx) +; SSE41-NEXT: popq %rbx +; SSE41-NEXT: popq %r14 +; SSE41-NEXT: popq %rbp +; SSE41-NEXT: retq +; +; AVX1-LABEL: v8i8: +; AVX1: # %bb.0: +; AVX1-NEXT: pushq %rbp +; AVX1-NEXT: pushq %r14 +; AVX1-NEXT: pushq %rbx +; AVX1-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero +; AVX1-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] +; AVX1-NEXT: vpextrw $7, %xmm0, %eax +; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] +; AVX1-NEXT: vpextrw $7, %xmm1, %r8d +; AVX1-NEXT: xorl %ecx, %ecx +; AVX1-NEXT: movl %r8d, %esi +; AVX1-NEXT: subw %ax, %si +; AVX1-NEXT: setns %cl +; AVX1-NEXT: addl $32767, %ecx # imm = 0x7FFF +; AVX1-NEXT: subw %ax, %r8w +; AVX1-NEXT: cmovol %ecx, %r8d +; AVX1-NEXT: vpextrw $6, %xmm0, %eax +; AVX1-NEXT: vpextrw $6, %xmm1, %r9d +; AVX1-NEXT: xorl %ecx, %ecx +; AVX1-NEXT: movl %r9d, %esi +; AVX1-NEXT: subw %ax, %si +; AVX1-NEXT: setns %cl +; AVX1-NEXT: addl $32767, %ecx # imm = 0x7FFF +; AVX1-NEXT: subw %ax, %r9w +; AVX1-NEXT: cmovol %ecx, %r9d +; AVX1-NEXT: vpextrw $5, %xmm0, %eax +; AVX1-NEXT: vpextrw $5, %xmm1, %r10d +; AVX1-NEXT: xorl %ecx, %ecx +; AVX1-NEXT: movl %r10d, %edi +; AVX1-NEXT: subw %ax, %di +; AVX1-NEXT: setns %cl +; AVX1-NEXT: addl $32767, %ecx # imm = 0x7FFF +; AVX1-NEXT: subw %ax, %r10w +; AVX1-NEXT: cmovol %ecx, %r10d +; AVX1-NEXT: vpextrw $4, %xmm0, %eax +; AVX1-NEXT: vpextrw $4, %xmm1, %r11d +; AVX1-NEXT: xorl %ecx, %ecx +; AVX1-NEXT: movl %r11d, %esi +; AVX1-NEXT: subw %ax, %si +; AVX1-NEXT: setns %cl +; AVX1-NEXT: addl $32767, %ecx # imm = 0x7FFF +; AVX1-NEXT: subw %ax, %r11w +; AVX1-NEXT: cmovol %ecx, %r11d +; AVX1-NEXT: vpextrw $3, %xmm0, %ecx +; AVX1-NEXT: vpextrw $3, %xmm1, %r14d +; AVX1-NEXT: xorl %esi, %esi +; AVX1-NEXT: movl %r14d, %edi +; AVX1-NEXT: subw %cx, %di +; AVX1-NEXT: setns %sil +; AVX1-NEXT: addl $32767, %esi # imm = 0x7FFF +; AVX1-NEXT: subw %cx, %r14w +; AVX1-NEXT: cmovol %esi, %r14d +; AVX1-NEXT: vpextrw $2, %xmm0, %esi +; AVX1-NEXT: vpextrw $2, %xmm1, %ecx +; AVX1-NEXT: xorl %edi, %edi +; AVX1-NEXT: movl %ecx, %ebx +; AVX1-NEXT: subw %si, %bx +; AVX1-NEXT: setns %dil +; AVX1-NEXT: addl $32767, %edi # imm = 0x7FFF +; AVX1-NEXT: subw %si, %cx +; AVX1-NEXT: cmovol %edi, %ecx +; AVX1-NEXT: 
vmovd %xmm0, %esi +; AVX1-NEXT: vmovd %xmm1, %edi +; AVX1-NEXT: xorl %ebx, %ebx +; AVX1-NEXT: movl %edi, %ebp +; AVX1-NEXT: subw %si, %bp +; AVX1-NEXT: setns %bl +; AVX1-NEXT: addl $32767, %ebx # imm = 0x7FFF +; AVX1-NEXT: subw %si, %di +; AVX1-NEXT: cmovol %ebx, %edi +; AVX1-NEXT: vpextrw $1, %xmm0, %esi +; AVX1-NEXT: vpextrw $1, %xmm1, %ebx +; AVX1-NEXT: xorl %ebp, %ebp +; AVX1-NEXT: movl %ebx, %eax +; AVX1-NEXT: subw %si, %ax +; AVX1-NEXT: setns %bpl +; AVX1-NEXT: addl $32767, %ebp # imm = 0x7FFF +; AVX1-NEXT: subw %si, %bx +; AVX1-NEXT: cmovol %ebp, %ebx +; AVX1-NEXT: vmovd %edi, %xmm0 +; AVX1-NEXT: vpinsrw $1, %ebx, %xmm0, %xmm0 +; AVX1-NEXT: vpinsrw $2, %ecx, %xmm0, %xmm0 +; AVX1-NEXT: vpinsrw $3, %r14d, %xmm0, %xmm0 +; AVX1-NEXT: vpinsrw $4, %r11d, %xmm0, %xmm0 +; AVX1-NEXT: vpinsrw $5, %r10d, %xmm0, %xmm0 +; AVX1-NEXT: vpinsrw $6, %r9d, %xmm0, %xmm0 +; AVX1-NEXT: vpinsrw $7, %r8d, %xmm0, %xmm0 +; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm0 +; AVX1-NEXT: vpackuswb %xmm0, %xmm0, %xmm0 +; AVX1-NEXT: vmovq %xmm0, (%rdx) +; AVX1-NEXT: popq %rbx +; AVX1-NEXT: popq %r14 +; AVX1-NEXT: popq %rbp +; AVX1-NEXT: retq +; +; AVX2-LABEL: v8i8: +; AVX2: # %bb.0: +; AVX2-NEXT: pushq %rbp +; AVX2-NEXT: pushq %r14 +; AVX2-NEXT: pushq %rbx +; AVX2-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero +; AVX2-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] +; AVX2-NEXT: vpextrw $7, %xmm0, %eax +; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] +; AVX2-NEXT: vpextrw $7, %xmm1, %r8d +; AVX2-NEXT: xorl %ecx, %ecx +; AVX2-NEXT: movl %r8d, %esi +; AVX2-NEXT: subw %ax, %si +; AVX2-NEXT: setns %cl +; AVX2-NEXT: addl $32767, %ecx # imm = 0x7FFF +; AVX2-NEXT: subw %ax, %r8w +; AVX2-NEXT: cmovol %ecx, %r8d +; AVX2-NEXT: vpextrw $6, %xmm0, %eax +; AVX2-NEXT: vpextrw $6, %xmm1, %r9d +; AVX2-NEXT: xorl %ecx, %ecx +; AVX2-NEXT: movl %r9d, %esi +; AVX2-NEXT: subw %ax, %si +; AVX2-NEXT: setns %cl +; AVX2-NEXT: addl $32767, %ecx # imm = 0x7FFF +; AVX2-NEXT: subw %ax, %r9w +; AVX2-NEXT: cmovol %ecx, %r9d +; AVX2-NEXT: vpextrw $5, %xmm0, %eax +; AVX2-NEXT: vpextrw $5, %xmm1, %r10d +; AVX2-NEXT: xorl %ecx, %ecx +; AVX2-NEXT: movl %r10d, %edi +; AVX2-NEXT: subw %ax, %di +; AVX2-NEXT: setns %cl +; AVX2-NEXT: addl $32767, %ecx # imm = 0x7FFF +; AVX2-NEXT: subw %ax, %r10w +; AVX2-NEXT: cmovol %ecx, %r10d +; AVX2-NEXT: vpextrw $4, %xmm0, %eax +; AVX2-NEXT: vpextrw $4, %xmm1, %r11d +; AVX2-NEXT: xorl %ecx, %ecx +; AVX2-NEXT: movl %r11d, %esi +; AVX2-NEXT: subw %ax, %si +; AVX2-NEXT: setns %cl +; AVX2-NEXT: addl $32767, %ecx # imm = 0x7FFF +; AVX2-NEXT: subw %ax, %r11w +; AVX2-NEXT: cmovol %ecx, %r11d +; AVX2-NEXT: vpextrw $3, %xmm0, %ecx +; AVX2-NEXT: vpextrw $3, %xmm1, %r14d +; AVX2-NEXT: xorl %esi, %esi +; AVX2-NEXT: movl %r14d, %edi +; AVX2-NEXT: subw %cx, %di +; AVX2-NEXT: setns %sil +; AVX2-NEXT: addl $32767, %esi # imm = 0x7FFF +; AVX2-NEXT: subw %cx, %r14w +; AVX2-NEXT: cmovol %esi, %r14d +; AVX2-NEXT: vpextrw $2, %xmm0, %esi +; AVX2-NEXT: vpextrw $2, %xmm1, %ecx +; AVX2-NEXT: xorl %edi, %edi +; AVX2-NEXT: movl %ecx, %ebx +; AVX2-NEXT: subw %si, %bx +; AVX2-NEXT: setns %dil +; AVX2-NEXT: addl $32767, %edi # imm = 0x7FFF +; AVX2-NEXT: subw %si, %cx +; AVX2-NEXT: cmovol %edi, %ecx +; AVX2-NEXT: vmovd %xmm0, %esi +; AVX2-NEXT: vmovd 
%xmm1, %edi +; AVX2-NEXT: xorl %ebx, %ebx +; AVX2-NEXT: movl %edi, %ebp +; AVX2-NEXT: subw %si, %bp +; AVX2-NEXT: setns %bl +; AVX2-NEXT: addl $32767, %ebx # imm = 0x7FFF +; AVX2-NEXT: subw %si, %di +; AVX2-NEXT: cmovol %ebx, %edi +; AVX2-NEXT: vpextrw $1, %xmm0, %esi +; AVX2-NEXT: vpextrw $1, %xmm1, %ebx +; AVX2-NEXT: xorl %ebp, %ebp +; AVX2-NEXT: movl %ebx, %eax +; AVX2-NEXT: subw %si, %ax +; AVX2-NEXT: setns %bpl +; AVX2-NEXT: addl $32767, %ebp # imm = 0x7FFF +; AVX2-NEXT: subw %si, %bx +; AVX2-NEXT: cmovol %ebp, %ebx +; AVX2-NEXT: vmovd %edi, %xmm0 +; AVX2-NEXT: vpinsrw $1, %ebx, %xmm0, %xmm0 +; AVX2-NEXT: vpinsrw $2, %ecx, %xmm0, %xmm0 +; AVX2-NEXT: vpinsrw $3, %r14d, %xmm0, %xmm0 +; AVX2-NEXT: vpinsrw $4, %r11d, %xmm0, %xmm0 +; AVX2-NEXT: vpinsrw $5, %r10d, %xmm0, %xmm0 +; AVX2-NEXT: vpinsrw $6, %r9d, %xmm0, %xmm0 +; AVX2-NEXT: vpinsrw $7, %r8d, %xmm0, %xmm0 +; AVX2-NEXT: vpsrlw $8, %xmm0, %xmm0 +; AVX2-NEXT: vpackuswb %xmm0, %xmm0, %xmm0 +; AVX2-NEXT: vmovq %xmm0, (%rdx) +; AVX2-NEXT: popq %rbx +; AVX2-NEXT: popq %r14 +; AVX2-NEXT: popq %rbp +; AVX2-NEXT: retq +; +; AVX512-LABEL: v8i8: +; AVX512: # %bb.0: +; AVX512-NEXT: pushq %rbp +; AVX512-NEXT: pushq %r14 +; AVX512-NEXT: pushq %rbx +; AVX512-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero +; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; AVX512-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] +; AVX512-NEXT: vpextrw $7, %xmm0, %eax +; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] +; AVX512-NEXT: vpextrw $7, %xmm1, %r8d +; AVX512-NEXT: xorl %ecx, %ecx +; AVX512-NEXT: movl %r8d, %esi +; AVX512-NEXT: subw %ax, %si +; AVX512-NEXT: setns %cl +; AVX512-NEXT: addl $32767, %ecx # imm = 0x7FFF +; AVX512-NEXT: subw %ax, %r8w +; AVX512-NEXT: cmovol %ecx, %r8d +; AVX512-NEXT: vpextrw $6, %xmm0, %eax +; AVX512-NEXT: vpextrw $6, %xmm1, %r9d +; AVX512-NEXT: xorl %ecx, %ecx +; AVX512-NEXT: movl %r9d, %esi +; AVX512-NEXT: subw %ax, %si +; AVX512-NEXT: setns %cl +; AVX512-NEXT: addl $32767, %ecx # imm = 0x7FFF +; AVX512-NEXT: subw %ax, %r9w +; AVX512-NEXT: cmovol %ecx, %r9d +; AVX512-NEXT: vpextrw $5, %xmm0, %eax +; AVX512-NEXT: vpextrw $5, %xmm1, %r10d +; AVX512-NEXT: xorl %ecx, %ecx +; AVX512-NEXT: movl %r10d, %edi +; AVX512-NEXT: subw %ax, %di +; AVX512-NEXT: setns %cl +; AVX512-NEXT: addl $32767, %ecx # imm = 0x7FFF +; AVX512-NEXT: subw %ax, %r10w +; AVX512-NEXT: cmovol %ecx, %r10d +; AVX512-NEXT: vpextrw $4, %xmm0, %eax +; AVX512-NEXT: vpextrw $4, %xmm1, %r11d +; AVX512-NEXT: xorl %ecx, %ecx +; AVX512-NEXT: movl %r11d, %esi +; AVX512-NEXT: subw %ax, %si +; AVX512-NEXT: setns %cl +; AVX512-NEXT: addl $32767, %ecx # imm = 0x7FFF +; AVX512-NEXT: subw %ax, %r11w +; AVX512-NEXT: cmovol %ecx, %r11d +; AVX512-NEXT: vpextrw $3, %xmm0, %ecx +; AVX512-NEXT: vpextrw $3, %xmm1, %r14d +; AVX512-NEXT: xorl %esi, %esi +; AVX512-NEXT: movl %r14d, %edi +; AVX512-NEXT: subw %cx, %di +; AVX512-NEXT: setns %sil +; AVX512-NEXT: addl $32767, %esi # imm = 0x7FFF +; AVX512-NEXT: subw %cx, %r14w +; AVX512-NEXT: cmovol %esi, %r14d +; AVX512-NEXT: vpextrw $2, %xmm0, %esi +; AVX512-NEXT: vpextrw $2, %xmm1, %ecx +; AVX512-NEXT: xorl %edi, %edi +; AVX512-NEXT: movl %ecx, %ebx +; AVX512-NEXT: subw %si, %bx +; AVX512-NEXT: setns %dil +; AVX512-NEXT: addl $32767, %edi # imm = 0x7FFF +; AVX512-NEXT: subw %si, 
%cx +; AVX512-NEXT: cmovol %edi, %ecx +; AVX512-NEXT: vmovd %xmm0, %esi +; AVX512-NEXT: vmovd %xmm1, %edi +; AVX512-NEXT: xorl %ebx, %ebx +; AVX512-NEXT: movl %edi, %ebp +; AVX512-NEXT: subw %si, %bp +; AVX512-NEXT: setns %bl +; AVX512-NEXT: addl $32767, %ebx # imm = 0x7FFF +; AVX512-NEXT: subw %si, %di +; AVX512-NEXT: cmovol %ebx, %edi +; AVX512-NEXT: vpextrw $1, %xmm0, %esi +; AVX512-NEXT: vpextrw $1, %xmm1, %ebx +; AVX512-NEXT: xorl %ebp, %ebp +; AVX512-NEXT: movl %ebx, %eax +; AVX512-NEXT: subw %si, %ax +; AVX512-NEXT: setns %bpl +; AVX512-NEXT: addl $32767, %ebp # imm = 0x7FFF +; AVX512-NEXT: subw %si, %bx +; AVX512-NEXT: cmovol %ebp, %ebx +; AVX512-NEXT: vmovd %edi, %xmm0 +; AVX512-NEXT: vpinsrw $1, %ebx, %xmm0, %xmm0 +; AVX512-NEXT: vpinsrw $2, %ecx, %xmm0, %xmm0 +; AVX512-NEXT: vpinsrw $3, %r14d, %xmm0, %xmm0 +; AVX512-NEXT: vpinsrw $4, %r11d, %xmm0, %xmm0 +; AVX512-NEXT: vpinsrw $5, %r10d, %xmm0, %xmm0 +; AVX512-NEXT: vpinsrw $6, %r9d, %xmm0, %xmm0 +; AVX512-NEXT: vpinsrw $7, %r8d, %xmm0, %xmm0 +; AVX512-NEXT: vpsrlw $8, %xmm0, %xmm0 +; AVX512-NEXT: vpmovwb %xmm0, (%rdx) +; AVX512-NEXT: popq %rbx +; AVX512-NEXT: popq %r14 +; AVX512-NEXT: popq %rbp +; AVX512-NEXT: retq + %x = load <8 x i8>, <8 x i8>* %px + %y = load <8 x i8>, <8 x i8>* %py + %z = call <8 x i8> @llvm.ssub.sat.v8i8(<8 x i8> %x, <8 x i8> %y) + store <8 x i8> %z, <8 x i8>* %pz + ret void +} + +define void @v4i8(<4 x i8>* %px, <4 x i8>* %py, <4 x i8>* %pz) nounwind { +; SSE2-LABEL: v4i8: +; SSE2: # %bb.0: +; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] +; SSE2-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; SSE2-NEXT: pslld $24, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,2,3] +; SSE2-NEXT: movd %xmm2, %ecx +; SSE2-NEXT: pslld $24, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,2,3] +; SSE2-NEXT: movd %xmm2, %r8d +; SSE2-NEXT: xorl %esi, %esi +; SSE2-NEXT: movl %r8d, %edi +; SSE2-NEXT: subl %ecx, %edi +; SSE2-NEXT: setns %sil +; SSE2-NEXT: addl $2147483647, %esi # imm = 0x7FFFFFFF +; SSE2-NEXT: subl %ecx, %r8d +; SSE2-NEXT: cmovol %esi, %r8d +; SSE2-NEXT: movd %xmm1, %esi +; SSE2-NEXT: movd %xmm0, %r10d +; SSE2-NEXT: xorl %edi, %edi +; SSE2-NEXT: movl %r10d, %ecx +; SSE2-NEXT: subl %esi, %ecx +; SSE2-NEXT: setns %dil +; SSE2-NEXT: addl $2147483647, %edi # imm = 0x7FFFFFFF +; SSE2-NEXT: subl %esi, %r10d +; SSE2-NEXT: cmovol %edi, %r10d +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1] +; SSE2-NEXT: movd %xmm2, %r9d +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1] +; SSE2-NEXT: movd %xmm2, %ecx +; SSE2-NEXT: xorl %edi, %edi +; SSE2-NEXT: movl %ecx, %esi +; SSE2-NEXT: subl %r9d, %esi +; SSE2-NEXT: setns %dil +; SSE2-NEXT: addl $2147483647, %edi # imm = 0x7FFFFFFF +; SSE2-NEXT: subl %r9d, %ecx +; SSE2-NEXT: cmovol %edi, %ecx +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,1,2,3] +; SSE2-NEXT: movd %xmm1, %r9d +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3] +; SSE2-NEXT: movd %xmm0, %eax +; SSE2-NEXT: xorl %esi, %esi +; SSE2-NEXT: movl %eax, %edi +; SSE2-NEXT: subl %r9d, %edi +; SSE2-NEXT: setns %sil +; SSE2-NEXT: addl $2147483647, %esi # imm = 0x7FFFFFFF +; SSE2-NEXT: subl %r9d, %eax +; 
SSE2-NEXT: cmovol %esi, %eax +; SSE2-NEXT: movd %eax, %xmm0 +; SSE2-NEXT: movd %ecx, %xmm1 +; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE2-NEXT: movd %r10d, %xmm0 +; SSE2-NEXT: movd %r8d, %xmm2 +; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSE2-NEXT: psrld $24, %xmm0 +; SSE2-NEXT: packuswb %xmm0, %xmm0 +; SSE2-NEXT: packuswb %xmm0, %xmm0 +; SSE2-NEXT: movd %xmm0, (%rdx) +; SSE2-NEXT: retq +; +; SSSE3-LABEL: v4i8: +; SSSE3: # %bb.0: +; SSSE3-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; SSSE3-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,0,255,255,255,1,255,255,255,2,255,255,255,3] +; SSSE3-NEXT: pshufb %xmm2, %xmm1 +; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,1,2,3] +; SSSE3-NEXT: movd %xmm3, %ecx +; SSSE3-NEXT: pshufb %xmm2, %xmm0 +; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,2,3] +; SSSE3-NEXT: movd %xmm2, %r8d +; SSSE3-NEXT: xorl %esi, %esi +; SSSE3-NEXT: movl %r8d, %edi +; SSSE3-NEXT: subl %ecx, %edi +; SSSE3-NEXT: setns %sil +; SSSE3-NEXT: addl $2147483647, %esi # imm = 0x7FFFFFFF +; SSSE3-NEXT: subl %ecx, %r8d +; SSSE3-NEXT: cmovol %esi, %r8d +; SSSE3-NEXT: movd %xmm1, %esi +; SSSE3-NEXT: movd %xmm0, %r10d +; SSSE3-NEXT: xorl %edi, %edi +; SSSE3-NEXT: movl %r10d, %ecx +; SSSE3-NEXT: subl %esi, %ecx +; SSSE3-NEXT: setns %dil +; SSSE3-NEXT: addl $2147483647, %edi # imm = 0x7FFFFFFF +; SSSE3-NEXT: subl %esi, %r10d +; SSSE3-NEXT: cmovol %edi, %r10d +; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1] +; SSSE3-NEXT: movd %xmm2, %r9d +; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1] +; SSSE3-NEXT: movd %xmm2, %ecx +; SSSE3-NEXT: xorl %edi, %edi +; SSSE3-NEXT: movl %ecx, %esi +; SSSE3-NEXT: subl %r9d, %esi +; SSSE3-NEXT: setns %dil +; SSSE3-NEXT: addl $2147483647, %edi # imm = 0x7FFFFFFF +; SSSE3-NEXT: subl %r9d, %ecx +; SSSE3-NEXT: cmovol %edi, %ecx +; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,1,2,3] +; SSSE3-NEXT: movd %xmm1, %r9d +; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3] +; SSSE3-NEXT: movd %xmm0, %eax +; SSSE3-NEXT: xorl %esi, %esi +; SSSE3-NEXT: movl %eax, %edi +; SSSE3-NEXT: subl %r9d, %edi +; SSSE3-NEXT: setns %sil +; SSSE3-NEXT: addl $2147483647, %esi # imm = 0x7FFFFFFF +; SSSE3-NEXT: subl %r9d, %eax +; SSSE3-NEXT: cmovol %esi, %eax +; SSSE3-NEXT: movd %eax, %xmm0 +; SSSE3-NEXT: movd %ecx, %xmm1 +; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSSE3-NEXT: movd %r10d, %xmm0 +; SSSE3-NEXT: movd %r8d, %xmm2 +; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u] +; SSSE3-NEXT: movd %xmm0, (%rdx) +; SSSE3-NEXT: retq +; +; SSE41-LABEL: v4i8: +; SSE41: # %bb.0: +; SSE41-NEXT: pmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; SSE41-NEXT: pmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; SSE41-NEXT: pslld $24, %xmm1 +; SSE41-NEXT: pextrd $3, %xmm1, %ecx +; SSE41-NEXT: pslld $24, %xmm0 +; SSE41-NEXT: pextrd $3, %xmm0, %r8d +; SSE41-NEXT: xorl %esi, %esi +; SSE41-NEXT: movl %r8d, %edi +; SSE41-NEXT: subl %ecx, %edi +; SSE41-NEXT: setns %sil +; SSE41-NEXT: addl $2147483647, %esi # imm = 0x7FFFFFFF +; SSE41-NEXT: subl %ecx, %r8d +; SSE41-NEXT: cmovol %esi, %r8d +; SSE41-NEXT: pextrd $2, %xmm1, %esi +; 
SSE41-NEXT: pextrd $2, %xmm0, %r10d +; SSE41-NEXT: xorl %edi, %edi +; SSE41-NEXT: movl %r10d, %ecx +; SSE41-NEXT: subl %esi, %ecx +; SSE41-NEXT: setns %dil +; SSE41-NEXT: addl $2147483647, %edi # imm = 0x7FFFFFFF +; SSE41-NEXT: subl %esi, %r10d +; SSE41-NEXT: cmovol %edi, %r10d +; SSE41-NEXT: movd %xmm1, %r9d +; SSE41-NEXT: movd %xmm0, %ecx +; SSE41-NEXT: xorl %edi, %edi +; SSE41-NEXT: movl %ecx, %esi +; SSE41-NEXT: subl %r9d, %esi +; SSE41-NEXT: setns %dil +; SSE41-NEXT: addl $2147483647, %edi # imm = 0x7FFFFFFF +; SSE41-NEXT: subl %r9d, %ecx +; SSE41-NEXT: cmovol %edi, %ecx +; SSE41-NEXT: pextrd $1, %xmm1, %r9d +; SSE41-NEXT: pextrd $1, %xmm0, %eax +; SSE41-NEXT: xorl %esi, %esi +; SSE41-NEXT: movl %eax, %edi +; SSE41-NEXT: subl %r9d, %edi +; SSE41-NEXT: setns %sil +; SSE41-NEXT: addl $2147483647, %esi # imm = 0x7FFFFFFF +; SSE41-NEXT: subl %r9d, %eax +; SSE41-NEXT: cmovol %esi, %eax +; SSE41-NEXT: movd %ecx, %xmm0 +; SSE41-NEXT: pinsrd $1, %eax, %xmm0 +; SSE41-NEXT: pinsrd $2, %r10d, %xmm0 +; SSE41-NEXT: pinsrd $3, %r8d, %xmm0 +; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u] +; SSE41-NEXT: movd %xmm0, (%rdx) +; SSE41-NEXT: retq +; +; AVX1-LABEL: v4i8: +; AVX1: # %bb.0: +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpslld $24, %xmm0, %xmm0 +; AVX1-NEXT: vpextrd $3, %xmm0, %ecx +; AVX1-NEXT: vpslld $24, %xmm1, %xmm1 +; AVX1-NEXT: vpextrd $3, %xmm1, %r9d +; AVX1-NEXT: xorl %esi, %esi +; AVX1-NEXT: movl %r9d, %edi +; AVX1-NEXT: subl %ecx, %edi +; AVX1-NEXT: setns %sil +; AVX1-NEXT: addl $2147483647, %esi # imm = 0x7FFFFFFF +; AVX1-NEXT: subl %ecx, %r9d +; AVX1-NEXT: cmovol %esi, %r9d +; AVX1-NEXT: vpextrd $2, %xmm0, %r8d +; AVX1-NEXT: vpextrd $2, %xmm1, %r10d +; AVX1-NEXT: xorl %edi, %edi +; AVX1-NEXT: movl %r10d, %ecx +; AVX1-NEXT: subl %r8d, %ecx +; AVX1-NEXT: setns %dil +; AVX1-NEXT: addl $2147483647, %edi # imm = 0x7FFFFFFF +; AVX1-NEXT: subl %r8d, %r10d +; AVX1-NEXT: cmovol %edi, %r10d +; AVX1-NEXT: vmovd %xmm0, %r8d +; AVX1-NEXT: vmovd %xmm1, %eax +; AVX1-NEXT: xorl %ecx, %ecx +; AVX1-NEXT: movl %eax, %edi +; AVX1-NEXT: subl %r8d, %edi +; AVX1-NEXT: setns %cl +; AVX1-NEXT: addl $2147483647, %ecx # imm = 0x7FFFFFFF +; AVX1-NEXT: subl %r8d, %eax +; AVX1-NEXT: cmovol %ecx, %eax +; AVX1-NEXT: vpextrd $1, %xmm0, %r8d +; AVX1-NEXT: vpextrd $1, %xmm1, %esi +; AVX1-NEXT: xorl %ecx, %ecx +; AVX1-NEXT: movl %esi, %edi +; AVX1-NEXT: subl %r8d, %edi +; AVX1-NEXT: setns %cl +; AVX1-NEXT: addl $2147483647, %ecx # imm = 0x7FFFFFFF +; AVX1-NEXT: subl %r8d, %esi +; AVX1-NEXT: cmovol %ecx, %esi +; AVX1-NEXT: vmovd %eax, %xmm0 +; AVX1-NEXT: vpinsrd $1, %esi, %xmm0, %xmm0 +; AVX1-NEXT: vpinsrd $2, %r10d, %xmm0, %xmm0 +; AVX1-NEXT: vpinsrd $3, %r9d, %xmm0, %xmm0 +; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX1-NEXT: vmovd %xmm0, (%rdx) +; AVX1-NEXT: retq +; +; AVX2-LABEL: v4i8: +; AVX2: # %bb.0: +; AVX2-NEXT: vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX2-NEXT: vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX2-NEXT: vpslld $24, %xmm0, %xmm0 +; AVX2-NEXT: vpextrd $3, %xmm0, %ecx +; AVX2-NEXT: vpslld $24, %xmm1, %xmm1 +; AVX2-NEXT: vpextrd $3, %xmm1, %r9d +; AVX2-NEXT: xorl 
%esi, %esi +; AVX2-NEXT: movl %r9d, %edi +; AVX2-NEXT: subl %ecx, %edi +; AVX2-NEXT: setns %sil +; AVX2-NEXT: addl $2147483647, %esi # imm = 0x7FFFFFFF +; AVX2-NEXT: subl %ecx, %r9d +; AVX2-NEXT: cmovol %esi, %r9d +; AVX2-NEXT: vpextrd $2, %xmm0, %r8d +; AVX2-NEXT: vpextrd $2, %xmm1, %r10d +; AVX2-NEXT: xorl %edi, %edi +; AVX2-NEXT: movl %r10d, %ecx +; AVX2-NEXT: subl %r8d, %ecx +; AVX2-NEXT: setns %dil +; AVX2-NEXT: addl $2147483647, %edi # imm = 0x7FFFFFFF +; AVX2-NEXT: subl %r8d, %r10d +; AVX2-NEXT: cmovol %edi, %r10d +; AVX2-NEXT: vmovd %xmm0, %r8d +; AVX2-NEXT: vmovd %xmm1, %eax +; AVX2-NEXT: xorl %ecx, %ecx +; AVX2-NEXT: movl %eax, %edi +; AVX2-NEXT: subl %r8d, %edi +; AVX2-NEXT: setns %cl +; AVX2-NEXT: addl $2147483647, %ecx # imm = 0x7FFFFFFF +; AVX2-NEXT: subl %r8d, %eax +; AVX2-NEXT: cmovol %ecx, %eax +; AVX2-NEXT: vpextrd $1, %xmm0, %r8d +; AVX2-NEXT: vpextrd $1, %xmm1, %esi +; AVX2-NEXT: xorl %ecx, %ecx +; AVX2-NEXT: movl %esi, %edi +; AVX2-NEXT: subl %r8d, %edi +; AVX2-NEXT: setns %cl +; AVX2-NEXT: addl $2147483647, %ecx # imm = 0x7FFFFFFF +; AVX2-NEXT: subl %r8d, %esi +; AVX2-NEXT: cmovol %ecx, %esi +; AVX2-NEXT: vmovd %eax, %xmm0 +; AVX2-NEXT: vpinsrd $1, %esi, %xmm0, %xmm0 +; AVX2-NEXT: vpinsrd $2, %r10d, %xmm0, %xmm0 +; AVX2-NEXT: vpinsrd $3, %r9d, %xmm0, %xmm0 +; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX2-NEXT: vmovd %xmm0, (%rdx) +; AVX2-NEXT: retq +; +; AVX512-LABEL: v4i8: +; AVX512: # %bb.0: +; AVX512-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; AVX512-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX512-NEXT: vmovdqa {{.*#+}} xmm2 = [255,255,255,0,255,255,255,1,255,255,255,2,255,255,255,3] +; AVX512-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX512-NEXT: vpextrd $3, %xmm0, %ecx +; AVX512-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX512-NEXT: vpextrd $3, %xmm1, %r9d +; AVX512-NEXT: xorl %esi, %esi +; AVX512-NEXT: movl %r9d, %edi +; AVX512-NEXT: subl %ecx, %edi +; AVX512-NEXT: setns %sil +; AVX512-NEXT: addl $2147483647, %esi # imm = 0x7FFFFFFF +; AVX512-NEXT: subl %ecx, %r9d +; AVX512-NEXT: cmovol %esi, %r9d +; AVX512-NEXT: vpextrd $2, %xmm0, %r8d +; AVX512-NEXT: vpextrd $2, %xmm1, %r10d +; AVX512-NEXT: xorl %edi, %edi +; AVX512-NEXT: movl %r10d, %ecx +; AVX512-NEXT: subl %r8d, %ecx +; AVX512-NEXT: setns %dil +; AVX512-NEXT: addl $2147483647, %edi # imm = 0x7FFFFFFF +; AVX512-NEXT: subl %r8d, %r10d +; AVX512-NEXT: cmovol %edi, %r10d +; AVX512-NEXT: vmovd %xmm0, %r8d +; AVX512-NEXT: vmovd %xmm1, %eax +; AVX512-NEXT: xorl %ecx, %ecx +; AVX512-NEXT: movl %eax, %edi +; AVX512-NEXT: subl %r8d, %edi +; AVX512-NEXT: setns %cl +; AVX512-NEXT: addl $2147483647, %ecx # imm = 0x7FFFFFFF +; AVX512-NEXT: subl %r8d, %eax +; AVX512-NEXT: cmovol %ecx, %eax +; AVX512-NEXT: vpextrd $1, %xmm0, %r8d +; AVX512-NEXT: vpextrd $1, %xmm1, %esi +; AVX512-NEXT: xorl %ecx, %ecx +; AVX512-NEXT: movl %esi, %edi +; AVX512-NEXT: subl %r8d, %edi +; AVX512-NEXT: setns %cl +; AVX512-NEXT: addl $2147483647, %ecx # imm = 0x7FFFFFFF +; AVX512-NEXT: subl %r8d, %esi +; AVX512-NEXT: cmovol %ecx, %esi +; AVX512-NEXT: vmovd %eax, %xmm0 +; AVX512-NEXT: vpinsrd $1, %esi, %xmm0, %xmm0 +; AVX512-NEXT: vpinsrd $2, %r10d, %xmm0, %xmm0 +; AVX512-NEXT: vpinsrd $3, %r9d, %xmm0, %xmm0 +; AVX512-NEXT: vpsrld $24, %xmm0, %xmm0 +; AVX512-NEXT: vpmovdb %xmm0, (%rdx) +; AVX512-NEXT: retq + %x = load <4 x i8>, <4 x i8>* %px + %y = load <4 x i8>, <4 x i8>* %py + %z = call <4 x i8> @llvm.ssub.sat.v4i8(<4 x i8> %x, <4 x i8> %y) + store <4 x i8> %z, <4 x i8>* %pz + ret void +} + +define 
void @v2i8(<2 x i8>* %px, <2 x i8>* %py, <2 x i8>* %pz) nounwind { +; SSE2-LABEL: v2i8: +; SSE2: # %bb.0: +; SSE2-NEXT: movzwl (%rdi), %eax +; SSE2-NEXT: movd %eax, %xmm0 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3] +; SSE2-NEXT: movzwl (%rsi), %eax +; SSE2-NEXT: movd %eax, %xmm1 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,1,3] +; SSE2-NEXT: psllq $56, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1] +; SSE2-NEXT: movq %xmm2, %rax +; SSE2-NEXT: psllq $56, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1] +; SSE2-NEXT: movq %xmm2, %rcx +; SSE2-NEXT: xorl %esi, %esi +; SSE2-NEXT: movq %rcx, %rdi +; SSE2-NEXT: subq %rax, %rdi +; SSE2-NEXT: setns %sil +; SSE2-NEXT: movabsq $9223372036854775807, %r8 # imm = 0x7FFFFFFFFFFFFFFF +; SSE2-NEXT: addq %r8, %rsi +; SSE2-NEXT: subq %rax, %rcx +; SSE2-NEXT: cmovoq %rsi, %rcx +; SSE2-NEXT: movq %xmm1, %r9 +; SSE2-NEXT: movq %xmm0, %rax +; SSE2-NEXT: xorl %edi, %edi +; SSE2-NEXT: movq %rax, %rsi +; SSE2-NEXT: subq %r9, %rsi +; SSE2-NEXT: setns %dil +; SSE2-NEXT: addq %r8, %rdi +; SSE2-NEXT: subq %r9, %rax +; SSE2-NEXT: cmovoq %rdi, %rax +; SSE2-NEXT: movq %rax, %xmm0 +; SSE2-NEXT: movq %rcx, %xmm1 +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSE2-NEXT: psrlq $56, %xmm0 +; SSE2-NEXT: packuswb %xmm0, %xmm0 +; SSE2-NEXT: packuswb %xmm0, %xmm0 +; SSE2-NEXT: packuswb %xmm0, %xmm0 +; SSE2-NEXT: movd %xmm0, %eax +; SSE2-NEXT: movw %ax, (%rdx) +; SSE2-NEXT: retq +; +; SSSE3-LABEL: v2i8: +; SSSE3: # %bb.0: +; SSSE3-NEXT: movzwl (%rdi), %eax +; SSSE3-NEXT: movd %eax, %xmm0 +; SSSE3-NEXT: movzwl (%rsi), %eax +; SSSE3-NEXT: movd %eax, %xmm1 +; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,1] +; SSSE3-NEXT: pshufb %xmm2, %xmm1 +; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,3,0,1] +; SSSE3-NEXT: movq %xmm3, %rax +; SSSE3-NEXT: pshufb %xmm2, %xmm0 +; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1] +; SSSE3-NEXT: movq %xmm2, %rcx +; SSSE3-NEXT: xorl %esi, %esi +; SSSE3-NEXT: movq %rcx, %rdi +; SSSE3-NEXT: subq %rax, %rdi +; SSSE3-NEXT: setns %sil +; SSSE3-NEXT: movabsq $9223372036854775807, %r8 # imm = 0x7FFFFFFFFFFFFFFF +; SSSE3-NEXT: addq %r8, %rsi +; SSSE3-NEXT: subq %rax, %rcx +; SSSE3-NEXT: cmovoq %rsi, %rcx +; SSSE3-NEXT: movq %xmm1, %r9 +; SSSE3-NEXT: movq %xmm0, %rax +; SSSE3-NEXT: xorl %edi, %edi +; SSSE3-NEXT: movq %rax, %rsi +; SSSE3-NEXT: subq %r9, %rsi +; SSSE3-NEXT: setns %dil +; SSSE3-NEXT: addq %r8, %rdi +; SSSE3-NEXT: subq %r9, %rax +; SSSE3-NEXT: cmovoq %rdi, %rax +; SSSE3-NEXT: movq %rax, %xmm0 +; SSSE3-NEXT: movq %rcx, %xmm1 +; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[7,15,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; SSSE3-NEXT: movd %xmm0, %eax +; SSSE3-NEXT: movw %ax, (%rdx) +; SSSE3-NEXT: retq +; +; SSE41-LABEL: v2i8: +; SSE41: # %bb.0: +; SSE41-NEXT: pmovzxbq {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero +; SSE41-NEXT: pmovzxbq {{.*#+}} xmm1 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero +; SSE41-NEXT: psllq $56, %xmm1 +; 
SSE41-NEXT: movq %xmm1, %rax +; SSE41-NEXT: psllq $56, %xmm0 +; SSE41-NEXT: movq %xmm0, %rcx +; SSE41-NEXT: xorl %esi, %esi +; SSE41-NEXT: movq %rcx, %rdi +; SSE41-NEXT: subq %rax, %rdi +; SSE41-NEXT: setns %sil +; SSE41-NEXT: movabsq $9223372036854775807, %r8 # imm = 0x7FFFFFFFFFFFFFFF +; SSE41-NEXT: addq %r8, %rsi +; SSE41-NEXT: subq %rax, %rcx +; SSE41-NEXT: cmovoq %rsi, %rcx +; SSE41-NEXT: pextrq $1, %xmm1, %r9 +; SSE41-NEXT: pextrq $1, %xmm0, %rax +; SSE41-NEXT: xorl %edi, %edi +; SSE41-NEXT: movq %rax, %rsi +; SSE41-NEXT: subq %r9, %rsi +; SSE41-NEXT: setns %dil +; SSE41-NEXT: addq %r8, %rdi +; SSE41-NEXT: subq %r9, %rax +; SSE41-NEXT: cmovoq %rdi, %rax +; SSE41-NEXT: movq %rax, %xmm0 +; SSE41-NEXT: movq %rcx, %xmm1 +; SSE41-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] +; SSE41-NEXT: pshufb {{.*#+}} xmm1 = xmm1[7,15,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; SSE41-NEXT: pextrw $0, %xmm1, (%rdx) +; SSE41-NEXT: retq +; +; AVX1-LABEL: v2i8: +; AVX1: # %bb.0: +; AVX1-NEXT: vpmovzxbq {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero +; AVX1-NEXT: vpmovzxbq {{.*#+}} xmm1 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero +; AVX1-NEXT: vpsllq $56, %xmm1, %xmm1 +; AVX1-NEXT: vmovq %xmm1, %rax +; AVX1-NEXT: vpsllq $56, %xmm0, %xmm0 +; AVX1-NEXT: vmovq %xmm0, %rcx +; AVX1-NEXT: xorl %esi, %esi +; AVX1-NEXT: movq %rcx, %rdi +; AVX1-NEXT: subq %rax, %rdi +; AVX1-NEXT: setns %sil +; AVX1-NEXT: movabsq $9223372036854775807, %r8 # imm = 0x7FFFFFFFFFFFFFFF +; AVX1-NEXT: addq %r8, %rsi +; AVX1-NEXT: subq %rax, %rcx +; AVX1-NEXT: cmovoq %rsi, %rcx +; AVX1-NEXT: vpextrq $1, %xmm1, %r9 +; AVX1-NEXT: vpextrq $1, %xmm0, %rax +; AVX1-NEXT: xorl %edi, %edi +; AVX1-NEXT: movq %rax, %rsi +; AVX1-NEXT: subq %r9, %rsi +; AVX1-NEXT: setns %dil +; AVX1-NEXT: addq %r8, %rdi +; AVX1-NEXT: subq %r9, %rax +; AVX1-NEXT: cmovoq %rdi, %rax +; AVX1-NEXT: vmovq %rax, %xmm0 +; AVX1-NEXT: vmovq %rcx, %xmm1 +; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[7,15,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX1-NEXT: vpextrw $0, %xmm0, (%rdx) +; AVX1-NEXT: retq +; +; AVX2-LABEL: v2i8: +; AVX2: # %bb.0: +; AVX2-NEXT: vpmovzxbq {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero +; AVX2-NEXT: vpmovzxbq {{.*#+}} xmm1 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero +; AVX2-NEXT: vpsllq $56, %xmm1, %xmm1 +; AVX2-NEXT: vmovq %xmm1, %rax +; AVX2-NEXT: vpsllq $56, %xmm0, %xmm0 +; AVX2-NEXT: vmovq %xmm0, %rcx +; AVX2-NEXT: xorl %esi, %esi +; AVX2-NEXT: movq %rcx, %rdi +; AVX2-NEXT: subq %rax, %rdi +; AVX2-NEXT: setns %sil +; AVX2-NEXT: movabsq $9223372036854775807, %r8 # imm = 0x7FFFFFFFFFFFFFFF +; AVX2-NEXT: addq %r8, %rsi +; AVX2-NEXT: subq %rax, %rcx +; AVX2-NEXT: cmovoq %rsi, %rcx +; AVX2-NEXT: vpextrq $1, %xmm1, %r9 +; AVX2-NEXT: vpextrq $1, %xmm0, %rax +; AVX2-NEXT: xorl %edi, %edi +; AVX2-NEXT: movq %rax, %rsi +; AVX2-NEXT: subq %r9, %rsi +; AVX2-NEXT: setns %dil +; AVX2-NEXT: addq %r8, %rdi +; AVX2-NEXT: subq %r9, %rax +; AVX2-NEXT: cmovoq %rdi, %rax +; AVX2-NEXT: vmovq %rax, %xmm0 +; AVX2-NEXT: vmovq %rcx, %xmm1 +; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[7,15,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX2-NEXT: vpextrw $0, %xmm0, (%rdx) +; AVX2-NEXT: retq +; +; AVX512-LABEL: v2i8: +; AVX512: # %bb.0: +; AVX512-NEXT: movzwl (%rdi), %eax +; AVX512-NEXT: vmovd %eax, %xmm0 +; 
AVX512-NEXT: movzwl (%rsi), %eax +; AVX512-NEXT: vmovd %eax, %xmm1 +; AVX512-NEXT: vmovdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,1] +; AVX512-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX512-NEXT: vmovq %xmm1, %rax +; AVX512-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX512-NEXT: vmovq %xmm0, %rcx +; AVX512-NEXT: xorl %esi, %esi +; AVX512-NEXT: movq %rcx, %rdi +; AVX512-NEXT: subq %rax, %rdi +; AVX512-NEXT: setns %sil +; AVX512-NEXT: movabsq $9223372036854775807, %r8 # imm = 0x7FFFFFFFFFFFFFFF +; AVX512-NEXT: addq %r8, %rsi +; AVX512-NEXT: subq %rax, %rcx +; AVX512-NEXT: cmovoq %rsi, %rcx +; AVX512-NEXT: vpextrq $1, %xmm1, %r9 +; AVX512-NEXT: vpextrq $1, %xmm0, %rax +; AVX512-NEXT: xorl %edi, %edi +; AVX512-NEXT: movq %rax, %rsi +; AVX512-NEXT: subq %r9, %rsi +; AVX512-NEXT: setns %dil +; AVX512-NEXT: addq %r8, %rdi +; AVX512-NEXT: subq %r9, %rax +; AVX512-NEXT: cmovoq %rdi, %rax +; AVX512-NEXT: vmovq %rax, %xmm0 +; AVX512-NEXT: vmovq %rcx, %xmm1 +; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; AVX512-NEXT: vpsrlq $56, %xmm0, %xmm0 +; AVX512-NEXT: vpmovqb %xmm0, (%rdx) +; AVX512-NEXT: retq + %x = load <2 x i8>, <2 x i8>* %px + %y = load <2 x i8>, <2 x i8>* %py + %z = call <2 x i8> @llvm.ssub.sat.v2i8(<2 x i8> %x, <2 x i8> %y) + store <2 x i8> %z, <2 x i8>* %pz + ret void +} + +define void @v4i16(<4 x i16>* %px, <4 x i16>* %py, <4 x i16>* %pz) nounwind { +; SSE2-LABEL: v4i16: +; SSE2: # %bb.0: +; SSE2-NEXT: movq {{.*#+}} xmm2 = mem[0],zero +; SSE2-NEXT: movq {{.*#+}} xmm3 = mem[0],zero +; SSE2-NEXT: pxor %xmm0, %xmm0 +; SSE2-NEXT: pxor %xmm1, %xmm1 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3] +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,1,2,3] +; SSE2-NEXT: movd %xmm3, %ecx +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,2,3] +; SSE2-NEXT: movd %xmm2, %r8d +; SSE2-NEXT: xorl %esi, %esi +; SSE2-NEXT: movl %r8d, %edi +; SSE2-NEXT: subl %ecx, %edi +; SSE2-NEXT: setns %sil +; SSE2-NEXT: addl $2147483647, %esi # imm = 0x7FFFFFFF +; SSE2-NEXT: subl %ecx, %r8d +; SSE2-NEXT: cmovol %esi, %r8d +; SSE2-NEXT: movd %xmm1, %esi +; SSE2-NEXT: movd %xmm0, %r10d +; SSE2-NEXT: xorl %edi, %edi +; SSE2-NEXT: movl %r10d, %ecx +; SSE2-NEXT: subl %esi, %ecx +; SSE2-NEXT: setns %dil +; SSE2-NEXT: addl $2147483647, %edi # imm = 0x7FFFFFFF +; SSE2-NEXT: subl %esi, %r10d +; SSE2-NEXT: cmovol %edi, %r10d +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1] +; SSE2-NEXT: movd %xmm2, %r9d +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1] +; SSE2-NEXT: movd %xmm2, %ecx +; SSE2-NEXT: xorl %edi, %edi +; SSE2-NEXT: movl %ecx, %esi +; SSE2-NEXT: subl %r9d, %esi +; SSE2-NEXT: setns %dil +; SSE2-NEXT: addl $2147483647, %edi # imm = 0x7FFFFFFF +; SSE2-NEXT: subl %r9d, %ecx +; SSE2-NEXT: cmovol %edi, %ecx +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,1,2,3] +; SSE2-NEXT: movd %xmm1, %r9d +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3] +; SSE2-NEXT: movd %xmm0, %eax +; SSE2-NEXT: xorl %esi, %esi +; SSE2-NEXT: movl %eax, %edi +; SSE2-NEXT: subl %r9d, %edi +; SSE2-NEXT: setns %sil +; SSE2-NEXT: addl $2147483647, %esi # imm = 0x7FFFFFFF +; SSE2-NEXT: subl %r9d, %eax +; SSE2-NEXT: cmovol %esi, %eax +; SSE2-NEXT: movd %eax, %xmm0 +; SSE2-NEXT: movd %ecx, %xmm1 +; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE2-NEXT: movd %r10d, %xmm0 +; SSE2-NEXT: movd %r8d, %xmm2 +; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = 
xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSE2-NEXT: psrld $16, %xmm0 +; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] +; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; SSE2-NEXT: movq %xmm0, (%rdx) +; SSE2-NEXT: retq +; +; SSSE3-LABEL: v4i16: +; SSSE3: # %bb.0: +; SSSE3-NEXT: movq {{.*#+}} xmm2 = mem[0],zero +; SSSE3-NEXT: movq {{.*#+}} xmm3 = mem[0],zero +; SSSE3-NEXT: pxor %xmm0, %xmm0 +; SSSE3-NEXT: pxor %xmm1, %xmm1 +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3] +; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,1,2,3] +; SSSE3-NEXT: movd %xmm3, %ecx +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,2,3] +; SSSE3-NEXT: movd %xmm2, %r8d +; SSSE3-NEXT: xorl %esi, %esi +; SSSE3-NEXT: movl %r8d, %edi +; SSSE3-NEXT: subl %ecx, %edi +; SSSE3-NEXT: setns %sil +; SSSE3-NEXT: addl $2147483647, %esi # imm = 0x7FFFFFFF +; SSSE3-NEXT: subl %ecx, %r8d +; SSSE3-NEXT: cmovol %esi, %r8d +; SSSE3-NEXT: movd %xmm1, %esi +; SSSE3-NEXT: movd %xmm0, %r10d +; SSSE3-NEXT: xorl %edi, %edi +; SSSE3-NEXT: movl %r10d, %ecx +; SSSE3-NEXT: subl %esi, %ecx +; SSSE3-NEXT: setns %dil +; SSSE3-NEXT: addl $2147483647, %edi # imm = 0x7FFFFFFF +; SSSE3-NEXT: subl %esi, %r10d +; SSSE3-NEXT: cmovol %edi, %r10d +; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1] +; SSSE3-NEXT: movd %xmm2, %r9d +; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1] +; SSSE3-NEXT: movd %xmm2, %ecx +; SSSE3-NEXT: xorl %edi, %edi +; SSSE3-NEXT: movl %ecx, %esi +; SSSE3-NEXT: subl %r9d, %esi +; SSSE3-NEXT: setns %dil +; SSSE3-NEXT: addl $2147483647, %edi # imm = 0x7FFFFFFF +; SSSE3-NEXT: subl %r9d, %ecx +; SSSE3-NEXT: cmovol %edi, %ecx +; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,1,2,3] +; SSSE3-NEXT: movd %xmm1, %r9d +; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3] +; SSSE3-NEXT: movd %xmm0, %eax +; SSSE3-NEXT: xorl %esi, %esi +; SSSE3-NEXT: movl %eax, %edi +; SSSE3-NEXT: subl %r9d, %edi +; SSSE3-NEXT: setns %sil +; SSSE3-NEXT: addl $2147483647, %esi # imm = 0x7FFFFFFF +; SSSE3-NEXT: subl %r9d, %eax +; SSSE3-NEXT: cmovol %esi, %eax +; SSSE3-NEXT: movd %eax, %xmm0 +; SSSE3-NEXT: movd %ecx, %xmm1 +; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSSE3-NEXT: movd %r10d, %xmm0 +; SSSE3-NEXT: movd %r8d, %xmm2 +; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[2,3,6,7,10,11,14,15,10,11,14,15,14,15],zero,zero +; SSSE3-NEXT: movq %xmm0, (%rdx) +; SSSE3-NEXT: retq +; +; SSE41-LABEL: v4i16: +; SSE41: # %bb.0: +; SSE41-NEXT: movq {{.*#+}} xmm2 = mem[0],zero +; SSE41-NEXT: movq {{.*#+}} xmm3 = mem[0],zero +; SSE41-NEXT: pxor %xmm0, %xmm0 +; SSE41-NEXT: pxor %xmm1, %xmm1 +; SSE41-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3] +; SSE41-NEXT: pextrd $3, %xmm1, %ecx +; SSE41-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; SSE41-NEXT: pextrd $3, %xmm0, %r8d +; SSE41-NEXT: xorl %esi, %esi +; SSE41-NEXT: movl %r8d, %edi +; SSE41-NEXT: subl %ecx, %edi +; SSE41-NEXT: setns %sil +; SSE41-NEXT: addl $2147483647, %esi # imm = 0x7FFFFFFF +; SSE41-NEXT: subl %ecx, %r8d +; SSE41-NEXT: cmovol %esi, %r8d +; SSE41-NEXT: pextrd $2, %xmm1, %esi +; 
SSE41-NEXT: pextrd $2, %xmm0, %r10d +; SSE41-NEXT: xorl %edi, %edi +; SSE41-NEXT: movl %r10d, %ecx +; SSE41-NEXT: subl %esi, %ecx +; SSE41-NEXT: setns %dil +; SSE41-NEXT: addl $2147483647, %edi # imm = 0x7FFFFFFF +; SSE41-NEXT: subl %esi, %r10d +; SSE41-NEXT: cmovol %edi, %r10d +; SSE41-NEXT: movd %xmm1, %r9d +; SSE41-NEXT: movd %xmm0, %ecx +; SSE41-NEXT: xorl %edi, %edi +; SSE41-NEXT: movl %ecx, %esi +; SSE41-NEXT: subl %r9d, %esi +; SSE41-NEXT: setns %dil +; SSE41-NEXT: addl $2147483647, %edi # imm = 0x7FFFFFFF +; SSE41-NEXT: subl %r9d, %ecx +; SSE41-NEXT: cmovol %edi, %ecx +; SSE41-NEXT: pextrd $1, %xmm1, %r9d +; SSE41-NEXT: pextrd $1, %xmm0, %eax +; SSE41-NEXT: xorl %esi, %esi +; SSE41-NEXT: movl %eax, %edi +; SSE41-NEXT: subl %r9d, %edi +; SSE41-NEXT: setns %sil +; SSE41-NEXT: addl $2147483647, %esi # imm = 0x7FFFFFFF +; SSE41-NEXT: subl %r9d, %eax +; SSE41-NEXT: cmovol %esi, %eax +; SSE41-NEXT: movd %ecx, %xmm0 +; SSE41-NEXT: pinsrd $1, %eax, %xmm0 +; SSE41-NEXT: pinsrd $2, %r10d, %xmm0 +; SSE41-NEXT: pinsrd $3, %r8d, %xmm0 +; SSE41-NEXT: psrld $16, %xmm0 +; SSE41-NEXT: packusdw %xmm0, %xmm0 +; SSE41-NEXT: movq %xmm0, (%rdx) +; SSE41-NEXT: retq +; +; AVX1-LABEL: v4i16: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero +; AVX1-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] +; AVX1-NEXT: vpextrd $3, %xmm0, %ecx +; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; AVX1-NEXT: vpextrd $3, %xmm1, %r9d +; AVX1-NEXT: xorl %esi, %esi +; AVX1-NEXT: movl %r9d, %edi +; AVX1-NEXT: subl %ecx, %edi +; AVX1-NEXT: setns %sil +; AVX1-NEXT: addl $2147483647, %esi # imm = 0x7FFFFFFF +; AVX1-NEXT: subl %ecx, %r9d +; AVX1-NEXT: cmovol %esi, %r9d +; AVX1-NEXT: vpextrd $2, %xmm0, %r8d +; AVX1-NEXT: vpextrd $2, %xmm1, %r10d +; AVX1-NEXT: xorl %edi, %edi +; AVX1-NEXT: movl %r10d, %ecx +; AVX1-NEXT: subl %r8d, %ecx +; AVX1-NEXT: setns %dil +; AVX1-NEXT: addl $2147483647, %edi # imm = 0x7FFFFFFF +; AVX1-NEXT: subl %r8d, %r10d +; AVX1-NEXT: cmovol %edi, %r10d +; AVX1-NEXT: vmovd %xmm0, %r8d +; AVX1-NEXT: vmovd %xmm1, %eax +; AVX1-NEXT: xorl %ecx, %ecx +; AVX1-NEXT: movl %eax, %edi +; AVX1-NEXT: subl %r8d, %edi +; AVX1-NEXT: setns %cl +; AVX1-NEXT: addl $2147483647, %ecx # imm = 0x7FFFFFFF +; AVX1-NEXT: subl %r8d, %eax +; AVX1-NEXT: cmovol %ecx, %eax +; AVX1-NEXT: vpextrd $1, %xmm0, %r8d +; AVX1-NEXT: vpextrd $1, %xmm1, %esi +; AVX1-NEXT: xorl %ecx, %ecx +; AVX1-NEXT: movl %esi, %edi +; AVX1-NEXT: subl %r8d, %edi +; AVX1-NEXT: setns %cl +; AVX1-NEXT: addl $2147483647, %ecx # imm = 0x7FFFFFFF +; AVX1-NEXT: subl %r8d, %esi +; AVX1-NEXT: cmovol %ecx, %esi +; AVX1-NEXT: vmovd %eax, %xmm0 +; AVX1-NEXT: vpinsrd $1, %esi, %xmm0, %xmm0 +; AVX1-NEXT: vpinsrd $2, %r10d, %xmm0, %xmm0 +; AVX1-NEXT: vpinsrd $3, %r9d, %xmm0, %xmm0 +; AVX1-NEXT: vpsrld $16, %xmm0, %xmm0 +; AVX1-NEXT: vpackusdw %xmm0, %xmm0, %xmm0 +; AVX1-NEXT: vmovq %xmm0, (%rdx) +; AVX1-NEXT: retq +; +; AVX2-LABEL: v4i16: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero +; AVX2-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] +; AVX2-NEXT: vpextrd $3, %xmm0, %ecx +; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; AVX2-NEXT: vpextrd $3, %xmm1, %r9d 
+; AVX2-NEXT: xorl %esi, %esi +; AVX2-NEXT: movl %r9d, %edi +; AVX2-NEXT: subl %ecx, %edi +; AVX2-NEXT: setns %sil +; AVX2-NEXT: addl $2147483647, %esi # imm = 0x7FFFFFFF +; AVX2-NEXT: subl %ecx, %r9d +; AVX2-NEXT: cmovol %esi, %r9d +; AVX2-NEXT: vpextrd $2, %xmm0, %r8d +; AVX2-NEXT: vpextrd $2, %xmm1, %r10d +; AVX2-NEXT: xorl %edi, %edi +; AVX2-NEXT: movl %r10d, %ecx +; AVX2-NEXT: subl %r8d, %ecx +; AVX2-NEXT: setns %dil +; AVX2-NEXT: addl $2147483647, %edi # imm = 0x7FFFFFFF +; AVX2-NEXT: subl %r8d, %r10d +; AVX2-NEXT: cmovol %edi, %r10d +; AVX2-NEXT: vmovd %xmm0, %r8d +; AVX2-NEXT: vmovd %xmm1, %eax +; AVX2-NEXT: xorl %ecx, %ecx +; AVX2-NEXT: movl %eax, %edi +; AVX2-NEXT: subl %r8d, %edi +; AVX2-NEXT: setns %cl +; AVX2-NEXT: addl $2147483647, %ecx # imm = 0x7FFFFFFF +; AVX2-NEXT: subl %r8d, %eax +; AVX2-NEXT: cmovol %ecx, %eax +; AVX2-NEXT: vpextrd $1, %xmm0, %r8d +; AVX2-NEXT: vpextrd $1, %xmm1, %esi +; AVX2-NEXT: xorl %ecx, %ecx +; AVX2-NEXT: movl %esi, %edi +; AVX2-NEXT: subl %r8d, %edi +; AVX2-NEXT: setns %cl +; AVX2-NEXT: addl $2147483647, %ecx # imm = 0x7FFFFFFF +; AVX2-NEXT: subl %r8d, %esi +; AVX2-NEXT: cmovol %ecx, %esi +; AVX2-NEXT: vmovd %eax, %xmm0 +; AVX2-NEXT: vpinsrd $1, %esi, %xmm0, %xmm0 +; AVX2-NEXT: vpinsrd $2, %r10d, %xmm0, %xmm0 +; AVX2-NEXT: vpinsrd $3, %r9d, %xmm0, %xmm0 +; AVX2-NEXT: vpsrld $16, %xmm0, %xmm0 +; AVX2-NEXT: vpackusdw %xmm0, %xmm0, %xmm0 +; AVX2-NEXT: vmovq %xmm0, (%rdx) +; AVX2-NEXT: retq +; +; AVX512-LABEL: v4i16: +; AVX512: # %bb.0: +; AVX512-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero +; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; AVX512-NEXT: vmovdqa {{.*#+}} xmm2 = [255,255,0,1,255,255,2,3,255,255,4,5,255,255,6,7] +; AVX512-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX512-NEXT: vpextrd $3, %xmm0, %ecx +; AVX512-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX512-NEXT: vpextrd $3, %xmm1, %r9d +; AVX512-NEXT: xorl %esi, %esi +; AVX512-NEXT: movl %r9d, %edi +; AVX512-NEXT: subl %ecx, %edi +; AVX512-NEXT: setns %sil +; AVX512-NEXT: addl $2147483647, %esi # imm = 0x7FFFFFFF +; AVX512-NEXT: subl %ecx, %r9d +; AVX512-NEXT: cmovol %esi, %r9d +; AVX512-NEXT: vpextrd $2, %xmm0, %r8d +; AVX512-NEXT: vpextrd $2, %xmm1, %r10d +; AVX512-NEXT: xorl %edi, %edi +; AVX512-NEXT: movl %r10d, %ecx +; AVX512-NEXT: subl %r8d, %ecx +; AVX512-NEXT: setns %dil +; AVX512-NEXT: addl $2147483647, %edi # imm = 0x7FFFFFFF +; AVX512-NEXT: subl %r8d, %r10d +; AVX512-NEXT: cmovol %edi, %r10d +; AVX512-NEXT: vmovd %xmm0, %r8d +; AVX512-NEXT: vmovd %xmm1, %eax +; AVX512-NEXT: xorl %ecx, %ecx +; AVX512-NEXT: movl %eax, %edi +; AVX512-NEXT: subl %r8d, %edi +; AVX512-NEXT: setns %cl +; AVX512-NEXT: addl $2147483647, %ecx # imm = 0x7FFFFFFF +; AVX512-NEXT: subl %r8d, %eax +; AVX512-NEXT: cmovol %ecx, %eax +; AVX512-NEXT: vpextrd $1, %xmm0, %r8d +; AVX512-NEXT: vpextrd $1, %xmm1, %esi +; AVX512-NEXT: xorl %ecx, %ecx +; AVX512-NEXT: movl %esi, %edi +; AVX512-NEXT: subl %r8d, %edi +; AVX512-NEXT: setns %cl +; AVX512-NEXT: addl $2147483647, %ecx # imm = 0x7FFFFFFF +; AVX512-NEXT: subl %r8d, %esi +; AVX512-NEXT: cmovol %ecx, %esi +; AVX512-NEXT: vmovd %eax, %xmm0 +; AVX512-NEXT: vpinsrd $1, %esi, %xmm0, %xmm0 +; AVX512-NEXT: vpinsrd $2, %r10d, %xmm0, %xmm0 +; AVX512-NEXT: vpinsrd $3, %r9d, %xmm0, %xmm0 +; AVX512-NEXT: vpsrld $16, %xmm0, %xmm0 +; AVX512-NEXT: vpmovdw %xmm0, (%rdx) +; AVX512-NEXT: retq + %x = load <4 x i16>, <4 x i16>* %px + %y = load <4 x i16>, <4 x i16>* %py + %z = call <4 x i16> @llvm.ssub.sat.v4i16(<4 x i16> %x, <4 x i16> %y) + store <4 x i16> %z, <4 x i16>* %pz + ret void +} + 
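; Illustrative sketch only, not one of the autogenerated tests in this patch: a
; non-power-of-two element count such as <3 x i16> is another "too narrow" case
; that would be legalized by widening the vector result before the saturating
; operation is expanded. The function name and the <3 x i16> type below are
; hypothetical examples chosen for illustration, not taken from this commit.
declare <3 x i16> @llvm.ssub.sat.v3i16(<3 x i16>, <3 x i16>)

define <3 x i16> @v3i16_sketch(<3 x i16> %x, <3 x i16> %y) nounwind {
  %z = call <3 x i16> @llvm.ssub.sat.v3i16(<3 x i16> %x, <3 x i16> %y)
  ret <3 x i16> %z
}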
+define void @v2i16(<2 x i16>* %px, <2 x i16>* %py, <2 x i16>* %pz) nounwind { +; SSE2-LABEL: v2i16: +; SSE2: # %bb.0: +; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3] +; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,6,7] +; SSE2-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,3] +; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,6,7] +; SSE2-NEXT: psllq $48, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1] +; SSE2-NEXT: movq %xmm2, %rax +; SSE2-NEXT: psllq $48, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1] +; SSE2-NEXT: movq %xmm2, %rcx +; SSE2-NEXT: xorl %esi, %esi +; SSE2-NEXT: movq %rcx, %rdi +; SSE2-NEXT: subq %rax, %rdi +; SSE2-NEXT: setns %sil +; SSE2-NEXT: movabsq $9223372036854775807, %r8 # imm = 0x7FFFFFFFFFFFFFFF +; SSE2-NEXT: addq %r8, %rsi +; SSE2-NEXT: subq %rax, %rcx +; SSE2-NEXT: cmovoq %rsi, %rcx +; SSE2-NEXT: movq %xmm1, %r9 +; SSE2-NEXT: movq %xmm0, %rax +; SSE2-NEXT: xorl %edi, %edi +; SSE2-NEXT: movq %rax, %rsi +; SSE2-NEXT: subq %r9, %rsi +; SSE2-NEXT: setns %dil +; SSE2-NEXT: addq %r8, %rdi +; SSE2-NEXT: subq %r9, %rax +; SSE2-NEXT: cmovoq %rdi, %rax +; SSE2-NEXT: movq %rax, %xmm0 +; SSE2-NEXT: movq %rcx, %xmm1 +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSE2-NEXT: psrlq $48, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] +; SSE2-NEXT: movd %xmm0, (%rdx) +; SSE2-NEXT: retq +; +; SSSE3-LABEL: v2i16: +; SSSE3: # %bb.0: +; SSSE3-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; SSSE3-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,0,1,255,255,255,255,255,255,2,3] +; SSSE3-NEXT: pshufb %xmm2, %xmm1 +; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,3,0,1] +; SSSE3-NEXT: movq %xmm3, %rax +; SSSE3-NEXT: pshufb %xmm2, %xmm0 +; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1] +; SSSE3-NEXT: movq %xmm2, %rcx +; SSSE3-NEXT: xorl %esi, %esi +; SSSE3-NEXT: movq %rcx, %rdi +; SSSE3-NEXT: subq %rax, %rdi +; SSSE3-NEXT: setns %sil +; SSSE3-NEXT: movabsq $9223372036854775807, %r8 # imm = 0x7FFFFFFFFFFFFFFF +; SSSE3-NEXT: addq %r8, %rsi +; SSSE3-NEXT: subq %rax, %rcx +; SSSE3-NEXT: cmovoq %rsi, %rcx +; SSSE3-NEXT: movq %xmm1, %r9 +; SSSE3-NEXT: movq %xmm0, %rax +; SSSE3-NEXT: xorl %edi, %edi +; SSSE3-NEXT: movq %rax, %rsi +; SSSE3-NEXT: subq %r9, %rsi +; SSSE3-NEXT: setns %dil +; SSSE3-NEXT: addq %r8, %rdi +; SSSE3-NEXT: subq %r9, %rax +; SSSE3-NEXT: cmovoq %rdi, %rax +; SSSE3-NEXT: movq %rax, %xmm0 +; SSSE3-NEXT: movq %rcx, %xmm1 +; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[6,7,14,15,14,15],zero,zero,xmm0[14,15],zero,zero,zero,zero,zero,zero +; SSSE3-NEXT: movd %xmm0, (%rdx) +; SSSE3-NEXT: retq +; +; SSE41-LABEL: v2i16: +; SSE41: # %bb.0: +; SSE41-NEXT: pmovzxwq {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero +; SSE41-NEXT: pmovzxwq {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero +; SSE41-NEXT: psllq $48, %xmm1 +; SSE41-NEXT: movq %xmm1, %rax +; SSE41-NEXT: psllq $48, %xmm0 +; SSE41-NEXT: movq %xmm0, %rcx +; SSE41-NEXT: xorl %esi, %esi +; SSE41-NEXT: movq %rcx, %rdi +; SSE41-NEXT: subq %rax, %rdi +; SSE41-NEXT: setns %sil +; SSE41-NEXT: movabsq $9223372036854775807, %r8 # imm = 0x7FFFFFFFFFFFFFFF +; SSE41-NEXT: addq %r8, %rsi +; SSE41-NEXT: subq %rax, %rcx +; SSE41-NEXT: cmovoq %rsi, %rcx +; SSE41-NEXT: pextrq $1, %xmm1, %r9 
+; SSE41-NEXT: pextrq $1, %xmm0, %rax +; SSE41-NEXT: xorl %edi, %edi +; SSE41-NEXT: movq %rax, %rsi +; SSE41-NEXT: subq %r9, %rsi +; SSE41-NEXT: setns %dil +; SSE41-NEXT: addq %r8, %rdi +; SSE41-NEXT: subq %r9, %rax +; SSE41-NEXT: cmovoq %rdi, %rax +; SSE41-NEXT: movq %rax, %xmm0 +; SSE41-NEXT: movq %rcx, %xmm1 +; SSE41-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] +; SSE41-NEXT: pshufb {{.*#+}} xmm1 = xmm1[6,7,14,15,14,15],zero,zero,xmm1[14,15],zero,zero,zero,zero,zero,zero +; SSE41-NEXT: movd %xmm1, (%rdx) +; SSE41-NEXT: retq +; +; AVX1-LABEL: v2i16: +; AVX1: # %bb.0: +; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero +; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero +; AVX1-NEXT: vpsllq $48, %xmm1, %xmm1 +; AVX1-NEXT: vmovq %xmm1, %rax +; AVX1-NEXT: vpsllq $48, %xmm0, %xmm0 +; AVX1-NEXT: vmovq %xmm0, %rcx +; AVX1-NEXT: xorl %esi, %esi +; AVX1-NEXT: movq %rcx, %rdi +; AVX1-NEXT: subq %rax, %rdi +; AVX1-NEXT: setns %sil +; AVX1-NEXT: movabsq $9223372036854775807, %r8 # imm = 0x7FFFFFFFFFFFFFFF +; AVX1-NEXT: addq %r8, %rsi +; AVX1-NEXT: subq %rax, %rcx +; AVX1-NEXT: cmovoq %rsi, %rcx +; AVX1-NEXT: vpextrq $1, %xmm1, %r9 +; AVX1-NEXT: vpextrq $1, %xmm0, %rax +; AVX1-NEXT: xorl %edi, %edi +; AVX1-NEXT: movq %rax, %rsi +; AVX1-NEXT: subq %r9, %rsi +; AVX1-NEXT: setns %dil +; AVX1-NEXT: addq %r8, %rdi +; AVX1-NEXT: subq %r9, %rax +; AVX1-NEXT: cmovoq %rdi, %rax +; AVX1-NEXT: vmovq %rax, %xmm0 +; AVX1-NEXT: vmovq %rcx, %xmm1 +; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[6,7,14,15,14,15],zero,zero,xmm0[14,15],zero,zero,zero,zero,zero,zero +; AVX1-NEXT: vmovd %xmm0, (%rdx) +; AVX1-NEXT: retq +; +; AVX2-LABEL: v2i16: +; AVX2: # %bb.0: +; AVX2-NEXT: vpmovzxwq {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero +; AVX2-NEXT: vpmovzxwq {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero +; AVX2-NEXT: vpsllq $48, %xmm1, %xmm1 +; AVX2-NEXT: vmovq %xmm1, %rax +; AVX2-NEXT: vpsllq $48, %xmm0, %xmm0 +; AVX2-NEXT: vmovq %xmm0, %rcx +; AVX2-NEXT: xorl %esi, %esi +; AVX2-NEXT: movq %rcx, %rdi +; AVX2-NEXT: subq %rax, %rdi +; AVX2-NEXT: setns %sil +; AVX2-NEXT: movabsq $9223372036854775807, %r8 # imm = 0x7FFFFFFFFFFFFFFF +; AVX2-NEXT: addq %r8, %rsi +; AVX2-NEXT: subq %rax, %rcx +; AVX2-NEXT: cmovoq %rsi, %rcx +; AVX2-NEXT: vpextrq $1, %xmm1, %r9 +; AVX2-NEXT: vpextrq $1, %xmm0, %rax +; AVX2-NEXT: xorl %edi, %edi +; AVX2-NEXT: movq %rax, %rsi +; AVX2-NEXT: subq %r9, %rsi +; AVX2-NEXT: setns %dil +; AVX2-NEXT: addq %r8, %rdi +; AVX2-NEXT: subq %r9, %rax +; AVX2-NEXT: cmovoq %rdi, %rax +; AVX2-NEXT: vmovq %rax, %xmm0 +; AVX2-NEXT: vmovq %rcx, %xmm1 +; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[6,7,14,15,14,15],zero,zero,xmm0[14,15],zero,zero,zero,zero,zero,zero +; AVX2-NEXT: vmovd %xmm0, (%rdx) +; AVX2-NEXT: retq +; +; AVX512-LABEL: v2i16: +; AVX512: # %bb.0: +; AVX512-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX512-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; AVX512-NEXT: vmovdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,0,1,255,255,255,255,255,255,2,3] +; AVX512-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX512-NEXT: vmovq %xmm1, %rax +; AVX512-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX512-NEXT: vmovq %xmm0, %rcx +; AVX512-NEXT: xorl %esi, %esi +; AVX512-NEXT: movq %rcx, %rdi +; AVX512-NEXT: subq %rax, %rdi +; AVX512-NEXT: setns %sil +; AVX512-NEXT: movabsq $9223372036854775807, %r8 # imm = 
0x7FFFFFFFFFFFFFFF +; AVX512-NEXT: addq %r8, %rsi +; AVX512-NEXT: subq %rax, %rcx +; AVX512-NEXT: cmovoq %rsi, %rcx +; AVX512-NEXT: vpextrq $1, %xmm1, %r9 +; AVX512-NEXT: vpextrq $1, %xmm0, %rax +; AVX512-NEXT: xorl %edi, %edi +; AVX512-NEXT: movq %rax, %rsi +; AVX512-NEXT: subq %r9, %rsi +; AVX512-NEXT: setns %dil +; AVX512-NEXT: addq %r8, %rdi +; AVX512-NEXT: subq %r9, %rax +; AVX512-NEXT: cmovoq %rdi, %rax +; AVX512-NEXT: vmovq %rax, %xmm0 +; AVX512-NEXT: vmovq %rcx, %xmm1 +; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; AVX512-NEXT: vpsrlq $48, %xmm0, %xmm0 +; AVX512-NEXT: vpmovqw %xmm0, (%rdx) +; AVX512-NEXT: retq + %x = load <2 x i16>, <2 x i16>* %px + %y = load <2 x i16>, <2 x i16>* %py + %z = call <2 x i16> @llvm.ssub.sat.v2i16(<2 x i16> %x, <2 x i16> %y) + store <2 x i16> %z, <2 x i16>* %pz + ret void +} + +define <12 x i8> @v12i8(<12 x i8> %x, <12 x i8> %y) nounwind { +; SSE2-LABEL: v12i8: +; SSE2: # %bb.0: +; SSE2-NEXT: pushq %rbp +; SSE2-NEXT: pushq %r15 +; SSE2-NEXT: pushq %r14 +; SSE2-NEXT: pushq %r13 +; SSE2-NEXT: pushq %r12 +; SSE2-NEXT: pushq %rbx +; SSE2-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) +; SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %r9b +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %sil +; SSE2-NEXT: movl %r9d, %eax +; SSE2-NEXT: subb %cl, %al +; SSE2-NEXT: setns %al +; SSE2-NEXT: subb %cl, %r9b +; SSE2-NEXT: jno .LBB11_2 +; SSE2-NEXT: # %bb.1: +; SSE2-NEXT: addb $127, %al +; SSE2-NEXT: movl %eax, %r9d +; SSE2-NEXT: .LBB11_2: +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSE2-NEXT: movl %esi, %eax +; SSE2-NEXT: subb %cl, %al +; SSE2-NEXT: setns %al +; SSE2-NEXT: subb %cl, %sil +; SSE2-NEXT: jno .LBB11_4 +; SSE2-NEXT: # %bb.3: +; SSE2-NEXT: addb $127, %al +; SSE2-NEXT: movl %eax, %esi +; SSE2-NEXT: .LBB11_4: +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %bl +; SSE2-NEXT: movl %ebx, %eax +; SSE2-NEXT: subb %cl, %al +; SSE2-NEXT: setns %al +; SSE2-NEXT: subb %cl, %bl +; SSE2-NEXT: jno .LBB11_6 +; SSE2-NEXT: # %bb.5: +; SSE2-NEXT: addb $127, %al +; SSE2-NEXT: movl %eax, %ebx +; SSE2-NEXT: .LBB11_6: +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %dl +; SSE2-NEXT: movl %edx, %eax +; SSE2-NEXT: subb %cl, %al +; SSE2-NEXT: setns %al +; SSE2-NEXT: subb %cl, %dl +; SSE2-NEXT: jno .LBB11_8 +; SSE2-NEXT: # %bb.7: +; SSE2-NEXT: addb $127, %al +; SSE2-NEXT: movl %eax, %edx +; SSE2-NEXT: .LBB11_8: +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %r10b +; SSE2-NEXT: movl %r10d, %eax +; SSE2-NEXT: subb %cl, %al +; SSE2-NEXT: setns %al +; SSE2-NEXT: subb %cl, %r10b +; SSE2-NEXT: jno .LBB11_10 +; SSE2-NEXT: # %bb.9: +; SSE2-NEXT: addb $127, %al +; SSE2-NEXT: movl %eax, %r10d +; SSE2-NEXT: .LBB11_10: +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %r11b +; SSE2-NEXT: movl %r11d, %eax +; SSE2-NEXT: subb %cl, %al +; SSE2-NEXT: setns %al +; SSE2-NEXT: subb %cl, %r11b +; SSE2-NEXT: jno .LBB11_12 +; SSE2-NEXT: # %bb.11: +; SSE2-NEXT: addb $127, %al +; SSE2-NEXT: movl %eax, %r11d +; SSE2-NEXT: .LBB11_12: +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %bpl +; SSE2-NEXT: movl %ebp, %eax +; SSE2-NEXT: subb %cl, %al +; SSE2-NEXT: setns %al +; SSE2-NEXT: subb %cl, %bpl +; SSE2-NEXT: jno .LBB11_14 +; SSE2-NEXT: # %bb.13: +; SSE2-NEXT: addb $127, %al +; SSE2-NEXT: movl %eax, %ebp +; SSE2-NEXT: .LBB11_14: +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; 
SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %r14b +; SSE2-NEXT: movl %r14d, %eax +; SSE2-NEXT: subb %cl, %al +; SSE2-NEXT: setns %al +; SSE2-NEXT: subb %cl, %r14b +; SSE2-NEXT: jno .LBB11_16 +; SSE2-NEXT: # %bb.15: +; SSE2-NEXT: addb $127, %al +; SSE2-NEXT: movl %eax, %r14d +; SSE2-NEXT: .LBB11_16: +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %r15b +; SSE2-NEXT: movl %r15d, %eax +; SSE2-NEXT: subb %cl, %al +; SSE2-NEXT: setns %al +; SSE2-NEXT: subb %cl, %r15b +; SSE2-NEXT: jno .LBB11_18 +; SSE2-NEXT: # %bb.17: +; SSE2-NEXT: addb $127, %al +; SSE2-NEXT: movl %eax, %r15d +; SSE2-NEXT: .LBB11_18: +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %r12b +; SSE2-NEXT: movl %r12d, %eax +; SSE2-NEXT: subb %cl, %al +; SSE2-NEXT: setns %al +; SSE2-NEXT: subb %cl, %r12b +; SSE2-NEXT: jno .LBB11_20 +; SSE2-NEXT: # %bb.19: +; SSE2-NEXT: addb $127, %al +; SSE2-NEXT: movl %eax, %r12d +; SSE2-NEXT: .LBB11_20: +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %r13b +; SSE2-NEXT: movl %r13d, %eax +; SSE2-NEXT: subb %cl, %al +; SSE2-NEXT: setns %al +; SSE2-NEXT: subb %cl, %r13b +; SSE2-NEXT: movb %bl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSE2-NEXT: movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSE2-NEXT: jno .LBB11_22 +; SSE2-NEXT: # %bb.21: +; SSE2-NEXT: addb $127, %al +; SSE2-NEXT: movl %eax, %r13d +; SSE2-NEXT: .LBB11_22: +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %dil +; SSE2-NEXT: movl %edi, %eax +; SSE2-NEXT: subb %cl, %al +; SSE2-NEXT: setns %al +; SSE2-NEXT: subb %cl, %dil +; SSE2-NEXT: jno .LBB11_24 +; SSE2-NEXT: # %bb.23: +; SSE2-NEXT: addb $127, %al +; SSE2-NEXT: movl %eax, %edi +; SSE2-NEXT: .LBB11_24: +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %r8b +; SSE2-NEXT: movl %r8d, %eax +; SSE2-NEXT: subb %cl, %al +; SSE2-NEXT: setns %al +; SSE2-NEXT: subb %cl, %r8b +; SSE2-NEXT: jno .LBB11_26 +; SSE2-NEXT: # %bb.25: +; SSE2-NEXT: addb $127, %al +; SSE2-NEXT: movl %eax, %r8d +; SSE2-NEXT: .LBB11_26: +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %bl +; SSE2-NEXT: movl %ebx, %eax +; SSE2-NEXT: subb %cl, %al +; SSE2-NEXT: setns %al +; SSE2-NEXT: subb %cl, %bl +; SSE2-NEXT: jno .LBB11_28 +; SSE2-NEXT: # %bb.27: +; SSE2-NEXT: addb $127, %al +; SSE2-NEXT: movl %eax, %ebx +; SSE2-NEXT: .LBB11_28: +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %dl +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSE2-NEXT: movl %eax, %ecx +; SSE2-NEXT: subb %dl, %cl +; SSE2-NEXT: setns %cl +; SSE2-NEXT: subb %dl, %al +; SSE2-NEXT: movb %sil, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSE2-NEXT: jno .LBB11_30 +; SSE2-NEXT: # %bb.29: +; SSE2-NEXT: addb $127, %cl +; SSE2-NEXT: movl %ecx, %eax +; SSE2-NEXT: .LBB11_30: +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %dl +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %sil +; SSE2-NEXT: movl %esi, %ecx +; SSE2-NEXT: subb %dl, %cl +; SSE2-NEXT: setns %cl +; SSE2-NEXT: subb %dl, %sil +; SSE2-NEXT: jno .LBB11_32 +; SSE2-NEXT: # %bb.31: +; SSE2-NEXT: addb $127, %cl +; SSE2-NEXT: movl %ecx, %esi +; SSE2-NEXT: .LBB11_32: +; SSE2-NEXT: movzbl %sil, %ecx +; SSE2-NEXT: movd %ecx, %xmm0 +; SSE2-NEXT: movzbl %al, %eax +; SSE2-NEXT: movd %eax, %xmm1 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; SSE2-NEXT: movzbl %bl, %eax +; SSE2-NEXT: movd %eax, %xmm0 +; SSE2-NEXT: movzbl %r8b, %eax +; SSE2-NEXT: movd 
%eax, %xmm2 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; SSE2-NEXT: movzbl %dil, %eax +; SSE2-NEXT: movd %eax, %xmm0 +; SSE2-NEXT: movzbl %r13b, %eax +; SSE2-NEXT: movd %eax, %xmm3 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] +; SSE2-NEXT: movzbl %r12b, %eax +; SSE2-NEXT: movd %eax, %xmm0 +; SSE2-NEXT: movzbl %r15b, %eax +; SSE2-NEXT: movd %eax, %xmm1 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; SSE2-NEXT: movzbl %r14b, %eax +; SSE2-NEXT: movd %eax, %xmm0 +; SSE2-NEXT: movzbl %bpl, %eax +; SSE2-NEXT: movd %eax, %xmm2 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] +; SSE2-NEXT: movzbl %r11b, %eax +; SSE2-NEXT: movd %eax, %xmm0 +; SSE2-NEXT: movzbl %r10b, %eax +; SSE2-NEXT: movd %eax, %xmm3 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; SSE2-NEXT: movd %eax, %xmm0 +; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; SSE2-NEXT: movd %eax, %xmm2 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] +; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; SSE2-NEXT: movd %eax, %xmm4 +; SSE2-NEXT: movzbl %r9b, %eax +; SSE2-NEXT: movd %eax, %xmm0 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSE2-NEXT: popq %rbx +; SSE2-NEXT: popq %r12 +; SSE2-NEXT: popq %r13 +; SSE2-NEXT: popq %r14 +; SSE2-NEXT: popq %r15 +; SSE2-NEXT: popq %rbp +; SSE2-NEXT: retq +; +; SSSE3-LABEL: v12i8: +; SSSE3: # %bb.0: +; SSSE3-NEXT: pushq %rbp +; SSSE3-NEXT: pushq %r15 +; SSSE3-NEXT: pushq %r14 +; SSSE3-NEXT: pushq %r13 +; SSSE3-NEXT: pushq %r12 +; SSSE3-NEXT: pushq %rbx +; SSSE3-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) +; SSSE3-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %r9b +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %sil +; SSSE3-NEXT: movl %r9d, %eax +; SSSE3-NEXT: subb %cl, %al +; SSSE3-NEXT: setns %al +; SSSE3-NEXT: subb %cl, %r9b +; SSSE3-NEXT: jno .LBB11_2 +; SSSE3-NEXT: # %bb.1: +; SSSE3-NEXT: addb $127, %al +; SSSE3-NEXT: movl %eax, %r9d +; SSSE3-NEXT: .LBB11_2: +; SSSE3-NEXT: 
movb -{{[0-9]+}}(%rsp), %cl +; SSSE3-NEXT: movl %esi, %eax +; SSSE3-NEXT: subb %cl, %al +; SSSE3-NEXT: setns %al +; SSSE3-NEXT: subb %cl, %sil +; SSSE3-NEXT: jno .LBB11_4 +; SSSE3-NEXT: # %bb.3: +; SSSE3-NEXT: addb $127, %al +; SSSE3-NEXT: movl %eax, %esi +; SSSE3-NEXT: .LBB11_4: +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %bl +; SSSE3-NEXT: movl %ebx, %eax +; SSSE3-NEXT: subb %cl, %al +; SSSE3-NEXT: setns %al +; SSSE3-NEXT: subb %cl, %bl +; SSSE3-NEXT: jno .LBB11_6 +; SSSE3-NEXT: # %bb.5: +; SSSE3-NEXT: addb $127, %al +; SSSE3-NEXT: movl %eax, %ebx +; SSSE3-NEXT: .LBB11_6: +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %dl +; SSSE3-NEXT: movl %edx, %eax +; SSSE3-NEXT: subb %cl, %al +; SSSE3-NEXT: setns %al +; SSSE3-NEXT: subb %cl, %dl +; SSSE3-NEXT: jno .LBB11_8 +; SSSE3-NEXT: # %bb.7: +; SSSE3-NEXT: addb $127, %al +; SSSE3-NEXT: movl %eax, %edx +; SSSE3-NEXT: .LBB11_8: +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %r10b +; SSSE3-NEXT: movl %r10d, %eax +; SSSE3-NEXT: subb %cl, %al +; SSSE3-NEXT: setns %al +; SSSE3-NEXT: subb %cl, %r10b +; SSSE3-NEXT: jno .LBB11_10 +; SSSE3-NEXT: # %bb.9: +; SSSE3-NEXT: addb $127, %al +; SSSE3-NEXT: movl %eax, %r10d +; SSSE3-NEXT: .LBB11_10: +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %r11b +; SSSE3-NEXT: movl %r11d, %eax +; SSSE3-NEXT: subb %cl, %al +; SSSE3-NEXT: setns %al +; SSSE3-NEXT: subb %cl, %r11b +; SSSE3-NEXT: jno .LBB11_12 +; SSSE3-NEXT: # %bb.11: +; SSSE3-NEXT: addb $127, %al +; SSSE3-NEXT: movl %eax, %r11d +; SSSE3-NEXT: .LBB11_12: +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %bpl +; SSSE3-NEXT: movl %ebp, %eax +; SSSE3-NEXT: subb %cl, %al +; SSSE3-NEXT: setns %al +; SSSE3-NEXT: subb %cl, %bpl +; SSSE3-NEXT: jno .LBB11_14 +; SSSE3-NEXT: # %bb.13: +; SSSE3-NEXT: addb $127, %al +; SSSE3-NEXT: movl %eax, %ebp +; SSSE3-NEXT: .LBB11_14: +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %r14b +; SSSE3-NEXT: movl %r14d, %eax +; SSSE3-NEXT: subb %cl, %al +; SSSE3-NEXT: setns %al +; SSSE3-NEXT: subb %cl, %r14b +; SSSE3-NEXT: jno .LBB11_16 +; SSSE3-NEXT: # %bb.15: +; SSSE3-NEXT: addb $127, %al +; SSSE3-NEXT: movl %eax, %r14d +; SSSE3-NEXT: .LBB11_16: +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %r15b +; SSSE3-NEXT: movl %r15d, %eax +; SSSE3-NEXT: subb %cl, %al +; SSSE3-NEXT: setns %al +; SSSE3-NEXT: subb %cl, %r15b +; SSSE3-NEXT: jno .LBB11_18 +; SSSE3-NEXT: # %bb.17: +; SSSE3-NEXT: addb $127, %al +; SSSE3-NEXT: movl %eax, %r15d +; SSSE3-NEXT: .LBB11_18: +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %r12b +; SSSE3-NEXT: movl %r12d, %eax +; SSSE3-NEXT: subb %cl, %al +; SSSE3-NEXT: setns %al +; SSSE3-NEXT: subb %cl, %r12b +; SSSE3-NEXT: jno .LBB11_20 +; SSSE3-NEXT: # %bb.19: +; SSSE3-NEXT: addb $127, %al +; SSSE3-NEXT: movl %eax, %r12d +; SSSE3-NEXT: .LBB11_20: +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %r13b +; SSSE3-NEXT: movl %r13d, %eax +; SSSE3-NEXT: subb %cl, %al +; SSSE3-NEXT: setns %al +; SSSE3-NEXT: subb %cl, %r13b +; SSSE3-NEXT: movb %bl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSSE3-NEXT: movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSSE3-NEXT: jno .LBB11_22 +; SSSE3-NEXT: # %bb.21: +; SSSE3-NEXT: addb $127, %al +; SSSE3-NEXT: movl %eax, %r13d +; SSSE3-NEXT: .LBB11_22: +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl 
+; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %dil +; SSSE3-NEXT: movl %edi, %eax +; SSSE3-NEXT: subb %cl, %al +; SSSE3-NEXT: setns %al +; SSSE3-NEXT: subb %cl, %dil +; SSSE3-NEXT: jno .LBB11_24 +; SSSE3-NEXT: # %bb.23: +; SSSE3-NEXT: addb $127, %al +; SSSE3-NEXT: movl %eax, %edi +; SSSE3-NEXT: .LBB11_24: +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %r8b +; SSSE3-NEXT: movl %r8d, %eax +; SSSE3-NEXT: subb %cl, %al +; SSSE3-NEXT: setns %al +; SSSE3-NEXT: subb %cl, %r8b +; SSSE3-NEXT: jno .LBB11_26 +; SSSE3-NEXT: # %bb.25: +; SSSE3-NEXT: addb $127, %al +; SSSE3-NEXT: movl %eax, %r8d +; SSSE3-NEXT: .LBB11_26: +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %bl +; SSSE3-NEXT: movl %ebx, %eax +; SSSE3-NEXT: subb %cl, %al +; SSSE3-NEXT: setns %al +; SSSE3-NEXT: subb %cl, %bl +; SSSE3-NEXT: jno .LBB11_28 +; SSSE3-NEXT: # %bb.27: +; SSSE3-NEXT: addb $127, %al +; SSSE3-NEXT: movl %eax, %ebx +; SSSE3-NEXT: .LBB11_28: +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %dl +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSSE3-NEXT: movl %eax, %ecx +; SSSE3-NEXT: subb %dl, %cl +; SSSE3-NEXT: setns %cl +; SSSE3-NEXT: subb %dl, %al +; SSSE3-NEXT: movb %sil, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSSE3-NEXT: jno .LBB11_30 +; SSSE3-NEXT: # %bb.29: +; SSSE3-NEXT: addb $127, %cl +; SSSE3-NEXT: movl %ecx, %eax +; SSSE3-NEXT: .LBB11_30: +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %dl +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %sil +; SSSE3-NEXT: movl %esi, %ecx +; SSSE3-NEXT: subb %dl, %cl +; SSSE3-NEXT: setns %cl +; SSSE3-NEXT: subb %dl, %sil +; SSSE3-NEXT: jno .LBB11_32 +; SSSE3-NEXT: # %bb.31: +; SSSE3-NEXT: addb $127, %cl +; SSSE3-NEXT: movl %ecx, %esi +; SSSE3-NEXT: .LBB11_32: +; SSSE3-NEXT: movzbl %sil, %ecx +; SSSE3-NEXT: movd %ecx, %xmm0 +; SSSE3-NEXT: movzbl %al, %eax +; SSSE3-NEXT: movd %eax, %xmm1 +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; SSSE3-NEXT: movzbl %bl, %eax +; SSSE3-NEXT: movd %eax, %xmm0 +; SSSE3-NEXT: movzbl %r8b, %eax +; SSSE3-NEXT: movd %eax, %xmm2 +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; SSSE3-NEXT: movzbl %dil, %eax +; SSSE3-NEXT: movd %eax, %xmm0 +; SSSE3-NEXT: movzbl %r13b, %eax +; SSSE3-NEXT: movd %eax, %xmm3 +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] +; SSSE3-NEXT: movzbl %r12b, %eax +; SSSE3-NEXT: movd %eax, %xmm0 +; SSSE3-NEXT: movzbl %r15b, %eax +; SSSE3-NEXT: movd %eax, %xmm1 +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3] +; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; SSSE3-NEXT: movzbl %r14b, %eax +; SSSE3-NEXT: movd %eax, %xmm0 +; SSSE3-NEXT: movzbl %bpl, %eax +; SSSE3-NEXT: movd %eax, %xmm2 +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] +; SSSE3-NEXT: movzbl %r11b, %eax +; 
SSSE3-NEXT: movd %eax, %xmm0 +; SSSE3-NEXT: movzbl %r10b, %eax +; SSSE3-NEXT: movd %eax, %xmm3 +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; SSSE3-NEXT: movd %eax, %xmm0 +; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; SSSE3-NEXT: movd %eax, %xmm2 +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] +; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; SSSE3-NEXT: movd %eax, %xmm4 +; SSSE3-NEXT: movzbl %r9b, %eax +; SSSE3-NEXT: movd %eax, %xmm0 +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7] +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] +; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSSE3-NEXT: popq %rbx +; SSSE3-NEXT: popq %r12 +; SSSE3-NEXT: popq %r13 +; SSSE3-NEXT: popq %r14 +; SSSE3-NEXT: popq %r15 +; SSSE3-NEXT: popq %rbp +; SSSE3-NEXT: retq +; +; SSE41-LABEL: v12i8: +; SSE41: # %bb.0: +; SSE41-NEXT: pextrb $15, %xmm1, %ecx +; SSE41-NEXT: pextrb $15, %xmm0, %edx +; SSE41-NEXT: movl %edx, %eax +; SSE41-NEXT: subb %cl, %al +; SSE41-NEXT: setns %al +; SSE41-NEXT: subb %cl, %dl +; SSE41-NEXT: jno .LBB11_2 +; SSE41-NEXT: # %bb.1: +; SSE41-NEXT: addb $127, %al +; SSE41-NEXT: movl %eax, %edx +; SSE41-NEXT: .LBB11_2: +; SSE41-NEXT: pextrb $14, %xmm1, %ecx +; SSE41-NEXT: pextrb $14, %xmm0, %r11d +; SSE41-NEXT: movl %r11d, %eax +; SSE41-NEXT: subb %cl, %al +; SSE41-NEXT: setns %al +; SSE41-NEXT: subb %cl, %r11b +; SSE41-NEXT: jno .LBB11_4 +; SSE41-NEXT: # %bb.3: +; SSE41-NEXT: addb $127, %al +; SSE41-NEXT: movl %eax, %r11d +; SSE41-NEXT: .LBB11_4: +; SSE41-NEXT: pextrb $13, %xmm1, %ecx +; SSE41-NEXT: pextrb $13, %xmm0, %edi +; SSE41-NEXT: movl %edi, %eax +; SSE41-NEXT: subb %cl, %al +; SSE41-NEXT: setns %al +; SSE41-NEXT: subb %cl, %dil +; SSE41-NEXT: jno .LBB11_6 +; SSE41-NEXT: # %bb.5: +; SSE41-NEXT: addb $127, %al +; SSE41-NEXT: movl %eax, %edi +; SSE41-NEXT: .LBB11_6: +; SSE41-NEXT: pushq %rbp +; SSE41-NEXT: pushq %r15 +; SSE41-NEXT: pushq %r14 +; SSE41-NEXT: pushq %r13 +; SSE41-NEXT: pushq %r12 +; SSE41-NEXT: pushq %rbx +; SSE41-NEXT: pextrb $12, %xmm1, %ecx +; SSE41-NEXT: pextrb $12, %xmm0, %r14d +; SSE41-NEXT: movl %r14d, %eax +; SSE41-NEXT: subb %cl, %al +; SSE41-NEXT: setns %al +; SSE41-NEXT: subb %cl, %r14b +; SSE41-NEXT: jno .LBB11_8 +; SSE41-NEXT: # %bb.7: +; SSE41-NEXT: addb $127, %al +; SSE41-NEXT: movl %eax, %r14d +; SSE41-NEXT: .LBB11_8: +; SSE41-NEXT: pextrb $11, %xmm1, %ecx +; SSE41-NEXT: pextrb $11, %xmm0, %ebp +; SSE41-NEXT: movl %ebp, %eax +; SSE41-NEXT: subb %cl, %al +; SSE41-NEXT: setns %al +; SSE41-NEXT: subb %cl, %bpl +; SSE41-NEXT: jno .LBB11_10 +; SSE41-NEXT: # %bb.9: +; SSE41-NEXT: addb $127, %al +; SSE41-NEXT: movl %eax, %ebp +; SSE41-NEXT: .LBB11_10: +; SSE41-NEXT: pextrb $10, %xmm1, %ecx +; SSE41-NEXT: pextrb $10, %xmm0, %r15d +; SSE41-NEXT: movl %r15d, %eax +; SSE41-NEXT: subb %cl, %al +; SSE41-NEXT: setns %al +; SSE41-NEXT: subb 
%cl, %r15b +; SSE41-NEXT: jno .LBB11_12 +; SSE41-NEXT: # %bb.11: +; SSE41-NEXT: addb $127, %al +; SSE41-NEXT: movl %eax, %r15d +; SSE41-NEXT: .LBB11_12: +; SSE41-NEXT: pextrb $9, %xmm1, %ecx +; SSE41-NEXT: pextrb $9, %xmm0, %r12d +; SSE41-NEXT: movl %r12d, %eax +; SSE41-NEXT: subb %cl, %al +; SSE41-NEXT: setns %al +; SSE41-NEXT: subb %cl, %r12b +; SSE41-NEXT: jno .LBB11_14 +; SSE41-NEXT: # %bb.13: +; SSE41-NEXT: addb $127, %al +; SSE41-NEXT: movl %eax, %r12d +; SSE41-NEXT: .LBB11_14: +; SSE41-NEXT: pextrb $8, %xmm1, %ecx +; SSE41-NEXT: pextrb $8, %xmm0, %r13d +; SSE41-NEXT: movl %r13d, %eax +; SSE41-NEXT: subb %cl, %al +; SSE41-NEXT: setns %al +; SSE41-NEXT: subb %cl, %r13b +; SSE41-NEXT: jno .LBB11_16 +; SSE41-NEXT: # %bb.15: +; SSE41-NEXT: addb $127, %al +; SSE41-NEXT: movl %eax, %r13d +; SSE41-NEXT: .LBB11_16: +; SSE41-NEXT: pextrb $7, %xmm1, %ecx +; SSE41-NEXT: pextrb $7, %xmm0, %r10d +; SSE41-NEXT: movl %r10d, %eax +; SSE41-NEXT: subb %cl, %al +; SSE41-NEXT: setns %al +; SSE41-NEXT: subb %cl, %r10b +; SSE41-NEXT: movl %ebp, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE41-NEXT: jno .LBB11_18 +; SSE41-NEXT: # %bb.17: +; SSE41-NEXT: addb $127, %al +; SSE41-NEXT: movl %eax, %r10d +; SSE41-NEXT: .LBB11_18: +; SSE41-NEXT: pextrb $6, %xmm1, %ecx +; SSE41-NEXT: pextrb $6, %xmm0, %r9d +; SSE41-NEXT: movl %r9d, %eax +; SSE41-NEXT: subb %cl, %al +; SSE41-NEXT: setns %al +; SSE41-NEXT: subb %cl, %r9b +; SSE41-NEXT: jno .LBB11_20 +; SSE41-NEXT: # %bb.19: +; SSE41-NEXT: addb $127, %al +; SSE41-NEXT: movl %eax, %r9d +; SSE41-NEXT: .LBB11_20: +; SSE41-NEXT: pextrb $5, %xmm1, %ecx +; SSE41-NEXT: pextrb $5, %xmm0, %ebp +; SSE41-NEXT: movl %ebp, %eax +; SSE41-NEXT: subb %cl, %al +; SSE41-NEXT: setns %al +; SSE41-NEXT: subb %cl, %bpl +; SSE41-NEXT: movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE41-NEXT: jno .LBB11_22 +; SSE41-NEXT: # %bb.21: +; SSE41-NEXT: addb $127, %al +; SSE41-NEXT: movl %eax, %ebp +; SSE41-NEXT: .LBB11_22: +; SSE41-NEXT: pextrb $4, %xmm1, %ecx +; SSE41-NEXT: pextrb $4, %xmm0, %edi +; SSE41-NEXT: movl %edi, %eax +; SSE41-NEXT: subb %cl, %al +; SSE41-NEXT: setns %al +; SSE41-NEXT: subb %cl, %dil +; SSE41-NEXT: movl %edx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE41-NEXT: jno .LBB11_24 +; SSE41-NEXT: # %bb.23: +; SSE41-NEXT: addb $127, %al +; SSE41-NEXT: movl %eax, %edi +; SSE41-NEXT: .LBB11_24: +; SSE41-NEXT: pextrb $3, %xmm1, %edx +; SSE41-NEXT: pextrb $3, %xmm0, %eax +; SSE41-NEXT: movl %eax, %ecx +; SSE41-NEXT: subb %dl, %cl +; SSE41-NEXT: setns %cl +; SSE41-NEXT: subb %dl, %al +; SSE41-NEXT: jno .LBB11_26 +; SSE41-NEXT: # %bb.25: +; SSE41-NEXT: addb $127, %cl +; SSE41-NEXT: movl %ecx, %eax +; SSE41-NEXT: .LBB11_26: +; SSE41-NEXT: pextrb $2, %xmm1, %ebx +; SSE41-NEXT: pextrb $2, %xmm0, %ecx +; SSE41-NEXT: movl %ecx, %edx +; SSE41-NEXT: subb %bl, %dl +; SSE41-NEXT: setns %dl +; SSE41-NEXT: subb %bl, %cl +; SSE41-NEXT: jno .LBB11_28 +; SSE41-NEXT: # %bb.27: +; SSE41-NEXT: addb $127, %dl +; SSE41-NEXT: movl %edx, %ecx +; SSE41-NEXT: .LBB11_28: +; SSE41-NEXT: pextrb $0, %xmm1, %esi +; SSE41-NEXT: pextrb $0, %xmm0, %edx +; SSE41-NEXT: movl %edx, %ebx +; SSE41-NEXT: subb %sil, %bl +; SSE41-NEXT: setns %bl +; SSE41-NEXT: subb %sil, %dl +; SSE41-NEXT: jno .LBB11_30 +; SSE41-NEXT: # %bb.29: +; SSE41-NEXT: addb $127, %bl +; SSE41-NEXT: movl %ebx, %edx +; SSE41-NEXT: .LBB11_30: +; SSE41-NEXT: pextrb $1, %xmm1, %esi +; SSE41-NEXT: pextrb $1, %xmm0, %r8d +; SSE41-NEXT: movl %r8d, %ebx +; SSE41-NEXT: subb %sil, %bl +; SSE41-NEXT: setns %bl +; SSE41-NEXT: subb %sil, %r8b +; 
SSE41-NEXT: jno .LBB11_32 +; SSE41-NEXT: # %bb.31: +; SSE41-NEXT: addb $127, %bl +; SSE41-NEXT: movl %ebx, %r8d +; SSE41-NEXT: .LBB11_32: +; SSE41-NEXT: movzbl %dl, %edx +; SSE41-NEXT: movd %edx, %xmm0 +; SSE41-NEXT: movzbl %r8b, %edx +; SSE41-NEXT: pinsrb $1, %edx, %xmm0 +; SSE41-NEXT: movzbl %cl, %ecx +; SSE41-NEXT: pinsrb $2, %ecx, %xmm0 +; SSE41-NEXT: movzbl %al, %eax +; SSE41-NEXT: pinsrb $3, %eax, %xmm0 +; SSE41-NEXT: movzbl %dil, %eax +; SSE41-NEXT: pinsrb $4, %eax, %xmm0 +; SSE41-NEXT: movzbl %bpl, %eax +; SSE41-NEXT: pinsrb $5, %eax, %xmm0 +; SSE41-NEXT: movzbl %r9b, %eax +; SSE41-NEXT: pinsrb $6, %eax, %xmm0 +; SSE41-NEXT: movzbl %r10b, %eax +; SSE41-NEXT: pinsrb $7, %eax, %xmm0 +; SSE41-NEXT: movzbl %r13b, %eax +; SSE41-NEXT: pinsrb $8, %eax, %xmm0 +; SSE41-NEXT: movzbl %r12b, %eax +; SSE41-NEXT: pinsrb $9, %eax, %xmm0 +; SSE41-NEXT: movzbl %r15b, %eax +; SSE41-NEXT: pinsrb $10, %eax, %xmm0 +; SSE41-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; SSE41-NEXT: pinsrb $11, %eax, %xmm0 +; SSE41-NEXT: movzbl %r14b, %eax +; SSE41-NEXT: pinsrb $12, %eax, %xmm0 +; SSE41-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; SSE41-NEXT: pinsrb $13, %eax, %xmm0 +; SSE41-NEXT: movzbl %r11b, %eax +; SSE41-NEXT: pinsrb $14, %eax, %xmm0 +; SSE41-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; SSE41-NEXT: pinsrb $15, %eax, %xmm0 +; SSE41-NEXT: popq %rbx +; SSE41-NEXT: popq %r12 +; SSE41-NEXT: popq %r13 +; SSE41-NEXT: popq %r14 +; SSE41-NEXT: popq %r15 +; SSE41-NEXT: popq %rbp +; SSE41-NEXT: retq +; +; AVX-LABEL: v12i8: +; AVX: # %bb.0: +; AVX-NEXT: vpextrb $15, %xmm1, %ecx +; AVX-NEXT: vpextrb $15, %xmm0, %edx +; AVX-NEXT: movl %edx, %eax +; AVX-NEXT: subb %cl, %al +; AVX-NEXT: setns %al +; AVX-NEXT: subb %cl, %dl +; AVX-NEXT: jno .LBB11_2 +; AVX-NEXT: # %bb.1: +; AVX-NEXT: addb $127, %al +; AVX-NEXT: movl %eax, %edx +; AVX-NEXT: .LBB11_2: +; AVX-NEXT: vpextrb $14, %xmm1, %ecx +; AVX-NEXT: vpextrb $14, %xmm0, %r11d +; AVX-NEXT: movl %r11d, %eax +; AVX-NEXT: subb %cl, %al +; AVX-NEXT: setns %al +; AVX-NEXT: subb %cl, %r11b +; AVX-NEXT: jno .LBB11_4 +; AVX-NEXT: # %bb.3: +; AVX-NEXT: addb $127, %al +; AVX-NEXT: movl %eax, %r11d +; AVX-NEXT: .LBB11_4: +; AVX-NEXT: vpextrb $13, %xmm1, %ecx +; AVX-NEXT: vpextrb $13, %xmm0, %edi +; AVX-NEXT: movl %edi, %eax +; AVX-NEXT: subb %cl, %al +; AVX-NEXT: setns %al +; AVX-NEXT: subb %cl, %dil +; AVX-NEXT: jno .LBB11_6 +; AVX-NEXT: # %bb.5: +; AVX-NEXT: addb $127, %al +; AVX-NEXT: movl %eax, %edi +; AVX-NEXT: .LBB11_6: +; AVX-NEXT: pushq %rbp +; AVX-NEXT: pushq %r15 +; AVX-NEXT: pushq %r14 +; AVX-NEXT: pushq %r13 +; AVX-NEXT: pushq %r12 +; AVX-NEXT: pushq %rbx +; AVX-NEXT: vpextrb $12, %xmm1, %ecx +; AVX-NEXT: vpextrb $12, %xmm0, %r14d +; AVX-NEXT: movl %r14d, %eax +; AVX-NEXT: subb %cl, %al +; AVX-NEXT: setns %al +; AVX-NEXT: subb %cl, %r14b +; AVX-NEXT: jno .LBB11_8 +; AVX-NEXT: # %bb.7: +; AVX-NEXT: addb $127, %al +; AVX-NEXT: movl %eax, %r14d +; AVX-NEXT: .LBB11_8: +; AVX-NEXT: vpextrb $11, %xmm1, %ecx +; AVX-NEXT: vpextrb $11, %xmm0, %ebp +; AVX-NEXT: movl %ebp, %eax +; AVX-NEXT: subb %cl, %al +; AVX-NEXT: setns %al +; AVX-NEXT: subb %cl, %bpl +; AVX-NEXT: jno .LBB11_10 +; AVX-NEXT: # %bb.9: +; AVX-NEXT: addb $127, %al +; AVX-NEXT: movl %eax, %ebp +; AVX-NEXT: .LBB11_10: +; AVX-NEXT: vpextrb $10, %xmm1, %ecx +; AVX-NEXT: vpextrb $10, %xmm0, %r15d +; AVX-NEXT: movl %r15d, %eax +; AVX-NEXT: subb %cl, %al +; AVX-NEXT: setns %al +; AVX-NEXT: subb %cl, %r15b +; AVX-NEXT: jno .LBB11_12 +; AVX-NEXT: # 
%bb.11: +; AVX-NEXT: addb $127, %al +; AVX-NEXT: movl %eax, %r15d +; AVX-NEXT: .LBB11_12: +; AVX-NEXT: vpextrb $9, %xmm1, %ecx +; AVX-NEXT: vpextrb $9, %xmm0, %r12d +; AVX-NEXT: movl %r12d, %eax +; AVX-NEXT: subb %cl, %al +; AVX-NEXT: setns %al +; AVX-NEXT: subb %cl, %r12b +; AVX-NEXT: jno .LBB11_14 +; AVX-NEXT: # %bb.13: +; AVX-NEXT: addb $127, %al +; AVX-NEXT: movl %eax, %r12d +; AVX-NEXT: .LBB11_14: +; AVX-NEXT: vpextrb $8, %xmm1, %ecx +; AVX-NEXT: vpextrb $8, %xmm0, %r13d +; AVX-NEXT: movl %r13d, %eax +; AVX-NEXT: subb %cl, %al +; AVX-NEXT: setns %al +; AVX-NEXT: subb %cl, %r13b +; AVX-NEXT: jno .LBB11_16 +; AVX-NEXT: # %bb.15: +; AVX-NEXT: addb $127, %al +; AVX-NEXT: movl %eax, %r13d +; AVX-NEXT: .LBB11_16: +; AVX-NEXT: vpextrb $7, %xmm1, %ecx +; AVX-NEXT: vpextrb $7, %xmm0, %r10d +; AVX-NEXT: movl %r10d, %eax +; AVX-NEXT: subb %cl, %al +; AVX-NEXT: setns %al +; AVX-NEXT: subb %cl, %r10b +; AVX-NEXT: movl %ebp, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX-NEXT: jno .LBB11_18 +; AVX-NEXT: # %bb.17: +; AVX-NEXT: addb $127, %al +; AVX-NEXT: movl %eax, %r10d +; AVX-NEXT: .LBB11_18: +; AVX-NEXT: vpextrb $6, %xmm1, %ecx +; AVX-NEXT: vpextrb $6, %xmm0, %r9d +; AVX-NEXT: movl %r9d, %eax +; AVX-NEXT: subb %cl, %al +; AVX-NEXT: setns %al +; AVX-NEXT: subb %cl, %r9b +; AVX-NEXT: jno .LBB11_20 +; AVX-NEXT: # %bb.19: +; AVX-NEXT: addb $127, %al +; AVX-NEXT: movl %eax, %r9d +; AVX-NEXT: .LBB11_20: +; AVX-NEXT: vpextrb $5, %xmm1, %ecx +; AVX-NEXT: vpextrb $5, %xmm0, %ebp +; AVX-NEXT: movl %ebp, %eax +; AVX-NEXT: subb %cl, %al +; AVX-NEXT: setns %al +; AVX-NEXT: subb %cl, %bpl +; AVX-NEXT: movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX-NEXT: jno .LBB11_22 +; AVX-NEXT: # %bb.21: +; AVX-NEXT: addb $127, %al +; AVX-NEXT: movl %eax, %ebp +; AVX-NEXT: .LBB11_22: +; AVX-NEXT: vpextrb $4, %xmm1, %ecx +; AVX-NEXT: vpextrb $4, %xmm0, %edi +; AVX-NEXT: movl %edi, %eax +; AVX-NEXT: subb %cl, %al +; AVX-NEXT: setns %al +; AVX-NEXT: subb %cl, %dil +; AVX-NEXT: movl %edx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX-NEXT: jno .LBB11_24 +; AVX-NEXT: # %bb.23: +; AVX-NEXT: addb $127, %al +; AVX-NEXT: movl %eax, %edi +; AVX-NEXT: .LBB11_24: +; AVX-NEXT: vpextrb $3, %xmm1, %edx +; AVX-NEXT: vpextrb $3, %xmm0, %eax +; AVX-NEXT: movl %eax, %ecx +; AVX-NEXT: subb %dl, %cl +; AVX-NEXT: setns %cl +; AVX-NEXT: subb %dl, %al +; AVX-NEXT: jno .LBB11_26 +; AVX-NEXT: # %bb.25: +; AVX-NEXT: addb $127, %cl +; AVX-NEXT: movl %ecx, %eax +; AVX-NEXT: .LBB11_26: +; AVX-NEXT: vpextrb $2, %xmm1, %ebx +; AVX-NEXT: vpextrb $2, %xmm0, %ecx +; AVX-NEXT: movl %ecx, %edx +; AVX-NEXT: subb %bl, %dl +; AVX-NEXT: setns %dl +; AVX-NEXT: subb %bl, %cl +; AVX-NEXT: jno .LBB11_28 +; AVX-NEXT: # %bb.27: +; AVX-NEXT: addb $127, %dl +; AVX-NEXT: movl %edx, %ecx +; AVX-NEXT: .LBB11_28: +; AVX-NEXT: vpextrb $0, %xmm1, %esi +; AVX-NEXT: vpextrb $0, %xmm0, %edx +; AVX-NEXT: movl %edx, %ebx +; AVX-NEXT: subb %sil, %bl +; AVX-NEXT: setns %bl +; AVX-NEXT: subb %sil, %dl +; AVX-NEXT: jno .LBB11_30 +; AVX-NEXT: # %bb.29: +; AVX-NEXT: addb $127, %bl +; AVX-NEXT: movl %ebx, %edx +; AVX-NEXT: .LBB11_30: +; AVX-NEXT: vpextrb $1, %xmm1, %esi +; AVX-NEXT: vpextrb $1, %xmm0, %r8d +; AVX-NEXT: movl %r8d, %ebx +; AVX-NEXT: subb %sil, %bl +; AVX-NEXT: setns %bl +; AVX-NEXT: subb %sil, %r8b +; AVX-NEXT: jno .LBB11_32 +; AVX-NEXT: # %bb.31: +; AVX-NEXT: addb $127, %bl +; AVX-NEXT: movl %ebx, %r8d +; AVX-NEXT: .LBB11_32: +; AVX-NEXT: movzbl %dl, %edx +; AVX-NEXT: vmovd %edx, %xmm0 +; AVX-NEXT: movzbl %r8b, %edx +; AVX-NEXT: vpinsrb $1, %edx, %xmm0, %xmm0 +; 
AVX-NEXT: movzbl %cl, %ecx +; AVX-NEXT: vpinsrb $2, %ecx, %xmm0, %xmm0 +; AVX-NEXT: movzbl %al, %eax +; AVX-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0 +; AVX-NEXT: movzbl %dil, %eax +; AVX-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 +; AVX-NEXT: movzbl %bpl, %eax +; AVX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; AVX-NEXT: movzbl %r9b, %eax +; AVX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 +; AVX-NEXT: movzbl %r10b, %eax +; AVX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; AVX-NEXT: movzbl %r13b, %eax +; AVX-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0 +; AVX-NEXT: movzbl %r12b, %eax +; AVX-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0 +; AVX-NEXT: movzbl %r15b, %eax +; AVX-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0 +; AVX-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; AVX-NEXT: movzbl %r14b, %eax +; AVX-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0 +; AVX-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0 +; AVX-NEXT: movzbl %r11b, %eax +; AVX-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0 +; AVX-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; AVX-NEXT: popq %rbx +; AVX-NEXT: popq %r12 +; AVX-NEXT: popq %r13 +; AVX-NEXT: popq %r14 +; AVX-NEXT: popq %r15 +; AVX-NEXT: popq %rbp +; AVX-NEXT: retq + %z = call <12 x i8> @llvm.ssub.sat.v12i8(<12 x i8> %x, <12 x i8> %y) + ret <12 x i8> %z +} + +define void @v12i16(<12 x i16>* %px, <12 x i16>* %py, <12 x i16>* %pz) nounwind { +; SSE2-LABEL: v12i16: +; SSE2: # %bb.0: +; SSE2-NEXT: pushq %rbp +; SSE2-NEXT: pushq %r15 +; SSE2-NEXT: pushq %r14 +; SSE2-NEXT: pushq %r13 +; SSE2-NEXT: pushq %r12 +; SSE2-NEXT: pushq %rbx +; SSE2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; SSE2-NEXT: movdqa (%rdi), %xmm0 +; SSE2-NEXT: movdqa 16(%rdi), %xmm2 +; SSE2-NEXT: movdqa (%rsi), %xmm1 +; SSE2-NEXT: movdqa 16(%rsi), %xmm3 +; SSE2-NEXT: pextrw $3, %xmm3, %eax +; SSE2-NEXT: pextrw $3, %xmm2, %edx +; SSE2-NEXT: xorl %ecx, %ecx +; SSE2-NEXT: movl %edx, %esi +; SSE2-NEXT: subw %ax, %si +; SSE2-NEXT: setns %cl +; SSE2-NEXT: addl $32767, %ecx # imm = 0x7FFF +; SSE2-NEXT: subw %ax, %dx +; SSE2-NEXT: cmovol %ecx, %edx +; SSE2-NEXT: pextrw $2, %xmm3, %eax +; SSE2-NEXT: pextrw $2, %xmm2, %r9d +; SSE2-NEXT: xorl %ecx, %ecx +; SSE2-NEXT: movl %r9d, %esi +; SSE2-NEXT: subw %ax, %si +; SSE2-NEXT: setns %cl +; SSE2-NEXT: addl $32767, %ecx # imm = 0x7FFF +; SSE2-NEXT: subw %ax, %r9w +; SSE2-NEXT: cmovol %ecx, %r9d +; SSE2-NEXT: movd %xmm3, %eax +; SSE2-NEXT: movd %xmm2, %r10d +; SSE2-NEXT: xorl %ecx, %ecx +; SSE2-NEXT: movl %r10d, %esi +; SSE2-NEXT: subw %ax, %si +; SSE2-NEXT: setns %cl +; SSE2-NEXT: addl $32767, %ecx # imm = 0x7FFF +; SSE2-NEXT: subw %ax, %r10w +; SSE2-NEXT: cmovol %ecx, %r10d +; SSE2-NEXT: pextrw $1, %xmm3, %eax +; SSE2-NEXT: pextrw $1, %xmm2, %r11d +; SSE2-NEXT: xorl %ecx, %ecx +; SSE2-NEXT: movl %r11d, %esi +; SSE2-NEXT: subw %ax, %si +; SSE2-NEXT: setns %cl +; SSE2-NEXT: addl $32767, %ecx # imm = 0x7FFF +; SSE2-NEXT: subw %ax, %r11w +; SSE2-NEXT: cmovol %ecx, %r11d +; SSE2-NEXT: movd %xmm1, %eax +; SSE2-NEXT: movd %xmm0, %r14d +; SSE2-NEXT: xorl %ecx, %ecx +; SSE2-NEXT: movl %r14d, %esi +; SSE2-NEXT: subw %ax, %si +; SSE2-NEXT: setns %cl +; SSE2-NEXT: addl $32767, %ecx # imm = 0x7FFF +; SSE2-NEXT: subw %ax, %r14w +; SSE2-NEXT: cmovol %ecx, %r14d +; SSE2-NEXT: pextrw $1, %xmm1, %eax +; SSE2-NEXT: pextrw $1, %xmm0, %r15d +; SSE2-NEXT: xorl %esi, %esi +; SSE2-NEXT: movl %r15d, %edi +; SSE2-NEXT: subw %ax, %di +; SSE2-NEXT: 
setns %sil +; SSE2-NEXT: addl $32767, %esi # imm = 0x7FFF +; SSE2-NEXT: subw %ax, %r15w +; SSE2-NEXT: cmovol %esi, %r15d +; SSE2-NEXT: pextrw $2, %xmm1, %eax +; SSE2-NEXT: pextrw $2, %xmm0, %r12d +; SSE2-NEXT: xorl %edi, %edi +; SSE2-NEXT: movl %r12d, %ebx +; SSE2-NEXT: subw %ax, %bx +; SSE2-NEXT: setns %dil +; SSE2-NEXT: addl $32767, %edi # imm = 0x7FFF +; SSE2-NEXT: subw %ax, %r12w +; SSE2-NEXT: cmovol %edi, %r12d +; SSE2-NEXT: pextrw $3, %xmm1, %eax +; SSE2-NEXT: pextrw $3, %xmm0, %r13d +; SSE2-NEXT: xorl %ebx, %ebx +; SSE2-NEXT: movl %r13d, %ebp +; SSE2-NEXT: subw %ax, %bp +; SSE2-NEXT: setns %bl +; SSE2-NEXT: addl $32767, %ebx # imm = 0x7FFF +; SSE2-NEXT: subw %ax, %r13w +; SSE2-NEXT: cmovol %ebx, %r13d +; SSE2-NEXT: pextrw $4, %xmm1, %eax +; SSE2-NEXT: pextrw $4, %xmm0, %ebx +; SSE2-NEXT: xorl %ebp, %ebp +; SSE2-NEXT: movl %ebx, %ecx +; SSE2-NEXT: subw %ax, %cx +; SSE2-NEXT: setns %bpl +; SSE2-NEXT: addl $32767, %ebp # imm = 0x7FFF +; SSE2-NEXT: subw %ax, %bx +; SSE2-NEXT: cmovol %ebp, %ebx +; SSE2-NEXT: pextrw $5, %xmm1, %eax +; SSE2-NEXT: pextrw $5, %xmm0, %ebp +; SSE2-NEXT: xorl %ecx, %ecx +; SSE2-NEXT: movl %ebp, %esi +; SSE2-NEXT: subw %ax, %si +; SSE2-NEXT: setns %cl +; SSE2-NEXT: addl $32767, %ecx # imm = 0x7FFF +; SSE2-NEXT: subw %ax, %bp +; SSE2-NEXT: cmovol %ecx, %ebp +; SSE2-NEXT: pextrw $6, %xmm1, %ecx +; SSE2-NEXT: pextrw $6, %xmm0, %eax +; SSE2-NEXT: xorl %esi, %esi +; SSE2-NEXT: movl %eax, %edi +; SSE2-NEXT: subw %cx, %di +; SSE2-NEXT: setns %sil +; SSE2-NEXT: addl $32767, %esi # imm = 0x7FFF +; SSE2-NEXT: subw %cx, %ax +; SSE2-NEXT: cmovol %esi, %eax +; SSE2-NEXT: pextrw $7, %xmm1, %ecx +; SSE2-NEXT: pextrw $7, %xmm0, %esi +; SSE2-NEXT: xorl %edi, %edi +; SSE2-NEXT: movl %esi, %r8d +; SSE2-NEXT: subw %cx, %r8w +; SSE2-NEXT: setns %dil +; SSE2-NEXT: addl $32767, %edi # imm = 0x7FFF +; SSE2-NEXT: subw %cx, %si +; SSE2-NEXT: cmovol %edi, %esi +; SSE2-NEXT: movd %esi, %xmm8 +; SSE2-NEXT: movd %eax, %xmm1 +; SSE2-NEXT: movd %ebp, %xmm2 +; SSE2-NEXT: movd %ebx, %xmm3 +; SSE2-NEXT: movd %r13d, %xmm4 +; SSE2-NEXT: movd %r12d, %xmm5 +; SSE2-NEXT: movd %r15d, %xmm6 +; SSE2-NEXT: movd %r14d, %xmm7 +; SSE2-NEXT: movd %r10d, %xmm0 +; SSE2-NEXT: pinsrw $1, %r11d, %xmm0 +; SSE2-NEXT: pinsrw $2, %r9d, %xmm0 +; SSE2-NEXT: pinsrw $3, %edx, %xmm0 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm8[0],xmm1[1],xmm8[1],xmm1[2],xmm8[2],xmm1[3],xmm8[3] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm5[0],xmm7[1],xmm5[1] +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm7 = xmm7[0],xmm3[0] +; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload +; SSE2-NEXT: movq %xmm0, 16(%rax) +; SSE2-NEXT: movdqa %xmm7, (%rax) +; SSE2-NEXT: popq %rbx +; SSE2-NEXT: popq %r12 +; SSE2-NEXT: popq %r13 +; SSE2-NEXT: popq %r14 +; SSE2-NEXT: popq %r15 +; SSE2-NEXT: popq %rbp +; SSE2-NEXT: retq +; +; SSSE3-LABEL: v12i16: +; SSSE3: # %bb.0: +; SSSE3-NEXT: pushq %rbp +; SSSE3-NEXT: pushq %r15 +; SSSE3-NEXT: pushq %r14 +; SSSE3-NEXT: pushq %r13 +; SSSE3-NEXT: pushq %r12 +; SSSE3-NEXT: pushq %rbx +; SSSE3-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; SSSE3-NEXT: movdqa (%rdi), %xmm0 +; SSSE3-NEXT: movdqa 16(%rdi), %xmm2 +; 
SSSE3-NEXT: movdqa (%rsi), %xmm1 +; SSSE3-NEXT: movdqa 16(%rsi), %xmm3 +; SSSE3-NEXT: pextrw $3, %xmm3, %eax +; SSSE3-NEXT: pextrw $3, %xmm2, %edx +; SSSE3-NEXT: xorl %ecx, %ecx +; SSSE3-NEXT: movl %edx, %esi +; SSSE3-NEXT: subw %ax, %si +; SSSE3-NEXT: setns %cl +; SSSE3-NEXT: addl $32767, %ecx # imm = 0x7FFF +; SSSE3-NEXT: subw %ax, %dx +; SSSE3-NEXT: cmovol %ecx, %edx +; SSSE3-NEXT: pextrw $2, %xmm3, %eax +; SSSE3-NEXT: pextrw $2, %xmm2, %r9d +; SSSE3-NEXT: xorl %ecx, %ecx +; SSSE3-NEXT: movl %r9d, %esi +; SSSE3-NEXT: subw %ax, %si +; SSSE3-NEXT: setns %cl +; SSSE3-NEXT: addl $32767, %ecx # imm = 0x7FFF +; SSSE3-NEXT: subw %ax, %r9w +; SSSE3-NEXT: cmovol %ecx, %r9d +; SSSE3-NEXT: movd %xmm3, %eax +; SSSE3-NEXT: movd %xmm2, %r10d +; SSSE3-NEXT: xorl %ecx, %ecx +; SSSE3-NEXT: movl %r10d, %esi +; SSSE3-NEXT: subw %ax, %si +; SSSE3-NEXT: setns %cl +; SSSE3-NEXT: addl $32767, %ecx # imm = 0x7FFF +; SSSE3-NEXT: subw %ax, %r10w +; SSSE3-NEXT: cmovol %ecx, %r10d +; SSSE3-NEXT: pextrw $1, %xmm3, %eax +; SSSE3-NEXT: pextrw $1, %xmm2, %r11d +; SSSE3-NEXT: xorl %ecx, %ecx +; SSSE3-NEXT: movl %r11d, %esi +; SSSE3-NEXT: subw %ax, %si +; SSSE3-NEXT: setns %cl +; SSSE3-NEXT: addl $32767, %ecx # imm = 0x7FFF +; SSSE3-NEXT: subw %ax, %r11w +; SSSE3-NEXT: cmovol %ecx, %r11d +; SSSE3-NEXT: movd %xmm1, %eax +; SSSE3-NEXT: movd %xmm0, %r14d +; SSSE3-NEXT: xorl %ecx, %ecx +; SSSE3-NEXT: movl %r14d, %esi +; SSSE3-NEXT: subw %ax, %si +; SSSE3-NEXT: setns %cl +; SSSE3-NEXT: addl $32767, %ecx # imm = 0x7FFF +; SSSE3-NEXT: subw %ax, %r14w +; SSSE3-NEXT: cmovol %ecx, %r14d +; SSSE3-NEXT: pextrw $1, %xmm1, %eax +; SSSE3-NEXT: pextrw $1, %xmm0, %r15d +; SSSE3-NEXT: xorl %esi, %esi +; SSSE3-NEXT: movl %r15d, %edi +; SSSE3-NEXT: subw %ax, %di +; SSSE3-NEXT: setns %sil +; SSSE3-NEXT: addl $32767, %esi # imm = 0x7FFF +; SSSE3-NEXT: subw %ax, %r15w +; SSSE3-NEXT: cmovol %esi, %r15d +; SSSE3-NEXT: pextrw $2, %xmm1, %eax +; SSSE3-NEXT: pextrw $2, %xmm0, %r12d +; SSSE3-NEXT: xorl %edi, %edi +; SSSE3-NEXT: movl %r12d, %ebx +; SSSE3-NEXT: subw %ax, %bx +; SSSE3-NEXT: setns %dil +; SSSE3-NEXT: addl $32767, %edi # imm = 0x7FFF +; SSSE3-NEXT: subw %ax, %r12w +; SSSE3-NEXT: cmovol %edi, %r12d +; SSSE3-NEXT: pextrw $3, %xmm1, %eax +; SSSE3-NEXT: pextrw $3, %xmm0, %r13d +; SSSE3-NEXT: xorl %ebx, %ebx +; SSSE3-NEXT: movl %r13d, %ebp +; SSSE3-NEXT: subw %ax, %bp +; SSSE3-NEXT: setns %bl +; SSSE3-NEXT: addl $32767, %ebx # imm = 0x7FFF +; SSSE3-NEXT: subw %ax, %r13w +; SSSE3-NEXT: cmovol %ebx, %r13d +; SSSE3-NEXT: pextrw $4, %xmm1, %eax +; SSSE3-NEXT: pextrw $4, %xmm0, %ebx +; SSSE3-NEXT: xorl %ebp, %ebp +; SSSE3-NEXT: movl %ebx, %ecx +; SSSE3-NEXT: subw %ax, %cx +; SSSE3-NEXT: setns %bpl +; SSSE3-NEXT: addl $32767, %ebp # imm = 0x7FFF +; SSSE3-NEXT: subw %ax, %bx +; SSSE3-NEXT: cmovol %ebp, %ebx +; SSSE3-NEXT: pextrw $5, %xmm1, %eax +; SSSE3-NEXT: pextrw $5, %xmm0, %ebp +; SSSE3-NEXT: xorl %ecx, %ecx +; SSSE3-NEXT: movl %ebp, %esi +; SSSE3-NEXT: subw %ax, %si +; SSSE3-NEXT: setns %cl +; SSSE3-NEXT: addl $32767, %ecx # imm = 0x7FFF +; SSSE3-NEXT: subw %ax, %bp +; SSSE3-NEXT: cmovol %ecx, %ebp +; SSSE3-NEXT: pextrw $6, %xmm1, %ecx +; SSSE3-NEXT: pextrw $6, %xmm0, %eax +; SSSE3-NEXT: xorl %esi, %esi +; SSSE3-NEXT: movl %eax, %edi +; SSSE3-NEXT: subw %cx, %di +; SSSE3-NEXT: setns %sil +; SSSE3-NEXT: addl $32767, %esi # imm = 0x7FFF +; SSSE3-NEXT: subw %cx, %ax +; SSSE3-NEXT: cmovol %esi, %eax +; SSSE3-NEXT: pextrw $7, %xmm1, %ecx +; SSSE3-NEXT: pextrw $7, %xmm0, %esi +; SSSE3-NEXT: xorl %edi, %edi +; SSSE3-NEXT: movl %esi, %r8d +; 
SSSE3-NEXT: subw %cx, %r8w +; SSSE3-NEXT: setns %dil +; SSSE3-NEXT: addl $32767, %edi # imm = 0x7FFF +; SSSE3-NEXT: subw %cx, %si +; SSSE3-NEXT: cmovol %edi, %esi +; SSSE3-NEXT: movd %esi, %xmm8 +; SSSE3-NEXT: movd %eax, %xmm1 +; SSSE3-NEXT: movd %ebp, %xmm2 +; SSSE3-NEXT: movd %ebx, %xmm3 +; SSSE3-NEXT: movd %r13d, %xmm4 +; SSSE3-NEXT: movd %r12d, %xmm5 +; SSSE3-NEXT: movd %r15d, %xmm6 +; SSSE3-NEXT: movd %r14d, %xmm7 +; SSSE3-NEXT: movd %r10d, %xmm0 +; SSSE3-NEXT: pinsrw $1, %r11d, %xmm0 +; SSSE3-NEXT: pinsrw $2, %r9d, %xmm0 +; SSSE3-NEXT: pinsrw $3, %edx, %xmm0 +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm8[0],xmm1[1],xmm8[1],xmm1[2],xmm8[2],xmm1[3],xmm8[3] +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; SSSE3-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3] +; SSSE3-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm5[0],xmm7[1],xmm5[1] +; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm7 = xmm7[0],xmm3[0] +; SSSE3-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload +; SSSE3-NEXT: movq %xmm0, 16(%rax) +; SSSE3-NEXT: movdqa %xmm7, (%rax) +; SSSE3-NEXT: popq %rbx +; SSSE3-NEXT: popq %r12 +; SSSE3-NEXT: popq %r13 +; SSSE3-NEXT: popq %r14 +; SSSE3-NEXT: popq %r15 +; SSSE3-NEXT: popq %rbp +; SSSE3-NEXT: retq +; +; SSE41-LABEL: v12i16: +; SSE41: # %bb.0: +; SSE41-NEXT: pushq %rbp +; SSE41-NEXT: pushq %r15 +; SSE41-NEXT: pushq %r14 +; SSE41-NEXT: pushq %r13 +; SSE41-NEXT: pushq %r12 +; SSE41-NEXT: pushq %rbx +; SSE41-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; SSE41-NEXT: movdqa (%rdi), %xmm0 +; SSE41-NEXT: movdqa 16(%rdi), %xmm2 +; SSE41-NEXT: movdqa (%rsi), %xmm1 +; SSE41-NEXT: movdqa 16(%rsi), %xmm3 +; SSE41-NEXT: pextrw $3, %xmm3, %eax +; SSE41-NEXT: pextrw $3, %xmm2, %edx +; SSE41-NEXT: xorl %ecx, %ecx +; SSE41-NEXT: movl %edx, %esi +; SSE41-NEXT: subw %ax, %si +; SSE41-NEXT: setns %cl +; SSE41-NEXT: addl $32767, %ecx # imm = 0x7FFF +; SSE41-NEXT: subw %ax, %dx +; SSE41-NEXT: cmovol %ecx, %edx +; SSE41-NEXT: pextrw $2, %xmm3, %eax +; SSE41-NEXT: pextrw $2, %xmm2, %r9d +; SSE41-NEXT: xorl %ecx, %ecx +; SSE41-NEXT: movl %r9d, %esi +; SSE41-NEXT: subw %ax, %si +; SSE41-NEXT: setns %cl +; SSE41-NEXT: addl $32767, %ecx # imm = 0x7FFF +; SSE41-NEXT: subw %ax, %r9w +; SSE41-NEXT: cmovol %ecx, %r9d +; SSE41-NEXT: movd %xmm3, %eax +; SSE41-NEXT: movd %xmm2, %r10d +; SSE41-NEXT: xorl %ecx, %ecx +; SSE41-NEXT: movl %r10d, %esi +; SSE41-NEXT: subw %ax, %si +; SSE41-NEXT: setns %cl +; SSE41-NEXT: addl $32767, %ecx # imm = 0x7FFF +; SSE41-NEXT: subw %ax, %r10w +; SSE41-NEXT: cmovol %ecx, %r10d +; SSE41-NEXT: pextrw $1, %xmm3, %eax +; SSE41-NEXT: pextrw $1, %xmm2, %r11d +; SSE41-NEXT: xorl %ecx, %ecx +; SSE41-NEXT: movl %r11d, %esi +; SSE41-NEXT: subw %ax, %si +; SSE41-NEXT: setns %cl +; SSE41-NEXT: addl $32767, %ecx # imm = 0x7FFF +; SSE41-NEXT: subw %ax, %r11w +; SSE41-NEXT: cmovol %ecx, %r11d +; SSE41-NEXT: pextrw $7, %xmm1, %ecx +; SSE41-NEXT: pextrw $7, %xmm0, %r14d +; SSE41-NEXT: xorl %esi, %esi +; SSE41-NEXT: movl %r14d, %edi +; SSE41-NEXT: subw %cx, %di +; SSE41-NEXT: setns %sil +; SSE41-NEXT: addl $32767, %esi # imm = 0x7FFF +; SSE41-NEXT: subw %cx, %r14w +; SSE41-NEXT: cmovol %esi, %r14d +; SSE41-NEXT: pextrw $6, %xmm1, %esi +; SSE41-NEXT: pextrw $6, %xmm0, %r15d +; SSE41-NEXT: xorl %edi, %edi +; 
SSE41-NEXT: movl %r15d, %ebx +; SSE41-NEXT: subw %si, %bx +; SSE41-NEXT: setns %dil +; SSE41-NEXT: addl $32767, %edi # imm = 0x7FFF +; SSE41-NEXT: subw %si, %r15w +; SSE41-NEXT: cmovol %edi, %r15d +; SSE41-NEXT: pextrw $5, %xmm1, %edi +; SSE41-NEXT: pextrw $5, %xmm0, %r12d +; SSE41-NEXT: xorl %ebx, %ebx +; SSE41-NEXT: movl %r12d, %ebp +; SSE41-NEXT: subw %di, %bp +; SSE41-NEXT: setns %bl +; SSE41-NEXT: addl $32767, %ebx # imm = 0x7FFF +; SSE41-NEXT: subw %di, %r12w +; SSE41-NEXT: cmovol %ebx, %r12d +; SSE41-NEXT: pextrw $4, %xmm1, %ebx +; SSE41-NEXT: pextrw $4, %xmm0, %r13d +; SSE41-NEXT: xorl %ebp, %ebp +; SSE41-NEXT: movl %r13d, %eax +; SSE41-NEXT: subw %bx, %ax +; SSE41-NEXT: setns %bpl +; SSE41-NEXT: addl $32767, %ebp # imm = 0x7FFF +; SSE41-NEXT: subw %bx, %r13w +; SSE41-NEXT: cmovol %ebp, %r13d +; SSE41-NEXT: pextrw $3, %xmm1, %eax +; SSE41-NEXT: pextrw $3, %xmm0, %ebx +; SSE41-NEXT: xorl %ebp, %ebp +; SSE41-NEXT: movl %ebx, %ecx +; SSE41-NEXT: subw %ax, %cx +; SSE41-NEXT: setns %bpl +; SSE41-NEXT: addl $32767, %ebp # imm = 0x7FFF +; SSE41-NEXT: subw %ax, %bx +; SSE41-NEXT: cmovol %ebp, %ebx +; SSE41-NEXT: pextrw $2, %xmm1, %eax +; SSE41-NEXT: pextrw $2, %xmm0, %ebp +; SSE41-NEXT: xorl %ecx, %ecx +; SSE41-NEXT: movl %ebp, %esi +; SSE41-NEXT: subw %ax, %si +; SSE41-NEXT: setns %cl +; SSE41-NEXT: addl $32767, %ecx # imm = 0x7FFF +; SSE41-NEXT: subw %ax, %bp +; SSE41-NEXT: cmovol %ecx, %ebp +; SSE41-NEXT: movd %xmm1, %eax +; SSE41-NEXT: movd %xmm0, %ecx +; SSE41-NEXT: xorl %esi, %esi +; SSE41-NEXT: movl %ecx, %edi +; SSE41-NEXT: subw %ax, %di +; SSE41-NEXT: setns %sil +; SSE41-NEXT: addl $32767, %esi # imm = 0x7FFF +; SSE41-NEXT: subw %ax, %cx +; SSE41-NEXT: cmovol %esi, %ecx +; SSE41-NEXT: pextrw $1, %xmm1, %eax +; SSE41-NEXT: pextrw $1, %xmm0, %esi +; SSE41-NEXT: xorl %edi, %edi +; SSE41-NEXT: movl %esi, %r8d +; SSE41-NEXT: subw %ax, %r8w +; SSE41-NEXT: setns %dil +; SSE41-NEXT: addl $32767, %edi # imm = 0x7FFF +; SSE41-NEXT: subw %ax, %si +; SSE41-NEXT: cmovol %edi, %esi +; SSE41-NEXT: movd %ecx, %xmm0 +; SSE41-NEXT: pinsrw $1, %esi, %xmm0 +; SSE41-NEXT: pinsrw $2, %ebp, %xmm0 +; SSE41-NEXT: pinsrw $3, %ebx, %xmm0 +; SSE41-NEXT: pinsrw $4, %r13d, %xmm0 +; SSE41-NEXT: pinsrw $5, %r12d, %xmm0 +; SSE41-NEXT: pinsrw $6, %r15d, %xmm0 +; SSE41-NEXT: pinsrw $7, %r14d, %xmm0 +; SSE41-NEXT: movd %r10d, %xmm1 +; SSE41-NEXT: pinsrw $1, %r11d, %xmm1 +; SSE41-NEXT: pinsrw $2, %r9d, %xmm1 +; SSE41-NEXT: pinsrw $3, %edx, %xmm1 +; SSE41-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload +; SSE41-NEXT: movq %xmm1, 16(%rax) +; SSE41-NEXT: movdqa %xmm0, (%rax) +; SSE41-NEXT: popq %rbx +; SSE41-NEXT: popq %r12 +; SSE41-NEXT: popq %r13 +; SSE41-NEXT: popq %r14 +; SSE41-NEXT: popq %r15 +; SSE41-NEXT: popq %rbp +; SSE41-NEXT: retq +; +; AVX-LABEL: v12i16: +; AVX: # %bb.0: +; AVX-NEXT: pushq %rbp +; AVX-NEXT: pushq %r15 +; AVX-NEXT: pushq %r14 +; AVX-NEXT: pushq %r13 +; AVX-NEXT: pushq %r12 +; AVX-NEXT: pushq %rbx +; AVX-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX-NEXT: vmovdqa (%rsi), %xmm2 +; AVX-NEXT: vmovdqa 16(%rsi), %xmm0 +; AVX-NEXT: vmovd %xmm2, %eax +; AVX-NEXT: vmovdqa (%rdi), %xmm3 +; AVX-NEXT: vmovdqa 16(%rdi), %xmm1 +; AVX-NEXT: vmovd %xmm3, %edx +; AVX-NEXT: xorl %ecx, %ecx +; AVX-NEXT: movl %edx, %esi +; AVX-NEXT: subw %ax, %si +; AVX-NEXT: setns %cl +; AVX-NEXT: addl $32767, %ecx # imm = 0x7FFF +; AVX-NEXT: subw %ax, %dx +; AVX-NEXT: cmovol %ecx, %edx +; AVX-NEXT: movl %edx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX-NEXT: vpextrw $1, %xmm2, %eax +; AVX-NEXT: 
vpextrw $1, %xmm3, %edx +; AVX-NEXT: xorl %ecx, %ecx +; AVX-NEXT: movl %edx, %esi +; AVX-NEXT: subw %ax, %si +; AVX-NEXT: setns %cl +; AVX-NEXT: addl $32767, %ecx # imm = 0x7FFF +; AVX-NEXT: subw %ax, %dx +; AVX-NEXT: cmovol %ecx, %edx +; AVX-NEXT: movl %edx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX-NEXT: vpextrw $2, %xmm2, %eax +; AVX-NEXT: vpextrw $2, %xmm3, %edx +; AVX-NEXT: xorl %ecx, %ecx +; AVX-NEXT: movl %edx, %esi +; AVX-NEXT: subw %ax, %si +; AVX-NEXT: setns %cl +; AVX-NEXT: addl $32767, %ecx # imm = 0x7FFF +; AVX-NEXT: subw %ax, %dx +; AVX-NEXT: cmovol %ecx, %edx +; AVX-NEXT: movl %edx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX-NEXT: vpextrw $3, %xmm2, %eax +; AVX-NEXT: vpextrw $3, %xmm3, %edx +; AVX-NEXT: xorl %ecx, %ecx +; AVX-NEXT: movl %edx, %esi +; AVX-NEXT: subw %ax, %si +; AVX-NEXT: setns %cl +; AVX-NEXT: addl $32767, %ecx # imm = 0x7FFF +; AVX-NEXT: subw %ax, %dx +; AVX-NEXT: cmovol %ecx, %edx +; AVX-NEXT: movl %edx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX-NEXT: vpextrw $4, %xmm2, %eax +; AVX-NEXT: vpextrw $4, %xmm3, %r14d +; AVX-NEXT: xorl %ecx, %ecx +; AVX-NEXT: movl %r14d, %esi +; AVX-NEXT: subw %ax, %si +; AVX-NEXT: setns %cl +; AVX-NEXT: addl $32767, %ecx # imm = 0x7FFF +; AVX-NEXT: subw %ax, %r14w +; AVX-NEXT: cmovol %ecx, %r14d +; AVX-NEXT: vpextrw $5, %xmm2, %eax +; AVX-NEXT: vpextrw $5, %xmm3, %r15d +; AVX-NEXT: xorl %ecx, %ecx +; AVX-NEXT: movl %r15d, %esi +; AVX-NEXT: subw %ax, %si +; AVX-NEXT: setns %cl +; AVX-NEXT: addl $32767, %ecx # imm = 0x7FFF +; AVX-NEXT: subw %ax, %r15w +; AVX-NEXT: cmovol %ecx, %r15d +; AVX-NEXT: vpextrw $6, %xmm2, %eax +; AVX-NEXT: vpextrw $6, %xmm3, %r12d +; AVX-NEXT: xorl %ecx, %ecx +; AVX-NEXT: movl %r12d, %esi +; AVX-NEXT: subw %ax, %si +; AVX-NEXT: setns %cl +; AVX-NEXT: addl $32767, %ecx # imm = 0x7FFF +; AVX-NEXT: subw %ax, %r12w +; AVX-NEXT: cmovol %ecx, %r12d +; AVX-NEXT: vpextrw $7, %xmm2, %eax +; AVX-NEXT: vpextrw $7, %xmm3, %r13d +; AVX-NEXT: xorl %ecx, %ecx +; AVX-NEXT: movl %r13d, %esi +; AVX-NEXT: subw %ax, %si +; AVX-NEXT: setns %cl +; AVX-NEXT: addl $32767, %ecx # imm = 0x7FFF +; AVX-NEXT: subw %ax, %r13w +; AVX-NEXT: cmovol %ecx, %r13d +; AVX-NEXT: vpextrw $7, %xmm0, %eax +; AVX-NEXT: vpextrw $7, %xmm1, %ebx +; AVX-NEXT: xorl %ecx, %ecx +; AVX-NEXT: movl %ebx, %esi +; AVX-NEXT: subw %ax, %si +; AVX-NEXT: setns %cl +; AVX-NEXT: addl $32767, %ecx # imm = 0x7FFF +; AVX-NEXT: subw %ax, %bx +; AVX-NEXT: cmovol %ecx, %ebx +; AVX-NEXT: vpextrw $6, %xmm0, %eax +; AVX-NEXT: vpextrw $6, %xmm1, %ebp +; AVX-NEXT: xorl %ecx, %ecx +; AVX-NEXT: movl %ebp, %esi +; AVX-NEXT: subw %ax, %si +; AVX-NEXT: setns %cl +; AVX-NEXT: addl $32767, %ecx # imm = 0x7FFF +; AVX-NEXT: subw %ax, %bp +; AVX-NEXT: cmovol %ecx, %ebp +; AVX-NEXT: vpextrw $5, %xmm0, %ecx +; AVX-NEXT: vpextrw $5, %xmm1, %eax +; AVX-NEXT: xorl %esi, %esi +; AVX-NEXT: movl %eax, %edi +; AVX-NEXT: subw %cx, %di +; AVX-NEXT: setns %sil +; AVX-NEXT: addl $32767, %esi # imm = 0x7FFF +; AVX-NEXT: subw %cx, %ax +; AVX-NEXT: cmovol %esi, %eax +; AVX-NEXT: vpextrw $4, %xmm0, %esi +; AVX-NEXT: vpextrw $4, %xmm1, %ecx +; AVX-NEXT: xorl %edi, %edi +; AVX-NEXT: movl %ecx, %r8d +; AVX-NEXT: subw %si, %r8w +; AVX-NEXT: setns %dil +; AVX-NEXT: addl $32767, %edi # imm = 0x7FFF +; AVX-NEXT: subw %si, %cx +; AVX-NEXT: cmovol %edi, %ecx +; AVX-NEXT: vpextrw $3, %xmm0, %edi +; AVX-NEXT: vpextrw $3, %xmm1, %r8d +; AVX-NEXT: xorl %esi, %esi +; AVX-NEXT: movl %r8d, %edx +; AVX-NEXT: subw %di, %dx +; AVX-NEXT: setns %sil +; AVX-NEXT: addl $32767, %esi # imm = 0x7FFF +; AVX-NEXT: 
subw %di, %r8w +; AVX-NEXT: cmovol %esi, %r8d +; AVX-NEXT: vpextrw $2, %xmm0, %edx +; AVX-NEXT: vpextrw $2, %xmm1, %edi +; AVX-NEXT: xorl %esi, %esi +; AVX-NEXT: movl %edi, %r9d +; AVX-NEXT: subw %dx, %r9w +; AVX-NEXT: setns %sil +; AVX-NEXT: addl $32767, %esi # imm = 0x7FFF +; AVX-NEXT: subw %dx, %di +; AVX-NEXT: cmovol %esi, %edi +; AVX-NEXT: vmovd %xmm0, %r9d +; AVX-NEXT: vmovd %xmm1, %esi +; AVX-NEXT: xorl %edx, %edx +; AVX-NEXT: movl %esi, %r10d +; AVX-NEXT: subw %r9w, %r10w +; AVX-NEXT: setns %dl +; AVX-NEXT: addl $32767, %edx # imm = 0x7FFF +; AVX-NEXT: subw %r9w, %si +; AVX-NEXT: cmovol %edx, %esi +; AVX-NEXT: vpextrw $1, %xmm0, %r9d +; AVX-NEXT: vpextrw $1, %xmm1, %edx +; AVX-NEXT: xorl %r10d, %r10d +; AVX-NEXT: movl %edx, %r11d +; AVX-NEXT: subw %r9w, %r11w +; AVX-NEXT: setns %r10b +; AVX-NEXT: addl $32767, %r10d # imm = 0x7FFF +; AVX-NEXT: subw %r9w, %dx +; AVX-NEXT: cmovol %r10d, %edx +; AVX-NEXT: vmovd %esi, %xmm0 +; AVX-NEXT: vpinsrw $1, %edx, %xmm0, %xmm0 +; AVX-NEXT: vpinsrw $2, %edi, %xmm0, %xmm0 +; AVX-NEXT: vpinsrw $3, %r8d, %xmm0, %xmm0 +; AVX-NEXT: vpinsrw $4, %ecx, %xmm0, %xmm0 +; AVX-NEXT: vpinsrw $5, %eax, %xmm0, %xmm0 +; AVX-NEXT: vpinsrw $6, %ebp, %xmm0, %xmm0 +; AVX-NEXT: vpinsrw $7, %ebx, %xmm0, %xmm0 +; AVX-NEXT: vmovd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Folded Reload +; AVX-NEXT: # xmm1 = mem[0],zero,zero,zero +; AVX-NEXT: vpinsrw $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 4-byte Folded Reload +; AVX-NEXT: vpinsrw $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 4-byte Folded Reload +; AVX-NEXT: vpinsrw $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 4-byte Folded Reload +; AVX-NEXT: vpinsrw $4, %r14d, %xmm1, %xmm1 +; AVX-NEXT: vpinsrw $5, %r15d, %xmm1, %xmm1 +; AVX-NEXT: vpinsrw $6, %r12d, %xmm1, %xmm1 +; AVX-NEXT: vpinsrw $7, %r13d, %xmm1, %xmm1 +; AVX-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload +; AVX-NEXT: vmovq %xmm0, 16(%rax) +; AVX-NEXT: vmovdqa %xmm1, (%rax) +; AVX-NEXT: popq %rbx +; AVX-NEXT: popq %r12 +; AVX-NEXT: popq %r13 +; AVX-NEXT: popq %r14 +; AVX-NEXT: popq %r15 +; AVX-NEXT: popq %rbp +; AVX-NEXT: retq + %x = load <12 x i16>, <12 x i16>* %px + %y = load <12 x i16>, <12 x i16>* %py + %z = call <12 x i16> @llvm.ssub.sat.v12i16(<12 x i16> %x, <12 x i16> %y) + store <12 x i16> %z, <12 x i16>* %pz + ret void +} + +; Scalarization + +define void @v1i8(<1 x i8>* %px, <1 x i8>* %py, <1 x i8>* %pz) nounwind { +; SSE-LABEL: v1i8: +; SSE: # %bb.0: +; SSE-NEXT: movb (%rdi), %cl +; SSE-NEXT: movb (%rsi), %dil +; SSE-NEXT: movl %ecx, %eax +; SSE-NEXT: subb %dil, %al +; SSE-NEXT: setns %sil +; SSE-NEXT: subb %dil, %cl +; SSE-NEXT: jno .LBB13_2 +; SSE-NEXT: # %bb.1: +; SSE-NEXT: addb $127, %sil +; SSE-NEXT: movl %esi, %ecx +; SSE-NEXT: .LBB13_2: +; SSE-NEXT: movb %cl, (%rdx) +; SSE-NEXT: retq +; +; AVX-LABEL: v1i8: +; AVX: # %bb.0: +; AVX-NEXT: movb (%rdi), %cl +; AVX-NEXT: movb (%rsi), %dil +; AVX-NEXT: movl %ecx, %eax +; AVX-NEXT: subb %dil, %al +; AVX-NEXT: setns %sil +; AVX-NEXT: subb %dil, %cl +; AVX-NEXT: jno .LBB13_2 +; AVX-NEXT: # %bb.1: +; AVX-NEXT: addb $127, %sil +; AVX-NEXT: movl %esi, %ecx +; AVX-NEXT: .LBB13_2: +; AVX-NEXT: movb %cl, (%rdx) +; AVX-NEXT: retq + %x = load <1 x i8>, <1 x i8>* %px + %y = load <1 x i8>, <1 x i8>* %py + %z = call <1 x i8> @llvm.ssub.sat.v1i8(<1 x i8> %x, <1 x i8> %y) + store <1 x i8> %z, <1 x i8>* %pz + ret void +} + +define void @v1i16(<1 x i16>* %px, <1 x i16>* %py, <1 x i16>* %pz) nounwind { +; SSE-LABEL: v1i16: +; SSE: # %bb.0: +; SSE-NEXT: movzwl (%rdi), %eax +; SSE-NEXT: movzwl (%rsi), %ecx +; 
SSE-NEXT: xorl %esi, %esi +; SSE-NEXT: movl %eax, %edi +; SSE-NEXT: subw %cx, %di +; SSE-NEXT: setns %sil +; SSE-NEXT: addl $32767, %esi # imm = 0x7FFF +; SSE-NEXT: subw %cx, %ax +; SSE-NEXT: cmovol %esi, %eax +; SSE-NEXT: movw %ax, (%rdx) +; SSE-NEXT: retq +; +; AVX-LABEL: v1i16: +; AVX: # %bb.0: +; AVX-NEXT: movzwl (%rdi), %eax +; AVX-NEXT: movzwl (%rsi), %ecx +; AVX-NEXT: xorl %esi, %esi +; AVX-NEXT: movl %eax, %edi +; AVX-NEXT: subw %cx, %di +; AVX-NEXT: setns %sil +; AVX-NEXT: addl $32767, %esi # imm = 0x7FFF +; AVX-NEXT: subw %cx, %ax +; AVX-NEXT: cmovol %esi, %eax +; AVX-NEXT: movw %ax, (%rdx) +; AVX-NEXT: retq + %x = load <1 x i16>, <1 x i16>* %px + %y = load <1 x i16>, <1 x i16>* %py + %z = call <1 x i16> @llvm.ssub.sat.v1i16(<1 x i16> %x, <1 x i16> %y) + store <1 x i16> %z, <1 x i16>* %pz + ret void +} + +; Promotion + +define <16 x i4> @v16i4(<16 x i4> %x, <16 x i4> %y) nounwind { +; SSE2-LABEL: v16i4: +; SSE2: # %bb.0: +; SSE2-NEXT: pushq %rbp +; SSE2-NEXT: pushq %r15 +; SSE2-NEXT: pushq %r14 +; SSE2-NEXT: pushq %r13 +; SSE2-NEXT: pushq %r12 +; SSE2-NEXT: pushq %rbx +; SSE2-NEXT: psllw $4, %xmm1 +; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240] +; SSE2-NEXT: pand %xmm2, %xmm1 +; SSE2-NEXT: movdqa %xmm1, -{{[0-9]+}}(%rsp) +; SSE2-NEXT: psllw $4, %xmm0 +; SSE2-NEXT: pand %xmm2, %xmm0 +; SSE2-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp) +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %r9b +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %sil +; SSE2-NEXT: movl %r9d, %eax +; SSE2-NEXT: subb %cl, %al +; SSE2-NEXT: setns %al +; SSE2-NEXT: subb %cl, %r9b +; SSE2-NEXT: jno .LBB15_2 +; SSE2-NEXT: # %bb.1: +; SSE2-NEXT: addb $127, %al +; SSE2-NEXT: movl %eax, %r9d +; SSE2-NEXT: .LBB15_2: +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSE2-NEXT: movl %esi, %eax +; SSE2-NEXT: subb %cl, %al +; SSE2-NEXT: setns %al +; SSE2-NEXT: subb %cl, %sil +; SSE2-NEXT: jno .LBB15_4 +; SSE2-NEXT: # %bb.3: +; SSE2-NEXT: addb $127, %al +; SSE2-NEXT: movl %eax, %esi +; SSE2-NEXT: .LBB15_4: +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %bl +; SSE2-NEXT: movl %ebx, %eax +; SSE2-NEXT: subb %cl, %al +; SSE2-NEXT: setns %al +; SSE2-NEXT: subb %cl, %bl +; SSE2-NEXT: jno .LBB15_6 +; SSE2-NEXT: # %bb.5: +; SSE2-NEXT: addb $127, %al +; SSE2-NEXT: movl %eax, %ebx +; SSE2-NEXT: .LBB15_6: +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %dl +; SSE2-NEXT: movl %edx, %eax +; SSE2-NEXT: subb %cl, %al +; SSE2-NEXT: setns %al +; SSE2-NEXT: subb %cl, %dl +; SSE2-NEXT: jno .LBB15_8 +; SSE2-NEXT: # %bb.7: +; SSE2-NEXT: addb $127, %al +; SSE2-NEXT: movl %eax, %edx +; SSE2-NEXT: .LBB15_8: +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %r10b +; SSE2-NEXT: movl %r10d, %eax +; SSE2-NEXT: subb %cl, %al +; SSE2-NEXT: setns %al +; SSE2-NEXT: subb %cl, %r10b +; SSE2-NEXT: jno .LBB15_10 +; SSE2-NEXT: # %bb.9: +; SSE2-NEXT: addb $127, %al +; SSE2-NEXT: movl %eax, %r10d +; SSE2-NEXT: .LBB15_10: +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %r11b +; SSE2-NEXT: movl %r11d, %eax +; SSE2-NEXT: subb %cl, %al +; SSE2-NEXT: setns %al +; SSE2-NEXT: subb %cl, %r11b +; SSE2-NEXT: jno .LBB15_12 +; SSE2-NEXT: # %bb.11: +; SSE2-NEXT: addb $127, %al +; SSE2-NEXT: movl %eax, %r11d +; SSE2-NEXT: .LBB15_12: +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %bpl +; SSE2-NEXT: movl %ebp, %eax +; SSE2-NEXT: subb %cl, %al +; SSE2-NEXT: 
setns %al +; SSE2-NEXT: subb %cl, %bpl +; SSE2-NEXT: jno .LBB15_14 +; SSE2-NEXT: # %bb.13: +; SSE2-NEXT: addb $127, %al +; SSE2-NEXT: movl %eax, %ebp +; SSE2-NEXT: .LBB15_14: +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %r14b +; SSE2-NEXT: movl %r14d, %eax +; SSE2-NEXT: subb %cl, %al +; SSE2-NEXT: setns %al +; SSE2-NEXT: subb %cl, %r14b +; SSE2-NEXT: jno .LBB15_16 +; SSE2-NEXT: # %bb.15: +; SSE2-NEXT: addb $127, %al +; SSE2-NEXT: movl %eax, %r14d +; SSE2-NEXT: .LBB15_16: +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %r15b +; SSE2-NEXT: movl %r15d, %eax +; SSE2-NEXT: subb %cl, %al +; SSE2-NEXT: setns %al +; SSE2-NEXT: subb %cl, %r15b +; SSE2-NEXT: jno .LBB15_18 +; SSE2-NEXT: # %bb.17: +; SSE2-NEXT: addb $127, %al +; SSE2-NEXT: movl %eax, %r15d +; SSE2-NEXT: .LBB15_18: +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %r12b +; SSE2-NEXT: movl %r12d, %eax +; SSE2-NEXT: subb %cl, %al +; SSE2-NEXT: setns %al +; SSE2-NEXT: subb %cl, %r12b +; SSE2-NEXT: jno .LBB15_20 +; SSE2-NEXT: # %bb.19: +; SSE2-NEXT: addb $127, %al +; SSE2-NEXT: movl %eax, %r12d +; SSE2-NEXT: .LBB15_20: +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %r13b +; SSE2-NEXT: movl %r13d, %eax +; SSE2-NEXT: subb %cl, %al +; SSE2-NEXT: setns %al +; SSE2-NEXT: subb %cl, %r13b +; SSE2-NEXT: movb %bl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSE2-NEXT: movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSE2-NEXT: jno .LBB15_22 +; SSE2-NEXT: # %bb.21: +; SSE2-NEXT: addb $127, %al +; SSE2-NEXT: movl %eax, %r13d +; SSE2-NEXT: .LBB15_22: +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %dil +; SSE2-NEXT: movl %edi, %eax +; SSE2-NEXT: subb %cl, %al +; SSE2-NEXT: setns %al +; SSE2-NEXT: subb %cl, %dil +; SSE2-NEXT: jno .LBB15_24 +; SSE2-NEXT: # %bb.23: +; SSE2-NEXT: addb $127, %al +; SSE2-NEXT: movl %eax, %edi +; SSE2-NEXT: .LBB15_24: +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %r8b +; SSE2-NEXT: movl %r8d, %eax +; SSE2-NEXT: subb %cl, %al +; SSE2-NEXT: setns %al +; SSE2-NEXT: subb %cl, %r8b +; SSE2-NEXT: jno .LBB15_26 +; SSE2-NEXT: # %bb.25: +; SSE2-NEXT: addb $127, %al +; SSE2-NEXT: movl %eax, %r8d +; SSE2-NEXT: .LBB15_26: +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %bl +; SSE2-NEXT: movl %ebx, %eax +; SSE2-NEXT: subb %cl, %al +; SSE2-NEXT: setns %al +; SSE2-NEXT: subb %cl, %bl +; SSE2-NEXT: jno .LBB15_28 +; SSE2-NEXT: # %bb.27: +; SSE2-NEXT: addb $127, %al +; SSE2-NEXT: movl %eax, %ebx +; SSE2-NEXT: .LBB15_28: +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %dl +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSE2-NEXT: movl %eax, %ecx +; SSE2-NEXT: subb %dl, %cl +; SSE2-NEXT: setns %cl +; SSE2-NEXT: subb %dl, %al +; SSE2-NEXT: movb %sil, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSE2-NEXT: jno .LBB15_30 +; SSE2-NEXT: # %bb.29: +; SSE2-NEXT: addb $127, %cl +; SSE2-NEXT: movl %ecx, %eax +; SSE2-NEXT: .LBB15_30: +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %dl +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %sil +; SSE2-NEXT: movl %esi, %ecx +; SSE2-NEXT: subb %dl, %cl +; SSE2-NEXT: setns %cl +; SSE2-NEXT: subb %dl, %sil +; SSE2-NEXT: jno .LBB15_32 +; SSE2-NEXT: # %bb.31: +; SSE2-NEXT: addb $127, %cl +; SSE2-NEXT: movl %ecx, %esi +; SSE2-NEXT: .LBB15_32: +; SSE2-NEXT: movzbl %sil, %ecx +; SSE2-NEXT: movd %ecx, %xmm0 +; SSE2-NEXT: movzbl %al, %eax +; SSE2-NEXT: movd %eax, %xmm1 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = 
xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; SSE2-NEXT: movzbl %bl, %eax +; SSE2-NEXT: movd %eax, %xmm0 +; SSE2-NEXT: movzbl %r8b, %eax +; SSE2-NEXT: movd %eax, %xmm2 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; SSE2-NEXT: movzbl %dil, %eax +; SSE2-NEXT: movd %eax, %xmm0 +; SSE2-NEXT: movzbl %r13b, %eax +; SSE2-NEXT: movd %eax, %xmm3 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] +; SSE2-NEXT: movzbl %r12b, %eax +; SSE2-NEXT: movd %eax, %xmm0 +; SSE2-NEXT: movzbl %r15b, %eax +; SSE2-NEXT: movd %eax, %xmm1 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; SSE2-NEXT: movzbl %r14b, %eax +; SSE2-NEXT: movd %eax, %xmm0 +; SSE2-NEXT: movzbl %bpl, %eax +; SSE2-NEXT: movd %eax, %xmm2 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] +; SSE2-NEXT: movzbl %r11b, %eax +; SSE2-NEXT: movd %eax, %xmm0 +; SSE2-NEXT: movzbl %r10b, %eax +; SSE2-NEXT: movd %eax, %xmm3 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; SSE2-NEXT: movd %eax, %xmm0 +; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; SSE2-NEXT: movd %eax, %xmm2 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] +; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; SSE2-NEXT: movd %eax, %xmm4 +; SSE2-NEXT: movzbl %r9b, %eax +; SSE2-NEXT: movd %eax, %xmm0 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSE2-NEXT: psrlw $4, %xmm0 +; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 +; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] +; SSE2-NEXT: pxor %xmm1, %xmm0 +; SSE2-NEXT: psubb %xmm1, %xmm0 +; SSE2-NEXT: popq %rbx +; SSE2-NEXT: popq %r12 +; SSE2-NEXT: popq %r13 +; SSE2-NEXT: popq %r14 +; SSE2-NEXT: popq %r15 +; SSE2-NEXT: popq %rbp +; SSE2-NEXT: retq +; +; SSSE3-LABEL: v16i4: +; SSSE3: # %bb.0: +; SSSE3-NEXT: pushq %rbp +; SSSE3-NEXT: pushq %r15 +; SSSE3-NEXT: pushq %r14 +; SSSE3-NEXT: pushq %r13 +; SSSE3-NEXT: pushq %r12 +; SSSE3-NEXT: pushq %rbx +; SSSE3-NEXT: psllw $4, %xmm1 +; SSSE3-NEXT: 
movdqa {{.*#+}} xmm2 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240] +; SSSE3-NEXT: pand %xmm2, %xmm1 +; SSSE3-NEXT: movdqa %xmm1, -{{[0-9]+}}(%rsp) +; SSSE3-NEXT: psllw $4, %xmm0 +; SSSE3-NEXT: pand %xmm2, %xmm0 +; SSSE3-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp) +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %r9b +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %sil +; SSSE3-NEXT: movl %r9d, %eax +; SSSE3-NEXT: subb %cl, %al +; SSSE3-NEXT: setns %al +; SSSE3-NEXT: subb %cl, %r9b +; SSSE3-NEXT: jno .LBB15_2 +; SSSE3-NEXT: # %bb.1: +; SSSE3-NEXT: addb $127, %al +; SSSE3-NEXT: movl %eax, %r9d +; SSSE3-NEXT: .LBB15_2: +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSSE3-NEXT: movl %esi, %eax +; SSSE3-NEXT: subb %cl, %al +; SSSE3-NEXT: setns %al +; SSSE3-NEXT: subb %cl, %sil +; SSSE3-NEXT: jno .LBB15_4 +; SSSE3-NEXT: # %bb.3: +; SSSE3-NEXT: addb $127, %al +; SSSE3-NEXT: movl %eax, %esi +; SSSE3-NEXT: .LBB15_4: +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %bl +; SSSE3-NEXT: movl %ebx, %eax +; SSSE3-NEXT: subb %cl, %al +; SSSE3-NEXT: setns %al +; SSSE3-NEXT: subb %cl, %bl +; SSSE3-NEXT: jno .LBB15_6 +; SSSE3-NEXT: # %bb.5: +; SSSE3-NEXT: addb $127, %al +; SSSE3-NEXT: movl %eax, %ebx +; SSSE3-NEXT: .LBB15_6: +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %dl +; SSSE3-NEXT: movl %edx, %eax +; SSSE3-NEXT: subb %cl, %al +; SSSE3-NEXT: setns %al +; SSSE3-NEXT: subb %cl, %dl +; SSSE3-NEXT: jno .LBB15_8 +; SSSE3-NEXT: # %bb.7: +; SSSE3-NEXT: addb $127, %al +; SSSE3-NEXT: movl %eax, %edx +; SSSE3-NEXT: .LBB15_8: +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %r10b +; SSSE3-NEXT: movl %r10d, %eax +; SSSE3-NEXT: subb %cl, %al +; SSSE3-NEXT: setns %al +; SSSE3-NEXT: subb %cl, %r10b +; SSSE3-NEXT: jno .LBB15_10 +; SSSE3-NEXT: # %bb.9: +; SSSE3-NEXT: addb $127, %al +; SSSE3-NEXT: movl %eax, %r10d +; SSSE3-NEXT: .LBB15_10: +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %r11b +; SSSE3-NEXT: movl %r11d, %eax +; SSSE3-NEXT: subb %cl, %al +; SSSE3-NEXT: setns %al +; SSSE3-NEXT: subb %cl, %r11b +; SSSE3-NEXT: jno .LBB15_12 +; SSSE3-NEXT: # %bb.11: +; SSSE3-NEXT: addb $127, %al +; SSSE3-NEXT: movl %eax, %r11d +; SSSE3-NEXT: .LBB15_12: +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %bpl +; SSSE3-NEXT: movl %ebp, %eax +; SSSE3-NEXT: subb %cl, %al +; SSSE3-NEXT: setns %al +; SSSE3-NEXT: subb %cl, %bpl +; SSSE3-NEXT: jno .LBB15_14 +; SSSE3-NEXT: # %bb.13: +; SSSE3-NEXT: addb $127, %al +; SSSE3-NEXT: movl %eax, %ebp +; SSSE3-NEXT: .LBB15_14: +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %r14b +; SSSE3-NEXT: movl %r14d, %eax +; SSSE3-NEXT: subb %cl, %al +; SSSE3-NEXT: setns %al +; SSSE3-NEXT: subb %cl, %r14b +; SSSE3-NEXT: jno .LBB15_16 +; SSSE3-NEXT: # %bb.15: +; SSSE3-NEXT: addb $127, %al +; SSSE3-NEXT: movl %eax, %r14d +; SSSE3-NEXT: .LBB15_16: +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %r15b +; SSSE3-NEXT: movl %r15d, %eax +; SSSE3-NEXT: subb %cl, %al +; SSSE3-NEXT: setns %al +; SSSE3-NEXT: subb %cl, %r15b +; SSSE3-NEXT: jno .LBB15_18 +; SSSE3-NEXT: # %bb.17: +; SSSE3-NEXT: addb $127, %al +; SSSE3-NEXT: movl %eax, %r15d +; SSSE3-NEXT: .LBB15_18: +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %r12b +; SSSE3-NEXT: movl %r12d, %eax +; SSSE3-NEXT: subb %cl, %al +; SSSE3-NEXT: setns %al +; SSSE3-NEXT: 
subb %cl, %r12b +; SSSE3-NEXT: jno .LBB15_20 +; SSSE3-NEXT: # %bb.19: +; SSSE3-NEXT: addb $127, %al +; SSSE3-NEXT: movl %eax, %r12d +; SSSE3-NEXT: .LBB15_20: +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %r13b +; SSSE3-NEXT: movl %r13d, %eax +; SSSE3-NEXT: subb %cl, %al +; SSSE3-NEXT: setns %al +; SSSE3-NEXT: subb %cl, %r13b +; SSSE3-NEXT: movb %bl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSSE3-NEXT: movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSSE3-NEXT: jno .LBB15_22 +; SSSE3-NEXT: # %bb.21: +; SSSE3-NEXT: addb $127, %al +; SSSE3-NEXT: movl %eax, %r13d +; SSSE3-NEXT: .LBB15_22: +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %dil +; SSSE3-NEXT: movl %edi, %eax +; SSSE3-NEXT: subb %cl, %al +; SSSE3-NEXT: setns %al +; SSSE3-NEXT: subb %cl, %dil +; SSSE3-NEXT: jno .LBB15_24 +; SSSE3-NEXT: # %bb.23: +; SSSE3-NEXT: addb $127, %al +; SSSE3-NEXT: movl %eax, %edi +; SSSE3-NEXT: .LBB15_24: +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %r8b +; SSSE3-NEXT: movl %r8d, %eax +; SSSE3-NEXT: subb %cl, %al +; SSSE3-NEXT: setns %al +; SSSE3-NEXT: subb %cl, %r8b +; SSSE3-NEXT: jno .LBB15_26 +; SSSE3-NEXT: # %bb.25: +; SSSE3-NEXT: addb $127, %al +; SSSE3-NEXT: movl %eax, %r8d +; SSSE3-NEXT: .LBB15_26: +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %bl +; SSSE3-NEXT: movl %ebx, %eax +; SSSE3-NEXT: subb %cl, %al +; SSSE3-NEXT: setns %al +; SSSE3-NEXT: subb %cl, %bl +; SSSE3-NEXT: jno .LBB15_28 +; SSSE3-NEXT: # %bb.27: +; SSSE3-NEXT: addb $127, %al +; SSSE3-NEXT: movl %eax, %ebx +; SSSE3-NEXT: .LBB15_28: +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %dl +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSSE3-NEXT: movl %eax, %ecx +; SSSE3-NEXT: subb %dl, %cl +; SSSE3-NEXT: setns %cl +; SSSE3-NEXT: subb %dl, %al +; SSSE3-NEXT: movb %sil, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSSE3-NEXT: jno .LBB15_30 +; SSSE3-NEXT: # %bb.29: +; SSSE3-NEXT: addb $127, %cl +; SSSE3-NEXT: movl %ecx, %eax +; SSSE3-NEXT: .LBB15_30: +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %dl +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %sil +; SSSE3-NEXT: movl %esi, %ecx +; SSSE3-NEXT: subb %dl, %cl +; SSSE3-NEXT: setns %cl +; SSSE3-NEXT: subb %dl, %sil +; SSSE3-NEXT: jno .LBB15_32 +; SSSE3-NEXT: # %bb.31: +; SSSE3-NEXT: addb $127, %cl +; SSSE3-NEXT: movl %ecx, %esi +; SSSE3-NEXT: .LBB15_32: +; SSSE3-NEXT: movzbl %sil, %ecx +; SSSE3-NEXT: movd %ecx, %xmm0 +; SSSE3-NEXT: movzbl %al, %eax +; SSSE3-NEXT: movd %eax, %xmm1 +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; SSSE3-NEXT: movzbl %bl, %eax +; SSSE3-NEXT: movd %eax, %xmm0 +; SSSE3-NEXT: movzbl %r8b, %eax +; SSSE3-NEXT: movd %eax, %xmm2 +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; SSSE3-NEXT: movzbl %dil, %eax +; SSSE3-NEXT: movd %eax, %xmm0 +; SSSE3-NEXT: movzbl %r13b, %eax +; SSSE3-NEXT: movd %eax, %xmm3 +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] +; SSSE3-NEXT: movzbl %r12b, %eax +; SSSE3-NEXT: movd %eax, %xmm0 +; SSSE3-NEXT: movzbl %r15b, %eax +; SSSE3-NEXT: movd %eax, %xmm1 +; 
SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3] +; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; SSSE3-NEXT: movzbl %r14b, %eax +; SSSE3-NEXT: movd %eax, %xmm0 +; SSSE3-NEXT: movzbl %bpl, %eax +; SSSE3-NEXT: movd %eax, %xmm2 +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] +; SSSE3-NEXT: movzbl %r11b, %eax +; SSSE3-NEXT: movd %eax, %xmm0 +; SSSE3-NEXT: movzbl %r10b, %eax +; SSSE3-NEXT: movd %eax, %xmm3 +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; SSSE3-NEXT: movd %eax, %xmm0 +; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; SSSE3-NEXT: movd %eax, %xmm2 +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] +; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; SSSE3-NEXT: movd %eax, %xmm4 +; SSSE3-NEXT: movzbl %r9b, %eax +; SSSE3-NEXT: movd %eax, %xmm0 +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7] +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] +; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSSE3-NEXT: psrlw $4, %xmm0 +; SSSE3-NEXT: pand {{.*}}(%rip), %xmm0 +; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] +; SSSE3-NEXT: pxor %xmm1, %xmm0 +; SSSE3-NEXT: psubb %xmm1, %xmm0 +; SSSE3-NEXT: popq %rbx +; SSSE3-NEXT: popq %r12 +; SSSE3-NEXT: popq %r13 +; SSSE3-NEXT: popq %r14 +; SSSE3-NEXT: popq %r15 +; SSSE3-NEXT: popq %rbp +; SSSE3-NEXT: retq +; +; SSE41-LABEL: v16i4: +; SSE41: # %bb.0: +; SSE41-NEXT: psllw $4, %xmm1 +; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240] +; SSE41-NEXT: pand %xmm2, %xmm1 +; SSE41-NEXT: pextrb $15, %xmm1, %ecx +; SSE41-NEXT: psllw $4, %xmm0 +; SSE41-NEXT: pand %xmm2, %xmm0 +; SSE41-NEXT: pextrb $15, %xmm0, %edx +; SSE41-NEXT: movl %edx, %eax +; SSE41-NEXT: subb %cl, %al +; SSE41-NEXT: setns %al +; SSE41-NEXT: subb %cl, %dl +; SSE41-NEXT: jno .LBB15_2 +; SSE41-NEXT: # %bb.1: +; SSE41-NEXT: addb $127, %al +; SSE41-NEXT: movl %eax, %edx +; SSE41-NEXT: .LBB15_2: +; SSE41-NEXT: pextrb $14, %xmm1, %ecx +; SSE41-NEXT: pextrb $14, %xmm0, %r11d +; SSE41-NEXT: movl %r11d, %eax +; SSE41-NEXT: subb %cl, %al +; SSE41-NEXT: setns %al +; SSE41-NEXT: subb %cl, %r11b +; SSE41-NEXT: jno .LBB15_4 +; SSE41-NEXT: # %bb.3: +; SSE41-NEXT: addb $127, %al +; SSE41-NEXT: movl %eax, %r11d +; SSE41-NEXT: .LBB15_4: +; SSE41-NEXT: pextrb $13, %xmm1, %ecx +; SSE41-NEXT: pextrb $13, %xmm0, %edi +; SSE41-NEXT: movl %edi, %eax +; SSE41-NEXT: subb %cl, %al +; SSE41-NEXT: setns %al +; SSE41-NEXT: subb %cl, %dil +; SSE41-NEXT: jno 
.LBB15_6 +; SSE41-NEXT: # %bb.5: +; SSE41-NEXT: addb $127, %al +; SSE41-NEXT: movl %eax, %edi +; SSE41-NEXT: .LBB15_6: +; SSE41-NEXT: pushq %rbp +; SSE41-NEXT: pushq %r15 +; SSE41-NEXT: pushq %r14 +; SSE41-NEXT: pushq %r13 +; SSE41-NEXT: pushq %r12 +; SSE41-NEXT: pushq %rbx +; SSE41-NEXT: pextrb $12, %xmm1, %ecx +; SSE41-NEXT: pextrb $12, %xmm0, %r14d +; SSE41-NEXT: movl %r14d, %eax +; SSE41-NEXT: subb %cl, %al +; SSE41-NEXT: setns %al +; SSE41-NEXT: subb %cl, %r14b +; SSE41-NEXT: jno .LBB15_8 +; SSE41-NEXT: # %bb.7: +; SSE41-NEXT: addb $127, %al +; SSE41-NEXT: movl %eax, %r14d +; SSE41-NEXT: .LBB15_8: +; SSE41-NEXT: pextrb $11, %xmm1, %ecx +; SSE41-NEXT: pextrb $11, %xmm0, %ebp +; SSE41-NEXT: movl %ebp, %eax +; SSE41-NEXT: subb %cl, %al +; SSE41-NEXT: setns %al +; SSE41-NEXT: subb %cl, %bpl +; SSE41-NEXT: jno .LBB15_10 +; SSE41-NEXT: # %bb.9: +; SSE41-NEXT: addb $127, %al +; SSE41-NEXT: movl %eax, %ebp +; SSE41-NEXT: .LBB15_10: +; SSE41-NEXT: pextrb $10, %xmm1, %ecx +; SSE41-NEXT: pextrb $10, %xmm0, %r15d +; SSE41-NEXT: movl %r15d, %eax +; SSE41-NEXT: subb %cl, %al +; SSE41-NEXT: setns %al +; SSE41-NEXT: subb %cl, %r15b +; SSE41-NEXT: jno .LBB15_12 +; SSE41-NEXT: # %bb.11: +; SSE41-NEXT: addb $127, %al +; SSE41-NEXT: movl %eax, %r15d +; SSE41-NEXT: .LBB15_12: +; SSE41-NEXT: pextrb $9, %xmm1, %ecx +; SSE41-NEXT: pextrb $9, %xmm0, %r12d +; SSE41-NEXT: movl %r12d, %eax +; SSE41-NEXT: subb %cl, %al +; SSE41-NEXT: setns %al +; SSE41-NEXT: subb %cl, %r12b +; SSE41-NEXT: jno .LBB15_14 +; SSE41-NEXT: # %bb.13: +; SSE41-NEXT: addb $127, %al +; SSE41-NEXT: movl %eax, %r12d +; SSE41-NEXT: .LBB15_14: +; SSE41-NEXT: pextrb $8, %xmm1, %ecx +; SSE41-NEXT: pextrb $8, %xmm0, %r13d +; SSE41-NEXT: movl %r13d, %eax +; SSE41-NEXT: subb %cl, %al +; SSE41-NEXT: setns %al +; SSE41-NEXT: subb %cl, %r13b +; SSE41-NEXT: jno .LBB15_16 +; SSE41-NEXT: # %bb.15: +; SSE41-NEXT: addb $127, %al +; SSE41-NEXT: movl %eax, %r13d +; SSE41-NEXT: .LBB15_16: +; SSE41-NEXT: pextrb $7, %xmm1, %ecx +; SSE41-NEXT: pextrb $7, %xmm0, %r10d +; SSE41-NEXT: movl %r10d, %eax +; SSE41-NEXT: subb %cl, %al +; SSE41-NEXT: setns %al +; SSE41-NEXT: subb %cl, %r10b +; SSE41-NEXT: movl %ebp, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE41-NEXT: jno .LBB15_18 +; SSE41-NEXT: # %bb.17: +; SSE41-NEXT: addb $127, %al +; SSE41-NEXT: movl %eax, %r10d +; SSE41-NEXT: .LBB15_18: +; SSE41-NEXT: pextrb $6, %xmm1, %ecx +; SSE41-NEXT: pextrb $6, %xmm0, %r9d +; SSE41-NEXT: movl %r9d, %eax +; SSE41-NEXT: subb %cl, %al +; SSE41-NEXT: setns %al +; SSE41-NEXT: subb %cl, %r9b +; SSE41-NEXT: jno .LBB15_20 +; SSE41-NEXT: # %bb.19: +; SSE41-NEXT: addb $127, %al +; SSE41-NEXT: movl %eax, %r9d +; SSE41-NEXT: .LBB15_20: +; SSE41-NEXT: pextrb $5, %xmm1, %ecx +; SSE41-NEXT: pextrb $5, %xmm0, %ebp +; SSE41-NEXT: movl %ebp, %eax +; SSE41-NEXT: subb %cl, %al +; SSE41-NEXT: setns %al +; SSE41-NEXT: subb %cl, %bpl +; SSE41-NEXT: movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE41-NEXT: jno .LBB15_22 +; SSE41-NEXT: # %bb.21: +; SSE41-NEXT: addb $127, %al +; SSE41-NEXT: movl %eax, %ebp +; SSE41-NEXT: .LBB15_22: +; SSE41-NEXT: pextrb $4, %xmm1, %ecx +; SSE41-NEXT: pextrb $4, %xmm0, %edi +; SSE41-NEXT: movl %edi, %eax +; SSE41-NEXT: subb %cl, %al +; SSE41-NEXT: setns %al +; SSE41-NEXT: subb %cl, %dil +; SSE41-NEXT: movl %edx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE41-NEXT: jno .LBB15_24 +; SSE41-NEXT: # %bb.23: +; SSE41-NEXT: addb $127, %al +; SSE41-NEXT: movl %eax, %edi +; SSE41-NEXT: .LBB15_24: +; SSE41-NEXT: pextrb $3, %xmm1, %edx +; SSE41-NEXT: pextrb $3, %xmm0, 
%eax +; SSE41-NEXT: movl %eax, %ecx +; SSE41-NEXT: subb %dl, %cl +; SSE41-NEXT: setns %cl +; SSE41-NEXT: subb %dl, %al +; SSE41-NEXT: jno .LBB15_26 +; SSE41-NEXT: # %bb.25: +; SSE41-NEXT: addb $127, %cl +; SSE41-NEXT: movl %ecx, %eax +; SSE41-NEXT: .LBB15_26: +; SSE41-NEXT: pextrb $2, %xmm1, %ebx +; SSE41-NEXT: pextrb $2, %xmm0, %ecx +; SSE41-NEXT: movl %ecx, %edx +; SSE41-NEXT: subb %bl, %dl +; SSE41-NEXT: setns %dl +; SSE41-NEXT: subb %bl, %cl +; SSE41-NEXT: jno .LBB15_28 +; SSE41-NEXT: # %bb.27: +; SSE41-NEXT: addb $127, %dl +; SSE41-NEXT: movl %edx, %ecx +; SSE41-NEXT: .LBB15_28: +; SSE41-NEXT: pextrb $0, %xmm1, %esi +; SSE41-NEXT: pextrb $0, %xmm0, %edx +; SSE41-NEXT: movl %edx, %ebx +; SSE41-NEXT: subb %sil, %bl +; SSE41-NEXT: setns %bl +; SSE41-NEXT: subb %sil, %dl +; SSE41-NEXT: jno .LBB15_30 +; SSE41-NEXT: # %bb.29: +; SSE41-NEXT: addb $127, %bl +; SSE41-NEXT: movl %ebx, %edx +; SSE41-NEXT: .LBB15_30: +; SSE41-NEXT: pextrb $1, %xmm1, %esi +; SSE41-NEXT: pextrb $1, %xmm0, %r8d +; SSE41-NEXT: movl %r8d, %ebx +; SSE41-NEXT: subb %sil, %bl +; SSE41-NEXT: setns %bl +; SSE41-NEXT: subb %sil, %r8b +; SSE41-NEXT: jno .LBB15_32 +; SSE41-NEXT: # %bb.31: +; SSE41-NEXT: addb $127, %bl +; SSE41-NEXT: movl %ebx, %r8d +; SSE41-NEXT: .LBB15_32: +; SSE41-NEXT: movzbl %dl, %edx +; SSE41-NEXT: movd %edx, %xmm0 +; SSE41-NEXT: movzbl %r8b, %edx +; SSE41-NEXT: pinsrb $1, %edx, %xmm0 +; SSE41-NEXT: movzbl %cl, %ecx +; SSE41-NEXT: pinsrb $2, %ecx, %xmm0 +; SSE41-NEXT: movzbl %al, %eax +; SSE41-NEXT: pinsrb $3, %eax, %xmm0 +; SSE41-NEXT: movzbl %dil, %eax +; SSE41-NEXT: pinsrb $4, %eax, %xmm0 +; SSE41-NEXT: movzbl %bpl, %eax +; SSE41-NEXT: pinsrb $5, %eax, %xmm0 +; SSE41-NEXT: movzbl %r9b, %eax +; SSE41-NEXT: pinsrb $6, %eax, %xmm0 +; SSE41-NEXT: movzbl %r10b, %eax +; SSE41-NEXT: pinsrb $7, %eax, %xmm0 +; SSE41-NEXT: movzbl %r13b, %eax +; SSE41-NEXT: pinsrb $8, %eax, %xmm0 +; SSE41-NEXT: movzbl %r12b, %eax +; SSE41-NEXT: pinsrb $9, %eax, %xmm0 +; SSE41-NEXT: movzbl %r15b, %eax +; SSE41-NEXT: pinsrb $10, %eax, %xmm0 +; SSE41-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; SSE41-NEXT: pinsrb $11, %eax, %xmm0 +; SSE41-NEXT: movzbl %r14b, %eax +; SSE41-NEXT: pinsrb $12, %eax, %xmm0 +; SSE41-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; SSE41-NEXT: pinsrb $13, %eax, %xmm0 +; SSE41-NEXT: movzbl %r11b, %eax +; SSE41-NEXT: pinsrb $14, %eax, %xmm0 +; SSE41-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; SSE41-NEXT: pinsrb $15, %eax, %xmm0 +; SSE41-NEXT: psrlw $4, %xmm0 +; SSE41-NEXT: pand {{.*}}(%rip), %xmm0 +; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] +; SSE41-NEXT: pxor %xmm1, %xmm0 +; SSE41-NEXT: psubb %xmm1, %xmm0 +; SSE41-NEXT: popq %rbx +; SSE41-NEXT: popq %r12 +; SSE41-NEXT: popq %r13 +; SSE41-NEXT: popq %r14 +; SSE41-NEXT: popq %r15 +; SSE41-NEXT: popq %rbp +; SSE41-NEXT: retq +; +; AVX-LABEL: v16i4: +; AVX: # %bb.0: +; AVX-NEXT: vpsllw $4, %xmm1, %xmm1 +; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240] +; AVX-NEXT: vpand %xmm2, %xmm1, %xmm1 +; AVX-NEXT: vpextrb $15, %xmm1, %ecx +; AVX-NEXT: vpsllw $4, %xmm0, %xmm0 +; AVX-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX-NEXT: vpextrb $15, %xmm0, %edx +; AVX-NEXT: movl %edx, %eax +; AVX-NEXT: subb %cl, %al +; AVX-NEXT: setns %al +; AVX-NEXT: subb %cl, %dl +; AVX-NEXT: jno .LBB15_2 +; AVX-NEXT: # %bb.1: +; AVX-NEXT: addb $127, %al +; AVX-NEXT: movl %eax, %edx +; AVX-NEXT: .LBB15_2: +; AVX-NEXT: vpextrb $14, %xmm1, %ecx +; 
AVX-NEXT: vpextrb $14, %xmm0, %r11d +; AVX-NEXT: movl %r11d, %eax +; AVX-NEXT: subb %cl, %al +; AVX-NEXT: setns %al +; AVX-NEXT: subb %cl, %r11b +; AVX-NEXT: jno .LBB15_4 +; AVX-NEXT: # %bb.3: +; AVX-NEXT: addb $127, %al +; AVX-NEXT: movl %eax, %r11d +; AVX-NEXT: .LBB15_4: +; AVX-NEXT: vpextrb $13, %xmm1, %ecx +; AVX-NEXT: vpextrb $13, %xmm0, %edi +; AVX-NEXT: movl %edi, %eax +; AVX-NEXT: subb %cl, %al +; AVX-NEXT: setns %al +; AVX-NEXT: subb %cl, %dil +; AVX-NEXT: jno .LBB15_6 +; AVX-NEXT: # %bb.5: +; AVX-NEXT: addb $127, %al +; AVX-NEXT: movl %eax, %edi +; AVX-NEXT: .LBB15_6: +; AVX-NEXT: pushq %rbp +; AVX-NEXT: pushq %r15 +; AVX-NEXT: pushq %r14 +; AVX-NEXT: pushq %r13 +; AVX-NEXT: pushq %r12 +; AVX-NEXT: pushq %rbx +; AVX-NEXT: vpextrb $12, %xmm1, %ecx +; AVX-NEXT: vpextrb $12, %xmm0, %r14d +; AVX-NEXT: movl %r14d, %eax +; AVX-NEXT: subb %cl, %al +; AVX-NEXT: setns %al +; AVX-NEXT: subb %cl, %r14b +; AVX-NEXT: jno .LBB15_8 +; AVX-NEXT: # %bb.7: +; AVX-NEXT: addb $127, %al +; AVX-NEXT: movl %eax, %r14d +; AVX-NEXT: .LBB15_8: +; AVX-NEXT: vpextrb $11, %xmm1, %ecx +; AVX-NEXT: vpextrb $11, %xmm0, %ebp +; AVX-NEXT: movl %ebp, %eax +; AVX-NEXT: subb %cl, %al +; AVX-NEXT: setns %al +; AVX-NEXT: subb %cl, %bpl +; AVX-NEXT: jno .LBB15_10 +; AVX-NEXT: # %bb.9: +; AVX-NEXT: addb $127, %al +; AVX-NEXT: movl %eax, %ebp +; AVX-NEXT: .LBB15_10: +; AVX-NEXT: vpextrb $10, %xmm1, %ecx +; AVX-NEXT: vpextrb $10, %xmm0, %r15d +; AVX-NEXT: movl %r15d, %eax +; AVX-NEXT: subb %cl, %al +; AVX-NEXT: setns %al +; AVX-NEXT: subb %cl, %r15b +; AVX-NEXT: jno .LBB15_12 +; AVX-NEXT: # %bb.11: +; AVX-NEXT: addb $127, %al +; AVX-NEXT: movl %eax, %r15d +; AVX-NEXT: .LBB15_12: +; AVX-NEXT: vpextrb $9, %xmm1, %ecx +; AVX-NEXT: vpextrb $9, %xmm0, %r12d +; AVX-NEXT: movl %r12d, %eax +; AVX-NEXT: subb %cl, %al +; AVX-NEXT: setns %al +; AVX-NEXT: subb %cl, %r12b +; AVX-NEXT: jno .LBB15_14 +; AVX-NEXT: # %bb.13: +; AVX-NEXT: addb $127, %al +; AVX-NEXT: movl %eax, %r12d +; AVX-NEXT: .LBB15_14: +; AVX-NEXT: vpextrb $8, %xmm1, %ecx +; AVX-NEXT: vpextrb $8, %xmm0, %r13d +; AVX-NEXT: movl %r13d, %eax +; AVX-NEXT: subb %cl, %al +; AVX-NEXT: setns %al +; AVX-NEXT: subb %cl, %r13b +; AVX-NEXT: jno .LBB15_16 +; AVX-NEXT: # %bb.15: +; AVX-NEXT: addb $127, %al +; AVX-NEXT: movl %eax, %r13d +; AVX-NEXT: .LBB15_16: +; AVX-NEXT: vpextrb $7, %xmm1, %ecx +; AVX-NEXT: vpextrb $7, %xmm0, %r10d +; AVX-NEXT: movl %r10d, %eax +; AVX-NEXT: subb %cl, %al +; AVX-NEXT: setns %al +; AVX-NEXT: subb %cl, %r10b +; AVX-NEXT: movl %ebp, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX-NEXT: jno .LBB15_18 +; AVX-NEXT: # %bb.17: +; AVX-NEXT: addb $127, %al +; AVX-NEXT: movl %eax, %r10d +; AVX-NEXT: .LBB15_18: +; AVX-NEXT: vpextrb $6, %xmm1, %ecx +; AVX-NEXT: vpextrb $6, %xmm0, %r9d +; AVX-NEXT: movl %r9d, %eax +; AVX-NEXT: subb %cl, %al +; AVX-NEXT: setns %al +; AVX-NEXT: subb %cl, %r9b +; AVX-NEXT: jno .LBB15_20 +; AVX-NEXT: # %bb.19: +; AVX-NEXT: addb $127, %al +; AVX-NEXT: movl %eax, %r9d +; AVX-NEXT: .LBB15_20: +; AVX-NEXT: vpextrb $5, %xmm1, %ecx +; AVX-NEXT: vpextrb $5, %xmm0, %ebp +; AVX-NEXT: movl %ebp, %eax +; AVX-NEXT: subb %cl, %al +; AVX-NEXT: setns %al +; AVX-NEXT: subb %cl, %bpl +; AVX-NEXT: movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX-NEXT: jno .LBB15_22 +; AVX-NEXT: # %bb.21: +; AVX-NEXT: addb $127, %al +; AVX-NEXT: movl %eax, %ebp +; AVX-NEXT: .LBB15_22: +; AVX-NEXT: vpextrb $4, %xmm1, %ecx +; AVX-NEXT: vpextrb $4, %xmm0, %edi +; AVX-NEXT: movl %edi, %eax +; AVX-NEXT: subb %cl, %al +; AVX-NEXT: setns %al +; AVX-NEXT: subb %cl, 
%dil +; AVX-NEXT: movl %edx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX-NEXT: jno .LBB15_24 +; AVX-NEXT: # %bb.23: +; AVX-NEXT: addb $127, %al +; AVX-NEXT: movl %eax, %edi +; AVX-NEXT: .LBB15_24: +; AVX-NEXT: vpextrb $3, %xmm1, %edx +; AVX-NEXT: vpextrb $3, %xmm0, %eax +; AVX-NEXT: movl %eax, %ecx +; AVX-NEXT: subb %dl, %cl +; AVX-NEXT: setns %cl +; AVX-NEXT: subb %dl, %al +; AVX-NEXT: jno .LBB15_26 +; AVX-NEXT: # %bb.25: +; AVX-NEXT: addb $127, %cl +; AVX-NEXT: movl %ecx, %eax +; AVX-NEXT: .LBB15_26: +; AVX-NEXT: vpextrb $2, %xmm1, %ebx +; AVX-NEXT: vpextrb $2, %xmm0, %ecx +; AVX-NEXT: movl %ecx, %edx +; AVX-NEXT: subb %bl, %dl +; AVX-NEXT: setns %dl +; AVX-NEXT: subb %bl, %cl +; AVX-NEXT: jno .LBB15_28 +; AVX-NEXT: # %bb.27: +; AVX-NEXT: addb $127, %dl +; AVX-NEXT: movl %edx, %ecx +; AVX-NEXT: .LBB15_28: +; AVX-NEXT: vpextrb $0, %xmm1, %esi +; AVX-NEXT: vpextrb $0, %xmm0, %edx +; AVX-NEXT: movl %edx, %ebx +; AVX-NEXT: subb %sil, %bl +; AVX-NEXT: setns %bl +; AVX-NEXT: subb %sil, %dl +; AVX-NEXT: jno .LBB15_30 +; AVX-NEXT: # %bb.29: +; AVX-NEXT: addb $127, %bl +; AVX-NEXT: movl %ebx, %edx +; AVX-NEXT: .LBB15_30: +; AVX-NEXT: vpextrb $1, %xmm1, %esi +; AVX-NEXT: vpextrb $1, %xmm0, %r8d +; AVX-NEXT: movl %r8d, %ebx +; AVX-NEXT: subb %sil, %bl +; AVX-NEXT: setns %bl +; AVX-NEXT: subb %sil, %r8b +; AVX-NEXT: jno .LBB15_32 +; AVX-NEXT: # %bb.31: +; AVX-NEXT: addb $127, %bl +; AVX-NEXT: movl %ebx, %r8d +; AVX-NEXT: .LBB15_32: +; AVX-NEXT: movzbl %dl, %edx +; AVX-NEXT: vmovd %edx, %xmm0 +; AVX-NEXT: movzbl %r8b, %edx +; AVX-NEXT: vpinsrb $1, %edx, %xmm0, %xmm0 +; AVX-NEXT: movzbl %cl, %ecx +; AVX-NEXT: vpinsrb $2, %ecx, %xmm0, %xmm0 +; AVX-NEXT: movzbl %al, %eax +; AVX-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0 +; AVX-NEXT: movzbl %dil, %eax +; AVX-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 +; AVX-NEXT: movzbl %bpl, %eax +; AVX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; AVX-NEXT: movzbl %r9b, %eax +; AVX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 +; AVX-NEXT: movzbl %r10b, %eax +; AVX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; AVX-NEXT: movzbl %r13b, %eax +; AVX-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0 +; AVX-NEXT: movzbl %r12b, %eax +; AVX-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0 +; AVX-NEXT: movzbl %r15b, %eax +; AVX-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0 +; AVX-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; AVX-NEXT: movzbl %r14b, %eax +; AVX-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0 +; AVX-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0 +; AVX-NEXT: movzbl %r11b, %eax +; AVX-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0 +; AVX-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; AVX-NEXT: vpsrlw $4, %xmm0, %xmm0 +; AVX-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 +; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] +; AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpsubb %xmm1, %xmm0, %xmm0 +; AVX-NEXT: popq %rbx +; AVX-NEXT: popq %r12 +; AVX-NEXT: popq %r13 +; AVX-NEXT: popq %r14 +; AVX-NEXT: popq %r15 +; AVX-NEXT: popq %rbp +; AVX-NEXT: retq + %z = call <16 x i4> @llvm.ssub.sat.v16i4(<16 x i4> %x, <16 x i4> %y) + ret <16 x i4> %z +} + +define <16 x i1> @v16i1(<16 x i1> %x, <16 x i1> %y) nounwind { +; SSE2-LABEL: v16i1: +; SSE2: # %bb.0: +; SSE2-NEXT: pushq %rbp +; SSE2-NEXT: pushq %r15 +; SSE2-NEXT: pushq %r14 +; SSE2-NEXT: pushq %r13 +; SSE2-NEXT: pushq %r12 +; SSE2-NEXT: pushq %rbx +; SSE2-NEXT: psllw $7, %xmm1 +; 
SSE2-NEXT: movdqa {{.*#+}} xmm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128] +; SSE2-NEXT: pand %xmm2, %xmm1 +; SSE2-NEXT: movdqa %xmm1, -{{[0-9]+}}(%rsp) +; SSE2-NEXT: psllw $7, %xmm0 +; SSE2-NEXT: pand %xmm2, %xmm0 +; SSE2-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp) +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %r9b +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %sil +; SSE2-NEXT: movl %r9d, %eax +; SSE2-NEXT: subb %cl, %al +; SSE2-NEXT: setns %al +; SSE2-NEXT: subb %cl, %r9b +; SSE2-NEXT: jno .LBB16_2 +; SSE2-NEXT: # %bb.1: +; SSE2-NEXT: addb $127, %al +; SSE2-NEXT: movl %eax, %r9d +; SSE2-NEXT: .LBB16_2: +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSE2-NEXT: movl %esi, %eax +; SSE2-NEXT: subb %cl, %al +; SSE2-NEXT: setns %al +; SSE2-NEXT: subb %cl, %sil +; SSE2-NEXT: jno .LBB16_4 +; SSE2-NEXT: # %bb.3: +; SSE2-NEXT: addb $127, %al +; SSE2-NEXT: movl %eax, %esi +; SSE2-NEXT: .LBB16_4: +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %bl +; SSE2-NEXT: movl %ebx, %eax +; SSE2-NEXT: subb %cl, %al +; SSE2-NEXT: setns %al +; SSE2-NEXT: subb %cl, %bl +; SSE2-NEXT: jno .LBB16_6 +; SSE2-NEXT: # %bb.5: +; SSE2-NEXT: addb $127, %al +; SSE2-NEXT: movl %eax, %ebx +; SSE2-NEXT: .LBB16_6: +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %dl +; SSE2-NEXT: movl %edx, %eax +; SSE2-NEXT: subb %cl, %al +; SSE2-NEXT: setns %al +; SSE2-NEXT: subb %cl, %dl +; SSE2-NEXT: jno .LBB16_8 +; SSE2-NEXT: # %bb.7: +; SSE2-NEXT: addb $127, %al +; SSE2-NEXT: movl %eax, %edx +; SSE2-NEXT: .LBB16_8: +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %r10b +; SSE2-NEXT: movl %r10d, %eax +; SSE2-NEXT: subb %cl, %al +; SSE2-NEXT: setns %al +; SSE2-NEXT: subb %cl, %r10b +; SSE2-NEXT: jno .LBB16_10 +; SSE2-NEXT: # %bb.9: +; SSE2-NEXT: addb $127, %al +; SSE2-NEXT: movl %eax, %r10d +; SSE2-NEXT: .LBB16_10: +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %r11b +; SSE2-NEXT: movl %r11d, %eax +; SSE2-NEXT: subb %cl, %al +; SSE2-NEXT: setns %al +; SSE2-NEXT: subb %cl, %r11b +; SSE2-NEXT: jno .LBB16_12 +; SSE2-NEXT: # %bb.11: +; SSE2-NEXT: addb $127, %al +; SSE2-NEXT: movl %eax, %r11d +; SSE2-NEXT: .LBB16_12: +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %bpl +; SSE2-NEXT: movl %ebp, %eax +; SSE2-NEXT: subb %cl, %al +; SSE2-NEXT: setns %al +; SSE2-NEXT: subb %cl, %bpl +; SSE2-NEXT: jno .LBB16_14 +; SSE2-NEXT: # %bb.13: +; SSE2-NEXT: addb $127, %al +; SSE2-NEXT: movl %eax, %ebp +; SSE2-NEXT: .LBB16_14: +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %r14b +; SSE2-NEXT: movl %r14d, %eax +; SSE2-NEXT: subb %cl, %al +; SSE2-NEXT: setns %al +; SSE2-NEXT: subb %cl, %r14b +; SSE2-NEXT: jno .LBB16_16 +; SSE2-NEXT: # %bb.15: +; SSE2-NEXT: addb $127, %al +; SSE2-NEXT: movl %eax, %r14d +; SSE2-NEXT: .LBB16_16: +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %r15b +; SSE2-NEXT: movl %r15d, %eax +; SSE2-NEXT: subb %cl, %al +; SSE2-NEXT: setns %al +; SSE2-NEXT: subb %cl, %r15b +; SSE2-NEXT: jno .LBB16_18 +; SSE2-NEXT: # %bb.17: +; SSE2-NEXT: addb $127, %al +; SSE2-NEXT: movl %eax, %r15d +; SSE2-NEXT: .LBB16_18: +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %r12b +; SSE2-NEXT: movl %r12d, %eax +; SSE2-NEXT: subb %cl, %al +; SSE2-NEXT: setns %al +; SSE2-NEXT: subb %cl, %r12b +; SSE2-NEXT: jno .LBB16_20 +; SSE2-NEXT: # %bb.19: +; SSE2-NEXT: addb $127, %al 
+; SSE2-NEXT: movl %eax, %r12d +; SSE2-NEXT: .LBB16_20: +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %r13b +; SSE2-NEXT: movl %r13d, %eax +; SSE2-NEXT: subb %cl, %al +; SSE2-NEXT: setns %al +; SSE2-NEXT: subb %cl, %r13b +; SSE2-NEXT: movb %bl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSE2-NEXT: movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSE2-NEXT: jno .LBB16_22 +; SSE2-NEXT: # %bb.21: +; SSE2-NEXT: addb $127, %al +; SSE2-NEXT: movl %eax, %r13d +; SSE2-NEXT: .LBB16_22: +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %dil +; SSE2-NEXT: movl %edi, %eax +; SSE2-NEXT: subb %cl, %al +; SSE2-NEXT: setns %al +; SSE2-NEXT: subb %cl, %dil +; SSE2-NEXT: jno .LBB16_24 +; SSE2-NEXT: # %bb.23: +; SSE2-NEXT: addb $127, %al +; SSE2-NEXT: movl %eax, %edi +; SSE2-NEXT: .LBB16_24: +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %r8b +; SSE2-NEXT: movl %r8d, %eax +; SSE2-NEXT: subb %cl, %al +; SSE2-NEXT: setns %al +; SSE2-NEXT: subb %cl, %r8b +; SSE2-NEXT: jno .LBB16_26 +; SSE2-NEXT: # %bb.25: +; SSE2-NEXT: addb $127, %al +; SSE2-NEXT: movl %eax, %r8d +; SSE2-NEXT: .LBB16_26: +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %bl +; SSE2-NEXT: movl %ebx, %eax +; SSE2-NEXT: subb %cl, %al +; SSE2-NEXT: setns %al +; SSE2-NEXT: subb %cl, %bl +; SSE2-NEXT: jno .LBB16_28 +; SSE2-NEXT: # %bb.27: +; SSE2-NEXT: addb $127, %al +; SSE2-NEXT: movl %eax, %ebx +; SSE2-NEXT: .LBB16_28: +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %dl +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSE2-NEXT: movl %eax, %ecx +; SSE2-NEXT: subb %dl, %cl +; SSE2-NEXT: setns %cl +; SSE2-NEXT: subb %dl, %al +; SSE2-NEXT: movb %sil, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSE2-NEXT: jno .LBB16_30 +; SSE2-NEXT: # %bb.29: +; SSE2-NEXT: addb $127, %cl +; SSE2-NEXT: movl %ecx, %eax +; SSE2-NEXT: .LBB16_30: +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %dl +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %sil +; SSE2-NEXT: movl %esi, %ecx +; SSE2-NEXT: subb %dl, %cl +; SSE2-NEXT: setns %cl +; SSE2-NEXT: subb %dl, %sil +; SSE2-NEXT: jno .LBB16_32 +; SSE2-NEXT: # %bb.31: +; SSE2-NEXT: addb $127, %cl +; SSE2-NEXT: movl %ecx, %esi +; SSE2-NEXT: .LBB16_32: +; SSE2-NEXT: movzbl %sil, %ecx +; SSE2-NEXT: movd %ecx, %xmm0 +; SSE2-NEXT: movzbl %al, %eax +; SSE2-NEXT: movd %eax, %xmm1 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; SSE2-NEXT: movzbl %bl, %eax +; SSE2-NEXT: movd %eax, %xmm0 +; SSE2-NEXT: movzbl %r8b, %eax +; SSE2-NEXT: movd %eax, %xmm2 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; SSE2-NEXT: movzbl %dil, %eax +; SSE2-NEXT: movd %eax, %xmm0 +; SSE2-NEXT: movzbl %r13b, %eax +; SSE2-NEXT: movd %eax, %xmm1 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; SSE2-NEXT: movzbl %r12b, %eax +; SSE2-NEXT: movd %eax, %xmm3 +; SSE2-NEXT: movzbl %r15b, %eax +; SSE2-NEXT: movd %eax, %xmm0 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3],xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7] +; SSE2-NEXT: punpcklwd {{.*#+}} 
xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; SSE2-NEXT: movzbl %r14b, %eax +; SSE2-NEXT: movd %eax, %xmm1 +; SSE2-NEXT: movzbl %bpl, %eax +; SSE2-NEXT: movd %eax, %xmm2 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] +; SSE2-NEXT: movzbl %r11b, %eax +; SSE2-NEXT: movd %eax, %xmm1 +; SSE2-NEXT: movzbl %r10b, %eax +; SSE2-NEXT: movd %eax, %xmm3 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; SSE2-NEXT: movd %eax, %xmm1 +; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; SSE2-NEXT: movd %eax, %xmm2 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] +; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; SSE2-NEXT: movd %eax, %xmm1 +; SSE2-NEXT: movzbl %r9b, %eax +; SSE2-NEXT: movd %eax, %xmm4 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3],xmm4[4],xmm1[4],xmm4[5],xmm1[5],xmm4[6],xmm1[6],xmm4[7],xmm1[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm0[0] +; SSE2-NEXT: pxor %xmm0, %xmm0 +; SSE2-NEXT: pcmpgtb %xmm4, %xmm0 +; SSE2-NEXT: popq %rbx +; SSE2-NEXT: popq %r12 +; SSE2-NEXT: popq %r13 +; SSE2-NEXT: popq %r14 +; SSE2-NEXT: popq %r15 +; SSE2-NEXT: popq %rbp +; SSE2-NEXT: retq +; +; SSSE3-LABEL: v16i1: +; SSSE3: # %bb.0: +; SSSE3-NEXT: pushq %rbp +; SSSE3-NEXT: pushq %r15 +; SSSE3-NEXT: pushq %r14 +; SSSE3-NEXT: pushq %r13 +; SSSE3-NEXT: pushq %r12 +; SSSE3-NEXT: pushq %rbx +; SSSE3-NEXT: psllw $7, %xmm1 +; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128] +; SSSE3-NEXT: pand %xmm2, %xmm1 +; SSSE3-NEXT: movdqa %xmm1, -{{[0-9]+}}(%rsp) +; SSSE3-NEXT: psllw $7, %xmm0 +; SSSE3-NEXT: pand %xmm2, %xmm0 +; SSSE3-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp) +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %r9b +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %sil +; SSSE3-NEXT: movl %r9d, %eax +; SSSE3-NEXT: subb %cl, %al +; SSSE3-NEXT: setns %al +; SSSE3-NEXT: subb %cl, %r9b +; SSSE3-NEXT: jno .LBB16_2 +; SSSE3-NEXT: # %bb.1: +; SSSE3-NEXT: addb $127, %al +; SSSE3-NEXT: movl %eax, %r9d +; SSSE3-NEXT: .LBB16_2: +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSSE3-NEXT: movl %esi, %eax +; SSSE3-NEXT: subb %cl, %al +; SSSE3-NEXT: setns %al +; SSSE3-NEXT: subb %cl, %sil +; SSSE3-NEXT: jno .LBB16_4 +; SSSE3-NEXT: # %bb.3: +; SSSE3-NEXT: addb $127, %al +; SSSE3-NEXT: movl %eax, %esi +; SSSE3-NEXT: .LBB16_4: +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %bl +; SSSE3-NEXT: movl %ebx, %eax +; SSSE3-NEXT: subb %cl, %al +; SSSE3-NEXT: setns %al +; SSSE3-NEXT: subb %cl, %bl +; SSSE3-NEXT: jno .LBB16_6 +; SSSE3-NEXT: # %bb.5: +; SSSE3-NEXT: addb $127, %al +; SSSE3-NEXT: movl %eax, %ebx +; SSSE3-NEXT: 
.LBB16_6: +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %dl +; SSSE3-NEXT: movl %edx, %eax +; SSSE3-NEXT: subb %cl, %al +; SSSE3-NEXT: setns %al +; SSSE3-NEXT: subb %cl, %dl +; SSSE3-NEXT: jno .LBB16_8 +; SSSE3-NEXT: # %bb.7: +; SSSE3-NEXT: addb $127, %al +; SSSE3-NEXT: movl %eax, %edx +; SSSE3-NEXT: .LBB16_8: +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %r10b +; SSSE3-NEXT: movl %r10d, %eax +; SSSE3-NEXT: subb %cl, %al +; SSSE3-NEXT: setns %al +; SSSE3-NEXT: subb %cl, %r10b +; SSSE3-NEXT: jno .LBB16_10 +; SSSE3-NEXT: # %bb.9: +; SSSE3-NEXT: addb $127, %al +; SSSE3-NEXT: movl %eax, %r10d +; SSSE3-NEXT: .LBB16_10: +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %r11b +; SSSE3-NEXT: movl %r11d, %eax +; SSSE3-NEXT: subb %cl, %al +; SSSE3-NEXT: setns %al +; SSSE3-NEXT: subb %cl, %r11b +; SSSE3-NEXT: jno .LBB16_12 +; SSSE3-NEXT: # %bb.11: +; SSSE3-NEXT: addb $127, %al +; SSSE3-NEXT: movl %eax, %r11d +; SSSE3-NEXT: .LBB16_12: +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %bpl +; SSSE3-NEXT: movl %ebp, %eax +; SSSE3-NEXT: subb %cl, %al +; SSSE3-NEXT: setns %al +; SSSE3-NEXT: subb %cl, %bpl +; SSSE3-NEXT: jno .LBB16_14 +; SSSE3-NEXT: # %bb.13: +; SSSE3-NEXT: addb $127, %al +; SSSE3-NEXT: movl %eax, %ebp +; SSSE3-NEXT: .LBB16_14: +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %r14b +; SSSE3-NEXT: movl %r14d, %eax +; SSSE3-NEXT: subb %cl, %al +; SSSE3-NEXT: setns %al +; SSSE3-NEXT: subb %cl, %r14b +; SSSE3-NEXT: jno .LBB16_16 +; SSSE3-NEXT: # %bb.15: +; SSSE3-NEXT: addb $127, %al +; SSSE3-NEXT: movl %eax, %r14d +; SSSE3-NEXT: .LBB16_16: +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %r15b +; SSSE3-NEXT: movl %r15d, %eax +; SSSE3-NEXT: subb %cl, %al +; SSSE3-NEXT: setns %al +; SSSE3-NEXT: subb %cl, %r15b +; SSSE3-NEXT: jno .LBB16_18 +; SSSE3-NEXT: # %bb.17: +; SSSE3-NEXT: addb $127, %al +; SSSE3-NEXT: movl %eax, %r15d +; SSSE3-NEXT: .LBB16_18: +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %r12b +; SSSE3-NEXT: movl %r12d, %eax +; SSSE3-NEXT: subb %cl, %al +; SSSE3-NEXT: setns %al +; SSSE3-NEXT: subb %cl, %r12b +; SSSE3-NEXT: jno .LBB16_20 +; SSSE3-NEXT: # %bb.19: +; SSSE3-NEXT: addb $127, %al +; SSSE3-NEXT: movl %eax, %r12d +; SSSE3-NEXT: .LBB16_20: +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %r13b +; SSSE3-NEXT: movl %r13d, %eax +; SSSE3-NEXT: subb %cl, %al +; SSSE3-NEXT: setns %al +; SSSE3-NEXT: subb %cl, %r13b +; SSSE3-NEXT: movb %bl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSSE3-NEXT: movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSSE3-NEXT: jno .LBB16_22 +; SSSE3-NEXT: # %bb.21: +; SSSE3-NEXT: addb $127, %al +; SSSE3-NEXT: movl %eax, %r13d +; SSSE3-NEXT: .LBB16_22: +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %dil +; SSSE3-NEXT: movl %edi, %eax +; SSSE3-NEXT: subb %cl, %al +; SSSE3-NEXT: setns %al +; SSSE3-NEXT: subb %cl, %dil +; SSSE3-NEXT: jno .LBB16_24 +; SSSE3-NEXT: # %bb.23: +; SSSE3-NEXT: addb $127, %al +; SSSE3-NEXT: movl %eax, %edi +; SSSE3-NEXT: .LBB16_24: +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %r8b +; SSSE3-NEXT: movl %r8d, %eax +; SSSE3-NEXT: subb %cl, %al +; SSSE3-NEXT: setns %al +; SSSE3-NEXT: subb %cl, %r8b +; SSSE3-NEXT: jno .LBB16_26 +; SSSE3-NEXT: # %bb.25: +; SSSE3-NEXT: addb $127, %al +; SSSE3-NEXT: movl 
%eax, %r8d +; SSSE3-NEXT: .LBB16_26: +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %bl +; SSSE3-NEXT: movl %ebx, %eax +; SSSE3-NEXT: subb %cl, %al +; SSSE3-NEXT: setns %al +; SSSE3-NEXT: subb %cl, %bl +; SSSE3-NEXT: jno .LBB16_28 +; SSSE3-NEXT: # %bb.27: +; SSSE3-NEXT: addb $127, %al +; SSSE3-NEXT: movl %eax, %ebx +; SSSE3-NEXT: .LBB16_28: +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %dl +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSSE3-NEXT: movl %eax, %ecx +; SSSE3-NEXT: subb %dl, %cl +; SSSE3-NEXT: setns %cl +; SSSE3-NEXT: subb %dl, %al +; SSSE3-NEXT: movb %sil, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSSE3-NEXT: jno .LBB16_30 +; SSSE3-NEXT: # %bb.29: +; SSSE3-NEXT: addb $127, %cl +; SSSE3-NEXT: movl %ecx, %eax +; SSSE3-NEXT: .LBB16_30: +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %dl +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %sil +; SSSE3-NEXT: movl %esi, %ecx +; SSSE3-NEXT: subb %dl, %cl +; SSSE3-NEXT: setns %cl +; SSSE3-NEXT: subb %dl, %sil +; SSSE3-NEXT: jno .LBB16_32 +; SSSE3-NEXT: # %bb.31: +; SSSE3-NEXT: addb $127, %cl +; SSSE3-NEXT: movl %ecx, %esi +; SSSE3-NEXT: .LBB16_32: +; SSSE3-NEXT: movzbl %sil, %ecx +; SSSE3-NEXT: movd %ecx, %xmm0 +; SSSE3-NEXT: movzbl %al, %eax +; SSSE3-NEXT: movd %eax, %xmm1 +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; SSSE3-NEXT: movzbl %bl, %eax +; SSSE3-NEXT: movd %eax, %xmm0 +; SSSE3-NEXT: movzbl %r8b, %eax +; SSSE3-NEXT: movd %eax, %xmm2 +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; SSSE3-NEXT: movzbl %dil, %eax +; SSSE3-NEXT: movd %eax, %xmm0 +; SSSE3-NEXT: movzbl %r13b, %eax +; SSSE3-NEXT: movd %eax, %xmm1 +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; SSSE3-NEXT: movzbl %r12b, %eax +; SSSE3-NEXT: movd %eax, %xmm3 +; SSSE3-NEXT: movzbl %r15b, %eax +; SSSE3-NEXT: movd %eax, %xmm0 +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3],xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7] +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; SSSE3-NEXT: movzbl %r14b, %eax +; SSSE3-NEXT: movd %eax, %xmm1 +; SSSE3-NEXT: movzbl %bpl, %eax +; SSSE3-NEXT: movd %eax, %xmm2 +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] +; SSSE3-NEXT: movzbl %r11b, %eax +; SSSE3-NEXT: movd %eax, %xmm1 +; SSSE3-NEXT: movzbl %r10b, %eax +; SSSE3-NEXT: movd %eax, %xmm3 +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7] +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; SSSE3-NEXT: movd %eax, %xmm1 +; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; SSSE3-NEXT: movd %eax, %xmm2 +; SSSE3-NEXT: punpcklbw 
{{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] +; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; SSSE3-NEXT: movd %eax, %xmm1 +; SSSE3-NEXT: movzbl %r9b, %eax +; SSSE3-NEXT: movd %eax, %xmm4 +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3],xmm4[4],xmm1[4],xmm4[5],xmm1[5],xmm4[6],xmm1[6],xmm4[7],xmm1[7] +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3] +; SSSE3-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] +; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm0[0] +; SSSE3-NEXT: pxor %xmm0, %xmm0 +; SSSE3-NEXT: pcmpgtb %xmm4, %xmm0 +; SSSE3-NEXT: popq %rbx +; SSSE3-NEXT: popq %r12 +; SSSE3-NEXT: popq %r13 +; SSSE3-NEXT: popq %r14 +; SSSE3-NEXT: popq %r15 +; SSSE3-NEXT: popq %rbp +; SSSE3-NEXT: retq +; +; SSE41-LABEL: v16i1: +; SSE41: # %bb.0: +; SSE41-NEXT: psllw $7, %xmm1 +; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128] +; SSE41-NEXT: pand %xmm2, %xmm1 +; SSE41-NEXT: pextrb $15, %xmm1, %ecx +; SSE41-NEXT: psllw $7, %xmm0 +; SSE41-NEXT: pand %xmm2, %xmm0 +; SSE41-NEXT: pextrb $15, %xmm0, %edx +; SSE41-NEXT: movl %edx, %eax +; SSE41-NEXT: subb %cl, %al +; SSE41-NEXT: setns %al +; SSE41-NEXT: subb %cl, %dl +; SSE41-NEXT: jno .LBB16_2 +; SSE41-NEXT: # %bb.1: +; SSE41-NEXT: addb $127, %al +; SSE41-NEXT: movl %eax, %edx +; SSE41-NEXT: .LBB16_2: +; SSE41-NEXT: pextrb $14, %xmm1, %ecx +; SSE41-NEXT: pextrb $14, %xmm0, %r11d +; SSE41-NEXT: movl %r11d, %eax +; SSE41-NEXT: subb %cl, %al +; SSE41-NEXT: setns %al +; SSE41-NEXT: subb %cl, %r11b +; SSE41-NEXT: jno .LBB16_4 +; SSE41-NEXT: # %bb.3: +; SSE41-NEXT: addb $127, %al +; SSE41-NEXT: movl %eax, %r11d +; SSE41-NEXT: .LBB16_4: +; SSE41-NEXT: pextrb $13, %xmm1, %ecx +; SSE41-NEXT: pextrb $13, %xmm0, %edi +; SSE41-NEXT: movl %edi, %eax +; SSE41-NEXT: subb %cl, %al +; SSE41-NEXT: setns %al +; SSE41-NEXT: subb %cl, %dil +; SSE41-NEXT: jno .LBB16_6 +; SSE41-NEXT: # %bb.5: +; SSE41-NEXT: addb $127, %al +; SSE41-NEXT: movl %eax, %edi +; SSE41-NEXT: .LBB16_6: +; SSE41-NEXT: pushq %rbp +; SSE41-NEXT: pushq %r15 +; SSE41-NEXT: pushq %r14 +; SSE41-NEXT: pushq %r13 +; SSE41-NEXT: pushq %r12 +; SSE41-NEXT: pushq %rbx +; SSE41-NEXT: pextrb $12, %xmm1, %ecx +; SSE41-NEXT: pextrb $12, %xmm0, %r14d +; SSE41-NEXT: movl %r14d, %eax +; SSE41-NEXT: subb %cl, %al +; SSE41-NEXT: setns %al +; SSE41-NEXT: subb %cl, %r14b +; SSE41-NEXT: jno .LBB16_8 +; SSE41-NEXT: # %bb.7: +; SSE41-NEXT: addb $127, %al +; SSE41-NEXT: movl %eax, %r14d +; SSE41-NEXT: .LBB16_8: +; SSE41-NEXT: pextrb $11, %xmm1, %ecx +; SSE41-NEXT: pextrb $11, %xmm0, %ebp +; SSE41-NEXT: movl %ebp, %eax +; SSE41-NEXT: subb %cl, %al +; SSE41-NEXT: setns %al +; SSE41-NEXT: subb %cl, %bpl +; SSE41-NEXT: jno .LBB16_10 +; SSE41-NEXT: # %bb.9: +; SSE41-NEXT: addb $127, %al +; SSE41-NEXT: movl %eax, %ebp +; SSE41-NEXT: .LBB16_10: +; SSE41-NEXT: pextrb $10, %xmm1, %ecx +; SSE41-NEXT: pextrb $10, %xmm0, %r15d +; SSE41-NEXT: movl %r15d, %eax +; SSE41-NEXT: subb %cl, %al +; SSE41-NEXT: setns %al +; SSE41-NEXT: subb %cl, %r15b +; SSE41-NEXT: jno .LBB16_12 +; SSE41-NEXT: # %bb.11: +; SSE41-NEXT: addb $127, %al +; SSE41-NEXT: movl %eax, %r15d +; SSE41-NEXT: .LBB16_12: +; SSE41-NEXT: pextrb $9, %xmm1, %ecx +; SSE41-NEXT: pextrb $9, %xmm0, %r12d +; SSE41-NEXT: movl %r12d, %eax +; SSE41-NEXT: subb %cl, %al +; SSE41-NEXT: setns 
%al +; SSE41-NEXT: subb %cl, %r12b +; SSE41-NEXT: jno .LBB16_14 +; SSE41-NEXT: # %bb.13: +; SSE41-NEXT: addb $127, %al +; SSE41-NEXT: movl %eax, %r12d +; SSE41-NEXT: .LBB16_14: +; SSE41-NEXT: pextrb $8, %xmm1, %ecx +; SSE41-NEXT: pextrb $8, %xmm0, %r13d +; SSE41-NEXT: movl %r13d, %eax +; SSE41-NEXT: subb %cl, %al +; SSE41-NEXT: setns %al +; SSE41-NEXT: subb %cl, %r13b +; SSE41-NEXT: jno .LBB16_16 +; SSE41-NEXT: # %bb.15: +; SSE41-NEXT: addb $127, %al +; SSE41-NEXT: movl %eax, %r13d +; SSE41-NEXT: .LBB16_16: +; SSE41-NEXT: pextrb $7, %xmm1, %ecx +; SSE41-NEXT: pextrb $7, %xmm0, %r10d +; SSE41-NEXT: movl %r10d, %eax +; SSE41-NEXT: subb %cl, %al +; SSE41-NEXT: setns %al +; SSE41-NEXT: subb %cl, %r10b +; SSE41-NEXT: movl %ebp, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE41-NEXT: jno .LBB16_18 +; SSE41-NEXT: # %bb.17: +; SSE41-NEXT: addb $127, %al +; SSE41-NEXT: movl %eax, %r10d +; SSE41-NEXT: .LBB16_18: +; SSE41-NEXT: pextrb $6, %xmm1, %ecx +; SSE41-NEXT: pextrb $6, %xmm0, %r9d +; SSE41-NEXT: movl %r9d, %eax +; SSE41-NEXT: subb %cl, %al +; SSE41-NEXT: setns %al +; SSE41-NEXT: subb %cl, %r9b +; SSE41-NEXT: jno .LBB16_20 +; SSE41-NEXT: # %bb.19: +; SSE41-NEXT: addb $127, %al +; SSE41-NEXT: movl %eax, %r9d +; SSE41-NEXT: .LBB16_20: +; SSE41-NEXT: pextrb $5, %xmm1, %ecx +; SSE41-NEXT: pextrb $5, %xmm0, %ebp +; SSE41-NEXT: movl %ebp, %eax +; SSE41-NEXT: subb %cl, %al +; SSE41-NEXT: setns %al +; SSE41-NEXT: subb %cl, %bpl +; SSE41-NEXT: movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE41-NEXT: jno .LBB16_22 +; SSE41-NEXT: # %bb.21: +; SSE41-NEXT: addb $127, %al +; SSE41-NEXT: movl %eax, %ebp +; SSE41-NEXT: .LBB16_22: +; SSE41-NEXT: pextrb $4, %xmm1, %ecx +; SSE41-NEXT: pextrb $4, %xmm0, %edi +; SSE41-NEXT: movl %edi, %eax +; SSE41-NEXT: subb %cl, %al +; SSE41-NEXT: setns %al +; SSE41-NEXT: subb %cl, %dil +; SSE41-NEXT: movl %edx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE41-NEXT: jno .LBB16_24 +; SSE41-NEXT: # %bb.23: +; SSE41-NEXT: addb $127, %al +; SSE41-NEXT: movl %eax, %edi +; SSE41-NEXT: .LBB16_24: +; SSE41-NEXT: pextrb $3, %xmm1, %edx +; SSE41-NEXT: pextrb $3, %xmm0, %eax +; SSE41-NEXT: movl %eax, %ecx +; SSE41-NEXT: subb %dl, %cl +; SSE41-NEXT: setns %cl +; SSE41-NEXT: subb %dl, %al +; SSE41-NEXT: jno .LBB16_26 +; SSE41-NEXT: # %bb.25: +; SSE41-NEXT: addb $127, %cl +; SSE41-NEXT: movl %ecx, %eax +; SSE41-NEXT: .LBB16_26: +; SSE41-NEXT: pextrb $2, %xmm1, %ebx +; SSE41-NEXT: pextrb $2, %xmm0, %ecx +; SSE41-NEXT: movl %ecx, %edx +; SSE41-NEXT: subb %bl, %dl +; SSE41-NEXT: setns %dl +; SSE41-NEXT: subb %bl, %cl +; SSE41-NEXT: jno .LBB16_28 +; SSE41-NEXT: # %bb.27: +; SSE41-NEXT: addb $127, %dl +; SSE41-NEXT: movl %edx, %ecx +; SSE41-NEXT: .LBB16_28: +; SSE41-NEXT: pextrb $0, %xmm1, %esi +; SSE41-NEXT: pextrb $0, %xmm0, %edx +; SSE41-NEXT: movl %edx, %ebx +; SSE41-NEXT: subb %sil, %bl +; SSE41-NEXT: setns %bl +; SSE41-NEXT: subb %sil, %dl +; SSE41-NEXT: jno .LBB16_30 +; SSE41-NEXT: # %bb.29: +; SSE41-NEXT: addb $127, %bl +; SSE41-NEXT: movl %ebx, %edx +; SSE41-NEXT: .LBB16_30: +; SSE41-NEXT: pextrb $1, %xmm1, %esi +; SSE41-NEXT: pextrb $1, %xmm0, %r8d +; SSE41-NEXT: movl %r8d, %ebx +; SSE41-NEXT: subb %sil, %bl +; SSE41-NEXT: setns %bl +; SSE41-NEXT: subb %sil, %r8b +; SSE41-NEXT: jno .LBB16_32 +; SSE41-NEXT: # %bb.31: +; SSE41-NEXT: addb $127, %bl +; SSE41-NEXT: movl %ebx, %r8d +; SSE41-NEXT: .LBB16_32: +; SSE41-NEXT: movzbl %dl, %edx +; SSE41-NEXT: movd %edx, %xmm1 +; SSE41-NEXT: movzbl %r8b, %edx +; SSE41-NEXT: pinsrb $1, %edx, %xmm1 +; SSE41-NEXT: movzbl %cl, %ecx +; SSE41-NEXT: 
pinsrb $2, %ecx, %xmm1 +; SSE41-NEXT: movzbl %al, %eax +; SSE41-NEXT: pinsrb $3, %eax, %xmm1 +; SSE41-NEXT: movzbl %dil, %eax +; SSE41-NEXT: pinsrb $4, %eax, %xmm1 +; SSE41-NEXT: movzbl %bpl, %eax +; SSE41-NEXT: pinsrb $5, %eax, %xmm1 +; SSE41-NEXT: movzbl %r9b, %eax +; SSE41-NEXT: pinsrb $6, %eax, %xmm1 +; SSE41-NEXT: movzbl %r10b, %eax +; SSE41-NEXT: pinsrb $7, %eax, %xmm1 +; SSE41-NEXT: movzbl %r13b, %eax +; SSE41-NEXT: pinsrb $8, %eax, %xmm1 +; SSE41-NEXT: movzbl %r12b, %eax +; SSE41-NEXT: pinsrb $9, %eax, %xmm1 +; SSE41-NEXT: movzbl %r15b, %eax +; SSE41-NEXT: pinsrb $10, %eax, %xmm1 +; SSE41-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; SSE41-NEXT: pinsrb $11, %eax, %xmm1 +; SSE41-NEXT: movzbl %r14b, %eax +; SSE41-NEXT: pinsrb $12, %eax, %xmm1 +; SSE41-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; SSE41-NEXT: pinsrb $13, %eax, %xmm1 +; SSE41-NEXT: movzbl %r11b, %eax +; SSE41-NEXT: pinsrb $14, %eax, %xmm1 +; SSE41-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; SSE41-NEXT: pinsrb $15, %eax, %xmm1 +; SSE41-NEXT: pxor %xmm0, %xmm0 +; SSE41-NEXT: pcmpgtb %xmm1, %xmm0 +; SSE41-NEXT: popq %rbx +; SSE41-NEXT: popq %r12 +; SSE41-NEXT: popq %r13 +; SSE41-NEXT: popq %r14 +; SSE41-NEXT: popq %r15 +; SSE41-NEXT: popq %rbp +; SSE41-NEXT: retq +; +; AVX1-LABEL: v16i1: +; AVX1: # %bb.0: +; AVX1-NEXT: vpsllw $7, %xmm1, %xmm1 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128] +; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpextrb $15, %xmm1, %ecx +; AVX1-NEXT: vpsllw $7, %xmm0, %xmm0 +; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpextrb $15, %xmm0, %edx +; AVX1-NEXT: movl %edx, %eax +; AVX1-NEXT: subb %cl, %al +; AVX1-NEXT: setns %al +; AVX1-NEXT: subb %cl, %dl +; AVX1-NEXT: jno .LBB16_2 +; AVX1-NEXT: # %bb.1: +; AVX1-NEXT: addb $127, %al +; AVX1-NEXT: movl %eax, %edx +; AVX1-NEXT: .LBB16_2: +; AVX1-NEXT: vpextrb $14, %xmm1, %ecx +; AVX1-NEXT: vpextrb $14, %xmm0, %r11d +; AVX1-NEXT: movl %r11d, %eax +; AVX1-NEXT: subb %cl, %al +; AVX1-NEXT: setns %al +; AVX1-NEXT: subb %cl, %r11b +; AVX1-NEXT: jno .LBB16_4 +; AVX1-NEXT: # %bb.3: +; AVX1-NEXT: addb $127, %al +; AVX1-NEXT: movl %eax, %r11d +; AVX1-NEXT: .LBB16_4: +; AVX1-NEXT: vpextrb $13, %xmm1, %ecx +; AVX1-NEXT: vpextrb $13, %xmm0, %edi +; AVX1-NEXT: movl %edi, %eax +; AVX1-NEXT: subb %cl, %al +; AVX1-NEXT: setns %al +; AVX1-NEXT: subb %cl, %dil +; AVX1-NEXT: jno .LBB16_6 +; AVX1-NEXT: # %bb.5: +; AVX1-NEXT: addb $127, %al +; AVX1-NEXT: movl %eax, %edi +; AVX1-NEXT: .LBB16_6: +; AVX1-NEXT: pushq %rbp +; AVX1-NEXT: pushq %r15 +; AVX1-NEXT: pushq %r14 +; AVX1-NEXT: pushq %r13 +; AVX1-NEXT: pushq %r12 +; AVX1-NEXT: pushq %rbx +; AVX1-NEXT: vpextrb $12, %xmm1, %ecx +; AVX1-NEXT: vpextrb $12, %xmm0, %r14d +; AVX1-NEXT: movl %r14d, %eax +; AVX1-NEXT: subb %cl, %al +; AVX1-NEXT: setns %al +; AVX1-NEXT: subb %cl, %r14b +; AVX1-NEXT: jno .LBB16_8 +; AVX1-NEXT: # %bb.7: +; AVX1-NEXT: addb $127, %al +; AVX1-NEXT: movl %eax, %r14d +; AVX1-NEXT: .LBB16_8: +; AVX1-NEXT: vpextrb $11, %xmm1, %ecx +; AVX1-NEXT: vpextrb $11, %xmm0, %ebp +; AVX1-NEXT: movl %ebp, %eax +; AVX1-NEXT: subb %cl, %al +; AVX1-NEXT: setns %al +; AVX1-NEXT: subb %cl, %bpl +; AVX1-NEXT: jno .LBB16_10 +; AVX1-NEXT: # %bb.9: +; AVX1-NEXT: addb $127, %al +; AVX1-NEXT: movl %eax, %ebp +; AVX1-NEXT: .LBB16_10: +; AVX1-NEXT: vpextrb $10, %xmm1, %ecx +; AVX1-NEXT: vpextrb $10, %xmm0, %r15d +; AVX1-NEXT: movl %r15d, %eax +; AVX1-NEXT: subb %cl, %al +; AVX1-NEXT: setns 
%al +; AVX1-NEXT: subb %cl, %r15b +; AVX1-NEXT: jno .LBB16_12 +; AVX1-NEXT: # %bb.11: +; AVX1-NEXT: addb $127, %al +; AVX1-NEXT: movl %eax, %r15d +; AVX1-NEXT: .LBB16_12: +; AVX1-NEXT: vpextrb $9, %xmm1, %ecx +; AVX1-NEXT: vpextrb $9, %xmm0, %r12d +; AVX1-NEXT: movl %r12d, %eax +; AVX1-NEXT: subb %cl, %al +; AVX1-NEXT: setns %al +; AVX1-NEXT: subb %cl, %r12b +; AVX1-NEXT: jno .LBB16_14 +; AVX1-NEXT: # %bb.13: +; AVX1-NEXT: addb $127, %al +; AVX1-NEXT: movl %eax, %r12d +; AVX1-NEXT: .LBB16_14: +; AVX1-NEXT: vpextrb $8, %xmm1, %ecx +; AVX1-NEXT: vpextrb $8, %xmm0, %r13d +; AVX1-NEXT: movl %r13d, %eax +; AVX1-NEXT: subb %cl, %al +; AVX1-NEXT: setns %al +; AVX1-NEXT: subb %cl, %r13b +; AVX1-NEXT: jno .LBB16_16 +; AVX1-NEXT: # %bb.15: +; AVX1-NEXT: addb $127, %al +; AVX1-NEXT: movl %eax, %r13d +; AVX1-NEXT: .LBB16_16: +; AVX1-NEXT: vpextrb $7, %xmm1, %ecx +; AVX1-NEXT: vpextrb $7, %xmm0, %r10d +; AVX1-NEXT: movl %r10d, %eax +; AVX1-NEXT: subb %cl, %al +; AVX1-NEXT: setns %al +; AVX1-NEXT: subb %cl, %r10b +; AVX1-NEXT: movl %ebp, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX1-NEXT: jno .LBB16_18 +; AVX1-NEXT: # %bb.17: +; AVX1-NEXT: addb $127, %al +; AVX1-NEXT: movl %eax, %r10d +; AVX1-NEXT: .LBB16_18: +; AVX1-NEXT: vpextrb $6, %xmm1, %ecx +; AVX1-NEXT: vpextrb $6, %xmm0, %r9d +; AVX1-NEXT: movl %r9d, %eax +; AVX1-NEXT: subb %cl, %al +; AVX1-NEXT: setns %al +; AVX1-NEXT: subb %cl, %r9b +; AVX1-NEXT: jno .LBB16_20 +; AVX1-NEXT: # %bb.19: +; AVX1-NEXT: addb $127, %al +; AVX1-NEXT: movl %eax, %r9d +; AVX1-NEXT: .LBB16_20: +; AVX1-NEXT: vpextrb $5, %xmm1, %ecx +; AVX1-NEXT: vpextrb $5, %xmm0, %ebp +; AVX1-NEXT: movl %ebp, %eax +; AVX1-NEXT: subb %cl, %al +; AVX1-NEXT: setns %al +; AVX1-NEXT: subb %cl, %bpl +; AVX1-NEXT: movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX1-NEXT: jno .LBB16_22 +; AVX1-NEXT: # %bb.21: +; AVX1-NEXT: addb $127, %al +; AVX1-NEXT: movl %eax, %ebp +; AVX1-NEXT: .LBB16_22: +; AVX1-NEXT: vpextrb $4, %xmm1, %ecx +; AVX1-NEXT: vpextrb $4, %xmm0, %edi +; AVX1-NEXT: movl %edi, %eax +; AVX1-NEXT: subb %cl, %al +; AVX1-NEXT: setns %al +; AVX1-NEXT: subb %cl, %dil +; AVX1-NEXT: movl %edx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX1-NEXT: jno .LBB16_24 +; AVX1-NEXT: # %bb.23: +; AVX1-NEXT: addb $127, %al +; AVX1-NEXT: movl %eax, %edi +; AVX1-NEXT: .LBB16_24: +; AVX1-NEXT: vpextrb $3, %xmm1, %edx +; AVX1-NEXT: vpextrb $3, %xmm0, %eax +; AVX1-NEXT: movl %eax, %ecx +; AVX1-NEXT: subb %dl, %cl +; AVX1-NEXT: setns %cl +; AVX1-NEXT: subb %dl, %al +; AVX1-NEXT: jno .LBB16_26 +; AVX1-NEXT: # %bb.25: +; AVX1-NEXT: addb $127, %cl +; AVX1-NEXT: movl %ecx, %eax +; AVX1-NEXT: .LBB16_26: +; AVX1-NEXT: vpextrb $2, %xmm1, %ebx +; AVX1-NEXT: vpextrb $2, %xmm0, %ecx +; AVX1-NEXT: movl %ecx, %edx +; AVX1-NEXT: subb %bl, %dl +; AVX1-NEXT: setns %dl +; AVX1-NEXT: subb %bl, %cl +; AVX1-NEXT: jno .LBB16_28 +; AVX1-NEXT: # %bb.27: +; AVX1-NEXT: addb $127, %dl +; AVX1-NEXT: movl %edx, %ecx +; AVX1-NEXT: .LBB16_28: +; AVX1-NEXT: vpextrb $0, %xmm1, %esi +; AVX1-NEXT: vpextrb $0, %xmm0, %edx +; AVX1-NEXT: movl %edx, %ebx +; AVX1-NEXT: subb %sil, %bl +; AVX1-NEXT: setns %bl +; AVX1-NEXT: subb %sil, %dl +; AVX1-NEXT: jno .LBB16_30 +; AVX1-NEXT: # %bb.29: +; AVX1-NEXT: addb $127, %bl +; AVX1-NEXT: movl %ebx, %edx +; AVX1-NEXT: .LBB16_30: +; AVX1-NEXT: vpextrb $1, %xmm1, %esi +; AVX1-NEXT: vpextrb $1, %xmm0, %r8d +; AVX1-NEXT: movl %r8d, %ebx +; AVX1-NEXT: subb %sil, %bl +; AVX1-NEXT: setns %bl +; AVX1-NEXT: subb %sil, %r8b +; AVX1-NEXT: jno .LBB16_32 +; AVX1-NEXT: # %bb.31: +; AVX1-NEXT: addb $127, %bl 
+; AVX1-NEXT: movl %ebx, %r8d +; AVX1-NEXT: .LBB16_32: +; AVX1-NEXT: movzbl %dl, %edx +; AVX1-NEXT: vmovd %edx, %xmm0 +; AVX1-NEXT: movzbl %r8b, %edx +; AVX1-NEXT: vpinsrb $1, %edx, %xmm0, %xmm0 +; AVX1-NEXT: movzbl %cl, %ecx +; AVX1-NEXT: vpinsrb $2, %ecx, %xmm0, %xmm0 +; AVX1-NEXT: movzbl %al, %eax +; AVX1-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0 +; AVX1-NEXT: movzbl %dil, %eax +; AVX1-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 +; AVX1-NEXT: movzbl %bpl, %eax +; AVX1-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; AVX1-NEXT: movzbl %r9b, %eax +; AVX1-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 +; AVX1-NEXT: movzbl %r10b, %eax +; AVX1-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; AVX1-NEXT: movzbl %r13b, %eax +; AVX1-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0 +; AVX1-NEXT: movzbl %r12b, %eax +; AVX1-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0 +; AVX1-NEXT: movzbl %r15b, %eax +; AVX1-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0 +; AVX1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX1-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; AVX1-NEXT: movzbl %r14b, %eax +; AVX1-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0 +; AVX1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX1-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0 +; AVX1-NEXT: movzbl %r11b, %eax +; AVX1-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0 +; AVX1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX1-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: popq %rbx +; AVX1-NEXT: popq %r12 +; AVX1-NEXT: popq %r13 +; AVX1-NEXT: popq %r14 +; AVX1-NEXT: popq %r15 +; AVX1-NEXT: popq %rbp +; AVX1-NEXT: retq +; +; AVX2-LABEL: v16i1: +; AVX2: # %bb.0: +; AVX2-NEXT: vpsllw $7, %xmm1, %xmm1 +; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128] +; AVX2-NEXT: vpand %xmm2, %xmm1, %xmm1 +; AVX2-NEXT: vpextrb $15, %xmm1, %ecx +; AVX2-NEXT: vpsllw $7, %xmm0, %xmm0 +; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpextrb $15, %xmm0, %edx +; AVX2-NEXT: movl %edx, %eax +; AVX2-NEXT: subb %cl, %al +; AVX2-NEXT: setns %al +; AVX2-NEXT: subb %cl, %dl +; AVX2-NEXT: jno .LBB16_2 +; AVX2-NEXT: # %bb.1: +; AVX2-NEXT: addb $127, %al +; AVX2-NEXT: movl %eax, %edx +; AVX2-NEXT: .LBB16_2: +; AVX2-NEXT: vpextrb $14, %xmm1, %ecx +; AVX2-NEXT: vpextrb $14, %xmm0, %r11d +; AVX2-NEXT: movl %r11d, %eax +; AVX2-NEXT: subb %cl, %al +; AVX2-NEXT: setns %al +; AVX2-NEXT: subb %cl, %r11b +; AVX2-NEXT: jno .LBB16_4 +; AVX2-NEXT: # %bb.3: +; AVX2-NEXT: addb $127, %al +; AVX2-NEXT: movl %eax, %r11d +; AVX2-NEXT: .LBB16_4: +; AVX2-NEXT: vpextrb $13, %xmm1, %ecx +; AVX2-NEXT: vpextrb $13, %xmm0, %edi +; AVX2-NEXT: movl %edi, %eax +; AVX2-NEXT: subb %cl, %al +; AVX2-NEXT: setns %al +; AVX2-NEXT: subb %cl, %dil +; AVX2-NEXT: jno .LBB16_6 +; AVX2-NEXT: # %bb.5: +; AVX2-NEXT: addb $127, %al +; AVX2-NEXT: movl %eax, %edi +; AVX2-NEXT: .LBB16_6: +; AVX2-NEXT: pushq %rbp +; AVX2-NEXT: pushq %r15 +; AVX2-NEXT: pushq %r14 +; AVX2-NEXT: pushq %r13 +; AVX2-NEXT: pushq %r12 +; AVX2-NEXT: pushq %rbx +; AVX2-NEXT: vpextrb $12, %xmm1, %ecx +; AVX2-NEXT: vpextrb $12, %xmm0, %r14d +; AVX2-NEXT: movl %r14d, %eax +; AVX2-NEXT: subb %cl, %al +; AVX2-NEXT: setns %al +; AVX2-NEXT: subb %cl, %r14b +; AVX2-NEXT: jno .LBB16_8 +; AVX2-NEXT: # %bb.7: +; AVX2-NEXT: addb $127, %al +; AVX2-NEXT: movl %eax, %r14d +; AVX2-NEXT: .LBB16_8: +; AVX2-NEXT: vpextrb $11, %xmm1, %ecx +; AVX2-NEXT: vpextrb $11, %xmm0, %ebp +; AVX2-NEXT: movl %ebp, %eax +; AVX2-NEXT: subb %cl, %al +; 
AVX2-NEXT: setns %al +; AVX2-NEXT: subb %cl, %bpl +; AVX2-NEXT: jno .LBB16_10 +; AVX2-NEXT: # %bb.9: +; AVX2-NEXT: addb $127, %al +; AVX2-NEXT: movl %eax, %ebp +; AVX2-NEXT: .LBB16_10: +; AVX2-NEXT: vpextrb $10, %xmm1, %ecx +; AVX2-NEXT: vpextrb $10, %xmm0, %r15d +; AVX2-NEXT: movl %r15d, %eax +; AVX2-NEXT: subb %cl, %al +; AVX2-NEXT: setns %al +; AVX2-NEXT: subb %cl, %r15b +; AVX2-NEXT: jno .LBB16_12 +; AVX2-NEXT: # %bb.11: +; AVX2-NEXT: addb $127, %al +; AVX2-NEXT: movl %eax, %r15d +; AVX2-NEXT: .LBB16_12: +; AVX2-NEXT: vpextrb $9, %xmm1, %ecx +; AVX2-NEXT: vpextrb $9, %xmm0, %r12d +; AVX2-NEXT: movl %r12d, %eax +; AVX2-NEXT: subb %cl, %al +; AVX2-NEXT: setns %al +; AVX2-NEXT: subb %cl, %r12b +; AVX2-NEXT: jno .LBB16_14 +; AVX2-NEXT: # %bb.13: +; AVX2-NEXT: addb $127, %al +; AVX2-NEXT: movl %eax, %r12d +; AVX2-NEXT: .LBB16_14: +; AVX2-NEXT: vpextrb $8, %xmm1, %ecx +; AVX2-NEXT: vpextrb $8, %xmm0, %r13d +; AVX2-NEXT: movl %r13d, %eax +; AVX2-NEXT: subb %cl, %al +; AVX2-NEXT: setns %al +; AVX2-NEXT: subb %cl, %r13b +; AVX2-NEXT: jno .LBB16_16 +; AVX2-NEXT: # %bb.15: +; AVX2-NEXT: addb $127, %al +; AVX2-NEXT: movl %eax, %r13d +; AVX2-NEXT: .LBB16_16: +; AVX2-NEXT: vpextrb $7, %xmm1, %ecx +; AVX2-NEXT: vpextrb $7, %xmm0, %r10d +; AVX2-NEXT: movl %r10d, %eax +; AVX2-NEXT: subb %cl, %al +; AVX2-NEXT: setns %al +; AVX2-NEXT: subb %cl, %r10b +; AVX2-NEXT: movl %ebp, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX2-NEXT: jno .LBB16_18 +; AVX2-NEXT: # %bb.17: +; AVX2-NEXT: addb $127, %al +; AVX2-NEXT: movl %eax, %r10d +; AVX2-NEXT: .LBB16_18: +; AVX2-NEXT: vpextrb $6, %xmm1, %ecx +; AVX2-NEXT: vpextrb $6, %xmm0, %r9d +; AVX2-NEXT: movl %r9d, %eax +; AVX2-NEXT: subb %cl, %al +; AVX2-NEXT: setns %al +; AVX2-NEXT: subb %cl, %r9b +; AVX2-NEXT: jno .LBB16_20 +; AVX2-NEXT: # %bb.19: +; AVX2-NEXT: addb $127, %al +; AVX2-NEXT: movl %eax, %r9d +; AVX2-NEXT: .LBB16_20: +; AVX2-NEXT: vpextrb $5, %xmm1, %ecx +; AVX2-NEXT: vpextrb $5, %xmm0, %ebp +; AVX2-NEXT: movl %ebp, %eax +; AVX2-NEXT: subb %cl, %al +; AVX2-NEXT: setns %al +; AVX2-NEXT: subb %cl, %bpl +; AVX2-NEXT: movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX2-NEXT: jno .LBB16_22 +; AVX2-NEXT: # %bb.21: +; AVX2-NEXT: addb $127, %al +; AVX2-NEXT: movl %eax, %ebp +; AVX2-NEXT: .LBB16_22: +; AVX2-NEXT: vpextrb $4, %xmm1, %ecx +; AVX2-NEXT: vpextrb $4, %xmm0, %edi +; AVX2-NEXT: movl %edi, %eax +; AVX2-NEXT: subb %cl, %al +; AVX2-NEXT: setns %al +; AVX2-NEXT: subb %cl, %dil +; AVX2-NEXT: movl %edx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX2-NEXT: jno .LBB16_24 +; AVX2-NEXT: # %bb.23: +; AVX2-NEXT: addb $127, %al +; AVX2-NEXT: movl %eax, %edi +; AVX2-NEXT: .LBB16_24: +; AVX2-NEXT: vpextrb $3, %xmm1, %edx +; AVX2-NEXT: vpextrb $3, %xmm0, %eax +; AVX2-NEXT: movl %eax, %ecx +; AVX2-NEXT: subb %dl, %cl +; AVX2-NEXT: setns %cl +; AVX2-NEXT: subb %dl, %al +; AVX2-NEXT: jno .LBB16_26 +; AVX2-NEXT: # %bb.25: +; AVX2-NEXT: addb $127, %cl +; AVX2-NEXT: movl %ecx, %eax +; AVX2-NEXT: .LBB16_26: +; AVX2-NEXT: vpextrb $2, %xmm1, %ebx +; AVX2-NEXT: vpextrb $2, %xmm0, %ecx +; AVX2-NEXT: movl %ecx, %edx +; AVX2-NEXT: subb %bl, %dl +; AVX2-NEXT: setns %dl +; AVX2-NEXT: subb %bl, %cl +; AVX2-NEXT: jno .LBB16_28 +; AVX2-NEXT: # %bb.27: +; AVX2-NEXT: addb $127, %dl +; AVX2-NEXT: movl %edx, %ecx +; AVX2-NEXT: .LBB16_28: +; AVX2-NEXT: vpextrb $0, %xmm1, %esi +; AVX2-NEXT: vpextrb $0, %xmm0, %edx +; AVX2-NEXT: movl %edx, %ebx +; AVX2-NEXT: subb %sil, %bl +; AVX2-NEXT: setns %bl +; AVX2-NEXT: subb %sil, %dl +; AVX2-NEXT: jno .LBB16_30 +; AVX2-NEXT: # %bb.29: +; 
AVX2-NEXT: addb $127, %bl +; AVX2-NEXT: movl %ebx, %edx +; AVX2-NEXT: .LBB16_30: +; AVX2-NEXT: vpextrb $1, %xmm1, %esi +; AVX2-NEXT: vpextrb $1, %xmm0, %r8d +; AVX2-NEXT: movl %r8d, %ebx +; AVX2-NEXT: subb %sil, %bl +; AVX2-NEXT: setns %bl +; AVX2-NEXT: subb %sil, %r8b +; AVX2-NEXT: jno .LBB16_32 +; AVX2-NEXT: # %bb.31: +; AVX2-NEXT: addb $127, %bl +; AVX2-NEXT: movl %ebx, %r8d +; AVX2-NEXT: .LBB16_32: +; AVX2-NEXT: movzbl %dl, %edx +; AVX2-NEXT: vmovd %edx, %xmm0 +; AVX2-NEXT: movzbl %r8b, %edx +; AVX2-NEXT: vpinsrb $1, %edx, %xmm0, %xmm0 +; AVX2-NEXT: movzbl %cl, %ecx +; AVX2-NEXT: vpinsrb $2, %ecx, %xmm0, %xmm0 +; AVX2-NEXT: movzbl %al, %eax +; AVX2-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0 +; AVX2-NEXT: movzbl %dil, %eax +; AVX2-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 +; AVX2-NEXT: movzbl %bpl, %eax +; AVX2-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; AVX2-NEXT: movzbl %r9b, %eax +; AVX2-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 +; AVX2-NEXT: movzbl %r10b, %eax +; AVX2-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; AVX2-NEXT: movzbl %r13b, %eax +; AVX2-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0 +; AVX2-NEXT: movzbl %r12b, %eax +; AVX2-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0 +; AVX2-NEXT: movzbl %r15b, %eax +; AVX2-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0 +; AVX2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX2-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; AVX2-NEXT: movzbl %r14b, %eax +; AVX2-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0 +; AVX2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX2-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0 +; AVX2-NEXT: movzbl %r11b, %eax +; AVX2-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0 +; AVX2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX2-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX2-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: popq %rbx +; AVX2-NEXT: popq %r12 +; AVX2-NEXT: popq %r13 +; AVX2-NEXT: popq %r14 +; AVX2-NEXT: popq %r15 +; AVX2-NEXT: popq %rbp +; AVX2-NEXT: retq +; +; AVX512-LABEL: v16i1: +; AVX512: # %bb.0: +; AVX512-NEXT: vpsllw $7, %xmm0, %xmm0 +; AVX512-NEXT: vpmovb2m %xmm0, %k0 +; AVX512-NEXT: kshiftrw $1, %k0, %k1 +; AVX512-NEXT: kmovd %k1, %edx +; AVX512-NEXT: vpsllw $7, %xmm1, %xmm0 +; AVX512-NEXT: vpmovb2m %xmm0, %k1 +; AVX512-NEXT: kshiftrw $1, %k1, %k2 +; AVX512-NEXT: kmovd %k2, %eax +; AVX512-NEXT: shlb $7, %al +; AVX512-NEXT: shlb $7, %dl +; AVX512-NEXT: movl %edx, %ecx +; AVX512-NEXT: subb %al, %cl +; AVX512-NEXT: setns %cl +; AVX512-NEXT: subb %al, %dl +; AVX512-NEXT: kmovd %k0, %esi +; AVX512-NEXT: kmovd %k1, %eax +; AVX512-NEXT: jno .LBB16_2 +; AVX512-NEXT: # %bb.1: +; AVX512-NEXT: addb $127, %cl +; AVX512-NEXT: movl %ecx, %edx +; AVX512-NEXT: .LBB16_2: +; AVX512-NEXT: shlb $7, %al +; AVX512-NEXT: shlb $7, %sil +; AVX512-NEXT: movl %esi, %ecx +; AVX512-NEXT: subb %al, %cl +; AVX512-NEXT: setns %cl +; AVX512-NEXT: subb %al, %sil +; AVX512-NEXT: kshiftrw $2, %k0, %k2 +; AVX512-NEXT: kmovd %k2, %edi +; AVX512-NEXT: kshiftrw $2, %k1, %k2 +; AVX512-NEXT: kmovd %k2, %eax +; AVX512-NEXT: jno .LBB16_4 +; AVX512-NEXT: # %bb.3: +; AVX512-NEXT: addb $127, %cl +; AVX512-NEXT: movl %ecx, %esi +; AVX512-NEXT: .LBB16_4: +; AVX512-NEXT: shlb $7, %al +; AVX512-NEXT: shlb $7, %dil +; AVX512-NEXT: movl %edi, %ecx +; AVX512-NEXT: subb %al, %cl +; AVX512-NEXT: setns %cl +; AVX512-NEXT: subb %al, %dil +; AVX512-NEXT: kshiftrw $3, %k0, %k2 +; AVX512-NEXT: kmovd %k2, %r11d +; AVX512-NEXT: kshiftrw $3, %k1, %k2 +; AVX512-NEXT: kmovd %k2, %eax +; AVX512-NEXT: jno .LBB16_6 +; 
AVX512-NEXT: # %bb.5: +; AVX512-NEXT: addb $127, %cl +; AVX512-NEXT: movl %ecx, %edi +; AVX512-NEXT: .LBB16_6: +; AVX512-NEXT: pushq %rbp +; AVX512-NEXT: pushq %r15 +; AVX512-NEXT: pushq %r14 +; AVX512-NEXT: pushq %r13 +; AVX512-NEXT: pushq %r12 +; AVX512-NEXT: pushq %rbx +; AVX512-NEXT: shlb $7, %al +; AVX512-NEXT: shlb $7, %r11b +; AVX512-NEXT: movl %r11d, %ecx +; AVX512-NEXT: subb %al, %cl +; AVX512-NEXT: setns %cl +; AVX512-NEXT: subb %al, %r11b +; AVX512-NEXT: kshiftrw $4, %k0, %k2 +; AVX512-NEXT: kmovd %k2, %r14d +; AVX512-NEXT: kshiftrw $4, %k1, %k2 +; AVX512-NEXT: kmovd %k2, %eax +; AVX512-NEXT: jno .LBB16_8 +; AVX512-NEXT: # %bb.7: +; AVX512-NEXT: addb $127, %cl +; AVX512-NEXT: movl %ecx, %r11d +; AVX512-NEXT: .LBB16_8: +; AVX512-NEXT: shlb $7, %al +; AVX512-NEXT: shlb $7, %r14b +; AVX512-NEXT: movl %r14d, %ecx +; AVX512-NEXT: subb %al, %cl +; AVX512-NEXT: setns %cl +; AVX512-NEXT: subb %al, %r14b +; AVX512-NEXT: kshiftrw $5, %k0, %k2 +; AVX512-NEXT: kmovd %k2, %r15d +; AVX512-NEXT: kshiftrw $5, %k1, %k2 +; AVX512-NEXT: kmovd %k2, %eax +; AVX512-NEXT: jno .LBB16_10 +; AVX512-NEXT: # %bb.9: +; AVX512-NEXT: addb $127, %cl +; AVX512-NEXT: movl %ecx, %r14d +; AVX512-NEXT: .LBB16_10: +; AVX512-NEXT: shlb $7, %al +; AVX512-NEXT: shlb $7, %r15b +; AVX512-NEXT: movl %r15d, %ecx +; AVX512-NEXT: subb %al, %cl +; AVX512-NEXT: setns %cl +; AVX512-NEXT: subb %al, %r15b +; AVX512-NEXT: kshiftrw $6, %k0, %k2 +; AVX512-NEXT: kmovd %k2, %r12d +; AVX512-NEXT: kshiftrw $6, %k1, %k2 +; AVX512-NEXT: kmovd %k2, %eax +; AVX512-NEXT: jno .LBB16_12 +; AVX512-NEXT: # %bb.11: +; AVX512-NEXT: addb $127, %cl +; AVX512-NEXT: movl %ecx, %r15d +; AVX512-NEXT: .LBB16_12: +; AVX512-NEXT: shlb $7, %al +; AVX512-NEXT: shlb $7, %r12b +; AVX512-NEXT: movl %r12d, %ecx +; AVX512-NEXT: subb %al, %cl +; AVX512-NEXT: setns %cl +; AVX512-NEXT: subb %al, %r12b +; AVX512-NEXT: kshiftrw $7, %k0, %k2 +; AVX512-NEXT: kmovd %k2, %r13d +; AVX512-NEXT: kshiftrw $7, %k1, %k2 +; AVX512-NEXT: kmovd %k2, %eax +; AVX512-NEXT: jno .LBB16_14 +; AVX512-NEXT: # %bb.13: +; AVX512-NEXT: addb $127, %cl +; AVX512-NEXT: movl %ecx, %r12d +; AVX512-NEXT: .LBB16_14: +; AVX512-NEXT: shlb $7, %al +; AVX512-NEXT: shlb $7, %r13b +; AVX512-NEXT: movl %r13d, %ecx +; AVX512-NEXT: subb %al, %cl +; AVX512-NEXT: setns %cl +; AVX512-NEXT: subb %al, %r13b +; AVX512-NEXT: kshiftrw $8, %k0, %k2 +; AVX512-NEXT: kmovd %k2, %r9d +; AVX512-NEXT: kshiftrw $8, %k1, %k2 +; AVX512-NEXT: kmovd %k2, %eax +; AVX512-NEXT: jno .LBB16_16 +; AVX512-NEXT: # %bb.15: +; AVX512-NEXT: addb $127, %cl +; AVX512-NEXT: movl %ecx, %r13d +; AVX512-NEXT: .LBB16_16: +; AVX512-NEXT: shlb $7, %al +; AVX512-NEXT: shlb $7, %r9b +; AVX512-NEXT: movl %r9d, %ecx +; AVX512-NEXT: subb %al, %cl +; AVX512-NEXT: setns %cl +; AVX512-NEXT: subb %al, %r9b +; AVX512-NEXT: kshiftrw $9, %k0, %k2 +; AVX512-NEXT: kmovd %k2, %r10d +; AVX512-NEXT: kshiftrw $9, %k1, %k2 +; AVX512-NEXT: kmovd %k2, %eax +; AVX512-NEXT: jno .LBB16_18 +; AVX512-NEXT: # %bb.17: +; AVX512-NEXT: addb $127, %cl +; AVX512-NEXT: movl %ecx, %r9d +; AVX512-NEXT: .LBB16_18: +; AVX512-NEXT: shlb $7, %al +; AVX512-NEXT: shlb $7, %r10b +; AVX512-NEXT: movl %r10d, %ecx +; AVX512-NEXT: subb %al, %cl +; AVX512-NEXT: setns %cl +; AVX512-NEXT: subb %al, %r10b +; AVX512-NEXT: kshiftrw $10, %k0, %k2 +; AVX512-NEXT: kmovd %k2, %ebp +; AVX512-NEXT: kshiftrw $10, %k1, %k2 +; AVX512-NEXT: kmovd %k2, %eax +; AVX512-NEXT: movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX512-NEXT: jno .LBB16_20 +; AVX512-NEXT: # %bb.19: +; AVX512-NEXT: addb $127, %cl 
+; AVX512-NEXT: movl %ecx, %r10d +; AVX512-NEXT: .LBB16_20: +; AVX512-NEXT: shlb $7, %al +; AVX512-NEXT: shlb $7, %bpl +; AVX512-NEXT: movl %ebp, %ecx +; AVX512-NEXT: subb %al, %cl +; AVX512-NEXT: setns %cl +; AVX512-NEXT: subb %al, %bpl +; AVX512-NEXT: kshiftrw $11, %k0, %k2 +; AVX512-NEXT: kmovd %k2, %edi +; AVX512-NEXT: kshiftrw $11, %k1, %k2 +; AVX512-NEXT: kmovd %k2, %eax +; AVX512-NEXT: movl %edx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX512-NEXT: jno .LBB16_22 +; AVX512-NEXT: # %bb.21: +; AVX512-NEXT: addb $127, %cl +; AVX512-NEXT: movl %ecx, %ebp +; AVX512-NEXT: .LBB16_22: +; AVX512-NEXT: shlb $7, %al +; AVX512-NEXT: shlb $7, %dil +; AVX512-NEXT: movl %edi, %ecx +; AVX512-NEXT: subb %al, %cl +; AVX512-NEXT: setns %dl +; AVX512-NEXT: subb %al, %dil +; AVX512-NEXT: kshiftrw $12, %k0, %k2 +; AVX512-NEXT: kmovd %k2, %eax +; AVX512-NEXT: kshiftrw $12, %k1, %k2 +; AVX512-NEXT: kmovd %k2, %ecx +; AVX512-NEXT: jno .LBB16_24 +; AVX512-NEXT: # %bb.23: +; AVX512-NEXT: addb $127, %dl +; AVX512-NEXT: movl %edx, %edi +; AVX512-NEXT: .LBB16_24: +; AVX512-NEXT: shlb $7, %cl +; AVX512-NEXT: shlb $7, %al +; AVX512-NEXT: movl %eax, %edx +; AVX512-NEXT: subb %cl, %dl +; AVX512-NEXT: setns %bl +; AVX512-NEXT: subb %cl, %al +; AVX512-NEXT: kshiftrw $13, %k0, %k2 +; AVX512-NEXT: kmovd %k2, %ecx +; AVX512-NEXT: kshiftrw $13, %k1, %k2 +; AVX512-NEXT: kmovd %k2, %edx +; AVX512-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX512-NEXT: jno .LBB16_26 +; AVX512-NEXT: # %bb.25: +; AVX512-NEXT: addb $127, %bl +; AVX512-NEXT: movl %ebx, %eax +; AVX512-NEXT: .LBB16_26: +; AVX512-NEXT: shlb $7, %dl +; AVX512-NEXT: shlb $7, %cl +; AVX512-NEXT: movl %ecx, %ebx +; AVX512-NEXT: subb %dl, %bl +; AVX512-NEXT: setns %bl +; AVX512-NEXT: subb %dl, %cl +; AVX512-NEXT: kshiftrw $14, %k0, %k2 +; AVX512-NEXT: kmovd %k2, %edx +; AVX512-NEXT: kshiftrw $14, %k1, %k2 +; AVX512-NEXT: kmovd %k2, %esi +; AVX512-NEXT: jno .LBB16_28 +; AVX512-NEXT: # %bb.27: +; AVX512-NEXT: addb $127, %bl +; AVX512-NEXT: movl %ebx, %ecx +; AVX512-NEXT: .LBB16_28: +; AVX512-NEXT: shlb $7, %sil +; AVX512-NEXT: shlb $7, %dl +; AVX512-NEXT: movl %edx, %ebx +; AVX512-NEXT: subb %sil, %bl +; AVX512-NEXT: setns %bl +; AVX512-NEXT: subb %sil, %dl +; AVX512-NEXT: kshiftrw $15, %k0, %k0 +; AVX512-NEXT: kmovd %k0, %r8d +; AVX512-NEXT: kshiftrw $15, %k1, %k0 +; AVX512-NEXT: kmovd %k0, %esi +; AVX512-NEXT: jno .LBB16_30 +; AVX512-NEXT: # %bb.29: +; AVX512-NEXT: addb $127, %bl +; AVX512-NEXT: movl %ebx, %edx +; AVX512-NEXT: .LBB16_30: +; AVX512-NEXT: shlb $7, %sil +; AVX512-NEXT: shlb $7, %r8b +; AVX512-NEXT: movl %r8d, %ebx +; AVX512-NEXT: subb %sil, %bl +; AVX512-NEXT: setns %bl +; AVX512-NEXT: subb %sil, %r8b +; AVX512-NEXT: jno .LBB16_32 +; AVX512-NEXT: # %bb.31: +; AVX512-NEXT: addb $127, %bl +; AVX512-NEXT: movl %ebx, %r8d +; AVX512-NEXT: .LBB16_32: +; AVX512-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 4-byte Reload +; AVX512-NEXT: sarb $7, %sil +; AVX512-NEXT: kmovd %esi, %k1 +; AVX512-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 4-byte Reload +; AVX512-NEXT: sarb $7, %sil +; AVX512-NEXT: kmovd %esi, %k0 +; AVX512-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 4-byte Reload +; AVX512-NEXT: sarb $7, %sil +; AVX512-NEXT: kmovd %esi, %k2 +; AVX512-NEXT: sarb $7, %r11b +; AVX512-NEXT: kmovd %r11d, %k3 +; AVX512-NEXT: sarb $7, %r14b +; AVX512-NEXT: kmovd %r14d, %k4 +; AVX512-NEXT: sarb $7, %r15b +; AVX512-NEXT: kmovd %r15d, %k5 +; AVX512-NEXT: sarb $7, %r12b +; AVX512-NEXT: kmovd %r12d, %k6 +; AVX512-NEXT: kshiftrw $1, %k0, %k7 +; AVX512-NEXT: kxorw 
%k1, %k7, %k7 +; AVX512-NEXT: sarb $7, %r13b +; AVX512-NEXT: kmovd %r13d, %k1 +; AVX512-NEXT: kshiftlw $15, %k7, %k7 +; AVX512-NEXT: kshiftrw $14, %k7, %k7 +; AVX512-NEXT: kxorw %k7, %k0, %k0 +; AVX512-NEXT: kshiftrw $2, %k0, %k7 +; AVX512-NEXT: kxorw %k2, %k7, %k7 +; AVX512-NEXT: sarb $7, %r9b +; AVX512-NEXT: kmovd %r9d, %k2 +; AVX512-NEXT: kshiftlw $15, %k7, %k7 +; AVX512-NEXT: kshiftrw $13, %k7, %k7 +; AVX512-NEXT: kxorw %k7, %k0, %k0 +; AVX512-NEXT: kshiftrw $3, %k0, %k7 +; AVX512-NEXT: kxorw %k3, %k7, %k7 +; AVX512-NEXT: sarb $7, %r10b +; AVX512-NEXT: kmovd %r10d, %k3 +; AVX512-NEXT: kshiftlw $15, %k7, %k7 +; AVX512-NEXT: kshiftrw $12, %k7, %k7 +; AVX512-NEXT: kxorw %k7, %k0, %k7 +; AVX512-NEXT: kshiftrw $4, %k7, %k0 +; AVX512-NEXT: kxorw %k4, %k0, %k4 +; AVX512-NEXT: sarb $7, %bpl +; AVX512-NEXT: kmovd %ebp, %k0 +; AVX512-NEXT: kshiftlw $15, %k4, %k4 +; AVX512-NEXT: kshiftrw $11, %k4, %k4 +; AVX512-NEXT: kxorw %k4, %k7, %k7 +; AVX512-NEXT: kshiftrw $5, %k7, %k4 +; AVX512-NEXT: kxorw %k5, %k4, %k5 +; AVX512-NEXT: sarb $7, %dil +; AVX512-NEXT: kmovd %edi, %k4 +; AVX512-NEXT: kshiftlw $15, %k5, %k5 +; AVX512-NEXT: kshiftrw $10, %k5, %k5 +; AVX512-NEXT: kxorw %k5, %k7, %k7 +; AVX512-NEXT: kshiftrw $6, %k7, %k5 +; AVX512-NEXT: kxorw %k6, %k5, %k6 +; AVX512-NEXT: sarb $7, %al +; AVX512-NEXT: kmovd %eax, %k5 +; AVX512-NEXT: kshiftlw $15, %k6, %k6 +; AVX512-NEXT: kshiftrw $9, %k6, %k6 +; AVX512-NEXT: kxorw %k6, %k7, %k6 +; AVX512-NEXT: kshiftrw $7, %k6, %k7 +; AVX512-NEXT: kxorw %k1, %k7, %k7 +; AVX512-NEXT: sarb $7, %cl +; AVX512-NEXT: kmovd %ecx, %k1 +; AVX512-NEXT: kshiftlw $15, %k7, %k7 +; AVX512-NEXT: kshiftrw $8, %k7, %k7 +; AVX512-NEXT: kxorw %k7, %k6, %k6 +; AVX512-NEXT: kshiftrw $8, %k6, %k7 +; AVX512-NEXT: kxorw %k2, %k7, %k7 +; AVX512-NEXT: sarb $7, %dl +; AVX512-NEXT: kmovd %edx, %k2 +; AVX512-NEXT: kshiftlw $15, %k7, %k7 +; AVX512-NEXT: kshiftrw $7, %k7, %k7 +; AVX512-NEXT: kxorw %k7, %k6, %k6 +; AVX512-NEXT: kshiftrw $9, %k6, %k7 +; AVX512-NEXT: kxorw %k3, %k7, %k3 +; AVX512-NEXT: sarb $7, %r8b +; AVX512-NEXT: kmovd %r8d, %k7 +; AVX512-NEXT: kshiftlw $15, %k3, %k3 +; AVX512-NEXT: kshiftrw $6, %k3, %k3 +; AVX512-NEXT: kxorw %k3, %k6, %k3 +; AVX512-NEXT: kshiftrw $10, %k3, %k6 +; AVX512-NEXT: kxorw %k0, %k6, %k0 +; AVX512-NEXT: kshiftlw $15, %k0, %k0 +; AVX512-NEXT: kshiftrw $5, %k0, %k0 +; AVX512-NEXT: kxorw %k0, %k3, %k0 +; AVX512-NEXT: kshiftrw $11, %k0, %k3 +; AVX512-NEXT: kxorw %k4, %k3, %k3 +; AVX512-NEXT: kshiftlw $15, %k3, %k3 +; AVX512-NEXT: kshiftrw $4, %k3, %k3 +; AVX512-NEXT: kxorw %k3, %k0, %k0 +; AVX512-NEXT: kshiftrw $12, %k0, %k3 +; AVX512-NEXT: kxorw %k5, %k3, %k3 +; AVX512-NEXT: kshiftlw $15, %k3, %k3 +; AVX512-NEXT: kshiftrw $3, %k3, %k3 +; AVX512-NEXT: kxorw %k3, %k0, %k0 +; AVX512-NEXT: kshiftrw $13, %k0, %k3 +; AVX512-NEXT: kxorw %k1, %k3, %k1 +; AVX512-NEXT: kshiftlw $15, %k1, %k1 +; AVX512-NEXT: kshiftrw $2, %k1, %k1 +; AVX512-NEXT: kxorw %k1, %k0, %k0 +; AVX512-NEXT: kshiftrw $14, %k0, %k1 +; AVX512-NEXT: kxorw %k2, %k1, %k1 +; AVX512-NEXT: kshiftlw $15, %k1, %k1 +; AVX512-NEXT: kshiftrw $1, %k1, %k1 +; AVX512-NEXT: kxorw %k1, %k0, %k0 +; AVX512-NEXT: kshiftlw $1, %k0, %k0 +; AVX512-NEXT: kshiftrw $1, %k0, %k0 +; AVX512-NEXT: kshiftlw $15, %k7, %k1 +; AVX512-NEXT: korw %k1, %k0, %k0 +; AVX512-NEXT: vpmovm2b %k0, %xmm0 +; AVX512-NEXT: popq %rbx +; AVX512-NEXT: popq %r12 +; AVX512-NEXT: popq %r13 +; AVX512-NEXT: popq %r14 +; AVX512-NEXT: popq %r15 +; AVX512-NEXT: popq %rbp +; AVX512-NEXT: retq + %z = call <16 x i1> @llvm.ssub.sat.v16i1(<16 x i1> %x, <16 x 
i1> %y) + ret <16 x i1> %z +} + +; Expanded + +define <4 x i32> @v4i32(<4 x i32> %x, <4 x i32> %y) nounwind { +; SSE2-LABEL: v4i32: +; SSE2: # %bb.0: +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,2,3] +; SSE2-NEXT: movd %xmm2, %ecx +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,2,3] +; SSE2-NEXT: movd %xmm2, %r8d +; SSE2-NEXT: xorl %edx, %edx +; SSE2-NEXT: movl %r8d, %esi +; SSE2-NEXT: subl %ecx, %esi +; SSE2-NEXT: setns %dl +; SSE2-NEXT: addl $2147483647, %edx # imm = 0x7FFFFFFF +; SSE2-NEXT: subl %ecx, %r8d +; SSE2-NEXT: cmovol %edx, %r8d +; SSE2-NEXT: movd %xmm1, %edx +; SSE2-NEXT: movd %xmm0, %ecx +; SSE2-NEXT: xorl %esi, %esi +; SSE2-NEXT: movl %ecx, %edi +; SSE2-NEXT: subl %edx, %edi +; SSE2-NEXT: setns %sil +; SSE2-NEXT: addl $2147483647, %esi # imm = 0x7FFFFFFF +; SSE2-NEXT: subl %edx, %ecx +; SSE2-NEXT: cmovol %esi, %ecx +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1] +; SSE2-NEXT: movd %xmm2, %edx +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1] +; SSE2-NEXT: movd %xmm2, %eax +; SSE2-NEXT: xorl %edi, %edi +; SSE2-NEXT: movl %eax, %esi +; SSE2-NEXT: subl %edx, %esi +; SSE2-NEXT: setns %dil +; SSE2-NEXT: addl $2147483647, %edi # imm = 0x7FFFFFFF +; SSE2-NEXT: subl %edx, %eax +; SSE2-NEXT: cmovol %edi, %eax +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,1,2,3] +; SSE2-NEXT: movd %xmm1, %r9d +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3] +; SSE2-NEXT: movd %xmm0, %edx +; SSE2-NEXT: xorl %edi, %edi +; SSE2-NEXT: movl %edx, %esi +; SSE2-NEXT: subl %r9d, %esi +; SSE2-NEXT: setns %dil +; SSE2-NEXT: addl $2147483647, %edi # imm = 0x7FFFFFFF +; SSE2-NEXT: subl %r9d, %edx +; SSE2-NEXT: cmovol %edi, %edx +; SSE2-NEXT: movd %edx, %xmm0 +; SSE2-NEXT: movd %eax, %xmm1 +; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE2-NEXT: movd %ecx, %xmm0 +; SSE2-NEXT: movd %r8d, %xmm2 +; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSE2-NEXT: retq +; +; SSSE3-LABEL: v4i32: +; SSSE3: # %bb.0: +; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,2,3] +; SSSE3-NEXT: movd %xmm2, %ecx +; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,2,3] +; SSSE3-NEXT: movd %xmm2, %r8d +; SSSE3-NEXT: xorl %edx, %edx +; SSSE3-NEXT: movl %r8d, %esi +; SSSE3-NEXT: subl %ecx, %esi +; SSSE3-NEXT: setns %dl +; SSSE3-NEXT: addl $2147483647, %edx # imm = 0x7FFFFFFF +; SSSE3-NEXT: subl %ecx, %r8d +; SSSE3-NEXT: cmovol %edx, %r8d +; SSSE3-NEXT: movd %xmm1, %edx +; SSSE3-NEXT: movd %xmm0, %ecx +; SSSE3-NEXT: xorl %esi, %esi +; SSSE3-NEXT: movl %ecx, %edi +; SSSE3-NEXT: subl %edx, %edi +; SSSE3-NEXT: setns %sil +; SSSE3-NEXT: addl $2147483647, %esi # imm = 0x7FFFFFFF +; SSSE3-NEXT: subl %edx, %ecx +; SSSE3-NEXT: cmovol %esi, %ecx +; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1] +; SSSE3-NEXT: movd %xmm2, %edx +; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1] +; SSSE3-NEXT: movd %xmm2, %eax +; SSSE3-NEXT: xorl %edi, %edi +; SSSE3-NEXT: movl %eax, %esi +; SSSE3-NEXT: subl %edx, %esi +; SSSE3-NEXT: setns %dil +; SSSE3-NEXT: addl $2147483647, %edi # imm = 0x7FFFFFFF +; SSSE3-NEXT: subl %edx, %eax +; SSSE3-NEXT: cmovol %edi, %eax +; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,1,2,3] +; SSSE3-NEXT: movd %xmm1, %r9d +; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3] +; SSSE3-NEXT: movd %xmm0, %edx +; SSSE3-NEXT: xorl %edi, %edi +; SSSE3-NEXT: movl %edx, %esi +; SSSE3-NEXT: subl %r9d, %esi +; SSSE3-NEXT: setns %dil +; SSSE3-NEXT: addl $2147483647, %edi # imm = 0x7FFFFFFF +; SSSE3-NEXT: subl %r9d, %edx +; SSSE3-NEXT: cmovol %edi, %edx +; 
SSSE3-NEXT: movd %edx, %xmm0 +; SSSE3-NEXT: movd %eax, %xmm1 +; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSSE3-NEXT: movd %ecx, %xmm0 +; SSSE3-NEXT: movd %r8d, %xmm2 +; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSSE3-NEXT: retq +; +; SSE41-LABEL: v4i32: +; SSE41: # %bb.0: +; SSE41-NEXT: pextrd $3, %xmm1, %ecx +; SSE41-NEXT: pextrd $3, %xmm0, %r8d +; SSE41-NEXT: xorl %edx, %edx +; SSE41-NEXT: movl %r8d, %esi +; SSE41-NEXT: subl %ecx, %esi +; SSE41-NEXT: setns %dl +; SSE41-NEXT: addl $2147483647, %edx # imm = 0x7FFFFFFF +; SSE41-NEXT: subl %ecx, %r8d +; SSE41-NEXT: cmovol %edx, %r8d +; SSE41-NEXT: pextrd $2, %xmm1, %edx +; SSE41-NEXT: pextrd $2, %xmm0, %ecx +; SSE41-NEXT: xorl %esi, %esi +; SSE41-NEXT: movl %ecx, %edi +; SSE41-NEXT: subl %edx, %edi +; SSE41-NEXT: setns %sil +; SSE41-NEXT: addl $2147483647, %esi # imm = 0x7FFFFFFF +; SSE41-NEXT: subl %edx, %ecx +; SSE41-NEXT: cmovol %esi, %ecx +; SSE41-NEXT: movd %xmm1, %edx +; SSE41-NEXT: movd %xmm0, %eax +; SSE41-NEXT: xorl %edi, %edi +; SSE41-NEXT: movl %eax, %esi +; SSE41-NEXT: subl %edx, %esi +; SSE41-NEXT: setns %dil +; SSE41-NEXT: addl $2147483647, %edi # imm = 0x7FFFFFFF +; SSE41-NEXT: subl %edx, %eax +; SSE41-NEXT: cmovol %edi, %eax +; SSE41-NEXT: pextrd $1, %xmm1, %r9d +; SSE41-NEXT: pextrd $1, %xmm0, %edx +; SSE41-NEXT: xorl %edi, %edi +; SSE41-NEXT: movl %edx, %esi +; SSE41-NEXT: subl %r9d, %esi +; SSE41-NEXT: setns %dil +; SSE41-NEXT: addl $2147483647, %edi # imm = 0x7FFFFFFF +; SSE41-NEXT: subl %r9d, %edx +; SSE41-NEXT: cmovol %edi, %edx +; SSE41-NEXT: movd %eax, %xmm0 +; SSE41-NEXT: pinsrd $1, %edx, %xmm0 +; SSE41-NEXT: pinsrd $2, %ecx, %xmm0 +; SSE41-NEXT: pinsrd $3, %r8d, %xmm0 +; SSE41-NEXT: retq +; +; AVX-LABEL: v4i32: +; AVX: # %bb.0: +; AVX-NEXT: vpextrd $3, %xmm1, %ecx +; AVX-NEXT: vpextrd $3, %xmm0, %r9d +; AVX-NEXT: xorl %edx, %edx +; AVX-NEXT: movl %r9d, %esi +; AVX-NEXT: subl %ecx, %esi +; AVX-NEXT: setns %dl +; AVX-NEXT: addl $2147483647, %edx # imm = 0x7FFFFFFF +; AVX-NEXT: subl %ecx, %r9d +; AVX-NEXT: cmovol %edx, %r9d +; AVX-NEXT: vpextrd $2, %xmm1, %edx +; AVX-NEXT: vpextrd $2, %xmm0, %ecx +; AVX-NEXT: xorl %esi, %esi +; AVX-NEXT: movl %ecx, %edi +; AVX-NEXT: subl %edx, %edi +; AVX-NEXT: setns %sil +; AVX-NEXT: addl $2147483647, %esi # imm = 0x7FFFFFFF +; AVX-NEXT: subl %edx, %ecx +; AVX-NEXT: cmovol %esi, %ecx +; AVX-NEXT: vmovd %xmm1, %r8d +; AVX-NEXT: vmovd %xmm0, %edx +; AVX-NEXT: xorl %edi, %edi +; AVX-NEXT: movl %edx, %esi +; AVX-NEXT: subl %r8d, %esi +; AVX-NEXT: setns %dil +; AVX-NEXT: addl $2147483647, %edi # imm = 0x7FFFFFFF +; AVX-NEXT: subl %r8d, %edx +; AVX-NEXT: cmovol %edi, %edx +; AVX-NEXT: vpextrd $1, %xmm1, %r8d +; AVX-NEXT: vpextrd $1, %xmm0, %eax +; AVX-NEXT: xorl %esi, %esi +; AVX-NEXT: movl %eax, %edi +; AVX-NEXT: subl %r8d, %edi +; AVX-NEXT: setns %sil +; AVX-NEXT: addl $2147483647, %esi # imm = 0x7FFFFFFF +; AVX-NEXT: subl %r8d, %eax +; AVX-NEXT: cmovol %esi, %eax +; AVX-NEXT: vmovd %edx, %xmm0 +; AVX-NEXT: vpinsrd $1, %eax, %xmm0, %xmm0 +; AVX-NEXT: vpinsrd $2, %ecx, %xmm0, %xmm0 +; AVX-NEXT: vpinsrd $3, %r9d, %xmm0, %xmm0 +; AVX-NEXT: retq + %z = call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> %x, <4 x i32> %y) + ret <4 x i32> %z +} + +define <2 x i32> @v2i32(<2 x i32> %x, <2 x i32> %y) nounwind { +; SSE2-LABEL: v2i32: +; SSE2: # %bb.0: +; SSE2-NEXT: psllq $32, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1] +; SSE2-NEXT: movq %xmm2, %rax +; SSE2-NEXT: psllq 
$32, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1] +; SSE2-NEXT: movq %xmm2, %rcx +; SSE2-NEXT: xorl %edx, %edx +; SSE2-NEXT: movq %rcx, %rsi +; SSE2-NEXT: subq %rax, %rsi +; SSE2-NEXT: setns %dl +; SSE2-NEXT: movabsq $9223372036854775807, %r8 # imm = 0x7FFFFFFFFFFFFFFF +; SSE2-NEXT: addq %r8, %rdx +; SSE2-NEXT: subq %rax, %rcx +; SSE2-NEXT: cmovoq %rdx, %rcx +; SSE2-NEXT: movq %xmm1, %rax +; SSE2-NEXT: movq %xmm0, %rsi +; SSE2-NEXT: xorl %edi, %edi +; SSE2-NEXT: movq %rsi, %rdx +; SSE2-NEXT: subq %rax, %rdx +; SSE2-NEXT: setns %dil +; SSE2-NEXT: addq %r8, %rdi +; SSE2-NEXT: subq %rax, %rsi +; SSE2-NEXT: cmovoq %rdi, %rsi +; SSE2-NEXT: movq %rsi, %xmm1 +; SSE2-NEXT: movq %rcx, %xmm0 +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,3,2,3] +; SSE2-NEXT: psrad $31, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE2-NEXT: retq +; +; SSSE3-LABEL: v2i32: +; SSSE3: # %bb.0: +; SSSE3-NEXT: psllq $32, %xmm1 +; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1] +; SSSE3-NEXT: movq %xmm2, %rax +; SSSE3-NEXT: psllq $32, %xmm0 +; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1] +; SSSE3-NEXT: movq %xmm2, %rcx +; SSSE3-NEXT: xorl %edx, %edx +; SSSE3-NEXT: movq %rcx, %rsi +; SSSE3-NEXT: subq %rax, %rsi +; SSSE3-NEXT: setns %dl +; SSSE3-NEXT: movabsq $9223372036854775807, %r8 # imm = 0x7FFFFFFFFFFFFFFF +; SSSE3-NEXT: addq %r8, %rdx +; SSSE3-NEXT: subq %rax, %rcx +; SSSE3-NEXT: cmovoq %rdx, %rcx +; SSSE3-NEXT: movq %xmm1, %rax +; SSSE3-NEXT: movq %xmm0, %rsi +; SSSE3-NEXT: xorl %edi, %edi +; SSSE3-NEXT: movq %rsi, %rdx +; SSSE3-NEXT: subq %rax, %rdx +; SSSE3-NEXT: setns %dil +; SSSE3-NEXT: addq %r8, %rdi +; SSSE3-NEXT: subq %rax, %rsi +; SSSE3-NEXT: cmovoq %rdi, %rsi +; SSSE3-NEXT: movq %rsi, %xmm1 +; SSSE3-NEXT: movq %rcx, %xmm0 +; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] +; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,3,2,3] +; SSSE3-NEXT: psrad $31, %xmm1 +; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3] +; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSSE3-NEXT: retq +; +; SSE41-LABEL: v2i32: +; SSE41: # %bb.0: +; SSE41-NEXT: psllq $32, %xmm1 +; SSE41-NEXT: movq %xmm1, %rax +; SSE41-NEXT: psllq $32, %xmm0 +; SSE41-NEXT: movq %xmm0, %rcx +; SSE41-NEXT: xorl %edx, %edx +; SSE41-NEXT: movq %rcx, %rsi +; SSE41-NEXT: subq %rax, %rsi +; SSE41-NEXT: setns %dl +; SSE41-NEXT: movabsq $9223372036854775807, %r8 # imm = 0x7FFFFFFFFFFFFFFF +; SSE41-NEXT: addq %r8, %rdx +; SSE41-NEXT: subq %rax, %rcx +; SSE41-NEXT: cmovoq %rdx, %rcx +; SSE41-NEXT: pextrq $1, %xmm1, %rax +; SSE41-NEXT: pextrq $1, %xmm0, %rsi +; SSE41-NEXT: xorl %edi, %edi +; SSE41-NEXT: movq %rsi, %rdx +; SSE41-NEXT: subq %rax, %rdx +; SSE41-NEXT: setns %dil +; SSE41-NEXT: addq %r8, %rdi +; SSE41-NEXT: subq %rax, %rsi +; SSE41-NEXT: cmovoq %rdi, %rsi +; SSE41-NEXT: movq %rsi, %xmm1 +; SSE41-NEXT: movq %rcx, %xmm0 +; SSE41-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; SSE41-NEXT: psrad $31, %xmm0 +; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3],xmm1[4,5],xmm0[6,7] +; SSE41-NEXT: retq +; +; AVX1-LABEL: v2i32: +; AVX1: # %bb.0: +; AVX1-NEXT: vpsllq $32, %xmm1, %xmm1 +; AVX1-NEXT: vmovq %xmm1, %rax +; AVX1-NEXT: vpsllq $32, %xmm0, %xmm0 +; AVX1-NEXT: vmovq %xmm0, %rcx +; AVX1-NEXT: xorl %edx, %edx +; AVX1-NEXT: movq %rcx, %rsi +; AVX1-NEXT: subq %rax, %rsi +; AVX1-NEXT: setns %dl +; AVX1-NEXT: movabsq 
$9223372036854775807, %r8 # imm = 0x7FFFFFFFFFFFFFFF +; AVX1-NEXT: addq %r8, %rdx +; AVX1-NEXT: subq %rax, %rcx +; AVX1-NEXT: cmovoq %rdx, %rcx +; AVX1-NEXT: vpextrq $1, %xmm1, %rax +; AVX1-NEXT: vpextrq $1, %xmm0, %rsi +; AVX1-NEXT: xorl %edi, %edi +; AVX1-NEXT: movq %rsi, %rdx +; AVX1-NEXT: subq %rax, %rdx +; AVX1-NEXT: setns %dil +; AVX1-NEXT: addq %r8, %rdi +; AVX1-NEXT: subq %rax, %rsi +; AVX1-NEXT: cmovoq %rdi, %rsi +; AVX1-NEXT: vmovq %rsi, %xmm0 +; AVX1-NEXT: vmovq %rcx, %xmm1 +; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; AVX1-NEXT: vpsrad $31, %xmm0, %xmm1 +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] +; AVX1-NEXT: retq +; +; AVX2-LABEL: v2i32: +; AVX2: # %bb.0: +; AVX2-NEXT: vpsllq $32, %xmm1, %xmm1 +; AVX2-NEXT: vmovq %xmm1, %rax +; AVX2-NEXT: vpsllq $32, %xmm0, %xmm0 +; AVX2-NEXT: vmovq %xmm0, %rcx +; AVX2-NEXT: xorl %edx, %edx +; AVX2-NEXT: movq %rcx, %rsi +; AVX2-NEXT: subq %rax, %rsi +; AVX2-NEXT: setns %dl +; AVX2-NEXT: movabsq $9223372036854775807, %r8 # imm = 0x7FFFFFFFFFFFFFFF +; AVX2-NEXT: addq %r8, %rdx +; AVX2-NEXT: subq %rax, %rcx +; AVX2-NEXT: cmovoq %rdx, %rcx +; AVX2-NEXT: vpextrq $1, %xmm1, %rax +; AVX2-NEXT: vpextrq $1, %xmm0, %rsi +; AVX2-NEXT: xorl %edi, %edi +; AVX2-NEXT: movq %rsi, %rdx +; AVX2-NEXT: subq %rax, %rdx +; AVX2-NEXT: setns %dil +; AVX2-NEXT: addq %r8, %rdi +; AVX2-NEXT: subq %rax, %rsi +; AVX2-NEXT: cmovoq %rdi, %rsi +; AVX2-NEXT: vmovq %rsi, %xmm0 +; AVX2-NEXT: vmovq %rcx, %xmm1 +; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; AVX2-NEXT: vpsrad $31, %xmm0, %xmm1 +; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] +; AVX2-NEXT: retq +; +; AVX512-LABEL: v2i32: +; AVX512: # %bb.0: +; AVX512-NEXT: vpsllq $32, %xmm1, %xmm1 +; AVX512-NEXT: vmovq %xmm1, %rax +; AVX512-NEXT: vpsllq $32, %xmm0, %xmm0 +; AVX512-NEXT: vmovq %xmm0, %rcx +; AVX512-NEXT: xorl %edx, %edx +; AVX512-NEXT: movq %rcx, %rsi +; AVX512-NEXT: subq %rax, %rsi +; AVX512-NEXT: setns %dl +; AVX512-NEXT: movabsq $9223372036854775807, %r8 # imm = 0x7FFFFFFFFFFFFFFF +; AVX512-NEXT: addq %r8, %rdx +; AVX512-NEXT: subq %rax, %rcx +; AVX512-NEXT: cmovoq %rdx, %rcx +; AVX512-NEXT: vpextrq $1, %xmm1, %rax +; AVX512-NEXT: vpextrq $1, %xmm0, %rsi +; AVX512-NEXT: xorl %edi, %edi +; AVX512-NEXT: movq %rsi, %rdx +; AVX512-NEXT: subq %rax, %rdx +; AVX512-NEXT: setns %dil +; AVX512-NEXT: addq %r8, %rdi +; AVX512-NEXT: subq %rax, %rsi +; AVX512-NEXT: cmovoq %rdi, %rsi +; AVX512-NEXT: vmovq %rsi, %xmm0 +; AVX512-NEXT: vmovq %rcx, %xmm1 +; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; AVX512-NEXT: vpsraq $32, %xmm0, %xmm0 +; AVX512-NEXT: retq + %z = call <2 x i32> @llvm.ssub.sat.v2i32(<2 x i32> %x, <2 x i32> %y) + ret <2 x i32> %z +} + +define <4 x i24> @v4i24(<4 x i24> %x, <4 x i24> %y) nounwind { +; SSE2-LABEL: v4i24: +; SSE2: # %bb.0: +; SSE2-NEXT: pslld $8, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,2,3] +; SSE2-NEXT: movd %xmm2, %ecx +; SSE2-NEXT: pslld $8, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,2,3] +; SSE2-NEXT: movd %xmm2, %r8d +; SSE2-NEXT: xorl %edx, %edx +; SSE2-NEXT: movl %r8d, %esi +; SSE2-NEXT: subl %ecx, %esi +; SSE2-NEXT: setns %dl +; SSE2-NEXT: addl $2147483647, %edx # imm = 0x7FFFFFFF +; SSE2-NEXT: subl %ecx, %r8d +; SSE2-NEXT: cmovol %edx, %r8d +; SSE2-NEXT: movd %xmm1, %edx +; SSE2-NEXT: movd %xmm0, %ecx +; SSE2-NEXT: xorl %esi, %esi +; SSE2-NEXT: movl %ecx, %edi +; 
SSE2-NEXT: subl %edx, %edi +; SSE2-NEXT: setns %sil +; SSE2-NEXT: addl $2147483647, %esi # imm = 0x7FFFFFFF +; SSE2-NEXT: subl %edx, %ecx +; SSE2-NEXT: cmovol %esi, %ecx +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1] +; SSE2-NEXT: movd %xmm2, %edx +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1] +; SSE2-NEXT: movd %xmm2, %eax +; SSE2-NEXT: xorl %edi, %edi +; SSE2-NEXT: movl %eax, %esi +; SSE2-NEXT: subl %edx, %esi +; SSE2-NEXT: setns %dil +; SSE2-NEXT: addl $2147483647, %edi # imm = 0x7FFFFFFF +; SSE2-NEXT: subl %edx, %eax +; SSE2-NEXT: cmovol %edi, %eax +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,1,2,3] +; SSE2-NEXT: movd %xmm1, %r9d +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3] +; SSE2-NEXT: movd %xmm0, %edx +; SSE2-NEXT: xorl %edi, %edi +; SSE2-NEXT: movl %edx, %esi +; SSE2-NEXT: subl %r9d, %esi +; SSE2-NEXT: setns %dil +; SSE2-NEXT: addl $2147483647, %edi # imm = 0x7FFFFFFF +; SSE2-NEXT: subl %r9d, %edx +; SSE2-NEXT: cmovol %edi, %edx +; SSE2-NEXT: movd %edx, %xmm0 +; SSE2-NEXT: movd %eax, %xmm1 +; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE2-NEXT: movd %ecx, %xmm0 +; SSE2-NEXT: movd %r8d, %xmm2 +; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSE2-NEXT: psrad $8, %xmm0 +; SSE2-NEXT: retq +; +; SSSE3-LABEL: v4i24: +; SSSE3: # %bb.0: +; SSSE3-NEXT: pslld $8, %xmm1 +; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,2,3] +; SSSE3-NEXT: movd %xmm2, %ecx +; SSSE3-NEXT: pslld $8, %xmm0 +; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,2,3] +; SSSE3-NEXT: movd %xmm2, %r8d +; SSSE3-NEXT: xorl %edx, %edx +; SSSE3-NEXT: movl %r8d, %esi +; SSSE3-NEXT: subl %ecx, %esi +; SSSE3-NEXT: setns %dl +; SSSE3-NEXT: addl $2147483647, %edx # imm = 0x7FFFFFFF +; SSSE3-NEXT: subl %ecx, %r8d +; SSSE3-NEXT: cmovol %edx, %r8d +; SSSE3-NEXT: movd %xmm1, %edx +; SSSE3-NEXT: movd %xmm0, %ecx +; SSSE3-NEXT: xorl %esi, %esi +; SSSE3-NEXT: movl %ecx, %edi +; SSSE3-NEXT: subl %edx, %edi +; SSSE3-NEXT: setns %sil +; SSSE3-NEXT: addl $2147483647, %esi # imm = 0x7FFFFFFF +; SSSE3-NEXT: subl %edx, %ecx +; SSSE3-NEXT: cmovol %esi, %ecx +; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1] +; SSSE3-NEXT: movd %xmm2, %edx +; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1] +; SSSE3-NEXT: movd %xmm2, %eax +; SSSE3-NEXT: xorl %edi, %edi +; SSSE3-NEXT: movl %eax, %esi +; SSSE3-NEXT: subl %edx, %esi +; SSSE3-NEXT: setns %dil +; SSSE3-NEXT: addl $2147483647, %edi # imm = 0x7FFFFFFF +; SSSE3-NEXT: subl %edx, %eax +; SSSE3-NEXT: cmovol %edi, %eax +; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,1,2,3] +; SSSE3-NEXT: movd %xmm1, %r9d +; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3] +; SSSE3-NEXT: movd %xmm0, %edx +; SSSE3-NEXT: xorl %edi, %edi +; SSSE3-NEXT: movl %edx, %esi +; SSSE3-NEXT: subl %r9d, %esi +; SSSE3-NEXT: setns %dil +; SSSE3-NEXT: addl $2147483647, %edi # imm = 0x7FFFFFFF +; SSSE3-NEXT: subl %r9d, %edx +; SSSE3-NEXT: cmovol %edi, %edx +; SSSE3-NEXT: movd %edx, %xmm0 +; SSSE3-NEXT: movd %eax, %xmm1 +; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSSE3-NEXT: movd %ecx, %xmm0 +; SSSE3-NEXT: movd %r8d, %xmm2 +; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSSE3-NEXT: psrad $8, %xmm0 +; SSSE3-NEXT: retq +; +; SSE41-LABEL: v4i24: +; SSE41: # %bb.0: +; SSE41-NEXT: pslld $8, %xmm1 +; SSE41-NEXT: pextrd $3, %xmm1, %ecx +; SSE41-NEXT: pslld $8, %xmm0 +; SSE41-NEXT: pextrd $3, %xmm0, %r8d +; SSE41-NEXT: 
xorl %edx, %edx +; SSE41-NEXT: movl %r8d, %esi +; SSE41-NEXT: subl %ecx, %esi +; SSE41-NEXT: setns %dl +; SSE41-NEXT: addl $2147483647, %edx # imm = 0x7FFFFFFF +; SSE41-NEXT: subl %ecx, %r8d +; SSE41-NEXT: cmovol %edx, %r8d +; SSE41-NEXT: pextrd $2, %xmm1, %edx +; SSE41-NEXT: pextrd $2, %xmm0, %ecx +; SSE41-NEXT: xorl %esi, %esi +; SSE41-NEXT: movl %ecx, %edi +; SSE41-NEXT: subl %edx, %edi +; SSE41-NEXT: setns %sil +; SSE41-NEXT: addl $2147483647, %esi # imm = 0x7FFFFFFF +; SSE41-NEXT: subl %edx, %ecx +; SSE41-NEXT: cmovol %esi, %ecx +; SSE41-NEXT: movd %xmm1, %edx +; SSE41-NEXT: movd %xmm0, %eax +; SSE41-NEXT: xorl %edi, %edi +; SSE41-NEXT: movl %eax, %esi +; SSE41-NEXT: subl %edx, %esi +; SSE41-NEXT: setns %dil +; SSE41-NEXT: addl $2147483647, %edi # imm = 0x7FFFFFFF +; SSE41-NEXT: subl %edx, %eax +; SSE41-NEXT: cmovol %edi, %eax +; SSE41-NEXT: pextrd $1, %xmm1, %r9d +; SSE41-NEXT: pextrd $1, %xmm0, %edx +; SSE41-NEXT: xorl %edi, %edi +; SSE41-NEXT: movl %edx, %esi +; SSE41-NEXT: subl %r9d, %esi +; SSE41-NEXT: setns %dil +; SSE41-NEXT: addl $2147483647, %edi # imm = 0x7FFFFFFF +; SSE41-NEXT: subl %r9d, %edx +; SSE41-NEXT: cmovol %edi, %edx +; SSE41-NEXT: movd %eax, %xmm0 +; SSE41-NEXT: pinsrd $1, %edx, %xmm0 +; SSE41-NEXT: pinsrd $2, %ecx, %xmm0 +; SSE41-NEXT: pinsrd $3, %r8d, %xmm0 +; SSE41-NEXT: psrad $8, %xmm0 +; SSE41-NEXT: retq +; +; AVX-LABEL: v4i24: +; AVX: # %bb.0: +; AVX-NEXT: vpslld $8, %xmm1, %xmm1 +; AVX-NEXT: vpextrd $3, %xmm1, %ecx +; AVX-NEXT: vpslld $8, %xmm0, %xmm0 +; AVX-NEXT: vpextrd $3, %xmm0, %r9d +; AVX-NEXT: xorl %edx, %edx +; AVX-NEXT: movl %r9d, %esi +; AVX-NEXT: subl %ecx, %esi +; AVX-NEXT: setns %dl +; AVX-NEXT: addl $2147483647, %edx # imm = 0x7FFFFFFF +; AVX-NEXT: subl %ecx, %r9d +; AVX-NEXT: cmovol %edx, %r9d +; AVX-NEXT: vpextrd $2, %xmm1, %edx +; AVX-NEXT: vpextrd $2, %xmm0, %ecx +; AVX-NEXT: xorl %esi, %esi +; AVX-NEXT: movl %ecx, %edi +; AVX-NEXT: subl %edx, %edi +; AVX-NEXT: setns %sil +; AVX-NEXT: addl $2147483647, %esi # imm = 0x7FFFFFFF +; AVX-NEXT: subl %edx, %ecx +; AVX-NEXT: cmovol %esi, %ecx +; AVX-NEXT: vmovd %xmm1, %r8d +; AVX-NEXT: vmovd %xmm0, %edx +; AVX-NEXT: xorl %edi, %edi +; AVX-NEXT: movl %edx, %esi +; AVX-NEXT: subl %r8d, %esi +; AVX-NEXT: setns %dil +; AVX-NEXT: addl $2147483647, %edi # imm = 0x7FFFFFFF +; AVX-NEXT: subl %r8d, %edx +; AVX-NEXT: cmovol %edi, %edx +; AVX-NEXT: vpextrd $1, %xmm1, %r8d +; AVX-NEXT: vpextrd $1, %xmm0, %eax +; AVX-NEXT: xorl %esi, %esi +; AVX-NEXT: movl %eax, %edi +; AVX-NEXT: subl %r8d, %edi +; AVX-NEXT: setns %sil +; AVX-NEXT: addl $2147483647, %esi # imm = 0x7FFFFFFF +; AVX-NEXT: subl %r8d, %eax +; AVX-NEXT: cmovol %esi, %eax +; AVX-NEXT: vmovd %edx, %xmm0 +; AVX-NEXT: vpinsrd $1, %eax, %xmm0, %xmm0 +; AVX-NEXT: vpinsrd $2, %ecx, %xmm0, %xmm0 +; AVX-NEXT: vpinsrd $3, %r9d, %xmm0, %xmm0 +; AVX-NEXT: vpsrad $8, %xmm0, %xmm0 +; AVX-NEXT: retq + %z = call <4 x i24> @llvm.ssub.sat.v4i24(<4 x i24> %x, <4 x i24> %y) + ret <4 x i24> %z +} + +define <2 x i128> @v2i128(<2 x i128> %x, <2 x i128> %y) nounwind { +; SSE-LABEL: v2i128: +; SSE: # %bb.0: +; SSE-NEXT: pushq %r15 +; SSE-NEXT: pushq %r14 +; SSE-NEXT: pushq %r13 +; SSE-NEXT: pushq %r12 +; SSE-NEXT: pushq %rbx +; SSE-NEXT: movq %rdi, %rax +; SSE-NEXT: movq {{[0-9]+}}(%rsp), %r11 +; SSE-NEXT: movq {{[0-9]+}}(%rsp), %r14 +; SSE-NEXT: subq {{[0-9]+}}(%rsp), %rcx +; SSE-NEXT: movq %r8, %r13 +; SSE-NEXT: sbbq %r14, %r13 +; SSE-NEXT: movq %r13, %r10 +; SSE-NEXT: sarq $63, %r10 +; SSE-NEXT: xorl %edi, %edi +; SSE-NEXT: testq %r13, %r13 +; SSE-NEXT: setns %dil +; 
SSE-NEXT: movabsq $9223372036854775807, %r12 # imm = 0x7FFFFFFFFFFFFFFF +; SSE-NEXT: leaq (%rdi,%r12), %r15 +; SSE-NEXT: testq %r8, %r8 +; SSE-NEXT: setns %r8b +; SSE-NEXT: cmpb %dil, %r8b +; SSE-NEXT: setne %dil +; SSE-NEXT: testq %r14, %r14 +; SSE-NEXT: setns %bl +; SSE-NEXT: cmpb %bl, %r8b +; SSE-NEXT: setne %bl +; SSE-NEXT: testb %dil, %bl +; SSE-NEXT: cmoveq %r13, %r15 +; SSE-NEXT: cmoveq %rcx, %r10 +; SSE-NEXT: subq %r9, %rsi +; SSE-NEXT: movq %rdx, %rdi +; SSE-NEXT: sbbq %r11, %rdi +; SSE-NEXT: setns %bl +; SSE-NEXT: movzbl %bl, %ebx +; SSE-NEXT: addq %rbx, %r12 +; SSE-NEXT: movq %rdi, %rcx +; SSE-NEXT: sarq $63, %rcx +; SSE-NEXT: testq %r11, %r11 +; SSE-NEXT: setns %r8b +; SSE-NEXT: testq %rdx, %rdx +; SSE-NEXT: setns %dl +; SSE-NEXT: cmpb %r8b, %dl +; SSE-NEXT: setne %r8b +; SSE-NEXT: cmpb %bl, %dl +; SSE-NEXT: setne %dl +; SSE-NEXT: testb %dl, %r8b +; SSE-NEXT: cmoveq %rsi, %rcx +; SSE-NEXT: cmoveq %rdi, %r12 +; SSE-NEXT: movq %r15, 24(%rax) +; SSE-NEXT: movq %r10, 16(%rax) +; SSE-NEXT: movq %r12, 8(%rax) +; SSE-NEXT: movq %rcx, (%rax) +; SSE-NEXT: popq %rbx +; SSE-NEXT: popq %r12 +; SSE-NEXT: popq %r13 +; SSE-NEXT: popq %r14 +; SSE-NEXT: popq %r15 +; SSE-NEXT: retq +; +; AVX-LABEL: v2i128: +; AVX: # %bb.0: +; AVX-NEXT: pushq %r15 +; AVX-NEXT: pushq %r14 +; AVX-NEXT: pushq %r13 +; AVX-NEXT: pushq %r12 +; AVX-NEXT: pushq %rbx +; AVX-NEXT: movq %rdi, %rax +; AVX-NEXT: movq {{[0-9]+}}(%rsp), %r11 +; AVX-NEXT: movq {{[0-9]+}}(%rsp), %r14 +; AVX-NEXT: subq {{[0-9]+}}(%rsp), %rcx +; AVX-NEXT: movq %r8, %r13 +; AVX-NEXT: sbbq %r14, %r13 +; AVX-NEXT: movq %r13, %r10 +; AVX-NEXT: sarq $63, %r10 +; AVX-NEXT: xorl %edi, %edi +; AVX-NEXT: testq %r13, %r13 +; AVX-NEXT: setns %dil +; AVX-NEXT: movabsq $9223372036854775807, %r12 # imm = 0x7FFFFFFFFFFFFFFF +; AVX-NEXT: leaq (%rdi,%r12), %r15 +; AVX-NEXT: testq %r8, %r8 +; AVX-NEXT: setns %r8b +; AVX-NEXT: cmpb %dil, %r8b +; AVX-NEXT: setne %dil +; AVX-NEXT: testq %r14, %r14 +; AVX-NEXT: setns %bl +; AVX-NEXT: cmpb %bl, %r8b +; AVX-NEXT: setne %bl +; AVX-NEXT: testb %dil, %bl +; AVX-NEXT: cmoveq %r13, %r15 +; AVX-NEXT: cmoveq %rcx, %r10 +; AVX-NEXT: subq %r9, %rsi +; AVX-NEXT: movq %rdx, %rdi +; AVX-NEXT: sbbq %r11, %rdi +; AVX-NEXT: setns %bl +; AVX-NEXT: movzbl %bl, %ebx +; AVX-NEXT: addq %rbx, %r12 +; AVX-NEXT: movq %rdi, %rcx +; AVX-NEXT: sarq $63, %rcx +; AVX-NEXT: testq %r11, %r11 +; AVX-NEXT: setns %r8b +; AVX-NEXT: testq %rdx, %rdx +; AVX-NEXT: setns %dl +; AVX-NEXT: cmpb %r8b, %dl +; AVX-NEXT: setne %r8b +; AVX-NEXT: cmpb %bl, %dl +; AVX-NEXT: setne %dl +; AVX-NEXT: testb %dl, %r8b +; AVX-NEXT: cmoveq %rsi, %rcx +; AVX-NEXT: cmoveq %rdi, %r12 +; AVX-NEXT: movq %r15, 24(%rax) +; AVX-NEXT: movq %r10, 16(%rax) +; AVX-NEXT: movq %r12, 8(%rax) +; AVX-NEXT: movq %rcx, (%rax) +; AVX-NEXT: popq %rbx +; AVX-NEXT: popq %r12 +; AVX-NEXT: popq %r13 +; AVX-NEXT: popq %r14 +; AVX-NEXT: popq %r15 +; AVX-NEXT: retq + %z = call <2 x i128> @llvm.ssub.sat.v2i128(<2 x i128> %x, <2 x i128> %y) + ret <2 x i128> %z +} diff --git a/test/CodeGen/X86/uadd_sat_vec.ll b/test/CodeGen/X86/uadd_sat_vec.ll new file mode 100644 index 00000000000..812d9f1358f --- /dev/null +++ b/test/CodeGen/X86/uadd_sat_vec.ll @@ -0,0 +1,13616 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=SSE,SSE2 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+ssse3 | FileCheck %s --check-prefixes=SSE,SSSE3 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown 
-mattr=+sse4.1 | FileCheck %s --check-prefixes=SSE,SSE41 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX,AVX512 + +declare <1 x i8> @llvm.uadd.sat.v1i8(<1 x i8>, <1 x i8>) +declare <2 x i8> @llvm.uadd.sat.v2i8(<2 x i8>, <2 x i8>) +declare <4 x i8> @llvm.uadd.sat.v4i8(<4 x i8>, <4 x i8>) +declare <8 x i8> @llvm.uadd.sat.v8i8(<8 x i8>, <8 x i8>) +declare <12 x i8> @llvm.uadd.sat.v12i8(<12 x i8>, <12 x i8>) +declare <16 x i8> @llvm.uadd.sat.v16i8(<16 x i8>, <16 x i8>) +declare <32 x i8> @llvm.uadd.sat.v32i8(<32 x i8>, <32 x i8>) +declare <64 x i8> @llvm.uadd.sat.v64i8(<64 x i8>, <64 x i8>) + +declare <1 x i16> @llvm.uadd.sat.v1i16(<1 x i16>, <1 x i16>) +declare <2 x i16> @llvm.uadd.sat.v2i16(<2 x i16>, <2 x i16>) +declare <4 x i16> @llvm.uadd.sat.v4i16(<4 x i16>, <4 x i16>) +declare <8 x i16> @llvm.uadd.sat.v8i16(<8 x i16>, <8 x i16>) +declare <12 x i16> @llvm.uadd.sat.v12i16(<12 x i16>, <12 x i16>) +declare <16 x i16> @llvm.uadd.sat.v16i16(<16 x i16>, <16 x i16>) +declare <32 x i16> @llvm.uadd.sat.v32i16(<32 x i16>, <32 x i16>) + +declare <16 x i1> @llvm.uadd.sat.v16i1(<16 x i1>, <16 x i1>) +declare <16 x i4> @llvm.uadd.sat.v16i4(<16 x i4>, <16 x i4>) + +declare <4 x i32> @llvm.uadd.sat.v4i32(<4 x i32>, <4 x i32>) +declare <2 x i32> @llvm.uadd.sat.v2i32(<2 x i32>, <2 x i32>) +declare <4 x i24> @llvm.uadd.sat.v4i24(<4 x i24>, <4 x i24>) +declare <2 x i128> @llvm.uadd.sat.v2i128(<2 x i128>, <2 x i128>) + +; Legal types, depending on architecture. + +define <16 x i8> @v16i8(<16 x i8> %x, <16 x i8> %y) nounwind { +; SSE2-LABEL: v16i8: +; SSE2: # %bb.0: +; SSE2-NEXT: pushq %rbp +; SSE2-NEXT: pushq %r15 +; SSE2-NEXT: pushq %r14 +; SSE2-NEXT: pushq %r13 +; SSE2-NEXT: pushq %r12 +; SSE2-NEXT: pushq %rbx +; SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; SSE2-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSE2-NEXT: addb -{{[0-9]+}}(%rsp), %al +; SSE2-NEXT: movb $-1, %dil +; SSE2-NEXT: jb .LBB0_2 +; SSE2-NEXT: # %bb.1: +; SSE2-NEXT: movl %eax, %edi +; SSE2-NEXT: .LBB0_2: +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSE2-NEXT: addb -{{[0-9]+}}(%rsp), %cl +; SSE2-NEXT: movb $-1, %al +; SSE2-NEXT: jb .LBB0_4 +; SSE2-NEXT: # %bb.3: +; SSE2-NEXT: movl %ecx, %eax +; SSE2-NEXT: .LBB0_4: +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %dl +; SSE2-NEXT: addb -{{[0-9]+}}(%rsp), %dl +; SSE2-NEXT: movb $-1, %cl +; SSE2-NEXT: jb .LBB0_6 +; SSE2-NEXT: # %bb.5: +; SSE2-NEXT: movl %edx, %ecx +; SSE2-NEXT: .LBB0_6: +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %dl +; SSE2-NEXT: addb -{{[0-9]+}}(%rsp), %dl +; SSE2-NEXT: movb $-1, %r10b +; SSE2-NEXT: jb .LBB0_8 +; SSE2-NEXT: # %bb.7: +; SSE2-NEXT: movl %edx, %r10d +; SSE2-NEXT: .LBB0_8: +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %dl +; SSE2-NEXT: addb -{{[0-9]+}}(%rsp), %dl +; SSE2-NEXT: movb $-1, %r11b +; SSE2-NEXT: jb .LBB0_10 +; SSE2-NEXT: # %bb.9: +; SSE2-NEXT: movl %edx, %r11d +; SSE2-NEXT: .LBB0_10: +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %dl +; SSE2-NEXT: addb -{{[0-9]+}}(%rsp), %dl +; SSE2-NEXT: movb $-1, %r12b +; SSE2-NEXT: jb .LBB0_12 +; SSE2-NEXT: # %bb.11: +; SSE2-NEXT: movl %edx, %r12d +; SSE2-NEXT: .LBB0_12: +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %dl +; SSE2-NEXT: addb -{{[0-9]+}}(%rsp), %dl +; SSE2-NEXT: movb $-1, %r13b +; SSE2-NEXT: jb .LBB0_14 +; SSE2-NEXT: 
# %bb.13: +; SSE2-NEXT: movl %edx, %r13d +; SSE2-NEXT: .LBB0_14: +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %dl +; SSE2-NEXT: addb -{{[0-9]+}}(%rsp), %dl +; SSE2-NEXT: movb $-1, %r8b +; SSE2-NEXT: jb .LBB0_16 +; SSE2-NEXT: # %bb.15: +; SSE2-NEXT: movl %edx, %r8d +; SSE2-NEXT: .LBB0_16: +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %dl +; SSE2-NEXT: addb -{{[0-9]+}}(%rsp), %dl +; SSE2-NEXT: movb $-1, %r14b +; SSE2-NEXT: jb .LBB0_18 +; SSE2-NEXT: # %bb.17: +; SSE2-NEXT: movl %edx, %r14d +; SSE2-NEXT: .LBB0_18: +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %dl +; SSE2-NEXT: addb -{{[0-9]+}}(%rsp), %dl +; SSE2-NEXT: movb $-1, %r15b +; SSE2-NEXT: jb .LBB0_20 +; SSE2-NEXT: # %bb.19: +; SSE2-NEXT: movl %edx, %r15d +; SSE2-NEXT: .LBB0_20: +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %dl +; SSE2-NEXT: addb -{{[0-9]+}}(%rsp), %dl +; SSE2-NEXT: movb $-1, %r9b +; SSE2-NEXT: jb .LBB0_22 +; SSE2-NEXT: # %bb.21: +; SSE2-NEXT: movl %edx, %r9d +; SSE2-NEXT: .LBB0_22: +; SSE2-NEXT: movzbl %dil, %edi +; SSE2-NEXT: movzbl %al, %esi +; SSE2-NEXT: movzbl %cl, %ebp +; SSE2-NEXT: movzbl %r10b, %edx +; SSE2-NEXT: movzbl %r11b, %ebx +; SSE2-NEXT: movzbl %r12b, %r10d +; SSE2-NEXT: movzbl %r13b, %r11d +; SSE2-NEXT: movzbl %r8b, %r8d +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSE2-NEXT: addb -{{[0-9]+}}(%rsp), %al +; SSE2-NEXT: movb $-1, %cl +; SSE2-NEXT: jb .LBB0_24 +; SSE2-NEXT: # %bb.23: +; SSE2-NEXT: movl %eax, %ecx +; SSE2-NEXT: .LBB0_24: +; SSE2-NEXT: movd %edi, %xmm2 +; SSE2-NEXT: movd %esi, %xmm3 +; SSE2-NEXT: movd %ebp, %xmm5 +; SSE2-NEXT: movd %edx, %xmm0 +; SSE2-NEXT: movd %ebx, %xmm6 +; SSE2-NEXT: movd %r10d, %xmm4 +; SSE2-NEXT: movd %r11d, %xmm7 +; SSE2-NEXT: movd %r8d, %xmm1 +; SSE2-NEXT: movzbl %r14b, %esi +; SSE2-NEXT: movzbl %r15b, %edx +; SSE2-NEXT: movzbl %r9b, %eax +; SSE2-NEXT: movzbl %cl, %edi +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSE2-NEXT: addb -{{[0-9]+}}(%rsp), %cl +; SSE2-NEXT: movb $-1, %bl +; SSE2-NEXT: jb .LBB0_26 +; SSE2-NEXT: # %bb.25: +; SSE2-NEXT: movl %ecx, %ebx +; SSE2-NEXT: .LBB0_26: +; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3],xmm0[4],xmm5[4],xmm0[5],xmm5[5],xmm0[6],xmm5[6],xmm0[7],xmm5[7] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1],xmm4[2],xmm6[2],xmm4[3],xmm6[3],xmm4[4],xmm6[4],xmm4[5],xmm6[5],xmm4[6],xmm6[6],xmm4[7],xmm6[7] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm7[0],xmm1[1],xmm7[1],xmm1[2],xmm7[2],xmm1[3],xmm7[3],xmm1[4],xmm7[4],xmm1[5],xmm7[5],xmm1[6],xmm7[6],xmm1[7],xmm7[7] +; SSE2-NEXT: movd %esi, %xmm6 +; SSE2-NEXT: movd %edx, %xmm5 +; SSE2-NEXT: movd %eax, %xmm7 +; SSE2-NEXT: movd %edi, %xmm2 +; SSE2-NEXT: movzbl %bl, %eax +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %dl +; SSE2-NEXT: addb -{{[0-9]+}}(%rsp), %dl +; SSE2-NEXT: movb $-1, %cl +; SSE2-NEXT: jb .LBB0_28 +; SSE2-NEXT: # %bb.27: +; SSE2-NEXT: movl %edx, %ecx +; SSE2-NEXT: .LBB0_28: +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3],xmm5[4],xmm6[4],xmm5[5],xmm6[5],xmm5[6],xmm6[6],xmm5[7],xmm6[7] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = 
xmm2[0],xmm7[0],xmm2[1],xmm7[1],xmm2[2],xmm7[2],xmm2[3],xmm7[3],xmm2[4],xmm7[4],xmm2[5],xmm7[5],xmm2[6],xmm7[6],xmm2[7],xmm7[7] +; SSE2-NEXT: movd %eax, %xmm4 +; SSE2-NEXT: movzbl %cl, %eax +; SSE2-NEXT: movd %eax, %xmm3 +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %dl +; SSE2-NEXT: addb -{{[0-9]+}}(%rsp), %dl +; SSE2-NEXT: movb $-1, %cl +; SSE2-NEXT: jb .LBB0_30 +; SSE2-NEXT: # %bb.29: +; SSE2-NEXT: movl %edx, %ecx +; SSE2-NEXT: .LBB0_30: +; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3],xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] +; SSE2-NEXT: movzbl %cl, %ecx +; SSE2-NEXT: movd %ecx, %xmm4 +; SSE2-NEXT: addb -{{[0-9]+}}(%rsp), %al +; SSE2-NEXT: movb $-1, %cl +; SSE2-NEXT: jb .LBB0_32 +; SSE2-NEXT: # %bb.31: +; SSE2-NEXT: movl %eax, %ecx +; SSE2-NEXT: .LBB0_32: +; SSE2-NEXT: movzbl %cl, %eax +; SSE2-NEXT: movd %eax, %xmm0 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSE2-NEXT: popq %rbx +; SSE2-NEXT: popq %r12 +; SSE2-NEXT: popq %r13 +; SSE2-NEXT: popq %r14 +; SSE2-NEXT: popq %r15 +; SSE2-NEXT: popq %rbp +; SSE2-NEXT: retq +; +; SSSE3-LABEL: v16i8: +; SSSE3: # %bb.0: +; SSSE3-NEXT: pushq %rbp +; SSSE3-NEXT: pushq %r15 +; SSSE3-NEXT: pushq %r14 +; SSSE3-NEXT: pushq %r13 +; SSSE3-NEXT: pushq %r12 +; SSSE3-NEXT: pushq %rbx +; SSSE3-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; SSSE3-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSSE3-NEXT: addb -{{[0-9]+}}(%rsp), %al +; SSSE3-NEXT: movb $-1, %dil +; SSSE3-NEXT: jb .LBB0_2 +; SSSE3-NEXT: # %bb.1: +; SSSE3-NEXT: movl %eax, %edi +; SSSE3-NEXT: .LBB0_2: +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSSE3-NEXT: addb -{{[0-9]+}}(%rsp), %cl +; SSSE3-NEXT: movb $-1, %al +; SSSE3-NEXT: jb .LBB0_4 +; SSSE3-NEXT: # %bb.3: +; SSSE3-NEXT: movl %ecx, %eax +; SSSE3-NEXT: .LBB0_4: +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %dl +; SSSE3-NEXT: addb -{{[0-9]+}}(%rsp), %dl +; SSSE3-NEXT: movb $-1, %cl +; SSSE3-NEXT: jb .LBB0_6 +; SSSE3-NEXT: # %bb.5: +; SSSE3-NEXT: movl %edx, %ecx +; SSSE3-NEXT: .LBB0_6: +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %dl +; SSSE3-NEXT: addb -{{[0-9]+}}(%rsp), %dl +; SSSE3-NEXT: movb $-1, %r10b +; SSSE3-NEXT: jb .LBB0_8 +; SSSE3-NEXT: # %bb.7: +; SSSE3-NEXT: movl %edx, %r10d +; SSSE3-NEXT: .LBB0_8: +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %dl +; SSSE3-NEXT: addb -{{[0-9]+}}(%rsp), %dl +; SSSE3-NEXT: movb $-1, %r11b +; SSSE3-NEXT: jb .LBB0_10 +; SSSE3-NEXT: # %bb.9: +; SSSE3-NEXT: movl %edx, %r11d +; SSSE3-NEXT: .LBB0_10: +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %dl +; SSSE3-NEXT: addb -{{[0-9]+}}(%rsp), %dl +; SSSE3-NEXT: movb $-1, %r12b +; SSSE3-NEXT: jb .LBB0_12 +; SSSE3-NEXT: # %bb.11: +; SSSE3-NEXT: movl %edx, %r12d +; SSSE3-NEXT: .LBB0_12: +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %dl +; SSSE3-NEXT: addb -{{[0-9]+}}(%rsp), %dl +; SSSE3-NEXT: movb $-1, %r13b +; SSSE3-NEXT: jb .LBB0_14 +; SSSE3-NEXT: # %bb.13: +; SSSE3-NEXT: movl %edx, %r13d +; SSSE3-NEXT: .LBB0_14: +; 
SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %dl +; SSSE3-NEXT: addb -{{[0-9]+}}(%rsp), %dl +; SSSE3-NEXT: movb $-1, %r8b +; SSSE3-NEXT: jb .LBB0_16 +; SSSE3-NEXT: # %bb.15: +; SSSE3-NEXT: movl %edx, %r8d +; SSSE3-NEXT: .LBB0_16: +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %dl +; SSSE3-NEXT: addb -{{[0-9]+}}(%rsp), %dl +; SSSE3-NEXT: movb $-1, %r14b +; SSSE3-NEXT: jb .LBB0_18 +; SSSE3-NEXT: # %bb.17: +; SSSE3-NEXT: movl %edx, %r14d +; SSSE3-NEXT: .LBB0_18: +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %dl +; SSSE3-NEXT: addb -{{[0-9]+}}(%rsp), %dl +; SSSE3-NEXT: movb $-1, %r15b +; SSSE3-NEXT: jb .LBB0_20 +; SSSE3-NEXT: # %bb.19: +; SSSE3-NEXT: movl %edx, %r15d +; SSSE3-NEXT: .LBB0_20: +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %dl +; SSSE3-NEXT: addb -{{[0-9]+}}(%rsp), %dl +; SSSE3-NEXT: movb $-1, %r9b +; SSSE3-NEXT: jb .LBB0_22 +; SSSE3-NEXT: # %bb.21: +; SSSE3-NEXT: movl %edx, %r9d +; SSSE3-NEXT: .LBB0_22: +; SSSE3-NEXT: movzbl %dil, %edi +; SSSE3-NEXT: movzbl %al, %esi +; SSSE3-NEXT: movzbl %cl, %ebp +; SSSE3-NEXT: movzbl %r10b, %edx +; SSSE3-NEXT: movzbl %r11b, %ebx +; SSSE3-NEXT: movzbl %r12b, %r10d +; SSSE3-NEXT: movzbl %r13b, %r11d +; SSSE3-NEXT: movzbl %r8b, %r8d +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSSE3-NEXT: addb -{{[0-9]+}}(%rsp), %al +; SSSE3-NEXT: movb $-1, %cl +; SSSE3-NEXT: jb .LBB0_24 +; SSSE3-NEXT: # %bb.23: +; SSSE3-NEXT: movl %eax, %ecx +; SSSE3-NEXT: .LBB0_24: +; SSSE3-NEXT: movd %edi, %xmm2 +; SSSE3-NEXT: movd %esi, %xmm3 +; SSSE3-NEXT: movd %ebp, %xmm5 +; SSSE3-NEXT: movd %edx, %xmm0 +; SSSE3-NEXT: movd %ebx, %xmm6 +; SSSE3-NEXT: movd %r10d, %xmm4 +; SSSE3-NEXT: movd %r11d, %xmm7 +; SSSE3-NEXT: movd %r8d, %xmm1 +; SSSE3-NEXT: movzbl %r14b, %esi +; SSSE3-NEXT: movzbl %r15b, %edx +; SSSE3-NEXT: movzbl %r9b, %eax +; SSSE3-NEXT: movzbl %cl, %edi +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSSE3-NEXT: addb -{{[0-9]+}}(%rsp), %cl +; SSSE3-NEXT: movb $-1, %bl +; SSSE3-NEXT: jb .LBB0_26 +; SSSE3-NEXT: # %bb.25: +; SSSE3-NEXT: movl %ecx, %ebx +; SSSE3-NEXT: .LBB0_26: +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3],xmm0[4],xmm5[4],xmm0[5],xmm5[5],xmm0[6],xmm5[6],xmm0[7],xmm5[7] +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1],xmm4[2],xmm6[2],xmm4[3],xmm6[3],xmm4[4],xmm6[4],xmm4[5],xmm6[5],xmm4[6],xmm6[6],xmm4[7],xmm6[7] +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm7[0],xmm1[1],xmm7[1],xmm1[2],xmm7[2],xmm1[3],xmm7[3],xmm1[4],xmm7[4],xmm1[5],xmm7[5],xmm1[6],xmm7[6],xmm1[7],xmm7[7] +; SSSE3-NEXT: movd %esi, %xmm6 +; SSSE3-NEXT: movd %edx, %xmm5 +; SSSE3-NEXT: movd %eax, %xmm7 +; SSSE3-NEXT: movd %edi, %xmm2 +; SSSE3-NEXT: movzbl %bl, %eax +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %dl +; SSSE3-NEXT: addb -{{[0-9]+}}(%rsp), %dl +; SSSE3-NEXT: movb $-1, %cl +; SSSE3-NEXT: jb .LBB0_28 +; SSSE3-NEXT: # %bb.27: +; SSSE3-NEXT: movl %edx, %ecx +; SSSE3-NEXT: .LBB0_28: +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3] +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3],xmm5[4],xmm6[4],xmm5[5],xmm6[5],xmm5[6],xmm6[6],xmm5[7],xmm6[7] +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = 
xmm2[0],xmm7[0],xmm2[1],xmm7[1],xmm2[2],xmm7[2],xmm2[3],xmm7[3],xmm2[4],xmm7[4],xmm2[5],xmm7[5],xmm2[6],xmm7[6],xmm2[7],xmm7[7] +; SSSE3-NEXT: movd %eax, %xmm4 +; SSSE3-NEXT: movzbl %cl, %eax +; SSSE3-NEXT: movd %eax, %xmm3 +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %dl +; SSSE3-NEXT: addb -{{[0-9]+}}(%rsp), %dl +; SSSE3-NEXT: movb $-1, %cl +; SSSE3-NEXT: jb .LBB0_30 +; SSSE3-NEXT: # %bb.29: +; SSSE3-NEXT: movl %edx, %ecx +; SSSE3-NEXT: .LBB0_30: +; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3] +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3],xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] +; SSSE3-NEXT: movzbl %cl, %ecx +; SSSE3-NEXT: movd %ecx, %xmm4 +; SSSE3-NEXT: addb -{{[0-9]+}}(%rsp), %al +; SSSE3-NEXT: movb $-1, %cl +; SSSE3-NEXT: jb .LBB0_32 +; SSSE3-NEXT: # %bb.31: +; SSSE3-NEXT: movl %eax, %ecx +; SSSE3-NEXT: .LBB0_32: +; SSSE3-NEXT: movzbl %cl, %eax +; SSSE3-NEXT: movd %eax, %xmm0 +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7] +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] +; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSSE3-NEXT: popq %rbx +; SSSE3-NEXT: popq %r12 +; SSSE3-NEXT: popq %r13 +; SSSE3-NEXT: popq %r14 +; SSSE3-NEXT: popq %r15 +; SSSE3-NEXT: popq %rbp +; SSSE3-NEXT: retq +; +; SSE41-LABEL: v16i8: +; SSE41: # %bb.0: +; SSE41-NEXT: pextrb $1, %xmm1, %eax +; SSE41-NEXT: pextrb $1, %xmm0, %ecx +; SSE41-NEXT: addb %al, %cl +; SSE41-NEXT: movb $-1, %sil +; SSE41-NEXT: movb $-1, %dl +; SSE41-NEXT: jb .LBB0_2 +; SSE41-NEXT: # %bb.1: +; SSE41-NEXT: movl %ecx, %edx +; SSE41-NEXT: .LBB0_2: +; SSE41-NEXT: movzbl %dl, %ecx +; SSE41-NEXT: pextrb $0, %xmm1, %eax +; SSE41-NEXT: pextrb $0, %xmm0, %edx +; SSE41-NEXT: addb %al, %dl +; SSE41-NEXT: movb $-1, %al +; SSE41-NEXT: jb .LBB0_4 +; SSE41-NEXT: # %bb.3: +; SSE41-NEXT: movl %edx, %eax +; SSE41-NEXT: .LBB0_4: +; SSE41-NEXT: movzbl %al, %eax +; SSE41-NEXT: movd %eax, %xmm2 +; SSE41-NEXT: pinsrb $1, %ecx, %xmm2 +; SSE41-NEXT: pextrb $2, %xmm1, %eax +; SSE41-NEXT: pextrb $2, %xmm0, %ecx +; SSE41-NEXT: addb %al, %cl +; SSE41-NEXT: movb $-1, %al +; SSE41-NEXT: jb .LBB0_6 +; SSE41-NEXT: # %bb.5: +; SSE41-NEXT: movl %ecx, %eax +; SSE41-NEXT: .LBB0_6: +; SSE41-NEXT: movzbl %al, %eax +; SSE41-NEXT: pinsrb $2, %eax, %xmm2 +; SSE41-NEXT: pextrb $3, %xmm1, %eax +; SSE41-NEXT: pextrb $3, %xmm0, %ecx +; SSE41-NEXT: addb %al, %cl +; SSE41-NEXT: movb $-1, %al +; SSE41-NEXT: jb .LBB0_8 +; SSE41-NEXT: # %bb.7: +; SSE41-NEXT: movl %ecx, %eax +; SSE41-NEXT: .LBB0_8: +; SSE41-NEXT: movzbl %al, %eax +; SSE41-NEXT: pinsrb $3, %eax, %xmm2 +; SSE41-NEXT: pextrb $4, %xmm1, %eax +; SSE41-NEXT: pextrb $4, %xmm0, %ecx +; SSE41-NEXT: addb %al, %cl +; SSE41-NEXT: movb $-1, %al +; SSE41-NEXT: jb .LBB0_10 +; SSE41-NEXT: # %bb.9: +; SSE41-NEXT: movl %ecx, %eax +; SSE41-NEXT: .LBB0_10: +; SSE41-NEXT: movzbl %al, %eax +; SSE41-NEXT: pinsrb $4, %eax, %xmm2 +; SSE41-NEXT: pextrb $5, %xmm1, %eax +; SSE41-NEXT: pextrb $5, %xmm0, %ecx +; SSE41-NEXT: addb %al, %cl +; SSE41-NEXT: movb $-1, %al +; SSE41-NEXT: jb .LBB0_12 +; SSE41-NEXT: # %bb.11: +; SSE41-NEXT: movl %ecx, %eax 
+; SSE41-NEXT: .LBB0_12: +; SSE41-NEXT: movzbl %al, %eax +; SSE41-NEXT: pinsrb $5, %eax, %xmm2 +; SSE41-NEXT: pextrb $6, %xmm1, %eax +; SSE41-NEXT: pextrb $6, %xmm0, %ecx +; SSE41-NEXT: addb %al, %cl +; SSE41-NEXT: movb $-1, %al +; SSE41-NEXT: jb .LBB0_14 +; SSE41-NEXT: # %bb.13: +; SSE41-NEXT: movl %ecx, %eax +; SSE41-NEXT: .LBB0_14: +; SSE41-NEXT: movzbl %al, %eax +; SSE41-NEXT: pinsrb $6, %eax, %xmm2 +; SSE41-NEXT: pextrb $7, %xmm1, %eax +; SSE41-NEXT: pextrb $7, %xmm0, %ecx +; SSE41-NEXT: addb %al, %cl +; SSE41-NEXT: movb $-1, %al +; SSE41-NEXT: jb .LBB0_16 +; SSE41-NEXT: # %bb.15: +; SSE41-NEXT: movl %ecx, %eax +; SSE41-NEXT: .LBB0_16: +; SSE41-NEXT: movzbl %al, %eax +; SSE41-NEXT: pinsrb $7, %eax, %xmm2 +; SSE41-NEXT: pextrb $8, %xmm1, %eax +; SSE41-NEXT: pextrb $8, %xmm0, %ecx +; SSE41-NEXT: addb %al, %cl +; SSE41-NEXT: movb $-1, %al +; SSE41-NEXT: jb .LBB0_18 +; SSE41-NEXT: # %bb.17: +; SSE41-NEXT: movl %ecx, %eax +; SSE41-NEXT: .LBB0_18: +; SSE41-NEXT: movzbl %al, %eax +; SSE41-NEXT: pinsrb $8, %eax, %xmm2 +; SSE41-NEXT: pextrb $9, %xmm1, %eax +; SSE41-NEXT: pextrb $9, %xmm0, %ecx +; SSE41-NEXT: addb %al, %cl +; SSE41-NEXT: movb $-1, %al +; SSE41-NEXT: jb .LBB0_20 +; SSE41-NEXT: # %bb.19: +; SSE41-NEXT: movl %ecx, %eax +; SSE41-NEXT: .LBB0_20: +; SSE41-NEXT: movzbl %al, %eax +; SSE41-NEXT: pinsrb $9, %eax, %xmm2 +; SSE41-NEXT: pextrb $10, %xmm1, %eax +; SSE41-NEXT: pextrb $10, %xmm0, %ecx +; SSE41-NEXT: addb %al, %cl +; SSE41-NEXT: movb $-1, %al +; SSE41-NEXT: jb .LBB0_22 +; SSE41-NEXT: # %bb.21: +; SSE41-NEXT: movl %ecx, %eax +; SSE41-NEXT: .LBB0_22: +; SSE41-NEXT: movzbl %al, %eax +; SSE41-NEXT: pinsrb $10, %eax, %xmm2 +; SSE41-NEXT: pextrb $11, %xmm1, %eax +; SSE41-NEXT: pextrb $11, %xmm0, %ecx +; SSE41-NEXT: addb %al, %cl +; SSE41-NEXT: movb $-1, %al +; SSE41-NEXT: jb .LBB0_24 +; SSE41-NEXT: # %bb.23: +; SSE41-NEXT: movl %ecx, %eax +; SSE41-NEXT: .LBB0_24: +; SSE41-NEXT: movzbl %al, %eax +; SSE41-NEXT: pinsrb $11, %eax, %xmm2 +; SSE41-NEXT: pextrb $12, %xmm1, %eax +; SSE41-NEXT: pextrb $12, %xmm0, %ecx +; SSE41-NEXT: addb %al, %cl +; SSE41-NEXT: movb $-1, %al +; SSE41-NEXT: jb .LBB0_26 +; SSE41-NEXT: # %bb.25: +; SSE41-NEXT: movl %ecx, %eax +; SSE41-NEXT: .LBB0_26: +; SSE41-NEXT: movzbl %al, %eax +; SSE41-NEXT: pinsrb $12, %eax, %xmm2 +; SSE41-NEXT: pextrb $13, %xmm1, %eax +; SSE41-NEXT: pextrb $13, %xmm0, %ecx +; SSE41-NEXT: addb %al, %cl +; SSE41-NEXT: movb $-1, %al +; SSE41-NEXT: jb .LBB0_28 +; SSE41-NEXT: # %bb.27: +; SSE41-NEXT: movl %ecx, %eax +; SSE41-NEXT: .LBB0_28: +; SSE41-NEXT: movzbl %al, %eax +; SSE41-NEXT: pinsrb $13, %eax, %xmm2 +; SSE41-NEXT: pextrb $14, %xmm1, %eax +; SSE41-NEXT: pextrb $14, %xmm0, %ecx +; SSE41-NEXT: addb %al, %cl +; SSE41-NEXT: movb $-1, %al +; SSE41-NEXT: jb .LBB0_30 +; SSE41-NEXT: # %bb.29: +; SSE41-NEXT: movl %ecx, %eax +; SSE41-NEXT: .LBB0_30: +; SSE41-NEXT: movzbl %al, %eax +; SSE41-NEXT: pinsrb $14, %eax, %xmm2 +; SSE41-NEXT: pextrb $15, %xmm1, %eax +; SSE41-NEXT: pextrb $15, %xmm0, %ecx +; SSE41-NEXT: addb %al, %cl +; SSE41-NEXT: jb .LBB0_32 +; SSE41-NEXT: # %bb.31: +; SSE41-NEXT: movl %ecx, %esi +; SSE41-NEXT: .LBB0_32: +; SSE41-NEXT: movzbl %sil, %eax +; SSE41-NEXT: pinsrb $15, %eax, %xmm2 +; SSE41-NEXT: movdqa %xmm2, %xmm0 +; SSE41-NEXT: retq +; +; AVX-LABEL: v16i8: +; AVX: # %bb.0: +; AVX-NEXT: vpextrb $1, %xmm1, %eax +; AVX-NEXT: vpextrb $1, %xmm0, %ecx +; AVX-NEXT: addb %al, %cl +; AVX-NEXT: movb $-1, %sil +; AVX-NEXT: movb $-1, %dl +; AVX-NEXT: jb .LBB0_2 +; AVX-NEXT: # %bb.1: +; AVX-NEXT: movl %ecx, %edx +; AVX-NEXT: .LBB0_2: 
+; AVX-NEXT: movzbl %dl, %ecx +; AVX-NEXT: vpextrb $0, %xmm1, %eax +; AVX-NEXT: vpextrb $0, %xmm0, %edx +; AVX-NEXT: addb %al, %dl +; AVX-NEXT: movb $-1, %al +; AVX-NEXT: jb .LBB0_4 +; AVX-NEXT: # %bb.3: +; AVX-NEXT: movl %edx, %eax +; AVX-NEXT: .LBB0_4: +; AVX-NEXT: movzbl %al, %eax +; AVX-NEXT: vmovd %eax, %xmm2 +; AVX-NEXT: vpinsrb $1, %ecx, %xmm2, %xmm2 +; AVX-NEXT: vpextrb $2, %xmm1, %eax +; AVX-NEXT: vpextrb $2, %xmm0, %ecx +; AVX-NEXT: addb %al, %cl +; AVX-NEXT: movb $-1, %al +; AVX-NEXT: jb .LBB0_6 +; AVX-NEXT: # %bb.5: +; AVX-NEXT: movl %ecx, %eax +; AVX-NEXT: .LBB0_6: +; AVX-NEXT: movzbl %al, %eax +; AVX-NEXT: vpinsrb $2, %eax, %xmm2, %xmm2 +; AVX-NEXT: vpextrb $3, %xmm1, %eax +; AVX-NEXT: vpextrb $3, %xmm0, %ecx +; AVX-NEXT: addb %al, %cl +; AVX-NEXT: movb $-1, %al +; AVX-NEXT: jb .LBB0_8 +; AVX-NEXT: # %bb.7: +; AVX-NEXT: movl %ecx, %eax +; AVX-NEXT: .LBB0_8: +; AVX-NEXT: movzbl %al, %eax +; AVX-NEXT: vpinsrb $3, %eax, %xmm2, %xmm2 +; AVX-NEXT: vpextrb $4, %xmm1, %eax +; AVX-NEXT: vpextrb $4, %xmm0, %ecx +; AVX-NEXT: addb %al, %cl +; AVX-NEXT: movb $-1, %al +; AVX-NEXT: jb .LBB0_10 +; AVX-NEXT: # %bb.9: +; AVX-NEXT: movl %ecx, %eax +; AVX-NEXT: .LBB0_10: +; AVX-NEXT: movzbl %al, %eax +; AVX-NEXT: vpinsrb $4, %eax, %xmm2, %xmm2 +; AVX-NEXT: vpextrb $5, %xmm1, %eax +; AVX-NEXT: vpextrb $5, %xmm0, %ecx +; AVX-NEXT: addb %al, %cl +; AVX-NEXT: movb $-1, %al +; AVX-NEXT: jb .LBB0_12 +; AVX-NEXT: # %bb.11: +; AVX-NEXT: movl %ecx, %eax +; AVX-NEXT: .LBB0_12: +; AVX-NEXT: movzbl %al, %eax +; AVX-NEXT: vpinsrb $5, %eax, %xmm2, %xmm2 +; AVX-NEXT: vpextrb $6, %xmm1, %eax +; AVX-NEXT: vpextrb $6, %xmm0, %ecx +; AVX-NEXT: addb %al, %cl +; AVX-NEXT: movb $-1, %al +; AVX-NEXT: jb .LBB0_14 +; AVX-NEXT: # %bb.13: +; AVX-NEXT: movl %ecx, %eax +; AVX-NEXT: .LBB0_14: +; AVX-NEXT: movzbl %al, %eax +; AVX-NEXT: vpinsrb $6, %eax, %xmm2, %xmm2 +; AVX-NEXT: vpextrb $7, %xmm1, %eax +; AVX-NEXT: vpextrb $7, %xmm0, %ecx +; AVX-NEXT: addb %al, %cl +; AVX-NEXT: movb $-1, %al +; AVX-NEXT: jb .LBB0_16 +; AVX-NEXT: # %bb.15: +; AVX-NEXT: movl %ecx, %eax +; AVX-NEXT: .LBB0_16: +; AVX-NEXT: movzbl %al, %eax +; AVX-NEXT: vpinsrb $7, %eax, %xmm2, %xmm2 +; AVX-NEXT: vpextrb $8, %xmm1, %eax +; AVX-NEXT: vpextrb $8, %xmm0, %ecx +; AVX-NEXT: addb %al, %cl +; AVX-NEXT: movb $-1, %al +; AVX-NEXT: jb .LBB0_18 +; AVX-NEXT: # %bb.17: +; AVX-NEXT: movl %ecx, %eax +; AVX-NEXT: .LBB0_18: +; AVX-NEXT: movzbl %al, %eax +; AVX-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2 +; AVX-NEXT: vpextrb $9, %xmm1, %eax +; AVX-NEXT: vpextrb $9, %xmm0, %ecx +; AVX-NEXT: addb %al, %cl +; AVX-NEXT: movb $-1, %al +; AVX-NEXT: jb .LBB0_20 +; AVX-NEXT: # %bb.19: +; AVX-NEXT: movl %ecx, %eax +; AVX-NEXT: .LBB0_20: +; AVX-NEXT: movzbl %al, %eax +; AVX-NEXT: vpinsrb $9, %eax, %xmm2, %xmm2 +; AVX-NEXT: vpextrb $10, %xmm1, %eax +; AVX-NEXT: vpextrb $10, %xmm0, %ecx +; AVX-NEXT: addb %al, %cl +; AVX-NEXT: movb $-1, %al +; AVX-NEXT: jb .LBB0_22 +; AVX-NEXT: # %bb.21: +; AVX-NEXT: movl %ecx, %eax +; AVX-NEXT: .LBB0_22: +; AVX-NEXT: movzbl %al, %eax +; AVX-NEXT: vpinsrb $10, %eax, %xmm2, %xmm2 +; AVX-NEXT: vpextrb $11, %xmm1, %eax +; AVX-NEXT: vpextrb $11, %xmm0, %ecx +; AVX-NEXT: addb %al, %cl +; AVX-NEXT: movb $-1, %al +; AVX-NEXT: jb .LBB0_24 +; AVX-NEXT: # %bb.23: +; AVX-NEXT: movl %ecx, %eax +; AVX-NEXT: .LBB0_24: +; AVX-NEXT: movzbl %al, %eax +; AVX-NEXT: vpinsrb $11, %eax, %xmm2, %xmm2 +; AVX-NEXT: vpextrb $12, %xmm1, %eax +; AVX-NEXT: vpextrb $12, %xmm0, %ecx +; AVX-NEXT: addb %al, %cl +; AVX-NEXT: movb $-1, %al +; AVX-NEXT: jb .LBB0_26 +; AVX-NEXT: # 
%bb.25: +; AVX-NEXT: movl %ecx, %eax +; AVX-NEXT: .LBB0_26: +; AVX-NEXT: movzbl %al, %eax +; AVX-NEXT: vpinsrb $12, %eax, %xmm2, %xmm2 +; AVX-NEXT: vpextrb $13, %xmm1, %eax +; AVX-NEXT: vpextrb $13, %xmm0, %ecx +; AVX-NEXT: addb %al, %cl +; AVX-NEXT: movb $-1, %al +; AVX-NEXT: jb .LBB0_28 +; AVX-NEXT: # %bb.27: +; AVX-NEXT: movl %ecx, %eax +; AVX-NEXT: .LBB0_28: +; AVX-NEXT: movzbl %al, %eax +; AVX-NEXT: vpinsrb $13, %eax, %xmm2, %xmm2 +; AVX-NEXT: vpextrb $14, %xmm1, %eax +; AVX-NEXT: vpextrb $14, %xmm0, %ecx +; AVX-NEXT: addb %al, %cl +; AVX-NEXT: movb $-1, %al +; AVX-NEXT: jb .LBB0_30 +; AVX-NEXT: # %bb.29: +; AVX-NEXT: movl %ecx, %eax +; AVX-NEXT: .LBB0_30: +; AVX-NEXT: movzbl %al, %eax +; AVX-NEXT: vpinsrb $14, %eax, %xmm2, %xmm2 +; AVX-NEXT: vpextrb $15, %xmm1, %eax +; AVX-NEXT: vpextrb $15, %xmm0, %ecx +; AVX-NEXT: addb %al, %cl +; AVX-NEXT: jb .LBB0_32 +; AVX-NEXT: # %bb.31: +; AVX-NEXT: movl %ecx, %esi +; AVX-NEXT: .LBB0_32: +; AVX-NEXT: movzbl %sil, %eax +; AVX-NEXT: vpinsrb $15, %eax, %xmm2, %xmm0 +; AVX-NEXT: retq + %z = call <16 x i8> @llvm.uadd.sat.v16i8(<16 x i8> %x, <16 x i8> %y) + ret <16 x i8> %z +} + +define <32 x i8> @v32i8(<32 x i8> %x, <32 x i8> %y) nounwind { +; SSE2-LABEL: v32i8: +; SSE2: # %bb.0: +; SSE2-NEXT: pushq %rbp +; SSE2-NEXT: pushq %r15 +; SSE2-NEXT: pushq %r14 +; SSE2-NEXT: pushq %r13 +; SSE2-NEXT: pushq %r12 +; SSE2-NEXT: pushq %rbx +; SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; SSE2-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) +; SSE2-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) +; SSE2-NEXT: movaps %xmm3, -{{[0-9]+}}(%rsp) +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSE2-NEXT: addb -{{[0-9]+}}(%rsp), %al +; SSE2-NEXT: movb $-1, %sil +; SSE2-NEXT: jb .LBB1_2 +; SSE2-NEXT: # %bb.1: +; SSE2-NEXT: movl %eax, %esi +; SSE2-NEXT: .LBB1_2: +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSE2-NEXT: addb -{{[0-9]+}}(%rsp), %al +; SSE2-NEXT: movb $-1, %dil +; SSE2-NEXT: jb .LBB1_4 +; SSE2-NEXT: # %bb.3: +; SSE2-NEXT: movl %eax, %edi +; SSE2-NEXT: .LBB1_4: +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSE2-NEXT: addb -{{[0-9]+}}(%rsp), %al +; SSE2-NEXT: movb $-1, %r8b +; SSE2-NEXT: jb .LBB1_6 +; SSE2-NEXT: # %bb.5: +; SSE2-NEXT: movl %eax, %r8d +; SSE2-NEXT: .LBB1_6: +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSE2-NEXT: addb -{{[0-9]+}}(%rsp), %al +; SSE2-NEXT: movb $-1, %bl +; SSE2-NEXT: jb .LBB1_8 +; SSE2-NEXT: # %bb.7: +; SSE2-NEXT: movl %eax, %ebx +; SSE2-NEXT: .LBB1_8: +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSE2-NEXT: addb -{{[0-9]+}}(%rsp), %al +; SSE2-NEXT: movb $-1, %r11b +; SSE2-NEXT: jb .LBB1_10 +; SSE2-NEXT: # %bb.9: +; SSE2-NEXT: movl %eax, %r11d +; SSE2-NEXT: .LBB1_10: +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSE2-NEXT: addb -{{[0-9]+}}(%rsp), %al +; SSE2-NEXT: movb $-1, %bpl +; SSE2-NEXT: jb .LBB1_12 +; SSE2-NEXT: # %bb.11: +; SSE2-NEXT: movl %eax, %ebp +; SSE2-NEXT: .LBB1_12: +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSE2-NEXT: addb -{{[0-9]+}}(%rsp), %al +; SSE2-NEXT: movb $-1, %r14b +; SSE2-NEXT: jb .LBB1_14 +; SSE2-NEXT: # %bb.13: +; SSE2-NEXT: movl %eax, %r14d +; SSE2-NEXT: .LBB1_14: +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSE2-NEXT: addb -{{[0-9]+}}(%rsp), %al +; SSE2-NEXT: movb $-1, %r15b +; SSE2-NEXT: jb .LBB1_16 +; SSE2-NEXT: # %bb.15: +; SSE2-NEXT: movl %eax, %r15d +; SSE2-NEXT: .LBB1_16: +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSE2-NEXT: addb -{{[0-9]+}}(%rsp), %al +; SSE2-NEXT: movb $-1, %r9b +; SSE2-NEXT: jb .LBB1_18 +; SSE2-NEXT: # %bb.17: +; SSE2-NEXT: movl %eax, %r9d +; SSE2-NEXT: .LBB1_18: +; SSE2-NEXT: movb 
-{{[0-9]+}}(%rsp), %al +; SSE2-NEXT: addb -{{[0-9]+}}(%rsp), %al +; SSE2-NEXT: movb $-1, %r12b +; SSE2-NEXT: jb .LBB1_20 +; SSE2-NEXT: # %bb.19: +; SSE2-NEXT: movl %eax, %r12d +; SSE2-NEXT: .LBB1_20: +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSE2-NEXT: addb -{{[0-9]+}}(%rsp), %al +; SSE2-NEXT: movb $-1, %r13b +; SSE2-NEXT: jb .LBB1_22 +; SSE2-NEXT: # %bb.21: +; SSE2-NEXT: movl %eax, %r13d +; SSE2-NEXT: .LBB1_22: +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSE2-NEXT: addb -{{[0-9]+}}(%rsp), %al +; SSE2-NEXT: movb $-1, %dl +; SSE2-NEXT: jb .LBB1_24 +; SSE2-NEXT: # %bb.23: +; SSE2-NEXT: movl %eax, %edx +; SSE2-NEXT: .LBB1_24: +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSE2-NEXT: addb -{{[0-9]+}}(%rsp), %al +; SSE2-NEXT: movb $-1, %cl +; SSE2-NEXT: jb .LBB1_26 +; SSE2-NEXT: # %bb.25: +; SSE2-NEXT: movl %eax, %ecx +; SSE2-NEXT: .LBB1_26: +; SSE2-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSE2-NEXT: movb %r9b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSE2-NEXT: addb -{{[0-9]+}}(%rsp), %al +; SSE2-NEXT: movb $-1, %cl +; SSE2-NEXT: jb .LBB1_28 +; SSE2-NEXT: # %bb.27: +; SSE2-NEXT: movl %eax, %ecx +; SSE2-NEXT: .LBB1_28: +; SSE2-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSE2-NEXT: addb -{{[0-9]+}}(%rsp), %cl +; SSE2-NEXT: movb $-1, %r10b +; SSE2-NEXT: jb .LBB1_30 +; SSE2-NEXT: # %bb.29: +; SSE2-NEXT: movl %ecx, %r10d +; SSE2-NEXT: .LBB1_30: +; SSE2-NEXT: addb -{{[0-9]+}}(%rsp), %al +; SSE2-NEXT: movb $-1, %r9b +; SSE2-NEXT: jb .LBB1_32 +; SSE2-NEXT: # %bb.31: +; SSE2-NEXT: movl %eax, %r9d +; SSE2-NEXT: .LBB1_32: +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSE2-NEXT: addb -{{[0-9]+}}(%rsp), %al +; SSE2-NEXT: movb $-1, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Folded Spill +; SSE2-NEXT: movl %edx, %ecx +; SSE2-NEXT: jb .LBB1_34 +; SSE2-NEXT: # %bb.33: +; SSE2-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSE2-NEXT: .LBB1_34: +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSE2-NEXT: addb -{{[0-9]+}}(%rsp), %al +; SSE2-NEXT: movb $-1, %dl +; SSE2-NEXT: jb .LBB1_36 +; SSE2-NEXT: # %bb.35: +; SSE2-NEXT: movl %eax, %edx +; SSE2-NEXT: .LBB1_36: +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSE2-NEXT: addb -{{[0-9]+}}(%rsp), %al +; SSE2-NEXT: movb $-1, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Folded Spill +; SSE2-NEXT: jb .LBB1_38 +; SSE2-NEXT: # %bb.37: +; SSE2-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSE2-NEXT: .LBB1_38: +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSE2-NEXT: addb -{{[0-9]+}}(%rsp), %al +; SSE2-NEXT: movb $-1, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Folded Spill +; SSE2-NEXT: jb .LBB1_40 +; SSE2-NEXT: # %bb.39: +; SSE2-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSE2-NEXT: .LBB1_40: +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSE2-NEXT: addb -{{[0-9]+}}(%rsp), %al +; SSE2-NEXT: movb $-1, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Folded Spill +; SSE2-NEXT: jb .LBB1_42 +; SSE2-NEXT: # %bb.41: +; SSE2-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSE2-NEXT: .LBB1_42: +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSE2-NEXT: addb -{{[0-9]+}}(%rsp), %al +; SSE2-NEXT: movb $-1, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Folded Spill +; SSE2-NEXT: jb .LBB1_44 +; SSE2-NEXT: # %bb.43: +; SSE2-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSE2-NEXT: .LBB1_44: +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSE2-NEXT: addb -{{[0-9]+}}(%rsp), %al +; SSE2-NEXT: movb $-1, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte 
Folded Spill +; SSE2-NEXT: jb .LBB1_46 +; SSE2-NEXT: # %bb.45: +; SSE2-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSE2-NEXT: .LBB1_46: +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSE2-NEXT: addb -{{[0-9]+}}(%rsp), %al +; SSE2-NEXT: movb $-1, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Folded Spill +; SSE2-NEXT: jb .LBB1_48 +; SSE2-NEXT: # %bb.47: +; SSE2-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSE2-NEXT: .LBB1_48: +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSE2-NEXT: addb -{{[0-9]+}}(%rsp), %al +; SSE2-NEXT: movb $-1, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Folded Spill +; SSE2-NEXT: jb .LBB1_50 +; SSE2-NEXT: # %bb.49: +; SSE2-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSE2-NEXT: .LBB1_50: +; SSE2-NEXT: movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSE2-NEXT: addb -{{[0-9]+}}(%rsp), %al +; SSE2-NEXT: movb $-1, %dl +; SSE2-NEXT: jb .LBB1_52 +; SSE2-NEXT: # %bb.51: +; SSE2-NEXT: movl %eax, %edx +; SSE2-NEXT: .LBB1_52: +; SSE2-NEXT: movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSE2-NEXT: movzbl %sil, %edx +; SSE2-NEXT: movzbl %dil, %eax +; SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE2-NEXT: movzbl %r8b, %esi +; SSE2-NEXT: movzbl %bl, %edi +; SSE2-NEXT: movzbl %r11b, %ebx +; SSE2-NEXT: movzbl %bpl, %eax +; SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE2-NEXT: movzbl %r14b, %ebp +; SSE2-NEXT: movzbl %r15b, %r11d +; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r8d # 1-byte Folded Reload +; SSE2-NEXT: movzbl %r12b, %r14d +; SSE2-NEXT: movzbl %r13b, %r15d +; SSE2-NEXT: movzbl %cl, %r12d +; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r13d # 1-byte Folded Reload +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSE2-NEXT: addb -{{[0-9]+}}(%rsp), %cl +; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; SSE2-NEXT: movzbl %r10b, %r10d +; SSE2-NEXT: movzbl %r9b, %r9d +; SSE2-NEXT: movb $-1, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Folded Spill +; SSE2-NEXT: jb .LBB1_54 +; SSE2-NEXT: # %bb.53: +; SSE2-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSE2-NEXT: .LBB1_54: +; SSE2-NEXT: movd %edx, %xmm2 +; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 4-byte Folded Reload +; SSE2-NEXT: # xmm8 = mem[0],zero,zero,zero +; SSE2-NEXT: movd %esi, %xmm3 +; SSE2-NEXT: movd %edi, %xmm11 +; SSE2-NEXT: movd %ebx, %xmm5 +; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 4-byte Folded Reload +; SSE2-NEXT: # xmm9 = mem[0],zero,zero,zero +; SSE2-NEXT: movd %ebp, %xmm7 +; SSE2-NEXT: movd %r11d, %xmm1 +; SSE2-NEXT: movd %r8d, %xmm12 +; SSE2-NEXT: movd %r14d, %xmm10 +; SSE2-NEXT: movd %r15d, %xmm13 +; SSE2-NEXT: movd %r12d, %xmm4 +; SSE2-NEXT: movd %r13d, %xmm14 +; SSE2-NEXT: movd %eax, %xmm6 +; SSE2-NEXT: movd %r10d, %xmm15 +; SSE2-NEXT: movd %r9d, %xmm0 +; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %edx # 1-byte Folded Reload +; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 1-byte Folded Reload +; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 1-byte Folded Reload +; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r8d # 1-byte Folded Reload +; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ebp # 1-byte Folded Reload +; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload +; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r11d # 1-byte Folded Reload +; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r9d # 1-byte Folded Reload +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSE2-NEXT: addb -{{[0-9]+}}(%rsp), %al +; SSE2-NEXT: movb $-1, %bl +; 
SSE2-NEXT: jb .LBB1_56 +; SSE2-NEXT: # %bb.55: +; SSE2-NEXT: movl %eax, %ebx +; SSE2-NEXT: .LBB1_56: +; SSE2-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm2[0],xmm8[1],xmm2[1],xmm8[2],xmm2[2],xmm8[3],xmm2[3],xmm8[4],xmm2[4],xmm8[5],xmm2[5],xmm8[6],xmm2[6],xmm8[7],xmm2[7] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm11 = xmm11[0],xmm3[0],xmm11[1],xmm3[1],xmm11[2],xmm3[2],xmm11[3],xmm3[3],xmm11[4],xmm3[4],xmm11[5],xmm3[5],xmm11[6],xmm3[6],xmm11[7],xmm3[7] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm9 = xmm9[0],xmm5[0],xmm9[1],xmm5[1],xmm9[2],xmm5[2],xmm9[3],xmm5[3],xmm9[4],xmm5[4],xmm9[5],xmm5[5],xmm9[6],xmm5[6],xmm9[7],xmm5[7] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm7[0],xmm1[1],xmm7[1],xmm1[2],xmm7[2],xmm1[3],xmm7[3],xmm1[4],xmm7[4],xmm1[5],xmm7[5],xmm1[6],xmm7[6],xmm1[7],xmm7[7] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm10 = xmm10[0],xmm12[0],xmm10[1],xmm12[1],xmm10[2],xmm12[2],xmm10[3],xmm12[3],xmm10[4],xmm12[4],xmm10[5],xmm12[5],xmm10[6],xmm12[6],xmm10[7],xmm12[7] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm13[0],xmm4[1],xmm13[1],xmm4[2],xmm13[2],xmm4[3],xmm13[3],xmm4[4],xmm13[4],xmm4[5],xmm13[5],xmm4[6],xmm13[6],xmm4[7],xmm13[7] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm14[0],xmm6[1],xmm14[1],xmm6[2],xmm14[2],xmm6[3],xmm14[3],xmm6[4],xmm14[4],xmm6[5],xmm14[5],xmm6[6],xmm14[6],xmm6[7],xmm14[7] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm15[0],xmm0[1],xmm15[1],xmm0[2],xmm15[2],xmm0[3],xmm15[3],xmm0[4],xmm15[4],xmm0[5],xmm15[5],xmm0[6],xmm15[6],xmm0[7],xmm15[7] +; SSE2-NEXT: movd %edx, %xmm7 +; SSE2-NEXT: movd %esi, %xmm12 +; SSE2-NEXT: movd %edi, %xmm13 +; SSE2-NEXT: movd %r8d, %xmm5 +; SSE2-NEXT: movd %ebp, %xmm14 +; SSE2-NEXT: movd %ecx, %xmm2 +; SSE2-NEXT: movd %r11d, %xmm15 +; SSE2-NEXT: movd %r9d, %xmm3 +; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload +; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %edx # 1-byte Folded Reload +; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 1-byte Folded Reload +; SSE2-NEXT: movzbl %bl, %edi +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %bl +; SSE2-NEXT: addb -{{[0-9]+}}(%rsp), %bl +; SSE2-NEXT: movb $-1, %al +; SSE2-NEXT: jb .LBB1_58 +; SSE2-NEXT: # %bb.57: +; SSE2-NEXT: movl %ebx, %eax +; SSE2-NEXT: .LBB1_58: +; SSE2-NEXT: punpcklwd {{.*#+}} xmm11 = xmm11[0],xmm8[0],xmm11[1],xmm8[1],xmm11[2],xmm8[2],xmm11[3],xmm8[3] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm9[0],xmm1[1],xmm9[1],xmm1[2],xmm9[2],xmm1[3],xmm9[3] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm10[0],xmm4[1],xmm10[1],xmm4[2],xmm10[2],xmm4[3],xmm10[3] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1],xmm0[2],xmm6[2],xmm0[3],xmm6[3] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm12 = xmm12[0],xmm7[0],xmm12[1],xmm7[1],xmm12[2],xmm7[2],xmm12[3],xmm7[3],xmm12[4],xmm7[4],xmm12[5],xmm7[5],xmm12[6],xmm7[6],xmm12[7],xmm7[7] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm13[0],xmm5[1],xmm13[1],xmm5[2],xmm13[2],xmm5[3],xmm13[3],xmm5[4],xmm13[4],xmm5[5],xmm13[5],xmm5[6],xmm13[6],xmm5[7],xmm13[7] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm14[0],xmm2[1],xmm14[1],xmm2[2],xmm14[2],xmm2[3],xmm14[3],xmm2[4],xmm14[4],xmm2[5],xmm14[5],xmm2[6],xmm14[6],xmm2[7],xmm14[7] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm15[0],xmm3[1],xmm15[1],xmm3[2],xmm15[2],xmm3[3],xmm15[3],xmm3[4],xmm15[4],xmm3[5],xmm15[5],xmm3[6],xmm15[6],xmm3[7],xmm15[7] +; SSE2-NEXT: movd %ecx, %xmm8 +; SSE2-NEXT: movd %edx, %xmm7 +; SSE2-NEXT: movd %esi, %xmm9 +; SSE2-NEXT: movd %edi, %xmm6 +; SSE2-NEXT: movzbl %al, %eax +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %dl +; 
SSE2-NEXT: addb -{{[0-9]+}}(%rsp), %dl +; SSE2-NEXT: movb $-1, %cl +; SSE2-NEXT: jb .LBB1_60 +; SSE2-NEXT: # %bb.59: +; SSE2-NEXT: movl %edx, %ecx +; SSE2-NEXT: .LBB1_60: +; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm11[0],xmm1[1],xmm11[1] +; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm12[0],xmm5[1],xmm12[1],xmm5[2],xmm12[2],xmm5[3],xmm12[3] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3],xmm7[4],xmm8[4],xmm7[5],xmm8[5],xmm7[6],xmm8[6],xmm7[7],xmm8[7] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm9[0],xmm6[1],xmm9[1],xmm6[2],xmm9[2],xmm6[3],xmm9[3],xmm6[4],xmm9[4],xmm6[5],xmm9[5],xmm6[6],xmm9[6],xmm6[7],xmm9[7] +; SSE2-NEXT: movd %eax, %xmm4 +; SSE2-NEXT: movzbl %cl, %eax +; SSE2-NEXT: movd %eax, %xmm2 +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %dl +; SSE2-NEXT: addb -{{[0-9]+}}(%rsp), %dl +; SSE2-NEXT: movb $-1, %cl +; SSE2-NEXT: jb .LBB1_62 +; SSE2-NEXT: # %bb.61: +; SSE2-NEXT: movl %edx, %ecx +; SSE2-NEXT: .LBB1_62: +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3],xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7] +; SSE2-NEXT: movzbl %cl, %ecx +; SSE2-NEXT: movd %ecx, %xmm4 +; SSE2-NEXT: addb -{{[0-9]+}}(%rsp), %al +; SSE2-NEXT: movb $-1, %cl +; SSE2-NEXT: jb .LBB1_64 +; SSE2-NEXT: # %bb.63: +; SSE2-NEXT: movl %eax, %ecx +; SSE2-NEXT: .LBB1_64: +; SSE2-NEXT: movzbl %cl, %eax +; SSE2-NEXT: movd %eax, %xmm1 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3],xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1] +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0] +; SSE2-NEXT: popq %rbx +; SSE2-NEXT: popq %r12 +; SSE2-NEXT: popq %r13 +; SSE2-NEXT: popq %r14 +; SSE2-NEXT: popq %r15 +; SSE2-NEXT: popq %rbp +; SSE2-NEXT: retq +; +; SSSE3-LABEL: v32i8: +; SSSE3: # %bb.0: +; SSSE3-NEXT: pushq %rbp +; SSSE3-NEXT: pushq %r15 +; SSSE3-NEXT: pushq %r14 +; SSSE3-NEXT: pushq %r13 +; SSSE3-NEXT: pushq %r12 +; SSSE3-NEXT: pushq %rbx +; SSSE3-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; SSSE3-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) +; SSSE3-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) +; SSSE3-NEXT: movaps %xmm3, -{{[0-9]+}}(%rsp) +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSSE3-NEXT: addb -{{[0-9]+}}(%rsp), %al +; SSSE3-NEXT: movb $-1, %sil +; SSSE3-NEXT: jb .LBB1_2 +; SSSE3-NEXT: # %bb.1: +; SSSE3-NEXT: movl %eax, %esi +; SSSE3-NEXT: .LBB1_2: +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSSE3-NEXT: addb -{{[0-9]+}}(%rsp), %al +; SSSE3-NEXT: movb $-1, %dil +; SSSE3-NEXT: jb .LBB1_4 +; SSSE3-NEXT: # %bb.3: +; SSSE3-NEXT: movl %eax, %edi +; SSSE3-NEXT: .LBB1_4: +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSSE3-NEXT: addb -{{[0-9]+}}(%rsp), %al +; SSSE3-NEXT: movb $-1, %r8b +; SSSE3-NEXT: jb .LBB1_6 +; SSSE3-NEXT: # %bb.5: +; SSSE3-NEXT: movl %eax, %r8d +; SSSE3-NEXT: .LBB1_6: +; SSSE3-NEXT: movb 
-{{[0-9]+}}(%rsp), %al +; SSSE3-NEXT: addb -{{[0-9]+}}(%rsp), %al +; SSSE3-NEXT: movb $-1, %bl +; SSSE3-NEXT: jb .LBB1_8 +; SSSE3-NEXT: # %bb.7: +; SSSE3-NEXT: movl %eax, %ebx +; SSSE3-NEXT: .LBB1_8: +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSSE3-NEXT: addb -{{[0-9]+}}(%rsp), %al +; SSSE3-NEXT: movb $-1, %r11b +; SSSE3-NEXT: jb .LBB1_10 +; SSSE3-NEXT: # %bb.9: +; SSSE3-NEXT: movl %eax, %r11d +; SSSE3-NEXT: .LBB1_10: +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSSE3-NEXT: addb -{{[0-9]+}}(%rsp), %al +; SSSE3-NEXT: movb $-1, %bpl +; SSSE3-NEXT: jb .LBB1_12 +; SSSE3-NEXT: # %bb.11: +; SSSE3-NEXT: movl %eax, %ebp +; SSSE3-NEXT: .LBB1_12: +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSSE3-NEXT: addb -{{[0-9]+}}(%rsp), %al +; SSSE3-NEXT: movb $-1, %r14b +; SSSE3-NEXT: jb .LBB1_14 +; SSSE3-NEXT: # %bb.13: +; SSSE3-NEXT: movl %eax, %r14d +; SSSE3-NEXT: .LBB1_14: +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSSE3-NEXT: addb -{{[0-9]+}}(%rsp), %al +; SSSE3-NEXT: movb $-1, %r15b +; SSSE3-NEXT: jb .LBB1_16 +; SSSE3-NEXT: # %bb.15: +; SSSE3-NEXT: movl %eax, %r15d +; SSSE3-NEXT: .LBB1_16: +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSSE3-NEXT: addb -{{[0-9]+}}(%rsp), %al +; SSSE3-NEXT: movb $-1, %r9b +; SSSE3-NEXT: jb .LBB1_18 +; SSSE3-NEXT: # %bb.17: +; SSSE3-NEXT: movl %eax, %r9d +; SSSE3-NEXT: .LBB1_18: +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSSE3-NEXT: addb -{{[0-9]+}}(%rsp), %al +; SSSE3-NEXT: movb $-1, %r12b +; SSSE3-NEXT: jb .LBB1_20 +; SSSE3-NEXT: # %bb.19: +; SSSE3-NEXT: movl %eax, %r12d +; SSSE3-NEXT: .LBB1_20: +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSSE3-NEXT: addb -{{[0-9]+}}(%rsp), %al +; SSSE3-NEXT: movb $-1, %r13b +; SSSE3-NEXT: jb .LBB1_22 +; SSSE3-NEXT: # %bb.21: +; SSSE3-NEXT: movl %eax, %r13d +; SSSE3-NEXT: .LBB1_22: +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSSE3-NEXT: addb -{{[0-9]+}}(%rsp), %al +; SSSE3-NEXT: movb $-1, %dl +; SSSE3-NEXT: jb .LBB1_24 +; SSSE3-NEXT: # %bb.23: +; SSSE3-NEXT: movl %eax, %edx +; SSSE3-NEXT: .LBB1_24: +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSSE3-NEXT: addb -{{[0-9]+}}(%rsp), %al +; SSSE3-NEXT: movb $-1, %cl +; SSSE3-NEXT: jb .LBB1_26 +; SSSE3-NEXT: # %bb.25: +; SSSE3-NEXT: movl %eax, %ecx +; SSSE3-NEXT: .LBB1_26: +; SSSE3-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSSE3-NEXT: movb %r9b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSSE3-NEXT: addb -{{[0-9]+}}(%rsp), %al +; SSSE3-NEXT: movb $-1, %cl +; SSSE3-NEXT: jb .LBB1_28 +; SSSE3-NEXT: # %bb.27: +; SSSE3-NEXT: movl %eax, %ecx +; SSSE3-NEXT: .LBB1_28: +; SSSE3-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSSE3-NEXT: addb -{{[0-9]+}}(%rsp), %cl +; SSSE3-NEXT: movb $-1, %r10b +; SSSE3-NEXT: jb .LBB1_30 +; SSSE3-NEXT: # %bb.29: +; SSSE3-NEXT: movl %ecx, %r10d +; SSSE3-NEXT: .LBB1_30: +; SSSE3-NEXT: addb -{{[0-9]+}}(%rsp), %al +; SSSE3-NEXT: movb $-1, %r9b +; SSSE3-NEXT: jb .LBB1_32 +; SSSE3-NEXT: # %bb.31: +; SSSE3-NEXT: movl %eax, %r9d +; SSSE3-NEXT: .LBB1_32: +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSSE3-NEXT: addb -{{[0-9]+}}(%rsp), %al +; SSSE3-NEXT: movb $-1, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Folded Spill +; SSSE3-NEXT: movl %edx, %ecx +; SSSE3-NEXT: jb .LBB1_34 +; SSSE3-NEXT: # %bb.33: +; SSSE3-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSSE3-NEXT: .LBB1_34: +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSSE3-NEXT: addb -{{[0-9]+}}(%rsp), %al +; SSSE3-NEXT: movb $-1, %dl +; SSSE3-NEXT: 
jb .LBB1_36 +; SSSE3-NEXT: # %bb.35: +; SSSE3-NEXT: movl %eax, %edx +; SSSE3-NEXT: .LBB1_36: +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSSE3-NEXT: addb -{{[0-9]+}}(%rsp), %al +; SSSE3-NEXT: movb $-1, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Folded Spill +; SSSE3-NEXT: jb .LBB1_38 +; SSSE3-NEXT: # %bb.37: +; SSSE3-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSSE3-NEXT: .LBB1_38: +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSSE3-NEXT: addb -{{[0-9]+}}(%rsp), %al +; SSSE3-NEXT: movb $-1, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Folded Spill +; SSSE3-NEXT: jb .LBB1_40 +; SSSE3-NEXT: # %bb.39: +; SSSE3-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSSE3-NEXT: .LBB1_40: +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSSE3-NEXT: addb -{{[0-9]+}}(%rsp), %al +; SSSE3-NEXT: movb $-1, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Folded Spill +; SSSE3-NEXT: jb .LBB1_42 +; SSSE3-NEXT: # %bb.41: +; SSSE3-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSSE3-NEXT: .LBB1_42: +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSSE3-NEXT: addb -{{[0-9]+}}(%rsp), %al +; SSSE3-NEXT: movb $-1, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Folded Spill +; SSSE3-NEXT: jb .LBB1_44 +; SSSE3-NEXT: # %bb.43: +; SSSE3-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSSE3-NEXT: .LBB1_44: +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSSE3-NEXT: addb -{{[0-9]+}}(%rsp), %al +; SSSE3-NEXT: movb $-1, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Folded Spill +; SSSE3-NEXT: jb .LBB1_46 +; SSSE3-NEXT: # %bb.45: +; SSSE3-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSSE3-NEXT: .LBB1_46: +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSSE3-NEXT: addb -{{[0-9]+}}(%rsp), %al +; SSSE3-NEXT: movb $-1, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Folded Spill +; SSSE3-NEXT: jb .LBB1_48 +; SSSE3-NEXT: # %bb.47: +; SSSE3-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSSE3-NEXT: .LBB1_48: +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSSE3-NEXT: addb -{{[0-9]+}}(%rsp), %al +; SSSE3-NEXT: movb $-1, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Folded Spill +; SSSE3-NEXT: jb .LBB1_50 +; SSSE3-NEXT: # %bb.49: +; SSSE3-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSSE3-NEXT: .LBB1_50: +; SSSE3-NEXT: movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSSE3-NEXT: addb -{{[0-9]+}}(%rsp), %al +; SSSE3-NEXT: movb $-1, %dl +; SSSE3-NEXT: jb .LBB1_52 +; SSSE3-NEXT: # %bb.51: +; SSSE3-NEXT: movl %eax, %edx +; SSSE3-NEXT: .LBB1_52: +; SSSE3-NEXT: movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSSE3-NEXT: movzbl %sil, %edx +; SSSE3-NEXT: movzbl %dil, %eax +; SSSE3-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSSE3-NEXT: movzbl %r8b, %esi +; SSSE3-NEXT: movzbl %bl, %edi +; SSSE3-NEXT: movzbl %r11b, %ebx +; SSSE3-NEXT: movzbl %bpl, %eax +; SSSE3-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSSE3-NEXT: movzbl %r14b, %ebp +; SSSE3-NEXT: movzbl %r15b, %r11d +; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r8d # 1-byte Folded Reload +; SSSE3-NEXT: movzbl %r12b, %r14d +; SSSE3-NEXT: movzbl %r13b, %r15d +; SSSE3-NEXT: movzbl %cl, %r12d +; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r13d # 1-byte Folded Reload +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSSE3-NEXT: addb -{{[0-9]+}}(%rsp), %cl +; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; SSSE3-NEXT: movzbl %r10b, %r10d +; SSSE3-NEXT: movzbl %r9b, %r9d +; SSSE3-NEXT: movb $-1, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Folded Spill +; SSSE3-NEXT: jb .LBB1_54 +; SSSE3-NEXT: # %bb.53: 
+; SSSE3-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSSE3-NEXT: .LBB1_54: +; SSSE3-NEXT: movd %edx, %xmm2 +; SSSE3-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 4-byte Folded Reload +; SSSE3-NEXT: # xmm8 = mem[0],zero,zero,zero +; SSSE3-NEXT: movd %esi, %xmm3 +; SSSE3-NEXT: movd %edi, %xmm11 +; SSSE3-NEXT: movd %ebx, %xmm5 +; SSSE3-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 4-byte Folded Reload +; SSSE3-NEXT: # xmm9 = mem[0],zero,zero,zero +; SSSE3-NEXT: movd %ebp, %xmm7 +; SSSE3-NEXT: movd %r11d, %xmm1 +; SSSE3-NEXT: movd %r8d, %xmm12 +; SSSE3-NEXT: movd %r14d, %xmm10 +; SSSE3-NEXT: movd %r15d, %xmm13 +; SSSE3-NEXT: movd %r12d, %xmm4 +; SSSE3-NEXT: movd %r13d, %xmm14 +; SSSE3-NEXT: movd %eax, %xmm6 +; SSSE3-NEXT: movd %r10d, %xmm15 +; SSSE3-NEXT: movd %r9d, %xmm0 +; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %edx # 1-byte Folded Reload +; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 1-byte Folded Reload +; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 1-byte Folded Reload +; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r8d # 1-byte Folded Reload +; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ebp # 1-byte Folded Reload +; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload +; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r11d # 1-byte Folded Reload +; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r9d # 1-byte Folded Reload +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSSE3-NEXT: addb -{{[0-9]+}}(%rsp), %al +; SSSE3-NEXT: movb $-1, %bl +; SSSE3-NEXT: jb .LBB1_56 +; SSSE3-NEXT: # %bb.55: +; SSSE3-NEXT: movl %eax, %ebx +; SSSE3-NEXT: .LBB1_56: +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm2[0],xmm8[1],xmm2[1],xmm8[2],xmm2[2],xmm8[3],xmm2[3],xmm8[4],xmm2[4],xmm8[5],xmm2[5],xmm8[6],xmm2[6],xmm8[7],xmm2[7] +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm11 = xmm11[0],xmm3[0],xmm11[1],xmm3[1],xmm11[2],xmm3[2],xmm11[3],xmm3[3],xmm11[4],xmm3[4],xmm11[5],xmm3[5],xmm11[6],xmm3[6],xmm11[7],xmm3[7] +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm9 = xmm9[0],xmm5[0],xmm9[1],xmm5[1],xmm9[2],xmm5[2],xmm9[3],xmm5[3],xmm9[4],xmm5[4],xmm9[5],xmm5[5],xmm9[6],xmm5[6],xmm9[7],xmm5[7] +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm7[0],xmm1[1],xmm7[1],xmm1[2],xmm7[2],xmm1[3],xmm7[3],xmm1[4],xmm7[4],xmm1[5],xmm7[5],xmm1[6],xmm7[6],xmm1[7],xmm7[7] +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm10 = xmm10[0],xmm12[0],xmm10[1],xmm12[1],xmm10[2],xmm12[2],xmm10[3],xmm12[3],xmm10[4],xmm12[4],xmm10[5],xmm12[5],xmm10[6],xmm12[6],xmm10[7],xmm12[7] +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm13[0],xmm4[1],xmm13[1],xmm4[2],xmm13[2],xmm4[3],xmm13[3],xmm4[4],xmm13[4],xmm4[5],xmm13[5],xmm4[6],xmm13[6],xmm4[7],xmm13[7] +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm14[0],xmm6[1],xmm14[1],xmm6[2],xmm14[2],xmm6[3],xmm14[3],xmm6[4],xmm14[4],xmm6[5],xmm14[5],xmm6[6],xmm14[6],xmm6[7],xmm14[7] +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm15[0],xmm0[1],xmm15[1],xmm0[2],xmm15[2],xmm0[3],xmm15[3],xmm0[4],xmm15[4],xmm0[5],xmm15[5],xmm0[6],xmm15[6],xmm0[7],xmm15[7] +; SSSE3-NEXT: movd %edx, %xmm7 +; SSSE3-NEXT: movd %esi, %xmm12 +; SSSE3-NEXT: movd %edi, %xmm13 +; SSSE3-NEXT: movd %r8d, %xmm5 +; SSSE3-NEXT: movd %ebp, %xmm14 +; SSSE3-NEXT: movd %ecx, %xmm2 +; SSSE3-NEXT: movd %r11d, %xmm15 +; SSSE3-NEXT: movd %r9d, %xmm3 +; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload +; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %edx # 1-byte Folded Reload +; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 1-byte Folded Reload +; SSSE3-NEXT: movzbl %bl, %edi +; 
SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %bl +; SSSE3-NEXT: addb -{{[0-9]+}}(%rsp), %bl +; SSSE3-NEXT: movb $-1, %al +; SSSE3-NEXT: jb .LBB1_58 +; SSSE3-NEXT: # %bb.57: +; SSSE3-NEXT: movl %ebx, %eax +; SSSE3-NEXT: .LBB1_58: +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm11 = xmm11[0],xmm8[0],xmm11[1],xmm8[1],xmm11[2],xmm8[2],xmm11[3],xmm8[3] +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm9[0],xmm1[1],xmm9[1],xmm1[2],xmm9[2],xmm1[3],xmm9[3] +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm10[0],xmm4[1],xmm10[1],xmm4[2],xmm10[2],xmm4[3],xmm10[3] +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1],xmm0[2],xmm6[2],xmm0[3],xmm6[3] +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm12 = xmm12[0],xmm7[0],xmm12[1],xmm7[1],xmm12[2],xmm7[2],xmm12[3],xmm7[3],xmm12[4],xmm7[4],xmm12[5],xmm7[5],xmm12[6],xmm7[6],xmm12[7],xmm7[7] +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm13[0],xmm5[1],xmm13[1],xmm5[2],xmm13[2],xmm5[3],xmm13[3],xmm5[4],xmm13[4],xmm5[5],xmm13[5],xmm5[6],xmm13[6],xmm5[7],xmm13[7] +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm14[0],xmm2[1],xmm14[1],xmm2[2],xmm14[2],xmm2[3],xmm14[3],xmm2[4],xmm14[4],xmm2[5],xmm14[5],xmm2[6],xmm14[6],xmm2[7],xmm14[7] +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm15[0],xmm3[1],xmm15[1],xmm3[2],xmm15[2],xmm3[3],xmm15[3],xmm3[4],xmm15[4],xmm3[5],xmm15[5],xmm3[6],xmm15[6],xmm3[7],xmm15[7] +; SSSE3-NEXT: movd %ecx, %xmm8 +; SSSE3-NEXT: movd %edx, %xmm7 +; SSSE3-NEXT: movd %esi, %xmm9 +; SSSE3-NEXT: movd %edi, %xmm6 +; SSSE3-NEXT: movzbl %al, %eax +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %dl +; SSSE3-NEXT: addb -{{[0-9]+}}(%rsp), %dl +; SSSE3-NEXT: movb $-1, %cl +; SSSE3-NEXT: jb .LBB1_60 +; SSSE3-NEXT: # %bb.59: +; SSSE3-NEXT: movl %edx, %ecx +; SSSE3-NEXT: .LBB1_60: +; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm11[0],xmm1[1],xmm11[1] +; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm12[0],xmm5[1],xmm12[1],xmm5[2],xmm12[2],xmm5[3],xmm12[3] +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3],xmm7[4],xmm8[4],xmm7[5],xmm8[5],xmm7[6],xmm8[6],xmm7[7],xmm8[7] +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm9[0],xmm6[1],xmm9[1],xmm6[2],xmm9[2],xmm6[3],xmm9[3],xmm6[4],xmm9[4],xmm6[5],xmm9[5],xmm6[6],xmm9[6],xmm6[7],xmm9[7] +; SSSE3-NEXT: movd %eax, %xmm4 +; SSSE3-NEXT: movzbl %cl, %eax +; SSSE3-NEXT: movd %eax, %xmm2 +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %dl +; SSSE3-NEXT: addb -{{[0-9]+}}(%rsp), %dl +; SSSE3-NEXT: movb $-1, %cl +; SSSE3-NEXT: jb .LBB1_62 +; SSSE3-NEXT: # %bb.61: +; SSSE3-NEXT: movl %edx, %ecx +; SSSE3-NEXT: .LBB1_62: +; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSSE3-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1] +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3] +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3],xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7] +; SSSE3-NEXT: movzbl %cl, %ecx +; SSSE3-NEXT: movd %ecx, %xmm4 +; SSSE3-NEXT: addb -{{[0-9]+}}(%rsp), %al +; SSSE3-NEXT: movb $-1, %cl +; SSSE3-NEXT: jb .LBB1_64 +; SSSE3-NEXT: # %bb.63: +; SSSE3-NEXT: movl %eax, %ecx +; SSSE3-NEXT: .LBB1_64: +; SSSE3-NEXT: movzbl %cl, %eax +; SSSE3-NEXT: movd %eax, %xmm1 +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 
= xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3],xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7] +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1] +; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0] +; SSSE3-NEXT: popq %rbx +; SSSE3-NEXT: popq %r12 +; SSSE3-NEXT: popq %r13 +; SSSE3-NEXT: popq %r14 +; SSSE3-NEXT: popq %r15 +; SSSE3-NEXT: popq %rbp +; SSSE3-NEXT: retq +; +; SSE41-LABEL: v32i8: +; SSE41: # %bb.0: +; SSE41-NEXT: pushq %rbp +; SSE41-NEXT: pushq %r15 +; SSE41-NEXT: pushq %r14 +; SSE41-NEXT: pushq %r13 +; SSE41-NEXT: pushq %r12 +; SSE41-NEXT: pushq %rbx +; SSE41-NEXT: pextrb $1, %xmm2, %ecx +; SSE41-NEXT: pextrb $1, %xmm0, %eax +; SSE41-NEXT: addb %cl, %al +; SSE41-NEXT: movb $-1, %bl +; SSE41-NEXT: jb .LBB1_2 +; SSE41-NEXT: # %bb.1: +; SSE41-NEXT: movl %eax, %ebx +; SSE41-NEXT: .LBB1_2: +; SSE41-NEXT: pextrb $0, %xmm2, %ecx +; SSE41-NEXT: pextrb $0, %xmm0, %eax +; SSE41-NEXT: addb %cl, %al +; SSE41-NEXT: movb $-1, %cl +; SSE41-NEXT: jb .LBB1_4 +; SSE41-NEXT: # %bb.3: +; SSE41-NEXT: movl %eax, %ecx +; SSE41-NEXT: .LBB1_4: +; SSE41-NEXT: pextrb $2, %xmm2, %edx +; SSE41-NEXT: pextrb $2, %xmm0, %eax +; SSE41-NEXT: addb %dl, %al +; SSE41-NEXT: movb $-1, %sil +; SSE41-NEXT: jb .LBB1_6 +; SSE41-NEXT: # %bb.5: +; SSE41-NEXT: movl %eax, %esi +; SSE41-NEXT: .LBB1_6: +; SSE41-NEXT: pextrb $3, %xmm2, %eax +; SSE41-NEXT: pextrb $3, %xmm0, %edx +; SSE41-NEXT: addb %al, %dl +; SSE41-NEXT: movb $-1, %al +; SSE41-NEXT: jb .LBB1_8 +; SSE41-NEXT: # %bb.7: +; SSE41-NEXT: movl %edx, %eax +; SSE41-NEXT: .LBB1_8: +; SSE41-NEXT: pextrb $4, %xmm2, %edx +; SSE41-NEXT: pextrb $4, %xmm0, %edi +; SSE41-NEXT: addb %dl, %dil +; SSE41-NEXT: movb $-1, %r10b +; SSE41-NEXT: jb .LBB1_10 +; SSE41-NEXT: # %bb.9: +; SSE41-NEXT: movl %edi, %r10d +; SSE41-NEXT: .LBB1_10: +; SSE41-NEXT: pextrb $5, %xmm2, %edi +; SSE41-NEXT: pextrb $5, %xmm0, %ebp +; SSE41-NEXT: addb %dil, %bpl +; SSE41-NEXT: movb $-1, %r9b +; SSE41-NEXT: jb .LBB1_12 +; SSE41-NEXT: # %bb.11: +; SSE41-NEXT: movl %ebp, %r9d +; SSE41-NEXT: .LBB1_12: +; SSE41-NEXT: pextrb $6, %xmm2, %edi +; SSE41-NEXT: pextrb $6, %xmm0, %ebp +; SSE41-NEXT: addb %dil, %bpl +; SSE41-NEXT: movb $-1, %r13b +; SSE41-NEXT: jb .LBB1_14 +; SSE41-NEXT: # %bb.13: +; SSE41-NEXT: movl %ebp, %r13d +; SSE41-NEXT: .LBB1_14: +; SSE41-NEXT: pextrb $7, %xmm2, %edi +; SSE41-NEXT: pextrb $7, %xmm0, %ebp +; SSE41-NEXT: addb %dil, %bpl +; SSE41-NEXT: movb $-1, %r12b +; SSE41-NEXT: jb .LBB1_16 +; SSE41-NEXT: # %bb.15: +; SSE41-NEXT: movl %ebp, %r12d +; SSE41-NEXT: .LBB1_16: +; SSE41-NEXT: pextrb $8, %xmm2, %edi +; SSE41-NEXT: pextrb $8, %xmm0, %ebp +; SSE41-NEXT: addb %dil, %bpl +; SSE41-NEXT: movb $-1, %r15b +; SSE41-NEXT: jb .LBB1_18 +; SSE41-NEXT: # %bb.17: +; SSE41-NEXT: movl %ebp, %r15d +; SSE41-NEXT: .LBB1_18: +; SSE41-NEXT: pextrb $9, %xmm2, %edi +; SSE41-NEXT: pextrb $9, %xmm0, %ebp +; SSE41-NEXT: addb %dil, %bpl +; SSE41-NEXT: movb $-1, %r14b +; SSE41-NEXT: jb .LBB1_20 +; SSE41-NEXT: # %bb.19: +; SSE41-NEXT: movl %ebp, %r14d +; SSE41-NEXT: .LBB1_20: +; SSE41-NEXT: pextrb $10, %xmm2, %edi +; SSE41-NEXT: pextrb $10, %xmm0, %ebp +; SSE41-NEXT: addb %dil, %bpl +; SSE41-NEXT: movb $-1, %r8b +; SSE41-NEXT: jb .LBB1_22 +; SSE41-NEXT: # %bb.21: +; SSE41-NEXT: movl %ebp, %r8d +; SSE41-NEXT: .LBB1_22: +; SSE41-NEXT: pextrb $11, %xmm2, %edi +; SSE41-NEXT: pextrb $11, %xmm0, %ebp +; SSE41-NEXT: addb %dil, %bpl +; SSE41-NEXT: movb $-1, 
%r11b +; SSE41-NEXT: jb .LBB1_24 +; SSE41-NEXT: # %bb.23: +; SSE41-NEXT: movl %ebp, %r11d +; SSE41-NEXT: .LBB1_24: +; SSE41-NEXT: pextrb $12, %xmm2, %edi +; SSE41-NEXT: pextrb $12, %xmm0, %ebp +; SSE41-NEXT: addb %dil, %bpl +; SSE41-NEXT: movb $-1, %dl +; SSE41-NEXT: jb .LBB1_26 +; SSE41-NEXT: # %bb.25: +; SSE41-NEXT: movl %ebp, %edx +; SSE41-NEXT: .LBB1_26: +; SSE41-NEXT: movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSE41-NEXT: pextrb $13, %xmm2, %edi +; SSE41-NEXT: pextrb $13, %xmm0, %ebp +; SSE41-NEXT: addb %dil, %bpl +; SSE41-NEXT: movb $-1, %dl +; SSE41-NEXT: jb .LBB1_28 +; SSE41-NEXT: # %bb.27: +; SSE41-NEXT: movl %ebp, %edx +; SSE41-NEXT: .LBB1_28: +; SSE41-NEXT: movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSE41-NEXT: pextrb $14, %xmm2, %edi +; SSE41-NEXT: pextrb $14, %xmm0, %ebp +; SSE41-NEXT: addb %dil, %bpl +; SSE41-NEXT: movb $-1, %dl +; SSE41-NEXT: jb .LBB1_30 +; SSE41-NEXT: # %bb.29: +; SSE41-NEXT: movl %ebp, %edx +; SSE41-NEXT: .LBB1_30: +; SSE41-NEXT: movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSE41-NEXT: movzbl %cl, %ecx +; SSE41-NEXT: pextrb $15, %xmm2, %edi +; SSE41-NEXT: pextrb $15, %xmm0, %ebp +; SSE41-NEXT: addb %dil, %bpl +; SSE41-NEXT: movb $-1, %dil +; SSE41-NEXT: jb .LBB1_32 +; SSE41-NEXT: # %bb.31: +; SSE41-NEXT: movl %ebp, %edi +; SSE41-NEXT: .LBB1_32: +; SSE41-NEXT: movzbl %bl, %ebx +; SSE41-NEXT: movd %ecx, %xmm0 +; SSE41-NEXT: pextrb $1, %xmm3, %ecx +; SSE41-NEXT: pextrb $1, %xmm1, %ebp +; SSE41-NEXT: addb %cl, %bpl +; SSE41-NEXT: movb $-1, %cl +; SSE41-NEXT: jb .LBB1_34 +; SSE41-NEXT: # %bb.33: +; SSE41-NEXT: movl %ebp, %ecx +; SSE41-NEXT: .LBB1_34: +; SSE41-NEXT: pinsrb $1, %ebx, %xmm0 +; SSE41-NEXT: movzbl %sil, %ebx +; SSE41-NEXT: movzbl %cl, %esi +; SSE41-NEXT: pextrb $0, %xmm3, %ecx +; SSE41-NEXT: pextrb $0, %xmm1, %ebp +; SSE41-NEXT: addb %cl, %bpl +; SSE41-NEXT: movb $-1, %cl +; SSE41-NEXT: jb .LBB1_36 +; SSE41-NEXT: # %bb.35: +; SSE41-NEXT: movl %ebp, %ecx +; SSE41-NEXT: .LBB1_36: +; SSE41-NEXT: pinsrb $2, %ebx, %xmm0 +; SSE41-NEXT: movzbl %al, %eax +; SSE41-NEXT: movzbl %cl, %ecx +; SSE41-NEXT: movd %ecx, %xmm2 +; SSE41-NEXT: pinsrb $1, %esi, %xmm2 +; SSE41-NEXT: pextrb $2, %xmm3, %ecx +; SSE41-NEXT: pextrb $2, %xmm1, %esi +; SSE41-NEXT: addb %cl, %sil +; SSE41-NEXT: movb $-1, %cl +; SSE41-NEXT: jb .LBB1_38 +; SSE41-NEXT: # %bb.37: +; SSE41-NEXT: movl %esi, %ecx +; SSE41-NEXT: .LBB1_38: +; SSE41-NEXT: pinsrb $3, %eax, %xmm0 +; SSE41-NEXT: movzbl %r10b, %eax +; SSE41-NEXT: movzbl %cl, %ecx +; SSE41-NEXT: pinsrb $2, %ecx, %xmm2 +; SSE41-NEXT: pextrb $3, %xmm3, %ecx +; SSE41-NEXT: pextrb $3, %xmm1, %edx +; SSE41-NEXT: addb %cl, %dl +; SSE41-NEXT: movb $-1, %cl +; SSE41-NEXT: jb .LBB1_40 +; SSE41-NEXT: # %bb.39: +; SSE41-NEXT: movl %edx, %ecx +; SSE41-NEXT: .LBB1_40: +; SSE41-NEXT: pinsrb $4, %eax, %xmm0 +; SSE41-NEXT: movzbl %r9b, %eax +; SSE41-NEXT: movzbl %cl, %ecx +; SSE41-NEXT: pinsrb $3, %ecx, %xmm2 +; SSE41-NEXT: pextrb $4, %xmm3, %ecx +; SSE41-NEXT: pextrb $4, %xmm1, %edx +; SSE41-NEXT: addb %cl, %dl +; SSE41-NEXT: movb $-1, %cl +; SSE41-NEXT: jb .LBB1_42 +; SSE41-NEXT: # %bb.41: +; SSE41-NEXT: movl %edx, %ecx +; SSE41-NEXT: .LBB1_42: +; SSE41-NEXT: pinsrb $5, %eax, %xmm0 +; SSE41-NEXT: movzbl %r13b, %eax +; SSE41-NEXT: movzbl %cl, %ecx +; SSE41-NEXT: pinsrb $4, %ecx, %xmm2 +; SSE41-NEXT: pextrb $5, %xmm3, %ecx +; SSE41-NEXT: pextrb $5, %xmm1, %edx +; SSE41-NEXT: addb %cl, %dl +; SSE41-NEXT: movb $-1, %cl +; SSE41-NEXT: jb .LBB1_44 +; SSE41-NEXT: # %bb.43: +; SSE41-NEXT: movl %edx, %ecx +; SSE41-NEXT: .LBB1_44: +; 
SSE41-NEXT: pinsrb $6, %eax, %xmm0 +; SSE41-NEXT: movzbl %r12b, %eax +; SSE41-NEXT: movzbl %cl, %ecx +; SSE41-NEXT: pinsrb $5, %ecx, %xmm2 +; SSE41-NEXT: pextrb $6, %xmm3, %ecx +; SSE41-NEXT: pextrb $6, %xmm1, %edx +; SSE41-NEXT: addb %cl, %dl +; SSE41-NEXT: movb $-1, %cl +; SSE41-NEXT: jb .LBB1_46 +; SSE41-NEXT: # %bb.45: +; SSE41-NEXT: movl %edx, %ecx +; SSE41-NEXT: .LBB1_46: +; SSE41-NEXT: pinsrb $7, %eax, %xmm0 +; SSE41-NEXT: movzbl %r15b, %eax +; SSE41-NEXT: movzbl %cl, %ecx +; SSE41-NEXT: pinsrb $6, %ecx, %xmm2 +; SSE41-NEXT: pextrb $7, %xmm3, %ecx +; SSE41-NEXT: pextrb $7, %xmm1, %edx +; SSE41-NEXT: addb %cl, %dl +; SSE41-NEXT: movb $-1, %cl +; SSE41-NEXT: jb .LBB1_48 +; SSE41-NEXT: # %bb.47: +; SSE41-NEXT: movl %edx, %ecx +; SSE41-NEXT: .LBB1_48: +; SSE41-NEXT: pinsrb $8, %eax, %xmm0 +; SSE41-NEXT: movzbl %r14b, %eax +; SSE41-NEXT: movzbl %cl, %ecx +; SSE41-NEXT: pinsrb $7, %ecx, %xmm2 +; SSE41-NEXT: pextrb $8, %xmm3, %ecx +; SSE41-NEXT: pextrb $8, %xmm1, %edx +; SSE41-NEXT: addb %cl, %dl +; SSE41-NEXT: movb $-1, %cl +; SSE41-NEXT: jb .LBB1_50 +; SSE41-NEXT: # %bb.49: +; SSE41-NEXT: movl %edx, %ecx +; SSE41-NEXT: .LBB1_50: +; SSE41-NEXT: pinsrb $9, %eax, %xmm0 +; SSE41-NEXT: movzbl %r8b, %eax +; SSE41-NEXT: movzbl %cl, %ecx +; SSE41-NEXT: pinsrb $8, %ecx, %xmm2 +; SSE41-NEXT: pextrb $9, %xmm3, %ecx +; SSE41-NEXT: pextrb $9, %xmm1, %edx +; SSE41-NEXT: addb %cl, %dl +; SSE41-NEXT: movb $-1, %cl +; SSE41-NEXT: jb .LBB1_52 +; SSE41-NEXT: # %bb.51: +; SSE41-NEXT: movl %edx, %ecx +; SSE41-NEXT: .LBB1_52: +; SSE41-NEXT: pinsrb $10, %eax, %xmm0 +; SSE41-NEXT: movzbl %r11b, %eax +; SSE41-NEXT: movzbl %cl, %ecx +; SSE41-NEXT: pinsrb $9, %ecx, %xmm2 +; SSE41-NEXT: pextrb $10, %xmm3, %ecx +; SSE41-NEXT: pextrb $10, %xmm1, %edx +; SSE41-NEXT: addb %cl, %dl +; SSE41-NEXT: movb $-1, %cl +; SSE41-NEXT: jb .LBB1_54 +; SSE41-NEXT: # %bb.53: +; SSE41-NEXT: movl %edx, %ecx +; SSE41-NEXT: .LBB1_54: +; SSE41-NEXT: pinsrb $11, %eax, %xmm0 +; SSE41-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; SSE41-NEXT: movzbl %cl, %ecx +; SSE41-NEXT: pinsrb $10, %ecx, %xmm2 +; SSE41-NEXT: pextrb $11, %xmm3, %ecx +; SSE41-NEXT: pextrb $11, %xmm1, %edx +; SSE41-NEXT: addb %cl, %dl +; SSE41-NEXT: movb $-1, %cl +; SSE41-NEXT: jb .LBB1_56 +; SSE41-NEXT: # %bb.55: +; SSE41-NEXT: movl %edx, %ecx +; SSE41-NEXT: .LBB1_56: +; SSE41-NEXT: pinsrb $12, %eax, %xmm0 +; SSE41-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; SSE41-NEXT: movzbl %cl, %ecx +; SSE41-NEXT: pinsrb $11, %ecx, %xmm2 +; SSE41-NEXT: pextrb $12, %xmm3, %ecx +; SSE41-NEXT: pextrb $12, %xmm1, %edx +; SSE41-NEXT: addb %cl, %dl +; SSE41-NEXT: movb $-1, %cl +; SSE41-NEXT: jb .LBB1_58 +; SSE41-NEXT: # %bb.57: +; SSE41-NEXT: movl %edx, %ecx +; SSE41-NEXT: .LBB1_58: +; SSE41-NEXT: pinsrb $13, %eax, %xmm0 +; SSE41-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; SSE41-NEXT: movzbl %cl, %ecx +; SSE41-NEXT: pinsrb $12, %ecx, %xmm2 +; SSE41-NEXT: pextrb $13, %xmm3, %ecx +; SSE41-NEXT: pextrb $13, %xmm1, %edx +; SSE41-NEXT: addb %cl, %dl +; SSE41-NEXT: movb $-1, %cl +; SSE41-NEXT: jb .LBB1_60 +; SSE41-NEXT: # %bb.59: +; SSE41-NEXT: movl %edx, %ecx +; SSE41-NEXT: .LBB1_60: +; SSE41-NEXT: pinsrb $14, %eax, %xmm0 +; SSE41-NEXT: movzbl %dil, %eax +; SSE41-NEXT: movzbl %cl, %ecx +; SSE41-NEXT: pinsrb $13, %ecx, %xmm2 +; SSE41-NEXT: pextrb $14, %xmm3, %edx +; SSE41-NEXT: pextrb $14, %xmm1, %ecx +; SSE41-NEXT: addb %dl, %cl +; SSE41-NEXT: movb $-1, %dl +; SSE41-NEXT: jb .LBB1_62 +; SSE41-NEXT: # %bb.61: +; SSE41-NEXT: 
movl %ecx, %edx +; SSE41-NEXT: .LBB1_62: +; SSE41-NEXT: pinsrb $15, %eax, %xmm0 +; SSE41-NEXT: movzbl %dl, %eax +; SSE41-NEXT: pinsrb $14, %eax, %xmm2 +; SSE41-NEXT: pextrb $15, %xmm3, %ecx +; SSE41-NEXT: pextrb $15, %xmm1, %eax +; SSE41-NEXT: addb %cl, %al +; SSE41-NEXT: movb $-1, %cl +; SSE41-NEXT: jb .LBB1_64 +; SSE41-NEXT: # %bb.63: +; SSE41-NEXT: movl %eax, %ecx +; SSE41-NEXT: .LBB1_64: +; SSE41-NEXT: movzbl %cl, %eax +; SSE41-NEXT: pinsrb $15, %eax, %xmm2 +; SSE41-NEXT: movdqa %xmm2, %xmm1 +; SSE41-NEXT: popq %rbx +; SSE41-NEXT: popq %r12 +; SSE41-NEXT: popq %r13 +; SSE41-NEXT: popq %r14 +; SSE41-NEXT: popq %r15 +; SSE41-NEXT: popq %rbp +; SSE41-NEXT: retq +; +; AVX1-LABEL: v32i8: +; AVX1: # %bb.0: +; AVX1-NEXT: pushq %rbp +; AVX1-NEXT: pushq %r15 +; AVX1-NEXT: pushq %r14 +; AVX1-NEXT: pushq %r13 +; AVX1-NEXT: pushq %r12 +; AVX1-NEXT: pushq %rbx +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX1-NEXT: vpextrb $1, %xmm2, %ecx +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 +; AVX1-NEXT: vpextrb $1, %xmm3, %eax +; AVX1-NEXT: addb %cl, %al +; AVX1-NEXT: movb $-1, %bl +; AVX1-NEXT: jb .LBB1_2 +; AVX1-NEXT: # %bb.1: +; AVX1-NEXT: movl %eax, %ebx +; AVX1-NEXT: .LBB1_2: +; AVX1-NEXT: vpextrb $0, %xmm2, %ecx +; AVX1-NEXT: vpextrb $0, %xmm3, %eax +; AVX1-NEXT: addb %cl, %al +; AVX1-NEXT: movb $-1, %cl +; AVX1-NEXT: jb .LBB1_4 +; AVX1-NEXT: # %bb.3: +; AVX1-NEXT: movl %eax, %ecx +; AVX1-NEXT: .LBB1_4: +; AVX1-NEXT: vpextrb $2, %xmm2, %edx +; AVX1-NEXT: vpextrb $2, %xmm3, %eax +; AVX1-NEXT: addb %dl, %al +; AVX1-NEXT: movb $-1, %sil +; AVX1-NEXT: jb .LBB1_6 +; AVX1-NEXT: # %bb.5: +; AVX1-NEXT: movl %eax, %esi +; AVX1-NEXT: .LBB1_6: +; AVX1-NEXT: vpextrb $3, %xmm2, %eax +; AVX1-NEXT: vpextrb $3, %xmm3, %edx +; AVX1-NEXT: addb %al, %dl +; AVX1-NEXT: movb $-1, %al +; AVX1-NEXT: jb .LBB1_8 +; AVX1-NEXT: # %bb.7: +; AVX1-NEXT: movl %edx, %eax +; AVX1-NEXT: .LBB1_8: +; AVX1-NEXT: vpextrb $4, %xmm2, %edx +; AVX1-NEXT: vpextrb $4, %xmm3, %edi +; AVX1-NEXT: addb %dl, %dil +; AVX1-NEXT: movb $-1, %r9b +; AVX1-NEXT: jb .LBB1_10 +; AVX1-NEXT: # %bb.9: +; AVX1-NEXT: movl %edi, %r9d +; AVX1-NEXT: .LBB1_10: +; AVX1-NEXT: vpextrb $5, %xmm2, %edx +; AVX1-NEXT: vpextrb $5, %xmm3, %ebp +; AVX1-NEXT: addb %dl, %bpl +; AVX1-NEXT: movb $-1, %dil +; AVX1-NEXT: jb .LBB1_12 +; AVX1-NEXT: # %bb.11: +; AVX1-NEXT: movl %ebp, %edi +; AVX1-NEXT: .LBB1_12: +; AVX1-NEXT: vpextrb $6, %xmm2, %edx +; AVX1-NEXT: vpextrb $6, %xmm3, %ebp +; AVX1-NEXT: addb %dl, %bpl +; AVX1-NEXT: movb $-1, %r13b +; AVX1-NEXT: jb .LBB1_14 +; AVX1-NEXT: # %bb.13: +; AVX1-NEXT: movl %ebp, %r13d +; AVX1-NEXT: .LBB1_14: +; AVX1-NEXT: vpextrb $7, %xmm2, %edx +; AVX1-NEXT: vpextrb $7, %xmm3, %ebp +; AVX1-NEXT: addb %dl, %bpl +; AVX1-NEXT: movb $-1, %r12b +; AVX1-NEXT: jb .LBB1_16 +; AVX1-NEXT: # %bb.15: +; AVX1-NEXT: movl %ebp, %r12d +; AVX1-NEXT: .LBB1_16: +; AVX1-NEXT: vpextrb $8, %xmm2, %edx +; AVX1-NEXT: vpextrb $8, %xmm3, %ebp +; AVX1-NEXT: addb %dl, %bpl +; AVX1-NEXT: movb $-1, %r15b +; AVX1-NEXT: jb .LBB1_18 +; AVX1-NEXT: # %bb.17: +; AVX1-NEXT: movl %ebp, %r15d +; AVX1-NEXT: .LBB1_18: +; AVX1-NEXT: vpextrb $9, %xmm2, %edx +; AVX1-NEXT: vpextrb $9, %xmm3, %ebp +; AVX1-NEXT: addb %dl, %bpl +; AVX1-NEXT: movb $-1, %r14b +; AVX1-NEXT: jb .LBB1_20 +; AVX1-NEXT: # %bb.19: +; AVX1-NEXT: movl %ebp, %r14d +; AVX1-NEXT: .LBB1_20: +; AVX1-NEXT: vpextrb $10, %xmm2, %edx +; AVX1-NEXT: vpextrb $10, %xmm3, %ebp +; AVX1-NEXT: addb %dl, %bpl +; AVX1-NEXT: movb $-1, %r8b +; AVX1-NEXT: jb .LBB1_22 +; AVX1-NEXT: # %bb.21: +; AVX1-NEXT: movl %ebp, %r8d +; AVX1-NEXT: 
.LBB1_22: +; AVX1-NEXT: vpextrb $11, %xmm2, %edx +; AVX1-NEXT: vpextrb $11, %xmm3, %ebp +; AVX1-NEXT: addb %dl, %bpl +; AVX1-NEXT: movb $-1, %r11b +; AVX1-NEXT: jb .LBB1_24 +; AVX1-NEXT: # %bb.23: +; AVX1-NEXT: movl %ebp, %r11d +; AVX1-NEXT: .LBB1_24: +; AVX1-NEXT: vpextrb $12, %xmm2, %edx +; AVX1-NEXT: vpextrb $12, %xmm3, %ebp +; AVX1-NEXT: addb %dl, %bpl +; AVX1-NEXT: movb $-1, %r10b +; AVX1-NEXT: jb .LBB1_26 +; AVX1-NEXT: # %bb.25: +; AVX1-NEXT: movl %ebp, %r10d +; AVX1-NEXT: .LBB1_26: +; AVX1-NEXT: vpextrb $13, %xmm2, %edx +; AVX1-NEXT: vpextrb $13, %xmm3, %ebp +; AVX1-NEXT: addb %dl, %bpl +; AVX1-NEXT: movb $-1, %dl +; AVX1-NEXT: jb .LBB1_28 +; AVX1-NEXT: # %bb.27: +; AVX1-NEXT: movl %ebp, %edx +; AVX1-NEXT: .LBB1_28: +; AVX1-NEXT: movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; AVX1-NEXT: vpextrb $14, %xmm2, %edx +; AVX1-NEXT: vpextrb $14, %xmm3, %ebp +; AVX1-NEXT: addb %dl, %bpl +; AVX1-NEXT: movb $-1, %dl +; AVX1-NEXT: jb .LBB1_30 +; AVX1-NEXT: # %bb.29: +; AVX1-NEXT: movl %ebp, %edx +; AVX1-NEXT: .LBB1_30: +; AVX1-NEXT: movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; AVX1-NEXT: movzbl %cl, %ecx +; AVX1-NEXT: vpextrb $15, %xmm2, %edx +; AVX1-NEXT: vpextrb $15, %xmm3, %ebp +; AVX1-NEXT: addb %dl, %bpl +; AVX1-NEXT: movb $-1, %dl +; AVX1-NEXT: jb .LBB1_32 +; AVX1-NEXT: # %bb.31: +; AVX1-NEXT: movl %ebp, %edx +; AVX1-NEXT: .LBB1_32: +; AVX1-NEXT: movl %edx, %ebp +; AVX1-NEXT: movzbl %bl, %ebx +; AVX1-NEXT: vmovd %ecx, %xmm2 +; AVX1-NEXT: vpextrb $1, %xmm1, %edx +; AVX1-NEXT: vpextrb $1, %xmm0, %ecx +; AVX1-NEXT: addb %dl, %cl +; AVX1-NEXT: movb $-1, %dl +; AVX1-NEXT: jb .LBB1_34 +; AVX1-NEXT: # %bb.33: +; AVX1-NEXT: movl %ecx, %edx +; AVX1-NEXT: .LBB1_34: +; AVX1-NEXT: vpinsrb $1, %ebx, %xmm2, %xmm2 +; AVX1-NEXT: movzbl %sil, %ecx +; AVX1-NEXT: movzbl %dl, %esi +; AVX1-NEXT: vpextrb $0, %xmm1, %edx +; AVX1-NEXT: vpextrb $0, %xmm0, %ebx +; AVX1-NEXT: addb %dl, %bl +; AVX1-NEXT: movb $-1, %dl +; AVX1-NEXT: jb .LBB1_36 +; AVX1-NEXT: # %bb.35: +; AVX1-NEXT: movl %ebx, %edx +; AVX1-NEXT: .LBB1_36: +; AVX1-NEXT: vpinsrb $2, %ecx, %xmm2, %xmm2 +; AVX1-NEXT: movzbl %al, %eax +; AVX1-NEXT: movzbl %dl, %ecx +; AVX1-NEXT: vmovd %ecx, %xmm3 +; AVX1-NEXT: vpinsrb $1, %esi, %xmm3, %xmm3 +; AVX1-NEXT: vpextrb $2, %xmm1, %edx +; AVX1-NEXT: vpextrb $2, %xmm0, %ecx +; AVX1-NEXT: addb %dl, %cl +; AVX1-NEXT: movb $-1, %dl +; AVX1-NEXT: jb .LBB1_38 +; AVX1-NEXT: # %bb.37: +; AVX1-NEXT: movl %ecx, %edx +; AVX1-NEXT: .LBB1_38: +; AVX1-NEXT: vpinsrb $3, %eax, %xmm2, %xmm2 +; AVX1-NEXT: movzbl %r9b, %eax +; AVX1-NEXT: movzbl %dl, %ecx +; AVX1-NEXT: vpinsrb $2, %ecx, %xmm3, %xmm3 +; AVX1-NEXT: vpextrb $3, %xmm1, %edx +; AVX1-NEXT: vpextrb $3, %xmm0, %ecx +; AVX1-NEXT: addb %dl, %cl +; AVX1-NEXT: movb $-1, %dl +; AVX1-NEXT: jb .LBB1_40 +; AVX1-NEXT: # %bb.39: +; AVX1-NEXT: movl %ecx, %edx +; AVX1-NEXT: .LBB1_40: +; AVX1-NEXT: vpinsrb $4, %eax, %xmm2, %xmm2 +; AVX1-NEXT: movzbl %dil, %eax +; AVX1-NEXT: movzbl %dl, %ecx +; AVX1-NEXT: vpinsrb $3, %ecx, %xmm3, %xmm3 +; AVX1-NEXT: vpextrb $4, %xmm1, %edx +; AVX1-NEXT: vpextrb $4, %xmm0, %ecx +; AVX1-NEXT: addb %dl, %cl +; AVX1-NEXT: movb $-1, %dl +; AVX1-NEXT: jb .LBB1_42 +; AVX1-NEXT: # %bb.41: +; AVX1-NEXT: movl %ecx, %edx +; AVX1-NEXT: .LBB1_42: +; AVX1-NEXT: vpinsrb $5, %eax, %xmm2, %xmm2 +; AVX1-NEXT: movzbl %r13b, %eax +; AVX1-NEXT: movzbl %dl, %ecx +; AVX1-NEXT: vpinsrb $4, %ecx, %xmm3, %xmm3 +; AVX1-NEXT: vpextrb $5, %xmm1, %edx +; AVX1-NEXT: vpextrb $5, %xmm0, %ecx +; AVX1-NEXT: addb %dl, %cl +; AVX1-NEXT: movb $-1, %dl +; AVX1-NEXT: jb .LBB1_44 
+; AVX1-NEXT: # %bb.43: +; AVX1-NEXT: movl %ecx, %edx +; AVX1-NEXT: .LBB1_44: +; AVX1-NEXT: vpinsrb $6, %eax, %xmm2, %xmm2 +; AVX1-NEXT: movzbl %r12b, %eax +; AVX1-NEXT: movzbl %dl, %ecx +; AVX1-NEXT: vpinsrb $5, %ecx, %xmm3, %xmm3 +; AVX1-NEXT: vpextrb $6, %xmm1, %edx +; AVX1-NEXT: vpextrb $6, %xmm0, %ecx +; AVX1-NEXT: addb %dl, %cl +; AVX1-NEXT: movb $-1, %dl +; AVX1-NEXT: jb .LBB1_46 +; AVX1-NEXT: # %bb.45: +; AVX1-NEXT: movl %ecx, %edx +; AVX1-NEXT: .LBB1_46: +; AVX1-NEXT: vpinsrb $7, %eax, %xmm2, %xmm2 +; AVX1-NEXT: movzbl %r15b, %eax +; AVX1-NEXT: movzbl %dl, %ecx +; AVX1-NEXT: vpinsrb $6, %ecx, %xmm3, %xmm3 +; AVX1-NEXT: vpextrb $7, %xmm1, %edx +; AVX1-NEXT: vpextrb $7, %xmm0, %ecx +; AVX1-NEXT: addb %dl, %cl +; AVX1-NEXT: movb $-1, %dl +; AVX1-NEXT: jb .LBB1_48 +; AVX1-NEXT: # %bb.47: +; AVX1-NEXT: movl %ecx, %edx +; AVX1-NEXT: .LBB1_48: +; AVX1-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2 +; AVX1-NEXT: movzbl %r14b, %eax +; AVX1-NEXT: movzbl %dl, %ecx +; AVX1-NEXT: vpinsrb $7, %ecx, %xmm3, %xmm3 +; AVX1-NEXT: vpextrb $8, %xmm1, %edx +; AVX1-NEXT: vpextrb $8, %xmm0, %ecx +; AVX1-NEXT: addb %dl, %cl +; AVX1-NEXT: movb $-1, %dl +; AVX1-NEXT: jb .LBB1_50 +; AVX1-NEXT: # %bb.49: +; AVX1-NEXT: movl %ecx, %edx +; AVX1-NEXT: .LBB1_50: +; AVX1-NEXT: vpinsrb $9, %eax, %xmm2, %xmm2 +; AVX1-NEXT: movzbl %r8b, %eax +; AVX1-NEXT: movzbl %dl, %ecx +; AVX1-NEXT: vpinsrb $8, %ecx, %xmm3, %xmm3 +; AVX1-NEXT: vpextrb $9, %xmm1, %edx +; AVX1-NEXT: vpextrb $9, %xmm0, %ecx +; AVX1-NEXT: addb %dl, %cl +; AVX1-NEXT: movb $-1, %dl +; AVX1-NEXT: jb .LBB1_52 +; AVX1-NEXT: # %bb.51: +; AVX1-NEXT: movl %ecx, %edx +; AVX1-NEXT: .LBB1_52: +; AVX1-NEXT: vpinsrb $10, %eax, %xmm2, %xmm2 +; AVX1-NEXT: movzbl %r11b, %eax +; AVX1-NEXT: movzbl %dl, %ecx +; AVX1-NEXT: vpinsrb $9, %ecx, %xmm3, %xmm3 +; AVX1-NEXT: vpextrb $10, %xmm1, %edx +; AVX1-NEXT: vpextrb $10, %xmm0, %ecx +; AVX1-NEXT: addb %dl, %cl +; AVX1-NEXT: movb $-1, %dl +; AVX1-NEXT: jb .LBB1_54 +; AVX1-NEXT: # %bb.53: +; AVX1-NEXT: movl %ecx, %edx +; AVX1-NEXT: .LBB1_54: +; AVX1-NEXT: vpinsrb $11, %eax, %xmm2, %xmm2 +; AVX1-NEXT: movzbl %r10b, %eax +; AVX1-NEXT: movzbl %dl, %ecx +; AVX1-NEXT: vpinsrb $10, %ecx, %xmm3, %xmm3 +; AVX1-NEXT: vpextrb $11, %xmm1, %edx +; AVX1-NEXT: vpextrb $11, %xmm0, %ecx +; AVX1-NEXT: addb %dl, %cl +; AVX1-NEXT: movb $-1, %dl +; AVX1-NEXT: jb .LBB1_56 +; AVX1-NEXT: # %bb.55: +; AVX1-NEXT: movl %ecx, %edx +; AVX1-NEXT: .LBB1_56: +; AVX1-NEXT: vpinsrb $12, %eax, %xmm2, %xmm2 +; AVX1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX1-NEXT: movzbl %dl, %ecx +; AVX1-NEXT: vpinsrb $11, %ecx, %xmm3, %xmm3 +; AVX1-NEXT: vpextrb $12, %xmm1, %edx +; AVX1-NEXT: vpextrb $12, %xmm0, %ecx +; AVX1-NEXT: addb %dl, %cl +; AVX1-NEXT: movb $-1, %dl +; AVX1-NEXT: jb .LBB1_58 +; AVX1-NEXT: # %bb.57: +; AVX1-NEXT: movl %ecx, %edx +; AVX1-NEXT: .LBB1_58: +; AVX1-NEXT: vpinsrb $13, %eax, %xmm2, %xmm2 +; AVX1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX1-NEXT: movzbl %dl, %ecx +; AVX1-NEXT: vpinsrb $12, %ecx, %xmm3, %xmm3 +; AVX1-NEXT: vpextrb $13, %xmm1, %edx +; AVX1-NEXT: vpextrb $13, %xmm0, %ecx +; AVX1-NEXT: addb %dl, %cl +; AVX1-NEXT: movb $-1, %dl +; AVX1-NEXT: jb .LBB1_60 +; AVX1-NEXT: # %bb.59: +; AVX1-NEXT: movl %ecx, %edx +; AVX1-NEXT: .LBB1_60: +; AVX1-NEXT: vpinsrb $14, %eax, %xmm2, %xmm2 +; AVX1-NEXT: movzbl %bpl, %eax +; AVX1-NEXT: movzbl %dl, %ecx +; AVX1-NEXT: vpinsrb $13, %ecx, %xmm3, %xmm3 +; AVX1-NEXT: vpextrb $14, %xmm1, %edx +; AVX1-NEXT: vpextrb $14, %xmm0, %ecx +; AVX1-NEXT: addb %dl, 
%cl +; AVX1-NEXT: movb $-1, %dl +; AVX1-NEXT: jb .LBB1_62 +; AVX1-NEXT: # %bb.61: +; AVX1-NEXT: movl %ecx, %edx +; AVX1-NEXT: .LBB1_62: +; AVX1-NEXT: vpinsrb $15, %eax, %xmm2, %xmm2 +; AVX1-NEXT: movzbl %dl, %eax +; AVX1-NEXT: vpinsrb $14, %eax, %xmm3, %xmm3 +; AVX1-NEXT: vpextrb $15, %xmm1, %ecx +; AVX1-NEXT: vpextrb $15, %xmm0, %eax +; AVX1-NEXT: addb %cl, %al +; AVX1-NEXT: movb $-1, %cl +; AVX1-NEXT: jb .LBB1_64 +; AVX1-NEXT: # %bb.63: +; AVX1-NEXT: movl %eax, %ecx +; AVX1-NEXT: .LBB1_64: +; AVX1-NEXT: movzbl %cl, %eax +; AVX1-NEXT: vpinsrb $15, %eax, %xmm3, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX1-NEXT: popq %rbx +; AVX1-NEXT: popq %r12 +; AVX1-NEXT: popq %r13 +; AVX1-NEXT: popq %r14 +; AVX1-NEXT: popq %r15 +; AVX1-NEXT: popq %rbp +; AVX1-NEXT: retq +; +; AVX2-LABEL: v32i8: +; AVX2: # %bb.0: +; AVX2-NEXT: pushq %rbp +; AVX2-NEXT: pushq %r15 +; AVX2-NEXT: pushq %r14 +; AVX2-NEXT: pushq %r13 +; AVX2-NEXT: pushq %r12 +; AVX2-NEXT: pushq %rbx +; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX2-NEXT: vpextrb $1, %xmm2, %ecx +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm3 +; AVX2-NEXT: vpextrb $1, %xmm3, %eax +; AVX2-NEXT: addb %cl, %al +; AVX2-NEXT: movb $-1, %bl +; AVX2-NEXT: jb .LBB1_2 +; AVX2-NEXT: # %bb.1: +; AVX2-NEXT: movl %eax, %ebx +; AVX2-NEXT: .LBB1_2: +; AVX2-NEXT: vpextrb $0, %xmm2, %ecx +; AVX2-NEXT: vpextrb $0, %xmm3, %eax +; AVX2-NEXT: addb %cl, %al +; AVX2-NEXT: movb $-1, %cl +; AVX2-NEXT: jb .LBB1_4 +; AVX2-NEXT: # %bb.3: +; AVX2-NEXT: movl %eax, %ecx +; AVX2-NEXT: .LBB1_4: +; AVX2-NEXT: vpextrb $2, %xmm2, %edx +; AVX2-NEXT: vpextrb $2, %xmm3, %eax +; AVX2-NEXT: addb %dl, %al +; AVX2-NEXT: movb $-1, %sil +; AVX2-NEXT: jb .LBB1_6 +; AVX2-NEXT: # %bb.5: +; AVX2-NEXT: movl %eax, %esi +; AVX2-NEXT: .LBB1_6: +; AVX2-NEXT: vpextrb $3, %xmm2, %eax +; AVX2-NEXT: vpextrb $3, %xmm3, %edx +; AVX2-NEXT: addb %al, %dl +; AVX2-NEXT: movb $-1, %al +; AVX2-NEXT: jb .LBB1_8 +; AVX2-NEXT: # %bb.7: +; AVX2-NEXT: movl %edx, %eax +; AVX2-NEXT: .LBB1_8: +; AVX2-NEXT: vpextrb $4, %xmm2, %edx +; AVX2-NEXT: vpextrb $4, %xmm3, %edi +; AVX2-NEXT: addb %dl, %dil +; AVX2-NEXT: movb $-1, %r9b +; AVX2-NEXT: jb .LBB1_10 +; AVX2-NEXT: # %bb.9: +; AVX2-NEXT: movl %edi, %r9d +; AVX2-NEXT: .LBB1_10: +; AVX2-NEXT: vpextrb $5, %xmm2, %edx +; AVX2-NEXT: vpextrb $5, %xmm3, %ebp +; AVX2-NEXT: addb %dl, %bpl +; AVX2-NEXT: movb $-1, %dil +; AVX2-NEXT: jb .LBB1_12 +; AVX2-NEXT: # %bb.11: +; AVX2-NEXT: movl %ebp, %edi +; AVX2-NEXT: .LBB1_12: +; AVX2-NEXT: vpextrb $6, %xmm2, %edx +; AVX2-NEXT: vpextrb $6, %xmm3, %ebp +; AVX2-NEXT: addb %dl, %bpl +; AVX2-NEXT: movb $-1, %r13b +; AVX2-NEXT: jb .LBB1_14 +; AVX2-NEXT: # %bb.13: +; AVX2-NEXT: movl %ebp, %r13d +; AVX2-NEXT: .LBB1_14: +; AVX2-NEXT: vpextrb $7, %xmm2, %edx +; AVX2-NEXT: vpextrb $7, %xmm3, %ebp +; AVX2-NEXT: addb %dl, %bpl +; AVX2-NEXT: movb $-1, %r12b +; AVX2-NEXT: jb .LBB1_16 +; AVX2-NEXT: # %bb.15: +; AVX2-NEXT: movl %ebp, %r12d +; AVX2-NEXT: .LBB1_16: +; AVX2-NEXT: vpextrb $8, %xmm2, %edx +; AVX2-NEXT: vpextrb $8, %xmm3, %ebp +; AVX2-NEXT: addb %dl, %bpl +; AVX2-NEXT: movb $-1, %r15b +; AVX2-NEXT: jb .LBB1_18 +; AVX2-NEXT: # %bb.17: +; AVX2-NEXT: movl %ebp, %r15d +; AVX2-NEXT: .LBB1_18: +; AVX2-NEXT: vpextrb $9, %xmm2, %edx +; AVX2-NEXT: vpextrb $9, %xmm3, %ebp +; AVX2-NEXT: addb %dl, %bpl +; AVX2-NEXT: movb $-1, %r14b +; AVX2-NEXT: jb .LBB1_20 +; AVX2-NEXT: # %bb.19: +; AVX2-NEXT: movl %ebp, %r14d +; AVX2-NEXT: .LBB1_20: +; AVX2-NEXT: vpextrb $10, %xmm2, %edx +; AVX2-NEXT: vpextrb $10, %xmm3, %ebp +; AVX2-NEXT: addb %dl, %bpl +; 
AVX2-NEXT: movb $-1, %r8b +; AVX2-NEXT: jb .LBB1_22 +; AVX2-NEXT: # %bb.21: +; AVX2-NEXT: movl %ebp, %r8d +; AVX2-NEXT: .LBB1_22: +; AVX2-NEXT: vpextrb $11, %xmm2, %edx +; AVX2-NEXT: vpextrb $11, %xmm3, %ebp +; AVX2-NEXT: addb %dl, %bpl +; AVX2-NEXT: movb $-1, %r11b +; AVX2-NEXT: jb .LBB1_24 +; AVX2-NEXT: # %bb.23: +; AVX2-NEXT: movl %ebp, %r11d +; AVX2-NEXT: .LBB1_24: +; AVX2-NEXT: vpextrb $12, %xmm2, %edx +; AVX2-NEXT: vpextrb $12, %xmm3, %ebp +; AVX2-NEXT: addb %dl, %bpl +; AVX2-NEXT: movb $-1, %r10b +; AVX2-NEXT: jb .LBB1_26 +; AVX2-NEXT: # %bb.25: +; AVX2-NEXT: movl %ebp, %r10d +; AVX2-NEXT: .LBB1_26: +; AVX2-NEXT: vpextrb $13, %xmm2, %edx +; AVX2-NEXT: vpextrb $13, %xmm3, %ebp +; AVX2-NEXT: addb %dl, %bpl +; AVX2-NEXT: movb $-1, %dl +; AVX2-NEXT: jb .LBB1_28 +; AVX2-NEXT: # %bb.27: +; AVX2-NEXT: movl %ebp, %edx +; AVX2-NEXT: .LBB1_28: +; AVX2-NEXT: movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; AVX2-NEXT: vpextrb $14, %xmm2, %edx +; AVX2-NEXT: vpextrb $14, %xmm3, %ebp +; AVX2-NEXT: addb %dl, %bpl +; AVX2-NEXT: movb $-1, %dl +; AVX2-NEXT: jb .LBB1_30 +; AVX2-NEXT: # %bb.29: +; AVX2-NEXT: movl %ebp, %edx +; AVX2-NEXT: .LBB1_30: +; AVX2-NEXT: movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; AVX2-NEXT: movzbl %cl, %ecx +; AVX2-NEXT: vpextrb $15, %xmm2, %edx +; AVX2-NEXT: vpextrb $15, %xmm3, %ebp +; AVX2-NEXT: addb %dl, %bpl +; AVX2-NEXT: movb $-1, %dl +; AVX2-NEXT: jb .LBB1_32 +; AVX2-NEXT: # %bb.31: +; AVX2-NEXT: movl %ebp, %edx +; AVX2-NEXT: .LBB1_32: +; AVX2-NEXT: movl %edx, %ebp +; AVX2-NEXT: movzbl %bl, %ebx +; AVX2-NEXT: vmovd %ecx, %xmm2 +; AVX2-NEXT: vpextrb $1, %xmm1, %edx +; AVX2-NEXT: vpextrb $1, %xmm0, %ecx +; AVX2-NEXT: addb %dl, %cl +; AVX2-NEXT: movb $-1, %dl +; AVX2-NEXT: jb .LBB1_34 +; AVX2-NEXT: # %bb.33: +; AVX2-NEXT: movl %ecx, %edx +; AVX2-NEXT: .LBB1_34: +; AVX2-NEXT: vpinsrb $1, %ebx, %xmm2, %xmm2 +; AVX2-NEXT: movzbl %sil, %ecx +; AVX2-NEXT: movzbl %dl, %esi +; AVX2-NEXT: vpextrb $0, %xmm1, %edx +; AVX2-NEXT: vpextrb $0, %xmm0, %ebx +; AVX2-NEXT: addb %dl, %bl +; AVX2-NEXT: movb $-1, %dl +; AVX2-NEXT: jb .LBB1_36 +; AVX2-NEXT: # %bb.35: +; AVX2-NEXT: movl %ebx, %edx +; AVX2-NEXT: .LBB1_36: +; AVX2-NEXT: vpinsrb $2, %ecx, %xmm2, %xmm2 +; AVX2-NEXT: movzbl %al, %eax +; AVX2-NEXT: movzbl %dl, %ecx +; AVX2-NEXT: vmovd %ecx, %xmm3 +; AVX2-NEXT: vpinsrb $1, %esi, %xmm3, %xmm3 +; AVX2-NEXT: vpextrb $2, %xmm1, %edx +; AVX2-NEXT: vpextrb $2, %xmm0, %ecx +; AVX2-NEXT: addb %dl, %cl +; AVX2-NEXT: movb $-1, %dl +; AVX2-NEXT: jb .LBB1_38 +; AVX2-NEXT: # %bb.37: +; AVX2-NEXT: movl %ecx, %edx +; AVX2-NEXT: .LBB1_38: +; AVX2-NEXT: vpinsrb $3, %eax, %xmm2, %xmm2 +; AVX2-NEXT: movzbl %r9b, %eax +; AVX2-NEXT: movzbl %dl, %ecx +; AVX2-NEXT: vpinsrb $2, %ecx, %xmm3, %xmm3 +; AVX2-NEXT: vpextrb $3, %xmm1, %edx +; AVX2-NEXT: vpextrb $3, %xmm0, %ecx +; AVX2-NEXT: addb %dl, %cl +; AVX2-NEXT: movb $-1, %dl +; AVX2-NEXT: jb .LBB1_40 +; AVX2-NEXT: # %bb.39: +; AVX2-NEXT: movl %ecx, %edx +; AVX2-NEXT: .LBB1_40: +; AVX2-NEXT: vpinsrb $4, %eax, %xmm2, %xmm2 +; AVX2-NEXT: movzbl %dil, %eax +; AVX2-NEXT: movzbl %dl, %ecx +; AVX2-NEXT: vpinsrb $3, %ecx, %xmm3, %xmm3 +; AVX2-NEXT: vpextrb $4, %xmm1, %edx +; AVX2-NEXT: vpextrb $4, %xmm0, %ecx +; AVX2-NEXT: addb %dl, %cl +; AVX2-NEXT: movb $-1, %dl +; AVX2-NEXT: jb .LBB1_42 +; AVX2-NEXT: # %bb.41: +; AVX2-NEXT: movl %ecx, %edx +; AVX2-NEXT: .LBB1_42: +; AVX2-NEXT: vpinsrb $5, %eax, %xmm2, %xmm2 +; AVX2-NEXT: movzbl %r13b, %eax +; AVX2-NEXT: movzbl %dl, %ecx +; AVX2-NEXT: vpinsrb $4, %ecx, %xmm3, %xmm3 +; AVX2-NEXT: vpextrb $5, %xmm1, %edx 
+; AVX2-NEXT: vpextrb $5, %xmm0, %ecx +; AVX2-NEXT: addb %dl, %cl +; AVX2-NEXT: movb $-1, %dl +; AVX2-NEXT: jb .LBB1_44 +; AVX2-NEXT: # %bb.43: +; AVX2-NEXT: movl %ecx, %edx +; AVX2-NEXT: .LBB1_44: +; AVX2-NEXT: vpinsrb $6, %eax, %xmm2, %xmm2 +; AVX2-NEXT: movzbl %r12b, %eax +; AVX2-NEXT: movzbl %dl, %ecx +; AVX2-NEXT: vpinsrb $5, %ecx, %xmm3, %xmm3 +; AVX2-NEXT: vpextrb $6, %xmm1, %edx +; AVX2-NEXT: vpextrb $6, %xmm0, %ecx +; AVX2-NEXT: addb %dl, %cl +; AVX2-NEXT: movb $-1, %dl +; AVX2-NEXT: jb .LBB1_46 +; AVX2-NEXT: # %bb.45: +; AVX2-NEXT: movl %ecx, %edx +; AVX2-NEXT: .LBB1_46: +; AVX2-NEXT: vpinsrb $7, %eax, %xmm2, %xmm2 +; AVX2-NEXT: movzbl %r15b, %eax +; AVX2-NEXT: movzbl %dl, %ecx +; AVX2-NEXT: vpinsrb $6, %ecx, %xmm3, %xmm3 +; AVX2-NEXT: vpextrb $7, %xmm1, %edx +; AVX2-NEXT: vpextrb $7, %xmm0, %ecx +; AVX2-NEXT: addb %dl, %cl +; AVX2-NEXT: movb $-1, %dl +; AVX2-NEXT: jb .LBB1_48 +; AVX2-NEXT: # %bb.47: +; AVX2-NEXT: movl %ecx, %edx +; AVX2-NEXT: .LBB1_48: +; AVX2-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2 +; AVX2-NEXT: movzbl %r14b, %eax +; AVX2-NEXT: movzbl %dl, %ecx +; AVX2-NEXT: vpinsrb $7, %ecx, %xmm3, %xmm3 +; AVX2-NEXT: vpextrb $8, %xmm1, %edx +; AVX2-NEXT: vpextrb $8, %xmm0, %ecx +; AVX2-NEXT: addb %dl, %cl +; AVX2-NEXT: movb $-1, %dl +; AVX2-NEXT: jb .LBB1_50 +; AVX2-NEXT: # %bb.49: +; AVX2-NEXT: movl %ecx, %edx +; AVX2-NEXT: .LBB1_50: +; AVX2-NEXT: vpinsrb $9, %eax, %xmm2, %xmm2 +; AVX2-NEXT: movzbl %r8b, %eax +; AVX2-NEXT: movzbl %dl, %ecx +; AVX2-NEXT: vpinsrb $8, %ecx, %xmm3, %xmm3 +; AVX2-NEXT: vpextrb $9, %xmm1, %edx +; AVX2-NEXT: vpextrb $9, %xmm0, %ecx +; AVX2-NEXT: addb %dl, %cl +; AVX2-NEXT: movb $-1, %dl +; AVX2-NEXT: jb .LBB1_52 +; AVX2-NEXT: # %bb.51: +; AVX2-NEXT: movl %ecx, %edx +; AVX2-NEXT: .LBB1_52: +; AVX2-NEXT: vpinsrb $10, %eax, %xmm2, %xmm2 +; AVX2-NEXT: movzbl %r11b, %eax +; AVX2-NEXT: movzbl %dl, %ecx +; AVX2-NEXT: vpinsrb $9, %ecx, %xmm3, %xmm3 +; AVX2-NEXT: vpextrb $10, %xmm1, %edx +; AVX2-NEXT: vpextrb $10, %xmm0, %ecx +; AVX2-NEXT: addb %dl, %cl +; AVX2-NEXT: movb $-1, %dl +; AVX2-NEXT: jb .LBB1_54 +; AVX2-NEXT: # %bb.53: +; AVX2-NEXT: movl %ecx, %edx +; AVX2-NEXT: .LBB1_54: +; AVX2-NEXT: vpinsrb $11, %eax, %xmm2, %xmm2 +; AVX2-NEXT: movzbl %r10b, %eax +; AVX2-NEXT: movzbl %dl, %ecx +; AVX2-NEXT: vpinsrb $10, %ecx, %xmm3, %xmm3 +; AVX2-NEXT: vpextrb $11, %xmm1, %edx +; AVX2-NEXT: vpextrb $11, %xmm0, %ecx +; AVX2-NEXT: addb %dl, %cl +; AVX2-NEXT: movb $-1, %dl +; AVX2-NEXT: jb .LBB1_56 +; AVX2-NEXT: # %bb.55: +; AVX2-NEXT: movl %ecx, %edx +; AVX2-NEXT: .LBB1_56: +; AVX2-NEXT: vpinsrb $12, %eax, %xmm2, %xmm2 +; AVX2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX2-NEXT: movzbl %dl, %ecx +; AVX2-NEXT: vpinsrb $11, %ecx, %xmm3, %xmm3 +; AVX2-NEXT: vpextrb $12, %xmm1, %edx +; AVX2-NEXT: vpextrb $12, %xmm0, %ecx +; AVX2-NEXT: addb %dl, %cl +; AVX2-NEXT: movb $-1, %dl +; AVX2-NEXT: jb .LBB1_58 +; AVX2-NEXT: # %bb.57: +; AVX2-NEXT: movl %ecx, %edx +; AVX2-NEXT: .LBB1_58: +; AVX2-NEXT: vpinsrb $13, %eax, %xmm2, %xmm2 +; AVX2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX2-NEXT: movzbl %dl, %ecx +; AVX2-NEXT: vpinsrb $12, %ecx, %xmm3, %xmm3 +; AVX2-NEXT: vpextrb $13, %xmm1, %edx +; AVX2-NEXT: vpextrb $13, %xmm0, %ecx +; AVX2-NEXT: addb %dl, %cl +; AVX2-NEXT: movb $-1, %dl +; AVX2-NEXT: jb .LBB1_60 +; AVX2-NEXT: # %bb.59: +; AVX2-NEXT: movl %ecx, %edx +; AVX2-NEXT: .LBB1_60: +; AVX2-NEXT: vpinsrb $14, %eax, %xmm2, %xmm2 +; AVX2-NEXT: movzbl %bpl, %eax +; AVX2-NEXT: movzbl %dl, %ecx +; AVX2-NEXT: vpinsrb $13, 
%ecx, %xmm3, %xmm3 +; AVX2-NEXT: vpextrb $14, %xmm1, %edx +; AVX2-NEXT: vpextrb $14, %xmm0, %ecx +; AVX2-NEXT: addb %dl, %cl +; AVX2-NEXT: movb $-1, %dl +; AVX2-NEXT: jb .LBB1_62 +; AVX2-NEXT: # %bb.61: +; AVX2-NEXT: movl %ecx, %edx +; AVX2-NEXT: .LBB1_62: +; AVX2-NEXT: vpinsrb $15, %eax, %xmm2, %xmm2 +; AVX2-NEXT: movzbl %dl, %eax +; AVX2-NEXT: vpinsrb $14, %eax, %xmm3, %xmm3 +; AVX2-NEXT: vpextrb $15, %xmm1, %ecx +; AVX2-NEXT: vpextrb $15, %xmm0, %eax +; AVX2-NEXT: addb %cl, %al +; AVX2-NEXT: movb $-1, %cl +; AVX2-NEXT: jb .LBB1_64 +; AVX2-NEXT: # %bb.63: +; AVX2-NEXT: movl %eax, %ecx +; AVX2-NEXT: .LBB1_64: +; AVX2-NEXT: movzbl %cl, %eax +; AVX2-NEXT: vpinsrb $15, %eax, %xmm3, %xmm0 +; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 +; AVX2-NEXT: popq %rbx +; AVX2-NEXT: popq %r12 +; AVX2-NEXT: popq %r13 +; AVX2-NEXT: popq %r14 +; AVX2-NEXT: popq %r15 +; AVX2-NEXT: popq %rbp +; AVX2-NEXT: retq +; +; AVX512-LABEL: v32i8: +; AVX512: # %bb.0: +; AVX512-NEXT: pushq %rbp +; AVX512-NEXT: pushq %r15 +; AVX512-NEXT: pushq %r14 +; AVX512-NEXT: pushq %r13 +; AVX512-NEXT: pushq %r12 +; AVX512-NEXT: pushq %rbx +; AVX512-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX512-NEXT: vpextrb $1, %xmm2, %ecx +; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm3 +; AVX512-NEXT: vpextrb $1, %xmm3, %eax +; AVX512-NEXT: addb %cl, %al +; AVX512-NEXT: movb $-1, %bl +; AVX512-NEXT: jb .LBB1_2 +; AVX512-NEXT: # %bb.1: +; AVX512-NEXT: movl %eax, %ebx +; AVX512-NEXT: .LBB1_2: +; AVX512-NEXT: vpextrb $0, %xmm2, %ecx +; AVX512-NEXT: vpextrb $0, %xmm3, %eax +; AVX512-NEXT: addb %cl, %al +; AVX512-NEXT: movb $-1, %cl +; AVX512-NEXT: jb .LBB1_4 +; AVX512-NEXT: # %bb.3: +; AVX512-NEXT: movl %eax, %ecx +; AVX512-NEXT: .LBB1_4: +; AVX512-NEXT: vpextrb $2, %xmm2, %edx +; AVX512-NEXT: vpextrb $2, %xmm3, %eax +; AVX512-NEXT: addb %dl, %al +; AVX512-NEXT: movb $-1, %sil +; AVX512-NEXT: jb .LBB1_6 +; AVX512-NEXT: # %bb.5: +; AVX512-NEXT: movl %eax, %esi +; AVX512-NEXT: .LBB1_6: +; AVX512-NEXT: vpextrb $3, %xmm2, %eax +; AVX512-NEXT: vpextrb $3, %xmm3, %edx +; AVX512-NEXT: addb %al, %dl +; AVX512-NEXT: movb $-1, %al +; AVX512-NEXT: jb .LBB1_8 +; AVX512-NEXT: # %bb.7: +; AVX512-NEXT: movl %edx, %eax +; AVX512-NEXT: .LBB1_8: +; AVX512-NEXT: vpextrb $4, %xmm2, %edx +; AVX512-NEXT: vpextrb $4, %xmm3, %edi +; AVX512-NEXT: addb %dl, %dil +; AVX512-NEXT: movb $-1, %r9b +; AVX512-NEXT: jb .LBB1_10 +; AVX512-NEXT: # %bb.9: +; AVX512-NEXT: movl %edi, %r9d +; AVX512-NEXT: .LBB1_10: +; AVX512-NEXT: vpextrb $5, %xmm2, %edx +; AVX512-NEXT: vpextrb $5, %xmm3, %ebp +; AVX512-NEXT: addb %dl, %bpl +; AVX512-NEXT: movb $-1, %dil +; AVX512-NEXT: jb .LBB1_12 +; AVX512-NEXT: # %bb.11: +; AVX512-NEXT: movl %ebp, %edi +; AVX512-NEXT: .LBB1_12: +; AVX512-NEXT: vpextrb $6, %xmm2, %edx +; AVX512-NEXT: vpextrb $6, %xmm3, %ebp +; AVX512-NEXT: addb %dl, %bpl +; AVX512-NEXT: movb $-1, %r13b +; AVX512-NEXT: jb .LBB1_14 +; AVX512-NEXT: # %bb.13: +; AVX512-NEXT: movl %ebp, %r13d +; AVX512-NEXT: .LBB1_14: +; AVX512-NEXT: vpextrb $7, %xmm2, %edx +; AVX512-NEXT: vpextrb $7, %xmm3, %ebp +; AVX512-NEXT: addb %dl, %bpl +; AVX512-NEXT: movb $-1, %r12b +; AVX512-NEXT: jb .LBB1_16 +; AVX512-NEXT: # %bb.15: +; AVX512-NEXT: movl %ebp, %r12d +; AVX512-NEXT: .LBB1_16: +; AVX512-NEXT: vpextrb $8, %xmm2, %edx +; AVX512-NEXT: vpextrb $8, %xmm3, %ebp +; AVX512-NEXT: addb %dl, %bpl +; AVX512-NEXT: movb $-1, %r15b +; AVX512-NEXT: jb .LBB1_18 +; AVX512-NEXT: # %bb.17: +; AVX512-NEXT: movl %ebp, %r15d +; AVX512-NEXT: .LBB1_18: +; AVX512-NEXT: vpextrb $9, %xmm2, %edx +; AVX512-NEXT: vpextrb $9, 
%xmm3, %ebp +; AVX512-NEXT: addb %dl, %bpl +; AVX512-NEXT: movb $-1, %r14b +; AVX512-NEXT: jb .LBB1_20 +; AVX512-NEXT: # %bb.19: +; AVX512-NEXT: movl %ebp, %r14d +; AVX512-NEXT: .LBB1_20: +; AVX512-NEXT: vpextrb $10, %xmm2, %edx +; AVX512-NEXT: vpextrb $10, %xmm3, %ebp +; AVX512-NEXT: addb %dl, %bpl +; AVX512-NEXT: movb $-1, %r8b +; AVX512-NEXT: jb .LBB1_22 +; AVX512-NEXT: # %bb.21: +; AVX512-NEXT: movl %ebp, %r8d +; AVX512-NEXT: .LBB1_22: +; AVX512-NEXT: vpextrb $11, %xmm2, %edx +; AVX512-NEXT: vpextrb $11, %xmm3, %ebp +; AVX512-NEXT: addb %dl, %bpl +; AVX512-NEXT: movb $-1, %r11b +; AVX512-NEXT: jb .LBB1_24 +; AVX512-NEXT: # %bb.23: +; AVX512-NEXT: movl %ebp, %r11d +; AVX512-NEXT: .LBB1_24: +; AVX512-NEXT: vpextrb $12, %xmm2, %edx +; AVX512-NEXT: vpextrb $12, %xmm3, %ebp +; AVX512-NEXT: addb %dl, %bpl +; AVX512-NEXT: movb $-1, %r10b +; AVX512-NEXT: jb .LBB1_26 +; AVX512-NEXT: # %bb.25: +; AVX512-NEXT: movl %ebp, %r10d +; AVX512-NEXT: .LBB1_26: +; AVX512-NEXT: vpextrb $13, %xmm2, %edx +; AVX512-NEXT: vpextrb $13, %xmm3, %ebp +; AVX512-NEXT: addb %dl, %bpl +; AVX512-NEXT: movb $-1, %dl +; AVX512-NEXT: jb .LBB1_28 +; AVX512-NEXT: # %bb.27: +; AVX512-NEXT: movl %ebp, %edx +; AVX512-NEXT: .LBB1_28: +; AVX512-NEXT: movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; AVX512-NEXT: vpextrb $14, %xmm2, %edx +; AVX512-NEXT: vpextrb $14, %xmm3, %ebp +; AVX512-NEXT: addb %dl, %bpl +; AVX512-NEXT: movb $-1, %dl +; AVX512-NEXT: jb .LBB1_30 +; AVX512-NEXT: # %bb.29: +; AVX512-NEXT: movl %ebp, %edx +; AVX512-NEXT: .LBB1_30: +; AVX512-NEXT: movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; AVX512-NEXT: movzbl %cl, %ecx +; AVX512-NEXT: vpextrb $15, %xmm2, %edx +; AVX512-NEXT: vpextrb $15, %xmm3, %ebp +; AVX512-NEXT: addb %dl, %bpl +; AVX512-NEXT: movb $-1, %dl +; AVX512-NEXT: jb .LBB1_32 +; AVX512-NEXT: # %bb.31: +; AVX512-NEXT: movl %ebp, %edx +; AVX512-NEXT: .LBB1_32: +; AVX512-NEXT: movl %edx, %ebp +; AVX512-NEXT: movzbl %bl, %ebx +; AVX512-NEXT: vmovd %ecx, %xmm2 +; AVX512-NEXT: vpextrb $1, %xmm1, %edx +; AVX512-NEXT: vpextrb $1, %xmm0, %ecx +; AVX512-NEXT: addb %dl, %cl +; AVX512-NEXT: movb $-1, %dl +; AVX512-NEXT: jb .LBB1_34 +; AVX512-NEXT: # %bb.33: +; AVX512-NEXT: movl %ecx, %edx +; AVX512-NEXT: .LBB1_34: +; AVX512-NEXT: vpinsrb $1, %ebx, %xmm2, %xmm2 +; AVX512-NEXT: movzbl %sil, %ecx +; AVX512-NEXT: movzbl %dl, %esi +; AVX512-NEXT: vpextrb $0, %xmm1, %edx +; AVX512-NEXT: vpextrb $0, %xmm0, %ebx +; AVX512-NEXT: addb %dl, %bl +; AVX512-NEXT: movb $-1, %dl +; AVX512-NEXT: jb .LBB1_36 +; AVX512-NEXT: # %bb.35: +; AVX512-NEXT: movl %ebx, %edx +; AVX512-NEXT: .LBB1_36: +; AVX512-NEXT: vpinsrb $2, %ecx, %xmm2, %xmm2 +; AVX512-NEXT: movzbl %al, %eax +; AVX512-NEXT: movzbl %dl, %ecx +; AVX512-NEXT: vmovd %ecx, %xmm3 +; AVX512-NEXT: vpinsrb $1, %esi, %xmm3, %xmm3 +; AVX512-NEXT: vpextrb $2, %xmm1, %edx +; AVX512-NEXT: vpextrb $2, %xmm0, %ecx +; AVX512-NEXT: addb %dl, %cl +; AVX512-NEXT: movb $-1, %dl +; AVX512-NEXT: jb .LBB1_38 +; AVX512-NEXT: # %bb.37: +; AVX512-NEXT: movl %ecx, %edx +; AVX512-NEXT: .LBB1_38: +; AVX512-NEXT: vpinsrb $3, %eax, %xmm2, %xmm2 +; AVX512-NEXT: movzbl %r9b, %eax +; AVX512-NEXT: movzbl %dl, %ecx +; AVX512-NEXT: vpinsrb $2, %ecx, %xmm3, %xmm3 +; AVX512-NEXT: vpextrb $3, %xmm1, %edx +; AVX512-NEXT: vpextrb $3, %xmm0, %ecx +; AVX512-NEXT: addb %dl, %cl +; AVX512-NEXT: movb $-1, %dl +; AVX512-NEXT: jb .LBB1_40 +; AVX512-NEXT: # %bb.39: +; AVX512-NEXT: movl %ecx, %edx +; AVX512-NEXT: .LBB1_40: +; AVX512-NEXT: vpinsrb $4, %eax, %xmm2, %xmm2 +; AVX512-NEXT: movzbl %dil, %eax +; 
AVX512-NEXT: movzbl %dl, %ecx +; AVX512-NEXT: vpinsrb $3, %ecx, %xmm3, %xmm3 +; AVX512-NEXT: vpextrb $4, %xmm1, %edx +; AVX512-NEXT: vpextrb $4, %xmm0, %ecx +; AVX512-NEXT: addb %dl, %cl +; AVX512-NEXT: movb $-1, %dl +; AVX512-NEXT: jb .LBB1_42 +; AVX512-NEXT: # %bb.41: +; AVX512-NEXT: movl %ecx, %edx +; AVX512-NEXT: .LBB1_42: +; AVX512-NEXT: vpinsrb $5, %eax, %xmm2, %xmm2 +; AVX512-NEXT: movzbl %r13b, %eax +; AVX512-NEXT: movzbl %dl, %ecx +; AVX512-NEXT: vpinsrb $4, %ecx, %xmm3, %xmm3 +; AVX512-NEXT: vpextrb $5, %xmm1, %edx +; AVX512-NEXT: vpextrb $5, %xmm0, %ecx +; AVX512-NEXT: addb %dl, %cl +; AVX512-NEXT: movb $-1, %dl +; AVX512-NEXT: jb .LBB1_44 +; AVX512-NEXT: # %bb.43: +; AVX512-NEXT: movl %ecx, %edx +; AVX512-NEXT: .LBB1_44: +; AVX512-NEXT: vpinsrb $6, %eax, %xmm2, %xmm2 +; AVX512-NEXT: movzbl %r12b, %eax +; AVX512-NEXT: movzbl %dl, %ecx +; AVX512-NEXT: vpinsrb $5, %ecx, %xmm3, %xmm3 +; AVX512-NEXT: vpextrb $6, %xmm1, %edx +; AVX512-NEXT: vpextrb $6, %xmm0, %ecx +; AVX512-NEXT: addb %dl, %cl +; AVX512-NEXT: movb $-1, %dl +; AVX512-NEXT: jb .LBB1_46 +; AVX512-NEXT: # %bb.45: +; AVX512-NEXT: movl %ecx, %edx +; AVX512-NEXT: .LBB1_46: +; AVX512-NEXT: vpinsrb $7, %eax, %xmm2, %xmm2 +; AVX512-NEXT: movzbl %r15b, %eax +; AVX512-NEXT: movzbl %dl, %ecx +; AVX512-NEXT: vpinsrb $6, %ecx, %xmm3, %xmm3 +; AVX512-NEXT: vpextrb $7, %xmm1, %edx +; AVX512-NEXT: vpextrb $7, %xmm0, %ecx +; AVX512-NEXT: addb %dl, %cl +; AVX512-NEXT: movb $-1, %dl +; AVX512-NEXT: jb .LBB1_48 +; AVX512-NEXT: # %bb.47: +; AVX512-NEXT: movl %ecx, %edx +; AVX512-NEXT: .LBB1_48: +; AVX512-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2 +; AVX512-NEXT: movzbl %r14b, %eax +; AVX512-NEXT: movzbl %dl, %ecx +; AVX512-NEXT: vpinsrb $7, %ecx, %xmm3, %xmm3 +; AVX512-NEXT: vpextrb $8, %xmm1, %edx +; AVX512-NEXT: vpextrb $8, %xmm0, %ecx +; AVX512-NEXT: addb %dl, %cl +; AVX512-NEXT: movb $-1, %dl +; AVX512-NEXT: jb .LBB1_50 +; AVX512-NEXT: # %bb.49: +; AVX512-NEXT: movl %ecx, %edx +; AVX512-NEXT: .LBB1_50: +; AVX512-NEXT: vpinsrb $9, %eax, %xmm2, %xmm2 +; AVX512-NEXT: movzbl %r8b, %eax +; AVX512-NEXT: movzbl %dl, %ecx +; AVX512-NEXT: vpinsrb $8, %ecx, %xmm3, %xmm3 +; AVX512-NEXT: vpextrb $9, %xmm1, %edx +; AVX512-NEXT: vpextrb $9, %xmm0, %ecx +; AVX512-NEXT: addb %dl, %cl +; AVX512-NEXT: movb $-1, %dl +; AVX512-NEXT: jb .LBB1_52 +; AVX512-NEXT: # %bb.51: +; AVX512-NEXT: movl %ecx, %edx +; AVX512-NEXT: .LBB1_52: +; AVX512-NEXT: vpinsrb $10, %eax, %xmm2, %xmm2 +; AVX512-NEXT: movzbl %r11b, %eax +; AVX512-NEXT: movzbl %dl, %ecx +; AVX512-NEXT: vpinsrb $9, %ecx, %xmm3, %xmm3 +; AVX512-NEXT: vpextrb $10, %xmm1, %edx +; AVX512-NEXT: vpextrb $10, %xmm0, %ecx +; AVX512-NEXT: addb %dl, %cl +; AVX512-NEXT: movb $-1, %dl +; AVX512-NEXT: jb .LBB1_54 +; AVX512-NEXT: # %bb.53: +; AVX512-NEXT: movl %ecx, %edx +; AVX512-NEXT: .LBB1_54: +; AVX512-NEXT: vpinsrb $11, %eax, %xmm2, %xmm2 +; AVX512-NEXT: movzbl %r10b, %eax +; AVX512-NEXT: movzbl %dl, %ecx +; AVX512-NEXT: vpinsrb $10, %ecx, %xmm3, %xmm3 +; AVX512-NEXT: vpextrb $11, %xmm1, %edx +; AVX512-NEXT: vpextrb $11, %xmm0, %ecx +; AVX512-NEXT: addb %dl, %cl +; AVX512-NEXT: movb $-1, %dl +; AVX512-NEXT: jb .LBB1_56 +; AVX512-NEXT: # %bb.55: +; AVX512-NEXT: movl %ecx, %edx +; AVX512-NEXT: .LBB1_56: +; AVX512-NEXT: vpinsrb $12, %eax, %xmm2, %xmm2 +; AVX512-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX512-NEXT: movzbl %dl, %ecx +; AVX512-NEXT: vpinsrb $11, %ecx, %xmm3, %xmm3 +; AVX512-NEXT: vpextrb $12, %xmm1, %edx +; AVX512-NEXT: vpextrb $12, %xmm0, %ecx +; AVX512-NEXT: addb %dl, %cl +; 
AVX512-NEXT: movb $-1, %dl +; AVX512-NEXT: jb .LBB1_58 +; AVX512-NEXT: # %bb.57: +; AVX512-NEXT: movl %ecx, %edx +; AVX512-NEXT: .LBB1_58: +; AVX512-NEXT: vpinsrb $13, %eax, %xmm2, %xmm2 +; AVX512-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX512-NEXT: movzbl %dl, %ecx +; AVX512-NEXT: vpinsrb $12, %ecx, %xmm3, %xmm3 +; AVX512-NEXT: vpextrb $13, %xmm1, %edx +; AVX512-NEXT: vpextrb $13, %xmm0, %ecx +; AVX512-NEXT: addb %dl, %cl +; AVX512-NEXT: movb $-1, %dl +; AVX512-NEXT: jb .LBB1_60 +; AVX512-NEXT: # %bb.59: +; AVX512-NEXT: movl %ecx, %edx +; AVX512-NEXT: .LBB1_60: +; AVX512-NEXT: vpinsrb $14, %eax, %xmm2, %xmm2 +; AVX512-NEXT: movzbl %bpl, %eax +; AVX512-NEXT: movzbl %dl, %ecx +; AVX512-NEXT: vpinsrb $13, %ecx, %xmm3, %xmm3 +; AVX512-NEXT: vpextrb $14, %xmm1, %edx +; AVX512-NEXT: vpextrb $14, %xmm0, %ecx +; AVX512-NEXT: addb %dl, %cl +; AVX512-NEXT: movb $-1, %dl +; AVX512-NEXT: jb .LBB1_62 +; AVX512-NEXT: # %bb.61: +; AVX512-NEXT: movl %ecx, %edx +; AVX512-NEXT: .LBB1_62: +; AVX512-NEXT: vpinsrb $15, %eax, %xmm2, %xmm2 +; AVX512-NEXT: movzbl %dl, %eax +; AVX512-NEXT: vpinsrb $14, %eax, %xmm3, %xmm3 +; AVX512-NEXT: vpextrb $15, %xmm1, %ecx +; AVX512-NEXT: vpextrb $15, %xmm0, %eax +; AVX512-NEXT: addb %cl, %al +; AVX512-NEXT: movb $-1, %cl +; AVX512-NEXT: jb .LBB1_64 +; AVX512-NEXT: # %bb.63: +; AVX512-NEXT: movl %eax, %ecx +; AVX512-NEXT: .LBB1_64: +; AVX512-NEXT: movzbl %cl, %eax +; AVX512-NEXT: vpinsrb $15, %eax, %xmm3, %xmm0 +; AVX512-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 +; AVX512-NEXT: popq %rbx +; AVX512-NEXT: popq %r12 +; AVX512-NEXT: popq %r13 +; AVX512-NEXT: popq %r14 +; AVX512-NEXT: popq %r15 +; AVX512-NEXT: popq %rbp +; AVX512-NEXT: retq + %z = call <32 x i8> @llvm.uadd.sat.v32i8(<32 x i8> %x, <32 x i8> %y) + ret <32 x i8> %z +} + +define <64 x i8> @v64i8(<64 x i8> %x, <64 x i8> %y) nounwind { +; SSE2-LABEL: v64i8: +; SSE2: # %bb.0: +; SSE2-NEXT: pushq %rbp +; SSE2-NEXT: pushq %r15 +; SSE2-NEXT: pushq %r14 +; SSE2-NEXT: pushq %r13 +; SSE2-NEXT: pushq %r12 +; SSE2-NEXT: pushq %rbx +; SSE2-NEXT: subq $648, %rsp # imm = 0x288 +; SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; SSE2-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp) +; SSE2-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) +; SSE2-NEXT: movaps %xmm5, -{{[0-9]+}}(%rsp) +; SSE2-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) +; SSE2-NEXT: movaps %xmm6, -{{[0-9]+}}(%rsp) +; SSE2-NEXT: movaps %xmm3, -{{[0-9]+}}(%rsp) +; SSE2-NEXT: movaps %xmm7, (%rsp) +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSE2-NEXT: addb -{{[0-9]+}}(%rsp), %al +; SSE2-NEXT: movb $-1, %cl +; SSE2-NEXT: jb .LBB2_2 +; SSE2-NEXT: # %bb.1: +; SSE2-NEXT: movl %eax, %ecx +; SSE2-NEXT: .LBB2_2: +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSE2-NEXT: addb -{{[0-9]+}}(%rsp), %al +; SSE2-NEXT: movb $-1, %dil +; SSE2-NEXT: jb .LBB2_4 +; SSE2-NEXT: # %bb.3: +; SSE2-NEXT: movl %eax, %edi +; SSE2-NEXT: .LBB2_4: +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSE2-NEXT: addb -{{[0-9]+}}(%rsp), %al +; SSE2-NEXT: movb $-1, %r8b +; SSE2-NEXT: jb .LBB2_6 +; SSE2-NEXT: # %bb.5: +; SSE2-NEXT: movl %eax, %r8d +; SSE2-NEXT: .LBB2_6: +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSE2-NEXT: addb -{{[0-9]+}}(%rsp), %al +; SSE2-NEXT: movb $-1, %r9b +; SSE2-NEXT: jb .LBB2_8 +; SSE2-NEXT: # %bb.7: +; SSE2-NEXT: movl %eax, %r9d +; SSE2-NEXT: .LBB2_8: +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSE2-NEXT: addb -{{[0-9]+}}(%rsp), %al +; SSE2-NEXT: movb $-1, %r10b +; SSE2-NEXT: jb .LBB2_10 +; SSE2-NEXT: # %bb.9: +; SSE2-NEXT: movl %eax, %r10d +; SSE2-NEXT: .LBB2_10: +; SSE2-NEXT: movb 
-{{[0-9]+}}(%rsp), %al +; SSE2-NEXT: addb -{{[0-9]+}}(%rsp), %al +; SSE2-NEXT: movb $-1, %r11b +; SSE2-NEXT: jb .LBB2_12 +; SSE2-NEXT: # %bb.11: +; SSE2-NEXT: movl %eax, %r11d +; SSE2-NEXT: .LBB2_12: +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSE2-NEXT: addb -{{[0-9]+}}(%rsp), %al +; SSE2-NEXT: movb $-1, %bpl +; SSE2-NEXT: jb .LBB2_14 +; SSE2-NEXT: # %bb.13: +; SSE2-NEXT: movl %eax, %ebp +; SSE2-NEXT: .LBB2_14: +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSE2-NEXT: addb -{{[0-9]+}}(%rsp), %al +; SSE2-NEXT: movb $-1, %r14b +; SSE2-NEXT: jb .LBB2_16 +; SSE2-NEXT: # %bb.15: +; SSE2-NEXT: movl %eax, %r14d +; SSE2-NEXT: .LBB2_16: +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSE2-NEXT: addb -{{[0-9]+}}(%rsp), %al +; SSE2-NEXT: movb $-1, %dl +; SSE2-NEXT: jb .LBB2_18 +; SSE2-NEXT: # %bb.17: +; SSE2-NEXT: movl %eax, %edx +; SSE2-NEXT: .LBB2_18: +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSE2-NEXT: addb -{{[0-9]+}}(%rsp), %al +; SSE2-NEXT: movb $-1, %r15b +; SSE2-NEXT: jb .LBB2_20 +; SSE2-NEXT: # %bb.19: +; SSE2-NEXT: movl %eax, %r15d +; SSE2-NEXT: .LBB2_20: +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSE2-NEXT: addb -{{[0-9]+}}(%rsp), %al +; SSE2-NEXT: movb $-1, %r12b +; SSE2-NEXT: jb .LBB2_22 +; SSE2-NEXT: # %bb.21: +; SSE2-NEXT: movl %eax, %r12d +; SSE2-NEXT: .LBB2_22: +; SSE2-NEXT: movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSE2-NEXT: addb -{{[0-9]+}}(%rsp), %al +; SSE2-NEXT: movb $-1, %dl +; SSE2-NEXT: jb .LBB2_24 +; SSE2-NEXT: # %bb.23: +; SSE2-NEXT: movl %eax, %edx +; SSE2-NEXT: .LBB2_24: +; SSE2-NEXT: movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSE2-NEXT: addb -{{[0-9]+}}(%rsp), %al +; SSE2-NEXT: movb $-1, %dl +; SSE2-NEXT: jb .LBB2_26 +; SSE2-NEXT: # %bb.25: +; SSE2-NEXT: movl %eax, %edx +; SSE2-NEXT: .LBB2_26: +; SSE2-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSE2-NEXT: addb -{{[0-9]+}}(%rsp), %al +; SSE2-NEXT: movb $-1, %sil +; SSE2-NEXT: jb .LBB2_28 +; SSE2-NEXT: # %bb.27: +; SSE2-NEXT: movl %eax, %esi +; SSE2-NEXT: .LBB2_28: +; SSE2-NEXT: movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSE2-NEXT: addb -{{[0-9]+}}(%rsp), %cl +; SSE2-NEXT: movb $-1, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Folded Spill +; SSE2-NEXT: jb .LBB2_30 +; SSE2-NEXT: # %bb.29: +; SSE2-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSE2-NEXT: .LBB2_30: +; SSE2-NEXT: addb -{{[0-9]+}}(%rsp), %al +; SSE2-NEXT: movb $-1, %dl +; SSE2-NEXT: jb .LBB2_32 +; SSE2-NEXT: # %bb.31: +; SSE2-NEXT: movl %eax, %edx +; SSE2-NEXT: .LBB2_32: +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSE2-NEXT: addb -{{[0-9]+}}(%rsp), %al +; SSE2-NEXT: movb $-1, %cl +; SSE2-NEXT: jb .LBB2_34 +; SSE2-NEXT: # %bb.33: +; SSE2-NEXT: movl %eax, %ecx +; SSE2-NEXT: .LBB2_34: +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSE2-NEXT: addb -{{[0-9]+}}(%rsp), %al +; SSE2-NEXT: movb $-1, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Folded Spill +; SSE2-NEXT: jb .LBB2_36 +; SSE2-NEXT: # %bb.35: +; SSE2-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSE2-NEXT: .LBB2_36: +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSE2-NEXT: addb -{{[0-9]+}}(%rsp), %al +; SSE2-NEXT: movb $-1, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Folded Spill +; SSE2-NEXT: jb .LBB2_38 +; SSE2-NEXT: # %bb.37: +; SSE2-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSE2-NEXT: .LBB2_38: +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al +; 
SSE2-NEXT: addb -{{[0-9]+}}(%rsp), %al +; SSE2-NEXT: movb $-1, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Folded Spill +; SSE2-NEXT: jb .LBB2_40 +; SSE2-NEXT: # %bb.39: +; SSE2-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSE2-NEXT: .LBB2_40: +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSE2-NEXT: addb -{{[0-9]+}}(%rsp), %al +; SSE2-NEXT: movb $-1, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Folded Spill +; SSE2-NEXT: jb .LBB2_42 +; SSE2-NEXT: # %bb.41: +; SSE2-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSE2-NEXT: .LBB2_42: +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSE2-NEXT: addb -{{[0-9]+}}(%rsp), %al +; SSE2-NEXT: movb $-1, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Folded Spill +; SSE2-NEXT: jb .LBB2_44 +; SSE2-NEXT: # %bb.43: +; SSE2-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSE2-NEXT: .LBB2_44: +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSE2-NEXT: addb -{{[0-9]+}}(%rsp), %al +; SSE2-NEXT: movb $-1, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Folded Spill +; SSE2-NEXT: jb .LBB2_46 +; SSE2-NEXT: # %bb.45: +; SSE2-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSE2-NEXT: .LBB2_46: +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSE2-NEXT: addb -{{[0-9]+}}(%rsp), %al +; SSE2-NEXT: movb $-1, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Folded Spill +; SSE2-NEXT: jb .LBB2_48 +; SSE2-NEXT: # %bb.47: +; SSE2-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSE2-NEXT: .LBB2_48: +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSE2-NEXT: addb -{{[0-9]+}}(%rsp), %al +; SSE2-NEXT: movb $-1, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Folded Spill +; SSE2-NEXT: jb .LBB2_50 +; SSE2-NEXT: # %bb.49: +; SSE2-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSE2-NEXT: .LBB2_50: +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSE2-NEXT: addb -{{[0-9]+}}(%rsp), %al +; SSE2-NEXT: movb $-1, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Folded Spill +; SSE2-NEXT: jb .LBB2_52 +; SSE2-NEXT: # %bb.51: +; SSE2-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSE2-NEXT: .LBB2_52: +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSE2-NEXT: addb -{{[0-9]+}}(%rsp), %al +; SSE2-NEXT: movb $-1, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Folded Spill +; SSE2-NEXT: jb .LBB2_54 +; SSE2-NEXT: # %bb.53: +; SSE2-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSE2-NEXT: .LBB2_54: +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSE2-NEXT: addb -{{[0-9]+}}(%rsp), %al +; SSE2-NEXT: movb $-1, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Folded Spill +; SSE2-NEXT: jb .LBB2_56 +; SSE2-NEXT: # %bb.55: +; SSE2-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSE2-NEXT: .LBB2_56: +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSE2-NEXT: addb -{{[0-9]+}}(%rsp), %al +; SSE2-NEXT: movb $-1, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Folded Spill +; SSE2-NEXT: jb .LBB2_58 +; SSE2-NEXT: # %bb.57: +; SSE2-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSE2-NEXT: .LBB2_58: +; SSE2-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSE2-NEXT: movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSE2-NEXT: addb -{{[0-9]+}}(%rsp), %al +; SSE2-NEXT: movb $-1, %cl +; SSE2-NEXT: jb .LBB2_60 +; SSE2-NEXT: # %bb.59: +; SSE2-NEXT: movl %eax, %ecx +; SSE2-NEXT: .LBB2_60: +; SSE2-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSE2-NEXT: addb -{{[0-9]+}}(%rsp), %cl +; SSE2-NEXT: movb $-1, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Folded Spill +; SSE2-NEXT: jb .LBB2_62 +; SSE2-NEXT: # %bb.61: +; SSE2-NEXT: 
movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSE2-NEXT: .LBB2_62: +; SSE2-NEXT: addb -{{[0-9]+}}(%rsp), %al +; SSE2-NEXT: movb $-1, %dl +; SSE2-NEXT: jb .LBB2_64 +; SSE2-NEXT: # %bb.63: +; SSE2-NEXT: movl %eax, %edx +; SSE2-NEXT: .LBB2_64: +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSE2-NEXT: addb -{{[0-9]+}}(%rsp), %al +; SSE2-NEXT: movb $-1, %cl +; SSE2-NEXT: jb .LBB2_66 +; SSE2-NEXT: # %bb.65: +; SSE2-NEXT: movl %eax, %ecx +; SSE2-NEXT: .LBB2_66: +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSE2-NEXT: addb -{{[0-9]+}}(%rsp), %al +; SSE2-NEXT: movb $-1, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Folded Spill +; SSE2-NEXT: jb .LBB2_68 +; SSE2-NEXT: # %bb.67: +; SSE2-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSE2-NEXT: .LBB2_68: +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSE2-NEXT: addb -{{[0-9]+}}(%rsp), %al +; SSE2-NEXT: movb $-1, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Folded Spill +; SSE2-NEXT: jb .LBB2_70 +; SSE2-NEXT: # %bb.69: +; SSE2-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSE2-NEXT: .LBB2_70: +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSE2-NEXT: addb -{{[0-9]+}}(%rsp), %al +; SSE2-NEXT: movb $-1, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Folded Spill +; SSE2-NEXT: jb .LBB2_72 +; SSE2-NEXT: # %bb.71: +; SSE2-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSE2-NEXT: .LBB2_72: +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSE2-NEXT: addb -{{[0-9]+}}(%rsp), %al +; SSE2-NEXT: movb $-1, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Folded Spill +; SSE2-NEXT: jb .LBB2_74 +; SSE2-NEXT: # %bb.73: +; SSE2-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSE2-NEXT: .LBB2_74: +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSE2-NEXT: addb -{{[0-9]+}}(%rsp), %al +; SSE2-NEXT: movb $-1, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Folded Spill +; SSE2-NEXT: jb .LBB2_76 +; SSE2-NEXT: # %bb.75: +; SSE2-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSE2-NEXT: .LBB2_76: +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSE2-NEXT: addb -{{[0-9]+}}(%rsp), %al +; SSE2-NEXT: movb $-1, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Folded Spill +; SSE2-NEXT: jb .LBB2_78 +; SSE2-NEXT: # %bb.77: +; SSE2-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSE2-NEXT: .LBB2_78: +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSE2-NEXT: addb -{{[0-9]+}}(%rsp), %al +; SSE2-NEXT: movb $-1, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Folded Spill +; SSE2-NEXT: jb .LBB2_80 +; SSE2-NEXT: # %bb.79: +; SSE2-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSE2-NEXT: .LBB2_80: +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSE2-NEXT: addb -{{[0-9]+}}(%rsp), %al +; SSE2-NEXT: movb $-1, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Folded Spill +; SSE2-NEXT: jb .LBB2_82 +; SSE2-NEXT: # %bb.81: +; SSE2-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSE2-NEXT: .LBB2_82: +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSE2-NEXT: addb -{{[0-9]+}}(%rsp), %al +; SSE2-NEXT: movb $-1, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Folded Spill +; SSE2-NEXT: jb .LBB2_84 +; SSE2-NEXT: # %bb.83: +; SSE2-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSE2-NEXT: .LBB2_84: +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSE2-NEXT: addb -{{[0-9]+}}(%rsp), %al +; SSE2-NEXT: movb $-1, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Folded Spill +; SSE2-NEXT: jb .LBB2_86 +; SSE2-NEXT: # %bb.85: +; SSE2-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSE2-NEXT: .LBB2_86: +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSE2-NEXT: addb -{{[0-9]+}}(%rsp), %al +; SSE2-NEXT: movb $-1, %r13b +; SSE2-NEXT: jb .LBB2_88 +; SSE2-NEXT: # 
%bb.87: +; SSE2-NEXT: movl %eax, %r13d +; SSE2-NEXT: .LBB2_88: +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSE2-NEXT: addb -{{[0-9]+}}(%rsp), %al +; SSE2-NEXT: movb $-1, %bl +; SSE2-NEXT: jb .LBB2_90 +; SSE2-NEXT: # %bb.89: +; SSE2-NEXT: movl %eax, %ebx +; SSE2-NEXT: .LBB2_90: +; SSE2-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSE2-NEXT: addb -{{[0-9]+}}(%rsp), %al +; SSE2-NEXT: movb $-1, %cl +; SSE2-NEXT: jb .LBB2_92 +; SSE2-NEXT: # %bb.91: +; SSE2-NEXT: movl %eax, %ecx +; SSE2-NEXT: .LBB2_92: +; SSE2-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSE2-NEXT: addb -{{[0-9]+}}(%rsp), %cl +; SSE2-NEXT: movb $-1, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Folded Spill +; SSE2-NEXT: jb .LBB2_94 +; SSE2-NEXT: # %bb.93: +; SSE2-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSE2-NEXT: .LBB2_94: +; SSE2-NEXT: movb %bl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSE2-NEXT: addb -{{[0-9]+}}(%rsp), %al +; SSE2-NEXT: movb $-1, %cl +; SSE2-NEXT: jb .LBB2_96 +; SSE2-NEXT: # %bb.95: +; SSE2-NEXT: movl %eax, %ecx +; SSE2-NEXT: .LBB2_96: +; SSE2-NEXT: movb %r13b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSE2-NEXT: addb {{[0-9]+}}(%rsp), %al +; SSE2-NEXT: movb $-1, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Folded Spill +; SSE2-NEXT: movl %esi, %r13d +; SSE2-NEXT: jb .LBB2_98 +; SSE2-NEXT: # %bb.97: +; SSE2-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSE2-NEXT: .LBB2_98: +; SSE2-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSE2-NEXT: addb {{[0-9]+}}(%rsp), %al +; SSE2-NEXT: movb $-1, %sil +; SSE2-NEXT: jb .LBB2_100 +; SSE2-NEXT: # %bb.99: +; SSE2-NEXT: movl %eax, %esi +; SSE2-NEXT: .LBB2_100: +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSE2-NEXT: addb {{[0-9]+}}(%rsp), %al +; SSE2-NEXT: movb $-1, %cl +; SSE2-NEXT: jb .LBB2_102 +; SSE2-NEXT: # %bb.101: +; SSE2-NEXT: movl %eax, %ecx +; SSE2-NEXT: .LBB2_102: +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSE2-NEXT: addb {{[0-9]+}}(%rsp), %al +; SSE2-NEXT: movb $-1, %bl +; SSE2-NEXT: jb .LBB2_104 +; SSE2-NEXT: # %bb.103: +; SSE2-NEXT: movl %eax, %ebx +; SSE2-NEXT: .LBB2_104: +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSE2-NEXT: addb {{[0-9]+}}(%rsp), %al +; SSE2-NEXT: movb $-1, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Folded Spill +; SSE2-NEXT: jb .LBB2_106 +; SSE2-NEXT: # %bb.105: +; SSE2-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSE2-NEXT: .LBB2_106: +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSE2-NEXT: addb {{[0-9]+}}(%rsp), %al +; SSE2-NEXT: movb $-1, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Folded Spill +; SSE2-NEXT: jb .LBB2_108 +; SSE2-NEXT: # %bb.107: +; SSE2-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSE2-NEXT: .LBB2_108: +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSE2-NEXT: addb {{[0-9]+}}(%rsp), %al +; SSE2-NEXT: movb $-1, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Folded Spill +; SSE2-NEXT: jb .LBB2_110 +; SSE2-NEXT: # %bb.109: +; SSE2-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSE2-NEXT: .LBB2_110: +; SSE2-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSE2-NEXT: addb {{[0-9]+}}(%rsp), %al +; SSE2-NEXT: movb $-1, %cl +; SSE2-NEXT: jb .LBB2_112 +; SSE2-NEXT: # %bb.111: +; SSE2-NEXT: movl %eax, %ecx +; SSE2-NEXT: .LBB2_112: +; SSE2-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSE2-NEXT: movb 
-{{[0-9]+}}(%rsp), %al +; SSE2-NEXT: addb {{[0-9]+}}(%rsp), %al +; SSE2-NEXT: movb $-1, %cl +; SSE2-NEXT: jb .LBB2_114 +; SSE2-NEXT: # %bb.113: +; SSE2-NEXT: movl %eax, %ecx +; SSE2-NEXT: .LBB2_114: +; SSE2-NEXT: movb %bl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSE2-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSE2-NEXT: movb %sil, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSE2-NEXT: addb {{[0-9]+}}(%rsp), %al +; SSE2-NEXT: movb $-1, %cl +; SSE2-NEXT: jb .LBB2_116 +; SSE2-NEXT: # %bb.115: +; SSE2-NEXT: movl %eax, %ecx +; SSE2-NEXT: .LBB2_116: +; SSE2-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE2-NEXT: movzbl %dil, %eax +; SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE2-NEXT: movzbl %r8b, %eax +; SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE2-NEXT: movzbl %r9b, %edi +; SSE2-NEXT: movzbl %r10b, %r8d +; SSE2-NEXT: movzbl %r11b, %r9d +; SSE2-NEXT: movzbl %bpl, %r10d +; SSE2-NEXT: movzbl %r14b, %r11d +; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ebx # 1-byte Folded Reload +; SSE2-NEXT: movzbl %r15b, %ebp +; SSE2-NEXT: movzbl %r12b, %r14d +; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r15d # 1-byte Folded Reload +; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r12d # 1-byte Folded Reload +; SSE2-NEXT: movzbl %r13b, %r13d +; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload +; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 1-byte Folded Reload +; SSE2-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 1-byte Folded Reload +; SSE2-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 1-byte Folded Reload +; SSE2-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 1-byte Folded Reload +; SSE2-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 1-byte Folded Reload +; SSE2-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 1-byte Folded Reload +; SSE2-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 1-byte Folded Reload +; SSE2-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 1-byte Folded Reload +; SSE2-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 1-byte Folded Reload +; SSE2-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 1-byte Folded Reload +; SSE2-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 1-byte Folded Reload +; SSE2-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 1-byte Folded Reload +; SSE2-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 1-byte Folded Reload +; SSE2-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 1-byte Folded Reload +; 
SSE2-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 1-byte Folded Reload +; SSE2-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE2-NEXT: movzbl %dl, %edx +; SSE2-NEXT: movl %edx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %dl +; SSE2-NEXT: addb {{[0-9]+}}(%rsp), %dl +; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 1-byte Folded Reload +; SSE2-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 1-byte Folded Reload +; SSE2-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 1-byte Folded Reload +; SSE2-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 1-byte Folded Reload +; SSE2-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 1-byte Folded Reload +; SSE2-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 1-byte Folded Reload +; SSE2-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 1-byte Folded Reload +; SSE2-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 1-byte Folded Reload +; SSE2-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 1-byte Folded Reload +; SSE2-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 1-byte Folded Reload +; SSE2-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 1-byte Folded Reload +; SSE2-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 1-byte Folded Reload +; SSE2-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 1-byte Folded Reload +; SSE2-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 1-byte Folded Reload +; SSE2-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 1-byte Folded Reload +; SSE2-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 1-byte Folded Reload +; SSE2-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE2-NEXT: movb $-1, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Folded Spill +; SSE2-NEXT: jb .LBB2_118 +; SSE2-NEXT: # %bb.117: +; SSE2-NEXT: movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSE2-NEXT: .LBB2_118: +; SSE2-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload +; SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero +; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE2-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload +; SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero +; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE2-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload +; SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero +; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE2-NEXT: movd %edi, %xmm3 +; SSE2-NEXT: movd %r8d, %xmm0 +; SSE2-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE2-NEXT: movd %r9d, %xmm0 +; SSE2-NEXT: 
movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE2-NEXT: movd %r10d, %xmm0 +; SSE2-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE2-NEXT: movd %r11d, %xmm0 +; SSE2-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE2-NEXT: movd %ebx, %xmm0 +; SSE2-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE2-NEXT: movd %ebp, %xmm0 +; SSE2-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE2-NEXT: movd %r14d, %xmm0 +; SSE2-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE2-NEXT: movd %r15d, %xmm0 +; SSE2-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE2-NEXT: movd %r12d, %xmm0 +; SSE2-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE2-NEXT: movd %r13d, %xmm0 +; SSE2-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE2-NEXT: movd %eax, %xmm0 +; SSE2-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE2-NEXT: movd %ecx, %xmm0 +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSE2-NEXT: addb {{[0-9]+}}(%rsp), %cl +; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 1-byte Folded Reload +; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %edx # 1-byte Folded Reload +; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 1-byte Folded Reload +; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ebp # 1-byte Folded Reload +; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r10d # 1-byte Folded Reload +; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r9d # 1-byte Folded Reload +; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r8d # 1-byte Folded Reload +; SSE2-NEXT: movb $-1, %bl +; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 4-byte Folded Reload +; SSE2-NEXT: # xmm11 = mem[0],zero,zero,zero +; SSE2-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Reload +; SSE2-NEXT: # xmm1 = mem[0],zero,zero,zero +; SSE2-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE2-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Reload +; SSE2-NEXT: # xmm1 = mem[0],zero,zero,zero +; SSE2-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 4-byte Folded Reload +; SSE2-NEXT: # xmm15 = mem[0],zero,zero,zero +; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 4-byte Folded Reload +; SSE2-NEXT: # xmm9 = mem[0],zero,zero,zero +; SSE2-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Reload +; SSE2-NEXT: # xmm1 = mem[0],zero,zero,zero +; SSE2-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE2-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Reload +; SSE2-NEXT: # xmm1 = mem[0],zero,zero,zero +; SSE2-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 4-byte Folded Reload +; SSE2-NEXT: # xmm14 = mem[0],zero,zero,zero +; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 4-byte Folded Reload +; SSE2-NEXT: # xmm5 = mem[0],zero,zero,zero +; SSE2-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Reload +; SSE2-NEXT: # xmm1 = mem[0],zero,zero,zero +; SSE2-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 4-byte Folded Reload +; SSE2-NEXT: # xmm6 = mem[0],zero,zero,zero +; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 4-byte Folded Reload +; SSE2-NEXT: # xmm13 = mem[0],zero,zero,zero +; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 4-byte Folded Reload +; SSE2-NEXT: # xmm10 = 
mem[0],zero,zero,zero +; SSE2-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Reload +; SSE2-NEXT: # xmm1 = mem[0],zero,zero,zero +; SSE2-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE2-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Reload +; SSE2-NEXT: # xmm1 = mem[0],zero,zero,zero +; SSE2-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Folded Reload +; SSE2-NEXT: # xmm1 = mem[0],zero,zero,zero +; SSE2-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 4-byte Reload +; SSE2-NEXT: # xmm2 = mem[0],zero,zero,zero +; SSE2-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE2-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 4-byte Reload +; SSE2-NEXT: # xmm2 = mem[0],zero,zero,zero +; SSE2-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE2-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 4-byte Reload +; SSE2-NEXT: # xmm2 = mem[0],zero,zero,zero +; SSE2-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 4-byte Folded Reload +; SSE2-NEXT: # xmm12 = mem[0],zero,zero,zero +; SSE2-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 4-byte Reload +; SSE2-NEXT: # xmm2 = mem[0],zero,zero,zero +; SSE2-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE2-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 4-byte Reload +; SSE2-NEXT: # xmm2 = mem[0],zero,zero,zero +; SSE2-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE2-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 4-byte Reload +; SSE2-NEXT: # xmm2 = mem[0],zero,zero,zero +; SSE2-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 4-byte Folded Reload +; SSE2-NEXT: # xmm7 = mem[0],zero,zero,zero +; SSE2-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 4-byte Reload +; SSE2-NEXT: # xmm2 = mem[0],zero,zero,zero +; SSE2-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE2-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 4-byte Reload +; SSE2-NEXT: # xmm2 = mem[0],zero,zero,zero +; SSE2-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE2-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 4-byte Reload +; SSE2-NEXT: # xmm2 = mem[0],zero,zero,zero +; SSE2-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 4-byte Folded Reload +; SSE2-NEXT: # xmm8 = mem[0],zero,zero,zero +; SSE2-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 4-byte Reload +; SSE2-NEXT: # xmm2 = mem[0],zero,zero,zero +; SSE2-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE2-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 4-byte Reload +; SSE2-NEXT: # xmm2 = mem[0],zero,zero,zero +; SSE2-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE2-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 4-byte Reload +; SSE2-NEXT: # xmm2 = mem[0],zero,zero,zero +; SSE2-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 4-byte Folded Reload +; SSE2-NEXT: # xmm2 = mem[0],zero,zero,zero +; SSE2-NEXT: jb .LBB2_120 +; SSE2-NEXT: # %bb.119: +; SSE2-NEXT: movl %ecx, %ebx +; SSE2-NEXT: .LBB2_120: +; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE2-NEXT: punpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE2-NEXT: # xmm4 = 
xmm4[0],mem[0],xmm4[1],mem[1],xmm4[2],mem[2],xmm4[3],mem[3],xmm4[4],mem[4],xmm4[5],mem[5],xmm4[6],mem[6],xmm4[7],mem[7] +; SSE2-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE2-NEXT: punpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE2-NEXT: # xmm3 = xmm3[0],mem[0],xmm3[1],mem[1],xmm3[2],mem[2],xmm3[3],mem[3],xmm3[4],mem[4],xmm3[5],mem[5],xmm3[6],mem[6],xmm3[7],mem[7] +; SSE2-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE2-NEXT: punpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE2-NEXT: # xmm3 = xmm3[0],mem[0],xmm3[1],mem[1],xmm3[2],mem[2],xmm3[3],mem[3],xmm3[4],mem[4],xmm3[5],mem[5],xmm3[6],mem[6],xmm3[7],mem[7] +; SSE2-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE2-NEXT: punpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE2-NEXT: # xmm3 = xmm3[0],mem[0],xmm3[1],mem[1],xmm3[2],mem[2],xmm3[3],mem[3],xmm3[4],mem[4],xmm3[5],mem[5],xmm3[6],mem[6],xmm3[7],mem[7] +; SSE2-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE2-NEXT: punpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE2-NEXT: # xmm3 = xmm3[0],mem[0],xmm3[1],mem[1],xmm3[2],mem[2],xmm3[3],mem[3],xmm3[4],mem[4],xmm3[5],mem[5],xmm3[6],mem[6],xmm3[7],mem[7] +; SSE2-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE2-NEXT: punpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE2-NEXT: # xmm3 = xmm3[0],mem[0],xmm3[1],mem[1],xmm3[2],mem[2],xmm3[3],mem[3],xmm3[4],mem[4],xmm3[5],mem[5],xmm3[6],mem[6],xmm3[7],mem[7] +; SSE2-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE2-NEXT: punpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE2-NEXT: # xmm3 = xmm3[0],mem[0],xmm3[1],mem[1],xmm3[2],mem[2],xmm3[3],mem[3],xmm3[4],mem[4],xmm3[5],mem[5],xmm3[6],mem[6],xmm3[7],mem[7] +; SSE2-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE2-NEXT: punpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE2-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3],xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] +; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm11[0],xmm3[1],xmm11[1],xmm3[2],xmm11[2],xmm3[3],xmm11[3],xmm3[4],xmm11[4],xmm3[5],xmm11[5],xmm3[6],xmm11[6],xmm3[7],xmm11[7] +; SSE2-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE2-NEXT: punpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload +; SSE2-NEXT: # xmm15 = xmm15[0],mem[0],xmm15[1],mem[1],xmm15[2],mem[2],xmm15[3],mem[3],xmm15[4],mem[4],xmm15[5],mem[5],xmm15[6],mem[6],xmm15[7],mem[7] +; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm9[0],xmm3[1],xmm9[1],xmm3[2],xmm9[2],xmm3[3],xmm9[3],xmm3[4],xmm9[4],xmm3[5],xmm9[5],xmm3[6],xmm9[6],xmm3[7],xmm9[7] +; SSE2-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE2-NEXT: punpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload +; SSE2-NEXT: # xmm14 = 
xmm14[0],mem[0],xmm14[1],mem[1],xmm14[2],mem[2],xmm14[3],mem[3],xmm14[4],mem[4],xmm14[5],mem[5],xmm14[6],mem[6],xmm14[7],mem[7] +; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3],xmm3[4],xmm5[4],xmm3[5],xmm5[5],xmm3[6],xmm5[6],xmm3[7],xmm5[7] +; SSE2-NEXT: movdqa %xmm3, %xmm5 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm13 = xmm13[0],xmm6[0],xmm13[1],xmm6[1],xmm13[2],xmm6[2],xmm13[3],xmm6[3],xmm13[4],xmm6[4],xmm13[5],xmm6[5],xmm13[6],xmm6[6],xmm13[7],xmm6[7] +; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE2-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm10[0],xmm6[1],xmm10[1],xmm6[2],xmm10[2],xmm6[3],xmm10[3],xmm6[4],xmm10[4],xmm6[5],xmm10[5],xmm6[6],xmm10[6],xmm6[7],xmm10[7] +; SSE2-NEXT: punpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE2-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3],xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] +; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE2-NEXT: punpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE2-NEXT: # xmm3 = xmm3[0],mem[0],xmm3[1],mem[1],xmm3[2],mem[2],xmm3[3],mem[3],xmm3[4],mem[4],xmm3[5],mem[5],xmm3[6],mem[6],xmm3[7],mem[7] +; SSE2-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE2-NEXT: punpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload +; SSE2-NEXT: # xmm12 = xmm12[0],mem[0],xmm12[1],mem[1],xmm12[2],mem[2],xmm12[3],mem[3],xmm12[4],mem[4],xmm12[5],mem[5],xmm12[6],mem[6],xmm12[7],mem[7] +; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE2-NEXT: punpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE2-NEXT: # xmm3 = xmm3[0],mem[0],xmm3[1],mem[1],xmm3[2],mem[2],xmm3[3],mem[3],xmm3[4],mem[4],xmm3[5],mem[5],xmm3[6],mem[6],xmm3[7],mem[7] +; SSE2-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE2-NEXT: punpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload +; SSE2-NEXT: # xmm7 = xmm7[0],mem[0],xmm7[1],mem[1],xmm7[2],mem[2],xmm7[3],mem[3],xmm7[4],mem[4],xmm7[5],mem[5],xmm7[6],mem[6],xmm7[7],mem[7] +; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE2-NEXT: punpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE2-NEXT: # xmm3 = xmm3[0],mem[0],xmm3[1],mem[1],xmm3[2],mem[2],xmm3[3],mem[3],xmm3[4],mem[4],xmm3[5],mem[5],xmm3[6],mem[6],xmm3[7],mem[7] +; SSE2-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE2-NEXT: punpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload +; SSE2-NEXT: # xmm8 = xmm8[0],mem[0],xmm8[1],mem[1],xmm8[2],mem[2],xmm8[3],mem[3],xmm8[4],mem[4],xmm8[5],mem[5],xmm8[6],mem[6],xmm8[7],mem[7] +; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE2-NEXT: punpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE2-NEXT: # xmm3 = xmm3[0],mem[0],xmm3[1],mem[1],xmm3[2],mem[2],xmm3[3],mem[3],xmm3[4],mem[4],xmm3[5],mem[5],xmm3[6],mem[6],xmm3[7],mem[7] +; SSE2-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE2-NEXT: punpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE2-NEXT: # xmm2 = xmm2[0],mem[0],xmm2[1],mem[1],xmm2[2],mem[2],xmm2[3],mem[3],xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7] +; SSE2-NEXT: movd %esi, %xmm3 +; SSE2-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE2-NEXT: movd %edx, %xmm10 +; SSE2-NEXT: movd %edi, 
%xmm3 +; SSE2-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE2-NEXT: movd %ebp, %xmm9 +; SSE2-NEXT: movd %eax, %xmm3 +; SSE2-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE2-NEXT: movd %r10d, %xmm11 +; SSE2-NEXT: movd %r9d, %xmm3 +; SSE2-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE2-NEXT: movd %r8d, %xmm4 +; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 1-byte Folded Reload +; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %edx # 1-byte Folded Reload +; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 1-byte Folded Reload +; SSE2-NEXT: movzbl %bl, %ecx +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %bl +; SSE2-NEXT: addb {{[0-9]+}}(%rsp), %bl +; SSE2-NEXT: movb $-1, %al +; SSE2-NEXT: jb .LBB2_122 +; SSE2-NEXT: # %bb.121: +; SSE2-NEXT: movl %ebx, %eax +; SSE2-NEXT: .LBB2_122: +; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE2-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE2-NEXT: # xmm3 = xmm3[0],mem[0],xmm3[1],mem[1],xmm3[2],mem[2],xmm3[3],mem[3] +; SSE2-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE2-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE2-NEXT: # xmm3 = xmm3[0],mem[0],xmm3[1],mem[1],xmm3[2],mem[2],xmm3[3],mem[3] +; SSE2-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE2-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE2-NEXT: # xmm3 = xmm3[0],mem[0],xmm3[1],mem[1],xmm3[2],mem[2],xmm3[3],mem[3] +; SSE2-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE2-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE2-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; SSE2-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload +; SSE2-NEXT: # xmm15 = xmm15[0],mem[0],xmm15[1],mem[1],xmm15[2],mem[2],xmm15[3],mem[3] +; SSE2-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload +; SSE2-NEXT: # xmm14 = xmm14[0],mem[0],xmm14[1],mem[1],xmm14[2],mem[2],xmm14[3],mem[3] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm13 = xmm13[0],xmm5[0],xmm13[1],xmm5[1],xmm13[2],xmm5[2],xmm13[3],xmm5[3] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1],xmm1[2],xmm6[2],xmm1[3],xmm6[3] +; SSE2-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload +; SSE2-NEXT: # xmm12 = xmm12[0],mem[0],xmm12[1],mem[1],xmm12[2],mem[2],xmm12[3],mem[3] +; SSE2-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload +; SSE2-NEXT: # xmm7 = xmm7[0],mem[0],xmm7[1],mem[1],xmm7[2],mem[2],xmm7[3],mem[3] +; SSE2-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload +; SSE2-NEXT: # xmm8 = xmm8[0],mem[0],xmm8[1],mem[1],xmm8[2],mem[2],xmm8[3],mem[3] +; SSE2-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE2-NEXT: # xmm2 = xmm2[0],mem[0],xmm2[1],mem[1],xmm2[2],mem[2],xmm2[3],mem[3] +; SSE2-NEXT: punpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload +; SSE2-NEXT: # xmm10 = xmm10[0],mem[0],xmm10[1],mem[1],xmm10[2],mem[2],xmm10[3],mem[3],xmm10[4],mem[4],xmm10[5],mem[5],xmm10[6],mem[6],xmm10[7],mem[7] +; SSE2-NEXT: punpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload +; SSE2-NEXT: # xmm9 = 
xmm9[0],mem[0],xmm9[1],mem[1],xmm9[2],mem[2],xmm9[3],mem[3],xmm9[4],mem[4],xmm9[5],mem[5],xmm9[6],mem[6],xmm9[7],mem[7] +; SSE2-NEXT: punpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload +; SSE2-NEXT: # xmm11 = xmm11[0],mem[0],xmm11[1],mem[1],xmm11[2],mem[2],xmm11[3],mem[3],xmm11[4],mem[4],xmm11[5],mem[5],xmm11[6],mem[6],xmm11[7],mem[7] +; SSE2-NEXT: punpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE2-NEXT: # xmm4 = xmm4[0],mem[0],xmm4[1],mem[1],xmm4[2],mem[2],xmm4[3],mem[3],xmm4[4],mem[4],xmm4[5],mem[5],xmm4[6],mem[6],xmm4[7],mem[7] +; SSE2-NEXT: movd %edi, %xmm3 +; SSE2-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE2-NEXT: movd %edx, %xmm3 +; SSE2-NEXT: movd %esi, %xmm5 +; SSE2-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE2-NEXT: movd %ecx, %xmm5 +; SSE2-NEXT: movzbl %al, %eax +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %dl +; SSE2-NEXT: addb {{[0-9]+}}(%rsp), %dl +; SSE2-NEXT: movb $-1, %cl +; SSE2-NEXT: jb .LBB2_124 +; SSE2-NEXT: # %bb.123: +; SSE2-NEXT: movl %edx, %ecx +; SSE2-NEXT: .LBB2_124: +; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE2-NEXT: unpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload +; SSE2-NEXT: # xmm6 = xmm6[0],mem[0],xmm6[1],mem[1] +; SSE2-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE2-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE2-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] +; SSE2-NEXT: punpckldq {{.*#+}} xmm14 = xmm14[0],xmm15[0],xmm14[1],xmm15[1] +; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm13[0],xmm1[1],xmm13[1] +; SSE2-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm12[0],xmm7[1],xmm12[1] +; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm8[0],xmm2[1],xmm8[1] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm11[0],xmm4[1],xmm11[1],xmm4[2],xmm11[2],xmm4[3],xmm11[3] +; SSE2-NEXT: punpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE2-NEXT: # xmm3 = xmm3[0],mem[0],xmm3[1],mem[1],xmm3[2],mem[2],xmm3[3],mem[3],xmm3[4],mem[4],xmm3[5],mem[5],xmm3[6],mem[6],xmm3[7],mem[7] +; SSE2-NEXT: punpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload +; SSE2-NEXT: # xmm5 = xmm5[0],mem[0],xmm5[1],mem[1],xmm5[2],mem[2],xmm5[3],mem[3],xmm5[4],mem[4],xmm5[5],mem[5],xmm5[6],mem[6],xmm5[7],mem[7] +; SSE2-NEXT: movd %eax, %xmm8 +; SSE2-NEXT: movzbl %cl, %eax +; SSE2-NEXT: movd %eax, %xmm6 +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %dl +; SSE2-NEXT: addb {{[0-9]+}}(%rsp), %dl +; SSE2-NEXT: movb $-1, %cl +; SSE2-NEXT: jb .LBB2_126 +; SSE2-NEXT: # %bb.125: +; SSE2-NEXT: movl %edx, %ecx +; SSE2-NEXT: .LBB2_126: +; SSE2-NEXT: punpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE2-NEXT: # xmm0 = xmm0[0],mem[0] +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm14[0] +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm7[0] +; SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm9[0],xmm4[1],xmm9[1] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm8[0],xmm6[1],xmm8[1],xmm6[2],xmm8[2],xmm6[3],xmm8[3],xmm6[4],xmm8[4],xmm6[5],xmm8[5],xmm6[6],xmm8[6],xmm6[7],xmm8[7] +; SSE2-NEXT: movzbl %cl, %ecx +; SSE2-NEXT: movd %ecx, %xmm7 +; SSE2-NEXT: addb (%rsp), %al +; SSE2-NEXT: movb $-1, %cl +; SSE2-NEXT: jb .LBB2_128 +; SSE2-NEXT: # %bb.127: 
+; SSE2-NEXT: movl %eax, %ecx +; SSE2-NEXT: .LBB2_128: +; SSE2-NEXT: movzbl %cl, %eax +; SSE2-NEXT: movd %eax, %xmm3 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm7[0],xmm3[1],xmm7[1],xmm3[2],xmm7[2],xmm3[3],xmm7[3],xmm3[4],xmm7[4],xmm3[5],xmm7[5],xmm3[6],xmm7[6],xmm3[7],xmm7[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm6[0],xmm3[1],xmm6[1],xmm3[2],xmm6[2],xmm3[3],xmm6[3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1] +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm4[0] +; SSE2-NEXT: addq $648, %rsp # imm = 0x288 +; SSE2-NEXT: popq %rbx +; SSE2-NEXT: popq %r12 +; SSE2-NEXT: popq %r13 +; SSE2-NEXT: popq %r14 +; SSE2-NEXT: popq %r15 +; SSE2-NEXT: popq %rbp +; SSE2-NEXT: retq +; +; SSSE3-LABEL: v64i8: +; SSSE3: # %bb.0: +; SSSE3-NEXT: pushq %rbp +; SSSE3-NEXT: pushq %r15 +; SSSE3-NEXT: pushq %r14 +; SSSE3-NEXT: pushq %r13 +; SSSE3-NEXT: pushq %r12 +; SSSE3-NEXT: pushq %rbx +; SSSE3-NEXT: subq $648, %rsp # imm = 0x288 +; SSSE3-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; SSSE3-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp) +; SSSE3-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) +; SSSE3-NEXT: movaps %xmm5, -{{[0-9]+}}(%rsp) +; SSSE3-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) +; SSSE3-NEXT: movaps %xmm6, -{{[0-9]+}}(%rsp) +; SSSE3-NEXT: movaps %xmm3, -{{[0-9]+}}(%rsp) +; SSSE3-NEXT: movaps %xmm7, (%rsp) +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSSE3-NEXT: addb -{{[0-9]+}}(%rsp), %al +; SSSE3-NEXT: movb $-1, %cl +; SSSE3-NEXT: jb .LBB2_2 +; SSSE3-NEXT: # %bb.1: +; SSSE3-NEXT: movl %eax, %ecx +; SSSE3-NEXT: .LBB2_2: +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSSE3-NEXT: addb -{{[0-9]+}}(%rsp), %al +; SSSE3-NEXT: movb $-1, %dil +; SSSE3-NEXT: jb .LBB2_4 +; SSSE3-NEXT: # %bb.3: +; SSSE3-NEXT: movl %eax, %edi +; SSSE3-NEXT: .LBB2_4: +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSSE3-NEXT: addb -{{[0-9]+}}(%rsp), %al +; SSSE3-NEXT: movb $-1, %r8b +; SSSE3-NEXT: jb .LBB2_6 +; SSSE3-NEXT: # %bb.5: +; SSSE3-NEXT: movl %eax, %r8d +; SSSE3-NEXT: .LBB2_6: +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSSE3-NEXT: addb -{{[0-9]+}}(%rsp), %al +; SSSE3-NEXT: movb $-1, %r9b +; SSSE3-NEXT: jb .LBB2_8 +; SSSE3-NEXT: # %bb.7: +; SSSE3-NEXT: movl %eax, %r9d +; SSSE3-NEXT: .LBB2_8: +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSSE3-NEXT: addb -{{[0-9]+}}(%rsp), %al +; SSSE3-NEXT: movb $-1, %r10b +; SSSE3-NEXT: jb .LBB2_10 +; SSSE3-NEXT: # %bb.9: +; SSSE3-NEXT: movl %eax, %r10d +; SSSE3-NEXT: .LBB2_10: +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSSE3-NEXT: addb -{{[0-9]+}}(%rsp), %al +; SSSE3-NEXT: movb $-1, %r11b +; SSSE3-NEXT: jb .LBB2_12 +; SSSE3-NEXT: # %bb.11: +; SSSE3-NEXT: movl %eax, %r11d +; SSSE3-NEXT: .LBB2_12: +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSSE3-NEXT: addb -{{[0-9]+}}(%rsp), %al +; SSSE3-NEXT: movb $-1, %bpl +; SSSE3-NEXT: jb .LBB2_14 +; SSSE3-NEXT: # %bb.13: +; SSSE3-NEXT: movl %eax, %ebp +; SSSE3-NEXT: .LBB2_14: +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSSE3-NEXT: addb -{{[0-9]+}}(%rsp), %al +; SSSE3-NEXT: movb $-1, %r14b +; SSSE3-NEXT: jb .LBB2_16 +; SSSE3-NEXT: # %bb.15: +; SSSE3-NEXT: movl %eax, %r14d +; SSSE3-NEXT: .LBB2_16: +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSSE3-NEXT: addb -{{[0-9]+}}(%rsp), %al +; SSSE3-NEXT: movb $-1, %dl +; SSSE3-NEXT: jb .LBB2_18 +; SSSE3-NEXT: # %bb.17: +; SSSE3-NEXT: movl %eax, %edx +; SSSE3-NEXT: .LBB2_18: +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSSE3-NEXT: addb -{{[0-9]+}}(%rsp), %al +; SSSE3-NEXT: movb $-1, %r15b +; SSSE3-NEXT: jb .LBB2_20 +; SSSE3-NEXT: # %bb.19: +; SSSE3-NEXT: movl %eax, %r15d +; 
SSSE3-NEXT: .LBB2_20: +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSSE3-NEXT: addb -{{[0-9]+}}(%rsp), %al +; SSSE3-NEXT: movb $-1, %r12b +; SSSE3-NEXT: jb .LBB2_22 +; SSSE3-NEXT: # %bb.21: +; SSSE3-NEXT: movl %eax, %r12d +; SSSE3-NEXT: .LBB2_22: +; SSSE3-NEXT: movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSSE3-NEXT: addb -{{[0-9]+}}(%rsp), %al +; SSSE3-NEXT: movb $-1, %dl +; SSSE3-NEXT: jb .LBB2_24 +; SSSE3-NEXT: # %bb.23: +; SSSE3-NEXT: movl %eax, %edx +; SSSE3-NEXT: .LBB2_24: +; SSSE3-NEXT: movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSSE3-NEXT: addb -{{[0-9]+}}(%rsp), %al +; SSSE3-NEXT: movb $-1, %dl +; SSSE3-NEXT: jb .LBB2_26 +; SSSE3-NEXT: # %bb.25: +; SSSE3-NEXT: movl %eax, %edx +; SSSE3-NEXT: .LBB2_26: +; SSSE3-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSSE3-NEXT: addb -{{[0-9]+}}(%rsp), %al +; SSSE3-NEXT: movb $-1, %sil +; SSSE3-NEXT: jb .LBB2_28 +; SSSE3-NEXT: # %bb.27: +; SSSE3-NEXT: movl %eax, %esi +; SSSE3-NEXT: .LBB2_28: +; SSSE3-NEXT: movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSSE3-NEXT: addb -{{[0-9]+}}(%rsp), %cl +; SSSE3-NEXT: movb $-1, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Folded Spill +; SSSE3-NEXT: jb .LBB2_30 +; SSSE3-NEXT: # %bb.29: +; SSSE3-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSSE3-NEXT: .LBB2_30: +; SSSE3-NEXT: addb -{{[0-9]+}}(%rsp), %al +; SSSE3-NEXT: movb $-1, %dl +; SSSE3-NEXT: jb .LBB2_32 +; SSSE3-NEXT: # %bb.31: +; SSSE3-NEXT: movl %eax, %edx +; SSSE3-NEXT: .LBB2_32: +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSSE3-NEXT: addb -{{[0-9]+}}(%rsp), %al +; SSSE3-NEXT: movb $-1, %cl +; SSSE3-NEXT: jb .LBB2_34 +; SSSE3-NEXT: # %bb.33: +; SSSE3-NEXT: movl %eax, %ecx +; SSSE3-NEXT: .LBB2_34: +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSSE3-NEXT: addb -{{[0-9]+}}(%rsp), %al +; SSSE3-NEXT: movb $-1, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Folded Spill +; SSSE3-NEXT: jb .LBB2_36 +; SSSE3-NEXT: # %bb.35: +; SSSE3-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSSE3-NEXT: .LBB2_36: +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSSE3-NEXT: addb -{{[0-9]+}}(%rsp), %al +; SSSE3-NEXT: movb $-1, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Folded Spill +; SSSE3-NEXT: jb .LBB2_38 +; SSSE3-NEXT: # %bb.37: +; SSSE3-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSSE3-NEXT: .LBB2_38: +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSSE3-NEXT: addb -{{[0-9]+}}(%rsp), %al +; SSSE3-NEXT: movb $-1, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Folded Spill +; SSSE3-NEXT: jb .LBB2_40 +; SSSE3-NEXT: # %bb.39: +; SSSE3-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSSE3-NEXT: .LBB2_40: +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSSE3-NEXT: addb -{{[0-9]+}}(%rsp), %al +; SSSE3-NEXT: movb $-1, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Folded Spill +; SSSE3-NEXT: jb .LBB2_42 +; SSSE3-NEXT: # %bb.41: +; SSSE3-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSSE3-NEXT: .LBB2_42: +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSSE3-NEXT: addb -{{[0-9]+}}(%rsp), %al +; SSSE3-NEXT: movb $-1, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Folded Spill +; SSSE3-NEXT: jb .LBB2_44 +; SSSE3-NEXT: # %bb.43: +; SSSE3-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSSE3-NEXT: .LBB2_44: +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSSE3-NEXT: addb -{{[0-9]+}}(%rsp), %al +; SSSE3-NEXT: movb $-1, 
{{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Folded Spill +; SSSE3-NEXT: jb .LBB2_46 +; SSSE3-NEXT: # %bb.45: +; SSSE3-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSSE3-NEXT: .LBB2_46: +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSSE3-NEXT: addb -{{[0-9]+}}(%rsp), %al +; SSSE3-NEXT: movb $-1, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Folded Spill +; SSSE3-NEXT: jb .LBB2_48 +; SSSE3-NEXT: # %bb.47: +; SSSE3-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSSE3-NEXT: .LBB2_48: +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSSE3-NEXT: addb -{{[0-9]+}}(%rsp), %al +; SSSE3-NEXT: movb $-1, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Folded Spill +; SSSE3-NEXT: jb .LBB2_50 +; SSSE3-NEXT: # %bb.49: +; SSSE3-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSSE3-NEXT: .LBB2_50: +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSSE3-NEXT: addb -{{[0-9]+}}(%rsp), %al +; SSSE3-NEXT: movb $-1, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Folded Spill +; SSSE3-NEXT: jb .LBB2_52 +; SSSE3-NEXT: # %bb.51: +; SSSE3-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSSE3-NEXT: .LBB2_52: +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSSE3-NEXT: addb -{{[0-9]+}}(%rsp), %al +; SSSE3-NEXT: movb $-1, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Folded Spill +; SSSE3-NEXT: jb .LBB2_54 +; SSSE3-NEXT: # %bb.53: +; SSSE3-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSSE3-NEXT: .LBB2_54: +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSSE3-NEXT: addb -{{[0-9]+}}(%rsp), %al +; SSSE3-NEXT: movb $-1, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Folded Spill +; SSSE3-NEXT: jb .LBB2_56 +; SSSE3-NEXT: # %bb.55: +; SSSE3-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSSE3-NEXT: .LBB2_56: +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSSE3-NEXT: addb -{{[0-9]+}}(%rsp), %al +; SSSE3-NEXT: movb $-1, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Folded Spill +; SSSE3-NEXT: jb .LBB2_58 +; SSSE3-NEXT: # %bb.57: +; SSSE3-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSSE3-NEXT: .LBB2_58: +; SSSE3-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSSE3-NEXT: movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSSE3-NEXT: addb -{{[0-9]+}}(%rsp), %al +; SSSE3-NEXT: movb $-1, %cl +; SSSE3-NEXT: jb .LBB2_60 +; SSSE3-NEXT: # %bb.59: +; SSSE3-NEXT: movl %eax, %ecx +; SSSE3-NEXT: .LBB2_60: +; SSSE3-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSSE3-NEXT: addb -{{[0-9]+}}(%rsp), %cl +; SSSE3-NEXT: movb $-1, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Folded Spill +; SSSE3-NEXT: jb .LBB2_62 +; SSSE3-NEXT: # %bb.61: +; SSSE3-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSSE3-NEXT: .LBB2_62: +; SSSE3-NEXT: addb -{{[0-9]+}}(%rsp), %al +; SSSE3-NEXT: movb $-1, %dl +; SSSE3-NEXT: jb .LBB2_64 +; SSSE3-NEXT: # %bb.63: +; SSSE3-NEXT: movl %eax, %edx +; SSSE3-NEXT: .LBB2_64: +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSSE3-NEXT: addb -{{[0-9]+}}(%rsp), %al +; SSSE3-NEXT: movb $-1, %cl +; SSSE3-NEXT: jb .LBB2_66 +; SSSE3-NEXT: # %bb.65: +; SSSE3-NEXT: movl %eax, %ecx +; SSSE3-NEXT: .LBB2_66: +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSSE3-NEXT: addb -{{[0-9]+}}(%rsp), %al +; SSSE3-NEXT: movb $-1, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Folded Spill +; SSSE3-NEXT: jb .LBB2_68 +; SSSE3-NEXT: # %bb.67: +; SSSE3-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSSE3-NEXT: .LBB2_68: +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSSE3-NEXT: addb -{{[0-9]+}}(%rsp), %al +; SSSE3-NEXT: 
movb $-1, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Folded Spill +; SSSE3-NEXT: jb .LBB2_70 +; SSSE3-NEXT: # %bb.69: +; SSSE3-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSSE3-NEXT: .LBB2_70: +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSSE3-NEXT: addb -{{[0-9]+}}(%rsp), %al +; SSSE3-NEXT: movb $-1, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Folded Spill +; SSSE3-NEXT: jb .LBB2_72 +; SSSE3-NEXT: # %bb.71: +; SSSE3-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSSE3-NEXT: .LBB2_72: +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSSE3-NEXT: addb -{{[0-9]+}}(%rsp), %al +; SSSE3-NEXT: movb $-1, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Folded Spill +; SSSE3-NEXT: jb .LBB2_74 +; SSSE3-NEXT: # %bb.73: +; SSSE3-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSSE3-NEXT: .LBB2_74: +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSSE3-NEXT: addb -{{[0-9]+}}(%rsp), %al +; SSSE3-NEXT: movb $-1, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Folded Spill +; SSSE3-NEXT: jb .LBB2_76 +; SSSE3-NEXT: # %bb.75: +; SSSE3-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSSE3-NEXT: .LBB2_76: +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSSE3-NEXT: addb -{{[0-9]+}}(%rsp), %al +; SSSE3-NEXT: movb $-1, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Folded Spill +; SSSE3-NEXT: jb .LBB2_78 +; SSSE3-NEXT: # %bb.77: +; SSSE3-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSSE3-NEXT: .LBB2_78: +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSSE3-NEXT: addb -{{[0-9]+}}(%rsp), %al +; SSSE3-NEXT: movb $-1, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Folded Spill +; SSSE3-NEXT: jb .LBB2_80 +; SSSE3-NEXT: # %bb.79: +; SSSE3-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSSE3-NEXT: .LBB2_80: +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSSE3-NEXT: addb -{{[0-9]+}}(%rsp), %al +; SSSE3-NEXT: movb $-1, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Folded Spill +; SSSE3-NEXT: jb .LBB2_82 +; SSSE3-NEXT: # %bb.81: +; SSSE3-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSSE3-NEXT: .LBB2_82: +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSSE3-NEXT: addb -{{[0-9]+}}(%rsp), %al +; SSSE3-NEXT: movb $-1, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Folded Spill +; SSSE3-NEXT: jb .LBB2_84 +; SSSE3-NEXT: # %bb.83: +; SSSE3-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSSE3-NEXT: .LBB2_84: +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSSE3-NEXT: addb -{{[0-9]+}}(%rsp), %al +; SSSE3-NEXT: movb $-1, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Folded Spill +; SSSE3-NEXT: jb .LBB2_86 +; SSSE3-NEXT: # %bb.85: +; SSSE3-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSSE3-NEXT: .LBB2_86: +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSSE3-NEXT: addb -{{[0-9]+}}(%rsp), %al +; SSSE3-NEXT: movb $-1, %r13b +; SSSE3-NEXT: jb .LBB2_88 +; SSSE3-NEXT: # %bb.87: +; SSSE3-NEXT: movl %eax, %r13d +; SSSE3-NEXT: .LBB2_88: +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSSE3-NEXT: addb -{{[0-9]+}}(%rsp), %al +; SSSE3-NEXT: movb $-1, %bl +; SSSE3-NEXT: jb .LBB2_90 +; SSSE3-NEXT: # %bb.89: +; SSSE3-NEXT: movl %eax, %ebx +; SSSE3-NEXT: .LBB2_90: +; SSSE3-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSSE3-NEXT: addb -{{[0-9]+}}(%rsp), %al +; SSSE3-NEXT: movb $-1, %cl +; SSSE3-NEXT: jb .LBB2_92 +; SSSE3-NEXT: # %bb.91: +; SSSE3-NEXT: movl %eax, %ecx +; SSSE3-NEXT: .LBB2_92: +; SSSE3-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSSE3-NEXT: addb -{{[0-9]+}}(%rsp), %cl +; SSSE3-NEXT: movb $-1, 
{{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Folded Spill +; SSSE3-NEXT: jb .LBB2_94 +; SSSE3-NEXT: # %bb.93: +; SSSE3-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSSE3-NEXT: .LBB2_94: +; SSSE3-NEXT: movb %bl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSSE3-NEXT: addb -{{[0-9]+}}(%rsp), %al +; SSSE3-NEXT: movb $-1, %cl +; SSSE3-NEXT: jb .LBB2_96 +; SSSE3-NEXT: # %bb.95: +; SSSE3-NEXT: movl %eax, %ecx +; SSSE3-NEXT: .LBB2_96: +; SSSE3-NEXT: movb %r13b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSSE3-NEXT: addb {{[0-9]+}}(%rsp), %al +; SSSE3-NEXT: movb $-1, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Folded Spill +; SSSE3-NEXT: movl %esi, %r13d +; SSSE3-NEXT: jb .LBB2_98 +; SSSE3-NEXT: # %bb.97: +; SSSE3-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSSE3-NEXT: .LBB2_98: +; SSSE3-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSSE3-NEXT: addb {{[0-9]+}}(%rsp), %al +; SSSE3-NEXT: movb $-1, %sil +; SSSE3-NEXT: jb .LBB2_100 +; SSSE3-NEXT: # %bb.99: +; SSSE3-NEXT: movl %eax, %esi +; SSSE3-NEXT: .LBB2_100: +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSSE3-NEXT: addb {{[0-9]+}}(%rsp), %al +; SSSE3-NEXT: movb $-1, %cl +; SSSE3-NEXT: jb .LBB2_102 +; SSSE3-NEXT: # %bb.101: +; SSSE3-NEXT: movl %eax, %ecx +; SSSE3-NEXT: .LBB2_102: +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSSE3-NEXT: addb {{[0-9]+}}(%rsp), %al +; SSSE3-NEXT: movb $-1, %bl +; SSSE3-NEXT: jb .LBB2_104 +; SSSE3-NEXT: # %bb.103: +; SSSE3-NEXT: movl %eax, %ebx +; SSSE3-NEXT: .LBB2_104: +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSSE3-NEXT: addb {{[0-9]+}}(%rsp), %al +; SSSE3-NEXT: movb $-1, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Folded Spill +; SSSE3-NEXT: jb .LBB2_106 +; SSSE3-NEXT: # %bb.105: +; SSSE3-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSSE3-NEXT: .LBB2_106: +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSSE3-NEXT: addb {{[0-9]+}}(%rsp), %al +; SSSE3-NEXT: movb $-1, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Folded Spill +; SSSE3-NEXT: jb .LBB2_108 +; SSSE3-NEXT: # %bb.107: +; SSSE3-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSSE3-NEXT: .LBB2_108: +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSSE3-NEXT: addb {{[0-9]+}}(%rsp), %al +; SSSE3-NEXT: movb $-1, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Folded Spill +; SSSE3-NEXT: jb .LBB2_110 +; SSSE3-NEXT: # %bb.109: +; SSSE3-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSSE3-NEXT: .LBB2_110: +; SSSE3-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSSE3-NEXT: addb {{[0-9]+}}(%rsp), %al +; SSSE3-NEXT: movb $-1, %cl +; SSSE3-NEXT: jb .LBB2_112 +; SSSE3-NEXT: # %bb.111: +; SSSE3-NEXT: movl %eax, %ecx +; SSSE3-NEXT: .LBB2_112: +; SSSE3-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSSE3-NEXT: addb {{[0-9]+}}(%rsp), %al +; SSSE3-NEXT: movb $-1, %cl +; SSSE3-NEXT: jb .LBB2_114 +; SSSE3-NEXT: # %bb.113: +; SSSE3-NEXT: movl %eax, %ecx +; SSSE3-NEXT: .LBB2_114: +; SSSE3-NEXT: movb %bl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSSE3-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSSE3-NEXT: movb %sil, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSSE3-NEXT: addb {{[0-9]+}}(%rsp), %al +; SSSE3-NEXT: movb $-1, %cl +; SSSE3-NEXT: jb .LBB2_116 +; SSSE3-NEXT: # %bb.115: +; SSSE3-NEXT: movl %eax, %ecx +; SSSE3-NEXT: .LBB2_116: +; SSSE3-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; 
SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; SSSE3-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSSE3-NEXT: movzbl %dil, %eax +; SSSE3-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSSE3-NEXT: movzbl %r8b, %eax +; SSSE3-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSSE3-NEXT: movzbl %r9b, %edi +; SSSE3-NEXT: movzbl %r10b, %r8d +; SSSE3-NEXT: movzbl %r11b, %r9d +; SSSE3-NEXT: movzbl %bpl, %r10d +; SSSE3-NEXT: movzbl %r14b, %r11d +; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ebx # 1-byte Folded Reload +; SSSE3-NEXT: movzbl %r15b, %ebp +; SSSE3-NEXT: movzbl %r12b, %r14d +; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r15d # 1-byte Folded Reload +; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r12d # 1-byte Folded Reload +; SSSE3-NEXT: movzbl %r13b, %r13d +; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload +; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 1-byte Folded Reload +; SSSE3-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 1-byte Folded Reload +; SSSE3-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 1-byte Folded Reload +; SSSE3-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 1-byte Folded Reload +; SSSE3-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 1-byte Folded Reload +; SSSE3-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 1-byte Folded Reload +; SSSE3-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 1-byte Folded Reload +; SSSE3-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 1-byte Folded Reload +; SSSE3-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 1-byte Folded Reload +; SSSE3-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 1-byte Folded Reload +; SSSE3-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 1-byte Folded Reload +; SSSE3-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 1-byte Folded Reload +; SSSE3-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 1-byte Folded Reload +; SSSE3-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 1-byte Folded Reload +; SSSE3-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 1-byte Folded Reload +; SSSE3-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSSE3-NEXT: movzbl %dl, %edx +; SSSE3-NEXT: movl %edx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %dl +; SSSE3-NEXT: addb {{[0-9]+}}(%rsp), %dl +; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 1-byte Folded Reload +; SSSE3-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 1-byte Folded Reload +; SSSE3-NEXT: movl %esi, 
{{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 1-byte Folded Reload +; SSSE3-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 1-byte Folded Reload +; SSSE3-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 1-byte Folded Reload +; SSSE3-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 1-byte Folded Reload +; SSSE3-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 1-byte Folded Reload +; SSSE3-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 1-byte Folded Reload +; SSSE3-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 1-byte Folded Reload +; SSSE3-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 1-byte Folded Reload +; SSSE3-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 1-byte Folded Reload +; SSSE3-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 1-byte Folded Reload +; SSSE3-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 1-byte Folded Reload +; SSSE3-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 1-byte Folded Reload +; SSSE3-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 1-byte Folded Reload +; SSSE3-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 1-byte Folded Reload +; SSSE3-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSSE3-NEXT: movb $-1, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Folded Spill +; SSSE3-NEXT: jb .LBB2_118 +; SSSE3-NEXT: # %bb.117: +; SSSE3-NEXT: movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSSE3-NEXT: .LBB2_118: +; SSSE3-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload +; SSSE3-NEXT: # xmm0 = mem[0],zero,zero,zero +; SSSE3-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSSE3-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload +; SSSE3-NEXT: # xmm0 = mem[0],zero,zero,zero +; SSSE3-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSSE3-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload +; SSSE3-NEXT: # xmm0 = mem[0],zero,zero,zero +; SSSE3-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSSE3-NEXT: movd %edi, %xmm3 +; SSSE3-NEXT: movd %r8d, %xmm0 +; SSSE3-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSSE3-NEXT: movd %r9d, %xmm0 +; SSSE3-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSSE3-NEXT: movd %r10d, %xmm0 +; SSSE3-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSSE3-NEXT: movd %r11d, %xmm0 +; SSSE3-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSSE3-NEXT: movd %ebx, %xmm0 +; SSSE3-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSSE3-NEXT: movd %ebp, %xmm0 +; SSSE3-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSSE3-NEXT: movd %r14d, %xmm0 +; SSSE3-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; 
SSSE3-NEXT: movd %r15d, %xmm0 +; SSSE3-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSSE3-NEXT: movd %r12d, %xmm0 +; SSSE3-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSSE3-NEXT: movd %r13d, %xmm0 +; SSSE3-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSSE3-NEXT: movd %eax, %xmm0 +; SSSE3-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSSE3-NEXT: movd %ecx, %xmm0 +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSSE3-NEXT: addb {{[0-9]+}}(%rsp), %cl +; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 1-byte Folded Reload +; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %edx # 1-byte Folded Reload +; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 1-byte Folded Reload +; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ebp # 1-byte Folded Reload +; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r10d # 1-byte Folded Reload +; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r9d # 1-byte Folded Reload +; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r8d # 1-byte Folded Reload +; SSSE3-NEXT: movb $-1, %bl +; SSSE3-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 4-byte Folded Reload +; SSSE3-NEXT: # xmm11 = mem[0],zero,zero,zero +; SSSE3-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Reload +; SSSE3-NEXT: # xmm1 = mem[0],zero,zero,zero +; SSSE3-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSSE3-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Reload +; SSSE3-NEXT: # xmm1 = mem[0],zero,zero,zero +; SSSE3-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSSE3-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 4-byte Folded Reload +; SSSE3-NEXT: # xmm15 = mem[0],zero,zero,zero +; SSSE3-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 4-byte Folded Reload +; SSSE3-NEXT: # xmm9 = mem[0],zero,zero,zero +; SSSE3-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Reload +; SSSE3-NEXT: # xmm1 = mem[0],zero,zero,zero +; SSSE3-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSSE3-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Reload +; SSSE3-NEXT: # xmm1 = mem[0],zero,zero,zero +; SSSE3-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSSE3-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 4-byte Folded Reload +; SSSE3-NEXT: # xmm14 = mem[0],zero,zero,zero +; SSSE3-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 4-byte Folded Reload +; SSSE3-NEXT: # xmm5 = mem[0],zero,zero,zero +; SSSE3-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Reload +; SSSE3-NEXT: # xmm1 = mem[0],zero,zero,zero +; SSSE3-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSSE3-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 4-byte Folded Reload +; SSSE3-NEXT: # xmm6 = mem[0],zero,zero,zero +; SSSE3-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 4-byte Folded Reload +; SSSE3-NEXT: # xmm13 = mem[0],zero,zero,zero +; SSSE3-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 4-byte Folded Reload +; SSSE3-NEXT: # xmm10 = mem[0],zero,zero,zero +; SSSE3-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Reload +; SSSE3-NEXT: # xmm1 = mem[0],zero,zero,zero +; SSSE3-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSSE3-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Reload +; SSSE3-NEXT: # xmm1 = mem[0],zero,zero,zero +; SSSE3-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSSE3-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Folded Reload +; SSSE3-NEXT: # xmm1 = 
mem[0],zero,zero,zero +; SSSE3-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 4-byte Reload +; SSSE3-NEXT: # xmm2 = mem[0],zero,zero,zero +; SSSE3-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSSE3-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 4-byte Reload +; SSSE3-NEXT: # xmm2 = mem[0],zero,zero,zero +; SSSE3-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSSE3-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 4-byte Reload +; SSSE3-NEXT: # xmm2 = mem[0],zero,zero,zero +; SSSE3-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSSE3-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 4-byte Folded Reload +; SSSE3-NEXT: # xmm12 = mem[0],zero,zero,zero +; SSSE3-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 4-byte Reload +; SSSE3-NEXT: # xmm2 = mem[0],zero,zero,zero +; SSSE3-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSSE3-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 4-byte Reload +; SSSE3-NEXT: # xmm2 = mem[0],zero,zero,zero +; SSSE3-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSSE3-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 4-byte Reload +; SSSE3-NEXT: # xmm2 = mem[0],zero,zero,zero +; SSSE3-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSSE3-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 4-byte Folded Reload +; SSSE3-NEXT: # xmm7 = mem[0],zero,zero,zero +; SSSE3-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 4-byte Reload +; SSSE3-NEXT: # xmm2 = mem[0],zero,zero,zero +; SSSE3-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSSE3-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 4-byte Reload +; SSSE3-NEXT: # xmm2 = mem[0],zero,zero,zero +; SSSE3-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSSE3-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 4-byte Reload +; SSSE3-NEXT: # xmm2 = mem[0],zero,zero,zero +; SSSE3-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSSE3-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 4-byte Folded Reload +; SSSE3-NEXT: # xmm8 = mem[0],zero,zero,zero +; SSSE3-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 4-byte Reload +; SSSE3-NEXT: # xmm2 = mem[0],zero,zero,zero +; SSSE3-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSSE3-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 4-byte Reload +; SSSE3-NEXT: # xmm2 = mem[0],zero,zero,zero +; SSSE3-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSSE3-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 4-byte Reload +; SSSE3-NEXT: # xmm2 = mem[0],zero,zero,zero +; SSSE3-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSSE3-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 4-byte Folded Reload +; SSSE3-NEXT: # xmm2 = mem[0],zero,zero,zero +; SSSE3-NEXT: jb .LBB2_120 +; SSSE3-NEXT: # %bb.119: +; SSSE3-NEXT: movl %ecx, %ebx +; SSSE3-NEXT: .LBB2_120: +; SSSE3-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSSE3-NEXT: punpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSSE3-NEXT: # xmm4 = xmm4[0],mem[0],xmm4[1],mem[1],xmm4[2],mem[2],xmm4[3],mem[3],xmm4[4],mem[4],xmm4[5],mem[5],xmm4[6],mem[6],xmm4[7],mem[7] +; SSSE3-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSSE3-NEXT: punpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSSE3-NEXT: # xmm3 = xmm3[0],mem[0],xmm3[1],mem[1],xmm3[2],mem[2],xmm3[3],mem[3],xmm3[4],mem[4],xmm3[5],mem[5],xmm3[6],mem[6],xmm3[7],mem[7] +; SSSE3-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSSE3-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 
# 16-byte Reload +; SSSE3-NEXT: punpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSSE3-NEXT: # xmm3 = xmm3[0],mem[0],xmm3[1],mem[1],xmm3[2],mem[2],xmm3[3],mem[3],xmm3[4],mem[4],xmm3[5],mem[5],xmm3[6],mem[6],xmm3[7],mem[7] +; SSSE3-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSSE3-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSSE3-NEXT: punpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSSE3-NEXT: # xmm3 = xmm3[0],mem[0],xmm3[1],mem[1],xmm3[2],mem[2],xmm3[3],mem[3],xmm3[4],mem[4],xmm3[5],mem[5],xmm3[6],mem[6],xmm3[7],mem[7] +; SSSE3-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSSE3-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSSE3-NEXT: punpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSSE3-NEXT: # xmm3 = xmm3[0],mem[0],xmm3[1],mem[1],xmm3[2],mem[2],xmm3[3],mem[3],xmm3[4],mem[4],xmm3[5],mem[5],xmm3[6],mem[6],xmm3[7],mem[7] +; SSSE3-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSSE3-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSSE3-NEXT: punpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSSE3-NEXT: # xmm3 = xmm3[0],mem[0],xmm3[1],mem[1],xmm3[2],mem[2],xmm3[3],mem[3],xmm3[4],mem[4],xmm3[5],mem[5],xmm3[6],mem[6],xmm3[7],mem[7] +; SSSE3-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSSE3-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSSE3-NEXT: punpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSSE3-NEXT: # xmm3 = xmm3[0],mem[0],xmm3[1],mem[1],xmm3[2],mem[2],xmm3[3],mem[3],xmm3[4],mem[4],xmm3[5],mem[5],xmm3[6],mem[6],xmm3[7],mem[7] +; SSSE3-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSSE3-NEXT: punpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSSE3-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3],xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] +; SSSE3-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm11[0],xmm3[1],xmm11[1],xmm3[2],xmm11[2],xmm3[3],xmm11[3],xmm3[4],xmm11[4],xmm3[5],xmm11[5],xmm3[6],xmm11[6],xmm3[7],xmm11[7] +; SSSE3-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSSE3-NEXT: punpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload +; SSSE3-NEXT: # xmm15 = xmm15[0],mem[0],xmm15[1],mem[1],xmm15[2],mem[2],xmm15[3],mem[3],xmm15[4],mem[4],xmm15[5],mem[5],xmm15[6],mem[6],xmm15[7],mem[7] +; SSSE3-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm9[0],xmm3[1],xmm9[1],xmm3[2],xmm9[2],xmm3[3],xmm9[3],xmm3[4],xmm9[4],xmm3[5],xmm9[5],xmm3[6],xmm9[6],xmm3[7],xmm9[7] +; SSSE3-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSSE3-NEXT: punpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload +; SSSE3-NEXT: # xmm14 = xmm14[0],mem[0],xmm14[1],mem[1],xmm14[2],mem[2],xmm14[3],mem[3],xmm14[4],mem[4],xmm14[5],mem[5],xmm14[6],mem[6],xmm14[7],mem[7] +; SSSE3-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3],xmm3[4],xmm5[4],xmm3[5],xmm5[5],xmm3[6],xmm5[6],xmm3[7],xmm5[7] +; SSSE3-NEXT: movdqa %xmm3, %xmm5 +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm13 = xmm13[0],xmm6[0],xmm13[1],xmm6[1],xmm13[2],xmm6[2],xmm13[3],xmm6[3],xmm13[4],xmm6[4],xmm13[5],xmm6[5],xmm13[6],xmm6[6],xmm13[7],xmm6[7] +; 
SSSE3-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm10[0],xmm6[1],xmm10[1],xmm6[2],xmm10[2],xmm6[3],xmm10[3],xmm6[4],xmm10[4],xmm6[5],xmm10[5],xmm6[6],xmm10[6],xmm6[7],xmm10[7] +; SSSE3-NEXT: punpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSSE3-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3],xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] +; SSSE3-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSSE3-NEXT: punpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSSE3-NEXT: # xmm3 = xmm3[0],mem[0],xmm3[1],mem[1],xmm3[2],mem[2],xmm3[3],mem[3],xmm3[4],mem[4],xmm3[5],mem[5],xmm3[6],mem[6],xmm3[7],mem[7] +; SSSE3-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSSE3-NEXT: punpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload +; SSSE3-NEXT: # xmm12 = xmm12[0],mem[0],xmm12[1],mem[1],xmm12[2],mem[2],xmm12[3],mem[3],xmm12[4],mem[4],xmm12[5],mem[5],xmm12[6],mem[6],xmm12[7],mem[7] +; SSSE3-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSSE3-NEXT: punpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSSE3-NEXT: # xmm3 = xmm3[0],mem[0],xmm3[1],mem[1],xmm3[2],mem[2],xmm3[3],mem[3],xmm3[4],mem[4],xmm3[5],mem[5],xmm3[6],mem[6],xmm3[7],mem[7] +; SSSE3-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSSE3-NEXT: punpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload +; SSSE3-NEXT: # xmm7 = xmm7[0],mem[0],xmm7[1],mem[1],xmm7[2],mem[2],xmm7[3],mem[3],xmm7[4],mem[4],xmm7[5],mem[5],xmm7[6],mem[6],xmm7[7],mem[7] +; SSSE3-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSSE3-NEXT: punpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSSE3-NEXT: # xmm3 = xmm3[0],mem[0],xmm3[1],mem[1],xmm3[2],mem[2],xmm3[3],mem[3],xmm3[4],mem[4],xmm3[5],mem[5],xmm3[6],mem[6],xmm3[7],mem[7] +; SSSE3-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSSE3-NEXT: punpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload +; SSSE3-NEXT: # xmm8 = xmm8[0],mem[0],xmm8[1],mem[1],xmm8[2],mem[2],xmm8[3],mem[3],xmm8[4],mem[4],xmm8[5],mem[5],xmm8[6],mem[6],xmm8[7],mem[7] +; SSSE3-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSSE3-NEXT: punpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSSE3-NEXT: # xmm3 = xmm3[0],mem[0],xmm3[1],mem[1],xmm3[2],mem[2],xmm3[3],mem[3],xmm3[4],mem[4],xmm3[5],mem[5],xmm3[6],mem[6],xmm3[7],mem[7] +; SSSE3-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSSE3-NEXT: punpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSSE3-NEXT: # xmm2 = xmm2[0],mem[0],xmm2[1],mem[1],xmm2[2],mem[2],xmm2[3],mem[3],xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7] +; SSSE3-NEXT: movd %esi, %xmm3 +; SSSE3-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSSE3-NEXT: movd %edx, %xmm10 +; SSSE3-NEXT: movd %edi, %xmm3 +; SSSE3-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSSE3-NEXT: movd %ebp, %xmm9 +; SSSE3-NEXT: movd %eax, %xmm3 +; SSSE3-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSSE3-NEXT: movd %r10d, %xmm11 +; SSSE3-NEXT: movd %r9d, %xmm3 +; SSSE3-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSSE3-NEXT: movd %r8d, %xmm4 +; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 1-byte Folded Reload +; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %edx # 1-byte Folded Reload +; SSSE3-NEXT: 
movzbl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 1-byte Folded Reload +; SSSE3-NEXT: movzbl %bl, %ecx +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %bl +; SSSE3-NEXT: addb {{[0-9]+}}(%rsp), %bl +; SSSE3-NEXT: movb $-1, %al +; SSSE3-NEXT: jb .LBB2_122 +; SSSE3-NEXT: # %bb.121: +; SSSE3-NEXT: movl %ebx, %eax +; SSSE3-NEXT: .LBB2_122: +; SSSE3-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSSE3-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSSE3-NEXT: # xmm3 = xmm3[0],mem[0],xmm3[1],mem[1],xmm3[2],mem[2],xmm3[3],mem[3] +; SSSE3-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSSE3-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSSE3-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSSE3-NEXT: # xmm3 = xmm3[0],mem[0],xmm3[1],mem[1],xmm3[2],mem[2],xmm3[3],mem[3] +; SSSE3-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSSE3-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSSE3-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSSE3-NEXT: # xmm3 = xmm3[0],mem[0],xmm3[1],mem[1],xmm3[2],mem[2],xmm3[3],mem[3] +; SSSE3-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSSE3-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSSE3-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; SSSE3-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload +; SSSE3-NEXT: # xmm15 = xmm15[0],mem[0],xmm15[1],mem[1],xmm15[2],mem[2],xmm15[3],mem[3] +; SSSE3-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload +; SSSE3-NEXT: # xmm14 = xmm14[0],mem[0],xmm14[1],mem[1],xmm14[2],mem[2],xmm14[3],mem[3] +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm13 = xmm13[0],xmm5[0],xmm13[1],xmm5[1],xmm13[2],xmm5[2],xmm13[3],xmm5[3] +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1],xmm1[2],xmm6[2],xmm1[3],xmm6[3] +; SSSE3-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload +; SSSE3-NEXT: # xmm12 = xmm12[0],mem[0],xmm12[1],mem[1],xmm12[2],mem[2],xmm12[3],mem[3] +; SSSE3-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload +; SSSE3-NEXT: # xmm7 = xmm7[0],mem[0],xmm7[1],mem[1],xmm7[2],mem[2],xmm7[3],mem[3] +; SSSE3-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload +; SSSE3-NEXT: # xmm8 = xmm8[0],mem[0],xmm8[1],mem[1],xmm8[2],mem[2],xmm8[3],mem[3] +; SSSE3-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSSE3-NEXT: # xmm2 = xmm2[0],mem[0],xmm2[1],mem[1],xmm2[2],mem[2],xmm2[3],mem[3] +; SSSE3-NEXT: punpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload +; SSSE3-NEXT: # xmm10 = xmm10[0],mem[0],xmm10[1],mem[1],xmm10[2],mem[2],xmm10[3],mem[3],xmm10[4],mem[4],xmm10[5],mem[5],xmm10[6],mem[6],xmm10[7],mem[7] +; SSSE3-NEXT: punpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload +; SSSE3-NEXT: # xmm9 = xmm9[0],mem[0],xmm9[1],mem[1],xmm9[2],mem[2],xmm9[3],mem[3],xmm9[4],mem[4],xmm9[5],mem[5],xmm9[6],mem[6],xmm9[7],mem[7] +; SSSE3-NEXT: punpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload +; SSSE3-NEXT: # xmm11 = xmm11[0],mem[0],xmm11[1],mem[1],xmm11[2],mem[2],xmm11[3],mem[3],xmm11[4],mem[4],xmm11[5],mem[5],xmm11[6],mem[6],xmm11[7],mem[7] +; SSSE3-NEXT: punpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSSE3-NEXT: # xmm4 = xmm4[0],mem[0],xmm4[1],mem[1],xmm4[2],mem[2],xmm4[3],mem[3],xmm4[4],mem[4],xmm4[5],mem[5],xmm4[6],mem[6],xmm4[7],mem[7] +; 
SSSE3-NEXT: movd %edi, %xmm3 +; SSSE3-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSSE3-NEXT: movd %edx, %xmm3 +; SSSE3-NEXT: movd %esi, %xmm5 +; SSSE3-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSSE3-NEXT: movd %ecx, %xmm5 +; SSSE3-NEXT: movzbl %al, %eax +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %dl +; SSSE3-NEXT: addb {{[0-9]+}}(%rsp), %dl +; SSSE3-NEXT: movb $-1, %cl +; SSSE3-NEXT: jb .LBB2_124 +; SSSE3-NEXT: # %bb.123: +; SSSE3-NEXT: movl %edx, %ecx +; SSSE3-NEXT: .LBB2_124: +; SSSE3-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSSE3-NEXT: unpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload +; SSSE3-NEXT: # xmm6 = xmm6[0],mem[0],xmm6[1],mem[1] +; SSSE3-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSSE3-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSSE3-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] +; SSSE3-NEXT: punpckldq {{.*#+}} xmm14 = xmm14[0],xmm15[0],xmm14[1],xmm15[1] +; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm13[0],xmm1[1],xmm13[1] +; SSSE3-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm12[0],xmm7[1],xmm12[1] +; SSSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm8[0],xmm2[1],xmm8[1] +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3] +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm11[0],xmm4[1],xmm11[1],xmm4[2],xmm11[2],xmm4[3],xmm11[3] +; SSSE3-NEXT: punpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSSE3-NEXT: # xmm3 = xmm3[0],mem[0],xmm3[1],mem[1],xmm3[2],mem[2],xmm3[3],mem[3],xmm3[4],mem[4],xmm3[5],mem[5],xmm3[6],mem[6],xmm3[7],mem[7] +; SSSE3-NEXT: punpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload +; SSSE3-NEXT: # xmm5 = xmm5[0],mem[0],xmm5[1],mem[1],xmm5[2],mem[2],xmm5[3],mem[3],xmm5[4],mem[4],xmm5[5],mem[5],xmm5[6],mem[6],xmm5[7],mem[7] +; SSSE3-NEXT: movd %eax, %xmm8 +; SSSE3-NEXT: movzbl %cl, %eax +; SSSE3-NEXT: movd %eax, %xmm6 +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %dl +; SSSE3-NEXT: addb {{[0-9]+}}(%rsp), %dl +; SSSE3-NEXT: movb $-1, %cl +; SSSE3-NEXT: jb .LBB2_126 +; SSSE3-NEXT: # %bb.125: +; SSSE3-NEXT: movl %edx, %ecx +; SSSE3-NEXT: .LBB2_126: +; SSSE3-NEXT: punpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSSE3-NEXT: # xmm0 = xmm0[0],mem[0] +; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm14[0] +; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm7[0] +; SSSE3-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm9[0],xmm4[1],xmm9[1] +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3] +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm8[0],xmm6[1],xmm8[1],xmm6[2],xmm8[2],xmm6[3],xmm8[3],xmm6[4],xmm8[4],xmm6[5],xmm8[5],xmm6[6],xmm8[6],xmm6[7],xmm8[7] +; SSSE3-NEXT: movzbl %cl, %ecx +; SSSE3-NEXT: movd %ecx, %xmm7 +; SSSE3-NEXT: addb (%rsp), %al +; SSSE3-NEXT: movb $-1, %cl +; SSSE3-NEXT: jb .LBB2_128 +; SSSE3-NEXT: # %bb.127: +; SSSE3-NEXT: movl %eax, %ecx +; SSSE3-NEXT: .LBB2_128: +; SSSE3-NEXT: movzbl %cl, %eax +; SSSE3-NEXT: movd %eax, %xmm3 +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm7[0],xmm3[1],xmm7[1],xmm3[2],xmm7[2],xmm3[3],xmm7[3],xmm3[4],xmm7[4],xmm3[5],xmm7[5],xmm3[6],xmm7[6],xmm3[7],xmm7[7] +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm6[0],xmm3[1],xmm6[1],xmm3[2],xmm6[2],xmm3[3],xmm6[3] +; SSSE3-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1] +; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm4[0] 
+; SSSE3-NEXT: addq $648, %rsp # imm = 0x288 +; SSSE3-NEXT: popq %rbx +; SSSE3-NEXT: popq %r12 +; SSSE3-NEXT: popq %r13 +; SSSE3-NEXT: popq %r14 +; SSSE3-NEXT: popq %r15 +; SSSE3-NEXT: popq %rbp +; SSSE3-NEXT: retq +; +; SSE41-LABEL: v64i8: +; SSE41: # %bb.0: +; SSE41-NEXT: pextrb $1, %xmm4, %ecx +; SSE41-NEXT: pextrb $1, %xmm0, %eax +; SSE41-NEXT: addb %cl, %al +; SSE41-NEXT: movb $-1, %cl +; SSE41-NEXT: jb .LBB2_2 +; SSE41-NEXT: # %bb.1: +; SSE41-NEXT: movl %eax, %ecx +; SSE41-NEXT: .LBB2_2: +; SSE41-NEXT: pushq %rbp +; SSE41-NEXT: pushq %r15 +; SSE41-NEXT: pushq %r14 +; SSE41-NEXT: pushq %r13 +; SSE41-NEXT: pushq %r12 +; SSE41-NEXT: pushq %rbx +; SSE41-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSE41-NEXT: pextrb $0, %xmm4, %ecx +; SSE41-NEXT: pextrb $0, %xmm0, %eax +; SSE41-NEXT: addb %cl, %al +; SSE41-NEXT: movb $-1, %r10b +; SSE41-NEXT: jb .LBB2_4 +; SSE41-NEXT: # %bb.3: +; SSE41-NEXT: movl %eax, %r10d +; SSE41-NEXT: .LBB2_4: +; SSE41-NEXT: pextrb $2, %xmm4, %ecx +; SSE41-NEXT: pextrb $2, %xmm0, %eax +; SSE41-NEXT: addb %cl, %al +; SSE41-NEXT: movb $-1, %cl +; SSE41-NEXT: jb .LBB2_6 +; SSE41-NEXT: # %bb.5: +; SSE41-NEXT: movl %eax, %ecx +; SSE41-NEXT: .LBB2_6: +; SSE41-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSE41-NEXT: pextrb $3, %xmm4, %ecx +; SSE41-NEXT: pextrb $3, %xmm0, %eax +; SSE41-NEXT: addb %cl, %al +; SSE41-NEXT: movb $-1, %cl +; SSE41-NEXT: jb .LBB2_8 +; SSE41-NEXT: # %bb.7: +; SSE41-NEXT: movl %eax, %ecx +; SSE41-NEXT: .LBB2_8: +; SSE41-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSE41-NEXT: pextrb $4, %xmm4, %ecx +; SSE41-NEXT: pextrb $4, %xmm0, %eax +; SSE41-NEXT: addb %cl, %al +; SSE41-NEXT: movb $-1, %cl +; SSE41-NEXT: jb .LBB2_10 +; SSE41-NEXT: # %bb.9: +; SSE41-NEXT: movl %eax, %ecx +; SSE41-NEXT: .LBB2_10: +; SSE41-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSE41-NEXT: pextrb $5, %xmm4, %ecx +; SSE41-NEXT: pextrb $5, %xmm0, %eax +; SSE41-NEXT: addb %cl, %al +; SSE41-NEXT: movb $-1, %cl +; SSE41-NEXT: jb .LBB2_12 +; SSE41-NEXT: # %bb.11: +; SSE41-NEXT: movl %eax, %ecx +; SSE41-NEXT: .LBB2_12: +; SSE41-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSE41-NEXT: pextrb $6, %xmm4, %ecx +; SSE41-NEXT: pextrb $6, %xmm0, %eax +; SSE41-NEXT: addb %cl, %al +; SSE41-NEXT: movb $-1, %cl +; SSE41-NEXT: jb .LBB2_14 +; SSE41-NEXT: # %bb.13: +; SSE41-NEXT: movl %eax, %ecx +; SSE41-NEXT: .LBB2_14: +; SSE41-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSE41-NEXT: pextrb $7, %xmm4, %ecx +; SSE41-NEXT: pextrb $7, %xmm0, %eax +; SSE41-NEXT: addb %cl, %al +; SSE41-NEXT: movb $-1, %cl +; SSE41-NEXT: jb .LBB2_16 +; SSE41-NEXT: # %bb.15: +; SSE41-NEXT: movl %eax, %ecx +; SSE41-NEXT: .LBB2_16: +; SSE41-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSE41-NEXT: pextrb $8, %xmm4, %ecx +; SSE41-NEXT: pextrb $8, %xmm0, %eax +; SSE41-NEXT: addb %cl, %al +; SSE41-NEXT: movb $-1, %cl +; SSE41-NEXT: jb .LBB2_18 +; SSE41-NEXT: # %bb.17: +; SSE41-NEXT: movl %eax, %ecx +; SSE41-NEXT: .LBB2_18: +; SSE41-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSE41-NEXT: pextrb $9, %xmm4, %ecx +; SSE41-NEXT: pextrb $9, %xmm0, %eax +; SSE41-NEXT: addb %cl, %al +; SSE41-NEXT: movb $-1, %cl +; SSE41-NEXT: jb .LBB2_20 +; SSE41-NEXT: # %bb.19: +; SSE41-NEXT: movl %eax, %ecx +; SSE41-NEXT: .LBB2_20: +; SSE41-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSE41-NEXT: pextrb $10, %xmm4, %ecx +; SSE41-NEXT: pextrb $10, %xmm0, %eax +; SSE41-NEXT: addb %cl, %al +; SSE41-NEXT: movb $-1, %cl +; 
SSE41-NEXT: jb .LBB2_22 +; SSE41-NEXT: # %bb.21: +; SSE41-NEXT: movl %eax, %ecx +; SSE41-NEXT: .LBB2_22: +; SSE41-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSE41-NEXT: pextrb $11, %xmm4, %ecx +; SSE41-NEXT: pextrb $11, %xmm0, %eax +; SSE41-NEXT: addb %cl, %al +; SSE41-NEXT: movb $-1, %cl +; SSE41-NEXT: jb .LBB2_24 +; SSE41-NEXT: # %bb.23: +; SSE41-NEXT: movl %eax, %ecx +; SSE41-NEXT: .LBB2_24: +; SSE41-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSE41-NEXT: pextrb $12, %xmm4, %ecx +; SSE41-NEXT: pextrb $12, %xmm0, %eax +; SSE41-NEXT: addb %cl, %al +; SSE41-NEXT: movb $-1, %cl +; SSE41-NEXT: jb .LBB2_26 +; SSE41-NEXT: # %bb.25: +; SSE41-NEXT: movl %eax, %ecx +; SSE41-NEXT: .LBB2_26: +; SSE41-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSE41-NEXT: pextrb $13, %xmm4, %ecx +; SSE41-NEXT: pextrb $13, %xmm0, %eax +; SSE41-NEXT: addb %cl, %al +; SSE41-NEXT: movb $-1, %cl +; SSE41-NEXT: jb .LBB2_28 +; SSE41-NEXT: # %bb.27: +; SSE41-NEXT: movl %eax, %ecx +; SSE41-NEXT: .LBB2_28: +; SSE41-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSE41-NEXT: pextrb $14, %xmm4, %ecx +; SSE41-NEXT: pextrb $14, %xmm0, %eax +; SSE41-NEXT: addb %cl, %al +; SSE41-NEXT: movb $-1, %cl +; SSE41-NEXT: jb .LBB2_30 +; SSE41-NEXT: # %bb.29: +; SSE41-NEXT: movl %eax, %ecx +; SSE41-NEXT: .LBB2_30: +; SSE41-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSE41-NEXT: pextrb $15, %xmm4, %ecx +; SSE41-NEXT: pextrb $15, %xmm0, %eax +; SSE41-NEXT: addb %cl, %al +; SSE41-NEXT: movb $-1, %cl +; SSE41-NEXT: jb .LBB2_32 +; SSE41-NEXT: # %bb.31: +; SSE41-NEXT: movl %eax, %ecx +; SSE41-NEXT: .LBB2_32: +; SSE41-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSE41-NEXT: pextrb $1, %xmm5, %ecx +; SSE41-NEXT: pextrb $1, %xmm1, %eax +; SSE41-NEXT: addb %cl, %al +; SSE41-NEXT: movb $-1, %bl +; SSE41-NEXT: jb .LBB2_34 +; SSE41-NEXT: # %bb.33: +; SSE41-NEXT: movl %eax, %ebx +; SSE41-NEXT: .LBB2_34: +; SSE41-NEXT: pextrb $0, %xmm5, %ecx +; SSE41-NEXT: pextrb $0, %xmm1, %eax +; SSE41-NEXT: addb %cl, %al +; SSE41-NEXT: movb $-1, %bpl +; SSE41-NEXT: jb .LBB2_36 +; SSE41-NEXT: # %bb.35: +; SSE41-NEXT: movl %eax, %ebp +; SSE41-NEXT: .LBB2_36: +; SSE41-NEXT: pextrb $2, %xmm5, %ecx +; SSE41-NEXT: pextrb $2, %xmm1, %eax +; SSE41-NEXT: addb %cl, %al +; SSE41-NEXT: movb $-1, %cl +; SSE41-NEXT: jb .LBB2_38 +; SSE41-NEXT: # %bb.37: +; SSE41-NEXT: movl %eax, %ecx +; SSE41-NEXT: .LBB2_38: +; SSE41-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSE41-NEXT: pextrb $3, %xmm5, %ecx +; SSE41-NEXT: pextrb $3, %xmm1, %eax +; SSE41-NEXT: addb %cl, %al +; SSE41-NEXT: movb $-1, %cl +; SSE41-NEXT: jb .LBB2_40 +; SSE41-NEXT: # %bb.39: +; SSE41-NEXT: movl %eax, %ecx +; SSE41-NEXT: .LBB2_40: +; SSE41-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSE41-NEXT: pextrb $4, %xmm5, %ecx +; SSE41-NEXT: pextrb $4, %xmm1, %eax +; SSE41-NEXT: addb %cl, %al +; SSE41-NEXT: movb $-1, %cl +; SSE41-NEXT: jb .LBB2_42 +; SSE41-NEXT: # %bb.41: +; SSE41-NEXT: movl %eax, %ecx +; SSE41-NEXT: .LBB2_42: +; SSE41-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSE41-NEXT: pextrb $5, %xmm5, %ecx +; SSE41-NEXT: pextrb $5, %xmm1, %eax +; SSE41-NEXT: addb %cl, %al +; SSE41-NEXT: movb $-1, %cl +; SSE41-NEXT: jb .LBB2_44 +; SSE41-NEXT: # %bb.43: +; SSE41-NEXT: movl %eax, %ecx +; SSE41-NEXT: .LBB2_44: +; SSE41-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSE41-NEXT: pextrb $6, %xmm5, %ecx +; SSE41-NEXT: pextrb $6, %xmm1, %eax +; SSE41-NEXT: addb %cl, %al +; SSE41-NEXT: movb 
$-1, %cl +; SSE41-NEXT: jb .LBB2_46 +; SSE41-NEXT: # %bb.45: +; SSE41-NEXT: movl %eax, %ecx +; SSE41-NEXT: .LBB2_46: +; SSE41-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSE41-NEXT: pextrb $7, %xmm5, %ecx +; SSE41-NEXT: pextrb $7, %xmm1, %eax +; SSE41-NEXT: addb %cl, %al +; SSE41-NEXT: movb $-1, %cl +; SSE41-NEXT: jb .LBB2_48 +; SSE41-NEXT: # %bb.47: +; SSE41-NEXT: movl %eax, %ecx +; SSE41-NEXT: .LBB2_48: +; SSE41-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSE41-NEXT: pextrb $8, %xmm5, %ecx +; SSE41-NEXT: pextrb $8, %xmm1, %eax +; SSE41-NEXT: addb %cl, %al +; SSE41-NEXT: movb $-1, %cl +; SSE41-NEXT: jb .LBB2_50 +; SSE41-NEXT: # %bb.49: +; SSE41-NEXT: movl %eax, %ecx +; SSE41-NEXT: .LBB2_50: +; SSE41-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSE41-NEXT: pextrb $9, %xmm5, %ecx +; SSE41-NEXT: pextrb $9, %xmm1, %eax +; SSE41-NEXT: addb %cl, %al +; SSE41-NEXT: movb $-1, %cl +; SSE41-NEXT: jb .LBB2_52 +; SSE41-NEXT: # %bb.51: +; SSE41-NEXT: movl %eax, %ecx +; SSE41-NEXT: .LBB2_52: +; SSE41-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSE41-NEXT: pextrb $10, %xmm5, %ecx +; SSE41-NEXT: pextrb $10, %xmm1, %eax +; SSE41-NEXT: addb %cl, %al +; SSE41-NEXT: movb $-1, %cl +; SSE41-NEXT: jb .LBB2_54 +; SSE41-NEXT: # %bb.53: +; SSE41-NEXT: movl %eax, %ecx +; SSE41-NEXT: .LBB2_54: +; SSE41-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSE41-NEXT: pextrb $11, %xmm5, %ecx +; SSE41-NEXT: pextrb $11, %xmm1, %eax +; SSE41-NEXT: addb %cl, %al +; SSE41-NEXT: movb $-1, %cl +; SSE41-NEXT: jb .LBB2_56 +; SSE41-NEXT: # %bb.55: +; SSE41-NEXT: movl %eax, %ecx +; SSE41-NEXT: .LBB2_56: +; SSE41-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSE41-NEXT: pextrb $12, %xmm5, %ecx +; SSE41-NEXT: pextrb $12, %xmm1, %eax +; SSE41-NEXT: addb %cl, %al +; SSE41-NEXT: movb $-1, %cl +; SSE41-NEXT: jb .LBB2_58 +; SSE41-NEXT: # %bb.57: +; SSE41-NEXT: movl %eax, %ecx +; SSE41-NEXT: .LBB2_58: +; SSE41-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSE41-NEXT: pextrb $13, %xmm5, %ecx +; SSE41-NEXT: pextrb $13, %xmm1, %eax +; SSE41-NEXT: addb %cl, %al +; SSE41-NEXT: movb $-1, %cl +; SSE41-NEXT: jb .LBB2_60 +; SSE41-NEXT: # %bb.59: +; SSE41-NEXT: movl %eax, %ecx +; SSE41-NEXT: .LBB2_60: +; SSE41-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSE41-NEXT: pextrb $14, %xmm5, %ecx +; SSE41-NEXT: pextrb $14, %xmm1, %eax +; SSE41-NEXT: addb %cl, %al +; SSE41-NEXT: movb $-1, %cl +; SSE41-NEXT: jb .LBB2_62 +; SSE41-NEXT: # %bb.61: +; SSE41-NEXT: movl %eax, %ecx +; SSE41-NEXT: .LBB2_62: +; SSE41-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSE41-NEXT: pextrb $15, %xmm5, %ecx +; SSE41-NEXT: pextrb $15, %xmm1, %eax +; SSE41-NEXT: addb %cl, %al +; SSE41-NEXT: movb $-1, %cl +; SSE41-NEXT: jb .LBB2_64 +; SSE41-NEXT: # %bb.63: +; SSE41-NEXT: movl %eax, %ecx +; SSE41-NEXT: .LBB2_64: +; SSE41-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSE41-NEXT: pextrb $1, %xmm6, %ecx +; SSE41-NEXT: pextrb $1, %xmm2, %eax +; SSE41-NEXT: addb %cl, %al +; SSE41-NEXT: movb $-1, %dil +; SSE41-NEXT: jb .LBB2_66 +; SSE41-NEXT: # %bb.65: +; SSE41-NEXT: movl %eax, %edi +; SSE41-NEXT: .LBB2_66: +; SSE41-NEXT: pextrb $0, %xmm6, %ecx +; SSE41-NEXT: pextrb $0, %xmm2, %eax +; SSE41-NEXT: addb %cl, %al +; SSE41-NEXT: movb $-1, %cl +; SSE41-NEXT: jb .LBB2_68 +; SSE41-NEXT: # %bb.67: +; SSE41-NEXT: movl %eax, %ecx +; SSE41-NEXT: .LBB2_68: +; SSE41-NEXT: pextrb $2, %xmm6, %edx +; SSE41-NEXT: pextrb $2, %xmm2, %eax +; SSE41-NEXT: addb %dl, %al +; 
SSE41-NEXT: movb $-1, %r8b +; SSE41-NEXT: jb .LBB2_70 +; SSE41-NEXT: # %bb.69: +; SSE41-NEXT: movl %eax, %r8d +; SSE41-NEXT: .LBB2_70: +; SSE41-NEXT: pextrb $3, %xmm6, %edx +; SSE41-NEXT: pextrb $3, %xmm2, %eax +; SSE41-NEXT: addb %dl, %al +; SSE41-NEXT: movb $-1, %sil +; SSE41-NEXT: jb .LBB2_72 +; SSE41-NEXT: # %bb.71: +; SSE41-NEXT: movl %eax, %esi +; SSE41-NEXT: .LBB2_72: +; SSE41-NEXT: pextrb $4, %xmm6, %eax +; SSE41-NEXT: pextrb $4, %xmm2, %edx +; SSE41-NEXT: addb %al, %dl +; SSE41-NEXT: movb $-1, %r15b +; SSE41-NEXT: jb .LBB2_74 +; SSE41-NEXT: # %bb.73: +; SSE41-NEXT: movl %edx, %r15d +; SSE41-NEXT: .LBB2_74: +; SSE41-NEXT: pextrb $5, %xmm6, %eax +; SSE41-NEXT: pextrb $5, %xmm2, %edx +; SSE41-NEXT: addb %al, %dl +; SSE41-NEXT: movb $-1, %r14b +; SSE41-NEXT: jb .LBB2_76 +; SSE41-NEXT: # %bb.75: +; SSE41-NEXT: movl %edx, %r14d +; SSE41-NEXT: .LBB2_76: +; SSE41-NEXT: pextrb $6, %xmm6, %eax +; SSE41-NEXT: pextrb $6, %xmm2, %edx +; SSE41-NEXT: addb %al, %dl +; SSE41-NEXT: movb $-1, %r11b +; SSE41-NEXT: jb .LBB2_78 +; SSE41-NEXT: # %bb.77: +; SSE41-NEXT: movl %edx, %r11d +; SSE41-NEXT: .LBB2_78: +; SSE41-NEXT: pextrb $7, %xmm6, %eax +; SSE41-NEXT: pextrb $7, %xmm2, %edx +; SSE41-NEXT: addb %al, %dl +; SSE41-NEXT: movb $-1, %r9b +; SSE41-NEXT: jb .LBB2_80 +; SSE41-NEXT: # %bb.79: +; SSE41-NEXT: movl %edx, %r9d +; SSE41-NEXT: .LBB2_80: +; SSE41-NEXT: pextrb $8, %xmm6, %eax +; SSE41-NEXT: pextrb $8, %xmm2, %edx +; SSE41-NEXT: addb %al, %dl +; SSE41-NEXT: movb $-1, %r13b +; SSE41-NEXT: jb .LBB2_82 +; SSE41-NEXT: # %bb.81: +; SSE41-NEXT: movl %edx, %r13d +; SSE41-NEXT: .LBB2_82: +; SSE41-NEXT: pextrb $9, %xmm6, %eax +; SSE41-NEXT: pextrb $9, %xmm2, %edx +; SSE41-NEXT: addb %al, %dl +; SSE41-NEXT: movb $-1, %r12b +; SSE41-NEXT: jb .LBB2_84 +; SSE41-NEXT: # %bb.83: +; SSE41-NEXT: movl %edx, %r12d +; SSE41-NEXT: .LBB2_84: +; SSE41-NEXT: pextrb $10, %xmm6, %eax +; SSE41-NEXT: pextrb $10, %xmm2, %edx +; SSE41-NEXT: addb %al, %dl +; SSE41-NEXT: movb $-1, %al +; SSE41-NEXT: jb .LBB2_86 +; SSE41-NEXT: # %bb.85: +; SSE41-NEXT: movl %edx, %eax +; SSE41-NEXT: .LBB2_86: +; SSE41-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSE41-NEXT: pextrb $11, %xmm6, %eax +; SSE41-NEXT: pextrb $11, %xmm2, %edx +; SSE41-NEXT: addb %al, %dl +; SSE41-NEXT: movb $-1, %al +; SSE41-NEXT: jb .LBB2_88 +; SSE41-NEXT: # %bb.87: +; SSE41-NEXT: movl %edx, %eax +; SSE41-NEXT: .LBB2_88: +; SSE41-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSE41-NEXT: pextrb $12, %xmm6, %eax +; SSE41-NEXT: pextrb $12, %xmm2, %edx +; SSE41-NEXT: addb %al, %dl +; SSE41-NEXT: movb $-1, %al +; SSE41-NEXT: jb .LBB2_90 +; SSE41-NEXT: # %bb.89: +; SSE41-NEXT: movl %edx, %eax +; SSE41-NEXT: .LBB2_90: +; SSE41-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSE41-NEXT: pextrb $13, %xmm6, %eax +; SSE41-NEXT: pextrb $13, %xmm2, %edx +; SSE41-NEXT: addb %al, %dl +; SSE41-NEXT: movb $-1, %al +; SSE41-NEXT: jb .LBB2_92 +; SSE41-NEXT: # %bb.91: +; SSE41-NEXT: movl %edx, %eax +; SSE41-NEXT: .LBB2_92: +; SSE41-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSE41-NEXT: pextrb $14, %xmm6, %eax +; SSE41-NEXT: pextrb $14, %xmm2, %edx +; SSE41-NEXT: addb %al, %dl +; SSE41-NEXT: movb $-1, %al +; SSE41-NEXT: jb .LBB2_94 +; SSE41-NEXT: # %bb.93: +; SSE41-NEXT: movl %edx, %eax +; SSE41-NEXT: .LBB2_94: +; SSE41-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSE41-NEXT: movzbl %r10b, %edx +; SSE41-NEXT: movzbl %bpl, %r10d +; SSE41-NEXT: movzbl %cl, %ebp +; SSE41-NEXT: pextrb $15, %xmm6, %eax +; SSE41-NEXT: pextrb 
$15, %xmm2, %ecx +; SSE41-NEXT: addb %al, %cl +; SSE41-NEXT: movb $-1, %al +; SSE41-NEXT: jb .LBB2_96 +; SSE41-NEXT: # %bb.95: +; SSE41-NEXT: movl %ecx, %eax +; SSE41-NEXT: .LBB2_96: +; SSE41-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSE41-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload +; SSE41-NEXT: movd %edx, %xmm0 +; SSE41-NEXT: movzbl %bl, %edx +; SSE41-NEXT: movd %r10d, %xmm1 +; SSE41-NEXT: movzbl %dil, %edi +; SSE41-NEXT: movd %ebp, %xmm2 +; SSE41-NEXT: pextrb $1, %xmm7, %eax +; SSE41-NEXT: pextrb $1, %xmm3, %ebp +; SSE41-NEXT: addb %al, %bpl +; SSE41-NEXT: movb $-1, %bl +; SSE41-NEXT: jb .LBB2_98 +; SSE41-NEXT: # %bb.97: +; SSE41-NEXT: movl %ebp, %ebx +; SSE41-NEXT: .LBB2_98: +; SSE41-NEXT: pinsrb $1, %ecx, %xmm0 +; SSE41-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload +; SSE41-NEXT: pinsrb $1, %edx, %xmm1 +; SSE41-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ebp # 1-byte Folded Reload +; SSE41-NEXT: pinsrb $1, %edi, %xmm2 +; SSE41-NEXT: movzbl %r8b, %edx +; SSE41-NEXT: movzbl %bl, %edi +; SSE41-NEXT: pextrb $0, %xmm7, %ebx +; SSE41-NEXT: pextrb $0, %xmm3, %eax +; SSE41-NEXT: addb %bl, %al +; SSE41-NEXT: movb $-1, %bl +; SSE41-NEXT: jb .LBB2_100 +; SSE41-NEXT: # %bb.99: +; SSE41-NEXT: movl %eax, %ebx +; SSE41-NEXT: .LBB2_100: +; SSE41-NEXT: pinsrb $2, %ecx, %xmm0 +; SSE41-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload +; SSE41-NEXT: pinsrb $2, %ebp, %xmm1 +; SSE41-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ebp # 1-byte Folded Reload +; SSE41-NEXT: pinsrb $2, %edx, %xmm2 +; SSE41-NEXT: movzbl %sil, %esi +; SSE41-NEXT: movzbl %bl, %eax +; SSE41-NEXT: movd %eax, %xmm4 +; SSE41-NEXT: pinsrb $1, %edi, %xmm4 +; SSE41-NEXT: pextrb $2, %xmm7, %edx +; SSE41-NEXT: pextrb $2, %xmm3, %eax +; SSE41-NEXT: addb %dl, %al +; SSE41-NEXT: movb $-1, %dl +; SSE41-NEXT: jb .LBB2_102 +; SSE41-NEXT: # %bb.101: +; SSE41-NEXT: movl %eax, %edx +; SSE41-NEXT: .LBB2_102: +; SSE41-NEXT: pinsrb $3, %ecx, %xmm0 +; SSE41-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload +; SSE41-NEXT: pinsrb $3, %ebp, %xmm1 +; SSE41-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 1-byte Folded Reload +; SSE41-NEXT: pinsrb $3, %esi, %xmm2 +; SSE41-NEXT: movzbl %r15b, %esi +; SSE41-NEXT: movzbl %dl, %eax +; SSE41-NEXT: pinsrb $2, %eax, %xmm4 +; SSE41-NEXT: pextrb $3, %xmm7, %edx +; SSE41-NEXT: pextrb $3, %xmm3, %eax +; SSE41-NEXT: addb %dl, %al +; SSE41-NEXT: movb $-1, %dl +; SSE41-NEXT: jb .LBB2_104 +; SSE41-NEXT: # %bb.103: +; SSE41-NEXT: movl %eax, %edx +; SSE41-NEXT: .LBB2_104: +; SSE41-NEXT: pinsrb $4, %ecx, %xmm0 +; SSE41-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; SSE41-NEXT: pinsrb $4, %edi, %xmm1 +; SSE41-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload +; SSE41-NEXT: pinsrb $4, %esi, %xmm2 +; SSE41-NEXT: movzbl %r14b, %esi +; SSE41-NEXT: movzbl %dl, %edx +; SSE41-NEXT: pinsrb $3, %edx, %xmm4 +; SSE41-NEXT: pextrb $4, %xmm7, %edi +; SSE41-NEXT: pextrb $4, %xmm3, %edx +; SSE41-NEXT: addb %dil, %dl +; SSE41-NEXT: movb $-1, %bl +; SSE41-NEXT: jb .LBB2_106 +; SSE41-NEXT: # %bb.105: +; SSE41-NEXT: movl %edx, %ebx +; SSE41-NEXT: .LBB2_106: +; SSE41-NEXT: pinsrb $5, %eax, %xmm0 +; SSE41-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; SSE41-NEXT: pinsrb $5, %ecx, %xmm1 +; SSE41-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload +; SSE41-NEXT: pinsrb $5, %esi, %xmm2 +; SSE41-NEXT: movzbl %r11b, %edx +; SSE41-NEXT: movzbl %bl, %esi +; SSE41-NEXT: pinsrb $4, %esi, %xmm4 +; 
SSE41-NEXT: pextrb $5, %xmm7, %edi +; SSE41-NEXT: pextrb $5, %xmm3, %esi +; SSE41-NEXT: addb %dil, %sil +; SSE41-NEXT: movb $-1, %bl +; SSE41-NEXT: jb .LBB2_108 +; SSE41-NEXT: # %bb.107: +; SSE41-NEXT: movl %esi, %ebx +; SSE41-NEXT: .LBB2_108: +; SSE41-NEXT: pinsrb $6, %eax, %xmm0 +; SSE41-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; SSE41-NEXT: pinsrb $6, %ecx, %xmm1 +; SSE41-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload +; SSE41-NEXT: pinsrb $6, %edx, %xmm2 +; SSE41-NEXT: movzbl %r9b, %edx +; SSE41-NEXT: movzbl %bl, %esi +; SSE41-NEXT: pinsrb $5, %esi, %xmm4 +; SSE41-NEXT: pextrb $6, %xmm7, %edi +; SSE41-NEXT: pextrb $6, %xmm3, %esi +; SSE41-NEXT: addb %dil, %sil +; SSE41-NEXT: movb $-1, %bl +; SSE41-NEXT: jb .LBB2_110 +; SSE41-NEXT: # %bb.109: +; SSE41-NEXT: movl %esi, %ebx +; SSE41-NEXT: .LBB2_110: +; SSE41-NEXT: pinsrb $7, %eax, %xmm0 +; SSE41-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; SSE41-NEXT: pinsrb $7, %ecx, %xmm1 +; SSE41-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload +; SSE41-NEXT: pinsrb $7, %edx, %xmm2 +; SSE41-NEXT: movzbl %r13b, %edx +; SSE41-NEXT: movzbl %bl, %esi +; SSE41-NEXT: pinsrb $6, %esi, %xmm4 +; SSE41-NEXT: pextrb $7, %xmm7, %edi +; SSE41-NEXT: pextrb $7, %xmm3, %esi +; SSE41-NEXT: addb %dil, %sil +; SSE41-NEXT: movb $-1, %bl +; SSE41-NEXT: jb .LBB2_112 +; SSE41-NEXT: # %bb.111: +; SSE41-NEXT: movl %esi, %ebx +; SSE41-NEXT: .LBB2_112: +; SSE41-NEXT: pinsrb $8, %eax, %xmm0 +; SSE41-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; SSE41-NEXT: pinsrb $8, %ecx, %xmm1 +; SSE41-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload +; SSE41-NEXT: pinsrb $8, %edx, %xmm2 +; SSE41-NEXT: movzbl %r12b, %edx +; SSE41-NEXT: movzbl %bl, %esi +; SSE41-NEXT: pinsrb $7, %esi, %xmm4 +; SSE41-NEXT: pextrb $8, %xmm7, %edi +; SSE41-NEXT: pextrb $8, %xmm3, %esi +; SSE41-NEXT: addb %dil, %sil +; SSE41-NEXT: movb $-1, %bl +; SSE41-NEXT: jb .LBB2_114 +; SSE41-NEXT: # %bb.113: +; SSE41-NEXT: movl %esi, %ebx +; SSE41-NEXT: .LBB2_114: +; SSE41-NEXT: pinsrb $9, %eax, %xmm0 +; SSE41-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; SSE41-NEXT: pinsrb $9, %ecx, %xmm1 +; SSE41-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload +; SSE41-NEXT: pinsrb $9, %edx, %xmm2 +; SSE41-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %edx # 1-byte Folded Reload +; SSE41-NEXT: movzbl %bl, %esi +; SSE41-NEXT: pinsrb $8, %esi, %xmm4 +; SSE41-NEXT: pextrb $9, %xmm7, %edi +; SSE41-NEXT: pextrb $9, %xmm3, %esi +; SSE41-NEXT: addb %dil, %sil +; SSE41-NEXT: movb $-1, %bl +; SSE41-NEXT: jb .LBB2_116 +; SSE41-NEXT: # %bb.115: +; SSE41-NEXT: movl %esi, %ebx +; SSE41-NEXT: .LBB2_116: +; SSE41-NEXT: pinsrb $10, %eax, %xmm0 +; SSE41-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; SSE41-NEXT: pinsrb $10, %ecx, %xmm1 +; SSE41-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload +; SSE41-NEXT: pinsrb $10, %edx, %xmm2 +; SSE41-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %edx # 1-byte Folded Reload +; SSE41-NEXT: movzbl %bl, %esi +; SSE41-NEXT: pinsrb $9, %esi, %xmm4 +; SSE41-NEXT: pextrb $10, %xmm7, %edi +; SSE41-NEXT: pextrb $10, %xmm3, %esi +; SSE41-NEXT: addb %dil, %sil +; SSE41-NEXT: movb $-1, %bl +; SSE41-NEXT: jb .LBB2_118 +; SSE41-NEXT: # %bb.117: +; SSE41-NEXT: movl %esi, %ebx +; SSE41-NEXT: .LBB2_118: +; SSE41-NEXT: pinsrb $11, %eax, %xmm0 +; SSE41-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; SSE41-NEXT: pinsrb 
$11, %ecx, %xmm1 +; SSE41-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload +; SSE41-NEXT: pinsrb $11, %edx, %xmm2 +; SSE41-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %edx # 1-byte Folded Reload +; SSE41-NEXT: movzbl %bl, %esi +; SSE41-NEXT: pinsrb $10, %esi, %xmm4 +; SSE41-NEXT: pextrb $11, %xmm7, %edi +; SSE41-NEXT: pextrb $11, %xmm3, %esi +; SSE41-NEXT: addb %dil, %sil +; SSE41-NEXT: movb $-1, %bl +; SSE41-NEXT: jb .LBB2_120 +; SSE41-NEXT: # %bb.119: +; SSE41-NEXT: movl %esi, %ebx +; SSE41-NEXT: .LBB2_120: +; SSE41-NEXT: pinsrb $12, %eax, %xmm0 +; SSE41-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; SSE41-NEXT: pinsrb $12, %ecx, %xmm1 +; SSE41-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload +; SSE41-NEXT: pinsrb $12, %edx, %xmm2 +; SSE41-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %edx # 1-byte Folded Reload +; SSE41-NEXT: movzbl %bl, %esi +; SSE41-NEXT: pinsrb $11, %esi, %xmm4 +; SSE41-NEXT: pextrb $12, %xmm7, %edi +; SSE41-NEXT: pextrb $12, %xmm3, %esi +; SSE41-NEXT: addb %dil, %sil +; SSE41-NEXT: movb $-1, %bl +; SSE41-NEXT: jb .LBB2_122 +; SSE41-NEXT: # %bb.121: +; SSE41-NEXT: movl %esi, %ebx +; SSE41-NEXT: .LBB2_122: +; SSE41-NEXT: pinsrb $13, %eax, %xmm0 +; SSE41-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; SSE41-NEXT: pinsrb $13, %ecx, %xmm1 +; SSE41-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload +; SSE41-NEXT: pinsrb $13, %edx, %xmm2 +; SSE41-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %edx # 1-byte Folded Reload +; SSE41-NEXT: movzbl %bl, %esi +; SSE41-NEXT: pinsrb $12, %esi, %xmm4 +; SSE41-NEXT: pextrb $13, %xmm7, %edi +; SSE41-NEXT: pextrb $13, %xmm3, %esi +; SSE41-NEXT: addb %dil, %sil +; SSE41-NEXT: movb $-1, %bl +; SSE41-NEXT: jb .LBB2_124 +; SSE41-NEXT: # %bb.123: +; SSE41-NEXT: movl %esi, %ebx +; SSE41-NEXT: .LBB2_124: +; SSE41-NEXT: pinsrb $14, %eax, %xmm0 +; SSE41-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; SSE41-NEXT: pinsrb $14, %ecx, %xmm1 +; SSE41-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload +; SSE41-NEXT: pinsrb $14, %edx, %xmm2 +; SSE41-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %edx # 1-byte Folded Reload +; SSE41-NEXT: movzbl %bl, %esi +; SSE41-NEXT: pinsrb $13, %esi, %xmm4 +; SSE41-NEXT: pextrb $14, %xmm7, %edi +; SSE41-NEXT: pextrb $14, %xmm3, %esi +; SSE41-NEXT: addb %dil, %sil +; SSE41-NEXT: movb $-1, %bl +; SSE41-NEXT: jb .LBB2_126 +; SSE41-NEXT: # %bb.125: +; SSE41-NEXT: movl %esi, %ebx +; SSE41-NEXT: .LBB2_126: +; SSE41-NEXT: pinsrb $15, %eax, %xmm0 +; SSE41-NEXT: pinsrb $15, %ecx, %xmm1 +; SSE41-NEXT: pinsrb $15, %edx, %xmm2 +; SSE41-NEXT: movzbl %bl, %eax +; SSE41-NEXT: pinsrb $14, %eax, %xmm4 +; SSE41-NEXT: pextrb $15, %xmm7, %ecx +; SSE41-NEXT: pextrb $15, %xmm3, %eax +; SSE41-NEXT: addb %cl, %al +; SSE41-NEXT: movb $-1, %cl +; SSE41-NEXT: popq %rbx +; SSE41-NEXT: popq %r12 +; SSE41-NEXT: popq %r13 +; SSE41-NEXT: popq %r14 +; SSE41-NEXT: popq %r15 +; SSE41-NEXT: popq %rbp +; SSE41-NEXT: jb .LBB2_128 +; SSE41-NEXT: # %bb.127: +; SSE41-NEXT: movl %eax, %ecx +; SSE41-NEXT: .LBB2_128: +; SSE41-NEXT: movzbl %cl, %eax +; SSE41-NEXT: pinsrb $15, %eax, %xmm4 +; SSE41-NEXT: movdqa %xmm4, %xmm3 +; SSE41-NEXT: retq +; +; AVX1-LABEL: v64i8: +; AVX1: # %bb.0: +; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm4 +; AVX1-NEXT: vpextrb $1, %xmm4, %ecx +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5 +; AVX1-NEXT: vpextrb $1, %xmm5, %eax +; AVX1-NEXT: addb %cl, %al +; AVX1-NEXT: movb $-1, %cl +; AVX1-NEXT: jb .LBB2_2 +; AVX1-NEXT: # %bb.1: +; 
AVX1-NEXT: movl %eax, %ecx +; AVX1-NEXT: .LBB2_2: +; AVX1-NEXT: pushq %rbp +; AVX1-NEXT: pushq %r15 +; AVX1-NEXT: pushq %r14 +; AVX1-NEXT: pushq %r13 +; AVX1-NEXT: pushq %r12 +; AVX1-NEXT: pushq %rbx +; AVX1-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; AVX1-NEXT: vpextrb $0, %xmm4, %ecx +; AVX1-NEXT: vpextrb $0, %xmm5, %eax +; AVX1-NEXT: addb %cl, %al +; AVX1-NEXT: movb $-1, %cl +; AVX1-NEXT: jb .LBB2_4 +; AVX1-NEXT: # %bb.3: +; AVX1-NEXT: movl %eax, %ecx +; AVX1-NEXT: .LBB2_4: +; AVX1-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; AVX1-NEXT: vpextrb $2, %xmm4, %ecx +; AVX1-NEXT: vpextrb $2, %xmm5, %eax +; AVX1-NEXT: addb %cl, %al +; AVX1-NEXT: movb $-1, %cl +; AVX1-NEXT: jb .LBB2_6 +; AVX1-NEXT: # %bb.5: +; AVX1-NEXT: movl %eax, %ecx +; AVX1-NEXT: .LBB2_6: +; AVX1-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; AVX1-NEXT: vpextrb $3, %xmm4, %ecx +; AVX1-NEXT: vpextrb $3, %xmm5, %eax +; AVX1-NEXT: addb %cl, %al +; AVX1-NEXT: movb $-1, %cl +; AVX1-NEXT: jb .LBB2_8 +; AVX1-NEXT: # %bb.7: +; AVX1-NEXT: movl %eax, %ecx +; AVX1-NEXT: .LBB2_8: +; AVX1-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; AVX1-NEXT: vpextrb $4, %xmm4, %ecx +; AVX1-NEXT: vpextrb $4, %xmm5, %eax +; AVX1-NEXT: addb %cl, %al +; AVX1-NEXT: movb $-1, %cl +; AVX1-NEXT: jb .LBB2_10 +; AVX1-NEXT: # %bb.9: +; AVX1-NEXT: movl %eax, %ecx +; AVX1-NEXT: .LBB2_10: +; AVX1-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; AVX1-NEXT: vpextrb $5, %xmm4, %ecx +; AVX1-NEXT: vpextrb $5, %xmm5, %eax +; AVX1-NEXT: addb %cl, %al +; AVX1-NEXT: movb $-1, %cl +; AVX1-NEXT: jb .LBB2_12 +; AVX1-NEXT: # %bb.11: +; AVX1-NEXT: movl %eax, %ecx +; AVX1-NEXT: .LBB2_12: +; AVX1-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; AVX1-NEXT: vpextrb $6, %xmm4, %ecx +; AVX1-NEXT: vpextrb $6, %xmm5, %eax +; AVX1-NEXT: addb %cl, %al +; AVX1-NEXT: movb $-1, %cl +; AVX1-NEXT: jb .LBB2_14 +; AVX1-NEXT: # %bb.13: +; AVX1-NEXT: movl %eax, %ecx +; AVX1-NEXT: .LBB2_14: +; AVX1-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; AVX1-NEXT: vpextrb $7, %xmm4, %ecx +; AVX1-NEXT: vpextrb $7, %xmm5, %eax +; AVX1-NEXT: addb %cl, %al +; AVX1-NEXT: movb $-1, %cl +; AVX1-NEXT: jb .LBB2_16 +; AVX1-NEXT: # %bb.15: +; AVX1-NEXT: movl %eax, %ecx +; AVX1-NEXT: .LBB2_16: +; AVX1-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; AVX1-NEXT: vpextrb $8, %xmm4, %ecx +; AVX1-NEXT: vpextrb $8, %xmm5, %eax +; AVX1-NEXT: addb %cl, %al +; AVX1-NEXT: movb $-1, %cl +; AVX1-NEXT: jb .LBB2_18 +; AVX1-NEXT: # %bb.17: +; AVX1-NEXT: movl %eax, %ecx +; AVX1-NEXT: .LBB2_18: +; AVX1-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; AVX1-NEXT: vpextrb $9, %xmm4, %ecx +; AVX1-NEXT: vpextrb $9, %xmm5, %eax +; AVX1-NEXT: addb %cl, %al +; AVX1-NEXT: movb $-1, %cl +; AVX1-NEXT: jb .LBB2_20 +; AVX1-NEXT: # %bb.19: +; AVX1-NEXT: movl %eax, %ecx +; AVX1-NEXT: .LBB2_20: +; AVX1-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; AVX1-NEXT: vpextrb $10, %xmm4, %ecx +; AVX1-NEXT: vpextrb $10, %xmm5, %eax +; AVX1-NEXT: addb %cl, %al +; AVX1-NEXT: movb $-1, %cl +; AVX1-NEXT: jb .LBB2_22 +; AVX1-NEXT: # %bb.21: +; AVX1-NEXT: movl %eax, %ecx +; AVX1-NEXT: .LBB2_22: +; AVX1-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; AVX1-NEXT: vpextrb $11, %xmm4, %ecx +; AVX1-NEXT: vpextrb $11, %xmm5, %eax +; AVX1-NEXT: addb %cl, %al +; AVX1-NEXT: movb $-1, %cl +; AVX1-NEXT: jb .LBB2_24 +; AVX1-NEXT: # %bb.23: +; AVX1-NEXT: movl %eax, %ecx +; AVX1-NEXT: .LBB2_24: +; AVX1-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte 
Spill +; AVX1-NEXT: vpextrb $12, %xmm4, %ecx +; AVX1-NEXT: vpextrb $12, %xmm5, %eax +; AVX1-NEXT: addb %cl, %al +; AVX1-NEXT: movb $-1, %cl +; AVX1-NEXT: jb .LBB2_26 +; AVX1-NEXT: # %bb.25: +; AVX1-NEXT: movl %eax, %ecx +; AVX1-NEXT: .LBB2_26: +; AVX1-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; AVX1-NEXT: vpextrb $13, %xmm4, %ecx +; AVX1-NEXT: vpextrb $13, %xmm5, %eax +; AVX1-NEXT: addb %cl, %al +; AVX1-NEXT: movb $-1, %cl +; AVX1-NEXT: jb .LBB2_28 +; AVX1-NEXT: # %bb.27: +; AVX1-NEXT: movl %eax, %ecx +; AVX1-NEXT: .LBB2_28: +; AVX1-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; AVX1-NEXT: vpextrb $14, %xmm4, %ecx +; AVX1-NEXT: vpextrb $14, %xmm5, %eax +; AVX1-NEXT: addb %cl, %al +; AVX1-NEXT: movb $-1, %cl +; AVX1-NEXT: jb .LBB2_30 +; AVX1-NEXT: # %bb.29: +; AVX1-NEXT: movl %eax, %ecx +; AVX1-NEXT: .LBB2_30: +; AVX1-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; AVX1-NEXT: vpextrb $15, %xmm4, %ecx +; AVX1-NEXT: vpextrb $15, %xmm5, %eax +; AVX1-NEXT: addb %cl, %al +; AVX1-NEXT: movb $-1, %cl +; AVX1-NEXT: jb .LBB2_32 +; AVX1-NEXT: # %bb.31: +; AVX1-NEXT: movl %eax, %ecx +; AVX1-NEXT: .LBB2_32: +; AVX1-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; AVX1-NEXT: vpextrb $1, %xmm2, %ecx +; AVX1-NEXT: vpextrb $1, %xmm0, %eax +; AVX1-NEXT: addb %cl, %al +; AVX1-NEXT: movb $-1, %r11b +; AVX1-NEXT: jb .LBB2_34 +; AVX1-NEXT: # %bb.33: +; AVX1-NEXT: movl %eax, %r11d +; AVX1-NEXT: .LBB2_34: +; AVX1-NEXT: vpextrb $0, %xmm2, %ecx +; AVX1-NEXT: vpextrb $0, %xmm0, %eax +; AVX1-NEXT: addb %cl, %al +; AVX1-NEXT: movb $-1, %r8b +; AVX1-NEXT: jb .LBB2_36 +; AVX1-NEXT: # %bb.35: +; AVX1-NEXT: movl %eax, %r8d +; AVX1-NEXT: .LBB2_36: +; AVX1-NEXT: vpextrb $2, %xmm2, %ecx +; AVX1-NEXT: vpextrb $2, %xmm0, %eax +; AVX1-NEXT: addb %cl, %al +; AVX1-NEXT: movb $-1, %cl +; AVX1-NEXT: jb .LBB2_38 +; AVX1-NEXT: # %bb.37: +; AVX1-NEXT: movl %eax, %ecx +; AVX1-NEXT: .LBB2_38: +; AVX1-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; AVX1-NEXT: vpextrb $3, %xmm2, %ecx +; AVX1-NEXT: vpextrb $3, %xmm0, %eax +; AVX1-NEXT: addb %cl, %al +; AVX1-NEXT: movb $-1, %cl +; AVX1-NEXT: jb .LBB2_40 +; AVX1-NEXT: # %bb.39: +; AVX1-NEXT: movl %eax, %ecx +; AVX1-NEXT: .LBB2_40: +; AVX1-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; AVX1-NEXT: vpextrb $4, %xmm2, %ecx +; AVX1-NEXT: vpextrb $4, %xmm0, %eax +; AVX1-NEXT: addb %cl, %al +; AVX1-NEXT: movb $-1, %cl +; AVX1-NEXT: jb .LBB2_42 +; AVX1-NEXT: # %bb.41: +; AVX1-NEXT: movl %eax, %ecx +; AVX1-NEXT: .LBB2_42: +; AVX1-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; AVX1-NEXT: vpextrb $5, %xmm2, %ecx +; AVX1-NEXT: vpextrb $5, %xmm0, %eax +; AVX1-NEXT: addb %cl, %al +; AVX1-NEXT: movb $-1, %cl +; AVX1-NEXT: jb .LBB2_44 +; AVX1-NEXT: # %bb.43: +; AVX1-NEXT: movl %eax, %ecx +; AVX1-NEXT: .LBB2_44: +; AVX1-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; AVX1-NEXT: vpextrb $6, %xmm2, %ecx +; AVX1-NEXT: vpextrb $6, %xmm0, %eax +; AVX1-NEXT: addb %cl, %al +; AVX1-NEXT: movb $-1, %cl +; AVX1-NEXT: jb .LBB2_46 +; AVX1-NEXT: # %bb.45: +; AVX1-NEXT: movl %eax, %ecx +; AVX1-NEXT: .LBB2_46: +; AVX1-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; AVX1-NEXT: vpextrb $7, %xmm2, %ecx +; AVX1-NEXT: vpextrb $7, %xmm0, %eax +; AVX1-NEXT: addb %cl, %al +; AVX1-NEXT: movb $-1, %cl +; AVX1-NEXT: jb .LBB2_48 +; AVX1-NEXT: # %bb.47: +; AVX1-NEXT: movl %eax, %ecx +; AVX1-NEXT: .LBB2_48: +; AVX1-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; AVX1-NEXT: vpextrb $8, %xmm2, %ecx +; AVX1-NEXT: vpextrb $8, 
%xmm0, %eax +; AVX1-NEXT: addb %cl, %al +; AVX1-NEXT: movb $-1, %cl +; AVX1-NEXT: jb .LBB2_50 +; AVX1-NEXT: # %bb.49: +; AVX1-NEXT: movl %eax, %ecx +; AVX1-NEXT: .LBB2_50: +; AVX1-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; AVX1-NEXT: vpextrb $9, %xmm2, %ecx +; AVX1-NEXT: vpextrb $9, %xmm0, %eax +; AVX1-NEXT: addb %cl, %al +; AVX1-NEXT: movb $-1, %cl +; AVX1-NEXT: jb .LBB2_52 +; AVX1-NEXT: # %bb.51: +; AVX1-NEXT: movl %eax, %ecx +; AVX1-NEXT: .LBB2_52: +; AVX1-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; AVX1-NEXT: vpextrb $10, %xmm2, %ecx +; AVX1-NEXT: vpextrb $10, %xmm0, %eax +; AVX1-NEXT: addb %cl, %al +; AVX1-NEXT: movb $-1, %cl +; AVX1-NEXT: jb .LBB2_54 +; AVX1-NEXT: # %bb.53: +; AVX1-NEXT: movl %eax, %ecx +; AVX1-NEXT: .LBB2_54: +; AVX1-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; AVX1-NEXT: vpextrb $11, %xmm2, %ecx +; AVX1-NEXT: vpextrb $11, %xmm0, %eax +; AVX1-NEXT: addb %cl, %al +; AVX1-NEXT: movb $-1, %cl +; AVX1-NEXT: jb .LBB2_56 +; AVX1-NEXT: # %bb.55: +; AVX1-NEXT: movl %eax, %ecx +; AVX1-NEXT: .LBB2_56: +; AVX1-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; AVX1-NEXT: vpextrb $12, %xmm2, %ecx +; AVX1-NEXT: vpextrb $12, %xmm0, %eax +; AVX1-NEXT: addb %cl, %al +; AVX1-NEXT: movb $-1, %cl +; AVX1-NEXT: jb .LBB2_58 +; AVX1-NEXT: # %bb.57: +; AVX1-NEXT: movl %eax, %ecx +; AVX1-NEXT: .LBB2_58: +; AVX1-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; AVX1-NEXT: vpextrb $13, %xmm2, %ecx +; AVX1-NEXT: vpextrb $13, %xmm0, %eax +; AVX1-NEXT: addb %cl, %al +; AVX1-NEXT: movb $-1, %cl +; AVX1-NEXT: jb .LBB2_60 +; AVX1-NEXT: # %bb.59: +; AVX1-NEXT: movl %eax, %ecx +; AVX1-NEXT: .LBB2_60: +; AVX1-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; AVX1-NEXT: vpextrb $14, %xmm2, %ecx +; AVX1-NEXT: vpextrb $14, %xmm0, %eax +; AVX1-NEXT: addb %cl, %al +; AVX1-NEXT: movb $-1, %cl +; AVX1-NEXT: jb .LBB2_62 +; AVX1-NEXT: # %bb.61: +; AVX1-NEXT: movl %eax, %ecx +; AVX1-NEXT: .LBB2_62: +; AVX1-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; AVX1-NEXT: vpextrb $15, %xmm2, %ecx +; AVX1-NEXT: vpextrb $15, %xmm0, %eax +; AVX1-NEXT: addb %cl, %al +; AVX1-NEXT: movb $-1, %cl +; AVX1-NEXT: jb .LBB2_64 +; AVX1-NEXT: # %bb.63: +; AVX1-NEXT: movl %eax, %ecx +; AVX1-NEXT: .LBB2_64: +; AVX1-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm0 +; AVX1-NEXT: vpextrb $1, %xmm0, %ecx +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX1-NEXT: vpextrb $1, %xmm2, %eax +; AVX1-NEXT: addb %cl, %al +; AVX1-NEXT: movb $-1, %dil +; AVX1-NEXT: jb .LBB2_66 +; AVX1-NEXT: # %bb.65: +; AVX1-NEXT: movl %eax, %edi +; AVX1-NEXT: .LBB2_66: +; AVX1-NEXT: vpextrb $0, %xmm0, %eax +; AVX1-NEXT: vpextrb $0, %xmm2, %ecx +; AVX1-NEXT: addb %al, %cl +; AVX1-NEXT: movb $-1, %al +; AVX1-NEXT: jb .LBB2_68 +; AVX1-NEXT: # %bb.67: +; AVX1-NEXT: movl %ecx, %eax +; AVX1-NEXT: .LBB2_68: +; AVX1-NEXT: vpextrb $2, %xmm0, %esi +; AVX1-NEXT: vpextrb $2, %xmm2, %ecx +; AVX1-NEXT: addb %sil, %cl +; AVX1-NEXT: movb $-1, %r15b +; AVX1-NEXT: jb .LBB2_70 +; AVX1-NEXT: # %bb.69: +; AVX1-NEXT: movl %ecx, %r15d +; AVX1-NEXT: .LBB2_70: +; AVX1-NEXT: vpextrb $3, %xmm0, %esi +; AVX1-NEXT: vpextrb $3, %xmm2, %ecx +; AVX1-NEXT: addb %sil, %cl +; AVX1-NEXT: movb $-1, %sil +; AVX1-NEXT: jb .LBB2_72 +; AVX1-NEXT: # %bb.71: +; AVX1-NEXT: movl %ecx, %esi +; AVX1-NEXT: .LBB2_72: +; AVX1-NEXT: vpextrb $4, %xmm0, %ebp +; AVX1-NEXT: vpextrb $4, %xmm2, %ecx +; AVX1-NEXT: addb %bpl, %cl +; AVX1-NEXT: movb $-1, %r14b +; AVX1-NEXT: jb .LBB2_74 +; 
AVX1-NEXT: # %bb.73: +; AVX1-NEXT: movl %ecx, %r14d +; AVX1-NEXT: .LBB2_74: +; AVX1-NEXT: vpextrb $5, %xmm0, %ebp +; AVX1-NEXT: vpextrb $5, %xmm2, %ecx +; AVX1-NEXT: addb %bpl, %cl +; AVX1-NEXT: movb $-1, %r10b +; AVX1-NEXT: jb .LBB2_76 +; AVX1-NEXT: # %bb.75: +; AVX1-NEXT: movl %ecx, %r10d +; AVX1-NEXT: .LBB2_76: +; AVX1-NEXT: vpextrb $6, %xmm0, %ebp +; AVX1-NEXT: vpextrb $6, %xmm2, %ecx +; AVX1-NEXT: addb %bpl, %cl +; AVX1-NEXT: movb $-1, %r9b +; AVX1-NEXT: jb .LBB2_78 +; AVX1-NEXT: # %bb.77: +; AVX1-NEXT: movl %ecx, %r9d +; AVX1-NEXT: .LBB2_78: +; AVX1-NEXT: vpextrb $7, %xmm0, %ebx +; AVX1-NEXT: vpextrb $7, %xmm2, %ecx +; AVX1-NEXT: addb %bl, %cl +; AVX1-NEXT: movb $-1, %r13b +; AVX1-NEXT: jb .LBB2_80 +; AVX1-NEXT: # %bb.79: +; AVX1-NEXT: movl %ecx, %r13d +; AVX1-NEXT: .LBB2_80: +; AVX1-NEXT: vpextrb $8, %xmm0, %ebx +; AVX1-NEXT: vpextrb $8, %xmm2, %ecx +; AVX1-NEXT: addb %bl, %cl +; AVX1-NEXT: movb $-1, %r12b +; AVX1-NEXT: jb .LBB2_82 +; AVX1-NEXT: # %bb.81: +; AVX1-NEXT: movl %ecx, %r12d +; AVX1-NEXT: .LBB2_82: +; AVX1-NEXT: vpextrb $9, %xmm0, %ebx +; AVX1-NEXT: vpextrb $9, %xmm2, %ecx +; AVX1-NEXT: addb %bl, %cl +; AVX1-NEXT: movb $-1, %dl +; AVX1-NEXT: jb .LBB2_84 +; AVX1-NEXT: # %bb.83: +; AVX1-NEXT: movl %ecx, %edx +; AVX1-NEXT: .LBB2_84: +; AVX1-NEXT: movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; AVX1-NEXT: vpextrb $10, %xmm0, %ebx +; AVX1-NEXT: vpextrb $10, %xmm2, %ecx +; AVX1-NEXT: addb %bl, %cl +; AVX1-NEXT: movb $-1, %dl +; AVX1-NEXT: jb .LBB2_86 +; AVX1-NEXT: # %bb.85: +; AVX1-NEXT: movl %ecx, %edx +; AVX1-NEXT: .LBB2_86: +; AVX1-NEXT: movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; AVX1-NEXT: vpextrb $11, %xmm0, %ebx +; AVX1-NEXT: vpextrb $11, %xmm2, %ecx +; AVX1-NEXT: addb %bl, %cl +; AVX1-NEXT: movb $-1, %dl +; AVX1-NEXT: jb .LBB2_88 +; AVX1-NEXT: # %bb.87: +; AVX1-NEXT: movl %ecx, %edx +; AVX1-NEXT: .LBB2_88: +; AVX1-NEXT: vpextrb $12, %xmm0, %ebx +; AVX1-NEXT: vpextrb $12, %xmm2, %ecx +; AVX1-NEXT: addb %bl, %cl +; AVX1-NEXT: movb $-1, %bl +; AVX1-NEXT: jb .LBB2_90 +; AVX1-NEXT: # %bb.89: +; AVX1-NEXT: movl %ecx, %ebx +; AVX1-NEXT: .LBB2_90: +; AVX1-NEXT: movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; AVX1-NEXT: movb %bl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; AVX1-NEXT: movzbl %r8b, %ecx +; AVX1-NEXT: vpextrb $13, %xmm0, %ebx +; AVX1-NEXT: vpextrb $13, %xmm2, %edx +; AVX1-NEXT: addb %bl, %dl +; AVX1-NEXT: movb $-1, %bl +; AVX1-NEXT: jb .LBB2_92 +; AVX1-NEXT: # %bb.91: +; AVX1-NEXT: movl %edx, %ebx +; AVX1-NEXT: .LBB2_92: +; AVX1-NEXT: movb %bl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; AVX1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %edx # 1-byte Folded Reload +; AVX1-NEXT: movzbl %r11b, %ebp +; AVX1-NEXT: vmovd %ecx, %xmm4 +; AVX1-NEXT: vpextrb $14, %xmm0, %ebx +; AVX1-NEXT: vpextrb $14, %xmm2, %ecx +; AVX1-NEXT: addb %bl, %cl +; AVX1-NEXT: movb $-1, %r8b +; AVX1-NEXT: jb .LBB2_94 +; AVX1-NEXT: # %bb.93: +; AVX1-NEXT: movl %ecx, %r8d +; AVX1-NEXT: .LBB2_94: +; AVX1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload +; AVX1-NEXT: vmovd %edx, %xmm5 +; AVX1-NEXT: vpinsrb $1, %ebp, %xmm4, %xmm4 +; AVX1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %edx # 1-byte Folded Reload +; AVX1-NEXT: movzbl %al, %r11d +; AVX1-NEXT: vpextrb $15, %xmm0, %ebx +; AVX1-NEXT: vpextrb $15, %xmm2, %eax +; AVX1-NEXT: addb %bl, %al +; AVX1-NEXT: movb $-1, %bl +; AVX1-NEXT: jb .LBB2_96 +; AVX1-NEXT: # %bb.95: +; AVX1-NEXT: movl %eax, %ebx +; AVX1-NEXT: .LBB2_96: +; AVX1-NEXT: movb %bl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; AVX1-NEXT: vpinsrb $1, %ecx, %xmm5, 
%xmm0 +; AVX1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload +; AVX1-NEXT: vpinsrb $2, %edx, %xmm4, %xmm2 +; AVX1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %edx # 1-byte Folded Reload +; AVX1-NEXT: movzbl %dil, %eax +; AVX1-NEXT: vmovd %r11d, %xmm4 +; AVX1-NEXT: vpextrb $1, %xmm3, %ebp +; AVX1-NEXT: vpextrb $1, %xmm1, %edi +; AVX1-NEXT: addb %bpl, %dil +; AVX1-NEXT: movb $-1, %bl +; AVX1-NEXT: jb .LBB2_98 +; AVX1-NEXT: # %bb.97: +; AVX1-NEXT: movl %edi, %ebx +; AVX1-NEXT: .LBB2_98: +; AVX1-NEXT: vpinsrb $2, %ecx, %xmm0, %xmm0 +; AVX1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload +; AVX1-NEXT: vpinsrb $3, %edx, %xmm2, %xmm2 +; AVX1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %edx # 1-byte Folded Reload +; AVX1-NEXT: vpinsrb $1, %eax, %xmm4, %xmm4 +; AVX1-NEXT: movzbl %r15b, %ebp +; AVX1-NEXT: movzbl %bl, %edi +; AVX1-NEXT: vpextrb $0, %xmm3, %ebx +; AVX1-NEXT: vpextrb $0, %xmm1, %eax +; AVX1-NEXT: addb %bl, %al +; AVX1-NEXT: movb $-1, %bl +; AVX1-NEXT: jb .LBB2_100 +; AVX1-NEXT: # %bb.99: +; AVX1-NEXT: movl %eax, %ebx +; AVX1-NEXT: .LBB2_100: +; AVX1-NEXT: vpinsrb $3, %ecx, %xmm0, %xmm0 +; AVX1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX1-NEXT: vpinsrb $4, %edx, %xmm2, %xmm2 +; AVX1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload +; AVX1-NEXT: vpinsrb $2, %ebp, %xmm4, %xmm4 +; AVX1-NEXT: movzbl %sil, %esi +; AVX1-NEXT: movzbl %bl, %edx +; AVX1-NEXT: vmovd %edx, %xmm5 +; AVX1-NEXT: vpinsrb $1, %edi, %xmm5, %xmm5 +; AVX1-NEXT: vpextrb $2, %xmm3, %edi +; AVX1-NEXT: vpextrb $2, %xmm1, %edx +; AVX1-NEXT: addb %dil, %dl +; AVX1-NEXT: movb $-1, %bl +; AVX1-NEXT: jb .LBB2_102 +; AVX1-NEXT: # %bb.101: +; AVX1-NEXT: movl %edx, %ebx +; AVX1-NEXT: .LBB2_102: +; AVX1-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 +; AVX1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX1-NEXT: vpinsrb $5, %ecx, %xmm2, %xmm2 +; AVX1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload +; AVX1-NEXT: vpinsrb $3, %esi, %xmm4, %xmm4 +; AVX1-NEXT: movzbl %r14b, %edx +; AVX1-NEXT: movzbl %bl, %esi +; AVX1-NEXT: vpinsrb $2, %esi, %xmm5, %xmm5 +; AVX1-NEXT: vpextrb $3, %xmm3, %edi +; AVX1-NEXT: vpextrb $3, %xmm1, %esi +; AVX1-NEXT: addb %dil, %sil +; AVX1-NEXT: movb $-1, %bl +; AVX1-NEXT: jb .LBB2_104 +; AVX1-NEXT: # %bb.103: +; AVX1-NEXT: movl %esi, %ebx +; AVX1-NEXT: .LBB2_104: +; AVX1-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; AVX1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX1-NEXT: vpinsrb $6, %ecx, %xmm2, %xmm2 +; AVX1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload +; AVX1-NEXT: vpinsrb $4, %edx, %xmm4, %xmm4 +; AVX1-NEXT: movzbl %r10b, %edx +; AVX1-NEXT: movzbl %bl, %esi +; AVX1-NEXT: vpinsrb $3, %esi, %xmm5, %xmm5 +; AVX1-NEXT: vpextrb $4, %xmm3, %edi +; AVX1-NEXT: vpextrb $4, %xmm1, %esi +; AVX1-NEXT: addb %dil, %sil +; AVX1-NEXT: movb $-1, %bl +; AVX1-NEXT: jb .LBB2_106 +; AVX1-NEXT: # %bb.105: +; AVX1-NEXT: movl %esi, %ebx +; AVX1-NEXT: .LBB2_106: +; AVX1-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 +; AVX1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX1-NEXT: vpinsrb $7, %ecx, %xmm2, %xmm2 +; AVX1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload +; AVX1-NEXT: vpinsrb $5, %edx, %xmm4, %xmm4 +; AVX1-NEXT: movzbl %r9b, %edx +; AVX1-NEXT: movzbl %bl, %esi +; AVX1-NEXT: vpinsrb $4, %esi, %xmm5, %xmm5 +; AVX1-NEXT: vpextrb $5, %xmm3, %edi +; AVX1-NEXT: vpextrb $5, %xmm1, %esi +; AVX1-NEXT: addb %dil, %sil +; AVX1-NEXT: movb $-1, %bl 
+; AVX1-NEXT: jb .LBB2_108 +; AVX1-NEXT: # %bb.107: +; AVX1-NEXT: movl %esi, %ebx +; AVX1-NEXT: .LBB2_108: +; AVX1-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; AVX1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX1-NEXT: vpinsrb $8, %ecx, %xmm2, %xmm2 +; AVX1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload +; AVX1-NEXT: vpinsrb $6, %edx, %xmm4, %xmm4 +; AVX1-NEXT: movzbl %r13b, %edx +; AVX1-NEXT: movzbl %bl, %esi +; AVX1-NEXT: vpinsrb $5, %esi, %xmm5, %xmm5 +; AVX1-NEXT: vpextrb $6, %xmm3, %edi +; AVX1-NEXT: vpextrb $6, %xmm1, %esi +; AVX1-NEXT: addb %dil, %sil +; AVX1-NEXT: movb $-1, %bl +; AVX1-NEXT: jb .LBB2_110 +; AVX1-NEXT: # %bb.109: +; AVX1-NEXT: movl %esi, %ebx +; AVX1-NEXT: .LBB2_110: +; AVX1-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0 +; AVX1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX1-NEXT: vpinsrb $9, %ecx, %xmm2, %xmm2 +; AVX1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload +; AVX1-NEXT: vpinsrb $7, %edx, %xmm4, %xmm4 +; AVX1-NEXT: movzbl %r12b, %edx +; AVX1-NEXT: movzbl %bl, %esi +; AVX1-NEXT: vpinsrb $6, %esi, %xmm5, %xmm5 +; AVX1-NEXT: vpextrb $7, %xmm3, %edi +; AVX1-NEXT: vpextrb $7, %xmm1, %esi +; AVX1-NEXT: addb %dil, %sil +; AVX1-NEXT: movb $-1, %bl +; AVX1-NEXT: jb .LBB2_112 +; AVX1-NEXT: # %bb.111: +; AVX1-NEXT: movl %esi, %ebx +; AVX1-NEXT: .LBB2_112: +; AVX1-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0 +; AVX1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX1-NEXT: vpinsrb $10, %ecx, %xmm2, %xmm2 +; AVX1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload +; AVX1-NEXT: vpinsrb $8, %edx, %xmm4, %xmm4 +; AVX1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %edx # 1-byte Folded Reload +; AVX1-NEXT: movzbl %bl, %esi +; AVX1-NEXT: vpinsrb $7, %esi, %xmm5, %xmm5 +; AVX1-NEXT: vpextrb $8, %xmm3, %edi +; AVX1-NEXT: vpextrb $8, %xmm1, %esi +; AVX1-NEXT: addb %dil, %sil +; AVX1-NEXT: movb $-1, %bl +; AVX1-NEXT: jb .LBB2_114 +; AVX1-NEXT: # %bb.113: +; AVX1-NEXT: movl %esi, %ebx +; AVX1-NEXT: .LBB2_114: +; AVX1-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0 +; AVX1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX1-NEXT: vpinsrb $11, %ecx, %xmm2, %xmm2 +; AVX1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload +; AVX1-NEXT: vpinsrb $9, %edx, %xmm4, %xmm4 +; AVX1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %edx # 1-byte Folded Reload +; AVX1-NEXT: movzbl %bl, %esi +; AVX1-NEXT: vpinsrb $8, %esi, %xmm5, %xmm5 +; AVX1-NEXT: vpextrb $9, %xmm3, %edi +; AVX1-NEXT: vpextrb $9, %xmm1, %esi +; AVX1-NEXT: addb %dil, %sil +; AVX1-NEXT: movb $-1, %bl +; AVX1-NEXT: jb .LBB2_116 +; AVX1-NEXT: # %bb.115: +; AVX1-NEXT: movl %esi, %ebx +; AVX1-NEXT: .LBB2_116: +; AVX1-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; AVX1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX1-NEXT: vpinsrb $12, %ecx, %xmm2, %xmm2 +; AVX1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload +; AVX1-NEXT: vpinsrb $10, %edx, %xmm4, %xmm4 +; AVX1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %edx # 1-byte Folded Reload +; AVX1-NEXT: movzbl %bl, %esi +; AVX1-NEXT: vpinsrb $9, %esi, %xmm5, %xmm5 +; AVX1-NEXT: vpextrb $10, %xmm3, %edi +; AVX1-NEXT: vpextrb $10, %xmm1, %esi +; AVX1-NEXT: addb %dil, %sil +; AVX1-NEXT: movb $-1, %bl +; AVX1-NEXT: jb .LBB2_118 +; AVX1-NEXT: # %bb.117: +; AVX1-NEXT: movl %esi, %ebx +; AVX1-NEXT: .LBB2_118: +; AVX1-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0 +; AVX1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX1-NEXT: 
vpinsrb $13, %ecx, %xmm2, %xmm2 +; AVX1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload +; AVX1-NEXT: vpinsrb $11, %edx, %xmm4, %xmm4 +; AVX1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %edx # 1-byte Folded Reload +; AVX1-NEXT: movzbl %bl, %esi +; AVX1-NEXT: vpinsrb $10, %esi, %xmm5, %xmm5 +; AVX1-NEXT: vpextrb $11, %xmm3, %edi +; AVX1-NEXT: vpextrb $11, %xmm1, %esi +; AVX1-NEXT: addb %dil, %sil +; AVX1-NEXT: movb $-1, %bl +; AVX1-NEXT: jb .LBB2_120 +; AVX1-NEXT: # %bb.119: +; AVX1-NEXT: movl %esi, %ebx +; AVX1-NEXT: .LBB2_120: +; AVX1-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0 +; AVX1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX1-NEXT: vpinsrb $14, %ecx, %xmm2, %xmm2 +; AVX1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload +; AVX1-NEXT: vpinsrb $12, %edx, %xmm4, %xmm6 +; AVX1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %edx # 1-byte Folded Reload +; AVX1-NEXT: movzbl %bl, %esi +; AVX1-NEXT: vpinsrb $11, %esi, %xmm5, %xmm5 +; AVX1-NEXT: vpextrb $12, %xmm3, %edi +; AVX1-NEXT: vpextrb $12, %xmm1, %esi +; AVX1-NEXT: addb %dil, %sil +; AVX1-NEXT: movb $-1, %bl +; AVX1-NEXT: jb .LBB2_122 +; AVX1-NEXT: # %bb.121: +; AVX1-NEXT: movl %esi, %ebx +; AVX1-NEXT: .LBB2_122: +; AVX1-NEXT: vpinsrb $14, %eax, %xmm0, %xmm4 +; AVX1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX1-NEXT: vpinsrb $15, %ecx, %xmm2, %xmm0 +; AVX1-NEXT: vpinsrb $13, %edx, %xmm6, %xmm6 +; AVX1-NEXT: movzbl %r8b, %ecx +; AVX1-NEXT: movzbl %bl, %edx +; AVX1-NEXT: vpinsrb $12, %edx, %xmm5, %xmm5 +; AVX1-NEXT: vpextrb $13, %xmm3, %edx +; AVX1-NEXT: vpextrb $13, %xmm1, %esi +; AVX1-NEXT: addb %dl, %sil +; AVX1-NEXT: movb $-1, %dl +; AVX1-NEXT: jb .LBB2_124 +; AVX1-NEXT: # %bb.123: +; AVX1-NEXT: movl %esi, %edx +; AVX1-NEXT: .LBB2_124: +; AVX1-NEXT: vpinsrb $15, %eax, %xmm4, %xmm2 +; AVX1-NEXT: vpinsrb $14, %ecx, %xmm6, %xmm4 +; AVX1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX1-NEXT: movzbl %dl, %ecx +; AVX1-NEXT: vpinsrb $13, %ecx, %xmm5, %xmm5 +; AVX1-NEXT: vpextrb $14, %xmm3, %edx +; AVX1-NEXT: vpextrb $14, %xmm1, %ecx +; AVX1-NEXT: addb %dl, %cl +; AVX1-NEXT: movb $-1, %dl +; AVX1-NEXT: popq %rbx +; AVX1-NEXT: popq %r12 +; AVX1-NEXT: popq %r13 +; AVX1-NEXT: popq %r14 +; AVX1-NEXT: popq %r15 +; AVX1-NEXT: popq %rbp +; AVX1-NEXT: jb .LBB2_126 +; AVX1-NEXT: # %bb.125: +; AVX1-NEXT: movl %ecx, %edx +; AVX1-NEXT: .LBB2_126: +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX1-NEXT: vpinsrb $15, %eax, %xmm4, %xmm2 +; AVX1-NEXT: movzbl %dl, %eax +; AVX1-NEXT: vpinsrb $14, %eax, %xmm5, %xmm4 +; AVX1-NEXT: vpextrb $15, %xmm3, %ecx +; AVX1-NEXT: vpextrb $15, %xmm1, %eax +; AVX1-NEXT: addb %cl, %al +; AVX1-NEXT: movb $-1, %cl +; AVX1-NEXT: jb .LBB2_128 +; AVX1-NEXT: # %bb.127: +; AVX1-NEXT: movl %eax, %ecx +; AVX1-NEXT: .LBB2_128: +; AVX1-NEXT: movzbl %cl, %eax +; AVX1-NEXT: vpinsrb $15, %eax, %xmm4, %xmm1 +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 +; AVX1-NEXT: retq +; +; AVX2-LABEL: v64i8: +; AVX2: # %bb.0: +; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm4 +; AVX2-NEXT: vpextrb $1, %xmm4, %ecx +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm5 +; AVX2-NEXT: vpextrb $1, %xmm5, %eax +; AVX2-NEXT: addb %cl, %al +; AVX2-NEXT: movb $-1, %cl +; AVX2-NEXT: jb .LBB2_2 +; AVX2-NEXT: # %bb.1: +; AVX2-NEXT: movl %eax, %ecx +; AVX2-NEXT: .LBB2_2: +; AVX2-NEXT: pushq %rbp +; AVX2-NEXT: pushq %r15 +; AVX2-NEXT: pushq %r14 +; AVX2-NEXT: pushq %r13 +; AVX2-NEXT: pushq %r12 +; AVX2-NEXT: pushq %rbx +; AVX2-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 
1-byte Spill +; AVX2-NEXT: vpextrb $0, %xmm4, %ecx +; AVX2-NEXT: vpextrb $0, %xmm5, %eax +; AVX2-NEXT: addb %cl, %al +; AVX2-NEXT: movb $-1, %cl +; AVX2-NEXT: jb .LBB2_4 +; AVX2-NEXT: # %bb.3: +; AVX2-NEXT: movl %eax, %ecx +; AVX2-NEXT: .LBB2_4: +; AVX2-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; AVX2-NEXT: vpextrb $2, %xmm4, %ecx +; AVX2-NEXT: vpextrb $2, %xmm5, %eax +; AVX2-NEXT: addb %cl, %al +; AVX2-NEXT: movb $-1, %cl +; AVX2-NEXT: jb .LBB2_6 +; AVX2-NEXT: # %bb.5: +; AVX2-NEXT: movl %eax, %ecx +; AVX2-NEXT: .LBB2_6: +; AVX2-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; AVX2-NEXT: vpextrb $3, %xmm4, %ecx +; AVX2-NEXT: vpextrb $3, %xmm5, %eax +; AVX2-NEXT: addb %cl, %al +; AVX2-NEXT: movb $-1, %cl +; AVX2-NEXT: jb .LBB2_8 +; AVX2-NEXT: # %bb.7: +; AVX2-NEXT: movl %eax, %ecx +; AVX2-NEXT: .LBB2_8: +; AVX2-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; AVX2-NEXT: vpextrb $4, %xmm4, %ecx +; AVX2-NEXT: vpextrb $4, %xmm5, %eax +; AVX2-NEXT: addb %cl, %al +; AVX2-NEXT: movb $-1, %cl +; AVX2-NEXT: jb .LBB2_10 +; AVX2-NEXT: # %bb.9: +; AVX2-NEXT: movl %eax, %ecx +; AVX2-NEXT: .LBB2_10: +; AVX2-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; AVX2-NEXT: vpextrb $5, %xmm4, %ecx +; AVX2-NEXT: vpextrb $5, %xmm5, %eax +; AVX2-NEXT: addb %cl, %al +; AVX2-NEXT: movb $-1, %cl +; AVX2-NEXT: jb .LBB2_12 +; AVX2-NEXT: # %bb.11: +; AVX2-NEXT: movl %eax, %ecx +; AVX2-NEXT: .LBB2_12: +; AVX2-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; AVX2-NEXT: vpextrb $6, %xmm4, %ecx +; AVX2-NEXT: vpextrb $6, %xmm5, %eax +; AVX2-NEXT: addb %cl, %al +; AVX2-NEXT: movb $-1, %cl +; AVX2-NEXT: jb .LBB2_14 +; AVX2-NEXT: # %bb.13: +; AVX2-NEXT: movl %eax, %ecx +; AVX2-NEXT: .LBB2_14: +; AVX2-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; AVX2-NEXT: vpextrb $7, %xmm4, %ecx +; AVX2-NEXT: vpextrb $7, %xmm5, %eax +; AVX2-NEXT: addb %cl, %al +; AVX2-NEXT: movb $-1, %cl +; AVX2-NEXT: jb .LBB2_16 +; AVX2-NEXT: # %bb.15: +; AVX2-NEXT: movl %eax, %ecx +; AVX2-NEXT: .LBB2_16: +; AVX2-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; AVX2-NEXT: vpextrb $8, %xmm4, %ecx +; AVX2-NEXT: vpextrb $8, %xmm5, %eax +; AVX2-NEXT: addb %cl, %al +; AVX2-NEXT: movb $-1, %cl +; AVX2-NEXT: jb .LBB2_18 +; AVX2-NEXT: # %bb.17: +; AVX2-NEXT: movl %eax, %ecx +; AVX2-NEXT: .LBB2_18: +; AVX2-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; AVX2-NEXT: vpextrb $9, %xmm4, %ecx +; AVX2-NEXT: vpextrb $9, %xmm5, %eax +; AVX2-NEXT: addb %cl, %al +; AVX2-NEXT: movb $-1, %cl +; AVX2-NEXT: jb .LBB2_20 +; AVX2-NEXT: # %bb.19: +; AVX2-NEXT: movl %eax, %ecx +; AVX2-NEXT: .LBB2_20: +; AVX2-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; AVX2-NEXT: vpextrb $10, %xmm4, %ecx +; AVX2-NEXT: vpextrb $10, %xmm5, %eax +; AVX2-NEXT: addb %cl, %al +; AVX2-NEXT: movb $-1, %cl +; AVX2-NEXT: jb .LBB2_22 +; AVX2-NEXT: # %bb.21: +; AVX2-NEXT: movl %eax, %ecx +; AVX2-NEXT: .LBB2_22: +; AVX2-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; AVX2-NEXT: vpextrb $11, %xmm4, %ecx +; AVX2-NEXT: vpextrb $11, %xmm5, %eax +; AVX2-NEXT: addb %cl, %al +; AVX2-NEXT: movb $-1, %cl +; AVX2-NEXT: jb .LBB2_24 +; AVX2-NEXT: # %bb.23: +; AVX2-NEXT: movl %eax, %ecx +; AVX2-NEXT: .LBB2_24: +; AVX2-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; AVX2-NEXT: vpextrb $12, %xmm4, %ecx +; AVX2-NEXT: vpextrb $12, %xmm5, %eax +; AVX2-NEXT: addb %cl, %al +; AVX2-NEXT: movb $-1, %cl +; AVX2-NEXT: jb .LBB2_26 +; AVX2-NEXT: # %bb.25: +; AVX2-NEXT: movl %eax, %ecx +; AVX2-NEXT: .LBB2_26: +; 
AVX2-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; AVX2-NEXT: vpextrb $13, %xmm4, %ecx +; AVX2-NEXT: vpextrb $13, %xmm5, %eax +; AVX2-NEXT: addb %cl, %al +; AVX2-NEXT: movb $-1, %cl +; AVX2-NEXT: jb .LBB2_28 +; AVX2-NEXT: # %bb.27: +; AVX2-NEXT: movl %eax, %ecx +; AVX2-NEXT: .LBB2_28: +; AVX2-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; AVX2-NEXT: vpextrb $14, %xmm4, %ecx +; AVX2-NEXT: vpextrb $14, %xmm5, %eax +; AVX2-NEXT: addb %cl, %al +; AVX2-NEXT: movb $-1, %cl +; AVX2-NEXT: jb .LBB2_30 +; AVX2-NEXT: # %bb.29: +; AVX2-NEXT: movl %eax, %ecx +; AVX2-NEXT: .LBB2_30: +; AVX2-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; AVX2-NEXT: vpextrb $15, %xmm4, %ecx +; AVX2-NEXT: vpextrb $15, %xmm5, %eax +; AVX2-NEXT: addb %cl, %al +; AVX2-NEXT: movb $-1, %cl +; AVX2-NEXT: jb .LBB2_32 +; AVX2-NEXT: # %bb.31: +; AVX2-NEXT: movl %eax, %ecx +; AVX2-NEXT: .LBB2_32: +; AVX2-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; AVX2-NEXT: vpextrb $1, %xmm2, %ecx +; AVX2-NEXT: vpextrb $1, %xmm0, %eax +; AVX2-NEXT: addb %cl, %al +; AVX2-NEXT: movb $-1, %r11b +; AVX2-NEXT: jb .LBB2_34 +; AVX2-NEXT: # %bb.33: +; AVX2-NEXT: movl %eax, %r11d +; AVX2-NEXT: .LBB2_34: +; AVX2-NEXT: vpextrb $0, %xmm2, %ecx +; AVX2-NEXT: vpextrb $0, %xmm0, %eax +; AVX2-NEXT: addb %cl, %al +; AVX2-NEXT: movb $-1, %r8b +; AVX2-NEXT: jb .LBB2_36 +; AVX2-NEXT: # %bb.35: +; AVX2-NEXT: movl %eax, %r8d +; AVX2-NEXT: .LBB2_36: +; AVX2-NEXT: vpextrb $2, %xmm2, %ecx +; AVX2-NEXT: vpextrb $2, %xmm0, %eax +; AVX2-NEXT: addb %cl, %al +; AVX2-NEXT: movb $-1, %cl +; AVX2-NEXT: jb .LBB2_38 +; AVX2-NEXT: # %bb.37: +; AVX2-NEXT: movl %eax, %ecx +; AVX2-NEXT: .LBB2_38: +; AVX2-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; AVX2-NEXT: vpextrb $3, %xmm2, %ecx +; AVX2-NEXT: vpextrb $3, %xmm0, %eax +; AVX2-NEXT: addb %cl, %al +; AVX2-NEXT: movb $-1, %cl +; AVX2-NEXT: jb .LBB2_40 +; AVX2-NEXT: # %bb.39: +; AVX2-NEXT: movl %eax, %ecx +; AVX2-NEXT: .LBB2_40: +; AVX2-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; AVX2-NEXT: vpextrb $4, %xmm2, %ecx +; AVX2-NEXT: vpextrb $4, %xmm0, %eax +; AVX2-NEXT: addb %cl, %al +; AVX2-NEXT: movb $-1, %cl +; AVX2-NEXT: jb .LBB2_42 +; AVX2-NEXT: # %bb.41: +; AVX2-NEXT: movl %eax, %ecx +; AVX2-NEXT: .LBB2_42: +; AVX2-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; AVX2-NEXT: vpextrb $5, %xmm2, %ecx +; AVX2-NEXT: vpextrb $5, %xmm0, %eax +; AVX2-NEXT: addb %cl, %al +; AVX2-NEXT: movb $-1, %cl +; AVX2-NEXT: jb .LBB2_44 +; AVX2-NEXT: # %bb.43: +; AVX2-NEXT: movl %eax, %ecx +; AVX2-NEXT: .LBB2_44: +; AVX2-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; AVX2-NEXT: vpextrb $6, %xmm2, %ecx +; AVX2-NEXT: vpextrb $6, %xmm0, %eax +; AVX2-NEXT: addb %cl, %al +; AVX2-NEXT: movb $-1, %cl +; AVX2-NEXT: jb .LBB2_46 +; AVX2-NEXT: # %bb.45: +; AVX2-NEXT: movl %eax, %ecx +; AVX2-NEXT: .LBB2_46: +; AVX2-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; AVX2-NEXT: vpextrb $7, %xmm2, %ecx +; AVX2-NEXT: vpextrb $7, %xmm0, %eax +; AVX2-NEXT: addb %cl, %al +; AVX2-NEXT: movb $-1, %cl +; AVX2-NEXT: jb .LBB2_48 +; AVX2-NEXT: # %bb.47: +; AVX2-NEXT: movl %eax, %ecx +; AVX2-NEXT: .LBB2_48: +; AVX2-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; AVX2-NEXT: vpextrb $8, %xmm2, %ecx +; AVX2-NEXT: vpextrb $8, %xmm0, %eax +; AVX2-NEXT: addb %cl, %al +; AVX2-NEXT: movb $-1, %cl +; AVX2-NEXT: jb .LBB2_50 +; AVX2-NEXT: # %bb.49: +; AVX2-NEXT: movl %eax, %ecx +; AVX2-NEXT: .LBB2_50: +; AVX2-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; 
AVX2-NEXT: vpextrb $9, %xmm2, %ecx +; AVX2-NEXT: vpextrb $9, %xmm0, %eax +; AVX2-NEXT: addb %cl, %al +; AVX2-NEXT: movb $-1, %cl +; AVX2-NEXT: jb .LBB2_52 +; AVX2-NEXT: # %bb.51: +; AVX2-NEXT: movl %eax, %ecx +; AVX2-NEXT: .LBB2_52: +; AVX2-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; AVX2-NEXT: vpextrb $10, %xmm2, %ecx +; AVX2-NEXT: vpextrb $10, %xmm0, %eax +; AVX2-NEXT: addb %cl, %al +; AVX2-NEXT: movb $-1, %cl +; AVX2-NEXT: jb .LBB2_54 +; AVX2-NEXT: # %bb.53: +; AVX2-NEXT: movl %eax, %ecx +; AVX2-NEXT: .LBB2_54: +; AVX2-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; AVX2-NEXT: vpextrb $11, %xmm2, %ecx +; AVX2-NEXT: vpextrb $11, %xmm0, %eax +; AVX2-NEXT: addb %cl, %al +; AVX2-NEXT: movb $-1, %cl +; AVX2-NEXT: jb .LBB2_56 +; AVX2-NEXT: # %bb.55: +; AVX2-NEXT: movl %eax, %ecx +; AVX2-NEXT: .LBB2_56: +; AVX2-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; AVX2-NEXT: vpextrb $12, %xmm2, %ecx +; AVX2-NEXT: vpextrb $12, %xmm0, %eax +; AVX2-NEXT: addb %cl, %al +; AVX2-NEXT: movb $-1, %cl +; AVX2-NEXT: jb .LBB2_58 +; AVX2-NEXT: # %bb.57: +; AVX2-NEXT: movl %eax, %ecx +; AVX2-NEXT: .LBB2_58: +; AVX2-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; AVX2-NEXT: vpextrb $13, %xmm2, %ecx +; AVX2-NEXT: vpextrb $13, %xmm0, %eax +; AVX2-NEXT: addb %cl, %al +; AVX2-NEXT: movb $-1, %cl +; AVX2-NEXT: jb .LBB2_60 +; AVX2-NEXT: # %bb.59: +; AVX2-NEXT: movl %eax, %ecx +; AVX2-NEXT: .LBB2_60: +; AVX2-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; AVX2-NEXT: vpextrb $14, %xmm2, %ecx +; AVX2-NEXT: vpextrb $14, %xmm0, %eax +; AVX2-NEXT: addb %cl, %al +; AVX2-NEXT: movb $-1, %cl +; AVX2-NEXT: jb .LBB2_62 +; AVX2-NEXT: # %bb.61: +; AVX2-NEXT: movl %eax, %ecx +; AVX2-NEXT: .LBB2_62: +; AVX2-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; AVX2-NEXT: vpextrb $15, %xmm2, %ecx +; AVX2-NEXT: vpextrb $15, %xmm0, %eax +; AVX2-NEXT: addb %cl, %al +; AVX2-NEXT: movb $-1, %cl +; AVX2-NEXT: jb .LBB2_64 +; AVX2-NEXT: # %bb.63: +; AVX2-NEXT: movl %eax, %ecx +; AVX2-NEXT: .LBB2_64: +; AVX2-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm0 +; AVX2-NEXT: vpextrb $1, %xmm0, %ecx +; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX2-NEXT: vpextrb $1, %xmm2, %eax +; AVX2-NEXT: addb %cl, %al +; AVX2-NEXT: movb $-1, %dil +; AVX2-NEXT: jb .LBB2_66 +; AVX2-NEXT: # %bb.65: +; AVX2-NEXT: movl %eax, %edi +; AVX2-NEXT: .LBB2_66: +; AVX2-NEXT: vpextrb $0, %xmm0, %eax +; AVX2-NEXT: vpextrb $0, %xmm2, %ecx +; AVX2-NEXT: addb %al, %cl +; AVX2-NEXT: movb $-1, %al +; AVX2-NEXT: jb .LBB2_68 +; AVX2-NEXT: # %bb.67: +; AVX2-NEXT: movl %ecx, %eax +; AVX2-NEXT: .LBB2_68: +; AVX2-NEXT: vpextrb $2, %xmm0, %esi +; AVX2-NEXT: vpextrb $2, %xmm2, %ecx +; AVX2-NEXT: addb %sil, %cl +; AVX2-NEXT: movb $-1, %r15b +; AVX2-NEXT: jb .LBB2_70 +; AVX2-NEXT: # %bb.69: +; AVX2-NEXT: movl %ecx, %r15d +; AVX2-NEXT: .LBB2_70: +; AVX2-NEXT: vpextrb $3, %xmm0, %esi +; AVX2-NEXT: vpextrb $3, %xmm2, %ecx +; AVX2-NEXT: addb %sil, %cl +; AVX2-NEXT: movb $-1, %sil +; AVX2-NEXT: jb .LBB2_72 +; AVX2-NEXT: # %bb.71: +; AVX2-NEXT: movl %ecx, %esi +; AVX2-NEXT: .LBB2_72: +; AVX2-NEXT: vpextrb $4, %xmm0, %ebp +; AVX2-NEXT: vpextrb $4, %xmm2, %ecx +; AVX2-NEXT: addb %bpl, %cl +; AVX2-NEXT: movb $-1, %r14b +; AVX2-NEXT: jb .LBB2_74 +; AVX2-NEXT: # %bb.73: +; AVX2-NEXT: movl %ecx, %r14d +; AVX2-NEXT: .LBB2_74: +; AVX2-NEXT: vpextrb $5, %xmm0, %ebp +; AVX2-NEXT: vpextrb $5, %xmm2, %ecx +; AVX2-NEXT: addb %bpl, %cl +; AVX2-NEXT: movb $-1, %r10b +; AVX2-NEXT: jb .LBB2_76 +; 
AVX2-NEXT: # %bb.75: +; AVX2-NEXT: movl %ecx, %r10d +; AVX2-NEXT: .LBB2_76: +; AVX2-NEXT: vpextrb $6, %xmm0, %ebp +; AVX2-NEXT: vpextrb $6, %xmm2, %ecx +; AVX2-NEXT: addb %bpl, %cl +; AVX2-NEXT: movb $-1, %r9b +; AVX2-NEXT: jb .LBB2_78 +; AVX2-NEXT: # %bb.77: +; AVX2-NEXT: movl %ecx, %r9d +; AVX2-NEXT: .LBB2_78: +; AVX2-NEXT: vpextrb $7, %xmm0, %ebx +; AVX2-NEXT: vpextrb $7, %xmm2, %ecx +; AVX2-NEXT: addb %bl, %cl +; AVX2-NEXT: movb $-1, %r13b +; AVX2-NEXT: jb .LBB2_80 +; AVX2-NEXT: # %bb.79: +; AVX2-NEXT: movl %ecx, %r13d +; AVX2-NEXT: .LBB2_80: +; AVX2-NEXT: vpextrb $8, %xmm0, %ebx +; AVX2-NEXT: vpextrb $8, %xmm2, %ecx +; AVX2-NEXT: addb %bl, %cl +; AVX2-NEXT: movb $-1, %r12b +; AVX2-NEXT: jb .LBB2_82 +; AVX2-NEXT: # %bb.81: +; AVX2-NEXT: movl %ecx, %r12d +; AVX2-NEXT: .LBB2_82: +; AVX2-NEXT: vpextrb $9, %xmm0, %ebx +; AVX2-NEXT: vpextrb $9, %xmm2, %ecx +; AVX2-NEXT: addb %bl, %cl +; AVX2-NEXT: movb $-1, %dl +; AVX2-NEXT: jb .LBB2_84 +; AVX2-NEXT: # %bb.83: +; AVX2-NEXT: movl %ecx, %edx +; AVX2-NEXT: .LBB2_84: +; AVX2-NEXT: movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; AVX2-NEXT: vpextrb $10, %xmm0, %ebx +; AVX2-NEXT: vpextrb $10, %xmm2, %ecx +; AVX2-NEXT: addb %bl, %cl +; AVX2-NEXT: movb $-1, %dl +; AVX2-NEXT: jb .LBB2_86 +; AVX2-NEXT: # %bb.85: +; AVX2-NEXT: movl %ecx, %edx +; AVX2-NEXT: .LBB2_86: +; AVX2-NEXT: movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; AVX2-NEXT: vpextrb $11, %xmm0, %ebx +; AVX2-NEXT: vpextrb $11, %xmm2, %ecx +; AVX2-NEXT: addb %bl, %cl +; AVX2-NEXT: movb $-1, %dl +; AVX2-NEXT: jb .LBB2_88 +; AVX2-NEXT: # %bb.87: +; AVX2-NEXT: movl %ecx, %edx +; AVX2-NEXT: .LBB2_88: +; AVX2-NEXT: vpextrb $12, %xmm0, %ebx +; AVX2-NEXT: vpextrb $12, %xmm2, %ecx +; AVX2-NEXT: addb %bl, %cl +; AVX2-NEXT: movb $-1, %bl +; AVX2-NEXT: jb .LBB2_90 +; AVX2-NEXT: # %bb.89: +; AVX2-NEXT: movl %ecx, %ebx +; AVX2-NEXT: .LBB2_90: +; AVX2-NEXT: movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; AVX2-NEXT: movb %bl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; AVX2-NEXT: movzbl %r8b, %ecx +; AVX2-NEXT: vpextrb $13, %xmm0, %ebx +; AVX2-NEXT: vpextrb $13, %xmm2, %edx +; AVX2-NEXT: addb %bl, %dl +; AVX2-NEXT: movb $-1, %bl +; AVX2-NEXT: jb .LBB2_92 +; AVX2-NEXT: # %bb.91: +; AVX2-NEXT: movl %edx, %ebx +; AVX2-NEXT: .LBB2_92: +; AVX2-NEXT: movb %bl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; AVX2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %edx # 1-byte Folded Reload +; AVX2-NEXT: movzbl %r11b, %ebp +; AVX2-NEXT: vmovd %ecx, %xmm4 +; AVX2-NEXT: vpextrb $14, %xmm0, %ebx +; AVX2-NEXT: vpextrb $14, %xmm2, %ecx +; AVX2-NEXT: addb %bl, %cl +; AVX2-NEXT: movb $-1, %r8b +; AVX2-NEXT: jb .LBB2_94 +; AVX2-NEXT: # %bb.93: +; AVX2-NEXT: movl %ecx, %r8d +; AVX2-NEXT: .LBB2_94: +; AVX2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload +; AVX2-NEXT: vmovd %edx, %xmm5 +; AVX2-NEXT: vpinsrb $1, %ebp, %xmm4, %xmm4 +; AVX2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %edx # 1-byte Folded Reload +; AVX2-NEXT: movzbl %al, %r11d +; AVX2-NEXT: vpextrb $15, %xmm0, %ebx +; AVX2-NEXT: vpextrb $15, %xmm2, %eax +; AVX2-NEXT: addb %bl, %al +; AVX2-NEXT: movb $-1, %bl +; AVX2-NEXT: jb .LBB2_96 +; AVX2-NEXT: # %bb.95: +; AVX2-NEXT: movl %eax, %ebx +; AVX2-NEXT: .LBB2_96: +; AVX2-NEXT: movb %bl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; AVX2-NEXT: vpinsrb $1, %ecx, %xmm5, %xmm0 +; AVX2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload +; AVX2-NEXT: vpinsrb $2, %edx, %xmm4, %xmm2 +; AVX2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %edx # 1-byte Folded Reload +; AVX2-NEXT: movzbl %dil, %eax +; 
AVX2-NEXT: vmovd %r11d, %xmm4 +; AVX2-NEXT: vpextrb $1, %xmm3, %ebp +; AVX2-NEXT: vpextrb $1, %xmm1, %edi +; AVX2-NEXT: addb %bpl, %dil +; AVX2-NEXT: movb $-1, %bl +; AVX2-NEXT: jb .LBB2_98 +; AVX2-NEXT: # %bb.97: +; AVX2-NEXT: movl %edi, %ebx +; AVX2-NEXT: .LBB2_98: +; AVX2-NEXT: vpinsrb $2, %ecx, %xmm0, %xmm0 +; AVX2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload +; AVX2-NEXT: vpinsrb $3, %edx, %xmm2, %xmm2 +; AVX2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %edx # 1-byte Folded Reload +; AVX2-NEXT: vpinsrb $1, %eax, %xmm4, %xmm4 +; AVX2-NEXT: movzbl %r15b, %ebp +; AVX2-NEXT: movzbl %bl, %edi +; AVX2-NEXT: vpextrb $0, %xmm3, %ebx +; AVX2-NEXT: vpextrb $0, %xmm1, %eax +; AVX2-NEXT: addb %bl, %al +; AVX2-NEXT: movb $-1, %bl +; AVX2-NEXT: jb .LBB2_100 +; AVX2-NEXT: # %bb.99: +; AVX2-NEXT: movl %eax, %ebx +; AVX2-NEXT: .LBB2_100: +; AVX2-NEXT: vpinsrb $3, %ecx, %xmm0, %xmm0 +; AVX2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX2-NEXT: vpinsrb $4, %edx, %xmm2, %xmm2 +; AVX2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload +; AVX2-NEXT: vpinsrb $2, %ebp, %xmm4, %xmm4 +; AVX2-NEXT: movzbl %sil, %esi +; AVX2-NEXT: movzbl %bl, %edx +; AVX2-NEXT: vmovd %edx, %xmm5 +; AVX2-NEXT: vpinsrb $1, %edi, %xmm5, %xmm5 +; AVX2-NEXT: vpextrb $2, %xmm3, %edi +; AVX2-NEXT: vpextrb $2, %xmm1, %edx +; AVX2-NEXT: addb %dil, %dl +; AVX2-NEXT: movb $-1, %bl +; AVX2-NEXT: jb .LBB2_102 +; AVX2-NEXT: # %bb.101: +; AVX2-NEXT: movl %edx, %ebx +; AVX2-NEXT: .LBB2_102: +; AVX2-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 +; AVX2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX2-NEXT: vpinsrb $5, %ecx, %xmm2, %xmm2 +; AVX2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload +; AVX2-NEXT: vpinsrb $3, %esi, %xmm4, %xmm4 +; AVX2-NEXT: movzbl %r14b, %edx +; AVX2-NEXT: movzbl %bl, %esi +; AVX2-NEXT: vpinsrb $2, %esi, %xmm5, %xmm5 +; AVX2-NEXT: vpextrb $3, %xmm3, %edi +; AVX2-NEXT: vpextrb $3, %xmm1, %esi +; AVX2-NEXT: addb %dil, %sil +; AVX2-NEXT: movb $-1, %bl +; AVX2-NEXT: jb .LBB2_104 +; AVX2-NEXT: # %bb.103: +; AVX2-NEXT: movl %esi, %ebx +; AVX2-NEXT: .LBB2_104: +; AVX2-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; AVX2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX2-NEXT: vpinsrb $6, %ecx, %xmm2, %xmm2 +; AVX2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload +; AVX2-NEXT: vpinsrb $4, %edx, %xmm4, %xmm4 +; AVX2-NEXT: movzbl %r10b, %edx +; AVX2-NEXT: movzbl %bl, %esi +; AVX2-NEXT: vpinsrb $3, %esi, %xmm5, %xmm5 +; AVX2-NEXT: vpextrb $4, %xmm3, %edi +; AVX2-NEXT: vpextrb $4, %xmm1, %esi +; AVX2-NEXT: addb %dil, %sil +; AVX2-NEXT: movb $-1, %bl +; AVX2-NEXT: jb .LBB2_106 +; AVX2-NEXT: # %bb.105: +; AVX2-NEXT: movl %esi, %ebx +; AVX2-NEXT: .LBB2_106: +; AVX2-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 +; AVX2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX2-NEXT: vpinsrb $7, %ecx, %xmm2, %xmm2 +; AVX2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload +; AVX2-NEXT: vpinsrb $5, %edx, %xmm4, %xmm4 +; AVX2-NEXT: movzbl %r9b, %edx +; AVX2-NEXT: movzbl %bl, %esi +; AVX2-NEXT: vpinsrb $4, %esi, %xmm5, %xmm5 +; AVX2-NEXT: vpextrb $5, %xmm3, %edi +; AVX2-NEXT: vpextrb $5, %xmm1, %esi +; AVX2-NEXT: addb %dil, %sil +; AVX2-NEXT: movb $-1, %bl +; AVX2-NEXT: jb .LBB2_108 +; AVX2-NEXT: # %bb.107: +; AVX2-NEXT: movl %esi, %ebx +; AVX2-NEXT: .LBB2_108: +; AVX2-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; AVX2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; 
AVX2-NEXT: vpinsrb $8, %ecx, %xmm2, %xmm2 +; AVX2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload +; AVX2-NEXT: vpinsrb $6, %edx, %xmm4, %xmm4 +; AVX2-NEXT: movzbl %r13b, %edx +; AVX2-NEXT: movzbl %bl, %esi +; AVX2-NEXT: vpinsrb $5, %esi, %xmm5, %xmm5 +; AVX2-NEXT: vpextrb $6, %xmm3, %edi +; AVX2-NEXT: vpextrb $6, %xmm1, %esi +; AVX2-NEXT: addb %dil, %sil +; AVX2-NEXT: movb $-1, %bl +; AVX2-NEXT: jb .LBB2_110 +; AVX2-NEXT: # %bb.109: +; AVX2-NEXT: movl %esi, %ebx +; AVX2-NEXT: .LBB2_110: +; AVX2-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0 +; AVX2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX2-NEXT: vpinsrb $9, %ecx, %xmm2, %xmm2 +; AVX2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload +; AVX2-NEXT: vpinsrb $7, %edx, %xmm4, %xmm4 +; AVX2-NEXT: movzbl %r12b, %edx +; AVX2-NEXT: movzbl %bl, %esi +; AVX2-NEXT: vpinsrb $6, %esi, %xmm5, %xmm5 +; AVX2-NEXT: vpextrb $7, %xmm3, %edi +; AVX2-NEXT: vpextrb $7, %xmm1, %esi +; AVX2-NEXT: addb %dil, %sil +; AVX2-NEXT: movb $-1, %bl +; AVX2-NEXT: jb .LBB2_112 +; AVX2-NEXT: # %bb.111: +; AVX2-NEXT: movl %esi, %ebx +; AVX2-NEXT: .LBB2_112: +; AVX2-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0 +; AVX2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX2-NEXT: vpinsrb $10, %ecx, %xmm2, %xmm2 +; AVX2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload +; AVX2-NEXT: vpinsrb $8, %edx, %xmm4, %xmm4 +; AVX2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %edx # 1-byte Folded Reload +; AVX2-NEXT: movzbl %bl, %esi +; AVX2-NEXT: vpinsrb $7, %esi, %xmm5, %xmm5 +; AVX2-NEXT: vpextrb $8, %xmm3, %edi +; AVX2-NEXT: vpextrb $8, %xmm1, %esi +; AVX2-NEXT: addb %dil, %sil +; AVX2-NEXT: movb $-1, %bl +; AVX2-NEXT: jb .LBB2_114 +; AVX2-NEXT: # %bb.113: +; AVX2-NEXT: movl %esi, %ebx +; AVX2-NEXT: .LBB2_114: +; AVX2-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0 +; AVX2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX2-NEXT: vpinsrb $11, %ecx, %xmm2, %xmm2 +; AVX2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload +; AVX2-NEXT: vpinsrb $9, %edx, %xmm4, %xmm4 +; AVX2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %edx # 1-byte Folded Reload +; AVX2-NEXT: movzbl %bl, %esi +; AVX2-NEXT: vpinsrb $8, %esi, %xmm5, %xmm5 +; AVX2-NEXT: vpextrb $9, %xmm3, %edi +; AVX2-NEXT: vpextrb $9, %xmm1, %esi +; AVX2-NEXT: addb %dil, %sil +; AVX2-NEXT: movb $-1, %bl +; AVX2-NEXT: jb .LBB2_116 +; AVX2-NEXT: # %bb.115: +; AVX2-NEXT: movl %esi, %ebx +; AVX2-NEXT: .LBB2_116: +; AVX2-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; AVX2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX2-NEXT: vpinsrb $12, %ecx, %xmm2, %xmm2 +; AVX2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload +; AVX2-NEXT: vpinsrb $10, %edx, %xmm4, %xmm4 +; AVX2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %edx # 1-byte Folded Reload +; AVX2-NEXT: movzbl %bl, %esi +; AVX2-NEXT: vpinsrb $9, %esi, %xmm5, %xmm5 +; AVX2-NEXT: vpextrb $10, %xmm3, %edi +; AVX2-NEXT: vpextrb $10, %xmm1, %esi +; AVX2-NEXT: addb %dil, %sil +; AVX2-NEXT: movb $-1, %bl +; AVX2-NEXT: jb .LBB2_118 +; AVX2-NEXT: # %bb.117: +; AVX2-NEXT: movl %esi, %ebx +; AVX2-NEXT: .LBB2_118: +; AVX2-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0 +; AVX2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX2-NEXT: vpinsrb $13, %ecx, %xmm2, %xmm2 +; AVX2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload +; AVX2-NEXT: vpinsrb $11, %edx, %xmm4, %xmm4 +; AVX2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %edx # 1-byte Folded Reload +; 
AVX2-NEXT: movzbl %bl, %esi +; AVX2-NEXT: vpinsrb $10, %esi, %xmm5, %xmm5 +; AVX2-NEXT: vpextrb $11, %xmm3, %edi +; AVX2-NEXT: vpextrb $11, %xmm1, %esi +; AVX2-NEXT: addb %dil, %sil +; AVX2-NEXT: movb $-1, %bl +; AVX2-NEXT: jb .LBB2_120 +; AVX2-NEXT: # %bb.119: +; AVX2-NEXT: movl %esi, %ebx +; AVX2-NEXT: .LBB2_120: +; AVX2-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0 +; AVX2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX2-NEXT: vpinsrb $14, %ecx, %xmm2, %xmm2 +; AVX2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload +; AVX2-NEXT: vpinsrb $12, %edx, %xmm4, %xmm6 +; AVX2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %edx # 1-byte Folded Reload +; AVX2-NEXT: movzbl %bl, %esi +; AVX2-NEXT: vpinsrb $11, %esi, %xmm5, %xmm5 +; AVX2-NEXT: vpextrb $12, %xmm3, %edi +; AVX2-NEXT: vpextrb $12, %xmm1, %esi +; AVX2-NEXT: addb %dil, %sil +; AVX2-NEXT: movb $-1, %bl +; AVX2-NEXT: jb .LBB2_122 +; AVX2-NEXT: # %bb.121: +; AVX2-NEXT: movl %esi, %ebx +; AVX2-NEXT: .LBB2_122: +; AVX2-NEXT: vpinsrb $14, %eax, %xmm0, %xmm4 +; AVX2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX2-NEXT: vpinsrb $15, %ecx, %xmm2, %xmm0 +; AVX2-NEXT: vpinsrb $13, %edx, %xmm6, %xmm6 +; AVX2-NEXT: movzbl %r8b, %ecx +; AVX2-NEXT: movzbl %bl, %edx +; AVX2-NEXT: vpinsrb $12, %edx, %xmm5, %xmm5 +; AVX2-NEXT: vpextrb $13, %xmm3, %edx +; AVX2-NEXT: vpextrb $13, %xmm1, %esi +; AVX2-NEXT: addb %dl, %sil +; AVX2-NEXT: movb $-1, %dl +; AVX2-NEXT: jb .LBB2_124 +; AVX2-NEXT: # %bb.123: +; AVX2-NEXT: movl %esi, %edx +; AVX2-NEXT: .LBB2_124: +; AVX2-NEXT: vpinsrb $15, %eax, %xmm4, %xmm2 +; AVX2-NEXT: vpinsrb $14, %ecx, %xmm6, %xmm4 +; AVX2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX2-NEXT: movzbl %dl, %ecx +; AVX2-NEXT: vpinsrb $13, %ecx, %xmm5, %xmm5 +; AVX2-NEXT: vpextrb $14, %xmm3, %edx +; AVX2-NEXT: vpextrb $14, %xmm1, %ecx +; AVX2-NEXT: addb %dl, %cl +; AVX2-NEXT: movb $-1, %dl +; AVX2-NEXT: popq %rbx +; AVX2-NEXT: popq %r12 +; AVX2-NEXT: popq %r13 +; AVX2-NEXT: popq %r14 +; AVX2-NEXT: popq %r15 +; AVX2-NEXT: popq %rbp +; AVX2-NEXT: jb .LBB2_126 +; AVX2-NEXT: # %bb.125: +; AVX2-NEXT: movl %ecx, %edx +; AVX2-NEXT: .LBB2_126: +; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 +; AVX2-NEXT: vpinsrb $15, %eax, %xmm4, %xmm2 +; AVX2-NEXT: movzbl %dl, %eax +; AVX2-NEXT: vpinsrb $14, %eax, %xmm5, %xmm4 +; AVX2-NEXT: vpextrb $15, %xmm3, %ecx +; AVX2-NEXT: vpextrb $15, %xmm1, %eax +; AVX2-NEXT: addb %cl, %al +; AVX2-NEXT: movb $-1, %cl +; AVX2-NEXT: jb .LBB2_128 +; AVX2-NEXT: # %bb.127: +; AVX2-NEXT: movl %eax, %ecx +; AVX2-NEXT: .LBB2_128: +; AVX2-NEXT: movzbl %cl, %eax +; AVX2-NEXT: vpinsrb $15, %eax, %xmm4, %xmm1 +; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 +; AVX2-NEXT: retq +; +; AVX512-LABEL: v64i8: +; AVX512: # %bb.0: +; AVX512-NEXT: vextracti32x4 $3, %zmm1, %xmm2 +; AVX512-NEXT: vpextrb $1, %xmm2, %ecx +; AVX512-NEXT: vextracti32x4 $3, %zmm0, %xmm3 +; AVX512-NEXT: vpextrb $1, %xmm3, %eax +; AVX512-NEXT: addb %cl, %al +; AVX512-NEXT: movb $-1, %cl +; AVX512-NEXT: jb .LBB2_2 +; AVX512-NEXT: # %bb.1: +; AVX512-NEXT: movl %eax, %ecx +; AVX512-NEXT: .LBB2_2: +; AVX512-NEXT: pushq %rbp +; AVX512-NEXT: pushq %r15 +; AVX512-NEXT: pushq %r14 +; AVX512-NEXT: pushq %r13 +; AVX512-NEXT: pushq %r12 +; AVX512-NEXT: pushq %rbx +; AVX512-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; AVX512-NEXT: vpextrb $0, %xmm2, %ecx +; AVX512-NEXT: vpextrb $0, %xmm3, %eax +; AVX512-NEXT: addb %cl, %al +; AVX512-NEXT: movb $-1, %cl +; AVX512-NEXT: jb .LBB2_4 +; AVX512-NEXT: 
# %bb.3: +; AVX512-NEXT: movl %eax, %ecx +; AVX512-NEXT: .LBB2_4: +; AVX512-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; AVX512-NEXT: vpextrb $2, %xmm2, %ecx +; AVX512-NEXT: vpextrb $2, %xmm3, %eax +; AVX512-NEXT: addb %cl, %al +; AVX512-NEXT: movb $-1, %cl +; AVX512-NEXT: jb .LBB2_6 +; AVX512-NEXT: # %bb.5: +; AVX512-NEXT: movl %eax, %ecx +; AVX512-NEXT: .LBB2_6: +; AVX512-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; AVX512-NEXT: vpextrb $3, %xmm2, %ecx +; AVX512-NEXT: vpextrb $3, %xmm3, %eax +; AVX512-NEXT: addb %cl, %al +; AVX512-NEXT: movb $-1, %cl +; AVX512-NEXT: jb .LBB2_8 +; AVX512-NEXT: # %bb.7: +; AVX512-NEXT: movl %eax, %ecx +; AVX512-NEXT: .LBB2_8: +; AVX512-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; AVX512-NEXT: vpextrb $4, %xmm2, %ecx +; AVX512-NEXT: vpextrb $4, %xmm3, %eax +; AVX512-NEXT: addb %cl, %al +; AVX512-NEXT: movb $-1, %cl +; AVX512-NEXT: jb .LBB2_10 +; AVX512-NEXT: # %bb.9: +; AVX512-NEXT: movl %eax, %ecx +; AVX512-NEXT: .LBB2_10: +; AVX512-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; AVX512-NEXT: vpextrb $5, %xmm2, %ecx +; AVX512-NEXT: vpextrb $5, %xmm3, %eax +; AVX512-NEXT: addb %cl, %al +; AVX512-NEXT: movb $-1, %cl +; AVX512-NEXT: jb .LBB2_12 +; AVX512-NEXT: # %bb.11: +; AVX512-NEXT: movl %eax, %ecx +; AVX512-NEXT: .LBB2_12: +; AVX512-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; AVX512-NEXT: vpextrb $6, %xmm2, %ecx +; AVX512-NEXT: vpextrb $6, %xmm3, %eax +; AVX512-NEXT: addb %cl, %al +; AVX512-NEXT: movb $-1, %cl +; AVX512-NEXT: jb .LBB2_14 +; AVX512-NEXT: # %bb.13: +; AVX512-NEXT: movl %eax, %ecx +; AVX512-NEXT: .LBB2_14: +; AVX512-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; AVX512-NEXT: vpextrb $7, %xmm2, %ecx +; AVX512-NEXT: vpextrb $7, %xmm3, %eax +; AVX512-NEXT: addb %cl, %al +; AVX512-NEXT: movb $-1, %cl +; AVX512-NEXT: jb .LBB2_16 +; AVX512-NEXT: # %bb.15: +; AVX512-NEXT: movl %eax, %ecx +; AVX512-NEXT: .LBB2_16: +; AVX512-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; AVX512-NEXT: vpextrb $8, %xmm2, %ecx +; AVX512-NEXT: vpextrb $8, %xmm3, %eax +; AVX512-NEXT: addb %cl, %al +; AVX512-NEXT: movb $-1, %cl +; AVX512-NEXT: jb .LBB2_18 +; AVX512-NEXT: # %bb.17: +; AVX512-NEXT: movl %eax, %ecx +; AVX512-NEXT: .LBB2_18: +; AVX512-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; AVX512-NEXT: vpextrb $9, %xmm2, %ecx +; AVX512-NEXT: vpextrb $9, %xmm3, %eax +; AVX512-NEXT: addb %cl, %al +; AVX512-NEXT: movb $-1, %cl +; AVX512-NEXT: jb .LBB2_20 +; AVX512-NEXT: # %bb.19: +; AVX512-NEXT: movl %eax, %ecx +; AVX512-NEXT: .LBB2_20: +; AVX512-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; AVX512-NEXT: vpextrb $10, %xmm2, %ecx +; AVX512-NEXT: vpextrb $10, %xmm3, %eax +; AVX512-NEXT: addb %cl, %al +; AVX512-NEXT: movb $-1, %cl +; AVX512-NEXT: jb .LBB2_22 +; AVX512-NEXT: # %bb.21: +; AVX512-NEXT: movl %eax, %ecx +; AVX512-NEXT: .LBB2_22: +; AVX512-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; AVX512-NEXT: vpextrb $11, %xmm2, %ecx +; AVX512-NEXT: vpextrb $11, %xmm3, %eax +; AVX512-NEXT: addb %cl, %al +; AVX512-NEXT: movb $-1, %cl +; AVX512-NEXT: jb .LBB2_24 +; AVX512-NEXT: # %bb.23: +; AVX512-NEXT: movl %eax, %ecx +; AVX512-NEXT: .LBB2_24: +; AVX512-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; AVX512-NEXT: vpextrb $12, %xmm2, %ecx +; AVX512-NEXT: vpextrb $12, %xmm3, %eax +; AVX512-NEXT: addb %cl, %al +; AVX512-NEXT: movb $-1, %cl +; AVX512-NEXT: jb .LBB2_26 +; AVX512-NEXT: # %bb.25: +; AVX512-NEXT: movl %eax, %ecx +; AVX512-NEXT: 
.LBB2_26: +; AVX512-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; AVX512-NEXT: vpextrb $13, %xmm2, %ecx +; AVX512-NEXT: vpextrb $13, %xmm3, %eax +; AVX512-NEXT: addb %cl, %al +; AVX512-NEXT: movb $-1, %cl +; AVX512-NEXT: jb .LBB2_28 +; AVX512-NEXT: # %bb.27: +; AVX512-NEXT: movl %eax, %ecx +; AVX512-NEXT: .LBB2_28: +; AVX512-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; AVX512-NEXT: vpextrb $14, %xmm2, %ecx +; AVX512-NEXT: vpextrb $14, %xmm3, %eax +; AVX512-NEXT: addb %cl, %al +; AVX512-NEXT: movb $-1, %cl +; AVX512-NEXT: jb .LBB2_30 +; AVX512-NEXT: # %bb.29: +; AVX512-NEXT: movl %eax, %ecx +; AVX512-NEXT: .LBB2_30: +; AVX512-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; AVX512-NEXT: vpextrb $15, %xmm2, %ecx +; AVX512-NEXT: vpextrb $15, %xmm3, %eax +; AVX512-NEXT: addb %cl, %al +; AVX512-NEXT: movb $-1, %cl +; AVX512-NEXT: jb .LBB2_32 +; AVX512-NEXT: # %bb.31: +; AVX512-NEXT: movl %eax, %ecx +; AVX512-NEXT: .LBB2_32: +; AVX512-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; AVX512-NEXT: vextracti32x4 $2, %zmm1, %xmm2 +; AVX512-NEXT: vpextrb $1, %xmm2, %ecx +; AVX512-NEXT: vextracti32x4 $2, %zmm0, %xmm3 +; AVX512-NEXT: vpextrb $1, %xmm3, %eax +; AVX512-NEXT: addb %cl, %al +; AVX512-NEXT: movb $-1, %r11b +; AVX512-NEXT: jb .LBB2_34 +; AVX512-NEXT: # %bb.33: +; AVX512-NEXT: movl %eax, %r11d +; AVX512-NEXT: .LBB2_34: +; AVX512-NEXT: vpextrb $0, %xmm2, %ecx +; AVX512-NEXT: vpextrb $0, %xmm3, %eax +; AVX512-NEXT: addb %cl, %al +; AVX512-NEXT: movb $-1, %r8b +; AVX512-NEXT: jb .LBB2_36 +; AVX512-NEXT: # %bb.35: +; AVX512-NEXT: movl %eax, %r8d +; AVX512-NEXT: .LBB2_36: +; AVX512-NEXT: vpextrb $2, %xmm2, %ecx +; AVX512-NEXT: vpextrb $2, %xmm3, %eax +; AVX512-NEXT: addb %cl, %al +; AVX512-NEXT: movb $-1, %cl +; AVX512-NEXT: jb .LBB2_38 +; AVX512-NEXT: # %bb.37: +; AVX512-NEXT: movl %eax, %ecx +; AVX512-NEXT: .LBB2_38: +; AVX512-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; AVX512-NEXT: vpextrb $3, %xmm2, %ecx +; AVX512-NEXT: vpextrb $3, %xmm3, %eax +; AVX512-NEXT: addb %cl, %al +; AVX512-NEXT: movb $-1, %cl +; AVX512-NEXT: jb .LBB2_40 +; AVX512-NEXT: # %bb.39: +; AVX512-NEXT: movl %eax, %ecx +; AVX512-NEXT: .LBB2_40: +; AVX512-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; AVX512-NEXT: vpextrb $4, %xmm2, %ecx +; AVX512-NEXT: vpextrb $4, %xmm3, %eax +; AVX512-NEXT: addb %cl, %al +; AVX512-NEXT: movb $-1, %cl +; AVX512-NEXT: jb .LBB2_42 +; AVX512-NEXT: # %bb.41: +; AVX512-NEXT: movl %eax, %ecx +; AVX512-NEXT: .LBB2_42: +; AVX512-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; AVX512-NEXT: vpextrb $5, %xmm2, %ecx +; AVX512-NEXT: vpextrb $5, %xmm3, %eax +; AVX512-NEXT: addb %cl, %al +; AVX512-NEXT: movb $-1, %cl +; AVX512-NEXT: jb .LBB2_44 +; AVX512-NEXT: # %bb.43: +; AVX512-NEXT: movl %eax, %ecx +; AVX512-NEXT: .LBB2_44: +; AVX512-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; AVX512-NEXT: vpextrb $6, %xmm2, %ecx +; AVX512-NEXT: vpextrb $6, %xmm3, %eax +; AVX512-NEXT: addb %cl, %al +; AVX512-NEXT: movb $-1, %cl +; AVX512-NEXT: jb .LBB2_46 +; AVX512-NEXT: # %bb.45: +; AVX512-NEXT: movl %eax, %ecx +; AVX512-NEXT: .LBB2_46: +; AVX512-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; AVX512-NEXT: vpextrb $7, %xmm2, %ecx +; AVX512-NEXT: vpextrb $7, %xmm3, %eax +; AVX512-NEXT: addb %cl, %al +; AVX512-NEXT: movb $-1, %cl +; AVX512-NEXT: jb .LBB2_48 +; AVX512-NEXT: # %bb.47: +; AVX512-NEXT: movl %eax, %ecx +; AVX512-NEXT: .LBB2_48: +; AVX512-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; 
AVX512-NEXT: vpextrb $8, %xmm2, %ecx +; AVX512-NEXT: vpextrb $8, %xmm3, %eax +; AVX512-NEXT: addb %cl, %al +; AVX512-NEXT: movb $-1, %cl +; AVX512-NEXT: jb .LBB2_50 +; AVX512-NEXT: # %bb.49: +; AVX512-NEXT: movl %eax, %ecx +; AVX512-NEXT: .LBB2_50: +; AVX512-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; AVX512-NEXT: vpextrb $9, %xmm2, %ecx +; AVX512-NEXT: vpextrb $9, %xmm3, %eax +; AVX512-NEXT: addb %cl, %al +; AVX512-NEXT: movb $-1, %cl +; AVX512-NEXT: jb .LBB2_52 +; AVX512-NEXT: # %bb.51: +; AVX512-NEXT: movl %eax, %ecx +; AVX512-NEXT: .LBB2_52: +; AVX512-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; AVX512-NEXT: vpextrb $10, %xmm2, %ecx +; AVX512-NEXT: vpextrb $10, %xmm3, %eax +; AVX512-NEXT: addb %cl, %al +; AVX512-NEXT: movb $-1, %cl +; AVX512-NEXT: jb .LBB2_54 +; AVX512-NEXT: # %bb.53: +; AVX512-NEXT: movl %eax, %ecx +; AVX512-NEXT: .LBB2_54: +; AVX512-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; AVX512-NEXT: vpextrb $11, %xmm2, %ecx +; AVX512-NEXT: vpextrb $11, %xmm3, %eax +; AVX512-NEXT: addb %cl, %al +; AVX512-NEXT: movb $-1, %cl +; AVX512-NEXT: jb .LBB2_56 +; AVX512-NEXT: # %bb.55: +; AVX512-NEXT: movl %eax, %ecx +; AVX512-NEXT: .LBB2_56: +; AVX512-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; AVX512-NEXT: vpextrb $12, %xmm2, %ecx +; AVX512-NEXT: vpextrb $12, %xmm3, %eax +; AVX512-NEXT: addb %cl, %al +; AVX512-NEXT: movb $-1, %cl +; AVX512-NEXT: jb .LBB2_58 +; AVX512-NEXT: # %bb.57: +; AVX512-NEXT: movl %eax, %ecx +; AVX512-NEXT: .LBB2_58: +; AVX512-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; AVX512-NEXT: vpextrb $13, %xmm2, %ecx +; AVX512-NEXT: vpextrb $13, %xmm3, %eax +; AVX512-NEXT: addb %cl, %al +; AVX512-NEXT: movb $-1, %cl +; AVX512-NEXT: jb .LBB2_60 +; AVX512-NEXT: # %bb.59: +; AVX512-NEXT: movl %eax, %ecx +; AVX512-NEXT: .LBB2_60: +; AVX512-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; AVX512-NEXT: vpextrb $14, %xmm2, %ecx +; AVX512-NEXT: vpextrb $14, %xmm3, %eax +; AVX512-NEXT: addb %cl, %al +; AVX512-NEXT: movb $-1, %cl +; AVX512-NEXT: jb .LBB2_62 +; AVX512-NEXT: # %bb.61: +; AVX512-NEXT: movl %eax, %ecx +; AVX512-NEXT: .LBB2_62: +; AVX512-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; AVX512-NEXT: vpextrb $15, %xmm2, %ecx +; AVX512-NEXT: vpextrb $15, %xmm3, %eax +; AVX512-NEXT: addb %cl, %al +; AVX512-NEXT: movb $-1, %cl +; AVX512-NEXT: jb .LBB2_64 +; AVX512-NEXT: # %bb.63: +; AVX512-NEXT: movl %eax, %ecx +; AVX512-NEXT: .LBB2_64: +; AVX512-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; AVX512-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX512-NEXT: vpextrb $1, %xmm2, %ecx +; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm3 +; AVX512-NEXT: vpextrb $1, %xmm3, %eax +; AVX512-NEXT: addb %cl, %al +; AVX512-NEXT: movb $-1, %dil +; AVX512-NEXT: jb .LBB2_66 +; AVX512-NEXT: # %bb.65: +; AVX512-NEXT: movl %eax, %edi +; AVX512-NEXT: .LBB2_66: +; AVX512-NEXT: vpextrb $0, %xmm2, %eax +; AVX512-NEXT: vpextrb $0, %xmm3, %ecx +; AVX512-NEXT: addb %al, %cl +; AVX512-NEXT: movb $-1, %al +; AVX512-NEXT: jb .LBB2_68 +; AVX512-NEXT: # %bb.67: +; AVX512-NEXT: movl %ecx, %eax +; AVX512-NEXT: .LBB2_68: +; AVX512-NEXT: vpextrb $2, %xmm2, %esi +; AVX512-NEXT: vpextrb $2, %xmm3, %ecx +; AVX512-NEXT: addb %sil, %cl +; AVX512-NEXT: movb $-1, %r15b +; AVX512-NEXT: jb .LBB2_70 +; AVX512-NEXT: # %bb.69: +; AVX512-NEXT: movl %ecx, %r15d +; AVX512-NEXT: .LBB2_70: +; AVX512-NEXT: vpextrb $3, %xmm2, %esi +; AVX512-NEXT: vpextrb $3, %xmm3, %ecx +; AVX512-NEXT: addb %sil, %cl +; AVX512-NEXT: movb $-1, %sil +; 
AVX512-NEXT: jb .LBB2_72 +; AVX512-NEXT: # %bb.71: +; AVX512-NEXT: movl %ecx, %esi +; AVX512-NEXT: .LBB2_72: +; AVX512-NEXT: vpextrb $4, %xmm2, %ebp +; AVX512-NEXT: vpextrb $4, %xmm3, %ecx +; AVX512-NEXT: addb %bpl, %cl +; AVX512-NEXT: movb $-1, %r14b +; AVX512-NEXT: jb .LBB2_74 +; AVX512-NEXT: # %bb.73: +; AVX512-NEXT: movl %ecx, %r14d +; AVX512-NEXT: .LBB2_74: +; AVX512-NEXT: vpextrb $5, %xmm2, %ebp +; AVX512-NEXT: vpextrb $5, %xmm3, %ecx +; AVX512-NEXT: addb %bpl, %cl +; AVX512-NEXT: movb $-1, %r10b +; AVX512-NEXT: jb .LBB2_76 +; AVX512-NEXT: # %bb.75: +; AVX512-NEXT: movl %ecx, %r10d +; AVX512-NEXT: .LBB2_76: +; AVX512-NEXT: vpextrb $6, %xmm2, %ebp +; AVX512-NEXT: vpextrb $6, %xmm3, %ecx +; AVX512-NEXT: addb %bpl, %cl +; AVX512-NEXT: movb $-1, %r9b +; AVX512-NEXT: jb .LBB2_78 +; AVX512-NEXT: # %bb.77: +; AVX512-NEXT: movl %ecx, %r9d +; AVX512-NEXT: .LBB2_78: +; AVX512-NEXT: vpextrb $7, %xmm2, %ebx +; AVX512-NEXT: vpextrb $7, %xmm3, %ecx +; AVX512-NEXT: addb %bl, %cl +; AVX512-NEXT: movb $-1, %r13b +; AVX512-NEXT: jb .LBB2_80 +; AVX512-NEXT: # %bb.79: +; AVX512-NEXT: movl %ecx, %r13d +; AVX512-NEXT: .LBB2_80: +; AVX512-NEXT: vpextrb $8, %xmm2, %ebx +; AVX512-NEXT: vpextrb $8, %xmm3, %ecx +; AVX512-NEXT: addb %bl, %cl +; AVX512-NEXT: movb $-1, %r12b +; AVX512-NEXT: jb .LBB2_82 +; AVX512-NEXT: # %bb.81: +; AVX512-NEXT: movl %ecx, %r12d +; AVX512-NEXT: .LBB2_82: +; AVX512-NEXT: vpextrb $9, %xmm2, %ebx +; AVX512-NEXT: vpextrb $9, %xmm3, %ecx +; AVX512-NEXT: addb %bl, %cl +; AVX512-NEXT: movb $-1, %dl +; AVX512-NEXT: jb .LBB2_84 +; AVX512-NEXT: # %bb.83: +; AVX512-NEXT: movl %ecx, %edx +; AVX512-NEXT: .LBB2_84: +; AVX512-NEXT: movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; AVX512-NEXT: vpextrb $10, %xmm2, %ebx +; AVX512-NEXT: vpextrb $10, %xmm3, %ecx +; AVX512-NEXT: addb %bl, %cl +; AVX512-NEXT: movb $-1, %dl +; AVX512-NEXT: jb .LBB2_86 +; AVX512-NEXT: # %bb.85: +; AVX512-NEXT: movl %ecx, %edx +; AVX512-NEXT: .LBB2_86: +; AVX512-NEXT: movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; AVX512-NEXT: vpextrb $11, %xmm2, %ebx +; AVX512-NEXT: vpextrb $11, %xmm3, %ecx +; AVX512-NEXT: addb %bl, %cl +; AVX512-NEXT: movb $-1, %dl +; AVX512-NEXT: jb .LBB2_88 +; AVX512-NEXT: # %bb.87: +; AVX512-NEXT: movl %ecx, %edx +; AVX512-NEXT: .LBB2_88: +; AVX512-NEXT: vpextrb $12, %xmm2, %ebx +; AVX512-NEXT: vpextrb $12, %xmm3, %ecx +; AVX512-NEXT: addb %bl, %cl +; AVX512-NEXT: movb $-1, %bl +; AVX512-NEXT: jb .LBB2_90 +; AVX512-NEXT: # %bb.89: +; AVX512-NEXT: movl %ecx, %ebx +; AVX512-NEXT: .LBB2_90: +; AVX512-NEXT: movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; AVX512-NEXT: movb %bl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; AVX512-NEXT: movzbl %r8b, %ecx +; AVX512-NEXT: vpextrb $13, %xmm2, %ebx +; AVX512-NEXT: vpextrb $13, %xmm3, %edx +; AVX512-NEXT: addb %bl, %dl +; AVX512-NEXT: movb $-1, %bl +; AVX512-NEXT: jb .LBB2_92 +; AVX512-NEXT: # %bb.91: +; AVX512-NEXT: movl %edx, %ebx +; AVX512-NEXT: .LBB2_92: +; AVX512-NEXT: movb %bl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; AVX512-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %edx # 1-byte Folded Reload +; AVX512-NEXT: movzbl %r11b, %ebp +; AVX512-NEXT: vmovd %ecx, %xmm4 +; AVX512-NEXT: vpextrb $14, %xmm2, %ebx +; AVX512-NEXT: vpextrb $14, %xmm3, %ecx +; AVX512-NEXT: addb %bl, %cl +; AVX512-NEXT: movb $-1, %r8b +; AVX512-NEXT: jb .LBB2_94 +; AVX512-NEXT: # %bb.93: +; AVX512-NEXT: movl %ecx, %r8d +; AVX512-NEXT: .LBB2_94: +; AVX512-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload +; AVX512-NEXT: vmovd %edx, %xmm5 +; AVX512-NEXT: vpinsrb 
$1, %ebp, %xmm4, %xmm4 +; AVX512-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %edx # 1-byte Folded Reload +; AVX512-NEXT: movzbl %al, %r11d +; AVX512-NEXT: vpextrb $15, %xmm2, %ebx +; AVX512-NEXT: vpextrb $15, %xmm3, %eax +; AVX512-NEXT: addb %bl, %al +; AVX512-NEXT: movb $-1, %bl +; AVX512-NEXT: jb .LBB2_96 +; AVX512-NEXT: # %bb.95: +; AVX512-NEXT: movl %eax, %ebx +; AVX512-NEXT: .LBB2_96: +; AVX512-NEXT: movb %bl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; AVX512-NEXT: vpinsrb $1, %ecx, %xmm5, %xmm2 +; AVX512-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload +; AVX512-NEXT: vpinsrb $2, %edx, %xmm4, %xmm3 +; AVX512-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %edx # 1-byte Folded Reload +; AVX512-NEXT: movzbl %dil, %eax +; AVX512-NEXT: vmovd %r11d, %xmm4 +; AVX512-NEXT: vpextrb $1, %xmm1, %ebp +; AVX512-NEXT: vpextrb $1, %xmm0, %edi +; AVX512-NEXT: addb %bpl, %dil +; AVX512-NEXT: movb $-1, %bl +; AVX512-NEXT: jb .LBB2_98 +; AVX512-NEXT: # %bb.97: +; AVX512-NEXT: movl %edi, %ebx +; AVX512-NEXT: .LBB2_98: +; AVX512-NEXT: vpinsrb $2, %ecx, %xmm2, %xmm2 +; AVX512-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload +; AVX512-NEXT: vpinsrb $3, %edx, %xmm3, %xmm3 +; AVX512-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %edx # 1-byte Folded Reload +; AVX512-NEXT: vpinsrb $1, %eax, %xmm4, %xmm4 +; AVX512-NEXT: movzbl %r15b, %ebp +; AVX512-NEXT: movzbl %bl, %edi +; AVX512-NEXT: vpextrb $0, %xmm1, %ebx +; AVX512-NEXT: vpextrb $0, %xmm0, %eax +; AVX512-NEXT: addb %bl, %al +; AVX512-NEXT: movb $-1, %bl +; AVX512-NEXT: jb .LBB2_100 +; AVX512-NEXT: # %bb.99: +; AVX512-NEXT: movl %eax, %ebx +; AVX512-NEXT: .LBB2_100: +; AVX512-NEXT: vpinsrb $3, %ecx, %xmm2, %xmm2 +; AVX512-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX512-NEXT: vpinsrb $4, %edx, %xmm3, %xmm3 +; AVX512-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload +; AVX512-NEXT: vpinsrb $2, %ebp, %xmm4, %xmm4 +; AVX512-NEXT: movzbl %sil, %esi +; AVX512-NEXT: movzbl %bl, %edx +; AVX512-NEXT: vmovd %edx, %xmm5 +; AVX512-NEXT: vpinsrb $1, %edi, %xmm5, %xmm5 +; AVX512-NEXT: vpextrb $2, %xmm1, %edi +; AVX512-NEXT: vpextrb $2, %xmm0, %edx +; AVX512-NEXT: addb %dil, %dl +; AVX512-NEXT: movb $-1, %bl +; AVX512-NEXT: jb .LBB2_102 +; AVX512-NEXT: # %bb.101: +; AVX512-NEXT: movl %edx, %ebx +; AVX512-NEXT: .LBB2_102: +; AVX512-NEXT: vpinsrb $4, %eax, %xmm2, %xmm2 +; AVX512-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX512-NEXT: vpinsrb $5, %ecx, %xmm3, %xmm3 +; AVX512-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload +; AVX512-NEXT: vpinsrb $3, %esi, %xmm4, %xmm4 +; AVX512-NEXT: movzbl %r14b, %edx +; AVX512-NEXT: movzbl %bl, %esi +; AVX512-NEXT: vpinsrb $2, %esi, %xmm5, %xmm5 +; AVX512-NEXT: vpextrb $3, %xmm1, %edi +; AVX512-NEXT: vpextrb $3, %xmm0, %esi +; AVX512-NEXT: addb %dil, %sil +; AVX512-NEXT: movb $-1, %bl +; AVX512-NEXT: jb .LBB2_104 +; AVX512-NEXT: # %bb.103: +; AVX512-NEXT: movl %esi, %ebx +; AVX512-NEXT: .LBB2_104: +; AVX512-NEXT: vpinsrb $5, %eax, %xmm2, %xmm2 +; AVX512-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX512-NEXT: vpinsrb $6, %ecx, %xmm3, %xmm3 +; AVX512-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload +; AVX512-NEXT: vpinsrb $4, %edx, %xmm4, %xmm4 +; AVX512-NEXT: movzbl %r10b, %edx +; AVX512-NEXT: movzbl %bl, %esi +; AVX512-NEXT: vpinsrb $3, %esi, %xmm5, %xmm5 +; AVX512-NEXT: vpextrb $4, %xmm1, %edi +; AVX512-NEXT: vpextrb $4, %xmm0, %esi +; AVX512-NEXT: addb %dil, %sil +; AVX512-NEXT: movb 
$-1, %bl +; AVX512-NEXT: jb .LBB2_106 +; AVX512-NEXT: # %bb.105: +; AVX512-NEXT: movl %esi, %ebx +; AVX512-NEXT: .LBB2_106: +; AVX512-NEXT: vpinsrb $6, %eax, %xmm2, %xmm2 +; AVX512-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX512-NEXT: vpinsrb $7, %ecx, %xmm3, %xmm3 +; AVX512-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload +; AVX512-NEXT: vpinsrb $5, %edx, %xmm4, %xmm4 +; AVX512-NEXT: movzbl %r9b, %edx +; AVX512-NEXT: movzbl %bl, %esi +; AVX512-NEXT: vpinsrb $4, %esi, %xmm5, %xmm5 +; AVX512-NEXT: vpextrb $5, %xmm1, %edi +; AVX512-NEXT: vpextrb $5, %xmm0, %esi +; AVX512-NEXT: addb %dil, %sil +; AVX512-NEXT: movb $-1, %bl +; AVX512-NEXT: jb .LBB2_108 +; AVX512-NEXT: # %bb.107: +; AVX512-NEXT: movl %esi, %ebx +; AVX512-NEXT: .LBB2_108: +; AVX512-NEXT: vpinsrb $7, %eax, %xmm2, %xmm2 +; AVX512-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX512-NEXT: vpinsrb $8, %ecx, %xmm3, %xmm3 +; AVX512-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload +; AVX512-NEXT: vpinsrb $6, %edx, %xmm4, %xmm4 +; AVX512-NEXT: movzbl %r13b, %edx +; AVX512-NEXT: movzbl %bl, %esi +; AVX512-NEXT: vpinsrb $5, %esi, %xmm5, %xmm5 +; AVX512-NEXT: vpextrb $6, %xmm1, %edi +; AVX512-NEXT: vpextrb $6, %xmm0, %esi +; AVX512-NEXT: addb %dil, %sil +; AVX512-NEXT: movb $-1, %bl +; AVX512-NEXT: jb .LBB2_110 +; AVX512-NEXT: # %bb.109: +; AVX512-NEXT: movl %esi, %ebx +; AVX512-NEXT: .LBB2_110: +; AVX512-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2 +; AVX512-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX512-NEXT: vpinsrb $9, %ecx, %xmm3, %xmm3 +; AVX512-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload +; AVX512-NEXT: vpinsrb $7, %edx, %xmm4, %xmm4 +; AVX512-NEXT: movzbl %r12b, %edx +; AVX512-NEXT: movzbl %bl, %esi +; AVX512-NEXT: vpinsrb $6, %esi, %xmm5, %xmm5 +; AVX512-NEXT: vpextrb $7, %xmm1, %edi +; AVX512-NEXT: vpextrb $7, %xmm0, %esi +; AVX512-NEXT: addb %dil, %sil +; AVX512-NEXT: movb $-1, %bl +; AVX512-NEXT: jb .LBB2_112 +; AVX512-NEXT: # %bb.111: +; AVX512-NEXT: movl %esi, %ebx +; AVX512-NEXT: .LBB2_112: +; AVX512-NEXT: vpinsrb $9, %eax, %xmm2, %xmm2 +; AVX512-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX512-NEXT: vpinsrb $10, %ecx, %xmm3, %xmm3 +; AVX512-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload +; AVX512-NEXT: vpinsrb $8, %edx, %xmm4, %xmm4 +; AVX512-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %edx # 1-byte Folded Reload +; AVX512-NEXT: movzbl %bl, %esi +; AVX512-NEXT: vpinsrb $7, %esi, %xmm5, %xmm5 +; AVX512-NEXT: vpextrb $8, %xmm1, %edi +; AVX512-NEXT: vpextrb $8, %xmm0, %esi +; AVX512-NEXT: addb %dil, %sil +; AVX512-NEXT: movb $-1, %bl +; AVX512-NEXT: jb .LBB2_114 +; AVX512-NEXT: # %bb.113: +; AVX512-NEXT: movl %esi, %ebx +; AVX512-NEXT: .LBB2_114: +; AVX512-NEXT: vpinsrb $10, %eax, %xmm2, %xmm2 +; AVX512-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX512-NEXT: vpinsrb $11, %ecx, %xmm3, %xmm3 +; AVX512-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload +; AVX512-NEXT: vpinsrb $9, %edx, %xmm4, %xmm4 +; AVX512-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %edx # 1-byte Folded Reload +; AVX512-NEXT: movzbl %bl, %esi +; AVX512-NEXT: vpinsrb $8, %esi, %xmm5, %xmm5 +; AVX512-NEXT: vpextrb $9, %xmm1, %edi +; AVX512-NEXT: vpextrb $9, %xmm0, %esi +; AVX512-NEXT: addb %dil, %sil +; AVX512-NEXT: movb $-1, %bl +; AVX512-NEXT: jb .LBB2_116 +; AVX512-NEXT: # %bb.115: +; AVX512-NEXT: movl %esi, %ebx +; AVX512-NEXT: .LBB2_116: +; 
AVX512-NEXT: vpinsrb $11, %eax, %xmm2, %xmm2 +; AVX512-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX512-NEXT: vpinsrb $12, %ecx, %xmm3, %xmm3 +; AVX512-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload +; AVX512-NEXT: vpinsrb $10, %edx, %xmm4, %xmm4 +; AVX512-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %edx # 1-byte Folded Reload +; AVX512-NEXT: movzbl %bl, %esi +; AVX512-NEXT: vpinsrb $9, %esi, %xmm5, %xmm5 +; AVX512-NEXT: vpextrb $10, %xmm1, %edi +; AVX512-NEXT: vpextrb $10, %xmm0, %esi +; AVX512-NEXT: addb %dil, %sil +; AVX512-NEXT: movb $-1, %bl +; AVX512-NEXT: jb .LBB2_118 +; AVX512-NEXT: # %bb.117: +; AVX512-NEXT: movl %esi, %ebx +; AVX512-NEXT: .LBB2_118: +; AVX512-NEXT: vpinsrb $12, %eax, %xmm2, %xmm2 +; AVX512-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX512-NEXT: vpinsrb $13, %ecx, %xmm3, %xmm3 +; AVX512-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload +; AVX512-NEXT: vpinsrb $11, %edx, %xmm4, %xmm4 +; AVX512-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %edx # 1-byte Folded Reload +; AVX512-NEXT: movzbl %bl, %esi +; AVX512-NEXT: vpinsrb $10, %esi, %xmm5, %xmm5 +; AVX512-NEXT: vpextrb $11, %xmm1, %edi +; AVX512-NEXT: vpextrb $11, %xmm0, %esi +; AVX512-NEXT: addb %dil, %sil +; AVX512-NEXT: movb $-1, %bl +; AVX512-NEXT: jb .LBB2_120 +; AVX512-NEXT: # %bb.119: +; AVX512-NEXT: movl %esi, %ebx +; AVX512-NEXT: .LBB2_120: +; AVX512-NEXT: vpinsrb $13, %eax, %xmm2, %xmm2 +; AVX512-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX512-NEXT: vpinsrb $14, %ecx, %xmm3, %xmm3 +; AVX512-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload +; AVX512-NEXT: vpinsrb $12, %edx, %xmm4, %xmm6 +; AVX512-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %edx # 1-byte Folded Reload +; AVX512-NEXT: movzbl %bl, %esi +; AVX512-NEXT: vpinsrb $11, %esi, %xmm5, %xmm5 +; AVX512-NEXT: vpextrb $12, %xmm1, %edi +; AVX512-NEXT: vpextrb $12, %xmm0, %esi +; AVX512-NEXT: addb %dil, %sil +; AVX512-NEXT: movb $-1, %bl +; AVX512-NEXT: jb .LBB2_122 +; AVX512-NEXT: # %bb.121: +; AVX512-NEXT: movl %esi, %ebx +; AVX512-NEXT: .LBB2_122: +; AVX512-NEXT: vpinsrb $14, %eax, %xmm2, %xmm4 +; AVX512-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX512-NEXT: vpinsrb $15, %ecx, %xmm3, %xmm2 +; AVX512-NEXT: vpinsrb $13, %edx, %xmm6, %xmm6 +; AVX512-NEXT: movzbl %r8b, %ecx +; AVX512-NEXT: movzbl %bl, %edx +; AVX512-NEXT: vpinsrb $12, %edx, %xmm5, %xmm5 +; AVX512-NEXT: vpextrb $13, %xmm1, %edx +; AVX512-NEXT: vpextrb $13, %xmm0, %esi +; AVX512-NEXT: addb %dl, %sil +; AVX512-NEXT: movb $-1, %dl +; AVX512-NEXT: jb .LBB2_124 +; AVX512-NEXT: # %bb.123: +; AVX512-NEXT: movl %esi, %edx +; AVX512-NEXT: .LBB2_124: +; AVX512-NEXT: vpinsrb $15, %eax, %xmm4, %xmm3 +; AVX512-NEXT: vpinsrb $14, %ecx, %xmm6, %xmm4 +; AVX512-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX512-NEXT: movzbl %dl, %ecx +; AVX512-NEXT: vpinsrb $13, %ecx, %xmm5, %xmm5 +; AVX512-NEXT: vpextrb $14, %xmm1, %edx +; AVX512-NEXT: vpextrb $14, %xmm0, %ecx +; AVX512-NEXT: addb %dl, %cl +; AVX512-NEXT: movb $-1, %dl +; AVX512-NEXT: popq %rbx +; AVX512-NEXT: popq %r12 +; AVX512-NEXT: popq %r13 +; AVX512-NEXT: popq %r14 +; AVX512-NEXT: popq %r15 +; AVX512-NEXT: popq %rbp +; AVX512-NEXT: jb .LBB2_126 +; AVX512-NEXT: # %bb.125: +; AVX512-NEXT: movl %ecx, %edx +; AVX512-NEXT: .LBB2_126: +; AVX512-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 +; AVX512-NEXT: vpinsrb $15, %eax, %xmm4, %xmm3 +; AVX512-NEXT: movzbl %dl, %eax +; AVX512-NEXT: 
vpinsrb $14, %eax, %xmm5, %xmm4 +; AVX512-NEXT: vpextrb $15, %xmm1, %ecx +; AVX512-NEXT: vpextrb $15, %xmm0, %eax +; AVX512-NEXT: addb %cl, %al +; AVX512-NEXT: movb $-1, %cl +; AVX512-NEXT: jb .LBB2_128 +; AVX512-NEXT: # %bb.127: +; AVX512-NEXT: movl %eax, %ecx +; AVX512-NEXT: .LBB2_128: +; AVX512-NEXT: movzbl %cl, %eax +; AVX512-NEXT: vpinsrb $15, %eax, %xmm4, %xmm0 +; AVX512-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm0 +; AVX512-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 +; AVX512-NEXT: retq + %z = call <64 x i8> @llvm.uadd.sat.v64i8(<64 x i8> %x, <64 x i8> %y) + ret <64 x i8> %z +} + +define <8 x i16> @v8i16(<8 x i16> %x, <8 x i16> %y) nounwind { +; SSE2-LABEL: v8i16: +; SSE2: # %bb.0: +; SSE2-NEXT: pextrw $7, %xmm1, %eax +; SSE2-NEXT: pextrw $7, %xmm0, %ecx +; SSE2-NEXT: addw %ax, %cx +; SSE2-NEXT: movl $65535, %eax # imm = 0xFFFF +; SSE2-NEXT: cmovbl %eax, %ecx +; SSE2-NEXT: movd %ecx, %xmm2 +; SSE2-NEXT: pextrw $6, %xmm1, %ecx +; SSE2-NEXT: pextrw $6, %xmm0, %edx +; SSE2-NEXT: addw %cx, %dx +; SSE2-NEXT: cmovbl %eax, %edx +; SSE2-NEXT: movd %edx, %xmm3 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; SSE2-NEXT: pextrw $5, %xmm1, %ecx +; SSE2-NEXT: pextrw $5, %xmm0, %edx +; SSE2-NEXT: addw %cx, %dx +; SSE2-NEXT: cmovbl %eax, %edx +; SSE2-NEXT: movd %edx, %xmm4 +; SSE2-NEXT: pextrw $4, %xmm1, %ecx +; SSE2-NEXT: pextrw $4, %xmm0, %edx +; SSE2-NEXT: addw %cx, %dx +; SSE2-NEXT: cmovbl %eax, %edx +; SSE2-NEXT: movd %edx, %xmm2 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] +; SSE2-NEXT: pextrw $3, %xmm1, %ecx +; SSE2-NEXT: pextrw $3, %xmm0, %edx +; SSE2-NEXT: addw %cx, %dx +; SSE2-NEXT: cmovbl %eax, %edx +; SSE2-NEXT: movd %edx, %xmm3 +; SSE2-NEXT: pextrw $2, %xmm1, %ecx +; SSE2-NEXT: pextrw $2, %xmm0, %edx +; SSE2-NEXT: addw %cx, %dx +; SSE2-NEXT: cmovbl %eax, %edx +; SSE2-NEXT: movd %edx, %xmm4 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] +; SSE2-NEXT: pextrw $1, %xmm1, %ecx +; SSE2-NEXT: pextrw $1, %xmm0, %edx +; SSE2-NEXT: addw %cx, %dx +; SSE2-NEXT: cmovbl %eax, %edx +; SSE2-NEXT: movd %edx, %xmm3 +; SSE2-NEXT: movd %xmm1, %ecx +; SSE2-NEXT: movd %xmm0, %edx +; SSE2-NEXT: addw %cx, %dx +; SSE2-NEXT: cmovbl %eax, %edx +; SSE2-NEXT: movd %edx, %xmm0 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; SSE2-NEXT: retq +; +; SSSE3-LABEL: v8i16: +; SSSE3: # %bb.0: +; SSSE3-NEXT: pextrw $7, %xmm1, %eax +; SSSE3-NEXT: pextrw $7, %xmm0, %ecx +; SSSE3-NEXT: addw %ax, %cx +; SSSE3-NEXT: movl $65535, %eax # imm = 0xFFFF +; SSSE3-NEXT: cmovbl %eax, %ecx +; SSSE3-NEXT: movd %ecx, %xmm2 +; SSSE3-NEXT: pextrw $6, %xmm1, %ecx +; SSSE3-NEXT: pextrw $6, %xmm0, %edx +; SSSE3-NEXT: addw %cx, %dx +; SSSE3-NEXT: cmovbl %eax, %edx +; SSSE3-NEXT: movd %edx, %xmm3 +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; SSSE3-NEXT: pextrw $5, %xmm1, %ecx +; SSSE3-NEXT: pextrw $5, %xmm0, %edx +; SSSE3-NEXT: addw %cx, %dx +; SSSE3-NEXT: cmovbl %eax, %edx +; SSSE3-NEXT: movd %edx, %xmm4 +; SSSE3-NEXT: pextrw $4, %xmm1, %ecx +; SSSE3-NEXT: pextrw $4, %xmm0, %edx +; SSSE3-NEXT: addw %cx, %dx +; SSSE3-NEXT: cmovbl %eax, %edx +; SSSE3-NEXT: movd 
%edx, %xmm2 +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3] +; SSSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] +; SSSE3-NEXT: pextrw $3, %xmm1, %ecx +; SSSE3-NEXT: pextrw $3, %xmm0, %edx +; SSSE3-NEXT: addw %cx, %dx +; SSSE3-NEXT: cmovbl %eax, %edx +; SSSE3-NEXT: movd %edx, %xmm3 +; SSSE3-NEXT: pextrw $2, %xmm1, %ecx +; SSSE3-NEXT: pextrw $2, %xmm0, %edx +; SSSE3-NEXT: addw %cx, %dx +; SSSE3-NEXT: cmovbl %eax, %edx +; SSSE3-NEXT: movd %edx, %xmm4 +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] +; SSSE3-NEXT: pextrw $1, %xmm1, %ecx +; SSSE3-NEXT: pextrw $1, %xmm0, %edx +; SSSE3-NEXT: addw %cx, %dx +; SSSE3-NEXT: cmovbl %eax, %edx +; SSSE3-NEXT: movd %edx, %xmm3 +; SSSE3-NEXT: movd %xmm1, %ecx +; SSSE3-NEXT: movd %xmm0, %edx +; SSSE3-NEXT: addw %cx, %dx +; SSSE3-NEXT: cmovbl %eax, %edx +; SSSE3-NEXT: movd %edx, %xmm0 +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] +; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] +; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; SSSE3-NEXT: retq +; +; SSE41-LABEL: v8i16: +; SSE41: # %bb.0: +; SSE41-NEXT: pextrw $1, %xmm1, %eax +; SSE41-NEXT: pextrw $1, %xmm0, %ecx +; SSE41-NEXT: addw %ax, %cx +; SSE41-NEXT: movl $65535, %eax # imm = 0xFFFF +; SSE41-NEXT: cmovbl %eax, %ecx +; SSE41-NEXT: movd %xmm1, %edx +; SSE41-NEXT: movd %xmm0, %esi +; SSE41-NEXT: addw %dx, %si +; SSE41-NEXT: cmovbl %eax, %esi +; SSE41-NEXT: movd %esi, %xmm2 +; SSE41-NEXT: pinsrw $1, %ecx, %xmm2 +; SSE41-NEXT: pextrw $2, %xmm1, %ecx +; SSE41-NEXT: pextrw $2, %xmm0, %edx +; SSE41-NEXT: addw %cx, %dx +; SSE41-NEXT: cmovbl %eax, %edx +; SSE41-NEXT: pinsrw $2, %edx, %xmm2 +; SSE41-NEXT: pextrw $3, %xmm1, %ecx +; SSE41-NEXT: pextrw $3, %xmm0, %edx +; SSE41-NEXT: addw %cx, %dx +; SSE41-NEXT: cmovbl %eax, %edx +; SSE41-NEXT: pinsrw $3, %edx, %xmm2 +; SSE41-NEXT: pextrw $4, %xmm1, %ecx +; SSE41-NEXT: pextrw $4, %xmm0, %edx +; SSE41-NEXT: addw %cx, %dx +; SSE41-NEXT: cmovbl %eax, %edx +; SSE41-NEXT: pinsrw $4, %edx, %xmm2 +; SSE41-NEXT: pextrw $5, %xmm1, %ecx +; SSE41-NEXT: pextrw $5, %xmm0, %edx +; SSE41-NEXT: addw %cx, %dx +; SSE41-NEXT: cmovbl %eax, %edx +; SSE41-NEXT: pinsrw $5, %edx, %xmm2 +; SSE41-NEXT: pextrw $6, %xmm1, %ecx +; SSE41-NEXT: pextrw $6, %xmm0, %edx +; SSE41-NEXT: addw %cx, %dx +; SSE41-NEXT: cmovbl %eax, %edx +; SSE41-NEXT: pinsrw $6, %edx, %xmm2 +; SSE41-NEXT: pextrw $7, %xmm1, %ecx +; SSE41-NEXT: pextrw $7, %xmm0, %edx +; SSE41-NEXT: addw %cx, %dx +; SSE41-NEXT: cmovbl %eax, %edx +; SSE41-NEXT: pinsrw $7, %edx, %xmm2 +; SSE41-NEXT: movdqa %xmm2, %xmm0 +; SSE41-NEXT: retq +; +; AVX-LABEL: v8i16: +; AVX: # %bb.0: +; AVX-NEXT: vpextrw $1, %xmm1, %eax +; AVX-NEXT: vpextrw $1, %xmm0, %ecx +; AVX-NEXT: addw %ax, %cx +; AVX-NEXT: movl $65535, %eax # imm = 0xFFFF +; AVX-NEXT: cmovbl %eax, %ecx +; AVX-NEXT: vmovd %xmm1, %edx +; AVX-NEXT: vmovd %xmm0, %esi +; AVX-NEXT: addw %dx, %si +; AVX-NEXT: cmovbl %eax, %esi +; AVX-NEXT: vmovd %esi, %xmm2 +; AVX-NEXT: vpinsrw $1, %ecx, %xmm2, %xmm2 +; AVX-NEXT: vpextrw $2, %xmm1, %ecx +; AVX-NEXT: vpextrw $2, %xmm0, %edx +; AVX-NEXT: addw %cx, %dx +; AVX-NEXT: cmovbl %eax, %edx +; AVX-NEXT: vpinsrw $2, %edx, %xmm2, %xmm2 +; AVX-NEXT: vpextrw $3, %xmm1, %ecx +; AVX-NEXT: vpextrw $3, %xmm0, %edx +; AVX-NEXT: addw %cx, %dx +; AVX-NEXT: cmovbl %eax, %edx +; AVX-NEXT: vpinsrw $3, %edx, %xmm2, %xmm2 +; AVX-NEXT: vpextrw $4, %xmm1, 
%ecx +; AVX-NEXT: vpextrw $4, %xmm0, %edx +; AVX-NEXT: addw %cx, %dx +; AVX-NEXT: cmovbl %eax, %edx +; AVX-NEXT: vpinsrw $4, %edx, %xmm2, %xmm2 +; AVX-NEXT: vpextrw $5, %xmm1, %ecx +; AVX-NEXT: vpextrw $5, %xmm0, %edx +; AVX-NEXT: addw %cx, %dx +; AVX-NEXT: cmovbl %eax, %edx +; AVX-NEXT: vpinsrw $5, %edx, %xmm2, %xmm2 +; AVX-NEXT: vpextrw $6, %xmm1, %ecx +; AVX-NEXT: vpextrw $6, %xmm0, %edx +; AVX-NEXT: addw %cx, %dx +; AVX-NEXT: cmovbl %eax, %edx +; AVX-NEXT: vpinsrw $6, %edx, %xmm2, %xmm2 +; AVX-NEXT: vpextrw $7, %xmm1, %ecx +; AVX-NEXT: vpextrw $7, %xmm0, %edx +; AVX-NEXT: addw %cx, %dx +; AVX-NEXT: cmovbl %eax, %edx +; AVX-NEXT: vpinsrw $7, %edx, %xmm2, %xmm0 +; AVX-NEXT: retq + %z = call <8 x i16> @llvm.uadd.sat.v8i16(<8 x i16> %x, <8 x i16> %y) + ret <8 x i16> %z +} + +define <16 x i16> @v16i16(<16 x i16> %x, <16 x i16> %y) nounwind { +; SSE2-LABEL: v16i16: +; SSE2: # %bb.0: +; SSE2-NEXT: pextrw $7, %xmm2, %eax +; SSE2-NEXT: pextrw $7, %xmm0, %ecx +; SSE2-NEXT: addw %ax, %cx +; SSE2-NEXT: movl $65535, %eax # imm = 0xFFFF +; SSE2-NEXT: cmovbl %eax, %ecx +; SSE2-NEXT: movd %ecx, %xmm4 +; SSE2-NEXT: pextrw $6, %xmm2, %ecx +; SSE2-NEXT: pextrw $6, %xmm0, %edx +; SSE2-NEXT: addw %cx, %dx +; SSE2-NEXT: cmovbl %eax, %edx +; SSE2-NEXT: movd %edx, %xmm5 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] +; SSE2-NEXT: pextrw $5, %xmm2, %ecx +; SSE2-NEXT: pextrw $5, %xmm0, %edx +; SSE2-NEXT: addw %cx, %dx +; SSE2-NEXT: cmovbl %eax, %edx +; SSE2-NEXT: movd %edx, %xmm6 +; SSE2-NEXT: pextrw $4, %xmm2, %ecx +; SSE2-NEXT: pextrw $4, %xmm0, %edx +; SSE2-NEXT: addw %cx, %dx +; SSE2-NEXT: cmovbl %eax, %edx +; SSE2-NEXT: movd %edx, %xmm4 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1],xmm4[2],xmm6[2],xmm4[3],xmm6[3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1] +; SSE2-NEXT: pextrw $3, %xmm2, %ecx +; SSE2-NEXT: pextrw $3, %xmm0, %edx +; SSE2-NEXT: addw %cx, %dx +; SSE2-NEXT: cmovbl %eax, %edx +; SSE2-NEXT: movd %edx, %xmm5 +; SSE2-NEXT: pextrw $2, %xmm2, %ecx +; SSE2-NEXT: pextrw $2, %xmm0, %edx +; SSE2-NEXT: addw %cx, %dx +; SSE2-NEXT: cmovbl %eax, %edx +; SSE2-NEXT: movd %edx, %xmm6 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] +; SSE2-NEXT: pextrw $1, %xmm2, %ecx +; SSE2-NEXT: pextrw $1, %xmm0, %edx +; SSE2-NEXT: addw %cx, %dx +; SSE2-NEXT: cmovbl %eax, %edx +; SSE2-NEXT: movd %edx, %xmm5 +; SSE2-NEXT: movd %xmm2, %ecx +; SSE2-NEXT: movd %xmm0, %edx +; SSE2-NEXT: addw %cx, %dx +; SSE2-NEXT: cmovbl %eax, %edx +; SSE2-NEXT: movd %edx, %xmm0 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1] +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm4[0] +; SSE2-NEXT: pextrw $7, %xmm3, %ecx +; SSE2-NEXT: pextrw $7, %xmm1, %edx +; SSE2-NEXT: addw %cx, %dx +; SSE2-NEXT: cmovbl %eax, %edx +; SSE2-NEXT: movd %edx, %xmm2 +; SSE2-NEXT: pextrw $6, %xmm3, %ecx +; SSE2-NEXT: pextrw $6, %xmm1, %edx +; SSE2-NEXT: addw %cx, %dx +; SSE2-NEXT: cmovbl %eax, %edx +; SSE2-NEXT: movd %edx, %xmm4 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3] +; SSE2-NEXT: pextrw $5, %xmm3, %ecx +; SSE2-NEXT: pextrw $5, %xmm1, %edx +; SSE2-NEXT: addw %cx, %dx +; SSE2-NEXT: cmovbl %eax, %edx +; SSE2-NEXT: movd %edx, %xmm5 +; SSE2-NEXT: pextrw $4, %xmm3, %ecx +; SSE2-NEXT: pextrw $4, %xmm1, %edx +; SSE2-NEXT: 
addw %cx, %dx +; SSE2-NEXT: cmovbl %eax, %edx +; SSE2-NEXT: movd %edx, %xmm2 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1] +; SSE2-NEXT: pextrw $3, %xmm3, %ecx +; SSE2-NEXT: pextrw $3, %xmm1, %edx +; SSE2-NEXT: addw %cx, %dx +; SSE2-NEXT: cmovbl %eax, %edx +; SSE2-NEXT: movd %edx, %xmm4 +; SSE2-NEXT: pextrw $2, %xmm3, %ecx +; SSE2-NEXT: pextrw $2, %xmm1, %edx +; SSE2-NEXT: addw %cx, %dx +; SSE2-NEXT: cmovbl %eax, %edx +; SSE2-NEXT: movd %edx, %xmm5 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] +; SSE2-NEXT: pextrw $1, %xmm3, %ecx +; SSE2-NEXT: pextrw $1, %xmm1, %edx +; SSE2-NEXT: addw %cx, %dx +; SSE2-NEXT: cmovbl %eax, %edx +; SSE2-NEXT: movd %edx, %xmm4 +; SSE2-NEXT: movd %xmm3, %ecx +; SSE2-NEXT: movd %xmm1, %edx +; SSE2-NEXT: addw %cx, %dx +; SSE2-NEXT: cmovbl %eax, %edx +; SSE2-NEXT: movd %edx, %xmm1 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1] +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; SSE2-NEXT: retq +; +; SSSE3-LABEL: v16i16: +; SSSE3: # %bb.0: +; SSSE3-NEXT: pextrw $7, %xmm2, %eax +; SSSE3-NEXT: pextrw $7, %xmm0, %ecx +; SSSE3-NEXT: addw %ax, %cx +; SSSE3-NEXT: movl $65535, %eax # imm = 0xFFFF +; SSSE3-NEXT: cmovbl %eax, %ecx +; SSSE3-NEXT: movd %ecx, %xmm4 +; SSSE3-NEXT: pextrw $6, %xmm2, %ecx +; SSSE3-NEXT: pextrw $6, %xmm0, %edx +; SSSE3-NEXT: addw %cx, %dx +; SSSE3-NEXT: cmovbl %eax, %edx +; SSSE3-NEXT: movd %edx, %xmm5 +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] +; SSSE3-NEXT: pextrw $5, %xmm2, %ecx +; SSSE3-NEXT: pextrw $5, %xmm0, %edx +; SSSE3-NEXT: addw %cx, %dx +; SSSE3-NEXT: cmovbl %eax, %edx +; SSSE3-NEXT: movd %edx, %xmm6 +; SSSE3-NEXT: pextrw $4, %xmm2, %ecx +; SSSE3-NEXT: pextrw $4, %xmm0, %edx +; SSSE3-NEXT: addw %cx, %dx +; SSSE3-NEXT: cmovbl %eax, %edx +; SSSE3-NEXT: movd %edx, %xmm4 +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1],xmm4[2],xmm6[2],xmm4[3],xmm6[3] +; SSSE3-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1] +; SSSE3-NEXT: pextrw $3, %xmm2, %ecx +; SSSE3-NEXT: pextrw $3, %xmm0, %edx +; SSSE3-NEXT: addw %cx, %dx +; SSSE3-NEXT: cmovbl %eax, %edx +; SSSE3-NEXT: movd %edx, %xmm5 +; SSSE3-NEXT: pextrw $2, %xmm2, %ecx +; SSSE3-NEXT: pextrw $2, %xmm0, %edx +; SSSE3-NEXT: addw %cx, %dx +; SSSE3-NEXT: cmovbl %eax, %edx +; SSSE3-NEXT: movd %edx, %xmm6 +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] +; SSSE3-NEXT: pextrw $1, %xmm2, %ecx +; SSSE3-NEXT: pextrw $1, %xmm0, %edx +; SSSE3-NEXT: addw %cx, %dx +; SSSE3-NEXT: cmovbl %eax, %edx +; SSSE3-NEXT: movd %edx, %xmm5 +; SSSE3-NEXT: movd %xmm2, %ecx +; SSSE3-NEXT: movd %xmm0, %edx +; SSSE3-NEXT: addw %cx, %dx +; SSSE3-NEXT: cmovbl %eax, %edx +; SSSE3-NEXT: movd %edx, %xmm0 +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3] +; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1] +; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm4[0] +; SSSE3-NEXT: pextrw $7, %xmm3, %ecx +; SSSE3-NEXT: pextrw $7, %xmm1, %edx +; SSSE3-NEXT: addw %cx, %dx +; SSSE3-NEXT: cmovbl %eax, %edx +; SSSE3-NEXT: movd %edx, %xmm2 +; SSSE3-NEXT: pextrw $6, %xmm3, %ecx +; SSSE3-NEXT: pextrw 
$6, %xmm1, %edx +; SSSE3-NEXT: addw %cx, %dx +; SSSE3-NEXT: cmovbl %eax, %edx +; SSSE3-NEXT: movd %edx, %xmm4 +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3] +; SSSE3-NEXT: pextrw $5, %xmm3, %ecx +; SSSE3-NEXT: pextrw $5, %xmm1, %edx +; SSSE3-NEXT: addw %cx, %dx +; SSSE3-NEXT: cmovbl %eax, %edx +; SSSE3-NEXT: movd %edx, %xmm5 +; SSSE3-NEXT: pextrw $4, %xmm3, %ecx +; SSSE3-NEXT: pextrw $4, %xmm1, %edx +; SSSE3-NEXT: addw %cx, %dx +; SSSE3-NEXT: cmovbl %eax, %edx +; SSSE3-NEXT: movd %edx, %xmm2 +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3] +; SSSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1] +; SSSE3-NEXT: pextrw $3, %xmm3, %ecx +; SSSE3-NEXT: pextrw $3, %xmm1, %edx +; SSSE3-NEXT: addw %cx, %dx +; SSSE3-NEXT: cmovbl %eax, %edx +; SSSE3-NEXT: movd %edx, %xmm4 +; SSSE3-NEXT: pextrw $2, %xmm3, %ecx +; SSSE3-NEXT: pextrw $2, %xmm1, %edx +; SSSE3-NEXT: addw %cx, %dx +; SSSE3-NEXT: cmovbl %eax, %edx +; SSSE3-NEXT: movd %edx, %xmm5 +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] +; SSSE3-NEXT: pextrw $1, %xmm3, %ecx +; SSSE3-NEXT: pextrw $1, %xmm1, %edx +; SSSE3-NEXT: addw %cx, %dx +; SSSE3-NEXT: cmovbl %eax, %edx +; SSSE3-NEXT: movd %edx, %xmm4 +; SSSE3-NEXT: movd %xmm3, %ecx +; SSSE3-NEXT: movd %xmm1, %edx +; SSSE3-NEXT: addw %cx, %dx +; SSSE3-NEXT: cmovbl %eax, %edx +; SSSE3-NEXT: movd %edx, %xmm1 +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3] +; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1] +; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; SSSE3-NEXT: retq +; +; SSE41-LABEL: v16i16: +; SSE41: # %bb.0: +; SSE41-NEXT: movdqa %xmm0, %xmm4 +; SSE41-NEXT: pextrw $1, %xmm2, %eax +; SSE41-NEXT: pextrw $1, %xmm0, %ecx +; SSE41-NEXT: addw %ax, %cx +; SSE41-NEXT: movl $65535, %eax # imm = 0xFFFF +; SSE41-NEXT: cmovbl %eax, %ecx +; SSE41-NEXT: movd %xmm2, %edx +; SSE41-NEXT: movd %xmm0, %esi +; SSE41-NEXT: addw %dx, %si +; SSE41-NEXT: cmovbl %eax, %esi +; SSE41-NEXT: movd %esi, %xmm0 +; SSE41-NEXT: pinsrw $1, %ecx, %xmm0 +; SSE41-NEXT: pextrw $2, %xmm2, %ecx +; SSE41-NEXT: pextrw $2, %xmm4, %edx +; SSE41-NEXT: addw %cx, %dx +; SSE41-NEXT: cmovbl %eax, %edx +; SSE41-NEXT: pinsrw $2, %edx, %xmm0 +; SSE41-NEXT: pextrw $3, %xmm2, %ecx +; SSE41-NEXT: pextrw $3, %xmm4, %edx +; SSE41-NEXT: addw %cx, %dx +; SSE41-NEXT: cmovbl %eax, %edx +; SSE41-NEXT: pinsrw $3, %edx, %xmm0 +; SSE41-NEXT: pextrw $4, %xmm2, %ecx +; SSE41-NEXT: pextrw $4, %xmm4, %edx +; SSE41-NEXT: addw %cx, %dx +; SSE41-NEXT: cmovbl %eax, %edx +; SSE41-NEXT: pinsrw $4, %edx, %xmm0 +; SSE41-NEXT: pextrw $5, %xmm2, %ecx +; SSE41-NEXT: pextrw $5, %xmm4, %edx +; SSE41-NEXT: addw %cx, %dx +; SSE41-NEXT: cmovbl %eax, %edx +; SSE41-NEXT: pinsrw $5, %edx, %xmm0 +; SSE41-NEXT: pextrw $6, %xmm2, %ecx +; SSE41-NEXT: pextrw $6, %xmm4, %edx +; SSE41-NEXT: addw %cx, %dx +; SSE41-NEXT: cmovbl %eax, %edx +; SSE41-NEXT: pinsrw $6, %edx, %xmm0 +; SSE41-NEXT: pextrw $7, %xmm2, %ecx +; SSE41-NEXT: pextrw $7, %xmm4, %edx +; SSE41-NEXT: addw %cx, %dx +; SSE41-NEXT: cmovbl %eax, %edx +; SSE41-NEXT: pinsrw $7, %edx, %xmm0 +; SSE41-NEXT: pextrw $1, %xmm3, %ecx +; SSE41-NEXT: pextrw $1, %xmm1, %edx +; SSE41-NEXT: addw %cx, %dx +; SSE41-NEXT: cmovbl %eax, %edx +; SSE41-NEXT: movd %xmm3, %ecx +; SSE41-NEXT: movd %xmm1, %esi +; SSE41-NEXT: addw %cx, %si +; SSE41-NEXT: cmovbl %eax, %esi +; 
SSE41-NEXT: movd %esi, %xmm2 +; SSE41-NEXT: pinsrw $1, %edx, %xmm2 +; SSE41-NEXT: pextrw $2, %xmm3, %ecx +; SSE41-NEXT: pextrw $2, %xmm1, %edx +; SSE41-NEXT: addw %cx, %dx +; SSE41-NEXT: cmovbl %eax, %edx +; SSE41-NEXT: pinsrw $2, %edx, %xmm2 +; SSE41-NEXT: pextrw $3, %xmm3, %ecx +; SSE41-NEXT: pextrw $3, %xmm1, %edx +; SSE41-NEXT: addw %cx, %dx +; SSE41-NEXT: cmovbl %eax, %edx +; SSE41-NEXT: pinsrw $3, %edx, %xmm2 +; SSE41-NEXT: pextrw $4, %xmm3, %ecx +; SSE41-NEXT: pextrw $4, %xmm1, %edx +; SSE41-NEXT: addw %cx, %dx +; SSE41-NEXT: cmovbl %eax, %edx +; SSE41-NEXT: pinsrw $4, %edx, %xmm2 +; SSE41-NEXT: pextrw $5, %xmm3, %ecx +; SSE41-NEXT: pextrw $5, %xmm1, %edx +; SSE41-NEXT: addw %cx, %dx +; SSE41-NEXT: cmovbl %eax, %edx +; SSE41-NEXT: pinsrw $5, %edx, %xmm2 +; SSE41-NEXT: pextrw $6, %xmm3, %ecx +; SSE41-NEXT: pextrw $6, %xmm1, %edx +; SSE41-NEXT: addw %cx, %dx +; SSE41-NEXT: cmovbl %eax, %edx +; SSE41-NEXT: pinsrw $6, %edx, %xmm2 +; SSE41-NEXT: pextrw $7, %xmm3, %ecx +; SSE41-NEXT: pextrw $7, %xmm1, %edx +; SSE41-NEXT: addw %cx, %dx +; SSE41-NEXT: cmovbl %eax, %edx +; SSE41-NEXT: pinsrw $7, %edx, %xmm2 +; SSE41-NEXT: movdqa %xmm2, %xmm1 +; SSE41-NEXT: retq +; +; AVX1-LABEL: v16i16: +; AVX1: # %bb.0: +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX1-NEXT: vpextrw $1, %xmm2, %eax +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 +; AVX1-NEXT: vpextrw $1, %xmm3, %ecx +; AVX1-NEXT: addw %ax, %cx +; AVX1-NEXT: movl $65535, %eax # imm = 0xFFFF +; AVX1-NEXT: cmovbl %eax, %ecx +; AVX1-NEXT: vmovd %xmm2, %edx +; AVX1-NEXT: vmovd %xmm3, %esi +; AVX1-NEXT: addw %dx, %si +; AVX1-NEXT: cmovbl %eax, %esi +; AVX1-NEXT: vmovd %esi, %xmm4 +; AVX1-NEXT: vpinsrw $1, %ecx, %xmm4, %xmm4 +; AVX1-NEXT: vpextrw $2, %xmm2, %ecx +; AVX1-NEXT: vpextrw $2, %xmm3, %edx +; AVX1-NEXT: addw %cx, %dx +; AVX1-NEXT: cmovbl %eax, %edx +; AVX1-NEXT: vpinsrw $2, %edx, %xmm4, %xmm4 +; AVX1-NEXT: vpextrw $3, %xmm2, %ecx +; AVX1-NEXT: vpextrw $3, %xmm3, %edx +; AVX1-NEXT: addw %cx, %dx +; AVX1-NEXT: cmovbl %eax, %edx +; AVX1-NEXT: vpinsrw $3, %edx, %xmm4, %xmm4 +; AVX1-NEXT: vpextrw $4, %xmm2, %ecx +; AVX1-NEXT: vpextrw $4, %xmm3, %edx +; AVX1-NEXT: addw %cx, %dx +; AVX1-NEXT: cmovbl %eax, %edx +; AVX1-NEXT: vpinsrw $4, %edx, %xmm4, %xmm4 +; AVX1-NEXT: vpextrw $5, %xmm2, %ecx +; AVX1-NEXT: vpextrw $5, %xmm3, %edx +; AVX1-NEXT: addw %cx, %dx +; AVX1-NEXT: cmovbl %eax, %edx +; AVX1-NEXT: vpinsrw $5, %edx, %xmm4, %xmm4 +; AVX1-NEXT: vpextrw $6, %xmm2, %ecx +; AVX1-NEXT: vpextrw $6, %xmm3, %edx +; AVX1-NEXT: addw %cx, %dx +; AVX1-NEXT: cmovbl %eax, %edx +; AVX1-NEXT: vpinsrw $6, %edx, %xmm4, %xmm4 +; AVX1-NEXT: vpextrw $7, %xmm2, %ecx +; AVX1-NEXT: vpextrw $7, %xmm3, %edx +; AVX1-NEXT: addw %cx, %dx +; AVX1-NEXT: cmovbl %eax, %edx +; AVX1-NEXT: vpinsrw $7, %edx, %xmm4, %xmm2 +; AVX1-NEXT: vpextrw $1, %xmm1, %ecx +; AVX1-NEXT: vpextrw $1, %xmm0, %edx +; AVX1-NEXT: addw %cx, %dx +; AVX1-NEXT: cmovbl %eax, %edx +; AVX1-NEXT: vmovd %xmm1, %ecx +; AVX1-NEXT: vmovd %xmm0, %esi +; AVX1-NEXT: addw %cx, %si +; AVX1-NEXT: cmovbl %eax, %esi +; AVX1-NEXT: vmovd %esi, %xmm3 +; AVX1-NEXT: vpinsrw $1, %edx, %xmm3, %xmm3 +; AVX1-NEXT: vpextrw $2, %xmm1, %ecx +; AVX1-NEXT: vpextrw $2, %xmm0, %edx +; AVX1-NEXT: addw %cx, %dx +; AVX1-NEXT: cmovbl %eax, %edx +; AVX1-NEXT: vpinsrw $2, %edx, %xmm3, %xmm3 +; AVX1-NEXT: vpextrw $3, %xmm1, %ecx +; AVX1-NEXT: vpextrw $3, %xmm0, %edx +; AVX1-NEXT: addw %cx, %dx +; AVX1-NEXT: cmovbl %eax, %edx +; AVX1-NEXT: vpinsrw $3, %edx, %xmm3, %xmm3 +; AVX1-NEXT: vpextrw $4, %xmm1, %ecx +; AVX1-NEXT: vpextrw $4, %xmm0, %edx 
+; AVX1-NEXT: addw %cx, %dx +; AVX1-NEXT: cmovbl %eax, %edx +; AVX1-NEXT: vpinsrw $4, %edx, %xmm3, %xmm3 +; AVX1-NEXT: vpextrw $5, %xmm1, %ecx +; AVX1-NEXT: vpextrw $5, %xmm0, %edx +; AVX1-NEXT: addw %cx, %dx +; AVX1-NEXT: cmovbl %eax, %edx +; AVX1-NEXT: vpinsrw $5, %edx, %xmm3, %xmm3 +; AVX1-NEXT: vpextrw $6, %xmm1, %ecx +; AVX1-NEXT: vpextrw $6, %xmm0, %edx +; AVX1-NEXT: addw %cx, %dx +; AVX1-NEXT: cmovbl %eax, %edx +; AVX1-NEXT: vpinsrw $6, %edx, %xmm3, %xmm3 +; AVX1-NEXT: vpextrw $7, %xmm1, %ecx +; AVX1-NEXT: vpextrw $7, %xmm0, %edx +; AVX1-NEXT: addw %cx, %dx +; AVX1-NEXT: cmovbl %eax, %edx +; AVX1-NEXT: vpinsrw $7, %edx, %xmm3, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: v16i16: +; AVX2: # %bb.0: +; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX2-NEXT: vpextrw $1, %xmm2, %eax +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm3 +; AVX2-NEXT: vpextrw $1, %xmm3, %ecx +; AVX2-NEXT: addw %ax, %cx +; AVX2-NEXT: movl $65535, %eax # imm = 0xFFFF +; AVX2-NEXT: cmovbl %eax, %ecx +; AVX2-NEXT: vmovd %xmm2, %edx +; AVX2-NEXT: vmovd %xmm3, %esi +; AVX2-NEXT: addw %dx, %si +; AVX2-NEXT: cmovbl %eax, %esi +; AVX2-NEXT: vmovd %esi, %xmm4 +; AVX2-NEXT: vpinsrw $1, %ecx, %xmm4, %xmm4 +; AVX2-NEXT: vpextrw $2, %xmm2, %ecx +; AVX2-NEXT: vpextrw $2, %xmm3, %edx +; AVX2-NEXT: addw %cx, %dx +; AVX2-NEXT: cmovbl %eax, %edx +; AVX2-NEXT: vpinsrw $2, %edx, %xmm4, %xmm4 +; AVX2-NEXT: vpextrw $3, %xmm2, %ecx +; AVX2-NEXT: vpextrw $3, %xmm3, %edx +; AVX2-NEXT: addw %cx, %dx +; AVX2-NEXT: cmovbl %eax, %edx +; AVX2-NEXT: vpinsrw $3, %edx, %xmm4, %xmm4 +; AVX2-NEXT: vpextrw $4, %xmm2, %ecx +; AVX2-NEXT: vpextrw $4, %xmm3, %edx +; AVX2-NEXT: addw %cx, %dx +; AVX2-NEXT: cmovbl %eax, %edx +; AVX2-NEXT: vpinsrw $4, %edx, %xmm4, %xmm4 +; AVX2-NEXT: vpextrw $5, %xmm2, %ecx +; AVX2-NEXT: vpextrw $5, %xmm3, %edx +; AVX2-NEXT: addw %cx, %dx +; AVX2-NEXT: cmovbl %eax, %edx +; AVX2-NEXT: vpinsrw $5, %edx, %xmm4, %xmm4 +; AVX2-NEXT: vpextrw $6, %xmm2, %ecx +; AVX2-NEXT: vpextrw $6, %xmm3, %edx +; AVX2-NEXT: addw %cx, %dx +; AVX2-NEXT: cmovbl %eax, %edx +; AVX2-NEXT: vpinsrw $6, %edx, %xmm4, %xmm4 +; AVX2-NEXT: vpextrw $7, %xmm2, %ecx +; AVX2-NEXT: vpextrw $7, %xmm3, %edx +; AVX2-NEXT: addw %cx, %dx +; AVX2-NEXT: cmovbl %eax, %edx +; AVX2-NEXT: vpinsrw $7, %edx, %xmm4, %xmm2 +; AVX2-NEXT: vpextrw $1, %xmm1, %ecx +; AVX2-NEXT: vpextrw $1, %xmm0, %edx +; AVX2-NEXT: addw %cx, %dx +; AVX2-NEXT: cmovbl %eax, %edx +; AVX2-NEXT: vmovd %xmm1, %ecx +; AVX2-NEXT: vmovd %xmm0, %esi +; AVX2-NEXT: addw %cx, %si +; AVX2-NEXT: cmovbl %eax, %esi +; AVX2-NEXT: vmovd %esi, %xmm3 +; AVX2-NEXT: vpinsrw $1, %edx, %xmm3, %xmm3 +; AVX2-NEXT: vpextrw $2, %xmm1, %ecx +; AVX2-NEXT: vpextrw $2, %xmm0, %edx +; AVX2-NEXT: addw %cx, %dx +; AVX2-NEXT: cmovbl %eax, %edx +; AVX2-NEXT: vpinsrw $2, %edx, %xmm3, %xmm3 +; AVX2-NEXT: vpextrw $3, %xmm1, %ecx +; AVX2-NEXT: vpextrw $3, %xmm0, %edx +; AVX2-NEXT: addw %cx, %dx +; AVX2-NEXT: cmovbl %eax, %edx +; AVX2-NEXT: vpinsrw $3, %edx, %xmm3, %xmm3 +; AVX2-NEXT: vpextrw $4, %xmm1, %ecx +; AVX2-NEXT: vpextrw $4, %xmm0, %edx +; AVX2-NEXT: addw %cx, %dx +; AVX2-NEXT: cmovbl %eax, %edx +; AVX2-NEXT: vpinsrw $4, %edx, %xmm3, %xmm3 +; AVX2-NEXT: vpextrw $5, %xmm1, %ecx +; AVX2-NEXT: vpextrw $5, %xmm0, %edx +; AVX2-NEXT: addw %cx, %dx +; AVX2-NEXT: cmovbl %eax, %edx +; AVX2-NEXT: vpinsrw $5, %edx, %xmm3, %xmm3 +; AVX2-NEXT: vpextrw $6, %xmm1, %ecx +; AVX2-NEXT: vpextrw $6, %xmm0, %edx +; AVX2-NEXT: addw %cx, %dx +; AVX2-NEXT: cmovbl %eax, %edx +; AVX2-NEXT: vpinsrw $6, %edx, %xmm3, 
%xmm3 +; AVX2-NEXT: vpextrw $7, %xmm1, %ecx +; AVX2-NEXT: vpextrw $7, %xmm0, %edx +; AVX2-NEXT: addw %cx, %dx +; AVX2-NEXT: cmovbl %eax, %edx +; AVX2-NEXT: vpinsrw $7, %edx, %xmm3, %xmm0 +; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 +; AVX2-NEXT: retq +; +; AVX512-LABEL: v16i16: +; AVX512: # %bb.0: +; AVX512-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX512-NEXT: vpextrw $1, %xmm2, %eax +; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm3 +; AVX512-NEXT: vpextrw $1, %xmm3, %ecx +; AVX512-NEXT: addw %ax, %cx +; AVX512-NEXT: movl $65535, %eax # imm = 0xFFFF +; AVX512-NEXT: cmovbl %eax, %ecx +; AVX512-NEXT: vmovd %xmm2, %edx +; AVX512-NEXT: vmovd %xmm3, %esi +; AVX512-NEXT: addw %dx, %si +; AVX512-NEXT: cmovbl %eax, %esi +; AVX512-NEXT: vmovd %esi, %xmm4 +; AVX512-NEXT: vpinsrw $1, %ecx, %xmm4, %xmm4 +; AVX512-NEXT: vpextrw $2, %xmm2, %ecx +; AVX512-NEXT: vpextrw $2, %xmm3, %edx +; AVX512-NEXT: addw %cx, %dx +; AVX512-NEXT: cmovbl %eax, %edx +; AVX512-NEXT: vpinsrw $2, %edx, %xmm4, %xmm4 +; AVX512-NEXT: vpextrw $3, %xmm2, %ecx +; AVX512-NEXT: vpextrw $3, %xmm3, %edx +; AVX512-NEXT: addw %cx, %dx +; AVX512-NEXT: cmovbl %eax, %edx +; AVX512-NEXT: vpinsrw $3, %edx, %xmm4, %xmm4 +; AVX512-NEXT: vpextrw $4, %xmm2, %ecx +; AVX512-NEXT: vpextrw $4, %xmm3, %edx +; AVX512-NEXT: addw %cx, %dx +; AVX512-NEXT: cmovbl %eax, %edx +; AVX512-NEXT: vpinsrw $4, %edx, %xmm4, %xmm4 +; AVX512-NEXT: vpextrw $5, %xmm2, %ecx +; AVX512-NEXT: vpextrw $5, %xmm3, %edx +; AVX512-NEXT: addw %cx, %dx +; AVX512-NEXT: cmovbl %eax, %edx +; AVX512-NEXT: vpinsrw $5, %edx, %xmm4, %xmm4 +; AVX512-NEXT: vpextrw $6, %xmm2, %ecx +; AVX512-NEXT: vpextrw $6, %xmm3, %edx +; AVX512-NEXT: addw %cx, %dx +; AVX512-NEXT: cmovbl %eax, %edx +; AVX512-NEXT: vpinsrw $6, %edx, %xmm4, %xmm4 +; AVX512-NEXT: vpextrw $7, %xmm2, %ecx +; AVX512-NEXT: vpextrw $7, %xmm3, %edx +; AVX512-NEXT: addw %cx, %dx +; AVX512-NEXT: cmovbl %eax, %edx +; AVX512-NEXT: vpinsrw $7, %edx, %xmm4, %xmm2 +; AVX512-NEXT: vpextrw $1, %xmm1, %ecx +; AVX512-NEXT: vpextrw $1, %xmm0, %edx +; AVX512-NEXT: addw %cx, %dx +; AVX512-NEXT: cmovbl %eax, %edx +; AVX512-NEXT: vmovd %xmm1, %ecx +; AVX512-NEXT: vmovd %xmm0, %esi +; AVX512-NEXT: addw %cx, %si +; AVX512-NEXT: cmovbl %eax, %esi +; AVX512-NEXT: vmovd %esi, %xmm3 +; AVX512-NEXT: vpinsrw $1, %edx, %xmm3, %xmm3 +; AVX512-NEXT: vpextrw $2, %xmm1, %ecx +; AVX512-NEXT: vpextrw $2, %xmm0, %edx +; AVX512-NEXT: addw %cx, %dx +; AVX512-NEXT: cmovbl %eax, %edx +; AVX512-NEXT: vpinsrw $2, %edx, %xmm3, %xmm3 +; AVX512-NEXT: vpextrw $3, %xmm1, %ecx +; AVX512-NEXT: vpextrw $3, %xmm0, %edx +; AVX512-NEXT: addw %cx, %dx +; AVX512-NEXT: cmovbl %eax, %edx +; AVX512-NEXT: vpinsrw $3, %edx, %xmm3, %xmm3 +; AVX512-NEXT: vpextrw $4, %xmm1, %ecx +; AVX512-NEXT: vpextrw $4, %xmm0, %edx +; AVX512-NEXT: addw %cx, %dx +; AVX512-NEXT: cmovbl %eax, %edx +; AVX512-NEXT: vpinsrw $4, %edx, %xmm3, %xmm3 +; AVX512-NEXT: vpextrw $5, %xmm1, %ecx +; AVX512-NEXT: vpextrw $5, %xmm0, %edx +; AVX512-NEXT: addw %cx, %dx +; AVX512-NEXT: cmovbl %eax, %edx +; AVX512-NEXT: vpinsrw $5, %edx, %xmm3, %xmm3 +; AVX512-NEXT: vpextrw $6, %xmm1, %ecx +; AVX512-NEXT: vpextrw $6, %xmm0, %edx +; AVX512-NEXT: addw %cx, %dx +; AVX512-NEXT: cmovbl %eax, %edx +; AVX512-NEXT: vpinsrw $6, %edx, %xmm3, %xmm3 +; AVX512-NEXT: vpextrw $7, %xmm1, %ecx +; AVX512-NEXT: vpextrw $7, %xmm0, %edx +; AVX512-NEXT: addw %cx, %dx +; AVX512-NEXT: cmovbl %eax, %edx +; AVX512-NEXT: vpinsrw $7, %edx, %xmm3, %xmm0 +; AVX512-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 +; AVX512-NEXT: retq + %z = call <16 x i16> 
@llvm.uadd.sat.v16i16(<16 x i16> %x, <16 x i16> %y) + ret <16 x i16> %z +} + +define <32 x i16> @v32i16(<32 x i16> %x, <32 x i16> %y) nounwind { +; SSE2-LABEL: v32i16: +; SSE2: # %bb.0: +; SSE2-NEXT: pextrw $7, %xmm4, %eax +; SSE2-NEXT: pextrw $7, %xmm0, %ecx +; SSE2-NEXT: addw %ax, %cx +; SSE2-NEXT: movl $65535, %eax # imm = 0xFFFF +; SSE2-NEXT: cmovbl %eax, %ecx +; SSE2-NEXT: movd %ecx, %xmm8 +; SSE2-NEXT: pextrw $6, %xmm4, %ecx +; SSE2-NEXT: pextrw $6, %xmm0, %edx +; SSE2-NEXT: addw %cx, %dx +; SSE2-NEXT: cmovbl %eax, %edx +; SSE2-NEXT: movd %edx, %xmm9 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3] +; SSE2-NEXT: pextrw $5, %xmm4, %ecx +; SSE2-NEXT: pextrw $5, %xmm0, %edx +; SSE2-NEXT: addw %cx, %dx +; SSE2-NEXT: cmovbl %eax, %edx +; SSE2-NEXT: movd %edx, %xmm10 +; SSE2-NEXT: pextrw $4, %xmm4, %ecx +; SSE2-NEXT: pextrw $4, %xmm0, %edx +; SSE2-NEXT: addw %cx, %dx +; SSE2-NEXT: cmovbl %eax, %edx +; SSE2-NEXT: movd %edx, %xmm8 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm10[0],xmm8[1],xmm10[1],xmm8[2],xmm10[2],xmm8[3],xmm10[3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm9[0],xmm8[1],xmm9[1] +; SSE2-NEXT: pextrw $3, %xmm4, %ecx +; SSE2-NEXT: pextrw $3, %xmm0, %edx +; SSE2-NEXT: addw %cx, %dx +; SSE2-NEXT: cmovbl %eax, %edx +; SSE2-NEXT: movd %edx, %xmm9 +; SSE2-NEXT: pextrw $2, %xmm4, %ecx +; SSE2-NEXT: pextrw $2, %xmm0, %edx +; SSE2-NEXT: addw %cx, %dx +; SSE2-NEXT: cmovbl %eax, %edx +; SSE2-NEXT: movd %edx, %xmm10 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3] +; SSE2-NEXT: pextrw $1, %xmm4, %ecx +; SSE2-NEXT: pextrw $1, %xmm0, %edx +; SSE2-NEXT: addw %cx, %dx +; SSE2-NEXT: cmovbl %eax, %edx +; SSE2-NEXT: movd %edx, %xmm9 +; SSE2-NEXT: movd %xmm4, %ecx +; SSE2-NEXT: movd %xmm0, %edx +; SSE2-NEXT: addw %cx, %dx +; SSE2-NEXT: cmovbl %eax, %edx +; SSE2-NEXT: movd %edx, %xmm0 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1],xmm0[2],xmm9[2],xmm0[3],xmm9[3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm10[0],xmm0[1],xmm10[1] +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm8[0] +; SSE2-NEXT: pextrw $7, %xmm5, %ecx +; SSE2-NEXT: pextrw $7, %xmm1, %edx +; SSE2-NEXT: addw %cx, %dx +; SSE2-NEXT: cmovbl %eax, %edx +; SSE2-NEXT: movd %edx, %xmm8 +; SSE2-NEXT: pextrw $6, %xmm5, %ecx +; SSE2-NEXT: pextrw $6, %xmm1, %edx +; SSE2-NEXT: addw %cx, %dx +; SSE2-NEXT: cmovbl %eax, %edx +; SSE2-NEXT: movd %edx, %xmm4 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm8[0],xmm4[1],xmm8[1],xmm4[2],xmm8[2],xmm4[3],xmm8[3] +; SSE2-NEXT: pextrw $5, %xmm5, %ecx +; SSE2-NEXT: pextrw $5, %xmm1, %edx +; SSE2-NEXT: addw %cx, %dx +; SSE2-NEXT: cmovbl %eax, %edx +; SSE2-NEXT: movd %edx, %xmm9 +; SSE2-NEXT: pextrw $4, %xmm5, %ecx +; SSE2-NEXT: pextrw $4, %xmm1, %edx +; SSE2-NEXT: addw %cx, %dx +; SSE2-NEXT: cmovbl %eax, %edx +; SSE2-NEXT: movd %edx, %xmm8 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm9[0],xmm8[1],xmm9[1],xmm8[2],xmm9[2],xmm8[3],xmm9[3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm4[0],xmm8[1],xmm4[1] +; SSE2-NEXT: pextrw $3, %xmm5, %ecx +; SSE2-NEXT: pextrw $3, %xmm1, %edx +; SSE2-NEXT: addw %cx, %dx +; SSE2-NEXT: cmovbl %eax, %edx +; SSE2-NEXT: movd %edx, %xmm9 +; SSE2-NEXT: pextrw $2, %xmm5, %ecx +; SSE2-NEXT: pextrw $2, %xmm1, %edx +; SSE2-NEXT: addw %cx, %dx +; SSE2-NEXT: cmovbl %eax, %edx +; SSE2-NEXT: movd %edx, %xmm4 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm9[0],xmm4[1],xmm9[1],xmm4[2],xmm9[2],xmm4[3],xmm9[3] +; 
SSE2-NEXT: pextrw $1, %xmm5, %ecx +; SSE2-NEXT: pextrw $1, %xmm1, %edx +; SSE2-NEXT: addw %cx, %dx +; SSE2-NEXT: cmovbl %eax, %edx +; SSE2-NEXT: movd %edx, %xmm9 +; SSE2-NEXT: movd %xmm5, %ecx +; SSE2-NEXT: movd %xmm1, %edx +; SSE2-NEXT: addw %cx, %dx +; SSE2-NEXT: cmovbl %eax, %edx +; SSE2-NEXT: movd %edx, %xmm1 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm9[0],xmm1[1],xmm9[1],xmm1[2],xmm9[2],xmm1[3],xmm9[3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1] +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm8[0] +; SSE2-NEXT: pextrw $7, %xmm6, %ecx +; SSE2-NEXT: pextrw $7, %xmm2, %edx +; SSE2-NEXT: addw %cx, %dx +; SSE2-NEXT: cmovbl %eax, %edx +; SSE2-NEXT: movd %edx, %xmm4 +; SSE2-NEXT: pextrw $6, %xmm6, %ecx +; SSE2-NEXT: pextrw $6, %xmm2, %edx +; SSE2-NEXT: addw %cx, %dx +; SSE2-NEXT: cmovbl %eax, %edx +; SSE2-NEXT: movd %edx, %xmm5 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] +; SSE2-NEXT: pextrw $5, %xmm6, %ecx +; SSE2-NEXT: pextrw $5, %xmm2, %edx +; SSE2-NEXT: addw %cx, %dx +; SSE2-NEXT: cmovbl %eax, %edx +; SSE2-NEXT: movd %edx, %xmm8 +; SSE2-NEXT: pextrw $4, %xmm6, %ecx +; SSE2-NEXT: pextrw $4, %xmm2, %edx +; SSE2-NEXT: addw %cx, %dx +; SSE2-NEXT: cmovbl %eax, %edx +; SSE2-NEXT: movd %edx, %xmm4 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm8[0],xmm4[1],xmm8[1],xmm4[2],xmm8[2],xmm4[3],xmm8[3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1] +; SSE2-NEXT: pextrw $3, %xmm6, %ecx +; SSE2-NEXT: pextrw $3, %xmm2, %edx +; SSE2-NEXT: addw %cx, %dx +; SSE2-NEXT: cmovbl %eax, %edx +; SSE2-NEXT: movd %edx, %xmm8 +; SSE2-NEXT: pextrw $2, %xmm6, %ecx +; SSE2-NEXT: pextrw $2, %xmm2, %edx +; SSE2-NEXT: addw %cx, %dx +; SSE2-NEXT: cmovbl %eax, %edx +; SSE2-NEXT: movd %edx, %xmm5 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm8[0],xmm5[1],xmm8[1],xmm5[2],xmm8[2],xmm5[3],xmm8[3] +; SSE2-NEXT: pextrw $1, %xmm6, %ecx +; SSE2-NEXT: pextrw $1, %xmm2, %edx +; SSE2-NEXT: addw %cx, %dx +; SSE2-NEXT: cmovbl %eax, %edx +; SSE2-NEXT: movd %edx, %xmm8 +; SSE2-NEXT: movd %xmm6, %ecx +; SSE2-NEXT: movd %xmm2, %edx +; SSE2-NEXT: addw %cx, %dx +; SSE2-NEXT: cmovbl %eax, %edx +; SSE2-NEXT: movd %edx, %xmm2 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm8[0],xmm2[1],xmm8[1],xmm2[2],xmm8[2],xmm2[3],xmm8[3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1] +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm4[0] +; SSE2-NEXT: pextrw $7, %xmm7, %ecx +; SSE2-NEXT: pextrw $7, %xmm3, %edx +; SSE2-NEXT: addw %cx, %dx +; SSE2-NEXT: cmovbl %eax, %edx +; SSE2-NEXT: movd %edx, %xmm4 +; SSE2-NEXT: pextrw $6, %xmm7, %ecx +; SSE2-NEXT: pextrw $6, %xmm3, %edx +; SSE2-NEXT: addw %cx, %dx +; SSE2-NEXT: cmovbl %eax, %edx +; SSE2-NEXT: movd %edx, %xmm5 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] +; SSE2-NEXT: pextrw $5, %xmm7, %ecx +; SSE2-NEXT: pextrw $5, %xmm3, %edx +; SSE2-NEXT: addw %cx, %dx +; SSE2-NEXT: cmovbl %eax, %edx +; SSE2-NEXT: movd %edx, %xmm6 +; SSE2-NEXT: pextrw $4, %xmm7, %ecx +; SSE2-NEXT: pextrw $4, %xmm3, %edx +; SSE2-NEXT: addw %cx, %dx +; SSE2-NEXT: cmovbl %eax, %edx +; SSE2-NEXT: movd %edx, %xmm4 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1],xmm4[2],xmm6[2],xmm4[3],xmm6[3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1] +; SSE2-NEXT: pextrw $3, %xmm7, %ecx +; SSE2-NEXT: pextrw $3, %xmm3, %edx +; SSE2-NEXT: addw %cx, %dx +; SSE2-NEXT: cmovbl %eax, %edx +; 
SSE2-NEXT: movd %edx, %xmm5 +; SSE2-NEXT: pextrw $2, %xmm7, %ecx +; SSE2-NEXT: pextrw $2, %xmm3, %edx +; SSE2-NEXT: addw %cx, %dx +; SSE2-NEXT: cmovbl %eax, %edx +; SSE2-NEXT: movd %edx, %xmm6 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] +; SSE2-NEXT: pextrw $1, %xmm7, %ecx +; SSE2-NEXT: pextrw $1, %xmm3, %edx +; SSE2-NEXT: addw %cx, %dx +; SSE2-NEXT: cmovbl %eax, %edx +; SSE2-NEXT: movd %edx, %xmm5 +; SSE2-NEXT: movd %xmm7, %ecx +; SSE2-NEXT: movd %xmm3, %edx +; SSE2-NEXT: addw %cx, %dx +; SSE2-NEXT: cmovbl %eax, %edx +; SSE2-NEXT: movd %edx, %xmm3 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm6[0],xmm3[1],xmm6[1] +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm4[0] +; SSE2-NEXT: retq +; +; SSSE3-LABEL: v32i16: +; SSSE3: # %bb.0: +; SSSE3-NEXT: pextrw $7, %xmm4, %eax +; SSSE3-NEXT: pextrw $7, %xmm0, %ecx +; SSSE3-NEXT: addw %ax, %cx +; SSSE3-NEXT: movl $65535, %eax # imm = 0xFFFF +; SSSE3-NEXT: cmovbl %eax, %ecx +; SSSE3-NEXT: movd %ecx, %xmm8 +; SSSE3-NEXT: pextrw $6, %xmm4, %ecx +; SSSE3-NEXT: pextrw $6, %xmm0, %edx +; SSSE3-NEXT: addw %cx, %dx +; SSSE3-NEXT: cmovbl %eax, %edx +; SSSE3-NEXT: movd %edx, %xmm9 +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3] +; SSSE3-NEXT: pextrw $5, %xmm4, %ecx +; SSSE3-NEXT: pextrw $5, %xmm0, %edx +; SSSE3-NEXT: addw %cx, %dx +; SSSE3-NEXT: cmovbl %eax, %edx +; SSSE3-NEXT: movd %edx, %xmm10 +; SSSE3-NEXT: pextrw $4, %xmm4, %ecx +; SSSE3-NEXT: pextrw $4, %xmm0, %edx +; SSSE3-NEXT: addw %cx, %dx +; SSSE3-NEXT: cmovbl %eax, %edx +; SSSE3-NEXT: movd %edx, %xmm8 +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm10[0],xmm8[1],xmm10[1],xmm8[2],xmm10[2],xmm8[3],xmm10[3] +; SSSE3-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm9[0],xmm8[1],xmm9[1] +; SSSE3-NEXT: pextrw $3, %xmm4, %ecx +; SSSE3-NEXT: pextrw $3, %xmm0, %edx +; SSSE3-NEXT: addw %cx, %dx +; SSSE3-NEXT: cmovbl %eax, %edx +; SSSE3-NEXT: movd %edx, %xmm9 +; SSSE3-NEXT: pextrw $2, %xmm4, %ecx +; SSSE3-NEXT: pextrw $2, %xmm0, %edx +; SSSE3-NEXT: addw %cx, %dx +; SSSE3-NEXT: cmovbl %eax, %edx +; SSSE3-NEXT: movd %edx, %xmm10 +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3] +; SSSE3-NEXT: pextrw $1, %xmm4, %ecx +; SSSE3-NEXT: pextrw $1, %xmm0, %edx +; SSSE3-NEXT: addw %cx, %dx +; SSSE3-NEXT: cmovbl %eax, %edx +; SSSE3-NEXT: movd %edx, %xmm9 +; SSSE3-NEXT: movd %xmm4, %ecx +; SSSE3-NEXT: movd %xmm0, %edx +; SSSE3-NEXT: addw %cx, %dx +; SSSE3-NEXT: cmovbl %eax, %edx +; SSSE3-NEXT: movd %edx, %xmm0 +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1],xmm0[2],xmm9[2],xmm0[3],xmm9[3] +; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm10[0],xmm0[1],xmm10[1] +; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm8[0] +; SSSE3-NEXT: pextrw $7, %xmm5, %ecx +; SSSE3-NEXT: pextrw $7, %xmm1, %edx +; SSSE3-NEXT: addw %cx, %dx +; SSSE3-NEXT: cmovbl %eax, %edx +; SSSE3-NEXT: movd %edx, %xmm8 +; SSSE3-NEXT: pextrw $6, %xmm5, %ecx +; SSSE3-NEXT: pextrw $6, %xmm1, %edx +; SSSE3-NEXT: addw %cx, %dx +; SSSE3-NEXT: cmovbl %eax, %edx +; SSSE3-NEXT: movd %edx, %xmm4 +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm8[0],xmm4[1],xmm8[1],xmm4[2],xmm8[2],xmm4[3],xmm8[3] +; SSSE3-NEXT: pextrw $5, %xmm5, %ecx +; SSSE3-NEXT: pextrw $5, %xmm1, %edx +; SSSE3-NEXT: addw %cx, %dx +; SSSE3-NEXT: cmovbl %eax, %edx +; SSSE3-NEXT: movd %edx, 
%xmm9 +; SSSE3-NEXT: pextrw $4, %xmm5, %ecx +; SSSE3-NEXT: pextrw $4, %xmm1, %edx +; SSSE3-NEXT: addw %cx, %dx +; SSSE3-NEXT: cmovbl %eax, %edx +; SSSE3-NEXT: movd %edx, %xmm8 +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm9[0],xmm8[1],xmm9[1],xmm8[2],xmm9[2],xmm8[3],xmm9[3] +; SSSE3-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm4[0],xmm8[1],xmm4[1] +; SSSE3-NEXT: pextrw $3, %xmm5, %ecx +; SSSE3-NEXT: pextrw $3, %xmm1, %edx +; SSSE3-NEXT: addw %cx, %dx +; SSSE3-NEXT: cmovbl %eax, %edx +; SSSE3-NEXT: movd %edx, %xmm9 +; SSSE3-NEXT: pextrw $2, %xmm5, %ecx +; SSSE3-NEXT: pextrw $2, %xmm1, %edx +; SSSE3-NEXT: addw %cx, %dx +; SSSE3-NEXT: cmovbl %eax, %edx +; SSSE3-NEXT: movd %edx, %xmm4 +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm9[0],xmm4[1],xmm9[1],xmm4[2],xmm9[2],xmm4[3],xmm9[3] +; SSSE3-NEXT: pextrw $1, %xmm5, %ecx +; SSSE3-NEXT: pextrw $1, %xmm1, %edx +; SSSE3-NEXT: addw %cx, %dx +; SSSE3-NEXT: cmovbl %eax, %edx +; SSSE3-NEXT: movd %edx, %xmm9 +; SSSE3-NEXT: movd %xmm5, %ecx +; SSSE3-NEXT: movd %xmm1, %edx +; SSSE3-NEXT: addw %cx, %dx +; SSSE3-NEXT: cmovbl %eax, %edx +; SSSE3-NEXT: movd %edx, %xmm1 +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm9[0],xmm1[1],xmm9[1],xmm1[2],xmm9[2],xmm1[3],xmm9[3] +; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1] +; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm8[0] +; SSSE3-NEXT: pextrw $7, %xmm6, %ecx +; SSSE3-NEXT: pextrw $7, %xmm2, %edx +; SSSE3-NEXT: addw %cx, %dx +; SSSE3-NEXT: cmovbl %eax, %edx +; SSSE3-NEXT: movd %edx, %xmm4 +; SSSE3-NEXT: pextrw $6, %xmm6, %ecx +; SSSE3-NEXT: pextrw $6, %xmm2, %edx +; SSSE3-NEXT: addw %cx, %dx +; SSSE3-NEXT: cmovbl %eax, %edx +; SSSE3-NEXT: movd %edx, %xmm5 +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] +; SSSE3-NEXT: pextrw $5, %xmm6, %ecx +; SSSE3-NEXT: pextrw $5, %xmm2, %edx +; SSSE3-NEXT: addw %cx, %dx +; SSSE3-NEXT: cmovbl %eax, %edx +; SSSE3-NEXT: movd %edx, %xmm8 +; SSSE3-NEXT: pextrw $4, %xmm6, %ecx +; SSSE3-NEXT: pextrw $4, %xmm2, %edx +; SSSE3-NEXT: addw %cx, %dx +; SSSE3-NEXT: cmovbl %eax, %edx +; SSSE3-NEXT: movd %edx, %xmm4 +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm8[0],xmm4[1],xmm8[1],xmm4[2],xmm8[2],xmm4[3],xmm8[3] +; SSSE3-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1] +; SSSE3-NEXT: pextrw $3, %xmm6, %ecx +; SSSE3-NEXT: pextrw $3, %xmm2, %edx +; SSSE3-NEXT: addw %cx, %dx +; SSSE3-NEXT: cmovbl %eax, %edx +; SSSE3-NEXT: movd %edx, %xmm8 +; SSSE3-NEXT: pextrw $2, %xmm6, %ecx +; SSSE3-NEXT: pextrw $2, %xmm2, %edx +; SSSE3-NEXT: addw %cx, %dx +; SSSE3-NEXT: cmovbl %eax, %edx +; SSSE3-NEXT: movd %edx, %xmm5 +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm8[0],xmm5[1],xmm8[1],xmm5[2],xmm8[2],xmm5[3],xmm8[3] +; SSSE3-NEXT: pextrw $1, %xmm6, %ecx +; SSSE3-NEXT: pextrw $1, %xmm2, %edx +; SSSE3-NEXT: addw %cx, %dx +; SSSE3-NEXT: cmovbl %eax, %edx +; SSSE3-NEXT: movd %edx, %xmm8 +; SSSE3-NEXT: movd %xmm6, %ecx +; SSSE3-NEXT: movd %xmm2, %edx +; SSSE3-NEXT: addw %cx, %dx +; SSSE3-NEXT: cmovbl %eax, %edx +; SSSE3-NEXT: movd %edx, %xmm2 +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm8[0],xmm2[1],xmm8[1],xmm2[2],xmm8[2],xmm2[3],xmm8[3] +; SSSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1] +; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm4[0] +; SSSE3-NEXT: pextrw $7, %xmm7, %ecx +; SSSE3-NEXT: pextrw $7, %xmm3, %edx +; SSSE3-NEXT: addw %cx, %dx +; SSSE3-NEXT: cmovbl %eax, %edx +; SSSE3-NEXT: movd %edx, %xmm4 +; SSSE3-NEXT: pextrw $6, %xmm7, %ecx +; 
SSSE3-NEXT: pextrw $6, %xmm3, %edx +; SSSE3-NEXT: addw %cx, %dx +; SSSE3-NEXT: cmovbl %eax, %edx +; SSSE3-NEXT: movd %edx, %xmm5 +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] +; SSSE3-NEXT: pextrw $5, %xmm7, %ecx +; SSSE3-NEXT: pextrw $5, %xmm3, %edx +; SSSE3-NEXT: addw %cx, %dx +; SSSE3-NEXT: cmovbl %eax, %edx +; SSSE3-NEXT: movd %edx, %xmm6 +; SSSE3-NEXT: pextrw $4, %xmm7, %ecx +; SSSE3-NEXT: pextrw $4, %xmm3, %edx +; SSSE3-NEXT: addw %cx, %dx +; SSSE3-NEXT: cmovbl %eax, %edx +; SSSE3-NEXT: movd %edx, %xmm4 +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1],xmm4[2],xmm6[2],xmm4[3],xmm6[3] +; SSSE3-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1] +; SSSE3-NEXT: pextrw $3, %xmm7, %ecx +; SSSE3-NEXT: pextrw $3, %xmm3, %edx +; SSSE3-NEXT: addw %cx, %dx +; SSSE3-NEXT: cmovbl %eax, %edx +; SSSE3-NEXT: movd %edx, %xmm5 +; SSSE3-NEXT: pextrw $2, %xmm7, %ecx +; SSSE3-NEXT: pextrw $2, %xmm3, %edx +; SSSE3-NEXT: addw %cx, %dx +; SSSE3-NEXT: cmovbl %eax, %edx +; SSSE3-NEXT: movd %edx, %xmm6 +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] +; SSSE3-NEXT: pextrw $1, %xmm7, %ecx +; SSSE3-NEXT: pextrw $1, %xmm3, %edx +; SSSE3-NEXT: addw %cx, %dx +; SSSE3-NEXT: cmovbl %eax, %edx +; SSSE3-NEXT: movd %edx, %xmm5 +; SSSE3-NEXT: movd %xmm7, %ecx +; SSSE3-NEXT: movd %xmm3, %edx +; SSSE3-NEXT: addw %cx, %dx +; SSSE3-NEXT: cmovbl %eax, %edx +; SSSE3-NEXT: movd %edx, %xmm3 +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3] +; SSSE3-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm6[0],xmm3[1],xmm6[1] +; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm4[0] +; SSSE3-NEXT: retq +; +; SSE41-LABEL: v32i16: +; SSE41: # %bb.0: +; SSE41-NEXT: movdqa %xmm1, %xmm8 +; SSE41-NEXT: movdqa %xmm0, %xmm1 +; SSE41-NEXT: pextrw $1, %xmm4, %eax +; SSE41-NEXT: pextrw $1, %xmm0, %ecx +; SSE41-NEXT: addw %ax, %cx +; SSE41-NEXT: movl $65535, %eax # imm = 0xFFFF +; SSE41-NEXT: cmovbl %eax, %ecx +; SSE41-NEXT: movd %xmm4, %edx +; SSE41-NEXT: movd %xmm0, %esi +; SSE41-NEXT: addw %dx, %si +; SSE41-NEXT: cmovbl %eax, %esi +; SSE41-NEXT: movd %esi, %xmm0 +; SSE41-NEXT: pinsrw $1, %ecx, %xmm0 +; SSE41-NEXT: pextrw $2, %xmm4, %ecx +; SSE41-NEXT: pextrw $2, %xmm1, %edx +; SSE41-NEXT: addw %cx, %dx +; SSE41-NEXT: cmovbl %eax, %edx +; SSE41-NEXT: pinsrw $2, %edx, %xmm0 +; SSE41-NEXT: pextrw $3, %xmm4, %ecx +; SSE41-NEXT: pextrw $3, %xmm1, %edx +; SSE41-NEXT: addw %cx, %dx +; SSE41-NEXT: cmovbl %eax, %edx +; SSE41-NEXT: pinsrw $3, %edx, %xmm0 +; SSE41-NEXT: pextrw $4, %xmm4, %ecx +; SSE41-NEXT: pextrw $4, %xmm1, %edx +; SSE41-NEXT: addw %cx, %dx +; SSE41-NEXT: cmovbl %eax, %edx +; SSE41-NEXT: pinsrw $4, %edx, %xmm0 +; SSE41-NEXT: pextrw $5, %xmm4, %ecx +; SSE41-NEXT: pextrw $5, %xmm1, %edx +; SSE41-NEXT: addw %cx, %dx +; SSE41-NEXT: cmovbl %eax, %edx +; SSE41-NEXT: pinsrw $5, %edx, %xmm0 +; SSE41-NEXT: pextrw $6, %xmm4, %ecx +; SSE41-NEXT: pextrw $6, %xmm1, %edx +; SSE41-NEXT: addw %cx, %dx +; SSE41-NEXT: cmovbl %eax, %edx +; SSE41-NEXT: pinsrw $6, %edx, %xmm0 +; SSE41-NEXT: pextrw $7, %xmm4, %ecx +; SSE41-NEXT: pextrw $7, %xmm1, %edx +; SSE41-NEXT: addw %cx, %dx +; SSE41-NEXT: cmovbl %eax, %edx +; SSE41-NEXT: pinsrw $7, %edx, %xmm0 +; SSE41-NEXT: pextrw $1, %xmm5, %ecx +; SSE41-NEXT: pextrw $1, %xmm8, %edx +; SSE41-NEXT: addw %cx, %dx +; SSE41-NEXT: cmovbl %eax, %edx +; SSE41-NEXT: movd %xmm5, %ecx +; SSE41-NEXT: movd %xmm8, %esi +; SSE41-NEXT: 
addw %cx, %si +; SSE41-NEXT: cmovbl %eax, %esi +; SSE41-NEXT: movd %esi, %xmm1 +; SSE41-NEXT: pinsrw $1, %edx, %xmm1 +; SSE41-NEXT: pextrw $2, %xmm5, %ecx +; SSE41-NEXT: pextrw $2, %xmm8, %edx +; SSE41-NEXT: addw %cx, %dx +; SSE41-NEXT: cmovbl %eax, %edx +; SSE41-NEXT: pinsrw $2, %edx, %xmm1 +; SSE41-NEXT: pextrw $3, %xmm5, %ecx +; SSE41-NEXT: pextrw $3, %xmm8, %edx +; SSE41-NEXT: addw %cx, %dx +; SSE41-NEXT: cmovbl %eax, %edx +; SSE41-NEXT: pinsrw $3, %edx, %xmm1 +; SSE41-NEXT: pextrw $4, %xmm5, %ecx +; SSE41-NEXT: pextrw $4, %xmm8, %edx +; SSE41-NEXT: addw %cx, %dx +; SSE41-NEXT: cmovbl %eax, %edx +; SSE41-NEXT: pinsrw $4, %edx, %xmm1 +; SSE41-NEXT: pextrw $5, %xmm5, %ecx +; SSE41-NEXT: pextrw $5, %xmm8, %edx +; SSE41-NEXT: addw %cx, %dx +; SSE41-NEXT: cmovbl %eax, %edx +; SSE41-NEXT: pinsrw $5, %edx, %xmm1 +; SSE41-NEXT: pextrw $6, %xmm5, %ecx +; SSE41-NEXT: pextrw $6, %xmm8, %edx +; SSE41-NEXT: addw %cx, %dx +; SSE41-NEXT: cmovbl %eax, %edx +; SSE41-NEXT: pinsrw $6, %edx, %xmm1 +; SSE41-NEXT: pextrw $7, %xmm5, %ecx +; SSE41-NEXT: pextrw $7, %xmm8, %edx +; SSE41-NEXT: addw %cx, %dx +; SSE41-NEXT: cmovbl %eax, %edx +; SSE41-NEXT: pinsrw $7, %edx, %xmm1 +; SSE41-NEXT: pextrw $1, %xmm6, %ecx +; SSE41-NEXT: pextrw $1, %xmm2, %edx +; SSE41-NEXT: addw %cx, %dx +; SSE41-NEXT: cmovbl %eax, %edx +; SSE41-NEXT: movd %xmm6, %ecx +; SSE41-NEXT: movd %xmm2, %esi +; SSE41-NEXT: addw %cx, %si +; SSE41-NEXT: cmovbl %eax, %esi +; SSE41-NEXT: movd %esi, %xmm4 +; SSE41-NEXT: pinsrw $1, %edx, %xmm4 +; SSE41-NEXT: pextrw $2, %xmm6, %ecx +; SSE41-NEXT: pextrw $2, %xmm2, %edx +; SSE41-NEXT: addw %cx, %dx +; SSE41-NEXT: cmovbl %eax, %edx +; SSE41-NEXT: pinsrw $2, %edx, %xmm4 +; SSE41-NEXT: pextrw $3, %xmm6, %ecx +; SSE41-NEXT: pextrw $3, %xmm2, %edx +; SSE41-NEXT: addw %cx, %dx +; SSE41-NEXT: cmovbl %eax, %edx +; SSE41-NEXT: pinsrw $3, %edx, %xmm4 +; SSE41-NEXT: pextrw $4, %xmm6, %ecx +; SSE41-NEXT: pextrw $4, %xmm2, %edx +; SSE41-NEXT: addw %cx, %dx +; SSE41-NEXT: cmovbl %eax, %edx +; SSE41-NEXT: pinsrw $4, %edx, %xmm4 +; SSE41-NEXT: pextrw $5, %xmm6, %ecx +; SSE41-NEXT: pextrw $5, %xmm2, %edx +; SSE41-NEXT: addw %cx, %dx +; SSE41-NEXT: cmovbl %eax, %edx +; SSE41-NEXT: pinsrw $5, %edx, %xmm4 +; SSE41-NEXT: pextrw $6, %xmm6, %ecx +; SSE41-NEXT: pextrw $6, %xmm2, %edx +; SSE41-NEXT: addw %cx, %dx +; SSE41-NEXT: cmovbl %eax, %edx +; SSE41-NEXT: pinsrw $6, %edx, %xmm4 +; SSE41-NEXT: pextrw $7, %xmm6, %ecx +; SSE41-NEXT: pextrw $7, %xmm2, %edx +; SSE41-NEXT: addw %cx, %dx +; SSE41-NEXT: cmovbl %eax, %edx +; SSE41-NEXT: pinsrw $7, %edx, %xmm4 +; SSE41-NEXT: pextrw $1, %xmm7, %ecx +; SSE41-NEXT: pextrw $1, %xmm3, %edx +; SSE41-NEXT: addw %cx, %dx +; SSE41-NEXT: cmovbl %eax, %edx +; SSE41-NEXT: movd %xmm7, %ecx +; SSE41-NEXT: movd %xmm3, %esi +; SSE41-NEXT: addw %cx, %si +; SSE41-NEXT: cmovbl %eax, %esi +; SSE41-NEXT: movd %esi, %xmm5 +; SSE41-NEXT: pinsrw $1, %edx, %xmm5 +; SSE41-NEXT: pextrw $2, %xmm7, %ecx +; SSE41-NEXT: pextrw $2, %xmm3, %edx +; SSE41-NEXT: addw %cx, %dx +; SSE41-NEXT: cmovbl %eax, %edx +; SSE41-NEXT: pinsrw $2, %edx, %xmm5 +; SSE41-NEXT: pextrw $3, %xmm7, %ecx +; SSE41-NEXT: pextrw $3, %xmm3, %edx +; SSE41-NEXT: addw %cx, %dx +; SSE41-NEXT: cmovbl %eax, %edx +; SSE41-NEXT: pinsrw $3, %edx, %xmm5 +; SSE41-NEXT: pextrw $4, %xmm7, %ecx +; SSE41-NEXT: pextrw $4, %xmm3, %edx +; SSE41-NEXT: addw %cx, %dx +; SSE41-NEXT: cmovbl %eax, %edx +; SSE41-NEXT: pinsrw $4, %edx, %xmm5 +; SSE41-NEXT: pextrw $5, %xmm7, %ecx +; SSE41-NEXT: pextrw $5, %xmm3, %edx +; SSE41-NEXT: addw %cx, %dx +; SSE41-NEXT: cmovbl 
%eax, %edx +; SSE41-NEXT: pinsrw $5, %edx, %xmm5 +; SSE41-NEXT: pextrw $6, %xmm7, %ecx +; SSE41-NEXT: pextrw $6, %xmm3, %edx +; SSE41-NEXT: addw %cx, %dx +; SSE41-NEXT: cmovbl %eax, %edx +; SSE41-NEXT: pinsrw $6, %edx, %xmm5 +; SSE41-NEXT: pextrw $7, %xmm7, %ecx +; SSE41-NEXT: pextrw $7, %xmm3, %edx +; SSE41-NEXT: addw %cx, %dx +; SSE41-NEXT: cmovbl %eax, %edx +; SSE41-NEXT: pinsrw $7, %edx, %xmm5 +; SSE41-NEXT: movdqa %xmm4, %xmm2 +; SSE41-NEXT: movdqa %xmm5, %xmm3 +; SSE41-NEXT: retq +; +; AVX1-LABEL: v32i16: +; AVX1: # %bb.0: +; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm4 +; AVX1-NEXT: vpextrw $1, %xmm4, %eax +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5 +; AVX1-NEXT: vpextrw $1, %xmm5, %ecx +; AVX1-NEXT: addw %ax, %cx +; AVX1-NEXT: movl $65535, %eax # imm = 0xFFFF +; AVX1-NEXT: cmovbl %eax, %ecx +; AVX1-NEXT: vmovd %xmm4, %edx +; AVX1-NEXT: vmovd %xmm5, %esi +; AVX1-NEXT: addw %dx, %si +; AVX1-NEXT: cmovbl %eax, %esi +; AVX1-NEXT: vmovd %esi, %xmm6 +; AVX1-NEXT: vpinsrw $1, %ecx, %xmm6, %xmm6 +; AVX1-NEXT: vpextrw $2, %xmm4, %ecx +; AVX1-NEXT: vpextrw $2, %xmm5, %edx +; AVX1-NEXT: addw %cx, %dx +; AVX1-NEXT: cmovbl %eax, %edx +; AVX1-NEXT: vpinsrw $2, %edx, %xmm6, %xmm6 +; AVX1-NEXT: vpextrw $3, %xmm4, %ecx +; AVX1-NEXT: vpextrw $3, %xmm5, %edx +; AVX1-NEXT: addw %cx, %dx +; AVX1-NEXT: cmovbl %eax, %edx +; AVX1-NEXT: vpinsrw $3, %edx, %xmm6, %xmm6 +; AVX1-NEXT: vpextrw $4, %xmm4, %ecx +; AVX1-NEXT: vpextrw $4, %xmm5, %edx +; AVX1-NEXT: addw %cx, %dx +; AVX1-NEXT: cmovbl %eax, %edx +; AVX1-NEXT: vpinsrw $4, %edx, %xmm6, %xmm6 +; AVX1-NEXT: vpextrw $5, %xmm4, %ecx +; AVX1-NEXT: vpextrw $5, %xmm5, %edx +; AVX1-NEXT: addw %cx, %dx +; AVX1-NEXT: cmovbl %eax, %edx +; AVX1-NEXT: vpinsrw $5, %edx, %xmm6, %xmm6 +; AVX1-NEXT: vpextrw $6, %xmm4, %ecx +; AVX1-NEXT: vpextrw $6, %xmm5, %edx +; AVX1-NEXT: addw %cx, %dx +; AVX1-NEXT: cmovbl %eax, %edx +; AVX1-NEXT: vpinsrw $6, %edx, %xmm6, %xmm6 +; AVX1-NEXT: vpextrw $7, %xmm4, %ecx +; AVX1-NEXT: vpextrw $7, %xmm5, %edx +; AVX1-NEXT: addw %cx, %dx +; AVX1-NEXT: cmovbl %eax, %edx +; AVX1-NEXT: vpinsrw $7, %edx, %xmm6, %xmm4 +; AVX1-NEXT: vpextrw $1, %xmm2, %ecx +; AVX1-NEXT: vpextrw $1, %xmm0, %edx +; AVX1-NEXT: addw %cx, %dx +; AVX1-NEXT: cmovbl %eax, %edx +; AVX1-NEXT: vmovd %xmm2, %ecx +; AVX1-NEXT: vmovd %xmm0, %esi +; AVX1-NEXT: addw %cx, %si +; AVX1-NEXT: cmovbl %eax, %esi +; AVX1-NEXT: vmovd %esi, %xmm5 +; AVX1-NEXT: vpinsrw $1, %edx, %xmm5, %xmm5 +; AVX1-NEXT: vpextrw $2, %xmm2, %ecx +; AVX1-NEXT: vpextrw $2, %xmm0, %edx +; AVX1-NEXT: addw %cx, %dx +; AVX1-NEXT: cmovbl %eax, %edx +; AVX1-NEXT: vpinsrw $2, %edx, %xmm5, %xmm5 +; AVX1-NEXT: vpextrw $3, %xmm2, %ecx +; AVX1-NEXT: vpextrw $3, %xmm0, %edx +; AVX1-NEXT: addw %cx, %dx +; AVX1-NEXT: cmovbl %eax, %edx +; AVX1-NEXT: vpinsrw $3, %edx, %xmm5, %xmm5 +; AVX1-NEXT: vpextrw $4, %xmm2, %ecx +; AVX1-NEXT: vpextrw $4, %xmm0, %edx +; AVX1-NEXT: addw %cx, %dx +; AVX1-NEXT: cmovbl %eax, %edx +; AVX1-NEXT: vpinsrw $4, %edx, %xmm5, %xmm5 +; AVX1-NEXT: vpextrw $5, %xmm2, %ecx +; AVX1-NEXT: vpextrw $5, %xmm0, %edx +; AVX1-NEXT: addw %cx, %dx +; AVX1-NEXT: cmovbl %eax, %edx +; AVX1-NEXT: vpinsrw $5, %edx, %xmm5, %xmm5 +; AVX1-NEXT: vpextrw $6, %xmm2, %ecx +; AVX1-NEXT: vpextrw $6, %xmm0, %edx +; AVX1-NEXT: addw %cx, %dx +; AVX1-NEXT: cmovbl %eax, %edx +; AVX1-NEXT: vpinsrw $6, %edx, %xmm5, %xmm5 +; AVX1-NEXT: vpextrw $7, %xmm2, %ecx +; AVX1-NEXT: vpextrw $7, %xmm0, %edx +; AVX1-NEXT: addw %cx, %dx +; AVX1-NEXT: cmovbl %eax, %edx +; AVX1-NEXT: vpinsrw $7, %edx, %xmm5, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm4, 
%ymm0, %ymm0 +; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm2 +; AVX1-NEXT: vpextrw $1, %xmm2, %ecx +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4 +; AVX1-NEXT: vpextrw $1, %xmm4, %edx +; AVX1-NEXT: addw %cx, %dx +; AVX1-NEXT: cmovbl %eax, %edx +; AVX1-NEXT: vmovd %xmm2, %ecx +; AVX1-NEXT: vmovd %xmm4, %esi +; AVX1-NEXT: addw %cx, %si +; AVX1-NEXT: cmovbl %eax, %esi +; AVX1-NEXT: vmovd %esi, %xmm5 +; AVX1-NEXT: vpinsrw $1, %edx, %xmm5, %xmm5 +; AVX1-NEXT: vpextrw $2, %xmm2, %ecx +; AVX1-NEXT: vpextrw $2, %xmm4, %edx +; AVX1-NEXT: addw %cx, %dx +; AVX1-NEXT: cmovbl %eax, %edx +; AVX1-NEXT: vpinsrw $2, %edx, %xmm5, %xmm5 +; AVX1-NEXT: vpextrw $3, %xmm2, %ecx +; AVX1-NEXT: vpextrw $3, %xmm4, %edx +; AVX1-NEXT: addw %cx, %dx +; AVX1-NEXT: cmovbl %eax, %edx +; AVX1-NEXT: vpinsrw $3, %edx, %xmm5, %xmm5 +; AVX1-NEXT: vpextrw $4, %xmm2, %ecx +; AVX1-NEXT: vpextrw $4, %xmm4, %edx +; AVX1-NEXT: addw %cx, %dx +; AVX1-NEXT: cmovbl %eax, %edx +; AVX1-NEXT: vpinsrw $4, %edx, %xmm5, %xmm5 +; AVX1-NEXT: vpextrw $5, %xmm2, %ecx +; AVX1-NEXT: vpextrw $5, %xmm4, %edx +; AVX1-NEXT: addw %cx, %dx +; AVX1-NEXT: cmovbl %eax, %edx +; AVX1-NEXT: vpinsrw $5, %edx, %xmm5, %xmm5 +; AVX1-NEXT: vpextrw $6, %xmm2, %ecx +; AVX1-NEXT: vpextrw $6, %xmm4, %edx +; AVX1-NEXT: addw %cx, %dx +; AVX1-NEXT: cmovbl %eax, %edx +; AVX1-NEXT: vpinsrw $6, %edx, %xmm5, %xmm5 +; AVX1-NEXT: vpextrw $7, %xmm2, %ecx +; AVX1-NEXT: vpextrw $7, %xmm4, %edx +; AVX1-NEXT: addw %cx, %dx +; AVX1-NEXT: cmovbl %eax, %edx +; AVX1-NEXT: vpinsrw $7, %edx, %xmm5, %xmm2 +; AVX1-NEXT: vpextrw $1, %xmm3, %ecx +; AVX1-NEXT: vpextrw $1, %xmm1, %edx +; AVX1-NEXT: addw %cx, %dx +; AVX1-NEXT: cmovbl %eax, %edx +; AVX1-NEXT: vmovd %xmm3, %ecx +; AVX1-NEXT: vmovd %xmm1, %esi +; AVX1-NEXT: addw %cx, %si +; AVX1-NEXT: cmovbl %eax, %esi +; AVX1-NEXT: vmovd %esi, %xmm4 +; AVX1-NEXT: vpinsrw $1, %edx, %xmm4, %xmm4 +; AVX1-NEXT: vpextrw $2, %xmm3, %ecx +; AVX1-NEXT: vpextrw $2, %xmm1, %edx +; AVX1-NEXT: addw %cx, %dx +; AVX1-NEXT: cmovbl %eax, %edx +; AVX1-NEXT: vpinsrw $2, %edx, %xmm4, %xmm4 +; AVX1-NEXT: vpextrw $3, %xmm3, %ecx +; AVX1-NEXT: vpextrw $3, %xmm1, %edx +; AVX1-NEXT: addw %cx, %dx +; AVX1-NEXT: cmovbl %eax, %edx +; AVX1-NEXT: vpinsrw $3, %edx, %xmm4, %xmm4 +; AVX1-NEXT: vpextrw $4, %xmm3, %ecx +; AVX1-NEXT: vpextrw $4, %xmm1, %edx +; AVX1-NEXT: addw %cx, %dx +; AVX1-NEXT: cmovbl %eax, %edx +; AVX1-NEXT: vpinsrw $4, %edx, %xmm4, %xmm4 +; AVX1-NEXT: vpextrw $5, %xmm3, %ecx +; AVX1-NEXT: vpextrw $5, %xmm1, %edx +; AVX1-NEXT: addw %cx, %dx +; AVX1-NEXT: cmovbl %eax, %edx +; AVX1-NEXT: vpinsrw $5, %edx, %xmm4, %xmm4 +; AVX1-NEXT: vpextrw $6, %xmm3, %ecx +; AVX1-NEXT: vpextrw $6, %xmm1, %edx +; AVX1-NEXT: addw %cx, %dx +; AVX1-NEXT: cmovbl %eax, %edx +; AVX1-NEXT: vpinsrw $6, %edx, %xmm4, %xmm4 +; AVX1-NEXT: vpextrw $7, %xmm3, %ecx +; AVX1-NEXT: vpextrw $7, %xmm1, %edx +; AVX1-NEXT: addw %cx, %dx +; AVX1-NEXT: cmovbl %eax, %edx +; AVX1-NEXT: vpinsrw $7, %edx, %xmm4, %xmm1 +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 +; AVX1-NEXT: retq +; +; AVX2-LABEL: v32i16: +; AVX2: # %bb.0: +; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm4 +; AVX2-NEXT: vpextrw $1, %xmm4, %eax +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm5 +; AVX2-NEXT: vpextrw $1, %xmm5, %ecx +; AVX2-NEXT: addw %ax, %cx +; AVX2-NEXT: movl $65535, %eax # imm = 0xFFFF +; AVX2-NEXT: cmovbl %eax, %ecx +; AVX2-NEXT: vmovd %xmm4, %edx +; AVX2-NEXT: vmovd %xmm5, %esi +; AVX2-NEXT: addw %dx, %si +; AVX2-NEXT: cmovbl %eax, %esi +; AVX2-NEXT: vmovd %esi, %xmm6 +; AVX2-NEXT: vpinsrw $1, %ecx, %xmm6, %xmm6 +; AVX2-NEXT: vpextrw 
$2, %xmm4, %ecx +; AVX2-NEXT: vpextrw $2, %xmm5, %edx +; AVX2-NEXT: addw %cx, %dx +; AVX2-NEXT: cmovbl %eax, %edx +; AVX2-NEXT: vpinsrw $2, %edx, %xmm6, %xmm6 +; AVX2-NEXT: vpextrw $3, %xmm4, %ecx +; AVX2-NEXT: vpextrw $3, %xmm5, %edx +; AVX2-NEXT: addw %cx, %dx +; AVX2-NEXT: cmovbl %eax, %edx +; AVX2-NEXT: vpinsrw $3, %edx, %xmm6, %xmm6 +; AVX2-NEXT: vpextrw $4, %xmm4, %ecx +; AVX2-NEXT: vpextrw $4, %xmm5, %edx +; AVX2-NEXT: addw %cx, %dx +; AVX2-NEXT: cmovbl %eax, %edx +; AVX2-NEXT: vpinsrw $4, %edx, %xmm6, %xmm6 +; AVX2-NEXT: vpextrw $5, %xmm4, %ecx +; AVX2-NEXT: vpextrw $5, %xmm5, %edx +; AVX2-NEXT: addw %cx, %dx +; AVX2-NEXT: cmovbl %eax, %edx +; AVX2-NEXT: vpinsrw $5, %edx, %xmm6, %xmm6 +; AVX2-NEXT: vpextrw $6, %xmm4, %ecx +; AVX2-NEXT: vpextrw $6, %xmm5, %edx +; AVX2-NEXT: addw %cx, %dx +; AVX2-NEXT: cmovbl %eax, %edx +; AVX2-NEXT: vpinsrw $6, %edx, %xmm6, %xmm6 +; AVX2-NEXT: vpextrw $7, %xmm4, %ecx +; AVX2-NEXT: vpextrw $7, %xmm5, %edx +; AVX2-NEXT: addw %cx, %dx +; AVX2-NEXT: cmovbl %eax, %edx +; AVX2-NEXT: vpinsrw $7, %edx, %xmm6, %xmm4 +; AVX2-NEXT: vpextrw $1, %xmm2, %ecx +; AVX2-NEXT: vpextrw $1, %xmm0, %edx +; AVX2-NEXT: addw %cx, %dx +; AVX2-NEXT: cmovbl %eax, %edx +; AVX2-NEXT: vmovd %xmm2, %ecx +; AVX2-NEXT: vmovd %xmm0, %esi +; AVX2-NEXT: addw %cx, %si +; AVX2-NEXT: cmovbl %eax, %esi +; AVX2-NEXT: vmovd %esi, %xmm5 +; AVX2-NEXT: vpinsrw $1, %edx, %xmm5, %xmm5 +; AVX2-NEXT: vpextrw $2, %xmm2, %ecx +; AVX2-NEXT: vpextrw $2, %xmm0, %edx +; AVX2-NEXT: addw %cx, %dx +; AVX2-NEXT: cmovbl %eax, %edx +; AVX2-NEXT: vpinsrw $2, %edx, %xmm5, %xmm5 +; AVX2-NEXT: vpextrw $3, %xmm2, %ecx +; AVX2-NEXT: vpextrw $3, %xmm0, %edx +; AVX2-NEXT: addw %cx, %dx +; AVX2-NEXT: cmovbl %eax, %edx +; AVX2-NEXT: vpinsrw $3, %edx, %xmm5, %xmm5 +; AVX2-NEXT: vpextrw $4, %xmm2, %ecx +; AVX2-NEXT: vpextrw $4, %xmm0, %edx +; AVX2-NEXT: addw %cx, %dx +; AVX2-NEXT: cmovbl %eax, %edx +; AVX2-NEXT: vpinsrw $4, %edx, %xmm5, %xmm5 +; AVX2-NEXT: vpextrw $5, %xmm2, %ecx +; AVX2-NEXT: vpextrw $5, %xmm0, %edx +; AVX2-NEXT: addw %cx, %dx +; AVX2-NEXT: cmovbl %eax, %edx +; AVX2-NEXT: vpinsrw $5, %edx, %xmm5, %xmm5 +; AVX2-NEXT: vpextrw $6, %xmm2, %ecx +; AVX2-NEXT: vpextrw $6, %xmm0, %edx +; AVX2-NEXT: addw %cx, %dx +; AVX2-NEXT: cmovbl %eax, %edx +; AVX2-NEXT: vpinsrw $6, %edx, %xmm5, %xmm5 +; AVX2-NEXT: vpextrw $7, %xmm2, %ecx +; AVX2-NEXT: vpextrw $7, %xmm0, %edx +; AVX2-NEXT: addw %cx, %dx +; AVX2-NEXT: cmovbl %eax, %edx +; AVX2-NEXT: vpinsrw $7, %edx, %xmm5, %xmm0 +; AVX2-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm0 +; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm2 +; AVX2-NEXT: vpextrw $1, %xmm2, %ecx +; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm4 +; AVX2-NEXT: vpextrw $1, %xmm4, %edx +; AVX2-NEXT: addw %cx, %dx +; AVX2-NEXT: cmovbl %eax, %edx +; AVX2-NEXT: vmovd %xmm2, %ecx +; AVX2-NEXT: vmovd %xmm4, %esi +; AVX2-NEXT: addw %cx, %si +; AVX2-NEXT: cmovbl %eax, %esi +; AVX2-NEXT: vmovd %esi, %xmm5 +; AVX2-NEXT: vpinsrw $1, %edx, %xmm5, %xmm5 +; AVX2-NEXT: vpextrw $2, %xmm2, %ecx +; AVX2-NEXT: vpextrw $2, %xmm4, %edx +; AVX2-NEXT: addw %cx, %dx +; AVX2-NEXT: cmovbl %eax, %edx +; AVX2-NEXT: vpinsrw $2, %edx, %xmm5, %xmm5 +; AVX2-NEXT: vpextrw $3, %xmm2, %ecx +; AVX2-NEXT: vpextrw $3, %xmm4, %edx +; AVX2-NEXT: addw %cx, %dx +; AVX2-NEXT: cmovbl %eax, %edx +; AVX2-NEXT: vpinsrw $3, %edx, %xmm5, %xmm5 +; AVX2-NEXT: vpextrw $4, %xmm2, %ecx +; AVX2-NEXT: vpextrw $4, %xmm4, %edx +; AVX2-NEXT: addw %cx, %dx +; AVX2-NEXT: cmovbl %eax, %edx +; AVX2-NEXT: vpinsrw $4, %edx, %xmm5, %xmm5 +; AVX2-NEXT: vpextrw $5, %xmm2, %ecx +; 
AVX2-NEXT: vpextrw $5, %xmm4, %edx +; AVX2-NEXT: addw %cx, %dx +; AVX2-NEXT: cmovbl %eax, %edx +; AVX2-NEXT: vpinsrw $5, %edx, %xmm5, %xmm5 +; AVX2-NEXT: vpextrw $6, %xmm2, %ecx +; AVX2-NEXT: vpextrw $6, %xmm4, %edx +; AVX2-NEXT: addw %cx, %dx +; AVX2-NEXT: cmovbl %eax, %edx +; AVX2-NEXT: vpinsrw $6, %edx, %xmm5, %xmm5 +; AVX2-NEXT: vpextrw $7, %xmm2, %ecx +; AVX2-NEXT: vpextrw $7, %xmm4, %edx +; AVX2-NEXT: addw %cx, %dx +; AVX2-NEXT: cmovbl %eax, %edx +; AVX2-NEXT: vpinsrw $7, %edx, %xmm5, %xmm2 +; AVX2-NEXT: vpextrw $1, %xmm3, %ecx +; AVX2-NEXT: vpextrw $1, %xmm1, %edx +; AVX2-NEXT: addw %cx, %dx +; AVX2-NEXT: cmovbl %eax, %edx +; AVX2-NEXT: vmovd %xmm3, %ecx +; AVX2-NEXT: vmovd %xmm1, %esi +; AVX2-NEXT: addw %cx, %si +; AVX2-NEXT: cmovbl %eax, %esi +; AVX2-NEXT: vmovd %esi, %xmm4 +; AVX2-NEXT: vpinsrw $1, %edx, %xmm4, %xmm4 +; AVX2-NEXT: vpextrw $2, %xmm3, %ecx +; AVX2-NEXT: vpextrw $2, %xmm1, %edx +; AVX2-NEXT: addw %cx, %dx +; AVX2-NEXT: cmovbl %eax, %edx +; AVX2-NEXT: vpinsrw $2, %edx, %xmm4, %xmm4 +; AVX2-NEXT: vpextrw $3, %xmm3, %ecx +; AVX2-NEXT: vpextrw $3, %xmm1, %edx +; AVX2-NEXT: addw %cx, %dx +; AVX2-NEXT: cmovbl %eax, %edx +; AVX2-NEXT: vpinsrw $3, %edx, %xmm4, %xmm4 +; AVX2-NEXT: vpextrw $4, %xmm3, %ecx +; AVX2-NEXT: vpextrw $4, %xmm1, %edx +; AVX2-NEXT: addw %cx, %dx +; AVX2-NEXT: cmovbl %eax, %edx +; AVX2-NEXT: vpinsrw $4, %edx, %xmm4, %xmm4 +; AVX2-NEXT: vpextrw $5, %xmm3, %ecx +; AVX2-NEXT: vpextrw $5, %xmm1, %edx +; AVX2-NEXT: addw %cx, %dx +; AVX2-NEXT: cmovbl %eax, %edx +; AVX2-NEXT: vpinsrw $5, %edx, %xmm4, %xmm4 +; AVX2-NEXT: vpextrw $6, %xmm3, %ecx +; AVX2-NEXT: vpextrw $6, %xmm1, %edx +; AVX2-NEXT: addw %cx, %dx +; AVX2-NEXT: cmovbl %eax, %edx +; AVX2-NEXT: vpinsrw $6, %edx, %xmm4, %xmm4 +; AVX2-NEXT: vpextrw $7, %xmm3, %ecx +; AVX2-NEXT: vpextrw $7, %xmm1, %edx +; AVX2-NEXT: addw %cx, %dx +; AVX2-NEXT: cmovbl %eax, %edx +; AVX2-NEXT: vpinsrw $7, %edx, %xmm4, %xmm1 +; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 +; AVX2-NEXT: retq +; +; AVX512-LABEL: v32i16: +; AVX512: # %bb.0: +; AVX512-NEXT: vextracti32x4 $3, %zmm1, %xmm2 +; AVX512-NEXT: vpextrw $1, %xmm2, %eax +; AVX512-NEXT: vextracti32x4 $3, %zmm0, %xmm3 +; AVX512-NEXT: vpextrw $1, %xmm3, %ecx +; AVX512-NEXT: addw %ax, %cx +; AVX512-NEXT: movl $65535, %eax # imm = 0xFFFF +; AVX512-NEXT: cmovbl %eax, %ecx +; AVX512-NEXT: vmovd %xmm2, %edx +; AVX512-NEXT: vmovd %xmm3, %esi +; AVX512-NEXT: addw %dx, %si +; AVX512-NEXT: cmovbl %eax, %esi +; AVX512-NEXT: vmovd %esi, %xmm4 +; AVX512-NEXT: vpinsrw $1, %ecx, %xmm4, %xmm4 +; AVX512-NEXT: vpextrw $2, %xmm2, %ecx +; AVX512-NEXT: vpextrw $2, %xmm3, %edx +; AVX512-NEXT: addw %cx, %dx +; AVX512-NEXT: cmovbl %eax, %edx +; AVX512-NEXT: vpinsrw $2, %edx, %xmm4, %xmm4 +; AVX512-NEXT: vpextrw $3, %xmm2, %ecx +; AVX512-NEXT: vpextrw $3, %xmm3, %edx +; AVX512-NEXT: addw %cx, %dx +; AVX512-NEXT: cmovbl %eax, %edx +; AVX512-NEXT: vpinsrw $3, %edx, %xmm4, %xmm4 +; AVX512-NEXT: vpextrw $4, %xmm2, %ecx +; AVX512-NEXT: vpextrw $4, %xmm3, %edx +; AVX512-NEXT: addw %cx, %dx +; AVX512-NEXT: cmovbl %eax, %edx +; AVX512-NEXT: vpinsrw $4, %edx, %xmm4, %xmm4 +; AVX512-NEXT: vpextrw $5, %xmm2, %ecx +; AVX512-NEXT: vpextrw $5, %xmm3, %edx +; AVX512-NEXT: addw %cx, %dx +; AVX512-NEXT: cmovbl %eax, %edx +; AVX512-NEXT: vpinsrw $5, %edx, %xmm4, %xmm4 +; AVX512-NEXT: vpextrw $6, %xmm2, %ecx +; AVX512-NEXT: vpextrw $6, %xmm3, %edx +; AVX512-NEXT: addw %cx, %dx +; AVX512-NEXT: cmovbl %eax, %edx +; AVX512-NEXT: vpinsrw $6, %edx, %xmm4, %xmm4 +; AVX512-NEXT: vpextrw $7, %xmm2, %ecx +; AVX512-NEXT: 
vpextrw $7, %xmm3, %edx +; AVX512-NEXT: addw %cx, %dx +; AVX512-NEXT: cmovbl %eax, %edx +; AVX512-NEXT: vpinsrw $7, %edx, %xmm4, %xmm2 +; AVX512-NEXT: vextracti32x4 $2, %zmm1, %xmm3 +; AVX512-NEXT: vpextrw $1, %xmm3, %ecx +; AVX512-NEXT: vextracti32x4 $2, %zmm0, %xmm4 +; AVX512-NEXT: vpextrw $1, %xmm4, %edx +; AVX512-NEXT: addw %cx, %dx +; AVX512-NEXT: cmovbl %eax, %edx +; AVX512-NEXT: vmovd %xmm3, %ecx +; AVX512-NEXT: vmovd %xmm4, %esi +; AVX512-NEXT: addw %cx, %si +; AVX512-NEXT: cmovbl %eax, %esi +; AVX512-NEXT: vmovd %esi, %xmm5 +; AVX512-NEXT: vpinsrw $1, %edx, %xmm5, %xmm5 +; AVX512-NEXT: vpextrw $2, %xmm3, %ecx +; AVX512-NEXT: vpextrw $2, %xmm4, %edx +; AVX512-NEXT: addw %cx, %dx +; AVX512-NEXT: cmovbl %eax, %edx +; AVX512-NEXT: vpinsrw $2, %edx, %xmm5, %xmm5 +; AVX512-NEXT: vpextrw $3, %xmm3, %ecx +; AVX512-NEXT: vpextrw $3, %xmm4, %edx +; AVX512-NEXT: addw %cx, %dx +; AVX512-NEXT: cmovbl %eax, %edx +; AVX512-NEXT: vpinsrw $3, %edx, %xmm5, %xmm5 +; AVX512-NEXT: vpextrw $4, %xmm3, %ecx +; AVX512-NEXT: vpextrw $4, %xmm4, %edx +; AVX512-NEXT: addw %cx, %dx +; AVX512-NEXT: cmovbl %eax, %edx +; AVX512-NEXT: vpinsrw $4, %edx, %xmm5, %xmm5 +; AVX512-NEXT: vpextrw $5, %xmm3, %ecx +; AVX512-NEXT: vpextrw $5, %xmm4, %edx +; AVX512-NEXT: addw %cx, %dx +; AVX512-NEXT: cmovbl %eax, %edx +; AVX512-NEXT: vpinsrw $5, %edx, %xmm5, %xmm5 +; AVX512-NEXT: vpextrw $6, %xmm3, %ecx +; AVX512-NEXT: vpextrw $6, %xmm4, %edx +; AVX512-NEXT: addw %cx, %dx +; AVX512-NEXT: cmovbl %eax, %edx +; AVX512-NEXT: vpinsrw $6, %edx, %xmm5, %xmm5 +; AVX512-NEXT: vpextrw $7, %xmm3, %ecx +; AVX512-NEXT: vpextrw $7, %xmm4, %edx +; AVX512-NEXT: addw %cx, %dx +; AVX512-NEXT: cmovbl %eax, %edx +; AVX512-NEXT: vpinsrw $7, %edx, %xmm5, %xmm3 +; AVX512-NEXT: vinserti128 $1, %xmm2, %ymm3, %ymm2 +; AVX512-NEXT: vextracti128 $1, %ymm1, %xmm3 +; AVX512-NEXT: vpextrw $1, %xmm3, %ecx +; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm4 +; AVX512-NEXT: vpextrw $1, %xmm4, %edx +; AVX512-NEXT: addw %cx, %dx +; AVX512-NEXT: cmovbl %eax, %edx +; AVX512-NEXT: vmovd %xmm3, %ecx +; AVX512-NEXT: vmovd %xmm4, %esi +; AVX512-NEXT: addw %cx, %si +; AVX512-NEXT: cmovbl %eax, %esi +; AVX512-NEXT: vmovd %esi, %xmm5 +; AVX512-NEXT: vpinsrw $1, %edx, %xmm5, %xmm5 +; AVX512-NEXT: vpextrw $2, %xmm3, %ecx +; AVX512-NEXT: vpextrw $2, %xmm4, %edx +; AVX512-NEXT: addw %cx, %dx +; AVX512-NEXT: cmovbl %eax, %edx +; AVX512-NEXT: vpinsrw $2, %edx, %xmm5, %xmm5 +; AVX512-NEXT: vpextrw $3, %xmm3, %ecx +; AVX512-NEXT: vpextrw $3, %xmm4, %edx +; AVX512-NEXT: addw %cx, %dx +; AVX512-NEXT: cmovbl %eax, %edx +; AVX512-NEXT: vpinsrw $3, %edx, %xmm5, %xmm5 +; AVX512-NEXT: vpextrw $4, %xmm3, %ecx +; AVX512-NEXT: vpextrw $4, %xmm4, %edx +; AVX512-NEXT: addw %cx, %dx +; AVX512-NEXT: cmovbl %eax, %edx +; AVX512-NEXT: vpinsrw $4, %edx, %xmm5, %xmm5 +; AVX512-NEXT: vpextrw $5, %xmm3, %ecx +; AVX512-NEXT: vpextrw $5, %xmm4, %edx +; AVX512-NEXT: addw %cx, %dx +; AVX512-NEXT: cmovbl %eax, %edx +; AVX512-NEXT: vpinsrw $5, %edx, %xmm5, %xmm5 +; AVX512-NEXT: vpextrw $6, %xmm3, %ecx +; AVX512-NEXT: vpextrw $6, %xmm4, %edx +; AVX512-NEXT: addw %cx, %dx +; AVX512-NEXT: cmovbl %eax, %edx +; AVX512-NEXT: vpinsrw $6, %edx, %xmm5, %xmm5 +; AVX512-NEXT: vpextrw $7, %xmm3, %ecx +; AVX512-NEXT: vpextrw $7, %xmm4, %edx +; AVX512-NEXT: addw %cx, %dx +; AVX512-NEXT: cmovbl %eax, %edx +; AVX512-NEXT: vpinsrw $7, %edx, %xmm5, %xmm3 +; AVX512-NEXT: vpextrw $1, %xmm1, %ecx +; AVX512-NEXT: vpextrw $1, %xmm0, %edx +; AVX512-NEXT: addw %cx, %dx +; AVX512-NEXT: cmovbl %eax, %edx +; AVX512-NEXT: vmovd %xmm1, 
%ecx +; AVX512-NEXT: vmovd %xmm0, %esi +; AVX512-NEXT: addw %cx, %si +; AVX512-NEXT: cmovbl %eax, %esi +; AVX512-NEXT: vmovd %esi, %xmm4 +; AVX512-NEXT: vpinsrw $1, %edx, %xmm4, %xmm4 +; AVX512-NEXT: vpextrw $2, %xmm1, %ecx +; AVX512-NEXT: vpextrw $2, %xmm0, %edx +; AVX512-NEXT: addw %cx, %dx +; AVX512-NEXT: cmovbl %eax, %edx +; AVX512-NEXT: vpinsrw $2, %edx, %xmm4, %xmm4 +; AVX512-NEXT: vpextrw $3, %xmm1, %ecx +; AVX512-NEXT: vpextrw $3, %xmm0, %edx +; AVX512-NEXT: addw %cx, %dx +; AVX512-NEXT: cmovbl %eax, %edx +; AVX512-NEXT: vpinsrw $3, %edx, %xmm4, %xmm4 +; AVX512-NEXT: vpextrw $4, %xmm1, %ecx +; AVX512-NEXT: vpextrw $4, %xmm0, %edx +; AVX512-NEXT: addw %cx, %dx +; AVX512-NEXT: cmovbl %eax, %edx +; AVX512-NEXT: vpinsrw $4, %edx, %xmm4, %xmm4 +; AVX512-NEXT: vpextrw $5, %xmm1, %ecx +; AVX512-NEXT: vpextrw $5, %xmm0, %edx +; AVX512-NEXT: addw %cx, %dx +; AVX512-NEXT: cmovbl %eax, %edx +; AVX512-NEXT: vpinsrw $5, %edx, %xmm4, %xmm4 +; AVX512-NEXT: vpextrw $6, %xmm1, %ecx +; AVX512-NEXT: vpextrw $6, %xmm0, %edx +; AVX512-NEXT: addw %cx, %dx +; AVX512-NEXT: cmovbl %eax, %edx +; AVX512-NEXT: vpinsrw $6, %edx, %xmm4, %xmm4 +; AVX512-NEXT: vpextrw $7, %xmm1, %ecx +; AVX512-NEXT: vpextrw $7, %xmm0, %edx +; AVX512-NEXT: addw %cx, %dx +; AVX512-NEXT: cmovbl %eax, %edx +; AVX512-NEXT: vpinsrw $7, %edx, %xmm4, %xmm0 +; AVX512-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm0 +; AVX512-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 +; AVX512-NEXT: retq + %z = call <32 x i16> @llvm.uadd.sat.v32i16(<32 x i16> %x, <32 x i16> %y) + ret <32 x i16> %z +} + +; Too narrow vectors, legalized by widening. + +define void @v8i8(<8 x i8>* %px, <8 x i8>* %py, <8 x i8>* %pz) nounwind { +; SSE2-LABEL: v8i8: +; SSE2: # %bb.0: +; SSE2-NEXT: movq {{.*#+}} xmm2 = mem[0],zero +; SSE2-NEXT: movq {{.*#+}} xmm3 = mem[0],zero +; SSE2-NEXT: pxor %xmm0, %xmm0 +; SSE2-NEXT: pxor %xmm1, %xmm1 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3],xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7] +; SSE2-NEXT: pextrw $7, %xmm1, %eax +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] +; SSE2-NEXT: pextrw $7, %xmm0, %ecx +; SSE2-NEXT: addw %ax, %cx +; SSE2-NEXT: movl $65535, %eax # imm = 0xFFFF +; SSE2-NEXT: cmovbl %eax, %ecx +; SSE2-NEXT: movd %ecx, %xmm2 +; SSE2-NEXT: pextrw $6, %xmm1, %ecx +; SSE2-NEXT: pextrw $6, %xmm0, %esi +; SSE2-NEXT: addw %cx, %si +; SSE2-NEXT: cmovbl %eax, %esi +; SSE2-NEXT: movd %esi, %xmm3 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; SSE2-NEXT: pextrw $5, %xmm1, %ecx +; SSE2-NEXT: pextrw $5, %xmm0, %esi +; SSE2-NEXT: addw %cx, %si +; SSE2-NEXT: cmovbl %eax, %esi +; SSE2-NEXT: movd %esi, %xmm4 +; SSE2-NEXT: pextrw $4, %xmm1, %ecx +; SSE2-NEXT: pextrw $4, %xmm0, %esi +; SSE2-NEXT: addw %cx, %si +; SSE2-NEXT: cmovbl %eax, %esi +; SSE2-NEXT: movd %esi, %xmm2 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] +; SSE2-NEXT: pextrw $3, %xmm1, %ecx +; SSE2-NEXT: pextrw $3, %xmm0, %esi +; SSE2-NEXT: addw %cx, %si +; SSE2-NEXT: cmovbl %eax, %esi +; SSE2-NEXT: movd %esi, %xmm3 +; SSE2-NEXT: pextrw $2, %xmm1, %ecx +; SSE2-NEXT: pextrw $2, %xmm0, %esi +; SSE2-NEXT: addw %cx, %si +; SSE2-NEXT: cmovbl %eax, %esi +; SSE2-NEXT: movd %esi, %xmm4 +; SSE2-NEXT: punpcklwd 
{{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] +; SSE2-NEXT: pextrw $1, %xmm1, %ecx +; SSE2-NEXT: pextrw $1, %xmm0, %esi +; SSE2-NEXT: addw %cx, %si +; SSE2-NEXT: cmovbl %eax, %esi +; SSE2-NEXT: movd %esi, %xmm3 +; SSE2-NEXT: movd %xmm1, %ecx +; SSE2-NEXT: movd %xmm0, %esi +; SSE2-NEXT: addw %cx, %si +; SSE2-NEXT: cmovbl %eax, %esi +; SSE2-NEXT: movd %esi, %xmm0 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; SSE2-NEXT: psrlw $8, %xmm0 +; SSE2-NEXT: packuswb %xmm0, %xmm0 +; SSE2-NEXT: movq %xmm0, (%rdx) +; SSE2-NEXT: retq +; +; SSSE3-LABEL: v8i8: +; SSSE3: # %bb.0: +; SSSE3-NEXT: movq {{.*#+}} xmm2 = mem[0],zero +; SSSE3-NEXT: movq {{.*#+}} xmm3 = mem[0],zero +; SSSE3-NEXT: pxor %xmm0, %xmm0 +; SSSE3-NEXT: pxor %xmm1, %xmm1 +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3],xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7] +; SSSE3-NEXT: pextrw $7, %xmm1, %eax +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] +; SSSE3-NEXT: pextrw $7, %xmm0, %ecx +; SSSE3-NEXT: addw %ax, %cx +; SSSE3-NEXT: movl $65535, %eax # imm = 0xFFFF +; SSSE3-NEXT: cmovbl %eax, %ecx +; SSSE3-NEXT: movd %ecx, %xmm2 +; SSSE3-NEXT: pextrw $6, %xmm1, %ecx +; SSSE3-NEXT: pextrw $6, %xmm0, %esi +; SSSE3-NEXT: addw %cx, %si +; SSSE3-NEXT: cmovbl %eax, %esi +; SSSE3-NEXT: movd %esi, %xmm3 +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; SSSE3-NEXT: pextrw $5, %xmm1, %ecx +; SSSE3-NEXT: pextrw $5, %xmm0, %esi +; SSSE3-NEXT: addw %cx, %si +; SSSE3-NEXT: cmovbl %eax, %esi +; SSSE3-NEXT: movd %esi, %xmm4 +; SSSE3-NEXT: pextrw $4, %xmm1, %ecx +; SSSE3-NEXT: pextrw $4, %xmm0, %esi +; SSSE3-NEXT: addw %cx, %si +; SSSE3-NEXT: cmovbl %eax, %esi +; SSSE3-NEXT: movd %esi, %xmm2 +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3] +; SSSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] +; SSSE3-NEXT: pextrw $3, %xmm1, %ecx +; SSSE3-NEXT: pextrw $3, %xmm0, %esi +; SSSE3-NEXT: addw %cx, %si +; SSSE3-NEXT: cmovbl %eax, %esi +; SSSE3-NEXT: movd %esi, %xmm3 +; SSSE3-NEXT: pextrw $2, %xmm1, %ecx +; SSSE3-NEXT: pextrw $2, %xmm0, %esi +; SSSE3-NEXT: addw %cx, %si +; SSSE3-NEXT: cmovbl %eax, %esi +; SSSE3-NEXT: movd %esi, %xmm4 +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] +; SSSE3-NEXT: pextrw $1, %xmm1, %ecx +; SSSE3-NEXT: pextrw $1, %xmm0, %esi +; SSSE3-NEXT: addw %cx, %si +; SSSE3-NEXT: cmovbl %eax, %esi +; SSSE3-NEXT: movd %esi, %xmm3 +; SSSE3-NEXT: movd %xmm1, %ecx +; SSSE3-NEXT: movd %xmm0, %esi +; SSSE3-NEXT: addw %cx, %si +; SSSE3-NEXT: cmovbl %eax, %esi +; SSSE3-NEXT: movd %esi, %xmm0 +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] +; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] +; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; SSSE3-NEXT: psrlw $8, %xmm0 +; SSSE3-NEXT: packuswb %xmm0, %xmm0 +; SSSE3-NEXT: movq %xmm0, (%rdx) +; SSSE3-NEXT: retq +; +; SSE41-LABEL: v8i8: +; SSE41: # %bb.0: +; SSE41-NEXT: movq {{.*#+}} xmm2 = mem[0],zero +; SSE41-NEXT: movq 
{{.*#+}} xmm3 = mem[0],zero +; SSE41-NEXT: pxor %xmm0, %xmm0 +; SSE41-NEXT: pxor %xmm1, %xmm1 +; SSE41-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3],xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7] +; SSE41-NEXT: pextrw $1, %xmm1, %eax +; SSE41-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] +; SSE41-NEXT: pextrw $1, %xmm0, %ecx +; SSE41-NEXT: addw %ax, %cx +; SSE41-NEXT: movl $65535, %eax # imm = 0xFFFF +; SSE41-NEXT: cmovbl %eax, %ecx +; SSE41-NEXT: movd %xmm1, %esi +; SSE41-NEXT: movd %xmm0, %edi +; SSE41-NEXT: addw %si, %di +; SSE41-NEXT: cmovbl %eax, %edi +; SSE41-NEXT: movd %edi, %xmm2 +; SSE41-NEXT: pinsrw $1, %ecx, %xmm2 +; SSE41-NEXT: pextrw $2, %xmm1, %ecx +; SSE41-NEXT: pextrw $2, %xmm0, %esi +; SSE41-NEXT: addw %cx, %si +; SSE41-NEXT: cmovbl %eax, %esi +; SSE41-NEXT: pinsrw $2, %esi, %xmm2 +; SSE41-NEXT: pextrw $3, %xmm1, %ecx +; SSE41-NEXT: pextrw $3, %xmm0, %esi +; SSE41-NEXT: addw %cx, %si +; SSE41-NEXT: cmovbl %eax, %esi +; SSE41-NEXT: pinsrw $3, %esi, %xmm2 +; SSE41-NEXT: pextrw $4, %xmm1, %ecx +; SSE41-NEXT: pextrw $4, %xmm0, %esi +; SSE41-NEXT: addw %cx, %si +; SSE41-NEXT: cmovbl %eax, %esi +; SSE41-NEXT: pinsrw $4, %esi, %xmm2 +; SSE41-NEXT: pextrw $5, %xmm1, %ecx +; SSE41-NEXT: pextrw $5, %xmm0, %esi +; SSE41-NEXT: addw %cx, %si +; SSE41-NEXT: cmovbl %eax, %esi +; SSE41-NEXT: pinsrw $5, %esi, %xmm2 +; SSE41-NEXT: pextrw $6, %xmm1, %ecx +; SSE41-NEXT: pextrw $6, %xmm0, %esi +; SSE41-NEXT: addw %cx, %si +; SSE41-NEXT: cmovbl %eax, %esi +; SSE41-NEXT: pinsrw $6, %esi, %xmm2 +; SSE41-NEXT: pextrw $7, %xmm1, %ecx +; SSE41-NEXT: pextrw $7, %xmm0, %esi +; SSE41-NEXT: addw %cx, %si +; SSE41-NEXT: cmovbl %eax, %esi +; SSE41-NEXT: pinsrw $7, %esi, %xmm2 +; SSE41-NEXT: psrlw $8, %xmm2 +; SSE41-NEXT: packuswb %xmm0, %xmm2 +; SSE41-NEXT: movq %xmm2, (%rdx) +; SSE41-NEXT: retq +; +; AVX1-LABEL: v8i8: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero +; AVX1-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] +; AVX1-NEXT: vpextrw $1, %xmm0, %eax +; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] +; AVX1-NEXT: vpextrw $1, %xmm1, %ecx +; AVX1-NEXT: addw %ax, %cx +; AVX1-NEXT: movl $65535, %eax # imm = 0xFFFF +; AVX1-NEXT: cmovbl %eax, %ecx +; AVX1-NEXT: vmovd %xmm0, %esi +; AVX1-NEXT: vmovd %xmm1, %edi +; AVX1-NEXT: addw %si, %di +; AVX1-NEXT: cmovbl %eax, %edi +; AVX1-NEXT: vmovd %edi, %xmm2 +; AVX1-NEXT: vpinsrw $1, %ecx, %xmm2, %xmm2 +; AVX1-NEXT: vpextrw $2, %xmm0, %ecx +; AVX1-NEXT: vpextrw $2, %xmm1, %esi +; AVX1-NEXT: addw %cx, %si +; AVX1-NEXT: cmovbl %eax, %esi +; AVX1-NEXT: vpinsrw $2, %esi, %xmm2, %xmm2 +; AVX1-NEXT: vpextrw $3, %xmm0, %ecx +; AVX1-NEXT: vpextrw $3, %xmm1, %esi +; AVX1-NEXT: addw %cx, %si +; AVX1-NEXT: cmovbl %eax, %esi +; AVX1-NEXT: vpinsrw $3, %esi, %xmm2, %xmm2 +; AVX1-NEXT: vpextrw $4, %xmm0, %ecx +; AVX1-NEXT: vpextrw $4, %xmm1, %esi +; AVX1-NEXT: addw %cx, %si +; AVX1-NEXT: cmovbl %eax, %esi +; AVX1-NEXT: vpinsrw $4, %esi, %xmm2, %xmm2 +; AVX1-NEXT: vpextrw $5, %xmm0, %ecx +; AVX1-NEXT: vpextrw $5, %xmm1, %esi +; AVX1-NEXT: addw %cx, %si +; AVX1-NEXT: 
cmovbl %eax, %esi +; AVX1-NEXT: vpinsrw $5, %esi, %xmm2, %xmm2 +; AVX1-NEXT: vpextrw $6, %xmm0, %ecx +; AVX1-NEXT: vpextrw $6, %xmm1, %esi +; AVX1-NEXT: addw %cx, %si +; AVX1-NEXT: cmovbl %eax, %esi +; AVX1-NEXT: vpinsrw $6, %esi, %xmm2, %xmm2 +; AVX1-NEXT: vpextrw $7, %xmm0, %ecx +; AVX1-NEXT: vpextrw $7, %xmm1, %esi +; AVX1-NEXT: addw %cx, %si +; AVX1-NEXT: cmovbl %eax, %esi +; AVX1-NEXT: vpinsrw $7, %esi, %xmm2, %xmm0 +; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm0 +; AVX1-NEXT: vpackuswb %xmm0, %xmm0, %xmm0 +; AVX1-NEXT: vmovq %xmm0, (%rdx) +; AVX1-NEXT: retq +; +; AVX2-LABEL: v8i8: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero +; AVX2-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] +; AVX2-NEXT: vpextrw $1, %xmm0, %eax +; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] +; AVX2-NEXT: vpextrw $1, %xmm1, %ecx +; AVX2-NEXT: addw %ax, %cx +; AVX2-NEXT: movl $65535, %eax # imm = 0xFFFF +; AVX2-NEXT: cmovbl %eax, %ecx +; AVX2-NEXT: vmovd %xmm0, %esi +; AVX2-NEXT: vmovd %xmm1, %edi +; AVX2-NEXT: addw %si, %di +; AVX2-NEXT: cmovbl %eax, %edi +; AVX2-NEXT: vmovd %edi, %xmm2 +; AVX2-NEXT: vpinsrw $1, %ecx, %xmm2, %xmm2 +; AVX2-NEXT: vpextrw $2, %xmm0, %ecx +; AVX2-NEXT: vpextrw $2, %xmm1, %esi +; AVX2-NEXT: addw %cx, %si +; AVX2-NEXT: cmovbl %eax, %esi +; AVX2-NEXT: vpinsrw $2, %esi, %xmm2, %xmm2 +; AVX2-NEXT: vpextrw $3, %xmm0, %ecx +; AVX2-NEXT: vpextrw $3, %xmm1, %esi +; AVX2-NEXT: addw %cx, %si +; AVX2-NEXT: cmovbl %eax, %esi +; AVX2-NEXT: vpinsrw $3, %esi, %xmm2, %xmm2 +; AVX2-NEXT: vpextrw $4, %xmm0, %ecx +; AVX2-NEXT: vpextrw $4, %xmm1, %esi +; AVX2-NEXT: addw %cx, %si +; AVX2-NEXT: cmovbl %eax, %esi +; AVX2-NEXT: vpinsrw $4, %esi, %xmm2, %xmm2 +; AVX2-NEXT: vpextrw $5, %xmm0, %ecx +; AVX2-NEXT: vpextrw $5, %xmm1, %esi +; AVX2-NEXT: addw %cx, %si +; AVX2-NEXT: cmovbl %eax, %esi +; AVX2-NEXT: vpinsrw $5, %esi, %xmm2, %xmm2 +; AVX2-NEXT: vpextrw $6, %xmm0, %ecx +; AVX2-NEXT: vpextrw $6, %xmm1, %esi +; AVX2-NEXT: addw %cx, %si +; AVX2-NEXT: cmovbl %eax, %esi +; AVX2-NEXT: vpinsrw $6, %esi, %xmm2, %xmm2 +; AVX2-NEXT: vpextrw $7, %xmm0, %ecx +; AVX2-NEXT: vpextrw $7, %xmm1, %esi +; AVX2-NEXT: addw %cx, %si +; AVX2-NEXT: cmovbl %eax, %esi +; AVX2-NEXT: vpinsrw $7, %esi, %xmm2, %xmm0 +; AVX2-NEXT: vpsrlw $8, %xmm0, %xmm0 +; AVX2-NEXT: vpackuswb %xmm0, %xmm0, %xmm0 +; AVX2-NEXT: vmovq %xmm0, (%rdx) +; AVX2-NEXT: retq +; +; AVX512-LABEL: v8i8: +; AVX512: # %bb.0: +; AVX512-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero +; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; AVX512-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] +; AVX512-NEXT: vpextrw $1, %xmm0, %eax +; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] +; AVX512-NEXT: vpextrw $1, %xmm1, %ecx +; AVX512-NEXT: addw %ax, %cx +; AVX512-NEXT: movl $65535, %eax # imm = 0xFFFF +; AVX512-NEXT: cmovbl %eax, %ecx +; AVX512-NEXT: vmovd %xmm0, %esi +; AVX512-NEXT: vmovd %xmm1, %edi +; AVX512-NEXT: addw %si, %di +; AVX512-NEXT: cmovbl %eax, %edi +; AVX512-NEXT: 
vmovd %edi, %xmm2 +; AVX512-NEXT: vpinsrw $1, %ecx, %xmm2, %xmm2 +; AVX512-NEXT: vpextrw $2, %xmm0, %ecx +; AVX512-NEXT: vpextrw $2, %xmm1, %esi +; AVX512-NEXT: addw %cx, %si +; AVX512-NEXT: cmovbl %eax, %esi +; AVX512-NEXT: vpinsrw $2, %esi, %xmm2, %xmm2 +; AVX512-NEXT: vpextrw $3, %xmm0, %ecx +; AVX512-NEXT: vpextrw $3, %xmm1, %esi +; AVX512-NEXT: addw %cx, %si +; AVX512-NEXT: cmovbl %eax, %esi +; AVX512-NEXT: vpinsrw $3, %esi, %xmm2, %xmm2 +; AVX512-NEXT: vpextrw $4, %xmm0, %ecx +; AVX512-NEXT: vpextrw $4, %xmm1, %esi +; AVX512-NEXT: addw %cx, %si +; AVX512-NEXT: cmovbl %eax, %esi +; AVX512-NEXT: vpinsrw $4, %esi, %xmm2, %xmm2 +; AVX512-NEXT: vpextrw $5, %xmm0, %ecx +; AVX512-NEXT: vpextrw $5, %xmm1, %esi +; AVX512-NEXT: addw %cx, %si +; AVX512-NEXT: cmovbl %eax, %esi +; AVX512-NEXT: vpinsrw $5, %esi, %xmm2, %xmm2 +; AVX512-NEXT: vpextrw $6, %xmm0, %ecx +; AVX512-NEXT: vpextrw $6, %xmm1, %esi +; AVX512-NEXT: addw %cx, %si +; AVX512-NEXT: cmovbl %eax, %esi +; AVX512-NEXT: vpinsrw $6, %esi, %xmm2, %xmm2 +; AVX512-NEXT: vpextrw $7, %xmm0, %ecx +; AVX512-NEXT: vpextrw $7, %xmm1, %esi +; AVX512-NEXT: addw %cx, %si +; AVX512-NEXT: cmovbl %eax, %esi +; AVX512-NEXT: vpinsrw $7, %esi, %xmm2, %xmm0 +; AVX512-NEXT: vpsrlw $8, %xmm0, %xmm0 +; AVX512-NEXT: vpmovwb %xmm0, (%rdx) +; AVX512-NEXT: retq + %x = load <8 x i8>, <8 x i8>* %px + %y = load <8 x i8>, <8 x i8>* %py + %z = call <8 x i8> @llvm.uadd.sat.v8i8(<8 x i8> %x, <8 x i8> %y) + store <8 x i8> %z, <8 x i8>* %pz + ret void +} + +define void @v4i8(<4 x i8>* %px, <4 x i8>* %py, <4 x i8>* %pz) nounwind { +; SSE2-LABEL: v4i8: +; SSE2: # %bb.0: +; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] +; SSE2-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; SSE2-NEXT: pslld $24, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[3,1,2,3] +; SSE2-NEXT: movd %xmm2, %eax +; SSE2-NEXT: pslld $24, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[3,1,2,3] +; SSE2-NEXT: movd %xmm2, %ecx +; SSE2-NEXT: addl %eax, %ecx +; SSE2-NEXT: movl $-1, %eax +; SSE2-NEXT: cmovbl %eax, %ecx +; SSE2-NEXT: movd %ecx, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,3,0,1] +; SSE2-NEXT: movd %xmm3, %ecx +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,0,1] +; SSE2-NEXT: movd %xmm3, %esi +; SSE2-NEXT: addl %ecx, %esi +; SSE2-NEXT: cmovbl %eax, %esi +; SSE2-NEXT: movd %esi, %xmm3 +; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; SSE2-NEXT: movd %xmm1, %ecx +; SSE2-NEXT: movd %xmm0, %esi +; SSE2-NEXT: addl %ecx, %esi +; SSE2-NEXT: cmovbl %eax, %esi +; SSE2-NEXT: movd %esi, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,2,3] +; SSE2-NEXT: movd %xmm1, %ecx +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] +; SSE2-NEXT: movd %xmm0, %esi +; SSE2-NEXT: addl %ecx, %esi +; SSE2-NEXT: cmovbl %eax, %esi +; SSE2-NEXT: movd %esi, %xmm0 +; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0] +; SSE2-NEXT: psrld $24, %xmm2 +; SSE2-NEXT: packuswb %xmm2, %xmm2 +; SSE2-NEXT: packuswb %xmm2, %xmm2 +; SSE2-NEXT: movd %xmm2, (%rdx) +; SSE2-NEXT: retq +; +; SSSE3-LABEL: v4i8: +; SSSE3: # 
%bb.0: +; SSSE3-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; SSSE3-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,0,255,255,255,1,255,255,255,2,255,255,255,3] +; SSSE3-NEXT: pshufb %xmm2, %xmm1 +; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm1[3,1,2,3] +; SSSE3-NEXT: movd %xmm3, %eax +; SSSE3-NEXT: pshufb %xmm2, %xmm0 +; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm0[3,1,2,3] +; SSSE3-NEXT: movd %xmm2, %ecx +; SSSE3-NEXT: addl %eax, %ecx +; SSSE3-NEXT: movl $-1, %eax +; SSSE3-NEXT: cmovbl %eax, %ecx +; SSSE3-NEXT: movd %ecx, %xmm2 +; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,3,0,1] +; SSSE3-NEXT: movd %xmm3, %ecx +; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,0,1] +; SSSE3-NEXT: movd %xmm3, %esi +; SSSE3-NEXT: addl %ecx, %esi +; SSSE3-NEXT: cmovbl %eax, %esi +; SSSE3-NEXT: movd %esi, %xmm3 +; SSSE3-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; SSSE3-NEXT: movd %xmm1, %ecx +; SSSE3-NEXT: movd %xmm0, %esi +; SSSE3-NEXT: addl %ecx, %esi +; SSSE3-NEXT: cmovbl %eax, %esi +; SSSE3-NEXT: movd %esi, %xmm2 +; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,2,3] +; SSSE3-NEXT: movd %xmm1, %ecx +; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] +; SSSE3-NEXT: movd %xmm0, %esi +; SSSE3-NEXT: addl %ecx, %esi +; SSSE3-NEXT: cmovbl %eax, %esi +; SSSE3-NEXT: movd %esi, %xmm0 +; SSSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] +; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0] +; SSSE3-NEXT: pshufb {{.*#+}} xmm2 = xmm2[3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u] +; SSSE3-NEXT: movd %xmm2, (%rdx) +; SSSE3-NEXT: retq +; +; SSE41-LABEL: v4i8: +; SSE41: # %bb.0: +; SSE41-NEXT: pmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; SSE41-NEXT: pmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; SSE41-NEXT: pslld $24, %xmm1 +; SSE41-NEXT: pextrd $1, %xmm1, %eax +; SSE41-NEXT: pslld $24, %xmm0 +; SSE41-NEXT: pextrd $1, %xmm0, %ecx +; SSE41-NEXT: addl %eax, %ecx +; SSE41-NEXT: movl $-1, %eax +; SSE41-NEXT: cmovbl %eax, %ecx +; SSE41-NEXT: movd %xmm1, %esi +; SSE41-NEXT: movd %xmm0, %edi +; SSE41-NEXT: addl %esi, %edi +; SSE41-NEXT: cmovbl %eax, %edi +; SSE41-NEXT: movd %edi, %xmm2 +; SSE41-NEXT: pinsrd $1, %ecx, %xmm2 +; SSE41-NEXT: pextrd $2, %xmm1, %ecx +; SSE41-NEXT: pextrd $2, %xmm0, %esi +; SSE41-NEXT: addl %ecx, %esi +; SSE41-NEXT: cmovbl %eax, %esi +; SSE41-NEXT: pinsrd $2, %esi, %xmm2 +; SSE41-NEXT: pextrd $3, %xmm1, %ecx +; SSE41-NEXT: pextrd $3, %xmm0, %esi +; SSE41-NEXT: addl %ecx, %esi +; SSE41-NEXT: cmovbl %eax, %esi +; SSE41-NEXT: pinsrd $3, %esi, %xmm2 +; SSE41-NEXT: pshufb {{.*#+}} xmm2 = xmm2[3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u] +; SSE41-NEXT: movd %xmm2, (%rdx) +; SSE41-NEXT: retq +; +; AVX1-LABEL: v4i8: +; AVX1: # %bb.0: +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpslld $24, %xmm1, %xmm1 +; AVX1-NEXT: vpextrd $1, %xmm1, %eax +; AVX1-NEXT: vpslld $24, %xmm0, %xmm0 +; AVX1-NEXT: vpextrd $1, %xmm0, %ecx +; AVX1-NEXT: addl %eax, %ecx +; AVX1-NEXT: movl $-1, %eax +; AVX1-NEXT: cmovbl %eax, %ecx +; AVX1-NEXT: vmovd %xmm1, %esi +; AVX1-NEXT: vmovd %xmm0, %edi +; AVX1-NEXT: addl %esi, %edi +; AVX1-NEXT: cmovbl %eax, %edi +; AVX1-NEXT: vmovd %edi, %xmm2 +; AVX1-NEXT: vpinsrd $1, 
%ecx, %xmm2, %xmm2 +; AVX1-NEXT: vpextrd $2, %xmm1, %ecx +; AVX1-NEXT: vpextrd $2, %xmm0, %esi +; AVX1-NEXT: addl %ecx, %esi +; AVX1-NEXT: cmovbl %eax, %esi +; AVX1-NEXT: vpinsrd $2, %esi, %xmm2, %xmm2 +; AVX1-NEXT: vpextrd $3, %xmm1, %ecx +; AVX1-NEXT: vpextrd $3, %xmm0, %esi +; AVX1-NEXT: addl %ecx, %esi +; AVX1-NEXT: cmovbl %eax, %esi +; AVX1-NEXT: vpinsrd $3, %esi, %xmm2, %xmm0 +; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX1-NEXT: vmovd %xmm0, (%rdx) +; AVX1-NEXT: retq +; +; AVX2-LABEL: v4i8: +; AVX2: # %bb.0: +; AVX2-NEXT: vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX2-NEXT: vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX2-NEXT: vpslld $24, %xmm1, %xmm1 +; AVX2-NEXT: vpextrd $1, %xmm1, %eax +; AVX2-NEXT: vpslld $24, %xmm0, %xmm0 +; AVX2-NEXT: vpextrd $1, %xmm0, %ecx +; AVX2-NEXT: addl %eax, %ecx +; AVX2-NEXT: movl $-1, %eax +; AVX2-NEXT: cmovbl %eax, %ecx +; AVX2-NEXT: vmovd %xmm1, %esi +; AVX2-NEXT: vmovd %xmm0, %edi +; AVX2-NEXT: addl %esi, %edi +; AVX2-NEXT: cmovbl %eax, %edi +; AVX2-NEXT: vmovd %edi, %xmm2 +; AVX2-NEXT: vpinsrd $1, %ecx, %xmm2, %xmm2 +; AVX2-NEXT: vpextrd $2, %xmm1, %ecx +; AVX2-NEXT: vpextrd $2, %xmm0, %esi +; AVX2-NEXT: addl %ecx, %esi +; AVX2-NEXT: cmovbl %eax, %esi +; AVX2-NEXT: vpinsrd $2, %esi, %xmm2, %xmm2 +; AVX2-NEXT: vpextrd $3, %xmm1, %ecx +; AVX2-NEXT: vpextrd $3, %xmm0, %esi +; AVX2-NEXT: addl %ecx, %esi +; AVX2-NEXT: cmovbl %eax, %esi +; AVX2-NEXT: vpinsrd $3, %esi, %xmm2, %xmm0 +; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX2-NEXT: vmovd %xmm0, (%rdx) +; AVX2-NEXT: retq +; +; AVX512-LABEL: v4i8: +; AVX512: # %bb.0: +; AVX512-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX512-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; AVX512-NEXT: vmovdqa {{.*#+}} xmm2 = [255,255,255,0,255,255,255,1,255,255,255,2,255,255,255,3] +; AVX512-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX512-NEXT: vpextrd $1, %xmm1, %eax +; AVX512-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX512-NEXT: vpextrd $1, %xmm0, %ecx +; AVX512-NEXT: addl %eax, %ecx +; AVX512-NEXT: movl $-1, %eax +; AVX512-NEXT: cmovbl %eax, %ecx +; AVX512-NEXT: vmovd %xmm1, %esi +; AVX512-NEXT: vmovd %xmm0, %edi +; AVX512-NEXT: addl %esi, %edi +; AVX512-NEXT: cmovbl %eax, %edi +; AVX512-NEXT: vmovd %edi, %xmm2 +; AVX512-NEXT: vpinsrd $1, %ecx, %xmm2, %xmm2 +; AVX512-NEXT: vpextrd $2, %xmm1, %ecx +; AVX512-NEXT: vpextrd $2, %xmm0, %esi +; AVX512-NEXT: addl %ecx, %esi +; AVX512-NEXT: cmovbl %eax, %esi +; AVX512-NEXT: vpinsrd $2, %esi, %xmm2, %xmm2 +; AVX512-NEXT: vpextrd $3, %xmm1, %ecx +; AVX512-NEXT: vpextrd $3, %xmm0, %esi +; AVX512-NEXT: addl %ecx, %esi +; AVX512-NEXT: cmovbl %eax, %esi +; AVX512-NEXT: vpinsrd $3, %esi, %xmm2, %xmm0 +; AVX512-NEXT: vpsrld $24, %xmm0, %xmm0 +; AVX512-NEXT: vpmovdb %xmm0, (%rdx) +; AVX512-NEXT: retq + %x = load <4 x i8>, <4 x i8>* %px + %y = load <4 x i8>, <4 x i8>* %py + %z = call <4 x i8> @llvm.uadd.sat.v4i8(<4 x i8> %x, <4 x i8> %y) + store <4 x i8> %z, <4 x i8>* %pz + ret void +} + +define void @v2i8(<2 x i8>* %px, <2 x i8>* %py, <2 x i8>* %pz) nounwind { +; SSE2-LABEL: v2i8: +; SSE2: # %bb.0: +; SSE2-NEXT: movzwl (%rdi), %eax +; SSE2-NEXT: movd %eax, %xmm0 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3] +; 
SSE2-NEXT: movzwl (%rsi), %eax +; SSE2-NEXT: movd %eax, %xmm1 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,1,3] +; SSE2-NEXT: psllq $56, %xmm1 +; SSE2-NEXT: movq %xmm1, %rax +; SSE2-NEXT: psllq $56, %xmm0 +; SSE2-NEXT: movq %xmm0, %rcx +; SSE2-NEXT: addq %rax, %rcx +; SSE2-NEXT: movq $-1, %rax +; SSE2-NEXT: cmovbq %rax, %rcx +; SSE2-NEXT: movq %rcx, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] +; SSE2-NEXT: movq %xmm1, %rcx +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; SSE2-NEXT: movq %xmm0, %rsi +; SSE2-NEXT: addq %rcx, %rsi +; SSE2-NEXT: cmovbq %rax, %rsi +; SSE2-NEXT: movq %rsi, %xmm0 +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm0[0] +; SSE2-NEXT: psrlq $56, %xmm2 +; SSE2-NEXT: packuswb %xmm2, %xmm2 +; SSE2-NEXT: packuswb %xmm2, %xmm2 +; SSE2-NEXT: packuswb %xmm2, %xmm2 +; SSE2-NEXT: movd %xmm2, %eax +; SSE2-NEXT: movw %ax, (%rdx) +; SSE2-NEXT: retq +; +; SSSE3-LABEL: v2i8: +; SSSE3: # %bb.0: +; SSSE3-NEXT: movzwl (%rdi), %eax +; SSSE3-NEXT: movd %eax, %xmm0 +; SSSE3-NEXT: movzwl (%rsi), %eax +; SSSE3-NEXT: movd %eax, %xmm1 +; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,1] +; SSSE3-NEXT: pshufb %xmm2, %xmm1 +; SSSE3-NEXT: movq %xmm1, %rax +; SSSE3-NEXT: pshufb %xmm2, %xmm0 +; SSSE3-NEXT: movq %xmm0, %rcx +; SSSE3-NEXT: addq %rax, %rcx +; SSSE3-NEXT: movq $-1, %rax +; SSSE3-NEXT: cmovbq %rax, %rcx +; SSSE3-NEXT: movq %rcx, %xmm2 +; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] +; SSSE3-NEXT: movq %xmm1, %rcx +; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; SSSE3-NEXT: movq %xmm0, %rsi +; SSSE3-NEXT: addq %rcx, %rsi +; SSSE3-NEXT: cmovbq %rax, %rsi +; SSSE3-NEXT: movq %rsi, %xmm0 +; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm0[0] +; SSSE3-NEXT: pshufb {{.*#+}} xmm2 = xmm2[7,15,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; SSSE3-NEXT: movd %xmm2, %eax +; SSSE3-NEXT: movw %ax, (%rdx) +; SSSE3-NEXT: retq +; +; SSE41-LABEL: v2i8: +; SSE41: # %bb.0: +; SSE41-NEXT: pmovzxbq {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero +; SSE41-NEXT: pmovzxbq {{.*#+}} xmm1 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero +; SSE41-NEXT: psllq $56, %xmm1 +; SSE41-NEXT: pextrq $1, %xmm1, %rax +; SSE41-NEXT: psllq $56, %xmm0 +; SSE41-NEXT: pextrq $1, %xmm0, %rcx +; SSE41-NEXT: addq %rax, %rcx +; SSE41-NEXT: movq $-1, %rax +; SSE41-NEXT: cmovbq %rax, %rcx +; SSE41-NEXT: movq %rcx, %xmm2 +; SSE41-NEXT: movq %xmm1, %rcx +; SSE41-NEXT: movq %xmm0, %rsi +; SSE41-NEXT: addq %rcx, %rsi +; SSE41-NEXT: cmovbq %rax, %rsi +; SSE41-NEXT: movq %rsi, %xmm0 +; SSE41-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[7,15,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; SSE41-NEXT: pextrw $0, %xmm0, (%rdx) +; SSE41-NEXT: retq +; +; AVX1-LABEL: v2i8: +; AVX1: # %bb.0: +; AVX1-NEXT: vpmovzxbq {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero +; AVX1-NEXT: vpmovzxbq {{.*#+}} xmm1 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero +; AVX1-NEXT: vpsllq $56, %xmm1, %xmm1 +; AVX1-NEXT: vpextrq $1, %xmm1, %rax +; AVX1-NEXT: vpsllq $56, %xmm0, %xmm0 +; AVX1-NEXT: vpextrq $1, %xmm0, %rcx +; 
AVX1-NEXT: addq %rax, %rcx +; AVX1-NEXT: movq $-1, %rax +; AVX1-NEXT: cmovbq %rax, %rcx +; AVX1-NEXT: vmovq %rcx, %xmm2 +; AVX1-NEXT: vmovq %xmm1, %rcx +; AVX1-NEXT: vmovq %xmm0, %rsi +; AVX1-NEXT: addq %rcx, %rsi +; AVX1-NEXT: cmovbq %rax, %rsi +; AVX1-NEXT: vmovq %rsi, %xmm0 +; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[7,15,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX1-NEXT: vpextrw $0, %xmm0, (%rdx) +; AVX1-NEXT: retq +; +; AVX2-LABEL: v2i8: +; AVX2: # %bb.0: +; AVX2-NEXT: vpmovzxbq {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero +; AVX2-NEXT: vpmovzxbq {{.*#+}} xmm1 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero +; AVX2-NEXT: vpsllq $56, %xmm1, %xmm1 +; AVX2-NEXT: vpextrq $1, %xmm1, %rax +; AVX2-NEXT: vpsllq $56, %xmm0, %xmm0 +; AVX2-NEXT: vpextrq $1, %xmm0, %rcx +; AVX2-NEXT: addq %rax, %rcx +; AVX2-NEXT: movq $-1, %rax +; AVX2-NEXT: cmovbq %rax, %rcx +; AVX2-NEXT: vmovq %rcx, %xmm2 +; AVX2-NEXT: vmovq %xmm1, %rcx +; AVX2-NEXT: vmovq %xmm0, %rsi +; AVX2-NEXT: addq %rcx, %rsi +; AVX2-NEXT: cmovbq %rax, %rsi +; AVX2-NEXT: vmovq %rsi, %xmm0 +; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[7,15,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX2-NEXT: vpextrw $0, %xmm0, (%rdx) +; AVX2-NEXT: retq +; +; AVX512-LABEL: v2i8: +; AVX512: # %bb.0: +; AVX512-NEXT: movzwl (%rdi), %eax +; AVX512-NEXT: vmovd %eax, %xmm0 +; AVX512-NEXT: movzwl (%rsi), %eax +; AVX512-NEXT: vmovd %eax, %xmm1 +; AVX512-NEXT: vmovdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,1] +; AVX512-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX512-NEXT: vpextrq $1, %xmm1, %rax +; AVX512-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX512-NEXT: vpextrq $1, %xmm0, %rcx +; AVX512-NEXT: addq %rax, %rcx +; AVX512-NEXT: movq $-1, %rax +; AVX512-NEXT: cmovbq %rax, %rcx +; AVX512-NEXT: vmovq %rcx, %xmm2 +; AVX512-NEXT: vmovq %xmm1, %rcx +; AVX512-NEXT: vmovq %xmm0, %rsi +; AVX512-NEXT: addq %rcx, %rsi +; AVX512-NEXT: cmovbq %rax, %rsi +; AVX512-NEXT: vmovq %rsi, %xmm0 +; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; AVX512-NEXT: vpsrlq $56, %xmm0, %xmm0 +; AVX512-NEXT: vpmovqb %xmm0, (%rdx) +; AVX512-NEXT: retq + %x = load <2 x i8>, <2 x i8>* %px + %y = load <2 x i8>, <2 x i8>* %py + %z = call <2 x i8> @llvm.uadd.sat.v2i8(<2 x i8> %x, <2 x i8> %y) + store <2 x i8> %z, <2 x i8>* %pz + ret void +} + +define void @v4i16(<4 x i16>* %px, <4 x i16>* %py, <4 x i16>* %pz) nounwind { +; SSE2-LABEL: v4i16: +; SSE2: # %bb.0: +; SSE2-NEXT: movq {{.*#+}} xmm1 = mem[0],zero +; SSE2-NEXT: movq {{.*#+}} xmm2 = mem[0],zero +; SSE2-NEXT: pxor %xmm0, %xmm0 +; SSE2-NEXT: pxor %xmm3, %xmm3 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[3,1,2,3] +; SSE2-NEXT: movd %xmm2, %eax +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[3,1,2,3] +; SSE2-NEXT: movd %xmm1, %ecx +; SSE2-NEXT: addl %eax, %ecx +; SSE2-NEXT: movl $-1, %eax +; SSE2-NEXT: cmovbl %eax, %ecx +; SSE2-NEXT: movd %ecx, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[2,3,0,1] +; SSE2-NEXT: movd %xmm2, %ecx +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1] +; SSE2-NEXT: movd %xmm2, %esi +; SSE2-NEXT: addl %ecx, %esi +; SSE2-NEXT: cmovbl %eax, %esi +; SSE2-NEXT: movd %esi, %xmm2 +; SSE2-NEXT: punpckldq 
{{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; SSE2-NEXT: movd %xmm3, %ecx +; SSE2-NEXT: movd %xmm0, %esi +; SSE2-NEXT: addl %ecx, %esi +; SSE2-NEXT: cmovbl %eax, %esi +; SSE2-NEXT: movd %esi, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,2,3] +; SSE2-NEXT: movd %xmm3, %ecx +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] +; SSE2-NEXT: movd %xmm0, %esi +; SSE2-NEXT: addl %ecx, %esi +; SSE2-NEXT: cmovbl %eax, %esi +; SSE2-NEXT: movd %esi, %xmm0 +; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; SSE2-NEXT: psrld $16, %xmm1 +; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[0,2,2,3,4,5,6,7] +; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; SSE2-NEXT: movq %xmm0, (%rdx) +; SSE2-NEXT: retq +; +; SSSE3-LABEL: v4i16: +; SSSE3: # %bb.0: +; SSSE3-NEXT: movq {{.*#+}} xmm1 = mem[0],zero +; SSSE3-NEXT: movq {{.*#+}} xmm2 = mem[0],zero +; SSSE3-NEXT: pxor %xmm0, %xmm0 +; SSSE3-NEXT: pxor %xmm3, %xmm3 +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm3[3,1,2,3] +; SSSE3-NEXT: movd %xmm2, %eax +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[3,1,2,3] +; SSSE3-NEXT: movd %xmm1, %ecx +; SSSE3-NEXT: addl %eax, %ecx +; SSSE3-NEXT: movl $-1, %eax +; SSSE3-NEXT: cmovbl %eax, %ecx +; SSSE3-NEXT: movd %ecx, %xmm1 +; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm3[2,3,0,1] +; SSSE3-NEXT: movd %xmm2, %ecx +; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1] +; SSSE3-NEXT: movd %xmm2, %esi +; SSSE3-NEXT: addl %ecx, %esi +; SSSE3-NEXT: cmovbl %eax, %esi +; SSSE3-NEXT: movd %esi, %xmm2 +; SSSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; SSSE3-NEXT: movd %xmm3, %ecx +; SSSE3-NEXT: movd %xmm0, %esi +; SSSE3-NEXT: addl %ecx, %esi +; SSSE3-NEXT: cmovbl %eax, %esi +; SSSE3-NEXT: movd %esi, %xmm1 +; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,2,3] +; SSSE3-NEXT: movd %xmm3, %ecx +; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] +; SSSE3-NEXT: movd %xmm0, %esi +; SSSE3-NEXT: addl %ecx, %esi +; SSSE3-NEXT: cmovbl %eax, %esi +; SSSE3-NEXT: movd %esi, %xmm0 +; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; SSSE3-NEXT: pshufb {{.*#+}} xmm1 = xmm1[2,3,6,7,10,11,14,15,10,11,14,15,14,15],zero,zero +; SSSE3-NEXT: movq %xmm1, (%rdx) +; SSSE3-NEXT: retq +; +; SSE41-LABEL: v4i16: +; SSE41: # %bb.0: +; SSE41-NEXT: movq {{.*#+}} xmm0 = mem[0],zero +; SSE41-NEXT: movq {{.*#+}} xmm1 = mem[0],zero +; SSE41-NEXT: pxor %xmm2, %xmm2 +; SSE41-NEXT: pxor %xmm3, %xmm3 +; SSE41-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] +; SSE41-NEXT: pextrd $1, %xmm3, %eax +; SSE41-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] +; SSE41-NEXT: pextrd $1, %xmm2, %ecx +; SSE41-NEXT: addl %eax, %ecx +; SSE41-NEXT: movl $-1, %eax +; SSE41-NEXT: cmovbl %eax, %ecx +; SSE41-NEXT: movd %xmm3, %esi +; SSE41-NEXT: movd %xmm2, %edi +; SSE41-NEXT: addl %esi, %edi +; SSE41-NEXT: cmovbl %eax, %edi +; SSE41-NEXT: movd %edi, %xmm0 +; SSE41-NEXT: pinsrd $1, %ecx, %xmm0 +; SSE41-NEXT: pextrd $2, %xmm3, %ecx +; SSE41-NEXT: pextrd $2, %xmm2, %esi +; SSE41-NEXT: addl %ecx, %esi +; SSE41-NEXT: cmovbl %eax, %esi +; SSE41-NEXT: pinsrd $2, %esi, %xmm0 
+; SSE41-NEXT: pextrd $3, %xmm3, %ecx +; SSE41-NEXT: pextrd $3, %xmm2, %esi +; SSE41-NEXT: addl %ecx, %esi +; SSE41-NEXT: cmovbl %eax, %esi +; SSE41-NEXT: pinsrd $3, %esi, %xmm0 +; SSE41-NEXT: psrld $16, %xmm0 +; SSE41-NEXT: packusdw %xmm0, %xmm0 +; SSE41-NEXT: movq %xmm0, (%rdx) +; SSE41-NEXT: retq +; +; AVX1-LABEL: v4i16: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; AVX1-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero +; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; AVX1-NEXT: vpextrd $1, %xmm1, %eax +; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] +; AVX1-NEXT: vpextrd $1, %xmm0, %ecx +; AVX1-NEXT: addl %eax, %ecx +; AVX1-NEXT: movl $-1, %eax +; AVX1-NEXT: cmovbl %eax, %ecx +; AVX1-NEXT: vmovd %xmm1, %esi +; AVX1-NEXT: vmovd %xmm0, %edi +; AVX1-NEXT: addl %esi, %edi +; AVX1-NEXT: cmovbl %eax, %edi +; AVX1-NEXT: vmovd %edi, %xmm2 +; AVX1-NEXT: vpinsrd $1, %ecx, %xmm2, %xmm2 +; AVX1-NEXT: vpextrd $2, %xmm1, %ecx +; AVX1-NEXT: vpextrd $2, %xmm0, %esi +; AVX1-NEXT: addl %ecx, %esi +; AVX1-NEXT: cmovbl %eax, %esi +; AVX1-NEXT: vpinsrd $2, %esi, %xmm2, %xmm2 +; AVX1-NEXT: vpextrd $3, %xmm1, %ecx +; AVX1-NEXT: vpextrd $3, %xmm0, %esi +; AVX1-NEXT: addl %ecx, %esi +; AVX1-NEXT: cmovbl %eax, %esi +; AVX1-NEXT: vpinsrd $3, %esi, %xmm2, %xmm0 +; AVX1-NEXT: vpsrld $16, %xmm0, %xmm0 +; AVX1-NEXT: vpackusdw %xmm0, %xmm0, %xmm0 +; AVX1-NEXT: vmovq %xmm0, (%rdx) +; AVX1-NEXT: retq +; +; AVX2-LABEL: v4i16: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; AVX2-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero +; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; AVX2-NEXT: vpextrd $1, %xmm1, %eax +; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] +; AVX2-NEXT: vpextrd $1, %xmm0, %ecx +; AVX2-NEXT: addl %eax, %ecx +; AVX2-NEXT: movl $-1, %eax +; AVX2-NEXT: cmovbl %eax, %ecx +; AVX2-NEXT: vmovd %xmm1, %esi +; AVX2-NEXT: vmovd %xmm0, %edi +; AVX2-NEXT: addl %esi, %edi +; AVX2-NEXT: cmovbl %eax, %edi +; AVX2-NEXT: vmovd %edi, %xmm2 +; AVX2-NEXT: vpinsrd $1, %ecx, %xmm2, %xmm2 +; AVX2-NEXT: vpextrd $2, %xmm1, %ecx +; AVX2-NEXT: vpextrd $2, %xmm0, %esi +; AVX2-NEXT: addl %ecx, %esi +; AVX2-NEXT: cmovbl %eax, %esi +; AVX2-NEXT: vpinsrd $2, %esi, %xmm2, %xmm2 +; AVX2-NEXT: vpextrd $3, %xmm1, %ecx +; AVX2-NEXT: vpextrd $3, %xmm0, %esi +; AVX2-NEXT: addl %ecx, %esi +; AVX2-NEXT: cmovbl %eax, %esi +; AVX2-NEXT: vpinsrd $3, %esi, %xmm2, %xmm0 +; AVX2-NEXT: vpsrld $16, %xmm0, %xmm0 +; AVX2-NEXT: vpackusdw %xmm0, %xmm0, %xmm0 +; AVX2-NEXT: vmovq %xmm0, (%rdx) +; AVX2-NEXT: retq +; +; AVX512-LABEL: v4i16: +; AVX512: # %bb.0: +; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; AVX512-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero +; AVX512-NEXT: vmovdqa {{.*#+}} xmm2 = [255,255,0,1,255,255,2,3,255,255,4,5,255,255,6,7] +; AVX512-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX512-NEXT: vpextrd $1, %xmm1, %eax +; AVX512-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX512-NEXT: vpextrd $1, %xmm0, %ecx +; AVX512-NEXT: addl %eax, %ecx +; AVX512-NEXT: movl $-1, %eax +; AVX512-NEXT: cmovbl %eax, %ecx +; AVX512-NEXT: vmovd %xmm1, %esi +; AVX512-NEXT: vmovd %xmm0, %edi +; AVX512-NEXT: addl %esi, %edi +; AVX512-NEXT: cmovbl %eax, %edi +; AVX512-NEXT: vmovd %edi, %xmm2 +; AVX512-NEXT: vpinsrd $1, %ecx, %xmm2, %xmm2 +; 
AVX512-NEXT: vpextrd $2, %xmm1, %ecx +; AVX512-NEXT: vpextrd $2, %xmm0, %esi +; AVX512-NEXT: addl %ecx, %esi +; AVX512-NEXT: cmovbl %eax, %esi +; AVX512-NEXT: vpinsrd $2, %esi, %xmm2, %xmm2 +; AVX512-NEXT: vpextrd $3, %xmm1, %ecx +; AVX512-NEXT: vpextrd $3, %xmm0, %esi +; AVX512-NEXT: addl %ecx, %esi +; AVX512-NEXT: cmovbl %eax, %esi +; AVX512-NEXT: vpinsrd $3, %esi, %xmm2, %xmm0 +; AVX512-NEXT: vpsrld $16, %xmm0, %xmm0 +; AVX512-NEXT: vpmovdw %xmm0, (%rdx) +; AVX512-NEXT: retq + %x = load <4 x i16>, <4 x i16>* %px + %y = load <4 x i16>, <4 x i16>* %py + %z = call <4 x i16> @llvm.uadd.sat.v4i16(<4 x i16> %x, <4 x i16> %y) + store <4 x i16> %z, <4 x i16>* %pz + ret void +} + +define void @v2i16(<2 x i16>* %px, <2 x i16>* %py, <2 x i16>* %pz) nounwind { +; SSE2-LABEL: v2i16: +; SSE2: # %bb.0: +; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3] +; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,6,7] +; SSE2-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,3] +; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,6,7] +; SSE2-NEXT: psllq $48, %xmm1 +; SSE2-NEXT: movq %xmm1, %rax +; SSE2-NEXT: psllq $48, %xmm0 +; SSE2-NEXT: movq %xmm0, %rcx +; SSE2-NEXT: addq %rax, %rcx +; SSE2-NEXT: movq $-1, %rax +; SSE2-NEXT: cmovbq %rax, %rcx +; SSE2-NEXT: movq %rcx, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] +; SSE2-NEXT: movq %xmm1, %rcx +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; SSE2-NEXT: movq %xmm0, %rsi +; SSE2-NEXT: addq %rcx, %rsi +; SSE2-NEXT: cmovbq %rax, %rsi +; SSE2-NEXT: movq %rsi, %xmm0 +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm0[0] +; SSE2-NEXT: psrlq $48, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3] +; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] +; SSE2-NEXT: movd %xmm0, (%rdx) +; SSE2-NEXT: retq +; +; SSSE3-LABEL: v2i16: +; SSSE3: # %bb.0: +; SSSE3-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; SSSE3-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,0,1,255,255,255,255,255,255,2,3] +; SSSE3-NEXT: pshufb %xmm2, %xmm1 +; SSSE3-NEXT: movq %xmm1, %rax +; SSSE3-NEXT: pshufb %xmm2, %xmm0 +; SSSE3-NEXT: movq %xmm0, %rcx +; SSSE3-NEXT: addq %rax, %rcx +; SSSE3-NEXT: movq $-1, %rax +; SSSE3-NEXT: cmovbq %rax, %rcx +; SSSE3-NEXT: movq %rcx, %xmm2 +; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] +; SSSE3-NEXT: movq %xmm1, %rcx +; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; SSSE3-NEXT: movq %xmm0, %rsi +; SSSE3-NEXT: addq %rcx, %rsi +; SSSE3-NEXT: cmovbq %rax, %rsi +; SSSE3-NEXT: movq %rsi, %xmm0 +; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm0[0] +; SSSE3-NEXT: pshufb {{.*#+}} xmm2 = xmm2[6,7,14,15,14,15],zero,zero,xmm2[14,15],zero,zero,zero,zero,zero,zero +; SSSE3-NEXT: movd %xmm2, (%rdx) +; SSSE3-NEXT: retq +; +; SSE41-LABEL: v2i16: +; SSE41: # %bb.0: +; SSE41-NEXT: pmovzxwq {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero +; SSE41-NEXT: pmovzxwq {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero +; SSE41-NEXT: psllq $48, %xmm1 +; SSE41-NEXT: pextrq $1, %xmm1, %rax +; SSE41-NEXT: psllq $48, %xmm0 +; SSE41-NEXT: pextrq $1, %xmm0, %rcx +; SSE41-NEXT: addq %rax, %rcx +; SSE41-NEXT: movq $-1, %rax +; SSE41-NEXT: cmovbq %rax, %rcx +; SSE41-NEXT: movq %rcx, %xmm2 +; SSE41-NEXT: movq %xmm1, %rcx +; SSE41-NEXT: movq %xmm0, %rsi +; SSE41-NEXT: addq %rcx, %rsi +; SSE41-NEXT: cmovbq %rax, %rsi +; SSE41-NEXT: movq %rsi, %xmm0 +; SSE41-NEXT: 
punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[6,7,14,15,14,15],zero,zero,xmm0[14,15],zero,zero,zero,zero,zero,zero +; SSE41-NEXT: movd %xmm0, (%rdx) +; SSE41-NEXT: retq +; +; AVX1-LABEL: v2i16: +; AVX1: # %bb.0: +; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero +; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero +; AVX1-NEXT: vpsllq $48, %xmm1, %xmm1 +; AVX1-NEXT: vpextrq $1, %xmm1, %rax +; AVX1-NEXT: vpsllq $48, %xmm0, %xmm0 +; AVX1-NEXT: vpextrq $1, %xmm0, %rcx +; AVX1-NEXT: addq %rax, %rcx +; AVX1-NEXT: movq $-1, %rax +; AVX1-NEXT: cmovbq %rax, %rcx +; AVX1-NEXT: vmovq %rcx, %xmm2 +; AVX1-NEXT: vmovq %xmm1, %rcx +; AVX1-NEXT: vmovq %xmm0, %rsi +; AVX1-NEXT: addq %rcx, %rsi +; AVX1-NEXT: cmovbq %rax, %rsi +; AVX1-NEXT: vmovq %rsi, %xmm0 +; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[6,7,14,15,14,15],zero,zero,xmm0[14,15],zero,zero,zero,zero,zero,zero +; AVX1-NEXT: vmovd %xmm0, (%rdx) +; AVX1-NEXT: retq +; +; AVX2-LABEL: v2i16: +; AVX2: # %bb.0: +; AVX2-NEXT: vpmovzxwq {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero +; AVX2-NEXT: vpmovzxwq {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero +; AVX2-NEXT: vpsllq $48, %xmm1, %xmm1 +; AVX2-NEXT: vpextrq $1, %xmm1, %rax +; AVX2-NEXT: vpsllq $48, %xmm0, %xmm0 +; AVX2-NEXT: vpextrq $1, %xmm0, %rcx +; AVX2-NEXT: addq %rax, %rcx +; AVX2-NEXT: movq $-1, %rax +; AVX2-NEXT: cmovbq %rax, %rcx +; AVX2-NEXT: vmovq %rcx, %xmm2 +; AVX2-NEXT: vmovq %xmm1, %rcx +; AVX2-NEXT: vmovq %xmm0, %rsi +; AVX2-NEXT: addq %rcx, %rsi +; AVX2-NEXT: cmovbq %rax, %rsi +; AVX2-NEXT: vmovq %rsi, %xmm0 +; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[6,7,14,15,14,15],zero,zero,xmm0[14,15],zero,zero,zero,zero,zero,zero +; AVX2-NEXT: vmovd %xmm0, (%rdx) +; AVX2-NEXT: retq +; +; AVX512-LABEL: v2i16: +; AVX512: # %bb.0: +; AVX512-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX512-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; AVX512-NEXT: vmovdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,0,1,255,255,255,255,255,255,2,3] +; AVX512-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX512-NEXT: vpextrq $1, %xmm1, %rax +; AVX512-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX512-NEXT: vpextrq $1, %xmm0, %rcx +; AVX512-NEXT: addq %rax, %rcx +; AVX512-NEXT: movq $-1, %rax +; AVX512-NEXT: cmovbq %rax, %rcx +; AVX512-NEXT: vmovq %rcx, %xmm2 +; AVX512-NEXT: vmovq %xmm1, %rcx +; AVX512-NEXT: vmovq %xmm0, %rsi +; AVX512-NEXT: addq %rcx, %rsi +; AVX512-NEXT: cmovbq %rax, %rsi +; AVX512-NEXT: vmovq %rsi, %xmm0 +; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; AVX512-NEXT: vpsrlq $48, %xmm0, %xmm0 +; AVX512-NEXT: vpmovqw %xmm0, (%rdx) +; AVX512-NEXT: retq + %x = load <2 x i16>, <2 x i16>* %px + %y = load <2 x i16>, <2 x i16>* %py + %z = call <2 x i16> @llvm.uadd.sat.v2i16(<2 x i16> %x, <2 x i16> %y) + store <2 x i16> %z, <2 x i16>* %pz + ret void +} + +define <12 x i8> @v12i8(<12 x i8> %x, <12 x i8> %y) nounwind { +; SSE2-LABEL: v12i8: +; SSE2: # %bb.0: +; SSE2-NEXT: pushq %rbp +; SSE2-NEXT: pushq %r15 +; SSE2-NEXT: pushq %r14 +; SSE2-NEXT: pushq %r13 +; SSE2-NEXT: pushq %r12 +; SSE2-NEXT: pushq %rbx +; SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; SSE2-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSE2-NEXT: addb -{{[0-9]+}}(%rsp), %al +; SSE2-NEXT: movb $-1, %dil +; SSE2-NEXT: jb .LBB11_2 +; SSE2-NEXT: # %bb.1: +; 
SSE2-NEXT: movl %eax, %edi +; SSE2-NEXT: .LBB11_2: +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSE2-NEXT: addb -{{[0-9]+}}(%rsp), %cl +; SSE2-NEXT: movb $-1, %al +; SSE2-NEXT: jb .LBB11_4 +; SSE2-NEXT: # %bb.3: +; SSE2-NEXT: movl %ecx, %eax +; SSE2-NEXT: .LBB11_4: +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %dl +; SSE2-NEXT: addb -{{[0-9]+}}(%rsp), %dl +; SSE2-NEXT: movb $-1, %cl +; SSE2-NEXT: jb .LBB11_6 +; SSE2-NEXT: # %bb.5: +; SSE2-NEXT: movl %edx, %ecx +; SSE2-NEXT: .LBB11_6: +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %dl +; SSE2-NEXT: addb -{{[0-9]+}}(%rsp), %dl +; SSE2-NEXT: movb $-1, %r10b +; SSE2-NEXT: jb .LBB11_8 +; SSE2-NEXT: # %bb.7: +; SSE2-NEXT: movl %edx, %r10d +; SSE2-NEXT: .LBB11_8: +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %dl +; SSE2-NEXT: addb -{{[0-9]+}}(%rsp), %dl +; SSE2-NEXT: movb $-1, %r11b +; SSE2-NEXT: jb .LBB11_10 +; SSE2-NEXT: # %bb.9: +; SSE2-NEXT: movl %edx, %r11d +; SSE2-NEXT: .LBB11_10: +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %dl +; SSE2-NEXT: addb -{{[0-9]+}}(%rsp), %dl +; SSE2-NEXT: movb $-1, %r12b +; SSE2-NEXT: jb .LBB11_12 +; SSE2-NEXT: # %bb.11: +; SSE2-NEXT: movl %edx, %r12d +; SSE2-NEXT: .LBB11_12: +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %dl +; SSE2-NEXT: addb -{{[0-9]+}}(%rsp), %dl +; SSE2-NEXT: movb $-1, %r13b +; SSE2-NEXT: jb .LBB11_14 +; SSE2-NEXT: # %bb.13: +; SSE2-NEXT: movl %edx, %r13d +; SSE2-NEXT: .LBB11_14: +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %dl +; SSE2-NEXT: addb -{{[0-9]+}}(%rsp), %dl +; SSE2-NEXT: movb $-1, %r8b +; SSE2-NEXT: jb .LBB11_16 +; SSE2-NEXT: # %bb.15: +; SSE2-NEXT: movl %edx, %r8d +; SSE2-NEXT: .LBB11_16: +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %dl +; SSE2-NEXT: addb -{{[0-9]+}}(%rsp), %dl +; SSE2-NEXT: movb $-1, %r14b +; SSE2-NEXT: jb .LBB11_18 +; SSE2-NEXT: # %bb.17: +; SSE2-NEXT: movl %edx, %r14d +; SSE2-NEXT: .LBB11_18: +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %dl +; SSE2-NEXT: addb -{{[0-9]+}}(%rsp), %dl +; SSE2-NEXT: movb $-1, %r15b +; SSE2-NEXT: jb .LBB11_20 +; SSE2-NEXT: # %bb.19: +; SSE2-NEXT: movl %edx, %r15d +; SSE2-NEXT: .LBB11_20: +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %dl +; SSE2-NEXT: addb -{{[0-9]+}}(%rsp), %dl +; SSE2-NEXT: movb $-1, %r9b +; SSE2-NEXT: jb .LBB11_22 +; SSE2-NEXT: # %bb.21: +; SSE2-NEXT: movl %edx, %r9d +; SSE2-NEXT: .LBB11_22: +; SSE2-NEXT: movzbl %dil, %edi +; SSE2-NEXT: movzbl %al, %esi +; SSE2-NEXT: movzbl %cl, %ebp +; SSE2-NEXT: movzbl %r10b, %edx +; SSE2-NEXT: movzbl %r11b, %ebx +; SSE2-NEXT: movzbl %r12b, %r10d +; SSE2-NEXT: movzbl %r13b, %r11d +; SSE2-NEXT: movzbl %r8b, %r8d +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSE2-NEXT: addb -{{[0-9]+}}(%rsp), %al +; SSE2-NEXT: movb $-1, %cl +; SSE2-NEXT: jb .LBB11_24 +; SSE2-NEXT: # %bb.23: +; SSE2-NEXT: movl %eax, %ecx +; SSE2-NEXT: .LBB11_24: +; SSE2-NEXT: movd %edi, %xmm2 +; SSE2-NEXT: movd %esi, %xmm3 +; SSE2-NEXT: movd %ebp, %xmm5 +; SSE2-NEXT: movd %edx, %xmm0 +; SSE2-NEXT: movd %ebx, %xmm6 +; SSE2-NEXT: movd %r10d, %xmm4 +; SSE2-NEXT: movd %r11d, %xmm7 +; SSE2-NEXT: movd %r8d, %xmm1 +; SSE2-NEXT: movzbl %r14b, %esi +; SSE2-NEXT: movzbl %r15b, %edx +; SSE2-NEXT: movzbl %r9b, %eax +; SSE2-NEXT: movzbl %cl, %edi +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSE2-NEXT: addb -{{[0-9]+}}(%rsp), %cl +; SSE2-NEXT: movb $-1, %bl +; SSE2-NEXT: jb .LBB11_26 +; SSE2-NEXT: # %bb.25: +; SSE2-NEXT: movl %ecx, %ebx +; SSE2-NEXT: .LBB11_26: +; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = 
xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3],xmm0[4],xmm5[4],xmm0[5],xmm5[5],xmm0[6],xmm5[6],xmm0[7],xmm5[7] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1],xmm4[2],xmm6[2],xmm4[3],xmm6[3],xmm4[4],xmm6[4],xmm4[5],xmm6[5],xmm4[6],xmm6[6],xmm4[7],xmm6[7] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm7[0],xmm1[1],xmm7[1],xmm1[2],xmm7[2],xmm1[3],xmm7[3],xmm1[4],xmm7[4],xmm1[5],xmm7[5],xmm1[6],xmm7[6],xmm1[7],xmm7[7] +; SSE2-NEXT: movd %esi, %xmm6 +; SSE2-NEXT: movd %edx, %xmm5 +; SSE2-NEXT: movd %eax, %xmm7 +; SSE2-NEXT: movd %edi, %xmm2 +; SSE2-NEXT: movzbl %bl, %eax +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %dl +; SSE2-NEXT: addb -{{[0-9]+}}(%rsp), %dl +; SSE2-NEXT: movb $-1, %cl +; SSE2-NEXT: jb .LBB11_28 +; SSE2-NEXT: # %bb.27: +; SSE2-NEXT: movl %edx, %ecx +; SSE2-NEXT: .LBB11_28: +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3],xmm5[4],xmm6[4],xmm5[5],xmm6[5],xmm5[6],xmm6[6],xmm5[7],xmm6[7] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm7[0],xmm2[1],xmm7[1],xmm2[2],xmm7[2],xmm2[3],xmm7[3],xmm2[4],xmm7[4],xmm2[5],xmm7[5],xmm2[6],xmm7[6],xmm2[7],xmm7[7] +; SSE2-NEXT: movd %eax, %xmm4 +; SSE2-NEXT: movzbl %cl, %eax +; SSE2-NEXT: movd %eax, %xmm3 +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %dl +; SSE2-NEXT: addb -{{[0-9]+}}(%rsp), %dl +; SSE2-NEXT: movb $-1, %cl +; SSE2-NEXT: jb .LBB11_30 +; SSE2-NEXT: # %bb.29: +; SSE2-NEXT: movl %edx, %ecx +; SSE2-NEXT: .LBB11_30: +; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3],xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] +; SSE2-NEXT: movzbl %cl, %ecx +; SSE2-NEXT: movd %ecx, %xmm4 +; SSE2-NEXT: addb -{{[0-9]+}}(%rsp), %al +; SSE2-NEXT: movb $-1, %cl +; SSE2-NEXT: jb .LBB11_32 +; SSE2-NEXT: # %bb.31: +; SSE2-NEXT: movl %eax, %ecx +; SSE2-NEXT: .LBB11_32: +; SSE2-NEXT: movzbl %cl, %eax +; SSE2-NEXT: movd %eax, %xmm0 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSE2-NEXT: popq %rbx +; SSE2-NEXT: popq %r12 +; SSE2-NEXT: popq %r13 +; SSE2-NEXT: popq %r14 +; SSE2-NEXT: popq %r15 +; SSE2-NEXT: popq %rbp +; SSE2-NEXT: retq +; +; SSSE3-LABEL: v12i8: +; SSSE3: # %bb.0: +; SSSE3-NEXT: pushq %rbp +; SSSE3-NEXT: pushq %r15 +; SSSE3-NEXT: pushq %r14 +; SSSE3-NEXT: pushq %r13 +; SSSE3-NEXT: pushq %r12 +; SSSE3-NEXT: pushq %rbx +; SSSE3-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; SSSE3-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSSE3-NEXT: addb -{{[0-9]+}}(%rsp), %al +; SSSE3-NEXT: movb $-1, %dil +; SSSE3-NEXT: jb .LBB11_2 +; SSSE3-NEXT: # %bb.1: +; SSSE3-NEXT: movl %eax, %edi +; SSSE3-NEXT: .LBB11_2: +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSSE3-NEXT: addb -{{[0-9]+}}(%rsp), %cl +; 
SSSE3-NEXT: movb $-1, %al +; SSSE3-NEXT: jb .LBB11_4 +; SSSE3-NEXT: # %bb.3: +; SSSE3-NEXT: movl %ecx, %eax +; SSSE3-NEXT: .LBB11_4: +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %dl +; SSSE3-NEXT: addb -{{[0-9]+}}(%rsp), %dl +; SSSE3-NEXT: movb $-1, %cl +; SSSE3-NEXT: jb .LBB11_6 +; SSSE3-NEXT: # %bb.5: +; SSSE3-NEXT: movl %edx, %ecx +; SSSE3-NEXT: .LBB11_6: +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %dl +; SSSE3-NEXT: addb -{{[0-9]+}}(%rsp), %dl +; SSSE3-NEXT: movb $-1, %r10b +; SSSE3-NEXT: jb .LBB11_8 +; SSSE3-NEXT: # %bb.7: +; SSSE3-NEXT: movl %edx, %r10d +; SSSE3-NEXT: .LBB11_8: +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %dl +; SSSE3-NEXT: addb -{{[0-9]+}}(%rsp), %dl +; SSSE3-NEXT: movb $-1, %r11b +; SSSE3-NEXT: jb .LBB11_10 +; SSSE3-NEXT: # %bb.9: +; SSSE3-NEXT: movl %edx, %r11d +; SSSE3-NEXT: .LBB11_10: +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %dl +; SSSE3-NEXT: addb -{{[0-9]+}}(%rsp), %dl +; SSSE3-NEXT: movb $-1, %r12b +; SSSE3-NEXT: jb .LBB11_12 +; SSSE3-NEXT: # %bb.11: +; SSSE3-NEXT: movl %edx, %r12d +; SSSE3-NEXT: .LBB11_12: +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %dl +; SSSE3-NEXT: addb -{{[0-9]+}}(%rsp), %dl +; SSSE3-NEXT: movb $-1, %r13b +; SSSE3-NEXT: jb .LBB11_14 +; SSSE3-NEXT: # %bb.13: +; SSSE3-NEXT: movl %edx, %r13d +; SSSE3-NEXT: .LBB11_14: +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %dl +; SSSE3-NEXT: addb -{{[0-9]+}}(%rsp), %dl +; SSSE3-NEXT: movb $-1, %r8b +; SSSE3-NEXT: jb .LBB11_16 +; SSSE3-NEXT: # %bb.15: +; SSSE3-NEXT: movl %edx, %r8d +; SSSE3-NEXT: .LBB11_16: +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %dl +; SSSE3-NEXT: addb -{{[0-9]+}}(%rsp), %dl +; SSSE3-NEXT: movb $-1, %r14b +; SSSE3-NEXT: jb .LBB11_18 +; SSSE3-NEXT: # %bb.17: +; SSSE3-NEXT: movl %edx, %r14d +; SSSE3-NEXT: .LBB11_18: +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %dl +; SSSE3-NEXT: addb -{{[0-9]+}}(%rsp), %dl +; SSSE3-NEXT: movb $-1, %r15b +; SSSE3-NEXT: jb .LBB11_20 +; SSSE3-NEXT: # %bb.19: +; SSSE3-NEXT: movl %edx, %r15d +; SSSE3-NEXT: .LBB11_20: +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %dl +; SSSE3-NEXT: addb -{{[0-9]+}}(%rsp), %dl +; SSSE3-NEXT: movb $-1, %r9b +; SSSE3-NEXT: jb .LBB11_22 +; SSSE3-NEXT: # %bb.21: +; SSSE3-NEXT: movl %edx, %r9d +; SSSE3-NEXT: .LBB11_22: +; SSSE3-NEXT: movzbl %dil, %edi +; SSSE3-NEXT: movzbl %al, %esi +; SSSE3-NEXT: movzbl %cl, %ebp +; SSSE3-NEXT: movzbl %r10b, %edx +; SSSE3-NEXT: movzbl %r11b, %ebx +; SSSE3-NEXT: movzbl %r12b, %r10d +; SSSE3-NEXT: movzbl %r13b, %r11d +; SSSE3-NEXT: movzbl %r8b, %r8d +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSSE3-NEXT: addb -{{[0-9]+}}(%rsp), %al +; SSSE3-NEXT: movb $-1, %cl +; SSSE3-NEXT: jb .LBB11_24 +; SSSE3-NEXT: # %bb.23: +; SSSE3-NEXT: movl %eax, %ecx +; SSSE3-NEXT: .LBB11_24: +; SSSE3-NEXT: movd %edi, %xmm2 +; SSSE3-NEXT: movd %esi, %xmm3 +; SSSE3-NEXT: movd %ebp, %xmm5 +; SSSE3-NEXT: movd %edx, %xmm0 +; SSSE3-NEXT: movd %ebx, %xmm6 +; SSSE3-NEXT: movd %r10d, %xmm4 +; SSSE3-NEXT: movd %r11d, %xmm7 +; SSSE3-NEXT: movd %r8d, %xmm1 +; SSSE3-NEXT: movzbl %r14b, %esi +; SSSE3-NEXT: movzbl %r15b, %edx +; SSSE3-NEXT: movzbl %r9b, %eax +; SSSE3-NEXT: movzbl %cl, %edi +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSSE3-NEXT: addb -{{[0-9]+}}(%rsp), %cl +; SSSE3-NEXT: movb $-1, %bl +; SSSE3-NEXT: jb .LBB11_26 +; SSSE3-NEXT: # %bb.25: +; SSSE3-NEXT: movl %ecx, %ebx +; SSSE3-NEXT: .LBB11_26: +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = 
xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3],xmm0[4],xmm5[4],xmm0[5],xmm5[5],xmm0[6],xmm5[6],xmm0[7],xmm5[7] +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1],xmm4[2],xmm6[2],xmm4[3],xmm6[3],xmm4[4],xmm6[4],xmm4[5],xmm6[5],xmm4[6],xmm6[6],xmm4[7],xmm6[7] +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm7[0],xmm1[1],xmm7[1],xmm1[2],xmm7[2],xmm1[3],xmm7[3],xmm1[4],xmm7[4],xmm1[5],xmm7[5],xmm1[6],xmm7[6],xmm1[7],xmm7[7] +; SSSE3-NEXT: movd %esi, %xmm6 +; SSSE3-NEXT: movd %edx, %xmm5 +; SSSE3-NEXT: movd %eax, %xmm7 +; SSSE3-NEXT: movd %edi, %xmm2 +; SSSE3-NEXT: movzbl %bl, %eax +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %dl +; SSSE3-NEXT: addb -{{[0-9]+}}(%rsp), %dl +; SSSE3-NEXT: movb $-1, %cl +; SSSE3-NEXT: jb .LBB11_28 +; SSSE3-NEXT: # %bb.27: +; SSSE3-NEXT: movl %edx, %ecx +; SSSE3-NEXT: .LBB11_28: +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3] +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3],xmm5[4],xmm6[4],xmm5[5],xmm6[5],xmm5[6],xmm6[6],xmm5[7],xmm6[7] +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm7[0],xmm2[1],xmm7[1],xmm2[2],xmm7[2],xmm2[3],xmm7[3],xmm2[4],xmm7[4],xmm2[5],xmm7[5],xmm2[6],xmm7[6],xmm2[7],xmm7[7] +; SSSE3-NEXT: movd %eax, %xmm4 +; SSSE3-NEXT: movzbl %cl, %eax +; SSSE3-NEXT: movd %eax, %xmm3 +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %dl +; SSSE3-NEXT: addb -{{[0-9]+}}(%rsp), %dl +; SSSE3-NEXT: movb $-1, %cl +; SSSE3-NEXT: jb .LBB11_30 +; SSSE3-NEXT: # %bb.29: +; SSSE3-NEXT: movl %edx, %ecx +; SSSE3-NEXT: .LBB11_30: +; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3] +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3],xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] +; SSSE3-NEXT: movzbl %cl, %ecx +; SSSE3-NEXT: movd %ecx, %xmm4 +; SSSE3-NEXT: addb -{{[0-9]+}}(%rsp), %al +; SSSE3-NEXT: movb $-1, %cl +; SSSE3-NEXT: jb .LBB11_32 +; SSSE3-NEXT: # %bb.31: +; SSSE3-NEXT: movl %eax, %ecx +; SSSE3-NEXT: .LBB11_32: +; SSSE3-NEXT: movzbl %cl, %eax +; SSSE3-NEXT: movd %eax, %xmm0 +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7] +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] +; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSSE3-NEXT: popq %rbx +; SSSE3-NEXT: popq %r12 +; SSSE3-NEXT: popq %r13 +; SSSE3-NEXT: popq %r14 +; SSSE3-NEXT: popq %r15 +; SSSE3-NEXT: popq %rbp +; SSSE3-NEXT: retq +; +; SSE41-LABEL: v12i8: +; SSE41: # %bb.0: +; SSE41-NEXT: pextrb $1, %xmm1, %eax +; SSE41-NEXT: pextrb $1, %xmm0, %ecx +; SSE41-NEXT: addb %al, %cl +; SSE41-NEXT: movb $-1, %sil +; SSE41-NEXT: movb $-1, %dl +; SSE41-NEXT: jb .LBB11_2 +; SSE41-NEXT: # %bb.1: +; SSE41-NEXT: movl %ecx, %edx +; SSE41-NEXT: .LBB11_2: +; SSE41-NEXT: movzbl %dl, %ecx +; SSE41-NEXT: pextrb $0, %xmm1, %eax +; SSE41-NEXT: pextrb $0, %xmm0, %edx +; SSE41-NEXT: addb %al, %dl +; SSE41-NEXT: movb $-1, %al +; SSE41-NEXT: jb .LBB11_4 +; SSE41-NEXT: # %bb.3: +; SSE41-NEXT: movl %edx, 
%eax +; SSE41-NEXT: .LBB11_4: +; SSE41-NEXT: movzbl %al, %eax +; SSE41-NEXT: movd %eax, %xmm2 +; SSE41-NEXT: pinsrb $1, %ecx, %xmm2 +; SSE41-NEXT: pextrb $2, %xmm1, %eax +; SSE41-NEXT: pextrb $2, %xmm0, %ecx +; SSE41-NEXT: addb %al, %cl +; SSE41-NEXT: movb $-1, %al +; SSE41-NEXT: jb .LBB11_6 +; SSE41-NEXT: # %bb.5: +; SSE41-NEXT: movl %ecx, %eax +; SSE41-NEXT: .LBB11_6: +; SSE41-NEXT: movzbl %al, %eax +; SSE41-NEXT: pinsrb $2, %eax, %xmm2 +; SSE41-NEXT: pextrb $3, %xmm1, %eax +; SSE41-NEXT: pextrb $3, %xmm0, %ecx +; SSE41-NEXT: addb %al, %cl +; SSE41-NEXT: movb $-1, %al +; SSE41-NEXT: jb .LBB11_8 +; SSE41-NEXT: # %bb.7: +; SSE41-NEXT: movl %ecx, %eax +; SSE41-NEXT: .LBB11_8: +; SSE41-NEXT: movzbl %al, %eax +; SSE41-NEXT: pinsrb $3, %eax, %xmm2 +; SSE41-NEXT: pextrb $4, %xmm1, %eax +; SSE41-NEXT: pextrb $4, %xmm0, %ecx +; SSE41-NEXT: addb %al, %cl +; SSE41-NEXT: movb $-1, %al +; SSE41-NEXT: jb .LBB11_10 +; SSE41-NEXT: # %bb.9: +; SSE41-NEXT: movl %ecx, %eax +; SSE41-NEXT: .LBB11_10: +; SSE41-NEXT: movzbl %al, %eax +; SSE41-NEXT: pinsrb $4, %eax, %xmm2 +; SSE41-NEXT: pextrb $5, %xmm1, %eax +; SSE41-NEXT: pextrb $5, %xmm0, %ecx +; SSE41-NEXT: addb %al, %cl +; SSE41-NEXT: movb $-1, %al +; SSE41-NEXT: jb .LBB11_12 +; SSE41-NEXT: # %bb.11: +; SSE41-NEXT: movl %ecx, %eax +; SSE41-NEXT: .LBB11_12: +; SSE41-NEXT: movzbl %al, %eax +; SSE41-NEXT: pinsrb $5, %eax, %xmm2 +; SSE41-NEXT: pextrb $6, %xmm1, %eax +; SSE41-NEXT: pextrb $6, %xmm0, %ecx +; SSE41-NEXT: addb %al, %cl +; SSE41-NEXT: movb $-1, %al +; SSE41-NEXT: jb .LBB11_14 +; SSE41-NEXT: # %bb.13: +; SSE41-NEXT: movl %ecx, %eax +; SSE41-NEXT: .LBB11_14: +; SSE41-NEXT: movzbl %al, %eax +; SSE41-NEXT: pinsrb $6, %eax, %xmm2 +; SSE41-NEXT: pextrb $7, %xmm1, %eax +; SSE41-NEXT: pextrb $7, %xmm0, %ecx +; SSE41-NEXT: addb %al, %cl +; SSE41-NEXT: movb $-1, %al +; SSE41-NEXT: jb .LBB11_16 +; SSE41-NEXT: # %bb.15: +; SSE41-NEXT: movl %ecx, %eax +; SSE41-NEXT: .LBB11_16: +; SSE41-NEXT: movzbl %al, %eax +; SSE41-NEXT: pinsrb $7, %eax, %xmm2 +; SSE41-NEXT: pextrb $8, %xmm1, %eax +; SSE41-NEXT: pextrb $8, %xmm0, %ecx +; SSE41-NEXT: addb %al, %cl +; SSE41-NEXT: movb $-1, %al +; SSE41-NEXT: jb .LBB11_18 +; SSE41-NEXT: # %bb.17: +; SSE41-NEXT: movl %ecx, %eax +; SSE41-NEXT: .LBB11_18: +; SSE41-NEXT: movzbl %al, %eax +; SSE41-NEXT: pinsrb $8, %eax, %xmm2 +; SSE41-NEXT: pextrb $9, %xmm1, %eax +; SSE41-NEXT: pextrb $9, %xmm0, %ecx +; SSE41-NEXT: addb %al, %cl +; SSE41-NEXT: movb $-1, %al +; SSE41-NEXT: jb .LBB11_20 +; SSE41-NEXT: # %bb.19: +; SSE41-NEXT: movl %ecx, %eax +; SSE41-NEXT: .LBB11_20: +; SSE41-NEXT: movzbl %al, %eax +; SSE41-NEXT: pinsrb $9, %eax, %xmm2 +; SSE41-NEXT: pextrb $10, %xmm1, %eax +; SSE41-NEXT: pextrb $10, %xmm0, %ecx +; SSE41-NEXT: addb %al, %cl +; SSE41-NEXT: movb $-1, %al +; SSE41-NEXT: jb .LBB11_22 +; SSE41-NEXT: # %bb.21: +; SSE41-NEXT: movl %ecx, %eax +; SSE41-NEXT: .LBB11_22: +; SSE41-NEXT: movzbl %al, %eax +; SSE41-NEXT: pinsrb $10, %eax, %xmm2 +; SSE41-NEXT: pextrb $11, %xmm1, %eax +; SSE41-NEXT: pextrb $11, %xmm0, %ecx +; SSE41-NEXT: addb %al, %cl +; SSE41-NEXT: movb $-1, %al +; SSE41-NEXT: jb .LBB11_24 +; SSE41-NEXT: # %bb.23: +; SSE41-NEXT: movl %ecx, %eax +; SSE41-NEXT: .LBB11_24: +; SSE41-NEXT: movzbl %al, %eax +; SSE41-NEXT: pinsrb $11, %eax, %xmm2 +; SSE41-NEXT: pextrb $12, %xmm1, %eax +; SSE41-NEXT: pextrb $12, %xmm0, %ecx +; SSE41-NEXT: addb %al, %cl +; SSE41-NEXT: movb $-1, %al +; SSE41-NEXT: jb .LBB11_26 +; SSE41-NEXT: # %bb.25: +; SSE41-NEXT: movl %ecx, %eax +; SSE41-NEXT: .LBB11_26: +; SSE41-NEXT: movzbl %al, %eax +; 
SSE41-NEXT: pinsrb $12, %eax, %xmm2 +; SSE41-NEXT: pextrb $13, %xmm1, %eax +; SSE41-NEXT: pextrb $13, %xmm0, %ecx +; SSE41-NEXT: addb %al, %cl +; SSE41-NEXT: movb $-1, %al +; SSE41-NEXT: jb .LBB11_28 +; SSE41-NEXT: # %bb.27: +; SSE41-NEXT: movl %ecx, %eax +; SSE41-NEXT: .LBB11_28: +; SSE41-NEXT: movzbl %al, %eax +; SSE41-NEXT: pinsrb $13, %eax, %xmm2 +; SSE41-NEXT: pextrb $14, %xmm1, %eax +; SSE41-NEXT: pextrb $14, %xmm0, %ecx +; SSE41-NEXT: addb %al, %cl +; SSE41-NEXT: movb $-1, %al +; SSE41-NEXT: jb .LBB11_30 +; SSE41-NEXT: # %bb.29: +; SSE41-NEXT: movl %ecx, %eax +; SSE41-NEXT: .LBB11_30: +; SSE41-NEXT: movzbl %al, %eax +; SSE41-NEXT: pinsrb $14, %eax, %xmm2 +; SSE41-NEXT: pextrb $15, %xmm1, %eax +; SSE41-NEXT: pextrb $15, %xmm0, %ecx +; SSE41-NEXT: addb %al, %cl +; SSE41-NEXT: jb .LBB11_32 +; SSE41-NEXT: # %bb.31: +; SSE41-NEXT: movl %ecx, %esi +; SSE41-NEXT: .LBB11_32: +; SSE41-NEXT: movzbl %sil, %eax +; SSE41-NEXT: pinsrb $15, %eax, %xmm2 +; SSE41-NEXT: movdqa %xmm2, %xmm0 +; SSE41-NEXT: retq +; +; AVX-LABEL: v12i8: +; AVX: # %bb.0: +; AVX-NEXT: vpextrb $1, %xmm1, %eax +; AVX-NEXT: vpextrb $1, %xmm0, %ecx +; AVX-NEXT: addb %al, %cl +; AVX-NEXT: movb $-1, %sil +; AVX-NEXT: movb $-1, %dl +; AVX-NEXT: jb .LBB11_2 +; AVX-NEXT: # %bb.1: +; AVX-NEXT: movl %ecx, %edx +; AVX-NEXT: .LBB11_2: +; AVX-NEXT: movzbl %dl, %ecx +; AVX-NEXT: vpextrb $0, %xmm1, %eax +; AVX-NEXT: vpextrb $0, %xmm0, %edx +; AVX-NEXT: addb %al, %dl +; AVX-NEXT: movb $-1, %al +; AVX-NEXT: jb .LBB11_4 +; AVX-NEXT: # %bb.3: +; AVX-NEXT: movl %edx, %eax +; AVX-NEXT: .LBB11_4: +; AVX-NEXT: movzbl %al, %eax +; AVX-NEXT: vmovd %eax, %xmm2 +; AVX-NEXT: vpinsrb $1, %ecx, %xmm2, %xmm2 +; AVX-NEXT: vpextrb $2, %xmm1, %eax +; AVX-NEXT: vpextrb $2, %xmm0, %ecx +; AVX-NEXT: addb %al, %cl +; AVX-NEXT: movb $-1, %al +; AVX-NEXT: jb .LBB11_6 +; AVX-NEXT: # %bb.5: +; AVX-NEXT: movl %ecx, %eax +; AVX-NEXT: .LBB11_6: +; AVX-NEXT: movzbl %al, %eax +; AVX-NEXT: vpinsrb $2, %eax, %xmm2, %xmm2 +; AVX-NEXT: vpextrb $3, %xmm1, %eax +; AVX-NEXT: vpextrb $3, %xmm0, %ecx +; AVX-NEXT: addb %al, %cl +; AVX-NEXT: movb $-1, %al +; AVX-NEXT: jb .LBB11_8 +; AVX-NEXT: # %bb.7: +; AVX-NEXT: movl %ecx, %eax +; AVX-NEXT: .LBB11_8: +; AVX-NEXT: movzbl %al, %eax +; AVX-NEXT: vpinsrb $3, %eax, %xmm2, %xmm2 +; AVX-NEXT: vpextrb $4, %xmm1, %eax +; AVX-NEXT: vpextrb $4, %xmm0, %ecx +; AVX-NEXT: addb %al, %cl +; AVX-NEXT: movb $-1, %al +; AVX-NEXT: jb .LBB11_10 +; AVX-NEXT: # %bb.9: +; AVX-NEXT: movl %ecx, %eax +; AVX-NEXT: .LBB11_10: +; AVX-NEXT: movzbl %al, %eax +; AVX-NEXT: vpinsrb $4, %eax, %xmm2, %xmm2 +; AVX-NEXT: vpextrb $5, %xmm1, %eax +; AVX-NEXT: vpextrb $5, %xmm0, %ecx +; AVX-NEXT: addb %al, %cl +; AVX-NEXT: movb $-1, %al +; AVX-NEXT: jb .LBB11_12 +; AVX-NEXT: # %bb.11: +; AVX-NEXT: movl %ecx, %eax +; AVX-NEXT: .LBB11_12: +; AVX-NEXT: movzbl %al, %eax +; AVX-NEXT: vpinsrb $5, %eax, %xmm2, %xmm2 +; AVX-NEXT: vpextrb $6, %xmm1, %eax +; AVX-NEXT: vpextrb $6, %xmm0, %ecx +; AVX-NEXT: addb %al, %cl +; AVX-NEXT: movb $-1, %al +; AVX-NEXT: jb .LBB11_14 +; AVX-NEXT: # %bb.13: +; AVX-NEXT: movl %ecx, %eax +; AVX-NEXT: .LBB11_14: +; AVX-NEXT: movzbl %al, %eax +; AVX-NEXT: vpinsrb $6, %eax, %xmm2, %xmm2 +; AVX-NEXT: vpextrb $7, %xmm1, %eax +; AVX-NEXT: vpextrb $7, %xmm0, %ecx +; AVX-NEXT: addb %al, %cl +; AVX-NEXT: movb $-1, %al +; AVX-NEXT: jb .LBB11_16 +; AVX-NEXT: # %bb.15: +; AVX-NEXT: movl %ecx, %eax +; AVX-NEXT: .LBB11_16: +; AVX-NEXT: movzbl %al, %eax +; AVX-NEXT: vpinsrb $7, %eax, %xmm2, %xmm2 +; AVX-NEXT: vpextrb $8, %xmm1, %eax +; AVX-NEXT: vpextrb $8, 
%xmm0, %ecx +; AVX-NEXT: addb %al, %cl +; AVX-NEXT: movb $-1, %al +; AVX-NEXT: jb .LBB11_18 +; AVX-NEXT: # %bb.17: +; AVX-NEXT: movl %ecx, %eax +; AVX-NEXT: .LBB11_18: +; AVX-NEXT: movzbl %al, %eax +; AVX-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2 +; AVX-NEXT: vpextrb $9, %xmm1, %eax +; AVX-NEXT: vpextrb $9, %xmm0, %ecx +; AVX-NEXT: addb %al, %cl +; AVX-NEXT: movb $-1, %al +; AVX-NEXT: jb .LBB11_20 +; AVX-NEXT: # %bb.19: +; AVX-NEXT: movl %ecx, %eax +; AVX-NEXT: .LBB11_20: +; AVX-NEXT: movzbl %al, %eax +; AVX-NEXT: vpinsrb $9, %eax, %xmm2, %xmm2 +; AVX-NEXT: vpextrb $10, %xmm1, %eax +; AVX-NEXT: vpextrb $10, %xmm0, %ecx +; AVX-NEXT: addb %al, %cl +; AVX-NEXT: movb $-1, %al +; AVX-NEXT: jb .LBB11_22 +; AVX-NEXT: # %bb.21: +; AVX-NEXT: movl %ecx, %eax +; AVX-NEXT: .LBB11_22: +; AVX-NEXT: movzbl %al, %eax +; AVX-NEXT: vpinsrb $10, %eax, %xmm2, %xmm2 +; AVX-NEXT: vpextrb $11, %xmm1, %eax +; AVX-NEXT: vpextrb $11, %xmm0, %ecx +; AVX-NEXT: addb %al, %cl +; AVX-NEXT: movb $-1, %al +; AVX-NEXT: jb .LBB11_24 +; AVX-NEXT: # %bb.23: +; AVX-NEXT: movl %ecx, %eax +; AVX-NEXT: .LBB11_24: +; AVX-NEXT: movzbl %al, %eax +; AVX-NEXT: vpinsrb $11, %eax, %xmm2, %xmm2 +; AVX-NEXT: vpextrb $12, %xmm1, %eax +; AVX-NEXT: vpextrb $12, %xmm0, %ecx +; AVX-NEXT: addb %al, %cl +; AVX-NEXT: movb $-1, %al +; AVX-NEXT: jb .LBB11_26 +; AVX-NEXT: # %bb.25: +; AVX-NEXT: movl %ecx, %eax +; AVX-NEXT: .LBB11_26: +; AVX-NEXT: movzbl %al, %eax +; AVX-NEXT: vpinsrb $12, %eax, %xmm2, %xmm2 +; AVX-NEXT: vpextrb $13, %xmm1, %eax +; AVX-NEXT: vpextrb $13, %xmm0, %ecx +; AVX-NEXT: addb %al, %cl +; AVX-NEXT: movb $-1, %al +; AVX-NEXT: jb .LBB11_28 +; AVX-NEXT: # %bb.27: +; AVX-NEXT: movl %ecx, %eax +; AVX-NEXT: .LBB11_28: +; AVX-NEXT: movzbl %al, %eax +; AVX-NEXT: vpinsrb $13, %eax, %xmm2, %xmm2 +; AVX-NEXT: vpextrb $14, %xmm1, %eax +; AVX-NEXT: vpextrb $14, %xmm0, %ecx +; AVX-NEXT: addb %al, %cl +; AVX-NEXT: movb $-1, %al +; AVX-NEXT: jb .LBB11_30 +; AVX-NEXT: # %bb.29: +; AVX-NEXT: movl %ecx, %eax +; AVX-NEXT: .LBB11_30: +; AVX-NEXT: movzbl %al, %eax +; AVX-NEXT: vpinsrb $14, %eax, %xmm2, %xmm2 +; AVX-NEXT: vpextrb $15, %xmm1, %eax +; AVX-NEXT: vpextrb $15, %xmm0, %ecx +; AVX-NEXT: addb %al, %cl +; AVX-NEXT: jb .LBB11_32 +; AVX-NEXT: # %bb.31: +; AVX-NEXT: movl %ecx, %esi +; AVX-NEXT: .LBB11_32: +; AVX-NEXT: movzbl %sil, %eax +; AVX-NEXT: vpinsrb $15, %eax, %xmm2, %xmm0 +; AVX-NEXT: retq + %z = call <12 x i8> @llvm.uadd.sat.v12i8(<12 x i8> %x, <12 x i8> %y) + ret <12 x i8> %z +} + +define void @v12i16(<12 x i16>* %px, <12 x i16>* %py, <12 x i16>* %pz) nounwind { +; SSE2-LABEL: v12i16: +; SSE2: # %bb.0: +; SSE2-NEXT: movdqa (%rdi), %xmm2 +; SSE2-NEXT: movdqa 16(%rdi), %xmm0 +; SSE2-NEXT: movdqa (%rsi), %xmm3 +; SSE2-NEXT: movdqa 16(%rsi), %xmm1 +; SSE2-NEXT: pextrw $7, %xmm3, %eax +; SSE2-NEXT: pextrw $7, %xmm2, %ecx +; SSE2-NEXT: addw %ax, %cx +; SSE2-NEXT: movl $65535, %eax # imm = 0xFFFF +; SSE2-NEXT: cmovbl %eax, %ecx +; SSE2-NEXT: movd %ecx, %xmm4 +; SSE2-NEXT: pextrw $6, %xmm3, %ecx +; SSE2-NEXT: pextrw $6, %xmm2, %esi +; SSE2-NEXT: addw %cx, %si +; SSE2-NEXT: cmovbl %eax, %esi +; SSE2-NEXT: movd %esi, %xmm5 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] +; SSE2-NEXT: pextrw $5, %xmm3, %ecx +; SSE2-NEXT: pextrw $5, %xmm2, %esi +; SSE2-NEXT: addw %cx, %si +; SSE2-NEXT: cmovbl %eax, %esi +; SSE2-NEXT: movd %esi, %xmm6 +; SSE2-NEXT: pextrw $4, %xmm3, %ecx +; SSE2-NEXT: pextrw $4, %xmm2, %esi +; SSE2-NEXT: addw %cx, %si +; SSE2-NEXT: cmovbl %eax, %esi +; SSE2-NEXT: movd %esi, %xmm4 
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1],xmm4[2],xmm6[2],xmm4[3],xmm6[3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1] +; SSE2-NEXT: pextrw $3, %xmm3, %ecx +; SSE2-NEXT: pextrw $3, %xmm2, %esi +; SSE2-NEXT: addw %cx, %si +; SSE2-NEXT: cmovbl %eax, %esi +; SSE2-NEXT: movd %esi, %xmm5 +; SSE2-NEXT: pextrw $2, %xmm3, %ecx +; SSE2-NEXT: pextrw $2, %xmm2, %esi +; SSE2-NEXT: addw %cx, %si +; SSE2-NEXT: cmovbl %eax, %esi +; SSE2-NEXT: movd %esi, %xmm6 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] +; SSE2-NEXT: pextrw $1, %xmm3, %ecx +; SSE2-NEXT: pextrw $1, %xmm2, %esi +; SSE2-NEXT: addw %cx, %si +; SSE2-NEXT: cmovbl %eax, %esi +; SSE2-NEXT: movd %esi, %xmm5 +; SSE2-NEXT: movd %xmm3, %ecx +; SSE2-NEXT: movd %xmm2, %esi +; SSE2-NEXT: addw %cx, %si +; SSE2-NEXT: cmovbl %eax, %esi +; SSE2-NEXT: movd %esi, %xmm2 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1] +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm4[0] +; SSE2-NEXT: pextrw $1, %xmm1, %ecx +; SSE2-NEXT: pextrw $1, %xmm0, %esi +; SSE2-NEXT: addw %cx, %si +; SSE2-NEXT: cmovbl %eax, %esi +; SSE2-NEXT: movd %xmm1, %ecx +; SSE2-NEXT: movd %xmm0, %edi +; SSE2-NEXT: addw %cx, %di +; SSE2-NEXT: cmovbl %eax, %edi +; SSE2-NEXT: movd %edi, %xmm3 +; SSE2-NEXT: pinsrw $1, %esi, %xmm3 +; SSE2-NEXT: pextrw $2, %xmm1, %ecx +; SSE2-NEXT: pextrw $2, %xmm0, %esi +; SSE2-NEXT: addw %cx, %si +; SSE2-NEXT: cmovbl %eax, %esi +; SSE2-NEXT: pinsrw $2, %esi, %xmm3 +; SSE2-NEXT: pextrw $3, %xmm1, %ecx +; SSE2-NEXT: pextrw $3, %xmm0, %esi +; SSE2-NEXT: addw %cx, %si +; SSE2-NEXT: cmovbl %eax, %esi +; SSE2-NEXT: pinsrw $3, %esi, %xmm3 +; SSE2-NEXT: movq %xmm3, 16(%rdx) +; SSE2-NEXT: movdqa %xmm2, (%rdx) +; SSE2-NEXT: retq +; +; SSSE3-LABEL: v12i16: +; SSSE3: # %bb.0: +; SSSE3-NEXT: movdqa (%rdi), %xmm2 +; SSSE3-NEXT: movdqa 16(%rdi), %xmm0 +; SSSE3-NEXT: movdqa (%rsi), %xmm3 +; SSSE3-NEXT: movdqa 16(%rsi), %xmm1 +; SSSE3-NEXT: pextrw $7, %xmm3, %eax +; SSSE3-NEXT: pextrw $7, %xmm2, %ecx +; SSSE3-NEXT: addw %ax, %cx +; SSSE3-NEXT: movl $65535, %eax # imm = 0xFFFF +; SSSE3-NEXT: cmovbl %eax, %ecx +; SSSE3-NEXT: movd %ecx, %xmm4 +; SSSE3-NEXT: pextrw $6, %xmm3, %ecx +; SSSE3-NEXT: pextrw $6, %xmm2, %esi +; SSSE3-NEXT: addw %cx, %si +; SSSE3-NEXT: cmovbl %eax, %esi +; SSSE3-NEXT: movd %esi, %xmm5 +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] +; SSSE3-NEXT: pextrw $5, %xmm3, %ecx +; SSSE3-NEXT: pextrw $5, %xmm2, %esi +; SSSE3-NEXT: addw %cx, %si +; SSSE3-NEXT: cmovbl %eax, %esi +; SSSE3-NEXT: movd %esi, %xmm6 +; SSSE3-NEXT: pextrw $4, %xmm3, %ecx +; SSSE3-NEXT: pextrw $4, %xmm2, %esi +; SSSE3-NEXT: addw %cx, %si +; SSSE3-NEXT: cmovbl %eax, %esi +; SSSE3-NEXT: movd %esi, %xmm4 +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1],xmm4[2],xmm6[2],xmm4[3],xmm6[3] +; SSSE3-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1] +; SSSE3-NEXT: pextrw $3, %xmm3, %ecx +; SSSE3-NEXT: pextrw $3, %xmm2, %esi +; SSSE3-NEXT: addw %cx, %si +; SSSE3-NEXT: cmovbl %eax, %esi +; SSSE3-NEXT: movd %esi, %xmm5 +; SSSE3-NEXT: pextrw $2, %xmm3, %ecx +; SSSE3-NEXT: pextrw $2, %xmm2, %esi +; SSSE3-NEXT: addw %cx, %si +; SSSE3-NEXT: cmovbl %eax, %esi +; SSSE3-NEXT: movd %esi, %xmm6 +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm6 = 
xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] +; SSSE3-NEXT: pextrw $1, %xmm3, %ecx +; SSSE3-NEXT: pextrw $1, %xmm2, %esi +; SSSE3-NEXT: addw %cx, %si +; SSSE3-NEXT: cmovbl %eax, %esi +; SSSE3-NEXT: movd %esi, %xmm5 +; SSSE3-NEXT: movd %xmm3, %ecx +; SSSE3-NEXT: movd %xmm2, %esi +; SSSE3-NEXT: addw %cx, %si +; SSSE3-NEXT: cmovbl %eax, %esi +; SSSE3-NEXT: movd %esi, %xmm2 +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3] +; SSSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1] +; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm4[0] +; SSSE3-NEXT: pextrw $1, %xmm1, %ecx +; SSSE3-NEXT: pextrw $1, %xmm0, %esi +; SSSE3-NEXT: addw %cx, %si +; SSSE3-NEXT: cmovbl %eax, %esi +; SSSE3-NEXT: movd %xmm1, %ecx +; SSSE3-NEXT: movd %xmm0, %edi +; SSSE3-NEXT: addw %cx, %di +; SSSE3-NEXT: cmovbl %eax, %edi +; SSSE3-NEXT: movd %edi, %xmm3 +; SSSE3-NEXT: pinsrw $1, %esi, %xmm3 +; SSSE3-NEXT: pextrw $2, %xmm1, %ecx +; SSSE3-NEXT: pextrw $2, %xmm0, %esi +; SSSE3-NEXT: addw %cx, %si +; SSSE3-NEXT: cmovbl %eax, %esi +; SSSE3-NEXT: pinsrw $2, %esi, %xmm3 +; SSSE3-NEXT: pextrw $3, %xmm1, %ecx +; SSSE3-NEXT: pextrw $3, %xmm0, %esi +; SSSE3-NEXT: addw %cx, %si +; SSSE3-NEXT: cmovbl %eax, %esi +; SSSE3-NEXT: pinsrw $3, %esi, %xmm3 +; SSSE3-NEXT: movq %xmm3, 16(%rdx) +; SSSE3-NEXT: movdqa %xmm2, (%rdx) +; SSSE3-NEXT: retq +; +; SSE41-LABEL: v12i16: +; SSE41: # %bb.0: +; SSE41-NEXT: movdqa (%rdi), %xmm3 +; SSE41-NEXT: movdqa 16(%rdi), %xmm0 +; SSE41-NEXT: movdqa (%rsi), %xmm4 +; SSE41-NEXT: movdqa 16(%rsi), %xmm1 +; SSE41-NEXT: pextrw $1, %xmm4, %eax +; SSE41-NEXT: pextrw $1, %xmm3, %ecx +; SSE41-NEXT: addw %ax, %cx +; SSE41-NEXT: movl $65535, %eax # imm = 0xFFFF +; SSE41-NEXT: cmovbl %eax, %ecx +; SSE41-NEXT: movd %xmm4, %esi +; SSE41-NEXT: movd %xmm3, %edi +; SSE41-NEXT: addw %si, %di +; SSE41-NEXT: cmovbl %eax, %edi +; SSE41-NEXT: movd %edi, %xmm2 +; SSE41-NEXT: pinsrw $1, %ecx, %xmm2 +; SSE41-NEXT: pextrw $2, %xmm4, %ecx +; SSE41-NEXT: pextrw $2, %xmm3, %esi +; SSE41-NEXT: addw %cx, %si +; SSE41-NEXT: cmovbl %eax, %esi +; SSE41-NEXT: pinsrw $2, %esi, %xmm2 +; SSE41-NEXT: pextrw $3, %xmm4, %ecx +; SSE41-NEXT: pextrw $3, %xmm3, %esi +; SSE41-NEXT: addw %cx, %si +; SSE41-NEXT: cmovbl %eax, %esi +; SSE41-NEXT: pinsrw $3, %esi, %xmm2 +; SSE41-NEXT: pextrw $4, %xmm4, %ecx +; SSE41-NEXT: pextrw $4, %xmm3, %esi +; SSE41-NEXT: addw %cx, %si +; SSE41-NEXT: cmovbl %eax, %esi +; SSE41-NEXT: pinsrw $4, %esi, %xmm2 +; SSE41-NEXT: pextrw $5, %xmm4, %ecx +; SSE41-NEXT: pextrw $5, %xmm3, %esi +; SSE41-NEXT: addw %cx, %si +; SSE41-NEXT: cmovbl %eax, %esi +; SSE41-NEXT: pinsrw $5, %esi, %xmm2 +; SSE41-NEXT: pextrw $6, %xmm4, %ecx +; SSE41-NEXT: pextrw $6, %xmm3, %esi +; SSE41-NEXT: addw %cx, %si +; SSE41-NEXT: cmovbl %eax, %esi +; SSE41-NEXT: pinsrw $6, %esi, %xmm2 +; SSE41-NEXT: pextrw $7, %xmm4, %ecx +; SSE41-NEXT: pextrw $7, %xmm3, %esi +; SSE41-NEXT: addw %cx, %si +; SSE41-NEXT: cmovbl %eax, %esi +; SSE41-NEXT: pinsrw $7, %esi, %xmm2 +; SSE41-NEXT: pextrw $1, %xmm1, %ecx +; SSE41-NEXT: pextrw $1, %xmm0, %esi +; SSE41-NEXT: addw %cx, %si +; SSE41-NEXT: cmovbl %eax, %esi +; SSE41-NEXT: movd %xmm1, %ecx +; SSE41-NEXT: movd %xmm0, %edi +; SSE41-NEXT: addw %cx, %di +; SSE41-NEXT: cmovbl %eax, %edi +; SSE41-NEXT: movd %edi, %xmm3 +; SSE41-NEXT: pinsrw $1, %esi, %xmm3 +; SSE41-NEXT: pextrw $2, %xmm1, %ecx +; SSE41-NEXT: pextrw $2, %xmm0, %esi +; SSE41-NEXT: addw %cx, %si +; SSE41-NEXT: cmovbl %eax, %esi +; SSE41-NEXT: pinsrw $2, %esi, %xmm3 +; 
SSE41-NEXT: pextrw $3, %xmm1, %ecx +; SSE41-NEXT: pextrw $3, %xmm0, %esi +; SSE41-NEXT: addw %cx, %si +; SSE41-NEXT: cmovbl %eax, %esi +; SSE41-NEXT: pinsrw $3, %esi, %xmm3 +; SSE41-NEXT: movq %xmm3, 16(%rdx) +; SSE41-NEXT: movdqa %xmm2, (%rdx) +; SSE41-NEXT: retq +; +; AVX-LABEL: v12i16: +; AVX: # %bb.0: +; AVX-NEXT: pushq %rbp +; AVX-NEXT: pushq %rbx +; AVX-NEXT: vmovdqa (%rsi), %xmm0 +; AVX-NEXT: vmovdqa 16(%rsi), %xmm2 +; AVX-NEXT: vpextrw $1, %xmm2, %eax +; AVX-NEXT: vmovdqa (%rdi), %xmm1 +; AVX-NEXT: vmovdqa 16(%rdi), %xmm3 +; AVX-NEXT: vpextrw $1, %xmm3, %ecx +; AVX-NEXT: addw %ax, %cx +; AVX-NEXT: movl $65535, %r8d # imm = 0xFFFF +; AVX-NEXT: cmovbl %r8d, %ecx +; AVX-NEXT: vmovd %xmm2, %eax +; AVX-NEXT: vmovd %xmm3, %esi +; AVX-NEXT: addw %ax, %si +; AVX-NEXT: cmovbl %r8d, %esi +; AVX-NEXT: vmovd %esi, %xmm4 +; AVX-NEXT: vpinsrw $1, %ecx, %xmm4, %xmm4 +; AVX-NEXT: vpextrw $2, %xmm2, %eax +; AVX-NEXT: vpextrw $2, %xmm3, %ecx +; AVX-NEXT: addw %ax, %cx +; AVX-NEXT: cmovbl %r8d, %ecx +; AVX-NEXT: vpinsrw $2, %ecx, %xmm4, %xmm4 +; AVX-NEXT: vpextrw $3, %xmm2, %eax +; AVX-NEXT: vpextrw $3, %xmm3, %ecx +; AVX-NEXT: addw %ax, %cx +; AVX-NEXT: cmovbl %r8d, %ecx +; AVX-NEXT: vpinsrw $3, %ecx, %xmm4, %xmm4 +; AVX-NEXT: vpextrw $4, %xmm2, %eax +; AVX-NEXT: vpextrw $4, %xmm3, %ecx +; AVX-NEXT: addw %ax, %cx +; AVX-NEXT: cmovbl %r8d, %ecx +; AVX-NEXT: vpinsrw $4, %ecx, %xmm4, %xmm4 +; AVX-NEXT: vpextrw $5, %xmm2, %eax +; AVX-NEXT: vpextrw $5, %xmm3, %ecx +; AVX-NEXT: addw %ax, %cx +; AVX-NEXT: cmovbl %r8d, %ecx +; AVX-NEXT: vpinsrw $5, %ecx, %xmm4, %xmm4 +; AVX-NEXT: vpextrw $6, %xmm2, %eax +; AVX-NEXT: vpextrw $6, %xmm3, %ecx +; AVX-NEXT: addw %ax, %cx +; AVX-NEXT: cmovbl %r8d, %ecx +; AVX-NEXT: vpinsrw $6, %ecx, %xmm4, %xmm4 +; AVX-NEXT: vpextrw $7, %xmm2, %eax +; AVX-NEXT: vpextrw $7, %xmm3, %ecx +; AVX-NEXT: addw %ax, %cx +; AVX-NEXT: cmovbl %r8d, %ecx +; AVX-NEXT: vpinsrw $7, %ecx, %xmm4, %xmm2 +; AVX-NEXT: vpextrw $7, %xmm0, %eax +; AVX-NEXT: vpextrw $7, %xmm1, %r9d +; AVX-NEXT: addw %ax, %r9w +; AVX-NEXT: cmovbl %r8d, %r9d +; AVX-NEXT: vpextrw $6, %xmm0, %eax +; AVX-NEXT: vpextrw $6, %xmm1, %r10d +; AVX-NEXT: addw %ax, %r10w +; AVX-NEXT: cmovbl %r8d, %r10d +; AVX-NEXT: vpextrw $5, %xmm0, %eax +; AVX-NEXT: vpextrw $5, %xmm1, %edi +; AVX-NEXT: addw %ax, %di +; AVX-NEXT: cmovbl %r8d, %edi +; AVX-NEXT: vpextrw $4, %xmm0, %ecx +; AVX-NEXT: vpextrw $4, %xmm1, %eax +; AVX-NEXT: addw %cx, %ax +; AVX-NEXT: cmovbl %r8d, %eax +; AVX-NEXT: vpextrw $3, %xmm0, %esi +; AVX-NEXT: vpextrw $3, %xmm1, %ecx +; AVX-NEXT: addw %si, %cx +; AVX-NEXT: cmovbl %r8d, %ecx +; AVX-NEXT: vpextrw $2, %xmm0, %r11d +; AVX-NEXT: vpextrw $2, %xmm1, %esi +; AVX-NEXT: addw %r11w, %si +; AVX-NEXT: cmovbl %r8d, %esi +; AVX-NEXT: vpextrw $1, %xmm0, %r11d +; AVX-NEXT: vpextrw $1, %xmm1, %ebx +; AVX-NEXT: addw %r11w, %bx +; AVX-NEXT: cmovbl %r8d, %ebx +; AVX-NEXT: vmovd %xmm0, %r11d +; AVX-NEXT: vmovd %xmm1, %ebp +; AVX-NEXT: addw %r11w, %bp +; AVX-NEXT: cmovbl %r8d, %ebp +; AVX-NEXT: vmovq %xmm2, 16(%rdx) +; AVX-NEXT: vmovd %ebp, %xmm0 +; AVX-NEXT: vpinsrw $1, %ebx, %xmm0, %xmm0 +; AVX-NEXT: vpinsrw $2, %esi, %xmm0, %xmm0 +; AVX-NEXT: vpinsrw $3, %ecx, %xmm0, %xmm0 +; AVX-NEXT: vpinsrw $4, %eax, %xmm0, %xmm0 +; AVX-NEXT: vpinsrw $5, %edi, %xmm0, %xmm0 +; AVX-NEXT: vpinsrw $6, %r10d, %xmm0, %xmm0 +; AVX-NEXT: vpinsrw $7, %r9d, %xmm0, %xmm0 +; AVX-NEXT: vmovdqa %xmm0, (%rdx) +; AVX-NEXT: popq %rbx +; AVX-NEXT: popq %rbp +; AVX-NEXT: retq + %x = load <12 x i16>, <12 x i16>* %px + %y = load <12 x i16>, <12 x i16>* %py + %z = call 
<12 x i16> @llvm.uadd.sat.v12i16(<12 x i16> %x, <12 x i16> %y) + store <12 x i16> %z, <12 x i16>* %pz + ret void +} + +; Scalarization + +define void @v1i8(<1 x i8>* %px, <1 x i8>* %py, <1 x i8>* %pz) nounwind { +; SSE-LABEL: v1i8: +; SSE: # %bb.0: +; SSE-NEXT: movb (%rdi), %al +; SSE-NEXT: addb (%rsi), %al +; SSE-NEXT: movb $-1, %cl +; SSE-NEXT: jb .LBB13_2 +; SSE-NEXT: # %bb.1: +; SSE-NEXT: movl %eax, %ecx +; SSE-NEXT: .LBB13_2: +; SSE-NEXT: movb %cl, (%rdx) +; SSE-NEXT: retq +; +; AVX-LABEL: v1i8: +; AVX: # %bb.0: +; AVX-NEXT: movb (%rdi), %al +; AVX-NEXT: addb (%rsi), %al +; AVX-NEXT: movb $-1, %cl +; AVX-NEXT: jb .LBB13_2 +; AVX-NEXT: # %bb.1: +; AVX-NEXT: movl %eax, %ecx +; AVX-NEXT: .LBB13_2: +; AVX-NEXT: movb %cl, (%rdx) +; AVX-NEXT: retq + %x = load <1 x i8>, <1 x i8>* %px + %y = load <1 x i8>, <1 x i8>* %py + %z = call <1 x i8> @llvm.uadd.sat.v1i8(<1 x i8> %x, <1 x i8> %y) + store <1 x i8> %z, <1 x i8>* %pz + ret void +} + +define void @v1i16(<1 x i16>* %px, <1 x i16>* %py, <1 x i16>* %pz) nounwind { +; SSE-LABEL: v1i16: +; SSE: # %bb.0: +; SSE-NEXT: movzwl (%rdi), %eax +; SSE-NEXT: addw (%rsi), %ax +; SSE-NEXT: movl $65535, %ecx # imm = 0xFFFF +; SSE-NEXT: cmovael %eax, %ecx +; SSE-NEXT: movw %cx, (%rdx) +; SSE-NEXT: retq +; +; AVX-LABEL: v1i16: +; AVX: # %bb.0: +; AVX-NEXT: movzwl (%rdi), %eax +; AVX-NEXT: addw (%rsi), %ax +; AVX-NEXT: movl $65535, %ecx # imm = 0xFFFF +; AVX-NEXT: cmovael %eax, %ecx +; AVX-NEXT: movw %cx, (%rdx) +; AVX-NEXT: retq + %x = load <1 x i16>, <1 x i16>* %px + %y = load <1 x i16>, <1 x i16>* %py + %z = call <1 x i16> @llvm.uadd.sat.v1i16(<1 x i16> %x, <1 x i16> %y) + store <1 x i16> %z, <1 x i16>* %pz + ret void +} + +; Promotion + +define <16 x i4> @v16i4(<16 x i4> %x, <16 x i4> %y) nounwind { +; SSE2-LABEL: v16i4: +; SSE2: # %bb.0: +; SSE2-NEXT: pushq %rbp +; SSE2-NEXT: pushq %r15 +; SSE2-NEXT: pushq %r14 +; SSE2-NEXT: pushq %r13 +; SSE2-NEXT: pushq %r12 +; SSE2-NEXT: pushq %rbx +; SSE2-NEXT: psllw $4, %xmm0 +; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240] +; SSE2-NEXT: pand %xmm2, %xmm0 +; SSE2-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp) +; SSE2-NEXT: psllw $4, %xmm1 +; SSE2-NEXT: pand %xmm2, %xmm1 +; SSE2-NEXT: movdqa %xmm1, -{{[0-9]+}}(%rsp) +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSE2-NEXT: addb -{{[0-9]+}}(%rsp), %al +; SSE2-NEXT: movb $-1, %dil +; SSE2-NEXT: jb .LBB15_2 +; SSE2-NEXT: # %bb.1: +; SSE2-NEXT: movl %eax, %edi +; SSE2-NEXT: .LBB15_2: +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSE2-NEXT: addb -{{[0-9]+}}(%rsp), %cl +; SSE2-NEXT: movb $-1, %al +; SSE2-NEXT: jb .LBB15_4 +; SSE2-NEXT: # %bb.3: +; SSE2-NEXT: movl %ecx, %eax +; SSE2-NEXT: .LBB15_4: +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %dl +; SSE2-NEXT: addb -{{[0-9]+}}(%rsp), %dl +; SSE2-NEXT: movb $-1, %cl +; SSE2-NEXT: jb .LBB15_6 +; SSE2-NEXT: # %bb.5: +; SSE2-NEXT: movl %edx, %ecx +; SSE2-NEXT: .LBB15_6: +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %dl +; SSE2-NEXT: addb -{{[0-9]+}}(%rsp), %dl +; SSE2-NEXT: movb $-1, %r10b +; SSE2-NEXT: jb .LBB15_8 +; SSE2-NEXT: # %bb.7: +; SSE2-NEXT: movl %edx, %r10d +; SSE2-NEXT: .LBB15_8: +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %dl +; SSE2-NEXT: addb -{{[0-9]+}}(%rsp), %dl +; SSE2-NEXT: movb $-1, %r11b +; SSE2-NEXT: jb .LBB15_10 +; SSE2-NEXT: # %bb.9: +; SSE2-NEXT: movl %edx, %r11d +; SSE2-NEXT: .LBB15_10: +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %dl +; SSE2-NEXT: addb -{{[0-9]+}}(%rsp), %dl +; SSE2-NEXT: movb $-1, %r12b +; SSE2-NEXT: jb .LBB15_12 +; SSE2-NEXT: # %bb.11: +; SSE2-NEXT: movl %edx, %r12d +; 
SSE2-NEXT: .LBB15_12: +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %dl +; SSE2-NEXT: addb -{{[0-9]+}}(%rsp), %dl +; SSE2-NEXT: movb $-1, %r13b +; SSE2-NEXT: jb .LBB15_14 +; SSE2-NEXT: # %bb.13: +; SSE2-NEXT: movl %edx, %r13d +; SSE2-NEXT: .LBB15_14: +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %dl +; SSE2-NEXT: addb -{{[0-9]+}}(%rsp), %dl +; SSE2-NEXT: movb $-1, %r8b +; SSE2-NEXT: jb .LBB15_16 +; SSE2-NEXT: # %bb.15: +; SSE2-NEXT: movl %edx, %r8d +; SSE2-NEXT: .LBB15_16: +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %dl +; SSE2-NEXT: addb -{{[0-9]+}}(%rsp), %dl +; SSE2-NEXT: movb $-1, %r14b +; SSE2-NEXT: jb .LBB15_18 +; SSE2-NEXT: # %bb.17: +; SSE2-NEXT: movl %edx, %r14d +; SSE2-NEXT: .LBB15_18: +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %dl +; SSE2-NEXT: addb -{{[0-9]+}}(%rsp), %dl +; SSE2-NEXT: movb $-1, %r15b +; SSE2-NEXT: jb .LBB15_20 +; SSE2-NEXT: # %bb.19: +; SSE2-NEXT: movl %edx, %r15d +; SSE2-NEXT: .LBB15_20: +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %dl +; SSE2-NEXT: addb -{{[0-9]+}}(%rsp), %dl +; SSE2-NEXT: movb $-1, %r9b +; SSE2-NEXT: jb .LBB15_22 +; SSE2-NEXT: # %bb.21: +; SSE2-NEXT: movl %edx, %r9d +; SSE2-NEXT: .LBB15_22: +; SSE2-NEXT: movzbl %dil, %edi +; SSE2-NEXT: movzbl %al, %esi +; SSE2-NEXT: movzbl %cl, %ebp +; SSE2-NEXT: movzbl %r10b, %edx +; SSE2-NEXT: movzbl %r11b, %ebx +; SSE2-NEXT: movzbl %r12b, %r10d +; SSE2-NEXT: movzbl %r13b, %r11d +; SSE2-NEXT: movzbl %r8b, %r8d +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSE2-NEXT: addb -{{[0-9]+}}(%rsp), %al +; SSE2-NEXT: movb $-1, %cl +; SSE2-NEXT: jb .LBB15_24 +; SSE2-NEXT: # %bb.23: +; SSE2-NEXT: movl %eax, %ecx +; SSE2-NEXT: .LBB15_24: +; SSE2-NEXT: movd %edi, %xmm2 +; SSE2-NEXT: movd %esi, %xmm3 +; SSE2-NEXT: movd %ebp, %xmm5 +; SSE2-NEXT: movd %edx, %xmm0 +; SSE2-NEXT: movd %ebx, %xmm6 +; SSE2-NEXT: movd %r10d, %xmm4 +; SSE2-NEXT: movd %r11d, %xmm7 +; SSE2-NEXT: movd %r8d, %xmm1 +; SSE2-NEXT: movzbl %r14b, %esi +; SSE2-NEXT: movzbl %r15b, %edx +; SSE2-NEXT: movzbl %r9b, %eax +; SSE2-NEXT: movzbl %cl, %edi +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSE2-NEXT: addb -{{[0-9]+}}(%rsp), %cl +; SSE2-NEXT: movb $-1, %bl +; SSE2-NEXT: jb .LBB15_26 +; SSE2-NEXT: # %bb.25: +; SSE2-NEXT: movl %ecx, %ebx +; SSE2-NEXT: .LBB15_26: +; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3],xmm0[4],xmm5[4],xmm0[5],xmm5[5],xmm0[6],xmm5[6],xmm0[7],xmm5[7] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1],xmm4[2],xmm6[2],xmm4[3],xmm6[3],xmm4[4],xmm6[4],xmm4[5],xmm6[5],xmm4[6],xmm6[6],xmm4[7],xmm6[7] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm7[0],xmm1[1],xmm7[1],xmm1[2],xmm7[2],xmm1[3],xmm7[3],xmm1[4],xmm7[4],xmm1[5],xmm7[5],xmm1[6],xmm7[6],xmm1[7],xmm7[7] +; SSE2-NEXT: movd %esi, %xmm6 +; SSE2-NEXT: movd %edx, %xmm5 +; SSE2-NEXT: movd %eax, %xmm7 +; SSE2-NEXT: movd %edi, %xmm2 +; SSE2-NEXT: movzbl %bl, %eax +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %dl +; SSE2-NEXT: addb -{{[0-9]+}}(%rsp), %dl +; SSE2-NEXT: movb $-1, %cl +; SSE2-NEXT: jb .LBB15_28 +; SSE2-NEXT: # %bb.27: +; SSE2-NEXT: movl %edx, %ecx +; SSE2-NEXT: .LBB15_28: +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm5 = 
xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3],xmm5[4],xmm6[4],xmm5[5],xmm6[5],xmm5[6],xmm6[6],xmm5[7],xmm6[7] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm7[0],xmm2[1],xmm7[1],xmm2[2],xmm7[2],xmm2[3],xmm7[3],xmm2[4],xmm7[4],xmm2[5],xmm7[5],xmm2[6],xmm7[6],xmm2[7],xmm7[7] +; SSE2-NEXT: movd %eax, %xmm4 +; SSE2-NEXT: movzbl %cl, %eax +; SSE2-NEXT: movd %eax, %xmm3 +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %dl +; SSE2-NEXT: addb -{{[0-9]+}}(%rsp), %dl +; SSE2-NEXT: movb $-1, %cl +; SSE2-NEXT: jb .LBB15_30 +; SSE2-NEXT: # %bb.29: +; SSE2-NEXT: movl %edx, %ecx +; SSE2-NEXT: .LBB15_30: +; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3],xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] +; SSE2-NEXT: movzbl %cl, %ecx +; SSE2-NEXT: movd %ecx, %xmm4 +; SSE2-NEXT: addb -{{[0-9]+}}(%rsp), %al +; SSE2-NEXT: movb $-1, %cl +; SSE2-NEXT: jb .LBB15_32 +; SSE2-NEXT: # %bb.31: +; SSE2-NEXT: movl %eax, %ecx +; SSE2-NEXT: .LBB15_32: +; SSE2-NEXT: movzbl %cl, %eax +; SSE2-NEXT: movd %eax, %xmm0 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSE2-NEXT: psrlw $4, %xmm0 +; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 +; SSE2-NEXT: popq %rbx +; SSE2-NEXT: popq %r12 +; SSE2-NEXT: popq %r13 +; SSE2-NEXT: popq %r14 +; SSE2-NEXT: popq %r15 +; SSE2-NEXT: popq %rbp +; SSE2-NEXT: retq +; +; SSSE3-LABEL: v16i4: +; SSSE3: # %bb.0: +; SSSE3-NEXT: pushq %rbp +; SSSE3-NEXT: pushq %r15 +; SSSE3-NEXT: pushq %r14 +; SSSE3-NEXT: pushq %r13 +; SSSE3-NEXT: pushq %r12 +; SSSE3-NEXT: pushq %rbx +; SSSE3-NEXT: psllw $4, %xmm0 +; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240] +; SSSE3-NEXT: pand %xmm2, %xmm0 +; SSSE3-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp) +; SSSE3-NEXT: psllw $4, %xmm1 +; SSSE3-NEXT: pand %xmm2, %xmm1 +; SSSE3-NEXT: movdqa %xmm1, -{{[0-9]+}}(%rsp) +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSSE3-NEXT: addb -{{[0-9]+}}(%rsp), %al +; SSSE3-NEXT: movb $-1, %dil +; SSSE3-NEXT: jb .LBB15_2 +; SSSE3-NEXT: # %bb.1: +; SSSE3-NEXT: movl %eax, %edi +; SSSE3-NEXT: .LBB15_2: +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSSE3-NEXT: addb -{{[0-9]+}}(%rsp), %cl +; SSSE3-NEXT: movb $-1, %al +; SSSE3-NEXT: jb .LBB15_4 +; SSSE3-NEXT: # %bb.3: +; SSSE3-NEXT: movl %ecx, %eax +; SSSE3-NEXT: .LBB15_4: +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %dl +; SSSE3-NEXT: addb -{{[0-9]+}}(%rsp), %dl +; SSSE3-NEXT: movb $-1, %cl +; SSSE3-NEXT: jb .LBB15_6 +; SSSE3-NEXT: # %bb.5: +; SSSE3-NEXT: movl %edx, %ecx +; SSSE3-NEXT: .LBB15_6: +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %dl +; SSSE3-NEXT: addb -{{[0-9]+}}(%rsp), %dl +; SSSE3-NEXT: movb $-1, %r10b +; SSSE3-NEXT: jb .LBB15_8 +; SSSE3-NEXT: # %bb.7: +; SSSE3-NEXT: movl %edx, %r10d +; SSSE3-NEXT: .LBB15_8: +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %dl +; SSSE3-NEXT: addb -{{[0-9]+}}(%rsp), %dl +; SSSE3-NEXT: movb $-1, %r11b +; SSSE3-NEXT: jb .LBB15_10 +; SSSE3-NEXT: # %bb.9: +; SSSE3-NEXT: movl %edx, %r11d +; 
SSSE3-NEXT: .LBB15_10: +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %dl +; SSSE3-NEXT: addb -{{[0-9]+}}(%rsp), %dl +; SSSE3-NEXT: movb $-1, %r12b +; SSSE3-NEXT: jb .LBB15_12 +; SSSE3-NEXT: # %bb.11: +; SSSE3-NEXT: movl %edx, %r12d +; SSSE3-NEXT: .LBB15_12: +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %dl +; SSSE3-NEXT: addb -{{[0-9]+}}(%rsp), %dl +; SSSE3-NEXT: movb $-1, %r13b +; SSSE3-NEXT: jb .LBB15_14 +; SSSE3-NEXT: # %bb.13: +; SSSE3-NEXT: movl %edx, %r13d +; SSSE3-NEXT: .LBB15_14: +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %dl +; SSSE3-NEXT: addb -{{[0-9]+}}(%rsp), %dl +; SSSE3-NEXT: movb $-1, %r8b +; SSSE3-NEXT: jb .LBB15_16 +; SSSE3-NEXT: # %bb.15: +; SSSE3-NEXT: movl %edx, %r8d +; SSSE3-NEXT: .LBB15_16: +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %dl +; SSSE3-NEXT: addb -{{[0-9]+}}(%rsp), %dl +; SSSE3-NEXT: movb $-1, %r14b +; SSSE3-NEXT: jb .LBB15_18 +; SSSE3-NEXT: # %bb.17: +; SSSE3-NEXT: movl %edx, %r14d +; SSSE3-NEXT: .LBB15_18: +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %dl +; SSSE3-NEXT: addb -{{[0-9]+}}(%rsp), %dl +; SSSE3-NEXT: movb $-1, %r15b +; SSSE3-NEXT: jb .LBB15_20 +; SSSE3-NEXT: # %bb.19: +; SSSE3-NEXT: movl %edx, %r15d +; SSSE3-NEXT: .LBB15_20: +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %dl +; SSSE3-NEXT: addb -{{[0-9]+}}(%rsp), %dl +; SSSE3-NEXT: movb $-1, %r9b +; SSSE3-NEXT: jb .LBB15_22 +; SSSE3-NEXT: # %bb.21: +; SSSE3-NEXT: movl %edx, %r9d +; SSSE3-NEXT: .LBB15_22: +; SSSE3-NEXT: movzbl %dil, %edi +; SSSE3-NEXT: movzbl %al, %esi +; SSSE3-NEXT: movzbl %cl, %ebp +; SSSE3-NEXT: movzbl %r10b, %edx +; SSSE3-NEXT: movzbl %r11b, %ebx +; SSSE3-NEXT: movzbl %r12b, %r10d +; SSSE3-NEXT: movzbl %r13b, %r11d +; SSSE3-NEXT: movzbl %r8b, %r8d +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSSE3-NEXT: addb -{{[0-9]+}}(%rsp), %al +; SSSE3-NEXT: movb $-1, %cl +; SSSE3-NEXT: jb .LBB15_24 +; SSSE3-NEXT: # %bb.23: +; SSSE3-NEXT: movl %eax, %ecx +; SSSE3-NEXT: .LBB15_24: +; SSSE3-NEXT: movd %edi, %xmm2 +; SSSE3-NEXT: movd %esi, %xmm3 +; SSSE3-NEXT: movd %ebp, %xmm5 +; SSSE3-NEXT: movd %edx, %xmm0 +; SSSE3-NEXT: movd %ebx, %xmm6 +; SSSE3-NEXT: movd %r10d, %xmm4 +; SSSE3-NEXT: movd %r11d, %xmm7 +; SSSE3-NEXT: movd %r8d, %xmm1 +; SSSE3-NEXT: movzbl %r14b, %esi +; SSSE3-NEXT: movzbl %r15b, %edx +; SSSE3-NEXT: movzbl %r9b, %eax +; SSSE3-NEXT: movzbl %cl, %edi +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSSE3-NEXT: addb -{{[0-9]+}}(%rsp), %cl +; SSSE3-NEXT: movb $-1, %bl +; SSSE3-NEXT: jb .LBB15_26 +; SSSE3-NEXT: # %bb.25: +; SSSE3-NEXT: movl %ecx, %ebx +; SSSE3-NEXT: .LBB15_26: +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3],xmm0[4],xmm5[4],xmm0[5],xmm5[5],xmm0[6],xmm5[6],xmm0[7],xmm5[7] +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1],xmm4[2],xmm6[2],xmm4[3],xmm6[3],xmm4[4],xmm6[4],xmm4[5],xmm6[5],xmm4[6],xmm6[6],xmm4[7],xmm6[7] +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm7[0],xmm1[1],xmm7[1],xmm1[2],xmm7[2],xmm1[3],xmm7[3],xmm1[4],xmm7[4],xmm1[5],xmm7[5],xmm1[6],xmm7[6],xmm1[7],xmm7[7] +; SSSE3-NEXT: movd %esi, %xmm6 +; SSSE3-NEXT: movd %edx, %xmm5 +; SSSE3-NEXT: movd %eax, %xmm7 +; SSSE3-NEXT: movd %edi, %xmm2 +; SSSE3-NEXT: movzbl %bl, %eax +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %dl +; SSSE3-NEXT: addb -{{[0-9]+}}(%rsp), %dl +; SSSE3-NEXT: movb $-1, %cl +; SSSE3-NEXT: jb .LBB15_28 +; SSSE3-NEXT: # %bb.27: +; SSSE3-NEXT: movl %edx, %ecx +; SSSE3-NEXT: 
.LBB15_28: +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3] +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3],xmm5[4],xmm6[4],xmm5[5],xmm6[5],xmm5[6],xmm6[6],xmm5[7],xmm6[7] +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm7[0],xmm2[1],xmm7[1],xmm2[2],xmm7[2],xmm2[3],xmm7[3],xmm2[4],xmm7[4],xmm2[5],xmm7[5],xmm2[6],xmm7[6],xmm2[7],xmm7[7] +; SSSE3-NEXT: movd %eax, %xmm4 +; SSSE3-NEXT: movzbl %cl, %eax +; SSSE3-NEXT: movd %eax, %xmm3 +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %dl +; SSSE3-NEXT: addb -{{[0-9]+}}(%rsp), %dl +; SSSE3-NEXT: movb $-1, %cl +; SSSE3-NEXT: jb .LBB15_30 +; SSSE3-NEXT: # %bb.29: +; SSSE3-NEXT: movl %edx, %ecx +; SSSE3-NEXT: .LBB15_30: +; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3] +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3],xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] +; SSSE3-NEXT: movzbl %cl, %ecx +; SSSE3-NEXT: movd %ecx, %xmm4 +; SSSE3-NEXT: addb -{{[0-9]+}}(%rsp), %al +; SSSE3-NEXT: movb $-1, %cl +; SSSE3-NEXT: jb .LBB15_32 +; SSSE3-NEXT: # %bb.31: +; SSSE3-NEXT: movl %eax, %ecx +; SSSE3-NEXT: .LBB15_32: +; SSSE3-NEXT: movzbl %cl, %eax +; SSSE3-NEXT: movd %eax, %xmm0 +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7] +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] +; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSSE3-NEXT: psrlw $4, %xmm0 +; SSSE3-NEXT: pand {{.*}}(%rip), %xmm0 +; SSSE3-NEXT: popq %rbx +; SSSE3-NEXT: popq %r12 +; SSSE3-NEXT: popq %r13 +; SSSE3-NEXT: popq %r14 +; SSSE3-NEXT: popq %r15 +; SSSE3-NEXT: popq %rbp +; SSSE3-NEXT: retq +; +; SSE41-LABEL: v16i4: +; SSE41: # %bb.0: +; SSE41-NEXT: psllw $4, %xmm1 +; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240] +; SSE41-NEXT: pand %xmm2, %xmm1 +; SSE41-NEXT: pextrb $1, %xmm1, %eax +; SSE41-NEXT: psllw $4, %xmm0 +; SSE41-NEXT: pand %xmm2, %xmm0 +; SSE41-NEXT: pextrb $1, %xmm0, %ecx +; SSE41-NEXT: addb %al, %cl +; SSE41-NEXT: movb $-1, %sil +; SSE41-NEXT: movb $-1, %dl +; SSE41-NEXT: jb .LBB15_2 +; SSE41-NEXT: # %bb.1: +; SSE41-NEXT: movl %ecx, %edx +; SSE41-NEXT: .LBB15_2: +; SSE41-NEXT: movzbl %dl, %ecx +; SSE41-NEXT: pextrb $0, %xmm1, %eax +; SSE41-NEXT: pextrb $0, %xmm0, %edx +; SSE41-NEXT: addb %al, %dl +; SSE41-NEXT: movb $-1, %al +; SSE41-NEXT: jb .LBB15_4 +; SSE41-NEXT: # %bb.3: +; SSE41-NEXT: movl %edx, %eax +; SSE41-NEXT: .LBB15_4: +; SSE41-NEXT: movzbl %al, %eax +; SSE41-NEXT: movd %eax, %xmm2 +; SSE41-NEXT: pinsrb $1, %ecx, %xmm2 +; SSE41-NEXT: pextrb $2, %xmm1, %eax +; SSE41-NEXT: pextrb $2, %xmm0, %ecx +; SSE41-NEXT: addb %al, %cl +; SSE41-NEXT: movb $-1, %al +; SSE41-NEXT: jb .LBB15_6 +; SSE41-NEXT: # %bb.5: +; SSE41-NEXT: movl %ecx, %eax +; SSE41-NEXT: .LBB15_6: +; SSE41-NEXT: movzbl %al, %eax +; SSE41-NEXT: pinsrb $2, %eax, %xmm2 +; SSE41-NEXT: pextrb $3, %xmm1, %eax +; SSE41-NEXT: pextrb $3, %xmm0, %ecx +; SSE41-NEXT: 
addb %al, %cl +; SSE41-NEXT: movb $-1, %al +; SSE41-NEXT: jb .LBB15_8 +; SSE41-NEXT: # %bb.7: +; SSE41-NEXT: movl %ecx, %eax +; SSE41-NEXT: .LBB15_8: +; SSE41-NEXT: movzbl %al, %eax +; SSE41-NEXT: pinsrb $3, %eax, %xmm2 +; SSE41-NEXT: pextrb $4, %xmm1, %eax +; SSE41-NEXT: pextrb $4, %xmm0, %ecx +; SSE41-NEXT: addb %al, %cl +; SSE41-NEXT: movb $-1, %al +; SSE41-NEXT: jb .LBB15_10 +; SSE41-NEXT: # %bb.9: +; SSE41-NEXT: movl %ecx, %eax +; SSE41-NEXT: .LBB15_10: +; SSE41-NEXT: movzbl %al, %eax +; SSE41-NEXT: pinsrb $4, %eax, %xmm2 +; SSE41-NEXT: pextrb $5, %xmm1, %eax +; SSE41-NEXT: pextrb $5, %xmm0, %ecx +; SSE41-NEXT: addb %al, %cl +; SSE41-NEXT: movb $-1, %al +; SSE41-NEXT: jb .LBB15_12 +; SSE41-NEXT: # %bb.11: +; SSE41-NEXT: movl %ecx, %eax +; SSE41-NEXT: .LBB15_12: +; SSE41-NEXT: movzbl %al, %eax +; SSE41-NEXT: pinsrb $5, %eax, %xmm2 +; SSE41-NEXT: pextrb $6, %xmm1, %eax +; SSE41-NEXT: pextrb $6, %xmm0, %ecx +; SSE41-NEXT: addb %al, %cl +; SSE41-NEXT: movb $-1, %al +; SSE41-NEXT: jb .LBB15_14 +; SSE41-NEXT: # %bb.13: +; SSE41-NEXT: movl %ecx, %eax +; SSE41-NEXT: .LBB15_14: +; SSE41-NEXT: movzbl %al, %eax +; SSE41-NEXT: pinsrb $6, %eax, %xmm2 +; SSE41-NEXT: pextrb $7, %xmm1, %eax +; SSE41-NEXT: pextrb $7, %xmm0, %ecx +; SSE41-NEXT: addb %al, %cl +; SSE41-NEXT: movb $-1, %al +; SSE41-NEXT: jb .LBB15_16 +; SSE41-NEXT: # %bb.15: +; SSE41-NEXT: movl %ecx, %eax +; SSE41-NEXT: .LBB15_16: +; SSE41-NEXT: movzbl %al, %eax +; SSE41-NEXT: pinsrb $7, %eax, %xmm2 +; SSE41-NEXT: pextrb $8, %xmm1, %eax +; SSE41-NEXT: pextrb $8, %xmm0, %ecx +; SSE41-NEXT: addb %al, %cl +; SSE41-NEXT: movb $-1, %al +; SSE41-NEXT: jb .LBB15_18 +; SSE41-NEXT: # %bb.17: +; SSE41-NEXT: movl %ecx, %eax +; SSE41-NEXT: .LBB15_18: +; SSE41-NEXT: movzbl %al, %eax +; SSE41-NEXT: pinsrb $8, %eax, %xmm2 +; SSE41-NEXT: pextrb $9, %xmm1, %eax +; SSE41-NEXT: pextrb $9, %xmm0, %ecx +; SSE41-NEXT: addb %al, %cl +; SSE41-NEXT: movb $-1, %al +; SSE41-NEXT: jb .LBB15_20 +; SSE41-NEXT: # %bb.19: +; SSE41-NEXT: movl %ecx, %eax +; SSE41-NEXT: .LBB15_20: +; SSE41-NEXT: movzbl %al, %eax +; SSE41-NEXT: pinsrb $9, %eax, %xmm2 +; SSE41-NEXT: pextrb $10, %xmm1, %eax +; SSE41-NEXT: pextrb $10, %xmm0, %ecx +; SSE41-NEXT: addb %al, %cl +; SSE41-NEXT: movb $-1, %al +; SSE41-NEXT: jb .LBB15_22 +; SSE41-NEXT: # %bb.21: +; SSE41-NEXT: movl %ecx, %eax +; SSE41-NEXT: .LBB15_22: +; SSE41-NEXT: movzbl %al, %eax +; SSE41-NEXT: pinsrb $10, %eax, %xmm2 +; SSE41-NEXT: pextrb $11, %xmm1, %eax +; SSE41-NEXT: pextrb $11, %xmm0, %ecx +; SSE41-NEXT: addb %al, %cl +; SSE41-NEXT: movb $-1, %al +; SSE41-NEXT: jb .LBB15_24 +; SSE41-NEXT: # %bb.23: +; SSE41-NEXT: movl %ecx, %eax +; SSE41-NEXT: .LBB15_24: +; SSE41-NEXT: movzbl %al, %eax +; SSE41-NEXT: pinsrb $11, %eax, %xmm2 +; SSE41-NEXT: pextrb $12, %xmm1, %eax +; SSE41-NEXT: pextrb $12, %xmm0, %ecx +; SSE41-NEXT: addb %al, %cl +; SSE41-NEXT: movb $-1, %al +; SSE41-NEXT: jb .LBB15_26 +; SSE41-NEXT: # %bb.25: +; SSE41-NEXT: movl %ecx, %eax +; SSE41-NEXT: .LBB15_26: +; SSE41-NEXT: movzbl %al, %eax +; SSE41-NEXT: pinsrb $12, %eax, %xmm2 +; SSE41-NEXT: pextrb $13, %xmm1, %eax +; SSE41-NEXT: pextrb $13, %xmm0, %ecx +; SSE41-NEXT: addb %al, %cl +; SSE41-NEXT: movb $-1, %al +; SSE41-NEXT: jb .LBB15_28 +; SSE41-NEXT: # %bb.27: +; SSE41-NEXT: movl %ecx, %eax +; SSE41-NEXT: .LBB15_28: +; SSE41-NEXT: movzbl %al, %eax +; SSE41-NEXT: pinsrb $13, %eax, %xmm2 +; SSE41-NEXT: pextrb $14, %xmm1, %eax +; SSE41-NEXT: pextrb $14, %xmm0, %ecx +; SSE41-NEXT: addb %al, %cl +; SSE41-NEXT: movb $-1, %al +; SSE41-NEXT: jb .LBB15_30 +; SSE41-NEXT: # 
%bb.29: +; SSE41-NEXT: movl %ecx, %eax +; SSE41-NEXT: .LBB15_30: +; SSE41-NEXT: movzbl %al, %eax +; SSE41-NEXT: pinsrb $14, %eax, %xmm2 +; SSE41-NEXT: pextrb $15, %xmm1, %eax +; SSE41-NEXT: pextrb $15, %xmm0, %ecx +; SSE41-NEXT: addb %al, %cl +; SSE41-NEXT: jb .LBB15_32 +; SSE41-NEXT: # %bb.31: +; SSE41-NEXT: movl %ecx, %esi +; SSE41-NEXT: .LBB15_32: +; SSE41-NEXT: movzbl %sil, %eax +; SSE41-NEXT: pinsrb $15, %eax, %xmm2 +; SSE41-NEXT: psrlw $4, %xmm2 +; SSE41-NEXT: pand {{.*}}(%rip), %xmm2 +; SSE41-NEXT: movdqa %xmm2, %xmm0 +; SSE41-NEXT: retq +; +; AVX-LABEL: v16i4: +; AVX: # %bb.0: +; AVX-NEXT: vpsllw $4, %xmm1, %xmm1 +; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240] +; AVX-NEXT: vpand %xmm2, %xmm1, %xmm1 +; AVX-NEXT: vpextrb $1, %xmm1, %eax +; AVX-NEXT: vpsllw $4, %xmm0, %xmm0 +; AVX-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX-NEXT: vpextrb $1, %xmm0, %ecx +; AVX-NEXT: addb %al, %cl +; AVX-NEXT: movb $-1, %sil +; AVX-NEXT: movb $-1, %dl +; AVX-NEXT: jb .LBB15_2 +; AVX-NEXT: # %bb.1: +; AVX-NEXT: movl %ecx, %edx +; AVX-NEXT: .LBB15_2: +; AVX-NEXT: movzbl %dl, %ecx +; AVX-NEXT: vpextrb $0, %xmm1, %eax +; AVX-NEXT: vpextrb $0, %xmm0, %edx +; AVX-NEXT: addb %al, %dl +; AVX-NEXT: movb $-1, %al +; AVX-NEXT: jb .LBB15_4 +; AVX-NEXT: # %bb.3: +; AVX-NEXT: movl %edx, %eax +; AVX-NEXT: .LBB15_4: +; AVX-NEXT: movzbl %al, %eax +; AVX-NEXT: vmovd %eax, %xmm2 +; AVX-NEXT: vpinsrb $1, %ecx, %xmm2, %xmm2 +; AVX-NEXT: vpextrb $2, %xmm1, %eax +; AVX-NEXT: vpextrb $2, %xmm0, %ecx +; AVX-NEXT: addb %al, %cl +; AVX-NEXT: movb $-1, %al +; AVX-NEXT: jb .LBB15_6 +; AVX-NEXT: # %bb.5: +; AVX-NEXT: movl %ecx, %eax +; AVX-NEXT: .LBB15_6: +; AVX-NEXT: movzbl %al, %eax +; AVX-NEXT: vpinsrb $2, %eax, %xmm2, %xmm2 +; AVX-NEXT: vpextrb $3, %xmm1, %eax +; AVX-NEXT: vpextrb $3, %xmm0, %ecx +; AVX-NEXT: addb %al, %cl +; AVX-NEXT: movb $-1, %al +; AVX-NEXT: jb .LBB15_8 +; AVX-NEXT: # %bb.7: +; AVX-NEXT: movl %ecx, %eax +; AVX-NEXT: .LBB15_8: +; AVX-NEXT: movzbl %al, %eax +; AVX-NEXT: vpinsrb $3, %eax, %xmm2, %xmm2 +; AVX-NEXT: vpextrb $4, %xmm1, %eax +; AVX-NEXT: vpextrb $4, %xmm0, %ecx +; AVX-NEXT: addb %al, %cl +; AVX-NEXT: movb $-1, %al +; AVX-NEXT: jb .LBB15_10 +; AVX-NEXT: # %bb.9: +; AVX-NEXT: movl %ecx, %eax +; AVX-NEXT: .LBB15_10: +; AVX-NEXT: movzbl %al, %eax +; AVX-NEXT: vpinsrb $4, %eax, %xmm2, %xmm2 +; AVX-NEXT: vpextrb $5, %xmm1, %eax +; AVX-NEXT: vpextrb $5, %xmm0, %ecx +; AVX-NEXT: addb %al, %cl +; AVX-NEXT: movb $-1, %al +; AVX-NEXT: jb .LBB15_12 +; AVX-NEXT: # %bb.11: +; AVX-NEXT: movl %ecx, %eax +; AVX-NEXT: .LBB15_12: +; AVX-NEXT: movzbl %al, %eax +; AVX-NEXT: vpinsrb $5, %eax, %xmm2, %xmm2 +; AVX-NEXT: vpextrb $6, %xmm1, %eax +; AVX-NEXT: vpextrb $6, %xmm0, %ecx +; AVX-NEXT: addb %al, %cl +; AVX-NEXT: movb $-1, %al +; AVX-NEXT: jb .LBB15_14 +; AVX-NEXT: # %bb.13: +; AVX-NEXT: movl %ecx, %eax +; AVX-NEXT: .LBB15_14: +; AVX-NEXT: movzbl %al, %eax +; AVX-NEXT: vpinsrb $6, %eax, %xmm2, %xmm2 +; AVX-NEXT: vpextrb $7, %xmm1, %eax +; AVX-NEXT: vpextrb $7, %xmm0, %ecx +; AVX-NEXT: addb %al, %cl +; AVX-NEXT: movb $-1, %al +; AVX-NEXT: jb .LBB15_16 +; AVX-NEXT: # %bb.15: +; AVX-NEXT: movl %ecx, %eax +; AVX-NEXT: .LBB15_16: +; AVX-NEXT: movzbl %al, %eax +; AVX-NEXT: vpinsrb $7, %eax, %xmm2, %xmm2 +; AVX-NEXT: vpextrb $8, %xmm1, %eax +; AVX-NEXT: vpextrb $8, %xmm0, %ecx +; AVX-NEXT: addb %al, %cl +; AVX-NEXT: movb $-1, %al +; AVX-NEXT: jb .LBB15_18 +; AVX-NEXT: # %bb.17: +; AVX-NEXT: movl %ecx, %eax +; AVX-NEXT: .LBB15_18: +; AVX-NEXT: movzbl %al, %eax +; AVX-NEXT: 
vpinsrb $8, %eax, %xmm2, %xmm2 +; AVX-NEXT: vpextrb $9, %xmm1, %eax +; AVX-NEXT: vpextrb $9, %xmm0, %ecx +; AVX-NEXT: addb %al, %cl +; AVX-NEXT: movb $-1, %al +; AVX-NEXT: jb .LBB15_20 +; AVX-NEXT: # %bb.19: +; AVX-NEXT: movl %ecx, %eax +; AVX-NEXT: .LBB15_20: +; AVX-NEXT: movzbl %al, %eax +; AVX-NEXT: vpinsrb $9, %eax, %xmm2, %xmm2 +; AVX-NEXT: vpextrb $10, %xmm1, %eax +; AVX-NEXT: vpextrb $10, %xmm0, %ecx +; AVX-NEXT: addb %al, %cl +; AVX-NEXT: movb $-1, %al +; AVX-NEXT: jb .LBB15_22 +; AVX-NEXT: # %bb.21: +; AVX-NEXT: movl %ecx, %eax +; AVX-NEXT: .LBB15_22: +; AVX-NEXT: movzbl %al, %eax +; AVX-NEXT: vpinsrb $10, %eax, %xmm2, %xmm2 +; AVX-NEXT: vpextrb $11, %xmm1, %eax +; AVX-NEXT: vpextrb $11, %xmm0, %ecx +; AVX-NEXT: addb %al, %cl +; AVX-NEXT: movb $-1, %al +; AVX-NEXT: jb .LBB15_24 +; AVX-NEXT: # %bb.23: +; AVX-NEXT: movl %ecx, %eax +; AVX-NEXT: .LBB15_24: +; AVX-NEXT: movzbl %al, %eax +; AVX-NEXT: vpinsrb $11, %eax, %xmm2, %xmm2 +; AVX-NEXT: vpextrb $12, %xmm1, %eax +; AVX-NEXT: vpextrb $12, %xmm0, %ecx +; AVX-NEXT: addb %al, %cl +; AVX-NEXT: movb $-1, %al +; AVX-NEXT: jb .LBB15_26 +; AVX-NEXT: # %bb.25: +; AVX-NEXT: movl %ecx, %eax +; AVX-NEXT: .LBB15_26: +; AVX-NEXT: movzbl %al, %eax +; AVX-NEXT: vpinsrb $12, %eax, %xmm2, %xmm2 +; AVX-NEXT: vpextrb $13, %xmm1, %eax +; AVX-NEXT: vpextrb $13, %xmm0, %ecx +; AVX-NEXT: addb %al, %cl +; AVX-NEXT: movb $-1, %al +; AVX-NEXT: jb .LBB15_28 +; AVX-NEXT: # %bb.27: +; AVX-NEXT: movl %ecx, %eax +; AVX-NEXT: .LBB15_28: +; AVX-NEXT: movzbl %al, %eax +; AVX-NEXT: vpinsrb $13, %eax, %xmm2, %xmm2 +; AVX-NEXT: vpextrb $14, %xmm1, %eax +; AVX-NEXT: vpextrb $14, %xmm0, %ecx +; AVX-NEXT: addb %al, %cl +; AVX-NEXT: movb $-1, %al +; AVX-NEXT: jb .LBB15_30 +; AVX-NEXT: # %bb.29: +; AVX-NEXT: movl %ecx, %eax +; AVX-NEXT: .LBB15_30: +; AVX-NEXT: movzbl %al, %eax +; AVX-NEXT: vpinsrb $14, %eax, %xmm2, %xmm2 +; AVX-NEXT: vpextrb $15, %xmm1, %eax +; AVX-NEXT: vpextrb $15, %xmm0, %ecx +; AVX-NEXT: addb %al, %cl +; AVX-NEXT: jb .LBB15_32 +; AVX-NEXT: # %bb.31: +; AVX-NEXT: movl %ecx, %esi +; AVX-NEXT: .LBB15_32: +; AVX-NEXT: movzbl %sil, %eax +; AVX-NEXT: vpinsrb $15, %eax, %xmm2, %xmm0 +; AVX-NEXT: vpsrlw $4, %xmm0, %xmm0 +; AVX-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 +; AVX-NEXT: retq + %z = call <16 x i4> @llvm.uadd.sat.v16i4(<16 x i4> %x, <16 x i4> %y) + ret <16 x i4> %z +} + +define <16 x i1> @v16i1(<16 x i1> %x, <16 x i1> %y) nounwind { +; SSE2-LABEL: v16i1: +; SSE2: # %bb.0: +; SSE2-NEXT: pushq %rbp +; SSE2-NEXT: pushq %r15 +; SSE2-NEXT: pushq %r14 +; SSE2-NEXT: pushq %r13 +; SSE2-NEXT: pushq %r12 +; SSE2-NEXT: pushq %rbx +; SSE2-NEXT: psllw $7, %xmm0 +; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128] +; SSE2-NEXT: pand %xmm2, %xmm0 +; SSE2-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp) +; SSE2-NEXT: psllw $7, %xmm1 +; SSE2-NEXT: pand %xmm2, %xmm1 +; SSE2-NEXT: movdqa %xmm1, -{{[0-9]+}}(%rsp) +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSE2-NEXT: addb -{{[0-9]+}}(%rsp), %al +; SSE2-NEXT: movb $-1, %dil +; SSE2-NEXT: jb .LBB16_2 +; SSE2-NEXT: # %bb.1: +; SSE2-NEXT: movl %eax, %edi +; SSE2-NEXT: .LBB16_2: +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSE2-NEXT: addb -{{[0-9]+}}(%rsp), %cl +; SSE2-NEXT: movb $-1, %al +; SSE2-NEXT: jb .LBB16_4 +; SSE2-NEXT: # %bb.3: +; SSE2-NEXT: movl %ecx, %eax +; SSE2-NEXT: .LBB16_4: +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %dl +; SSE2-NEXT: addb -{{[0-9]+}}(%rsp), %dl +; SSE2-NEXT: movb $-1, %cl +; SSE2-NEXT: jb .LBB16_6 +; SSE2-NEXT: # %bb.5: +; SSE2-NEXT: movl %edx, %ecx +; 
SSE2-NEXT: .LBB16_6: +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %dl +; SSE2-NEXT: addb -{{[0-9]+}}(%rsp), %dl +; SSE2-NEXT: movb $-1, %r10b +; SSE2-NEXT: jb .LBB16_8 +; SSE2-NEXT: # %bb.7: +; SSE2-NEXT: movl %edx, %r10d +; SSE2-NEXT: .LBB16_8: +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %dl +; SSE2-NEXT: addb -{{[0-9]+}}(%rsp), %dl +; SSE2-NEXT: movb $-1, %r11b +; SSE2-NEXT: jb .LBB16_10 +; SSE2-NEXT: # %bb.9: +; SSE2-NEXT: movl %edx, %r11d +; SSE2-NEXT: .LBB16_10: +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %dl +; SSE2-NEXT: addb -{{[0-9]+}}(%rsp), %dl +; SSE2-NEXT: movb $-1, %r12b +; SSE2-NEXT: jb .LBB16_12 +; SSE2-NEXT: # %bb.11: +; SSE2-NEXT: movl %edx, %r12d +; SSE2-NEXT: .LBB16_12: +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %dl +; SSE2-NEXT: addb -{{[0-9]+}}(%rsp), %dl +; SSE2-NEXT: movb $-1, %r13b +; SSE2-NEXT: jb .LBB16_14 +; SSE2-NEXT: # %bb.13: +; SSE2-NEXT: movl %edx, %r13d +; SSE2-NEXT: .LBB16_14: +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %dl +; SSE2-NEXT: addb -{{[0-9]+}}(%rsp), %dl +; SSE2-NEXT: movb $-1, %r8b +; SSE2-NEXT: jb .LBB16_16 +; SSE2-NEXT: # %bb.15: +; SSE2-NEXT: movl %edx, %r8d +; SSE2-NEXT: .LBB16_16: +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %dl +; SSE2-NEXT: addb -{{[0-9]+}}(%rsp), %dl +; SSE2-NEXT: movb $-1, %r14b +; SSE2-NEXT: jb .LBB16_18 +; SSE2-NEXT: # %bb.17: +; SSE2-NEXT: movl %edx, %r14d +; SSE2-NEXT: .LBB16_18: +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %dl +; SSE2-NEXT: addb -{{[0-9]+}}(%rsp), %dl +; SSE2-NEXT: movb $-1, %r15b +; SSE2-NEXT: jb .LBB16_20 +; SSE2-NEXT: # %bb.19: +; SSE2-NEXT: movl %edx, %r15d +; SSE2-NEXT: .LBB16_20: +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %dl +; SSE2-NEXT: addb -{{[0-9]+}}(%rsp), %dl +; SSE2-NEXT: movb $-1, %r9b +; SSE2-NEXT: jb .LBB16_22 +; SSE2-NEXT: # %bb.21: +; SSE2-NEXT: movl %edx, %r9d +; SSE2-NEXT: .LBB16_22: +; SSE2-NEXT: movzbl %dil, %edi +; SSE2-NEXT: movzbl %al, %esi +; SSE2-NEXT: movzbl %cl, %ebp +; SSE2-NEXT: movzbl %r10b, %edx +; SSE2-NEXT: movzbl %r11b, %ebx +; SSE2-NEXT: movzbl %r12b, %r10d +; SSE2-NEXT: movzbl %r13b, %r11d +; SSE2-NEXT: movzbl %r8b, %r8d +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSE2-NEXT: addb -{{[0-9]+}}(%rsp), %al +; SSE2-NEXT: movb $-1, %cl +; SSE2-NEXT: jb .LBB16_24 +; SSE2-NEXT: # %bb.23: +; SSE2-NEXT: movl %eax, %ecx +; SSE2-NEXT: .LBB16_24: +; SSE2-NEXT: movd %edi, %xmm2 +; SSE2-NEXT: movd %esi, %xmm3 +; SSE2-NEXT: movd %ebp, %xmm5 +; SSE2-NEXT: movd %edx, %xmm0 +; SSE2-NEXT: movd %ebx, %xmm6 +; SSE2-NEXT: movd %r10d, %xmm4 +; SSE2-NEXT: movd %r11d, %xmm7 +; SSE2-NEXT: movd %r8d, %xmm1 +; SSE2-NEXT: movzbl %r14b, %esi +; SSE2-NEXT: movzbl %r15b, %edx +; SSE2-NEXT: movzbl %r9b, %eax +; SSE2-NEXT: movzbl %cl, %edi +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSE2-NEXT: addb -{{[0-9]+}}(%rsp), %cl +; SSE2-NEXT: movb $-1, %bl +; SSE2-NEXT: jb .LBB16_26 +; SSE2-NEXT: # %bb.25: +; SSE2-NEXT: movl %ecx, %ebx +; SSE2-NEXT: .LBB16_26: +; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3],xmm0[4],xmm5[4],xmm0[5],xmm5[5],xmm0[6],xmm5[6],xmm0[7],xmm5[7] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1],xmm4[2],xmm6[2],xmm4[3],xmm6[3],xmm4[4],xmm6[4],xmm4[5],xmm6[5],xmm4[6],xmm6[6],xmm4[7],xmm6[7] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm7[0],xmm1[1],xmm7[1],xmm1[2],xmm7[2],xmm1[3],xmm7[3],xmm1[4],xmm7[4],xmm1[5],xmm7[5],xmm1[6],xmm7[6],xmm1[7],xmm7[7] +; SSE2-NEXT: movd %esi, 
%xmm6 +; SSE2-NEXT: movd %edx, %xmm5 +; SSE2-NEXT: movd %eax, %xmm7 +; SSE2-NEXT: movd %edi, %xmm2 +; SSE2-NEXT: movzbl %bl, %eax +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %dl +; SSE2-NEXT: addb -{{[0-9]+}}(%rsp), %dl +; SSE2-NEXT: movb $-1, %cl +; SSE2-NEXT: jb .LBB16_28 +; SSE2-NEXT: # %bb.27: +; SSE2-NEXT: movl %edx, %ecx +; SSE2-NEXT: .LBB16_28: +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3],xmm5[4],xmm6[4],xmm5[5],xmm6[5],xmm5[6],xmm6[6],xmm5[7],xmm6[7] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm7[0],xmm2[1],xmm7[1],xmm2[2],xmm7[2],xmm2[3],xmm7[3],xmm2[4],xmm7[4],xmm2[5],xmm7[5],xmm2[6],xmm7[6],xmm2[7],xmm7[7] +; SSE2-NEXT: movd %eax, %xmm4 +; SSE2-NEXT: movzbl %cl, %eax +; SSE2-NEXT: movd %eax, %xmm3 +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %dl +; SSE2-NEXT: addb -{{[0-9]+}}(%rsp), %dl +; SSE2-NEXT: movb $-1, %cl +; SSE2-NEXT: jb .LBB16_30 +; SSE2-NEXT: # %bb.29: +; SSE2-NEXT: movl %edx, %ecx +; SSE2-NEXT: .LBB16_30: +; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3],xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] +; SSE2-NEXT: movzbl %cl, %ecx +; SSE2-NEXT: movd %ecx, %xmm4 +; SSE2-NEXT: addb -{{[0-9]+}}(%rsp), %al +; SSE2-NEXT: movb $-1, %cl +; SSE2-NEXT: jb .LBB16_32 +; SSE2-NEXT: # %bb.31: +; SSE2-NEXT: movl %eax, %ecx +; SSE2-NEXT: .LBB16_32: +; SSE2-NEXT: movzbl %cl, %eax +; SSE2-NEXT: movd %eax, %xmm0 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSE2-NEXT: psrlw $7, %xmm0 +; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 +; SSE2-NEXT: popq %rbx +; SSE2-NEXT: popq %r12 +; SSE2-NEXT: popq %r13 +; SSE2-NEXT: popq %r14 +; SSE2-NEXT: popq %r15 +; SSE2-NEXT: popq %rbp +; SSE2-NEXT: retq +; +; SSSE3-LABEL: v16i1: +; SSSE3: # %bb.0: +; SSSE3-NEXT: pushq %rbp +; SSSE3-NEXT: pushq %r15 +; SSSE3-NEXT: pushq %r14 +; SSSE3-NEXT: pushq %r13 +; SSSE3-NEXT: pushq %r12 +; SSSE3-NEXT: pushq %rbx +; SSSE3-NEXT: psllw $7, %xmm0 +; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128] +; SSSE3-NEXT: pand %xmm2, %xmm0 +; SSSE3-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp) +; SSSE3-NEXT: psllw $7, %xmm1 +; SSSE3-NEXT: pand %xmm2, %xmm1 +; SSSE3-NEXT: movdqa %xmm1, -{{[0-9]+}}(%rsp) +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSSE3-NEXT: addb -{{[0-9]+}}(%rsp), %al +; SSSE3-NEXT: movb $-1, %dil +; SSSE3-NEXT: jb .LBB16_2 +; SSSE3-NEXT: # %bb.1: +; SSSE3-NEXT: movl %eax, %edi +; SSSE3-NEXT: .LBB16_2: +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSSE3-NEXT: addb -{{[0-9]+}}(%rsp), %cl +; SSSE3-NEXT: movb $-1, %al +; SSSE3-NEXT: jb .LBB16_4 +; SSSE3-NEXT: # %bb.3: +; SSSE3-NEXT: movl %ecx, %eax +; SSSE3-NEXT: .LBB16_4: +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %dl +; SSSE3-NEXT: 
addb -{{[0-9]+}}(%rsp), %dl +; SSSE3-NEXT: movb $-1, %cl +; SSSE3-NEXT: jb .LBB16_6 +; SSSE3-NEXT: # %bb.5: +; SSSE3-NEXT: movl %edx, %ecx +; SSSE3-NEXT: .LBB16_6: +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %dl +; SSSE3-NEXT: addb -{{[0-9]+}}(%rsp), %dl +; SSSE3-NEXT: movb $-1, %r10b +; SSSE3-NEXT: jb .LBB16_8 +; SSSE3-NEXT: # %bb.7: +; SSSE3-NEXT: movl %edx, %r10d +; SSSE3-NEXT: .LBB16_8: +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %dl +; SSSE3-NEXT: addb -{{[0-9]+}}(%rsp), %dl +; SSSE3-NEXT: movb $-1, %r11b +; SSSE3-NEXT: jb .LBB16_10 +; SSSE3-NEXT: # %bb.9: +; SSSE3-NEXT: movl %edx, %r11d +; SSSE3-NEXT: .LBB16_10: +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %dl +; SSSE3-NEXT: addb -{{[0-9]+}}(%rsp), %dl +; SSSE3-NEXT: movb $-1, %r12b +; SSSE3-NEXT: jb .LBB16_12 +; SSSE3-NEXT: # %bb.11: +; SSSE3-NEXT: movl %edx, %r12d +; SSSE3-NEXT: .LBB16_12: +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %dl +; SSSE3-NEXT: addb -{{[0-9]+}}(%rsp), %dl +; SSSE3-NEXT: movb $-1, %r13b +; SSSE3-NEXT: jb .LBB16_14 +; SSSE3-NEXT: # %bb.13: +; SSSE3-NEXT: movl %edx, %r13d +; SSSE3-NEXT: .LBB16_14: +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %dl +; SSSE3-NEXT: addb -{{[0-9]+}}(%rsp), %dl +; SSSE3-NEXT: movb $-1, %r8b +; SSSE3-NEXT: jb .LBB16_16 +; SSSE3-NEXT: # %bb.15: +; SSSE3-NEXT: movl %edx, %r8d +; SSSE3-NEXT: .LBB16_16: +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %dl +; SSSE3-NEXT: addb -{{[0-9]+}}(%rsp), %dl +; SSSE3-NEXT: movb $-1, %r14b +; SSSE3-NEXT: jb .LBB16_18 +; SSSE3-NEXT: # %bb.17: +; SSSE3-NEXT: movl %edx, %r14d +; SSSE3-NEXT: .LBB16_18: +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %dl +; SSSE3-NEXT: addb -{{[0-9]+}}(%rsp), %dl +; SSSE3-NEXT: movb $-1, %r15b +; SSSE3-NEXT: jb .LBB16_20 +; SSSE3-NEXT: # %bb.19: +; SSSE3-NEXT: movl %edx, %r15d +; SSSE3-NEXT: .LBB16_20: +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %dl +; SSSE3-NEXT: addb -{{[0-9]+}}(%rsp), %dl +; SSSE3-NEXT: movb $-1, %r9b +; SSSE3-NEXT: jb .LBB16_22 +; SSSE3-NEXT: # %bb.21: +; SSSE3-NEXT: movl %edx, %r9d +; SSSE3-NEXT: .LBB16_22: +; SSSE3-NEXT: movzbl %dil, %edi +; SSSE3-NEXT: movzbl %al, %esi +; SSSE3-NEXT: movzbl %cl, %ebp +; SSSE3-NEXT: movzbl %r10b, %edx +; SSSE3-NEXT: movzbl %r11b, %ebx +; SSSE3-NEXT: movzbl %r12b, %r10d +; SSSE3-NEXT: movzbl %r13b, %r11d +; SSSE3-NEXT: movzbl %r8b, %r8d +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSSE3-NEXT: addb -{{[0-9]+}}(%rsp), %al +; SSSE3-NEXT: movb $-1, %cl +; SSSE3-NEXT: jb .LBB16_24 +; SSSE3-NEXT: # %bb.23: +; SSSE3-NEXT: movl %eax, %ecx +; SSSE3-NEXT: .LBB16_24: +; SSSE3-NEXT: movd %edi, %xmm2 +; SSSE3-NEXT: movd %esi, %xmm3 +; SSSE3-NEXT: movd %ebp, %xmm5 +; SSSE3-NEXT: movd %edx, %xmm0 +; SSSE3-NEXT: movd %ebx, %xmm6 +; SSSE3-NEXT: movd %r10d, %xmm4 +; SSSE3-NEXT: movd %r11d, %xmm7 +; SSSE3-NEXT: movd %r8d, %xmm1 +; SSSE3-NEXT: movzbl %r14b, %esi +; SSSE3-NEXT: movzbl %r15b, %edx +; SSSE3-NEXT: movzbl %r9b, %eax +; SSSE3-NEXT: movzbl %cl, %edi +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSSE3-NEXT: addb -{{[0-9]+}}(%rsp), %cl +; SSSE3-NEXT: movb $-1, %bl +; SSSE3-NEXT: jb .LBB16_26 +; SSSE3-NEXT: # %bb.25: +; SSSE3-NEXT: movl %ecx, %ebx +; SSSE3-NEXT: .LBB16_26: +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3],xmm0[4],xmm5[4],xmm0[5],xmm5[5],xmm0[6],xmm5[6],xmm0[7],xmm5[7] +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm4 = 
xmm4[0],xmm6[0],xmm4[1],xmm6[1],xmm4[2],xmm6[2],xmm4[3],xmm6[3],xmm4[4],xmm6[4],xmm4[5],xmm6[5],xmm4[6],xmm6[6],xmm4[7],xmm6[7] +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm7[0],xmm1[1],xmm7[1],xmm1[2],xmm7[2],xmm1[3],xmm7[3],xmm1[4],xmm7[4],xmm1[5],xmm7[5],xmm1[6],xmm7[6],xmm1[7],xmm7[7] +; SSSE3-NEXT: movd %esi, %xmm6 +; SSSE3-NEXT: movd %edx, %xmm5 +; SSSE3-NEXT: movd %eax, %xmm7 +; SSSE3-NEXT: movd %edi, %xmm2 +; SSSE3-NEXT: movzbl %bl, %eax +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %dl +; SSSE3-NEXT: addb -{{[0-9]+}}(%rsp), %dl +; SSSE3-NEXT: movb $-1, %cl +; SSSE3-NEXT: jb .LBB16_28 +; SSSE3-NEXT: # %bb.27: +; SSSE3-NEXT: movl %edx, %ecx +; SSSE3-NEXT: .LBB16_28: +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3] +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3],xmm5[4],xmm6[4],xmm5[5],xmm6[5],xmm5[6],xmm6[6],xmm5[7],xmm6[7] +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm7[0],xmm2[1],xmm7[1],xmm2[2],xmm7[2],xmm2[3],xmm7[3],xmm2[4],xmm7[4],xmm2[5],xmm7[5],xmm2[6],xmm7[6],xmm2[7],xmm7[7] +; SSSE3-NEXT: movd %eax, %xmm4 +; SSSE3-NEXT: movzbl %cl, %eax +; SSSE3-NEXT: movd %eax, %xmm3 +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %dl +; SSSE3-NEXT: addb -{{[0-9]+}}(%rsp), %dl +; SSSE3-NEXT: movb $-1, %cl +; SSSE3-NEXT: jb .LBB16_30 +; SSSE3-NEXT: # %bb.29: +; SSSE3-NEXT: movl %edx, %ecx +; SSSE3-NEXT: .LBB16_30: +; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3] +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3],xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] +; SSSE3-NEXT: movzbl %cl, %ecx +; SSSE3-NEXT: movd %ecx, %xmm4 +; SSSE3-NEXT: addb -{{[0-9]+}}(%rsp), %al +; SSSE3-NEXT: movb $-1, %cl +; SSSE3-NEXT: jb .LBB16_32 +; SSSE3-NEXT: # %bb.31: +; SSSE3-NEXT: movl %eax, %ecx +; SSSE3-NEXT: .LBB16_32: +; SSSE3-NEXT: movzbl %cl, %eax +; SSSE3-NEXT: movd %eax, %xmm0 +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7] +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] +; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSSE3-NEXT: psrlw $7, %xmm0 +; SSSE3-NEXT: pand {{.*}}(%rip), %xmm0 +; SSSE3-NEXT: popq %rbx +; SSSE3-NEXT: popq %r12 +; SSSE3-NEXT: popq %r13 +; SSSE3-NEXT: popq %r14 +; SSSE3-NEXT: popq %r15 +; SSSE3-NEXT: popq %rbp +; SSSE3-NEXT: retq +; +; SSE41-LABEL: v16i1: +; SSE41: # %bb.0: +; SSE41-NEXT: psllw $7, %xmm1 +; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128] +; SSE41-NEXT: pand %xmm2, %xmm1 +; SSE41-NEXT: pextrb $1, %xmm1, %eax +; SSE41-NEXT: psllw $7, %xmm0 +; SSE41-NEXT: pand %xmm2, %xmm0 +; SSE41-NEXT: pextrb $1, %xmm0, %ecx +; SSE41-NEXT: addb %al, %cl +; SSE41-NEXT: movb $-1, %sil +; SSE41-NEXT: movb $-1, %dl +; SSE41-NEXT: jb .LBB16_2 +; SSE41-NEXT: # %bb.1: +; SSE41-NEXT: movl %ecx, %edx +; SSE41-NEXT: .LBB16_2: +; SSE41-NEXT: movzbl %dl, %ecx +; SSE41-NEXT: pextrb $0, %xmm1, %eax +; SSE41-NEXT: pextrb $0, %xmm0, %edx 
+; SSE41-NEXT: addb %al, %dl +; SSE41-NEXT: movb $-1, %al +; SSE41-NEXT: jb .LBB16_4 +; SSE41-NEXT: # %bb.3: +; SSE41-NEXT: movl %edx, %eax +; SSE41-NEXT: .LBB16_4: +; SSE41-NEXT: movzbl %al, %eax +; SSE41-NEXT: movd %eax, %xmm2 +; SSE41-NEXT: pinsrb $1, %ecx, %xmm2 +; SSE41-NEXT: pextrb $2, %xmm1, %eax +; SSE41-NEXT: pextrb $2, %xmm0, %ecx +; SSE41-NEXT: addb %al, %cl +; SSE41-NEXT: movb $-1, %al +; SSE41-NEXT: jb .LBB16_6 +; SSE41-NEXT: # %bb.5: +; SSE41-NEXT: movl %ecx, %eax +; SSE41-NEXT: .LBB16_6: +; SSE41-NEXT: movzbl %al, %eax +; SSE41-NEXT: pinsrb $2, %eax, %xmm2 +; SSE41-NEXT: pextrb $3, %xmm1, %eax +; SSE41-NEXT: pextrb $3, %xmm0, %ecx +; SSE41-NEXT: addb %al, %cl +; SSE41-NEXT: movb $-1, %al +; SSE41-NEXT: jb .LBB16_8 +; SSE41-NEXT: # %bb.7: +; SSE41-NEXT: movl %ecx, %eax +; SSE41-NEXT: .LBB16_8: +; SSE41-NEXT: movzbl %al, %eax +; SSE41-NEXT: pinsrb $3, %eax, %xmm2 +; SSE41-NEXT: pextrb $4, %xmm1, %eax +; SSE41-NEXT: pextrb $4, %xmm0, %ecx +; SSE41-NEXT: addb %al, %cl +; SSE41-NEXT: movb $-1, %al +; SSE41-NEXT: jb .LBB16_10 +; SSE41-NEXT: # %bb.9: +; SSE41-NEXT: movl %ecx, %eax +; SSE41-NEXT: .LBB16_10: +; SSE41-NEXT: movzbl %al, %eax +; SSE41-NEXT: pinsrb $4, %eax, %xmm2 +; SSE41-NEXT: pextrb $5, %xmm1, %eax +; SSE41-NEXT: pextrb $5, %xmm0, %ecx +; SSE41-NEXT: addb %al, %cl +; SSE41-NEXT: movb $-1, %al +; SSE41-NEXT: jb .LBB16_12 +; SSE41-NEXT: # %bb.11: +; SSE41-NEXT: movl %ecx, %eax +; SSE41-NEXT: .LBB16_12: +; SSE41-NEXT: movzbl %al, %eax +; SSE41-NEXT: pinsrb $5, %eax, %xmm2 +; SSE41-NEXT: pextrb $6, %xmm1, %eax +; SSE41-NEXT: pextrb $6, %xmm0, %ecx +; SSE41-NEXT: addb %al, %cl +; SSE41-NEXT: movb $-1, %al +; SSE41-NEXT: jb .LBB16_14 +; SSE41-NEXT: # %bb.13: +; SSE41-NEXT: movl %ecx, %eax +; SSE41-NEXT: .LBB16_14: +; SSE41-NEXT: movzbl %al, %eax +; SSE41-NEXT: pinsrb $6, %eax, %xmm2 +; SSE41-NEXT: pextrb $7, %xmm1, %eax +; SSE41-NEXT: pextrb $7, %xmm0, %ecx +; SSE41-NEXT: addb %al, %cl +; SSE41-NEXT: movb $-1, %al +; SSE41-NEXT: jb .LBB16_16 +; SSE41-NEXT: # %bb.15: +; SSE41-NEXT: movl %ecx, %eax +; SSE41-NEXT: .LBB16_16: +; SSE41-NEXT: movzbl %al, %eax +; SSE41-NEXT: pinsrb $7, %eax, %xmm2 +; SSE41-NEXT: pextrb $8, %xmm1, %eax +; SSE41-NEXT: pextrb $8, %xmm0, %ecx +; SSE41-NEXT: addb %al, %cl +; SSE41-NEXT: movb $-1, %al +; SSE41-NEXT: jb .LBB16_18 +; SSE41-NEXT: # %bb.17: +; SSE41-NEXT: movl %ecx, %eax +; SSE41-NEXT: .LBB16_18: +; SSE41-NEXT: movzbl %al, %eax +; SSE41-NEXT: pinsrb $8, %eax, %xmm2 +; SSE41-NEXT: pextrb $9, %xmm1, %eax +; SSE41-NEXT: pextrb $9, %xmm0, %ecx +; SSE41-NEXT: addb %al, %cl +; SSE41-NEXT: movb $-1, %al +; SSE41-NEXT: jb .LBB16_20 +; SSE41-NEXT: # %bb.19: +; SSE41-NEXT: movl %ecx, %eax +; SSE41-NEXT: .LBB16_20: +; SSE41-NEXT: movzbl %al, %eax +; SSE41-NEXT: pinsrb $9, %eax, %xmm2 +; SSE41-NEXT: pextrb $10, %xmm1, %eax +; SSE41-NEXT: pextrb $10, %xmm0, %ecx +; SSE41-NEXT: addb %al, %cl +; SSE41-NEXT: movb $-1, %al +; SSE41-NEXT: jb .LBB16_22 +; SSE41-NEXT: # %bb.21: +; SSE41-NEXT: movl %ecx, %eax +; SSE41-NEXT: .LBB16_22: +; SSE41-NEXT: movzbl %al, %eax +; SSE41-NEXT: pinsrb $10, %eax, %xmm2 +; SSE41-NEXT: pextrb $11, %xmm1, %eax +; SSE41-NEXT: pextrb $11, %xmm0, %ecx +; SSE41-NEXT: addb %al, %cl +; SSE41-NEXT: movb $-1, %al +; SSE41-NEXT: jb .LBB16_24 +; SSE41-NEXT: # %bb.23: +; SSE41-NEXT: movl %ecx, %eax +; SSE41-NEXT: .LBB16_24: +; SSE41-NEXT: movzbl %al, %eax +; SSE41-NEXT: pinsrb $11, %eax, %xmm2 +; SSE41-NEXT: pextrb $12, %xmm1, %eax +; SSE41-NEXT: pextrb $12, %xmm0, %ecx +; SSE41-NEXT: addb %al, %cl +; SSE41-NEXT: movb $-1, %al +; 
SSE41-NEXT: jb .LBB16_26 +; SSE41-NEXT: # %bb.25: +; SSE41-NEXT: movl %ecx, %eax +; SSE41-NEXT: .LBB16_26: +; SSE41-NEXT: movzbl %al, %eax +; SSE41-NEXT: pinsrb $12, %eax, %xmm2 +; SSE41-NEXT: pextrb $13, %xmm1, %eax +; SSE41-NEXT: pextrb $13, %xmm0, %ecx +; SSE41-NEXT: addb %al, %cl +; SSE41-NEXT: movb $-1, %al +; SSE41-NEXT: jb .LBB16_28 +; SSE41-NEXT: # %bb.27: +; SSE41-NEXT: movl %ecx, %eax +; SSE41-NEXT: .LBB16_28: +; SSE41-NEXT: movzbl %al, %eax +; SSE41-NEXT: pinsrb $13, %eax, %xmm2 +; SSE41-NEXT: pextrb $14, %xmm1, %eax +; SSE41-NEXT: pextrb $14, %xmm0, %ecx +; SSE41-NEXT: addb %al, %cl +; SSE41-NEXT: movb $-1, %al +; SSE41-NEXT: jb .LBB16_30 +; SSE41-NEXT: # %bb.29: +; SSE41-NEXT: movl %ecx, %eax +; SSE41-NEXT: .LBB16_30: +; SSE41-NEXT: movzbl %al, %eax +; SSE41-NEXT: pinsrb $14, %eax, %xmm2 +; SSE41-NEXT: pextrb $15, %xmm1, %eax +; SSE41-NEXT: pextrb $15, %xmm0, %ecx +; SSE41-NEXT: addb %al, %cl +; SSE41-NEXT: jb .LBB16_32 +; SSE41-NEXT: # %bb.31: +; SSE41-NEXT: movl %ecx, %esi +; SSE41-NEXT: .LBB16_32: +; SSE41-NEXT: movzbl %sil, %eax +; SSE41-NEXT: pinsrb $15, %eax, %xmm2 +; SSE41-NEXT: psrlw $7, %xmm2 +; SSE41-NEXT: pand {{.*}}(%rip), %xmm2 +; SSE41-NEXT: movdqa %xmm2, %xmm0 +; SSE41-NEXT: retq +; +; AVX1-LABEL: v16i1: +; AVX1: # %bb.0: +; AVX1-NEXT: vpsllw $7, %xmm1, %xmm1 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128] +; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpextrb $1, %xmm1, %eax +; AVX1-NEXT: vpsllw $7, %xmm0, %xmm0 +; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpextrb $1, %xmm0, %ecx +; AVX1-NEXT: addb %al, %cl +; AVX1-NEXT: movb $-1, %sil +; AVX1-NEXT: movb $-1, %dl +; AVX1-NEXT: jb .LBB16_2 +; AVX1-NEXT: # %bb.1: +; AVX1-NEXT: movl %ecx, %edx +; AVX1-NEXT: .LBB16_2: +; AVX1-NEXT: movzbl %dl, %ecx +; AVX1-NEXT: vpextrb $0, %xmm1, %eax +; AVX1-NEXT: vpextrb $0, %xmm0, %edx +; AVX1-NEXT: addb %al, %dl +; AVX1-NEXT: movb $-1, %al +; AVX1-NEXT: jb .LBB16_4 +; AVX1-NEXT: # %bb.3: +; AVX1-NEXT: movl %edx, %eax +; AVX1-NEXT: .LBB16_4: +; AVX1-NEXT: movzbl %al, %eax +; AVX1-NEXT: vmovd %eax, %xmm2 +; AVX1-NEXT: vpinsrb $1, %ecx, %xmm2, %xmm2 +; AVX1-NEXT: vpextrb $2, %xmm1, %eax +; AVX1-NEXT: vpextrb $2, %xmm0, %ecx +; AVX1-NEXT: addb %al, %cl +; AVX1-NEXT: movb $-1, %al +; AVX1-NEXT: jb .LBB16_6 +; AVX1-NEXT: # %bb.5: +; AVX1-NEXT: movl %ecx, %eax +; AVX1-NEXT: .LBB16_6: +; AVX1-NEXT: movzbl %al, %eax +; AVX1-NEXT: vpinsrb $2, %eax, %xmm2, %xmm2 +; AVX1-NEXT: vpextrb $3, %xmm1, %eax +; AVX1-NEXT: vpextrb $3, %xmm0, %ecx +; AVX1-NEXT: addb %al, %cl +; AVX1-NEXT: movb $-1, %al +; AVX1-NEXT: jb .LBB16_8 +; AVX1-NEXT: # %bb.7: +; AVX1-NEXT: movl %ecx, %eax +; AVX1-NEXT: .LBB16_8: +; AVX1-NEXT: movzbl %al, %eax +; AVX1-NEXT: vpinsrb $3, %eax, %xmm2, %xmm2 +; AVX1-NEXT: vpextrb $4, %xmm1, %eax +; AVX1-NEXT: vpextrb $4, %xmm0, %ecx +; AVX1-NEXT: addb %al, %cl +; AVX1-NEXT: movb $-1, %al +; AVX1-NEXT: jb .LBB16_10 +; AVX1-NEXT: # %bb.9: +; AVX1-NEXT: movl %ecx, %eax +; AVX1-NEXT: .LBB16_10: +; AVX1-NEXT: movzbl %al, %eax +; AVX1-NEXT: vpinsrb $4, %eax, %xmm2, %xmm2 +; AVX1-NEXT: vpextrb $5, %xmm1, %eax +; AVX1-NEXT: vpextrb $5, %xmm0, %ecx +; AVX1-NEXT: addb %al, %cl +; AVX1-NEXT: movb $-1, %al +; AVX1-NEXT: jb .LBB16_12 +; AVX1-NEXT: # %bb.11: +; AVX1-NEXT: movl %ecx, %eax +; AVX1-NEXT: .LBB16_12: +; AVX1-NEXT: movzbl %al, %eax +; AVX1-NEXT: vpinsrb $5, %eax, %xmm2, %xmm2 +; AVX1-NEXT: vpextrb $6, %xmm1, %eax +; AVX1-NEXT: vpextrb $6, %xmm0, %ecx +; AVX1-NEXT: addb %al, %cl +; AVX1-NEXT: movb $-1, %al +; 
AVX1-NEXT: jb .LBB16_14 +; AVX1-NEXT: # %bb.13: +; AVX1-NEXT: movl %ecx, %eax +; AVX1-NEXT: .LBB16_14: +; AVX1-NEXT: movzbl %al, %eax +; AVX1-NEXT: vpinsrb $6, %eax, %xmm2, %xmm2 +; AVX1-NEXT: vpextrb $7, %xmm1, %eax +; AVX1-NEXT: vpextrb $7, %xmm0, %ecx +; AVX1-NEXT: addb %al, %cl +; AVX1-NEXT: movb $-1, %al +; AVX1-NEXT: jb .LBB16_16 +; AVX1-NEXT: # %bb.15: +; AVX1-NEXT: movl %ecx, %eax +; AVX1-NEXT: .LBB16_16: +; AVX1-NEXT: movzbl %al, %eax +; AVX1-NEXT: vpinsrb $7, %eax, %xmm2, %xmm2 +; AVX1-NEXT: vpextrb $8, %xmm1, %eax +; AVX1-NEXT: vpextrb $8, %xmm0, %ecx +; AVX1-NEXT: addb %al, %cl +; AVX1-NEXT: movb $-1, %al +; AVX1-NEXT: jb .LBB16_18 +; AVX1-NEXT: # %bb.17: +; AVX1-NEXT: movl %ecx, %eax +; AVX1-NEXT: .LBB16_18: +; AVX1-NEXT: movzbl %al, %eax +; AVX1-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2 +; AVX1-NEXT: vpextrb $9, %xmm1, %eax +; AVX1-NEXT: vpextrb $9, %xmm0, %ecx +; AVX1-NEXT: addb %al, %cl +; AVX1-NEXT: movb $-1, %al +; AVX1-NEXT: jb .LBB16_20 +; AVX1-NEXT: # %bb.19: +; AVX1-NEXT: movl %ecx, %eax +; AVX1-NEXT: .LBB16_20: +; AVX1-NEXT: movzbl %al, %eax +; AVX1-NEXT: vpinsrb $9, %eax, %xmm2, %xmm2 +; AVX1-NEXT: vpextrb $10, %xmm1, %eax +; AVX1-NEXT: vpextrb $10, %xmm0, %ecx +; AVX1-NEXT: addb %al, %cl +; AVX1-NEXT: movb $-1, %al +; AVX1-NEXT: jb .LBB16_22 +; AVX1-NEXT: # %bb.21: +; AVX1-NEXT: movl %ecx, %eax +; AVX1-NEXT: .LBB16_22: +; AVX1-NEXT: movzbl %al, %eax +; AVX1-NEXT: vpinsrb $10, %eax, %xmm2, %xmm2 +; AVX1-NEXT: vpextrb $11, %xmm1, %eax +; AVX1-NEXT: vpextrb $11, %xmm0, %ecx +; AVX1-NEXT: addb %al, %cl +; AVX1-NEXT: movb $-1, %al +; AVX1-NEXT: jb .LBB16_24 +; AVX1-NEXT: # %bb.23: +; AVX1-NEXT: movl %ecx, %eax +; AVX1-NEXT: .LBB16_24: +; AVX1-NEXT: movzbl %al, %eax +; AVX1-NEXT: vpinsrb $11, %eax, %xmm2, %xmm2 +; AVX1-NEXT: vpextrb $12, %xmm1, %eax +; AVX1-NEXT: vpextrb $12, %xmm0, %ecx +; AVX1-NEXT: addb %al, %cl +; AVX1-NEXT: movb $-1, %al +; AVX1-NEXT: jb .LBB16_26 +; AVX1-NEXT: # %bb.25: +; AVX1-NEXT: movl %ecx, %eax +; AVX1-NEXT: .LBB16_26: +; AVX1-NEXT: movzbl %al, %eax +; AVX1-NEXT: vpinsrb $12, %eax, %xmm2, %xmm2 +; AVX1-NEXT: vpextrb $13, %xmm1, %eax +; AVX1-NEXT: vpextrb $13, %xmm0, %ecx +; AVX1-NEXT: addb %al, %cl +; AVX1-NEXT: movb $-1, %al +; AVX1-NEXT: jb .LBB16_28 +; AVX1-NEXT: # %bb.27: +; AVX1-NEXT: movl %ecx, %eax +; AVX1-NEXT: .LBB16_28: +; AVX1-NEXT: movzbl %al, %eax +; AVX1-NEXT: vpinsrb $13, %eax, %xmm2, %xmm2 +; AVX1-NEXT: vpextrb $14, %xmm1, %eax +; AVX1-NEXT: vpextrb $14, %xmm0, %ecx +; AVX1-NEXT: addb %al, %cl +; AVX1-NEXT: movb $-1, %al +; AVX1-NEXT: jb .LBB16_30 +; AVX1-NEXT: # %bb.29: +; AVX1-NEXT: movl %ecx, %eax +; AVX1-NEXT: .LBB16_30: +; AVX1-NEXT: movzbl %al, %eax +; AVX1-NEXT: vpinsrb $14, %eax, %xmm2, %xmm2 +; AVX1-NEXT: vpextrb $15, %xmm1, %eax +; AVX1-NEXT: vpextrb $15, %xmm0, %ecx +; AVX1-NEXT: addb %al, %cl +; AVX1-NEXT: jb .LBB16_32 +; AVX1-NEXT: # %bb.31: +; AVX1-NEXT: movl %ecx, %esi +; AVX1-NEXT: .LBB16_32: +; AVX1-NEXT: movzbl %sil, %eax +; AVX1-NEXT: vpinsrb $15, %eax, %xmm2, %xmm0 +; AVX1-NEXT: vpsrlw $7, %xmm0, %xmm0 +; AVX1-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: v16i1: +; AVX2: # %bb.0: +; AVX2-NEXT: vpsllw $7, %xmm1, %xmm1 +; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128] +; AVX2-NEXT: vpand %xmm2, %xmm1, %xmm1 +; AVX2-NEXT: vpextrb $1, %xmm1, %eax +; AVX2-NEXT: vpsllw $7, %xmm0, %xmm0 +; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpextrb $1, %xmm0, %ecx +; AVX2-NEXT: addb %al, %cl +; AVX2-NEXT: movb $-1, %sil +; AVX2-NEXT: movb 
$-1, %dl +; AVX2-NEXT: jb .LBB16_2 +; AVX2-NEXT: # %bb.1: +; AVX2-NEXT: movl %ecx, %edx +; AVX2-NEXT: .LBB16_2: +; AVX2-NEXT: movzbl %dl, %ecx +; AVX2-NEXT: vpextrb $0, %xmm1, %eax +; AVX2-NEXT: vpextrb $0, %xmm0, %edx +; AVX2-NEXT: addb %al, %dl +; AVX2-NEXT: movb $-1, %al +; AVX2-NEXT: jb .LBB16_4 +; AVX2-NEXT: # %bb.3: +; AVX2-NEXT: movl %edx, %eax +; AVX2-NEXT: .LBB16_4: +; AVX2-NEXT: movzbl %al, %eax +; AVX2-NEXT: vmovd %eax, %xmm2 +; AVX2-NEXT: vpinsrb $1, %ecx, %xmm2, %xmm2 +; AVX2-NEXT: vpextrb $2, %xmm1, %eax +; AVX2-NEXT: vpextrb $2, %xmm0, %ecx +; AVX2-NEXT: addb %al, %cl +; AVX2-NEXT: movb $-1, %al +; AVX2-NEXT: jb .LBB16_6 +; AVX2-NEXT: # %bb.5: +; AVX2-NEXT: movl %ecx, %eax +; AVX2-NEXT: .LBB16_6: +; AVX2-NEXT: movzbl %al, %eax +; AVX2-NEXT: vpinsrb $2, %eax, %xmm2, %xmm2 +; AVX2-NEXT: vpextrb $3, %xmm1, %eax +; AVX2-NEXT: vpextrb $3, %xmm0, %ecx +; AVX2-NEXT: addb %al, %cl +; AVX2-NEXT: movb $-1, %al +; AVX2-NEXT: jb .LBB16_8 +; AVX2-NEXT: # %bb.7: +; AVX2-NEXT: movl %ecx, %eax +; AVX2-NEXT: .LBB16_8: +; AVX2-NEXT: movzbl %al, %eax +; AVX2-NEXT: vpinsrb $3, %eax, %xmm2, %xmm2 +; AVX2-NEXT: vpextrb $4, %xmm1, %eax +; AVX2-NEXT: vpextrb $4, %xmm0, %ecx +; AVX2-NEXT: addb %al, %cl +; AVX2-NEXT: movb $-1, %al +; AVX2-NEXT: jb .LBB16_10 +; AVX2-NEXT: # %bb.9: +; AVX2-NEXT: movl %ecx, %eax +; AVX2-NEXT: .LBB16_10: +; AVX2-NEXT: movzbl %al, %eax +; AVX2-NEXT: vpinsrb $4, %eax, %xmm2, %xmm2 +; AVX2-NEXT: vpextrb $5, %xmm1, %eax +; AVX2-NEXT: vpextrb $5, %xmm0, %ecx +; AVX2-NEXT: addb %al, %cl +; AVX2-NEXT: movb $-1, %al +; AVX2-NEXT: jb .LBB16_12 +; AVX2-NEXT: # %bb.11: +; AVX2-NEXT: movl %ecx, %eax +; AVX2-NEXT: .LBB16_12: +; AVX2-NEXT: movzbl %al, %eax +; AVX2-NEXT: vpinsrb $5, %eax, %xmm2, %xmm2 +; AVX2-NEXT: vpextrb $6, %xmm1, %eax +; AVX2-NEXT: vpextrb $6, %xmm0, %ecx +; AVX2-NEXT: addb %al, %cl +; AVX2-NEXT: movb $-1, %al +; AVX2-NEXT: jb .LBB16_14 +; AVX2-NEXT: # %bb.13: +; AVX2-NEXT: movl %ecx, %eax +; AVX2-NEXT: .LBB16_14: +; AVX2-NEXT: movzbl %al, %eax +; AVX2-NEXT: vpinsrb $6, %eax, %xmm2, %xmm2 +; AVX2-NEXT: vpextrb $7, %xmm1, %eax +; AVX2-NEXT: vpextrb $7, %xmm0, %ecx +; AVX2-NEXT: addb %al, %cl +; AVX2-NEXT: movb $-1, %al +; AVX2-NEXT: jb .LBB16_16 +; AVX2-NEXT: # %bb.15: +; AVX2-NEXT: movl %ecx, %eax +; AVX2-NEXT: .LBB16_16: +; AVX2-NEXT: movzbl %al, %eax +; AVX2-NEXT: vpinsrb $7, %eax, %xmm2, %xmm2 +; AVX2-NEXT: vpextrb $8, %xmm1, %eax +; AVX2-NEXT: vpextrb $8, %xmm0, %ecx +; AVX2-NEXT: addb %al, %cl +; AVX2-NEXT: movb $-1, %al +; AVX2-NEXT: jb .LBB16_18 +; AVX2-NEXT: # %bb.17: +; AVX2-NEXT: movl %ecx, %eax +; AVX2-NEXT: .LBB16_18: +; AVX2-NEXT: movzbl %al, %eax +; AVX2-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2 +; AVX2-NEXT: vpextrb $9, %xmm1, %eax +; AVX2-NEXT: vpextrb $9, %xmm0, %ecx +; AVX2-NEXT: addb %al, %cl +; AVX2-NEXT: movb $-1, %al +; AVX2-NEXT: jb .LBB16_20 +; AVX2-NEXT: # %bb.19: +; AVX2-NEXT: movl %ecx, %eax +; AVX2-NEXT: .LBB16_20: +; AVX2-NEXT: movzbl %al, %eax +; AVX2-NEXT: vpinsrb $9, %eax, %xmm2, %xmm2 +; AVX2-NEXT: vpextrb $10, %xmm1, %eax +; AVX2-NEXT: vpextrb $10, %xmm0, %ecx +; AVX2-NEXT: addb %al, %cl +; AVX2-NEXT: movb $-1, %al +; AVX2-NEXT: jb .LBB16_22 +; AVX2-NEXT: # %bb.21: +; AVX2-NEXT: movl %ecx, %eax +; AVX2-NEXT: .LBB16_22: +; AVX2-NEXT: movzbl %al, %eax +; AVX2-NEXT: vpinsrb $10, %eax, %xmm2, %xmm2 +; AVX2-NEXT: vpextrb $11, %xmm1, %eax +; AVX2-NEXT: vpextrb $11, %xmm0, %ecx +; AVX2-NEXT: addb %al, %cl +; AVX2-NEXT: movb $-1, %al +; AVX2-NEXT: jb .LBB16_24 +; AVX2-NEXT: # %bb.23: +; AVX2-NEXT: movl %ecx, %eax +; AVX2-NEXT: .LBB16_24: +; 
AVX2-NEXT: movzbl %al, %eax +; AVX2-NEXT: vpinsrb $11, %eax, %xmm2, %xmm2 +; AVX2-NEXT: vpextrb $12, %xmm1, %eax +; AVX2-NEXT: vpextrb $12, %xmm0, %ecx +; AVX2-NEXT: addb %al, %cl +; AVX2-NEXT: movb $-1, %al +; AVX2-NEXT: jb .LBB16_26 +; AVX2-NEXT: # %bb.25: +; AVX2-NEXT: movl %ecx, %eax +; AVX2-NEXT: .LBB16_26: +; AVX2-NEXT: movzbl %al, %eax +; AVX2-NEXT: vpinsrb $12, %eax, %xmm2, %xmm2 +; AVX2-NEXT: vpextrb $13, %xmm1, %eax +; AVX2-NEXT: vpextrb $13, %xmm0, %ecx +; AVX2-NEXT: addb %al, %cl +; AVX2-NEXT: movb $-1, %al +; AVX2-NEXT: jb .LBB16_28 +; AVX2-NEXT: # %bb.27: +; AVX2-NEXT: movl %ecx, %eax +; AVX2-NEXT: .LBB16_28: +; AVX2-NEXT: movzbl %al, %eax +; AVX2-NEXT: vpinsrb $13, %eax, %xmm2, %xmm2 +; AVX2-NEXT: vpextrb $14, %xmm1, %eax +; AVX2-NEXT: vpextrb $14, %xmm0, %ecx +; AVX2-NEXT: addb %al, %cl +; AVX2-NEXT: movb $-1, %al +; AVX2-NEXT: jb .LBB16_30 +; AVX2-NEXT: # %bb.29: +; AVX2-NEXT: movl %ecx, %eax +; AVX2-NEXT: .LBB16_30: +; AVX2-NEXT: movzbl %al, %eax +; AVX2-NEXT: vpinsrb $14, %eax, %xmm2, %xmm2 +; AVX2-NEXT: vpextrb $15, %xmm1, %eax +; AVX2-NEXT: vpextrb $15, %xmm0, %ecx +; AVX2-NEXT: addb %al, %cl +; AVX2-NEXT: jb .LBB16_32 +; AVX2-NEXT: # %bb.31: +; AVX2-NEXT: movl %ecx, %esi +; AVX2-NEXT: .LBB16_32: +; AVX2-NEXT: movzbl %sil, %eax +; AVX2-NEXT: vpinsrb $15, %eax, %xmm2, %xmm0 +; AVX2-NEXT: vpsrlw $7, %xmm0, %xmm0 +; AVX2-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 +; AVX2-NEXT: retq +; +; AVX512-LABEL: v16i1: +; AVX512: # %bb.0: +; AVX512-NEXT: pushq %rbp +; AVX512-NEXT: pushq %r15 +; AVX512-NEXT: pushq %r14 +; AVX512-NEXT: pushq %rbx +; AVX512-NEXT: vpsllw $7, %xmm0, %xmm0 +; AVX512-NEXT: vpmovb2m %xmm0, %k0 +; AVX512-NEXT: vpsllw $7, %xmm1, %xmm0 +; AVX512-NEXT: vpmovb2m %xmm0, %k1 +; AVX512-NEXT: kshiftrw $4, %k0, %k2 +; AVX512-NEXT: kshiftrw $4, %k1, %k3 +; AVX512-NEXT: kshiftrw $3, %k0, %k4 +; AVX512-NEXT: kmovd %k4, %r15d +; AVX512-NEXT: kshiftrw $3, %k1, %k4 +; AVX512-NEXT: kmovd %k4, %r9d +; AVX512-NEXT: kshiftrw $2, %k0, %k4 +; AVX512-NEXT: kmovd %k4, %eax +; AVX512-NEXT: kshiftrw $2, %k1, %k4 +; AVX512-NEXT: kmovd %k4, %ebp +; AVX512-NEXT: kmovd %k0, %ecx +; AVX512-NEXT: kmovd %k1, %esi +; AVX512-NEXT: kshiftrw $1, %k0, %k4 +; AVX512-NEXT: kmovd %k4, %edi +; AVX512-NEXT: kshiftrw $1, %k1, %k4 +; AVX512-NEXT: kmovd %k4, %edx +; AVX512-NEXT: shlb $7, %dl +; AVX512-NEXT: shlb $7, %dil +; AVX512-NEXT: addb %dl, %dil +; AVX512-NEXT: movb $-1, %r8b +; AVX512-NEXT: movb $-1, %bl +; AVX512-NEXT: jb .LBB16_2 +; AVX512-NEXT: # %bb.1: +; AVX512-NEXT: movl %edi, %ebx +; AVX512-NEXT: .LBB16_2: +; AVX512-NEXT: kshiftrw $5, %k0, %k4 +; AVX512-NEXT: kshiftrw $5, %k1, %k5 +; AVX512-NEXT: kmovd %k2, %edi +; AVX512-NEXT: kmovd %k3, %r11d +; AVX512-NEXT: shrb $7, %bl +; AVX512-NEXT: kmovd %ebx, %k6 +; AVX512-NEXT: shlb $7, %sil +; AVX512-NEXT: shlb $7, %cl +; AVX512-NEXT: addb %sil, %cl +; AVX512-NEXT: movb $-1, %dl +; AVX512-NEXT: jb .LBB16_4 +; AVX512-NEXT: # %bb.3: +; AVX512-NEXT: movl %ecx, %edx +; AVX512-NEXT: .LBB16_4: +; AVX512-NEXT: kshiftrw $6, %k0, %k2 +; AVX512-NEXT: kshiftrw $6, %k1, %k3 +; AVX512-NEXT: kmovd %k4, %esi +; AVX512-NEXT: kmovd %k5, %r14d +; AVX512-NEXT: shrb $7, %dl +; AVX512-NEXT: kmovd %edx, %k4 +; AVX512-NEXT: kshiftrw $1, %k4, %k5 +; AVX512-NEXT: kxorw %k6, %k5, %k5 +; AVX512-NEXT: kshiftlw $15, %k5, %k5 +; AVX512-NEXT: kshiftrw $14, %k5, %k5 +; AVX512-NEXT: kxorw %k5, %k4, %k6 +; AVX512-NEXT: kshiftrw $2, %k6, %k7 +; AVX512-NEXT: shlb $7, %bpl +; AVX512-NEXT: shlb $7, %al +; AVX512-NEXT: addb %bpl, %al +; AVX512-NEXT: movb $-1, %cl +; AVX512-NEXT: jb 
.LBB16_6 +; AVX512-NEXT: # %bb.5: +; AVX512-NEXT: movl %eax, %ecx +; AVX512-NEXT: .LBB16_6: +; AVX512-NEXT: kshiftrw $7, %k0, %k4 +; AVX512-NEXT: kshiftrw $7, %k1, %k5 +; AVX512-NEXT: kmovd %k2, %eax +; AVX512-NEXT: kmovd %k3, %r10d +; AVX512-NEXT: shrb $7, %cl +; AVX512-NEXT: kmovd %ecx, %k2 +; AVX512-NEXT: kxorw %k2, %k7, %k2 +; AVX512-NEXT: kshiftlw $15, %k2, %k2 +; AVX512-NEXT: kshiftrw $13, %k2, %k2 +; AVX512-NEXT: kxorw %k2, %k6, %k6 +; AVX512-NEXT: kshiftrw $3, %k6, %k7 +; AVX512-NEXT: shlb $7, %r9b +; AVX512-NEXT: shlb $7, %r15b +; AVX512-NEXT: addb %r9b, %r15b +; AVX512-NEXT: movb $-1, %dl +; AVX512-NEXT: jb .LBB16_8 +; AVX512-NEXT: # %bb.7: +; AVX512-NEXT: movl %r15d, %edx +; AVX512-NEXT: .LBB16_8: +; AVX512-NEXT: kshiftrw $8, %k0, %k2 +; AVX512-NEXT: kshiftrw $8, %k1, %k3 +; AVX512-NEXT: kmovd %k4, %ecx +; AVX512-NEXT: kmovd %k5, %r9d +; AVX512-NEXT: shrb $7, %dl +; AVX512-NEXT: kmovd %edx, %k4 +; AVX512-NEXT: kxorw %k4, %k7, %k4 +; AVX512-NEXT: kshiftlw $15, %k4, %k4 +; AVX512-NEXT: kshiftrw $12, %k4, %k4 +; AVX512-NEXT: kxorw %k4, %k6, %k6 +; AVX512-NEXT: kshiftrw $4, %k6, %k7 +; AVX512-NEXT: shlb $7, %r11b +; AVX512-NEXT: shlb $7, %dil +; AVX512-NEXT: addb %r11b, %dil +; AVX512-NEXT: movb $-1, %dl +; AVX512-NEXT: jb .LBB16_10 +; AVX512-NEXT: # %bb.9: +; AVX512-NEXT: movl %edi, %edx +; AVX512-NEXT: .LBB16_10: +; AVX512-NEXT: kshiftrw $9, %k0, %k4 +; AVX512-NEXT: kshiftrw $9, %k1, %k5 +; AVX512-NEXT: kmovd %k2, %edi +; AVX512-NEXT: kmovd %k3, %ebx +; AVX512-NEXT: shrb $7, %dl +; AVX512-NEXT: kmovd %edx, %k2 +; AVX512-NEXT: kxorw %k2, %k7, %k2 +; AVX512-NEXT: kshiftlw $15, %k2, %k2 +; AVX512-NEXT: kshiftrw $11, %k2, %k2 +; AVX512-NEXT: kxorw %k2, %k6, %k6 +; AVX512-NEXT: kshiftrw $5, %k6, %k7 +; AVX512-NEXT: shlb $7, %r14b +; AVX512-NEXT: shlb $7, %sil +; AVX512-NEXT: addb %r14b, %sil +; AVX512-NEXT: movb $-1, %dl +; AVX512-NEXT: jb .LBB16_12 +; AVX512-NEXT: # %bb.11: +; AVX512-NEXT: movl %esi, %edx +; AVX512-NEXT: .LBB16_12: +; AVX512-NEXT: kshiftrw $10, %k0, %k2 +; AVX512-NEXT: kshiftrw $10, %k1, %k3 +; AVX512-NEXT: kmovd %k4, %esi +; AVX512-NEXT: kmovd %k5, %r11d +; AVX512-NEXT: shrb $7, %dl +; AVX512-NEXT: kmovd %edx, %k4 +; AVX512-NEXT: kxorw %k4, %k7, %k4 +; AVX512-NEXT: kshiftlw $15, %k4, %k4 +; AVX512-NEXT: kshiftrw $10, %k4, %k4 +; AVX512-NEXT: kxorw %k4, %k6, %k6 +; AVX512-NEXT: kshiftrw $6, %k6, %k7 +; AVX512-NEXT: shlb $7, %r10b +; AVX512-NEXT: shlb $7, %al +; AVX512-NEXT: addb %r10b, %al +; AVX512-NEXT: movb $-1, %bpl +; AVX512-NEXT: jb .LBB16_14 +; AVX512-NEXT: # %bb.13: +; AVX512-NEXT: movl %eax, %ebp +; AVX512-NEXT: .LBB16_14: +; AVX512-NEXT: kshiftrw $11, %k0, %k4 +; AVX512-NEXT: kshiftrw $11, %k1, %k5 +; AVX512-NEXT: kmovd %k2, %r15d +; AVX512-NEXT: kmovd %k3, %r10d +; AVX512-NEXT: shrb $7, %bpl +; AVX512-NEXT: kmovd %ebp, %k2 +; AVX512-NEXT: kxorw %k2, %k7, %k2 +; AVX512-NEXT: kshiftlw $15, %k2, %k2 +; AVX512-NEXT: kshiftrw $9, %k2, %k2 +; AVX512-NEXT: kxorw %k2, %k6, %k6 +; AVX512-NEXT: kshiftrw $7, %k6, %k7 +; AVX512-NEXT: shlb $7, %r9b +; AVX512-NEXT: shlb $7, %cl +; AVX512-NEXT: addb %r9b, %cl +; AVX512-NEXT: movb $-1, %al +; AVX512-NEXT: jb .LBB16_16 +; AVX512-NEXT: # %bb.15: +; AVX512-NEXT: movl %ecx, %eax +; AVX512-NEXT: .LBB16_16: +; AVX512-NEXT: kshiftrw $12, %k0, %k2 +; AVX512-NEXT: kshiftrw $12, %k1, %k3 +; AVX512-NEXT: kmovd %k4, %ecx +; AVX512-NEXT: kmovd %k5, %r9d +; AVX512-NEXT: shrb $7, %al +; AVX512-NEXT: kmovd %eax, %k4 +; AVX512-NEXT: kxorw %k4, %k7, %k4 +; AVX512-NEXT: kshiftlw $15, %k4, %k4 +; AVX512-NEXT: kshiftrw $8, %k4, %k4 +; 
AVX512-NEXT: kxorw %k4, %k6, %k6 +; AVX512-NEXT: kshiftrw $8, %k6, %k7 +; AVX512-NEXT: shlb $7, %bl +; AVX512-NEXT: shlb $7, %dil +; AVX512-NEXT: addb %bl, %dil +; AVX512-NEXT: movb $-1, %bl +; AVX512-NEXT: jb .LBB16_18 +; AVX512-NEXT: # %bb.17: +; AVX512-NEXT: movl %edi, %ebx +; AVX512-NEXT: .LBB16_18: +; AVX512-NEXT: kshiftrw $13, %k0, %k4 +; AVX512-NEXT: kshiftrw $13, %k1, %k5 +; AVX512-NEXT: kmovd %k2, %eax +; AVX512-NEXT: kmovd %k3, %r14d +; AVX512-NEXT: shrb $7, %bl +; AVX512-NEXT: kmovd %ebx, %k2 +; AVX512-NEXT: kxorw %k2, %k7, %k2 +; AVX512-NEXT: kshiftlw $15, %k2, %k2 +; AVX512-NEXT: kshiftrw $7, %k2, %k2 +; AVX512-NEXT: kxorw %k2, %k6, %k6 +; AVX512-NEXT: kshiftrw $9, %k6, %k7 +; AVX512-NEXT: shlb $7, %r11b +; AVX512-NEXT: shlb $7, %sil +; AVX512-NEXT: addb %r11b, %sil +; AVX512-NEXT: movb $-1, %dil +; AVX512-NEXT: jb .LBB16_20 +; AVX512-NEXT: # %bb.19: +; AVX512-NEXT: movl %esi, %edi +; AVX512-NEXT: .LBB16_20: +; AVX512-NEXT: kshiftrw $14, %k0, %k2 +; AVX512-NEXT: kshiftrw $14, %k1, %k3 +; AVX512-NEXT: kmovd %k4, %esi +; AVX512-NEXT: kmovd %k5, %r11d +; AVX512-NEXT: shrb $7, %dil +; AVX512-NEXT: kmovd %edi, %k4 +; AVX512-NEXT: kxorw %k4, %k7, %k4 +; AVX512-NEXT: kshiftlw $15, %k4, %k4 +; AVX512-NEXT: kshiftrw $6, %k4, %k4 +; AVX512-NEXT: kxorw %k4, %k6, %k4 +; AVX512-NEXT: kshiftrw $10, %k4, %k5 +; AVX512-NEXT: shlb $7, %r10b +; AVX512-NEXT: shlb $7, %r15b +; AVX512-NEXT: addb %r10b, %r15b +; AVX512-NEXT: movb $-1, %dl +; AVX512-NEXT: jb .LBB16_22 +; AVX512-NEXT: # %bb.21: +; AVX512-NEXT: movl %r15d, %edx +; AVX512-NEXT: .LBB16_22: +; AVX512-NEXT: kshiftrw $15, %k0, %k0 +; AVX512-NEXT: kshiftrw $15, %k1, %k1 +; AVX512-NEXT: kmovd %k2, %ebx +; AVX512-NEXT: kmovd %k3, %edi +; AVX512-NEXT: shrb $7, %dl +; AVX512-NEXT: kmovd %edx, %k2 +; AVX512-NEXT: kxorw %k2, %k5, %k2 +; AVX512-NEXT: kshiftlw $15, %k2, %k2 +; AVX512-NEXT: kshiftrw $5, %k2, %k2 +; AVX512-NEXT: kxorw %k2, %k4, %k2 +; AVX512-NEXT: kshiftrw $11, %k2, %k3 +; AVX512-NEXT: shlb $7, %r9b +; AVX512-NEXT: shlb $7, %cl +; AVX512-NEXT: addb %r9b, %cl +; AVX512-NEXT: movb $-1, %dl +; AVX512-NEXT: jb .LBB16_24 +; AVX512-NEXT: # %bb.23: +; AVX512-NEXT: movl %ecx, %edx +; AVX512-NEXT: .LBB16_24: +; AVX512-NEXT: kmovd %k0, %ecx +; AVX512-NEXT: kmovd %k1, %ebp +; AVX512-NEXT: shrb $7, %dl +; AVX512-NEXT: kmovd %edx, %k0 +; AVX512-NEXT: kxorw %k0, %k3, %k0 +; AVX512-NEXT: kshiftlw $15, %k0, %k0 +; AVX512-NEXT: kshiftrw $4, %k0, %k0 +; AVX512-NEXT: kxorw %k0, %k2, %k0 +; AVX512-NEXT: kshiftrw $12, %k0, %k1 +; AVX512-NEXT: shlb $7, %r14b +; AVX512-NEXT: shlb $7, %al +; AVX512-NEXT: addb %r14b, %al +; AVX512-NEXT: movb $-1, %dl +; AVX512-NEXT: jb .LBB16_26 +; AVX512-NEXT: # %bb.25: +; AVX512-NEXT: movl %eax, %edx +; AVX512-NEXT: .LBB16_26: +; AVX512-NEXT: shrb $7, %dl +; AVX512-NEXT: kmovd %edx, %k2 +; AVX512-NEXT: kxorw %k2, %k1, %k1 +; AVX512-NEXT: kshiftlw $15, %k1, %k1 +; AVX512-NEXT: kshiftrw $3, %k1, %k1 +; AVX512-NEXT: kxorw %k1, %k0, %k0 +; AVX512-NEXT: kshiftrw $13, %k0, %k1 +; AVX512-NEXT: shlb $7, %r11b +; AVX512-NEXT: shlb $7, %sil +; AVX512-NEXT: addb %r11b, %sil +; AVX512-NEXT: movb $-1, %al +; AVX512-NEXT: jb .LBB16_28 +; AVX512-NEXT: # %bb.27: +; AVX512-NEXT: movl %esi, %eax +; AVX512-NEXT: .LBB16_28: +; AVX512-NEXT: shrb $7, %al +; AVX512-NEXT: kmovd %eax, %k2 +; AVX512-NEXT: kxorw %k2, %k1, %k1 +; AVX512-NEXT: kshiftlw $15, %k1, %k1 +; AVX512-NEXT: kshiftrw $2, %k1, %k1 +; AVX512-NEXT: kxorw %k1, %k0, %k0 +; AVX512-NEXT: kshiftrw $14, %k0, %k1 +; AVX512-NEXT: shlb $7, %dil +; AVX512-NEXT: shlb $7, %bl +; AVX512-NEXT: 
addb %dil, %bl +; AVX512-NEXT: movb $-1, %al +; AVX512-NEXT: jb .LBB16_30 +; AVX512-NEXT: # %bb.29: +; AVX512-NEXT: movl %ebx, %eax +; AVX512-NEXT: .LBB16_30: +; AVX512-NEXT: shrb $7, %al +; AVX512-NEXT: kmovd %eax, %k2 +; AVX512-NEXT: kxorw %k2, %k1, %k1 +; AVX512-NEXT: kshiftlw $15, %k1, %k1 +; AVX512-NEXT: kshiftrw $1, %k1, %k1 +; AVX512-NEXT: kxorw %k1, %k0, %k0 +; AVX512-NEXT: kshiftlw $1, %k0, %k0 +; AVX512-NEXT: kshiftrw $1, %k0, %k0 +; AVX512-NEXT: shlb $7, %bpl +; AVX512-NEXT: shlb $7, %cl +; AVX512-NEXT: addb %bpl, %cl +; AVX512-NEXT: jb .LBB16_32 +; AVX512-NEXT: # %bb.31: +; AVX512-NEXT: movl %ecx, %r8d +; AVX512-NEXT: .LBB16_32: +; AVX512-NEXT: shrb $7, %r8b +; AVX512-NEXT: kmovd %r8d, %k1 +; AVX512-NEXT: kshiftlw $15, %k1, %k1 +; AVX512-NEXT: korw %k1, %k0, %k0 +; AVX512-NEXT: vpmovm2b %k0, %xmm0 +; AVX512-NEXT: popq %rbx +; AVX512-NEXT: popq %r14 +; AVX512-NEXT: popq %r15 +; AVX512-NEXT: popq %rbp +; AVX512-NEXT: retq + %z = call <16 x i1> @llvm.uadd.sat.v16i1(<16 x i1> %x, <16 x i1> %y) + ret <16 x i1> %z +} + +; Expanded + +define <4 x i32> @v4i32(<4 x i32> %x, <4 x i32> %y) nounwind { +; SSE2-LABEL: v4i32: +; SSE2: # %bb.0: +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[3,1,2,3] +; SSE2-NEXT: movd %xmm2, %eax +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[3,1,2,3] +; SSE2-NEXT: movd %xmm2, %ecx +; SSE2-NEXT: addl %eax, %ecx +; SSE2-NEXT: movl $-1, %eax +; SSE2-NEXT: cmovbl %eax, %ecx +; SSE2-NEXT: movd %ecx, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,3,0,1] +; SSE2-NEXT: movd %xmm3, %ecx +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,0,1] +; SSE2-NEXT: movd %xmm3, %edx +; SSE2-NEXT: addl %ecx, %edx +; SSE2-NEXT: cmovbl %eax, %edx +; SSE2-NEXT: movd %edx, %xmm3 +; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; SSE2-NEXT: movd %xmm1, %ecx +; SSE2-NEXT: movd %xmm0, %edx +; SSE2-NEXT: addl %ecx, %edx +; SSE2-NEXT: cmovbl %eax, %edx +; SSE2-NEXT: movd %edx, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,2,3] +; SSE2-NEXT: movd %xmm1, %ecx +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] +; SSE2-NEXT: movd %xmm0, %edx +; SSE2-NEXT: addl %ecx, %edx +; SSE2-NEXT: cmovbl %eax, %edx +; SSE2-NEXT: movd %edx, %xmm0 +; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0] +; SSE2-NEXT: movdqa %xmm2, %xmm0 +; SSE2-NEXT: retq +; +; SSSE3-LABEL: v4i32: +; SSSE3: # %bb.0: +; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm1[3,1,2,3] +; SSSE3-NEXT: movd %xmm2, %eax +; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm0[3,1,2,3] +; SSSE3-NEXT: movd %xmm2, %ecx +; SSSE3-NEXT: addl %eax, %ecx +; SSSE3-NEXT: movl $-1, %eax +; SSSE3-NEXT: cmovbl %eax, %ecx +; SSSE3-NEXT: movd %ecx, %xmm2 +; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,3,0,1] +; SSSE3-NEXT: movd %xmm3, %ecx +; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,0,1] +; SSSE3-NEXT: movd %xmm3, %edx +; SSSE3-NEXT: addl %ecx, %edx +; SSSE3-NEXT: cmovbl %eax, %edx +; SSSE3-NEXT: movd %edx, %xmm3 +; SSSE3-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; SSSE3-NEXT: movd %xmm1, %ecx +; SSSE3-NEXT: movd %xmm0, %edx +; SSSE3-NEXT: addl %ecx, %edx +; SSSE3-NEXT: cmovbl %eax, %edx +; SSSE3-NEXT: movd %edx, %xmm2 +; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,2,3] +; SSSE3-NEXT: movd %xmm1, %ecx +; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] +; SSSE3-NEXT: movd %xmm0, %edx +; SSSE3-NEXT: addl %ecx, %edx +; SSSE3-NEXT: cmovbl %eax, %edx +; SSSE3-NEXT: movd %edx, %xmm0 +; SSSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] +; 
SSSE3-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0] +; SSSE3-NEXT: movdqa %xmm2, %xmm0 +; SSSE3-NEXT: retq +; +; SSE41-LABEL: v4i32: +; SSE41: # %bb.0: +; SSE41-NEXT: pextrd $1, %xmm1, %eax +; SSE41-NEXT: pextrd $1, %xmm0, %ecx +; SSE41-NEXT: addl %eax, %ecx +; SSE41-NEXT: movl $-1, %eax +; SSE41-NEXT: cmovbl %eax, %ecx +; SSE41-NEXT: movd %xmm1, %edx +; SSE41-NEXT: movd %xmm0, %esi +; SSE41-NEXT: addl %edx, %esi +; SSE41-NEXT: cmovbl %eax, %esi +; SSE41-NEXT: movd %esi, %xmm2 +; SSE41-NEXT: pinsrd $1, %ecx, %xmm2 +; SSE41-NEXT: pextrd $2, %xmm1, %ecx +; SSE41-NEXT: pextrd $2, %xmm0, %edx +; SSE41-NEXT: addl %ecx, %edx +; SSE41-NEXT: cmovbl %eax, %edx +; SSE41-NEXT: pinsrd $2, %edx, %xmm2 +; SSE41-NEXT: pextrd $3, %xmm1, %ecx +; SSE41-NEXT: pextrd $3, %xmm0, %edx +; SSE41-NEXT: addl %ecx, %edx +; SSE41-NEXT: cmovbl %eax, %edx +; SSE41-NEXT: pinsrd $3, %edx, %xmm2 +; SSE41-NEXT: movdqa %xmm2, %xmm0 +; SSE41-NEXT: retq +; +; AVX-LABEL: v4i32: +; AVX: # %bb.0: +; AVX-NEXT: vpextrd $1, %xmm1, %eax +; AVX-NEXT: vpextrd $1, %xmm0, %ecx +; AVX-NEXT: addl %eax, %ecx +; AVX-NEXT: movl $-1, %eax +; AVX-NEXT: cmovbl %eax, %ecx +; AVX-NEXT: vmovd %xmm1, %edx +; AVX-NEXT: vmovd %xmm0, %esi +; AVX-NEXT: addl %edx, %esi +; AVX-NEXT: cmovbl %eax, %esi +; AVX-NEXT: vmovd %esi, %xmm2 +; AVX-NEXT: vpinsrd $1, %ecx, %xmm2, %xmm2 +; AVX-NEXT: vpextrd $2, %xmm1, %ecx +; AVX-NEXT: vpextrd $2, %xmm0, %edx +; AVX-NEXT: addl %ecx, %edx +; AVX-NEXT: cmovbl %eax, %edx +; AVX-NEXT: vpinsrd $2, %edx, %xmm2, %xmm2 +; AVX-NEXT: vpextrd $3, %xmm1, %ecx +; AVX-NEXT: vpextrd $3, %xmm0, %edx +; AVX-NEXT: addl %ecx, %edx +; AVX-NEXT: cmovbl %eax, %edx +; AVX-NEXT: vpinsrd $3, %edx, %xmm2, %xmm0 +; AVX-NEXT: retq + %z = call <4 x i32> @llvm.uadd.sat.v4i32(<4 x i32> %x, <4 x i32> %y) + ret <4 x i32> %z +} + +define <2 x i32> @v2i32(<2 x i32> %x, <2 x i32> %y) nounwind { +; SSE2-LABEL: v2i32: +; SSE2: # %bb.0: +; SSE2-NEXT: psllq $32, %xmm1 +; SSE2-NEXT: movq %xmm1, %rax +; SSE2-NEXT: psllq $32, %xmm0 +; SSE2-NEXT: movq %xmm0, %rcx +; SSE2-NEXT: addq %rax, %rcx +; SSE2-NEXT: movq $-1, %rax +; SSE2-NEXT: cmovbq %rax, %rcx +; SSE2-NEXT: movq %rcx, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] +; SSE2-NEXT: movq %xmm1, %rcx +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; SSE2-NEXT: movq %xmm0, %rdx +; SSE2-NEXT: addq %rcx, %rdx +; SSE2-NEXT: cmovbq %rax, %rdx +; SSE2-NEXT: movq %rdx, %xmm0 +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm0[0] +; SSE2-NEXT: psrlq $32, %xmm2 +; SSE2-NEXT: movdqa %xmm2, %xmm0 +; SSE2-NEXT: retq +; +; SSSE3-LABEL: v2i32: +; SSSE3: # %bb.0: +; SSSE3-NEXT: psllq $32, %xmm1 +; SSSE3-NEXT: movq %xmm1, %rax +; SSSE3-NEXT: psllq $32, %xmm0 +; SSSE3-NEXT: movq %xmm0, %rcx +; SSSE3-NEXT: addq %rax, %rcx +; SSSE3-NEXT: movq $-1, %rax +; SSSE3-NEXT: cmovbq %rax, %rcx +; SSSE3-NEXT: movq %rcx, %xmm2 +; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] +; SSSE3-NEXT: movq %xmm1, %rcx +; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; SSSE3-NEXT: movq %xmm0, %rdx +; SSSE3-NEXT: addq %rcx, %rdx +; SSSE3-NEXT: cmovbq %rax, %rdx +; SSSE3-NEXT: movq %rdx, %xmm0 +; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm0[0] +; SSSE3-NEXT: psrlq $32, %xmm2 +; SSSE3-NEXT: movdqa %xmm2, %xmm0 +; SSSE3-NEXT: retq +; +; SSE41-LABEL: v2i32: +; SSE41: # %bb.0: +; SSE41-NEXT: psllq $32, %xmm1 +; SSE41-NEXT: pextrq $1, %xmm1, %rax +; SSE41-NEXT: psllq $32, %xmm0 +; SSE41-NEXT: pextrq $1, %xmm0, %rcx +; SSE41-NEXT: addq %rax, %rcx +; SSE41-NEXT: movq $-1, %rax +; SSE41-NEXT: cmovbq %rax, %rcx +; SSE41-NEXT: 
movq %rcx, %xmm2 +; SSE41-NEXT: movq %xmm1, %rcx +; SSE41-NEXT: movq %xmm0, %rdx +; SSE41-NEXT: addq %rcx, %rdx +; SSE41-NEXT: cmovbq %rax, %rdx +; SSE41-NEXT: movq %rdx, %xmm0 +; SSE41-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; SSE41-NEXT: psrlq $32, %xmm0 +; SSE41-NEXT: retq +; +; AVX-LABEL: v2i32: +; AVX: # %bb.0: +; AVX-NEXT: vpsllq $32, %xmm1, %xmm1 +; AVX-NEXT: vpextrq $1, %xmm1, %rax +; AVX-NEXT: vpsllq $32, %xmm0, %xmm0 +; AVX-NEXT: vpextrq $1, %xmm0, %rcx +; AVX-NEXT: addq %rax, %rcx +; AVX-NEXT: movq $-1, %rax +; AVX-NEXT: cmovbq %rax, %rcx +; AVX-NEXT: vmovq %rcx, %xmm2 +; AVX-NEXT: vmovq %xmm1, %rcx +; AVX-NEXT: vmovq %xmm0, %rdx +; AVX-NEXT: addq %rcx, %rdx +; AVX-NEXT: cmovbq %rax, %rdx +; AVX-NEXT: vmovq %rdx, %xmm0 +; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; AVX-NEXT: vpsrlq $32, %xmm0, %xmm0 +; AVX-NEXT: retq + %z = call <2 x i32> @llvm.uadd.sat.v2i32(<2 x i32> %x, <2 x i32> %y) + ret <2 x i32> %z +} + +define <4 x i24> @v4i24(<4 x i24> %x, <4 x i24> %y) nounwind { +; SSE2-LABEL: v4i24: +; SSE2: # %bb.0: +; SSE2-NEXT: pslld $8, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[3,1,2,3] +; SSE2-NEXT: movd %xmm2, %eax +; SSE2-NEXT: pslld $8, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[3,1,2,3] +; SSE2-NEXT: movd %xmm2, %ecx +; SSE2-NEXT: addl %eax, %ecx +; SSE2-NEXT: movl $-1, %eax +; SSE2-NEXT: cmovbl %eax, %ecx +; SSE2-NEXT: movd %ecx, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,3,0,1] +; SSE2-NEXT: movd %xmm3, %ecx +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,0,1] +; SSE2-NEXT: movd %xmm3, %edx +; SSE2-NEXT: addl %ecx, %edx +; SSE2-NEXT: cmovbl %eax, %edx +; SSE2-NEXT: movd %edx, %xmm3 +; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; SSE2-NEXT: movd %xmm1, %ecx +; SSE2-NEXT: movd %xmm0, %edx +; SSE2-NEXT: addl %ecx, %edx +; SSE2-NEXT: cmovbl %eax, %edx +; SSE2-NEXT: movd %edx, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,2,3] +; SSE2-NEXT: movd %xmm1, %ecx +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] +; SSE2-NEXT: movd %xmm0, %edx +; SSE2-NEXT: addl %ecx, %edx +; SSE2-NEXT: cmovbl %eax, %edx +; SSE2-NEXT: movd %edx, %xmm0 +; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0] +; SSE2-NEXT: psrld $8, %xmm2 +; SSE2-NEXT: movdqa %xmm2, %xmm0 +; SSE2-NEXT: retq +; +; SSSE3-LABEL: v4i24: +; SSSE3: # %bb.0: +; SSSE3-NEXT: pslld $8, %xmm1 +; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm1[3,1,2,3] +; SSSE3-NEXT: movd %xmm2, %eax +; SSSE3-NEXT: pslld $8, %xmm0 +; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm0[3,1,2,3] +; SSSE3-NEXT: movd %xmm2, %ecx +; SSSE3-NEXT: addl %eax, %ecx +; SSSE3-NEXT: movl $-1, %eax +; SSSE3-NEXT: cmovbl %eax, %ecx +; SSSE3-NEXT: movd %ecx, %xmm2 +; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,3,0,1] +; SSSE3-NEXT: movd %xmm3, %ecx +; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,0,1] +; SSSE3-NEXT: movd %xmm3, %edx +; SSSE3-NEXT: addl %ecx, %edx +; SSSE3-NEXT: cmovbl %eax, %edx +; SSSE3-NEXT: movd %edx, %xmm3 +; SSSE3-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; SSSE3-NEXT: movd %xmm1, %ecx +; SSSE3-NEXT: movd %xmm0, %edx +; SSSE3-NEXT: addl %ecx, %edx +; SSSE3-NEXT: cmovbl %eax, %edx +; SSSE3-NEXT: movd %edx, %xmm2 +; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,2,3] +; SSSE3-NEXT: movd %xmm1, %ecx +; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] +; SSSE3-NEXT: movd %xmm0, %edx +; SSSE3-NEXT: addl %ecx, %edx +; SSSE3-NEXT: cmovbl %eax, %edx +; SSSE3-NEXT: movd %edx, %xmm0 +; SSSE3-NEXT: punpckldq 
{{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] +; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0] +; SSSE3-NEXT: psrld $8, %xmm2 +; SSSE3-NEXT: movdqa %xmm2, %xmm0 +; SSSE3-NEXT: retq +; +; SSE41-LABEL: v4i24: +; SSE41: # %bb.0: +; SSE41-NEXT: pslld $8, %xmm1 +; SSE41-NEXT: pextrd $1, %xmm1, %eax +; SSE41-NEXT: pslld $8, %xmm0 +; SSE41-NEXT: pextrd $1, %xmm0, %ecx +; SSE41-NEXT: addl %eax, %ecx +; SSE41-NEXT: movl $-1, %eax +; SSE41-NEXT: cmovbl %eax, %ecx +; SSE41-NEXT: movd %xmm1, %edx +; SSE41-NEXT: movd %xmm0, %esi +; SSE41-NEXT: addl %edx, %esi +; SSE41-NEXT: cmovbl %eax, %esi +; SSE41-NEXT: movd %esi, %xmm2 +; SSE41-NEXT: pinsrd $1, %ecx, %xmm2 +; SSE41-NEXT: pextrd $2, %xmm1, %ecx +; SSE41-NEXT: pextrd $2, %xmm0, %edx +; SSE41-NEXT: addl %ecx, %edx +; SSE41-NEXT: cmovbl %eax, %edx +; SSE41-NEXT: pinsrd $2, %edx, %xmm2 +; SSE41-NEXT: pextrd $3, %xmm1, %ecx +; SSE41-NEXT: pextrd $3, %xmm0, %edx +; SSE41-NEXT: addl %ecx, %edx +; SSE41-NEXT: cmovbl %eax, %edx +; SSE41-NEXT: pinsrd $3, %edx, %xmm2 +; SSE41-NEXT: psrld $8, %xmm2 +; SSE41-NEXT: movdqa %xmm2, %xmm0 +; SSE41-NEXT: retq +; +; AVX-LABEL: v4i24: +; AVX: # %bb.0: +; AVX-NEXT: vpslld $8, %xmm1, %xmm1 +; AVX-NEXT: vpextrd $1, %xmm1, %eax +; AVX-NEXT: vpslld $8, %xmm0, %xmm0 +; AVX-NEXT: vpextrd $1, %xmm0, %ecx +; AVX-NEXT: addl %eax, %ecx +; AVX-NEXT: movl $-1, %eax +; AVX-NEXT: cmovbl %eax, %ecx +; AVX-NEXT: vmovd %xmm1, %edx +; AVX-NEXT: vmovd %xmm0, %esi +; AVX-NEXT: addl %edx, %esi +; AVX-NEXT: cmovbl %eax, %esi +; AVX-NEXT: vmovd %esi, %xmm2 +; AVX-NEXT: vpinsrd $1, %ecx, %xmm2, %xmm2 +; AVX-NEXT: vpextrd $2, %xmm1, %ecx +; AVX-NEXT: vpextrd $2, %xmm0, %edx +; AVX-NEXT: addl %ecx, %edx +; AVX-NEXT: cmovbl %eax, %edx +; AVX-NEXT: vpinsrd $2, %edx, %xmm2, %xmm2 +; AVX-NEXT: vpextrd $3, %xmm1, %ecx +; AVX-NEXT: vpextrd $3, %xmm0, %edx +; AVX-NEXT: addl %ecx, %edx +; AVX-NEXT: cmovbl %eax, %edx +; AVX-NEXT: vpinsrd $3, %edx, %xmm2, %xmm0 +; AVX-NEXT: vpsrld $8, %xmm0, %xmm0 +; AVX-NEXT: retq + %z = call <4 x i24> @llvm.uadd.sat.v4i24(<4 x i24> %x, <4 x i24> %y) + ret <4 x i24> %z +} + +define <2 x i128> @v2i128(<2 x i128> %x, <2 x i128> %y) nounwind { +; SSE-LABEL: v2i128: +; SSE: # %bb.0: +; SSE-NEXT: movq %rdi, %rax +; SSE-NEXT: addq %r9, %rsi +; SSE-NEXT: adcq {{[0-9]+}}(%rsp), %rdx +; SSE-NEXT: movq $-1, %rdi +; SSE-NEXT: cmovbq %rdi, %rsi +; SSE-NEXT: cmovbq %rdi, %rdx +; SSE-NEXT: addq {{[0-9]+}}(%rsp), %rcx +; SSE-NEXT: adcq {{[0-9]+}}(%rsp), %r8 +; SSE-NEXT: cmovbq %rdi, %r8 +; SSE-NEXT: cmovbq %rdi, %rcx +; SSE-NEXT: movq %r8, 24(%rax) +; SSE-NEXT: movq %rcx, 16(%rax) +; SSE-NEXT: movq %rdx, 8(%rax) +; SSE-NEXT: movq %rsi, (%rax) +; SSE-NEXT: retq +; +; AVX-LABEL: v2i128: +; AVX: # %bb.0: +; AVX-NEXT: movq %rdi, %rax +; AVX-NEXT: addq %r9, %rsi +; AVX-NEXT: adcq {{[0-9]+}}(%rsp), %rdx +; AVX-NEXT: movq $-1, %rdi +; AVX-NEXT: cmovbq %rdi, %rsi +; AVX-NEXT: cmovbq %rdi, %rdx +; AVX-NEXT: addq {{[0-9]+}}(%rsp), %rcx +; AVX-NEXT: adcq {{[0-9]+}}(%rsp), %r8 +; AVX-NEXT: cmovbq %rdi, %r8 +; AVX-NEXT: cmovbq %rdi, %rcx +; AVX-NEXT: movq %r8, 24(%rax) +; AVX-NEXT: movq %rcx, 16(%rax) +; AVX-NEXT: movq %rdx, 8(%rax) +; AVX-NEXT: movq %rsi, (%rax) +; AVX-NEXT: retq + %z = call <2 x i128> @llvm.uadd.sat.v2i128(<2 x i128> %x, <2 x i128> %y) + ret <2 x i128> %z +} diff --git a/test/CodeGen/X86/usub_sat_vec.ll b/test/CodeGen/X86/usub_sat_vec.ll new file mode 100644 index 00000000000..d01e2b29f57 --- /dev/null +++ b/test/CodeGen/X86/usub_sat_vec.ll @@ -0,0 +1,13695 @@ +; NOTE: Assertions have been autogenerated by 
utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=SSE,SSE2 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+ssse3 | FileCheck %s --check-prefixes=SSE,SSSE3 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=SSE,SSE41 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX,AVX512 + +declare <1 x i8> @llvm.usub.sat.v1i8(<1 x i8>, <1 x i8>) +declare <2 x i8> @llvm.usub.sat.v2i8(<2 x i8>, <2 x i8>) +declare <4 x i8> @llvm.usub.sat.v4i8(<4 x i8>, <4 x i8>) +declare <8 x i8> @llvm.usub.sat.v8i8(<8 x i8>, <8 x i8>) +declare <12 x i8> @llvm.usub.sat.v12i8(<12 x i8>, <12 x i8>) +declare <16 x i8> @llvm.usub.sat.v16i8(<16 x i8>, <16 x i8>) +declare <32 x i8> @llvm.usub.sat.v32i8(<32 x i8>, <32 x i8>) +declare <64 x i8> @llvm.usub.sat.v64i8(<64 x i8>, <64 x i8>) + +declare <1 x i16> @llvm.usub.sat.v1i16(<1 x i16>, <1 x i16>) +declare <2 x i16> @llvm.usub.sat.v2i16(<2 x i16>, <2 x i16>) +declare <4 x i16> @llvm.usub.sat.v4i16(<4 x i16>, <4 x i16>) +declare <8 x i16> @llvm.usub.sat.v8i16(<8 x i16>, <8 x i16>) +declare <12 x i16> @llvm.usub.sat.v12i16(<12 x i16>, <12 x i16>) +declare <16 x i16> @llvm.usub.sat.v16i16(<16 x i16>, <16 x i16>) +declare <32 x i16> @llvm.usub.sat.v32i16(<32 x i16>, <32 x i16>) + +declare <16 x i1> @llvm.usub.sat.v16i1(<16 x i1>, <16 x i1>) +declare <16 x i4> @llvm.usub.sat.v16i4(<16 x i4>, <16 x i4>) + +declare <4 x i32> @llvm.usub.sat.v4i32(<4 x i32>, <4 x i32>) +declare <2 x i32> @llvm.usub.sat.v2i32(<2 x i32>, <2 x i32>) +declare <4 x i24> @llvm.usub.sat.v4i24(<4 x i24>, <4 x i24>) +declare <2 x i128> @llvm.usub.sat.v2i128(<2 x i128>, <2 x i128>) + +; Legal types, depending on architecture. 
+ +define <16 x i8> @v16i8(<16 x i8> %x, <16 x i8> %y) nounwind { +; SSE2-LABEL: v16i8: +; SSE2: # %bb.0: +; SSE2-NEXT: pushq %rbp +; SSE2-NEXT: pushq %r15 +; SSE2-NEXT: pushq %r14 +; SSE2-NEXT: pushq %r13 +; SSE2-NEXT: pushq %r12 +; SSE2-NEXT: pushq %rbx +; SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; SSE2-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSE2-NEXT: subb -{{[0-9]+}}(%rsp), %cl +; SSE2-NEXT: movl $0, %eax +; SSE2-NEXT: jb .LBB0_2 +; SSE2-NEXT: # %bb.1: +; SSE2-NEXT: movl %ecx, %eax +; SSE2-NEXT: .LBB0_2: +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %dl +; SSE2-NEXT: subb -{{[0-9]+}}(%rsp), %dl +; SSE2-NEXT: movl $0, %ecx +; SSE2-NEXT: jb .LBB0_4 +; SSE2-NEXT: # %bb.3: +; SSE2-NEXT: movl %edx, %ecx +; SSE2-NEXT: .LBB0_4: +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %bl +; SSE2-NEXT: subb -{{[0-9]+}}(%rsp), %bl +; SSE2-NEXT: movl $0, %edx +; SSE2-NEXT: jb .LBB0_6 +; SSE2-NEXT: # %bb.5: +; SSE2-NEXT: movl %ebx, %edx +; SSE2-NEXT: .LBB0_6: +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %bl +; SSE2-NEXT: subb -{{[0-9]+}}(%rsp), %bl +; SSE2-NEXT: movl $0, %esi +; SSE2-NEXT: jb .LBB0_8 +; SSE2-NEXT: # %bb.7: +; SSE2-NEXT: movl %ebx, %esi +; SSE2-NEXT: .LBB0_8: +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %bl +; SSE2-NEXT: subb -{{[0-9]+}}(%rsp), %bl +; SSE2-NEXT: movl $0, %edi +; SSE2-NEXT: jb .LBB0_10 +; SSE2-NEXT: # %bb.9: +; SSE2-NEXT: movl %ebx, %edi +; SSE2-NEXT: .LBB0_10: +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %bl +; SSE2-NEXT: subb -{{[0-9]+}}(%rsp), %bl +; SSE2-NEXT: movl $0, %r12d +; SSE2-NEXT: jb .LBB0_12 +; SSE2-NEXT: # %bb.11: +; SSE2-NEXT: movl %ebx, %r12d +; SSE2-NEXT: .LBB0_12: +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %bl +; SSE2-NEXT: subb -{{[0-9]+}}(%rsp), %bl +; SSE2-NEXT: movl $0, %r8d +; SSE2-NEXT: jb .LBB0_14 +; SSE2-NEXT: # %bb.13: +; SSE2-NEXT: movl %ebx, %r8d +; SSE2-NEXT: .LBB0_14: +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %bl +; SSE2-NEXT: subb -{{[0-9]+}}(%rsp), %bl +; SSE2-NEXT: movl $0, %r10d +; SSE2-NEXT: jb .LBB0_16 +; SSE2-NEXT: # %bb.15: +; SSE2-NEXT: movl %ebx, %r10d +; SSE2-NEXT: .LBB0_16: +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %bl +; SSE2-NEXT: subb -{{[0-9]+}}(%rsp), %bl +; SSE2-NEXT: movl $0, %r13d +; SSE2-NEXT: jb .LBB0_18 +; SSE2-NEXT: # %bb.17: +; SSE2-NEXT: movl %ebx, %r13d +; SSE2-NEXT: .LBB0_18: +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %bl +; SSE2-NEXT: subb -{{[0-9]+}}(%rsp), %bl +; SSE2-NEXT: movl $0, %r9d +; SSE2-NEXT: jb .LBB0_20 +; SSE2-NEXT: # %bb.19: +; SSE2-NEXT: movl %ebx, %r9d +; SSE2-NEXT: .LBB0_20: +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %bl +; SSE2-NEXT: subb -{{[0-9]+}}(%rsp), %bl +; SSE2-NEXT: movl $0, %r11d +; SSE2-NEXT: jb .LBB0_22 +; SSE2-NEXT: # %bb.21: +; SSE2-NEXT: movl %ebx, %r11d +; SSE2-NEXT: .LBB0_22: +; SSE2-NEXT: movzbl %al, %r14d +; SSE2-NEXT: movzbl %cl, %r15d +; SSE2-NEXT: movzbl %dl, %edx +; SSE2-NEXT: movzbl %sil, %esi +; SSE2-NEXT: movzbl %dil, %ebx +; SSE2-NEXT: movzbl %r12b, %ebp +; SSE2-NEXT: movzbl %r8b, %edi +; SSE2-NEXT: movzbl %r10b, %r8d +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSE2-NEXT: subb -{{[0-9]+}}(%rsp), %cl +; SSE2-NEXT: movl $0, %eax +; SSE2-NEXT: jb .LBB0_24 +; SSE2-NEXT: # %bb.23: +; SSE2-NEXT: movl %ecx, %eax +; SSE2-NEXT: .LBB0_24: +; SSE2-NEXT: movd %r14d, %xmm2 +; SSE2-NEXT: movd %r15d, %xmm3 +; SSE2-NEXT: movd %edx, %xmm5 +; SSE2-NEXT: movd %esi, %xmm0 +; SSE2-NEXT: movd %ebx, %xmm6 +; SSE2-NEXT: movd %ebp, %xmm4 +; SSE2-NEXT: movd %edi, %xmm7 +; SSE2-NEXT: movd %r8d, %xmm1 +; SSE2-NEXT: movzbl %r13b, %ebp +; SSE2-NEXT: movzbl %r9b, %ecx +; SSE2-NEXT: movzbl %r11b, %edx +; SSE2-NEXT: 
movzbl %al, %esi +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSE2-NEXT: subb -{{[0-9]+}}(%rsp), %al +; SSE2-NEXT: movl $0, %edi +; SSE2-NEXT: jb .LBB0_26 +; SSE2-NEXT: # %bb.25: +; SSE2-NEXT: movl %eax, %edi +; SSE2-NEXT: .LBB0_26: +; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3],xmm0[4],xmm5[4],xmm0[5],xmm5[5],xmm0[6],xmm5[6],xmm0[7],xmm5[7] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1],xmm4[2],xmm6[2],xmm4[3],xmm6[3],xmm4[4],xmm6[4],xmm4[5],xmm6[5],xmm4[6],xmm6[6],xmm4[7],xmm6[7] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm7[0],xmm1[1],xmm7[1],xmm1[2],xmm7[2],xmm1[3],xmm7[3],xmm1[4],xmm7[4],xmm1[5],xmm7[5],xmm1[6],xmm7[6],xmm1[7],xmm7[7] +; SSE2-NEXT: movd %ebp, %xmm6 +; SSE2-NEXT: movd %ecx, %xmm5 +; SSE2-NEXT: movd %edx, %xmm7 +; SSE2-NEXT: movd %esi, %xmm2 +; SSE2-NEXT: movzbl %dil, %eax +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %dl +; SSE2-NEXT: subb -{{[0-9]+}}(%rsp), %dl +; SSE2-NEXT: movl $0, %ecx +; SSE2-NEXT: jb .LBB0_28 +; SSE2-NEXT: # %bb.27: +; SSE2-NEXT: movl %edx, %ecx +; SSE2-NEXT: .LBB0_28: +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3],xmm5[4],xmm6[4],xmm5[5],xmm6[5],xmm5[6],xmm6[6],xmm5[7],xmm6[7] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm7[0],xmm2[1],xmm7[1],xmm2[2],xmm7[2],xmm2[3],xmm7[3],xmm2[4],xmm7[4],xmm2[5],xmm7[5],xmm2[6],xmm7[6],xmm2[7],xmm7[7] +; SSE2-NEXT: movd %eax, %xmm4 +; SSE2-NEXT: movzbl %cl, %eax +; SSE2-NEXT: movd %eax, %xmm3 +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %dl +; SSE2-NEXT: subb -{{[0-9]+}}(%rsp), %dl +; SSE2-NEXT: movl $0, %ecx +; SSE2-NEXT: jb .LBB0_30 +; SSE2-NEXT: # %bb.29: +; SSE2-NEXT: movl %edx, %ecx +; SSE2-NEXT: .LBB0_30: +; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3],xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] +; SSE2-NEXT: movzbl %cl, %ecx +; SSE2-NEXT: movd %ecx, %xmm4 +; SSE2-NEXT: subb -{{[0-9]+}}(%rsp), %al +; SSE2-NEXT: movl $0, %ecx +; SSE2-NEXT: jb .LBB0_32 +; SSE2-NEXT: # %bb.31: +; SSE2-NEXT: movl %eax, %ecx +; SSE2-NEXT: .LBB0_32: +; SSE2-NEXT: movzbl %cl, %eax +; SSE2-NEXT: movd %eax, %xmm0 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSE2-NEXT: popq %rbx +; SSE2-NEXT: popq %r12 +; SSE2-NEXT: popq %r13 +; SSE2-NEXT: popq %r14 +; SSE2-NEXT: popq %r15 +; SSE2-NEXT: popq %rbp +; SSE2-NEXT: retq +; +; SSSE3-LABEL: v16i8: +; SSSE3: # %bb.0: +; SSSE3-NEXT: pushq %rbp +; SSSE3-NEXT: pushq %r15 +; SSSE3-NEXT: pushq %r14 +; SSSE3-NEXT: pushq %r13 +; SSSE3-NEXT: pushq %r12 
+; SSSE3-NEXT: pushq %rbx +; SSSE3-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; SSSE3-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSSE3-NEXT: subb -{{[0-9]+}}(%rsp), %cl +; SSSE3-NEXT: movl $0, %eax +; SSSE3-NEXT: jb .LBB0_2 +; SSSE3-NEXT: # %bb.1: +; SSSE3-NEXT: movl %ecx, %eax +; SSSE3-NEXT: .LBB0_2: +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %dl +; SSSE3-NEXT: subb -{{[0-9]+}}(%rsp), %dl +; SSSE3-NEXT: movl $0, %ecx +; SSSE3-NEXT: jb .LBB0_4 +; SSSE3-NEXT: # %bb.3: +; SSSE3-NEXT: movl %edx, %ecx +; SSSE3-NEXT: .LBB0_4: +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %bl +; SSSE3-NEXT: subb -{{[0-9]+}}(%rsp), %bl +; SSSE3-NEXT: movl $0, %edx +; SSSE3-NEXT: jb .LBB0_6 +; SSSE3-NEXT: # %bb.5: +; SSSE3-NEXT: movl %ebx, %edx +; SSSE3-NEXT: .LBB0_6: +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %bl +; SSSE3-NEXT: subb -{{[0-9]+}}(%rsp), %bl +; SSSE3-NEXT: movl $0, %esi +; SSSE3-NEXT: jb .LBB0_8 +; SSSE3-NEXT: # %bb.7: +; SSSE3-NEXT: movl %ebx, %esi +; SSSE3-NEXT: .LBB0_8: +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %bl +; SSSE3-NEXT: subb -{{[0-9]+}}(%rsp), %bl +; SSSE3-NEXT: movl $0, %edi +; SSSE3-NEXT: jb .LBB0_10 +; SSSE3-NEXT: # %bb.9: +; SSSE3-NEXT: movl %ebx, %edi +; SSSE3-NEXT: .LBB0_10: +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %bl +; SSSE3-NEXT: subb -{{[0-9]+}}(%rsp), %bl +; SSSE3-NEXT: movl $0, %r12d +; SSSE3-NEXT: jb .LBB0_12 +; SSSE3-NEXT: # %bb.11: +; SSSE3-NEXT: movl %ebx, %r12d +; SSSE3-NEXT: .LBB0_12: +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %bl +; SSSE3-NEXT: subb -{{[0-9]+}}(%rsp), %bl +; SSSE3-NEXT: movl $0, %r8d +; SSSE3-NEXT: jb .LBB0_14 +; SSSE3-NEXT: # %bb.13: +; SSSE3-NEXT: movl %ebx, %r8d +; SSSE3-NEXT: .LBB0_14: +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %bl +; SSSE3-NEXT: subb -{{[0-9]+}}(%rsp), %bl +; SSSE3-NEXT: movl $0, %r10d +; SSSE3-NEXT: jb .LBB0_16 +; SSSE3-NEXT: # %bb.15: +; SSSE3-NEXT: movl %ebx, %r10d +; SSSE3-NEXT: .LBB0_16: +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %bl +; SSSE3-NEXT: subb -{{[0-9]+}}(%rsp), %bl +; SSSE3-NEXT: movl $0, %r13d +; SSSE3-NEXT: jb .LBB0_18 +; SSSE3-NEXT: # %bb.17: +; SSSE3-NEXT: movl %ebx, %r13d +; SSSE3-NEXT: .LBB0_18: +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %bl +; SSSE3-NEXT: subb -{{[0-9]+}}(%rsp), %bl +; SSSE3-NEXT: movl $0, %r9d +; SSSE3-NEXT: jb .LBB0_20 +; SSSE3-NEXT: # %bb.19: +; SSSE3-NEXT: movl %ebx, %r9d +; SSSE3-NEXT: .LBB0_20: +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %bl +; SSSE3-NEXT: subb -{{[0-9]+}}(%rsp), %bl +; SSSE3-NEXT: movl $0, %r11d +; SSSE3-NEXT: jb .LBB0_22 +; SSSE3-NEXT: # %bb.21: +; SSSE3-NEXT: movl %ebx, %r11d +; SSSE3-NEXT: .LBB0_22: +; SSSE3-NEXT: movzbl %al, %r14d +; SSSE3-NEXT: movzbl %cl, %r15d +; SSSE3-NEXT: movzbl %dl, %edx +; SSSE3-NEXT: movzbl %sil, %esi +; SSSE3-NEXT: movzbl %dil, %ebx +; SSSE3-NEXT: movzbl %r12b, %ebp +; SSSE3-NEXT: movzbl %r8b, %edi +; SSSE3-NEXT: movzbl %r10b, %r8d +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSSE3-NEXT: subb -{{[0-9]+}}(%rsp), %cl +; SSSE3-NEXT: movl $0, %eax +; SSSE3-NEXT: jb .LBB0_24 +; SSSE3-NEXT: # %bb.23: +; SSSE3-NEXT: movl %ecx, %eax +; SSSE3-NEXT: .LBB0_24: +; SSSE3-NEXT: movd %r14d, %xmm2 +; SSSE3-NEXT: movd %r15d, %xmm3 +; SSSE3-NEXT: movd %edx, %xmm5 +; SSSE3-NEXT: movd %esi, %xmm0 +; SSSE3-NEXT: movd %ebx, %xmm6 +; SSSE3-NEXT: movd %ebp, %xmm4 +; SSSE3-NEXT: movd %edi, %xmm7 +; SSSE3-NEXT: movd %r8d, %xmm1 +; SSSE3-NEXT: movzbl %r13b, %ebp +; SSSE3-NEXT: movzbl %r9b, %ecx +; SSSE3-NEXT: movzbl %r11b, %edx +; SSSE3-NEXT: movzbl %al, %esi +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSSE3-NEXT: subb -{{[0-9]+}}(%rsp), %al +; SSSE3-NEXT: movl 
$0, %edi +; SSSE3-NEXT: jb .LBB0_26 +; SSSE3-NEXT: # %bb.25: +; SSSE3-NEXT: movl %eax, %edi +; SSSE3-NEXT: .LBB0_26: +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3],xmm0[4],xmm5[4],xmm0[5],xmm5[5],xmm0[6],xmm5[6],xmm0[7],xmm5[7] +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1],xmm4[2],xmm6[2],xmm4[3],xmm6[3],xmm4[4],xmm6[4],xmm4[5],xmm6[5],xmm4[6],xmm6[6],xmm4[7],xmm6[7] +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm7[0],xmm1[1],xmm7[1],xmm1[2],xmm7[2],xmm1[3],xmm7[3],xmm1[4],xmm7[4],xmm1[5],xmm7[5],xmm1[6],xmm7[6],xmm1[7],xmm7[7] +; SSSE3-NEXT: movd %ebp, %xmm6 +; SSSE3-NEXT: movd %ecx, %xmm5 +; SSSE3-NEXT: movd %edx, %xmm7 +; SSSE3-NEXT: movd %esi, %xmm2 +; SSSE3-NEXT: movzbl %dil, %eax +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %dl +; SSSE3-NEXT: subb -{{[0-9]+}}(%rsp), %dl +; SSSE3-NEXT: movl $0, %ecx +; SSSE3-NEXT: jb .LBB0_28 +; SSSE3-NEXT: # %bb.27: +; SSSE3-NEXT: movl %edx, %ecx +; SSSE3-NEXT: .LBB0_28: +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3] +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3],xmm5[4],xmm6[4],xmm5[5],xmm6[5],xmm5[6],xmm6[6],xmm5[7],xmm6[7] +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm7[0],xmm2[1],xmm7[1],xmm2[2],xmm7[2],xmm2[3],xmm7[3],xmm2[4],xmm7[4],xmm2[5],xmm7[5],xmm2[6],xmm7[6],xmm2[7],xmm7[7] +; SSSE3-NEXT: movd %eax, %xmm4 +; SSSE3-NEXT: movzbl %cl, %eax +; SSSE3-NEXT: movd %eax, %xmm3 +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %dl +; SSSE3-NEXT: subb -{{[0-9]+}}(%rsp), %dl +; SSSE3-NEXT: movl $0, %ecx +; SSSE3-NEXT: jb .LBB0_30 +; SSSE3-NEXT: # %bb.29: +; SSSE3-NEXT: movl %edx, %ecx +; SSSE3-NEXT: .LBB0_30: +; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3] +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3],xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] +; SSSE3-NEXT: movzbl %cl, %ecx +; SSSE3-NEXT: movd %ecx, %xmm4 +; SSSE3-NEXT: subb -{{[0-9]+}}(%rsp), %al +; SSSE3-NEXT: movl $0, %ecx +; SSSE3-NEXT: jb .LBB0_32 +; SSSE3-NEXT: # %bb.31: +; SSSE3-NEXT: movl %eax, %ecx +; SSSE3-NEXT: .LBB0_32: +; SSSE3-NEXT: movzbl %cl, %eax +; SSSE3-NEXT: movd %eax, %xmm0 +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7] +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] +; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSSE3-NEXT: popq %rbx +; SSSE3-NEXT: popq %r12 +; SSSE3-NEXT: popq %r13 +; SSSE3-NEXT: popq %r14 +; SSSE3-NEXT: popq %r15 +; SSSE3-NEXT: popq %rbp +; SSSE3-NEXT: retq +; +; SSE41-LABEL: v16i8: +; SSE41: # %bb.0: +; SSE41-NEXT: pextrb $1, %xmm1, %edx +; SSE41-NEXT: pextrb $1, %xmm0, %ecx +; SSE41-NEXT: xorl %eax, %eax +; SSE41-NEXT: subb %dl, %cl +; SSE41-NEXT: movl $0, %edx +; SSE41-NEXT: jb .LBB0_2 
+; SSE41-NEXT: # %bb.1: +; SSE41-NEXT: movl %ecx, %edx +; SSE41-NEXT: .LBB0_2: +; SSE41-NEXT: movzbl %dl, %ecx +; SSE41-NEXT: pextrb $0, %xmm1, %esi +; SSE41-NEXT: pextrb $0, %xmm0, %edx +; SSE41-NEXT: subb %sil, %dl +; SSE41-NEXT: movl $0, %esi +; SSE41-NEXT: jb .LBB0_4 +; SSE41-NEXT: # %bb.3: +; SSE41-NEXT: movl %edx, %esi +; SSE41-NEXT: .LBB0_4: +; SSE41-NEXT: movzbl %sil, %edx +; SSE41-NEXT: movd %edx, %xmm2 +; SSE41-NEXT: pinsrb $1, %ecx, %xmm2 +; SSE41-NEXT: pextrb $2, %xmm1, %edx +; SSE41-NEXT: pextrb $2, %xmm0, %ecx +; SSE41-NEXT: subb %dl, %cl +; SSE41-NEXT: movl $0, %edx +; SSE41-NEXT: jb .LBB0_6 +; SSE41-NEXT: # %bb.5: +; SSE41-NEXT: movl %ecx, %edx +; SSE41-NEXT: .LBB0_6: +; SSE41-NEXT: movzbl %dl, %ecx +; SSE41-NEXT: pinsrb $2, %ecx, %xmm2 +; SSE41-NEXT: pextrb $3, %xmm1, %edx +; SSE41-NEXT: pextrb $3, %xmm0, %ecx +; SSE41-NEXT: subb %dl, %cl +; SSE41-NEXT: movl $0, %edx +; SSE41-NEXT: jb .LBB0_8 +; SSE41-NEXT: # %bb.7: +; SSE41-NEXT: movl %ecx, %edx +; SSE41-NEXT: .LBB0_8: +; SSE41-NEXT: movzbl %dl, %ecx +; SSE41-NEXT: pinsrb $3, %ecx, %xmm2 +; SSE41-NEXT: pextrb $4, %xmm1, %edx +; SSE41-NEXT: pextrb $4, %xmm0, %ecx +; SSE41-NEXT: subb %dl, %cl +; SSE41-NEXT: movl $0, %edx +; SSE41-NEXT: jb .LBB0_10 +; SSE41-NEXT: # %bb.9: +; SSE41-NEXT: movl %ecx, %edx +; SSE41-NEXT: .LBB0_10: +; SSE41-NEXT: movzbl %dl, %ecx +; SSE41-NEXT: pinsrb $4, %ecx, %xmm2 +; SSE41-NEXT: pextrb $5, %xmm1, %edx +; SSE41-NEXT: pextrb $5, %xmm0, %ecx +; SSE41-NEXT: subb %dl, %cl +; SSE41-NEXT: movl $0, %edx +; SSE41-NEXT: jb .LBB0_12 +; SSE41-NEXT: # %bb.11: +; SSE41-NEXT: movl %ecx, %edx +; SSE41-NEXT: .LBB0_12: +; SSE41-NEXT: movzbl %dl, %ecx +; SSE41-NEXT: pinsrb $5, %ecx, %xmm2 +; SSE41-NEXT: pextrb $6, %xmm1, %edx +; SSE41-NEXT: pextrb $6, %xmm0, %ecx +; SSE41-NEXT: subb %dl, %cl +; SSE41-NEXT: movl $0, %edx +; SSE41-NEXT: jb .LBB0_14 +; SSE41-NEXT: # %bb.13: +; SSE41-NEXT: movl %ecx, %edx +; SSE41-NEXT: .LBB0_14: +; SSE41-NEXT: movzbl %dl, %ecx +; SSE41-NEXT: pinsrb $6, %ecx, %xmm2 +; SSE41-NEXT: pextrb $7, %xmm1, %edx +; SSE41-NEXT: pextrb $7, %xmm0, %ecx +; SSE41-NEXT: subb %dl, %cl +; SSE41-NEXT: movl $0, %edx +; SSE41-NEXT: jb .LBB0_16 +; SSE41-NEXT: # %bb.15: +; SSE41-NEXT: movl %ecx, %edx +; SSE41-NEXT: .LBB0_16: +; SSE41-NEXT: movzbl %dl, %ecx +; SSE41-NEXT: pinsrb $7, %ecx, %xmm2 +; SSE41-NEXT: pextrb $8, %xmm1, %edx +; SSE41-NEXT: pextrb $8, %xmm0, %ecx +; SSE41-NEXT: subb %dl, %cl +; SSE41-NEXT: movl $0, %edx +; SSE41-NEXT: jb .LBB0_18 +; SSE41-NEXT: # %bb.17: +; SSE41-NEXT: movl %ecx, %edx +; SSE41-NEXT: .LBB0_18: +; SSE41-NEXT: movzbl %dl, %ecx +; SSE41-NEXT: pinsrb $8, %ecx, %xmm2 +; SSE41-NEXT: pextrb $9, %xmm1, %edx +; SSE41-NEXT: pextrb $9, %xmm0, %ecx +; SSE41-NEXT: subb %dl, %cl +; SSE41-NEXT: movl $0, %edx +; SSE41-NEXT: jb .LBB0_20 +; SSE41-NEXT: # %bb.19: +; SSE41-NEXT: movl %ecx, %edx +; SSE41-NEXT: .LBB0_20: +; SSE41-NEXT: movzbl %dl, %ecx +; SSE41-NEXT: pinsrb $9, %ecx, %xmm2 +; SSE41-NEXT: pextrb $10, %xmm1, %edx +; SSE41-NEXT: pextrb $10, %xmm0, %ecx +; SSE41-NEXT: subb %dl, %cl +; SSE41-NEXT: movl $0, %edx +; SSE41-NEXT: jb .LBB0_22 +; SSE41-NEXT: # %bb.21: +; SSE41-NEXT: movl %ecx, %edx +; SSE41-NEXT: .LBB0_22: +; SSE41-NEXT: movzbl %dl, %ecx +; SSE41-NEXT: pinsrb $10, %ecx, %xmm2 +; SSE41-NEXT: pextrb $11, %xmm1, %edx +; SSE41-NEXT: pextrb $11, %xmm0, %ecx +; SSE41-NEXT: subb %dl, %cl +; SSE41-NEXT: movl $0, %edx +; SSE41-NEXT: jb .LBB0_24 +; SSE41-NEXT: # %bb.23: +; SSE41-NEXT: movl %ecx, %edx +; SSE41-NEXT: .LBB0_24: +; SSE41-NEXT: movzbl %dl, %ecx +; SSE41-NEXT: 
pinsrb $11, %ecx, %xmm2 +; SSE41-NEXT: pextrb $12, %xmm1, %edx +; SSE41-NEXT: pextrb $12, %xmm0, %ecx +; SSE41-NEXT: subb %dl, %cl +; SSE41-NEXT: movl $0, %edx +; SSE41-NEXT: jb .LBB0_26 +; SSE41-NEXT: # %bb.25: +; SSE41-NEXT: movl %ecx, %edx +; SSE41-NEXT: .LBB0_26: +; SSE41-NEXT: movzbl %dl, %ecx +; SSE41-NEXT: pinsrb $12, %ecx, %xmm2 +; SSE41-NEXT: pextrb $13, %xmm1, %edx +; SSE41-NEXT: pextrb $13, %xmm0, %ecx +; SSE41-NEXT: subb %dl, %cl +; SSE41-NEXT: movl $0, %edx +; SSE41-NEXT: jb .LBB0_28 +; SSE41-NEXT: # %bb.27: +; SSE41-NEXT: movl %ecx, %edx +; SSE41-NEXT: .LBB0_28: +; SSE41-NEXT: movzbl %dl, %ecx +; SSE41-NEXT: pinsrb $13, %ecx, %xmm2 +; SSE41-NEXT: pextrb $14, %xmm1, %edx +; SSE41-NEXT: pextrb $14, %xmm0, %ecx +; SSE41-NEXT: subb %dl, %cl +; SSE41-NEXT: movl $0, %edx +; SSE41-NEXT: jb .LBB0_30 +; SSE41-NEXT: # %bb.29: +; SSE41-NEXT: movl %ecx, %edx +; SSE41-NEXT: .LBB0_30: +; SSE41-NEXT: movzbl %dl, %ecx +; SSE41-NEXT: pinsrb $14, %ecx, %xmm2 +; SSE41-NEXT: pextrb $15, %xmm1, %edx +; SSE41-NEXT: pextrb $15, %xmm0, %ecx +; SSE41-NEXT: subb %dl, %cl +; SSE41-NEXT: jb .LBB0_32 +; SSE41-NEXT: # %bb.31: +; SSE41-NEXT: movl %ecx, %eax +; SSE41-NEXT: .LBB0_32: +; SSE41-NEXT: movzbl %al, %eax +; SSE41-NEXT: pinsrb $15, %eax, %xmm2 +; SSE41-NEXT: movdqa %xmm2, %xmm0 +; SSE41-NEXT: retq +; +; AVX-LABEL: v16i8: +; AVX: # %bb.0: +; AVX-NEXT: vpextrb $1, %xmm1, %edx +; AVX-NEXT: vpextrb $1, %xmm0, %ecx +; AVX-NEXT: xorl %eax, %eax +; AVX-NEXT: subb %dl, %cl +; AVX-NEXT: movl $0, %edx +; AVX-NEXT: jb .LBB0_2 +; AVX-NEXT: # %bb.1: +; AVX-NEXT: movl %ecx, %edx +; AVX-NEXT: .LBB0_2: +; AVX-NEXT: movzbl %dl, %ecx +; AVX-NEXT: vpextrb $0, %xmm1, %esi +; AVX-NEXT: vpextrb $0, %xmm0, %edx +; AVX-NEXT: subb %sil, %dl +; AVX-NEXT: movl $0, %esi +; AVX-NEXT: jb .LBB0_4 +; AVX-NEXT: # %bb.3: +; AVX-NEXT: movl %edx, %esi +; AVX-NEXT: .LBB0_4: +; AVX-NEXT: movzbl %sil, %edx +; AVX-NEXT: vmovd %edx, %xmm2 +; AVX-NEXT: vpinsrb $1, %ecx, %xmm2, %xmm2 +; AVX-NEXT: vpextrb $2, %xmm1, %edx +; AVX-NEXT: vpextrb $2, %xmm0, %ecx +; AVX-NEXT: subb %dl, %cl +; AVX-NEXT: movl $0, %edx +; AVX-NEXT: jb .LBB0_6 +; AVX-NEXT: # %bb.5: +; AVX-NEXT: movl %ecx, %edx +; AVX-NEXT: .LBB0_6: +; AVX-NEXT: movzbl %dl, %ecx +; AVX-NEXT: vpinsrb $2, %ecx, %xmm2, %xmm2 +; AVX-NEXT: vpextrb $3, %xmm1, %edx +; AVX-NEXT: vpextrb $3, %xmm0, %ecx +; AVX-NEXT: subb %dl, %cl +; AVX-NEXT: movl $0, %edx +; AVX-NEXT: jb .LBB0_8 +; AVX-NEXT: # %bb.7: +; AVX-NEXT: movl %ecx, %edx +; AVX-NEXT: .LBB0_8: +; AVX-NEXT: movzbl %dl, %ecx +; AVX-NEXT: vpinsrb $3, %ecx, %xmm2, %xmm2 +; AVX-NEXT: vpextrb $4, %xmm1, %edx +; AVX-NEXT: vpextrb $4, %xmm0, %ecx +; AVX-NEXT: subb %dl, %cl +; AVX-NEXT: movl $0, %edx +; AVX-NEXT: jb .LBB0_10 +; AVX-NEXT: # %bb.9: +; AVX-NEXT: movl %ecx, %edx +; AVX-NEXT: .LBB0_10: +; AVX-NEXT: movzbl %dl, %ecx +; AVX-NEXT: vpinsrb $4, %ecx, %xmm2, %xmm2 +; AVX-NEXT: vpextrb $5, %xmm1, %edx +; AVX-NEXT: vpextrb $5, %xmm0, %ecx +; AVX-NEXT: subb %dl, %cl +; AVX-NEXT: movl $0, %edx +; AVX-NEXT: jb .LBB0_12 +; AVX-NEXT: # %bb.11: +; AVX-NEXT: movl %ecx, %edx +; AVX-NEXT: .LBB0_12: +; AVX-NEXT: movzbl %dl, %ecx +; AVX-NEXT: vpinsrb $5, %ecx, %xmm2, %xmm2 +; AVX-NEXT: vpextrb $6, %xmm1, %edx +; AVX-NEXT: vpextrb $6, %xmm0, %ecx +; AVX-NEXT: subb %dl, %cl +; AVX-NEXT: movl $0, %edx +; AVX-NEXT: jb .LBB0_14 +; AVX-NEXT: # %bb.13: +; AVX-NEXT: movl %ecx, %edx +; AVX-NEXT: .LBB0_14: +; AVX-NEXT: movzbl %dl, %ecx +; AVX-NEXT: vpinsrb $6, %ecx, %xmm2, %xmm2 +; AVX-NEXT: vpextrb $7, %xmm1, %edx +; AVX-NEXT: vpextrb $7, %xmm0, %ecx +; 
AVX-NEXT: subb %dl, %cl +; AVX-NEXT: movl $0, %edx +; AVX-NEXT: jb .LBB0_16 +; AVX-NEXT: # %bb.15: +; AVX-NEXT: movl %ecx, %edx +; AVX-NEXT: .LBB0_16: +; AVX-NEXT: movzbl %dl, %ecx +; AVX-NEXT: vpinsrb $7, %ecx, %xmm2, %xmm2 +; AVX-NEXT: vpextrb $8, %xmm1, %edx +; AVX-NEXT: vpextrb $8, %xmm0, %ecx +; AVX-NEXT: subb %dl, %cl +; AVX-NEXT: movl $0, %edx +; AVX-NEXT: jb .LBB0_18 +; AVX-NEXT: # %bb.17: +; AVX-NEXT: movl %ecx, %edx +; AVX-NEXT: .LBB0_18: +; AVX-NEXT: movzbl %dl, %ecx +; AVX-NEXT: vpinsrb $8, %ecx, %xmm2, %xmm2 +; AVX-NEXT: vpextrb $9, %xmm1, %edx +; AVX-NEXT: vpextrb $9, %xmm0, %ecx +; AVX-NEXT: subb %dl, %cl +; AVX-NEXT: movl $0, %edx +; AVX-NEXT: jb .LBB0_20 +; AVX-NEXT: # %bb.19: +; AVX-NEXT: movl %ecx, %edx +; AVX-NEXT: .LBB0_20: +; AVX-NEXT: movzbl %dl, %ecx +; AVX-NEXT: vpinsrb $9, %ecx, %xmm2, %xmm2 +; AVX-NEXT: vpextrb $10, %xmm1, %edx +; AVX-NEXT: vpextrb $10, %xmm0, %ecx +; AVX-NEXT: subb %dl, %cl +; AVX-NEXT: movl $0, %edx +; AVX-NEXT: jb .LBB0_22 +; AVX-NEXT: # %bb.21: +; AVX-NEXT: movl %ecx, %edx +; AVX-NEXT: .LBB0_22: +; AVX-NEXT: movzbl %dl, %ecx +; AVX-NEXT: vpinsrb $10, %ecx, %xmm2, %xmm2 +; AVX-NEXT: vpextrb $11, %xmm1, %edx +; AVX-NEXT: vpextrb $11, %xmm0, %ecx +; AVX-NEXT: subb %dl, %cl +; AVX-NEXT: movl $0, %edx +; AVX-NEXT: jb .LBB0_24 +; AVX-NEXT: # %bb.23: +; AVX-NEXT: movl %ecx, %edx +; AVX-NEXT: .LBB0_24: +; AVX-NEXT: movzbl %dl, %ecx +; AVX-NEXT: vpinsrb $11, %ecx, %xmm2, %xmm2 +; AVX-NEXT: vpextrb $12, %xmm1, %edx +; AVX-NEXT: vpextrb $12, %xmm0, %ecx +; AVX-NEXT: subb %dl, %cl +; AVX-NEXT: movl $0, %edx +; AVX-NEXT: jb .LBB0_26 +; AVX-NEXT: # %bb.25: +; AVX-NEXT: movl %ecx, %edx +; AVX-NEXT: .LBB0_26: +; AVX-NEXT: movzbl %dl, %ecx +; AVX-NEXT: vpinsrb $12, %ecx, %xmm2, %xmm2 +; AVX-NEXT: vpextrb $13, %xmm1, %edx +; AVX-NEXT: vpextrb $13, %xmm0, %ecx +; AVX-NEXT: subb %dl, %cl +; AVX-NEXT: movl $0, %edx +; AVX-NEXT: jb .LBB0_28 +; AVX-NEXT: # %bb.27: +; AVX-NEXT: movl %ecx, %edx +; AVX-NEXT: .LBB0_28: +; AVX-NEXT: movzbl %dl, %ecx +; AVX-NEXT: vpinsrb $13, %ecx, %xmm2, %xmm2 +; AVX-NEXT: vpextrb $14, %xmm1, %edx +; AVX-NEXT: vpextrb $14, %xmm0, %ecx +; AVX-NEXT: subb %dl, %cl +; AVX-NEXT: movl $0, %edx +; AVX-NEXT: jb .LBB0_30 +; AVX-NEXT: # %bb.29: +; AVX-NEXT: movl %ecx, %edx +; AVX-NEXT: .LBB0_30: +; AVX-NEXT: movzbl %dl, %ecx +; AVX-NEXT: vpinsrb $14, %ecx, %xmm2, %xmm2 +; AVX-NEXT: vpextrb $15, %xmm1, %edx +; AVX-NEXT: vpextrb $15, %xmm0, %ecx +; AVX-NEXT: subb %dl, %cl +; AVX-NEXT: jb .LBB0_32 +; AVX-NEXT: # %bb.31: +; AVX-NEXT: movl %ecx, %eax +; AVX-NEXT: .LBB0_32: +; AVX-NEXT: movzbl %al, %eax +; AVX-NEXT: vpinsrb $15, %eax, %xmm2, %xmm0 +; AVX-NEXT: retq + %z = call <16 x i8> @llvm.usub.sat.v16i8(<16 x i8> %x, <16 x i8> %y) + ret <16 x i8> %z +} + +define <32 x i8> @v32i8(<32 x i8> %x, <32 x i8> %y) nounwind { +; SSE2-LABEL: v32i8: +; SSE2: # %bb.0: +; SSE2-NEXT: pushq %rbp +; SSE2-NEXT: pushq %r15 +; SSE2-NEXT: pushq %r14 +; SSE2-NEXT: pushq %r13 +; SSE2-NEXT: pushq %r12 +; SSE2-NEXT: pushq %rbx +; SSE2-NEXT: pushq %rax +; SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; SSE2-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) +; SSE2-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) +; SSE2-NEXT: movaps %xmm3, -{{[0-9]+}}(%rsp) +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSE2-NEXT: subb -{{[0-9]+}}(%rsp), %al +; SSE2-NEXT: movl $0, %edx +; SSE2-NEXT: jb .LBB1_2 +; SSE2-NEXT: # %bb.1: +; SSE2-NEXT: movl %eax, %edx +; SSE2-NEXT: .LBB1_2: +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSE2-NEXT: subb -{{[0-9]+}}(%rsp), %al +; SSE2-NEXT: movl $0, %r15d +; SSE2-NEXT: jb 
.LBB1_4 +; SSE2-NEXT: # %bb.3: +; SSE2-NEXT: movl %eax, %r15d +; SSE2-NEXT: .LBB1_4: +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSE2-NEXT: subb -{{[0-9]+}}(%rsp), %al +; SSE2-NEXT: movl $0, %esi +; SSE2-NEXT: jb .LBB1_6 +; SSE2-NEXT: # %bb.5: +; SSE2-NEXT: movl %eax, %esi +; SSE2-NEXT: .LBB1_6: +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSE2-NEXT: subb -{{[0-9]+}}(%rsp), %al +; SSE2-NEXT: movl $0, %ebx +; SSE2-NEXT: jb .LBB1_8 +; SSE2-NEXT: # %bb.7: +; SSE2-NEXT: movl %eax, %ebx +; SSE2-NEXT: .LBB1_8: +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSE2-NEXT: subb -{{[0-9]+}}(%rsp), %al +; SSE2-NEXT: movl $0, %ebp +; SSE2-NEXT: jb .LBB1_10 +; SSE2-NEXT: # %bb.9: +; SSE2-NEXT: movl %eax, %ebp +; SSE2-NEXT: .LBB1_10: +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSE2-NEXT: subb -{{[0-9]+}}(%rsp), %al +; SSE2-NEXT: movl $0, %edi +; SSE2-NEXT: jb .LBB1_12 +; SSE2-NEXT: # %bb.11: +; SSE2-NEXT: movl %eax, %edi +; SSE2-NEXT: .LBB1_12: +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSE2-NEXT: subb -{{[0-9]+}}(%rsp), %al +; SSE2-NEXT: movl $0, %r13d +; SSE2-NEXT: jb .LBB1_14 +; SSE2-NEXT: # %bb.13: +; SSE2-NEXT: movl %eax, %r13d +; SSE2-NEXT: .LBB1_14: +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSE2-NEXT: subb -{{[0-9]+}}(%rsp), %al +; SSE2-NEXT: movl $0, %ecx +; SSE2-NEXT: jb .LBB1_16 +; SSE2-NEXT: # %bb.15: +; SSE2-NEXT: movl %eax, %ecx +; SSE2-NEXT: .LBB1_16: +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSE2-NEXT: subb -{{[0-9]+}}(%rsp), %al +; SSE2-NEXT: movl $0, %r12d +; SSE2-NEXT: jb .LBB1_18 +; SSE2-NEXT: # %bb.17: +; SSE2-NEXT: movl %eax, %r12d +; SSE2-NEXT: .LBB1_18: +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSE2-NEXT: subb -{{[0-9]+}}(%rsp), %al +; SSE2-NEXT: movl $0, %r14d +; SSE2-NEXT: jb .LBB1_20 +; SSE2-NEXT: # %bb.19: +; SSE2-NEXT: movl %eax, %r14d +; SSE2-NEXT: .LBB1_20: +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSE2-NEXT: subb -{{[0-9]+}}(%rsp), %al +; SSE2-NEXT: movl $0, %r8d +; SSE2-NEXT: jb .LBB1_22 +; SSE2-NEXT: # %bb.21: +; SSE2-NEXT: movl %eax, %r8d +; SSE2-NEXT: .LBB1_22: +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSE2-NEXT: subb -{{[0-9]+}}(%rsp), %al +; SSE2-NEXT: movl $0, %r9d +; SSE2-NEXT: jb .LBB1_24 +; SSE2-NEXT: # %bb.23: +; SSE2-NEXT: movl %eax, %r9d +; SSE2-NEXT: .LBB1_24: +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSE2-NEXT: subb -{{[0-9]+}}(%rsp), %al +; SSE2-NEXT: movl $0, %r10d +; SSE2-NEXT: jb .LBB1_26 +; SSE2-NEXT: # %bb.25: +; SSE2-NEXT: movl %eax, %r10d +; SSE2-NEXT: .LBB1_26: +; SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSE2-NEXT: subb -{{[0-9]+}}(%rsp), %al +; SSE2-NEXT: movl $0, %r11d +; SSE2-NEXT: jb .LBB1_28 +; SSE2-NEXT: # %bb.27: +; SSE2-NEXT: movl %eax, %r11d +; SSE2-NEXT: .LBB1_28: +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSE2-NEXT: subb -{{[0-9]+}}(%rsp), %cl +; SSE2-NEXT: movl $0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill +; SSE2-NEXT: jb .LBB1_30 +; SSE2-NEXT: # %bb.29: +; SSE2-NEXT: # kill: def $cl killed $cl def $ecx +; SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE2-NEXT: .LBB1_30: +; SSE2-NEXT: subb -{{[0-9]+}}(%rsp), %al +; SSE2-NEXT: movl $0, %ecx +; SSE2-NEXT: jb .LBB1_32 +; SSE2-NEXT: # %bb.31: +; SSE2-NEXT: movl %eax, %ecx +; SSE2-NEXT: .LBB1_32: +; SSE2-NEXT: movl %ecx, (%rsp) # 4-byte Spill +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSE2-NEXT: subb -{{[0-9]+}}(%rsp), %al +; SSE2-NEXT: movl $0, %ecx +; SSE2-NEXT: jb .LBB1_34 +; SSE2-NEXT: # %bb.33: +; SSE2-NEXT: movl %eax, %ecx +; 
SSE2-NEXT: .LBB1_34: +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSE2-NEXT: subb -{{[0-9]+}}(%rsp), %al +; SSE2-NEXT: movl $0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill +; SSE2-NEXT: jb .LBB1_36 +; SSE2-NEXT: # %bb.35: +; SSE2-NEXT: # kill: def $al killed $al def $eax +; SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE2-NEXT: .LBB1_36: +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSE2-NEXT: subb -{{[0-9]+}}(%rsp), %al +; SSE2-NEXT: movl $0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill +; SSE2-NEXT: jb .LBB1_38 +; SSE2-NEXT: # %bb.37: +; SSE2-NEXT: # kill: def $al killed $al def $eax +; SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE2-NEXT: .LBB1_38: +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSE2-NEXT: subb -{{[0-9]+}}(%rsp), %al +; SSE2-NEXT: movl $0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill +; SSE2-NEXT: jb .LBB1_40 +; SSE2-NEXT: # %bb.39: +; SSE2-NEXT: # kill: def $al killed $al def $eax +; SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE2-NEXT: .LBB1_40: +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSE2-NEXT: subb -{{[0-9]+}}(%rsp), %al +; SSE2-NEXT: movl $0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill +; SSE2-NEXT: jb .LBB1_42 +; SSE2-NEXT: # %bb.41: +; SSE2-NEXT: # kill: def $al killed $al def $eax +; SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE2-NEXT: .LBB1_42: +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSE2-NEXT: subb -{{[0-9]+}}(%rsp), %al +; SSE2-NEXT: movl $0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill +; SSE2-NEXT: jb .LBB1_44 +; SSE2-NEXT: # %bb.43: +; SSE2-NEXT: # kill: def $al killed $al def $eax +; SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE2-NEXT: .LBB1_44: +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSE2-NEXT: subb -{{[0-9]+}}(%rsp), %al +; SSE2-NEXT: movl $0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill +; SSE2-NEXT: jb .LBB1_46 +; SSE2-NEXT: # %bb.45: +; SSE2-NEXT: # kill: def $al killed $al def $eax +; SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE2-NEXT: .LBB1_46: +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSE2-NEXT: subb -{{[0-9]+}}(%rsp), %al +; SSE2-NEXT: movl $0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill +; SSE2-NEXT: jb .LBB1_48 +; SSE2-NEXT: # %bb.47: +; SSE2-NEXT: # kill: def $al killed $al def $eax +; SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE2-NEXT: .LBB1_48: +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSE2-NEXT: subb -{{[0-9]+}}(%rsp), %al +; SSE2-NEXT: movl $0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill +; SSE2-NEXT: jb .LBB1_50 +; SSE2-NEXT: # %bb.49: +; SSE2-NEXT: # kill: def $al killed $al def $eax +; SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE2-NEXT: .LBB1_50: +; SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSE2-NEXT: subb -{{[0-9]+}}(%rsp), %al +; SSE2-NEXT: movl $0, %ecx +; SSE2-NEXT: jb .LBB1_52 +; SSE2-NEXT: # %bb.51: +; SSE2-NEXT: movl %eax, %ecx +; SSE2-NEXT: .LBB1_52: +; SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE2-NEXT: movzbl %dl, %eax +; SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE2-NEXT: movzbl %r15b, %ecx +; SSE2-NEXT: movzbl %sil, %edx +; SSE2-NEXT: movzbl %bl, %esi +; SSE2-NEXT: movzbl %bpl, %ebx +; SSE2-NEXT: movzbl %dil, %ebp +; SSE2-NEXT: movzbl %r13b, %edi +; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r15d # 1-byte Folded Reload +; SSE2-NEXT: movzbl %r12b, %r12d +; SSE2-NEXT: movzbl %r14b, %r13d +; 
SSE2-NEXT: movzbl %r8b, %r8d +; SSE2-NEXT: movzbl %r9b, %r9d +; SSE2-NEXT: movzbl %r10b, %r10d +; SSE2-NEXT: movzbl %r11b, %r11d +; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r14d # 1-byte Folded Reload +; SSE2-NEXT: movzbl (%rsp), %eax # 1-byte Folded Reload +; SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSE2-NEXT: subb -{{[0-9]+}}(%rsp), %al +; SSE2-NEXT: movl $0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill +; SSE2-NEXT: jb .LBB1_54 +; SSE2-NEXT: # %bb.53: +; SSE2-NEXT: # kill: def $al killed $al def $eax +; SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE2-NEXT: .LBB1_54: +; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 4-byte Folded Reload +; SSE2-NEXT: # xmm2 = mem[0],zero,zero,zero +; SSE2-NEXT: movd %ecx, %xmm8 +; SSE2-NEXT: movd %edx, %xmm3 +; SSE2-NEXT: movd %esi, %xmm11 +; SSE2-NEXT: movd %ebx, %xmm5 +; SSE2-NEXT: movd %ebp, %xmm9 +; SSE2-NEXT: movd %edi, %xmm7 +; SSE2-NEXT: movd %r15d, %xmm1 +; SSE2-NEXT: movd %r12d, %xmm12 +; SSE2-NEXT: movd %r13d, %xmm10 +; SSE2-NEXT: movd %r8d, %xmm13 +; SSE2-NEXT: movd %r9d, %xmm4 +; SSE2-NEXT: movd %r10d, %xmm14 +; SSE2-NEXT: movd %r11d, %xmm6 +; SSE2-NEXT: movd %r14d, %xmm15 +; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload +; SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero +; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r8d # 1-byte Folded Reload +; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %edx # 1-byte Folded Reload +; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 1-byte Folded Reload +; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ebx # 1-byte Folded Reload +; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ebp # 1-byte Folded Reload +; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 1-byte Folded Reload +; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r10d # 1-byte Folded Reload +; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r9d # 1-byte Folded Reload +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSE2-NEXT: subb -{{[0-9]+}}(%rsp), %cl +; SSE2-NEXT: movl $0, %eax +; SSE2-NEXT: jb .LBB1_56 +; SSE2-NEXT: # %bb.55: +; SSE2-NEXT: movl %ecx, %eax +; SSE2-NEXT: .LBB1_56: +; SSE2-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm2[0],xmm8[1],xmm2[1],xmm8[2],xmm2[2],xmm8[3],xmm2[3],xmm8[4],xmm2[4],xmm8[5],xmm2[5],xmm8[6],xmm2[6],xmm8[7],xmm2[7] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm11 = xmm11[0],xmm3[0],xmm11[1],xmm3[1],xmm11[2],xmm3[2],xmm11[3],xmm3[3],xmm11[4],xmm3[4],xmm11[5],xmm3[5],xmm11[6],xmm3[6],xmm11[7],xmm3[7] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm9 = xmm9[0],xmm5[0],xmm9[1],xmm5[1],xmm9[2],xmm5[2],xmm9[3],xmm5[3],xmm9[4],xmm5[4],xmm9[5],xmm5[5],xmm9[6],xmm5[6],xmm9[7],xmm5[7] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm7[0],xmm1[1],xmm7[1],xmm1[2],xmm7[2],xmm1[3],xmm7[3],xmm1[4],xmm7[4],xmm1[5],xmm7[5],xmm1[6],xmm7[6],xmm1[7],xmm7[7] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm10 = xmm10[0],xmm12[0],xmm10[1],xmm12[1],xmm10[2],xmm12[2],xmm10[3],xmm12[3],xmm10[4],xmm12[4],xmm10[5],xmm12[5],xmm10[6],xmm12[6],xmm10[7],xmm12[7] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm13[0],xmm4[1],xmm13[1],xmm4[2],xmm13[2],xmm4[3],xmm13[3],xmm4[4],xmm13[4],xmm4[5],xmm13[5],xmm4[6],xmm13[6],xmm4[7],xmm13[7] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm14[0],xmm6[1],xmm14[1],xmm6[2],xmm14[2],xmm6[3],xmm14[3],xmm6[4],xmm14[4],xmm6[5],xmm14[5],xmm6[6],xmm14[6],xmm6[7],xmm14[7] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = 
xmm0[0],xmm15[0],xmm0[1],xmm15[1],xmm0[2],xmm15[2],xmm0[3],xmm15[3],xmm0[4],xmm15[4],xmm0[5],xmm15[5],xmm0[6],xmm15[6],xmm0[7],xmm15[7] +; SSE2-NEXT: movd %r8d, %xmm7 +; SSE2-NEXT: movd %edx, %xmm12 +; SSE2-NEXT: movd %esi, %xmm13 +; SSE2-NEXT: movd %ebx, %xmm5 +; SSE2-NEXT: movd %ebp, %xmm14 +; SSE2-NEXT: movd %edi, %xmm2 +; SSE2-NEXT: movd %r10d, %xmm15 +; SSE2-NEXT: movd %r9d, %xmm3 +; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload +; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %edx # 1-byte Folded Reload +; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 1-byte Folded Reload +; SSE2-NEXT: movzbl %al, %eax +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %bl +; SSE2-NEXT: subb -{{[0-9]+}}(%rsp), %bl +; SSE2-NEXT: movl $0, %edi +; SSE2-NEXT: jb .LBB1_58 +; SSE2-NEXT: # %bb.57: +; SSE2-NEXT: movl %ebx, %edi +; SSE2-NEXT: .LBB1_58: +; SSE2-NEXT: punpcklwd {{.*#+}} xmm11 = xmm11[0],xmm8[0],xmm11[1],xmm8[1],xmm11[2],xmm8[2],xmm11[3],xmm8[3] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm9[0],xmm1[1],xmm9[1],xmm1[2],xmm9[2],xmm1[3],xmm9[3] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm10[0],xmm4[1],xmm10[1],xmm4[2],xmm10[2],xmm4[3],xmm10[3] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1],xmm0[2],xmm6[2],xmm0[3],xmm6[3] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm12 = xmm12[0],xmm7[0],xmm12[1],xmm7[1],xmm12[2],xmm7[2],xmm12[3],xmm7[3],xmm12[4],xmm7[4],xmm12[5],xmm7[5],xmm12[6],xmm7[6],xmm12[7],xmm7[7] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm13[0],xmm5[1],xmm13[1],xmm5[2],xmm13[2],xmm5[3],xmm13[3],xmm5[4],xmm13[4],xmm5[5],xmm13[5],xmm5[6],xmm13[6],xmm5[7],xmm13[7] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm14[0],xmm2[1],xmm14[1],xmm2[2],xmm14[2],xmm2[3],xmm14[3],xmm2[4],xmm14[4],xmm2[5],xmm14[5],xmm2[6],xmm14[6],xmm2[7],xmm14[7] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm15[0],xmm3[1],xmm15[1],xmm3[2],xmm15[2],xmm3[3],xmm15[3],xmm3[4],xmm15[4],xmm3[5],xmm15[5],xmm3[6],xmm15[6],xmm3[7],xmm15[7] +; SSE2-NEXT: movd %ecx, %xmm8 +; SSE2-NEXT: movd %edx, %xmm7 +; SSE2-NEXT: movd %esi, %xmm9 +; SSE2-NEXT: movd %eax, %xmm6 +; SSE2-NEXT: movzbl %dil, %eax +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %dl +; SSE2-NEXT: subb -{{[0-9]+}}(%rsp), %dl +; SSE2-NEXT: movl $0, %ecx +; SSE2-NEXT: jb .LBB1_60 +; SSE2-NEXT: # %bb.59: +; SSE2-NEXT: movl %edx, %ecx +; SSE2-NEXT: .LBB1_60: +; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm11[0],xmm1[1],xmm11[1] +; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm12[0],xmm5[1],xmm12[1],xmm5[2],xmm12[2],xmm5[3],xmm12[3] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3],xmm7[4],xmm8[4],xmm7[5],xmm8[5],xmm7[6],xmm8[6],xmm7[7],xmm8[7] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm9[0],xmm6[1],xmm9[1],xmm6[2],xmm9[2],xmm6[3],xmm9[3],xmm6[4],xmm9[4],xmm6[5],xmm9[5],xmm6[6],xmm9[6],xmm6[7],xmm9[7] +; SSE2-NEXT: movd %eax, %xmm4 +; SSE2-NEXT: movzbl %cl, %eax +; SSE2-NEXT: movd %eax, %xmm2 +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %dl +; SSE2-NEXT: subb -{{[0-9]+}}(%rsp), %dl +; SSE2-NEXT: movl $0, %ecx +; SSE2-NEXT: jb .LBB1_62 +; SSE2-NEXT: # %bb.61: +; SSE2-NEXT: movl %edx, %ecx +; SSE2-NEXT: .LBB1_62: +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1] +; SSE2-NEXT: 
punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3],xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7] +; SSE2-NEXT: movzbl %cl, %ecx +; SSE2-NEXT: movd %ecx, %xmm4 +; SSE2-NEXT: subb -{{[0-9]+}}(%rsp), %al +; SSE2-NEXT: movl $0, %ecx +; SSE2-NEXT: jb .LBB1_64 +; SSE2-NEXT: # %bb.63: +; SSE2-NEXT: movl %eax, %ecx +; SSE2-NEXT: .LBB1_64: +; SSE2-NEXT: movzbl %cl, %eax +; SSE2-NEXT: movd %eax, %xmm1 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3],xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1] +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0] +; SSE2-NEXT: addq $8, %rsp +; SSE2-NEXT: popq %rbx +; SSE2-NEXT: popq %r12 +; SSE2-NEXT: popq %r13 +; SSE2-NEXT: popq %r14 +; SSE2-NEXT: popq %r15 +; SSE2-NEXT: popq %rbp +; SSE2-NEXT: retq +; +; SSSE3-LABEL: v32i8: +; SSSE3: # %bb.0: +; SSSE3-NEXT: pushq %rbp +; SSSE3-NEXT: pushq %r15 +; SSSE3-NEXT: pushq %r14 +; SSSE3-NEXT: pushq %r13 +; SSSE3-NEXT: pushq %r12 +; SSSE3-NEXT: pushq %rbx +; SSSE3-NEXT: pushq %rax +; SSSE3-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; SSSE3-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) +; SSSE3-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) +; SSSE3-NEXT: movaps %xmm3, -{{[0-9]+}}(%rsp) +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSSE3-NEXT: subb -{{[0-9]+}}(%rsp), %al +; SSSE3-NEXT: movl $0, %edx +; SSSE3-NEXT: jb .LBB1_2 +; SSSE3-NEXT: # %bb.1: +; SSSE3-NEXT: movl %eax, %edx +; SSSE3-NEXT: .LBB1_2: +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSSE3-NEXT: subb -{{[0-9]+}}(%rsp), %al +; SSSE3-NEXT: movl $0, %r15d +; SSSE3-NEXT: jb .LBB1_4 +; SSSE3-NEXT: # %bb.3: +; SSSE3-NEXT: movl %eax, %r15d +; SSSE3-NEXT: .LBB1_4: +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSSE3-NEXT: subb -{{[0-9]+}}(%rsp), %al +; SSSE3-NEXT: movl $0, %esi +; SSSE3-NEXT: jb .LBB1_6 +; SSSE3-NEXT: # %bb.5: +; SSSE3-NEXT: movl %eax, %esi +; SSSE3-NEXT: .LBB1_6: +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSSE3-NEXT: subb -{{[0-9]+}}(%rsp), %al +; SSSE3-NEXT: movl $0, %ebx +; SSSE3-NEXT: jb .LBB1_8 +; SSSE3-NEXT: # %bb.7: +; SSSE3-NEXT: movl %eax, %ebx +; SSSE3-NEXT: .LBB1_8: +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSSE3-NEXT: subb -{{[0-9]+}}(%rsp), %al +; SSSE3-NEXT: movl $0, %ebp +; SSSE3-NEXT: jb .LBB1_10 +; SSSE3-NEXT: # %bb.9: +; SSSE3-NEXT: movl %eax, %ebp +; SSSE3-NEXT: .LBB1_10: +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSSE3-NEXT: subb -{{[0-9]+}}(%rsp), %al +; SSSE3-NEXT: movl $0, %edi +; SSSE3-NEXT: jb .LBB1_12 +; SSSE3-NEXT: # %bb.11: +; SSSE3-NEXT: movl %eax, %edi +; SSSE3-NEXT: .LBB1_12: +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSSE3-NEXT: subb -{{[0-9]+}}(%rsp), %al +; SSSE3-NEXT: movl $0, %r13d +; SSSE3-NEXT: jb .LBB1_14 +; SSSE3-NEXT: # %bb.13: +; SSSE3-NEXT: movl %eax, %r13d +; SSSE3-NEXT: .LBB1_14: +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSSE3-NEXT: subb -{{[0-9]+}}(%rsp), %al +; SSSE3-NEXT: movl $0, %ecx +; SSSE3-NEXT: jb .LBB1_16 +; SSSE3-NEXT: # %bb.15: +; SSSE3-NEXT: movl %eax, %ecx +; SSSE3-NEXT: .LBB1_16: +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSSE3-NEXT: subb -{{[0-9]+}}(%rsp), %al +; SSSE3-NEXT: movl $0, %r12d +; SSSE3-NEXT: jb .LBB1_18 +; SSSE3-NEXT: # %bb.17: +; SSSE3-NEXT: movl %eax, %r12d +; SSSE3-NEXT: 
.LBB1_18: +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSSE3-NEXT: subb -{{[0-9]+}}(%rsp), %al +; SSSE3-NEXT: movl $0, %r14d +; SSSE3-NEXT: jb .LBB1_20 +; SSSE3-NEXT: # %bb.19: +; SSSE3-NEXT: movl %eax, %r14d +; SSSE3-NEXT: .LBB1_20: +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSSE3-NEXT: subb -{{[0-9]+}}(%rsp), %al +; SSSE3-NEXT: movl $0, %r8d +; SSSE3-NEXT: jb .LBB1_22 +; SSSE3-NEXT: # %bb.21: +; SSSE3-NEXT: movl %eax, %r8d +; SSSE3-NEXT: .LBB1_22: +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSSE3-NEXT: subb -{{[0-9]+}}(%rsp), %al +; SSSE3-NEXT: movl $0, %r9d +; SSSE3-NEXT: jb .LBB1_24 +; SSSE3-NEXT: # %bb.23: +; SSSE3-NEXT: movl %eax, %r9d +; SSSE3-NEXT: .LBB1_24: +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSSE3-NEXT: subb -{{[0-9]+}}(%rsp), %al +; SSSE3-NEXT: movl $0, %r10d +; SSSE3-NEXT: jb .LBB1_26 +; SSSE3-NEXT: # %bb.25: +; SSSE3-NEXT: movl %eax, %r10d +; SSSE3-NEXT: .LBB1_26: +; SSSE3-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSSE3-NEXT: subb -{{[0-9]+}}(%rsp), %al +; SSSE3-NEXT: movl $0, %r11d +; SSSE3-NEXT: jb .LBB1_28 +; SSSE3-NEXT: # %bb.27: +; SSSE3-NEXT: movl %eax, %r11d +; SSSE3-NEXT: .LBB1_28: +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSSE3-NEXT: subb -{{[0-9]+}}(%rsp), %cl +; SSSE3-NEXT: movl $0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill +; SSSE3-NEXT: jb .LBB1_30 +; SSSE3-NEXT: # %bb.29: +; SSSE3-NEXT: # kill: def $cl killed $cl def $ecx +; SSSE3-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSSE3-NEXT: .LBB1_30: +; SSSE3-NEXT: subb -{{[0-9]+}}(%rsp), %al +; SSSE3-NEXT: movl $0, %ecx +; SSSE3-NEXT: jb .LBB1_32 +; SSSE3-NEXT: # %bb.31: +; SSSE3-NEXT: movl %eax, %ecx +; SSSE3-NEXT: .LBB1_32: +; SSSE3-NEXT: movl %ecx, (%rsp) # 4-byte Spill +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSSE3-NEXT: subb -{{[0-9]+}}(%rsp), %al +; SSSE3-NEXT: movl $0, %ecx +; SSSE3-NEXT: jb .LBB1_34 +; SSSE3-NEXT: # %bb.33: +; SSSE3-NEXT: movl %eax, %ecx +; SSSE3-NEXT: .LBB1_34: +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSSE3-NEXT: subb -{{[0-9]+}}(%rsp), %al +; SSSE3-NEXT: movl $0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill +; SSSE3-NEXT: jb .LBB1_36 +; SSSE3-NEXT: # %bb.35: +; SSSE3-NEXT: # kill: def $al killed $al def $eax +; SSSE3-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSSE3-NEXT: .LBB1_36: +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSSE3-NEXT: subb -{{[0-9]+}}(%rsp), %al +; SSSE3-NEXT: movl $0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill +; SSSE3-NEXT: jb .LBB1_38 +; SSSE3-NEXT: # %bb.37: +; SSSE3-NEXT: # kill: def $al killed $al def $eax +; SSSE3-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSSE3-NEXT: .LBB1_38: +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSSE3-NEXT: subb -{{[0-9]+}}(%rsp), %al +; SSSE3-NEXT: movl $0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill +; SSSE3-NEXT: jb .LBB1_40 +; SSSE3-NEXT: # %bb.39: +; SSSE3-NEXT: # kill: def $al killed $al def $eax +; SSSE3-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSSE3-NEXT: .LBB1_40: +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSSE3-NEXT: subb -{{[0-9]+}}(%rsp), %al +; SSSE3-NEXT: movl $0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill +; SSSE3-NEXT: jb .LBB1_42 +; SSSE3-NEXT: # %bb.41: +; SSSE3-NEXT: # kill: def $al killed $al def $eax +; SSSE3-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSSE3-NEXT: .LBB1_42: +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSSE3-NEXT: subb -{{[0-9]+}}(%rsp), %al +; SSSE3-NEXT: movl 
$0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill +; SSSE3-NEXT: jb .LBB1_44 +; SSSE3-NEXT: # %bb.43: +; SSSE3-NEXT: # kill: def $al killed $al def $eax +; SSSE3-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSSE3-NEXT: .LBB1_44: +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSSE3-NEXT: subb -{{[0-9]+}}(%rsp), %al +; SSSE3-NEXT: movl $0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill +; SSSE3-NEXT: jb .LBB1_46 +; SSSE3-NEXT: # %bb.45: +; SSSE3-NEXT: # kill: def $al killed $al def $eax +; SSSE3-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSSE3-NEXT: .LBB1_46: +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSSE3-NEXT: subb -{{[0-9]+}}(%rsp), %al +; SSSE3-NEXT: movl $0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill +; SSSE3-NEXT: jb .LBB1_48 +; SSSE3-NEXT: # %bb.47: +; SSSE3-NEXT: # kill: def $al killed $al def $eax +; SSSE3-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSSE3-NEXT: .LBB1_48: +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSSE3-NEXT: subb -{{[0-9]+}}(%rsp), %al +; SSSE3-NEXT: movl $0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill +; SSSE3-NEXT: jb .LBB1_50 +; SSSE3-NEXT: # %bb.49: +; SSSE3-NEXT: # kill: def $al killed $al def $eax +; SSSE3-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSSE3-NEXT: .LBB1_50: +; SSSE3-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSSE3-NEXT: subb -{{[0-9]+}}(%rsp), %al +; SSSE3-NEXT: movl $0, %ecx +; SSSE3-NEXT: jb .LBB1_52 +; SSSE3-NEXT: # %bb.51: +; SSSE3-NEXT: movl %eax, %ecx +; SSSE3-NEXT: .LBB1_52: +; SSSE3-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSSE3-NEXT: movzbl %dl, %eax +; SSSE3-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSSE3-NEXT: movzbl %r15b, %ecx +; SSSE3-NEXT: movzbl %sil, %edx +; SSSE3-NEXT: movzbl %bl, %esi +; SSSE3-NEXT: movzbl %bpl, %ebx +; SSSE3-NEXT: movzbl %dil, %ebp +; SSSE3-NEXT: movzbl %r13b, %edi +; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r15d # 1-byte Folded Reload +; SSSE3-NEXT: movzbl %r12b, %r12d +; SSSE3-NEXT: movzbl %r14b, %r13d +; SSSE3-NEXT: movzbl %r8b, %r8d +; SSSE3-NEXT: movzbl %r9b, %r9d +; SSSE3-NEXT: movzbl %r10b, %r10d +; SSSE3-NEXT: movzbl %r11b, %r11d +; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r14d # 1-byte Folded Reload +; SSSE3-NEXT: movzbl (%rsp), %eax # 1-byte Folded Reload +; SSSE3-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSSE3-NEXT: subb -{{[0-9]+}}(%rsp), %al +; SSSE3-NEXT: movl $0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill +; SSSE3-NEXT: jb .LBB1_54 +; SSSE3-NEXT: # %bb.53: +; SSSE3-NEXT: # kill: def $al killed $al def $eax +; SSSE3-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSSE3-NEXT: .LBB1_54: +; SSSE3-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 4-byte Folded Reload +; SSSE3-NEXT: # xmm2 = mem[0],zero,zero,zero +; SSSE3-NEXT: movd %ecx, %xmm8 +; SSSE3-NEXT: movd %edx, %xmm3 +; SSSE3-NEXT: movd %esi, %xmm11 +; SSSE3-NEXT: movd %ebx, %xmm5 +; SSSE3-NEXT: movd %ebp, %xmm9 +; SSSE3-NEXT: movd %edi, %xmm7 +; SSSE3-NEXT: movd %r15d, %xmm1 +; SSSE3-NEXT: movd %r12d, %xmm12 +; SSSE3-NEXT: movd %r13d, %xmm10 +; SSSE3-NEXT: movd %r8d, %xmm13 +; SSSE3-NEXT: movd %r9d, %xmm4 +; SSSE3-NEXT: movd %r10d, %xmm14 +; SSSE3-NEXT: movd %r11d, %xmm6 +; SSSE3-NEXT: movd %r14d, %xmm15 +; SSSE3-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload +; SSSE3-NEXT: # xmm0 = mem[0],zero,zero,zero +; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r8d # 1-byte 
Folded Reload +; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %edx # 1-byte Folded Reload +; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 1-byte Folded Reload +; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ebx # 1-byte Folded Reload +; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ebp # 1-byte Folded Reload +; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 1-byte Folded Reload +; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r10d # 1-byte Folded Reload +; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r9d # 1-byte Folded Reload +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSSE3-NEXT: subb -{{[0-9]+}}(%rsp), %cl +; SSSE3-NEXT: movl $0, %eax +; SSSE3-NEXT: jb .LBB1_56 +; SSSE3-NEXT: # %bb.55: +; SSSE3-NEXT: movl %ecx, %eax +; SSSE3-NEXT: .LBB1_56: +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm2[0],xmm8[1],xmm2[1],xmm8[2],xmm2[2],xmm8[3],xmm2[3],xmm8[4],xmm2[4],xmm8[5],xmm2[5],xmm8[6],xmm2[6],xmm8[7],xmm2[7] +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm11 = xmm11[0],xmm3[0],xmm11[1],xmm3[1],xmm11[2],xmm3[2],xmm11[3],xmm3[3],xmm11[4],xmm3[4],xmm11[5],xmm3[5],xmm11[6],xmm3[6],xmm11[7],xmm3[7] +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm9 = xmm9[0],xmm5[0],xmm9[1],xmm5[1],xmm9[2],xmm5[2],xmm9[3],xmm5[3],xmm9[4],xmm5[4],xmm9[5],xmm5[5],xmm9[6],xmm5[6],xmm9[7],xmm5[7] +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm7[0],xmm1[1],xmm7[1],xmm1[2],xmm7[2],xmm1[3],xmm7[3],xmm1[4],xmm7[4],xmm1[5],xmm7[5],xmm1[6],xmm7[6],xmm1[7],xmm7[7] +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm10 = xmm10[0],xmm12[0],xmm10[1],xmm12[1],xmm10[2],xmm12[2],xmm10[3],xmm12[3],xmm10[4],xmm12[4],xmm10[5],xmm12[5],xmm10[6],xmm12[6],xmm10[7],xmm12[7] +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm13[0],xmm4[1],xmm13[1],xmm4[2],xmm13[2],xmm4[3],xmm13[3],xmm4[4],xmm13[4],xmm4[5],xmm13[5],xmm4[6],xmm13[6],xmm4[7],xmm13[7] +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm14[0],xmm6[1],xmm14[1],xmm6[2],xmm14[2],xmm6[3],xmm14[3],xmm6[4],xmm14[4],xmm6[5],xmm14[5],xmm6[6],xmm14[6],xmm6[7],xmm14[7] +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm15[0],xmm0[1],xmm15[1],xmm0[2],xmm15[2],xmm0[3],xmm15[3],xmm0[4],xmm15[4],xmm0[5],xmm15[5],xmm0[6],xmm15[6],xmm0[7],xmm15[7] +; SSSE3-NEXT: movd %r8d, %xmm7 +; SSSE3-NEXT: movd %edx, %xmm12 +; SSSE3-NEXT: movd %esi, %xmm13 +; SSSE3-NEXT: movd %ebx, %xmm5 +; SSSE3-NEXT: movd %ebp, %xmm14 +; SSSE3-NEXT: movd %edi, %xmm2 +; SSSE3-NEXT: movd %r10d, %xmm15 +; SSSE3-NEXT: movd %r9d, %xmm3 +; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload +; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %edx # 1-byte Folded Reload +; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 1-byte Folded Reload +; SSSE3-NEXT: movzbl %al, %eax +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %bl +; SSSE3-NEXT: subb -{{[0-9]+}}(%rsp), %bl +; SSSE3-NEXT: movl $0, %edi +; SSSE3-NEXT: jb .LBB1_58 +; SSSE3-NEXT: # %bb.57: +; SSSE3-NEXT: movl %ebx, %edi +; SSSE3-NEXT: .LBB1_58: +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm11 = xmm11[0],xmm8[0],xmm11[1],xmm8[1],xmm11[2],xmm8[2],xmm11[3],xmm8[3] +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm9[0],xmm1[1],xmm9[1],xmm1[2],xmm9[2],xmm1[3],xmm9[3] +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm10[0],xmm4[1],xmm10[1],xmm4[2],xmm10[2],xmm4[3],xmm10[3] +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1],xmm0[2],xmm6[2],xmm0[3],xmm6[3] +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm12 = xmm12[0],xmm7[0],xmm12[1],xmm7[1],xmm12[2],xmm7[2],xmm12[3],xmm7[3],xmm12[4],xmm7[4],xmm12[5],xmm7[5],xmm12[6],xmm7[6],xmm12[7],xmm7[7] +; SSSE3-NEXT: punpcklbw 
{{.*#+}} xmm5 = xmm5[0],xmm13[0],xmm5[1],xmm13[1],xmm5[2],xmm13[2],xmm5[3],xmm13[3],xmm5[4],xmm13[4],xmm5[5],xmm13[5],xmm5[6],xmm13[6],xmm5[7],xmm13[7] +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm14[0],xmm2[1],xmm14[1],xmm2[2],xmm14[2],xmm2[3],xmm14[3],xmm2[4],xmm14[4],xmm2[5],xmm14[5],xmm2[6],xmm14[6],xmm2[7],xmm14[7] +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm15[0],xmm3[1],xmm15[1],xmm3[2],xmm15[2],xmm3[3],xmm15[3],xmm3[4],xmm15[4],xmm3[5],xmm15[5],xmm3[6],xmm15[6],xmm3[7],xmm15[7] +; SSSE3-NEXT: movd %ecx, %xmm8 +; SSSE3-NEXT: movd %edx, %xmm7 +; SSSE3-NEXT: movd %esi, %xmm9 +; SSSE3-NEXT: movd %eax, %xmm6 +; SSSE3-NEXT: movzbl %dil, %eax +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %dl +; SSSE3-NEXT: subb -{{[0-9]+}}(%rsp), %dl +; SSSE3-NEXT: movl $0, %ecx +; SSSE3-NEXT: jb .LBB1_60 +; SSSE3-NEXT: # %bb.59: +; SSSE3-NEXT: movl %edx, %ecx +; SSSE3-NEXT: .LBB1_60: +; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm11[0],xmm1[1],xmm11[1] +; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm12[0],xmm5[1],xmm12[1],xmm5[2],xmm12[2],xmm5[3],xmm12[3] +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3],xmm7[4],xmm8[4],xmm7[5],xmm8[5],xmm7[6],xmm8[6],xmm7[7],xmm8[7] +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm9[0],xmm6[1],xmm9[1],xmm6[2],xmm9[2],xmm6[3],xmm9[3],xmm6[4],xmm9[4],xmm6[5],xmm9[5],xmm6[6],xmm9[6],xmm6[7],xmm9[7] +; SSSE3-NEXT: movd %eax, %xmm4 +; SSSE3-NEXT: movzbl %cl, %eax +; SSSE3-NEXT: movd %eax, %xmm2 +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %dl +; SSSE3-NEXT: subb -{{[0-9]+}}(%rsp), %dl +; SSSE3-NEXT: movl $0, %ecx +; SSSE3-NEXT: jb .LBB1_62 +; SSSE3-NEXT: # %bb.61: +; SSSE3-NEXT: movl %edx, %ecx +; SSSE3-NEXT: .LBB1_62: +; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSSE3-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1] +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3] +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3],xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7] +; SSSE3-NEXT: movzbl %cl, %ecx +; SSSE3-NEXT: movd %ecx, %xmm4 +; SSSE3-NEXT: subb -{{[0-9]+}}(%rsp), %al +; SSSE3-NEXT: movl $0, %ecx +; SSSE3-NEXT: jb .LBB1_64 +; SSSE3-NEXT: # %bb.63: +; SSSE3-NEXT: movl %eax, %ecx +; SSSE3-NEXT: .LBB1_64: +; SSSE3-NEXT: movzbl %cl, %eax +; SSSE3-NEXT: movd %eax, %xmm1 +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3],xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7] +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1] +; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0] +; SSSE3-NEXT: addq $8, %rsp +; SSSE3-NEXT: popq %rbx +; SSSE3-NEXT: popq %r12 +; SSSE3-NEXT: popq %r13 +; SSSE3-NEXT: popq %r14 +; SSSE3-NEXT: popq %r15 +; SSSE3-NEXT: popq %rbp +; SSSE3-NEXT: retq +; +; SSE41-LABEL: v32i8: +; SSE41: # %bb.0: +; SSE41-NEXT: pextrb $1, %xmm2, %ecx +; SSE41-NEXT: pextrb $1, %xmm0, %eax +; SSE41-NEXT: subb %cl, %al +; SSE41-NEXT: movl $0, %ecx +; SSE41-NEXT: jb .LBB1_2 +; SSE41-NEXT: # %bb.1: +; SSE41-NEXT: movl %eax, %ecx +; SSE41-NEXT: .LBB1_2: 
+; SSE41-NEXT: pextrb $0, %xmm2, %edx +; SSE41-NEXT: pextrb $0, %xmm0, %eax +; SSE41-NEXT: subb %dl, %al +; SSE41-NEXT: movl $0, %esi +; SSE41-NEXT: jb .LBB1_4 +; SSE41-NEXT: # %bb.3: +; SSE41-NEXT: movl %eax, %esi +; SSE41-NEXT: .LBB1_4: +; SSE41-NEXT: pextrb $2, %xmm2, %edx +; SSE41-NEXT: pextrb $2, %xmm0, %eax +; SSE41-NEXT: subb %dl, %al +; SSE41-NEXT: movl $0, %edx +; SSE41-NEXT: jb .LBB1_6 +; SSE41-NEXT: # %bb.5: +; SSE41-NEXT: movl %eax, %edx +; SSE41-NEXT: .LBB1_6: +; SSE41-NEXT: pushq %rbp +; SSE41-NEXT: pushq %r15 +; SSE41-NEXT: pushq %r14 +; SSE41-NEXT: pushq %r13 +; SSE41-NEXT: pushq %r12 +; SSE41-NEXT: pushq %rbx +; SSE41-NEXT: movl %edx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE41-NEXT: pextrb $3, %xmm2, %edx +; SSE41-NEXT: pextrb $3, %xmm0, %eax +; SSE41-NEXT: subb %dl, %al +; SSE41-NEXT: movl $0, %edx +; SSE41-NEXT: jb .LBB1_8 +; SSE41-NEXT: # %bb.7: +; SSE41-NEXT: movl %eax, %edx +; SSE41-NEXT: .LBB1_8: +; SSE41-NEXT: pextrb $4, %xmm2, %eax +; SSE41-NEXT: pextrb $4, %xmm0, %edi +; SSE41-NEXT: subb %al, %dil +; SSE41-NEXT: movl $0, %eax +; SSE41-NEXT: jb .LBB1_10 +; SSE41-NEXT: # %bb.9: +; SSE41-NEXT: movl %edi, %eax +; SSE41-NEXT: .LBB1_10: +; SSE41-NEXT: pextrb $5, %xmm2, %ebp +; SSE41-NEXT: pextrb $5, %xmm0, %edi +; SSE41-NEXT: subb %bpl, %dil +; SSE41-NEXT: movl $0, %ebx +; SSE41-NEXT: jb .LBB1_12 +; SSE41-NEXT: # %bb.11: +; SSE41-NEXT: movl %edi, %ebx +; SSE41-NEXT: .LBB1_12: +; SSE41-NEXT: pextrb $6, %xmm2, %ebp +; SSE41-NEXT: pextrb $6, %xmm0, %edi +; SSE41-NEXT: subb %bpl, %dil +; SSE41-NEXT: movl $0, %ebp +; SSE41-NEXT: jb .LBB1_14 +; SSE41-NEXT: # %bb.13: +; SSE41-NEXT: movl %edi, %ebp +; SSE41-NEXT: .LBB1_14: +; SSE41-NEXT: movl %ebx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE41-NEXT: movl %ebp, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE41-NEXT: pextrb $7, %xmm2, %ebx +; SSE41-NEXT: pextrb $7, %xmm0, %edi +; SSE41-NEXT: subb %bl, %dil +; SSE41-NEXT: movl $0, %ebp +; SSE41-NEXT: jb .LBB1_16 +; SSE41-NEXT: # %bb.15: +; SSE41-NEXT: movl %edi, %ebp +; SSE41-NEXT: .LBB1_16: +; SSE41-NEXT: movl %ebp, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE41-NEXT: pextrb $8, %xmm2, %ebx +; SSE41-NEXT: pextrb $8, %xmm0, %edi +; SSE41-NEXT: subb %bl, %dil +; SSE41-NEXT: movl $0, %r8d +; SSE41-NEXT: jb .LBB1_18 +; SSE41-NEXT: # %bb.17: +; SSE41-NEXT: movl %edi, %r8d +; SSE41-NEXT: .LBB1_18: +; SSE41-NEXT: pextrb $9, %xmm2, %ebx +; SSE41-NEXT: pextrb $9, %xmm0, %edi +; SSE41-NEXT: subb %bl, %dil +; SSE41-NEXT: movl $0, %r14d +; SSE41-NEXT: jb .LBB1_20 +; SSE41-NEXT: # %bb.19: +; SSE41-NEXT: movl %edi, %r14d +; SSE41-NEXT: .LBB1_20: +; SSE41-NEXT: pextrb $10, %xmm2, %ebx +; SSE41-NEXT: pextrb $10, %xmm0, %edi +; SSE41-NEXT: subb %bl, %dil +; SSE41-NEXT: movl $0, %ebp +; SSE41-NEXT: jb .LBB1_22 +; SSE41-NEXT: # %bb.21: +; SSE41-NEXT: movl %edi, %ebp +; SSE41-NEXT: .LBB1_22: +; SSE41-NEXT: pextrb $11, %xmm2, %ebx +; SSE41-NEXT: pextrb $11, %xmm0, %edi +; SSE41-NEXT: subb %bl, %dil +; SSE41-NEXT: movl $0, %r11d +; SSE41-NEXT: jb .LBB1_24 +; SSE41-NEXT: # %bb.23: +; SSE41-NEXT: movl %edi, %r11d +; SSE41-NEXT: .LBB1_24: +; SSE41-NEXT: pextrb $12, %xmm2, %ebx +; SSE41-NEXT: pextrb $12, %xmm0, %edi +; SSE41-NEXT: subb %bl, %dil +; SSE41-NEXT: movl $0, %r9d +; SSE41-NEXT: jb .LBB1_26 +; SSE41-NEXT: # %bb.25: +; SSE41-NEXT: movl %edi, %r9d +; SSE41-NEXT: .LBB1_26: +; SSE41-NEXT: pextrb $13, %xmm2, %ebx +; SSE41-NEXT: pextrb $13, %xmm0, %edi +; SSE41-NEXT: subb %bl, %dil +; SSE41-NEXT: movl $0, %r13d +; SSE41-NEXT: jb .LBB1_28 +; SSE41-NEXT: # %bb.27: +; SSE41-NEXT: movl %edi, 
%r13d +; SSE41-NEXT: .LBB1_28: +; SSE41-NEXT: pextrb $14, %xmm2, %ebx +; SSE41-NEXT: pextrb $14, %xmm0, %edi +; SSE41-NEXT: subb %bl, %dil +; SSE41-NEXT: movl $0, %r12d +; SSE41-NEXT: jb .LBB1_30 +; SSE41-NEXT: # %bb.29: +; SSE41-NEXT: movl %edi, %r12d +; SSE41-NEXT: .LBB1_30: +; SSE41-NEXT: movzbl %sil, %esi +; SSE41-NEXT: pextrb $15, %xmm2, %ebx +; SSE41-NEXT: pextrb $15, %xmm0, %edi +; SSE41-NEXT: subb %bl, %dil +; SSE41-NEXT: movl $0, %r15d +; SSE41-NEXT: jb .LBB1_32 +; SSE41-NEXT: # %bb.31: +; SSE41-NEXT: movl %edi, %r15d +; SSE41-NEXT: .LBB1_32: +; SSE41-NEXT: movzbl %cl, %ecx +; SSE41-NEXT: movd %esi, %xmm0 +; SSE41-NEXT: pextrb $1, %xmm3, %edi +; SSE41-NEXT: pextrb $1, %xmm1, %esi +; SSE41-NEXT: subb %dil, %sil +; SSE41-NEXT: movl $0, %r10d +; SSE41-NEXT: jb .LBB1_34 +; SSE41-NEXT: # %bb.33: +; SSE41-NEXT: movl %esi, %r10d +; SSE41-NEXT: .LBB1_34: +; SSE41-NEXT: pinsrb $1, %ecx, %xmm0 +; SSE41-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 1-byte Folded Reload +; SSE41-NEXT: movzbl %r10b, %ecx +; SSE41-NEXT: pextrb $0, %xmm3, %ebx +; SSE41-NEXT: pextrb $0, %xmm1, %edi +; SSE41-NEXT: subb %bl, %dil +; SSE41-NEXT: movl $0, %ebx +; SSE41-NEXT: jb .LBB1_36 +; SSE41-NEXT: # %bb.35: +; SSE41-NEXT: movl %edi, %ebx +; SSE41-NEXT: .LBB1_36: +; SSE41-NEXT: pinsrb $2, %esi, %xmm0 +; SSE41-NEXT: movzbl %dl, %edx +; SSE41-NEXT: movzbl %bl, %esi +; SSE41-NEXT: movd %esi, %xmm2 +; SSE41-NEXT: pinsrb $1, %ecx, %xmm2 +; SSE41-NEXT: pextrb $2, %xmm3, %ecx +; SSE41-NEXT: pextrb $2, %xmm1, %esi +; SSE41-NEXT: subb %cl, %sil +; SSE41-NEXT: movl $0, %ecx +; SSE41-NEXT: jb .LBB1_38 +; SSE41-NEXT: # %bb.37: +; SSE41-NEXT: movl %esi, %ecx +; SSE41-NEXT: .LBB1_38: +; SSE41-NEXT: pinsrb $3, %edx, %xmm0 +; SSE41-NEXT: movzbl %al, %eax +; SSE41-NEXT: movzbl %cl, %ecx +; SSE41-NEXT: pinsrb $2, %ecx, %xmm2 +; SSE41-NEXT: pextrb $3, %xmm3, %ecx +; SSE41-NEXT: pextrb $3, %xmm1, %edx +; SSE41-NEXT: subb %cl, %dl +; SSE41-NEXT: movl $0, %ecx +; SSE41-NEXT: jb .LBB1_40 +; SSE41-NEXT: # %bb.39: +; SSE41-NEXT: movl %edx, %ecx +; SSE41-NEXT: .LBB1_40: +; SSE41-NEXT: pinsrb $4, %eax, %xmm0 +; SSE41-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; SSE41-NEXT: movzbl %cl, %ecx +; SSE41-NEXT: pinsrb $3, %ecx, %xmm2 +; SSE41-NEXT: pextrb $4, %xmm3, %ecx +; SSE41-NEXT: pextrb $4, %xmm1, %edx +; SSE41-NEXT: subb %cl, %dl +; SSE41-NEXT: movl $0, %ecx +; SSE41-NEXT: jb .LBB1_42 +; SSE41-NEXT: # %bb.41: +; SSE41-NEXT: movl %edx, %ecx +; SSE41-NEXT: .LBB1_42: +; SSE41-NEXT: pinsrb $5, %eax, %xmm0 +; SSE41-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; SSE41-NEXT: movzbl %cl, %ecx +; SSE41-NEXT: pinsrb $4, %ecx, %xmm2 +; SSE41-NEXT: pextrb $5, %xmm3, %ecx +; SSE41-NEXT: pextrb $5, %xmm1, %edx +; SSE41-NEXT: subb %cl, %dl +; SSE41-NEXT: movl $0, %ecx +; SSE41-NEXT: jb .LBB1_44 +; SSE41-NEXT: # %bb.43: +; SSE41-NEXT: movl %edx, %ecx +; SSE41-NEXT: .LBB1_44: +; SSE41-NEXT: pinsrb $6, %eax, %xmm0 +; SSE41-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; SSE41-NEXT: movzbl %cl, %ecx +; SSE41-NEXT: pinsrb $5, %ecx, %xmm2 +; SSE41-NEXT: pextrb $6, %xmm3, %ecx +; SSE41-NEXT: pextrb $6, %xmm1, %edx +; SSE41-NEXT: subb %cl, %dl +; SSE41-NEXT: movl $0, %ecx +; SSE41-NEXT: jb .LBB1_46 +; SSE41-NEXT: # %bb.45: +; SSE41-NEXT: movl %edx, %ecx +; SSE41-NEXT: .LBB1_46: +; SSE41-NEXT: pinsrb $7, %eax, %xmm0 +; SSE41-NEXT: movzbl %r8b, %eax +; SSE41-NEXT: movzbl %cl, %ecx +; SSE41-NEXT: pinsrb $6, %ecx, %xmm2 +; SSE41-NEXT: pextrb $7, %xmm3, %ecx +; SSE41-NEXT: pextrb $7, %xmm1, %edx +; 
SSE41-NEXT: subb %cl, %dl +; SSE41-NEXT: movl $0, %ecx +; SSE41-NEXT: jb .LBB1_48 +; SSE41-NEXT: # %bb.47: +; SSE41-NEXT: movl %edx, %ecx +; SSE41-NEXT: .LBB1_48: +; SSE41-NEXT: pinsrb $8, %eax, %xmm0 +; SSE41-NEXT: movzbl %r14b, %eax +; SSE41-NEXT: movzbl %cl, %ecx +; SSE41-NEXT: pinsrb $7, %ecx, %xmm2 +; SSE41-NEXT: pextrb $8, %xmm3, %ecx +; SSE41-NEXT: pextrb $8, %xmm1, %edx +; SSE41-NEXT: subb %cl, %dl +; SSE41-NEXT: movl $0, %ecx +; SSE41-NEXT: jb .LBB1_50 +; SSE41-NEXT: # %bb.49: +; SSE41-NEXT: movl %edx, %ecx +; SSE41-NEXT: .LBB1_50: +; SSE41-NEXT: pinsrb $9, %eax, %xmm0 +; SSE41-NEXT: movzbl %bpl, %eax +; SSE41-NEXT: movzbl %cl, %ecx +; SSE41-NEXT: pinsrb $8, %ecx, %xmm2 +; SSE41-NEXT: pextrb $9, %xmm3, %ecx +; SSE41-NEXT: pextrb $9, %xmm1, %edx +; SSE41-NEXT: subb %cl, %dl +; SSE41-NEXT: movl $0, %ecx +; SSE41-NEXT: jb .LBB1_52 +; SSE41-NEXT: # %bb.51: +; SSE41-NEXT: movl %edx, %ecx +; SSE41-NEXT: .LBB1_52: +; SSE41-NEXT: pinsrb $10, %eax, %xmm0 +; SSE41-NEXT: movzbl %r11b, %eax +; SSE41-NEXT: movzbl %cl, %ecx +; SSE41-NEXT: pinsrb $9, %ecx, %xmm2 +; SSE41-NEXT: pextrb $10, %xmm3, %ecx +; SSE41-NEXT: pextrb $10, %xmm1, %edx +; SSE41-NEXT: subb %cl, %dl +; SSE41-NEXT: movl $0, %ecx +; SSE41-NEXT: jb .LBB1_54 +; SSE41-NEXT: # %bb.53: +; SSE41-NEXT: movl %edx, %ecx +; SSE41-NEXT: .LBB1_54: +; SSE41-NEXT: pinsrb $11, %eax, %xmm0 +; SSE41-NEXT: movzbl %r9b, %eax +; SSE41-NEXT: movzbl %cl, %ecx +; SSE41-NEXT: pinsrb $10, %ecx, %xmm2 +; SSE41-NEXT: pextrb $11, %xmm3, %ecx +; SSE41-NEXT: pextrb $11, %xmm1, %edx +; SSE41-NEXT: subb %cl, %dl +; SSE41-NEXT: movl $0, %ecx +; SSE41-NEXT: jb .LBB1_56 +; SSE41-NEXT: # %bb.55: +; SSE41-NEXT: movl %edx, %ecx +; SSE41-NEXT: .LBB1_56: +; SSE41-NEXT: pinsrb $12, %eax, %xmm0 +; SSE41-NEXT: movzbl %r13b, %eax +; SSE41-NEXT: movzbl %cl, %ecx +; SSE41-NEXT: pinsrb $11, %ecx, %xmm2 +; SSE41-NEXT: pextrb $12, %xmm3, %ecx +; SSE41-NEXT: pextrb $12, %xmm1, %edx +; SSE41-NEXT: subb %cl, %dl +; SSE41-NEXT: movl $0, %ecx +; SSE41-NEXT: jb .LBB1_58 +; SSE41-NEXT: # %bb.57: +; SSE41-NEXT: movl %edx, %ecx +; SSE41-NEXT: .LBB1_58: +; SSE41-NEXT: pinsrb $13, %eax, %xmm0 +; SSE41-NEXT: movzbl %r12b, %eax +; SSE41-NEXT: movzbl %cl, %ecx +; SSE41-NEXT: pinsrb $12, %ecx, %xmm2 +; SSE41-NEXT: pextrb $13, %xmm3, %ecx +; SSE41-NEXT: pextrb $13, %xmm1, %edx +; SSE41-NEXT: subb %cl, %dl +; SSE41-NEXT: movl $0, %ecx +; SSE41-NEXT: jb .LBB1_60 +; SSE41-NEXT: # %bb.59: +; SSE41-NEXT: movl %edx, %ecx +; SSE41-NEXT: .LBB1_60: +; SSE41-NEXT: pinsrb $14, %eax, %xmm0 +; SSE41-NEXT: movzbl %r15b, %eax +; SSE41-NEXT: movzbl %cl, %ecx +; SSE41-NEXT: pinsrb $13, %ecx, %xmm2 +; SSE41-NEXT: pextrb $14, %xmm3, %edx +; SSE41-NEXT: pextrb $14, %xmm1, %ecx +; SSE41-NEXT: subb %dl, %cl +; SSE41-NEXT: movl $0, %edx +; SSE41-NEXT: popq %rbx +; SSE41-NEXT: popq %r12 +; SSE41-NEXT: popq %r13 +; SSE41-NEXT: popq %r14 +; SSE41-NEXT: popq %r15 +; SSE41-NEXT: popq %rbp +; SSE41-NEXT: jb .LBB1_62 +; SSE41-NEXT: # %bb.61: +; SSE41-NEXT: movl %ecx, %edx +; SSE41-NEXT: .LBB1_62: +; SSE41-NEXT: pinsrb $15, %eax, %xmm0 +; SSE41-NEXT: movzbl %dl, %eax +; SSE41-NEXT: pinsrb $14, %eax, %xmm2 +; SSE41-NEXT: pextrb $15, %xmm3, %ecx +; SSE41-NEXT: pextrb $15, %xmm1, %eax +; SSE41-NEXT: subb %cl, %al +; SSE41-NEXT: movl $0, %ecx +; SSE41-NEXT: jb .LBB1_64 +; SSE41-NEXT: # %bb.63: +; SSE41-NEXT: movl %eax, %ecx +; SSE41-NEXT: .LBB1_64: +; SSE41-NEXT: movzbl %cl, %eax +; SSE41-NEXT: pinsrb $15, %eax, %xmm2 +; SSE41-NEXT: movdqa %xmm2, %xmm1 +; SSE41-NEXT: retq +; +; AVX1-LABEL: v32i8: +; AVX1: # %bb.0: +; AVX1-NEXT: 
vextractf128 $1, %ymm1, %xmm2 +; AVX1-NEXT: vpextrb $1, %xmm2, %ecx +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 +; AVX1-NEXT: vpextrb $1, %xmm3, %eax +; AVX1-NEXT: subb %cl, %al +; AVX1-NEXT: movl $0, %ecx +; AVX1-NEXT: jb .LBB1_2 +; AVX1-NEXT: # %bb.1: +; AVX1-NEXT: movl %eax, %ecx +; AVX1-NEXT: .LBB1_2: +; AVX1-NEXT: vpextrb $0, %xmm2, %edx +; AVX1-NEXT: vpextrb $0, %xmm3, %eax +; AVX1-NEXT: subb %dl, %al +; AVX1-NEXT: movl $0, %esi +; AVX1-NEXT: jb .LBB1_4 +; AVX1-NEXT: # %bb.3: +; AVX1-NEXT: movl %eax, %esi +; AVX1-NEXT: .LBB1_4: +; AVX1-NEXT: vpextrb $2, %xmm2, %edx +; AVX1-NEXT: vpextrb $2, %xmm3, %eax +; AVX1-NEXT: subb %dl, %al +; AVX1-NEXT: movl $0, %r8d +; AVX1-NEXT: jb .LBB1_6 +; AVX1-NEXT: # %bb.5: +; AVX1-NEXT: movl %eax, %r8d +; AVX1-NEXT: .LBB1_6: +; AVX1-NEXT: vpextrb $3, %xmm2, %edx +; AVX1-NEXT: vpextrb $3, %xmm3, %eax +; AVX1-NEXT: subb %dl, %al +; AVX1-NEXT: movl $0, %edx +; AVX1-NEXT: jb .LBB1_8 +; AVX1-NEXT: # %bb.7: +; AVX1-NEXT: movl %eax, %edx +; AVX1-NEXT: .LBB1_8: +; AVX1-NEXT: vpextrb $4, %xmm2, %eax +; AVX1-NEXT: vpextrb $4, %xmm3, %edi +; AVX1-NEXT: subb %al, %dil +; AVX1-NEXT: movl $0, %eax +; AVX1-NEXT: jb .LBB1_10 +; AVX1-NEXT: # %bb.9: +; AVX1-NEXT: movl %edi, %eax +; AVX1-NEXT: .LBB1_10: +; AVX1-NEXT: pushq %rbp +; AVX1-NEXT: pushq %r15 +; AVX1-NEXT: pushq %r14 +; AVX1-NEXT: pushq %r13 +; AVX1-NEXT: pushq %r12 +; AVX1-NEXT: pushq %rbx +; AVX1-NEXT: vpextrb $5, %xmm2, %ebp +; AVX1-NEXT: vpextrb $5, %xmm3, %edi +; AVX1-NEXT: subb %bpl, %dil +; AVX1-NEXT: movl $0, %r10d +; AVX1-NEXT: jb .LBB1_12 +; AVX1-NEXT: # %bb.11: +; AVX1-NEXT: movl %edi, %r10d +; AVX1-NEXT: .LBB1_12: +; AVX1-NEXT: vpextrb $6, %xmm2, %ebp +; AVX1-NEXT: vpextrb $6, %xmm3, %edi +; AVX1-NEXT: subb %bpl, %dil +; AVX1-NEXT: movl $0, %ebp +; AVX1-NEXT: jb .LBB1_14 +; AVX1-NEXT: # %bb.13: +; AVX1-NEXT: movl %edi, %ebp +; AVX1-NEXT: .LBB1_14: +; AVX1-NEXT: vpextrb $7, %xmm2, %ebx +; AVX1-NEXT: vpextrb $7, %xmm3, %edi +; AVX1-NEXT: subb %bl, %dil +; AVX1-NEXT: movl $0, %r11d +; AVX1-NEXT: jb .LBB1_16 +; AVX1-NEXT: # %bb.15: +; AVX1-NEXT: movl %edi, %r11d +; AVX1-NEXT: .LBB1_16: +; AVX1-NEXT: vpextrb $8, %xmm2, %ebx +; AVX1-NEXT: vpextrb $8, %xmm3, %edi +; AVX1-NEXT: subb %bl, %dil +; AVX1-NEXT: movl $0, %r9d +; AVX1-NEXT: jb .LBB1_18 +; AVX1-NEXT: # %bb.17: +; AVX1-NEXT: movl %edi, %r9d +; AVX1-NEXT: .LBB1_18: +; AVX1-NEXT: vpextrb $9, %xmm2, %ebx +; AVX1-NEXT: vpextrb $9, %xmm3, %edi +; AVX1-NEXT: subb %bl, %dil +; AVX1-NEXT: movl $0, %r13d +; AVX1-NEXT: jb .LBB1_20 +; AVX1-NEXT: # %bb.19: +; AVX1-NEXT: movl %edi, %r13d +; AVX1-NEXT: .LBB1_20: +; AVX1-NEXT: vpextrb $10, %xmm2, %ebx +; AVX1-NEXT: vpextrb $10, %xmm3, %edi +; AVX1-NEXT: subb %bl, %dil +; AVX1-NEXT: movl $0, %ebx +; AVX1-NEXT: jb .LBB1_22 +; AVX1-NEXT: # %bb.21: +; AVX1-NEXT: movl %edi, %ebx +; AVX1-NEXT: .LBB1_22: +; AVX1-NEXT: movl %ebx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX1-NEXT: vpextrb $11, %xmm2, %ebx +; AVX1-NEXT: vpextrb $11, %xmm3, %edi +; AVX1-NEXT: subb %bl, %dil +; AVX1-NEXT: movl $0, %ebx +; AVX1-NEXT: jb .LBB1_24 +; AVX1-NEXT: # %bb.23: +; AVX1-NEXT: movl %edi, %ebx +; AVX1-NEXT: .LBB1_24: +; AVX1-NEXT: movl %ebx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX1-NEXT: vpextrb $12, %xmm2, %ebx +; AVX1-NEXT: vpextrb $12, %xmm3, %edi +; AVX1-NEXT: subb %bl, %dil +; AVX1-NEXT: movl $0, %ebx +; AVX1-NEXT: jb .LBB1_26 +; AVX1-NEXT: # %bb.25: +; AVX1-NEXT: movl %edi, %ebx +; AVX1-NEXT: .LBB1_26: +; AVX1-NEXT: movl %ebx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX1-NEXT: vpextrb $13, %xmm2, %ebx +; AVX1-NEXT: 
vpextrb $13, %xmm3, %edi +; AVX1-NEXT: subb %bl, %dil +; AVX1-NEXT: movl $0, %r12d +; AVX1-NEXT: jb .LBB1_28 +; AVX1-NEXT: # %bb.27: +; AVX1-NEXT: movl %edi, %r12d +; AVX1-NEXT: .LBB1_28: +; AVX1-NEXT: vpextrb $14, %xmm2, %ebx +; AVX1-NEXT: vpextrb $14, %xmm3, %edi +; AVX1-NEXT: subb %bl, %dil +; AVX1-NEXT: movl $0, %r15d +; AVX1-NEXT: jb .LBB1_30 +; AVX1-NEXT: # %bb.29: +; AVX1-NEXT: movl %edi, %r15d +; AVX1-NEXT: .LBB1_30: +; AVX1-NEXT: movzbl %sil, %esi +; AVX1-NEXT: vpextrb $15, %xmm2, %ebx +; AVX1-NEXT: vpextrb $15, %xmm3, %edi +; AVX1-NEXT: subb %bl, %dil +; AVX1-NEXT: movl $0, %r14d +; AVX1-NEXT: jb .LBB1_32 +; AVX1-NEXT: # %bb.31: +; AVX1-NEXT: movl %edi, %r14d +; AVX1-NEXT: .LBB1_32: +; AVX1-NEXT: movzbl %cl, %ecx +; AVX1-NEXT: vmovd %esi, %xmm2 +; AVX1-NEXT: vpextrb $1, %xmm1, %edi +; AVX1-NEXT: vpextrb $1, %xmm0, %esi +; AVX1-NEXT: subb %dil, %sil +; AVX1-NEXT: movl $0, %edi +; AVX1-NEXT: jb .LBB1_34 +; AVX1-NEXT: # %bb.33: +; AVX1-NEXT: movl %esi, %edi +; AVX1-NEXT: .LBB1_34: +; AVX1-NEXT: vpinsrb $1, %ecx, %xmm2, %xmm2 +; AVX1-NEXT: movzbl %r8b, %esi +; AVX1-NEXT: movzbl %dil, %ecx +; AVX1-NEXT: vpextrb $0, %xmm1, %ebx +; AVX1-NEXT: vpextrb $0, %xmm0, %edi +; AVX1-NEXT: subb %bl, %dil +; AVX1-NEXT: movl $0, %ebx +; AVX1-NEXT: jb .LBB1_36 +; AVX1-NEXT: # %bb.35: +; AVX1-NEXT: movl %edi, %ebx +; AVX1-NEXT: .LBB1_36: +; AVX1-NEXT: vpinsrb $2, %esi, %xmm2, %xmm2 +; AVX1-NEXT: movzbl %dl, %edx +; AVX1-NEXT: movzbl %bl, %esi +; AVX1-NEXT: vmovd %esi, %xmm3 +; AVX1-NEXT: vpinsrb $1, %ecx, %xmm3, %xmm3 +; AVX1-NEXT: vpextrb $2, %xmm1, %esi +; AVX1-NEXT: vpextrb $2, %xmm0, %ecx +; AVX1-NEXT: subb %sil, %cl +; AVX1-NEXT: movl $0, %esi +; AVX1-NEXT: jb .LBB1_38 +; AVX1-NEXT: # %bb.37: +; AVX1-NEXT: movl %ecx, %esi +; AVX1-NEXT: .LBB1_38: +; AVX1-NEXT: vpinsrb $3, %edx, %xmm2, %xmm2 +; AVX1-NEXT: movzbl %al, %eax +; AVX1-NEXT: movzbl %sil, %ecx +; AVX1-NEXT: vpinsrb $2, %ecx, %xmm3, %xmm3 +; AVX1-NEXT: vpextrb $3, %xmm1, %edx +; AVX1-NEXT: vpextrb $3, %xmm0, %ecx +; AVX1-NEXT: subb %dl, %cl +; AVX1-NEXT: movl $0, %edx +; AVX1-NEXT: jb .LBB1_40 +; AVX1-NEXT: # %bb.39: +; AVX1-NEXT: movl %ecx, %edx +; AVX1-NEXT: .LBB1_40: +; AVX1-NEXT: vpinsrb $4, %eax, %xmm2, %xmm2 +; AVX1-NEXT: movzbl %r10b, %eax +; AVX1-NEXT: movzbl %dl, %ecx +; AVX1-NEXT: vpinsrb $3, %ecx, %xmm3, %xmm3 +; AVX1-NEXT: vpextrb $4, %xmm1, %edx +; AVX1-NEXT: vpextrb $4, %xmm0, %ecx +; AVX1-NEXT: subb %dl, %cl +; AVX1-NEXT: movl $0, %edx +; AVX1-NEXT: jb .LBB1_42 +; AVX1-NEXT: # %bb.41: +; AVX1-NEXT: movl %ecx, %edx +; AVX1-NEXT: .LBB1_42: +; AVX1-NEXT: vpinsrb $5, %eax, %xmm2, %xmm2 +; AVX1-NEXT: movzbl %bpl, %eax +; AVX1-NEXT: movzbl %dl, %ecx +; AVX1-NEXT: vpinsrb $4, %ecx, %xmm3, %xmm3 +; AVX1-NEXT: vpextrb $5, %xmm1, %edx +; AVX1-NEXT: vpextrb $5, %xmm0, %ecx +; AVX1-NEXT: subb %dl, %cl +; AVX1-NEXT: movl $0, %edx +; AVX1-NEXT: jb .LBB1_44 +; AVX1-NEXT: # %bb.43: +; AVX1-NEXT: movl %ecx, %edx +; AVX1-NEXT: .LBB1_44: +; AVX1-NEXT: vpinsrb $6, %eax, %xmm2, %xmm2 +; AVX1-NEXT: movzbl %r11b, %eax +; AVX1-NEXT: movzbl %dl, %ecx +; AVX1-NEXT: vpinsrb $5, %ecx, %xmm3, %xmm3 +; AVX1-NEXT: vpextrb $6, %xmm1, %edx +; AVX1-NEXT: vpextrb $6, %xmm0, %ecx +; AVX1-NEXT: subb %dl, %cl +; AVX1-NEXT: movl $0, %edx +; AVX1-NEXT: jb .LBB1_46 +; AVX1-NEXT: # %bb.45: +; AVX1-NEXT: movl %ecx, %edx +; AVX1-NEXT: .LBB1_46: +; AVX1-NEXT: vpinsrb $7, %eax, %xmm2, %xmm2 +; AVX1-NEXT: movzbl %r9b, %eax +; AVX1-NEXT: movzbl %dl, %ecx +; AVX1-NEXT: vpinsrb $6, %ecx, %xmm3, %xmm3 +; AVX1-NEXT: vpextrb $7, %xmm1, %edx +; AVX1-NEXT: vpextrb $7, %xmm0, 
%ecx +; AVX1-NEXT: subb %dl, %cl +; AVX1-NEXT: movl $0, %edx +; AVX1-NEXT: jb .LBB1_48 +; AVX1-NEXT: # %bb.47: +; AVX1-NEXT: movl %ecx, %edx +; AVX1-NEXT: .LBB1_48: +; AVX1-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2 +; AVX1-NEXT: movzbl %r13b, %eax +; AVX1-NEXT: movzbl %dl, %ecx +; AVX1-NEXT: vpinsrb $7, %ecx, %xmm3, %xmm3 +; AVX1-NEXT: vpextrb $8, %xmm1, %edx +; AVX1-NEXT: vpextrb $8, %xmm0, %ecx +; AVX1-NEXT: subb %dl, %cl +; AVX1-NEXT: movl $0, %edx +; AVX1-NEXT: jb .LBB1_50 +; AVX1-NEXT: # %bb.49: +; AVX1-NEXT: movl %ecx, %edx +; AVX1-NEXT: .LBB1_50: +; AVX1-NEXT: vpinsrb $9, %eax, %xmm2, %xmm2 +; AVX1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX1-NEXT: movzbl %dl, %ecx +; AVX1-NEXT: vpinsrb $8, %ecx, %xmm3, %xmm3 +; AVX1-NEXT: vpextrb $9, %xmm1, %edx +; AVX1-NEXT: vpextrb $9, %xmm0, %ecx +; AVX1-NEXT: subb %dl, %cl +; AVX1-NEXT: movl $0, %edx +; AVX1-NEXT: jb .LBB1_52 +; AVX1-NEXT: # %bb.51: +; AVX1-NEXT: movl %ecx, %edx +; AVX1-NEXT: .LBB1_52: +; AVX1-NEXT: vpinsrb $10, %eax, %xmm2, %xmm2 +; AVX1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX1-NEXT: movzbl %dl, %ecx +; AVX1-NEXT: vpinsrb $9, %ecx, %xmm3, %xmm3 +; AVX1-NEXT: vpextrb $10, %xmm1, %edx +; AVX1-NEXT: vpextrb $10, %xmm0, %ecx +; AVX1-NEXT: subb %dl, %cl +; AVX1-NEXT: movl $0, %edx +; AVX1-NEXT: jb .LBB1_54 +; AVX1-NEXT: # %bb.53: +; AVX1-NEXT: movl %ecx, %edx +; AVX1-NEXT: .LBB1_54: +; AVX1-NEXT: vpinsrb $11, %eax, %xmm2, %xmm2 +; AVX1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX1-NEXT: movzbl %dl, %ecx +; AVX1-NEXT: vpinsrb $10, %ecx, %xmm3, %xmm3 +; AVX1-NEXT: vpextrb $11, %xmm1, %edx +; AVX1-NEXT: vpextrb $11, %xmm0, %ecx +; AVX1-NEXT: subb %dl, %cl +; AVX1-NEXT: movl $0, %edx +; AVX1-NEXT: jb .LBB1_56 +; AVX1-NEXT: # %bb.55: +; AVX1-NEXT: movl %ecx, %edx +; AVX1-NEXT: .LBB1_56: +; AVX1-NEXT: vpinsrb $12, %eax, %xmm2, %xmm2 +; AVX1-NEXT: movzbl %r12b, %eax +; AVX1-NEXT: movzbl %dl, %ecx +; AVX1-NEXT: vpinsrb $11, %ecx, %xmm3, %xmm3 +; AVX1-NEXT: vpextrb $12, %xmm1, %edx +; AVX1-NEXT: vpextrb $12, %xmm0, %ecx +; AVX1-NEXT: subb %dl, %cl +; AVX1-NEXT: movl $0, %edx +; AVX1-NEXT: jb .LBB1_58 +; AVX1-NEXT: # %bb.57: +; AVX1-NEXT: movl %ecx, %edx +; AVX1-NEXT: .LBB1_58: +; AVX1-NEXT: vpinsrb $13, %eax, %xmm2, %xmm2 +; AVX1-NEXT: movzbl %r15b, %eax +; AVX1-NEXT: movzbl %dl, %ecx +; AVX1-NEXT: vpinsrb $12, %ecx, %xmm3, %xmm3 +; AVX1-NEXT: vpextrb $13, %xmm1, %edx +; AVX1-NEXT: vpextrb $13, %xmm0, %ecx +; AVX1-NEXT: subb %dl, %cl +; AVX1-NEXT: movl $0, %edx +; AVX1-NEXT: jb .LBB1_60 +; AVX1-NEXT: # %bb.59: +; AVX1-NEXT: movl %ecx, %edx +; AVX1-NEXT: .LBB1_60: +; AVX1-NEXT: vpinsrb $14, %eax, %xmm2, %xmm2 +; AVX1-NEXT: movzbl %r14b, %eax +; AVX1-NEXT: movzbl %dl, %ecx +; AVX1-NEXT: vpinsrb $13, %ecx, %xmm3, %xmm3 +; AVX1-NEXT: vpextrb $14, %xmm1, %edx +; AVX1-NEXT: vpextrb $14, %xmm0, %ecx +; AVX1-NEXT: subb %dl, %cl +; AVX1-NEXT: movl $0, %edx +; AVX1-NEXT: popq %rbx +; AVX1-NEXT: popq %r12 +; AVX1-NEXT: popq %r13 +; AVX1-NEXT: popq %r14 +; AVX1-NEXT: popq %r15 +; AVX1-NEXT: popq %rbp +; AVX1-NEXT: jb .LBB1_62 +; AVX1-NEXT: # %bb.61: +; AVX1-NEXT: movl %ecx, %edx +; AVX1-NEXT: .LBB1_62: +; AVX1-NEXT: vpinsrb $15, %eax, %xmm2, %xmm2 +; AVX1-NEXT: movzbl %dl, %eax +; AVX1-NEXT: vpinsrb $14, %eax, %xmm3, %xmm3 +; AVX1-NEXT: vpextrb $15, %xmm1, %ecx +; AVX1-NEXT: vpextrb $15, %xmm0, %eax +; AVX1-NEXT: subb %cl, %al +; AVX1-NEXT: movl $0, %ecx +; AVX1-NEXT: jb .LBB1_64 +; AVX1-NEXT: # %bb.63: +; AVX1-NEXT: movl %eax, %ecx +; AVX1-NEXT: .LBB1_64: +; 
AVX1-NEXT: movzbl %cl, %eax +; AVX1-NEXT: vpinsrb $15, %eax, %xmm3, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: v32i8: +; AVX2: # %bb.0: +; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX2-NEXT: vpextrb $1, %xmm2, %ecx +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm3 +; AVX2-NEXT: vpextrb $1, %xmm3, %eax +; AVX2-NEXT: subb %cl, %al +; AVX2-NEXT: movl $0, %ecx +; AVX2-NEXT: jb .LBB1_2 +; AVX2-NEXT: # %bb.1: +; AVX2-NEXT: movl %eax, %ecx +; AVX2-NEXT: .LBB1_2: +; AVX2-NEXT: vpextrb $0, %xmm2, %edx +; AVX2-NEXT: vpextrb $0, %xmm3, %eax +; AVX2-NEXT: subb %dl, %al +; AVX2-NEXT: movl $0, %esi +; AVX2-NEXT: jb .LBB1_4 +; AVX2-NEXT: # %bb.3: +; AVX2-NEXT: movl %eax, %esi +; AVX2-NEXT: .LBB1_4: +; AVX2-NEXT: vpextrb $2, %xmm2, %edx +; AVX2-NEXT: vpextrb $2, %xmm3, %eax +; AVX2-NEXT: subb %dl, %al +; AVX2-NEXT: movl $0, %r8d +; AVX2-NEXT: jb .LBB1_6 +; AVX2-NEXT: # %bb.5: +; AVX2-NEXT: movl %eax, %r8d +; AVX2-NEXT: .LBB1_6: +; AVX2-NEXT: vpextrb $3, %xmm2, %edx +; AVX2-NEXT: vpextrb $3, %xmm3, %eax +; AVX2-NEXT: subb %dl, %al +; AVX2-NEXT: movl $0, %edx +; AVX2-NEXT: jb .LBB1_8 +; AVX2-NEXT: # %bb.7: +; AVX2-NEXT: movl %eax, %edx +; AVX2-NEXT: .LBB1_8: +; AVX2-NEXT: vpextrb $4, %xmm2, %eax +; AVX2-NEXT: vpextrb $4, %xmm3, %edi +; AVX2-NEXT: subb %al, %dil +; AVX2-NEXT: movl $0, %eax +; AVX2-NEXT: jb .LBB1_10 +; AVX2-NEXT: # %bb.9: +; AVX2-NEXT: movl %edi, %eax +; AVX2-NEXT: .LBB1_10: +; AVX2-NEXT: pushq %rbp +; AVX2-NEXT: pushq %r15 +; AVX2-NEXT: pushq %r14 +; AVX2-NEXT: pushq %r13 +; AVX2-NEXT: pushq %r12 +; AVX2-NEXT: pushq %rbx +; AVX2-NEXT: vpextrb $5, %xmm2, %ebp +; AVX2-NEXT: vpextrb $5, %xmm3, %edi +; AVX2-NEXT: subb %bpl, %dil +; AVX2-NEXT: movl $0, %r10d +; AVX2-NEXT: jb .LBB1_12 +; AVX2-NEXT: # %bb.11: +; AVX2-NEXT: movl %edi, %r10d +; AVX2-NEXT: .LBB1_12: +; AVX2-NEXT: vpextrb $6, %xmm2, %ebp +; AVX2-NEXT: vpextrb $6, %xmm3, %edi +; AVX2-NEXT: subb %bpl, %dil +; AVX2-NEXT: movl $0, %ebp +; AVX2-NEXT: jb .LBB1_14 +; AVX2-NEXT: # %bb.13: +; AVX2-NEXT: movl %edi, %ebp +; AVX2-NEXT: .LBB1_14: +; AVX2-NEXT: vpextrb $7, %xmm2, %ebx +; AVX2-NEXT: vpextrb $7, %xmm3, %edi +; AVX2-NEXT: subb %bl, %dil +; AVX2-NEXT: movl $0, %r11d +; AVX2-NEXT: jb .LBB1_16 +; AVX2-NEXT: # %bb.15: +; AVX2-NEXT: movl %edi, %r11d +; AVX2-NEXT: .LBB1_16: +; AVX2-NEXT: vpextrb $8, %xmm2, %ebx +; AVX2-NEXT: vpextrb $8, %xmm3, %edi +; AVX2-NEXT: subb %bl, %dil +; AVX2-NEXT: movl $0, %r9d +; AVX2-NEXT: jb .LBB1_18 +; AVX2-NEXT: # %bb.17: +; AVX2-NEXT: movl %edi, %r9d +; AVX2-NEXT: .LBB1_18: +; AVX2-NEXT: vpextrb $9, %xmm2, %ebx +; AVX2-NEXT: vpextrb $9, %xmm3, %edi +; AVX2-NEXT: subb %bl, %dil +; AVX2-NEXT: movl $0, %r13d +; AVX2-NEXT: jb .LBB1_20 +; AVX2-NEXT: # %bb.19: +; AVX2-NEXT: movl %edi, %r13d +; AVX2-NEXT: .LBB1_20: +; AVX2-NEXT: vpextrb $10, %xmm2, %ebx +; AVX2-NEXT: vpextrb $10, %xmm3, %edi +; AVX2-NEXT: subb %bl, %dil +; AVX2-NEXT: movl $0, %ebx +; AVX2-NEXT: jb .LBB1_22 +; AVX2-NEXT: # %bb.21: +; AVX2-NEXT: movl %edi, %ebx +; AVX2-NEXT: .LBB1_22: +; AVX2-NEXT: movl %ebx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX2-NEXT: vpextrb $11, %xmm2, %ebx +; AVX2-NEXT: vpextrb $11, %xmm3, %edi +; AVX2-NEXT: subb %bl, %dil +; AVX2-NEXT: movl $0, %ebx +; AVX2-NEXT: jb .LBB1_24 +; AVX2-NEXT: # %bb.23: +; AVX2-NEXT: movl %edi, %ebx +; AVX2-NEXT: .LBB1_24: +; AVX2-NEXT: movl %ebx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX2-NEXT: vpextrb $12, %xmm2, %ebx +; AVX2-NEXT: vpextrb $12, %xmm3, %edi +; AVX2-NEXT: subb %bl, %dil +; AVX2-NEXT: movl $0, %ebx +; AVX2-NEXT: jb .LBB1_26 
+; AVX2-NEXT: # %bb.25: +; AVX2-NEXT: movl %edi, %ebx +; AVX2-NEXT: .LBB1_26: +; AVX2-NEXT: movl %ebx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX2-NEXT: vpextrb $13, %xmm2, %ebx +; AVX2-NEXT: vpextrb $13, %xmm3, %edi +; AVX2-NEXT: subb %bl, %dil +; AVX2-NEXT: movl $0, %r12d +; AVX2-NEXT: jb .LBB1_28 +; AVX2-NEXT: # %bb.27: +; AVX2-NEXT: movl %edi, %r12d +; AVX2-NEXT: .LBB1_28: +; AVX2-NEXT: vpextrb $14, %xmm2, %ebx +; AVX2-NEXT: vpextrb $14, %xmm3, %edi +; AVX2-NEXT: subb %bl, %dil +; AVX2-NEXT: movl $0, %r15d +; AVX2-NEXT: jb .LBB1_30 +; AVX2-NEXT: # %bb.29: +; AVX2-NEXT: movl %edi, %r15d +; AVX2-NEXT: .LBB1_30: +; AVX2-NEXT: movzbl %sil, %esi +; AVX2-NEXT: vpextrb $15, %xmm2, %ebx +; AVX2-NEXT: vpextrb $15, %xmm3, %edi +; AVX2-NEXT: subb %bl, %dil +; AVX2-NEXT: movl $0, %r14d +; AVX2-NEXT: jb .LBB1_32 +; AVX2-NEXT: # %bb.31: +; AVX2-NEXT: movl %edi, %r14d +; AVX2-NEXT: .LBB1_32: +; AVX2-NEXT: movzbl %cl, %ecx +; AVX2-NEXT: vmovd %esi, %xmm2 +; AVX2-NEXT: vpextrb $1, %xmm1, %edi +; AVX2-NEXT: vpextrb $1, %xmm0, %esi +; AVX2-NEXT: subb %dil, %sil +; AVX2-NEXT: movl $0, %edi +; AVX2-NEXT: jb .LBB1_34 +; AVX2-NEXT: # %bb.33: +; AVX2-NEXT: movl %esi, %edi +; AVX2-NEXT: .LBB1_34: +; AVX2-NEXT: vpinsrb $1, %ecx, %xmm2, %xmm2 +; AVX2-NEXT: movzbl %r8b, %esi +; AVX2-NEXT: movzbl %dil, %ecx +; AVX2-NEXT: vpextrb $0, %xmm1, %ebx +; AVX2-NEXT: vpextrb $0, %xmm0, %edi +; AVX2-NEXT: subb %bl, %dil +; AVX2-NEXT: movl $0, %ebx +; AVX2-NEXT: jb .LBB1_36 +; AVX2-NEXT: # %bb.35: +; AVX2-NEXT: movl %edi, %ebx +; AVX2-NEXT: .LBB1_36: +; AVX2-NEXT: vpinsrb $2, %esi, %xmm2, %xmm2 +; AVX2-NEXT: movzbl %dl, %edx +; AVX2-NEXT: movzbl %bl, %esi +; AVX2-NEXT: vmovd %esi, %xmm3 +; AVX2-NEXT: vpinsrb $1, %ecx, %xmm3, %xmm3 +; AVX2-NEXT: vpextrb $2, %xmm1, %esi +; AVX2-NEXT: vpextrb $2, %xmm0, %ecx +; AVX2-NEXT: subb %sil, %cl +; AVX2-NEXT: movl $0, %esi +; AVX2-NEXT: jb .LBB1_38 +; AVX2-NEXT: # %bb.37: +; AVX2-NEXT: movl %ecx, %esi +; AVX2-NEXT: .LBB1_38: +; AVX2-NEXT: vpinsrb $3, %edx, %xmm2, %xmm2 +; AVX2-NEXT: movzbl %al, %eax +; AVX2-NEXT: movzbl %sil, %ecx +; AVX2-NEXT: vpinsrb $2, %ecx, %xmm3, %xmm3 +; AVX2-NEXT: vpextrb $3, %xmm1, %edx +; AVX2-NEXT: vpextrb $3, %xmm0, %ecx +; AVX2-NEXT: subb %dl, %cl +; AVX2-NEXT: movl $0, %edx +; AVX2-NEXT: jb .LBB1_40 +; AVX2-NEXT: # %bb.39: +; AVX2-NEXT: movl %ecx, %edx +; AVX2-NEXT: .LBB1_40: +; AVX2-NEXT: vpinsrb $4, %eax, %xmm2, %xmm2 +; AVX2-NEXT: movzbl %r10b, %eax +; AVX2-NEXT: movzbl %dl, %ecx +; AVX2-NEXT: vpinsrb $3, %ecx, %xmm3, %xmm3 +; AVX2-NEXT: vpextrb $4, %xmm1, %edx +; AVX2-NEXT: vpextrb $4, %xmm0, %ecx +; AVX2-NEXT: subb %dl, %cl +; AVX2-NEXT: movl $0, %edx +; AVX2-NEXT: jb .LBB1_42 +; AVX2-NEXT: # %bb.41: +; AVX2-NEXT: movl %ecx, %edx +; AVX2-NEXT: .LBB1_42: +; AVX2-NEXT: vpinsrb $5, %eax, %xmm2, %xmm2 +; AVX2-NEXT: movzbl %bpl, %eax +; AVX2-NEXT: movzbl %dl, %ecx +; AVX2-NEXT: vpinsrb $4, %ecx, %xmm3, %xmm3 +; AVX2-NEXT: vpextrb $5, %xmm1, %edx +; AVX2-NEXT: vpextrb $5, %xmm0, %ecx +; AVX2-NEXT: subb %dl, %cl +; AVX2-NEXT: movl $0, %edx +; AVX2-NEXT: jb .LBB1_44 +; AVX2-NEXT: # %bb.43: +; AVX2-NEXT: movl %ecx, %edx +; AVX2-NEXT: .LBB1_44: +; AVX2-NEXT: vpinsrb $6, %eax, %xmm2, %xmm2 +; AVX2-NEXT: movzbl %r11b, %eax +; AVX2-NEXT: movzbl %dl, %ecx +; AVX2-NEXT: vpinsrb $5, %ecx, %xmm3, %xmm3 +; AVX2-NEXT: vpextrb $6, %xmm1, %edx +; AVX2-NEXT: vpextrb $6, %xmm0, %ecx +; AVX2-NEXT: subb %dl, %cl +; AVX2-NEXT: movl $0, %edx +; AVX2-NEXT: jb .LBB1_46 +; AVX2-NEXT: # %bb.45: +; AVX2-NEXT: movl %ecx, %edx +; AVX2-NEXT: .LBB1_46: +; AVX2-NEXT: vpinsrb $7, 
%eax, %xmm2, %xmm2 +; AVX2-NEXT: movzbl %r9b, %eax +; AVX2-NEXT: movzbl %dl, %ecx +; AVX2-NEXT: vpinsrb $6, %ecx, %xmm3, %xmm3 +; AVX2-NEXT: vpextrb $7, %xmm1, %edx +; AVX2-NEXT: vpextrb $7, %xmm0, %ecx +; AVX2-NEXT: subb %dl, %cl +; AVX2-NEXT: movl $0, %edx +; AVX2-NEXT: jb .LBB1_48 +; AVX2-NEXT: # %bb.47: +; AVX2-NEXT: movl %ecx, %edx +; AVX2-NEXT: .LBB1_48: +; AVX2-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2 +; AVX2-NEXT: movzbl %r13b, %eax +; AVX2-NEXT: movzbl %dl, %ecx +; AVX2-NEXT: vpinsrb $7, %ecx, %xmm3, %xmm3 +; AVX2-NEXT: vpextrb $8, %xmm1, %edx +; AVX2-NEXT: vpextrb $8, %xmm0, %ecx +; AVX2-NEXT: subb %dl, %cl +; AVX2-NEXT: movl $0, %edx +; AVX2-NEXT: jb .LBB1_50 +; AVX2-NEXT: # %bb.49: +; AVX2-NEXT: movl %ecx, %edx +; AVX2-NEXT: .LBB1_50: +; AVX2-NEXT: vpinsrb $9, %eax, %xmm2, %xmm2 +; AVX2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX2-NEXT: movzbl %dl, %ecx +; AVX2-NEXT: vpinsrb $8, %ecx, %xmm3, %xmm3 +; AVX2-NEXT: vpextrb $9, %xmm1, %edx +; AVX2-NEXT: vpextrb $9, %xmm0, %ecx +; AVX2-NEXT: subb %dl, %cl +; AVX2-NEXT: movl $0, %edx +; AVX2-NEXT: jb .LBB1_52 +; AVX2-NEXT: # %bb.51: +; AVX2-NEXT: movl %ecx, %edx +; AVX2-NEXT: .LBB1_52: +; AVX2-NEXT: vpinsrb $10, %eax, %xmm2, %xmm2 +; AVX2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX2-NEXT: movzbl %dl, %ecx +; AVX2-NEXT: vpinsrb $9, %ecx, %xmm3, %xmm3 +; AVX2-NEXT: vpextrb $10, %xmm1, %edx +; AVX2-NEXT: vpextrb $10, %xmm0, %ecx +; AVX2-NEXT: subb %dl, %cl +; AVX2-NEXT: movl $0, %edx +; AVX2-NEXT: jb .LBB1_54 +; AVX2-NEXT: # %bb.53: +; AVX2-NEXT: movl %ecx, %edx +; AVX2-NEXT: .LBB1_54: +; AVX2-NEXT: vpinsrb $11, %eax, %xmm2, %xmm2 +; AVX2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX2-NEXT: movzbl %dl, %ecx +; AVX2-NEXT: vpinsrb $10, %ecx, %xmm3, %xmm3 +; AVX2-NEXT: vpextrb $11, %xmm1, %edx +; AVX2-NEXT: vpextrb $11, %xmm0, %ecx +; AVX2-NEXT: subb %dl, %cl +; AVX2-NEXT: movl $0, %edx +; AVX2-NEXT: jb .LBB1_56 +; AVX2-NEXT: # %bb.55: +; AVX2-NEXT: movl %ecx, %edx +; AVX2-NEXT: .LBB1_56: +; AVX2-NEXT: vpinsrb $12, %eax, %xmm2, %xmm2 +; AVX2-NEXT: movzbl %r12b, %eax +; AVX2-NEXT: movzbl %dl, %ecx +; AVX2-NEXT: vpinsrb $11, %ecx, %xmm3, %xmm3 +; AVX2-NEXT: vpextrb $12, %xmm1, %edx +; AVX2-NEXT: vpextrb $12, %xmm0, %ecx +; AVX2-NEXT: subb %dl, %cl +; AVX2-NEXT: movl $0, %edx +; AVX2-NEXT: jb .LBB1_58 +; AVX2-NEXT: # %bb.57: +; AVX2-NEXT: movl %ecx, %edx +; AVX2-NEXT: .LBB1_58: +; AVX2-NEXT: vpinsrb $13, %eax, %xmm2, %xmm2 +; AVX2-NEXT: movzbl %r15b, %eax +; AVX2-NEXT: movzbl %dl, %ecx +; AVX2-NEXT: vpinsrb $12, %ecx, %xmm3, %xmm3 +; AVX2-NEXT: vpextrb $13, %xmm1, %edx +; AVX2-NEXT: vpextrb $13, %xmm0, %ecx +; AVX2-NEXT: subb %dl, %cl +; AVX2-NEXT: movl $0, %edx +; AVX2-NEXT: jb .LBB1_60 +; AVX2-NEXT: # %bb.59: +; AVX2-NEXT: movl %ecx, %edx +; AVX2-NEXT: .LBB1_60: +; AVX2-NEXT: vpinsrb $14, %eax, %xmm2, %xmm2 +; AVX2-NEXT: movzbl %r14b, %eax +; AVX2-NEXT: movzbl %dl, %ecx +; AVX2-NEXT: vpinsrb $13, %ecx, %xmm3, %xmm3 +; AVX2-NEXT: vpextrb $14, %xmm1, %edx +; AVX2-NEXT: vpextrb $14, %xmm0, %ecx +; AVX2-NEXT: subb %dl, %cl +; AVX2-NEXT: movl $0, %edx +; AVX2-NEXT: popq %rbx +; AVX2-NEXT: popq %r12 +; AVX2-NEXT: popq %r13 +; AVX2-NEXT: popq %r14 +; AVX2-NEXT: popq %r15 +; AVX2-NEXT: popq %rbp +; AVX2-NEXT: jb .LBB1_62 +; AVX2-NEXT: # %bb.61: +; AVX2-NEXT: movl %ecx, %edx +; AVX2-NEXT: .LBB1_62: +; AVX2-NEXT: vpinsrb $15, %eax, %xmm2, %xmm2 +; AVX2-NEXT: movzbl %dl, %eax +; AVX2-NEXT: vpinsrb $14, %eax, %xmm3, %xmm3 +; AVX2-NEXT: vpextrb $15, %xmm1, %ecx +; 
AVX2-NEXT: vpextrb $15, %xmm0, %eax +; AVX2-NEXT: subb %cl, %al +; AVX2-NEXT: movl $0, %ecx +; AVX2-NEXT: jb .LBB1_64 +; AVX2-NEXT: # %bb.63: +; AVX2-NEXT: movl %eax, %ecx +; AVX2-NEXT: .LBB1_64: +; AVX2-NEXT: movzbl %cl, %eax +; AVX2-NEXT: vpinsrb $15, %eax, %xmm3, %xmm0 +; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 +; AVX2-NEXT: retq +; +; AVX512-LABEL: v32i8: +; AVX512: # %bb.0: +; AVX512-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX512-NEXT: vpextrb $1, %xmm2, %ecx +; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm3 +; AVX512-NEXT: vpextrb $1, %xmm3, %eax +; AVX512-NEXT: subb %cl, %al +; AVX512-NEXT: movl $0, %ecx +; AVX512-NEXT: jb .LBB1_2 +; AVX512-NEXT: # %bb.1: +; AVX512-NEXT: movl %eax, %ecx +; AVX512-NEXT: .LBB1_2: +; AVX512-NEXT: vpextrb $0, %xmm2, %edx +; AVX512-NEXT: vpextrb $0, %xmm3, %eax +; AVX512-NEXT: subb %dl, %al +; AVX512-NEXT: movl $0, %esi +; AVX512-NEXT: jb .LBB1_4 +; AVX512-NEXT: # %bb.3: +; AVX512-NEXT: movl %eax, %esi +; AVX512-NEXT: .LBB1_4: +; AVX512-NEXT: vpextrb $2, %xmm2, %edx +; AVX512-NEXT: vpextrb $2, %xmm3, %eax +; AVX512-NEXT: subb %dl, %al +; AVX512-NEXT: movl $0, %r8d +; AVX512-NEXT: jb .LBB1_6 +; AVX512-NEXT: # %bb.5: +; AVX512-NEXT: movl %eax, %r8d +; AVX512-NEXT: .LBB1_6: +; AVX512-NEXT: vpextrb $3, %xmm2, %edx +; AVX512-NEXT: vpextrb $3, %xmm3, %eax +; AVX512-NEXT: subb %dl, %al +; AVX512-NEXT: movl $0, %edx +; AVX512-NEXT: jb .LBB1_8 +; AVX512-NEXT: # %bb.7: +; AVX512-NEXT: movl %eax, %edx +; AVX512-NEXT: .LBB1_8: +; AVX512-NEXT: vpextrb $4, %xmm2, %eax +; AVX512-NEXT: vpextrb $4, %xmm3, %edi +; AVX512-NEXT: subb %al, %dil +; AVX512-NEXT: movl $0, %eax +; AVX512-NEXT: jb .LBB1_10 +; AVX512-NEXT: # %bb.9: +; AVX512-NEXT: movl %edi, %eax +; AVX512-NEXT: .LBB1_10: +; AVX512-NEXT: pushq %rbp +; AVX512-NEXT: pushq %r15 +; AVX512-NEXT: pushq %r14 +; AVX512-NEXT: pushq %r13 +; AVX512-NEXT: pushq %r12 +; AVX512-NEXT: pushq %rbx +; AVX512-NEXT: vpextrb $5, %xmm2, %ebp +; AVX512-NEXT: vpextrb $5, %xmm3, %edi +; AVX512-NEXT: subb %bpl, %dil +; AVX512-NEXT: movl $0, %r10d +; AVX512-NEXT: jb .LBB1_12 +; AVX512-NEXT: # %bb.11: +; AVX512-NEXT: movl %edi, %r10d +; AVX512-NEXT: .LBB1_12: +; AVX512-NEXT: vpextrb $6, %xmm2, %ebp +; AVX512-NEXT: vpextrb $6, %xmm3, %edi +; AVX512-NEXT: subb %bpl, %dil +; AVX512-NEXT: movl $0, %ebp +; AVX512-NEXT: jb .LBB1_14 +; AVX512-NEXT: # %bb.13: +; AVX512-NEXT: movl %edi, %ebp +; AVX512-NEXT: .LBB1_14: +; AVX512-NEXT: vpextrb $7, %xmm2, %ebx +; AVX512-NEXT: vpextrb $7, %xmm3, %edi +; AVX512-NEXT: subb %bl, %dil +; AVX512-NEXT: movl $0, %r11d +; AVX512-NEXT: jb .LBB1_16 +; AVX512-NEXT: # %bb.15: +; AVX512-NEXT: movl %edi, %r11d +; AVX512-NEXT: .LBB1_16: +; AVX512-NEXT: vpextrb $8, %xmm2, %ebx +; AVX512-NEXT: vpextrb $8, %xmm3, %edi +; AVX512-NEXT: subb %bl, %dil +; AVX512-NEXT: movl $0, %r9d +; AVX512-NEXT: jb .LBB1_18 +; AVX512-NEXT: # %bb.17: +; AVX512-NEXT: movl %edi, %r9d +; AVX512-NEXT: .LBB1_18: +; AVX512-NEXT: vpextrb $9, %xmm2, %ebx +; AVX512-NEXT: vpextrb $9, %xmm3, %edi +; AVX512-NEXT: subb %bl, %dil +; AVX512-NEXT: movl $0, %r13d +; AVX512-NEXT: jb .LBB1_20 +; AVX512-NEXT: # %bb.19: +; AVX512-NEXT: movl %edi, %r13d +; AVX512-NEXT: .LBB1_20: +; AVX512-NEXT: vpextrb $10, %xmm2, %ebx +; AVX512-NEXT: vpextrb $10, %xmm3, %edi +; AVX512-NEXT: subb %bl, %dil +; AVX512-NEXT: movl $0, %ebx +; AVX512-NEXT: jb .LBB1_22 +; AVX512-NEXT: # %bb.21: +; AVX512-NEXT: movl %edi, %ebx +; AVX512-NEXT: .LBB1_22: +; AVX512-NEXT: movl %ebx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX512-NEXT: vpextrb $11, %xmm2, %ebx +; AVX512-NEXT: vpextrb 
$11, %xmm3, %edi +; AVX512-NEXT: subb %bl, %dil +; AVX512-NEXT: movl $0, %ebx +; AVX512-NEXT: jb .LBB1_24 +; AVX512-NEXT: # %bb.23: +; AVX512-NEXT: movl %edi, %ebx +; AVX512-NEXT: .LBB1_24: +; AVX512-NEXT: movl %ebx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX512-NEXT: vpextrb $12, %xmm2, %ebx +; AVX512-NEXT: vpextrb $12, %xmm3, %edi +; AVX512-NEXT: subb %bl, %dil +; AVX512-NEXT: movl $0, %ebx +; AVX512-NEXT: jb .LBB1_26 +; AVX512-NEXT: # %bb.25: +; AVX512-NEXT: movl %edi, %ebx +; AVX512-NEXT: .LBB1_26: +; AVX512-NEXT: movl %ebx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX512-NEXT: vpextrb $13, %xmm2, %ebx +; AVX512-NEXT: vpextrb $13, %xmm3, %edi +; AVX512-NEXT: subb %bl, %dil +; AVX512-NEXT: movl $0, %r12d +; AVX512-NEXT: jb .LBB1_28 +; AVX512-NEXT: # %bb.27: +; AVX512-NEXT: movl %edi, %r12d +; AVX512-NEXT: .LBB1_28: +; AVX512-NEXT: vpextrb $14, %xmm2, %ebx +; AVX512-NEXT: vpextrb $14, %xmm3, %edi +; AVX512-NEXT: subb %bl, %dil +; AVX512-NEXT: movl $0, %r15d +; AVX512-NEXT: jb .LBB1_30 +; AVX512-NEXT: # %bb.29: +; AVX512-NEXT: movl %edi, %r15d +; AVX512-NEXT: .LBB1_30: +; AVX512-NEXT: movzbl %sil, %esi +; AVX512-NEXT: vpextrb $15, %xmm2, %ebx +; AVX512-NEXT: vpextrb $15, %xmm3, %edi +; AVX512-NEXT: subb %bl, %dil +; AVX512-NEXT: movl $0, %r14d +; AVX512-NEXT: jb .LBB1_32 +; AVX512-NEXT: # %bb.31: +; AVX512-NEXT: movl %edi, %r14d +; AVX512-NEXT: .LBB1_32: +; AVX512-NEXT: movzbl %cl, %ecx +; AVX512-NEXT: vmovd %esi, %xmm2 +; AVX512-NEXT: vpextrb $1, %xmm1, %edi +; AVX512-NEXT: vpextrb $1, %xmm0, %esi +; AVX512-NEXT: subb %dil, %sil +; AVX512-NEXT: movl $0, %edi +; AVX512-NEXT: jb .LBB1_34 +; AVX512-NEXT: # %bb.33: +; AVX512-NEXT: movl %esi, %edi +; AVX512-NEXT: .LBB1_34: +; AVX512-NEXT: vpinsrb $1, %ecx, %xmm2, %xmm2 +; AVX512-NEXT: movzbl %r8b, %esi +; AVX512-NEXT: movzbl %dil, %ecx +; AVX512-NEXT: vpextrb $0, %xmm1, %ebx +; AVX512-NEXT: vpextrb $0, %xmm0, %edi +; AVX512-NEXT: subb %bl, %dil +; AVX512-NEXT: movl $0, %ebx +; AVX512-NEXT: jb .LBB1_36 +; AVX512-NEXT: # %bb.35: +; AVX512-NEXT: movl %edi, %ebx +; AVX512-NEXT: .LBB1_36: +; AVX512-NEXT: vpinsrb $2, %esi, %xmm2, %xmm2 +; AVX512-NEXT: movzbl %dl, %edx +; AVX512-NEXT: movzbl %bl, %esi +; AVX512-NEXT: vmovd %esi, %xmm3 +; AVX512-NEXT: vpinsrb $1, %ecx, %xmm3, %xmm3 +; AVX512-NEXT: vpextrb $2, %xmm1, %esi +; AVX512-NEXT: vpextrb $2, %xmm0, %ecx +; AVX512-NEXT: subb %sil, %cl +; AVX512-NEXT: movl $0, %esi +; AVX512-NEXT: jb .LBB1_38 +; AVX512-NEXT: # %bb.37: +; AVX512-NEXT: movl %ecx, %esi +; AVX512-NEXT: .LBB1_38: +; AVX512-NEXT: vpinsrb $3, %edx, %xmm2, %xmm2 +; AVX512-NEXT: movzbl %al, %eax +; AVX512-NEXT: movzbl %sil, %ecx +; AVX512-NEXT: vpinsrb $2, %ecx, %xmm3, %xmm3 +; AVX512-NEXT: vpextrb $3, %xmm1, %edx +; AVX512-NEXT: vpextrb $3, %xmm0, %ecx +; AVX512-NEXT: subb %dl, %cl +; AVX512-NEXT: movl $0, %edx +; AVX512-NEXT: jb .LBB1_40 +; AVX512-NEXT: # %bb.39: +; AVX512-NEXT: movl %ecx, %edx +; AVX512-NEXT: .LBB1_40: +; AVX512-NEXT: vpinsrb $4, %eax, %xmm2, %xmm2 +; AVX512-NEXT: movzbl %r10b, %eax +; AVX512-NEXT: movzbl %dl, %ecx +; AVX512-NEXT: vpinsrb $3, %ecx, %xmm3, %xmm3 +; AVX512-NEXT: vpextrb $4, %xmm1, %edx +; AVX512-NEXT: vpextrb $4, %xmm0, %ecx +; AVX512-NEXT: subb %dl, %cl +; AVX512-NEXT: movl $0, %edx +; AVX512-NEXT: jb .LBB1_42 +; AVX512-NEXT: # %bb.41: +; AVX512-NEXT: movl %ecx, %edx +; AVX512-NEXT: .LBB1_42: +; AVX512-NEXT: vpinsrb $5, %eax, %xmm2, %xmm2 +; AVX512-NEXT: movzbl %bpl, %eax +; AVX512-NEXT: movzbl %dl, %ecx +; AVX512-NEXT: vpinsrb $4, %ecx, %xmm3, %xmm3 +; AVX512-NEXT: vpextrb $5, %xmm1, %edx +; 
AVX512-NEXT: vpextrb $5, %xmm0, %ecx +; AVX512-NEXT: subb %dl, %cl +; AVX512-NEXT: movl $0, %edx +; AVX512-NEXT: jb .LBB1_44 +; AVX512-NEXT: # %bb.43: +; AVX512-NEXT: movl %ecx, %edx +; AVX512-NEXT: .LBB1_44: +; AVX512-NEXT: vpinsrb $6, %eax, %xmm2, %xmm2 +; AVX512-NEXT: movzbl %r11b, %eax +; AVX512-NEXT: movzbl %dl, %ecx +; AVX512-NEXT: vpinsrb $5, %ecx, %xmm3, %xmm3 +; AVX512-NEXT: vpextrb $6, %xmm1, %edx +; AVX512-NEXT: vpextrb $6, %xmm0, %ecx +; AVX512-NEXT: subb %dl, %cl +; AVX512-NEXT: movl $0, %edx +; AVX512-NEXT: jb .LBB1_46 +; AVX512-NEXT: # %bb.45: +; AVX512-NEXT: movl %ecx, %edx +; AVX512-NEXT: .LBB1_46: +; AVX512-NEXT: vpinsrb $7, %eax, %xmm2, %xmm2 +; AVX512-NEXT: movzbl %r9b, %eax +; AVX512-NEXT: movzbl %dl, %ecx +; AVX512-NEXT: vpinsrb $6, %ecx, %xmm3, %xmm3 +; AVX512-NEXT: vpextrb $7, %xmm1, %edx +; AVX512-NEXT: vpextrb $7, %xmm0, %ecx +; AVX512-NEXT: subb %dl, %cl +; AVX512-NEXT: movl $0, %edx +; AVX512-NEXT: jb .LBB1_48 +; AVX512-NEXT: # %bb.47: +; AVX512-NEXT: movl %ecx, %edx +; AVX512-NEXT: .LBB1_48: +; AVX512-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2 +; AVX512-NEXT: movzbl %r13b, %eax +; AVX512-NEXT: movzbl %dl, %ecx +; AVX512-NEXT: vpinsrb $7, %ecx, %xmm3, %xmm3 +; AVX512-NEXT: vpextrb $8, %xmm1, %edx +; AVX512-NEXT: vpextrb $8, %xmm0, %ecx +; AVX512-NEXT: subb %dl, %cl +; AVX512-NEXT: movl $0, %edx +; AVX512-NEXT: jb .LBB1_50 +; AVX512-NEXT: # %bb.49: +; AVX512-NEXT: movl %ecx, %edx +; AVX512-NEXT: .LBB1_50: +; AVX512-NEXT: vpinsrb $9, %eax, %xmm2, %xmm2 +; AVX512-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX512-NEXT: movzbl %dl, %ecx +; AVX512-NEXT: vpinsrb $8, %ecx, %xmm3, %xmm3 +; AVX512-NEXT: vpextrb $9, %xmm1, %edx +; AVX512-NEXT: vpextrb $9, %xmm0, %ecx +; AVX512-NEXT: subb %dl, %cl +; AVX512-NEXT: movl $0, %edx +; AVX512-NEXT: jb .LBB1_52 +; AVX512-NEXT: # %bb.51: +; AVX512-NEXT: movl %ecx, %edx +; AVX512-NEXT: .LBB1_52: +; AVX512-NEXT: vpinsrb $10, %eax, %xmm2, %xmm2 +; AVX512-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX512-NEXT: movzbl %dl, %ecx +; AVX512-NEXT: vpinsrb $9, %ecx, %xmm3, %xmm3 +; AVX512-NEXT: vpextrb $10, %xmm1, %edx +; AVX512-NEXT: vpextrb $10, %xmm0, %ecx +; AVX512-NEXT: subb %dl, %cl +; AVX512-NEXT: movl $0, %edx +; AVX512-NEXT: jb .LBB1_54 +; AVX512-NEXT: # %bb.53: +; AVX512-NEXT: movl %ecx, %edx +; AVX512-NEXT: .LBB1_54: +; AVX512-NEXT: vpinsrb $11, %eax, %xmm2, %xmm2 +; AVX512-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX512-NEXT: movzbl %dl, %ecx +; AVX512-NEXT: vpinsrb $10, %ecx, %xmm3, %xmm3 +; AVX512-NEXT: vpextrb $11, %xmm1, %edx +; AVX512-NEXT: vpextrb $11, %xmm0, %ecx +; AVX512-NEXT: subb %dl, %cl +; AVX512-NEXT: movl $0, %edx +; AVX512-NEXT: jb .LBB1_56 +; AVX512-NEXT: # %bb.55: +; AVX512-NEXT: movl %ecx, %edx +; AVX512-NEXT: .LBB1_56: +; AVX512-NEXT: vpinsrb $12, %eax, %xmm2, %xmm2 +; AVX512-NEXT: movzbl %r12b, %eax +; AVX512-NEXT: movzbl %dl, %ecx +; AVX512-NEXT: vpinsrb $11, %ecx, %xmm3, %xmm3 +; AVX512-NEXT: vpextrb $12, %xmm1, %edx +; AVX512-NEXT: vpextrb $12, %xmm0, %ecx +; AVX512-NEXT: subb %dl, %cl +; AVX512-NEXT: movl $0, %edx +; AVX512-NEXT: jb .LBB1_58 +; AVX512-NEXT: # %bb.57: +; AVX512-NEXT: movl %ecx, %edx +; AVX512-NEXT: .LBB1_58: +; AVX512-NEXT: vpinsrb $13, %eax, %xmm2, %xmm2 +; AVX512-NEXT: movzbl %r15b, %eax +; AVX512-NEXT: movzbl %dl, %ecx +; AVX512-NEXT: vpinsrb $12, %ecx, %xmm3, %xmm3 +; AVX512-NEXT: vpextrb $13, %xmm1, %edx +; AVX512-NEXT: vpextrb $13, %xmm0, %ecx +; AVX512-NEXT: subb %dl, %cl +; AVX512-NEXT: movl $0, %edx +; 
AVX512-NEXT: jb .LBB1_60 +; AVX512-NEXT: # %bb.59: +; AVX512-NEXT: movl %ecx, %edx +; AVX512-NEXT: .LBB1_60: +; AVX512-NEXT: vpinsrb $14, %eax, %xmm2, %xmm2 +; AVX512-NEXT: movzbl %r14b, %eax +; AVX512-NEXT: movzbl %dl, %ecx +; AVX512-NEXT: vpinsrb $13, %ecx, %xmm3, %xmm3 +; AVX512-NEXT: vpextrb $14, %xmm1, %edx +; AVX512-NEXT: vpextrb $14, %xmm0, %ecx +; AVX512-NEXT: subb %dl, %cl +; AVX512-NEXT: movl $0, %edx +; AVX512-NEXT: popq %rbx +; AVX512-NEXT: popq %r12 +; AVX512-NEXT: popq %r13 +; AVX512-NEXT: popq %r14 +; AVX512-NEXT: popq %r15 +; AVX512-NEXT: popq %rbp +; AVX512-NEXT: jb .LBB1_62 +; AVX512-NEXT: # %bb.61: +; AVX512-NEXT: movl %ecx, %edx +; AVX512-NEXT: .LBB1_62: +; AVX512-NEXT: vpinsrb $15, %eax, %xmm2, %xmm2 +; AVX512-NEXT: movzbl %dl, %eax +; AVX512-NEXT: vpinsrb $14, %eax, %xmm3, %xmm3 +; AVX512-NEXT: vpextrb $15, %xmm1, %ecx +; AVX512-NEXT: vpextrb $15, %xmm0, %eax +; AVX512-NEXT: subb %cl, %al +; AVX512-NEXT: movl $0, %ecx +; AVX512-NEXT: jb .LBB1_64 +; AVX512-NEXT: # %bb.63: +; AVX512-NEXT: movl %eax, %ecx +; AVX512-NEXT: .LBB1_64: +; AVX512-NEXT: movzbl %cl, %eax +; AVX512-NEXT: vpinsrb $15, %eax, %xmm3, %xmm0 +; AVX512-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 +; AVX512-NEXT: retq + %z = call <32 x i8> @llvm.usub.sat.v32i8(<32 x i8> %x, <32 x i8> %y) + ret <32 x i8> %z +} + +define <64 x i8> @v64i8(<64 x i8> %x, <64 x i8> %y) nounwind { +; SSE2-LABEL: v64i8: +; SSE2: # %bb.0: +; SSE2-NEXT: pushq %rbp +; SSE2-NEXT: pushq %r15 +; SSE2-NEXT: pushq %r14 +; SSE2-NEXT: pushq %r13 +; SSE2-NEXT: pushq %r12 +; SSE2-NEXT: pushq %rbx +; SSE2-NEXT: subq $648, %rsp # imm = 0x288 +; SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; SSE2-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp) +; SSE2-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) +; SSE2-NEXT: movaps %xmm5, (%rsp) +; SSE2-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) +; SSE2-NEXT: movaps %xmm6, {{[0-9]+}}(%rsp) +; SSE2-NEXT: movaps %xmm3, -{{[0-9]+}}(%rsp) +; SSE2-NEXT: movaps %xmm7, {{[0-9]+}}(%rsp) +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSE2-NEXT: subb -{{[0-9]+}}(%rsp), %al +; SSE2-NEXT: movl $0, %esi +; SSE2-NEXT: jb .LBB2_2 +; SSE2-NEXT: # %bb.1: +; SSE2-NEXT: movl %eax, %esi +; SSE2-NEXT: .LBB2_2: +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSE2-NEXT: subb -{{[0-9]+}}(%rsp), %al +; SSE2-NEXT: movl $0, %ebx +; SSE2-NEXT: jb .LBB2_4 +; SSE2-NEXT: # %bb.3: +; SSE2-NEXT: movl %eax, %ebx +; SSE2-NEXT: .LBB2_4: +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSE2-NEXT: subb -{{[0-9]+}}(%rsp), %al +; SSE2-NEXT: movl $0, %ebp +; SSE2-NEXT: jb .LBB2_6 +; SSE2-NEXT: # %bb.5: +; SSE2-NEXT: movl %eax, %ebp +; SSE2-NEXT: .LBB2_6: +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSE2-NEXT: subb -{{[0-9]+}}(%rsp), %al +; SSE2-NEXT: movl $0, %edi +; SSE2-NEXT: jb .LBB2_8 +; SSE2-NEXT: # %bb.7: +; SSE2-NEXT: movl %eax, %edi +; SSE2-NEXT: .LBB2_8: +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSE2-NEXT: subb -{{[0-9]+}}(%rsp), %al +; SSE2-NEXT: movl $0, %r9d +; SSE2-NEXT: jb .LBB2_10 +; SSE2-NEXT: # %bb.9: +; SSE2-NEXT: movl %eax, %r9d +; SSE2-NEXT: .LBB2_10: +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSE2-NEXT: subb -{{[0-9]+}}(%rsp), %al +; SSE2-NEXT: movl $0, %edx +; SSE2-NEXT: jb .LBB2_12 +; SSE2-NEXT: # %bb.11: +; SSE2-NEXT: movl %eax, %edx +; SSE2-NEXT: .LBB2_12: +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSE2-NEXT: subb -{{[0-9]+}}(%rsp), %al +; SSE2-NEXT: movl $0, %ecx +; SSE2-NEXT: jb .LBB2_14 +; SSE2-NEXT: # %bb.13: +; SSE2-NEXT: movl %eax, %ecx +; SSE2-NEXT: .LBB2_14: +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSE2-NEXT: subb -{{[0-9]+}}(%rsp), %al +; 
SSE2-NEXT: movl $0, %r10d +; SSE2-NEXT: jb .LBB2_16 +; SSE2-NEXT: # %bb.15: +; SSE2-NEXT: movl %eax, %r10d +; SSE2-NEXT: .LBB2_16: +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSE2-NEXT: subb -{{[0-9]+}}(%rsp), %al +; SSE2-NEXT: movl $0, %r11d +; SSE2-NEXT: jb .LBB2_18 +; SSE2-NEXT: # %bb.17: +; SSE2-NEXT: movl %eax, %r11d +; SSE2-NEXT: .LBB2_18: +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSE2-NEXT: subb -{{[0-9]+}}(%rsp), %al +; SSE2-NEXT: movl $0, %r14d +; SSE2-NEXT: jb .LBB2_20 +; SSE2-NEXT: # %bb.19: +; SSE2-NEXT: movl %eax, %r14d +; SSE2-NEXT: .LBB2_20: +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSE2-NEXT: subb -{{[0-9]+}}(%rsp), %al +; SSE2-NEXT: movl $0, %r15d +; SSE2-NEXT: jb .LBB2_22 +; SSE2-NEXT: # %bb.21: +; SSE2-NEXT: movl %eax, %r15d +; SSE2-NEXT: .LBB2_22: +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSE2-NEXT: subb -{{[0-9]+}}(%rsp), %al +; SSE2-NEXT: movl $0, %r12d +; SSE2-NEXT: jb .LBB2_24 +; SSE2-NEXT: # %bb.23: +; SSE2-NEXT: movl %eax, %r12d +; SSE2-NEXT: .LBB2_24: +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSE2-NEXT: subb -{{[0-9]+}}(%rsp), %al +; SSE2-NEXT: movl $0, %r13d +; SSE2-NEXT: jb .LBB2_26 +; SSE2-NEXT: # %bb.25: +; SSE2-NEXT: movl %eax, %r13d +; SSE2-NEXT: .LBB2_26: +; SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSE2-NEXT: subb -{{[0-9]+}}(%rsp), %al +; SSE2-NEXT: movl $0, %r8d +; SSE2-NEXT: jb .LBB2_28 +; SSE2-NEXT: # %bb.27: +; SSE2-NEXT: movl %eax, %r8d +; SSE2-NEXT: .LBB2_28: +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSE2-NEXT: subb -{{[0-9]+}}(%rsp), %cl +; SSE2-NEXT: movl $0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill +; SSE2-NEXT: jb .LBB2_30 +; SSE2-NEXT: # %bb.29: +; SSE2-NEXT: # kill: def $cl killed $cl def $ecx +; SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE2-NEXT: .LBB2_30: +; SSE2-NEXT: subb -{{[0-9]+}}(%rsp), %al +; SSE2-NEXT: movl $0, %ecx +; SSE2-NEXT: jb .LBB2_32 +; SSE2-NEXT: # %bb.31: +; SSE2-NEXT: movl %eax, %ecx +; SSE2-NEXT: .LBB2_32: +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSE2-NEXT: subb {{[0-9]+}}(%rsp), %al +; SSE2-NEXT: movl $0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill +; SSE2-NEXT: jb .LBB2_34 +; SSE2-NEXT: # %bb.33: +; SSE2-NEXT: # kill: def $al killed $al def $eax +; SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE2-NEXT: .LBB2_34: +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSE2-NEXT: subb {{[0-9]+}}(%rsp), %al +; SSE2-NEXT: movl $0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill +; SSE2-NEXT: jb .LBB2_36 +; SSE2-NEXT: # %bb.35: +; SSE2-NEXT: # kill: def $al killed $al def $eax +; SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE2-NEXT: .LBB2_36: +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSE2-NEXT: subb {{[0-9]+}}(%rsp), %al +; SSE2-NEXT: movl $0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill +; SSE2-NEXT: jb .LBB2_38 +; SSE2-NEXT: # %bb.37: +; SSE2-NEXT: # kill: def $al killed $al def $eax +; SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE2-NEXT: .LBB2_38: +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSE2-NEXT: subb {{[0-9]+}}(%rsp), %al +; SSE2-NEXT: movl $0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill +; SSE2-NEXT: jb .LBB2_40 +; SSE2-NEXT: # %bb.39: +; SSE2-NEXT: # kill: def $al killed $al def $eax +; SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE2-NEXT: .LBB2_40: +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSE2-NEXT: subb {{[0-9]+}}(%rsp), %al +; SSE2-NEXT: movl $0, 
{{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill +; SSE2-NEXT: jb .LBB2_42 +; SSE2-NEXT: # %bb.41: +; SSE2-NEXT: # kill: def $al killed $al def $eax +; SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE2-NEXT: .LBB2_42: +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSE2-NEXT: subb {{[0-9]+}}(%rsp), %al +; SSE2-NEXT: movl $0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill +; SSE2-NEXT: jb .LBB2_44 +; SSE2-NEXT: # %bb.43: +; SSE2-NEXT: # kill: def $al killed $al def $eax +; SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE2-NEXT: .LBB2_44: +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSE2-NEXT: subb {{[0-9]+}}(%rsp), %al +; SSE2-NEXT: movl $0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill +; SSE2-NEXT: jb .LBB2_46 +; SSE2-NEXT: # %bb.45: +; SSE2-NEXT: # kill: def $al killed $al def $eax +; SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE2-NEXT: .LBB2_46: +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSE2-NEXT: subb {{[0-9]+}}(%rsp), %al +; SSE2-NEXT: movl $0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill +; SSE2-NEXT: jb .LBB2_48 +; SSE2-NEXT: # %bb.47: +; SSE2-NEXT: # kill: def $al killed $al def $eax +; SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE2-NEXT: .LBB2_48: +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSE2-NEXT: subb {{[0-9]+}}(%rsp), %al +; SSE2-NEXT: movl $0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill +; SSE2-NEXT: jb .LBB2_50 +; SSE2-NEXT: # %bb.49: +; SSE2-NEXT: # kill: def $al killed $al def $eax +; SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE2-NEXT: .LBB2_50: +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSE2-NEXT: subb {{[0-9]+}}(%rsp), %al +; SSE2-NEXT: movl $0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill +; SSE2-NEXT: jb .LBB2_52 +; SSE2-NEXT: # %bb.51: +; SSE2-NEXT: # kill: def $al killed $al def $eax +; SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE2-NEXT: .LBB2_52: +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSE2-NEXT: subb {{[0-9]+}}(%rsp), %al +; SSE2-NEXT: movl $0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill +; SSE2-NEXT: jb .LBB2_54 +; SSE2-NEXT: # %bb.53: +; SSE2-NEXT: # kill: def $al killed $al def $eax +; SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE2-NEXT: .LBB2_54: +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSE2-NEXT: subb {{[0-9]+}}(%rsp), %al +; SSE2-NEXT: movl $0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill +; SSE2-NEXT: jb .LBB2_56 +; SSE2-NEXT: # %bb.55: +; SSE2-NEXT: # kill: def $al killed $al def $eax +; SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE2-NEXT: .LBB2_56: +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSE2-NEXT: subb {{[0-9]+}}(%rsp), %al +; SSE2-NEXT: movl $0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill +; SSE2-NEXT: jb .LBB2_58 +; SSE2-NEXT: # %bb.57: +; SSE2-NEXT: # kill: def $al killed $al def $eax +; SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE2-NEXT: .LBB2_58: +; SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSE2-NEXT: subb {{[0-9]+}}(%rsp), %al +; SSE2-NEXT: movl $0, %ecx +; SSE2-NEXT: jb .LBB2_60 +; SSE2-NEXT: # %bb.59: +; SSE2-NEXT: movl %eax, %ecx +; SSE2-NEXT: .LBB2_60: +; SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSE2-NEXT: subb {{[0-9]+}}(%rsp), %cl +; SSE2-NEXT: movl $0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill +; SSE2-NEXT: jb .LBB2_62 +; SSE2-NEXT: # 
%bb.61: +; SSE2-NEXT: # kill: def $cl killed $cl def $ecx +; SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE2-NEXT: .LBB2_62: +; SSE2-NEXT: subb (%rsp), %al +; SSE2-NEXT: movl $0, %ecx +; SSE2-NEXT: jb .LBB2_64 +; SSE2-NEXT: # %bb.63: +; SSE2-NEXT: movl %eax, %ecx +; SSE2-NEXT: .LBB2_64: +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSE2-NEXT: subb {{[0-9]+}}(%rsp), %al +; SSE2-NEXT: movl $0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill +; SSE2-NEXT: jb .LBB2_66 +; SSE2-NEXT: # %bb.65: +; SSE2-NEXT: # kill: def $al killed $al def $eax +; SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE2-NEXT: .LBB2_66: +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSE2-NEXT: subb {{[0-9]+}}(%rsp), %al +; SSE2-NEXT: movl $0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill +; SSE2-NEXT: jb .LBB2_68 +; SSE2-NEXT: # %bb.67: +; SSE2-NEXT: # kill: def $al killed $al def $eax +; SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE2-NEXT: .LBB2_68: +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSE2-NEXT: subb {{[0-9]+}}(%rsp), %al +; SSE2-NEXT: movl $0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill +; SSE2-NEXT: jb .LBB2_70 +; SSE2-NEXT: # %bb.69: +; SSE2-NEXT: # kill: def $al killed $al def $eax +; SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE2-NEXT: .LBB2_70: +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSE2-NEXT: subb {{[0-9]+}}(%rsp), %al +; SSE2-NEXT: movl $0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill +; SSE2-NEXT: jb .LBB2_72 +; SSE2-NEXT: # %bb.71: +; SSE2-NEXT: # kill: def $al killed $al def $eax +; SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE2-NEXT: .LBB2_72: +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSE2-NEXT: subb {{[0-9]+}}(%rsp), %al +; SSE2-NEXT: movl $0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill +; SSE2-NEXT: jb .LBB2_74 +; SSE2-NEXT: # %bb.73: +; SSE2-NEXT: # kill: def $al killed $al def $eax +; SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE2-NEXT: .LBB2_74: +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSE2-NEXT: subb {{[0-9]+}}(%rsp), %al +; SSE2-NEXT: movl $0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill +; SSE2-NEXT: jb .LBB2_76 +; SSE2-NEXT: # %bb.75: +; SSE2-NEXT: # kill: def $al killed $al def $eax +; SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE2-NEXT: .LBB2_76: +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSE2-NEXT: subb {{[0-9]+}}(%rsp), %al +; SSE2-NEXT: movl $0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill +; SSE2-NEXT: jb .LBB2_78 +; SSE2-NEXT: # %bb.77: +; SSE2-NEXT: # kill: def $al killed $al def $eax +; SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE2-NEXT: .LBB2_78: +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSE2-NEXT: subb {{[0-9]+}}(%rsp), %al +; SSE2-NEXT: movl $0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill +; SSE2-NEXT: jb .LBB2_80 +; SSE2-NEXT: # %bb.79: +; SSE2-NEXT: # kill: def $al killed $al def $eax +; SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE2-NEXT: .LBB2_80: +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSE2-NEXT: subb {{[0-9]+}}(%rsp), %al +; SSE2-NEXT: movl $0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill +; SSE2-NEXT: jb .LBB2_82 +; SSE2-NEXT: # %bb.81: +; SSE2-NEXT: # kill: def $al killed $al def $eax +; SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE2-NEXT: .LBB2_82: +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSE2-NEXT: subb {{[0-9]+}}(%rsp), %al +; SSE2-NEXT: movl $0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill +; SSE2-NEXT: 
jb .LBB2_84 +; SSE2-NEXT: # %bb.83: +; SSE2-NEXT: # kill: def $al killed $al def $eax +; SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE2-NEXT: .LBB2_84: +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSE2-NEXT: subb {{[0-9]+}}(%rsp), %al +; SSE2-NEXT: movl $0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill +; SSE2-NEXT: jb .LBB2_86 +; SSE2-NEXT: # %bb.85: +; SSE2-NEXT: # kill: def $al killed $al def $eax +; SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE2-NEXT: .LBB2_86: +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSE2-NEXT: subb {{[0-9]+}}(%rsp), %al +; SSE2-NEXT: movl $0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill +; SSE2-NEXT: jb .LBB2_88 +; SSE2-NEXT: # %bb.87: +; SSE2-NEXT: # kill: def $al killed $al def $eax +; SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE2-NEXT: .LBB2_88: +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSE2-NEXT: subb {{[0-9]+}}(%rsp), %al +; SSE2-NEXT: movl $0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill +; SSE2-NEXT: jb .LBB2_90 +; SSE2-NEXT: # %bb.89: +; SSE2-NEXT: # kill: def $al killed $al def $eax +; SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE2-NEXT: .LBB2_90: +; SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSE2-NEXT: subb {{[0-9]+}}(%rsp), %al +; SSE2-NEXT: movl $0, %ecx +; SSE2-NEXT: jb .LBB2_92 +; SSE2-NEXT: # %bb.91: +; SSE2-NEXT: movl %eax, %ecx +; SSE2-NEXT: .LBB2_92: +; SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSE2-NEXT: subb {{[0-9]+}}(%rsp), %cl +; SSE2-NEXT: movl $0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill +; SSE2-NEXT: jb .LBB2_94 +; SSE2-NEXT: # %bb.93: +; SSE2-NEXT: # kill: def $cl killed $cl def $ecx +; SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE2-NEXT: .LBB2_94: +; SSE2-NEXT: subb {{[0-9]+}}(%rsp), %al +; SSE2-NEXT: movl $0, %ecx +; SSE2-NEXT: jb .LBB2_96 +; SSE2-NEXT: # %bb.95: +; SSE2-NEXT: movl %eax, %ecx +; SSE2-NEXT: .LBB2_96: +; SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSE2-NEXT: subb {{[0-9]+}}(%rsp), %al +; SSE2-NEXT: movl $0, %ecx +; SSE2-NEXT: jb .LBB2_98 +; SSE2-NEXT: # %bb.97: +; SSE2-NEXT: movl %eax, %ecx +; SSE2-NEXT: .LBB2_98: +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSE2-NEXT: subb {{[0-9]+}}(%rsp), %al +; SSE2-NEXT: movl $0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill +; SSE2-NEXT: jb .LBB2_100 +; SSE2-NEXT: # %bb.99: +; SSE2-NEXT: # kill: def $al killed $al def $eax +; SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE2-NEXT: .LBB2_100: +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSE2-NEXT: subb {{[0-9]+}}(%rsp), %al +; SSE2-NEXT: movl $0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill +; SSE2-NEXT: jb .LBB2_102 +; SSE2-NEXT: # %bb.101: +; SSE2-NEXT: # kill: def $al killed $al def $eax +; SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE2-NEXT: .LBB2_102: +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSE2-NEXT: subb {{[0-9]+}}(%rsp), %al +; SSE2-NEXT: movl $0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill +; SSE2-NEXT: jb .LBB2_104 +; SSE2-NEXT: # %bb.103: +; SSE2-NEXT: # kill: def $al killed $al def $eax +; SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE2-NEXT: .LBB2_104: +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSE2-NEXT: subb {{[0-9]+}}(%rsp), %al +; SSE2-NEXT: movl $0, {{[-0-9]+}}(%r{{[sb]}}p) # 
4-byte Folded Spill +; SSE2-NEXT: jb .LBB2_106 +; SSE2-NEXT: # %bb.105: +; SSE2-NEXT: # kill: def $al killed $al def $eax +; SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE2-NEXT: .LBB2_106: +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSE2-NEXT: subb {{[0-9]+}}(%rsp), %al +; SSE2-NEXT: movl $0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill +; SSE2-NEXT: jb .LBB2_108 +; SSE2-NEXT: # %bb.107: +; SSE2-NEXT: # kill: def $al killed $al def $eax +; SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE2-NEXT: .LBB2_108: +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSE2-NEXT: subb {{[0-9]+}}(%rsp), %al +; SSE2-NEXT: movl $0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill +; SSE2-NEXT: jb .LBB2_110 +; SSE2-NEXT: # %bb.109: +; SSE2-NEXT: # kill: def $al killed $al def $eax +; SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE2-NEXT: .LBB2_110: +; SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSE2-NEXT: subb {{[0-9]+}}(%rsp), %al +; SSE2-NEXT: movl $0, %ecx +; SSE2-NEXT: jb .LBB2_112 +; SSE2-NEXT: # %bb.111: +; SSE2-NEXT: movl %eax, %ecx +; SSE2-NEXT: .LBB2_112: +; SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSE2-NEXT: subb {{[0-9]+}}(%rsp), %al +; SSE2-NEXT: movl $0, %ecx +; SSE2-NEXT: jb .LBB2_114 +; SSE2-NEXT: # %bb.113: +; SSE2-NEXT: movl %eax, %ecx +; SSE2-NEXT: .LBB2_114: +; SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSE2-NEXT: subb {{[0-9]+}}(%rsp), %al +; SSE2-NEXT: movl $0, %ecx +; SSE2-NEXT: jb .LBB2_116 +; SSE2-NEXT: # %bb.115: +; SSE2-NEXT: movl %eax, %ecx +; SSE2-NEXT: .LBB2_116: +; SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE2-NEXT: movzbl %sil, %eax +; SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE2-NEXT: movzbl %bl, %esi +; SSE2-NEXT: movzbl %bpl, %ebx +; SSE2-NEXT: movzbl %dil, %ebp +; SSE2-NEXT: movzbl %r9b, %edi +; SSE2-NEXT: movzbl %dl, %ecx +; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %edx # 1-byte Folded Reload +; SSE2-NEXT: movzbl %r10b, %r10d +; SSE2-NEXT: movzbl %r11b, %r11d +; SSE2-NEXT: movzbl %r14b, %r14d +; SSE2-NEXT: movzbl %r15b, %r15d +; SSE2-NEXT: movzbl %r12b, %r12d +; SSE2-NEXT: movzbl %r13b, %r13d +; SSE2-NEXT: movzbl %r8b, %r8d +; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r9d # 1-byte Folded Reload +; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; SSE2-NEXT: movl %eax, 
{{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE2-NEXT: 
movb -{{[0-9]+}}(%rsp), %al +; SSE2-NEXT: subb {{[0-9]+}}(%rsp), %al +; SSE2-NEXT: movl $0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill +; SSE2-NEXT: jb .LBB2_118 +; SSE2-NEXT: # %bb.117: +; SSE2-NEXT: # kill: def $al killed $al def $eax +; SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE2-NEXT: .LBB2_118: +; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 4-byte Folded Reload +; SSE2-NEXT: # xmm10 = mem[0],zero,zero,zero +; SSE2-NEXT: movd %esi, %xmm4 +; SSE2-NEXT: movd %ebx, %xmm0 +; SSE2-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE2-NEXT: movd %ebp, %xmm3 +; SSE2-NEXT: movd %edi, %xmm0 +; SSE2-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE2-NEXT: movd %ecx, %xmm5 +; SSE2-NEXT: movd %edx, %xmm0 +; SSE2-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE2-NEXT: movd %r10d, %xmm0 +; SSE2-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE2-NEXT: movd %r11d, %xmm0 +; SSE2-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE2-NEXT: movd %r14d, %xmm0 +; SSE2-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE2-NEXT: movd %r15d, %xmm0 +; SSE2-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE2-NEXT: movd %r12d, %xmm0 +; SSE2-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE2-NEXT: movd %r13d, %xmm0 +; SSE2-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE2-NEXT: movd %r8d, %xmm0 +; SSE2-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE2-NEXT: movd %r9d, %xmm0 +; SSE2-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload +; SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero +; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r10d # 1-byte Folded Reload +; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r11d # 1-byte Folded Reload +; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r14d # 1-byte Folded Reload +; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r15d # 1-byte Folded Reload +; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r12d # 1-byte Folded Reload +; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r13d # 1-byte Folded Reload +; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 1-byte Folded Reload +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSE2-NEXT: subb {{[0-9]+}}(%rsp), %cl +; SSE2-NEXT: movl $0, %ebx +; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 4-byte Folded Reload +; SSE2-NEXT: # xmm6 = mem[0],zero,zero,zero +; SSE2-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Reload +; SSE2-NEXT: # xmm1 = mem[0],zero,zero,zero +; SSE2-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE2-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Reload +; SSE2-NEXT: # xmm1 = mem[0],zero,zero,zero +; SSE2-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 4-byte Folded Reload +; SSE2-NEXT: # xmm15 = mem[0],zero,zero,zero +; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 4-byte Folded Reload +; SSE2-NEXT: # xmm9 = mem[0],zero,zero,zero +; SSE2-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Reload +; SSE2-NEXT: # xmm1 = mem[0],zero,zero,zero +; SSE2-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 4-byte Folded Reload +; SSE2-NEXT: # xmm11 = mem[0],zero,zero,zero +; SSE2-NEXT: movd 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 4-byte Folded Reload +; SSE2-NEXT: # xmm14 = mem[0],zero,zero,zero +; SSE2-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Reload +; SSE2-NEXT: # xmm1 = mem[0],zero,zero,zero +; SSE2-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE2-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Reload +; SSE2-NEXT: # xmm1 = mem[0],zero,zero,zero +; SSE2-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE2-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Reload +; SSE2-NEXT: # xmm1 = mem[0],zero,zero,zero +; SSE2-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 4-byte Folded Reload +; SSE2-NEXT: # xmm13 = mem[0],zero,zero,zero +; SSE2-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Reload +; SSE2-NEXT: # xmm1 = mem[0],zero,zero,zero +; SSE2-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE2-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Reload +; SSE2-NEXT: # xmm1 = mem[0],zero,zero,zero +; SSE2-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE2-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Reload +; SSE2-NEXT: # xmm1 = mem[0],zero,zero,zero +; SSE2-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Folded Reload +; SSE2-NEXT: # xmm1 = mem[0],zero,zero,zero +; SSE2-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 4-byte Reload +; SSE2-NEXT: # xmm2 = mem[0],zero,zero,zero +; SSE2-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE2-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 4-byte Reload +; SSE2-NEXT: # xmm2 = mem[0],zero,zero,zero +; SSE2-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE2-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 4-byte Reload +; SSE2-NEXT: # xmm2 = mem[0],zero,zero,zero +; SSE2-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 4-byte Folded Reload +; SSE2-NEXT: # xmm12 = mem[0],zero,zero,zero +; SSE2-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 4-byte Reload +; SSE2-NEXT: # xmm2 = mem[0],zero,zero,zero +; SSE2-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE2-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 4-byte Reload +; SSE2-NEXT: # xmm2 = mem[0],zero,zero,zero +; SSE2-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE2-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 4-byte Reload +; SSE2-NEXT: # xmm2 = mem[0],zero,zero,zero +; SSE2-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 4-byte Folded Reload +; SSE2-NEXT: # xmm7 = mem[0],zero,zero,zero +; SSE2-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 4-byte Reload +; SSE2-NEXT: # xmm2 = mem[0],zero,zero,zero +; SSE2-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE2-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 4-byte Reload +; SSE2-NEXT: # xmm2 = mem[0],zero,zero,zero +; SSE2-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE2-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 4-byte Reload +; SSE2-NEXT: # xmm2 = mem[0],zero,zero,zero +; SSE2-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 4-byte Folded Reload +; SSE2-NEXT: # xmm8 = mem[0],zero,zero,zero +; SSE2-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 4-byte Reload +; SSE2-NEXT: # xmm2 = mem[0],zero,zero,zero +; 
SSE2-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE2-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 4-byte Reload +; SSE2-NEXT: # xmm2 = mem[0],zero,zero,zero +; SSE2-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE2-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 4-byte Reload +; SSE2-NEXT: # xmm2 = mem[0],zero,zero,zero +; SSE2-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 4-byte Folded Reload +; SSE2-NEXT: # xmm2 = mem[0],zero,zero,zero +; SSE2-NEXT: jb .LBB2_120 +; SSE2-NEXT: # %bb.119: +; SSE2-NEXT: movl %ecx, %ebx +; SSE2-NEXT: .LBB2_120: +; SSE2-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm10[0],xmm4[1],xmm10[1],xmm4[2],xmm10[2],xmm4[3],xmm10[3],xmm4[4],xmm10[4],xmm4[5],xmm10[5],xmm4[6],xmm10[6],xmm4[7],xmm10[7] +; SSE2-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE2-NEXT: punpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE2-NEXT: # xmm3 = xmm3[0],mem[0],xmm3[1],mem[1],xmm3[2],mem[2],xmm3[3],mem[3],xmm3[4],mem[4],xmm3[5],mem[5],xmm3[6],mem[6],xmm3[7],mem[7] +; SSE2-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE2-NEXT: punpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload +; SSE2-NEXT: # xmm5 = xmm5[0],mem[0],xmm5[1],mem[1],xmm5[2],mem[2],xmm5[3],mem[3],xmm5[4],mem[4],xmm5[5],mem[5],xmm5[6],mem[6],xmm5[7],mem[7] +; SSE2-NEXT: movdqa %xmm5, %xmm4 +; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; SSE2-NEXT: punpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload +; SSE2-NEXT: # xmm10 = xmm10[0],mem[0],xmm10[1],mem[1],xmm10[2],mem[2],xmm10[3],mem[3],xmm10[4],mem[4],xmm10[5],mem[5],xmm10[6],mem[6],xmm10[7],mem[7] +; SSE2-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE2-NEXT: punpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE2-NEXT: # xmm3 = xmm3[0],mem[0],xmm3[1],mem[1],xmm3[2],mem[2],xmm3[3],mem[3],xmm3[4],mem[4],xmm3[5],mem[5],xmm3[6],mem[6],xmm3[7],mem[7] +; SSE2-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE2-NEXT: punpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE2-NEXT: # xmm3 = xmm3[0],mem[0],xmm3[1],mem[1],xmm3[2],mem[2],xmm3[3],mem[3],xmm3[4],mem[4],xmm3[5],mem[5],xmm3[6],mem[6],xmm3[7],mem[7] +; SSE2-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE2-NEXT: punpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE2-NEXT: # xmm3 = xmm3[0],mem[0],xmm3[1],mem[1],xmm3[2],mem[2],xmm3[3],mem[3],xmm3[4],mem[4],xmm3[5],mem[5],xmm3[6],mem[6],xmm3[7],mem[7] +; SSE2-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE2-NEXT: punpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE2-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3],xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] +; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm6[0],xmm3[1],xmm6[1],xmm3[2],xmm6[2],xmm3[3],xmm6[3],xmm3[4],xmm6[4],xmm3[5],xmm6[5],xmm3[6],xmm6[6],xmm3[7],xmm6[7] +; SSE2-NEXT: movdqa %xmm3, %xmm6 +; SSE2-NEXT: punpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload +; SSE2-NEXT: # xmm15 = 
xmm15[0],mem[0],xmm15[1],mem[1],xmm15[2],mem[2],xmm15[3],mem[3],xmm15[4],mem[4],xmm15[5],mem[5],xmm15[6],mem[6],xmm15[7],mem[7] +; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm9[0],xmm3[1],xmm9[1],xmm3[2],xmm9[2],xmm3[3],xmm9[3],xmm3[4],xmm9[4],xmm3[5],xmm9[5],xmm3[6],xmm9[6],xmm3[7],xmm9[7] +; SSE2-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE2-NEXT: punpcklbw {{.*#+}} xmm14 = xmm14[0],xmm11[0],xmm14[1],xmm11[1],xmm14[2],xmm11[2],xmm14[3],xmm11[3],xmm14[4],xmm11[4],xmm14[5],xmm11[5],xmm14[6],xmm11[6],xmm14[7],xmm11[7] +; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE2-NEXT: punpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE2-NEXT: # xmm3 = xmm3[0],mem[0],xmm3[1],mem[1],xmm3[2],mem[2],xmm3[3],mem[3],xmm3[4],mem[4],xmm3[5],mem[5],xmm3[6],mem[6],xmm3[7],mem[7] +; SSE2-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE2-NEXT: punpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload +; SSE2-NEXT: # xmm13 = xmm13[0],mem[0],xmm13[1],mem[1],xmm13[2],mem[2],xmm13[3],mem[3],xmm13[4],mem[4],xmm13[5],mem[5],xmm13[6],mem[6],xmm13[7],mem[7] +; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE2-NEXT: punpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE2-NEXT: # xmm3 = xmm3[0],mem[0],xmm3[1],mem[1],xmm3[2],mem[2],xmm3[3],mem[3],xmm3[4],mem[4],xmm3[5],mem[5],xmm3[6],mem[6],xmm3[7],mem[7] +; SSE2-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE2-NEXT: punpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE2-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3],xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] +; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE2-NEXT: punpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE2-NEXT: # xmm3 = xmm3[0],mem[0],xmm3[1],mem[1],xmm3[2],mem[2],xmm3[3],mem[3],xmm3[4],mem[4],xmm3[5],mem[5],xmm3[6],mem[6],xmm3[7],mem[7] +; SSE2-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE2-NEXT: punpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload +; SSE2-NEXT: # xmm12 = xmm12[0],mem[0],xmm12[1],mem[1],xmm12[2],mem[2],xmm12[3],mem[3],xmm12[4],mem[4],xmm12[5],mem[5],xmm12[6],mem[6],xmm12[7],mem[7] +; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE2-NEXT: punpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE2-NEXT: # xmm3 = xmm3[0],mem[0],xmm3[1],mem[1],xmm3[2],mem[2],xmm3[3],mem[3],xmm3[4],mem[4],xmm3[5],mem[5],xmm3[6],mem[6],xmm3[7],mem[7] +; SSE2-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE2-NEXT: punpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload +; SSE2-NEXT: # xmm7 = xmm7[0],mem[0],xmm7[1],mem[1],xmm7[2],mem[2],xmm7[3],mem[3],xmm7[4],mem[4],xmm7[5],mem[5],xmm7[6],mem[6],xmm7[7],mem[7] +; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE2-NEXT: punpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE2-NEXT: # xmm3 = xmm3[0],mem[0],xmm3[1],mem[1],xmm3[2],mem[2],xmm3[3],mem[3],xmm3[4],mem[4],xmm3[5],mem[5],xmm3[6],mem[6],xmm3[7],mem[7] +; SSE2-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE2-NEXT: punpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload +; SSE2-NEXT: # xmm8 = 
xmm8[0],mem[0],xmm8[1],mem[1],xmm8[2],mem[2],xmm8[3],mem[3],xmm8[4],mem[4],xmm8[5],mem[5],xmm8[6],mem[6],xmm8[7],mem[7] +; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE2-NEXT: punpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE2-NEXT: # xmm3 = xmm3[0],mem[0],xmm3[1],mem[1],xmm3[2],mem[2],xmm3[3],mem[3],xmm3[4],mem[4],xmm3[5],mem[5],xmm3[6],mem[6],xmm3[7],mem[7] +; SSE2-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE2-NEXT: punpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE2-NEXT: # xmm2 = xmm2[0],mem[0],xmm2[1],mem[1],xmm2[2],mem[2],xmm2[3],mem[3],xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7] +; SSE2-NEXT: movd %r10d, %xmm3 +; SSE2-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE2-NEXT: movd %r11d, %xmm10 +; SSE2-NEXT: movd %r14d, %xmm3 +; SSE2-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE2-NEXT: movd %eax, %xmm9 +; SSE2-NEXT: movd %r15d, %xmm3 +; SSE2-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE2-NEXT: movd %r12d, %xmm11 +; SSE2-NEXT: movd %r13d, %xmm3 +; SSE2-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE2-NEXT: movd %esi, %xmm5 +; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload +; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %edx # 1-byte Folded Reload +; SSE2-NEXT: movzbl %bl, %esi +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %bl +; SSE2-NEXT: subb {{[0-9]+}}(%rsp), %bl +; SSE2-NEXT: movl $0, %edi +; SSE2-NEXT: jb .LBB2_122 +; SSE2-NEXT: # %bb.121: +; SSE2-NEXT: movl %ebx, %edi +; SSE2-NEXT: .LBB2_122: +; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE2-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE2-NEXT: # xmm3 = xmm3[0],mem[0],xmm3[1],mem[1],xmm3[2],mem[2],xmm3[3],mem[3] +; SSE2-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] +; SSE2-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE2-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE2-NEXT: # xmm3 = xmm3[0],mem[0],xmm3[1],mem[1],xmm3[2],mem[2],xmm3[3],mem[3] +; SSE2-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE2-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE2-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm15 = xmm15[0],xmm6[0],xmm15[1],xmm6[1],xmm15[2],xmm6[2],xmm15[3],xmm6[3] +; SSE2-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload +; SSE2-NEXT: # xmm14 = xmm14[0],mem[0],xmm14[1],mem[1],xmm14[2],mem[2],xmm14[3],mem[3] +; SSE2-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload +; SSE2-NEXT: # xmm13 = xmm13[0],mem[0],xmm13[1],mem[1],xmm13[2],mem[2],xmm13[3],mem[3] +; SSE2-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE2-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3] +; SSE2-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload +; SSE2-NEXT: # xmm12 = xmm12[0],mem[0],xmm12[1],mem[1],xmm12[2],mem[2],xmm12[3],mem[3] +; SSE2-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), 
%xmm7 # 16-byte Folded Reload +; SSE2-NEXT: # xmm7 = xmm7[0],mem[0],xmm7[1],mem[1],xmm7[2],mem[2],xmm7[3],mem[3] +; SSE2-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload +; SSE2-NEXT: # xmm8 = xmm8[0],mem[0],xmm8[1],mem[1],xmm8[2],mem[2],xmm8[3],mem[3] +; SSE2-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE2-NEXT: # xmm2 = xmm2[0],mem[0],xmm2[1],mem[1],xmm2[2],mem[2],xmm2[3],mem[3] +; SSE2-NEXT: punpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload +; SSE2-NEXT: # xmm10 = xmm10[0],mem[0],xmm10[1],mem[1],xmm10[2],mem[2],xmm10[3],mem[3],xmm10[4],mem[4],xmm10[5],mem[5],xmm10[6],mem[6],xmm10[7],mem[7] +; SSE2-NEXT: punpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload +; SSE2-NEXT: # xmm9 = xmm9[0],mem[0],xmm9[1],mem[1],xmm9[2],mem[2],xmm9[3],mem[3],xmm9[4],mem[4],xmm9[5],mem[5],xmm9[6],mem[6],xmm9[7],mem[7] +; SSE2-NEXT: punpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload +; SSE2-NEXT: # xmm11 = xmm11[0],mem[0],xmm11[1],mem[1],xmm11[2],mem[2],xmm11[3],mem[3],xmm11[4],mem[4],xmm11[5],mem[5],xmm11[6],mem[6],xmm11[7],mem[7] +; SSE2-NEXT: punpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload +; SSE2-NEXT: # xmm5 = xmm5[0],mem[0],xmm5[1],mem[1],xmm5[2],mem[2],xmm5[3],mem[3],xmm5[4],mem[4],xmm5[5],mem[5],xmm5[6],mem[6],xmm5[7],mem[7] +; SSE2-NEXT: movd %eax, %xmm3 +; SSE2-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE2-NEXT: movd %ecx, %xmm3 +; SSE2-NEXT: movd %edx, %xmm4 +; SSE2-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE2-NEXT: movd %esi, %xmm4 +; SSE2-NEXT: movzbl %dil, %eax +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %dl +; SSE2-NEXT: subb {{[0-9]+}}(%rsp), %dl +; SSE2-NEXT: movl $0, %ecx +; SSE2-NEXT: jb .LBB2_124 +; SSE2-NEXT: # %bb.123: +; SSE2-NEXT: movl %edx, %ecx +; SSE2-NEXT: .LBB2_124: +; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE2-NEXT: unpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload +; SSE2-NEXT: # xmm6 = xmm6[0],mem[0],xmm6[1],mem[1] +; SSE2-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE2-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE2-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] +; SSE2-NEXT: punpckldq {{.*#+}} xmm14 = xmm14[0],xmm15[0],xmm14[1],xmm15[1] +; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm13[0],xmm1[1],xmm13[1] +; SSE2-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm12[0],xmm7[1],xmm12[1] +; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm8[0],xmm2[1],xmm8[1] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm11[0],xmm5[1],xmm11[1],xmm5[2],xmm11[2],xmm5[3],xmm11[3] +; SSE2-NEXT: punpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE2-NEXT: # xmm3 = xmm3[0],mem[0],xmm3[1],mem[1],xmm3[2],mem[2],xmm3[3],mem[3],xmm3[4],mem[4],xmm3[5],mem[5],xmm3[6],mem[6],xmm3[7],mem[7] +; SSE2-NEXT: punpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE2-NEXT: # xmm4 = xmm4[0],mem[0],xmm4[1],mem[1],xmm4[2],mem[2],xmm4[3],mem[3],xmm4[4],mem[4],xmm4[5],mem[5],xmm4[6],mem[6],xmm4[7],mem[7] +; SSE2-NEXT: movd %eax, %xmm8 +; SSE2-NEXT: movzbl %cl, %eax +; SSE2-NEXT: movd %eax, %xmm6 +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %dl +; SSE2-NEXT: subb {{[0-9]+}}(%rsp), %dl +; SSE2-NEXT: movl $0, %ecx +; SSE2-NEXT: jb .LBB2_126 +; SSE2-NEXT: # %bb.125: +; SSE2-NEXT: movl %edx, %ecx 
+; SSE2-NEXT: .LBB2_126: +; SSE2-NEXT: punpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE2-NEXT: # xmm0 = xmm0[0],mem[0] +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm14[0] +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm7[0] +; SSE2-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm9[0],xmm5[1],xmm9[1] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm8[0],xmm6[1],xmm8[1],xmm6[2],xmm8[2],xmm6[3],xmm8[3],xmm6[4],xmm8[4],xmm6[5],xmm8[5],xmm6[6],xmm8[6],xmm6[7],xmm8[7] +; SSE2-NEXT: movzbl %cl, %ecx +; SSE2-NEXT: movd %ecx, %xmm7 +; SSE2-NEXT: subb {{[0-9]+}}(%rsp), %al +; SSE2-NEXT: movl $0, %ecx +; SSE2-NEXT: jb .LBB2_128 +; SSE2-NEXT: # %bb.127: +; SSE2-NEXT: movl %eax, %ecx +; SSE2-NEXT: .LBB2_128: +; SSE2-NEXT: movzbl %cl, %eax +; SSE2-NEXT: movd %eax, %xmm3 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm7[0],xmm3[1],xmm7[1],xmm3[2],xmm7[2],xmm3[3],xmm7[3],xmm3[4],xmm7[4],xmm3[5],xmm7[5],xmm3[6],xmm7[6],xmm3[7],xmm7[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm6[0],xmm3[1],xmm6[1],xmm3[2],xmm6[2],xmm3[3],xmm6[3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm5[0] +; SSE2-NEXT: addq $648, %rsp # imm = 0x288 +; SSE2-NEXT: popq %rbx +; SSE2-NEXT: popq %r12 +; SSE2-NEXT: popq %r13 +; SSE2-NEXT: popq %r14 +; SSE2-NEXT: popq %r15 +; SSE2-NEXT: popq %rbp +; SSE2-NEXT: retq +; +; SSSE3-LABEL: v64i8: +; SSSE3: # %bb.0: +; SSSE3-NEXT: pushq %rbp +; SSSE3-NEXT: pushq %r15 +; SSSE3-NEXT: pushq %r14 +; SSSE3-NEXT: pushq %r13 +; SSSE3-NEXT: pushq %r12 +; SSSE3-NEXT: pushq %rbx +; SSSE3-NEXT: subq $648, %rsp # imm = 0x288 +; SSSE3-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; SSSE3-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp) +; SSSE3-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) +; SSSE3-NEXT: movaps %xmm5, (%rsp) +; SSSE3-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) +; SSSE3-NEXT: movaps %xmm6, {{[0-9]+}}(%rsp) +; SSSE3-NEXT: movaps %xmm3, -{{[0-9]+}}(%rsp) +; SSSE3-NEXT: movaps %xmm7, {{[0-9]+}}(%rsp) +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSSE3-NEXT: subb -{{[0-9]+}}(%rsp), %al +; SSSE3-NEXT: movl $0, %esi +; SSSE3-NEXT: jb .LBB2_2 +; SSSE3-NEXT: # %bb.1: +; SSSE3-NEXT: movl %eax, %esi +; SSSE3-NEXT: .LBB2_2: +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSSE3-NEXT: subb -{{[0-9]+}}(%rsp), %al +; SSSE3-NEXT: movl $0, %ebx +; SSSE3-NEXT: jb .LBB2_4 +; SSSE3-NEXT: # %bb.3: +; SSSE3-NEXT: movl %eax, %ebx +; SSSE3-NEXT: .LBB2_4: +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSSE3-NEXT: subb -{{[0-9]+}}(%rsp), %al +; SSSE3-NEXT: movl $0, %ebp +; SSSE3-NEXT: jb .LBB2_6 +; SSSE3-NEXT: # %bb.5: +; SSSE3-NEXT: movl %eax, %ebp +; SSSE3-NEXT: .LBB2_6: +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSSE3-NEXT: subb -{{[0-9]+}}(%rsp), %al +; SSSE3-NEXT: movl $0, %edi +; SSSE3-NEXT: jb .LBB2_8 +; SSSE3-NEXT: # %bb.7: +; SSSE3-NEXT: movl %eax, %edi +; SSSE3-NEXT: .LBB2_8: +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSSE3-NEXT: subb -{{[0-9]+}}(%rsp), %al +; SSSE3-NEXT: movl $0, %r9d +; SSSE3-NEXT: jb .LBB2_10 +; SSSE3-NEXT: # %bb.9: +; SSSE3-NEXT: movl %eax, %r9d +; SSSE3-NEXT: .LBB2_10: +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSSE3-NEXT: subb -{{[0-9]+}}(%rsp), %al +; SSSE3-NEXT: movl $0, %edx +; SSSE3-NEXT: jb .LBB2_12 +; SSSE3-NEXT: # %bb.11: +; SSSE3-NEXT: movl %eax, %edx +; SSSE3-NEXT: .LBB2_12: +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSSE3-NEXT: subb -{{[0-9]+}}(%rsp), %al +; SSSE3-NEXT: movl $0, 
%ecx +; SSSE3-NEXT: jb .LBB2_14 +; SSSE3-NEXT: # %bb.13: +; SSSE3-NEXT: movl %eax, %ecx +; SSSE3-NEXT: .LBB2_14: +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSSE3-NEXT: subb -{{[0-9]+}}(%rsp), %al +; SSSE3-NEXT: movl $0, %r10d +; SSSE3-NEXT: jb .LBB2_16 +; SSSE3-NEXT: # %bb.15: +; SSSE3-NEXT: movl %eax, %r10d +; SSSE3-NEXT: .LBB2_16: +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSSE3-NEXT: subb -{{[0-9]+}}(%rsp), %al +; SSSE3-NEXT: movl $0, %r11d +; SSSE3-NEXT: jb .LBB2_18 +; SSSE3-NEXT: # %bb.17: +; SSSE3-NEXT: movl %eax, %r11d +; SSSE3-NEXT: .LBB2_18: +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSSE3-NEXT: subb -{{[0-9]+}}(%rsp), %al +; SSSE3-NEXT: movl $0, %r14d +; SSSE3-NEXT: jb .LBB2_20 +; SSSE3-NEXT: # %bb.19: +; SSSE3-NEXT: movl %eax, %r14d +; SSSE3-NEXT: .LBB2_20: +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSSE3-NEXT: subb -{{[0-9]+}}(%rsp), %al +; SSSE3-NEXT: movl $0, %r15d +; SSSE3-NEXT: jb .LBB2_22 +; SSSE3-NEXT: # %bb.21: +; SSSE3-NEXT: movl %eax, %r15d +; SSSE3-NEXT: .LBB2_22: +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSSE3-NEXT: subb -{{[0-9]+}}(%rsp), %al +; SSSE3-NEXT: movl $0, %r12d +; SSSE3-NEXT: jb .LBB2_24 +; SSSE3-NEXT: # %bb.23: +; SSSE3-NEXT: movl %eax, %r12d +; SSSE3-NEXT: .LBB2_24: +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSSE3-NEXT: subb -{{[0-9]+}}(%rsp), %al +; SSSE3-NEXT: movl $0, %r13d +; SSSE3-NEXT: jb .LBB2_26 +; SSSE3-NEXT: # %bb.25: +; SSSE3-NEXT: movl %eax, %r13d +; SSSE3-NEXT: .LBB2_26: +; SSSE3-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSSE3-NEXT: subb -{{[0-9]+}}(%rsp), %al +; SSSE3-NEXT: movl $0, %r8d +; SSSE3-NEXT: jb .LBB2_28 +; SSSE3-NEXT: # %bb.27: +; SSSE3-NEXT: movl %eax, %r8d +; SSSE3-NEXT: .LBB2_28: +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSSE3-NEXT: subb -{{[0-9]+}}(%rsp), %cl +; SSSE3-NEXT: movl $0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill +; SSSE3-NEXT: jb .LBB2_30 +; SSSE3-NEXT: # %bb.29: +; SSSE3-NEXT: # kill: def $cl killed $cl def $ecx +; SSSE3-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSSE3-NEXT: .LBB2_30: +; SSSE3-NEXT: subb -{{[0-9]+}}(%rsp), %al +; SSSE3-NEXT: movl $0, %ecx +; SSSE3-NEXT: jb .LBB2_32 +; SSSE3-NEXT: # %bb.31: +; SSSE3-NEXT: movl %eax, %ecx +; SSSE3-NEXT: .LBB2_32: +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSSE3-NEXT: subb {{[0-9]+}}(%rsp), %al +; SSSE3-NEXT: movl $0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill +; SSSE3-NEXT: jb .LBB2_34 +; SSSE3-NEXT: # %bb.33: +; SSSE3-NEXT: # kill: def $al killed $al def $eax +; SSSE3-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSSE3-NEXT: .LBB2_34: +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSSE3-NEXT: subb {{[0-9]+}}(%rsp), %al +; SSSE3-NEXT: movl $0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill +; SSSE3-NEXT: jb .LBB2_36 +; SSSE3-NEXT: # %bb.35: +; SSSE3-NEXT: # kill: def $al killed $al def $eax +; SSSE3-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSSE3-NEXT: .LBB2_36: +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSSE3-NEXT: subb {{[0-9]+}}(%rsp), %al +; SSSE3-NEXT: movl $0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill +; SSSE3-NEXT: jb .LBB2_38 +; SSSE3-NEXT: # %bb.37: +; SSSE3-NEXT: # kill: def $al killed $al def $eax +; SSSE3-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSSE3-NEXT: .LBB2_38: +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSSE3-NEXT: subb {{[0-9]+}}(%rsp), %al +; SSSE3-NEXT: movl $0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill +; SSSE3-NEXT: jb 
.LBB2_40 +; SSSE3-NEXT: # %bb.39: +; SSSE3-NEXT: # kill: def $al killed $al def $eax +; SSSE3-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSSE3-NEXT: .LBB2_40: +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSSE3-NEXT: subb {{[0-9]+}}(%rsp), %al +; SSSE3-NEXT: movl $0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill +; SSSE3-NEXT: jb .LBB2_42 +; SSSE3-NEXT: # %bb.41: +; SSSE3-NEXT: # kill: def $al killed $al def $eax +; SSSE3-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSSE3-NEXT: .LBB2_42: +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSSE3-NEXT: subb {{[0-9]+}}(%rsp), %al +; SSSE3-NEXT: movl $0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill +; SSSE3-NEXT: jb .LBB2_44 +; SSSE3-NEXT: # %bb.43: +; SSSE3-NEXT: # kill: def $al killed $al def $eax +; SSSE3-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSSE3-NEXT: .LBB2_44: +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSSE3-NEXT: subb {{[0-9]+}}(%rsp), %al +; SSSE3-NEXT: movl $0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill +; SSSE3-NEXT: jb .LBB2_46 +; SSSE3-NEXT: # %bb.45: +; SSSE3-NEXT: # kill: def $al killed $al def $eax +; SSSE3-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSSE3-NEXT: .LBB2_46: +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSSE3-NEXT: subb {{[0-9]+}}(%rsp), %al +; SSSE3-NEXT: movl $0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill +; SSSE3-NEXT: jb .LBB2_48 +; SSSE3-NEXT: # %bb.47: +; SSSE3-NEXT: # kill: def $al killed $al def $eax +; SSSE3-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSSE3-NEXT: .LBB2_48: +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSSE3-NEXT: subb {{[0-9]+}}(%rsp), %al +; SSSE3-NEXT: movl $0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill +; SSSE3-NEXT: jb .LBB2_50 +; SSSE3-NEXT: # %bb.49: +; SSSE3-NEXT: # kill: def $al killed $al def $eax +; SSSE3-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSSE3-NEXT: .LBB2_50: +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSSE3-NEXT: subb {{[0-9]+}}(%rsp), %al +; SSSE3-NEXT: movl $0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill +; SSSE3-NEXT: jb .LBB2_52 +; SSSE3-NEXT: # %bb.51: +; SSSE3-NEXT: # kill: def $al killed $al def $eax +; SSSE3-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSSE3-NEXT: .LBB2_52: +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSSE3-NEXT: subb {{[0-9]+}}(%rsp), %al +; SSSE3-NEXT: movl $0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill +; SSSE3-NEXT: jb .LBB2_54 +; SSSE3-NEXT: # %bb.53: +; SSSE3-NEXT: # kill: def $al killed $al def $eax +; SSSE3-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSSE3-NEXT: .LBB2_54: +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSSE3-NEXT: subb {{[0-9]+}}(%rsp), %al +; SSSE3-NEXT: movl $0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill +; SSSE3-NEXT: jb .LBB2_56 +; SSSE3-NEXT: # %bb.55: +; SSSE3-NEXT: # kill: def $al killed $al def $eax +; SSSE3-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSSE3-NEXT: .LBB2_56: +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSSE3-NEXT: subb {{[0-9]+}}(%rsp), %al +; SSSE3-NEXT: movl $0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill +; SSSE3-NEXT: jb .LBB2_58 +; SSSE3-NEXT: # %bb.57: +; SSSE3-NEXT: # kill: def $al killed $al def $eax +; SSSE3-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSSE3-NEXT: .LBB2_58: +; SSSE3-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSSE3-NEXT: subb {{[0-9]+}}(%rsp), %al +; SSSE3-NEXT: movl $0, %ecx +; SSSE3-NEXT: jb .LBB2_60 +; SSSE3-NEXT: # 
%bb.59: +; SSSE3-NEXT: movl %eax, %ecx +; SSSE3-NEXT: .LBB2_60: +; SSSE3-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSSE3-NEXT: subb {{[0-9]+}}(%rsp), %cl +; SSSE3-NEXT: movl $0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill +; SSSE3-NEXT: jb .LBB2_62 +; SSSE3-NEXT: # %bb.61: +; SSSE3-NEXT: # kill: def $cl killed $cl def $ecx +; SSSE3-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSSE3-NEXT: .LBB2_62: +; SSSE3-NEXT: subb (%rsp), %al +; SSSE3-NEXT: movl $0, %ecx +; SSSE3-NEXT: jb .LBB2_64 +; SSSE3-NEXT: # %bb.63: +; SSSE3-NEXT: movl %eax, %ecx +; SSSE3-NEXT: .LBB2_64: +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSSE3-NEXT: subb {{[0-9]+}}(%rsp), %al +; SSSE3-NEXT: movl $0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill +; SSSE3-NEXT: jb .LBB2_66 +; SSSE3-NEXT: # %bb.65: +; SSSE3-NEXT: # kill: def $al killed $al def $eax +; SSSE3-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSSE3-NEXT: .LBB2_66: +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSSE3-NEXT: subb {{[0-9]+}}(%rsp), %al +; SSSE3-NEXT: movl $0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill +; SSSE3-NEXT: jb .LBB2_68 +; SSSE3-NEXT: # %bb.67: +; SSSE3-NEXT: # kill: def $al killed $al def $eax +; SSSE3-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSSE3-NEXT: .LBB2_68: +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSSE3-NEXT: subb {{[0-9]+}}(%rsp), %al +; SSSE3-NEXT: movl $0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill +; SSSE3-NEXT: jb .LBB2_70 +; SSSE3-NEXT: # %bb.69: +; SSSE3-NEXT: # kill: def $al killed $al def $eax +; SSSE3-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSSE3-NEXT: .LBB2_70: +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSSE3-NEXT: subb {{[0-9]+}}(%rsp), %al +; SSSE3-NEXT: movl $0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill +; SSSE3-NEXT: jb .LBB2_72 +; SSSE3-NEXT: # %bb.71: +; SSSE3-NEXT: # kill: def $al killed $al def $eax +; SSSE3-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSSE3-NEXT: .LBB2_72: +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSSE3-NEXT: subb {{[0-9]+}}(%rsp), %al +; SSSE3-NEXT: movl $0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill +; SSSE3-NEXT: jb .LBB2_74 +; SSSE3-NEXT: # %bb.73: +; SSSE3-NEXT: # kill: def $al killed $al def $eax +; SSSE3-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSSE3-NEXT: .LBB2_74: +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSSE3-NEXT: subb {{[0-9]+}}(%rsp), %al +; SSSE3-NEXT: movl $0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill +; SSSE3-NEXT: jb .LBB2_76 +; SSSE3-NEXT: # %bb.75: +; SSSE3-NEXT: # kill: def $al killed $al def $eax +; SSSE3-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSSE3-NEXT: .LBB2_76: +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSSE3-NEXT: subb {{[0-9]+}}(%rsp), %al +; SSSE3-NEXT: movl $0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill +; SSSE3-NEXT: jb .LBB2_78 +; SSSE3-NEXT: # %bb.77: +; SSSE3-NEXT: # kill: def $al killed $al def $eax +; SSSE3-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSSE3-NEXT: .LBB2_78: +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSSE3-NEXT: subb {{[0-9]+}}(%rsp), %al +; SSSE3-NEXT: movl $0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill +; SSSE3-NEXT: jb .LBB2_80 +; SSSE3-NEXT: # %bb.79: +; SSSE3-NEXT: # kill: def $al killed $al def $eax +; SSSE3-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSSE3-NEXT: .LBB2_80: +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSSE3-NEXT: subb 
{{[0-9]+}}(%rsp), %al +; SSSE3-NEXT: movl $0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill +; SSSE3-NEXT: jb .LBB2_82 +; SSSE3-NEXT: # %bb.81: +; SSSE3-NEXT: # kill: def $al killed $al def $eax +; SSSE3-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSSE3-NEXT: .LBB2_82: +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSSE3-NEXT: subb {{[0-9]+}}(%rsp), %al +; SSSE3-NEXT: movl $0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill +; SSSE3-NEXT: jb .LBB2_84 +; SSSE3-NEXT: # %bb.83: +; SSSE3-NEXT: # kill: def $al killed $al def $eax +; SSSE3-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSSE3-NEXT: .LBB2_84: +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSSE3-NEXT: subb {{[0-9]+}}(%rsp), %al +; SSSE3-NEXT: movl $0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill +; SSSE3-NEXT: jb .LBB2_86 +; SSSE3-NEXT: # %bb.85: +; SSSE3-NEXT: # kill: def $al killed $al def $eax +; SSSE3-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSSE3-NEXT: .LBB2_86: +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSSE3-NEXT: subb {{[0-9]+}}(%rsp), %al +; SSSE3-NEXT: movl $0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill +; SSSE3-NEXT: jb .LBB2_88 +; SSSE3-NEXT: # %bb.87: +; SSSE3-NEXT: # kill: def $al killed $al def $eax +; SSSE3-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSSE3-NEXT: .LBB2_88: +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSSE3-NEXT: subb {{[0-9]+}}(%rsp), %al +; SSSE3-NEXT: movl $0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill +; SSSE3-NEXT: jb .LBB2_90 +; SSSE3-NEXT: # %bb.89: +; SSSE3-NEXT: # kill: def $al killed $al def $eax +; SSSE3-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSSE3-NEXT: .LBB2_90: +; SSSE3-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSSE3-NEXT: subb {{[0-9]+}}(%rsp), %al +; SSSE3-NEXT: movl $0, %ecx +; SSSE3-NEXT: jb .LBB2_92 +; SSSE3-NEXT: # %bb.91: +; SSSE3-NEXT: movl %eax, %ecx +; SSSE3-NEXT: .LBB2_92: +; SSSE3-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSSE3-NEXT: subb {{[0-9]+}}(%rsp), %cl +; SSSE3-NEXT: movl $0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill +; SSSE3-NEXT: jb .LBB2_94 +; SSSE3-NEXT: # %bb.93: +; SSSE3-NEXT: # kill: def $cl killed $cl def $ecx +; SSSE3-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSSE3-NEXT: .LBB2_94: +; SSSE3-NEXT: subb {{[0-9]+}}(%rsp), %al +; SSSE3-NEXT: movl $0, %ecx +; SSSE3-NEXT: jb .LBB2_96 +; SSSE3-NEXT: # %bb.95: +; SSSE3-NEXT: movl %eax, %ecx +; SSSE3-NEXT: .LBB2_96: +; SSSE3-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSSE3-NEXT: subb {{[0-9]+}}(%rsp), %al +; SSSE3-NEXT: movl $0, %ecx +; SSSE3-NEXT: jb .LBB2_98 +; SSSE3-NEXT: # %bb.97: +; SSSE3-NEXT: movl %eax, %ecx +; SSSE3-NEXT: .LBB2_98: +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSSE3-NEXT: subb {{[0-9]+}}(%rsp), %al +; SSSE3-NEXT: movl $0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill +; SSSE3-NEXT: jb .LBB2_100 +; SSSE3-NEXT: # %bb.99: +; SSSE3-NEXT: # kill: def $al killed $al def $eax +; SSSE3-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSSE3-NEXT: .LBB2_100: +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSSE3-NEXT: subb {{[0-9]+}}(%rsp), %al +; SSSE3-NEXT: movl $0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill +; SSSE3-NEXT: jb .LBB2_102 +; SSSE3-NEXT: # %bb.101: +; SSSE3-NEXT: # kill: def $al killed $al def $eax +; SSSE3-NEXT: movl %eax, 
{{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSSE3-NEXT: .LBB2_102: +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSSE3-NEXT: subb {{[0-9]+}}(%rsp), %al +; SSSE3-NEXT: movl $0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill +; SSSE3-NEXT: jb .LBB2_104 +; SSSE3-NEXT: # %bb.103: +; SSSE3-NEXT: # kill: def $al killed $al def $eax +; SSSE3-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSSE3-NEXT: .LBB2_104: +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSSE3-NEXT: subb {{[0-9]+}}(%rsp), %al +; SSSE3-NEXT: movl $0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill +; SSSE3-NEXT: jb .LBB2_106 +; SSSE3-NEXT: # %bb.105: +; SSSE3-NEXT: # kill: def $al killed $al def $eax +; SSSE3-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSSE3-NEXT: .LBB2_106: +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSSE3-NEXT: subb {{[0-9]+}}(%rsp), %al +; SSSE3-NEXT: movl $0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill +; SSSE3-NEXT: jb .LBB2_108 +; SSSE3-NEXT: # %bb.107: +; SSSE3-NEXT: # kill: def $al killed $al def $eax +; SSSE3-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSSE3-NEXT: .LBB2_108: +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSSE3-NEXT: subb {{[0-9]+}}(%rsp), %al +; SSSE3-NEXT: movl $0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill +; SSSE3-NEXT: jb .LBB2_110 +; SSSE3-NEXT: # %bb.109: +; SSSE3-NEXT: # kill: def $al killed $al def $eax +; SSSE3-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSSE3-NEXT: .LBB2_110: +; SSSE3-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSSE3-NEXT: subb {{[0-9]+}}(%rsp), %al +; SSSE3-NEXT: movl $0, %ecx +; SSSE3-NEXT: jb .LBB2_112 +; SSSE3-NEXT: # %bb.111: +; SSSE3-NEXT: movl %eax, %ecx +; SSSE3-NEXT: .LBB2_112: +; SSSE3-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSSE3-NEXT: subb {{[0-9]+}}(%rsp), %al +; SSSE3-NEXT: movl $0, %ecx +; SSSE3-NEXT: jb .LBB2_114 +; SSSE3-NEXT: # %bb.113: +; SSSE3-NEXT: movl %eax, %ecx +; SSSE3-NEXT: .LBB2_114: +; SSSE3-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSSE3-NEXT: subb {{[0-9]+}}(%rsp), %al +; SSSE3-NEXT: movl $0, %ecx +; SSSE3-NEXT: jb .LBB2_116 +; SSSE3-NEXT: # %bb.115: +; SSSE3-NEXT: movl %eax, %ecx +; SSSE3-NEXT: .LBB2_116: +; SSSE3-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSSE3-NEXT: movzbl %sil, %eax +; SSSE3-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSSE3-NEXT: movzbl %bl, %esi +; SSSE3-NEXT: movzbl %bpl, %ebx +; SSSE3-NEXT: movzbl %dil, %ebp +; SSSE3-NEXT: movzbl %r9b, %edi +; SSSE3-NEXT: movzbl %dl, %ecx +; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %edx # 1-byte Folded Reload +; SSSE3-NEXT: movzbl %r10b, %r10d +; SSSE3-NEXT: movzbl %r11b, %r11d +; SSSE3-NEXT: movzbl %r14b, %r14d +; SSSE3-NEXT: movzbl %r15b, %r15d +; SSSE3-NEXT: movzbl %r12b, %r12d +; SSSE3-NEXT: movzbl %r13b, %r13d +; SSSE3-NEXT: movzbl %r8b, %r8d +; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r9d # 1-byte Folded Reload +; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; SSSE3-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; SSSE3-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; SSSE3-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 
1-byte Folded Reload +; SSSE3-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; SSSE3-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; SSSE3-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; SSSE3-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; SSSE3-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; SSSE3-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; SSSE3-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; SSSE3-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; SSSE3-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; SSSE3-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; SSSE3-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; SSSE3-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; SSSE3-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; SSSE3-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; SSSE3-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; SSSE3-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; SSSE3-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; SSSE3-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; SSSE3-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; SSSE3-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; SSSE3-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; SSSE3-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; SSSE3-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; SSSE3-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; SSSE3-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 
1-byte Folded Reload +; SSSE3-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; SSSE3-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; SSSE3-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; SSSE3-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; SSSE3-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSSE3-NEXT: subb {{[0-9]+}}(%rsp), %al +; SSSE3-NEXT: movl $0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill +; SSSE3-NEXT: jb .LBB2_118 +; SSSE3-NEXT: # %bb.117: +; SSSE3-NEXT: # kill: def $al killed $al def $eax +; SSSE3-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSSE3-NEXT: .LBB2_118: +; SSSE3-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 4-byte Folded Reload +; SSSE3-NEXT: # xmm10 = mem[0],zero,zero,zero +; SSSE3-NEXT: movd %esi, %xmm4 +; SSSE3-NEXT: movd %ebx, %xmm0 +; SSSE3-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSSE3-NEXT: movd %ebp, %xmm3 +; SSSE3-NEXT: movd %edi, %xmm0 +; SSSE3-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSSE3-NEXT: movd %ecx, %xmm5 +; SSSE3-NEXT: movd %edx, %xmm0 +; SSSE3-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSSE3-NEXT: movd %r10d, %xmm0 +; SSSE3-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSSE3-NEXT: movd %r11d, %xmm0 +; SSSE3-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSSE3-NEXT: movd %r14d, %xmm0 +; SSSE3-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSSE3-NEXT: movd %r15d, %xmm0 +; SSSE3-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSSE3-NEXT: movd %r12d, %xmm0 +; SSSE3-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSSE3-NEXT: movd %r13d, %xmm0 +; SSSE3-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSSE3-NEXT: movd %r8d, %xmm0 +; SSSE3-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSSE3-NEXT: movd %r9d, %xmm0 +; SSSE3-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSSE3-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload +; SSSE3-NEXT: # xmm0 = mem[0],zero,zero,zero +; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r10d # 1-byte Folded Reload +; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r11d # 1-byte Folded Reload +; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r14d # 1-byte Folded Reload +; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r15d # 1-byte Folded Reload +; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r12d # 1-byte Folded Reload +; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r13d # 1-byte Folded Reload +; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 1-byte Folded Reload +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSSE3-NEXT: subb {{[0-9]+}}(%rsp), %cl +; SSSE3-NEXT: movl $0, %ebx +; SSSE3-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 4-byte Folded Reload +; SSSE3-NEXT: # xmm6 = mem[0],zero,zero,zero +; SSSE3-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Reload +; SSSE3-NEXT: # xmm1 = mem[0],zero,zero,zero +; SSSE3-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSSE3-NEXT: movss 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Reload +; SSSE3-NEXT: # xmm1 = mem[0],zero,zero,zero +; SSSE3-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSSE3-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 4-byte Folded Reload +; SSSE3-NEXT: # xmm15 = mem[0],zero,zero,zero +; SSSE3-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 4-byte Folded Reload +; SSSE3-NEXT: # xmm9 = mem[0],zero,zero,zero +; SSSE3-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Reload +; SSSE3-NEXT: # xmm1 = mem[0],zero,zero,zero +; SSSE3-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSSE3-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 4-byte Folded Reload +; SSSE3-NEXT: # xmm11 = mem[0],zero,zero,zero +; SSSE3-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 4-byte Folded Reload +; SSSE3-NEXT: # xmm14 = mem[0],zero,zero,zero +; SSSE3-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Reload +; SSSE3-NEXT: # xmm1 = mem[0],zero,zero,zero +; SSSE3-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSSE3-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Reload +; SSSE3-NEXT: # xmm1 = mem[0],zero,zero,zero +; SSSE3-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSSE3-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Reload +; SSSE3-NEXT: # xmm1 = mem[0],zero,zero,zero +; SSSE3-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSSE3-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 4-byte Folded Reload +; SSSE3-NEXT: # xmm13 = mem[0],zero,zero,zero +; SSSE3-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Reload +; SSSE3-NEXT: # xmm1 = mem[0],zero,zero,zero +; SSSE3-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSSE3-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Reload +; SSSE3-NEXT: # xmm1 = mem[0],zero,zero,zero +; SSSE3-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSSE3-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Reload +; SSSE3-NEXT: # xmm1 = mem[0],zero,zero,zero +; SSSE3-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSSE3-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Folded Reload +; SSSE3-NEXT: # xmm1 = mem[0],zero,zero,zero +; SSSE3-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 4-byte Reload +; SSSE3-NEXT: # xmm2 = mem[0],zero,zero,zero +; SSSE3-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSSE3-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 4-byte Reload +; SSSE3-NEXT: # xmm2 = mem[0],zero,zero,zero +; SSSE3-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSSE3-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 4-byte Reload +; SSSE3-NEXT: # xmm2 = mem[0],zero,zero,zero +; SSSE3-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSSE3-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 4-byte Folded Reload +; SSSE3-NEXT: # xmm12 = mem[0],zero,zero,zero +; SSSE3-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 4-byte Reload +; SSSE3-NEXT: # xmm2 = mem[0],zero,zero,zero +; SSSE3-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSSE3-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 4-byte Reload +; SSSE3-NEXT: # xmm2 = mem[0],zero,zero,zero +; SSSE3-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSSE3-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 4-byte Reload +; SSSE3-NEXT: # xmm2 = mem[0],zero,zero,zero +; SSSE3-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSSE3-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 4-byte Folded Reload +; SSSE3-NEXT: # xmm7 = mem[0],zero,zero,zero +; 
SSSE3-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 4-byte Reload +; SSSE3-NEXT: # xmm2 = mem[0],zero,zero,zero +; SSSE3-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSSE3-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 4-byte Reload +; SSSE3-NEXT: # xmm2 = mem[0],zero,zero,zero +; SSSE3-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSSE3-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 4-byte Reload +; SSSE3-NEXT: # xmm2 = mem[0],zero,zero,zero +; SSSE3-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSSE3-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 4-byte Folded Reload +; SSSE3-NEXT: # xmm8 = mem[0],zero,zero,zero +; SSSE3-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 4-byte Reload +; SSSE3-NEXT: # xmm2 = mem[0],zero,zero,zero +; SSSE3-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSSE3-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 4-byte Reload +; SSSE3-NEXT: # xmm2 = mem[0],zero,zero,zero +; SSSE3-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSSE3-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 4-byte Reload +; SSSE3-NEXT: # xmm2 = mem[0],zero,zero,zero +; SSSE3-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSSE3-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 4-byte Folded Reload +; SSSE3-NEXT: # xmm2 = mem[0],zero,zero,zero +; SSSE3-NEXT: jb .LBB2_120 +; SSSE3-NEXT: # %bb.119: +; SSSE3-NEXT: movl %ecx, %ebx +; SSSE3-NEXT: .LBB2_120: +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm10[0],xmm4[1],xmm10[1],xmm4[2],xmm10[2],xmm4[3],xmm10[3],xmm4[4],xmm10[4],xmm4[5],xmm10[5],xmm4[6],xmm10[6],xmm4[7],xmm10[7] +; SSSE3-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSSE3-NEXT: punpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSSE3-NEXT: # xmm3 = xmm3[0],mem[0],xmm3[1],mem[1],xmm3[2],mem[2],xmm3[3],mem[3],xmm3[4],mem[4],xmm3[5],mem[5],xmm3[6],mem[6],xmm3[7],mem[7] +; SSSE3-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSSE3-NEXT: punpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload +; SSSE3-NEXT: # xmm5 = xmm5[0],mem[0],xmm5[1],mem[1],xmm5[2],mem[2],xmm5[3],mem[3],xmm5[4],mem[4],xmm5[5],mem[5],xmm5[6],mem[6],xmm5[7],mem[7] +; SSSE3-NEXT: movdqa %xmm5, %xmm4 +; SSSE3-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; SSSE3-NEXT: punpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload +; SSSE3-NEXT: # xmm10 = xmm10[0],mem[0],xmm10[1],mem[1],xmm10[2],mem[2],xmm10[3],mem[3],xmm10[4],mem[4],xmm10[5],mem[5],xmm10[6],mem[6],xmm10[7],mem[7] +; SSSE3-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSSE3-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSSE3-NEXT: punpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSSE3-NEXT: # xmm3 = xmm3[0],mem[0],xmm3[1],mem[1],xmm3[2],mem[2],xmm3[3],mem[3],xmm3[4],mem[4],xmm3[5],mem[5],xmm3[6],mem[6],xmm3[7],mem[7] +; SSSE3-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSSE3-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSSE3-NEXT: punpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSSE3-NEXT: # xmm3 = xmm3[0],mem[0],xmm3[1],mem[1],xmm3[2],mem[2],xmm3[3],mem[3],xmm3[4],mem[4],xmm3[5],mem[5],xmm3[6],mem[6],xmm3[7],mem[7] +; SSSE3-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSSE3-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSSE3-NEXT: punpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSSE3-NEXT: # xmm3 = 
xmm3[0],mem[0],xmm3[1],mem[1],xmm3[2],mem[2],xmm3[3],mem[3],xmm3[4],mem[4],xmm3[5],mem[5],xmm3[6],mem[6],xmm3[7],mem[7] +; SSSE3-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSSE3-NEXT: punpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSSE3-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3],xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] +; SSSE3-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm6[0],xmm3[1],xmm6[1],xmm3[2],xmm6[2],xmm3[3],xmm6[3],xmm3[4],xmm6[4],xmm3[5],xmm6[5],xmm3[6],xmm6[6],xmm3[7],xmm6[7] +; SSSE3-NEXT: movdqa %xmm3, %xmm6 +; SSSE3-NEXT: punpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload +; SSSE3-NEXT: # xmm15 = xmm15[0],mem[0],xmm15[1],mem[1],xmm15[2],mem[2],xmm15[3],mem[3],xmm15[4],mem[4],xmm15[5],mem[5],xmm15[6],mem[6],xmm15[7],mem[7] +; SSSE3-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm9[0],xmm3[1],xmm9[1],xmm3[2],xmm9[2],xmm3[3],xmm9[3],xmm3[4],xmm9[4],xmm3[5],xmm9[5],xmm3[6],xmm9[6],xmm3[7],xmm9[7] +; SSSE3-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm14 = xmm14[0],xmm11[0],xmm14[1],xmm11[1],xmm14[2],xmm11[2],xmm14[3],xmm11[3],xmm14[4],xmm11[4],xmm14[5],xmm11[5],xmm14[6],xmm11[6],xmm14[7],xmm11[7] +; SSSE3-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSSE3-NEXT: punpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSSE3-NEXT: # xmm3 = xmm3[0],mem[0],xmm3[1],mem[1],xmm3[2],mem[2],xmm3[3],mem[3],xmm3[4],mem[4],xmm3[5],mem[5],xmm3[6],mem[6],xmm3[7],mem[7] +; SSSE3-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSSE3-NEXT: punpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload +; SSSE3-NEXT: # xmm13 = xmm13[0],mem[0],xmm13[1],mem[1],xmm13[2],mem[2],xmm13[3],mem[3],xmm13[4],mem[4],xmm13[5],mem[5],xmm13[6],mem[6],xmm13[7],mem[7] +; SSSE3-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSSE3-NEXT: punpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSSE3-NEXT: # xmm3 = xmm3[0],mem[0],xmm3[1],mem[1],xmm3[2],mem[2],xmm3[3],mem[3],xmm3[4],mem[4],xmm3[5],mem[5],xmm3[6],mem[6],xmm3[7],mem[7] +; SSSE3-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSSE3-NEXT: punpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSSE3-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3],xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] +; SSSE3-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSSE3-NEXT: punpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSSE3-NEXT: # xmm3 = xmm3[0],mem[0],xmm3[1],mem[1],xmm3[2],mem[2],xmm3[3],mem[3],xmm3[4],mem[4],xmm3[5],mem[5],xmm3[6],mem[6],xmm3[7],mem[7] +; SSSE3-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSSE3-NEXT: punpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload +; SSSE3-NEXT: # xmm12 = xmm12[0],mem[0],xmm12[1],mem[1],xmm12[2],mem[2],xmm12[3],mem[3],xmm12[4],mem[4],xmm12[5],mem[5],xmm12[6],mem[6],xmm12[7],mem[7] +; SSSE3-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSSE3-NEXT: punpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSSE3-NEXT: # xmm3 = xmm3[0],mem[0],xmm3[1],mem[1],xmm3[2],mem[2],xmm3[3],mem[3],xmm3[4],mem[4],xmm3[5],mem[5],xmm3[6],mem[6],xmm3[7],mem[7] +; SSSE3-NEXT: movdqa %xmm3, 
{{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSSE3-NEXT: punpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload +; SSSE3-NEXT: # xmm7 = xmm7[0],mem[0],xmm7[1],mem[1],xmm7[2],mem[2],xmm7[3],mem[3],xmm7[4],mem[4],xmm7[5],mem[5],xmm7[6],mem[6],xmm7[7],mem[7] +; SSSE3-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSSE3-NEXT: punpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSSE3-NEXT: # xmm3 = xmm3[0],mem[0],xmm3[1],mem[1],xmm3[2],mem[2],xmm3[3],mem[3],xmm3[4],mem[4],xmm3[5],mem[5],xmm3[6],mem[6],xmm3[7],mem[7] +; SSSE3-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSSE3-NEXT: punpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload +; SSSE3-NEXT: # xmm8 = xmm8[0],mem[0],xmm8[1],mem[1],xmm8[2],mem[2],xmm8[3],mem[3],xmm8[4],mem[4],xmm8[5],mem[5],xmm8[6],mem[6],xmm8[7],mem[7] +; SSSE3-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSSE3-NEXT: punpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSSE3-NEXT: # xmm3 = xmm3[0],mem[0],xmm3[1],mem[1],xmm3[2],mem[2],xmm3[3],mem[3],xmm3[4],mem[4],xmm3[5],mem[5],xmm3[6],mem[6],xmm3[7],mem[7] +; SSSE3-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSSE3-NEXT: punpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSSE3-NEXT: # xmm2 = xmm2[0],mem[0],xmm2[1],mem[1],xmm2[2],mem[2],xmm2[3],mem[3],xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7] +; SSSE3-NEXT: movd %r10d, %xmm3 +; SSSE3-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSSE3-NEXT: movd %r11d, %xmm10 +; SSSE3-NEXT: movd %r14d, %xmm3 +; SSSE3-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSSE3-NEXT: movd %eax, %xmm9 +; SSSE3-NEXT: movd %r15d, %xmm3 +; SSSE3-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSSE3-NEXT: movd %r12d, %xmm11 +; SSSE3-NEXT: movd %r13d, %xmm3 +; SSSE3-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSSE3-NEXT: movd %esi, %xmm5 +; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload +; SSSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %edx # 1-byte Folded Reload +; SSSE3-NEXT: movzbl %bl, %esi +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %bl +; SSSE3-NEXT: subb {{[0-9]+}}(%rsp), %bl +; SSSE3-NEXT: movl $0, %edi +; SSSE3-NEXT: jb .LBB2_122 +; SSSE3-NEXT: # %bb.121: +; SSSE3-NEXT: movl %ebx, %edi +; SSSE3-NEXT: .LBB2_122: +; SSSE3-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSSE3-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSSE3-NEXT: # xmm3 = xmm3[0],mem[0],xmm3[1],mem[1],xmm3[2],mem[2],xmm3[3],mem[3] +; SSSE3-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSSE3-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] +; SSSE3-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSSE3-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSSE3-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSSE3-NEXT: # xmm3 = xmm3[0],mem[0],xmm3[1],mem[1],xmm3[2],mem[2],xmm3[3],mem[3] +; SSSE3-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSSE3-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSSE3-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm15 = 
xmm15[0],xmm6[0],xmm15[1],xmm6[1],xmm15[2],xmm6[2],xmm15[3],xmm6[3] +; SSSE3-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload +; SSSE3-NEXT: # xmm14 = xmm14[0],mem[0],xmm14[1],mem[1],xmm14[2],mem[2],xmm14[3],mem[3] +; SSSE3-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload +; SSSE3-NEXT: # xmm13 = xmm13[0],mem[0],xmm13[1],mem[1],xmm13[2],mem[2],xmm13[3],mem[3] +; SSSE3-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSSE3-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3] +; SSSE3-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload +; SSSE3-NEXT: # xmm12 = xmm12[0],mem[0],xmm12[1],mem[1],xmm12[2],mem[2],xmm12[3],mem[3] +; SSSE3-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload +; SSSE3-NEXT: # xmm7 = xmm7[0],mem[0],xmm7[1],mem[1],xmm7[2],mem[2],xmm7[3],mem[3] +; SSSE3-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload +; SSSE3-NEXT: # xmm8 = xmm8[0],mem[0],xmm8[1],mem[1],xmm8[2],mem[2],xmm8[3],mem[3] +; SSSE3-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSSE3-NEXT: # xmm2 = xmm2[0],mem[0],xmm2[1],mem[1],xmm2[2],mem[2],xmm2[3],mem[3] +; SSSE3-NEXT: punpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload +; SSSE3-NEXT: # xmm10 = xmm10[0],mem[0],xmm10[1],mem[1],xmm10[2],mem[2],xmm10[3],mem[3],xmm10[4],mem[4],xmm10[5],mem[5],xmm10[6],mem[6],xmm10[7],mem[7] +; SSSE3-NEXT: punpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload +; SSSE3-NEXT: # xmm9 = xmm9[0],mem[0],xmm9[1],mem[1],xmm9[2],mem[2],xmm9[3],mem[3],xmm9[4],mem[4],xmm9[5],mem[5],xmm9[6],mem[6],xmm9[7],mem[7] +; SSSE3-NEXT: punpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload +; SSSE3-NEXT: # xmm11 = xmm11[0],mem[0],xmm11[1],mem[1],xmm11[2],mem[2],xmm11[3],mem[3],xmm11[4],mem[4],xmm11[5],mem[5],xmm11[6],mem[6],xmm11[7],mem[7] +; SSSE3-NEXT: punpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload +; SSSE3-NEXT: # xmm5 = xmm5[0],mem[0],xmm5[1],mem[1],xmm5[2],mem[2],xmm5[3],mem[3],xmm5[4],mem[4],xmm5[5],mem[5],xmm5[6],mem[6],xmm5[7],mem[7] +; SSSE3-NEXT: movd %eax, %xmm3 +; SSSE3-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSSE3-NEXT: movd %ecx, %xmm3 +; SSSE3-NEXT: movd %edx, %xmm4 +; SSSE3-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSSE3-NEXT: movd %esi, %xmm4 +; SSSE3-NEXT: movzbl %dil, %eax +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %dl +; SSSE3-NEXT: subb {{[0-9]+}}(%rsp), %dl +; SSSE3-NEXT: movl $0, %ecx +; SSSE3-NEXT: jb .LBB2_124 +; SSSE3-NEXT: # %bb.123: +; SSSE3-NEXT: movl %edx, %ecx +; SSSE3-NEXT: .LBB2_124: +; SSSE3-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSSE3-NEXT: unpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload +; SSSE3-NEXT: # xmm6 = xmm6[0],mem[0],xmm6[1],mem[1] +; SSSE3-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSSE3-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSSE3-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] +; SSSE3-NEXT: punpckldq {{.*#+}} xmm14 = xmm14[0],xmm15[0],xmm14[1],xmm15[1] +; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm13[0],xmm1[1],xmm13[1] +; SSSE3-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm12[0],xmm7[1],xmm12[1] +; SSSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm8[0],xmm2[1],xmm8[1] +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3] +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm5 = 
xmm5[0],xmm11[0],xmm5[1],xmm11[1],xmm5[2],xmm11[2],xmm5[3],xmm11[3] +; SSSE3-NEXT: punpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSSE3-NEXT: # xmm3 = xmm3[0],mem[0],xmm3[1],mem[1],xmm3[2],mem[2],xmm3[3],mem[3],xmm3[4],mem[4],xmm3[5],mem[5],xmm3[6],mem[6],xmm3[7],mem[7] +; SSSE3-NEXT: punpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSSE3-NEXT: # xmm4 = xmm4[0],mem[0],xmm4[1],mem[1],xmm4[2],mem[2],xmm4[3],mem[3],xmm4[4],mem[4],xmm4[5],mem[5],xmm4[6],mem[6],xmm4[7],mem[7] +; SSSE3-NEXT: movd %eax, %xmm8 +; SSSE3-NEXT: movzbl %cl, %eax +; SSSE3-NEXT: movd %eax, %xmm6 +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %dl +; SSSE3-NEXT: subb {{[0-9]+}}(%rsp), %dl +; SSSE3-NEXT: movl $0, %ecx +; SSSE3-NEXT: jb .LBB2_126 +; SSSE3-NEXT: # %bb.125: +; SSSE3-NEXT: movl %edx, %ecx +; SSSE3-NEXT: .LBB2_126: +; SSSE3-NEXT: punpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSSE3-NEXT: # xmm0 = xmm0[0],mem[0] +; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm14[0] +; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm7[0] +; SSSE3-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm9[0],xmm5[1],xmm9[1] +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm8[0],xmm6[1],xmm8[1],xmm6[2],xmm8[2],xmm6[3],xmm8[3],xmm6[4],xmm8[4],xmm6[5],xmm8[5],xmm6[6],xmm8[6],xmm6[7],xmm8[7] +; SSSE3-NEXT: movzbl %cl, %ecx +; SSSE3-NEXT: movd %ecx, %xmm7 +; SSSE3-NEXT: subb {{[0-9]+}}(%rsp), %al +; SSSE3-NEXT: movl $0, %ecx +; SSSE3-NEXT: jb .LBB2_128 +; SSSE3-NEXT: # %bb.127: +; SSSE3-NEXT: movl %eax, %ecx +; SSSE3-NEXT: .LBB2_128: +; SSSE3-NEXT: movzbl %cl, %eax +; SSSE3-NEXT: movd %eax, %xmm3 +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm7[0],xmm3[1],xmm7[1],xmm3[2],xmm7[2],xmm3[3],xmm7[3],xmm3[4],xmm7[4],xmm3[5],xmm7[5],xmm3[6],xmm7[6],xmm3[7],xmm7[7] +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm6[0],xmm3[1],xmm6[1],xmm3[2],xmm6[2],xmm3[3],xmm6[3] +; SSSE3-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] +; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm5[0] +; SSSE3-NEXT: addq $648, %rsp # imm = 0x288 +; SSSE3-NEXT: popq %rbx +; SSSE3-NEXT: popq %r12 +; SSSE3-NEXT: popq %r13 +; SSSE3-NEXT: popq %r14 +; SSSE3-NEXT: popq %r15 +; SSSE3-NEXT: popq %rbp +; SSSE3-NEXT: retq +; +; SSE41-LABEL: v64i8: +; SSE41: # %bb.0: +; SSE41-NEXT: pextrb $1, %xmm4, %ecx +; SSE41-NEXT: pextrb $1, %xmm0, %eax +; SSE41-NEXT: subb %cl, %al +; SSE41-NEXT: movl $0, %edi +; SSE41-NEXT: jb .LBB2_2 +; SSE41-NEXT: # %bb.1: +; SSE41-NEXT: movl %eax, %edi +; SSE41-NEXT: .LBB2_2: +; SSE41-NEXT: pextrb $0, %xmm4, %ecx +; SSE41-NEXT: pextrb $0, %xmm0, %eax +; SSE41-NEXT: subb %cl, %al +; SSE41-NEXT: movl $0, %edx +; SSE41-NEXT: jb .LBB2_4 +; SSE41-NEXT: # %bb.3: +; SSE41-NEXT: movl %eax, %edx +; SSE41-NEXT: .LBB2_4: +; SSE41-NEXT: pextrb $2, %xmm4, %ecx +; SSE41-NEXT: pextrb $2, %xmm0, %eax +; SSE41-NEXT: subb %cl, %al +; SSE41-NEXT: movl $0, %ecx +; SSE41-NEXT: jb .LBB2_6 +; SSE41-NEXT: # %bb.5: +; SSE41-NEXT: movl %eax, %ecx +; SSE41-NEXT: .LBB2_6: +; SSE41-NEXT: pushq %rbp +; SSE41-NEXT: pushq %r15 +; SSE41-NEXT: pushq %r14 +; SSE41-NEXT: pushq %r13 +; SSE41-NEXT: pushq %r12 +; SSE41-NEXT: pushq %rbx +; SSE41-NEXT: subq $16, %rsp +; SSE41-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE41-NEXT: pextrb $3, %xmm4, %ecx +; SSE41-NEXT: pextrb $3, %xmm0, %eax +; SSE41-NEXT: subb %cl, %al +; SSE41-NEXT: movl $0, %ecx +; 
SSE41-NEXT: jb .LBB2_8 +; SSE41-NEXT: # %bb.7: +; SSE41-NEXT: movl %eax, %ecx +; SSE41-NEXT: .LBB2_8: +; SSE41-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE41-NEXT: pextrb $4, %xmm4, %ecx +; SSE41-NEXT: pextrb $4, %xmm0, %eax +; SSE41-NEXT: subb %cl, %al +; SSE41-NEXT: movl $0, %ecx +; SSE41-NEXT: jb .LBB2_10 +; SSE41-NEXT: # %bb.9: +; SSE41-NEXT: movl %eax, %ecx +; SSE41-NEXT: .LBB2_10: +; SSE41-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE41-NEXT: pextrb $5, %xmm4, %ecx +; SSE41-NEXT: pextrb $5, %xmm0, %eax +; SSE41-NEXT: subb %cl, %al +; SSE41-NEXT: movl $0, %ecx +; SSE41-NEXT: jb .LBB2_12 +; SSE41-NEXT: # %bb.11: +; SSE41-NEXT: movl %eax, %ecx +; SSE41-NEXT: .LBB2_12: +; SSE41-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE41-NEXT: pextrb $6, %xmm4, %ecx +; SSE41-NEXT: pextrb $6, %xmm0, %eax +; SSE41-NEXT: subb %cl, %al +; SSE41-NEXT: movl $0, %ecx +; SSE41-NEXT: jb .LBB2_14 +; SSE41-NEXT: # %bb.13: +; SSE41-NEXT: movl %eax, %ecx +; SSE41-NEXT: .LBB2_14: +; SSE41-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE41-NEXT: pextrb $7, %xmm4, %ecx +; SSE41-NEXT: pextrb $7, %xmm0, %eax +; SSE41-NEXT: subb %cl, %al +; SSE41-NEXT: movl $0, %ecx +; SSE41-NEXT: jb .LBB2_16 +; SSE41-NEXT: # %bb.15: +; SSE41-NEXT: movl %eax, %ecx +; SSE41-NEXT: .LBB2_16: +; SSE41-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE41-NEXT: pextrb $8, %xmm4, %ecx +; SSE41-NEXT: pextrb $8, %xmm0, %eax +; SSE41-NEXT: subb %cl, %al +; SSE41-NEXT: movl $0, %ecx +; SSE41-NEXT: jb .LBB2_18 +; SSE41-NEXT: # %bb.17: +; SSE41-NEXT: movl %eax, %ecx +; SSE41-NEXT: .LBB2_18: +; SSE41-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE41-NEXT: pextrb $9, %xmm4, %ecx +; SSE41-NEXT: pextrb $9, %xmm0, %eax +; SSE41-NEXT: subb %cl, %al +; SSE41-NEXT: movl $0, %ecx +; SSE41-NEXT: jb .LBB2_20 +; SSE41-NEXT: # %bb.19: +; SSE41-NEXT: movl %eax, %ecx +; SSE41-NEXT: .LBB2_20: +; SSE41-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE41-NEXT: pextrb $10, %xmm4, %ecx +; SSE41-NEXT: pextrb $10, %xmm0, %eax +; SSE41-NEXT: subb %cl, %al +; SSE41-NEXT: movl $0, %ecx +; SSE41-NEXT: jb .LBB2_22 +; SSE41-NEXT: # %bb.21: +; SSE41-NEXT: movl %eax, %ecx +; SSE41-NEXT: .LBB2_22: +; SSE41-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE41-NEXT: pextrb $11, %xmm4, %ecx +; SSE41-NEXT: pextrb $11, %xmm0, %eax +; SSE41-NEXT: subb %cl, %al +; SSE41-NEXT: movl $0, %ecx +; SSE41-NEXT: jb .LBB2_24 +; SSE41-NEXT: # %bb.23: +; SSE41-NEXT: movl %eax, %ecx +; SSE41-NEXT: .LBB2_24: +; SSE41-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE41-NEXT: pextrb $12, %xmm4, %ecx +; SSE41-NEXT: pextrb $12, %xmm0, %eax +; SSE41-NEXT: subb %cl, %al +; SSE41-NEXT: movl $0, %ecx +; SSE41-NEXT: jb .LBB2_26 +; SSE41-NEXT: # %bb.25: +; SSE41-NEXT: movl %eax, %ecx +; SSE41-NEXT: .LBB2_26: +; SSE41-NEXT: movl %ecx, (%rsp) # 4-byte Spill +; SSE41-NEXT: pextrb $13, %xmm4, %ecx +; SSE41-NEXT: pextrb $13, %xmm0, %eax +; SSE41-NEXT: subb %cl, %al +; SSE41-NEXT: movl $0, %ecx +; SSE41-NEXT: jb .LBB2_28 +; SSE41-NEXT: # %bb.27: +; SSE41-NEXT: movl %eax, %ecx +; SSE41-NEXT: .LBB2_28: +; SSE41-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE41-NEXT: pextrb $14, %xmm4, %ecx +; SSE41-NEXT: pextrb $14, %xmm0, %eax +; SSE41-NEXT: subb %cl, %al +; SSE41-NEXT: movl $0, %ecx +; SSE41-NEXT: jb .LBB2_30 +; SSE41-NEXT: # %bb.29: +; SSE41-NEXT: movl %eax, %ecx +; SSE41-NEXT: .LBB2_30: +; SSE41-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; 
SSE41-NEXT: pextrb $15, %xmm4, %ecx +; SSE41-NEXT: pextrb $15, %xmm0, %eax +; SSE41-NEXT: subb %cl, %al +; SSE41-NEXT: movl $0, %ecx +; SSE41-NEXT: jb .LBB2_32 +; SSE41-NEXT: # %bb.31: +; SSE41-NEXT: movl %eax, %ecx +; SSE41-NEXT: .LBB2_32: +; SSE41-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE41-NEXT: pextrb $1, %xmm5, %ecx +; SSE41-NEXT: pextrb $1, %xmm1, %eax +; SSE41-NEXT: subb %cl, %al +; SSE41-NEXT: movl $0, %ecx +; SSE41-NEXT: jb .LBB2_34 +; SSE41-NEXT: # %bb.33: +; SSE41-NEXT: movl %eax, %ecx +; SSE41-NEXT: .LBB2_34: +; SSE41-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE41-NEXT: pextrb $0, %xmm5, %ecx +; SSE41-NEXT: pextrb $0, %xmm1, %eax +; SSE41-NEXT: subb %cl, %al +; SSE41-NEXT: movl $0, %ebx +; SSE41-NEXT: jb .LBB2_36 +; SSE41-NEXT: # %bb.35: +; SSE41-NEXT: movl %eax, %ebx +; SSE41-NEXT: .LBB2_36: +; SSE41-NEXT: pextrb $2, %xmm5, %ecx +; SSE41-NEXT: pextrb $2, %xmm1, %eax +; SSE41-NEXT: subb %cl, %al +; SSE41-NEXT: movl $0, %ecx +; SSE41-NEXT: jb .LBB2_38 +; SSE41-NEXT: # %bb.37: +; SSE41-NEXT: movl %eax, %ecx +; SSE41-NEXT: .LBB2_38: +; SSE41-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE41-NEXT: pextrb $3, %xmm5, %ecx +; SSE41-NEXT: pextrb $3, %xmm1, %eax +; SSE41-NEXT: subb %cl, %al +; SSE41-NEXT: movl $0, %ecx +; SSE41-NEXT: jb .LBB2_40 +; SSE41-NEXT: # %bb.39: +; SSE41-NEXT: movl %eax, %ecx +; SSE41-NEXT: .LBB2_40: +; SSE41-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE41-NEXT: pextrb $4, %xmm5, %ecx +; SSE41-NEXT: pextrb $4, %xmm1, %eax +; SSE41-NEXT: subb %cl, %al +; SSE41-NEXT: movl $0, %ecx +; SSE41-NEXT: jb .LBB2_42 +; SSE41-NEXT: # %bb.41: +; SSE41-NEXT: movl %eax, %ecx +; SSE41-NEXT: .LBB2_42: +; SSE41-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE41-NEXT: pextrb $5, %xmm5, %ecx +; SSE41-NEXT: pextrb $5, %xmm1, %eax +; SSE41-NEXT: subb %cl, %al +; SSE41-NEXT: movl $0, %ecx +; SSE41-NEXT: jb .LBB2_44 +; SSE41-NEXT: # %bb.43: +; SSE41-NEXT: movl %eax, %ecx +; SSE41-NEXT: .LBB2_44: +; SSE41-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE41-NEXT: pextrb $6, %xmm5, %ecx +; SSE41-NEXT: pextrb $6, %xmm1, %eax +; SSE41-NEXT: subb %cl, %al +; SSE41-NEXT: movl $0, %ecx +; SSE41-NEXT: jb .LBB2_46 +; SSE41-NEXT: # %bb.45: +; SSE41-NEXT: movl %eax, %ecx +; SSE41-NEXT: .LBB2_46: +; SSE41-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE41-NEXT: pextrb $7, %xmm5, %ecx +; SSE41-NEXT: pextrb $7, %xmm1, %eax +; SSE41-NEXT: subb %cl, %al +; SSE41-NEXT: movl $0, %ecx +; SSE41-NEXT: jb .LBB2_48 +; SSE41-NEXT: # %bb.47: +; SSE41-NEXT: movl %eax, %ecx +; SSE41-NEXT: .LBB2_48: +; SSE41-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE41-NEXT: pextrb $8, %xmm5, %ecx +; SSE41-NEXT: pextrb $8, %xmm1, %eax +; SSE41-NEXT: subb %cl, %al +; SSE41-NEXT: movl $0, %ecx +; SSE41-NEXT: jb .LBB2_50 +; SSE41-NEXT: # %bb.49: +; SSE41-NEXT: movl %eax, %ecx +; SSE41-NEXT: .LBB2_50: +; SSE41-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE41-NEXT: pextrb $9, %xmm5, %ecx +; SSE41-NEXT: pextrb $9, %xmm1, %eax +; SSE41-NEXT: subb %cl, %al +; SSE41-NEXT: movl $0, %ecx +; SSE41-NEXT: jb .LBB2_52 +; SSE41-NEXT: # %bb.51: +; SSE41-NEXT: movl %eax, %ecx +; SSE41-NEXT: .LBB2_52: +; SSE41-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE41-NEXT: pextrb $10, %xmm5, %ecx +; SSE41-NEXT: pextrb $10, %xmm1, %eax +; SSE41-NEXT: subb %cl, %al +; SSE41-NEXT: movl $0, %ecx +; SSE41-NEXT: jb .LBB2_54 +; SSE41-NEXT: # %bb.53: +; SSE41-NEXT: movl %eax, %ecx +; SSE41-NEXT: 
.LBB2_54: +; SSE41-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE41-NEXT: pextrb $11, %xmm5, %ecx +; SSE41-NEXT: pextrb $11, %xmm1, %eax +; SSE41-NEXT: subb %cl, %al +; SSE41-NEXT: movl $0, %ecx +; SSE41-NEXT: jb .LBB2_56 +; SSE41-NEXT: # %bb.55: +; SSE41-NEXT: movl %eax, %ecx +; SSE41-NEXT: .LBB2_56: +; SSE41-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE41-NEXT: pextrb $12, %xmm5, %ecx +; SSE41-NEXT: pextrb $12, %xmm1, %eax +; SSE41-NEXT: subb %cl, %al +; SSE41-NEXT: movl $0, %ecx +; SSE41-NEXT: jb .LBB2_58 +; SSE41-NEXT: # %bb.57: +; SSE41-NEXT: movl %eax, %ecx +; SSE41-NEXT: .LBB2_58: +; SSE41-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE41-NEXT: pextrb $13, %xmm5, %ecx +; SSE41-NEXT: pextrb $13, %xmm1, %eax +; SSE41-NEXT: subb %cl, %al +; SSE41-NEXT: movl $0, %ecx +; SSE41-NEXT: jb .LBB2_60 +; SSE41-NEXT: # %bb.59: +; SSE41-NEXT: movl %eax, %ecx +; SSE41-NEXT: .LBB2_60: +; SSE41-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE41-NEXT: pextrb $14, %xmm5, %ecx +; SSE41-NEXT: pextrb $14, %xmm1, %eax +; SSE41-NEXT: subb %cl, %al +; SSE41-NEXT: movl $0, %ecx +; SSE41-NEXT: jb .LBB2_62 +; SSE41-NEXT: # %bb.61: +; SSE41-NEXT: movl %eax, %ecx +; SSE41-NEXT: .LBB2_62: +; SSE41-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE41-NEXT: pextrb $15, %xmm5, %ecx +; SSE41-NEXT: pextrb $15, %xmm1, %eax +; SSE41-NEXT: subb %cl, %al +; SSE41-NEXT: movl $0, %ecx +; SSE41-NEXT: jb .LBB2_64 +; SSE41-NEXT: # %bb.63: +; SSE41-NEXT: movl %eax, %ecx +; SSE41-NEXT: .LBB2_64: +; SSE41-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE41-NEXT: pextrb $1, %xmm6, %ecx +; SSE41-NEXT: pextrb $1, %xmm2, %eax +; SSE41-NEXT: subb %cl, %al +; SSE41-NEXT: movl $0, %r9d +; SSE41-NEXT: jb .LBB2_66 +; SSE41-NEXT: # %bb.65: +; SSE41-NEXT: movl %eax, %r9d +; SSE41-NEXT: .LBB2_66: +; SSE41-NEXT: pextrb $0, %xmm6, %ecx +; SSE41-NEXT: pextrb $0, %xmm2, %eax +; SSE41-NEXT: subb %cl, %al +; SSE41-NEXT: movl $0, %esi +; SSE41-NEXT: jb .LBB2_68 +; SSE41-NEXT: # %bb.67: +; SSE41-NEXT: movl %eax, %esi +; SSE41-NEXT: .LBB2_68: +; SSE41-NEXT: pextrb $2, %xmm6, %ecx +; SSE41-NEXT: pextrb $2, %xmm2, %eax +; SSE41-NEXT: subb %cl, %al +; SSE41-NEXT: movl $0, %r10d +; SSE41-NEXT: jb .LBB2_70 +; SSE41-NEXT: # %bb.69: +; SSE41-NEXT: movl %eax, %r10d +; SSE41-NEXT: .LBB2_70: +; SSE41-NEXT: pextrb $3, %xmm6, %ecx +; SSE41-NEXT: pextrb $3, %xmm2, %eax +; SSE41-NEXT: subb %cl, %al +; SSE41-NEXT: movl $0, %r8d +; SSE41-NEXT: jb .LBB2_72 +; SSE41-NEXT: # %bb.71: +; SSE41-NEXT: movl %eax, %r8d +; SSE41-NEXT: .LBB2_72: +; SSE41-NEXT: pextrb $4, %xmm6, %ecx +; SSE41-NEXT: pextrb $4, %xmm2, %eax +; SSE41-NEXT: subb %cl, %al +; SSE41-NEXT: movl $0, %r14d +; SSE41-NEXT: jb .LBB2_74 +; SSE41-NEXT: # %bb.73: +; SSE41-NEXT: movl %eax, %r14d +; SSE41-NEXT: .LBB2_74: +; SSE41-NEXT: pextrb $5, %xmm6, %ecx +; SSE41-NEXT: pextrb $5, %xmm2, %eax +; SSE41-NEXT: subb %cl, %al +; SSE41-NEXT: movl $0, %r11d +; SSE41-NEXT: jb .LBB2_76 +; SSE41-NEXT: # %bb.75: +; SSE41-NEXT: movl %eax, %r11d +; SSE41-NEXT: .LBB2_76: +; SSE41-NEXT: pextrb $6, %xmm6, %ecx +; SSE41-NEXT: pextrb $6, %xmm2, %eax +; SSE41-NEXT: subb %cl, %al +; SSE41-NEXT: movl $0, %r13d +; SSE41-NEXT: jb .LBB2_78 +; SSE41-NEXT: # %bb.77: +; SSE41-NEXT: movl %eax, %r13d +; SSE41-NEXT: .LBB2_78: +; SSE41-NEXT: pextrb $7, %xmm6, %ecx +; SSE41-NEXT: pextrb $7, %xmm2, %eax +; SSE41-NEXT: subb %cl, %al +; SSE41-NEXT: movl $0, %r12d +; SSE41-NEXT: jb .LBB2_80 +; SSE41-NEXT: # %bb.79: +; SSE41-NEXT: movl %eax, %r12d +; 
SSE41-NEXT: .LBB2_80: +; SSE41-NEXT: pextrb $8, %xmm6, %ecx +; SSE41-NEXT: pextrb $8, %xmm2, %eax +; SSE41-NEXT: subb %cl, %al +; SSE41-NEXT: movl $0, %r15d +; SSE41-NEXT: jb .LBB2_82 +; SSE41-NEXT: # %bb.81: +; SSE41-NEXT: movl %eax, %r15d +; SSE41-NEXT: .LBB2_82: +; SSE41-NEXT: pextrb $9, %xmm6, %ecx +; SSE41-NEXT: pextrb $9, %xmm2, %eax +; SSE41-NEXT: subb %cl, %al +; SSE41-NEXT: movl $0, %ecx +; SSE41-NEXT: jb .LBB2_84 +; SSE41-NEXT: # %bb.83: +; SSE41-NEXT: movl %eax, %ecx +; SSE41-NEXT: .LBB2_84: +; SSE41-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE41-NEXT: pextrb $10, %xmm6, %ecx +; SSE41-NEXT: pextrb $10, %xmm2, %eax +; SSE41-NEXT: subb %cl, %al +; SSE41-NEXT: movl $0, %ecx +; SSE41-NEXT: jb .LBB2_86 +; SSE41-NEXT: # %bb.85: +; SSE41-NEXT: movl %eax, %ecx +; SSE41-NEXT: .LBB2_86: +; SSE41-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE41-NEXT: pextrb $11, %xmm6, %ecx +; SSE41-NEXT: pextrb $11, %xmm2, %eax +; SSE41-NEXT: subb %cl, %al +; SSE41-NEXT: movl $0, %ecx +; SSE41-NEXT: jb .LBB2_88 +; SSE41-NEXT: # %bb.87: +; SSE41-NEXT: movl %eax, %ecx +; SSE41-NEXT: .LBB2_88: +; SSE41-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE41-NEXT: pextrb $12, %xmm6, %ecx +; SSE41-NEXT: pextrb $12, %xmm2, %eax +; SSE41-NEXT: subb %cl, %al +; SSE41-NEXT: movl $0, %ecx +; SSE41-NEXT: jb .LBB2_90 +; SSE41-NEXT: # %bb.89: +; SSE41-NEXT: movl %eax, %ecx +; SSE41-NEXT: .LBB2_90: +; SSE41-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE41-NEXT: pextrb $13, %xmm6, %ecx +; SSE41-NEXT: pextrb $13, %xmm2, %eax +; SSE41-NEXT: subb %cl, %al +; SSE41-NEXT: movl $0, %ecx +; SSE41-NEXT: jb .LBB2_92 +; SSE41-NEXT: # %bb.91: +; SSE41-NEXT: movl %eax, %ecx +; SSE41-NEXT: .LBB2_92: +; SSE41-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE41-NEXT: pextrb $14, %xmm6, %ecx +; SSE41-NEXT: pextrb $14, %xmm2, %eax +; SSE41-NEXT: subb %cl, %al +; SSE41-NEXT: movl $0, %ecx +; SSE41-NEXT: jb .LBB2_94 +; SSE41-NEXT: # %bb.93: +; SSE41-NEXT: movl %eax, %ecx +; SSE41-NEXT: .LBB2_94: +; SSE41-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE41-NEXT: movzbl %dl, %eax +; SSE41-NEXT: movzbl %bl, %ebp +; SSE41-NEXT: movzbl %sil, %ebx +; SSE41-NEXT: pextrb $15, %xmm6, %edx +; SSE41-NEXT: pextrb $15, %xmm2, %ecx +; SSE41-NEXT: subb %dl, %cl +; SSE41-NEXT: movl $0, %edx +; SSE41-NEXT: jb .LBB2_96 +; SSE41-NEXT: # %bb.95: +; SSE41-NEXT: movl %ecx, %edx +; SSE41-NEXT: .LBB2_96: +; SSE41-NEXT: movl %edx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE41-NEXT: movzbl %dil, %edi +; SSE41-NEXT: movd %eax, %xmm0 +; SSE41-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; SSE41-NEXT: movd %ebp, %xmm1 +; SSE41-NEXT: movzbl %r9b, %esi +; SSE41-NEXT: movd %ebx, %xmm2 +; SSE41-NEXT: pextrb $1, %xmm7, %edx +; SSE41-NEXT: pextrb $1, %xmm3, %ecx +; SSE41-NEXT: subb %dl, %cl +; SSE41-NEXT: movl $0, %ebx +; SSE41-NEXT: jb .LBB2_98 +; SSE41-NEXT: # %bb.97: +; SSE41-NEXT: movl %ecx, %ebx +; SSE41-NEXT: .LBB2_98: +; SSE41-NEXT: pinsrb $1, %edi, %xmm0 +; SSE41-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 1-byte Folded Reload +; SSE41-NEXT: pinsrb $1, %eax, %xmm1 +; SSE41-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ebp # 1-byte Folded Reload +; SSE41-NEXT: pinsrb $1, %esi, %xmm2 +; SSE41-NEXT: movzbl %r10b, %eax +; SSE41-NEXT: movzbl %bl, %esi +; SSE41-NEXT: pextrb $0, %xmm7, %edx +; SSE41-NEXT: pextrb $0, %xmm3, %ecx +; SSE41-NEXT: subb %dl, %cl +; SSE41-NEXT: movl $0, %edx +; SSE41-NEXT: jb .LBB2_100 +; SSE41-NEXT: # %bb.99: +; SSE41-NEXT: movl 
%ecx, %edx +; SSE41-NEXT: .LBB2_100: +; SSE41-NEXT: pinsrb $2, %edi, %xmm0 +; SSE41-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ebx # 1-byte Folded Reload +; SSE41-NEXT: pinsrb $2, %ebp, %xmm1 +; SSE41-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ebp # 1-byte Folded Reload +; SSE41-NEXT: pinsrb $2, %eax, %xmm2 +; SSE41-NEXT: movzbl %r8b, %edi +; SSE41-NEXT: movzbl %dl, %eax +; SSE41-NEXT: movd %eax, %xmm4 +; SSE41-NEXT: pinsrb $1, %esi, %xmm4 +; SSE41-NEXT: pextrb $2, %xmm7, %eax +; SSE41-NEXT: pextrb $2, %xmm3, %ecx +; SSE41-NEXT: subb %al, %cl +; SSE41-NEXT: movl $0, %eax +; SSE41-NEXT: jb .LBB2_102 +; SSE41-NEXT: # %bb.101: +; SSE41-NEXT: movl %ecx, %eax +; SSE41-NEXT: .LBB2_102: +; SSE41-NEXT: pinsrb $3, %ebx, %xmm0 +; SSE41-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 1-byte Folded Reload +; SSE41-NEXT: pinsrb $3, %ebp, %xmm1 +; SSE41-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ebp # 1-byte Folded Reload +; SSE41-NEXT: pinsrb $3, %edi, %xmm2 +; SSE41-NEXT: movzbl %r14b, %edi +; SSE41-NEXT: movzbl %al, %eax +; SSE41-NEXT: pinsrb $2, %eax, %xmm4 +; SSE41-NEXT: pextrb $3, %xmm7, %eax +; SSE41-NEXT: pextrb $3, %xmm3, %ecx +; SSE41-NEXT: subb %al, %cl +; SSE41-NEXT: movl $0, %eax +; SSE41-NEXT: jb .LBB2_104 +; SSE41-NEXT: # %bb.103: +; SSE41-NEXT: movl %ecx, %eax +; SSE41-NEXT: .LBB2_104: +; SSE41-NEXT: pinsrb $4, %esi, %xmm0 +; SSE41-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %edx # 1-byte Folded Reload +; SSE41-NEXT: pinsrb $4, %ebp, %xmm1 +; SSE41-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 1-byte Folded Reload +; SSE41-NEXT: pinsrb $4, %edi, %xmm2 +; SSE41-NEXT: movzbl %r11b, %edi +; SSE41-NEXT: movzbl %al, %eax +; SSE41-NEXT: pinsrb $3, %eax, %xmm4 +; SSE41-NEXT: pextrb $4, %xmm7, %eax +; SSE41-NEXT: pextrb $4, %xmm3, %ecx +; SSE41-NEXT: subb %al, %cl +; SSE41-NEXT: movl $0, %eax +; SSE41-NEXT: jb .LBB2_106 +; SSE41-NEXT: # %bb.105: +; SSE41-NEXT: movl %ecx, %eax +; SSE41-NEXT: .LBB2_106: +; SSE41-NEXT: pinsrb $5, %edx, %xmm0 +; SSE41-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload +; SSE41-NEXT: pinsrb $5, %esi, %xmm1 +; SSE41-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %edx # 1-byte Folded Reload +; SSE41-NEXT: pinsrb $5, %edi, %xmm2 +; SSE41-NEXT: movzbl %r13b, %esi +; SSE41-NEXT: movzbl %al, %eax +; SSE41-NEXT: pinsrb $4, %eax, %xmm4 +; SSE41-NEXT: pextrb $5, %xmm7, %edi +; SSE41-NEXT: pextrb $5, %xmm3, %eax +; SSE41-NEXT: subb %dil, %al +; SSE41-NEXT: movl $0, %edi +; SSE41-NEXT: jb .LBB2_108 +; SSE41-NEXT: # %bb.107: +; SSE41-NEXT: movl %eax, %edi +; SSE41-NEXT: .LBB2_108: +; SSE41-NEXT: pinsrb $6, %ecx, %xmm0 +; SSE41-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; SSE41-NEXT: pinsrb $6, %edx, %xmm1 +; SSE41-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload +; SSE41-NEXT: pinsrb $6, %esi, %xmm2 +; SSE41-NEXT: movzbl %r12b, %edx +; SSE41-NEXT: movzbl %dil, %esi +; SSE41-NEXT: pinsrb $5, %esi, %xmm4 +; SSE41-NEXT: pextrb $6, %xmm7, %esi +; SSE41-NEXT: pextrb $6, %xmm3, %edi +; SSE41-NEXT: subb %sil, %dil +; SSE41-NEXT: movl $0, %esi +; SSE41-NEXT: jb .LBB2_110 +; SSE41-NEXT: # %bb.109: +; SSE41-NEXT: movl %edi, %esi +; SSE41-NEXT: .LBB2_110: +; SSE41-NEXT: pinsrb $7, %eax, %xmm0 +; SSE41-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; SSE41-NEXT: pinsrb $7, %ecx, %xmm1 +; SSE41-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload +; SSE41-NEXT: pinsrb $7, %edx, %xmm2 +; SSE41-NEXT: movzbl %r15b, %edx +; SSE41-NEXT: movzbl %sil, %esi +; SSE41-NEXT: pinsrb $6, %esi, %xmm4 +; SSE41-NEXT: pextrb $7, %xmm7, 
%esi +; SSE41-NEXT: pextrb $7, %xmm3, %edi +; SSE41-NEXT: subb %sil, %dil +; SSE41-NEXT: movl $0, %esi +; SSE41-NEXT: jb .LBB2_112 +; SSE41-NEXT: # %bb.111: +; SSE41-NEXT: movl %edi, %esi +; SSE41-NEXT: .LBB2_112: +; SSE41-NEXT: pinsrb $8, %eax, %xmm0 +; SSE41-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; SSE41-NEXT: pinsrb $8, %ecx, %xmm1 +; SSE41-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload +; SSE41-NEXT: pinsrb $8, %edx, %xmm2 +; SSE41-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %edx # 1-byte Folded Reload +; SSE41-NEXT: movzbl %sil, %esi +; SSE41-NEXT: pinsrb $7, %esi, %xmm4 +; SSE41-NEXT: pextrb $8, %xmm7, %esi +; SSE41-NEXT: pextrb $8, %xmm3, %edi +; SSE41-NEXT: subb %sil, %dil +; SSE41-NEXT: movl $0, %esi +; SSE41-NEXT: jb .LBB2_114 +; SSE41-NEXT: # %bb.113: +; SSE41-NEXT: movl %edi, %esi +; SSE41-NEXT: .LBB2_114: +; SSE41-NEXT: pinsrb $9, %eax, %xmm0 +; SSE41-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; SSE41-NEXT: pinsrb $9, %ecx, %xmm1 +; SSE41-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload +; SSE41-NEXT: pinsrb $9, %edx, %xmm2 +; SSE41-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %edx # 1-byte Folded Reload +; SSE41-NEXT: movzbl %sil, %esi +; SSE41-NEXT: pinsrb $8, %esi, %xmm4 +; SSE41-NEXT: pextrb $9, %xmm7, %esi +; SSE41-NEXT: pextrb $9, %xmm3, %edi +; SSE41-NEXT: subb %sil, %dil +; SSE41-NEXT: movl $0, %esi +; SSE41-NEXT: jb .LBB2_116 +; SSE41-NEXT: # %bb.115: +; SSE41-NEXT: movl %edi, %esi +; SSE41-NEXT: .LBB2_116: +; SSE41-NEXT: pinsrb $10, %eax, %xmm0 +; SSE41-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; SSE41-NEXT: pinsrb $10, %ecx, %xmm1 +; SSE41-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload +; SSE41-NEXT: pinsrb $10, %edx, %xmm2 +; SSE41-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %edx # 1-byte Folded Reload +; SSE41-NEXT: movzbl %sil, %esi +; SSE41-NEXT: pinsrb $9, %esi, %xmm4 +; SSE41-NEXT: pextrb $10, %xmm7, %esi +; SSE41-NEXT: pextrb $10, %xmm3, %edi +; SSE41-NEXT: subb %sil, %dil +; SSE41-NEXT: movl $0, %esi +; SSE41-NEXT: jb .LBB2_118 +; SSE41-NEXT: # %bb.117: +; SSE41-NEXT: movl %edi, %esi +; SSE41-NEXT: .LBB2_118: +; SSE41-NEXT: pinsrb $11, %eax, %xmm0 +; SSE41-NEXT: movzbl (%rsp), %eax # 1-byte Folded Reload +; SSE41-NEXT: pinsrb $11, %ecx, %xmm1 +; SSE41-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload +; SSE41-NEXT: pinsrb $11, %edx, %xmm2 +; SSE41-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %edx # 1-byte Folded Reload +; SSE41-NEXT: movzbl %sil, %esi +; SSE41-NEXT: pinsrb $10, %esi, %xmm4 +; SSE41-NEXT: pextrb $11, %xmm7, %esi +; SSE41-NEXT: pextrb $11, %xmm3, %edi +; SSE41-NEXT: subb %sil, %dil +; SSE41-NEXT: movl $0, %esi +; SSE41-NEXT: jb .LBB2_120 +; SSE41-NEXT: # %bb.119: +; SSE41-NEXT: movl %edi, %esi +; SSE41-NEXT: .LBB2_120: +; SSE41-NEXT: pinsrb $12, %eax, %xmm0 +; SSE41-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; SSE41-NEXT: pinsrb $12, %ecx, %xmm1 +; SSE41-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload +; SSE41-NEXT: pinsrb $12, %edx, %xmm2 +; SSE41-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %edx # 1-byte Folded Reload +; SSE41-NEXT: movzbl %sil, %esi +; SSE41-NEXT: pinsrb $11, %esi, %xmm4 +; SSE41-NEXT: pextrb $12, %xmm7, %esi +; SSE41-NEXT: pextrb $12, %xmm3, %edi +; SSE41-NEXT: subb %sil, %dil +; SSE41-NEXT: movl $0, %esi +; SSE41-NEXT: jb .LBB2_122 +; SSE41-NEXT: # %bb.121: +; SSE41-NEXT: movl %edi, %esi +; SSE41-NEXT: .LBB2_122: +; SSE41-NEXT: pinsrb $13, %eax, %xmm0 +; 
SSE41-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; SSE41-NEXT: pinsrb $13, %ecx, %xmm1 +; SSE41-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload +; SSE41-NEXT: pinsrb $13, %edx, %xmm2 +; SSE41-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %edx # 1-byte Folded Reload +; SSE41-NEXT: movzbl %sil, %esi +; SSE41-NEXT: pinsrb $12, %esi, %xmm4 +; SSE41-NEXT: pextrb $13, %xmm7, %esi +; SSE41-NEXT: pextrb $13, %xmm3, %edi +; SSE41-NEXT: subb %sil, %dil +; SSE41-NEXT: movl $0, %esi +; SSE41-NEXT: jb .LBB2_124 +; SSE41-NEXT: # %bb.123: +; SSE41-NEXT: movl %edi, %esi +; SSE41-NEXT: .LBB2_124: +; SSE41-NEXT: pinsrb $14, %eax, %xmm0 +; SSE41-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; SSE41-NEXT: pinsrb $14, %ecx, %xmm1 +; SSE41-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload +; SSE41-NEXT: pinsrb $14, %edx, %xmm2 +; SSE41-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %edx # 1-byte Folded Reload +; SSE41-NEXT: movzbl %sil, %esi +; SSE41-NEXT: pinsrb $13, %esi, %xmm4 +; SSE41-NEXT: pextrb $14, %xmm7, %esi +; SSE41-NEXT: pextrb $14, %xmm3, %edi +; SSE41-NEXT: subb %sil, %dil +; SSE41-NEXT: movl $0, %esi +; SSE41-NEXT: leaq {{[0-9]+}}(%rsp), %rsp +; SSE41-NEXT: popq %rbx +; SSE41-NEXT: popq %r12 +; SSE41-NEXT: popq %r13 +; SSE41-NEXT: popq %r14 +; SSE41-NEXT: popq %r15 +; SSE41-NEXT: popq %rbp +; SSE41-NEXT: jb .LBB2_126 +; SSE41-NEXT: # %bb.125: +; SSE41-NEXT: movl %edi, %esi +; SSE41-NEXT: .LBB2_126: +; SSE41-NEXT: pinsrb $15, %eax, %xmm0 +; SSE41-NEXT: pinsrb $15, %ecx, %xmm1 +; SSE41-NEXT: pinsrb $15, %edx, %xmm2 +; SSE41-NEXT: movzbl %sil, %eax +; SSE41-NEXT: pinsrb $14, %eax, %xmm4 +; SSE41-NEXT: pextrb $15, %xmm7, %ecx +; SSE41-NEXT: pextrb $15, %xmm3, %eax +; SSE41-NEXT: subb %cl, %al +; SSE41-NEXT: movl $0, %ecx +; SSE41-NEXT: jb .LBB2_128 +; SSE41-NEXT: # %bb.127: +; SSE41-NEXT: movl %eax, %ecx +; SSE41-NEXT: .LBB2_128: +; SSE41-NEXT: movzbl %cl, %eax +; SSE41-NEXT: pinsrb $15, %eax, %xmm4 +; SSE41-NEXT: movdqa %xmm4, %xmm3 +; SSE41-NEXT: retq +; +; AVX1-LABEL: v64i8: +; AVX1: # %bb.0: +; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm4 +; AVX1-NEXT: vpextrb $1, %xmm4, %ecx +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5 +; AVX1-NEXT: vpextrb $1, %xmm5, %eax +; AVX1-NEXT: subb %cl, %al +; AVX1-NEXT: movl $0, %ecx +; AVX1-NEXT: jb .LBB2_2 +; AVX1-NEXT: # %bb.1: +; AVX1-NEXT: movl %eax, %ecx +; AVX1-NEXT: .LBB2_2: +; AVX1-NEXT: pushq %rbp +; AVX1-NEXT: pushq %r15 +; AVX1-NEXT: pushq %r14 +; AVX1-NEXT: pushq %r13 +; AVX1-NEXT: pushq %r12 +; AVX1-NEXT: pushq %rbx +; AVX1-NEXT: subq $12, %rsp +; AVX1-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX1-NEXT: vpextrb $0, %xmm4, %ecx +; AVX1-NEXT: vpextrb $0, %xmm5, %eax +; AVX1-NEXT: subb %cl, %al +; AVX1-NEXT: movl $0, %edi +; AVX1-NEXT: jb .LBB2_4 +; AVX1-NEXT: # %bb.3: +; AVX1-NEXT: movl %eax, %edi +; AVX1-NEXT: .LBB2_4: +; AVX1-NEXT: vpextrb $2, %xmm4, %ecx +; AVX1-NEXT: vpextrb $2, %xmm5, %eax +; AVX1-NEXT: subb %cl, %al +; AVX1-NEXT: movl $0, %ecx +; AVX1-NEXT: jb .LBB2_6 +; AVX1-NEXT: # %bb.5: +; AVX1-NEXT: movl %eax, %ecx +; AVX1-NEXT: .LBB2_6: +; AVX1-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX1-NEXT: vpextrb $3, %xmm4, %ecx +; AVX1-NEXT: vpextrb $3, %xmm5, %eax +; AVX1-NEXT: subb %cl, %al +; AVX1-NEXT: movl $0, %ecx +; AVX1-NEXT: jb .LBB2_8 +; AVX1-NEXT: # %bb.7: +; AVX1-NEXT: movl %eax, %ecx +; AVX1-NEXT: .LBB2_8: +; AVX1-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX1-NEXT: vpextrb $4, %xmm4, %ecx +; AVX1-NEXT: vpextrb $4, %xmm5, 
%eax +; AVX1-NEXT: subb %cl, %al +; AVX1-NEXT: movl $0, %ecx +; AVX1-NEXT: jb .LBB2_10 +; AVX1-NEXT: # %bb.9: +; AVX1-NEXT: movl %eax, %ecx +; AVX1-NEXT: .LBB2_10: +; AVX1-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX1-NEXT: vpextrb $5, %xmm4, %ecx +; AVX1-NEXT: vpextrb $5, %xmm5, %eax +; AVX1-NEXT: subb %cl, %al +; AVX1-NEXT: movl $0, %ecx +; AVX1-NEXT: jb .LBB2_12 +; AVX1-NEXT: # %bb.11: +; AVX1-NEXT: movl %eax, %ecx +; AVX1-NEXT: .LBB2_12: +; AVX1-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX1-NEXT: vpextrb $6, %xmm4, %ecx +; AVX1-NEXT: vpextrb $6, %xmm5, %eax +; AVX1-NEXT: subb %cl, %al +; AVX1-NEXT: movl $0, %ecx +; AVX1-NEXT: jb .LBB2_14 +; AVX1-NEXT: # %bb.13: +; AVX1-NEXT: movl %eax, %ecx +; AVX1-NEXT: .LBB2_14: +; AVX1-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX1-NEXT: vpextrb $7, %xmm4, %ecx +; AVX1-NEXT: vpextrb $7, %xmm5, %eax +; AVX1-NEXT: subb %cl, %al +; AVX1-NEXT: movl $0, %ecx +; AVX1-NEXT: jb .LBB2_16 +; AVX1-NEXT: # %bb.15: +; AVX1-NEXT: movl %eax, %ecx +; AVX1-NEXT: .LBB2_16: +; AVX1-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX1-NEXT: vpextrb $8, %xmm4, %ecx +; AVX1-NEXT: vpextrb $8, %xmm5, %eax +; AVX1-NEXT: subb %cl, %al +; AVX1-NEXT: movl $0, %ecx +; AVX1-NEXT: jb .LBB2_18 +; AVX1-NEXT: # %bb.17: +; AVX1-NEXT: movl %eax, %ecx +; AVX1-NEXT: .LBB2_18: +; AVX1-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX1-NEXT: vpextrb $9, %xmm4, %ecx +; AVX1-NEXT: vpextrb $9, %xmm5, %eax +; AVX1-NEXT: subb %cl, %al +; AVX1-NEXT: movl $0, %ecx +; AVX1-NEXT: jb .LBB2_20 +; AVX1-NEXT: # %bb.19: +; AVX1-NEXT: movl %eax, %ecx +; AVX1-NEXT: .LBB2_20: +; AVX1-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX1-NEXT: vpextrb $10, %xmm4, %ecx +; AVX1-NEXT: vpextrb $10, %xmm5, %eax +; AVX1-NEXT: subb %cl, %al +; AVX1-NEXT: movl $0, %ecx +; AVX1-NEXT: jb .LBB2_22 +; AVX1-NEXT: # %bb.21: +; AVX1-NEXT: movl %eax, %ecx +; AVX1-NEXT: .LBB2_22: +; AVX1-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX1-NEXT: vpextrb $11, %xmm4, %ecx +; AVX1-NEXT: vpextrb $11, %xmm5, %eax +; AVX1-NEXT: subb %cl, %al +; AVX1-NEXT: movl $0, %ecx +; AVX1-NEXT: jb .LBB2_24 +; AVX1-NEXT: # %bb.23: +; AVX1-NEXT: movl %eax, %ecx +; AVX1-NEXT: .LBB2_24: +; AVX1-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX1-NEXT: vpextrb $12, %xmm4, %ecx +; AVX1-NEXT: vpextrb $12, %xmm5, %eax +; AVX1-NEXT: subb %cl, %al +; AVX1-NEXT: movl $0, %ecx +; AVX1-NEXT: jb .LBB2_26 +; AVX1-NEXT: # %bb.25: +; AVX1-NEXT: movl %eax, %ecx +; AVX1-NEXT: .LBB2_26: +; AVX1-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX1-NEXT: vpextrb $13, %xmm4, %ecx +; AVX1-NEXT: vpextrb $13, %xmm5, %eax +; AVX1-NEXT: subb %cl, %al +; AVX1-NEXT: movl $0, %ecx +; AVX1-NEXT: jb .LBB2_28 +; AVX1-NEXT: # %bb.27: +; AVX1-NEXT: movl %eax, %ecx +; AVX1-NEXT: .LBB2_28: +; AVX1-NEXT: movl %ecx, (%rsp) # 4-byte Spill +; AVX1-NEXT: vpextrb $14, %xmm4, %ecx +; AVX1-NEXT: vpextrb $14, %xmm5, %eax +; AVX1-NEXT: subb %cl, %al +; AVX1-NEXT: movl $0, %ecx +; AVX1-NEXT: jb .LBB2_30 +; AVX1-NEXT: # %bb.29: +; AVX1-NEXT: movl %eax, %ecx +; AVX1-NEXT: .LBB2_30: +; AVX1-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX1-NEXT: vpextrb $15, %xmm4, %ecx +; AVX1-NEXT: vpextrb $15, %xmm5, %eax +; AVX1-NEXT: subb %cl, %al +; AVX1-NEXT: movl $0, %ecx +; AVX1-NEXT: jb .LBB2_32 +; AVX1-NEXT: # %bb.31: +; AVX1-NEXT: movl %eax, %ecx +; AVX1-NEXT: .LBB2_32: +; AVX1-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX1-NEXT: 
vpextrb $1, %xmm2, %ecx +; AVX1-NEXT: vpextrb $1, %xmm0, %eax +; AVX1-NEXT: subb %cl, %al +; AVX1-NEXT: movl $0, %esi +; AVX1-NEXT: jb .LBB2_34 +; AVX1-NEXT: # %bb.33: +; AVX1-NEXT: movl %eax, %esi +; AVX1-NEXT: .LBB2_34: +; AVX1-NEXT: vpextrb $0, %xmm2, %ecx +; AVX1-NEXT: vpextrb $0, %xmm0, %eax +; AVX1-NEXT: subb %cl, %al +; AVX1-NEXT: movl $0, %ebx +; AVX1-NEXT: jb .LBB2_36 +; AVX1-NEXT: # %bb.35: +; AVX1-NEXT: movl %eax, %ebx +; AVX1-NEXT: .LBB2_36: +; AVX1-NEXT: vpextrb $2, %xmm2, %ecx +; AVX1-NEXT: vpextrb $2, %xmm0, %eax +; AVX1-NEXT: subb %cl, %al +; AVX1-NEXT: movl $0, %ecx +; AVX1-NEXT: jb .LBB2_38 +; AVX1-NEXT: # %bb.37: +; AVX1-NEXT: movl %eax, %ecx +; AVX1-NEXT: .LBB2_38: +; AVX1-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX1-NEXT: vpextrb $3, %xmm2, %ecx +; AVX1-NEXT: vpextrb $3, %xmm0, %eax +; AVX1-NEXT: subb %cl, %al +; AVX1-NEXT: movl $0, %ecx +; AVX1-NEXT: jb .LBB2_40 +; AVX1-NEXT: # %bb.39: +; AVX1-NEXT: movl %eax, %ecx +; AVX1-NEXT: .LBB2_40: +; AVX1-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX1-NEXT: vpextrb $4, %xmm2, %ecx +; AVX1-NEXT: vpextrb $4, %xmm0, %eax +; AVX1-NEXT: subb %cl, %al +; AVX1-NEXT: movl $0, %ecx +; AVX1-NEXT: jb .LBB2_42 +; AVX1-NEXT: # %bb.41: +; AVX1-NEXT: movl %eax, %ecx +; AVX1-NEXT: .LBB2_42: +; AVX1-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX1-NEXT: vpextrb $5, %xmm2, %ecx +; AVX1-NEXT: vpextrb $5, %xmm0, %eax +; AVX1-NEXT: subb %cl, %al +; AVX1-NEXT: movl $0, %ecx +; AVX1-NEXT: jb .LBB2_44 +; AVX1-NEXT: # %bb.43: +; AVX1-NEXT: movl %eax, %ecx +; AVX1-NEXT: .LBB2_44: +; AVX1-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX1-NEXT: vpextrb $6, %xmm2, %ecx +; AVX1-NEXT: vpextrb $6, %xmm0, %eax +; AVX1-NEXT: subb %cl, %al +; AVX1-NEXT: movl $0, %ecx +; AVX1-NEXT: jb .LBB2_46 +; AVX1-NEXT: # %bb.45: +; AVX1-NEXT: movl %eax, %ecx +; AVX1-NEXT: .LBB2_46: +; AVX1-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX1-NEXT: vpextrb $7, %xmm2, %ecx +; AVX1-NEXT: vpextrb $7, %xmm0, %eax +; AVX1-NEXT: subb %cl, %al +; AVX1-NEXT: movl $0, %ecx +; AVX1-NEXT: jb .LBB2_48 +; AVX1-NEXT: # %bb.47: +; AVX1-NEXT: movl %eax, %ecx +; AVX1-NEXT: .LBB2_48: +; AVX1-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX1-NEXT: vpextrb $8, %xmm2, %ecx +; AVX1-NEXT: vpextrb $8, %xmm0, %eax +; AVX1-NEXT: subb %cl, %al +; AVX1-NEXT: movl $0, %ecx +; AVX1-NEXT: jb .LBB2_50 +; AVX1-NEXT: # %bb.49: +; AVX1-NEXT: movl %eax, %ecx +; AVX1-NEXT: .LBB2_50: +; AVX1-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX1-NEXT: vpextrb $9, %xmm2, %ecx +; AVX1-NEXT: vpextrb $9, %xmm0, %eax +; AVX1-NEXT: subb %cl, %al +; AVX1-NEXT: movl $0, %ecx +; AVX1-NEXT: jb .LBB2_52 +; AVX1-NEXT: # %bb.51: +; AVX1-NEXT: movl %eax, %ecx +; AVX1-NEXT: .LBB2_52: +; AVX1-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX1-NEXT: vpextrb $10, %xmm2, %ecx +; AVX1-NEXT: vpextrb $10, %xmm0, %eax +; AVX1-NEXT: subb %cl, %al +; AVX1-NEXT: movl $0, %ecx +; AVX1-NEXT: jb .LBB2_54 +; AVX1-NEXT: # %bb.53: +; AVX1-NEXT: movl %eax, %ecx +; AVX1-NEXT: .LBB2_54: +; AVX1-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX1-NEXT: vpextrb $11, %xmm2, %ecx +; AVX1-NEXT: vpextrb $11, %xmm0, %eax +; AVX1-NEXT: subb %cl, %al +; AVX1-NEXT: movl $0, %ecx +; AVX1-NEXT: jb .LBB2_56 +; AVX1-NEXT: # %bb.55: +; AVX1-NEXT: movl %eax, %ecx +; AVX1-NEXT: .LBB2_56: +; AVX1-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX1-NEXT: vpextrb $12, %xmm2, %ecx +; AVX1-NEXT: vpextrb $12, %xmm0, %eax +; 
AVX1-NEXT: subb %cl, %al +; AVX1-NEXT: movl $0, %ecx +; AVX1-NEXT: jb .LBB2_58 +; AVX1-NEXT: # %bb.57: +; AVX1-NEXT: movl %eax, %ecx +; AVX1-NEXT: .LBB2_58: +; AVX1-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX1-NEXT: vpextrb $13, %xmm2, %ecx +; AVX1-NEXT: vpextrb $13, %xmm0, %eax +; AVX1-NEXT: subb %cl, %al +; AVX1-NEXT: movl $0, %ecx +; AVX1-NEXT: jb .LBB2_60 +; AVX1-NEXT: # %bb.59: +; AVX1-NEXT: movl %eax, %ecx +; AVX1-NEXT: .LBB2_60: +; AVX1-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX1-NEXT: vpextrb $14, %xmm2, %ecx +; AVX1-NEXT: vpextrb $14, %xmm0, %eax +; AVX1-NEXT: subb %cl, %al +; AVX1-NEXT: movl $0, %ecx +; AVX1-NEXT: jb .LBB2_62 +; AVX1-NEXT: # %bb.61: +; AVX1-NEXT: movl %eax, %ecx +; AVX1-NEXT: .LBB2_62: +; AVX1-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX1-NEXT: vpextrb $15, %xmm2, %ecx +; AVX1-NEXT: vpextrb $15, %xmm0, %eax +; AVX1-NEXT: subb %cl, %al +; AVX1-NEXT: movl $0, %ecx +; AVX1-NEXT: jb .LBB2_64 +; AVX1-NEXT: # %bb.63: +; AVX1-NEXT: movl %eax, %ecx +; AVX1-NEXT: .LBB2_64: +; AVX1-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm0 +; AVX1-NEXT: vpextrb $1, %xmm0, %ecx +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX1-NEXT: vpextrb $1, %xmm2, %eax +; AVX1-NEXT: subb %cl, %al +; AVX1-NEXT: movl $0, %r9d +; AVX1-NEXT: jb .LBB2_66 +; AVX1-NEXT: # %bb.65: +; AVX1-NEXT: movl %eax, %r9d +; AVX1-NEXT: .LBB2_66: +; AVX1-NEXT: vpextrb $0, %xmm0, %ecx +; AVX1-NEXT: vpextrb $0, %xmm2, %eax +; AVX1-NEXT: subb %cl, %al +; AVX1-NEXT: movl $0, %edx +; AVX1-NEXT: jb .LBB2_68 +; AVX1-NEXT: # %bb.67: +; AVX1-NEXT: movl %eax, %edx +; AVX1-NEXT: .LBB2_68: +; AVX1-NEXT: vpextrb $2, %xmm0, %ecx +; AVX1-NEXT: vpextrb $2, %xmm2, %eax +; AVX1-NEXT: subb %cl, %al +; AVX1-NEXT: movl $0, %r10d +; AVX1-NEXT: jb .LBB2_70 +; AVX1-NEXT: # %bb.69: +; AVX1-NEXT: movl %eax, %r10d +; AVX1-NEXT: .LBB2_70: +; AVX1-NEXT: vpextrb $3, %xmm0, %ecx +; AVX1-NEXT: vpextrb $3, %xmm2, %eax +; AVX1-NEXT: subb %cl, %al +; AVX1-NEXT: movl $0, %r8d +; AVX1-NEXT: jb .LBB2_72 +; AVX1-NEXT: # %bb.71: +; AVX1-NEXT: movl %eax, %r8d +; AVX1-NEXT: .LBB2_72: +; AVX1-NEXT: vpextrb $4, %xmm0, %ecx +; AVX1-NEXT: vpextrb $4, %xmm2, %eax +; AVX1-NEXT: subb %cl, %al +; AVX1-NEXT: movl $0, %r11d +; AVX1-NEXT: jb .LBB2_74 +; AVX1-NEXT: # %bb.73: +; AVX1-NEXT: movl %eax, %r11d +; AVX1-NEXT: .LBB2_74: +; AVX1-NEXT: vpextrb $5, %xmm0, %ecx +; AVX1-NEXT: vpextrb $5, %xmm2, %eax +; AVX1-NEXT: subb %cl, %al +; AVX1-NEXT: movl $0, %r13d +; AVX1-NEXT: jb .LBB2_76 +; AVX1-NEXT: # %bb.75: +; AVX1-NEXT: movl %eax, %r13d +; AVX1-NEXT: .LBB2_76: +; AVX1-NEXT: vpextrb $6, %xmm0, %ecx +; AVX1-NEXT: vpextrb $6, %xmm2, %eax +; AVX1-NEXT: subb %cl, %al +; AVX1-NEXT: movl $0, %r12d +; AVX1-NEXT: jb .LBB2_78 +; AVX1-NEXT: # %bb.77: +; AVX1-NEXT: movl %eax, %r12d +; AVX1-NEXT: .LBB2_78: +; AVX1-NEXT: vpextrb $7, %xmm0, %ecx +; AVX1-NEXT: vpextrb $7, %xmm2, %eax +; AVX1-NEXT: subb %cl, %al +; AVX1-NEXT: movl $0, %r15d +; AVX1-NEXT: jb .LBB2_80 +; AVX1-NEXT: # %bb.79: +; AVX1-NEXT: movl %eax, %r15d +; AVX1-NEXT: .LBB2_80: +; AVX1-NEXT: vpextrb $8, %xmm0, %ecx +; AVX1-NEXT: vpextrb $8, %xmm2, %eax +; AVX1-NEXT: subb %cl, %al +; AVX1-NEXT: movl $0, %r14d +; AVX1-NEXT: jb .LBB2_82 +; AVX1-NEXT: # %bb.81: +; AVX1-NEXT: movl %eax, %r14d +; AVX1-NEXT: .LBB2_82: +; AVX1-NEXT: vpextrb $9, %xmm0, %ecx +; AVX1-NEXT: vpextrb $9, %xmm2, %eax +; AVX1-NEXT: subb %cl, %al +; AVX1-NEXT: movl $0, %ecx +; AVX1-NEXT: jb .LBB2_84 +; AVX1-NEXT: # %bb.83: +; AVX1-NEXT: 
movl %eax, %ecx +; AVX1-NEXT: .LBB2_84: +; AVX1-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX1-NEXT: vpextrb $10, %xmm0, %ecx +; AVX1-NEXT: vpextrb $10, %xmm2, %eax +; AVX1-NEXT: subb %cl, %al +; AVX1-NEXT: movl $0, %ecx +; AVX1-NEXT: jb .LBB2_86 +; AVX1-NEXT: # %bb.85: +; AVX1-NEXT: movl %eax, %ecx +; AVX1-NEXT: .LBB2_86: +; AVX1-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX1-NEXT: vpextrb $11, %xmm0, %ecx +; AVX1-NEXT: vpextrb $11, %xmm2, %eax +; AVX1-NEXT: subb %cl, %al +; AVX1-NEXT: movl $0, %ecx +; AVX1-NEXT: jb .LBB2_88 +; AVX1-NEXT: # %bb.87: +; AVX1-NEXT: movl %eax, %ecx +; AVX1-NEXT: .LBB2_88: +; AVX1-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX1-NEXT: vpextrb $12, %xmm0, %ecx +; AVX1-NEXT: vpextrb $12, %xmm2, %eax +; AVX1-NEXT: subb %cl, %al +; AVX1-NEXT: movl $0, %ecx +; AVX1-NEXT: jb .LBB2_90 +; AVX1-NEXT: # %bb.89: +; AVX1-NEXT: movl %eax, %ecx +; AVX1-NEXT: .LBB2_90: +; AVX1-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX1-NEXT: movzbl %bl, %ebp +; AVX1-NEXT: vpextrb $13, %xmm0, %ecx +; AVX1-NEXT: vpextrb $13, %xmm2, %eax +; AVX1-NEXT: subb %cl, %al +; AVX1-NEXT: movl $0, %ecx +; AVX1-NEXT: jb .LBB2_92 +; AVX1-NEXT: # %bb.91: +; AVX1-NEXT: movl %eax, %ecx +; AVX1-NEXT: .LBB2_92: +; AVX1-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX1-NEXT: movzbl %dil, %edi +; AVX1-NEXT: movzbl %sil, %ebx +; AVX1-NEXT: vmovd %ebp, %xmm4 +; AVX1-NEXT: vpextrb $14, %xmm0, %ecx +; AVX1-NEXT: vpextrb $14, %xmm2, %eax +; AVX1-NEXT: subb %cl, %al +; AVX1-NEXT: movl $0, %ecx +; AVX1-NEXT: jb .LBB2_94 +; AVX1-NEXT: # %bb.93: +; AVX1-NEXT: movl %eax, %ecx +; AVX1-NEXT: .LBB2_94: +; AVX1-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 1-byte Folded Reload +; AVX1-NEXT: vmovd %edi, %xmm5 +; AVX1-NEXT: vpinsrb $1, %ebx, %xmm4, %xmm4 +; AVX1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 1-byte Folded Reload +; AVX1-NEXT: movzbl %dl, %ebp +; AVX1-NEXT: vpextrb $15, %xmm0, %ecx +; AVX1-NEXT: vpextrb $15, %xmm2, %eax +; AVX1-NEXT: subb %cl, %al +; AVX1-NEXT: movl $0, %ecx +; AVX1-NEXT: jb .LBB2_96 +; AVX1-NEXT: # %bb.95: +; AVX1-NEXT: movl %eax, %ecx +; AVX1-NEXT: .LBB2_96: +; AVX1-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX1-NEXT: vpinsrb $1, %esi, %xmm5, %xmm0 +; AVX1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 1-byte Folded Reload +; AVX1-NEXT: vpinsrb $2, %edi, %xmm4, %xmm2 +; AVX1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ebx # 1-byte Folded Reload +; AVX1-NEXT: movzbl %r9b, %edx +; AVX1-NEXT: vmovd %ebp, %xmm4 +; AVX1-NEXT: vpextrb $1, %xmm3, %eax +; AVX1-NEXT: vpextrb $1, %xmm1, %edi +; AVX1-NEXT: subb %al, %dil +; AVX1-NEXT: movl $0, %eax +; AVX1-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 4-byte Reload +; AVX1-NEXT: jb .LBB2_98 +; AVX1-NEXT: # %bb.97: +; AVX1-NEXT: movl %edi, %eax +; AVX1-NEXT: .LBB2_98: +; AVX1-NEXT: vpinsrb $2, %esi, %xmm0, %xmm0 +; AVX1-NEXT: movzbl %cl, %esi +; AVX1-NEXT: vpinsrb $3, %ebx, %xmm2, %xmm2 +; AVX1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ebp # 1-byte Folded Reload +; AVX1-NEXT: vpinsrb $1, %edx, %xmm4, %xmm4 +; AVX1-NEXT: movzbl %r10b, %edi +; AVX1-NEXT: movzbl %al, %edx +; AVX1-NEXT: vpextrb $0, %xmm3, %ecx +; AVX1-NEXT: vpextrb $0, %xmm1, %eax +; AVX1-NEXT: subb %cl, %al +; AVX1-NEXT: movl $0, %ecx +; AVX1-NEXT: jb .LBB2_100 +; AVX1-NEXT: # %bb.99: +; AVX1-NEXT: movl %eax, %ecx +; AVX1-NEXT: .LBB2_100: +; AVX1-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; AVX1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %esi 
# 1-byte Folded Reload +; AVX1-NEXT: vpinsrb $4, %ebp, %xmm2, %xmm2 +; AVX1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ebx # 1-byte Folded Reload +; AVX1-NEXT: vpinsrb $2, %edi, %xmm4, %xmm4 +; AVX1-NEXT: movzbl %r8b, %ebp +; AVX1-NEXT: movzbl %cl, %eax +; AVX1-NEXT: vmovd %eax, %xmm5 +; AVX1-NEXT: vpinsrb $1, %edx, %xmm5, %xmm5 +; AVX1-NEXT: vpextrb $2, %xmm3, %eax +; AVX1-NEXT: vpextrb $2, %xmm1, %ecx +; AVX1-NEXT: subb %al, %cl +; AVX1-NEXT: movl $0, %eax +; AVX1-NEXT: jb .LBB2_102 +; AVX1-NEXT: # %bb.101: +; AVX1-NEXT: movl %ecx, %eax +; AVX1-NEXT: .LBB2_102: +; AVX1-NEXT: vpinsrb $4, %esi, %xmm0, %xmm0 +; AVX1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %edx # 1-byte Folded Reload +; AVX1-NEXT: vpinsrb $5, %ebx, %xmm2, %xmm2 +; AVX1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 1-byte Folded Reload +; AVX1-NEXT: vpinsrb $3, %ebp, %xmm4, %xmm4 +; AVX1-NEXT: movzbl %r11b, %ebp +; AVX1-NEXT: movzbl %al, %eax +; AVX1-NEXT: vpinsrb $2, %eax, %xmm5, %xmm5 +; AVX1-NEXT: vpextrb $3, %xmm3, %eax +; AVX1-NEXT: vpextrb $3, %xmm1, %ecx +; AVX1-NEXT: subb %al, %cl +; AVX1-NEXT: movl $0, %eax +; AVX1-NEXT: jb .LBB2_104 +; AVX1-NEXT: # %bb.103: +; AVX1-NEXT: movl %ecx, %eax +; AVX1-NEXT: .LBB2_104: +; AVX1-NEXT: vpinsrb $5, %edx, %xmm0, %xmm0 +; AVX1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload +; AVX1-NEXT: vpinsrb $6, %esi, %xmm2, %xmm2 +; AVX1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %edx # 1-byte Folded Reload +; AVX1-NEXT: vpinsrb $4, %ebp, %xmm4, %xmm4 +; AVX1-NEXT: movzbl %r13b, %esi +; AVX1-NEXT: movzbl %al, %eax +; AVX1-NEXT: vpinsrb $3, %eax, %xmm5, %xmm5 +; AVX1-NEXT: vpextrb $4, %xmm3, %edi +; AVX1-NEXT: vpextrb $4, %xmm1, %eax +; AVX1-NEXT: subb %dil, %al +; AVX1-NEXT: movl $0, %edi +; AVX1-NEXT: jb .LBB2_106 +; AVX1-NEXT: # %bb.105: +; AVX1-NEXT: movl %eax, %edi +; AVX1-NEXT: .LBB2_106: +; AVX1-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; AVX1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX1-NEXT: vpinsrb $7, %edx, %xmm2, %xmm2 +; AVX1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload +; AVX1-NEXT: vpinsrb $5, %esi, %xmm4, %xmm4 +; AVX1-NEXT: movzbl %r12b, %edx +; AVX1-NEXT: movzbl %dil, %esi +; AVX1-NEXT: vpinsrb $4, %esi, %xmm5, %xmm5 +; AVX1-NEXT: vpextrb $5, %xmm3, %esi +; AVX1-NEXT: vpextrb $5, %xmm1, %edi +; AVX1-NEXT: subb %sil, %dil +; AVX1-NEXT: movl $0, %esi +; AVX1-NEXT: jb .LBB2_108 +; AVX1-NEXT: # %bb.107: +; AVX1-NEXT: movl %edi, %esi +; AVX1-NEXT: .LBB2_108: +; AVX1-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; AVX1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX1-NEXT: vpinsrb $8, %ecx, %xmm2, %xmm2 +; AVX1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload +; AVX1-NEXT: vpinsrb $6, %edx, %xmm4, %xmm4 +; AVX1-NEXT: movzbl %r15b, %edx +; AVX1-NEXT: movzbl %sil, %esi +; AVX1-NEXT: vpinsrb $5, %esi, %xmm5, %xmm5 +; AVX1-NEXT: vpextrb $6, %xmm3, %esi +; AVX1-NEXT: vpextrb $6, %xmm1, %edi +; AVX1-NEXT: subb %sil, %dil +; AVX1-NEXT: movl $0, %esi +; AVX1-NEXT: jb .LBB2_110 +; AVX1-NEXT: # %bb.109: +; AVX1-NEXT: movl %edi, %esi +; AVX1-NEXT: .LBB2_110: +; AVX1-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0 +; AVX1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX1-NEXT: vpinsrb $9, %ecx, %xmm2, %xmm2 +; AVX1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload +; AVX1-NEXT: vpinsrb $7, %edx, %xmm4, %xmm4 +; AVX1-NEXT: movzbl %r14b, %edx +; AVX1-NEXT: movzbl %sil, %esi +; AVX1-NEXT: vpinsrb $6, %esi, %xmm5, %xmm5 +; AVX1-NEXT: vpextrb $7, %xmm3, %esi +; AVX1-NEXT: 
vpextrb $7, %xmm1, %edi +; AVX1-NEXT: subb %sil, %dil +; AVX1-NEXT: movl $0, %esi +; AVX1-NEXT: jb .LBB2_112 +; AVX1-NEXT: # %bb.111: +; AVX1-NEXT: movl %edi, %esi +; AVX1-NEXT: .LBB2_112: +; AVX1-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0 +; AVX1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX1-NEXT: vpinsrb $10, %ecx, %xmm2, %xmm2 +; AVX1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload +; AVX1-NEXT: vpinsrb $8, %edx, %xmm4, %xmm4 +; AVX1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %edx # 1-byte Folded Reload +; AVX1-NEXT: movzbl %sil, %esi +; AVX1-NEXT: vpinsrb $7, %esi, %xmm5, %xmm5 +; AVX1-NEXT: vpextrb $8, %xmm3, %esi +; AVX1-NEXT: vpextrb $8, %xmm1, %edi +; AVX1-NEXT: subb %sil, %dil +; AVX1-NEXT: movl $0, %esi +; AVX1-NEXT: jb .LBB2_114 +; AVX1-NEXT: # %bb.113: +; AVX1-NEXT: movl %edi, %esi +; AVX1-NEXT: .LBB2_114: +; AVX1-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0 +; AVX1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX1-NEXT: vpinsrb $11, %ecx, %xmm2, %xmm2 +; AVX1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload +; AVX1-NEXT: vpinsrb $9, %edx, %xmm4, %xmm4 +; AVX1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %edx # 1-byte Folded Reload +; AVX1-NEXT: movzbl %sil, %esi +; AVX1-NEXT: vpinsrb $8, %esi, %xmm5, %xmm5 +; AVX1-NEXT: vpextrb $9, %xmm3, %esi +; AVX1-NEXT: vpextrb $9, %xmm1, %edi +; AVX1-NEXT: subb %sil, %dil +; AVX1-NEXT: movl $0, %esi +; AVX1-NEXT: jb .LBB2_116 +; AVX1-NEXT: # %bb.115: +; AVX1-NEXT: movl %edi, %esi +; AVX1-NEXT: .LBB2_116: +; AVX1-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; AVX1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX1-NEXT: vpinsrb $12, %ecx, %xmm2, %xmm2 +; AVX1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload +; AVX1-NEXT: vpinsrb $10, %edx, %xmm4, %xmm4 +; AVX1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %edx # 1-byte Folded Reload +; AVX1-NEXT: movzbl %sil, %esi +; AVX1-NEXT: vpinsrb $9, %esi, %xmm5, %xmm5 +; AVX1-NEXT: vpextrb $10, %xmm3, %esi +; AVX1-NEXT: vpextrb $10, %xmm1, %edi +; AVX1-NEXT: subb %sil, %dil +; AVX1-NEXT: movl $0, %esi +; AVX1-NEXT: jb .LBB2_118 +; AVX1-NEXT: # %bb.117: +; AVX1-NEXT: movl %edi, %esi +; AVX1-NEXT: .LBB2_118: +; AVX1-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0 +; AVX1-NEXT: movzbl (%rsp), %eax # 1-byte Folded Reload +; AVX1-NEXT: vpinsrb $13, %ecx, %xmm2, %xmm2 +; AVX1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload +; AVX1-NEXT: vpinsrb $11, %edx, %xmm4, %xmm4 +; AVX1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %edx # 1-byte Folded Reload +; AVX1-NEXT: movzbl %sil, %esi +; AVX1-NEXT: vpinsrb $10, %esi, %xmm5, %xmm5 +; AVX1-NEXT: vpextrb $11, %xmm3, %esi +; AVX1-NEXT: vpextrb $11, %xmm1, %edi +; AVX1-NEXT: subb %sil, %dil +; AVX1-NEXT: movl $0, %esi +; AVX1-NEXT: jb .LBB2_120 +; AVX1-NEXT: # %bb.119: +; AVX1-NEXT: movl %edi, %esi +; AVX1-NEXT: .LBB2_120: +; AVX1-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0 +; AVX1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX1-NEXT: vpinsrb $14, %ecx, %xmm2, %xmm2 +; AVX1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload +; AVX1-NEXT: vpinsrb $12, %edx, %xmm4, %xmm6 +; AVX1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %edx # 1-byte Folded Reload +; AVX1-NEXT: movzbl %sil, %esi +; AVX1-NEXT: vpinsrb $11, %esi, %xmm5, %xmm5 +; AVX1-NEXT: vpextrb $12, %xmm3, %esi +; AVX1-NEXT: vpextrb $12, %xmm1, %edi +; AVX1-NEXT: subb %sil, %dil +; AVX1-NEXT: movl $0, %esi +; AVX1-NEXT: jb .LBB2_122 +; AVX1-NEXT: # %bb.121: +; AVX1-NEXT: movl %edi, 
%esi +; AVX1-NEXT: .LBB2_122: +; AVX1-NEXT: vpinsrb $14, %eax, %xmm0, %xmm4 +; AVX1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX1-NEXT: vpinsrb $15, %ecx, %xmm2, %xmm0 +; AVX1-NEXT: vpinsrb $13, %edx, %xmm6, %xmm6 +; AVX1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload +; AVX1-NEXT: movzbl %sil, %edx +; AVX1-NEXT: vpinsrb $12, %edx, %xmm5, %xmm5 +; AVX1-NEXT: vpextrb $13, %xmm3, %edx +; AVX1-NEXT: vpextrb $13, %xmm1, %esi +; AVX1-NEXT: subb %dl, %sil +; AVX1-NEXT: movl $0, %edx +; AVX1-NEXT: jb .LBB2_124 +; AVX1-NEXT: # %bb.123: +; AVX1-NEXT: movl %esi, %edx +; AVX1-NEXT: .LBB2_124: +; AVX1-NEXT: vpinsrb $15, %eax, %xmm4, %xmm2 +; AVX1-NEXT: vpinsrb $14, %ecx, %xmm6, %xmm4 +; AVX1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX1-NEXT: movzbl %dl, %ecx +; AVX1-NEXT: vpinsrb $13, %ecx, %xmm5, %xmm5 +; AVX1-NEXT: vpextrb $14, %xmm3, %edx +; AVX1-NEXT: vpextrb $14, %xmm1, %ecx +; AVX1-NEXT: subb %dl, %cl +; AVX1-NEXT: movl $0, %edx +; AVX1-NEXT: leaq {{[0-9]+}}(%rsp), %rsp +; AVX1-NEXT: popq %rbx +; AVX1-NEXT: popq %r12 +; AVX1-NEXT: popq %r13 +; AVX1-NEXT: popq %r14 +; AVX1-NEXT: popq %r15 +; AVX1-NEXT: popq %rbp +; AVX1-NEXT: jb .LBB2_126 +; AVX1-NEXT: # %bb.125: +; AVX1-NEXT: movl %ecx, %edx +; AVX1-NEXT: .LBB2_126: +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX1-NEXT: vpinsrb $15, %eax, %xmm4, %xmm2 +; AVX1-NEXT: movzbl %dl, %eax +; AVX1-NEXT: vpinsrb $14, %eax, %xmm5, %xmm4 +; AVX1-NEXT: vpextrb $15, %xmm3, %ecx +; AVX1-NEXT: vpextrb $15, %xmm1, %eax +; AVX1-NEXT: subb %cl, %al +; AVX1-NEXT: movl $0, %ecx +; AVX1-NEXT: jb .LBB2_128 +; AVX1-NEXT: # %bb.127: +; AVX1-NEXT: movl %eax, %ecx +; AVX1-NEXT: .LBB2_128: +; AVX1-NEXT: movzbl %cl, %eax +; AVX1-NEXT: vpinsrb $15, %eax, %xmm4, %xmm1 +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 +; AVX1-NEXT: retq +; +; AVX2-LABEL: v64i8: +; AVX2: # %bb.0: +; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm4 +; AVX2-NEXT: vpextrb $1, %xmm4, %ecx +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm5 +; AVX2-NEXT: vpextrb $1, %xmm5, %eax +; AVX2-NEXT: subb %cl, %al +; AVX2-NEXT: movl $0, %ecx +; AVX2-NEXT: jb .LBB2_2 +; AVX2-NEXT: # %bb.1: +; AVX2-NEXT: movl %eax, %ecx +; AVX2-NEXT: .LBB2_2: +; AVX2-NEXT: pushq %rbp +; AVX2-NEXT: pushq %r15 +; AVX2-NEXT: pushq %r14 +; AVX2-NEXT: pushq %r13 +; AVX2-NEXT: pushq %r12 +; AVX2-NEXT: pushq %rbx +; AVX2-NEXT: subq $12, %rsp +; AVX2-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX2-NEXT: vpextrb $0, %xmm4, %ecx +; AVX2-NEXT: vpextrb $0, %xmm5, %eax +; AVX2-NEXT: subb %cl, %al +; AVX2-NEXT: movl $0, %edi +; AVX2-NEXT: jb .LBB2_4 +; AVX2-NEXT: # %bb.3: +; AVX2-NEXT: movl %eax, %edi +; AVX2-NEXT: .LBB2_4: +; AVX2-NEXT: vpextrb $2, %xmm4, %ecx +; AVX2-NEXT: vpextrb $2, %xmm5, %eax +; AVX2-NEXT: subb %cl, %al +; AVX2-NEXT: movl $0, %ecx +; AVX2-NEXT: jb .LBB2_6 +; AVX2-NEXT: # %bb.5: +; AVX2-NEXT: movl %eax, %ecx +; AVX2-NEXT: .LBB2_6: +; AVX2-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX2-NEXT: vpextrb $3, %xmm4, %ecx +; AVX2-NEXT: vpextrb $3, %xmm5, %eax +; AVX2-NEXT: subb %cl, %al +; AVX2-NEXT: movl $0, %ecx +; AVX2-NEXT: jb .LBB2_8 +; AVX2-NEXT: # %bb.7: +; AVX2-NEXT: movl %eax, %ecx +; AVX2-NEXT: .LBB2_8: +; AVX2-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX2-NEXT: vpextrb $4, %xmm4, %ecx +; AVX2-NEXT: vpextrb $4, %xmm5, %eax +; AVX2-NEXT: subb %cl, %al +; AVX2-NEXT: movl $0, %ecx +; AVX2-NEXT: jb .LBB2_10 +; AVX2-NEXT: # %bb.9: +; AVX2-NEXT: movl %eax, %ecx +; AVX2-NEXT: .LBB2_10: +; AVX2-NEXT: 
movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX2-NEXT: vpextrb $5, %xmm4, %ecx +; AVX2-NEXT: vpextrb $5, %xmm5, %eax +; AVX2-NEXT: subb %cl, %al +; AVX2-NEXT: movl $0, %ecx +; AVX2-NEXT: jb .LBB2_12 +; AVX2-NEXT: # %bb.11: +; AVX2-NEXT: movl %eax, %ecx +; AVX2-NEXT: .LBB2_12: +; AVX2-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX2-NEXT: vpextrb $6, %xmm4, %ecx +; AVX2-NEXT: vpextrb $6, %xmm5, %eax +; AVX2-NEXT: subb %cl, %al +; AVX2-NEXT: movl $0, %ecx +; AVX2-NEXT: jb .LBB2_14 +; AVX2-NEXT: # %bb.13: +; AVX2-NEXT: movl %eax, %ecx +; AVX2-NEXT: .LBB2_14: +; AVX2-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX2-NEXT: vpextrb $7, %xmm4, %ecx +; AVX2-NEXT: vpextrb $7, %xmm5, %eax +; AVX2-NEXT: subb %cl, %al +; AVX2-NEXT: movl $0, %ecx +; AVX2-NEXT: jb .LBB2_16 +; AVX2-NEXT: # %bb.15: +; AVX2-NEXT: movl %eax, %ecx +; AVX2-NEXT: .LBB2_16: +; AVX2-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX2-NEXT: vpextrb $8, %xmm4, %ecx +; AVX2-NEXT: vpextrb $8, %xmm5, %eax +; AVX2-NEXT: subb %cl, %al +; AVX2-NEXT: movl $0, %ecx +; AVX2-NEXT: jb .LBB2_18 +; AVX2-NEXT: # %bb.17: +; AVX2-NEXT: movl %eax, %ecx +; AVX2-NEXT: .LBB2_18: +; AVX2-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX2-NEXT: vpextrb $9, %xmm4, %ecx +; AVX2-NEXT: vpextrb $9, %xmm5, %eax +; AVX2-NEXT: subb %cl, %al +; AVX2-NEXT: movl $0, %ecx +; AVX2-NEXT: jb .LBB2_20 +; AVX2-NEXT: # %bb.19: +; AVX2-NEXT: movl %eax, %ecx +; AVX2-NEXT: .LBB2_20: +; AVX2-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX2-NEXT: vpextrb $10, %xmm4, %ecx +; AVX2-NEXT: vpextrb $10, %xmm5, %eax +; AVX2-NEXT: subb %cl, %al +; AVX2-NEXT: movl $0, %ecx +; AVX2-NEXT: jb .LBB2_22 +; AVX2-NEXT: # %bb.21: +; AVX2-NEXT: movl %eax, %ecx +; AVX2-NEXT: .LBB2_22: +; AVX2-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX2-NEXT: vpextrb $11, %xmm4, %ecx +; AVX2-NEXT: vpextrb $11, %xmm5, %eax +; AVX2-NEXT: subb %cl, %al +; AVX2-NEXT: movl $0, %ecx +; AVX2-NEXT: jb .LBB2_24 +; AVX2-NEXT: # %bb.23: +; AVX2-NEXT: movl %eax, %ecx +; AVX2-NEXT: .LBB2_24: +; AVX2-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX2-NEXT: vpextrb $12, %xmm4, %ecx +; AVX2-NEXT: vpextrb $12, %xmm5, %eax +; AVX2-NEXT: subb %cl, %al +; AVX2-NEXT: movl $0, %ecx +; AVX2-NEXT: jb .LBB2_26 +; AVX2-NEXT: # %bb.25: +; AVX2-NEXT: movl %eax, %ecx +; AVX2-NEXT: .LBB2_26: +; AVX2-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX2-NEXT: vpextrb $13, %xmm4, %ecx +; AVX2-NEXT: vpextrb $13, %xmm5, %eax +; AVX2-NEXT: subb %cl, %al +; AVX2-NEXT: movl $0, %ecx +; AVX2-NEXT: jb .LBB2_28 +; AVX2-NEXT: # %bb.27: +; AVX2-NEXT: movl %eax, %ecx +; AVX2-NEXT: .LBB2_28: +; AVX2-NEXT: movl %ecx, (%rsp) # 4-byte Spill +; AVX2-NEXT: vpextrb $14, %xmm4, %ecx +; AVX2-NEXT: vpextrb $14, %xmm5, %eax +; AVX2-NEXT: subb %cl, %al +; AVX2-NEXT: movl $0, %ecx +; AVX2-NEXT: jb .LBB2_30 +; AVX2-NEXT: # %bb.29: +; AVX2-NEXT: movl %eax, %ecx +; AVX2-NEXT: .LBB2_30: +; AVX2-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX2-NEXT: vpextrb $15, %xmm4, %ecx +; AVX2-NEXT: vpextrb $15, %xmm5, %eax +; AVX2-NEXT: subb %cl, %al +; AVX2-NEXT: movl $0, %ecx +; AVX2-NEXT: jb .LBB2_32 +; AVX2-NEXT: # %bb.31: +; AVX2-NEXT: movl %eax, %ecx +; AVX2-NEXT: .LBB2_32: +; AVX2-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX2-NEXT: vpextrb $1, %xmm2, %ecx +; AVX2-NEXT: vpextrb $1, %xmm0, %eax +; AVX2-NEXT: subb %cl, %al +; AVX2-NEXT: movl $0, %esi +; AVX2-NEXT: jb .LBB2_34 +; AVX2-NEXT: # %bb.33: +; AVX2-NEXT: 
movl %eax, %esi +; AVX2-NEXT: .LBB2_34: +; AVX2-NEXT: vpextrb $0, %xmm2, %ecx +; AVX2-NEXT: vpextrb $0, %xmm0, %eax +; AVX2-NEXT: subb %cl, %al +; AVX2-NEXT: movl $0, %ebx +; AVX2-NEXT: jb .LBB2_36 +; AVX2-NEXT: # %bb.35: +; AVX2-NEXT: movl %eax, %ebx +; AVX2-NEXT: .LBB2_36: +; AVX2-NEXT: vpextrb $2, %xmm2, %ecx +; AVX2-NEXT: vpextrb $2, %xmm0, %eax +; AVX2-NEXT: subb %cl, %al +; AVX2-NEXT: movl $0, %ecx +; AVX2-NEXT: jb .LBB2_38 +; AVX2-NEXT: # %bb.37: +; AVX2-NEXT: movl %eax, %ecx +; AVX2-NEXT: .LBB2_38: +; AVX2-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX2-NEXT: vpextrb $3, %xmm2, %ecx +; AVX2-NEXT: vpextrb $3, %xmm0, %eax +; AVX2-NEXT: subb %cl, %al +; AVX2-NEXT: movl $0, %ecx +; AVX2-NEXT: jb .LBB2_40 +; AVX2-NEXT: # %bb.39: +; AVX2-NEXT: movl %eax, %ecx +; AVX2-NEXT: .LBB2_40: +; AVX2-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX2-NEXT: vpextrb $4, %xmm2, %ecx +; AVX2-NEXT: vpextrb $4, %xmm0, %eax +; AVX2-NEXT: subb %cl, %al +; AVX2-NEXT: movl $0, %ecx +; AVX2-NEXT: jb .LBB2_42 +; AVX2-NEXT: # %bb.41: +; AVX2-NEXT: movl %eax, %ecx +; AVX2-NEXT: .LBB2_42: +; AVX2-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX2-NEXT: vpextrb $5, %xmm2, %ecx +; AVX2-NEXT: vpextrb $5, %xmm0, %eax +; AVX2-NEXT: subb %cl, %al +; AVX2-NEXT: movl $0, %ecx +; AVX2-NEXT: jb .LBB2_44 +; AVX2-NEXT: # %bb.43: +; AVX2-NEXT: movl %eax, %ecx +; AVX2-NEXT: .LBB2_44: +; AVX2-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX2-NEXT: vpextrb $6, %xmm2, %ecx +; AVX2-NEXT: vpextrb $6, %xmm0, %eax +; AVX2-NEXT: subb %cl, %al +; AVX2-NEXT: movl $0, %ecx +; AVX2-NEXT: jb .LBB2_46 +; AVX2-NEXT: # %bb.45: +; AVX2-NEXT: movl %eax, %ecx +; AVX2-NEXT: .LBB2_46: +; AVX2-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX2-NEXT: vpextrb $7, %xmm2, %ecx +; AVX2-NEXT: vpextrb $7, %xmm0, %eax +; AVX2-NEXT: subb %cl, %al +; AVX2-NEXT: movl $0, %ecx +; AVX2-NEXT: jb .LBB2_48 +; AVX2-NEXT: # %bb.47: +; AVX2-NEXT: movl %eax, %ecx +; AVX2-NEXT: .LBB2_48: +; AVX2-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX2-NEXT: vpextrb $8, %xmm2, %ecx +; AVX2-NEXT: vpextrb $8, %xmm0, %eax +; AVX2-NEXT: subb %cl, %al +; AVX2-NEXT: movl $0, %ecx +; AVX2-NEXT: jb .LBB2_50 +; AVX2-NEXT: # %bb.49: +; AVX2-NEXT: movl %eax, %ecx +; AVX2-NEXT: .LBB2_50: +; AVX2-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX2-NEXT: vpextrb $9, %xmm2, %ecx +; AVX2-NEXT: vpextrb $9, %xmm0, %eax +; AVX2-NEXT: subb %cl, %al +; AVX2-NEXT: movl $0, %ecx +; AVX2-NEXT: jb .LBB2_52 +; AVX2-NEXT: # %bb.51: +; AVX2-NEXT: movl %eax, %ecx +; AVX2-NEXT: .LBB2_52: +; AVX2-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX2-NEXT: vpextrb $10, %xmm2, %ecx +; AVX2-NEXT: vpextrb $10, %xmm0, %eax +; AVX2-NEXT: subb %cl, %al +; AVX2-NEXT: movl $0, %ecx +; AVX2-NEXT: jb .LBB2_54 +; AVX2-NEXT: # %bb.53: +; AVX2-NEXT: movl %eax, %ecx +; AVX2-NEXT: .LBB2_54: +; AVX2-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX2-NEXT: vpextrb $11, %xmm2, %ecx +; AVX2-NEXT: vpextrb $11, %xmm0, %eax +; AVX2-NEXT: subb %cl, %al +; AVX2-NEXT: movl $0, %ecx +; AVX2-NEXT: jb .LBB2_56 +; AVX2-NEXT: # %bb.55: +; AVX2-NEXT: movl %eax, %ecx +; AVX2-NEXT: .LBB2_56: +; AVX2-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX2-NEXT: vpextrb $12, %xmm2, %ecx +; AVX2-NEXT: vpextrb $12, %xmm0, %eax +; AVX2-NEXT: subb %cl, %al +; AVX2-NEXT: movl $0, %ecx +; AVX2-NEXT: jb .LBB2_58 +; AVX2-NEXT: # %bb.57: +; AVX2-NEXT: movl %eax, %ecx +; AVX2-NEXT: .LBB2_58: +; AVX2-NEXT: movl %ecx, 
{{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX2-NEXT: vpextrb $13, %xmm2, %ecx +; AVX2-NEXT: vpextrb $13, %xmm0, %eax +; AVX2-NEXT: subb %cl, %al +; AVX2-NEXT: movl $0, %ecx +; AVX2-NEXT: jb .LBB2_60 +; AVX2-NEXT: # %bb.59: +; AVX2-NEXT: movl %eax, %ecx +; AVX2-NEXT: .LBB2_60: +; AVX2-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX2-NEXT: vpextrb $14, %xmm2, %ecx +; AVX2-NEXT: vpextrb $14, %xmm0, %eax +; AVX2-NEXT: subb %cl, %al +; AVX2-NEXT: movl $0, %ecx +; AVX2-NEXT: jb .LBB2_62 +; AVX2-NEXT: # %bb.61: +; AVX2-NEXT: movl %eax, %ecx +; AVX2-NEXT: .LBB2_62: +; AVX2-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX2-NEXT: vpextrb $15, %xmm2, %ecx +; AVX2-NEXT: vpextrb $15, %xmm0, %eax +; AVX2-NEXT: subb %cl, %al +; AVX2-NEXT: movl $0, %ecx +; AVX2-NEXT: jb .LBB2_64 +; AVX2-NEXT: # %bb.63: +; AVX2-NEXT: movl %eax, %ecx +; AVX2-NEXT: .LBB2_64: +; AVX2-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm0 +; AVX2-NEXT: vpextrb $1, %xmm0, %ecx +; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX2-NEXT: vpextrb $1, %xmm2, %eax +; AVX2-NEXT: subb %cl, %al +; AVX2-NEXT: movl $0, %r9d +; AVX2-NEXT: jb .LBB2_66 +; AVX2-NEXT: # %bb.65: +; AVX2-NEXT: movl %eax, %r9d +; AVX2-NEXT: .LBB2_66: +; AVX2-NEXT: vpextrb $0, %xmm0, %ecx +; AVX2-NEXT: vpextrb $0, %xmm2, %eax +; AVX2-NEXT: subb %cl, %al +; AVX2-NEXT: movl $0, %edx +; AVX2-NEXT: jb .LBB2_68 +; AVX2-NEXT: # %bb.67: +; AVX2-NEXT: movl %eax, %edx +; AVX2-NEXT: .LBB2_68: +; AVX2-NEXT: vpextrb $2, %xmm0, %ecx +; AVX2-NEXT: vpextrb $2, %xmm2, %eax +; AVX2-NEXT: subb %cl, %al +; AVX2-NEXT: movl $0, %r10d +; AVX2-NEXT: jb .LBB2_70 +; AVX2-NEXT: # %bb.69: +; AVX2-NEXT: movl %eax, %r10d +; AVX2-NEXT: .LBB2_70: +; AVX2-NEXT: vpextrb $3, %xmm0, %ecx +; AVX2-NEXT: vpextrb $3, %xmm2, %eax +; AVX2-NEXT: subb %cl, %al +; AVX2-NEXT: movl $0, %r8d +; AVX2-NEXT: jb .LBB2_72 +; AVX2-NEXT: # %bb.71: +; AVX2-NEXT: movl %eax, %r8d +; AVX2-NEXT: .LBB2_72: +; AVX2-NEXT: vpextrb $4, %xmm0, %ecx +; AVX2-NEXT: vpextrb $4, %xmm2, %eax +; AVX2-NEXT: subb %cl, %al +; AVX2-NEXT: movl $0, %r11d +; AVX2-NEXT: jb .LBB2_74 +; AVX2-NEXT: # %bb.73: +; AVX2-NEXT: movl %eax, %r11d +; AVX2-NEXT: .LBB2_74: +; AVX2-NEXT: vpextrb $5, %xmm0, %ecx +; AVX2-NEXT: vpextrb $5, %xmm2, %eax +; AVX2-NEXT: subb %cl, %al +; AVX2-NEXT: movl $0, %r13d +; AVX2-NEXT: jb .LBB2_76 +; AVX2-NEXT: # %bb.75: +; AVX2-NEXT: movl %eax, %r13d +; AVX2-NEXT: .LBB2_76: +; AVX2-NEXT: vpextrb $6, %xmm0, %ecx +; AVX2-NEXT: vpextrb $6, %xmm2, %eax +; AVX2-NEXT: subb %cl, %al +; AVX2-NEXT: movl $0, %r12d +; AVX2-NEXT: jb .LBB2_78 +; AVX2-NEXT: # %bb.77: +; AVX2-NEXT: movl %eax, %r12d +; AVX2-NEXT: .LBB2_78: +; AVX2-NEXT: vpextrb $7, %xmm0, %ecx +; AVX2-NEXT: vpextrb $7, %xmm2, %eax +; AVX2-NEXT: subb %cl, %al +; AVX2-NEXT: movl $0, %r15d +; AVX2-NEXT: jb .LBB2_80 +; AVX2-NEXT: # %bb.79: +; AVX2-NEXT: movl %eax, %r15d +; AVX2-NEXT: .LBB2_80: +; AVX2-NEXT: vpextrb $8, %xmm0, %ecx +; AVX2-NEXT: vpextrb $8, %xmm2, %eax +; AVX2-NEXT: subb %cl, %al +; AVX2-NEXT: movl $0, %r14d +; AVX2-NEXT: jb .LBB2_82 +; AVX2-NEXT: # %bb.81: +; AVX2-NEXT: movl %eax, %r14d +; AVX2-NEXT: .LBB2_82: +; AVX2-NEXT: vpextrb $9, %xmm0, %ecx +; AVX2-NEXT: vpextrb $9, %xmm2, %eax +; AVX2-NEXT: subb %cl, %al +; AVX2-NEXT: movl $0, %ecx +; AVX2-NEXT: jb .LBB2_84 +; AVX2-NEXT: # %bb.83: +; AVX2-NEXT: movl %eax, %ecx +; AVX2-NEXT: .LBB2_84: +; AVX2-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX2-NEXT: vpextrb $10, %xmm0, %ecx +; AVX2-NEXT: vpextrb $10, %xmm2, %eax +; 
AVX2-NEXT: subb %cl, %al +; AVX2-NEXT: movl $0, %ecx +; AVX2-NEXT: jb .LBB2_86 +; AVX2-NEXT: # %bb.85: +; AVX2-NEXT: movl %eax, %ecx +; AVX2-NEXT: .LBB2_86: +; AVX2-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX2-NEXT: vpextrb $11, %xmm0, %ecx +; AVX2-NEXT: vpextrb $11, %xmm2, %eax +; AVX2-NEXT: subb %cl, %al +; AVX2-NEXT: movl $0, %ecx +; AVX2-NEXT: jb .LBB2_88 +; AVX2-NEXT: # %bb.87: +; AVX2-NEXT: movl %eax, %ecx +; AVX2-NEXT: .LBB2_88: +; AVX2-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX2-NEXT: vpextrb $12, %xmm0, %ecx +; AVX2-NEXT: vpextrb $12, %xmm2, %eax +; AVX2-NEXT: subb %cl, %al +; AVX2-NEXT: movl $0, %ecx +; AVX2-NEXT: jb .LBB2_90 +; AVX2-NEXT: # %bb.89: +; AVX2-NEXT: movl %eax, %ecx +; AVX2-NEXT: .LBB2_90: +; AVX2-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX2-NEXT: movzbl %bl, %ebp +; AVX2-NEXT: vpextrb $13, %xmm0, %ecx +; AVX2-NEXT: vpextrb $13, %xmm2, %eax +; AVX2-NEXT: subb %cl, %al +; AVX2-NEXT: movl $0, %ecx +; AVX2-NEXT: jb .LBB2_92 +; AVX2-NEXT: # %bb.91: +; AVX2-NEXT: movl %eax, %ecx +; AVX2-NEXT: .LBB2_92: +; AVX2-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX2-NEXT: movzbl %dil, %edi +; AVX2-NEXT: movzbl %sil, %ebx +; AVX2-NEXT: vmovd %ebp, %xmm4 +; AVX2-NEXT: vpextrb $14, %xmm0, %ecx +; AVX2-NEXT: vpextrb $14, %xmm2, %eax +; AVX2-NEXT: subb %cl, %al +; AVX2-NEXT: movl $0, %ecx +; AVX2-NEXT: jb .LBB2_94 +; AVX2-NEXT: # %bb.93: +; AVX2-NEXT: movl %eax, %ecx +; AVX2-NEXT: .LBB2_94: +; AVX2-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 1-byte Folded Reload +; AVX2-NEXT: vmovd %edi, %xmm5 +; AVX2-NEXT: vpinsrb $1, %ebx, %xmm4, %xmm4 +; AVX2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 1-byte Folded Reload +; AVX2-NEXT: movzbl %dl, %ebp +; AVX2-NEXT: vpextrb $15, %xmm0, %ecx +; AVX2-NEXT: vpextrb $15, %xmm2, %eax +; AVX2-NEXT: subb %cl, %al +; AVX2-NEXT: movl $0, %ecx +; AVX2-NEXT: jb .LBB2_96 +; AVX2-NEXT: # %bb.95: +; AVX2-NEXT: movl %eax, %ecx +; AVX2-NEXT: .LBB2_96: +; AVX2-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX2-NEXT: vpinsrb $1, %esi, %xmm5, %xmm0 +; AVX2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 1-byte Folded Reload +; AVX2-NEXT: vpinsrb $2, %edi, %xmm4, %xmm2 +; AVX2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ebx # 1-byte Folded Reload +; AVX2-NEXT: movzbl %r9b, %edx +; AVX2-NEXT: vmovd %ebp, %xmm4 +; AVX2-NEXT: vpextrb $1, %xmm3, %eax +; AVX2-NEXT: vpextrb $1, %xmm1, %edi +; AVX2-NEXT: subb %al, %dil +; AVX2-NEXT: movl $0, %eax +; AVX2-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 4-byte Reload +; AVX2-NEXT: jb .LBB2_98 +; AVX2-NEXT: # %bb.97: +; AVX2-NEXT: movl %edi, %eax +; AVX2-NEXT: .LBB2_98: +; AVX2-NEXT: vpinsrb $2, %esi, %xmm0, %xmm0 +; AVX2-NEXT: movzbl %cl, %esi +; AVX2-NEXT: vpinsrb $3, %ebx, %xmm2, %xmm2 +; AVX2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ebp # 1-byte Folded Reload +; AVX2-NEXT: vpinsrb $1, %edx, %xmm4, %xmm4 +; AVX2-NEXT: movzbl %r10b, %edi +; AVX2-NEXT: movzbl %al, %edx +; AVX2-NEXT: vpextrb $0, %xmm3, %ecx +; AVX2-NEXT: vpextrb $0, %xmm1, %eax +; AVX2-NEXT: subb %cl, %al +; AVX2-NEXT: movl $0, %ecx +; AVX2-NEXT: jb .LBB2_100 +; AVX2-NEXT: # %bb.99: +; AVX2-NEXT: movl %eax, %ecx +; AVX2-NEXT: .LBB2_100: +; AVX2-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; AVX2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 1-byte Folded Reload +; AVX2-NEXT: vpinsrb $4, %ebp, %xmm2, %xmm2 +; AVX2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ebx # 1-byte Folded Reload +; AVX2-NEXT: vpinsrb $2, %edi, %xmm4, 
%xmm4 +; AVX2-NEXT: movzbl %r8b, %ebp +; AVX2-NEXT: movzbl %cl, %eax +; AVX2-NEXT: vmovd %eax, %xmm5 +; AVX2-NEXT: vpinsrb $1, %edx, %xmm5, %xmm5 +; AVX2-NEXT: vpextrb $2, %xmm3, %eax +; AVX2-NEXT: vpextrb $2, %xmm1, %ecx +; AVX2-NEXT: subb %al, %cl +; AVX2-NEXT: movl $0, %eax +; AVX2-NEXT: jb .LBB2_102 +; AVX2-NEXT: # %bb.101: +; AVX2-NEXT: movl %ecx, %eax +; AVX2-NEXT: .LBB2_102: +; AVX2-NEXT: vpinsrb $4, %esi, %xmm0, %xmm0 +; AVX2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %edx # 1-byte Folded Reload +; AVX2-NEXT: vpinsrb $5, %ebx, %xmm2, %xmm2 +; AVX2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 1-byte Folded Reload +; AVX2-NEXT: vpinsrb $3, %ebp, %xmm4, %xmm4 +; AVX2-NEXT: movzbl %r11b, %ebp +; AVX2-NEXT: movzbl %al, %eax +; AVX2-NEXT: vpinsrb $2, %eax, %xmm5, %xmm5 +; AVX2-NEXT: vpextrb $3, %xmm3, %eax +; AVX2-NEXT: vpextrb $3, %xmm1, %ecx +; AVX2-NEXT: subb %al, %cl +; AVX2-NEXT: movl $0, %eax +; AVX2-NEXT: jb .LBB2_104 +; AVX2-NEXT: # %bb.103: +; AVX2-NEXT: movl %ecx, %eax +; AVX2-NEXT: .LBB2_104: +; AVX2-NEXT: vpinsrb $5, %edx, %xmm0, %xmm0 +; AVX2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload +; AVX2-NEXT: vpinsrb $6, %esi, %xmm2, %xmm2 +; AVX2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %edx # 1-byte Folded Reload +; AVX2-NEXT: vpinsrb $4, %ebp, %xmm4, %xmm4 +; AVX2-NEXT: movzbl %r13b, %esi +; AVX2-NEXT: movzbl %al, %eax +; AVX2-NEXT: vpinsrb $3, %eax, %xmm5, %xmm5 +; AVX2-NEXT: vpextrb $4, %xmm3, %edi +; AVX2-NEXT: vpextrb $4, %xmm1, %eax +; AVX2-NEXT: subb %dil, %al +; AVX2-NEXT: movl $0, %edi +; AVX2-NEXT: jb .LBB2_106 +; AVX2-NEXT: # %bb.105: +; AVX2-NEXT: movl %eax, %edi +; AVX2-NEXT: .LBB2_106: +; AVX2-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; AVX2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX2-NEXT: vpinsrb $7, %edx, %xmm2, %xmm2 +; AVX2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload +; AVX2-NEXT: vpinsrb $5, %esi, %xmm4, %xmm4 +; AVX2-NEXT: movzbl %r12b, %edx +; AVX2-NEXT: movzbl %dil, %esi +; AVX2-NEXT: vpinsrb $4, %esi, %xmm5, %xmm5 +; AVX2-NEXT: vpextrb $5, %xmm3, %esi +; AVX2-NEXT: vpextrb $5, %xmm1, %edi +; AVX2-NEXT: subb %sil, %dil +; AVX2-NEXT: movl $0, %esi +; AVX2-NEXT: jb .LBB2_108 +; AVX2-NEXT: # %bb.107: +; AVX2-NEXT: movl %edi, %esi +; AVX2-NEXT: .LBB2_108: +; AVX2-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; AVX2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX2-NEXT: vpinsrb $8, %ecx, %xmm2, %xmm2 +; AVX2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload +; AVX2-NEXT: vpinsrb $6, %edx, %xmm4, %xmm4 +; AVX2-NEXT: movzbl %r15b, %edx +; AVX2-NEXT: movzbl %sil, %esi +; AVX2-NEXT: vpinsrb $5, %esi, %xmm5, %xmm5 +; AVX2-NEXT: vpextrb $6, %xmm3, %esi +; AVX2-NEXT: vpextrb $6, %xmm1, %edi +; AVX2-NEXT: subb %sil, %dil +; AVX2-NEXT: movl $0, %esi +; AVX2-NEXT: jb .LBB2_110 +; AVX2-NEXT: # %bb.109: +; AVX2-NEXT: movl %edi, %esi +; AVX2-NEXT: .LBB2_110: +; AVX2-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0 +; AVX2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX2-NEXT: vpinsrb $9, %ecx, %xmm2, %xmm2 +; AVX2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload +; AVX2-NEXT: vpinsrb $7, %edx, %xmm4, %xmm4 +; AVX2-NEXT: movzbl %r14b, %edx +; AVX2-NEXT: movzbl %sil, %esi +; AVX2-NEXT: vpinsrb $6, %esi, %xmm5, %xmm5 +; AVX2-NEXT: vpextrb $7, %xmm3, %esi +; AVX2-NEXT: vpextrb $7, %xmm1, %edi +; AVX2-NEXT: subb %sil, %dil +; AVX2-NEXT: movl $0, %esi +; AVX2-NEXT: jb .LBB2_112 +; AVX2-NEXT: # %bb.111: +; AVX2-NEXT: movl %edi, %esi +; AVX2-NEXT: 
.LBB2_112: +; AVX2-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0 +; AVX2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX2-NEXT: vpinsrb $10, %ecx, %xmm2, %xmm2 +; AVX2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload +; AVX2-NEXT: vpinsrb $8, %edx, %xmm4, %xmm4 +; AVX2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %edx # 1-byte Folded Reload +; AVX2-NEXT: movzbl %sil, %esi +; AVX2-NEXT: vpinsrb $7, %esi, %xmm5, %xmm5 +; AVX2-NEXT: vpextrb $8, %xmm3, %esi +; AVX2-NEXT: vpextrb $8, %xmm1, %edi +; AVX2-NEXT: subb %sil, %dil +; AVX2-NEXT: movl $0, %esi +; AVX2-NEXT: jb .LBB2_114 +; AVX2-NEXT: # %bb.113: +; AVX2-NEXT: movl %edi, %esi +; AVX2-NEXT: .LBB2_114: +; AVX2-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0 +; AVX2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX2-NEXT: vpinsrb $11, %ecx, %xmm2, %xmm2 +; AVX2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload +; AVX2-NEXT: vpinsrb $9, %edx, %xmm4, %xmm4 +; AVX2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %edx # 1-byte Folded Reload +; AVX2-NEXT: movzbl %sil, %esi +; AVX2-NEXT: vpinsrb $8, %esi, %xmm5, %xmm5 +; AVX2-NEXT: vpextrb $9, %xmm3, %esi +; AVX2-NEXT: vpextrb $9, %xmm1, %edi +; AVX2-NEXT: subb %sil, %dil +; AVX2-NEXT: movl $0, %esi +; AVX2-NEXT: jb .LBB2_116 +; AVX2-NEXT: # %bb.115: +; AVX2-NEXT: movl %edi, %esi +; AVX2-NEXT: .LBB2_116: +; AVX2-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; AVX2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX2-NEXT: vpinsrb $12, %ecx, %xmm2, %xmm2 +; AVX2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload +; AVX2-NEXT: vpinsrb $10, %edx, %xmm4, %xmm4 +; AVX2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %edx # 1-byte Folded Reload +; AVX2-NEXT: movzbl %sil, %esi +; AVX2-NEXT: vpinsrb $9, %esi, %xmm5, %xmm5 +; AVX2-NEXT: vpextrb $10, %xmm3, %esi +; AVX2-NEXT: vpextrb $10, %xmm1, %edi +; AVX2-NEXT: subb %sil, %dil +; AVX2-NEXT: movl $0, %esi +; AVX2-NEXT: jb .LBB2_118 +; AVX2-NEXT: # %bb.117: +; AVX2-NEXT: movl %edi, %esi +; AVX2-NEXT: .LBB2_118: +; AVX2-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0 +; AVX2-NEXT: movzbl (%rsp), %eax # 1-byte Folded Reload +; AVX2-NEXT: vpinsrb $13, %ecx, %xmm2, %xmm2 +; AVX2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload +; AVX2-NEXT: vpinsrb $11, %edx, %xmm4, %xmm4 +; AVX2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %edx # 1-byte Folded Reload +; AVX2-NEXT: movzbl %sil, %esi +; AVX2-NEXT: vpinsrb $10, %esi, %xmm5, %xmm5 +; AVX2-NEXT: vpextrb $11, %xmm3, %esi +; AVX2-NEXT: vpextrb $11, %xmm1, %edi +; AVX2-NEXT: subb %sil, %dil +; AVX2-NEXT: movl $0, %esi +; AVX2-NEXT: jb .LBB2_120 +; AVX2-NEXT: # %bb.119: +; AVX2-NEXT: movl %edi, %esi +; AVX2-NEXT: .LBB2_120: +; AVX2-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0 +; AVX2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX2-NEXT: vpinsrb $14, %ecx, %xmm2, %xmm2 +; AVX2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload +; AVX2-NEXT: vpinsrb $12, %edx, %xmm4, %xmm6 +; AVX2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %edx # 1-byte Folded Reload +; AVX2-NEXT: movzbl %sil, %esi +; AVX2-NEXT: vpinsrb $11, %esi, %xmm5, %xmm5 +; AVX2-NEXT: vpextrb $12, %xmm3, %esi +; AVX2-NEXT: vpextrb $12, %xmm1, %edi +; AVX2-NEXT: subb %sil, %dil +; AVX2-NEXT: movl $0, %esi +; AVX2-NEXT: jb .LBB2_122 +; AVX2-NEXT: # %bb.121: +; AVX2-NEXT: movl %edi, %esi +; AVX2-NEXT: .LBB2_122: +; AVX2-NEXT: vpinsrb $14, %eax, %xmm0, %xmm4 +; AVX2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX2-NEXT: vpinsrb $15, 
%ecx, %xmm2, %xmm0 +; AVX2-NEXT: vpinsrb $13, %edx, %xmm6, %xmm6 +; AVX2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload +; AVX2-NEXT: movzbl %sil, %edx +; AVX2-NEXT: vpinsrb $12, %edx, %xmm5, %xmm5 +; AVX2-NEXT: vpextrb $13, %xmm3, %edx +; AVX2-NEXT: vpextrb $13, %xmm1, %esi +; AVX2-NEXT: subb %dl, %sil +; AVX2-NEXT: movl $0, %edx +; AVX2-NEXT: jb .LBB2_124 +; AVX2-NEXT: # %bb.123: +; AVX2-NEXT: movl %esi, %edx +; AVX2-NEXT: .LBB2_124: +; AVX2-NEXT: vpinsrb $15, %eax, %xmm4, %xmm2 +; AVX2-NEXT: vpinsrb $14, %ecx, %xmm6, %xmm4 +; AVX2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX2-NEXT: movzbl %dl, %ecx +; AVX2-NEXT: vpinsrb $13, %ecx, %xmm5, %xmm5 +; AVX2-NEXT: vpextrb $14, %xmm3, %edx +; AVX2-NEXT: vpextrb $14, %xmm1, %ecx +; AVX2-NEXT: subb %dl, %cl +; AVX2-NEXT: movl $0, %edx +; AVX2-NEXT: leaq {{[0-9]+}}(%rsp), %rsp +; AVX2-NEXT: popq %rbx +; AVX2-NEXT: popq %r12 +; AVX2-NEXT: popq %r13 +; AVX2-NEXT: popq %r14 +; AVX2-NEXT: popq %r15 +; AVX2-NEXT: popq %rbp +; AVX2-NEXT: jb .LBB2_126 +; AVX2-NEXT: # %bb.125: +; AVX2-NEXT: movl %ecx, %edx +; AVX2-NEXT: .LBB2_126: +; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 +; AVX2-NEXT: vpinsrb $15, %eax, %xmm4, %xmm2 +; AVX2-NEXT: movzbl %dl, %eax +; AVX2-NEXT: vpinsrb $14, %eax, %xmm5, %xmm4 +; AVX2-NEXT: vpextrb $15, %xmm3, %ecx +; AVX2-NEXT: vpextrb $15, %xmm1, %eax +; AVX2-NEXT: subb %cl, %al +; AVX2-NEXT: movl $0, %ecx +; AVX2-NEXT: jb .LBB2_128 +; AVX2-NEXT: # %bb.127: +; AVX2-NEXT: movl %eax, %ecx +; AVX2-NEXT: .LBB2_128: +; AVX2-NEXT: movzbl %cl, %eax +; AVX2-NEXT: vpinsrb $15, %eax, %xmm4, %xmm1 +; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 +; AVX2-NEXT: retq +; +; AVX512-LABEL: v64i8: +; AVX512: # %bb.0: +; AVX512-NEXT: vextracti32x4 $3, %zmm1, %xmm2 +; AVX512-NEXT: vpextrb $1, %xmm2, %ecx +; AVX512-NEXT: vextracti32x4 $3, %zmm0, %xmm3 +; AVX512-NEXT: vpextrb $1, %xmm3, %eax +; AVX512-NEXT: subb %cl, %al +; AVX512-NEXT: movl $0, %ecx +; AVX512-NEXT: jb .LBB2_2 +; AVX512-NEXT: # %bb.1: +; AVX512-NEXT: movl %eax, %ecx +; AVX512-NEXT: .LBB2_2: +; AVX512-NEXT: pushq %rbp +; AVX512-NEXT: pushq %r15 +; AVX512-NEXT: pushq %r14 +; AVX512-NEXT: pushq %r13 +; AVX512-NEXT: pushq %r12 +; AVX512-NEXT: pushq %rbx +; AVX512-NEXT: subq $12, %rsp +; AVX512-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX512-NEXT: vpextrb $0, %xmm2, %ecx +; AVX512-NEXT: vpextrb $0, %xmm3, %eax +; AVX512-NEXT: subb %cl, %al +; AVX512-NEXT: movl $0, %edi +; AVX512-NEXT: jb .LBB2_4 +; AVX512-NEXT: # %bb.3: +; AVX512-NEXT: movl %eax, %edi +; AVX512-NEXT: .LBB2_4: +; AVX512-NEXT: vpextrb $2, %xmm2, %ecx +; AVX512-NEXT: vpextrb $2, %xmm3, %eax +; AVX512-NEXT: subb %cl, %al +; AVX512-NEXT: movl $0, %ecx +; AVX512-NEXT: jb .LBB2_6 +; AVX512-NEXT: # %bb.5: +; AVX512-NEXT: movl %eax, %ecx +; AVX512-NEXT: .LBB2_6: +; AVX512-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX512-NEXT: vpextrb $3, %xmm2, %ecx +; AVX512-NEXT: vpextrb $3, %xmm3, %eax +; AVX512-NEXT: subb %cl, %al +; AVX512-NEXT: movl $0, %ecx +; AVX512-NEXT: jb .LBB2_8 +; AVX512-NEXT: # %bb.7: +; AVX512-NEXT: movl %eax, %ecx +; AVX512-NEXT: .LBB2_8: +; AVX512-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX512-NEXT: vpextrb $4, %xmm2, %ecx +; AVX512-NEXT: vpextrb $4, %xmm3, %eax +; AVX512-NEXT: subb %cl, %al +; AVX512-NEXT: movl $0, %ecx +; AVX512-NEXT: jb .LBB2_10 +; AVX512-NEXT: # %bb.9: +; AVX512-NEXT: movl %eax, %ecx +; AVX512-NEXT: .LBB2_10: +; AVX512-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; 
AVX512-NEXT: vpextrb $5, %xmm2, %ecx +; AVX512-NEXT: vpextrb $5, %xmm3, %eax +; AVX512-NEXT: subb %cl, %al +; AVX512-NEXT: movl $0, %ecx +; AVX512-NEXT: jb .LBB2_12 +; AVX512-NEXT: # %bb.11: +; AVX512-NEXT: movl %eax, %ecx +; AVX512-NEXT: .LBB2_12: +; AVX512-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX512-NEXT: vpextrb $6, %xmm2, %ecx +; AVX512-NEXT: vpextrb $6, %xmm3, %eax +; AVX512-NEXT: subb %cl, %al +; AVX512-NEXT: movl $0, %ecx +; AVX512-NEXT: jb .LBB2_14 +; AVX512-NEXT: # %bb.13: +; AVX512-NEXT: movl %eax, %ecx +; AVX512-NEXT: .LBB2_14: +; AVX512-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX512-NEXT: vpextrb $7, %xmm2, %ecx +; AVX512-NEXT: vpextrb $7, %xmm3, %eax +; AVX512-NEXT: subb %cl, %al +; AVX512-NEXT: movl $0, %ecx +; AVX512-NEXT: jb .LBB2_16 +; AVX512-NEXT: # %bb.15: +; AVX512-NEXT: movl %eax, %ecx +; AVX512-NEXT: .LBB2_16: +; AVX512-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX512-NEXT: vpextrb $8, %xmm2, %ecx +; AVX512-NEXT: vpextrb $8, %xmm3, %eax +; AVX512-NEXT: subb %cl, %al +; AVX512-NEXT: movl $0, %ecx +; AVX512-NEXT: jb .LBB2_18 +; AVX512-NEXT: # %bb.17: +; AVX512-NEXT: movl %eax, %ecx +; AVX512-NEXT: .LBB2_18: +; AVX512-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX512-NEXT: vpextrb $9, %xmm2, %ecx +; AVX512-NEXT: vpextrb $9, %xmm3, %eax +; AVX512-NEXT: subb %cl, %al +; AVX512-NEXT: movl $0, %ecx +; AVX512-NEXT: jb .LBB2_20 +; AVX512-NEXT: # %bb.19: +; AVX512-NEXT: movl %eax, %ecx +; AVX512-NEXT: .LBB2_20: +; AVX512-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX512-NEXT: vpextrb $10, %xmm2, %ecx +; AVX512-NEXT: vpextrb $10, %xmm3, %eax +; AVX512-NEXT: subb %cl, %al +; AVX512-NEXT: movl $0, %ecx +; AVX512-NEXT: jb .LBB2_22 +; AVX512-NEXT: # %bb.21: +; AVX512-NEXT: movl %eax, %ecx +; AVX512-NEXT: .LBB2_22: +; AVX512-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX512-NEXT: vpextrb $11, %xmm2, %ecx +; AVX512-NEXT: vpextrb $11, %xmm3, %eax +; AVX512-NEXT: subb %cl, %al +; AVX512-NEXT: movl $0, %ecx +; AVX512-NEXT: jb .LBB2_24 +; AVX512-NEXT: # %bb.23: +; AVX512-NEXT: movl %eax, %ecx +; AVX512-NEXT: .LBB2_24: +; AVX512-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX512-NEXT: vpextrb $12, %xmm2, %ecx +; AVX512-NEXT: vpextrb $12, %xmm3, %eax +; AVX512-NEXT: subb %cl, %al +; AVX512-NEXT: movl $0, %ecx +; AVX512-NEXT: jb .LBB2_26 +; AVX512-NEXT: # %bb.25: +; AVX512-NEXT: movl %eax, %ecx +; AVX512-NEXT: .LBB2_26: +; AVX512-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX512-NEXT: vpextrb $13, %xmm2, %ecx +; AVX512-NEXT: vpextrb $13, %xmm3, %eax +; AVX512-NEXT: subb %cl, %al +; AVX512-NEXT: movl $0, %ecx +; AVX512-NEXT: jb .LBB2_28 +; AVX512-NEXT: # %bb.27: +; AVX512-NEXT: movl %eax, %ecx +; AVX512-NEXT: .LBB2_28: +; AVX512-NEXT: movl %ecx, (%rsp) # 4-byte Spill +; AVX512-NEXT: vpextrb $14, %xmm2, %ecx +; AVX512-NEXT: vpextrb $14, %xmm3, %eax +; AVX512-NEXT: subb %cl, %al +; AVX512-NEXT: movl $0, %ecx +; AVX512-NEXT: jb .LBB2_30 +; AVX512-NEXT: # %bb.29: +; AVX512-NEXT: movl %eax, %ecx +; AVX512-NEXT: .LBB2_30: +; AVX512-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX512-NEXT: vpextrb $15, %xmm2, %ecx +; AVX512-NEXT: vpextrb $15, %xmm3, %eax +; AVX512-NEXT: subb %cl, %al +; AVX512-NEXT: movl $0, %ecx +; AVX512-NEXT: jb .LBB2_32 +; AVX512-NEXT: # %bb.31: +; AVX512-NEXT: movl %eax, %ecx +; AVX512-NEXT: .LBB2_32: +; AVX512-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX512-NEXT: vextracti32x4 $2, %zmm1, %xmm2 +; 
AVX512-NEXT: vpextrb $1, %xmm2, %ecx +; AVX512-NEXT: vextracti32x4 $2, %zmm0, %xmm3 +; AVX512-NEXT: vpextrb $1, %xmm3, %eax +; AVX512-NEXT: subb %cl, %al +; AVX512-NEXT: movl $0, %esi +; AVX512-NEXT: jb .LBB2_34 +; AVX512-NEXT: # %bb.33: +; AVX512-NEXT: movl %eax, %esi +; AVX512-NEXT: .LBB2_34: +; AVX512-NEXT: vpextrb $0, %xmm2, %ecx +; AVX512-NEXT: vpextrb $0, %xmm3, %eax +; AVX512-NEXT: subb %cl, %al +; AVX512-NEXT: movl $0, %ebx +; AVX512-NEXT: jb .LBB2_36 +; AVX512-NEXT: # %bb.35: +; AVX512-NEXT: movl %eax, %ebx +; AVX512-NEXT: .LBB2_36: +; AVX512-NEXT: vpextrb $2, %xmm2, %ecx +; AVX512-NEXT: vpextrb $2, %xmm3, %eax +; AVX512-NEXT: subb %cl, %al +; AVX512-NEXT: movl $0, %ecx +; AVX512-NEXT: jb .LBB2_38 +; AVX512-NEXT: # %bb.37: +; AVX512-NEXT: movl %eax, %ecx +; AVX512-NEXT: .LBB2_38: +; AVX512-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX512-NEXT: vpextrb $3, %xmm2, %ecx +; AVX512-NEXT: vpextrb $3, %xmm3, %eax +; AVX512-NEXT: subb %cl, %al +; AVX512-NEXT: movl $0, %ecx +; AVX512-NEXT: jb .LBB2_40 +; AVX512-NEXT: # %bb.39: +; AVX512-NEXT: movl %eax, %ecx +; AVX512-NEXT: .LBB2_40: +; AVX512-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX512-NEXT: vpextrb $4, %xmm2, %ecx +; AVX512-NEXT: vpextrb $4, %xmm3, %eax +; AVX512-NEXT: subb %cl, %al +; AVX512-NEXT: movl $0, %ecx +; AVX512-NEXT: jb .LBB2_42 +; AVX512-NEXT: # %bb.41: +; AVX512-NEXT: movl %eax, %ecx +; AVX512-NEXT: .LBB2_42: +; AVX512-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX512-NEXT: vpextrb $5, %xmm2, %ecx +; AVX512-NEXT: vpextrb $5, %xmm3, %eax +; AVX512-NEXT: subb %cl, %al +; AVX512-NEXT: movl $0, %ecx +; AVX512-NEXT: jb .LBB2_44 +; AVX512-NEXT: # %bb.43: +; AVX512-NEXT: movl %eax, %ecx +; AVX512-NEXT: .LBB2_44: +; AVX512-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX512-NEXT: vpextrb $6, %xmm2, %ecx +; AVX512-NEXT: vpextrb $6, %xmm3, %eax +; AVX512-NEXT: subb %cl, %al +; AVX512-NEXT: movl $0, %ecx +; AVX512-NEXT: jb .LBB2_46 +; AVX512-NEXT: # %bb.45: +; AVX512-NEXT: movl %eax, %ecx +; AVX512-NEXT: .LBB2_46: +; AVX512-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX512-NEXT: vpextrb $7, %xmm2, %ecx +; AVX512-NEXT: vpextrb $7, %xmm3, %eax +; AVX512-NEXT: subb %cl, %al +; AVX512-NEXT: movl $0, %ecx +; AVX512-NEXT: jb .LBB2_48 +; AVX512-NEXT: # %bb.47: +; AVX512-NEXT: movl %eax, %ecx +; AVX512-NEXT: .LBB2_48: +; AVX512-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX512-NEXT: vpextrb $8, %xmm2, %ecx +; AVX512-NEXT: vpextrb $8, %xmm3, %eax +; AVX512-NEXT: subb %cl, %al +; AVX512-NEXT: movl $0, %ecx +; AVX512-NEXT: jb .LBB2_50 +; AVX512-NEXT: # %bb.49: +; AVX512-NEXT: movl %eax, %ecx +; AVX512-NEXT: .LBB2_50: +; AVX512-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX512-NEXT: vpextrb $9, %xmm2, %ecx +; AVX512-NEXT: vpextrb $9, %xmm3, %eax +; AVX512-NEXT: subb %cl, %al +; AVX512-NEXT: movl $0, %ecx +; AVX512-NEXT: jb .LBB2_52 +; AVX512-NEXT: # %bb.51: +; AVX512-NEXT: movl %eax, %ecx +; AVX512-NEXT: .LBB2_52: +; AVX512-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX512-NEXT: vpextrb $10, %xmm2, %ecx +; AVX512-NEXT: vpextrb $10, %xmm3, %eax +; AVX512-NEXT: subb %cl, %al +; AVX512-NEXT: movl $0, %ecx +; AVX512-NEXT: jb .LBB2_54 +; AVX512-NEXT: # %bb.53: +; AVX512-NEXT: movl %eax, %ecx +; AVX512-NEXT: .LBB2_54: +; AVX512-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX512-NEXT: vpextrb $11, %xmm2, %ecx +; AVX512-NEXT: vpextrb $11, %xmm3, %eax +; AVX512-NEXT: subb %cl, %al +; AVX512-NEXT: movl 
$0, %ecx +; AVX512-NEXT: jb .LBB2_56 +; AVX512-NEXT: # %bb.55: +; AVX512-NEXT: movl %eax, %ecx +; AVX512-NEXT: .LBB2_56: +; AVX512-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX512-NEXT: vpextrb $12, %xmm2, %ecx +; AVX512-NEXT: vpextrb $12, %xmm3, %eax +; AVX512-NEXT: subb %cl, %al +; AVX512-NEXT: movl $0, %ecx +; AVX512-NEXT: jb .LBB2_58 +; AVX512-NEXT: # %bb.57: +; AVX512-NEXT: movl %eax, %ecx +; AVX512-NEXT: .LBB2_58: +; AVX512-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX512-NEXT: vpextrb $13, %xmm2, %ecx +; AVX512-NEXT: vpextrb $13, %xmm3, %eax +; AVX512-NEXT: subb %cl, %al +; AVX512-NEXT: movl $0, %ecx +; AVX512-NEXT: jb .LBB2_60 +; AVX512-NEXT: # %bb.59: +; AVX512-NEXT: movl %eax, %ecx +; AVX512-NEXT: .LBB2_60: +; AVX512-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX512-NEXT: vpextrb $14, %xmm2, %ecx +; AVX512-NEXT: vpextrb $14, %xmm3, %eax +; AVX512-NEXT: subb %cl, %al +; AVX512-NEXT: movl $0, %ecx +; AVX512-NEXT: jb .LBB2_62 +; AVX512-NEXT: # %bb.61: +; AVX512-NEXT: movl %eax, %ecx +; AVX512-NEXT: .LBB2_62: +; AVX512-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX512-NEXT: vpextrb $15, %xmm2, %ecx +; AVX512-NEXT: vpextrb $15, %xmm3, %eax +; AVX512-NEXT: subb %cl, %al +; AVX512-NEXT: movl $0, %ecx +; AVX512-NEXT: jb .LBB2_64 +; AVX512-NEXT: # %bb.63: +; AVX512-NEXT: movl %eax, %ecx +; AVX512-NEXT: .LBB2_64: +; AVX512-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX512-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX512-NEXT: vpextrb $1, %xmm2, %ecx +; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm3 +; AVX512-NEXT: vpextrb $1, %xmm3, %eax +; AVX512-NEXT: subb %cl, %al +; AVX512-NEXT: movl $0, %r9d +; AVX512-NEXT: jb .LBB2_66 +; AVX512-NEXT: # %bb.65: +; AVX512-NEXT: movl %eax, %r9d +; AVX512-NEXT: .LBB2_66: +; AVX512-NEXT: vpextrb $0, %xmm2, %ecx +; AVX512-NEXT: vpextrb $0, %xmm3, %eax +; AVX512-NEXT: subb %cl, %al +; AVX512-NEXT: movl $0, %edx +; AVX512-NEXT: jb .LBB2_68 +; AVX512-NEXT: # %bb.67: +; AVX512-NEXT: movl %eax, %edx +; AVX512-NEXT: .LBB2_68: +; AVX512-NEXT: vpextrb $2, %xmm2, %ecx +; AVX512-NEXT: vpextrb $2, %xmm3, %eax +; AVX512-NEXT: subb %cl, %al +; AVX512-NEXT: movl $0, %r10d +; AVX512-NEXT: jb .LBB2_70 +; AVX512-NEXT: # %bb.69: +; AVX512-NEXT: movl %eax, %r10d +; AVX512-NEXT: .LBB2_70: +; AVX512-NEXT: vpextrb $3, %xmm2, %ecx +; AVX512-NEXT: vpextrb $3, %xmm3, %eax +; AVX512-NEXT: subb %cl, %al +; AVX512-NEXT: movl $0, %r8d +; AVX512-NEXT: jb .LBB2_72 +; AVX512-NEXT: # %bb.71: +; AVX512-NEXT: movl %eax, %r8d +; AVX512-NEXT: .LBB2_72: +; AVX512-NEXT: vpextrb $4, %xmm2, %ecx +; AVX512-NEXT: vpextrb $4, %xmm3, %eax +; AVX512-NEXT: subb %cl, %al +; AVX512-NEXT: movl $0, %r11d +; AVX512-NEXT: jb .LBB2_74 +; AVX512-NEXT: # %bb.73: +; AVX512-NEXT: movl %eax, %r11d +; AVX512-NEXT: .LBB2_74: +; AVX512-NEXT: vpextrb $5, %xmm2, %ecx +; AVX512-NEXT: vpextrb $5, %xmm3, %eax +; AVX512-NEXT: subb %cl, %al +; AVX512-NEXT: movl $0, %r13d +; AVX512-NEXT: jb .LBB2_76 +; AVX512-NEXT: # %bb.75: +; AVX512-NEXT: movl %eax, %r13d +; AVX512-NEXT: .LBB2_76: +; AVX512-NEXT: vpextrb $6, %xmm2, %ecx +; AVX512-NEXT: vpextrb $6, %xmm3, %eax +; AVX512-NEXT: subb %cl, %al +; AVX512-NEXT: movl $0, %r12d +; AVX512-NEXT: jb .LBB2_78 +; AVX512-NEXT: # %bb.77: +; AVX512-NEXT: movl %eax, %r12d +; AVX512-NEXT: .LBB2_78: +; AVX512-NEXT: vpextrb $7, %xmm2, %ecx +; AVX512-NEXT: vpextrb $7, %xmm3, %eax +; AVX512-NEXT: subb %cl, %al +; AVX512-NEXT: movl $0, %r15d +; AVX512-NEXT: jb .LBB2_80 +; AVX512-NEXT: # %bb.79: +; AVX512-NEXT: movl 
%eax, %r15d +; AVX512-NEXT: .LBB2_80: +; AVX512-NEXT: vpextrb $8, %xmm2, %ecx +; AVX512-NEXT: vpextrb $8, %xmm3, %eax +; AVX512-NEXT: subb %cl, %al +; AVX512-NEXT: movl $0, %r14d +; AVX512-NEXT: jb .LBB2_82 +; AVX512-NEXT: # %bb.81: +; AVX512-NEXT: movl %eax, %r14d +; AVX512-NEXT: .LBB2_82: +; AVX512-NEXT: vpextrb $9, %xmm2, %ecx +; AVX512-NEXT: vpextrb $9, %xmm3, %eax +; AVX512-NEXT: subb %cl, %al +; AVX512-NEXT: movl $0, %ecx +; AVX512-NEXT: jb .LBB2_84 +; AVX512-NEXT: # %bb.83: +; AVX512-NEXT: movl %eax, %ecx +; AVX512-NEXT: .LBB2_84: +; AVX512-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX512-NEXT: vpextrb $10, %xmm2, %ecx +; AVX512-NEXT: vpextrb $10, %xmm3, %eax +; AVX512-NEXT: subb %cl, %al +; AVX512-NEXT: movl $0, %ecx +; AVX512-NEXT: jb .LBB2_86 +; AVX512-NEXT: # %bb.85: +; AVX512-NEXT: movl %eax, %ecx +; AVX512-NEXT: .LBB2_86: +; AVX512-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX512-NEXT: vpextrb $11, %xmm2, %ecx +; AVX512-NEXT: vpextrb $11, %xmm3, %eax +; AVX512-NEXT: subb %cl, %al +; AVX512-NEXT: movl $0, %ecx +; AVX512-NEXT: jb .LBB2_88 +; AVX512-NEXT: # %bb.87: +; AVX512-NEXT: movl %eax, %ecx +; AVX512-NEXT: .LBB2_88: +; AVX512-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX512-NEXT: vpextrb $12, %xmm2, %ecx +; AVX512-NEXT: vpextrb $12, %xmm3, %eax +; AVX512-NEXT: subb %cl, %al +; AVX512-NEXT: movl $0, %ecx +; AVX512-NEXT: jb .LBB2_90 +; AVX512-NEXT: # %bb.89: +; AVX512-NEXT: movl %eax, %ecx +; AVX512-NEXT: .LBB2_90: +; AVX512-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX512-NEXT: movzbl %bl, %ebp +; AVX512-NEXT: vpextrb $13, %xmm2, %ecx +; AVX512-NEXT: vpextrb $13, %xmm3, %eax +; AVX512-NEXT: subb %cl, %al +; AVX512-NEXT: movl $0, %ecx +; AVX512-NEXT: jb .LBB2_92 +; AVX512-NEXT: # %bb.91: +; AVX512-NEXT: movl %eax, %ecx +; AVX512-NEXT: .LBB2_92: +; AVX512-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX512-NEXT: movzbl %dil, %edi +; AVX512-NEXT: movzbl %sil, %ebx +; AVX512-NEXT: vmovd %ebp, %xmm4 +; AVX512-NEXT: vpextrb $14, %xmm2, %ecx +; AVX512-NEXT: vpextrb $14, %xmm3, %eax +; AVX512-NEXT: subb %cl, %al +; AVX512-NEXT: movl $0, %ecx +; AVX512-NEXT: jb .LBB2_94 +; AVX512-NEXT: # %bb.93: +; AVX512-NEXT: movl %eax, %ecx +; AVX512-NEXT: .LBB2_94: +; AVX512-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX512-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 1-byte Folded Reload +; AVX512-NEXT: vmovd %edi, %xmm5 +; AVX512-NEXT: vpinsrb $1, %ebx, %xmm4, %xmm4 +; AVX512-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 1-byte Folded Reload +; AVX512-NEXT: movzbl %dl, %ebp +; AVX512-NEXT: vpextrb $15, %xmm2, %ecx +; AVX512-NEXT: vpextrb $15, %xmm3, %eax +; AVX512-NEXT: subb %cl, %al +; AVX512-NEXT: movl $0, %ecx +; AVX512-NEXT: jb .LBB2_96 +; AVX512-NEXT: # %bb.95: +; AVX512-NEXT: movl %eax, %ecx +; AVX512-NEXT: .LBB2_96: +; AVX512-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX512-NEXT: vpinsrb $1, %esi, %xmm5, %xmm2 +; AVX512-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 1-byte Folded Reload +; AVX512-NEXT: vpinsrb $2, %edi, %xmm4, %xmm3 +; AVX512-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ebx # 1-byte Folded Reload +; AVX512-NEXT: movzbl %r9b, %edx +; AVX512-NEXT: vmovd %ebp, %xmm4 +; AVX512-NEXT: vpextrb $1, %xmm1, %eax +; AVX512-NEXT: vpextrb $1, %xmm0, %edi +; AVX512-NEXT: subb %al, %dil +; AVX512-NEXT: movl $0, %eax +; AVX512-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 4-byte Reload +; AVX512-NEXT: jb .LBB2_98 +; AVX512-NEXT: # %bb.97: +; AVX512-NEXT: movl %edi, %eax +; 
AVX512-NEXT: .LBB2_98: +; AVX512-NEXT: vpinsrb $2, %esi, %xmm2, %xmm2 +; AVX512-NEXT: movzbl %cl, %esi +; AVX512-NEXT: vpinsrb $3, %ebx, %xmm3, %xmm3 +; AVX512-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ebp # 1-byte Folded Reload +; AVX512-NEXT: vpinsrb $1, %edx, %xmm4, %xmm4 +; AVX512-NEXT: movzbl %r10b, %edi +; AVX512-NEXT: movzbl %al, %edx +; AVX512-NEXT: vpextrb $0, %xmm1, %ecx +; AVX512-NEXT: vpextrb $0, %xmm0, %eax +; AVX512-NEXT: subb %cl, %al +; AVX512-NEXT: movl $0, %ecx +; AVX512-NEXT: jb .LBB2_100 +; AVX512-NEXT: # %bb.99: +; AVX512-NEXT: movl %eax, %ecx +; AVX512-NEXT: .LBB2_100: +; AVX512-NEXT: vpinsrb $3, %esi, %xmm2, %xmm2 +; AVX512-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 1-byte Folded Reload +; AVX512-NEXT: vpinsrb $4, %ebp, %xmm3, %xmm3 +; AVX512-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ebx # 1-byte Folded Reload +; AVX512-NEXT: vpinsrb $2, %edi, %xmm4, %xmm4 +; AVX512-NEXT: movzbl %r8b, %ebp +; AVX512-NEXT: movzbl %cl, %eax +; AVX512-NEXT: vmovd %eax, %xmm5 +; AVX512-NEXT: vpinsrb $1, %edx, %xmm5, %xmm5 +; AVX512-NEXT: vpextrb $2, %xmm1, %eax +; AVX512-NEXT: vpextrb $2, %xmm0, %ecx +; AVX512-NEXT: subb %al, %cl +; AVX512-NEXT: movl $0, %eax +; AVX512-NEXT: jb .LBB2_102 +; AVX512-NEXT: # %bb.101: +; AVX512-NEXT: movl %ecx, %eax +; AVX512-NEXT: .LBB2_102: +; AVX512-NEXT: vpinsrb $4, %esi, %xmm2, %xmm2 +; AVX512-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %edx # 1-byte Folded Reload +; AVX512-NEXT: vpinsrb $5, %ebx, %xmm3, %xmm3 +; AVX512-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 1-byte Folded Reload +; AVX512-NEXT: vpinsrb $3, %ebp, %xmm4, %xmm4 +; AVX512-NEXT: movzbl %r11b, %ebp +; AVX512-NEXT: movzbl %al, %eax +; AVX512-NEXT: vpinsrb $2, %eax, %xmm5, %xmm5 +; AVX512-NEXT: vpextrb $3, %xmm1, %eax +; AVX512-NEXT: vpextrb $3, %xmm0, %ecx +; AVX512-NEXT: subb %al, %cl +; AVX512-NEXT: movl $0, %eax +; AVX512-NEXT: jb .LBB2_104 +; AVX512-NEXT: # %bb.103: +; AVX512-NEXT: movl %ecx, %eax +; AVX512-NEXT: .LBB2_104: +; AVX512-NEXT: vpinsrb $5, %edx, %xmm2, %xmm2 +; AVX512-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload +; AVX512-NEXT: vpinsrb $6, %esi, %xmm3, %xmm3 +; AVX512-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %edx # 1-byte Folded Reload +; AVX512-NEXT: vpinsrb $4, %ebp, %xmm4, %xmm4 +; AVX512-NEXT: movzbl %r13b, %esi +; AVX512-NEXT: movzbl %al, %eax +; AVX512-NEXT: vpinsrb $3, %eax, %xmm5, %xmm5 +; AVX512-NEXT: vpextrb $4, %xmm1, %edi +; AVX512-NEXT: vpextrb $4, %xmm0, %eax +; AVX512-NEXT: subb %dil, %al +; AVX512-NEXT: movl $0, %edi +; AVX512-NEXT: jb .LBB2_106 +; AVX512-NEXT: # %bb.105: +; AVX512-NEXT: movl %eax, %edi +; AVX512-NEXT: .LBB2_106: +; AVX512-NEXT: vpinsrb $6, %ecx, %xmm2, %xmm2 +; AVX512-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX512-NEXT: vpinsrb $7, %edx, %xmm3, %xmm3 +; AVX512-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload +; AVX512-NEXT: vpinsrb $5, %esi, %xmm4, %xmm4 +; AVX512-NEXT: movzbl %r12b, %edx +; AVX512-NEXT: movzbl %dil, %esi +; AVX512-NEXT: vpinsrb $4, %esi, %xmm5, %xmm5 +; AVX512-NEXT: vpextrb $5, %xmm1, %esi +; AVX512-NEXT: vpextrb $5, %xmm0, %edi +; AVX512-NEXT: subb %sil, %dil +; AVX512-NEXT: movl $0, %esi +; AVX512-NEXT: jb .LBB2_108 +; AVX512-NEXT: # %bb.107: +; AVX512-NEXT: movl %edi, %esi +; AVX512-NEXT: .LBB2_108: +; AVX512-NEXT: vpinsrb $7, %eax, %xmm2, %xmm2 +; AVX512-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX512-NEXT: vpinsrb $8, %ecx, %xmm3, %xmm3 +; AVX512-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload +; 
AVX512-NEXT: vpinsrb $6, %edx, %xmm4, %xmm4 +; AVX512-NEXT: movzbl %r15b, %edx +; AVX512-NEXT: movzbl %sil, %esi +; AVX512-NEXT: vpinsrb $5, %esi, %xmm5, %xmm5 +; AVX512-NEXT: vpextrb $6, %xmm1, %esi +; AVX512-NEXT: vpextrb $6, %xmm0, %edi +; AVX512-NEXT: subb %sil, %dil +; AVX512-NEXT: movl $0, %esi +; AVX512-NEXT: jb .LBB2_110 +; AVX512-NEXT: # %bb.109: +; AVX512-NEXT: movl %edi, %esi +; AVX512-NEXT: .LBB2_110: +; AVX512-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2 +; AVX512-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX512-NEXT: vpinsrb $9, %ecx, %xmm3, %xmm3 +; AVX512-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload +; AVX512-NEXT: vpinsrb $7, %edx, %xmm4, %xmm4 +; AVX512-NEXT: movzbl %r14b, %edx +; AVX512-NEXT: movzbl %sil, %esi +; AVX512-NEXT: vpinsrb $6, %esi, %xmm5, %xmm5 +; AVX512-NEXT: vpextrb $7, %xmm1, %esi +; AVX512-NEXT: vpextrb $7, %xmm0, %edi +; AVX512-NEXT: subb %sil, %dil +; AVX512-NEXT: movl $0, %esi +; AVX512-NEXT: jb .LBB2_112 +; AVX512-NEXT: # %bb.111: +; AVX512-NEXT: movl %edi, %esi +; AVX512-NEXT: .LBB2_112: +; AVX512-NEXT: vpinsrb $9, %eax, %xmm2, %xmm2 +; AVX512-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX512-NEXT: vpinsrb $10, %ecx, %xmm3, %xmm3 +; AVX512-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload +; AVX512-NEXT: vpinsrb $8, %edx, %xmm4, %xmm4 +; AVX512-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %edx # 1-byte Folded Reload +; AVX512-NEXT: movzbl %sil, %esi +; AVX512-NEXT: vpinsrb $7, %esi, %xmm5, %xmm5 +; AVX512-NEXT: vpextrb $8, %xmm1, %esi +; AVX512-NEXT: vpextrb $8, %xmm0, %edi +; AVX512-NEXT: subb %sil, %dil +; AVX512-NEXT: movl $0, %esi +; AVX512-NEXT: jb .LBB2_114 +; AVX512-NEXT: # %bb.113: +; AVX512-NEXT: movl %edi, %esi +; AVX512-NEXT: .LBB2_114: +; AVX512-NEXT: vpinsrb $10, %eax, %xmm2, %xmm2 +; AVX512-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX512-NEXT: vpinsrb $11, %ecx, %xmm3, %xmm3 +; AVX512-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload +; AVX512-NEXT: vpinsrb $9, %edx, %xmm4, %xmm4 +; AVX512-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %edx # 1-byte Folded Reload +; AVX512-NEXT: movzbl %sil, %esi +; AVX512-NEXT: vpinsrb $8, %esi, %xmm5, %xmm5 +; AVX512-NEXT: vpextrb $9, %xmm1, %esi +; AVX512-NEXT: vpextrb $9, %xmm0, %edi +; AVX512-NEXT: subb %sil, %dil +; AVX512-NEXT: movl $0, %esi +; AVX512-NEXT: jb .LBB2_116 +; AVX512-NEXT: # %bb.115: +; AVX512-NEXT: movl %edi, %esi +; AVX512-NEXT: .LBB2_116: +; AVX512-NEXT: vpinsrb $11, %eax, %xmm2, %xmm2 +; AVX512-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX512-NEXT: vpinsrb $12, %ecx, %xmm3, %xmm3 +; AVX512-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload +; AVX512-NEXT: vpinsrb $10, %edx, %xmm4, %xmm4 +; AVX512-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %edx # 1-byte Folded Reload +; AVX512-NEXT: movzbl %sil, %esi +; AVX512-NEXT: vpinsrb $9, %esi, %xmm5, %xmm5 +; AVX512-NEXT: vpextrb $10, %xmm1, %esi +; AVX512-NEXT: vpextrb $10, %xmm0, %edi +; AVX512-NEXT: subb %sil, %dil +; AVX512-NEXT: movl $0, %esi +; AVX512-NEXT: jb .LBB2_118 +; AVX512-NEXT: # %bb.117: +; AVX512-NEXT: movl %edi, %esi +; AVX512-NEXT: .LBB2_118: +; AVX512-NEXT: vpinsrb $12, %eax, %xmm2, %xmm2 +; AVX512-NEXT: movzbl (%rsp), %eax # 1-byte Folded Reload +; AVX512-NEXT: vpinsrb $13, %ecx, %xmm3, %xmm3 +; AVX512-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload +; AVX512-NEXT: vpinsrb $11, %edx, %xmm4, %xmm4 +; AVX512-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), 
%edx # 1-byte Folded Reload +; AVX512-NEXT: movzbl %sil, %esi +; AVX512-NEXT: vpinsrb $10, %esi, %xmm5, %xmm5 +; AVX512-NEXT: vpextrb $11, %xmm1, %esi +; AVX512-NEXT: vpextrb $11, %xmm0, %edi +; AVX512-NEXT: subb %sil, %dil +; AVX512-NEXT: movl $0, %esi +; AVX512-NEXT: jb .LBB2_120 +; AVX512-NEXT: # %bb.119: +; AVX512-NEXT: movl %edi, %esi +; AVX512-NEXT: .LBB2_120: +; AVX512-NEXT: vpinsrb $13, %eax, %xmm2, %xmm2 +; AVX512-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX512-NEXT: vpinsrb $14, %ecx, %xmm3, %xmm3 +; AVX512-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload +; AVX512-NEXT: vpinsrb $12, %edx, %xmm4, %xmm6 +; AVX512-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %edx # 1-byte Folded Reload +; AVX512-NEXT: movzbl %sil, %esi +; AVX512-NEXT: vpinsrb $11, %esi, %xmm5, %xmm5 +; AVX512-NEXT: vpextrb $12, %xmm1, %esi +; AVX512-NEXT: vpextrb $12, %xmm0, %edi +; AVX512-NEXT: subb %sil, %dil +; AVX512-NEXT: movl $0, %esi +; AVX512-NEXT: jb .LBB2_122 +; AVX512-NEXT: # %bb.121: +; AVX512-NEXT: movl %edi, %esi +; AVX512-NEXT: .LBB2_122: +; AVX512-NEXT: vpinsrb $14, %eax, %xmm2, %xmm4 +; AVX512-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX512-NEXT: vpinsrb $15, %ecx, %xmm3, %xmm2 +; AVX512-NEXT: vpinsrb $13, %edx, %xmm6, %xmm6 +; AVX512-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload +; AVX512-NEXT: movzbl %sil, %edx +; AVX512-NEXT: vpinsrb $12, %edx, %xmm5, %xmm5 +; AVX512-NEXT: vpextrb $13, %xmm1, %edx +; AVX512-NEXT: vpextrb $13, %xmm0, %esi +; AVX512-NEXT: subb %dl, %sil +; AVX512-NEXT: movl $0, %edx +; AVX512-NEXT: jb .LBB2_124 +; AVX512-NEXT: # %bb.123: +; AVX512-NEXT: movl %esi, %edx +; AVX512-NEXT: .LBB2_124: +; AVX512-NEXT: vpinsrb $15, %eax, %xmm4, %xmm3 +; AVX512-NEXT: vpinsrb $14, %ecx, %xmm6, %xmm4 +; AVX512-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; AVX512-NEXT: movzbl %dl, %ecx +; AVX512-NEXT: vpinsrb $13, %ecx, %xmm5, %xmm5 +; AVX512-NEXT: vpextrb $14, %xmm1, %edx +; AVX512-NEXT: vpextrb $14, %xmm0, %ecx +; AVX512-NEXT: subb %dl, %cl +; AVX512-NEXT: movl $0, %edx +; AVX512-NEXT: leaq {{[0-9]+}}(%rsp), %rsp +; AVX512-NEXT: popq %rbx +; AVX512-NEXT: popq %r12 +; AVX512-NEXT: popq %r13 +; AVX512-NEXT: popq %r14 +; AVX512-NEXT: popq %r15 +; AVX512-NEXT: popq %rbp +; AVX512-NEXT: jb .LBB2_126 +; AVX512-NEXT: # %bb.125: +; AVX512-NEXT: movl %ecx, %edx +; AVX512-NEXT: .LBB2_126: +; AVX512-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 +; AVX512-NEXT: vpinsrb $15, %eax, %xmm4, %xmm3 +; AVX512-NEXT: movzbl %dl, %eax +; AVX512-NEXT: vpinsrb $14, %eax, %xmm5, %xmm4 +; AVX512-NEXT: vpextrb $15, %xmm1, %ecx +; AVX512-NEXT: vpextrb $15, %xmm0, %eax +; AVX512-NEXT: subb %cl, %al +; AVX512-NEXT: movl $0, %ecx +; AVX512-NEXT: jb .LBB2_128 +; AVX512-NEXT: # %bb.127: +; AVX512-NEXT: movl %eax, %ecx +; AVX512-NEXT: .LBB2_128: +; AVX512-NEXT: movzbl %cl, %eax +; AVX512-NEXT: vpinsrb $15, %eax, %xmm4, %xmm0 +; AVX512-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm0 +; AVX512-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 +; AVX512-NEXT: retq + %z = call <64 x i8> @llvm.usub.sat.v64i8(<64 x i8> %x, <64 x i8> %y) + ret <64 x i8> %z +} + +define <8 x i16> @v8i16(<8 x i16> %x, <8 x i16> %y) nounwind { +; SSE2-LABEL: v8i16: +; SSE2: # %bb.0: +; SSE2-NEXT: pextrw $7, %xmm1, %ecx +; SSE2-NEXT: pextrw $7, %xmm0, %edx +; SSE2-NEXT: xorl %eax, %eax +; SSE2-NEXT: subw %cx, %dx +; SSE2-NEXT: cmovbl %eax, %edx +; SSE2-NEXT: movd %edx, %xmm2 +; SSE2-NEXT: pextrw $6, %xmm1, %ecx +; SSE2-NEXT: pextrw $6, %xmm0, %edx +; 
SSE2-NEXT: subw %cx, %dx +; SSE2-NEXT: cmovbl %eax, %edx +; SSE2-NEXT: movd %edx, %xmm3 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; SSE2-NEXT: pextrw $5, %xmm1, %ecx +; SSE2-NEXT: pextrw $5, %xmm0, %edx +; SSE2-NEXT: subw %cx, %dx +; SSE2-NEXT: cmovbl %eax, %edx +; SSE2-NEXT: movd %edx, %xmm4 +; SSE2-NEXT: pextrw $4, %xmm1, %ecx +; SSE2-NEXT: pextrw $4, %xmm0, %edx +; SSE2-NEXT: subw %cx, %dx +; SSE2-NEXT: cmovbl %eax, %edx +; SSE2-NEXT: movd %edx, %xmm2 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] +; SSE2-NEXT: pextrw $3, %xmm1, %ecx +; SSE2-NEXT: pextrw $3, %xmm0, %edx +; SSE2-NEXT: subw %cx, %dx +; SSE2-NEXT: cmovbl %eax, %edx +; SSE2-NEXT: movd %edx, %xmm3 +; SSE2-NEXT: pextrw $2, %xmm1, %ecx +; SSE2-NEXT: pextrw $2, %xmm0, %edx +; SSE2-NEXT: subw %cx, %dx +; SSE2-NEXT: cmovbl %eax, %edx +; SSE2-NEXT: movd %edx, %xmm4 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] +; SSE2-NEXT: pextrw $1, %xmm1, %ecx +; SSE2-NEXT: pextrw $1, %xmm0, %edx +; SSE2-NEXT: subw %cx, %dx +; SSE2-NEXT: cmovbl %eax, %edx +; SSE2-NEXT: movd %edx, %xmm3 +; SSE2-NEXT: movd %xmm1, %ecx +; SSE2-NEXT: movd %xmm0, %edx +; SSE2-NEXT: subw %cx, %dx +; SSE2-NEXT: cmovbl %eax, %edx +; SSE2-NEXT: movd %edx, %xmm0 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; SSE2-NEXT: retq +; +; SSSE3-LABEL: v8i16: +; SSSE3: # %bb.0: +; SSSE3-NEXT: pextrw $7, %xmm1, %ecx +; SSSE3-NEXT: pextrw $7, %xmm0, %edx +; SSSE3-NEXT: xorl %eax, %eax +; SSSE3-NEXT: subw %cx, %dx +; SSSE3-NEXT: cmovbl %eax, %edx +; SSSE3-NEXT: movd %edx, %xmm2 +; SSSE3-NEXT: pextrw $6, %xmm1, %ecx +; SSSE3-NEXT: pextrw $6, %xmm0, %edx +; SSSE3-NEXT: subw %cx, %dx +; SSSE3-NEXT: cmovbl %eax, %edx +; SSSE3-NEXT: movd %edx, %xmm3 +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; SSSE3-NEXT: pextrw $5, %xmm1, %ecx +; SSSE3-NEXT: pextrw $5, %xmm0, %edx +; SSSE3-NEXT: subw %cx, %dx +; SSSE3-NEXT: cmovbl %eax, %edx +; SSSE3-NEXT: movd %edx, %xmm4 +; SSSE3-NEXT: pextrw $4, %xmm1, %ecx +; SSSE3-NEXT: pextrw $4, %xmm0, %edx +; SSSE3-NEXT: subw %cx, %dx +; SSSE3-NEXT: cmovbl %eax, %edx +; SSSE3-NEXT: movd %edx, %xmm2 +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3] +; SSSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] +; SSSE3-NEXT: pextrw $3, %xmm1, %ecx +; SSSE3-NEXT: pextrw $3, %xmm0, %edx +; SSSE3-NEXT: subw %cx, %dx +; SSSE3-NEXT: cmovbl %eax, %edx +; SSSE3-NEXT: movd %edx, %xmm3 +; SSSE3-NEXT: pextrw $2, %xmm1, %ecx +; SSSE3-NEXT: pextrw $2, %xmm0, %edx +; SSSE3-NEXT: subw %cx, %dx +; SSSE3-NEXT: cmovbl %eax, %edx +; SSSE3-NEXT: movd %edx, %xmm4 +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] +; SSSE3-NEXT: pextrw $1, %xmm1, %ecx +; SSSE3-NEXT: pextrw $1, %xmm0, %edx +; SSSE3-NEXT: subw %cx, %dx +; SSSE3-NEXT: cmovbl %eax, %edx +; SSSE3-NEXT: movd %edx, %xmm3 +; SSSE3-NEXT: movd %xmm1, %ecx +; SSSE3-NEXT: movd %xmm0, %edx +; SSSE3-NEXT: subw %cx, %dx +; SSSE3-NEXT: cmovbl %eax, %edx +; SSSE3-NEXT: movd %edx, %xmm0 +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = 
xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] +; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] +; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; SSSE3-NEXT: retq +; +; SSE41-LABEL: v8i16: +; SSE41: # %bb.0: +; SSE41-NEXT: pextrw $1, %xmm1, %ecx +; SSE41-NEXT: pextrw $1, %xmm0, %edx +; SSE41-NEXT: xorl %eax, %eax +; SSE41-NEXT: subw %cx, %dx +; SSE41-NEXT: cmovbl %eax, %edx +; SSE41-NEXT: movd %xmm1, %ecx +; SSE41-NEXT: movd %xmm0, %esi +; SSE41-NEXT: subw %cx, %si +; SSE41-NEXT: cmovbl %eax, %esi +; SSE41-NEXT: movd %esi, %xmm2 +; SSE41-NEXT: pinsrw $1, %edx, %xmm2 +; SSE41-NEXT: pextrw $2, %xmm1, %ecx +; SSE41-NEXT: pextrw $2, %xmm0, %edx +; SSE41-NEXT: subw %cx, %dx +; SSE41-NEXT: cmovbl %eax, %edx +; SSE41-NEXT: pinsrw $2, %edx, %xmm2 +; SSE41-NEXT: pextrw $3, %xmm1, %ecx +; SSE41-NEXT: pextrw $3, %xmm0, %edx +; SSE41-NEXT: subw %cx, %dx +; SSE41-NEXT: cmovbl %eax, %edx +; SSE41-NEXT: pinsrw $3, %edx, %xmm2 +; SSE41-NEXT: pextrw $4, %xmm1, %ecx +; SSE41-NEXT: pextrw $4, %xmm0, %edx +; SSE41-NEXT: subw %cx, %dx +; SSE41-NEXT: cmovbl %eax, %edx +; SSE41-NEXT: pinsrw $4, %edx, %xmm2 +; SSE41-NEXT: pextrw $5, %xmm1, %ecx +; SSE41-NEXT: pextrw $5, %xmm0, %edx +; SSE41-NEXT: subw %cx, %dx +; SSE41-NEXT: cmovbl %eax, %edx +; SSE41-NEXT: pinsrw $5, %edx, %xmm2 +; SSE41-NEXT: pextrw $6, %xmm1, %ecx +; SSE41-NEXT: pextrw $6, %xmm0, %edx +; SSE41-NEXT: subw %cx, %dx +; SSE41-NEXT: cmovbl %eax, %edx +; SSE41-NEXT: pinsrw $6, %edx, %xmm2 +; SSE41-NEXT: pextrw $7, %xmm1, %ecx +; SSE41-NEXT: pextrw $7, %xmm0, %edx +; SSE41-NEXT: subw %cx, %dx +; SSE41-NEXT: cmovbl %eax, %edx +; SSE41-NEXT: pinsrw $7, %edx, %xmm2 +; SSE41-NEXT: movdqa %xmm2, %xmm0 +; SSE41-NEXT: retq +; +; AVX-LABEL: v8i16: +; AVX: # %bb.0: +; AVX-NEXT: vpextrw $1, %xmm1, %ecx +; AVX-NEXT: vpextrw $1, %xmm0, %edx +; AVX-NEXT: xorl %eax, %eax +; AVX-NEXT: subw %cx, %dx +; AVX-NEXT: cmovbl %eax, %edx +; AVX-NEXT: vmovd %xmm1, %ecx +; AVX-NEXT: vmovd %xmm0, %esi +; AVX-NEXT: subw %cx, %si +; AVX-NEXT: cmovbl %eax, %esi +; AVX-NEXT: vmovd %esi, %xmm2 +; AVX-NEXT: vpinsrw $1, %edx, %xmm2, %xmm2 +; AVX-NEXT: vpextrw $2, %xmm1, %ecx +; AVX-NEXT: vpextrw $2, %xmm0, %edx +; AVX-NEXT: subw %cx, %dx +; AVX-NEXT: cmovbl %eax, %edx +; AVX-NEXT: vpinsrw $2, %edx, %xmm2, %xmm2 +; AVX-NEXT: vpextrw $3, %xmm1, %ecx +; AVX-NEXT: vpextrw $3, %xmm0, %edx +; AVX-NEXT: subw %cx, %dx +; AVX-NEXT: cmovbl %eax, %edx +; AVX-NEXT: vpinsrw $3, %edx, %xmm2, %xmm2 +; AVX-NEXT: vpextrw $4, %xmm1, %ecx +; AVX-NEXT: vpextrw $4, %xmm0, %edx +; AVX-NEXT: subw %cx, %dx +; AVX-NEXT: cmovbl %eax, %edx +; AVX-NEXT: vpinsrw $4, %edx, %xmm2, %xmm2 +; AVX-NEXT: vpextrw $5, %xmm1, %ecx +; AVX-NEXT: vpextrw $5, %xmm0, %edx +; AVX-NEXT: subw %cx, %dx +; AVX-NEXT: cmovbl %eax, %edx +; AVX-NEXT: vpinsrw $5, %edx, %xmm2, %xmm2 +; AVX-NEXT: vpextrw $6, %xmm1, %ecx +; AVX-NEXT: vpextrw $6, %xmm0, %edx +; AVX-NEXT: subw %cx, %dx +; AVX-NEXT: cmovbl %eax, %edx +; AVX-NEXT: vpinsrw $6, %edx, %xmm2, %xmm2 +; AVX-NEXT: vpextrw $7, %xmm1, %ecx +; AVX-NEXT: vpextrw $7, %xmm0, %edx +; AVX-NEXT: subw %cx, %dx +; AVX-NEXT: cmovbl %eax, %edx +; AVX-NEXT: vpinsrw $7, %edx, %xmm2, %xmm0 +; AVX-NEXT: retq + %z = call <8 x i16> @llvm.usub.sat.v8i16(<8 x i16> %x, <8 x i16> %y) + ret <8 x i16> %z +} + +define <16 x i16> @v16i16(<16 x i16> %x, <16 x i16> %y) nounwind { +; SSE2-LABEL: v16i16: +; SSE2: # %bb.0: +; SSE2-NEXT: pextrw $7, %xmm2, %ecx +; SSE2-NEXT: pextrw $7, %xmm0, %edx +; SSE2-NEXT: xorl %eax, %eax +; SSE2-NEXT: subw %cx, %dx +; 
SSE2-NEXT: cmovbl %eax, %edx +; SSE2-NEXT: movd %edx, %xmm4 +; SSE2-NEXT: pextrw $6, %xmm2, %ecx +; SSE2-NEXT: pextrw $6, %xmm0, %edx +; SSE2-NEXT: subw %cx, %dx +; SSE2-NEXT: cmovbl %eax, %edx +; SSE2-NEXT: movd %edx, %xmm5 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] +; SSE2-NEXT: pextrw $5, %xmm2, %ecx +; SSE2-NEXT: pextrw $5, %xmm0, %edx +; SSE2-NEXT: subw %cx, %dx +; SSE2-NEXT: cmovbl %eax, %edx +; SSE2-NEXT: movd %edx, %xmm6 +; SSE2-NEXT: pextrw $4, %xmm2, %ecx +; SSE2-NEXT: pextrw $4, %xmm0, %edx +; SSE2-NEXT: subw %cx, %dx +; SSE2-NEXT: cmovbl %eax, %edx +; SSE2-NEXT: movd %edx, %xmm4 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1],xmm4[2],xmm6[2],xmm4[3],xmm6[3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1] +; SSE2-NEXT: pextrw $3, %xmm2, %ecx +; SSE2-NEXT: pextrw $3, %xmm0, %edx +; SSE2-NEXT: subw %cx, %dx +; SSE2-NEXT: cmovbl %eax, %edx +; SSE2-NEXT: movd %edx, %xmm5 +; SSE2-NEXT: pextrw $2, %xmm2, %ecx +; SSE2-NEXT: pextrw $2, %xmm0, %edx +; SSE2-NEXT: subw %cx, %dx +; SSE2-NEXT: cmovbl %eax, %edx +; SSE2-NEXT: movd %edx, %xmm6 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] +; SSE2-NEXT: pextrw $1, %xmm2, %ecx +; SSE2-NEXT: pextrw $1, %xmm0, %edx +; SSE2-NEXT: subw %cx, %dx +; SSE2-NEXT: cmovbl %eax, %edx +; SSE2-NEXT: movd %edx, %xmm5 +; SSE2-NEXT: movd %xmm2, %ecx +; SSE2-NEXT: movd %xmm0, %edx +; SSE2-NEXT: subw %cx, %dx +; SSE2-NEXT: cmovbl %eax, %edx +; SSE2-NEXT: movd %edx, %xmm0 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1] +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm4[0] +; SSE2-NEXT: pextrw $7, %xmm3, %ecx +; SSE2-NEXT: pextrw $7, %xmm1, %edx +; SSE2-NEXT: subw %cx, %dx +; SSE2-NEXT: cmovbl %eax, %edx +; SSE2-NEXT: movd %edx, %xmm2 +; SSE2-NEXT: pextrw $6, %xmm3, %ecx +; SSE2-NEXT: pextrw $6, %xmm1, %edx +; SSE2-NEXT: subw %cx, %dx +; SSE2-NEXT: cmovbl %eax, %edx +; SSE2-NEXT: movd %edx, %xmm4 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3] +; SSE2-NEXT: pextrw $5, %xmm3, %ecx +; SSE2-NEXT: pextrw $5, %xmm1, %edx +; SSE2-NEXT: subw %cx, %dx +; SSE2-NEXT: cmovbl %eax, %edx +; SSE2-NEXT: movd %edx, %xmm5 +; SSE2-NEXT: pextrw $4, %xmm3, %ecx +; SSE2-NEXT: pextrw $4, %xmm1, %edx +; SSE2-NEXT: subw %cx, %dx +; SSE2-NEXT: cmovbl %eax, %edx +; SSE2-NEXT: movd %edx, %xmm2 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1] +; SSE2-NEXT: pextrw $3, %xmm3, %ecx +; SSE2-NEXT: pextrw $3, %xmm1, %edx +; SSE2-NEXT: subw %cx, %dx +; SSE2-NEXT: cmovbl %eax, %edx +; SSE2-NEXT: movd %edx, %xmm4 +; SSE2-NEXT: pextrw $2, %xmm3, %ecx +; SSE2-NEXT: pextrw $2, %xmm1, %edx +; SSE2-NEXT: subw %cx, %dx +; SSE2-NEXT: cmovbl %eax, %edx +; SSE2-NEXT: movd %edx, %xmm5 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] +; SSE2-NEXT: pextrw $1, %xmm3, %ecx +; SSE2-NEXT: pextrw $1, %xmm1, %edx +; SSE2-NEXT: subw %cx, %dx +; SSE2-NEXT: cmovbl %eax, %edx +; SSE2-NEXT: movd %edx, %xmm4 +; SSE2-NEXT: movd %xmm3, %ecx +; SSE2-NEXT: movd %xmm1, %edx +; SSE2-NEXT: subw %cx, %dx +; SSE2-NEXT: cmovbl %eax, %edx +; SSE2-NEXT: movd %edx, %xmm1 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = 
xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1] +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; SSE2-NEXT: retq +; +; SSSE3-LABEL: v16i16: +; SSSE3: # %bb.0: +; SSSE3-NEXT: pextrw $7, %xmm2, %ecx +; SSSE3-NEXT: pextrw $7, %xmm0, %edx +; SSSE3-NEXT: xorl %eax, %eax +; SSSE3-NEXT: subw %cx, %dx +; SSSE3-NEXT: cmovbl %eax, %edx +; SSSE3-NEXT: movd %edx, %xmm4 +; SSSE3-NEXT: pextrw $6, %xmm2, %ecx +; SSSE3-NEXT: pextrw $6, %xmm0, %edx +; SSSE3-NEXT: subw %cx, %dx +; SSSE3-NEXT: cmovbl %eax, %edx +; SSSE3-NEXT: movd %edx, %xmm5 +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] +; SSSE3-NEXT: pextrw $5, %xmm2, %ecx +; SSSE3-NEXT: pextrw $5, %xmm0, %edx +; SSSE3-NEXT: subw %cx, %dx +; SSSE3-NEXT: cmovbl %eax, %edx +; SSSE3-NEXT: movd %edx, %xmm6 +; SSSE3-NEXT: pextrw $4, %xmm2, %ecx +; SSSE3-NEXT: pextrw $4, %xmm0, %edx +; SSSE3-NEXT: subw %cx, %dx +; SSSE3-NEXT: cmovbl %eax, %edx +; SSSE3-NEXT: movd %edx, %xmm4 +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1],xmm4[2],xmm6[2],xmm4[3],xmm6[3] +; SSSE3-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1] +; SSSE3-NEXT: pextrw $3, %xmm2, %ecx +; SSSE3-NEXT: pextrw $3, %xmm0, %edx +; SSSE3-NEXT: subw %cx, %dx +; SSSE3-NEXT: cmovbl %eax, %edx +; SSSE3-NEXT: movd %edx, %xmm5 +; SSSE3-NEXT: pextrw $2, %xmm2, %ecx +; SSSE3-NEXT: pextrw $2, %xmm0, %edx +; SSSE3-NEXT: subw %cx, %dx +; SSSE3-NEXT: cmovbl %eax, %edx +; SSSE3-NEXT: movd %edx, %xmm6 +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] +; SSSE3-NEXT: pextrw $1, %xmm2, %ecx +; SSSE3-NEXT: pextrw $1, %xmm0, %edx +; SSSE3-NEXT: subw %cx, %dx +; SSSE3-NEXT: cmovbl %eax, %edx +; SSSE3-NEXT: movd %edx, %xmm5 +; SSSE3-NEXT: movd %xmm2, %ecx +; SSSE3-NEXT: movd %xmm0, %edx +; SSSE3-NEXT: subw %cx, %dx +; SSSE3-NEXT: cmovbl %eax, %edx +; SSSE3-NEXT: movd %edx, %xmm0 +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3] +; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1] +; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm4[0] +; SSSE3-NEXT: pextrw $7, %xmm3, %ecx +; SSSE3-NEXT: pextrw $7, %xmm1, %edx +; SSSE3-NEXT: subw %cx, %dx +; SSSE3-NEXT: cmovbl %eax, %edx +; SSSE3-NEXT: movd %edx, %xmm2 +; SSSE3-NEXT: pextrw $6, %xmm3, %ecx +; SSSE3-NEXT: pextrw $6, %xmm1, %edx +; SSSE3-NEXT: subw %cx, %dx +; SSSE3-NEXT: cmovbl %eax, %edx +; SSSE3-NEXT: movd %edx, %xmm4 +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3] +; SSSE3-NEXT: pextrw $5, %xmm3, %ecx +; SSSE3-NEXT: pextrw $5, %xmm1, %edx +; SSSE3-NEXT: subw %cx, %dx +; SSSE3-NEXT: cmovbl %eax, %edx +; SSSE3-NEXT: movd %edx, %xmm5 +; SSSE3-NEXT: pextrw $4, %xmm3, %ecx +; SSSE3-NEXT: pextrw $4, %xmm1, %edx +; SSSE3-NEXT: subw %cx, %dx +; SSSE3-NEXT: cmovbl %eax, %edx +; SSSE3-NEXT: movd %edx, %xmm2 +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3] +; SSSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1] +; SSSE3-NEXT: pextrw $3, %xmm3, %ecx +; SSSE3-NEXT: pextrw $3, %xmm1, %edx +; SSSE3-NEXT: subw %cx, %dx +; SSSE3-NEXT: cmovbl %eax, %edx +; SSSE3-NEXT: movd %edx, %xmm4 +; SSSE3-NEXT: pextrw $2, %xmm3, %ecx +; SSSE3-NEXT: pextrw $2, %xmm1, %edx +; SSSE3-NEXT: subw %cx, %dx +; SSSE3-NEXT: cmovbl %eax, %edx +; SSSE3-NEXT: movd 
%edx, %xmm5 +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] +; SSSE3-NEXT: pextrw $1, %xmm3, %ecx +; SSSE3-NEXT: pextrw $1, %xmm1, %edx +; SSSE3-NEXT: subw %cx, %dx +; SSSE3-NEXT: cmovbl %eax, %edx +; SSSE3-NEXT: movd %edx, %xmm4 +; SSSE3-NEXT: movd %xmm3, %ecx +; SSSE3-NEXT: movd %xmm1, %edx +; SSSE3-NEXT: subw %cx, %dx +; SSSE3-NEXT: cmovbl %eax, %edx +; SSSE3-NEXT: movd %edx, %xmm1 +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3] +; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1] +; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; SSSE3-NEXT: retq +; +; SSE41-LABEL: v16i16: +; SSE41: # %bb.0: +; SSE41-NEXT: movdqa %xmm0, %xmm4 +; SSE41-NEXT: pextrw $1, %xmm2, %ecx +; SSE41-NEXT: pextrw $1, %xmm0, %edx +; SSE41-NEXT: xorl %eax, %eax +; SSE41-NEXT: subw %cx, %dx +; SSE41-NEXT: cmovbl %eax, %edx +; SSE41-NEXT: movd %xmm2, %ecx +; SSE41-NEXT: movd %xmm0, %esi +; SSE41-NEXT: subw %cx, %si +; SSE41-NEXT: cmovbl %eax, %esi +; SSE41-NEXT: movd %esi, %xmm0 +; SSE41-NEXT: pinsrw $1, %edx, %xmm0 +; SSE41-NEXT: pextrw $2, %xmm2, %ecx +; SSE41-NEXT: pextrw $2, %xmm4, %edx +; SSE41-NEXT: subw %cx, %dx +; SSE41-NEXT: cmovbl %eax, %edx +; SSE41-NEXT: pinsrw $2, %edx, %xmm0 +; SSE41-NEXT: pextrw $3, %xmm2, %ecx +; SSE41-NEXT: pextrw $3, %xmm4, %edx +; SSE41-NEXT: subw %cx, %dx +; SSE41-NEXT: cmovbl %eax, %edx +; SSE41-NEXT: pinsrw $3, %edx, %xmm0 +; SSE41-NEXT: pextrw $4, %xmm2, %ecx +; SSE41-NEXT: pextrw $4, %xmm4, %edx +; SSE41-NEXT: subw %cx, %dx +; SSE41-NEXT: cmovbl %eax, %edx +; SSE41-NEXT: pinsrw $4, %edx, %xmm0 +; SSE41-NEXT: pextrw $5, %xmm2, %ecx +; SSE41-NEXT: pextrw $5, %xmm4, %edx +; SSE41-NEXT: subw %cx, %dx +; SSE41-NEXT: cmovbl %eax, %edx +; SSE41-NEXT: pinsrw $5, %edx, %xmm0 +; SSE41-NEXT: pextrw $6, %xmm2, %ecx +; SSE41-NEXT: pextrw $6, %xmm4, %edx +; SSE41-NEXT: subw %cx, %dx +; SSE41-NEXT: cmovbl %eax, %edx +; SSE41-NEXT: pinsrw $6, %edx, %xmm0 +; SSE41-NEXT: pextrw $7, %xmm2, %ecx +; SSE41-NEXT: pextrw $7, %xmm4, %edx +; SSE41-NEXT: subw %cx, %dx +; SSE41-NEXT: cmovbl %eax, %edx +; SSE41-NEXT: pinsrw $7, %edx, %xmm0 +; SSE41-NEXT: pextrw $1, %xmm3, %ecx +; SSE41-NEXT: pextrw $1, %xmm1, %edx +; SSE41-NEXT: subw %cx, %dx +; SSE41-NEXT: cmovbl %eax, %edx +; SSE41-NEXT: movd %xmm3, %ecx +; SSE41-NEXT: movd %xmm1, %esi +; SSE41-NEXT: subw %cx, %si +; SSE41-NEXT: cmovbl %eax, %esi +; SSE41-NEXT: movd %esi, %xmm2 +; SSE41-NEXT: pinsrw $1, %edx, %xmm2 +; SSE41-NEXT: pextrw $2, %xmm3, %ecx +; SSE41-NEXT: pextrw $2, %xmm1, %edx +; SSE41-NEXT: subw %cx, %dx +; SSE41-NEXT: cmovbl %eax, %edx +; SSE41-NEXT: pinsrw $2, %edx, %xmm2 +; SSE41-NEXT: pextrw $3, %xmm3, %ecx +; SSE41-NEXT: pextrw $3, %xmm1, %edx +; SSE41-NEXT: subw %cx, %dx +; SSE41-NEXT: cmovbl %eax, %edx +; SSE41-NEXT: pinsrw $3, %edx, %xmm2 +; SSE41-NEXT: pextrw $4, %xmm3, %ecx +; SSE41-NEXT: pextrw $4, %xmm1, %edx +; SSE41-NEXT: subw %cx, %dx +; SSE41-NEXT: cmovbl %eax, %edx +; SSE41-NEXT: pinsrw $4, %edx, %xmm2 +; SSE41-NEXT: pextrw $5, %xmm3, %ecx +; SSE41-NEXT: pextrw $5, %xmm1, %edx +; SSE41-NEXT: subw %cx, %dx +; SSE41-NEXT: cmovbl %eax, %edx +; SSE41-NEXT: pinsrw $5, %edx, %xmm2 +; SSE41-NEXT: pextrw $6, %xmm3, %ecx +; SSE41-NEXT: pextrw $6, %xmm1, %edx +; SSE41-NEXT: subw %cx, %dx +; SSE41-NEXT: cmovbl %eax, %edx +; SSE41-NEXT: pinsrw $6, %edx, %xmm2 +; SSE41-NEXT: pextrw $7, %xmm3, %ecx +; SSE41-NEXT: pextrw $7, %xmm1, %edx +; SSE41-NEXT: subw %cx, %dx +; SSE41-NEXT: cmovbl %eax, %edx 
+; SSE41-NEXT: pinsrw $7, %edx, %xmm2 +; SSE41-NEXT: movdqa %xmm2, %xmm1 +; SSE41-NEXT: retq +; +; AVX1-LABEL: v16i16: +; AVX1: # %bb.0: +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX1-NEXT: vpextrw $1, %xmm2, %ecx +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 +; AVX1-NEXT: vpextrw $1, %xmm3, %edx +; AVX1-NEXT: xorl %eax, %eax +; AVX1-NEXT: subw %cx, %dx +; AVX1-NEXT: cmovbl %eax, %edx +; AVX1-NEXT: vmovd %xmm2, %ecx +; AVX1-NEXT: vmovd %xmm3, %esi +; AVX1-NEXT: subw %cx, %si +; AVX1-NEXT: cmovbl %eax, %esi +; AVX1-NEXT: vmovd %esi, %xmm4 +; AVX1-NEXT: vpinsrw $1, %edx, %xmm4, %xmm4 +; AVX1-NEXT: vpextrw $2, %xmm2, %ecx +; AVX1-NEXT: vpextrw $2, %xmm3, %edx +; AVX1-NEXT: subw %cx, %dx +; AVX1-NEXT: cmovbl %eax, %edx +; AVX1-NEXT: vpinsrw $2, %edx, %xmm4, %xmm4 +; AVX1-NEXT: vpextrw $3, %xmm2, %ecx +; AVX1-NEXT: vpextrw $3, %xmm3, %edx +; AVX1-NEXT: subw %cx, %dx +; AVX1-NEXT: cmovbl %eax, %edx +; AVX1-NEXT: vpinsrw $3, %edx, %xmm4, %xmm4 +; AVX1-NEXT: vpextrw $4, %xmm2, %ecx +; AVX1-NEXT: vpextrw $4, %xmm3, %edx +; AVX1-NEXT: subw %cx, %dx +; AVX1-NEXT: cmovbl %eax, %edx +; AVX1-NEXT: vpinsrw $4, %edx, %xmm4, %xmm4 +; AVX1-NEXT: vpextrw $5, %xmm2, %ecx +; AVX1-NEXT: vpextrw $5, %xmm3, %edx +; AVX1-NEXT: subw %cx, %dx +; AVX1-NEXT: cmovbl %eax, %edx +; AVX1-NEXT: vpinsrw $5, %edx, %xmm4, %xmm4 +; AVX1-NEXT: vpextrw $6, %xmm2, %ecx +; AVX1-NEXT: vpextrw $6, %xmm3, %edx +; AVX1-NEXT: subw %cx, %dx +; AVX1-NEXT: cmovbl %eax, %edx +; AVX1-NEXT: vpinsrw $6, %edx, %xmm4, %xmm4 +; AVX1-NEXT: vpextrw $7, %xmm2, %ecx +; AVX1-NEXT: vpextrw $7, %xmm3, %edx +; AVX1-NEXT: subw %cx, %dx +; AVX1-NEXT: cmovbl %eax, %edx +; AVX1-NEXT: vpinsrw $7, %edx, %xmm4, %xmm2 +; AVX1-NEXT: vpextrw $1, %xmm1, %ecx +; AVX1-NEXT: vpextrw $1, %xmm0, %edx +; AVX1-NEXT: subw %cx, %dx +; AVX1-NEXT: cmovbl %eax, %edx +; AVX1-NEXT: vmovd %xmm1, %ecx +; AVX1-NEXT: vmovd %xmm0, %esi +; AVX1-NEXT: subw %cx, %si +; AVX1-NEXT: cmovbl %eax, %esi +; AVX1-NEXT: vmovd %esi, %xmm3 +; AVX1-NEXT: vpinsrw $1, %edx, %xmm3, %xmm3 +; AVX1-NEXT: vpextrw $2, %xmm1, %ecx +; AVX1-NEXT: vpextrw $2, %xmm0, %edx +; AVX1-NEXT: subw %cx, %dx +; AVX1-NEXT: cmovbl %eax, %edx +; AVX1-NEXT: vpinsrw $2, %edx, %xmm3, %xmm3 +; AVX1-NEXT: vpextrw $3, %xmm1, %ecx +; AVX1-NEXT: vpextrw $3, %xmm0, %edx +; AVX1-NEXT: subw %cx, %dx +; AVX1-NEXT: cmovbl %eax, %edx +; AVX1-NEXT: vpinsrw $3, %edx, %xmm3, %xmm3 +; AVX1-NEXT: vpextrw $4, %xmm1, %ecx +; AVX1-NEXT: vpextrw $4, %xmm0, %edx +; AVX1-NEXT: subw %cx, %dx +; AVX1-NEXT: cmovbl %eax, %edx +; AVX1-NEXT: vpinsrw $4, %edx, %xmm3, %xmm3 +; AVX1-NEXT: vpextrw $5, %xmm1, %ecx +; AVX1-NEXT: vpextrw $5, %xmm0, %edx +; AVX1-NEXT: subw %cx, %dx +; AVX1-NEXT: cmovbl %eax, %edx +; AVX1-NEXT: vpinsrw $5, %edx, %xmm3, %xmm3 +; AVX1-NEXT: vpextrw $6, %xmm1, %ecx +; AVX1-NEXT: vpextrw $6, %xmm0, %edx +; AVX1-NEXT: subw %cx, %dx +; AVX1-NEXT: cmovbl %eax, %edx +; AVX1-NEXT: vpinsrw $6, %edx, %xmm3, %xmm3 +; AVX1-NEXT: vpextrw $7, %xmm1, %ecx +; AVX1-NEXT: vpextrw $7, %xmm0, %edx +; AVX1-NEXT: subw %cx, %dx +; AVX1-NEXT: cmovbl %eax, %edx +; AVX1-NEXT: vpinsrw $7, %edx, %xmm3, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: v16i16: +; AVX2: # %bb.0: +; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX2-NEXT: vpextrw $1, %xmm2, %ecx +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm3 +; AVX2-NEXT: vpextrw $1, %xmm3, %edx +; AVX2-NEXT: xorl %eax, %eax +; AVX2-NEXT: subw %cx, %dx +; AVX2-NEXT: cmovbl %eax, %edx +; AVX2-NEXT: vmovd %xmm2, %ecx +; AVX2-NEXT: vmovd %xmm3, %esi +; AVX2-NEXT: subw 
%cx, %si +; AVX2-NEXT: cmovbl %eax, %esi +; AVX2-NEXT: vmovd %esi, %xmm4 +; AVX2-NEXT: vpinsrw $1, %edx, %xmm4, %xmm4 +; AVX2-NEXT: vpextrw $2, %xmm2, %ecx +; AVX2-NEXT: vpextrw $2, %xmm3, %edx +; AVX2-NEXT: subw %cx, %dx +; AVX2-NEXT: cmovbl %eax, %edx +; AVX2-NEXT: vpinsrw $2, %edx, %xmm4, %xmm4 +; AVX2-NEXT: vpextrw $3, %xmm2, %ecx +; AVX2-NEXT: vpextrw $3, %xmm3, %edx +; AVX2-NEXT: subw %cx, %dx +; AVX2-NEXT: cmovbl %eax, %edx +; AVX2-NEXT: vpinsrw $3, %edx, %xmm4, %xmm4 +; AVX2-NEXT: vpextrw $4, %xmm2, %ecx +; AVX2-NEXT: vpextrw $4, %xmm3, %edx +; AVX2-NEXT: subw %cx, %dx +; AVX2-NEXT: cmovbl %eax, %edx +; AVX2-NEXT: vpinsrw $4, %edx, %xmm4, %xmm4 +; AVX2-NEXT: vpextrw $5, %xmm2, %ecx +; AVX2-NEXT: vpextrw $5, %xmm3, %edx +; AVX2-NEXT: subw %cx, %dx +; AVX2-NEXT: cmovbl %eax, %edx +; AVX2-NEXT: vpinsrw $5, %edx, %xmm4, %xmm4 +; AVX2-NEXT: vpextrw $6, %xmm2, %ecx +; AVX2-NEXT: vpextrw $6, %xmm3, %edx +; AVX2-NEXT: subw %cx, %dx +; AVX2-NEXT: cmovbl %eax, %edx +; AVX2-NEXT: vpinsrw $6, %edx, %xmm4, %xmm4 +; AVX2-NEXT: vpextrw $7, %xmm2, %ecx +; AVX2-NEXT: vpextrw $7, %xmm3, %edx +; AVX2-NEXT: subw %cx, %dx +; AVX2-NEXT: cmovbl %eax, %edx +; AVX2-NEXT: vpinsrw $7, %edx, %xmm4, %xmm2 +; AVX2-NEXT: vpextrw $1, %xmm1, %ecx +; AVX2-NEXT: vpextrw $1, %xmm0, %edx +; AVX2-NEXT: subw %cx, %dx +; AVX2-NEXT: cmovbl %eax, %edx +; AVX2-NEXT: vmovd %xmm1, %ecx +; AVX2-NEXT: vmovd %xmm0, %esi +; AVX2-NEXT: subw %cx, %si +; AVX2-NEXT: cmovbl %eax, %esi +; AVX2-NEXT: vmovd %esi, %xmm3 +; AVX2-NEXT: vpinsrw $1, %edx, %xmm3, %xmm3 +; AVX2-NEXT: vpextrw $2, %xmm1, %ecx +; AVX2-NEXT: vpextrw $2, %xmm0, %edx +; AVX2-NEXT: subw %cx, %dx +; AVX2-NEXT: cmovbl %eax, %edx +; AVX2-NEXT: vpinsrw $2, %edx, %xmm3, %xmm3 +; AVX2-NEXT: vpextrw $3, %xmm1, %ecx +; AVX2-NEXT: vpextrw $3, %xmm0, %edx +; AVX2-NEXT: subw %cx, %dx +; AVX2-NEXT: cmovbl %eax, %edx +; AVX2-NEXT: vpinsrw $3, %edx, %xmm3, %xmm3 +; AVX2-NEXT: vpextrw $4, %xmm1, %ecx +; AVX2-NEXT: vpextrw $4, %xmm0, %edx +; AVX2-NEXT: subw %cx, %dx +; AVX2-NEXT: cmovbl %eax, %edx +; AVX2-NEXT: vpinsrw $4, %edx, %xmm3, %xmm3 +; AVX2-NEXT: vpextrw $5, %xmm1, %ecx +; AVX2-NEXT: vpextrw $5, %xmm0, %edx +; AVX2-NEXT: subw %cx, %dx +; AVX2-NEXT: cmovbl %eax, %edx +; AVX2-NEXT: vpinsrw $5, %edx, %xmm3, %xmm3 +; AVX2-NEXT: vpextrw $6, %xmm1, %ecx +; AVX2-NEXT: vpextrw $6, %xmm0, %edx +; AVX2-NEXT: subw %cx, %dx +; AVX2-NEXT: cmovbl %eax, %edx +; AVX2-NEXT: vpinsrw $6, %edx, %xmm3, %xmm3 +; AVX2-NEXT: vpextrw $7, %xmm1, %ecx +; AVX2-NEXT: vpextrw $7, %xmm0, %edx +; AVX2-NEXT: subw %cx, %dx +; AVX2-NEXT: cmovbl %eax, %edx +; AVX2-NEXT: vpinsrw $7, %edx, %xmm3, %xmm0 +; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 +; AVX2-NEXT: retq +; +; AVX512-LABEL: v16i16: +; AVX512: # %bb.0: +; AVX512-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX512-NEXT: vpextrw $1, %xmm2, %ecx +; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm3 +; AVX512-NEXT: vpextrw $1, %xmm3, %edx +; AVX512-NEXT: xorl %eax, %eax +; AVX512-NEXT: subw %cx, %dx +; AVX512-NEXT: cmovbl %eax, %edx +; AVX512-NEXT: vmovd %xmm2, %ecx +; AVX512-NEXT: vmovd %xmm3, %esi +; AVX512-NEXT: subw %cx, %si +; AVX512-NEXT: cmovbl %eax, %esi +; AVX512-NEXT: vmovd %esi, %xmm4 +; AVX512-NEXT: vpinsrw $1, %edx, %xmm4, %xmm4 +; AVX512-NEXT: vpextrw $2, %xmm2, %ecx +; AVX512-NEXT: vpextrw $2, %xmm3, %edx +; AVX512-NEXT: subw %cx, %dx +; AVX512-NEXT: cmovbl %eax, %edx +; AVX512-NEXT: vpinsrw $2, %edx, %xmm4, %xmm4 +; AVX512-NEXT: vpextrw $3, %xmm2, %ecx +; AVX512-NEXT: vpextrw $3, %xmm3, %edx +; AVX512-NEXT: subw %cx, %dx +; AVX512-NEXT: cmovbl %eax, 
%edx +; AVX512-NEXT: vpinsrw $3, %edx, %xmm4, %xmm4 +; AVX512-NEXT: vpextrw $4, %xmm2, %ecx +; AVX512-NEXT: vpextrw $4, %xmm3, %edx +; AVX512-NEXT: subw %cx, %dx +; AVX512-NEXT: cmovbl %eax, %edx +; AVX512-NEXT: vpinsrw $4, %edx, %xmm4, %xmm4 +; AVX512-NEXT: vpextrw $5, %xmm2, %ecx +; AVX512-NEXT: vpextrw $5, %xmm3, %edx +; AVX512-NEXT: subw %cx, %dx +; AVX512-NEXT: cmovbl %eax, %edx +; AVX512-NEXT: vpinsrw $5, %edx, %xmm4, %xmm4 +; AVX512-NEXT: vpextrw $6, %xmm2, %ecx +; AVX512-NEXT: vpextrw $6, %xmm3, %edx +; AVX512-NEXT: subw %cx, %dx +; AVX512-NEXT: cmovbl %eax, %edx +; AVX512-NEXT: vpinsrw $6, %edx, %xmm4, %xmm4 +; AVX512-NEXT: vpextrw $7, %xmm2, %ecx +; AVX512-NEXT: vpextrw $7, %xmm3, %edx +; AVX512-NEXT: subw %cx, %dx +; AVX512-NEXT: cmovbl %eax, %edx +; AVX512-NEXT: vpinsrw $7, %edx, %xmm4, %xmm2 +; AVX512-NEXT: vpextrw $1, %xmm1, %ecx +; AVX512-NEXT: vpextrw $1, %xmm0, %edx +; AVX512-NEXT: subw %cx, %dx +; AVX512-NEXT: cmovbl %eax, %edx +; AVX512-NEXT: vmovd %xmm1, %ecx +; AVX512-NEXT: vmovd %xmm0, %esi +; AVX512-NEXT: subw %cx, %si +; AVX512-NEXT: cmovbl %eax, %esi +; AVX512-NEXT: vmovd %esi, %xmm3 +; AVX512-NEXT: vpinsrw $1, %edx, %xmm3, %xmm3 +; AVX512-NEXT: vpextrw $2, %xmm1, %ecx +; AVX512-NEXT: vpextrw $2, %xmm0, %edx +; AVX512-NEXT: subw %cx, %dx +; AVX512-NEXT: cmovbl %eax, %edx +; AVX512-NEXT: vpinsrw $2, %edx, %xmm3, %xmm3 +; AVX512-NEXT: vpextrw $3, %xmm1, %ecx +; AVX512-NEXT: vpextrw $3, %xmm0, %edx +; AVX512-NEXT: subw %cx, %dx +; AVX512-NEXT: cmovbl %eax, %edx +; AVX512-NEXT: vpinsrw $3, %edx, %xmm3, %xmm3 +; AVX512-NEXT: vpextrw $4, %xmm1, %ecx +; AVX512-NEXT: vpextrw $4, %xmm0, %edx +; AVX512-NEXT: subw %cx, %dx +; AVX512-NEXT: cmovbl %eax, %edx +; AVX512-NEXT: vpinsrw $4, %edx, %xmm3, %xmm3 +; AVX512-NEXT: vpextrw $5, %xmm1, %ecx +; AVX512-NEXT: vpextrw $5, %xmm0, %edx +; AVX512-NEXT: subw %cx, %dx +; AVX512-NEXT: cmovbl %eax, %edx +; AVX512-NEXT: vpinsrw $5, %edx, %xmm3, %xmm3 +; AVX512-NEXT: vpextrw $6, %xmm1, %ecx +; AVX512-NEXT: vpextrw $6, %xmm0, %edx +; AVX512-NEXT: subw %cx, %dx +; AVX512-NEXT: cmovbl %eax, %edx +; AVX512-NEXT: vpinsrw $6, %edx, %xmm3, %xmm3 +; AVX512-NEXT: vpextrw $7, %xmm1, %ecx +; AVX512-NEXT: vpextrw $7, %xmm0, %edx +; AVX512-NEXT: subw %cx, %dx +; AVX512-NEXT: cmovbl %eax, %edx +; AVX512-NEXT: vpinsrw $7, %edx, %xmm3, %xmm0 +; AVX512-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 +; AVX512-NEXT: retq + %z = call <16 x i16> @llvm.usub.sat.v16i16(<16 x i16> %x, <16 x i16> %y) + ret <16 x i16> %z +} + +define <32 x i16> @v32i16(<32 x i16> %x, <32 x i16> %y) nounwind { +; SSE2-LABEL: v32i16: +; SSE2: # %bb.0: +; SSE2-NEXT: pextrw $7, %xmm4, %ecx +; SSE2-NEXT: pextrw $7, %xmm0, %edx +; SSE2-NEXT: xorl %eax, %eax +; SSE2-NEXT: subw %cx, %dx +; SSE2-NEXT: cmovbl %eax, %edx +; SSE2-NEXT: movd %edx, %xmm8 +; SSE2-NEXT: pextrw $6, %xmm4, %ecx +; SSE2-NEXT: pextrw $6, %xmm0, %edx +; SSE2-NEXT: subw %cx, %dx +; SSE2-NEXT: cmovbl %eax, %edx +; SSE2-NEXT: movd %edx, %xmm9 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3] +; SSE2-NEXT: pextrw $5, %xmm4, %ecx +; SSE2-NEXT: pextrw $5, %xmm0, %edx +; SSE2-NEXT: subw %cx, %dx +; SSE2-NEXT: cmovbl %eax, %edx +; SSE2-NEXT: movd %edx, %xmm10 +; SSE2-NEXT: pextrw $4, %xmm4, %ecx +; SSE2-NEXT: pextrw $4, %xmm0, %edx +; SSE2-NEXT: subw %cx, %dx +; SSE2-NEXT: cmovbl %eax, %edx +; SSE2-NEXT: movd %edx, %xmm8 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm10[0],xmm8[1],xmm10[1],xmm8[2],xmm10[2],xmm8[3],xmm10[3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm8 = 
xmm8[0],xmm9[0],xmm8[1],xmm9[1] +; SSE2-NEXT: pextrw $3, %xmm4, %ecx +; SSE2-NEXT: pextrw $3, %xmm0, %edx +; SSE2-NEXT: subw %cx, %dx +; SSE2-NEXT: cmovbl %eax, %edx +; SSE2-NEXT: movd %edx, %xmm9 +; SSE2-NEXT: pextrw $2, %xmm4, %ecx +; SSE2-NEXT: pextrw $2, %xmm0, %edx +; SSE2-NEXT: subw %cx, %dx +; SSE2-NEXT: cmovbl %eax, %edx +; SSE2-NEXT: movd %edx, %xmm10 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3] +; SSE2-NEXT: pextrw $1, %xmm4, %ecx +; SSE2-NEXT: pextrw $1, %xmm0, %edx +; SSE2-NEXT: subw %cx, %dx +; SSE2-NEXT: cmovbl %eax, %edx +; SSE2-NEXT: movd %edx, %xmm9 +; SSE2-NEXT: movd %xmm4, %ecx +; SSE2-NEXT: movd %xmm0, %edx +; SSE2-NEXT: subw %cx, %dx +; SSE2-NEXT: cmovbl %eax, %edx +; SSE2-NEXT: movd %edx, %xmm0 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1],xmm0[2],xmm9[2],xmm0[3],xmm9[3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm10[0],xmm0[1],xmm10[1] +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm8[0] +; SSE2-NEXT: pextrw $7, %xmm5, %ecx +; SSE2-NEXT: pextrw $7, %xmm1, %edx +; SSE2-NEXT: subw %cx, %dx +; SSE2-NEXT: cmovbl %eax, %edx +; SSE2-NEXT: movd %edx, %xmm8 +; SSE2-NEXT: pextrw $6, %xmm5, %ecx +; SSE2-NEXT: pextrw $6, %xmm1, %edx +; SSE2-NEXT: subw %cx, %dx +; SSE2-NEXT: cmovbl %eax, %edx +; SSE2-NEXT: movd %edx, %xmm4 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm8[0],xmm4[1],xmm8[1],xmm4[2],xmm8[2],xmm4[3],xmm8[3] +; SSE2-NEXT: pextrw $5, %xmm5, %ecx +; SSE2-NEXT: pextrw $5, %xmm1, %edx +; SSE2-NEXT: subw %cx, %dx +; SSE2-NEXT: cmovbl %eax, %edx +; SSE2-NEXT: movd %edx, %xmm9 +; SSE2-NEXT: pextrw $4, %xmm5, %ecx +; SSE2-NEXT: pextrw $4, %xmm1, %edx +; SSE2-NEXT: subw %cx, %dx +; SSE2-NEXT: cmovbl %eax, %edx +; SSE2-NEXT: movd %edx, %xmm8 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm9[0],xmm8[1],xmm9[1],xmm8[2],xmm9[2],xmm8[3],xmm9[3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm4[0],xmm8[1],xmm4[1] +; SSE2-NEXT: pextrw $3, %xmm5, %ecx +; SSE2-NEXT: pextrw $3, %xmm1, %edx +; SSE2-NEXT: subw %cx, %dx +; SSE2-NEXT: cmovbl %eax, %edx +; SSE2-NEXT: movd %edx, %xmm9 +; SSE2-NEXT: pextrw $2, %xmm5, %ecx +; SSE2-NEXT: pextrw $2, %xmm1, %edx +; SSE2-NEXT: subw %cx, %dx +; SSE2-NEXT: cmovbl %eax, %edx +; SSE2-NEXT: movd %edx, %xmm4 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm9[0],xmm4[1],xmm9[1],xmm4[2],xmm9[2],xmm4[3],xmm9[3] +; SSE2-NEXT: pextrw $1, %xmm5, %ecx +; SSE2-NEXT: pextrw $1, %xmm1, %edx +; SSE2-NEXT: subw %cx, %dx +; SSE2-NEXT: cmovbl %eax, %edx +; SSE2-NEXT: movd %edx, %xmm9 +; SSE2-NEXT: movd %xmm5, %ecx +; SSE2-NEXT: movd %xmm1, %edx +; SSE2-NEXT: subw %cx, %dx +; SSE2-NEXT: cmovbl %eax, %edx +; SSE2-NEXT: movd %edx, %xmm1 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm9[0],xmm1[1],xmm9[1],xmm1[2],xmm9[2],xmm1[3],xmm9[3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1] +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm8[0] +; SSE2-NEXT: pextrw $7, %xmm6, %ecx +; SSE2-NEXT: pextrw $7, %xmm2, %edx +; SSE2-NEXT: subw %cx, %dx +; SSE2-NEXT: cmovbl %eax, %edx +; SSE2-NEXT: movd %edx, %xmm4 +; SSE2-NEXT: pextrw $6, %xmm6, %ecx +; SSE2-NEXT: pextrw $6, %xmm2, %edx +; SSE2-NEXT: subw %cx, %dx +; SSE2-NEXT: cmovbl %eax, %edx +; SSE2-NEXT: movd %edx, %xmm5 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] +; SSE2-NEXT: pextrw $5, %xmm6, %ecx +; SSE2-NEXT: pextrw $5, %xmm2, %edx +; SSE2-NEXT: subw %cx, %dx +; SSE2-NEXT: cmovbl %eax, %edx +; SSE2-NEXT: movd %edx, %xmm8 +; 
SSE2-NEXT: pextrw $4, %xmm6, %ecx +; SSE2-NEXT: pextrw $4, %xmm2, %edx +; SSE2-NEXT: subw %cx, %dx +; SSE2-NEXT: cmovbl %eax, %edx +; SSE2-NEXT: movd %edx, %xmm4 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm8[0],xmm4[1],xmm8[1],xmm4[2],xmm8[2],xmm4[3],xmm8[3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1] +; SSE2-NEXT: pextrw $3, %xmm6, %ecx +; SSE2-NEXT: pextrw $3, %xmm2, %edx +; SSE2-NEXT: subw %cx, %dx +; SSE2-NEXT: cmovbl %eax, %edx +; SSE2-NEXT: movd %edx, %xmm8 +; SSE2-NEXT: pextrw $2, %xmm6, %ecx +; SSE2-NEXT: pextrw $2, %xmm2, %edx +; SSE2-NEXT: subw %cx, %dx +; SSE2-NEXT: cmovbl %eax, %edx +; SSE2-NEXT: movd %edx, %xmm5 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm8[0],xmm5[1],xmm8[1],xmm5[2],xmm8[2],xmm5[3],xmm8[3] +; SSE2-NEXT: pextrw $1, %xmm6, %ecx +; SSE2-NEXT: pextrw $1, %xmm2, %edx +; SSE2-NEXT: subw %cx, %dx +; SSE2-NEXT: cmovbl %eax, %edx +; SSE2-NEXT: movd %edx, %xmm8 +; SSE2-NEXT: movd %xmm6, %ecx +; SSE2-NEXT: movd %xmm2, %edx +; SSE2-NEXT: subw %cx, %dx +; SSE2-NEXT: cmovbl %eax, %edx +; SSE2-NEXT: movd %edx, %xmm2 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm8[0],xmm2[1],xmm8[1],xmm2[2],xmm8[2],xmm2[3],xmm8[3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1] +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm4[0] +; SSE2-NEXT: pextrw $7, %xmm7, %ecx +; SSE2-NEXT: pextrw $7, %xmm3, %edx +; SSE2-NEXT: subw %cx, %dx +; SSE2-NEXT: cmovbl %eax, %edx +; SSE2-NEXT: movd %edx, %xmm4 +; SSE2-NEXT: pextrw $6, %xmm7, %ecx +; SSE2-NEXT: pextrw $6, %xmm3, %edx +; SSE2-NEXT: subw %cx, %dx +; SSE2-NEXT: cmovbl %eax, %edx +; SSE2-NEXT: movd %edx, %xmm5 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] +; SSE2-NEXT: pextrw $5, %xmm7, %ecx +; SSE2-NEXT: pextrw $5, %xmm3, %edx +; SSE2-NEXT: subw %cx, %dx +; SSE2-NEXT: cmovbl %eax, %edx +; SSE2-NEXT: movd %edx, %xmm6 +; SSE2-NEXT: pextrw $4, %xmm7, %ecx +; SSE2-NEXT: pextrw $4, %xmm3, %edx +; SSE2-NEXT: subw %cx, %dx +; SSE2-NEXT: cmovbl %eax, %edx +; SSE2-NEXT: movd %edx, %xmm4 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1],xmm4[2],xmm6[2],xmm4[3],xmm6[3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1] +; SSE2-NEXT: pextrw $3, %xmm7, %ecx +; SSE2-NEXT: pextrw $3, %xmm3, %edx +; SSE2-NEXT: subw %cx, %dx +; SSE2-NEXT: cmovbl %eax, %edx +; SSE2-NEXT: movd %edx, %xmm5 +; SSE2-NEXT: pextrw $2, %xmm7, %ecx +; SSE2-NEXT: pextrw $2, %xmm3, %edx +; SSE2-NEXT: subw %cx, %dx +; SSE2-NEXT: cmovbl %eax, %edx +; SSE2-NEXT: movd %edx, %xmm6 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] +; SSE2-NEXT: pextrw $1, %xmm7, %ecx +; SSE2-NEXT: pextrw $1, %xmm3, %edx +; SSE2-NEXT: subw %cx, %dx +; SSE2-NEXT: cmovbl %eax, %edx +; SSE2-NEXT: movd %edx, %xmm5 +; SSE2-NEXT: movd %xmm7, %ecx +; SSE2-NEXT: movd %xmm3, %edx +; SSE2-NEXT: subw %cx, %dx +; SSE2-NEXT: cmovbl %eax, %edx +; SSE2-NEXT: movd %edx, %xmm3 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm6[0],xmm3[1],xmm6[1] +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm4[0] +; SSE2-NEXT: retq +; +; SSSE3-LABEL: v32i16: +; SSSE3: # %bb.0: +; SSSE3-NEXT: pextrw $7, %xmm4, %ecx +; SSSE3-NEXT: pextrw $7, %xmm0, %edx +; SSSE3-NEXT: xorl %eax, %eax +; SSSE3-NEXT: subw %cx, %dx +; SSSE3-NEXT: cmovbl %eax, %edx +; SSSE3-NEXT: movd %edx, %xmm8 +; SSSE3-NEXT: pextrw $6, %xmm4, %ecx +; 
SSSE3-NEXT: pextrw $6, %xmm0, %edx +; SSSE3-NEXT: subw %cx, %dx +; SSSE3-NEXT: cmovbl %eax, %edx +; SSSE3-NEXT: movd %edx, %xmm9 +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3] +; SSSE3-NEXT: pextrw $5, %xmm4, %ecx +; SSSE3-NEXT: pextrw $5, %xmm0, %edx +; SSSE3-NEXT: subw %cx, %dx +; SSSE3-NEXT: cmovbl %eax, %edx +; SSSE3-NEXT: movd %edx, %xmm10 +; SSSE3-NEXT: pextrw $4, %xmm4, %ecx +; SSSE3-NEXT: pextrw $4, %xmm0, %edx +; SSSE3-NEXT: subw %cx, %dx +; SSSE3-NEXT: cmovbl %eax, %edx +; SSSE3-NEXT: movd %edx, %xmm8 +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm10[0],xmm8[1],xmm10[1],xmm8[2],xmm10[2],xmm8[3],xmm10[3] +; SSSE3-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm9[0],xmm8[1],xmm9[1] +; SSSE3-NEXT: pextrw $3, %xmm4, %ecx +; SSSE3-NEXT: pextrw $3, %xmm0, %edx +; SSSE3-NEXT: subw %cx, %dx +; SSSE3-NEXT: cmovbl %eax, %edx +; SSSE3-NEXT: movd %edx, %xmm9 +; SSSE3-NEXT: pextrw $2, %xmm4, %ecx +; SSSE3-NEXT: pextrw $2, %xmm0, %edx +; SSSE3-NEXT: subw %cx, %dx +; SSSE3-NEXT: cmovbl %eax, %edx +; SSSE3-NEXT: movd %edx, %xmm10 +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3] +; SSSE3-NEXT: pextrw $1, %xmm4, %ecx +; SSSE3-NEXT: pextrw $1, %xmm0, %edx +; SSSE3-NEXT: subw %cx, %dx +; SSSE3-NEXT: cmovbl %eax, %edx +; SSSE3-NEXT: movd %edx, %xmm9 +; SSSE3-NEXT: movd %xmm4, %ecx +; SSSE3-NEXT: movd %xmm0, %edx +; SSSE3-NEXT: subw %cx, %dx +; SSSE3-NEXT: cmovbl %eax, %edx +; SSSE3-NEXT: movd %edx, %xmm0 +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1],xmm0[2],xmm9[2],xmm0[3],xmm9[3] +; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm10[0],xmm0[1],xmm10[1] +; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm8[0] +; SSSE3-NEXT: pextrw $7, %xmm5, %ecx +; SSSE3-NEXT: pextrw $7, %xmm1, %edx +; SSSE3-NEXT: subw %cx, %dx +; SSSE3-NEXT: cmovbl %eax, %edx +; SSSE3-NEXT: movd %edx, %xmm8 +; SSSE3-NEXT: pextrw $6, %xmm5, %ecx +; SSSE3-NEXT: pextrw $6, %xmm1, %edx +; SSSE3-NEXT: subw %cx, %dx +; SSSE3-NEXT: cmovbl %eax, %edx +; SSSE3-NEXT: movd %edx, %xmm4 +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm8[0],xmm4[1],xmm8[1],xmm4[2],xmm8[2],xmm4[3],xmm8[3] +; SSSE3-NEXT: pextrw $5, %xmm5, %ecx +; SSSE3-NEXT: pextrw $5, %xmm1, %edx +; SSSE3-NEXT: subw %cx, %dx +; SSSE3-NEXT: cmovbl %eax, %edx +; SSSE3-NEXT: movd %edx, %xmm9 +; SSSE3-NEXT: pextrw $4, %xmm5, %ecx +; SSSE3-NEXT: pextrw $4, %xmm1, %edx +; SSSE3-NEXT: subw %cx, %dx +; SSSE3-NEXT: cmovbl %eax, %edx +; SSSE3-NEXT: movd %edx, %xmm8 +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm9[0],xmm8[1],xmm9[1],xmm8[2],xmm9[2],xmm8[3],xmm9[3] +; SSSE3-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm4[0],xmm8[1],xmm4[1] +; SSSE3-NEXT: pextrw $3, %xmm5, %ecx +; SSSE3-NEXT: pextrw $3, %xmm1, %edx +; SSSE3-NEXT: subw %cx, %dx +; SSSE3-NEXT: cmovbl %eax, %edx +; SSSE3-NEXT: movd %edx, %xmm9 +; SSSE3-NEXT: pextrw $2, %xmm5, %ecx +; SSSE3-NEXT: pextrw $2, %xmm1, %edx +; SSSE3-NEXT: subw %cx, %dx +; SSSE3-NEXT: cmovbl %eax, %edx +; SSSE3-NEXT: movd %edx, %xmm4 +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm9[0],xmm4[1],xmm9[1],xmm4[2],xmm9[2],xmm4[3],xmm9[3] +; SSSE3-NEXT: pextrw $1, %xmm5, %ecx +; SSSE3-NEXT: pextrw $1, %xmm1, %edx +; SSSE3-NEXT: subw %cx, %dx +; SSSE3-NEXT: cmovbl %eax, %edx +; SSSE3-NEXT: movd %edx, %xmm9 +; SSSE3-NEXT: movd %xmm5, %ecx +; SSSE3-NEXT: movd %xmm1, %edx +; SSSE3-NEXT: subw %cx, %dx +; SSSE3-NEXT: cmovbl %eax, %edx +; SSSE3-NEXT: movd %edx, %xmm1 +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = 
xmm1[0],xmm9[0],xmm1[1],xmm9[1],xmm1[2],xmm9[2],xmm1[3],xmm9[3] +; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1] +; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm8[0] +; SSSE3-NEXT: pextrw $7, %xmm6, %ecx +; SSSE3-NEXT: pextrw $7, %xmm2, %edx +; SSSE3-NEXT: subw %cx, %dx +; SSSE3-NEXT: cmovbl %eax, %edx +; SSSE3-NEXT: movd %edx, %xmm4 +; SSSE3-NEXT: pextrw $6, %xmm6, %ecx +; SSSE3-NEXT: pextrw $6, %xmm2, %edx +; SSSE3-NEXT: subw %cx, %dx +; SSSE3-NEXT: cmovbl %eax, %edx +; SSSE3-NEXT: movd %edx, %xmm5 +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] +; SSSE3-NEXT: pextrw $5, %xmm6, %ecx +; SSSE3-NEXT: pextrw $5, %xmm2, %edx +; SSSE3-NEXT: subw %cx, %dx +; SSSE3-NEXT: cmovbl %eax, %edx +; SSSE3-NEXT: movd %edx, %xmm8 +; SSSE3-NEXT: pextrw $4, %xmm6, %ecx +; SSSE3-NEXT: pextrw $4, %xmm2, %edx +; SSSE3-NEXT: subw %cx, %dx +; SSSE3-NEXT: cmovbl %eax, %edx +; SSSE3-NEXT: movd %edx, %xmm4 +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm8[0],xmm4[1],xmm8[1],xmm4[2],xmm8[2],xmm4[3],xmm8[3] +; SSSE3-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1] +; SSSE3-NEXT: pextrw $3, %xmm6, %ecx +; SSSE3-NEXT: pextrw $3, %xmm2, %edx +; SSSE3-NEXT: subw %cx, %dx +; SSSE3-NEXT: cmovbl %eax, %edx +; SSSE3-NEXT: movd %edx, %xmm8 +; SSSE3-NEXT: pextrw $2, %xmm6, %ecx +; SSSE3-NEXT: pextrw $2, %xmm2, %edx +; SSSE3-NEXT: subw %cx, %dx +; SSSE3-NEXT: cmovbl %eax, %edx +; SSSE3-NEXT: movd %edx, %xmm5 +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm8[0],xmm5[1],xmm8[1],xmm5[2],xmm8[2],xmm5[3],xmm8[3] +; SSSE3-NEXT: pextrw $1, %xmm6, %ecx +; SSSE3-NEXT: pextrw $1, %xmm2, %edx +; SSSE3-NEXT: subw %cx, %dx +; SSSE3-NEXT: cmovbl %eax, %edx +; SSSE3-NEXT: movd %edx, %xmm8 +; SSSE3-NEXT: movd %xmm6, %ecx +; SSSE3-NEXT: movd %xmm2, %edx +; SSSE3-NEXT: subw %cx, %dx +; SSSE3-NEXT: cmovbl %eax, %edx +; SSSE3-NEXT: movd %edx, %xmm2 +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm8[0],xmm2[1],xmm8[1],xmm2[2],xmm8[2],xmm2[3],xmm8[3] +; SSSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1] +; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm4[0] +; SSSE3-NEXT: pextrw $7, %xmm7, %ecx +; SSSE3-NEXT: pextrw $7, %xmm3, %edx +; SSSE3-NEXT: subw %cx, %dx +; SSSE3-NEXT: cmovbl %eax, %edx +; SSSE3-NEXT: movd %edx, %xmm4 +; SSSE3-NEXT: pextrw $6, %xmm7, %ecx +; SSSE3-NEXT: pextrw $6, %xmm3, %edx +; SSSE3-NEXT: subw %cx, %dx +; SSSE3-NEXT: cmovbl %eax, %edx +; SSSE3-NEXT: movd %edx, %xmm5 +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] +; SSSE3-NEXT: pextrw $5, %xmm7, %ecx +; SSSE3-NEXT: pextrw $5, %xmm3, %edx +; SSSE3-NEXT: subw %cx, %dx +; SSSE3-NEXT: cmovbl %eax, %edx +; SSSE3-NEXT: movd %edx, %xmm6 +; SSSE3-NEXT: pextrw $4, %xmm7, %ecx +; SSSE3-NEXT: pextrw $4, %xmm3, %edx +; SSSE3-NEXT: subw %cx, %dx +; SSSE3-NEXT: cmovbl %eax, %edx +; SSSE3-NEXT: movd %edx, %xmm4 +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1],xmm4[2],xmm6[2],xmm4[3],xmm6[3] +; SSSE3-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1] +; SSSE3-NEXT: pextrw $3, %xmm7, %ecx +; SSSE3-NEXT: pextrw $3, %xmm3, %edx +; SSSE3-NEXT: subw %cx, %dx +; SSSE3-NEXT: cmovbl %eax, %edx +; SSSE3-NEXT: movd %edx, %xmm5 +; SSSE3-NEXT: pextrw $2, %xmm7, %ecx +; SSSE3-NEXT: pextrw $2, %xmm3, %edx +; SSSE3-NEXT: subw %cx, %dx +; SSSE3-NEXT: cmovbl %eax, %edx +; SSSE3-NEXT: movd %edx, %xmm6 +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm6 = 
xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] +; SSSE3-NEXT: pextrw $1, %xmm7, %ecx +; SSSE3-NEXT: pextrw $1, %xmm3, %edx +; SSSE3-NEXT: subw %cx, %dx +; SSSE3-NEXT: cmovbl %eax, %edx +; SSSE3-NEXT: movd %edx, %xmm5 +; SSSE3-NEXT: movd %xmm7, %ecx +; SSSE3-NEXT: movd %xmm3, %edx +; SSSE3-NEXT: subw %cx, %dx +; SSSE3-NEXT: cmovbl %eax, %edx +; SSSE3-NEXT: movd %edx, %xmm3 +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3] +; SSSE3-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm6[0],xmm3[1],xmm6[1] +; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm4[0] +; SSSE3-NEXT: retq +; +; SSE41-LABEL: v32i16: +; SSE41: # %bb.0: +; SSE41-NEXT: movdqa %xmm1, %xmm8 +; SSE41-NEXT: movdqa %xmm0, %xmm1 +; SSE41-NEXT: pextrw $1, %xmm4, %ecx +; SSE41-NEXT: pextrw $1, %xmm0, %edx +; SSE41-NEXT: xorl %eax, %eax +; SSE41-NEXT: subw %cx, %dx +; SSE41-NEXT: cmovbl %eax, %edx +; SSE41-NEXT: movd %xmm4, %ecx +; SSE41-NEXT: movd %xmm0, %esi +; SSE41-NEXT: subw %cx, %si +; SSE41-NEXT: cmovbl %eax, %esi +; SSE41-NEXT: movd %esi, %xmm0 +; SSE41-NEXT: pinsrw $1, %edx, %xmm0 +; SSE41-NEXT: pextrw $2, %xmm4, %ecx +; SSE41-NEXT: pextrw $2, %xmm1, %edx +; SSE41-NEXT: subw %cx, %dx +; SSE41-NEXT: cmovbl %eax, %edx +; SSE41-NEXT: pinsrw $2, %edx, %xmm0 +; SSE41-NEXT: pextrw $3, %xmm4, %ecx +; SSE41-NEXT: pextrw $3, %xmm1, %edx +; SSE41-NEXT: subw %cx, %dx +; SSE41-NEXT: cmovbl %eax, %edx +; SSE41-NEXT: pinsrw $3, %edx, %xmm0 +; SSE41-NEXT: pextrw $4, %xmm4, %ecx +; SSE41-NEXT: pextrw $4, %xmm1, %edx +; SSE41-NEXT: subw %cx, %dx +; SSE41-NEXT: cmovbl %eax, %edx +; SSE41-NEXT: pinsrw $4, %edx, %xmm0 +; SSE41-NEXT: pextrw $5, %xmm4, %ecx +; SSE41-NEXT: pextrw $5, %xmm1, %edx +; SSE41-NEXT: subw %cx, %dx +; SSE41-NEXT: cmovbl %eax, %edx +; SSE41-NEXT: pinsrw $5, %edx, %xmm0 +; SSE41-NEXT: pextrw $6, %xmm4, %ecx +; SSE41-NEXT: pextrw $6, %xmm1, %edx +; SSE41-NEXT: subw %cx, %dx +; SSE41-NEXT: cmovbl %eax, %edx +; SSE41-NEXT: pinsrw $6, %edx, %xmm0 +; SSE41-NEXT: pextrw $7, %xmm4, %ecx +; SSE41-NEXT: pextrw $7, %xmm1, %edx +; SSE41-NEXT: subw %cx, %dx +; SSE41-NEXT: cmovbl %eax, %edx +; SSE41-NEXT: pinsrw $7, %edx, %xmm0 +; SSE41-NEXT: pextrw $1, %xmm5, %ecx +; SSE41-NEXT: pextrw $1, %xmm8, %edx +; SSE41-NEXT: subw %cx, %dx +; SSE41-NEXT: cmovbl %eax, %edx +; SSE41-NEXT: movd %xmm5, %ecx +; SSE41-NEXT: movd %xmm8, %esi +; SSE41-NEXT: subw %cx, %si +; SSE41-NEXT: cmovbl %eax, %esi +; SSE41-NEXT: movd %esi, %xmm1 +; SSE41-NEXT: pinsrw $1, %edx, %xmm1 +; SSE41-NEXT: pextrw $2, %xmm5, %ecx +; SSE41-NEXT: pextrw $2, %xmm8, %edx +; SSE41-NEXT: subw %cx, %dx +; SSE41-NEXT: cmovbl %eax, %edx +; SSE41-NEXT: pinsrw $2, %edx, %xmm1 +; SSE41-NEXT: pextrw $3, %xmm5, %ecx +; SSE41-NEXT: pextrw $3, %xmm8, %edx +; SSE41-NEXT: subw %cx, %dx +; SSE41-NEXT: cmovbl %eax, %edx +; SSE41-NEXT: pinsrw $3, %edx, %xmm1 +; SSE41-NEXT: pextrw $4, %xmm5, %ecx +; SSE41-NEXT: pextrw $4, %xmm8, %edx +; SSE41-NEXT: subw %cx, %dx +; SSE41-NEXT: cmovbl %eax, %edx +; SSE41-NEXT: pinsrw $4, %edx, %xmm1 +; SSE41-NEXT: pextrw $5, %xmm5, %ecx +; SSE41-NEXT: pextrw $5, %xmm8, %edx +; SSE41-NEXT: subw %cx, %dx +; SSE41-NEXT: cmovbl %eax, %edx +; SSE41-NEXT: pinsrw $5, %edx, %xmm1 +; SSE41-NEXT: pextrw $6, %xmm5, %ecx +; SSE41-NEXT: pextrw $6, %xmm8, %edx +; SSE41-NEXT: subw %cx, %dx +; SSE41-NEXT: cmovbl %eax, %edx +; SSE41-NEXT: pinsrw $6, %edx, %xmm1 +; SSE41-NEXT: pextrw $7, %xmm5, %ecx +; SSE41-NEXT: pextrw $7, %xmm8, %edx +; SSE41-NEXT: subw %cx, %dx +; SSE41-NEXT: cmovbl %eax, %edx +; SSE41-NEXT: 
pinsrw $7, %edx, %xmm1 +; SSE41-NEXT: pextrw $1, %xmm6, %ecx +; SSE41-NEXT: pextrw $1, %xmm2, %edx +; SSE41-NEXT: subw %cx, %dx +; SSE41-NEXT: cmovbl %eax, %edx +; SSE41-NEXT: movd %xmm6, %ecx +; SSE41-NEXT: movd %xmm2, %esi +; SSE41-NEXT: subw %cx, %si +; SSE41-NEXT: cmovbl %eax, %esi +; SSE41-NEXT: movd %esi, %xmm4 +; SSE41-NEXT: pinsrw $1, %edx, %xmm4 +; SSE41-NEXT: pextrw $2, %xmm6, %ecx +; SSE41-NEXT: pextrw $2, %xmm2, %edx +; SSE41-NEXT: subw %cx, %dx +; SSE41-NEXT: cmovbl %eax, %edx +; SSE41-NEXT: pinsrw $2, %edx, %xmm4 +; SSE41-NEXT: pextrw $3, %xmm6, %ecx +; SSE41-NEXT: pextrw $3, %xmm2, %edx +; SSE41-NEXT: subw %cx, %dx +; SSE41-NEXT: cmovbl %eax, %edx +; SSE41-NEXT: pinsrw $3, %edx, %xmm4 +; SSE41-NEXT: pextrw $4, %xmm6, %ecx +; SSE41-NEXT: pextrw $4, %xmm2, %edx +; SSE41-NEXT: subw %cx, %dx +; SSE41-NEXT: cmovbl %eax, %edx +; SSE41-NEXT: pinsrw $4, %edx, %xmm4 +; SSE41-NEXT: pextrw $5, %xmm6, %ecx +; SSE41-NEXT: pextrw $5, %xmm2, %edx +; SSE41-NEXT: subw %cx, %dx +; SSE41-NEXT: cmovbl %eax, %edx +; SSE41-NEXT: pinsrw $5, %edx, %xmm4 +; SSE41-NEXT: pextrw $6, %xmm6, %ecx +; SSE41-NEXT: pextrw $6, %xmm2, %edx +; SSE41-NEXT: subw %cx, %dx +; SSE41-NEXT: cmovbl %eax, %edx +; SSE41-NEXT: pinsrw $6, %edx, %xmm4 +; SSE41-NEXT: pextrw $7, %xmm6, %ecx +; SSE41-NEXT: pextrw $7, %xmm2, %edx +; SSE41-NEXT: subw %cx, %dx +; SSE41-NEXT: cmovbl %eax, %edx +; SSE41-NEXT: pinsrw $7, %edx, %xmm4 +; SSE41-NEXT: pextrw $1, %xmm7, %ecx +; SSE41-NEXT: pextrw $1, %xmm3, %edx +; SSE41-NEXT: subw %cx, %dx +; SSE41-NEXT: cmovbl %eax, %edx +; SSE41-NEXT: movd %xmm7, %ecx +; SSE41-NEXT: movd %xmm3, %esi +; SSE41-NEXT: subw %cx, %si +; SSE41-NEXT: cmovbl %eax, %esi +; SSE41-NEXT: movd %esi, %xmm5 +; SSE41-NEXT: pinsrw $1, %edx, %xmm5 +; SSE41-NEXT: pextrw $2, %xmm7, %ecx +; SSE41-NEXT: pextrw $2, %xmm3, %edx +; SSE41-NEXT: subw %cx, %dx +; SSE41-NEXT: cmovbl %eax, %edx +; SSE41-NEXT: pinsrw $2, %edx, %xmm5 +; SSE41-NEXT: pextrw $3, %xmm7, %ecx +; SSE41-NEXT: pextrw $3, %xmm3, %edx +; SSE41-NEXT: subw %cx, %dx +; SSE41-NEXT: cmovbl %eax, %edx +; SSE41-NEXT: pinsrw $3, %edx, %xmm5 +; SSE41-NEXT: pextrw $4, %xmm7, %ecx +; SSE41-NEXT: pextrw $4, %xmm3, %edx +; SSE41-NEXT: subw %cx, %dx +; SSE41-NEXT: cmovbl %eax, %edx +; SSE41-NEXT: pinsrw $4, %edx, %xmm5 +; SSE41-NEXT: pextrw $5, %xmm7, %ecx +; SSE41-NEXT: pextrw $5, %xmm3, %edx +; SSE41-NEXT: subw %cx, %dx +; SSE41-NEXT: cmovbl %eax, %edx +; SSE41-NEXT: pinsrw $5, %edx, %xmm5 +; SSE41-NEXT: pextrw $6, %xmm7, %ecx +; SSE41-NEXT: pextrw $6, %xmm3, %edx +; SSE41-NEXT: subw %cx, %dx +; SSE41-NEXT: cmovbl %eax, %edx +; SSE41-NEXT: pinsrw $6, %edx, %xmm5 +; SSE41-NEXT: pextrw $7, %xmm7, %ecx +; SSE41-NEXT: pextrw $7, %xmm3, %edx +; SSE41-NEXT: subw %cx, %dx +; SSE41-NEXT: cmovbl %eax, %edx +; SSE41-NEXT: pinsrw $7, %edx, %xmm5 +; SSE41-NEXT: movdqa %xmm4, %xmm2 +; SSE41-NEXT: movdqa %xmm5, %xmm3 +; SSE41-NEXT: retq +; +; AVX1-LABEL: v32i16: +; AVX1: # %bb.0: +; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm4 +; AVX1-NEXT: vpextrw $1, %xmm4, %ecx +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5 +; AVX1-NEXT: vpextrw $1, %xmm5, %edx +; AVX1-NEXT: xorl %eax, %eax +; AVX1-NEXT: subw %cx, %dx +; AVX1-NEXT: cmovbl %eax, %edx +; AVX1-NEXT: vmovd %xmm4, %ecx +; AVX1-NEXT: vmovd %xmm5, %esi +; AVX1-NEXT: subw %cx, %si +; AVX1-NEXT: cmovbl %eax, %esi +; AVX1-NEXT: vmovd %esi, %xmm6 +; AVX1-NEXT: vpinsrw $1, %edx, %xmm6, %xmm6 +; AVX1-NEXT: vpextrw $2, %xmm4, %ecx +; AVX1-NEXT: vpextrw $2, %xmm5, %edx +; AVX1-NEXT: subw %cx, %dx +; AVX1-NEXT: cmovbl %eax, %edx +; AVX1-NEXT: vpinsrw $2, 
%edx, %xmm6, %xmm6 +; AVX1-NEXT: vpextrw $3, %xmm4, %ecx +; AVX1-NEXT: vpextrw $3, %xmm5, %edx +; AVX1-NEXT: subw %cx, %dx +; AVX1-NEXT: cmovbl %eax, %edx +; AVX1-NEXT: vpinsrw $3, %edx, %xmm6, %xmm6 +; AVX1-NEXT: vpextrw $4, %xmm4, %ecx +; AVX1-NEXT: vpextrw $4, %xmm5, %edx +; AVX1-NEXT: subw %cx, %dx +; AVX1-NEXT: cmovbl %eax, %edx +; AVX1-NEXT: vpinsrw $4, %edx, %xmm6, %xmm6 +; AVX1-NEXT: vpextrw $5, %xmm4, %ecx +; AVX1-NEXT: vpextrw $5, %xmm5, %edx +; AVX1-NEXT: subw %cx, %dx +; AVX1-NEXT: cmovbl %eax, %edx +; AVX1-NEXT: vpinsrw $5, %edx, %xmm6, %xmm6 +; AVX1-NEXT: vpextrw $6, %xmm4, %ecx +; AVX1-NEXT: vpextrw $6, %xmm5, %edx +; AVX1-NEXT: subw %cx, %dx +; AVX1-NEXT: cmovbl %eax, %edx +; AVX1-NEXT: vpinsrw $6, %edx, %xmm6, %xmm6 +; AVX1-NEXT: vpextrw $7, %xmm4, %ecx +; AVX1-NEXT: vpextrw $7, %xmm5, %edx +; AVX1-NEXT: subw %cx, %dx +; AVX1-NEXT: cmovbl %eax, %edx +; AVX1-NEXT: vpinsrw $7, %edx, %xmm6, %xmm4 +; AVX1-NEXT: vpextrw $1, %xmm2, %ecx +; AVX1-NEXT: vpextrw $1, %xmm0, %edx +; AVX1-NEXT: subw %cx, %dx +; AVX1-NEXT: cmovbl %eax, %edx +; AVX1-NEXT: vmovd %xmm2, %ecx +; AVX1-NEXT: vmovd %xmm0, %esi +; AVX1-NEXT: subw %cx, %si +; AVX1-NEXT: cmovbl %eax, %esi +; AVX1-NEXT: vmovd %esi, %xmm5 +; AVX1-NEXT: vpinsrw $1, %edx, %xmm5, %xmm5 +; AVX1-NEXT: vpextrw $2, %xmm2, %ecx +; AVX1-NEXT: vpextrw $2, %xmm0, %edx +; AVX1-NEXT: subw %cx, %dx +; AVX1-NEXT: cmovbl %eax, %edx +; AVX1-NEXT: vpinsrw $2, %edx, %xmm5, %xmm5 +; AVX1-NEXT: vpextrw $3, %xmm2, %ecx +; AVX1-NEXT: vpextrw $3, %xmm0, %edx +; AVX1-NEXT: subw %cx, %dx +; AVX1-NEXT: cmovbl %eax, %edx +; AVX1-NEXT: vpinsrw $3, %edx, %xmm5, %xmm5 +; AVX1-NEXT: vpextrw $4, %xmm2, %ecx +; AVX1-NEXT: vpextrw $4, %xmm0, %edx +; AVX1-NEXT: subw %cx, %dx +; AVX1-NEXT: cmovbl %eax, %edx +; AVX1-NEXT: vpinsrw $4, %edx, %xmm5, %xmm5 +; AVX1-NEXT: vpextrw $5, %xmm2, %ecx +; AVX1-NEXT: vpextrw $5, %xmm0, %edx +; AVX1-NEXT: subw %cx, %dx +; AVX1-NEXT: cmovbl %eax, %edx +; AVX1-NEXT: vpinsrw $5, %edx, %xmm5, %xmm5 +; AVX1-NEXT: vpextrw $6, %xmm2, %ecx +; AVX1-NEXT: vpextrw $6, %xmm0, %edx +; AVX1-NEXT: subw %cx, %dx +; AVX1-NEXT: cmovbl %eax, %edx +; AVX1-NEXT: vpinsrw $6, %edx, %xmm5, %xmm5 +; AVX1-NEXT: vpextrw $7, %xmm2, %ecx +; AVX1-NEXT: vpextrw $7, %xmm0, %edx +; AVX1-NEXT: subw %cx, %dx +; AVX1-NEXT: cmovbl %eax, %edx +; AVX1-NEXT: vpinsrw $7, %edx, %xmm5, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0 +; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm2 +; AVX1-NEXT: vpextrw $1, %xmm2, %ecx +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4 +; AVX1-NEXT: vpextrw $1, %xmm4, %edx +; AVX1-NEXT: subw %cx, %dx +; AVX1-NEXT: cmovbl %eax, %edx +; AVX1-NEXT: vmovd %xmm2, %ecx +; AVX1-NEXT: vmovd %xmm4, %esi +; AVX1-NEXT: subw %cx, %si +; AVX1-NEXT: cmovbl %eax, %esi +; AVX1-NEXT: vmovd %esi, %xmm5 +; AVX1-NEXT: vpinsrw $1, %edx, %xmm5, %xmm5 +; AVX1-NEXT: vpextrw $2, %xmm2, %ecx +; AVX1-NEXT: vpextrw $2, %xmm4, %edx +; AVX1-NEXT: subw %cx, %dx +; AVX1-NEXT: cmovbl %eax, %edx +; AVX1-NEXT: vpinsrw $2, %edx, %xmm5, %xmm5 +; AVX1-NEXT: vpextrw $3, %xmm2, %ecx +; AVX1-NEXT: vpextrw $3, %xmm4, %edx +; AVX1-NEXT: subw %cx, %dx +; AVX1-NEXT: cmovbl %eax, %edx +; AVX1-NEXT: vpinsrw $3, %edx, %xmm5, %xmm5 +; AVX1-NEXT: vpextrw $4, %xmm2, %ecx +; AVX1-NEXT: vpextrw $4, %xmm4, %edx +; AVX1-NEXT: subw %cx, %dx +; AVX1-NEXT: cmovbl %eax, %edx +; AVX1-NEXT: vpinsrw $4, %edx, %xmm5, %xmm5 +; AVX1-NEXT: vpextrw $5, %xmm2, %ecx +; AVX1-NEXT: vpextrw $5, %xmm4, %edx +; AVX1-NEXT: subw %cx, %dx +; AVX1-NEXT: cmovbl %eax, %edx +; AVX1-NEXT: vpinsrw $5, %edx, %xmm5, %xmm5 +; 
AVX1-NEXT: vpextrw $6, %xmm2, %ecx +; AVX1-NEXT: vpextrw $6, %xmm4, %edx +; AVX1-NEXT: subw %cx, %dx +; AVX1-NEXT: cmovbl %eax, %edx +; AVX1-NEXT: vpinsrw $6, %edx, %xmm5, %xmm5 +; AVX1-NEXT: vpextrw $7, %xmm2, %ecx +; AVX1-NEXT: vpextrw $7, %xmm4, %edx +; AVX1-NEXT: subw %cx, %dx +; AVX1-NEXT: cmovbl %eax, %edx +; AVX1-NEXT: vpinsrw $7, %edx, %xmm5, %xmm2 +; AVX1-NEXT: vpextrw $1, %xmm3, %ecx +; AVX1-NEXT: vpextrw $1, %xmm1, %edx +; AVX1-NEXT: subw %cx, %dx +; AVX1-NEXT: cmovbl %eax, %edx +; AVX1-NEXT: vmovd %xmm3, %ecx +; AVX1-NEXT: vmovd %xmm1, %esi +; AVX1-NEXT: subw %cx, %si +; AVX1-NEXT: cmovbl %eax, %esi +; AVX1-NEXT: vmovd %esi, %xmm4 +; AVX1-NEXT: vpinsrw $1, %edx, %xmm4, %xmm4 +; AVX1-NEXT: vpextrw $2, %xmm3, %ecx +; AVX1-NEXT: vpextrw $2, %xmm1, %edx +; AVX1-NEXT: subw %cx, %dx +; AVX1-NEXT: cmovbl %eax, %edx +; AVX1-NEXT: vpinsrw $2, %edx, %xmm4, %xmm4 +; AVX1-NEXT: vpextrw $3, %xmm3, %ecx +; AVX1-NEXT: vpextrw $3, %xmm1, %edx +; AVX1-NEXT: subw %cx, %dx +; AVX1-NEXT: cmovbl %eax, %edx +; AVX1-NEXT: vpinsrw $3, %edx, %xmm4, %xmm4 +; AVX1-NEXT: vpextrw $4, %xmm3, %ecx +; AVX1-NEXT: vpextrw $4, %xmm1, %edx +; AVX1-NEXT: subw %cx, %dx +; AVX1-NEXT: cmovbl %eax, %edx +; AVX1-NEXT: vpinsrw $4, %edx, %xmm4, %xmm4 +; AVX1-NEXT: vpextrw $5, %xmm3, %ecx +; AVX1-NEXT: vpextrw $5, %xmm1, %edx +; AVX1-NEXT: subw %cx, %dx +; AVX1-NEXT: cmovbl %eax, %edx +; AVX1-NEXT: vpinsrw $5, %edx, %xmm4, %xmm4 +; AVX1-NEXT: vpextrw $6, %xmm3, %ecx +; AVX1-NEXT: vpextrw $6, %xmm1, %edx +; AVX1-NEXT: subw %cx, %dx +; AVX1-NEXT: cmovbl %eax, %edx +; AVX1-NEXT: vpinsrw $6, %edx, %xmm4, %xmm4 +; AVX1-NEXT: vpextrw $7, %xmm3, %ecx +; AVX1-NEXT: vpextrw $7, %xmm1, %edx +; AVX1-NEXT: subw %cx, %dx +; AVX1-NEXT: cmovbl %eax, %edx +; AVX1-NEXT: vpinsrw $7, %edx, %xmm4, %xmm1 +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 +; AVX1-NEXT: retq +; +; AVX2-LABEL: v32i16: +; AVX2: # %bb.0: +; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm4 +; AVX2-NEXT: vpextrw $1, %xmm4, %ecx +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm5 +; AVX2-NEXT: vpextrw $1, %xmm5, %edx +; AVX2-NEXT: xorl %eax, %eax +; AVX2-NEXT: subw %cx, %dx +; AVX2-NEXT: cmovbl %eax, %edx +; AVX2-NEXT: vmovd %xmm4, %ecx +; AVX2-NEXT: vmovd %xmm5, %esi +; AVX2-NEXT: subw %cx, %si +; AVX2-NEXT: cmovbl %eax, %esi +; AVX2-NEXT: vmovd %esi, %xmm6 +; AVX2-NEXT: vpinsrw $1, %edx, %xmm6, %xmm6 +; AVX2-NEXT: vpextrw $2, %xmm4, %ecx +; AVX2-NEXT: vpextrw $2, %xmm5, %edx +; AVX2-NEXT: subw %cx, %dx +; AVX2-NEXT: cmovbl %eax, %edx +; AVX2-NEXT: vpinsrw $2, %edx, %xmm6, %xmm6 +; AVX2-NEXT: vpextrw $3, %xmm4, %ecx +; AVX2-NEXT: vpextrw $3, %xmm5, %edx +; AVX2-NEXT: subw %cx, %dx +; AVX2-NEXT: cmovbl %eax, %edx +; AVX2-NEXT: vpinsrw $3, %edx, %xmm6, %xmm6 +; AVX2-NEXT: vpextrw $4, %xmm4, %ecx +; AVX2-NEXT: vpextrw $4, %xmm5, %edx +; AVX2-NEXT: subw %cx, %dx +; AVX2-NEXT: cmovbl %eax, %edx +; AVX2-NEXT: vpinsrw $4, %edx, %xmm6, %xmm6 +; AVX2-NEXT: vpextrw $5, %xmm4, %ecx +; AVX2-NEXT: vpextrw $5, %xmm5, %edx +; AVX2-NEXT: subw %cx, %dx +; AVX2-NEXT: cmovbl %eax, %edx +; AVX2-NEXT: vpinsrw $5, %edx, %xmm6, %xmm6 +; AVX2-NEXT: vpextrw $6, %xmm4, %ecx +; AVX2-NEXT: vpextrw $6, %xmm5, %edx +; AVX2-NEXT: subw %cx, %dx +; AVX2-NEXT: cmovbl %eax, %edx +; AVX2-NEXT: vpinsrw $6, %edx, %xmm6, %xmm6 +; AVX2-NEXT: vpextrw $7, %xmm4, %ecx +; AVX2-NEXT: vpextrw $7, %xmm5, %edx +; AVX2-NEXT: subw %cx, %dx +; AVX2-NEXT: cmovbl %eax, %edx +; AVX2-NEXT: vpinsrw $7, %edx, %xmm6, %xmm4 +; AVX2-NEXT: vpextrw $1, %xmm2, %ecx +; AVX2-NEXT: vpextrw $1, %xmm0, %edx +; AVX2-NEXT: subw %cx, %dx +; AVX2-NEXT: 
cmovbl %eax, %edx +; AVX2-NEXT: vmovd %xmm2, %ecx +; AVX2-NEXT: vmovd %xmm0, %esi +; AVX2-NEXT: subw %cx, %si +; AVX2-NEXT: cmovbl %eax, %esi +; AVX2-NEXT: vmovd %esi, %xmm5 +; AVX2-NEXT: vpinsrw $1, %edx, %xmm5, %xmm5 +; AVX2-NEXT: vpextrw $2, %xmm2, %ecx +; AVX2-NEXT: vpextrw $2, %xmm0, %edx +; AVX2-NEXT: subw %cx, %dx +; AVX2-NEXT: cmovbl %eax, %edx +; AVX2-NEXT: vpinsrw $2, %edx, %xmm5, %xmm5 +; AVX2-NEXT: vpextrw $3, %xmm2, %ecx +; AVX2-NEXT: vpextrw $3, %xmm0, %edx +; AVX2-NEXT: subw %cx, %dx +; AVX2-NEXT: cmovbl %eax, %edx +; AVX2-NEXT: vpinsrw $3, %edx, %xmm5, %xmm5 +; AVX2-NEXT: vpextrw $4, %xmm2, %ecx +; AVX2-NEXT: vpextrw $4, %xmm0, %edx +; AVX2-NEXT: subw %cx, %dx +; AVX2-NEXT: cmovbl %eax, %edx +; AVX2-NEXT: vpinsrw $4, %edx, %xmm5, %xmm5 +; AVX2-NEXT: vpextrw $5, %xmm2, %ecx +; AVX2-NEXT: vpextrw $5, %xmm0, %edx +; AVX2-NEXT: subw %cx, %dx +; AVX2-NEXT: cmovbl %eax, %edx +; AVX2-NEXT: vpinsrw $5, %edx, %xmm5, %xmm5 +; AVX2-NEXT: vpextrw $6, %xmm2, %ecx +; AVX2-NEXT: vpextrw $6, %xmm0, %edx +; AVX2-NEXT: subw %cx, %dx +; AVX2-NEXT: cmovbl %eax, %edx +; AVX2-NEXT: vpinsrw $6, %edx, %xmm5, %xmm5 +; AVX2-NEXT: vpextrw $7, %xmm2, %ecx +; AVX2-NEXT: vpextrw $7, %xmm0, %edx +; AVX2-NEXT: subw %cx, %dx +; AVX2-NEXT: cmovbl %eax, %edx +; AVX2-NEXT: vpinsrw $7, %edx, %xmm5, %xmm0 +; AVX2-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm0 +; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm2 +; AVX2-NEXT: vpextrw $1, %xmm2, %ecx +; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm4 +; AVX2-NEXT: vpextrw $1, %xmm4, %edx +; AVX2-NEXT: subw %cx, %dx +; AVX2-NEXT: cmovbl %eax, %edx +; AVX2-NEXT: vmovd %xmm2, %ecx +; AVX2-NEXT: vmovd %xmm4, %esi +; AVX2-NEXT: subw %cx, %si +; AVX2-NEXT: cmovbl %eax, %esi +; AVX2-NEXT: vmovd %esi, %xmm5 +; AVX2-NEXT: vpinsrw $1, %edx, %xmm5, %xmm5 +; AVX2-NEXT: vpextrw $2, %xmm2, %ecx +; AVX2-NEXT: vpextrw $2, %xmm4, %edx +; AVX2-NEXT: subw %cx, %dx +; AVX2-NEXT: cmovbl %eax, %edx +; AVX2-NEXT: vpinsrw $2, %edx, %xmm5, %xmm5 +; AVX2-NEXT: vpextrw $3, %xmm2, %ecx +; AVX2-NEXT: vpextrw $3, %xmm4, %edx +; AVX2-NEXT: subw %cx, %dx +; AVX2-NEXT: cmovbl %eax, %edx +; AVX2-NEXT: vpinsrw $3, %edx, %xmm5, %xmm5 +; AVX2-NEXT: vpextrw $4, %xmm2, %ecx +; AVX2-NEXT: vpextrw $4, %xmm4, %edx +; AVX2-NEXT: subw %cx, %dx +; AVX2-NEXT: cmovbl %eax, %edx +; AVX2-NEXT: vpinsrw $4, %edx, %xmm5, %xmm5 +; AVX2-NEXT: vpextrw $5, %xmm2, %ecx +; AVX2-NEXT: vpextrw $5, %xmm4, %edx +; AVX2-NEXT: subw %cx, %dx +; AVX2-NEXT: cmovbl %eax, %edx +; AVX2-NEXT: vpinsrw $5, %edx, %xmm5, %xmm5 +; AVX2-NEXT: vpextrw $6, %xmm2, %ecx +; AVX2-NEXT: vpextrw $6, %xmm4, %edx +; AVX2-NEXT: subw %cx, %dx +; AVX2-NEXT: cmovbl %eax, %edx +; AVX2-NEXT: vpinsrw $6, %edx, %xmm5, %xmm5 +; AVX2-NEXT: vpextrw $7, %xmm2, %ecx +; AVX2-NEXT: vpextrw $7, %xmm4, %edx +; AVX2-NEXT: subw %cx, %dx +; AVX2-NEXT: cmovbl %eax, %edx +; AVX2-NEXT: vpinsrw $7, %edx, %xmm5, %xmm2 +; AVX2-NEXT: vpextrw $1, %xmm3, %ecx +; AVX2-NEXT: vpextrw $1, %xmm1, %edx +; AVX2-NEXT: subw %cx, %dx +; AVX2-NEXT: cmovbl %eax, %edx +; AVX2-NEXT: vmovd %xmm3, %ecx +; AVX2-NEXT: vmovd %xmm1, %esi +; AVX2-NEXT: subw %cx, %si +; AVX2-NEXT: cmovbl %eax, %esi +; AVX2-NEXT: vmovd %esi, %xmm4 +; AVX2-NEXT: vpinsrw $1, %edx, %xmm4, %xmm4 +; AVX2-NEXT: vpextrw $2, %xmm3, %ecx +; AVX2-NEXT: vpextrw $2, %xmm1, %edx +; AVX2-NEXT: subw %cx, %dx +; AVX2-NEXT: cmovbl %eax, %edx +; AVX2-NEXT: vpinsrw $2, %edx, %xmm4, %xmm4 +; AVX2-NEXT: vpextrw $3, %xmm3, %ecx +; AVX2-NEXT: vpextrw $3, %xmm1, %edx +; AVX2-NEXT: subw %cx, %dx +; AVX2-NEXT: cmovbl %eax, %edx +; AVX2-NEXT: vpinsrw $3, %edx, 
%xmm4, %xmm4 +; AVX2-NEXT: vpextrw $4, %xmm3, %ecx +; AVX2-NEXT: vpextrw $4, %xmm1, %edx +; AVX2-NEXT: subw %cx, %dx +; AVX2-NEXT: cmovbl %eax, %edx +; AVX2-NEXT: vpinsrw $4, %edx, %xmm4, %xmm4 +; AVX2-NEXT: vpextrw $5, %xmm3, %ecx +; AVX2-NEXT: vpextrw $5, %xmm1, %edx +; AVX2-NEXT: subw %cx, %dx +; AVX2-NEXT: cmovbl %eax, %edx +; AVX2-NEXT: vpinsrw $5, %edx, %xmm4, %xmm4 +; AVX2-NEXT: vpextrw $6, %xmm3, %ecx +; AVX2-NEXT: vpextrw $6, %xmm1, %edx +; AVX2-NEXT: subw %cx, %dx +; AVX2-NEXT: cmovbl %eax, %edx +; AVX2-NEXT: vpinsrw $6, %edx, %xmm4, %xmm4 +; AVX2-NEXT: vpextrw $7, %xmm3, %ecx +; AVX2-NEXT: vpextrw $7, %xmm1, %edx +; AVX2-NEXT: subw %cx, %dx +; AVX2-NEXT: cmovbl %eax, %edx +; AVX2-NEXT: vpinsrw $7, %edx, %xmm4, %xmm1 +; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 +; AVX2-NEXT: retq +; +; AVX512-LABEL: v32i16: +; AVX512: # %bb.0: +; AVX512-NEXT: vextracti32x4 $3, %zmm1, %xmm2 +; AVX512-NEXT: vpextrw $1, %xmm2, %ecx +; AVX512-NEXT: vextracti32x4 $3, %zmm0, %xmm3 +; AVX512-NEXT: vpextrw $1, %xmm3, %edx +; AVX512-NEXT: xorl %eax, %eax +; AVX512-NEXT: subw %cx, %dx +; AVX512-NEXT: cmovbl %eax, %edx +; AVX512-NEXT: vmovd %xmm2, %ecx +; AVX512-NEXT: vmovd %xmm3, %esi +; AVX512-NEXT: subw %cx, %si +; AVX512-NEXT: cmovbl %eax, %esi +; AVX512-NEXT: vmovd %esi, %xmm4 +; AVX512-NEXT: vpinsrw $1, %edx, %xmm4, %xmm4 +; AVX512-NEXT: vpextrw $2, %xmm2, %ecx +; AVX512-NEXT: vpextrw $2, %xmm3, %edx +; AVX512-NEXT: subw %cx, %dx +; AVX512-NEXT: cmovbl %eax, %edx +; AVX512-NEXT: vpinsrw $2, %edx, %xmm4, %xmm4 +; AVX512-NEXT: vpextrw $3, %xmm2, %ecx +; AVX512-NEXT: vpextrw $3, %xmm3, %edx +; AVX512-NEXT: subw %cx, %dx +; AVX512-NEXT: cmovbl %eax, %edx +; AVX512-NEXT: vpinsrw $3, %edx, %xmm4, %xmm4 +; AVX512-NEXT: vpextrw $4, %xmm2, %ecx +; AVX512-NEXT: vpextrw $4, %xmm3, %edx +; AVX512-NEXT: subw %cx, %dx +; AVX512-NEXT: cmovbl %eax, %edx +; AVX512-NEXT: vpinsrw $4, %edx, %xmm4, %xmm4 +; AVX512-NEXT: vpextrw $5, %xmm2, %ecx +; AVX512-NEXT: vpextrw $5, %xmm3, %edx +; AVX512-NEXT: subw %cx, %dx +; AVX512-NEXT: cmovbl %eax, %edx +; AVX512-NEXT: vpinsrw $5, %edx, %xmm4, %xmm4 +; AVX512-NEXT: vpextrw $6, %xmm2, %ecx +; AVX512-NEXT: vpextrw $6, %xmm3, %edx +; AVX512-NEXT: subw %cx, %dx +; AVX512-NEXT: cmovbl %eax, %edx +; AVX512-NEXT: vpinsrw $6, %edx, %xmm4, %xmm4 +; AVX512-NEXT: vpextrw $7, %xmm2, %ecx +; AVX512-NEXT: vpextrw $7, %xmm3, %edx +; AVX512-NEXT: subw %cx, %dx +; AVX512-NEXT: cmovbl %eax, %edx +; AVX512-NEXT: vpinsrw $7, %edx, %xmm4, %xmm2 +; AVX512-NEXT: vextracti32x4 $2, %zmm1, %xmm3 +; AVX512-NEXT: vpextrw $1, %xmm3, %ecx +; AVX512-NEXT: vextracti32x4 $2, %zmm0, %xmm4 +; AVX512-NEXT: vpextrw $1, %xmm4, %edx +; AVX512-NEXT: subw %cx, %dx +; AVX512-NEXT: cmovbl %eax, %edx +; AVX512-NEXT: vmovd %xmm3, %ecx +; AVX512-NEXT: vmovd %xmm4, %esi +; AVX512-NEXT: subw %cx, %si +; AVX512-NEXT: cmovbl %eax, %esi +; AVX512-NEXT: vmovd %esi, %xmm5 +; AVX512-NEXT: vpinsrw $1, %edx, %xmm5, %xmm5 +; AVX512-NEXT: vpextrw $2, %xmm3, %ecx +; AVX512-NEXT: vpextrw $2, %xmm4, %edx +; AVX512-NEXT: subw %cx, %dx +; AVX512-NEXT: cmovbl %eax, %edx +; AVX512-NEXT: vpinsrw $2, %edx, %xmm5, %xmm5 +; AVX512-NEXT: vpextrw $3, %xmm3, %ecx +; AVX512-NEXT: vpextrw $3, %xmm4, %edx +; AVX512-NEXT: subw %cx, %dx +; AVX512-NEXT: cmovbl %eax, %edx +; AVX512-NEXT: vpinsrw $3, %edx, %xmm5, %xmm5 +; AVX512-NEXT: vpextrw $4, %xmm3, %ecx +; AVX512-NEXT: vpextrw $4, %xmm4, %edx +; AVX512-NEXT: subw %cx, %dx +; AVX512-NEXT: cmovbl %eax, %edx +; AVX512-NEXT: vpinsrw $4, %edx, %xmm5, %xmm5 +; AVX512-NEXT: vpextrw $5, %xmm3, %ecx +; 
AVX512-NEXT: vpextrw $5, %xmm4, %edx +; AVX512-NEXT: subw %cx, %dx +; AVX512-NEXT: cmovbl %eax, %edx +; AVX512-NEXT: vpinsrw $5, %edx, %xmm5, %xmm5 +; AVX512-NEXT: vpextrw $6, %xmm3, %ecx +; AVX512-NEXT: vpextrw $6, %xmm4, %edx +; AVX512-NEXT: subw %cx, %dx +; AVX512-NEXT: cmovbl %eax, %edx +; AVX512-NEXT: vpinsrw $6, %edx, %xmm5, %xmm5 +; AVX512-NEXT: vpextrw $7, %xmm3, %ecx +; AVX512-NEXT: vpextrw $7, %xmm4, %edx +; AVX512-NEXT: subw %cx, %dx +; AVX512-NEXT: cmovbl %eax, %edx +; AVX512-NEXT: vpinsrw $7, %edx, %xmm5, %xmm3 +; AVX512-NEXT: vinserti128 $1, %xmm2, %ymm3, %ymm2 +; AVX512-NEXT: vextracti128 $1, %ymm1, %xmm3 +; AVX512-NEXT: vpextrw $1, %xmm3, %ecx +; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm4 +; AVX512-NEXT: vpextrw $1, %xmm4, %edx +; AVX512-NEXT: subw %cx, %dx +; AVX512-NEXT: cmovbl %eax, %edx +; AVX512-NEXT: vmovd %xmm3, %ecx +; AVX512-NEXT: vmovd %xmm4, %esi +; AVX512-NEXT: subw %cx, %si +; AVX512-NEXT: cmovbl %eax, %esi +; AVX512-NEXT: vmovd %esi, %xmm5 +; AVX512-NEXT: vpinsrw $1, %edx, %xmm5, %xmm5 +; AVX512-NEXT: vpextrw $2, %xmm3, %ecx +; AVX512-NEXT: vpextrw $2, %xmm4, %edx +; AVX512-NEXT: subw %cx, %dx +; AVX512-NEXT: cmovbl %eax, %edx +; AVX512-NEXT: vpinsrw $2, %edx, %xmm5, %xmm5 +; AVX512-NEXT: vpextrw $3, %xmm3, %ecx +; AVX512-NEXT: vpextrw $3, %xmm4, %edx +; AVX512-NEXT: subw %cx, %dx +; AVX512-NEXT: cmovbl %eax, %edx +; AVX512-NEXT: vpinsrw $3, %edx, %xmm5, %xmm5 +; AVX512-NEXT: vpextrw $4, %xmm3, %ecx +; AVX512-NEXT: vpextrw $4, %xmm4, %edx +; AVX512-NEXT: subw %cx, %dx +; AVX512-NEXT: cmovbl %eax, %edx +; AVX512-NEXT: vpinsrw $4, %edx, %xmm5, %xmm5 +; AVX512-NEXT: vpextrw $5, %xmm3, %ecx +; AVX512-NEXT: vpextrw $5, %xmm4, %edx +; AVX512-NEXT: subw %cx, %dx +; AVX512-NEXT: cmovbl %eax, %edx +; AVX512-NEXT: vpinsrw $5, %edx, %xmm5, %xmm5 +; AVX512-NEXT: vpextrw $6, %xmm3, %ecx +; AVX512-NEXT: vpextrw $6, %xmm4, %edx +; AVX512-NEXT: subw %cx, %dx +; AVX512-NEXT: cmovbl %eax, %edx +; AVX512-NEXT: vpinsrw $6, %edx, %xmm5, %xmm5 +; AVX512-NEXT: vpextrw $7, %xmm3, %ecx +; AVX512-NEXT: vpextrw $7, %xmm4, %edx +; AVX512-NEXT: subw %cx, %dx +; AVX512-NEXT: cmovbl %eax, %edx +; AVX512-NEXT: vpinsrw $7, %edx, %xmm5, %xmm3 +; AVX512-NEXT: vpextrw $1, %xmm1, %ecx +; AVX512-NEXT: vpextrw $1, %xmm0, %edx +; AVX512-NEXT: subw %cx, %dx +; AVX512-NEXT: cmovbl %eax, %edx +; AVX512-NEXT: vmovd %xmm1, %ecx +; AVX512-NEXT: vmovd %xmm0, %esi +; AVX512-NEXT: subw %cx, %si +; AVX512-NEXT: cmovbl %eax, %esi +; AVX512-NEXT: vmovd %esi, %xmm4 +; AVX512-NEXT: vpinsrw $1, %edx, %xmm4, %xmm4 +; AVX512-NEXT: vpextrw $2, %xmm1, %ecx +; AVX512-NEXT: vpextrw $2, %xmm0, %edx +; AVX512-NEXT: subw %cx, %dx +; AVX512-NEXT: cmovbl %eax, %edx +; AVX512-NEXT: vpinsrw $2, %edx, %xmm4, %xmm4 +; AVX512-NEXT: vpextrw $3, %xmm1, %ecx +; AVX512-NEXT: vpextrw $3, %xmm0, %edx +; AVX512-NEXT: subw %cx, %dx +; AVX512-NEXT: cmovbl %eax, %edx +; AVX512-NEXT: vpinsrw $3, %edx, %xmm4, %xmm4 +; AVX512-NEXT: vpextrw $4, %xmm1, %ecx +; AVX512-NEXT: vpextrw $4, %xmm0, %edx +; AVX512-NEXT: subw %cx, %dx +; AVX512-NEXT: cmovbl %eax, %edx +; AVX512-NEXT: vpinsrw $4, %edx, %xmm4, %xmm4 +; AVX512-NEXT: vpextrw $5, %xmm1, %ecx +; AVX512-NEXT: vpextrw $5, %xmm0, %edx +; AVX512-NEXT: subw %cx, %dx +; AVX512-NEXT: cmovbl %eax, %edx +; AVX512-NEXT: vpinsrw $5, %edx, %xmm4, %xmm4 +; AVX512-NEXT: vpextrw $6, %xmm1, %ecx +; AVX512-NEXT: vpextrw $6, %xmm0, %edx +; AVX512-NEXT: subw %cx, %dx +; AVX512-NEXT: cmovbl %eax, %edx +; AVX512-NEXT: vpinsrw $6, %edx, %xmm4, %xmm4 +; AVX512-NEXT: vpextrw $7, %xmm1, %ecx +; AVX512-NEXT: vpextrw 
$7, %xmm0, %edx +; AVX512-NEXT: subw %cx, %dx +; AVX512-NEXT: cmovbl %eax, %edx +; AVX512-NEXT: vpinsrw $7, %edx, %xmm4, %xmm0 +; AVX512-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm0 +; AVX512-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 +; AVX512-NEXT: retq + %z = call <32 x i16> @llvm.usub.sat.v32i16(<32 x i16> %x, <32 x i16> %y) + ret <32 x i16> %z +} + +; Too narrow vectors, legalized by widening. + +define void @v8i8(<8 x i8>* %px, <8 x i8>* %py, <8 x i8>* %pz) nounwind { +; SSE2-LABEL: v8i8: +; SSE2: # %bb.0: +; SSE2-NEXT: movq {{.*#+}} xmm2 = mem[0],zero +; SSE2-NEXT: movq {{.*#+}} xmm3 = mem[0],zero +; SSE2-NEXT: pxor %xmm0, %xmm0 +; SSE2-NEXT: pxor %xmm1, %xmm1 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3],xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7] +; SSE2-NEXT: pextrw $7, %xmm1, %ecx +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] +; SSE2-NEXT: pextrw $7, %xmm0, %esi +; SSE2-NEXT: xorl %eax, %eax +; SSE2-NEXT: subw %cx, %si +; SSE2-NEXT: cmovbl %eax, %esi +; SSE2-NEXT: movd %esi, %xmm2 +; SSE2-NEXT: pextrw $6, %xmm1, %ecx +; SSE2-NEXT: pextrw $6, %xmm0, %esi +; SSE2-NEXT: subw %cx, %si +; SSE2-NEXT: cmovbl %eax, %esi +; SSE2-NEXT: movd %esi, %xmm3 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; SSE2-NEXT: pextrw $5, %xmm1, %ecx +; SSE2-NEXT: pextrw $5, %xmm0, %esi +; SSE2-NEXT: subw %cx, %si +; SSE2-NEXT: cmovbl %eax, %esi +; SSE2-NEXT: movd %esi, %xmm4 +; SSE2-NEXT: pextrw $4, %xmm1, %ecx +; SSE2-NEXT: pextrw $4, %xmm0, %esi +; SSE2-NEXT: subw %cx, %si +; SSE2-NEXT: cmovbl %eax, %esi +; SSE2-NEXT: movd %esi, %xmm2 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] +; SSE2-NEXT: pextrw $3, %xmm1, %ecx +; SSE2-NEXT: pextrw $3, %xmm0, %esi +; SSE2-NEXT: subw %cx, %si +; SSE2-NEXT: cmovbl %eax, %esi +; SSE2-NEXT: movd %esi, %xmm3 +; SSE2-NEXT: pextrw $2, %xmm1, %ecx +; SSE2-NEXT: pextrw $2, %xmm0, %esi +; SSE2-NEXT: subw %cx, %si +; SSE2-NEXT: cmovbl %eax, %esi +; SSE2-NEXT: movd %esi, %xmm4 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] +; SSE2-NEXT: pextrw $1, %xmm1, %ecx +; SSE2-NEXT: pextrw $1, %xmm0, %esi +; SSE2-NEXT: subw %cx, %si +; SSE2-NEXT: cmovbl %eax, %esi +; SSE2-NEXT: movd %esi, %xmm3 +; SSE2-NEXT: movd %xmm1, %ecx +; SSE2-NEXT: movd %xmm0, %esi +; SSE2-NEXT: subw %cx, %si +; SSE2-NEXT: cmovbl %eax, %esi +; SSE2-NEXT: movd %esi, %xmm0 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; SSE2-NEXT: psrlw $8, %xmm0 +; SSE2-NEXT: packuswb %xmm0, %xmm0 +; SSE2-NEXT: movq %xmm0, (%rdx) +; SSE2-NEXT: retq +; +; SSSE3-LABEL: v8i8: +; SSSE3: # %bb.0: +; SSSE3-NEXT: movq {{.*#+}} xmm2 = mem[0],zero +; SSSE3-NEXT: movq {{.*#+}} xmm3 = mem[0],zero +; SSSE3-NEXT: pxor %xmm0, %xmm0 +; SSSE3-NEXT: pxor %xmm1, %xmm1 +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3],xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7] +; SSSE3-NEXT: pextrw $7, %xmm1, %ecx +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = 
xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] +; SSSE3-NEXT: pextrw $7, %xmm0, %esi +; SSSE3-NEXT: xorl %eax, %eax +; SSSE3-NEXT: subw %cx, %si +; SSSE3-NEXT: cmovbl %eax, %esi +; SSSE3-NEXT: movd %esi, %xmm2 +; SSSE3-NEXT: pextrw $6, %xmm1, %ecx +; SSSE3-NEXT: pextrw $6, %xmm0, %esi +; SSSE3-NEXT: subw %cx, %si +; SSSE3-NEXT: cmovbl %eax, %esi +; SSSE3-NEXT: movd %esi, %xmm3 +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; SSSE3-NEXT: pextrw $5, %xmm1, %ecx +; SSSE3-NEXT: pextrw $5, %xmm0, %esi +; SSSE3-NEXT: subw %cx, %si +; SSSE3-NEXT: cmovbl %eax, %esi +; SSSE3-NEXT: movd %esi, %xmm4 +; SSSE3-NEXT: pextrw $4, %xmm1, %ecx +; SSSE3-NEXT: pextrw $4, %xmm0, %esi +; SSSE3-NEXT: subw %cx, %si +; SSSE3-NEXT: cmovbl %eax, %esi +; SSSE3-NEXT: movd %esi, %xmm2 +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3] +; SSSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] +; SSSE3-NEXT: pextrw $3, %xmm1, %ecx +; SSSE3-NEXT: pextrw $3, %xmm0, %esi +; SSSE3-NEXT: subw %cx, %si +; SSSE3-NEXT: cmovbl %eax, %esi +; SSSE3-NEXT: movd %esi, %xmm3 +; SSSE3-NEXT: pextrw $2, %xmm1, %ecx +; SSSE3-NEXT: pextrw $2, %xmm0, %esi +; SSSE3-NEXT: subw %cx, %si +; SSSE3-NEXT: cmovbl %eax, %esi +; SSSE3-NEXT: movd %esi, %xmm4 +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] +; SSSE3-NEXT: pextrw $1, %xmm1, %ecx +; SSSE3-NEXT: pextrw $1, %xmm0, %esi +; SSSE3-NEXT: subw %cx, %si +; SSSE3-NEXT: cmovbl %eax, %esi +; SSSE3-NEXT: movd %esi, %xmm3 +; SSSE3-NEXT: movd %xmm1, %ecx +; SSSE3-NEXT: movd %xmm0, %esi +; SSSE3-NEXT: subw %cx, %si +; SSSE3-NEXT: cmovbl %eax, %esi +; SSSE3-NEXT: movd %esi, %xmm0 +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] +; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] +; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; SSSE3-NEXT: psrlw $8, %xmm0 +; SSSE3-NEXT: packuswb %xmm0, %xmm0 +; SSSE3-NEXT: movq %xmm0, (%rdx) +; SSSE3-NEXT: retq +; +; SSE41-LABEL: v8i8: +; SSE41: # %bb.0: +; SSE41-NEXT: movq {{.*#+}} xmm2 = mem[0],zero +; SSE41-NEXT: movq {{.*#+}} xmm3 = mem[0],zero +; SSE41-NEXT: pxor %xmm0, %xmm0 +; SSE41-NEXT: pxor %xmm1, %xmm1 +; SSE41-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3],xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7] +; SSE41-NEXT: pextrw $1, %xmm1, %ecx +; SSE41-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] +; SSE41-NEXT: pextrw $1, %xmm0, %esi +; SSE41-NEXT: xorl %eax, %eax +; SSE41-NEXT: subw %cx, %si +; SSE41-NEXT: cmovbl %eax, %esi +; SSE41-NEXT: movd %xmm1, %ecx +; SSE41-NEXT: movd %xmm0, %edi +; SSE41-NEXT: subw %cx, %di +; SSE41-NEXT: cmovbl %eax, %edi +; SSE41-NEXT: movd %edi, %xmm2 +; SSE41-NEXT: pinsrw $1, %esi, %xmm2 +; SSE41-NEXT: pextrw $2, %xmm1, %ecx +; SSE41-NEXT: pextrw $2, %xmm0, %esi +; SSE41-NEXT: subw %cx, %si +; SSE41-NEXT: cmovbl %eax, %esi +; SSE41-NEXT: pinsrw $2, %esi, %xmm2 +; SSE41-NEXT: pextrw $3, %xmm1, %ecx +; SSE41-NEXT: pextrw $3, %xmm0, %esi +; SSE41-NEXT: subw %cx, %si +; SSE41-NEXT: cmovbl %eax, %esi +; SSE41-NEXT: pinsrw $3, %esi, %xmm2 +; SSE41-NEXT: pextrw $4, %xmm1, %ecx +; SSE41-NEXT: pextrw $4, %xmm0, %esi +; 
SSE41-NEXT: subw %cx, %si +; SSE41-NEXT: cmovbl %eax, %esi +; SSE41-NEXT: pinsrw $4, %esi, %xmm2 +; SSE41-NEXT: pextrw $5, %xmm1, %ecx +; SSE41-NEXT: pextrw $5, %xmm0, %esi +; SSE41-NEXT: subw %cx, %si +; SSE41-NEXT: cmovbl %eax, %esi +; SSE41-NEXT: pinsrw $5, %esi, %xmm2 +; SSE41-NEXT: pextrw $6, %xmm1, %ecx +; SSE41-NEXT: pextrw $6, %xmm0, %esi +; SSE41-NEXT: subw %cx, %si +; SSE41-NEXT: cmovbl %eax, %esi +; SSE41-NEXT: pinsrw $6, %esi, %xmm2 +; SSE41-NEXT: pextrw $7, %xmm1, %ecx +; SSE41-NEXT: pextrw $7, %xmm0, %esi +; SSE41-NEXT: subw %cx, %si +; SSE41-NEXT: cmovbl %eax, %esi +; SSE41-NEXT: pinsrw $7, %esi, %xmm2 +; SSE41-NEXT: psrlw $8, %xmm2 +; SSE41-NEXT: packuswb %xmm0, %xmm2 +; SSE41-NEXT: movq %xmm2, (%rdx) +; SSE41-NEXT: retq +; +; AVX1-LABEL: v8i8: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero +; AVX1-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] +; AVX1-NEXT: vpextrw $1, %xmm0, %ecx +; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] +; AVX1-NEXT: vpextrw $1, %xmm1, %esi +; AVX1-NEXT: xorl %eax, %eax +; AVX1-NEXT: subw %cx, %si +; AVX1-NEXT: cmovbl %eax, %esi +; AVX1-NEXT: vmovd %xmm0, %ecx +; AVX1-NEXT: vmovd %xmm1, %edi +; AVX1-NEXT: subw %cx, %di +; AVX1-NEXT: cmovbl %eax, %edi +; AVX1-NEXT: vmovd %edi, %xmm2 +; AVX1-NEXT: vpinsrw $1, %esi, %xmm2, %xmm2 +; AVX1-NEXT: vpextrw $2, %xmm0, %ecx +; AVX1-NEXT: vpextrw $2, %xmm1, %esi +; AVX1-NEXT: subw %cx, %si +; AVX1-NEXT: cmovbl %eax, %esi +; AVX1-NEXT: vpinsrw $2, %esi, %xmm2, %xmm2 +; AVX1-NEXT: vpextrw $3, %xmm0, %ecx +; AVX1-NEXT: vpextrw $3, %xmm1, %esi +; AVX1-NEXT: subw %cx, %si +; AVX1-NEXT: cmovbl %eax, %esi +; AVX1-NEXT: vpinsrw $3, %esi, %xmm2, %xmm2 +; AVX1-NEXT: vpextrw $4, %xmm0, %ecx +; AVX1-NEXT: vpextrw $4, %xmm1, %esi +; AVX1-NEXT: subw %cx, %si +; AVX1-NEXT: cmovbl %eax, %esi +; AVX1-NEXT: vpinsrw $4, %esi, %xmm2, %xmm2 +; AVX1-NEXT: vpextrw $5, %xmm0, %ecx +; AVX1-NEXT: vpextrw $5, %xmm1, %esi +; AVX1-NEXT: subw %cx, %si +; AVX1-NEXT: cmovbl %eax, %esi +; AVX1-NEXT: vpinsrw $5, %esi, %xmm2, %xmm2 +; AVX1-NEXT: vpextrw $6, %xmm0, %ecx +; AVX1-NEXT: vpextrw $6, %xmm1, %esi +; AVX1-NEXT: subw %cx, %si +; AVX1-NEXT: cmovbl %eax, %esi +; AVX1-NEXT: vpinsrw $6, %esi, %xmm2, %xmm2 +; AVX1-NEXT: vpextrw $7, %xmm0, %ecx +; AVX1-NEXT: vpextrw $7, %xmm1, %esi +; AVX1-NEXT: subw %cx, %si +; AVX1-NEXT: cmovbl %eax, %esi +; AVX1-NEXT: vpinsrw $7, %esi, %xmm2, %xmm0 +; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm0 +; AVX1-NEXT: vpackuswb %xmm0, %xmm0, %xmm0 +; AVX1-NEXT: vmovq %xmm0, (%rdx) +; AVX1-NEXT: retq +; +; AVX2-LABEL: v8i8: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero +; AVX2-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] +; AVX2-NEXT: vpextrw $1, %xmm0, %ecx +; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] +; AVX2-NEXT: vpextrw $1, %xmm1, %esi +; AVX2-NEXT: xorl %eax, %eax +; AVX2-NEXT: subw %cx, %si +; AVX2-NEXT: cmovbl %eax, %esi +; AVX2-NEXT: vmovd 
%xmm0, %ecx +; AVX2-NEXT: vmovd %xmm1, %edi +; AVX2-NEXT: subw %cx, %di +; AVX2-NEXT: cmovbl %eax, %edi +; AVX2-NEXT: vmovd %edi, %xmm2 +; AVX2-NEXT: vpinsrw $1, %esi, %xmm2, %xmm2 +; AVX2-NEXT: vpextrw $2, %xmm0, %ecx +; AVX2-NEXT: vpextrw $2, %xmm1, %esi +; AVX2-NEXT: subw %cx, %si +; AVX2-NEXT: cmovbl %eax, %esi +; AVX2-NEXT: vpinsrw $2, %esi, %xmm2, %xmm2 +; AVX2-NEXT: vpextrw $3, %xmm0, %ecx +; AVX2-NEXT: vpextrw $3, %xmm1, %esi +; AVX2-NEXT: subw %cx, %si +; AVX2-NEXT: cmovbl %eax, %esi +; AVX2-NEXT: vpinsrw $3, %esi, %xmm2, %xmm2 +; AVX2-NEXT: vpextrw $4, %xmm0, %ecx +; AVX2-NEXT: vpextrw $4, %xmm1, %esi +; AVX2-NEXT: subw %cx, %si +; AVX2-NEXT: cmovbl %eax, %esi +; AVX2-NEXT: vpinsrw $4, %esi, %xmm2, %xmm2 +; AVX2-NEXT: vpextrw $5, %xmm0, %ecx +; AVX2-NEXT: vpextrw $5, %xmm1, %esi +; AVX2-NEXT: subw %cx, %si +; AVX2-NEXT: cmovbl %eax, %esi +; AVX2-NEXT: vpinsrw $5, %esi, %xmm2, %xmm2 +; AVX2-NEXT: vpextrw $6, %xmm0, %ecx +; AVX2-NEXT: vpextrw $6, %xmm1, %esi +; AVX2-NEXT: subw %cx, %si +; AVX2-NEXT: cmovbl %eax, %esi +; AVX2-NEXT: vpinsrw $6, %esi, %xmm2, %xmm2 +; AVX2-NEXT: vpextrw $7, %xmm0, %ecx +; AVX2-NEXT: vpextrw $7, %xmm1, %esi +; AVX2-NEXT: subw %cx, %si +; AVX2-NEXT: cmovbl %eax, %esi +; AVX2-NEXT: vpinsrw $7, %esi, %xmm2, %xmm0 +; AVX2-NEXT: vpsrlw $8, %xmm0, %xmm0 +; AVX2-NEXT: vpackuswb %xmm0, %xmm0, %xmm0 +; AVX2-NEXT: vmovq %xmm0, (%rdx) +; AVX2-NEXT: retq +; +; AVX512-LABEL: v8i8: +; AVX512: # %bb.0: +; AVX512-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero +; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; AVX512-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] +; AVX512-NEXT: vpextrw $1, %xmm0, %ecx +; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] +; AVX512-NEXT: vpextrw $1, %xmm1, %esi +; AVX512-NEXT: xorl %eax, %eax +; AVX512-NEXT: subw %cx, %si +; AVX512-NEXT: cmovbl %eax, %esi +; AVX512-NEXT: vmovd %xmm0, %ecx +; AVX512-NEXT: vmovd %xmm1, %edi +; AVX512-NEXT: subw %cx, %di +; AVX512-NEXT: cmovbl %eax, %edi +; AVX512-NEXT: vmovd %edi, %xmm2 +; AVX512-NEXT: vpinsrw $1, %esi, %xmm2, %xmm2 +; AVX512-NEXT: vpextrw $2, %xmm0, %ecx +; AVX512-NEXT: vpextrw $2, %xmm1, %esi +; AVX512-NEXT: subw %cx, %si +; AVX512-NEXT: cmovbl %eax, %esi +; AVX512-NEXT: vpinsrw $2, %esi, %xmm2, %xmm2 +; AVX512-NEXT: vpextrw $3, %xmm0, %ecx +; AVX512-NEXT: vpextrw $3, %xmm1, %esi +; AVX512-NEXT: subw %cx, %si +; AVX512-NEXT: cmovbl %eax, %esi +; AVX512-NEXT: vpinsrw $3, %esi, %xmm2, %xmm2 +; AVX512-NEXT: vpextrw $4, %xmm0, %ecx +; AVX512-NEXT: vpextrw $4, %xmm1, %esi +; AVX512-NEXT: subw %cx, %si +; AVX512-NEXT: cmovbl %eax, %esi +; AVX512-NEXT: vpinsrw $4, %esi, %xmm2, %xmm2 +; AVX512-NEXT: vpextrw $5, %xmm0, %ecx +; AVX512-NEXT: vpextrw $5, %xmm1, %esi +; AVX512-NEXT: subw %cx, %si +; AVX512-NEXT: cmovbl %eax, %esi +; AVX512-NEXT: vpinsrw $5, %esi, %xmm2, %xmm2 +; AVX512-NEXT: vpextrw $6, %xmm0, %ecx +; AVX512-NEXT: vpextrw $6, %xmm1, %esi +; AVX512-NEXT: subw %cx, %si +; AVX512-NEXT: cmovbl %eax, %esi +; AVX512-NEXT: vpinsrw $6, %esi, %xmm2, %xmm2 +; AVX512-NEXT: vpextrw $7, %xmm0, %ecx +; AVX512-NEXT: vpextrw $7, %xmm1, %esi +; AVX512-NEXT: subw %cx, %si +; AVX512-NEXT: cmovbl %eax, %esi +; AVX512-NEXT: vpinsrw $7, %esi, %xmm2, %xmm0 +; AVX512-NEXT: vpsrlw $8, %xmm0, %xmm0 +; AVX512-NEXT: vpmovwb %xmm0, (%rdx) +; 
AVX512-NEXT: retq + %x = load <8 x i8>, <8 x i8>* %px + %y = load <8 x i8>, <8 x i8>* %py + %z = call <8 x i8> @llvm.usub.sat.v8i8(<8 x i8> %x, <8 x i8> %y) + store <8 x i8> %z, <8 x i8>* %pz + ret void +} + +define void @v4i8(<4 x i8>* %px, <4 x i8>* %py, <4 x i8>* %pz) nounwind { +; SSE2-LABEL: v4i8: +; SSE2: # %bb.0: +; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] +; SSE2-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; SSE2-NEXT: pslld $24, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[3,1,2,3] +; SSE2-NEXT: movd %xmm2, %eax +; SSE2-NEXT: pslld $24, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[3,1,2,3] +; SSE2-NEXT: movd %xmm2, %ecx +; SSE2-NEXT: xorl %esi, %esi +; SSE2-NEXT: subl %eax, %ecx +; SSE2-NEXT: cmovbl %esi, %ecx +; SSE2-NEXT: movd %ecx, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,3,0,1] +; SSE2-NEXT: movd %xmm3, %eax +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,0,1] +; SSE2-NEXT: movd %xmm3, %ecx +; SSE2-NEXT: subl %eax, %ecx +; SSE2-NEXT: cmovbl %esi, %ecx +; SSE2-NEXT: movd %ecx, %xmm3 +; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; SSE2-NEXT: movd %xmm1, %eax +; SSE2-NEXT: movd %xmm0, %ecx +; SSE2-NEXT: subl %eax, %ecx +; SSE2-NEXT: cmovbl %esi, %ecx +; SSE2-NEXT: movd %ecx, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,2,3] +; SSE2-NEXT: movd %xmm1, %eax +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] +; SSE2-NEXT: movd %xmm0, %ecx +; SSE2-NEXT: subl %eax, %ecx +; SSE2-NEXT: cmovbl %esi, %ecx +; SSE2-NEXT: movd %ecx, %xmm0 +; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0] +; SSE2-NEXT: psrld $24, %xmm2 +; SSE2-NEXT: packuswb %xmm2, %xmm2 +; SSE2-NEXT: packuswb %xmm2, %xmm2 +; SSE2-NEXT: movd %xmm2, (%rdx) +; SSE2-NEXT: retq +; +; SSSE3-LABEL: v4i8: +; SSSE3: # %bb.0: +; SSSE3-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; SSSE3-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,0,255,255,255,1,255,255,255,2,255,255,255,3] +; SSSE3-NEXT: pshufb %xmm2, %xmm1 +; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm1[3,1,2,3] +; SSSE3-NEXT: movd %xmm3, %eax +; SSSE3-NEXT: pshufb %xmm2, %xmm0 +; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm0[3,1,2,3] +; SSSE3-NEXT: movd %xmm2, %ecx +; SSSE3-NEXT: xorl %esi, %esi +; SSSE3-NEXT: subl %eax, %ecx +; SSSE3-NEXT: cmovbl %esi, %ecx +; SSSE3-NEXT: movd %ecx, %xmm2 +; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,3,0,1] +; SSSE3-NEXT: movd %xmm3, %eax +; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,0,1] +; SSSE3-NEXT: movd %xmm3, %ecx +; SSSE3-NEXT: subl %eax, %ecx +; SSSE3-NEXT: cmovbl %esi, %ecx +; SSSE3-NEXT: movd %ecx, %xmm3 +; SSSE3-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; SSSE3-NEXT: movd %xmm1, %eax +; SSSE3-NEXT: movd %xmm0, %ecx +; SSSE3-NEXT: subl %eax, %ecx +; SSSE3-NEXT: cmovbl %esi, %ecx +; SSSE3-NEXT: movd %ecx, %xmm2 +; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,2,3] +; SSSE3-NEXT: movd %xmm1, %eax +; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] +; SSSE3-NEXT: movd %xmm0, %ecx +; SSSE3-NEXT: subl %eax, %ecx +; SSSE3-NEXT: 
cmovbl %esi, %ecx +; SSSE3-NEXT: movd %ecx, %xmm0 +; SSSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] +; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0] +; SSSE3-NEXT: pshufb {{.*#+}} xmm2 = xmm2[3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u] +; SSSE3-NEXT: movd %xmm2, (%rdx) +; SSSE3-NEXT: retq +; +; SSE41-LABEL: v4i8: +; SSE41: # %bb.0: +; SSE41-NEXT: pmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; SSE41-NEXT: pmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; SSE41-NEXT: pslld $24, %xmm1 +; SSE41-NEXT: pextrd $1, %xmm1, %eax +; SSE41-NEXT: pslld $24, %xmm0 +; SSE41-NEXT: pextrd $1, %xmm0, %ecx +; SSE41-NEXT: xorl %esi, %esi +; SSE41-NEXT: subl %eax, %ecx +; SSE41-NEXT: cmovbl %esi, %ecx +; SSE41-NEXT: movd %xmm1, %eax +; SSE41-NEXT: movd %xmm0, %edi +; SSE41-NEXT: subl %eax, %edi +; SSE41-NEXT: cmovbl %esi, %edi +; SSE41-NEXT: movd %edi, %xmm2 +; SSE41-NEXT: pinsrd $1, %ecx, %xmm2 +; SSE41-NEXT: pextrd $2, %xmm1, %eax +; SSE41-NEXT: pextrd $2, %xmm0, %ecx +; SSE41-NEXT: subl %eax, %ecx +; SSE41-NEXT: cmovbl %esi, %ecx +; SSE41-NEXT: pinsrd $2, %ecx, %xmm2 +; SSE41-NEXT: pextrd $3, %xmm1, %eax +; SSE41-NEXT: pextrd $3, %xmm0, %ecx +; SSE41-NEXT: subl %eax, %ecx +; SSE41-NEXT: cmovbl %esi, %ecx +; SSE41-NEXT: pinsrd $3, %ecx, %xmm2 +; SSE41-NEXT: pshufb {{.*#+}} xmm2 = xmm2[3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u] +; SSE41-NEXT: movd %xmm2, (%rdx) +; SSE41-NEXT: retq +; +; AVX1-LABEL: v4i8: +; AVX1: # %bb.0: +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpslld $24, %xmm1, %xmm1 +; AVX1-NEXT: vpextrd $1, %xmm1, %eax +; AVX1-NEXT: vpslld $24, %xmm0, %xmm0 +; AVX1-NEXT: vpextrd $1, %xmm0, %ecx +; AVX1-NEXT: xorl %esi, %esi +; AVX1-NEXT: subl %eax, %ecx +; AVX1-NEXT: cmovbl %esi, %ecx +; AVX1-NEXT: vmovd %xmm1, %eax +; AVX1-NEXT: vmovd %xmm0, %edi +; AVX1-NEXT: subl %eax, %edi +; AVX1-NEXT: cmovbl %esi, %edi +; AVX1-NEXT: vmovd %edi, %xmm2 +; AVX1-NEXT: vpinsrd $1, %ecx, %xmm2, %xmm2 +; AVX1-NEXT: vpextrd $2, %xmm1, %eax +; AVX1-NEXT: vpextrd $2, %xmm0, %ecx +; AVX1-NEXT: subl %eax, %ecx +; AVX1-NEXT: cmovbl %esi, %ecx +; AVX1-NEXT: vpinsrd $2, %ecx, %xmm2, %xmm2 +; AVX1-NEXT: vpextrd $3, %xmm1, %eax +; AVX1-NEXT: vpextrd $3, %xmm0, %ecx +; AVX1-NEXT: subl %eax, %ecx +; AVX1-NEXT: cmovbl %esi, %ecx +; AVX1-NEXT: vpinsrd $3, %ecx, %xmm2, %xmm0 +; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX1-NEXT: vmovd %xmm0, (%rdx) +; AVX1-NEXT: retq +; +; AVX2-LABEL: v4i8: +; AVX2: # %bb.0: +; AVX2-NEXT: vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX2-NEXT: vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX2-NEXT: vpslld $24, %xmm1, %xmm1 +; AVX2-NEXT: vpextrd $1, %xmm1, %eax +; AVX2-NEXT: vpslld $24, %xmm0, %xmm0 +; AVX2-NEXT: vpextrd $1, %xmm0, %ecx +; AVX2-NEXT: xorl %esi, %esi +; AVX2-NEXT: subl %eax, %ecx +; AVX2-NEXT: cmovbl %esi, %ecx +; AVX2-NEXT: vmovd %xmm1, %eax +; AVX2-NEXT: vmovd %xmm0, %edi +; AVX2-NEXT: subl %eax, %edi +; AVX2-NEXT: cmovbl %esi, %edi +; AVX2-NEXT: vmovd %edi, %xmm2 +; AVX2-NEXT: vpinsrd $1, %ecx, %xmm2, %xmm2 +; AVX2-NEXT: vpextrd $2, 
%xmm1, %eax +; AVX2-NEXT: vpextrd $2, %xmm0, %ecx +; AVX2-NEXT: subl %eax, %ecx +; AVX2-NEXT: cmovbl %esi, %ecx +; AVX2-NEXT: vpinsrd $2, %ecx, %xmm2, %xmm2 +; AVX2-NEXT: vpextrd $3, %xmm1, %eax +; AVX2-NEXT: vpextrd $3, %xmm0, %ecx +; AVX2-NEXT: subl %eax, %ecx +; AVX2-NEXT: cmovbl %esi, %ecx +; AVX2-NEXT: vpinsrd $3, %ecx, %xmm2, %xmm0 +; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX2-NEXT: vmovd %xmm0, (%rdx) +; AVX2-NEXT: retq +; +; AVX512-LABEL: v4i8: +; AVX512: # %bb.0: +; AVX512-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX512-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; AVX512-NEXT: vmovdqa {{.*#+}} xmm2 = [255,255,255,0,255,255,255,1,255,255,255,2,255,255,255,3] +; AVX512-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX512-NEXT: vpextrd $1, %xmm1, %eax +; AVX512-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX512-NEXT: vpextrd $1, %xmm0, %ecx +; AVX512-NEXT: xorl %esi, %esi +; AVX512-NEXT: subl %eax, %ecx +; AVX512-NEXT: cmovbl %esi, %ecx +; AVX512-NEXT: vmovd %xmm1, %eax +; AVX512-NEXT: vmovd %xmm0, %edi +; AVX512-NEXT: subl %eax, %edi +; AVX512-NEXT: cmovbl %esi, %edi +; AVX512-NEXT: vmovd %edi, %xmm2 +; AVX512-NEXT: vpinsrd $1, %ecx, %xmm2, %xmm2 +; AVX512-NEXT: vpextrd $2, %xmm1, %eax +; AVX512-NEXT: vpextrd $2, %xmm0, %ecx +; AVX512-NEXT: subl %eax, %ecx +; AVX512-NEXT: cmovbl %esi, %ecx +; AVX512-NEXT: vpinsrd $2, %ecx, %xmm2, %xmm2 +; AVX512-NEXT: vpextrd $3, %xmm1, %eax +; AVX512-NEXT: vpextrd $3, %xmm0, %ecx +; AVX512-NEXT: subl %eax, %ecx +; AVX512-NEXT: cmovbl %esi, %ecx +; AVX512-NEXT: vpinsrd $3, %ecx, %xmm2, %xmm0 +; AVX512-NEXT: vpsrld $24, %xmm0, %xmm0 +; AVX512-NEXT: vpmovdb %xmm0, (%rdx) +; AVX512-NEXT: retq + %x = load <4 x i8>, <4 x i8>* %px + %y = load <4 x i8>, <4 x i8>* %py + %z = call <4 x i8> @llvm.usub.sat.v4i8(<4 x i8> %x, <4 x i8> %y) + store <4 x i8> %z, <4 x i8>* %pz + ret void +} + +define void @v2i8(<2 x i8>* %px, <2 x i8>* %py, <2 x i8>* %pz) nounwind { +; SSE2-LABEL: v2i8: +; SSE2: # %bb.0: +; SSE2-NEXT: movzwl (%rdi), %eax +; SSE2-NEXT: movd %eax, %xmm0 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3] +; SSE2-NEXT: movzwl (%rsi), %eax +; SSE2-NEXT: movd %eax, %xmm1 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,1,3] +; SSE2-NEXT: psllq $56, %xmm1 +; SSE2-NEXT: movq %xmm1, %rax +; SSE2-NEXT: psllq $56, %xmm0 +; SSE2-NEXT: movq %xmm0, %rcx +; SSE2-NEXT: xorl %esi, %esi +; SSE2-NEXT: subq %rax, %rcx +; SSE2-NEXT: cmovbq %rsi, %rcx +; SSE2-NEXT: movq %rcx, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] +; SSE2-NEXT: movq %xmm1, %rax +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; SSE2-NEXT: movq %xmm0, %rcx +; SSE2-NEXT: subq %rax, %rcx +; SSE2-NEXT: cmovbq %rsi, %rcx +; SSE2-NEXT: movq %rcx, %xmm0 +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm0[0] +; SSE2-NEXT: psrlq $56, %xmm2 +; SSE2-NEXT: packuswb %xmm2, %xmm2 +; SSE2-NEXT: packuswb %xmm2, %xmm2 +; SSE2-NEXT: packuswb %xmm2, %xmm2 +; SSE2-NEXT: movd %xmm2, %eax +; SSE2-NEXT: movw %ax, (%rdx) +; SSE2-NEXT: retq +; +; SSSE3-LABEL: v2i8: +; SSSE3: # %bb.0: +; SSSE3-NEXT: movzwl (%rdi), %eax +; SSSE3-NEXT: movd %eax, %xmm0 +; 
SSSE3-NEXT: movzwl (%rsi), %eax +; SSSE3-NEXT: movd %eax, %xmm1 +; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,1] +; SSSE3-NEXT: pshufb %xmm2, %xmm1 +; SSSE3-NEXT: movq %xmm1, %rax +; SSSE3-NEXT: pshufb %xmm2, %xmm0 +; SSSE3-NEXT: movq %xmm0, %rcx +; SSSE3-NEXT: xorl %esi, %esi +; SSSE3-NEXT: subq %rax, %rcx +; SSSE3-NEXT: cmovbq %rsi, %rcx +; SSSE3-NEXT: movq %rcx, %xmm2 +; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] +; SSSE3-NEXT: movq %xmm1, %rax +; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; SSSE3-NEXT: movq %xmm0, %rcx +; SSSE3-NEXT: subq %rax, %rcx +; SSSE3-NEXT: cmovbq %rsi, %rcx +; SSSE3-NEXT: movq %rcx, %xmm0 +; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm0[0] +; SSSE3-NEXT: pshufb {{.*#+}} xmm2 = xmm2[7,15,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; SSSE3-NEXT: movd %xmm2, %eax +; SSSE3-NEXT: movw %ax, (%rdx) +; SSSE3-NEXT: retq +; +; SSE41-LABEL: v2i8: +; SSE41: # %bb.0: +; SSE41-NEXT: pmovzxbq {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero +; SSE41-NEXT: pmovzxbq {{.*#+}} xmm1 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero +; SSE41-NEXT: psllq $56, %xmm1 +; SSE41-NEXT: pextrq $1, %xmm1, %rax +; SSE41-NEXT: psllq $56, %xmm0 +; SSE41-NEXT: pextrq $1, %xmm0, %rcx +; SSE41-NEXT: xorl %esi, %esi +; SSE41-NEXT: subq %rax, %rcx +; SSE41-NEXT: cmovbq %rsi, %rcx +; SSE41-NEXT: movq %rcx, %xmm2 +; SSE41-NEXT: movq %xmm1, %rax +; SSE41-NEXT: movq %xmm0, %rcx +; SSE41-NEXT: subq %rax, %rcx +; SSE41-NEXT: cmovbq %rsi, %rcx +; SSE41-NEXT: movq %rcx, %xmm0 +; SSE41-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[7,15,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; SSE41-NEXT: pextrw $0, %xmm0, (%rdx) +; SSE41-NEXT: retq +; +; AVX1-LABEL: v2i8: +; AVX1: # %bb.0: +; AVX1-NEXT: vpmovzxbq {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero +; AVX1-NEXT: vpmovzxbq {{.*#+}} xmm1 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero +; AVX1-NEXT: vpsllq $56, %xmm1, %xmm1 +; AVX1-NEXT: vpextrq $1, %xmm1, %rax +; AVX1-NEXT: vpsllq $56, %xmm0, %xmm0 +; AVX1-NEXT: vpextrq $1, %xmm0, %rcx +; AVX1-NEXT: xorl %esi, %esi +; AVX1-NEXT: subq %rax, %rcx +; AVX1-NEXT: cmovbq %rsi, %rcx +; AVX1-NEXT: vmovq %rcx, %xmm2 +; AVX1-NEXT: vmovq %xmm1, %rax +; AVX1-NEXT: vmovq %xmm0, %rcx +; AVX1-NEXT: subq %rax, %rcx +; AVX1-NEXT: cmovbq %rsi, %rcx +; AVX1-NEXT: vmovq %rcx, %xmm0 +; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[7,15,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX1-NEXT: vpextrw $0, %xmm0, (%rdx) +; AVX1-NEXT: retq +; +; AVX2-LABEL: v2i8: +; AVX2: # %bb.0: +; AVX2-NEXT: vpmovzxbq {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero +; AVX2-NEXT: vpmovzxbq {{.*#+}} xmm1 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero +; AVX2-NEXT: vpsllq $56, %xmm1, %xmm1 +; AVX2-NEXT: vpextrq $1, %xmm1, %rax +; AVX2-NEXT: vpsllq $56, %xmm0, %xmm0 +; AVX2-NEXT: vpextrq $1, %xmm0, %rcx +; AVX2-NEXT: xorl %esi, %esi +; AVX2-NEXT: subq %rax, %rcx +; AVX2-NEXT: cmovbq %rsi, %rcx +; AVX2-NEXT: vmovq %rcx, %xmm2 +; AVX2-NEXT: vmovq %xmm1, %rax +; AVX2-NEXT: vmovq %xmm0, %rcx +; AVX2-NEXT: subq %rax, %rcx +; AVX2-NEXT: cmovbq %rsi, %rcx +; AVX2-NEXT: vmovq %rcx, %xmm0 +; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; AVX2-NEXT: vpshufb 
{{.*#+}} xmm0 = xmm0[7,15,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX2-NEXT: vpextrw $0, %xmm0, (%rdx) +; AVX2-NEXT: retq +; +; AVX512-LABEL: v2i8: +; AVX512: # %bb.0: +; AVX512-NEXT: movzwl (%rdi), %eax +; AVX512-NEXT: vmovd %eax, %xmm0 +; AVX512-NEXT: movzwl (%rsi), %eax +; AVX512-NEXT: vmovd %eax, %xmm1 +; AVX512-NEXT: vmovdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,1] +; AVX512-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX512-NEXT: vpextrq $1, %xmm1, %rax +; AVX512-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX512-NEXT: vpextrq $1, %xmm0, %rcx +; AVX512-NEXT: xorl %esi, %esi +; AVX512-NEXT: subq %rax, %rcx +; AVX512-NEXT: cmovbq %rsi, %rcx +; AVX512-NEXT: vmovq %rcx, %xmm2 +; AVX512-NEXT: vmovq %xmm1, %rax +; AVX512-NEXT: vmovq %xmm0, %rcx +; AVX512-NEXT: subq %rax, %rcx +; AVX512-NEXT: cmovbq %rsi, %rcx +; AVX512-NEXT: vmovq %rcx, %xmm0 +; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; AVX512-NEXT: vpsrlq $56, %xmm0, %xmm0 +; AVX512-NEXT: vpmovqb %xmm0, (%rdx) +; AVX512-NEXT: retq + %x = load <2 x i8>, <2 x i8>* %px + %y = load <2 x i8>, <2 x i8>* %py + %z = call <2 x i8> @llvm.usub.sat.v2i8(<2 x i8> %x, <2 x i8> %y) + store <2 x i8> %z, <2 x i8>* %pz + ret void +} + +define void @v4i16(<4 x i16>* %px, <4 x i16>* %py, <4 x i16>* %pz) nounwind { +; SSE2-LABEL: v4i16: +; SSE2: # %bb.0: +; SSE2-NEXT: movq {{.*#+}} xmm1 = mem[0],zero +; SSE2-NEXT: movq {{.*#+}} xmm2 = mem[0],zero +; SSE2-NEXT: pxor %xmm0, %xmm0 +; SSE2-NEXT: pxor %xmm3, %xmm3 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[3,1,2,3] +; SSE2-NEXT: movd %xmm2, %eax +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[3,1,2,3] +; SSE2-NEXT: movd %xmm1, %ecx +; SSE2-NEXT: xorl %esi, %esi +; SSE2-NEXT: subl %eax, %ecx +; SSE2-NEXT: cmovbl %esi, %ecx +; SSE2-NEXT: movd %ecx, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[2,3,0,1] +; SSE2-NEXT: movd %xmm2, %eax +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1] +; SSE2-NEXT: movd %xmm2, %ecx +; SSE2-NEXT: subl %eax, %ecx +; SSE2-NEXT: cmovbl %esi, %ecx +; SSE2-NEXT: movd %ecx, %xmm2 +; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; SSE2-NEXT: movd %xmm3, %eax +; SSE2-NEXT: movd %xmm0, %ecx +; SSE2-NEXT: subl %eax, %ecx +; SSE2-NEXT: cmovbl %esi, %ecx +; SSE2-NEXT: movd %ecx, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,2,3] +; SSE2-NEXT: movd %xmm3, %eax +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] +; SSE2-NEXT: movd %xmm0, %ecx +; SSE2-NEXT: subl %eax, %ecx +; SSE2-NEXT: cmovbl %esi, %ecx +; SSE2-NEXT: movd %ecx, %xmm0 +; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; SSE2-NEXT: psrld $16, %xmm1 +; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[0,2,2,3,4,5,6,7] +; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; SSE2-NEXT: movq %xmm0, (%rdx) +; SSE2-NEXT: retq +; +; SSSE3-LABEL: v4i16: +; SSSE3: # %bb.0: +; SSSE3-NEXT: movq {{.*#+}} xmm1 = mem[0],zero +; SSSE3-NEXT: movq {{.*#+}} xmm2 = mem[0],zero +; SSSE3-NEXT: pxor %xmm0, %xmm0 +; SSSE3-NEXT: pxor %xmm3, %xmm3 +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm3[3,1,2,3] +; SSSE3-NEXT: movd %xmm2, %eax +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = 
xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[3,1,2,3] +; SSSE3-NEXT: movd %xmm1, %ecx +; SSSE3-NEXT: xorl %esi, %esi +; SSSE3-NEXT: subl %eax, %ecx +; SSSE3-NEXT: cmovbl %esi, %ecx +; SSSE3-NEXT: movd %ecx, %xmm1 +; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm3[2,3,0,1] +; SSSE3-NEXT: movd %xmm2, %eax +; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1] +; SSSE3-NEXT: movd %xmm2, %ecx +; SSSE3-NEXT: subl %eax, %ecx +; SSSE3-NEXT: cmovbl %esi, %ecx +; SSSE3-NEXT: movd %ecx, %xmm2 +; SSSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; SSSE3-NEXT: movd %xmm3, %eax +; SSSE3-NEXT: movd %xmm0, %ecx +; SSSE3-NEXT: subl %eax, %ecx +; SSSE3-NEXT: cmovbl %esi, %ecx +; SSSE3-NEXT: movd %ecx, %xmm1 +; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,2,3] +; SSSE3-NEXT: movd %xmm3, %eax +; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] +; SSSE3-NEXT: movd %xmm0, %ecx +; SSSE3-NEXT: subl %eax, %ecx +; SSSE3-NEXT: cmovbl %esi, %ecx +; SSSE3-NEXT: movd %ecx, %xmm0 +; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; SSSE3-NEXT: pshufb {{.*#+}} xmm1 = xmm1[2,3,6,7,10,11,14,15,10,11,14,15,14,15],zero,zero +; SSSE3-NEXT: movq %xmm1, (%rdx) +; SSSE3-NEXT: retq +; +; SSE41-LABEL: v4i16: +; SSE41: # %bb.0: +; SSE41-NEXT: movq {{.*#+}} xmm0 = mem[0],zero +; SSE41-NEXT: movq {{.*#+}} xmm1 = mem[0],zero +; SSE41-NEXT: pxor %xmm2, %xmm2 +; SSE41-NEXT: pxor %xmm3, %xmm3 +; SSE41-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] +; SSE41-NEXT: pextrd $1, %xmm3, %eax +; SSE41-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] +; SSE41-NEXT: pextrd $1, %xmm2, %ecx +; SSE41-NEXT: xorl %esi, %esi +; SSE41-NEXT: subl %eax, %ecx +; SSE41-NEXT: cmovbl %esi, %ecx +; SSE41-NEXT: movd %xmm3, %eax +; SSE41-NEXT: movd %xmm2, %edi +; SSE41-NEXT: subl %eax, %edi +; SSE41-NEXT: cmovbl %esi, %edi +; SSE41-NEXT: movd %edi, %xmm0 +; SSE41-NEXT: pinsrd $1, %ecx, %xmm0 +; SSE41-NEXT: pextrd $2, %xmm3, %eax +; SSE41-NEXT: pextrd $2, %xmm2, %ecx +; SSE41-NEXT: subl %eax, %ecx +; SSE41-NEXT: cmovbl %esi, %ecx +; SSE41-NEXT: pinsrd $2, %ecx, %xmm0 +; SSE41-NEXT: pextrd $3, %xmm3, %eax +; SSE41-NEXT: pextrd $3, %xmm2, %ecx +; SSE41-NEXT: subl %eax, %ecx +; SSE41-NEXT: cmovbl %esi, %ecx +; SSE41-NEXT: pinsrd $3, %ecx, %xmm0 +; SSE41-NEXT: psrld $16, %xmm0 +; SSE41-NEXT: packusdw %xmm0, %xmm0 +; SSE41-NEXT: movq %xmm0, (%rdx) +; SSE41-NEXT: retq +; +; AVX1-LABEL: v4i16: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; AVX1-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero +; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; AVX1-NEXT: vpextrd $1, %xmm1, %eax +; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] +; AVX1-NEXT: vpextrd $1, %xmm0, %ecx +; AVX1-NEXT: xorl %esi, %esi +; AVX1-NEXT: subl %eax, %ecx +; AVX1-NEXT: cmovbl %esi, %ecx +; AVX1-NEXT: vmovd %xmm1, %eax +; AVX1-NEXT: vmovd %xmm0, %edi +; AVX1-NEXT: subl %eax, %edi +; AVX1-NEXT: cmovbl %esi, %edi +; AVX1-NEXT: vmovd %edi, %xmm2 +; AVX1-NEXT: vpinsrd $1, %ecx, %xmm2, %xmm2 +; AVX1-NEXT: vpextrd $2, %xmm1, %eax +; AVX1-NEXT: vpextrd $2, %xmm0, %ecx +; AVX1-NEXT: subl %eax, %ecx +; AVX1-NEXT: cmovbl %esi, %ecx +; AVX1-NEXT: vpinsrd $2, %ecx, %xmm2, %xmm2 +; AVX1-NEXT: vpextrd $3, %xmm1, 
%eax +; AVX1-NEXT: vpextrd $3, %xmm0, %ecx +; AVX1-NEXT: subl %eax, %ecx +; AVX1-NEXT: cmovbl %esi, %ecx +; AVX1-NEXT: vpinsrd $3, %ecx, %xmm2, %xmm0 +; AVX1-NEXT: vpsrld $16, %xmm0, %xmm0 +; AVX1-NEXT: vpackusdw %xmm0, %xmm0, %xmm0 +; AVX1-NEXT: vmovq %xmm0, (%rdx) +; AVX1-NEXT: retq +; +; AVX2-LABEL: v4i16: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; AVX2-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero +; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; AVX2-NEXT: vpextrd $1, %xmm1, %eax +; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] +; AVX2-NEXT: vpextrd $1, %xmm0, %ecx +; AVX2-NEXT: xorl %esi, %esi +; AVX2-NEXT: subl %eax, %ecx +; AVX2-NEXT: cmovbl %esi, %ecx +; AVX2-NEXT: vmovd %xmm1, %eax +; AVX2-NEXT: vmovd %xmm0, %edi +; AVX2-NEXT: subl %eax, %edi +; AVX2-NEXT: cmovbl %esi, %edi +; AVX2-NEXT: vmovd %edi, %xmm2 +; AVX2-NEXT: vpinsrd $1, %ecx, %xmm2, %xmm2 +; AVX2-NEXT: vpextrd $2, %xmm1, %eax +; AVX2-NEXT: vpextrd $2, %xmm0, %ecx +; AVX2-NEXT: subl %eax, %ecx +; AVX2-NEXT: cmovbl %esi, %ecx +; AVX2-NEXT: vpinsrd $2, %ecx, %xmm2, %xmm2 +; AVX2-NEXT: vpextrd $3, %xmm1, %eax +; AVX2-NEXT: vpextrd $3, %xmm0, %ecx +; AVX2-NEXT: subl %eax, %ecx +; AVX2-NEXT: cmovbl %esi, %ecx +; AVX2-NEXT: vpinsrd $3, %ecx, %xmm2, %xmm0 +; AVX2-NEXT: vpsrld $16, %xmm0, %xmm0 +; AVX2-NEXT: vpackusdw %xmm0, %xmm0, %xmm0 +; AVX2-NEXT: vmovq %xmm0, (%rdx) +; AVX2-NEXT: retq +; +; AVX512-LABEL: v4i16: +; AVX512: # %bb.0: +; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; AVX512-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero +; AVX512-NEXT: vmovdqa {{.*#+}} xmm2 = [255,255,0,1,255,255,2,3,255,255,4,5,255,255,6,7] +; AVX512-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX512-NEXT: vpextrd $1, %xmm1, %eax +; AVX512-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX512-NEXT: vpextrd $1, %xmm0, %ecx +; AVX512-NEXT: xorl %esi, %esi +; AVX512-NEXT: subl %eax, %ecx +; AVX512-NEXT: cmovbl %esi, %ecx +; AVX512-NEXT: vmovd %xmm1, %eax +; AVX512-NEXT: vmovd %xmm0, %edi +; AVX512-NEXT: subl %eax, %edi +; AVX512-NEXT: cmovbl %esi, %edi +; AVX512-NEXT: vmovd %edi, %xmm2 +; AVX512-NEXT: vpinsrd $1, %ecx, %xmm2, %xmm2 +; AVX512-NEXT: vpextrd $2, %xmm1, %eax +; AVX512-NEXT: vpextrd $2, %xmm0, %ecx +; AVX512-NEXT: subl %eax, %ecx +; AVX512-NEXT: cmovbl %esi, %ecx +; AVX512-NEXT: vpinsrd $2, %ecx, %xmm2, %xmm2 +; AVX512-NEXT: vpextrd $3, %xmm1, %eax +; AVX512-NEXT: vpextrd $3, %xmm0, %ecx +; AVX512-NEXT: subl %eax, %ecx +; AVX512-NEXT: cmovbl %esi, %ecx +; AVX512-NEXT: vpinsrd $3, %ecx, %xmm2, %xmm0 +; AVX512-NEXT: vpsrld $16, %xmm0, %xmm0 +; AVX512-NEXT: vpmovdw %xmm0, (%rdx) +; AVX512-NEXT: retq + %x = load <4 x i16>, <4 x i16>* %px + %y = load <4 x i16>, <4 x i16>* %py + %z = call <4 x i16> @llvm.usub.sat.v4i16(<4 x i16> %x, <4 x i16> %y) + store <4 x i16> %z, <4 x i16>* %pz + ret void +} + +define void @v2i16(<2 x i16>* %px, <2 x i16>* %py, <2 x i16>* %pz) nounwind { +; SSE2-LABEL: v2i16: +; SSE2: # %bb.0: +; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3] +; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,6,7] +; SSE2-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,3] +; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,6,7] +; SSE2-NEXT: psllq $48, %xmm1 +; SSE2-NEXT: movq %xmm1, %rax +; SSE2-NEXT: psllq $48, %xmm0 +; SSE2-NEXT: movq %xmm0, %rcx +; SSE2-NEXT: xorl 
%esi, %esi +; SSE2-NEXT: subq %rax, %rcx +; SSE2-NEXT: cmovbq %rsi, %rcx +; SSE2-NEXT: movq %rcx, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] +; SSE2-NEXT: movq %xmm1, %rax +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; SSE2-NEXT: movq %xmm0, %rcx +; SSE2-NEXT: subq %rax, %rcx +; SSE2-NEXT: cmovbq %rsi, %rcx +; SSE2-NEXT: movq %rcx, %xmm0 +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm0[0] +; SSE2-NEXT: psrlq $48, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3] +; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] +; SSE2-NEXT: movd %xmm0, (%rdx) +; SSE2-NEXT: retq +; +; SSSE3-LABEL: v2i16: +; SSSE3: # %bb.0: +; SSSE3-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; SSSE3-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,0,1,255,255,255,255,255,255,2,3] +; SSSE3-NEXT: pshufb %xmm2, %xmm1 +; SSSE3-NEXT: movq %xmm1, %rax +; SSSE3-NEXT: pshufb %xmm2, %xmm0 +; SSSE3-NEXT: movq %xmm0, %rcx +; SSSE3-NEXT: xorl %esi, %esi +; SSSE3-NEXT: subq %rax, %rcx +; SSSE3-NEXT: cmovbq %rsi, %rcx +; SSSE3-NEXT: movq %rcx, %xmm2 +; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] +; SSSE3-NEXT: movq %xmm1, %rax +; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; SSSE3-NEXT: movq %xmm0, %rcx +; SSSE3-NEXT: subq %rax, %rcx +; SSSE3-NEXT: cmovbq %rsi, %rcx +; SSSE3-NEXT: movq %rcx, %xmm0 +; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm0[0] +; SSSE3-NEXT: pshufb {{.*#+}} xmm2 = xmm2[6,7,14,15,14,15],zero,zero,xmm2[14,15],zero,zero,zero,zero,zero,zero +; SSSE3-NEXT: movd %xmm2, (%rdx) +; SSSE3-NEXT: retq +; +; SSE41-LABEL: v2i16: +; SSE41: # %bb.0: +; SSE41-NEXT: pmovzxwq {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero +; SSE41-NEXT: pmovzxwq {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero +; SSE41-NEXT: psllq $48, %xmm1 +; SSE41-NEXT: pextrq $1, %xmm1, %rax +; SSE41-NEXT: psllq $48, %xmm0 +; SSE41-NEXT: pextrq $1, %xmm0, %rcx +; SSE41-NEXT: xorl %esi, %esi +; SSE41-NEXT: subq %rax, %rcx +; SSE41-NEXT: cmovbq %rsi, %rcx +; SSE41-NEXT: movq %rcx, %xmm2 +; SSE41-NEXT: movq %xmm1, %rax +; SSE41-NEXT: movq %xmm0, %rcx +; SSE41-NEXT: subq %rax, %rcx +; SSE41-NEXT: cmovbq %rsi, %rcx +; SSE41-NEXT: movq %rcx, %xmm0 +; SSE41-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[6,7,14,15,14,15],zero,zero,xmm0[14,15],zero,zero,zero,zero,zero,zero +; SSE41-NEXT: movd %xmm0, (%rdx) +; SSE41-NEXT: retq +; +; AVX1-LABEL: v2i16: +; AVX1: # %bb.0: +; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero +; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero +; AVX1-NEXT: vpsllq $48, %xmm1, %xmm1 +; AVX1-NEXT: vpextrq $1, %xmm1, %rax +; AVX1-NEXT: vpsllq $48, %xmm0, %xmm0 +; AVX1-NEXT: vpextrq $1, %xmm0, %rcx +; AVX1-NEXT: xorl %esi, %esi +; AVX1-NEXT: subq %rax, %rcx +; AVX1-NEXT: cmovbq %rsi, %rcx +; AVX1-NEXT: vmovq %rcx, %xmm2 +; AVX1-NEXT: vmovq %xmm1, %rax +; AVX1-NEXT: vmovq %xmm0, %rcx +; AVX1-NEXT: subq %rax, %rcx +; AVX1-NEXT: cmovbq %rsi, %rcx +; AVX1-NEXT: vmovq %rcx, %xmm0 +; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[6,7,14,15,14,15],zero,zero,xmm0[14,15],zero,zero,zero,zero,zero,zero +; AVX1-NEXT: vmovd %xmm0, (%rdx) +; AVX1-NEXT: retq +; +; AVX2-LABEL: v2i16: +; AVX2: # %bb.0: +; AVX2-NEXT: vpmovzxwq {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero +; AVX2-NEXT: vpmovzxwq {{.*#+}} xmm1 = 
mem[0],zero,zero,zero,mem[1],zero,zero,zero +; AVX2-NEXT: vpsllq $48, %xmm1, %xmm1 +; AVX2-NEXT: vpextrq $1, %xmm1, %rax +; AVX2-NEXT: vpsllq $48, %xmm0, %xmm0 +; AVX2-NEXT: vpextrq $1, %xmm0, %rcx +; AVX2-NEXT: xorl %esi, %esi +; AVX2-NEXT: subq %rax, %rcx +; AVX2-NEXT: cmovbq %rsi, %rcx +; AVX2-NEXT: vmovq %rcx, %xmm2 +; AVX2-NEXT: vmovq %xmm1, %rax +; AVX2-NEXT: vmovq %xmm0, %rcx +; AVX2-NEXT: subq %rax, %rcx +; AVX2-NEXT: cmovbq %rsi, %rcx +; AVX2-NEXT: vmovq %rcx, %xmm0 +; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[6,7,14,15,14,15],zero,zero,xmm0[14,15],zero,zero,zero,zero,zero,zero +; AVX2-NEXT: vmovd %xmm0, (%rdx) +; AVX2-NEXT: retq +; +; AVX512-LABEL: v2i16: +; AVX512: # %bb.0: +; AVX512-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX512-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; AVX512-NEXT: vmovdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,0,1,255,255,255,255,255,255,2,3] +; AVX512-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX512-NEXT: vpextrq $1, %xmm1, %rax +; AVX512-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX512-NEXT: vpextrq $1, %xmm0, %rcx +; AVX512-NEXT: xorl %esi, %esi +; AVX512-NEXT: subq %rax, %rcx +; AVX512-NEXT: cmovbq %rsi, %rcx +; AVX512-NEXT: vmovq %rcx, %xmm2 +; AVX512-NEXT: vmovq %xmm1, %rax +; AVX512-NEXT: vmovq %xmm0, %rcx +; AVX512-NEXT: subq %rax, %rcx +; AVX512-NEXT: cmovbq %rsi, %rcx +; AVX512-NEXT: vmovq %rcx, %xmm0 +; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; AVX512-NEXT: vpsrlq $48, %xmm0, %xmm0 +; AVX512-NEXT: vpmovqw %xmm0, (%rdx) +; AVX512-NEXT: retq + %x = load <2 x i16>, <2 x i16>* %px + %y = load <2 x i16>, <2 x i16>* %py + %z = call <2 x i16> @llvm.usub.sat.v2i16(<2 x i16> %x, <2 x i16> %y) + store <2 x i16> %z, <2 x i16>* %pz + ret void +} + +define <12 x i8> @v12i8(<12 x i8> %x, <12 x i8> %y) nounwind { +; SSE2-LABEL: v12i8: +; SSE2: # %bb.0: +; SSE2-NEXT: pushq %rbp +; SSE2-NEXT: pushq %r15 +; SSE2-NEXT: pushq %r14 +; SSE2-NEXT: pushq %r13 +; SSE2-NEXT: pushq %r12 +; SSE2-NEXT: pushq %rbx +; SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; SSE2-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSE2-NEXT: subb -{{[0-9]+}}(%rsp), %cl +; SSE2-NEXT: movl $0, %eax +; SSE2-NEXT: jb .LBB11_2 +; SSE2-NEXT: # %bb.1: +; SSE2-NEXT: movl %ecx, %eax +; SSE2-NEXT: .LBB11_2: +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %dl +; SSE2-NEXT: subb -{{[0-9]+}}(%rsp), %dl +; SSE2-NEXT: movl $0, %ecx +; SSE2-NEXT: jb .LBB11_4 +; SSE2-NEXT: # %bb.3: +; SSE2-NEXT: movl %edx, %ecx +; SSE2-NEXT: .LBB11_4: +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %bl +; SSE2-NEXT: subb -{{[0-9]+}}(%rsp), %bl +; SSE2-NEXT: movl $0, %edx +; SSE2-NEXT: jb .LBB11_6 +; SSE2-NEXT: # %bb.5: +; SSE2-NEXT: movl %ebx, %edx +; SSE2-NEXT: .LBB11_6: +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %bl +; SSE2-NEXT: subb -{{[0-9]+}}(%rsp), %bl +; SSE2-NEXT: movl $0, %esi +; SSE2-NEXT: jb .LBB11_8 +; SSE2-NEXT: # %bb.7: +; SSE2-NEXT: movl %ebx, %esi +; SSE2-NEXT: .LBB11_8: +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %bl +; SSE2-NEXT: subb -{{[0-9]+}}(%rsp), %bl +; SSE2-NEXT: movl $0, %edi +; SSE2-NEXT: jb .LBB11_10 +; SSE2-NEXT: # %bb.9: +; SSE2-NEXT: movl %ebx, %edi +; SSE2-NEXT: .LBB11_10: +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %bl +; SSE2-NEXT: subb -{{[0-9]+}}(%rsp), %bl +; SSE2-NEXT: movl $0, %r12d +; SSE2-NEXT: jb .LBB11_12 +; SSE2-NEXT: # %bb.11: +; SSE2-NEXT: movl %ebx, %r12d +; SSE2-NEXT: .LBB11_12: +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %bl +; SSE2-NEXT: subb -{{[0-9]+}}(%rsp), %bl +; SSE2-NEXT: movl $0, %r8d +; 
SSE2-NEXT: jb .LBB11_14 +; SSE2-NEXT: # %bb.13: +; SSE2-NEXT: movl %ebx, %r8d +; SSE2-NEXT: .LBB11_14: +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %bl +; SSE2-NEXT: subb -{{[0-9]+}}(%rsp), %bl +; SSE2-NEXT: movl $0, %r10d +; SSE2-NEXT: jb .LBB11_16 +; SSE2-NEXT: # %bb.15: +; SSE2-NEXT: movl %ebx, %r10d +; SSE2-NEXT: .LBB11_16: +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %bl +; SSE2-NEXT: subb -{{[0-9]+}}(%rsp), %bl +; SSE2-NEXT: movl $0, %r13d +; SSE2-NEXT: jb .LBB11_18 +; SSE2-NEXT: # %bb.17: +; SSE2-NEXT: movl %ebx, %r13d +; SSE2-NEXT: .LBB11_18: +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %bl +; SSE2-NEXT: subb -{{[0-9]+}}(%rsp), %bl +; SSE2-NEXT: movl $0, %r9d +; SSE2-NEXT: jb .LBB11_20 +; SSE2-NEXT: # %bb.19: +; SSE2-NEXT: movl %ebx, %r9d +; SSE2-NEXT: .LBB11_20: +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %bl +; SSE2-NEXT: subb -{{[0-9]+}}(%rsp), %bl +; SSE2-NEXT: movl $0, %r11d +; SSE2-NEXT: jb .LBB11_22 +; SSE2-NEXT: # %bb.21: +; SSE2-NEXT: movl %ebx, %r11d +; SSE2-NEXT: .LBB11_22: +; SSE2-NEXT: movzbl %al, %r14d +; SSE2-NEXT: movzbl %cl, %r15d +; SSE2-NEXT: movzbl %dl, %edx +; SSE2-NEXT: movzbl %sil, %esi +; SSE2-NEXT: movzbl %dil, %ebx +; SSE2-NEXT: movzbl %r12b, %ebp +; SSE2-NEXT: movzbl %r8b, %edi +; SSE2-NEXT: movzbl %r10b, %r8d +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSE2-NEXT: subb -{{[0-9]+}}(%rsp), %cl +; SSE2-NEXT: movl $0, %eax +; SSE2-NEXT: jb .LBB11_24 +; SSE2-NEXT: # %bb.23: +; SSE2-NEXT: movl %ecx, %eax +; SSE2-NEXT: .LBB11_24: +; SSE2-NEXT: movd %r14d, %xmm2 +; SSE2-NEXT: movd %r15d, %xmm3 +; SSE2-NEXT: movd %edx, %xmm5 +; SSE2-NEXT: movd %esi, %xmm0 +; SSE2-NEXT: movd %ebx, %xmm6 +; SSE2-NEXT: movd %ebp, %xmm4 +; SSE2-NEXT: movd %edi, %xmm7 +; SSE2-NEXT: movd %r8d, %xmm1 +; SSE2-NEXT: movzbl %r13b, %ebp +; SSE2-NEXT: movzbl %r9b, %ecx +; SSE2-NEXT: movzbl %r11b, %edx +; SSE2-NEXT: movzbl %al, %esi +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSE2-NEXT: subb -{{[0-9]+}}(%rsp), %al +; SSE2-NEXT: movl $0, %edi +; SSE2-NEXT: jb .LBB11_26 +; SSE2-NEXT: # %bb.25: +; SSE2-NEXT: movl %eax, %edi +; SSE2-NEXT: .LBB11_26: +; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3],xmm0[4],xmm5[4],xmm0[5],xmm5[5],xmm0[6],xmm5[6],xmm0[7],xmm5[7] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1],xmm4[2],xmm6[2],xmm4[3],xmm6[3],xmm4[4],xmm6[4],xmm4[5],xmm6[5],xmm4[6],xmm6[6],xmm4[7],xmm6[7] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm7[0],xmm1[1],xmm7[1],xmm1[2],xmm7[2],xmm1[3],xmm7[3],xmm1[4],xmm7[4],xmm1[5],xmm7[5],xmm1[6],xmm7[6],xmm1[7],xmm7[7] +; SSE2-NEXT: movd %ebp, %xmm6 +; SSE2-NEXT: movd %ecx, %xmm5 +; SSE2-NEXT: movd %edx, %xmm7 +; SSE2-NEXT: movd %esi, %xmm2 +; SSE2-NEXT: movzbl %dil, %eax +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %dl +; SSE2-NEXT: subb -{{[0-9]+}}(%rsp), %dl +; SSE2-NEXT: movl $0, %ecx +; SSE2-NEXT: jb .LBB11_28 +; SSE2-NEXT: # %bb.27: +; SSE2-NEXT: movl %edx, %ecx +; SSE2-NEXT: .LBB11_28: +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3],xmm5[4],xmm6[4],xmm5[5],xmm6[5],xmm5[6],xmm6[6],xmm5[7],xmm6[7] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = 
xmm2[0],xmm7[0],xmm2[1],xmm7[1],xmm2[2],xmm7[2],xmm2[3],xmm7[3],xmm2[4],xmm7[4],xmm2[5],xmm7[5],xmm2[6],xmm7[6],xmm2[7],xmm7[7] +; SSE2-NEXT: movd %eax, %xmm4 +; SSE2-NEXT: movzbl %cl, %eax +; SSE2-NEXT: movd %eax, %xmm3 +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %dl +; SSE2-NEXT: subb -{{[0-9]+}}(%rsp), %dl +; SSE2-NEXT: movl $0, %ecx +; SSE2-NEXT: jb .LBB11_30 +; SSE2-NEXT: # %bb.29: +; SSE2-NEXT: movl %edx, %ecx +; SSE2-NEXT: .LBB11_30: +; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3],xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] +; SSE2-NEXT: movzbl %cl, %ecx +; SSE2-NEXT: movd %ecx, %xmm4 +; SSE2-NEXT: subb -{{[0-9]+}}(%rsp), %al +; SSE2-NEXT: movl $0, %ecx +; SSE2-NEXT: jb .LBB11_32 +; SSE2-NEXT: # %bb.31: +; SSE2-NEXT: movl %eax, %ecx +; SSE2-NEXT: .LBB11_32: +; SSE2-NEXT: movzbl %cl, %eax +; SSE2-NEXT: movd %eax, %xmm0 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSE2-NEXT: popq %rbx +; SSE2-NEXT: popq %r12 +; SSE2-NEXT: popq %r13 +; SSE2-NEXT: popq %r14 +; SSE2-NEXT: popq %r15 +; SSE2-NEXT: popq %rbp +; SSE2-NEXT: retq +; +; SSSE3-LABEL: v12i8: +; SSSE3: # %bb.0: +; SSSE3-NEXT: pushq %rbp +; SSSE3-NEXT: pushq %r15 +; SSSE3-NEXT: pushq %r14 +; SSSE3-NEXT: pushq %r13 +; SSSE3-NEXT: pushq %r12 +; SSSE3-NEXT: pushq %rbx +; SSSE3-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; SSSE3-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSSE3-NEXT: subb -{{[0-9]+}}(%rsp), %cl +; SSSE3-NEXT: movl $0, %eax +; SSSE3-NEXT: jb .LBB11_2 +; SSSE3-NEXT: # %bb.1: +; SSSE3-NEXT: movl %ecx, %eax +; SSSE3-NEXT: .LBB11_2: +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %dl +; SSSE3-NEXT: subb -{{[0-9]+}}(%rsp), %dl +; SSSE3-NEXT: movl $0, %ecx +; SSSE3-NEXT: jb .LBB11_4 +; SSSE3-NEXT: # %bb.3: +; SSSE3-NEXT: movl %edx, %ecx +; SSSE3-NEXT: .LBB11_4: +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %bl +; SSSE3-NEXT: subb -{{[0-9]+}}(%rsp), %bl +; SSSE3-NEXT: movl $0, %edx +; SSSE3-NEXT: jb .LBB11_6 +; SSSE3-NEXT: # %bb.5: +; SSSE3-NEXT: movl %ebx, %edx +; SSSE3-NEXT: .LBB11_6: +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %bl +; SSSE3-NEXT: subb -{{[0-9]+}}(%rsp), %bl +; SSSE3-NEXT: movl $0, %esi +; SSSE3-NEXT: jb .LBB11_8 +; SSSE3-NEXT: # %bb.7: +; SSSE3-NEXT: movl %ebx, %esi +; SSSE3-NEXT: .LBB11_8: +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %bl +; SSSE3-NEXT: subb -{{[0-9]+}}(%rsp), %bl +; SSSE3-NEXT: movl $0, %edi +; SSSE3-NEXT: jb .LBB11_10 +; SSSE3-NEXT: # %bb.9: +; SSSE3-NEXT: movl %ebx, %edi +; SSSE3-NEXT: .LBB11_10: +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %bl +; SSSE3-NEXT: subb -{{[0-9]+}}(%rsp), %bl +; SSSE3-NEXT: movl $0, %r12d +; SSSE3-NEXT: jb .LBB11_12 +; SSSE3-NEXT: # %bb.11: +; SSSE3-NEXT: movl %ebx, %r12d +; SSSE3-NEXT: .LBB11_12: +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %bl +; SSSE3-NEXT: subb -{{[0-9]+}}(%rsp), %bl +; SSSE3-NEXT: movl $0, %r8d +; SSSE3-NEXT: jb .LBB11_14 +; SSSE3-NEXT: # %bb.13: +; SSSE3-NEXT: movl %ebx, %r8d +; SSSE3-NEXT: .LBB11_14: +; 
SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %bl +; SSSE3-NEXT: subb -{{[0-9]+}}(%rsp), %bl +; SSSE3-NEXT: movl $0, %r10d +; SSSE3-NEXT: jb .LBB11_16 +; SSSE3-NEXT: # %bb.15: +; SSSE3-NEXT: movl %ebx, %r10d +; SSSE3-NEXT: .LBB11_16: +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %bl +; SSSE3-NEXT: subb -{{[0-9]+}}(%rsp), %bl +; SSSE3-NEXT: movl $0, %r13d +; SSSE3-NEXT: jb .LBB11_18 +; SSSE3-NEXT: # %bb.17: +; SSSE3-NEXT: movl %ebx, %r13d +; SSSE3-NEXT: .LBB11_18: +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %bl +; SSSE3-NEXT: subb -{{[0-9]+}}(%rsp), %bl +; SSSE3-NEXT: movl $0, %r9d +; SSSE3-NEXT: jb .LBB11_20 +; SSSE3-NEXT: # %bb.19: +; SSSE3-NEXT: movl %ebx, %r9d +; SSSE3-NEXT: .LBB11_20: +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %bl +; SSSE3-NEXT: subb -{{[0-9]+}}(%rsp), %bl +; SSSE3-NEXT: movl $0, %r11d +; SSSE3-NEXT: jb .LBB11_22 +; SSSE3-NEXT: # %bb.21: +; SSSE3-NEXT: movl %ebx, %r11d +; SSSE3-NEXT: .LBB11_22: +; SSSE3-NEXT: movzbl %al, %r14d +; SSSE3-NEXT: movzbl %cl, %r15d +; SSSE3-NEXT: movzbl %dl, %edx +; SSSE3-NEXT: movzbl %sil, %esi +; SSSE3-NEXT: movzbl %dil, %ebx +; SSSE3-NEXT: movzbl %r12b, %ebp +; SSSE3-NEXT: movzbl %r8b, %edi +; SSSE3-NEXT: movzbl %r10b, %r8d +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSSE3-NEXT: subb -{{[0-9]+}}(%rsp), %cl +; SSSE3-NEXT: movl $0, %eax +; SSSE3-NEXT: jb .LBB11_24 +; SSSE3-NEXT: # %bb.23: +; SSSE3-NEXT: movl %ecx, %eax +; SSSE3-NEXT: .LBB11_24: +; SSSE3-NEXT: movd %r14d, %xmm2 +; SSSE3-NEXT: movd %r15d, %xmm3 +; SSSE3-NEXT: movd %edx, %xmm5 +; SSSE3-NEXT: movd %esi, %xmm0 +; SSSE3-NEXT: movd %ebx, %xmm6 +; SSSE3-NEXT: movd %ebp, %xmm4 +; SSSE3-NEXT: movd %edi, %xmm7 +; SSSE3-NEXT: movd %r8d, %xmm1 +; SSSE3-NEXT: movzbl %r13b, %ebp +; SSSE3-NEXT: movzbl %r9b, %ecx +; SSSE3-NEXT: movzbl %r11b, %edx +; SSSE3-NEXT: movzbl %al, %esi +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSSE3-NEXT: subb -{{[0-9]+}}(%rsp), %al +; SSSE3-NEXT: movl $0, %edi +; SSSE3-NEXT: jb .LBB11_26 +; SSSE3-NEXT: # %bb.25: +; SSSE3-NEXT: movl %eax, %edi +; SSSE3-NEXT: .LBB11_26: +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3],xmm0[4],xmm5[4],xmm0[5],xmm5[5],xmm0[6],xmm5[6],xmm0[7],xmm5[7] +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1],xmm4[2],xmm6[2],xmm4[3],xmm6[3],xmm4[4],xmm6[4],xmm4[5],xmm6[5],xmm4[6],xmm6[6],xmm4[7],xmm6[7] +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm7[0],xmm1[1],xmm7[1],xmm1[2],xmm7[2],xmm1[3],xmm7[3],xmm1[4],xmm7[4],xmm1[5],xmm7[5],xmm1[6],xmm7[6],xmm1[7],xmm7[7] +; SSSE3-NEXT: movd %ebp, %xmm6 +; SSSE3-NEXT: movd %ecx, %xmm5 +; SSSE3-NEXT: movd %edx, %xmm7 +; SSSE3-NEXT: movd %esi, %xmm2 +; SSSE3-NEXT: movzbl %dil, %eax +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %dl +; SSSE3-NEXT: subb -{{[0-9]+}}(%rsp), %dl +; SSSE3-NEXT: movl $0, %ecx +; SSSE3-NEXT: jb .LBB11_28 +; SSSE3-NEXT: # %bb.27: +; SSSE3-NEXT: movl %edx, %ecx +; SSSE3-NEXT: .LBB11_28: +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3] +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3],xmm5[4],xmm6[4],xmm5[5],xmm6[5],xmm5[6],xmm6[6],xmm5[7],xmm6[7] +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = 
xmm2[0],xmm7[0],xmm2[1],xmm7[1],xmm2[2],xmm7[2],xmm2[3],xmm7[3],xmm2[4],xmm7[4],xmm2[5],xmm7[5],xmm2[6],xmm7[6],xmm2[7],xmm7[7] +; SSSE3-NEXT: movd %eax, %xmm4 +; SSSE3-NEXT: movzbl %cl, %eax +; SSSE3-NEXT: movd %eax, %xmm3 +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %dl +; SSSE3-NEXT: subb -{{[0-9]+}}(%rsp), %dl +; SSSE3-NEXT: movl $0, %ecx +; SSSE3-NEXT: jb .LBB11_30 +; SSSE3-NEXT: # %bb.29: +; SSSE3-NEXT: movl %edx, %ecx +; SSSE3-NEXT: .LBB11_30: +; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3] +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3],xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] +; SSSE3-NEXT: movzbl %cl, %ecx +; SSSE3-NEXT: movd %ecx, %xmm4 +; SSSE3-NEXT: subb -{{[0-9]+}}(%rsp), %al +; SSSE3-NEXT: movl $0, %ecx +; SSSE3-NEXT: jb .LBB11_32 +; SSSE3-NEXT: # %bb.31: +; SSSE3-NEXT: movl %eax, %ecx +; SSSE3-NEXT: .LBB11_32: +; SSSE3-NEXT: movzbl %cl, %eax +; SSSE3-NEXT: movd %eax, %xmm0 +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7] +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] +; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSSE3-NEXT: popq %rbx +; SSSE3-NEXT: popq %r12 +; SSSE3-NEXT: popq %r13 +; SSSE3-NEXT: popq %r14 +; SSSE3-NEXT: popq %r15 +; SSSE3-NEXT: popq %rbp +; SSSE3-NEXT: retq +; +; SSE41-LABEL: v12i8: +; SSE41: # %bb.0: +; SSE41-NEXT: pextrb $1, %xmm1, %edx +; SSE41-NEXT: pextrb $1, %xmm0, %ecx +; SSE41-NEXT: xorl %eax, %eax +; SSE41-NEXT: subb %dl, %cl +; SSE41-NEXT: movl $0, %edx +; SSE41-NEXT: jb .LBB11_2 +; SSE41-NEXT: # %bb.1: +; SSE41-NEXT: movl %ecx, %edx +; SSE41-NEXT: .LBB11_2: +; SSE41-NEXT: movzbl %dl, %ecx +; SSE41-NEXT: pextrb $0, %xmm1, %esi +; SSE41-NEXT: pextrb $0, %xmm0, %edx +; SSE41-NEXT: subb %sil, %dl +; SSE41-NEXT: movl $0, %esi +; SSE41-NEXT: jb .LBB11_4 +; SSE41-NEXT: # %bb.3: +; SSE41-NEXT: movl %edx, %esi +; SSE41-NEXT: .LBB11_4: +; SSE41-NEXT: movzbl %sil, %edx +; SSE41-NEXT: movd %edx, %xmm2 +; SSE41-NEXT: pinsrb $1, %ecx, %xmm2 +; SSE41-NEXT: pextrb $2, %xmm1, %edx +; SSE41-NEXT: pextrb $2, %xmm0, %ecx +; SSE41-NEXT: subb %dl, %cl +; SSE41-NEXT: movl $0, %edx +; SSE41-NEXT: jb .LBB11_6 +; SSE41-NEXT: # %bb.5: +; SSE41-NEXT: movl %ecx, %edx +; SSE41-NEXT: .LBB11_6: +; SSE41-NEXT: movzbl %dl, %ecx +; SSE41-NEXT: pinsrb $2, %ecx, %xmm2 +; SSE41-NEXT: pextrb $3, %xmm1, %edx +; SSE41-NEXT: pextrb $3, %xmm0, %ecx +; SSE41-NEXT: subb %dl, %cl +; SSE41-NEXT: movl $0, %edx +; SSE41-NEXT: jb .LBB11_8 +; SSE41-NEXT: # %bb.7: +; SSE41-NEXT: movl %ecx, %edx +; SSE41-NEXT: .LBB11_8: +; SSE41-NEXT: movzbl %dl, %ecx +; SSE41-NEXT: pinsrb $3, %ecx, %xmm2 +; SSE41-NEXT: pextrb $4, %xmm1, %edx +; SSE41-NEXT: pextrb $4, %xmm0, %ecx +; SSE41-NEXT: subb %dl, %cl +; SSE41-NEXT: movl $0, %edx +; SSE41-NEXT: jb .LBB11_10 +; SSE41-NEXT: # %bb.9: +; SSE41-NEXT: movl %ecx, %edx +; SSE41-NEXT: .LBB11_10: +; SSE41-NEXT: movzbl %dl, %ecx +; SSE41-NEXT: pinsrb $4, %ecx, %xmm2 +; SSE41-NEXT: pextrb $5, %xmm1, %edx +; SSE41-NEXT: pextrb $5, %xmm0, %ecx +; SSE41-NEXT: subb %dl, %cl +; SSE41-NEXT: movl $0, %edx +; SSE41-NEXT: jb .LBB11_12 +; SSE41-NEXT: # %bb.11: +; SSE41-NEXT: 
movl %ecx, %edx +; SSE41-NEXT: .LBB11_12: +; SSE41-NEXT: movzbl %dl, %ecx +; SSE41-NEXT: pinsrb $5, %ecx, %xmm2 +; SSE41-NEXT: pextrb $6, %xmm1, %edx +; SSE41-NEXT: pextrb $6, %xmm0, %ecx +; SSE41-NEXT: subb %dl, %cl +; SSE41-NEXT: movl $0, %edx +; SSE41-NEXT: jb .LBB11_14 +; SSE41-NEXT: # %bb.13: +; SSE41-NEXT: movl %ecx, %edx +; SSE41-NEXT: .LBB11_14: +; SSE41-NEXT: movzbl %dl, %ecx +; SSE41-NEXT: pinsrb $6, %ecx, %xmm2 +; SSE41-NEXT: pextrb $7, %xmm1, %edx +; SSE41-NEXT: pextrb $7, %xmm0, %ecx +; SSE41-NEXT: subb %dl, %cl +; SSE41-NEXT: movl $0, %edx +; SSE41-NEXT: jb .LBB11_16 +; SSE41-NEXT: # %bb.15: +; SSE41-NEXT: movl %ecx, %edx +; SSE41-NEXT: .LBB11_16: +; SSE41-NEXT: movzbl %dl, %ecx +; SSE41-NEXT: pinsrb $7, %ecx, %xmm2 +; SSE41-NEXT: pextrb $8, %xmm1, %edx +; SSE41-NEXT: pextrb $8, %xmm0, %ecx +; SSE41-NEXT: subb %dl, %cl +; SSE41-NEXT: movl $0, %edx +; SSE41-NEXT: jb .LBB11_18 +; SSE41-NEXT: # %bb.17: +; SSE41-NEXT: movl %ecx, %edx +; SSE41-NEXT: .LBB11_18: +; SSE41-NEXT: movzbl %dl, %ecx +; SSE41-NEXT: pinsrb $8, %ecx, %xmm2 +; SSE41-NEXT: pextrb $9, %xmm1, %edx +; SSE41-NEXT: pextrb $9, %xmm0, %ecx +; SSE41-NEXT: subb %dl, %cl +; SSE41-NEXT: movl $0, %edx +; SSE41-NEXT: jb .LBB11_20 +; SSE41-NEXT: # %bb.19: +; SSE41-NEXT: movl %ecx, %edx +; SSE41-NEXT: .LBB11_20: +; SSE41-NEXT: movzbl %dl, %ecx +; SSE41-NEXT: pinsrb $9, %ecx, %xmm2 +; SSE41-NEXT: pextrb $10, %xmm1, %edx +; SSE41-NEXT: pextrb $10, %xmm0, %ecx +; SSE41-NEXT: subb %dl, %cl +; SSE41-NEXT: movl $0, %edx +; SSE41-NEXT: jb .LBB11_22 +; SSE41-NEXT: # %bb.21: +; SSE41-NEXT: movl %ecx, %edx +; SSE41-NEXT: .LBB11_22: +; SSE41-NEXT: movzbl %dl, %ecx +; SSE41-NEXT: pinsrb $10, %ecx, %xmm2 +; SSE41-NEXT: pextrb $11, %xmm1, %edx +; SSE41-NEXT: pextrb $11, %xmm0, %ecx +; SSE41-NEXT: subb %dl, %cl +; SSE41-NEXT: movl $0, %edx +; SSE41-NEXT: jb .LBB11_24 +; SSE41-NEXT: # %bb.23: +; SSE41-NEXT: movl %ecx, %edx +; SSE41-NEXT: .LBB11_24: +; SSE41-NEXT: movzbl %dl, %ecx +; SSE41-NEXT: pinsrb $11, %ecx, %xmm2 +; SSE41-NEXT: pextrb $12, %xmm1, %edx +; SSE41-NEXT: pextrb $12, %xmm0, %ecx +; SSE41-NEXT: subb %dl, %cl +; SSE41-NEXT: movl $0, %edx +; SSE41-NEXT: jb .LBB11_26 +; SSE41-NEXT: # %bb.25: +; SSE41-NEXT: movl %ecx, %edx +; SSE41-NEXT: .LBB11_26: +; SSE41-NEXT: movzbl %dl, %ecx +; SSE41-NEXT: pinsrb $12, %ecx, %xmm2 +; SSE41-NEXT: pextrb $13, %xmm1, %edx +; SSE41-NEXT: pextrb $13, %xmm0, %ecx +; SSE41-NEXT: subb %dl, %cl +; SSE41-NEXT: movl $0, %edx +; SSE41-NEXT: jb .LBB11_28 +; SSE41-NEXT: # %bb.27: +; SSE41-NEXT: movl %ecx, %edx +; SSE41-NEXT: .LBB11_28: +; SSE41-NEXT: movzbl %dl, %ecx +; SSE41-NEXT: pinsrb $13, %ecx, %xmm2 +; SSE41-NEXT: pextrb $14, %xmm1, %edx +; SSE41-NEXT: pextrb $14, %xmm0, %ecx +; SSE41-NEXT: subb %dl, %cl +; SSE41-NEXT: movl $0, %edx +; SSE41-NEXT: jb .LBB11_30 +; SSE41-NEXT: # %bb.29: +; SSE41-NEXT: movl %ecx, %edx +; SSE41-NEXT: .LBB11_30: +; SSE41-NEXT: movzbl %dl, %ecx +; SSE41-NEXT: pinsrb $14, %ecx, %xmm2 +; SSE41-NEXT: pextrb $15, %xmm1, %edx +; SSE41-NEXT: pextrb $15, %xmm0, %ecx +; SSE41-NEXT: subb %dl, %cl +; SSE41-NEXT: jb .LBB11_32 +; SSE41-NEXT: # %bb.31: +; SSE41-NEXT: movl %ecx, %eax +; SSE41-NEXT: .LBB11_32: +; SSE41-NEXT: movzbl %al, %eax +; SSE41-NEXT: pinsrb $15, %eax, %xmm2 +; SSE41-NEXT: movdqa %xmm2, %xmm0 +; SSE41-NEXT: retq +; +; AVX-LABEL: v12i8: +; AVX: # %bb.0: +; AVX-NEXT: vpextrb $1, %xmm1, %edx +; AVX-NEXT: vpextrb $1, %xmm0, %ecx +; AVX-NEXT: xorl %eax, %eax +; AVX-NEXT: subb %dl, %cl +; AVX-NEXT: movl $0, %edx +; AVX-NEXT: jb .LBB11_2 +; AVX-NEXT: # %bb.1: +; AVX-NEXT: 
movl %ecx, %edx +; AVX-NEXT: .LBB11_2: +; AVX-NEXT: movzbl %dl, %ecx +; AVX-NEXT: vpextrb $0, %xmm1, %esi +; AVX-NEXT: vpextrb $0, %xmm0, %edx +; AVX-NEXT: subb %sil, %dl +; AVX-NEXT: movl $0, %esi +; AVX-NEXT: jb .LBB11_4 +; AVX-NEXT: # %bb.3: +; AVX-NEXT: movl %edx, %esi +; AVX-NEXT: .LBB11_4: +; AVX-NEXT: movzbl %sil, %edx +; AVX-NEXT: vmovd %edx, %xmm2 +; AVX-NEXT: vpinsrb $1, %ecx, %xmm2, %xmm2 +; AVX-NEXT: vpextrb $2, %xmm1, %edx +; AVX-NEXT: vpextrb $2, %xmm0, %ecx +; AVX-NEXT: subb %dl, %cl +; AVX-NEXT: movl $0, %edx +; AVX-NEXT: jb .LBB11_6 +; AVX-NEXT: # %bb.5: +; AVX-NEXT: movl %ecx, %edx +; AVX-NEXT: .LBB11_6: +; AVX-NEXT: movzbl %dl, %ecx +; AVX-NEXT: vpinsrb $2, %ecx, %xmm2, %xmm2 +; AVX-NEXT: vpextrb $3, %xmm1, %edx +; AVX-NEXT: vpextrb $3, %xmm0, %ecx +; AVX-NEXT: subb %dl, %cl +; AVX-NEXT: movl $0, %edx +; AVX-NEXT: jb .LBB11_8 +; AVX-NEXT: # %bb.7: +; AVX-NEXT: movl %ecx, %edx +; AVX-NEXT: .LBB11_8: +; AVX-NEXT: movzbl %dl, %ecx +; AVX-NEXT: vpinsrb $3, %ecx, %xmm2, %xmm2 +; AVX-NEXT: vpextrb $4, %xmm1, %edx +; AVX-NEXT: vpextrb $4, %xmm0, %ecx +; AVX-NEXT: subb %dl, %cl +; AVX-NEXT: movl $0, %edx +; AVX-NEXT: jb .LBB11_10 +; AVX-NEXT: # %bb.9: +; AVX-NEXT: movl %ecx, %edx +; AVX-NEXT: .LBB11_10: +; AVX-NEXT: movzbl %dl, %ecx +; AVX-NEXT: vpinsrb $4, %ecx, %xmm2, %xmm2 +; AVX-NEXT: vpextrb $5, %xmm1, %edx +; AVX-NEXT: vpextrb $5, %xmm0, %ecx +; AVX-NEXT: subb %dl, %cl +; AVX-NEXT: movl $0, %edx +; AVX-NEXT: jb .LBB11_12 +; AVX-NEXT: # %bb.11: +; AVX-NEXT: movl %ecx, %edx +; AVX-NEXT: .LBB11_12: +; AVX-NEXT: movzbl %dl, %ecx +; AVX-NEXT: vpinsrb $5, %ecx, %xmm2, %xmm2 +; AVX-NEXT: vpextrb $6, %xmm1, %edx +; AVX-NEXT: vpextrb $6, %xmm0, %ecx +; AVX-NEXT: subb %dl, %cl +; AVX-NEXT: movl $0, %edx +; AVX-NEXT: jb .LBB11_14 +; AVX-NEXT: # %bb.13: +; AVX-NEXT: movl %ecx, %edx +; AVX-NEXT: .LBB11_14: +; AVX-NEXT: movzbl %dl, %ecx +; AVX-NEXT: vpinsrb $6, %ecx, %xmm2, %xmm2 +; AVX-NEXT: vpextrb $7, %xmm1, %edx +; AVX-NEXT: vpextrb $7, %xmm0, %ecx +; AVX-NEXT: subb %dl, %cl +; AVX-NEXT: movl $0, %edx +; AVX-NEXT: jb .LBB11_16 +; AVX-NEXT: # %bb.15: +; AVX-NEXT: movl %ecx, %edx +; AVX-NEXT: .LBB11_16: +; AVX-NEXT: movzbl %dl, %ecx +; AVX-NEXT: vpinsrb $7, %ecx, %xmm2, %xmm2 +; AVX-NEXT: vpextrb $8, %xmm1, %edx +; AVX-NEXT: vpextrb $8, %xmm0, %ecx +; AVX-NEXT: subb %dl, %cl +; AVX-NEXT: movl $0, %edx +; AVX-NEXT: jb .LBB11_18 +; AVX-NEXT: # %bb.17: +; AVX-NEXT: movl %ecx, %edx +; AVX-NEXT: .LBB11_18: +; AVX-NEXT: movzbl %dl, %ecx +; AVX-NEXT: vpinsrb $8, %ecx, %xmm2, %xmm2 +; AVX-NEXT: vpextrb $9, %xmm1, %edx +; AVX-NEXT: vpextrb $9, %xmm0, %ecx +; AVX-NEXT: subb %dl, %cl +; AVX-NEXT: movl $0, %edx +; AVX-NEXT: jb .LBB11_20 +; AVX-NEXT: # %bb.19: +; AVX-NEXT: movl %ecx, %edx +; AVX-NEXT: .LBB11_20: +; AVX-NEXT: movzbl %dl, %ecx +; AVX-NEXT: vpinsrb $9, %ecx, %xmm2, %xmm2 +; AVX-NEXT: vpextrb $10, %xmm1, %edx +; AVX-NEXT: vpextrb $10, %xmm0, %ecx +; AVX-NEXT: subb %dl, %cl +; AVX-NEXT: movl $0, %edx +; AVX-NEXT: jb .LBB11_22 +; AVX-NEXT: # %bb.21: +; AVX-NEXT: movl %ecx, %edx +; AVX-NEXT: .LBB11_22: +; AVX-NEXT: movzbl %dl, %ecx +; AVX-NEXT: vpinsrb $10, %ecx, %xmm2, %xmm2 +; AVX-NEXT: vpextrb $11, %xmm1, %edx +; AVX-NEXT: vpextrb $11, %xmm0, %ecx +; AVX-NEXT: subb %dl, %cl +; AVX-NEXT: movl $0, %edx +; AVX-NEXT: jb .LBB11_24 +; AVX-NEXT: # %bb.23: +; AVX-NEXT: movl %ecx, %edx +; AVX-NEXT: .LBB11_24: +; AVX-NEXT: movzbl %dl, %ecx +; AVX-NEXT: vpinsrb $11, %ecx, %xmm2, %xmm2 +; AVX-NEXT: vpextrb $12, %xmm1, %edx +; AVX-NEXT: vpextrb $12, %xmm0, %ecx +; AVX-NEXT: subb %dl, %cl +; 
AVX-NEXT: movl $0, %edx +; AVX-NEXT: jb .LBB11_26 +; AVX-NEXT: # %bb.25: +; AVX-NEXT: movl %ecx, %edx +; AVX-NEXT: .LBB11_26: +; AVX-NEXT: movzbl %dl, %ecx +; AVX-NEXT: vpinsrb $12, %ecx, %xmm2, %xmm2 +; AVX-NEXT: vpextrb $13, %xmm1, %edx +; AVX-NEXT: vpextrb $13, %xmm0, %ecx +; AVX-NEXT: subb %dl, %cl +; AVX-NEXT: movl $0, %edx +; AVX-NEXT: jb .LBB11_28 +; AVX-NEXT: # %bb.27: +; AVX-NEXT: movl %ecx, %edx +; AVX-NEXT: .LBB11_28: +; AVX-NEXT: movzbl %dl, %ecx +; AVX-NEXT: vpinsrb $13, %ecx, %xmm2, %xmm2 +; AVX-NEXT: vpextrb $14, %xmm1, %edx +; AVX-NEXT: vpextrb $14, %xmm0, %ecx +; AVX-NEXT: subb %dl, %cl +; AVX-NEXT: movl $0, %edx +; AVX-NEXT: jb .LBB11_30 +; AVX-NEXT: # %bb.29: +; AVX-NEXT: movl %ecx, %edx +; AVX-NEXT: .LBB11_30: +; AVX-NEXT: movzbl %dl, %ecx +; AVX-NEXT: vpinsrb $14, %ecx, %xmm2, %xmm2 +; AVX-NEXT: vpextrb $15, %xmm1, %edx +; AVX-NEXT: vpextrb $15, %xmm0, %ecx +; AVX-NEXT: subb %dl, %cl +; AVX-NEXT: jb .LBB11_32 +; AVX-NEXT: # %bb.31: +; AVX-NEXT: movl %ecx, %eax +; AVX-NEXT: .LBB11_32: +; AVX-NEXT: movzbl %al, %eax +; AVX-NEXT: vpinsrb $15, %eax, %xmm2, %xmm0 +; AVX-NEXT: retq + %z = call <12 x i8> @llvm.usub.sat.v12i8(<12 x i8> %x, <12 x i8> %y) + ret <12 x i8> %z +} + +define void @v12i16(<12 x i16>* %px, <12 x i16>* %py, <12 x i16>* %pz) nounwind { +; SSE2-LABEL: v12i16: +; SSE2: # %bb.0: +; SSE2-NEXT: movdqa (%rdi), %xmm2 +; SSE2-NEXT: movdqa 16(%rdi), %xmm0 +; SSE2-NEXT: movdqa (%rsi), %xmm3 +; SSE2-NEXT: movdqa 16(%rsi), %xmm1 +; SSE2-NEXT: pextrw $7, %xmm3, %ecx +; SSE2-NEXT: pextrw $7, %xmm2, %esi +; SSE2-NEXT: xorl %eax, %eax +; SSE2-NEXT: subw %cx, %si +; SSE2-NEXT: cmovbl %eax, %esi +; SSE2-NEXT: movd %esi, %xmm4 +; SSE2-NEXT: pextrw $6, %xmm3, %ecx +; SSE2-NEXT: pextrw $6, %xmm2, %esi +; SSE2-NEXT: subw %cx, %si +; SSE2-NEXT: cmovbl %eax, %esi +; SSE2-NEXT: movd %esi, %xmm5 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] +; SSE2-NEXT: pextrw $5, %xmm3, %ecx +; SSE2-NEXT: pextrw $5, %xmm2, %esi +; SSE2-NEXT: subw %cx, %si +; SSE2-NEXT: cmovbl %eax, %esi +; SSE2-NEXT: movd %esi, %xmm6 +; SSE2-NEXT: pextrw $4, %xmm3, %ecx +; SSE2-NEXT: pextrw $4, %xmm2, %esi +; SSE2-NEXT: subw %cx, %si +; SSE2-NEXT: cmovbl %eax, %esi +; SSE2-NEXT: movd %esi, %xmm4 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1],xmm4[2],xmm6[2],xmm4[3],xmm6[3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1] +; SSE2-NEXT: pextrw $3, %xmm3, %ecx +; SSE2-NEXT: pextrw $3, %xmm2, %esi +; SSE2-NEXT: subw %cx, %si +; SSE2-NEXT: cmovbl %eax, %esi +; SSE2-NEXT: movd %esi, %xmm5 +; SSE2-NEXT: pextrw $2, %xmm3, %ecx +; SSE2-NEXT: pextrw $2, %xmm2, %esi +; SSE2-NEXT: subw %cx, %si +; SSE2-NEXT: cmovbl %eax, %esi +; SSE2-NEXT: movd %esi, %xmm6 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] +; SSE2-NEXT: pextrw $1, %xmm3, %ecx +; SSE2-NEXT: pextrw $1, %xmm2, %esi +; SSE2-NEXT: subw %cx, %si +; SSE2-NEXT: cmovbl %eax, %esi +; SSE2-NEXT: movd %esi, %xmm5 +; SSE2-NEXT: movd %xmm3, %ecx +; SSE2-NEXT: movd %xmm2, %esi +; SSE2-NEXT: subw %cx, %si +; SSE2-NEXT: cmovbl %eax, %esi +; SSE2-NEXT: movd %esi, %xmm2 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1] +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm4[0] +; SSE2-NEXT: pextrw $1, %xmm1, %ecx +; SSE2-NEXT: pextrw $1, %xmm0, %esi +; SSE2-NEXT: subw %cx, %si +; SSE2-NEXT: cmovbl 
%eax, %esi +; SSE2-NEXT: movd %xmm1, %ecx +; SSE2-NEXT: movd %xmm0, %edi +; SSE2-NEXT: subw %cx, %di +; SSE2-NEXT: cmovbl %eax, %edi +; SSE2-NEXT: movd %edi, %xmm3 +; SSE2-NEXT: pinsrw $1, %esi, %xmm3 +; SSE2-NEXT: pextrw $2, %xmm1, %ecx +; SSE2-NEXT: pextrw $2, %xmm0, %esi +; SSE2-NEXT: subw %cx, %si +; SSE2-NEXT: cmovbl %eax, %esi +; SSE2-NEXT: pinsrw $2, %esi, %xmm3 +; SSE2-NEXT: pextrw $3, %xmm1, %ecx +; SSE2-NEXT: pextrw $3, %xmm0, %esi +; SSE2-NEXT: subw %cx, %si +; SSE2-NEXT: cmovbl %eax, %esi +; SSE2-NEXT: pinsrw $3, %esi, %xmm3 +; SSE2-NEXT: movq %xmm3, 16(%rdx) +; SSE2-NEXT: movdqa %xmm2, (%rdx) +; SSE2-NEXT: retq +; +; SSSE3-LABEL: v12i16: +; SSSE3: # %bb.0: +; SSSE3-NEXT: movdqa (%rdi), %xmm2 +; SSSE3-NEXT: movdqa 16(%rdi), %xmm0 +; SSSE3-NEXT: movdqa (%rsi), %xmm3 +; SSSE3-NEXT: movdqa 16(%rsi), %xmm1 +; SSSE3-NEXT: pextrw $7, %xmm3, %ecx +; SSSE3-NEXT: pextrw $7, %xmm2, %esi +; SSSE3-NEXT: xorl %eax, %eax +; SSSE3-NEXT: subw %cx, %si +; SSSE3-NEXT: cmovbl %eax, %esi +; SSSE3-NEXT: movd %esi, %xmm4 +; SSSE3-NEXT: pextrw $6, %xmm3, %ecx +; SSSE3-NEXT: pextrw $6, %xmm2, %esi +; SSSE3-NEXT: subw %cx, %si +; SSSE3-NEXT: cmovbl %eax, %esi +; SSSE3-NEXT: movd %esi, %xmm5 +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] +; SSSE3-NEXT: pextrw $5, %xmm3, %ecx +; SSSE3-NEXT: pextrw $5, %xmm2, %esi +; SSSE3-NEXT: subw %cx, %si +; SSSE3-NEXT: cmovbl %eax, %esi +; SSSE3-NEXT: movd %esi, %xmm6 +; SSSE3-NEXT: pextrw $4, %xmm3, %ecx +; SSSE3-NEXT: pextrw $4, %xmm2, %esi +; SSSE3-NEXT: subw %cx, %si +; SSSE3-NEXT: cmovbl %eax, %esi +; SSSE3-NEXT: movd %esi, %xmm4 +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1],xmm4[2],xmm6[2],xmm4[3],xmm6[3] +; SSSE3-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1] +; SSSE3-NEXT: pextrw $3, %xmm3, %ecx +; SSSE3-NEXT: pextrw $3, %xmm2, %esi +; SSSE3-NEXT: subw %cx, %si +; SSSE3-NEXT: cmovbl %eax, %esi +; SSSE3-NEXT: movd %esi, %xmm5 +; SSSE3-NEXT: pextrw $2, %xmm3, %ecx +; SSSE3-NEXT: pextrw $2, %xmm2, %esi +; SSSE3-NEXT: subw %cx, %si +; SSSE3-NEXT: cmovbl %eax, %esi +; SSSE3-NEXT: movd %esi, %xmm6 +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] +; SSSE3-NEXT: pextrw $1, %xmm3, %ecx +; SSSE3-NEXT: pextrw $1, %xmm2, %esi +; SSSE3-NEXT: subw %cx, %si +; SSSE3-NEXT: cmovbl %eax, %esi +; SSSE3-NEXT: movd %esi, %xmm5 +; SSSE3-NEXT: movd %xmm3, %ecx +; SSSE3-NEXT: movd %xmm2, %esi +; SSSE3-NEXT: subw %cx, %si +; SSSE3-NEXT: cmovbl %eax, %esi +; SSSE3-NEXT: movd %esi, %xmm2 +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3] +; SSSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1] +; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm4[0] +; SSSE3-NEXT: pextrw $1, %xmm1, %ecx +; SSSE3-NEXT: pextrw $1, %xmm0, %esi +; SSSE3-NEXT: subw %cx, %si +; SSSE3-NEXT: cmovbl %eax, %esi +; SSSE3-NEXT: movd %xmm1, %ecx +; SSSE3-NEXT: movd %xmm0, %edi +; SSSE3-NEXT: subw %cx, %di +; SSSE3-NEXT: cmovbl %eax, %edi +; SSSE3-NEXT: movd %edi, %xmm3 +; SSSE3-NEXT: pinsrw $1, %esi, %xmm3 +; SSSE3-NEXT: pextrw $2, %xmm1, %ecx +; SSSE3-NEXT: pextrw $2, %xmm0, %esi +; SSSE3-NEXT: subw %cx, %si +; SSSE3-NEXT: cmovbl %eax, %esi +; SSSE3-NEXT: pinsrw $2, %esi, %xmm3 +; SSSE3-NEXT: pextrw $3, %xmm1, %ecx +; SSSE3-NEXT: pextrw $3, %xmm0, %esi +; SSSE3-NEXT: subw %cx, %si +; SSSE3-NEXT: cmovbl %eax, %esi +; SSSE3-NEXT: pinsrw $3, %esi, %xmm3 +; SSSE3-NEXT: movq %xmm3, 16(%rdx) +; 
SSSE3-NEXT: movdqa %xmm2, (%rdx) +; SSSE3-NEXT: retq +; +; SSE41-LABEL: v12i16: +; SSE41: # %bb.0: +; SSE41-NEXT: movdqa (%rdi), %xmm3 +; SSE41-NEXT: movdqa 16(%rdi), %xmm0 +; SSE41-NEXT: movdqa (%rsi), %xmm4 +; SSE41-NEXT: movdqa 16(%rsi), %xmm1 +; SSE41-NEXT: pextrw $1, %xmm4, %ecx +; SSE41-NEXT: pextrw $1, %xmm3, %esi +; SSE41-NEXT: xorl %eax, %eax +; SSE41-NEXT: subw %cx, %si +; SSE41-NEXT: cmovbl %eax, %esi +; SSE41-NEXT: movd %xmm4, %ecx +; SSE41-NEXT: movd %xmm3, %edi +; SSE41-NEXT: subw %cx, %di +; SSE41-NEXT: cmovbl %eax, %edi +; SSE41-NEXT: movd %edi, %xmm2 +; SSE41-NEXT: pinsrw $1, %esi, %xmm2 +; SSE41-NEXT: pextrw $2, %xmm4, %ecx +; SSE41-NEXT: pextrw $2, %xmm3, %esi +; SSE41-NEXT: subw %cx, %si +; SSE41-NEXT: cmovbl %eax, %esi +; SSE41-NEXT: pinsrw $2, %esi, %xmm2 +; SSE41-NEXT: pextrw $3, %xmm4, %ecx +; SSE41-NEXT: pextrw $3, %xmm3, %esi +; SSE41-NEXT: subw %cx, %si +; SSE41-NEXT: cmovbl %eax, %esi +; SSE41-NEXT: pinsrw $3, %esi, %xmm2 +; SSE41-NEXT: pextrw $4, %xmm4, %ecx +; SSE41-NEXT: pextrw $4, %xmm3, %esi +; SSE41-NEXT: subw %cx, %si +; SSE41-NEXT: cmovbl %eax, %esi +; SSE41-NEXT: pinsrw $4, %esi, %xmm2 +; SSE41-NEXT: pextrw $5, %xmm4, %ecx +; SSE41-NEXT: pextrw $5, %xmm3, %esi +; SSE41-NEXT: subw %cx, %si +; SSE41-NEXT: cmovbl %eax, %esi +; SSE41-NEXT: pinsrw $5, %esi, %xmm2 +; SSE41-NEXT: pextrw $6, %xmm4, %ecx +; SSE41-NEXT: pextrw $6, %xmm3, %esi +; SSE41-NEXT: subw %cx, %si +; SSE41-NEXT: cmovbl %eax, %esi +; SSE41-NEXT: pinsrw $6, %esi, %xmm2 +; SSE41-NEXT: pextrw $7, %xmm4, %ecx +; SSE41-NEXT: pextrw $7, %xmm3, %esi +; SSE41-NEXT: subw %cx, %si +; SSE41-NEXT: cmovbl %eax, %esi +; SSE41-NEXT: pinsrw $7, %esi, %xmm2 +; SSE41-NEXT: pextrw $1, %xmm1, %ecx +; SSE41-NEXT: pextrw $1, %xmm0, %esi +; SSE41-NEXT: subw %cx, %si +; SSE41-NEXT: cmovbl %eax, %esi +; SSE41-NEXT: movd %xmm1, %ecx +; SSE41-NEXT: movd %xmm0, %edi +; SSE41-NEXT: subw %cx, %di +; SSE41-NEXT: cmovbl %eax, %edi +; SSE41-NEXT: movd %edi, %xmm3 +; SSE41-NEXT: pinsrw $1, %esi, %xmm3 +; SSE41-NEXT: pextrw $2, %xmm1, %ecx +; SSE41-NEXT: pextrw $2, %xmm0, %esi +; SSE41-NEXT: subw %cx, %si +; SSE41-NEXT: cmovbl %eax, %esi +; SSE41-NEXT: pinsrw $2, %esi, %xmm3 +; SSE41-NEXT: pextrw $3, %xmm1, %ecx +; SSE41-NEXT: pextrw $3, %xmm0, %esi +; SSE41-NEXT: subw %cx, %si +; SSE41-NEXT: cmovbl %eax, %esi +; SSE41-NEXT: pinsrw $3, %esi, %xmm3 +; SSE41-NEXT: movq %xmm3, 16(%rdx) +; SSE41-NEXT: movdqa %xmm2, (%rdx) +; SSE41-NEXT: retq +; +; AVX-LABEL: v12i16: +; AVX: # %bb.0: +; AVX-NEXT: pushq %rbp +; AVX-NEXT: pushq %rbx +; AVX-NEXT: vmovdqa (%rsi), %xmm0 +; AVX-NEXT: vmovdqa 16(%rsi), %xmm2 +; AVX-NEXT: vpextrw $1, %xmm2, %eax +; AVX-NEXT: vmovdqa (%rdi), %xmm1 +; AVX-NEXT: vmovdqa 16(%rdi), %xmm3 +; AVX-NEXT: vpextrw $1, %xmm3, %ecx +; AVX-NEXT: xorl %r8d, %r8d +; AVX-NEXT: subw %ax, %cx +; AVX-NEXT: cmovbl %r8d, %ecx +; AVX-NEXT: vmovd %xmm2, %eax +; AVX-NEXT: vmovd %xmm3, %esi +; AVX-NEXT: subw %ax, %si +; AVX-NEXT: cmovbl %r8d, %esi +; AVX-NEXT: vmovd %esi, %xmm4 +; AVX-NEXT: vpinsrw $1, %ecx, %xmm4, %xmm4 +; AVX-NEXT: vpextrw $2, %xmm2, %eax +; AVX-NEXT: vpextrw $2, %xmm3, %ecx +; AVX-NEXT: subw %ax, %cx +; AVX-NEXT: cmovbl %r8d, %ecx +; AVX-NEXT: vpinsrw $2, %ecx, %xmm4, %xmm4 +; AVX-NEXT: vpextrw $3, %xmm2, %eax +; AVX-NEXT: vpextrw $3, %xmm3, %ecx +; AVX-NEXT: subw %ax, %cx +; AVX-NEXT: cmovbl %r8d, %ecx +; AVX-NEXT: vpinsrw $3, %ecx, %xmm4, %xmm4 +; AVX-NEXT: vpextrw $4, %xmm2, %eax +; AVX-NEXT: vpextrw $4, %xmm3, %ecx +; AVX-NEXT: subw %ax, %cx +; AVX-NEXT: cmovbl %r8d, %ecx +; AVX-NEXT: vpinsrw $4, %ecx, 
%xmm4, %xmm4 +; AVX-NEXT: vpextrw $5, %xmm2, %eax +; AVX-NEXT: vpextrw $5, %xmm3, %ecx +; AVX-NEXT: subw %ax, %cx +; AVX-NEXT: cmovbl %r8d, %ecx +; AVX-NEXT: vpinsrw $5, %ecx, %xmm4, %xmm4 +; AVX-NEXT: vpextrw $6, %xmm2, %eax +; AVX-NEXT: vpextrw $6, %xmm3, %ecx +; AVX-NEXT: subw %ax, %cx +; AVX-NEXT: cmovbl %r8d, %ecx +; AVX-NEXT: vpinsrw $6, %ecx, %xmm4, %xmm4 +; AVX-NEXT: vpextrw $7, %xmm2, %eax +; AVX-NEXT: vpextrw $7, %xmm3, %ecx +; AVX-NEXT: subw %ax, %cx +; AVX-NEXT: cmovbl %r8d, %ecx +; AVX-NEXT: vpinsrw $7, %ecx, %xmm4, %xmm2 +; AVX-NEXT: vpextrw $7, %xmm0, %eax +; AVX-NEXT: vpextrw $7, %xmm1, %r9d +; AVX-NEXT: subw %ax, %r9w +; AVX-NEXT: cmovbl %r8d, %r9d +; AVX-NEXT: vpextrw $6, %xmm0, %eax +; AVX-NEXT: vpextrw $6, %xmm1, %r10d +; AVX-NEXT: subw %ax, %r10w +; AVX-NEXT: cmovbl %r8d, %r10d +; AVX-NEXT: vpextrw $5, %xmm0, %eax +; AVX-NEXT: vpextrw $5, %xmm1, %edi +; AVX-NEXT: subw %ax, %di +; AVX-NEXT: cmovbl %r8d, %edi +; AVX-NEXT: vpextrw $4, %xmm0, %ecx +; AVX-NEXT: vpextrw $4, %xmm1, %eax +; AVX-NEXT: subw %cx, %ax +; AVX-NEXT: cmovbl %r8d, %eax +; AVX-NEXT: vpextrw $3, %xmm0, %esi +; AVX-NEXT: vpextrw $3, %xmm1, %ecx +; AVX-NEXT: subw %si, %cx +; AVX-NEXT: cmovbl %r8d, %ecx +; AVX-NEXT: vpextrw $2, %xmm0, %r11d +; AVX-NEXT: vpextrw $2, %xmm1, %esi +; AVX-NEXT: subw %r11w, %si +; AVX-NEXT: cmovbl %r8d, %esi +; AVX-NEXT: vpextrw $1, %xmm0, %r11d +; AVX-NEXT: vpextrw $1, %xmm1, %ebx +; AVX-NEXT: subw %r11w, %bx +; AVX-NEXT: cmovbl %r8d, %ebx +; AVX-NEXT: vmovd %xmm0, %r11d +; AVX-NEXT: vmovd %xmm1, %ebp +; AVX-NEXT: subw %r11w, %bp +; AVX-NEXT: cmovbl %r8d, %ebp +; AVX-NEXT: vmovq %xmm2, 16(%rdx) +; AVX-NEXT: vmovd %ebp, %xmm0 +; AVX-NEXT: vpinsrw $1, %ebx, %xmm0, %xmm0 +; AVX-NEXT: vpinsrw $2, %esi, %xmm0, %xmm0 +; AVX-NEXT: vpinsrw $3, %ecx, %xmm0, %xmm0 +; AVX-NEXT: vpinsrw $4, %eax, %xmm0, %xmm0 +; AVX-NEXT: vpinsrw $5, %edi, %xmm0, %xmm0 +; AVX-NEXT: vpinsrw $6, %r10d, %xmm0, %xmm0 +; AVX-NEXT: vpinsrw $7, %r9d, %xmm0, %xmm0 +; AVX-NEXT: vmovdqa %xmm0, (%rdx) +; AVX-NEXT: popq %rbx +; AVX-NEXT: popq %rbp +; AVX-NEXT: retq + %x = load <12 x i16>, <12 x i16>* %px + %y = load <12 x i16>, <12 x i16>* %py + %z = call <12 x i16> @llvm.usub.sat.v12i16(<12 x i16> %x, <12 x i16> %y) + store <12 x i16> %z, <12 x i16>* %pz + ret void +} + +; Scalarization + +define void @v1i8(<1 x i8>* %px, <1 x i8>* %py, <1 x i8>* %pz) nounwind { +; SSE-LABEL: v1i8: +; SSE: # %bb.0: +; SSE-NEXT: movb (%rdi), %al +; SSE-NEXT: subb (%rsi), %al +; SSE-NEXT: jae .LBB13_2 +; SSE-NEXT: # %bb.1: +; SSE-NEXT: xorl %eax, %eax +; SSE-NEXT: .LBB13_2: +; SSE-NEXT: movb %al, (%rdx) +; SSE-NEXT: retq +; +; AVX-LABEL: v1i8: +; AVX: # %bb.0: +; AVX-NEXT: movb (%rdi), %al +; AVX-NEXT: subb (%rsi), %al +; AVX-NEXT: jae .LBB13_2 +; AVX-NEXT: # %bb.1: +; AVX-NEXT: xorl %eax, %eax +; AVX-NEXT: .LBB13_2: +; AVX-NEXT: movb %al, (%rdx) +; AVX-NEXT: retq + %x = load <1 x i8>, <1 x i8>* %px + %y = load <1 x i8>, <1 x i8>* %py + %z = call <1 x i8> @llvm.usub.sat.v1i8(<1 x i8> %x, <1 x i8> %y) + store <1 x i8> %z, <1 x i8>* %pz + ret void +} + +define void @v1i16(<1 x i16>* %px, <1 x i16>* %py, <1 x i16>* %pz) nounwind { +; SSE-LABEL: v1i16: +; SSE: # %bb.0: +; SSE-NEXT: movzwl (%rdi), %eax +; SSE-NEXT: xorl %ecx, %ecx +; SSE-NEXT: subw (%rsi), %ax +; SSE-NEXT: cmovbl %ecx, %eax +; SSE-NEXT: movw %ax, (%rdx) +; SSE-NEXT: retq +; +; AVX-LABEL: v1i16: +; AVX: # %bb.0: +; AVX-NEXT: movzwl (%rdi), %eax +; AVX-NEXT: xorl %ecx, %ecx +; AVX-NEXT: subw (%rsi), %ax +; AVX-NEXT: cmovbl %ecx, %eax +; AVX-NEXT: movw %ax, (%rdx) +; AVX-NEXT: 
retq + %x = load <1 x i16>, <1 x i16>* %px + %y = load <1 x i16>, <1 x i16>* %py + %z = call <1 x i16> @llvm.usub.sat.v1i16(<1 x i16> %x, <1 x i16> %y) + store <1 x i16> %z, <1 x i16>* %pz + ret void +} + +; Promotion + +define <16 x i4> @v16i4(<16 x i4> %x, <16 x i4> %y) nounwind { +; SSE2-LABEL: v16i4: +; SSE2: # %bb.0: +; SSE2-NEXT: pushq %rbp +; SSE2-NEXT: pushq %r15 +; SSE2-NEXT: pushq %r14 +; SSE2-NEXT: pushq %r13 +; SSE2-NEXT: pushq %r12 +; SSE2-NEXT: pushq %rbx +; SSE2-NEXT: psllw $4, %xmm0 +; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240] +; SSE2-NEXT: pand %xmm2, %xmm0 +; SSE2-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp) +; SSE2-NEXT: psllw $4, %xmm1 +; SSE2-NEXT: pand %xmm2, %xmm1 +; SSE2-NEXT: movdqa %xmm1, -{{[0-9]+}}(%rsp) +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSE2-NEXT: subb -{{[0-9]+}}(%rsp), %cl +; SSE2-NEXT: movl $0, %eax +; SSE2-NEXT: jb .LBB15_2 +; SSE2-NEXT: # %bb.1: +; SSE2-NEXT: movl %ecx, %eax +; SSE2-NEXT: .LBB15_2: +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %dl +; SSE2-NEXT: subb -{{[0-9]+}}(%rsp), %dl +; SSE2-NEXT: movl $0, %ecx +; SSE2-NEXT: jb .LBB15_4 +; SSE2-NEXT: # %bb.3: +; SSE2-NEXT: movl %edx, %ecx +; SSE2-NEXT: .LBB15_4: +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %bl +; SSE2-NEXT: subb -{{[0-9]+}}(%rsp), %bl +; SSE2-NEXT: movl $0, %edx +; SSE2-NEXT: jb .LBB15_6 +; SSE2-NEXT: # %bb.5: +; SSE2-NEXT: movl %ebx, %edx +; SSE2-NEXT: .LBB15_6: +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %bl +; SSE2-NEXT: subb -{{[0-9]+}}(%rsp), %bl +; SSE2-NEXT: movl $0, %esi +; SSE2-NEXT: jb .LBB15_8 +; SSE2-NEXT: # %bb.7: +; SSE2-NEXT: movl %ebx, %esi +; SSE2-NEXT: .LBB15_8: +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %bl +; SSE2-NEXT: subb -{{[0-9]+}}(%rsp), %bl +; SSE2-NEXT: movl $0, %edi +; SSE2-NEXT: jb .LBB15_10 +; SSE2-NEXT: # %bb.9: +; SSE2-NEXT: movl %ebx, %edi +; SSE2-NEXT: .LBB15_10: +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %bl +; SSE2-NEXT: subb -{{[0-9]+}}(%rsp), %bl +; SSE2-NEXT: movl $0, %r12d +; SSE2-NEXT: jb .LBB15_12 +; SSE2-NEXT: # %bb.11: +; SSE2-NEXT: movl %ebx, %r12d +; SSE2-NEXT: .LBB15_12: +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %bl +; SSE2-NEXT: subb -{{[0-9]+}}(%rsp), %bl +; SSE2-NEXT: movl $0, %r8d +; SSE2-NEXT: jb .LBB15_14 +; SSE2-NEXT: # %bb.13: +; SSE2-NEXT: movl %ebx, %r8d +; SSE2-NEXT: .LBB15_14: +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %bl +; SSE2-NEXT: subb -{{[0-9]+}}(%rsp), %bl +; SSE2-NEXT: movl $0, %r10d +; SSE2-NEXT: jb .LBB15_16 +; SSE2-NEXT: # %bb.15: +; SSE2-NEXT: movl %ebx, %r10d +; SSE2-NEXT: .LBB15_16: +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %bl +; SSE2-NEXT: subb -{{[0-9]+}}(%rsp), %bl +; SSE2-NEXT: movl $0, %r13d +; SSE2-NEXT: jb .LBB15_18 +; SSE2-NEXT: # %bb.17: +; SSE2-NEXT: movl %ebx, %r13d +; SSE2-NEXT: .LBB15_18: +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %bl +; SSE2-NEXT: subb -{{[0-9]+}}(%rsp), %bl +; SSE2-NEXT: movl $0, %r9d +; SSE2-NEXT: jb .LBB15_20 +; SSE2-NEXT: # %bb.19: +; SSE2-NEXT: movl %ebx, %r9d +; SSE2-NEXT: .LBB15_20: +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %bl +; SSE2-NEXT: subb -{{[0-9]+}}(%rsp), %bl +; SSE2-NEXT: movl $0, %r11d +; SSE2-NEXT: jb .LBB15_22 +; SSE2-NEXT: # %bb.21: +; SSE2-NEXT: movl %ebx, %r11d +; SSE2-NEXT: .LBB15_22: +; SSE2-NEXT: movzbl %al, %r14d +; SSE2-NEXT: movzbl %cl, %r15d +; SSE2-NEXT: movzbl %dl, %edx +; SSE2-NEXT: movzbl %sil, %esi +; SSE2-NEXT: movzbl %dil, %ebx +; SSE2-NEXT: movzbl %r12b, %ebp +; SSE2-NEXT: movzbl %r8b, %edi +; SSE2-NEXT: movzbl %r10b, %r8d +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSE2-NEXT: subb -{{[0-9]+}}(%rsp), %cl +; SSE2-NEXT: movl $0, %eax 
+; SSE2-NEXT: jb .LBB15_24 +; SSE2-NEXT: # %bb.23: +; SSE2-NEXT: movl %ecx, %eax +; SSE2-NEXT: .LBB15_24: +; SSE2-NEXT: movd %r14d, %xmm2 +; SSE2-NEXT: movd %r15d, %xmm3 +; SSE2-NEXT: movd %edx, %xmm5 +; SSE2-NEXT: movd %esi, %xmm0 +; SSE2-NEXT: movd %ebx, %xmm6 +; SSE2-NEXT: movd %ebp, %xmm4 +; SSE2-NEXT: movd %edi, %xmm7 +; SSE2-NEXT: movd %r8d, %xmm1 +; SSE2-NEXT: movzbl %r13b, %ebp +; SSE2-NEXT: movzbl %r9b, %ecx +; SSE2-NEXT: movzbl %r11b, %edx +; SSE2-NEXT: movzbl %al, %esi +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSE2-NEXT: subb -{{[0-9]+}}(%rsp), %al +; SSE2-NEXT: movl $0, %edi +; SSE2-NEXT: jb .LBB15_26 +; SSE2-NEXT: # %bb.25: +; SSE2-NEXT: movl %eax, %edi +; SSE2-NEXT: .LBB15_26: +; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3],xmm0[4],xmm5[4],xmm0[5],xmm5[5],xmm0[6],xmm5[6],xmm0[7],xmm5[7] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1],xmm4[2],xmm6[2],xmm4[3],xmm6[3],xmm4[4],xmm6[4],xmm4[5],xmm6[5],xmm4[6],xmm6[6],xmm4[7],xmm6[7] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm7[0],xmm1[1],xmm7[1],xmm1[2],xmm7[2],xmm1[3],xmm7[3],xmm1[4],xmm7[4],xmm1[5],xmm7[5],xmm1[6],xmm7[6],xmm1[7],xmm7[7] +; SSE2-NEXT: movd %ebp, %xmm6 +; SSE2-NEXT: movd %ecx, %xmm5 +; SSE2-NEXT: movd %edx, %xmm7 +; SSE2-NEXT: movd %esi, %xmm2 +; SSE2-NEXT: movzbl %dil, %eax +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %dl +; SSE2-NEXT: subb -{{[0-9]+}}(%rsp), %dl +; SSE2-NEXT: movl $0, %ecx +; SSE2-NEXT: jb .LBB15_28 +; SSE2-NEXT: # %bb.27: +; SSE2-NEXT: movl %edx, %ecx +; SSE2-NEXT: .LBB15_28: +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3],xmm5[4],xmm6[4],xmm5[5],xmm6[5],xmm5[6],xmm6[6],xmm5[7],xmm6[7] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm7[0],xmm2[1],xmm7[1],xmm2[2],xmm7[2],xmm2[3],xmm7[3],xmm2[4],xmm7[4],xmm2[5],xmm7[5],xmm2[6],xmm7[6],xmm2[7],xmm7[7] +; SSE2-NEXT: movd %eax, %xmm4 +; SSE2-NEXT: movzbl %cl, %eax +; SSE2-NEXT: movd %eax, %xmm3 +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %dl +; SSE2-NEXT: subb -{{[0-9]+}}(%rsp), %dl +; SSE2-NEXT: movl $0, %ecx +; SSE2-NEXT: jb .LBB15_30 +; SSE2-NEXT: # %bb.29: +; SSE2-NEXT: movl %edx, %ecx +; SSE2-NEXT: .LBB15_30: +; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3],xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] +; SSE2-NEXT: movzbl %cl, %ecx +; SSE2-NEXT: movd %ecx, %xmm4 +; SSE2-NEXT: subb -{{[0-9]+}}(%rsp), %al +; SSE2-NEXT: movl $0, %ecx +; SSE2-NEXT: jb .LBB15_32 +; SSE2-NEXT: # %bb.31: +; SSE2-NEXT: movl %eax, %ecx +; SSE2-NEXT: .LBB15_32: +; SSE2-NEXT: movzbl %cl, %eax +; SSE2-NEXT: movd %eax, %xmm0 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = 
xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSE2-NEXT: psrlw $4, %xmm0 +; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 +; SSE2-NEXT: popq %rbx +; SSE2-NEXT: popq %r12 +; SSE2-NEXT: popq %r13 +; SSE2-NEXT: popq %r14 +; SSE2-NEXT: popq %r15 +; SSE2-NEXT: popq %rbp +; SSE2-NEXT: retq +; +; SSSE3-LABEL: v16i4: +; SSSE3: # %bb.0: +; SSSE3-NEXT: pushq %rbp +; SSSE3-NEXT: pushq %r15 +; SSSE3-NEXT: pushq %r14 +; SSSE3-NEXT: pushq %r13 +; SSSE3-NEXT: pushq %r12 +; SSSE3-NEXT: pushq %rbx +; SSSE3-NEXT: psllw $4, %xmm0 +; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240] +; SSSE3-NEXT: pand %xmm2, %xmm0 +; SSSE3-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp) +; SSSE3-NEXT: psllw $4, %xmm1 +; SSSE3-NEXT: pand %xmm2, %xmm1 +; SSSE3-NEXT: movdqa %xmm1, -{{[0-9]+}}(%rsp) +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSSE3-NEXT: subb -{{[0-9]+}}(%rsp), %cl +; SSSE3-NEXT: movl $0, %eax +; SSSE3-NEXT: jb .LBB15_2 +; SSSE3-NEXT: # %bb.1: +; SSSE3-NEXT: movl %ecx, %eax +; SSSE3-NEXT: .LBB15_2: +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %dl +; SSSE3-NEXT: subb -{{[0-9]+}}(%rsp), %dl +; SSSE3-NEXT: movl $0, %ecx +; SSSE3-NEXT: jb .LBB15_4 +; SSSE3-NEXT: # %bb.3: +; SSSE3-NEXT: movl %edx, %ecx +; SSSE3-NEXT: .LBB15_4: +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %bl +; SSSE3-NEXT: subb -{{[0-9]+}}(%rsp), %bl +; SSSE3-NEXT: movl $0, %edx +; SSSE3-NEXT: jb .LBB15_6 +; SSSE3-NEXT: # %bb.5: +; SSSE3-NEXT: movl %ebx, %edx +; SSSE3-NEXT: .LBB15_6: +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %bl +; SSSE3-NEXT: subb -{{[0-9]+}}(%rsp), %bl +; SSSE3-NEXT: movl $0, %esi +; SSSE3-NEXT: jb .LBB15_8 +; SSSE3-NEXT: # %bb.7: +; SSSE3-NEXT: movl %ebx, %esi +; SSSE3-NEXT: .LBB15_8: +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %bl +; SSSE3-NEXT: subb -{{[0-9]+}}(%rsp), %bl +; SSSE3-NEXT: movl $0, %edi +; SSSE3-NEXT: jb .LBB15_10 +; SSSE3-NEXT: # %bb.9: +; SSSE3-NEXT: movl %ebx, %edi +; SSSE3-NEXT: .LBB15_10: +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %bl +; SSSE3-NEXT: subb -{{[0-9]+}}(%rsp), %bl +; SSSE3-NEXT: movl $0, %r12d +; SSSE3-NEXT: jb .LBB15_12 +; SSSE3-NEXT: # %bb.11: +; SSSE3-NEXT: movl %ebx, %r12d +; SSSE3-NEXT: .LBB15_12: +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %bl +; SSSE3-NEXT: subb -{{[0-9]+}}(%rsp), %bl +; SSSE3-NEXT: movl $0, %r8d +; SSSE3-NEXT: jb .LBB15_14 +; SSSE3-NEXT: # %bb.13: +; SSSE3-NEXT: movl %ebx, %r8d +; SSSE3-NEXT: .LBB15_14: +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %bl +; SSSE3-NEXT: subb -{{[0-9]+}}(%rsp), %bl +; SSSE3-NEXT: movl $0, %r10d +; SSSE3-NEXT: jb .LBB15_16 +; SSSE3-NEXT: # %bb.15: +; SSSE3-NEXT: movl %ebx, %r10d +; SSSE3-NEXT: .LBB15_16: +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %bl +; SSSE3-NEXT: subb -{{[0-9]+}}(%rsp), %bl +; SSSE3-NEXT: movl $0, %r13d +; SSSE3-NEXT: jb .LBB15_18 +; SSSE3-NEXT: # %bb.17: +; SSSE3-NEXT: movl %ebx, %r13d +; SSSE3-NEXT: .LBB15_18: +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %bl +; SSSE3-NEXT: subb -{{[0-9]+}}(%rsp), %bl +; SSSE3-NEXT: movl $0, %r9d +; SSSE3-NEXT: jb .LBB15_20 +; SSSE3-NEXT: # %bb.19: +; SSSE3-NEXT: movl %ebx, %r9d +; SSSE3-NEXT: .LBB15_20: +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %bl +; SSSE3-NEXT: subb -{{[0-9]+}}(%rsp), %bl +; SSSE3-NEXT: movl $0, %r11d +; SSSE3-NEXT: jb .LBB15_22 +; SSSE3-NEXT: # %bb.21: +; SSSE3-NEXT: movl %ebx, %r11d +; SSSE3-NEXT: .LBB15_22: +; SSSE3-NEXT: movzbl %al, %r14d +; SSSE3-NEXT: movzbl %cl, %r15d +; SSSE3-NEXT: movzbl %dl, %edx +; SSSE3-NEXT: movzbl %sil, %esi 
+; SSSE3-NEXT: movzbl %dil, %ebx +; SSSE3-NEXT: movzbl %r12b, %ebp +; SSSE3-NEXT: movzbl %r8b, %edi +; SSSE3-NEXT: movzbl %r10b, %r8d +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSSE3-NEXT: subb -{{[0-9]+}}(%rsp), %cl +; SSSE3-NEXT: movl $0, %eax +; SSSE3-NEXT: jb .LBB15_24 +; SSSE3-NEXT: # %bb.23: +; SSSE3-NEXT: movl %ecx, %eax +; SSSE3-NEXT: .LBB15_24: +; SSSE3-NEXT: movd %r14d, %xmm2 +; SSSE3-NEXT: movd %r15d, %xmm3 +; SSSE3-NEXT: movd %edx, %xmm5 +; SSSE3-NEXT: movd %esi, %xmm0 +; SSSE3-NEXT: movd %ebx, %xmm6 +; SSSE3-NEXT: movd %ebp, %xmm4 +; SSSE3-NEXT: movd %edi, %xmm7 +; SSSE3-NEXT: movd %r8d, %xmm1 +; SSSE3-NEXT: movzbl %r13b, %ebp +; SSSE3-NEXT: movzbl %r9b, %ecx +; SSSE3-NEXT: movzbl %r11b, %edx +; SSSE3-NEXT: movzbl %al, %esi +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSSE3-NEXT: subb -{{[0-9]+}}(%rsp), %al +; SSSE3-NEXT: movl $0, %edi +; SSSE3-NEXT: jb .LBB15_26 +; SSSE3-NEXT: # %bb.25: +; SSSE3-NEXT: movl %eax, %edi +; SSSE3-NEXT: .LBB15_26: +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3],xmm0[4],xmm5[4],xmm0[5],xmm5[5],xmm0[6],xmm5[6],xmm0[7],xmm5[7] +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1],xmm4[2],xmm6[2],xmm4[3],xmm6[3],xmm4[4],xmm6[4],xmm4[5],xmm6[5],xmm4[6],xmm6[6],xmm4[7],xmm6[7] +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm7[0],xmm1[1],xmm7[1],xmm1[2],xmm7[2],xmm1[3],xmm7[3],xmm1[4],xmm7[4],xmm1[5],xmm7[5],xmm1[6],xmm7[6],xmm1[7],xmm7[7] +; SSSE3-NEXT: movd %ebp, %xmm6 +; SSSE3-NEXT: movd %ecx, %xmm5 +; SSSE3-NEXT: movd %edx, %xmm7 +; SSSE3-NEXT: movd %esi, %xmm2 +; SSSE3-NEXT: movzbl %dil, %eax +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %dl +; SSSE3-NEXT: subb -{{[0-9]+}}(%rsp), %dl +; SSSE3-NEXT: movl $0, %ecx +; SSSE3-NEXT: jb .LBB15_28 +; SSSE3-NEXT: # %bb.27: +; SSSE3-NEXT: movl %edx, %ecx +; SSSE3-NEXT: .LBB15_28: +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3] +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3],xmm5[4],xmm6[4],xmm5[5],xmm6[5],xmm5[6],xmm6[6],xmm5[7],xmm6[7] +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm7[0],xmm2[1],xmm7[1],xmm2[2],xmm7[2],xmm2[3],xmm7[3],xmm2[4],xmm7[4],xmm2[5],xmm7[5],xmm2[6],xmm7[6],xmm2[7],xmm7[7] +; SSSE3-NEXT: movd %eax, %xmm4 +; SSSE3-NEXT: movzbl %cl, %eax +; SSSE3-NEXT: movd %eax, %xmm3 +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %dl +; SSSE3-NEXT: subb -{{[0-9]+}}(%rsp), %dl +; SSSE3-NEXT: movl $0, %ecx +; SSSE3-NEXT: jb .LBB15_30 +; SSSE3-NEXT: # %bb.29: +; SSSE3-NEXT: movl %edx, %ecx +; SSSE3-NEXT: .LBB15_30: +; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3] +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3],xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] +; SSSE3-NEXT: movzbl %cl, %ecx +; SSSE3-NEXT: movd %ecx, %xmm4 +; SSSE3-NEXT: subb -{{[0-9]+}}(%rsp), %al +; SSSE3-NEXT: movl $0, %ecx +; SSSE3-NEXT: jb .LBB15_32 +; SSSE3-NEXT: # %bb.31: +; SSSE3-NEXT: movl %eax, %ecx +; SSSE3-NEXT: .LBB15_32: +; 
SSSE3-NEXT: movzbl %cl, %eax +; SSSE3-NEXT: movd %eax, %xmm0 +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7] +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] +; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSSE3-NEXT: psrlw $4, %xmm0 +; SSSE3-NEXT: pand {{.*}}(%rip), %xmm0 +; SSSE3-NEXT: popq %rbx +; SSSE3-NEXT: popq %r12 +; SSSE3-NEXT: popq %r13 +; SSSE3-NEXT: popq %r14 +; SSSE3-NEXT: popq %r15 +; SSSE3-NEXT: popq %rbp +; SSSE3-NEXT: retq +; +; SSE41-LABEL: v16i4: +; SSE41: # %bb.0: +; SSE41-NEXT: psllw $4, %xmm1 +; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240] +; SSE41-NEXT: pand %xmm2, %xmm1 +; SSE41-NEXT: pextrb $1, %xmm1, %edx +; SSE41-NEXT: psllw $4, %xmm0 +; SSE41-NEXT: pand %xmm2, %xmm0 +; SSE41-NEXT: pextrb $1, %xmm0, %ecx +; SSE41-NEXT: xorl %eax, %eax +; SSE41-NEXT: subb %dl, %cl +; SSE41-NEXT: movl $0, %edx +; SSE41-NEXT: jb .LBB15_2 +; SSE41-NEXT: # %bb.1: +; SSE41-NEXT: movl %ecx, %edx +; SSE41-NEXT: .LBB15_2: +; SSE41-NEXT: movzbl %dl, %ecx +; SSE41-NEXT: pextrb $0, %xmm1, %esi +; SSE41-NEXT: pextrb $0, %xmm0, %edx +; SSE41-NEXT: subb %sil, %dl +; SSE41-NEXT: movl $0, %esi +; SSE41-NEXT: jb .LBB15_4 +; SSE41-NEXT: # %bb.3: +; SSE41-NEXT: movl %edx, %esi +; SSE41-NEXT: .LBB15_4: +; SSE41-NEXT: movzbl %sil, %edx +; SSE41-NEXT: movd %edx, %xmm2 +; SSE41-NEXT: pinsrb $1, %ecx, %xmm2 +; SSE41-NEXT: pextrb $2, %xmm1, %edx +; SSE41-NEXT: pextrb $2, %xmm0, %ecx +; SSE41-NEXT: subb %dl, %cl +; SSE41-NEXT: movl $0, %edx +; SSE41-NEXT: jb .LBB15_6 +; SSE41-NEXT: # %bb.5: +; SSE41-NEXT: movl %ecx, %edx +; SSE41-NEXT: .LBB15_6: +; SSE41-NEXT: movzbl %dl, %ecx +; SSE41-NEXT: pinsrb $2, %ecx, %xmm2 +; SSE41-NEXT: pextrb $3, %xmm1, %edx +; SSE41-NEXT: pextrb $3, %xmm0, %ecx +; SSE41-NEXT: subb %dl, %cl +; SSE41-NEXT: movl $0, %edx +; SSE41-NEXT: jb .LBB15_8 +; SSE41-NEXT: # %bb.7: +; SSE41-NEXT: movl %ecx, %edx +; SSE41-NEXT: .LBB15_8: +; SSE41-NEXT: movzbl %dl, %ecx +; SSE41-NEXT: pinsrb $3, %ecx, %xmm2 +; SSE41-NEXT: pextrb $4, %xmm1, %edx +; SSE41-NEXT: pextrb $4, %xmm0, %ecx +; SSE41-NEXT: subb %dl, %cl +; SSE41-NEXT: movl $0, %edx +; SSE41-NEXT: jb .LBB15_10 +; SSE41-NEXT: # %bb.9: +; SSE41-NEXT: movl %ecx, %edx +; SSE41-NEXT: .LBB15_10: +; SSE41-NEXT: movzbl %dl, %ecx +; SSE41-NEXT: pinsrb $4, %ecx, %xmm2 +; SSE41-NEXT: pextrb $5, %xmm1, %edx +; SSE41-NEXT: pextrb $5, %xmm0, %ecx +; SSE41-NEXT: subb %dl, %cl +; SSE41-NEXT: movl $0, %edx +; SSE41-NEXT: jb .LBB15_12 +; SSE41-NEXT: # %bb.11: +; SSE41-NEXT: movl %ecx, %edx +; SSE41-NEXT: .LBB15_12: +; SSE41-NEXT: movzbl %dl, %ecx +; SSE41-NEXT: pinsrb $5, %ecx, %xmm2 +; SSE41-NEXT: pextrb $6, %xmm1, %edx +; SSE41-NEXT: pextrb $6, %xmm0, %ecx +; SSE41-NEXT: subb %dl, %cl +; SSE41-NEXT: movl $0, %edx +; SSE41-NEXT: jb .LBB15_14 +; SSE41-NEXT: # %bb.13: +; SSE41-NEXT: movl %ecx, %edx +; SSE41-NEXT: .LBB15_14: +; SSE41-NEXT: movzbl %dl, %ecx +; SSE41-NEXT: pinsrb $6, %ecx, %xmm2 +; SSE41-NEXT: pextrb $7, %xmm1, %edx +; SSE41-NEXT: pextrb $7, %xmm0, %ecx +; SSE41-NEXT: subb %dl, %cl +; SSE41-NEXT: movl $0, %edx +; SSE41-NEXT: jb .LBB15_16 +; SSE41-NEXT: # %bb.15: +; SSE41-NEXT: movl %ecx, %edx +; SSE41-NEXT: .LBB15_16: +; SSE41-NEXT: movzbl %dl, %ecx +; SSE41-NEXT: pinsrb $7, %ecx, %xmm2 +; SSE41-NEXT: pextrb $8, %xmm1, %edx +; 
SSE41-NEXT: pextrb $8, %xmm0, %ecx +; SSE41-NEXT: subb %dl, %cl +; SSE41-NEXT: movl $0, %edx +; SSE41-NEXT: jb .LBB15_18 +; SSE41-NEXT: # %bb.17: +; SSE41-NEXT: movl %ecx, %edx +; SSE41-NEXT: .LBB15_18: +; SSE41-NEXT: movzbl %dl, %ecx +; SSE41-NEXT: pinsrb $8, %ecx, %xmm2 +; SSE41-NEXT: pextrb $9, %xmm1, %edx +; SSE41-NEXT: pextrb $9, %xmm0, %ecx +; SSE41-NEXT: subb %dl, %cl +; SSE41-NEXT: movl $0, %edx +; SSE41-NEXT: jb .LBB15_20 +; SSE41-NEXT: # %bb.19: +; SSE41-NEXT: movl %ecx, %edx +; SSE41-NEXT: .LBB15_20: +; SSE41-NEXT: movzbl %dl, %ecx +; SSE41-NEXT: pinsrb $9, %ecx, %xmm2 +; SSE41-NEXT: pextrb $10, %xmm1, %edx +; SSE41-NEXT: pextrb $10, %xmm0, %ecx +; SSE41-NEXT: subb %dl, %cl +; SSE41-NEXT: movl $0, %edx +; SSE41-NEXT: jb .LBB15_22 +; SSE41-NEXT: # %bb.21: +; SSE41-NEXT: movl %ecx, %edx +; SSE41-NEXT: .LBB15_22: +; SSE41-NEXT: movzbl %dl, %ecx +; SSE41-NEXT: pinsrb $10, %ecx, %xmm2 +; SSE41-NEXT: pextrb $11, %xmm1, %edx +; SSE41-NEXT: pextrb $11, %xmm0, %ecx +; SSE41-NEXT: subb %dl, %cl +; SSE41-NEXT: movl $0, %edx +; SSE41-NEXT: jb .LBB15_24 +; SSE41-NEXT: # %bb.23: +; SSE41-NEXT: movl %ecx, %edx +; SSE41-NEXT: .LBB15_24: +; SSE41-NEXT: movzbl %dl, %ecx +; SSE41-NEXT: pinsrb $11, %ecx, %xmm2 +; SSE41-NEXT: pextrb $12, %xmm1, %edx +; SSE41-NEXT: pextrb $12, %xmm0, %ecx +; SSE41-NEXT: subb %dl, %cl +; SSE41-NEXT: movl $0, %edx +; SSE41-NEXT: jb .LBB15_26 +; SSE41-NEXT: # %bb.25: +; SSE41-NEXT: movl %ecx, %edx +; SSE41-NEXT: .LBB15_26: +; SSE41-NEXT: movzbl %dl, %ecx +; SSE41-NEXT: pinsrb $12, %ecx, %xmm2 +; SSE41-NEXT: pextrb $13, %xmm1, %edx +; SSE41-NEXT: pextrb $13, %xmm0, %ecx +; SSE41-NEXT: subb %dl, %cl +; SSE41-NEXT: movl $0, %edx +; SSE41-NEXT: jb .LBB15_28 +; SSE41-NEXT: # %bb.27: +; SSE41-NEXT: movl %ecx, %edx +; SSE41-NEXT: .LBB15_28: +; SSE41-NEXT: movzbl %dl, %ecx +; SSE41-NEXT: pinsrb $13, %ecx, %xmm2 +; SSE41-NEXT: pextrb $14, %xmm1, %edx +; SSE41-NEXT: pextrb $14, %xmm0, %ecx +; SSE41-NEXT: subb %dl, %cl +; SSE41-NEXT: movl $0, %edx +; SSE41-NEXT: jb .LBB15_30 +; SSE41-NEXT: # %bb.29: +; SSE41-NEXT: movl %ecx, %edx +; SSE41-NEXT: .LBB15_30: +; SSE41-NEXT: movzbl %dl, %ecx +; SSE41-NEXT: pinsrb $14, %ecx, %xmm2 +; SSE41-NEXT: pextrb $15, %xmm1, %edx +; SSE41-NEXT: pextrb $15, %xmm0, %ecx +; SSE41-NEXT: subb %dl, %cl +; SSE41-NEXT: jb .LBB15_32 +; SSE41-NEXT: # %bb.31: +; SSE41-NEXT: movl %ecx, %eax +; SSE41-NEXT: .LBB15_32: +; SSE41-NEXT: movzbl %al, %eax +; SSE41-NEXT: pinsrb $15, %eax, %xmm2 +; SSE41-NEXT: psrlw $4, %xmm2 +; SSE41-NEXT: pand {{.*}}(%rip), %xmm2 +; SSE41-NEXT: movdqa %xmm2, %xmm0 +; SSE41-NEXT: retq +; +; AVX-LABEL: v16i4: +; AVX: # %bb.0: +; AVX-NEXT: vpsllw $4, %xmm1, %xmm1 +; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240] +; AVX-NEXT: vpand %xmm2, %xmm1, %xmm1 +; AVX-NEXT: vpextrb $1, %xmm1, %edx +; AVX-NEXT: vpsllw $4, %xmm0, %xmm0 +; AVX-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX-NEXT: vpextrb $1, %xmm0, %ecx +; AVX-NEXT: xorl %eax, %eax +; AVX-NEXT: subb %dl, %cl +; AVX-NEXT: movl $0, %edx +; AVX-NEXT: jb .LBB15_2 +; AVX-NEXT: # %bb.1: +; AVX-NEXT: movl %ecx, %edx +; AVX-NEXT: .LBB15_2: +; AVX-NEXT: movzbl %dl, %ecx +; AVX-NEXT: vpextrb $0, %xmm1, %esi +; AVX-NEXT: vpextrb $0, %xmm0, %edx +; AVX-NEXT: subb %sil, %dl +; AVX-NEXT: movl $0, %esi +; AVX-NEXT: jb .LBB15_4 +; AVX-NEXT: # %bb.3: +; AVX-NEXT: movl %edx, %esi +; AVX-NEXT: .LBB15_4: +; AVX-NEXT: movzbl %sil, %edx +; AVX-NEXT: vmovd %edx, %xmm2 +; AVX-NEXT: vpinsrb $1, %ecx, %xmm2, %xmm2 +; AVX-NEXT: vpextrb $2, %xmm1, %edx +; AVX-NEXT: 
vpextrb $2, %xmm0, %ecx +; AVX-NEXT: subb %dl, %cl +; AVX-NEXT: movl $0, %edx +; AVX-NEXT: jb .LBB15_6 +; AVX-NEXT: # %bb.5: +; AVX-NEXT: movl %ecx, %edx +; AVX-NEXT: .LBB15_6: +; AVX-NEXT: movzbl %dl, %ecx +; AVX-NEXT: vpinsrb $2, %ecx, %xmm2, %xmm2 +; AVX-NEXT: vpextrb $3, %xmm1, %edx +; AVX-NEXT: vpextrb $3, %xmm0, %ecx +; AVX-NEXT: subb %dl, %cl +; AVX-NEXT: movl $0, %edx +; AVX-NEXT: jb .LBB15_8 +; AVX-NEXT: # %bb.7: +; AVX-NEXT: movl %ecx, %edx +; AVX-NEXT: .LBB15_8: +; AVX-NEXT: movzbl %dl, %ecx +; AVX-NEXT: vpinsrb $3, %ecx, %xmm2, %xmm2 +; AVX-NEXT: vpextrb $4, %xmm1, %edx +; AVX-NEXT: vpextrb $4, %xmm0, %ecx +; AVX-NEXT: subb %dl, %cl +; AVX-NEXT: movl $0, %edx +; AVX-NEXT: jb .LBB15_10 +; AVX-NEXT: # %bb.9: +; AVX-NEXT: movl %ecx, %edx +; AVX-NEXT: .LBB15_10: +; AVX-NEXT: movzbl %dl, %ecx +; AVX-NEXT: vpinsrb $4, %ecx, %xmm2, %xmm2 +; AVX-NEXT: vpextrb $5, %xmm1, %edx +; AVX-NEXT: vpextrb $5, %xmm0, %ecx +; AVX-NEXT: subb %dl, %cl +; AVX-NEXT: movl $0, %edx +; AVX-NEXT: jb .LBB15_12 +; AVX-NEXT: # %bb.11: +; AVX-NEXT: movl %ecx, %edx +; AVX-NEXT: .LBB15_12: +; AVX-NEXT: movzbl %dl, %ecx +; AVX-NEXT: vpinsrb $5, %ecx, %xmm2, %xmm2 +; AVX-NEXT: vpextrb $6, %xmm1, %edx +; AVX-NEXT: vpextrb $6, %xmm0, %ecx +; AVX-NEXT: subb %dl, %cl +; AVX-NEXT: movl $0, %edx +; AVX-NEXT: jb .LBB15_14 +; AVX-NEXT: # %bb.13: +; AVX-NEXT: movl %ecx, %edx +; AVX-NEXT: .LBB15_14: +; AVX-NEXT: movzbl %dl, %ecx +; AVX-NEXT: vpinsrb $6, %ecx, %xmm2, %xmm2 +; AVX-NEXT: vpextrb $7, %xmm1, %edx +; AVX-NEXT: vpextrb $7, %xmm0, %ecx +; AVX-NEXT: subb %dl, %cl +; AVX-NEXT: movl $0, %edx +; AVX-NEXT: jb .LBB15_16 +; AVX-NEXT: # %bb.15: +; AVX-NEXT: movl %ecx, %edx +; AVX-NEXT: .LBB15_16: +; AVX-NEXT: movzbl %dl, %ecx +; AVX-NEXT: vpinsrb $7, %ecx, %xmm2, %xmm2 +; AVX-NEXT: vpextrb $8, %xmm1, %edx +; AVX-NEXT: vpextrb $8, %xmm0, %ecx +; AVX-NEXT: subb %dl, %cl +; AVX-NEXT: movl $0, %edx +; AVX-NEXT: jb .LBB15_18 +; AVX-NEXT: # %bb.17: +; AVX-NEXT: movl %ecx, %edx +; AVX-NEXT: .LBB15_18: +; AVX-NEXT: movzbl %dl, %ecx +; AVX-NEXT: vpinsrb $8, %ecx, %xmm2, %xmm2 +; AVX-NEXT: vpextrb $9, %xmm1, %edx +; AVX-NEXT: vpextrb $9, %xmm0, %ecx +; AVX-NEXT: subb %dl, %cl +; AVX-NEXT: movl $0, %edx +; AVX-NEXT: jb .LBB15_20 +; AVX-NEXT: # %bb.19: +; AVX-NEXT: movl %ecx, %edx +; AVX-NEXT: .LBB15_20: +; AVX-NEXT: movzbl %dl, %ecx +; AVX-NEXT: vpinsrb $9, %ecx, %xmm2, %xmm2 +; AVX-NEXT: vpextrb $10, %xmm1, %edx +; AVX-NEXT: vpextrb $10, %xmm0, %ecx +; AVX-NEXT: subb %dl, %cl +; AVX-NEXT: movl $0, %edx +; AVX-NEXT: jb .LBB15_22 +; AVX-NEXT: # %bb.21: +; AVX-NEXT: movl %ecx, %edx +; AVX-NEXT: .LBB15_22: +; AVX-NEXT: movzbl %dl, %ecx +; AVX-NEXT: vpinsrb $10, %ecx, %xmm2, %xmm2 +; AVX-NEXT: vpextrb $11, %xmm1, %edx +; AVX-NEXT: vpextrb $11, %xmm0, %ecx +; AVX-NEXT: subb %dl, %cl +; AVX-NEXT: movl $0, %edx +; AVX-NEXT: jb .LBB15_24 +; AVX-NEXT: # %bb.23: +; AVX-NEXT: movl %ecx, %edx +; AVX-NEXT: .LBB15_24: +; AVX-NEXT: movzbl %dl, %ecx +; AVX-NEXT: vpinsrb $11, %ecx, %xmm2, %xmm2 +; AVX-NEXT: vpextrb $12, %xmm1, %edx +; AVX-NEXT: vpextrb $12, %xmm0, %ecx +; AVX-NEXT: subb %dl, %cl +; AVX-NEXT: movl $0, %edx +; AVX-NEXT: jb .LBB15_26 +; AVX-NEXT: # %bb.25: +; AVX-NEXT: movl %ecx, %edx +; AVX-NEXT: .LBB15_26: +; AVX-NEXT: movzbl %dl, %ecx +; AVX-NEXT: vpinsrb $12, %ecx, %xmm2, %xmm2 +; AVX-NEXT: vpextrb $13, %xmm1, %edx +; AVX-NEXT: vpextrb $13, %xmm0, %ecx +; AVX-NEXT: subb %dl, %cl +; AVX-NEXT: movl $0, %edx +; AVX-NEXT: jb .LBB15_28 +; AVX-NEXT: # %bb.27: +; AVX-NEXT: movl %ecx, %edx +; AVX-NEXT: .LBB15_28: +; AVX-NEXT: movzbl %dl, 
%ecx +; AVX-NEXT: vpinsrb $13, %ecx, %xmm2, %xmm2 +; AVX-NEXT: vpextrb $14, %xmm1, %edx +; AVX-NEXT: vpextrb $14, %xmm0, %ecx +; AVX-NEXT: subb %dl, %cl +; AVX-NEXT: movl $0, %edx +; AVX-NEXT: jb .LBB15_30 +; AVX-NEXT: # %bb.29: +; AVX-NEXT: movl %ecx, %edx +; AVX-NEXT: .LBB15_30: +; AVX-NEXT: movzbl %dl, %ecx +; AVX-NEXT: vpinsrb $14, %ecx, %xmm2, %xmm2 +; AVX-NEXT: vpextrb $15, %xmm1, %edx +; AVX-NEXT: vpextrb $15, %xmm0, %ecx +; AVX-NEXT: subb %dl, %cl +; AVX-NEXT: jb .LBB15_32 +; AVX-NEXT: # %bb.31: +; AVX-NEXT: movl %ecx, %eax +; AVX-NEXT: .LBB15_32: +; AVX-NEXT: movzbl %al, %eax +; AVX-NEXT: vpinsrb $15, %eax, %xmm2, %xmm0 +; AVX-NEXT: vpsrlw $4, %xmm0, %xmm0 +; AVX-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 +; AVX-NEXT: retq + %z = call <16 x i4> @llvm.usub.sat.v16i4(<16 x i4> %x, <16 x i4> %y) + ret <16 x i4> %z +} + +define <16 x i1> @v16i1(<16 x i1> %x, <16 x i1> %y) nounwind { +; SSE2-LABEL: v16i1: +; SSE2: # %bb.0: +; SSE2-NEXT: pushq %rbp +; SSE2-NEXT: pushq %r15 +; SSE2-NEXT: pushq %r14 +; SSE2-NEXT: pushq %r13 +; SSE2-NEXT: pushq %r12 +; SSE2-NEXT: pushq %rbx +; SSE2-NEXT: psllw $7, %xmm0 +; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128] +; SSE2-NEXT: pand %xmm2, %xmm0 +; SSE2-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp) +; SSE2-NEXT: psllw $7, %xmm1 +; SSE2-NEXT: pand %xmm2, %xmm1 +; SSE2-NEXT: movdqa %xmm1, -{{[0-9]+}}(%rsp) +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSE2-NEXT: subb -{{[0-9]+}}(%rsp), %cl +; SSE2-NEXT: movl $0, %eax +; SSE2-NEXT: jb .LBB16_2 +; SSE2-NEXT: # %bb.1: +; SSE2-NEXT: movl %ecx, %eax +; SSE2-NEXT: .LBB16_2: +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %dl +; SSE2-NEXT: subb -{{[0-9]+}}(%rsp), %dl +; SSE2-NEXT: movl $0, %ecx +; SSE2-NEXT: jb .LBB16_4 +; SSE2-NEXT: # %bb.3: +; SSE2-NEXT: movl %edx, %ecx +; SSE2-NEXT: .LBB16_4: +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %bl +; SSE2-NEXT: subb -{{[0-9]+}}(%rsp), %bl +; SSE2-NEXT: movl $0, %edx +; SSE2-NEXT: jb .LBB16_6 +; SSE2-NEXT: # %bb.5: +; SSE2-NEXT: movl %ebx, %edx +; SSE2-NEXT: .LBB16_6: +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %bl +; SSE2-NEXT: subb -{{[0-9]+}}(%rsp), %bl +; SSE2-NEXT: movl $0, %esi +; SSE2-NEXT: jb .LBB16_8 +; SSE2-NEXT: # %bb.7: +; SSE2-NEXT: movl %ebx, %esi +; SSE2-NEXT: .LBB16_8: +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %bl +; SSE2-NEXT: subb -{{[0-9]+}}(%rsp), %bl +; SSE2-NEXT: movl $0, %edi +; SSE2-NEXT: jb .LBB16_10 +; SSE2-NEXT: # %bb.9: +; SSE2-NEXT: movl %ebx, %edi +; SSE2-NEXT: .LBB16_10: +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %bl +; SSE2-NEXT: subb -{{[0-9]+}}(%rsp), %bl +; SSE2-NEXT: movl $0, %r12d +; SSE2-NEXT: jb .LBB16_12 +; SSE2-NEXT: # %bb.11: +; SSE2-NEXT: movl %ebx, %r12d +; SSE2-NEXT: .LBB16_12: +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %bl +; SSE2-NEXT: subb -{{[0-9]+}}(%rsp), %bl +; SSE2-NEXT: movl $0, %r8d +; SSE2-NEXT: jb .LBB16_14 +; SSE2-NEXT: # %bb.13: +; SSE2-NEXT: movl %ebx, %r8d +; SSE2-NEXT: .LBB16_14: +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %bl +; SSE2-NEXT: subb -{{[0-9]+}}(%rsp), %bl +; SSE2-NEXT: movl $0, %r10d +; SSE2-NEXT: jb .LBB16_16 +; SSE2-NEXT: # %bb.15: +; SSE2-NEXT: movl %ebx, %r10d +; SSE2-NEXT: .LBB16_16: +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %bl +; SSE2-NEXT: subb -{{[0-9]+}}(%rsp), %bl +; SSE2-NEXT: movl $0, %r13d +; SSE2-NEXT: jb .LBB16_18 +; SSE2-NEXT: # %bb.17: +; SSE2-NEXT: movl %ebx, %r13d +; SSE2-NEXT: .LBB16_18: +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %bl +; SSE2-NEXT: subb -{{[0-9]+}}(%rsp), %bl +; SSE2-NEXT: movl $0, %r9d +; SSE2-NEXT: jb .LBB16_20 +; SSE2-NEXT: # %bb.19: +; SSE2-NEXT: movl %ebx, %r9d 
+; SSE2-NEXT: .LBB16_20: +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %bl +; SSE2-NEXT: subb -{{[0-9]+}}(%rsp), %bl +; SSE2-NEXT: movl $0, %r11d +; SSE2-NEXT: jb .LBB16_22 +; SSE2-NEXT: # %bb.21: +; SSE2-NEXT: movl %ebx, %r11d +; SSE2-NEXT: .LBB16_22: +; SSE2-NEXT: movzbl %al, %r14d +; SSE2-NEXT: movzbl %cl, %r15d +; SSE2-NEXT: movzbl %dl, %edx +; SSE2-NEXT: movzbl %sil, %esi +; SSE2-NEXT: movzbl %dil, %ebx +; SSE2-NEXT: movzbl %r12b, %ebp +; SSE2-NEXT: movzbl %r8b, %edi +; SSE2-NEXT: movzbl %r10b, %r8d +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSE2-NEXT: subb -{{[0-9]+}}(%rsp), %cl +; SSE2-NEXT: movl $0, %eax +; SSE2-NEXT: jb .LBB16_24 +; SSE2-NEXT: # %bb.23: +; SSE2-NEXT: movl %ecx, %eax +; SSE2-NEXT: .LBB16_24: +; SSE2-NEXT: movd %r14d, %xmm2 +; SSE2-NEXT: movd %r15d, %xmm3 +; SSE2-NEXT: movd %edx, %xmm5 +; SSE2-NEXT: movd %esi, %xmm0 +; SSE2-NEXT: movd %ebx, %xmm6 +; SSE2-NEXT: movd %ebp, %xmm4 +; SSE2-NEXT: movd %edi, %xmm7 +; SSE2-NEXT: movd %r8d, %xmm1 +; SSE2-NEXT: movzbl %r13b, %ebp +; SSE2-NEXT: movzbl %r9b, %ecx +; SSE2-NEXT: movzbl %r11b, %edx +; SSE2-NEXT: movzbl %al, %esi +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSE2-NEXT: subb -{{[0-9]+}}(%rsp), %al +; SSE2-NEXT: movl $0, %edi +; SSE2-NEXT: jb .LBB16_26 +; SSE2-NEXT: # %bb.25: +; SSE2-NEXT: movl %eax, %edi +; SSE2-NEXT: .LBB16_26: +; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3],xmm0[4],xmm5[4],xmm0[5],xmm5[5],xmm0[6],xmm5[6],xmm0[7],xmm5[7] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1],xmm4[2],xmm6[2],xmm4[3],xmm6[3],xmm4[4],xmm6[4],xmm4[5],xmm6[5],xmm4[6],xmm6[6],xmm4[7],xmm6[7] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm7[0],xmm1[1],xmm7[1],xmm1[2],xmm7[2],xmm1[3],xmm7[3],xmm1[4],xmm7[4],xmm1[5],xmm7[5],xmm1[6],xmm7[6],xmm1[7],xmm7[7] +; SSE2-NEXT: movd %ebp, %xmm6 +; SSE2-NEXT: movd %ecx, %xmm5 +; SSE2-NEXT: movd %edx, %xmm7 +; SSE2-NEXT: movd %esi, %xmm2 +; SSE2-NEXT: movzbl %dil, %eax +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %dl +; SSE2-NEXT: subb -{{[0-9]+}}(%rsp), %dl +; SSE2-NEXT: movl $0, %ecx +; SSE2-NEXT: jb .LBB16_28 +; SSE2-NEXT: # %bb.27: +; SSE2-NEXT: movl %edx, %ecx +; SSE2-NEXT: .LBB16_28: +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3],xmm5[4],xmm6[4],xmm5[5],xmm6[5],xmm5[6],xmm6[6],xmm5[7],xmm6[7] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm7[0],xmm2[1],xmm7[1],xmm2[2],xmm7[2],xmm2[3],xmm7[3],xmm2[4],xmm7[4],xmm2[5],xmm7[5],xmm2[6],xmm7[6],xmm2[7],xmm7[7] +; SSE2-NEXT: movd %eax, %xmm4 +; SSE2-NEXT: movzbl %cl, %eax +; SSE2-NEXT: movd %eax, %xmm3 +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %dl +; SSE2-NEXT: subb -{{[0-9]+}}(%rsp), %dl +; SSE2-NEXT: movl $0, %ecx +; SSE2-NEXT: jb .LBB16_30 +; SSE2-NEXT: # %bb.29: +; SSE2-NEXT: movl %edx, %ecx +; SSE2-NEXT: .LBB16_30: +; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = 
xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3],xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] +; SSE2-NEXT: movzbl %cl, %ecx +; SSE2-NEXT: movd %ecx, %xmm4 +; SSE2-NEXT: subb -{{[0-9]+}}(%rsp), %al +; SSE2-NEXT: movl $0, %ecx +; SSE2-NEXT: jb .LBB16_32 +; SSE2-NEXT: # %bb.31: +; SSE2-NEXT: movl %eax, %ecx +; SSE2-NEXT: .LBB16_32: +; SSE2-NEXT: movzbl %cl, %eax +; SSE2-NEXT: movd %eax, %xmm0 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSE2-NEXT: psrlw $7, %xmm0 +; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 +; SSE2-NEXT: popq %rbx +; SSE2-NEXT: popq %r12 +; SSE2-NEXT: popq %r13 +; SSE2-NEXT: popq %r14 +; SSE2-NEXT: popq %r15 +; SSE2-NEXT: popq %rbp +; SSE2-NEXT: retq +; +; SSSE3-LABEL: v16i1: +; SSSE3: # %bb.0: +; SSSE3-NEXT: pushq %rbp +; SSSE3-NEXT: pushq %r15 +; SSSE3-NEXT: pushq %r14 +; SSSE3-NEXT: pushq %r13 +; SSSE3-NEXT: pushq %r12 +; SSSE3-NEXT: pushq %rbx +; SSSE3-NEXT: psllw $7, %xmm0 +; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128] +; SSSE3-NEXT: pand %xmm2, %xmm0 +; SSSE3-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp) +; SSSE3-NEXT: psllw $7, %xmm1 +; SSSE3-NEXT: pand %xmm2, %xmm1 +; SSSE3-NEXT: movdqa %xmm1, -{{[0-9]+}}(%rsp) +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSSE3-NEXT: subb -{{[0-9]+}}(%rsp), %cl +; SSSE3-NEXT: movl $0, %eax +; SSSE3-NEXT: jb .LBB16_2 +; SSSE3-NEXT: # %bb.1: +; SSSE3-NEXT: movl %ecx, %eax +; SSSE3-NEXT: .LBB16_2: +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %dl +; SSSE3-NEXT: subb -{{[0-9]+}}(%rsp), %dl +; SSSE3-NEXT: movl $0, %ecx +; SSSE3-NEXT: jb .LBB16_4 +; SSSE3-NEXT: # %bb.3: +; SSSE3-NEXT: movl %edx, %ecx +; SSSE3-NEXT: .LBB16_4: +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %bl +; SSSE3-NEXT: subb -{{[0-9]+}}(%rsp), %bl +; SSSE3-NEXT: movl $0, %edx +; SSSE3-NEXT: jb .LBB16_6 +; SSSE3-NEXT: # %bb.5: +; SSSE3-NEXT: movl %ebx, %edx +; SSSE3-NEXT: .LBB16_6: +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %bl +; SSSE3-NEXT: subb -{{[0-9]+}}(%rsp), %bl +; SSSE3-NEXT: movl $0, %esi +; SSSE3-NEXT: jb .LBB16_8 +; SSSE3-NEXT: # %bb.7: +; SSSE3-NEXT: movl %ebx, %esi +; SSSE3-NEXT: .LBB16_8: +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %bl +; SSSE3-NEXT: subb -{{[0-9]+}}(%rsp), %bl +; SSSE3-NEXT: movl $0, %edi +; SSSE3-NEXT: jb .LBB16_10 +; SSSE3-NEXT: # %bb.9: +; SSSE3-NEXT: movl %ebx, %edi +; SSSE3-NEXT: .LBB16_10: +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %bl +; SSSE3-NEXT: subb -{{[0-9]+}}(%rsp), %bl +; SSSE3-NEXT: movl $0, %r12d +; SSSE3-NEXT: jb .LBB16_12 +; SSSE3-NEXT: # %bb.11: +; SSSE3-NEXT: movl %ebx, %r12d +; SSSE3-NEXT: .LBB16_12: +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %bl +; SSSE3-NEXT: subb -{{[0-9]+}}(%rsp), %bl +; SSSE3-NEXT: movl $0, %r8d +; SSSE3-NEXT: jb .LBB16_14 +; SSSE3-NEXT: # %bb.13: +; SSSE3-NEXT: movl %ebx, %r8d +; SSSE3-NEXT: .LBB16_14: +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %bl +; SSSE3-NEXT: subb -{{[0-9]+}}(%rsp), %bl +; SSSE3-NEXT: movl $0, %r10d +; SSSE3-NEXT: jb .LBB16_16 +; SSSE3-NEXT: # %bb.15: +; SSSE3-NEXT: movl %ebx, %r10d +; SSSE3-NEXT: .LBB16_16: +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %bl +; SSSE3-NEXT: subb -{{[0-9]+}}(%rsp), %bl +; SSSE3-NEXT: movl $0, %r13d +; SSSE3-NEXT: jb .LBB16_18 +; SSSE3-NEXT: # %bb.17: +; 
SSSE3-NEXT: movl %ebx, %r13d +; SSSE3-NEXT: .LBB16_18: +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %bl +; SSSE3-NEXT: subb -{{[0-9]+}}(%rsp), %bl +; SSSE3-NEXT: movl $0, %r9d +; SSSE3-NEXT: jb .LBB16_20 +; SSSE3-NEXT: # %bb.19: +; SSSE3-NEXT: movl %ebx, %r9d +; SSSE3-NEXT: .LBB16_20: +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %bl +; SSSE3-NEXT: subb -{{[0-9]+}}(%rsp), %bl +; SSSE3-NEXT: movl $0, %r11d +; SSSE3-NEXT: jb .LBB16_22 +; SSSE3-NEXT: # %bb.21: +; SSSE3-NEXT: movl %ebx, %r11d +; SSSE3-NEXT: .LBB16_22: +; SSSE3-NEXT: movzbl %al, %r14d +; SSSE3-NEXT: movzbl %cl, %r15d +; SSSE3-NEXT: movzbl %dl, %edx +; SSSE3-NEXT: movzbl %sil, %esi +; SSSE3-NEXT: movzbl %dil, %ebx +; SSSE3-NEXT: movzbl %r12b, %ebp +; SSSE3-NEXT: movzbl %r8b, %edi +; SSSE3-NEXT: movzbl %r10b, %r8d +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSSE3-NEXT: subb -{{[0-9]+}}(%rsp), %cl +; SSSE3-NEXT: movl $0, %eax +; SSSE3-NEXT: jb .LBB16_24 +; SSSE3-NEXT: # %bb.23: +; SSSE3-NEXT: movl %ecx, %eax +; SSSE3-NEXT: .LBB16_24: +; SSSE3-NEXT: movd %r14d, %xmm2 +; SSSE3-NEXT: movd %r15d, %xmm3 +; SSSE3-NEXT: movd %edx, %xmm5 +; SSSE3-NEXT: movd %esi, %xmm0 +; SSSE3-NEXT: movd %ebx, %xmm6 +; SSSE3-NEXT: movd %ebp, %xmm4 +; SSSE3-NEXT: movd %edi, %xmm7 +; SSSE3-NEXT: movd %r8d, %xmm1 +; SSSE3-NEXT: movzbl %r13b, %ebp +; SSSE3-NEXT: movzbl %r9b, %ecx +; SSSE3-NEXT: movzbl %r11b, %edx +; SSSE3-NEXT: movzbl %al, %esi +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSSE3-NEXT: subb -{{[0-9]+}}(%rsp), %al +; SSSE3-NEXT: movl $0, %edi +; SSSE3-NEXT: jb .LBB16_26 +; SSSE3-NEXT: # %bb.25: +; SSSE3-NEXT: movl %eax, %edi +; SSSE3-NEXT: .LBB16_26: +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3],xmm0[4],xmm5[4],xmm0[5],xmm5[5],xmm0[6],xmm5[6],xmm0[7],xmm5[7] +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1],xmm4[2],xmm6[2],xmm4[3],xmm6[3],xmm4[4],xmm6[4],xmm4[5],xmm6[5],xmm4[6],xmm6[6],xmm4[7],xmm6[7] +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm7[0],xmm1[1],xmm7[1],xmm1[2],xmm7[2],xmm1[3],xmm7[3],xmm1[4],xmm7[4],xmm1[5],xmm7[5],xmm1[6],xmm7[6],xmm1[7],xmm7[7] +; SSSE3-NEXT: movd %ebp, %xmm6 +; SSSE3-NEXT: movd %ecx, %xmm5 +; SSSE3-NEXT: movd %edx, %xmm7 +; SSSE3-NEXT: movd %esi, %xmm2 +; SSSE3-NEXT: movzbl %dil, %eax +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %dl +; SSSE3-NEXT: subb -{{[0-9]+}}(%rsp), %dl +; SSSE3-NEXT: movl $0, %ecx +; SSSE3-NEXT: jb .LBB16_28 +; SSSE3-NEXT: # %bb.27: +; SSSE3-NEXT: movl %edx, %ecx +; SSSE3-NEXT: .LBB16_28: +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3] +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3],xmm5[4],xmm6[4],xmm5[5],xmm6[5],xmm5[6],xmm6[6],xmm5[7],xmm6[7] +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm7[0],xmm2[1],xmm7[1],xmm2[2],xmm7[2],xmm2[3],xmm7[3],xmm2[4],xmm7[4],xmm2[5],xmm7[5],xmm2[6],xmm7[6],xmm2[7],xmm7[7] +; SSSE3-NEXT: movd %eax, %xmm4 +; SSSE3-NEXT: movzbl %cl, %eax +; SSSE3-NEXT: movd %eax, %xmm3 +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %dl +; SSSE3-NEXT: subb -{{[0-9]+}}(%rsp), %dl +; SSSE3-NEXT: movl $0, %ecx +; SSSE3-NEXT: jb .LBB16_30 +; SSSE3-NEXT: # %bb.29: +; SSSE3-NEXT: movl 
%edx, %ecx +; SSSE3-NEXT: .LBB16_30: +; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3] +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3],xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] +; SSSE3-NEXT: movzbl %cl, %ecx +; SSSE3-NEXT: movd %ecx, %xmm4 +; SSSE3-NEXT: subb -{{[0-9]+}}(%rsp), %al +; SSSE3-NEXT: movl $0, %ecx +; SSSE3-NEXT: jb .LBB16_32 +; SSSE3-NEXT: # %bb.31: +; SSSE3-NEXT: movl %eax, %ecx +; SSSE3-NEXT: .LBB16_32: +; SSSE3-NEXT: movzbl %cl, %eax +; SSSE3-NEXT: movd %eax, %xmm0 +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7] +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] +; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSSE3-NEXT: psrlw $7, %xmm0 +; SSSE3-NEXT: pand {{.*}}(%rip), %xmm0 +; SSSE3-NEXT: popq %rbx +; SSSE3-NEXT: popq %r12 +; SSSE3-NEXT: popq %r13 +; SSSE3-NEXT: popq %r14 +; SSSE3-NEXT: popq %r15 +; SSSE3-NEXT: popq %rbp +; SSSE3-NEXT: retq +; +; SSE41-LABEL: v16i1: +; SSE41: # %bb.0: +; SSE41-NEXT: psllw $7, %xmm1 +; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128] +; SSE41-NEXT: pand %xmm2, %xmm1 +; SSE41-NEXT: pextrb $1, %xmm1, %edx +; SSE41-NEXT: psllw $7, %xmm0 +; SSE41-NEXT: pand %xmm2, %xmm0 +; SSE41-NEXT: pextrb $1, %xmm0, %ecx +; SSE41-NEXT: xorl %eax, %eax +; SSE41-NEXT: subb %dl, %cl +; SSE41-NEXT: movl $0, %edx +; SSE41-NEXT: jb .LBB16_2 +; SSE41-NEXT: # %bb.1: +; SSE41-NEXT: movl %ecx, %edx +; SSE41-NEXT: .LBB16_2: +; SSE41-NEXT: movzbl %dl, %ecx +; SSE41-NEXT: pextrb $0, %xmm1, %esi +; SSE41-NEXT: pextrb $0, %xmm0, %edx +; SSE41-NEXT: subb %sil, %dl +; SSE41-NEXT: movl $0, %esi +; SSE41-NEXT: jb .LBB16_4 +; SSE41-NEXT: # %bb.3: +; SSE41-NEXT: movl %edx, %esi +; SSE41-NEXT: .LBB16_4: +; SSE41-NEXT: movzbl %sil, %edx +; SSE41-NEXT: movd %edx, %xmm2 +; SSE41-NEXT: pinsrb $1, %ecx, %xmm2 +; SSE41-NEXT: pextrb $2, %xmm1, %edx +; SSE41-NEXT: pextrb $2, %xmm0, %ecx +; SSE41-NEXT: subb %dl, %cl +; SSE41-NEXT: movl $0, %edx +; SSE41-NEXT: jb .LBB16_6 +; SSE41-NEXT: # %bb.5: +; SSE41-NEXT: movl %ecx, %edx +; SSE41-NEXT: .LBB16_6: +; SSE41-NEXT: movzbl %dl, %ecx +; SSE41-NEXT: pinsrb $2, %ecx, %xmm2 +; SSE41-NEXT: pextrb $3, %xmm1, %edx +; SSE41-NEXT: pextrb $3, %xmm0, %ecx +; SSE41-NEXT: subb %dl, %cl +; SSE41-NEXT: movl $0, %edx +; SSE41-NEXT: jb .LBB16_8 +; SSE41-NEXT: # %bb.7: +; SSE41-NEXT: movl %ecx, %edx +; SSE41-NEXT: .LBB16_8: +; SSE41-NEXT: movzbl %dl, %ecx +; SSE41-NEXT: pinsrb $3, %ecx, %xmm2 +; SSE41-NEXT: pextrb $4, %xmm1, %edx +; SSE41-NEXT: pextrb $4, %xmm0, %ecx +; SSE41-NEXT: subb %dl, %cl +; SSE41-NEXT: movl $0, %edx +; SSE41-NEXT: jb .LBB16_10 +; SSE41-NEXT: # %bb.9: +; SSE41-NEXT: movl %ecx, %edx +; SSE41-NEXT: .LBB16_10: +; SSE41-NEXT: movzbl %dl, %ecx +; SSE41-NEXT: pinsrb $4, %ecx, %xmm2 +; SSE41-NEXT: pextrb $5, %xmm1, %edx +; SSE41-NEXT: pextrb $5, %xmm0, %ecx +; SSE41-NEXT: subb %dl, %cl +; SSE41-NEXT: movl $0, %edx +; SSE41-NEXT: jb .LBB16_12 +; SSE41-NEXT: # %bb.11: +; SSE41-NEXT: movl %ecx, %edx +; SSE41-NEXT: .LBB16_12: +; SSE41-NEXT: movzbl %dl, %ecx +; SSE41-NEXT: pinsrb $5, %ecx, %xmm2 +; SSE41-NEXT: pextrb $6, %xmm1, %edx 
+; SSE41-NEXT: pextrb $6, %xmm0, %ecx +; SSE41-NEXT: subb %dl, %cl +; SSE41-NEXT: movl $0, %edx +; SSE41-NEXT: jb .LBB16_14 +; SSE41-NEXT: # %bb.13: +; SSE41-NEXT: movl %ecx, %edx +; SSE41-NEXT: .LBB16_14: +; SSE41-NEXT: movzbl %dl, %ecx +; SSE41-NEXT: pinsrb $6, %ecx, %xmm2 +; SSE41-NEXT: pextrb $7, %xmm1, %edx +; SSE41-NEXT: pextrb $7, %xmm0, %ecx +; SSE41-NEXT: subb %dl, %cl +; SSE41-NEXT: movl $0, %edx +; SSE41-NEXT: jb .LBB16_16 +; SSE41-NEXT: # %bb.15: +; SSE41-NEXT: movl %ecx, %edx +; SSE41-NEXT: .LBB16_16: +; SSE41-NEXT: movzbl %dl, %ecx +; SSE41-NEXT: pinsrb $7, %ecx, %xmm2 +; SSE41-NEXT: pextrb $8, %xmm1, %edx +; SSE41-NEXT: pextrb $8, %xmm0, %ecx +; SSE41-NEXT: subb %dl, %cl +; SSE41-NEXT: movl $0, %edx +; SSE41-NEXT: jb .LBB16_18 +; SSE41-NEXT: # %bb.17: +; SSE41-NEXT: movl %ecx, %edx +; SSE41-NEXT: .LBB16_18: +; SSE41-NEXT: movzbl %dl, %ecx +; SSE41-NEXT: pinsrb $8, %ecx, %xmm2 +; SSE41-NEXT: pextrb $9, %xmm1, %edx +; SSE41-NEXT: pextrb $9, %xmm0, %ecx +; SSE41-NEXT: subb %dl, %cl +; SSE41-NEXT: movl $0, %edx +; SSE41-NEXT: jb .LBB16_20 +; SSE41-NEXT: # %bb.19: +; SSE41-NEXT: movl %ecx, %edx +; SSE41-NEXT: .LBB16_20: +; SSE41-NEXT: movzbl %dl, %ecx +; SSE41-NEXT: pinsrb $9, %ecx, %xmm2 +; SSE41-NEXT: pextrb $10, %xmm1, %edx +; SSE41-NEXT: pextrb $10, %xmm0, %ecx +; SSE41-NEXT: subb %dl, %cl +; SSE41-NEXT: movl $0, %edx +; SSE41-NEXT: jb .LBB16_22 +; SSE41-NEXT: # %bb.21: +; SSE41-NEXT: movl %ecx, %edx +; SSE41-NEXT: .LBB16_22: +; SSE41-NEXT: movzbl %dl, %ecx +; SSE41-NEXT: pinsrb $10, %ecx, %xmm2 +; SSE41-NEXT: pextrb $11, %xmm1, %edx +; SSE41-NEXT: pextrb $11, %xmm0, %ecx +; SSE41-NEXT: subb %dl, %cl +; SSE41-NEXT: movl $0, %edx +; SSE41-NEXT: jb .LBB16_24 +; SSE41-NEXT: # %bb.23: +; SSE41-NEXT: movl %ecx, %edx +; SSE41-NEXT: .LBB16_24: +; SSE41-NEXT: movzbl %dl, %ecx +; SSE41-NEXT: pinsrb $11, %ecx, %xmm2 +; SSE41-NEXT: pextrb $12, %xmm1, %edx +; SSE41-NEXT: pextrb $12, %xmm0, %ecx +; SSE41-NEXT: subb %dl, %cl +; SSE41-NEXT: movl $0, %edx +; SSE41-NEXT: jb .LBB16_26 +; SSE41-NEXT: # %bb.25: +; SSE41-NEXT: movl %ecx, %edx +; SSE41-NEXT: .LBB16_26: +; SSE41-NEXT: movzbl %dl, %ecx +; SSE41-NEXT: pinsrb $12, %ecx, %xmm2 +; SSE41-NEXT: pextrb $13, %xmm1, %edx +; SSE41-NEXT: pextrb $13, %xmm0, %ecx +; SSE41-NEXT: subb %dl, %cl +; SSE41-NEXT: movl $0, %edx +; SSE41-NEXT: jb .LBB16_28 +; SSE41-NEXT: # %bb.27: +; SSE41-NEXT: movl %ecx, %edx +; SSE41-NEXT: .LBB16_28: +; SSE41-NEXT: movzbl %dl, %ecx +; SSE41-NEXT: pinsrb $13, %ecx, %xmm2 +; SSE41-NEXT: pextrb $14, %xmm1, %edx +; SSE41-NEXT: pextrb $14, %xmm0, %ecx +; SSE41-NEXT: subb %dl, %cl +; SSE41-NEXT: movl $0, %edx +; SSE41-NEXT: jb .LBB16_30 +; SSE41-NEXT: # %bb.29: +; SSE41-NEXT: movl %ecx, %edx +; SSE41-NEXT: .LBB16_30: +; SSE41-NEXT: movzbl %dl, %ecx +; SSE41-NEXT: pinsrb $14, %ecx, %xmm2 +; SSE41-NEXT: pextrb $15, %xmm1, %edx +; SSE41-NEXT: pextrb $15, %xmm0, %ecx +; SSE41-NEXT: subb %dl, %cl +; SSE41-NEXT: jb .LBB16_32 +; SSE41-NEXT: # %bb.31: +; SSE41-NEXT: movl %ecx, %eax +; SSE41-NEXT: .LBB16_32: +; SSE41-NEXT: movzbl %al, %eax +; SSE41-NEXT: pinsrb $15, %eax, %xmm2 +; SSE41-NEXT: psrlw $7, %xmm2 +; SSE41-NEXT: pand {{.*}}(%rip), %xmm2 +; SSE41-NEXT: movdqa %xmm2, %xmm0 +; SSE41-NEXT: retq +; +; AVX1-LABEL: v16i1: +; AVX1: # %bb.0: +; AVX1-NEXT: vpsllw $7, %xmm1, %xmm1 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128] +; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpextrb $1, %xmm1, %edx +; AVX1-NEXT: vpsllw $7, %xmm0, %xmm0 +; AVX1-NEXT: vpand %xmm2, %xmm0, 
%xmm0 +; AVX1-NEXT: vpextrb $1, %xmm0, %ecx +; AVX1-NEXT: xorl %eax, %eax +; AVX1-NEXT: subb %dl, %cl +; AVX1-NEXT: movl $0, %edx +; AVX1-NEXT: jb .LBB16_2 +; AVX1-NEXT: # %bb.1: +; AVX1-NEXT: movl %ecx, %edx +; AVX1-NEXT: .LBB16_2: +; AVX1-NEXT: movzbl %dl, %ecx +; AVX1-NEXT: vpextrb $0, %xmm1, %esi +; AVX1-NEXT: vpextrb $0, %xmm0, %edx +; AVX1-NEXT: subb %sil, %dl +; AVX1-NEXT: movl $0, %esi +; AVX1-NEXT: jb .LBB16_4 +; AVX1-NEXT: # %bb.3: +; AVX1-NEXT: movl %edx, %esi +; AVX1-NEXT: .LBB16_4: +; AVX1-NEXT: movzbl %sil, %edx +; AVX1-NEXT: vmovd %edx, %xmm2 +; AVX1-NEXT: vpinsrb $1, %ecx, %xmm2, %xmm2 +; AVX1-NEXT: vpextrb $2, %xmm1, %edx +; AVX1-NEXT: vpextrb $2, %xmm0, %ecx +; AVX1-NEXT: subb %dl, %cl +; AVX1-NEXT: movl $0, %edx +; AVX1-NEXT: jb .LBB16_6 +; AVX1-NEXT: # %bb.5: +; AVX1-NEXT: movl %ecx, %edx +; AVX1-NEXT: .LBB16_6: +; AVX1-NEXT: movzbl %dl, %ecx +; AVX1-NEXT: vpinsrb $2, %ecx, %xmm2, %xmm2 +; AVX1-NEXT: vpextrb $3, %xmm1, %edx +; AVX1-NEXT: vpextrb $3, %xmm0, %ecx +; AVX1-NEXT: subb %dl, %cl +; AVX1-NEXT: movl $0, %edx +; AVX1-NEXT: jb .LBB16_8 +; AVX1-NEXT: # %bb.7: +; AVX1-NEXT: movl %ecx, %edx +; AVX1-NEXT: .LBB16_8: +; AVX1-NEXT: movzbl %dl, %ecx +; AVX1-NEXT: vpinsrb $3, %ecx, %xmm2, %xmm2 +; AVX1-NEXT: vpextrb $4, %xmm1, %edx +; AVX1-NEXT: vpextrb $4, %xmm0, %ecx +; AVX1-NEXT: subb %dl, %cl +; AVX1-NEXT: movl $0, %edx +; AVX1-NEXT: jb .LBB16_10 +; AVX1-NEXT: # %bb.9: +; AVX1-NEXT: movl %ecx, %edx +; AVX1-NEXT: .LBB16_10: +; AVX1-NEXT: movzbl %dl, %ecx +; AVX1-NEXT: vpinsrb $4, %ecx, %xmm2, %xmm2 +; AVX1-NEXT: vpextrb $5, %xmm1, %edx +; AVX1-NEXT: vpextrb $5, %xmm0, %ecx +; AVX1-NEXT: subb %dl, %cl +; AVX1-NEXT: movl $0, %edx +; AVX1-NEXT: jb .LBB16_12 +; AVX1-NEXT: # %bb.11: +; AVX1-NEXT: movl %ecx, %edx +; AVX1-NEXT: .LBB16_12: +; AVX1-NEXT: movzbl %dl, %ecx +; AVX1-NEXT: vpinsrb $5, %ecx, %xmm2, %xmm2 +; AVX1-NEXT: vpextrb $6, %xmm1, %edx +; AVX1-NEXT: vpextrb $6, %xmm0, %ecx +; AVX1-NEXT: subb %dl, %cl +; AVX1-NEXT: movl $0, %edx +; AVX1-NEXT: jb .LBB16_14 +; AVX1-NEXT: # %bb.13: +; AVX1-NEXT: movl %ecx, %edx +; AVX1-NEXT: .LBB16_14: +; AVX1-NEXT: movzbl %dl, %ecx +; AVX1-NEXT: vpinsrb $6, %ecx, %xmm2, %xmm2 +; AVX1-NEXT: vpextrb $7, %xmm1, %edx +; AVX1-NEXT: vpextrb $7, %xmm0, %ecx +; AVX1-NEXT: subb %dl, %cl +; AVX1-NEXT: movl $0, %edx +; AVX1-NEXT: jb .LBB16_16 +; AVX1-NEXT: # %bb.15: +; AVX1-NEXT: movl %ecx, %edx +; AVX1-NEXT: .LBB16_16: +; AVX1-NEXT: movzbl %dl, %ecx +; AVX1-NEXT: vpinsrb $7, %ecx, %xmm2, %xmm2 +; AVX1-NEXT: vpextrb $8, %xmm1, %edx +; AVX1-NEXT: vpextrb $8, %xmm0, %ecx +; AVX1-NEXT: subb %dl, %cl +; AVX1-NEXT: movl $0, %edx +; AVX1-NEXT: jb .LBB16_18 +; AVX1-NEXT: # %bb.17: +; AVX1-NEXT: movl %ecx, %edx +; AVX1-NEXT: .LBB16_18: +; AVX1-NEXT: movzbl %dl, %ecx +; AVX1-NEXT: vpinsrb $8, %ecx, %xmm2, %xmm2 +; AVX1-NEXT: vpextrb $9, %xmm1, %edx +; AVX1-NEXT: vpextrb $9, %xmm0, %ecx +; AVX1-NEXT: subb %dl, %cl +; AVX1-NEXT: movl $0, %edx +; AVX1-NEXT: jb .LBB16_20 +; AVX1-NEXT: # %bb.19: +; AVX1-NEXT: movl %ecx, %edx +; AVX1-NEXT: .LBB16_20: +; AVX1-NEXT: movzbl %dl, %ecx +; AVX1-NEXT: vpinsrb $9, %ecx, %xmm2, %xmm2 +; AVX1-NEXT: vpextrb $10, %xmm1, %edx +; AVX1-NEXT: vpextrb $10, %xmm0, %ecx +; AVX1-NEXT: subb %dl, %cl +; AVX1-NEXT: movl $0, %edx +; AVX1-NEXT: jb .LBB16_22 +; AVX1-NEXT: # %bb.21: +; AVX1-NEXT: movl %ecx, %edx +; AVX1-NEXT: .LBB16_22: +; AVX1-NEXT: movzbl %dl, %ecx +; AVX1-NEXT: vpinsrb $10, %ecx, %xmm2, %xmm2 +; AVX1-NEXT: vpextrb $11, %xmm1, %edx +; AVX1-NEXT: vpextrb $11, %xmm0, %ecx +; AVX1-NEXT: subb %dl, %cl +; AVX1-NEXT: 
movl $0, %edx +; AVX1-NEXT: jb .LBB16_24 +; AVX1-NEXT: # %bb.23: +; AVX1-NEXT: movl %ecx, %edx +; AVX1-NEXT: .LBB16_24: +; AVX1-NEXT: movzbl %dl, %ecx +; AVX1-NEXT: vpinsrb $11, %ecx, %xmm2, %xmm2 +; AVX1-NEXT: vpextrb $12, %xmm1, %edx +; AVX1-NEXT: vpextrb $12, %xmm0, %ecx +; AVX1-NEXT: subb %dl, %cl +; AVX1-NEXT: movl $0, %edx +; AVX1-NEXT: jb .LBB16_26 +; AVX1-NEXT: # %bb.25: +; AVX1-NEXT: movl %ecx, %edx +; AVX1-NEXT: .LBB16_26: +; AVX1-NEXT: movzbl %dl, %ecx +; AVX1-NEXT: vpinsrb $12, %ecx, %xmm2, %xmm2 +; AVX1-NEXT: vpextrb $13, %xmm1, %edx +; AVX1-NEXT: vpextrb $13, %xmm0, %ecx +; AVX1-NEXT: subb %dl, %cl +; AVX1-NEXT: movl $0, %edx +; AVX1-NEXT: jb .LBB16_28 +; AVX1-NEXT: # %bb.27: +; AVX1-NEXT: movl %ecx, %edx +; AVX1-NEXT: .LBB16_28: +; AVX1-NEXT: movzbl %dl, %ecx +; AVX1-NEXT: vpinsrb $13, %ecx, %xmm2, %xmm2 +; AVX1-NEXT: vpextrb $14, %xmm1, %edx +; AVX1-NEXT: vpextrb $14, %xmm0, %ecx +; AVX1-NEXT: subb %dl, %cl +; AVX1-NEXT: movl $0, %edx +; AVX1-NEXT: jb .LBB16_30 +; AVX1-NEXT: # %bb.29: +; AVX1-NEXT: movl %ecx, %edx +; AVX1-NEXT: .LBB16_30: +; AVX1-NEXT: movzbl %dl, %ecx +; AVX1-NEXT: vpinsrb $14, %ecx, %xmm2, %xmm2 +; AVX1-NEXT: vpextrb $15, %xmm1, %edx +; AVX1-NEXT: vpextrb $15, %xmm0, %ecx +; AVX1-NEXT: subb %dl, %cl +; AVX1-NEXT: jb .LBB16_32 +; AVX1-NEXT: # %bb.31: +; AVX1-NEXT: movl %ecx, %eax +; AVX1-NEXT: .LBB16_32: +; AVX1-NEXT: movzbl %al, %eax +; AVX1-NEXT: vpinsrb $15, %eax, %xmm2, %xmm0 +; AVX1-NEXT: vpsrlw $7, %xmm0, %xmm0 +; AVX1-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: v16i1: +; AVX2: # %bb.0: +; AVX2-NEXT: vpsllw $7, %xmm1, %xmm1 +; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128] +; AVX2-NEXT: vpand %xmm2, %xmm1, %xmm1 +; AVX2-NEXT: vpextrb $1, %xmm1, %edx +; AVX2-NEXT: vpsllw $7, %xmm0, %xmm0 +; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpextrb $1, %xmm0, %ecx +; AVX2-NEXT: xorl %eax, %eax +; AVX2-NEXT: subb %dl, %cl +; AVX2-NEXT: movl $0, %edx +; AVX2-NEXT: jb .LBB16_2 +; AVX2-NEXT: # %bb.1: +; AVX2-NEXT: movl %ecx, %edx +; AVX2-NEXT: .LBB16_2: +; AVX2-NEXT: movzbl %dl, %ecx +; AVX2-NEXT: vpextrb $0, %xmm1, %esi +; AVX2-NEXT: vpextrb $0, %xmm0, %edx +; AVX2-NEXT: subb %sil, %dl +; AVX2-NEXT: movl $0, %esi +; AVX2-NEXT: jb .LBB16_4 +; AVX2-NEXT: # %bb.3: +; AVX2-NEXT: movl %edx, %esi +; AVX2-NEXT: .LBB16_4: +; AVX2-NEXT: movzbl %sil, %edx +; AVX2-NEXT: vmovd %edx, %xmm2 +; AVX2-NEXT: vpinsrb $1, %ecx, %xmm2, %xmm2 +; AVX2-NEXT: vpextrb $2, %xmm1, %edx +; AVX2-NEXT: vpextrb $2, %xmm0, %ecx +; AVX2-NEXT: subb %dl, %cl +; AVX2-NEXT: movl $0, %edx +; AVX2-NEXT: jb .LBB16_6 +; AVX2-NEXT: # %bb.5: +; AVX2-NEXT: movl %ecx, %edx +; AVX2-NEXT: .LBB16_6: +; AVX2-NEXT: movzbl %dl, %ecx +; AVX2-NEXT: vpinsrb $2, %ecx, %xmm2, %xmm2 +; AVX2-NEXT: vpextrb $3, %xmm1, %edx +; AVX2-NEXT: vpextrb $3, %xmm0, %ecx +; AVX2-NEXT: subb %dl, %cl +; AVX2-NEXT: movl $0, %edx +; AVX2-NEXT: jb .LBB16_8 +; AVX2-NEXT: # %bb.7: +; AVX2-NEXT: movl %ecx, %edx +; AVX2-NEXT: .LBB16_8: +; AVX2-NEXT: movzbl %dl, %ecx +; AVX2-NEXT: vpinsrb $3, %ecx, %xmm2, %xmm2 +; AVX2-NEXT: vpextrb $4, %xmm1, %edx +; AVX2-NEXT: vpextrb $4, %xmm0, %ecx +; AVX2-NEXT: subb %dl, %cl +; AVX2-NEXT: movl $0, %edx +; AVX2-NEXT: jb .LBB16_10 +; AVX2-NEXT: # %bb.9: +; AVX2-NEXT: movl %ecx, %edx +; AVX2-NEXT: .LBB16_10: +; AVX2-NEXT: movzbl %dl, %ecx +; AVX2-NEXT: vpinsrb $4, %ecx, %xmm2, %xmm2 +; AVX2-NEXT: vpextrb $5, %xmm1, %edx +; AVX2-NEXT: vpextrb $5, %xmm0, %ecx +; AVX2-NEXT: subb %dl, %cl +; AVX2-NEXT: movl $0, %edx +; 
AVX2-NEXT: jb .LBB16_12 +; AVX2-NEXT: # %bb.11: +; AVX2-NEXT: movl %ecx, %edx +; AVX2-NEXT: .LBB16_12: +; AVX2-NEXT: movzbl %dl, %ecx +; AVX2-NEXT: vpinsrb $5, %ecx, %xmm2, %xmm2 +; AVX2-NEXT: vpextrb $6, %xmm1, %edx +; AVX2-NEXT: vpextrb $6, %xmm0, %ecx +; AVX2-NEXT: subb %dl, %cl +; AVX2-NEXT: movl $0, %edx +; AVX2-NEXT: jb .LBB16_14 +; AVX2-NEXT: # %bb.13: +; AVX2-NEXT: movl %ecx, %edx +; AVX2-NEXT: .LBB16_14: +; AVX2-NEXT: movzbl %dl, %ecx +; AVX2-NEXT: vpinsrb $6, %ecx, %xmm2, %xmm2 +; AVX2-NEXT: vpextrb $7, %xmm1, %edx +; AVX2-NEXT: vpextrb $7, %xmm0, %ecx +; AVX2-NEXT: subb %dl, %cl +; AVX2-NEXT: movl $0, %edx +; AVX2-NEXT: jb .LBB16_16 +; AVX2-NEXT: # %bb.15: +; AVX2-NEXT: movl %ecx, %edx +; AVX2-NEXT: .LBB16_16: +; AVX2-NEXT: movzbl %dl, %ecx +; AVX2-NEXT: vpinsrb $7, %ecx, %xmm2, %xmm2 +; AVX2-NEXT: vpextrb $8, %xmm1, %edx +; AVX2-NEXT: vpextrb $8, %xmm0, %ecx +; AVX2-NEXT: subb %dl, %cl +; AVX2-NEXT: movl $0, %edx +; AVX2-NEXT: jb .LBB16_18 +; AVX2-NEXT: # %bb.17: +; AVX2-NEXT: movl %ecx, %edx +; AVX2-NEXT: .LBB16_18: +; AVX2-NEXT: movzbl %dl, %ecx +; AVX2-NEXT: vpinsrb $8, %ecx, %xmm2, %xmm2 +; AVX2-NEXT: vpextrb $9, %xmm1, %edx +; AVX2-NEXT: vpextrb $9, %xmm0, %ecx +; AVX2-NEXT: subb %dl, %cl +; AVX2-NEXT: movl $0, %edx +; AVX2-NEXT: jb .LBB16_20 +; AVX2-NEXT: # %bb.19: +; AVX2-NEXT: movl %ecx, %edx +; AVX2-NEXT: .LBB16_20: +; AVX2-NEXT: movzbl %dl, %ecx +; AVX2-NEXT: vpinsrb $9, %ecx, %xmm2, %xmm2 +; AVX2-NEXT: vpextrb $10, %xmm1, %edx +; AVX2-NEXT: vpextrb $10, %xmm0, %ecx +; AVX2-NEXT: subb %dl, %cl +; AVX2-NEXT: movl $0, %edx +; AVX2-NEXT: jb .LBB16_22 +; AVX2-NEXT: # %bb.21: +; AVX2-NEXT: movl %ecx, %edx +; AVX2-NEXT: .LBB16_22: +; AVX2-NEXT: movzbl %dl, %ecx +; AVX2-NEXT: vpinsrb $10, %ecx, %xmm2, %xmm2 +; AVX2-NEXT: vpextrb $11, %xmm1, %edx +; AVX2-NEXT: vpextrb $11, %xmm0, %ecx +; AVX2-NEXT: subb %dl, %cl +; AVX2-NEXT: movl $0, %edx +; AVX2-NEXT: jb .LBB16_24 +; AVX2-NEXT: # %bb.23: +; AVX2-NEXT: movl %ecx, %edx +; AVX2-NEXT: .LBB16_24: +; AVX2-NEXT: movzbl %dl, %ecx +; AVX2-NEXT: vpinsrb $11, %ecx, %xmm2, %xmm2 +; AVX2-NEXT: vpextrb $12, %xmm1, %edx +; AVX2-NEXT: vpextrb $12, %xmm0, %ecx +; AVX2-NEXT: subb %dl, %cl +; AVX2-NEXT: movl $0, %edx +; AVX2-NEXT: jb .LBB16_26 +; AVX2-NEXT: # %bb.25: +; AVX2-NEXT: movl %ecx, %edx +; AVX2-NEXT: .LBB16_26: +; AVX2-NEXT: movzbl %dl, %ecx +; AVX2-NEXT: vpinsrb $12, %ecx, %xmm2, %xmm2 +; AVX2-NEXT: vpextrb $13, %xmm1, %edx +; AVX2-NEXT: vpextrb $13, %xmm0, %ecx +; AVX2-NEXT: subb %dl, %cl +; AVX2-NEXT: movl $0, %edx +; AVX2-NEXT: jb .LBB16_28 +; AVX2-NEXT: # %bb.27: +; AVX2-NEXT: movl %ecx, %edx +; AVX2-NEXT: .LBB16_28: +; AVX2-NEXT: movzbl %dl, %ecx +; AVX2-NEXT: vpinsrb $13, %ecx, %xmm2, %xmm2 +; AVX2-NEXT: vpextrb $14, %xmm1, %edx +; AVX2-NEXT: vpextrb $14, %xmm0, %ecx +; AVX2-NEXT: subb %dl, %cl +; AVX2-NEXT: movl $0, %edx +; AVX2-NEXT: jb .LBB16_30 +; AVX2-NEXT: # %bb.29: +; AVX2-NEXT: movl %ecx, %edx +; AVX2-NEXT: .LBB16_30: +; AVX2-NEXT: movzbl %dl, %ecx +; AVX2-NEXT: vpinsrb $14, %ecx, %xmm2, %xmm2 +; AVX2-NEXT: vpextrb $15, %xmm1, %edx +; AVX2-NEXT: vpextrb $15, %xmm0, %ecx +; AVX2-NEXT: subb %dl, %cl +; AVX2-NEXT: jb .LBB16_32 +; AVX2-NEXT: # %bb.31: +; AVX2-NEXT: movl %ecx, %eax +; AVX2-NEXT: .LBB16_32: +; AVX2-NEXT: movzbl %al, %eax +; AVX2-NEXT: vpinsrb $15, %eax, %xmm2, %xmm0 +; AVX2-NEXT: vpsrlw $7, %xmm0, %xmm0 +; AVX2-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 +; AVX2-NEXT: retq +; +; AVX512-LABEL: v16i1: +; AVX512: # %bb.0: +; AVX512-NEXT: pushq %rbp +; AVX512-NEXT: pushq %r15 +; AVX512-NEXT: pushq %r14 +; 
AVX512-NEXT: pushq %rbx +; AVX512-NEXT: vpsllw $7, %xmm0, %xmm0 +; AVX512-NEXT: vpmovb2m %xmm0, %k0 +; AVX512-NEXT: vpsllw $7, %xmm1, %xmm0 +; AVX512-NEXT: vpmovb2m %xmm0, %k1 +; AVX512-NEXT: kshiftrw $4, %k0, %k2 +; AVX512-NEXT: kshiftrw $4, %k1, %k3 +; AVX512-NEXT: kshiftrw $3, %k0, %k4 +; AVX512-NEXT: kmovd %k4, %r15d +; AVX512-NEXT: kshiftrw $3, %k1, %k4 +; AVX512-NEXT: kmovd %k4, %r9d +; AVX512-NEXT: kshiftrw $2, %k0, %k4 +; AVX512-NEXT: kmovd %k4, %eax +; AVX512-NEXT: kshiftrw $2, %k1, %k4 +; AVX512-NEXT: kmovd %k4, %ebp +; AVX512-NEXT: kmovd %k0, %ecx +; AVX512-NEXT: kmovd %k1, %esi +; AVX512-NEXT: kshiftrw $1, %k0, %k4 +; AVX512-NEXT: kmovd %k4, %edi +; AVX512-NEXT: kshiftrw $1, %k1, %k4 +; AVX512-NEXT: kmovd %k4, %edx +; AVX512-NEXT: shlb $7, %dl +; AVX512-NEXT: shlb $7, %dil +; AVX512-NEXT: xorl %r8d, %r8d +; AVX512-NEXT: subb %dl, %dil +; AVX512-NEXT: movl $0, %ebx +; AVX512-NEXT: jb .LBB16_2 +; AVX512-NEXT: # %bb.1: +; AVX512-NEXT: movl %edi, %ebx +; AVX512-NEXT: .LBB16_2: +; AVX512-NEXT: kshiftrw $5, %k0, %k4 +; AVX512-NEXT: kshiftrw $5, %k1, %k5 +; AVX512-NEXT: kmovd %k2, %edi +; AVX512-NEXT: kmovd %k3, %r11d +; AVX512-NEXT: shrb $7, %bl +; AVX512-NEXT: kmovd %ebx, %k6 +; AVX512-NEXT: shlb $7, %sil +; AVX512-NEXT: shlb $7, %cl +; AVX512-NEXT: subb %sil, %cl +; AVX512-NEXT: movl $0, %edx +; AVX512-NEXT: jb .LBB16_4 +; AVX512-NEXT: # %bb.3: +; AVX512-NEXT: movl %ecx, %edx +; AVX512-NEXT: .LBB16_4: +; AVX512-NEXT: kshiftrw $6, %k0, %k2 +; AVX512-NEXT: kshiftrw $6, %k1, %k3 +; AVX512-NEXT: kmovd %k4, %esi +; AVX512-NEXT: kmovd %k5, %r14d +; AVX512-NEXT: shrb $7, %dl +; AVX512-NEXT: kmovd %edx, %k4 +; AVX512-NEXT: kshiftrw $1, %k4, %k5 +; AVX512-NEXT: kxorw %k6, %k5, %k5 +; AVX512-NEXT: kshiftlw $15, %k5, %k5 +; AVX512-NEXT: kshiftrw $14, %k5, %k5 +; AVX512-NEXT: kxorw %k5, %k4, %k6 +; AVX512-NEXT: kshiftrw $2, %k6, %k7 +; AVX512-NEXT: shlb $7, %bpl +; AVX512-NEXT: shlb $7, %al +; AVX512-NEXT: subb %bpl, %al +; AVX512-NEXT: movl $0, %ecx +; AVX512-NEXT: jb .LBB16_6 +; AVX512-NEXT: # %bb.5: +; AVX512-NEXT: movl %eax, %ecx +; AVX512-NEXT: .LBB16_6: +; AVX512-NEXT: kshiftrw $7, %k0, %k4 +; AVX512-NEXT: kshiftrw $7, %k1, %k5 +; AVX512-NEXT: kmovd %k2, %eax +; AVX512-NEXT: kmovd %k3, %r10d +; AVX512-NEXT: shrb $7, %cl +; AVX512-NEXT: kmovd %ecx, %k2 +; AVX512-NEXT: kxorw %k2, %k7, %k2 +; AVX512-NEXT: kshiftlw $15, %k2, %k2 +; AVX512-NEXT: kshiftrw $13, %k2, %k2 +; AVX512-NEXT: kxorw %k2, %k6, %k6 +; AVX512-NEXT: kshiftrw $3, %k6, %k7 +; AVX512-NEXT: shlb $7, %r9b +; AVX512-NEXT: shlb $7, %r15b +; AVX512-NEXT: subb %r9b, %r15b +; AVX512-NEXT: movl $0, %edx +; AVX512-NEXT: jb .LBB16_8 +; AVX512-NEXT: # %bb.7: +; AVX512-NEXT: movl %r15d, %edx +; AVX512-NEXT: .LBB16_8: +; AVX512-NEXT: kshiftrw $8, %k0, %k2 +; AVX512-NEXT: kshiftrw $8, %k1, %k3 +; AVX512-NEXT: kmovd %k4, %ecx +; AVX512-NEXT: kmovd %k5, %r9d +; AVX512-NEXT: shrb $7, %dl +; AVX512-NEXT: kmovd %edx, %k4 +; AVX512-NEXT: kxorw %k4, %k7, %k4 +; AVX512-NEXT: kshiftlw $15, %k4, %k4 +; AVX512-NEXT: kshiftrw $12, %k4, %k4 +; AVX512-NEXT: kxorw %k4, %k6, %k6 +; AVX512-NEXT: kshiftrw $4, %k6, %k7 +; AVX512-NEXT: shlb $7, %r11b +; AVX512-NEXT: shlb $7, %dil +; AVX512-NEXT: subb %r11b, %dil +; AVX512-NEXT: movl $0, %edx +; AVX512-NEXT: jb .LBB16_10 +; AVX512-NEXT: # %bb.9: +; AVX512-NEXT: movl %edi, %edx +; AVX512-NEXT: .LBB16_10: +; AVX512-NEXT: kshiftrw $9, %k0, %k4 +; AVX512-NEXT: kshiftrw $9, %k1, %k5 +; AVX512-NEXT: kmovd %k2, %edi +; AVX512-NEXT: kmovd %k3, %ebx +; AVX512-NEXT: shrb $7, %dl +; AVX512-NEXT: kmovd %edx, %k2 +; 
AVX512-NEXT: kxorw %k2, %k7, %k2 +; AVX512-NEXT: kshiftlw $15, %k2, %k2 +; AVX512-NEXT: kshiftrw $11, %k2, %k2 +; AVX512-NEXT: kxorw %k2, %k6, %k6 +; AVX512-NEXT: kshiftrw $5, %k6, %k7 +; AVX512-NEXT: shlb $7, %r14b +; AVX512-NEXT: shlb $7, %sil +; AVX512-NEXT: subb %r14b, %sil +; AVX512-NEXT: movl $0, %edx +; AVX512-NEXT: jb .LBB16_12 +; AVX512-NEXT: # %bb.11: +; AVX512-NEXT: movl %esi, %edx +; AVX512-NEXT: .LBB16_12: +; AVX512-NEXT: kshiftrw $10, %k0, %k2 +; AVX512-NEXT: kshiftrw $10, %k1, %k3 +; AVX512-NEXT: kmovd %k4, %esi +; AVX512-NEXT: kmovd %k5, %r11d +; AVX512-NEXT: shrb $7, %dl +; AVX512-NEXT: kmovd %edx, %k4 +; AVX512-NEXT: kxorw %k4, %k7, %k4 +; AVX512-NEXT: kshiftlw $15, %k4, %k4 +; AVX512-NEXT: kshiftrw $10, %k4, %k4 +; AVX512-NEXT: kxorw %k4, %k6, %k6 +; AVX512-NEXT: kshiftrw $6, %k6, %k7 +; AVX512-NEXT: shlb $7, %r10b +; AVX512-NEXT: shlb $7, %al +; AVX512-NEXT: subb %r10b, %al +; AVX512-NEXT: movl $0, %ebp +; AVX512-NEXT: jb .LBB16_14 +; AVX512-NEXT: # %bb.13: +; AVX512-NEXT: movl %eax, %ebp +; AVX512-NEXT: .LBB16_14: +; AVX512-NEXT: kshiftrw $11, %k0, %k4 +; AVX512-NEXT: kshiftrw $11, %k1, %k5 +; AVX512-NEXT: kmovd %k2, %r15d +; AVX512-NEXT: kmovd %k3, %r10d +; AVX512-NEXT: shrb $7, %bpl +; AVX512-NEXT: kmovd %ebp, %k2 +; AVX512-NEXT: kxorw %k2, %k7, %k2 +; AVX512-NEXT: kshiftlw $15, %k2, %k2 +; AVX512-NEXT: kshiftrw $9, %k2, %k2 +; AVX512-NEXT: kxorw %k2, %k6, %k6 +; AVX512-NEXT: kshiftrw $7, %k6, %k7 +; AVX512-NEXT: shlb $7, %r9b +; AVX512-NEXT: shlb $7, %cl +; AVX512-NEXT: subb %r9b, %cl +; AVX512-NEXT: movl $0, %eax +; AVX512-NEXT: jb .LBB16_16 +; AVX512-NEXT: # %bb.15: +; AVX512-NEXT: movl %ecx, %eax +; AVX512-NEXT: .LBB16_16: +; AVX512-NEXT: kshiftrw $12, %k0, %k2 +; AVX512-NEXT: kshiftrw $12, %k1, %k3 +; AVX512-NEXT: kmovd %k4, %ecx +; AVX512-NEXT: kmovd %k5, %r9d +; AVX512-NEXT: shrb $7, %al +; AVX512-NEXT: kmovd %eax, %k4 +; AVX512-NEXT: kxorw %k4, %k7, %k4 +; AVX512-NEXT: kshiftlw $15, %k4, %k4 +; AVX512-NEXT: kshiftrw $8, %k4, %k4 +; AVX512-NEXT: kxorw %k4, %k6, %k6 +; AVX512-NEXT: kshiftrw $8, %k6, %k7 +; AVX512-NEXT: shlb $7, %bl +; AVX512-NEXT: shlb $7, %dil +; AVX512-NEXT: subb %bl, %dil +; AVX512-NEXT: movl $0, %ebx +; AVX512-NEXT: jb .LBB16_18 +; AVX512-NEXT: # %bb.17: +; AVX512-NEXT: movl %edi, %ebx +; AVX512-NEXT: .LBB16_18: +; AVX512-NEXT: kshiftrw $13, %k0, %k4 +; AVX512-NEXT: kshiftrw $13, %k1, %k5 +; AVX512-NEXT: kmovd %k2, %eax +; AVX512-NEXT: kmovd %k3, %r14d +; AVX512-NEXT: shrb $7, %bl +; AVX512-NEXT: kmovd %ebx, %k2 +; AVX512-NEXT: kxorw %k2, %k7, %k2 +; AVX512-NEXT: kshiftlw $15, %k2, %k2 +; AVX512-NEXT: kshiftrw $7, %k2, %k2 +; AVX512-NEXT: kxorw %k2, %k6, %k6 +; AVX512-NEXT: kshiftrw $9, %k6, %k7 +; AVX512-NEXT: shlb $7, %r11b +; AVX512-NEXT: shlb $7, %sil +; AVX512-NEXT: subb %r11b, %sil +; AVX512-NEXT: movl $0, %edi +; AVX512-NEXT: jb .LBB16_20 +; AVX512-NEXT: # %bb.19: +; AVX512-NEXT: movl %esi, %edi +; AVX512-NEXT: .LBB16_20: +; AVX512-NEXT: kshiftrw $14, %k0, %k2 +; AVX512-NEXT: kshiftrw $14, %k1, %k3 +; AVX512-NEXT: kmovd %k4, %esi +; AVX512-NEXT: kmovd %k5, %r11d +; AVX512-NEXT: shrb $7, %dil +; AVX512-NEXT: kmovd %edi, %k4 +; AVX512-NEXT: kxorw %k4, %k7, %k4 +; AVX512-NEXT: kshiftlw $15, %k4, %k4 +; AVX512-NEXT: kshiftrw $6, %k4, %k4 +; AVX512-NEXT: kxorw %k4, %k6, %k4 +; AVX512-NEXT: kshiftrw $10, %k4, %k5 +; AVX512-NEXT: shlb $7, %r10b +; AVX512-NEXT: shlb $7, %r15b +; AVX512-NEXT: subb %r10b, %r15b +; AVX512-NEXT: movl $0, %edx +; AVX512-NEXT: jb .LBB16_22 +; AVX512-NEXT: # %bb.21: +; AVX512-NEXT: movl %r15d, %edx +; AVX512-NEXT: 
.LBB16_22: +; AVX512-NEXT: kshiftrw $15, %k0, %k0 +; AVX512-NEXT: kshiftrw $15, %k1, %k1 +; AVX512-NEXT: kmovd %k2, %ebx +; AVX512-NEXT: kmovd %k3, %edi +; AVX512-NEXT: shrb $7, %dl +; AVX512-NEXT: kmovd %edx, %k2 +; AVX512-NEXT: kxorw %k2, %k5, %k2 +; AVX512-NEXT: kshiftlw $15, %k2, %k2 +; AVX512-NEXT: kshiftrw $5, %k2, %k2 +; AVX512-NEXT: kxorw %k2, %k4, %k2 +; AVX512-NEXT: kshiftrw $11, %k2, %k3 +; AVX512-NEXT: shlb $7, %r9b +; AVX512-NEXT: shlb $7, %cl +; AVX512-NEXT: subb %r9b, %cl +; AVX512-NEXT: movl $0, %edx +; AVX512-NEXT: jb .LBB16_24 +; AVX512-NEXT: # %bb.23: +; AVX512-NEXT: movl %ecx, %edx +; AVX512-NEXT: .LBB16_24: +; AVX512-NEXT: kmovd %k0, %ecx +; AVX512-NEXT: kmovd %k1, %ebp +; AVX512-NEXT: shrb $7, %dl +; AVX512-NEXT: kmovd %edx, %k0 +; AVX512-NEXT: kxorw %k0, %k3, %k0 +; AVX512-NEXT: kshiftlw $15, %k0, %k0 +; AVX512-NEXT: kshiftrw $4, %k0, %k0 +; AVX512-NEXT: kxorw %k0, %k2, %k0 +; AVX512-NEXT: kshiftrw $12, %k0, %k1 +; AVX512-NEXT: shlb $7, %r14b +; AVX512-NEXT: shlb $7, %al +; AVX512-NEXT: subb %r14b, %al +; AVX512-NEXT: movl $0, %edx +; AVX512-NEXT: jb .LBB16_26 +; AVX512-NEXT: # %bb.25: +; AVX512-NEXT: movl %eax, %edx +; AVX512-NEXT: .LBB16_26: +; AVX512-NEXT: shrb $7, %dl +; AVX512-NEXT: kmovd %edx, %k2 +; AVX512-NEXT: kxorw %k2, %k1, %k1 +; AVX512-NEXT: kshiftlw $15, %k1, %k1 +; AVX512-NEXT: kshiftrw $3, %k1, %k1 +; AVX512-NEXT: kxorw %k1, %k0, %k0 +; AVX512-NEXT: kshiftrw $13, %k0, %k1 +; AVX512-NEXT: shlb $7, %r11b +; AVX512-NEXT: shlb $7, %sil +; AVX512-NEXT: subb %r11b, %sil +; AVX512-NEXT: movl $0, %eax +; AVX512-NEXT: jb .LBB16_28 +; AVX512-NEXT: # %bb.27: +; AVX512-NEXT: movl %esi, %eax +; AVX512-NEXT: .LBB16_28: +; AVX512-NEXT: shrb $7, %al +; AVX512-NEXT: kmovd %eax, %k2 +; AVX512-NEXT: kxorw %k2, %k1, %k1 +; AVX512-NEXT: kshiftlw $15, %k1, %k1 +; AVX512-NEXT: kshiftrw $2, %k1, %k1 +; AVX512-NEXT: kxorw %k1, %k0, %k0 +; AVX512-NEXT: kshiftrw $14, %k0, %k1 +; AVX512-NEXT: shlb $7, %dil +; AVX512-NEXT: shlb $7, %bl +; AVX512-NEXT: subb %dil, %bl +; AVX512-NEXT: movl $0, %eax +; AVX512-NEXT: jb .LBB16_30 +; AVX512-NEXT: # %bb.29: +; AVX512-NEXT: movl %ebx, %eax +; AVX512-NEXT: .LBB16_30: +; AVX512-NEXT: shrb $7, %al +; AVX512-NEXT: kmovd %eax, %k2 +; AVX512-NEXT: kxorw %k2, %k1, %k1 +; AVX512-NEXT: kshiftlw $15, %k1, %k1 +; AVX512-NEXT: kshiftrw $1, %k1, %k1 +; AVX512-NEXT: kxorw %k1, %k0, %k0 +; AVX512-NEXT: kshiftlw $1, %k0, %k0 +; AVX512-NEXT: kshiftrw $1, %k0, %k0 +; AVX512-NEXT: shlb $7, %bpl +; AVX512-NEXT: shlb $7, %cl +; AVX512-NEXT: subb %bpl, %cl +; AVX512-NEXT: jb .LBB16_32 +; AVX512-NEXT: # %bb.31: +; AVX512-NEXT: movl %ecx, %r8d +; AVX512-NEXT: .LBB16_32: +; AVX512-NEXT: shrb $7, %r8b +; AVX512-NEXT: kmovd %r8d, %k1 +; AVX512-NEXT: kshiftlw $15, %k1, %k1 +; AVX512-NEXT: korw %k1, %k0, %k0 +; AVX512-NEXT: vpmovm2b %k0, %xmm0 +; AVX512-NEXT: popq %rbx +; AVX512-NEXT: popq %r14 +; AVX512-NEXT: popq %r15 +; AVX512-NEXT: popq %rbp +; AVX512-NEXT: retq + %z = call <16 x i1> @llvm.usub.sat.v16i1(<16 x i1> %x, <16 x i1> %y) + ret <16 x i1> %z +} + +; Expanded + +define <4 x i32> @v4i32(<4 x i32> %x, <4 x i32> %y) nounwind { +; SSE2-LABEL: v4i32: +; SSE2: # %bb.0: +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[3,1,2,3] +; SSE2-NEXT: movd %xmm2, %eax +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[3,1,2,3] +; SSE2-NEXT: movd %xmm2, %ecx +; SSE2-NEXT: xorl %edx, %edx +; SSE2-NEXT: subl %eax, %ecx +; SSE2-NEXT: cmovbl %edx, %ecx +; SSE2-NEXT: movd %ecx, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,3,0,1] +; SSE2-NEXT: movd %xmm3, %eax +; SSE2-NEXT: pshufd {{.*#+}} 
xmm3 = xmm0[2,3,0,1] +; SSE2-NEXT: movd %xmm3, %ecx +; SSE2-NEXT: subl %eax, %ecx +; SSE2-NEXT: cmovbl %edx, %ecx +; SSE2-NEXT: movd %ecx, %xmm3 +; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; SSE2-NEXT: movd %xmm1, %eax +; SSE2-NEXT: movd %xmm0, %ecx +; SSE2-NEXT: subl %eax, %ecx +; SSE2-NEXT: cmovbl %edx, %ecx +; SSE2-NEXT: movd %ecx, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,2,3] +; SSE2-NEXT: movd %xmm1, %eax +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] +; SSE2-NEXT: movd %xmm0, %ecx +; SSE2-NEXT: subl %eax, %ecx +; SSE2-NEXT: cmovbl %edx, %ecx +; SSE2-NEXT: movd %ecx, %xmm0 +; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0] +; SSE2-NEXT: movdqa %xmm2, %xmm0 +; SSE2-NEXT: retq +; +; SSSE3-LABEL: v4i32: +; SSSE3: # %bb.0: +; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm1[3,1,2,3] +; SSSE3-NEXT: movd %xmm2, %eax +; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm0[3,1,2,3] +; SSSE3-NEXT: movd %xmm2, %ecx +; SSSE3-NEXT: xorl %edx, %edx +; SSSE3-NEXT: subl %eax, %ecx +; SSSE3-NEXT: cmovbl %edx, %ecx +; SSSE3-NEXT: movd %ecx, %xmm2 +; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,3,0,1] +; SSSE3-NEXT: movd %xmm3, %eax +; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,0,1] +; SSSE3-NEXT: movd %xmm3, %ecx +; SSSE3-NEXT: subl %eax, %ecx +; SSSE3-NEXT: cmovbl %edx, %ecx +; SSSE3-NEXT: movd %ecx, %xmm3 +; SSSE3-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; SSSE3-NEXT: movd %xmm1, %eax +; SSSE3-NEXT: movd %xmm0, %ecx +; SSSE3-NEXT: subl %eax, %ecx +; SSSE3-NEXT: cmovbl %edx, %ecx +; SSSE3-NEXT: movd %ecx, %xmm2 +; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,2,3] +; SSSE3-NEXT: movd %xmm1, %eax +; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] +; SSSE3-NEXT: movd %xmm0, %ecx +; SSSE3-NEXT: subl %eax, %ecx +; SSSE3-NEXT: cmovbl %edx, %ecx +; SSSE3-NEXT: movd %ecx, %xmm0 +; SSSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] +; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0] +; SSSE3-NEXT: movdqa %xmm2, %xmm0 +; SSSE3-NEXT: retq +; +; SSE41-LABEL: v4i32: +; SSE41: # %bb.0: +; SSE41-NEXT: pextrd $1, %xmm1, %eax +; SSE41-NEXT: pextrd $1, %xmm0, %ecx +; SSE41-NEXT: xorl %edx, %edx +; SSE41-NEXT: subl %eax, %ecx +; SSE41-NEXT: cmovbl %edx, %ecx +; SSE41-NEXT: movd %xmm1, %eax +; SSE41-NEXT: movd %xmm0, %esi +; SSE41-NEXT: subl %eax, %esi +; SSE41-NEXT: cmovbl %edx, %esi +; SSE41-NEXT: movd %esi, %xmm2 +; SSE41-NEXT: pinsrd $1, %ecx, %xmm2 +; SSE41-NEXT: pextrd $2, %xmm1, %eax +; SSE41-NEXT: pextrd $2, %xmm0, %ecx +; SSE41-NEXT: subl %eax, %ecx +; SSE41-NEXT: cmovbl %edx, %ecx +; SSE41-NEXT: pinsrd $2, %ecx, %xmm2 +; SSE41-NEXT: pextrd $3, %xmm1, %eax +; SSE41-NEXT: pextrd $3, %xmm0, %ecx +; SSE41-NEXT: subl %eax, %ecx +; SSE41-NEXT: cmovbl %edx, %ecx +; SSE41-NEXT: pinsrd $3, %ecx, %xmm2 +; SSE41-NEXT: movdqa %xmm2, %xmm0 +; SSE41-NEXT: retq +; +; AVX-LABEL: v4i32: +; AVX: # %bb.0: +; AVX-NEXT: vpextrd $1, %xmm1, %eax +; AVX-NEXT: vpextrd $1, %xmm0, %ecx +; AVX-NEXT: xorl %edx, %edx +; AVX-NEXT: subl %eax, %ecx +; AVX-NEXT: cmovbl %edx, %ecx +; AVX-NEXT: vmovd %xmm1, %eax +; AVX-NEXT: vmovd %xmm0, %esi +; AVX-NEXT: subl %eax, %esi +; AVX-NEXT: cmovbl %edx, %esi +; AVX-NEXT: vmovd %esi, %xmm2 +; AVX-NEXT: vpinsrd $1, %ecx, %xmm2, %xmm2 +; AVX-NEXT: vpextrd $2, %xmm1, %eax +; AVX-NEXT: vpextrd $2, %xmm0, %ecx +; AVX-NEXT: subl %eax, %ecx +; AVX-NEXT: cmovbl %edx, %ecx +; AVX-NEXT: vpinsrd $2, %ecx, %xmm2, %xmm2 +; AVX-NEXT: vpextrd $3, %xmm1, %eax +; AVX-NEXT: 
vpextrd $3, %xmm0, %ecx +; AVX-NEXT: subl %eax, %ecx +; AVX-NEXT: cmovbl %edx, %ecx +; AVX-NEXT: vpinsrd $3, %ecx, %xmm2, %xmm0 +; AVX-NEXT: retq + %z = call <4 x i32> @llvm.usub.sat.v4i32(<4 x i32> %x, <4 x i32> %y) + ret <4 x i32> %z +} + +define <2 x i32> @v2i32(<2 x i32> %x, <2 x i32> %y) nounwind { +; SSE2-LABEL: v2i32: +; SSE2: # %bb.0: +; SSE2-NEXT: psllq $32, %xmm1 +; SSE2-NEXT: movq %xmm1, %rax +; SSE2-NEXT: psllq $32, %xmm0 +; SSE2-NEXT: movq %xmm0, %rcx +; SSE2-NEXT: xorl %edx, %edx +; SSE2-NEXT: subq %rax, %rcx +; SSE2-NEXT: cmovbq %rdx, %rcx +; SSE2-NEXT: movq %rcx, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] +; SSE2-NEXT: movq %xmm1, %rax +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; SSE2-NEXT: movq %xmm0, %rcx +; SSE2-NEXT: subq %rax, %rcx +; SSE2-NEXT: cmovbq %rdx, %rcx +; SSE2-NEXT: movq %rcx, %xmm0 +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm0[0] +; SSE2-NEXT: psrlq $32, %xmm2 +; SSE2-NEXT: movdqa %xmm2, %xmm0 +; SSE2-NEXT: retq +; +; SSSE3-LABEL: v2i32: +; SSSE3: # %bb.0: +; SSSE3-NEXT: psllq $32, %xmm1 +; SSSE3-NEXT: movq %xmm1, %rax +; SSSE3-NEXT: psllq $32, %xmm0 +; SSSE3-NEXT: movq %xmm0, %rcx +; SSSE3-NEXT: xorl %edx, %edx +; SSSE3-NEXT: subq %rax, %rcx +; SSSE3-NEXT: cmovbq %rdx, %rcx +; SSSE3-NEXT: movq %rcx, %xmm2 +; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] +; SSSE3-NEXT: movq %xmm1, %rax +; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; SSSE3-NEXT: movq %xmm0, %rcx +; SSSE3-NEXT: subq %rax, %rcx +; SSSE3-NEXT: cmovbq %rdx, %rcx +; SSSE3-NEXT: movq %rcx, %xmm0 +; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm0[0] +; SSSE3-NEXT: psrlq $32, %xmm2 +; SSSE3-NEXT: movdqa %xmm2, %xmm0 +; SSSE3-NEXT: retq +; +; SSE41-LABEL: v2i32: +; SSE41: # %bb.0: +; SSE41-NEXT: psllq $32, %xmm1 +; SSE41-NEXT: pextrq $1, %xmm1, %rax +; SSE41-NEXT: psllq $32, %xmm0 +; SSE41-NEXT: pextrq $1, %xmm0, %rcx +; SSE41-NEXT: xorl %edx, %edx +; SSE41-NEXT: subq %rax, %rcx +; SSE41-NEXT: cmovbq %rdx, %rcx +; SSE41-NEXT: movq %rcx, %xmm2 +; SSE41-NEXT: movq %xmm1, %rax +; SSE41-NEXT: movq %xmm0, %rcx +; SSE41-NEXT: subq %rax, %rcx +; SSE41-NEXT: cmovbq %rdx, %rcx +; SSE41-NEXT: movq %rcx, %xmm0 +; SSE41-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; SSE41-NEXT: psrlq $32, %xmm0 +; SSE41-NEXT: retq +; +; AVX-LABEL: v2i32: +; AVX: # %bb.0: +; AVX-NEXT: vpsllq $32, %xmm1, %xmm1 +; AVX-NEXT: vpextrq $1, %xmm1, %rax +; AVX-NEXT: vpsllq $32, %xmm0, %xmm0 +; AVX-NEXT: vpextrq $1, %xmm0, %rcx +; AVX-NEXT: xorl %edx, %edx +; AVX-NEXT: subq %rax, %rcx +; AVX-NEXT: cmovbq %rdx, %rcx +; AVX-NEXT: vmovq %rcx, %xmm2 +; AVX-NEXT: vmovq %xmm1, %rax +; AVX-NEXT: vmovq %xmm0, %rcx +; AVX-NEXT: subq %rax, %rcx +; AVX-NEXT: cmovbq %rdx, %rcx +; AVX-NEXT: vmovq %rcx, %xmm0 +; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; AVX-NEXT: vpsrlq $32, %xmm0, %xmm0 +; AVX-NEXT: retq + %z = call <2 x i32> @llvm.usub.sat.v2i32(<2 x i32> %x, <2 x i32> %y) + ret <2 x i32> %z +} + +define <4 x i24> @v4i24(<4 x i24> %x, <4 x i24> %y) nounwind { +; SSE2-LABEL: v4i24: +; SSE2: # %bb.0: +; SSE2-NEXT: pslld $8, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[3,1,2,3] +; SSE2-NEXT: movd %xmm2, %eax +; SSE2-NEXT: pslld $8, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[3,1,2,3] +; SSE2-NEXT: movd %xmm2, %ecx +; SSE2-NEXT: xorl %edx, %edx +; SSE2-NEXT: subl %eax, %ecx +; SSE2-NEXT: cmovbl %edx, %ecx +; SSE2-NEXT: movd %ecx, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,3,0,1] +; SSE2-NEXT: movd %xmm3, %eax +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,0,1] +; SSE2-NEXT: 
movd %xmm3, %ecx +; SSE2-NEXT: subl %eax, %ecx +; SSE2-NEXT: cmovbl %edx, %ecx +; SSE2-NEXT: movd %ecx, %xmm3 +; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; SSE2-NEXT: movd %xmm1, %eax +; SSE2-NEXT: movd %xmm0, %ecx +; SSE2-NEXT: subl %eax, %ecx +; SSE2-NEXT: cmovbl %edx, %ecx +; SSE2-NEXT: movd %ecx, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,2,3] +; SSE2-NEXT: movd %xmm1, %eax +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] +; SSE2-NEXT: movd %xmm0, %ecx +; SSE2-NEXT: subl %eax, %ecx +; SSE2-NEXT: cmovbl %edx, %ecx +; SSE2-NEXT: movd %ecx, %xmm0 +; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0] +; SSE2-NEXT: psrld $8, %xmm2 +; SSE2-NEXT: movdqa %xmm2, %xmm0 +; SSE2-NEXT: retq +; +; SSSE3-LABEL: v4i24: +; SSSE3: # %bb.0: +; SSSE3-NEXT: pslld $8, %xmm1 +; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm1[3,1,2,3] +; SSSE3-NEXT: movd %xmm2, %eax +; SSSE3-NEXT: pslld $8, %xmm0 +; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm0[3,1,2,3] +; SSSE3-NEXT: movd %xmm2, %ecx +; SSSE3-NEXT: xorl %edx, %edx +; SSSE3-NEXT: subl %eax, %ecx +; SSSE3-NEXT: cmovbl %edx, %ecx +; SSSE3-NEXT: movd %ecx, %xmm2 +; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,3,0,1] +; SSSE3-NEXT: movd %xmm3, %eax +; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,0,1] +; SSSE3-NEXT: movd %xmm3, %ecx +; SSSE3-NEXT: subl %eax, %ecx +; SSSE3-NEXT: cmovbl %edx, %ecx +; SSSE3-NEXT: movd %ecx, %xmm3 +; SSSE3-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; SSSE3-NEXT: movd %xmm1, %eax +; SSSE3-NEXT: movd %xmm0, %ecx +; SSSE3-NEXT: subl %eax, %ecx +; SSSE3-NEXT: cmovbl %edx, %ecx +; SSSE3-NEXT: movd %ecx, %xmm2 +; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,2,3] +; SSSE3-NEXT: movd %xmm1, %eax +; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] +; SSSE3-NEXT: movd %xmm0, %ecx +; SSSE3-NEXT: subl %eax, %ecx +; SSSE3-NEXT: cmovbl %edx, %ecx +; SSSE3-NEXT: movd %ecx, %xmm0 +; SSSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] +; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0] +; SSSE3-NEXT: psrld $8, %xmm2 +; SSSE3-NEXT: movdqa %xmm2, %xmm0 +; SSSE3-NEXT: retq +; +; SSE41-LABEL: v4i24: +; SSE41: # %bb.0: +; SSE41-NEXT: pslld $8, %xmm1 +; SSE41-NEXT: pextrd $1, %xmm1, %eax +; SSE41-NEXT: pslld $8, %xmm0 +; SSE41-NEXT: pextrd $1, %xmm0, %ecx +; SSE41-NEXT: xorl %edx, %edx +; SSE41-NEXT: subl %eax, %ecx +; SSE41-NEXT: cmovbl %edx, %ecx +; SSE41-NEXT: movd %xmm1, %eax +; SSE41-NEXT: movd %xmm0, %esi +; SSE41-NEXT: subl %eax, %esi +; SSE41-NEXT: cmovbl %edx, %esi +; SSE41-NEXT: movd %esi, %xmm2 +; SSE41-NEXT: pinsrd $1, %ecx, %xmm2 +; SSE41-NEXT: pextrd $2, %xmm1, %eax +; SSE41-NEXT: pextrd $2, %xmm0, %ecx +; SSE41-NEXT: subl %eax, %ecx +; SSE41-NEXT: cmovbl %edx, %ecx +; SSE41-NEXT: pinsrd $2, %ecx, %xmm2 +; SSE41-NEXT: pextrd $3, %xmm1, %eax +; SSE41-NEXT: pextrd $3, %xmm0, %ecx +; SSE41-NEXT: subl %eax, %ecx +; SSE41-NEXT: cmovbl %edx, %ecx +; SSE41-NEXT: pinsrd $3, %ecx, %xmm2 +; SSE41-NEXT: psrld $8, %xmm2 +; SSE41-NEXT: movdqa %xmm2, %xmm0 +; SSE41-NEXT: retq +; +; AVX-LABEL: v4i24: +; AVX: # %bb.0: +; AVX-NEXT: vpslld $8, %xmm1, %xmm1 +; AVX-NEXT: vpextrd $1, %xmm1, %eax +; AVX-NEXT: vpslld $8, %xmm0, %xmm0 +; AVX-NEXT: vpextrd $1, %xmm0, %ecx +; AVX-NEXT: xorl %edx, %edx +; AVX-NEXT: subl %eax, %ecx +; AVX-NEXT: cmovbl %edx, %ecx +; AVX-NEXT: vmovd %xmm1, %eax +; AVX-NEXT: vmovd %xmm0, %esi +; AVX-NEXT: subl %eax, %esi +; AVX-NEXT: cmovbl %edx, %esi +; AVX-NEXT: vmovd %esi, %xmm2 +; AVX-NEXT: 
vpinsrd $1, %ecx, %xmm2, %xmm2 +; AVX-NEXT: vpextrd $2, %xmm1, %eax +; AVX-NEXT: vpextrd $2, %xmm0, %ecx +; AVX-NEXT: subl %eax, %ecx +; AVX-NEXT: cmovbl %edx, %ecx +; AVX-NEXT: vpinsrd $2, %ecx, %xmm2, %xmm2 +; AVX-NEXT: vpextrd $3, %xmm1, %eax +; AVX-NEXT: vpextrd $3, %xmm0, %ecx +; AVX-NEXT: subl %eax, %ecx +; AVX-NEXT: cmovbl %edx, %ecx +; AVX-NEXT: vpinsrd $3, %ecx, %xmm2, %xmm0 +; AVX-NEXT: vpsrld $8, %xmm0, %xmm0 +; AVX-NEXT: retq + %z = call <4 x i24> @llvm.usub.sat.v4i24(<4 x i24> %x, <4 x i24> %y) + ret <4 x i24> %z +} + +define <2 x i128> @v2i128(<2 x i128> %x, <2 x i128> %y) nounwind { +; SSE-LABEL: v2i128: +; SSE: # %bb.0: +; SSE-NEXT: movq %rdi, %rax +; SSE-NEXT: xorl %edi, %edi +; SSE-NEXT: subq %r9, %rsi +; SSE-NEXT: sbbq {{[0-9]+}}(%rsp), %rdx +; SSE-NEXT: cmovbq %rdi, %rsi +; SSE-NEXT: cmovbq %rdi, %rdx +; SSE-NEXT: subq {{[0-9]+}}(%rsp), %rcx +; SSE-NEXT: sbbq {{[0-9]+}}(%rsp), %r8 +; SSE-NEXT: cmovbq %rdi, %r8 +; SSE-NEXT: cmovbq %rdi, %rcx +; SSE-NEXT: movq %r8, 24(%rax) +; SSE-NEXT: movq %rcx, 16(%rax) +; SSE-NEXT: movq %rdx, 8(%rax) +; SSE-NEXT: movq %rsi, (%rax) +; SSE-NEXT: retq +; +; AVX-LABEL: v2i128: +; AVX: # %bb.0: +; AVX-NEXT: movq %rdi, %rax +; AVX-NEXT: xorl %edi, %edi +; AVX-NEXT: subq %r9, %rsi +; AVX-NEXT: sbbq {{[0-9]+}}(%rsp), %rdx +; AVX-NEXT: cmovbq %rdi, %rsi +; AVX-NEXT: cmovbq %rdi, %rdx +; AVX-NEXT: subq {{[0-9]+}}(%rsp), %rcx +; AVX-NEXT: sbbq {{[0-9]+}}(%rsp), %r8 +; AVX-NEXT: cmovbq %rdi, %r8 +; AVX-NEXT: cmovbq %rdi, %rcx +; AVX-NEXT: movq %r8, 24(%rax) +; AVX-NEXT: movq %rcx, 16(%rax) +; AVX-NEXT: movq %rdx, 8(%rax) +; AVX-NEXT: movq %rsi, (%rax) +; AVX-NEXT: retq + %z = call <2 x i128> @llvm.usub.sat.v2i128(<2 x i128> %x, <2 x i128> %y) + ret <2 x i128> %z +}
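+
+; Reference only (not autogenerated, illustrative value names): the scalarized
+; lowering checked above follows the usual unsigned saturating-subtract
+; identity; for a single i32 lane it is equivalent to
+;   %s = sub i32 %a, %b
+;   %c = icmp uge i32 %a, %b
+;   %r = select i1 %c, i32 %s, i32 0
+; which is the per-lane sub followed by a cmovb of zero seen in the SSE/AVX
+; sequences above.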