      return SDValue();
  // TODO: We may be able to add support for other Ops (ADD/SUB + shifts).
+  bool IsShift = false;
  switch (Opcode) {
  default:
    return SDValue();
+  case ISD::SHL:
+  case ISD::SRL:
+  case ISD::SRA:
+    IsShift = true;
+    break;
  case ISD::AND:
  case ISD::XOR:
  case ISD::OR:
    // We expect the canonicalized RHS operand to be the constant.
    if (!isa<ConstantSDNode>(RHS))
      return SDValue();
+
+    // Extend shift amounts.
+    if (RHS.getValueSizeInBits() != VT.getScalarSizeInBits()) {
+      if (!IsShift)
+        return SDValue();
+      RHS = DAG.getZExtOrTrunc(RHS, DL, VT.getScalarType());
+    }
+
    LHSElts.push_back(LHS);
    RHSElts.push_back(RHS);
  }
+  // Limit to shifts by uniform immediates.
+  // TODO: Only accept vXi8/vXi64 special cases?
+  // TODO: Permit non-uniform XOP/AVX2/MULLO cases?
+  if (IsShift && any_of(RHSElts, [&](SDValue V) { return RHSElts[0] != V; }))
+    return SDValue();
+
  SDValue LHS = DAG.getBuildVector(VT, DL, LHSElts);
  SDValue RHS = DAG.getBuildVector(VT, DL, RHSElts);
  return DAG.getNode(Opcode, DL, VT, LHS, RHS);
; X64-NEXT: vpextrq $1, %xmm0, %rax
; X64-NEXT: movabsq $-6148914691236517205, %rcx # imm = 0xAAAAAAAAAAAAAAAB
; X64-NEXT: mulq %rcx
-; X64-NEXT: shrq %rdx
; X64-NEXT: vmovq %rdx, %xmm1
; X64-NEXT: vmovq %xmm0, %rax
; X64-NEXT: mulq %rcx
-; X64-NEXT: shrq %rdx
; X64-NEXT: vmovq %rdx, %xmm0
; X64-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; X64-NEXT: vpsrlq $1, %xmm0, %xmm0
; X64-NEXT: vprolq $57, %zmm0, %zmm0
; X64-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
; X64-NEXT: vzeroupper
; X64-NEXT: movabsq $-6148914691236517205, %rdi # imm = 0xAAAAAAAAAAAAAAAB
; X64-NEXT: movq %rcx, %rax
; X64-NEXT: mulq %rdi
-; X64-NEXT: shrq %rdx
; X64-NEXT: vmovq %rdx, %xmm1
; X64-NEXT: vmovq %xmm0, %rsi
; X64-NEXT: movq %rsi, %rax
; X64-NEXT: mulq %rdi
-; X64-NEXT: shrq %rdx
; X64-NEXT: vmovq %rdx, %xmm0
; X64-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; X64-NEXT: vpsrlq $1, %xmm0, %xmm0
; X64-NEXT: movabsq $-6180857105216966645, %rdi # imm = 0xAA392F35DC17F00B
; X64-NEXT: movq %rcx, %rax
; X64-NEXT: mulq %rdi
-; X64-NEXT: shrq $9, %rdx
; X64-NEXT: vmovq %rdx, %xmm1
; X64-NEXT: movq %rsi, %rax
; X64-NEXT: mulq %rdi
-; X64-NEXT: shrq $9, %rdx
; X64-NEXT: vmovq %rdx, %xmm2
; X64-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
+; X64-NEXT: vpsrlq $9, %xmm1, %xmm1
; X64-NEXT: vpsllq $56, %xmm0, %xmm0
; X64-NEXT: vpor %xmm1, %xmm0, %xmm0
; X64-NEXT: retq
; SSE2-NEXT: subq %rdx, %rcx
; SSE2-NEXT: shrq %rcx
; SSE2-NEXT: addq %rdx, %rcx
-; SSE2-NEXT: shrq $2, %rcx
; SSE2-NEXT: movq %rcx, %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
; SSE2-NEXT: movq %xmm0, %rcx
; SSE2-NEXT: subq %rdx, %rcx
; SSE2-NEXT: shrq %rcx
; SSE2-NEXT: addq %rdx, %rcx
-; SSE2-NEXT: shrq $2, %rcx
; SSE2-NEXT: movq %rcx, %xmm0
; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
+; SSE2-NEXT: psrlq $2, %xmm1
; SSE2-NEXT: movdqa %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE41-NEXT: subq %rdx, %rcx
; SSE41-NEXT: shrq %rcx
; SSE41-NEXT: addq %rdx, %rcx
-; SSE41-NEXT: shrq $2, %rcx
; SSE41-NEXT: movq %rcx, %xmm1
; SSE41-NEXT: movq %xmm0, %rcx
; SSE41-NEXT: movq %rcx, %rax
; SSE41-NEXT: subq %rdx, %rcx
; SSE41-NEXT: shrq %rcx
; SSE41-NEXT: addq %rdx, %rcx
-; SSE41-NEXT: shrq $2, %rcx
; SSE41-NEXT: movq %rcx, %xmm0
; SSE41-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE41-NEXT: psrlq $2, %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: test_div7_2i64:
; AVX-NEXT: subq %rdx, %rcx
; AVX-NEXT: shrq %rcx
; AVX-NEXT: addq %rdx, %rcx
-; AVX-NEXT: shrq $2, %rcx
; AVX-NEXT: vmovq %rcx, %xmm1
; AVX-NEXT: vmovq %xmm0, %rcx
; AVX-NEXT: movq %rcx, %rax
; AVX-NEXT: subq %rdx, %rcx
; AVX-NEXT: shrq %rcx
; AVX-NEXT: addq %rdx, %rcx
-; AVX-NEXT: shrq $2, %rcx
; AVX-NEXT: vmovq %rcx, %xmm0
; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX-NEXT: vpsrlq $2, %xmm0, %xmm0
; AVX-NEXT: retq
%res = udiv <2 x i64> %a, <i64 7, i64 7>
ret <2 x i64> %res
define <4 x i64> @test_div7_4i64(<4 x i64> %a) nounwind {
; AVX1-LABEL: test_div7_4i64:
; AVX1: # %bb.0:
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT: vpextrq $1, %xmm1, %rcx
+; AVX1-NEXT: vpextrq $1, %xmm0, %rcx
; AVX1-NEXT: movabsq $2635249153387078803, %rsi # imm = 0x2492492492492493
; AVX1-NEXT: movq %rcx, %rax
; AVX1-NEXT: mulq %rsi
; AVX1-NEXT: subq %rdx, %rcx
; AVX1-NEXT: shrq %rcx
; AVX1-NEXT: addq %rdx, %rcx
-; AVX1-NEXT: shrq $2, %rcx
-; AVX1-NEXT: vmovq %rcx, %xmm2
-; AVX1-NEXT: vmovq %xmm1, %rcx
+; AVX1-NEXT: vmovq %rcx, %xmm1
+; AVX1-NEXT: vmovq %xmm0, %rcx
; AVX1-NEXT: movq %rcx, %rax
; AVX1-NEXT: mulq %rsi
; AVX1-NEXT: subq %rdx, %rcx
; AVX1-NEXT: shrq %rcx
; AVX1-NEXT: addq %rdx, %rcx
-; AVX1-NEXT: shrq $2, %rcx
-; AVX1-NEXT: vmovq %rcx, %xmm1
-; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; AVX1-NEXT: vmovq %rcx, %xmm2
+; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
+; AVX1-NEXT: vpsrlq $2, %xmm1, %xmm1
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vpextrq $1, %xmm0, %rcx
; AVX1-NEXT: movq %rcx, %rax
; AVX1-NEXT: mulq %rsi
; AVX1-NEXT: subq %rdx, %rcx
; AVX1-NEXT: shrq %rcx
; AVX1-NEXT: addq %rdx, %rcx
-; AVX1-NEXT: shrq $2, %rcx
; AVX1-NEXT: vmovq %rcx, %xmm2
; AVX1-NEXT: vmovq %xmm0, %rcx
; AVX1-NEXT: movq %rcx, %rax
; AVX1-NEXT: subq %rdx, %rcx
; AVX1-NEXT: shrq %rcx
; AVX1-NEXT: addq %rdx, %rcx
-; AVX1-NEXT: shrq $2, %rcx
; AVX1-NEXT: vmovq %rcx, %xmm0
; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
-; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT: vpsrlq $2, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: test_div7_4i64:
; AVX2-NEXT: subq %rdx, %rcx
; AVX2-NEXT: shrq %rcx
; AVX2-NEXT: addq %rdx, %rcx
-; AVX2-NEXT: shrq $2, %rcx
; AVX2-NEXT: vmovq %rcx, %xmm2
; AVX2-NEXT: vmovq %xmm1, %rcx
; AVX2-NEXT: movq %rcx, %rax
; AVX2-NEXT: subq %rdx, %rcx
; AVX2-NEXT: shrq %rcx
; AVX2-NEXT: addq %rdx, %rcx
-; AVX2-NEXT: shrq $2, %rcx
; AVX2-NEXT: vmovq %rcx, %xmm1
; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
; AVX2-NEXT: vpextrq $1, %xmm0, %rcx
; AVX2-NEXT: subq %rdx, %rcx
; AVX2-NEXT: shrq %rcx
; AVX2-NEXT: addq %rdx, %rcx
-; AVX2-NEXT: shrq $2, %rcx
; AVX2-NEXT: vmovq %rcx, %xmm2
; AVX2-NEXT: vmovq %xmm0, %rcx
; AVX2-NEXT: movq %rcx, %rax
; AVX2-NEXT: subq %rdx, %rcx
; AVX2-NEXT: shrq %rcx
; AVX2-NEXT: addq %rdx, %rcx
-; AVX2-NEXT: shrq $2, %rcx
; AVX2-NEXT: vmovq %rcx, %xmm0
; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-NEXT: vpsrlq $2, %ymm0, %ymm0
; AVX2-NEXT: retq
%res = udiv <4 x i64> %a, <i64 7, i64 7, i64 7, i64 7>
ret <4 x i64> %res
; AVX-NEXT: subq %rdx, %rcx
; AVX-NEXT: shrq %rcx
; AVX-NEXT: addq %rdx, %rcx
-; AVX-NEXT: shrq $2, %rcx
; AVX-NEXT: vmovq %rcx, %xmm2
; AVX-NEXT: vmovq %xmm1, %rcx
; AVX-NEXT: movq %rcx, %rax
; AVX-NEXT: subq %rdx, %rcx
; AVX-NEXT: shrq %rcx
; AVX-NEXT: addq %rdx, %rcx
-; AVX-NEXT: shrq $2, %rcx
; AVX-NEXT: vmovq %rcx, %xmm1
; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
; AVX-NEXT: vextracti32x4 $2, %zmm0, %xmm2
; AVX-NEXT: subq %rdx, %rcx
; AVX-NEXT: shrq %rcx
; AVX-NEXT: addq %rdx, %rcx
-; AVX-NEXT: shrq $2, %rcx
; AVX-NEXT: vmovq %rcx, %xmm3
; AVX-NEXT: vmovq %xmm2, %rcx
; AVX-NEXT: movq %rcx, %rax
; AVX-NEXT: subq %rdx, %rcx
; AVX-NEXT: shrq %rcx
; AVX-NEXT: addq %rdx, %rcx
-; AVX-NEXT: shrq $2, %rcx
; AVX-NEXT: vmovq %rcx, %xmm2
; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
; AVX-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1
; AVX-NEXT: subq %rdx, %rcx
; AVX-NEXT: shrq %rcx
; AVX-NEXT: addq %rdx, %rcx
-; AVX-NEXT: shrq $2, %rcx
; AVX-NEXT: vmovq %rcx, %xmm3
; AVX-NEXT: vmovq %xmm2, %rcx
; AVX-NEXT: movq %rcx, %rax
; AVX-NEXT: subq %rdx, %rcx
; AVX-NEXT: shrq %rcx
; AVX-NEXT: addq %rdx, %rcx
-; AVX-NEXT: shrq $2, %rcx
; AVX-NEXT: vmovq %rcx, %xmm2
; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
; AVX-NEXT: vpextrq $1, %xmm0, %rcx
; AVX-NEXT: subq %rdx, %rcx
; AVX-NEXT: shrq %rcx
; AVX-NEXT: addq %rdx, %rcx
-; AVX-NEXT: shrq $2, %rcx
; AVX-NEXT: vmovq %rcx, %xmm3
; AVX-NEXT: vmovq %xmm0, %rcx
; AVX-NEXT: movq %rcx, %rax
; AVX-NEXT: subq %rdx, %rcx
; AVX-NEXT: shrq %rcx
; AVX-NEXT: addq %rdx, %rcx
-; AVX-NEXT: shrq $2, %rcx
; AVX-NEXT: vmovq %rcx, %xmm0
; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm3[0]
; AVX-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
; AVX-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX-NEXT: vpsrlq $2, %zmm0, %zmm0
; AVX-NEXT: retq
%res = udiv <8 x i64> %a, <i64 7, i64 7, i64 7, i64 7, i64 7, i64 7, i64 7, i64 7>
ret <8 x i64> %res
; SSE-NEXT: movzbl (%rdi), %eax
; SSE-NEXT: movq %rax, %rcx
; SSE-NEXT: shlq $62, %rcx
-; SSE-NEXT: sarq $63, %rcx
-; SSE-NEXT: movq %rcx, %xmm1
+; SSE-NEXT: movq %rcx, %xmm0
; SSE-NEXT: shlq $63, %rax
-; SSE-NEXT: sarq $63, %rax
-; SSE-NEXT: movq %rax, %xmm0
-; SSE-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE-NEXT: movq %rax, %xmm1
+; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
+; SSE-NEXT: psrad $31, %xmm1
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3]
; SSE-NEXT: retq
;
; AVX1-LABEL: load_sext_2i1_to_2i64:
; AVX1-NEXT: movzbl (%rdi), %eax
; AVX1-NEXT: movq %rax, %rcx
; AVX1-NEXT: shlq $62, %rcx
-; AVX1-NEXT: sarq $63, %rcx
; AVX1-NEXT: vmovq %rcx, %xmm0
; AVX1-NEXT: shlq $63, %rax
-; AVX1-NEXT: sarq $63, %rax
; AVX1-NEXT: vmovq %rax, %xmm1
; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: load_sext_2i1_to_2i64:
; AVX2-NEXT: movzbl (%rdi), %eax
; AVX2-NEXT: movq %rax, %rcx
; AVX2-NEXT: shlq $62, %rcx
-; AVX2-NEXT: sarq $63, %rcx
; AVX2-NEXT: vmovq %rcx, %xmm0
; AVX2-NEXT: shlq $63, %rax
-; AVX2-NEXT: sarq $63, %rax
; AVX2-NEXT: vmovq %rax, %xmm1
; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: load_sext_2i1_to_2i64:
; X32-SSE2-NEXT: movzbl (%eax), %eax
; X32-SSE2-NEXT: movl %eax, %ecx
; X32-SSE2-NEXT: shll $30, %ecx
-; X32-SSE2-NEXT: sarl $31, %ecx
; X32-SSE2-NEXT: movd %ecx, %xmm0
; X32-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,1,1]
; X32-SSE2-NEXT: shll $31, %eax
-; X32-SSE2-NEXT: sarl $31, %eax
; X32-SSE2-NEXT: movd %eax, %xmm0
; X32-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
; X32-SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; X32-SSE2-NEXT: psrad $31, %xmm0
; X32-SSE2-NEXT: retl
;
; X32-SSE41-LABEL: load_sext_2i1_to_2i64:
; X32-SSE41-NEXT: movzbl (%eax), %eax
; X32-SSE41-NEXT: movl %eax, %ecx
; X32-SSE41-NEXT: shll $31, %ecx
-; X32-SSE41-NEXT: sarl $31, %ecx
; X32-SSE41-NEXT: movd %ecx, %xmm0
; X32-SSE41-NEXT: pinsrd $1, %ecx, %xmm0
; X32-SSE41-NEXT: shll $30, %eax
-; X32-SSE41-NEXT: sarl $31, %eax
; X32-SSE41-NEXT: pinsrd $2, %eax, %xmm0
; X32-SSE41-NEXT: pinsrd $3, %eax, %xmm0
+; X32-SSE41-NEXT: psrad $31, %xmm0
; X32-SSE41-NEXT: retl
entry:
%X = load <2 x i1>, <2 x i1>* %ptr
; X32-SSE2-NEXT: movl (%eax), %eax
; X32-SSE2-NEXT: movl %eax, %ecx
; X32-SSE2-NEXT: shll $28, %ecx
-; X32-SSE2-NEXT: sarl $31, %ecx
; X32-SSE2-NEXT: movd %ecx, %xmm0
; X32-SSE2-NEXT: movl %eax, %ecx
; X32-SSE2-NEXT: shll $29, %ecx
-; X32-SSE2-NEXT: sarl $31, %ecx
; X32-SSE2-NEXT: movd %ecx, %xmm1
; X32-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; X32-SSE2-NEXT: movl %eax, %ecx
; X32-SSE2-NEXT: shll $30, %ecx
-; X32-SSE2-NEXT: sarl $31, %ecx
; X32-SSE2-NEXT: movd %ecx, %xmm2
; X32-SSE2-NEXT: shll $31, %eax
-; X32-SSE2-NEXT: sarl $31, %eax
; X32-SSE2-NEXT: movd %eax, %xmm0
; X32-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; X32-SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; X32-SSE2-NEXT: psrad $31, %xmm0
; X32-SSE2-NEXT: retl
;
; X32-SSE41-LABEL: load_sext_4i1_to_4i32:
; X32-SSE41-NEXT: movl (%eax), %eax
; X32-SSE41-NEXT: movl %eax, %ecx
; X32-SSE41-NEXT: shll $30, %ecx
-; X32-SSE41-NEXT: sarl $31, %ecx
; X32-SSE41-NEXT: movl %eax, %edx
; X32-SSE41-NEXT: shll $31, %edx
-; X32-SSE41-NEXT: sarl $31, %edx
; X32-SSE41-NEXT: movd %edx, %xmm0
; X32-SSE41-NEXT: pinsrd $1, %ecx, %xmm0
; X32-SSE41-NEXT: movl %eax, %ecx
; X32-SSE41-NEXT: shll $29, %ecx
-; X32-SSE41-NEXT: sarl $31, %ecx
; X32-SSE41-NEXT: pinsrd $2, %ecx, %xmm0
; X32-SSE41-NEXT: shll $28, %eax
-; X32-SSE41-NEXT: sarl $31, %eax
; X32-SSE41-NEXT: pinsrd $3, %eax, %xmm0
+; X32-SSE41-NEXT: psrad $31, %xmm0
; X32-SSE41-NEXT: retl
entry:
%X = load <4 x i1>, <4 x i1>* %ptr
; AVX2-NEXT: movl (%rdi), %eax
; AVX2-NEXT: movq %rax, %rcx
; AVX2-NEXT: shlq $60, %rcx
-; AVX2-NEXT: sarq $63, %rcx
; AVX2-NEXT: vmovq %rcx, %xmm0
; AVX2-NEXT: movq %rax, %rcx
; AVX2-NEXT: shlq $61, %rcx
-; AVX2-NEXT: sarq $63, %rcx
; AVX2-NEXT: vmovq %rcx, %xmm1
; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
; AVX2-NEXT: movq %rax, %rcx
; AVX2-NEXT: shlq $62, %rcx
-; AVX2-NEXT: sarq $63, %rcx
; AVX2-NEXT: vmovq %rcx, %xmm1
; AVX2-NEXT: shlq $63, %rax
-; AVX2-NEXT: sarq $63, %rax
; AVX2-NEXT: vmovq %rax, %xmm2
; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX2-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: load_sext_4i1_to_4i64:
; X32-SSE2-NEXT: movl 8(%eax), %eax
; X32-SSE2-NEXT: shldl $13, %edx, %eax
; X32-SSE2-NEXT: shll $15, %eax
-; X32-SSE2-NEXT: sarl $15, %eax
; X32-SSE2-NEXT: movd %eax, %xmm0
; X32-SSE2-NEXT: movl %edx, %eax
; X32-SSE2-NEXT: shll $13, %eax
-; X32-SSE2-NEXT: sarl $15, %eax
; X32-SSE2-NEXT: movd %eax, %xmm1
; X32-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; X32-SSE2-NEXT: shldl $15, %ecx, %edx
; X32-SSE2-NEXT: shll $15, %ecx
-; X32-SSE2-NEXT: sarl $15, %ecx
; X32-SSE2-NEXT: movd %ecx, %xmm0
; X32-SSE2-NEXT: shll $15, %edx
-; X32-SSE2-NEXT: sarl $15, %edx
; X32-SSE2-NEXT: movd %edx, %xmm2
; X32-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; X32-SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; X32-SSE2-NEXT: psrad $15, %xmm0
; X32-SSE2-NEXT: retl
;
; X32-SSE41-LABEL: sext_4i17_to_4i32:
; X32-SSE41-NEXT: shldl $13, %edx, %eax
; X32-SSE41-NEXT: shldl $15, %ecx, %edx
; X32-SSE41-NEXT: shll $15, %edx
-; X32-SSE41-NEXT: sarl $15, %edx
; X32-SSE41-NEXT: shll $15, %ecx
-; X32-SSE41-NEXT: sarl $15, %ecx
; X32-SSE41-NEXT: movd %ecx, %xmm0
; X32-SSE41-NEXT: pinsrd $1, %edx, %xmm0
; X32-SSE41-NEXT: shll $13, %esi
-; X32-SSE41-NEXT: sarl $15, %esi
; X32-SSE41-NEXT: pinsrd $2, %esi, %xmm0
; X32-SSE41-NEXT: shll $15, %eax
-; X32-SSE41-NEXT: sarl $15, %eax
; X32-SSE41-NEXT: pinsrd $3, %eax, %xmm0
+; X32-SSE41-NEXT: psrad $15, %xmm0
; X32-SSE41-NEXT: popl %esi
; X32-SSE41-NEXT: .cfi_def_cfa_offset 4
; X32-SSE41-NEXT: retl