From: Sanjay Patel Date: Wed, 12 Jun 2019 21:30:06 +0000 (+0000) Subject: [x86] add tests for vector shifts; NFC X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=d761cdb4033b5323dc3aa8dcaa3c7409c07761cf;p=llvm [x86] add tests for vector shifts; NFC git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@363203 91177308-0d34-0410-b5e6-96231b3b80d8 --- diff --git a/test/CodeGen/X86/vector-shift-lshr-128.ll b/test/CodeGen/X86/vector-shift-lshr-128.ll index 7ce33dcfe24..9e7cd41a6ad 100644 --- a/test/CodeGen/X86/vector-shift-lshr-128.ll +++ b/test/CodeGen/X86/vector-shift-lshr-128.ll @@ -1233,3 +1233,184 @@ define <16 x i8> @splatconstant_shift_v16i8(<16 x i8> %a) nounwind { %shift = lshr <16 x i8> %a, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3> ret <16 x i8> %shift } + +define <4 x i32> @vector_variable_shift_right(<4 x i1> %cond, <4 x i32> %x, <4 x i32> %y, <4 x i32> %z) nounwind { +; SSE2-LABEL: vector_variable_shift_right: +; SSE2: # %bb.0: +; SSE2-NEXT: pslld $31, %xmm0 +; SSE2-NEXT: psrad $31, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0] +; SSE2-NEXT: pand %xmm0, %xmm1 +; SSE2-NEXT: pandn %xmm2, %xmm0 +; SSE2-NEXT: por %xmm1, %xmm0 +; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[2,3,3,3,4,5,6,7] +; SSE2-NEXT: movdqa %xmm3, %xmm2 +; SSE2-NEXT: psrld %xmm1, %xmm2 +; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm0[0,1,1,1,4,5,6,7] +; SSE2-NEXT: movdqa %xmm3, %xmm1 +; SSE2-NEXT: psrld %xmm4, %xmm1 +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm0[2,3,3,3,4,5,6,7] +; SSE2-NEXT: movdqa %xmm3, %xmm4 +; SSE2-NEXT: psrld %xmm2, %xmm4 +; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,1,1,4,5,6,7] +; SSE2-NEXT: psrld %xmm0, %xmm3 +; SSE2-NEXT: punpckhqdq {{.*#+}} xmm3 = xmm3[1],xmm4[1] +; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm3[0,3] +; SSE2-NEXT: movaps %xmm1, %xmm0 +; SSE2-NEXT: retq +; +; SSE41-LABEL: vector_variable_shift_right: +; SSE41: # 
%bb.0: +; SSE41-NEXT: pslld $31, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] +; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0] +; SSE41-NEXT: blendvps %xmm0, %xmm1, %xmm2 +; SSE41-NEXT: pshuflw {{.*#+}} xmm0 = xmm2[2,3,3,3,4,5,6,7] +; SSE41-NEXT: movdqa %xmm3, %xmm1 +; SSE41-NEXT: psrld %xmm0, %xmm1 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1] +; SSE41-NEXT: pshuflw {{.*#+}} xmm4 = xmm0[2,3,3,3,4,5,6,7] +; SSE41-NEXT: movdqa %xmm3, %xmm5 +; SSE41-NEXT: psrld %xmm4, %xmm5 +; SSE41-NEXT: pblendw {{.*#+}} xmm5 = xmm1[0,1,2,3],xmm5[4,5,6,7] +; SSE41-NEXT: pshuflw {{.*#+}} xmm1 = xmm2[0,1,1,1,4,5,6,7] +; SSE41-NEXT: movdqa %xmm3, %xmm2 +; SSE41-NEXT: psrld %xmm1, %xmm2 +; SSE41-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,1,1,4,5,6,7] +; SSE41-NEXT: psrld %xmm0, %xmm3 +; SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm2[0,1,2,3],xmm3[4,5,6,7] +; SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1],xmm5[2,3],xmm3[4,5],xmm5[6,7] +; SSE41-NEXT: movdqa %xmm3, %xmm0 +; SSE41-NEXT: retq +; +; AVX1-LABEL: vector_variable_shift_right: +; AVX1: # %bb.0: +; AVX1-NEXT: vpslld $31, %xmm0, %xmm0 +; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,0,0,0] +; AVX1-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[0,0,0,0] +; AVX1-NEXT: vblendvps %xmm0, %xmm1, %xmm2, %xmm0 +; AVX1-NEXT: vpsrldq {{.*#+}} xmm1 = xmm0[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX1-NEXT: vpsrld %xmm1, %xmm3, %xmm1 +; AVX1-NEXT: vpsrlq $32, %xmm0, %xmm2 +; AVX1-NEXT: vpsrld %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm1[4,5,6,7] +; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; AVX1-NEXT: vpsrld %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero +; AVX1-NEXT: vpsrld %xmm0, %xmm3, %xmm0 +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4,5,6,7] +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] +; AVX1-NEXT: retq +; +; 
AVX2-LABEL: vector_variable_shift_right: +; AVX2: # %bb.0: +; AVX2-NEXT: vpslld $31, %xmm0, %xmm0 +; AVX2-NEXT: vbroadcastss %xmm1, %xmm1 +; AVX2-NEXT: vbroadcastss %xmm2, %xmm2 +; AVX2-NEXT: vblendvps %xmm0, %xmm1, %xmm2, %xmm0 +; AVX2-NEXT: vpsrlvd %xmm0, %xmm3, %xmm0 +; AVX2-NEXT: retq +; +; XOPAVX1-LABEL: vector_variable_shift_right: +; XOPAVX1: # %bb.0: +; XOPAVX1-NEXT: vpslld $31, %xmm0, %xmm0 +; XOPAVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,0,0,0] +; XOPAVX1-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[0,0,0,0] +; XOPAVX1-NEXT: vblendvps %xmm0, %xmm1, %xmm2, %xmm0 +; XOPAVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; XOPAVX1-NEXT: vpsubd %xmm0, %xmm1, %xmm0 +; XOPAVX1-NEXT: vpshld %xmm0, %xmm3, %xmm0 +; XOPAVX1-NEXT: retq +; +; XOPAVX2-LABEL: vector_variable_shift_right: +; XOPAVX2: # %bb.0: +; XOPAVX2-NEXT: vpslld $31, %xmm0, %xmm0 +; XOPAVX2-NEXT: vbroadcastss %xmm1, %xmm1 +; XOPAVX2-NEXT: vbroadcastss %xmm2, %xmm2 +; XOPAVX2-NEXT: vblendvps %xmm0, %xmm1, %xmm2, %xmm0 +; XOPAVX2-NEXT: vpsrlvd %xmm0, %xmm3, %xmm0 +; XOPAVX2-NEXT: retq +; +; AVX512DQ-LABEL: vector_variable_shift_right: +; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: vpslld $31, %xmm0, %xmm0 +; AVX512DQ-NEXT: vpmovd2m %zmm0, %k1 +; AVX512DQ-NEXT: vpbroadcastd %xmm1, %xmm0 +; AVX512DQ-NEXT: vpbroadcastd %xmm2, %xmm1 +; AVX512DQ-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1} +; AVX512DQ-NEXT: vpsrlvd %xmm1, %xmm3, %xmm0 +; AVX512DQ-NEXT: vzeroupper +; AVX512DQ-NEXT: retq +; +; AVX512BW-LABEL: vector_variable_shift_right: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vpslld $31, %xmm0, %xmm0 +; AVX512BW-NEXT: vptestmd %zmm0, %zmm0, %k1 +; AVX512BW-NEXT: vpbroadcastd %xmm1, %xmm0 +; AVX512BW-NEXT: vpbroadcastd %xmm2, %xmm1 +; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1} +; AVX512BW-NEXT: vpsrlvd %xmm1, %xmm3, %xmm0 +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq +; +; AVX512DQVL-LABEL: vector_variable_shift_right: +; AVX512DQVL: # %bb.0: +; AVX512DQVL-NEXT: vpslld $31, %xmm0, %xmm0 +; AVX512DQVL-NEXT: vpmovd2m %xmm0, %k1 +; 
AVX512DQVL-NEXT: vpbroadcastd %xmm2, %xmm0 +; AVX512DQVL-NEXT: vpbroadcastd %xmm1, %xmm0 {%k1} +; AVX512DQVL-NEXT: vpsrlvd %xmm0, %xmm3, %xmm0 +; AVX512DQVL-NEXT: retq +; +; AVX512BWVL-LABEL: vector_variable_shift_right: +; AVX512BWVL: # %bb.0: +; AVX512BWVL-NEXT: vpslld $31, %xmm0, %xmm0 +; AVX512BWVL-NEXT: vptestmd %xmm0, %xmm0, %k1 +; AVX512BWVL-NEXT: vpbroadcastd %xmm2, %xmm0 +; AVX512BWVL-NEXT: vpbroadcastd %xmm1, %xmm0 {%k1} +; AVX512BWVL-NEXT: vpsrlvd %xmm0, %xmm3, %xmm0 +; AVX512BWVL-NEXT: retq +; +; X32-SSE-LABEL: vector_variable_shift_right: +; X32-SSE: # %bb.0: +; X32-SSE-NEXT: pushl %ebp +; X32-SSE-NEXT: movl %esp, %ebp +; X32-SSE-NEXT: andl $-16, %esp +; X32-SSE-NEXT: subl $16, %esp +; X32-SSE-NEXT: pslld $31, %xmm0 +; X32-SSE-NEXT: psrad $31, %xmm0 +; X32-SSE-NEXT: movdqa 8(%ebp), %xmm3 +; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] +; X32-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0] +; X32-SSE-NEXT: pand %xmm0, %xmm1 +; X32-SSE-NEXT: pandn %xmm2, %xmm0 +; X32-SSE-NEXT: por %xmm1, %xmm0 +; X32-SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[2,3,3,3,4,5,6,7] +; X32-SSE-NEXT: movdqa %xmm3, %xmm2 +; X32-SSE-NEXT: psrld %xmm1, %xmm2 +; X32-SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm0[0,1,1,1,4,5,6,7] +; X32-SSE-NEXT: movdqa %xmm3, %xmm1 +; X32-SSE-NEXT: psrld %xmm4, %xmm1 +; X32-SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; X32-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; X32-SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm0[2,3,3,3,4,5,6,7] +; X32-SSE-NEXT: movdqa %xmm3, %xmm4 +; X32-SSE-NEXT: psrld %xmm2, %xmm4 +; X32-SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,1,1,4,5,6,7] +; X32-SSE-NEXT: psrld %xmm0, %xmm3 +; X32-SSE-NEXT: punpckhqdq {{.*#+}} xmm3 = xmm3[1],xmm4[1] +; X32-SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm3[0,3] +; X32-SSE-NEXT: movaps %xmm1, %xmm0 +; X32-SSE-NEXT: movl %ebp, %esp +; X32-SSE-NEXT: popl %ebp +; X32-SSE-NEXT: retl + %splat1 = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> zeroinitializer + %splat2 = shufflevector <4 x 
i32> %y, <4 x i32> undef, <4 x i32> zeroinitializer + %sel = select <4 x i1> %cond, <4 x i32> %splat1, <4 x i32> %splat2 + %sh = lshr <4 x i32> %z, %sel + ret <4 x i32> %sh +} diff --git a/test/CodeGen/X86/vector-shift-shl-128.ll b/test/CodeGen/X86/vector-shift-shl-128.ll index 6088c965c52..fed509a9eaa 100644 --- a/test/CodeGen/X86/vector-shift-shl-128.ll +++ b/test/CodeGen/X86/vector-shift-shl-128.ll @@ -1104,3 +1104,347 @@ define <16 x i8> @splatconstant_shift_v16i8(<16 x i8> %a) nounwind { %shift = shl <16 x i8> %a, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3> ret <16 x i8> %shift } + +; PR37428 + +define void @vector_variable_shift_left_loop(i32* nocapture %arr, i8* nocapture readonly %control, i32 %count, i32 %amt0, i32 %amt1, i32 %x) nounwind { +; SSE2-LABEL: vector_variable_shift_left_loop: +; SSE2: # %bb.0: # %entry +; SSE2-NEXT: testl %edx, %edx +; SSE2-NEXT: jle .LBB16_3 +; SSE2-NEXT: # %bb.1: # %vector.ph +; SSE2-NEXT: movl %edx, %eax +; SSE2-NEXT: andl $-4, %eax +; SSE2-NEXT: movd %ecx, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; SSE2-NEXT: movd %r8d, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] +; SSE2-NEXT: movd %r9d, %xmm2 +; SSE2-NEXT: xorl %ecx, %ecx +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0] +; SSE2-NEXT: pxor %xmm3, %xmm3 +; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [1065353216,1065353216,1065353216,1065353216] +; SSE2-NEXT: .p2align 4, 0x90 +; SSE2-NEXT: .LBB16_2: # %vector.body +; SSE2-NEXT: # =>This Inner Loop Header: Depth=1 +; SSE2-NEXT: movd {{.*#+}} xmm5 = mem[0],zero,zero,zero +; SSE2-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3],xmm5[4],xmm3[4],xmm5[5],xmm3[5],xmm5[6],xmm3[6],xmm5[7],xmm3[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3] +; SSE2-NEXT: pcmpeqd %xmm3, %xmm5 +; SSE2-NEXT: movdqa %xmm0, %xmm6 +; SSE2-NEXT: pand %xmm5, %xmm6 +; SSE2-NEXT: pandn %xmm1, %xmm5 +; SSE2-NEXT: por %xmm6, %xmm5 +; SSE2-NEXT: pslld $23, %xmm5 +; SSE2-NEXT: paddd 
%xmm4, %xmm5 +; SSE2-NEXT: cvttps2dq %xmm5, %xmm5 +; SSE2-NEXT: movdqa %xmm2, %xmm6 +; SSE2-NEXT: pmuludq %xmm5, %xmm6 +; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,2,2,3] +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] +; SSE2-NEXT: pmuludq %xmm2, %xmm5 +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,2,2,3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm5[0],xmm6[1],xmm5[1] +; SSE2-NEXT: movdqu %xmm6, (%rdi,%rcx,4) +; SSE2-NEXT: addq $4, %rcx +; SSE2-NEXT: cmpq %rcx, %rax +; SSE2-NEXT: jne .LBB16_2 +; SSE2-NEXT: .LBB16_3: # %exit +; SSE2-NEXT: retq +; +; SSE41-LABEL: vector_variable_shift_left_loop: +; SSE41: # %bb.0: # %entry +; SSE41-NEXT: testl %edx, %edx +; SSE41-NEXT: jle .LBB16_3 +; SSE41-NEXT: # %bb.1: # %vector.ph +; SSE41-NEXT: movl %edx, %eax +; SSE41-NEXT: andl $-4, %eax +; SSE41-NEXT: movd %ecx, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,0,0] +; SSE41-NEXT: movd %r8d, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,0,0,0] +; SSE41-NEXT: movd %r9d, %xmm0 +; SSE41-NEXT: xorl %ecx, %ecx +; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[0,0,0,0] +; SSE41-NEXT: pxor %xmm4, %xmm4 +; SSE41-NEXT: movdqa {{.*#+}} xmm5 = [1065353216,1065353216,1065353216,1065353216] +; SSE41-NEXT: .p2align 4, 0x90 +; SSE41-NEXT: .LBB16_2: # %vector.body +; SSE41-NEXT: # =>This Inner Loop Header: Depth=1 +; SSE41-NEXT: pmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; SSE41-NEXT: pcmpeqd %xmm4, %xmm0 +; SSE41-NEXT: movdqa %xmm2, %xmm6 +; SSE41-NEXT: blendvps %xmm0, %xmm1, %xmm6 +; SSE41-NEXT: pslld $23, %xmm6 +; SSE41-NEXT: paddd %xmm5, %xmm6 +; SSE41-NEXT: cvttps2dq %xmm6, %xmm0 +; SSE41-NEXT: pmulld %xmm3, %xmm0 +; SSE41-NEXT: movdqu %xmm0, (%rdi,%rcx,4) +; SSE41-NEXT: addq $4, %rcx +; SSE41-NEXT: cmpq %rcx, %rax +; SSE41-NEXT: jne .LBB16_2 +; SSE41-NEXT: .LBB16_3: # %exit +; SSE41-NEXT: retq +; +; AVX1-LABEL: vector_variable_shift_left_loop: +; AVX1: # %bb.0: # %entry +; AVX1-NEXT: testl %edx, %edx +; 
AVX1-NEXT: jle .LBB16_3 +; AVX1-NEXT: # %bb.1: # %vector.ph +; AVX1-NEXT: movl %edx, %eax +; AVX1-NEXT: andl $-4, %eax +; AVX1-NEXT: vmovd %ecx, %xmm0 +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; AVX1-NEXT: vmovd %r8d, %xmm1 +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] +; AVX1-NEXT: vmovd %r9d, %xmm2 +; AVX1-NEXT: xorl %ecx, %ecx +; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,0,0] +; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [1065353216,1065353216,1065353216,1065353216] +; AVX1-NEXT: .p2align 4, 0x90 +; AVX1-NEXT: .LBB16_2: # %vector.body +; AVX1-NEXT: # =>This Inner Loop Header: Depth=1 +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm5 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpcmpeqd %xmm3, %xmm5, %xmm5 +; AVX1-NEXT: vblendvps %xmm5, %xmm0, %xmm1, %xmm5 +; AVX1-NEXT: vpslld $23, %xmm5, %xmm5 +; AVX1-NEXT: vpaddd %xmm4, %xmm5, %xmm5 +; AVX1-NEXT: vcvttps2dq %xmm5, %xmm5 +; AVX1-NEXT: vpmulld %xmm5, %xmm2, %xmm5 +; AVX1-NEXT: vmovdqu %xmm5, (%rdi,%rcx,4) +; AVX1-NEXT: addq $4, %rcx +; AVX1-NEXT: cmpq %rcx, %rax +; AVX1-NEXT: jne .LBB16_2 +; AVX1-NEXT: .LBB16_3: # %exit +; AVX1-NEXT: retq +; +; AVX2-LABEL: vector_variable_shift_left_loop: +; AVX2: # %bb.0: # %entry +; AVX2-NEXT: testl %edx, %edx +; AVX2-NEXT: jle .LBB16_3 +; AVX2-NEXT: # %bb.1: # %vector.ph +; AVX2-NEXT: movl %edx, %eax +; AVX2-NEXT: andl $-4, %eax +; AVX2-NEXT: vmovd %ecx, %xmm0 +; AVX2-NEXT: vpbroadcastd %xmm0, %xmm0 +; AVX2-NEXT: vmovd %r8d, %xmm1 +; AVX2-NEXT: vpbroadcastd %xmm1, %xmm1 +; AVX2-NEXT: vmovd %r9d, %xmm2 +; AVX2-NEXT: vpbroadcastd %xmm2, %xmm2 +; AVX2-NEXT: xorl %ecx, %ecx +; AVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; AVX2-NEXT: .p2align 4, 0x90 +; AVX2-NEXT: .LBB16_2: # %vector.body +; AVX2-NEXT: # =>This Inner Loop Header: Depth=1 +; AVX2-NEXT: vpmovzxbd {{.*#+}} xmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX2-NEXT: vpcmpeqd 
%xmm3, %xmm4, %xmm4 +; AVX2-NEXT: vblendvps %xmm4, %xmm0, %xmm1, %xmm4 +; AVX2-NEXT: vpsllvd %xmm4, %xmm2, %xmm4 +; AVX2-NEXT: vmovdqu %xmm4, (%rdi,%rcx,4) +; AVX2-NEXT: addq $4, %rcx +; AVX2-NEXT: cmpq %rcx, %rax +; AVX2-NEXT: jne .LBB16_2 +; AVX2-NEXT: .LBB16_3: # %exit +; AVX2-NEXT: retq +; +; XOPAVX1-LABEL: vector_variable_shift_left_loop: +; XOPAVX1: # %bb.0: # %entry +; XOPAVX1-NEXT: testl %edx, %edx +; XOPAVX1-NEXT: jle .LBB16_3 +; XOPAVX1-NEXT: # %bb.1: # %vector.ph +; XOPAVX1-NEXT: movl %edx, %eax +; XOPAVX1-NEXT: andl $-4, %eax +; XOPAVX1-NEXT: vmovd %ecx, %xmm0 +; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; XOPAVX1-NEXT: vmovd %r8d, %xmm1 +; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] +; XOPAVX1-NEXT: vmovd %r9d, %xmm2 +; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,0,0] +; XOPAVX1-NEXT: xorl %ecx, %ecx +; XOPAVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; XOPAVX1-NEXT: .p2align 4, 0x90 +; XOPAVX1-NEXT: .LBB16_2: # %vector.body +; XOPAVX1-NEXT: # =>This Inner Loop Header: Depth=1 +; XOPAVX1-NEXT: vpmovzxbd {{.*#+}} xmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; XOPAVX1-NEXT: vpcomeqd %xmm3, %xmm4, %xmm4 +; XOPAVX1-NEXT: vblendvps %xmm4, %xmm0, %xmm1, %xmm4 +; XOPAVX1-NEXT: vpshld %xmm4, %xmm2, %xmm4 +; XOPAVX1-NEXT: vmovdqu %xmm4, (%rdi,%rcx,4) +; XOPAVX1-NEXT: addq $4, %rcx +; XOPAVX1-NEXT: cmpq %rcx, %rax +; XOPAVX1-NEXT: jne .LBB16_2 +; XOPAVX1-NEXT: .LBB16_3: # %exit +; XOPAVX1-NEXT: retq +; +; XOPAVX2-LABEL: vector_variable_shift_left_loop: +; XOPAVX2: # %bb.0: # %entry +; XOPAVX2-NEXT: testl %edx, %edx +; XOPAVX2-NEXT: jle .LBB16_3 +; XOPAVX2-NEXT: # %bb.1: # %vector.ph +; XOPAVX2-NEXT: movl %edx, %eax +; XOPAVX2-NEXT: andl $-4, %eax +; XOPAVX2-NEXT: vmovd %ecx, %xmm0 +; XOPAVX2-NEXT: vpbroadcastd %xmm0, %xmm0 +; XOPAVX2-NEXT: vmovd %r8d, %xmm1 +; XOPAVX2-NEXT: vpbroadcastd %xmm1, %xmm1 +; XOPAVX2-NEXT: vmovd %r9d, %xmm2 +; XOPAVX2-NEXT: vpbroadcastd %xmm2, %xmm2 +; XOPAVX2-NEXT: xorl 
%ecx, %ecx +; XOPAVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; XOPAVX2-NEXT: .p2align 4, 0x90 +; XOPAVX2-NEXT: .LBB16_2: # %vector.body +; XOPAVX2-NEXT: # =>This Inner Loop Header: Depth=1 +; XOPAVX2-NEXT: vpmovzxbd {{.*#+}} xmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; XOPAVX2-NEXT: vpcomeqd %xmm3, %xmm4, %xmm4 +; XOPAVX2-NEXT: vblendvps %xmm4, %xmm0, %xmm1, %xmm4 +; XOPAVX2-NEXT: vpsllvd %xmm4, %xmm2, %xmm4 +; XOPAVX2-NEXT: vmovdqu %xmm4, (%rdi,%rcx,4) +; XOPAVX2-NEXT: addq $4, %rcx +; XOPAVX2-NEXT: cmpq %rcx, %rax +; XOPAVX2-NEXT: jne .LBB16_2 +; XOPAVX2-NEXT: .LBB16_3: # %exit +; XOPAVX2-NEXT: retq +; +; AVX512-LABEL: vector_variable_shift_left_loop: +; AVX512: # %bb.0: # %entry +; AVX512-NEXT: testl %edx, %edx +; AVX512-NEXT: jle .LBB16_3 +; AVX512-NEXT: # %bb.1: # %vector.ph +; AVX512-NEXT: movl %edx, %eax +; AVX512-NEXT: andl $-4, %eax +; AVX512-NEXT: vmovd %ecx, %xmm0 +; AVX512-NEXT: vpbroadcastd %xmm0, %xmm0 +; AVX512-NEXT: vmovd %r8d, %xmm1 +; AVX512-NEXT: vpbroadcastd %xmm1, %xmm1 +; AVX512-NEXT: vmovd %r9d, %xmm2 +; AVX512-NEXT: vpbroadcastd %xmm2, %xmm2 +; AVX512-NEXT: xorl %ecx, %ecx +; AVX512-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; AVX512-NEXT: .p2align 4, 0x90 +; AVX512-NEXT: .LBB16_2: # %vector.body +; AVX512-NEXT: # =>This Inner Loop Header: Depth=1 +; AVX512-NEXT: vpmovzxbd {{.*#+}} xmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX512-NEXT: vpcmpeqd %xmm3, %xmm4, %xmm4 +; AVX512-NEXT: vblendvps %xmm4, %xmm0, %xmm1, %xmm4 +; AVX512-NEXT: vpsllvd %xmm4, %xmm2, %xmm4 +; AVX512-NEXT: vmovdqu %xmm4, (%rdi,%rcx,4) +; AVX512-NEXT: addq $4, %rcx +; AVX512-NEXT: cmpq %rcx, %rax +; AVX512-NEXT: jne .LBB16_2 +; AVX512-NEXT: .LBB16_3: # %exit +; AVX512-NEXT: retq +; +; AVX512VL-LABEL: vector_variable_shift_left_loop: +; AVX512VL: # %bb.0: # %entry +; AVX512VL-NEXT: testl %edx, %edx +; AVX512VL-NEXT: jle .LBB16_3 +; AVX512VL-NEXT: # %bb.1: # %vector.ph +; 
AVX512VL-NEXT: movl %edx, %eax +; AVX512VL-NEXT: andl $-4, %eax +; AVX512VL-NEXT: vpbroadcastd %ecx, %xmm0 +; AVX512VL-NEXT: vpbroadcastd %r8d, %xmm1 +; AVX512VL-NEXT: vpbroadcastd %r9d, %xmm2 +; AVX512VL-NEXT: xorl %ecx, %ecx +; AVX512VL-NEXT: .p2align 4, 0x90 +; AVX512VL-NEXT: .LBB16_2: # %vector.body +; AVX512VL-NEXT: # =>This Inner Loop Header: Depth=1 +; AVX512VL-NEXT: vpmovzxbd {{.*#+}} xmm3 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX512VL-NEXT: vptestnmd %xmm3, %xmm3, %k1 +; AVX512VL-NEXT: vpblendmd %xmm0, %xmm1, %xmm3 {%k1} +; AVX512VL-NEXT: vpsllvd %xmm3, %xmm2, %xmm3 +; AVX512VL-NEXT: vmovdqu %xmm3, (%rdi,%rcx,4) +; AVX512VL-NEXT: addq $4, %rcx +; AVX512VL-NEXT: cmpq %rcx, %rax +; AVX512VL-NEXT: jne .LBB16_2 +; AVX512VL-NEXT: .LBB16_3: # %exit +; AVX512VL-NEXT: retq +; +; X32-SSE-LABEL: vector_variable_shift_left_loop: +; X32-SSE: # %bb.0: # %entry +; X32-SSE-NEXT: pushl %ebx +; X32-SSE-NEXT: pushl %edi +; X32-SSE-NEXT: pushl %esi +; X32-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-SSE-NEXT: testl %eax, %eax +; X32-SSE-NEXT: jle .LBB16_3 +; X32-SSE-NEXT: # %bb.1: # %vector.ph +; X32-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X32-SSE-NEXT: movl {{[0-9]+}}(%esp), %edx +; X32-SSE-NEXT: andl $-4, %eax +; X32-SSE-NEXT: xorl %esi, %esi +; X32-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X32-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; X32-SSE-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] +; X32-SSE-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero +; X32-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0] +; X32-SSE-NEXT: pxor %xmm3, %xmm3 +; X32-SSE-NEXT: movdqa {{.*#+}} xmm4 = [1065353216,1065353216,1065353216,1065353216] +; X32-SSE-NEXT: xorl %edi, %edi +; X32-SSE-NEXT: .p2align 4, 0x90 +; X32-SSE-NEXT: .LBB16_2: # %vector.body +; X32-SSE-NEXT: # =>This Inner Loop Header: Depth=1 +; X32-SSE-NEXT: movd {{.*#+}} xmm5 = mem[0],zero,zero,zero +; 
X32-SSE-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3],xmm5[4],xmm3[4],xmm5[5],xmm3[5],xmm5[6],xmm3[6],xmm5[7],xmm3[7] +; X32-SSE-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3] +; X32-SSE-NEXT: pcmpeqd %xmm3, %xmm5 +; X32-SSE-NEXT: movdqa %xmm0, %xmm6 +; X32-SSE-NEXT: pand %xmm5, %xmm6 +; X32-SSE-NEXT: pandn %xmm1, %xmm5 +; X32-SSE-NEXT: por %xmm6, %xmm5 +; X32-SSE-NEXT: pslld $23, %xmm5 +; X32-SSE-NEXT: paddd %xmm4, %xmm5 +; X32-SSE-NEXT: cvttps2dq %xmm5, %xmm5 +; X32-SSE-NEXT: movdqa %xmm2, %xmm6 +; X32-SSE-NEXT: pmuludq %xmm5, %xmm6 +; X32-SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,2,2,3] +; X32-SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] +; X32-SSE-NEXT: pmuludq %xmm2, %xmm5 +; X32-SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,2,2,3] +; X32-SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm5[0],xmm6[1],xmm5[1] +; X32-SSE-NEXT: movdqu %xmm6, (%edx,%esi,4) +; X32-SSE-NEXT: addl $4, %esi +; X32-SSE-NEXT: adcl $0, %edi +; X32-SSE-NEXT: movl %esi, %ebx +; X32-SSE-NEXT: xorl %eax, %ebx +; X32-SSE-NEXT: orl %edi, %ebx +; X32-SSE-NEXT: jne .LBB16_2 +; X32-SSE-NEXT: .LBB16_3: # %exit +; X32-SSE-NEXT: popl %esi +; X32-SSE-NEXT: popl %edi +; X32-SSE-NEXT: popl %ebx +; X32-SSE-NEXT: retl +entry: + %cmp16 = icmp sgt i32 %count, 0 + %wide.trip.count = zext i32 %count to i64 + br i1 %cmp16, label %vector.ph, label %exit + +vector.ph: + %n.vec = and i64 %wide.trip.count, 4294967292 + %splatinsert18 = insertelement <4 x i32> undef, i32 %amt0, i32 0 + %splat1 = shufflevector <4 x i32> %splatinsert18, <4 x i32> undef, <4 x i32> zeroinitializer + %splatinsert20 = insertelement <4 x i32> undef, i32 %amt1, i32 0 + %splat2 = shufflevector <4 x i32> %splatinsert20, <4 x i32> undef, <4 x i32> zeroinitializer + %splatinsert22 = insertelement <4 x i32> undef, i32 %x, i32 0 + %splat3 = shufflevector <4 x i32> %splatinsert22, <4 x i32> undef, <4 x i32> zeroinitializer + br label %vector.body + 
+vector.body: + %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ] + %0 = getelementptr inbounds i8, i8* %control, i64 %index + %1 = bitcast i8* %0 to <4 x i8>* + %wide.load = load <4 x i8>, <4 x i8>* %1, align 1 + %2 = icmp eq <4 x i8> %wide.load, zeroinitializer + %3 = select <4 x i1> %2, <4 x i32> %splat1, <4 x i32> %splat2 + %4 = shl <4 x i32> %splat3, %3 + %5 = getelementptr inbounds i32, i32* %arr, i64 %index + %6 = bitcast i32* %5 to <4 x i32>* + store <4 x i32> %4, <4 x i32>* %6, align 4 + %index.next = add i64 %index, 4 + %7 = icmp eq i64 %index.next, %n.vec + br i1 %7, label %exit, label %vector.body + +exit: + ret void +} diff --git a/test/Transforms/CodeGenPrepare/X86/vec-shift.ll b/test/Transforms/CodeGenPrepare/X86/vec-shift.ll new file mode 100644 index 00000000000..098a65a397c --- /dev/null +++ b/test/Transforms/CodeGenPrepare/X86/vec-shift.ll @@ -0,0 +1,117 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt -codegenprepare -mtriple=x86_64-- -mattr=avx -S < %s | FileCheck %s --check-prefixes=ALL,AVX +; RUN: opt -codegenprepare -mtriple=x86_64-- -mattr=avx2 -S < %s | FileCheck %s --check-prefixes=ALL,AVX2 + +; PR37428 - https://bugs.llvm.org/show_bug.cgi?id=37428 + +define void @vector_variable_shift_left_loop(i32* nocapture %arr, i8* nocapture readonly %control, i32 %count, i32 %amt0, i32 %amt1, i32 %x) { +; AVX-LABEL: @vector_variable_shift_left_loop( +; AVX-NEXT: entry: +; AVX-NEXT: [[CMP16:%.*]] = icmp sgt i32 [[COUNT:%.*]], 0 +; AVX-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[COUNT]] to i64 +; AVX-NEXT: br i1 [[CMP16]], label [[VECTOR_PH:%.*]], label [[EXIT:%.*]] +; AVX: vector.ph: +; AVX-NEXT: [[N_VEC:%.*]] = and i64 [[WIDE_TRIP_COUNT]], 4294967292 +; AVX-NEXT: [[SPLATINSERT18:%.*]] = insertelement <4 x i32> undef, i32 [[AMT0:%.*]], i32 0 +; AVX-NEXT: [[SPLAT1:%.*]] = shufflevector <4 x i32> [[SPLATINSERT18]], <4 x i32> undef, <4 x i32> zeroinitializer +; AVX-NEXT: [[SPLATINSERT20:%.*]] = 
insertelement <4 x i32> undef, i32 [[AMT1:%.*]], i32 0 +; AVX-NEXT: [[SPLAT2:%.*]] = shufflevector <4 x i32> [[SPLATINSERT20]], <4 x i32> undef, <4 x i32> zeroinitializer +; AVX-NEXT: [[SPLATINSERT22:%.*]] = insertelement <4 x i32> undef, i32 [[X:%.*]], i32 0 +; AVX-NEXT: br label [[VECTOR_BODY:%.*]] +; AVX: vector.body: +; AVX-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; AVX-NEXT: [[TMP0:%.*]] = shufflevector <4 x i32> [[SPLATINSERT22]], <4 x i32> undef, <4 x i32> zeroinitializer +; AVX-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, i8* [[CONTROL:%.*]], i64 [[INDEX]] +; AVX-NEXT: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <4 x i8>* +; AVX-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i8>, <4 x i8>* [[TMP2]], align 1 +; AVX-NEXT: [[TMP3:%.*]] = icmp eq <4 x i8> [[WIDE_LOAD]], zeroinitializer +; AVX-NEXT: [[TMP4:%.*]] = select <4 x i1> [[TMP3]], <4 x i32> [[SPLAT1]], <4 x i32> [[SPLAT2]] +; AVX-NEXT: [[TMP5:%.*]] = shl <4 x i32> [[TMP0]], [[TMP4]] +; AVX-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[ARR:%.*]], i64 [[INDEX]] +; AVX-NEXT: [[TMP7:%.*]] = bitcast i32* [[TMP6]] to <4 x i32>* +; AVX-NEXT: store <4 x i32> [[TMP5]], <4 x i32>* [[TMP7]], align 4 +; AVX-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 4 +; AVX-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; AVX-NEXT: br i1 [[TMP8]], label [[EXIT]], label [[VECTOR_BODY]] +; AVX: exit: +; AVX-NEXT: ret void +; +; AVX2-LABEL: @vector_variable_shift_left_loop( +; AVX2-NEXT: entry: +; AVX2-NEXT: [[CMP16:%.*]] = icmp sgt i32 [[COUNT:%.*]], 0 +; AVX2-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[COUNT]] to i64 +; AVX2-NEXT: br i1 [[CMP16]], label [[VECTOR_PH:%.*]], label [[EXIT:%.*]] +; AVX2: vector.ph: +; AVX2-NEXT: [[N_VEC:%.*]] = and i64 [[WIDE_TRIP_COUNT]], 4294967292 +; AVX2-NEXT: [[SPLATINSERT18:%.*]] = insertelement <4 x i32> undef, i32 [[AMT0:%.*]], i32 0 +; AVX2-NEXT: [[SPLAT1:%.*]] = shufflevector <4 x i32> [[SPLATINSERT18]], <4 x i32> undef, <4 x i32> 
zeroinitializer +; AVX2-NEXT: [[SPLATINSERT20:%.*]] = insertelement <4 x i32> undef, i32 [[AMT1:%.*]], i32 0 +; AVX2-NEXT: [[SPLAT2:%.*]] = shufflevector <4 x i32> [[SPLATINSERT20]], <4 x i32> undef, <4 x i32> zeroinitializer +; AVX2-NEXT: [[SPLATINSERT22:%.*]] = insertelement <4 x i32> undef, i32 [[X:%.*]], i32 0 +; AVX2-NEXT: [[SPLAT3:%.*]] = shufflevector <4 x i32> [[SPLATINSERT22]], <4 x i32> undef, <4 x i32> zeroinitializer +; AVX2-NEXT: br label [[VECTOR_BODY:%.*]] +; AVX2: vector.body: +; AVX2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; AVX2-NEXT: [[TMP0:%.*]] = getelementptr inbounds i8, i8* [[CONTROL:%.*]], i64 [[INDEX]] +; AVX2-NEXT: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <4 x i8>* +; AVX2-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i8>, <4 x i8>* [[TMP1]], align 1 +; AVX2-NEXT: [[TMP2:%.*]] = icmp eq <4 x i8> [[WIDE_LOAD]], zeroinitializer +; AVX2-NEXT: [[TMP3:%.*]] = select <4 x i1> [[TMP2]], <4 x i32> [[SPLAT1]], <4 x i32> [[SPLAT2]] +; AVX2-NEXT: [[TMP4:%.*]] = shl <4 x i32> [[SPLAT3]], [[TMP3]] +; AVX2-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[ARR:%.*]], i64 [[INDEX]] +; AVX2-NEXT: [[TMP6:%.*]] = bitcast i32* [[TMP5]] to <4 x i32>* +; AVX2-NEXT: store <4 x i32> [[TMP4]], <4 x i32>* [[TMP6]], align 4 +; AVX2-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 4 +; AVX2-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; AVX2-NEXT: br i1 [[TMP7]], label [[EXIT]], label [[VECTOR_BODY]] +; AVX2: exit: +; AVX2-NEXT: ret void +; +entry: + %cmp16 = icmp sgt i32 %count, 0 + %wide.trip.count = zext i32 %count to i64 + br i1 %cmp16, label %vector.ph, label %exit + +vector.ph: + %n.vec = and i64 %wide.trip.count, 4294967292 + %splatinsert18 = insertelement <4 x i32> undef, i32 %amt0, i32 0 + %splat1 = shufflevector <4 x i32> %splatinsert18, <4 x i32> undef, <4 x i32> zeroinitializer + %splatinsert20 = insertelement <4 x i32> undef, i32 %amt1, i32 0 + %splat2 = shufflevector <4 x i32> %splatinsert20, <4 x 
i32> undef, <4 x i32> zeroinitializer + %splatinsert22 = insertelement <4 x i32> undef, i32 %x, i32 0 + %splat3 = shufflevector <4 x i32> %splatinsert22, <4 x i32> undef, <4 x i32> zeroinitializer + br label %vector.body + +vector.body: + %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ] + %0 = getelementptr inbounds i8, i8* %control, i64 %index + %1 = bitcast i8* %0 to <4 x i8>* + %wide.load = load <4 x i8>, <4 x i8>* %1, align 1 + %2 = icmp eq <4 x i8> %wide.load, zeroinitializer + %3 = select <4 x i1> %2, <4 x i32> %splat1, <4 x i32> %splat2 + %4 = shl <4 x i32> %splat3, %3 + %5 = getelementptr inbounds i32, i32* %arr, i64 %index + %6 = bitcast i32* %5 to <4 x i32>* + store <4 x i32> %4, <4 x i32>* %6, align 4 + %index.next = add i64 %index, 4 + %7 = icmp eq i64 %index.next, %n.vec + br i1 %7, label %exit, label %vector.body + +exit: + ret void +} + +define <4 x i32> @vector_variable_shift_right(<4 x i1> %cond, <4 x i32> %x, <4 x i32> %y, <4 x i32> %z) { +; ALL-LABEL: @vector_variable_shift_right( +; ALL-NEXT: [[SPLAT1:%.*]] = shufflevector <4 x i32> [[X:%.*]], <4 x i32> undef, <4 x i32> zeroinitializer +; ALL-NEXT: [[SPLAT2:%.*]] = shufflevector <4 x i32> [[Y:%.*]], <4 x i32> undef, <4 x i32> zeroinitializer +; ALL-NEXT: [[SEL:%.*]] = select <4 x i1> [[COND:%.*]], <4 x i32> [[SPLAT1]], <4 x i32> [[SPLAT2]] +; ALL-NEXT: [[SH:%.*]] = lshr <4 x i32> [[Z:%.*]], [[SEL]] +; ALL-NEXT: ret <4 x i32> [[SH]] +; + %splat1 = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> zeroinitializer + %splat2 = shufflevector <4 x i32> %y, <4 x i32> undef, <4 x i32> zeroinitializer + %sel = select <4 x i1> %cond, <4 x i32> %splat1, <4 x i32> %splat2 + %sh = lshr <4 x i32> %z, %sel + ret <4 x i32> %sh +}