From: Michael Zuckerman
Date: Mon, 21 Aug 2017 08:56:39 +0000 (+0000)
Subject: [InterLeaved] Adding lit test for future work interleaved load stride 3
X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=076fb389d7ace7e624facdd881b9299b07e49da5;p=llvm

[InterLeaved] Adding lit test for future work interleaved load stride 3

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@311320 91177308-0d34-0410-b5e6-96231b3b80d8
---

diff --git a/test/CodeGen/X86/x86-interleaved-access.ll b/test/CodeGen/X86/x86-interleaved-access.ll
index b2760f9ad82..2634e8abe26 100644
--- a/test/CodeGen/X86/x86-interleaved-access.ll
+++ b/test/CodeGen/X86/x86-interleaved-access.ll
@@ -973,3 +973,319 @@ define void @interleaved_store_vf8_i8_stride4(<8 x i8> %x1, <8 x i8> %x2, <8 x i
 store <32 x i8> %interleaved.vec, <32 x i8>* %p
 ret void
 }
+
+define <32 x i8> @interleaved_load_vf32_i8_stride3(<96 x i8>* %ptr){
+; AVX1-LABEL: interleaved_load_vf32_i8_stride3:
+; AVX1: # BB#0:
+; AVX1-NEXT: vmovdqa (%rdi), %ymm1
+; AVX1-NEXT: vmovdqa 32(%rdi), %ymm0
+; AVX1-NEXT: vmovdqa 64(%rdi), %ymm3
+; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4
+; AVX1-NEXT: vpshufb {{.*#+}} xmm2 = xmm4[u,u,u,u,u,u],zero,zero,zero,zero,zero,xmm4[1,4,7,10,13]
+; AVX1-NEXT: vpshufb {{.*#+}} xmm5 = xmm3[u,u,u,u,u,u,2,5,8,11,14],zero,zero,zero,zero,zero
+; AVX1-NEXT: vpor %xmm2, %xmm5, %xmm2
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2
+; AVX1-NEXT: vmovaps {{.*#+}} ymm8 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0]
+; AVX1-NEXT: vandnps %ymm2, %ymm8, %ymm10
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm5
+; AVX1-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,zero,zero,zero,xmm5[2,5,8,11,14,u,u,u,u,u]
+; AVX1-NEXT: vpshufb {{.*#+}} xmm7 = xmm1[0,3,6,9,12,15],zero,zero,zero,zero,zero,xmm1[u,u,u,u,u]
+; AVX1-NEXT: vpor %xmm6, %xmm7, %xmm6
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm9 = [0,1,2,3,4,5,6,7,8,9,10,128,128,128,128,128]
+; AVX1-NEXT: vpshufb %xmm9, %xmm6, %xmm6
+; AVX1-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[1,4,7,10,13]
+; AVX1-NEXT: vpor %xmm7, %xmm6, %xmm7
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm6
+; AVX1-NEXT: vpshufb {{.*#+}} xmm2 = xmm6[0,3,6,9,12,15,u,u,u,u,u,u,u,u,u,u]
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm7, %ymm2
+; AVX1-NEXT: vandps %ymm8, %ymm2, %ymm2
+; AVX1-NEXT: vorps %ymm10, %ymm2, %ymm8
+; AVX1-NEXT: vpshufb {{.*#+}} xmm7 = xmm4[u,u,u,u,u],zero,zero,zero,zero,zero,zero,xmm4[2,5,8,11,14]
+; AVX1-NEXT: vpshufb {{.*#+}} xmm2 = xmm3[u,u,u,u,u,0,3,6,9,12,15],zero,zero,zero,zero,zero
+; AVX1-NEXT: vpor %xmm7, %xmm2, %xmm2
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2
+; AVX1-NEXT: vmovaps {{.*#+}} ymm10 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0]
+; AVX1-NEXT: vandnps %ymm2, %ymm10, %ymm11
+; AVX1-NEXT: vpshufb {{.*#+}} xmm7 = xmm1[1,4,7,10,13],zero,zero,zero,zero,zero,zero,xmm1[u,u,u,u,u]
+; AVX1-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,zero,zero,xmm5[0,3,6,9,12,15,u,u,u,u,u]
+; AVX1-NEXT: vpor %xmm7, %xmm2, %xmm2
+; AVX1-NEXT: vpshufb %xmm9, %xmm2, %xmm2
+; AVX1-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[2,5,8,11,14]
+; AVX1-NEXT: vpor %xmm7, %xmm2, %xmm2
+; AVX1-NEXT: vpshufb {{.*#+}} xmm7 = xmm6[1,4,7,10,13,u,u,u,u,u,u,u,u,u,u,u]
+; AVX1-NEXT: vinsertf128 $1, %xmm7, %ymm2, %ymm2
+; AVX1-NEXT: vandps %ymm10, %ymm2, %ymm2
+; AVX1-NEXT: vorps %ymm11, %ymm2, %ymm2
+; AVX1-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,1,4,7,10,13],zero,zero,zero,zero,zero,zero
+; AVX1-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u,u],zero,zero,zero,zero,zero,xmm4[0,3,6,9,12,15]
+; AVX1-NEXT: vpor %xmm3, %xmm4, %xmm3
+; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3
+; AVX1-NEXT: vandnps %ymm3, %ymm10, %ymm3
+; AVX1-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,zero,zero,xmm5[1,4,7,10,13,u,u,u,u,u,u]
+; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[2,5,8,11,14],zero,zero,zero,zero,zero,xmm1[u,u,u,u,u,u]
+; AVX1-NEXT: vpor %xmm4, %xmm1, %xmm1
+; AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4
+; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm4[5,6,7]
+; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,3,6,9,12,15]
+; AVX1-NEXT: vpor %xmm0, %xmm1, %xmm0
+; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm6[2,5,8,11,14,u,u,u,u,u,u,u,u,u,u,u]
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT: vandps %ymm10, %ymm0, %ymm0
+; AVX1-NEXT: vorps %ymm3, %ymm0, %ymm0
+; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm1
+; AVX1-NEXT: vextractf128 $1, %ymm8, %xmm3
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
+; AVX1-NEXT: vpaddb %xmm4, %xmm1, %xmm1
+; AVX1-NEXT: vpaddb %xmm1, %xmm3, %xmm1
+; AVX1-NEXT: vpaddb %xmm0, %xmm2, %xmm0
+; AVX1-NEXT: vpaddb %xmm0, %xmm8, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: interleaved_load_vf32_i8_stride3:
+; AVX2: # BB#0:
+; AVX2-NEXT: vmovdqa (%rdi), %ymm1
+; AVX2-NEXT: vmovdqa 32(%rdi), %ymm2
+; AVX2-NEXT: vmovdqa 64(%rdi), %ymm0
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = <255,0,u,255,0,u,255,0,u,255,0,u,255,0,u,255,0,u,255,0,u,255,0,u,255,0,u,255,0,u,255,0>
+; AVX2-NEXT: vpblendvb %ymm3, %ymm1, %ymm2, %ymm3
+; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm4
+; AVX2-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,zero,zero,zero,xmm4[2,5,8,11,14],zero,zero,zero,zero,zero
+; AVX2-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[0,3,6,9,12,15],zero,zero,zero,zero,zero,xmm3[1,4,7,10,13]
+; AVX2-NEXT: vpor %xmm5, %xmm3, %xmm3
+; AVX2-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[0,3,6,9,12,15,u,u,u,u,u,u,u,u,u,u]
+; AVX2-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm4
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm3
+; AVX2-NEXT: vpshufb {{.*#+}} xmm5 = xmm3[u,u,u,u,u,u],zero,zero,zero,zero,zero,xmm3[1,4,7,10,13]
+; AVX2-NEXT: vpshufb {{.*#+}} xmm6 = xmm0[u,u,u,u,u,u,2,5,8,11,14],zero,zero,zero,zero,zero
+; AVX2-NEXT: vpor %xmm5, %xmm6, %xmm5
+; AVX2-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm6 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0]
+; AVX2-NEXT: vpblendvb %ymm6, %ymm4, %ymm5, %ymm4
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm5 =
+; AVX2-NEXT: vpblendvb %ymm5, %ymm1, %ymm2, %ymm5
+; AVX2-NEXT: vextracti128 $1, %ymm5, %xmm6
+; AVX2-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,zero,zero,zero,xmm6[0,3,6,9,12,15],zero,zero,zero,zero,zero
+; AVX2-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[1,4,7,10,13],zero,zero,zero,zero,zero,zero,xmm5[2,5,8,11,14]
+; AVX2-NEXT: vpor %xmm7, %xmm5, %xmm5
+; AVX2-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[1,4,7,10,13,u,u,u,u,u,u,u,u,u,u,u]
+; AVX2-NEXT: vinserti128 $1, %xmm6, %ymm5, %ymm5
+; AVX2-NEXT: vpshufb {{.*#+}} xmm6 = xmm3[u,u,u,u,u],zero,zero,zero,zero,zero,zero,xmm3[2,5,8,11,14]
+; AVX2-NEXT: vpshufb {{.*#+}} xmm7 = xmm0[u,u,u,u,u,0,3,6,9,12,15],zero,zero,zero,zero,zero
+; AVX2-NEXT: vpor %xmm6, %xmm7, %xmm6
+; AVX2-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0]
+; AVX2-NEXT: vpblendvb %ymm7, %ymm5, %ymm6, %ymm5
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm6 = <255,u,0,255,u,0,255,u,0,255,u,0,255,u,0,255,u,0,255,u,0,255,u,0,255,u,0,255,u,0,255,u>
+; AVX2-NEXT: vpblendvb %ymm6, %ymm2, %ymm1, %ymm1
+; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX2-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,zero,zero,xmm2[1,4,7,10,13],zero,zero,zero,zero,zero,zero
+; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[2,5,8,11,14],zero,zero,zero,zero,zero,xmm1[0,3,6,9,12,15]
+; AVX2-NEXT: vpor %xmm6, %xmm1, %xmm1
+; AVX2-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[2,5,8,11,14,u,u,u,u,u,u,u,u,u,u,u]
+; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
+; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,1,4,7,10,13],zero,zero,zero,zero,zero,zero
+; AVX2-NEXT: vpshufb {{.*#+}} xmm2 = xmm3[u,u,u,u,u],zero,zero,zero,zero,zero,xmm3[0,3,6,9,12,15]
+; AVX2-NEXT: vpor %xmm0, %xmm2, %xmm0
+; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
+; AVX2-NEXT: vpblendvb %ymm7, %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: vpaddb %ymm0, %ymm5, %ymm0
+; AVX2-NEXT: vpaddb %ymm0, %ymm4, %ymm0
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: interleaved_load_vf32_i8_stride3:
+; AVX512: # BB#0:
+; AVX512-NEXT: vmovdqa64 (%rdi), %zmm1
+; AVX512-NEXT: vmovdqa64 64(%rdi), %zmm0
+; AVX512-NEXT: vmovdqa {{.*#+}} ymm3 = <255,0,u,255,0,u,255,0,u,255,0,u,255,0,u,255,0,u,255,0,u,255,0,u,255,0,u,255,0,u,255,0>
+; AVX512-NEXT: vextracti64x4 $1, %zmm1, %ymm2
+; AVX512-NEXT: vpblendvb %ymm3, %ymm1, %ymm2, %ymm3
+; AVX512-NEXT: vextracti128 $1, %ymm3, %xmm4
+; AVX512-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,zero,zero,zero,xmm4[2,5,8,11,14],zero,zero,zero,zero,zero
+; AVX512-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[0,3,6,9,12,15],zero,zero,zero,zero,zero,xmm3[1,4,7,10,13]
+; AVX512-NEXT: vpor %xmm5, %xmm3, %xmm3
+; AVX512-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[0,3,6,9,12,15,u,u,u,u,u,u,u,u,u,u]
+; AVX512-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm4
+; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm3
+; AVX512-NEXT: vpshufb {{.*#+}} xmm5 = xmm3[u,u,u,u,u,u],zero,zero,zero,zero,zero,xmm3[1,4,7,10,13]
+; AVX512-NEXT: vpshufb {{.*#+}} xmm6 = xmm0[u,u,u,u,u,u,2,5,8,11,14],zero,zero,zero,zero,zero
+; AVX512-NEXT: vpor %xmm5, %xmm6, %xmm5
+; AVX512-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5
+; AVX512-NEXT: vmovdqa {{.*#+}} ymm6 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0]
+; AVX512-NEXT: vpblendvb %ymm6, %ymm4, %ymm5, %ymm4
+; AVX512-NEXT: vmovdqa {{.*#+}} ymm5 =
+; AVX512-NEXT: vpblendvb %ymm5, %ymm1, %ymm2, %ymm5
+; AVX512-NEXT: vextracti128 $1, %ymm5, %xmm6
+; AVX512-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,zero,zero,zero,xmm6[0,3,6,9,12,15],zero,zero,zero,zero,zero
+; AVX512-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[1,4,7,10,13],zero,zero,zero,zero,zero,zero,xmm5[2,5,8,11,14]
+; AVX512-NEXT: vpor %xmm7, %xmm5, %xmm5
+; AVX512-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[1,4,7,10,13,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512-NEXT: vinserti128 $1, %xmm6, %ymm5, %ymm5
+; AVX512-NEXT: vpshufb {{.*#+}} xmm6 = xmm3[u,u,u,u,u],zero,zero,zero,zero,zero,zero,xmm3[2,5,8,11,14]
+; AVX512-NEXT: vpshufb {{.*#+}} xmm7 = xmm0[u,u,u,u,u,0,3,6,9,12,15],zero,zero,zero,zero,zero
+; AVX512-NEXT: vpor %xmm6, %xmm7, %xmm6
+; AVX512-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6
+; AVX512-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0]
+; AVX512-NEXT: vpblendvb %ymm7, %ymm5, %ymm6, %ymm5
+; AVX512-NEXT: vmovdqa {{.*#+}} ymm6 = <255,u,0,255,u,0,255,u,0,255,u,0,255,u,0,255,u,0,255,u,0,255,u,0,255,u,0,255,u,0,255,u>
+; AVX512-NEXT: vpblendvb %ymm6, %ymm2, %ymm1, %ymm1
+; AVX512-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX512-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,zero,zero,xmm2[1,4,7,10,13],zero,zero,zero,zero,zero,zero
+; AVX512-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[2,5,8,11,14],zero,zero,zero,zero,zero,xmm1[0,3,6,9,12,15]
+; AVX512-NEXT: vpor %xmm6, %xmm1, %xmm1
+; AVX512-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[2,5,8,11,14,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
+; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,1,4,7,10,13],zero,zero,zero,zero,zero,zero
+; AVX512-NEXT: vpshufb {{.*#+}} xmm2 = xmm3[u,u,u,u,u],zero,zero,zero,zero,zero,xmm3[0,3,6,9,12,15]
+; AVX512-NEXT: vpor %xmm0, %xmm2, %xmm0
+; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
+; AVX512-NEXT: vpblendvb %ymm7, %ymm1, %ymm0, %ymm0
+; AVX512-NEXT: vpaddb %ymm0, %ymm5, %ymm0
+; AVX512-NEXT: vpaddb %ymm0, %ymm4, %ymm0
+; AVX512-NEXT: retq
+ %wide.vec = load <96 x i8>, <96 x i8>* %ptr
+ %v1 = shufflevector <96 x i8> %wide.vec, <96 x i8> undef,<32 x i32>
+ %v2 = shufflevector <96 x i8> %wide.vec, <96 x i8> undef,<32 x i32>
+ %v3 = shufflevector <96 x i8> %wide.vec, <96 x i8> undef,<32 x i32>
+ %add1 = add <32 x i8> %v1, %v2
+ %add2 = add <32 x i8> %v3, %add1
+ ret <32 x i8> %add2
+}
+
+define <16 x i8> @interleaved_load_vf16_i8_stride3(<48 x i8>* %ptr){
+; AVX1-LABEL: interleaved_load_vf16_i8_stride3:
+; AVX1: # BB#0:
+; AVX1-NEXT: vmovdqa (%rdi), %ymm0
+; AVX1-NEXT: vmovdqa 32(%rdi), %ymm1
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,zero,zero,zero,xmm2[2,5,8,11,14,u,u,u,u,u]
+; AVX1-NEXT: vpshufb {{.*#+}} xmm4 = xmm0[0,3,6,9,12,15],zero,zero,zero,zero,zero,xmm0[u,u,u,u,u]
+; AVX1-NEXT: vpor %xmm3, %xmm4, %xmm3
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,2,3,4,5,6,7,8,9,10,128,128,128,128,128]
+; AVX1-NEXT: vpshufb %xmm4, %xmm3, %xmm3
+; AVX1-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[1,4,7,10,13]
+; AVX1-NEXT: vpor %xmm5, %xmm3, %xmm3
+; AVX1-NEXT: vpshufb {{.*#+}} xmm5 = xmm0[1,4,7,10,13],zero,zero,zero,zero,zero,zero,xmm0[u,u,u,u,u]
+; AVX1-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,zero,zero,xmm2[0,3,6,9,12,15,u,u,u,u,u]
+; AVX1-NEXT: vpor %xmm5, %xmm6, %xmm5
+; AVX1-NEXT: vpshufb %xmm4, %xmm5, %xmm4
+; AVX1-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[2,5,8,11,14]
+; AVX1-NEXT: vpor %xmm5, %xmm4, %xmm4
+; AVX1-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,zero,zero,xmm2[1,4,7,10,13,u,u,u,u,u,u]
+; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,5,8,11,14],zero,zero,zero,zero,zero,xmm0[u,u,u,u,u,u]
+; AVX1-NEXT: vpor %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm2[5,6,7]
+; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,3,6,9,12,15]
+; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpaddb %xmm0, %xmm4, %xmm0
+; AVX1-NEXT: vpaddb %xmm0, %xmm3, %xmm0
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: interleaved_load_vf16_i8_stride3:
+; AVX2: # BB#0:
+; AVX2-NEXT: vmovdqa (%rdi), %ymm0
+; AVX2-NEXT: vmovdqa 32(%rdi), %ymm1
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = <255,0,u,255,0,u,255,0,u,255,0,u,255,0,u,255,u,u,255,u,u,255,u,u,255,u,u,255,u,u,255,u>
+; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm2
+; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm3
+; AVX2-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,zero,zero,zero,xmm3[2,5,8,11,14],zero,zero,zero,zero,zero
+; AVX2-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,3,6,9,12,15],zero,zero,zero,zero,zero,xmm2[1,4,7,10,13]
+; AVX2-NEXT: vpor %xmm3, %xmm2, %xmm2
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 =
+; AVX2-NEXT: vpblendvb %ymm3, %ymm0, %ymm1, %ymm3
+; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm4
+; AVX2-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,zero,zero,xmm4[0,3,6,9,12,15],zero,zero,zero,zero,zero
+; AVX2-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[1,4,7,10,13],zero,zero,zero,zero,zero,zero,xmm3[2,5,8,11,14]
+; AVX2-NEXT: vpor %xmm4, %xmm3, %xmm3
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm4 = <0,u,255,0,u,255,0,u,255,0,u,255,0,u,255,0,u,255,u,u,255,u,u,255,u,u,255,u,u,255,u,u>
+; AVX2-NEXT: vpblendvb %ymm4, %ymm0, %ymm1, %ymm0
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,zero,zero,xmm1[1,4,7,10,13],zero,zero,zero,zero,zero,zero
+; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,5,8,11,14],zero,zero,zero,zero,zero,xmm0[0,3,6,9,12,15]
+; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpaddb %xmm0, %xmm3, %xmm0
+; AVX2-NEXT: vpaddb %xmm0, %xmm2, %xmm0
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: interleaved_load_vf16_i8_stride3:
+; AVX512: # BB#0:
+; AVX512-NEXT: vmovdqa64 (%rdi), %zmm0
+; AVX512-NEXT: vmovdqa {{.*#+}} ymm1 = <255,0,u,255,0,u,255,0,u,255,0,u,255,0,u,255,u,u,255,u,u,255,u,u,255,u,u,255,u,u,255,u>
+; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm2
+; AVX512-NEXT: vpblendvb %ymm1, %ymm0, %ymm2, %ymm1
+; AVX512-NEXT: vextracti128 $1, %ymm1, %xmm3
+; AVX512-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,zero,zero,zero,xmm3[2,5,8,11,14],zero,zero,zero,zero,zero
+; AVX512-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,3,6,9,12,15],zero,zero,zero,zero,zero,xmm1[1,4,7,10,13]
+; AVX512-NEXT: vpor %xmm3, %xmm1, %xmm1
+; AVX512-NEXT: vmovdqa {{.*#+}} ymm3 =
+; AVX512-NEXT: vpblendvb %ymm3, %ymm0, %ymm2, %ymm3
+; AVX512-NEXT: vextracti128 $1, %ymm3, %xmm4
+; AVX512-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,zero,zero,xmm4[0,3,6,9,12,15],zero,zero,zero,zero,zero
+; AVX512-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[1,4,7,10,13],zero,zero,zero,zero,zero,zero,xmm3[2,5,8,11,14]
+; AVX512-NEXT: vpor %xmm4, %xmm3, %xmm3
+; AVX512-NEXT: vmovdqa {{.*#+}} ymm4 = <0,u,255,0,u,255,0,u,255,0,u,255,0,u,255,0,u,255,u,u,255,u,u,255,u,u,255,u,u,255,u,u>
+; AVX512-NEXT: vpblendvb %ymm4, %ymm0, %ymm2, %ymm0
+; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm2
+; AVX512-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,zero,zero,xmm2[1,4,7,10,13],zero,zero,zero,zero,zero,zero
+; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,5,8,11,14],zero,zero,zero,zero,zero,xmm0[0,3,6,9,12,15]
+; AVX512-NEXT: vpor %xmm2, %xmm0, %xmm0
+; AVX512-NEXT: vpaddb %xmm0, %xmm3, %xmm0
+; AVX512-NEXT: vpaddb %xmm0, %xmm1, %xmm0
+; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: retq
+ %wide.vec = load <48 x i8>, <48 x i8>* %ptr
+ %v1 = shufflevector <48 x i8> %wide.vec, <48 x i8> undef,<16 x i32>
+ %v2 = shufflevector <48 x i8> %wide.vec, <48 x i8> undef,<16 x i32>
+ %v3 = shufflevector <48 x i8> %wide.vec, <48 x i8> undef,<16 x i32>
+ %add1 = add <16 x i8> %v1, %v2
+ %add2 = add <16 x i8> %v3, %add1
+ ret <16 x i8> %add2
+}
+
+define <8 x i8> @interleaved_load_vf8_i8_stride3(<24 x i8>* %ptr){
+; AVX1-LABEL: interleaved_load_vf8_i8_stride3:
+; AVX1: # BB#0:
+; AVX1-NEXT: vmovdqa (%rdi), %ymm0
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT: vpshufb {{.*#+}} xmm2 = zero,xmm1[u],zero,xmm1[u],zero,xmm1[u],zero,xmm1[u],zero,xmm1[u],zero,xmm1[u,2,u,5,u]
+; AVX1-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[0,u,3,u,6,u,9,u,12,u,15,u],zero,xmm0[u],zero,xmm0[u]
+; AVX1-NEXT: vpor %xmm2, %xmm3, %xmm2
+; AVX1-NEXT: vpshufb {{.*#+}} xmm3 = zero,xmm1[u],zero,xmm1[u],zero,xmm1[u],zero,xmm1[u],zero,xmm1[u,0,u,3,u,6,u]
+; AVX1-NEXT: vpshufb {{.*#+}} xmm4 = xmm0[1,u,4,u,7,u,10,u,13,u],zero,xmm0[u],zero,xmm0[u],zero,xmm0[u]
+; AVX1-NEXT: vpor %xmm3, %xmm4, %xmm3
+; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = zero,xmm1[u],zero,xmm1[u],zero,xmm1[u],zero,xmm1[u],zero,xmm1[u,1,u,4,u,7,u]
+; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,u,5,u,8,u,11,u,14,u],zero,xmm0[u],zero,xmm0[u],zero,xmm0[u]
+; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpaddw %xmm0, %xmm3, %xmm0
+; AVX1-NEXT: vpaddw %xmm0, %xmm2, %xmm0
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+;
+; AVX-LABEL: interleaved_load_vf8_i8_stride3:
+; AVX: # BB#0:
+; AVX-NEXT: vmovdqa (%rdi), %ymm0
+; AVX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX-NEXT: vpshufb {{.*#+}} xmm2 = zero,xmm1[u],zero,xmm1[u],zero,xmm1[u],zero,xmm1[u],zero,xmm1[u],zero,xmm1[u,2,u,5,u]
+; AVX-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[0,u,3,u,6,u,9,u,12,u,15,u],zero,xmm0[u],zero,xmm0[u]
+; AVX-NEXT: vpor %xmm2, %xmm3, %xmm2
+; AVX-NEXT: vpshufb {{.*#+}} xmm3 = zero,xmm1[u],zero,xmm1[u],zero,xmm1[u],zero,xmm1[u],zero,xmm1[u,0,u,3,u,6,u]
+; AVX-NEXT: vpshufb {{.*#+}} xmm4 = xmm0[1,u,4,u,7,u,10,u,13,u],zero,xmm0[u],zero,xmm0[u],zero,xmm0[u]
+; AVX-NEXT: vpor %xmm3, %xmm4, %xmm3
+; AVX-NEXT: vpshufb {{.*#+}} xmm1 = zero,xmm1[u],zero,xmm1[u],zero,xmm1[u],zero,xmm1[u],zero,xmm1[u,1,u,4,u,7,u]
+; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,u,5,u,8,u,11,u,14,u],zero,xmm0[u],zero,xmm0[u],zero,xmm0[u]
+; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vpaddw %xmm0, %xmm3, %xmm0
+; AVX-NEXT: vpaddw %xmm0, %xmm2, %xmm0
+; AVX-NEXT: vzeroupper
+; AVX-NEXT: retq
+ %wide.vec = load <24 x i8>, <24 x i8>* %ptr
+ %v1 = shufflevector <24 x i8> %wide.vec, <24 x i8> undef,<8 x i32>
+ %v2 = shufflevector <24 x i8> %wide.vec, <24 x i8> undef,<8 x i32>
+ %v3 = shufflevector <24 x i8> %wide.vec, <24 x i8> undef,<8 x i32>
+ %add1 = add <8 x i8> %v1, %v2
+ %add2 = add <8 x i8> %v3, %add1
+ ret <8 x i8> %add2
+}
diff --git a/test/Transforms/InterleavedAccess/X86/interleavedLoad.ll b/test/Transforms/InterleavedAccess/X86/interleavedLoad.ll
new file mode 100644
index 00000000000..816c949c76f
--- /dev/null
+++ b/test/Transforms/InterleavedAccess/X86/interleavedLoad.ll
@@ -0,0 +1,60 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -mtriple=x86_64-pc-linux -mattr=+avx2 -interleaved-access -S | FileCheck %s
+
+
+define <32 x i8> @interleaved_load_vf32_i8_stride3(<96 x i8>* %ptr){
+; CHECK-LABEL: @interleaved_load_vf32_i8_stride3(
+; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <96 x i8>, <96 x i8>* [[PTR:%.*]]
+; CHECK-NEXT: [[V1:%.*]] = shufflevector <96 x i8> [[WIDE_VEC]], <96 x i8> undef, <32 x i32>
+; CHECK-NEXT: [[V2:%.*]] = shufflevector <96 x i8> [[WIDE_VEC]], <96 x i8> undef, <32 x i32>
+; CHECK-NEXT: [[V3:%.*]] = shufflevector <96 x i8> [[WIDE_VEC]], <96 x i8> undef, <32 x i32>
+; CHECK-NEXT: [[ADD1:%.*]] = add <32 x i8> [[V1]], [[V2]]
+; CHECK-NEXT: [[ADD2:%.*]] = add <32 x i8> [[V3]], [[ADD1]]
+; CHECK-NEXT: ret <32 x i8> [[ADD2]]
+;
+ %wide.vec = load <96 x i8>, <96 x i8>* %ptr
+ %v1 = shufflevector <96 x i8> %wide.vec, <96 x i8> undef,<32 x i32>
+ %v2 = shufflevector <96 x i8> %wide.vec, <96 x i8> undef,<32 x i32>
+ %v3 = shufflevector <96 x i8> %wide.vec, <96 x i8> undef,<32 x i32>
+ %add1 = add <32 x i8> %v1, %v2
+ %add2 = add <32 x i8> %v3, %add1
+ ret <32 x i8> %add2
+}
+
+define <16 x i8> @interleaved_load_vf16_i8_stride3(<48 x i8>* %ptr){
+; CHECK-LABEL: @interleaved_load_vf16_i8_stride3(
+; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <48 x i8>, <48 x i8>* [[PTR:%.*]]
+; CHECK-NEXT: [[V1:%.*]] = shufflevector <48 x i8> [[WIDE_VEC]], <48 x i8> undef, <16 x i32>
+; CHECK-NEXT: [[V2:%.*]] = shufflevector <48 x i8> [[WIDE_VEC]], <48 x i8> undef, <16 x i32>
+; CHECK-NEXT: [[V3:%.*]] = shufflevector <48 x i8> [[WIDE_VEC]], <48 x i8> undef, <16 x i32>
+; CHECK-NEXT: [[ADD1:%.*]] = add <16 x i8> [[V1]], [[V2]]
+; CHECK-NEXT: [[ADD2:%.*]] = add <16 x i8> [[V3]], [[ADD1]]
+; CHECK-NEXT: ret <16 x i8> [[ADD2]]
+;
+ %wide.vec = load <48 x i8>, <48 x i8>* %ptr
+ %v1 = shufflevector <48 x i8> %wide.vec, <48 x i8> undef,<16 x i32>
+ %v2 = shufflevector <48 x i8> %wide.vec, <48 x i8> undef,<16 x i32>
+ %v3 = shufflevector <48 x i8> %wide.vec, <48 x i8> undef,<16 x i32>
+ %add1 = add <16 x i8> %v1, %v2
+ %add2 = add <16 x i8> %v3, %add1
+ ret <16 x i8> %add2
+}
+
+define <8 x i8> @interleaved_load_vf8_i8_stride3(<24 x i8>* %ptr){
+; CHECK-LABEL: @interleaved_load_vf8_i8_stride3(
+; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <24 x i8>, <24 x i8>* [[PTR:%.*]]
+; CHECK-NEXT: [[V1:%.*]] = shufflevector <24 x i8> [[WIDE_VEC]], <24 x i8> undef, <8 x i32>
+; CHECK-NEXT: [[V2:%.*]] = shufflevector <24 x i8> [[WIDE_VEC]], <24 x i8> undef, <8 x i32>
+; CHECK-NEXT: [[V3:%.*]] = shufflevector <24 x i8> [[WIDE_VEC]], <24 x i8> undef, <8 x i32>
+; CHECK-NEXT: [[ADD1:%.*]] = add <8 x i8> [[V1]], [[V2]]
+; CHECK-NEXT: [[ADD2:%.*]] = add <8 x i8> [[V3]], [[ADD1]]
+; CHECK-NEXT: ret <8 x i8> [[ADD2]]
+;
+ %wide.vec = load <24 x i8>, <24 x i8>* %ptr
+ %v1 = shufflevector <24 x i8> %wide.vec, <24 x i8> undef,<8 x i32>
+ %v2 = shufflevector <24 x i8> %wide.vec, <24 x i8> undef,<8 x i32>
+ %v3 = shufflevector <24 x i8> %wide.vec, <24 x i8> undef,<8 x i32>
+ %add1 = add <8 x i8> %v1, %v2
+ %add2 = add <8 x i8> %v3, %add1
+ ret <8 x i8> %add2
+}