From: Simon Pilgrim Date: Tue, 10 May 2016 16:08:24 +0000 (+0000) Subject: [X86][AVX] Added some shuffle combine from load tests X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=86f93f6dffa8506bf10f8075f1b80023d7f83b35;p=llvm [X86][AVX] Added some shuffle combine from load tests As discussed on D19198 - we need to check what happens when we shuffle with different value type to the load git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@269068 91177308-0d34-0410-b5e6-96231b3b80d8 --- diff --git a/test/CodeGen/X86/vector-shuffle-combining-avx.ll b/test/CodeGen/X86/vector-shuffle-combining-avx.ll index cbc4fc0dee7..8f00703cf37 100644 --- a/test/CodeGen/X86/vector-shuffle-combining-avx.ll +++ b/test/CodeGen/X86/vector-shuffle-combining-avx.ll @@ -36,6 +36,15 @@ define <4 x float> @combine_vpermilvar_4f32_movddup(<4 x float> %a0) { %1 = tail call <4 x float> @llvm.x86.avx.vpermilvar.ps(<4 x float> %a0, <4 x i32> ) ret <4 x float> %1 } +define <4 x float> @combine_vpermilvar_4f32_movddup_load(<4 x float> *%a0) { +; ALL-LABEL: combine_vpermilvar_4f32_movddup_load: +; ALL: # BB#0: +; ALL-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0] +; ALL-NEXT: retq + %1 = load <4 x float>, <4 x float> *%a0 + %2 = tail call <4 x float> @llvm.x86.avx.vpermilvar.ps(<4 x float> %1, <4 x i32> <i32 0, i32 1, i32 0, i32 1>) + ret <4 x float> %2 +} define <4 x float> @combine_vpermilvar_4f32_movshdup(<4 x float> %a0) { ; ALL-LABEL: combine_vpermilvar_4f32_movshdup: @@ -90,6 +99,16 @@ define <8 x float> @combine_vpermilvar_8f32_movddup(<8 x float> %a0) { %1 = tail call <8 x float> @llvm.x86.avx.vpermilvar.ps.256(<8 x float> %a0, <8 x i32> ) ret <8 x float> %1 } +define <8 x float> @combine_vpermilvar_8f32_movddup_load(<8 x float> *%a0) { +; ALL-LABEL: combine_vpermilvar_8f32_movddup_load: +; ALL: # BB#0: +; ALL-NEXT: vmovaps (%rdi), %ymm0 +; ALL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,1,0,1,4,5,4,5] +; ALL-NEXT: retq + %1 = load <8 x float>, <8 x float> *%a0 + %2 = tail call <8 x float> 
@llvm.x86.avx.vpermilvar.ps.256(<8 x float> %1, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 4, i32 5, i32 4, i32 5>) + ret <8 x float> %2 +} define <8 x float> @combine_vpermilvar_8f32_movshdup(<8 x float> %a0) { ; ALL-LABEL: combine_vpermilvar_8f32_movshdup: diff --git a/test/CodeGen/X86/vector-shuffle-combining-avx512bw.ll b/test/CodeGen/X86/vector-shuffle-combining-avx512bw.ll index 0268d9ef50a..23e1896f778 100644 --- a/test/CodeGen/X86/vector-shuffle-combining-avx512bw.ll +++ b/test/CodeGen/X86/vector-shuffle-combining-avx512bw.ll @@ -41,6 +41,18 @@ define <8 x double> @combine_vpermt2var_8f64_movddup(<8 x double> %x0, <8 x doub %res0 = call <8 x double> @llvm.x86.avx512.maskz.vpermt2var.pd.512(<8 x i64> , <8 x double> %x0, <8 x double> %x1, i8 -1) ret <8 x double> %res0 } +define <8 x double> @combine_vpermt2var_8f64_movddup_load(<8 x double> *%p0, <8 x double> %x1) { +; CHECK-LABEL: combine_vpermt2var_8f64_movddup_load: +; CHECK: # BB#0: +; CHECK-NEXT: vmovapd (%rdi), %zmm1 +; CHECK-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,0,2,2,4,4,6,6] +; CHECK-NEXT: vpermt2pd %zmm0, %zmm2, %zmm1 +; CHECK-NEXT: vmovaps %zmm1, %zmm0 +; CHECK-NEXT: retq + %x0 = load <8 x double>, <8 x double> *%p0 + %res0 = call <8 x double> @llvm.x86.avx512.maskz.vpermt2var.pd.512(<8 x i64> <i64 0, i64 0, i64 2, i64 2, i64 4, i64 4, i64 6, i64 6>, <8 x double> %x0, <8 x double> %x1, i8 -1) + ret <8 x double> %res0 +} define <8 x double> @combine_vpermt2var_8f64_movddup_mask(<8 x double> %x0, <8 x double> %x1, i8 %m) { ; CHECK-LABEL: combine_vpermt2var_8f64_movddup_mask: ; CHECK: # BB#0: @@ -105,6 +117,18 @@ define <16 x float> @combine_vpermt2var_16f32_vmovddup(<16 x float> %x0, <16 x f %res0 = call <16 x float> @llvm.x86.avx512.maskz.vpermt2var.ps.512(<16 x i32> , <16 x float> %x0, <16 x float> %x1, i16 -1) ret <16 x float> %res0 } +define <16 x float> @combine_vpermt2var_16f32_vmovddup_load(<16 x float> *%p0, <16 x float> %x1) { +; CHECK-LABEL: combine_vpermt2var_16f32_vmovddup_load: +; CHECK: # BB#0: +; CHECK-NEXT: vmovaps (%rdi), %zmm1 +; CHECK-NEXT: vmovdqa32 {{.*#+}} zmm2 = 
[0,1,0,1,4,5,4,5,8,9,8,9,12,13,12,13] +; CHECK-NEXT: vpermt2ps %zmm0, %zmm2, %zmm1 +; CHECK-NEXT: vmovaps %zmm1, %zmm0 +; CHECK-NEXT: retq + %x0 = load <16 x float>, <16 x float> *%p0 + %res0 = call <16 x float> @llvm.x86.avx512.maskz.vpermt2var.ps.512(<16 x i32> <i32 0, i32 1, i32 0, i32 1, i32 4, i32 5, i32 4, i32 5, i32 8, i32 9, i32 8, i32 9, i32 12, i32 13, i32 12, i32 13>, <16 x float> %x0, <16 x float> %x1, i16 -1) + ret <16 x float> %res0 +} define <16 x float> @combine_vpermt2var_16f32_vmovddup_mask(<16 x float> %x0, <16 x float> %x1, i16 %m) { ; CHECK-LABEL: combine_vpermt2var_16f32_vmovddup_mask: ; CHECK: # BB#0: @@ -125,6 +149,18 @@ define <16 x float> @combine_vpermt2var_16f32_vmovshdup(<16 x float> %x0, <16 x %res0 = call <16 x float> @llvm.x86.avx512.maskz.vpermt2var.ps.512(<16 x i32> , <16 x float> %x0, <16 x float> %x1, i16 -1) ret <16 x float> %res0 } +define <16 x float> @combine_vpermt2var_16f32_vmovshdup_load(<16 x float> *%p0, <16 x float> %x1) { +; CHECK-LABEL: combine_vpermt2var_16f32_vmovshdup_load: +; CHECK: # BB#0: +; CHECK-NEXT: vmovaps (%rdi), %zmm1 +; CHECK-NEXT: vmovdqa32 {{.*#+}} zmm2 = [1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15] +; CHECK-NEXT: vpermt2ps %zmm0, %zmm2, %zmm1 +; CHECK-NEXT: vmovaps %zmm1, %zmm0 +; CHECK-NEXT: retq + %x0 = load <16 x float>, <16 x float> *%p0 + %res0 = call <16 x float> @llvm.x86.avx512.maskz.vpermt2var.ps.512(<16 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7, i32 9, i32 9, i32 11, i32 11, i32 13, i32 13, i32 15, i32 15>, <16 x float> %x0, <16 x float> %x1, i16 -1) + ret <16 x float> %res0 +} define <16 x float> @combine_vpermt2var_16f32_vmovshdup_mask(<16 x float> %x0, <16 x float> %x1, i16 %m) { ; CHECK-LABEL: combine_vpermt2var_16f32_vmovshdup_mask: ; CHECK: # BB#0: @@ -145,6 +181,18 @@ define <16 x float> @combine_vpermt2var_16f32_vmovsldup(<16 x float> %x0, <16 x %res0 = call <16 x float> @llvm.x86.avx512.maskz.vpermt2var.ps.512(<16 x i32> , <16 x float> %x0, <16 x float> %x1, i16 -1) ret <16 x float> %res0 } +define <16 x float> @combine_vpermt2var_16f32_vmovsldup_load(<16 x float> *%p0, <16 x float> %x1) { +; CHECK-LABEL: combine_vpermt2var_16f32_vmovsldup_load: +; CHECK: # BB#0: +; CHECK-NEXT: vmovaps (%rdi), 
%zmm1 +; CHECK-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14] +; CHECK-NEXT: vpermt2ps %zmm0, %zmm2, %zmm1 +; CHECK-NEXT: vmovaps %zmm1, %zmm0 +; CHECK-NEXT: retq + %x0 = load <16 x float>, <16 x float> *%p0 + %res0 = call <16 x float> @llvm.x86.avx512.maskz.vpermt2var.ps.512(<16 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6, i32 8, i32 8, i32 10, i32 10, i32 12, i32 12, i32 14, i32 14>, <16 x float> %x0, <16 x float> %x1, i16 -1) + ret <16 x float> %res0 +} define <16 x float> @combine_vpermt2var_16f32_vmovsldup_mask(<16 x float> %x0, <16 x float> %x1, i16 %m) { ; CHECK-LABEL: combine_vpermt2var_16f32_vmovsldup_mask: ; CHECK: # BB#0: @@ -155,6 +203,19 @@ define <16 x float> @combine_vpermt2var_16f32_vmovsldup_mask(<16 x float> %x0, < %res0 = call <16 x float> @llvm.x86.avx512.maskz.vpermt2var.ps.512(<16 x i32> , <16 x float> %x0, <16 x float> %x1, i16 %m) ret <16 x float> %res0 } +define <16 x float> @combine_vpermt2var_16f32_vmovsldup_mask_load(<16 x float> *%p0, <16 x float> %x1, i16 %m) { +; CHECK-LABEL: combine_vpermt2var_16f32_vmovsldup_mask_load: +; CHECK: # BB#0: +; CHECK-NEXT: kmovw %esi, %k1 +; CHECK-NEXT: vmovaps (%rdi), %zmm1 +; CHECK-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14] +; CHECK-NEXT: vpermt2ps %zmm0, %zmm2, %zmm1 {%k1} {z} +; CHECK-NEXT: vmovaps %zmm1, %zmm0 +; CHECK-NEXT: retq + %x0 = load <16 x float>, <16 x float> *%p0 + %res0 = call <16 x float> @llvm.x86.avx512.maskz.vpermt2var.ps.512(<16 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6, i32 8, i32 8, i32 10, i32 10, i32 12, i32 12, i32 14, i32 14>, <16 x float> %x0, <16 x float> %x1, i16 %m) + ret <16 x float> %res0 +} define <16 x i32> @combine_vpermt2var_16i32_identity(<16 x i32> %x0, <16 x i32> %x1) { ; CHECK-LABEL: combine_vpermt2var_16i32_identity: