From: Simon Pilgrim
Date: Sat, 2 Jul 2016 14:42:35 +0000 (+0000)
Subject: [X86][AVX512] Autoupgrade the MOVDDUP/MOVSLDUP/MOVSHDUP intrinsics
X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=c964a30adf02673149e0641ac03aa3e36e1dce13;p=llvm

[X86][AVX512] Autoupgrade the MOVDDUP/MOVSLDUP/MOVSHDUP intrinsics

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@274439 91177308-0d34-0410-b5e6-96231b3b80d8
---

diff --git a/lib/IR/AutoUpgrade.cpp b/lib/IR/AutoUpgrade.cpp
index ef80b63b0b8..fb9fdb4a904 100644
--- a/lib/IR/AutoUpgrade.cpp
+++ b/lib/IR/AutoUpgrade.cpp
@@ -219,6 +219,9 @@ static bool UpgradeIntrinsicFunction1(Function *F, Function *&NewFn) {
       Name.startswith("x86.avx2.pbroadcast") ||
       Name.startswith("x86.avx.vpermil.") ||
       Name.startswith("x86.sse2.pshuf") ||
+      Name.startswith("x86.avx512.mask.movddup") ||
+      Name.startswith("x86.avx512.mask.movshdup") ||
+      Name.startswith("x86.avx512.mask.movsldup") ||
       Name.startswith("x86.avx512.mask.pshuf.d.") ||
       Name.startswith("x86.avx512.mask.pshufl.w.") ||
       Name.startswith("x86.avx512.mask.pshufh.w.") ||
@@ -1063,6 +1066,28 @@ void llvm::UpgradeIntrinsicCall(CallInst *CI, Function *NewFn) {
       if (CI->getNumArgOperands() == 4)
         Rep = EmitX86Select(Builder, CI->getArgOperand(3), Rep,
                             CI->getArgOperand(2));
+    } else if (Name.startswith("llvm.x86.avx512.mask.movddup") ||
+               Name.startswith("llvm.x86.avx512.mask.movshdup") ||
+               Name.startswith("llvm.x86.avx512.mask.movsldup")) {
+      Value *Op0 = CI->getArgOperand(0);
+      unsigned NumElts = CI->getType()->getVectorNumElements();
+      unsigned NumLaneElts = 128/CI->getType()->getScalarSizeInBits();
+
+      unsigned Offset = 0;
+      if (Name.startswith("llvm.x86.avx512.mask.movshdup."))
+        Offset = 1;
+
+      SmallVector<uint32_t, 16> Idxs(NumElts);
+      for (unsigned l = 0; l != NumElts; l += NumLaneElts)
+        for (unsigned i = 0; i != NumLaneElts; i += 2) {
+          Idxs[i + l + 0] = i + l + Offset;
+          Idxs[i + l + 1] = i + l + Offset;
+        }
+
+      Rep = Builder.CreateShuffleVector(Op0, Op0, Idxs);
+
+      Rep = EmitX86Select(Builder, CI->getArgOperand(2), Rep,
+                          CI->getArgOperand(1));
     } else if (Name.startswith("llvm.x86.avx512.mask.punpckl") ||
                Name.startswith("llvm.x86.avx512.mask.unpckl.")) {
       Value *Op0 = CI->getArgOperand(0);
diff --git a/test/CodeGen/X86/avx512-intrinsics-upgrade.ll b/test/CodeGen/X86/avx512-intrinsics-upgrade.ll
index 8ffc984b0cc..bb7f140de05 100644
--- a/test/CodeGen/X86/avx512-intrinsics-upgrade.ll
+++ b/test/CodeGen/X86/avx512-intrinsics-upgrade.ll
@@ -1,6 +1,66 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl | FileCheck %s
 
+declare <16 x float> @llvm.x86.avx512.mask.movsldup.512(<16 x float>, <16 x float>, i16)
+
+define <16 x float>@test_int_x86_avx512_mask_movsldup_512(<16 x float> %x0, <16 x float> %x1, i16 %x2) {
+; CHECK-LABEL: test_int_x86_avx512_mask_movsldup_512:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    vmovsldup {{.*#+}} zmm2 = zmm0[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14]
+; CHECK-NEXT:    kmovw %edi, %k1
+; CHECK-NEXT:    vmovsldup {{.*#+}} zmm1 = zmm0[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14]
+; CHECK-NEXT:    vmovsldup {{.*#+}} zmm0 = zmm0[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14]
+; CHECK-NEXT:    vaddps %zmm2, %zmm1, %zmm1
+; CHECK-NEXT:    vaddps %zmm1, %zmm0, %zmm0
+; CHECK-NEXT:    retq
+  %res = call <16 x float> @llvm.x86.avx512.mask.movsldup.512(<16 x float> %x0, <16 x float> %x1, i16 %x2)
+  %res1 = call <16 x float> @llvm.x86.avx512.mask.movsldup.512(<16 x float> %x0, <16 x float> %x1, i16 -1)
+  %res2 = call <16 x float> @llvm.x86.avx512.mask.movsldup.512(<16 x float> %x0, <16 x float> zeroinitializer, i16 %x2)
+  %res3 = fadd <16 x float> %res, %res1
+  %res4 = fadd <16 x float> %res2, %res3
+  ret <16 x float> %res4
+}
+
+declare <16 x float> @llvm.x86.avx512.mask.movshdup.512(<16 x float>, <16 x float>, i16)
+
+define <16 x float>@test_int_x86_avx512_mask_movshdup_512(<16 x float> %x0, <16 x float> %x1, i16 %x2) {
+; CHECK-LABEL: test_int_x86_avx512_mask_movshdup_512:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    vmovshdup {{.*#+}} zmm2 = zmm0[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15]
+; CHECK-NEXT:    kmovw %edi, %k1
+; CHECK-NEXT:    vmovshdup {{.*#+}} zmm1 = zmm0[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15]
+; CHECK-NEXT:    vmovshdup {{.*#+}} zmm0 = zmm0[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15]
+; CHECK-NEXT:    vaddps %zmm2, %zmm1, %zmm1
+; CHECK-NEXT:    vaddps %zmm1, %zmm0, %zmm0
+; CHECK-NEXT:    retq
+  %res = call <16 x float> @llvm.x86.avx512.mask.movshdup.512(<16 x float> %x0, <16 x float> %x1, i16 %x2)
+  %res1 = call <16 x float> @llvm.x86.avx512.mask.movshdup.512(<16 x float> %x0, <16 x float> %x1, i16 -1)
+  %res2 = call <16 x float> @llvm.x86.avx512.mask.movshdup.512(<16 x float> %x0, <16 x float> zeroinitializer, i16 %x2)
+  %res3 = fadd <16 x float> %res, %res1
+  %res4 = fadd <16 x float> %res2, %res3
+  ret <16 x float> %res4
+}
+
+declare <8 x double> @llvm.x86.avx512.mask.movddup.512(<8 x double>, <8 x double>, i8)
+
+define <8 x double>@test_int_x86_avx512_mask_movddup_512(<8 x double> %x0, <8 x double> %x1, i8 %x2) {
+; CHECK-LABEL: test_int_x86_avx512_mask_movddup_512:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    vmovddup {{.*#+}} zmm2 = zmm0[0,0,2,2,4,4,6,6]
+; CHECK-NEXT:    kmovw %edi, %k1
+; CHECK-NEXT:    vmovddup {{.*#+}} zmm1 = zmm0[0,0,2,2,4,4,6,6]
+; CHECK-NEXT:    vmovddup {{.*#+}} zmm0 = zmm0[0,0,2,2,4,4,6,6]
+; CHECK-NEXT:    vaddpd %zmm2, %zmm1, %zmm1
+; CHECK-NEXT:    vaddpd %zmm1, %zmm0, %zmm0
+; CHECK-NEXT:    retq
+  %res = call <8 x double> @llvm.x86.avx512.mask.movddup.512(<8 x double> %x0, <8 x double> %x1, i8 %x2)
+  %res1 = call <8 x double> @llvm.x86.avx512.mask.movddup.512(<8 x double> %x0, <8 x double> %x1, i8 -1)
+  %res2 = call <8 x double> @llvm.x86.avx512.mask.movddup.512(<8 x double> %x0, <8 x double> zeroinitializer, i8 %x2)
+  %res3 = fadd <8 x double> %res, %res1
+  %res4 = fadd <8 x double> %res2, %res3
+  ret <8 x double> %res4
+}
+
 define void @test_store1(<16 x float> %data, i8* %ptr, i8* %ptr2, i16 %mask) {
 ; CHECK-LABEL: test_store1:
 ; CHECK:       ## BB#0:
diff --git a/test/CodeGen/X86/avx512-intrinsics.ll b/test/CodeGen/X86/avx512-intrinsics.ll
index a1460a43012..1b5a52a8656 100644
--- a/test/CodeGen/X86/avx512-intrinsics.ll
+++ b/test/CodeGen/X86/avx512-intrinsics.ll
@@ -1,4 +1,3 @@
-; NOTE: Assertions have been autogenerated by update_llc_test_checks.py
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl | FileCheck %s
 
@@ -5701,66 +5700,6 @@ define <8 x i64>@test_int_x86_avx512_maskz_pternlog_q_512(<8 x i64> %x0, <8 x i6
   ret <8 x i64> %res2
 }
 
-declare <16 x float> @llvm.x86.avx512.mask.movsldup.512(<16 x float>, <16 x float>, i16)
-
-define <16 x float>@test_int_x86_avx512_mask_movsldup_512(<16 x float> %x0, <16 x float> %x1, i16 %x2) {
-; CHECK-LABEL: test_int_x86_avx512_mask_movsldup_512:
-; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovw %edi, %k1
-; CHECK-NEXT:    vmovsldup {{.*#+}} zmm1 = zmm0[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14]
-; CHECK-NEXT:    vmovsldup {{.*#+}} zmm2 = zmm0[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14]
-; CHECK-NEXT:    vmovsldup {{.*#+}} zmm0 = zmm0[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14]
-; CHECK-NEXT:    vaddps %zmm0, %zmm1, %zmm0
-; CHECK-NEXT:    vaddps %zmm0, %zmm2, %zmm0
-; CHECK-NEXT:    retq
-  %res = call <16 x float> @llvm.x86.avx512.mask.movsldup.512(<16 x float> %x0, <16 x float> %x1, i16 %x2)
-  %res1 = call <16 x float> @llvm.x86.avx512.mask.movsldup.512(<16 x float> %x0, <16 x float> %x1, i16 -1)
-  %res2 = call <16 x float> @llvm.x86.avx512.mask.movsldup.512(<16 x float> %x0, <16 x float> zeroinitializer, i16 %x2)
-  %res3 = fadd <16 x float> %res, %res1
-  %res4 = fadd <16 x float> %res2, %res3
-  ret <16 x float> %res4
-}
-
-declare <16 x float> @llvm.x86.avx512.mask.movshdup.512(<16 x float>, <16 x float>, i16)
-
-define <16 x float>@test_int_x86_avx512_mask_movshdup_512(<16 x float> %x0, <16 x float> %x1, i16 %x2) {
-; CHECK-LABEL: test_int_x86_avx512_mask_movshdup_512:
-; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovw %edi, %k1
-; CHECK-NEXT:    vmovshdup {{.*#+}} zmm1 = zmm0[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15]
-; CHECK-NEXT:    vmovshdup {{.*#+}} zmm2 = zmm0[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15]
-; CHECK-NEXT:    vmovshdup {{.*#+}} zmm0 = zmm0[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15]
-; CHECK-NEXT:    vaddps %zmm0, %zmm1, %zmm0
-; CHECK-NEXT:    vaddps %zmm0, %zmm2, %zmm0
-; CHECK-NEXT:    retq
-  %res = call <16 x float> @llvm.x86.avx512.mask.movshdup.512(<16 x float> %x0, <16 x float> %x1, i16 %x2)
-  %res1 = call <16 x float> @llvm.x86.avx512.mask.movshdup.512(<16 x float> %x0, <16 x float> %x1, i16 -1)
-  %res2 = call <16 x float> @llvm.x86.avx512.mask.movshdup.512(<16 x float> %x0, <16 x float> zeroinitializer, i16 %x2)
-  %res3 = fadd <16 x float> %res, %res1
-  %res4 = fadd <16 x float> %res2, %res3
-  ret <16 x float> %res4
-}
-
-declare <8 x double> @llvm.x86.avx512.mask.movddup.512(<8 x double>, <8 x double>, i8)
-
-define <8 x double>@test_int_x86_avx512_mask_movddup_512(<8 x double> %x0, <8 x double> %x1, i8 %x2) {
-; CHECK-LABEL: test_int_x86_avx512_mask_movddup_512:
-; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovw %edi, %k1
-; CHECK-NEXT:    vmovddup {{.*#+}} zmm1 = zmm0[0,0,2,2,4,4,6,6]
-; CHECK-NEXT:    vmovddup {{.*#+}} zmm2 = zmm0[0,0,2,2,4,4,6,6]
-; CHECK-NEXT:    vmovddup {{.*#+}} zmm0 = zmm0[0,0,2,2,4,4,6,6]
-; CHECK-NEXT:    vaddpd %zmm0, %zmm1, %zmm0
-; CHECK-NEXT:    vaddpd %zmm0, %zmm2, %zmm0
-; CHECK-NEXT:    retq
-  %res = call <8 x double> @llvm.x86.avx512.mask.movddup.512(<8 x double> %x0, <8 x double> %x1, i8 %x2)
-  %res1 = call <8 x double> @llvm.x86.avx512.mask.movddup.512(<8 x double> %x0, <8 x double> %x1, i8 -1)
-  %res2 = call <8 x double> @llvm.x86.avx512.mask.movddup.512(<8 x double> %x0, <8 x double> zeroinitializer, i8 %x2)
-  %res3 = fadd <8 x double> %res, %res1
-  %res4 = fadd <8 x double> %res2, %res3
-  ret <8 x double> %res4
-}
-
 define i32 @test_x86_avx512_comi_sd_eq_sae(<2 x double> %a0, <2 x double> %a1) {
 ; CHECK-LABEL: test_x86_avx512_comi_sd_eq_sae:
 ; CHECK:       ## BB#0:
diff --git a/test/CodeGen/X86/avx512vl-intrinsics-upgrade.ll b/test/CodeGen/X86/avx512vl-intrinsics-upgrade.ll
index 4a0235e09ad..061e346e55f 100644
--- a/test/CodeGen/X86/avx512vl-intrinsics-upgrade.ll
+++ b/test/CodeGen/X86/avx512vl-intrinsics-upgrade.ll
@@ -1,6 +1,143 @@
-; NOTE: Assertions have been autogenerated by update_llc_test_checks.py
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl -mattr=+avx512vl --show-mc-encoding| FileCheck %s
 
+declare <4 x float> @llvm.x86.avx512.mask.movsldup.128(<4 x float>, <4 x float>, i8)
+
+define <4 x float>@test_int_x86_avx512_mask_movsldup_128(<4 x float> %x0, <4 x float> %x1, i8 %x2) {
+; CHECK-LABEL: test_int_x86_avx512_mask_movsldup_128:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    vmovsldup %xmm0, %xmm2 ## encoding: [0x62,0xf1,0x7e,0x08,0x12,0xd0]
+; CHECK-NEXT:    ## xmm2 = xmm0[0,0,2,2]
+; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT:    vmovsldup %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7e,0x09,0x12,0xc8]
+; CHECK-NEXT:    ## xmm1 = xmm0[0,0,2,2]
+; CHECK-NEXT:    vmovsldup %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7e,0x89,0x12,0xc0]
+; CHECK-NEXT:    ## xmm0 = xmm0[0,0,2,2]
+; CHECK-NEXT:    vaddps %xmm2, %xmm1, %xmm1 ## encoding: [0x62,0xf1,0x74,0x08,0x58,0xca]
+; CHECK-NEXT:    vaddps %xmm1, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7c,0x08,0x58,0xc1]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %res = call <4 x float> @llvm.x86.avx512.mask.movsldup.128(<4 x float> %x0, <4 x float> %x1, i8 %x2)
+  %res1 = call <4 x float> @llvm.x86.avx512.mask.movsldup.128(<4 x float> %x0, <4 x float> %x1, i8 -1)
+  %res2 = call <4 x float> @llvm.x86.avx512.mask.movsldup.128(<4 x float> %x0, <4 x float> zeroinitializer, i8 %x2)
+  %res3 = fadd <4 x float> %res, %res1
+  %res4 = fadd <4 x float> %res2, %res3
+  ret <4 x float> %res4
+}
+
+declare <8 x float> @llvm.x86.avx512.mask.movsldup.256(<8 x float>, <8 x float>, i8)
+
+define <8 x float>@test_int_x86_avx512_mask_movsldup_256(<8 x float> %x0, <8 x float> %x1, i8 %x2) {
+; CHECK-LABEL: test_int_x86_avx512_mask_movsldup_256:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    vmovsldup %ymm0, %ymm2 ## encoding: [0x62,0xf1,0x7e,0x28,0x12,0xd0]
+; CHECK-NEXT:    ## ymm2 = ymm0[0,0,2,2,4,4,6,6]
+; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT:    vmovsldup %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x7e,0x29,0x12,0xc8]
+; CHECK-NEXT:    ## ymm1 = ymm0[0,0,2,2,4,4,6,6]
+; CHECK-NEXT:    vmovsldup %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7e,0xa9,0x12,0xc0]
+; CHECK-NEXT:    ## ymm0 = ymm0[0,0,2,2,4,4,6,6]
+; CHECK-NEXT:    vaddps %ymm2, %ymm1, %ymm1 ## encoding: [0x62,0xf1,0x74,0x28,0x58,0xca]
+; CHECK-NEXT:    vaddps %ymm1, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7c,0x28,0x58,0xc1]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %res = call <8 x float> @llvm.x86.avx512.mask.movsldup.256(<8 x float> %x0, <8 x float> %x1, i8 %x2)
+  %res1 = call <8 x float> @llvm.x86.avx512.mask.movsldup.256(<8 x float> %x0, <8 x float> %x1, i8 -1)
+  %res2 = call <8 x float> @llvm.x86.avx512.mask.movsldup.256(<8 x float> %x0, <8 x float> zeroinitializer, i8 %x2)
+  %res3 = fadd <8 x float> %res, %res1
+  %res4 = fadd <8 x float> %res2, %res3
+  ret <8 x float> %res4
+}
+
+declare <4 x float> @llvm.x86.avx512.mask.movshdup.128(<4 x float>, <4 x float>, i8)
+
+define <4 x float>@test_int_x86_avx512_mask_movshdup_128(<4 x float> %x0, <4 x float> %x1, i8 %x2) {
+; CHECK-LABEL: test_int_x86_avx512_mask_movshdup_128:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    vmovshdup %xmm0, %xmm2 ## encoding: [0x62,0xf1,0x7e,0x08,0x16,0xd0]
+; CHECK-NEXT:    ## xmm2 = xmm0[1,1,3,3]
+; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT:    vmovshdup %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7e,0x09,0x16,0xc8]
+; CHECK-NEXT:    ## xmm1 = xmm0[1,1,3,3]
+; CHECK-NEXT:    vmovshdup %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7e,0x89,0x16,0xc0]
+; CHECK-NEXT:    ## xmm0 = xmm0[1,1,3,3]
+; CHECK-NEXT:    vaddps %xmm2, %xmm1, %xmm1 ## encoding: [0x62,0xf1,0x74,0x08,0x58,0xca]
+; CHECK-NEXT:    vaddps %xmm1, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7c,0x08,0x58,0xc1]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %res = call <4 x float> @llvm.x86.avx512.mask.movshdup.128(<4 x float> %x0, <4 x float> %x1, i8 %x2)
+  %res1 = call <4 x float> @llvm.x86.avx512.mask.movshdup.128(<4 x float> %x0, <4 x float> %x1, i8 -1)
+  %res2 = call <4 x float> @llvm.x86.avx512.mask.movshdup.128(<4 x float> %x0, <4 x float> zeroinitializer, i8 %x2)
+  %res3 = fadd <4 x float> %res, %res1
+  %res4 = fadd <4 x float> %res2, %res3
+  ret <4 x float> %res4
+}
+
+declare <8 x float> @llvm.x86.avx512.mask.movshdup.256(<8 x float>, <8 x float>, i8)
+
+define <8 x float>@test_int_x86_avx512_mask_movshdup_256(<8 x float> %x0, <8 x float> %x1, i8 %x2) {
+; CHECK-LABEL: test_int_x86_avx512_mask_movshdup_256:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    vmovshdup %ymm0, %ymm2 ## encoding: [0x62,0xf1,0x7e,0x28,0x16,0xd0]
+; CHECK-NEXT:    ## ymm2 = ymm0[1,1,3,3,5,5,7,7]
+; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT:    vmovshdup %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x7e,0x29,0x16,0xc8]
+; CHECK-NEXT:    ## ymm1 = ymm0[1,1,3,3,5,5,7,7]
+; CHECK-NEXT:    vmovshdup %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7e,0xa9,0x16,0xc0]
+; CHECK-NEXT:    ## ymm0 = ymm0[1,1,3,3,5,5,7,7]
+; CHECK-NEXT:    vaddps %ymm2, %ymm1, %ymm1 ## encoding: [0x62,0xf1,0x74,0x28,0x58,0xca]
+; CHECK-NEXT:    vaddps %ymm1, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7c,0x28,0x58,0xc1]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %res = call <8 x float> @llvm.x86.avx512.mask.movshdup.256(<8 x float> %x0, <8 x float> %x1, i8 %x2)
+  %res1 = call <8 x float> @llvm.x86.avx512.mask.movshdup.256(<8 x float> %x0, <8 x float> %x1, i8 -1)
+  %res2 = call <8 x float> @llvm.x86.avx512.mask.movshdup.256(<8 x float> %x0, <8 x float> zeroinitializer, i8 %x2)
+  %res3 = fadd <8 x float> %res, %res1
+  %res4 = fadd <8 x float> %res2, %res3
+  ret <8 x float> %res4
+}
+declare <2 x double> @llvm.x86.avx512.mask.movddup.128(<2 x double>, <2 x double>, i8)
+
+define <2 x double>@test_int_x86_avx512_mask_movddup_128(<2 x double> %x0, <2 x double> %x1, i8 %x2) {
+; CHECK-LABEL: test_int_x86_avx512_mask_movddup_128:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    vmovddup %xmm0, %xmm2 ## encoding: [0x62,0xf1,0xff,0x08,0x12,0xd0]
+; CHECK-NEXT:    ## xmm2 = xmm0[0,0]
+; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT:    vmovddup %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0xff,0x09,0x12,0xc8]
+; CHECK-NEXT:    ## xmm1 = xmm0[0,0]
+; CHECK-NEXT:    vmovddup %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0xff,0x89,0x12,0xc0]
+; CHECK-NEXT:    ## xmm0 = xmm0[0,0]
+; CHECK-NEXT:    vaddpd %xmm2, %xmm1, %xmm1 ## encoding: [0x62,0xf1,0xf5,0x08,0x58,0xca]
+; CHECK-NEXT:    vaddpd %xmm1, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0xfd,0x08,0x58,0xc1]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %res = call <2 x double> @llvm.x86.avx512.mask.movddup.128(<2 x double> %x0, <2 x double> %x1, i8 %x2)
+  %res1 = call <2 x double> @llvm.x86.avx512.mask.movddup.128(<2 x double> %x0, <2 x double> %x1, i8 -1)
+  %res2 = call <2 x double> @llvm.x86.avx512.mask.movddup.128(<2 x double> %x0, <2 x double> zeroinitializer, i8 %x2)
+  %res3 = fadd <2 x double> %res, %res1
+  %res4 = fadd <2 x double> %res2, %res3
+  ret <2 x double> %res4
+}
+
+declare <4 x double> @llvm.x86.avx512.mask.movddup.256(<4 x double>, <4 x double>, i8)
+
+define <4 x double>@test_int_x86_avx512_mask_movddup_256(<4 x double> %x0, <4 x double> %x1, i8 %x2) {
+; CHECK-LABEL: test_int_x86_avx512_mask_movddup_256:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    vmovddup %ymm0, %ymm2 ## encoding: [0x62,0xf1,0xff,0x28,0x12,0xd0]
+; CHECK-NEXT:    ## ymm2 = ymm0[0,0,2,2]
+; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT:    vmovddup %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0xff,0x29,0x12,0xc8]
+; CHECK-NEXT:    ## ymm1 = ymm0[0,0,2,2]
+; CHECK-NEXT:    vmovddup %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0xff,0xa9,0x12,0xc0]
+; CHECK-NEXT:    ## ymm0 = ymm0[0,0,2,2]
+; CHECK-NEXT:    vaddpd %ymm2, %ymm1, %ymm1 ## encoding: [0x62,0xf1,0xf5,0x28,0x58,0xca]
+; CHECK-NEXT:    vaddpd %ymm1, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0xfd,0x28,0x58,0xc1]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %res = call <4 x double> @llvm.x86.avx512.mask.movddup.256(<4 x double> %x0, <4 x double> %x1, i8 %x2)
+  %res1 = call <4 x double> @llvm.x86.avx512.mask.movddup.256(<4 x double> %x0, <4 x double> %x1, i8 -1)
+  %res2 = call <4 x double> @llvm.x86.avx512.mask.movddup.256(<4 x double> %x0, <4 x double> zeroinitializer, i8 %x2)
+  %res3 = fadd <4 x double> %res, %res1
+  %res4 = fadd <4 x double> %res2, %res3
+  ret <4 x double> %res4
+}
+
 declare void @llvm.x86.avx512.mask.store.pd.128(i8*, <2 x double>, i8)
 
 define void@test_int_x86_avx512_mask_store_pd_128(i8* %ptr1, i8* %ptr2, <2 x double> %x1, i8 %x2) {
diff --git a/test/CodeGen/X86/avx512vl-intrinsics.ll b/test/CodeGen/X86/avx512vl-intrinsics.ll
index b2397de135d..969cc71992f 100644
--- a/test/CodeGen/X86/avx512vl-intrinsics.ll
+++ b/test/CodeGen/X86/avx512vl-intrinsics.ll
@@ -1,5 +1,4 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; NOTE: Assertions have been autogenerated by update_llc_test_checks.py
 ; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl -mattr=+avx512vl --show-mc-encoding| FileCheck %s
 
 ; 256-bit
@@ -6259,143 +6258,6 @@ define <8 x i16> @test_x86_vcvtps2ph_256(<8 x float> %a0, i8 %mask, <8 x i16> %s
 
 declare <8 x i16> @llvm.x86.avx512.mask.vcvtps2ph.256(<8 x float>, i32, <8 x i16>, i8) nounwind readonly
 
-declare <4 x float> @llvm.x86.avx512.mask.movsldup.128(<4 x float>, <4 x float>, i8)
-
-define <4 x float>@test_int_x86_avx512_mask_movsldup_128(<4 x float> %x0, <4 x float> %x1, i8 %x2) {
-; CHECK-LABEL: test_int_x86_avx512_mask_movsldup_128:
-; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
-; CHECK-NEXT:    vmovsldup %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7e,0x09,0x12,0xc8]
-; CHECK-NEXT:    ## xmm1 = xmm0[0,0,2,2]
-; CHECK-NEXT:    vmovsldup %xmm0, %xmm2 {%k1} {z} ## encoding: [0x62,0xf1,0x7e,0x89,0x12,0xd0]
-; CHECK-NEXT:    ## xmm2 = xmm0[0,0,2,2]
-; CHECK-NEXT:    vmovsldup %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7e,0x08,0x12,0xc0]
-; CHECK-NEXT:    ## xmm0 = xmm0[0,0,2,2]
-; CHECK-NEXT:    vaddps %xmm0, %xmm1, %xmm0 ## encoding: [0x62,0xf1,0x74,0x08,0x58,0xc0]
-; CHECK-NEXT:    vaddps %xmm0, %xmm2, %xmm0 ## encoding: [0x62,0xf1,0x6c,0x08,0x58,0xc0]
-; CHECK-NEXT:    retq ## encoding: [0xc3]
-  %res = call <4 x float> @llvm.x86.avx512.mask.movsldup.128(<4 x float> %x0, <4 x float> %x1, i8 %x2)
-  %res1 = call <4 x float> @llvm.x86.avx512.mask.movsldup.128(<4 x float> %x0, <4 x float> %x1, i8 -1)
-  %res2 = call <4 x float> @llvm.x86.avx512.mask.movsldup.128(<4 x float> %x0, <4 x float> zeroinitializer, i8 %x2)
-  %res3 = fadd <4 x float> %res, %res1
-  %res4 = fadd <4 x float> %res2, %res3
-  ret <4 x float> %res4
-}
-
-declare <8 x float> @llvm.x86.avx512.mask.movsldup.256(<8 x float>, <8 x float>, i8)
-
-define <8 x float>@test_int_x86_avx512_mask_movsldup_256(<8 x float> %x0, <8 x float> %x1, i8 %x2) {
-; CHECK-LABEL: test_int_x86_avx512_mask_movsldup_256:
-; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
-; CHECK-NEXT:    vmovsldup %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x7e,0x29,0x12,0xc8]
-; CHECK-NEXT:    ## ymm1 = ymm0[0,0,2,2,4,4,6,6]
-; CHECK-NEXT:    vmovsldup %ymm0, %ymm2 {%k1} {z} ## encoding: [0x62,0xf1,0x7e,0xa9,0x12,0xd0]
-; CHECK-NEXT:    ## ymm2 = ymm0[0,0,2,2,4,4,6,6]
-; CHECK-NEXT:    vmovsldup %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7e,0x28,0x12,0xc0]
-; CHECK-NEXT:    ## ymm0 = ymm0[0,0,2,2,4,4,6,6]
-; CHECK-NEXT:    vaddps %ymm0, %ymm1, %ymm0 ## encoding: [0x62,0xf1,0x74,0x28,0x58,0xc0]
-; CHECK-NEXT:    vaddps %ymm0, %ymm2, %ymm0 ## encoding: [0x62,0xf1,0x6c,0x28,0x58,0xc0]
-; CHECK-NEXT:    retq ## encoding: [0xc3]
-  %res = call <8 x float> @llvm.x86.avx512.mask.movsldup.256(<8 x float> %x0, <8 x float> %x1, i8 %x2)
-  %res1 = call <8 x float> @llvm.x86.avx512.mask.movsldup.256(<8 x float> %x0, <8 x float> %x1, i8 -1)
-  %res2 = call <8 x float> @llvm.x86.avx512.mask.movsldup.256(<8 x float> %x0, <8 x float> zeroinitializer, i8 %x2)
-  %res3 = fadd <8 x float> %res, %res1
-  %res4 = fadd <8 x float> %res2, %res3
-  ret <8 x float> %res4
-}
-
-declare <4 x float> @llvm.x86.avx512.mask.movshdup.128(<4 x float>, <4 x float>, i8)
-
-define <4 x float>@test_int_x86_avx512_mask_movshdup_128(<4 x float> %x0, <4 x float> %x1, i8 %x2) {
-; CHECK-LABEL: test_int_x86_avx512_mask_movshdup_128:
-; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
-; CHECK-NEXT:    vmovshdup %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7e,0x09,0x16,0xc8]
-; CHECK-NEXT:    ## xmm1 = xmm0[1,1,3,3]
-; CHECK-NEXT:    vmovshdup %xmm0, %xmm2 {%k1} {z} ## encoding: [0x62,0xf1,0x7e,0x89,0x16,0xd0]
-; CHECK-NEXT:    ## xmm2 = xmm0[1,1,3,3]
-; CHECK-NEXT:    vmovshdup %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7e,0x08,0x16,0xc0]
-; CHECK-NEXT:    ## xmm0 = xmm0[1,1,3,3]
-; CHECK-NEXT:    vaddps %xmm0, %xmm1, %xmm0 ## encoding: [0x62,0xf1,0x74,0x08,0x58,0xc0]
-; CHECK-NEXT:    vaddps %xmm0, %xmm2, %xmm0 ## encoding: [0x62,0xf1,0x6c,0x08,0x58,0xc0]
-; CHECK-NEXT:    retq ## encoding: [0xc3]
-  %res = call <4 x float> @llvm.x86.avx512.mask.movshdup.128(<4 x float> %x0, <4 x float> %x1, i8 %x2)
-  %res1 = call <4 x float> @llvm.x86.avx512.mask.movshdup.128(<4 x float> %x0, <4 x float> %x1, i8 -1)
-  %res2 = call <4 x float> @llvm.x86.avx512.mask.movshdup.128(<4 x float> %x0, <4 x float> zeroinitializer, i8 %x2)
-  %res3 = fadd <4 x float> %res, %res1
-  %res4 = fadd <4 x float> %res2, %res3
-  ret <4 x float> %res4
-}
-
-declare <8 x float> @llvm.x86.avx512.mask.movshdup.256(<8 x float>, <8 x float>, i8)
-
-define <8 x float>@test_int_x86_avx512_mask_movshdup_256(<8 x float> %x0, <8 x float> %x1, i8 %x2) {
-; CHECK-LABEL: test_int_x86_avx512_mask_movshdup_256:
-; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
-; CHECK-NEXT:    vmovshdup %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x7e,0x29,0x16,0xc8]
-; CHECK-NEXT:    ## ymm1 = ymm0[1,1,3,3,5,5,7,7]
-; CHECK-NEXT:    vmovshdup %ymm0, %ymm2 {%k1} {z} ## encoding: [0x62,0xf1,0x7e,0xa9,0x16,0xd0]
-; CHECK-NEXT:    ## ymm2 = ymm0[1,1,3,3,5,5,7,7]
-; CHECK-NEXT:    vmovshdup %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7e,0x28,0x16,0xc0]
-; CHECK-NEXT:    ## ymm0 = ymm0[1,1,3,3,5,5,7,7]
-; CHECK-NEXT:    vaddps %ymm0, %ymm1, %ymm0 ## encoding: [0x62,0xf1,0x74,0x28,0x58,0xc0]
-; CHECK-NEXT:    vaddps %ymm0, %ymm2, %ymm0 ## encoding: [0x62,0xf1,0x6c,0x28,0x58,0xc0]
-; CHECK-NEXT:    retq ## encoding: [0xc3]
-  %res = call <8 x float> @llvm.x86.avx512.mask.movshdup.256(<8 x float> %x0, <8 x float> %x1, i8 %x2)
-  %res1 = call <8 x float> @llvm.x86.avx512.mask.movshdup.256(<8 x float> %x0, <8 x float> %x1, i8 -1)
-  %res2 = call <8 x float> @llvm.x86.avx512.mask.movshdup.256(<8 x float> %x0, <8 x float> zeroinitializer, i8 %x2)
-  %res3 = fadd <8 x float> %res, %res1
-  %res4 = fadd <8 x float> %res2, %res3
-  ret <8 x float> %res4
-}
-declare <2 x double> @llvm.x86.avx512.mask.movddup.128(<2 x double>, <2 x double>, i8)
-
-define <2 x double>@test_int_x86_avx512_mask_movddup_128(<2 x double> %x0, <2 x double> %x1, i8 %x2) {
-; CHECK-LABEL: test_int_x86_avx512_mask_movddup_128:
-; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
-; CHECK-NEXT:    vmovddup %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0xff,0x09,0x12,0xc8]
-; CHECK-NEXT:    ## xmm1 = xmm0[0,0]
-; CHECK-NEXT:    vmovddup %xmm0, %xmm2 {%k1} {z} ## encoding: [0x62,0xf1,0xff,0x89,0x12,0xd0]
-; CHECK-NEXT:    ## xmm2 = xmm0[0,0]
-; CHECK-NEXT:    vmovddup %xmm0, %xmm0 ## encoding: [0x62,0xf1,0xff,0x08,0x12,0xc0]
-; CHECK-NEXT:    ## xmm0 = xmm0[0,0]
-; CHECK-NEXT:    vaddpd %xmm0, %xmm1, %xmm0 ## encoding: [0x62,0xf1,0xf5,0x08,0x58,0xc0]
-; CHECK-NEXT:    vaddpd %xmm0, %xmm2, %xmm0 ## encoding: [0x62,0xf1,0xed,0x08,0x58,0xc0]
-; CHECK-NEXT:    retq ## encoding: [0xc3]
-  %res = call <2 x double> @llvm.x86.avx512.mask.movddup.128(<2 x double> %x0, <2 x double> %x1, i8 %x2)
-  %res1 = call <2 x double> @llvm.x86.avx512.mask.movddup.128(<2 x double> %x0, <2 x double> %x1, i8 -1)
-  %res2 = call <2 x double> @llvm.x86.avx512.mask.movddup.128(<2 x double> %x0, <2 x double> zeroinitializer, i8 %x2)
-  %res3 = fadd <2 x double> %res, %res1
-  %res4 = fadd <2 x double> %res2, %res3
-  ret <2 x double> %res4
-}
-
-declare <4 x double> @llvm.x86.avx512.mask.movddup.256(<4 x double>, <4 x double>, i8)
-
-define <4 x double>@test_int_x86_avx512_mask_movddup_256(<4 x double> %x0, <4 x double> %x1, i8 %x2) {
-; CHECK-LABEL: test_int_x86_avx512_mask_movddup_256:
-; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
-; CHECK-NEXT:    vmovddup %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0xff,0x29,0x12,0xc8]
-; CHECK-NEXT:    ## ymm1 = ymm0[0,0,2,2]
-; CHECK-NEXT:    vmovddup %ymm0, %ymm2 {%k1} {z} ## encoding: [0x62,0xf1,0xff,0xa9,0x12,0xd0]
-; CHECK-NEXT:    ## ymm2 = ymm0[0,0,2,2]
-; CHECK-NEXT:    vmovddup %ymm0, %ymm0 ## encoding: [0x62,0xf1,0xff,0x28,0x12,0xc0]
-; CHECK-NEXT:    ## ymm0 = ymm0[0,0,2,2]
-; CHECK-NEXT:    vaddpd %ymm0, %ymm1, %ymm0 ## encoding: [0x62,0xf1,0xf5,0x28,0x58,0xc0]
-; CHECK-NEXT:    vaddpd %ymm0, %ymm2, %ymm0 ## encoding: [0x62,0xf1,0xed,0x28,0x58,0xc0]
-; CHECK-NEXT:    retq ## encoding: [0xc3]
-  %res = call <4 x double> @llvm.x86.avx512.mask.movddup.256(<4 x double> %x0, <4 x double> %x1, i8 %x2)
-  %res1 = call <4 x double> @llvm.x86.avx512.mask.movddup.256(<4 x double> %x0, <4 x double> %x1, i8 -1)
-  %res2 = call <4 x double> @llvm.x86.avx512.mask.movddup.256(<4 x double> %x0, <4 x double> zeroinitializer, i8 %x2)
-  %res3 = fadd <4 x double> %res, %res1
-  %res4 = fadd <4 x double> %res2, %res3
-  ret <4 x double> %res4
-}
-
 define <8 x float> @test_rsqrt_ps_256_rr(<8 x float> %a0) {
 ; CHECK-LABEL: test_rsqrt_ps_256_rr:
 ; CHECK:       ## BB#0:
@@ -7356,9 +7218,9 @@ define <8 x i32>@test_int_x86_avx512_mask_psrav8_si_const() {
 ; CHECK:       ## BB#0:
 ; CHECK-NEXT:    vmovdqa32 {{.*#+}} ymm0 = [2,9,4294967284,23,4294967270,37,4294967256,51]
 ; CHECK-NEXT:    ## encoding: [0x62,0xf1,0x7d,0x28,0x6f,0x05,A,A,A,A]
-; CHECK-NEXT:    ## fixup A - offset: 6, value: LCPI478_0-4, kind: reloc_riprel_4byte
+; CHECK-NEXT:    ## fixup A - offset: 6, value: LCPI472_0-4, kind: reloc_riprel_4byte
 ; CHECK-NEXT:    vpsravd {{.*}}(%rip), %ymm0, %ymm0 ## encoding: [0x62,0xf2,0x7d,0x28,0x46,0x05,A,A,A,A]
-; CHECK-NEXT:    ## fixup A - offset: 6, value: LCPI478_1-4, kind: reloc_riprel_4byte
+; CHECK-NEXT:    ## fixup A - offset: 6, value: LCPI472_1-4, kind: reloc_riprel_4byte
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
   %res = call <8 x i32> @llvm.x86.avx512.mask.psrav8.si(<8 x i32> <i32 2, i32 9, i32 -12, i32 23, i32 -26, i32 37, i32 -40, i32 51>, <8 x i32> , <8 x i32> zeroinitializer, i8 -1)
   ret <8 x i32> %res
@@ -7389,9 +7251,9 @@ define <2 x i64>@test_int_x86_avx512_mask_psrav_q_128_const(i8 %x3) {
 ; CHECK:       ## BB#0:
 ; CHECK-NEXT:    vmovdqa64 {{.*#+}} xmm0 = [2,18446744073709551607]
 ; CHECK-NEXT:    ## encoding: [0x62,0xf1,0xfd,0x08,0x6f,0x05,A,A,A,A]
-; CHECK-NEXT:    ## fixup A - offset: 6, value: LCPI480_0-4, kind: reloc_riprel_4byte
+; CHECK-NEXT:    ## fixup A - offset: 6, value: LCPI474_0-4, kind: reloc_riprel_4byte
 ; CHECK-NEXT:    vpsravq {{.*}}(%rip), %xmm0, %xmm0 ## encoding: [0x62,0xf2,0xfd,0x08,0x46,0x05,A,A,A,A]
-; CHECK-NEXT:    ## fixup A - offset: 6, value: LCPI480_1-4, kind: reloc_riprel_4byte
+; CHECK-NEXT:    ## fixup A - offset: 6, value: LCPI474_1-4, kind: reloc_riprel_4byte
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
   %res = call <2 x i64> @llvm.x86.avx512.mask.psrav.q.128(<2 x i64> <i64 2, i64 -9>, <2 x i64> , <2 x i64> zeroinitializer, i8 -1)
   ret <2 x i64> %res
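
For reference, the upgrade path above turns each masked MOVDDUP/MOVSLDUP/MOVSHDUP
intrinsic call into a plain shufflevector that duplicates alternating elements within
each 128-bit lane (the even elements, or the odd ones for movshdup), followed by the
usual mask select. A rough sketch of the IR this produces for the 512-bit movsldup
case (the value names %x0, %x1, %m, %dup, %mask and %res are illustrative, not taken
from the actual upgrader output):

  ; duplicate every even element of %x0: 0,0,2,2,...,14,14
  %dup = shufflevector <16 x float> %x0, <16 x float> %x0,
         <16 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6,
                     i32 8, i32 8, i32 10, i32 10, i32 12, i32 12, i32 14, i32 14>
  ; EmitX86Select turns the integer mask into a per-element predicate and blends
  ; the shuffle result with the passthru operand %x1
  %mask = bitcast i16 %m to <16 x i1>
  %res = select <16 x i1> %mask, <16 x float> %dup, <16 x float> %x1

With a constant all-ones mask (i16 -1) the select is skipped entirely and only the
shuffle remains, which is why the unmasked cases in the tests above check for a bare
vmovsldup/vmovshdup/vmovddup without a {%k1} operand.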