From 39878526e7c6986aa0157beb28da2c155f185087 Mon Sep 17 00:00:00 2001
From: Craig Topper
Date: Mon, 21 Jan 2019 08:16:59 +0000
Subject: [PATCH] [X86] Remove and autoupgrade vpmovqd/vpmovwb intrinsics using
 trunc+select.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@351729 91177308-0d34-0410-b5e6-96231b3b80d8
---
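Notes:

The register forms of these intrinsics take the source, a passthru vector, and
a scalar mask, so they can be expressed with generic IR. A minimal sketch of
the pattern the upgrader emits for the 512-bit qd form (the value names %x0,
%src, %mask, %t, %m and %r are illustrative; only the types and the
trunc/bitcast/select shape come from this patch):

    ; illustrative names, not the upgrader's output verbatim
    %t = trunc <8 x i64> %x0 to <8 x i32>                  ; vpmovqd is a plain truncate
    %m = bitcast i8 %mask to <8 x i1>                      ; scalar mask to a bit vector
    %r = select <8 x i1> %m, <8 x i32> %t, <8 x i32> %src  ; merge masking with the passthru

The 256-bit qd form uses only the low four mask bits, so the upgraded IR
narrows the <8 x i1> with a shufflevector before the select (see the
avx512vl-intrinsics.ll changes below). The pmov.*.mem.* store forms keep their
intrinsics; only the register forms are removed.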
 include/llvm/IR/IntrinsicsX86.td              | 16 ---------
 lib/IR/AutoUpgrade.cpp                        | 12 +++++++
 lib/Target/X86/X86IntrinsicsInfo.h            |  8 -----
 test/CodeGen/X86/avx512-intrinsics-upgrade.ll | 31 ++++++++++++++++
 test/CodeGen/X86/avx512-intrinsics.ll         | 28 ++++++++-------
 .../X86/avx512bw-intrinsics-upgrade.ll        | 30 ++++++++++++++++
 test/CodeGen/X86/avx512bw-intrinsics.ll       | 36 ++++++++++---------
 .../X86/avx512bwvl-intrinsics-upgrade.ll      | 32 +++++++++++++++++
 test/CodeGen/X86/avx512bwvl-intrinsics.ll     | 34 +++++++++---------
 .../X86/avx512vl-intrinsics-upgrade.ll        | 33 +++++++++++++++++
 test/CodeGen/X86/avx512vl-intrinsics.ll       | 36 ++++++++++---------
 test/CodeGen/X86/stack-folding-int-avx512.ll  |  4 +--
 .../CodeGen/X86/stack-folding-int-avx512vl.ll |  4 +--
 13 files changed, 214 insertions(+), 90 deletions(-)

diff --git a/include/llvm/IR/IntrinsicsX86.td b/include/llvm/IR/IntrinsicsX86.td
index 3d41d23d3e8..e01896e1b9a 100644
--- a/include/llvm/IR/IntrinsicsX86.td
+++ b/include/llvm/IR/IntrinsicsX86.td
@@ -4444,10 +4444,6 @@ let TargetPrefix = "x86" in {
           Intrinsic<[],
                     [llvm_ptr_ty, llvm_v2i64_ty, llvm_i8_ty],
                     [IntrArgMemOnly]>;
-  def int_x86_avx512_mask_pmov_qd_256 : // FIXME: Replace with trunc+select.
-          Intrinsic<[llvm_v4i32_ty],
-                    [llvm_v4i64_ty, llvm_v4i32_ty, llvm_i8_ty],
-                    [IntrNoMem]>;
   def int_x86_avx512_mask_pmov_qd_mem_256 :
         GCCBuiltin<"__builtin_ia32_pmovqd256mem_mask">,
           Intrinsic<[],
@@ -4473,10 +4469,6 @@ let TargetPrefix = "x86" in {
           Intrinsic<[],
                     [llvm_ptr_ty, llvm_v4i64_ty, llvm_i8_ty],
                     [IntrArgMemOnly]>;
-  def int_x86_avx512_mask_pmov_qd_512 : // FIXME: Replace with trunc+select.
-          Intrinsic<[llvm_v8i32_ty],
-                    [llvm_v8i64_ty, llvm_v8i32_ty, llvm_i8_ty],
-                    [IntrNoMem]>;
   def int_x86_avx512_mask_pmov_qd_mem_512 :
         GCCBuiltin<"__builtin_ia32_pmovqd512mem_mask">,
           Intrinsic<[],
@@ -4710,10 +4702,6 @@ let TargetPrefix = "x86" in {
           Intrinsic<[],
                     [llvm_ptr_ty, llvm_v8i16_ty, llvm_i8_ty],
                     [IntrArgMemOnly]>;
-  def int_x86_avx512_mask_pmov_wb_256 : // FIXME: Replace with trunc+select.
-          Intrinsic<[llvm_v16i8_ty],
-                    [llvm_v16i16_ty, llvm_v16i8_ty, llvm_i16_ty],
-                    [IntrNoMem]>;
   def int_x86_avx512_mask_pmov_wb_mem_256 :
         GCCBuiltin<"__builtin_ia32_pmovwb256mem_mask">,
           Intrinsic<[],
@@ -4739,10 +4727,6 @@ let TargetPrefix = "x86" in {
           Intrinsic<[],
                     [llvm_ptr_ty, llvm_v16i16_ty, llvm_i16_ty],
                     [IntrArgMemOnly]>;
-  def int_x86_avx512_mask_pmov_wb_512 : // FIXME: Replace with trunc+select.
-          Intrinsic<[llvm_v32i8_ty],
-                    [llvm_v32i16_ty, llvm_v32i8_ty, llvm_i32_ty],
-                    [IntrNoMem]>;
   def int_x86_avx512_mask_pmov_wb_mem_512 :
         GCCBuiltin<"__builtin_ia32_pmovwb512mem_mask">,
           Intrinsic<[],
diff --git a/lib/IR/AutoUpgrade.cpp b/lib/IR/AutoUpgrade.cpp
index e27d93ed317..8f49c8eac88 100644
--- a/lib/IR/AutoUpgrade.cpp
+++ b/lib/IR/AutoUpgrade.cpp
@@ -299,6 +299,10 @@ static bool ShouldUpgradeX86Intrinsic(Function *F, StringRef Name) {
       Name.startswith("avx512.mask.fpclass.p") || // Added in 7.0
       Name.startswith("avx512.mask.vpshufbitqmb.") || // Added in 8.0
       Name.startswith("avx512.mask.pmultishift.qb.") || // Added in 8.0
+      Name == "avx512.mask.pmov.qd.256" || // Added in 9.0
+      Name == "avx512.mask.pmov.qd.512" || // Added in 9.0
+      Name == "avx512.mask.pmov.wb.256" || // Added in 9.0
+      Name == "avx512.mask.pmov.wb.512" || // Added in 9.0
       Name == "sse.cvtsi2ss" || // Added in 7.0
       Name == "sse.cvtsi642ss" || // Added in 7.0
       Name == "sse2.cvtsi2sd" || // Added in 7.0
@@ -2131,6 +2135,14 @@ void llvm::UpgradeIntrinsicCall(CallInst *CI, Function *NewFn) {
     if (CI->getNumArgOperands() == 3)
       Rep = EmitX86Select(Builder, CI->getArgOperand(2), Rep,
                           CI->getArgOperand(1));
+  } else if (Name == "avx512.mask.pmov.qd.256" ||
+             Name == "avx512.mask.pmov.qd.512" ||
+             Name == "avx512.mask.pmov.wb.256" ||
+             Name == "avx512.mask.pmov.wb.512") {
+    Type *Ty = CI->getArgOperand(1)->getType();
+    Rep = Builder.CreateTrunc(CI->getArgOperand(0), Ty);
+    Rep = EmitX86Select(Builder, CI->getArgOperand(2), Rep,
+                        CI->getArgOperand(1));
   } else if (IsX86 && (Name.startswith("avx.vbroadcastf128") ||
                        Name == "avx2.vbroadcasti128")) {
     // Replace vbroadcastf128/vbroadcasti128 with a vector load+shuffle.
diff --git a/lib/Target/X86/X86IntrinsicsInfo.h b/lib/Target/X86/X86IntrinsicsInfo.h
index b032889937b..6ff3c65b1ac 100644
--- a/lib/Target/X86/X86IntrinsicsInfo.h
+++ b/lib/Target/X86/X86IntrinsicsInfo.h
@@ -739,10 +739,6 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
                      X86ISD::VTRUNC, X86ISD::VMTRUNC),
   X86_INTRINSIC_DATA(avx512_mask_pmov_qd_128, TRUNCATE_TO_REG,
                      X86ISD::VTRUNC, X86ISD::VMTRUNC),
-  X86_INTRINSIC_DATA(avx512_mask_pmov_qd_256, INTR_TYPE_1OP_MASK,
-                     ISD::TRUNCATE, 0),
-  X86_INTRINSIC_DATA(avx512_mask_pmov_qd_512, INTR_TYPE_1OP_MASK,
-                     ISD::TRUNCATE, 0),
   X86_INTRINSIC_DATA(avx512_mask_pmov_qw_128, TRUNCATE_TO_REG,
                      X86ISD::VTRUNC, X86ISD::VMTRUNC),
   X86_INTRINSIC_DATA(avx512_mask_pmov_qw_256, TRUNCATE_TO_REG,
@@ -751,10 +747,6 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
                      ISD::TRUNCATE, X86ISD::VMTRUNC),
   X86_INTRINSIC_DATA(avx512_mask_pmov_wb_128, TRUNCATE_TO_REG,
                      X86ISD::VTRUNC, X86ISD::VMTRUNC),
-  X86_INTRINSIC_DATA(avx512_mask_pmov_wb_256, INTR_TYPE_1OP_MASK,
-                     ISD::TRUNCATE, 0),
-  X86_INTRINSIC_DATA(avx512_mask_pmov_wb_512, INTR_TYPE_1OP_MASK,
-                     ISD::TRUNCATE, 0),
   X86_INTRINSIC_DATA(avx512_mask_pmovs_db_128, TRUNCATE_TO_REG,
                      X86ISD::VTRUNCS, X86ISD::VMTRUNCS),
   X86_INTRINSIC_DATA(avx512_mask_pmovs_db_256, TRUNCATE_TO_REG,
diff --git a/test/CodeGen/X86/avx512-intrinsics-upgrade.ll b/test/CodeGen/X86/avx512-intrinsics-upgrade.ll
index bfa7a58a4d9..04171979a5f 100644
--- a/test/CodeGen/X86/avx512-intrinsics-upgrade.ll
+++ b/test/CodeGen/X86/avx512-intrinsics-upgrade.ll
@@ -9703,3 +9703,34 @@ define <4 x float>@test_int_x86_avx512_maskz_vfmadd_ss_rm(<4 x float> %x0, <4 x
   %res = call <4 x float> @llvm.x86.avx512.maskz.vfmadd.ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %vecinit.i, i8 0, i32 4)
   ret < 4 x float> %res
 }
+
+declare <8 x i32> @llvm.x86.avx512.mask.pmov.qd.512(<8 x i64>, <8 x i32>, i8)
+
+define <8 x i32>@test_int_x86_avx512_mask_pmov_qd_512(<8 x i64> %x0, <8 x i32> %x1, i8 %x2) {
+; X86-LABEL: test_int_x86_avx512_mask_pmov_qd_512:
+; X86: ## %bb.0:
+; X86-NEXT: vpmovqd %zmm0, %ymm2 ## encoding: [0x62,0xf2,0x7e,0x48,0x35,0xc2]
+; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax ## encoding: [0x0f,0xb6,0x44,0x24,0x04]
+; X86-NEXT: kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8]
+; X86-NEXT: vpmovqd %zmm0, %ymm1 {%k1} ## encoding: [0x62,0xf2,0x7e,0x49,0x35,0xc1]
+; X86-NEXT: vpmovqd %zmm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7e,0xc9,0x35,0xc0]
+; X86-NEXT: vpaddd %ymm0, %ymm1, %ymm0 ## encoding: [0xc5,0xf5,0xfe,0xc0]
+; X86-NEXT: vpaddd %ymm0, %ymm2, %ymm0 ## encoding: [0xc5,0xed,0xfe,0xc0]
+; X86-NEXT: retl ## encoding: [0xc3]
+;
+; X64-LABEL: test_int_x86_avx512_mask_pmov_qd_512:
+; X64: ## %bb.0:
+; X64-NEXT: vpmovqd %zmm0, %ymm2 ## encoding: [0x62,0xf2,0x7e,0x48,0x35,0xc2]
+; X64-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; X64-NEXT: vpmovqd %zmm0, %ymm1 {%k1} ## encoding: [0x62,0xf2,0x7e,0x49,0x35,0xc1]
+; X64-NEXT: vpmovqd %zmm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7e,0xc9,0x35,0xc0]
+; X64-NEXT: vpaddd %ymm0, %ymm1, %ymm0 ## encoding: [0xc5,0xf5,0xfe,0xc0]
+; X64-NEXT: vpaddd %ymm0, %ymm2, %ymm0 ## encoding: [0xc5,0xed,0xfe,0xc0]
+; X64-NEXT: retq ## encoding: [0xc3]
+  %res0 = call <8 x i32> @llvm.x86.avx512.mask.pmov.qd.512(<8 x i64> %x0, <8 x i32> %x1, i8 -1)
+  %res1 = call <8 x i32> @llvm.x86.avx512.mask.pmov.qd.512(<8 x i64> %x0, <8 x i32> %x1, i8 %x2)
+  %res2 = call <8 x i32> @llvm.x86.avx512.mask.pmov.qd.512(<8 x i64> %x0, <8 x i32> zeroinitializer, i8 %x2)
+  %res3 = add <8 x i32> %res0, %res1
+  %res4 = add <8 x i32> %res3, %res2
+  ret <8 x i32> %res4
+}
diff --git a/test/CodeGen/X86/avx512-intrinsics.ll b/test/CodeGen/X86/avx512-intrinsics.ll
index 6a1d9d3da91..d24ed3b02d8 100644
--- a/test/CodeGen/X86/avx512-intrinsics.ll
+++ b/test/CodeGen/X86/avx512-intrinsics.ll
@@ -2813,24 +2813,26 @@ define void @test_int_x86_avx512_mask_pmovus_qw_mem_512(i8* %ptr, <8 x i64> %x1,
   ret void
 }
 
-declare <8 x i32> @llvm.x86.avx512.mask.pmov.qd.512(<8 x i64>, <8 x i32>, i8)
-
 define <8 x i32>@test_int_x86_avx512_mask_pmov_qd_512(<8 x i64> %x0, <8 x i32> %x1, i8 %x2) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_pmov_qd_512:
 ; CHECK: ## %bb.0:
+; CHECK-NEXT: vpmovqd %zmm0, %ymm2
 ; CHECK-NEXT: kmovw %edi, %k1
-; CHECK-NEXT: vpmovqd %zmm0, %ymm2 {%k1} {z}
 ; CHECK-NEXT: vpmovqd %zmm0, %ymm1 {%k1}
-; CHECK-NEXT: vpaddd %ymm2, %ymm1, %ymm1
-; CHECK-NEXT: vpmovqd %zmm0, %ymm0
-; CHECK-NEXT: vpaddd %ymm1, %ymm0, %ymm0
-; CHECK-NEXT: retq
-  %res0 = call <8 x i32> @llvm.x86.avx512.mask.pmov.qd.512(<8 x i64> %x0, <8 x i32> %x1, i8 -1)
-  %res1 = call <8 x i32> @llvm.x86.avx512.mask.pmov.qd.512(<8 x i64> %x0, <8 x i32> %x1, i8 %x2)
-  %res2 = call <8 x i32> @llvm.x86.avx512.mask.pmov.qd.512(<8 x i64> %x0, <8 x i32> zeroinitializer, i8 %x2)
-  %res3 = add <8 x i32> %res0, %res1
-  %res4 = add <8 x i32> %res3, %res2
-  ret <8 x i32> %res4
+; CHECK-NEXT: vpmovqd %zmm0, %ymm0 {%k1} {z}
+; CHECK-NEXT: vpaddd %ymm0, %ymm1, %ymm0
+; CHECK-NEXT: vpaddd %ymm0, %ymm2, %ymm0
+; CHECK-NEXT: retq
+  %1 = trunc <8 x i64> %x0 to <8 x i32>
+  %2 = trunc <8 x i64> %x0 to <8 x i32>
+  %3 = bitcast i8 %x2 to <8 x i1>
+  %4 = select <8 x i1> %3, <8 x i32> %2, <8 x i32> %x1
+  %5 = trunc <8 x i64> %x0 to <8 x i32>
+  %6 = bitcast i8 %x2 to <8 x i1>
+  %7 = select <8 x i1> %6, <8 x i32> %5, <8 x i32> zeroinitializer
+  %res3 = add <8 x i32> %1, %4
+  %res4 = add <8 x i32> %res3, %7
+  ret <8 x i32> %res4
 }
 
 declare void @llvm.x86.avx512.mask.pmov.qd.mem.512(i8* %ptr, <8 x i64>, i8)
diff --git a/test/CodeGen/X86/avx512bw-intrinsics-upgrade.ll b/test/CodeGen/X86/avx512bw-intrinsics-upgrade.ll
index f62156a6fa8..beea392fc17 100644
--- a/test/CodeGen/X86/avx512bw-intrinsics-upgrade.ll
+++ b/test/CodeGen/X86/avx512bw-intrinsics-upgrade.ll
@@ -3984,3 +3984,33 @@ define <32 x i16>@test_int_x86_avx512_mask_psllv32hi(<32 x i16> %x0, <32 x i16>
   %res4 = add <32 x i16> %res3, %res2
   ret <32 x i16> %res4
 }
+
+declare <32 x i8> @llvm.x86.avx512.mask.pmov.wb.512(<32 x i16>, <32 x i8>, i32)
+
+define <32 x i8>@test_int_x86_avx512_mask_pmov_wb_512(<32 x i16> %x0, <32 x i8> %x1, i32 %x2) {
+; X86-LABEL: test_int_x86_avx512_mask_pmov_wb_512:
+; X86: # %bb.0:
+; X86-NEXT: vpmovwb %zmm0, %ymm2 # encoding: [0x62,0xf2,0x7e,0x48,0x30,0xc2]
+; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04]
+; X86-NEXT: vpmovwb %zmm0, %ymm1 {%k1} # encoding: [0x62,0xf2,0x7e,0x49,0x30,0xc1]
+; X86-NEXT: vpmovwb %zmm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0xc9,0x30,0xc0]
+; X86-NEXT: vpaddb %ymm0, %ymm1, %ymm0 # encoding: [0xc5,0xf5,0xfc,0xc0]
+; X86-NEXT: vpaddb %ymm0, %ymm2, %ymm0 # encoding: [0xc5,0xed,0xfc,0xc0]
+; X86-NEXT: retl # encoding: [0xc3]
+;
+; X64-LABEL: test_int_x86_avx512_mask_pmov_wb_512:
+; X64: # %bb.0:
+; X64-NEXT: vpmovwb %zmm0, %ymm2 # encoding: [0x62,0xf2,0x7e,0x48,0x30,0xc2]
+; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
+; X64-NEXT: vpmovwb %zmm0, %ymm1 {%k1} # encoding: [0x62,0xf2,0x7e,0x49,0x30,0xc1]
+; X64-NEXT: vpmovwb %zmm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0xc9,0x30,0xc0]
+; X64-NEXT: vpaddb %ymm0, %ymm1, %ymm0 # encoding: [0xc5,0xf5,0xfc,0xc0]
+; X64-NEXT: vpaddb %ymm0, %ymm2, %ymm0 # encoding: [0xc5,0xed,0xfc,0xc0]
+; X64-NEXT: retq # encoding: [0xc3]
+  %res0 = call <32 x i8> @llvm.x86.avx512.mask.pmov.wb.512(<32 x i16> %x0, <32 x i8> %x1, i32 -1)
+  %res1 = call <32 x i8> @llvm.x86.avx512.mask.pmov.wb.512(<32 x i16> %x0, <32 x i8> %x1, i32 %x2)
+  %res2 = call <32 x i8> @llvm.x86.avx512.mask.pmov.wb.512(<32 x i16> %x0, <32 x i8> zeroinitializer, i32 %x2)
+  %res3 = add <32 x i8> %res0, %res1
+  %res4 = add <32 x i8> %res3, %res2
+  ret <32 x i8> %res4
+}
diff --git a/test/CodeGen/X86/avx512bw-intrinsics.ll b/test/CodeGen/X86/avx512bw-intrinsics.ll
index a220ab0ad73..71cfd891bb1 100644
--- a/test/CodeGen/X86/avx512bw-intrinsics.ll
+++ b/test/CodeGen/X86/avx512bw-intrinsics.ll
@@ -890,34 +890,36 @@ define <32 x i16> @test_int_x86_avx512_mask_pmulhr_sw_512(<32 x i16> %x0, <32 x
   ret <32 x i16> %res2
 }
 
-declare <32 x i8> @llvm.x86.avx512.mask.pmov.wb.512(<32 x i16>, <32 x i8>, i32)
-
 define <32 x i8>@test_int_x86_avx512_mask_pmov_wb_512(<32 x i16> %x0, <32 x i8> %x1, i32 %x2) {
 ; X86-LABEL: test_int_x86_avx512_mask_pmov_wb_512:
 ; X86: # %bb.0:
+; X86-NEXT: vpmovwb %zmm0, %ymm2 # encoding: [0x62,0xf2,0x7e,0x48,0x30,0xc2]
 ; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04]
 ; X86-NEXT: vpmovwb %zmm0, %ymm1 {%k1} # encoding: [0x62,0xf2,0x7e,0x49,0x30,0xc1]
-; X86-NEXT: vpmovwb %zmm0, %ymm2 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0xc9,0x30,0xc2]
-; X86-NEXT: vpaddb %ymm2, %ymm1, %ymm1 # encoding: [0xc5,0xf5,0xfc,0xca]
-; X86-NEXT: vpmovwb %zmm0, %ymm0 # encoding: [0x62,0xf2,0x7e,0x48,0x30,0xc0]
-; X86-NEXT: vpaddb %ymm1, %ymm0, %ymm0 # encoding: [0xc5,0xfd,0xfc,0xc1]
+; X86-NEXT: vpmovwb %zmm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0xc9,0x30,0xc0]
+; X86-NEXT: vpaddb %ymm0, %ymm1, %ymm0 # encoding: [0xc5,0xf5,0xfc,0xc0]
+; X86-NEXT: vpaddb %ymm0, %ymm2, %ymm0 # encoding: [0xc5,0xed,0xfc,0xc0]
 ; X86-NEXT: retl # encoding: [0xc3]
 ;
 ; X64-LABEL: test_int_x86_avx512_mask_pmov_wb_512:
 ; X64: # %bb.0:
+; X64-NEXT: vpmovwb %zmm0, %ymm2 # encoding: [0x62,0xf2,0x7e,0x48,0x30,0xc2]
 ; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
 ; X64-NEXT: vpmovwb %zmm0, %ymm1 {%k1} # encoding: [0x62,0xf2,0x7e,0x49,0x30,0xc1]
-; X64-NEXT: vpmovwb %zmm0, %ymm2 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0xc9,0x30,0xc2]
-; X64-NEXT: vpaddb %ymm2, %ymm1, %ymm1 # encoding: [0xc5,0xf5,0xfc,0xca]
-; X64-NEXT: vpmovwb %zmm0, %ymm0 # encoding: [0x62,0xf2,0x7e,0x48,0x30,0xc0]
-; X64-NEXT: vpaddb %ymm1, %ymm0, %ymm0 # encoding: [0xc5,0xfd,0xfc,0xc1]
-; X64-NEXT: retq # encoding: [0xc3]
-  %res0 = call <32 x i8> @llvm.x86.avx512.mask.pmov.wb.512(<32 x i16> %x0, <32 x i8> %x1, i32 -1)
-  %res1 = call <32 x i8> @llvm.x86.avx512.mask.pmov.wb.512(<32 x i16> %x0, <32 x i8> %x1, i32 %x2)
-  %res2 = call <32 x i8> @llvm.x86.avx512.mask.pmov.wb.512(<32 x i16> %x0, <32 x i8> zeroinitializer, i32 %x2)
-  %res3 = add <32 x i8> %res0, %res1
-  %res4 = add <32 x i8> %res3, %res2
-  ret <32 x i8> %res4
+; X64-NEXT: vpmovwb %zmm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0xc9,0x30,0xc0]
+; X64-NEXT: vpaddb %ymm0, %ymm1, %ymm0 # encoding: [0xc5,0xf5,0xfc,0xc0]
+; X64-NEXT: vpaddb %ymm0, %ymm2, %ymm0 # encoding: [0xc5,0xed,0xfc,0xc0]
+; X64-NEXT: retq # encoding: [0xc3]
+  %1 = trunc <32 x i16> %x0 to <32 x i8>
+  %2 = trunc <32 x i16> %x0 to <32 x i8>
+  %3 = bitcast i32 %x2 to <32 x i1>
+  %4 = select <32 x i1> %3, <32 x i8> %2, <32 x i8> %x1
+  %5 = trunc <32 x i16> %x0 to <32 x i8>
+  %6 = bitcast i32 %x2 to <32 x i1>
+  %7 = select <32 x i1> %6, <32 x i8> %5, <32 x i8> zeroinitializer
+  %res3 = add <32 x i8> %1, %4
+  %res4 = add <32 x i8> %res3, %7
+  ret <32 x i8> %res4
 }
 
 declare void @llvm.x86.avx512.mask.pmov.wb.mem.512(i8* %ptr, <32 x i16>, i32)
diff --git a/test/CodeGen/X86/avx512bwvl-intrinsics-upgrade.ll b/test/CodeGen/X86/avx512bwvl-intrinsics-upgrade.ll
index bbc97999b8c..0c20b267984 100644
--- a/test/CodeGen/X86/avx512bwvl-intrinsics-upgrade.ll
+++ b/test/CodeGen/X86/avx512bwvl-intrinsics-upgrade.ll
@@ -8808,3 +8808,35 @@ define <8 x i16>@test_int_x86_avx512_mask_psrlv8_hi(<8 x i16> %x0, <8 x i16> %x1
   %res4 = add <8 x i16> %res3, %res2
   ret <8 x i16> %res4
 }
+
+declare <16 x i8> @llvm.x86.avx512.mask.pmov.wb.256(<16 x i16>, <16 x i8>, i16)
+
+define <16 x i8>@test_int_x86_avx512_mask_pmov_wb_256(<16 x i16> %x0, <16 x i8> %x1, i16 %x2) {
+; X86-LABEL: test_int_x86_avx512_mask_pmov_wb_256:
+; X86: # %bb.0:
+; X86-NEXT: vpmovwb %ymm0, %xmm2 # encoding: [0x62,0xf2,0x7e,0x28,0x30,0xc2]
+; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04]
+; X86-NEXT: vpmovwb %ymm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7e,0x29,0x30,0xc1]
+; X86-NEXT: vpmovwb %ymm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0xa9,0x30,0xc0]
+; X86-NEXT: vpaddb %xmm0, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfc,0xc0]
+; X86-NEXT: vpaddb %xmm0, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfc,0xc0]
+; X86-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77]
+; X86-NEXT: retl # encoding: [0xc3]
+;
+; X64-LABEL: test_int_x86_avx512_mask_pmov_wb_256:
+; X64: # %bb.0:
+; X64-NEXT: vpmovwb %ymm0, %xmm2 # encoding: [0x62,0xf2,0x7e,0x28,0x30,0xc2]
+; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
+; X64-NEXT: vpmovwb %ymm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7e,0x29,0x30,0xc1]
+; X64-NEXT: vpmovwb %ymm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0xa9,0x30,0xc0]
+; X64-NEXT: vpaddb %xmm0, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfc,0xc0]
+; X64-NEXT: vpaddb %xmm0, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfc,0xc0]
+; X64-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77]
+; X64-NEXT: retq # encoding: [0xc3]
+  %res0 = call <16 x i8> @llvm.x86.avx512.mask.pmov.wb.256(<16 x i16> %x0, <16 x i8> %x1, i16 -1)
+  %res1 = call <16 x i8> @llvm.x86.avx512.mask.pmov.wb.256(<16 x i16> %x0, <16 x i8> %x1, i16 %x2)
+  %res2 = call <16 x i8> @llvm.x86.avx512.mask.pmov.wb.256(<16 x i16> %x0, <16 x i8> zeroinitializer, i16 %x2)
+  %res3 = add <16 x i8> %res0, %res1
+  %res4 = add <16 x i8> %res3, %res2
+  ret <16 x i8> %res4
+}
diff --git a/test/CodeGen/X86/avx512bwvl-intrinsics.ll b/test/CodeGen/X86/avx512bwvl-intrinsics.ll
index a01252f7d49..b9937603f16 100644
--- a/test/CodeGen/X86/avx512bwvl-intrinsics.ll
+++ b/test/CodeGen/X86/avx512bwvl-intrinsics.ll
@@ -1611,36 +1611,38 @@ define void @test_int_x86_avx512_mask_pmovus_wb_mem_128(i8* %ptr, <8 x i16> %x1,
   ret void
 }
 
-declare <16 x i8> @llvm.x86.avx512.mask.pmov.wb.256(<16 x i16>, <16 x i8>, i16)
-
 define <16 x i8>@test_int_x86_avx512_mask_pmov_wb_256(<16 x i16> %x0, <16 x i8> %x1, i16 %x2) {
 ; X86-LABEL: test_int_x86_avx512_mask_pmov_wb_256:
 ; X86: # %bb.0:
+; X86-NEXT: vpmovwb %ymm0, %xmm2 # encoding: [0x62,0xf2,0x7e,0x28,0x30,0xc2]
 ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04]
 ; X86-NEXT: vpmovwb %ymm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7e,0x29,0x30,0xc1]
-; X86-NEXT: vpmovwb %ymm0, %xmm2 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0xa9,0x30,0xc2]
-; X86-NEXT: vpaddb %xmm2, %xmm1, %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfc,0xca]
-; X86-NEXT: vpmovwb %ymm0, %xmm0 # encoding: [0x62,0xf2,0x7e,0x28,0x30,0xc0]
-; X86-NEXT: vpaddb %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfc,0xc1]
+; X86-NEXT: vpmovwb %ymm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0xa9,0x30,0xc0]
+; X86-NEXT: vpaddb %xmm0, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfc,0xc0]
+; X86-NEXT: vpaddb %xmm0, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfc,0xc0]
 ; X86-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77]
 ; X86-NEXT: retl # encoding: [0xc3]
 ;
 ; X64-LABEL: test_int_x86_avx512_mask_pmov_wb_256:
 ; X64: # %bb.0:
+; X64-NEXT: vpmovwb %ymm0, %xmm2 # encoding: [0x62,0xf2,0x7e,0x28,0x30,0xc2]
 ; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
-; X64-NEXT: vpmovwb %ymm0, %xmm2 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0xa9,0x30,0xc2]
 ; X64-NEXT: vpmovwb %ymm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7e,0x29,0x30,0xc1]
-; X64-NEXT: vpaddb %xmm2, %xmm1, %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfc,0xca]
-; X64-NEXT: vpmovwb %ymm0, %xmm0 # encoding: [0x62,0xf2,0x7e,0x28,0x30,0xc0]
-; X64-NEXT: vpaddb %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfc,0xc1]
+; X64-NEXT: vpmovwb %ymm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0xa9,0x30,0xc0]
+; X64-NEXT: vpaddb %xmm0, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfc,0xc0]
+; X64-NEXT: vpaddb %xmm0, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfc,0xc0]
 ; X64-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77]
 ; X64-NEXT: retq # encoding: [0xc3]
-  %res0 = call <16 x i8> @llvm.x86.avx512.mask.pmov.wb.256(<16 x i16> %x0, <16 x i8> %x1, i16 -1)
-  %res1 = call <16 x i8> @llvm.x86.avx512.mask.pmov.wb.256(<16 x i16> %x0, <16 x i8> %x1, i16 %x2)
-  %res2 = call <16 x i8> @llvm.x86.avx512.mask.pmov.wb.256(<16 x i16> %x0, <16 x i8> zeroinitializer, i16 %x2)
-  %res3 = add <16 x i8> %res0, %res1
-  %res4 = add <16 x i8> %res3, %res2
-  ret <16 x i8> %res4
+  %1 = trunc <16 x i16> %x0 to <16 x i8>
+  %2 = trunc <16 x i16> %x0 to <16 x i8>
+  %3 = bitcast i16 %x2 to <16 x i1>
+  %4 = select <16 x i1> %3, <16 x i8> %2, <16 x i8> %x1
+  %5 = trunc <16 x i16> %x0 to <16 x i8>
+  %6 = bitcast i16 %x2 to <16 x i1>
+  %7 = select <16 x i1> %6, <16 x i8> %5, <16 x i8> zeroinitializer
+  %res3 = add <16 x i8> %1, %4
+  %res4 = add <16 x i8> %res3, %7
+  ret <16 x i8> %res4
 }
 
 declare void @llvm.x86.avx512.mask.pmov.wb.mem.256(i8* %ptr, <16 x i16>, i16)
diff --git a/test/CodeGen/X86/avx512vl-intrinsics-upgrade.ll b/test/CodeGen/X86/avx512vl-intrinsics-upgrade.ll
index f8507aee001..1e46a41d59b 100644
--- a/test/CodeGen/X86/avx512vl-intrinsics-upgrade.ll
+++ b/test/CodeGen/X86/avx512vl-intrinsics-upgrade.ll
@@ -14552,3 +14552,36 @@ define <4 x double> @test_mask_vfmadd256_pd_rmkz(<4 x double> %a0, <4 x double>
   %res = call <4 x double> @llvm.x86.avx512.mask.vfmadd.pd.256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2, i8 -1) nounwind
   ret <4 x double> %res
 }
+
+declare <4 x i32> @llvm.x86.avx512.mask.pmov.qd.256(<4 x i64>, <4 x i32>, i8)
+
+define <4 x i32>@test_int_x86_avx512_mask_pmov_qd_256(<4 x i64> %x0, <4 x i32> %x1, i8 %x2) {
+; X86-LABEL: test_int_x86_avx512_mask_pmov_qd_256:
+; X86: # %bb.0:
+; X86-NEXT: vpmovqd %ymm0, %xmm2 # encoding: [0x62,0xf2,0x7e,0x28,0x35,0xc2]
+; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
+; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
+; X86-NEXT: vpmovqd %ymm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7e,0x29,0x35,0xc1]
+; X86-NEXT: vpmovqd %ymm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0xa9,0x35,0xc0]
+; X86-NEXT: vpaddd %xmm0, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfe,0xc0]
+; X86-NEXT: vpaddd %xmm0, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfe,0xc0]
+; X86-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77]
+; X86-NEXT: retl # encoding: [0xc3]
+;
+; X64-LABEL: test_int_x86_avx512_mask_pmov_qd_256:
+; X64: # %bb.0:
+; X64-NEXT: vpmovqd %ymm0, %xmm2 # encoding: [0x62,0xf2,0x7e,0x28,0x35,0xc2]
+; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
+; X64-NEXT: vpmovqd %ymm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7e,0x29,0x35,0xc1]
+; X64-NEXT: vpmovqd %ymm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0xa9,0x35,0xc0]
+; X64-NEXT: vpaddd %xmm0, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfe,0xc0]
+; X64-NEXT: vpaddd %xmm0, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfe,0xc0]
+; X64-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77]
+; X64-NEXT: retq # encoding: [0xc3]
+  %res0 = call <4 x i32> @llvm.x86.avx512.mask.pmov.qd.256(<4 x i64> %x0, <4 x i32> %x1, i8 -1)
+  %res1 = call <4 x i32> @llvm.x86.avx512.mask.pmov.qd.256(<4 x i64> %x0, <4 x i32> %x1, i8 %x2)
+  %res2 = call <4 x i32> @llvm.x86.avx512.mask.pmov.qd.256(<4 x i64> %x0, <4 x i32> zeroinitializer, i8 %x2)
+  %res3 = add <4 x i32> %res0, %res1
+  %res4 = add <4 x i32> %res3, %res2
+  ret <4 x i32> %res4
+}
diff --git a/test/CodeGen/X86/avx512vl-intrinsics.ll b/test/CodeGen/X86/avx512vl-intrinsics.ll
index 912715ba909..841c6872f73 100644
--- a/test/CodeGen/X86/avx512vl-intrinsics.ll
+++ b/test/CodeGen/X86/avx512vl-intrinsics.ll
@@ -2446,37 +2446,41 @@ define void @test_int_x86_avx512_mask_pmovus_qd_mem_128(i8* %ptr, <2 x i64> %x1,
   ret void
 }
 
-declare <4 x i32> @llvm.x86.avx512.mask.pmov.qd.256(<4 x i64>, <4 x i32>, i8)
-
 define <4 x i32>@test_int_x86_avx512_mask_pmov_qd_256(<4 x i64> %x0, <4 x i32> %x1, i8 %x2) {
 ; X86-LABEL: test_int_x86_avx512_mask_pmov_qd_256:
 ; X86: # %bb.0:
+; X86-NEXT: vpmovqd %ymm0, %xmm2 # encoding: [0x62,0xf2,0x7e,0x28,0x35,0xc2]
 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
 ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
 ; X86-NEXT: vpmovqd %ymm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7e,0x29,0x35,0xc1]
-; X86-NEXT: vpmovqd %ymm0, %xmm2 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0xa9,0x35,0xc2]
-; X86-NEXT: vpaddd %xmm2, %xmm1, %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfe,0xca]
-; X86-NEXT: vpmovqd %ymm0, %xmm0 # encoding: [0x62,0xf2,0x7e,0x28,0x35,0xc0]
-; X86-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfe,0xc1]
+; X86-NEXT: vpmovqd %ymm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0xa9,0x35,0xc0]
+; X86-NEXT: vpaddd %xmm0, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfe,0xc0]
+; X86-NEXT: vpaddd %xmm0, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfe,0xc0]
 ; X86-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77]
 ; X86-NEXT: retl # encoding: [0xc3]
 ;
 ; X64-LABEL: test_int_x86_avx512_mask_pmov_qd_256:
 ; X64: # %bb.0:
+; X64-NEXT: vpmovqd %ymm0, %xmm2 # encoding: [0x62,0xf2,0x7e,0x28,0x35,0xc2]
 ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
-; X64-NEXT: vpmovqd %ymm0, %xmm2 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0xa9,0x35,0xc2]
 ; X64-NEXT: vpmovqd %ymm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7e,0x29,0x35,0xc1]
-; X64-NEXT: vpaddd %xmm2, %xmm1, %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfe,0xca]
-; X64-NEXT: vpmovqd %ymm0, %xmm0 # encoding: [0x62,0xf2,0x7e,0x28,0x35,0xc0]
-; X64-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfe,0xc1]
+; X64-NEXT: vpmovqd %ymm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0xa9,0x35,0xc0]
+; X64-NEXT: vpaddd %xmm0, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfe,0xc0]
+; X64-NEXT: vpaddd %xmm0, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfe,0xc0]
 ; X64-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77]
 ; X64-NEXT: retq # encoding: [0xc3]
-  %res0 = call <4 x i32> @llvm.x86.avx512.mask.pmov.qd.256(<4 x i64> %x0, <4 x i32> %x1, i8 -1)
-  %res1 = call <4 x i32> @llvm.x86.avx512.mask.pmov.qd.256(<4 x i64> %x0, <4 x i32> %x1, i8 %x2)
-  %res2 = call <4 x i32> @llvm.x86.avx512.mask.pmov.qd.256(<4 x i64> %x0, <4 x i32> zeroinitializer, i8 %x2)
-  %res3 = add <4 x i32> %res0, %res1
-  %res4 = add <4 x i32> %res3, %res2
-  ret <4 x i32> %res4
+  %1 = trunc <4 x i64> %x0 to <4 x i32>
+  %2 = trunc <4 x i64> %x0 to <4 x i32>
+  %3 = bitcast i8 %x2 to <8 x i1>
+  %extract1 = shufflevector <8 x i1> %3, <8 x i1> %3, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %4 = select <4 x i1> %extract1, <4 x i32> %2, <4 x i32> %x1
+  %5 = trunc <4 x i64> %x0 to <4 x i32>
+  %6 = bitcast i8 %x2 to <8 x i1>
+  %extract = shufflevector <8 x i1> %6, <8 x i1> %6, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %7 = select <4 x i1> %extract, <4 x i32> %5, <4 x i32> zeroinitializer
+  %res3 = add <4 x i32> %1, %4
+  %res4 = add <4 x i32> %res3, %7
+  ret <4 x i32> %res4
 }
 
 declare void @llvm.x86.avx512.mask.pmov.qd.mem.256(i8* %ptr, <4 x i64>, i8)
diff --git a/test/CodeGen/X86/stack-folding-int-avx512.ll b/test/CodeGen/X86/stack-folding-int-avx512.ll
index 5464775a43a..3efc63b38ba 100644
--- a/test/CodeGen/X86/stack-folding-int-avx512.ll
+++ b/test/CodeGen/X86/stack-folding-int-avx512.ll
@@ -999,7 +999,7 @@ define <2 x i64> @stack_fold_movq_load(<2 x i64> %a0) {
 define <8 x i32> @stack_fold_vpmovqd(<8 x i64> %a0) {
   ;CHECK-LABEL: stack_fold_vpmovqd
   ;CHECK: vpmovqd %zmm0, {{-?[0-9]*}}(%rsp) # 32-byte Folded Spill
-  %1 = call <8 x i32> @llvm.x86.avx512.mask.pmov.qd.512(<8 x i64> %a0, <8 x i32> undef, i8 -1)
+  %1 = trunc <8 x i64> %a0 to <8 x i32>
   %2 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   ret <8 x i32> %1
 }
@@ -1017,7 +1017,7 @@ declare <8 x i16> @llvm.x86.avx512.mask.pmov.qw.512(<8 x i64>, <8 x i16>, i8)
 define <32 x i8> @stack_fold_vpmovwb(<32 x i16> %a0) {
   ;CHECK-LABEL: stack_fold_vpmovwb
   ;CHECK: vpmovwb %zmm0, {{-?[0-9]*}}(%rsp) # 32-byte Folded Spill
-  %1 = call <32 x i8> @llvm.x86.avx512.mask.pmov.wb.512(<32 x i16> %a0, <32 x i8> undef, i32 -1)
+  %1 = trunc <32 x i16> %a0 to <32 x i8>
   %2 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   ret <32 x i8> %1
 }
diff --git a/test/CodeGen/X86/stack-folding-int-avx512vl.ll b/test/CodeGen/X86/stack-folding-int-avx512vl.ll
index f01bc61c7f7..a3d7da21c5f 100644
--- a/test/CodeGen/X86/stack-folding-int-avx512vl.ll
+++ b/test/CodeGen/X86/stack-folding-int-avx512vl.ll
@@ -1275,7 +1275,7 @@ declare <8 x i16> @llvm.x86.avx512.mask.pmov.dw.256(<8 x i32>, <8 x i16>, i8)
 define <4 x i32> @stack_fold_vpmovqd(<4 x i64> %a0) {
   ;CHECK-LABEL: stack_fold_vpmovqd
   ;CHECK: vpmovqd %ymm0, {{-?[0-9]*}}(%rsp) # 16-byte Folded Spill
-  %1 = call <4 x i32> @llvm.x86.avx512.mask.pmov.qd.256(<4 x i64> %a0, <4 x i32> undef, i8 -1)
+  %1 = trunc <4 x i64> %a0 to <4 x i32>
   %2 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   ret <4 x i32> %1
 }
@@ -1284,7 +1284,7 @@ declare <4 x i32> @llvm.x86.avx512.mask.pmov.qd.256(<4 x i64>, <4 x i32>, i8)
 define <16 x i8> @stack_fold_vpmovwb(<16 x i16> %a0) {
   ;CHECK-LABEL: stack_fold_vpmovwb
   ;CHECK: vpmovwb %ymm0, {{-?[0-9]*}}(%rsp) # 16-byte Folded Spill
-  %1 = call <16 x i8> @llvm.x86.avx512.mask.pmov.wb.256(<16 x i16> %a0, <16 x i8> undef, i16 -1)
+  %1 = trunc <16 x i16> %a0 to <16 x i8>
   %2 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   ret <16 x i8> %1
 }
-- 
2.50.1