From 3f2027522c83fcb4fd8629b6395cec82bfda9cd1 Mon Sep 17 00:00:00 2001 From: Elena Demikhovsky Date: Mon, 15 Dec 2014 10:03:52 +0000 Subject: [PATCH] AVX-512: Added EXPAND instructions and intrinsics. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@224241 91177308-0d34-0410-b5e6-96231b3b80d8 --- include/llvm/IR/IntrinsicsX86.td | 102 ++++++++++++++++++++++++ lib/Target/X86/X86ISelLowering.cpp | 31 ++++++- lib/Target/X86/X86InstrAVX512.td | 55 +++++++++++++ lib/Target/X86/X86InstrFragmentsSIMD.td | 3 + lib/Target/X86/X86IntrinsicsInfo.h | 76 ++++++++++++++---- test/CodeGen/X86/avx512vl-intrinsics.ll | 91 +++++++++++++++++++++ 6 files changed, 343 insertions(+), 15 deletions(-) diff --git a/include/llvm/IR/IntrinsicsX86.td b/include/llvm/IR/IntrinsicsX86.td index 40b2ba28f2e..cf3e3a6e02d 100644 --- a/include/llvm/IR/IntrinsicsX86.td +++ b/include/llvm/IR/IntrinsicsX86.td @@ -3584,6 +3584,108 @@ let TargetPrefix = "x86" in { GCCBuiltin<"__builtin_ia32_compressstoredi128_mask">, Intrinsic<[], [llvm_ptr_ty, llvm_v2i64_ty, llvm_i8_ty], [IntrReadWriteArgMem]>; + +// expand + def int_x86_avx512_mask_expand_ps_512 : + GCCBuiltin<"__builtin_ia32_expandsf512_mask">, + Intrinsic<[llvm_v16f32_ty], [llvm_v16f32_ty, llvm_v16f32_ty, + llvm_i16_ty], [IntrNoMem]>; + def int_x86_avx512_mask_expand_pd_512 : + GCCBuiltin<"__builtin_ia32_expanddf512_mask">, + Intrinsic<[llvm_v8f64_ty], [llvm_v8f64_ty, llvm_v8f64_ty, + llvm_i8_ty], [IntrNoMem]>; + def int_x86_avx512_mask_expand_ps_256 : + GCCBuiltin<"__builtin_ia32_expandsf256_mask">, + Intrinsic<[llvm_v8f32_ty], [llvm_v8f32_ty, llvm_v8f32_ty, + llvm_i8_ty], [IntrNoMem]>; + def int_x86_avx512_mask_expand_pd_256 : + GCCBuiltin<"__builtin_ia32_expanddf256_mask">, + Intrinsic<[llvm_v4f64_ty], [llvm_v4f64_ty, llvm_v4f64_ty, + llvm_i8_ty], [IntrNoMem]>; + def int_x86_avx512_mask_expand_ps_128 : + GCCBuiltin<"__builtin_ia32_expandsf128_mask">, + Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_v4f32_ty, + llvm_i8_ty], [IntrNoMem]>; + def int_x86_avx512_mask_expand_pd_128 : + GCCBuiltin<"__builtin_ia32_expanddf128_mask">, + Intrinsic<[llvm_v2f64_ty], [llvm_v2f64_ty, llvm_v2f64_ty, + llvm_i8_ty], [IntrNoMem]>; + + def int_x86_avx512_mask_expand_load_ps_512 : + GCCBuiltin<"__builtin_ia32_expandloadsf512_mask">, + Intrinsic<[llvm_v16f32_ty], [llvm_ptr_ty, llvm_v16f32_ty, + llvm_i16_ty], [IntrReadArgMem]>; + def int_x86_avx512_mask_expand_load_pd_512 : + GCCBuiltin<"__builtin_ia32_expandloaddf512_mask">, + Intrinsic<[llvm_v8f64_ty], [llvm_ptr_ty, llvm_v8f64_ty, + llvm_i8_ty], [IntrReadArgMem]>; + def int_x86_avx512_mask_expand_load_ps_256 : + GCCBuiltin<"__builtin_ia32_expandloadsf256_mask">, + Intrinsic<[llvm_v8f32_ty], [llvm_ptr_ty, llvm_v8f32_ty, + llvm_i8_ty], [IntrReadArgMem]>; + def int_x86_avx512_mask_expand_load_pd_256 : + GCCBuiltin<"__builtin_ia32_expandloaddf256_mask">, + Intrinsic<[llvm_v4f64_ty], [llvm_ptr_ty, llvm_v4f64_ty, + llvm_i8_ty], [IntrReadArgMem]>; + def int_x86_avx512_mask_expand_load_ps_128 : + GCCBuiltin<"__builtin_ia32_expandloadsf128_mask">, + Intrinsic<[llvm_v4f32_ty], [llvm_ptr_ty, llvm_v4f32_ty, + llvm_i8_ty], [IntrReadArgMem]>; + def int_x86_avx512_mask_expand_load_pd_128 : + GCCBuiltin<"__builtin_ia32_expandloaddf128_mask">, + Intrinsic<[llvm_v2f64_ty], [llvm_ptr_ty, llvm_v2f64_ty, + llvm_i8_ty], [IntrReadArgMem]>; + + def int_x86_avx512_mask_expand_d_512 : + GCCBuiltin<"__builtin_ia32_expandsi512_mask">, + Intrinsic<[llvm_v16i32_ty], [llvm_v16i32_ty, llvm_v16i32_ty, + llvm_i16_ty], [IntrNoMem]>; + def 
int_x86_avx512_mask_expand_q_512 : + GCCBuiltin<"__builtin_ia32_expanddi512_mask">, + Intrinsic<[llvm_v8i64_ty], [llvm_v8i64_ty, llvm_v8i64_ty, + llvm_i8_ty], [IntrNoMem]>; + def int_x86_avx512_mask_expand_d_256 : + GCCBuiltin<"__builtin_ia32_expandsi256_mask">, + Intrinsic<[llvm_v8i32_ty], [llvm_v8i32_ty, llvm_v8i32_ty, + llvm_i8_ty], [IntrNoMem]>; + def int_x86_avx512_mask_expand_q_256 : + GCCBuiltin<"__builtin_ia32_expanddi256_mask">, + Intrinsic<[llvm_v4i64_ty], [llvm_v4i64_ty, llvm_v4i64_ty, + llvm_i8_ty], [IntrNoMem]>; + def int_x86_avx512_mask_expand_d_128 : + GCCBuiltin<"__builtin_ia32_expandsi128_mask">, + Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_v4i32_ty, + llvm_i8_ty], [IntrNoMem]>; + def int_x86_avx512_mask_expand_q_128 : + GCCBuiltin<"__builtin_ia32_expanddi128_mask">, + Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty, llvm_v2i64_ty, + llvm_i8_ty], [IntrNoMem]>; + + def int_x86_avx512_mask_expand_load_d_512 : + GCCBuiltin<"__builtin_ia32_expandloadsi512_mask">, + Intrinsic<[llvm_v16i32_ty], [llvm_ptr_ty, llvm_v16i32_ty, + llvm_i16_ty], [IntrReadArgMem]>; + def int_x86_avx512_mask_expand_load_q_512 : + GCCBuiltin<"__builtin_ia32_expandloaddi512_mask">, + Intrinsic<[llvm_v8i64_ty], [llvm_ptr_ty, llvm_v8i64_ty, + llvm_i8_ty], [IntrReadArgMem]>; + def int_x86_avx512_mask_expand_load_d_256 : + GCCBuiltin<"__builtin_ia32_expandloadsi256_mask">, + Intrinsic<[llvm_v8i32_ty], [llvm_ptr_ty, llvm_v8i32_ty, + llvm_i8_ty], [IntrReadArgMem]>; + def int_x86_avx512_mask_expand_load_q_256 : + GCCBuiltin<"__builtin_ia32_expandloaddi256_mask">, + Intrinsic<[llvm_v4i64_ty], [llvm_ptr_ty, llvm_v4i64_ty, + llvm_i8_ty], [IntrReadArgMem]>; + def int_x86_avx512_mask_expand_load_d_128 : + GCCBuiltin<"__builtin_ia32_expandloadsi128_mask">, + Intrinsic<[llvm_v4i32_ty], [llvm_ptr_ty, llvm_v4i32_ty, + llvm_i8_ty], [IntrReadArgMem]>; + def int_x86_avx512_mask_expand_load_q_128 : + GCCBuiltin<"__builtin_ia32_expandloaddi128_mask">, + Intrinsic<[llvm_v2i64_ty], [llvm_ptr_ty, llvm_v2i64_ty, + llvm_i8_ty], [IntrReadArgMem]>; + } // Misc. 
let TargetPrefix = "x86" in { diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp index 306260924ff..e029b4b13f2 100644 --- a/lib/Target/X86/X86ISelLowering.cpp +++ b/lib/Target/X86/X86ISelLowering.cpp @@ -16963,7 +16963,7 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget *Subtarget Op.getOperand(2), DAG), Op.getOperand(4), Op.getOperand(3), Subtarget, DAG); - case COMPRESS_TO_REG: { + case COMPRESS_EXPAND_IN_REG: { SDValue Mask = Op.getOperand(3); SDValue DataToCompress = Op.getOperand(1); SDValue PassThru = Op.getOperand(2); @@ -17524,6 +17524,34 @@ static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget *Subtarget, return DAG.getStore(Chain, dl, Compressed, Addr, MachinePointerInfo(), false, false, 0); } + case EXPAND_FROM_MEM: { + SDLoc dl(Op); + SDValue Mask = Op.getOperand(4); + SDValue PathThru = Op.getOperand(3); + SDValue Addr = Op.getOperand(2); + SDValue Chain = Op.getOperand(0); + EVT VT = Op.getValueType(); + + if (isAllOnes(Mask)) // return just a load + return DAG.getLoad(VT, dl, Chain, Addr, MachinePointerInfo(), false, false, + false, 0); + EVT MaskVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1, + VT.getVectorNumElements()); + EVT BitcastVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1, + Mask.getValueType().getSizeInBits()); + SDValue VMask = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MaskVT, + DAG.getNode(ISD::BITCAST, dl, BitcastVT, Mask), + DAG.getIntPtrConstant(0)); + + SDValue DataToExpand = DAG.getLoad(VT, dl, Chain, Addr, MachinePointerInfo(), + false, false, false, 0); + + SmallVector Results; + Results.push_back(DAG.getNode(IntrData->Opc0, dl, VT, VMask, DataToExpand, + PathThru)); + Results.push_back(Chain); + return DAG.getMergeValues(Results, dl); + } } } @@ -19710,6 +19738,7 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const { case X86ISD::PCMPISTRI: return "X86ISD::PCMPISTRI"; case X86ISD::XTEST: return "X86ISD::XTEST"; case X86ISD::COMPRESS: return "X86ISD::COMPRESS"; + case X86ISD::EXPAND: return "X86ISD::EXPAND"; } } diff --git a/lib/Target/X86/X86InstrAVX512.td b/lib/Target/X86/X86InstrAVX512.td index 5933e184400..cc676d8f473 100644 --- a/lib/Target/X86/X86InstrAVX512.td +++ b/lib/Target/X86/X86InstrAVX512.td @@ -5323,3 +5323,58 @@ defm VCOMPRESSPS : compress_by_elt_width <0x8A, "vcompressps", avx512vl_f32_info defm VCOMPRESSPD : compress_by_elt_width <0x8A, "vcompresspd", avx512vl_f64_info>, EVEX, VEX_W; +// expand +multiclass expand_by_vec_width opc, X86VectorVTInfo _, + string OpcodeStr> { + def rrkz : AVX5128I, EVEX_KZ; + + let Constraints = "$src0 = $dst" in + def rrk : AVX5128I, EVEX_K; + + let mayLoad = 1, Constraints = "$src0 = $dst" in + def rmk : AVX5128I, + EVEX_K, EVEX_CD8<_.EltSize, CD8VT1>; + + let mayLoad = 1 in + def rmkz : AVX5128I, + EVEX_KZ, EVEX_CD8<_.EltSize, CD8VT1>; + +} + +multiclass expand_by_elt_width opc, string OpcodeStr, + AVX512VLVectorVTInfo VTInfo> { + defm Z : expand_by_vec_width, EVEX_V512; + + let Predicates = [HasVLX] in { + defm Z256 : expand_by_vec_width, EVEX_V256; + defm Z128 : expand_by_vec_width, EVEX_V128; + } +} + +defm VPEXPANDD : expand_by_elt_width <0x89, "vpexpandd", avx512vl_i32_info>, + EVEX; +defm VPEXPANDQ : expand_by_elt_width <0x89, "vpexpandq", avx512vl_i64_info>, + EVEX, VEX_W; +defm VEXPANDPS : expand_by_elt_width <0x88, "vexpandps", avx512vl_f32_info>, + EVEX; +defm VEXPANDPD : expand_by_elt_width <0x88, "vexpandpd", avx512vl_f64_info>, + EVEX, VEX_W; diff --git a/lib/Target/X86/X86InstrFragmentsSIMD.td 
b/lib/Target/X86/X86InstrFragmentsSIMD.td index 04de47be08a..5f695c02cfc 100644 --- a/lib/Target/X86/X86InstrFragmentsSIMD.td +++ b/lib/Target/X86/X86InstrFragmentsSIMD.td @@ -286,6 +286,9 @@ def X86pcmpestri : SDNode<"X86ISD::PCMPESTRI", SDT_PCMPESTRI>; def X86compress: SDNode<"X86ISD::COMPRESS", SDTypeProfile<1, 3, [SDTCisSameAs<0, 2>, SDTCisSameAs<0, 3>, SDTCisVec<3>, SDTCisVec<1>, SDTCisInt<1>]>, []>; +def X86expand : SDNode<"X86ISD::EXPAND", SDTypeProfile<1, 3, + [SDTCisSameAs<0, 3>, + SDTCisVec<3>, SDTCisVec<1>, SDTCisInt<1>]>, []>; //===----------------------------------------------------------------------===// // SSE Complex Patterns diff --git a/lib/Target/X86/X86IntrinsicsInfo.h b/lib/Target/X86/X86IntrinsicsInfo.h index c1255224db2..8fa0efa2fc2 100644 --- a/lib/Target/X86/X86IntrinsicsInfo.h +++ b/lib/Target/X86/X86IntrinsicsInfo.h @@ -22,7 +22,7 @@ enum IntrinsicType { INTR_TYPE_1OP, INTR_TYPE_2OP, INTR_TYPE_3OP, CMP_MASK, CMP_MASK_CC, VSHIFT, VSHIFT_MASK, COMI, INTR_TYPE_1OP_MASK_RM, INTR_TYPE_2OP_MASK, INTR_TYPE_SCALAR_MASK_RM, - COMPRESS_TO_REG, COMPRESS_TO_MEM + COMPRESS_EXPAND_IN_REG, COMPRESS_TO_MEM, EXPAND_FROM_MEM }; struct IntrinsicData { @@ -95,7 +95,30 @@ static const IntrinsicData IntrinsicsWithChain[] = { COMPRESS_TO_MEM, X86ISD::COMPRESS, 0), X86_INTRINSIC_DATA(avx512_mask_compress_store_q_512, COMPRESS_TO_MEM, X86ISD::COMPRESS, 0), - + X86_INTRINSIC_DATA(avx512_mask_expand_load_d_128, + EXPAND_FROM_MEM, X86ISD::EXPAND, 0), + X86_INTRINSIC_DATA(avx512_mask_expand_load_d_256, + EXPAND_FROM_MEM, X86ISD::EXPAND, 0), + X86_INTRINSIC_DATA(avx512_mask_expand_load_d_512, + EXPAND_FROM_MEM, X86ISD::EXPAND, 0), + X86_INTRINSIC_DATA(avx512_mask_expand_load_pd_128, + EXPAND_FROM_MEM, X86ISD::EXPAND, 0), + X86_INTRINSIC_DATA(avx512_mask_expand_load_pd_256, + EXPAND_FROM_MEM, X86ISD::EXPAND, 0), + X86_INTRINSIC_DATA(avx512_mask_expand_load_pd_512, + EXPAND_FROM_MEM, X86ISD::EXPAND, 0), + X86_INTRINSIC_DATA(avx512_mask_expand_load_ps_128, + EXPAND_FROM_MEM, X86ISD::EXPAND, 0), + X86_INTRINSIC_DATA(avx512_mask_expand_load_ps_256, + EXPAND_FROM_MEM, X86ISD::EXPAND, 0), + X86_INTRINSIC_DATA(avx512_mask_expand_load_ps_512, + EXPAND_FROM_MEM, X86ISD::EXPAND, 0), + X86_INTRINSIC_DATA(avx512_mask_expand_load_q_128, + EXPAND_FROM_MEM, X86ISD::EXPAND, 0), + X86_INTRINSIC_DATA(avx512_mask_expand_load_q_256, + EXPAND_FROM_MEM, X86ISD::EXPAND, 0), + X86_INTRINSIC_DATA(avx512_mask_expand_load_q_512, + EXPAND_FROM_MEM, X86ISD::EXPAND, 0), X86_INTRINSIC_DATA(avx512_scatter_dpd_512, SCATTER, X86::VSCATTERDPDZmr, 0), X86_INTRINSIC_DATA(avx512_scatter_dpi_512, SCATTER, X86::VPSCATTERDDZmr, 0), X86_INTRINSIC_DATA(avx512_scatter_dpq_512, SCATTER, X86::VPSCATTERDQZmr, 0), @@ -233,30 +256,55 @@ static const IntrinsicData IntrinsicsWithoutChain[] = { X86_INTRINSIC_DATA(avx512_mask_cmp_w_128, CMP_MASK_CC, X86ISD::CMPM, 0), X86_INTRINSIC_DATA(avx512_mask_cmp_w_256, CMP_MASK_CC, X86ISD::CMPM, 0), X86_INTRINSIC_DATA(avx512_mask_cmp_w_512, CMP_MASK_CC, X86ISD::CMPM, 0), - X86_INTRINSIC_DATA(avx512_mask_compress_d_128, COMPRESS_TO_REG, + X86_INTRINSIC_DATA(avx512_mask_compress_d_128, COMPRESS_EXPAND_IN_REG, X86ISD::COMPRESS, 0), - X86_INTRINSIC_DATA(avx512_mask_compress_d_256, COMPRESS_TO_REG, + X86_INTRINSIC_DATA(avx512_mask_compress_d_256, COMPRESS_EXPAND_IN_REG, X86ISD::COMPRESS, 0), - X86_INTRINSIC_DATA(avx512_mask_compress_d_512, COMPRESS_TO_REG, + X86_INTRINSIC_DATA(avx512_mask_compress_d_512, COMPRESS_EXPAND_IN_REG, X86ISD::COMPRESS, 0), - X86_INTRINSIC_DATA(avx512_mask_compress_pd_128, 
COMPRESS_TO_REG, + X86_INTRINSIC_DATA(avx512_mask_compress_pd_128, COMPRESS_EXPAND_IN_REG, X86ISD::COMPRESS, 0), - X86_INTRINSIC_DATA(avx512_mask_compress_pd_256, COMPRESS_TO_REG, + X86_INTRINSIC_DATA(avx512_mask_compress_pd_256, COMPRESS_EXPAND_IN_REG, X86ISD::COMPRESS, 0), - X86_INTRINSIC_DATA(avx512_mask_compress_pd_512, COMPRESS_TO_REG, + X86_INTRINSIC_DATA(avx512_mask_compress_pd_512, COMPRESS_EXPAND_IN_REG, X86ISD::COMPRESS, 0), - X86_INTRINSIC_DATA(avx512_mask_compress_ps_128, COMPRESS_TO_REG, + X86_INTRINSIC_DATA(avx512_mask_compress_ps_128, COMPRESS_EXPAND_IN_REG, X86ISD::COMPRESS, 0), - X86_INTRINSIC_DATA(avx512_mask_compress_ps_256, COMPRESS_TO_REG, + X86_INTRINSIC_DATA(avx512_mask_compress_ps_256, COMPRESS_EXPAND_IN_REG, X86ISD::COMPRESS, 0), - X86_INTRINSIC_DATA(avx512_mask_compress_ps_512, COMPRESS_TO_REG, + X86_INTRINSIC_DATA(avx512_mask_compress_ps_512, COMPRESS_EXPAND_IN_REG, X86ISD::COMPRESS, 0), - X86_INTRINSIC_DATA(avx512_mask_compress_q_128, COMPRESS_TO_REG, + X86_INTRINSIC_DATA(avx512_mask_compress_q_128, COMPRESS_EXPAND_IN_REG, X86ISD::COMPRESS, 0), - X86_INTRINSIC_DATA(avx512_mask_compress_q_256, COMPRESS_TO_REG, + X86_INTRINSIC_DATA(avx512_mask_compress_q_256, COMPRESS_EXPAND_IN_REG, X86ISD::COMPRESS, 0), - X86_INTRINSIC_DATA(avx512_mask_compress_q_512, COMPRESS_TO_REG, + X86_INTRINSIC_DATA(avx512_mask_compress_q_512, COMPRESS_EXPAND_IN_REG, X86ISD::COMPRESS, 0), + + X86_INTRINSIC_DATA(avx512_mask_expand_d_128, COMPRESS_EXPAND_IN_REG, + X86ISD::EXPAND, 0), + X86_INTRINSIC_DATA(avx512_mask_expand_d_256, COMPRESS_EXPAND_IN_REG, + X86ISD::EXPAND, 0), + X86_INTRINSIC_DATA(avx512_mask_expand_d_512, COMPRESS_EXPAND_IN_REG, + X86ISD::EXPAND, 0), + X86_INTRINSIC_DATA(avx512_mask_expand_pd_128, COMPRESS_EXPAND_IN_REG, + X86ISD::EXPAND, 0), + X86_INTRINSIC_DATA(avx512_mask_expand_pd_256, COMPRESS_EXPAND_IN_REG, + X86ISD::EXPAND, 0), + X86_INTRINSIC_DATA(avx512_mask_expand_pd_512, COMPRESS_EXPAND_IN_REG, + X86ISD::EXPAND, 0), + X86_INTRINSIC_DATA(avx512_mask_expand_ps_128, COMPRESS_EXPAND_IN_REG, + X86ISD::EXPAND, 0), + X86_INTRINSIC_DATA(avx512_mask_expand_ps_256, COMPRESS_EXPAND_IN_REG, + X86ISD::EXPAND, 0), + X86_INTRINSIC_DATA(avx512_mask_expand_ps_512, COMPRESS_EXPAND_IN_REG, + X86ISD::EXPAND, 0), + X86_INTRINSIC_DATA(avx512_mask_expand_q_128, COMPRESS_EXPAND_IN_REG, + X86ISD::EXPAND, 0), + X86_INTRINSIC_DATA(avx512_mask_expand_q_256, COMPRESS_EXPAND_IN_REG, + X86ISD::EXPAND, 0), + X86_INTRINSIC_DATA(avx512_mask_expand_q_512, COMPRESS_EXPAND_IN_REG, + X86ISD::EXPAND, 0), X86_INTRINSIC_DATA(avx512_mask_pcmpeq_b_128, CMP_MASK, X86ISD::PCMPEQM, 0), X86_INTRINSIC_DATA(avx512_mask_pcmpeq_b_256, CMP_MASK, X86ISD::PCMPEQM, 0), X86_INTRINSIC_DATA(avx512_mask_pcmpeq_b_512, CMP_MASK, X86ISD::PCMPEQM, 0), diff --git a/test/CodeGen/X86/avx512vl-intrinsics.ll b/test/CodeGen/X86/avx512vl-intrinsics.ll index 8f3b32a93c0..23b05e38ed4 100644 --- a/test/CodeGen/X86/avx512vl-intrinsics.ll +++ b/test/CodeGen/X86/avx512vl-intrinsics.ll @@ -698,3 +698,94 @@ define <4 x i32> @compr10(<4 x i32> %data, i8 %mask) { } declare <4 x i32> @llvm.x86.avx512.mask.compress.d.128(<4 x i32> %data, <4 x i32> %src0, i8 %mask) + +; Expand + +; CHECK-LABEL: expand1 +; CHECK: vexpandpd (%rdi), %zmm0 {%k1} ## encoding: [0x62,0xf2,0xfd,0x49,0x88,0x07] +define <8 x double> @expand1(i8* %addr, <8 x double> %data, i8 %mask) { + %res = call <8 x double> @llvm.x86.avx512.mask.expand.load.pd.512(i8* %addr, <8 x double> %data, i8 %mask) + ret <8 x double> %res +} + +declare <8 x double> 
@llvm.x86.avx512.mask.expand.load.pd.512(i8* %addr, <8 x double> %data, i8 %mask) + +; CHECK-LABEL: expand2 +; CHECK: vexpandpd (%rdi), %ymm0 {%k1} ## encoding: [0x62,0xf2,0xfd,0x29,0x88,0x07] +define <4 x double> @expand2(i8* %addr, <4 x double> %data, i8 %mask) { + %res = call <4 x double> @llvm.x86.avx512.mask.expand.load.pd.256(i8* %addr, <4 x double> %data, i8 %mask) + ret <4 x double> %res +} + +declare <4 x double> @llvm.x86.avx512.mask.expand.load.pd.256(i8* %addr, <4 x double> %data, i8 %mask) + +; CHECK-LABEL: expand3 +; CHECK: vexpandps (%rdi), %xmm0 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0x88,0x07] +define <4 x float> @expand3(i8* %addr, <4 x float> %data, i8 %mask) { + %res = call <4 x float> @llvm.x86.avx512.mask.expand.load.ps.128(i8* %addr, <4 x float> %data, i8 %mask) + ret <4 x float> %res +} + +declare <4 x float> @llvm.x86.avx512.mask.expand.load.ps.128(i8* %addr, <4 x float> %data, i8 %mask) + +; CHECK-LABEL: expand4 +; CHECK: vexpandpd %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0xc9,0x88,0xc0] +define <8 x double> @expand4(i8* %addr, <8 x double> %data, i8 %mask) { + %res = call <8 x double> @llvm.x86.avx512.mask.expand.pd.512(<8 x double> %data, <8 x double> zeroinitializer, i8 %mask) + ret <8 x double> %res +} + +declare <8 x double> @llvm.x86.avx512.mask.expand.pd.512(<8 x double> %data, <8 x double> %src0, i8 %mask) + +; CHECK-LABEL: expand5 +; CHECK: vexpandpd %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf2,0xfd,0x29,0x88,0xc8] +define <4 x double> @expand5(<4 x double> %data, <4 x double> %src0, i8 %mask) { + %res = call <4 x double> @llvm.x86.avx512.mask.expand.pd.256( <4 x double> %data, <4 x double> %src0, i8 %mask) + ret <4 x double> %res +} + +declare <4 x double> @llvm.x86.avx512.mask.expand.pd.256(<4 x double> %data, <4 x double> %src0, i8 %mask) + +; CHECK-LABEL: expand6 +; CHECK: vexpandps %xmm0 +define <4 x float> @expand6(<4 x float> %data, i8 %mask) { + %res = call <4 x float> @llvm.x86.avx512.mask.expand.ps.128(<4 x float> %data, <4 x float>zeroinitializer, i8 %mask) + ret <4 x float> %res +} + +declare <4 x float> @llvm.x86.avx512.mask.expand.ps.128(<4 x float> %data, <4 x float> %src0, i8 %mask) + +; CHECK-LABEL: expand7 +; CHECK-NOT: vexpand +; CHECK: vmovapd +define <8 x double> @expand7(i8* %addr, <8 x double> %data) { + %res = call <8 x double> @llvm.x86.avx512.mask.expand.load.pd.512(i8* %addr, <8 x double> %data, i8 -1) + ret <8 x double> %res +} + +; CHECK-LABEL: expand8 +; CHECK-NOT: vexpandps %xmm0 +define <4 x float> @expand8(<4 x float> %data) { + %res = call <4 x float> @llvm.x86.avx512.mask.expand.ps.128(<4 x float> %data, <4 x float>zeroinitializer, i8 -1) + ret <4 x float> %res +} + +; CHECK-LABEL: expand9 +; CHECK: vpexpandq (%rdi), %zmm0 {%k1} ## encoding: [0x62,0xf2,0xfd,0x49,0x89,0x07] +define <8 x i64> @expand9(i8* %addr, <8 x i64> %data, i8 %mask) { + %res = call <8 x i64> @llvm.x86.avx512.mask.expand.load.q.512(i8* %addr, <8 x i64> %data, i8 %mask) + ret <8 x i64> %res +} + +declare <8 x i64> @llvm.x86.avx512.mask.expand.load.q.512(i8* %addr, <8 x i64> %data, i8 %mask) + +; CHECK-LABEL: expand10 +; CHECK: vpexpandd %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0x89,0x89,0xc0] +define <4 x i32> @expand10(<4 x i32> %data, i8 %mask) { + %res = call <4 x i32> @llvm.x86.avx512.mask.expand.d.128(<4 x i32> %data, <4 x i32>zeroinitializer, i8 %mask) + ret <4 x i32> %res +} + +declare <4 x i32> @llvm.x86.avx512.mask.expand.d.128(<4 x i32> %data, <4 x i32> %src0, i8 %mask) + + -- 2.40.0
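
Usage sketch (not part of the patch): a minimal LLVM IR example of how the new masked expand intrinsics defined above could be exercised. The wrapper function names are hypothetical and the mask value 15 (0x0F) is chosen purely for illustration; the intrinsic names and signatures are taken from the definitions in this patch. With the low four mask bits set, the expand-load form reads four contiguous doubles from %addr into lanes 0-3 and keeps the pass-through values from %data in lanes 4-7; the in-register form takes its source elements from a vector instead of memory, and passing zeroinitializer as the pass-through gives the zeroing ({z}) behavior seen in the tests above.

; Hypothetical wrapper: masked expand-load of four contiguous doubles into the
; low lanes of an 8-element vector; lanes with a clear mask bit keep %data.
define <8 x double> @expand_load_sketch(i8* %addr, <8 x double> %data) {
  %res = call <8 x double> @llvm.x86.avx512.mask.expand.load.pd.512(i8* %addr,
                                <8 x double> %data, i8 15)
  ret <8 x double> %res
}

; Hypothetical wrapper: in-register expand with a zeroing pass-through, i.e.
; lanes whose mask bit is clear are zeroed rather than merged.
define <8 x double> @expand_reg_sketch(<8 x double> %data, i8 %mask) {
  %res = call <8 x double> @llvm.x86.avx512.mask.expand.pd.512(<8 x double> %data,
                                <8 x double> zeroinitializer, i8 %mask)
  ret <8 x double> %res
}

declare <8 x double> @llvm.x86.avx512.mask.expand.load.pd.512(i8*, <8 x double>, i8)
declare <8 x double> @llvm.x86.avx512.mask.expand.pd.512(<8 x double>, <8 x double>, i8)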