From 3c97b65dcad51dcd9af4c2bafa1b72d63333114f Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Fri, 1 Dec 2017 13:25:54 +0000 Subject: [PATCH] [X86][AVX512] Tag vshift/vpermv/pshufd/pshufb instructions scheduler classes git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@319540 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Target/X86/X86InstrAVX512.td | 231 +++-- lib/Target/X86/X86InstrSSE.td | 47 +- test/CodeGen/X86/avx512-bugfix-23634.ll | 2 +- test/CodeGen/X86/avx512-schedule.ll | 10 +- test/CodeGen/X86/avx512-shuffle-schedule.ll | 916 +++++++++--------- .../X86/avx512dq-intrinsics-upgrade.ll | 2 +- 6 files changed, 623 insertions(+), 585 deletions(-) diff --git a/lib/Target/X86/X86InstrAVX512.td b/lib/Target/X86/X86InstrAVX512.td index a38f9bfa6e5..c9b22128f03 100644 --- a/lib/Target/X86/X86InstrAVX512.td +++ b/lib/Target/X86/X86InstrAVX512.td @@ -5123,135 +5123,148 @@ defm VPTESTNM : avx512_vptest_all_forms<0x26, 0x27, "vptestnm", X86testnm, // AVX-512 Shift instructions //===----------------------------------------------------------------------===// multiclass avx512_shift_rmi opc, Format ImmFormR, Format ImmFormM, - string OpcodeStr, SDNode OpNode, X86VectorVTInfo _> { + string OpcodeStr, SDNode OpNode, OpndItins itins, + X86VectorVTInfo _> { let ExeDomain = _.ExeDomain in { defm ri : AVX512_maskable; + itins.rr>, Sched<[itins.Sched]>; defm mi : AVX512_maskable; + itins.rm>, Sched<[itins.Sched.Folded]>; } } multiclass avx512_shift_rmbi opc, Format ImmFormM, - string OpcodeStr, SDNode OpNode, X86VectorVTInfo _> { + string OpcodeStr, SDNode OpNode, OpndItins itins, + X86VectorVTInfo _> { let ExeDomain = _.ExeDomain in defm mbi : AVX512_maskable, EVEX_B; + itins.rm>, EVEX_B, Sched<[itins.Sched.Folded, ReadAfterLd]>; } multiclass avx512_shift_rrm opc, string OpcodeStr, SDNode OpNode, - ValueType SrcVT, PatFrag bc_frag, X86VectorVTInfo _> { + OpndItins itins, ValueType SrcVT, PatFrag bc_frag, + X86VectorVTInfo _> { // src2 is always 128-bit let ExeDomain = _.ExeDomain in { defm rr : AVX512_maskable, AVX512BIBase, EVEX_4V; + itins.rr>, AVX512BIBase, EVEX_4V, Sched<[itins.Sched]>; defm rm : AVX512_maskable, AVX512BIBase, - EVEX_4V; + itins.rm>, AVX512BIBase, + EVEX_4V, Sched<[itins.Sched.Folded, ReadAfterLd]>; } } multiclass avx512_shift_sizes opc, string OpcodeStr, SDNode OpNode, - ValueType SrcVT, PatFrag bc_frag, - AVX512VLVectorVTInfo VTInfo, Predicate prd> { + OpndItins itins, ValueType SrcVT, PatFrag bc_frag, + AVX512VLVectorVTInfo VTInfo, Predicate prd> { let Predicates = [prd] in - defm Z : avx512_shift_rrm, EVEX_V512, EVEX_CD8 ; let Predicates = [prd, HasVLX] in { - defm Z256 : avx512_shift_rrm, EVEX_V256, EVEX_CD8; - defm Z128 : avx512_shift_rrm, EVEX_V128, EVEX_CD8; } } multiclass avx512_shift_types opcd, bits<8> opcq, bits<8> opcw, - string OpcodeStr, SDNode OpNode> { - defm D : avx512_shift_sizes; - defm Q : avx512_shift_sizes, VEX_W; - defm W : avx512_shift_sizes; + string OpcodeStr, SDNode OpNode, + OpndItins itins> { + defm D : avx512_shift_sizes; + defm Q : avx512_shift_sizes, VEX_W; + defm W : avx512_shift_sizes; } multiclass avx512_shift_rmi_sizes opc, Format ImmFormR, Format ImmFormM, - string OpcodeStr, SDNode OpNode, - AVX512VLVectorVTInfo VTInfo> { + string OpcodeStr, SDNode OpNode, + OpndItins itins, AVX512VLVectorVTInfo VTInfo> { let Predicates = [HasAVX512] in - defm Z: avx512_shift_rmi, - avx512_shift_rmbi, EVEX_V512; let Predicates = [HasAVX512, HasVLX] in { - defm Z256: avx512_shift_rmi, - avx512_shift_rmbi, EVEX_V256; defm Z128: avx512_shift_rmi, - avx512_shift_rmbi, + avx512_shift_rmbi, EVEX_V128; } } multiclass avx512_shift_rmi_w opcw, Format ImmFormR, Format ImmFormM, - string OpcodeStr, SDNode OpNode> { + string OpcodeStr, SDNode OpNode, + OpndItins itins> { let Predicates = [HasBWI] in defm WZ: avx512_shift_rmi, EVEX_V512, VEX_WIG; + itins, v32i16_info>, EVEX_V512, VEX_WIG; let Predicates = [HasVLX, HasBWI] in { defm WZ256: avx512_shift_rmi, EVEX_V256, VEX_WIG; + itins, v16i16x_info>, EVEX_V256, VEX_WIG; defm WZ128: avx512_shift_rmi, EVEX_V128, VEX_WIG; + itins, v8i16x_info>, EVEX_V128, VEX_WIG; } } multiclass avx512_shift_rmi_dq opcd, bits<8> opcq, Format ImmFormR, Format ImmFormM, - string OpcodeStr, SDNode OpNode> { + string OpcodeStr, SDNode OpNode, OpndItins itins> { defm D: avx512_shift_rmi_sizes, EVEX_CD8<32, CD8VF>; + itins, avx512vl_i32_info>, EVEX_CD8<32, CD8VF>; defm Q: avx512_shift_rmi_sizes, EVEX_CD8<64, CD8VF>, VEX_W; + itins, avx512vl_i64_info>, EVEX_CD8<64, CD8VF>, VEX_W; } -defm VPSRL : avx512_shift_rmi_dq<0x72, 0x73, MRM2r, MRM2m, "vpsrl", X86vsrli>, - avx512_shift_rmi_w<0x71, MRM2r, MRM2m, "vpsrlw", X86vsrli>, AVX512BIi8Base, EVEX_4V; +defm VPSRL : avx512_shift_rmi_dq<0x72, 0x73, MRM2r, MRM2m, "vpsrl", X86vsrli, + SSE_INTSHIFT_P>, + avx512_shift_rmi_w<0x71, MRM2r, MRM2m, "vpsrlw", X86vsrli, + SSE_INTSHIFT_P>, AVX512BIi8Base, EVEX_4V; -defm VPSLL : avx512_shift_rmi_dq<0x72, 0x73, MRM6r, MRM6m, "vpsll", X86vshli>, - avx512_shift_rmi_w<0x71, MRM6r, MRM6m, "vpsllw", X86vshli>, AVX512BIi8Base, EVEX_4V; +defm VPSLL : avx512_shift_rmi_dq<0x72, 0x73, MRM6r, MRM6m, "vpsll", X86vshli, + SSE_INTSHIFT_P>, + avx512_shift_rmi_w<0x71, MRM6r, MRM6m, "vpsllw", X86vshli, + SSE_INTSHIFT_P>, AVX512BIi8Base, EVEX_4V; -defm VPSRA : avx512_shift_rmi_dq<0x72, 0x72, MRM4r, MRM4m, "vpsra", X86vsrai>, - avx512_shift_rmi_w<0x71, MRM4r, MRM4m, "vpsraw", X86vsrai>, AVX512BIi8Base, EVEX_4V; +defm VPSRA : avx512_shift_rmi_dq<0x72, 0x72, MRM4r, MRM4m, "vpsra", X86vsrai, + SSE_INTSHIFT_P>, + avx512_shift_rmi_w<0x71, MRM4r, MRM4m, "vpsraw", X86vsrai, + SSE_INTSHIFT_P>, AVX512BIi8Base, EVEX_4V; -defm VPROR : avx512_shift_rmi_dq<0x72, 0x72, MRM0r, MRM0m, "vpror", X86vrotri>, AVX512BIi8Base, EVEX_4V; -defm VPROL : avx512_shift_rmi_dq<0x72, 0x72, MRM1r, MRM1m, "vprol", X86vrotli>, AVX512BIi8Base, EVEX_4V; +defm VPROR : avx512_shift_rmi_dq<0x72, 0x72, MRM0r, MRM0m, "vpror", X86vrotri, + SSE_INTSHIFT_P>, AVX512BIi8Base, EVEX_4V; +defm VPROL : avx512_shift_rmi_dq<0x72, 0x72, MRM1r, MRM1m, "vprol", X86vrotli, + SSE_INTSHIFT_P>, AVX512BIi8Base, EVEX_4V; -defm VPSLL : avx512_shift_types<0xF2, 0xF3, 0xF1, "vpsll", X86vshl>; -defm VPSRA : avx512_shift_types<0xE2, 0xE2, 0xE1, "vpsra", X86vsra>; -defm VPSRL : avx512_shift_types<0xD2, 0xD3, 0xD1, "vpsrl", X86vsrl>; +defm VPSLL : avx512_shift_types<0xF2, 0xF3, 0xF1, "vpsll", X86vshl, SSE_INTSHIFT_P>; +defm VPSRA : avx512_shift_types<0xE2, 0xE2, 0xE1, "vpsra", X86vsra, SSE_INTSHIFT_P>; +defm VPSRL : avx512_shift_types<0xD2, 0xD3, 0xD1, "vpsrl", X86vsrl, SSE_INTSHIFT_P>; // Use 512bit VPSRA/VPSRAI version to implement v2i64/v4i64 in case NoVLX. let Predicates = [HasAVX512, NoVLX] in { @@ -5284,25 +5297,27 @@ let Predicates = [HasAVX512, NoVLX] in { // Variable Bit Shifts //===-------------------------------------------------------------------===// multiclass avx512_var_shift opc, string OpcodeStr, SDNode OpNode, - X86VectorVTInfo _> { + OpndItins itins, X86VectorVTInfo _> { let ExeDomain = _.ExeDomain in { defm rr : AVX512_maskable, AVX5128IBase, EVEX_4V; + itins.rr>, AVX5128IBase, EVEX_4V, + Sched<[itins.Sched]>; defm rm : AVX512_maskable, AVX5128IBase, EVEX_4V, - EVEX_CD8<_.EltSize, CD8VF>; + itins.rm>, AVX5128IBase, EVEX_4V, + EVEX_CD8<_.EltSize, CD8VF>, + Sched<[itins.Sched.Folded, ReadAfterLd]>; } } multiclass avx512_var_shift_mb opc, string OpcodeStr, SDNode OpNode, - X86VectorVTInfo _> { + OpndItins itins, X86VectorVTInfo _> { let ExeDomain = _.ExeDomain in defm rmb : AVX512_maskable opc, string OpcodeStr, SDNode OpNode, "$src1, ${src2}"##_.BroadcastStr, (_.VT (OpNode _.RC:$src1, (_.VT (X86VBroadcast (_.ScalarLdFrag addr:$src2))))), - SSE_INTSHIFT_ITINS_P.rm>, AVX5128IBase, EVEX_B, - EVEX_4V, EVEX_CD8<_.EltSize, CD8VF>; + itins.rm>, AVX5128IBase, EVEX_B, + EVEX_4V, EVEX_CD8<_.EltSize, CD8VF>, + Sched<[itins.Sched.Folded, ReadAfterLd]>; } multiclass avx512_var_shift_sizes opc, string OpcodeStr, SDNode OpNode, - AVX512VLVectorVTInfo _> { + OpndItins itins, AVX512VLVectorVTInfo _> { let Predicates = [HasAVX512] in - defm Z : avx512_var_shift, - avx512_var_shift_mb, EVEX_V512; + defm Z : avx512_var_shift, + avx512_var_shift_mb, EVEX_V512; let Predicates = [HasAVX512, HasVLX] in { - defm Z256 : avx512_var_shift, - avx512_var_shift_mb, EVEX_V256; - defm Z128 : avx512_var_shift, - avx512_var_shift_mb, EVEX_V128; + defm Z256 : avx512_var_shift, + avx512_var_shift_mb, EVEX_V256; + defm Z128 : avx512_var_shift, + avx512_var_shift_mb, EVEX_V128; } } multiclass avx512_var_shift_types opc, string OpcodeStr, - SDNode OpNode> { - defm D : avx512_var_shift_sizes { + defm D : avx512_var_shift_sizes; - defm Q : avx512_var_shift_sizes, VEX_W; } @@ -5358,30 +5374,30 @@ multiclass avx512_var_shift_lowering opc, string OpcodeStr, - SDNode OpNode> { + SDNode OpNode, OpndItins itins> { let Predicates = [HasBWI] in - defm WZ: avx512_var_shift, + defm WZ: avx512_var_shift, EVEX_V512, VEX_W; let Predicates = [HasVLX, HasBWI] in { - defm WZ256: avx512_var_shift, + defm WZ256: avx512_var_shift, EVEX_V256, VEX_W; - defm WZ128: avx512_var_shift, + defm WZ128: avx512_var_shift, EVEX_V128, VEX_W; } } -defm VPSLLV : avx512_var_shift_types<0x47, "vpsllv", shl>, - avx512_var_shift_w<0x12, "vpsllvw", shl>; +defm VPSLLV : avx512_var_shift_types<0x47, "vpsllv", shl, SSE_INTSHIFT_P>, + avx512_var_shift_w<0x12, "vpsllvw", shl, SSE_INTSHIFT_P>; -defm VPSRAV : avx512_var_shift_types<0x46, "vpsrav", sra>, - avx512_var_shift_w<0x11, "vpsravw", sra>; +defm VPSRAV : avx512_var_shift_types<0x46, "vpsrav", sra, SSE_INTSHIFT_P>, + avx512_var_shift_w<0x11, "vpsravw", sra, SSE_INTSHIFT_P>; -defm VPSRLV : avx512_var_shift_types<0x45, "vpsrlv", srl>, - avx512_var_shift_w<0x10, "vpsrlvw", srl>; +defm VPSRLV : avx512_var_shift_types<0x45, "vpsrlv", srl, SSE_INTSHIFT_P>, + avx512_var_shift_w<0x10, "vpsrlvw", srl, SSE_INTSHIFT_P>; -defm VPRORV : avx512_var_shift_types<0x14, "vprorv", rotr>; -defm VPROLV : avx512_var_shift_types<0x15, "vprolv", rotl>; +defm VPRORV : avx512_var_shift_types<0x14, "vprorv", rotr, SSE_INTSHIFT_P>; +defm VPROLV : avx512_var_shift_types<0x15, "vprolv", rotl, SSE_INTSHIFT_P>; defm : avx512_var_shift_lowering; defm : avx512_var_shift_lowering; @@ -5559,64 +5575,64 @@ let Predicates = [HasAVX512, NoVLX] in { // 1-src variable permutation VPERMW/D/Q //===-------------------------------------------------------------------===// multiclass avx512_vperm_dq_sizes opc, string OpcodeStr, SDNode OpNode, - AVX512VLVectorVTInfo _> { + OpndItins itins, AVX512VLVectorVTInfo _> { let Predicates = [HasAVX512] in - defm Z : avx512_var_shift, - avx512_var_shift_mb, EVEX_V512; + defm Z : avx512_var_shift, + avx512_var_shift_mb, EVEX_V512; let Predicates = [HasAVX512, HasVLX] in - defm Z256 : avx512_var_shift, - avx512_var_shift_mb, EVEX_V256; + defm Z256 : avx512_var_shift, + avx512_var_shift_mb, EVEX_V256; } multiclass avx512_vpermi_dq_sizes opc, Format ImmFormR, Format ImmFormM, string OpcodeStr, SDNode OpNode, - AVX512VLVectorVTInfo VTInfo> { + OpndItins itins, AVX512VLVectorVTInfo VTInfo> { let Predicates = [HasAVX512] in defm Z: avx512_shift_rmi, + itins, VTInfo.info512>, avx512_shift_rmbi, EVEX_V512; + itins, VTInfo.info512>, EVEX_V512; let Predicates = [HasAVX512, HasVLX] in defm Z256: avx512_shift_rmi, + itins, VTInfo.info256>, avx512_shift_rmbi, EVEX_V256; + itins, VTInfo.info256>, EVEX_V256; } multiclass avx512_vperm_bw opc, string OpcodeStr, Predicate prd, SDNode OpNode, - AVX512VLVectorVTInfo _> { + OpndItins itins, AVX512VLVectorVTInfo _> { let Predicates = [prd] in - defm Z: avx512_var_shift, + defm Z: avx512_var_shift, EVEX_V512 ; let Predicates = [HasVLX, prd] in { - defm Z256: avx512_var_shift, + defm Z256: avx512_var_shift, EVEX_V256 ; - defm Z128: avx512_var_shift, + defm Z128: avx512_var_shift, EVEX_V128 ; } } defm VPERMW : avx512_vperm_bw<0x8D, "vpermw", HasBWI, X86VPermv, - avx512vl_i16_info>, VEX_W; + AVX2_PERMV_I, avx512vl_i16_info>, VEX_W; defm VPERMB : avx512_vperm_bw<0x8D, "vpermb", HasVBMI, X86VPermv, - avx512vl_i8_info>; + AVX2_PERMV_I, avx512vl_i8_info>; defm VPERMD : avx512_vperm_dq_sizes<0x36, "vpermd", X86VPermv, - avx512vl_i32_info>; + AVX2_PERMV_I, avx512vl_i32_info>; defm VPERMQ : avx512_vperm_dq_sizes<0x36, "vpermq", X86VPermv, - avx512vl_i64_info>, VEX_W; + AVX2_PERMV_I, avx512vl_i64_info>, VEX_W; defm VPERMPS : avx512_vperm_dq_sizes<0x16, "vpermps", X86VPermv, - avx512vl_f32_info>; + AVX2_PERMV_F, avx512vl_f32_info>; defm VPERMPD : avx512_vperm_dq_sizes<0x16, "vpermpd", X86VPermv, - avx512vl_f64_info>, VEX_W; + AVX2_PERMV_F, avx512vl_f64_info>, VEX_W; defm VPERMQ : avx512_vpermi_dq_sizes<0x00, MRMSrcReg, MRMSrcMem, "vpermq", - X86VPermi, avx512vl_i64_info>, + X86VPermi, AVX2_PERMV_I, avx512vl_i64_info>, EVEX, AVX512AIi8Base, EVEX_CD8<64, CD8VF>, VEX_W; defm VPERMPD : avx512_vpermi_dq_sizes<0x01, MRMSrcReg, MRMSrcMem, "vpermpd", - X86VPermi, avx512vl_f64_info>, + X86VPermi, AVX2_PERMV_F, avx512vl_f64_info>, EVEX, AVX512AIi8Base, EVEX_CD8<64, CD8VF>, VEX_W; //===----------------------------------------------------------------------===// // AVX-512 - VPERMIL @@ -5670,7 +5686,7 @@ multiclass avx512_permil OpcImm, bits<8> OpcVar, AVX512VLVectorVTInfo _, AVX512VLVectorVTInfo Ctrl>{ defm NAME: avx512_permil_vec_common; defm NAME: avx512_shift_rmi_sizes, + X86VPermilpi, AVX_VPERMILV, _>, EVEX, AVX512AIi8Base, EVEX_CD8<_.info128.EltSize, CD8VF>; } @@ -5686,24 +5702,25 @@ defm VPERMILPD : avx512_permil<"vpermilpd", 0x05, 0x0D, avx512vl_f64_info, //===----------------------------------------------------------------------===// defm VPSHUFD : avx512_shift_rmi_sizes<0x70, MRMSrcReg, MRMSrcMem, "vpshufd", - X86PShufd, avx512vl_i32_info>, + X86PShufd, SSE_PSHUF, avx512vl_i32_info>, EVEX, AVX512BIi8Base, EVEX_CD8<32, CD8VF>; defm VPSHUFH : avx512_shift_rmi_w<0x70, MRMSrcReg, MRMSrcMem, "vpshufhw", - X86PShufhw>, EVEX, AVX512XSIi8Base; + X86PShufhw, SSE_PSHUF>, EVEX, AVX512XSIi8Base; defm VPSHUFL : avx512_shift_rmi_w<0x70, MRMSrcReg, MRMSrcMem, "vpshuflw", - X86PShuflw>, EVEX, AVX512XDIi8Base; + X86PShuflw, SSE_PSHUF>, EVEX, AVX512XDIi8Base; -multiclass avx512_pshufb_sizes opc, string OpcodeStr, SDNode OpNode> { +multiclass avx512_pshufb_sizes opc, string OpcodeStr, SDNode OpNode, + OpndItins itins> { let Predicates = [HasBWI] in - defm Z: avx512_var_shift, EVEX_V512; + defm Z: avx512_var_shift, EVEX_V512; let Predicates = [HasVLX, HasBWI] in { - defm Z256: avx512_var_shift, EVEX_V256; - defm Z128: avx512_var_shift, EVEX_V128; + defm Z256: avx512_var_shift, EVEX_V256; + defm Z128: avx512_var_shift, EVEX_V128; } } -defm VPSHUFB: avx512_pshufb_sizes<0x00, "vpshufb", X86pshufb>, VEX_WIG; +defm VPSHUFB: avx512_pshufb_sizes<0x00, "vpshufb", X86pshufb, SSE_PSHUFB>, VEX_WIG; //===----------------------------------------------------------------------===// // Move Low to High and High to Low packed FP Instructions diff --git a/lib/Target/X86/X86InstrSSE.td b/lib/Target/X86/X86InstrSSE.td index 099883c4072..586bcc29946 100644 --- a/lib/Target/X86/X86InstrSSE.td +++ b/lib/Target/X86/X86InstrSSE.td @@ -139,6 +139,11 @@ def SSE_INTMUL_ITINS_P : OpndItins< IIC_SSE_INTMUL_P_RR, IIC_SSE_INTMUL_P_RM >; +// FIXME: Merge SSE_INTSHIFT_P + SSE_INTSHIFT_ITINS_P. +def SSE_INTSHIFT_P : OpndItins< + IIC_SSE_INTSH_P_RR, IIC_SSE_INTSH_P_RM +>; + def SSE_INTSHIFT_ITINS_P : ShiftOpndItins< IIC_SSE_INTSH_P_RR, IIC_SSE_INTSH_P_RM, IIC_SSE_INTSH_P_RI >; @@ -3891,9 +3896,14 @@ defm PCMPGTD : PDI_binop_all<0x66, "pcmpgtd", X86pcmpgt, v4i32, v8i32, // SSE2 - Packed Integer Shuffle Instructions //===---------------------------------------------------------------------===// +let Sched = WriteShuffle in +def SSE_PSHUF : OpndItins< + IIC_SSE_PSHUF_RI, IIC_SSE_PSHUF_MI +>; + let ExeDomain = SSEPackedInt in { multiclass sse2_pshuffle { + SDNode OpNode, OpndItins itins, Predicate prd> { let Predicates = [HasAVX, prd] in { def V#NAME#ri : Ii8<0x70, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src1, u8imm:$src2), @@ -3901,15 +3911,15 @@ let Predicates = [HasAVX, prd] in { "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), [(set VR128:$dst, (vt128 (OpNode VR128:$src1, (i8 imm:$src2))))], - IIC_SSE_PSHUF_RI>, VEX, Sched<[WriteShuffle]>, VEX_WIG; + itins.rr>, VEX, Sched<[itins.Sched]>, VEX_WIG; def V#NAME#mi : Ii8<0x70, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src1, u8imm:$src2), !strconcat("v", OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), [(set VR128:$dst, (vt128 (OpNode (bitconvert (loadv2i64 addr:$src1)), - (i8 imm:$src2))))], IIC_SSE_PSHUF_MI>, VEX, - Sched<[WriteShuffleLd]>, VEX_WIG; + (i8 imm:$src2))))], itins.rm>, VEX, + Sched<[itins.Sched.Folded]>, VEX_WIG; } let Predicates = [HasAVX2, prd] in { @@ -3919,15 +3929,15 @@ let Predicates = [HasAVX2, prd] in { "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), [(set VR256:$dst, (vt256 (OpNode VR256:$src1, (i8 imm:$src2))))], - IIC_SSE_PSHUF_RI>, VEX, VEX_L, Sched<[WriteShuffle]>, VEX_WIG; + itins.rr>, VEX, VEX_L, Sched<[itins.Sched]>, VEX_WIG; def V#NAME#Ymi : Ii8<0x70, MRMSrcMem, (outs VR256:$dst), (ins i256mem:$src1, u8imm:$src2), !strconcat("v", OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), [(set VR256:$dst, (vt256 (OpNode (bitconvert (loadv4i64 addr:$src1)), - (i8 imm:$src2))))], IIC_SSE_PSHUF_MI>, VEX, VEX_L, - Sched<[WriteShuffleLd]>, VEX_WIG; + (i8 imm:$src2))))], itins.rm>, VEX, VEX_L, + Sched<[itins.Sched.Folded]>, VEX_WIG; } let Predicates = [UseSSE2] in { @@ -3937,23 +3947,24 @@ let Predicates = [UseSSE2] in { "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), [(set VR128:$dst, (vt128 (OpNode VR128:$src1, (i8 imm:$src2))))], - IIC_SSE_PSHUF_RI>, Sched<[WriteShuffle]>; + itins.rr>, Sched<[itins.Sched]>; def mi : Ii8<0x70, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src1, u8imm:$src2), !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), [(set VR128:$dst, (vt128 (OpNode (bitconvert (memopv2i64 addr:$src1)), - (i8 imm:$src2))))], IIC_SSE_PSHUF_MI>, - Sched<[WriteShuffleLd, ReadAfterLd]>; + (i8 imm:$src2))))], itins.rm>, + Sched<[itins.Sched.Folded]>; } } } // ExeDomain = SSEPackedInt -defm PSHUFD : sse2_pshuffle<"pshufd", v4i32, v8i32, X86PShufd, NoVLX>, PD; -defm PSHUFHW : sse2_pshuffle<"pshufhw", v8i16, v16i16, X86PShufhw, +defm PSHUFD : sse2_pshuffle<"pshufd", v4i32, v8i32, X86PShufd, SSE_PSHUF, + NoVLX>, PD; +defm PSHUFHW : sse2_pshuffle<"pshufhw", v8i16, v16i16, X86PShufhw, SSE_PSHUF, NoVLX_Or_NoBWI>, XS; -defm PSHUFLW : sse2_pshuffle<"pshuflw", v8i16, v16i16, X86PShuflw, +defm PSHUFLW : sse2_pshuffle<"pshuflw", v8i16, v16i16, X86PShuflw, SSE_PSHUF, NoVLX_Or_NoBWI>, XD; //===---------------------------------------------------------------------===// @@ -8086,6 +8097,16 @@ let Predicates = [HasAVX1Only] in { // VPERM - Permute instructions // +let Sched = WriteFShuffle256 in +def AVX2_PERMV_F : OpndItins< + IIC_SSE_SHUFP, IIC_SSE_SHUFP +>; + +let Sched = WriteShuffle256 in +def AVX2_PERMV_I : OpndItins< + IIC_SSE_PSHUF_RI, IIC_SSE_PSHUF_MI +>; + multiclass avx2_perm opc, string OpcodeStr, PatFrag mem_frag, ValueType OpVT, X86FoldableSchedWrite Sched, X86MemOperand memOp> { diff --git a/test/CodeGen/X86/avx512-bugfix-23634.ll b/test/CodeGen/X86/avx512-bugfix-23634.ll index acb3f121c8d..ebd7f537fe9 100644 --- a/test/CodeGen/X86/avx512-bugfix-23634.ll +++ b/test/CodeGen/X86/avx512-bugfix-23634.ll @@ -16,8 +16,8 @@ define void @f_fu(float* %ret, float* %aa, float %b) { ; CHECK-NEXT: movw $-21846, %ax ## imm = 0xAAAA ; CHECK-NEXT: kmovw %eax, %k1 ; CHECK-NEXT: vmovdqa32 {{.*}}(%rip), %zmm1 {%k1} -; CHECK-NEXT: vpaddd %zmm0, %zmm2, %zmm0 ; CHECK-NEXT: vpaddd %zmm1, %zmm0, %zmm0 +; CHECK-NEXT: vpaddd %zmm0, %zmm2, %zmm0 ; CHECK-NEXT: vcvtdq2ps %zmm0, %zmm0 ; CHECK-NEXT: vmovups %zmm0, (%rdi) ; CHECK-NEXT: retq diff --git a/test/CodeGen/X86/avx512-schedule.ll b/test/CodeGen/X86/avx512-schedule.ll index 72541465e23..464a6670d8c 100755 --- a/test/CodeGen/X86/avx512-schedule.ll +++ b/test/CodeGen/X86/avx512-schedule.ll @@ -4353,7 +4353,7 @@ define i16 @trunc_16i8_to_16i1(<16 x i8> %a) { define i16 @trunc_16i32_to_16i1(<16 x i32> %a) { ; GENERIC-LABEL: trunc_16i32_to_16i1: ; GENERIC: # BB#0: -; GENERIC-NEXT: vpslld $31, %zmm0, %zmm0 +; GENERIC-NEXT: vpslld $31, %zmm0, %zmm0 # sched: [3:1.00] ; GENERIC-NEXT: vptestmd %zmm0, %zmm0, %k0 # sched: [1:1.00] ; GENERIC-NEXT: kmovd %k0, %eax ; GENERIC-NEXT: # kill: %ax %ax %eax @@ -4547,7 +4547,7 @@ define void @extload_v8i64(<8 x i8>* %a, <8 x i64>* %res) { define <64 x i16> @test21(<64 x i16> %x , <64 x i1> %mask) nounwind readnone { ; GENERIC-LABEL: test21: ; GENERIC: # BB#0: -; GENERIC-NEXT: vpsllw $7, %zmm2, %zmm2 +; GENERIC-NEXT: vpsllw $7, %zmm2, %zmm2 # sched: [3:1.00] ; GENERIC-NEXT: vpmovb2m %zmm2, %k1 ; GENERIC-NEXT: vmovdqu16 %zmm0, %zmm0 {%k1} {z} ; GENERIC-NEXT: kshiftrq $32, %k1, %k1 @@ -7703,7 +7703,7 @@ define <32 x i16> @test_build_vec_v32i1(<32 x i16> %x) { define <64 x i8> @test_build_vec_v64i1(<64 x i8> %x) { ; GENERIC-LABEL: test_build_vec_v64i1: ; GENERIC: # BB#0: -; GENERIC-NEXT: vpshufb {{.*#+}} zmm0 = zero,zero,zmm0[2],zero,zero,zero,zmm0[6],zero,zmm0[8],zero,zmm0[10],zero,zmm0[12],zero,zero,zmm0[15],zero,zero,zmm0[18],zero,zmm0[20],zero,zmm0[22],zero,zmm0[24],zero,zero,zmm0[27],zero,zero,zmm0[30],zero,zmm0[32],zero,zmm0[34],zero,zero,zero,zmm0[38],zero,zmm0[40],zero,zero,zmm0[43,44],zero,zmm0[46],zero,zmm0[48],zero,zmm0[50],zero,zero,zero,zmm0[54],zero,zmm0[56],zero,zero,zmm0[59,60],zero,zmm0[62],zero +; GENERIC-NEXT: vpshufb {{.*#+}} zmm0 = zero,zero,zmm0[2],zero,zero,zero,zmm0[6],zero,zmm0[8],zero,zmm0[10],zero,zmm0[12],zero,zero,zmm0[15],zero,zero,zmm0[18],zero,zmm0[20],zero,zmm0[22],zero,zmm0[24],zero,zero,zmm0[27],zero,zero,zmm0[30],zero,zmm0[32],zero,zmm0[34],zero,zero,zero,zmm0[38],zero,zmm0[40],zero,zero,zmm0[43,44],zero,zmm0[46],zero,zmm0[48],zero,zmm0[50],zero,zero,zero,zmm0[54],zero,zmm0[56],zero,zero,zmm0[59,60],zero,zmm0[62],zero sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_build_vec_v64i1: @@ -8035,7 +8035,7 @@ define void @store_32i1(<32 x i1>* %a, <32 x i1> %v) { define void @store_32i1_1(<32 x i1>* %a, <32 x i16> %v) { ; GENERIC-LABEL: store_32i1_1: ; GENERIC: # BB#0: -; GENERIC-NEXT: vpsllw $15, %zmm0, %zmm0 +; GENERIC-NEXT: vpsllw $15, %zmm0, %zmm0 # sched: [3:1.00] ; GENERIC-NEXT: vpmovw2m %zmm0, %k0 ; GENERIC-NEXT: kmovd %k0, (%rdi) ; GENERIC-NEXT: vzeroupper @@ -8058,7 +8058,7 @@ define void @store_64i1(<64 x i1>* %a, <64 x i1> %v) { ; ; GENERIC-LABEL: store_64i1: ; GENERIC: # BB#0: -; GENERIC-NEXT: vpsllw $7, %zmm0, %zmm0 +; GENERIC-NEXT: vpsllw $7, %zmm0, %zmm0 # sched: [3:1.00] ; GENERIC-NEXT: vpmovb2m %zmm0, %k0 ; GENERIC-NEXT: kmovq %k0, (%rdi) ; GENERIC-NEXT: vzeroupper diff --git a/test/CodeGen/X86/avx512-shuffle-schedule.ll b/test/CodeGen/X86/avx512-shuffle-schedule.ll index b59f024d9c3..7bca8d32d84 100755 --- a/test/CodeGen/X86/avx512-shuffle-schedule.ll +++ b/test/CodeGen/X86/avx512-shuffle-schedule.ll @@ -8,7 +8,7 @@ define <16 x i16> @test_16xi16_perm_mask0(<16 x i16> %vec) { ; GENERIC-LABEL: test_16xi16_perm_mask0: ; GENERIC: # BB#0: ; GENERIC-NEXT: vmovdqa {{.*#+}} ymm1 = [8,6,12,4,7,9,14,8,4,12,9,4,14,15,12,14] sched: [7:0.50] -; GENERIC-NEXT: vpermw %ymm0, %ymm1, %ymm0 +; GENERIC-NEXT: vpermw %ymm0, %ymm1, %ymm0 # sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_16xi16_perm_mask0: @@ -25,7 +25,7 @@ define <16 x i16> @test_masked_16xi16_perm_mask0(<16 x i16> %vec, <16 x i16> %ve ; GENERIC-NEXT: vmovdqa {{.*#+}} ymm3 = [8,6,12,4,7,9,14,8,4,12,9,4,14,15,12,14] sched: [7:0.50] ; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqw %ymm4, %ymm2, %k1 -; GENERIC-NEXT: vpermw %ymm0, %ymm3, %ymm1 {%k1} +; GENERIC-NEXT: vpermw %ymm0, %ymm3, %ymm1 {%k1} # sched: [1:1.00] ; GENERIC-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.50] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -49,7 +49,7 @@ define <16 x i16> @test_masked_z_16xi16_perm_mask0(<16 x i16> %vec, <16 x i16> % ; GENERIC-NEXT: vmovdqa {{.*#+}} ymm2 = [8,6,12,4,7,9,14,8,4,12,9,4,14,15,12,14] sched: [7:0.50] ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqw %ymm3, %ymm1, %k1 -; GENERIC-NEXT: vpermw %ymm0, %ymm2, %ymm0 {%k1} {z} +; GENERIC-NEXT: vpermw %ymm0, %ymm2, %ymm0 {%k1} {z} # sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_16xi16_perm_mask0: @@ -70,7 +70,7 @@ define <16 x i16> @test_masked_16xi16_perm_mask1(<16 x i16> %vec, <16 x i16> %ve ; GENERIC-NEXT: vmovdqa {{.*#+}} ymm3 = [4,11,14,10,7,1,6,9,14,15,7,13,4,12,8,0] sched: [7:0.50] ; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqw %ymm4, %ymm2, %k1 -; GENERIC-NEXT: vpermw %ymm0, %ymm3, %ymm1 {%k1} +; GENERIC-NEXT: vpermw %ymm0, %ymm3, %ymm1 {%k1} # sched: [1:1.00] ; GENERIC-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.50] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -94,7 +94,7 @@ define <16 x i16> @test_masked_z_16xi16_perm_mask1(<16 x i16> %vec, <16 x i16> % ; GENERIC-NEXT: vmovdqa {{.*#+}} ymm2 = [4,11,14,10,7,1,6,9,14,15,7,13,4,12,8,0] sched: [7:0.50] ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqw %ymm3, %ymm1, %k1 -; GENERIC-NEXT: vpermw %ymm0, %ymm2, %ymm0 {%k1} {z} +; GENERIC-NEXT: vpermw %ymm0, %ymm2, %ymm0 {%k1} {z} # sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_16xi16_perm_mask1: @@ -115,7 +115,7 @@ define <16 x i16> @test_masked_16xi16_perm_mask2(<16 x i16> %vec, <16 x i16> %ve ; GENERIC-NEXT: vmovdqa {{.*#+}} ymm3 = [11,6,13,10,0,7,13,3,5,13,3,9,3,15,12,7] sched: [7:0.50] ; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqw %ymm4, %ymm2, %k1 -; GENERIC-NEXT: vpermw %ymm0, %ymm3, %ymm1 {%k1} +; GENERIC-NEXT: vpermw %ymm0, %ymm3, %ymm1 {%k1} # sched: [1:1.00] ; GENERIC-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.50] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -139,7 +139,7 @@ define <16 x i16> @test_masked_z_16xi16_perm_mask2(<16 x i16> %vec, <16 x i16> % ; GENERIC-NEXT: vmovdqa {{.*#+}} ymm2 = [11,6,13,10,0,7,13,3,5,13,3,9,3,15,12,7] sched: [7:0.50] ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqw %ymm3, %ymm1, %k1 -; GENERIC-NEXT: vpermw %ymm0, %ymm2, %ymm0 {%k1} {z} +; GENERIC-NEXT: vpermw %ymm0, %ymm2, %ymm0 {%k1} {z} # sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_16xi16_perm_mask2: @@ -158,7 +158,7 @@ define <16 x i16> @test_16xi16_perm_mask3(<16 x i16> %vec) { ; GENERIC-LABEL: test_16xi16_perm_mask3: ; GENERIC: # BB#0: ; GENERIC-NEXT: vmovdqa {{.*#+}} ymm1 = [1,5,8,14,1,8,11,8,13,8,15,9,9,7,9,6] sched: [7:0.50] -; GENERIC-NEXT: vpermw %ymm0, %ymm1, %ymm0 +; GENERIC-NEXT: vpermw %ymm0, %ymm1, %ymm0 # sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_16xi16_perm_mask3: @@ -175,7 +175,7 @@ define <16 x i16> @test_masked_16xi16_perm_mask3(<16 x i16> %vec, <16 x i16> %ve ; GENERIC-NEXT: vmovdqa {{.*#+}} ymm3 = [1,5,8,14,1,8,11,8,13,8,15,9,9,7,9,6] sched: [7:0.50] ; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqw %ymm4, %ymm2, %k1 -; GENERIC-NEXT: vpermw %ymm0, %ymm3, %ymm1 {%k1} +; GENERIC-NEXT: vpermw %ymm0, %ymm3, %ymm1 {%k1} # sched: [1:1.00] ; GENERIC-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.50] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -199,7 +199,7 @@ define <16 x i16> @test_masked_z_16xi16_perm_mask3(<16 x i16> %vec, <16 x i16> % ; GENERIC-NEXT: vmovdqa {{.*#+}} ymm2 = [1,5,8,14,1,8,11,8,13,8,15,9,9,7,9,6] sched: [7:0.50] ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqw %ymm3, %ymm1, %k1 -; GENERIC-NEXT: vpermw %ymm0, %ymm2, %ymm0 {%k1} {z} +; GENERIC-NEXT: vpermw %ymm0, %ymm2, %ymm0 {%k1} {z} # sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_16xi16_perm_mask3: @@ -218,7 +218,7 @@ define <16 x i16> @test_16xi16_perm_mem_mask0(<16 x i16>* %vp) { ; GENERIC-LABEL: test_16xi16_perm_mem_mask0: ; GENERIC: # BB#0: ; GENERIC-NEXT: vmovdqa {{.*#+}} ymm0 = [9,10,7,1,12,14,14,13,14,14,8,6,11,4,12,13] sched: [7:0.50] -; GENERIC-NEXT: vpermw (%rdi), %ymm0, %ymm0 +; GENERIC-NEXT: vpermw (%rdi), %ymm0, %ymm0 # sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_16xi16_perm_mem_mask0: @@ -236,7 +236,7 @@ define <16 x i16> @test_masked_16xi16_perm_mem_mask0(<16 x i16>* %vp, <16 x i16> ; GENERIC-NEXT: vmovdqa {{.*#+}} ymm2 = [9,10,7,1,12,14,14,13,14,14,8,6,11,4,12,13] sched: [7:0.50] ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqw %ymm3, %ymm1, %k1 -; GENERIC-NEXT: vpermw (%rdi), %ymm2, %ymm0 {%k1} +; GENERIC-NEXT: vpermw (%rdi), %ymm2, %ymm0 {%k1} # sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_16xi16_perm_mem_mask0: @@ -259,7 +259,7 @@ define <16 x i16> @test_masked_z_16xi16_perm_mem_mask0(<16 x i16>* %vp, <16 x i1 ; GENERIC-NEXT: vmovdqa {{.*#+}} ymm1 = [9,10,7,1,12,14,14,13,14,14,8,6,11,4,12,13] sched: [7:0.50] ; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqw %ymm2, %ymm0, %k1 -; GENERIC-NEXT: vpermw (%rdi), %ymm1, %ymm0 {%k1} {z} +; GENERIC-NEXT: vpermw (%rdi), %ymm1, %ymm0 {%k1} {z} # sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_16xi16_perm_mem_mask0: @@ -282,7 +282,7 @@ define <16 x i16> @test_masked_16xi16_perm_mem_mask1(<16 x i16>* %vp, <16 x i16> ; GENERIC-NEXT: vmovdqa {{.*#+}} ymm2 = [14,9,15,9,7,10,15,14,12,1,9,7,10,13,3,11] sched: [7:0.50] ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqw %ymm3, %ymm1, %k1 -; GENERIC-NEXT: vpermw (%rdi), %ymm2, %ymm0 {%k1} +; GENERIC-NEXT: vpermw (%rdi), %ymm2, %ymm0 {%k1} # sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_16xi16_perm_mem_mask1: @@ -305,7 +305,7 @@ define <16 x i16> @test_masked_z_16xi16_perm_mem_mask1(<16 x i16>* %vp, <16 x i1 ; GENERIC-NEXT: vmovdqa {{.*#+}} ymm1 = [14,9,15,9,7,10,15,14,12,1,9,7,10,13,3,11] sched: [7:0.50] ; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqw %ymm2, %ymm0, %k1 -; GENERIC-NEXT: vpermw (%rdi), %ymm1, %ymm0 {%k1} {z} +; GENERIC-NEXT: vpermw (%rdi), %ymm1, %ymm0 {%k1} {z} # sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_16xi16_perm_mem_mask1: @@ -328,7 +328,7 @@ define <16 x i16> @test_masked_16xi16_perm_mem_mask2(<16 x i16>* %vp, <16 x i16> ; GENERIC-NEXT: vmovdqa {{.*#+}} ymm2 = [1,3,12,5,13,1,2,11,0,9,14,8,10,0,10,9] sched: [7:0.50] ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqw %ymm3, %ymm1, %k1 -; GENERIC-NEXT: vpermw (%rdi), %ymm2, %ymm0 {%k1} +; GENERIC-NEXT: vpermw (%rdi), %ymm2, %ymm0 {%k1} # sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_16xi16_perm_mem_mask2: @@ -351,7 +351,7 @@ define <16 x i16> @test_masked_z_16xi16_perm_mem_mask2(<16 x i16>* %vp, <16 x i1 ; GENERIC-NEXT: vmovdqa {{.*#+}} ymm1 = [1,3,12,5,13,1,2,11,0,9,14,8,10,0,10,9] sched: [7:0.50] ; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqw %ymm2, %ymm0, %k1 -; GENERIC-NEXT: vpermw (%rdi), %ymm1, %ymm0 {%k1} {z} +; GENERIC-NEXT: vpermw (%rdi), %ymm1, %ymm0 {%k1} {z} # sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_16xi16_perm_mem_mask2: @@ -372,7 +372,7 @@ define <16 x i16> @test_16xi16_perm_mem_mask3(<16 x i16>* %vp) { ; GENERIC-LABEL: test_16xi16_perm_mem_mask3: ; GENERIC: # BB#0: ; GENERIC-NEXT: vmovdqa {{.*#+}} ymm0 = [9,6,5,15,0,0,15,2,1,3,12,14,0,6,1,4] sched: [7:0.50] -; GENERIC-NEXT: vpermw (%rdi), %ymm0, %ymm0 +; GENERIC-NEXT: vpermw (%rdi), %ymm0, %ymm0 # sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_16xi16_perm_mem_mask3: @@ -390,7 +390,7 @@ define <16 x i16> @test_masked_16xi16_perm_mem_mask3(<16 x i16>* %vp, <16 x i16> ; GENERIC-NEXT: vmovdqa {{.*#+}} ymm2 = [9,6,5,15,0,0,15,2,1,3,12,14,0,6,1,4] sched: [7:0.50] ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqw %ymm3, %ymm1, %k1 -; GENERIC-NEXT: vpermw (%rdi), %ymm2, %ymm0 {%k1} +; GENERIC-NEXT: vpermw (%rdi), %ymm2, %ymm0 {%k1} # sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_16xi16_perm_mem_mask3: @@ -413,7 +413,7 @@ define <16 x i16> @test_masked_z_16xi16_perm_mem_mask3(<16 x i16>* %vp, <16 x i1 ; GENERIC-NEXT: vmovdqa {{.*#+}} ymm1 = [9,6,5,15,0,0,15,2,1,3,12,14,0,6,1,4] sched: [7:0.50] ; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqw %ymm2, %ymm0, %k1 -; GENERIC-NEXT: vpermw (%rdi), %ymm1, %ymm0 {%k1} {z} +; GENERIC-NEXT: vpermw (%rdi), %ymm1, %ymm0 {%k1} {z} # sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_16xi16_perm_mem_mask3: @@ -434,7 +434,7 @@ define <32 x i16> @test_32xi16_perm_mask0(<32 x i16> %vec) { ; GENERIC-LABEL: test_32xi16_perm_mask0: ; GENERIC: # BB#0: ; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm1 = [16,1,3,31,6,11,23,26,29,5,21,30,1,21,27,10,8,19,14,5,15,13,18,16,9,11,26,8,17,0,23,10] sched: [4:0.50] -; GENERIC-NEXT: vpermw %zmm0, %zmm1, %zmm0 +; GENERIC-NEXT: vpermw %zmm0, %zmm1, %zmm0 # sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_32xi16_perm_mask0: @@ -451,7 +451,7 @@ define <32 x i16> @test_masked_32xi16_perm_mask0(<32 x i16> %vec, <32 x i16> %ve ; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm3 = [16,1,3,31,6,11,23,26,29,5,21,30,1,21,27,10,8,19,14,5,15,13,18,16,9,11,26,8,17,0,23,10] sched: [4:0.50] ; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqw %zmm4, %zmm2, %k1 -; GENERIC-NEXT: vpermw %zmm0, %zmm3, %zmm1 {%k1} +; GENERIC-NEXT: vpermw %zmm0, %zmm3, %zmm1 {%k1} # sched: [1:1.00] ; GENERIC-NEXT: vmovdqa64 %zmm1, %zmm0 ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -475,7 +475,7 @@ define <32 x i16> @test_masked_z_32xi16_perm_mask0(<32 x i16> %vec, <32 x i16> % ; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm2 = [16,1,3,31,6,11,23,26,29,5,21,30,1,21,27,10,8,19,14,5,15,13,18,16,9,11,26,8,17,0,23,10] sched: [4:0.50] ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqw %zmm3, %zmm1, %k1 -; GENERIC-NEXT: vpermw %zmm0, %zmm2, %zmm0 {%k1} {z} +; GENERIC-NEXT: vpermw %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_32xi16_perm_mask0: @@ -496,7 +496,7 @@ define <32 x i16> @test_masked_32xi16_perm_mask1(<32 x i16> %vec, <32 x i16> %ve ; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm3 = [1,8,7,30,11,9,11,30,20,19,22,12,13,20,0,6,10,7,20,12,28,18,13,12,22,13,21,1,14,8,5,16] sched: [4:0.50] ; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqw %zmm4, %zmm2, %k1 -; GENERIC-NEXT: vpermw %zmm0, %zmm3, %zmm1 {%k1} +; GENERIC-NEXT: vpermw %zmm0, %zmm3, %zmm1 {%k1} # sched: [1:1.00] ; GENERIC-NEXT: vmovdqa64 %zmm1, %zmm0 ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -520,7 +520,7 @@ define <32 x i16> @test_masked_z_32xi16_perm_mask1(<32 x i16> %vec, <32 x i16> % ; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm2 = [1,8,7,30,11,9,11,30,20,19,22,12,13,20,0,6,10,7,20,12,28,18,13,12,22,13,21,1,14,8,5,16] sched: [4:0.50] ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqw %zmm3, %zmm1, %k1 -; GENERIC-NEXT: vpermw %zmm0, %zmm2, %zmm0 {%k1} {z} +; GENERIC-NEXT: vpermw %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_32xi16_perm_mask1: @@ -541,7 +541,7 @@ define <32 x i16> @test_masked_32xi16_perm_mask2(<32 x i16> %vec, <32 x i16> %ve ; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm3 = [15,17,24,28,15,9,14,25,28,25,6,31,20,2,23,31,12,21,10,6,22,0,26,16,3,3,20,27,8,31,3,27] sched: [4:0.50] ; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqw %zmm4, %zmm2, %k1 -; GENERIC-NEXT: vpermw %zmm0, %zmm3, %zmm1 {%k1} +; GENERIC-NEXT: vpermw %zmm0, %zmm3, %zmm1 {%k1} # sched: [1:1.00] ; GENERIC-NEXT: vmovdqa64 %zmm1, %zmm0 ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -565,7 +565,7 @@ define <32 x i16> @test_masked_z_32xi16_perm_mask2(<32 x i16> %vec, <32 x i16> % ; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm2 = [15,17,24,28,15,9,14,25,28,25,6,31,20,2,23,31,12,21,10,6,22,0,26,16,3,3,20,27,8,31,3,27] sched: [4:0.50] ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqw %zmm3, %zmm1, %k1 -; GENERIC-NEXT: vpermw %zmm0, %zmm2, %zmm0 {%k1} {z} +; GENERIC-NEXT: vpermw %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_32xi16_perm_mask2: @@ -584,7 +584,7 @@ define <32 x i16> @test_32xi16_perm_mask3(<32 x i16> %vec) { ; GENERIC-LABEL: test_32xi16_perm_mask3: ; GENERIC: # BB#0: ; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm1 = [12,2,8,14,25,27,4,16,20,11,27,8,0,1,21,17,30,30,29,1,23,22,20,22,28,20,11,17,6,18,0,4] sched: [4:0.50] -; GENERIC-NEXT: vpermw %zmm0, %zmm1, %zmm0 +; GENERIC-NEXT: vpermw %zmm0, %zmm1, %zmm0 # sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_32xi16_perm_mask3: @@ -601,7 +601,7 @@ define <32 x i16> @test_masked_32xi16_perm_mask3(<32 x i16> %vec, <32 x i16> %ve ; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm3 = [12,2,8,14,25,27,4,16,20,11,27,8,0,1,21,17,30,30,29,1,23,22,20,22,28,20,11,17,6,18,0,4] sched: [4:0.50] ; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqw %zmm4, %zmm2, %k1 -; GENERIC-NEXT: vpermw %zmm0, %zmm3, %zmm1 {%k1} +; GENERIC-NEXT: vpermw %zmm0, %zmm3, %zmm1 {%k1} # sched: [1:1.00] ; GENERIC-NEXT: vmovdqa64 %zmm1, %zmm0 ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -625,7 +625,7 @@ define <32 x i16> @test_masked_z_32xi16_perm_mask3(<32 x i16> %vec, <32 x i16> % ; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm2 = [12,2,8,14,25,27,4,16,20,11,27,8,0,1,21,17,30,30,29,1,23,22,20,22,28,20,11,17,6,18,0,4] sched: [4:0.50] ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqw %zmm3, %zmm1, %k1 -; GENERIC-NEXT: vpermw %zmm0, %zmm2, %zmm0 {%k1} {z} +; GENERIC-NEXT: vpermw %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_32xi16_perm_mask3: @@ -644,7 +644,7 @@ define <32 x i16> @test_32xi16_perm_mem_mask0(<32 x i16>* %vp) { ; GENERIC-LABEL: test_32xi16_perm_mem_mask0: ; GENERIC: # BB#0: ; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm0 = [19,1,5,31,9,12,17,9,15,7,1,5,16,2,12,10,13,3,29,15,26,31,10,15,22,13,9,23,28,29,20,12] sched: [4:0.50] -; GENERIC-NEXT: vpermw (%rdi), %zmm0, %zmm0 +; GENERIC-NEXT: vpermw (%rdi), %zmm0, %zmm0 # sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_32xi16_perm_mem_mask0: @@ -662,7 +662,7 @@ define <32 x i16> @test_masked_32xi16_perm_mem_mask0(<32 x i16>* %vp, <32 x i16> ; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm2 = [19,1,5,31,9,12,17,9,15,7,1,5,16,2,12,10,13,3,29,15,26,31,10,15,22,13,9,23,28,29,20,12] sched: [4:0.50] ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqw %zmm3, %zmm1, %k1 -; GENERIC-NEXT: vpermw (%rdi), %zmm2, %zmm0 {%k1} +; GENERIC-NEXT: vpermw (%rdi), %zmm2, %zmm0 {%k1} # sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_32xi16_perm_mem_mask0: @@ -685,7 +685,7 @@ define <32 x i16> @test_masked_z_32xi16_perm_mem_mask0(<32 x i16>* %vp, <32 x i1 ; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm1 = [19,1,5,31,9,12,17,9,15,7,1,5,16,2,12,10,13,3,29,15,26,31,10,15,22,13,9,23,28,29,20,12] sched: [4:0.50] ; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqw %zmm2, %zmm0, %k1 -; GENERIC-NEXT: vpermw (%rdi), %zmm1, %zmm0 {%k1} {z} +; GENERIC-NEXT: vpermw (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_32xi16_perm_mem_mask0: @@ -708,7 +708,7 @@ define <32 x i16> @test_masked_32xi16_perm_mem_mask1(<32 x i16>* %vp, <32 x i16> ; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm2 = [31,20,2,2,23,1,0,12,16,14,15,18,21,13,11,31,8,24,13,11,2,27,22,28,14,21,3,12,6,1,30,6] sched: [4:0.50] ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqw %zmm3, %zmm1, %k1 -; GENERIC-NEXT: vpermw (%rdi), %zmm2, %zmm0 {%k1} +; GENERIC-NEXT: vpermw (%rdi), %zmm2, %zmm0 {%k1} # sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_32xi16_perm_mem_mask1: @@ -731,7 +731,7 @@ define <32 x i16> @test_masked_z_32xi16_perm_mem_mask1(<32 x i16>* %vp, <32 x i1 ; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm1 = [31,20,2,2,23,1,0,12,16,14,15,18,21,13,11,31,8,24,13,11,2,27,22,28,14,21,3,12,6,1,30,6] sched: [4:0.50] ; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqw %zmm2, %zmm0, %k1 -; GENERIC-NEXT: vpermw (%rdi), %zmm1, %zmm0 {%k1} {z} +; GENERIC-NEXT: vpermw (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_32xi16_perm_mem_mask1: @@ -754,7 +754,7 @@ define <32 x i16> @test_masked_32xi16_perm_mem_mask2(<32 x i16>* %vp, <32 x i16> ; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm2 = [4,6,12,17,4,31,31,4,12,21,28,15,29,10,15,15,21,6,19,7,10,30,28,26,1,4,8,25,26,18,22,25] sched: [4:0.50] ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqw %zmm3, %zmm1, %k1 -; GENERIC-NEXT: vpermw (%rdi), %zmm2, %zmm0 {%k1} +; GENERIC-NEXT: vpermw (%rdi), %zmm2, %zmm0 {%k1} # sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_32xi16_perm_mem_mask2: @@ -777,7 +777,7 @@ define <32 x i16> @test_masked_z_32xi16_perm_mem_mask2(<32 x i16>* %vp, <32 x i1 ; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm1 = [4,6,12,17,4,31,31,4,12,21,28,15,29,10,15,15,21,6,19,7,10,30,28,26,1,4,8,25,26,18,22,25] sched: [4:0.50] ; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqw %zmm2, %zmm0, %k1 -; GENERIC-NEXT: vpermw (%rdi), %zmm1, %zmm0 {%k1} {z} +; GENERIC-NEXT: vpermw (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_32xi16_perm_mem_mask2: @@ -798,7 +798,7 @@ define <32 x i16> @test_32xi16_perm_mem_mask3(<32 x i16>* %vp) { ; GENERIC-LABEL: test_32xi16_perm_mem_mask3: ; GENERIC: # BB#0: ; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm0 = [2,2,27,1,7,1,0,27,10,5,4,20,30,16,28,16,18,21,25,24,31,23,28,6,17,19,26,15,25,12,18,27] sched: [4:0.50] -; GENERIC-NEXT: vpermw (%rdi), %zmm0, %zmm0 +; GENERIC-NEXT: vpermw (%rdi), %zmm0, %zmm0 # sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_32xi16_perm_mem_mask3: @@ -816,7 +816,7 @@ define <32 x i16> @test_masked_32xi16_perm_mem_mask3(<32 x i16>* %vp, <32 x i16> ; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm2 = [2,2,27,1,7,1,0,27,10,5,4,20,30,16,28,16,18,21,25,24,31,23,28,6,17,19,26,15,25,12,18,27] sched: [4:0.50] ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqw %zmm3, %zmm1, %k1 -; GENERIC-NEXT: vpermw (%rdi), %zmm2, %zmm0 {%k1} +; GENERIC-NEXT: vpermw (%rdi), %zmm2, %zmm0 {%k1} # sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_32xi16_perm_mem_mask3: @@ -839,7 +839,7 @@ define <32 x i16> @test_masked_z_32xi16_perm_mem_mask3(<32 x i16>* %vp, <32 x i1 ; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm1 = [2,2,27,1,7,1,0,27,10,5,4,20,30,16,28,16,18,21,25,24,31,23,28,6,17,19,26,15,25,12,18,27] sched: [4:0.50] ; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqw %zmm2, %zmm0, %k1 -; GENERIC-NEXT: vpermw (%rdi), %zmm1, %zmm0 {%k1} {z} +; GENERIC-NEXT: vpermw (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_32xi16_perm_mem_mask3: @@ -877,7 +877,7 @@ define <8 x i32> @test_masked_8xi32_perm_mask0(<8 x i32> %vec, <8 x i32> %vec2, ; GENERIC-NEXT: vmovdqa {{.*#+}} ymm3 = [4,2,0,6,7,2,3,6] sched: [7:0.50] ; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqd %ymm4, %ymm2, %k1 -; GENERIC-NEXT: vpermd %ymm0, %ymm3, %ymm1 {%k1} +; GENERIC-NEXT: vpermd %ymm0, %ymm3, %ymm1 {%k1} # sched: [1:1.00] ; GENERIC-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.50] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -901,7 +901,7 @@ define <8 x i32> @test_masked_z_8xi32_perm_mask0(<8 x i32> %vec, <8 x i32> %mask ; GENERIC-NEXT: vmovdqa {{.*#+}} ymm2 = [4,2,0,6,7,2,3,6] sched: [7:0.50] ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqd %ymm3, %ymm1, %k1 -; GENERIC-NEXT: vpermd %ymm0, %ymm2, %ymm0 {%k1} {z} +; GENERIC-NEXT: vpermd %ymm0, %ymm2, %ymm0 {%k1} {z} # sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_8xi32_perm_mask0: @@ -922,7 +922,7 @@ define <8 x i32> @test_masked_8xi32_perm_mask1(<8 x i32> %vec, <8 x i32> %vec2, ; GENERIC-NEXT: vmovdqa {{.*#+}} ymm3 = [0,5,1,2,6,0,0,3] sched: [7:0.50] ; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqd %ymm4, %ymm2, %k1 -; GENERIC-NEXT: vpermd %ymm0, %ymm3, %ymm1 {%k1} +; GENERIC-NEXT: vpermd %ymm0, %ymm3, %ymm1 {%k1} # sched: [1:1.00] ; GENERIC-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.50] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -946,7 +946,7 @@ define <8 x i32> @test_masked_z_8xi32_perm_mask1(<8 x i32> %vec, <8 x i32> %mask ; GENERIC-NEXT: vmovdqa {{.*#+}} ymm2 = [0,5,1,2,6,0,0,3] sched: [7:0.50] ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqd %ymm3, %ymm1, %k1 -; GENERIC-NEXT: vpermd %ymm0, %ymm2, %ymm0 {%k1} {z} +; GENERIC-NEXT: vpermd %ymm0, %ymm2, %ymm0 {%k1} {z} # sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_8xi32_perm_mask1: @@ -967,7 +967,7 @@ define <8 x i32> @test_masked_8xi32_perm_mask2(<8 x i32> %vec, <8 x i32> %vec2, ; GENERIC-NEXT: vmovdqa {{.*#+}} ymm3 = [3,6,5,5,1,7,3,4] sched: [7:0.50] ; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqd %ymm4, %ymm2, %k1 -; GENERIC-NEXT: vpermd %ymm0, %ymm3, %ymm1 {%k1} +; GENERIC-NEXT: vpermd %ymm0, %ymm3, %ymm1 {%k1} # sched: [1:1.00] ; GENERIC-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.50] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -991,7 +991,7 @@ define <8 x i32> @test_masked_z_8xi32_perm_mask2(<8 x i32> %vec, <8 x i32> %mask ; GENERIC-NEXT: vmovdqa {{.*#+}} ymm2 = [3,6,5,5,1,7,3,4] sched: [7:0.50] ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqd %ymm3, %ymm1, %k1 -; GENERIC-NEXT: vpermd %ymm0, %ymm2, %ymm0 {%k1} {z} +; GENERIC-NEXT: vpermd %ymm0, %ymm2, %ymm0 {%k1} {z} # sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_8xi32_perm_mask2: @@ -1027,7 +1027,7 @@ define <8 x i32> @test_masked_8xi32_perm_mask3(<8 x i32> %vec, <8 x i32> %vec2, ; GENERIC-NEXT: vmovdqa {{.*#+}} ymm3 = [3,0,3,1,0,4,5,0] sched: [7:0.50] ; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqd %ymm4, %ymm2, %k1 -; GENERIC-NEXT: vpermd %ymm0, %ymm3, %ymm1 {%k1} +; GENERIC-NEXT: vpermd %ymm0, %ymm3, %ymm1 {%k1} # sched: [1:1.00] ; GENERIC-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.50] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -1051,7 +1051,7 @@ define <8 x i32> @test_masked_z_8xi32_perm_mask3(<8 x i32> %vec, <8 x i32> %mask ; GENERIC-NEXT: vmovdqa {{.*#+}} ymm2 = [3,0,3,1,0,4,5,0] sched: [7:0.50] ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqd %ymm3, %ymm1, %k1 -; GENERIC-NEXT: vpermd %ymm0, %ymm2, %ymm0 {%k1} {z} +; GENERIC-NEXT: vpermd %ymm0, %ymm2, %ymm0 {%k1} {z} # sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_8xi32_perm_mask3: @@ -1088,7 +1088,7 @@ define <8 x i32> @test_masked_8xi32_perm_mem_mask0(<8 x i32>* %vp, <8 x i32> %ve ; GENERIC-NEXT: vmovdqa {{.*#+}} ymm2 = [3,7,4,3,5,2,0,5] sched: [7:0.50] ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqd %ymm3, %ymm1, %k1 -; GENERIC-NEXT: vpermd (%rdi), %ymm2, %ymm0 {%k1} +; GENERIC-NEXT: vpermd (%rdi), %ymm2, %ymm0 {%k1} # sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_8xi32_perm_mem_mask0: @@ -1111,7 +1111,7 @@ define <8 x i32> @test_masked_z_8xi32_perm_mem_mask0(<8 x i32>* %vp, <8 x i32> % ; GENERIC-NEXT: vmovdqa {{.*#+}} ymm1 = [3,7,4,3,5,2,0,5] sched: [7:0.50] ; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqd %ymm2, %ymm0, %k1 -; GENERIC-NEXT: vpermd (%rdi), %ymm1, %ymm0 {%k1} {z} +; GENERIC-NEXT: vpermd (%rdi), %ymm1, %ymm0 {%k1} {z} # sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_8xi32_perm_mem_mask0: @@ -1134,7 +1134,7 @@ define <8 x i32> @test_masked_8xi32_perm_mem_mask1(<8 x i32>* %vp, <8 x i32> %ve ; GENERIC-NEXT: vmovdqa {{.*#+}} ymm2 = [4,6,1,7,6,7,6,5] sched: [7:0.50] ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqd %ymm3, %ymm1, %k1 -; GENERIC-NEXT: vpermd (%rdi), %ymm2, %ymm0 {%k1} +; GENERIC-NEXT: vpermd (%rdi), %ymm2, %ymm0 {%k1} # sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_8xi32_perm_mem_mask1: @@ -1157,7 +1157,7 @@ define <8 x i32> @test_masked_z_8xi32_perm_mem_mask1(<8 x i32>* %vp, <8 x i32> % ; GENERIC-NEXT: vmovdqa {{.*#+}} ymm1 = [4,6,1,7,6,7,6,5] sched: [7:0.50] ; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqd %ymm2, %ymm0, %k1 -; GENERIC-NEXT: vpermd (%rdi), %ymm1, %ymm0 {%k1} {z} +; GENERIC-NEXT: vpermd (%rdi), %ymm1, %ymm0 {%k1} {z} # sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_8xi32_perm_mem_mask1: @@ -1180,7 +1180,7 @@ define <8 x i32> @test_masked_8xi32_perm_mem_mask2(<8 x i32>* %vp, <8 x i32> %ve ; GENERIC-NEXT: vmovdqa {{.*#+}} ymm2 = [6,4,6,1,6,3,6,3] sched: [7:0.50] ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqd %ymm3, %ymm1, %k1 -; GENERIC-NEXT: vpermd (%rdi), %ymm2, %ymm0 {%k1} +; GENERIC-NEXT: vpermd (%rdi), %ymm2, %ymm0 {%k1} # sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_8xi32_perm_mem_mask2: @@ -1203,7 +1203,7 @@ define <8 x i32> @test_masked_z_8xi32_perm_mem_mask2(<8 x i32>* %vp, <8 x i32> % ; GENERIC-NEXT: vmovdqa {{.*#+}} ymm1 = [6,4,6,1,6,3,6,3] sched: [7:0.50] ; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqd %ymm2, %ymm0, %k1 -; GENERIC-NEXT: vpermd (%rdi), %ymm1, %ymm0 {%k1} {z} +; GENERIC-NEXT: vpermd (%rdi), %ymm1, %ymm0 {%k1} {z} # sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_8xi32_perm_mem_mask2: @@ -1242,7 +1242,7 @@ define <8 x i32> @test_masked_8xi32_perm_mem_mask3(<8 x i32>* %vp, <8 x i32> %ve ; GENERIC-NEXT: vmovdqa {{.*#+}} ymm2 = [6,0,0,7,3,7,7,5] sched: [7:0.50] ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqd %ymm3, %ymm1, %k1 -; GENERIC-NEXT: vpermd (%rdi), %ymm2, %ymm0 {%k1} +; GENERIC-NEXT: vpermd (%rdi), %ymm2, %ymm0 {%k1} # sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_8xi32_perm_mem_mask3: @@ -1265,7 +1265,7 @@ define <8 x i32> @test_masked_z_8xi32_perm_mem_mask3(<8 x i32>* %vp, <8 x i32> % ; GENERIC-NEXT: vmovdqa {{.*#+}} ymm1 = [6,0,0,7,3,7,7,5] sched: [7:0.50] ; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqd %ymm2, %ymm0, %k1 -; GENERIC-NEXT: vpermd (%rdi), %ymm1, %ymm0 {%k1} {z} +; GENERIC-NEXT: vpermd (%rdi), %ymm1, %ymm0 {%k1} {z} # sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_8xi32_perm_mem_mask3: @@ -1286,7 +1286,7 @@ define <16 x i32> @test_16xi32_perm_mask0(<16 x i32> %vec, <16 x i32> %mask) { ; GENERIC-LABEL: test_16xi32_perm_mask0: ; GENERIC: # BB#0: ; GENERIC-NEXT: vmovaps {{.*#+}} zmm1 = [14,12,11,6,4,1,6,9,14,14,6,1,12,11,0,7] sched: [4:0.50] -; GENERIC-NEXT: vpermps %zmm0, %zmm1, %zmm0 +; GENERIC-NEXT: vpermps %zmm0, %zmm1, %zmm0 # sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_16xi32_perm_mask0: @@ -1303,7 +1303,7 @@ define <16 x i32> @test_masked_16xi32_perm_mask0(<16 x i32> %vec, <16 x i32> %ve ; GENERIC-NEXT: vmovdqa32 {{.*#+}} zmm3 = [14,12,11,6,4,1,6,9,14,14,6,1,12,11,0,7] sched: [4:0.50] ; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqd %zmm4, %zmm2, %k1 -; GENERIC-NEXT: vpermd %zmm0, %zmm3, %zmm1 {%k1} +; GENERIC-NEXT: vpermd %zmm0, %zmm3, %zmm1 {%k1} # sched: [1:1.00] ; GENERIC-NEXT: vmovdqa64 %zmm1, %zmm0 ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -1327,7 +1327,7 @@ define <16 x i32> @test_masked_z_16xi32_perm_mask0(<16 x i32> %vec, <16 x i32> % ; GENERIC-NEXT: vmovdqa32 {{.*#+}} zmm2 = [14,12,11,6,4,1,6,9,14,14,6,1,12,11,0,7] sched: [4:0.50] ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqd %zmm3, %zmm1, %k1 -; GENERIC-NEXT: vpermd %zmm0, %zmm2, %zmm0 {%k1} {z} +; GENERIC-NEXT: vpermd %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_16xi32_perm_mask0: @@ -1348,7 +1348,7 @@ define <16 x i32> @test_masked_16xi32_perm_mask1(<16 x i32> %vec, <16 x i32> %ve ; GENERIC-NEXT: vmovdqa32 {{.*#+}} zmm3 = [10,0,14,15,11,1,1,5,0,5,0,15,13,1,14,3] sched: [4:0.50] ; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqd %zmm4, %zmm2, %k1 -; GENERIC-NEXT: vpermd %zmm0, %zmm3, %zmm1 {%k1} +; GENERIC-NEXT: vpermd %zmm0, %zmm3, %zmm1 {%k1} # sched: [1:1.00] ; GENERIC-NEXT: vmovdqa64 %zmm1, %zmm0 ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -1372,7 +1372,7 @@ define <16 x i32> @test_masked_z_16xi32_perm_mask1(<16 x i32> %vec, <16 x i32> % ; GENERIC-NEXT: vmovdqa32 {{.*#+}} zmm2 = [10,0,14,15,11,1,1,5,0,5,0,15,13,1,14,3] sched: [4:0.50] ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqd %zmm3, %zmm1, %k1 -; GENERIC-NEXT: vpermd %zmm0, %zmm2, %zmm0 {%k1} {z} +; GENERIC-NEXT: vpermd %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_16xi32_perm_mask1: @@ -1393,7 +1393,7 @@ define <16 x i32> @test_masked_16xi32_perm_mask2(<16 x i32> %vec, <16 x i32> %ve ; GENERIC-NEXT: vmovdqa32 {{.*#+}} zmm3 = [3,10,15,1,0,5,0,9,13,2,1,5,15,2,15,5] sched: [4:0.50] ; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqd %zmm4, %zmm2, %k1 -; GENERIC-NEXT: vpermd %zmm0, %zmm3, %zmm1 {%k1} +; GENERIC-NEXT: vpermd %zmm0, %zmm3, %zmm1 {%k1} # sched: [1:1.00] ; GENERIC-NEXT: vmovdqa64 %zmm1, %zmm0 ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -1417,7 +1417,7 @@ define <16 x i32> @test_masked_z_16xi32_perm_mask2(<16 x i32> %vec, <16 x i32> % ; GENERIC-NEXT: vmovdqa32 {{.*#+}} zmm2 = [3,10,15,1,0,5,0,9,13,2,1,5,15,2,15,5] sched: [4:0.50] ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqd %zmm3, %zmm1, %k1 -; GENERIC-NEXT: vpermd %zmm0, %zmm2, %zmm0 {%k1} {z} +; GENERIC-NEXT: vpermd %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_16xi32_perm_mask2: @@ -1436,7 +1436,7 @@ define <16 x i32> @test_16xi32_perm_mask3(<16 x i32> %vec) { ; GENERIC-LABEL: test_16xi32_perm_mask3: ; GENERIC: # BB#0: ; GENERIC-NEXT: vmovaps {{.*#+}} zmm1 = [7,4,14,15,10,2,15,1,9,2,14,15,12,5,3,12] sched: [4:0.50] -; GENERIC-NEXT: vpermps %zmm0, %zmm1, %zmm0 +; GENERIC-NEXT: vpermps %zmm0, %zmm1, %zmm0 # sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_16xi32_perm_mask3: @@ -1453,7 +1453,7 @@ define <16 x i32> @test_masked_16xi32_perm_mask3(<16 x i32> %vec, <16 x i32> %ve ; GENERIC-NEXT: vmovdqa32 {{.*#+}} zmm3 = [7,4,14,15,10,2,15,1,9,2,14,15,12,5,3,12] sched: [4:0.50] ; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqd %zmm4, %zmm2, %k1 -; GENERIC-NEXT: vpermd %zmm0, %zmm3, %zmm1 {%k1} +; GENERIC-NEXT: vpermd %zmm0, %zmm3, %zmm1 {%k1} # sched: [1:1.00] ; GENERIC-NEXT: vmovdqa64 %zmm1, %zmm0 ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -1477,7 +1477,7 @@ define <16 x i32> @test_masked_z_16xi32_perm_mask3(<16 x i32> %vec, <16 x i32> % ; GENERIC-NEXT: vmovdqa32 {{.*#+}} zmm2 = [7,4,14,15,10,2,15,1,9,2,14,15,12,5,3,12] sched: [4:0.50] ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqd %zmm3, %zmm1, %k1 -; GENERIC-NEXT: vpermd %zmm0, %zmm2, %zmm0 {%k1} {z} +; GENERIC-NEXT: vpermd %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_16xi32_perm_mask3: @@ -1496,7 +1496,7 @@ define <16 x i32> @test_16xi32_perm_mem_mask0(<16 x i32>* %vp) { ; GENERIC-LABEL: test_16xi32_perm_mem_mask0: ; GENERIC: # BB#0: ; GENERIC-NEXT: vmovaps {{.*#+}} zmm0 = [0,1,1,6,8,11,2,6,10,1,7,5,15,0,6,6] sched: [4:0.50] -; GENERIC-NEXT: vpermps (%rdi), %zmm0, %zmm0 +; GENERIC-NEXT: vpermps (%rdi), %zmm0, %zmm0 # sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_16xi32_perm_mem_mask0: @@ -1514,7 +1514,7 @@ define <16 x i32> @test_masked_16xi32_perm_mem_mask0(<16 x i32>* %vp, <16 x i32> ; GENERIC-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,1,1,6,8,11,2,6,10,1,7,5,15,0,6,6] sched: [4:0.50] ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqd %zmm3, %zmm1, %k1 -; GENERIC-NEXT: vpermd (%rdi), %zmm2, %zmm0 {%k1} +; GENERIC-NEXT: vpermd (%rdi), %zmm2, %zmm0 {%k1} # sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_16xi32_perm_mem_mask0: @@ -1537,7 +1537,7 @@ define <16 x i32> @test_masked_z_16xi32_perm_mem_mask0(<16 x i32>* %vp, <16 x i3 ; GENERIC-NEXT: vmovdqa32 {{.*#+}} zmm1 = [0,1,1,6,8,11,2,6,10,1,7,5,15,0,6,6] sched: [4:0.50] ; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqd %zmm2, %zmm0, %k1 -; GENERIC-NEXT: vpermd (%rdi), %zmm1, %zmm0 {%k1} {z} +; GENERIC-NEXT: vpermd (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_16xi32_perm_mem_mask0: @@ -1560,7 +1560,7 @@ define <16 x i32> @test_masked_16xi32_perm_mem_mask1(<16 x i32>* %vp, <16 x i32> ; GENERIC-NEXT: vmovdqa32 {{.*#+}} zmm2 = [11,5,3,4,7,15,12,4,8,11,12,7,6,12,6,3] sched: [4:0.50] ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqd %zmm3, %zmm1, %k1 -; GENERIC-NEXT: vpermd (%rdi), %zmm2, %zmm0 {%k1} +; GENERIC-NEXT: vpermd (%rdi), %zmm2, %zmm0 {%k1} # sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_16xi32_perm_mem_mask1: @@ -1583,7 +1583,7 @@ define <16 x i32> @test_masked_z_16xi32_perm_mem_mask1(<16 x i32>* %vp, <16 x i3 ; GENERIC-NEXT: vmovdqa32 {{.*#+}} zmm1 = [11,5,3,4,7,15,12,4,8,11,12,7,6,12,6,3] sched: [4:0.50] ; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqd %zmm2, %zmm0, %k1 -; GENERIC-NEXT: vpermd (%rdi), %zmm1, %zmm0 {%k1} {z} +; GENERIC-NEXT: vpermd (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_16xi32_perm_mem_mask1: @@ -1606,7 +1606,7 @@ define <16 x i32> @test_masked_16xi32_perm_mem_mask2(<16 x i32>* %vp, <16 x i32> ; GENERIC-NEXT: vmovdqa32 {{.*#+}} zmm2 = [7,14,2,7,10,7,3,0,11,9,0,4,12,10,8,2] sched: [4:0.50] ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqd %zmm3, %zmm1, %k1 -; GENERIC-NEXT: vpermd (%rdi), %zmm2, %zmm0 {%k1} +; GENERIC-NEXT: vpermd (%rdi), %zmm2, %zmm0 {%k1} # sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_16xi32_perm_mem_mask2: @@ -1629,7 +1629,7 @@ define <16 x i32> @test_masked_z_16xi32_perm_mem_mask2(<16 x i32>* %vp, <16 x i3 ; GENERIC-NEXT: vmovdqa32 {{.*#+}} zmm1 = [7,14,2,7,10,7,3,0,11,9,0,4,12,10,8,2] sched: [4:0.50] ; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqd %zmm2, %zmm0, %k1 -; GENERIC-NEXT: vpermd (%rdi), %zmm1, %zmm0 {%k1} {z} +; GENERIC-NEXT: vpermd (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_16xi32_perm_mem_mask2: @@ -1650,7 +1650,7 @@ define <16 x i32> @test_16xi32_perm_mem_mask3(<16 x i32>* %vp) { ; GENERIC-LABEL: test_16xi32_perm_mem_mask3: ; GENERIC: # BB#0: ; GENERIC-NEXT: vmovaps {{.*#+}} zmm0 = [11,7,10,12,3,12,4,15,1,14,0,4,8,9,6,1] sched: [4:0.50] -; GENERIC-NEXT: vpermps (%rdi), %zmm0, %zmm0 +; GENERIC-NEXT: vpermps (%rdi), %zmm0, %zmm0 # sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_16xi32_perm_mem_mask3: @@ -1668,7 +1668,7 @@ define <16 x i32> @test_masked_16xi32_perm_mem_mask3(<16 x i32>* %vp, <16 x i32> ; GENERIC-NEXT: vmovdqa32 {{.*#+}} zmm2 = [11,7,10,12,3,12,4,15,1,14,0,4,8,9,6,1] sched: [4:0.50] ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqd %zmm3, %zmm1, %k1 -; GENERIC-NEXT: vpermd (%rdi), %zmm2, %zmm0 {%k1} +; GENERIC-NEXT: vpermd (%rdi), %zmm2, %zmm0 {%k1} # sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_16xi32_perm_mem_mask3: @@ -1691,7 +1691,7 @@ define <16 x i32> @test_masked_z_16xi32_perm_mem_mask3(<16 x i32>* %vp, <16 x i3 ; GENERIC-NEXT: vmovdqa32 {{.*#+}} zmm1 = [11,7,10,12,3,12,4,15,1,14,0,4,8,9,6,1] sched: [4:0.50] ; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqd %zmm2, %zmm0, %k1 -; GENERIC-NEXT: vpermd (%rdi), %zmm1, %zmm0 {%k1} {z} +; GENERIC-NEXT: vpermd (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_16xi32_perm_mem_mask3: @@ -1726,7 +1726,7 @@ define <4 x i64> @test_masked_4xi64_perm_mask0(<4 x i64> %vec, <4 x i64> %vec2, ; GENERIC: # BB#0: ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 -; GENERIC-NEXT: vpermq {{.*#+}} ymm1 {%k1} = ymm0[2,0,3,1] +; GENERIC-NEXT: vpermq {{.*#+}} ymm1 {%k1} = ymm0[2,0,3,1] sched: [1:1.00] ; GENERIC-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.50] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -1748,7 +1748,7 @@ define <4 x i64> @test_masked_z_4xi64_perm_mask0(<4 x i64> %vec, <4 x i64> %mask ; GENERIC: # BB#0: ; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 -; GENERIC-NEXT: vpermq {{.*#+}} ymm0 {%k1} {z} = ymm0[2,0,3,1] +; GENERIC-NEXT: vpermq {{.*#+}} ymm0 {%k1} {z} = ymm0[2,0,3,1] sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_4xi64_perm_mask0: @@ -1767,7 +1767,7 @@ define <4 x i64> @test_masked_4xi64_perm_mask1(<4 x i64> %vec, <4 x i64> %vec2, ; GENERIC: # BB#0: ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 -; GENERIC-NEXT: vpermq {{.*#+}} ymm1 {%k1} = ymm0[1,2,0,3] +; GENERIC-NEXT: vpermq {{.*#+}} ymm1 {%k1} = ymm0[1,2,0,3] sched: [1:1.00] ; GENERIC-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.50] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -1789,7 +1789,7 @@ define <4 x i64> @test_masked_z_4xi64_perm_mask1(<4 x i64> %vec, <4 x i64> %mask ; GENERIC: # BB#0: ; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 -; GENERIC-NEXT: vpermq {{.*#+}} ymm0 {%k1} {z} = ymm0[1,2,0,3] +; GENERIC-NEXT: vpermq {{.*#+}} ymm0 {%k1} {z} = ymm0[1,2,0,3] sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_4xi64_perm_mask1: @@ -1808,7 +1808,7 @@ define <4 x i64> @test_masked_4xi64_perm_mask2(<4 x i64> %vec, <4 x i64> %vec2, ; GENERIC: # BB#0: ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 -; GENERIC-NEXT: vpermq {{.*#+}} ymm1 {%k1} = ymm0[2,2,2,1] +; GENERIC-NEXT: vpermq {{.*#+}} ymm1 {%k1} = ymm0[2,2,2,1] sched: [1:1.00] ; GENERIC-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.50] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -1830,7 +1830,7 @@ define <4 x i64> @test_masked_z_4xi64_perm_mask2(<4 x i64> %vec, <4 x i64> %mask ; GENERIC: # BB#0: ; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 -; GENERIC-NEXT: vpermq {{.*#+}} ymm0 {%k1} {z} = ymm0[2,2,2,1] +; GENERIC-NEXT: vpermq {{.*#+}} ymm0 {%k1} {z} = ymm0[2,2,2,1] sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_4xi64_perm_mask2: @@ -1862,7 +1862,7 @@ define <4 x i64> @test_masked_4xi64_perm_mask3(<4 x i64> %vec, <4 x i64> %vec2, ; GENERIC: # BB#0: ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 -; GENERIC-NEXT: vpermq {{.*#+}} ymm1 {%k1} = ymm0[2,1,3,3] +; GENERIC-NEXT: vpermq {{.*#+}} ymm1 {%k1} = ymm0[2,1,3,3] sched: [1:1.00] ; GENERIC-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.50] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -1884,7 +1884,7 @@ define <4 x i64> @test_masked_z_4xi64_perm_mask3(<4 x i64> %vec, <4 x i64> %mask ; GENERIC: # BB#0: ; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 -; GENERIC-NEXT: vpermq {{.*#+}} ymm0 {%k1} {z} = ymm0[2,1,3,3] +; GENERIC-NEXT: vpermq {{.*#+}} ymm0 {%k1} {z} = ymm0[2,1,3,3] sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_4xi64_perm_mask3: @@ -1917,7 +1917,7 @@ define <4 x i64> @test_masked_4xi64_perm_mem_mask0(<4 x i64>* %vp, <4 x i64> %ve ; GENERIC: # BB#0: ; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 -; GENERIC-NEXT: vpermq {{.*#+}} ymm0 {%k1} = mem[2,1,2,0] +; GENERIC-NEXT: vpermq {{.*#+}} ymm0 {%k1} = mem[2,1,2,0] sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_4xi64_perm_mem_mask0: @@ -1938,7 +1938,7 @@ define <4 x i64> @test_masked_z_4xi64_perm_mem_mask0(<4 x i64>* %vp, <4 x i64> % ; GENERIC: # BB#0: ; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqq %ymm1, %ymm0, %k1 -; GENERIC-NEXT: vpermq {{.*#+}} ymm0 {%k1} {z} = mem[2,1,2,0] +; GENERIC-NEXT: vpermq {{.*#+}} ymm0 {%k1} {z} = mem[2,1,2,0] sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_4xi64_perm_mem_mask0: @@ -1959,7 +1959,7 @@ define <4 x i64> @test_masked_4xi64_perm_mem_mask1(<4 x i64>* %vp, <4 x i64> %ve ; GENERIC: # BB#0: ; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 -; GENERIC-NEXT: vpermq {{.*#+}} ymm0 {%k1} = mem[2,1,1,1] +; GENERIC-NEXT: vpermq {{.*#+}} ymm0 {%k1} = mem[2,1,1,1] sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_4xi64_perm_mem_mask1: @@ -1980,7 +1980,7 @@ define <4 x i64> @test_masked_z_4xi64_perm_mem_mask1(<4 x i64>* %vp, <4 x i64> % ; GENERIC: # BB#0: ; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqq %ymm1, %ymm0, %k1 -; GENERIC-NEXT: vpermq {{.*#+}} ymm0 {%k1} {z} = mem[2,1,1,1] +; GENERIC-NEXT: vpermq {{.*#+}} ymm0 {%k1} {z} = mem[2,1,1,1] sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_4xi64_perm_mem_mask1: @@ -2001,7 +2001,7 @@ define <4 x i64> @test_masked_4xi64_perm_mem_mask2(<4 x i64>* %vp, <4 x i64> %ve ; GENERIC: # BB#0: ; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 -; GENERIC-NEXT: vpermq {{.*#+}} ymm0 {%k1} = mem[0,1,2,0] +; GENERIC-NEXT: vpermq {{.*#+}} ymm0 {%k1} = mem[0,1,2,0] sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_4xi64_perm_mem_mask2: @@ -2022,7 +2022,7 @@ define <4 x i64> @test_masked_z_4xi64_perm_mem_mask2(<4 x i64>* %vp, <4 x i64> % ; GENERIC: # BB#0: ; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqq %ymm1, %ymm0, %k1 -; GENERIC-NEXT: vpermq {{.*#+}} ymm0 {%k1} {z} = mem[0,1,2,0] +; GENERIC-NEXT: vpermq {{.*#+}} ymm0 {%k1} {z} = mem[0,1,2,0] sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_4xi64_perm_mem_mask2: @@ -2057,7 +2057,7 @@ define <4 x i64> @test_masked_4xi64_perm_mem_mask3(<4 x i64>* %vp, <4 x i64> %ve ; GENERIC: # BB#0: ; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 -; GENERIC-NEXT: vpermq {{.*#+}} ymm0 {%k1} = mem[2,0,1,3] +; GENERIC-NEXT: vpermq {{.*#+}} ymm0 {%k1} = mem[2,0,1,3] sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_4xi64_perm_mem_mask3: @@ -2078,7 +2078,7 @@ define <4 x i64> @test_masked_z_4xi64_perm_mem_mask3(<4 x i64>* %vp, <4 x i64> % ; GENERIC: # BB#0: ; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqq %ymm1, %ymm0, %k1 -; GENERIC-NEXT: vpermq {{.*#+}} ymm0 {%k1} {z} = mem[2,0,1,3] +; GENERIC-NEXT: vpermq {{.*#+}} ymm0 {%k1} {z} = mem[2,0,1,3] sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_4xi64_perm_mem_mask3: @@ -2098,7 +2098,7 @@ define <8 x i64> @test_8xi64_perm_mask0(<8 x i64> %vec) { ; GENERIC-LABEL: test_8xi64_perm_mask0: ; GENERIC: # BB#0: ; GENERIC-NEXT: vmovaps {{.*#+}} zmm1 = [0,4,7,6,5,5,1,6] sched: [4:0.50] -; GENERIC-NEXT: vpermpd %zmm0, %zmm1, %zmm0 +; GENERIC-NEXT: vpermpd %zmm0, %zmm1, %zmm0 # sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_8xi64_perm_mask0: @@ -2115,7 +2115,7 @@ define <8 x i64> @test_masked_8xi64_perm_mask0(<8 x i64> %vec, <8 x i64> %vec2, ; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,4,7,6,5,5,1,6] sched: [4:0.50] ; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqq %zmm4, %zmm2, %k1 -; GENERIC-NEXT: vpermq %zmm0, %zmm3, %zmm1 {%k1} +; GENERIC-NEXT: vpermq %zmm0, %zmm3, %zmm1 {%k1} # sched: [1:1.00] ; GENERIC-NEXT: vmovdqa64 %zmm1, %zmm0 ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -2139,7 +2139,7 @@ define <8 x i64> @test_masked_z_8xi64_perm_mask0(<8 x i64> %vec, <8 x i64> %mask ; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,4,7,6,5,5,1,6] sched: [4:0.50] ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqq %zmm3, %zmm1, %k1 -; GENERIC-NEXT: vpermq %zmm0, %zmm2, %zmm0 {%k1} {z} +; GENERIC-NEXT: vpermq %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_8xi64_perm_mask0: @@ -2159,7 +2159,7 @@ define <8 x i64> @test_masked_8xi64_perm_imm_mask1(<8 x i64> %vec, <8 x i64> %ve ; GENERIC: # BB#0: ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 -; GENERIC-NEXT: vpermq {{.*#+}} zmm1 {%k1} = zmm0[1,0,1,1,5,4,5,5] +; GENERIC-NEXT: vpermq {{.*#+}} zmm1 {%k1} = zmm0[1,0,1,1,5,4,5,5] sched: [1:1.00] ; GENERIC-NEXT: vmovdqa64 %zmm1, %zmm0 ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -2181,7 +2181,7 @@ define <8 x i64> @test_masked_z_8xi64_perm_imm_mask1(<8 x i64> %vec, <8 x i64> % ; GENERIC: # BB#0: ; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqq %zmm2, %zmm1, %k1 -; GENERIC-NEXT: vpermq {{.*#+}} zmm0 {%k1} {z} = zmm0[1,0,1,1,5,4,5,5] +; GENERIC-NEXT: vpermq {{.*#+}} zmm0 {%k1} {z} = zmm0[1,0,1,1,5,4,5,5] sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_8xi64_perm_imm_mask1: @@ -2201,7 +2201,7 @@ define <8 x i64> @test_masked_8xi64_perm_mask2(<8 x i64> %vec, <8 x i64> %vec2, ; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm3 = [1,3,7,3,3,5,4,1] sched: [4:0.50] ; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqq %zmm4, %zmm2, %k1 -; GENERIC-NEXT: vpermq %zmm0, %zmm3, %zmm1 {%k1} +; GENERIC-NEXT: vpermq %zmm0, %zmm3, %zmm1 {%k1} # sched: [1:1.00] ; GENERIC-NEXT: vmovdqa64 %zmm1, %zmm0 ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -2225,7 +2225,7 @@ define <8 x i64> @test_masked_z_8xi64_perm_mask2(<8 x i64> %vec, <8 x i64> %mask ; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm2 = [1,3,7,3,3,5,4,1] sched: [4:0.50] ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqq %zmm3, %zmm1, %k1 -; GENERIC-NEXT: vpermq %zmm0, %zmm2, %zmm0 {%k1} {z} +; GENERIC-NEXT: vpermq %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_8xi64_perm_mask2: @@ -2243,7 +2243,7 @@ define <8 x i64> @test_masked_z_8xi64_perm_mask2(<8 x i64> %vec, <8 x i64> %mask define <8 x i64> @test_8xi64_perm_imm_mask3(<8 x i64> %vec) { ; GENERIC-LABEL: test_8xi64_perm_imm_mask3: ; GENERIC: # BB#0: -; GENERIC-NEXT: vpermpd {{.*#+}} zmm0 = zmm0[3,1,3,1,7,5,7,5] +; GENERIC-NEXT: vpermpd {{.*#+}} zmm0 = zmm0[3,1,3,1,7,5,7,5] sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_8xi64_perm_imm_mask3: @@ -2258,7 +2258,7 @@ define <8 x i64> @test_masked_8xi64_perm_imm_mask3(<8 x i64> %vec, <8 x i64> %ve ; GENERIC: # BB#0: ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 -; GENERIC-NEXT: vpermq {{.*#+}} zmm1 {%k1} = zmm0[3,1,3,1,7,5,7,5] +; GENERIC-NEXT: vpermq {{.*#+}} zmm1 {%k1} = zmm0[3,1,3,1,7,5,7,5] sched: [1:1.00] ; GENERIC-NEXT: vmovdqa64 %zmm1, %zmm0 ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -2280,7 +2280,7 @@ define <8 x i64> @test_masked_z_8xi64_perm_imm_mask3(<8 x i64> %vec, <8 x i64> % ; GENERIC: # BB#0: ; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqq %zmm2, %zmm1, %k1 -; GENERIC-NEXT: vpermq {{.*#+}} zmm0 {%k1} {z} = zmm0[3,1,3,1,7,5,7,5] +; GENERIC-NEXT: vpermq {{.*#+}} zmm0 {%k1} {z} = zmm0[3,1,3,1,7,5,7,5] sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_8xi64_perm_imm_mask3: @@ -2300,7 +2300,7 @@ define <8 x i64> @test_masked_8xi64_perm_mask4(<8 x i64> %vec, <8 x i64> %vec2, ; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm3 = [6,3,1,1,7,4,0,3] sched: [4:0.50] ; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqq %zmm4, %zmm2, %k1 -; GENERIC-NEXT: vpermq %zmm0, %zmm3, %zmm1 {%k1} +; GENERIC-NEXT: vpermq %zmm0, %zmm3, %zmm1 {%k1} # sched: [1:1.00] ; GENERIC-NEXT: vmovdqa64 %zmm1, %zmm0 ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -2324,7 +2324,7 @@ define <8 x i64> @test_masked_z_8xi64_perm_mask4(<8 x i64> %vec, <8 x i64> %mask ; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm2 = [6,3,1,1,7,4,0,3] sched: [4:0.50] ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqq %zmm3, %zmm1, %k1 -; GENERIC-NEXT: vpermq %zmm0, %zmm2, %zmm0 {%k1} {z} +; GENERIC-NEXT: vpermq %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_8xi64_perm_mask4: @@ -2344,7 +2344,7 @@ define <8 x i64> @test_masked_8xi64_perm_imm_mask5(<8 x i64> %vec, <8 x i64> %ve ; GENERIC: # BB#0: ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 -; GENERIC-NEXT: vpermq {{.*#+}} zmm1 {%k1} = zmm0[0,0,0,0,4,4,4,4] +; GENERIC-NEXT: vpermq {{.*#+}} zmm1 {%k1} = zmm0[0,0,0,0,4,4,4,4] sched: [1:1.00] ; GENERIC-NEXT: vmovdqa64 %zmm1, %zmm0 ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -2366,7 +2366,7 @@ define <8 x i64> @test_masked_z_8xi64_perm_imm_mask5(<8 x i64> %vec, <8 x i64> % ; GENERIC: # BB#0: ; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqq %zmm2, %zmm1, %k1 -; GENERIC-NEXT: vpermq {{.*#+}} zmm0 {%k1} {z} = zmm0[0,0,0,0,4,4,4,4] +; GENERIC-NEXT: vpermq {{.*#+}} zmm0 {%k1} {z} = zmm0[0,0,0,0,4,4,4,4] sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_8xi64_perm_imm_mask5: @@ -2384,7 +2384,7 @@ define <8 x i64> @test_8xi64_perm_mask6(<8 x i64> %vec) { ; GENERIC-LABEL: test_8xi64_perm_mask6: ; GENERIC: # BB#0: ; GENERIC-NEXT: vmovaps {{.*#+}} zmm1 = [5,1,4,4,5,4,2,7] sched: [4:0.50] -; GENERIC-NEXT: vpermpd %zmm0, %zmm1, %zmm0 +; GENERIC-NEXT: vpermpd %zmm0, %zmm1, %zmm0 # sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_8xi64_perm_mask6: @@ -2401,7 +2401,7 @@ define <8 x i64> @test_masked_8xi64_perm_mask6(<8 x i64> %vec, <8 x i64> %vec2, ; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm3 = [5,1,4,4,5,4,2,7] sched: [4:0.50] ; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqq %zmm4, %zmm2, %k1 -; GENERIC-NEXT: vpermq %zmm0, %zmm3, %zmm1 {%k1} +; GENERIC-NEXT: vpermq %zmm0, %zmm3, %zmm1 {%k1} # sched: [1:1.00] ; GENERIC-NEXT: vmovdqa64 %zmm1, %zmm0 ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -2425,7 +2425,7 @@ define <8 x i64> @test_masked_z_8xi64_perm_mask6(<8 x i64> %vec, <8 x i64> %mask ; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm2 = [5,1,4,4,5,4,2,7] sched: [4:0.50] ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqq %zmm3, %zmm1, %k1 -; GENERIC-NEXT: vpermq %zmm0, %zmm2, %zmm0 {%k1} {z} +; GENERIC-NEXT: vpermq %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_8xi64_perm_mask6: @@ -2445,7 +2445,7 @@ define <8 x i64> @test_masked_8xi64_perm_imm_mask7(<8 x i64> %vec, <8 x i64> %ve ; GENERIC: # BB#0: ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 -; GENERIC-NEXT: vpermq {{.*#+}} zmm1 {%k1} = zmm0[3,3,3,3,7,7,7,7] +; GENERIC-NEXT: vpermq {{.*#+}} zmm1 {%k1} = zmm0[3,3,3,3,7,7,7,7] sched: [1:1.00] ; GENERIC-NEXT: vmovdqa64 %zmm1, %zmm0 ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -2467,7 +2467,7 @@ define <8 x i64> @test_masked_z_8xi64_perm_imm_mask7(<8 x i64> %vec, <8 x i64> % ; GENERIC: # BB#0: ; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqq %zmm2, %zmm1, %k1 -; GENERIC-NEXT: vpermq {{.*#+}} zmm0 {%k1} {z} = zmm0[3,3,3,3,7,7,7,7] +; GENERIC-NEXT: vpermq {{.*#+}} zmm0 {%k1} {z} = zmm0[3,3,3,3,7,7,7,7] sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_8xi64_perm_imm_mask7: @@ -2485,7 +2485,7 @@ define <8 x i64> @test_8xi64_perm_mem_mask0(<8 x i64>* %vp) { ; GENERIC-LABEL: test_8xi64_perm_mem_mask0: ; GENERIC: # BB#0: ; GENERIC-NEXT: vmovaps {{.*#+}} zmm0 = [5,1,6,5,7,3,7,3] sched: [4:0.50] -; GENERIC-NEXT: vpermpd (%rdi), %zmm0, %zmm0 +; GENERIC-NEXT: vpermpd (%rdi), %zmm0, %zmm0 # sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_8xi64_perm_mem_mask0: @@ -2503,7 +2503,7 @@ define <8 x i64> @test_masked_8xi64_perm_mem_mask0(<8 x i64>* %vp, <8 x i64> %ve ; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm2 = [5,1,6,5,7,3,7,3] sched: [4:0.50] ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqq %zmm3, %zmm1, %k1 -; GENERIC-NEXT: vpermq (%rdi), %zmm2, %zmm0 {%k1} +; GENERIC-NEXT: vpermq (%rdi), %zmm2, %zmm0 {%k1} # sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_8xi64_perm_mem_mask0: @@ -2526,7 +2526,7 @@ define <8 x i64> @test_masked_z_8xi64_perm_mem_mask0(<8 x i64>* %vp, <8 x i64> % ; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm1 = [5,1,6,5,7,3,7,3] sched: [4:0.50] ; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqq %zmm2, %zmm0, %k1 -; GENERIC-NEXT: vpermq (%rdi), %zmm1, %zmm0 {%k1} {z} +; GENERIC-NEXT: vpermq (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_8xi64_perm_mem_mask0: @@ -2548,7 +2548,7 @@ define <8 x i64> @test_masked_8xi64_perm_imm_mem_mask1(<8 x i64>* %vp, <8 x i64> ; GENERIC: # BB#0: ; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqq %zmm2, %zmm1, %k1 -; GENERIC-NEXT: vpermq {{.*#+}} zmm0 {%k1} = mem[1,1,1,0,5,5,5,4] +; GENERIC-NEXT: vpermq {{.*#+}} zmm0 {%k1} = mem[1,1,1,0,5,5,5,4] sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_8xi64_perm_imm_mem_mask1: @@ -2569,7 +2569,7 @@ define <8 x i64> @test_masked_z_8xi64_perm_imm_mem_mask1(<8 x i64>* %vp, <8 x i6 ; GENERIC: # BB#0: ; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqq %zmm1, %zmm0, %k1 -; GENERIC-NEXT: vpermq {{.*#+}} zmm0 {%k1} {z} = mem[1,1,1,0,5,5,5,4] +; GENERIC-NEXT: vpermq {{.*#+}} zmm0 {%k1} {z} = mem[1,1,1,0,5,5,5,4] sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_8xi64_perm_imm_mem_mask1: @@ -2591,7 +2591,7 @@ define <8 x i64> @test_masked_8xi64_perm_mem_mask2(<8 x i64>* %vp, <8 x i64> %ve ; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,2,1,4,1,1,5,5] sched: [4:0.50] ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqq %zmm3, %zmm1, %k1 -; GENERIC-NEXT: vpermq (%rdi), %zmm2, %zmm0 {%k1} +; GENERIC-NEXT: vpermq (%rdi), %zmm2, %zmm0 {%k1} # sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_8xi64_perm_mem_mask2: @@ -2614,7 +2614,7 @@ define <8 x i64> @test_masked_z_8xi64_perm_mem_mask2(<8 x i64>* %vp, <8 x i64> % ; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,2,1,4,1,1,5,5] sched: [4:0.50] ; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqq %zmm2, %zmm0, %k1 -; GENERIC-NEXT: vpermq (%rdi), %zmm1, %zmm0 {%k1} {z} +; GENERIC-NEXT: vpermq (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_8xi64_perm_mem_mask2: @@ -2634,7 +2634,7 @@ define <8 x i64> @test_masked_z_8xi64_perm_mem_mask2(<8 x i64>* %vp, <8 x i64> % define <8 x i64> @test_8xi64_perm_imm_mem_mask3(<8 x i64>* %vp) { ; GENERIC-LABEL: test_8xi64_perm_imm_mem_mask3: ; GENERIC: # BB#0: -; GENERIC-NEXT: vpermpd {{.*#+}} zmm0 = mem[1,3,1,1,5,7,5,5] +; GENERIC-NEXT: vpermpd {{.*#+}} zmm0 = mem[1,3,1,1,5,7,5,5] sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_8xi64_perm_imm_mem_mask3: @@ -2650,7 +2650,7 @@ define <8 x i64> @test_masked_8xi64_perm_imm_mem_mask3(<8 x i64>* %vp, <8 x i64> ; GENERIC: # BB#0: ; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqq %zmm2, %zmm1, %k1 -; GENERIC-NEXT: vpermq {{.*#+}} zmm0 {%k1} = mem[1,3,1,1,5,7,5,5] +; GENERIC-NEXT: vpermq {{.*#+}} zmm0 {%k1} = mem[1,3,1,1,5,7,5,5] sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_8xi64_perm_imm_mem_mask3: @@ -2671,7 +2671,7 @@ define <8 x i64> @test_masked_z_8xi64_perm_imm_mem_mask3(<8 x i64>* %vp, <8 x i6 ; GENERIC: # BB#0: ; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqq %zmm1, %zmm0, %k1 -; GENERIC-NEXT: vpermq {{.*#+}} zmm0 {%k1} {z} = mem[1,3,1,1,5,7,5,5] +; GENERIC-NEXT: vpermq {{.*#+}} zmm0 {%k1} {z} = mem[1,3,1,1,5,7,5,5] sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_8xi64_perm_imm_mem_mask3: @@ -2693,7 +2693,7 @@ define <8 x i64> @test_masked_8xi64_perm_mem_mask4(<8 x i64>* %vp, <8 x i64> %ve ; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm2 = [5,0,7,0,3,5,0,6] sched: [4:0.50] ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqq %zmm3, %zmm1, %k1 -; GENERIC-NEXT: vpermq (%rdi), %zmm2, %zmm0 {%k1} +; GENERIC-NEXT: vpermq (%rdi), %zmm2, %zmm0 {%k1} # sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_8xi64_perm_mem_mask4: @@ -2716,7 +2716,7 @@ define <8 x i64> @test_masked_z_8xi64_perm_mem_mask4(<8 x i64>* %vp, <8 x i64> % ; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm1 = [5,0,7,0,3,5,0,6] sched: [4:0.50] ; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqq %zmm2, %zmm0, %k1 -; GENERIC-NEXT: vpermq (%rdi), %zmm1, %zmm0 {%k1} {z} +; GENERIC-NEXT: vpermq (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_8xi64_perm_mem_mask4: @@ -2738,7 +2738,7 @@ define <8 x i64> @test_masked_8xi64_perm_imm_mem_mask5(<8 x i64>* %vp, <8 x i64> ; GENERIC: # BB#0: ; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqq %zmm2, %zmm1, %k1 -; GENERIC-NEXT: vpermq {{.*#+}} zmm0 {%k1} = mem[3,1,0,0,7,5,4,4] +; GENERIC-NEXT: vpermq {{.*#+}} zmm0 {%k1} = mem[3,1,0,0,7,5,4,4] sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_8xi64_perm_imm_mem_mask5: @@ -2759,7 +2759,7 @@ define <8 x i64> @test_masked_z_8xi64_perm_imm_mem_mask5(<8 x i64>* %vp, <8 x i6 ; GENERIC: # BB#0: ; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqq %zmm1, %zmm0, %k1 -; GENERIC-NEXT: vpermq {{.*#+}} zmm0 {%k1} {z} = mem[3,1,0,0,7,5,4,4] +; GENERIC-NEXT: vpermq {{.*#+}} zmm0 {%k1} {z} = mem[3,1,0,0,7,5,4,4] sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_8xi64_perm_imm_mem_mask5: @@ -2779,7 +2779,7 @@ define <8 x i64> @test_8xi64_perm_mem_mask6(<8 x i64>* %vp) { ; GENERIC-LABEL: test_8xi64_perm_mem_mask6: ; GENERIC: # BB#0: ; GENERIC-NEXT: vmovaps {{.*#+}} zmm0 = [0,6,3,7,3,0,3,6] sched: [4:0.50] -; GENERIC-NEXT: vpermpd (%rdi), %zmm0, %zmm0 +; GENERIC-NEXT: vpermpd (%rdi), %zmm0, %zmm0 # sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_8xi64_perm_mem_mask6: @@ -2797,7 +2797,7 @@ define <8 x i64> @test_masked_8xi64_perm_mem_mask6(<8 x i64>* %vp, <8 x i64> %ve ; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,6,3,7,3,0,3,6] sched: [4:0.50] ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqq %zmm3, %zmm1, %k1 -; GENERIC-NEXT: vpermq (%rdi), %zmm2, %zmm0 {%k1} +; GENERIC-NEXT: vpermq (%rdi), %zmm2, %zmm0 {%k1} # sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_8xi64_perm_mem_mask6: @@ -2820,7 +2820,7 @@ define <8 x i64> @test_masked_z_8xi64_perm_mem_mask6(<8 x i64>* %vp, <8 x i64> % ; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,6,3,7,3,0,3,6] sched: [4:0.50] ; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqq %zmm2, %zmm0, %k1 -; GENERIC-NEXT: vpermq (%rdi), %zmm1, %zmm0 {%k1} {z} +; GENERIC-NEXT: vpermq (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_8xi64_perm_mem_mask6: @@ -2842,7 +2842,7 @@ define <8 x i64> @test_masked_8xi64_perm_imm_mem_mask7(<8 x i64>* %vp, <8 x i64> ; GENERIC: # BB#0: ; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqq %zmm2, %zmm1, %k1 -; GENERIC-NEXT: vpermq {{.*#+}} zmm0 {%k1} = mem[3,0,0,1,7,4,4,5] +; GENERIC-NEXT: vpermq {{.*#+}} zmm0 {%k1} = mem[3,0,0,1,7,4,4,5] sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_8xi64_perm_imm_mem_mask7: @@ -2863,7 +2863,7 @@ define <8 x i64> @test_masked_z_8xi64_perm_imm_mem_mask7(<8 x i64>* %vp, <8 x i6 ; GENERIC: # BB#0: ; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqq %zmm1, %zmm0, %k1 -; GENERIC-NEXT: vpermq {{.*#+}} zmm0 {%k1} {z} = mem[3,0,0,1,7,4,4,5] +; GENERIC-NEXT: vpermq {{.*#+}} zmm0 {%k1} {z} = mem[3,0,0,1,7,4,4,5] sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_8xi64_perm_imm_mem_mask7: @@ -2900,7 +2900,7 @@ define <8 x float> @test_masked_8xfloat_perm_mask0(<8 x float> %vec, <8 x float> ; GENERIC-NEXT: vmovaps {{.*#+}} ymm3 = [3,4,2,4,1,2,3,4] sched: [7:0.50] ; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqd %ymm4, %ymm2, %k1 -; GENERIC-NEXT: vpermps %ymm0, %ymm3, %ymm1 {%k1} +; GENERIC-NEXT: vpermps %ymm0, %ymm3, %ymm1 {%k1} # sched: [1:1.00] ; GENERIC-NEXT: vmovaps %ymm1, %ymm0 # sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -2924,7 +2924,7 @@ define <8 x float> @test_masked_z_8xfloat_perm_mask0(<8 x float> %vec, <8 x i32> ; GENERIC-NEXT: vmovaps {{.*#+}} ymm2 = [3,4,2,4,1,2,3,4] sched: [7:0.50] ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqd %ymm3, %ymm1, %k1 -; GENERIC-NEXT: vpermps %ymm0, %ymm2, %ymm0 {%k1} {z} +; GENERIC-NEXT: vpermps %ymm0, %ymm2, %ymm0 {%k1} {z} # sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_8xfloat_perm_mask0: @@ -2945,7 +2945,7 @@ define <8 x float> @test_masked_8xfloat_perm_mask1(<8 x float> %vec, <8 x float> ; GENERIC-NEXT: vmovaps {{.*#+}} ymm3 = [4,2,1,0,6,0,5,1] sched: [7:0.50] ; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqd %ymm4, %ymm2, %k1 -; GENERIC-NEXT: vpermps %ymm0, %ymm3, %ymm1 {%k1} +; GENERIC-NEXT: vpermps %ymm0, %ymm3, %ymm1 {%k1} # sched: [1:1.00] ; GENERIC-NEXT: vmovaps %ymm1, %ymm0 # sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -2969,7 +2969,7 @@ define <8 x float> @test_masked_z_8xfloat_perm_mask1(<8 x float> %vec, <8 x i64> ; GENERIC-NEXT: vmovaps {{.*#+}} ymm2 = [4,2,1,0,6,0,5,1] sched: [7:0.50] ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqq %zmm3, %zmm1, %k1 -; GENERIC-NEXT: vpermps %ymm0, %ymm2, %ymm0 {%k1} {z} +; GENERIC-NEXT: vpermps %ymm0, %ymm2, %ymm0 {%k1} {z} # sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_8xfloat_perm_mask1: @@ -2990,7 +2990,7 @@ define <8 x float> @test_masked_8xfloat_perm_mask2(<8 x float> %vec, <8 x float> ; GENERIC-NEXT: vmovaps {{.*#+}} ymm3 = [2,5,5,5,4,6,0,5] sched: [7:0.50] ; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqd %ymm4, %ymm2, %k1 -; GENERIC-NEXT: vpermps %ymm0, %ymm3, %ymm1 {%k1} +; GENERIC-NEXT: vpermps %ymm0, %ymm3, %ymm1 {%k1} # sched: [1:1.00] ; GENERIC-NEXT: vmovaps %ymm1, %ymm0 # sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -3014,7 +3014,7 @@ define <8 x float> @test_masked_z_8xfloat_perm_mask2(<8 x float> %vec, <8 x i32> ; GENERIC-NEXT: vmovaps {{.*#+}} ymm2 = [2,5,5,5,4,6,0,5] sched: [7:0.50] ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqd %ymm3, %ymm1, %k1 -; GENERIC-NEXT: vpermps %ymm0, %ymm2, %ymm0 {%k1} {z} +; GENERIC-NEXT: vpermps %ymm0, %ymm2, %ymm0 {%k1} {z} # sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_8xfloat_perm_mask2: @@ -3050,7 +3050,7 @@ define <8 x float> @test_masked_8xfloat_perm_mask3(<8 x float> %vec, <8 x float> ; GENERIC-NEXT: vmovaps {{.*#+}} ymm3 = [0,5,2,5,5,5,1,6] sched: [7:0.50] ; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqd %ymm4, %ymm2, %k1 -; GENERIC-NEXT: vpermps %ymm0, %ymm3, %ymm1 {%k1} +; GENERIC-NEXT: vpermps %ymm0, %ymm3, %ymm1 {%k1} # sched: [1:1.00] ; GENERIC-NEXT: vmovaps %ymm1, %ymm0 # sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -3074,7 +3074,7 @@ define <8 x float> @test_masked_z_8xfloat_perm_mask3(<8 x float> %vec, <8 x i32> ; GENERIC-NEXT: vmovaps {{.*#+}} ymm2 = [0,5,2,5,5,5,1,6] sched: [7:0.50] ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqd %ymm3, %ymm1, %k1 -; GENERIC-NEXT: vpermps %ymm0, %ymm2, %ymm0 {%k1} {z} +; GENERIC-NEXT: vpermps %ymm0, %ymm2, %ymm0 {%k1} {z} # sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_8xfloat_perm_mask3: @@ -3111,7 +3111,7 @@ define <8 x float> @test_masked_8xfloat_perm_mem_mask0(<8 x float>* %vp, <8 x fl ; GENERIC-NEXT: vmovaps {{.*#+}} ymm2 = [5,2,1,6,4,2,4,0] sched: [7:0.50] ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqd %ymm3, %ymm1, %k1 -; GENERIC-NEXT: vpermps (%rdi), %ymm2, %ymm0 {%k1} +; GENERIC-NEXT: vpermps (%rdi), %ymm2, %ymm0 {%k1} # sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_8xfloat_perm_mem_mask0: @@ -3134,7 +3134,7 @@ define <8 x float> @test_masked_z_8xfloat_perm_mem_mask0(<8 x float>* %vp, <8 x ; GENERIC-NEXT: vmovaps {{.*#+}} ymm1 = [5,2,1,6,4,2,4,0] sched: [7:0.50] ; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqd %ymm2, %ymm0, %k1 -; GENERIC-NEXT: vpermps (%rdi), %ymm1, %ymm0 {%k1} {z} +; GENERIC-NEXT: vpermps (%rdi), %ymm1, %ymm0 {%k1} {z} # sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_8xfloat_perm_mem_mask0: @@ -3157,7 +3157,7 @@ define <8 x float> @test_masked_8xfloat_perm_mem_mask1(<8 x float>* %vp, <8 x fl ; GENERIC-NEXT: vmovaps {{.*#+}} ymm2 = [1,3,7,4,0,6,6,6] sched: [7:0.50] ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqd %ymm3, %ymm1, %k1 -; GENERIC-NEXT: vpermps (%rdi), %ymm2, %ymm0 {%k1} +; GENERIC-NEXT: vpermps (%rdi), %ymm2, %ymm0 {%k1} # sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_8xfloat_perm_mem_mask1: @@ -3180,7 +3180,7 @@ define <8 x float> @test_masked_z_8xfloat_perm_mem_mask1(<8 x float>* %vp, <8 x ; GENERIC-NEXT: vmovaps {{.*#+}} ymm1 = [1,3,7,4,0,6,6,6] sched: [7:0.50] ; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqd %ymm2, %ymm0, %k1 -; GENERIC-NEXT: vpermps (%rdi), %ymm1, %ymm0 {%k1} {z} +; GENERIC-NEXT: vpermps (%rdi), %ymm1, %ymm0 {%k1} {z} # sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_8xfloat_perm_mem_mask1: @@ -3203,7 +3203,7 @@ define <8 x float> @test_masked_8xfloat_perm_mem_mask2(<8 x float>* %vp, <8 x fl ; GENERIC-NEXT: vmovaps {{.*#+}} ymm2 = [4,5,1,5,6,6,2,4] sched: [7:0.50] ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqd %ymm3, %ymm1, %k1 -; GENERIC-NEXT: vpermps (%rdi), %ymm2, %ymm0 {%k1} +; GENERIC-NEXT: vpermps (%rdi), %ymm2, %ymm0 {%k1} # sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_8xfloat_perm_mem_mask2: @@ -3226,7 +3226,7 @@ define <8 x float> @test_masked_z_8xfloat_perm_mem_mask2(<8 x float>* %vp, <8 x ; GENERIC-NEXT: vmovaps {{.*#+}} ymm1 = [4,5,1,5,6,6,2,4] sched: [7:0.50] ; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqd %ymm2, %ymm0, %k1 -; GENERIC-NEXT: vpermps (%rdi), %ymm1, %ymm0 {%k1} {z} +; GENERIC-NEXT: vpermps (%rdi), %ymm1, %ymm0 {%k1} {z} # sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_8xfloat_perm_mem_mask2: @@ -3265,7 +3265,7 @@ define <8 x float> @test_masked_8xfloat_perm_mem_mask3(<8 x float>* %vp, <8 x fl ; GENERIC-NEXT: vmovaps {{.*#+}} ymm2 = [5,7,0,6,4,2,3,0] sched: [7:0.50] ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqd %ymm3, %ymm1, %k1 -; GENERIC-NEXT: vpermps (%rdi), %ymm2, %ymm0 {%k1} +; GENERIC-NEXT: vpermps (%rdi), %ymm2, %ymm0 {%k1} # sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_8xfloat_perm_mem_mask3: @@ -3288,7 +3288,7 @@ define <8 x float> @test_masked_z_8xfloat_perm_mem_mask3(<8 x float>* %vp, <8 x ; GENERIC-NEXT: vmovaps {{.*#+}} ymm1 = [5,7,0,6,4,2,3,0] sched: [7:0.50] ; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqd %ymm2, %ymm0, %k1 -; GENERIC-NEXT: vpermps (%rdi), %ymm1, %ymm0 {%k1} {z} +; GENERIC-NEXT: vpermps (%rdi), %ymm1, %ymm0 {%k1} {z} # sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_8xfloat_perm_mem_mask3: @@ -3309,7 +3309,7 @@ define <16 x float> @test_16xfloat_perm_mask0(<16 x float> %vec) { ; GENERIC-LABEL: test_16xfloat_perm_mask0: ; GENERIC: # BB#0: ; GENERIC-NEXT: vmovaps {{.*#+}} zmm1 = [15,7,5,13,4,9,11,13,12,6,0,0,11,15,5,7] sched: [4:0.50] -; GENERIC-NEXT: vpermps %zmm0, %zmm1, %zmm0 +; GENERIC-NEXT: vpermps %zmm0, %zmm1, %zmm0 # sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_16xfloat_perm_mask0: @@ -3326,7 +3326,7 @@ define <16 x float> @test_masked_16xfloat_perm_mask0(<16 x float> %vec, <16 x fl ; GENERIC-NEXT: vmovaps {{.*#+}} zmm3 = [15,7,5,13,4,9,11,13,12,6,0,0,11,15,5,7] sched: [4:0.50] ; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqd %zmm4, %zmm2, %k1 -; GENERIC-NEXT: vpermps %zmm0, %zmm3, %zmm1 {%k1} +; GENERIC-NEXT: vpermps %zmm0, %zmm3, %zmm1 {%k1} # sched: [1:1.00] ; GENERIC-NEXT: vmovaps %zmm1, %zmm0 ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -3350,7 +3350,7 @@ define <16 x float> @test_masked_z_16xfloat_perm_mask0(<16 x float> %vec, <16 x ; GENERIC-NEXT: vmovaps {{.*#+}} zmm2 = [15,7,5,13,4,9,11,13,12,6,0,0,11,15,5,7] sched: [4:0.50] ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqd %zmm3, %zmm1, %k1 -; GENERIC-NEXT: vpermps %zmm0, %zmm2, %zmm0 {%k1} {z} +; GENERIC-NEXT: vpermps %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_16xfloat_perm_mask0: @@ -3371,7 +3371,7 @@ define <16 x float> @test_masked_16xfloat_perm_mask1(<16 x float> %vec, <16 x fl ; GENERIC-NEXT: vmovaps {{.*#+}} zmm3 = [11,10,4,10,4,5,8,11,2,0,10,0,0,3,10,1] sched: [4:0.50] ; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqd %zmm4, %zmm2, %k1 -; GENERIC-NEXT: vpermps %zmm0, %zmm3, %zmm1 {%k1} +; GENERIC-NEXT: vpermps %zmm0, %zmm3, %zmm1 {%k1} # sched: [1:1.00] ; GENERIC-NEXT: vmovaps %zmm1, %zmm0 ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -3395,7 +3395,7 @@ define <16 x float> @test_masked_z_16xfloat_perm_mask1(<16 x float> %vec, <16 x ; GENERIC-NEXT: vmovaps {{.*#+}} zmm2 = [11,10,4,10,4,5,8,11,2,0,10,0,0,3,10,1] sched: [4:0.50] ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqd %zmm3, %zmm1, %k1 -; GENERIC-NEXT: vpermps %zmm0, %zmm2, %zmm0 {%k1} {z} +; GENERIC-NEXT: vpermps %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_16xfloat_perm_mask1: @@ -3416,7 +3416,7 @@ define <16 x float> @test_masked_16xfloat_perm_mask2(<16 x float> %vec, <16 x fl ; GENERIC-NEXT: vmovaps {{.*#+}} zmm3 = [0,15,6,14,3,6,5,2,5,15,11,6,6,4,8,11] sched: [4:0.50] ; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqd %zmm4, %zmm2, %k1 -; GENERIC-NEXT: vpermps %zmm0, %zmm3, %zmm1 {%k1} +; GENERIC-NEXT: vpermps %zmm0, %zmm3, %zmm1 {%k1} # sched: [1:1.00] ; GENERIC-NEXT: vmovaps %zmm1, %zmm0 ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -3440,7 +3440,7 @@ define <16 x float> @test_masked_z_16xfloat_perm_mask2(<16 x float> %vec, <16 x ; GENERIC-NEXT: vmovaps {{.*#+}} zmm2 = [0,15,6,14,3,6,5,2,5,15,11,6,6,4,8,11] sched: [4:0.50] ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqd %zmm3, %zmm1, %k1 -; GENERIC-NEXT: vpermps %zmm0, %zmm2, %zmm0 {%k1} {z} +; GENERIC-NEXT: vpermps %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_16xfloat_perm_mask2: @@ -3459,7 +3459,7 @@ define <16 x float> @test_16xfloat_perm_mask3(<16 x float> %vec) { ; GENERIC-LABEL: test_16xfloat_perm_mask3: ; GENERIC: # BB#0: ; GENERIC-NEXT: vmovaps {{.*#+}} zmm1 = [10,7,0,14,6,6,0,2,13,8,11,2,5,13,13,3] sched: [4:0.50] -; GENERIC-NEXT: vpermps %zmm0, %zmm1, %zmm0 +; GENERIC-NEXT: vpermps %zmm0, %zmm1, %zmm0 # sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_16xfloat_perm_mask3: @@ -3476,7 +3476,7 @@ define <16 x float> @test_masked_16xfloat_perm_mask3(<16 x float> %vec, <16 x fl ; GENERIC-NEXT: vmovaps {{.*#+}} zmm3 = [10,7,0,14,6,6,0,2,13,8,11,2,5,13,13,3] sched: [4:0.50] ; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqd %zmm4, %zmm2, %k1 -; GENERIC-NEXT: vpermps %zmm0, %zmm3, %zmm1 {%k1} +; GENERIC-NEXT: vpermps %zmm0, %zmm3, %zmm1 {%k1} # sched: [1:1.00] ; GENERIC-NEXT: vmovaps %zmm1, %zmm0 ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -3500,7 +3500,7 @@ define <16 x float> @test_masked_z_16xfloat_perm_mask3(<16 x float> %vec, <16 x ; GENERIC-NEXT: vmovaps {{.*#+}} zmm2 = [10,7,0,14,6,6,0,2,13,8,11,2,5,13,13,3] sched: [4:0.50] ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqd %zmm3, %zmm1, %k1 -; GENERIC-NEXT: vpermps %zmm0, %zmm2, %zmm0 {%k1} {z} +; GENERIC-NEXT: vpermps %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_16xfloat_perm_mask3: @@ -3519,7 +3519,7 @@ define <16 x float> @test_16xfloat_perm_mem_mask0(<16 x float>* %vp) { ; GENERIC-LABEL: test_16xfloat_perm_mem_mask0: ; GENERIC: # BB#0: ; GENERIC-NEXT: vmovaps {{.*#+}} zmm0 = [10,2,1,14,9,9,7,2,9,4,12,11,0,14,0,1] sched: [4:0.50] -; GENERIC-NEXT: vpermps (%rdi), %zmm0, %zmm0 +; GENERIC-NEXT: vpermps (%rdi), %zmm0, %zmm0 # sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_16xfloat_perm_mem_mask0: @@ -3537,7 +3537,7 @@ define <16 x float> @test_masked_16xfloat_perm_mem_mask0(<16 x float>* %vp, <16 ; GENERIC-NEXT: vmovaps {{.*#+}} zmm2 = [10,2,1,14,9,9,7,2,9,4,12,11,0,14,0,1] sched: [4:0.50] ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqd %zmm3, %zmm1, %k1 -; GENERIC-NEXT: vpermps (%rdi), %zmm2, %zmm0 {%k1} +; GENERIC-NEXT: vpermps (%rdi), %zmm2, %zmm0 {%k1} # sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_16xfloat_perm_mem_mask0: @@ -3560,7 +3560,7 @@ define <16 x float> @test_masked_z_16xfloat_perm_mem_mask0(<16 x float>* %vp, <1 ; GENERIC-NEXT: vmovaps {{.*#+}} zmm1 = [10,2,1,14,9,9,7,2,9,4,12,11,0,14,0,1] sched: [4:0.50] ; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqd %zmm2, %zmm0, %k1 -; GENERIC-NEXT: vpermps (%rdi), %zmm1, %zmm0 {%k1} {z} +; GENERIC-NEXT: vpermps (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_16xfloat_perm_mem_mask0: @@ -3583,7 +3583,7 @@ define <16 x float> @test_masked_16xfloat_perm_mem_mask1(<16 x float>* %vp, <16 ; GENERIC-NEXT: vmovaps {{.*#+}} zmm2 = [4,2,3,5,11,6,4,7,6,4,14,8,15,12,9,4] sched: [4:0.50] ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqd %zmm3, %zmm1, %k1 -; GENERIC-NEXT: vpermps (%rdi), %zmm2, %zmm0 {%k1} +; GENERIC-NEXT: vpermps (%rdi), %zmm2, %zmm0 {%k1} # sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_16xfloat_perm_mem_mask1: @@ -3606,7 +3606,7 @@ define <16 x float> @test_masked_z_16xfloat_perm_mem_mask1(<16 x float>* %vp, <1 ; GENERIC-NEXT: vmovaps {{.*#+}} zmm1 = [4,2,3,5,11,6,4,7,6,4,14,8,15,12,9,4] sched: [4:0.50] ; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqd %zmm2, %zmm0, %k1 -; GENERIC-NEXT: vpermps (%rdi), %zmm1, %zmm0 {%k1} {z} +; GENERIC-NEXT: vpermps (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_16xfloat_perm_mem_mask1: @@ -3629,7 +3629,7 @@ define <16 x float> @test_masked_16xfloat_perm_mem_mask2(<16 x float>* %vp, <16 ; GENERIC-NEXT: vmovaps {{.*#+}} zmm2 = [10,7,11,6,7,0,11,0,10,9,12,4,10,3,8,5] sched: [4:0.50] ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqd %zmm3, %zmm1, %k1 -; GENERIC-NEXT: vpermps (%rdi), %zmm2, %zmm0 {%k1} +; GENERIC-NEXT: vpermps (%rdi), %zmm2, %zmm0 {%k1} # sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_16xfloat_perm_mem_mask2: @@ -3652,7 +3652,7 @@ define <16 x float> @test_masked_z_16xfloat_perm_mem_mask2(<16 x float>* %vp, <1 ; GENERIC-NEXT: vmovaps {{.*#+}} zmm1 = [10,7,11,6,7,0,11,0,10,9,12,4,10,3,8,5] sched: [4:0.50] ; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqd %zmm2, %zmm0, %k1 -; GENERIC-NEXT: vpermps (%rdi), %zmm1, %zmm0 {%k1} {z} +; GENERIC-NEXT: vpermps (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_16xfloat_perm_mem_mask2: @@ -3673,7 +3673,7 @@ define <16 x float> @test_16xfloat_perm_mem_mask3(<16 x float>* %vp) { ; GENERIC-LABEL: test_16xfloat_perm_mem_mask3: ; GENERIC: # BB#0: ; GENERIC-NEXT: vmovaps {{.*#+}} zmm0 = [15,15,3,9,5,15,14,9,11,10,5,14,14,5,11,0] sched: [4:0.50] -; GENERIC-NEXT: vpermps (%rdi), %zmm0, %zmm0 +; GENERIC-NEXT: vpermps (%rdi), %zmm0, %zmm0 # sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_16xfloat_perm_mem_mask3: @@ -3691,7 +3691,7 @@ define <16 x float> @test_masked_16xfloat_perm_mem_mask3(<16 x float>* %vp, <16 ; GENERIC-NEXT: vmovaps {{.*#+}} zmm2 = [15,15,3,9,5,15,14,9,11,10,5,14,14,5,11,0] sched: [4:0.50] ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqd %zmm3, %zmm1, %k1 -; GENERIC-NEXT: vpermps (%rdi), %zmm2, %zmm0 {%k1} +; GENERIC-NEXT: vpermps (%rdi), %zmm2, %zmm0 {%k1} # sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_16xfloat_perm_mem_mask3: @@ -3714,7 +3714,7 @@ define <16 x float> @test_masked_z_16xfloat_perm_mem_mask3(<16 x float>* %vp, <1 ; GENERIC-NEXT: vmovaps {{.*#+}} zmm1 = [15,15,3,9,5,15,14,9,11,10,5,14,14,5,11,0] sched: [4:0.50] ; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqd %zmm2, %zmm0, %k1 -; GENERIC-NEXT: vpermps (%rdi), %zmm1, %zmm0 {%k1} {z} +; GENERIC-NEXT: vpermps (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_16xfloat_perm_mem_mask3: @@ -3749,7 +3749,7 @@ define <4 x double> @test_masked_4xdouble_perm_mask0(<4 x double> %vec, <4 x dou ; GENERIC: # BB#0: ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 -; GENERIC-NEXT: vpermpd {{.*#+}} ymm1 {%k1} = ymm0[2,1,3,2] +; GENERIC-NEXT: vpermpd {{.*#+}} ymm1 {%k1} = ymm0[2,1,3,2] sched: [1:1.00] ; GENERIC-NEXT: vmovapd %ymm1, %ymm0 # sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -3771,7 +3771,7 @@ define <4 x double> @test_masked_z_4xdouble_perm_mask0(<4 x double> %vec, <4 x i ; GENERIC: # BB#0: ; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 -; GENERIC-NEXT: vpermpd {{.*#+}} ymm0 {%k1} {z} = ymm0[2,1,3,2] +; GENERIC-NEXT: vpermpd {{.*#+}} ymm0 {%k1} {z} = ymm0[2,1,3,2] sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_4xdouble_perm_mask0: @@ -3790,7 +3790,7 @@ define <4 x double> @test_masked_4xdouble_perm_mask1(<4 x double> %vec, <4 x dou ; GENERIC: # BB#0: ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 -; GENERIC-NEXT: vpermpd {{.*#+}} ymm1 {%k1} = ymm0[3,0,0,0] +; GENERIC-NEXT: vpermpd {{.*#+}} ymm1 {%k1} = ymm0[3,0,0,0] sched: [1:1.00] ; GENERIC-NEXT: vmovapd %ymm1, %ymm0 # sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -3812,7 +3812,7 @@ define <4 x double> @test_masked_z_4xdouble_perm_mask1(<4 x double> %vec, <4 x i ; GENERIC: # BB#0: ; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 -; GENERIC-NEXT: vpermpd {{.*#+}} ymm0 {%k1} {z} = ymm0[3,0,0,0] +; GENERIC-NEXT: vpermpd {{.*#+}} ymm0 {%k1} {z} = ymm0[3,0,0,0] sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_4xdouble_perm_mask1: @@ -3831,7 +3831,7 @@ define <4 x double> @test_masked_4xdouble_perm_mask2(<4 x double> %vec, <4 x dou ; GENERIC: # BB#0: ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 -; GENERIC-NEXT: vpermpd {{.*#+}} ymm1 {%k1} = ymm0[0,3,3,1] +; GENERIC-NEXT: vpermpd {{.*#+}} ymm1 {%k1} = ymm0[0,3,3,1] sched: [1:1.00] ; GENERIC-NEXT: vmovapd %ymm1, %ymm0 # sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -3853,7 +3853,7 @@ define <4 x double> @test_masked_z_4xdouble_perm_mask2(<4 x double> %vec, <4 x i ; GENERIC: # BB#0: ; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 -; GENERIC-NEXT: vpermpd {{.*#+}} ymm0 {%k1} {z} = ymm0[0,3,3,1] +; GENERIC-NEXT: vpermpd {{.*#+}} ymm0 {%k1} {z} = ymm0[0,3,3,1] sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_4xdouble_perm_mask2: @@ -3885,7 +3885,7 @@ define <4 x double> @test_masked_4xdouble_perm_mask3(<4 x double> %vec, <4 x dou ; GENERIC: # BB#0: ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 -; GENERIC-NEXT: vpermpd {{.*#+}} ymm1 {%k1} = ymm0[3,3,3,2] +; GENERIC-NEXT: vpermpd {{.*#+}} ymm1 {%k1} = ymm0[3,3,3,2] sched: [1:1.00] ; GENERIC-NEXT: vmovapd %ymm1, %ymm0 # sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -3907,7 +3907,7 @@ define <4 x double> @test_masked_z_4xdouble_perm_mask3(<4 x double> %vec, <4 x i ; GENERIC: # BB#0: ; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 -; GENERIC-NEXT: vpermpd {{.*#+}} ymm0 {%k1} {z} = ymm0[3,3,3,2] +; GENERIC-NEXT: vpermpd {{.*#+}} ymm0 {%k1} {z} = ymm0[3,3,3,2] sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_4xdouble_perm_mask3: @@ -3940,7 +3940,7 @@ define <4 x double> @test_masked_4xdouble_perm_mem_mask0(<4 x double>* %vp, <4 x ; GENERIC: # BB#0: ; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 -; GENERIC-NEXT: vpermpd {{.*#+}} ymm0 {%k1} = mem[0,0,2,0] +; GENERIC-NEXT: vpermpd {{.*#+}} ymm0 {%k1} = mem[0,0,2,0] sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_4xdouble_perm_mem_mask0: @@ -3961,7 +3961,7 @@ define <4 x double> @test_masked_z_4xdouble_perm_mem_mask0(<4 x double>* %vp, <4 ; GENERIC: # BB#0: ; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqq %ymm1, %ymm0, %k1 -; GENERIC-NEXT: vpermpd {{.*#+}} ymm0 {%k1} {z} = mem[0,0,2,0] +; GENERIC-NEXT: vpermpd {{.*#+}} ymm0 {%k1} {z} = mem[0,0,2,0] sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_4xdouble_perm_mem_mask0: @@ -3982,7 +3982,7 @@ define <4 x double> @test_masked_4xdouble_perm_mem_mask1(<4 x double>* %vp, <4 x ; GENERIC: # BB#0: ; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 -; GENERIC-NEXT: vpermpd {{.*#+}} ymm0 {%k1} = mem[0,2,3,2] +; GENERIC-NEXT: vpermpd {{.*#+}} ymm0 {%k1} = mem[0,2,3,2] sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_4xdouble_perm_mem_mask1: @@ -4003,7 +4003,7 @@ define <4 x double> @test_masked_z_4xdouble_perm_mem_mask1(<4 x double>* %vp, <4 ; GENERIC: # BB#0: ; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqq %ymm1, %ymm0, %k1 -; GENERIC-NEXT: vpermpd {{.*#+}} ymm0 {%k1} {z} = mem[0,2,3,2] +; GENERIC-NEXT: vpermpd {{.*#+}} ymm0 {%k1} {z} = mem[0,2,3,2] sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_4xdouble_perm_mem_mask1: @@ -4024,7 +4024,7 @@ define <4 x double> @test_masked_4xdouble_perm_mem_mask2(<4 x double>* %vp, <4 x ; GENERIC: # BB#0: ; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 -; GENERIC-NEXT: vpermpd {{.*#+}} ymm0 {%k1} = mem[3,1,1,1] +; GENERIC-NEXT: vpermpd {{.*#+}} ymm0 {%k1} = mem[3,1,1,1] sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_4xdouble_perm_mem_mask2: @@ -4045,7 +4045,7 @@ define <4 x double> @test_masked_z_4xdouble_perm_mem_mask2(<4 x double>* %vp, <4 ; GENERIC: # BB#0: ; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqq %ymm1, %ymm0, %k1 -; GENERIC-NEXT: vpermpd {{.*#+}} ymm0 {%k1} {z} = mem[3,1,1,1] +; GENERIC-NEXT: vpermpd {{.*#+}} ymm0 {%k1} {z} = mem[3,1,1,1] sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_4xdouble_perm_mem_mask2: @@ -4080,7 +4080,7 @@ define <4 x double> @test_masked_4xdouble_perm_mem_mask3(<4 x double>* %vp, <4 x ; GENERIC: # BB#0: ; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 -; GENERIC-NEXT: vpermpd {{.*#+}} ymm0 {%k1} = mem[3,2,3,2] +; GENERIC-NEXT: vpermpd {{.*#+}} ymm0 {%k1} = mem[3,2,3,2] sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_4xdouble_perm_mem_mask3: @@ -4101,7 +4101,7 @@ define <4 x double> @test_masked_z_4xdouble_perm_mem_mask3(<4 x double>* %vp, <4 ; GENERIC: # BB#0: ; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqq %ymm1, %ymm0, %k1 -; GENERIC-NEXT: vpermpd {{.*#+}} ymm0 {%k1} {z} = mem[3,2,3,2] +; GENERIC-NEXT: vpermpd {{.*#+}} ymm0 {%k1} {z} = mem[3,2,3,2] sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_4xdouble_perm_mem_mask3: @@ -4121,7 +4121,7 @@ define <8 x double> @test_8xdouble_perm_mask0(<8 x double> %vec) { ; GENERIC-LABEL: test_8xdouble_perm_mask0: ; GENERIC: # BB#0: ; GENERIC-NEXT: vmovaps {{.*#+}} zmm1 = [5,7,4,2,7,4,3,4] sched: [4:0.50] -; GENERIC-NEXT: vpermpd %zmm0, %zmm1, %zmm0 +; GENERIC-NEXT: vpermpd %zmm0, %zmm1, %zmm0 # sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_8xdouble_perm_mask0: @@ -4138,7 +4138,7 @@ define <8 x double> @test_masked_8xdouble_perm_mask0(<8 x double> %vec, <8 x dou ; GENERIC-NEXT: vmovapd {{.*#+}} zmm3 = [5,7,4,2,7,4,3,4] sched: [4:0.50] ; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqq %zmm4, %zmm2, %k1 -; GENERIC-NEXT: vpermpd %zmm0, %zmm3, %zmm1 {%k1} +; GENERIC-NEXT: vpermpd %zmm0, %zmm3, %zmm1 {%k1} # sched: [1:1.00] ; GENERIC-NEXT: vmovapd %zmm1, %zmm0 ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -4162,7 +4162,7 @@ define <8 x double> @test_masked_z_8xdouble_perm_mask0(<8 x double> %vec, <8 x i ; GENERIC-NEXT: vmovapd {{.*#+}} zmm2 = [5,7,4,2,7,4,3,4] sched: [4:0.50] ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqq %zmm3, %zmm1, %k1 -; GENERIC-NEXT: vpermpd %zmm0, %zmm2, %zmm0 {%k1} {z} +; GENERIC-NEXT: vpermpd %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_8xdouble_perm_mask0: @@ -4182,7 +4182,7 @@ define <8 x double> @test_masked_8xdouble_perm_imm_mask1(<8 x double> %vec, <8 x ; GENERIC: # BB#0: ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 -; GENERIC-NEXT: vpermpd {{.*#+}} zmm1 {%k1} = zmm0[3,0,0,2,7,4,4,6] +; GENERIC-NEXT: vpermpd {{.*#+}} zmm1 {%k1} = zmm0[3,0,0,2,7,4,4,6] sched: [1:1.00] ; GENERIC-NEXT: vmovapd %zmm1, %zmm0 ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -4204,7 +4204,7 @@ define <8 x double> @test_masked_z_8xdouble_perm_imm_mask1(<8 x double> %vec, <8 ; GENERIC: # BB#0: ; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqq %zmm2, %zmm1, %k1 -; GENERIC-NEXT: vpermpd {{.*#+}} zmm0 {%k1} {z} = zmm0[3,0,0,2,7,4,4,6] +; GENERIC-NEXT: vpermpd {{.*#+}} zmm0 {%k1} {z} = zmm0[3,0,0,2,7,4,4,6] sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_8xdouble_perm_imm_mask1: @@ -4224,7 +4224,7 @@ define <8 x double> @test_masked_8xdouble_perm_mask2(<8 x double> %vec, <8 x dou ; GENERIC-NEXT: vmovapd {{.*#+}} zmm3 = [7,5,5,5,3,5,1,7] sched: [4:0.50] ; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqq %zmm4, %zmm2, %k1 -; GENERIC-NEXT: vpermpd %zmm0, %zmm3, %zmm1 {%k1} +; GENERIC-NEXT: vpermpd %zmm0, %zmm3, %zmm1 {%k1} # sched: [1:1.00] ; GENERIC-NEXT: vmovapd %zmm1, %zmm0 ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -4248,7 +4248,7 @@ define <8 x double> @test_masked_z_8xdouble_perm_mask2(<8 x double> %vec, <8 x i ; GENERIC-NEXT: vmovapd {{.*#+}} zmm2 = [7,5,5,5,3,5,1,7] sched: [4:0.50] ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqq %zmm3, %zmm1, %k1 -; GENERIC-NEXT: vpermpd %zmm0, %zmm2, %zmm0 {%k1} {z} +; GENERIC-NEXT: vpermpd %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_8xdouble_perm_mask2: @@ -4266,7 +4266,7 @@ define <8 x double> @test_masked_z_8xdouble_perm_mask2(<8 x double> %vec, <8 x i define <8 x double> @test_8xdouble_perm_imm_mask3(<8 x double> %vec) { ; GENERIC-LABEL: test_8xdouble_perm_imm_mask3: ; GENERIC: # BB#0: -; GENERIC-NEXT: vpermpd {{.*#+}} zmm0 = zmm0[1,3,3,0,5,7,7,4] +; GENERIC-NEXT: vpermpd {{.*#+}} zmm0 = zmm0[1,3,3,0,5,7,7,4] sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_8xdouble_perm_imm_mask3: @@ -4281,7 +4281,7 @@ define <8 x double> @test_masked_8xdouble_perm_imm_mask3(<8 x double> %vec, <8 x ; GENERIC: # BB#0: ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 -; GENERIC-NEXT: vpermpd {{.*#+}} zmm1 {%k1} = zmm0[1,3,3,0,5,7,7,4] +; GENERIC-NEXT: vpermpd {{.*#+}} zmm1 {%k1} = zmm0[1,3,3,0,5,7,7,4] sched: [1:1.00] ; GENERIC-NEXT: vmovapd %zmm1, %zmm0 ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -4303,7 +4303,7 @@ define <8 x double> @test_masked_z_8xdouble_perm_imm_mask3(<8 x double> %vec, <8 ; GENERIC: # BB#0: ; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqq %zmm2, %zmm1, %k1 -; GENERIC-NEXT: vpermpd {{.*#+}} zmm0 {%k1} {z} = zmm0[1,3,3,0,5,7,7,4] +; GENERIC-NEXT: vpermpd {{.*#+}} zmm0 {%k1} {z} = zmm0[1,3,3,0,5,7,7,4] sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_8xdouble_perm_imm_mask3: @@ -4323,7 +4323,7 @@ define <8 x double> @test_masked_8xdouble_perm_mask4(<8 x double> %vec, <8 x dou ; GENERIC-NEXT: vmovapd {{.*#+}} zmm3 = [3,5,3,4,6,5,7,1] sched: [4:0.50] ; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqq %zmm4, %zmm2, %k1 -; GENERIC-NEXT: vpermpd %zmm0, %zmm3, %zmm1 {%k1} +; GENERIC-NEXT: vpermpd %zmm0, %zmm3, %zmm1 {%k1} # sched: [1:1.00] ; GENERIC-NEXT: vmovapd %zmm1, %zmm0 ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -4347,7 +4347,7 @@ define <8 x double> @test_masked_z_8xdouble_perm_mask4(<8 x double> %vec, <8 x i ; GENERIC-NEXT: vmovapd {{.*#+}} zmm2 = [3,5,3,4,6,5,7,1] sched: [4:0.50] ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqq %zmm3, %zmm1, %k1 -; GENERIC-NEXT: vpermpd %zmm0, %zmm2, %zmm0 {%k1} {z} +; GENERIC-NEXT: vpermpd %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_8xdouble_perm_mask4: @@ -4367,7 +4367,7 @@ define <8 x double> @test_masked_8xdouble_perm_imm_mask5(<8 x double> %vec, <8 x ; GENERIC: # BB#0: ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 -; GENERIC-NEXT: vpermpd {{.*#+}} zmm1 {%k1} = zmm0[3,3,2,3,7,7,6,7] +; GENERIC-NEXT: vpermpd {{.*#+}} zmm1 {%k1} = zmm0[3,3,2,3,7,7,6,7] sched: [1:1.00] ; GENERIC-NEXT: vmovapd %zmm1, %zmm0 ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -4389,7 +4389,7 @@ define <8 x double> @test_masked_z_8xdouble_perm_imm_mask5(<8 x double> %vec, <8 ; GENERIC: # BB#0: ; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqq %zmm2, %zmm1, %k1 -; GENERIC-NEXT: vpermpd {{.*#+}} zmm0 {%k1} {z} = zmm0[3,3,2,3,7,7,6,7] +; GENERIC-NEXT: vpermpd {{.*#+}} zmm0 {%k1} {z} = zmm0[3,3,2,3,7,7,6,7] sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_8xdouble_perm_imm_mask5: @@ -4407,7 +4407,7 @@ define <8 x double> @test_8xdouble_perm_mask6(<8 x double> %vec) { ; GENERIC-LABEL: test_8xdouble_perm_mask6: ; GENERIC: # BB#0: ; GENERIC-NEXT: vmovaps {{.*#+}} zmm1 = [2,7,6,4,0,0,0,2] sched: [4:0.50] -; GENERIC-NEXT: vpermpd %zmm0, %zmm1, %zmm0 +; GENERIC-NEXT: vpermpd %zmm0, %zmm1, %zmm0 # sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_8xdouble_perm_mask6: @@ -4424,7 +4424,7 @@ define <8 x double> @test_masked_8xdouble_perm_mask6(<8 x double> %vec, <8 x dou ; GENERIC-NEXT: vmovapd {{.*#+}} zmm3 = [2,7,6,4,0,0,0,2] sched: [4:0.50] ; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqq %zmm4, %zmm2, %k1 -; GENERIC-NEXT: vpermpd %zmm0, %zmm3, %zmm1 {%k1} +; GENERIC-NEXT: vpermpd %zmm0, %zmm3, %zmm1 {%k1} # sched: [1:1.00] ; GENERIC-NEXT: vmovapd %zmm1, %zmm0 ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -4448,7 +4448,7 @@ define <8 x double> @test_masked_z_8xdouble_perm_mask6(<8 x double> %vec, <8 x i ; GENERIC-NEXT: vmovapd {{.*#+}} zmm2 = [2,7,6,4,0,0,0,2] sched: [4:0.50] ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqq %zmm3, %zmm1, %k1 -; GENERIC-NEXT: vpermpd %zmm0, %zmm2, %zmm0 {%k1} {z} +; GENERIC-NEXT: vpermpd %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_8xdouble_perm_mask6: @@ -4468,7 +4468,7 @@ define <8 x double> @test_masked_8xdouble_perm_imm_mask7(<8 x double> %vec, <8 x ; GENERIC: # BB#0: ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 -; GENERIC-NEXT: vpermpd {{.*#+}} zmm1 {%k1} = zmm0[3,1,3,2,7,5,7,6] +; GENERIC-NEXT: vpermpd {{.*#+}} zmm1 {%k1} = zmm0[3,1,3,2,7,5,7,6] sched: [1:1.00] ; GENERIC-NEXT: vmovapd %zmm1, %zmm0 ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -4490,7 +4490,7 @@ define <8 x double> @test_masked_z_8xdouble_perm_imm_mask7(<8 x double> %vec, <8 ; GENERIC: # BB#0: ; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqq %zmm2, %zmm1, %k1 -; GENERIC-NEXT: vpermpd {{.*#+}} zmm0 {%k1} {z} = zmm0[3,1,3,2,7,5,7,6] +; GENERIC-NEXT: vpermpd {{.*#+}} zmm0 {%k1} {z} = zmm0[3,1,3,2,7,5,7,6] sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_8xdouble_perm_imm_mask7: @@ -4508,7 +4508,7 @@ define <8 x double> @test_8xdouble_perm_mem_mask0(<8 x double>* %vp) { ; GENERIC-LABEL: test_8xdouble_perm_mem_mask0: ; GENERIC: # BB#0: ; GENERIC-NEXT: vmovaps {{.*#+}} zmm0 = [0,3,4,0,4,2,0,1] sched: [4:0.50] -; GENERIC-NEXT: vpermpd (%rdi), %zmm0, %zmm0 +; GENERIC-NEXT: vpermpd (%rdi), %zmm0, %zmm0 # sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_8xdouble_perm_mem_mask0: @@ -4526,7 +4526,7 @@ define <8 x double> @test_masked_8xdouble_perm_mem_mask0(<8 x double>* %vp, <8 x ; GENERIC-NEXT: vmovapd {{.*#+}} zmm2 = [0,3,4,0,4,2,0,1] sched: [4:0.50] ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqq %zmm3, %zmm1, %k1 -; GENERIC-NEXT: vpermpd (%rdi), %zmm2, %zmm0 {%k1} +; GENERIC-NEXT: vpermpd (%rdi), %zmm2, %zmm0 {%k1} # sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_8xdouble_perm_mem_mask0: @@ -4549,7 +4549,7 @@ define <8 x double> @test_masked_z_8xdouble_perm_mem_mask0(<8 x double>* %vp, <8 ; GENERIC-NEXT: vmovapd {{.*#+}} zmm1 = [0,3,4,0,4,2,0,1] sched: [4:0.50] ; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqq %zmm2, %zmm0, %k1 -; GENERIC-NEXT: vpermpd (%rdi), %zmm1, %zmm0 {%k1} {z} +; GENERIC-NEXT: vpermpd (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_8xdouble_perm_mem_mask0: @@ -4571,7 +4571,7 @@ define <8 x double> @test_masked_8xdouble_perm_imm_mem_mask1(<8 x double>* %vp, ; GENERIC: # BB#0: ; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqq %zmm2, %zmm1, %k1 -; GENERIC-NEXT: vpermpd {{.*#+}} zmm0 {%k1} = mem[0,2,0,3,4,6,4,7] +; GENERIC-NEXT: vpermpd {{.*#+}} zmm0 {%k1} = mem[0,2,0,3,4,6,4,7] sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_8xdouble_perm_imm_mem_mask1: @@ -4592,7 +4592,7 @@ define <8 x double> @test_masked_z_8xdouble_perm_imm_mem_mask1(<8 x double>* %vp ; GENERIC: # BB#0: ; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqq %zmm1, %zmm0, %k1 -; GENERIC-NEXT: vpermpd {{.*#+}} zmm0 {%k1} {z} = mem[0,2,0,3,4,6,4,7] +; GENERIC-NEXT: vpermpd {{.*#+}} zmm0 {%k1} {z} = mem[0,2,0,3,4,6,4,7] sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_8xdouble_perm_imm_mem_mask1: @@ -4614,7 +4614,7 @@ define <8 x double> @test_masked_8xdouble_perm_mem_mask2(<8 x double>* %vp, <8 x ; GENERIC-NEXT: vmovapd {{.*#+}} zmm2 = [6,7,2,7,7,6,2,5] sched: [4:0.50] ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqq %zmm3, %zmm1, %k1 -; GENERIC-NEXT: vpermpd (%rdi), %zmm2, %zmm0 {%k1} +; GENERIC-NEXT: vpermpd (%rdi), %zmm2, %zmm0 {%k1} # sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_8xdouble_perm_mem_mask2: @@ -4637,7 +4637,7 @@ define <8 x double> @test_masked_z_8xdouble_perm_mem_mask2(<8 x double>* %vp, <8 ; GENERIC-NEXT: vmovapd {{.*#+}} zmm1 = [6,7,2,7,7,6,2,5] sched: [4:0.50] ; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqq %zmm2, %zmm0, %k1 -; GENERIC-NEXT: vpermpd (%rdi), %zmm1, %zmm0 {%k1} {z} +; GENERIC-NEXT: vpermpd (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_8xdouble_perm_mem_mask2: @@ -4657,7 +4657,7 @@ define <8 x double> @test_masked_z_8xdouble_perm_mem_mask2(<8 x double>* %vp, <8 define <8 x double> @test_8xdouble_perm_imm_mem_mask3(<8 x double>* %vp) { ; GENERIC-LABEL: test_8xdouble_perm_imm_mem_mask3: ; GENERIC: # BB#0: -; GENERIC-NEXT: vpermpd {{.*#+}} zmm0 = mem[2,1,1,0,6,5,5,4] +; GENERIC-NEXT: vpermpd {{.*#+}} zmm0 = mem[2,1,1,0,6,5,5,4] sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_8xdouble_perm_imm_mem_mask3: @@ -4673,7 +4673,7 @@ define <8 x double> @test_masked_8xdouble_perm_imm_mem_mask3(<8 x double>* %vp, ; GENERIC: # BB#0: ; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqq %zmm2, %zmm1, %k1 -; GENERIC-NEXT: vpermpd {{.*#+}} zmm0 {%k1} = mem[2,1,1,0,6,5,5,4] +; GENERIC-NEXT: vpermpd {{.*#+}} zmm0 {%k1} = mem[2,1,1,0,6,5,5,4] sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_8xdouble_perm_imm_mem_mask3: @@ -4694,7 +4694,7 @@ define <8 x double> @test_masked_z_8xdouble_perm_imm_mem_mask3(<8 x double>* %vp ; GENERIC: # BB#0: ; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqq %zmm1, %zmm0, %k1 -; GENERIC-NEXT: vpermpd {{.*#+}} zmm0 {%k1} {z} = mem[2,1,1,0,6,5,5,4] +; GENERIC-NEXT: vpermpd {{.*#+}} zmm0 {%k1} {z} = mem[2,1,1,0,6,5,5,4] sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_8xdouble_perm_imm_mem_mask3: @@ -4716,7 +4716,7 @@ define <8 x double> @test_masked_8xdouble_perm_mem_mask4(<8 x double>* %vp, <8 x ; GENERIC-NEXT: vmovapd {{.*#+}} zmm2 = [1,1,3,5,6,0,6,0] sched: [4:0.50] ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqq %zmm3, %zmm1, %k1 -; GENERIC-NEXT: vpermpd (%rdi), %zmm2, %zmm0 {%k1} +; GENERIC-NEXT: vpermpd (%rdi), %zmm2, %zmm0 {%k1} # sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_8xdouble_perm_mem_mask4: @@ -4739,7 +4739,7 @@ define <8 x double> @test_masked_z_8xdouble_perm_mem_mask4(<8 x double>* %vp, <8 ; GENERIC-NEXT: vmovapd {{.*#+}} zmm1 = [1,1,3,5,6,0,6,0] sched: [4:0.50] ; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqq %zmm2, %zmm0, %k1 -; GENERIC-NEXT: vpermpd (%rdi), %zmm1, %zmm0 {%k1} {z} +; GENERIC-NEXT: vpermpd (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_8xdouble_perm_mem_mask4: @@ -4761,7 +4761,7 @@ define <8 x double> @test_masked_8xdouble_perm_imm_mem_mask5(<8 x double>* %vp, ; GENERIC: # BB#0: ; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqq %zmm2, %zmm1, %k1 -; GENERIC-NEXT: vpermpd {{.*#+}} zmm0 {%k1} = mem[2,2,2,3,6,6,6,7] +; GENERIC-NEXT: vpermpd {{.*#+}} zmm0 {%k1} = mem[2,2,2,3,6,6,6,7] sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_8xdouble_perm_imm_mem_mask5: @@ -4782,7 +4782,7 @@ define <8 x double> @test_masked_z_8xdouble_perm_imm_mem_mask5(<8 x double>* %vp ; GENERIC: # BB#0: ; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqq %zmm1, %zmm0, %k1 -; GENERIC-NEXT: vpermpd {{.*#+}} zmm0 {%k1} {z} = mem[2,2,2,3,6,6,6,7] +; GENERIC-NEXT: vpermpd {{.*#+}} zmm0 {%k1} {z} = mem[2,2,2,3,6,6,6,7] sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_8xdouble_perm_imm_mem_mask5: @@ -4802,7 +4802,7 @@ define <8 x double> @test_8xdouble_perm_mem_mask6(<8 x double>* %vp) { ; GENERIC-LABEL: test_8xdouble_perm_mem_mask6: ; GENERIC: # BB#0: ; GENERIC-NEXT: vmovaps {{.*#+}} zmm0 = [2,4,0,4,6,1,2,5] sched: [4:0.50] -; GENERIC-NEXT: vpermpd (%rdi), %zmm0, %zmm0 +; GENERIC-NEXT: vpermpd (%rdi), %zmm0, %zmm0 # sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_8xdouble_perm_mem_mask6: @@ -4820,7 +4820,7 @@ define <8 x double> @test_masked_8xdouble_perm_mem_mask6(<8 x double>* %vp, <8 x ; GENERIC-NEXT: vmovapd {{.*#+}} zmm2 = [2,4,0,4,6,1,2,5] sched: [4:0.50] ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqq %zmm3, %zmm1, %k1 -; GENERIC-NEXT: vpermpd (%rdi), %zmm2, %zmm0 {%k1} +; GENERIC-NEXT: vpermpd (%rdi), %zmm2, %zmm0 {%k1} # sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_8xdouble_perm_mem_mask6: @@ -4843,7 +4843,7 @@ define <8 x double> @test_masked_z_8xdouble_perm_mem_mask6(<8 x double>* %vp, <8 ; GENERIC-NEXT: vmovapd {{.*#+}} zmm1 = [2,4,0,4,6,1,2,5] sched: [4:0.50] ; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqq %zmm2, %zmm0, %k1 -; GENERIC-NEXT: vpermpd (%rdi), %zmm1, %zmm0 {%k1} {z} +; GENERIC-NEXT: vpermpd (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_8xdouble_perm_mem_mask6: @@ -4865,7 +4865,7 @@ define <8 x double> @test_masked_8xdouble_perm_imm_mem_mask7(<8 x double>* %vp, ; GENERIC: # BB#0: ; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqq %zmm2, %zmm1, %k1 -; GENERIC-NEXT: vpermpd {{.*#+}} zmm0 {%k1} = mem[0,3,2,0,4,7,6,4] +; GENERIC-NEXT: vpermpd {{.*#+}} zmm0 {%k1} = mem[0,3,2,0,4,7,6,4] sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_8xdouble_perm_imm_mem_mask7: @@ -4886,7 +4886,7 @@ define <8 x double> @test_masked_z_8xdouble_perm_imm_mem_mask7(<8 x double>* %vp ; GENERIC: # BB#0: ; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqq %zmm1, %zmm0, %k1 -; GENERIC-NEXT: vpermpd {{.*#+}} zmm0 {%k1} {z} = mem[0,3,2,0,4,7,6,4] +; GENERIC-NEXT: vpermpd {{.*#+}} zmm0 {%k1} {z} = mem[0,3,2,0,4,7,6,4] sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_8xdouble_perm_imm_mem_mask7: @@ -4920,7 +4920,7 @@ define <16 x i8> @test_masked_16xi8_perm_mask0(<16 x i8> %vec, <16 x i8> %vec2, ; GENERIC: # BB#0: ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqb %xmm3, %xmm2, %k1 -; GENERIC-NEXT: vpshufb {{.*#+}} xmm1 {%k1} = xmm0[8,6,12,4,7,9,14,8,4,12,9,4,14,15,12,14] +; GENERIC-NEXT: vpshufb {{.*#+}} xmm1 {%k1} = xmm0[8,6,12,4,7,9,14,8,4,12,9,4,14,15,12,14] sched: [5:1.00] ; GENERIC-NEXT: vmovdqa %xmm1, %xmm0 # sched: [1:0.50] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -4942,7 +4942,7 @@ define <16 x i8> @test_masked_z_16xi8_perm_mask0(<16 x i8> %vec, <16 x i8> %mask ; GENERIC: # BB#0: ; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqb %xmm2, %xmm1, %k1 -; GENERIC-NEXT: vpshufb {{.*#+}} xmm0 {%k1} {z} = xmm0[8,6,12,4,7,9,14,8,4,12,9,4,14,15,12,14] +; GENERIC-NEXT: vpshufb {{.*#+}} xmm0 {%k1} {z} = xmm0[8,6,12,4,7,9,14,8,4,12,9,4,14,15,12,14] sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_16xi8_perm_mask0: @@ -4961,7 +4961,7 @@ define <16 x i8> @test_masked_16xi8_perm_mask1(<16 x i8> %vec, <16 x i8> %vec2, ; GENERIC: # BB#0: ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqb %xmm3, %xmm2, %k1 -; GENERIC-NEXT: vpshufb {{.*#+}} xmm1 {%k1} = xmm0[4,11,14,10,7,1,6,9,14,15,7,13,4,12,8,0] +; GENERIC-NEXT: vpshufb {{.*#+}} xmm1 {%k1} = xmm0[4,11,14,10,7,1,6,9,14,15,7,13,4,12,8,0] sched: [5:1.00] ; GENERIC-NEXT: vmovdqa %xmm1, %xmm0 # sched: [1:0.50] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -4983,7 +4983,7 @@ define <16 x i8> @test_masked_z_16xi8_perm_mask1(<16 x i8> %vec, <16 x i8> %mask ; GENERIC: # BB#0: ; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqb %xmm2, %xmm1, %k1 -; GENERIC-NEXT: vpshufb {{.*#+}} xmm0 {%k1} {z} = xmm0[4,11,14,10,7,1,6,9,14,15,7,13,4,12,8,0] +; GENERIC-NEXT: vpshufb {{.*#+}} xmm0 {%k1} {z} = xmm0[4,11,14,10,7,1,6,9,14,15,7,13,4,12,8,0] sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_16xi8_perm_mask1: @@ -5002,7 +5002,7 @@ define <16 x i8> @test_masked_16xi8_perm_mask2(<16 x i8> %vec, <16 x i8> %vec2, ; GENERIC: # BB#0: ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqb %xmm3, %xmm2, %k1 -; GENERIC-NEXT: vpshufb {{.*#+}} xmm1 {%k1} = xmm0[11,6,13,10,0,7,13,3,5,13,3,9,3,15,12,7] +; GENERIC-NEXT: vpshufb {{.*#+}} xmm1 {%k1} = xmm0[11,6,13,10,0,7,13,3,5,13,3,9,3,15,12,7] sched: [5:1.00] ; GENERIC-NEXT: vmovdqa %xmm1, %xmm0 # sched: [1:0.50] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -5024,7 +5024,7 @@ define <16 x i8> @test_masked_z_16xi8_perm_mask2(<16 x i8> %vec, <16 x i8> %mask ; GENERIC: # BB#0: ; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqb %xmm2, %xmm1, %k1 -; GENERIC-NEXT: vpshufb {{.*#+}} xmm0 {%k1} {z} = xmm0[11,6,13,10,0,7,13,3,5,13,3,9,3,15,12,7] +; GENERIC-NEXT: vpshufb {{.*#+}} xmm0 {%k1} {z} = xmm0[11,6,13,10,0,7,13,3,5,13,3,9,3,15,12,7] sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_16xi8_perm_mask2: @@ -5056,7 +5056,7 @@ define <16 x i8> @test_masked_16xi8_perm_mask3(<16 x i8> %vec, <16 x i8> %vec2, ; GENERIC: # BB#0: ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqb %xmm3, %xmm2, %k1 -; GENERIC-NEXT: vpshufb {{.*#+}} xmm1 {%k1} = xmm0[1,5,8,14,1,8,11,8,13,8,15,9,9,7,9,6] +; GENERIC-NEXT: vpshufb {{.*#+}} xmm1 {%k1} = xmm0[1,5,8,14,1,8,11,8,13,8,15,9,9,7,9,6] sched: [5:1.00] ; GENERIC-NEXT: vmovdqa %xmm1, %xmm0 # sched: [1:0.50] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -5078,7 +5078,7 @@ define <16 x i8> @test_masked_z_16xi8_perm_mask3(<16 x i8> %vec, <16 x i8> %mask ; GENERIC: # BB#0: ; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqb %xmm2, %xmm1, %k1 -; GENERIC-NEXT: vpshufb {{.*#+}} xmm0 {%k1} {z} = xmm0[1,5,8,14,1,8,11,8,13,8,15,9,9,7,9,6] +; GENERIC-NEXT: vpshufb {{.*#+}} xmm0 {%k1} {z} = xmm0[1,5,8,14,1,8,11,8,13,8,15,9,9,7,9,6] sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_16xi8_perm_mask3: @@ -5114,7 +5114,7 @@ define <16 x i8> @test_masked_16xi8_perm_mem_mask0(<16 x i8>* %vp, <16 x i8> %ve ; GENERIC-NEXT: vmovdqa (%rdi), %xmm2 # sched: [6:0.50] ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqb %xmm3, %xmm1, %k1 -; GENERIC-NEXT: vpshufb {{.*#+}} xmm0 {%k1} = xmm2[9,10,7,1,12,14,14,13,14,14,8,6,11,4,12,13] +; GENERIC-NEXT: vpshufb {{.*#+}} xmm0 {%k1} = xmm2[9,10,7,1,12,14,14,13,14,14,8,6,11,4,12,13] sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_16xi8_perm_mem_mask0: @@ -5137,7 +5137,7 @@ define <16 x i8> @test_masked_z_16xi8_perm_mem_mask0(<16 x i8>* %vp, <16 x i8> % ; GENERIC-NEXT: vmovdqa (%rdi), %xmm1 # sched: [6:0.50] ; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqb %xmm2, %xmm0, %k1 -; GENERIC-NEXT: vpshufb {{.*#+}} xmm0 {%k1} {z} = xmm1[9,10,7,1,12,14,14,13,14,14,8,6,11,4,12,13] +; GENERIC-NEXT: vpshufb {{.*#+}} xmm0 {%k1} {z} = xmm1[9,10,7,1,12,14,14,13,14,14,8,6,11,4,12,13] sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_16xi8_perm_mem_mask0: @@ -5160,7 +5160,7 @@ define <16 x i8> @test_masked_16xi8_perm_mem_mask1(<16 x i8>* %vp, <16 x i8> %ve ; GENERIC-NEXT: vmovdqa (%rdi), %xmm2 # sched: [6:0.50] ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqb %xmm3, %xmm1, %k1 -; GENERIC-NEXT: vpshufb {{.*#+}} xmm0 {%k1} = xmm2[14,9,15,9,7,10,15,14,12,1,9,7,10,13,3,11] +; GENERIC-NEXT: vpshufb {{.*#+}} xmm0 {%k1} = xmm2[14,9,15,9,7,10,15,14,12,1,9,7,10,13,3,11] sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_16xi8_perm_mem_mask1: @@ -5183,7 +5183,7 @@ define <16 x i8> @test_masked_z_16xi8_perm_mem_mask1(<16 x i8>* %vp, <16 x i8> % ; GENERIC-NEXT: vmovdqa (%rdi), %xmm1 # sched: [6:0.50] ; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqb %xmm2, %xmm0, %k1 -; GENERIC-NEXT: vpshufb {{.*#+}} xmm0 {%k1} {z} = xmm1[14,9,15,9,7,10,15,14,12,1,9,7,10,13,3,11] +; GENERIC-NEXT: vpshufb {{.*#+}} xmm0 {%k1} {z} = xmm1[14,9,15,9,7,10,15,14,12,1,9,7,10,13,3,11] sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_16xi8_perm_mem_mask1: @@ -5206,7 +5206,7 @@ define <16 x i8> @test_masked_16xi8_perm_mem_mask2(<16 x i8>* %vp, <16 x i8> %ve ; GENERIC-NEXT: vmovdqa (%rdi), %xmm2 # sched: [6:0.50] ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqb %xmm3, %xmm1, %k1 -; GENERIC-NEXT: vpshufb {{.*#+}} xmm0 {%k1} = xmm2[1,3,12,5,13,1,2,11,0,9,14,8,10,0,10,9] +; GENERIC-NEXT: vpshufb {{.*#+}} xmm0 {%k1} = xmm2[1,3,12,5,13,1,2,11,0,9,14,8,10,0,10,9] sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_16xi8_perm_mem_mask2: @@ -5229,7 +5229,7 @@ define <16 x i8> @test_masked_z_16xi8_perm_mem_mask2(<16 x i8>* %vp, <16 x i8> % ; GENERIC-NEXT: vmovdqa (%rdi), %xmm1 # sched: [6:0.50] ; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqb %xmm2, %xmm0, %k1 -; GENERIC-NEXT: vpshufb {{.*#+}} xmm0 {%k1} {z} = xmm1[1,3,12,5,13,1,2,11,0,9,14,8,10,0,10,9] +; GENERIC-NEXT: vpshufb {{.*#+}} xmm0 {%k1} {z} = xmm1[1,3,12,5,13,1,2,11,0,9,14,8,10,0,10,9] sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_16xi8_perm_mem_mask2: @@ -5268,7 +5268,7 @@ define <16 x i8> @test_masked_16xi8_perm_mem_mask3(<16 x i8>* %vp, <16 x i8> %ve ; GENERIC-NEXT: vmovdqa (%rdi), %xmm2 # sched: [6:0.50] ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqb %xmm3, %xmm1, %k1 -; GENERIC-NEXT: vpshufb {{.*#+}} xmm0 {%k1} = xmm2[9,6,5,15,0,0,15,2,1,3,12,14,0,6,1,4] +; GENERIC-NEXT: vpshufb {{.*#+}} xmm0 {%k1} = xmm2[9,6,5,15,0,0,15,2,1,3,12,14,0,6,1,4] sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_16xi8_perm_mem_mask3: @@ -5291,7 +5291,7 @@ define <16 x i8> @test_masked_z_16xi8_perm_mem_mask3(<16 x i8>* %vp, <16 x i8> % ; GENERIC-NEXT: vmovdqa (%rdi), %xmm1 # sched: [6:0.50] ; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqb %xmm2, %xmm0, %k1 -; GENERIC-NEXT: vpshufb {{.*#+}} xmm0 {%k1} {z} = xmm1[9,6,5,15,0,0,15,2,1,3,12,14,0,6,1,4] +; GENERIC-NEXT: vpshufb {{.*#+}} xmm0 {%k1} {z} = xmm1[9,6,5,15,0,0,15,2,1,3,12,14,0,6,1,4] sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_16xi8_perm_mem_mask3: @@ -5326,7 +5326,7 @@ define <32 x i8> @test_masked_32xi8_perm_mask0(<32 x i8> %vec, <32 x i8> %vec2, ; GENERIC: # BB#0: ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqb %ymm3, %ymm2, %k1 -; GENERIC-NEXT: vpshufb {{.*#+}} ymm1 {%k1} = ymm0[8,0,1,15,3,5,11,13,14,2,10,15,0,10,13,5,20,25,23,18,23,22,25,24,20,21,29,20,24,16,27,21] +; GENERIC-NEXT: vpshufb {{.*#+}} ymm1 {%k1} = ymm0[8,0,1,15,3,5,11,13,14,2,10,15,0,10,13,5,20,25,23,18,23,22,25,24,20,21,29,20,24,16,27,21] sched: [5:1.00] ; GENERIC-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.50] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -5348,7 +5348,7 @@ define <32 x i8> @test_masked_z_32xi8_perm_mask0(<32 x i8> %vec, <32 x i8> %mask ; GENERIC: # BB#0: ; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqb %ymm2, %ymm1, %k1 -; GENERIC-NEXT: vpshufb {{.*#+}} ymm0 {%k1} {z} = ymm0[8,0,1,15,3,5,11,13,14,2,10,15,0,10,13,5,20,25,23,18,23,22,25,24,20,21,29,20,24,16,27,21] +; GENERIC-NEXT: vpshufb {{.*#+}} ymm0 {%k1} {z} = ymm0[8,0,1,15,3,5,11,13,14,2,10,15,0,10,13,5,20,25,23,18,23,22,25,24,20,21,29,20,24,16,27,21] sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_32xi8_perm_mask0: @@ -5367,7 +5367,7 @@ define <32 x i8> @test_masked_32xi8_perm_mask1(<32 x i8> %vec, <32 x i8> %vec2, ; GENERIC: # BB#0: ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqb %ymm3, %ymm2, %k1 -; GENERIC-NEXT: vpshufb {{.*#+}} ymm1 {%k1} = ymm0[0,4,3,15,5,4,5,15,10,9,11,6,6,10,0,3,21,19,26,22,30,25,22,22,27,22,26,16,23,20,18,24] +; GENERIC-NEXT: vpshufb {{.*#+}} ymm1 {%k1} = ymm0[0,4,3,15,5,4,5,15,10,9,11,6,6,10,0,3,21,19,26,22,30,25,22,22,27,22,26,16,23,20,18,24] sched: [5:1.00] ; GENERIC-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.50] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -5389,7 +5389,7 @@ define <32 x i8> @test_masked_z_32xi8_perm_mask1(<32 x i8> %vec, <32 x i8> %mask ; GENERIC: # BB#0: ; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqb %ymm2, %ymm1, %k1 -; GENERIC-NEXT: vpshufb {{.*#+}} ymm0 {%k1} {z} = ymm0[0,4,3,15,5,4,5,15,10,9,11,6,6,10,0,3,21,19,26,22,30,25,22,22,27,22,26,16,23,20,18,24] +; GENERIC-NEXT: vpshufb {{.*#+}} ymm0 {%k1} {z} = ymm0[0,4,3,15,5,4,5,15,10,9,11,6,6,10,0,3,21,19,26,22,30,25,22,22,27,22,26,16,23,20,18,24] sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_32xi8_perm_mask1: @@ -5408,7 +5408,7 @@ define <32 x i8> @test_masked_32xi8_perm_mask2(<32 x i8> %vec, <32 x i8> %vec2, ; GENERIC: # BB#0: ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqb %ymm3, %ymm2, %k1 -; GENERIC-NEXT: vpshufb {{.*#+}} ymm1 {%k1} = ymm0[7,8,12,14,7,4,7,12,14,12,3,15,10,1,11,15,22,26,21,19,27,16,29,24,17,17,26,29,20,31,17,29] +; GENERIC-NEXT: vpshufb {{.*#+}} ymm1 {%k1} = ymm0[7,8,12,14,7,4,7,12,14,12,3,15,10,1,11,15,22,26,21,19,27,16,29,24,17,17,26,29,20,31,17,29] sched: [5:1.00] ; GENERIC-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.50] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -5430,7 +5430,7 @@ define <32 x i8> @test_masked_z_32xi8_perm_mask2(<32 x i8> %vec, <32 x i8> %mask ; GENERIC: # BB#0: ; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqb %ymm2, %ymm1, %k1 -; GENERIC-NEXT: vpshufb {{.*#+}} ymm0 {%k1} {z} = ymm0[7,8,12,14,7,4,7,12,14,12,3,15,10,1,11,15,22,26,21,19,27,16,29,24,17,17,26,29,20,31,17,29] +; GENERIC-NEXT: vpshufb {{.*#+}} ymm0 {%k1} {z} = ymm0[7,8,12,14,7,4,7,12,14,12,3,15,10,1,11,15,22,26,21,19,27,16,29,24,17,17,26,29,20,31,17,29] sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_32xi8_perm_mask2: @@ -5462,7 +5462,7 @@ define <32 x i8> @test_masked_32xi8_perm_mask3(<32 x i8> %vec, <32 x i8> %vec2, ; GENERIC: # BB#0: ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqb %ymm3, %ymm2, %k1 -; GENERIC-NEXT: vpshufb {{.*#+}} ymm1 {%k1} = ymm0[6,1,4,7,12,13,2,8,10,5,13,4,0,0,10,8,31,31,30,16,27,27,26,27,30,26,21,24,19,25,16,18] +; GENERIC-NEXT: vpshufb {{.*#+}} ymm1 {%k1} = ymm0[6,1,4,7,12,13,2,8,10,5,13,4,0,0,10,8,31,31,30,16,27,27,26,27,30,26,21,24,19,25,16,18] sched: [5:1.00] ; GENERIC-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.50] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -5484,7 +5484,7 @@ define <32 x i8> @test_masked_z_32xi8_perm_mask3(<32 x i8> %vec, <32 x i8> %mask ; GENERIC: # BB#0: ; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqb %ymm2, %ymm1, %k1 -; GENERIC-NEXT: vpshufb {{.*#+}} ymm0 {%k1} {z} = ymm0[6,1,4,7,12,13,2,8,10,5,13,4,0,0,10,8,31,31,30,16,27,27,26,27,30,26,21,24,19,25,16,18] +; GENERIC-NEXT: vpshufb {{.*#+}} ymm0 {%k1} {z} = ymm0[6,1,4,7,12,13,2,8,10,5,13,4,0,0,10,8,31,31,30,16,27,27,26,27,30,26,21,24,19,25,16,18] sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_32xi8_perm_mask3: @@ -5520,7 +5520,7 @@ define <32 x i8> @test_masked_32xi8_perm_mem_mask0(<32 x i8>* %vp, <32 x i8> %ve ; GENERIC-NEXT: vmovdqa (%rdi), %ymm2 # sched: [7:0.50] ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqb %ymm3, %ymm1, %k1 -; GENERIC-NEXT: vpshufb {{.*#+}} ymm0 {%k1} = ymm2[9,0,2,15,4,6,8,4,7,3,0,2,8,1,6,5,22,17,30,23,29,31,21,23,27,22,20,27,30,30,26,22] +; GENERIC-NEXT: vpshufb {{.*#+}} ymm0 {%k1} = ymm2[9,0,2,15,4,6,8,4,7,3,0,2,8,1,6,5,22,17,30,23,29,31,21,23,27,22,20,27,30,30,26,22] sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_32xi8_perm_mem_mask0: @@ -5543,7 +5543,7 @@ define <32 x i8> @test_masked_z_32xi8_perm_mem_mask0(<32 x i8>* %vp, <32 x i8> % ; GENERIC-NEXT: vmovdqa (%rdi), %ymm1 # sched: [7:0.50] ; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqb %ymm2, %ymm0, %k1 -; GENERIC-NEXT: vpshufb {{.*#+}} ymm0 {%k1} {z} = ymm1[9,0,2,15,4,6,8,4,7,3,0,2,8,1,6,5,22,17,30,23,29,31,21,23,27,22,20,27,30,30,26,22] +; GENERIC-NEXT: vpshufb {{.*#+}} ymm0 {%k1} {z} = ymm1[9,0,2,15,4,6,8,4,7,3,0,2,8,1,6,5,22,17,30,23,29,31,21,23,27,22,20,27,30,30,26,22] sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_32xi8_perm_mem_mask0: @@ -5566,7 +5566,7 @@ define <32 x i8> @test_masked_32xi8_perm_mem_mask1(<32 x i8>* %vp, <32 x i8> %ve ; GENERIC-NEXT: vmovdqa (%rdi), %ymm2 # sched: [7:0.50] ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqb %ymm3, %ymm1, %k1 -; GENERIC-NEXT: vpshufb {{.*#+}} ymm0 {%k1} = ymm2[15,10,1,1,11,0,0,6,8,7,7,9,10,6,5,15,20,28,22,21,17,29,27,30,23,26,17,22,19,16,31,19] +; GENERIC-NEXT: vpshufb {{.*#+}} ymm0 {%k1} = ymm2[15,10,1,1,11,0,0,6,8,7,7,9,10,6,5,15,20,28,22,21,17,29,27,30,23,26,17,22,19,16,31,19] sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_32xi8_perm_mem_mask1: @@ -5589,7 +5589,7 @@ define <32 x i8> @test_masked_z_32xi8_perm_mem_mask1(<32 x i8>* %vp, <32 x i8> % ; GENERIC-NEXT: vmovdqa (%rdi), %ymm1 # sched: [7:0.50] ; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqb %ymm2, %ymm0, %k1 -; GENERIC-NEXT: vpshufb {{.*#+}} ymm0 {%k1} {z} = ymm1[15,10,1,1,11,0,0,6,8,7,7,9,10,6,5,15,20,28,22,21,17,29,27,30,23,26,17,22,19,16,31,19] +; GENERIC-NEXT: vpshufb {{.*#+}} ymm0 {%k1} {z} = ymm1[15,10,1,1,11,0,0,6,8,7,7,9,10,6,5,15,20,28,22,21,17,29,27,30,23,26,17,22,19,16,31,19] sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_32xi8_perm_mem_mask1: @@ -5612,7 +5612,7 @@ define <32 x i8> @test_masked_32xi8_perm_mem_mask2(<32 x i8>* %vp, <32 x i8> %ve ; GENERIC-NEXT: vmovdqa (%rdi), %ymm2 # sched: [7:0.50] ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqb %ymm3, %ymm1, %k1 -; GENERIC-NEXT: vpshufb {{.*#+}} ymm0 {%k1} = ymm2[2,3,6,8,2,15,15,2,6,10,14,7,14,5,7,7,26,19,25,19,21,31,30,29,16,18,20,28,29,25,27,28] +; GENERIC-NEXT: vpshufb {{.*#+}} ymm0 {%k1} = ymm2[2,3,6,8,2,15,15,2,6,10,14,7,14,5,7,7,26,19,25,19,21,31,30,29,16,18,20,28,29,25,27,28] sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_32xi8_perm_mem_mask2: @@ -5635,7 +5635,7 @@ define <32 x i8> @test_masked_z_32xi8_perm_mem_mask2(<32 x i8>* %vp, <32 x i8> % ; GENERIC-NEXT: vmovdqa (%rdi), %ymm1 # sched: [7:0.50] ; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqb %ymm2, %ymm0, %k1 -; GENERIC-NEXT: vpshufb {{.*#+}} ymm0 {%k1} {z} = ymm1[2,3,6,8,2,15,15,2,6,10,14,7,14,5,7,7,26,19,25,19,21,31,30,29,16,18,20,28,29,25,27,28] +; GENERIC-NEXT: vpshufb {{.*#+}} ymm0 {%k1} {z} = ymm1[2,3,6,8,2,15,15,2,6,10,14,7,14,5,7,7,26,19,25,19,21,31,30,29,16,18,20,28,29,25,27,28] sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_32xi8_perm_mem_mask2: @@ -5674,7 +5674,7 @@ define <32 x i8> @test_masked_32xi8_perm_mem_mask3(<32 x i8>* %vp, <32 x i8> %ve ; GENERIC-NEXT: vmovdqa (%rdi), %ymm2 # sched: [7:0.50] ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqb %ymm3, %ymm1, %k1 -; GENERIC-NEXT: vpshufb {{.*#+}} ymm0 {%k1} = ymm2[1,1,13,0,3,0,0,13,5,2,2,10,15,8,14,8,25,26,28,28,31,27,30,19,24,25,29,23,28,22,25,29] +; GENERIC-NEXT: vpshufb {{.*#+}} ymm0 {%k1} = ymm2[1,1,13,0,3,0,0,13,5,2,2,10,15,8,14,8,25,26,28,28,31,27,30,19,24,25,29,23,28,22,25,29] sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_32xi8_perm_mem_mask3: @@ -5697,7 +5697,7 @@ define <32 x i8> @test_masked_z_32xi8_perm_mem_mask3(<32 x i8>* %vp, <32 x i8> % ; GENERIC-NEXT: vmovdqa (%rdi), %ymm1 # sched: [7:0.50] ; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqb %ymm2, %ymm0, %k1 -; GENERIC-NEXT: vpshufb {{.*#+}} ymm0 {%k1} {z} = ymm1[1,1,13,0,3,0,0,13,5,2,2,10,15,8,14,8,25,26,28,28,31,27,30,19,24,25,29,23,28,22,25,29] +; GENERIC-NEXT: vpshufb {{.*#+}} ymm0 {%k1} {z} = ymm1[1,1,13,0,3,0,0,13,5,2,2,10,15,8,14,8,25,26,28,28,31,27,30,19,24,25,29,23,28,22,25,29] sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_32xi8_perm_mem_mask3: @@ -5717,7 +5717,7 @@ define <32 x i8> @test_masked_z_32xi8_perm_mem_mask3(<32 x i8>* %vp, <32 x i8> % define <64 x i8> @test_64xi8_perm_mask0(<64 x i8> %vec) { ; GENERIC-LABEL: test_64xi8_perm_mask0: ; GENERIC: # BB#0: -; GENERIC-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[8,4,1,13,15,4,6,12,0,10,2,4,13,0,0,6,23,29,27,26,18,31,22,25,22,16,23,18,16,25,26,17,40,37,38,44,39,46,41,39,42,37,33,42,41,44,34,46,60,62,61,58,60,56,60,51,60,55,60,55,60,49,48,62] +; GENERIC-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[8,4,1,13,15,4,6,12,0,10,2,4,13,0,0,6,23,29,27,26,18,31,22,25,22,16,23,18,16,25,26,17,40,37,38,44,39,46,41,39,42,37,33,42,41,44,34,46,60,62,61,58,60,56,60,51,60,55,60,55,60,49,48,62] sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_64xi8_perm_mask0: @@ -5732,7 +5732,7 @@ define <64 x i8> @test_masked_64xi8_perm_mask0(<64 x i8> %vec, <64 x i8> %vec2, ; GENERIC: # BB#0: ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqb %zmm3, %zmm2, %k1 -; GENERIC-NEXT: vpshufb {{.*#+}} zmm1 {%k1} = zmm0[8,4,1,13,15,4,6,12,0,10,2,4,13,0,0,6,23,29,27,26,18,31,22,25,22,16,23,18,16,25,26,17,40,37,38,44,39,46,41,39,42,37,33,42,41,44,34,46,60,62,61,58,60,56,60,51,60,55,60,55,60,49,48,62] +; GENERIC-NEXT: vpshufb {{.*#+}} zmm1 {%k1} = zmm0[8,4,1,13,15,4,6,12,0,10,2,4,13,0,0,6,23,29,27,26,18,31,22,25,22,16,23,18,16,25,26,17,40,37,38,44,39,46,41,39,42,37,33,42,41,44,34,46,60,62,61,58,60,56,60,51,60,55,60,55,60,49,48,62] sched: [5:1.00] ; GENERIC-NEXT: vmovdqa64 %zmm1, %zmm0 ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -5754,7 +5754,7 @@ define <64 x i8> @test_masked_z_64xi8_perm_mask0(<64 x i8> %vec, <64 x i8> %mask ; GENERIC: # BB#0: ; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqb %zmm2, %zmm1, %k1 -; GENERIC-NEXT: vpshufb {{.*#+}} zmm0 {%k1} {z} = zmm0[8,4,1,13,15,4,6,12,0,10,2,4,13,0,0,6,23,29,27,26,18,31,22,25,22,16,23,18,16,25,26,17,40,37,38,44,39,46,41,39,42,37,33,42,41,44,34,46,60,62,61,58,60,56,60,51,60,55,60,55,60,49,48,62] +; GENERIC-NEXT: vpshufb {{.*#+}} zmm0 {%k1} {z} = zmm0[8,4,1,13,15,4,6,12,0,10,2,4,13,0,0,6,23,29,27,26,18,31,22,25,22,16,23,18,16,25,26,17,40,37,38,44,39,46,41,39,42,37,33,42,41,44,34,46,60,62,61,58,60,56,60,51,60,55,60,55,60,49,48,62] sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_64xi8_perm_mask0: @@ -5773,7 +5773,7 @@ define <64 x i8> @test_masked_64xi8_perm_mask1(<64 x i8> %vec, <64 x i8> %vec2, ; GENERIC: # BB#0: ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqb %zmm3, %zmm2, %k1 -; GENERIC-NEXT: vpshufb {{.*#+}} zmm1 {%k1} = zmm0[7,14,15,10,9,3,1,13,14,12,11,6,4,1,6,9,30,30,22,17,28,27,16,23,26,16,30,31,27,17,17,21,32,37,32,47,45,33,46,35,35,42,47,33,32,37,32,41,61,50,49,53,63,50,63,53,55,52,62,63,58,50,63,49] +; GENERIC-NEXT: vpshufb {{.*#+}} zmm1 {%k1} = zmm0[7,14,15,10,9,3,1,13,14,12,11,6,4,1,6,9,30,30,22,17,28,27,16,23,26,16,30,31,27,17,17,21,32,37,32,47,45,33,46,35,35,42,47,33,32,37,32,41,61,50,49,53,63,50,63,53,55,52,62,63,58,50,63,49] sched: [5:1.00] ; GENERIC-NEXT: vmovdqa64 %zmm1, %zmm0 ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -5795,7 +5795,7 @@ define <64 x i8> @test_masked_z_64xi8_perm_mask1(<64 x i8> %vec, <64 x i8> %mask ; GENERIC: # BB#0: ; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqb %zmm2, %zmm1, %k1 -; GENERIC-NEXT: vpshufb {{.*#+}} zmm0 {%k1} {z} = zmm0[7,14,15,10,9,3,1,13,14,12,11,6,4,1,6,9,30,30,22,17,28,27,16,23,26,16,30,31,27,17,17,21,32,37,32,47,45,33,46,35,35,42,47,33,32,37,32,41,61,50,49,53,63,50,63,53,55,52,62,63,58,50,63,49] +; GENERIC-NEXT: vpshufb {{.*#+}} zmm0 {%k1} {z} = zmm0[7,14,15,10,9,3,1,13,14,12,11,6,4,1,6,9,30,30,22,17,28,27,16,23,26,16,30,31,27,17,17,21,32,37,32,47,45,33,46,35,35,42,47,33,32,37,32,41,61,50,49,53,63,50,63,53,55,52,62,63,58,50,63,49] sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_64xi8_perm_mask1: @@ -5814,7 +5814,7 @@ define <64 x i8> @test_masked_64xi8_perm_mask2(<64 x i8> %vec, <64 x i8> %vec2, ; GENERIC: # BB#0: ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqb %zmm3, %zmm2, %k1 -; GENERIC-NEXT: vpshufb {{.*#+}} zmm1 {%k1} = zmm0[9,2,14,15,12,5,3,12,4,6,0,2,0,1,1,6,24,27,18,22,26,17,23,21,31,16,22,22,27,21,19,20,39,47,44,36,40,43,44,39,38,44,38,35,39,46,34,39,58,55,51,48,59,57,48,52,60,58,56,50,59,55,58,60] +; GENERIC-NEXT: vpshufb {{.*#+}} zmm1 {%k1} = zmm0[9,2,14,15,12,5,3,12,4,6,0,2,0,1,1,6,24,27,18,22,26,17,23,21,31,16,22,22,27,21,19,20,39,47,44,36,40,43,44,39,38,44,38,35,39,46,34,39,58,55,51,48,59,57,48,52,60,58,56,50,59,55,58,60] sched: [5:1.00] ; GENERIC-NEXT: vmovdqa64 %zmm1, %zmm0 ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -5836,7 +5836,7 @@ define <64 x i8> @test_masked_z_64xi8_perm_mask2(<64 x i8> %vec, <64 x i8> %mask ; GENERIC: # BB#0: ; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqb %zmm2, %zmm1, %k1 -; GENERIC-NEXT: vpshufb {{.*#+}} zmm0 {%k1} {z} = zmm0[9,2,14,15,12,5,3,12,4,6,0,2,0,1,1,6,24,27,18,22,26,17,23,21,31,16,22,22,27,21,19,20,39,47,44,36,40,43,44,39,38,44,38,35,39,46,34,39,58,55,51,48,59,57,48,52,60,58,56,50,59,55,58,60] +; GENERIC-NEXT: vpshufb {{.*#+}} zmm0 {%k1} {z} = zmm0[9,2,14,15,12,5,3,12,4,6,0,2,0,1,1,6,24,27,18,22,26,17,23,21,31,16,22,22,27,21,19,20,39,47,44,36,40,43,44,39,38,44,38,35,39,46,34,39,58,55,51,48,59,57,48,52,60,58,56,50,59,55,58,60] sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_64xi8_perm_mask2: @@ -5853,7 +5853,7 @@ define <64 x i8> @test_masked_z_64xi8_perm_mask2(<64 x i8> %vec, <64 x i8> %mask define <64 x i8> @test_64xi8_perm_mask3(<64 x i8> %vec) { ; GENERIC-LABEL: test_64xi8_perm_mask3: ; GENERIC: # BB#0: -; GENERIC-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[3,12,4,15,1,14,0,4,8,9,6,1,4,4,12,14,25,16,28,20,21,24,19,30,18,22,20,24,25,26,24,22,42,38,44,44,36,37,42,34,43,38,41,34,42,37,39,38,55,59,53,58,48,52,59,48,57,48,55,62,48,56,49,61] +; GENERIC-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[3,12,4,15,1,14,0,4,8,9,6,1,4,4,12,14,25,16,28,20,21,24,19,30,18,22,20,24,25,26,24,22,42,38,44,44,36,37,42,34,43,38,41,34,42,37,39,38,55,59,53,58,48,52,59,48,57,48,55,62,48,56,49,61] sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_64xi8_perm_mask3: @@ -5868,7 +5868,7 @@ define <64 x i8> @test_masked_64xi8_perm_mask3(<64 x i8> %vec, <64 x i8> %vec2, ; GENERIC: # BB#0: ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqb %zmm3, %zmm2, %k1 -; GENERIC-NEXT: vpshufb {{.*#+}} zmm1 {%k1} = zmm0[3,12,4,15,1,14,0,4,8,9,6,1,4,4,12,14,25,16,28,20,21,24,19,30,18,22,20,24,25,26,24,22,42,38,44,44,36,37,42,34,43,38,41,34,42,37,39,38,55,59,53,58,48,52,59,48,57,48,55,62,48,56,49,61] +; GENERIC-NEXT: vpshufb {{.*#+}} zmm1 {%k1} = zmm0[3,12,4,15,1,14,0,4,8,9,6,1,4,4,12,14,25,16,28,20,21,24,19,30,18,22,20,24,25,26,24,22,42,38,44,44,36,37,42,34,43,38,41,34,42,37,39,38,55,59,53,58,48,52,59,48,57,48,55,62,48,56,49,61] sched: [5:1.00] ; GENERIC-NEXT: vmovdqa64 %zmm1, %zmm0 ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -5890,7 +5890,7 @@ define <64 x i8> @test_masked_z_64xi8_perm_mask3(<64 x i8> %vec, <64 x i8> %mask ; GENERIC: # BB#0: ; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqb %zmm2, %zmm1, %k1 -; GENERIC-NEXT: vpshufb {{.*#+}} zmm0 {%k1} {z} = zmm0[3,12,4,15,1,14,0,4,8,9,6,1,4,4,12,14,25,16,28,20,21,24,19,30,18,22,20,24,25,26,24,22,42,38,44,44,36,37,42,34,43,38,41,34,42,37,39,38,55,59,53,58,48,52,59,48,57,48,55,62,48,56,49,61] +; GENERIC-NEXT: vpshufb {{.*#+}} zmm0 {%k1} {z} = zmm0[3,12,4,15,1,14,0,4,8,9,6,1,4,4,12,14,25,16,28,20,21,24,19,30,18,22,20,24,25,26,24,22,42,38,44,44,36,37,42,34,43,38,41,34,42,37,39,38,55,59,53,58,48,52,59,48,57,48,55,62,48,56,49,61] sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_64xi8_perm_mask3: @@ -5908,7 +5908,7 @@ define <64 x i8> @test_64xi8_perm_mem_mask0(<64 x i8>* %vp) { ; GENERIC-LABEL: test_64xi8_perm_mem_mask0: ; GENERIC: # BB#0: ; GENERIC-NEXT: vmovdqa64 (%rdi), %zmm0 # sched: [4:0.50] -; GENERIC-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[0,9,15,13,11,11,3,12,4,1,7,5,2,6,14,6,23,27,24,18,30,23,28,22,28,22,19,19,31,25,16,22,35,33,34,32,42,34,41,41,43,40,36,46,37,39,42,40,63,63,62,62,57,55,59,51,52,48,50,48,58,50,60,58] +; GENERIC-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[0,9,15,13,11,11,3,12,4,1,7,5,2,6,14,6,23,27,24,18,30,23,28,22,28,22,19,19,31,25,16,22,35,33,34,32,42,34,41,41,43,40,36,46,37,39,42,40,63,63,62,62,57,55,59,51,52,48,50,48,58,50,60,58] sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_64xi8_perm_mem_mask0: @@ -5926,7 +5926,7 @@ define <64 x i8> @test_masked_64xi8_perm_mem_mask0(<64 x i8>* %vp, <64 x i8> %ve ; GENERIC-NEXT: vmovdqa64 (%rdi), %zmm2 # sched: [4:0.50] ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqb %zmm3, %zmm1, %k1 -; GENERIC-NEXT: vpshufb {{.*#+}} zmm0 {%k1} = zmm2[0,9,15,13,11,11,3,12,4,1,7,5,2,6,14,6,23,27,24,18,30,23,28,22,28,22,19,19,31,25,16,22,35,33,34,32,42,34,41,41,43,40,36,46,37,39,42,40,63,63,62,62,57,55,59,51,52,48,50,48,58,50,60,58] +; GENERIC-NEXT: vpshufb {{.*#+}} zmm0 {%k1} = zmm2[0,9,15,13,11,11,3,12,4,1,7,5,2,6,14,6,23,27,24,18,30,23,28,22,28,22,19,19,31,25,16,22,35,33,34,32,42,34,41,41,43,40,36,46,37,39,42,40,63,63,62,62,57,55,59,51,52,48,50,48,58,50,60,58] sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_64xi8_perm_mem_mask0: @@ -5949,7 +5949,7 @@ define <64 x i8> @test_masked_z_64xi8_perm_mem_mask0(<64 x i8>* %vp, <64 x i8> % ; GENERIC-NEXT: vmovdqa64 (%rdi), %zmm1 # sched: [4:0.50] ; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqb %zmm2, %zmm0, %k1 -; GENERIC-NEXT: vpshufb {{.*#+}} zmm0 {%k1} {z} = zmm1[0,9,15,13,11,11,3,12,4,1,7,5,2,6,14,6,23,27,24,18,30,23,28,22,28,22,19,19,31,25,16,22,35,33,34,32,42,34,41,41,43,40,36,46,37,39,42,40,63,63,62,62,57,55,59,51,52,48,50,48,58,50,60,58] +; GENERIC-NEXT: vpshufb {{.*#+}} zmm0 {%k1} {z} = zmm1[0,9,15,13,11,11,3,12,4,1,7,5,2,6,14,6,23,27,24,18,30,23,28,22,28,22,19,19,31,25,16,22,35,33,34,32,42,34,41,41,43,40,36,46,37,39,42,40,63,63,62,62,57,55,59,51,52,48,50,48,58,50,60,58] sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_64xi8_perm_mem_mask0: @@ -5972,7 +5972,7 @@ define <64 x i8> @test_masked_64xi8_perm_mem_mask1(<64 x i8>* %vp, <64 x i8> %ve ; GENERIC-NEXT: vmovdqa64 (%rdi), %zmm2 # sched: [4:0.50] ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqb %zmm3, %zmm1, %k1 -; GENERIC-NEXT: vpshufb {{.*#+}} zmm0 {%k1} = zmm2[15,6,14,7,5,1,14,12,5,7,5,0,0,5,3,8,19,19,26,27,20,29,20,21,27,16,30,17,23,27,16,28,47,39,33,33,33,44,38,46,39,33,38,44,45,32,34,39,50,61,62,53,54,56,52,56,51,52,55,57,56,52,51,49] +; GENERIC-NEXT: vpshufb {{.*#+}} zmm0 {%k1} = zmm2[15,6,14,7,5,1,14,12,5,7,5,0,0,5,3,8,19,19,26,27,20,29,20,21,27,16,30,17,23,27,16,28,47,39,33,33,33,44,38,46,39,33,38,44,45,32,34,39,50,61,62,53,54,56,52,56,51,52,55,57,56,52,51,49] sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_64xi8_perm_mem_mask1: @@ -5995,7 +5995,7 @@ define <64 x i8> @test_masked_z_64xi8_perm_mem_mask1(<64 x i8>* %vp, <64 x i8> % ; GENERIC-NEXT: vmovdqa64 (%rdi), %zmm1 # sched: [4:0.50] ; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqb %zmm2, %zmm0, %k1 -; GENERIC-NEXT: vpshufb {{.*#+}} zmm0 {%k1} {z} = zmm1[15,6,14,7,5,1,14,12,5,7,5,0,0,5,3,8,19,19,26,27,20,29,20,21,27,16,30,17,23,27,16,28,47,39,33,33,33,44,38,46,39,33,38,44,45,32,34,39,50,61,62,53,54,56,52,56,51,52,55,57,56,52,51,49] +; GENERIC-NEXT: vpshufb {{.*#+}} zmm0 {%k1} {z} = zmm1[15,6,14,7,5,1,14,12,5,7,5,0,0,5,3,8,19,19,26,27,20,29,20,21,27,16,30,17,23,27,16,28,47,39,33,33,33,44,38,46,39,33,38,44,45,32,34,39,50,61,62,53,54,56,52,56,51,52,55,57,56,52,51,49] sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_64xi8_perm_mem_mask1: @@ -6018,7 +6018,7 @@ define <64 x i8> @test_masked_64xi8_perm_mem_mask2(<64 x i8>* %vp, <64 x i8> %ve ; GENERIC-NEXT: vmovdqa64 (%rdi), %zmm2 # sched: [4:0.50] ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqb %zmm3, %zmm1, %k1 -; GENERIC-NEXT: vpshufb {{.*#+}} zmm0 {%k1} = zmm2[12,1,11,3,4,11,10,11,8,13,1,10,1,11,5,10,27,26,19,29,19,24,26,19,26,20,18,28,24,21,25,16,34,38,47,40,33,44,44,44,41,43,35,43,45,44,37,41,58,62,49,61,56,53,55,48,51,58,58,55,63,55,53,61] +; GENERIC-NEXT: vpshufb {{.*#+}} zmm0 {%k1} = zmm2[12,1,11,3,4,11,10,11,8,13,1,10,1,11,5,10,27,26,19,29,19,24,26,19,26,20,18,28,24,21,25,16,34,38,47,40,33,44,44,44,41,43,35,43,45,44,37,41,58,62,49,61,56,53,55,48,51,58,58,55,63,55,53,61] sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_64xi8_perm_mem_mask2: @@ -6041,7 +6041,7 @@ define <64 x i8> @test_masked_z_64xi8_perm_mem_mask2(<64 x i8>* %vp, <64 x i8> % ; GENERIC-NEXT: vmovdqa64 (%rdi), %zmm1 # sched: [4:0.50] ; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqb %zmm2, %zmm0, %k1 -; GENERIC-NEXT: vpshufb {{.*#+}} zmm0 {%k1} {z} = zmm1[12,1,11,3,4,11,10,11,8,13,1,10,1,11,5,10,27,26,19,29,19,24,26,19,26,20,18,28,24,21,25,16,34,38,47,40,33,44,44,44,41,43,35,43,45,44,37,41,58,62,49,61,56,53,55,48,51,58,58,55,63,55,53,61] +; GENERIC-NEXT: vpshufb {{.*#+}} zmm0 {%k1} {z} = zmm1[12,1,11,3,4,11,10,11,8,13,1,10,1,11,5,10,27,26,19,29,19,24,26,19,26,20,18,28,24,21,25,16,34,38,47,40,33,44,44,44,41,43,35,43,45,44,37,41,58,62,49,61,56,53,55,48,51,58,58,55,63,55,53,61] sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_64xi8_perm_mem_mask2: @@ -6062,7 +6062,7 @@ define <64 x i8> @test_64xi8_perm_mem_mask3(<64 x i8>* %vp) { ; GENERIC-LABEL: test_64xi8_perm_mem_mask3: ; GENERIC: # BB#0: ; GENERIC-NEXT: vmovdqa64 (%rdi), %zmm0 # sched: [4:0.50] -; GENERIC-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[4,9,11,13,12,6,0,0,11,15,5,7,11,10,4,10,20,21,24,27,18,16,26,16,16,19,26,17,16,31,22,30,35,38,37,34,37,47,43,38,38,36,40,43,42,39,32,46,54,54,48,50,61,56,59,50,53,61,61,51,48,60,50,60] +; GENERIC-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[4,9,11,13,12,6,0,0,11,15,5,7,11,10,4,10,20,21,24,27,18,16,26,16,16,19,26,17,16,31,22,30,35,38,37,34,37,47,43,38,38,36,40,43,42,39,32,46,54,54,48,50,61,56,59,50,53,61,61,51,48,60,50,60] sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_64xi8_perm_mem_mask3: @@ -6080,7 +6080,7 @@ define <64 x i8> @test_masked_64xi8_perm_mem_mask3(<64 x i8>* %vp, <64 x i8> %ve ; GENERIC-NEXT: vmovdqa64 (%rdi), %zmm2 # sched: [4:0.50] ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqb %zmm3, %zmm1, %k1 -; GENERIC-NEXT: vpshufb {{.*#+}} zmm0 {%k1} = zmm2[4,9,11,13,12,6,0,0,11,15,5,7,11,10,4,10,20,21,24,27,18,16,26,16,16,19,26,17,16,31,22,30,35,38,37,34,37,47,43,38,38,36,40,43,42,39,32,46,54,54,48,50,61,56,59,50,53,61,61,51,48,60,50,60] +; GENERIC-NEXT: vpshufb {{.*#+}} zmm0 {%k1} = zmm2[4,9,11,13,12,6,0,0,11,15,5,7,11,10,4,10,20,21,24,27,18,16,26,16,16,19,26,17,16,31,22,30,35,38,37,34,37,47,43,38,38,36,40,43,42,39,32,46,54,54,48,50,61,56,59,50,53,61,61,51,48,60,50,60] sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_64xi8_perm_mem_mask3: @@ -6103,7 +6103,7 @@ define <64 x i8> @test_masked_z_64xi8_perm_mem_mask3(<64 x i8>* %vp, <64 x i8> % ; GENERIC-NEXT: vmovdqa64 (%rdi), %zmm1 # sched: [4:0.50] ; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqb %zmm2, %zmm0, %k1 -; GENERIC-NEXT: vpshufb {{.*#+}} zmm0 {%k1} {z} = zmm1[4,9,11,13,12,6,0,0,11,15,5,7,11,10,4,10,20,21,24,27,18,16,26,16,16,19,26,17,16,31,22,30,35,38,37,34,37,47,43,38,38,36,40,43,42,39,32,46,54,54,48,50,61,56,59,50,53,61,61,51,48,60,50,60] +; GENERIC-NEXT: vpshufb {{.*#+}} zmm0 {%k1} {z} = zmm1[4,9,11,13,12,6,0,0,11,15,5,7,11,10,4,10,20,21,24,27,18,16,26,16,16,19,26,17,16,31,22,30,35,38,37,34,37,47,43,38,38,36,40,43,42,39,32,46,54,54,48,50,61,56,59,50,53,61,61,51,48,60,50,60] sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_64xi8_perm_mem_mask3: @@ -6138,7 +6138,7 @@ define <8 x i16> @test_masked_8xi16_perm_high_mask0(<8 x i16> %vec, <8 x i16> %v ; GENERIC: # BB#0: ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqw %xmm3, %xmm2, %k1 -; GENERIC-NEXT: vpshufhw {{.*#+}} xmm1 {%k1} = xmm0[0,1,2,3,6,5,7,6] +; GENERIC-NEXT: vpshufhw {{.*#+}} xmm1 {%k1} = xmm0[0,1,2,3,6,5,7,6] sched: [1:1.00] ; GENERIC-NEXT: vmovdqa %xmm1, %xmm0 # sched: [1:0.50] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -6160,7 +6160,7 @@ define <8 x i16> @test_masked_z_8xi16_perm_high_mask0(<8 x i16> %vec, <8 x i16> ; GENERIC: # BB#0: ; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqw %xmm2, %xmm1, %k1 -; GENERIC-NEXT: vpshufhw {{.*#+}} xmm0 {%k1} {z} = xmm0[0,1,2,3,6,5,7,6] +; GENERIC-NEXT: vpshufhw {{.*#+}} xmm0 {%k1} {z} = xmm0[0,1,2,3,6,5,7,6] sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_8xi16_perm_high_mask0: @@ -6179,7 +6179,7 @@ define <8 x i16> @test_masked_8xi16_perm_low_mask1(<8 x i16> %vec, <8 x i16> %ve ; GENERIC: # BB#0: ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqw %xmm3, %xmm2, %k1 -; GENERIC-NEXT: vpshuflw {{.*#+}} xmm1 {%k1} = xmm0[0,3,0,0,4,5,6,7] +; GENERIC-NEXT: vpshuflw {{.*#+}} xmm1 {%k1} = xmm0[0,3,0,0,4,5,6,7] sched: [1:1.00] ; GENERIC-NEXT: vmovdqa %xmm1, %xmm0 # sched: [1:0.50] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -6201,7 +6201,7 @@ define <8 x i16> @test_masked_z_8xi16_perm_low_mask1(<8 x i16> %vec, <8 x i16> % ; GENERIC: # BB#0: ; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqw %xmm2, %xmm1, %k1 -; GENERIC-NEXT: vpshuflw {{.*#+}} xmm0 {%k1} {z} = xmm0[0,3,0,0,4,5,6,7] +; GENERIC-NEXT: vpshuflw {{.*#+}} xmm0 {%k1} {z} = xmm0[0,3,0,0,4,5,6,7] sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_8xi16_perm_low_mask1: @@ -6220,7 +6220,7 @@ define <8 x i16> @test_masked_8xi16_perm_high_mask2(<8 x i16> %vec, <8 x i16> %v ; GENERIC: # BB#0: ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqw %xmm3, %xmm2, %k1 -; GENERIC-NEXT: vpshufhw {{.*#+}} xmm1 {%k1} = xmm0[0,1,2,3,5,4,4,5] +; GENERIC-NEXT: vpshufhw {{.*#+}} xmm1 {%k1} = xmm0[0,1,2,3,5,4,4,5] sched: [1:1.00] ; GENERIC-NEXT: vmovdqa %xmm1, %xmm0 # sched: [1:0.50] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -6242,7 +6242,7 @@ define <8 x i16> @test_masked_z_8xi16_perm_high_mask2(<8 x i16> %vec, <8 x i16> ; GENERIC: # BB#0: ; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqw %xmm2, %xmm1, %k1 -; GENERIC-NEXT: vpshufhw {{.*#+}} xmm0 {%k1} {z} = xmm0[0,1,2,3,5,4,4,5] +; GENERIC-NEXT: vpshufhw {{.*#+}} xmm0 {%k1} {z} = xmm0[0,1,2,3,5,4,4,5] sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_8xi16_perm_high_mask2: @@ -6274,7 +6274,7 @@ define <8 x i16> @test_masked_8xi16_perm_low_mask3(<8 x i16> %vec, <8 x i16> %ve ; GENERIC: # BB#0: ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqw %xmm3, %xmm2, %k1 -; GENERIC-NEXT: vpshuflw {{.*#+}} xmm1 {%k1} = xmm0[2,1,1,1,4,5,6,7] +; GENERIC-NEXT: vpshuflw {{.*#+}} xmm1 {%k1} = xmm0[2,1,1,1,4,5,6,7] sched: [1:1.00] ; GENERIC-NEXT: vmovdqa %xmm1, %xmm0 # sched: [1:0.50] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -6296,7 +6296,7 @@ define <8 x i16> @test_masked_z_8xi16_perm_low_mask3(<8 x i16> %vec, <8 x i16> % ; GENERIC: # BB#0: ; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqw %xmm2, %xmm1, %k1 -; GENERIC-NEXT: vpshuflw {{.*#+}} xmm0 {%k1} {z} = xmm0[2,1,1,1,4,5,6,7] +; GENERIC-NEXT: vpshuflw {{.*#+}} xmm0 {%k1} {z} = xmm0[2,1,1,1,4,5,6,7] sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_8xi16_perm_low_mask3: @@ -6315,7 +6315,7 @@ define <8 x i16> @test_masked_8xi16_perm_high_mask4(<8 x i16> %vec, <8 x i16> %v ; GENERIC: # BB#0: ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqw %xmm3, %xmm2, %k1 -; GENERIC-NEXT: vpshufhw {{.*#+}} xmm1 {%k1} = xmm0[0,1,2,3,5,5,7,6] +; GENERIC-NEXT: vpshufhw {{.*#+}} xmm1 {%k1} = xmm0[0,1,2,3,5,5,7,6] sched: [1:1.00] ; GENERIC-NEXT: vmovdqa %xmm1, %xmm0 # sched: [1:0.50] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -6337,7 +6337,7 @@ define <8 x i16> @test_masked_z_8xi16_perm_high_mask4(<8 x i16> %vec, <8 x i16> ; GENERIC: # BB#0: ; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqw %xmm2, %xmm1, %k1 -; GENERIC-NEXT: vpshufhw {{.*#+}} xmm0 {%k1} {z} = xmm0[0,1,2,3,5,5,7,6] +; GENERIC-NEXT: vpshufhw {{.*#+}} xmm0 {%k1} {z} = xmm0[0,1,2,3,5,5,7,6] sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_8xi16_perm_high_mask4: @@ -6356,7 +6356,7 @@ define <8 x i16> @test_masked_8xi16_perm_low_mask5(<8 x i16> %vec, <8 x i16> %ve ; GENERIC: # BB#0: ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqw %xmm3, %xmm2, %k1 -; GENERIC-NEXT: vpshuflw {{.*#+}} xmm1 {%k1} = xmm0[3,3,2,1,4,5,6,7] +; GENERIC-NEXT: vpshuflw {{.*#+}} xmm1 {%k1} = xmm0[3,3,2,1,4,5,6,7] sched: [1:1.00] ; GENERIC-NEXT: vmovdqa %xmm1, %xmm0 # sched: [1:0.50] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -6378,7 +6378,7 @@ define <8 x i16> @test_masked_z_8xi16_perm_low_mask5(<8 x i16> %vec, <8 x i16> % ; GENERIC: # BB#0: ; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqw %xmm2, %xmm1, %k1 -; GENERIC-NEXT: vpshuflw {{.*#+}} xmm0 {%k1} {z} = xmm0[3,3,2,1,4,5,6,7] +; GENERIC-NEXT: vpshuflw {{.*#+}} xmm0 {%k1} {z} = xmm0[3,3,2,1,4,5,6,7] sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_8xi16_perm_low_mask5: @@ -6410,7 +6410,7 @@ define <8 x i16> @test_masked_8xi16_perm_high_mask6(<8 x i16> %vec, <8 x i16> %v ; GENERIC: # BB#0: ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqw %xmm3, %xmm2, %k1 -; GENERIC-NEXT: vpshufhw {{.*#+}} xmm1 {%k1} = xmm0[0,1,2,3,6,5,6,5] +; GENERIC-NEXT: vpshufhw {{.*#+}} xmm1 {%k1} = xmm0[0,1,2,3,6,5,6,5] sched: [1:1.00] ; GENERIC-NEXT: vmovdqa %xmm1, %xmm0 # sched: [1:0.50] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -6432,7 +6432,7 @@ define <8 x i16> @test_masked_z_8xi16_perm_high_mask6(<8 x i16> %vec, <8 x i16> ; GENERIC: # BB#0: ; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqw %xmm2, %xmm1, %k1 -; GENERIC-NEXT: vpshufhw {{.*#+}} xmm0 {%k1} {z} = xmm0[0,1,2,3,6,5,6,5] +; GENERIC-NEXT: vpshufhw {{.*#+}} xmm0 {%k1} {z} = xmm0[0,1,2,3,6,5,6,5] sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_8xi16_perm_high_mask6: @@ -6451,7 +6451,7 @@ define <8 x i16> @test_masked_8xi16_perm_low_mask7(<8 x i16> %vec, <8 x i16> %ve ; GENERIC: # BB#0: ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqw %xmm3, %xmm2, %k1 -; GENERIC-NEXT: vpshuflw {{.*#+}} xmm1 {%k1} = xmm0[1,0,2,0,4,5,6,7] +; GENERIC-NEXT: vpshuflw {{.*#+}} xmm1 {%k1} = xmm0[1,0,2,0,4,5,6,7] sched: [1:1.00] ; GENERIC-NEXT: vmovdqa %xmm1, %xmm0 # sched: [1:0.50] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -6473,7 +6473,7 @@ define <8 x i16> @test_masked_z_8xi16_perm_low_mask7(<8 x i16> %vec, <8 x i16> % ; GENERIC: # BB#0: ; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqw %xmm2, %xmm1, %k1 -; GENERIC-NEXT: vpshuflw {{.*#+}} xmm0 {%k1} {z} = xmm0[1,0,2,0,4,5,6,7] +; GENERIC-NEXT: vpshuflw {{.*#+}} xmm0 {%k1} {z} = xmm0[1,0,2,0,4,5,6,7] sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_8xi16_perm_low_mask7: @@ -6506,7 +6506,7 @@ define <8 x i16> @test_masked_8xi16_perm_high_mem_mask0(<8 x i16>* %vp, <8 x i16 ; GENERIC: # BB#0: ; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqw %xmm2, %xmm1, %k1 -; GENERIC-NEXT: vpshufhw {{.*#+}} xmm0 {%k1} = mem[0,1,2,3,7,7,4,6] +; GENERIC-NEXT: vpshufhw {{.*#+}} xmm0 {%k1} = mem[0,1,2,3,7,7,4,6] sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_8xi16_perm_high_mem_mask0: @@ -6527,7 +6527,7 @@ define <8 x i16> @test_masked_z_8xi16_perm_high_mem_mask0(<8 x i16>* %vp, <8 x i ; GENERIC: # BB#0: ; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqw %xmm1, %xmm0, %k1 -; GENERIC-NEXT: vpshufhw {{.*#+}} xmm0 {%k1} {z} = mem[0,1,2,3,7,7,4,6] +; GENERIC-NEXT: vpshufhw {{.*#+}} xmm0 {%k1} {z} = mem[0,1,2,3,7,7,4,6] sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_8xi16_perm_high_mem_mask0: @@ -6548,7 +6548,7 @@ define <8 x i16> @test_masked_8xi16_perm_low_mem_mask1(<8 x i16>* %vp, <8 x i16> ; GENERIC: # BB#0: ; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqw %xmm2, %xmm1, %k1 -; GENERIC-NEXT: vpshuflw {{.*#+}} xmm0 {%k1} = mem[1,3,3,2,4,5,6,7] +; GENERIC-NEXT: vpshuflw {{.*#+}} xmm0 {%k1} = mem[1,3,3,2,4,5,6,7] sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_8xi16_perm_low_mem_mask1: @@ -6569,7 +6569,7 @@ define <8 x i16> @test_masked_z_8xi16_perm_low_mem_mask1(<8 x i16>* %vp, <8 x i1 ; GENERIC: # BB#0: ; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqw %xmm1, %xmm0, %k1 -; GENERIC-NEXT: vpshuflw {{.*#+}} xmm0 {%k1} {z} = mem[1,3,3,2,4,5,6,7] +; GENERIC-NEXT: vpshuflw {{.*#+}} xmm0 {%k1} {z} = mem[1,3,3,2,4,5,6,7] sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_8xi16_perm_low_mem_mask1: @@ -6590,7 +6590,7 @@ define <8 x i16> @test_masked_8xi16_perm_high_mem_mask2(<8 x i16>* %vp, <8 x i16 ; GENERIC: # BB#0: ; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqw %xmm2, %xmm1, %k1 -; GENERIC-NEXT: vpshufhw {{.*#+}} xmm0 {%k1} = mem[0,1,2,3,6,6,5,7] +; GENERIC-NEXT: vpshufhw {{.*#+}} xmm0 {%k1} = mem[0,1,2,3,6,6,5,7] sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_8xi16_perm_high_mem_mask2: @@ -6611,7 +6611,7 @@ define <8 x i16> @test_masked_z_8xi16_perm_high_mem_mask2(<8 x i16>* %vp, <8 x i ; GENERIC: # BB#0: ; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqw %xmm1, %xmm0, %k1 -; GENERIC-NEXT: vpshufhw {{.*#+}} xmm0 {%k1} {z} = mem[0,1,2,3,6,6,5,7] +; GENERIC-NEXT: vpshufhw {{.*#+}} xmm0 {%k1} {z} = mem[0,1,2,3,6,6,5,7] sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_8xi16_perm_high_mem_mask2: @@ -6646,7 +6646,7 @@ define <8 x i16> @test_masked_8xi16_perm_low_mem_mask3(<8 x i16>* %vp, <8 x i16> ; GENERIC: # BB#0: ; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqw %xmm2, %xmm1, %k1 -; GENERIC-NEXT: vpshuflw {{.*#+}} xmm0 {%k1} = mem[3,1,2,0,4,5,6,7] +; GENERIC-NEXT: vpshuflw {{.*#+}} xmm0 {%k1} = mem[3,1,2,0,4,5,6,7] sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_8xi16_perm_low_mem_mask3: @@ -6667,7 +6667,7 @@ define <8 x i16> @test_masked_z_8xi16_perm_low_mem_mask3(<8 x i16>* %vp, <8 x i1 ; GENERIC: # BB#0: ; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqw %xmm1, %xmm0, %k1 -; GENERIC-NEXT: vpshuflw {{.*#+}} xmm0 {%k1} {z} = mem[3,1,2,0,4,5,6,7] +; GENERIC-NEXT: vpshuflw {{.*#+}} xmm0 {%k1} {z} = mem[3,1,2,0,4,5,6,7] sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_8xi16_perm_low_mem_mask3: @@ -6688,7 +6688,7 @@ define <8 x i16> @test_masked_8xi16_perm_high_mem_mask4(<8 x i16>* %vp, <8 x i16 ; GENERIC: # BB#0: ; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqw %xmm2, %xmm1, %k1 -; GENERIC-NEXT: vpshufhw {{.*#+}} xmm0 {%k1} = mem[0,1,2,3,7,6,7,5] +; GENERIC-NEXT: vpshufhw {{.*#+}} xmm0 {%k1} = mem[0,1,2,3,7,6,7,5] sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_8xi16_perm_high_mem_mask4: @@ -6709,7 +6709,7 @@ define <8 x i16> @test_masked_z_8xi16_perm_high_mem_mask4(<8 x i16>* %vp, <8 x i ; GENERIC: # BB#0: ; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqw %xmm1, %xmm0, %k1 -; GENERIC-NEXT: vpshufhw {{.*#+}} xmm0 {%k1} {z} = mem[0,1,2,3,7,6,7,5] +; GENERIC-NEXT: vpshufhw {{.*#+}} xmm0 {%k1} {z} = mem[0,1,2,3,7,6,7,5] sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_8xi16_perm_high_mem_mask4: @@ -6730,7 +6730,7 @@ define <8 x i16> @test_masked_8xi16_perm_low_mem_mask5(<8 x i16>* %vp, <8 x i16> ; GENERIC: # BB#0: ; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqw %xmm2, %xmm1, %k1 -; GENERIC-NEXT: vpshuflw {{.*#+}} xmm0 {%k1} = mem[2,1,3,2,4,5,6,7] +; GENERIC-NEXT: vpshuflw {{.*#+}} xmm0 {%k1} = mem[2,1,3,2,4,5,6,7] sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_8xi16_perm_low_mem_mask5: @@ -6751,7 +6751,7 @@ define <8 x i16> @test_masked_z_8xi16_perm_low_mem_mask5(<8 x i16>* %vp, <8 x i1 ; GENERIC: # BB#0: ; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqw %xmm1, %xmm0, %k1 -; GENERIC-NEXT: vpshuflw {{.*#+}} xmm0 {%k1} {z} = mem[2,1,3,2,4,5,6,7] +; GENERIC-NEXT: vpshuflw {{.*#+}} xmm0 {%k1} {z} = mem[2,1,3,2,4,5,6,7] sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_8xi16_perm_low_mem_mask5: @@ -6786,7 +6786,7 @@ define <8 x i16> @test_masked_8xi16_perm_high_mem_mask6(<8 x i16>* %vp, <8 x i16 ; GENERIC: # BB#0: ; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqw %xmm2, %xmm1, %k1 -; GENERIC-NEXT: vpshufhw {{.*#+}} xmm0 {%k1} = mem[0,1,2,3,7,4,4,4] +; GENERIC-NEXT: vpshufhw {{.*#+}} xmm0 {%k1} = mem[0,1,2,3,7,4,4,4] sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_8xi16_perm_high_mem_mask6: @@ -6807,7 +6807,7 @@ define <8 x i16> @test_masked_z_8xi16_perm_high_mem_mask6(<8 x i16>* %vp, <8 x i ; GENERIC: # BB#0: ; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqw %xmm1, %xmm0, %k1 -; GENERIC-NEXT: vpshufhw {{.*#+}} xmm0 {%k1} {z} = mem[0,1,2,3,7,4,4,4] +; GENERIC-NEXT: vpshufhw {{.*#+}} xmm0 {%k1} {z} = mem[0,1,2,3,7,4,4,4] sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_8xi16_perm_high_mem_mask6: @@ -6828,7 +6828,7 @@ define <8 x i16> @test_masked_8xi16_perm_low_mem_mask7(<8 x i16>* %vp, <8 x i16> ; GENERIC: # BB#0: ; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqw %xmm2, %xmm1, %k1 -; GENERIC-NEXT: vpshuflw {{.*#+}} xmm0 {%k1} = mem[0,3,3,1,4,5,6,7] +; GENERIC-NEXT: vpshuflw {{.*#+}} xmm0 {%k1} = mem[0,3,3,1,4,5,6,7] sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_8xi16_perm_low_mem_mask7: @@ -6849,7 +6849,7 @@ define <8 x i16> @test_masked_z_8xi16_perm_low_mem_mask7(<8 x i16>* %vp, <8 x i1 ; GENERIC: # BB#0: ; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqw %xmm1, %xmm0, %k1 -; GENERIC-NEXT: vpshuflw {{.*#+}} xmm0 {%k1} {z} = mem[0,3,3,1,4,5,6,7] +; GENERIC-NEXT: vpshuflw {{.*#+}} xmm0 {%k1} {z} = mem[0,3,3,1,4,5,6,7] sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_8xi16_perm_low_mem_mask7: @@ -6883,7 +6883,7 @@ define <16 x i16> @test_masked_16xi16_perm_high_mask0(<16 x i16> %vec, <16 x i16 ; GENERIC: # BB#0: ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqw %ymm3, %ymm2, %k1 -; GENERIC-NEXT: vpshufhw {{.*#+}} ymm1 {%k1} = ymm0[0,1,2,3,4,4,6,4,8,9,10,11,12,12,14,12] +; GENERIC-NEXT: vpshufhw {{.*#+}} ymm1 {%k1} = ymm0[0,1,2,3,4,4,6,4,8,9,10,11,12,12,14,12] sched: [1:1.00] ; GENERIC-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.50] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -6891,7 +6891,7 @@ define <16 x i16> @test_masked_16xi16_perm_high_mask0(<16 x i16> %vec, <16 x i16 ; SKX: # BB#0: ; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] ; SKX-NEXT: vpcmpeqw %ymm3, %ymm2, %k1 # sched: [3:1.00] -; SKX-NEXT: vpshufhw {{.*#+}} ymm1 {%k1} = ymm0[0,1,2,3,4,4,6,4,8,9,10,11,12,12,14,12] +; SKX-NEXT: vpshufhw {{.*#+}} ymm1 {%k1} = ymm0[0,1,2,3,4,4,6,4,8,9,10,11,12,12,14,12] sched: [1:1.00] ; SKX-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.25] ; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> @@ -6905,14 +6905,14 @@ define <16 x i16> @test_masked_z_16xi16_perm_high_mask0(<16 x i16> %vec, <16 x i ; GENERIC: # BB#0: ; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqw %ymm2, %ymm1, %k1 -; GENERIC-NEXT: vpshufhw {{.*#+}} ymm0 {%k1} {z} = ymm0[0,1,2,3,4,4,6,4,8,9,10,11,12,12,14,12] +; GENERIC-NEXT: vpshufhw {{.*#+}} ymm0 {%k1} {z} = ymm0[0,1,2,3,4,4,6,4,8,9,10,11,12,12,14,12] sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_16xi16_perm_high_mask0: ; SKX: # BB#0: ; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] ; SKX-NEXT: vpcmpeqw %ymm2, %ymm1, %k1 # sched: [3:1.00] -; SKX-NEXT: vpshufhw {{.*#+}} ymm0 {%k1} {z} = ymm0[0,1,2,3,4,4,6,4,8,9,10,11,12,12,14,12] +; SKX-NEXT: vpshufhw {{.*#+}} ymm0 {%k1} {z} = ymm0[0,1,2,3,4,4,6,4,8,9,10,11,12,12,14,12] sched: [1:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> %cmp = icmp eq <16 x i16> %mask, zeroinitializer @@ -6924,7 +6924,7 @@ define <16 x i16> @test_masked_16xi16_perm_low_mask1(<16 x i16> %vec, <16 x i16> ; GENERIC: # BB#0: ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqw %ymm3, %ymm2, %k1 -; GENERIC-NEXT: vpshuflw {{.*#+}} ymm1 {%k1} = ymm0[0,2,3,2,4,5,6,7,8,10,11,10,12,13,14,15] +; GENERIC-NEXT: vpshuflw {{.*#+}} ymm1 {%k1} = ymm0[0,2,3,2,4,5,6,7,8,10,11,10,12,13,14,15] sched: [1:1.00] ; GENERIC-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.50] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -6932,7 +6932,7 @@ define <16 x i16> @test_masked_16xi16_perm_low_mask1(<16 x i16> %vec, <16 x i16> ; SKX: # BB#0: ; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] ; SKX-NEXT: vpcmpeqw %ymm3, %ymm2, %k1 # sched: [3:1.00] -; SKX-NEXT: vpshuflw {{.*#+}} ymm1 {%k1} = ymm0[0,2,3,2,4,5,6,7,8,10,11,10,12,13,14,15] +; SKX-NEXT: vpshuflw {{.*#+}} ymm1 {%k1} = ymm0[0,2,3,2,4,5,6,7,8,10,11,10,12,13,14,15] sched: [1:1.00] ; SKX-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.25] ; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> @@ -6946,14 +6946,14 @@ define <16 x i16> @test_masked_z_16xi16_perm_low_mask1(<16 x i16> %vec, <16 x i1 ; GENERIC: # BB#0: ; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqw %ymm2, %ymm1, %k1 -; GENERIC-NEXT: vpshuflw {{.*#+}} ymm0 {%k1} {z} = ymm0[0,2,3,2,4,5,6,7,8,10,11,10,12,13,14,15] +; GENERIC-NEXT: vpshuflw {{.*#+}} ymm0 {%k1} {z} = ymm0[0,2,3,2,4,5,6,7,8,10,11,10,12,13,14,15] sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_16xi16_perm_low_mask1: ; SKX: # BB#0: ; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] ; SKX-NEXT: vpcmpeqw %ymm2, %ymm1, %k1 # sched: [3:1.00] -; SKX-NEXT: vpshuflw {{.*#+}} ymm0 {%k1} {z} = ymm0[0,2,3,2,4,5,6,7,8,10,11,10,12,13,14,15] +; SKX-NEXT: vpshuflw {{.*#+}} ymm0 {%k1} {z} = ymm0[0,2,3,2,4,5,6,7,8,10,11,10,12,13,14,15] sched: [1:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> %cmp = icmp eq <16 x i16> %mask, zeroinitializer @@ -6965,7 +6965,7 @@ define <16 x i16> @test_masked_16xi16_perm_high_mask2(<16 x i16> %vec, <16 x i16 ; GENERIC: # BB#0: ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqw %ymm3, %ymm2, %k1 -; GENERIC-NEXT: vpshufhw {{.*#+}} ymm1 {%k1} = ymm0[0,1,2,3,7,5,5,5,8,9,10,11,15,13,13,13] +; GENERIC-NEXT: vpshufhw {{.*#+}} ymm1 {%k1} = ymm0[0,1,2,3,7,5,5,5,8,9,10,11,15,13,13,13] sched: [1:1.00] ; GENERIC-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.50] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -6973,7 +6973,7 @@ define <16 x i16> @test_masked_16xi16_perm_high_mask2(<16 x i16> %vec, <16 x i16 ; SKX: # BB#0: ; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] ; SKX-NEXT: vpcmpeqw %ymm3, %ymm2, %k1 # sched: [3:1.00] -; SKX-NEXT: vpshufhw {{.*#+}} ymm1 {%k1} = ymm0[0,1,2,3,7,5,5,5,8,9,10,11,15,13,13,13] +; SKX-NEXT: vpshufhw {{.*#+}} ymm1 {%k1} = ymm0[0,1,2,3,7,5,5,5,8,9,10,11,15,13,13,13] sched: [1:1.00] ; SKX-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.25] ; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> @@ -6987,14 +6987,14 @@ define <16 x i16> @test_masked_z_16xi16_perm_high_mask2(<16 x i16> %vec, <16 x i ; GENERIC: # BB#0: ; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqw %ymm2, %ymm1, %k1 -; GENERIC-NEXT: vpshufhw {{.*#+}} ymm0 {%k1} {z} = ymm0[0,1,2,3,7,5,5,5,8,9,10,11,15,13,13,13] +; GENERIC-NEXT: vpshufhw {{.*#+}} ymm0 {%k1} {z} = ymm0[0,1,2,3,7,5,5,5,8,9,10,11,15,13,13,13] sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_16xi16_perm_high_mask2: ; SKX: # BB#0: ; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] ; SKX-NEXT: vpcmpeqw %ymm2, %ymm1, %k1 # sched: [3:1.00] -; SKX-NEXT: vpshufhw {{.*#+}} ymm0 {%k1} {z} = ymm0[0,1,2,3,7,5,5,5,8,9,10,11,15,13,13,13] +; SKX-NEXT: vpshufhw {{.*#+}} ymm0 {%k1} {z} = ymm0[0,1,2,3,7,5,5,5,8,9,10,11,15,13,13,13] sched: [1:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> %cmp = icmp eq <16 x i16> %mask, zeroinitializer @@ -7019,7 +7019,7 @@ define <16 x i16> @test_masked_16xi16_perm_low_mask3(<16 x i16> %vec, <16 x i16> ; GENERIC: # BB#0: ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqw %ymm3, %ymm2, %k1 -; GENERIC-NEXT: vpshuflw {{.*#+}} ymm1 {%k1} = ymm0[3,2,3,2,4,5,6,7,11,10,11,10,12,13,14,15] +; GENERIC-NEXT: vpshuflw {{.*#+}} ymm1 {%k1} = ymm0[3,2,3,2,4,5,6,7,11,10,11,10,12,13,14,15] sched: [1:1.00] ; GENERIC-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.50] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -7027,7 +7027,7 @@ define <16 x i16> @test_masked_16xi16_perm_low_mask3(<16 x i16> %vec, <16 x i16> ; SKX: # BB#0: ; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] ; SKX-NEXT: vpcmpeqw %ymm3, %ymm2, %k1 # sched: [3:1.00] -; SKX-NEXT: vpshuflw {{.*#+}} ymm1 {%k1} = ymm0[3,2,3,2,4,5,6,7,11,10,11,10,12,13,14,15] +; SKX-NEXT: vpshuflw {{.*#+}} ymm1 {%k1} = ymm0[3,2,3,2,4,5,6,7,11,10,11,10,12,13,14,15] sched: [1:1.00] ; SKX-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.25] ; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> @@ -7041,14 +7041,14 @@ define <16 x i16> @test_masked_z_16xi16_perm_low_mask3(<16 x i16> %vec, <16 x i1 ; GENERIC: # BB#0: ; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqw %ymm2, %ymm1, %k1 -; GENERIC-NEXT: vpshuflw {{.*#+}} ymm0 {%k1} {z} = ymm0[3,2,3,2,4,5,6,7,11,10,11,10,12,13,14,15] +; GENERIC-NEXT: vpshuflw {{.*#+}} ymm0 {%k1} {z} = ymm0[3,2,3,2,4,5,6,7,11,10,11,10,12,13,14,15] sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_16xi16_perm_low_mask3: ; SKX: # BB#0: ; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] ; SKX-NEXT: vpcmpeqw %ymm2, %ymm1, %k1 # sched: [3:1.00] -; SKX-NEXT: vpshuflw {{.*#+}} ymm0 {%k1} {z} = ymm0[3,2,3,2,4,5,6,7,11,10,11,10,12,13,14,15] +; SKX-NEXT: vpshuflw {{.*#+}} ymm0 {%k1} {z} = ymm0[3,2,3,2,4,5,6,7,11,10,11,10,12,13,14,15] sched: [1:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> %cmp = icmp eq <16 x i16> %mask, zeroinitializer @@ -7060,7 +7060,7 @@ define <16 x i16> @test_masked_16xi16_perm_high_mask4(<16 x i16> %vec, <16 x i16 ; GENERIC: # BB#0: ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqw %ymm3, %ymm2, %k1 -; GENERIC-NEXT: vpshufhw {{.*#+}} ymm1 {%k1} = ymm0[0,1,2,3,6,7,4,7,8,9,10,11,14,15,12,15] +; GENERIC-NEXT: vpshufhw {{.*#+}} ymm1 {%k1} = ymm0[0,1,2,3,6,7,4,7,8,9,10,11,14,15,12,15] sched: [1:1.00] ; GENERIC-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.50] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -7068,7 +7068,7 @@ define <16 x i16> @test_masked_16xi16_perm_high_mask4(<16 x i16> %vec, <16 x i16 ; SKX: # BB#0: ; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] ; SKX-NEXT: vpcmpeqw %ymm3, %ymm2, %k1 # sched: [3:1.00] -; SKX-NEXT: vpshufhw {{.*#+}} ymm1 {%k1} = ymm0[0,1,2,3,6,7,4,7,8,9,10,11,14,15,12,15] +; SKX-NEXT: vpshufhw {{.*#+}} ymm1 {%k1} = ymm0[0,1,2,3,6,7,4,7,8,9,10,11,14,15,12,15] sched: [1:1.00] ; SKX-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.25] ; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> @@ -7082,14 +7082,14 @@ define <16 x i16> @test_masked_z_16xi16_perm_high_mask4(<16 x i16> %vec, <16 x i ; GENERIC: # BB#0: ; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqw %ymm2, %ymm1, %k1 -; GENERIC-NEXT: vpshufhw {{.*#+}} ymm0 {%k1} {z} = ymm0[0,1,2,3,6,7,4,7,8,9,10,11,14,15,12,15] +; GENERIC-NEXT: vpshufhw {{.*#+}} ymm0 {%k1} {z} = ymm0[0,1,2,3,6,7,4,7,8,9,10,11,14,15,12,15] sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_16xi16_perm_high_mask4: ; SKX: # BB#0: ; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] ; SKX-NEXT: vpcmpeqw %ymm2, %ymm1, %k1 # sched: [3:1.00] -; SKX-NEXT: vpshufhw {{.*#+}} ymm0 {%k1} {z} = ymm0[0,1,2,3,6,7,4,7,8,9,10,11,14,15,12,15] +; SKX-NEXT: vpshufhw {{.*#+}} ymm0 {%k1} {z} = ymm0[0,1,2,3,6,7,4,7,8,9,10,11,14,15,12,15] sched: [1:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> %cmp = icmp eq <16 x i16> %mask, zeroinitializer @@ -7101,7 +7101,7 @@ define <16 x i16> @test_masked_16xi16_perm_low_mask5(<16 x i16> %vec, <16 x i16> ; GENERIC: # BB#0: ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqw %ymm3, %ymm2, %k1 -; GENERIC-NEXT: vpshuflw {{.*#+}} ymm1 {%k1} = ymm0[3,3,3,0,4,5,6,7,11,11,11,8,12,13,14,15] +; GENERIC-NEXT: vpshuflw {{.*#+}} ymm1 {%k1} = ymm0[3,3,3,0,4,5,6,7,11,11,11,8,12,13,14,15] sched: [1:1.00] ; GENERIC-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.50] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -7109,7 +7109,7 @@ define <16 x i16> @test_masked_16xi16_perm_low_mask5(<16 x i16> %vec, <16 x i16> ; SKX: # BB#0: ; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] ; SKX-NEXT: vpcmpeqw %ymm3, %ymm2, %k1 # sched: [3:1.00] -; SKX-NEXT: vpshuflw {{.*#+}} ymm1 {%k1} = ymm0[3,3,3,0,4,5,6,7,11,11,11,8,12,13,14,15] +; SKX-NEXT: vpshuflw {{.*#+}} ymm1 {%k1} = ymm0[3,3,3,0,4,5,6,7,11,11,11,8,12,13,14,15] sched: [1:1.00] ; SKX-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.25] ; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> @@ -7123,14 +7123,14 @@ define <16 x i16> @test_masked_z_16xi16_perm_low_mask5(<16 x i16> %vec, <16 x i1 ; GENERIC: # BB#0: ; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqw %ymm2, %ymm1, %k1 -; GENERIC-NEXT: vpshuflw {{.*#+}} ymm0 {%k1} {z} = ymm0[3,3,3,0,4,5,6,7,11,11,11,8,12,13,14,15] +; GENERIC-NEXT: vpshuflw {{.*#+}} ymm0 {%k1} {z} = ymm0[3,3,3,0,4,5,6,7,11,11,11,8,12,13,14,15] sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_16xi16_perm_low_mask5: ; SKX: # BB#0: ; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] ; SKX-NEXT: vpcmpeqw %ymm2, %ymm1, %k1 # sched: [3:1.00] -; SKX-NEXT: vpshuflw {{.*#+}} ymm0 {%k1} {z} = ymm0[3,3,3,0,4,5,6,7,11,11,11,8,12,13,14,15] +; SKX-NEXT: vpshuflw {{.*#+}} ymm0 {%k1} {z} = ymm0[3,3,3,0,4,5,6,7,11,11,11,8,12,13,14,15] sched: [1:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> %cmp = icmp eq <16 x i16> %mask, zeroinitializer @@ -7155,7 +7155,7 @@ define <16 x i16> @test_masked_16xi16_perm_high_mask6(<16 x i16> %vec, <16 x i16 ; GENERIC: # BB#0: ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqw %ymm3, %ymm2, %k1 -; GENERIC-NEXT: vpshufhw {{.*#+}} ymm1 {%k1} = ymm0[0,1,2,3,6,7,6,5,8,9,10,11,14,15,14,13] +; GENERIC-NEXT: vpshufhw {{.*#+}} ymm1 {%k1} = ymm0[0,1,2,3,6,7,6,5,8,9,10,11,14,15,14,13] sched: [1:1.00] ; GENERIC-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.50] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -7163,7 +7163,7 @@ define <16 x i16> @test_masked_16xi16_perm_high_mask6(<16 x i16> %vec, <16 x i16 ; SKX: # BB#0: ; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] ; SKX-NEXT: vpcmpeqw %ymm3, %ymm2, %k1 # sched: [3:1.00] -; SKX-NEXT: vpshufhw {{.*#+}} ymm1 {%k1} = ymm0[0,1,2,3,6,7,6,5,8,9,10,11,14,15,14,13] +; SKX-NEXT: vpshufhw {{.*#+}} ymm1 {%k1} = ymm0[0,1,2,3,6,7,6,5,8,9,10,11,14,15,14,13] sched: [1:1.00] ; SKX-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.25] ; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> @@ -7177,14 +7177,14 @@ define <16 x i16> @test_masked_z_16xi16_perm_high_mask6(<16 x i16> %vec, <16 x i ; GENERIC: # BB#0: ; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqw %ymm2, %ymm1, %k1 -; GENERIC-NEXT: vpshufhw {{.*#+}} ymm0 {%k1} {z} = ymm0[0,1,2,3,6,7,6,5,8,9,10,11,14,15,14,13] +; GENERIC-NEXT: vpshufhw {{.*#+}} ymm0 {%k1} {z} = ymm0[0,1,2,3,6,7,6,5,8,9,10,11,14,15,14,13] sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_16xi16_perm_high_mask6: ; SKX: # BB#0: ; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] ; SKX-NEXT: vpcmpeqw %ymm2, %ymm1, %k1 # sched: [3:1.00] -; SKX-NEXT: vpshufhw {{.*#+}} ymm0 {%k1} {z} = ymm0[0,1,2,3,6,7,6,5,8,9,10,11,14,15,14,13] +; SKX-NEXT: vpshufhw {{.*#+}} ymm0 {%k1} {z} = ymm0[0,1,2,3,6,7,6,5,8,9,10,11,14,15,14,13] sched: [1:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> %cmp = icmp eq <16 x i16> %mask, zeroinitializer @@ -7196,7 +7196,7 @@ define <16 x i16> @test_masked_16xi16_perm_low_mask7(<16 x i16> %vec, <16 x i16> ; GENERIC: # BB#0: ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqw %ymm3, %ymm2, %k1 -; GENERIC-NEXT: vpshuflw {{.*#+}} ymm1 {%k1} = ymm0[3,2,1,2,4,5,6,7,11,10,9,10,12,13,14,15] +; GENERIC-NEXT: vpshuflw {{.*#+}} ymm1 {%k1} = ymm0[3,2,1,2,4,5,6,7,11,10,9,10,12,13,14,15] sched: [1:1.00] ; GENERIC-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.50] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -7204,7 +7204,7 @@ define <16 x i16> @test_masked_16xi16_perm_low_mask7(<16 x i16> %vec, <16 x i16> ; SKX: # BB#0: ; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] ; SKX-NEXT: vpcmpeqw %ymm3, %ymm2, %k1 # sched: [3:1.00] -; SKX-NEXT: vpshuflw {{.*#+}} ymm1 {%k1} = ymm0[3,2,1,2,4,5,6,7,11,10,9,10,12,13,14,15] +; SKX-NEXT: vpshuflw {{.*#+}} ymm1 {%k1} = ymm0[3,2,1,2,4,5,6,7,11,10,9,10,12,13,14,15] sched: [1:1.00] ; SKX-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.25] ; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> @@ -7218,14 +7218,14 @@ define <16 x i16> @test_masked_z_16xi16_perm_low_mask7(<16 x i16> %vec, <16 x i1 ; GENERIC: # BB#0: ; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqw %ymm2, %ymm1, %k1 -; GENERIC-NEXT: vpshuflw {{.*#+}} ymm0 {%k1} {z} = ymm0[3,2,1,2,4,5,6,7,11,10,9,10,12,13,14,15] +; GENERIC-NEXT: vpshuflw {{.*#+}} ymm0 {%k1} {z} = ymm0[3,2,1,2,4,5,6,7,11,10,9,10,12,13,14,15] sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_16xi16_perm_low_mask7: ; SKX: # BB#0: ; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] ; SKX-NEXT: vpcmpeqw %ymm2, %ymm1, %k1 # sched: [3:1.00] -; SKX-NEXT: vpshuflw {{.*#+}} ymm0 {%k1} {z} = ymm0[3,2,1,2,4,5,6,7,11,10,9,10,12,13,14,15] +; SKX-NEXT: vpshuflw {{.*#+}} ymm0 {%k1} {z} = ymm0[3,2,1,2,4,5,6,7,11,10,9,10,12,13,14,15] sched: [1:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> %cmp = icmp eq <16 x i16> %mask, zeroinitializer @@ -7251,14 +7251,14 @@ define <16 x i16> @test_masked_16xi16_perm_high_mem_mask0(<16 x i16>* %vp, <16 x ; GENERIC: # BB#0: ; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqw %ymm2, %ymm1, %k1 -; GENERIC-NEXT: vpshufhw {{.*#+}} ymm0 {%k1} = mem[0,1,2,3,5,6,4,7,8,9,10,11,13,14,12,15] +; GENERIC-NEXT: vpshufhw {{.*#+}} ymm0 {%k1} = mem[0,1,2,3,5,6,4,7,8,9,10,11,13,14,12,15] sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_16xi16_perm_high_mem_mask0: ; SKX: # BB#0: ; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] ; SKX-NEXT: vpcmpeqw %ymm2, %ymm1, %k1 # sched: [3:1.00] -; SKX-NEXT: vpshufhw {{.*#+}} ymm0 {%k1} = mem[0,1,2,3,5,6,4,7,8,9,10,11,13,14,12,15] +; SKX-NEXT: vpshufhw {{.*#+}} ymm0 {%k1} = mem[0,1,2,3,5,6,4,7,8,9,10,11,13,14,12,15] sched: [6:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %vec = load <16 x i16>, <16 x i16>* %vp %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> @@ -7272,14 +7272,14 @@ define <16 x i16> @test_masked_z_16xi16_perm_high_mem_mask0(<16 x i16>* %vp, <16 ; GENERIC: # BB#0: ; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqw %ymm1, %ymm0, %k1 -; GENERIC-NEXT: vpshufhw {{.*#+}} ymm0 {%k1} {z} = mem[0,1,2,3,5,6,4,7,8,9,10,11,13,14,12,15] +; GENERIC-NEXT: vpshufhw {{.*#+}} ymm0 {%k1} {z} = mem[0,1,2,3,5,6,4,7,8,9,10,11,13,14,12,15] sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_16xi16_perm_high_mem_mask0: ; SKX: # BB#0: ; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] ; SKX-NEXT: vpcmpeqw %ymm1, %ymm0, %k1 # sched: [3:1.00] -; SKX-NEXT: vpshufhw {{.*#+}} ymm0 {%k1} {z} = mem[0,1,2,3,5,6,4,7,8,9,10,11,13,14,12,15] +; SKX-NEXT: vpshufhw {{.*#+}} ymm0 {%k1} {z} = mem[0,1,2,3,5,6,4,7,8,9,10,11,13,14,12,15] sched: [6:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %vec = load <16 x i16>, <16 x i16>* %vp %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> @@ -7293,14 +7293,14 @@ define <16 x i16> @test_masked_16xi16_perm_low_mem_mask1(<16 x i16>* %vp, <16 x ; GENERIC: # BB#0: ; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqw %ymm2, %ymm1, %k1 -; GENERIC-NEXT: vpshuflw {{.*#+}} ymm0 {%k1} = mem[1,3,3,0,4,5,6,7,9,11,11,8,12,13,14,15] +; GENERIC-NEXT: vpshuflw {{.*#+}} ymm0 {%k1} = mem[1,3,3,0,4,5,6,7,9,11,11,8,12,13,14,15] sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_16xi16_perm_low_mem_mask1: ; SKX: # BB#0: ; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] ; SKX-NEXT: vpcmpeqw %ymm2, %ymm1, %k1 # sched: [3:1.00] -; SKX-NEXT: vpshuflw {{.*#+}} ymm0 {%k1} = mem[1,3,3,0,4,5,6,7,9,11,11,8,12,13,14,15] +; SKX-NEXT: vpshuflw {{.*#+}} ymm0 {%k1} = mem[1,3,3,0,4,5,6,7,9,11,11,8,12,13,14,15] sched: [6:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %vec = load <16 x i16>, <16 x i16>* %vp %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> @@ -7314,14 +7314,14 @@ define <16 x i16> @test_masked_z_16xi16_perm_low_mem_mask1(<16 x i16>* %vp, <16 ; GENERIC: # BB#0: ; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqw %ymm1, %ymm0, %k1 -; GENERIC-NEXT: vpshuflw {{.*#+}} ymm0 {%k1} {z} = mem[1,3,3,0,4,5,6,7,9,11,11,8,12,13,14,15] +; GENERIC-NEXT: vpshuflw {{.*#+}} ymm0 {%k1} {z} = mem[1,3,3,0,4,5,6,7,9,11,11,8,12,13,14,15] sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_16xi16_perm_low_mem_mask1: ; SKX: # BB#0: ; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] ; SKX-NEXT: vpcmpeqw %ymm1, %ymm0, %k1 # sched: [3:1.00] -; SKX-NEXT: vpshuflw {{.*#+}} ymm0 {%k1} {z} = mem[1,3,3,0,4,5,6,7,9,11,11,8,12,13,14,15] +; SKX-NEXT: vpshuflw {{.*#+}} ymm0 {%k1} {z} = mem[1,3,3,0,4,5,6,7,9,11,11,8,12,13,14,15] sched: [6:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %vec = load <16 x i16>, <16 x i16>* %vp %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> @@ -7335,14 +7335,14 @@ define <16 x i16> @test_masked_16xi16_perm_high_mem_mask2(<16 x i16>* %vp, <16 x ; GENERIC: # BB#0: ; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqw %ymm2, %ymm1, %k1 -; GENERIC-NEXT: vpshufhw {{.*#+}} ymm0 {%k1} = mem[0,1,2,3,5,6,5,6,8,9,10,11,13,14,13,14] +; GENERIC-NEXT: vpshufhw {{.*#+}} ymm0 {%k1} = mem[0,1,2,3,5,6,5,6,8,9,10,11,13,14,13,14] sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_16xi16_perm_high_mem_mask2: ; SKX: # BB#0: ; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] ; SKX-NEXT: vpcmpeqw %ymm2, %ymm1, %k1 # sched: [3:1.00] -; SKX-NEXT: vpshufhw {{.*#+}} ymm0 {%k1} = mem[0,1,2,3,5,6,5,6,8,9,10,11,13,14,13,14] +; SKX-NEXT: vpshufhw {{.*#+}} ymm0 {%k1} = mem[0,1,2,3,5,6,5,6,8,9,10,11,13,14,13,14] sched: [6:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %vec = load <16 x i16>, <16 x i16>* %vp %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> @@ -7356,14 +7356,14 @@ define <16 x i16> @test_masked_z_16xi16_perm_high_mem_mask2(<16 x i16>* %vp, <16 ; GENERIC: # BB#0: ; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqw %ymm1, %ymm0, %k1 -; GENERIC-NEXT: vpshufhw {{.*#+}} ymm0 {%k1} {z} = mem[0,1,2,3,5,6,5,6,8,9,10,11,13,14,13,14] +; GENERIC-NEXT: vpshufhw {{.*#+}} ymm0 {%k1} {z} = mem[0,1,2,3,5,6,5,6,8,9,10,11,13,14,13,14] sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_16xi16_perm_high_mem_mask2: ; SKX: # BB#0: ; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] ; SKX-NEXT: vpcmpeqw %ymm1, %ymm0, %k1 # sched: [3:1.00] -; SKX-NEXT: vpshufhw {{.*#+}} ymm0 {%k1} {z} = mem[0,1,2,3,5,6,5,6,8,9,10,11,13,14,13,14] +; SKX-NEXT: vpshufhw {{.*#+}} ymm0 {%k1} {z} = mem[0,1,2,3,5,6,5,6,8,9,10,11,13,14,13,14] sched: [6:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %vec = load <16 x i16>, <16 x i16>* %vp %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> @@ -7391,14 +7391,14 @@ define <16 x i16> @test_masked_16xi16_perm_low_mem_mask3(<16 x i16>* %vp, <16 x ; GENERIC: # BB#0: ; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqw %ymm2, %ymm1, %k1 -; GENERIC-NEXT: vpshuflw {{.*#+}} ymm0 {%k1} = mem[3,2,3,0,4,5,6,7,11,10,11,8,12,13,14,15] +; GENERIC-NEXT: vpshuflw {{.*#+}} ymm0 {%k1} = mem[3,2,3,0,4,5,6,7,11,10,11,8,12,13,14,15] sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_16xi16_perm_low_mem_mask3: ; SKX: # BB#0: ; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] ; SKX-NEXT: vpcmpeqw %ymm2, %ymm1, %k1 # sched: [3:1.00] -; SKX-NEXT: vpshuflw {{.*#+}} ymm0 {%k1} = mem[3,2,3,0,4,5,6,7,11,10,11,8,12,13,14,15] +; SKX-NEXT: vpshuflw {{.*#+}} ymm0 {%k1} = mem[3,2,3,0,4,5,6,7,11,10,11,8,12,13,14,15] sched: [6:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %vec = load <16 x i16>, <16 x i16>* %vp %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> @@ -7412,14 +7412,14 @@ define <16 x i16> @test_masked_z_16xi16_perm_low_mem_mask3(<16 x i16>* %vp, <16 ; GENERIC: # BB#0: ; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqw %ymm1, %ymm0, %k1 -; GENERIC-NEXT: vpshuflw {{.*#+}} ymm0 {%k1} {z} = mem[3,2,3,0,4,5,6,7,11,10,11,8,12,13,14,15] +; GENERIC-NEXT: vpshuflw {{.*#+}} ymm0 {%k1} {z} = mem[3,2,3,0,4,5,6,7,11,10,11,8,12,13,14,15] sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_16xi16_perm_low_mem_mask3: ; SKX: # BB#0: ; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] ; SKX-NEXT: vpcmpeqw %ymm1, %ymm0, %k1 # sched: [3:1.00] -; SKX-NEXT: vpshuflw {{.*#+}} ymm0 {%k1} {z} = mem[3,2,3,0,4,5,6,7,11,10,11,8,12,13,14,15] +; SKX-NEXT: vpshuflw {{.*#+}} ymm0 {%k1} {z} = mem[3,2,3,0,4,5,6,7,11,10,11,8,12,13,14,15] sched: [6:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %vec = load <16 x i16>, <16 x i16>* %vp %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> @@ -7433,14 +7433,14 @@ define <16 x i16> @test_masked_16xi16_perm_high_mem_mask4(<16 x i16>* %vp, <16 x ; GENERIC: # BB#0: ; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqw %ymm2, %ymm1, %k1 -; GENERIC-NEXT: vpshufhw {{.*#+}} ymm0 {%k1} = mem[0,1,2,3,7,7,6,7,8,9,10,11,15,15,14,15] +; GENERIC-NEXT: vpshufhw {{.*#+}} ymm0 {%k1} = mem[0,1,2,3,7,7,6,7,8,9,10,11,15,15,14,15] sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_16xi16_perm_high_mem_mask4: ; SKX: # BB#0: ; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] ; SKX-NEXT: vpcmpeqw %ymm2, %ymm1, %k1 # sched: [3:1.00] -; SKX-NEXT: vpshufhw {{.*#+}} ymm0 {%k1} = mem[0,1,2,3,7,7,6,7,8,9,10,11,15,15,14,15] +; SKX-NEXT: vpshufhw {{.*#+}} ymm0 {%k1} = mem[0,1,2,3,7,7,6,7,8,9,10,11,15,15,14,15] sched: [6:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %vec = load <16 x i16>, <16 x i16>* %vp %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> @@ -7454,14 +7454,14 @@ define <16 x i16> @test_masked_z_16xi16_perm_high_mem_mask4(<16 x i16>* %vp, <16 ; GENERIC: # BB#0: ; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqw %ymm1, %ymm0, %k1 -; GENERIC-NEXT: vpshufhw {{.*#+}} ymm0 {%k1} {z} = mem[0,1,2,3,7,7,6,7,8,9,10,11,15,15,14,15] +; GENERIC-NEXT: vpshufhw {{.*#+}} ymm0 {%k1} {z} = mem[0,1,2,3,7,7,6,7,8,9,10,11,15,15,14,15] sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_16xi16_perm_high_mem_mask4: ; SKX: # BB#0: ; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] ; SKX-NEXT: vpcmpeqw %ymm1, %ymm0, %k1 # sched: [3:1.00] -; SKX-NEXT: vpshufhw {{.*#+}} ymm0 {%k1} {z} = mem[0,1,2,3,7,7,6,7,8,9,10,11,15,15,14,15] +; SKX-NEXT: vpshufhw {{.*#+}} ymm0 {%k1} {z} = mem[0,1,2,3,7,7,6,7,8,9,10,11,15,15,14,15] sched: [6:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %vec = load <16 x i16>, <16 x i16>* %vp %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> @@ -7475,14 +7475,14 @@ define <16 x i16> @test_masked_16xi16_perm_low_mem_mask5(<16 x i16>* %vp, <16 x ; GENERIC: # BB#0: ; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqw %ymm2, %ymm1, %k1 -; GENERIC-NEXT: vpshuflw {{.*#+}} ymm0 {%k1} = mem[1,3,3,2,4,5,6,7,9,11,11,10,12,13,14,15] +; GENERIC-NEXT: vpshuflw {{.*#+}} ymm0 {%k1} = mem[1,3,3,2,4,5,6,7,9,11,11,10,12,13,14,15] sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_16xi16_perm_low_mem_mask5: ; SKX: # BB#0: ; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] ; SKX-NEXT: vpcmpeqw %ymm2, %ymm1, %k1 # sched: [3:1.00] -; SKX-NEXT: vpshuflw {{.*#+}} ymm0 {%k1} = mem[1,3,3,2,4,5,6,7,9,11,11,10,12,13,14,15] +; SKX-NEXT: vpshuflw {{.*#+}} ymm0 {%k1} = mem[1,3,3,2,4,5,6,7,9,11,11,10,12,13,14,15] sched: [6:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %vec = load <16 x i16>, <16 x i16>* %vp %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> @@ -7496,14 +7496,14 @@ define <16 x i16> @test_masked_z_16xi16_perm_low_mem_mask5(<16 x i16>* %vp, <16 ; GENERIC: # BB#0: ; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqw %ymm1, %ymm0, %k1 -; GENERIC-NEXT: vpshuflw {{.*#+}} ymm0 {%k1} {z} = mem[1,3,3,2,4,5,6,7,9,11,11,10,12,13,14,15] +; GENERIC-NEXT: vpshuflw {{.*#+}} ymm0 {%k1} {z} = mem[1,3,3,2,4,5,6,7,9,11,11,10,12,13,14,15] sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_16xi16_perm_low_mem_mask5: ; SKX: # BB#0: ; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] ; SKX-NEXT: vpcmpeqw %ymm1, %ymm0, %k1 # sched: [3:1.00] -; SKX-NEXT: vpshuflw {{.*#+}} ymm0 {%k1} {z} = mem[1,3,3,2,4,5,6,7,9,11,11,10,12,13,14,15] +; SKX-NEXT: vpshuflw {{.*#+}} ymm0 {%k1} {z} = mem[1,3,3,2,4,5,6,7,9,11,11,10,12,13,14,15] sched: [6:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %vec = load <16 x i16>, <16 x i16>* %vp %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> @@ -7531,14 +7531,14 @@ define <16 x i16> @test_masked_16xi16_perm_high_mem_mask6(<16 x i16>* %vp, <16 x ; GENERIC: # BB#0: ; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqw %ymm2, %ymm1, %k1 -; GENERIC-NEXT: vpshufhw {{.*#+}} ymm0 {%k1} = mem[0,1,2,3,4,4,4,5,8,9,10,11,12,12,12,13] +; GENERIC-NEXT: vpshufhw {{.*#+}} ymm0 {%k1} = mem[0,1,2,3,4,4,4,5,8,9,10,11,12,12,12,13] sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_16xi16_perm_high_mem_mask6: ; SKX: # BB#0: ; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] ; SKX-NEXT: vpcmpeqw %ymm2, %ymm1, %k1 # sched: [3:1.00] -; SKX-NEXT: vpshufhw {{.*#+}} ymm0 {%k1} = mem[0,1,2,3,4,4,4,5,8,9,10,11,12,12,12,13] +; SKX-NEXT: vpshufhw {{.*#+}} ymm0 {%k1} = mem[0,1,2,3,4,4,4,5,8,9,10,11,12,12,12,13] sched: [6:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %vec = load <16 x i16>, <16 x i16>* %vp %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> @@ -7552,14 +7552,14 @@ define <16 x i16> @test_masked_z_16xi16_perm_high_mem_mask6(<16 x i16>* %vp, <16 ; GENERIC: # BB#0: ; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqw %ymm1, %ymm0, %k1 -; GENERIC-NEXT: vpshufhw {{.*#+}} ymm0 {%k1} {z} = mem[0,1,2,3,4,4,4,5,8,9,10,11,12,12,12,13] +; GENERIC-NEXT: vpshufhw {{.*#+}} ymm0 {%k1} {z} = mem[0,1,2,3,4,4,4,5,8,9,10,11,12,12,12,13] sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_16xi16_perm_high_mem_mask6: ; SKX: # BB#0: ; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] ; SKX-NEXT: vpcmpeqw %ymm1, %ymm0, %k1 # sched: [3:1.00] -; SKX-NEXT: vpshufhw {{.*#+}} ymm0 {%k1} {z} = mem[0,1,2,3,4,4,4,5,8,9,10,11,12,12,12,13] +; SKX-NEXT: vpshufhw {{.*#+}} ymm0 {%k1} {z} = mem[0,1,2,3,4,4,4,5,8,9,10,11,12,12,12,13] sched: [6:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %vec = load <16 x i16>, <16 x i16>* %vp %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> @@ -7573,14 +7573,14 @@ define <16 x i16> @test_masked_16xi16_perm_low_mem_mask7(<16 x i16>* %vp, <16 x ; GENERIC: # BB#0: ; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqw %ymm2, %ymm1, %k1 -; GENERIC-NEXT: vpshuflw {{.*#+}} ymm0 {%k1} = mem[3,1,3,2,4,5,6,7,11,9,11,10,12,13,14,15] +; GENERIC-NEXT: vpshuflw {{.*#+}} ymm0 {%k1} = mem[3,1,3,2,4,5,6,7,11,9,11,10,12,13,14,15] sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_16xi16_perm_low_mem_mask7: ; SKX: # BB#0: ; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] ; SKX-NEXT: vpcmpeqw %ymm2, %ymm1, %k1 # sched: [3:1.00] -; SKX-NEXT: vpshuflw {{.*#+}} ymm0 {%k1} = mem[3,1,3,2,4,5,6,7,11,9,11,10,12,13,14,15] +; SKX-NEXT: vpshuflw {{.*#+}} ymm0 {%k1} = mem[3,1,3,2,4,5,6,7,11,9,11,10,12,13,14,15] sched: [6:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %vec = load <16 x i16>, <16 x i16>* %vp %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> @@ -7594,14 +7594,14 @@ define <16 x i16> @test_masked_z_16xi16_perm_low_mem_mask7(<16 x i16>* %vp, <16 ; GENERIC: # BB#0: ; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqw %ymm1, %ymm0, %k1 -; GENERIC-NEXT: vpshuflw {{.*#+}} ymm0 {%k1} {z} = mem[3,1,3,2,4,5,6,7,11,9,11,10,12,13,14,15] +; GENERIC-NEXT: vpshuflw {{.*#+}} ymm0 {%k1} {z} = mem[3,1,3,2,4,5,6,7,11,9,11,10,12,13,14,15] sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_16xi16_perm_low_mem_mask7: ; SKX: # BB#0: ; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] ; SKX-NEXT: vpcmpeqw %ymm1, %ymm0, %k1 # sched: [3:1.00] -; SKX-NEXT: vpshuflw {{.*#+}} ymm0 {%k1} {z} = mem[3,1,3,2,4,5,6,7,11,9,11,10,12,13,14,15] +; SKX-NEXT: vpshuflw {{.*#+}} ymm0 {%k1} {z} = mem[3,1,3,2,4,5,6,7,11,9,11,10,12,13,14,15] sched: [6:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %vec = load <16 x i16>, <16 x i16>* %vp %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> @@ -7613,7 +7613,7 @@ define <16 x i16> @test_masked_z_16xi16_perm_low_mem_mask7(<16 x i16>* %vp, <16 define <32 x i16> @test_32xi16_perm_high_mask0(<32 x i16> %vec) { ; GENERIC-LABEL: test_32xi16_perm_high_mask0: ; GENERIC: # BB#0: -; GENERIC-NEXT: vpshufhw {{.*#+}} zmm0 = zmm0[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12,16,17,18,19,20,21,22,20,24,25,26,27,28,29,30,28] +; GENERIC-NEXT: vpshufhw {{.*#+}} zmm0 = zmm0[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12,16,17,18,19,20,21,22,20,24,25,26,27,28,29,30,28] sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_32xi16_perm_high_mask0: @@ -7628,7 +7628,7 @@ define <32 x i16> @test_masked_32xi16_perm_high_mask0(<32 x i16> %vec, <32 x i16 ; GENERIC: # BB#0: ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqw %zmm3, %zmm2, %k1 -; GENERIC-NEXT: vpshufhw {{.*#+}} zmm1 {%k1} = zmm0[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12,16,17,18,19,20,21,22,20,24,25,26,27,28,29,30,28] +; GENERIC-NEXT: vpshufhw {{.*#+}} zmm1 {%k1} = zmm0[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12,16,17,18,19,20,21,22,20,24,25,26,27,28,29,30,28] sched: [1:1.00] ; GENERIC-NEXT: vmovdqa64 %zmm1, %zmm0 ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -7650,7 +7650,7 @@ define <32 x i16> @test_masked_z_32xi16_perm_high_mask0(<32 x i16> %vec, <32 x i ; GENERIC: # BB#0: ; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqw %zmm2, %zmm1, %k1 -; GENERIC-NEXT: vpshufhw {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12,16,17,18,19,20,21,22,20,24,25,26,27,28,29,30,28] +; GENERIC-NEXT: vpshufhw {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12,16,17,18,19,20,21,22,20,24,25,26,27,28,29,30,28] sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_32xi16_perm_high_mask0: @@ -7669,7 +7669,7 @@ define <32 x i16> @test_masked_32xi16_perm_low_mask1(<32 x i16> %vec, <32 x i16> ; GENERIC: # BB#0: ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqw %zmm3, %zmm2, %k1 -; GENERIC-NEXT: vpshuflw {{.*#+}} zmm1 {%k1} = zmm0[2,1,0,0,4,5,6,7,10,9,8,8,12,13,14,15,18,17,16,16,20,21,22,23,26,25,24,24,28,29,30,31] +; GENERIC-NEXT: vpshuflw {{.*#+}} zmm1 {%k1} = zmm0[2,1,0,0,4,5,6,7,10,9,8,8,12,13,14,15,18,17,16,16,20,21,22,23,26,25,24,24,28,29,30,31] sched: [1:1.00] ; GENERIC-NEXT: vmovdqa64 %zmm1, %zmm0 ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -7691,7 +7691,7 @@ define <32 x i16> @test_masked_z_32xi16_perm_low_mask1(<32 x i16> %vec, <32 x i1 ; GENERIC: # BB#0: ; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqw %zmm2, %zmm1, %k1 -; GENERIC-NEXT: vpshuflw {{.*#+}} zmm0 {%k1} {z} = zmm0[2,1,0,0,4,5,6,7,10,9,8,8,12,13,14,15,18,17,16,16,20,21,22,23,26,25,24,24,28,29,30,31] +; GENERIC-NEXT: vpshuflw {{.*#+}} zmm0 {%k1} {z} = zmm0[2,1,0,0,4,5,6,7,10,9,8,8,12,13,14,15,18,17,16,16,20,21,22,23,26,25,24,24,28,29,30,31] sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_32xi16_perm_low_mask1: @@ -7710,7 +7710,7 @@ define <32 x i16> @test_masked_32xi16_perm_high_mask2(<32 x i16> %vec, <32 x i16 ; GENERIC: # BB#0: ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqw %zmm3, %zmm2, %k1 -; GENERIC-NEXT: vpshufhw {{.*#+}} zmm1 {%k1} = zmm0[0,1,2,3,4,6,4,7,8,9,10,11,12,14,12,15,16,17,18,19,20,22,20,23,24,25,26,27,28,30,28,31] +; GENERIC-NEXT: vpshufhw {{.*#+}} zmm1 {%k1} = zmm0[0,1,2,3,4,6,4,7,8,9,10,11,12,14,12,15,16,17,18,19,20,22,20,23,24,25,26,27,28,30,28,31] sched: [1:1.00] ; GENERIC-NEXT: vmovdqa64 %zmm1, %zmm0 ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -7732,7 +7732,7 @@ define <32 x i16> @test_masked_z_32xi16_perm_high_mask2(<32 x i16> %vec, <32 x i ; GENERIC: # BB#0: ; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqw %zmm2, %zmm1, %k1 -; GENERIC-NEXT: vpshufhw {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,2,3,4,6,4,7,8,9,10,11,12,14,12,15,16,17,18,19,20,22,20,23,24,25,26,27,28,30,28,31] +; GENERIC-NEXT: vpshufhw {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,2,3,4,6,4,7,8,9,10,11,12,14,12,15,16,17,18,19,20,22,20,23,24,25,26,27,28,30,28,31] sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_32xi16_perm_high_mask2: @@ -7749,7 +7749,7 @@ define <32 x i16> @test_masked_z_32xi16_perm_high_mask2(<32 x i16> %vec, <32 x i define <32 x i16> @test_32xi16_perm_low_mask3(<32 x i16> %vec) { ; GENERIC-LABEL: test_32xi16_perm_low_mask3: ; GENERIC: # BB#0: -; GENERIC-NEXT: vpshuflw {{.*#+}} zmm0 = zmm0[3,3,1,3,4,5,6,7,11,11,9,11,12,13,14,15,19,19,17,19,20,21,22,23,27,27,25,27,28,29,30,31] +; GENERIC-NEXT: vpshuflw {{.*#+}} zmm0 = zmm0[3,3,1,3,4,5,6,7,11,11,9,11,12,13,14,15,19,19,17,19,20,21,22,23,27,27,25,27,28,29,30,31] sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_32xi16_perm_low_mask3: @@ -7764,7 +7764,7 @@ define <32 x i16> @test_masked_32xi16_perm_low_mask3(<32 x i16> %vec, <32 x i16> ; GENERIC: # BB#0: ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqw %zmm3, %zmm2, %k1 -; GENERIC-NEXT: vpshuflw {{.*#+}} zmm1 {%k1} = zmm0[3,3,1,3,4,5,6,7,11,11,9,11,12,13,14,15,19,19,17,19,20,21,22,23,27,27,25,27,28,29,30,31] +; GENERIC-NEXT: vpshuflw {{.*#+}} zmm1 {%k1} = zmm0[3,3,1,3,4,5,6,7,11,11,9,11,12,13,14,15,19,19,17,19,20,21,22,23,27,27,25,27,28,29,30,31] sched: [1:1.00] ; GENERIC-NEXT: vmovdqa64 %zmm1, %zmm0 ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -7786,7 +7786,7 @@ define <32 x i16> @test_masked_z_32xi16_perm_low_mask3(<32 x i16> %vec, <32 x i1 ; GENERIC: # BB#0: ; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqw %zmm2, %zmm1, %k1 -; GENERIC-NEXT: vpshuflw {{.*#+}} zmm0 {%k1} {z} = zmm0[3,3,1,3,4,5,6,7,11,11,9,11,12,13,14,15,19,19,17,19,20,21,22,23,27,27,25,27,28,29,30,31] +; GENERIC-NEXT: vpshuflw {{.*#+}} zmm0 {%k1} {z} = zmm0[3,3,1,3,4,5,6,7,11,11,9,11,12,13,14,15,19,19,17,19,20,21,22,23,27,27,25,27,28,29,30,31] sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_32xi16_perm_low_mask3: @@ -7805,7 +7805,7 @@ define <32 x i16> @test_masked_32xi16_perm_high_mask4(<32 x i16> %vec, <32 x i16 ; GENERIC: # BB#0: ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqw %zmm3, %zmm2, %k1 -; GENERIC-NEXT: vpshufhw {{.*#+}} zmm1 {%k1} = zmm0[0,1,2,3,7,7,5,6,8,9,10,11,15,15,13,14,16,17,18,19,23,23,21,22,24,25,26,27,31,31,29,30] +; GENERIC-NEXT: vpshufhw {{.*#+}} zmm1 {%k1} = zmm0[0,1,2,3,7,7,5,6,8,9,10,11,15,15,13,14,16,17,18,19,23,23,21,22,24,25,26,27,31,31,29,30] sched: [1:1.00] ; GENERIC-NEXT: vmovdqa64 %zmm1, %zmm0 ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -7827,7 +7827,7 @@ define <32 x i16> @test_masked_z_32xi16_perm_high_mask4(<32 x i16> %vec, <32 x i ; GENERIC: # BB#0: ; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqw %zmm2, %zmm1, %k1 -; GENERIC-NEXT: vpshufhw {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,2,3,7,7,5,6,8,9,10,11,15,15,13,14,16,17,18,19,23,23,21,22,24,25,26,27,31,31,29,30] +; GENERIC-NEXT: vpshufhw {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,2,3,7,7,5,6,8,9,10,11,15,15,13,14,16,17,18,19,23,23,21,22,24,25,26,27,31,31,29,30] sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_32xi16_perm_high_mask4: @@ -7846,7 +7846,7 @@ define <32 x i16> @test_masked_32xi16_perm_low_mask5(<32 x i16> %vec, <32 x i16> ; GENERIC: # BB#0: ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqw %zmm3, %zmm2, %k1 -; GENERIC-NEXT: vpshuflw {{.*#+}} zmm1 {%k1} = zmm0[2,1,1,0,4,5,6,7,10,9,9,8,12,13,14,15,18,17,17,16,20,21,22,23,26,25,25,24,28,29,30,31] +; GENERIC-NEXT: vpshuflw {{.*#+}} zmm1 {%k1} = zmm0[2,1,1,0,4,5,6,7,10,9,9,8,12,13,14,15,18,17,17,16,20,21,22,23,26,25,25,24,28,29,30,31] sched: [1:1.00] ; GENERIC-NEXT: vmovdqa64 %zmm1, %zmm0 ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -7868,7 +7868,7 @@ define <32 x i16> @test_masked_z_32xi16_perm_low_mask5(<32 x i16> %vec, <32 x i1 ; GENERIC: # BB#0: ; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqw %zmm2, %zmm1, %k1 -; GENERIC-NEXT: vpshuflw {{.*#+}} zmm0 {%k1} {z} = zmm0[2,1,1,0,4,5,6,7,10,9,9,8,12,13,14,15,18,17,17,16,20,21,22,23,26,25,25,24,28,29,30,31] +; GENERIC-NEXT: vpshuflw {{.*#+}} zmm0 {%k1} {z} = zmm0[2,1,1,0,4,5,6,7,10,9,9,8,12,13,14,15,18,17,17,16,20,21,22,23,26,25,25,24,28,29,30,31] sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_32xi16_perm_low_mask5: @@ -7885,7 +7885,7 @@ define <32 x i16> @test_masked_z_32xi16_perm_low_mask5(<32 x i16> %vec, <32 x i1 define <32 x i16> @test_32xi16_perm_high_mask6(<32 x i16> %vec) { ; GENERIC-LABEL: test_32xi16_perm_high_mask6: ; GENERIC: # BB#0: -; GENERIC-NEXT: vpshufhw {{.*#+}} zmm0 = zmm0[0,1,2,3,4,4,5,6,8,9,10,11,12,12,13,14,16,17,18,19,20,20,21,22,24,25,26,27,28,28,29,30] +; GENERIC-NEXT: vpshufhw {{.*#+}} zmm0 = zmm0[0,1,2,3,4,4,5,6,8,9,10,11,12,12,13,14,16,17,18,19,20,20,21,22,24,25,26,27,28,28,29,30] sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_32xi16_perm_high_mask6: @@ -7900,7 +7900,7 @@ define <32 x i16> @test_masked_32xi16_perm_high_mask6(<32 x i16> %vec, <32 x i16 ; GENERIC: # BB#0: ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqw %zmm3, %zmm2, %k1 -; GENERIC-NEXT: vpshufhw {{.*#+}} zmm1 {%k1} = zmm0[0,1,2,3,4,4,5,6,8,9,10,11,12,12,13,14,16,17,18,19,20,20,21,22,24,25,26,27,28,28,29,30] +; GENERIC-NEXT: vpshufhw {{.*#+}} zmm1 {%k1} = zmm0[0,1,2,3,4,4,5,6,8,9,10,11,12,12,13,14,16,17,18,19,20,20,21,22,24,25,26,27,28,28,29,30] sched: [1:1.00] ; GENERIC-NEXT: vmovdqa64 %zmm1, %zmm0 ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -7922,7 +7922,7 @@ define <32 x i16> @test_masked_z_32xi16_perm_high_mask6(<32 x i16> %vec, <32 x i ; GENERIC: # BB#0: ; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqw %zmm2, %zmm1, %k1 -; GENERIC-NEXT: vpshufhw {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,2,3,4,4,5,6,8,9,10,11,12,12,13,14,16,17,18,19,20,20,21,22,24,25,26,27,28,28,29,30] +; GENERIC-NEXT: vpshufhw {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,2,3,4,4,5,6,8,9,10,11,12,12,13,14,16,17,18,19,20,20,21,22,24,25,26,27,28,28,29,30] sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_32xi16_perm_high_mask6: @@ -7941,7 +7941,7 @@ define <32 x i16> @test_masked_32xi16_perm_low_mask7(<32 x i16> %vec, <32 x i16> ; GENERIC: # BB#0: ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqw %zmm3, %zmm2, %k1 -; GENERIC-NEXT: vpshuflw {{.*#+}} zmm1 {%k1} = zmm0[3,0,3,0,4,5,6,7,11,8,11,8,12,13,14,15,19,16,19,16,20,21,22,23,27,24,27,24,28,29,30,31] +; GENERIC-NEXT: vpshuflw {{.*#+}} zmm1 {%k1} = zmm0[3,0,3,0,4,5,6,7,11,8,11,8,12,13,14,15,19,16,19,16,20,21,22,23,27,24,27,24,28,29,30,31] sched: [1:1.00] ; GENERIC-NEXT: vmovdqa64 %zmm1, %zmm0 ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -7963,7 +7963,7 @@ define <32 x i16> @test_masked_z_32xi16_perm_low_mask7(<32 x i16> %vec, <32 x i1 ; GENERIC: # BB#0: ; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqw %zmm2, %zmm1, %k1 -; GENERIC-NEXT: vpshuflw {{.*#+}} zmm0 {%k1} {z} = zmm0[3,0,3,0,4,5,6,7,11,8,11,8,12,13,14,15,19,16,19,16,20,21,22,23,27,24,27,24,28,29,30,31] +; GENERIC-NEXT: vpshuflw {{.*#+}} zmm0 {%k1} {z} = zmm0[3,0,3,0,4,5,6,7,11,8,11,8,12,13,14,15,19,16,19,16,20,21,22,23,27,24,27,24,28,29,30,31] sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_32xi16_perm_low_mask7: @@ -7980,7 +7980,7 @@ define <32 x i16> @test_masked_z_32xi16_perm_low_mask7(<32 x i16> %vec, <32 x i1 define <32 x i16> @test_32xi16_perm_high_mem_mask0(<32 x i16>* %vp) { ; GENERIC-LABEL: test_32xi16_perm_high_mem_mask0: ; GENERIC: # BB#0: -; GENERIC-NEXT: vpshufhw {{.*#+}} zmm0 = mem[0,1,2,3,7,4,5,6,8,9,10,11,15,12,13,14,16,17,18,19,23,20,21,22,24,25,26,27,31,28,29,30] +; GENERIC-NEXT: vpshufhw {{.*#+}} zmm0 = mem[0,1,2,3,7,4,5,6,8,9,10,11,15,12,13,14,16,17,18,19,23,20,21,22,24,25,26,27,31,28,29,30] sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_32xi16_perm_high_mem_mask0: @@ -7996,7 +7996,7 @@ define <32 x i16> @test_masked_32xi16_perm_high_mem_mask0(<32 x i16>* %vp, <32 x ; GENERIC: # BB#0: ; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqw %zmm2, %zmm1, %k1 -; GENERIC-NEXT: vpshufhw {{.*#+}} zmm0 {%k1} = mem[0,1,2,3,7,4,5,6,8,9,10,11,15,12,13,14,16,17,18,19,23,20,21,22,24,25,26,27,31,28,29,30] +; GENERIC-NEXT: vpshufhw {{.*#+}} zmm0 {%k1} = mem[0,1,2,3,7,4,5,6,8,9,10,11,15,12,13,14,16,17,18,19,23,20,21,22,24,25,26,27,31,28,29,30] sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_32xi16_perm_high_mem_mask0: @@ -8017,7 +8017,7 @@ define <32 x i16> @test_masked_z_32xi16_perm_high_mem_mask0(<32 x i16>* %vp, <32 ; GENERIC: # BB#0: ; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqw %zmm1, %zmm0, %k1 -; GENERIC-NEXT: vpshufhw {{.*#+}} zmm0 {%k1} {z} = mem[0,1,2,3,7,4,5,6,8,9,10,11,15,12,13,14,16,17,18,19,23,20,21,22,24,25,26,27,31,28,29,30] +; GENERIC-NEXT: vpshufhw {{.*#+}} zmm0 {%k1} {z} = mem[0,1,2,3,7,4,5,6,8,9,10,11,15,12,13,14,16,17,18,19,23,20,21,22,24,25,26,27,31,28,29,30] sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_32xi16_perm_high_mem_mask0: @@ -8038,7 +8038,7 @@ define <32 x i16> @test_masked_32xi16_perm_low_mem_mask1(<32 x i16>* %vp, <32 x ; GENERIC: # BB#0: ; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqw %zmm2, %zmm1, %k1 -; GENERIC-NEXT: vpshuflw {{.*#+}} zmm0 {%k1} = mem[1,1,3,3,4,5,6,7,9,9,11,11,12,13,14,15,17,17,19,19,20,21,22,23,25,25,27,27,28,29,30,31] +; GENERIC-NEXT: vpshuflw {{.*#+}} zmm0 {%k1} = mem[1,1,3,3,4,5,6,7,9,9,11,11,12,13,14,15,17,17,19,19,20,21,22,23,25,25,27,27,28,29,30,31] sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_32xi16_perm_low_mem_mask1: @@ -8059,7 +8059,7 @@ define <32 x i16> @test_masked_z_32xi16_perm_low_mem_mask1(<32 x i16>* %vp, <32 ; GENERIC: # BB#0: ; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqw %zmm1, %zmm0, %k1 -; GENERIC-NEXT: vpshuflw {{.*#+}} zmm0 {%k1} {z} = mem[1,1,3,3,4,5,6,7,9,9,11,11,12,13,14,15,17,17,19,19,20,21,22,23,25,25,27,27,28,29,30,31] +; GENERIC-NEXT: vpshuflw {{.*#+}} zmm0 {%k1} {z} = mem[1,1,3,3,4,5,6,7,9,9,11,11,12,13,14,15,17,17,19,19,20,21,22,23,25,25,27,27,28,29,30,31] sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_32xi16_perm_low_mem_mask1: @@ -8080,7 +8080,7 @@ define <32 x i16> @test_masked_32xi16_perm_high_mem_mask2(<32 x i16>* %vp, <32 x ; GENERIC: # BB#0: ; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqw %zmm2, %zmm1, %k1 -; GENERIC-NEXT: vpshufhw {{.*#+}} zmm0 {%k1} = mem[0,1,2,3,4,7,6,4,8,9,10,11,12,15,14,12,16,17,18,19,20,23,22,20,24,25,26,27,28,31,30,28] +; GENERIC-NEXT: vpshufhw {{.*#+}} zmm0 {%k1} = mem[0,1,2,3,4,7,6,4,8,9,10,11,12,15,14,12,16,17,18,19,20,23,22,20,24,25,26,27,28,31,30,28] sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_32xi16_perm_high_mem_mask2: @@ -8101,7 +8101,7 @@ define <32 x i16> @test_masked_z_32xi16_perm_high_mem_mask2(<32 x i16>* %vp, <32 ; GENERIC: # BB#0: ; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqw %zmm1, %zmm0, %k1 -; GENERIC-NEXT: vpshufhw {{.*#+}} zmm0 {%k1} {z} = mem[0,1,2,3,4,7,6,4,8,9,10,11,12,15,14,12,16,17,18,19,20,23,22,20,24,25,26,27,28,31,30,28] +; GENERIC-NEXT: vpshufhw {{.*#+}} zmm0 {%k1} {z} = mem[0,1,2,3,4,7,6,4,8,9,10,11,12,15,14,12,16,17,18,19,20,23,22,20,24,25,26,27,28,31,30,28] sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_32xi16_perm_high_mem_mask2: @@ -8120,7 +8120,7 @@ define <32 x i16> @test_masked_z_32xi16_perm_high_mem_mask2(<32 x i16>* %vp, <32 define <32 x i16> @test_32xi16_perm_low_mem_mask3(<32 x i16>* %vp) { ; GENERIC-LABEL: test_32xi16_perm_low_mem_mask3: ; GENERIC: # BB#0: -; GENERIC-NEXT: vpshuflw {{.*#+}} zmm0 = mem[2,2,0,3,4,5,6,7,10,10,8,11,12,13,14,15,18,18,16,19,20,21,22,23,26,26,24,27,28,29,30,31] +; GENERIC-NEXT: vpshuflw {{.*#+}} zmm0 = mem[2,2,0,3,4,5,6,7,10,10,8,11,12,13,14,15,18,18,16,19,20,21,22,23,26,26,24,27,28,29,30,31] sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_32xi16_perm_low_mem_mask3: @@ -8136,7 +8136,7 @@ define <32 x i16> @test_masked_32xi16_perm_low_mem_mask3(<32 x i16>* %vp, <32 x ; GENERIC: # BB#0: ; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqw %zmm2, %zmm1, %k1 -; GENERIC-NEXT: vpshuflw {{.*#+}} zmm0 {%k1} = mem[2,2,0,3,4,5,6,7,10,10,8,11,12,13,14,15,18,18,16,19,20,21,22,23,26,26,24,27,28,29,30,31] +; GENERIC-NEXT: vpshuflw {{.*#+}} zmm0 {%k1} = mem[2,2,0,3,4,5,6,7,10,10,8,11,12,13,14,15,18,18,16,19,20,21,22,23,26,26,24,27,28,29,30,31] sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_32xi16_perm_low_mem_mask3: @@ -8157,7 +8157,7 @@ define <32 x i16> @test_masked_z_32xi16_perm_low_mem_mask3(<32 x i16>* %vp, <32 ; GENERIC: # BB#0: ; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqw %zmm1, %zmm0, %k1 -; GENERIC-NEXT: vpshuflw {{.*#+}} zmm0 {%k1} {z} = mem[2,2,0,3,4,5,6,7,10,10,8,11,12,13,14,15,18,18,16,19,20,21,22,23,26,26,24,27,28,29,30,31] +; GENERIC-NEXT: vpshuflw {{.*#+}} zmm0 {%k1} {z} = mem[2,2,0,3,4,5,6,7,10,10,8,11,12,13,14,15,18,18,16,19,20,21,22,23,26,26,24,27,28,29,30,31] sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_32xi16_perm_low_mem_mask3: @@ -8178,7 +8178,7 @@ define <32 x i16> @test_masked_32xi16_perm_high_mem_mask4(<32 x i16>* %vp, <32 x ; GENERIC: # BB#0: ; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqw %zmm2, %zmm1, %k1 -; GENERIC-NEXT: vpshufhw {{.*#+}} zmm0 {%k1} = mem[0,1,2,3,7,4,6,5,8,9,10,11,15,12,14,13,16,17,18,19,23,20,22,21,24,25,26,27,31,28,30,29] +; GENERIC-NEXT: vpshufhw {{.*#+}} zmm0 {%k1} = mem[0,1,2,3,7,4,6,5,8,9,10,11,15,12,14,13,16,17,18,19,23,20,22,21,24,25,26,27,31,28,30,29] sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_32xi16_perm_high_mem_mask4: @@ -8199,7 +8199,7 @@ define <32 x i16> @test_masked_z_32xi16_perm_high_mem_mask4(<32 x i16>* %vp, <32 ; GENERIC: # BB#0: ; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqw %zmm1, %zmm0, %k1 -; GENERIC-NEXT: vpshufhw {{.*#+}} zmm0 {%k1} {z} = mem[0,1,2,3,7,4,6,5,8,9,10,11,15,12,14,13,16,17,18,19,23,20,22,21,24,25,26,27,31,28,30,29] +; GENERIC-NEXT: vpshufhw {{.*#+}} zmm0 {%k1} {z} = mem[0,1,2,3,7,4,6,5,8,9,10,11,15,12,14,13,16,17,18,19,23,20,22,21,24,25,26,27,31,28,30,29] sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_32xi16_perm_high_mem_mask4: @@ -8218,7 +8218,7 @@ define <32 x i16> @test_masked_z_32xi16_perm_high_mem_mask4(<32 x i16>* %vp, <32 define <32 x i16> @test_masked_32xi16_perm_low_mem_mask5(<32 x i16>* %vp, <32 x i16> %vec2, <32 x i16> %mask) { ; GENERIC-LABEL: test_masked_32xi16_perm_low_mem_mask5: ; GENERIC: # BB#0: -; GENERIC-NEXT: vpshufd {{.*#+}} zmm2 = mem[0,0,2,3,4,4,6,7,8,8,10,11,12,12,14,15] +; GENERIC-NEXT: vpshufd {{.*#+}} zmm2 = mem[0,0,2,3,4,4,6,7,8,8,10,11,12,12,14,15] sched: [5:1.00] ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqw %zmm3, %zmm1, %k1 ; GENERIC-NEXT: vmovdqu16 %zmm2, %zmm0 {%k1} @@ -8241,7 +8241,7 @@ define <32 x i16> @test_masked_32xi16_perm_low_mem_mask5(<32 x i16>* %vp, <32 x define <32 x i16> @test_masked_z_32xi16_perm_low_mem_mask5(<32 x i16>* %vp, <32 x i16> %mask) { ; GENERIC-LABEL: test_masked_z_32xi16_perm_low_mem_mask5: ; GENERIC: # BB#0: -; GENERIC-NEXT: vpshufd {{.*#+}} zmm1 = mem[0,0,2,3,4,4,6,7,8,8,10,11,12,12,14,15] +; GENERIC-NEXT: vpshufd {{.*#+}} zmm1 = mem[0,0,2,3,4,4,6,7,8,8,10,11,12,12,14,15] sched: [5:1.00] ; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqw %zmm2, %zmm0, %k1 ; GENERIC-NEXT: vmovdqu16 %zmm1, %zmm0 {%k1} {z} @@ -8264,7 +8264,7 @@ define <32 x i16> @test_masked_z_32xi16_perm_low_mem_mask5(<32 x i16>* %vp, <32 define <32 x i16> @test_32xi16_perm_high_mem_mask6(<32 x i16>* %vp) { ; GENERIC-LABEL: test_32xi16_perm_high_mem_mask6: ; GENERIC: # BB#0: -; GENERIC-NEXT: vpshufhw {{.*#+}} zmm0 = mem[0,1,2,3,6,5,6,6,8,9,10,11,14,13,14,14,16,17,18,19,22,21,22,22,24,25,26,27,30,29,30,30] +; GENERIC-NEXT: vpshufhw {{.*#+}} zmm0 = mem[0,1,2,3,6,5,6,6,8,9,10,11,14,13,14,14,16,17,18,19,22,21,22,22,24,25,26,27,30,29,30,30] sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_32xi16_perm_high_mem_mask6: @@ -8280,7 +8280,7 @@ define <32 x i16> @test_masked_32xi16_perm_high_mem_mask6(<32 x i16>* %vp, <32 x ; GENERIC: # BB#0: ; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqw %zmm2, %zmm1, %k1 -; GENERIC-NEXT: vpshufhw {{.*#+}} zmm0 {%k1} = mem[0,1,2,3,6,5,6,6,8,9,10,11,14,13,14,14,16,17,18,19,22,21,22,22,24,25,26,27,30,29,30,30] +; GENERIC-NEXT: vpshufhw {{.*#+}} zmm0 {%k1} = mem[0,1,2,3,6,5,6,6,8,9,10,11,14,13,14,14,16,17,18,19,22,21,22,22,24,25,26,27,30,29,30,30] sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_32xi16_perm_high_mem_mask6: @@ -8301,7 +8301,7 @@ define <32 x i16> @test_masked_z_32xi16_perm_high_mem_mask6(<32 x i16>* %vp, <32 ; GENERIC: # BB#0: ; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqw %zmm1, %zmm0, %k1 -; GENERIC-NEXT: vpshufhw {{.*#+}} zmm0 {%k1} {z} = mem[0,1,2,3,6,5,6,6,8,9,10,11,14,13,14,14,16,17,18,19,22,21,22,22,24,25,26,27,30,29,30,30] +; GENERIC-NEXT: vpshufhw {{.*#+}} zmm0 {%k1} {z} = mem[0,1,2,3,6,5,6,6,8,9,10,11,14,13,14,14,16,17,18,19,22,21,22,22,24,25,26,27,30,29,30,30] sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_32xi16_perm_high_mem_mask6: @@ -8322,7 +8322,7 @@ define <32 x i16> @test_masked_32xi16_perm_low_mem_mask7(<32 x i16>* %vp, <32 x ; GENERIC: # BB#0: ; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqw %zmm2, %zmm1, %k1 -; GENERIC-NEXT: vpshuflw {{.*#+}} zmm0 {%k1} = mem[3,1,3,0,4,5,6,7,11,9,11,8,12,13,14,15,19,17,19,16,20,21,22,23,27,25,27,24,28,29,30,31] +; GENERIC-NEXT: vpshuflw {{.*#+}} zmm0 {%k1} = mem[3,1,3,0,4,5,6,7,11,9,11,8,12,13,14,15,19,17,19,16,20,21,22,23,27,25,27,24,28,29,30,31] sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_32xi16_perm_low_mem_mask7: @@ -8343,7 +8343,7 @@ define <32 x i16> @test_masked_z_32xi16_perm_low_mem_mask7(<32 x i16>* %vp, <32 ; GENERIC: # BB#0: ; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqw %zmm1, %zmm0, %k1 -; GENERIC-NEXT: vpshuflw {{.*#+}} zmm0 {%k1} {z} = mem[3,1,3,0,4,5,6,7,11,9,11,8,12,13,14,15,19,17,19,16,20,21,22,23,27,25,27,24,28,29,30,31] +; GENERIC-NEXT: vpshuflw {{.*#+}} zmm0 {%k1} {z} = mem[3,1,3,0,4,5,6,7,11,9,11,8,12,13,14,15,19,17,19,16,20,21,22,23,27,25,27,24,28,29,30,31] sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_32xi16_perm_low_mem_mask7: @@ -8377,7 +8377,7 @@ define <4 x i32> @test_masked_4xi32_perm_mask0(<4 x i32> %vec, <4 x i32> %vec2, ; GENERIC: # BB#0: ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqd %xmm3, %xmm2, %k1 -; GENERIC-NEXT: vpshufd {{.*#+}} xmm1 {%k1} = xmm0[2,3,3,0] +; GENERIC-NEXT: vpshufd {{.*#+}} xmm1 {%k1} = xmm0[2,3,3,0] sched: [1:1.00] ; GENERIC-NEXT: vmovdqa %xmm1, %xmm0 # sched: [1:0.50] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -8399,7 +8399,7 @@ define <4 x i32> @test_masked_z_4xi32_perm_mask0(<4 x i32> %vec, <4 x i32> %mask ; GENERIC: # BB#0: ; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqd %xmm2, %xmm1, %k1 -; GENERIC-NEXT: vpshufd {{.*#+}} xmm0 {%k1} {z} = xmm0[2,3,3,0] +; GENERIC-NEXT: vpshufd {{.*#+}} xmm0 {%k1} {z} = xmm0[2,3,3,0] sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_4xi32_perm_mask0: @@ -8418,7 +8418,7 @@ define <4 x i32> @test_masked_4xi32_perm_mask1(<4 x i32> %vec, <4 x i32> %vec2, ; GENERIC: # BB#0: ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqd %xmm3, %xmm2, %k1 -; GENERIC-NEXT: vpshufd {{.*#+}} xmm1 {%k1} = xmm0[1,0,2,0] +; GENERIC-NEXT: vpshufd {{.*#+}} xmm1 {%k1} = xmm0[1,0,2,0] sched: [1:1.00] ; GENERIC-NEXT: vmovdqa %xmm1, %xmm0 # sched: [1:0.50] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -8440,7 +8440,7 @@ define <4 x i32> @test_masked_z_4xi32_perm_mask1(<4 x i32> %vec, <4 x i32> %mask ; GENERIC: # BB#0: ; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqd %xmm2, %xmm1, %k1 -; GENERIC-NEXT: vpshufd {{.*#+}} xmm0 {%k1} {z} = xmm0[1,0,2,0] +; GENERIC-NEXT: vpshufd {{.*#+}} xmm0 {%k1} {z} = xmm0[1,0,2,0] sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_4xi32_perm_mask1: @@ -8459,7 +8459,7 @@ define <4 x i32> @test_masked_4xi32_perm_mask2(<4 x i32> %vec, <4 x i32> %vec2, ; GENERIC: # BB#0: ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqd %xmm3, %xmm2, %k1 -; GENERIC-NEXT: vpshufd {{.*#+}} xmm1 {%k1} = xmm0[3,0,1,0] +; GENERIC-NEXT: vpshufd {{.*#+}} xmm1 {%k1} = xmm0[3,0,1,0] sched: [1:1.00] ; GENERIC-NEXT: vmovdqa %xmm1, %xmm0 # sched: [1:0.50] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -8481,7 +8481,7 @@ define <4 x i32> @test_masked_z_4xi32_perm_mask2(<4 x i32> %vec, <4 x i32> %mask ; GENERIC: # BB#0: ; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqd %xmm2, %xmm1, %k1 -; GENERIC-NEXT: vpshufd {{.*#+}} xmm0 {%k1} {z} = xmm0[3,0,1,0] +; GENERIC-NEXT: vpshufd {{.*#+}} xmm0 {%k1} {z} = xmm0[3,0,1,0] sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_4xi32_perm_mask2: @@ -8513,7 +8513,7 @@ define <4 x i32> @test_masked_4xi32_perm_mask3(<4 x i32> %vec, <4 x i32> %vec2, ; GENERIC: # BB#0: ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqd %xmm3, %xmm2, %k1 -; GENERIC-NEXT: vpshufd {{.*#+}} xmm1 {%k1} = xmm0[1,1,0,3] +; GENERIC-NEXT: vpshufd {{.*#+}} xmm1 {%k1} = xmm0[1,1,0,3] sched: [1:1.00] ; GENERIC-NEXT: vmovdqa %xmm1, %xmm0 # sched: [1:0.50] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -8535,7 +8535,7 @@ define <4 x i32> @test_masked_z_4xi32_perm_mask3(<4 x i32> %vec, <4 x i32> %mask ; GENERIC: # BB#0: ; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqd %xmm2, %xmm1, %k1 -; GENERIC-NEXT: vpshufd {{.*#+}} xmm0 {%k1} {z} = xmm0[1,1,0,3] +; GENERIC-NEXT: vpshufd {{.*#+}} xmm0 {%k1} {z} = xmm0[1,1,0,3] sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_4xi32_perm_mask3: @@ -8568,7 +8568,7 @@ define <4 x i32> @test_masked_4xi32_perm_mem_mask0(<4 x i32>* %vp, <4 x i32> %ve ; GENERIC: # BB#0: ; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqd %xmm2, %xmm1, %k1 -; GENERIC-NEXT: vpshufd {{.*#+}} xmm0 {%k1} = mem[0,1,3,3] +; GENERIC-NEXT: vpshufd {{.*#+}} xmm0 {%k1} = mem[0,1,3,3] sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_4xi32_perm_mem_mask0: @@ -8589,7 +8589,7 @@ define <4 x i32> @test_masked_z_4xi32_perm_mem_mask0(<4 x i32>* %vp, <4 x i32> % ; GENERIC: # BB#0: ; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqd %xmm1, %xmm0, %k1 -; GENERIC-NEXT: vpshufd {{.*#+}} xmm0 {%k1} {z} = mem[0,1,3,3] +; GENERIC-NEXT: vpshufd {{.*#+}} xmm0 {%k1} {z} = mem[0,1,3,3] sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_4xi32_perm_mem_mask0: @@ -8610,7 +8610,7 @@ define <4 x i32> @test_masked_4xi32_perm_mem_mask1(<4 x i32>* %vp, <4 x i32> %ve ; GENERIC: # BB#0: ; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqd %xmm2, %xmm1, %k1 -; GENERIC-NEXT: vpshufd {{.*#+}} xmm0 {%k1} = mem[2,2,3,1] +; GENERIC-NEXT: vpshufd {{.*#+}} xmm0 {%k1} = mem[2,2,3,1] sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_4xi32_perm_mem_mask1: @@ -8631,7 +8631,7 @@ define <4 x i32> @test_masked_z_4xi32_perm_mem_mask1(<4 x i32>* %vp, <4 x i32> % ; GENERIC: # BB#0: ; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqd %xmm1, %xmm0, %k1 -; GENERIC-NEXT: vpshufd {{.*#+}} xmm0 {%k1} {z} = mem[2,2,3,1] +; GENERIC-NEXT: vpshufd {{.*#+}} xmm0 {%k1} {z} = mem[2,2,3,1] sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_4xi32_perm_mem_mask1: @@ -8652,7 +8652,7 @@ define <4 x i32> @test_masked_4xi32_perm_mem_mask2(<4 x i32>* %vp, <4 x i32> %ve ; GENERIC: # BB#0: ; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqd %xmm2, %xmm1, %k1 -; GENERIC-NEXT: vpshufd {{.*#+}} xmm0 {%k1} = mem[0,3,0,1] +; GENERIC-NEXT: vpshufd {{.*#+}} xmm0 {%k1} = mem[0,3,0,1] sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_4xi32_perm_mem_mask2: @@ -8673,7 +8673,7 @@ define <4 x i32> @test_masked_z_4xi32_perm_mem_mask2(<4 x i32>* %vp, <4 x i32> % ; GENERIC: # BB#0: ; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqd %xmm1, %xmm0, %k1 -; GENERIC-NEXT: vpshufd {{.*#+}} xmm0 {%k1} {z} = mem[0,3,0,1] +; GENERIC-NEXT: vpshufd {{.*#+}} xmm0 {%k1} {z} = mem[0,3,0,1] sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_4xi32_perm_mem_mask2: @@ -8708,7 +8708,7 @@ define <4 x i32> @test_masked_4xi32_perm_mem_mask3(<4 x i32>* %vp, <4 x i32> %ve ; GENERIC: # BB#0: ; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqd %xmm2, %xmm1, %k1 -; GENERIC-NEXT: vpshufd {{.*#+}} xmm0 {%k1} = mem[1,0,1,0] +; GENERIC-NEXT: vpshufd {{.*#+}} xmm0 {%k1} = mem[1,0,1,0] sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_4xi32_perm_mem_mask3: @@ -8729,7 +8729,7 @@ define <4 x i32> @test_masked_z_4xi32_perm_mem_mask3(<4 x i32>* %vp, <4 x i32> % ; GENERIC: # BB#0: ; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqd %xmm1, %xmm0, %k1 -; GENERIC-NEXT: vpshufd {{.*#+}} xmm0 {%k1} {z} = mem[1,0,1,0] +; GENERIC-NEXT: vpshufd {{.*#+}} xmm0 {%k1} {z} = mem[1,0,1,0] sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_4xi32_perm_mem_mask3: @@ -8763,7 +8763,7 @@ define <8 x i32> @test2_masked_8xi32_perm_mask0(<8 x i32> %vec, <8 x i32> %vec2, ; GENERIC: # BB#0: ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 -; GENERIC-NEXT: vpshufd {{.*#+}} ymm1 {%k1} = ymm0[2,3,1,0,6,7,5,4] +; GENERIC-NEXT: vpshufd {{.*#+}} ymm1 {%k1} = ymm0[2,3,1,0,6,7,5,4] sched: [1:1.00] ; GENERIC-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.50] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -8785,7 +8785,7 @@ define <8 x i32> @test2_masked_z_8xi32_perm_mask0(<8 x i32> %vec, <8 x i32> %mas ; GENERIC: # BB#0: ; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqd %ymm2, %ymm1, %k1 -; GENERIC-NEXT: vpshufd {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3,1,0,6,7,5,4] +; GENERIC-NEXT: vpshufd {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3,1,0,6,7,5,4] sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test2_masked_z_8xi32_perm_mask0: @@ -8804,7 +8804,7 @@ define <8 x i32> @test2_masked_8xi32_perm_mask1(<8 x i32> %vec, <8 x i32> %vec2, ; GENERIC: # BB#0: ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 -; GENERIC-NEXT: vpshufd {{.*#+}} ymm1 {%k1} = ymm0[0,3,3,3,4,7,7,7] +; GENERIC-NEXT: vpshufd {{.*#+}} ymm1 {%k1} = ymm0[0,3,3,3,4,7,7,7] sched: [1:1.00] ; GENERIC-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.50] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -8826,7 +8826,7 @@ define <8 x i32> @test2_masked_z_8xi32_perm_mask1(<8 x i32> %vec, <8 x i32> %mas ; GENERIC: # BB#0: ; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqd %ymm2, %ymm1, %k1 -; GENERIC-NEXT: vpshufd {{.*#+}} ymm0 {%k1} {z} = ymm0[0,3,3,3,4,7,7,7] +; GENERIC-NEXT: vpshufd {{.*#+}} ymm0 {%k1} {z} = ymm0[0,3,3,3,4,7,7,7] sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test2_masked_z_8xi32_perm_mask1: @@ -8845,7 +8845,7 @@ define <8 x i32> @test2_masked_8xi32_perm_mask2(<8 x i32> %vec, <8 x i32> %vec2, ; GENERIC: # BB#0: ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 -; GENERIC-NEXT: vpshufd {{.*#+}} ymm1 {%k1} = ymm0[1,2,0,3,5,6,4,7] +; GENERIC-NEXT: vpshufd {{.*#+}} ymm1 {%k1} = ymm0[1,2,0,3,5,6,4,7] sched: [1:1.00] ; GENERIC-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.50] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -8867,7 +8867,7 @@ define <8 x i32> @test2_masked_z_8xi32_perm_mask2(<8 x i32> %vec, <8 x i32> %mas ; GENERIC: # BB#0: ; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqd %ymm2, %ymm1, %k1 -; GENERIC-NEXT: vpshufd {{.*#+}} ymm0 {%k1} {z} = ymm0[1,2,0,3,5,6,4,7] +; GENERIC-NEXT: vpshufd {{.*#+}} ymm0 {%k1} {z} = ymm0[1,2,0,3,5,6,4,7] sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test2_masked_z_8xi32_perm_mask2: @@ -8899,7 +8899,7 @@ define <8 x i32> @test2_masked_8xi32_perm_mask3(<8 x i32> %vec, <8 x i32> %vec2, ; GENERIC: # BB#0: ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 -; GENERIC-NEXT: vpshufd {{.*#+}} ymm1 {%k1} = ymm0[1,3,1,0,5,7,5,4] +; GENERIC-NEXT: vpshufd {{.*#+}} ymm1 {%k1} = ymm0[1,3,1,0,5,7,5,4] sched: [1:1.00] ; GENERIC-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.50] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -8921,7 +8921,7 @@ define <8 x i32> @test2_masked_z_8xi32_perm_mask3(<8 x i32> %vec, <8 x i32> %mas ; GENERIC: # BB#0: ; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqd %ymm2, %ymm1, %k1 -; GENERIC-NEXT: vpshufd {{.*#+}} ymm0 {%k1} {z} = ymm0[1,3,1,0,5,7,5,4] +; GENERIC-NEXT: vpshufd {{.*#+}} ymm0 {%k1} {z} = ymm0[1,3,1,0,5,7,5,4] sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test2_masked_z_8xi32_perm_mask3: @@ -8954,7 +8954,7 @@ define <8 x i32> @test2_masked_8xi32_perm_mem_mask0(<8 x i32>* %vp, <8 x i32> %v ; GENERIC: # BB#0: ; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqd %ymm2, %ymm1, %k1 -; GENERIC-NEXT: vpshufd {{.*#+}} ymm0 {%k1} = mem[1,0,2,0,5,4,6,4] +; GENERIC-NEXT: vpshufd {{.*#+}} ymm0 {%k1} = mem[1,0,2,0,5,4,6,4] sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test2_masked_8xi32_perm_mem_mask0: @@ -8975,7 +8975,7 @@ define <8 x i32> @test2_masked_z_8xi32_perm_mem_mask0(<8 x i32>* %vp, <8 x i32> ; GENERIC: # BB#0: ; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqd %ymm1, %ymm0, %k1 -; GENERIC-NEXT: vpshufd {{.*#+}} ymm0 {%k1} {z} = mem[1,0,2,0,5,4,6,4] +; GENERIC-NEXT: vpshufd {{.*#+}} ymm0 {%k1} {z} = mem[1,0,2,0,5,4,6,4] sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test2_masked_z_8xi32_perm_mem_mask0: @@ -8996,7 +8996,7 @@ define <8 x i32> @test2_masked_8xi32_perm_mem_mask1(<8 x i32>* %vp, <8 x i32> %v ; GENERIC: # BB#0: ; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqd %ymm2, %ymm1, %k1 -; GENERIC-NEXT: vpshufd {{.*#+}} ymm0 {%k1} = mem[0,3,2,0,4,7,6,4] +; GENERIC-NEXT: vpshufd {{.*#+}} ymm0 {%k1} = mem[0,3,2,0,4,7,6,4] sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test2_masked_8xi32_perm_mem_mask1: @@ -9017,7 +9017,7 @@ define <8 x i32> @test2_masked_z_8xi32_perm_mem_mask1(<8 x i32>* %vp, <8 x i32> ; GENERIC: # BB#0: ; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqd %ymm1, %ymm0, %k1 -; GENERIC-NEXT: vpshufd {{.*#+}} ymm0 {%k1} {z} = mem[0,3,2,0,4,7,6,4] +; GENERIC-NEXT: vpshufd {{.*#+}} ymm0 {%k1} {z} = mem[0,3,2,0,4,7,6,4] sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test2_masked_z_8xi32_perm_mem_mask1: @@ -9038,7 +9038,7 @@ define <8 x i32> @test2_masked_8xi32_perm_mem_mask2(<8 x i32>* %vp, <8 x i32> %v ; GENERIC: # BB#0: ; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqd %ymm2, %ymm1, %k1 -; GENERIC-NEXT: vpshufd {{.*#+}} ymm0 {%k1} = mem[3,2,3,1,7,6,7,5] +; GENERIC-NEXT: vpshufd {{.*#+}} ymm0 {%k1} = mem[3,2,3,1,7,6,7,5] sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test2_masked_8xi32_perm_mem_mask2: @@ -9059,7 +9059,7 @@ define <8 x i32> @test2_masked_z_8xi32_perm_mem_mask2(<8 x i32>* %vp, <8 x i32> ; GENERIC: # BB#0: ; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqd %ymm1, %ymm0, %k1 -; GENERIC-NEXT: vpshufd {{.*#+}} ymm0 {%k1} {z} = mem[3,2,3,1,7,6,7,5] +; GENERIC-NEXT: vpshufd {{.*#+}} ymm0 {%k1} {z} = mem[3,2,3,1,7,6,7,5] sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test2_masked_z_8xi32_perm_mem_mask2: @@ -9094,7 +9094,7 @@ define <8 x i32> @test2_masked_8xi32_perm_mem_mask3(<8 x i32>* %vp, <8 x i32> %v ; GENERIC: # BB#0: ; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqd %ymm2, %ymm1, %k1 -; GENERIC-NEXT: vpshufd {{.*#+}} ymm0 {%k1} = mem[3,2,0,0,7,6,4,4] +; GENERIC-NEXT: vpshufd {{.*#+}} ymm0 {%k1} = mem[3,2,0,0,7,6,4,4] sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test2_masked_8xi32_perm_mem_mask3: @@ -9115,7 +9115,7 @@ define <8 x i32> @test2_masked_z_8xi32_perm_mem_mask3(<8 x i32>* %vp, <8 x i32> ; GENERIC: # BB#0: ; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqd %ymm1, %ymm0, %k1 -; GENERIC-NEXT: vpshufd {{.*#+}} ymm0 {%k1} {z} = mem[3,2,0,0,7,6,4,4] +; GENERIC-NEXT: vpshufd {{.*#+}} ymm0 {%k1} {z} = mem[3,2,0,0,7,6,4,4] sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test2_masked_z_8xi32_perm_mem_mask3: @@ -9134,7 +9134,7 @@ define <8 x i32> @test2_masked_z_8xi32_perm_mem_mask3(<8 x i32>* %vp, <8 x i32> define <16 x i32> @test2_16xi32_perm_mask0(<16 x i32> %vec) { ; GENERIC-LABEL: test2_16xi32_perm_mask0: ; GENERIC: # BB#0: -; GENERIC-NEXT: vpermilps {{.*#+}} zmm0 = zmm0[3,1,3,0,7,5,7,4,11,9,11,8,15,13,15,12] +; GENERIC-NEXT: vpermilps {{.*#+}} zmm0 = zmm0[3,1,3,0,7,5,7,4,11,9,11,8,15,13,15,12] sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test2_16xi32_perm_mask0: @@ -9149,7 +9149,7 @@ define <16 x i32> @test2_masked_16xi32_perm_mask0(<16 x i32> %vec, <16 x i32> %v ; GENERIC: # BB#0: ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 -; GENERIC-NEXT: vpshufd {{.*#+}} zmm1 {%k1} = zmm0[3,1,3,0,7,5,7,4,11,9,11,8,15,13,15,12] +; GENERIC-NEXT: vpshufd {{.*#+}} zmm1 {%k1} = zmm0[3,1,3,0,7,5,7,4,11,9,11,8,15,13,15,12] sched: [1:1.00] ; GENERIC-NEXT: vmovdqa64 %zmm1, %zmm0 ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -9171,7 +9171,7 @@ define <16 x i32> @test2_masked_z_16xi32_perm_mask0(<16 x i32> %vec, <16 x i32> ; GENERIC: # BB#0: ; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqd %zmm2, %zmm1, %k1 -; GENERIC-NEXT: vpshufd {{.*#+}} zmm0 {%k1} {z} = zmm0[3,1,3,0,7,5,7,4,11,9,11,8,15,13,15,12] +; GENERIC-NEXT: vpshufd {{.*#+}} zmm0 {%k1} {z} = zmm0[3,1,3,0,7,5,7,4,11,9,11,8,15,13,15,12] sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test2_masked_z_16xi32_perm_mask0: @@ -9190,7 +9190,7 @@ define <16 x i32> @test2_masked_16xi32_perm_mask1(<16 x i32> %vec, <16 x i32> %v ; GENERIC: # BB#0: ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 -; GENERIC-NEXT: vpshufd {{.*#+}} zmm1 {%k1} = zmm0[2,0,3,0,6,4,7,4,10,8,11,8,14,12,15,12] +; GENERIC-NEXT: vpshufd {{.*#+}} zmm1 {%k1} = zmm0[2,0,3,0,6,4,7,4,10,8,11,8,14,12,15,12] sched: [1:1.00] ; GENERIC-NEXT: vmovdqa64 %zmm1, %zmm0 ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -9212,7 +9212,7 @@ define <16 x i32> @test2_masked_z_16xi32_perm_mask1(<16 x i32> %vec, <16 x i32> ; GENERIC: # BB#0: ; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqd %zmm2, %zmm1, %k1 -; GENERIC-NEXT: vpshufd {{.*#+}} zmm0 {%k1} {z} = zmm0[2,0,3,0,6,4,7,4,10,8,11,8,14,12,15,12] +; GENERIC-NEXT: vpshufd {{.*#+}} zmm0 {%k1} {z} = zmm0[2,0,3,0,6,4,7,4,10,8,11,8,14,12,15,12] sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test2_masked_z_16xi32_perm_mask1: @@ -9231,7 +9231,7 @@ define <16 x i32> @test2_masked_16xi32_perm_mask2(<16 x i32> %vec, <16 x i32> %v ; GENERIC: # BB#0: ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 -; GENERIC-NEXT: vpshufd {{.*#+}} zmm1 {%k1} = zmm0[1,3,3,0,5,7,7,4,9,11,11,8,13,15,15,12] +; GENERIC-NEXT: vpshufd {{.*#+}} zmm1 {%k1} = zmm0[1,3,3,0,5,7,7,4,9,11,11,8,13,15,15,12] sched: [1:1.00] ; GENERIC-NEXT: vmovdqa64 %zmm1, %zmm0 ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -9253,7 +9253,7 @@ define <16 x i32> @test2_masked_z_16xi32_perm_mask2(<16 x i32> %vec, <16 x i32> ; GENERIC: # BB#0: ; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqd %zmm2, %zmm1, %k1 -; GENERIC-NEXT: vpshufd {{.*#+}} zmm0 {%k1} {z} = zmm0[1,3,3,0,5,7,7,4,9,11,11,8,13,15,15,12] +; GENERIC-NEXT: vpshufd {{.*#+}} zmm0 {%k1} {z} = zmm0[1,3,3,0,5,7,7,4,9,11,11,8,13,15,15,12] sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test2_masked_z_16xi32_perm_mask2: @@ -9270,7 +9270,7 @@ define <16 x i32> @test2_masked_z_16xi32_perm_mask2(<16 x i32> %vec, <16 x i32> define <16 x i32> @test2_16xi32_perm_mask3(<16 x i32> %vec) { ; GENERIC-LABEL: test2_16xi32_perm_mask3: ; GENERIC: # BB#0: -; GENERIC-NEXT: vpermilps {{.*#+}} zmm0 = zmm0[3,2,0,3,7,6,4,7,11,10,8,11,15,14,12,15] +; GENERIC-NEXT: vpermilps {{.*#+}} zmm0 = zmm0[3,2,0,3,7,6,4,7,11,10,8,11,15,14,12,15] sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test2_16xi32_perm_mask3: @@ -9285,7 +9285,7 @@ define <16 x i32> @test2_masked_16xi32_perm_mask3(<16 x i32> %vec, <16 x i32> %v ; GENERIC: # BB#0: ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 -; GENERIC-NEXT: vpshufd {{.*#+}} zmm1 {%k1} = zmm0[3,2,0,3,7,6,4,7,11,10,8,11,15,14,12,15] +; GENERIC-NEXT: vpshufd {{.*#+}} zmm1 {%k1} = zmm0[3,2,0,3,7,6,4,7,11,10,8,11,15,14,12,15] sched: [1:1.00] ; GENERIC-NEXT: vmovdqa64 %zmm1, %zmm0 ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -9307,7 +9307,7 @@ define <16 x i32> @test2_masked_z_16xi32_perm_mask3(<16 x i32> %vec, <16 x i32> ; GENERIC: # BB#0: ; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqd %zmm2, %zmm1, %k1 -; GENERIC-NEXT: vpshufd {{.*#+}} zmm0 {%k1} {z} = zmm0[3,2,0,3,7,6,4,7,11,10,8,11,15,14,12,15] +; GENERIC-NEXT: vpshufd {{.*#+}} zmm0 {%k1} {z} = zmm0[3,2,0,3,7,6,4,7,11,10,8,11,15,14,12,15] sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test2_masked_z_16xi32_perm_mask3: @@ -9324,7 +9324,7 @@ define <16 x i32> @test2_masked_z_16xi32_perm_mask3(<16 x i32> %vec, <16 x i32> define <16 x i32> @test2_16xi32_perm_mem_mask0(<16 x i32>* %vp) { ; GENERIC-LABEL: test2_16xi32_perm_mem_mask0: ; GENERIC: # BB#0: -; GENERIC-NEXT: vpermilps {{.*#+}} zmm0 = mem[1,0,1,3,5,4,5,7,9,8,9,11,13,12,13,15] +; GENERIC-NEXT: vpermilps {{.*#+}} zmm0 = mem[1,0,1,3,5,4,5,7,9,8,9,11,13,12,13,15] sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test2_16xi32_perm_mem_mask0: @@ -9340,7 +9340,7 @@ define <16 x i32> @test2_masked_16xi32_perm_mem_mask0(<16 x i32>* %vp, <16 x i32 ; GENERIC: # BB#0: ; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqd %zmm2, %zmm1, %k1 -; GENERIC-NEXT: vpshufd {{.*#+}} zmm0 {%k1} = mem[1,0,1,3,5,4,5,7,9,8,9,11,13,12,13,15] +; GENERIC-NEXT: vpshufd {{.*#+}} zmm0 {%k1} = mem[1,0,1,3,5,4,5,7,9,8,9,11,13,12,13,15] sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test2_masked_16xi32_perm_mem_mask0: @@ -9361,7 +9361,7 @@ define <16 x i32> @test2_masked_z_16xi32_perm_mem_mask0(<16 x i32>* %vp, <16 x i ; GENERIC: # BB#0: ; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqd %zmm1, %zmm0, %k1 -; GENERIC-NEXT: vpshufd {{.*#+}} zmm0 {%k1} {z} = mem[1,0,1,3,5,4,5,7,9,8,9,11,13,12,13,15] +; GENERIC-NEXT: vpshufd {{.*#+}} zmm0 {%k1} {z} = mem[1,0,1,3,5,4,5,7,9,8,9,11,13,12,13,15] sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test2_masked_z_16xi32_perm_mem_mask0: @@ -9382,7 +9382,7 @@ define <16 x i32> @test2_masked_16xi32_perm_mem_mask1(<16 x i32>* %vp, <16 x i32 ; GENERIC: # BB#0: ; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqd %zmm2, %zmm1, %k1 -; GENERIC-NEXT: vpshufd {{.*#+}} zmm0 {%k1} = mem[1,0,0,2,5,4,4,6,9,8,8,10,13,12,12,14] +; GENERIC-NEXT: vpshufd {{.*#+}} zmm0 {%k1} = mem[1,0,0,2,5,4,4,6,9,8,8,10,13,12,12,14] sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test2_masked_16xi32_perm_mem_mask1: @@ -9403,7 +9403,7 @@ define <16 x i32> @test2_masked_z_16xi32_perm_mem_mask1(<16 x i32>* %vp, <16 x i ; GENERIC: # BB#0: ; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqd %zmm1, %zmm0, %k1 -; GENERIC-NEXT: vpshufd {{.*#+}} zmm0 {%k1} {z} = mem[1,0,0,2,5,4,4,6,9,8,8,10,13,12,12,14] +; GENERIC-NEXT: vpshufd {{.*#+}} zmm0 {%k1} {z} = mem[1,0,0,2,5,4,4,6,9,8,8,10,13,12,12,14] sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test2_masked_z_16xi32_perm_mem_mask1: @@ -9424,7 +9424,7 @@ define <16 x i32> @test2_masked_16xi32_perm_mem_mask2(<16 x i32>* %vp, <16 x i32 ; GENERIC: # BB#0: ; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqd %zmm2, %zmm1, %k1 -; GENERIC-NEXT: vpshufd {{.*#+}} zmm0 {%k1} = mem[2,0,1,2,6,4,5,6,10,8,9,10,14,12,13,14] +; GENERIC-NEXT: vpshufd {{.*#+}} zmm0 {%k1} = mem[2,0,1,2,6,4,5,6,10,8,9,10,14,12,13,14] sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test2_masked_16xi32_perm_mem_mask2: @@ -9445,7 +9445,7 @@ define <16 x i32> @test2_masked_z_16xi32_perm_mem_mask2(<16 x i32>* %vp, <16 x i ; GENERIC: # BB#0: ; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqd %zmm1, %zmm0, %k1 -; GENERIC-NEXT: vpshufd {{.*#+}} zmm0 {%k1} {z} = mem[2,0,1,2,6,4,5,6,10,8,9,10,14,12,13,14] +; GENERIC-NEXT: vpshufd {{.*#+}} zmm0 {%k1} {z} = mem[2,0,1,2,6,4,5,6,10,8,9,10,14,12,13,14] sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test2_masked_z_16xi32_perm_mem_mask2: @@ -9464,7 +9464,7 @@ define <16 x i32> @test2_masked_z_16xi32_perm_mem_mask2(<16 x i32>* %vp, <16 x i define <16 x i32> @test2_16xi32_perm_mem_mask3(<16 x i32>* %vp) { ; GENERIC-LABEL: test2_16xi32_perm_mem_mask3: ; GENERIC: # BB#0: -; GENERIC-NEXT: vpermilps {{.*#+}} zmm0 = mem[3,1,1,1,7,5,5,5,11,9,9,9,15,13,13,13] +; GENERIC-NEXT: vpermilps {{.*#+}} zmm0 = mem[3,1,1,1,7,5,5,5,11,9,9,9,15,13,13,13] sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test2_16xi32_perm_mem_mask3: @@ -9480,7 +9480,7 @@ define <16 x i32> @test2_masked_16xi32_perm_mem_mask3(<16 x i32>* %vp, <16 x i32 ; GENERIC: # BB#0: ; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqd %zmm2, %zmm1, %k1 -; GENERIC-NEXT: vpshufd {{.*#+}} zmm0 {%k1} = mem[3,1,1,1,7,5,5,5,11,9,9,9,15,13,13,13] +; GENERIC-NEXT: vpshufd {{.*#+}} zmm0 {%k1} = mem[3,1,1,1,7,5,5,5,11,9,9,9,15,13,13,13] sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test2_masked_16xi32_perm_mem_mask3: @@ -9501,7 +9501,7 @@ define <16 x i32> @test2_masked_z_16xi32_perm_mem_mask3(<16 x i32>* %vp, <16 x i ; GENERIC: # BB#0: ; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqd %zmm1, %zmm0, %k1 -; GENERIC-NEXT: vpshufd {{.*#+}} zmm0 {%k1} {z} = mem[3,1,1,1,7,5,5,5,11,9,9,9,15,13,13,13] +; GENERIC-NEXT: vpshufd {{.*#+}} zmm0 {%k1} {z} = mem[3,1,1,1,7,5,5,5,11,9,9,9,15,13,13,13] sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test2_masked_z_16xi32_perm_mem_mask3: diff --git a/test/CodeGen/X86/avx512dq-intrinsics-upgrade.ll b/test/CodeGen/X86/avx512dq-intrinsics-upgrade.ll index d59d9f82321..04ea615fed6 100644 --- a/test/CodeGen/X86/avx512dq-intrinsics-upgrade.ll +++ b/test/CodeGen/X86/avx512dq-intrinsics-upgrade.ll @@ -6,6 +6,7 @@ declare <2 x double> @llvm.x86.avx512.mask.vextractf64x2.512(<8 x double>, i32, define <2 x double>@test_int_x86_avx512_mask_vextractf64x2_512(<8 x double> %x0, <2 x double> %x2, i8 %x3) { ; CHECK-LABEL: test_int_x86_avx512_mask_vextractf64x2_512: ; CHECK: ## BB#0: +; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm0 ; CHECK-NEXT: kmovw %edi, %k0 ; CHECK-NEXT: kshiftlb $7, %k0, %k1 ; CHECK-NEXT: kshiftrb $7, %k1, %k1 @@ -15,7 +16,6 @@ define <2 x double>@test_int_x86_avx512_mask_vextractf64x2_512(<8 x double> %x0, ; CHECK-NEXT: kmovw %k1, %ecx ; CHECK-NEXT: vmovd %ecx, %xmm2 ; CHECK-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2 -; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm0 ; CHECK-NEXT: vpsllq $63, %xmm2, %xmm2 ; CHECK-NEXT: vpsraq $63, %zmm2, %zmm2 ; CHECK-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm1 -- 2.50.1