From: Simon Pilgrim Date: Wed, 6 Dec 2017 17:59:26 +0000 (+0000) Subject: [X86][AVX512] Tag aligned/unaligned move instruction scheduler classes X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=47aefda7c625f654f35bacdac92d2c053e302cf8;p=llvm [X86][AVX512] Tag aligned/unaligned move instruction scheduler classes git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@319913 91177308-0d34-0410-b5e6-96231b3b80d8 --- diff --git a/lib/Target/X86/X86InstrAVX512.td b/lib/Target/X86/X86InstrAVX512.td index 2f4dd099284..c85ebf14d26 100644 --- a/lib/Target/X86/X86InstrAVX512.td +++ b/lib/Target/X86/X86InstrAVX512.td @@ -3148,42 +3148,40 @@ defm : mask_shift_lowering, Requires<[HasAVX512]>; // -multiclass avx512_load opc, string OpcodeStr, X86VectorVTInfo _, - PatFrag ld_frag, PatFrag mload, - bit NoRMPattern = 0, - SDPatternOperator SelectOprr = vselect> { +multiclass avx512_load opc, string OpcodeStr, MoveLoadStoreItins itins, + X86VectorVTInfo _, PatFrag ld_frag, PatFrag mload, + bit NoRMPattern = 0, + SDPatternOperator SelectOprr = vselect> { let hasSideEffects = 0 in { def rr : AVX512PI, EVEX; + _.ExeDomain, itins.rr>, EVEX, Sched<[WriteMove]>; def rrkz : AVX512PI, - EVEX, EVEX_KZ; + _.ImmAllZerosV)))], _.ExeDomain, + itins.rr>, EVEX, EVEX_KZ, Sched<[WriteMove]>; - let mayLoad = 1, canFoldAsLoad = 1, isReMaterializable = 1, - SchedRW = [WriteLoad] in + let mayLoad = 1, canFoldAsLoad = 1, isReMaterializable = 1 in def rm : AVX512PI, EVEX; + _.ExeDomain, itins.rm>, EVEX, Sched<[WriteLoad]>; let Constraints = "$src0 = $dst", isConvertibleToThreeAddress = 1 in { - def rrk : AVX512PI, - EVEX, EVEX_K; - let SchedRW = [WriteLoad] in + def rrk : AVX512PI, EVEX, EVEX_K, Sched<[WriteMove]>; def rmk : AVX512PI opc, string OpcodeStr, X86VectorVTInfo _, [(set _.RC:$dst, (_.VT (vselect _.KRCWM:$mask, (_.VT (bitconvert (ld_frag addr:$src1))), - (_.VT _.RC:$src0))))], _.ExeDomain>, EVEX, EVEX_K; + (_.VT _.RC:$src0))))], _.ExeDomain, itins.rm>, + EVEX, EVEX_K, Sched<[WriteLoad]>; } - let SchedRW = [WriteLoad] in def rmkz : AVX512PI, EVEX, EVEX_KZ; + _.ExeDomain, itins.rm>, EVEX, EVEX_KZ, Sched<[WriteLoad]>; } def : Pat<(_.VT (mload addr:$ptr, _.KRCWM:$mask, undef)), (!cast(NAME#_.ZSuffix##rmkz) _.KRCWM:$mask, addr:$ptr)>; @@ -3217,14 +3215,17 @@ multiclass avx512_alignedload_vl opc, string OpcodeStr, AVX512VLVectorVTInfo _, Predicate prd> { let Predicates = [prd] in - defm Z : avx512_load, EVEX_V512; + defm Z : avx512_load, + EVEX_V512; let Predicates = [prd, HasVLX] in { - defm Z256 : avx512_load, EVEX_V256; - defm Z128 : avx512_load, EVEX_V128; + defm Z256 : avx512_load, + EVEX_V256; + defm Z128 : avx512_load, + EVEX_V128; } } @@ -3234,38 +3235,40 @@ multiclass avx512_load_vl opc, string OpcodeStr, bit NoRMPattern = 0, SDPatternOperator SelectOprr = vselect> { let Predicates = [prd] in - defm Z : avx512_load, EVEX_V512; let Predicates = [prd, HasVLX] in { - defm Z256 : avx512_load, EVEX_V256; - defm Z128 : avx512_load, EVEX_V128; } } -multiclass avx512_store opc, string OpcodeStr, X86VectorVTInfo _, - PatFrag st_frag, PatFrag mstore, string Name, - bit NoMRPattern = 0> { - +multiclass avx512_store opc, string OpcodeStr, MoveLoadStoreItins itins, + X86VectorVTInfo _, PatFrag st_frag, PatFrag mstore, + string Name, bit NoMRPattern = 0> { let hasSideEffects = 0 in { def rr_REV : AVX512PI, EVEX, FoldGenData; + [], _.ExeDomain, itins.rr>, EVEX, FoldGenData, + Sched<[WriteMove]>; def rrk_REV : AVX512PI, EVEX, EVEX_K, FoldGenData; + [], _.ExeDomain, itins.rr>, EVEX, EVEX_K, + FoldGenData, 
Sched<[WriteMove]>; def rrkz_REV : AVX512PI, EVEX, EVEX_KZ, FoldGenData; + [], _.ExeDomain, itins.rr>, EVEX, EVEX_KZ, + FoldGenData, Sched<[WriteMove]>; } let hasSideEffects = 0, mayStore = 1 in @@ -3273,11 +3276,11 @@ multiclass avx512_store opc, string OpcodeStr, X86VectorVTInfo _, !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), !if(NoMRPattern, [], [(st_frag (_.VT _.RC:$src), addr:$dst)]), - _.ExeDomain>, EVEX; + _.ExeDomain, itins.mr>, EVEX, Sched<[WriteStore]>; def mrk : AVX512PI, EVEX, EVEX_K; + [], _.ExeDomain, itins.mr>, EVEX, EVEX_K, Sched<[WriteStore]>; def: Pat<(mstore addr:$ptr, _.KRCWM:$mask, (_.VT _.RC:$src)), (!cast(NAME#_.ZSuffix##mrk) addr:$ptr, @@ -3289,14 +3292,14 @@ multiclass avx512_store_vl< bits<8> opc, string OpcodeStr, AVX512VLVectorVTInfo _, Predicate prd, string Name, bit NoMRPattern = 0> { let Predicates = [prd] in - defm Z : avx512_store, EVEX_V512; let Predicates = [prd, HasVLX] in { - defm Z256 : avx512_store, EVEX_V256; - defm Z128 : avx512_store, EVEX_V128; } @@ -3306,13 +3309,13 @@ multiclass avx512_alignedstore_vl opc, string OpcodeStr, AVX512VLVectorVTInfo _, Predicate prd, string Name> { let Predicates = [prd] in - defm Z : avx512_store, EVEX_V512; let Predicates = [prd, HasVLX] in { - defm Z256 : avx512_store, EVEX_V256; - defm Z128 : avx512_store, EVEX_V128; } } @@ -3381,24 +3384,24 @@ defm VMOVDQU64 : avx512_load_vl<0x6F, "vmovdqu64", avx512vl_i64_info, HasAVX512, let isReMaterializable = 1, canFoldAsLoad = 1, isPseudo = 1, SchedRW = [WriteLoad], mayLoad = 1, hasSideEffects = 0 in { def VMOVAPSZ128rm_NOVLX : I<0, Pseudo, (outs VR128X:$dst), (ins f128mem:$src), - "", []>; + "", [], IIC_SSE_MOVA_P_RM>; def VMOVAPSZ256rm_NOVLX : I<0, Pseudo, (outs VR256X:$dst), (ins f256mem:$src), - "", []>; + "", [], IIC_SSE_MOVA_P_RM>; def VMOVUPSZ128rm_NOVLX : I<0, Pseudo, (outs VR128X:$dst), (ins f128mem:$src), - "", []>; + "", [], IIC_SSE_MOVA_P_RM>; def VMOVUPSZ256rm_NOVLX : I<0, Pseudo, (outs VR256X:$dst), (ins f256mem:$src), - "", []>; + "", [], IIC_SSE_MOVA_P_RM>; } -let isPseudo = 1, mayStore = 1, hasSideEffects = 0 in { +let isPseudo = 1, SchedRW = [WriteStore], mayStore = 1, hasSideEffects = 0 in { def VMOVAPSZ128mr_NOVLX : I<0, Pseudo, (outs), (ins f128mem:$dst, VR128X:$src), - "", []>; + "", [], IIC_SSE_MOVA_P_MR>; def VMOVAPSZ256mr_NOVLX : I<0, Pseudo, (outs), (ins f256mem:$dst, VR256X:$src), - "", []>; + "", [], IIC_SSE_MOVA_P_MR>; def VMOVUPSZ128mr_NOVLX : I<0, Pseudo, (outs), (ins f128mem:$dst, VR128X:$src), - "", []>; + "", [], IIC_SSE_MOVA_P_MR>; def VMOVUPSZ256mr_NOVLX : I<0, Pseudo, (outs), (ins f256mem:$dst, VR256X:$src), - "", []>; + "", [], IIC_SSE_MOVA_P_MR>; } def : Pat<(v8i64 (vselect VK8WM:$mask, (bc_v8i64 (v16i32 immAllZerosV)), @@ -3907,8 +3910,8 @@ let hasSideEffects = 0 in { def VMOVSSZrr_REV: AVX512<0x11, MRMDestReg, (outs VR128X:$dst), (ins VR128X:$src1, VR128X:$src2), "vmovss.s\t{$src2, $src1, $dst|$dst, $src1, $src2}", - [], NoItinerary>, XS, EVEX_4V, VEX_LIG, - FoldGenData<"VMOVSSZrr">; + [], IIC_SSE_MOV_S_RR>, XS, EVEX_4V, VEX_LIG, + FoldGenData<"VMOVSSZrr">, Sched<[WriteMove]>; let Constraints = "$src0 = $dst" in def VMOVSSZrrk_REV: AVX512<0x11, MRMDestReg, (outs VR128X:$dst), @@ -3916,21 +3919,21 @@ let Constraints = "$src0 = $dst" in VR128X:$src1, VR128X:$src2), "vmovss.s\t{$src2, $src1, $dst {${mask}}|"# "$dst {${mask}}, $src1, $src2}", - [], NoItinerary>, EVEX_K, XS, EVEX_4V, VEX_LIG, - FoldGenData<"VMOVSSZrrk">; + [], IIC_SSE_MOV_S_RR>, EVEX_K, XS, EVEX_4V, VEX_LIG, + FoldGenData<"VMOVSSZrrk">, Sched<[WriteMove]>; def 
VMOVSSZrrkz_REV: AVX512<0x11, MRMDestReg, (outs VR128X:$dst), (ins f32x_info.KRCWM:$mask, VR128X:$src1, VR128X:$src2), "vmovss.s\t{$src2, $src1, $dst {${mask}} {z}|"# "$dst {${mask}} {z}, $src1, $src2}", - [], NoItinerary>, EVEX_KZ, XS, EVEX_4V, VEX_LIG, - FoldGenData<"VMOVSSZrrkz">; + [], IIC_SSE_MOV_S_RR>, EVEX_KZ, XS, EVEX_4V, VEX_LIG, + FoldGenData<"VMOVSSZrrkz">, Sched<[WriteMove]>; def VMOVSDZrr_REV: AVX512<0x11, MRMDestReg, (outs VR128X:$dst), (ins VR128X:$src1, VR128X:$src2), "vmovsd.s\t{$src2, $src1, $dst|$dst, $src1, $src2}", - [], NoItinerary>, XD, EVEX_4V, VEX_LIG, VEX_W, - FoldGenData<"VMOVSDZrr">; + [], IIC_SSE_MOV_S_RR>, XD, EVEX_4V, VEX_LIG, VEX_W, + FoldGenData<"VMOVSDZrr">, Sched<[WriteMove]>; let Constraints = "$src0 = $dst" in def VMOVSDZrrk_REV: AVX512<0x11, MRMDestReg, (outs VR128X:$dst), @@ -3938,16 +3941,16 @@ let Constraints = "$src0 = $dst" in VR128X:$src1, VR128X:$src2), "vmovsd.s\t{$src2, $src1, $dst {${mask}}|"# "$dst {${mask}}, $src1, $src2}", - [], NoItinerary>, EVEX_K, XD, EVEX_4V, VEX_LIG, - VEX_W, FoldGenData<"VMOVSDZrrk">; + [], IIC_SSE_MOV_S_RR>, EVEX_K, XD, EVEX_4V, VEX_LIG, + VEX_W, FoldGenData<"VMOVSDZrrk">, Sched<[WriteMove]>; def VMOVSDZrrkz_REV: AVX512<0x11, MRMDestReg, (outs VR128X:$dst), (ins f64x_info.KRCWM:$mask, VR128X:$src1, VR128X:$src2), "vmovsd.s\t{$src2, $src1, $dst {${mask}} {z}|"# "$dst {${mask}} {z}, $src1, $src2}", - [], NoItinerary>, EVEX_KZ, XD, EVEX_4V, VEX_LIG, - VEX_W, FoldGenData<"VMOVSDZrrkz">; + [], IIC_SSE_MOV_S_RR>, EVEX_KZ, XD, EVEX_4V, VEX_LIG, + VEX_W, FoldGenData<"VMOVSDZrrkz">, Sched<[WriteMove]>; } let Predicates = [HasAVX512] in { diff --git a/lib/Target/X86/X86InstrSSE.td b/lib/Target/X86/X86InstrSSE.td index d3a61f7b0e1..6616b2c3258 100644 --- a/lib/Target/X86/X86InstrSSE.td +++ b/lib/Target/X86/X86InstrSSE.td @@ -25,9 +25,15 @@ class SizeItins { OpndItins d = arg_d; } +class MoveLoadStoreItins { + InstrItinClass rr = arg_rr; + InstrItinClass rm = arg_rm; + InstrItinClass mr = arg_mr; +} class ShiftOpndItins { + InstrItinClass arg_ri> { InstrItinClass rr = arg_rr; InstrItinClass rm = arg_rm; InstrItinClass ri = arg_ri; @@ -152,10 +158,18 @@ def SSE_MOVA_ITINS : OpndItins< IIC_SSE_MOVA_P_RR, IIC_SSE_MOVA_P_RM >; +def SSE_MOVA : MoveLoadStoreItins< + IIC_SSE_MOVA_P_RR, IIC_SSE_MOVA_P_RM, IIC_SSE_MOVA_P_MR +>; + def SSE_MOVU_ITINS : OpndItins< IIC_SSE_MOVU_P_RR, IIC_SSE_MOVU_P_RM >; +def SSE_MOVU : MoveLoadStoreItins< + IIC_SSE_MOVU_P_RR, IIC_SSE_MOVU_P_RM, IIC_SSE_MOVU_P_MR +>; + def SSE_DPPD_ITINS : OpndItins< IIC_SSE_DPPD_RR, IIC_SSE_DPPD_RM >; diff --git a/test/CodeGen/X86/avx512-schedule.ll b/test/CodeGen/X86/avx512-schedule.ll index b826ee48ee9..b447a0dcbea 100755 --- a/test/CodeGen/X86/avx512-schedule.ll +++ b/test/CodeGen/X86/avx512-schedule.ll @@ -986,7 +986,7 @@ define <8 x double> @test_mask_broadcast_vaddpd(<8 x double> %dst, <8 x double> ; GENERIC-NEXT: vpxor %xmm0, %xmm0, %xmm0 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpneqq %zmm0, %zmm2, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vaddpd (%rdi){1to8}, %zmm1, %zmm1 {%k1} # sched: [7:1.00] -; GENERIC-NEXT: vmovapd %zmm1, %zmm0 +; GENERIC-NEXT: vmovapd %zmm1, %zmm0 # sched: [1:0.33] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_mask_broadcast_vaddpd: @@ -994,7 +994,7 @@ define <8 x double> @test_mask_broadcast_vaddpd(<8 x double> %dst, <8 x double> ; SKX-NEXT: vpxor %xmm0, %xmm0, %xmm0 # sched: [1:0.33] ; SKX-NEXT: vpcmpneqq %zmm0, %zmm2, %k1 # sched: [3:1.00] ; SKX-NEXT: vaddpd (%rdi){1to8}, %zmm1, %zmm1 {%k1} # sched: [11:0.50] -; SKX-NEXT: vmovapd 
%zmm1, %zmm0 +; SKX-NEXT: vmovapd %zmm1, %zmm0 # sched: [1:0.25] ; SKX-NEXT: retq # sched: [7:1.00] %mask = icmp ne <8 x i64> %mask1, zeroinitializer %tmp = load double, double* %j @@ -2082,7 +2082,7 @@ define <16 x double> @uito16f64(<16 x i32> %a) nounwind { ; GENERIC-NEXT: vcvtudq2pd %ymm0, %zmm2 # sched: [4:1.00] ; GENERIC-NEXT: vextractf64x4 $1, %zmm0, %ymm0 # sched: [1:1.00] ; GENERIC-NEXT: vcvtudq2pd %ymm0, %zmm1 # sched: [4:1.00] -; GENERIC-NEXT: vmovaps %zmm2, %zmm0 +; GENERIC-NEXT: vmovaps %zmm2, %zmm0 # sched: [1:0.33] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: uito16f64: @@ -2796,7 +2796,7 @@ define <16 x double> @sito16f64(<16 x i32> %a) { ; GENERIC-NEXT: vcvtdq2pd %ymm0, %zmm2 # sched: [4:1.00] ; GENERIC-NEXT: vextractf64x4 $1, %zmm0, %ymm0 # sched: [1:1.00] ; GENERIC-NEXT: vcvtdq2pd %ymm0, %zmm1 # sched: [4:1.00] -; GENERIC-NEXT: vmovaps %zmm2, %zmm0 +; GENERIC-NEXT: vmovaps %zmm2, %zmm0 # sched: [1:0.33] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: sito16f64: @@ -4528,7 +4528,7 @@ define void @extload_v8i64(<8 x i8>* %a, <8 x i64>* %res) { ; GENERIC-LABEL: extload_v8i64: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpmovsxbq (%rdi), %zmm0 # sched: [5:1.00] -; GENERIC-NEXT: vmovdqa64 %zmm0, (%rsi) +; GENERIC-NEXT: vmovdqa64 %zmm0, (%rsi) # sched: [1:1.00] ; GENERIC-NEXT: vzeroupper ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -4549,18 +4549,18 @@ define <64 x i16> @test21(<64 x i16> %x , <64 x i1> %mask) nounwind readnone { ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpsllw $7, %zmm2, %zmm2 # sched: [3:1.00] ; GENERIC-NEXT: vpmovb2m %zmm2, %k1 # sched: [1:0.33] -; GENERIC-NEXT: vmovdqu16 %zmm0, %zmm0 {%k1} {z} +; GENERIC-NEXT: vmovdqu16 %zmm0, %zmm0 {%k1} {z} # sched: [1:0.33] ; GENERIC-NEXT: kshiftrq $32, %k1, %k1 -; GENERIC-NEXT: vmovdqu16 %zmm1, %zmm1 {%k1} {z} +; GENERIC-NEXT: vmovdqu16 %zmm1, %zmm1 {%k1} {z} # sched: [1:0.33] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test21: ; SKX: # %bb.0: ; SKX-NEXT: vpsllw $7, %zmm2, %zmm2 # sched: [1:0.50] ; SKX-NEXT: vpmovb2m %zmm2, %k1 # sched: [1:1.00] -; SKX-NEXT: vmovdqu16 %zmm0, %zmm0 {%k1} {z} +; SKX-NEXT: vmovdqu16 %zmm0, %zmm0 {%k1} {z} # sched: [1:0.25] ; SKX-NEXT: kshiftrq $32, %k1, %k1 # sched: [3:1.00] -; SKX-NEXT: vmovdqu16 %zmm1, %zmm1 {%k1} {z} +; SKX-NEXT: vmovdqu16 %zmm1, %zmm1 {%k1} {z} # sched: [1:0.25] ; SKX-NEXT: retq # sched: [7:1.00] %ret = select <64 x i1> %mask, <64 x i16> %x, <64 x i16> zeroinitializer ret <64 x i16> %ret @@ -5033,7 +5033,7 @@ define <16 x float> @test_x86_fmadd231_ps(<16 x float> %a0, <16 x float> %a1, <1 ; GENERIC-NEXT: vpmovb2m %xmm2, %k1 # sched: [1:0.33] ; GENERIC-NEXT: vmulps (%rdi), %zmm0, %zmm0 # sched: [9:1.00] ; GENERIC-NEXT: vaddps %zmm1, %zmm0, %zmm1 {%k1} # sched: [3:1.00] -; GENERIC-NEXT: vmovaps %zmm1, %zmm0 +; GENERIC-NEXT: vmovaps %zmm1, %zmm0 # sched: [1:0.33] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_x86_fmadd231_ps: @@ -5059,7 +5059,7 @@ define <16 x float> @test_x86_fmadd213_ps(<16 x float> %a0, <16 x float> %a1, <1 ; GENERIC-NEXT: vpmovb2m %xmm2, %k1 # sched: [1:0.33] ; GENERIC-NEXT: vmulps %zmm0, %zmm1, %zmm0 # sched: [5:1.00] ; GENERIC-NEXT: vaddps (%rdi), %zmm0, %zmm1 {%k1} # sched: [7:1.00] -; GENERIC-NEXT: vmovaps %zmm1, %zmm0 +; GENERIC-NEXT: vmovaps %zmm1, %zmm0 # sched: [1:0.33] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_x86_fmadd213_ps: @@ -6175,7 +6175,7 @@ define <16 x i32> @mov_test17(i8 * %addr) { define void @mov_test18(i8 * %addr, <8 x i64> %data) { ; GENERIC-LABEL: mov_test18: ; GENERIC: # %bb.0: -; 
GENERIC-NEXT: vmovaps %zmm0, (%rdi) +; GENERIC-NEXT: vmovaps %zmm0, (%rdi) # sched: [1:1.00] ; GENERIC-NEXT: vzeroupper ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -6192,7 +6192,7 @@ define void @mov_test18(i8 * %addr, <8 x i64> %data) { define void @mov_test19(i8 * %addr, <16 x i32> %data) { ; GENERIC-LABEL: mov_test19: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vmovups %zmm0, (%rdi) +; GENERIC-NEXT: vmovups %zmm0, (%rdi) # sched: [1:1.00] ; GENERIC-NEXT: vzeroupper ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -6209,7 +6209,7 @@ define void @mov_test19(i8 * %addr, <16 x i32> %data) { define void @mov_test20(i8 * %addr, <16 x i32> %data) { ; GENERIC-LABEL: mov_test20: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vmovaps %zmm0, (%rdi) +; GENERIC-NEXT: vmovaps %zmm0, (%rdi) # sched: [1:1.00] ; GENERIC-NEXT: vzeroupper ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -6241,7 +6241,7 @@ define <8 x i64> @mov_test21(i8 * %addr) { define void @mov_test22(i8 * %addr, <8 x i64> %data) { ; GENERIC-LABEL: mov_test22: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vmovups %zmm0, (%rdi) +; GENERIC-NEXT: vmovups %zmm0, (%rdi) # sched: [1:1.00] ; GENERIC-NEXT: vzeroupper ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -6273,7 +6273,7 @@ define <8 x i64> @mov_test23(i8 * %addr) { define void @mov_test24(i8 * %addr, <8 x double> %data) { ; GENERIC-LABEL: mov_test24: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vmovaps %zmm0, (%rdi) +; GENERIC-NEXT: vmovaps %zmm0, (%rdi) # sched: [1:1.00] ; GENERIC-NEXT: vzeroupper ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -6305,7 +6305,7 @@ define <8 x double> @mov_test25(i8 * %addr) { define void @mov_test26(i8 * %addr, <16 x float> %data) { ; GENERIC-LABEL: mov_test26: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vmovaps %zmm0, (%rdi) +; GENERIC-NEXT: vmovaps %zmm0, (%rdi) # sched: [1:1.00] ; GENERIC-NEXT: vzeroupper ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -6337,7 +6337,7 @@ define <16 x float> @mov_test27(i8 * %addr) { define void @mov_test28(i8 * %addr, <8 x double> %data) { ; GENERIC-LABEL: mov_test28: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vmovups %zmm0, (%rdi) +; GENERIC-NEXT: vmovups %zmm0, (%rdi) # sched: [1:1.00] ; GENERIC-NEXT: vzeroupper ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -6369,7 +6369,7 @@ define <8 x double> @mov_test29(i8 * %addr) { define void @mov_test30(i8 * %addr, <16 x float> %data) { ; GENERIC-LABEL: mov_test30: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vmovups %zmm0, (%rdi) +; GENERIC-NEXT: vmovups %zmm0, (%rdi) # sched: [1:1.00] ; GENERIC-NEXT: vzeroupper ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -7454,14 +7454,14 @@ define <32 x i16> @vmov_test21(<32 x i16> %x , <32 x i1> %mask) nounwind readnon ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpsllw $7, %ymm1, %ymm1 # sched: [1:1.00] ; GENERIC-NEXT: vpmovb2m %ymm1, %k1 # sched: [1:0.33] -; GENERIC-NEXT: vmovdqu16 %zmm0, %zmm0 {%k1} {z} +; GENERIC-NEXT: vmovdqu16 %zmm0, %zmm0 {%k1} {z} # sched: [1:0.33] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: vmov_test21: ; SKX: # %bb.0: ; SKX-NEXT: vpsllw $7, %ymm1, %ymm1 # sched: [1:0.50] ; SKX-NEXT: vpmovb2m %ymm1, %k1 # sched: [1:1.00] -; SKX-NEXT: vmovdqu16 %zmm0, %zmm0 {%k1} {z} +; SKX-NEXT: vmovdqu16 %zmm0, %zmm0 {%k1} {z} # sched: [1:0.25] ; SKX-NEXT: retq # sched: [7:1.00] %ret = select <32 x i1> %mask, <32 x i16> %x, <32 x i16> zeroinitializer ret <32 x i16> %ret @@ -7686,7 +7686,7 @@ define <32 x i16> @test_build_vec_v32i1(<32 x i16> %x) { ; GENERIC-NEXT: movl $1497715861, %eax # imm = 0x59455495 ; GENERIC-NEXT: # sched: [1:0.33] ; GENERIC-NEXT: kmovd %eax, %k1 -; GENERIC-NEXT: vmovdqu16 
%zmm0, %zmm0 {%k1} {z} +; GENERIC-NEXT: vmovdqu16 %zmm0, %zmm0 {%k1} {z} # sched: [1:0.33] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_build_vec_v32i1: @@ -7694,7 +7694,7 @@ define <32 x i16> @test_build_vec_v32i1(<32 x i16> %x) { ; SKX-NEXT: movl $1497715861, %eax # imm = 0x59455495 ; SKX-NEXT: # sched: [1:0.25] ; SKX-NEXT: kmovd %eax, %k1 # sched: [1:1.00] -; SKX-NEXT: vmovdqu16 %zmm0, %zmm0 {%k1} {z} +; SKX-NEXT: vmovdqu16 %zmm0, %zmm0 {%k1} {z} # sched: [1:0.25] ; SKX-NEXT: retq # sched: [7:1.00] %ret = select <32 x i1> , <32 x i16> %x, <32 x i16> zeroinitializer ret <32 x i16> %ret @@ -7724,11 +7724,11 @@ define void @ktest_1(<8 x double> %in, double * %base) { ; GENERIC-NEXT: ktestb %k0, %k0 ; GENERIC-NEXT: je .LBB410_2 # sched: [1:1.00] ; GENERIC-NEXT: # %bb.1: # %L1 -; GENERIC-NEXT: vmovapd %zmm0, (%rdi) +; GENERIC-NEXT: vmovapd %zmm0, (%rdi) # sched: [1:1.00] ; GENERIC-NEXT: vzeroupper ; GENERIC-NEXT: retq # sched: [1:1.00] ; GENERIC-NEXT: .LBB410_2: # %L2 -; GENERIC-NEXT: vmovapd %zmm0, 8(%rdi) +; GENERIC-NEXT: vmovapd %zmm0, 8(%rdi) # sched: [1:1.00] ; GENERIC-NEXT: vzeroupper ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -7793,13 +7793,13 @@ define void @ktest_2(<32 x float> %in, float * %base) { ; GENERIC-NEXT: ktestd %k0, %k0 ; GENERIC-NEXT: je .LBB411_2 # sched: [1:1.00] ; GENERIC-NEXT: # %bb.1: # %L1 -; GENERIC-NEXT: vmovaps %zmm0, (%rdi) -; GENERIC-NEXT: vmovaps %zmm1, 64(%rdi) +; GENERIC-NEXT: vmovaps %zmm0, (%rdi) # sched: [1:1.00] +; GENERIC-NEXT: vmovaps %zmm1, 64(%rdi) # sched: [1:1.00] ; GENERIC-NEXT: vzeroupper ; GENERIC-NEXT: retq # sched: [1:1.00] ; GENERIC-NEXT: .LBB411_2: # %L2 -; GENERIC-NEXT: vmovaps %zmm0, 4(%rdi) -; GENERIC-NEXT: vmovaps %zmm1, 68(%rdi) +; GENERIC-NEXT: vmovaps %zmm0, 4(%rdi) # sched: [1:1.00] +; GENERIC-NEXT: vmovaps %zmm1, 68(%rdi) # sched: [1:1.00] ; GENERIC-NEXT: vzeroupper ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -8341,7 +8341,7 @@ define <16 x float> @_ss16xfloat_mask(float %a, <16 x float> %i, <16 x i32> %m ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpneqd %zmm3, %zmm2, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vbroadcastss %xmm0, %zmm1 {%k1} # sched: [1:1.00] -; GENERIC-NEXT: vmovaps %zmm1, %zmm0 +; GENERIC-NEXT: vmovaps %zmm1, %zmm0 # sched: [1:0.33] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: _ss16xfloat_mask: @@ -8460,7 +8460,7 @@ define <8 x double> @_sd8xdouble_mask(double %a, <8 x double> %i, <8 x i32> %m ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpneqd %ymm3, %ymm2, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vbroadcastsd %xmm0, %zmm1 {%k1} # sched: [1:1.00] -; GENERIC-NEXT: vmovapd %zmm1, %zmm0 +; GENERIC-NEXT: vmovapd %zmm1, %zmm0 # sched: [1:0.33] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: _sd8xdouble_mask: @@ -8468,7 +8468,7 @@ define <8 x double> @_sd8xdouble_mask(double %a, <8 x double> %i, <8 x i32> %m ; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] ; SKX-NEXT: vpcmpneqd %ymm3, %ymm2, %k1 # sched: [3:1.00] ; SKX-NEXT: vbroadcastsd %xmm0, %zmm1 {%k1} # sched: [3:1.00] -; SKX-NEXT: vmovapd %zmm1, %zmm0 +; SKX-NEXT: vmovapd %zmm1, %zmm0 # sched: [1:0.25] ; SKX-NEXT: retq # sched: [7:1.00] %mask = icmp ne <8 x i32> %mask1, zeroinitializer %b = insertelement <8 x double> undef, double %a, i32 0 @@ -8593,7 +8593,7 @@ define <16 x i32> @test_vbroadcast() { ; GENERIC-NEXT: vcmpunordps %zmm0, %zmm0, %k0 # sched: [3:1.00] ; GENERIC-NEXT: vpmovm2d %k0, %zmm0 # sched: [1:0.33] ; GENERIC-NEXT: knotw %k0, %k1 -; GENERIC-NEXT: 
vmovdqa32 %zmm0, %zmm0 {%k1} {z} +; GENERIC-NEXT: vmovdqa32 %zmm0, %zmm0 {%k1} {z} # sched: [1:0.33] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_vbroadcast: @@ -8602,7 +8602,7 @@ define <16 x i32> @test_vbroadcast() { ; SKX-NEXT: vcmpunordps %zmm0, %zmm0, %k0 # sched: [3:1.00] ; SKX-NEXT: vpmovm2d %k0, %zmm0 # sched: [1:0.25] ; SKX-NEXT: knotw %k0, %k1 # sched: [1:1.00] -; SKX-NEXT: vmovdqa32 %zmm0, %zmm0 {%k1} {z} +; SKX-NEXT: vmovdqa32 %zmm0, %zmm0 {%k1} {z} # sched: [1:0.25] ; SKX-NEXT: retq # sched: [7:1.00] entry: %0 = sext <16 x i1> zeroinitializer to <16 x i32> diff --git a/test/CodeGen/X86/avx512-shuffle-schedule.ll b/test/CodeGen/X86/avx512-shuffle-schedule.ll index 6c351d2103d..820e74015c1 100755 --- a/test/CodeGen/X86/avx512-shuffle-schedule.ll +++ b/test/CodeGen/X86/avx512-shuffle-schedule.ll @@ -452,7 +452,7 @@ define <32 x i16> @test_masked_32xi16_perm_mask0(<32 x i16> %vec, <32 x i16> %ve ; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqw %zmm4, %zmm2, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vpermw %zmm0, %zmm3, %zmm1 {%k1} # sched: [1:1.00] -; GENERIC-NEXT: vmovdqa64 %zmm1, %zmm0 +; GENERIC-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_32xi16_perm_mask0: @@ -461,7 +461,7 @@ define <32 x i16> @test_masked_32xi16_perm_mask0(<32 x i16> %vec, <32 x i16> %ve ; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] ; SKX-NEXT: vpcmpeqw %zmm4, %zmm2, %k1 # sched: [3:1.00] ; SKX-NEXT: vpermw %zmm0, %zmm3, %zmm1 {%k1} # sched: [6:2.00] -; SKX-NEXT: vmovdqa64 %zmm1, %zmm0 +; SKX-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.25] ; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> %cmp = icmp eq <32 x i16> %mask, zeroinitializer @@ -497,7 +497,7 @@ define <32 x i16> @test_masked_32xi16_perm_mask1(<32 x i16> %vec, <32 x i16> %ve ; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqw %zmm4, %zmm2, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vpermw %zmm0, %zmm3, %zmm1 {%k1} # sched: [1:1.00] -; GENERIC-NEXT: vmovdqa64 %zmm1, %zmm0 +; GENERIC-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_32xi16_perm_mask1: @@ -506,7 +506,7 @@ define <32 x i16> @test_masked_32xi16_perm_mask1(<32 x i16> %vec, <32 x i16> %ve ; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] ; SKX-NEXT: vpcmpeqw %zmm4, %zmm2, %k1 # sched: [3:1.00] ; SKX-NEXT: vpermw %zmm0, %zmm3, %zmm1 {%k1} # sched: [6:2.00] -; SKX-NEXT: vmovdqa64 %zmm1, %zmm0 +; SKX-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.25] ; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> %cmp = icmp eq <32 x i16> %mask, zeroinitializer @@ -542,7 +542,7 @@ define <32 x i16> @test_masked_32xi16_perm_mask2(<32 x i16> %vec, <32 x i16> %ve ; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqw %zmm4, %zmm2, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vpermw %zmm0, %zmm3, %zmm1 {%k1} # sched: [1:1.00] -; GENERIC-NEXT: vmovdqa64 %zmm1, %zmm0 +; GENERIC-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_32xi16_perm_mask2: @@ -551,7 +551,7 @@ define <32 x i16> @test_masked_32xi16_perm_mask2(<32 x i16> %vec, <32 x i16> %ve ; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] ; SKX-NEXT: vpcmpeqw %zmm4, %zmm2, %k1 # sched: [3:1.00] ; SKX-NEXT: vpermw %zmm0, %zmm3, %zmm1 {%k1} # sched: 
[6:2.00] -; SKX-NEXT: vmovdqa64 %zmm1, %zmm0 +; SKX-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.25] ; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> %cmp = icmp eq <32 x i16> %mask, zeroinitializer @@ -602,7 +602,7 @@ define <32 x i16> @test_masked_32xi16_perm_mask3(<32 x i16> %vec, <32 x i16> %ve ; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqw %zmm4, %zmm2, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vpermw %zmm0, %zmm3, %zmm1 {%k1} # sched: [1:1.00] -; GENERIC-NEXT: vmovdqa64 %zmm1, %zmm0 +; GENERIC-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_32xi16_perm_mask3: @@ -611,7 +611,7 @@ define <32 x i16> @test_masked_32xi16_perm_mask3(<32 x i16> %vec, <32 x i16> %ve ; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] ; SKX-NEXT: vpcmpeqw %zmm4, %zmm2, %k1 # sched: [3:1.00] ; SKX-NEXT: vpermw %zmm0, %zmm3, %zmm1 {%k1} # sched: [6:2.00] -; SKX-NEXT: vmovdqa64 %zmm1, %zmm0 +; SKX-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.25] ; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> %cmp = icmp eq <32 x i16> %mask, zeroinitializer @@ -1304,7 +1304,7 @@ define <16 x i32> @test_masked_16xi32_perm_mask0(<16 x i32> %vec, <16 x i32> %ve ; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqd %zmm4, %zmm2, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vpermd %zmm0, %zmm3, %zmm1 {%k1} # sched: [1:1.00] -; GENERIC-NEXT: vmovdqa64 %zmm1, %zmm0 +; GENERIC-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_16xi32_perm_mask0: @@ -1313,7 +1313,7 @@ define <16 x i32> @test_masked_16xi32_perm_mask0(<16 x i32> %vec, <16 x i32> %ve ; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] ; SKX-NEXT: vpcmpeqd %zmm4, %zmm2, %k1 # sched: [3:1.00] ; SKX-NEXT: vpermd %zmm0, %zmm3, %zmm1 {%k1} # sched: [3:1.00] -; SKX-NEXT: vmovdqa64 %zmm1, %zmm0 +; SKX-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.25] ; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> %cmp = icmp eq <16 x i32> %mask, zeroinitializer @@ -1349,7 +1349,7 @@ define <16 x i32> @test_masked_16xi32_perm_mask1(<16 x i32> %vec, <16 x i32> %ve ; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqd %zmm4, %zmm2, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vpermd %zmm0, %zmm3, %zmm1 {%k1} # sched: [1:1.00] -; GENERIC-NEXT: vmovdqa64 %zmm1, %zmm0 +; GENERIC-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_16xi32_perm_mask1: @@ -1358,7 +1358,7 @@ define <16 x i32> @test_masked_16xi32_perm_mask1(<16 x i32> %vec, <16 x i32> %ve ; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] ; SKX-NEXT: vpcmpeqd %zmm4, %zmm2, %k1 # sched: [3:1.00] ; SKX-NEXT: vpermd %zmm0, %zmm3, %zmm1 {%k1} # sched: [3:1.00] -; SKX-NEXT: vmovdqa64 %zmm1, %zmm0 +; SKX-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.25] ; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> %cmp = icmp eq <16 x i32> %mask, zeroinitializer @@ -1394,7 +1394,7 @@ define <16 x i32> @test_masked_16xi32_perm_mask2(<16 x i32> %vec, <16 x i32> %ve ; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqd %zmm4, %zmm2, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vpermd %zmm0, %zmm3, %zmm1 {%k1} # sched: [1:1.00] -; GENERIC-NEXT: vmovdqa64 
%zmm1, %zmm0 +; GENERIC-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_16xi32_perm_mask2: @@ -1403,7 +1403,7 @@ define <16 x i32> @test_masked_16xi32_perm_mask2(<16 x i32> %vec, <16 x i32> %ve ; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] ; SKX-NEXT: vpcmpeqd %zmm4, %zmm2, %k1 # sched: [3:1.00] ; SKX-NEXT: vpermd %zmm0, %zmm3, %zmm1 {%k1} # sched: [3:1.00] -; SKX-NEXT: vmovdqa64 %zmm1, %zmm0 +; SKX-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.25] ; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> %cmp = icmp eq <16 x i32> %mask, zeroinitializer @@ -1454,7 +1454,7 @@ define <16 x i32> @test_masked_16xi32_perm_mask3(<16 x i32> %vec, <16 x i32> %ve ; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqd %zmm4, %zmm2, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vpermd %zmm0, %zmm3, %zmm1 {%k1} # sched: [1:1.00] -; GENERIC-NEXT: vmovdqa64 %zmm1, %zmm0 +; GENERIC-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_16xi32_perm_mask3: @@ -1463,7 +1463,7 @@ define <16 x i32> @test_masked_16xi32_perm_mask3(<16 x i32> %vec, <16 x i32> %ve ; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] ; SKX-NEXT: vpcmpeqd %zmm4, %zmm2, %k1 # sched: [3:1.00] ; SKX-NEXT: vpermd %zmm0, %zmm3, %zmm1 {%k1} # sched: [3:1.00] -; SKX-NEXT: vmovdqa64 %zmm1, %zmm0 +; SKX-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.25] ; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> %cmp = icmp eq <16 x i32> %mask, zeroinitializer @@ -2116,7 +2116,7 @@ define <8 x i64> @test_masked_8xi64_perm_mask0(<8 x i64> %vec, <8 x i64> %vec2, ; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqq %zmm4, %zmm2, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vpermq %zmm0, %zmm3, %zmm1 {%k1} # sched: [1:1.00] -; GENERIC-NEXT: vmovdqa64 %zmm1, %zmm0 +; GENERIC-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_8xi64_perm_mask0: @@ -2125,7 +2125,7 @@ define <8 x i64> @test_masked_8xi64_perm_mask0(<8 x i64> %vec, <8 x i64> %vec2, ; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] ; SKX-NEXT: vpcmpeqq %zmm4, %zmm2, %k1 # sched: [3:1.00] ; SKX-NEXT: vpermq %zmm0, %zmm3, %zmm1 {%k1} # sched: [3:1.00] -; SKX-NEXT: vmovdqa64 %zmm1, %zmm0 +; SKX-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.25] ; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> %cmp = icmp eq <8 x i64> %mask, zeroinitializer @@ -2160,7 +2160,7 @@ define <8 x i64> @test_masked_8xi64_perm_imm_mask1(<8 x i64> %vec, <8 x i64> %ve ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vpermq {{.*#+}} zmm1 {%k1} = zmm0[1,0,1,1,5,4,5,5] sched: [1:1.00] -; GENERIC-NEXT: vmovdqa64 %zmm1, %zmm0 +; GENERIC-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_8xi64_perm_imm_mask1: @@ -2168,7 +2168,7 @@ define <8 x i64> @test_masked_8xi64_perm_imm_mask1(<8 x i64> %vec, <8 x i64> %ve ; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] ; SKX-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00] ; SKX-NEXT: vpermq {{.*#+}} zmm1 {%k1} = zmm0[1,0,1,1,5,4,5,5] sched: [3:1.00] -; SKX-NEXT: vmovdqa64 %zmm1, %zmm0 +; SKX-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.25] ; SKX-NEXT: retq 
# sched: [7:1.00] %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> %cmp = icmp eq <8 x i64> %mask, zeroinitializer @@ -2202,7 +2202,7 @@ define <8 x i64> @test_masked_8xi64_perm_mask2(<8 x i64> %vec, <8 x i64> %vec2, ; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqq %zmm4, %zmm2, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vpermq %zmm0, %zmm3, %zmm1 {%k1} # sched: [1:1.00] -; GENERIC-NEXT: vmovdqa64 %zmm1, %zmm0 +; GENERIC-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_8xi64_perm_mask2: @@ -2211,7 +2211,7 @@ define <8 x i64> @test_masked_8xi64_perm_mask2(<8 x i64> %vec, <8 x i64> %vec2, ; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] ; SKX-NEXT: vpcmpeqq %zmm4, %zmm2, %k1 # sched: [3:1.00] ; SKX-NEXT: vpermq %zmm0, %zmm3, %zmm1 {%k1} # sched: [3:1.00] -; SKX-NEXT: vmovdqa64 %zmm1, %zmm0 +; SKX-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.25] ; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> %cmp = icmp eq <8 x i64> %mask, zeroinitializer @@ -2259,7 +2259,7 @@ define <8 x i64> @test_masked_8xi64_perm_imm_mask3(<8 x i64> %vec, <8 x i64> %ve ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vpermq {{.*#+}} zmm1 {%k1} = zmm0[3,1,3,1,7,5,7,5] sched: [1:1.00] -; GENERIC-NEXT: vmovdqa64 %zmm1, %zmm0 +; GENERIC-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_8xi64_perm_imm_mask3: @@ -2267,7 +2267,7 @@ define <8 x i64> @test_masked_8xi64_perm_imm_mask3(<8 x i64> %vec, <8 x i64> %ve ; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] ; SKX-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00] ; SKX-NEXT: vpermq {{.*#+}} zmm1 {%k1} = zmm0[3,1,3,1,7,5,7,5] sched: [3:1.00] -; SKX-NEXT: vmovdqa64 %zmm1, %zmm0 +; SKX-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.25] ; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> %cmp = icmp eq <8 x i64> %mask, zeroinitializer @@ -2301,7 +2301,7 @@ define <8 x i64> @test_masked_8xi64_perm_mask4(<8 x i64> %vec, <8 x i64> %vec2, ; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqq %zmm4, %zmm2, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vpermq %zmm0, %zmm3, %zmm1 {%k1} # sched: [1:1.00] -; GENERIC-NEXT: vmovdqa64 %zmm1, %zmm0 +; GENERIC-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_8xi64_perm_mask4: @@ -2310,7 +2310,7 @@ define <8 x i64> @test_masked_8xi64_perm_mask4(<8 x i64> %vec, <8 x i64> %vec2, ; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] ; SKX-NEXT: vpcmpeqq %zmm4, %zmm2, %k1 # sched: [3:1.00] ; SKX-NEXT: vpermq %zmm0, %zmm3, %zmm1 {%k1} # sched: [3:1.00] -; SKX-NEXT: vmovdqa64 %zmm1, %zmm0 +; SKX-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.25] ; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> %cmp = icmp eq <8 x i64> %mask, zeroinitializer @@ -2345,7 +2345,7 @@ define <8 x i64> @test_masked_8xi64_perm_imm_mask5(<8 x i64> %vec, <8 x i64> %ve ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vpermq {{.*#+}} zmm1 {%k1} = zmm0[0,0,0,0,4,4,4,4] sched: [1:1.00] -; GENERIC-NEXT: vmovdqa64 %zmm1, %zmm0 +; GENERIC-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33] ; GENERIC-NEXT: 
retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_8xi64_perm_imm_mask5: @@ -2353,7 +2353,7 @@ define <8 x i64> @test_masked_8xi64_perm_imm_mask5(<8 x i64> %vec, <8 x i64> %ve ; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] ; SKX-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00] ; SKX-NEXT: vpermq {{.*#+}} zmm1 {%k1} = zmm0[0,0,0,0,4,4,4,4] sched: [3:1.00] -; SKX-NEXT: vmovdqa64 %zmm1, %zmm0 +; SKX-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.25] ; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> %cmp = icmp eq <8 x i64> %mask, zeroinitializer @@ -2402,7 +2402,7 @@ define <8 x i64> @test_masked_8xi64_perm_mask6(<8 x i64> %vec, <8 x i64> %vec2, ; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqq %zmm4, %zmm2, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vpermq %zmm0, %zmm3, %zmm1 {%k1} # sched: [1:1.00] -; GENERIC-NEXT: vmovdqa64 %zmm1, %zmm0 +; GENERIC-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_8xi64_perm_mask6: @@ -2411,7 +2411,7 @@ define <8 x i64> @test_masked_8xi64_perm_mask6(<8 x i64> %vec, <8 x i64> %vec2, ; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] ; SKX-NEXT: vpcmpeqq %zmm4, %zmm2, %k1 # sched: [3:1.00] ; SKX-NEXT: vpermq %zmm0, %zmm3, %zmm1 {%k1} # sched: [3:1.00] -; SKX-NEXT: vmovdqa64 %zmm1, %zmm0 +; SKX-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.25] ; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> %cmp = icmp eq <8 x i64> %mask, zeroinitializer @@ -2446,7 +2446,7 @@ define <8 x i64> @test_masked_8xi64_perm_imm_mask7(<8 x i64> %vec, <8 x i64> %ve ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vpermq {{.*#+}} zmm1 {%k1} = zmm0[3,3,3,3,7,7,7,7] sched: [1:1.00] -; GENERIC-NEXT: vmovdqa64 %zmm1, %zmm0 +; GENERIC-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_8xi64_perm_imm_mask7: @@ -2454,7 +2454,7 @@ define <8 x i64> @test_masked_8xi64_perm_imm_mask7(<8 x i64> %vec, <8 x i64> %ve ; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] ; SKX-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00] ; SKX-NEXT: vpermq {{.*#+}} zmm1 {%k1} = zmm0[3,3,3,3,7,7,7,7] sched: [3:1.00] -; SKX-NEXT: vmovdqa64 %zmm1, %zmm0 +; SKX-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.25] ; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> %cmp = icmp eq <8 x i64> %mask, zeroinitializer @@ -3327,7 +3327,7 @@ define <16 x float> @test_masked_16xfloat_perm_mask0(<16 x float> %vec, <16 x fl ; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqd %zmm4, %zmm2, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vpermps %zmm0, %zmm3, %zmm1 {%k1} # sched: [1:1.00] -; GENERIC-NEXT: vmovaps %zmm1, %zmm0 +; GENERIC-NEXT: vmovaps %zmm1, %zmm0 # sched: [1:0.33] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_16xfloat_perm_mask0: @@ -3372,7 +3372,7 @@ define <16 x float> @test_masked_16xfloat_perm_mask1(<16 x float> %vec, <16 x fl ; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqd %zmm4, %zmm2, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vpermps %zmm0, %zmm3, %zmm1 {%k1} # sched: [1:1.00] -; GENERIC-NEXT: vmovaps %zmm1, %zmm0 +; GENERIC-NEXT: vmovaps %zmm1, %zmm0 # sched: [1:0.33] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: 
test_masked_16xfloat_perm_mask1: @@ -3417,7 +3417,7 @@ define <16 x float> @test_masked_16xfloat_perm_mask2(<16 x float> %vec, <16 x fl ; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqd %zmm4, %zmm2, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vpermps %zmm0, %zmm3, %zmm1 {%k1} # sched: [1:1.00] -; GENERIC-NEXT: vmovaps %zmm1, %zmm0 +; GENERIC-NEXT: vmovaps %zmm1, %zmm0 # sched: [1:0.33] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_16xfloat_perm_mask2: @@ -3477,7 +3477,7 @@ define <16 x float> @test_masked_16xfloat_perm_mask3(<16 x float> %vec, <16 x fl ; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqd %zmm4, %zmm2, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vpermps %zmm0, %zmm3, %zmm1 {%k1} # sched: [1:1.00] -; GENERIC-NEXT: vmovaps %zmm1, %zmm0 +; GENERIC-NEXT: vmovaps %zmm1, %zmm0 # sched: [1:0.33] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_16xfloat_perm_mask3: @@ -4139,7 +4139,7 @@ define <8 x double> @test_masked_8xdouble_perm_mask0(<8 x double> %vec, <8 x dou ; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqq %zmm4, %zmm2, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vpermpd %zmm0, %zmm3, %zmm1 {%k1} # sched: [1:1.00] -; GENERIC-NEXT: vmovapd %zmm1, %zmm0 +; GENERIC-NEXT: vmovapd %zmm1, %zmm0 # sched: [1:0.33] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_8xdouble_perm_mask0: @@ -4148,7 +4148,7 @@ define <8 x double> @test_masked_8xdouble_perm_mask0(<8 x double> %vec, <8 x dou ; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] ; SKX-NEXT: vpcmpeqq %zmm4, %zmm2, %k1 # sched: [3:1.00] ; SKX-NEXT: vpermpd %zmm0, %zmm3, %zmm1 {%k1} # sched: [3:1.00] -; SKX-NEXT: vmovapd %zmm1, %zmm0 +; SKX-NEXT: vmovapd %zmm1, %zmm0 # sched: [1:0.25] ; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> %cmp = icmp eq <8 x i64> %mask, zeroinitializer @@ -4183,7 +4183,7 @@ define <8 x double> @test_masked_8xdouble_perm_imm_mask1(<8 x double> %vec, <8 x ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vpermpd {{.*#+}} zmm1 {%k1} = zmm0[3,0,0,2,7,4,4,6] sched: [1:1.00] -; GENERIC-NEXT: vmovapd %zmm1, %zmm0 +; GENERIC-NEXT: vmovapd %zmm1, %zmm0 # sched: [1:0.33] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_8xdouble_perm_imm_mask1: @@ -4191,7 +4191,7 @@ define <8 x double> @test_masked_8xdouble_perm_imm_mask1(<8 x double> %vec, <8 x ; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] ; SKX-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00] ; SKX-NEXT: vpermpd {{.*#+}} zmm1 {%k1} = zmm0[3,0,0,2,7,4,4,6] sched: [3:1.00] -; SKX-NEXT: vmovapd %zmm1, %zmm0 +; SKX-NEXT: vmovapd %zmm1, %zmm0 # sched: [1:0.25] ; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> %cmp = icmp eq <8 x i64> %mask, zeroinitializer @@ -4225,7 +4225,7 @@ define <8 x double> @test_masked_8xdouble_perm_mask2(<8 x double> %vec, <8 x dou ; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqq %zmm4, %zmm2, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vpermpd %zmm0, %zmm3, %zmm1 {%k1} # sched: [1:1.00] -; GENERIC-NEXT: vmovapd %zmm1, %zmm0 +; GENERIC-NEXT: vmovapd %zmm1, %zmm0 # sched: [1:0.33] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_8xdouble_perm_mask2: @@ -4234,7 +4234,7 @@ define <8 x double> @test_masked_8xdouble_perm_mask2(<8 x 
double> %vec, <8 x dou ; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] ; SKX-NEXT: vpcmpeqq %zmm4, %zmm2, %k1 # sched: [3:1.00] ; SKX-NEXT: vpermpd %zmm0, %zmm3, %zmm1 {%k1} # sched: [3:1.00] -; SKX-NEXT: vmovapd %zmm1, %zmm0 +; SKX-NEXT: vmovapd %zmm1, %zmm0 # sched: [1:0.25] ; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> %cmp = icmp eq <8 x i64> %mask, zeroinitializer @@ -4282,7 +4282,7 @@ define <8 x double> @test_masked_8xdouble_perm_imm_mask3(<8 x double> %vec, <8 x ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vpermpd {{.*#+}} zmm1 {%k1} = zmm0[1,3,3,0,5,7,7,4] sched: [1:1.00] -; GENERIC-NEXT: vmovapd %zmm1, %zmm0 +; GENERIC-NEXT: vmovapd %zmm1, %zmm0 # sched: [1:0.33] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_8xdouble_perm_imm_mask3: @@ -4290,7 +4290,7 @@ define <8 x double> @test_masked_8xdouble_perm_imm_mask3(<8 x double> %vec, <8 x ; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] ; SKX-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00] ; SKX-NEXT: vpermpd {{.*#+}} zmm1 {%k1} = zmm0[1,3,3,0,5,7,7,4] sched: [3:1.00] -; SKX-NEXT: vmovapd %zmm1, %zmm0 +; SKX-NEXT: vmovapd %zmm1, %zmm0 # sched: [1:0.25] ; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> %cmp = icmp eq <8 x i64> %mask, zeroinitializer @@ -4324,7 +4324,7 @@ define <8 x double> @test_masked_8xdouble_perm_mask4(<8 x double> %vec, <8 x dou ; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqq %zmm4, %zmm2, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vpermpd %zmm0, %zmm3, %zmm1 {%k1} # sched: [1:1.00] -; GENERIC-NEXT: vmovapd %zmm1, %zmm0 +; GENERIC-NEXT: vmovapd %zmm1, %zmm0 # sched: [1:0.33] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_8xdouble_perm_mask4: @@ -4333,7 +4333,7 @@ define <8 x double> @test_masked_8xdouble_perm_mask4(<8 x double> %vec, <8 x dou ; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] ; SKX-NEXT: vpcmpeqq %zmm4, %zmm2, %k1 # sched: [3:1.00] ; SKX-NEXT: vpermpd %zmm0, %zmm3, %zmm1 {%k1} # sched: [3:1.00] -; SKX-NEXT: vmovapd %zmm1, %zmm0 +; SKX-NEXT: vmovapd %zmm1, %zmm0 # sched: [1:0.25] ; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> %cmp = icmp eq <8 x i64> %mask, zeroinitializer @@ -4368,7 +4368,7 @@ define <8 x double> @test_masked_8xdouble_perm_imm_mask5(<8 x double> %vec, <8 x ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vpermpd {{.*#+}} zmm1 {%k1} = zmm0[3,3,2,3,7,7,6,7] sched: [1:1.00] -; GENERIC-NEXT: vmovapd %zmm1, %zmm0 +; GENERIC-NEXT: vmovapd %zmm1, %zmm0 # sched: [1:0.33] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_8xdouble_perm_imm_mask5: @@ -4376,7 +4376,7 @@ define <8 x double> @test_masked_8xdouble_perm_imm_mask5(<8 x double> %vec, <8 x ; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] ; SKX-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00] ; SKX-NEXT: vpermpd {{.*#+}} zmm1 {%k1} = zmm0[3,3,2,3,7,7,6,7] sched: [3:1.00] -; SKX-NEXT: vmovapd %zmm1, %zmm0 +; SKX-NEXT: vmovapd %zmm1, %zmm0 # sched: [1:0.25] ; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> %cmp = icmp eq <8 x i64> %mask, zeroinitializer @@ -4425,7 +4425,7 @@ define <8 x double> 
@test_masked_8xdouble_perm_mask6(<8 x double> %vec, <8 x dou ; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqq %zmm4, %zmm2, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vpermpd %zmm0, %zmm3, %zmm1 {%k1} # sched: [1:1.00] -; GENERIC-NEXT: vmovapd %zmm1, %zmm0 +; GENERIC-NEXT: vmovapd %zmm1, %zmm0 # sched: [1:0.33] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_8xdouble_perm_mask6: @@ -4434,7 +4434,7 @@ define <8 x double> @test_masked_8xdouble_perm_mask6(<8 x double> %vec, <8 x dou ; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] ; SKX-NEXT: vpcmpeqq %zmm4, %zmm2, %k1 # sched: [3:1.00] ; SKX-NEXT: vpermpd %zmm0, %zmm3, %zmm1 {%k1} # sched: [3:1.00] -; SKX-NEXT: vmovapd %zmm1, %zmm0 +; SKX-NEXT: vmovapd %zmm1, %zmm0 # sched: [1:0.25] ; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> %cmp = icmp eq <8 x i64> %mask, zeroinitializer @@ -4469,7 +4469,7 @@ define <8 x double> @test_masked_8xdouble_perm_imm_mask7(<8 x double> %vec, <8 x ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vpermpd {{.*#+}} zmm1 {%k1} = zmm0[3,1,3,2,7,5,7,6] sched: [1:1.00] -; GENERIC-NEXT: vmovapd %zmm1, %zmm0 +; GENERIC-NEXT: vmovapd %zmm1, %zmm0 # sched: [1:0.33] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_8xdouble_perm_imm_mask7: @@ -4477,7 +4477,7 @@ define <8 x double> @test_masked_8xdouble_perm_imm_mask7(<8 x double> %vec, <8 x ; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] ; SKX-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00] ; SKX-NEXT: vpermpd {{.*#+}} zmm1 {%k1} = zmm0[3,1,3,2,7,5,7,6] sched: [3:1.00] -; SKX-NEXT: vmovapd %zmm1, %zmm0 +; SKX-NEXT: vmovapd %zmm1, %zmm0 # sched: [1:0.25] ; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> %cmp = icmp eq <8 x i64> %mask, zeroinitializer @@ -5733,7 +5733,7 @@ define <64 x i8> @test_masked_64xi8_perm_mask0(<64 x i8> %vec, <64 x i8> %vec2, ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqb %zmm3, %zmm2, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vpshufb {{.*#+}} zmm1 {%k1} = zmm0[8,4,1,13,15,4,6,12,0,10,2,4,13,0,0,6,23,29,27,26,18,31,22,25,22,16,23,18,16,25,26,17,40,37,38,44,39,46,41,39,42,37,33,42,41,44,34,46,60,62,61,58,60,56,60,51,60,55,60,55,60,49,48,62] sched: [5:1.00] -; GENERIC-NEXT: vmovdqa64 %zmm1, %zmm0 +; GENERIC-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_64xi8_perm_mask0: @@ -5741,7 +5741,7 @@ define <64 x i8> @test_masked_64xi8_perm_mask0(<64 x i8> %vec, <64 x i8> %vec2, ; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] ; SKX-NEXT: vpcmpeqb %zmm3, %zmm2, %k1 # sched: [3:1.00] ; SKX-NEXT: vpshufb {{.*#+}} zmm1 {%k1} = zmm0[8,4,1,13,15,4,6,12,0,10,2,4,13,0,0,6,23,29,27,26,18,31,22,25,22,16,23,18,16,25,26,17,40,37,38,44,39,46,41,39,42,37,33,42,41,44,34,46,60,62,61,58,60,56,60,51,60,55,60,55,60,49,48,62] sched: [8:1.00] -; SKX-NEXT: vmovdqa64 %zmm1, %zmm0 +; SKX-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.25] ; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <64 x i8> %vec, <64 x i8> undef, <64 x i32> %cmp = icmp eq <64 x i8> %mask, zeroinitializer @@ -5774,7 +5774,7 @@ define <64 x i8> @test_masked_64xi8_perm_mask1(<64 x i8> %vec, <64 x i8> %vec2, ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqb %zmm3, %zmm2, %k1 # sched: 
[3:1.00] ; GENERIC-NEXT: vpshufb {{.*#+}} zmm1 {%k1} = zmm0[7,14,15,10,9,3,1,13,14,12,11,6,4,1,6,9,30,30,22,17,28,27,16,23,26,16,30,31,27,17,17,21,32,37,32,47,45,33,46,35,35,42,47,33,32,37,32,41,61,50,49,53,63,50,63,53,55,52,62,63,58,50,63,49] sched: [5:1.00] -; GENERIC-NEXT: vmovdqa64 %zmm1, %zmm0 +; GENERIC-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_64xi8_perm_mask1: @@ -5782,7 +5782,7 @@ define <64 x i8> @test_masked_64xi8_perm_mask1(<64 x i8> %vec, <64 x i8> %vec2, ; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] ; SKX-NEXT: vpcmpeqb %zmm3, %zmm2, %k1 # sched: [3:1.00] ; SKX-NEXT: vpshufb {{.*#+}} zmm1 {%k1} = zmm0[7,14,15,10,9,3,1,13,14,12,11,6,4,1,6,9,30,30,22,17,28,27,16,23,26,16,30,31,27,17,17,21,32,37,32,47,45,33,46,35,35,42,47,33,32,37,32,41,61,50,49,53,63,50,63,53,55,52,62,63,58,50,63,49] sched: [8:1.00] -; SKX-NEXT: vmovdqa64 %zmm1, %zmm0 +; SKX-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.25] ; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <64 x i8> %vec, <64 x i8> undef, <64 x i32> %cmp = icmp eq <64 x i8> %mask, zeroinitializer @@ -5815,7 +5815,7 @@ define <64 x i8> @test_masked_64xi8_perm_mask2(<64 x i8> %vec, <64 x i8> %vec2, ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqb %zmm3, %zmm2, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vpshufb {{.*#+}} zmm1 {%k1} = zmm0[9,2,14,15,12,5,3,12,4,6,0,2,0,1,1,6,24,27,18,22,26,17,23,21,31,16,22,22,27,21,19,20,39,47,44,36,40,43,44,39,38,44,38,35,39,46,34,39,58,55,51,48,59,57,48,52,60,58,56,50,59,55,58,60] sched: [5:1.00] -; GENERIC-NEXT: vmovdqa64 %zmm1, %zmm0 +; GENERIC-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_64xi8_perm_mask2: @@ -5823,7 +5823,7 @@ define <64 x i8> @test_masked_64xi8_perm_mask2(<64 x i8> %vec, <64 x i8> %vec2, ; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] ; SKX-NEXT: vpcmpeqb %zmm3, %zmm2, %k1 # sched: [3:1.00] ; SKX-NEXT: vpshufb {{.*#+}} zmm1 {%k1} = zmm0[9,2,14,15,12,5,3,12,4,6,0,2,0,1,1,6,24,27,18,22,26,17,23,21,31,16,22,22,27,21,19,20,39,47,44,36,40,43,44,39,38,44,38,35,39,46,34,39,58,55,51,48,59,57,48,52,60,58,56,50,59,55,58,60] sched: [8:1.00] -; SKX-NEXT: vmovdqa64 %zmm1, %zmm0 +; SKX-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.25] ; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <64 x i8> %vec, <64 x i8> undef, <64 x i32> %cmp = icmp eq <64 x i8> %mask, zeroinitializer @@ -5869,7 +5869,7 @@ define <64 x i8> @test_masked_64xi8_perm_mask3(<64 x i8> %vec, <64 x i8> %vec2, ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqb %zmm3, %zmm2, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vpshufb {{.*#+}} zmm1 {%k1} = zmm0[3,12,4,15,1,14,0,4,8,9,6,1,4,4,12,14,25,16,28,20,21,24,19,30,18,22,20,24,25,26,24,22,42,38,44,44,36,37,42,34,43,38,41,34,42,37,39,38,55,59,53,58,48,52,59,48,57,48,55,62,48,56,49,61] sched: [5:1.00] -; GENERIC-NEXT: vmovdqa64 %zmm1, %zmm0 +; GENERIC-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_64xi8_perm_mask3: @@ -5877,7 +5877,7 @@ define <64 x i8> @test_masked_64xi8_perm_mask3(<64 x i8> %vec, <64 x i8> %vec2, ; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] ; SKX-NEXT: vpcmpeqb %zmm3, %zmm2, %k1 # sched: [3:1.00] ; SKX-NEXT: vpshufb {{.*#+}} zmm1 {%k1} = 
zmm0[3,12,4,15,1,14,0,4,8,9,6,1,4,4,12,14,25,16,28,20,21,24,19,30,18,22,20,24,25,26,24,22,42,38,44,44,36,37,42,34,43,38,41,34,42,37,39,38,55,59,53,58,48,52,59,48,57,48,55,62,48,56,49,61] sched: [8:1.00] -; SKX-NEXT: vmovdqa64 %zmm1, %zmm0 +; SKX-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.25] ; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <64 x i8> %vec, <64 x i8> undef, <64 x i32> %cmp = icmp eq <64 x i8> %mask, zeroinitializer @@ -7629,7 +7629,7 @@ define <32 x i16> @test_masked_32xi16_perm_high_mask0(<32 x i16> %vec, <32 x i16 ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqw %zmm3, %zmm2, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vpshufhw {{.*#+}} zmm1 {%k1} = zmm0[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12,16,17,18,19,20,21,22,20,24,25,26,27,28,29,30,28] sched: [1:1.00] -; GENERIC-NEXT: vmovdqa64 %zmm1, %zmm0 +; GENERIC-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_32xi16_perm_high_mask0: @@ -7637,7 +7637,7 @@ define <32 x i16> @test_masked_32xi16_perm_high_mask0(<32 x i16> %vec, <32 x i16 ; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] ; SKX-NEXT: vpcmpeqw %zmm3, %zmm2, %k1 # sched: [3:1.00] ; SKX-NEXT: vpshufhw {{.*#+}} zmm1 {%k1} = zmm0[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12,16,17,18,19,20,21,22,20,24,25,26,27,28,29,30,28] sched: [1:1.00] -; SKX-NEXT: vmovdqa64 %zmm1, %zmm0 +; SKX-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.25] ; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> %cmp = icmp eq <32 x i16> %mask, zeroinitializer @@ -7670,7 +7670,7 @@ define <32 x i16> @test_masked_32xi16_perm_low_mask1(<32 x i16> %vec, <32 x i16> ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqw %zmm3, %zmm2, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vpshuflw {{.*#+}} zmm1 {%k1} = zmm0[2,1,0,0,4,5,6,7,10,9,8,8,12,13,14,15,18,17,16,16,20,21,22,23,26,25,24,24,28,29,30,31] sched: [1:1.00] -; GENERIC-NEXT: vmovdqa64 %zmm1, %zmm0 +; GENERIC-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_32xi16_perm_low_mask1: @@ -7678,7 +7678,7 @@ define <32 x i16> @test_masked_32xi16_perm_low_mask1(<32 x i16> %vec, <32 x i16> ; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] ; SKX-NEXT: vpcmpeqw %zmm3, %zmm2, %k1 # sched: [3:1.00] ; SKX-NEXT: vpshuflw {{.*#+}} zmm1 {%k1} = zmm0[2,1,0,0,4,5,6,7,10,9,8,8,12,13,14,15,18,17,16,16,20,21,22,23,26,25,24,24,28,29,30,31] sched: [1:1.00] -; SKX-NEXT: vmovdqa64 %zmm1, %zmm0 +; SKX-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.25] ; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> %cmp = icmp eq <32 x i16> %mask, zeroinitializer @@ -7711,7 +7711,7 @@ define <32 x i16> @test_masked_32xi16_perm_high_mask2(<32 x i16> %vec, <32 x i16 ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqw %zmm3, %zmm2, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vpshufhw {{.*#+}} zmm1 {%k1} = zmm0[0,1,2,3,4,6,4,7,8,9,10,11,12,14,12,15,16,17,18,19,20,22,20,23,24,25,26,27,28,30,28,31] sched: [1:1.00] -; GENERIC-NEXT: vmovdqa64 %zmm1, %zmm0 +; GENERIC-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_32xi16_perm_high_mask2: @@ -7719,7 +7719,7 @@ define <32 x i16> @test_masked_32xi16_perm_high_mask2(<32 x i16> %vec, <32 x i16 ; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] ; SKX-NEXT: vpcmpeqw %zmm3, %zmm2, 
%k1 # sched: [3:1.00] ; SKX-NEXT: vpshufhw {{.*#+}} zmm1 {%k1} = zmm0[0,1,2,3,4,6,4,7,8,9,10,11,12,14,12,15,16,17,18,19,20,22,20,23,24,25,26,27,28,30,28,31] sched: [1:1.00] -; SKX-NEXT: vmovdqa64 %zmm1, %zmm0 +; SKX-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.25] ; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> %cmp = icmp eq <32 x i16> %mask, zeroinitializer @@ -7765,7 +7765,7 @@ define <32 x i16> @test_masked_32xi16_perm_low_mask3(<32 x i16> %vec, <32 x i16> ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqw %zmm3, %zmm2, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vpshuflw {{.*#+}} zmm1 {%k1} = zmm0[3,3,1,3,4,5,6,7,11,11,9,11,12,13,14,15,19,19,17,19,20,21,22,23,27,27,25,27,28,29,30,31] sched: [1:1.00] -; GENERIC-NEXT: vmovdqa64 %zmm1, %zmm0 +; GENERIC-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_32xi16_perm_low_mask3: @@ -7773,7 +7773,7 @@ define <32 x i16> @test_masked_32xi16_perm_low_mask3(<32 x i16> %vec, <32 x i16> ; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] ; SKX-NEXT: vpcmpeqw %zmm3, %zmm2, %k1 # sched: [3:1.00] ; SKX-NEXT: vpshuflw {{.*#+}} zmm1 {%k1} = zmm0[3,3,1,3,4,5,6,7,11,11,9,11,12,13,14,15,19,19,17,19,20,21,22,23,27,27,25,27,28,29,30,31] sched: [1:1.00] -; SKX-NEXT: vmovdqa64 %zmm1, %zmm0 +; SKX-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.25] ; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> %cmp = icmp eq <32 x i16> %mask, zeroinitializer @@ -7806,7 +7806,7 @@ define <32 x i16> @test_masked_32xi16_perm_high_mask4(<32 x i16> %vec, <32 x i16 ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqw %zmm3, %zmm2, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vpshufhw {{.*#+}} zmm1 {%k1} = zmm0[0,1,2,3,7,7,5,6,8,9,10,11,15,15,13,14,16,17,18,19,23,23,21,22,24,25,26,27,31,31,29,30] sched: [1:1.00] -; GENERIC-NEXT: vmovdqa64 %zmm1, %zmm0 +; GENERIC-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_32xi16_perm_high_mask4: @@ -7814,7 +7814,7 @@ define <32 x i16> @test_masked_32xi16_perm_high_mask4(<32 x i16> %vec, <32 x i16 ; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] ; SKX-NEXT: vpcmpeqw %zmm3, %zmm2, %k1 # sched: [3:1.00] ; SKX-NEXT: vpshufhw {{.*#+}} zmm1 {%k1} = zmm0[0,1,2,3,7,7,5,6,8,9,10,11,15,15,13,14,16,17,18,19,23,23,21,22,24,25,26,27,31,31,29,30] sched: [1:1.00] -; SKX-NEXT: vmovdqa64 %zmm1, %zmm0 +; SKX-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.25] ; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> %cmp = icmp eq <32 x i16> %mask, zeroinitializer @@ -7847,7 +7847,7 @@ define <32 x i16> @test_masked_32xi16_perm_low_mask5(<32 x i16> %vec, <32 x i16> ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqw %zmm3, %zmm2, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vpshuflw {{.*#+}} zmm1 {%k1} = zmm0[2,1,1,0,4,5,6,7,10,9,9,8,12,13,14,15,18,17,17,16,20,21,22,23,26,25,25,24,28,29,30,31] sched: [1:1.00] -; GENERIC-NEXT: vmovdqa64 %zmm1, %zmm0 +; GENERIC-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_32xi16_perm_low_mask5: @@ -7855,7 +7855,7 @@ define <32 x i16> @test_masked_32xi16_perm_low_mask5(<32 x i16> %vec, <32 x i16> ; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] ; SKX-NEXT: vpcmpeqw %zmm3, %zmm2, %k1 # sched: [3:1.00] ; 
SKX-NEXT: vpshuflw {{.*#+}} zmm1 {%k1} = zmm0[2,1,1,0,4,5,6,7,10,9,9,8,12,13,14,15,18,17,17,16,20,21,22,23,26,25,25,24,28,29,30,31] sched: [1:1.00] -; SKX-NEXT: vmovdqa64 %zmm1, %zmm0 +; SKX-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.25] ; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> %cmp = icmp eq <32 x i16> %mask, zeroinitializer @@ -7901,7 +7901,7 @@ define <32 x i16> @test_masked_32xi16_perm_high_mask6(<32 x i16> %vec, <32 x i16 ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqw %zmm3, %zmm2, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vpshufhw {{.*#+}} zmm1 {%k1} = zmm0[0,1,2,3,4,4,5,6,8,9,10,11,12,12,13,14,16,17,18,19,20,20,21,22,24,25,26,27,28,28,29,30] sched: [1:1.00] -; GENERIC-NEXT: vmovdqa64 %zmm1, %zmm0 +; GENERIC-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_32xi16_perm_high_mask6: @@ -7909,7 +7909,7 @@ define <32 x i16> @test_masked_32xi16_perm_high_mask6(<32 x i16> %vec, <32 x i16 ; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] ; SKX-NEXT: vpcmpeqw %zmm3, %zmm2, %k1 # sched: [3:1.00] ; SKX-NEXT: vpshufhw {{.*#+}} zmm1 {%k1} = zmm0[0,1,2,3,4,4,5,6,8,9,10,11,12,12,13,14,16,17,18,19,20,20,21,22,24,25,26,27,28,28,29,30] sched: [1:1.00] -; SKX-NEXT: vmovdqa64 %zmm1, %zmm0 +; SKX-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.25] ; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> %cmp = icmp eq <32 x i16> %mask, zeroinitializer @@ -7942,7 +7942,7 @@ define <32 x i16> @test_masked_32xi16_perm_low_mask7(<32 x i16> %vec, <32 x i16> ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqw %zmm3, %zmm2, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vpshuflw {{.*#+}} zmm1 {%k1} = zmm0[3,0,3,0,4,5,6,7,11,8,11,8,12,13,14,15,19,16,19,16,20,21,22,23,27,24,27,24,28,29,30,31] sched: [1:1.00] -; GENERIC-NEXT: vmovdqa64 %zmm1, %zmm0 +; GENERIC-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_32xi16_perm_low_mask7: @@ -7950,7 +7950,7 @@ define <32 x i16> @test_masked_32xi16_perm_low_mask7(<32 x i16> %vec, <32 x i16> ; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] ; SKX-NEXT: vpcmpeqw %zmm3, %zmm2, %k1 # sched: [3:1.00] ; SKX-NEXT: vpshuflw {{.*#+}} zmm1 {%k1} = zmm0[3,0,3,0,4,5,6,7,11,8,11,8,12,13,14,15,19,16,19,16,20,21,22,23,27,24,27,24,28,29,30,31] sched: [1:1.00] -; SKX-NEXT: vmovdqa64 %zmm1, %zmm0 +; SKX-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.25] ; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> %cmp = icmp eq <32 x i16> %mask, zeroinitializer @@ -8221,7 +8221,7 @@ define <32 x i16> @test_masked_32xi16_perm_low_mem_mask5(<32 x i16>* %vp, <32 x ; GENERIC-NEXT: vpshufd {{.*#+}} zmm2 = mem[0,0,2,3,4,4,6,7,8,8,10,11,12,12,14,15] sched: [5:1.00] ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqw %zmm3, %zmm1, %k1 # sched: [3:1.00] -; GENERIC-NEXT: vmovdqu16 %zmm2, %zmm0 {%k1} +; GENERIC-NEXT: vmovdqu16 %zmm2, %zmm0 {%k1} # sched: [1:0.33] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_32xi16_perm_low_mem_mask5: @@ -8229,7 +8229,7 @@ define <32 x i16> @test_masked_32xi16_perm_low_mem_mask5(<32 x i16>* %vp, <32 x ; SKX-NEXT: vpshufd {{.*#+}} zmm2 = mem[0,0,2,3,4,4,6,7,8,8,10,11,12,12,14,15] sched: [8:1.00] ; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] ; SKX-NEXT: vpcmpeqw %zmm3, %zmm1, 
%k1 # sched: [3:1.00] -; SKX-NEXT: vmovdqu16 %zmm2, %zmm0 {%k1} +; SKX-NEXT: vmovdqu16 %zmm2, %zmm0 {%k1} # sched: [1:0.25] ; SKX-NEXT: retq # sched: [7:1.00] %vec = load <32 x i16>, <32 x i16>* %vp %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> @@ -8244,7 +8244,7 @@ define <32 x i16> @test_masked_z_32xi16_perm_low_mem_mask5(<32 x i16>* %vp, <32 ; GENERIC-NEXT: vpshufd {{.*#+}} zmm1 = mem[0,0,2,3,4,4,6,7,8,8,10,11,12,12,14,15] sched: [5:1.00] ; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqw %zmm2, %zmm0, %k1 # sched: [3:1.00] -; GENERIC-NEXT: vmovdqu16 %zmm1, %zmm0 {%k1} {z} +; GENERIC-NEXT: vmovdqu16 %zmm1, %zmm0 {%k1} {z} # sched: [1:0.33] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_32xi16_perm_low_mem_mask5: @@ -8252,7 +8252,7 @@ define <32 x i16> @test_masked_z_32xi16_perm_low_mem_mask5(<32 x i16>* %vp, <32 ; SKX-NEXT: vpshufd {{.*#+}} zmm1 = mem[0,0,2,3,4,4,6,7,8,8,10,11,12,12,14,15] sched: [8:1.00] ; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] ; SKX-NEXT: vpcmpeqw %zmm2, %zmm0, %k1 # sched: [3:1.00] -; SKX-NEXT: vmovdqu16 %zmm1, %zmm0 {%k1} {z} +; SKX-NEXT: vmovdqu16 %zmm1, %zmm0 {%k1} {z} # sched: [1:0.25] ; SKX-NEXT: retq # sched: [7:1.00] %vec = load <32 x i16>, <32 x i16>* %vp %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> @@ -9150,7 +9150,7 @@ define <16 x i32> @test2_masked_16xi32_perm_mask0(<16 x i32> %vec, <16 x i32> %v ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vpshufd {{.*#+}} zmm1 {%k1} = zmm0[3,1,3,0,7,5,7,4,11,9,11,8,15,13,15,12] sched: [1:1.00] -; GENERIC-NEXT: vmovdqa64 %zmm1, %zmm0 +; GENERIC-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test2_masked_16xi32_perm_mask0: @@ -9158,7 +9158,7 @@ define <16 x i32> @test2_masked_16xi32_perm_mask0(<16 x i32> %vec, <16 x i32> %v ; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] ; SKX-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00] ; SKX-NEXT: vpshufd {{.*#+}} zmm1 {%k1} = zmm0[3,1,3,0,7,5,7,4,11,9,11,8,15,13,15,12] sched: [1:1.00] -; SKX-NEXT: vmovdqa64 %zmm1, %zmm0 +; SKX-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.25] ; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> %cmp = icmp eq <16 x i32> %mask, zeroinitializer @@ -9191,7 +9191,7 @@ define <16 x i32> @test2_masked_16xi32_perm_mask1(<16 x i32> %vec, <16 x i32> %v ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vpshufd {{.*#+}} zmm1 {%k1} = zmm0[2,0,3,0,6,4,7,4,10,8,11,8,14,12,15,12] sched: [1:1.00] -; GENERIC-NEXT: vmovdqa64 %zmm1, %zmm0 +; GENERIC-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test2_masked_16xi32_perm_mask1: @@ -9199,7 +9199,7 @@ define <16 x i32> @test2_masked_16xi32_perm_mask1(<16 x i32> %vec, <16 x i32> %v ; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] ; SKX-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00] ; SKX-NEXT: vpshufd {{.*#+}} zmm1 {%k1} = zmm0[2,0,3,0,6,4,7,4,10,8,11,8,14,12,15,12] sched: [1:1.00] -; SKX-NEXT: vmovdqa64 %zmm1, %zmm0 +; SKX-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.25] ; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> %cmp = icmp eq <16 x i32> %mask, zeroinitializer @@ -9232,7 +9232,7 @@ define <16 x i32> 
@test2_masked_16xi32_perm_mask2(<16 x i32> %vec, <16 x i32> %v ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vpshufd {{.*#+}} zmm1 {%k1} = zmm0[1,3,3,0,5,7,7,4,9,11,11,8,13,15,15,12] sched: [1:1.00] -; GENERIC-NEXT: vmovdqa64 %zmm1, %zmm0 +; GENERIC-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test2_masked_16xi32_perm_mask2: @@ -9240,7 +9240,7 @@ define <16 x i32> @test2_masked_16xi32_perm_mask2(<16 x i32> %vec, <16 x i32> %v ; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] ; SKX-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00] ; SKX-NEXT: vpshufd {{.*#+}} zmm1 {%k1} = zmm0[1,3,3,0,5,7,7,4,9,11,11,8,13,15,15,12] sched: [1:1.00] -; SKX-NEXT: vmovdqa64 %zmm1, %zmm0 +; SKX-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.25] ; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> %cmp = icmp eq <16 x i32> %mask, zeroinitializer @@ -9286,7 +9286,7 @@ define <16 x i32> @test2_masked_16xi32_perm_mask3(<16 x i32> %vec, <16 x i32> %v ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vpshufd {{.*#+}} zmm1 {%k1} = zmm0[3,2,0,3,7,6,4,7,11,10,8,11,15,14,12,15] sched: [1:1.00] -; GENERIC-NEXT: vmovdqa64 %zmm1, %zmm0 +; GENERIC-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test2_masked_16xi32_perm_mask3: @@ -9294,7 +9294,7 @@ define <16 x i32> @test2_masked_16xi32_perm_mask3(<16 x i32> %vec, <16 x i32> %v ; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] ; SKX-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00] ; SKX-NEXT: vpshufd {{.*#+}} zmm1 {%k1} = zmm0[3,2,0,3,7,6,4,7,11,10,8,11,15,14,12,15] sched: [1:1.00] -; SKX-NEXT: vmovdqa64 %zmm1, %zmm0 +; SKX-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.25] ; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> %cmp = icmp eq <16 x i32> %mask, zeroinitializer @@ -9930,7 +9930,7 @@ define <16 x float> @test_16xfloat_masked_shuff_mask0(<16 x float> %vec1, <16 x ; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqd %zmm4, %zmm3, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vshuff32x4 {{.*#+}} zmm2 {%k1} = zmm0[12,13,14,15,0,1,2,3],zmm1[4,5,6,7,12,13,14,15] sched: [1:1.00] -; GENERIC-NEXT: vmovaps %zmm2, %zmm0 +; GENERIC-NEXT: vmovaps %zmm2, %zmm0 # sched: [1:0.33] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_16xfloat_masked_shuff_mask0: @@ -9971,7 +9971,7 @@ define <16 x float> @test_16xfloat_masked_shuff_mask1(<16 x float> %vec1, <16 x ; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqd %zmm4, %zmm3, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vshuff32x4 {{.*#+}} zmm2 {%k1} = zmm0[0,1,2,3,8,9,10,11],zmm1[0,1,2,3,12,13,14,15] sched: [1:1.00] -; GENERIC-NEXT: vmovaps %zmm2, %zmm0 +; GENERIC-NEXT: vmovaps %zmm2, %zmm0 # sched: [1:0.33] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_16xfloat_masked_shuff_mask1: @@ -10012,7 +10012,7 @@ define <16 x float> @test_16xfloat_masked_shuff_mask2(<16 x float> %vec1, <16 x ; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqd %zmm4, %zmm3, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vshuff32x4 {{.*#+}} zmm2 {%k1} = zmm0[12,13,14,15,4,5,6,7],zmm1[0,1,2,3,4,5,6,7] sched: [1:1.00] -; GENERIC-NEXT: vmovaps %zmm2, %zmm0 +; GENERIC-NEXT: vmovaps 
%zmm2, %zmm0 # sched: [1:0.33] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_16xfloat_masked_shuff_mask2: @@ -10066,7 +10066,7 @@ define <16 x float> @test_16xfloat_masked_shuff_mask3(<16 x float> %vec1, <16 x ; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqd %zmm4, %zmm3, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vshuff32x4 {{.*#+}} zmm2 {%k1} = zmm0[8,9,10,11,12,13,14,15],zmm1[0,1,2,3,8,9,10,11] sched: [1:1.00] -; GENERIC-NEXT: vmovaps %zmm2, %zmm0 +; GENERIC-NEXT: vmovaps %zmm2, %zmm0 # sched: [1:0.33] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_16xfloat_masked_shuff_mask3: @@ -10121,7 +10121,7 @@ define <16 x float> @test_16xfloat_masked_shuff_mem_mask0(<16 x float> %vec1, <1 ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vshuff32x4 {{.*#+}} zmm1 {%k1} = zmm0[12,13,14,15,8,9,10,11],mem[8,9,10,11,4,5,6,7] sched: [5:1.00] -; GENERIC-NEXT: vmovaps %zmm1, %zmm0 +; GENERIC-NEXT: vmovaps %zmm1, %zmm0 # sched: [1:0.33] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_16xfloat_masked_shuff_mem_mask0: @@ -10165,7 +10165,7 @@ define <16 x float> @test_16xfloat_masked_shuff_mem_mask1(<16 x float> %vec1, <1 ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vshuff32x4 {{.*#+}} zmm1 {%k1} = zmm0[8,9,10,11,4,5,6,7],mem[8,9,10,11,4,5,6,7] sched: [5:1.00] -; GENERIC-NEXT: vmovaps %zmm1, %zmm0 +; GENERIC-NEXT: vmovaps %zmm1, %zmm0 # sched: [1:0.33] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_16xfloat_masked_shuff_mem_mask1: @@ -10209,7 +10209,7 @@ define <16 x float> @test_16xfloat_masked_shuff_mem_mask2(<16 x float> %vec1, <1 ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vshuff32x4 {{.*#+}} zmm1 {%k1} = zmm0[0,1,2,3,0,1,2,3],mem[8,9,10,11,8,9,10,11] sched: [5:1.00] -; GENERIC-NEXT: vmovaps %zmm1, %zmm0 +; GENERIC-NEXT: vmovaps %zmm1, %zmm0 # sched: [1:0.33] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_16xfloat_masked_shuff_mem_mask2: @@ -10267,7 +10267,7 @@ define <16 x float> @test_16xfloat_masked_shuff_mem_mask3(<16 x float> %vec1, <1 ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vshuff32x4 {{.*#+}} zmm1 {%k1} = zmm0[4,5,6,7,0,1,2,3],mem[12,13,14,15,12,13,14,15] sched: [5:1.00] -; GENERIC-NEXT: vmovaps %zmm1, %zmm0 +; GENERIC-NEXT: vmovaps %zmm1, %zmm0 # sched: [1:0.33] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_16xfloat_masked_shuff_mem_mask3: @@ -10718,7 +10718,7 @@ define <8 x double> @test_8xdouble_masked_shuff_mask0(<8 x double> %vec1, <8 x d ; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqq %zmm4, %zmm3, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vshuff64x2 {{.*#+}} zmm2 {%k1} = zmm0[6,7,2,3],zmm1[6,7,0,1] sched: [1:1.00] -; GENERIC-NEXT: vmovapd %zmm2, %zmm0 +; GENERIC-NEXT: vmovapd %zmm2, %zmm0 # sched: [1:0.33] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_8xdouble_masked_shuff_mask0: @@ -10726,7 +10726,7 @@ define <8 x double> @test_8xdouble_masked_shuff_mask0(<8 x double> %vec1, <8 x d ; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] ; SKX-NEXT: vpcmpeqq %zmm4, %zmm3, %k1 # sched: [3:1.00] ; SKX-NEXT: vshuff64x2 {{.*#+}} zmm2 {%k1} = zmm0[6,7,2,3],zmm1[6,7,0,1] sched: [3:1.00] 
-; SKX-NEXT: vmovapd %zmm2, %zmm0 +; SKX-NEXT: vmovapd %zmm2, %zmm0 # sched: [1:0.25] ; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> %cmp = icmp eq <8 x i64> %mask, zeroinitializer @@ -10759,7 +10759,7 @@ define <8 x double> @test_8xdouble_masked_shuff_mask1(<8 x double> %vec1, <8 x d ; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqq %zmm4, %zmm3, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vshuff64x2 {{.*#+}} zmm2 {%k1} = zmm0[0,1,4,5],zmm1[0,1,4,5] sched: [1:1.00] -; GENERIC-NEXT: vmovapd %zmm2, %zmm0 +; GENERIC-NEXT: vmovapd %zmm2, %zmm0 # sched: [1:0.33] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_8xdouble_masked_shuff_mask1: @@ -10767,7 +10767,7 @@ define <8 x double> @test_8xdouble_masked_shuff_mask1(<8 x double> %vec1, <8 x d ; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] ; SKX-NEXT: vpcmpeqq %zmm4, %zmm3, %k1 # sched: [3:1.00] ; SKX-NEXT: vshuff64x2 {{.*#+}} zmm2 {%k1} = zmm0[0,1,4,5],zmm1[0,1,4,5] sched: [3:1.00] -; SKX-NEXT: vmovapd %zmm2, %zmm0 +; SKX-NEXT: vmovapd %zmm2, %zmm0 # sched: [1:0.25] ; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> %cmp = icmp eq <8 x i64> %mask, zeroinitializer @@ -10800,7 +10800,7 @@ define <8 x double> @test_8xdouble_masked_shuff_mask2(<8 x double> %vec1, <8 x d ; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqq %zmm4, %zmm3, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vshuff64x2 {{.*#+}} zmm2 {%k1} = zmm0[6,7,4,5],zmm1[4,5,0,1] sched: [1:1.00] -; GENERIC-NEXT: vmovapd %zmm2, %zmm0 +; GENERIC-NEXT: vmovapd %zmm2, %zmm0 # sched: [1:0.33] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_8xdouble_masked_shuff_mask2: @@ -10808,7 +10808,7 @@ define <8 x double> @test_8xdouble_masked_shuff_mask2(<8 x double> %vec1, <8 x d ; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] ; SKX-NEXT: vpcmpeqq %zmm4, %zmm3, %k1 # sched: [3:1.00] ; SKX-NEXT: vshuff64x2 {{.*#+}} zmm2 {%k1} = zmm0[6,7,4,5],zmm1[4,5,0,1] sched: [3:1.00] -; SKX-NEXT: vmovapd %zmm2, %zmm0 +; SKX-NEXT: vmovapd %zmm2, %zmm0 # sched: [1:0.25] ; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> %cmp = icmp eq <8 x i64> %mask, zeroinitializer @@ -10854,7 +10854,7 @@ define <8 x double> @test_8xdouble_masked_shuff_mask3(<8 x double> %vec1, <8 x d ; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqq %zmm4, %zmm3, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vshuff64x2 {{.*#+}} zmm2 {%k1} = zmm0[4,5,4,5],zmm1[4,5,2,3] sched: [1:1.00] -; GENERIC-NEXT: vmovapd %zmm2, %zmm0 +; GENERIC-NEXT: vmovapd %zmm2, %zmm0 # sched: [1:0.33] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_8xdouble_masked_shuff_mask3: @@ -10862,7 +10862,7 @@ define <8 x double> @test_8xdouble_masked_shuff_mask3(<8 x double> %vec1, <8 x d ; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] ; SKX-NEXT: vpcmpeqq %zmm4, %zmm3, %k1 # sched: [3:1.00] ; SKX-NEXT: vshuff64x2 {{.*#+}} zmm2 {%k1} = zmm0[4,5,4,5],zmm1[4,5,2,3] sched: [3:1.00] -; SKX-NEXT: vmovapd %zmm2, %zmm0 +; SKX-NEXT: vmovapd %zmm2, %zmm0 # sched: [1:0.25] ; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> %cmp = icmp eq <8 x i64> %mask, zeroinitializer @@ -10909,7 +10909,7 @@ define <8 x double> @test_8xdouble_masked_shuff_mem_mask0(<8 x double> %vec1, <8 ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] ; 
GENERIC-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vshuff64x2 {{.*#+}} zmm1 {%k1} = zmm0[6,7,0,1],mem[0,1,0,1] sched: [5:1.00] -; GENERIC-NEXT: vmovapd %zmm1, %zmm0 +; GENERIC-NEXT: vmovapd %zmm1, %zmm0 # sched: [1:0.33] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_8xdouble_masked_shuff_mem_mask0: @@ -10917,7 +10917,7 @@ define <8 x double> @test_8xdouble_masked_shuff_mem_mask0(<8 x double> %vec1, <8 ; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] ; SKX-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00] ; SKX-NEXT: vshuff64x2 {{.*#+}} zmm1 {%k1} = zmm0[6,7,0,1],mem[0,1,0,1] sched: [10:1.00] -; SKX-NEXT: vmovapd %zmm1, %zmm0 +; SKX-NEXT: vmovapd %zmm1, %zmm0 # sched: [1:0.25] ; SKX-NEXT: retq # sched: [7:1.00] %vec2 = load <8 x double>, <8 x double>* %vec2p %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> @@ -10953,7 +10953,7 @@ define <8 x double> @test_8xdouble_masked_shuff_mem_mask1(<8 x double> %vec1, <8 ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vshuff64x2 {{.*#+}} zmm1 {%k1} = zmm0[6,7,6,7],mem[0,1,2,3] sched: [5:1.00] -; GENERIC-NEXT: vmovapd %zmm1, %zmm0 +; GENERIC-NEXT: vmovapd %zmm1, %zmm0 # sched: [1:0.33] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_8xdouble_masked_shuff_mem_mask1: @@ -10961,7 +10961,7 @@ define <8 x double> @test_8xdouble_masked_shuff_mem_mask1(<8 x double> %vec1, <8 ; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] ; SKX-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00] ; SKX-NEXT: vshuff64x2 {{.*#+}} zmm1 {%k1} = zmm0[6,7,6,7],mem[0,1,2,3] sched: [10:1.00] -; SKX-NEXT: vmovapd %zmm1, %zmm0 +; SKX-NEXT: vmovapd %zmm1, %zmm0 # sched: [1:0.25] ; SKX-NEXT: retq # sched: [7:1.00] %vec2 = load <8 x double>, <8 x double>* %vec2p %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> @@ -10997,7 +10997,7 @@ define <8 x double> @test_8xdouble_masked_shuff_mem_mask2(<8 x double> %vec1, <8 ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vshuff64x2 {{.*#+}} zmm1 {%k1} = zmm0[0,1,2,3],mem[0,1,4,5] sched: [5:1.00] -; GENERIC-NEXT: vmovapd %zmm1, %zmm0 +; GENERIC-NEXT: vmovapd %zmm1, %zmm0 # sched: [1:0.33] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_8xdouble_masked_shuff_mem_mask2: @@ -11005,7 +11005,7 @@ define <8 x double> @test_8xdouble_masked_shuff_mem_mask2(<8 x double> %vec1, <8 ; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] ; SKX-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00] ; SKX-NEXT: vshuff64x2 {{.*#+}} zmm1 {%k1} = zmm0[0,1,2,3],mem[0,1,4,5] sched: [10:1.00] -; SKX-NEXT: vmovapd %zmm1, %zmm0 +; SKX-NEXT: vmovapd %zmm1, %zmm0 # sched: [1:0.25] ; SKX-NEXT: retq # sched: [7:1.00] %vec2 = load <8 x double>, <8 x double>* %vec2p %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> @@ -11055,7 +11055,7 @@ define <8 x double> @test_8xdouble_masked_shuff_mem_mask3(<8 x double> %vec1, <8 ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vshuff64x2 {{.*#+}} zmm1 {%k1} = zmm0[2,3,0,1],mem[4,5,0,1] sched: [5:1.00] -; GENERIC-NEXT: vmovapd %zmm1, %zmm0 +; GENERIC-NEXT: vmovapd %zmm1, %zmm0 # sched: [1:0.33] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_8xdouble_masked_shuff_mem_mask3: @@ -11063,7 +11063,7 @@ define <8 x double> 
@test_8xdouble_masked_shuff_mem_mask3(<8 x double> %vec1, <8 ; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] ; SKX-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00] ; SKX-NEXT: vshuff64x2 {{.*#+}} zmm1 {%k1} = zmm0[2,3,0,1],mem[4,5,0,1] sched: [10:1.00] -; SKX-NEXT: vmovapd %zmm1, %zmm0 +; SKX-NEXT: vmovapd %zmm1, %zmm0 # sched: [1:0.25] ; SKX-NEXT: retq # sched: [7:1.00] %vec2 = load <8 x double>, <8 x double>* %vec2p %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> @@ -11506,7 +11506,7 @@ define <16 x i32> @test_16xi32_masked_shuff_mask0(<16 x i32> %vec1, <16 x i32> % ; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqd %zmm4, %zmm3, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vshufi32x4 {{.*#+}} zmm2 {%k1} = zmm0[4,5,6,7,4,5,6,7],zmm1[4,5,6,7,12,13,14,15] sched: [1:1.00] -; GENERIC-NEXT: vmovdqa64 %zmm2, %zmm0 +; GENERIC-NEXT: vmovdqa64 %zmm2, %zmm0 # sched: [1:0.33] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_16xi32_masked_shuff_mask0: @@ -11514,7 +11514,7 @@ define <16 x i32> @test_16xi32_masked_shuff_mask0(<16 x i32> %vec1, <16 x i32> % ; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] ; SKX-NEXT: vpcmpeqd %zmm4, %zmm3, %k1 # sched: [3:1.00] ; SKX-NEXT: vshufi32x4 {{.*#+}} zmm2 {%k1} = zmm0[4,5,6,7,4,5,6,7],zmm1[4,5,6,7,12,13,14,15] sched: [3:1.00] -; SKX-NEXT: vmovdqa64 %zmm2, %zmm0 +; SKX-NEXT: vmovdqa64 %zmm2, %zmm0 # sched: [1:0.25] ; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> %cmp = icmp eq <16 x i32> %mask, zeroinitializer @@ -11547,7 +11547,7 @@ define <16 x i32> @test_16xi32_masked_shuff_mask1(<16 x i32> %vec1, <16 x i32> % ; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqd %zmm4, %zmm3, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vshufi32x4 {{.*#+}} zmm2 {%k1} = zmm0[8,9,10,11,8,9,10,11],zmm1[8,9,10,11,4,5,6,7] sched: [1:1.00] -; GENERIC-NEXT: vmovdqa64 %zmm2, %zmm0 +; GENERIC-NEXT: vmovdqa64 %zmm2, %zmm0 # sched: [1:0.33] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_16xi32_masked_shuff_mask1: @@ -11555,7 +11555,7 @@ define <16 x i32> @test_16xi32_masked_shuff_mask1(<16 x i32> %vec1, <16 x i32> % ; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] ; SKX-NEXT: vpcmpeqd %zmm4, %zmm3, %k1 # sched: [3:1.00] ; SKX-NEXT: vshufi32x4 {{.*#+}} zmm2 {%k1} = zmm0[8,9,10,11,8,9,10,11],zmm1[8,9,10,11,4,5,6,7] sched: [3:1.00] -; SKX-NEXT: vmovdqa64 %zmm2, %zmm0 +; SKX-NEXT: vmovdqa64 %zmm2, %zmm0 # sched: [1:0.25] ; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> %cmp = icmp eq <16 x i32> %mask, zeroinitializer @@ -11588,7 +11588,7 @@ define <16 x i32> @test_16xi32_masked_shuff_mask2(<16 x i32> %vec1, <16 x i32> % ; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqd %zmm4, %zmm3, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vshufi32x4 {{.*#+}} zmm2 {%k1} = zmm0[4,5,6,7,8,9,10,11],zmm1[0,1,2,3,0,1,2,3] sched: [1:1.00] -; GENERIC-NEXT: vmovdqa64 %zmm2, %zmm0 +; GENERIC-NEXT: vmovdqa64 %zmm2, %zmm0 # sched: [1:0.33] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_16xi32_masked_shuff_mask2: @@ -11596,7 +11596,7 @@ define <16 x i32> @test_16xi32_masked_shuff_mask2(<16 x i32> %vec1, <16 x i32> % ; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] ; SKX-NEXT: vpcmpeqd %zmm4, %zmm3, %k1 # sched: [3:1.00] ; SKX-NEXT: vshufi32x4 {{.*#+}} zmm2 {%k1} = zmm0[4,5,6,7,8,9,10,11],zmm1[0,1,2,3,0,1,2,3] sched: [3:1.00] -; SKX-NEXT: 
vmovdqa64 %zmm2, %zmm0 +; SKX-NEXT: vmovdqa64 %zmm2, %zmm0 # sched: [1:0.25] ; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> %cmp = icmp eq <16 x i32> %mask, zeroinitializer @@ -11642,7 +11642,7 @@ define <16 x i32> @test_16xi32_masked_shuff_mask3(<16 x i32> %vec1, <16 x i32> % ; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqd %zmm4, %zmm3, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vshufi32x4 {{.*#+}} zmm2 {%k1} = zmm0[4,5,6,7,0,1,2,3],zmm1[8,9,10,11,4,5,6,7] sched: [1:1.00] -; GENERIC-NEXT: vmovdqa64 %zmm2, %zmm0 +; GENERIC-NEXT: vmovdqa64 %zmm2, %zmm0 # sched: [1:0.33] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_16xi32_masked_shuff_mask3: @@ -11650,7 +11650,7 @@ define <16 x i32> @test_16xi32_masked_shuff_mask3(<16 x i32> %vec1, <16 x i32> % ; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] ; SKX-NEXT: vpcmpeqd %zmm4, %zmm3, %k1 # sched: [3:1.00] ; SKX-NEXT: vshufi32x4 {{.*#+}} zmm2 {%k1} = zmm0[4,5,6,7,0,1,2,3],zmm1[8,9,10,11,4,5,6,7] sched: [3:1.00] -; SKX-NEXT: vmovdqa64 %zmm2, %zmm0 +; SKX-NEXT: vmovdqa64 %zmm2, %zmm0 # sched: [1:0.25] ; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> %cmp = icmp eq <16 x i32> %mask, zeroinitializer @@ -11697,7 +11697,7 @@ define <16 x i32> @test_16xi32_masked_shuff_mem_mask0(<16 x i32> %vec1, <16 x i3 ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vshufi32x4 {{.*#+}} zmm1 {%k1} = zmm0[8,9,10,11,4,5,6,7],mem[8,9,10,11,0,1,2,3] sched: [5:1.00] -; GENERIC-NEXT: vmovdqa64 %zmm1, %zmm0 +; GENERIC-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_16xi32_masked_shuff_mem_mask0: @@ -11705,7 +11705,7 @@ define <16 x i32> @test_16xi32_masked_shuff_mem_mask0(<16 x i32> %vec1, <16 x i3 ; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] ; SKX-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00] ; SKX-NEXT: vshufi32x4 {{.*#+}} zmm1 {%k1} = zmm0[8,9,10,11,4,5,6,7],mem[8,9,10,11,0,1,2,3] sched: [10:1.00] -; SKX-NEXT: vmovdqa64 %zmm1, %zmm0 +; SKX-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.25] ; SKX-NEXT: retq # sched: [7:1.00] %vec2 = load <16 x i32>, <16 x i32>* %vec2p %shuf = shufflevector <16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> @@ -11741,7 +11741,7 @@ define <16 x i32> @test_16xi32_masked_shuff_mem_mask1(<16 x i32> %vec1, <16 x i3 ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vshufi32x4 {{.*#+}} zmm1 {%k1} = zmm0[4,5,6,7,4,5,6,7],mem[0,1,2,3,8,9,10,11] sched: [5:1.00] -; GENERIC-NEXT: vmovdqa64 %zmm1, %zmm0 +; GENERIC-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_16xi32_masked_shuff_mem_mask1: @@ -11749,7 +11749,7 @@ define <16 x i32> @test_16xi32_masked_shuff_mem_mask1(<16 x i32> %vec1, <16 x i3 ; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] ; SKX-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00] ; SKX-NEXT: vshufi32x4 {{.*#+}} zmm1 {%k1} = zmm0[4,5,6,7,4,5,6,7],mem[0,1,2,3,8,9,10,11] sched: [10:1.00] -; SKX-NEXT: vmovdqa64 %zmm1, %zmm0 +; SKX-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.25] ; SKX-NEXT: retq # sched: [7:1.00] %vec2 = load <16 x i32>, <16 x i32>* %vec2p %shuf = shufflevector <16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> @@ -11785,7 +11785,7 @@ define <16 x i32> 
@test_16xi32_masked_shuff_mem_mask2(<16 x i32> %vec1, <16 x i3 ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vshufi32x4 {{.*#+}} zmm1 {%k1} = zmm0[4,5,6,7,8,9,10,11],mem[12,13,14,15,12,13,14,15] sched: [5:1.00] -; GENERIC-NEXT: vmovdqa64 %zmm1, %zmm0 +; GENERIC-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_16xi32_masked_shuff_mem_mask2: @@ -11793,7 +11793,7 @@ define <16 x i32> @test_16xi32_masked_shuff_mem_mask2(<16 x i32> %vec1, <16 x i3 ; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] ; SKX-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00] ; SKX-NEXT: vshufi32x4 {{.*#+}} zmm1 {%k1} = zmm0[4,5,6,7,8,9,10,11],mem[12,13,14,15,12,13,14,15] sched: [10:1.00] -; SKX-NEXT: vmovdqa64 %zmm1, %zmm0 +; SKX-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.25] ; SKX-NEXT: retq # sched: [7:1.00] %vec2 = load <16 x i32>, <16 x i32>* %vec2p %shuf = shufflevector <16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> @@ -11843,7 +11843,7 @@ define <16 x i32> @test_16xi32_masked_shuff_mem_mask3(<16 x i32> %vec1, <16 x i3 ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vshufi32x4 {{.*#+}} zmm1 {%k1} = zmm0[4,5,6,7,4,5,6,7],mem[4,5,6,7,12,13,14,15] sched: [5:1.00] -; GENERIC-NEXT: vmovdqa64 %zmm1, %zmm0 +; GENERIC-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_16xi32_masked_shuff_mem_mask3: @@ -11851,7 +11851,7 @@ define <16 x i32> @test_16xi32_masked_shuff_mem_mask3(<16 x i32> %vec1, <16 x i3 ; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] ; SKX-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00] ; SKX-NEXT: vshufi32x4 {{.*#+}} zmm1 {%k1} = zmm0[4,5,6,7,4,5,6,7],mem[4,5,6,7,12,13,14,15] sched: [10:1.00] -; SKX-NEXT: vmovdqa64 %zmm1, %zmm0 +; SKX-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.25] ; SKX-NEXT: retq # sched: [7:1.00] %vec2 = load <16 x i32>, <16 x i32>* %vec2p %shuf = shufflevector <16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> @@ -12294,7 +12294,7 @@ define <8 x i64> @test_8xi64_masked_shuff_mask0(<8 x i64> %vec1, <8 x i64> %vec2 ; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqq %zmm4, %zmm3, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vshufi64x2 {{.*#+}} zmm2 {%k1} = zmm0[4,5,4,5],zmm1[4,5,4,5] sched: [1:1.00] -; GENERIC-NEXT: vmovdqa64 %zmm2, %zmm0 +; GENERIC-NEXT: vmovdqa64 %zmm2, %zmm0 # sched: [1:0.33] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_8xi64_masked_shuff_mask0: @@ -12302,7 +12302,7 @@ define <8 x i64> @test_8xi64_masked_shuff_mask0(<8 x i64> %vec1, <8 x i64> %vec2 ; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] ; SKX-NEXT: vpcmpeqq %zmm4, %zmm3, %k1 # sched: [3:1.00] ; SKX-NEXT: vshufi64x2 {{.*#+}} zmm2 {%k1} = zmm0[4,5,4,5],zmm1[4,5,4,5] sched: [3:1.00] -; SKX-NEXT: vmovdqa64 %zmm2, %zmm0 +; SKX-NEXT: vmovdqa64 %zmm2, %zmm0 # sched: [1:0.25] ; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <8 x i64> %vec1, <8 x i64> %vec2, <8 x i32> %cmp = icmp eq <8 x i64> %mask, zeroinitializer @@ -12335,7 +12335,7 @@ define <8 x i64> @test_8xi64_masked_shuff_mask1(<8 x i64> %vec1, <8 x i64> %vec2 ; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqq %zmm4, %zmm3, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vshufi64x2 {{.*#+}} zmm2 {%k1} = zmm0[6,7,4,5],zmm1[2,3,4,5] sched: [1:1.00] -; GENERIC-NEXT: vmovdqa64 %zmm2, 
%zmm0 +; GENERIC-NEXT: vmovdqa64 %zmm2, %zmm0 # sched: [1:0.33] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_8xi64_masked_shuff_mask1: @@ -12343,7 +12343,7 @@ define <8 x i64> @test_8xi64_masked_shuff_mask1(<8 x i64> %vec1, <8 x i64> %vec2 ; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] ; SKX-NEXT: vpcmpeqq %zmm4, %zmm3, %k1 # sched: [3:1.00] ; SKX-NEXT: vshufi64x2 {{.*#+}} zmm2 {%k1} = zmm0[6,7,4,5],zmm1[2,3,4,5] sched: [3:1.00] -; SKX-NEXT: vmovdqa64 %zmm2, %zmm0 +; SKX-NEXT: vmovdqa64 %zmm2, %zmm0 # sched: [1:0.25] ; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <8 x i64> %vec1, <8 x i64> %vec2, <8 x i32> %cmp = icmp eq <8 x i64> %mask, zeroinitializer @@ -12376,7 +12376,7 @@ define <8 x i64> @test_8xi64_masked_shuff_mask2(<8 x i64> %vec1, <8 x i64> %vec2 ; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqq %zmm4, %zmm3, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vshufi64x2 {{.*#+}} zmm2 {%k1} = zmm0[0,1,4,5],zmm1[0,1,0,1] sched: [1:1.00] -; GENERIC-NEXT: vmovdqa64 %zmm2, %zmm0 +; GENERIC-NEXT: vmovdqa64 %zmm2, %zmm0 # sched: [1:0.33] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_8xi64_masked_shuff_mask2: @@ -12384,7 +12384,7 @@ define <8 x i64> @test_8xi64_masked_shuff_mask2(<8 x i64> %vec1, <8 x i64> %vec2 ; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] ; SKX-NEXT: vpcmpeqq %zmm4, %zmm3, %k1 # sched: [3:1.00] ; SKX-NEXT: vshufi64x2 {{.*#+}} zmm2 {%k1} = zmm0[0,1,4,5],zmm1[0,1,0,1] sched: [3:1.00] -; SKX-NEXT: vmovdqa64 %zmm2, %zmm0 +; SKX-NEXT: vmovdqa64 %zmm2, %zmm0 # sched: [1:0.25] ; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <8 x i64> %vec1, <8 x i64> %vec2, <8 x i32> %cmp = icmp eq <8 x i64> %mask, zeroinitializer @@ -12430,7 +12430,7 @@ define <8 x i64> @test_8xi64_masked_shuff_mask3(<8 x i64> %vec1, <8 x i64> %vec2 ; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqq %zmm4, %zmm3, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vshufi64x2 {{.*#+}} zmm2 {%k1} = zmm0[2,3,6,7],zmm1[4,5,2,3] sched: [1:1.00] -; GENERIC-NEXT: vmovdqa64 %zmm2, %zmm0 +; GENERIC-NEXT: vmovdqa64 %zmm2, %zmm0 # sched: [1:0.33] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_8xi64_masked_shuff_mask3: @@ -12438,7 +12438,7 @@ define <8 x i64> @test_8xi64_masked_shuff_mask3(<8 x i64> %vec1, <8 x i64> %vec2 ; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] ; SKX-NEXT: vpcmpeqq %zmm4, %zmm3, %k1 # sched: [3:1.00] ; SKX-NEXT: vshufi64x2 {{.*#+}} zmm2 {%k1} = zmm0[2,3,6,7],zmm1[4,5,2,3] sched: [3:1.00] -; SKX-NEXT: vmovdqa64 %zmm2, %zmm0 +; SKX-NEXT: vmovdqa64 %zmm2, %zmm0 # sched: [1:0.25] ; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <8 x i64> %vec1, <8 x i64> %vec2, <8 x i32> %cmp = icmp eq <8 x i64> %mask, zeroinitializer @@ -12485,7 +12485,7 @@ define <8 x i64> @test_8xi64_masked_shuff_mem_mask0(<8 x i64> %vec1, <8 x i64>* ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vshufi64x2 {{.*#+}} zmm1 {%k1} = zmm0[2,3,2,3],mem[4,5,2,3] sched: [5:1.00] -; GENERIC-NEXT: vmovdqa64 %zmm1, %zmm0 +; GENERIC-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_8xi64_masked_shuff_mem_mask0: @@ -12493,7 +12493,7 @@ define <8 x i64> @test_8xi64_masked_shuff_mem_mask0(<8 x i64> %vec1, <8 x i64>* ; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] ; SKX-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00] ; SKX-NEXT: vshufi64x2 {{.*#+}} zmm1 {%k1} 
= zmm0[2,3,2,3],mem[4,5,2,3] sched: [10:1.00] -; SKX-NEXT: vmovdqa64 %zmm1, %zmm0 +; SKX-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.25] ; SKX-NEXT: retq # sched: [7:1.00] %vec2 = load <8 x i64>, <8 x i64>* %vec2p %shuf = shufflevector <8 x i64> %vec1, <8 x i64> %vec2, <8 x i32> @@ -12529,7 +12529,7 @@ define <8 x i64> @test_8xi64_masked_shuff_mem_mask1(<8 x i64> %vec1, <8 x i64>* ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vshufi64x2 {{.*#+}} zmm1 {%k1} = zmm0[2,3,0,1],mem[0,1,0,1] sched: [5:1.00] -; GENERIC-NEXT: vmovdqa64 %zmm1, %zmm0 +; GENERIC-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_8xi64_masked_shuff_mem_mask1: @@ -12537,7 +12537,7 @@ define <8 x i64> @test_8xi64_masked_shuff_mem_mask1(<8 x i64> %vec1, <8 x i64>* ; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] ; SKX-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00] ; SKX-NEXT: vshufi64x2 {{.*#+}} zmm1 {%k1} = zmm0[2,3,0,1],mem[0,1,0,1] sched: [10:1.00] -; SKX-NEXT: vmovdqa64 %zmm1, %zmm0 +; SKX-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.25] ; SKX-NEXT: retq # sched: [7:1.00] %vec2 = load <8 x i64>, <8 x i64>* %vec2p %shuf = shufflevector <8 x i64> %vec1, <8 x i64> %vec2, <8 x i32> @@ -12573,7 +12573,7 @@ define <8 x i64> @test_8xi64_masked_shuff_mem_mask2(<8 x i64> %vec1, <8 x i64>* ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vshufi64x2 {{.*#+}} zmm1 {%k1} = zmm0[4,5,0,1],mem[2,3,2,3] sched: [5:1.00] -; GENERIC-NEXT: vmovdqa64 %zmm1, %zmm0 +; GENERIC-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_8xi64_masked_shuff_mem_mask2: @@ -12581,7 +12581,7 @@ define <8 x i64> @test_8xi64_masked_shuff_mem_mask2(<8 x i64> %vec1, <8 x i64>* ; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] ; SKX-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00] ; SKX-NEXT: vshufi64x2 {{.*#+}} zmm1 {%k1} = zmm0[4,5,0,1],mem[2,3,2,3] sched: [10:1.00] -; SKX-NEXT: vmovdqa64 %zmm1, %zmm0 +; SKX-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.25] ; SKX-NEXT: retq # sched: [7:1.00] %vec2 = load <8 x i64>, <8 x i64>* %vec2p %shuf = shufflevector <8 x i64> %vec1, <8 x i64> %vec2, <8 x i32> @@ -12631,7 +12631,7 @@ define <8 x i64> @test_8xi64_masked_shuff_mem_mask3(<8 x i64> %vec1, <8 x i64>* ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vshufi64x2 {{.*#+}} zmm1 {%k1} = zmm0[2,3,0,1],mem[6,7,2,3] sched: [5:1.00] -; GENERIC-NEXT: vmovdqa64 %zmm1, %zmm0 +; GENERIC-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_8xi64_masked_shuff_mem_mask3: @@ -12639,7 +12639,7 @@ define <8 x i64> @test_8xi64_masked_shuff_mem_mask3(<8 x i64> %vec1, <8 x i64>* ; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] ; SKX-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00] ; SKX-NEXT: vshufi64x2 {{.*#+}} zmm1 {%k1} = zmm0[2,3,0,1],mem[6,7,2,3] sched: [10:1.00] -; SKX-NEXT: vmovdqa64 %zmm1, %zmm0 +; SKX-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.25] ; SKX-NEXT: retq # sched: [7:1.00] %vec2 = load <8 x i64>, <8 x i64>* %vec2p %shuf = shufflevector <8 x i64> %vec1, <8 x i64> %vec2, <8 x i32> @@ -13476,7 +13476,7 @@ define <16 x float> @test_16xfloat_masked_unpack_low_mask0(<16 x float> %vec1, < ; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 
# sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqd %zmm4, %zmm3, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vunpcklps {{.*#+}} zmm2 {%k1} = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13] sched: [3:1.00] -; GENERIC-NEXT: vmovaps %zmm2, %zmm0 +; GENERIC-NEXT: vmovaps %zmm2, %zmm0 # sched: [1:0.33] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_16xfloat_masked_unpack_low_mask0: @@ -13517,7 +13517,7 @@ define <16 x float> @test_16xfloat_masked_unpack_low_mask1(<16 x float> %vec1, < ; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqd %zmm4, %zmm3, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vunpcklps {{.*#+}} zmm2 {%k1} = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13] sched: [3:1.00] -; GENERIC-NEXT: vmovaps %zmm2, %zmm0 +; GENERIC-NEXT: vmovaps %zmm2, %zmm0 # sched: [1:0.33] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_16xfloat_masked_unpack_low_mask1: @@ -13558,7 +13558,7 @@ define <16 x float> @test_16xfloat_masked_unpack_low_mask2(<16 x float> %vec1, < ; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqd %zmm4, %zmm3, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vunpcklps {{.*#+}} zmm2 {%k1} = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13] sched: [3:1.00] -; GENERIC-NEXT: vmovaps %zmm2, %zmm0 +; GENERIC-NEXT: vmovaps %zmm2, %zmm0 # sched: [1:0.33] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_16xfloat_masked_unpack_low_mask2: @@ -13612,7 +13612,7 @@ define <16 x float> @test_16xfloat_masked_unpack_low_mask3(<16 x float> %vec1, < ; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqd %zmm4, %zmm3, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vunpcklps {{.*#+}} zmm2 {%k1} = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13] sched: [3:1.00] -; GENERIC-NEXT: vmovaps %zmm2, %zmm0 +; GENERIC-NEXT: vmovaps %zmm2, %zmm0 # sched: [1:0.33] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_16xfloat_masked_unpack_low_mask3: @@ -13667,7 +13667,7 @@ define <16 x float> @test_16xfloat_masked_unpack_low_mem_mask0(<16 x float> %vec ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vunpcklps {{.*#+}} zmm1 {%k1} = zmm0[0],mem[0],zmm0[1],mem[1],zmm0[4],mem[4],zmm0[5],mem[5],zmm0[8],mem[8],zmm0[9],mem[9],zmm0[12],mem[12],zmm0[13],mem[13] sched: [7:1.00] -; GENERIC-NEXT: vmovaps %zmm1, %zmm0 +; GENERIC-NEXT: vmovaps %zmm1, %zmm0 # sched: [1:0.33] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_16xfloat_masked_unpack_low_mem_mask0: @@ -13711,7 +13711,7 @@ define <16 x float> @test_16xfloat_masked_unpack_low_mem_mask1(<16 x float> %vec ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vunpcklps {{.*#+}} zmm1 {%k1} = zmm0[0],mem[0],zmm0[1],mem[1],zmm0[4],mem[4],zmm0[5],mem[5],zmm0[8],mem[8],zmm0[9],mem[9],zmm0[12],mem[12],zmm0[13],mem[13] sched: [7:1.00] -; GENERIC-NEXT: vmovaps %zmm1, %zmm0 +; GENERIC-NEXT: vmovaps %zmm1, %zmm0 # sched: [1:0.33] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_16xfloat_masked_unpack_low_mem_mask1: @@ -13755,7 +13755,7 @@ define <16 x float> 
@test_16xfloat_masked_unpack_low_mem_mask2(<16 x float> %vec ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vunpcklps {{.*#+}} zmm1 {%k1} = zmm0[0],mem[0],zmm0[1],mem[1],zmm0[4],mem[4],zmm0[5],mem[5],zmm0[8],mem[8],zmm0[9],mem[9],zmm0[12],mem[12],zmm0[13],mem[13] sched: [7:1.00] -; GENERIC-NEXT: vmovaps %zmm1, %zmm0 +; GENERIC-NEXT: vmovaps %zmm1, %zmm0 # sched: [1:0.33] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_16xfloat_masked_unpack_low_mem_mask2: @@ -13813,7 +13813,7 @@ define <16 x float> @test_16xfloat_masked_unpack_low_mem_mask3(<16 x float> %vec ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vunpcklps {{.*#+}} zmm1 {%k1} = zmm0[0],mem[0],zmm0[1],mem[1],zmm0[4],mem[4],zmm0[5],mem[5],zmm0[8],mem[8],zmm0[9],mem[9],zmm0[12],mem[12],zmm0[13],mem[13] sched: [7:1.00] -; GENERIC-NEXT: vmovaps %zmm1, %zmm0 +; GENERIC-NEXT: vmovaps %zmm1, %zmm0 # sched: [1:0.33] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_16xfloat_masked_unpack_low_mem_mask3: @@ -14461,7 +14461,7 @@ define <8 x double> @test_8xdouble_masked_unpack_low_mask0(<8 x double> %vec1, < ; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqq %zmm4, %zmm3, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vunpcklpd {{.*#+}} zmm2 {%k1} = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] sched: [3:1.00] -; GENERIC-NEXT: vmovapd %zmm2, %zmm0 +; GENERIC-NEXT: vmovapd %zmm2, %zmm0 # sched: [1:0.33] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_8xdouble_masked_unpack_low_mask0: @@ -14469,7 +14469,7 @@ define <8 x double> @test_8xdouble_masked_unpack_low_mask0(<8 x double> %vec1, < ; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] ; SKX-NEXT: vpcmpeqq %zmm4, %zmm3, %k1 # sched: [3:1.00] ; SKX-NEXT: vunpcklpd {{.*#+}} zmm2 {%k1} = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] sched: [1:1.00] -; SKX-NEXT: vmovapd %zmm2, %zmm0 +; SKX-NEXT: vmovapd %zmm2, %zmm0 # sched: [1:0.25] ; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> %cmp = icmp eq <8 x i64> %mask, zeroinitializer @@ -14502,7 +14502,7 @@ define <8 x double> @test_8xdouble_masked_unpack_low_mask1(<8 x double> %vec1, < ; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqq %zmm4, %zmm3, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vunpcklpd {{.*#+}} zmm2 {%k1} = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] sched: [3:1.00] -; GENERIC-NEXT: vmovapd %zmm2, %zmm0 +; GENERIC-NEXT: vmovapd %zmm2, %zmm0 # sched: [1:0.33] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_8xdouble_masked_unpack_low_mask1: @@ -14510,7 +14510,7 @@ define <8 x double> @test_8xdouble_masked_unpack_low_mask1(<8 x double> %vec1, < ; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] ; SKX-NEXT: vpcmpeqq %zmm4, %zmm3, %k1 # sched: [3:1.00] ; SKX-NEXT: vunpcklpd {{.*#+}} zmm2 {%k1} = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] sched: [1:1.00] -; SKX-NEXT: vmovapd %zmm2, %zmm0 +; SKX-NEXT: vmovapd %zmm2, %zmm0 # sched: [1:0.25] ; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> %cmp = icmp eq <8 x i64> %mask, zeroinitializer @@ -14543,7 +14543,7 @@ define <8 x double> @test_8xdouble_masked_unpack_low_mask2(<8 x double> %vec1, < ; GENERIC-NEXT: vpxor %xmm4, %xmm4, 
%xmm4 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqq %zmm4, %zmm3, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vunpcklpd {{.*#+}} zmm2 {%k1} = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] sched: [3:1.00] -; GENERIC-NEXT: vmovapd %zmm2, %zmm0 +; GENERIC-NEXT: vmovapd %zmm2, %zmm0 # sched: [1:0.33] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_8xdouble_masked_unpack_low_mask2: @@ -14551,7 +14551,7 @@ define <8 x double> @test_8xdouble_masked_unpack_low_mask2(<8 x double> %vec1, < ; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] ; SKX-NEXT: vpcmpeqq %zmm4, %zmm3, %k1 # sched: [3:1.00] ; SKX-NEXT: vunpcklpd {{.*#+}} zmm2 {%k1} = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] sched: [1:1.00] -; SKX-NEXT: vmovapd %zmm2, %zmm0 +; SKX-NEXT: vmovapd %zmm2, %zmm0 # sched: [1:0.25] ; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> %cmp = icmp eq <8 x i64> %mask, zeroinitializer @@ -14597,7 +14597,7 @@ define <8 x double> @test_8xdouble_masked_unpack_low_mask3(<8 x double> %vec1, < ; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqq %zmm4, %zmm3, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vunpcklpd {{.*#+}} zmm2 {%k1} = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] sched: [3:1.00] -; GENERIC-NEXT: vmovapd %zmm2, %zmm0 +; GENERIC-NEXT: vmovapd %zmm2, %zmm0 # sched: [1:0.33] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_8xdouble_masked_unpack_low_mask3: @@ -14605,7 +14605,7 @@ define <8 x double> @test_8xdouble_masked_unpack_low_mask3(<8 x double> %vec1, < ; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] ; SKX-NEXT: vpcmpeqq %zmm4, %zmm3, %k1 # sched: [3:1.00] ; SKX-NEXT: vunpcklpd {{.*#+}} zmm2 {%k1} = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] sched: [1:1.00] -; SKX-NEXT: vmovapd %zmm2, %zmm0 +; SKX-NEXT: vmovapd %zmm2, %zmm0 # sched: [1:0.25] ; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> %cmp = icmp eq <8 x i64> %mask, zeroinitializer @@ -14652,7 +14652,7 @@ define <8 x double> @test_8xdouble_masked_unpack_low_mem_mask0(<8 x double> %vec ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vunpcklpd {{.*#+}} zmm1 {%k1} = zmm0[0],mem[0],zmm0[2],mem[2],zmm0[4],mem[4],zmm0[6],mem[6] sched: [7:1.00] -; GENERIC-NEXT: vmovapd %zmm1, %zmm0 +; GENERIC-NEXT: vmovapd %zmm1, %zmm0 # sched: [1:0.33] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_8xdouble_masked_unpack_low_mem_mask0: @@ -14660,7 +14660,7 @@ define <8 x double> @test_8xdouble_masked_unpack_low_mem_mask0(<8 x double> %vec ; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] ; SKX-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00] ; SKX-NEXT: vunpcklpd {{.*#+}} zmm1 {%k1} = zmm0[0],mem[0],zmm0[2],mem[2],zmm0[4],mem[4],zmm0[6],mem[6] sched: [8:1.00] -; SKX-NEXT: vmovapd %zmm1, %zmm0 +; SKX-NEXT: vmovapd %zmm1, %zmm0 # sched: [1:0.25] ; SKX-NEXT: retq # sched: [7:1.00] %vec2 = load <8 x double>, <8 x double>* %vec2p %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> @@ -14696,7 +14696,7 @@ define <8 x double> @test_8xdouble_masked_unpack_low_mem_mask1(<8 x double> %vec ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vunpcklpd {{.*#+}} zmm1 {%k1} = 
zmm0[0],mem[0],zmm0[2],mem[2],zmm0[4],mem[4],zmm0[6],mem[6] sched: [7:1.00] -; GENERIC-NEXT: vmovapd %zmm1, %zmm0 +; GENERIC-NEXT: vmovapd %zmm1, %zmm0 # sched: [1:0.33] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_8xdouble_masked_unpack_low_mem_mask1: @@ -14704,7 +14704,7 @@ define <8 x double> @test_8xdouble_masked_unpack_low_mem_mask1(<8 x double> %vec ; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] ; SKX-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00] ; SKX-NEXT: vunpcklpd {{.*#+}} zmm1 {%k1} = zmm0[0],mem[0],zmm0[2],mem[2],zmm0[4],mem[4],zmm0[6],mem[6] sched: [8:1.00] -; SKX-NEXT: vmovapd %zmm1, %zmm0 +; SKX-NEXT: vmovapd %zmm1, %zmm0 # sched: [1:0.25] ; SKX-NEXT: retq # sched: [7:1.00] %vec2 = load <8 x double>, <8 x double>* %vec2p %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> @@ -14740,7 +14740,7 @@ define <8 x double> @test_8xdouble_masked_unpack_low_mem_mask2(<8 x double> %vec ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vunpcklpd {{.*#+}} zmm1 {%k1} = zmm0[0],mem[0],zmm0[2],mem[2],zmm0[4],mem[4],zmm0[6],mem[6] sched: [7:1.00] -; GENERIC-NEXT: vmovapd %zmm1, %zmm0 +; GENERIC-NEXT: vmovapd %zmm1, %zmm0 # sched: [1:0.33] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_8xdouble_masked_unpack_low_mem_mask2: @@ -14748,7 +14748,7 @@ define <8 x double> @test_8xdouble_masked_unpack_low_mem_mask2(<8 x double> %vec ; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] ; SKX-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00] ; SKX-NEXT: vunpcklpd {{.*#+}} zmm1 {%k1} = zmm0[0],mem[0],zmm0[2],mem[2],zmm0[4],mem[4],zmm0[6],mem[6] sched: [8:1.00] -; SKX-NEXT: vmovapd %zmm1, %zmm0 +; SKX-NEXT: vmovapd %zmm1, %zmm0 # sched: [1:0.25] ; SKX-NEXT: retq # sched: [7:1.00] %vec2 = load <8 x double>, <8 x double>* %vec2p %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> @@ -14798,7 +14798,7 @@ define <8 x double> @test_8xdouble_masked_unpack_low_mem_mask3(<8 x double> %vec ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vunpcklpd {{.*#+}} zmm1 {%k1} = zmm0[0],mem[0],zmm0[2],mem[2],zmm0[4],mem[4],zmm0[6],mem[6] sched: [7:1.00] -; GENERIC-NEXT: vmovapd %zmm1, %zmm0 +; GENERIC-NEXT: vmovapd %zmm1, %zmm0 # sched: [1:0.33] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_8xdouble_masked_unpack_low_mem_mask3: @@ -14806,7 +14806,7 @@ define <8 x double> @test_8xdouble_masked_unpack_low_mem_mask3(<8 x double> %vec ; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] ; SKX-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00] ; SKX-NEXT: vunpcklpd {{.*#+}} zmm1 {%k1} = zmm0[0],mem[0],zmm0[2],mem[2],zmm0[4],mem[4],zmm0[6],mem[6] sched: [8:1.00] -; SKX-NEXT: vmovapd %zmm1, %zmm0 +; SKX-NEXT: vmovapd %zmm1, %zmm0 # sched: [1:0.25] ; SKX-NEXT: retq # sched: [7:1.00] %vec2 = load <8 x double>, <8 x double>* %vec2p %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> @@ -15643,7 +15643,7 @@ define <16 x float> @test_16xfloat_masked_unpack_high_mask0(<16 x float> %vec1, ; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqd %zmm4, %zmm3, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vunpckhps {{.*#+}} zmm2 {%k1} = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15] sched: [3:1.00] -; GENERIC-NEXT: vmovaps %zmm2, %zmm0 +; 
GENERIC-NEXT: vmovaps %zmm2, %zmm0 # sched: [1:0.33]
 ; GENERIC-NEXT: retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_16xfloat_masked_unpack_high_mask0:
@@ -15684,7 +15684,7 @@ define <16 x float> @test_16xfloat_masked_unpack_high_mask1(<16 x float> %vec1,
 ; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
 ; GENERIC-NEXT: vpcmpeqd %zmm4, %zmm3, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT: vunpckhps {{.*#+}} zmm2 {%k1} = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15] sched: [3:1.00]
-; GENERIC-NEXT: vmovaps %zmm2, %zmm0
+; GENERIC-NEXT: vmovaps %zmm2, %zmm0 # sched: [1:0.33]
 ; GENERIC-NEXT: retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_16xfloat_masked_unpack_high_mask1:
@@ -15725,7 +15725,7 @@ define <16 x float> @test_16xfloat_masked_unpack_high_mask2(<16 x float> %vec1,
 ; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
 ; GENERIC-NEXT: vpcmpeqd %zmm4, %zmm3, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT: vunpckhps {{.*#+}} zmm2 {%k1} = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15] sched: [3:1.00]
-; GENERIC-NEXT: vmovaps %zmm2, %zmm0
+; GENERIC-NEXT: vmovaps %zmm2, %zmm0 # sched: [1:0.33]
 ; GENERIC-NEXT: retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_16xfloat_masked_unpack_high_mask2:
@@ -15779,7 +15779,7 @@ define <16 x float> @test_16xfloat_masked_unpack_high_mask3(<16 x float> %vec1,
 ; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
 ; GENERIC-NEXT: vpcmpeqd %zmm4, %zmm3, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT: vunpckhps {{.*#+}} zmm2 {%k1} = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15] sched: [3:1.00]
-; GENERIC-NEXT: vmovaps %zmm2, %zmm0
+; GENERIC-NEXT: vmovaps %zmm2, %zmm0 # sched: [1:0.33]
 ; GENERIC-NEXT: retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_16xfloat_masked_unpack_high_mask3:
@@ -15834,7 +15834,7 @@ define <16 x float> @test_16xfloat_masked_unpack_high_mem_mask0(<16 x float> %ve
 ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
 ; GENERIC-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT: vunpckhps {{.*#+}} zmm1 {%k1} = zmm0[2],mem[2],zmm0[3],mem[3],zmm0[6],mem[6],zmm0[7],mem[7],zmm0[10],mem[10],zmm0[11],mem[11],zmm0[14],mem[14],zmm0[15],mem[15] sched: [7:1.00]
-; GENERIC-NEXT: vmovaps %zmm1, %zmm0
+; GENERIC-NEXT: vmovaps %zmm1, %zmm0 # sched: [1:0.33]
 ; GENERIC-NEXT: retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_16xfloat_masked_unpack_high_mem_mask0:
@@ -15878,7 +15878,7 @@ define <16 x float> @test_16xfloat_masked_unpack_high_mem_mask1(<16 x float> %ve
 ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
 ; GENERIC-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT: vunpckhps {{.*#+}} zmm1 {%k1} = zmm0[2],mem[2],zmm0[3],mem[3],zmm0[6],mem[6],zmm0[7],mem[7],zmm0[10],mem[10],zmm0[11],mem[11],zmm0[14],mem[14],zmm0[15],mem[15] sched: [7:1.00]
-; GENERIC-NEXT: vmovaps %zmm1, %zmm0
+; GENERIC-NEXT: vmovaps %zmm1, %zmm0 # sched: [1:0.33]
 ; GENERIC-NEXT: retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_16xfloat_masked_unpack_high_mem_mask1:
@@ -15922,7 +15922,7 @@ define <16 x float> @test_16xfloat_masked_unpack_high_mem_mask2(<16 x float> %ve
 ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
 ; GENERIC-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT: vunpckhps {{.*#+}} zmm1 {%k1} = zmm0[2],mem[2],zmm0[3],mem[3],zmm0[6],mem[6],zmm0[7],mem[7],zmm0[10],mem[10],zmm0[11],mem[11],zmm0[14],mem[14],zmm0[15],mem[15] sched: [7:1.00]
-; GENERIC-NEXT: vmovaps %zmm1, %zmm0
+; GENERIC-NEXT: vmovaps %zmm1, %zmm0 # sched: [1:0.33]
 ; GENERIC-NEXT: retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_16xfloat_masked_unpack_high_mem_mask2:
@@ -15980,7 +15980,7 @@ define <16 x float> @test_16xfloat_masked_unpack_high_mem_mask3(<16 x float> %ve
 ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
 ; GENERIC-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT: vunpckhps {{.*#+}} zmm1 {%k1} = zmm0[2],mem[2],zmm0[3],mem[3],zmm0[6],mem[6],zmm0[7],mem[7],zmm0[10],mem[10],zmm0[11],mem[11],zmm0[14],mem[14],zmm0[15],mem[15] sched: [7:1.00]
-; GENERIC-NEXT: vmovaps %zmm1, %zmm0
+; GENERIC-NEXT: vmovaps %zmm1, %zmm0 # sched: [1:0.33]
 ; GENERIC-NEXT: retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_16xfloat_masked_unpack_high_mem_mask3:
@@ -16628,7 +16628,7 @@ define <8 x double> @test_8xdouble_masked_unpack_high_mask0(<8 x double> %vec1,
 ; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
 ; GENERIC-NEXT: vpcmpeqq %zmm4, %zmm3, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT: vunpckhpd {{.*#+}} zmm2 {%k1} = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7] sched: [3:1.00]
-; GENERIC-NEXT: vmovapd %zmm2, %zmm0
+; GENERIC-NEXT: vmovapd %zmm2, %zmm0 # sched: [1:0.33]
 ; GENERIC-NEXT: retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_8xdouble_masked_unpack_high_mask0:
@@ -16636,7 +16636,7 @@ define <8 x double> @test_8xdouble_masked_unpack_high_mask0(<8 x double> %vec1,
 ; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
 ; SKX-NEXT: vpcmpeqq %zmm4, %zmm3, %k1 # sched: [3:1.00]
 ; SKX-NEXT: vunpckhpd {{.*#+}} zmm2 {%k1} = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7] sched: [1:1.00]
-; SKX-NEXT: vmovapd %zmm2, %zmm0
+; SKX-NEXT: vmovapd %zmm2, %zmm0 # sched: [1:0.25]
 ; SKX-NEXT: retq # sched: [7:1.00]
 %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32>
 %cmp = icmp eq <8 x i64> %mask, zeroinitializer
@@ -16669,7 +16669,7 @@ define <8 x double> @test_8xdouble_masked_unpack_high_mask1(<8 x double> %vec1,
 ; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
 ; GENERIC-NEXT: vpcmpeqq %zmm4, %zmm3, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT: vunpckhpd {{.*#+}} zmm2 {%k1} = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7] sched: [3:1.00]
-; GENERIC-NEXT: vmovapd %zmm2, %zmm0
+; GENERIC-NEXT: vmovapd %zmm2, %zmm0 # sched: [1:0.33]
 ; GENERIC-NEXT: retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_8xdouble_masked_unpack_high_mask1:
@@ -16677,7 +16677,7 @@ define <8 x double> @test_8xdouble_masked_unpack_high_mask1(<8 x double> %vec1,
 ; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
 ; SKX-NEXT: vpcmpeqq %zmm4, %zmm3, %k1 # sched: [3:1.00]
 ; SKX-NEXT: vunpckhpd {{.*#+}} zmm2 {%k1} = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7] sched: [1:1.00]
-; SKX-NEXT: vmovapd %zmm2, %zmm0
+; SKX-NEXT: vmovapd %zmm2, %zmm0 # sched: [1:0.25]
 ; SKX-NEXT: retq # sched: [7:1.00]
 %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32>
 %cmp = icmp eq <8 x i64> %mask, zeroinitializer
@@ -16710,7 +16710,7 @@ define <8 x double> @test_8xdouble_masked_unpack_high_mask2(<8 x double> %vec1,
 ; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
 ; GENERIC-NEXT: vpcmpeqq %zmm4, %zmm3, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT: vunpckhpd {{.*#+}} zmm2 {%k1} = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7] sched: [3:1.00]
-; GENERIC-NEXT: vmovapd %zmm2, %zmm0
+; GENERIC-NEXT: vmovapd %zmm2, %zmm0 # sched: [1:0.33]
 ; GENERIC-NEXT: retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_8xdouble_masked_unpack_high_mask2:
@@ -16718,7 +16718,7 @@ define <8 x double> @test_8xdouble_masked_unpack_high_mask2(<8 x double> %vec1,
 ; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
 ; SKX-NEXT: vpcmpeqq %zmm4, %zmm3, %k1 # sched: [3:1.00]
 ; SKX-NEXT: vunpckhpd {{.*#+}} zmm2 {%k1} = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7] sched: [1:1.00]
-; SKX-NEXT: vmovapd %zmm2, %zmm0
+; SKX-NEXT: vmovapd %zmm2, %zmm0 # sched: [1:0.25]
 ; SKX-NEXT: retq # sched: [7:1.00]
 %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32>
 %cmp = icmp eq <8 x i64> %mask, zeroinitializer
@@ -16764,7 +16764,7 @@ define <8 x double> @test_8xdouble_masked_unpack_high_mask3(<8 x double> %vec1,
 ; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
 ; GENERIC-NEXT: vpcmpeqq %zmm4, %zmm3, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT: vunpckhpd {{.*#+}} zmm2 {%k1} = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7] sched: [3:1.00]
-; GENERIC-NEXT: vmovapd %zmm2, %zmm0
+; GENERIC-NEXT: vmovapd %zmm2, %zmm0 # sched: [1:0.33]
 ; GENERIC-NEXT: retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_8xdouble_masked_unpack_high_mask3:
@@ -16772,7 +16772,7 @@ define <8 x double> @test_8xdouble_masked_unpack_high_mask3(<8 x double> %vec1,
 ; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
 ; SKX-NEXT: vpcmpeqq %zmm4, %zmm3, %k1 # sched: [3:1.00]
 ; SKX-NEXT: vunpckhpd {{.*#+}} zmm2 {%k1} = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7] sched: [1:1.00]
-; SKX-NEXT: vmovapd %zmm2, %zmm0
+; SKX-NEXT: vmovapd %zmm2, %zmm0 # sched: [1:0.25]
 ; SKX-NEXT: retq # sched: [7:1.00]
 %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32>
 %cmp = icmp eq <8 x i64> %mask, zeroinitializer
@@ -16819,7 +16819,7 @@ define <8 x double> @test_8xdouble_masked_unpack_high_mem_mask0(<8 x double> %ve
 ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
 ; GENERIC-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT: vunpckhpd {{.*#+}} zmm1 {%k1} = zmm0[1],mem[1],zmm0[3],mem[3],zmm0[5],mem[5],zmm0[7],mem[7] sched: [7:1.00]
-; GENERIC-NEXT: vmovapd %zmm1, %zmm0
+; GENERIC-NEXT: vmovapd %zmm1, %zmm0 # sched: [1:0.33]
 ; GENERIC-NEXT: retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_8xdouble_masked_unpack_high_mem_mask0:
@@ -16827,7 +16827,7 @@ define <8 x double> @test_8xdouble_masked_unpack_high_mem_mask0(<8 x double> %ve
 ; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
 ; SKX-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT: vunpckhpd {{.*#+}} zmm1 {%k1} = zmm0[1],mem[1],zmm0[3],mem[3],zmm0[5],mem[5],zmm0[7],mem[7] sched: [8:1.00]
-; SKX-NEXT: vmovapd %zmm1, %zmm0
+; SKX-NEXT: vmovapd %zmm1, %zmm0 # sched: [1:0.25]
 ; SKX-NEXT: retq # sched: [7:1.00]
 %vec2 = load <8 x double>, <8 x double>* %vec2p
 %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32>
@@ -16863,7 +16863,7 @@ define <8 x double> @test_8xdouble_masked_unpack_high_mem_mask1(<8 x double> %ve
 ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
 ; GENERIC-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT: vunpckhpd {{.*#+}} zmm1 {%k1} = zmm0[1],mem[1],zmm0[3],mem[3],zmm0[5],mem[5],zmm0[7],mem[7] sched: [7:1.00]
-; GENERIC-NEXT: vmovapd %zmm1, %zmm0
+; GENERIC-NEXT: vmovapd %zmm1, %zmm0 # sched: [1:0.33]
 ; GENERIC-NEXT: retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_8xdouble_masked_unpack_high_mem_mask1:
@@ -16871,7 +16871,7 @@ define <8 x double> @test_8xdouble_masked_unpack_high_mem_mask1(<8 x double> %ve
 ; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
 ; SKX-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT: vunpckhpd {{.*#+}} zmm1 {%k1} = zmm0[1],mem[1],zmm0[3],mem[3],zmm0[5],mem[5],zmm0[7],mem[7] sched: [8:1.00]
-; SKX-NEXT: vmovapd %zmm1, %zmm0
+; SKX-NEXT: vmovapd %zmm1, %zmm0 # sched: [1:0.25]
 ; SKX-NEXT: retq # sched: [7:1.00]
 %vec2 = load <8 x double>, <8 x double>* %vec2p
 %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32>
@@ -16907,7 +16907,7 @@ define <8 x double> @test_8xdouble_masked_unpack_high_mem_mask2(<8 x double> %ve
 ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
 ; GENERIC-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT: vunpckhpd {{.*#+}} zmm1 {%k1} = zmm0[1],mem[1],zmm0[3],mem[3],zmm0[5],mem[5],zmm0[7],mem[7] sched: [7:1.00]
-; GENERIC-NEXT: vmovapd %zmm1, %zmm0
+; GENERIC-NEXT: vmovapd %zmm1, %zmm0 # sched: [1:0.33]
 ; GENERIC-NEXT: retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_8xdouble_masked_unpack_high_mem_mask2:
@@ -16915,7 +16915,7 @@ define <8 x double> @test_8xdouble_masked_unpack_high_mem_mask2(<8 x double> %ve
 ; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
 ; SKX-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT: vunpckhpd {{.*#+}} zmm1 {%k1} = zmm0[1],mem[1],zmm0[3],mem[3],zmm0[5],mem[5],zmm0[7],mem[7] sched: [8:1.00]
-; SKX-NEXT: vmovapd %zmm1, %zmm0
+; SKX-NEXT: vmovapd %zmm1, %zmm0 # sched: [1:0.25]
 ; SKX-NEXT: retq # sched: [7:1.00]
 %vec2 = load <8 x double>, <8 x double>* %vec2p
 %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32>
@@ -16965,7 +16965,7 @@ define <8 x double> @test_8xdouble_masked_unpack_high_mem_mask3(<8 x double> %ve
 ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
 ; GENERIC-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
 ; GENERIC-NEXT: vunpckhpd {{.*#+}} zmm1 {%k1} = zmm0[1],mem[1],zmm0[3],mem[3],zmm0[5],mem[5],zmm0[7],mem[7] sched: [7:1.00]
-; GENERIC-NEXT: vmovapd %zmm1, %zmm0
+; GENERIC-NEXT: vmovapd %zmm1, %zmm0 # sched: [1:0.33]
 ; GENERIC-NEXT: retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_8xdouble_masked_unpack_high_mem_mask3:
@@ -16973,7 +16973,7 @@ define <8 x double> @test_8xdouble_masked_unpack_high_mem_mask3(<8 x double> %ve
 ; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
 ; SKX-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
 ; SKX-NEXT: vunpckhpd {{.*#+}} zmm1 {%k1} = zmm0[1],mem[1],zmm0[3],mem[3],zmm0[5],mem[5],zmm0[7],mem[7] sched: [8:1.00]
-; SKX-NEXT: vmovapd %zmm1, %zmm0
+; SKX-NEXT: vmovapd %zmm1, %zmm0 # sched: [1:0.25]
 ; SKX-NEXT: retq # sched: [7:1.00]
 %vec2 = load <8 x double>, <8 x double>* %vec2p
 %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32>