From: Simon Pilgrim Date: Wed, 6 Dec 2017 19:36:00 +0000 (+0000) Subject: [X86][AVX512] Tag mask reg op instruction scheduler classes X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=5671e842df359cb433edac803a8850492ad5f9c3;p=llvm [X86][AVX512] Tag mask reg op instruction scheduler classes git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@319945 91177308-0d34-0410-b5e6-96231b3b80d8 --- diff --git a/lib/Target/X86/X86InstrAVX512.td b/lib/Target/X86/X86InstrAVX512.td index c4c22ccbb19..ccbedf1df63 100644 --- a/lib/Target/X86/X86InstrAVX512.td +++ b/lib/Target/X86/X86InstrAVX512.td @@ -2621,15 +2621,16 @@ defm VFPCLASS : avx512_fp_fpclass_all<"vfpclass", 0x66, 0x67, X86Vfpclass, multiclass avx512_mask_mov opc_kk, bits<8> opc_km, bits<8> opc_mk, string OpcodeStr, RegisterClass KRC, ValueType vvt, X86MemOperand x86memop> { - let hasSideEffects = 0 in + let hasSideEffects = 0, SchedRW = [WriteMove] in def kk : I; + !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), [], + IIC_SSE_MOVDQ>; def km : I; + [(set KRC:$dst, (vvt (load addr:$src)))], IIC_SSE_MOVDQ>; def mk : I; + [(store KRC:$src, addr:$dst)], IIC_SSE_MOVDQ>; } multiclass avx512_mask_mov_gpr opc_kr, bits<8> opc_rk, @@ -2637,9 +2638,11 @@ multiclass avx512_mask_mov_gpr opc_kr, bits<8> opc_rk, RegisterClass KRC, RegisterClass GRC> { let hasSideEffects = 0 in { def kr : I; + !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), [], + IIC_SSE_MOVD_ToGP>, Sched<[WriteMove]>; def rk : I; + !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), [], + IIC_SSE_MOVD_ToGP>, Sched<[WriteMove]>; } } @@ -2805,26 +2808,27 @@ let Predicates = [HasAVX512] in { // - KNOT multiclass avx512_mask_unop opc, string OpcodeStr, RegisterClass KRC, SDPatternOperator OpNode, - Predicate prd> { + OpndItins itins, Predicate prd> { let Predicates = [prd] in def rr : I; + [(set KRC:$dst, (OpNode KRC:$src))], itins.rr>, + Sched<[itins.Sched]>; } multiclass avx512_mask_unop_all opc, string OpcodeStr, - SDPatternOperator OpNode> { + SDPatternOperator OpNode, OpndItins itins> { defm B : avx512_mask_unop, VEX, PD; + itins, HasDQI>, VEX, PD; defm W : avx512_mask_unop, VEX, PS; + itins, HasAVX512>, VEX, PS; defm D : avx512_mask_unop, VEX, PD, VEX_W; + itins, HasBWI>, VEX, PD, VEX_W; defm Q : avx512_mask_unop, VEX, PS, VEX_W; + itins, HasBWI>, VEX, PS, VEX_W; } -defm KNOT : avx512_mask_unop_all<0x44, "knot", vnot>; +defm KNOT : avx512_mask_unop_all<0x44, "knot", vnot, SSE_BIT_ITINS_P>; // KNL does not support KMOVB, 8-bit mask is promoted to 16-bit let Predicates = [HasAVX512, NoDQI] in @@ -2840,25 +2844,26 @@ def : Pat<(vnot VK2:$src), // - KAND, KANDN, KOR, KXNOR, KXOR multiclass avx512_mask_binop opc, string OpcodeStr, RegisterClass KRC, SDPatternOperator OpNode, - Predicate prd, bit IsCommutable> { + OpndItins itins, Predicate prd, bit IsCommutable> { let Predicates = [prd], isCommutable = IsCommutable in def rr : I; + [(set KRC:$dst, (OpNode KRC:$src1, KRC:$src2))], itins.rr>, + Sched<[itins.Sched]>; } multiclass avx512_mask_binop_all opc, string OpcodeStr, - SDPatternOperator OpNode, bit IsCommutable, - Predicate prdW = HasAVX512> { + SDPatternOperator OpNode, OpndItins itins, + bit IsCommutable, Predicate prdW = HasAVX512> { defm B : avx512_mask_binop, VEX_4V, VEX_L, PD; + itins, HasDQI, IsCommutable>, VEX_4V, VEX_L, PD; defm W : avx512_mask_binop, VEX_4V, VEX_L, PS; + itins, prdW, IsCommutable>, VEX_4V, VEX_L, PS; defm D : avx512_mask_binop, VEX_4V, VEX_L, VEX_W, PD; + itins, HasBWI, IsCommutable>, VEX_4V, VEX_L, VEX_W, PD; defm Q : avx512_mask_binop, VEX_4V, VEX_L, VEX_W, PS; + itins, HasBWI, IsCommutable>, VEX_4V, VEX_L, VEX_W, PS; } def andn : PatFrag<(ops node:$i0, node:$i1), (and (not node:$i0), node:$i1)>; @@ -2867,12 +2872,12 @@ def xnor : PatFrag<(ops node:$i0, node:$i1), (not (xor node:$i0, node:$i1))>; def vandn : PatFrag<(ops node:$i0, node:$i1), (and (vnot node:$i0), node:$i1)>; def vxnor : PatFrag<(ops node:$i0, node:$i1), (vnot (xor node:$i0, node:$i1))>; -defm KAND : avx512_mask_binop_all<0x41, "kand", and, 1>; -defm KOR : avx512_mask_binop_all<0x45, "kor", or, 1>; -defm KXNOR : avx512_mask_binop_all<0x46, "kxnor", vxnor, 1>; -defm KXOR : avx512_mask_binop_all<0x47, "kxor", xor, 1>; -defm KANDN : avx512_mask_binop_all<0x42, "kandn", vandn, 0>; -defm KADD : avx512_mask_binop_all<0x4A, "kadd", add, 1, HasDQI>; +defm KAND : avx512_mask_binop_all<0x41, "kand", and, SSE_BIT_ITINS_P, 1>; +defm KOR : avx512_mask_binop_all<0x45, "kor", or, SSE_BIT_ITINS_P, 1>; +defm KXNOR : avx512_mask_binop_all<0x46, "kxnor", vxnor, SSE_BIT_ITINS_P, 1>; +defm KXOR : avx512_mask_binop_all<0x47, "kxor", xor, SSE_BIT_ITINS_P, 1>; +defm KANDN : avx512_mask_binop_all<0x42, "kandn", vandn, SSE_BIT_ITINS_P, 0>; +defm KADD : avx512_mask_binop_all<0x4A, "kadd", add, SSE_BIT_ITINS_P, 1, HasDQI>; multiclass avx512_binop_pat { @@ -2907,13 +2912,13 @@ defm : avx512_binop_pat; // Mask unpacking multiclass avx512_mask_unpck { + RegisterClass KRCSrc, OpndItins itins, Predicate prd> { let Predicates = [prd] in { let hasSideEffects = 0 in def rr : I<0x4b, MRMSrcReg, (outs KRC:$dst), (ins KRC:$src1, KRC:$src2), - "kunpck"#Suffix#"\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>, - VEX_4V, VEX_L; + "kunpck"#Suffix#"\t{$src2, $src1, $dst|$dst, $src1, $src2}", [], + itins.rr>, VEX_4V, VEX_L, Sched<[itins.Sched]>; def : Pat<(VT (concat_vectors KRCSrc:$src1, KRCSrc:$src2)), (!cast(NAME##rr) @@ -2922,61 +2927,63 @@ multiclass avx512_mask_unpck, PD; -defm KUNPCKWD : avx512_mask_unpck<"wd", VK32, v32i1, VK16, HasBWI>, PS; -defm KUNPCKDQ : avx512_mask_unpck<"dq", VK64, v64i1, VK32, HasBWI>, PS, VEX_W; +defm KUNPCKBW : avx512_mask_unpck<"bw", VK16, v16i1, VK8, SSE_UNPCK, HasAVX512>, PD; +defm KUNPCKWD : avx512_mask_unpck<"wd", VK32, v32i1, VK16, SSE_UNPCK, HasBWI>, PS; +defm KUNPCKDQ : avx512_mask_unpck<"dq", VK64, v64i1, VK32, SSE_UNPCK, HasBWI>, PS, VEX_W; // Mask bit testing multiclass avx512_mask_testop opc, string OpcodeStr, RegisterClass KRC, - SDNode OpNode, Predicate prd> { + SDNode OpNode, OpndItins itins, Predicate prd> { let Predicates = [prd], Defs = [EFLAGS] in def rr : I; + [(set EFLAGS, (OpNode KRC:$src1, KRC:$src2))], itins.rr>, + Sched<[itins.Sched]>; } multiclass avx512_mask_testop_w opc, string OpcodeStr, SDNode OpNode, - Predicate prdW = HasAVX512> { - defm B : avx512_mask_testop, + OpndItins itins, Predicate prdW = HasAVX512> { + defm B : avx512_mask_testop, VEX, PD; - defm W : avx512_mask_testop, + defm W : avx512_mask_testop, VEX, PS; - defm Q : avx512_mask_testop, + defm Q : avx512_mask_testop, VEX, PS, VEX_W; - defm D : avx512_mask_testop, + defm D : avx512_mask_testop, VEX, PD, VEX_W; } -defm KORTEST : avx512_mask_testop_w<0x98, "kortest", X86kortest>; -defm KTEST : avx512_mask_testop_w<0x99, "ktest", X86ktest, HasDQI>; +defm KORTEST : avx512_mask_testop_w<0x98, "kortest", X86kortest, SSE_PTEST>; +defm KTEST : avx512_mask_testop_w<0x99, "ktest", X86ktest, SSE_PTEST, HasDQI>; // Mask shift multiclass avx512_mask_shiftop opc, string OpcodeStr, RegisterClass KRC, - SDNode OpNode> { + SDNode OpNode, OpndItins itins> { let Predicates = [HasAVX512] in def ri : Ii8; + [(set KRC:$dst, (OpNode KRC:$src, (i8 imm:$imm)))], + itins.rr>, Sched<[itins.Sched]>; } multiclass avx512_mask_shiftop_w opc1, bits<8> opc2, string OpcodeStr, - SDNode OpNode> { - defm W : avx512_mask_shiftop, - VEX, TAPD, VEX_W; + SDNode OpNode, OpndItins itins> { + defm W : avx512_mask_shiftop, VEX, TAPD, VEX_W; let Predicates = [HasDQI] in - defm B : avx512_mask_shiftop, - VEX, TAPD; + defm B : avx512_mask_shiftop, VEX, TAPD; let Predicates = [HasBWI] in { - defm Q : avx512_mask_shiftop, - VEX, TAPD, VEX_W; - defm D : avx512_mask_shiftop, - VEX, TAPD; + defm Q : avx512_mask_shiftop, VEX, TAPD, VEX_W; + defm D : avx512_mask_shiftop, VEX, TAPD; } } -defm KSHIFTL : avx512_mask_shiftop_w<0x32, 0x33, "kshiftl", X86kshiftl>; -defm KSHIFTR : avx512_mask_shiftop_w<0x30, 0x31, "kshiftr", X86kshiftr>; +defm KSHIFTL : avx512_mask_shiftop_w<0x32, 0x33, "kshiftl", X86kshiftl, SSE_PSHUF>; +defm KSHIFTR : avx512_mask_shiftop_w<0x30, 0x31, "kshiftr", X86kshiftr, SSE_PSHUF>; multiclass axv512_icmp_packed_no_vlx_lowering { def : Pat<(v8i1 (OpNode (v8i32 VR256X:$src1), (v8i32 VR256X:$src2))), @@ -3023,7 +3030,8 @@ let Predicates = [HasAVX512, NoVLX] in { // Mask setting all 0s or 1s multiclass avx512_mask_setop { let Predicates = [HasAVX512] in - let isReMaterializable = 1, isAsCheapAsAMove = 1, isPseudo = 1 in + let isReMaterializable = 1, isAsCheapAsAMove = 1, isPseudo = 1, + SchedRW = [WriteZero] in def #NAME# : I<0, Pseudo, (outs KRC:$dst), (ins), "", [(set KRC:$dst, (VT Val))]>; } diff --git a/lib/Target/X86/X86InstrSSE.td b/lib/Target/X86/X86InstrSSE.td index 927d020b26b..a0c907f5f42 100644 --- a/lib/Target/X86/X86InstrSSE.td +++ b/lib/Target/X86/X86InstrSSE.td @@ -6172,6 +6172,11 @@ let Predicates = [UseSSE41] in { // SSE4.1 - Packed Bit Test //===----------------------------------------------------------------------===// +let Sched = WriteVecLogic in +def SSE_PTEST : OpndItins< + IIC_SSE_INTALU_P_RR, IIC_SSE_INTALU_P_RM +>; + // ptest instruction we'll lower to this in X86ISelLowering primarily from // the intel intrinsic that corresponds to this. let Defs = [EFLAGS], Predicates = [HasAVX] in { diff --git a/test/CodeGen/X86/avx512-schedule.ll b/test/CodeGen/X86/avx512-schedule.ll index b447a0dcbea..589bf6c86a5 100755 --- a/test/CodeGen/X86/avx512-schedule.ll +++ b/test/CodeGen/X86/avx512-schedule.ll @@ -1164,7 +1164,7 @@ define i32 @test3(float %a, float %b) { ; GENERIC-LABEL: test3: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vcmpeqss %xmm1, %xmm0, %k0 # sched: [3:1.00] -; GENERIC-NEXT: kmovd %k0, %eax +; GENERIC-NEXT: kmovd %k0, %eax # sched: [1:0.33] ; GENERIC-NEXT: movzbl %al, %eax # sched: [1:0.33] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -1719,7 +1719,7 @@ define <8 x double> @sito8f64(<8 x i32> %a) { define <8 x double> @i32to8f64_mask(<8 x double> %a, <8 x i32> %b, i8 %c) nounwind { ; GENERIC-LABEL: i32to8f64_mask: ; GENERIC: # %bb.0: -; GENERIC-NEXT: kmovd %edi, %k1 +; GENERIC-NEXT: kmovd %edi, %k1 # sched: [1:0.33] ; GENERIC-NEXT: vcvtdq2pd %ymm1, %zmm0 {%k1} # sched: [4:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -1741,7 +1741,7 @@ define <8 x double> @i32to8f64_mask(<8 x double> %a, <8 x i32> %b, i8 %c) nounwi define <8 x double> @sito8f64_maskz(<8 x i32> %a, i8 %b) nounwind { ; GENERIC-LABEL: sito8f64_maskz: ; GENERIC: # %bb.0: -; GENERIC-NEXT: kmovd %edi, %k1 +; GENERIC-NEXT: kmovd %edi, %k1 # sched: [1:0.33] ; GENERIC-NEXT: vcvtdq2pd %ymm0, %zmm0 {%k1} {z} # sched: [4:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -2193,7 +2193,7 @@ define <16 x float> @ulto16f32(<16 x i64> %a) { define <8 x double> @uito8f64_mask(<8 x double> %a, <8 x i32> %b, i8 %c) nounwind { ; GENERIC-LABEL: uito8f64_mask: ; GENERIC: # %bb.0: -; GENERIC-NEXT: kmovd %edi, %k1 +; GENERIC-NEXT: kmovd %edi, %k1 # sched: [1:0.33] ; GENERIC-NEXT: vcvtudq2pd %ymm1, %zmm0 {%k1} # sched: [4:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -2215,7 +2215,7 @@ define <8 x double> @uito8f64_mask(<8 x double> %a, <8 x i32> %b, i8 %c) nounwin define <8 x double> @uito8f64_maskz(<8 x i32> %a, i8 %b) nounwind { ; GENERIC-LABEL: uito8f64_maskz: ; GENERIC: # %bb.0: -; GENERIC-NEXT: kmovd %edi, %k1 +; GENERIC-NEXT: kmovd %edi, %k1 # sched: [1:0.33] ; GENERIC-NEXT: vcvtudq2pd %ymm0, %zmm0 {%k1} {z} # sched: [4:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -2855,7 +2855,7 @@ define <16 x double> @ubto16f64(<16 x i32> %a) { ; GENERIC-NEXT: movl {{.*}}(%rip), %eax # sched: [5:0.50] ; GENERIC-NEXT: vpbroadcastd %eax, %ymm0 {%k1} {z} # sched: [1:1.00] ; GENERIC-NEXT: vcvtdq2pd %ymm0, %zmm0 # sched: [4:1.00] -; GENERIC-NEXT: kshiftrw $8, %k1, %k1 +; GENERIC-NEXT: kshiftrw $8, %k1, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vpbroadcastd %eax, %ymm1 {%k1} {z} # sched: [1:1.00] ; GENERIC-NEXT: vcvtdq2pd %ymm1, %zmm1 # sched: [4:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] @@ -4298,7 +4298,7 @@ define <8 x double> @fpext_test(<8 x float> %a) nounwind readnone { define <16 x i32> @zext_16i1_to_16xi32(i16 %b) { ; GENERIC-LABEL: zext_16i1_to_16xi32: ; GENERIC: # %bb.0: -; GENERIC-NEXT: kmovd %edi, %k1 +; GENERIC-NEXT: kmovd %edi, %k1 # sched: [1:0.33] ; GENERIC-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z} # sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -4315,7 +4315,7 @@ define <16 x i32> @zext_16i1_to_16xi32(i16 %b) { define <8 x i64> @zext_8i1_to_8xi64(i8 %b) { ; GENERIC-LABEL: zext_8i1_to_8xi64: ; GENERIC: # %bb.0: -; GENERIC-NEXT: kmovd %edi, %k1 +; GENERIC-NEXT: kmovd %edi, %k1 # sched: [1:0.33] ; GENERIC-NEXT: vpbroadcastq {{.*}}(%rip), %zmm0 {%k1} {z} # sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -4334,7 +4334,7 @@ define i16 @trunc_16i8_to_16i1(<16 x i8> %a) { ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpsllw $7, %xmm0, %xmm0 # sched: [1:1.00] ; GENERIC-NEXT: vpmovb2m %xmm0, %k0 # sched: [1:0.33] -; GENERIC-NEXT: kmovd %k0, %eax +; GENERIC-NEXT: kmovd %k0, %eax # sched: [1:0.33] ; GENERIC-NEXT: # kill: %ax %ax %eax ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -4355,7 +4355,7 @@ define i16 @trunc_16i32_to_16i1(<16 x i32> %a) { ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpslld $31, %zmm0, %zmm0 # sched: [3:1.00] ; GENERIC-NEXT: vptestmd %zmm0, %zmm0, %k0 # sched: [1:1.00] -; GENERIC-NEXT: kmovd %k0, %eax +; GENERIC-NEXT: kmovd %k0, %eax # sched: [1:0.33] ; GENERIC-NEXT: # kill: %ax %ax %eax ; GENERIC-NEXT: vzeroupper ; GENERIC-NEXT: retq # sched: [1:1.00] @@ -4404,7 +4404,7 @@ define i8 @trunc_8i16_to_8i1(<8 x i16> %a) { ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpsllw $15, %xmm0, %xmm0 # sched: [1:1.00] ; GENERIC-NEXT: vpmovw2m %xmm0, %k0 # sched: [1:0.33] -; GENERIC-NEXT: kmovd %k0, %eax +; GENERIC-NEXT: kmovd %k0, %eax # sched: [1:0.33] ; GENERIC-NEXT: # kill: %al %al %eax ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -4443,13 +4443,13 @@ define i16 @trunc_i32_to_i1(i32 %a) { ; GENERIC-LABEL: trunc_i32_to_i1: ; GENERIC: # %bb.0: ; GENERIC-NEXT: movw $-4, %ax # sched: [1:0.33] -; GENERIC-NEXT: kmovd %eax, %k0 -; GENERIC-NEXT: kshiftrw $1, %k0, %k0 -; GENERIC-NEXT: kshiftlw $1, %k0, %k0 +; GENERIC-NEXT: kmovd %eax, %k0 # sched: [1:0.33] +; GENERIC-NEXT: kshiftrw $1, %k0, %k0 # sched: [1:1.00] +; GENERIC-NEXT: kshiftlw $1, %k0, %k0 # sched: [1:1.00] ; GENERIC-NEXT: andl $1, %edi # sched: [1:0.33] -; GENERIC-NEXT: kmovw %edi, %k1 -; GENERIC-NEXT: korw %k1, %k0, %k0 -; GENERIC-NEXT: kmovd %k0, %eax +; GENERIC-NEXT: kmovw %edi, %k1 # sched: [1:0.33] +; GENERIC-NEXT: korw %k1, %k0, %k0 # sched: [1:1.00] +; GENERIC-NEXT: kmovd %k0, %eax # sched: [1:0.33] ; GENERIC-NEXT: # kill: %ax %ax %eax ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -4550,7 +4550,7 @@ define <64 x i16> @test21(<64 x i16> %x , <64 x i1> %mask) nounwind readnone { ; GENERIC-NEXT: vpsllw $7, %zmm2, %zmm2 # sched: [3:1.00] ; GENERIC-NEXT: vpmovb2m %zmm2, %k1 # sched: [1:0.33] ; GENERIC-NEXT: vmovdqu16 %zmm0, %zmm0 {%k1} {z} # sched: [1:0.33] -; GENERIC-NEXT: kshiftrq $32, %k1, %k1 +; GENERIC-NEXT: kshiftrq $32, %k1, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vmovdqu16 %zmm1, %zmm1 {%k1} {z} # sched: [1:0.33] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -5357,7 +5357,7 @@ define <32 x i16> @xor_v32i16(<32 x i16> %a, <32 x i16> %b) { define <16 x float> @masked_and_v16f32(<16 x float> %a, <16 x float> %b, <16 x float> %passThru, i16 %mask, <16 x float> %c) { ; GENERIC-LABEL: masked_and_v16f32: ; GENERIC: # %bb.0: -; GENERIC-NEXT: kmovd %edi, %k1 +; GENERIC-NEXT: kmovd %edi, %k1 # sched: [1:0.33] ; GENERIC-NEXT: vandps %zmm1, %zmm0, %zmm2 {%k1} # sched: [3:1.00] ; GENERIC-NEXT: vaddps %zmm2, %zmm3, %zmm0 # sched: [3:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] @@ -5382,7 +5382,7 @@ define <16 x float> @masked_and_v16f32(<16 x float> %a, <16 x float> %b, <16 x f define <16 x float> @masked_or_v16f32(<16 x float> %a, <16 x float> %b, <16 x float> %passThru, i16 %mask, <16 x float> %c) { ; GENERIC-LABEL: masked_or_v16f32: ; GENERIC: # %bb.0: -; GENERIC-NEXT: kmovd %edi, %k1 +; GENERIC-NEXT: kmovd %edi, %k1 # sched: [1:0.33] ; GENERIC-NEXT: vandps %zmm1, %zmm0, %zmm2 {%k1} # sched: [3:1.00] ; GENERIC-NEXT: vaddps %zmm2, %zmm3, %zmm0 # sched: [3:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] @@ -5407,7 +5407,7 @@ define <16 x float> @masked_or_v16f32(<16 x float> %a, <16 x float> %b, <16 x fl define <16 x float> @masked_xor_v16f32(<16 x float> %a, <16 x float> %b, <16 x float> %passThru, i16 %mask, <16 x float> %c) { ; GENERIC-LABEL: masked_xor_v16f32: ; GENERIC: # %bb.0: -; GENERIC-NEXT: kmovd %edi, %k1 +; GENERIC-NEXT: kmovd %edi, %k1 # sched: [1:0.33] ; GENERIC-NEXT: vandps %zmm1, %zmm0, %zmm2 {%k1} # sched: [3:1.00] ; GENERIC-NEXT: vaddps %zmm2, %zmm3, %zmm0 # sched: [3:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] @@ -5432,7 +5432,7 @@ define <16 x float> @masked_xor_v16f32(<16 x float> %a, <16 x float> %b, <16 x f define <8 x double> @masked_and_v8f64(<8 x double> %a, <8 x double> %b, <8 x double> %passThru, i8 %mask, <8 x double> %c) { ; GENERIC-LABEL: masked_and_v8f64: ; GENERIC: # %bb.0: -; GENERIC-NEXT: kmovd %edi, %k1 +; GENERIC-NEXT: kmovd %edi, %k1 # sched: [1:0.33] ; GENERIC-NEXT: vandpd %zmm1, %zmm0, %zmm2 {%k1} # sched: [3:1.00] ; GENERIC-NEXT: vaddpd %zmm2, %zmm3, %zmm0 # sched: [3:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] @@ -5457,7 +5457,7 @@ define <8 x double> @masked_and_v8f64(<8 x double> %a, <8 x double> %b, <8 x dou define <8 x double> @masked_or_v8f64(<8 x double> %a, <8 x double> %b, <8 x double> %passThru, i8 %mask, <8 x double> %c) { ; GENERIC-LABEL: masked_or_v8f64: ; GENERIC: # %bb.0: -; GENERIC-NEXT: kmovd %edi, %k1 +; GENERIC-NEXT: kmovd %edi, %k1 # sched: [1:0.33] ; GENERIC-NEXT: vandpd %zmm1, %zmm0, %zmm2 {%k1} # sched: [3:1.00] ; GENERIC-NEXT: vaddpd %zmm2, %zmm3, %zmm0 # sched: [3:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] @@ -5482,7 +5482,7 @@ define <8 x double> @masked_or_v8f64(<8 x double> %a, <8 x double> %b, <8 x doub define <8 x double> @masked_xor_v8f64(<8 x double> %a, <8 x double> %b, <8 x double> %passThru, i8 %mask, <8 x double> %c) { ; GENERIC-LABEL: masked_xor_v8f64: ; GENERIC: # %bb.0: -; GENERIC-NEXT: kmovd %edi, %k1 +; GENERIC-NEXT: kmovd %edi, %k1 # sched: [1:0.33] ; GENERIC-NEXT: vandpd %zmm1, %zmm0, %zmm2 {%k1} # sched: [3:1.00] ; GENERIC-NEXT: vaddpd %zmm2, %zmm3, %zmm0 # sched: [3:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] @@ -5507,7 +5507,7 @@ define <8 x double> @masked_xor_v8f64(<8 x double> %a, <8 x double> %b, <8 x dou define <8 x i64> @test_mm512_mask_and_epi32(<8 x i64> %__src, i16 zeroext %__k, <8 x i64> %__a, <8 x i64> %__b) { ; GENERIC-LABEL: test_mm512_mask_and_epi32: ; GENERIC: # %bb.0: # %entry -; GENERIC-NEXT: kmovd %edi, %k1 +; GENERIC-NEXT: kmovd %edi, %k1 # sched: [1:0.33] ; GENERIC-NEXT: vandps %zmm2, %zmm1, %zmm0 {%k1} # sched: [3:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -5529,7 +5529,7 @@ entry: define <8 x i64> @test_mm512_mask_or_epi32(<8 x i64> %__src, i16 zeroext %__k, <8 x i64> %__a, <8 x i64> %__b) { ; GENERIC-LABEL: test_mm512_mask_or_epi32: ; GENERIC: # %bb.0: # %entry -; GENERIC-NEXT: kmovd %edi, %k1 +; GENERIC-NEXT: kmovd %edi, %k1 # sched: [1:0.33] ; GENERIC-NEXT: vorps %zmm2, %zmm1, %zmm0 {%k1} # sched: [3:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -5551,7 +5551,7 @@ entry: define <8 x i64> @test_mm512_mask_xor_epi32(<8 x i64> %__src, i16 zeroext %__k, <8 x i64> %__a, <8 x i64> %__b) { ; GENERIC-LABEL: test_mm512_mask_xor_epi32: ; GENERIC: # %bb.0: # %entry -; GENERIC-NEXT: kmovd %edi, %k1 +; GENERIC-NEXT: kmovd %edi, %k1 # sched: [1:0.33] ; GENERIC-NEXT: vxorps %zmm2, %zmm1, %zmm0 {%k1} # sched: [3:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -5573,7 +5573,7 @@ entry: define <8 x double> @test_mm512_mask_xor_pd(<8 x double> %__W, i8 zeroext %__U, <8 x double> %__A, <8 x double> %__B) { ; GENERIC-LABEL: test_mm512_mask_xor_pd: ; GENERIC: # %bb.0: # %entry -; GENERIC-NEXT: kmovd %edi, %k1 +; GENERIC-NEXT: kmovd %edi, %k1 # sched: [1:0.33] ; GENERIC-NEXT: vxorpd %zmm2, %zmm1, %zmm0 {%k1} # sched: [3:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -5595,7 +5595,7 @@ entry: define <8 x double> @test_mm512_maskz_xor_pd(i8 zeroext %__U, <8 x double> %__A, <8 x double> %__B) { ; GENERIC-LABEL: test_mm512_maskz_xor_pd: ; GENERIC: # %bb.0: # %entry -; GENERIC-NEXT: kmovd %edi, %k1 +; GENERIC-NEXT: kmovd %edi, %k1 # sched: [1:0.33] ; GENERIC-NEXT: vxorpd %zmm1, %zmm0, %zmm0 {%k1} {z} # sched: [3:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -5617,7 +5617,7 @@ entry: define <16 x float> @test_mm512_mask_xor_ps(<16 x float> %__W, i16 zeroext %__U, <16 x float> %__A, <16 x float> %__B) { ; GENERIC-LABEL: test_mm512_mask_xor_ps: ; GENERIC: # %bb.0: # %entry -; GENERIC-NEXT: kmovd %edi, %k1 +; GENERIC-NEXT: kmovd %edi, %k1 # sched: [1:0.33] ; GENERIC-NEXT: vxorps %zmm2, %zmm1, %zmm0 {%k1} # sched: [3:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -5639,7 +5639,7 @@ entry: define <16 x float> @test_mm512_maskz_xor_ps(i16 zeroext %__U, <16 x float> %__A, <16 x float> %__B) { ; GENERIC-LABEL: test_mm512_maskz_xor_ps: ; GENERIC: # %bb.0: # %entry -; GENERIC-NEXT: kmovd %edi, %k1 +; GENERIC-NEXT: kmovd %edi, %k1 # sched: [1:0.33] ; GENERIC-NEXT: vxorps %zmm1, %zmm0, %zmm0 {%k1} {z} # sched: [3:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -5661,7 +5661,7 @@ entry: define <8 x double> @test_mm512_mask_or_pd(<8 x double> %__W, i8 zeroext %__U, <8 x double> %__A, <8 x double> %__B) { ; GENERIC-LABEL: test_mm512_mask_or_pd: ; GENERIC: # %bb.0: # %entry -; GENERIC-NEXT: kmovd %edi, %k1 +; GENERIC-NEXT: kmovd %edi, %k1 # sched: [1:0.33] ; GENERIC-NEXT: vorpd %zmm1, %zmm2, %zmm0 {%k1} # sched: [3:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -5683,7 +5683,7 @@ entry: define <8 x double> @test_mm512_maskz_or_pd(i8 zeroext %__U, <8 x double> %__A, <8 x double> %__B) { ; GENERIC-LABEL: test_mm512_maskz_or_pd: ; GENERIC: # %bb.0: # %entry -; GENERIC-NEXT: kmovd %edi, %k1 +; GENERIC-NEXT: kmovd %edi, %k1 # sched: [1:0.33] ; GENERIC-NEXT: vorpd %zmm0, %zmm1, %zmm0 {%k1} {z} # sched: [3:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -5705,7 +5705,7 @@ entry: define <16 x float> @test_mm512_mask_or_ps(<16 x float> %__W, i16 zeroext %__U, <16 x float> %__A, <16 x float> %__B) { ; GENERIC-LABEL: test_mm512_mask_or_ps: ; GENERIC: # %bb.0: # %entry -; GENERIC-NEXT: kmovd %edi, %k1 +; GENERIC-NEXT: kmovd %edi, %k1 # sched: [1:0.33] ; GENERIC-NEXT: vorps %zmm1, %zmm2, %zmm0 {%k1} # sched: [3:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -5727,7 +5727,7 @@ entry: define <16 x float> @test_mm512_maskz_or_ps(i16 zeroext %__U, <16 x float> %__A, <16 x float> %__B) { ; GENERIC-LABEL: test_mm512_maskz_or_ps: ; GENERIC: # %bb.0: # %entry -; GENERIC-NEXT: kmovd %edi, %k1 +; GENERIC-NEXT: kmovd %edi, %k1 # sched: [1:0.33] ; GENERIC-NEXT: vorps %zmm0, %zmm1, %zmm0 {%k1} {z} # sched: [3:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -5749,7 +5749,7 @@ entry: define <8 x double> @test_mm512_mask_and_pd(<8 x double> %__W, i8 zeroext %__U, <8 x double> %__A, <8 x double> %__B) { ; GENERIC-LABEL: test_mm512_mask_and_pd: ; GENERIC: # %bb.0: # %entry -; GENERIC-NEXT: kmovd %edi, %k1 +; GENERIC-NEXT: kmovd %edi, %k1 # sched: [1:0.33] ; GENERIC-NEXT: vandpd %zmm1, %zmm2, %zmm0 {%k1} # sched: [3:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -5771,7 +5771,7 @@ entry: define <8 x double> @test_mm512_maskz_and_pd(i8 zeroext %__U, <8 x double> %__A, <8 x double> %__B) { ; GENERIC-LABEL: test_mm512_maskz_and_pd: ; GENERIC: # %bb.0: # %entry -; GENERIC-NEXT: kmovd %edi, %k1 +; GENERIC-NEXT: kmovd %edi, %k1 # sched: [1:0.33] ; GENERIC-NEXT: vandpd %zmm0, %zmm1, %zmm0 {%k1} {z} # sched: [3:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -5793,7 +5793,7 @@ entry: define <16 x float> @test_mm512_mask_and_ps(<16 x float> %__W, i16 zeroext %__U, <16 x float> %__A, <16 x float> %__B) { ; GENERIC-LABEL: test_mm512_mask_and_ps: ; GENERIC: # %bb.0: # %entry -; GENERIC-NEXT: kmovd %edi, %k1 +; GENERIC-NEXT: kmovd %edi, %k1 # sched: [1:0.33] ; GENERIC-NEXT: vandps %zmm1, %zmm2, %zmm0 {%k1} # sched: [3:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -5815,7 +5815,7 @@ entry: define <16 x float> @test_mm512_maskz_and_ps(i16 zeroext %__U, <16 x float> %__A, <16 x float> %__B) { ; GENERIC-LABEL: test_mm512_maskz_and_ps: ; GENERIC: # %bb.0: # %entry -; GENERIC-NEXT: kmovd %edi, %k1 +; GENERIC-NEXT: kmovd %edi, %k1 # sched: [1:0.33] ; GENERIC-NEXT: vandps %zmm0, %zmm1, %zmm0 {%k1} {z} # sched: [3:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -5837,7 +5837,7 @@ entry: define <8 x double> @test_mm512_mask_andnot_pd(<8 x double> %__W, i8 zeroext %__U, <8 x double> %__A, <8 x double> %__B) { ; GENERIC-LABEL: test_mm512_mask_andnot_pd: ; GENERIC: # %bb.0: # %entry -; GENERIC-NEXT: kmovd %edi, %k1 +; GENERIC-NEXT: kmovd %edi, %k1 # sched: [1:0.33] ; GENERIC-NEXT: vandnpd %zmm2, %zmm1, %zmm0 {%k1} # sched: [3:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -5860,7 +5860,7 @@ entry: define <8 x double> @test_mm512_maskz_andnot_pd(i8 zeroext %__U, <8 x double> %__A, <8 x double> %__B) { ; GENERIC-LABEL: test_mm512_maskz_andnot_pd: ; GENERIC: # %bb.0: # %entry -; GENERIC-NEXT: kmovd %edi, %k1 +; GENERIC-NEXT: kmovd %edi, %k1 # sched: [1:0.33] ; GENERIC-NEXT: vandnpd %zmm1, %zmm0, %zmm0 {%k1} {z} # sched: [3:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -5883,7 +5883,7 @@ entry: define <16 x float> @test_mm512_mask_andnot_ps(<16 x float> %__W, i16 zeroext %__U, <16 x float> %__A, <16 x float> %__B) { ; GENERIC-LABEL: test_mm512_mask_andnot_ps: ; GENERIC: # %bb.0: # %entry -; GENERIC-NEXT: kmovd %edi, %k1 +; GENERIC-NEXT: kmovd %edi, %k1 # sched: [1:0.33] ; GENERIC-NEXT: vandnps %zmm2, %zmm1, %zmm0 {%k1} # sched: [3:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -5906,7 +5906,7 @@ entry: define <16 x float> @test_mm512_maskz_andnot_ps(i16 zeroext %__U, <16 x float> %__A, <16 x float> %__B) { ; GENERIC-LABEL: test_mm512_maskz_andnot_ps: ; GENERIC: # %bb.0: # %entry -; GENERIC-NEXT: kmovd %edi, %k1 +; GENERIC-NEXT: kmovd %edi, %k1 # sched: [1:0.33] ; GENERIC-NEXT: vandnps %zmm1, %zmm0, %zmm0 {%k1} {z} # sched: [3:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -6737,9 +6737,9 @@ define <8 x double> @mov_test47(i8 * %addr, <8 x double> %mask1) { define i16 @mask16(i16 %x) { ; GENERIC-LABEL: mask16: ; GENERIC: # %bb.0: -; GENERIC-NEXT: kmovd %edi, %k0 -; GENERIC-NEXT: knotw %k0, %k0 -; GENERIC-NEXT: kmovd %k0, %eax +; GENERIC-NEXT: kmovd %edi, %k0 # sched: [1:0.33] +; GENERIC-NEXT: knotw %k0, %k0 # sched: [1:1.00] +; GENERIC-NEXT: kmovd %k0, %eax # sched: [1:0.33] ; GENERIC-NEXT: # kill: %ax %ax %eax ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -6759,9 +6759,9 @@ define i16 @mask16(i16 %x) { define i32 @mask16_zext(i16 %x) { ; GENERIC-LABEL: mask16_zext: ; GENERIC: # %bb.0: -; GENERIC-NEXT: kmovd %edi, %k0 -; GENERIC-NEXT: knotw %k0, %k0 -; GENERIC-NEXT: kmovw %k0, %eax +; GENERIC-NEXT: kmovd %edi, %k0 # sched: [1:0.33] +; GENERIC-NEXT: knotw %k0, %k0 # sched: [1:1.00] +; GENERIC-NEXT: kmovw %k0, %eax # sched: [1:0.33] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: mask16_zext: @@ -6780,9 +6780,9 @@ define i32 @mask16_zext(i16 %x) { define i8 @mask8(i8 %x) { ; GENERIC-LABEL: mask8: ; GENERIC: # %bb.0: -; GENERIC-NEXT: kmovd %edi, %k0 -; GENERIC-NEXT: knotb %k0, %k0 -; GENERIC-NEXT: kmovd %k0, %eax +; GENERIC-NEXT: kmovd %edi, %k0 # sched: [1:0.33] +; GENERIC-NEXT: knotb %k0, %k0 # sched: [1:1.00] +; GENERIC-NEXT: kmovd %k0, %eax # sched: [1:0.33] ; GENERIC-NEXT: # kill: %al %al %eax ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -6802,9 +6802,9 @@ define i8 @mask8(i8 %x) { define i32 @mask8_zext(i8 %x) { ; GENERIC-LABEL: mask8_zext: ; GENERIC: # %bb.0: -; GENERIC-NEXT: kmovd %edi, %k0 -; GENERIC-NEXT: knotb %k0, %k0 -; GENERIC-NEXT: kmovb %k0, %eax +; GENERIC-NEXT: kmovd %edi, %k0 # sched: [1:0.33] +; GENERIC-NEXT: knotb %k0, %k0 # sched: [1:1.00] +; GENERIC-NEXT: kmovb %k0, %eax # sched: [1:0.33] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: mask8_zext: @@ -6824,7 +6824,7 @@ define void @mask16_mem(i16* %ptr) { ; GENERIC-LABEL: mask16_mem: ; GENERIC: # %bb.0: ; GENERIC-NEXT: kmovw (%rdi), %k0 -; GENERIC-NEXT: knotw %k0, %k0 +; GENERIC-NEXT: knotw %k0, %k0 # sched: [1:1.00] ; GENERIC-NEXT: kmovw %k0, (%rdi) ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -6846,7 +6846,7 @@ define void @mask8_mem(i8* %ptr) { ; GENERIC-LABEL: mask8_mem: ; GENERIC: # %bb.0: ; GENERIC-NEXT: kmovb (%rdi), %k0 -; GENERIC-NEXT: knotb %k0, %k0 +; GENERIC-NEXT: knotb %k0, %k0 # sched: [1:1.00] ; GENERIC-NEXT: kmovb %k0, (%rdi) ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -6896,10 +6896,10 @@ define i16 @mand16_mem(<16 x i1>* %x, <16 x i1>* %y) { ; GENERIC: # %bb.0: ; GENERIC-NEXT: kmovw (%rdi), %k0 ; GENERIC-NEXT: kmovw (%rsi), %k1 -; GENERIC-NEXT: kandw %k1, %k0, %k2 -; GENERIC-NEXT: kxorw %k1, %k0, %k0 -; GENERIC-NEXT: korw %k0, %k2, %k0 -; GENERIC-NEXT: kmovd %k0, %eax +; GENERIC-NEXT: kandw %k1, %k0, %k2 # sched: [1:1.00] +; GENERIC-NEXT: kxorw %k1, %k0, %k0 # sched: [1:1.00] +; GENERIC-NEXT: korw %k0, %k2, %k0 # sched: [1:1.00] +; GENERIC-NEXT: kmovd %k0, %eax # sched: [1:0.33] ; GENERIC-NEXT: # kill: %ax %ax %eax ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -6925,9 +6925,9 @@ define i16 @mand16_mem(<16 x i1>* %x, <16 x i1>* %y) { define i8 @shuf_test1(i16 %v) nounwind { ; GENERIC-LABEL: shuf_test1: ; GENERIC: # %bb.0: -; GENERIC-NEXT: kmovd %edi, %k0 -; GENERIC-NEXT: kshiftrw $8, %k0, %k0 -; GENERIC-NEXT: kmovd %k0, %eax +; GENERIC-NEXT: kmovd %edi, %k0 # sched: [1:0.33] +; GENERIC-NEXT: kshiftrw $8, %k0, %k0 # sched: [1:1.00] +; GENERIC-NEXT: kmovd %k0, %eax # sched: [1:0.33] ; GENERIC-NEXT: # kill: %al %al %eax ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -6948,9 +6948,9 @@ define i32 @zext_test1(<16 x i32> %a, <16 x i32> %b) { ; GENERIC-LABEL: zext_test1: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpcmpnleud %zmm1, %zmm0, %k0 # sched: [3:1.00] -; GENERIC-NEXT: kshiftlw $10, %k0, %k0 -; GENERIC-NEXT: kshiftrw $15, %k0, %k0 -; GENERIC-NEXT: kmovd %k0, %eax +; GENERIC-NEXT: kshiftlw $10, %k0, %k0 # sched: [1:1.00] +; GENERIC-NEXT: kshiftrw $15, %k0, %k0 # sched: [1:1.00] +; GENERIC-NEXT: kmovd %k0, %eax # sched: [1:0.33] ; GENERIC-NEXT: andl $1, %eax # sched: [1:0.33] ; GENERIC-NEXT: vzeroupper ; GENERIC-NEXT: retq # sched: [1:1.00] @@ -6974,9 +6974,9 @@ define i16 @zext_test2(<16 x i32> %a, <16 x i32> %b) { ; GENERIC-LABEL: zext_test2: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpcmpnleud %zmm1, %zmm0, %k0 # sched: [3:1.00] -; GENERIC-NEXT: kshiftlw $10, %k0, %k0 -; GENERIC-NEXT: kshiftrw $15, %k0, %k0 -; GENERIC-NEXT: kmovd %k0, %eax +; GENERIC-NEXT: kshiftlw $10, %k0, %k0 # sched: [1:1.00] +; GENERIC-NEXT: kshiftrw $15, %k0, %k0 # sched: [1:1.00] +; GENERIC-NEXT: kmovd %k0, %eax # sched: [1:0.33] ; GENERIC-NEXT: andl $1, %eax # sched: [1:0.33] ; GENERIC-NEXT: # kill: %ax %ax %eax ; GENERIC-NEXT: vzeroupper @@ -7002,9 +7002,9 @@ define i8 @zext_test3(<16 x i32> %a, <16 x i32> %b) { ; GENERIC-LABEL: zext_test3: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpcmpnleud %zmm1, %zmm0, %k0 # sched: [3:1.00] -; GENERIC-NEXT: kshiftlw $10, %k0, %k0 -; GENERIC-NEXT: kshiftrw $15, %k0, %k0 -; GENERIC-NEXT: kmovd %k0, %eax +; GENERIC-NEXT: kshiftlw $10, %k0, %k0 # sched: [1:1.00] +; GENERIC-NEXT: kshiftrw $15, %k0, %k0 # sched: [1:1.00] +; GENERIC-NEXT: kmovd %k0, %eax # sched: [1:0.33] ; GENERIC-NEXT: andb $1, %al # sched: [1:0.33] ; GENERIC-NEXT: # kill: %al %al %eax ; GENERIC-NEXT: vzeroupper @@ -7029,7 +7029,7 @@ define i8 @zext_test3(<16 x i32> %a, <16 x i32> %b) { define i8 @conv1(<8 x i1>* %R) { ; GENERIC-LABEL: conv1: ; GENERIC: # %bb.0: # %entry -; GENERIC-NEXT: kxnorw %k0, %k0, %k0 +; GENERIC-NEXT: kxnorw %k0, %k0, %k0 # sched: [1:1.00] ; GENERIC-NEXT: kmovb %k0, (%rdi) ; GENERIC-NEXT: movb $-2, -{{[0-9]+}}(%rsp) # sched: [5:1.00] ; GENERIC-NEXT: movb $-2, %al # sched: [1:0.33] @@ -7057,7 +7057,7 @@ define <4 x i32> @test4(<4 x i64> %x, <4 x i64> %y, <4 x i64> %x1, <4 x i64> %y1 ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpcmpgtq %ymm1, %ymm0, %k0 # sched: [3:1.00] ; GENERIC-NEXT: vpcmpgtq %ymm3, %ymm2, %k1 # sched: [3:1.00] -; GENERIC-NEXT: kandnw %k0, %k1, %k0 +; GENERIC-NEXT: kandnw %k0, %k1, %k0 # sched: [1:1.00] ; GENERIC-NEXT: vpmovm2d %k0, %xmm0 # sched: [1:0.33] ; GENERIC-NEXT: vzeroupper ; GENERIC-NEXT: retq # sched: [1:1.00] @@ -7082,7 +7082,7 @@ define <2 x i64> @vcmp_test5(<2 x i64> %x, <2 x i64> %y, <2 x i64> %x1, <2 x i64 ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpcmpgtq %xmm0, %xmm1, %k0 # sched: [3:1.00] ; GENERIC-NEXT: vpcmpgtq %xmm3, %xmm2, %k1 # sched: [3:1.00] -; GENERIC-NEXT: kandnw %k1, %k0, %k0 +; GENERIC-NEXT: kandnw %k1, %k0, %k0 # sched: [1:1.00] ; GENERIC-NEXT: vpmovm2q %k0, %xmm0 # sched: [1:0.33] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -7117,9 +7117,9 @@ define void @vcmp_test7(<8 x i1> %mask) { ; GENERIC-NEXT: vpsllw $15, %xmm0, %xmm0 # sched: [1:1.00] ; GENERIC-NEXT: vpmovw2m %xmm0, %k0 # sched: [1:0.33] ; GENERIC-NEXT: movb $85, %al # sched: [1:0.33] -; GENERIC-NEXT: kmovd %eax, %k1 -; GENERIC-NEXT: korb %k1, %k0, %k0 -; GENERIC-NEXT: ktestb %k0, %k0 +; GENERIC-NEXT: kmovd %eax, %k1 # sched: [1:0.33] +; GENERIC-NEXT: korb %k1, %k0, %k0 # sched: [1:1.00] +; GENERIC-NEXT: ktestb %k0, %k0 # sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: vcmp_test7: @@ -7297,7 +7297,7 @@ define <16 x i1> @vmov_test15(i32 %x, i32 %y) { ; GENERIC-NEXT: # sched: [1:0.33] ; GENERIC-NEXT: movw $1, %cx # sched: [1:0.33] ; GENERIC-NEXT: cmovgw %ax, %cx # sched: [2:0.67] -; GENERIC-NEXT: kmovd %ecx, %k0 +; GENERIC-NEXT: kmovd %ecx, %k0 # sched: [1:0.33] ; GENERIC-NEXT: vpmovm2b %k0, %xmm0 # sched: [1:0.33] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -7322,14 +7322,14 @@ define <64 x i8> @vmov_test16(i64 %x) { ; ; GENERIC-LABEL: vmov_test16: ; GENERIC: # %bb.0: -; GENERIC-NEXT: kmovq %rdi, %k0 +; GENERIC-NEXT: kmovq %rdi, %k0 # sched: [1:0.33] ; GENERIC-NEXT: movb $1, %al # sched: [1:0.33] -; GENERIC-NEXT: kmovd %eax, %k1 +; GENERIC-NEXT: kmovd %eax, %k1 # sched: [1:0.33] ; GENERIC-NEXT: vpmovm2b %k1, %zmm0 # sched: [1:0.33] ; GENERIC-NEXT: vpsllq $40, %xmm0, %xmm0 # sched: [1:1.00] ; GENERIC-NEXT: vpmovm2b %k0, %zmm1 # sched: [1:0.33] ; GENERIC-NEXT: movl $32, %eax # sched: [1:0.33] -; GENERIC-NEXT: kmovd %eax, %k1 +; GENERIC-NEXT: kmovd %eax, %k1 # sched: [1:0.33] ; GENERIC-NEXT: vpblendmb %ymm0, %ymm1, %ymm0 {%k1} # sched: [2:1.00] ; GENERIC-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm1[4,5,6,7] sched: [1:1.00] ; GENERIC-NEXT: vpmovb2m %zmm0, %k0 # sched: [1:0.33] @@ -7361,15 +7361,15 @@ define <64 x i8> @vmov_test17(i64 %x, i32 %y, i32 %z) { ; ; GENERIC-LABEL: vmov_test17: ; GENERIC: # %bb.0: -; GENERIC-NEXT: kmovq %rdi, %k0 +; GENERIC-NEXT: kmovq %rdi, %k0 # sched: [1:0.33] ; GENERIC-NEXT: cmpl %edx, %esi # sched: [1:0.33] ; GENERIC-NEXT: setg %al # sched: [1:0.50] -; GENERIC-NEXT: kmovd %eax, %k1 +; GENERIC-NEXT: kmovd %eax, %k1 # sched: [1:0.33] ; GENERIC-NEXT: vpmovm2b %k1, %zmm0 # sched: [1:0.33] ; GENERIC-NEXT: vpsllq $40, %xmm0, %xmm0 # sched: [1:1.00] ; GENERIC-NEXT: vpmovm2b %k0, %zmm1 # sched: [1:0.33] ; GENERIC-NEXT: movl $32, %eax # sched: [1:0.33] -; GENERIC-NEXT: kmovd %eax, %k1 +; GENERIC-NEXT: kmovd %eax, %k1 # sched: [1:0.33] ; GENERIC-NEXT: vpblendmb %ymm0, %ymm1, %ymm0 {%k1} # sched: [2:1.00] ; GENERIC-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm1[4,5,6,7] sched: [1:1.00] ; GENERIC-NEXT: vpmovb2m %zmm0, %k0 # sched: [1:0.33] @@ -7402,21 +7402,21 @@ define <64 x i8> @vmov_test17(i64 %x, i32 %y, i32 %z) { define <8 x i1> @vmov_test18(i8 %a, i16 %y) { ; GENERIC-LABEL: vmov_test18: ; GENERIC: # %bb.0: -; GENERIC-NEXT: kmovd %edi, %k1 -; GENERIC-NEXT: kmovd %esi, %k2 -; GENERIC-NEXT: kshiftlw $7, %k2, %k0 -; GENERIC-NEXT: kshiftrw $15, %k0, %k0 -; GENERIC-NEXT: kshiftlw $6, %k2, %k2 -; GENERIC-NEXT: kshiftrw $15, %k2, %k2 +; GENERIC-NEXT: kmovd %edi, %k1 # sched: [1:0.33] +; GENERIC-NEXT: kmovd %esi, %k2 # sched: [1:0.33] +; GENERIC-NEXT: kshiftlw $7, %k2, %k0 # sched: [1:1.00] +; GENERIC-NEXT: kshiftrw $15, %k0, %k0 # sched: [1:1.00] +; GENERIC-NEXT: kshiftlw $6, %k2, %k2 # sched: [1:1.00] +; GENERIC-NEXT: kshiftrw $15, %k2, %k2 # sched: [1:1.00] ; GENERIC-NEXT: vpmovm2q %k1, %zmm0 # sched: [1:0.33] ; GENERIC-NEXT: vpmovm2q %k2, %zmm1 # sched: [1:0.33] ; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,3,4,5,8,7] sched: [4:0.50] ; GENERIC-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 # sched: [1:1.00] ; GENERIC-NEXT: vpmovq2m %zmm2, %k1 # sched: [1:0.33] -; GENERIC-NEXT: kshiftlb $1, %k1, %k1 -; GENERIC-NEXT: kshiftrb $1, %k1, %k1 -; GENERIC-NEXT: kshiftlb $7, %k0, %k0 -; GENERIC-NEXT: korb %k0, %k1, %k0 +; GENERIC-NEXT: kshiftlb $1, %k1, %k1 # sched: [1:1.00] +; GENERIC-NEXT: kshiftrb $1, %k1, %k1 # sched: [1:1.00] +; GENERIC-NEXT: kshiftlb $7, %k0, %k0 # sched: [1:1.00] +; GENERIC-NEXT: korb %k0, %k1, %k0 # sched: [1:1.00] ; GENERIC-NEXT: vpmovm2w %k0, %xmm0 # sched: [1:0.33] ; GENERIC-NEXT: vzeroupper ; GENERIC-NEXT: retq # sched: [1:1.00] @@ -7506,9 +7506,9 @@ define void @vmov_test23(<2 x i1> %a, <2 x i1>* %addr) { define void @store_v1i1(<1 x i1> %c , <1 x i1>* %ptr) { ; GENERIC-LABEL: store_v1i1: ; GENERIC: # %bb.0: -; GENERIC-NEXT: kmovd %edi, %k0 -; GENERIC-NEXT: kxnorw %k0, %k0, %k1 -; GENERIC-NEXT: kxorw %k1, %k0, %k0 +; GENERIC-NEXT: kmovd %edi, %k0 # sched: [1:0.33] +; GENERIC-NEXT: kxnorw %k0, %k0, %k1 # sched: [1:1.00] +; GENERIC-NEXT: kxorw %k1, %k0, %k0 # sched: [1:1.00] ; GENERIC-NEXT: kmovb %k0, (%rsi) ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -7529,7 +7529,7 @@ define void @store_v2i1(<2 x i1> %c , <2 x i1>* %ptr) { ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpsllq $63, %xmm0, %xmm0 # sched: [1:1.00] ; GENERIC-NEXT: vptestmq %xmm0, %xmm0, %k0 # sched: [1:1.00] -; GENERIC-NEXT: knotw %k0, %k0 +; GENERIC-NEXT: knotw %k0, %k0 # sched: [1:1.00] ; GENERIC-NEXT: kmovb %k0, (%rdi) ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -7550,7 +7550,7 @@ define void @store_v4i1(<4 x i1> %c , <4 x i1>* %ptr) { ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpslld $31, %xmm0, %xmm0 # sched: [1:1.00] ; GENERIC-NEXT: vptestmd %xmm0, %xmm0, %k0 # sched: [1:1.00] -; GENERIC-NEXT: knotw %k0, %k0 +; GENERIC-NEXT: knotw %k0, %k0 # sched: [1:1.00] ; GENERIC-NEXT: kmovb %k0, (%rdi) ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -7571,7 +7571,7 @@ define void @store_v8i1(<8 x i1> %c , <8 x i1>* %ptr) { ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpsllw $15, %xmm0, %xmm0 # sched: [1:1.00] ; GENERIC-NEXT: vpmovw2m %xmm0, %k0 # sched: [1:0.33] -; GENERIC-NEXT: knotb %k0, %k0 +; GENERIC-NEXT: knotb %k0, %k0 # sched: [1:1.00] ; GENERIC-NEXT: kmovb %k0, (%rdi) ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -7592,7 +7592,7 @@ define void @store_v16i1(<16 x i1> %c , <16 x i1>* %ptr) { ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpsllw $7, %xmm0, %xmm0 # sched: [1:1.00] ; GENERIC-NEXT: vpmovb2m %xmm0, %k0 # sched: [1:0.33] -; GENERIC-NEXT: knotw %k0, %k0 +; GENERIC-NEXT: knotw %k0, %k0 # sched: [1:1.00] ; GENERIC-NEXT: kmovw %k0, (%rdi) ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -7685,7 +7685,7 @@ define <32 x i16> @test_build_vec_v32i1(<32 x i16> %x) { ; GENERIC: # %bb.0: ; GENERIC-NEXT: movl $1497715861, %eax # imm = 0x59455495 ; GENERIC-NEXT: # sched: [1:0.33] -; GENERIC-NEXT: kmovd %eax, %k1 +; GENERIC-NEXT: kmovd %eax, %k1 # sched: [1:0.33] ; GENERIC-NEXT: vmovdqu16 %zmm0, %zmm0 {%k1} {z} # sched: [1:0.33] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -7721,7 +7721,7 @@ define void @ktest_1(<8 x double> %in, double * %base) { ; GENERIC-NEXT: vcmpltpd %zmm0, %zmm1, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vmovupd 8(%rdi), %zmm1 {%k1} {z} # sched: [4:0.50] ; GENERIC-NEXT: vcmpltpd %zmm1, %zmm0, %k0 {%k1} # sched: [3:1.00] -; GENERIC-NEXT: ktestb %k0, %k0 +; GENERIC-NEXT: ktestb %k0, %k0 # sched: [1:1.00] ; GENERIC-NEXT: je .LBB410_2 # sched: [1:1.00] ; GENERIC-NEXT: # %bb.1: # %L1 ; GENERIC-NEXT: vmovapd %zmm0, (%rdi) # sched: [1:1.00] @@ -7783,14 +7783,14 @@ define void @ktest_2(<32 x float> %in, float * %base) { ; GENERIC-NEXT: vmovups 64(%rdi), %zmm3 # sched: [4:0.50] ; GENERIC-NEXT: vcmpltps %zmm0, %zmm2, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vcmpltps %zmm1, %zmm3, %k2 # sched: [3:1.00] -; GENERIC-NEXT: kunpckwd %k1, %k2, %k0 +; GENERIC-NEXT: kunpckwd %k1, %k2, %k0 # sched: [1:1.00] ; GENERIC-NEXT: vmovups 68(%rdi), %zmm2 {%k2} {z} # sched: [4:0.50] ; GENERIC-NEXT: vmovups 4(%rdi), %zmm3 {%k1} {z} # sched: [4:0.50] ; GENERIC-NEXT: vcmpltps %zmm3, %zmm0, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vcmpltps %zmm2, %zmm1, %k2 # sched: [3:1.00] -; GENERIC-NEXT: kunpckwd %k1, %k2, %k1 -; GENERIC-NEXT: kord %k1, %k0, %k0 -; GENERIC-NEXT: ktestd %k0, %k0 +; GENERIC-NEXT: kunpckwd %k1, %k2, %k1 # sched: [1:1.00] +; GENERIC-NEXT: kord %k1, %k0, %k0 # sched: [1:1.00] +; GENERIC-NEXT: ktestd %k0, %k0 # sched: [1:1.00] ; GENERIC-NEXT: je .LBB411_2 # sched: [1:1.00] ; GENERIC-NEXT: # %bb.1: # %L1 ; GENERIC-NEXT: vmovaps %zmm0, (%rdi) # sched: [1:1.00] @@ -8080,7 +8080,7 @@ define i32 @test_bitcast_v8i1_zext(<16 x i32> %a) { ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqd %zmm1, %zmm0, %k0 # sched: [3:1.00] -; GENERIC-NEXT: kmovb %k0, %eax +; GENERIC-NEXT: kmovb %k0, %eax # sched: [1:0.33] ; GENERIC-NEXT: addl %eax, %eax # sched: [1:0.33] ; GENERIC-NEXT: vzeroupper ; GENERIC-NEXT: retq # sched: [1:1.00] @@ -8106,7 +8106,7 @@ define i32 @test_bitcast_v16i1_zext(<16 x i32> %a) { ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqd %zmm1, %zmm0, %k0 # sched: [3:1.00] -; GENERIC-NEXT: kmovw %k0, %eax +; GENERIC-NEXT: kmovw %k0, %eax # sched: [1:0.33] ; GENERIC-NEXT: addl %eax, %eax # sched: [1:0.33] ; GENERIC-NEXT: vzeroupper ; GENERIC-NEXT: retq # sched: [1:1.00] @@ -8129,10 +8129,10 @@ define i32 @test_bitcast_v16i1_zext(<16 x i32> %a) { define i16 @test_v16i1_add(i16 %x, i16 %y) { ; GENERIC-LABEL: test_v16i1_add: ; GENERIC: # %bb.0: -; GENERIC-NEXT: kmovd %edi, %k0 -; GENERIC-NEXT: kmovd %esi, %k1 -; GENERIC-NEXT: kxorw %k1, %k0, %k0 -; GENERIC-NEXT: kmovd %k0, %eax +; GENERIC-NEXT: kmovd %edi, %k0 # sched: [1:0.33] +; GENERIC-NEXT: kmovd %esi, %k1 # sched: [1:0.33] +; GENERIC-NEXT: kxorw %k1, %k0, %k0 # sched: [1:1.00] +; GENERIC-NEXT: kmovd %k0, %eax # sched: [1:0.33] ; GENERIC-NEXT: # kill: %ax %ax %eax ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -8154,10 +8154,10 @@ define i16 @test_v16i1_add(i16 %x, i16 %y) { define i16 @test_v16i1_sub(i16 %x, i16 %y) { ; GENERIC-LABEL: test_v16i1_sub: ; GENERIC: # %bb.0: -; GENERIC-NEXT: kmovd %edi, %k0 -; GENERIC-NEXT: kmovd %esi, %k1 -; GENERIC-NEXT: kxorw %k1, %k0, %k0 -; GENERIC-NEXT: kmovd %k0, %eax +; GENERIC-NEXT: kmovd %edi, %k0 # sched: [1:0.33] +; GENERIC-NEXT: kmovd %esi, %k1 # sched: [1:0.33] +; GENERIC-NEXT: kxorw %k1, %k0, %k0 # sched: [1:1.00] +; GENERIC-NEXT: kmovd %k0, %eax # sched: [1:0.33] ; GENERIC-NEXT: # kill: %ax %ax %eax ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -8179,10 +8179,10 @@ define i16 @test_v16i1_sub(i16 %x, i16 %y) { define i16 @test_v16i1_mul(i16 %x, i16 %y) { ; GENERIC-LABEL: test_v16i1_mul: ; GENERIC: # %bb.0: -; GENERIC-NEXT: kmovd %edi, %k0 -; GENERIC-NEXT: kmovd %esi, %k1 -; GENERIC-NEXT: kandw %k1, %k0, %k0 -; GENERIC-NEXT: kmovd %k0, %eax +; GENERIC-NEXT: kmovd %edi, %k0 # sched: [1:0.33] +; GENERIC-NEXT: kmovd %esi, %k1 # sched: [1:0.33] +; GENERIC-NEXT: kandw %k1, %k0, %k0 # sched: [1:1.00] +; GENERIC-NEXT: kmovd %k0, %eax # sched: [1:0.33] ; GENERIC-NEXT: # kill: %ax %ax %eax ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -8204,10 +8204,10 @@ define i16 @test_v16i1_mul(i16 %x, i16 %y) { define i8 @test_v8i1_add(i8 %x, i8 %y) { ; GENERIC-LABEL: test_v8i1_add: ; GENERIC: # %bb.0: -; GENERIC-NEXT: kmovd %edi, %k0 -; GENERIC-NEXT: kmovd %esi, %k1 -; GENERIC-NEXT: kxorb %k1, %k0, %k0 -; GENERIC-NEXT: kmovd %k0, %eax +; GENERIC-NEXT: kmovd %edi, %k0 # sched: [1:0.33] +; GENERIC-NEXT: kmovd %esi, %k1 # sched: [1:0.33] +; GENERIC-NEXT: kxorb %k1, %k0, %k0 # sched: [1:1.00] +; GENERIC-NEXT: kmovd %k0, %eax # sched: [1:0.33] ; GENERIC-NEXT: # kill: %al %al %eax ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -8229,10 +8229,10 @@ define i8 @test_v8i1_add(i8 %x, i8 %y) { define i8 @test_v8i1_sub(i8 %x, i8 %y) { ; GENERIC-LABEL: test_v8i1_sub: ; GENERIC: # %bb.0: -; GENERIC-NEXT: kmovd %edi, %k0 -; GENERIC-NEXT: kmovd %esi, %k1 -; GENERIC-NEXT: kxorb %k1, %k0, %k0 -; GENERIC-NEXT: kmovd %k0, %eax +; GENERIC-NEXT: kmovd %edi, %k0 # sched: [1:0.33] +; GENERIC-NEXT: kmovd %esi, %k1 # sched: [1:0.33] +; GENERIC-NEXT: kxorb %k1, %k0, %k0 # sched: [1:1.00] +; GENERIC-NEXT: kmovd %k0, %eax # sched: [1:0.33] ; GENERIC-NEXT: # kill: %al %al %eax ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -8254,10 +8254,10 @@ define i8 @test_v8i1_sub(i8 %x, i8 %y) { define i8 @test_v8i1_mul(i8 %x, i8 %y) { ; GENERIC-LABEL: test_v8i1_mul: ; GENERIC: # %bb.0: -; GENERIC-NEXT: kmovd %edi, %k0 -; GENERIC-NEXT: kmovd %esi, %k1 -; GENERIC-NEXT: kandb %k1, %k0, %k0 -; GENERIC-NEXT: kmovd %k0, %eax +; GENERIC-NEXT: kmovd %edi, %k0 # sched: [1:0.33] +; GENERIC-NEXT: kmovd %esi, %k1 # sched: [1:0.33] +; GENERIC-NEXT: kandb %k1, %k0, %k0 # sched: [1:1.00] +; GENERIC-NEXT: kmovd %k0, %eax # sched: [1:0.33] ; GENERIC-NEXT: # kill: %al %al %eax ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -8592,7 +8592,7 @@ define <16 x i32> @test_vbroadcast() { ; GENERIC-NEXT: vxorps %xmm0, %xmm0, %xmm0 # sched: [1:1.00] ; GENERIC-NEXT: vcmpunordps %zmm0, %zmm0, %k0 # sched: [3:1.00] ; GENERIC-NEXT: vpmovm2d %k0, %zmm0 # sched: [1:0.33] -; GENERIC-NEXT: knotw %k0, %k1 +; GENERIC-NEXT: knotw %k0, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vmovdqa32 %zmm0, %zmm0 {%k1} {z} # sched: [1:0.33] ; GENERIC-NEXT: retq # sched: [1:1.00] ;