def FeatureSoftFloat
: SubtargetFeature<"soft-float", "UseSoftFloat", "true",
"Use software floating point features.">;
-// On at least some AMD processors, there is no performance hazard to writing
-// only the lower parts of a YMM register without clearing the upper part.
-def FeatureFastPartialYMMWrite
- : SubtargetFeature<"fast-partial-ymm-write", "HasFastPartialYMMWrite",
- "true", "Partial writes to YMM registers are fast">;
+// On some X86 processors, there is no performance hazard to writing only the
+// lower parts of a YMM or ZMM register without clearing the upper part.
+def FeatureFastPartialYMMorZMMWrite
+ : SubtargetFeature<"fast-partial-ymm-or-zmm-write",
+ "HasFastPartialYMMorZMMWrite",
+ "true", "Partial writes to YMM/ZMM registers are fast">;
// FeatureFastScalarFSQRT should be enabled if scalar FSQRT has shorter latency
// than the corresponding NR code. FeatureFastVectorFSQRT should be enabled if
// vector FSQRT has higher throughput than the corresponding NR code.
FeatureLZCNT,
FeatureBMI,
FeatureBMI2,
- FeatureFMA
+ FeatureFMA,
+ FeatureFastPartialYMMorZMMWrite
]>;
def : KnightsLandingProc<"knl">;
FeatureXSAVEOPT,
FeatureSlowSHLD,
FeatureLAHFSAHF,
- FeatureFastPartialYMMWrite
+ FeatureFastPartialYMMorZMMWrite
]>;
// Bulldozer
HasSSEUnalignedMem = false;
HasCmpxchg16b = false;
UseLeaForSP = false;
- HasFastPartialYMMWrite = false;
+ HasFastPartialYMMorZMMWrite = false;
HasFastScalarFSQRT = false;
HasFastVectorFSQRT = false;
HasFastLZCNT = false;
bool UseLeaForSP;
/// True if there is no performance penalty to writing only the lower parts
- /// of a YMM register without clearing the upper part.
- bool HasFastPartialYMMWrite;
+ /// of a YMM or ZMM register without clearing the upper part.
+ bool HasFastPartialYMMorZMMWrite;
/// True if hardware SQRTSS instruction is at least as fast (latency) as
/// RSQRTSS followed by a Newton-Raphson iteration.
bool hasSSEUnalignedMem() const { return HasSSEUnalignedMem; }
bool hasCmpxchg16b() const { return HasCmpxchg16b; }
bool useLeaForSP() const { return UseLeaForSP; }
- bool hasFastPartialYMMWrite() const { return HasFastPartialYMMWrite; }
+ bool hasFastPartialYMMorZMMWrite() const {
+ return HasFastPartialYMMorZMMWrite;
+ }
bool hasFastScalarFSQRT() const { return HasFastScalarFSQRT; }
bool hasFastVectorFSQRT() const { return HasFastVectorFSQRT; }
bool hasFastLZCNT() const { return HasFastLZCNT; }
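// Illustrative sketch, not part of this patch: how a backend decision might
// consult the FSQRT flags described in the X86.td comment above. The real
// choice between SQRTSS/SQRTPS and the reciprocal-estimate (Newton-Raphson)
// expansion is made elsewhere in the backend; this helper is hypothetical.
static bool preferHardwareSqrt(const X86Subtarget &ST, bool IsVector) {
  // Prefer the hardware instruction whenever it beats the NR sequence.
  return IsVector ? ST.hasFastVectorFSQRT() : ST.hasFastScalarFSQRT();
}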
// Core algorithm state:
// BlockState - Each block is either:
- // - PASS_THROUGH: There are neither YMM dirtying instructions nor
+ // - PASS_THROUGH: There are neither YMM/ZMM dirtying instructions nor
// vzeroupper instructions in this block.
// - EXITS_CLEAN: There is (or will be) a vzeroupper instruction in this
- // block that will ensure that YMM is clean on exit.
- // - EXITS_DIRTY: An instruction in the block dirties YMM and no
+ // block that will ensure that YMM/ZMM is clean on exit.
+ // - EXITS_DIRTY: An instruction in the block dirties YMM/ZMM and no
// subsequent vzeroupper in the block clears it.
//
// AddedToDirtySuccessors - This flag is raised when a block is added to the
// DirtySuccessors list to ensure that it's not added multiple times.
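// For illustration only (not part of this patch): the exit states named above
// as the pass declares them, plus a hypothetical helper showing the implied
// merge rule. A PASS_THROUGH block simply propagates the state it was entered
// with; the other two states are decided inside the block itself.
enum BlockExitState { PASS_THROUGH, EXITS_CLEAN, EXITS_DIRTY };

static BlockExitState propagateState(BlockExitState EntryState,
                                     BlockExitState ComputedState) {
  return ComputedState == PASS_THROUGH ? EntryState : ComputedState;
}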
llvm_unreachable("Invalid block exit state.");
}
-static bool isYmmReg(unsigned Reg) {
- return (Reg >= X86::YMM0 && Reg <= X86::YMM15);
+/// VZEROUPPER cleans state that is related to Y/ZMM0-15 only.
+/// Thus, there is no need to check for Y/ZMM16 and above.
+static bool isYmmOrZmmReg(unsigned Reg) {
+ return (Reg >= X86::YMM0 && Reg <= X86::YMM15) ||
+ (Reg >= X86::ZMM0 && Reg <= X86::ZMM15);
}
-static bool checkFnHasLiveInYmm(MachineRegisterInfo &MRI) {
+static bool checkFnHasLiveInYmmOrZmm(MachineRegisterInfo &MRI) {
for (MachineRegisterInfo::livein_iterator I = MRI.livein_begin(),
E = MRI.livein_end(); I != E; ++I)
- if (isYmmReg(I->first))
+ if (isYmmOrZmmReg(I->first))
return true;
return false;
}
-static bool clobbersAllYmmRegs(const MachineOperand &MO) {
+static bool clobbersAllYmmAndZmmRegs(const MachineOperand &MO) {
for (unsigned reg = X86::YMM0; reg <= X86::YMM15; ++reg) {
if (!MO.clobbersPhysReg(reg))
return false;
}
+ for (unsigned reg = X86::ZMM0; reg <= X86::ZMM15; ++reg) {
+ if (!MO.clobbersPhysReg(reg))
+ return false;
+ }
return true;
}
-static bool hasYmmReg(MachineInstr &MI) {
+static bool hasYmmOrZmmReg(MachineInstr &MI) {
for (const MachineOperand &MO : MI.operands()) {
- if (MI.isCall() && MO.isRegMask() && !clobbersAllYmmRegs(MO))
+ if (MI.isCall() && MO.isRegMask() && !clobbersAllYmmAndZmmRegs(MO))
return true;
if (!MO.isReg())
continue;
if (MO.isDebug())
continue;
- if (isYmmReg(MO.getReg()))
+ if (isYmmOrZmmReg(MO.getReg()))
return true;
}
return false;
}
-/// Check if any YMM register will be clobbered by this instruction.
-static bool callClobbersAnyYmmReg(MachineInstr &MI) {
+/// Check if the given call instruction has a RegMask operand.
+static bool callHasRegMask(MachineInstr &MI) {
assert(MI.isCall() && "Can only be called on call instructions.");
for (const MachineOperand &MO : MI.operands()) {
- if (!MO.isRegMask())
- continue;
- for (unsigned reg = X86::YMM0; reg <= X86::YMM15; ++reg) {
- if (MO.clobbersPhysReg(reg))
- return true;
- }
+ if (MO.isRegMask())
+ return true;
}
return false;
}
/// Loop over all of the instructions in the basic block, inserting vzeroupper
/// instructions before function calls.
void VZeroUpperInserter::processBasicBlock(MachineBasicBlock &MBB) {
-
// Start by assuming that the block is PASS_THROUGH, which implies no unguarded
// calls.
BlockExitState CurState = PASS_THROUGH;
BlockStates[MBB.getNumber()].FirstUnguardedCall = MBB.end();
for (MachineInstr &MI : MBB) {
+ bool IsCall = MI.isCall();
+ bool IsReturn = MI.isReturn();
+ bool IsControlFlow = IsCall || IsReturn;
+
// No need for vzeroupper before iret in an interrupt handler function;
- // epilogue will restore YMM registers if needed.
- bool IsReturnFromX86INTR = IsX86INTR && MI.isReturn();
- bool IsControlFlow = MI.isCall() || MI.isReturn();
+ // the epilogue will restore YMM/ZMM registers if needed.
+ if (IsX86INTR && IsReturn)
+ continue;
// An existing VZERO* instruction resets the state.
if (MI.getOpcode() == X86::VZEROALL || MI.getOpcode() == X86::VZEROUPPER) {
  CurState = EXITS_CLEAN;
  continue;
}
// Shortcut: don't need to check regular instructions in dirty state.
- if ((!IsControlFlow || IsReturnFromX86INTR) && CurState == EXITS_DIRTY)
+ if (!IsControlFlow && CurState == EXITS_DIRTY)
continue;
- if (hasYmmReg(MI)) {
- // We found a ymm-using instruction; this could be an AVX instruction,
- // or it could be control flow.
+ if (hasYmmOrZmmReg(MI)) {
+ // We found a ymm/zmm-using instruction; this could be an AVX/AVX512
+ // instruction, or it could be control flow.
CurState = EXITS_DIRTY;
continue;
}
// Check for control-flow out of the current function (which might
// indirectly execute SSE instructions).
- if (!IsControlFlow || IsReturnFromX86INTR)
+ if (!IsControlFlow)
continue;
- // If the call won't clobber any YMM register, skip it as well. It usually
- // happens on helper function calls (such as '_chkstk', '_ftol2') where
- // standard calling convention is not used (RegMask is not used to mark
- // register clobbered and register usage (def/imp-def/use) is well-defined
- // and explicitly specified.
- if (MI.isCall() && !callClobbersAnyYmmReg(MI))
+ // If the call has no RegMask, skip it as well. It usually happens on
+ // helper function calls (such as '_chkstk', '_ftol2') where the standard
+ // calling convention is not used (RegMask is not used to mark register
+ // clobbers, and register usage (def/imp-def/use) is well-defined and
+ // explicitly specified).
+ if (IsCall && !callHasRegMask(MI))
continue;
- // The VZEROUPPER instruction resets the upper 128 bits of all AVX
+ // The VZEROUPPER instruction resets the upper 128 bits of YMM0-YMM15
// registers. In addition, the processor changes back to Clean state, after
// which execution of SSE instructions or AVX instructions has no transition
// penalty. Add the VZEROUPPER instruction before any function call/return
// that might possibly be executed with a dirty state.
if (CurState == EXITS_DIRTY) {
// After the inserted VZEROUPPER the state becomes clean again, but
- // other YMM may appear before other subsequent calls or even before
+ // other YMM/ZMM-using instructions may appear before subsequent calls or
// the end of the BB.
insertVZeroUpper(MI, MBB);
CurState = EXITS_CLEAN;
/// Loop over all of the basic blocks, inserting vzeroupper instructions before
/// function calls.
bool VZeroUpperInserter::runOnMachineFunction(MachineFunction &MF) {
const X86Subtarget &ST = MF.getSubtarget<X86Subtarget>();
- if (!ST.hasAVX() || ST.hasAVX512() || ST.hasFastPartialYMMWrite())
+ if (!ST.hasAVX() || ST.hasFastPartialYMMorZMMWrite())
return false;
TII = ST.getInstrInfo();
MachineRegisterInfo &MRI = MF.getRegInfo();
EverMadeChange = false;
IsX86INTR = MF.getFunction()->getCallingConv() == CallingConv::X86_INTR;
- bool FnHasLiveInYmm = checkFnHasLiveInYmm(MRI);
-
- // Fast check: if the function doesn't use any ymm registers, we don't need
- // to insert any VZEROUPPER instructions. This is constant-time, so it is
- // cheap in the common case of no ymm use.
- bool YMMUsed = FnHasLiveInYmm;
- if (!YMMUsed) {
- const TargetRegisterClass *RC = &X86::VR256RegClass;
- for (TargetRegisterClass::iterator i = RC->begin(), e = RC->end(); i != e;
- i++) {
- if (!MRI.reg_nodbg_empty(*i)) {
- YMMUsed = true;
- break;
+ bool FnHasLiveInYmmOrZmm = checkFnHasLiveInYmmOrZmm(MRI);
+
+ // Fast check: if the function doesn't use any ymm/zmm registers, we don't
+ // need to insert any VZEROUPPER instructions. This is constant-time, so it
+ // is cheap in the common case of no ymm/zmm use.
+ bool YmmOrZmmUsed = FnHasLiveInYmmOrZmm;
+ const TargetRegisterClass *RCs[2] = {&X86::VR256RegClass,
+                                      &X86::VR512RegClass};
+ for (auto *RC : RCs) {
+ if (!YmmOrZmmUsed) {
+ for (TargetRegisterClass::iterator i = RC->begin(), e = RC->end(); i != e;
+ i++) {
+ if (!MRI.reg_nodbg_empty(*i)) {
+ YmmOrZmmUsed = true;
+ break;
+ }
}
}
}
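// Equivalent formulation for illustration only, assuming llvm::any_of from
// llvm/ADT/STLExtras.h; the committed loop above performs the same scan, just
// written out with explicit iterators.
for (const TargetRegisterClass *RC :
     {&X86::VR256RegClass, &X86::VR512RegClass})
  YmmOrZmmUsed = YmmOrZmmUsed || llvm::any_of(*RC, [&](unsigned Reg) {
    return !MRI.reg_nodbg_empty(Reg);
  });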
- if (!YMMUsed) {
+ if (!YmmOrZmmUsed) {
return false;
}
for (MachineBasicBlock &MBB : MF)
processBasicBlock(MBB);
- // If any YMM regs are live-in to this function, add the entry block to the
- // DirtySuccessors list
- if (FnHasLiveInYmm)
+ // If any YMM/ZMM regs are live-in to this function, add the entry block to
+ // the DirtySuccessors list.
+ if (FnHasLiveInYmmOrZmm)
addDirtySuccessor(MF.front());
// Re-visit all blocks that are successors of EXITS_DIRTY blocks. Add
// vzeroupper instructions to them and their successors as needed.
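// Illustrative sketch only, reconstructed from the comments above rather than
// copied verbatim from this patch: the worklist propagation that this comment
// introduces. Each dirty successor gets its first unguarded call protected,
// and a block that was PASS_THROUGH becomes dirty on exit itself, so its own
// successors are queued in turn.
while (!DirtySuccessors.empty()) {
  MachineBasicBlock &MBB = *DirtySuccessors.back();
  DirtySuccessors.pop_back();
  BlockState &BBState = BlockStates[MBB.getNumber()];

  // MBB is a successor of a dirty block, so its first call needs guarding.
  if (BBState.FirstUnguardedCall != MBB.end())
    insertVZeroUpper(BBState.FirstUnguardedCall, MBB);

  // A pass-through successor is now dirty on exit; propagate onward.
  if (BBState.ExitState == PASS_THROUGH)
    for (MachineBasicBlock *Succ : MBB.successors())
      addDirtySuccessor(*Succ);
}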
; AVX512F-NEXT: vmovdqa (%rsi), %ymm0
; AVX512F-NEXT: vpavgb (%rdi), %ymm0, %ymm0
; AVX512F-NEXT: vmovdqu %ymm0, (%rax)
+; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: avg_v32i8:
; AVX512BW-NEXT: vmovdqa (%rsi), %ymm0
; AVX512BW-NEXT: vpavgb (%rdi), %ymm0, %ymm0
; AVX512BW-NEXT: vmovdqu %ymm0, (%rax)
+; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
%1 = load <32 x i8>, <32 x i8>* %a
%2 = load <32 x i8>, <32 x i8>* %b
; AVX512F-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
; AVX512F-NEXT: vmovdqu %ymm1, (%rax)
; AVX512F-NEXT: vmovdqu %ymm0, (%rax)
+; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: avg_v64i8:
; AVX512BW-NEXT: vmovdqu8 (%rsi), %zmm0
; AVX512BW-NEXT: vpavgb (%rdi), %zmm0, %zmm0
; AVX512BW-NEXT: vmovdqu8 %zmm0, (%rax)
+; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
%1 = load <64 x i8>, <64 x i8>* %a
%2 = load <64 x i8>, <64 x i8>* %b
; AVX512F-NEXT: vmovdqa (%rsi), %ymm0
; AVX512F-NEXT: vpavgw (%rdi), %ymm0, %ymm0
; AVX512F-NEXT: vmovdqu %ymm0, (%rax)
+; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: avg_v16i16:
; AVX512BW-NEXT: vmovdqa (%rsi), %ymm0
; AVX512BW-NEXT: vpavgw (%rdi), %ymm0, %ymm0
; AVX512BW-NEXT: vmovdqu %ymm0, (%rax)
+; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
%1 = load <16 x i16>, <16 x i16>* %a
%2 = load <16 x i16>, <16 x i16>* %b
; AVX512F-NEXT: vpsrld $1, %zmm0, %zmm0
; AVX512F-NEXT: vpmovdw %zmm0, (%rax)
; AVX512F-NEXT: vpmovdw %zmm1, (%rax)
+; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: avg_v32i16:
; AVX512BW-NEXT: vmovdqu16 (%rsi), %zmm0
; AVX512BW-NEXT: vpavgw (%rdi), %zmm0, %zmm0
; AVX512BW-NEXT: vmovdqu16 %zmm0, (%rax)
+; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
%1 = load <32 x i16>, <32 x i16>* %a
%2 = load <32 x i16>, <32 x i16>* %b
; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
; AVX512F-NEXT: vpavgb (%rsi), %ymm0, %ymm0
; AVX512F-NEXT: vmovdqu %ymm0, (%rax)
+; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: avg_v32i8_2:
; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0
; AVX512BW-NEXT: vpavgb (%rsi), %ymm0, %ymm0
; AVX512BW-NEXT: vmovdqu %ymm0, (%rax)
+; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
%1 = load <32 x i8>, <32 x i8>* %a
%2 = load <32 x i8>, <32 x i8>* %b
; AVX512F-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
; AVX512F-NEXT: vmovdqu %ymm1, (%rax)
; AVX512F-NEXT: vmovdqu %ymm0, (%rax)
+; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: avg_v64i8_2:
; AVX512BW-NEXT: vmovdqu8 (%rsi), %zmm0
; AVX512BW-NEXT: vpavgb %zmm0, %zmm0, %zmm0
; AVX512BW-NEXT: vmovdqu8 %zmm0, (%rax)
+; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
%1 = load <64 x i8>, <64 x i8>* %a
%2 = load <64 x i8>, <64 x i8>* %b
; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
; AVX512F-NEXT: vpavgw (%rsi), %ymm0, %ymm0
; AVX512F-NEXT: vmovdqu %ymm0, (%rax)
+; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: avg_v16i16_2:
; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0
; AVX512BW-NEXT: vpavgw (%rsi), %ymm0, %ymm0
; AVX512BW-NEXT: vmovdqu %ymm0, (%rax)
+; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
%1 = load <16 x i16>, <16 x i16>* %a
%2 = load <16 x i16>, <16 x i16>* %b
; AVX512F-NEXT: vpsrld $1, %zmm0, %zmm0
; AVX512F-NEXT: vpmovdw %zmm0, (%rax)
; AVX512F-NEXT: vpmovdw %zmm1, (%rax)
+; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: avg_v32i16_2:
; AVX512BW-NEXT: vmovdqu16 (%rdi), %zmm0
; AVX512BW-NEXT: vpavgw (%rsi), %zmm0, %zmm0
; AVX512BW-NEXT: vmovdqu16 %zmm0, (%rax)
+; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
%1 = load <32 x i16>, <32 x i16>* %a
%2 = load <32 x i16>, <32 x i16>* %b
; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
; AVX512F-NEXT: vpavgb {{.*}}(%rip), %ymm0, %ymm0
; AVX512F-NEXT: vmovdqu %ymm0, (%rax)
+; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: avg_v32i8_const:
; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0
; AVX512BW-NEXT: vpavgb {{.*}}(%rip), %ymm0, %ymm0
; AVX512BW-NEXT: vmovdqu %ymm0, (%rax)
+; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
%1 = load <32 x i8>, <32 x i8>* %a
%2 = zext <32 x i8> %1 to <32 x i32>
; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
; AVX512F-NEXT: vmovdqu %ymm0, (%rax)
; AVX512F-NEXT: vmovdqu %ymm2, (%rax)
+; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: avg_v64i8_const:
; AVX512BW-NEXT: vmovdqu8 (%rdi), %zmm0
; AVX512BW-NEXT: vpavgb {{.*}}(%rip), %zmm0, %zmm0
; AVX512BW-NEXT: vmovdqu8 %zmm0, (%rax)
+; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
%1 = load <64 x i8>, <64 x i8>* %a
%2 = zext <64 x i8> %1 to <64 x i32>
; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
; AVX512F-NEXT: vpavgw {{.*}}(%rip), %ymm0, %ymm0
; AVX512F-NEXT: vmovdqu %ymm0, (%rax)
+; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: avg_v16i16_const:
; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0
; AVX512BW-NEXT: vpavgw {{.*}}(%rip), %ymm0, %ymm0
; AVX512BW-NEXT: vmovdqu %ymm0, (%rax)
+; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
%1 = load <16 x i16>, <16 x i16>* %a
%2 = zext <16 x i16> %1 to <16 x i32>
; AVX512F-NEXT: vpsrld $1, %zmm1, %zmm1
; AVX512F-NEXT: vpmovdw %zmm1, (%rax)
; AVX512F-NEXT: vpmovdw %zmm0, (%rax)
+; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: avg_v32i16_const:
; AVX512BW-NEXT: vmovdqu16 (%rdi), %zmm0
; AVX512BW-NEXT: vpavgw {{.*}}(%rip), %zmm0, %zmm0
; AVX512BW-NEXT: vmovdqu16 %zmm0, (%rax)
+; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
%1 = load <32 x i16>, <32 x i16>* %a
%2 = zext <32 x i16> %1 to <32 x i32>
; AVX512VL-LABEL: test_x86_avx_cvt_pd2_ps_256:
; AVX512VL: ## BB#0:
; AVX512VL-NEXT: vcvtpd2ps %ymm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x5a,0xc0]
+; AVX512VL-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77]
; AVX512VL-NEXT: retl ## encoding: [0xc3]
%res = call <4 x float> @llvm.x86.avx.cvt.pd2.ps.256(<4 x double> %a0) ; <<4 x float>> [#uses=1]
ret <4 x float> %res
; AVX512VL-LABEL: test_x86_avx_cvt_pd2dq_256:
; AVX512VL: ## BB#0:
; AVX512VL-NEXT: vcvtpd2dq %ymm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xff,0xe6,0xc0]
+; AVX512VL-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77]
; AVX512VL-NEXT: retl ## encoding: [0xc3]
%res = call <4 x i32> @llvm.x86.avx.cvt.pd2dq.256(<4 x double> %a0) ; <<4 x i32>> [#uses=1]
ret <4 x i32> %res
; AVX512VL-LABEL: test_x86_avx_cvtt_pd2dq_256:
; AVX512VL: ## BB#0:
; AVX512VL-NEXT: vcvttpd2dq %ymm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xe6,0xc0]
+; AVX512VL-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77]
; AVX512VL-NEXT: retl ## encoding: [0xc3]
%res = call <4 x i32> @llvm.x86.avx.cvtt.pd2dq.256(<4 x double> %a0) ; <<4 x i32>> [#uses=1]
ret <4 x i32> %res
define void @test_x86_avx_maskstore_pd_256(i8* %a0, <4 x i64> %mask, <4 x double> %a2) {
-; AVX-LABEL: test_x86_avx_maskstore_pd_256:
-; AVX: ## BB#0:
-; AVX-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
-; AVX-NEXT: vmaskmovpd %ymm1, %ymm0, (%eax) ## encoding: [0xc4,0xe2,0x7d,0x2f,0x08]
-; AVX-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77]
-; AVX-NEXT: retl ## encoding: [0xc3]
-;
-; AVX512VL-LABEL: test_x86_avx_maskstore_pd_256:
-; AVX512VL: ## BB#0:
-; AVX512VL-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
-; AVX512VL-NEXT: vmaskmovpd %ymm1, %ymm0, (%eax) ## encoding: [0xc4,0xe2,0x7d,0x2f,0x08]
-; AVX512VL-NEXT: retl ## encoding: [0xc3]
+; CHECK-LABEL: test_x86_avx_maskstore_pd_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
+; CHECK-NEXT: vmaskmovpd %ymm1, %ymm0, (%eax) ## encoding: [0xc4,0xe2,0x7d,0x2f,0x08]
+; CHECK-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77]
+; CHECK-NEXT: retl ## encoding: [0xc3]
call void @llvm.x86.avx.maskstore.pd.256(i8* %a0, <4 x i64> %mask, <4 x double> %a2)
ret void
}
define void @test_x86_avx_maskstore_ps_256(i8* %a0, <8 x i32> %mask, <8 x float> %a2) {
-; AVX-LABEL: test_x86_avx_maskstore_ps_256:
-; AVX: ## BB#0:
-; AVX-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
-; AVX-NEXT: vmaskmovps %ymm1, %ymm0, (%eax) ## encoding: [0xc4,0xe2,0x7d,0x2e,0x08]
-; AVX-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77]
-; AVX-NEXT: retl ## encoding: [0xc3]
-;
-; AVX512VL-LABEL: test_x86_avx_maskstore_ps_256:
-; AVX512VL: ## BB#0:
-; AVX512VL-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
-; AVX512VL-NEXT: vmaskmovps %ymm1, %ymm0, (%eax) ## encoding: [0xc4,0xe2,0x7d,0x2e,0x08]
-; AVX512VL-NEXT: retl ## encoding: [0xc3]
+; CHECK-LABEL: test_x86_avx_maskstore_ps_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
+; CHECK-NEXT: vmaskmovps %ymm1, %ymm0, (%eax) ## encoding: [0xc4,0xe2,0x7d,0x2e,0x08]
+; CHECK-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77]
+; CHECK-NEXT: retl ## encoding: [0xc3]
call void @llvm.x86.avx.maskstore.ps.256(i8* %a0, <8 x i32> %mask, <8 x float> %a2)
ret void
}
define i32 @test_x86_avx_movmsk_pd_256(<4 x double> %a0) {
-; AVX-LABEL: test_x86_avx_movmsk_pd_256:
-; AVX: ## BB#0:
-; AVX-NEXT: vmovmskpd %ymm0, %eax ## encoding: [0xc5,0xfd,0x50,0xc0]
-; AVX-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77]
-; AVX-NEXT: retl ## encoding: [0xc3]
-;
-; AVX512VL-LABEL: test_x86_avx_movmsk_pd_256:
-; AVX512VL: ## BB#0:
-; AVX512VL-NEXT: vmovmskpd %ymm0, %eax ## encoding: [0xc5,0xfd,0x50,0xc0]
-; AVX512VL-NEXT: retl ## encoding: [0xc3]
+; CHECK-LABEL: test_x86_avx_movmsk_pd_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vmovmskpd %ymm0, %eax ## encoding: [0xc5,0xfd,0x50,0xc0]
+; CHECK-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77]
+; CHECK-NEXT: retl ## encoding: [0xc3]
%res = call i32 @llvm.x86.avx.movmsk.pd.256(<4 x double> %a0) ; <i32> [#uses=1]
ret i32 %res
}
define i32 @test_x86_avx_movmsk_ps_256(<8 x float> %a0) {
-; AVX-LABEL: test_x86_avx_movmsk_ps_256:
-; AVX: ## BB#0:
-; AVX-NEXT: vmovmskps %ymm0, %eax ## encoding: [0xc5,0xfc,0x50,0xc0]
-; AVX-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77]
-; AVX-NEXT: retl ## encoding: [0xc3]
-;
-; AVX512VL-LABEL: test_x86_avx_movmsk_ps_256:
-; AVX512VL: ## BB#0:
-; AVX512VL-NEXT: vmovmskps %ymm0, %eax ## encoding: [0xc5,0xfc,0x50,0xc0]
-; AVX512VL-NEXT: retl ## encoding: [0xc3]
+; CHECK-LABEL: test_x86_avx_movmsk_ps_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vmovmskps %ymm0, %eax ## encoding: [0xc5,0xfc,0x50,0xc0]
+; CHECK-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77]
+; CHECK-NEXT: retl ## encoding: [0xc3]
%res = call i32 @llvm.x86.avx.movmsk.ps.256(<8 x float> %a0) ; <i32> [#uses=1]
ret i32 %res
}
define i32 @test_x86_avx_ptestc_256(<4 x i64> %a0, <4 x i64> %a1) {
-; AVX-LABEL: test_x86_avx_ptestc_256:
-; AVX: ## BB#0:
-; AVX-NEXT: vptest %ymm1, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x17,0xc1]
-; AVX-NEXT: sbbl %eax, %eax ## encoding: [0x19,0xc0]
-; AVX-NEXT: andl $1, %eax ## encoding: [0x83,0xe0,0x01]
-; AVX-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77]
-; AVX-NEXT: retl ## encoding: [0xc3]
-;
-; AVX512VL-LABEL: test_x86_avx_ptestc_256:
-; AVX512VL: ## BB#0:
-; AVX512VL-NEXT: vptest %ymm1, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x17,0xc1]
-; AVX512VL-NEXT: sbbl %eax, %eax ## encoding: [0x19,0xc0]
-; AVX512VL-NEXT: andl $1, %eax ## encoding: [0x83,0xe0,0x01]
-; AVX512VL-NEXT: retl ## encoding: [0xc3]
+; CHECK-LABEL: test_x86_avx_ptestc_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vptest %ymm1, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x17,0xc1]
+; CHECK-NEXT: sbbl %eax, %eax ## encoding: [0x19,0xc0]
+; CHECK-NEXT: andl $1, %eax ## encoding: [0x83,0xe0,0x01]
+; CHECK-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77]
+; CHECK-NEXT: retl ## encoding: [0xc3]
%res = call i32 @llvm.x86.avx.ptestc.256(<4 x i64> %a0, <4 x i64> %a1) ; <i32> [#uses=1]
ret i32 %res
}
define i32 @test_x86_avx_ptestnzc_256(<4 x i64> %a0, <4 x i64> %a1) {
-; AVX-LABEL: test_x86_avx_ptestnzc_256:
-; AVX: ## BB#0:
-; AVX-NEXT: xorl %eax, %eax ## encoding: [0x31,0xc0]
-; AVX-NEXT: vptest %ymm1, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x17,0xc1]
-; AVX-NEXT: seta %al ## encoding: [0x0f,0x97,0xc0]
-; AVX-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77]
-; AVX-NEXT: retl ## encoding: [0xc3]
-;
-; AVX512VL-LABEL: test_x86_avx_ptestnzc_256:
-; AVX512VL: ## BB#0:
-; AVX512VL-NEXT: xorl %eax, %eax ## encoding: [0x31,0xc0]
-; AVX512VL-NEXT: vptest %ymm1, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x17,0xc1]
-; AVX512VL-NEXT: seta %al ## encoding: [0x0f,0x97,0xc0]
-; AVX512VL-NEXT: retl ## encoding: [0xc3]
+; CHECK-LABEL: test_x86_avx_ptestnzc_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: xorl %eax, %eax ## encoding: [0x31,0xc0]
+; CHECK-NEXT: vptest %ymm1, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x17,0xc1]
+; CHECK-NEXT: seta %al ## encoding: [0x0f,0x97,0xc0]
+; CHECK-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77]
+; CHECK-NEXT: retl ## encoding: [0xc3]
%res = call i32 @llvm.x86.avx.ptestnzc.256(<4 x i64> %a0, <4 x i64> %a1) ; <i32> [#uses=1]
ret i32 %res
}
define i32 @test_x86_avx_ptestz_256(<4 x i64> %a0, <4 x i64> %a1) {
-; AVX-LABEL: test_x86_avx_ptestz_256:
-; AVX: ## BB#0:
-; AVX-NEXT: xorl %eax, %eax ## encoding: [0x31,0xc0]
-; AVX-NEXT: vptest %ymm1, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x17,0xc1]
-; AVX-NEXT: sete %al ## encoding: [0x0f,0x94,0xc0]
-; AVX-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77]
-; AVX-NEXT: retl ## encoding: [0xc3]
-;
-; AVX512VL-LABEL: test_x86_avx_ptestz_256:
-; AVX512VL: ## BB#0:
-; AVX512VL-NEXT: xorl %eax, %eax ## encoding: [0x31,0xc0]
-; AVX512VL-NEXT: vptest %ymm1, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x17,0xc1]
-; AVX512VL-NEXT: sete %al ## encoding: [0x0f,0x94,0xc0]
-; AVX512VL-NEXT: retl ## encoding: [0xc3]
+; CHECK-LABEL: test_x86_avx_ptestz_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: xorl %eax, %eax ## encoding: [0x31,0xc0]
+; CHECK-NEXT: vptest %ymm1, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x17,0xc1]
+; CHECK-NEXT: sete %al ## encoding: [0x0f,0x94,0xc0]
+; CHECK-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77]
+; CHECK-NEXT: retl ## encoding: [0xc3]
%res = call i32 @llvm.x86.avx.ptestz.256(<4 x i64> %a0, <4 x i64> %a1) ; <i32> [#uses=1]
ret i32 %res
}
define i32 @test_x86_avx_vtestc_pd_256(<4 x double> %a0, <4 x double> %a1) {
-; AVX-LABEL: test_x86_avx_vtestc_pd_256:
-; AVX: ## BB#0:
-; AVX-NEXT: vtestpd %ymm1, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x0f,0xc1]
-; AVX-NEXT: sbbl %eax, %eax ## encoding: [0x19,0xc0]
-; AVX-NEXT: andl $1, %eax ## encoding: [0x83,0xe0,0x01]
-; AVX-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77]
-; AVX-NEXT: retl ## encoding: [0xc3]
-;
-; AVX512VL-LABEL: test_x86_avx_vtestc_pd_256:
-; AVX512VL: ## BB#0:
-; AVX512VL-NEXT: vtestpd %ymm1, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x0f,0xc1]
-; AVX512VL-NEXT: sbbl %eax, %eax ## encoding: [0x19,0xc0]
-; AVX512VL-NEXT: andl $1, %eax ## encoding: [0x83,0xe0,0x01]
-; AVX512VL-NEXT: retl ## encoding: [0xc3]
+; CHECK-LABEL: test_x86_avx_vtestc_pd_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vtestpd %ymm1, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x0f,0xc1]
+; CHECK-NEXT: sbbl %eax, %eax ## encoding: [0x19,0xc0]
+; CHECK-NEXT: andl $1, %eax ## encoding: [0x83,0xe0,0x01]
+; CHECK-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77]
+; CHECK-NEXT: retl ## encoding: [0xc3]
%res = call i32 @llvm.x86.avx.vtestc.pd.256(<4 x double> %a0, <4 x double> %a1) ; <i32> [#uses=1]
ret i32 %res
}
define i32 @test_x86_avx_vtestc_ps_256(<8 x float> %a0, <8 x float> %a1) {
-; AVX-LABEL: test_x86_avx_vtestc_ps_256:
-; AVX: ## BB#0:
-; AVX-NEXT: vtestps %ymm1, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x0e,0xc1]
-; AVX-NEXT: sbbl %eax, %eax ## encoding: [0x19,0xc0]
-; AVX-NEXT: andl $1, %eax ## encoding: [0x83,0xe0,0x01]
-; AVX-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77]
-; AVX-NEXT: retl ## encoding: [0xc3]
-;
-; AVX512VL-LABEL: test_x86_avx_vtestc_ps_256:
-; AVX512VL: ## BB#0:
-; AVX512VL-NEXT: vtestps %ymm1, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x0e,0xc1]
-; AVX512VL-NEXT: sbbl %eax, %eax ## encoding: [0x19,0xc0]
-; AVX512VL-NEXT: andl $1, %eax ## encoding: [0x83,0xe0,0x01]
-; AVX512VL-NEXT: retl ## encoding: [0xc3]
+; CHECK-LABEL: test_x86_avx_vtestc_ps_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vtestps %ymm1, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x0e,0xc1]
+; CHECK-NEXT: sbbl %eax, %eax ## encoding: [0x19,0xc0]
+; CHECK-NEXT: andl $1, %eax ## encoding: [0x83,0xe0,0x01]
+; CHECK-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77]
+; CHECK-NEXT: retl ## encoding: [0xc3]
%res = call i32 @llvm.x86.avx.vtestc.ps.256(<8 x float> %a0, <8 x float> %a1) ; <i32> [#uses=1]
ret i32 %res
}
define i32 @test_x86_avx_vtestnzc_pd_256(<4 x double> %a0, <4 x double> %a1) {
-; AVX-LABEL: test_x86_avx_vtestnzc_pd_256:
-; AVX: ## BB#0:
-; AVX-NEXT: xorl %eax, %eax ## encoding: [0x31,0xc0]
-; AVX-NEXT: vtestpd %ymm1, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x0f,0xc1]
-; AVX-NEXT: seta %al ## encoding: [0x0f,0x97,0xc0]
-; AVX-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77]
-; AVX-NEXT: retl ## encoding: [0xc3]
-;
-; AVX512VL-LABEL: test_x86_avx_vtestnzc_pd_256:
-; AVX512VL: ## BB#0:
-; AVX512VL-NEXT: xorl %eax, %eax ## encoding: [0x31,0xc0]
-; AVX512VL-NEXT: vtestpd %ymm1, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x0f,0xc1]
-; AVX512VL-NEXT: seta %al ## encoding: [0x0f,0x97,0xc0]
-; AVX512VL-NEXT: retl ## encoding: [0xc3]
+; CHECK-LABEL: test_x86_avx_vtestnzc_pd_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: xorl %eax, %eax ## encoding: [0x31,0xc0]
+; CHECK-NEXT: vtestpd %ymm1, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x0f,0xc1]
+; CHECK-NEXT: seta %al ## encoding: [0x0f,0x97,0xc0]
+; CHECK-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77]
+; CHECK-NEXT: retl ## encoding: [0xc3]
%res = call i32 @llvm.x86.avx.vtestnzc.pd.256(<4 x double> %a0, <4 x double> %a1) ; <i32> [#uses=1]
ret i32 %res
}
define i32 @test_x86_avx_vtestnzc_ps_256(<8 x float> %a0, <8 x float> %a1) {
-; AVX-LABEL: test_x86_avx_vtestnzc_ps_256:
-; AVX: ## BB#0:
-; AVX-NEXT: xorl %eax, %eax ## encoding: [0x31,0xc0]
-; AVX-NEXT: vtestps %ymm1, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x0e,0xc1]
-; AVX-NEXT: seta %al ## encoding: [0x0f,0x97,0xc0]
-; AVX-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77]
-; AVX-NEXT: retl ## encoding: [0xc3]
-;
-; AVX512VL-LABEL: test_x86_avx_vtestnzc_ps_256:
-; AVX512VL: ## BB#0:
-; AVX512VL-NEXT: xorl %eax, %eax ## encoding: [0x31,0xc0]
-; AVX512VL-NEXT: vtestps %ymm1, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x0e,0xc1]
-; AVX512VL-NEXT: seta %al ## encoding: [0x0f,0x97,0xc0]
-; AVX512VL-NEXT: retl ## encoding: [0xc3]
+; CHECK-LABEL: test_x86_avx_vtestnzc_ps_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: xorl %eax, %eax ## encoding: [0x31,0xc0]
+; CHECK-NEXT: vtestps %ymm1, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x0e,0xc1]
+; CHECK-NEXT: seta %al ## encoding: [0x0f,0x97,0xc0]
+; CHECK-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77]
+; CHECK-NEXT: retl ## encoding: [0xc3]
%res = call i32 @llvm.x86.avx.vtestnzc.ps.256(<8 x float> %a0, <8 x float> %a1) ; <i32> [#uses=1]
ret i32 %res
}
define i32 @test_x86_avx_vtestz_pd_256(<4 x double> %a0, <4 x double> %a1) {
-; AVX-LABEL: test_x86_avx_vtestz_pd_256:
-; AVX: ## BB#0:
-; AVX-NEXT: xorl %eax, %eax ## encoding: [0x31,0xc0]
-; AVX-NEXT: vtestpd %ymm1, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x0f,0xc1]
-; AVX-NEXT: sete %al ## encoding: [0x0f,0x94,0xc0]
-; AVX-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77]
-; AVX-NEXT: retl ## encoding: [0xc3]
-;
-; AVX512VL-LABEL: test_x86_avx_vtestz_pd_256:
-; AVX512VL: ## BB#0:
-; AVX512VL-NEXT: xorl %eax, %eax ## encoding: [0x31,0xc0]
-; AVX512VL-NEXT: vtestpd %ymm1, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x0f,0xc1]
-; AVX512VL-NEXT: sete %al ## encoding: [0x0f,0x94,0xc0]
-; AVX512VL-NEXT: retl ## encoding: [0xc3]
+; CHECK-LABEL: test_x86_avx_vtestz_pd_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: xorl %eax, %eax ## encoding: [0x31,0xc0]
+; CHECK-NEXT: vtestpd %ymm1, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x0f,0xc1]
+; CHECK-NEXT: sete %al ## encoding: [0x0f,0x94,0xc0]
+; CHECK-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77]
+; CHECK-NEXT: retl ## encoding: [0xc3]
%res = call i32 @llvm.x86.avx.vtestz.pd.256(<4 x double> %a0, <4 x double> %a1) ; <i32> [#uses=1]
ret i32 %res
}
define i32 @test_x86_avx_vtestz_ps_256(<8 x float> %a0, <8 x float> %a1) {
-; AVX-LABEL: test_x86_avx_vtestz_ps_256:
-; AVX: ## BB#0:
-; AVX-NEXT: xorl %eax, %eax ## encoding: [0x31,0xc0]
-; AVX-NEXT: vtestps %ymm1, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x0e,0xc1]
-; AVX-NEXT: sete %al ## encoding: [0x0f,0x94,0xc0]
-; AVX-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77]
-; AVX-NEXT: retl ## encoding: [0xc3]
-;
-; AVX512VL-LABEL: test_x86_avx_vtestz_ps_256:
-; AVX512VL: ## BB#0:
-; AVX512VL-NEXT: xorl %eax, %eax ## encoding: [0x31,0xc0]
-; AVX512VL-NEXT: vtestps %ymm1, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x0e,0xc1]
-; AVX512VL-NEXT: sete %al ## encoding: [0x0f,0x94,0xc0]
-; AVX512VL-NEXT: retl ## encoding: [0xc3]
+; CHECK-LABEL: test_x86_avx_vtestz_ps_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: xorl %eax, %eax ## encoding: [0x31,0xc0]
+; CHECK-NEXT: vtestps %ymm1, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x0e,0xc1]
+; CHECK-NEXT: sete %al ## encoding: [0x0f,0x94,0xc0]
+; CHECK-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77]
+; CHECK-NEXT: retl ## encoding: [0xc3]
%res = call i32 @llvm.x86.avx.vtestz.ps.256(<8 x float> %a0, <8 x float> %a1) ; <i32> [#uses=1]
ret i32 %res
}
; AVX512VL-NEXT: vpaddq LCPI65_0, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xd4,0x05,A,A,A,A]
; AVX512VL-NEXT: ## fixup A - offset: 4, value: LCPI65_0, kind: FK_Data_4
; AVX512VL-NEXT: vmovntdq %ymm0, (%eax) ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xe7,0x00]
+; AVX512VL-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77]
; AVX512VL-NEXT: retl ## encoding: [0xc3]
%a2 = add <2 x i64> %a1, <i64 1, i64 1>
%a3 = shufflevector <2 x i64> %a2, <2 x i64> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
; AVX512VL: ## BB#0:
; AVX512VL-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
; AVX512VL-NEXT: vmovntps %ymm0, (%eax) ## EVEX TO VEX Compression encoding: [0xc5,0xfc,0x2b,0x00]
+; AVX512VL-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77]
; AVX512VL-NEXT: retl ## encoding: [0xc3]
tail call void @llvm.x86.avx.movnt.ps.256(i8* %p, <8 x float> %a) nounwind
ret void
; AVX512VL-NEXT: vxorpd %ymm1, %ymm1, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0x57,0xc9]
; AVX512VL-NEXT: vaddpd %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x58,0xc1]
; AVX512VL-NEXT: vmovntpd %ymm0, (%eax) ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x2b,0x00]
+; AVX512VL-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77]
; AVX512VL-NEXT: retl ## encoding: [0xc3]
%a2 = fadd <4 x double> %a1, <double 0x0, double 0x0, double 0x0, double 0x0>
tail call void @llvm.x86.avx.movnt.pd.256(i8* %p, <4 x double> %a2) nounwind
; RUN: llc < %s -x86-use-vzeroupper -mtriple=x86_64-apple-darwin -mattr=+avx | FileCheck %s
-; RUN: llc < %s -x86-use-vzeroupper -mtriple=x86_64-apple-darwin -mattr=+avx,+fast-partial-ymm-write | FileCheck --check-prefix=FASTYMM %s
+; RUN: llc < %s -x86-use-vzeroupper -mtriple=x86_64-apple-darwin -mattr=+avx512f | FileCheck %s
+; RUN: llc < %s -x86-use-vzeroupper -mtriple=x86_64-apple-darwin -mattr=+avx,+fast-partial-ymm-or-zmm-write | FileCheck --check-prefix=FAST-YMM-ZMM %s
; RUN: llc < %s -x86-use-vzeroupper -mtriple=x86_64-apple-darwin -mcpu=btver2 | FileCheck --check-prefix=BTVER2 %s
-; FASTYMM-NOT: vzeroupper
+; FAST-YMM-ZMM-NOT: vzeroupper
; BTVER2-NOT: vzeroupper
declare i32 @foo()
define i32 @test_x86_avx2_pmovmskb(<32 x i8> %a0) {
-; AVX2-LABEL: test_x86_avx2_pmovmskb:
-; AVX2: ## BB#0:
-; AVX2-NEXT: vpmovmskb %ymm0, %eax ## encoding: [0xc5,0xfd,0xd7,0xc0]
-; AVX2-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77]
-; AVX2-NEXT: retl ## encoding: [0xc3]
-;
-; AVX512VL-LABEL: test_x86_avx2_pmovmskb:
-; AVX512VL: ## BB#0:
-; AVX512VL-NEXT: vpmovmskb %ymm0, %eax ## encoding: [0xc5,0xfd,0xd7,0xc0]
-; AVX512VL-NEXT: retl ## encoding: [0xc3]
+; CHECK-LABEL: test_x86_avx2_pmovmskb:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpmovmskb %ymm0, %eax ## encoding: [0xc5,0xfd,0xd7,0xc0]
+; CHECK-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77]
+; CHECK-NEXT: retl ## encoding: [0xc3]
%res = call i32 @llvm.x86.avx2.pmovmskb(<32 x i8> %a0) ; <i32> [#uses=1]
ret i32 %res
}
define void @test_x86_avx2_maskstore_q_256(i8* %a0, <4 x i64> %a1, <4 x i64> %a2) {
-; AVX2-LABEL: test_x86_avx2_maskstore_q_256:
-; AVX2: ## BB#0:
-; AVX2-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
-; AVX2-NEXT: vpmaskmovq %ymm1, %ymm0, (%eax) ## encoding: [0xc4,0xe2,0xfd,0x8e,0x08]
-; AVX2-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77]
-; AVX2-NEXT: retl ## encoding: [0xc3]
-;
-; AVX512VL-LABEL: test_x86_avx2_maskstore_q_256:
-; AVX512VL: ## BB#0:
-; AVX512VL-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
-; AVX512VL-NEXT: vpmaskmovq %ymm1, %ymm0, (%eax) ## encoding: [0xc4,0xe2,0xfd,0x8e,0x08]
-; AVX512VL-NEXT: retl ## encoding: [0xc3]
+; CHECK-LABEL: test_x86_avx2_maskstore_q_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
+; CHECK-NEXT: vpmaskmovq %ymm1, %ymm0, (%eax) ## encoding: [0xc4,0xe2,0xfd,0x8e,0x08]
+; CHECK-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77]
+; CHECK-NEXT: retl ## encoding: [0xc3]
call void @llvm.x86.avx2.maskstore.q.256(i8* %a0, <4 x i64> %a1, <4 x i64> %a2)
ret void
}
define void @test_x86_avx2_maskstore_d_256(i8* %a0, <8 x i32> %a1, <8 x i32> %a2) {
-; AVX2-LABEL: test_x86_avx2_maskstore_d_256:
-; AVX2: ## BB#0:
-; AVX2-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
-; AVX2-NEXT: vpmaskmovd %ymm1, %ymm0, (%eax) ## encoding: [0xc4,0xe2,0x7d,0x8e,0x08]
-; AVX2-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77]
-; AVX2-NEXT: retl ## encoding: [0xc3]
-;
-; AVX512VL-LABEL: test_x86_avx2_maskstore_d_256:
-; AVX512VL: ## BB#0:
-; AVX512VL-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
-; AVX512VL-NEXT: vpmaskmovd %ymm1, %ymm0, (%eax) ## encoding: [0xc4,0xe2,0x7d,0x8e,0x08]
-; AVX512VL-NEXT: retl ## encoding: [0xc3]
+; CHECK-LABEL: test_x86_avx2_maskstore_d_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
+; CHECK-NEXT: vpmaskmovd %ymm1, %ymm0, (%eax) ## encoding: [0xc4,0xe2,0x7d,0x8e,0x08]
+; CHECK-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77]
+; CHECK-NEXT: retl ## encoding: [0xc3]
call void @llvm.x86.avx2.maskstore.d.256(i8* %a0, <8 x i32> %a1, <8 x i32> %a2)
ret void
}
<2 x i64>, <4 x float>, i8) nounwind readonly
define <4 x float> @test_x86_avx2_gather_q_ps_256(<4 x float> %a0, i8* %a1, <4 x i64> %idx, <4 x float> %mask) {
-; AVX2-LABEL: test_x86_avx2_gather_q_ps_256:
-; AVX2: ## BB#0:
-; AVX2-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
-; AVX2-NEXT: vgatherqps %xmm2, (%eax,%ymm1,2), %xmm0 ## encoding: [0xc4,0xe2,0x6d,0x93,0x04,0x48]
-; AVX2-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77]
-; AVX2-NEXT: retl ## encoding: [0xc3]
-;
-; AVX512VL-LABEL: test_x86_avx2_gather_q_ps_256:
-; AVX512VL: ## BB#0:
-; AVX512VL-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
-; AVX512VL-NEXT: vgatherqps %xmm2, (%eax,%ymm1,2), %xmm0 ## encoding: [0xc4,0xe2,0x6d,0x93,0x04,0x48]
-; AVX512VL-NEXT: retl ## encoding: [0xc3]
+; CHECK-LABEL: test_x86_avx2_gather_q_ps_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
+; CHECK-NEXT: vgatherqps %xmm2, (%eax,%ymm1,2), %xmm0 ## encoding: [0xc4,0xe2,0x6d,0x93,0x04,0x48]
+; CHECK-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77]
+; CHECK-NEXT: retl ## encoding: [0xc3]
%res = call <4 x float> @llvm.x86.avx2.gather.q.ps.256(<4 x float> %a0,
i8* %a1, <4 x i64> %idx, <4 x float> %mask, i8 2) ;
ret <4 x float> %res
<2 x i64>, <4 x i32>, i8) nounwind readonly
define <4 x i32> @test_x86_avx2_gather_q_d_256(<4 x i32> %a0, i8* %a1, <4 x i64> %idx, <4 x i32> %mask) {
-; AVX2-LABEL: test_x86_avx2_gather_q_d_256:
-; AVX2: ## BB#0:
-; AVX2-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
-; AVX2-NEXT: vpgatherqd %xmm2, (%eax,%ymm1,2), %xmm0 ## encoding: [0xc4,0xe2,0x6d,0x91,0x04,0x48]
-; AVX2-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77]
-; AVX2-NEXT: retl ## encoding: [0xc3]
-;
-; AVX512VL-LABEL: test_x86_avx2_gather_q_d_256:
-; AVX512VL: ## BB#0:
-; AVX512VL-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
-; AVX512VL-NEXT: vpgatherqd %xmm2, (%eax,%ymm1,2), %xmm0 ## encoding: [0xc4,0xe2,0x6d,0x91,0x04,0x48]
-; AVX512VL-NEXT: retl ## encoding: [0xc3]
+; CHECK-LABEL: test_x86_avx2_gather_q_d_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
+; CHECK-NEXT: vpgatherqd %xmm2, (%eax,%ymm1,2), %xmm0 ## encoding: [0xc4,0xe2,0x6d,0x91,0x04,0x48]
+; CHECK-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77]
+; CHECK-NEXT: retl ## encoding: [0xc3]
%res = call <4 x i32> @llvm.x86.avx2.gather.q.d.256(<4 x i32> %a0,
i8* %a1, <4 x i64> %idx, <4 x i32> %mask, i8 2) ;
ret <4 x i32> %res
; X32-AVX512VL-NEXT: vmovdqa %ymm1, {{[0-9]+}}(%esp)
; X32-AVX512VL-NEXT: movl %ebp, %esp
; X32-AVX512VL-NEXT: popl %ebp
+; X32-AVX512VL-NEXT: vzeroupper
; X32-AVX512VL-NEXT: retl
;
; X64-AVX512VL-LABEL: isel_crash_32b:
; X64-AVX512VL-NEXT: vmovdqa %ymm1, {{[0-9]+}}(%rsp)
; X64-AVX512VL-NEXT: movq %rbp, %rsp
; X64-AVX512VL-NEXT: popq %rbp
+; X64-AVX512VL-NEXT: vzeroupper
; X64-AVX512VL-NEXT: retq
eintry:
%__a.addr.i = alloca <4 x i64>, align 16
; X32-AVX512VL-NEXT: vmovdqa %ymm1, {{[0-9]+}}(%esp)
; X32-AVX512VL-NEXT: movl %ebp, %esp
; X32-AVX512VL-NEXT: popl %ebp
+; X32-AVX512VL-NEXT: vzeroupper
; X32-AVX512VL-NEXT: retl
;
; X64-AVX512VL-LABEL: isel_crash_16w:
; X64-AVX512VL-NEXT: vmovdqa %ymm1, {{[0-9]+}}(%rsp)
; X64-AVX512VL-NEXT: movq %rbp, %rsp
; X64-AVX512VL-NEXT: popq %rbp
+; X64-AVX512VL-NEXT: vzeroupper
; X64-AVX512VL-NEXT: retq
eintry:
%__a.addr.i = alloca <4 x i64>, align 16
}
define void @isel_crash_8d(i32* %cV_R.addr) {
-; X32-AVX2-LABEL: isel_crash_8d:
-; X32-AVX2: ## BB#0: ## %eintry
-; X32-AVX2-NEXT: pushl %ebp
-; X32-AVX2-NEXT: Lcfi9:
-; X32-AVX2-NEXT: .cfi_def_cfa_offset 8
-; X32-AVX2-NEXT: Lcfi10:
-; X32-AVX2-NEXT: .cfi_offset %ebp, -8
-; X32-AVX2-NEXT: movl %esp, %ebp
-; X32-AVX2-NEXT: Lcfi11:
-; X32-AVX2-NEXT: .cfi_def_cfa_register %ebp
-; X32-AVX2-NEXT: andl $-32, %esp
-; X32-AVX2-NEXT: subl $128, %esp
-; X32-AVX2-NEXT: movl 8(%ebp), %eax
-; X32-AVX2-NEXT: vxorps %ymm0, %ymm0, %ymm0
-; X32-AVX2-NEXT: vmovaps %ymm0, (%esp)
-; X32-AVX2-NEXT: vbroadcastss (%eax), %ymm1
-; X32-AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%esp)
-; X32-AVX2-NEXT: vmovaps %ymm1, {{[0-9]+}}(%esp)
-; X32-AVX2-NEXT: movl %ebp, %esp
-; X32-AVX2-NEXT: popl %ebp
-; X32-AVX2-NEXT: vzeroupper
-; X32-AVX2-NEXT: retl
+; X32-LABEL: isel_crash_8d:
+; X32: ## BB#0: ## %eintry
+; X32-NEXT: pushl %ebp
+; X32-NEXT: Lcfi9:
+; X32-NEXT: .cfi_def_cfa_offset 8
+; X32-NEXT: Lcfi10:
+; X32-NEXT: .cfi_offset %ebp, -8
+; X32-NEXT: movl %esp, %ebp
+; X32-NEXT: Lcfi11:
+; X32-NEXT: .cfi_def_cfa_register %ebp
+; X32-NEXT: andl $-32, %esp
+; X32-NEXT: subl $128, %esp
+; X32-NEXT: movl 8(%ebp), %eax
+; X32-NEXT: vxorps %ymm0, %ymm0, %ymm0
+; X32-NEXT: vmovaps %ymm0, (%esp)
+; X32-NEXT: vbroadcastss (%eax), %ymm1
+; X32-NEXT: vmovaps %ymm0, {{[0-9]+}}(%esp)
+; X32-NEXT: vmovaps %ymm1, {{[0-9]+}}(%esp)
+; X32-NEXT: movl %ebp, %esp
+; X32-NEXT: popl %ebp
+; X32-NEXT: vzeroupper
+; X32-NEXT: retl
;
; X64-AVX2-LABEL: isel_crash_8d:
; X64-AVX2: ## BB#0: ## %eintry
; X64-AVX2-NEXT: vzeroupper
; X64-AVX2-NEXT: retq
;
-; X32-AVX512VL-LABEL: isel_crash_8d:
-; X32-AVX512VL: ## BB#0: ## %eintry
-; X32-AVX512VL-NEXT: pushl %ebp
-; X32-AVX512VL-NEXT: Lcfi9:
-; X32-AVX512VL-NEXT: .cfi_def_cfa_offset 8
-; X32-AVX512VL-NEXT: Lcfi10:
-; X32-AVX512VL-NEXT: .cfi_offset %ebp, -8
-; X32-AVX512VL-NEXT: movl %esp, %ebp
-; X32-AVX512VL-NEXT: Lcfi11:
-; X32-AVX512VL-NEXT: .cfi_def_cfa_register %ebp
-; X32-AVX512VL-NEXT: andl $-32, %esp
-; X32-AVX512VL-NEXT: subl $128, %esp
-; X32-AVX512VL-NEXT: movl 8(%ebp), %eax
-; X32-AVX512VL-NEXT: vxorps %ymm0, %ymm0, %ymm0
-; X32-AVX512VL-NEXT: vmovaps %ymm0, (%esp)
-; X32-AVX512VL-NEXT: vbroadcastss (%eax), %ymm1
-; X32-AVX512VL-NEXT: vmovaps %ymm0, {{[0-9]+}}(%esp)
-; X32-AVX512VL-NEXT: vmovaps %ymm1, {{[0-9]+}}(%esp)
-; X32-AVX512VL-NEXT: movl %ebp, %esp
-; X32-AVX512VL-NEXT: popl %ebp
-; X32-AVX512VL-NEXT: retl
-;
; X64-AVX512VL-LABEL: isel_crash_8d:
; X64-AVX512VL: ## BB#0: ## %eintry
; X64-AVX512VL-NEXT: pushq %rbp
; X64-AVX512VL-NEXT: vmovdqa %ymm1, {{[0-9]+}}(%rsp)
; X64-AVX512VL-NEXT: movq %rbp, %rsp
; X64-AVX512VL-NEXT: popq %rbp
+; X64-AVX512VL-NEXT: vzeroupper
; X64-AVX512VL-NEXT: retq
eintry:
%__a.addr.i = alloca <4 x i64>, align 16
; X32-AVX512VL-NEXT: vmovdqa %ymm1, {{[0-9]+}}(%esp)
; X32-AVX512VL-NEXT: movl %ebp, %esp
; X32-AVX512VL-NEXT: popl %ebp
+; X32-AVX512VL-NEXT: vzeroupper
; X32-AVX512VL-NEXT: retl
;
; X64-AVX512VL-LABEL: isel_crash_4q:
; X64-AVX512VL-NEXT: vmovdqa %ymm1, {{[0-9]+}}(%rsp)
; X64-AVX512VL-NEXT: movq %rbp, %rsp
; X64-AVX512VL-NEXT: popq %rbp
+; X64-AVX512VL-NEXT: vzeroupper
; X64-AVX512VL-NEXT: retq
eintry:
%__a.addr.i = alloca <4 x i64>, align 16
define void @any_extend_load_v8i64(<8 x i8> * %ptr) {
-; ALL-LABEL: any_extend_load_v8i64:
-; ALL: # BB#0:
-; ALL-NEXT: vpmovzxbq {{.*#+}} zmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero,mem[2],zero,zero,zero,zero,zero,zero,zero,mem[3],zero,zero,zero,zero,zero,zero,zero,mem[4],zero,zero,zero,zero,zero,zero,zero,mem[5],zero,zero,zero,zero,zero,zero,zero,mem[6],zero,zero,zero,zero,zero,zero,zero,mem[7],zero,zero,zero,zero,zero,zero,zero
-; ALL-NEXT: vpaddq {{.*}}(%rip){1to8}, %zmm0, %zmm0
-; ALL-NEXT: vpmovqb %zmm0, (%rdi)
-; ALL-NEXT: retq
+; KNL-LABEL: any_extend_load_v8i64:
+; KNL: # BB#0:
+; KNL-NEXT: vpmovzxbq {{.*#+}} zmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero,mem[2],zero,zero,zero,zero,zero,zero,zero,mem[3],zero,zero,zero,zero,zero,zero,zero,mem[4],zero,zero,zero,zero,zero,zero,zero,mem[5],zero,zero,zero,zero,zero,zero,zero,mem[6],zero,zero,zero,zero,zero,zero,zero,mem[7],zero,zero,zero,zero,zero,zero,zero
+; KNL-NEXT: vpaddq {{.*}}(%rip){1to8}, %zmm0, %zmm0
+; KNL-NEXT: vpmovqb %zmm0, (%rdi)
+; KNL-NEXT: retq
+;
+; SKX-LABEL: any_extend_load_v8i64:
+; SKX: # BB#0:
+; SKX-NEXT: vpmovzxbq {{.*#+}} zmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero,mem[2],zero,zero,zero,zero,zero,zero,zero,mem[3],zero,zero,zero,zero,zero,zero,zero,mem[4],zero,zero,zero,zero,zero,zero,zero,mem[5],zero,zero,zero,zero,zero,zero,zero,mem[6],zero,zero,zero,zero,zero,zero,zero,mem[7],zero,zero,zero,zero,zero,zero,zero
+; SKX-NEXT: vpaddq {{.*}}(%rip){1to8}, %zmm0, %zmm0
+; SKX-NEXT: vpmovqb %zmm0, (%rdi)
+; SKX-NEXT: vzeroupper
+; SKX-NEXT: retq
%wide.load = load <8 x i8>, <8 x i8>* %ptr, align 1
%1 = zext <8 x i8> %wide.load to <8 x i64>
%2 = add nuw nsw <8 x i64> %1, <i64 4, i64 4, i64 4, i64 4, i64 4, i64 4, i64 4, i64 4>
; SKX-NEXT: vpmovzxbd {{.*#+}} ymm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
; SKX-NEXT: vpaddd {{.*}}(%rip){1to8}, %ymm0, %ymm0
; SKX-NEXT: vpmovdb %ymm0, (%rdi)
+; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
%wide.load = load <8 x i8>, <8 x i8>* %ptr, align 1
%1 = zext <8 x i8> %wide.load to <8 x i32>
; AVX512DQ-NEXT: ## kill: %XMM0<def> %XMM0<kill> %ZMM0<def>
; AVX512DQ-NEXT: vpmullq %zmm0, %zmm1, %zmm0
; AVX512DQ-NEXT: ## kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
+; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
; SKX-LABEL: imulq128:
; SKX-NEXT: .cfi_def_cfa_offset 16
; SKX-NEXT: vpcmpgtd %ymm1, %ymm0, %k0
; SKX-NEXT: vpmovm2w %k0, %xmm0
+; SKX-NEXT: vzeroupper
; SKX-NEXT: callq _func8xi1
; SKX-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; SKX-NEXT: vpslld $31, %ymm0, %ymm0
; SKX-NEXT: .cfi_def_cfa_offset 16
; SKX-NEXT: vpcmpgtd %zmm1, %zmm0, %k0
; SKX-NEXT: vpmovm2b %k0, %xmm0
+; SKX-NEXT: vzeroupper
; SKX-NEXT: callq _func16xi1
; SKX-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
; SKX-NEXT: vpslld $31, %zmm0, %zmm0
; SKX-NEXT: .cfi_def_cfa_offset 16
; SKX-NEXT: vpcmpgtd %ymm1, %ymm0, %k0
; SKX-NEXT: vpmovm2w %k0, %xmm0
+; SKX-NEXT: vzeroupper
; SKX-NEXT: callq _func8xi1
; SKX-NEXT: vpsllw $15, %xmm0, %xmm0
; SKX-NEXT: vpmovw2m %xmm0, %k0
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl | FileCheck %s --check-prefix=ALL --check-prefix=NOVL --check-prefix=NODQ --check-prefix=NOVLDQ --check-prefix=KNL
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=skx | FileCheck %s --check-prefix=ALL --check-prefix=DQ --check-prefix=VL --check-prefix=VLDQ --check-prefix=VLBW --check-prefix=SKX
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=avx512vl | FileCheck %s --check-prefix=ALL --check-prefix=NODQ --check-prefix=VL --check-prefix=VLNODQ --check-prefix=VLNOBW --check-prefix=AVX512VL
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=avx512dq | FileCheck %s --check-prefix=ALL --check-prefix=NOVL --check-prefix=DQ --check-prefix=AVX512DQ
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=NOVL --check-prefix=NODQ --check-prefix=NOVLDQ --check-prefix=AVX512BW
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=avx512vl,avx512dq | FileCheck %s --check-prefix=ALL --check-prefix=DQ --check-prefix=VL --check-prefix=VLDQ --check-prefix=VLNOBW --check-prefix=AVX512VLDQ
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=avx512vl,avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=NODQ --check-prefix=VL --check-prefix=VLNODQ --check-prefix=VLBW --check-prefix=AVX512VLBW
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=skx | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=DQ --check-prefix=VL --check-prefix=VLDQ --check-prefix=VLBW --check-prefix=SKX
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=avx512vl | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=NODQ --check-prefix=VL --check-prefix=VLNODQ --check-prefix=VLNOBW --check-prefix=AVX512VL
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=avx512dq | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=NOVL --check-prefix=DQ --check-prefix=AVX512DQ
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=NOVL --check-prefix=NODQ --check-prefix=NOVLDQ --check-prefix=AVX512BW
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=avx512vl,avx512dq | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=DQ --check-prefix=VL --check-prefix=VLDQ --check-prefix=VLNOBW --check-prefix=AVX512VLDQ
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=avx512vl,avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=NODQ --check-prefix=VL --check-prefix=VLNODQ --check-prefix=VLBW --check-prefix=AVX512VLBW
define <16 x float> @sitof32(<16 x i32> %a) nounwind {
; AVX512DQ-NEXT: ## kill: %XMM0<def> %XMM0<kill> %ZMM0<def>
; AVX512DQ-NEXT: vcvtqq2ps %zmm0, %ymm0
; AVX512DQ-NEXT: ## kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
%b = sitofp <2 x i64> %a to <2 x float>
ret <2 x float>%b
}
define <4 x float> @sltof4f32_mem(<4 x i64>* %a) {
-; NODQ-LABEL: sltof4f32_mem:
-; NODQ: ## BB#0:
-; NODQ-NEXT: vmovdqu (%rdi), %ymm0
-; NODQ-NEXT: vpextrq $1, %xmm0, %rax
-; NODQ-NEXT: vcvtsi2ssq %rax, %xmm1, %xmm1
-; NODQ-NEXT: vmovq %xmm0, %rax
-; NODQ-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm2
-; NODQ-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3]
-; NODQ-NEXT: vextracti128 $1, %ymm0, %xmm0
-; NODQ-NEXT: vmovq %xmm0, %rax
-; NODQ-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm2
-; NODQ-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3]
-; NODQ-NEXT: vpextrq $1, %xmm0, %rax
-; NODQ-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm0
-; NODQ-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
-; NODQ-NEXT: retq
+; KNL-LABEL: sltof4f32_mem:
+; KNL: ## BB#0:
+; KNL-NEXT: vmovdqu (%rdi), %ymm0
+; KNL-NEXT: vpextrq $1, %xmm0, %rax
+; KNL-NEXT: vcvtsi2ssq %rax, %xmm1, %xmm1
+; KNL-NEXT: vmovq %xmm0, %rax
+; KNL-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm2
+; KNL-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3]
+; KNL-NEXT: vextracti128 $1, %ymm0, %xmm0
+; KNL-NEXT: vmovq %xmm0, %rax
+; KNL-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm2
+; KNL-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3]
+; KNL-NEXT: vpextrq $1, %xmm0, %rax
+; KNL-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm0
+; KNL-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
+; KNL-NEXT: retq
;
; VLDQ-LABEL: sltof4f32_mem:
; VLDQ: ## BB#0:
; VLDQ-NEXT: vcvtqq2psy (%rdi), %xmm0
; VLDQ-NEXT: retq
;
+; VLNODQ-LABEL: sltof4f32_mem:
+; VLNODQ: ## BB#0:
+; VLNODQ-NEXT: vmovdqu (%rdi), %ymm0
+; VLNODQ-NEXT: vpextrq $1, %xmm0, %rax
+; VLNODQ-NEXT: vcvtsi2ssq %rax, %xmm1, %xmm1
+; VLNODQ-NEXT: vmovq %xmm0, %rax
+; VLNODQ-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm2
+; VLNODQ-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3]
+; VLNODQ-NEXT: vextracti128 $1, %ymm0, %xmm0
+; VLNODQ-NEXT: vmovq %xmm0, %rax
+; VLNODQ-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm2
+; VLNODQ-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3]
+; VLNODQ-NEXT: vpextrq $1, %xmm0, %rax
+; VLNODQ-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm0
+; VLNODQ-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
+; VLNODQ-NEXT: vzeroupper
+; VLNODQ-NEXT: retq
+;
; AVX512DQ-LABEL: sltof4f32_mem:
; AVX512DQ: ## BB#0:
; AVX512DQ-NEXT: vmovups (%rdi), %ymm0
; AVX512DQ-NEXT: vcvtqq2ps %zmm0, %ymm0
; AVX512DQ-NEXT: ## kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
+;
+; AVX512BW-LABEL: sltof4f32_mem:
+; AVX512BW: ## BB#0:
+; AVX512BW-NEXT: vmovdqu (%rdi), %ymm0
+; AVX512BW-NEXT: vpextrq $1, %xmm0, %rax
+; AVX512BW-NEXT: vcvtsi2ssq %rax, %xmm1, %xmm1
+; AVX512BW-NEXT: vmovq %xmm0, %rax
+; AVX512BW-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm2
+; AVX512BW-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3]
+; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm0
+; AVX512BW-NEXT: vmovq %xmm0, %rax
+; AVX512BW-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm2
+; AVX512BW-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3]
+; AVX512BW-NEXT: vpextrq $1, %xmm0, %rax
+; AVX512BW-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm0
+; AVX512BW-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
+; AVX512BW-NEXT: vzeroupper
+; AVX512BW-NEXT: retq
%a1 = load <4 x i64>, <4 x i64>* %a, align 8
%b = sitofp <4 x i64> %a1 to <4 x float>
ret <4 x float>%b
}
define <4 x float> @sltof432(<4 x i64> %a) {
-; NODQ-LABEL: sltof432:
-; NODQ: ## BB#0:
-; NODQ-NEXT: vpextrq $1, %xmm0, %rax
-; NODQ-NEXT: vcvtsi2ssq %rax, %xmm1, %xmm1
-; NODQ-NEXT: vmovq %xmm0, %rax
-; NODQ-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm2
-; NODQ-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3]
-; NODQ-NEXT: vextracti128 $1, %ymm0, %xmm0
-; NODQ-NEXT: vmovq %xmm0, %rax
-; NODQ-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm2
-; NODQ-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3]
-; NODQ-NEXT: vpextrq $1, %xmm0, %rax
-; NODQ-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm0
-; NODQ-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
-; NODQ-NEXT: retq
+; KNL-LABEL: sltof432:
+; KNL: ## BB#0:
+; KNL-NEXT: vpextrq $1, %xmm0, %rax
+; KNL-NEXT: vcvtsi2ssq %rax, %xmm1, %xmm1
+; KNL-NEXT: vmovq %xmm0, %rax
+; KNL-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm2
+; KNL-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3]
+; KNL-NEXT: vextracti128 $1, %ymm0, %xmm0
+; KNL-NEXT: vmovq %xmm0, %rax
+; KNL-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm2
+; KNL-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3]
+; KNL-NEXT: vpextrq $1, %xmm0, %rax
+; KNL-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm0
+; KNL-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
+; KNL-NEXT: retq
;
; VLDQ-LABEL: sltof432:
; VLDQ: ## BB#0:
; VLDQ-NEXT: vcvtqq2ps %ymm0, %xmm0
+; VLDQ-NEXT: vzeroupper
; VLDQ-NEXT: retq
;
+; VLNODQ-LABEL: sltof432:
+; VLNODQ: ## BB#0:
+; VLNODQ-NEXT: vpextrq $1, %xmm0, %rax
+; VLNODQ-NEXT: vcvtsi2ssq %rax, %xmm1, %xmm1
+; VLNODQ-NEXT: vmovq %xmm0, %rax
+; VLNODQ-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm2
+; VLNODQ-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3]
+; VLNODQ-NEXT: vextracti128 $1, %ymm0, %xmm0
+; VLNODQ-NEXT: vmovq %xmm0, %rax
+; VLNODQ-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm2
+; VLNODQ-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3]
+; VLNODQ-NEXT: vpextrq $1, %xmm0, %rax
+; VLNODQ-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm0
+; VLNODQ-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
+; VLNODQ-NEXT: vzeroupper
+; VLNODQ-NEXT: retq
+;
; AVX512DQ-LABEL: sltof432:
; AVX512DQ: ## BB#0:
; AVX512DQ-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
; AVX512DQ-NEXT: vcvtqq2ps %zmm0, %ymm0
; AVX512DQ-NEXT: ## kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
+;
+; AVX512BW-LABEL: sltof432:
+; AVX512BW: ## BB#0:
+; AVX512BW-NEXT: vpextrq $1, %xmm0, %rax
+; AVX512BW-NEXT: vcvtsi2ssq %rax, %xmm1, %xmm1
+; AVX512BW-NEXT: vmovq %xmm0, %rax
+; AVX512BW-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm2
+; AVX512BW-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3]
+; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm0
+; AVX512BW-NEXT: vmovq %xmm0, %rax
+; AVX512BW-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm2
+; AVX512BW-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3]
+; AVX512BW-NEXT: vpextrq $1, %xmm0, %rax
+; AVX512BW-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm0
+; AVX512BW-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
+; AVX512BW-NEXT: vzeroupper
+; AVX512BW-NEXT: retq
%b = sitofp <4 x i64> %a to <4 x float>
ret <4 x float> %b
}
define <4 x float> @ultof432(<4 x i64> %a) {
-; NODQ-LABEL: ultof432:
-; NODQ: ## BB#0:
-; NODQ-NEXT: vpextrq $1, %xmm0, %rax
-; NODQ-NEXT: vcvtusi2ssq %rax, %xmm1, %xmm1
-; NODQ-NEXT: vmovq %xmm0, %rax
-; NODQ-NEXT: vcvtusi2ssq %rax, %xmm2, %xmm2
-; NODQ-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3]
-; NODQ-NEXT: vextracti128 $1, %ymm0, %xmm0
-; NODQ-NEXT: vmovq %xmm0, %rax
-; NODQ-NEXT: vcvtusi2ssq %rax, %xmm3, %xmm2
-; NODQ-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3]
-; NODQ-NEXT: vpextrq $1, %xmm0, %rax
-; NODQ-NEXT: vcvtusi2ssq %rax, %xmm3, %xmm0
-; NODQ-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
-; NODQ-NEXT: retq
+; KNL-LABEL: ultof432:
+; KNL: ## BB#0:
+; KNL-NEXT: vpextrq $1, %xmm0, %rax
+; KNL-NEXT: vcvtusi2ssq %rax, %xmm1, %xmm1
+; KNL-NEXT: vmovq %xmm0, %rax
+; KNL-NEXT: vcvtusi2ssq %rax, %xmm2, %xmm2
+; KNL-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3]
+; KNL-NEXT: vextracti128 $1, %ymm0, %xmm0
+; KNL-NEXT: vmovq %xmm0, %rax
+; KNL-NEXT: vcvtusi2ssq %rax, %xmm3, %xmm2
+; KNL-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3]
+; KNL-NEXT: vpextrq $1, %xmm0, %rax
+; KNL-NEXT: vcvtusi2ssq %rax, %xmm3, %xmm0
+; KNL-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
+; KNL-NEXT: retq
;
; VLDQ-LABEL: ultof432:
; VLDQ: ## BB#0:
; VLDQ-NEXT: vcvtuqq2ps %ymm0, %xmm0
+; VLDQ-NEXT: vzeroupper
; VLDQ-NEXT: retq
;
+; VLNODQ-LABEL: ultof432:
+; VLNODQ: ## BB#0:
+; VLNODQ-NEXT: vpextrq $1, %xmm0, %rax
+; VLNODQ-NEXT: vcvtusi2ssq %rax, %xmm1, %xmm1
+; VLNODQ-NEXT: vmovq %xmm0, %rax
+; VLNODQ-NEXT: vcvtusi2ssq %rax, %xmm2, %xmm2
+; VLNODQ-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3]
+; VLNODQ-NEXT: vextracti128 $1, %ymm0, %xmm0
+; VLNODQ-NEXT: vmovq %xmm0, %rax
+; VLNODQ-NEXT: vcvtusi2ssq %rax, %xmm3, %xmm2
+; VLNODQ-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3]
+; VLNODQ-NEXT: vpextrq $1, %xmm0, %rax
+; VLNODQ-NEXT: vcvtusi2ssq %rax, %xmm3, %xmm0
+; VLNODQ-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
+; VLNODQ-NEXT: vzeroupper
+; VLNODQ-NEXT: retq
+;
; AVX512DQ-LABEL: ultof432:
; AVX512DQ: ## BB#0:
; AVX512DQ-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
; AVX512DQ-NEXT: vcvtuqq2ps %zmm0, %ymm0
; AVX512DQ-NEXT: ## kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
+;
+; AVX512BW-LABEL: ultof432:
+; AVX512BW: ## BB#0:
+; AVX512BW-NEXT: vpextrq $1, %xmm0, %rax
+; AVX512BW-NEXT: vcvtusi2ssq %rax, %xmm1, %xmm1
+; AVX512BW-NEXT: vmovq %xmm0, %rax
+; AVX512BW-NEXT: vcvtusi2ssq %rax, %xmm2, %xmm2
+; AVX512BW-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3]
+; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm0
+; AVX512BW-NEXT: vmovq %xmm0, %rax
+; AVX512BW-NEXT: vcvtusi2ssq %rax, %xmm3, %xmm2
+; AVX512BW-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3]
+; AVX512BW-NEXT: vpextrq $1, %xmm0, %rax
+; AVX512BW-NEXT: vcvtusi2ssq %rax, %xmm3, %xmm0
+; AVX512BW-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
+; AVX512BW-NEXT: vzeroupper
+; AVX512BW-NEXT: retq
%b = uitofp <4 x i64> %a to <4 x float>
ret <4 x float> %b
}
}
define <4 x i32> @fptoui_128(<4 x float> %a) nounwind {
-; NOVL-LABEL: fptoui_128:
-; NOVL: ## BB#0:
-; NOVL-NEXT: ## kill: %XMM0<def> %XMM0<kill> %ZMM0<def>
-; NOVL-NEXT: vcvttps2udq %zmm0, %zmm0
-; NOVL-NEXT: ## kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
-; NOVL-NEXT: retq
+; KNL-LABEL: fptoui_128:
+; KNL: ## BB#0:
+; KNL-NEXT: ## kill: %XMM0<def> %XMM0<kill> %ZMM0<def>
+; KNL-NEXT: vcvttps2udq %zmm0, %zmm0
+; KNL-NEXT: ## kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
+; KNL-NEXT: retq
;
; VL-LABEL: fptoui_128:
; VL: ## BB#0:
; VL-NEXT: vcvttps2udq %xmm0, %xmm0
; VL-NEXT: retq
+;
+; AVX512DQ-LABEL: fptoui_128:
+; AVX512DQ: ## BB#0:
+; AVX512DQ-NEXT: ## kill: %XMM0<def> %XMM0<kill> %ZMM0<def>
+; AVX512DQ-NEXT: vcvttps2udq %zmm0, %zmm0
+; AVX512DQ-NEXT: ## kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
+; AVX512DQ-NEXT: vzeroupper
+; AVX512DQ-NEXT: retq
+;
+; AVX512BW-LABEL: fptoui_128:
+; AVX512BW: ## BB#0:
+; AVX512BW-NEXT: ## kill: %XMM0<def> %XMM0<kill> %ZMM0<def>
+; AVX512BW-NEXT: vcvttps2udq %zmm0, %zmm0
+; AVX512BW-NEXT: ## kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
+; AVX512BW-NEXT: vzeroupper
+; AVX512BW-NEXT: retq
%b = fptoui <4 x float> %a to <4 x i32>
ret <4 x i32> %b
}
}
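; Note the asymmetry in fptoui_128 above: the VL run stays xmm-only and
; keeps a bare retq, while the AVX512DQ/AVX512BW runs widen to zmm and so
; pick up a trailing vzeroupper. A minimal sketch of the xmm-only case
; (hypothetical function, not one of the committed tests):
;
;   define <4 x i32> @sketch_xmm_only(<4 x i32> %a) {
;     %r = add <4 x i32> %a, %a   ; pure xmm traffic, upper state stays clean
;     ret <4 x i32> %r            ; no vzeroupper expected before retq
;   }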
define <4 x i32> @fptoui_256d(<4 x double> %a) nounwind {
-; NOVL-LABEL: fptoui_256d:
-; NOVL: ## BB#0:
-; NOVL-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
-; NOVL-NEXT: vcvttpd2udq %zmm0, %ymm0
-; NOVL-NEXT: ## kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
-; NOVL-NEXT: retq
+; KNL-LABEL: fptoui_256d:
+; KNL: ## BB#0:
+; KNL-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; KNL-NEXT: vcvttpd2udq %zmm0, %ymm0
+; KNL-NEXT: ## kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; KNL-NEXT: retq
;
; VL-LABEL: fptoui_256d:
; VL: ## BB#0:
; VL-NEXT: vcvttpd2udq %ymm0, %xmm0
+; VL-NEXT: vzeroupper
; VL-NEXT: retq
+;
+; AVX512DQ-LABEL: fptoui_256d:
+; AVX512DQ: ## BB#0:
+; AVX512DQ-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; AVX512DQ-NEXT: vcvttpd2udq %zmm0, %ymm0
+; AVX512DQ-NEXT: ## kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512DQ-NEXT: vzeroupper
+; AVX512DQ-NEXT: retq
+;
+; AVX512BW-LABEL: fptoui_256d:
+; AVX512BW: ## BB#0:
+; AVX512BW-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; AVX512BW-NEXT: vcvttpd2udq %zmm0, %ymm0
+; AVX512BW-NEXT: ## kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512BW-NEXT: vzeroupper
+; AVX512BW-NEXT: retq
%b = fptoui <4 x double> %a to <4 x i32>
ret <4 x i32> %b
}
}
define <4 x i32> @fptosi03(<4 x double> %a) {
-; ALL-LABEL: fptosi03:
-; ALL: ## BB#0:
-; ALL-NEXT: vcvttpd2dq %ymm0, %xmm0
-; ALL-NEXT: retq
+; KNL-LABEL: fptosi03:
+; KNL: ## BB#0:
+; KNL-NEXT: vcvttpd2dq %ymm0, %xmm0
+; KNL-NEXT: retq
+;
+; AVX512-LABEL: fptosi03:
+; AVX512: ## BB#0:
+; AVX512-NEXT: vcvttpd2dq %ymm0, %xmm0
+; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: retq
%b = fptosi <4 x double> %a to <4 x i32>
ret <4 x i32> %b
}
}
define <4 x float> @fptrunc01(<4 x double> %b) {
-; ALL-LABEL: fptrunc01:
-; ALL: ## BB#0:
-; ALL-NEXT: vcvtpd2ps %ymm0, %xmm0
-; ALL-NEXT: retq
+; KNL-LABEL: fptrunc01:
+; KNL: ## BB#0:
+; KNL-NEXT: vcvtpd2ps %ymm0, %xmm0
+; KNL-NEXT: retq
+;
+; AVX512-LABEL: fptrunc01:
+; AVX512: ## BB#0:
+; AVX512-NEXT: vcvtpd2ps %ymm0, %xmm0
+; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: retq
%a = fptrunc <4 x double> %b to <4 x float>
ret <4 x float> %a
}
define <4 x float> @fptrunc02(<4 x double> %b, <4 x i1> %mask) {
-; NOVL-LABEL: fptrunc02:
-; NOVL: ## BB#0:
-; NOVL-NEXT: vpslld $31, %xmm1, %xmm1
-; NOVL-NEXT: vpsrad $31, %xmm1, %xmm1
-; NOVL-NEXT: vcvtpd2ps %ymm0, %xmm0
-; NOVL-NEXT: vpand %xmm0, %xmm1, %xmm0
-; NOVL-NEXT: retq
+; KNL-LABEL: fptrunc02:
+; KNL: ## BB#0:
+; KNL-NEXT: vpslld $31, %xmm1, %xmm1
+; KNL-NEXT: vpsrad $31, %xmm1, %xmm1
+; KNL-NEXT: vcvtpd2ps %ymm0, %xmm0
+; KNL-NEXT: vpand %xmm0, %xmm1, %xmm0
+; KNL-NEXT: retq
;
; VL-LABEL: fptrunc02:
; VL: ## BB#0:
; VL-NEXT: vpslld $31, %xmm1, %xmm1
; VL-NEXT: vptestmd %xmm1, %xmm1, %k1
; VL-NEXT: vcvtpd2ps %ymm0, %xmm0 {%k1} {z}
+; VL-NEXT: vzeroupper
; VL-NEXT: retq
+;
+; AVX512DQ-LABEL: fptrunc02:
+; AVX512DQ: ## BB#0:
+; AVX512DQ-NEXT: vpslld $31, %xmm1, %xmm1
+; AVX512DQ-NEXT: vpsrad $31, %xmm1, %xmm1
+; AVX512DQ-NEXT: vcvtpd2ps %ymm0, %xmm0
+; AVX512DQ-NEXT: vpand %xmm0, %xmm1, %xmm0
+; AVX512DQ-NEXT: vzeroupper
+; AVX512DQ-NEXT: retq
+;
+; AVX512BW-LABEL: fptrunc02:
+; AVX512BW: ## BB#0:
+; AVX512BW-NEXT: vpslld $31, %xmm1, %xmm1
+; AVX512BW-NEXT: vpsrad $31, %xmm1, %xmm1
+; AVX512BW-NEXT: vcvtpd2ps %ymm0, %xmm0
+; AVX512BW-NEXT: vpand %xmm0, %xmm1, %xmm0
+; AVX512BW-NEXT: vzeroupper
+; AVX512BW-NEXT: retq
%a = fptrunc <4 x double> %b to <4 x float>
%c = select <4 x i1>%mask, <4 x float>%a, <4 x float> zeroinitializer
ret <4 x float> %c
}
define <4 x float> @uitof32_128(<4 x i32> %a) nounwind {
-; NOVL-LABEL: uitof32_128:
-; NOVL: ## BB#0:
-; NOVL-NEXT: ## kill: %XMM0<def> %XMM0<kill> %ZMM0<def>
-; NOVL-NEXT: vcvtudq2ps %zmm0, %zmm0
-; NOVL-NEXT: ## kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
-; NOVL-NEXT: retq
+; KNL-LABEL: uitof32_128:
+; KNL: ## BB#0:
+; KNL-NEXT: ## kill: %XMM0<def> %XMM0<kill> %ZMM0<def>
+; KNL-NEXT: vcvtudq2ps %zmm0, %zmm0
+; KNL-NEXT: ## kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
+; KNL-NEXT: retq
;
; VL-LABEL: uitof32_128:
; VL: ## BB#0:
; VL-NEXT: vcvtudq2ps %xmm0, %xmm0
; VL-NEXT: retq
+;
+; AVX512DQ-LABEL: uitof32_128:
+; AVX512DQ: ## BB#0:
+; AVX512DQ-NEXT: ## kill: %XMM0<def> %XMM0<kill> %ZMM0<def>
+; AVX512DQ-NEXT: vcvtudq2ps %zmm0, %zmm0
+; AVX512DQ-NEXT: ## kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
+; AVX512DQ-NEXT: vzeroupper
+; AVX512DQ-NEXT: retq
+;
+; AVX512BW-LABEL: uitof32_128:
+; AVX512BW: ## BB#0:
+; AVX512BW-NEXT: ## kill: %XMM0<def> %XMM0<kill> %ZMM0<def>
+; AVX512BW-NEXT: vcvtudq2ps %zmm0, %zmm0
+; AVX512BW-NEXT: ## kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
+; AVX512BW-NEXT: vzeroupper
+; AVX512BW-NEXT: retq
%b = uitofp <4 x i32> %a to <4 x float>
ret <4 x float> %b
}
}
define i16 @trunc_16i32_to_16i1(<16 x i32> %a) {
-; ALL-LABEL: trunc_16i32_to_16i1:
-; ALL: ## BB#0:
-; ALL-NEXT: vpslld $31, %zmm0, %zmm0
-; ALL-NEXT: vptestmd %zmm0, %zmm0, %k0
-; ALL-NEXT: kmovw %k0, %eax
-; ALL-NEXT: retq
+; KNL-LABEL: trunc_16i32_to_16i1:
+; KNL: ## BB#0:
+; KNL-NEXT: vpslld $31, %zmm0, %zmm0
+; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0
+; KNL-NEXT: kmovw %k0, %eax
+; KNL-NEXT: retq
+;
+; SKX-LABEL: trunc_16i32_to_16i1:
+; SKX: ## BB#0:
+; SKX-NEXT: vpslld $31, %zmm0, %zmm0
+; SKX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; SKX-NEXT: kmovw %k0, %eax
+; SKX-NEXT: vzeroupper
+; SKX-NEXT: retq
%mask_b = trunc <16 x i32>%a to <16 x i1>
%mask = bitcast <16 x i1> %mask_b to i16
ret i16 %mask
; SKX: ## BB#0:
; SKX-NEXT: vpcmpgtd %ymm0, %ymm1, %k0
; SKX-NEXT: vpmovm2w %k0, %xmm0
+; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
%x = icmp slt <8 x i32> %a1, %a2
%y = sext <8 x i1> %x to <8 x i16>
}
define void @extload_v8i64(<8 x i8>* %a, <8 x i64>* %res) {
-; ALL-LABEL: extload_v8i64:
-; ALL: ## BB#0:
-; ALL-NEXT: vpmovsxbq (%rdi), %zmm0
-; ALL-NEXT: vmovdqa64 %zmm0, (%rsi)
-; ALL-NEXT: retq
+; KNL-LABEL: extload_v8i64:
+; KNL: ## BB#0:
+; KNL-NEXT: vpmovsxbq (%rdi), %zmm0
+; KNL-NEXT: vmovdqa64 %zmm0, (%rsi)
+; KNL-NEXT: retq
+;
+; SKX-LABEL: extload_v8i64:
+; SKX: ## BB#0:
+; SKX-NEXT: vpmovsxbq (%rdi), %zmm0
+; SKX-NEXT: vmovdqa64 %zmm0, (%rsi)
+; SKX-NEXT: vzeroupper
+; SKX-NEXT: retq
%sign_load = load <8 x i8>, <8 x i8>* %a
%c = sext <8 x i8> %sign_load to <8 x i64>
store <8 x i64> %c, <8 x i64>* %res
; SKX-LABEL: extract_subvector128_v32i16:
; SKX: ## BB#0:
; SKX-NEXT: vextracti32x4 $2, %zmm0, %xmm0
+; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
%r1 = shufflevector <32 x i16> %x, <32 x i16> undef, <8 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
ret <8 x i16> %r1
; SKX-LABEL: extract_subvector128_v32i16_first_element:
; SKX: ## BB#0:
; SKX-NEXT: ## kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
+; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
%r1 = shufflevector <32 x i16> %x, <32 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
ret <8 x i16> %r1
; SKX-LABEL: extract_subvector128_v64i8:
; SKX: ## BB#0:
; SKX-NEXT: vextracti32x4 $2, %zmm0, %xmm0
+; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
%r1 = shufflevector <64 x i8> %x, <64 x i8> undef, <16 x i32> <i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38,i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47>
ret <16 x i8> %r1
; SKX-LABEL: extract_subvector128_v64i8_first_element:
; SKX: ## BB#0:
; SKX-NEXT: ## kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
+; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
%r1 = shufflevector <64 x i8> %x, <64 x i8> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
ret <16 x i8> %r1
; SKX-LABEL: extract_subvector256_v8f64_store:
; SKX: ## BB#0: ## %entry
; SKX-NEXT: vextractf128 $1, %ymm0, (%rdi)
+; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
entry:
%0 = shufflevector <4 x double> %a, <4 x double> undef, <2 x i32> <i32 2, i32 3>
; SKX-LABEL: extract_subvector256_v8f32_store:
; SKX: ## BB#0: ## %entry
; SKX-NEXT: vextractf128 $1, %ymm0, (%rdi)
+; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
entry:
%0 = shufflevector <8 x float> %a, <8 x float> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
; SKX-LABEL: extract_subvector256_v4i64_store:
; SKX: ## BB#0: ## %entry
; SKX-NEXT: vextracti128 $1, %ymm0, (%rdi)
+; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
entry:
%0 = shufflevector <4 x i64> %a, <4 x i64> undef, <2 x i32> <i32 2, i32 3>
; SKX-LABEL: extract_subvector256_v8i32_store:
; SKX: ## BB#0: ## %entry
; SKX-NEXT: vextracti128 $1, %ymm0, (%rdi)
+; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
entry:
%0 = shufflevector <8 x i32> %a, <8 x i32> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
; SKX-LABEL: extract_subvector256_v16i16_store:
; SKX: ## BB#0: ## %entry
; SKX-NEXT: vextracti128 $1, %ymm0, (%rdi)
+; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
entry:
%0 = shufflevector <16 x i16> %a, <16 x i16> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
; SKX-LABEL: extract_subvector256_v32i8_store:
; SKX: ## BB#0: ## %entry
; SKX-NEXT: vextracti128 $1, %ymm0, (%rdi)
+; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
entry:
%0 = shufflevector <32 x i8> %a, <32 x i8> undef, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
; SKX-LABEL: extract_subvector256_v4f64_store_lo:
; SKX: ## BB#0: ## %entry
; SKX-NEXT: vmovups %xmm0, (%rdi)
+; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
entry:
%0 = shufflevector <4 x double> %a, <4 x double> undef, <2 x i32> <i32 0, i32 1>
; SKX-LABEL: extract_subvector256_v4f64_store_lo_align_16:
; SKX: ## BB#0: ## %entry
; SKX-NEXT: vmovaps %xmm0, (%rdi)
+; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
entry:
%0 = shufflevector <4 x double> %a, <4 x double> undef, <2 x i32> <i32 0, i32 1>
; SKX-LABEL: extract_subvector256_v4f32_store_lo:
; SKX: ## BB#0: ## %entry
; SKX-NEXT: vmovups %xmm0, (%rdi)
+; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
entry:
%0 = shufflevector <8 x float> %a, <8 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
; SKX-LABEL: extract_subvector256_v4f32_store_lo_align_16:
; SKX: ## BB#0: ## %entry
; SKX-NEXT: vmovaps %xmm0, (%rdi)
+; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
entry:
%0 = shufflevector <8 x float> %a, <8 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
; SKX-LABEL: extract_subvector256_v2i64_store_lo:
; SKX: ## BB#0: ## %entry
; SKX-NEXT: vmovups %xmm0, (%rdi)
+; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
entry:
%0 = shufflevector <4 x i64> %a, <4 x i64> undef, <2 x i32> <i32 0, i32 1>
; SKX-LABEL: extract_subvector256_v2i64_store_lo_align_16:
; SKX: ## BB#0: ## %entry
; SKX-NEXT: vmovaps %xmm0, (%rdi)
+; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
entry:
%0 = shufflevector <4 x i64> %a, <4 x i64> undef, <2 x i32> <i32 0, i32 1>
; SKX-LABEL: extract_subvector256_v4i32_store_lo:
; SKX: ## BB#0: ## %entry
; SKX-NEXT: vmovups %xmm0, (%rdi)
+; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
entry:
%0 = shufflevector <8 x i32> %a, <8 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
; SKX-LABEL: extract_subvector256_v4i32_store_lo_align_16:
; SKX: ## BB#0: ## %entry
; SKX-NEXT: vmovaps %xmm0, (%rdi)
+; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
entry:
%0 = shufflevector <8 x i32> %a, <8 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
; SKX-LABEL: extract_subvector256_v8i16_store_lo:
; SKX: ## BB#0: ## %entry
; SKX-NEXT: vmovups %xmm0, (%rdi)
+; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
entry:
%0 = shufflevector <16 x i16> %a, <16 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
; SKX-LABEL: extract_subvector256_v8i16_store_lo_align_16:
; SKX: ## BB#0: ## %entry
; SKX-NEXT: vmovaps %xmm0, (%rdi)
+; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
entry:
%0 = shufflevector <16 x i16> %a, <16 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
; SKX-LABEL: extract_subvector256_v16i8_store_lo:
; SKX: ## BB#0: ## %entry
; SKX-NEXT: vmovups %xmm0, (%rdi)
+; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
entry:
%0 = shufflevector <32 x i8> %a, <32 x i8> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
; SKX-LABEL: extract_subvector256_v16i8_store_lo_align_16:
; SKX: ## BB#0: ## %entry
; SKX-NEXT: vmovaps %xmm0, (%rdi)
+; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
entry:
%0 = shufflevector <32 x i8> %a, <32 x i8> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
; SKX-LABEL: extract_subvector512_v2f64_store_lo:
; SKX: ## BB#0: ## %entry
; SKX-NEXT: vmovups %xmm0, (%rdi)
+; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
entry:
%0 = shufflevector <8 x double> %a, <8 x double> undef, <2 x i32> <i32 0, i32 1>
; SKX-LABEL: extract_subvector512_v2f64_store_lo_align_16:
; SKX: ## BB#0: ## %entry
; SKX-NEXT: vmovaps %xmm0, (%rdi)
+; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
entry:
%0 = shufflevector <8 x double> %a, <8 x double> undef, <2 x i32> <i32 0, i32 1>
; SKX-LABEL: extract_subvector512_v4f32_store_lo:
; SKX: ## BB#0: ## %entry
; SKX-NEXT: vmovups %xmm0, (%rdi)
+; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
entry:
%0 = shufflevector <16 x float> %a, <16 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
; SKX-LABEL: extract_subvector512_v4f32_store_lo_align_16:
; SKX: ## BB#0: ## %entry
; SKX-NEXT: vmovaps %xmm0, (%rdi)
+; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
entry:
%0 = shufflevector <16 x float> %a, <16 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
; SKX-LABEL: extract_subvector512_v2i64_store_lo:
; SKX: ## BB#0: ## %entry
; SKX-NEXT: vmovups %xmm0, (%rdi)
+; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
entry:
%0 = shufflevector <8 x i64> %a, <8 x i64> undef, <2 x i32> <i32 0, i32 1>
; SKX-LABEL: extract_subvector512_v2i64_store_lo_align_16:
; SKX: ## BB#0: ## %entry
; SKX-NEXT: vmovaps %xmm0, (%rdi)
+; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
entry:
%0 = shufflevector <8 x i64> %a, <8 x i64> undef, <2 x i32> <i32 0, i32 1>
; SKX-LABEL: extract_subvector512_v4i32_store_lo:
; SKX: ## BB#0: ## %entry
; SKX-NEXT: vmovups %xmm0, (%rdi)
+; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
entry:
%0 = shufflevector <16 x i32> %a, <16 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
; SKX-LABEL: extract_subvector512_v4i32_store_lo_align_16:
; SKX: ## BB#0: ## %entry
; SKX-NEXT: vmovaps %xmm0, (%rdi)
+; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
entry:
%0 = shufflevector <16 x i32> %a, <16 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
; SKX-LABEL: extract_subvector512_v8i16_store_lo:
; SKX: ## BB#0: ## %entry
; SKX-NEXT: vmovups %xmm0, (%rdi)
+; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
entry:
%0 = shufflevector <32 x i16> %a, <32 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
; SKX-LABEL: extract_subvector512_v16i8_store_lo:
; SKX: ## BB#0: ## %entry
; SKX-NEXT: vmovups %xmm0, (%rdi)
+; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
entry:
%0 = shufflevector <64 x i8> %a, <64 x i8> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
; SKX-LABEL: extract_subvector512_v16i8_store_lo_align_16:
; SKX: ## BB#0: ## %entry
; SKX-NEXT: vmovaps %xmm0, (%rdi)
+; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
entry:
%0 = shufflevector <64 x i8> %a, <64 x i8> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
; SKX-LABEL: extract_subvector512_v4f64_store_lo:
; SKX: ## BB#0: ## %entry
; SKX-NEXT: vmovups %ymm0, (%rdi)
+; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
entry:
%0 = shufflevector <8 x double> %a, <8 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
; SKX-LABEL: extract_subvector512_v4f64_store_lo_align_16:
; SKX: ## BB#0: ## %entry
; SKX-NEXT: vmovups %ymm0, (%rdi)
+; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
entry:
%0 = shufflevector <8 x double> %a, <8 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
; SKX-LABEL: extract_subvector512_v4f64_store_lo_align_32:
; SKX: ## BB#0: ## %entry
; SKX-NEXT: vmovaps %ymm0, (%rdi)
+; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
entry:
%0 = shufflevector <8 x double> %a, <8 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
; SKX-LABEL: extract_subvector512_v8f32_store_lo:
; SKX: ## BB#0: ## %entry
; SKX-NEXT: vmovups %ymm0, (%rdi)
+; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
entry:
%0 = shufflevector <16 x float> %a, <16 x float> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
; SKX-LABEL: extract_subvector512_v8f32_store_lo_align_16:
; SKX: ## BB#0: ## %entry
; SKX-NEXT: vmovaps %ymm0, (%rdi)
+; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
entry:
%0 = shufflevector <16 x float> %a, <16 x float> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
; SKX-LABEL: extract_subvector512_v8f32_store_lo_align_32:
; SKX: ## BB#0: ## %entry
; SKX-NEXT: vmovaps %ymm0, (%rdi)
+; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
entry:
%0 = shufflevector <16 x float> %a, <16 x float> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
; SKX-LABEL: extract_subvector512_v4i64_store_lo:
; SKX: ## BB#0: ## %entry
; SKX-NEXT: vmovups %ymm0, (%rdi)
+; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
entry:
%0 = shufflevector <8 x i64> %a, <8 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
; SKX-LABEL: extract_subvector512_v4i64_store_lo_align_16:
; SKX: ## BB#0: ## %entry
; SKX-NEXT: vmovups %ymm0, (%rdi)
+; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
entry:
%0 = shufflevector <8 x i64> %a, <8 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
; SKX-LABEL: extract_subvector512_v4i64_store_lo_align_32:
; SKX: ## BB#0: ## %entry
; SKX-NEXT: vmovaps %ymm0, (%rdi)
+; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
entry:
%0 = shufflevector <8 x i64> %a, <8 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
; SKX-LABEL: extract_subvector512_v8i32_store_lo:
; SKX: ## BB#0: ## %entry
; SKX-NEXT: vmovups %ymm0, (%rdi)
+; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
entry:
%0 = shufflevector <16 x i32> %a, <16 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
; SKX-LABEL: extract_subvector512_v8i32_store_lo_align_16:
; SKX: ## BB#0: ## %entry
; SKX-NEXT: vmovups %ymm0, (%rdi)
+; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
entry:
%0 = shufflevector <16 x i32> %a, <16 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
; SKX-LABEL: extract_subvector512_v8i32_store_lo_align_32:
; SKX: ## BB#0: ## %entry
; SKX-NEXT: vmovaps %ymm0, (%rdi)
+; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
entry:
%0 = shufflevector <16 x i32> %a, <16 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
; SKX-LABEL: extract_subvector512_v16i16_store_lo:
; SKX: ## BB#0: ## %entry
; SKX-NEXT: vmovups %ymm0, (%rdi)
+; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
entry:
%0 = shufflevector <32 x i16> %a, <32 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
; SKX-LABEL: extract_subvector512_v16i16_store_lo_align_16:
; SKX: ## BB#0: ## %entry
; SKX-NEXT: vmovups %ymm0, (%rdi)
+; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
entry:
%0 = shufflevector <32 x i16> %a, <32 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
; SKX-LABEL: extract_subvector512_v16i16_store_lo_align_32:
; SKX: ## BB#0: ## %entry
; SKX-NEXT: vmovaps %ymm0, (%rdi)
+; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
entry:
%0 = shufflevector <32 x i16> %a, <32 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
; SKX-LABEL: extract_subvector512_v32i8_store_lo:
; SKX: ## BB#0: ## %entry
; SKX-NEXT: vmovups %ymm0, (%rdi)
+; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
entry:
%0 = shufflevector <64 x i8> %a, <64 x i8> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
; SKX-LABEL: extract_subvector512_v32i8_store_lo_align_16:
; SKX: ## BB#0: ## %entry
; SKX-NEXT: vmovups %ymm0, (%rdi)
+; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
entry:
%0 = shufflevector <64 x i8> %a, <64 x i8> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
; SKX-LABEL: extract_subvector512_v32i8_store_lo_align_32:
; SKX: ## BB#0: ## %entry
; SKX-NEXT: vmovaps %ymm0, (%rdi)
+; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
entry:
%0 = shufflevector <64 x i8> %a, <64 x i8> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
; SKX: ## BB#0: ## %entry
; SKX-NEXT: kmovb %edi, %k1
; SKX-NEXT: vextractf32x4 $1, %zmm1, %xmm0 {%k1}
+; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
entry:
%0 = bitcast <8 x double> %__A to <16 x float>
; SKX: ## BB#0: ## %entry
; SKX-NEXT: kmovb %edi, %k1
; SKX-NEXT: vextractf32x4 $1, %zmm0, %xmm0 {%k1} {z}
+; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
entry:
%0 = bitcast <8 x double> %__A to <16 x float>
; SKX: ## BB#0: ## %entry
; SKX-NEXT: kmovb %edi, %k1
; SKX-NEXT: vextractf64x2 $1, %ymm1, %xmm0 {%k1}
+; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
entry:
%shuffle = shufflevector <4 x double> %__A, <4 x double> undef, <2 x i32> <i32 2, i32 3>
; SKX: ## BB#0: ## %entry
; SKX-NEXT: kmovb %edi, %k1
; SKX-NEXT: vextractf64x2 $1, %ymm0, %xmm0 {%k1} {z}
+; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
entry:
%shuffle = shufflevector <4 x double> %__A, <4 x double> undef, <2 x i32> <i32 2, i32 3>
; SKX: ## BB#0: ## %entry
; SKX-NEXT: kmovb %edi, %k1
; SKX-NEXT: vextracti64x2 $1, %ymm1, %xmm0 {%k1}
+; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
entry:
%shuffle = shufflevector <4 x i64> %__A, <4 x i64> undef, <2 x i32> <i32 2, i32 3>
; SKX: ## BB#0: ## %entry
; SKX-NEXT: kmovb %edi, %k1
; SKX-NEXT: vextracti64x2 $1, %ymm0, %xmm0 {%k1} {z}
+; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
entry:
%shuffle = shufflevector <4 x i64> %__A, <4 x i64> undef, <2 x i32> <i32 2, i32 3>
; SKX: ## BB#0: ## %entry
; SKX-NEXT: kmovb %edi, %k1
; SKX-NEXT: vextractf32x4 $1, %ymm1, %xmm0 {%k1}
+; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
entry:
%shuffle = shufflevector <8 x float> %__A, <8 x float> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
; SKX: ## BB#0: ## %entry
; SKX-NEXT: kmovb %edi, %k1
; SKX-NEXT: vextractf32x4 $1, %ymm0, %xmm0 {%k1} {z}
+; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
entry:
%shuffle = shufflevector <8 x float> %__A, <8 x float> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
; SKX: ## BB#0: ## %entry
; SKX-NEXT: kmovb %edi, %k1
; SKX-NEXT: vextracti32x4 $1, %ymm1, %xmm0 {%k1}
+; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__A to <8 x i32>
; SKX: ## BB#0: ## %entry
; SKX-NEXT: kmovb %edi, %k1
; SKX-NEXT: vextracti32x4 $1, %ymm0, %xmm0 {%k1} {z}
+; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__A to <8 x i32>
; SKX: ## BB#0: ## %entry
; SKX-NEXT: kmovb %edi, %k1
; SKX-NEXT: vextractf64x2 $3, %zmm1, %xmm0 {%k1}
+; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
entry:
%shuffle = shufflevector <8 x double> %__A, <8 x double> undef, <2 x i32> <i32 6, i32 7>
; SKX: ## BB#0: ## %entry
; SKX-NEXT: kmovb %edi, %k1
; SKX-NEXT: vextractf64x2 $3, %zmm0, %xmm0 {%k1} {z}
+; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
entry:
%shuffle = shufflevector <8 x double> %__A, <8 x double> undef, <2 x i32> <i32 6, i32 7>
; CHECK-NEXT: vgatherdps (%rsi,%zmm0,4), %zmm1 {%k2}
; CHECK-NEXT: vpaddd {{.*}}(%rip), %zmm0, %zmm0
; CHECK-NEXT: vscatterdps %zmm1, (%rdx,%zmm0,4) {%k1}
+; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
%x = call <16 x float> @llvm.x86.avx512.gather.dps.512 (<16 x float> %src, i8* %base, <16 x i32>%ind, i16 %mask, i32 4)
%ind2 = add <16 x i32> %ind, <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
; CHECK-NEXT: vgatherdpd (%rsi,%ymm0,4), %zmm1 {%k2}
; CHECK-NEXT: vpaddd {{.*}}(%rip), %ymm0, %ymm0
; CHECK-NEXT: vscatterdpd %zmm1, (%rdx,%ymm0,4) {%k1}
+; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
%x = call <8 x double> @llvm.x86.avx512.gather.dpd.512 (<8 x double> %src, i8* %base, <8 x i32>%ind, i8 %mask, i32 4)
%ind2 = add <8 x i32> %ind, <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
; CHECK-NEXT: vgatherqps (%rsi,%zmm0,4), %ymm1 {%k2}
; CHECK-NEXT: vpaddq {{.*}}(%rip), %zmm0, %zmm0
; CHECK-NEXT: vscatterqps %ymm1, (%rdx,%zmm0,4) {%k1}
+; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
%x = call <8 x float> @llvm.x86.avx512.gather.qps.512 (<8 x float> %src, i8* %base, <8 x i64>%ind, i8 %mask, i32 4)
%ind2 = add <8 x i64> %ind, <i64 0, i64 1, i64 2, i64 3, i64 0, i64 1, i64 2, i64 3>
; CHECK-NEXT: vgatherqpd (%rsi,%zmm0,4), %zmm1 {%k2}
; CHECK-NEXT: vpaddq {{.*}}(%rip), %zmm0, %zmm0
; CHECK-NEXT: vscatterqpd %zmm1, (%rdx,%zmm0,4) {%k1}
+; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
%x = call <8 x double> @llvm.x86.avx512.gather.qpd.512 (<8 x double> %src, i8* %base, <8 x i64>%ind, i8 %mask, i32 4)
%ind2 = add <8 x i64> %ind, <i64 0, i64 1, i64 2, i64 3, i64 0, i64 1, i64 2, i64 3>
; CHECK-NEXT: vpgatherdd (%rsi,%zmm0,4), %zmm1 {%k2}
; CHECK-NEXT: vpaddd {{.*}}(%rip), %zmm0, %zmm0
; CHECK-NEXT: vpscatterdd %zmm1, (%rdx,%zmm0,4) {%k1}
+; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
%x = call <16 x i32> @llvm.x86.avx512.gather.dpi.512 (<16 x i32> %src, i8* %base, <16 x i32>%ind, i16 %mask, i32 4)
%ind2 = add <16 x i32> %ind, <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
; CHECK-NEXT: vpgatherqd (%rsi,%zmm0,4), %ymm1 {%k2}
; CHECK-NEXT: vpaddq {{.*}}(%rip), %zmm0, %zmm0
; CHECK-NEXT: vpscatterqd %ymm1, (%rdx,%zmm0,4) {%k1}
+; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
%x = call <8 x i32> @llvm.x86.avx512.gather.qpi.512 (<8 x i32> %src, i8* %base, <8 x i64>%ind, i8 %mask, i32 4)
%ind2 = add <8 x i64> %ind, <i64 0, i64 1, i64 2, i64 3, i64 0, i64 1, i64 2, i64 3>
; CHECK-NEXT: vpgatherqq (%rsi,%zmm0,4), %zmm1 {%k2}
; CHECK-NEXT: vpaddq {{.*}}(%rip), %zmm0, %zmm0
; CHECK-NEXT: vpscatterqq %zmm1, (%rdx,%zmm0,4) {%k1}
+; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
%x = call <8 x i64> @llvm.x86.avx512.gather.qpq.512 (<8 x i64> %src, i8* %base, <8 x i64>%ind, i8 %mask, i32 4)
%ind2 = add <8 x i64> %ind, <i64 0, i64 1, i64 2, i64 3, i64 0, i64 1, i64 2, i64 3>
; CHECK-NEXT: vpgatherdq (%rsi,%ymm0,4), %zmm1 {%k2}
; CHECK-NEXT: vpaddd {{.*}}(%rip), %ymm0, %ymm0
; CHECK-NEXT: vpscatterdq %zmm1, (%rdx,%ymm0,4) {%k1}
+; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
%x = call <8 x i64> @llvm.x86.avx512.gather.dpq.512 (<8 x i64> %src, i8* %base, <8 x i32>%ind, i8 %mask, i32 4)
%ind2 = add <8 x i32> %ind, <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
; CHECK-NEXT: kmovb %edi, %k1
; CHECK-NEXT: vgatherdpd (%rsi,%ymm0,4), %zmm1 {%k1}
; CHECK-NEXT: vmovapd %zmm1, (%rdx)
+; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
%x = call <8 x double> @llvm.x86.avx512.gather.dpd.512 (<8 x double> %src, i8* %base, <8 x i32>%ind, i8 %mask, i32 4)
store <8 x double> %x, <8 x double>* %stbuf
; CHECK-NEXT: kmovb %edi, %k1
; CHECK-NEXT: vgatherqpd (%rsi,%zmm0,4), %zmm1 {%k1}
; CHECK-NEXT: vmovapd %zmm1, (%rdx)
+; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
%x = call <8 x double> @llvm.x86.avx512.gather.qpd.512 (<8 x double> %src, i8* %base, <8 x i64>%ind, i8 %mask, i32 4)
store <8 x double> %x, <8 x double>* %stbuf
; CHECK-NEXT: kmovb %esi, %k1
; CHECK-NEXT: vmovapd (%rdi), %zmm1
; CHECK-NEXT: vscatterdpd %zmm1, (%rcx,%ymm0,4) {%k1}
+; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
%x = load <8 x double>, <8 x double>* %src, align 64
call void @llvm.x86.avx512.scatter.dpd.512 (i8* %stbuf, i8 %mask, <8 x i32>%ind, <8 x double> %x, i32 4)
; CHECK-NEXT: kmovb %esi, %k1
; CHECK-NEXT: vmovapd (%rdi), %zmm1
; CHECK-NEXT: vscatterqpd %zmm1, (%rcx,%zmm0,4) {%k1}
+; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
%x = load <8 x double>, <8 x double>* %src, align 64
call void @llvm.x86.avx512.scatter.qpd.512 (i8* %stbuf, i8 %mask, <8 x i64>%ind, <8 x double> %x, i32 4)
; CHECK-NEXT: kmovw %esi, %k1
; CHECK-NEXT: vmovaps (%rdi), %zmm1
; CHECK-NEXT: vscatterdps %zmm1, (%rcx,%zmm0,4) {%k1}
+; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
%x = load <16 x float>, <16 x float>* %src, align 64
call void @llvm.x86.avx512.scatter.dps.512 (i8* %stbuf, i16 %mask, <16 x i32>%ind, <16 x float> %x, i32 4)
; CHECK-NEXT: kmovb %esi, %k1
; CHECK-NEXT: vmovaps (%rdi), %ymm1
; CHECK-NEXT: vscatterqps %ymm1, (%rcx,%zmm0,4) {%k1}
+; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
%x = load <8 x float>, <8 x float>* %src, align 32
call void @llvm.x86.avx512.scatter.qps.512 (i8* %stbuf, i8 %mask, <8 x i64>%ind, <8 x float> %x, i32 4)
; CHECK-NEXT: vgatherqps (%rdi,%zmm0,4), %ymm1 {%k2}
; CHECK-NEXT: vpaddq {{.*}}(%rip), %zmm0, %zmm0
; CHECK-NEXT: vscatterqps %ymm1, (%rsi,%zmm0,4) {%k1}
+; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
%x = call <8 x float> @llvm.x86.avx512.gather.qps.512 (<8 x float> %src, i8* %base, <8 x i64>%ind, i8 -1, i32 4)
%ind2 = add <8 x i64> %ind, <i64 0, i64 1, i64 2, i64 3, i64 0, i64 1, i64 2, i64 3>
; CHECK-NEXT: movb $120, %al
; CHECK-NEXT: kmovb %eax, %k1
; CHECK-NEXT: vscatterpf1qps (%rdi,%zmm0,2) {%k1}
+; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
call void @llvm.x86.avx512.gatherpf.qps.512(i8 -1, <8 x i64> %ind, i8* %base, i32 4, i32 0)
call void @llvm.x86.avx512.gatherpf.qps.512(i8 0, <8 x i64> %ind, i8* %base, i32 4, i32 1)
; CHECK: ## BB#0:
; CHECK-NEXT: kmovb %esi, %k1
; CHECK-NEXT: vpgatherqq (%rdi,%xmm1,8), %xmm0 {%k1}
; CHECK-NEXT: vpaddq %xmm0, %xmm0, %xmm0
; CHECK-NEXT: retq
%res = call <2 x i64> @llvm.x86.avx512.gather3div2.di(<2 x i64> %x0, i8* %x1, <2 x i64> %x2, i8 %x3, i32 8)
%res1 = call <2 x i64> @llvm.x86.avx512.gather3div2.di(<2 x i64> %x0, i8* %x1, <2 x i64> %x2, i8 %x3, i32 8)
; CHECK-NEXT: vpgatherqq (%rdi,%ymm1,8), %ymm2 {%k1}
; CHECK-NEXT: kxnorw %k0, %k0, %k1
; CHECK-NEXT: vpgatherqq (%rdi,%ymm1,8), %ymm0 {%k1}
; CHECK-NEXT: vpaddq %ymm0, %ymm2, %ymm0
; CHECK-NEXT: retq
%res = call <4 x i64> @llvm.x86.avx512.gather3div4.di(<4 x i64> %x0, i8* %x1, <4 x i64> %x2, i8 %x3, i32 8)
%res1 = call <4 x i64> @llvm.x86.avx512.gather3div4.di(<4 x i64> %x0, i8* %x1, <4 x i64> %x2, i8 -1, i32 8)
; CHECK-NEXT: kxnorw %k0, %k0, %k1
; CHECK-NEXT: vgatherqps (%rdi,%ymm1,2), %xmm0 {%k1}
; CHECK-NEXT: vaddps %xmm0, %xmm2, %xmm0
+; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
%res = call <4 x float> @llvm.x86.avx512.gather3div8.sf(<4 x float> %x0, i8* %x1, <4 x i64> %x2, i8 %x3, i32 4)
%res1 = call <4 x float> @llvm.x86.avx512.gather3div8.sf(<4 x float> %x0, i8* %x1, <4 x i64> %x2, i8 -1, i32 2)
; CHECK-NEXT: vpgatherqd (%rdi,%ymm1,4), %xmm2 {%k2}
; CHECK-NEXT: vpgatherqd (%rdi,%ymm1,2), %xmm0 {%k1}
; CHECK-NEXT: vpaddd %xmm0, %xmm2, %xmm0
+; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
%res = call <4 x i32> @llvm.x86.avx512.gather3div8.si(<4 x i32> %x0, i8* %x1, <4 x i64> %x2, i8 %x3, i32 4)
%res1 = call <4 x i32> @llvm.x86.avx512.gather3div8.si(<4 x i32> %x0, i8* %x1, <4 x i64> %x2, i8 %x3, i32 2)
; CHECK: ## BB#0:
; CHECK-NEXT: kmovb %esi, %k1
; CHECK-NEXT: vpgatherdq (%rdi,%xmm1,8), %ymm0 {%k1}
; CHECK-NEXT: vpaddq %ymm0, %ymm0, %ymm0
; CHECK-NEXT: retq
%res = call <4 x i64> @llvm.x86.avx512.gather3siv4.di(<4 x i64> %x0, i8* %x1, <4 x i32> %x2, i8 %x3, i32 8)
%res1 = call <4 x i64> @llvm.x86.avx512.gather3siv4.di(<4 x i64> %x0, i8* %x1, <4 x i32> %x2, i8 %x3, i32 8)
; CHECK-NEXT: vscatterqpd %ymm1, (%rdi,%ymm0,2) {%k1}
; CHECK-NEXT: kxnorw %k0, %k0, %k1
; CHECK-NEXT: vscatterqpd %ymm1, (%rdi,%ymm0,4) {%k1}
+; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
call void @llvm.x86.avx512.scatterdiv4.df(i8* %x0, i8 %x1, <4 x i64> %x2, <4 x double> %x3, i32 2)
call void @llvm.x86.avx512.scatterdiv4.df(i8* %x0, i8 -1, <4 x i64> %x2, <4 x double> %x3, i32 4)
; CHECK-NEXT: vpscatterqq %ymm1, (%rdi,%ymm0,2) {%k1}
; CHECK-NEXT: kxnorw %k0, %k0, %k1
; CHECK-NEXT: vpscatterqq %ymm1, (%rdi,%ymm0,4) {%k1}
+; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
call void @llvm.x86.avx512.scatterdiv4.di(i8* %x0, i8 %x1, <4 x i64> %x2, <4 x i64> %x3, i32 2)
call void @llvm.x86.avx512.scatterdiv4.di(i8* %x0, i8 -1, <4 x i64> %x2, <4 x i64> %x3, i32 4)
; CHECK-NEXT: vscatterqps %xmm1, (%rdi,%ymm0,2) {%k1}
; CHECK-NEXT: kxnorw %k0, %k0, %k1
; CHECK-NEXT: vscatterqps %xmm1, (%rdi,%ymm0,4) {%k1}
+; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
call void @llvm.x86.avx512.scatterdiv8.sf(i8* %x0, i8 %x1, <4 x i64> %x2, <4 x float> %x3, i32 2)
call void @llvm.x86.avx512.scatterdiv8.sf(i8* %x0, i8 -1, <4 x i64> %x2, <4 x float> %x3, i32 4)
; CHECK-NEXT: vpscatterqd %xmm1, (%rdi,%ymm0,2) {%k1}
; CHECK-NEXT: kxnorw %k0, %k0, %k1
; CHECK-NEXT: vpscatterqd %xmm1, (%rdi,%ymm0,4) {%k1}
+; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
call void @llvm.x86.avx512.scatterdiv8.si(i8* %x0, i8 %x1, <4 x i64> %x2, <4 x i32> %x3, i32 2)
call void @llvm.x86.avx512.scatterdiv8.si(i8* %x0, i8 -1, <4 x i64> %x2, <4 x i32> %x3, i32 4)
; CHECK-NEXT: vscatterdpd %ymm1, (%rdi,%xmm0,2) {%k1}
; CHECK-NEXT: kxnorw %k0, %k0, %k1
; CHECK-NEXT: vscatterdpd %ymm1, (%rdi,%xmm0,4) {%k1}
+; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
call void @llvm.x86.avx512.scattersiv4.df(i8* %x0, i8 %x1, <4 x i32> %x2, <4 x double> %x3, i32 2)
call void @llvm.x86.avx512.scattersiv4.df(i8* %x0, i8 -1, <4 x i32> %x2, <4 x double> %x3, i32 4)
; CHECK-NEXT: kxnorw %k0, %k0, %k2
; CHECK-NEXT: vpscatterdq %ymm1, (%rdi,%xmm0,2) {%k2}
; CHECK-NEXT: vpscatterdq %ymm1, (%rdi,%xmm0,4) {%k1}
+; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
call void @llvm.x86.avx512.scattersiv4.di(i8* %x0, i8 -1, <4 x i32> %x2, <4 x i64> %x3, i32 2)
call void @llvm.x86.avx512.scattersiv4.di(i8* %x0, i8 %x1, <4 x i32> %x2, <4 x i64> %x3, i32 4)
; CHECK-NEXT: vscatterdps %ymm1, (%rdi,%ymm0,2) {%k1}
; CHECK-NEXT: kxnorw %k0, %k0, %k1
; CHECK-NEXT: vscatterdps %ymm1, (%rdi,%ymm0,4) {%k1}
+; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
call void @llvm.x86.avx512.scattersiv8.sf(i8* %x0, i8 %x1, <8 x i32> %x2, <8 x float> %x3, i32 2)
call void @llvm.x86.avx512.scattersiv8.sf(i8* %x0, i8 -1, <8 x i32> %x2, <8 x float> %x3, i32 4)
; CHECK-NEXT: vpscatterdd %ymm1, (%rdi,%ymm0,2) {%k1}
; CHECK-NEXT: kxnorw %k0, %k0, %k1
; CHECK-NEXT: vpscatterdd %ymm1, (%rdi,%ymm0,4) {%k1}
+; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
call void @llvm.x86.avx512.scattersiv8.si(i8* %x0, i8 %x1, <8 x i32> %x2, <8 x i32> %x3, i32 2)
call void @llvm.x86.avx512.scattersiv8.si(i8* %x0, i8 -1, <8 x i32> %x2, <8 x i32> %x3, i32 4)
; CHECK-NEXT: movb $96, %al
; CHECK-NEXT: kmovb %eax, %k1
; CHECK-NEXT: vpscatterdd %ymm1, (%rdi,%ymm0,4) {%k1}
+; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
call void @llvm.x86.avx512.scattersiv8.si(i8* %x0, i8 -1, <8 x i32> %x2, <8 x i32> %x3, i32 2)
call void @llvm.x86.avx512.scattersiv8.si(i8* %x0, i8 0, <8 x i32> %x2, <8 x i32> %x3, i32 4)
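; For the scatter intrinsic tests above, vzeroupper is expected even though
; the results land in memory rather than a register: the scatter itself
; reads ymm operands, which is enough to leave the upper state dirty at the
; return.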
; SKX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SKX-NEXT: movq %rbp, %rsp
; SKX-NEXT: popq %rbp
+; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
%e = extractelement <16 x float> %x, i32 %ind
ret float %e
; SKX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
; SKX-NEXT: movq %rbp, %rsp
; SKX-NEXT: popq %rbp
+; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
%e = extractelement <8 x double> %x, i32 %ind
ret double %e
; SKX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SKX-NEXT: movq %rbp, %rsp
; SKX-NEXT: popq %rbp
+; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
%e = extractelement <8 x float> %x, i32 %ind
ret float %e
; SKX-NEXT: movl (%rsp,%rdi,4), %eax
; SKX-NEXT: movq %rbp, %rsp
; SKX-NEXT: popq %rbp
+; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
%e = extractelement <16 x i32> %x, i32 %ind
ret i32 %e
; SKX-NEXT: testb %al, %al
; SKX-NEXT: cmoveq %rsi, %rdi
; SKX-NEXT: movq %rdi, %rax
+; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
%cmpvector_func.i = icmp slt <16 x i64> %a, %b
%extract24vector_func.i = extractelement <16 x i1> %cmpvector_func.i, i32 0
; SKX-NEXT: testb %al, %al
; SKX-NEXT: cmoveq %rsi, %rdi
; SKX-NEXT: movq %rdi, %rax
+; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
%cmpvector_func.i = icmp slt <8 x i64> %a, %b
%extract24vector_func.i = extractelement <8 x i1> %cmpvector_func.i, i32 4
; SKX-NEXT: vpermi2d %zmm1, %zmm0, %zmm2
; SKX-NEXT: vpmovd2m %zmm2, %k0
; SKX-NEXT: kmovw %k0, %eax
+; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
%x = load i1 , i1 * %addr, align 128
%a1 = bitcast i16 %a to <16 x i1>
; SKX-NEXT: vpermi2q %zmm1, %zmm0, %zmm2
; SKX-NEXT: vpmovq2m %zmm2, %k0
; SKX-NEXT: kmovb %k0, %eax
+; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
%x = load i1 , i1 * %addr, align 128
%a1 = bitcast i8 %a to <8 x i1>
; SKX-NEXT: vpextrq $1, %xmm0, %rax
; SKX-NEXT: vextracti64x2 $1, %zmm0, %xmm0
; SKX-NEXT: vpextrq $1, %xmm0, (%rdi)
+; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
%r1 = extractelement <8 x i64> %x, i32 1
%r2 = extractelement <8 x i64> %x, i32 3
; SKX-NEXT: vpextrq $1, %xmm0, %rax
; SKX-NEXT: vextracti128 $1, %ymm0, %xmm0
; SKX-NEXT: vpextrq $1, %xmm0, (%rdi)
+; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
%r1 = extractelement <4 x i64> %x, i32 1
%r2 = extractelement <4 x i64> %x, i32 3
; SKX-NEXT: vpextrd $1, %xmm0, %eax
; SKX-NEXT: vextracti32x4 $1, %zmm0, %xmm0
; SKX-NEXT: vpextrd $1, %xmm0, (%rdi)
+; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
%r1 = extractelement <16 x i32> %x, i32 1
%r2 = extractelement <16 x i32> %x, i32 5
; SKX-NEXT: vpextrd $1, %xmm0, %eax
; SKX-NEXT: vextracti128 $1, %ymm0, %xmm0
; SKX-NEXT: vpextrd $1, %xmm0, (%rdi)
+; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
%r1 = extractelement <8 x i32> %x, i32 1
%r2 = extractelement <8 x i32> %x, i32 5
; SKX-NEXT: vextracti32x4 $1, %zmm0, %xmm0
; SKX-NEXT: vpextrw $1, %xmm0, (%rdi)
; SKX-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
%r1 = extractelement <32 x i16> %x, i32 1
%r2 = extractelement <32 x i16> %x, i32 9
; SKX-NEXT: vextracti128 $1, %ymm0, %xmm0
; SKX-NEXT: vpextrw $1, %xmm0, (%rdi)
; SKX-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
%r1 = extractelement <16 x i16> %x, i32 1
%r2 = extractelement <16 x i16> %x, i32 9
; SKX-NEXT: vextracti32x4 $1, %zmm0, %xmm0
; SKX-NEXT: vpextrb $1, %xmm0, (%rdi)
; SKX-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
%r1 = extractelement <64 x i8> %x, i32 1
%r2 = extractelement <64 x i8> %x, i32 17
; SKX-NEXT: vextracti128 $1, %ymm0, %xmm0
; SKX-NEXT: vpextrb $1, %xmm0, (%rdi)
; SKX-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
%r1 = extractelement <32 x i8> %x, i32 1
%r2 = extractelement <32 x i8> %x, i32 17
; SKX-NEXT: vpermi2w %zmm1, %zmm0, %zmm2
; SKX-NEXT: vpmovw2m %zmm2, %k0
; SKX-NEXT: kmovd %k0, %eax
+; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
%cmp_res_i1 = icmp ult i32 %a, %b
%cmp_cmp_vec = icmp ult <32 x i32> %x, %y
; SKX-NEXT: kshiftrd $31, %k0, %k0
; SKX-NEXT: kmovw %k0, %eax
; SKX-NEXT: andl $1, %eax
+; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
%t1 = icmp ugt <32 x i8> %a, %b
%t2 = extractelement <32 x i1> %t1, i32 2
; SKX-NEXT: sete %al
; SKX-NEXT: addb $3, %al
; SKX-NEXT: movzbl %al, %eax
+; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
%t1 = icmp ugt <64 x i8> %a, %b
%t2 = extractelement <64 x i1> %t1, i32 63
; SKX-NEXT: sete %al
; SKX-NEXT: addb $3, %al
; SKX-NEXT: movzbl %al, %eax
+; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
%t1 = icmp ugt <64 x i8> %a, %b
%t2 = extractelement <64 x i1> %t1, i32 63
; SKX-NEXT: movq (%rsp,%rdi,8), %rax
; SKX-NEXT: movq %rbp, %rsp
; SKX-NEXT: popq %rbp
+; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
%t2 = extractelement <4 x i64> %t1, i32 %index
ret i64 %t2
; SKX-NEXT: movq (%rsp,%rdi,8), %rax
; SKX-NEXT: movq %rbp, %rsp
; SKX-NEXT: popq %rbp
+; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
%t2 = extractelement <8 x i64> %t1, i32 %index
ret i64 %t2
; SKX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
; SKX-NEXT: movq %rbp, %rsp
; SKX-NEXT: popq %rbp
+; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
%t2 = extractelement <4 x double> %t1, i32 %index
ret double %t2
; SKX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
; SKX-NEXT: movq %rbp, %rsp
; SKX-NEXT: popq %rbp
+; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
%t2 = extractelement <8 x double> %t1, i32 %index
ret double %t2
; SKX-NEXT: movl (%rsp,%rdi,4), %eax
; SKX-NEXT: movq %rbp, %rsp
; SKX-NEXT: popq %rbp
+; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
%t2 = extractelement <8 x i32> %t1, i32 %index
ret i32 %t2
; SKX-NEXT: movl (%rsp,%rdi,4), %eax
; SKX-NEXT: movq %rbp, %rsp
; SKX-NEXT: popq %rbp
+; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
%t2 = extractelement <16 x i32> %t1, i32 %index
ret i32 %t2
; SKX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SKX-NEXT: movq %rbp, %rsp
; SKX-NEXT: popq %rbp
+; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
%t2 = extractelement <8 x float> %t1, i32 %index
ret float %t2
; SKX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SKX-NEXT: movq %rbp, %rsp
; SKX-NEXT: popq %rbp
+; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
%t2 = extractelement <16 x float> %t1, i32 %index
ret float %t2
; SKX-NEXT: movzwl (%rsp,%rdi,2), %eax
; SKX-NEXT: movq %rbp, %rsp
; SKX-NEXT: popq %rbp
+; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
%t2 = extractelement <16 x i16> %t1, i32 %index
ret i16 %t2
; SKX-NEXT: movzwl (%rsp,%rdi,2), %eax
; SKX-NEXT: movq %rbp, %rsp
; SKX-NEXT: popq %rbp
+; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
%t2 = extractelement <32 x i16> %t1, i32 %index
ret i16 %t2
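; The variable-index extract tests above evidently spill the whole vector
; to the stack and return a scalar through a GPR or XMM register, yet still
; check for vzeroupper: the spill stores from a ymm/zmm register, so the
; upper state is dirty by the time of the return.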
; SKX-NEXT: movb (%rdi,%rax), %al
; SKX-NEXT: movq %rbp, %rsp
; SKX-NEXT: popq %rbp
+; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
%t2 = extractelement <32 x i8> %t1, i32 %index
; SKX-NEXT: movb (%rdi,%rax), %al
; SKX-NEXT: movq %rbp, %rsp
; SKX-NEXT: popq %rbp
+; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
%t2 = extractelement <64 x i8> %t1, i32 %index
; SKX-NEXT: movb (%rax,%rcx), %al
; SKX-NEXT: movq %rbp, %rsp
; SKX-NEXT: popq %rbp
+; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
%i = add i8 %index, %index
; SKX-NEXT: andl $1, %eax
; SKX-NEXT: movq %rbp, %rsp
; SKX-NEXT: popq %rbp
+; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
%t1 = icmp ugt <8 x i32> %a, %b
%t2 = extractelement <8 x i1> %t1, i32 %index
; SKX-NEXT: andl $1, %eax
; SKX-NEXT: movq %rbp, %rsp
; SKX-NEXT: popq %rbp
+; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
%t1 = icmp ugt <16 x i32> %a, %b
%t2 = extractelement <16 x i1> %t1, i32 %index
; SKX-NEXT: andl $1, %eax
; SKX-NEXT: movq %rbp, %rsp
; SKX-NEXT: popq %rbp
+; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
%t1 = icmp ugt <32 x i8> %a, %b
%t2 = extractelement <32 x i1> %t1, i32 %index
; SKX-NEXT: movzbl %al, %eax
; SKX-NEXT: movq %rbp, %rsp
; SKX-NEXT: popq %rbp
+; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
%t1 = icmp ugt <64 x i8> %a, %b
%t2 = extractelement <64 x i1> %t1, i32 %index
}
define i32 @zext_test1(<16 x i32> %a, <16 x i32> %b) {
-; CHECK-LABEL: zext_test1:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vpcmpnleud %zmm1, %zmm0, %k0
-; CHECK-NEXT: kshiftlw $10, %k0, %k0
-; CHECK-NEXT: kshiftrw $15, %k0, %k0
-; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: andl $1, %eax
-; CHECK-NEXT: retq
+; KNL-LABEL: zext_test1:
+; KNL: ## BB#0:
+; KNL-NEXT: vpcmpnleud %zmm1, %zmm0, %k0
+; KNL-NEXT: kshiftlw $10, %k0, %k0
+; KNL-NEXT: kshiftrw $15, %k0, %k0
+; KNL-NEXT: kmovw %k0, %eax
+; KNL-NEXT: andl $1, %eax
+; KNL-NEXT: retq
+;
+; SKX-LABEL: zext_test1:
+; SKX: ## BB#0:
+; SKX-NEXT: vpcmpnleud %zmm1, %zmm0, %k0
+; SKX-NEXT: kshiftlw $10, %k0, %k0
+; SKX-NEXT: kshiftrw $15, %k0, %k0
+; SKX-NEXT: kmovw %k0, %eax
+; SKX-NEXT: andl $1, %eax
+; SKX-NEXT: vzeroupper
+; SKX-NEXT: retq
%cmp_res = icmp ugt <16 x i32> %a, %b
%cmp_res.i1 = extractelement <16 x i1> %cmp_res, i32 5
%res = zext i1 %cmp_res.i1 to i32
}
define i16 @zext_test2(<16 x i32> %a, <16 x i32> %b) {
-; CHECK-LABEL: zext_test2:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vpcmpnleud %zmm1, %zmm0, %k0
-; CHECK-NEXT: kshiftlw $10, %k0, %k0
-; CHECK-NEXT: kshiftrw $15, %k0, %k0
-; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: andl $1, %eax
-; CHECK-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
-; CHECK-NEXT: retq
+; KNL-LABEL: zext_test2:
+; KNL: ## BB#0:
+; KNL-NEXT: vpcmpnleud %zmm1, %zmm0, %k0
+; KNL-NEXT: kshiftlw $10, %k0, %k0
+; KNL-NEXT: kshiftrw $15, %k0, %k0
+; KNL-NEXT: kmovw %k0, %eax
+; KNL-NEXT: andl $1, %eax
+; KNL-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; KNL-NEXT: retq
+;
+; SKX-LABEL: zext_test2:
+; SKX: ## BB#0:
+; SKX-NEXT: vpcmpnleud %zmm1, %zmm0, %k0
+; SKX-NEXT: kshiftlw $10, %k0, %k0
+; SKX-NEXT: kshiftrw $15, %k0, %k0
+; SKX-NEXT: kmovw %k0, %eax
+; SKX-NEXT: andl $1, %eax
+; SKX-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; SKX-NEXT: vzeroupper
+; SKX-NEXT: retq
%cmp_res = icmp ugt <16 x i32> %a, %b
%cmp_res.i1 = extractelement <16 x i1> %cmp_res, i32 5
%res = zext i1 %cmp_res.i1 to i16
}
define i8 @zext_test3(<16 x i32> %a, <16 x i32> %b) {
-; CHECK-LABEL: zext_test3:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vpcmpnleud %zmm1, %zmm0, %k0
-; CHECK-NEXT: kshiftlw $10, %k0, %k0
-; CHECK-NEXT: kshiftrw $15, %k0, %k0
-; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: andl $1, %eax
-; CHECK-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
-; CHECK-NEXT: retq
+; KNL-LABEL: zext_test3:
+; KNL: ## BB#0:
+; KNL-NEXT: vpcmpnleud %zmm1, %zmm0, %k0
+; KNL-NEXT: kshiftlw $10, %k0, %k0
+; KNL-NEXT: kshiftrw $15, %k0, %k0
+; KNL-NEXT: kmovw %k0, %eax
+; KNL-NEXT: andl $1, %eax
+; KNL-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; KNL-NEXT: retq
+;
+; SKX-LABEL: zext_test3:
+; SKX: ## BB#0:
+; SKX-NEXT: vpcmpnleud %zmm1, %zmm0, %k0
+; SKX-NEXT: kshiftlw $10, %k0, %k0
+; SKX-NEXT: kshiftrw $15, %k0, %k0
+; SKX-NEXT: kmovw %k0, %eax
+; SKX-NEXT: andl $1, %eax
+; SKX-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; SKX-NEXT: vzeroupper
+; SKX-NEXT: retq
%cmp_res = icmp ugt <16 x i32> %a, %b
%cmp_res.i1 = extractelement <16 x i1> %cmp_res, i32 5
%res = zext i1 %cmp_res.i1 to i8
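; zext_test1/2/3 show why the shared CHECK prefix had to go: KNL and SKX
; now produce identical mask-manipulation code and differ only in the final
; vzeroupper, so each needs its own block.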
; SKX-NEXT: vpcmpgtq %ymm3, %ymm2, %k1
; SKX-NEXT: kandnw %k0, %k1, %k0
; SKX-NEXT: vpmovm2d %k0, %xmm0
+; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
%x_gt_y = icmp sgt <4 x i64> %x, %y
%x1_gt_y1 = icmp sgt <4 x i64> %x1, %y1
; SKX-NEXT: ## BB#2:
; SKX-NEXT: vpcmpltud %zmm2, %zmm1, %k0
; SKX-NEXT: vpmovm2b %k0, %xmm0
+; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
; SKX-NEXT: LBB17_1:
; SKX-NEXT: vpcmpgtd %zmm2, %zmm0, %k0
; SKX-NEXT: vpmovm2b %k0, %xmm0
+; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
%cond = icmp sgt i32 %a1, %b1
%cmp1 = icmp sgt <16 x i32> %a, zeroinitializer
; SKX-NEXT: kshiftlb $7, %k2, %k1
; SKX-NEXT: korb %k1, %k0, %k0
; SKX-NEXT: vpmovm2w %k0, %xmm0
+; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
%b = bitcast i8 %a to <8 x i1>
%b1 = bitcast i16 %y to <16 x i1>
; SKX-NEXT: je LBB41_2
; SKX-NEXT: ## BB#1: ## %L1
; SKX-NEXT: vmovapd %zmm0, (%rdi)
+; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
; SKX-NEXT: LBB41_2: ## %L2
; SKX-NEXT: vmovapd %zmm0, 8(%rdi)
+; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
%addr1 = getelementptr double, double * %base, i64 0
%addr2 = getelementptr double, double * %base, i64 1
; SKX-NEXT: ## BB#1: ## %L1
; SKX-NEXT: vmovaps %zmm0, (%rdi)
; SKX-NEXT: vmovaps %zmm1, 64(%rdi)
+; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
; SKX-NEXT: LBB42_2: ## %L2
; SKX-NEXT: vmovaps %zmm0, 4(%rdi)
; SKX-NEXT: vmovaps %zmm1, 68(%rdi)
+; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
%addr1 = getelementptr float, float * %base, i64 0
%addr2 = getelementptr float, float * %base, i64 1
; SKX-NEXT: vpsllw $7, %ymm0, %ymm0
; SKX-NEXT: vpmovb2m %ymm0, %k0
; SKX-NEXT: kmovd %k0, (%rdi)
+; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
store <32 x i1> %v, <32 x i1>* %a
ret void
; SKX-NEXT: vpsllw $15, %zmm0, %zmm0
; SKX-NEXT: vpmovw2m %zmm0, %k0
; SKX-NEXT: kmovd %k0, (%rdi)
+; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
%v1 = trunc <32 x i16> %v to <32 x i1>
store <32 x i1> %v1, <32 x i1>* %a
; SKX-NEXT: vpsllw $7, %zmm0, %zmm0
; SKX-NEXT: vpmovb2m %zmm0, %k0
; SKX-NEXT: kmovq %k0, (%rdi)
+; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
store <64 x i1> %v, <64 x i1>* %a
ret void
; SKX-NEXT: vpcmpeqd %zmm1, %zmm0, %k0
; SKX-NEXT: kmovb %k0, %eax
; SKX-NEXT: addl %eax, %eax
+; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
%v1 = icmp eq <16 x i32> %a, zeroinitializer
%mask = shufflevector <16 x i1> %v1, <16 x i1> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
}
define i32 @test_bitcast_v16i1_zext(<16 x i32> %a) {
-; CHECK-LABEL: test_bitcast_v16i1_zext:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vpxord %zmm1, %zmm1, %zmm1
-; CHECK-NEXT: vpcmpeqd %zmm1, %zmm0, %k0
-; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: addl %eax, %eax
-; CHECK-NEXT: retq
+; KNL-LABEL: test_bitcast_v16i1_zext:
+; KNL: ## BB#0:
+; KNL-NEXT: vpxord %zmm1, %zmm1, %zmm1
+; KNL-NEXT: vpcmpeqd %zmm1, %zmm0, %k0
+; KNL-NEXT: kmovw %k0, %eax
+; KNL-NEXT: addl %eax, %eax
+; KNL-NEXT: retq
+;
+; SKX-LABEL: test_bitcast_v16i1_zext:
+; SKX: ## BB#0:
+; SKX-NEXT: vpxord %zmm1, %zmm1, %zmm1
+; SKX-NEXT: vpcmpeqd %zmm1, %zmm0, %k0
+; SKX-NEXT: kmovw %k0, %eax
+; SKX-NEXT: addl %eax, %eax
+; SKX-NEXT: vzeroupper
+; SKX-NEXT: retq
%v1 = icmp eq <16 x i32> %a, zeroinitializer
%mask1 = bitcast <16 x i1> %v1 to i16
%val = zext i16 %mask1 to i32
; CHECK-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ## 2-byte Spill
; CHECK-NEXT: vpcmpgtd %ymm1, %ymm0, %k0
; CHECK-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ## 2-byte Spill
+; CHECK-NEXT: vzeroupper
; CHECK-NEXT: callq _f
; CHECK-NEXT: kmovw {{[0-9]+}}(%rsp), %k0 ## 2-byte Reload
; CHECK-NEXT: kmovw {{[0-9]+}}(%rsp), %k1 ## 2-byte Reload
; CHECK-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ## 2-byte Spill
; CHECK-NEXT: vpcmpgtd %zmm1, %zmm0, %k0
; CHECK-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ## 2-byte Spill
+; CHECK-NEXT: vzeroupper
; CHECK-NEXT: callq _f
; CHECK-NEXT: kmovw {{[0-9]+}}(%rsp), %k0 ## 2-byte Reload
; CHECK-NEXT: kmovw {{[0-9]+}}(%rsp), %k1 ## 2-byte Reload
; CHECK-NEXT: kmovd %k0, {{[0-9]+}}(%rsp) ## 4-byte Spill
; CHECK-NEXT: vpcmpgtw %zmm1, %zmm0, %k0
; CHECK-NEXT: kmovd %k0, (%rsp) ## 4-byte Spill
+; CHECK-NEXT: vzeroupper
; CHECK-NEXT: callq _f
; CHECK-NEXT: kmovd {{[0-9]+}}(%rsp), %k0 ## 4-byte Reload
; CHECK-NEXT: kmovd (%rsp), %k1 ## 4-byte Reload
; CHECK-NEXT: kmovq %k0, {{[0-9]+}}(%rsp) ## 8-byte Spill
; CHECK-NEXT: vpcmpgtb %zmm1, %zmm0, %k0
; CHECK-NEXT: kmovq %k0, {{[0-9]+}}(%rsp) ## 8-byte Spill
+; CHECK-NEXT: vzeroupper
; CHECK-NEXT: callq _f
; CHECK-NEXT: kmovq {{[0-9]+}}(%rsp), %k0 ## 8-byte Reload
; CHECK-NEXT: kmovq {{[0-9]+}}(%rsp), %k1 ## 8-byte Reload
; AVX512-NEXT: vpxord %zmm2, %zmm2, %zmm2
; AVX512-NEXT: vpcmpeqd %zmm2, %zmm0, %k1
; AVX512-NEXT: vmovdqu32 %zmm1, (%rdi) {%k1}
+; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
%mask = icmp eq <16 x i32> %trigger, zeroinitializer
call void @llvm.masked.store.v16i32.p0v16i32(<16 x i32>%val, <16 x i32>* %addr, i32 4, <16 x i1>%mask)
; AVX512-NEXT: vpxord %zmm2, %zmm2, %zmm2
; AVX512-NEXT: vpcmpeqd %zmm2, %zmm0, %k1
; AVX512-NEXT: vmovups %zmm1, (%rdi) {%k1}
+; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
%mask = icmp eq <16 x i32> %trigger, zeroinitializer
call void @llvm.masked.store.v16f32.p0v16f32(<16 x float>%val, <16 x float>* %addr, i32 4, <16 x i1>%mask)
; AVX512: ## BB#0:
; AVX512-NEXT: vextractf32x4 $3, %zmm0, %xmm0
; AVX512-NEXT: vmovlps %xmm0, 48(%rdi)
+; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> %val, <8 x double>* %addr, i32 4, <8 x i1><i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 true, i1 false>)
ret void
; AVX512F-NEXT: vmovdqu64 %zmm1, (%rdi) {%k1}
; AVX512F-NEXT: kshiftrw $8, %k1, %k1
; AVX512F-NEXT: vmovdqu64 %zmm2, 64(%rdi) {%k1}
+; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; SKX-LABEL: test_store_16i64:
; SKX-NEXT: vmovdqu64 %zmm1, (%rdi) {%k1}
; SKX-NEXT: kshiftrw $8, %k1, %k1
; SKX-NEXT: vmovdqu64 %zmm2, 64(%rdi) {%k1}
+; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
call void @llvm.masked.store.v16i64.p0v16i64(<16 x i64> %src0, <16 x i64>* %ptrs, i32 4, <16 x i1> %mask)
ret void
; AVX512F-NEXT: vmovupd %zmm1, (%rdi) {%k1}
; AVX512F-NEXT: kshiftrw $8, %k1, %k1
; AVX512F-NEXT: vmovupd %zmm2, 64(%rdi) {%k1}
+; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; SKX-LABEL: test_store_16f64:
; SKX-NEXT: vmovupd %zmm1, (%rdi) {%k1}
; SKX-NEXT: kshiftrw $8, %k1, %k1
; SKX-NEXT: vmovupd %zmm2, 64(%rdi) {%k1}
+; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
call void @llvm.masked.store.v16f64.p0v16f64(<16 x double> %src0, <16 x double>* %ptrs, i32 4, <16 x i1> %mask)
ret void
; CHECK-NEXT: vpsllw $7, %ymm0, %ymm0
; CHECK-NEXT: vpmovb2m %ymm0, %k1
; CHECK-NEXT: vmovdqu8 %ymm1, (%rdi) {%k1}
+; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
call void @llvm.masked.store.v32i8.p0v32i8(<32 x i8> %val, <32 x i8>* %addr, i32 4, <32 x i1>%mask)
ret void
; CHECK-NEXT: vpsllw $7, %zmm0, %zmm0
; CHECK-NEXT: vpmovb2m %zmm0, %k1
; CHECK-NEXT: vmovdqu8 %zmm1, (%rdi) {%k1}
+; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
call void @llvm.masked.store.v64i8.p0v64i8(<64 x i8> %val, <64 x i8>* %addr, i32 4, <64 x i1>%mask)
ret void
; CHECK-NEXT: vpsllw $7, %xmm0, %xmm0
; CHECK-NEXT: vpmovb2m %xmm0, %k1
; CHECK-NEXT: vmovdqu16 %ymm1, (%rdi) {%k1}
+; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
call void @llvm.masked.store.v16i16.p0v16i16(<16 x i16> %val, <16 x i16>* %addr, i32 4, <16 x i1>%mask)
ret void
; CHECK-NEXT: vpsllw $7, %ymm0, %ymm0
; CHECK-NEXT: vpmovb2m %ymm0, %k1
; CHECK-NEXT: vmovdqu16 %zmm1, (%rdi) {%k1}
+; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
call void @llvm.masked.store.v32i16.p0v32i16(<32 x i16> %val, <32 x i16>* %addr, i32 4, <32 x i1>%mask)
ret void
; CHECK-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
; CHECK-NEXT: vpmovq2m %zmm0, %k0
; CHECK-NEXT: vpmovm2w %k0, %xmm0
+; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
%res = shufflevector <2 x i1> %a, <2 x i1> zeroinitializer, <8 x i32> <i32 3, i32 3, i32 undef, i32 undef, i32 0, i32 1, i32 undef, i32 undef>
ret <8 x i1> %res
; ALL-LABEL: trunc_16x32_to_16x8:
; ALL: ## BB#0:
; ALL-NEXT: vpmovdb %zmm0, %xmm0
+; ALL-NEXT: vzeroupper
; ALL-NEXT: retq
%x = trunc <16 x i32> %i to <16 x i8>
ret <16 x i8> %x
; ALL-LABEL: trunc_8x64_to_8x16:
; ALL: ## BB#0:
; ALL-NEXT: vpmovqw %zmm0, %xmm0
+; ALL-NEXT: vzeroupper
; ALL-NEXT: retq
%x = trunc <8 x i64> %i to <8 x i16>
ret <8 x i16> %x
; ALL-LABEL: trunc_qb_512:
; ALL: ## BB#0:
; ALL-NEXT: vpmovqw %zmm0, %xmm0
+; ALL-NEXT: vzeroupper
; ALL-NEXT: retq
%x = trunc <8 x i64> %i to <8 x i8>
ret <8 x i8> %x
; ALL-LABEL: trunc_qb_512_mem:
; ALL: ## BB#0:
; ALL-NEXT: vpmovqb %zmm0, (%rdi)
+; ALL-NEXT: vzeroupper
; ALL-NEXT: retq
%x = trunc <8 x i64> %i to <8 x i8>
store <8 x i8> %x, <8 x i8>* %res
; KNL-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
; KNL-NEXT: vpmovqd %zmm0, %ymm0
; KNL-NEXT: ## kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; KNL-NEXT: vzeroupper
; KNL-NEXT: retq
;
; SKX-LABEL: trunc_qb_256:
; SKX: ## BB#0:
; SKX-NEXT: vpmovqd %ymm0, %xmm0
+; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
%x = trunc <4 x i64> %i to <4 x i8>
ret <4 x i8> %x
; KNL-NEXT: vpmovqd %zmm0, %ymm0
; KNL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u]
; KNL-NEXT: vmovd %xmm0, (%rdi)
+; KNL-NEXT: vzeroupper
; KNL-NEXT: retq
;
; SKX-LABEL: trunc_qb_256_mem:
; SKX: ## BB#0:
; SKX-NEXT: vpmovqb %ymm0, (%rdi)
+; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
%x = trunc <4 x i64> %i to <4 x i8>
store <4 x i8> %x, <4 x i8>* %res
; ALL-LABEL: trunc_qw_512:
; ALL: ## BB#0:
; ALL-NEXT: vpmovqw %zmm0, %xmm0
+; ALL-NEXT: vzeroupper
; ALL-NEXT: retq
%x = trunc <8 x i64> %i to <8 x i16>
ret <8 x i16> %x
; ALL-LABEL: trunc_qw_512_mem:
; ALL: ## BB#0:
; ALL-NEXT: vpmovqw %zmm0, (%rdi)
+; ALL-NEXT: vzeroupper
; ALL-NEXT: retq
%x = trunc <8 x i64> %i to <8 x i16>
store <8 x i16> %x, <8 x i16>* %res
; KNL-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
; KNL-NEXT: vpmovqd %zmm0, %ymm0
; KNL-NEXT: ## kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; KNL-NEXT: vzeroupper
; KNL-NEXT: retq
;
; SKX-LABEL: trunc_qw_256:
; SKX: ## BB#0:
; SKX-NEXT: vpmovqd %ymm0, %xmm0
+; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
%x = trunc <4 x i64> %i to <4 x i16>
ret <4 x i16> %x
; KNL-NEXT: vpmovqd %zmm0, %ymm0
; KNL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
; KNL-NEXT: vmovq %xmm0, (%rdi)
+; KNL-NEXT: vzeroupper
; KNL-NEXT: retq
;
; SKX-LABEL: trunc_qw_256_mem:
; SKX: ## BB#0:
; SKX-NEXT: vpmovqw %ymm0, (%rdi)
+; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
%x = trunc <4 x i64> %i to <4 x i16>
store <4 x i16> %x, <4 x i16>* %res
; ALL-LABEL: trunc_qd_512_mem:
; ALL: ## BB#0:
; ALL-NEXT: vpmovqd %zmm0, (%rdi)
+; ALL-NEXT: vzeroupper
; ALL-NEXT: retq
%x = trunc <8 x i64> %i to <8 x i32>
store <8 x i32> %x, <8 x i32>* %res
; KNL-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
; KNL-NEXT: vpmovqd %zmm0, %ymm0
; KNL-NEXT: ## kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; KNL-NEXT: vzeroupper
; KNL-NEXT: retq
;
; SKX-LABEL: trunc_qd_256:
; SKX: ## BB#0:
; SKX-NEXT: vpmovqd %ymm0, %xmm0
+; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
%x = trunc <4 x i64> %i to <4 x i32>
ret <4 x i32> %x
; KNL-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
; KNL-NEXT: vpmovqd %zmm0, %ymm0
; KNL-NEXT: vmovdqa %xmm0, (%rdi)
+; KNL-NEXT: vzeroupper
; KNL-NEXT: retq
;
; SKX-LABEL: trunc_qd_256_mem:
; SKX: ## BB#0:
; SKX-NEXT: vpmovqd %ymm0, (%rdi)
+; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
%x = trunc <4 x i64> %i to <4 x i32>
store <4 x i32> %x, <4 x i32>* %res
; ALL-LABEL: trunc_db_512:
; ALL: ## BB#0:
; ALL-NEXT: vpmovdb %zmm0, %xmm0
+; ALL-NEXT: vzeroupper
; ALL-NEXT: retq
%x = trunc <16 x i32> %i to <16 x i8>
ret <16 x i8> %x
; ALL-LABEL: trunc_db_512_mem:
; ALL: ## BB#0:
; ALL-NEXT: vpmovdb %zmm0, (%rdi)
+; ALL-NEXT: vzeroupper
; ALL-NEXT: retq
%x = trunc <16 x i32> %i to <16 x i8>
store <16 x i8> %x, <16 x i8>* %res
; KNL-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
; KNL-NEXT: vpmovdw %zmm0, %ymm0
; KNL-NEXT: ## kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; KNL-NEXT: vzeroupper
; KNL-NEXT: retq
;
; SKX-LABEL: trunc_db_256:
; SKX: ## BB#0:
; SKX-NEXT: vpmovdw %ymm0, %xmm0
+; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
%x = trunc <8 x i32> %i to <8 x i8>
ret <8 x i8> %x
; KNL-NEXT: vpmovdw %zmm0, %ymm0
; KNL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
; KNL-NEXT: vmovq %xmm0, (%rdi)
+; KNL-NEXT: vzeroupper
; KNL-NEXT: retq
;
; SKX-LABEL: trunc_db_256_mem:
; SKX: ## BB#0:
; SKX-NEXT: vpmovdb %ymm0, (%rdi)
+; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
%x = trunc <8 x i32> %i to <8 x i8>
store <8 x i8> %x, <8 x i8>* %res
; ALL-LABEL: trunc_dw_512_mem:
; ALL: ## BB#0:
; ALL-NEXT: vpmovdw %zmm0, (%rdi)
+; ALL-NEXT: vzeroupper
; ALL-NEXT: retq
%x = trunc <16 x i32> %i to <16 x i16>
store <16 x i16> %x, <16 x i16>* %res
; KNL-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
; KNL-NEXT: vpmovdw %zmm0, %ymm0
; KNL-NEXT: ## kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; KNL-NEXT: vzeroupper
; KNL-NEXT: retq
;
; SKX-LABEL: trunc_dw_256:
; SKX: ## BB#0:
; SKX-NEXT: vpmovdw %ymm0, %xmm0
+; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
%x = trunc <8 x i32> %i to <8 x i16>
ret <8 x i16> %x
; KNL-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
; KNL-NEXT: vpmovdw %zmm0, %ymm0
; KNL-NEXT: vmovdqa %xmm0, (%rdi)
+; KNL-NEXT: vzeroupper
; KNL-NEXT: retq
;
; SKX-LABEL: trunc_dw_256_mem:
; SKX: ## BB#0:
; SKX-NEXT: vpmovdw %ymm0, (%rdi)
+; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
%x = trunc <8 x i32> %i to <8 x i16>
store <8 x i16> %x, <8 x i16>* %res
; KNL-NEXT: vpmovdb %zmm1, %xmm1
; KNL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; KNL-NEXT: vmovdqa %ymm0, (%rdi)
+; KNL-NEXT: vzeroupper
; KNL-NEXT: retq
;
; SKX-LABEL: trunc_wb_512_mem:
; SKX: ## BB#0:
; SKX-NEXT: vpmovwb %zmm0, (%rdi)
+; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
%x = trunc <32 x i16> %i to <32 x i8>
store <32 x i8> %x, <32 x i8>* %res
; KNL: ## BB#0:
; KNL-NEXT: vpmovsxwd %ymm0, %zmm0
; KNL-NEXT: vpmovdb %zmm0, %xmm0
+; KNL-NEXT: vzeroupper
; KNL-NEXT: retq
;
; SKX-LABEL: trunc_wb_256:
; SKX: ## BB#0:
; SKX-NEXT: vpmovwb %ymm0, %xmm0
+; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
%x = trunc <16 x i16> %i to <16 x i8>
ret <16 x i8> %x
; KNL-NEXT: vpmovsxwd %ymm0, %zmm0
; KNL-NEXT: vpmovdb %zmm0, %xmm0
; KNL-NEXT: vmovdqa %xmm0, (%rdi)
+; KNL-NEXT: vzeroupper
; KNL-NEXT: retq
;
; SKX-LABEL: trunc_wb_256_mem:
; SKX: ## BB#0:
; SKX-NEXT: vpmovwb %ymm0, (%rdi)
+; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
%x = trunc <16 x i16> %i to <16 x i8>
store <16 x i8> %x, <16 x i8>* %res
; KNL-NEXT: vpmovsxwd %ymm0, %zmm0
; KNL-NEXT: vpmovdb %zmm0, %xmm0
; KNL-NEXT: vmovdqu %xmm0, (%rdi)
+; KNL-NEXT: vzeroupper
; KNL-NEXT: retq
;
; SKX-LABEL: usat_trunc_wb_256_mem:
; SKX: ## BB#0:
; SKX-NEXT: vpmovuswb %ymm0, (%rdi)
+; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
%x3 = icmp ult <16 x i16> %i, <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255>
%x5 = select <16 x i1> %x3, <16 x i16> %i, <16 x i16> <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255>
; KNL-NEXT: vpminuw {{.*}}(%rip), %ymm0, %ymm0
; KNL-NEXT: vpmovsxwd %ymm0, %zmm0
; KNL-NEXT: vpmovdb %zmm0, %xmm0
+; KNL-NEXT: vzeroupper
; KNL-NEXT: retq
;
; SKX-LABEL: usat_trunc_wb_256:
; SKX: ## BB#0:
; SKX-NEXT: vpmovuswb %ymm0, %xmm0
+; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
%x3 = icmp ult <16 x i16> %i, <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255>
%x5 = select <16 x i1> %x3, <16 x i16> %i, <16 x i16> <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255>
; ALL-LABEL: usat_trunc_db_512_mem:
; ALL: ## BB#0:
; ALL-NEXT: vpmovusdb %zmm0, (%rdi)
+; ALL-NEXT: vzeroupper
; ALL-NEXT: retq
%x3 = icmp ult <16 x i32> %i, <i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255>
%x5 = select <16 x i1> %x3, <16 x i32> %i, <16 x i32> <i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255>
; ALL-LABEL: usat_trunc_qb_512_mem:
; ALL: ## BB#0:
; ALL-NEXT: vpmovusqb %zmm0, (%rdi)
+; ALL-NEXT: vzeroupper
; ALL-NEXT: retq
%x3 = icmp ult <8 x i64> %i, <i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255>
%x5 = select <8 x i1> %x3, <8 x i64> %i, <8 x i64> <i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255>
; ALL-LABEL: usat_trunc_qd_512_mem:
; ALL: ## BB#0:
; ALL-NEXT: vpmovusqd %zmm0, (%rdi)
+; ALL-NEXT: vzeroupper
; ALL-NEXT: retq
%x3 = icmp ult <8 x i64> %i, <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295>
%x5 = select <8 x i1> %x3, <8 x i64> %i, <8 x i64> <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295>
; ALL-LABEL: usat_trunc_qw_512_mem:
; ALL: ## BB#0:
; ALL-NEXT: vpmovusqw %zmm0, (%rdi)
+; ALL-NEXT: vzeroupper
; ALL-NEXT: retq
%x3 = icmp ult <8 x i64> %i, <i64 65535, i64 65535, i64 65535, i64 65535, i64 65535, i64 65535, i64 65535, i64 65535>
%x5 = select <8 x i1> %x3, <8 x i64> %i, <8 x i64> <i64 65535, i64 65535, i64 65535, i64 65535, i64 65535, i64 65535, i64 65535, i64 65535>
; KNL-NEXT: vpmovusdb %zmm1, %xmm1
; KNL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; KNL-NEXT: vmovdqu %ymm0, (%rdi)
+; KNL-NEXT: vzeroupper
; KNL-NEXT: retq
;
; SKX-LABEL: usat_trunc_db_1024_mem:
; SKX-NEXT: vpmovdw %zmm1, %ymm1
; SKX-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; SKX-NEXT: vpmovwb %zmm0, (%rdi)
+; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
%x3 = icmp ult <32 x i32> %i, <i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255>
%x5 = select <32 x i1> %x3, <32 x i32> %i, <32 x i32> <i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255>
; KNL-NEXT: vpminud %ymm1, %ymm0, %ymm0
; KNL-NEXT: vpmovdw %zmm0, %ymm0
; KNL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
+; KNL-NEXT: vzeroupper
; KNL-NEXT: retq
;
; SKX-LABEL: usat_trunc_db_256:
; SKX-NEXT: vpminud {{.*}}(%rip){1to8}, %ymm0, %ymm0
; SKX-NEXT: vpmovdw %ymm0, %xmm0
; SKX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
+; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
%tmp1 = icmp ult <8 x i32> %x, <i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255>
%tmp2 = select <8 x i1> %tmp1, <8 x i32> %x, <8 x i32> <i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255>
}
define i16 @test12(<16 x i64> %a, <16 x i64> %b) nounwind {
-; CHECK-LABEL: test12:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vpcmpeqq %zmm2, %zmm0, %k0
-; CHECK-NEXT: vpcmpeqq %zmm3, %zmm1, %k1
-; CHECK-NEXT: kunpckbw %k0, %k1, %k0
-; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: retq
+; KNL-LABEL: test12:
+; KNL: ## BB#0:
+; KNL-NEXT: vpcmpeqq %zmm2, %zmm0, %k0
+; KNL-NEXT: vpcmpeqq %zmm3, %zmm1, %k1
+; KNL-NEXT: kunpckbw %k0, %k1, %k0
+; KNL-NEXT: kmovw %k0, %eax
+; KNL-NEXT: retq
+;
+; SKX-LABEL: test12:
+; SKX: ## BB#0:
+; SKX-NEXT: vpcmpeqq %zmm2, %zmm0, %k0
+; SKX-NEXT: vpcmpeqq %zmm3, %zmm1, %k1
+; SKX-NEXT: kunpckbw %k0, %k1, %k0
+; SKX-NEXT: kmovw %k0, %eax
+; SKX-NEXT: vzeroupper
+; SKX-NEXT: retq
%res = icmp eq <16 x i64> %a, %b
%res1 = bitcast <16 x i1> %res to i16
ret i16 %res1
; SKX-NEXT: vpcmpeqd %zmm3, %zmm1, %k1
; SKX-NEXT: kunpckwd %k0, %k1, %k0
; SKX-NEXT: kmovd %k0, %eax
+; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
%res = icmp eq <32 x i32> %a, %b
%res1 = bitcast <32 x i1> %res to i32
; SKX-NEXT: vpcmpeqw %zmm3, %zmm1, %k1
; SKX-NEXT: kunpckdq %k0, %k1, %k0
; SKX-NEXT: kmovq %k0, %rax
+; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
%res = icmp eq <64 x i16> %a, %b
%res1 = bitcast <64 x i1> %res to i64
; SKX-NEXT: vpcmpgtd %zmm3, %zmm2, %k1
; SKX-NEXT: kxorw %k1, %k0, %k0
; SKX-NEXT: vpmovm2b %k0, %xmm0
+; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
%x_gt_y = icmp sgt <16 x i32> %x, %y
%x1_gt_y1 = icmp sgt <16 x i32> %x1, %y1
; CHECK: ## BB#0:
; CHECK-NEXT: vptestmq %zmm0, %zmm1, %k0
; CHECK-NEXT: kmovb %k0, %eax
+; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
%and.i = and <8 x i64> %b, %a
%test.i = tail call i8 @llvm.x86.avx512.ptestm.q.512(<8 x i64> %and.i, <8 x i64> %and.i, i8 -1)
; CHECK-NEXT: kmovb %edi, %k1
; CHECK-NEXT: vptestmq %zmm0, %zmm1, %k0 {%k1}
; CHECK-NEXT: kmovb %k0, %eax
+; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
%and.i = and <8 x i64> %b, %a
%test.i = tail call i8 @llvm.x86.avx512.ptestm.q.512(<8 x i64> %and.i, <8 x i64> %and.i, i8 %mask)
; CHECK-NEXT: kmovb %esi, %k1
; CHECK-NEXT: vptestmq (%rdi), %zmm0, %k0 {%k1}
; CHECK-NEXT: kmovb %k0, %eax
+; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
%b = load <8 x i64>, <8 x i64>* %bptr
%and.i = and <8 x i64> %a, %b
; CHECK-NEXT: kmovb %esi, %k1
; CHECK-NEXT: vptestmq (%rdi), %zmm0, %k0 {%k1}
; CHECK-NEXT: kmovb %k0, %eax
+; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
%b = load <8 x i64>, <8 x i64>* %bptr
%and.i = and <8 x i64> %b, %a
declare <2 x i64> @llvm.masked.expandload.v2i64(i64*, <2 x i1>, <2 x i64>)
define void @test6(float* %base, <16 x float> %V) {
-; ALL-LABEL: test6:
-; ALL: # BB#0:
-; ALL-NEXT: movw $-2049, %ax # imm = 0xF7FF
-; ALL-NEXT: kmovw %eax, %k1
-; ALL-NEXT: vcompressps %zmm0, (%rdi) {%k1}
-; ALL-NEXT: retq
+; SKX-LABEL: test6:
+; SKX: # BB#0:
+; SKX-NEXT: movw $-2049, %ax # imm = 0xF7FF
+; SKX-NEXT: kmovw %eax, %k1
+; SKX-NEXT: vcompressps %zmm0, (%rdi) {%k1}
+; SKX-NEXT: vzeroupper
+; SKX-NEXT: retq
+;
+; KNL-LABEL: test6:
+; KNL: # BB#0:
+; KNL-NEXT: movw $-2049, %ax # imm = 0xF7FF
+; KNL-NEXT: kmovw %eax, %k1
+; KNL-NEXT: vcompressps %zmm0, (%rdi) {%k1}
+; KNL-NEXT: retq
call void @llvm.masked.compressstore.v16f32(<16 x float> %V, float* %base, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 false, i1 true, i1 true, i1 true, i1 true>)
ret void
}
; SKX-NEXT: vpsllw $15, %xmm1, %xmm1
; SKX-NEXT: vpmovw2m %xmm1, %k1
; SKX-NEXT: vcompressps %ymm0, (%rdi) {%k1}
+; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
;
; KNL-LABEL: test7:
; SKX-NEXT: vpsllw $15, %xmm1, %xmm1
; SKX-NEXT: vpmovw2m %xmm1, %k1
; SKX-NEXT: vcompresspd %zmm0, (%rdi) {%k1}
+; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
;
; KNL-LABEL: test8:
; SKX-NEXT: vpsllw $15, %xmm1, %xmm1
; SKX-NEXT: vpmovw2m %xmm1, %k1
; SKX-NEXT: vpcompressq %zmm0, (%rdi) {%k1}
+; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
;
; KNL-LABEL: test9:
; SKX-NEXT: vpslld $31, %xmm1, %xmm1
; SKX-NEXT: vptestmd %xmm1, %xmm1, %k1
; SKX-NEXT: vpcompressq %ymm0, (%rdi) {%k1}
+; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
;
; KNL-LABEL: test10:
}
define void @test17(float* %base, <32 x float> %V, <32 x i32> %trigger) {
-; ALL-LABEL: test17:
-; ALL: # BB#0:
-; ALL-NEXT: vpxord %zmm4, %zmm4, %zmm4
-; ALL-NEXT: vpcmpeqd %zmm4, %zmm3, %k1
-; ALL-NEXT: vpcmpeqd %zmm4, %zmm2, %k2
-; ALL-NEXT: kmovw %k2, %eax
-; ALL-NEXT: popcntl %eax, %eax
-; ALL-NEXT: vcompressps %zmm1, (%rdi,%rax,4) {%k1}
-; ALL-NEXT: vcompressps %zmm0, (%rdi) {%k2}
-; ALL-NEXT: retq
+; SKX-LABEL: test17:
+; SKX: # BB#0:
+; SKX-NEXT: vpxord %zmm4, %zmm4, %zmm4
+; SKX-NEXT: vpcmpeqd %zmm4, %zmm3, %k1
+; SKX-NEXT: vpcmpeqd %zmm4, %zmm2, %k2
+; SKX-NEXT: kmovw %k2, %eax
+; SKX-NEXT: popcntl %eax, %eax
+; SKX-NEXT: vcompressps %zmm1, (%rdi,%rax,4) {%k1}
+; SKX-NEXT: vcompressps %zmm0, (%rdi) {%k2}
+; SKX-NEXT: vzeroupper
+; SKX-NEXT: retq
+;
+; KNL-LABEL: test17:
+; KNL: # BB#0:
+; KNL-NEXT: vpxord %zmm4, %zmm4, %zmm4
+; KNL-NEXT: vpcmpeqd %zmm4, %zmm3, %k1
+; KNL-NEXT: vpcmpeqd %zmm4, %zmm2, %k2
+; KNL-NEXT: kmovw %k2, %eax
+; KNL-NEXT: popcntl %eax, %eax
+; KNL-NEXT: vcompressps %zmm1, (%rdi,%rax,4) {%k1}
+; KNL-NEXT: vcompressps %zmm0, (%rdi) {%k2}
+; KNL-NEXT: retq
%mask = icmp eq <32 x i32> %trigger, zeroinitializer
call void @llvm.masked.compressstore.v32f32(<32 x float> %V, float* %base, <32 x i1> %mask)
ret void
; SKX-NEXT: popcntl %eax, %eax
; SKX-NEXT: vcompresspd %zmm1, (%rdi,%rax,8) {%k2}
; SKX-NEXT: vcompresspd %zmm0, (%rdi) {%k1}
+; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
;
; KNL-LABEL: test18:
; AVX512-LABEL: test_nt8xfloat:
; AVX512: # BB#0: # %entry
; AVX512-NEXT: vmovntps %ymm0, (%rdi)
+; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
entry:
store <8 x float> %X, <8 x float>* %ptr, align 32, !nontemporal !1
; AVX512-LABEL: test_nt4xdouble:
; AVX512: # BB#0: # %entry
; AVX512-NEXT: vmovntpd %ymm0, (%rdi)
+; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
entry:
store <4 x double> %X, <4 x double>* %ptr, align 32, !nontemporal !1
; AVX512-LABEL: test_nt32xi8:
; AVX512: # BB#0: # %entry
; AVX512-NEXT: vmovntdq %ymm0, (%rdi)
+; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
entry:
store <32 x i8> %X, <32 x i8>* %ptr, align 32, !nontemporal !1
; AVX512-LABEL: test_nt16xi16:
; AVX512: # BB#0: # %entry
; AVX512-NEXT: vmovntdq %ymm0, (%rdi)
+; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
entry:
store <16 x i16> %X, <16 x i16>* %ptr, align 32, !nontemporal !1
; AVX512-LABEL: test_nt8xi32:
; AVX512: # BB#0: # %entry
; AVX512-NEXT: vmovntdq %ymm0, (%rdi)
+; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
entry:
store <8 x i32> %X, <8 x i32>* %ptr, align 32, !nontemporal !1
; AVX512-LABEL: test_nt4xi64:
; AVX512: # BB#0: # %entry
; AVX512-NEXT: vmovntdq %ymm0, (%rdi)
+; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
entry:
store <4 x i64> %X, <4 x i64>* %ptr, align 32, !nontemporal !1
; AVX512-LABEL: test_nt16xfloat:
; AVX512: # BB#0: # %entry
; AVX512-NEXT: vmovntps %zmm0, (%rdi)
+; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
entry:
store <16 x float> %X, <16 x float>* %ptr, align 64, !nontemporal !1
; AVX512-LABEL: test_nt8xdouble:
; AVX512: # BB#0: # %entry
; AVX512-NEXT: vmovntpd %zmm0, (%rdi)
+; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
entry:
store <8 x double> %X, <8 x double>* %ptr, align 64, !nontemporal !1
; AVX512F: # BB#0: # %entry
; AVX512F-NEXT: vmovntdq %ymm0, (%rdi)
; AVX512F-NEXT: vmovntdq %ymm1, 32(%rdi)
+; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: test_nt64xi8:
; AVX512BW: # BB#0: # %entry
; AVX512BW-NEXT: vmovntdq %zmm0, (%rdi)
+; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
entry:
store <64 x i8> %X, <64 x i8>* %ptr, align 64, !nontemporal !1
; AVX512F: # BB#0: # %entry
; AVX512F-NEXT: vmovntdq %ymm0, (%rdi)
; AVX512F-NEXT: vmovntdq %ymm1, 32(%rdi)
+; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: test_nt32xi16:
; AVX512BW: # BB#0: # %entry
; AVX512BW-NEXT: vmovntdq %zmm0, (%rdi)
+; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
entry:
store <32 x i16> %X, <32 x i16>* %ptr, align 64, !nontemporal !1
; AVX512-LABEL: test_nt16xi32:
; AVX512: # BB#0: # %entry
; AVX512-NEXT: vmovntdq %zmm0, (%rdi)
+; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
entry:
store <16 x i32> %X, <16 x i32>* %ptr, align 64, !nontemporal !1
; AVX512-LABEL: test_nt8xi64:
; AVX512: # BB#0: # %entry
; AVX512-NEXT: vmovntdq %zmm0, (%rdi)
+; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
entry:
store <8 x i64> %X, <8 x i64>* %ptr, align 64, !nontemporal !1
; KNL_64-NEXT: kmovw %k1, %k2
; KNL_64-NEXT: vpscatterdd %zmm1, (%rdi,%zmm0,4) {%k2}
; KNL_64-NEXT: vpscatterdd %zmm1, (%rdi,%zmm0,4) {%k1}
+; KNL_64-NEXT: vzeroupper
; KNL_64-NEXT: retq
;
; KNL_32-LABEL: test5:
; KNL_32-NEXT: kmovw %k1, %k2
; KNL_32-NEXT: vpscatterdd %zmm1, (%eax,%zmm0,4) {%k2}
; KNL_32-NEXT: vpscatterdd %zmm1, (%eax,%zmm0,4) {%k1}
+; KNL_32-NEXT: vzeroupper
; KNL_32-NEXT: retl
;
; SKX-LABEL: test5:
; SKX-NEXT: kmovw %k1, %k2
; SKX-NEXT: vpscatterdd %zmm1, (%rdi,%zmm0,4) {%k2}
; SKX-NEXT: vpscatterdd %zmm1, (%rdi,%zmm0,4) {%k1}
+; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
;
; SKX_32-LABEL: test5:
; SKX_32-NEXT: kmovw %k1, %k2
; SKX_32-NEXT: vpscatterdd %zmm1, (%eax,%zmm0,4) {%k2}
; SKX_32-NEXT: vpscatterdd %zmm1, (%eax,%zmm0,4) {%k1}
+; SKX_32-NEXT: vzeroupper
; SKX_32-NEXT: retl
%broadcast.splatinsert = insertelement <16 x i32*> undef, i32* %base, i32 0
; KNL_64-NEXT: vptestmd %zmm0, %zmm0, %k1
; KNL_64-NEXT: vgatherqps (%rdi,%zmm2,4), %ymm0 {%k1}
; KNL_64-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; KNL_64-NEXT: vzeroupper
; KNL_64-NEXT: retq
;
; KNL_32-LABEL: test15:
; KNL_32-NEXT: vptestmd %zmm0, %zmm0, %k1
; KNL_32-NEXT: vgatherqps (%eax,%zmm2,4), %ymm0 {%k1}
; KNL_32-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; KNL_32-NEXT: vzeroupper
; KNL_32-NEXT: retl
;
; SKX-LABEL: test15:
; KNL_64-NEXT: vptestmq %zmm1, %zmm1, %k1
; KNL_64-NEXT: vgatherqpd (%rdi,%zmm0,8), %zmm2 {%k1}
; KNL_64-NEXT: vmovapd %xmm2, %xmm0
+; KNL_64-NEXT: vzeroupper
; KNL_64-NEXT: retq
;
; KNL_32-LABEL: test17:
; KNL_32-NEXT: vptestmq %zmm1, %zmm1, %k1
; KNL_32-NEXT: vgatherqpd (%eax,%zmm0,8), %zmm2 {%k1}
; KNL_32-NEXT: vmovapd %xmm2, %xmm0
+; KNL_32-NEXT: vzeroupper
; KNL_32-NEXT: retl
;
; SKX-LABEL: test17:
; KNL_64-NEXT: vpslld $31, %ymm2, %ymm2
; KNL_64-NEXT: vptestmd %zmm2, %zmm2, %k1
; KNL_64-NEXT: vpscatterqd %ymm0, (,%zmm1) {%k1}
+; KNL_64-NEXT: vzeroupper
; KNL_64-NEXT: retq
;
; KNL_32-LABEL: test18:
; KNL_32-NEXT: vpslld $31, %ymm2, %ymm2
; KNL_32-NEXT: vptestmd %zmm2, %zmm2, %k1
; KNL_32-NEXT: vpscatterqd %ymm0, (,%zmm1) {%k1}
+; KNL_32-NEXT: vzeroupper
; KNL_32-NEXT: retl
;
; SKX-LABEL: test18:
; SKX-NEXT: vpslld $31, %xmm2, %xmm2
; SKX-NEXT: vptestmd %xmm2, %xmm2, %k1
; SKX-NEXT: vpscatterqd %xmm0, (,%ymm1) {%k1}
+; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
;
; SKX_32-LABEL: test18:
; KNL_64-NEXT: vpsllq $63, %zmm1, %zmm1
; KNL_64-NEXT: vptestmq %zmm1, %zmm1, %k1
; KNL_64-NEXT: vscatterqpd %zmm0, (%rdi,%zmm2,8) {%k1}
+; KNL_64-NEXT: vzeroupper
; KNL_64-NEXT: retq
;
; KNL_32-LABEL: test19:
; KNL_32-NEXT: vpsllq $63, %zmm1, %zmm1
; KNL_32-NEXT: vptestmq %zmm1, %zmm1, %k1
; KNL_32-NEXT: vscatterqpd %zmm0, (%eax,%zmm2,8) {%k1}
+; KNL_32-NEXT: vzeroupper
; KNL_32-NEXT: retl
;
; SKX-LABEL: test19:
; SKX-NEXT: vpslld $31, %xmm1, %xmm1
; SKX-NEXT: vptestmd %xmm1, %xmm1, %k1
; SKX-NEXT: vscatterqpd %ymm0, (%rdi,%ymm2,8) {%k1}
+; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
;
; SKX_32-LABEL: test19:
; SKX_32-NEXT: vptestmd %xmm1, %xmm1, %k1
; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax
; SKX_32-NEXT: vscatterqpd %ymm0, (%eax,%ymm2,8) {%k1}
+; SKX_32-NEXT: vzeroupper
; SKX_32-NEXT: retl
%gep = getelementptr double, double* %ptr, <4 x i64> %ind
call void @llvm.masked.scatter.v4f64(<4 x double> %a1, <4 x double*> %gep, i32 8, <4 x i1> %mask)
; KNL_64-NEXT: vpslld $31, %ymm2, %ymm2
; KNL_64-NEXT: vptestmd %zmm2, %zmm2, %k1
; KNL_64-NEXT: vscatterqps %ymm0, (,%zmm1) {%k1}
+; KNL_64-NEXT: vzeroupper
; KNL_64-NEXT: retq
;
; KNL_32-LABEL: test20:
; KNL_32-NEXT: vpslld $31, %ymm2, %ymm2
; KNL_32-NEXT: vptestmd %zmm2, %zmm2, %k1
; KNL_32-NEXT: vscatterqps %ymm0, (,%zmm1) {%k1}
+; KNL_32-NEXT: vzeroupper
; KNL_32-NEXT: retl
;
; SKX-LABEL: test20:
; SKX-NEXT: kshiftlb $6, %k0, %k0
; SKX-NEXT: kshiftrb $6, %k0, %k1
; SKX-NEXT: vscatterqps %xmm0, (,%ymm1) {%k1}
+; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
;
; SKX_32-LABEL: test20:
; KNL_64-NEXT: vpsllq $63, %zmm2, %zmm2
; KNL_64-NEXT: vptestmq %zmm2, %zmm2, %k1
; KNL_64-NEXT: vpscatterqd %ymm0, (,%zmm1) {%k1}
+; KNL_64-NEXT: vzeroupper
; KNL_64-NEXT: retq
;
; KNL_32-LABEL: test21:
; KNL_32-NEXT: vpsllq $63, %zmm2, %zmm2
; KNL_32-NEXT: vptestmq %zmm2, %zmm2, %k1
; KNL_32-NEXT: vpscatterqd %ymm0, (,%zmm1) {%k1}
+; KNL_32-NEXT: vzeroupper
; KNL_32-NEXT: retl
;
; SKX-LABEL: test21:
; SKX-NEXT: kshiftrb $6, %k0, %k1
; SKX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SKX-NEXT: vpscatterqd %xmm0, (,%ymm1) {%k1}
+; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
;
; SKX_32-LABEL: test21:
; SKX_32-NEXT: kshiftrb $6, %k0, %k1
; SKX_32-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SKX_32-NEXT: vpscatterqd %xmm0, (,%ymm1) {%k1}
+; SKX_32-NEXT: vzeroupper
; SKX_32-NEXT: retl
call void @llvm.masked.scatter.v2i32(<2 x i32> %a1, <2 x i32*> %ptr, i32 4, <2 x i1> %mask)
ret void
; KNL_64-NEXT: vptestmd %zmm1, %zmm1, %k1
; KNL_64-NEXT: vgatherqps (%rdi,%zmm0,4), %ymm2 {%k1}
; KNL_64-NEXT: vmovaps %xmm2, %xmm0
+; KNL_64-NEXT: vzeroupper
; KNL_64-NEXT: retq
;
; KNL_32-LABEL: test22:
; KNL_32-NEXT: vptestmd %zmm1, %zmm1, %k1
; KNL_32-NEXT: vgatherqps (%eax,%zmm0,4), %ymm2 {%k1}
; KNL_32-NEXT: vmovaps %xmm2, %xmm0
+; KNL_32-NEXT: vzeroupper
; KNL_32-NEXT: retl
;
; SKX-LABEL: test22:
; KNL_64-NEXT: vptestmq %zmm1, %zmm1, %k1
; KNL_64-NEXT: vpgatherqq (%rdi,%zmm0,8), %zmm2 {%k1}
; KNL_64-NEXT: vmovdqa %xmm2, %xmm0
+; KNL_64-NEXT: vzeroupper
; KNL_64-NEXT: retq
;
; KNL_32-LABEL: test23:
; KNL_32-NEXT: vptestmq %zmm1, %zmm1, %k1
; KNL_32-NEXT: vpgatherqq (%eax,%zmm0,8), %zmm2 {%k1}
; KNL_32-NEXT: vmovdqa %xmm2, %xmm0
+; KNL_32-NEXT: vzeroupper
; KNL_32-NEXT: retl
;
; SKX-LABEL: test23:
; KNL_64-NEXT: kmovw %eax, %k1
; KNL_64-NEXT: vpgatherqq (%rdi,%zmm0,8), %zmm1 {%k1}
; KNL_64-NEXT: vmovdqa %xmm1, %xmm0
+; KNL_64-NEXT: vzeroupper
; KNL_64-NEXT: retq
;
; KNL_32-LABEL: test24:
; KNL_32-NEXT: vptestmq %zmm1, %zmm1, %k1
; KNL_32-NEXT: vpgatherqq (%eax,%zmm0,8), %zmm1 {%k1}
; KNL_32-NEXT: vmovdqa %xmm1, %xmm0
+; KNL_32-NEXT: vzeroupper
; KNL_32-NEXT: retl
;
; SKX-LABEL: test24:
; KNL_64-NEXT: vptestmq %zmm1, %zmm1, %k1
; KNL_64-NEXT: vpgatherqq (%rdi,%zmm0,8), %zmm2 {%k1}
; KNL_64-NEXT: vmovdqa %xmm2, %xmm0
+; KNL_64-NEXT: vzeroupper
; KNL_64-NEXT: retq
;
; KNL_32-LABEL: test25:
; KNL_32-NEXT: vptestmq %zmm1, %zmm1, %k1
; KNL_32-NEXT: vpgatherqq (%eax,%zmm0,8), %zmm2 {%k1}
; KNL_32-NEXT: vmovdqa %xmm2, %xmm0
+; KNL_32-NEXT: vzeroupper
; KNL_32-NEXT: retl
;
; SKX-LABEL: test25:
; KNL_64-NEXT: kmovw %eax, %k1
; KNL_64-NEXT: vpgatherqq (%rdi,%zmm0,8), %zmm1 {%k1}
; KNL_64-NEXT: vmovdqa %xmm1, %xmm0
+; KNL_64-NEXT: vzeroupper
; KNL_64-NEXT: retq
;
; KNL_32-LABEL: test26:
; KNL_32-NEXT: vptestmq %zmm2, %zmm2, %k1
; KNL_32-NEXT: vpgatherqq (%eax,%zmm0,8), %zmm1 {%k1}
; KNL_32-NEXT: vmovdqa %xmm1, %xmm0
+; KNL_32-NEXT: vzeroupper
; KNL_32-NEXT: retl
;
; SKX-LABEL: test26:
; KNL_64-NEXT: kmovw %eax, %k1
; KNL_64-NEXT: vgatherqps (%rdi,%zmm1,4), %ymm0 {%k1}
; KNL_64-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; KNL_64-NEXT: vzeroupper
; KNL_64-NEXT: retq
;
; KNL_32-LABEL: test27:
; KNL_32-NEXT: kmovw %ecx, %k1
; KNL_32-NEXT: vgatherqps (%eax,%zmm1,4), %ymm0 {%k1}
; KNL_32-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; KNL_32-NEXT: vzeroupper
; KNL_32-NEXT: retl
;
; SKX-LABEL: test27:
; KNL_64-NEXT: movb $3, %al
; KNL_64-NEXT: kmovw %eax, %k1
; KNL_64-NEXT: vpscatterqd %ymm0, (,%zmm1) {%k1}
+; KNL_64-NEXT: vzeroupper
; KNL_64-NEXT: retq
;
; KNL_32-LABEL: test28:
; KNL_32-NEXT: vpsllq $63, %zmm2, %zmm2
; KNL_32-NEXT: vptestmq %zmm2, %zmm2, %k1
; KNL_32-NEXT: vpscatterqd %ymm0, (,%zmm1) {%k1}
+; KNL_32-NEXT: vzeroupper
; KNL_32-NEXT: retl
;
; SKX-LABEL: test28:
; SKX-NEXT: movb $3, %al
; SKX-NEXT: kmovb %eax, %k1
; SKX-NEXT: vpscatterqd %xmm0, (,%ymm1) {%k1}
+; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
;
; SKX_32-LABEL: test28:
; SKX_32-NEXT: movb $3, %al
; SKX_32-NEXT: kmovb %eax, %k1
; SKX_32-NEXT: vpscatterqd %xmm0, (,%ymm1) {%k1}
+; SKX_32-NEXT: vzeroupper
; SKX_32-NEXT: retl
call void @llvm.masked.scatter.v2i32(<2 x i32> %a1, <2 x i32*> %ptr, i32 4, <2 x i1> <i1 true, i1 true>)
ret void
; KNL_64-NEXT: vpscatterqd %ymm3, (,%zmm0) {%k1}
; KNL_64-NEXT: vextracti64x4 $1, %zmm3, %ymm0
; KNL_64-NEXT: vpscatterqd %ymm0, (,%zmm1) {%k2}
+; KNL_64-NEXT: vzeroupper
; KNL_64-NEXT: retq
;
; KNL_32-LABEL: test_scatter_16i32:
; KNL_32-NEXT: vpslld $31, %zmm1, %zmm1
; KNL_32-NEXT: vptestmd %zmm1, %zmm1, %k1
; KNL_32-NEXT: vpscatterdd %zmm2, (,%zmm0) {%k1}
+; KNL_32-NEXT: vzeroupper
; KNL_32-NEXT: retl
;
; SKX-LABEL: test_scatter_16i32:
; SKX-NEXT: vpscatterqd %ymm3, (,%zmm0) {%k1}
; SKX-NEXT: vextracti32x8 $1, %zmm3, %ymm0
; SKX-NEXT: vpscatterqd %ymm0, (,%zmm1) {%k2}
+; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
;
; SKX_32-LABEL: test_scatter_16i32:
; SKX_32-NEXT: vpslld $31, %zmm1, %zmm1
; SKX_32-NEXT: vptestmd %zmm1, %zmm1, %k1
; SKX_32-NEXT: vpscatterdd %zmm2, (,%zmm0) {%k1}
+; SKX_32-NEXT: vzeroupper
; SKX_32-NEXT: retl
call void @llvm.masked.scatter.v16i32(<16 x i32> %src0, <16 x i32*> %ptrs, i32 4, <16 x i1> %mask)
ret void
; KNL_64-NEXT: kshiftrw $8, %k1, %k2
; KNL_64-NEXT: vpscatterqq %zmm3, (,%zmm0) {%k1}
; KNL_64-NEXT: vpscatterqq %zmm4, (,%zmm1) {%k2}
+; KNL_64-NEXT: vzeroupper
; KNL_64-NEXT: retq
;
; KNL_32-LABEL: test_scatter_16i64:
; KNL_32-NEXT: vpscatterdq %zmm1, (,%ymm0) {%k2}
; KNL_32-NEXT: movl %ebp, %esp
; KNL_32-NEXT: popl %ebp
+; KNL_32-NEXT: vzeroupper
; KNL_32-NEXT: retl
;
; SKX-LABEL: test_scatter_16i64:
; SKX-NEXT: kshiftrw $8, %k1, %k2
; SKX-NEXT: vpscatterqq %zmm3, (,%zmm0) {%k1}
; SKX-NEXT: vpscatterqq %zmm4, (,%zmm1) {%k2}
+; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
;
; SKX_32-LABEL: test_scatter_16i64:
; SKX_32-NEXT: vpscatterdq %zmm1, (,%ymm0) {%k2}
; SKX_32-NEXT: movl %ebp, %esp
; SKX_32-NEXT: popl %ebp
+; SKX_32-NEXT: vzeroupper
; SKX_32-NEXT: retl
call void @llvm.masked.scatter.v16i64(<16 x i64> %src0, <16 x i64*> %ptrs, i32 4, <16 x i1> %mask)
ret void
; KNL_64-NEXT: vscatterqps %ymm3, (,%zmm0) {%k1}
; KNL_64-NEXT: vextractf64x4 $1, %zmm3, %ymm0
; KNL_64-NEXT: vscatterqps %ymm0, (,%zmm1) {%k2}
+; KNL_64-NEXT: vzeroupper
; KNL_64-NEXT: retq
;
; KNL_32-LABEL: test_scatter_16f32:
; KNL_32-NEXT: vpslld $31, %zmm1, %zmm1
; KNL_32-NEXT: vptestmd %zmm1, %zmm1, %k1
; KNL_32-NEXT: vscatterdps %zmm2, (,%zmm0) {%k1}
+; KNL_32-NEXT: vzeroupper
; KNL_32-NEXT: retl
;
; SKX-LABEL: test_scatter_16f32:
; SKX-NEXT: vscatterqps %ymm3, (,%zmm0) {%k1}
; SKX-NEXT: vextractf32x8 $1, %zmm3, %ymm0
; SKX-NEXT: vscatterqps %ymm0, (,%zmm1) {%k2}
+; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
;
; SKX_32-LABEL: test_scatter_16f32:
; SKX_32-NEXT: vpslld $31, %zmm1, %zmm1
; SKX_32-NEXT: vptestmd %zmm1, %zmm1, %k1
; SKX_32-NEXT: vscatterdps %zmm2, (,%zmm0) {%k1}
+; SKX_32-NEXT: vzeroupper
; SKX_32-NEXT: retl
call void @llvm.masked.scatter.v16f32(<16 x float> %src0, <16 x float*> %ptrs, i32 4, <16 x i1> %mask)
ret void
; KNL_64-NEXT: kshiftrw $8, %k1, %k2
; KNL_64-NEXT: vscatterqpd %zmm3, (,%zmm0) {%k1}
; KNL_64-NEXT: vscatterqpd %zmm4, (,%zmm1) {%k2}
+; KNL_64-NEXT: vzeroupper
; KNL_64-NEXT: retq
;
; KNL_32-LABEL: test_scatter_16f64:
; KNL_32-NEXT: vscatterdpd %zmm1, (,%ymm0) {%k2}
; KNL_32-NEXT: movl %ebp, %esp
; KNL_32-NEXT: popl %ebp
+; KNL_32-NEXT: vzeroupper
; KNL_32-NEXT: retl
;
; SKX-LABEL: test_scatter_16f64:
; SKX-NEXT: kshiftrw $8, %k1, %k2
; SKX-NEXT: vscatterqpd %zmm3, (,%zmm0) {%k1}
; SKX-NEXT: vscatterqpd %zmm4, (,%zmm1) {%k2}
+; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
;
; SKX_32-LABEL: test_scatter_16f64:
; SKX_32-NEXT: vscatterdpd %zmm1, (,%ymm0) {%k2}
; SKX_32-NEXT: movl %ebp, %esp
; SKX_32-NEXT: popl %ebp
+; SKX_32-NEXT: vzeroupper
; SKX_32-NEXT: retl
call void @llvm.masked.scatter.v16f64(<16 x double> %src0, <16 x double*> %ptrs, i32 4, <16 x i1> %mask)
ret void
; KNL_64-NEXT: vpaddq %ymm0, %ymm1, %ymm0
; KNL_64-NEXT: retq
;
+; KNL_32-LABEL: test_pr28312:
+; KNL_32: # BB#0:
+; KNL_32-NEXT: pushl %ebp
+; KNL_32-NEXT: .Lcfi12:
+; KNL_32-NEXT: .cfi_def_cfa_offset 8
+; KNL_32-NEXT: .Lcfi13:
+; KNL_32-NEXT: .cfi_offset %ebp, -8
+; KNL_32-NEXT: movl %esp, %ebp
+; KNL_32-NEXT: .Lcfi14:
+; KNL_32-NEXT: .cfi_def_cfa_register %ebp
+; KNL_32-NEXT: andl $-32, %esp
+; KNL_32-NEXT: subl $32, %esp
+; KNL_32-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<def>
+; KNL_32-NEXT: vpslld $31, %xmm1, %xmm1
+; KNL_32-NEXT: vpsrad $31, %xmm1, %xmm1
+; KNL_32-NEXT: vpmovsxdq %xmm1, %ymm1
+; KNL_32-NEXT: vpxord %zmm2, %zmm2, %zmm2
+; KNL_32-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1
+; KNL_32-NEXT: vpmovsxdq %ymm0, %zmm0
+; KNL_32-NEXT: vpsllq $63, %zmm1, %zmm1
+; KNL_32-NEXT: vptestmq %zmm1, %zmm1, %k1
+; KNL_32-NEXT: vpgatherqq (,%zmm0), %zmm1 {%k1}
+; KNL_32-NEXT: vpaddq %ymm1, %ymm1, %ymm0
+; KNL_32-NEXT: vpaddq %ymm0, %ymm1, %ymm0
+; KNL_32-NEXT: movl %ebp, %esp
+; KNL_32-NEXT: popl %ebp
+; KNL_32-NEXT: retl
+;
; SKX-LABEL: test_pr28312:
; SKX: # BB#0:
; SKX-NEXT: vpslld $31, %xmm1, %xmm1
; SKX-NEXT: vpaddq %ymm1, %ymm1, %ymm0
; SKX-NEXT: vpaddq %ymm0, %ymm1, %ymm0
; SKX-NEXT: retq
+;
+; SKX_32-LABEL: test_pr28312:
+; SKX_32: # BB#0:
+; SKX_32-NEXT: pushl %ebp
+; SKX_32-NEXT: .Lcfi13:
+; SKX_32-NEXT: .cfi_def_cfa_offset 8
+; SKX_32-NEXT: .Lcfi14:
+; SKX_32-NEXT: .cfi_offset %ebp, -8
+; SKX_32-NEXT: movl %esp, %ebp
+; SKX_32-NEXT: .Lcfi15:
+; SKX_32-NEXT: .cfi_def_cfa_register %ebp
+; SKX_32-NEXT: andl $-32, %esp
+; SKX_32-NEXT: subl $32, %esp
+; SKX_32-NEXT: vpslld $31, %xmm1, %xmm1
+; SKX_32-NEXT: vptestmd %xmm1, %xmm1, %k1
+; SKX_32-NEXT: vpgatherdq (,%xmm0), %ymm1 {%k1}
+; SKX_32-NEXT: vpaddq %ymm1, %ymm1, %ymm0
+; SKX_32-NEXT: vpaddq %ymm0, %ymm1, %ymm0
+; SKX_32-NEXT: movl %ebp, %esp
+; SKX_32-NEXT: popl %ebp
+; SKX_32-NEXT: retl
%g1 = call <4 x i64> @llvm.masked.gather.v4i64(<4 x i64*> %p1, i32 8, <4 x i1> %k, <4 x i64> undef)
%g2 = call <4 x i64> @llvm.masked.gather.v4i64(<4 x i64*> %p1, i32 8, <4 x i1> %k, <4 x i64> undef)
%g3 = call <4 x i64> @llvm.masked.gather.v4i64(<4 x i64*> %p1, i32 8, <4 x i1> %k, <4 x i64> undef)
; AVX512F-NEXT: kshiftlw $8, %k0, %k0
; AVX512F-NEXT: kshiftrw $8, %k0, %k1
; AVX512F-NEXT: vmovdqu32 %zmm1, (%rdi) {%k1}
+; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; SKX-LABEL: test12:
; SKX-NEXT: vpxor %ymm2, %ymm2, %ymm2
; SKX-NEXT: vpcmpeqd %ymm2, %ymm0, %k1
; SKX-NEXT: vmovdqu32 %ymm1, (%rdi) {%k1}
+; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
%mask = icmp eq <8 x i32> %trigger, zeroinitializer
call void @llvm.masked.store.v8i32.p0v8i32(<8 x i32>%val, <8 x i32>* %addr, i32 4, <8 x i1>%mask)
; AVX512F: ## BB#0:
; AVX512F-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX512F-NEXT: vmovlps %xmm0, 16(%rdi)
+; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; SKX-LABEL: one_mask_bit_set3:
; SKX: ## BB#0:
; SKX-NEXT: vextracti128 $1, %ymm0, %xmm0
; SKX-NEXT: vmovq %xmm0, 16(%rdi)
+; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
call void @llvm.masked.store.v4i64.p0v4i64(<4 x i64> %val, <4 x i64>* %addr, i32 4, <4 x i1><i1 false, i1 false, i1 true, i1 false>)
ret void
; AVX512: ## BB#0:
; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX512-NEXT: vmovhpd %xmm0, 24(%rdi)
+; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> %val, <4 x double>* %addr, i32 4, <4 x i1><i1 false, i1 false, i1 false, i1 true>)
ret void
; AVX512: ## BB#0:
; AVX512-NEXT: vextractf32x4 $3, %zmm0, %xmm0
; AVX512-NEXT: vmovlps %xmm0, 48(%rdi)
+; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> %val, <8 x double>* %addr, i32 4, <8 x i1><i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 true, i1 false>)
ret void
; VLX: # BB#0:
; VLX-NEXT: vpxor %ymm0, %ymm0, %ymm0
; VLX-NEXT: vmovntdq %ymm0, (%rdi)
+; VLX-NEXT: vzeroupper
; VLX-NEXT: retq
store <8 x float> zeroinitializer, <8 x float>* %dst, align 32, !nontemporal !1
ret void
; VLX: # BB#0:
; VLX-NEXT: vpxor %ymm0, %ymm0, %ymm0
; VLX-NEXT: vmovntdq %ymm0, (%rdi)
+; VLX-NEXT: vzeroupper
; VLX-NEXT: retq
store <8 x i32> zeroinitializer, <8 x i32>* %dst, align 32, !nontemporal !1
ret void
; VLX: # BB#0:
; VLX-NEXT: vpxor %ymm0, %ymm0, %ymm0
; VLX-NEXT: vmovntdq %ymm0, (%rdi)
+; VLX-NEXT: vzeroupper
; VLX-NEXT: retq
store <4 x double> zeroinitializer, <4 x double>* %dst, align 32, !nontemporal !1
ret void
; VLX: # BB#0:
; VLX-NEXT: vpxor %ymm0, %ymm0, %ymm0
; VLX-NEXT: vmovntdq %ymm0, (%rdi)
+; VLX-NEXT: vzeroupper
; VLX-NEXT: retq
store <4 x i64> zeroinitializer, <4 x i64>* %dst, align 32, !nontemporal !1
ret void
; VLX: # BB#0:
; VLX-NEXT: vpxor %ymm0, %ymm0, %ymm0
; VLX-NEXT: vmovntdq %ymm0, (%rdi)
+; VLX-NEXT: vzeroupper
; VLX-NEXT: retq
store <16 x i16> zeroinitializer, <16 x i16>* %dst, align 32, !nontemporal !1
ret void
; VLX: # BB#0:
; VLX-NEXT: vpxor %ymm0, %ymm0, %ymm0
; VLX-NEXT: vmovntdq %ymm0, (%rdi)
+; VLX-NEXT: vzeroupper
; VLX-NEXT: retq
store <32 x i8> zeroinitializer, <32 x i8>* %dst, align 32, !nontemporal !1
ret void
; VLX-LABEL: test_arg_v8f32:
; VLX: # BB#0:
; VLX-NEXT: vmovntps %ymm0, (%rdi)
+; VLX-NEXT: vzeroupper
; VLX-NEXT: retq
store <8 x float> %arg, <8 x float>* %dst, align 32, !nontemporal !1
ret void
; VLX-LABEL: test_arg_v8i32:
; VLX: # BB#0:
; VLX-NEXT: vmovntps %ymm0, (%rdi)
+; VLX-NEXT: vzeroupper
; VLX-NEXT: retq
store <8 x i32> %arg, <8 x i32>* %dst, align 32, !nontemporal !1
ret void
; VLX-LABEL: test_arg_v4f64:
; VLX: # BB#0:
; VLX-NEXT: vmovntps %ymm0, (%rdi)
+; VLX-NEXT: vzeroupper
; VLX-NEXT: retq
store <4 x double> %arg, <4 x double>* %dst, align 32, !nontemporal !1
ret void
; VLX-LABEL: test_arg_v4i64:
; VLX: # BB#0:
; VLX-NEXT: vmovntps %ymm0, (%rdi)
+; VLX-NEXT: vzeroupper
; VLX-NEXT: retq
store <4 x i64> %arg, <4 x i64>* %dst, align 32, !nontemporal !1
ret void
; VLX-LABEL: test_arg_v16i16:
; VLX: # BB#0:
; VLX-NEXT: vmovntps %ymm0, (%rdi)
+; VLX-NEXT: vzeroupper
; VLX-NEXT: retq
store <16 x i16> %arg, <16 x i16>* %dst, align 32, !nontemporal !1
ret void
; VLX-LABEL: test_arg_v32i8:
; VLX: # BB#0:
; VLX-NEXT: vmovntps %ymm0, (%rdi)
+; VLX-NEXT: vzeroupper
; VLX-NEXT: retq
store <32 x i8> %arg, <32 x i8>* %dst, align 32, !nontemporal !1
ret void
; VLX: # BB#0:
; VLX-NEXT: vaddps %ymm1, %ymm0, %ymm0
; VLX-NEXT: vmovntps %ymm0, (%rdi)
+; VLX-NEXT: vzeroupper
; VLX-NEXT: retq
%r = fadd <8 x float> %a, %b
store <8 x float> %r, <8 x float>* %dst, align 32, !nontemporal !1
; VLX: # BB#0:
; VLX-NEXT: vpaddd %ymm1, %ymm0, %ymm0
; VLX-NEXT: vmovntdq %ymm0, (%rdi)
+; VLX-NEXT: vzeroupper
; VLX-NEXT: retq
%r = add <8 x i32> %a, %b
store <8 x i32> %r, <8 x i32>* %dst, align 32, !nontemporal !1
; VLX: # BB#0:
; VLX-NEXT: vaddpd %ymm1, %ymm0, %ymm0
; VLX-NEXT: vmovntpd %ymm0, (%rdi)
+; VLX-NEXT: vzeroupper
; VLX-NEXT: retq
%r = fadd <4 x double> %a, %b
store <4 x double> %r, <4 x double>* %dst, align 32, !nontemporal !1
; VLX: # BB#0:
; VLX-NEXT: vpaddq %ymm1, %ymm0, %ymm0
; VLX-NEXT: vmovntdq %ymm0, (%rdi)
+; VLX-NEXT: vzeroupper
; VLX-NEXT: retq
%r = add <4 x i64> %a, %b
store <4 x i64> %r, <4 x i64>* %dst, align 32, !nontemporal !1
; VLX: # BB#0:
; VLX-NEXT: vpaddw %ymm1, %ymm0, %ymm0
; VLX-NEXT: vmovntdq %ymm0, (%rdi)
+; VLX-NEXT: vzeroupper
; VLX-NEXT: retq
%r = add <16 x i16> %a, %b
store <16 x i16> %r, <16 x i16>* %dst, align 32, !nontemporal !1
; VLX: # BB#0:
; VLX-NEXT: vpaddb %ymm1, %ymm0, %ymm0
; VLX-NEXT: vmovntdq %ymm0, (%rdi)
+; VLX-NEXT: vzeroupper
; VLX-NEXT: retq
%r = add <32 x i8> %a, %b
store <32 x i8> %r, <32 x i8>* %dst, align 32, !nontemporal !1
; VLX: # BB#0:
; VLX-NEXT: vaddps %ymm1, %ymm0, %ymm0
; VLX-NEXT: vmovups %ymm0, (%rdi)
+; VLX-NEXT: vzeroupper
; VLX-NEXT: retq
%r = fadd <8 x float> %a, %b
store <8 x float> %r, <8 x float>* %dst, align 16, !nontemporal !1
; AVX512F-NEXT: vpmullw {{.*}}(%rip), %ymm0, %ymm0
; AVX512F-NEXT: vpmovsxwd %ymm0, %zmm0
; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: mul_v16i8c:
; AVX512BW-NEXT: vpmullw {{.*}}(%rip), %ymm0, %ymm0
; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
; AVX512BW-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
entry:
%A = mul <16 x i8> %i, < i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117 >
; AVX512F-NEXT: vpmullw %ymm1, %ymm0, %ymm0
; AVX512F-NEXT: vpmovsxwd %ymm0, %zmm0
; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: mul_v16i8:
; AVX512BW-NEXT: vpmullw %ymm1, %ymm0, %ymm0
; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
; AVX512BW-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
entry:
%A = mul <16 x i8> %i, %j
; SSE41-NEXT: movaps %xmm2, %xmm0
; SSE41-NEXT: retq
;
-; AVX2-LABEL: mul_v4i64_zero_upper:
-; AVX2: # BB#0: # %entry
-; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
-; AVX2-NEXT: vpmuludq %ymm1, %ymm0, %ymm0
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,3]
-; AVX2-NEXT: vzeroupper
-; AVX2-NEXT: retq
-;
-; AVX512-LABEL: mul_v4i64_zero_upper:
-; AVX512: # BB#0: # %entry
-; AVX512-NEXT: vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; AVX512-NEXT: vpmovzxdq {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
-; AVX512-NEXT: vpmuludq %ymm1, %ymm0, %ymm0
-; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX512-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,3]
-; AVX512-NEXT: retq
+; AVX-LABEL: mul_v4i64_zero_upper:
+; AVX: # BB#0: # %entry
+; AVX-NEXT: vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; AVX-NEXT: vpmovzxdq {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
+; AVX-NEXT: vpmuludq %ymm1, %ymm0, %ymm0
+; AVX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,3]
+; AVX-NEXT: vzeroupper
+; AVX-NEXT: retq
entry:
%val1a = zext <4 x i32> %val1 to <4 x i64>
%val2a = zext <4 x i32> %val2 to <4 x i64>
; SSE41-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,3],xmm2[1,3]
; SSE41-NEXT: retq
;
-; AVX2-LABEL: mul_v4i64_zero_upper_left:
-; AVX2: # BB#0: # %entry
-; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; AVX2-NEXT: vpmuludq %ymm1, %ymm0, %ymm2
-; AVX2-NEXT: vpsrlq $32, %ymm1, %ymm1
-; AVX2-NEXT: vpmuludq %ymm1, %ymm0, %ymm0
-; AVX2-NEXT: vpsllq $32, %ymm0, %ymm0
-; AVX2-NEXT: vpaddq %ymm0, %ymm2, %ymm0
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,3]
-; AVX2-NEXT: vzeroupper
-; AVX2-NEXT: retq
-;
-; AVX512-LABEL: mul_v4i64_zero_upper_left:
-; AVX512: # BB#0: # %entry
-; AVX512-NEXT: vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; AVX512-NEXT: vpmuludq %ymm1, %ymm0, %ymm2
-; AVX512-NEXT: vpsrlq $32, %ymm1, %ymm1
-; AVX512-NEXT: vpmuludq %ymm1, %ymm0, %ymm0
-; AVX512-NEXT: vpsllq $32, %ymm0, %ymm0
-; AVX512-NEXT: vpaddq %ymm0, %ymm2, %ymm0
-; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX512-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,3]
-; AVX512-NEXT: retq
+; AVX-LABEL: mul_v4i64_zero_upper_left:
+; AVX: # BB#0: # %entry
+; AVX-NEXT: vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; AVX-NEXT: vpmuludq %ymm1, %ymm0, %ymm2
+; AVX-NEXT: vpsrlq $32, %ymm1, %ymm1
+; AVX-NEXT: vpmuludq %ymm1, %ymm0, %ymm0
+; AVX-NEXT: vpsllq $32, %ymm0, %ymm0
+; AVX-NEXT: vpaddq %ymm0, %ymm2, %ymm0
+; AVX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,3]
+; AVX-NEXT: vzeroupper
+; AVX-NEXT: retq
entry:
%val1a = zext <4 x i32> %val1 to <4 x i64>
%res64 = mul <4 x i64> %val1a, %val2
; SSE41-NEXT: movaps %xmm3, %xmm0
; SSE41-NEXT: retq
;
-; AVX2-LABEL: mul_v4i64_zero_lower:
-; AVX2: # BB#0: # %entry
-; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; AVX2-NEXT: vpsrlq $32, %ymm1, %ymm1
-; AVX2-NEXT: vpmuludq %ymm1, %ymm0, %ymm0
-; AVX2-NEXT: vpsllq $32, %ymm0, %ymm0
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,3]
-; AVX2-NEXT: vzeroupper
-; AVX2-NEXT: retq
-;
-; AVX512-LABEL: mul_v4i64_zero_lower:
-; AVX512: # BB#0: # %entry
-; AVX512-NEXT: vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; AVX512-NEXT: vpsrlq $32, %ymm1, %ymm1
-; AVX512-NEXT: vpmuludq %ymm1, %ymm0, %ymm0
-; AVX512-NEXT: vpsllq $32, %ymm0, %ymm0
-; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX512-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,3]
-; AVX512-NEXT: retq
+; AVX-LABEL: mul_v4i64_zero_lower:
+; AVX: # BB#0: # %entry
+; AVX-NEXT: vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; AVX-NEXT: vpsrlq $32, %ymm1, %ymm1
+; AVX-NEXT: vpmuludq %ymm1, %ymm0, %ymm0
+; AVX-NEXT: vpsllq $32, %ymm0, %ymm0
+; AVX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,3]
+; AVX-NEXT: vzeroupper
+; AVX-NEXT: retq
entry:
%val1a = zext <4 x i32> %val1 to <4 x i64>
%val2a = and <4 x i64> %val2, <i64 -4294967296, i64 -4294967296, i64 -4294967296, i64 -4294967296>
; CHECK-NEXT: vmovaps %xmm8, {{[0-9]+}}(%rsp)
; CHECK-NEXT: vmovaps %xmm9, (%rsp)
; CHECK-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm3 # 16-byte Reload
+; CHECK-NEXT: vzeroupper
; CHECK-NEXT: callq foo
; CHECK-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload
; CHECK-NEXT: vaddps {{[0-9]+}}(%rsp), %xmm1, %xmm1 # 16-byte Folded Reload
; CHECK-NEXT: vorpd %zmm2, %zmm1, %zmm1 {%k1}
; CHECK-NEXT: vmovapd %zmm1, 64
; CHECK-NEXT: vmovapd %zmm0, 0
+; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retl
%a_load22 = load <16 x i64>, <16 x i64>* null, align 1
%bitop = or <16 x i64> %a_load22, <i64 68719476736, i64 68719476736, i64 68719476736, i64 68719476736, i64 68719476736, i64 68719476736, i64 68719476736, i64 68719476736, i64 68719476736, i64 68719476736, i64 68719476736, i64 68719476736, i64 68719476736, i64 68719476736, i64 68719476736, i64 68719476736>
; AVX512F-NEXT: vpshufd {{.*#+}} zmm1 = zmm0[1,1,2,3,5,5,6,7,9,9,10,11,13,13,14,15]
; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm0
; AVX512F-NEXT: vmovd %xmm0, %eax
+; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: sad_16i8:
; AVX512BW-NEXT: vpshufd {{.*#+}} zmm1 = zmm0[1,1,2,3,5,5,6,7,9,9,10,11,13,13,14,15]
; AVX512BW-NEXT: vpaddd %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: vmovd %xmm0, %eax
+; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
entry:
br label %vector.body
; AVX512F-NEXT: vpshufd {{.*#+}} zmm1 = zmm0[1,1,2,3,5,5,6,7,9,9,10,11,13,13,14,15]
; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm0
; AVX512F-NEXT: vmovd %xmm0, %eax
+; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: sad_32i8:
; AVX512BW-NEXT: vpshufd {{.*#+}} zmm1 = zmm0[1,1,2,3,5,5,6,7,9,9,10,11,13,13,14,15]
; AVX512BW-NEXT: vpaddd %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: vmovd %xmm0, %eax
+; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
entry:
br label %vector.body
; AVX512F-NEXT: vpshufd {{.*#+}} zmm1 = zmm0[1,1,2,3,5,5,6,7,9,9,10,11,13,13,14,15]
; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm0
; AVX512F-NEXT: vmovd %xmm0, %eax
+; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: sad_avx64i8:
; AVX512BW-NEXT: vpshufd {{.*#+}} zmm1 = zmm0[1,1,2,3,5,5,6,7,9,9,10,11,13,13,14,15]
; AVX512BW-NEXT: vpaddd %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: vmovd %xmm0, %eax
+; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
entry:
br label %vector.body
; AVX512F-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX512F-NEXT: vpaddq %ymm1, %ymm0, %ymm0
; AVX512F-NEXT: vmovd %xmm0, %eax
+; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: sad_nonloop_32i8:
; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX512BW-NEXT: vpaddq %ymm1, %ymm0, %ymm0
; AVX512BW-NEXT: vmovd %xmm0, %eax
+; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
%v1 = load <32 x i8>, <32 x i8>* %p, align 1
%z1 = zext <32 x i8> %v1 to <32 x i32>
; AVX512F-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512F-NEXT: vmovdqa %xmm0, (%rsi)
+; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v32i8_to_v16i8:
; AVX512VL-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512VL-NEXT: vmovdqa %xmm0, (%rsi)
+; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: shuffle_v32i8_to_v16i8:
; AVX512BW-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512BW-NEXT: vmovdqa %xmm0, (%rsi)
+; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: shuffle_v32i8_to_v16i8:
; AVX512BWVL-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX512BWVL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512BWVL-NEXT: vmovdqu %xmm0, (%rsi)
+; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
%vec = load <32 x i8>, <32 x i8>* %L
%strided.vec = shufflevector <32 x i8> %vec, <32 x i8> undef, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
; AVX512F-NEXT: vpmovsxwd (%rdi), %zmm0
; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
; AVX512F-NEXT: vmovdqa %xmm0, (%rsi)
+; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: trunc_v16i16_to_v16i8:
; AVX512VL-NEXT: vpmovsxwd (%rdi), %zmm0
; AVX512VL-NEXT: vpmovdb %zmm0, %xmm0
; AVX512VL-NEXT: vmovdqa %xmm0, (%rsi)
+; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: trunc_v16i16_to_v16i8:
; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0
; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
; AVX512BW-NEXT: vmovdqa %xmm0, (%rsi)
+; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: trunc_v16i16_to_v16i8:
; AVX512BWVL: # BB#0:
; AVX512BWVL-NEXT: vmovdqu (%rdi), %ymm0
; AVX512BWVL-NEXT: vpmovwb %ymm0, (%rsi)
+; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
%vec = load <32 x i8>, <32 x i8>* %L
%bc = bitcast <32 x i8> %vec to <16 x i16>
; AVX512F-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512F-NEXT: vmovdqa %xmm0, (%rsi)
+; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v16i16_to_v8i16:
; AVX512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512VL-NEXT: vmovdqa %xmm0, (%rsi)
+; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: shuffle_v16i16_to_v8i16:
; AVX512BW-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512BW-NEXT: vmovdqa %xmm0, (%rsi)
+; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: shuffle_v16i16_to_v8i16:
; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; AVX512BWVL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512BWVL-NEXT: vmovdqu %xmm0, (%rsi)
+; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
%vec = load <16 x i16>, <16 x i16>* %L
%strided.vec = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
; AVX512F-NEXT: vpmovdw %zmm0, %ymm0
; AVX512F-NEXT: vmovdqa %xmm0, (%rsi)
+; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: trunc_v8i32_to_v8i16:
; AVX512VL: # BB#0:
; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0
; AVX512VL-NEXT: vpmovdw %ymm0, (%rsi)
+; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: trunc_v8i32_to_v8i16:
; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0
; AVX512BW-NEXT: vpmovdw %zmm0, %ymm0
; AVX512BW-NEXT: vmovdqa %xmm0, (%rsi)
+; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: trunc_v8i32_to_v8i16:
; AVX512BWVL: # BB#0:
; AVX512BWVL-NEXT: vmovdqa (%rdi), %ymm0
; AVX512BWVL-NEXT: vpmovdw %ymm0, (%rsi)
+; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
%vec = load <16 x i16>, <16 x i16>* %L
%bc = bitcast <16 x i16> %vec to <8 x i32>
; AVX512F-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX512F-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; AVX512F-NEXT: vmovaps %xmm0, (%rsi)
+; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v8i32_to_v4i32:
; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512VL-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; AVX512VL-NEXT: vmovaps %xmm0, (%rsi)
+; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: shuffle_v8i32_to_v4i32:
; AVX512BW-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX512BW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; AVX512BW-NEXT: vmovaps %xmm0, (%rsi)
+; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: shuffle_v8i32_to_v4i32:
; AVX512BWVL-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512BWVL-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; AVX512BWVL-NEXT: vmovaps %xmm0, (%rsi)
+; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
%vec = load <8 x i32>, <8 x i32>* %L
%strided.vec = shufflevector <8 x i32> %vec, <8 x i32> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
; AVX512F-NEXT: vpmovqd %zmm0, %ymm0
; AVX512F-NEXT: vmovdqa %xmm0, (%rsi)
+; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: trunc_v4i64_to_v4i32:
; AVX512VL: # BB#0:
; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0
; AVX512VL-NEXT: vpmovqd %ymm0, (%rsi)
+; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: trunc_v4i64_to_v4i32:
; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0
; AVX512BW-NEXT: vpmovqd %zmm0, %ymm0
; AVX512BW-NEXT: vmovdqa %xmm0, (%rsi)
+; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: trunc_v4i64_to_v4i32:
; AVX512BWVL: # BB#0:
; AVX512BWVL-NEXT: vmovdqa (%rdi), %ymm0
; AVX512BWVL-NEXT: vpmovqd %ymm0, (%rsi)
+; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
%vec = load <8 x i32>, <8 x i32>* %L
%bc = bitcast <8 x i32> %vec to <4 x i64>
; AVX512F-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX512F-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX512F-NEXT: vmovq %xmm0, (%rsi)
+; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v32i8_to_v8i8:
; AVX512VL-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX512VL-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX512VL-NEXT: vmovq %xmm0, (%rsi)
+; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: shuffle_v32i8_to_v8i8:
; AVX512BW-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX512BW-NEXT: vmovq %xmm0, (%rsi)
+; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: shuffle_v32i8_to_v8i8:
; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; AVX512BWVL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512BWVL-NEXT: vpmovwb %xmm0, (%rsi)
+; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
%vec = load <32 x i8>, <32 x i8>* %L
%strided.vec = shufflevector <32 x i8> %vec, <32 x i8> undef, <8 x i32> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 24, i32 28>
; AVX512F-NEXT: vpmovdw %zmm0, %ymm0
; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
; AVX512F-NEXT: vmovq %xmm0, (%rsi)
+; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: trunc_v8i32_to_v8i8:
; AVX512VL: # BB#0:
; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0
; AVX512VL-NEXT: vpmovdb %ymm0, (%rsi)
+; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: trunc_v8i32_to_v8i8:
; AVX512BW-NEXT: vpmovdw %zmm0, %ymm0
; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
; AVX512BW-NEXT: vmovq %xmm0, (%rsi)
+; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: trunc_v8i32_to_v8i8:
; AVX512BWVL: # BB#0:
; AVX512BWVL-NEXT: vmovdqa (%rdi), %ymm0
; AVX512BWVL-NEXT: vpmovdb %ymm0, (%rsi)
+; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
%vec = load <32 x i8>, <32 x i8>* %L
%bc = bitcast <32 x i8> %vec to <8 x i32>
; AVX512F-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
; AVX512F-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX512F-NEXT: vmovq %xmm0, (%rsi)
+; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v16i16_to_v4i16:
; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512VL-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; AVX512VL-NEXT: vpmovdw %xmm0, (%rsi)
+; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: shuffle_v16i16_to_v4i16:
; AVX512BW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX512BW-NEXT: vmovq %xmm0, (%rsi)
+; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: shuffle_v16i16_to_v4i16:
; AVX512BWVL-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512BWVL-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; AVX512BWVL-NEXT: vpmovdw %xmm0, (%rsi)
+; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
%vec = load <16 x i16>, <16 x i16>* %L
%strided.vec = shufflevector <16 x i16> %vec, <16 x i16> undef, <4 x i32> <i32 0, i32 4, i32 8, i32 12>
; AVX512F-NEXT: vpmovqd %zmm0, %ymm0
; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
; AVX512F-NEXT: vmovq %xmm0, (%rsi)
+; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: trunc_v4i64_to_v4i16:
; AVX512VL: # BB#0:
; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0
; AVX512VL-NEXT: vpmovqw %ymm0, (%rsi)
+; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: trunc_v4i64_to_v4i16:
; AVX512BW-NEXT: vpmovqd %zmm0, %ymm0
; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
; AVX512BW-NEXT: vmovq %xmm0, (%rsi)
+; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: trunc_v4i64_to_v4i16:
; AVX512BWVL: # BB#0:
; AVX512BWVL-NEXT: vmovdqa (%rdi), %ymm0
; AVX512BWVL-NEXT: vpmovqw %ymm0, (%rsi)
+; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
%vec = load <16 x i16>, <16 x i16>* %L
%bc = bitcast <16 x i16> %vec to <4 x i64>
; AVX512F-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX512F-NEXT: vmovd %xmm0, (%rsi)
+; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v32i8_to_v4i8:
; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512VL-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; AVX512VL-NEXT: vpmovdb %xmm0, (%rsi)
+; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: shuffle_v32i8_to_v4i8:
; AVX512BW-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX512BW-NEXT: vmovd %xmm0, (%rsi)
+; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: shuffle_v32i8_to_v4i8:
; AVX512BWVL-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512BWVL-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; AVX512BWVL-NEXT: vpmovdb %xmm0, (%rsi)
+; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
%vec = load <32 x i8>, <32 x i8>* %L
%strided.vec = shufflevector <32 x i8> %vec, <32 x i8> undef, <4 x i32> <i32 0, i32 8, i32 16, i32 24>
; AVX512F-NEXT: vpmovqd %zmm0, %ymm0
; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512F-NEXT: vmovd %xmm0, (%rsi)
+; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: trunc_v4i64_to_v4i8:
; AVX512VL: # BB#0:
; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0
; AVX512VL-NEXT: vpmovqb %ymm0, (%rsi)
+; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: trunc_v4i64_to_v4i8:
; AVX512BW-NEXT: vpmovqd %zmm0, %ymm0
; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512BW-NEXT: vmovd %xmm0, (%rsi)
+; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: trunc_v4i64_to_v4i8:
; AVX512BWVL: # BB#0:
; AVX512BWVL-NEXT: vmovdqa (%rdi), %ymm0
; AVX512BWVL-NEXT: vpmovqb %ymm0, (%rsi)
+; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
%vec = load <32 x i8>, <32 x i8>* %L
%bc = bitcast <32 x i8> %vec to <4 x i64>
; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
; AVX512F-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX512F-NEXT: vmovdqa %ymm0, (%rsi)
+; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v64i8_to_v32i8:
; AVX512VL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
; AVX512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX512VL-NEXT: vmovdqa %ymm0, (%rsi)
+; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: shuffle_v64i8_to_v32i8:
; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
; AVX512BW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX512BW-NEXT: vmovdqa %ymm0, (%rsi)
+; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: shuffle_v64i8_to_v32i8:
; AVX512BWVL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
; AVX512BWVL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX512BWVL-NEXT: vmovdqu %ymm0, (%rsi)
+; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
%vec = load <64 x i8>, <64 x i8>* %L
%strided.vec = shufflevector <64 x i8> %vec, <64 x i8> undef, <32 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30, i32 32, i32 34, i32 36, i32 38, i32 40, i32 42, i32 44, i32 46, i32 48, i32 50, i32 52, i32 54, i32 56, i32 58, i32 60, i32 62>
; AVX512F-NEXT: vpmovdb %zmm1, %xmm1
; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX512F-NEXT: vmovdqa %ymm0, (%rsi)
+; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: trunc_v32i16_to_v32i8:
; AVX512VL-NEXT: vpmovdb %zmm1, %xmm1
; AVX512VL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX512VL-NEXT: vmovdqa %ymm0, (%rsi)
+; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: trunc_v32i16_to_v32i8:
; AVX512BW: # BB#0:
; AVX512BW-NEXT: vmovdqu16 (%rdi), %zmm0
; AVX512BW-NEXT: vpmovwb %zmm0, (%rsi)
+; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: trunc_v32i16_to_v32i8:
; AVX512BWVL: # BB#0:
; AVX512BWVL-NEXT: vmovdqu16 (%rdi), %zmm0
; AVX512BWVL-NEXT: vpmovwb %zmm0, (%rsi)
+; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
%vec = load <64 x i8>, <64 x i8>* %L
%bc = bitcast <64 x i8> %vec to <32 x i16>
; AVX512F-NEXT: vshufps {{.*#+}} ymm0 = ymm1[0,2],ymm0[0,2],ymm1[4,6],ymm0[4,6]
; AVX512F-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX512F-NEXT: vmovdqa %ymm0, (%rsi)
+; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v32i16_to_v16i16:
; AVX512VL-NEXT: vshufps {{.*#+}} ymm0 = ymm1[0,2],ymm0[0,2],ymm1[4,6],ymm0[4,6]
; AVX512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX512VL-NEXT: vmovdqa %ymm0, (%rsi)
+; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: shuffle_v32i16_to_v16i16:
; AVX512BW-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2],ymm1[0,2],ymm0[4,6],ymm1[4,6]
; AVX512BW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX512BW-NEXT: vmovdqa %ymm0, (%rsi)
+; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: shuffle_v32i16_to_v16i16:
; AVX512BWVL-NEXT: vpermi2w %ymm1, %ymm0, %ymm2
; AVX512BWVL-NEXT: vpermq {{.*#+}} ymm0 = ymm2[0,2,1,3]
; AVX512BWVL-NEXT: vmovdqu %ymm0, (%rsi)
+; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
%vec = load <32 x i16>, <32 x i16>* %L
%strided.vec = shufflevector <32 x i16> %vec, <32 x i16> undef, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
; AVX512: # BB#0:
; AVX512-NEXT: vmovdqa32 (%rdi), %zmm0
; AVX512-NEXT: vpmovdw %zmm0, (%rsi)
+; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
%vec = load <32 x i16>, <32 x i16>* %L
%bc = bitcast <32 x i16> %vec to <16 x i32>
; AVX512-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2],ymm1[0,2],ymm0[4,6],ymm1[4,6]
; AVX512-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX512-NEXT: vmovdqa %ymm0, (%rsi)
+; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
%vec = load <16 x i32>, <16 x i32>* %L
%strided.vec = shufflevector <16 x i32> %vec, <16 x i32> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
; AVX512: # BB#0:
; AVX512-NEXT: vmovdqa64 (%rdi), %zmm0
; AVX512-NEXT: vpmovqd %zmm0, (%rsi)
+; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
%vec = load <16 x i32>, <16 x i32>* %L
%bc = bitcast <16 x i32> %vec to <8 x i64>
; AVX512F-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; AVX512F-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
; AVX512F-NEXT: vmovdqa %xmm0, (%rsi)
+; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v64i8_to_v16i8:
; AVX512VL-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; AVX512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
; AVX512VL-NEXT: vmovdqa %xmm0, (%rsi)
+; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: shuffle_v64i8_to_v16i8:
; AVX512BW-NEXT: vpextrb $12, %xmm0, %eax
; AVX512BW-NEXT: vpinsrb $15, %eax, %xmm1, %xmm0
; AVX512BW-NEXT: vmovdqa %xmm0, (%rsi)
+; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: shuffle_v64i8_to_v16i8:
; AVX512BWVL-NEXT: vpextrb $12, %xmm0, %eax
; AVX512BWVL-NEXT: vpinsrb $15, %eax, %xmm1, %xmm0
; AVX512BWVL-NEXT: vmovdqu %xmm0, (%rsi)
+; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
%vec = load <64 x i8>, <64 x i8>* %L
%strided.vec = shufflevector <64 x i8> %vec, <64 x i8> undef, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 24, i32 28, i32 32, i32 36, i32 40, i32 44, i32 48, i32 52, i32 56, i32 60>
; AVX512: # BB#0:
; AVX512-NEXT: vmovdqa32 (%rdi), %zmm0
; AVX512-NEXT: vpmovdb %zmm0, (%rsi)
+; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
%vec = load <64 x i8>, <64 x i8>* %L
%bc = bitcast <64 x i8> %vec to <16 x i32>
; AVX512F-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; AVX512F-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
; AVX512F-NEXT: vmovdqa %xmm0, (%rsi)
+; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v32i16_to_v8i16:
; AVX512VL-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; AVX512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
; AVX512VL-NEXT: vmovdqa %xmm0, (%rsi)
+; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: shuffle_v32i16_to_v8i16:
; AVX512BW-NEXT: vpextrw $4, %xmm0, %eax
; AVX512BW-NEXT: vpinsrw $7, %eax, %xmm1, %xmm0
; AVX512BW-NEXT: vmovdqa %xmm0, (%rsi)
+; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: shuffle_v32i16_to_v8i16:
; AVX512BWVL-NEXT: vpextrw $4, %xmm0, %eax
; AVX512BWVL-NEXT: vpinsrw $7, %eax, %xmm1, %xmm0
; AVX512BWVL-NEXT: vmovdqu %xmm0, (%rsi)
+; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
%vec = load <32 x i16>, <32 x i16>* %L
%strided.vec = shufflevector <32 x i16> %vec, <32 x i16> undef, <8 x i32> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 24, i32 28>
; AVX512: # BB#0:
; AVX512-NEXT: vmovdqa64 (%rdi), %zmm0
; AVX512-NEXT: vpmovqw %zmm0, (%rsi)
+; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
%vec = load <32 x i16>, <32 x i16>* %L
%bc = bitcast <32 x i16> %vec to <8 x i64>
; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; AVX512F-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
; AVX512F-NEXT: vmovq %xmm0, (%rsi)
+; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v64i8_to_v8i8:
; AVX512VL-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; AVX512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
; AVX512VL-NEXT: vmovq %xmm0, (%rsi)
+; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: shuffle_v64i8_to_v8i8:
; AVX512BW-NEXT: vpinsrb $6, %r9d, %xmm0, %xmm0
; AVX512BW-NEXT: vpinsrb $7, %r8d, %xmm0, %xmm0
; AVX512BW-NEXT: vmovq %xmm0, (%rsi)
+; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: shuffle_v64i8_to_v8i8:
; AVX512BWVL-NEXT: vpextrb $8, %xmm0, %eax
; AVX512BWVL-NEXT: vpinsrb $14, %eax, %xmm1, %xmm0
; AVX512BWVL-NEXT: vpmovwb %xmm0, (%rsi)
+; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
%vec = load <64 x i8>, <64 x i8>* %L
%strided.vec = shufflevector <64 x i8> %vec, <64 x i8> undef, <8 x i32> <i32 0, i32 8, i32 16, i32 24, i32 32, i32 40, i32 48, i32 56>
; AVX512: # BB#0:
; AVX512-NEXT: vmovdqa64 (%rdi), %zmm0
; AVX512-NEXT: vpmovqb %zmm0, (%rsi)
+; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
%vec = load <64 x i8>, <64 x i8>* %L
%bc = bitcast <64 x i8> %vec to <8 x i64>
; AVX512F-NEXT: vcvtdq2ps %ymm0, %ymm0
; AVX512F-NEXT: vsubps %ymm0, %ymm2, %ymm0
; AVX512F-NEXT: vmovaps %ymm0, (%rdi)
+; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
entry:
%1 = load <8 x float>, <8 x float>* %0
; AVX512F-NEXT: vcvtdq2pd %xmm0, %ymm0
; AVX512F-NEXT: vsubpd %ymm0, %ymm2, %ymm0
; AVX512F-NEXT: vmovapd %ymm0, (%rdi)
+; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
entry:
%1 = load <4 x double>, <4 x double>* %0
;
define void @signum32c(<8 x float>*) {
-; AVX1-LABEL: signum32c:
-; AVX1: # BB#0: # %entry
-; AVX1-NEXT: vmovaps (%rdi), %ymm0
-; AVX1-NEXT: vxorps %ymm1, %ymm1, %ymm1
-; AVX1-NEXT: vcmpltps %ymm1, %ymm0, %ymm2
-; AVX1-NEXT: vcvtdq2ps %ymm2, %ymm2
-; AVX1-NEXT: vcmpltps %ymm0, %ymm1, %ymm0
-; AVX1-NEXT: vcvtdq2ps %ymm0, %ymm0
-; AVX1-NEXT: vsubps %ymm0, %ymm2, %ymm0
-; AVX1-NEXT: vmovaps %ymm0, (%rdi)
-; AVX1-NEXT: vzeroupper
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: signum32c:
-; AVX2: # BB#0: # %entry
-; AVX2-NEXT: vmovaps (%rdi), %ymm0
-; AVX2-NEXT: vxorps %ymm1, %ymm1, %ymm1
-; AVX2-NEXT: vcmpltps %ymm1, %ymm0, %ymm2
-; AVX2-NEXT: vcvtdq2ps %ymm2, %ymm2
-; AVX2-NEXT: vcmpltps %ymm0, %ymm1, %ymm0
-; AVX2-NEXT: vcvtdq2ps %ymm0, %ymm0
-; AVX2-NEXT: vsubps %ymm0, %ymm2, %ymm0
-; AVX2-NEXT: vmovaps %ymm0, (%rdi)
-; AVX2-NEXT: vzeroupper
-; AVX2-NEXT: retq
-;
-; AVX512F-LABEL: signum32c:
-; AVX512F: # BB#0: # %entry
-; AVX512F-NEXT: vmovaps (%rdi), %ymm0
-; AVX512F-NEXT: vxorps %ymm1, %ymm1, %ymm1
-; AVX512F-NEXT: vcmpltps %ymm1, %ymm0, %ymm2
-; AVX512F-NEXT: vcvtdq2ps %ymm2, %ymm2
-; AVX512F-NEXT: vcmpltps %ymm0, %ymm1, %ymm0
-; AVX512F-NEXT: vcvtdq2ps %ymm0, %ymm0
-; AVX512F-NEXT: vsubps %ymm0, %ymm2, %ymm0
-; AVX512F-NEXT: vmovaps %ymm0, (%rdi)
-; AVX512F-NEXT: retq
+; AVX-LABEL: signum32c:
+; AVX: # BB#0: # %entry
+; AVX-NEXT: vmovaps (%rdi), %ymm0
+; AVX-NEXT: vxorps %ymm1, %ymm1, %ymm1
+; AVX-NEXT: vcmpltps %ymm1, %ymm0, %ymm2
+; AVX-NEXT: vcvtdq2ps %ymm2, %ymm2
+; AVX-NEXT: vcmpltps %ymm0, %ymm1, %ymm0
+; AVX-NEXT: vcvtdq2ps %ymm0, %ymm0
+; AVX-NEXT: vsubps %ymm0, %ymm2, %ymm0
+; AVX-NEXT: vmovaps %ymm0, (%rdi)
+; AVX-NEXT: vzeroupper
+; AVX-NEXT: retq
entry:
%1 = load <8 x float>, <8 x float>* %0
%2 = tail call <8 x float> @llvm.x86.avx.cmp.ps.256(<8 x float> %1, <8 x float> zeroinitializer, i8 1)
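; NOTE: In signum32c the AVX1, AVX2 and AVX512F bodies became byte-for-byte
; identical once AVX512F also ends in vzeroupper, so the three per-target
; check blocks were folded into a single common AVX prefix; that is why the
; old blocks are deleted outright rather than edited in place.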
; AVX512F-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; AVX512F-NEXT: vcvtdq2pd %xmm0, %ymm0
; AVX512F-NEXT: vmovaps %ymm0, (%rdi)
+; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
entry:
%x = load <4 x double>, <4 x double>* %0
; X32-AVX512-NEXT: vpandq %zmm2, %zmm1, %zmm1
; X32-AVX512-NEXT: vmovdqu %ymm0, _ga4
; X32-AVX512-NEXT: vmovdqu64 %zmm1, _gb4
+; X32-AVX512-NEXT: vzeroupper
; X32-AVX512-NEXT: retl
;
; X64-AVX1-LABEL: fallback_broadcast_v4i64_to_v8i64:
; X64-AVX512-NEXT: vpandq %zmm2, %zmm1, %zmm1
; X64-AVX512-NEXT: vmovdqu %ymm0, {{.*}}(%rip)
; X64-AVX512-NEXT: vmovdqu64 %zmm1, {{.*}}(%rip)
+; X64-AVX512-NEXT: vzeroupper
; X64-AVX512-NEXT: retq
entry:
%0 = add <4 x i64> %a, <i64 1, i64 2, i64 3, i64 4>
; X32-AVX512-NEXT: vdivpd %zmm2, %zmm1, %zmm1
; X32-AVX512-NEXT: vmovupd %ymm0, _ga2
; X32-AVX512-NEXT: vmovupd %zmm1, _gb2
+; X32-AVX512-NEXT: vzeroupper
; X32-AVX512-NEXT: retl
;
; X64-AVX-LABEL: fallback_broadcast_v4f64_to_v8f64:
; X64-AVX512-NEXT: vdivpd %zmm2, %zmm1, %zmm1
; X64-AVX512-NEXT: vmovupd %ymm0, {{.*}}(%rip)
; X64-AVX512-NEXT: vmovupd %zmm1, {{.*}}(%rip)
+; X64-AVX512-NEXT: vzeroupper
; X64-AVX512-NEXT: retq
entry:
%0 = fadd <4 x double> %a, <double 1.0, double 2.0, double 3.0, double 4.0>
; AVX512DQ-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<def>
; AVX512DQ-NEXT: vcvttpd2qq %zmm0, %zmm0
; AVX512DQ-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
+; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
; AVX512VLDQ-LABEL: fptosi_2f64_to_2i64:
; SSE-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE-NEXT: retq
;
-; VEX-LABEL: fptosi_4f64_to_2i32:
-; VEX: # BB#0:
-; VEX-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<def>
-; VEX-NEXT: vcvttpd2dq %ymm0, %xmm0
-; VEX-NEXT: vzeroupper
-; VEX-NEXT: retq
-;
-; AVX512-LABEL: fptosi_4f64_to_2i32:
-; AVX512: # BB#0:
-; AVX512-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<def>
-; AVX512-NEXT: vcvttpd2dq %ymm0, %xmm0
-; AVX512-NEXT: retq
+; AVX-LABEL: fptosi_4f64_to_2i32:
+; AVX: # BB#0:
+; AVX-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<def>
+; AVX-NEXT: vcvttpd2dq %ymm0, %xmm0
+; AVX-NEXT: vzeroupper
+; AVX-NEXT: retq
%ext = shufflevector <2 x double> %a, <2 x double> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
%cvt = fptosi <4 x double> %ext to <4 x i32>
ret <4 x i32> %cvt
; SSE-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE-NEXT: retq
;
-; VEX-LABEL: fptosi_4f64_to_4i32:
-; VEX: # BB#0:
-; VEX-NEXT: vcvttpd2dq %ymm0, %xmm0
-; VEX-NEXT: vzeroupper
-; VEX-NEXT: retq
-;
-; AVX512-LABEL: fptosi_4f64_to_4i32:
-; AVX512: # BB#0:
-; AVX512-NEXT: vcvttpd2dq %ymm0, %xmm0
-; AVX512-NEXT: retq
+; AVX-LABEL: fptosi_4f64_to_4i32:
+; AVX: # BB#0:
+; AVX-NEXT: vcvttpd2dq %ymm0, %xmm0
+; AVX-NEXT: vzeroupper
+; AVX-NEXT: retq
%cvt = fptosi <4 x double> %a to <4 x i32>
ret <4 x i32> %cvt
}
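; NOTE: The same prefix folding recurs through the fptosi/fptoui tests: the
; VEX configurations already emitted vzeroupper here, and AVX512 now matches
; them. The "# kill" annotations are the liveness comments the update script
; captures when a value defined in a ZMM or YMM register is only used as its
; XMM subregister afterwards; this patch leaves them unchanged.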
; AVX512DQ-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<def>
; AVX512DQ-NEXT: vcvttpd2uqq %zmm0, %zmm0
; AVX512DQ-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
+; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
; AVX512VLDQ-LABEL: fptoui_2f64_to_2i64:
; AVX512F-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<def>
; AVX512F-NEXT: vcvttpd2udq %zmm0, %ymm0
; AVX512F-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
+; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: fptoui_2f64_to_4i32:
; AVX512DQ-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<def>
; AVX512DQ-NEXT: vcvttpd2udq %zmm0, %ymm0
; AVX512DQ-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
+; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
; AVX512VLDQ-LABEL: fptoui_2f64_to_4i32:
; AVX512F-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<def>
; AVX512F-NEXT: vcvttpd2udq %zmm0, %ymm0
; AVX512F-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: fptoui_2f64_to_2i32:
; AVX512DQ-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<def>
; AVX512DQ-NEXT: vcvttpd2udq %zmm0, %ymm0
; AVX512DQ-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
; AVX512VLDQ-LABEL: fptoui_2f64_to_2i32:
; AVX512F-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<def>
; AVX512F-NEXT: vcvttpd2udq %zmm0, %ymm0
; AVX512F-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: fptoui_4f64_to_2i32:
; AVX512VL: # BB#0:
; AVX512VL-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<def>
; AVX512VL-NEXT: vcvttpd2udq %ymm0, %xmm0
+; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512DQ-LABEL: fptoui_4f64_to_2i32:
; AVX512DQ-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<def>
; AVX512DQ-NEXT: vcvttpd2udq %zmm0, %ymm0
; AVX512DQ-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
; AVX512VLDQ-LABEL: fptoui_4f64_to_2i32:
; AVX512VLDQ: # BB#0:
; AVX512VLDQ-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<def>
; AVX512VLDQ-NEXT: vcvttpd2udq %ymm0, %xmm0
+; AVX512VLDQ-NEXT: vzeroupper
; AVX512VLDQ-NEXT: retq
%ext = shufflevector <2 x double> %a, <2 x double> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
%cvt = fptoui <4 x double> %ext to <4 x i32>
; AVX512F-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
; AVX512F-NEXT: vcvttpd2udq %zmm0, %ymm0
; AVX512F-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: fptoui_4f64_to_4i32:
; AVX512VL: # BB#0:
; AVX512VL-NEXT: vcvttpd2udq %ymm0, %xmm0
+; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512DQ-LABEL: fptoui_4f64_to_4i32:
; AVX512DQ-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
; AVX512DQ-NEXT: vcvttpd2udq %zmm0, %ymm0
; AVX512DQ-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
; AVX512VLDQ-LABEL: fptoui_4f64_to_4i32:
; AVX512VLDQ: # BB#0:
; AVX512VLDQ-NEXT: vcvttpd2udq %ymm0, %xmm0
+; AVX512VLDQ-NEXT: vzeroupper
; AVX512VLDQ-NEXT: retq
%cvt = fptoui <4 x double> %a to <4 x i32>
ret <4 x i32> %cvt
; AVX512DQ-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<def>
; AVX512DQ-NEXT: vcvttps2qq %ymm0, %zmm0
; AVX512DQ-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
+; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
; AVX512VLDQ-LABEL: fptosi_4f32_to_2i64:
; AVX512VLDQ: # BB#0:
; AVX512VLDQ-NEXT: vcvttps2qq %xmm0, %ymm0
; AVX512VLDQ-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512VLDQ-NEXT: vzeroupper
; AVX512VLDQ-NEXT: retq
%cvt = fptosi <4 x float> %a to <4 x i64>
%shuf = shufflevector <4 x i64> %cvt, <4 x i64> undef, <2 x i32> <i32 0, i32 1>
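; NOTE: The AVX512DQ-without-VL runs make the new tracking visible even for
; 128/256-bit IR: lacking VL, vcvttps2qq and friends are widened to operate
; on a full %zmm0, and that ZMM write is what leaves the upper state dirty
; and pulls in the vzeroupper before retq.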
; AVX512F-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<def>
; AVX512F-NEXT: vcvttps2udq %zmm0, %zmm0
; AVX512F-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
+; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: fptoui_2f32_to_2i32:
; AVX512DQ-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<def>
; AVX512DQ-NEXT: vcvttps2udq %zmm0, %zmm0
; AVX512DQ-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
+; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
; AVX512VLDQ-LABEL: fptoui_2f32_to_2i32:
; AVX512F-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<def>
; AVX512F-NEXT: vcvttps2udq %zmm0, %zmm0
; AVX512F-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
+; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: fptoui_4f32_to_4i32:
; AVX512DQ-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<def>
; AVX512DQ-NEXT: vcvttps2udq %zmm0, %zmm0
; AVX512DQ-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
+; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
; AVX512VLDQ-LABEL: fptoui_4f32_to_4i32:
; AVX512DQ-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<def>
; AVX512DQ-NEXT: vcvttps2uqq %ymm0, %zmm0
; AVX512DQ-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
+; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
; AVX512VLDQ-LABEL: fptoui_4f32_to_2i64:
; AVX512VLDQ: # BB#0:
; AVX512VLDQ-NEXT: vcvttps2uqq %xmm0, %ymm0
; AVX512VLDQ-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512VLDQ-NEXT: vzeroupper
; AVX512VLDQ-NEXT: retq
%cvt = fptoui <4 x float> %a to <4 x i64>
%shuf = shufflevector <4 x i64> %cvt, <4 x i64> undef, <2 x i32> <i32 0, i32 1>
; AVX512F-NEXT: vmovq %rax, %xmm0
; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero
+; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: fptosi_2f16_to_4i32:
; AVX512DQ-NEXT: vmovq %rax, %xmm0
; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512DQ-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero
+; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
; AVX512VLDQ-LABEL: fptosi_2f16_to_4i32:
; X32-AVX512VL-NEXT: movl {{[0-9]+}}(%esp), %ecx # encoding: [0x8b,0x4c,0x24,0x04]
; X32-AVX512VL-NEXT: vcvtps2pd (%ecx), %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x5a,0x01]
; X32-AVX512VL-NEXT: vmovups %ymm0, (%eax) # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x11,0x00]
+; X32-AVX512VL-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77]
; X32-AVX512VL-NEXT: retl # encoding: [0xc3]
;
; X64-SSE-LABEL: fpext_frommem4:
; X64-AVX512VL: # BB#0: # %entry
; X64-AVX512VL-NEXT: vcvtps2pd (%rdi), %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x5a,0x07]
; X64-AVX512VL-NEXT: vmovups %ymm0, (%rsi) # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x11,0x06]
+; X64-AVX512VL-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77]
; X64-AVX512VL-NEXT: retq # encoding: [0xc3]
entry:
%0 = load <4 x float>, <4 x float>* %in
; X32-AVX512VL-NEXT: movl {{[0-9]+}}(%esp), %ecx # encoding: [0x8b,0x4c,0x24,0x04]
; X32-AVX512VL-NEXT: vcvtps2pd (%ecx), %zmm0 # encoding: [0x62,0xf1,0x7c,0x48,0x5a,0x01]
; X32-AVX512VL-NEXT: vmovups %zmm0, (%eax) # encoding: [0x62,0xf1,0x7c,0x48,0x11,0x00]
+; X32-AVX512VL-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77]
; X32-AVX512VL-NEXT: retl # encoding: [0xc3]
;
; X64-SSE-LABEL: fpext_frommem8:
; X64-AVX512VL: # BB#0: # %entry
; X64-AVX512VL-NEXT: vcvtps2pd (%rdi), %zmm0 # encoding: [0x62,0xf1,0x7c,0x48,0x5a,0x07]
; X64-AVX512VL-NEXT: vmovups %zmm0, (%rsi) # encoding: [0x62,0xf1,0x7c,0x48,0x11,0x06]
+; X64-AVX512VL-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77]
; X64-AVX512VL-NEXT: retq # encoding: [0xc3]
entry:
%0 = load <8 x float>, <8 x float>* %in
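; NOTE: The fpext tests check exact instruction encodings, so the inserted
; line carries one as well: 0xc5,0xf8,0x77 is the two-byte-VEX encoding of
; vzeroupper, and it is the same three bytes whether or not the surrounding
; code was EVEX-to-VEX compressed.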
; AVX512DQ-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<def>
; AVX512DQ-NEXT: vcvtqq2pd %zmm0, %zmm0
; AVX512DQ-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
+; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
; AVX512VLDQ-LABEL: sitofp_2i64_to_2f64:
; SSE-NEXT: cvtdq2pd %xmm0, %xmm0
; SSE-NEXT: retq
;
-; VEX-LABEL: sitofp_4i32_to_2f64:
-; VEX: # BB#0:
-; VEX-NEXT: vcvtdq2pd %xmm0, %ymm0
-; VEX-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
-; VEX-NEXT: vzeroupper
-; VEX-NEXT: retq
-;
-; AVX512-LABEL: sitofp_4i32_to_2f64:
-; AVX512: # BB#0:
-; AVX512-NEXT: vcvtdq2pd %xmm0, %ymm0
-; AVX512-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
-; AVX512-NEXT: retq
+; AVX-LABEL: sitofp_4i32_to_2f64:
+; AVX: # BB#0:
+; AVX-NEXT: vcvtdq2pd %xmm0, %ymm0
+; AVX-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX-NEXT: vzeroupper
+; AVX-NEXT: retq
%cvt = sitofp <4 x i32> %a to <4 x double>
%shuf = shufflevector <4 x double> %cvt, <4 x double> undef, <2 x i32> <i32 0, i32 1>
ret <2 x double> %shuf
; AVX512-NEXT: vpmovsxwd %xmm0, %ymm0
; AVX512-NEXT: vcvtdq2pd %ymm0, %zmm0
; AVX512-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
+; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
%cvt = sitofp <8 x i16> %a to <8 x double>
%shuf = shufflevector <8 x double> %cvt, <8 x double> undef, <2 x i32> <i32 0, i32 1>
; AVX512-NEXT: vpmovsxbd %xmm0, %zmm0
; AVX512-NEXT: vcvtdq2pd %ymm0, %zmm0
; AVX512-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
+; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
%cvt = sitofp <16 x i8> %a to <16 x double>
%shuf = shufflevector <16 x double> %cvt, <16 x double> undef, <2 x i32> <i32 0, i32 1>
; AVX512DQ-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<def>
; AVX512DQ-NEXT: vcvtuqq2pd %zmm0, %zmm0
; AVX512DQ-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
+; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
; AVX512VLDQ-LABEL: uitofp_2i64_to_2f64:
; AVX512F-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<def>
; AVX512F-NEXT: vcvtudq2pd %ymm0, %zmm0
; AVX512F-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
+; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: uitofp_2i32_to_2f64:
; AVX512DQ-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<def>
; AVX512DQ-NEXT: vcvtudq2pd %ymm0, %zmm0
; AVX512DQ-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
+; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
; AVX512VLDQ-LABEL: uitofp_2i32_to_2f64:
; AVX512F-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<def>
; AVX512F-NEXT: vcvtudq2pd %ymm0, %zmm0
; AVX512F-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
+; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: uitofp_4i32_to_2f64:
; AVX512VL: # BB#0:
; AVX512VL-NEXT: vcvtudq2pd %xmm0, %ymm0
; AVX512VL-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512DQ-LABEL: uitofp_4i32_to_2f64:
; AVX512DQ-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<def>
; AVX512DQ-NEXT: vcvtudq2pd %ymm0, %zmm0
; AVX512DQ-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
+; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
; AVX512VLDQ-LABEL: uitofp_4i32_to_2f64:
; AVX512VLDQ: # BB#0:
; AVX512VLDQ-NEXT: vcvtudq2pd %xmm0, %ymm0
; AVX512VLDQ-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512VLDQ-NEXT: vzeroupper
; AVX512VLDQ-NEXT: retq
%cvt = uitofp <4 x i32> %a to <4 x double>
%shuf = shufflevector <4 x double> %cvt, <4 x double> undef, <2 x i32> <i32 0, i32 1>
; AVX512-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX512-NEXT: vcvtdq2pd %ymm0, %zmm0
; AVX512-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
+; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
%cvt = uitofp <8 x i16> %a to <8 x double>
%shuf = shufflevector <8 x double> %cvt, <8 x double> undef, <2 x i32> <i32 0, i32 1>
; AVX512-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
; AVX512-NEXT: vcvtdq2pd %ymm0, %zmm0
; AVX512-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
+; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
%cvt = uitofp <16 x i8> %a to <16 x double>
%shuf = shufflevector <16 x double> %cvt, <16 x double> undef, <2 x i32> <i32 0, i32 1>
; AVX512DQ-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<def>
; AVX512DQ-NEXT: vcvtqq2ps %zmm0, %ymm0
; AVX512DQ-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
; AVX512VLDQ-LABEL: sitofp_2i64_to_4f32:
; AVX512DQ-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<def>
; AVX512DQ-NEXT: vcvtqq2ps %zmm0, %ymm0
; AVX512DQ-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
+; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
; AVX512VLDQ-LABEL: sitofp_2i64_to_4f32_zero:
; AVX512DQ-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<def>
; AVX512DQ-NEXT: vcvtqq2ps %zmm0, %ymm0
; AVX512DQ-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
; AVX512VLDQ-LABEL: sitofp_4i64_to_4f32_undef:
; AVX512VLDQ: # BB#0:
; AVX512VLDQ-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<def>
; AVX512VLDQ-NEXT: vcvtqq2ps %ymm0, %xmm0
+; AVX512VLDQ-NEXT: vzeroupper
; AVX512VLDQ-NEXT: retq
%ext = shufflevector <2 x i64> %a, <2 x i64> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
%cvt = sitofp <4 x i64> %ext to <4 x float>
; AVX512-NEXT: vpmovsxwd %xmm0, %ymm0
; AVX512-NEXT: vcvtdq2ps %ymm0, %ymm0
; AVX512-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
%cvt = sitofp <8 x i16> %a to <8 x float>
%shuf = shufflevector <8 x float> %cvt, <8 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
; AVX512-NEXT: vpmovsxbd %xmm0, %zmm0
; AVX512-NEXT: vcvtdq2ps %zmm0, %zmm0
; AVX512-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
+; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
%cvt = sitofp <16 x i8> %a to <16 x float>
%shuf = shufflevector <16 x float> %cvt, <16 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
; AVX512F-NEXT: vpextrq $1, %xmm0, %rax
; AVX512F-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm0
; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
+; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: sitofp_4i64_to_4f32:
; AVX512VL-NEXT: vpextrq $1, %xmm0, %rax
; AVX512VL-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm0
; AVX512VL-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
+; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512DQ-LABEL: sitofp_4i64_to_4f32:
; AVX512DQ-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
; AVX512DQ-NEXT: vcvtqq2ps %zmm0, %ymm0
; AVX512DQ-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
; AVX512VLDQ-LABEL: sitofp_4i64_to_4f32:
; AVX512VLDQ: # BB#0:
; AVX512VLDQ-NEXT: vcvtqq2ps %ymm0, %xmm0
+; AVX512VLDQ-NEXT: vzeroupper
; AVX512VLDQ-NEXT: retq
%cvt = sitofp <4 x i64> %a to <4 x float>
ret <4 x float> %cvt
; AVX512DQ-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<def>
; AVX512DQ-NEXT: vcvtuqq2ps %zmm0, %ymm0
; AVX512DQ-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
; AVX512VLDQ-LABEL: uitofp_2i64_to_4f32:
; AVX512DQ-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<def>
; AVX512DQ-NEXT: vcvtuqq2ps %zmm0, %ymm0
; AVX512DQ-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
+; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
; AVX512VLDQ-LABEL: uitofp_2i64_to_2f32:
; AVX512DQ-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<def>
; AVX512DQ-NEXT: vcvtuqq2ps %zmm0, %ymm0
; AVX512DQ-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
; AVX512VLDQ-LABEL: uitofp_4i64_to_4f32_undef:
; AVX512VLDQ: # BB#0:
; AVX512VLDQ-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<def>
; AVX512VLDQ-NEXT: vcvtuqq2ps %ymm0, %xmm0
+; AVX512VLDQ-NEXT: vzeroupper
; AVX512VLDQ-NEXT: retq
%ext = shufflevector <2 x i64> %a, <2 x i64> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
%cvt = uitofp <4 x i64> %ext to <4 x float>
; AVX512F-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<def>
; AVX512F-NEXT: vcvtudq2ps %zmm0, %zmm0
; AVX512F-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
+; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: uitofp_4i32_to_4f32:
; AVX512DQ-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<def>
; AVX512DQ-NEXT: vcvtudq2ps %zmm0, %zmm0
; AVX512DQ-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
+; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
; AVX512VLDQ-LABEL: uitofp_4i32_to_4f32:
; AVX512-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX512-NEXT: vcvtdq2ps %ymm0, %ymm0
; AVX512-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
%cvt = uitofp <8 x i16> %a to <8 x float>
%shuf = shufflevector <8 x float> %cvt, <8 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
; AVX512-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
; AVX512-NEXT: vcvtdq2ps %zmm0, %zmm0
; AVX512-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
+; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
%cvt = uitofp <16 x i8> %a to <16 x float>
%shuf = shufflevector <16 x float> %cvt, <16 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
; AVX512F-NEXT: vpextrq $1, %xmm0, %rax
; AVX512F-NEXT: vcvtusi2ssq %rax, %xmm3, %xmm0
; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
+; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: uitofp_4i64_to_4f32:
; AVX512VL-NEXT: vpextrq $1, %xmm0, %rax
; AVX512VL-NEXT: vcvtusi2ssq %rax, %xmm3, %xmm0
; AVX512VL-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
+; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512DQ-LABEL: uitofp_4i64_to_4f32:
; AVX512DQ-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
; AVX512DQ-NEXT: vcvtuqq2ps %zmm0, %ymm0
; AVX512DQ-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
; AVX512VLDQ-LABEL: uitofp_4i64_to_4f32:
; AVX512VLDQ: # BB#0:
; AVX512VLDQ-NEXT: vcvtuqq2ps %ymm0, %xmm0
+; AVX512VLDQ-NEXT: vzeroupper
; AVX512VLDQ-NEXT: retq
%cvt = uitofp <4 x i64> %a to <4 x float>
ret <4 x float> %cvt
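; NOTE: Without DQ, the 4i64-to-4f32 cases are scalarized through vpextrq
; and vcvt(u)si2ssq, but the source vector still arrives in %ymm0, so the
; upper state is dirty by the time the final vinsertps assembles the XMM
; result, and the vzeroupper before retq is needed here too.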
; AVX512DQ-NEXT: vmovaps (%rdi), %xmm0
; AVX512DQ-NEXT: vcvtqq2pd %zmm0, %zmm0
; AVX512DQ-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
+; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
; AVX512VLDQ-LABEL: sitofp_load_2i64_to_2f64:
; AVX512DQ-NEXT: vmovaps (%rdi), %xmm0
; AVX512DQ-NEXT: vcvtuqq2pd %zmm0, %zmm0
; AVX512DQ-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
+; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
; AVX512VLDQ-LABEL: uitofp_load_2i64_to_2f64:
; AVX512F-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
; AVX512F-NEXT: vcvtudq2pd %ymm0, %zmm0
; AVX512F-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
+; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: uitofp_load_2i32_to_2f64:
; AVX512DQ-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
; AVX512DQ-NEXT: vcvtudq2pd %ymm0, %zmm0
; AVX512DQ-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
+; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
; AVX512VLDQ-LABEL: uitofp_load_2i32_to_2f64:
; AVX512F-NEXT: vpextrq $1, %xmm0, %rax
; AVX512F-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm0
; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
+; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: sitofp_load_4i64_to_4f32:
; AVX512VL-NEXT: vpextrq $1, %xmm0, %rax
; AVX512VL-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm0
; AVX512VL-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
+; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512DQ-LABEL: sitofp_load_4i64_to_4f32:
; AVX512DQ-NEXT: vmovaps (%rdi), %ymm0
; AVX512DQ-NEXT: vcvtqq2ps %zmm0, %ymm0
; AVX512DQ-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
; AVX512VLDQ-LABEL: sitofp_load_4i64_to_4f32:
; AVX512F-NEXT: vpextrq $1, %xmm0, %rax
; AVX512F-NEXT: vcvtusi2ssq %rax, %xmm3, %xmm0
; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
+; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: uitofp_load_4i64_to_4f32:
; AVX512VL-NEXT: vpextrq $1, %xmm0, %rax
; AVX512VL-NEXT: vcvtusi2ssq %rax, %xmm3, %xmm0
; AVX512VL-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
+; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512DQ-LABEL: uitofp_load_4i64_to_4f32:
; AVX512DQ-NEXT: vmovaps (%rdi), %ymm0
; AVX512DQ-NEXT: vcvtuqq2ps %zmm0, %ymm0
; AVX512DQ-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
; AVX512VLDQ-LABEL: uitofp_load_4i64_to_4f32:
; AVX512F-NEXT: vmovaps (%rdi), %xmm0
; AVX512F-NEXT: vcvtudq2ps %zmm0, %zmm0
; AVX512F-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
+; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: uitofp_load_4i32_to_4f32:
; AVX512DQ-NEXT: vmovaps (%rdi), %xmm0
; AVX512DQ-NEXT: vcvtudq2ps %zmm0, %zmm0
; AVX512DQ-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
+; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
; AVX512VLDQ-LABEL: uitofp_load_4i32_to_4f32:
; AVX512-NEXT: vpmovsxwd 8(%rdi), %ymm0
; AVX512-NEXT: vcvtdq2ps %ymm0, %ymm0
; AVX512-NEXT: vmovaps %ymm0, (%rax)
+; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
%1 = load %Arguments, %Arguments* %a0, align 1
%2 = extractvalue %Arguments %1, 1
; AVX512F-NEXT: # kill
; AVX512F-NEXT: vcvtudq2ps %zmm0, %zmm0
; AVX512F-NEXT: # kill
+; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: test_uitofp_v4i32_to_v4f32:
; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX512-NEXT: vpand %ymm1, %ymm0, %ymm0
; AVX512-NEXT: vmovq %xmm0, %rax
+; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
%c = fcmp ogt <4 x double> %a0, %a1
%s = sext <4 x i1> %c to <4 x i64>
; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vmovd %xmm0, %eax
; AVX512-NEXT: cltq
+; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
%c = fcmp ogt <4 x double> %a0, %a1
%s = sext <4 x i1> %c to <4 x i32>
; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX512-NEXT: vpand %ymm1, %ymm0, %ymm0
; AVX512-NEXT: vmovd %xmm0, %eax
+; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
%c = fcmp ogt <8 x float> %a0, %a1
%s = sext <8 x i1> %c to <8 x i32>
; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vmovd %xmm0, %eax
; AVX512-NEXT: cwtl
+; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
%c = fcmp ogt <8 x float> %a0, %a1
%s = sext <8 x i1> %c to <8 x i16>
; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX512-NEXT: vpand %ymm1, %ymm0, %ymm0
; AVX512-NEXT: vmovq %xmm0, %rax
+; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
%c = icmp sgt <4 x i64> %a0, %a1
%s = sext <4 x i1> %c to <4 x i64>
; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vmovd %xmm0, %eax
; AVX512-NEXT: cltq
+; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
%c = icmp sgt <4 x i64> %a0, %a1
%s = sext <4 x i1> %c to <4 x i32>
; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX512-NEXT: vpand %ymm1, %ymm0, %ymm0
; AVX512-NEXT: vmovd %xmm0, %eax
+; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
%c = icmp sgt <8 x i32> %a0, %a1
%s = sext <8 x i1> %c to <8 x i32>
; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vmovd %xmm0, %eax
; AVX512-NEXT: cwtl
+; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
%c = icmp sgt <8 x i32> %a0, %a1
%s = sext <8 x i1> %c to <8 x i16>
; AVX512-NEXT: vpand %ymm1, %ymm0, %ymm0
; AVX512-NEXT: vmovd %xmm0, %eax
; AVX512-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
%c = icmp sgt <16 x i16> %a0, %a1
%s = sext <16 x i1> %c to <16 x i16>
; AVX512-NEXT: vpextrb $0, %xmm0, %eax
; AVX512-NEXT: movsbl %al, %eax
; AVX512-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
%c = icmp sgt <16 x i16> %a0, %a1
%s = sext <16 x i1> %c to <16 x i8>
; AVX512-NEXT: vpand %ymm1, %ymm0, %ymm0
; AVX512-NEXT: vpextrb $0, %xmm0, %eax
; AVX512-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
%c = icmp sgt <32 x i8> %a0, %a1
%s = sext <32 x i1> %c to <32 x i8>
; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX512-NEXT: vpor %ymm1, %ymm0, %ymm0
; AVX512-NEXT: vmovq %xmm0, %rax
+; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
%c = fcmp ogt <4 x double> %a0, %a1
%s = sext <4 x i1> %c to <4 x i64>
; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vmovd %xmm0, %eax
; AVX512-NEXT: cltq
+; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
%c = fcmp ogt <4 x double> %a0, %a1
%s = sext <4 x i1> %c to <4 x i32>
; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX512-NEXT: vpor %ymm1, %ymm0, %ymm0
; AVX512-NEXT: vmovd %xmm0, %eax
+; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
%c = fcmp ogt <8 x float> %a0, %a1
%s = sext <8 x i1> %c to <8 x i32>
; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vmovd %xmm0, %eax
; AVX512-NEXT: cwtl
+; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
%c = fcmp ogt <8 x float> %a0, %a1
%s = sext <8 x i1> %c to <8 x i16>
; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX512-NEXT: vpor %ymm1, %ymm0, %ymm0
; AVX512-NEXT: vmovq %xmm0, %rax
+; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
%c = icmp sgt <4 x i64> %a0, %a1
%s = sext <4 x i1> %c to <4 x i64>
; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vmovd %xmm0, %eax
; AVX512-NEXT: cltq
+; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
%c = icmp sgt <4 x i64> %a0, %a1
%s = sext <4 x i1> %c to <4 x i32>
; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX512-NEXT: vpor %ymm1, %ymm0, %ymm0
; AVX512-NEXT: vmovd %xmm0, %eax
+; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
%c = icmp sgt <8 x i32> %a0, %a1
%s = sext <8 x i1> %c to <8 x i32>
; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vmovd %xmm0, %eax
; AVX512-NEXT: cwtl
+; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
%c = icmp sgt <8 x i32> %a0, %a1
%s = sext <8 x i1> %c to <8 x i16>
; AVX512-NEXT: vpor %ymm1, %ymm0, %ymm0
; AVX512-NEXT: vmovd %xmm0, %eax
; AVX512-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
%c = icmp sgt <16 x i16> %a0, %a1
%s = sext <16 x i1> %c to <16 x i16>
; AVX512-NEXT: vpextrb $0, %xmm0, %eax
; AVX512-NEXT: movsbl %al, %eax
; AVX512-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
%c = icmp sgt <16 x i16> %a0, %a1
%s = sext <16 x i1> %c to <16 x i8>
; AVX512-NEXT: vpor %ymm1, %ymm0, %ymm0
; AVX512-NEXT: vpextrb $0, %xmm0, %eax
; AVX512-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
%c = icmp sgt <32 x i8> %a0, %a1
%s = sext <32 x i1> %c to <32 x i8>
; AVX512-NEXT: vcmpltpd %ymm0, %ymm1, %ymm0
; AVX512-NEXT: vpmovqd %zmm0, %ymm0
; AVX512-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
%1 = fcmp ogt <4 x double> %a0, %a1
ret <4 x i1> %1
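; NOTE: In the compare-result tests the value that survives to the return is
; only XMM-sized, but it is produced by a 256-bit compare plus a vpmov*
; narrowing through %zmm0, so these functions also pick up a vzeroupper
; ahead of retq.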
; AVX512-NEXT: vcmpltps %ymm0, %ymm1, %ymm0
; AVX512-NEXT: vpmovdw %zmm0, %ymm0
; AVX512-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
%1 = fcmp ogt <8 x float> %a0, %a1
ret <8 x i1> %1
; AVX512-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm0
; AVX512-NEXT: vpmovqd %zmm0, %ymm0
; AVX512-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
%1 = icmp sgt <4 x i64> %a0, %a1
ret <4 x i1> %1
; AVX512-NEXT: vpcmpgtd %ymm1, %ymm0, %ymm0
; AVX512-NEXT: vpmovdw %zmm0, %ymm0
; AVX512-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
%1 = icmp sgt <8 x i32> %a0, %a1
ret <8 x i1> %1
; AVX512F-NEXT: vpcmpgtw %ymm1, %ymm0, %ymm0
; AVX512F-NEXT: vpmovsxwd %ymm0, %zmm0
; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512DQ-LABEL: test_cmp_v16i16:
; AVX512DQ-NEXT: vpcmpgtw %ymm1, %ymm0, %ymm0
; AVX512DQ-NEXT: vpmovsxwd %ymm0, %zmm0
; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
; AVX512BW-LABEL: test_cmp_v16i16:
; AVX512BW-NEXT: vpcmpgtw %ymm1, %ymm0, %ymm0
; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
; AVX512BW-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
%1 = icmp sgt <16 x i16> %a0, %a1
ret <16 x i1> %1
; AVX512F-NEXT: vcmpltpd %zmm0, %zmm1, %k1
; AVX512F-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; AVX512F-NEXT: vpmovqw %zmm0, %xmm0
+; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512DQ-LABEL: test_cmp_v8f64:
; AVX512DQ-NEXT: vcmpltpd %zmm0, %zmm1, %k0
; AVX512DQ-NEXT: vpmovm2q %k0, %zmm0
; AVX512DQ-NEXT: vpmovqw %zmm0, %xmm0
+; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
; AVX512BW-LABEL: test_cmp_v8f64:
; AVX512BW-NEXT: vcmpltpd %zmm0, %zmm1, %k1
; AVX512BW-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; AVX512BW-NEXT: vpmovqw %zmm0, %xmm0
+; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
%1 = fcmp ogt <8 x double> %a0, %a1
ret <8 x i1> %1
; AVX512F-NEXT: vcmpltps %zmm0, %zmm1, %k1
; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512DQ-LABEL: test_cmp_v16f32:
; AVX512DQ-NEXT: vcmpltps %zmm0, %zmm1, %k0
; AVX512DQ-NEXT: vpmovm2d %k0, %zmm0
; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
; AVX512BW-LABEL: test_cmp_v16f32:
; AVX512BW-NEXT: vcmpltps %zmm0, %zmm1, %k1
; AVX512BW-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; AVX512BW-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
%1 = fcmp ogt <16 x float> %a0, %a1
ret <16 x i1> %1
; AVX512F-NEXT: vpcmpgtq %zmm1, %zmm0, %k1
; AVX512F-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; AVX512F-NEXT: vpmovqw %zmm0, %xmm0
+; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512DQ-LABEL: test_cmp_v8i64:
; AVX512DQ-NEXT: vpcmpgtq %zmm1, %zmm0, %k0
; AVX512DQ-NEXT: vpmovm2q %k0, %zmm0
; AVX512DQ-NEXT: vpmovqw %zmm0, %xmm0
+; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
; AVX512BW-LABEL: test_cmp_v8i64:
; AVX512BW-NEXT: vpcmpgtq %zmm1, %zmm0, %k1
; AVX512BW-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; AVX512BW-NEXT: vpmovqw %zmm0, %xmm0
+; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
%1 = icmp sgt <8 x i64> %a0, %a1
ret <8 x i1> %1
; AVX512F-NEXT: vpcmpgtd %zmm1, %zmm0, %k1
; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512DQ-LABEL: test_cmp_v16i32:
; AVX512DQ-NEXT: vpcmpgtd %zmm1, %zmm0, %k0
; AVX512DQ-NEXT: vpmovm2d %k0, %zmm0
; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
; AVX512BW-LABEL: test_cmp_v16i32:
; AVX512BW-NEXT: vpcmpgtd %zmm1, %zmm0, %k1
; AVX512BW-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; AVX512BW-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
%1 = icmp sgt <16 x i32> %a0, %a1
ret <16 x i1> %1
; AVX512F-NEXT: vextracti128 $1, %ymm4, %xmm3
; AVX512F-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
; AVX512F-NEXT: vmovdqa %xmm4, %xmm2
+; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512DQ-LABEL: test_cmp_v64i8:
; AVX512DQ-NEXT: vextracti128 $1, %ymm4, %xmm3
; AVX512DQ-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
; AVX512DQ-NEXT: vmovdqa %xmm4, %xmm2
+; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
; AVX512BW-LABEL: test_cmp_v64i8:
; AVX512F-NEXT: vpmovqd %zmm1, %ymm1
; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512DQ-LABEL: test_cmp_v16f64:
; AVX512DQ-NEXT: vpmovqd %zmm1, %ymm1
; AVX512DQ-NEXT: vinserti32x8 $1, %ymm1, %zmm0, %zmm0
; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
; AVX512BW-LABEL: test_cmp_v16f64:
; AVX512BW-NEXT: vpmovqd %zmm1, %ymm1
; AVX512BW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512BW-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
%1 = fcmp ogt <16 x double> %a0, %a1
ret <16 x i1> %1
; AVX512F-NEXT: vpmovqd %zmm1, %ymm1
; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512DQ-LABEL: test_cmp_v16i64:
; AVX512DQ-NEXT: vpmovqd %zmm1, %ymm1
; AVX512DQ-NEXT: vinserti32x8 $1, %ymm1, %zmm0, %zmm0
; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
; AVX512BW-LABEL: test_cmp_v16i64:
; AVX512BW-NEXT: vpmovqd %zmm1, %ymm1
; AVX512BW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512BW-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
%1 = icmp sgt <16 x i64> %a0, %a1
ret <16 x i1> %1
; AVX512F-NEXT: vextracti128 $1, %ymm2, %xmm3
; AVX512F-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
; AVX512F-NEXT: # kill: %XMM2<def> %XMM2<kill> %YMM2<kill>
+; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512DQ-LABEL: test_cmp_v64i16:
; AVX512DQ-NEXT: vextracti128 $1, %ymm2, %xmm3
; AVX512DQ-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
; AVX512DQ-NEXT: # kill: %XMM2<def> %XMM2<kill> %YMM2<kill>
+; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
; AVX512BW-LABEL: test_cmp_v64i16:
; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k0
; AVX512F-NEXT: kmovw %k0, (%rdi)
; AVX512F-NEXT: movq %rdi, %rax
+; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512DQ-LABEL: test_cmp_v128i8:
; AVX512DQ-NEXT: vptestmd %zmm0, %zmm0, %k0
; AVX512DQ-NEXT: kmovw %k0, (%rdi)
; AVX512DQ-NEXT: movq %rdi, %rax
+; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
; AVX512BW-LABEL: test_cmp_v128i8:
; AVX512F-NEXT: vmovd %eax, %xmm0
; AVX512F-NEXT: vcvtph2ps %ymm0, %zmm0
; AVX512F-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
+; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: cvt_i16_to_f32:
; AVX512F-NEXT: vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3]
; AVX512F-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3]
; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
+; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: cvt_4i16_to_4f32:
; AVX512F-NEXT: vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3]
; AVX512F-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3]
; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
+; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: cvt_8i16_to_4f32:
; AVX512F-NEXT: vmovd %eax, %xmm0
; AVX512F-NEXT: vcvtph2ps %ymm0, %zmm0
; AVX512F-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
+; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: load_cvt_i16_to_f32:
; AVX512F-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[2,3]
; AVX512F-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3]
; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
+; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: load_cvt_4i16_to_4f32:
; AVX512F-NEXT: vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3]
; AVX512F-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3]
; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
+; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: load_cvt_8i16_to_4f32:
; AVX512F-NEXT: vmovd %eax, %xmm0
; AVX512F-NEXT: vcvtph2ps %ymm0, %zmm0
; AVX512F-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0
+; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: cvt_i16_to_f64:
; AVX512F-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1
; AVX512F-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0
; AVX512F-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: cvt_2i16_to_2f64:
; AVX512F-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1
; AVX512F-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0
; AVX512F-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: cvt_8i16_to_2f64:
; AVX512F-NEXT: vmovd %eax, %xmm0
; AVX512F-NEXT: vcvtph2ps %ymm0, %zmm0
; AVX512F-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0
+; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: load_cvt_i16_to_f64:
; AVX512F-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1
; AVX512F-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0
; AVX512F-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: load_cvt_2i16_to_2f64:
; AVX512F-NEXT: vcvtps2ph $4, %zmm0, %ymm0
; AVX512F-NEXT: vmovd %xmm0, %eax
; AVX512F-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: cvt_f32_to_i16:
; AVX512F-NEXT: shlq $32, %rdx
; AVX512F-NEXT: orq %rcx, %rdx
; AVX512F-NEXT: vmovq %rdx, %xmm0
+; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: cvt_4f32_to_4i16:
; AVX512F-NEXT: orq %rcx, %rdx
; AVX512F-NEXT: vmovq %rdx, %xmm0
; AVX512F-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
+; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: cvt_4f32_to_8i16_undef:
; AVX512F-NEXT: orq %rcx, %rdx
; AVX512F-NEXT: vmovq %rdx, %xmm0
; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,u,u,u,u],zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: cvt_4f32_to_8i16_zero:
; AVX512F-NEXT: vmovq %rsi, %xmm0
; AVX512F-NEXT: vmovq %rax, %xmm1
; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: cvt_8f32_to_8i16:
; AVX512VL-NEXT: vmovq %rsi, %xmm0
; AVX512VL-NEXT: vmovq %rax, %xmm1
; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
%1 = fptrunc <8 x float> %a0 to <8 x half>
%2 = bitcast <8 x half> %1 to <8 x i16>
; AVX512F-NEXT: vcvtps2ph $4, %zmm0, %ymm0
; AVX512F-NEXT: vmovd %xmm0, %eax
; AVX512F-NEXT: movw %ax, (%rdi)
+; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: store_cvt_f32_to_i16:
; AVX512F-NEXT: movw %dx, 6(%rdi)
; AVX512F-NEXT: movw %cx, 4(%rdi)
; AVX512F-NEXT: movw %ax, 2(%rdi)
+; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: store_cvt_4f32_to_4i16:
; AVX512F-NEXT: vmovq %rdx, %xmm0
; AVX512F-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
; AVX512F-NEXT: vmovdqa %xmm0, (%rdi)
+; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: store_cvt_4f32_to_8i16_undef:
; AVX512F-NEXT: vmovq %rdx, %xmm0
; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,u,u,u,u],zero,zero,zero,zero,zero,zero,zero,zero
; AVX512F-NEXT: vmovdqa %xmm0, (%rdi)
+; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: store_cvt_4f32_to_8i16_zero:
; AVX512F-NEXT: movw %r10w, 6(%rdi)
; AVX512F-NEXT: movw %r9w, 4(%rdi)
; AVX512F-NEXT: movw %r8w, 2(%rdi)
+; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: store_cvt_8f32_to_8i16:
; AVX512VL-NEXT: movw %r10w, 6(%rdi)
; AVX512VL-NEXT: movw %r9w, 4(%rdi)
; AVX512VL-NEXT: movw %r8w, 2(%rdi)
+; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
%1 = fptrunc <8 x float> %a0 to <8 x half>
%2 = bitcast <8 x half> %1 to <8 x i16>
; AVX512F-NEXT: movw %ax, 4(%rdi)
; AVX512F-NEXT: vmovd %xmm4, %eax
; AVX512F-NEXT: movw %ax, 2(%rdi)
+; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: store_cvt_16f32_to_16i16:
; AVX512VL-NEXT: movw %ax, 4(%rdi)
; AVX512VL-NEXT: vmovd %xmm4, %eax
; AVX512VL-NEXT: movw %ax, 2(%rdi)
+; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
%1 = fptrunc <16 x float> %a0 to <16 x half>
%2 = bitcast <16 x half> %1 to <16 x i16>
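; NOTE: The half-conversion tests below show the other insertion point: when
; the upper YMM/ZMM state is dirty at a call site and the callee (here the
; __truncdfhf2 libcall, which takes a double in %xmm0) uses no wide vector
; registers, the vzeroupper is emitted before the callq rather than before a
; return. The 32-byte spill and reload around each call are preexisting;
; only the vzeroupper lines are new.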
; AVX512F-NEXT: subq $40, %rsp
; AVX512F-NEXT: vmovupd %ymm0, (%rsp) # 32-byte Spill
; AVX512F-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: callq __truncdfhf2
; AVX512F-NEXT: movw %ax, %bx
; AVX512F-NEXT: shll $16, %ebx
; AVX512F-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload
; AVX512F-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: callq __truncdfhf2
; AVX512F-NEXT: movzwl %ax, %r14d
; AVX512F-NEXT: orl %ebx, %r14d
; AVX512F-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX512F-NEXT: vmovapd %xmm0, (%rsp) # 16-byte Spill
; AVX512F-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: callq __truncdfhf2
; AVX512F-NEXT: movw %ax, %bx
; AVX512F-NEXT: shll $16, %ebx
; AVX512VL-NEXT: subq $40, %rsp
; AVX512VL-NEXT: vmovupd %ymm0, (%rsp) # 32-byte Spill
; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: callq __truncdfhf2
; AVX512VL-NEXT: movw %ax, %bx
; AVX512VL-NEXT: shll $16, %ebx
; AVX512VL-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload
; AVX512VL-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: callq __truncdfhf2
; AVX512VL-NEXT: movzwl %ax, %r14d
; AVX512VL-NEXT: orl %ebx, %r14d
; AVX512VL-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX512VL-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: callq __truncdfhf2
; AVX512VL-NEXT: movw %ax, %bx
; AVX512VL-NEXT: shll $16, %ebx
; AVX512F-NEXT: subq $40, %rsp
; AVX512F-NEXT: vmovupd %ymm0, (%rsp) # 32-byte Spill
; AVX512F-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: callq __truncdfhf2
; AVX512F-NEXT: movw %ax, %bx
; AVX512F-NEXT: shll $16, %ebx
; AVX512F-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload
; AVX512F-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: callq __truncdfhf2
; AVX512F-NEXT: movzwl %ax, %r14d
; AVX512F-NEXT: orl %ebx, %r14d
; AVX512F-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX512F-NEXT: vmovapd %xmm0, (%rsp) # 16-byte Spill
; AVX512F-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: callq __truncdfhf2
; AVX512F-NEXT: movw %ax, %bx
; AVX512F-NEXT: shll $16, %ebx
; AVX512VL-NEXT: subq $40, %rsp
; AVX512VL-NEXT: vmovupd %ymm0, (%rsp) # 32-byte Spill
; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: callq __truncdfhf2
; AVX512VL-NEXT: movw %ax, %bx
; AVX512VL-NEXT: shll $16, %ebx
; AVX512VL-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload
; AVX512VL-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: callq __truncdfhf2
; AVX512VL-NEXT: movzwl %ax, %r14d
; AVX512VL-NEXT: orl %ebx, %r14d
; AVX512VL-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX512VL-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: callq __truncdfhf2
; AVX512VL-NEXT: movw %ax, %bx
; AVX512VL-NEXT: shll $16, %ebx
; AVX512F-NEXT: subq $40, %rsp
; AVX512F-NEXT: vmovupd %ymm0, (%rsp) # 32-byte Spill
; AVX512F-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: callq __truncdfhf2
; AVX512F-NEXT: movw %ax, %bx
; AVX512F-NEXT: shll $16, %ebx
; AVX512F-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload
; AVX512F-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: callq __truncdfhf2
; AVX512F-NEXT: movzwl %ax, %r14d
; AVX512F-NEXT: orl %ebx, %r14d
; AVX512F-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX512F-NEXT: vmovapd %xmm0, (%rsp) # 16-byte Spill
; AVX512F-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: callq __truncdfhf2
; AVX512F-NEXT: movw %ax, %bx
; AVX512F-NEXT: shll $16, %ebx
; AVX512VL-NEXT: subq $40, %rsp
; AVX512VL-NEXT: vmovupd %ymm0, (%rsp) # 32-byte Spill
; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: callq __truncdfhf2
; AVX512VL-NEXT: movw %ax, %bx
; AVX512VL-NEXT: shll $16, %ebx
; AVX512VL-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload
; AVX512VL-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: callq __truncdfhf2
; AVX512VL-NEXT: movzwl %ax, %r14d
; AVX512VL-NEXT: orl %ebx, %r14d
; AVX512VL-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX512VL-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: callq __truncdfhf2
; AVX512VL-NEXT: movw %ax, %bx
; AVX512VL-NEXT: shll $16, %ebx
; AVX512F-NEXT: subq $96, %rsp
; AVX512F-NEXT: vmovupd %zmm0, (%rsp) # 64-byte Spill
; AVX512F-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: callq __truncdfhf2
; AVX512F-NEXT: movw %ax, %bx
; AVX512F-NEXT: shll $16, %ebx
; AVX512F-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload
; AVX512F-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
+; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: callq __truncdfhf2
; AVX512F-NEXT: movzwl %ax, %r15d
; AVX512F-NEXT: orl %ebx, %r15d
; AVX512F-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX512F-NEXT: vmovapd %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
; AVX512F-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: callq __truncdfhf2
; AVX512F-NEXT: movw %ax, %bx
; AVX512F-NEXT: shll $16, %ebx
; AVX512F-NEXT: vextractf64x4 $1, %zmm0, %ymm0
; AVX512F-NEXT: vmovupd %ymm0, (%rsp) # 32-byte Spill
; AVX512F-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: callq __truncdfhf2
; AVX512F-NEXT: movw %ax, %bx
; AVX512F-NEXT: shll $16, %ebx
; AVX512F-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload
; AVX512F-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: callq __truncdfhf2
; AVX512F-NEXT: movzwl %ax, %r15d
; AVX512F-NEXT: orl %ebx, %r15d
; AVX512F-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX512F-NEXT: vmovapd %xmm0, (%rsp) # 16-byte Spill
; AVX512F-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: callq __truncdfhf2
; AVX512F-NEXT: movw %ax, %bx
; AVX512F-NEXT: shll $16, %ebx
; AVX512VL-NEXT: subq $96, %rsp
; AVX512VL-NEXT: vmovupd %zmm0, (%rsp) # 64-byte Spill
; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: callq __truncdfhf2
; AVX512VL-NEXT: movw %ax, %bx
; AVX512VL-NEXT: shll $16, %ebx
; AVX512VL-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload
; AVX512VL-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
+; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: callq __truncdfhf2
; AVX512VL-NEXT: movzwl %ax, %r15d
; AVX512VL-NEXT: orl %ebx, %r15d
; AVX512VL-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX512VL-NEXT: vmovaps %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: callq __truncdfhf2
; AVX512VL-NEXT: movw %ax, %bx
; AVX512VL-NEXT: shll $16, %ebx
; AVX512VL-NEXT: vextractf64x4 $1, %zmm0, %ymm0
; AVX512VL-NEXT: vmovupd %ymm0, (%rsp) # 32-byte Spill
; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: callq __truncdfhf2
; AVX512VL-NEXT: movw %ax, %bx
; AVX512VL-NEXT: shll $16, %ebx
; AVX512VL-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload
; AVX512VL-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: callq __truncdfhf2
; AVX512VL-NEXT: movzwl %ax, %r15d
; AVX512VL-NEXT: orl %ebx, %r15d
; AVX512VL-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX512VL-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: callq __truncdfhf2
; AVX512VL-NEXT: movw %ax, %bx
; AVX512VL-NEXT: shll $16, %ebx
; AVX512F-NEXT: movq %rdi, %rbx
; AVX512F-NEXT: vmovupd %ymm0, {{[0-9]+}}(%rsp) # 32-byte Spill
; AVX512F-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: callq __truncdfhf2
; AVX512F-NEXT: movl %eax, %r14d
; AVX512F-NEXT: vmovupd {{[0-9]+}}(%rsp), %ymm0 # 32-byte Reload
; AVX512F-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX512F-NEXT: vmovapd %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
; AVX512F-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: callq __truncdfhf2
; AVX512F-NEXT: movl %eax, %r15d
; AVX512F-NEXT: vmovups {{[0-9]+}}(%rsp), %ymm0 # 32-byte Reload
; AVX512F-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: callq __truncdfhf2
; AVX512F-NEXT: movl %eax, %ebp
; AVX512F-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload
; AVX512VL-NEXT: movq %rdi, %rbx
; AVX512VL-NEXT: vmovupd %ymm0, {{[0-9]+}}(%rsp) # 32-byte Spill
; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: callq __truncdfhf2
; AVX512VL-NEXT: movl %eax, %r14d
; AVX512VL-NEXT: vmovups {{[0-9]+}}(%rsp), %ymm0 # 32-byte Reload
; AVX512VL-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX512VL-NEXT: vmovaps %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: callq __truncdfhf2
; AVX512VL-NEXT: movl %eax, %r15d
; AVX512VL-NEXT: vmovups {{[0-9]+}}(%rsp), %ymm0 # 32-byte Reload
; AVX512VL-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: callq __truncdfhf2
; AVX512VL-NEXT: movl %eax, %ebp
; AVX512VL-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload
; AVX512F-NEXT: movq %rdi, %r14
; AVX512F-NEXT: vmovupd %ymm0, (%rsp) # 32-byte Spill
; AVX512F-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: callq __truncdfhf2
; AVX512F-NEXT: movw %ax, %bp
; AVX512F-NEXT: shll $16, %ebp
; AVX512F-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload
; AVX512F-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: callq __truncdfhf2
; AVX512F-NEXT: movzwl %ax, %ebx
; AVX512F-NEXT: orl %ebp, %ebx
; AVX512F-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX512F-NEXT: vmovapd %xmm0, (%rsp) # 16-byte Spill
; AVX512F-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: callq __truncdfhf2
; AVX512F-NEXT: movw %ax, %bp
; AVX512F-NEXT: shll $16, %ebp
; AVX512VL-NEXT: movq %rdi, %r14
; AVX512VL-NEXT: vmovupd %ymm0, (%rsp) # 32-byte Spill
; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: callq __truncdfhf2
; AVX512VL-NEXT: movw %ax, %bp
; AVX512VL-NEXT: shll $16, %ebp
; AVX512VL-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload
; AVX512VL-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: callq __truncdfhf2
; AVX512VL-NEXT: movzwl %ax, %ebx
; AVX512VL-NEXT: orl %ebp, %ebx
; AVX512VL-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX512VL-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: callq __truncdfhf2
; AVX512VL-NEXT: movw %ax, %bp
; AVX512VL-NEXT: shll $16, %ebp
; AVX512F-NEXT: movq %rdi, %r14
; AVX512F-NEXT: vmovupd %ymm0, (%rsp) # 32-byte Spill
; AVX512F-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: callq __truncdfhf2
; AVX512F-NEXT: movw %ax, %bp
; AVX512F-NEXT: shll $16, %ebp
; AVX512F-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload
; AVX512F-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: callq __truncdfhf2
; AVX512F-NEXT: movzwl %ax, %ebx
; AVX512F-NEXT: orl %ebp, %ebx
; AVX512F-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX512F-NEXT: vmovapd %xmm0, (%rsp) # 16-byte Spill
; AVX512F-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: callq __truncdfhf2
; AVX512F-NEXT: movw %ax, %bp
; AVX512F-NEXT: shll $16, %ebp
; AVX512VL-NEXT: movq %rdi, %r14
; AVX512VL-NEXT: vmovupd %ymm0, (%rsp) # 32-byte Spill
; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: callq __truncdfhf2
; AVX512VL-NEXT: movw %ax, %bp
; AVX512VL-NEXT: shll $16, %ebp
; AVX512VL-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload
; AVX512VL-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: callq __truncdfhf2
; AVX512VL-NEXT: movzwl %ax, %ebx
; AVX512VL-NEXT: orl %ebp, %ebx
; AVX512VL-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX512VL-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: callq __truncdfhf2
; AVX512VL-NEXT: movw %ax, %bp
; AVX512VL-NEXT: shll $16, %ebp
; AVX512F-NEXT: movq %rdi, %rbx
; AVX512F-NEXT: vmovupd %zmm0, {{[0-9]+}}(%rsp) # 64-byte Spill
; AVX512F-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: callq __truncdfhf2
; AVX512F-NEXT: movw %ax, {{[0-9]+}}(%rsp) # 2-byte Spill
; AVX512F-NEXT: vmovupd {{[0-9]+}}(%rsp), %zmm0 # 64-byte Reload
; AVX512F-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX512F-NEXT: vmovapd %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
; AVX512F-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: callq __truncdfhf2
; AVX512F-NEXT: movw %ax, {{[0-9]+}}(%rsp) # 2-byte Spill
; AVX512F-NEXT: vmovupd {{[0-9]+}}(%rsp), %zmm0 # 64-byte Reload
; AVX512F-NEXT: vextractf64x4 $1, %zmm0, %ymm0
; AVX512F-NEXT: vmovupd %ymm0, {{[0-9]+}}(%rsp) # 32-byte Spill
; AVX512F-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: callq __truncdfhf2
; AVX512F-NEXT: movl %eax, %r12d
; AVX512F-NEXT: vmovupd {{[0-9]+}}(%rsp), %ymm0 # 32-byte Reload
; AVX512F-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX512F-NEXT: vmovapd %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
; AVX512F-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: callq __truncdfhf2
; AVX512F-NEXT: movl %eax, %r13d
; AVX512F-NEXT: vmovups {{[0-9]+}}(%rsp), %zmm0 # 64-byte Reload
; AVX512F-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
+; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: callq __truncdfhf2
; AVX512F-NEXT: movl %eax, %ebp
; AVX512F-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload
; AVX512F-NEXT: movl %eax, %r14d
; AVX512F-NEXT: vmovups {{[0-9]+}}(%rsp), %ymm0 # 32-byte Reload
; AVX512F-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: callq __truncdfhf2
; AVX512F-NEXT: movl %eax, %r15d
; AVX512F-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload
; AVX512VL-NEXT: movq %rdi, %rbx
; AVX512VL-NEXT: vmovupd %zmm0, {{[0-9]+}}(%rsp) # 64-byte Spill
; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: callq __truncdfhf2
; AVX512VL-NEXT: movw %ax, {{[0-9]+}}(%rsp) # 2-byte Spill
; AVX512VL-NEXT: vmovups {{[0-9]+}}(%rsp), %zmm0 # 64-byte Reload
; AVX512VL-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX512VL-NEXT: vmovaps %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: callq __truncdfhf2
; AVX512VL-NEXT: movw %ax, {{[0-9]+}}(%rsp) # 2-byte Spill
; AVX512VL-NEXT: vmovupd {{[0-9]+}}(%rsp), %zmm0 # 64-byte Reload
; AVX512VL-NEXT: vextractf64x4 $1, %zmm0, %ymm0
; AVX512VL-NEXT: vmovupd %ymm0, {{[0-9]+}}(%rsp) # 32-byte Spill
; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: callq __truncdfhf2
; AVX512VL-NEXT: movl %eax, %r12d
; AVX512VL-NEXT: vmovups {{[0-9]+}}(%rsp), %ymm0 # 32-byte Reload
; AVX512VL-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX512VL-NEXT: vmovaps %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: callq __truncdfhf2
; AVX512VL-NEXT: movl %eax, %r13d
; AVX512VL-NEXT: vmovups {{[0-9]+}}(%rsp), %zmm0 # 64-byte Reload
; AVX512VL-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
+; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: callq __truncdfhf2
; AVX512VL-NEXT: movl %eax, %ebp
; AVX512VL-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload
; AVX512VL-NEXT: movl %eax, %r14d
; AVX512VL-NEXT: vmovups {{[0-9]+}}(%rsp), %ymm0 # 32-byte Reload
; AVX512VL-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: callq __truncdfhf2
; AVX512VL-NEXT: movl %eax, %r15d
; AVX512VL-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload
; AVX512BW-NEXT: vpsllvw %zmm1, %zmm0, %zmm1
; AVX512BW-NEXT: vpsrlvw %zmm2, %zmm0, %zmm0
; AVX512BW-NEXT: vpor %xmm0, %xmm1, %xmm0
+; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512VL-LABEL: var_rotate_v8i16:
; AVX512BW-NEXT: vpsrlvd %zmm2, %zmm0, %zmm0
; AVX512BW-NEXT: vpmovdb %zmm0, %xmm0
; AVX512BW-NEXT: vpor %xmm0, %xmm1, %xmm0
+; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512VL-LABEL: var_rotate_v16i8:
; AVX512VL-NEXT: vpsrlvd %zmm2, %zmm0, %zmm0
; AVX512VL-NEXT: vpmovdb %zmm0, %xmm0
; AVX512VL-NEXT: vpor %xmm0, %xmm1, %xmm0
+; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; XOP-LABEL: var_rotate_v16i8:
; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = [16,15,14,13,12,11,10,9]
; AVX512BW-NEXT: vpsrlvw %zmm2, %zmm0, %zmm0
; AVX512BW-NEXT: vpor %xmm0, %xmm1, %xmm0
+; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512VL-LABEL: constant_rotate_v8i16:
; AVX512-NEXT: vpsrlvd {{.*}}(%rip), %zmm0, %zmm0
; AVX512-NEXT: vpmovdb %zmm0, %xmm0
; AVX512-NEXT: vpor %xmm0, %xmm1, %xmm0
+; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
;
; XOP-LABEL: constant_rotate_v16i8:
; AVX512F-NEXT: kmovw %eax, %k1
; AVX512F-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; AVX512F-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
+; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: load_sext_2i1_to_2i64:
; AVX512BW-NEXT: kmovd %eax, %k1
; AVX512BW-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; AVX512BW-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
+; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; X32-SSE41-LABEL: load_sext_2i1_to_2i64:
; AVX512F-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; AVX512F-NEXT: vpmovqd %zmm0, %ymm0
; AVX512F-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: load_sext_4i1_to_4i32:
; AVX512BW-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; AVX512BW-NEXT: vpmovqd %zmm0, %ymm0
; AVX512BW-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; X32-SSE41-LABEL: load_sext_4i1_to_4i32:
; AVX512F-NEXT: kmovw %eax, %k1
; AVX512F-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; AVX512F-NEXT: vpmovqw %zmm0, %xmm0
+; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: load_sext_8i1_to_8i16:
; AVX512BW-NEXT: kmovd %eax, %k1
; AVX512BW-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; AVX512BW-NEXT: vpmovqw %zmm0, %xmm0
+; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; X32-SSE41-LABEL: load_sext_8i1_to_8i16:
; AVX512-NEXT: kmovw (%rdi), %k1
; AVX512-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; AVX512-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
;
; X32-SSE41-LABEL: load_sext_16i1_to_16i8:
; CHECK-NEXT: kmovb %edi, %k1
; CHECK-NEXT: vextracti32x4 $0, %zmm0, %xmm1 {%k1}
; CHECK-NEXT: vmovdqa %xmm1, %xmm0
+; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
%shuffle = shufflevector <16 x i32> %a, <16 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%mask.cast = bitcast i8 %mask to <8 x i1>
; CHECK-NEXT: kmovb %edi, %k1
; CHECK-NEXT: vextracti32x4 $1, %zmm0, %xmm1 {%k1}
; CHECK-NEXT: vmovdqa %xmm1, %xmm0
+; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
%shuffle = shufflevector <16 x i32> %a, <16 x i32> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%mask.cast = bitcast i8 %mask to <8 x i1>
; CHECK-NEXT: kmovb %edi, %k1
; CHECK-NEXT: vextracti32x4 $2, %zmm0, %xmm1 {%k1}
; CHECK-NEXT: vmovdqa %xmm1, %xmm0
+; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
%shuffle = shufflevector <16 x i32> %a, <16 x i32> undef, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
%mask.cast = bitcast i8 %mask to <8 x i1>
; CHECK-NEXT: kmovb %edi, %k1
; CHECK-NEXT: vextracti32x4 $3, %zmm0, %xmm1 {%k1}
; CHECK-NEXT: vmovdqa %xmm1, %xmm0
+; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
%shuffle = shufflevector <16 x i32> %a, <16 x i32> undef, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
%mask.cast = bitcast i8 %mask to <8 x i1>
; CHECK-NEXT: kmovb %edi, %k1
; CHECK-NEXT: vextractf32x4 $0, %zmm0, %xmm1 {%k1}
; CHECK-NEXT: vmovaps %xmm1, %xmm0
+; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
%shuffle = shufflevector <16 x float> %a, <16 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%mask.cast = bitcast i8 %mask to <8 x i1>
; CHECK-NEXT: kmovb %edi, %k1
; CHECK-NEXT: vextractf32x4 $1, %zmm0, %xmm1 {%k1}
; CHECK-NEXT: vmovaps %xmm1, %xmm0
+; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
%shuffle = shufflevector <16 x float> %a, <16 x float> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%mask.cast = bitcast i8 %mask to <8 x i1>
; CHECK-NEXT: kmovb %edi, %k1
; CHECK-NEXT: vextractf32x4 $2, %zmm0, %xmm1 {%k1}
; CHECK-NEXT: vmovaps %xmm1, %xmm0
+; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
%shuffle = shufflevector <16 x float> %a, <16 x float> undef, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
%mask.cast = bitcast i8 %mask to <8 x i1>
; CHECK-NEXT: kmovb %edi, %k1
; CHECK-NEXT: vextractf32x4 $3, %zmm0, %xmm1 {%k1}
; CHECK-NEXT: vmovaps %xmm1, %xmm0
+; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
%shuffle = shufflevector <16 x float> %a, <16 x float> undef, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
%mask.cast = bitcast i8 %mask to <8 x i1>
; CHECK-NEXT: kmovb %edi, %k1
; CHECK-NEXT: vextracti64x2 $0, %zmm0, %xmm1 {%k1}
; CHECK-NEXT: vmovdqa %xmm1, %xmm0
+; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
%shuffle = shufflevector <8 x i64> %a, <8 x i64> undef, <2 x i32> <i32 0, i32 1>
%mask.cast = bitcast i8 %mask to <8 x i1>
; CHECK-NEXT: kmovb %edi, %k1
; CHECK-NEXT: vextracti64x2 $1, %zmm0, %xmm1 {%k1}
; CHECK-NEXT: vmovdqa %xmm1, %xmm0
+; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
%shuffle = shufflevector <8 x i64> %a, <8 x i64> undef, <2 x i32> <i32 2, i32 3>
%mask.cast = bitcast i8 %mask to <8 x i1>
; CHECK-NEXT: kmovb %edi, %k1
; CHECK-NEXT: vextracti64x2 $2, %zmm0, %xmm1 {%k1}
; CHECK-NEXT: vmovdqa %xmm1, %xmm0
+; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
%shuffle = shufflevector <8 x i64> %a, <8 x i64> undef, <2 x i32> <i32 4, i32 5>
%mask.cast = bitcast i8 %mask to <8 x i1>
; CHECK-NEXT: kmovb %edi, %k1
; CHECK-NEXT: vextracti64x2 $3, %zmm0, %xmm1 {%k1}
; CHECK-NEXT: vmovdqa %xmm1, %xmm0
+; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
%shuffle = shufflevector <8 x i64> %a, <8 x i64> undef, <2 x i32> <i32 6, i32 7>
%mask.cast = bitcast i8 %mask to <8 x i1>
; CHECK-NEXT: kmovb %edi, %k1
; CHECK-NEXT: vextractf64x2 $0, %zmm0, %xmm1 {%k1}
; CHECK-NEXT: vmovapd %xmm1, %xmm0
+; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
%shuffle = shufflevector <8 x double> %a, <8 x double> undef, <2 x i32> <i32 0, i32 1>
%mask.cast = bitcast i8 %mask to <8 x i1>
; CHECK-NEXT: kmovb %edi, %k1
; CHECK-NEXT: vextractf64x2 $1, %zmm0, %xmm1 {%k1}
; CHECK-NEXT: vmovapd %xmm1, %xmm0
+; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
%shuffle = shufflevector <8 x double> %a, <8 x double> undef, <2 x i32> <i32 2, i32 3>
%mask.cast = bitcast i8 %mask to <8 x i1>
; CHECK-NEXT: kmovb %edi, %k1
; CHECK-NEXT: vextractf64x2 $2, %zmm0, %xmm1 {%k1}
; CHECK-NEXT: vmovapd %xmm1, %xmm0
+; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
%shuffle = shufflevector <8 x double> %a, <8 x double> undef, <2 x i32> <i32 4, i32 5>
%mask.cast = bitcast i8 %mask to <8 x i1>
; CHECK-NEXT: kmovb %edi, %k1
; CHECK-NEXT: vextractf64x2 $3, %zmm0, %xmm1 {%k1}
; CHECK-NEXT: vmovapd %xmm1, %xmm0
+; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
%shuffle = shufflevector <8 x double> %a, <8 x double> undef, <2 x i32> <i32 6, i32 7>
%mask.cast = bitcast i8 %mask to <8 x i1>
; CHECK-NEXT: kmovb %edi, %k1
; CHECK-NEXT: vextracti32x4 $1, %zmm0, %xmm1 {%k1}
; CHECK-NEXT: vmovdqa %xmm1, %xmm0
+; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
%shuffle = shufflevector <8 x i64> %a, <8 x i64> undef, <2 x i32> <i32 2, i32 3>
%shuffle.cast = bitcast <2 x i64> %shuffle to <4 x i32>
; CHECK-NEXT: kmovb %edi, %k1
; CHECK-NEXT: vextractf32x4 $1, %zmm0, %xmm1 {%k1}
; CHECK-NEXT: vmovaps %xmm1, %xmm0
+; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
%shuffle = shufflevector <8 x double> %a, <8 x double> undef, <2 x i32> <i32 2, i32 3>
%shuffle.cast = bitcast <2 x double> %shuffle to <4 x float>
; CHECK-NEXT: kmovb %edi, %k1
; CHECK-NEXT: vextracti64x2 $1, %zmm0, %xmm1 {%k1}
; CHECK-NEXT: vmovdqa %xmm1, %xmm0
+; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
%shuffle = shufflevector <16 x i32> %a, <16 x i32> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%shuffle.cast = bitcast <4 x i32> %shuffle to <2 x i64>
; CHECK-NEXT: kmovb %edi, %k1
; CHECK-NEXT: vextractf64x2 $1, %zmm0, %xmm1 {%k1}
; CHECK-NEXT: vmovapd %xmm1, %xmm0
+; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
%shuffle = shufflevector <16 x float> %a, <16 x float> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%shuffle.cast = bitcast <4 x float> %shuffle to <2 x double>
; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k1
; AVX512F-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; AVX512F-NEXT: vpmovqw %zmm0, %xmm0
+; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; VL_BW_DQ-LABEL: shuf8i1_3_6_1_0_3_7_7_0:
; VL_BW_DQ-NEXT: vpermq %zmm0, %zmm1, %zmm0
; VL_BW_DQ-NEXT: vpmovq2m %zmm0, %k0
; VL_BW_DQ-NEXT: vpmovm2w %k0, %xmm0
+; VL_BW_DQ-NEXT: vzeroupper
; VL_BW_DQ-NEXT: retq
%a2 = icmp eq <8 x i64> %a, %a1
%b2 = icmp eq <8 x i64> %b, %b1
; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k1
; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; VL_BW_DQ-LABEL: shuf16i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0:
; VL_BW_DQ-NEXT: vpermi2d %zmm0, %zmm1, %zmm2
; VL_BW_DQ-NEXT: vpmovd2m %zmm2, %k0
; VL_BW_DQ-NEXT: vpmovm2b %k0, %xmm0
+; VL_BW_DQ-NEXT: vzeroupper
; VL_BW_DQ-NEXT: retq
%a2 = icmp eq <16 x i32> %a, %a1
%b2 = icmp eq <16 x i32> %b, %b1
; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k1
; AVX512F-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; AVX512F-NEXT: vpmovqw %zmm0, %xmm0
+; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; VL_BW_DQ-LABEL: shuf8i1_u_2_u_u_2_u_2_u:
; VL_BW_DQ-NEXT: vpbroadcastq %xmm0, %zmm0
; VL_BW_DQ-NEXT: vpmovq2m %zmm0, %k0
; VL_BW_DQ-NEXT: vpmovm2w %k0, %xmm0
+; VL_BW_DQ-NEXT: vzeroupper
; VL_BW_DQ-NEXT: retq
%b = bitcast i8 %a to <8 x i1>
%c = shufflevector <8 x i1> %b, <8 x i1> undef, <8 x i32> <i32 undef, i32 2, i32 undef, i32 undef, i32 2, i32 undef, i32 2, i32 undef>
; AVX512F-NEXT: vpsllq $63, %zmm2, %zmm0
; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k0
; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; VL_BW_DQ-LABEL: shuf8i1_10_2_9_u_3_u_2_u:
; VL_BW_DQ-NEXT: vpermi2q %zmm1, %zmm0, %zmm2
; VL_BW_DQ-NEXT: vpmovq2m %zmm2, %k0
; VL_BW_DQ-NEXT: kmovb %k0, %eax
+; VL_BW_DQ-NEXT: vzeroupper
; VL_BW_DQ-NEXT: retq
%b = bitcast i8 %a to <8 x i1>
%c = shufflevector <8 x i1> %b, <8 x i1> zeroinitializer, <8 x i32> <i32 10, i32 2, i32 9, i32 undef, i32 3, i32 undef, i32 2, i32 undef>
; AVX512F-NEXT: vpsllq $63, %zmm0, %zmm0
; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k0
; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; VL_BW_DQ-LABEL: shuf8i1_0_1_4_5_u_u_u_u:
; VL_BW_DQ-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,4,5,0,1,0,1]
; VL_BW_DQ-NEXT: vpmovq2m %zmm0, %k0
; VL_BW_DQ-NEXT: kmovb %k0, %eax
+; VL_BW_DQ-NEXT: vzeroupper
; VL_BW_DQ-NEXT: retq
%b = bitcast i8 %a to <8 x i1>
%c = shufflevector <8 x i1> %b, <8 x i1> undef, <8 x i32> <i32 0, i32 1, i32 4, i32 5, i32 undef, i32 undef, i32 undef, i32 undef>
; AVX512F-NEXT: vpsllq $63, %zmm2, %zmm0
; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k0
; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; VL_BW_DQ-LABEL: shuf8i1_9_6_1_0_3_7_7_0:
; VL_BW_DQ-NEXT: vpermi2q %zmm1, %zmm0, %zmm2
; VL_BW_DQ-NEXT: vpmovq2m %zmm2, %k0
; VL_BW_DQ-NEXT: kmovb %k0, %eax
+; VL_BW_DQ-NEXT: vzeroupper
; VL_BW_DQ-NEXT: retq
%b = bitcast i8 %a to <8 x i1>
%c = shufflevector <8 x i1> %b, <8 x i1> zeroinitializer, <8 x i32> <i32 9, i32 6, i32 1, i32 0, i32 3, i32 7, i32 7, i32 0>
; AVX512F-NEXT: vpsllq $63, %zmm2, %zmm0
; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k0
; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; VL_BW_DQ-LABEL: shuf8i1_9_6_1_10_3_7_7_0:
; VL_BW_DQ-NEXT: vpermt2q %zmm0, %zmm1, %zmm2
; VL_BW_DQ-NEXT: vpmovq2m %zmm2, %k0
; VL_BW_DQ-NEXT: kmovb %k0, %eax
+; VL_BW_DQ-NEXT: vzeroupper
; VL_BW_DQ-NEXT: retq
%b = bitcast i8 %a to <8 x i1>
%c = shufflevector <8 x i1> zeroinitializer, <8 x i1> %b, <8 x i32> <i32 9, i32 6, i32 1, i32 10, i32 3, i32 7, i32 7, i32 0>
; AVX512F-NEXT: vpsllq $63, %zmm2, %zmm0
; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k0
; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; VL_BW_DQ-LABEL: shuf8i1__9_6_1_10_3_7_7_1:
; VL_BW_DQ-NEXT: vpermi2q %zmm1, %zmm0, %zmm2
; VL_BW_DQ-NEXT: vpmovq2m %zmm2, %k0
; VL_BW_DQ-NEXT: kmovb %k0, %eax
+; VL_BW_DQ-NEXT: vzeroupper
; VL_BW_DQ-NEXT: retq
%b = bitcast i8 %a to <8 x i1>
%c = shufflevector <8 x i1> <i1 1, i1 1, i1 0, i1 0, i1 1, i1 1, i1 0, i1 0>, <8 x i1> %b, <8 x i32> <i32 9, i32 6, i32 1, i32 0, i32 3, i32 7, i32 7, i32 1>
; AVX512F-NEXT: vpsllq $63, %zmm2, %zmm0
; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k0
; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; VL_BW_DQ-LABEL: shuf8i1_9_6_1_10_3_7_7_0_all_ones:
; VL_BW_DQ-NEXT: vpermt2q %zmm0, %zmm1, %zmm2
; VL_BW_DQ-NEXT: vpmovq2m %zmm2, %k0
; VL_BW_DQ-NEXT: kmovb %k0, %eax
+; VL_BW_DQ-NEXT: vzeroupper
; VL_BW_DQ-NEXT: retq
%c = shufflevector <8 x i1> <i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1>, <8 x i1> %a, <8 x i32> <i32 9, i32 6, i32 1, i32 0, i32 3, i32 7, i32 7, i32 0>
%c1 = bitcast <8 x i1> %c to i8
; AVX512F-NEXT: vpslld $31, %zmm0, %zmm0
; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k0
; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; VL_BW_DQ-LABEL: shuf16i1_0_0_0_0_0_0_0_0_0_0_0_0_0_0_0_0:
; VL_BW_DQ-NEXT: vpbroadcastd %xmm0, %zmm0
; VL_BW_DQ-NEXT: vpmovd2m %zmm0, %k0
; VL_BW_DQ-NEXT: kmovw %k0, %eax
+; VL_BW_DQ-NEXT: vzeroupper
; VL_BW_DQ-NEXT: retq
%b = bitcast i16 %a to <16 x i1>
%c = shufflevector <16 x i1> %b, <16 x i1> undef, <16 x i32> zeroinitializer
; AVX512F-NEXT: orq %rcx, %rax
; AVX512F-NEXT: movq %rbp, %rsp
; AVX512F-NEXT: popq %rbp
+; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; VL_BW_DQ-LABEL: shuf64i1_zero:
; VL_BW_DQ-NEXT: vpbroadcastb %xmm0, %zmm0
; VL_BW_DQ-NEXT: vpmovb2m %zmm0, %k0
; VL_BW_DQ-NEXT: kmovq %k0, %rax
+; VL_BW_DQ-NEXT: vzeroupper
; VL_BW_DQ-NEXT: retq
%b = bitcast i64 %a to <64 x i1>
%c = shufflevector <64 x i1> %b, <64 x i1> undef, <64 x i32> zeroinitializer
; AVX512-NEXT: vpaddq %ymm1, %ymm0, %ymm0
; AVX512-NEXT: vpmovqd %zmm0, %ymm0
; AVX512-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
%1 = add <4 x i64> %a0, %a1
%2 = trunc <4 x i64> %1 to <4 x i32>
; AVX512: # BB#0:
; AVX512-NEXT: vpaddq %zmm1, %zmm0, %zmm0
; AVX512-NEXT: vpmovqw %zmm0, %xmm0
+; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
%1 = add <8 x i64> %a0, %a1
%2 = trunc <8 x i64> %1 to <8 x i16>
; AVX512-NEXT: vpaddd %ymm1, %ymm0, %ymm0
; AVX512-NEXT: vpmovdw %zmm0, %ymm0
; AVX512-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
%1 = add <8 x i32> %a0, %a1
%2 = trunc <8 x i32> %1 to <8 x i16>
; AVX512F-NEXT: vpmovqd %zmm1, %ymm1
; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: trunc_add_v16i64_v16i8:
; AVX512BW-NEXT: vpmovqd %zmm1, %ymm1
; AVX512BW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512BW-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512DQ-LABEL: trunc_add_v16i64_v16i8:
; AVX512DQ-NEXT: vpmovqd %zmm1, %ymm1
; AVX512DQ-NEXT: vinserti32x8 $1, %ymm1, %zmm0, %zmm0
; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
%1 = add <16 x i64> %a0, %a1
%2 = trunc <16 x i64> %1 to <16 x i8>
; AVX512: # BB#0:
; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0
; AVX512-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
%1 = add <16 x i32> %a0, %a1
%2 = trunc <16 x i32> %1 to <16 x i8>
; AVX512F-NEXT: vpaddw %ymm1, %ymm0, %ymm0
; AVX512F-NEXT: vpmovsxwd %ymm0, %zmm0
; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: trunc_add_v16i16_v16i8:
; AVX512BW-NEXT: vpaddw %ymm1, %ymm0, %ymm0
; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
; AVX512BW-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512DQ-LABEL: trunc_add_v16i16_v16i8:
; AVX512DQ-NEXT: vpaddw %ymm1, %ymm0, %ymm0
; AVX512DQ-NEXT: vpmovsxwd %ymm0, %zmm0
; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
%1 = add <16 x i16> %a0, %a1
%2 = trunc <16 x i16> %1 to <16 x i8>
; AVX512-NEXT: vpmovdw %zmm1, %ymm1
; AVX512-NEXT: vpmovsxbw %xmm0, %xmm0
; AVX512-NEXT: vpaddw %xmm1, %xmm0, %xmm0
+; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
%1 = shufflevector <16 x i8> %a0, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
%2 = sext <8 x i8> %1 to <8 x i32>
; AVX512-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
; AVX512-NEXT: vpmovqd %zmm0, %ymm0
; AVX512-NEXT: vpaddd {{.*}}(%rip), %xmm0, %xmm0
+; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
%1 = add <4 x i64> %a0, <i64 0, i64 1, i64 2, i64 3>
%2 = trunc <4 x i64> %1 to <4 x i32>
; AVX512: # BB#0:
; AVX512-NEXT: vpmovqw %zmm0, %xmm0
; AVX512-NEXT: vpaddw {{.*}}(%rip), %xmm0, %xmm0
+; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
%1 = add <8 x i64> %a0, <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7>
%2 = trunc <8 x i64> %1 to <8 x i16>
; AVX512-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
; AVX512-NEXT: vpmovdw %zmm0, %ymm0
; AVX512-NEXT: vpaddw {{.*}}(%rip), %xmm0, %xmm0
+; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
%1 = add <8 x i32> %a0, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
%2 = trunc <8 x i32> %1 to <8 x i16>
; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
; AVX512F-NEXT: vpaddb {{.*}}(%rip), %xmm0, %xmm0
+; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: trunc_add_const_v16i64_v16i8:
; AVX512BW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512BW-NEXT: vpmovdb %zmm0, %xmm0
; AVX512BW-NEXT: vpaddb {{.*}}(%rip), %xmm0, %xmm0
+; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512DQ-LABEL: trunc_add_const_v16i64_v16i8:
; AVX512DQ-NEXT: vinserti32x8 $1, %ymm1, %zmm0, %zmm0
; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
; AVX512DQ-NEXT: vpaddb {{.*}}(%rip), %xmm0, %xmm0
+; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
%1 = add <16 x i64> %a0, <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11, i64 12, i64 13, i64 14, i64 15>
%2 = trunc <16 x i64> %1 to <16 x i8>
; AVX512: # BB#0:
; AVX512-NEXT: vpmovdb %zmm0, %xmm0
; AVX512-NEXT: vpaddb {{.*}}(%rip), %xmm0, %xmm0
+; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
%1 = add <16 x i32> %a0, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
%2 = trunc <16 x i32> %1 to <16 x i8>
; AVX512F-NEXT: vpmovsxwd %ymm0, %zmm0
; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
; AVX512F-NEXT: vpaddb {{.*}}(%rip), %xmm0, %xmm0
+; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: trunc_add_const_v16i16_v16i8:
; AVX512BW-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
; AVX512BW-NEXT: vpaddb {{.*}}(%rip), %xmm0, %xmm0
+; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512DQ-LABEL: trunc_add_const_v16i16_v16i8:
; AVX512DQ-NEXT: vpmovsxwd %ymm0, %zmm0
; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
; AVX512DQ-NEXT: vpaddb {{.*}}(%rip), %xmm0, %xmm0
+; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
%1 = add <16 x i16> %a0, <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>
%2 = trunc <16 x i16> %1 to <16 x i8>
; AVX512-NEXT: vpsubq %ymm1, %ymm0, %ymm0
; AVX512-NEXT: vpmovqd %zmm0, %ymm0
; AVX512-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
%1 = sub <4 x i64> %a0, %a1
%2 = trunc <4 x i64> %1 to <4 x i32>
; AVX512: # BB#0:
; AVX512-NEXT: vpsubq %zmm1, %zmm0, %zmm0
; AVX512-NEXT: vpmovqw %zmm0, %xmm0
+; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
%1 = sub <8 x i64> %a0, %a1
%2 = trunc <8 x i64> %1 to <8 x i16>
; AVX512-NEXT: vpsubd %ymm1, %ymm0, %ymm0
; AVX512-NEXT: vpmovdw %zmm0, %ymm0
; AVX512-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
%1 = sub <8 x i32> %a0, %a1
%2 = trunc <8 x i32> %1 to <8 x i16>
; AVX512F-NEXT: vpmovqd %zmm1, %ymm1
; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: trunc_sub_v16i64_v16i8:
; AVX512BW-NEXT: vpmovqd %zmm1, %ymm1
; AVX512BW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512BW-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512DQ-LABEL: trunc_sub_v16i64_v16i8:
; AVX512DQ-NEXT: vpmovqd %zmm1, %ymm1
; AVX512DQ-NEXT: vinserti32x8 $1, %ymm1, %zmm0, %zmm0
; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
%1 = sub <16 x i64> %a0, %a1
%2 = trunc <16 x i64> %1 to <16 x i8>
; AVX512: # BB#0:
; AVX512-NEXT: vpsubd %zmm1, %zmm0, %zmm0
; AVX512-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
%1 = sub <16 x i32> %a0, %a1
%2 = trunc <16 x i32> %1 to <16 x i8>
; AVX512F-NEXT: vpsubw %ymm1, %ymm0, %ymm0
; AVX512F-NEXT: vpmovsxwd %ymm0, %zmm0
; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: trunc_sub_v16i16_v16i8:
; AVX512BW-NEXT: vpsubw %ymm1, %ymm0, %ymm0
; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
; AVX512BW-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512DQ-LABEL: trunc_sub_v16i16_v16i8:
; AVX512DQ-NEXT: vpsubw %ymm1, %ymm0, %ymm0
; AVX512DQ-NEXT: vpmovsxwd %ymm0, %zmm0
; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
%1 = sub <16 x i16> %a0, %a1
%2 = trunc <16 x i16> %1 to <16 x i8>
; AVX512-NEXT: vpsubq {{.*}}(%rip), %ymm0, %ymm0
; AVX512-NEXT: vpmovqd %zmm0, %ymm0
; AVX512-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
%1 = sub <4 x i64> %a0, <i64 0, i64 1, i64 2, i64 3>
%2 = trunc <4 x i64> %1 to <4 x i32>
; AVX512: # BB#0:
; AVX512-NEXT: vpsubq {{.*}}(%rip), %zmm0, %zmm0
; AVX512-NEXT: vpmovqw %zmm0, %xmm0
+; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
%1 = sub <8 x i64> %a0, <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7>
%2 = trunc <8 x i64> %1 to <8 x i16>
; AVX512-NEXT: vpsubd {{.*}}(%rip), %ymm0, %ymm0
; AVX512-NEXT: vpmovdw %zmm0, %ymm0
; AVX512-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
%1 = sub <8 x i32> %a0, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
%2 = trunc <8 x i32> %1 to <8 x i16>
; AVX512F-NEXT: vpmovqd %zmm1, %ymm1
; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: trunc_sub_const_v16i64_v16i8:
; AVX512BW-NEXT: vpmovqd %zmm1, %ymm1
; AVX512BW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512BW-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512DQ-LABEL: trunc_sub_const_v16i64_v16i8:
; AVX512DQ-NEXT: vpmovqd %zmm1, %ymm1
; AVX512DQ-NEXT: vinserti32x8 $1, %ymm1, %zmm0, %zmm0
; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
%1 = sub <16 x i64> %a0, <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11, i64 12, i64 13, i64 14, i64 15>
%2 = trunc <16 x i64> %1 to <16 x i8>
; AVX512: # BB#0:
; AVX512-NEXT: vpsubd {{.*}}(%rip), %zmm0, %zmm0
; AVX512-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
%1 = sub <16 x i32> %a0, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
%2 = trunc <16 x i32> %1 to <16 x i8>
; AVX512F-NEXT: vpsubw {{.*}}(%rip), %ymm0, %ymm0
; AVX512F-NEXT: vpmovsxwd %ymm0, %zmm0
; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: trunc_sub_const_v16i16_v16i8:
; AVX512BW-NEXT: vpsubw {{.*}}(%rip), %ymm0, %ymm0
; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
; AVX512BW-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512DQ-LABEL: trunc_sub_const_v16i16_v16i8:
; AVX512DQ-NEXT: vpsubw {{.*}}(%rip), %ymm0, %ymm0
; AVX512DQ-NEXT: vpmovsxwd %ymm0, %zmm0
; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
%1 = sub <16 x i16> %a0, <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>
%2 = trunc <16 x i16> %1 to <16 x i8>
; AVX512F-NEXT: vpmovqd %zmm1, %ymm1
; AVX512F-NEXT: vpmovqd %zmm0, %ymm0
; AVX512F-NEXT: vpmulld %xmm1, %xmm0, %xmm0
+; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: trunc_mul_v4i64_v4i32:
; AVX512BW-NEXT: vpmovqd %zmm1, %ymm1
; AVX512BW-NEXT: vpmovqd %zmm0, %ymm0
; AVX512BW-NEXT: vpmulld %xmm1, %xmm0, %xmm0
+; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512DQ-LABEL: trunc_mul_v4i64_v4i32:
; AVX512DQ-NEXT: vpmullq %zmm1, %zmm0, %zmm0
; AVX512DQ-NEXT: vpmovqd %zmm0, %ymm0
; AVX512DQ-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
%1 = mul <4 x i64> %a0, %a1
%2 = trunc <4 x i64> %1 to <4 x i32>
; AVX512F-NEXT: vpmovqw %zmm1, %xmm1
; AVX512F-NEXT: vpmovqw %zmm0, %xmm0
; AVX512F-NEXT: vpmullw %xmm1, %xmm0, %xmm0
+; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: trunc_mul_v8i64_v8i16:
; AVX512BW-NEXT: vpmovqw %zmm1, %xmm1
; AVX512BW-NEXT: vpmovqw %zmm0, %xmm0
; AVX512BW-NEXT: vpmullw %xmm1, %xmm0, %xmm0
+; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512DQ-LABEL: trunc_mul_v8i64_v8i16:
; AVX512DQ: # BB#0:
; AVX512DQ-NEXT: vpmullq %zmm1, %zmm0, %zmm0
; AVX512DQ-NEXT: vpmovqw %zmm0, %xmm0
+; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
%1 = mul <8 x i64> %a0, %a1
%2 = trunc <8 x i64> %1 to <8 x i16>
; AVX512-NEXT: vpmulld %ymm1, %ymm0, %ymm0
; AVX512-NEXT: vpmovdw %zmm0, %ymm0
; AVX512-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
%1 = mul <8 x i32> %a0, %a1
%2 = trunc <8 x i32> %1 to <8 x i16>
; AVX512F-NEXT: vpmulld %ymm2, %ymm0, %ymm0
; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: trunc_mul_v16i64_v16i8:
; AVX512BW-NEXT: vpmulld %ymm2, %ymm0, %ymm0
; AVX512BW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512BW-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512DQ-LABEL: trunc_mul_v16i64_v16i8:
; AVX512DQ-NEXT: vpmovqd %zmm1, %ymm1
; AVX512DQ-NEXT: vinserti32x8 $1, %ymm1, %zmm0, %zmm0
; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
%1 = mul <16 x i64> %a0, %a1
%2 = trunc <16 x i64> %1 to <16 x i8>
; AVX512: # BB#0:
; AVX512-NEXT: vpmulld %zmm1, %zmm0, %zmm0
; AVX512-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
%1 = mul <16 x i32> %a0, %a1
%2 = trunc <16 x i32> %1 to <16 x i8>
; AVX512F-NEXT: vpmullw %ymm1, %ymm0, %ymm0
; AVX512F-NEXT: vpmovsxwd %ymm0, %zmm0
; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: trunc_mul_v16i16_v16i8:
; AVX512BW-NEXT: vpmullw %ymm1, %ymm0, %ymm0
; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
; AVX512BW-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512DQ-LABEL: trunc_mul_v16i16_v16i8:
; AVX512DQ-NEXT: vpmullw %ymm1, %ymm0, %ymm0
; AVX512DQ-NEXT: vpmovsxwd %ymm0, %zmm0
; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
%1 = mul <16 x i16> %a0, %a1
%2 = trunc <16 x i16> %1 to <16 x i8>
; AVX512-NEXT: vpmovdw %zmm1, %ymm1
; AVX512-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX512-NEXT: vpmullw %xmm1, %xmm0, %xmm0
+; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
%1 = shufflevector <16 x i8> %a0, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
%2 = zext <8 x i8> %1 to <8 x i32>
; AVX512-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
; AVX512-NEXT: vpmovqd %zmm0, %ymm0
; AVX512-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0
+; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
%1 = mul <4 x i64> %a0, <i64 0, i64 1, i64 2, i64 3>
%2 = trunc <4 x i64> %1 to <4 x i32>
; AVX512: # BB#0:
; AVX512-NEXT: vpmovqw %zmm0, %xmm0
; AVX512-NEXT: vpmullw {{.*}}(%rip), %xmm0, %xmm0
+; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
%1 = mul <8 x i64> %a0, <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7>
%2 = trunc <8 x i64> %1 to <8 x i16>
; AVX512-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
; AVX512-NEXT: vpmovdw %zmm0, %ymm0
; AVX512-NEXT: vpmullw {{.*}}(%rip), %xmm0, %xmm0
+; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
%1 = mul <8 x i32> %a0, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
%2 = trunc <8 x i32> %1 to <8 x i16>
; AVX512F-NEXT: vpmulld {{.*}}(%rip), %ymm1, %ymm1
; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: trunc_mul_const_v16i64_v16i8:
; AVX512BW-NEXT: vpmulld {{.*}}(%rip), %ymm1, %ymm1
; AVX512BW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512BW-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512DQ-LABEL: trunc_mul_const_v16i64_v16i8:
; AVX512DQ-NEXT: vpmulld {{.*}}(%rip), %ymm1, %ymm1
; AVX512DQ-NEXT: vinserti32x8 $1, %ymm1, %zmm0, %zmm0
; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
%1 = mul <16 x i64> %a0, <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11, i64 12, i64 13, i64 14, i64 15>
%2 = trunc <16 x i64> %1 to <16 x i8>
; AVX512: # BB#0:
; AVX512-NEXT: vpmulld {{.*}}(%rip), %zmm0, %zmm0
; AVX512-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
%1 = mul <16 x i32> %a0, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
%2 = trunc <16 x i32> %1 to <16 x i8>
; AVX512F-NEXT: vpmullw {{.*}}(%rip), %ymm0, %ymm0
; AVX512F-NEXT: vpmovsxwd %ymm0, %zmm0
; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: trunc_mul_const_v16i16_v16i8:
; AVX512BW-NEXT: vpmullw {{.*}}(%rip), %ymm0, %ymm0
; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
; AVX512BW-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512DQ-LABEL: trunc_mul_const_v16i16_v16i8:
; AVX512DQ-NEXT: vpmullw {{.*}}(%rip), %ymm0, %ymm0
; AVX512DQ-NEXT: vpmovsxwd %ymm0, %zmm0
; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
%1 = mul <16 x i16> %a0, <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>
%2 = trunc <16 x i16> %1 to <16 x i8>
; AVX512-NEXT: vpand %ymm1, %ymm0, %ymm0
; AVX512-NEXT: vpmovqd %zmm0, %ymm0
; AVX512-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
%1 = and <4 x i64> %a0, %a1
%2 = trunc <4 x i64> %1 to <4 x i32>
; AVX512: # BB#0:
; AVX512-NEXT: vpandq %zmm1, %zmm0, %zmm0
; AVX512-NEXT: vpmovqw %zmm0, %xmm0
+; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
%1 = and <8 x i64> %a0, %a1
%2 = trunc <8 x i64> %1 to <8 x i16>
; AVX512-NEXT: vpand %ymm1, %ymm0, %ymm0
; AVX512-NEXT: vpmovdw %zmm0, %ymm0
; AVX512-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
%1 = and <8 x i32> %a0, %a1
%2 = trunc <8 x i32> %1 to <8 x i16>
; AVX512F-NEXT: vpmovqd %zmm1, %ymm1
; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: trunc_and_v16i64_v16i8:
; AVX512BW-NEXT: vpmovqd %zmm1, %ymm1
; AVX512BW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512BW-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512DQ-LABEL: trunc_and_v16i64_v16i8:
; AVX512DQ-NEXT: vpmovqd %zmm1, %ymm1
; AVX512DQ-NEXT: vinserti32x8 $1, %ymm1, %zmm0, %zmm0
; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
%1 = and <16 x i64> %a0, %a1
%2 = trunc <16 x i64> %1 to <16 x i8>
; AVX512: # BB#0:
; AVX512-NEXT: vpandd %zmm1, %zmm0, %zmm0
; AVX512-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
%1 = and <16 x i32> %a0, %a1
%2 = trunc <16 x i32> %1 to <16 x i8>
; AVX512F-NEXT: vpand %ymm1, %ymm0, %ymm0
; AVX512F-NEXT: vpmovsxwd %ymm0, %zmm0
; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: trunc_and_v16i16_v16i8:
; AVX512BW-NEXT: vpand %ymm1, %ymm0, %ymm0
; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
; AVX512BW-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512DQ-LABEL: trunc_and_v16i16_v16i8:
; AVX512DQ-NEXT: vpand %ymm1, %ymm0, %ymm0
; AVX512DQ-NEXT: vpmovsxwd %ymm0, %zmm0
; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
%1 = and <16 x i16> %a0, %a1
%2 = trunc <16 x i16> %1 to <16 x i8>
; AVX512-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
; AVX512-NEXT: vpmovqd %zmm0, %ymm0
; AVX512-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
+; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
%1 = and <4 x i64> %a0, <i64 0, i64 1, i64 2, i64 3>
%2 = trunc <4 x i64> %1 to <4 x i32>
; AVX512: # BB#0:
; AVX512-NEXT: vpmovqw %zmm0, %xmm0
; AVX512-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
+; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
%1 = and <8 x i64> %a0, <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7>
%2 = trunc <8 x i64> %1 to <8 x i16>
; AVX512-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
; AVX512-NEXT: vpmovdw %zmm0, %ymm0
; AVX512-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
+; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
%1 = and <8 x i32> %a0, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
%2 = trunc <8 x i32> %1 to <8 x i16>
; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
; AVX512F-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
+; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: trunc_and_const_v16i64_v16i8:
; AVX512BW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512BW-NEXT: vpmovdb %zmm0, %xmm0
; AVX512BW-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
+; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512DQ-LABEL: trunc_and_const_v16i64_v16i8:
; AVX512DQ-NEXT: vinserti32x8 $1, %ymm1, %zmm0, %zmm0
; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
; AVX512DQ-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
+; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
%1 = and <16 x i64> %a0, <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11, i64 12, i64 13, i64 14, i64 15>
%2 = trunc <16 x i64> %1 to <16 x i8>
; AVX512: # BB#0:
; AVX512-NEXT: vpmovdb %zmm0, %xmm0
; AVX512-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
+; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
%1 = and <16 x i32> %a0, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
%2 = trunc <16 x i32> %1 to <16 x i8>
; AVX512F-NEXT: vpmovsxwd %ymm0, %zmm0
; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
; AVX512F-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
+; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: trunc_and_const_v16i16_v16i8:
; AVX512BW-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
; AVX512BW-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
+; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512DQ-LABEL: trunc_and_const_v16i16_v16i8:
; AVX512DQ-NEXT: vpmovsxwd %ymm0, %zmm0
; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
; AVX512DQ-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
+; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
%1 = and <16 x i16> %a0, <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>
%2 = trunc <16 x i16> %1 to <16 x i8>
; AVX512-NEXT: vpxor %ymm1, %ymm0, %ymm0
; AVX512-NEXT: vpmovqd %zmm0, %ymm0
; AVX512-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
%1 = xor <4 x i64> %a0, %a1
%2 = trunc <4 x i64> %1 to <4 x i32>
; AVX512: # BB#0:
; AVX512-NEXT: vpxorq %zmm1, %zmm0, %zmm0
; AVX512-NEXT: vpmovqw %zmm0, %xmm0
+; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
%1 = xor <8 x i64> %a0, %a1
%2 = trunc <8 x i64> %1 to <8 x i16>
; AVX512-NEXT: vpxor %ymm1, %ymm0, %ymm0
; AVX512-NEXT: vpmovdw %zmm0, %ymm0
; AVX512-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
%1 = xor <8 x i32> %a0, %a1
%2 = trunc <8 x i32> %1 to <8 x i16>
; AVX512F-NEXT: vpmovqd %zmm1, %ymm1
; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: trunc_xor_v16i64_v16i8:
; AVX512BW-NEXT: vpmovqd %zmm1, %ymm1
; AVX512BW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512BW-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512DQ-LABEL: trunc_xor_v16i64_v16i8:
; AVX512DQ-NEXT: vpmovqd %zmm1, %ymm1
; AVX512DQ-NEXT: vinserti32x8 $1, %ymm1, %zmm0, %zmm0
; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
%1 = xor <16 x i64> %a0, %a1
%2 = trunc <16 x i64> %1 to <16 x i8>
; AVX512: # BB#0:
; AVX512-NEXT: vpxord %zmm1, %zmm0, %zmm0
; AVX512-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
%1 = xor <16 x i32> %a0, %a1
%2 = trunc <16 x i32> %1 to <16 x i8>
; AVX512F-NEXT: vpxor %ymm1, %ymm0, %ymm0
; AVX512F-NEXT: vpmovsxwd %ymm0, %zmm0
; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: trunc_xor_v16i16_v16i8:
; AVX512BW-NEXT: vpxor %ymm1, %ymm0, %ymm0
; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
; AVX512BW-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512DQ-LABEL: trunc_xor_v16i16_v16i8:
; AVX512DQ-NEXT: vpxor %ymm1, %ymm0, %ymm0
; AVX512DQ-NEXT: vpmovsxwd %ymm0, %zmm0
; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
%1 = xor <16 x i16> %a0, %a1
%2 = trunc <16 x i16> %1 to <16 x i8>
; AVX512-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
; AVX512-NEXT: vpmovqd %zmm0, %ymm0
; AVX512-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0
+; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
%1 = xor <4 x i64> %a0, <i64 0, i64 1, i64 2, i64 3>
%2 = trunc <4 x i64> %1 to <4 x i32>
; AVX512: # BB#0:
; AVX512-NEXT: vpmovqw %zmm0, %xmm0
; AVX512-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0
+; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
%1 = xor <8 x i64> %a0, <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7>
%2 = trunc <8 x i64> %1 to <8 x i16>
; AVX512-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
; AVX512-NEXT: vpmovdw %zmm0, %ymm0
; AVX512-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0
+; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
%1 = xor <8 x i32> %a0, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
%2 = trunc <8 x i32> %1 to <8 x i16>
; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
; AVX512F-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0
+; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: trunc_xor_const_v16i64_v16i8:
; AVX512BW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512BW-NEXT: vpmovdb %zmm0, %xmm0
; AVX512BW-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0
+; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512DQ-LABEL: trunc_xor_const_v16i64_v16i8:
; AVX512DQ-NEXT: vinserti32x8 $1, %ymm1, %zmm0, %zmm0
; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
; AVX512DQ-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0
+; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
%1 = xor <16 x i64> %a0, <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11, i64 12, i64 13, i64 14, i64 15>
%2 = trunc <16 x i64> %1 to <16 x i8>
; AVX512: # BB#0:
; AVX512-NEXT: vpmovdb %zmm0, %xmm0
; AVX512-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0
+; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
%1 = xor <16 x i32> %a0, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
%2 = trunc <16 x i32> %1 to <16 x i8>
; AVX512F-NEXT: vpmovsxwd %ymm0, %zmm0
; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
; AVX512F-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0
+; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: trunc_xor_const_v16i16_v16i8:
; AVX512BW-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
; AVX512BW-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0
+; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512DQ-LABEL: trunc_xor_const_v16i16_v16i8:
; AVX512DQ-NEXT: vpmovsxwd %ymm0, %zmm0
; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
; AVX512DQ-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0
+; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
%1 = xor <16 x i16> %a0, <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>
%2 = trunc <16 x i16> %1 to <16 x i8>
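; These check lines are kept up to date with update_llc_test_checks.py-style
; regeneration. The actual RUN lines sit outside this diff; a representative
; invocation (the triple and exact prefix set are assumptions) looks like:
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=AVX512,AVX512F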
; AVX512-NEXT: vpor %ymm1, %ymm0, %ymm0
; AVX512-NEXT: vpmovqd %zmm0, %ymm0
; AVX512-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
%1 = or <4 x i64> %a0, %a1
%2 = trunc <4 x i64> %1 to <4 x i32>
; AVX512: # BB#0:
; AVX512-NEXT: vporq %zmm1, %zmm0, %zmm0
; AVX512-NEXT: vpmovqw %zmm0, %xmm0
+; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
%1 = or <8 x i64> %a0, %a1
%2 = trunc <8 x i64> %1 to <8 x i16>
; AVX512-NEXT: vpor %ymm1, %ymm0, %ymm0
; AVX512-NEXT: vpmovdw %zmm0, %ymm0
; AVX512-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
%1 = or <8 x i32> %a0, %a1
%2 = trunc <8 x i32> %1 to <8 x i16>
; AVX512F-NEXT: vpmovqd %zmm1, %ymm1
; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: trunc_or_v16i64_v16i8:
; AVX512BW-NEXT: vpmovqd %zmm1, %ymm1
; AVX512BW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512BW-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512DQ-LABEL: trunc_or_v16i64_v16i8:
; AVX512DQ-NEXT: vpmovqd %zmm1, %ymm1
; AVX512DQ-NEXT: vinserti32x8 $1, %ymm1, %zmm0, %zmm0
; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
%1 = or <16 x i64> %a0, %a1
%2 = trunc <16 x i64> %1 to <16 x i8>
; AVX512: # BB#0:
; AVX512-NEXT: vpord %zmm1, %zmm0, %zmm0
; AVX512-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
%1 = or <16 x i32> %a0, %a1
%2 = trunc <16 x i32> %1 to <16 x i8>
; AVX512F-NEXT: vpor %ymm1, %ymm0, %ymm0
; AVX512F-NEXT: vpmovsxwd %ymm0, %zmm0
; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: trunc_or_v16i16_v16i8:
; AVX512BW-NEXT: vpor %ymm1, %ymm0, %ymm0
; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
; AVX512BW-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512DQ-LABEL: trunc_or_v16i16_v16i8:
; AVX512DQ-NEXT: vpor %ymm1, %ymm0, %ymm0
; AVX512DQ-NEXT: vpmovsxwd %ymm0, %zmm0
; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
%1 = or <16 x i16> %a0, %a1
%2 = trunc <16 x i16> %1 to <16 x i8>
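; The or tests mirror the xor tests above, with vpor/vpord/vporq in place of
; the xor forms. The label above confirms this one's name; the body is a
; sketch:
define <16 x i8> @trunc_or_v16i16_v16i8(<16 x i16> %a0, <16 x i16> %a1) nounwind {
  %1 = or <16 x i16> %a0, %a1
  %2 = trunc <16 x i16> %1 to <16 x i8>
  ret <16 x i8> %2
}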
; AVX512-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
; AVX512-NEXT: vpmovqd %zmm0, %ymm0
; AVX512-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0
+; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
%1 = or <4 x i64> %a0, <i64 0, i64 1, i64 2, i64 3>
%2 = trunc <4 x i64> %1 to <4 x i32>
; AVX512: # BB#0:
; AVX512-NEXT: vpmovqw %zmm0, %xmm0
; AVX512-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0
+; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
%1 = or <8 x i64> %a0, <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7>
%2 = trunc <8 x i64> %1 to <8 x i16>
; AVX512-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
; AVX512-NEXT: vpmovdw %zmm0, %ymm0
; AVX512-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0
+; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
%1 = or <8 x i32> %a0, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
%2 = trunc <8 x i32> %1 to <8 x i16>
; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
; AVX512F-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0
+; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: trunc_or_const_v16i64_v16i8:
; AVX512BW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512BW-NEXT: vpmovdb %zmm0, %xmm0
; AVX512BW-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0
+; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512DQ-LABEL: trunc_or_const_v16i64_v16i8:
; AVX512DQ-NEXT: vinserti32x8 $1, %ymm1, %zmm0, %zmm0
; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
; AVX512DQ-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0
+; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
%1 = or <16 x i64> %a0, <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11, i64 12, i64 13, i64 14, i64 15>
%2 = trunc <16 x i64> %1 to <16 x i8>
; AVX512: # BB#0:
; AVX512-NEXT: vpmovdb %zmm0, %xmm0
; AVX512-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0
+; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
%1 = or <16 x i32> %a0, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
%2 = trunc <16 x i32> %1 to <16 x i8>
; AVX512F-NEXT: vpmovsxwd %ymm0, %zmm0
; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
; AVX512F-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0
+; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: trunc_or_const_v16i16_v16i8:
; AVX512BW-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
; AVX512BW-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0
+; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512DQ-LABEL: trunc_or_const_v16i16_v16i8:
; AVX512DQ-NEXT: vpmovsxwd %ymm0, %zmm0
; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
; AVX512DQ-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0
+; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
%1 = or <16 x i16> %a0, <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>
%2 = trunc <16 x i16> %1 to <16 x i8>
; AVX512-LABEL: trunc8i64_8i16:
; AVX512: # BB#0: # %entry
; AVX512-NEXT: vpmovqw %zmm0, %xmm0
+; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
entry:
%0 = trunc <8 x i64> %a to <8 x i16>
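; trunc8i64_8i16 is the simplest shape in this group: the <8 x i64> argument
; arrives in zmm0, vpmovqw narrows it straight into xmm0, and the dirty upper
; state is cleared before returning. Sketch of the body:
define <8 x i16> @trunc8i64_8i16(<8 x i64> %a) nounwind {
entry:
  %0 = trunc <8 x i64> %a to <8 x i16>
  ret <8 x i16> %0
}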
; AVX512-LABEL: trunc8i64_8i8:
; AVX512: # BB#0: # %entry
; AVX512-NEXT: vpmovqb %zmm0, (%rax)
+; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
entry:
%0 = trunc <8 x i64> %a to <8 x i8>
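; trunc8i64_8i8 stores its result rather than returning it; the bare (%rax)
; with no preceding address computation suggests the store-to-undef idiom
; these tests commonly use. Hedged sketch:
define void @trunc8i64_8i8(<8 x i64> %a) nounwind {
entry:
  %0 = trunc <8 x i64> %a to <8 x i8>
  store <8 x i8> %0, <8 x i8>* undef, align 4
  ret void
}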
; AVX512F-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
; AVX512F-NEXT: vpmovdw %zmm0, %ymm0
; AVX512F-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: trunc8i32_8i16:
; AVX512VL: # BB#0: # %entry
; AVX512VL-NEXT: vpmovdw %ymm0, %xmm0
+; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: trunc8i32_8i16:
; AVX512BW-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
; AVX512BW-NEXT: vpmovdw %zmm0, %ymm0
; AVX512BW-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: trunc8i32_8i16:
; AVX512BWVL: # BB#0: # %entry
; AVX512BWVL-NEXT: vpmovdw %ymm0, %xmm0
+; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
entry:
%0 = trunc <8 x i32> %a to <8 x i16>
; AVX512F-NEXT: vpmovdw %zmm0, %ymm0
; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
; AVX512F-NEXT: vmovq %xmm0, (%rax)
+; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: trunc8i32_8i8:
; AVX512VL: # BB#0: # %entry
; AVX512VL-NEXT: vpmovdb %ymm0, (%rax)
+; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: trunc8i32_8i8:
; AVX512BW-NEXT: vpmovdw %zmm0, %ymm0
; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
; AVX512BW-NEXT: vmovq %xmm0, (%rax)
+; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: trunc8i32_8i8:
; AVX512BWVL: # BB#0: # %entry
; AVX512BWVL-NEXT: vpmovdb %ymm0, (%rax)
+; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
entry:
%0 = trunc <8 x i32> %a to <8 x i8>
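; On the VL targets the truncate-to-memory is a single vpmovdb from a YMM
; source, so no ZMM register is written at all; the <8 x i32> argument itself
; arrives in ymm0, and that live-in is presumably what leaves the upper state
; dirty. Sketch (signature and store target assumed):
define void @trunc8i32_8i8(<8 x i32> %a) nounwind {
entry:
  %0 = trunc <8 x i32> %a to <8 x i8>
  store <8 x i8> %0, <8 x i8>* undef, align 4
  ret void
}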
; AVX512-LABEL: trunc16i32_16i16:
; AVX512: # BB#0: # %entry
; AVX512-NEXT: vpmovdw %zmm0, (%rax)
+; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
entry:
%0 = trunc <16 x i32> %a to <16 x i16>
; AVX512-LABEL: trunc16i32_16i8:
; AVX512: # BB#0: # %entry
; AVX512-NEXT: vpmovdb %zmm0, (%rax)
+; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
entry:
%0 = trunc <16 x i32> %a to <16 x i8>
; AVX512F-NEXT: vpmovsxwd %ymm0, %zmm0
; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
; AVX512F-NEXT: vmovdqu %xmm0, (%rax)
+; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: trunc16i16_16i8:
; AVX512VL-NEXT: vpmovsxwd %ymm0, %zmm0
; AVX512VL-NEXT: vpmovdb %zmm0, %xmm0
; AVX512VL-NEXT: vmovdqu %xmm0, (%rax)
+; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: trunc16i16_16i8:
; AVX512BW-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
; AVX512BW-NEXT: vmovdqu %xmm0, (%rax)
+; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: trunc16i16_16i8:
; AVX512BWVL: # BB#0: # %entry
; AVX512BWVL-NEXT: vpmovwb %ymm0, (%rax)
+; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
entry:
%0 = trunc <16 x i16> %a to <16 x i8>
; AVX512F-NEXT: vpmovdb %zmm1, %xmm1
; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX512F-NEXT: vmovdqu %ymm0, (%rax)
+; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: trunc32i16_32i8:
; AVX512VL-NEXT: vpmovdb %zmm1, %xmm1
; AVX512VL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX512VL-NEXT: vmovdqu %ymm0, (%rax)
+; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: trunc32i16_32i8:
; AVX512BW: # BB#0: # %entry
; AVX512BW-NEXT: vpmovwb %zmm0, (%rax)
+; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: trunc32i16_32i8:
; AVX512BWVL: # BB#0: # %entry
; AVX512BWVL-NEXT: vpmovwb %zmm0, (%rax)
+; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
entry:
%0 = trunc <32 x i16> %a to <32 x i8>
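; With AVX512BW the whole <32 x i16> fits in one ZMM register and vpmovwb
; writes all 32 result bytes to memory in one instruction. Sketch, assuming
; the same store-to-undef idiom as the neighboring tests:
define void @trunc32i16_32i8(<32 x i16> %a) nounwind {
entry:
  %0 = trunc <32 x i16> %a to <32 x i8>
  store <32 x i8> %0, <32 x i8>* undef, align 4
  ret void
}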
; AVX512F-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX512F-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: trunc2x4i64_8i16:
; AVX512VL-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
; AVX512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: trunc2x4i64_8i16:
; AVX512BW-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX512BW-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: trunc2x4i64_8i16:
; AVX512BWVL-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; AVX512BWVL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
entry:
%0 = trunc <4 x i64> %a to <4 x i16>
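; trunc2x4i64_8i16 truncates two <4 x i64> arguments and concatenates the
; halves; the tail shuffles run on XMM, but the YMM argument registers are
; live into the function, so the pass still sees dirty upper state. Sketch
; (the shufflevector mask is an assumption):
define <8 x i16> @trunc2x4i64_8i16(<4 x i64> %a, <4 x i64> %b) nounwind {
entry:
  %0 = trunc <4 x i64> %a to <4 x i16>
  %1 = trunc <4 x i64> %b to <4 x i16>
  %2 = shufflevector <4 x i16> %0, <4 x i16> %1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  ret <8 x i16> %2
}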
; AVX512CD-NEXT: vplzcntq %zmm0, %zmm0
; AVX512CD-NEXT: vmovdqa {{.*#+}} xmm1 = [63,63]
; AVX512CD-NEXT: vpsubq %xmm0, %xmm1, %xmm0
+; AVX512CD-NEXT: vzeroupper
; AVX512CD-NEXT: retq
;
; X32-SSE-LABEL: testv2i64u:
; AVX512CD-NEXT: vplzcntd %zmm0, %zmm0
; AVX512CD-NEXT: vpbroadcastd {{.*}}(%rip), %xmm1
; AVX512CD-NEXT: vpsubd %xmm0, %xmm1, %xmm0
+; AVX512CD-NEXT: vzeroupper
; AVX512CD-NEXT: retq
;
; X32-SSE-LABEL: testv4i32u:
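; These last hunks are from the 128-bit bit-count tests: on AVX512CD the
; 128-bit input is widened so vplzcntq/vplzcntd can execute on a ZMM register,
; and the final subtract from the element width minus one is consistent with a
; cttz-via-lzcnt lowering (cttz(x) = 63 - lzcnt(x & -x) for i64). A hedged
; sketch, assuming the llvm.cttz intrinsic with the zero-is-undef flag set:
define <2 x i64> @testv2i64u(<2 x i64> %in) nounwind {
  %out = call <2 x i64> @llvm.cttz.v2i64(<2 x i64> %in, i1 -1)
  ret <2 x i64> %out
}
declare <2 x i64> @llvm.cttz.v2i64(<2 x i64>, i1)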