return true;
}
-// TODO: This method of analysis can miss some legal cases, because the
-// super-register could be live into the address expression for a memory
-// reference for the instruction, and still be killed/last used by the
-// instruction. However, the existing query interfaces don't seem to
-// easily allow that to be checked.
-//
-// What we'd really like to know is whether after OrigMI, the
-// only portion of SuperDestReg that is alive is the portion that
-// was the destination register of OrigMI.
+/// Check if register \p Reg is live after \p MI.
+///
+/// \p LiveRegs should describe the liveness information at exactly this
+/// point, as this function tries to refine the analysis made by
+/// \p LiveRegs by exploiting knowledge about the particular instruction
+/// \p MI. \p MI is expected to be one of the MOVs handled by the
+/// X86FixupBWInsts pass.
+/// Note: similarly to LivePhysRegs::contains, this will report that a
+/// super-register is not used if only some part of it is used.
+///
+/// The X86 backend does not have subregister liveness tracking enabled,
+/// so the liveness information might be overly conservative. However,
+/// for some specific instructions (this pass only cares about MOVs) we
+/// can produce more precise results by analyzing that MOV's operands.
+///
+/// Indeed, if the super-register is not live before the MOV, it means it
+/// was originally <read-undef>, so we are free to modify those undef
+/// upper bits. That may happen when the use is in another MBB and the
+/// vreg/physreg corresponding to the MOV is wider than necessary
+/// (e.g. due to register coalescing with a "truncate" copy).
+/// So, it handles patterns like this:
+///
+/// BB#2: derived from LLVM BB %if.then
+/// Live Ins: %RDI
+/// Predecessors according to CFG: BB#0
+/// %AX<def> = MOV16rm %RDI<kill>, 1, %noreg, 0, %noreg, %EAX<imp-def>; mem:LD2[%p]
+///   (Note: there is no %EAX<imp-use> on the load above.)
+/// Successors according to CFG: BB#3(?%)
+///
+/// BB#3: derived from LLVM BB %if.end
+/// Live Ins: %EAX (only %AX is actually live)
+/// Predecessors according to CFG: BB#2 BB#1
+/// %AX<def> = KILL %AX, %EAX<imp-use,kill>
+/// RET 0, %AX
+static bool isLive(const MachineInstr &MI,
+ const LivePhysRegs &LiveRegs,
+ const TargetRegisterInfo *TRI,
+ unsigned Reg) {
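+ // If even the conservative liveness information reports Reg as dead at
+ // this point, it is certainly dead and there is nothing more to check.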
+ if (!LiveRegs.contains(Reg))
+ return false;
+
+ unsigned Opc = MI.getOpcode(); (void)Opc;
+ // These are the opcodes currently handled by the pass. If something
+ // else is added, we need to ensure that the new opcode has the same
+ // properties.
+ assert((Opc == X86::MOV8rm || Opc == X86::MOV16rm || Opc == X86::MOV8rr ||
+ Opc == X86::MOV16rr) &&
+ "Unexpected opcode.");
+
+ bool IsDefined = false;
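+ // Scan the MOV's implicit operands: an implicit use of a super-register
+ // of Reg means Reg was live before the MOV, while a sole implicit def
+ // means the liveness reported for Reg comes only from this MOV.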
+ for (auto &MO : MI.implicit_operands()) {
+ if (!MO.isReg())
+ continue;
+
+ assert((MO.isDef() || MO.isUse()) && "Expected Def or Use only!");
+
+ for (MCSuperRegIterator Supers(Reg, TRI, true); Supers.isValid(); ++Supers) {
+ if (*Supers == MO.getReg()) {
+ if (MO.isDef())
+ IsDefined = true;
+ else
+ return true; // SuperReg is imp-used -> live before the MI.
+ }
+ }
+ }
+ // Reg is not Imp-def'ed -> it's live both before/after the instruction.
+ if (!IsDefined)
+ return true;
+
+ // Otherwise, the Reg is not live before the MI and the MOV can't
+ // make it really live, so it's in fact dead even after the MI.
+ return false;
+}
+
+/// \brief Check whether, after \p OrigMI, the only live portion of the
+/// super-register of \p OrigMI's destination register is that destination
+/// register itself.
+///
+/// If so, return that super-register in \p SuperDestReg.
bool FixupBWInstPass::getSuperRegDestIfDead(MachineInstr *OrigMI,
unsigned &SuperDestReg) const {
auto *TRI = &TII->getRegisterInfo();
if (SubRegIdx == X86::sub_8bit_hi)
return false;
- if (LiveRegs.contains(SuperDestReg))
+ if (isLive(*OrigMI, LiveRegs, TRI, SuperDestReg))
return false;
if (SubRegIdx == X86::sub_8bit) {
unsigned UpperByteReg =
getX86SubSuperRegister(SuperDestReg, 8, /*High=*/true);
- if (LiveRegs.contains(UpperByteReg))
+ if (isLive(*OrigMI, LiveRegs, TRI, UpperByteReg))
return false;
}
for (auto I = MBB.rbegin(); I != MBB.rend(); ++I) {
MachineInstr *MI = &*I;
-
+
if (MachineInstr *NewMI = tryReplaceInstr(MI, MBB))
MIReplacements.push_back(std::make_pair(MI, NewMI));
define <4 x i64> @test_mm256_insert_epi16(<4 x i64> %a0, i16 %a1) nounwind {
; X32-LABEL: test_mm256_insert_epi16:
; X32: # BB#0:
-; X32-NEXT: movw {{[0-9]+}}(%esp), %ax
+; X32-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X32-NEXT: vextractf128 $1, %ymm0, %xmm1
; X32-NEXT: vpinsrw $6, %eax, %xmm1, %xmm1
; X32-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
define <4 x i64> @test_mm256_set_epi16(i16 %a0, i16 %a1, i16 %a2, i16 %a3, i16 %a4, i16 %a5, i16 %a6, i16 %a7, i16 %a8, i16 %a9, i16 %a10, i16 %a11, i16 %a12, i16 %a13, i16 %a14, i16 %a15) nounwind {
; X32-LABEL: test_mm256_set_epi16:
; X32: # BB#0:
-; X32-NEXT: movw {{[0-9]+}}(%esp), %ax
+; X32-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X32-NEXT: vmovd %eax, %xmm0
-; X32-NEXT: movw {{[0-9]+}}(%esp), %ax
+; X32-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X32-NEXT: vpinsrw $1, %eax, %xmm0, %xmm0
-; X32-NEXT: movw {{[0-9]+}}(%esp), %ax
+; X32-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X32-NEXT: vpinsrw $2, %eax, %xmm0, %xmm0
-; X32-NEXT: movw {{[0-9]+}}(%esp), %ax
+; X32-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X32-NEXT: vpinsrw $3, %eax, %xmm0, %xmm0
-; X32-NEXT: movw {{[0-9]+}}(%esp), %ax
+; X32-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X32-NEXT: vpinsrw $4, %eax, %xmm0, %xmm0
-; X32-NEXT: movw {{[0-9]+}}(%esp), %ax
+; X32-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X32-NEXT: vpinsrw $5, %eax, %xmm0, %xmm0
-; X32-NEXT: movw {{[0-9]+}}(%esp), %ax
+; X32-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X32-NEXT: vpinsrw $6, %eax, %xmm0, %xmm0
-; X32-NEXT: movw {{[0-9]+}}(%esp), %ax
+; X32-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X32-NEXT: vpinsrw $7, %eax, %xmm0, %xmm0
-; X32-NEXT: movw {{[0-9]+}}(%esp), %ax
+; X32-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X32-NEXT: vmovd %eax, %xmm1
-; X32-NEXT: movw {{[0-9]+}}(%esp), %ax
+; X32-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X32-NEXT: vpinsrw $1, %eax, %xmm1, %xmm1
-; X32-NEXT: movw {{[0-9]+}}(%esp), %ax
+; X32-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X32-NEXT: vpinsrw $2, %eax, %xmm1, %xmm1
-; X32-NEXT: movw {{[0-9]+}}(%esp), %ax
+; X32-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X32-NEXT: vpinsrw $3, %eax, %xmm1, %xmm1
-; X32-NEXT: movw {{[0-9]+}}(%esp), %ax
+; X32-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X32-NEXT: vpinsrw $4, %eax, %xmm1, %xmm1
-; X32-NEXT: movw {{[0-9]+}}(%esp), %ax
+; X32-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X32-NEXT: vpinsrw $5, %eax, %xmm1, %xmm1
-; X32-NEXT: movw {{[0-9]+}}(%esp), %ax
+; X32-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X32-NEXT: vpinsrw $6, %eax, %xmm1, %xmm1
-; X32-NEXT: movw {{[0-9]+}}(%esp), %ax
+; X32-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X32-NEXT: vpinsrw $7, %eax, %xmm1, %xmm1
; X32-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm256_set_epi16:
; X64: # BB#0:
-; X64-NEXT: movw {{[0-9]+}}(%rsp), %ax
+; X64-NEXT: movzwl {{[0-9]+}}(%rsp), %eax
; X64-NEXT: vmovd %eax, %xmm0
-; X64-NEXT: movw {{[0-9]+}}(%rsp), %ax
+; X64-NEXT: movzwl {{[0-9]+}}(%rsp), %eax
; X64-NEXT: vpinsrw $1, %eax, %xmm0, %xmm0
; X64-NEXT: vpinsrw $2, %r9d, %xmm0, %xmm0
; X64-NEXT: vpinsrw $3, %r8d, %xmm0, %xmm0
; X64-NEXT: vpinsrw $5, %edx, %xmm0, %xmm0
; X64-NEXT: vpinsrw $6, %esi, %xmm0, %xmm0
; X64-NEXT: vpinsrw $7, %edi, %xmm0, %xmm0
-; X64-NEXT: movw {{[0-9]+}}(%rsp), %ax
+; X64-NEXT: movzwl {{[0-9]+}}(%rsp), %eax
; X64-NEXT: vmovd %eax, %xmm1
-; X64-NEXT: movw {{[0-9]+}}(%rsp), %ax
+; X64-NEXT: movzwl {{[0-9]+}}(%rsp), %eax
; X64-NEXT: vpinsrw $1, %eax, %xmm1, %xmm1
-; X64-NEXT: movw {{[0-9]+}}(%rsp), %ax
+; X64-NEXT: movzwl {{[0-9]+}}(%rsp), %eax
; X64-NEXT: vpinsrw $2, %eax, %xmm1, %xmm1
-; X64-NEXT: movw {{[0-9]+}}(%rsp), %ax
+; X64-NEXT: movzwl {{[0-9]+}}(%rsp), %eax
; X64-NEXT: vpinsrw $3, %eax, %xmm1, %xmm1
-; X64-NEXT: movw {{[0-9]+}}(%rsp), %ax
+; X64-NEXT: movzwl {{[0-9]+}}(%rsp), %eax
; X64-NEXT: vpinsrw $4, %eax, %xmm1, %xmm1
-; X64-NEXT: movw {{[0-9]+}}(%rsp), %ax
+; X64-NEXT: movzwl {{[0-9]+}}(%rsp), %eax
; X64-NEXT: vpinsrw $5, %eax, %xmm1, %xmm1
-; X64-NEXT: movw {{[0-9]+}}(%rsp), %ax
+; X64-NEXT: movzwl {{[0-9]+}}(%rsp), %eax
; X64-NEXT: vpinsrw $6, %eax, %xmm1, %xmm1
-; X64-NEXT: movw {{[0-9]+}}(%rsp), %ax
+; X64-NEXT: movzwl {{[0-9]+}}(%rsp), %eax
; X64-NEXT: vpinsrw $7, %eax, %xmm1, %xmm1
; X64-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; X64-NEXT: retq
define <4 x i64> @test_mm256_set1_epi16(i16 %a0) nounwind {
; X32-LABEL: test_mm256_set1_epi16:
; X32: # BB#0:
-; X32-NEXT: movw {{[0-9]+}}(%esp), %ax
+; X32-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X32-NEXT: vmovd %eax, %xmm0
; X32-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
; X32-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
define <4 x i64> @test_mm256_setr_epi16(i16 %a0, i16 %a1, i16 %a2, i16 %a3, i16 %a4, i16 %a5, i16 %a6, i16 %a7, i16 %a8, i16 %a9, i16 %a10, i16 %a11, i16 %a12, i16 %a13, i16 %a14, i16 %a15) nounwind {
; X32-LABEL: test_mm256_setr_epi16:
; X32: # BB#0:
-; X32-NEXT: movw {{[0-9]+}}(%esp), %ax
+; X32-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X32-NEXT: vmovd %eax, %xmm0
-; X32-NEXT: movw {{[0-9]+}}(%esp), %ax
+; X32-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X32-NEXT: vpinsrw $1, %eax, %xmm0, %xmm0
-; X32-NEXT: movw {{[0-9]+}}(%esp), %ax
+; X32-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X32-NEXT: vpinsrw $2, %eax, %xmm0, %xmm0
-; X32-NEXT: movw {{[0-9]+}}(%esp), %ax
+; X32-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X32-NEXT: vpinsrw $3, %eax, %xmm0, %xmm0
-; X32-NEXT: movw {{[0-9]+}}(%esp), %ax
+; X32-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X32-NEXT: vpinsrw $4, %eax, %xmm0, %xmm0
-; X32-NEXT: movw {{[0-9]+}}(%esp), %ax
+; X32-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X32-NEXT: vpinsrw $5, %eax, %xmm0, %xmm0
-; X32-NEXT: movw {{[0-9]+}}(%esp), %ax
+; X32-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X32-NEXT: vpinsrw $6, %eax, %xmm0, %xmm0
-; X32-NEXT: movw {{[0-9]+}}(%esp), %ax
+; X32-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X32-NEXT: vpinsrw $7, %eax, %xmm0, %xmm0
-; X32-NEXT: movw {{[0-9]+}}(%esp), %ax
+; X32-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X32-NEXT: vmovd %eax, %xmm1
-; X32-NEXT: movw {{[0-9]+}}(%esp), %ax
+; X32-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X32-NEXT: vpinsrw $1, %eax, %xmm1, %xmm1
-; X32-NEXT: movw {{[0-9]+}}(%esp), %ax
+; X32-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X32-NEXT: vpinsrw $2, %eax, %xmm1, %xmm1
-; X32-NEXT: movw {{[0-9]+}}(%esp), %ax
+; X32-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X32-NEXT: vpinsrw $3, %eax, %xmm1, %xmm1
-; X32-NEXT: movw {{[0-9]+}}(%esp), %ax
+; X32-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X32-NEXT: vpinsrw $4, %eax, %xmm1, %xmm1
-; X32-NEXT: movw {{[0-9]+}}(%esp), %ax
+; X32-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X32-NEXT: vpinsrw $5, %eax, %xmm1, %xmm1
-; X32-NEXT: movw {{[0-9]+}}(%esp), %ax
+; X32-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X32-NEXT: vpinsrw $6, %eax, %xmm1, %xmm1
-; X32-NEXT: movw {{[0-9]+}}(%esp), %ax
+; X32-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X32-NEXT: vpinsrw $7, %eax, %xmm1, %xmm1
; X32-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm256_setr_epi16:
; X64: # BB#0:
-; X64-NEXT: movw {{[0-9]+}}(%rsp), %ax
+; X64-NEXT: movzwl {{[0-9]+}}(%rsp), %eax
; X64-NEXT: vmovd %eax, %xmm0
-; X64-NEXT: movw {{[0-9]+}}(%rsp), %ax
+; X64-NEXT: movzwl {{[0-9]+}}(%rsp), %eax
; X64-NEXT: vpinsrw $1, %eax, %xmm0, %xmm0
-; X64-NEXT: movw {{[0-9]+}}(%rsp), %ax
+; X64-NEXT: movzwl {{[0-9]+}}(%rsp), %eax
; X64-NEXT: vpinsrw $2, %eax, %xmm0, %xmm0
-; X64-NEXT: movw {{[0-9]+}}(%rsp), %ax
+; X64-NEXT: movzwl {{[0-9]+}}(%rsp), %eax
; X64-NEXT: vpinsrw $3, %eax, %xmm0, %xmm0
-; X64-NEXT: movw {{[0-9]+}}(%rsp), %ax
+; X64-NEXT: movzwl {{[0-9]+}}(%rsp), %eax
; X64-NEXT: vpinsrw $4, %eax, %xmm0, %xmm0
-; X64-NEXT: movw {{[0-9]+}}(%rsp), %ax
+; X64-NEXT: movzwl {{[0-9]+}}(%rsp), %eax
; X64-NEXT: vpinsrw $5, %eax, %xmm0, %xmm0
-; X64-NEXT: movw {{[0-9]+}}(%rsp), %ax
+; X64-NEXT: movzwl {{[0-9]+}}(%rsp), %eax
; X64-NEXT: vpinsrw $6, %eax, %xmm0, %xmm0
-; X64-NEXT: movw {{[0-9]+}}(%rsp), %ax
+; X64-NEXT: movzwl {{[0-9]+}}(%rsp), %eax
; X64-NEXT: vpinsrw $7, %eax, %xmm0, %xmm0
; X64-NEXT: vmovd %edi, %xmm1
; X64-NEXT: vpinsrw $1, %esi, %xmm1, %xmm1
; X64-NEXT: vpinsrw $3, %ecx, %xmm1, %xmm1
; X64-NEXT: vpinsrw $4, %r8d, %xmm1, %xmm1
; X64-NEXT: vpinsrw $5, %r9d, %xmm1, %xmm1
-; X64-NEXT: movw {{[0-9]+}}(%rsp), %ax
+; X64-NEXT: movzwl {{[0-9]+}}(%rsp), %eax
; X64-NEXT: vpinsrw $6, %eax, %xmm1, %xmm1
-; X64-NEXT: movw {{[0-9]+}}(%rsp), %ax
+; X64-NEXT: movzwl {{[0-9]+}}(%rsp), %eax
; X64-NEXT: vpinsrw $7, %eax, %xmm1, %xmm1
; X64-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; X64-NEXT: retq
; X64: ## BB#0: ## %entry
; X64-NEXT: vxorps %xmm0, %xmm0, %xmm0
; X64-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movw (%rdi), %ax
+; X64-NEXT: movzwl (%rdi), %eax
; X64-NEXT: vmovd %eax, %xmm1
; X64-NEXT: vpbroadcastw %xmm1, %xmm1
; X64-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
; X64-NEXT: subq $128, %rsp
; X64-NEXT: vxorps %xmm0, %xmm0, %xmm0
; X64-NEXT: vmovaps %ymm0, (%rsp)
-; X64-NEXT: movw (%rdi), %ax
+; X64-NEXT: movzwl (%rdi), %eax
; X64-NEXT: vmovd %eax, %xmm1
; X64-NEXT: vpbroadcastw %xmm1, %ymm1
; X64-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
define <8 x i64> @test_mm512_mask_broadcastd_epi32(<8 x i64> %a0, i16 %a1, <2 x i64> %a2) {
; X32-LABEL: test_mm512_mask_broadcastd_epi32:
; X32: # BB#0:
-; X32-NEXT: movw {{[0-9]+}}(%esp), %ax
+; X32-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X32-NEXT: kmovw %eax, %k1
; X32-NEXT: vpbroadcastd %xmm1, %zmm0 {%k1}
; X32-NEXT: retl
define <8 x i64> @test_mm512_maskz_broadcastd_epi32(i16 %a0, <2 x i64> %a1) {
; X32-LABEL: test_mm512_maskz_broadcastd_epi32:
; X32: # BB#0:
-; X32-NEXT: movw {{[0-9]+}}(%esp), %ax
+; X32-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X32-NEXT: kmovw %eax, %k1
; X32-NEXT: vpbroadcastd %xmm0, %zmm0 {%k1} {z}
; X32-NEXT: retl
define <16 x float> @test_mm512_mask_broadcastss_ps(<16 x float> %a0, i16 %a1, <4 x float> %a2) {
; X32-LABEL: test_mm512_mask_broadcastss_ps:
; X32: # BB#0:
-; X32-NEXT: movw {{[0-9]+}}(%esp), %ax
+; X32-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X32-NEXT: kmovw %eax, %k1
; X32-NEXT: vbroadcastss %xmm1, %zmm0 {%k1}
; X32-NEXT: retl
define <16 x float> @test_mm512_maskz_broadcastss_ps(i16 %a0, <4 x float> %a1) {
; X32-LABEL: test_mm512_maskz_broadcastss_ps:
; X32: # BB#0:
-; X32-NEXT: movw {{[0-9]+}}(%esp), %ax
+; X32-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X32-NEXT: kmovw %eax, %k1
; X32-NEXT: vbroadcastss %xmm0, %zmm0 {%k1} {z}
; X32-NEXT: retl
define <16 x float> @test_mm512_mask_movehdup_ps(<16 x float> %a0, i16 %a1, <16 x float> %a2) {
; X32-LABEL: test_mm512_mask_movehdup_ps:
; X32: # BB#0:
-; X32-NEXT: movw {{[0-9]+}}(%esp), %ax
+; X32-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X32-NEXT: kmovw %eax, %k1
; X32-NEXT: vmovshdup {{.*#+}} zmm0 {%k1} = zmm1[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15]
; X32-NEXT: retl
define <16 x float> @test_mm512_maskz_movehdup_ps(i16 %a0, <16 x float> %a1) {
; X32-LABEL: test_mm512_maskz_movehdup_ps:
; X32: # BB#0:
-; X32-NEXT: movw {{[0-9]+}}(%esp), %ax
+; X32-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X32-NEXT: kmovw %eax, %k1
; X32-NEXT: vmovshdup {{.*#+}} zmm0 {%k1} {z} = zmm0[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15]
; X32-NEXT: retl
define <16 x float> @test_mm512_mask_moveldup_ps(<16 x float> %a0, i16 %a1, <16 x float> %a2) {
; X32-LABEL: test_mm512_mask_moveldup_ps:
; X32: # BB#0:
-; X32-NEXT: movw {{[0-9]+}}(%esp), %ax
+; X32-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X32-NEXT: kmovw %eax, %k1
; X32-NEXT: vmovsldup {{.*#+}} zmm0 {%k1} = zmm1[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14]
; X32-NEXT: retl
define <16 x float> @test_mm512_maskz_moveldup_ps(i16 %a0, <16 x float> %a1) {
; X32-LABEL: test_mm512_maskz_moveldup_ps:
; X32: # BB#0:
-; X32-NEXT: movw {{[0-9]+}}(%esp), %ax
+; X32-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X32-NEXT: kmovw %eax, %k1
; X32-NEXT: vmovsldup {{.*#+}} zmm0 {%k1} {z} = zmm0[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14]
; X32-NEXT: retl
define <16 x float> @test_mm512_mask_permute_ps(<16 x float> %a0, i16 %a1, <16 x float> %a2) {
; X32-LABEL: test_mm512_mask_permute_ps:
; X32: # BB#0:
-; X32-NEXT: movw {{[0-9]+}}(%esp), %ax
+; X32-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X32-NEXT: kmovw %eax, %k1
; X32-NEXT: vpermilps {{.*#+}} zmm0 {%k1} = zmm1[2,0,0,0,6,4,4,4,10,8,8,8,14,12,12,12]
; X32-NEXT: retl
define <16 x float> @test_mm512_maskz_permute_ps(i16 %a0, <16 x float> %a1) {
; X32-LABEL: test_mm512_maskz_permute_ps:
; X32: # BB#0:
-; X32-NEXT: movw {{[0-9]+}}(%esp), %ax
+; X32-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X32-NEXT: kmovw %eax, %k1
; X32-NEXT: vpermilps {{.*#+}} zmm0 {%k1} {z} = zmm0[2,0,0,0,6,4,4,4,10,8,8,8,14,12,12,12]
; X32-NEXT: retl
define <8 x i64> @test_mm512_mask_shuffle_epi32(<8 x i64> %a0, i16 %a1, <8 x i64> %a2) {
; X32-LABEL: test_mm512_mask_shuffle_epi32:
; X32: # BB#0:
-; X32-NEXT: movw {{[0-9]+}}(%esp), %ax
+; X32-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X32-NEXT: kmovw %eax, %k1
; X32-NEXT: vpshufd {{.*#+}} zmm0 {%k1} = zmm1[1,0,0,0,5,4,4,4,9,8,8,8,13,12,12,12]
; X32-NEXT: retl
define <8 x i64> @test_mm512_maskz_shuffle_epi32(i16 %a0, <8 x i64> %a1) {
; X32-LABEL: test_mm512_maskz_shuffle_epi32:
; X32: # BB#0:
-; X32-NEXT: movw {{[0-9]+}}(%esp), %ax
+; X32-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X32-NEXT: kmovw %eax, %k1
; X32-NEXT: vpshufd {{.*#+}} zmm0 {%k1} {z} = zmm0[1,0,0,0,5,4,4,4,9,8,8,8,13,12,12,12]
; X32-NEXT: retl
define <8 x i64> @test_mm512_mask_unpackhi_epi32(<8 x i64> %a0, i16 %a1, <8 x i64> %a2, <8 x i64> %a3) {
; X32-LABEL: test_mm512_mask_unpackhi_epi32:
; X32: # BB#0:
-; X32-NEXT: movw {{[0-9]+}}(%esp), %ax
+; X32-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X32-NEXT: kmovw %eax, %k1
; X32-NEXT: vpunpckhdq {{.*#+}} zmm0 {%k1} = zmm1[2],zmm2[2],zmm1[3],zmm2[3],zmm1[6],zmm2[6],zmm1[7],zmm2[7],zmm1[10],zmm2[10],zmm1[11],zmm2[11],zmm1[14],zmm2[14],zmm1[15],zmm2[15]
; X32-NEXT: retl
define <8 x i64> @test_mm512_maskz_unpackhi_epi32(i16 %a0, <8 x i64> %a1, <8 x i64> %a2) {
; X32-LABEL: test_mm512_maskz_unpackhi_epi32:
; X32: # BB#0:
-; X32-NEXT: movw {{[0-9]+}}(%esp), %ax
+; X32-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X32-NEXT: kmovw %eax, %k1
; X32-NEXT: vpunpckhdq {{.*#+}} zmm0 {%k1} {z} = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15]
; X32-NEXT: retl
define <16 x float> @test_mm512_mask_unpackhi_ps(<16 x float> %a0, i16 %a1, <16 x float> %a2, <16 x float> %a3) {
; X32-LABEL: test_mm512_mask_unpackhi_ps:
; X32: # BB#0:
-; X32-NEXT: movw {{[0-9]+}}(%esp), %ax
+; X32-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X32-NEXT: kmovw %eax, %k1
; X32-NEXT: vunpckhps {{.*#+}} zmm0 {%k1} = zmm1[2],zmm2[2],zmm1[3],zmm2[3],zmm1[6],zmm2[6],zmm1[7],zmm2[7],zmm1[10],zmm2[10],zmm1[11],zmm2[11],zmm1[14],zmm2[14],zmm1[15],zmm2[15]
; X32-NEXT: retl
define <16 x float> @test_mm512_maskz_unpackhi_ps(i16 %a0, <16 x float> %a1, <16 x float> %a2) {
; X32-LABEL: test_mm512_maskz_unpackhi_ps:
; X32: # BB#0:
-; X32-NEXT: movw {{[0-9]+}}(%esp), %ax
+; X32-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X32-NEXT: kmovw %eax, %k1
; X32-NEXT: vunpckhps {{.*#+}} zmm0 {%k1} {z} = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15]
; X32-NEXT: retl
define <8 x i64> @test_mm512_mask_unpacklo_epi32(<8 x i64> %a0, i16 %a1, <8 x i64> %a2, <8 x i64> %a3) {
; X32-LABEL: test_mm512_mask_unpacklo_epi32:
; X32: # BB#0:
-; X32-NEXT: movw {{[0-9]+}}(%esp), %ax
+; X32-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X32-NEXT: kmovw %eax, %k1
; X32-NEXT: vpunpckldq {{.*#+}} zmm0 {%k1} = zmm1[0],zmm2[0],zmm1[1],zmm2[1],zmm1[4],zmm2[4],zmm1[5],zmm2[5],zmm1[8],zmm2[8],zmm1[9],zmm2[9],zmm1[12],zmm2[12],zmm1[13],zmm2[13]
; X32-NEXT: retl
define <8 x i64> @test_mm512_maskz_unpacklo_epi32(i16 %a0, <8 x i64> %a1, <8 x i64> %a2) {
; X32-LABEL: test_mm512_maskz_unpacklo_epi32:
; X32: # BB#0:
-; X32-NEXT: movw {{[0-9]+}}(%esp), %ax
+; X32-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X32-NEXT: kmovw %eax, %k1
; X32-NEXT: vpunpckldq {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13]
; X32-NEXT: retl
define <16 x float> @test_mm512_mask_unpacklo_ps(<16 x float> %a0, i16 %a1, <16 x float> %a2, <16 x float> %a3) {
; X32-LABEL: test_mm512_mask_unpacklo_ps:
; X32: # BB#0:
-; X32-NEXT: movw {{[0-9]+}}(%esp), %ax
+; X32-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X32-NEXT: kmovw %eax, %k1
; X32-NEXT: vunpcklps {{.*#+}} zmm0 {%k1} = zmm1[0],zmm2[0],zmm1[1],zmm2[1],zmm1[4],zmm2[4],zmm1[5],zmm2[5],zmm1[8],zmm2[8],zmm1[9],zmm2[9],zmm1[12],zmm2[12],zmm1[13],zmm2[13]
; X32-NEXT: retl
define <16 x float> @test_mm512_maskz_unpacklo_ps(i16 %a0, <16 x float> %a1, <16 x float> %a2) {
; X32-LABEL: test_mm512_maskz_unpacklo_ps:
; X32: # BB#0:
-; X32-NEXT: movw {{[0-9]+}}(%esp), %ax
+; X32-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X32-NEXT: kmovw %eax, %k1
; X32-NEXT: vunpcklps {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13]
; X32-NEXT: retl
; AVX512F-32-NEXT: .Lcfi9:
; AVX512F-32-NEXT: .cfi_offset %ebx, -8
; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; AVX512F-32-NEXT: movb %cl, %al
+; AVX512F-32-NEXT: movl %ecx, %eax
; AVX512F-32-NEXT: shrb $5, %al
; AVX512F-32-NEXT: andb $1, %al
-; AVX512F-32-NEXT: movb %cl, %bl
+; AVX512F-32-NEXT: movl %ecx, %ebx
; AVX512F-32-NEXT: andb $15, %bl
-; AVX512F-32-NEXT: movb %cl, %dl
+; AVX512F-32-NEXT: movl %ecx, %edx
; AVX512F-32-NEXT: andb $2, %dl
; AVX512F-32-NEXT: shrb %dl
; AVX512F-32-NEXT: kmovd %edx, %k0
-; AVX512F-32-NEXT: movb %bl, %dl
+; AVX512F-32-NEXT: movl %ebx, %edx
; AVX512F-32-NEXT: shrb $2, %bl
; AVX512F-32-NEXT: kmovd %ebx, %k1
-; AVX512F-32-NEXT: movb %cl, %bl
+; AVX512F-32-NEXT: movl %ecx, %ebx
; AVX512F-32-NEXT: shrb $4, %bl
; AVX512F-32-NEXT: shrb $3, %dl
; AVX512F-32-NEXT: vpmovm2b %k0, %zmm2
; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm2, %ymm3, %ymm3
; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm3[0,1,2,3],zmm2[4,5,6,7]
-; AVX512F-32-NEXT: movb %cl, %al
+; AVX512F-32-NEXT: movl %ecx, %eax
; AVX512F-32-NEXT: shrb $6, %al
; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
; AVX512F-32-NEXT: kmovd %eax, %k1
; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm3, %ymm2, %ymm2
; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7]
; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
-; AVX512F-32-NEXT: movb %cl, %al
+; AVX512F-32-NEXT: movl %ecx, %eax
; AVX512F-32-NEXT: shrb $7, %al
; AVX512F-32-NEXT: kmovd %eax, %k1
; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2
; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
; AVX512F-32-NEXT: movb %ch, %dl
; AVX512F-32-NEXT: andb $15, %dl
-; AVX512F-32-NEXT: movb %dl, %al
+; AVX512F-32-NEXT: movl %edx, %eax
; AVX512F-32-NEXT: shrb $2, %dl
; AVX512F-32-NEXT: kmovd %edx, %k1
; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2
; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm3, %ymm2, %ymm2
; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7]
; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
-; AVX512F-32-NEXT: movb %al, %dl
+; AVX512F-32-NEXT: movl %eax, %edx
; AVX512F-32-NEXT: andb $2, %dl
; AVX512F-32-NEXT: shrb %dl
; AVX512F-32-NEXT: kmovd %edx, %k1
; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm3, %ymm2, %ymm2
; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7]
; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
-; AVX512F-32-NEXT: movb %al, %bl
+; AVX512F-32-NEXT: movl %eax, %ebx
; AVX512F-32-NEXT: andb $15, %bl
-; AVX512F-32-NEXT: movb %bl, %dl
+; AVX512F-32-NEXT: movl %ebx, %edx
; AVX512F-32-NEXT: shrb $2, %bl
; AVX512F-32-NEXT: kmovd %ebx, %k1
; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2
; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm3, %ymm2, %ymm2
; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7]
; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
-; AVX512F-32-NEXT: movb %al, %dl
+; AVX512F-32-NEXT: movl %eax, %edx
; AVX512F-32-NEXT: shrb $4, %dl
; AVX512F-32-NEXT: kmovd %edx, %k1
; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2
; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm3, %ymm2, %ymm2
; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7]
; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
-; AVX512F-32-NEXT: movb %al, %dl
+; AVX512F-32-NEXT: movl %eax, %edx
; AVX512F-32-NEXT: shrb $5, %dl
; AVX512F-32-NEXT: andb $1, %dl
; AVX512F-32-NEXT: kmovd %edx, %k1
; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm3, %ymm2, %ymm2
; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7]
; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
-; AVX512F-32-NEXT: movb %al, %dl
+; AVX512F-32-NEXT: movl %eax, %edx
; AVX512F-32-NEXT: shrb $6, %dl
; AVX512F-32-NEXT: kmovd %edx, %k1
; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2
; AVX512F-32-NEXT: vpblendvb %ymm2, %ymm4, %ymm3, %ymm3
; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm3 = zmm3[0,1,2,3],zmm4[4,5,6,7]
; AVX512F-32-NEXT: vpmovb2m %zmm3, %k0
-; AVX512F-32-NEXT: movb %al, %dl
+; AVX512F-32-NEXT: movl %eax, %edx
; AVX512F-32-NEXT: andb $2, %dl
; AVX512F-32-NEXT: shrb %dl
; AVX512F-32-NEXT: kmovd %edx, %k1
; AVX512F-32-NEXT: vpblendvb %ymm2, %ymm5, %ymm4, %ymm4
; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm4[0,1,2,3],zmm5[4,5,6,7]
; AVX512F-32-NEXT: vpmovb2m %zmm4, %k0
-; AVX512F-32-NEXT: movb %al, %dl
+; AVX512F-32-NEXT: movl %eax, %edx
; AVX512F-32-NEXT: andb $15, %dl
-; AVX512F-32-NEXT: movb %dl, %al
+; AVX512F-32-NEXT: movl %edx, %eax
; AVX512F-32-NEXT: shrb $2, %dl
; AVX512F-32-NEXT: kmovd %edx, %k1
; AVX512F-32-NEXT: vpmovm2b %k1, %zmm4
; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm3, %ymm4, %ymm3
; AVX512F-32-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2
; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
-; AVX512F-32-NEXT: movb %cl, %al
+; AVX512F-32-NEXT: movl %ecx, %eax
; AVX512F-32-NEXT: andb $2, %al
; AVX512F-32-NEXT: shrb %al
; AVX512F-32-NEXT: kmovd %eax, %k1
; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2
; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2
; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
-; AVX512F-32-NEXT: movb %cl, %dl
+; AVX512F-32-NEXT: movl %ecx, %edx
; AVX512F-32-NEXT: andb $15, %dl
-; AVX512F-32-NEXT: movb %dl, %al
+; AVX512F-32-NEXT: movl %edx, %eax
; AVX512F-32-NEXT: shrb $2, %dl
; AVX512F-32-NEXT: kmovd %edx, %k1
; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2
; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2
; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2
; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
-; AVX512F-32-NEXT: movb %cl, %al
+; AVX512F-32-NEXT: movl %ecx, %eax
; AVX512F-32-NEXT: shrb $4, %al
; AVX512F-32-NEXT: kmovd %eax, %k1
; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2
; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2
; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2
; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
-; AVX512F-32-NEXT: movb %cl, %al
+; AVX512F-32-NEXT: movl %ecx, %eax
; AVX512F-32-NEXT: shrb $5, %al
; AVX512F-32-NEXT: andb $1, %al
; AVX512F-32-NEXT: kmovd %eax, %k1
; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2
; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2
; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
-; AVX512F-32-NEXT: movb %cl, %al
+; AVX512F-32-NEXT: movl %ecx, %eax
; AVX512F-32-NEXT: shrb $6, %al
; AVX512F-32-NEXT: kmovd %eax, %k1
; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2
; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2
; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2
; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
-; AVX512F-32-NEXT: movb %cl, %al
+; AVX512F-32-NEXT: movl %ecx, %eax
; AVX512F-32-NEXT: shrb $7, %al
; AVX512F-32-NEXT: kmovd %eax, %k1
; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2
; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
; AVX512F-32-NEXT: movb %ch, %dl
; AVX512F-32-NEXT: andb $15, %dl
-; AVX512F-32-NEXT: movb %dl, %al
+; AVX512F-32-NEXT: movl %edx, %eax
; AVX512F-32-NEXT: shrb $2, %dl
; AVX512F-32-NEXT: kmovd %edx, %k1
; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2
; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2
; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2
; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
-; AVX512F-32-NEXT: movb %bl, %dl
+; AVX512F-32-NEXT: movl %ebx, %edx
; AVX512F-32-NEXT: andb $2, %dl
; AVX512F-32-NEXT: shrb %dl
; AVX512F-32-NEXT: kmovd %edx, %k1
; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2
; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2
-; AVX512F-32-NEXT: movb %bl, %al
+; AVX512F-32-NEXT: movl %ebx, %eax
; AVX512F-32-NEXT: andb $15, %al
-; AVX512F-32-NEXT: movb %al, %dl
+; AVX512F-32-NEXT: movl %eax, %edx
; AVX512F-32-NEXT: shrb $2, %al
; AVX512F-32-NEXT: kmovd %eax, %k0
; AVX512F-32-NEXT: vpmovb2m %zmm2, %k1
; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2
; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2
; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
-; AVX512F-32-NEXT: movb %bl, %al
+; AVX512F-32-NEXT: movl %ebx, %eax
; AVX512F-32-NEXT: shrb $4, %al
; AVX512F-32-NEXT: kmovd %eax, %k1
; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2
; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2
; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2
; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
-; AVX512F-32-NEXT: movb %bl, %al
+; AVX512F-32-NEXT: movl %ebx, %eax
; AVX512F-32-NEXT: shrb $5, %al
; AVX512F-32-NEXT: andb $1, %al
; AVX512F-32-NEXT: kmovd %eax, %k1
; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2
; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2
; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
-; AVX512F-32-NEXT: movb %bl, %al
+; AVX512F-32-NEXT: movl %ebx, %eax
; AVX512F-32-NEXT: shrb $6, %al
; AVX512F-32-NEXT: kmovd %eax, %k1
; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2
; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2
; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2
; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
-; AVX512F-32-NEXT: movb %al, %dl
+; AVX512F-32-NEXT: movl %eax, %edx
; AVX512F-32-NEXT: andb $2, %dl
; AVX512F-32-NEXT: shrb %dl
; AVX512F-32-NEXT: kmovd %edx, %k1
; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255]
; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2
; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2
-; AVX512F-32-NEXT: movb %al, %dl
+; AVX512F-32-NEXT: movl %eax, %edx
; AVX512F-32-NEXT: andb $15, %dl
-; AVX512F-32-NEXT: movb %dl, %al
+; AVX512F-32-NEXT: movl %edx, %eax
; AVX512F-32-NEXT: shrb $2, %dl
; AVX512F-32-NEXT: kmovd %edx, %k0
; AVX512F-32-NEXT: vpmovb2m %zmm2, %k1
; AVX512F-32-NEXT: .Lcfi15:
; AVX512F-32-NEXT: .cfi_offset %ebx, -8
; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; AVX512F-32-NEXT: movb %cl, %al
+; AVX512F-32-NEXT: movl %ecx, %eax
; AVX512F-32-NEXT: shrb $5, %al
; AVX512F-32-NEXT: andb $1, %al
-; AVX512F-32-NEXT: movb %cl, %bl
+; AVX512F-32-NEXT: movl %ecx, %ebx
; AVX512F-32-NEXT: andb $15, %bl
-; AVX512F-32-NEXT: movb %cl, %dl
+; AVX512F-32-NEXT: movl %ecx, %edx
; AVX512F-32-NEXT: andb $2, %dl
; AVX512F-32-NEXT: shrb %dl
; AVX512F-32-NEXT: kmovd %edx, %k0
-; AVX512F-32-NEXT: movb %bl, %dl
+; AVX512F-32-NEXT: movl %ebx, %edx
; AVX512F-32-NEXT: shrb $2, %bl
; AVX512F-32-NEXT: kmovd %ebx, %k1
-; AVX512F-32-NEXT: movb %cl, %bl
+; AVX512F-32-NEXT: movl %ecx, %ebx
; AVX512F-32-NEXT: shrb $4, %bl
; AVX512F-32-NEXT: shrb $3, %dl
; AVX512F-32-NEXT: vpmovm2b %k0, %zmm2
; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm2, %ymm3, %ymm3
; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm3[0,1,2,3],zmm2[4,5,6,7]
-; AVX512F-32-NEXT: movb %cl, %al
+; AVX512F-32-NEXT: movl %ecx, %eax
; AVX512F-32-NEXT: shrb $6, %al
; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
; AVX512F-32-NEXT: kmovd %eax, %k1
; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm3, %ymm2, %ymm2
; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7]
; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
-; AVX512F-32-NEXT: movb %cl, %al
+; AVX512F-32-NEXT: movl %ecx, %eax
; AVX512F-32-NEXT: shrb $7, %al
; AVX512F-32-NEXT: kmovd %eax, %k1
; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2
; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
; AVX512F-32-NEXT: movb %ch, %dl
; AVX512F-32-NEXT: andb $15, %dl
-; AVX512F-32-NEXT: movb %dl, %al
+; AVX512F-32-NEXT: movl %edx, %eax
; AVX512F-32-NEXT: shrb $2, %dl
; AVX512F-32-NEXT: kmovd %edx, %k1
; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2
; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm3, %ymm2, %ymm2
; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7]
; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
-; AVX512F-32-NEXT: movb %al, %dl
+; AVX512F-32-NEXT: movl %eax, %edx
; AVX512F-32-NEXT: andb $2, %dl
; AVX512F-32-NEXT: shrb %dl
; AVX512F-32-NEXT: kmovd %edx, %k1
; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm3, %ymm2, %ymm2
; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7]
; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
-; AVX512F-32-NEXT: movb %al, %bl
+; AVX512F-32-NEXT: movl %eax, %ebx
; AVX512F-32-NEXT: andb $15, %bl
-; AVX512F-32-NEXT: movb %bl, %dl
+; AVX512F-32-NEXT: movl %ebx, %edx
; AVX512F-32-NEXT: shrb $2, %bl
; AVX512F-32-NEXT: kmovd %ebx, %k1
; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2
; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm3, %ymm2, %ymm2
; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7]
; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
-; AVX512F-32-NEXT: movb %al, %dl
+; AVX512F-32-NEXT: movl %eax, %edx
; AVX512F-32-NEXT: shrb $4, %dl
; AVX512F-32-NEXT: kmovd %edx, %k1
; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2
; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm3, %ymm2, %ymm2
; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7]
; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
-; AVX512F-32-NEXT: movb %al, %dl
+; AVX512F-32-NEXT: movl %eax, %edx
; AVX512F-32-NEXT: shrb $5, %dl
; AVX512F-32-NEXT: andb $1, %dl
; AVX512F-32-NEXT: kmovd %edx, %k1
; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm3, %ymm2, %ymm2
; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7]
; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
-; AVX512F-32-NEXT: movb %al, %dl
+; AVX512F-32-NEXT: movl %eax, %edx
; AVX512F-32-NEXT: shrb $6, %dl
; AVX512F-32-NEXT: kmovd %edx, %k1
; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2
; AVX512F-32-NEXT: vpblendvb %ymm2, %ymm4, %ymm3, %ymm3
; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm3 = zmm3[0,1,2,3],zmm4[4,5,6,7]
; AVX512F-32-NEXT: vpmovb2m %zmm3, %k0
-; AVX512F-32-NEXT: movb %al, %dl
+; AVX512F-32-NEXT: movl %eax, %edx
; AVX512F-32-NEXT: andb $2, %dl
; AVX512F-32-NEXT: shrb %dl
; AVX512F-32-NEXT: kmovd %edx, %k1
; AVX512F-32-NEXT: vpblendvb %ymm2, %ymm5, %ymm4, %ymm4
; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm4[0,1,2,3],zmm5[4,5,6,7]
; AVX512F-32-NEXT: vpmovb2m %zmm4, %k0
-; AVX512F-32-NEXT: movb %al, %dl
+; AVX512F-32-NEXT: movl %eax, %edx
; AVX512F-32-NEXT: andb $15, %dl
-; AVX512F-32-NEXT: movb %dl, %al
+; AVX512F-32-NEXT: movl %edx, %eax
; AVX512F-32-NEXT: shrb $2, %dl
; AVX512F-32-NEXT: kmovd %edx, %k1
; AVX512F-32-NEXT: vpmovm2b %k1, %zmm4
; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm3, %ymm4, %ymm3
; AVX512F-32-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2
; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
-; AVX512F-32-NEXT: movb %cl, %al
+; AVX512F-32-NEXT: movl %ecx, %eax
; AVX512F-32-NEXT: andb $2, %al
; AVX512F-32-NEXT: shrb %al
; AVX512F-32-NEXT: kmovd %eax, %k1
; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2
; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2
; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
-; AVX512F-32-NEXT: movb %cl, %dl
+; AVX512F-32-NEXT: movl %ecx, %edx
; AVX512F-32-NEXT: andb $15, %dl
-; AVX512F-32-NEXT: movb %dl, %al
+; AVX512F-32-NEXT: movl %edx, %eax
; AVX512F-32-NEXT: shrb $2, %dl
; AVX512F-32-NEXT: kmovd %edx, %k1
; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2
; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2
; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2
; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
-; AVX512F-32-NEXT: movb %cl, %al
+; AVX512F-32-NEXT: movl %ecx, %eax
; AVX512F-32-NEXT: shrb $4, %al
; AVX512F-32-NEXT: kmovd %eax, %k1
; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2
; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2
; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2
; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
-; AVX512F-32-NEXT: movb %cl, %al
+; AVX512F-32-NEXT: movl %ecx, %eax
; AVX512F-32-NEXT: shrb $5, %al
; AVX512F-32-NEXT: andb $1, %al
; AVX512F-32-NEXT: kmovd %eax, %k1
; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2
; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2
; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
-; AVX512F-32-NEXT: movb %cl, %al
+; AVX512F-32-NEXT: movl %ecx, %eax
; AVX512F-32-NEXT: shrb $6, %al
; AVX512F-32-NEXT: kmovd %eax, %k1
; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2
; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2
; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2
; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
-; AVX512F-32-NEXT: movb %cl, %al
+; AVX512F-32-NEXT: movl %ecx, %eax
; AVX512F-32-NEXT: shrb $7, %al
; AVX512F-32-NEXT: kmovd %eax, %k1
; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2
; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
; AVX512F-32-NEXT: movb %ch, %dl
; AVX512F-32-NEXT: andb $15, %dl
-; AVX512F-32-NEXT: movb %dl, %al
+; AVX512F-32-NEXT: movl %edx, %eax
; AVX512F-32-NEXT: shrb $2, %dl
; AVX512F-32-NEXT: kmovd %edx, %k1
; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2
; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2
; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2
; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
-; AVX512F-32-NEXT: movb %bl, %dl
+; AVX512F-32-NEXT: movl %ebx, %edx
; AVX512F-32-NEXT: andb $2, %dl
; AVX512F-32-NEXT: shrb %dl
; AVX512F-32-NEXT: kmovd %edx, %k1
; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2
; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2
-; AVX512F-32-NEXT: movb %bl, %al
+; AVX512F-32-NEXT: movl %ebx, %eax
; AVX512F-32-NEXT: andb $15, %al
-; AVX512F-32-NEXT: movb %al, %dl
+; AVX512F-32-NEXT: movl %eax, %edx
; AVX512F-32-NEXT: shrb $2, %al
; AVX512F-32-NEXT: kmovd %eax, %k0
; AVX512F-32-NEXT: vpmovb2m %zmm2, %k1
; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2
; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2
; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
-; AVX512F-32-NEXT: movb %bl, %al
+; AVX512F-32-NEXT: movl %ebx, %eax
; AVX512F-32-NEXT: shrb $4, %al
; AVX512F-32-NEXT: kmovd %eax, %k1
; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2
; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2
; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2
; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
-; AVX512F-32-NEXT: movb %bl, %al
+; AVX512F-32-NEXT: movl %ebx, %eax
; AVX512F-32-NEXT: shrb $5, %al
; AVX512F-32-NEXT: andb $1, %al
; AVX512F-32-NEXT: kmovd %eax, %k1
; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2
; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2
; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
-; AVX512F-32-NEXT: movb %bl, %al
+; AVX512F-32-NEXT: movl %ebx, %eax
; AVX512F-32-NEXT: shrb $6, %al
; AVX512F-32-NEXT: kmovd %eax, %k1
; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2
; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2
; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2
; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
-; AVX512F-32-NEXT: movb %al, %dl
+; AVX512F-32-NEXT: movl %eax, %edx
; AVX512F-32-NEXT: andb $2, %dl
; AVX512F-32-NEXT: shrb %dl
; AVX512F-32-NEXT: kmovd %edx, %k1
; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255]
; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2
; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2
-; AVX512F-32-NEXT: movb %al, %dl
+; AVX512F-32-NEXT: movl %eax, %edx
; AVX512F-32-NEXT: andb $15, %dl
-; AVX512F-32-NEXT: movb %dl, %al
+; AVX512F-32-NEXT: movl %edx, %eax
; AVX512F-32-NEXT: shrb $2, %dl
; AVX512F-32-NEXT: kmovd %edx, %k0
; AVX512F-32-NEXT: vpmovb2m %zmm2, %k1
; AVX512F-32-LABEL: test_int_x86_avx512_mask_pbroadcast_w_gpr_512:
; AVX512F-32: # BB#0:
; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1
-; AVX512F-32-NEXT: movw {{[0-9]+}}(%esp), %ax
+; AVX512F-32-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; AVX512F-32-NEXT: vpbroadcastw %eax, %zmm1 {%k1} {z}
; AVX512F-32-NEXT: vpbroadcastw %eax, %zmm0 {%k1}
; AVX512F-32-NEXT: vpbroadcastw %eax, %zmm2
define <2 x i64> @test_mm_mask_broadcastb_epi8(<2 x i64> %a0, i16 %a1, <2 x i64> %a2) {
; X32-LABEL: test_mm_mask_broadcastb_epi8:
; X32: # BB#0:
-; X32-NEXT: movw {{[0-9]+}}(%esp), %ax
+; X32-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X32-NEXT: kmovd %eax, %k1
; X32-NEXT: vpbroadcastb %xmm1, %xmm0 {%k1}
; X32-NEXT: retl
define <2 x i64> @test_mm_maskz_broadcastb_epi8(i16 %a0, <2 x i64> %a1) {
; X32-LABEL: test_mm_maskz_broadcastb_epi8:
; X32: # BB#0:
-; X32-NEXT: movw {{[0-9]+}}(%esp), %ax
+; X32-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X32-NEXT: kmovd %eax, %k1
; X32-NEXT: vpbroadcastb %xmm0, %xmm0 {%k1} {z}
; X32-NEXT: retl
define <4 x i64> @test_mm256_mask_broadcastw_epi16(<4 x i64> %a0, i16 %a1, <2 x i64> %a2) {
; X32-LABEL: test_mm256_mask_broadcastw_epi16:
; X32: # BB#0:
-; X32-NEXT: movw {{[0-9]+}}(%esp), %ax
+; X32-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X32-NEXT: kmovd %eax, %k1
; X32-NEXT: vpbroadcastw %xmm1, %ymm0 {%k1}
; X32-NEXT: retl
define <4 x i64> @test_mm256_maskz_broadcastw_epi16(i16 %a0, <2 x i64> %a1) {
; X32-LABEL: test_mm256_maskz_broadcastw_epi16:
; X32: # BB#0:
-; X32-NEXT: movw {{[0-9]+}}(%esp), %ax
+; X32-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X32-NEXT: kmovd %eax, %k1
; X32-NEXT: vpbroadcastw %xmm0, %ymm0 {%k1} {z}
; X32-NEXT: retl
define <2 x i16> @test_bitreverse_v2i16(<2 x i16> %a) nounwind {
; X86-LABEL: test_bitreverse_v2i16:
; X86: # BB#0:
-; X86-NEXT: movw {{[0-9]+}}(%esp), %cx
-; X86-NEXT: movw {{[0-9]+}}(%esp), %ax
+; X86-NEXT: movzwl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT: rolw $8, %ax
; X86-NEXT: movl %eax, %edx
; X86-NEXT: andl $3855, %edx # imm = 0xF0F
define i16 @test_bitreverse_i16(i16 %a) nounwind {
; X86-LABEL: test_bitreverse_i16:
; X86: # BB#0:
-; X86-NEXT: movw {{[0-9]+}}(%esp), %ax
+; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT: rolw $8, %ax
; X86-NEXT: movl %eax, %ecx
; X86-NEXT: andl $3855, %ecx # imm = 0xF0F
; CHECK-NEXT: pushq %rbx
; CHECK-NEXT: testb %dl, %dl
; CHECK-NEXT: setne %bl
-; CHECK-NEXT: movb %al, %cl
+; CHECK-NEXT: movl %eax, %ecx
; CHECK-NEXT: je .LBB3_4
; CHECK-NEXT: # BB#3: # %func_4.exit.i
; CHECK-NEXT: xorl %ecx, %ecx
; CHECK-NEXT: # BB#6: # %bb.i.i
; CHECK-NEXT: movb {{.*}}(%rip), %cl
; CHECK-NEXT: xorl %ebx, %ebx
-; CHECK-NEXT: movb %al, %cl
+; CHECK-NEXT: movl %eax, %ecx
; CHECK-NEXT: .LBB3_7: # %func_1.exit
; CHECK-NEXT: movb %cl, {{.*}}(%rip)
; CHECK-NEXT: movzbl %cl, %esi
; CHECK-NEXT: testb $1, %dil
; CHECK-NEXT: jne .LBB6_2
; CHECK-NEXT: # BB#1:
-; CHECK-NEXT: movb %dl, %sil
+; CHECK-NEXT: movl %edx, %esi
; CHECK-NEXT: .LBB6_2:
; CHECK-NEXT: movl %esi, %eax
; CHECK-NEXT: retq
--- /dev/null
+# RUN: llc -mtriple=x86_64-unknown-linux-gnu -run-pass x86-fixup-bw-insts %s -o - | FileCheck %s
+
+--- |
+ define void @test1() { ret void }
+
+ define void @test2() { ret void }
+
+ define i16 @test3(i16* readonly %p) {
+ ; Keep the original IR to show how a situation like this might arise
+ ; due to preceding CodeGen passes.
+ ;
+ ; %0 is used in the %if.end BB (before tail-duplication), so its
+ ; corresponding super-register (EAX) is live-in to that BB (%if.end),
+ ; and the load also carries an EAX<imp-def> flag. Make sure that we
+ ; still change the movw into movzwl, because EAX is not live before
+ ; the load (which can be seen from the missing EAX<imp-use> flag).
+ entry:
+ %tobool = icmp eq i16* %p, null
+ br i1 %tobool, label %if.end, label %if.then
+
+ if.then: ; preds = %entry
+ %0 = load i16, i16* %p, align 2
+ br label %if.end
+
+ if.end: ; preds = %if.then, %entry
+ %i.0 = phi i16 [ %0, %if.then ], [ 0, %entry ]
+ ret i16 %i.0
+ }
+
+...
+---
+# CHECK-LABEL: name: test1
+name: test1
+alignment: 4
+exposesReturnsTwice: false
+legalized: false
+regBankSelected: false
+selected: false
+tracksRegLiveness: true
+registers:
+liveins:
+ - { reg: '%rax' }
+frameInfo:
+ stackSize: 0
+fixedStack:
+stack:
+constants:
+# Verify that "movw (%rax), %ax" is changed to "movzwl (%rax), %eax".
+#
+# For that to happen, the liveness information after the MOV16rm
+# instruction should be used, not the information before it, because
+# %rax is live before the MOV and is killed by it.
+body: |
+ bb.0:
+ liveins: %rax
+
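+ ; %rax is live-in here but is killed by the load itself, so nothing of
+ ; %eax other than %ax is live after the instruction.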
+ %ax = MOV16rm killed %rax, 1, _, 0, _
+ ; CHECK: %eax = MOVZX32rm16 killed %rax
+
+ RETQ %ax
+
+...
+---
+# CHECK-LABEL: name: test2
+name: test2
+alignment: 4
+exposesReturnsTwice: false
+legalized: false
+regBankSelected: false
+selected: false
+tracksRegLiveness: true
+registers:
+liveins:
+ - { reg: '%rax' }
+frameInfo:
+ stackSize: 0
+fixedStack:
+stack:
+constants:
+# An imp-use of any super-register means the register is live before the MOV.
+body: |
+ bb.0:
+ liveins: %dl, %rbx, %rcx, %r14
+
+ %cl = MOV8rr killed %dl, implicit killed %rcx, implicit-def %rcx
+ ; CHECK: %cl = MOV8rr killed %dl, implicit killed %rcx, implicit-def %rcx
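+ ; %rcx is live before the MOV (it is implicitly used), so widening the
+ ; MOV8rr to a zero-extending 32-bit move would clobber live upper bits.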
+ JMP_1 %bb.1
+ bb.1:
+ liveins: %rcx
+
+ RETQ %cl
+
+...
+---
+# CHECK-LABEL: name: test3
+name: test3
+alignment: 4
+exposesReturnsTwice: false
+legalized: false
+regBankSelected: false
+selected: false
+tracksRegLiveness: true
+registers:
+liveins:
+ - { reg: '%rdi', virtual-reg: '' }
+frameInfo:
+ isFrameAddressTaken: false
+ isReturnAddressTaken: false
+ hasStackMap: false
+ hasPatchPoint: false
+ stackSize: 0
+ offsetAdjustment: 0
+ maxAlignment: 0
+ adjustsStack: false
+ hasCalls: false
+ stackProtector: ''
+ maxCallFrameSize: 0
+ hasOpaqueSPAdjustment: false
+ hasVAStart: false
+ hasMustTailInVarArgFunc: false
+ savePoint: ''
+ restorePoint: ''
+fixedStack:
+stack:
+constants:
+# After the MOV16rm, the whole of %eax is not *really* live, as can be seen
+# from the missing implicit-use of it on that MOV. Make sure that the MOV is
+# transformed into a MOVZX.
+# See the comment near the original IR for how preceding decisions can
+# lead to this situation.
+body: |
+ bb.0.entry:
+ successors: %bb.1(0x30000000), %bb.2.if.then(0x50000000)
+ liveins: %rdi
+
+ TEST64rr %rdi, %rdi, implicit-def %eflags
+ JE_1 %bb.1, implicit %eflags
+
+ bb.2.if.then:
+ liveins: %rdi
+
+ %ax = MOV16rm killed %rdi, 1, _, 0, _, implicit-def %eax :: (load 2 from %ir.p)
+ ; CHECK: %eax = MOVZX32rm16 killed %rdi, 1, _, 0, _, implicit-def %eax :: (load 2 from %ir.p)
+ %ax = KILL %ax, implicit killed %eax
+ RETQ %ax
+
+ bb.1:
+ %eax = XOR32rr undef %eax, undef %eax, implicit-def dead %eflags
+ %ax = KILL %ax, implicit killed %eax
+ RETQ %ax
+
+...
; X64-LABEL: foo:
; X64: # BB#0: # %bb
; X64-NEXT: movzwl {{.*}}(%rip), %ecx
-; X64-NEXT: movw {{.*}}(%rip), %ax
+; X64-NEXT: movzwl {{.*}}(%rip), %eax
; X64-NEXT: xorw %cx, %ax
; X64-NEXT: xorl %ecx, %eax
; X64-NEXT: movzwl %ax, %eax
; 686-NEXT: andl $-8, %esp
; 686-NEXT: subl $8, %esp
; 686-NEXT: movzwl var_27, %ecx
-; 686-NEXT: movw var_22, %ax
+; 686-NEXT: movzwl var_22, %eax
; 686-NEXT: xorw %cx, %ax
; 686-NEXT: xorl %ecx, %eax
; 686-NEXT: movzwl %ax, %eax
; CHECK-NEXT: shll $12, %ecx
; CHECK-NEXT: sarw $12, %cx
; CHECK-NEXT: movq _b@{{.*}}(%rip), %rdx
-; CHECK-NEXT: movw %cx, %si
+; CHECK-NEXT: movl %ecx, %esi
; CHECK-NEXT: orw (%rdx), %si
; CHECK-NEXT: andl %ecx, %esi
; CHECK-NEXT: movw %si, (%rdx)
; CHECK-LABEL: pr34127:
; CHECK: # BB#0: # %entry
; CHECK-NEXT: movzwl {{.*}}(%rip), %eax
-; CHECK-NEXT: movw {{.*}}(%rip), %cx
+; CHECK-NEXT: movzwl {{.*}}(%rip), %ecx
; CHECK-NEXT: andw %ax, %cx
; CHECK-NEXT: andl %eax, %ecx
; CHECK-NEXT: movl %ecx, -{{[0-9]+}}(%rsp)
; MCU-NEXT: testb $1, %al
; MCU-NEXT: jne .LBB4_2
; MCU-NEXT: # BB#1:
-; MCU-NEXT: movw {{[0-9]+}}(%esp), %cx
-; MCU-NEXT: movw {{[0-9]+}}(%esp), %dx
+; MCU-NEXT: movzwl {{[0-9]+}}(%esp), %ecx
+; MCU-NEXT: movzwl {{[0-9]+}}(%esp), %edx
; MCU-NEXT: .LBB4_2:
; MCU-NEXT: movw %cx, 2(%esi)
; MCU-NEXT: movw %dx, (%esi)
define <2 x i64> @test_mm_insert_epi16(<2 x i64> %a0, i16 %a1) nounwind {
; X32-LABEL: test_mm_insert_epi16:
; X32: # BB#0:
-; X32-NEXT: movw {{[0-9]+}}(%esp), %ax
+; X32-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X32-NEXT: pinsrw $1, %eax, %xmm0
; X32-NEXT: retl
;
define <2 x i64> @test_mm_set_epi16(i16 %a0, i16 %a1, i16 %a2, i16 %a3, i16 %a4, i16 %a5, i16 %a6, i16 %a7) nounwind {
; X32-LABEL: test_mm_set_epi16:
; X32: # BB#0:
-; X32-NEXT: movw {{[0-9]+}}(%esp), %ax
+; X32-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movd %eax, %xmm1
-; X32-NEXT: movw {{[0-9]+}}(%esp), %ax
+; X32-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movd %eax, %xmm2
-; X32-NEXT: movw {{[0-9]+}}(%esp), %ax
+; X32-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movd %eax, %xmm3
-; X32-NEXT: movw {{[0-9]+}}(%esp), %ax
+; X32-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movd %eax, %xmm4
-; X32-NEXT: movw {{[0-9]+}}(%esp), %ax
+; X32-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movd %eax, %xmm5
-; X32-NEXT: movw {{[0-9]+}}(%esp), %ax
+; X32-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movd %eax, %xmm6
-; X32-NEXT: movw {{[0-9]+}}(%esp), %ax
+; X32-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movd %eax, %xmm7
-; X32-NEXT: movw {{[0-9]+}}(%esp), %ax
+; X32-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movd %eax, %xmm0
; X32-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
; X32-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
;
; X64-LABEL: test_mm_set_epi16:
; X64: # BB#0:
-; X64-NEXT: movw {{[0-9]+}}(%rsp), %r10w
-; X64-NEXT: movw {{[0-9]+}}(%rsp), %ax
+; X64-NEXT: movzwl {{[0-9]+}}(%rsp), %r10d
+; X64-NEXT: movzwl {{[0-9]+}}(%rsp), %eax
; X64-NEXT: movd %edi, %xmm0
; X64-NEXT: movd %esi, %xmm1
; X64-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
define <2 x i64> @test_mm_set1_epi16(i16 %a0) nounwind {
; X32-LABEL: test_mm_set1_epi16:
; X32: # BB#0:
-; X32-NEXT: movw {{[0-9]+}}(%esp), %ax
+; X32-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movd %eax, %xmm0
; X32-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
; X32-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
define <2 x i64> @test_mm_setr_epi16(i16 %a0, i16 %a1, i16 %a2, i16 %a3, i16 %a4, i16 %a5, i16 %a6, i16 %a7) nounwind {
; X32-LABEL: test_mm_setr_epi16:
; X32: # BB#0:
-; X32-NEXT: movw {{[0-9]+}}(%esp), %ax
+; X32-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movd %eax, %xmm1
-; X32-NEXT: movw {{[0-9]+}}(%esp), %ax
+; X32-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movd %eax, %xmm2
-; X32-NEXT: movw {{[0-9]+}}(%esp), %ax
+; X32-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movd %eax, %xmm3
-; X32-NEXT: movw {{[0-9]+}}(%esp), %ax
+; X32-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movd %eax, %xmm4
-; X32-NEXT: movw {{[0-9]+}}(%esp), %ax
+; X32-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movd %eax, %xmm5
-; X32-NEXT: movw {{[0-9]+}}(%esp), %ax
+; X32-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movd %eax, %xmm6
-; X32-NEXT: movw {{[0-9]+}}(%esp), %ax
+; X32-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movd %eax, %xmm7
-; X32-NEXT: movw {{[0-9]+}}(%esp), %ax
+; X32-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movd %eax, %xmm0
; X32-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
; X32-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
;
; X64-LABEL: test_mm_setr_epi16:
; X64: # BB#0:
-; X64-NEXT: movw {{[0-9]+}}(%rsp), %ax
-; X64-NEXT: movw {{[0-9]+}}(%rsp), %r10w
+; X64-NEXT: movzwl {{[0-9]+}}(%rsp), %eax
+; X64-NEXT: movzwl {{[0-9]+}}(%rsp), %r10d
; X64-NEXT: movd %eax, %xmm0
; X64-NEXT: movd %r10d, %xmm1
; X64-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; ALL-NEXT: vmovapd %xmm0, (%rsp) # 16-byte Spill
; ALL-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; ALL-NEXT: callq __truncdfhf2
-; ALL-NEXT: movw %ax, %bx
+; ALL-NEXT: movl %eax, %ebx
; ALL-NEXT: shll $16, %ebx
; ALL-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload
; ALL-NEXT: callq __truncdfhf2
; AVX1-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: callq __truncdfhf2
-; AVX1-NEXT: movw %ax, %bx
+; AVX1-NEXT: movl %eax, %ebx
; AVX1-NEXT: shll $16, %ebx
; AVX1-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload
; AVX1-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
; AVX1-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: callq __truncdfhf2
-; AVX1-NEXT: movw %ax, %bx
+; AVX1-NEXT: movl %eax, %ebx
; AVX1-NEXT: shll $16, %ebx
; AVX1-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload
; AVX1-NEXT: callq __truncdfhf2
; AVX2-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: callq __truncdfhf2
-; AVX2-NEXT: movw %ax, %bx
+; AVX2-NEXT: movl %eax, %ebx
; AVX2-NEXT: shll $16, %ebx
; AVX2-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload
; AVX2-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
; AVX2-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: callq __truncdfhf2
-; AVX2-NEXT: movw %ax, %bx
+; AVX2-NEXT: movl %eax, %ebx
; AVX2-NEXT: shll $16, %ebx
; AVX2-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload
; AVX2-NEXT: callq __truncdfhf2
; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: callq __truncdfhf2
-; AVX512-NEXT: movw %ax, %bx
+; AVX512-NEXT: movl %eax, %ebx
; AVX512-NEXT: shll $16, %ebx
; AVX512-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload
; AVX512-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: callq __truncdfhf2
-; AVX512-NEXT: movw %ax, %bx
+; AVX512-NEXT: movl %eax, %ebx
; AVX512-NEXT: shll $16, %ebx
; AVX512-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload
; AVX512-NEXT: callq __truncdfhf2
; AVX1-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: callq __truncdfhf2
-; AVX1-NEXT: movw %ax, %bx
+; AVX1-NEXT: movl %eax, %ebx
; AVX1-NEXT: shll $16, %ebx
; AVX1-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload
; AVX1-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
; AVX1-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: callq __truncdfhf2
-; AVX1-NEXT: movw %ax, %bx
+; AVX1-NEXT: movl %eax, %ebx
; AVX1-NEXT: shll $16, %ebx
; AVX1-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload
; AVX1-NEXT: callq __truncdfhf2
; AVX2-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: callq __truncdfhf2
-; AVX2-NEXT: movw %ax, %bx
+; AVX2-NEXT: movl %eax, %ebx
; AVX2-NEXT: shll $16, %ebx
; AVX2-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload
; AVX2-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
; AVX2-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: callq __truncdfhf2
-; AVX2-NEXT: movw %ax, %bx
+; AVX2-NEXT: movl %eax, %ebx
; AVX2-NEXT: shll $16, %ebx
; AVX2-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload
; AVX2-NEXT: callq __truncdfhf2
; AVX512F-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: callq __truncdfhf2
-; AVX512F-NEXT: movw %ax, %bx
+; AVX512F-NEXT: movl %eax, %ebx
; AVX512F-NEXT: shll $16, %ebx
; AVX512F-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload
; AVX512F-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
; AVX512F-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: callq __truncdfhf2
-; AVX512F-NEXT: movw %ax, %bx
+; AVX512F-NEXT: movl %eax, %ebx
; AVX512F-NEXT: shll $16, %ebx
; AVX512F-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload
; AVX512F-NEXT: callq __truncdfhf2
; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: callq __truncdfhf2
-; AVX512VL-NEXT: movw %ax, %bx
+; AVX512VL-NEXT: movl %eax, %ebx
; AVX512VL-NEXT: shll $16, %ebx
; AVX512VL-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload
; AVX512VL-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: callq __truncdfhf2
-; AVX512VL-NEXT: movw %ax, %bx
+; AVX512VL-NEXT: movl %eax, %ebx
; AVX512VL-NEXT: shll $16, %ebx
; AVX512VL-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload
; AVX512VL-NEXT: callq __truncdfhf2
; AVX1-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: callq __truncdfhf2
-; AVX1-NEXT: movw %ax, %bx
+; AVX1-NEXT: movl %eax, %ebx
; AVX1-NEXT: shll $16, %ebx
; AVX1-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload
; AVX1-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
; AVX1-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: callq __truncdfhf2
-; AVX1-NEXT: movw %ax, %bx
+; AVX1-NEXT: movl %eax, %ebx
; AVX1-NEXT: shll $16, %ebx
; AVX1-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload
; AVX1-NEXT: callq __truncdfhf2
; AVX2-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: callq __truncdfhf2
-; AVX2-NEXT: movw %ax, %bx
+; AVX2-NEXT: movl %eax, %ebx
; AVX2-NEXT: shll $16, %ebx
; AVX2-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload
; AVX2-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
; AVX2-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: callq __truncdfhf2
-; AVX2-NEXT: movw %ax, %bx
+; AVX2-NEXT: movl %eax, %ebx
; AVX2-NEXT: shll $16, %ebx
; AVX2-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload
; AVX2-NEXT: callq __truncdfhf2
; AVX512F-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: callq __truncdfhf2
-; AVX512F-NEXT: movw %ax, %bx
+; AVX512F-NEXT: movl %eax, %ebx
; AVX512F-NEXT: shll $16, %ebx
; AVX512F-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload
; AVX512F-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
; AVX512F-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: callq __truncdfhf2
-; AVX512F-NEXT: movw %ax, %bx
+; AVX512F-NEXT: movl %eax, %ebx
; AVX512F-NEXT: shll $16, %ebx
; AVX512F-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload
; AVX512F-NEXT: callq __truncdfhf2
; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: callq __truncdfhf2
-; AVX512VL-NEXT: movw %ax, %bx
+; AVX512VL-NEXT: movl %eax, %ebx
; AVX512VL-NEXT: shll $16, %ebx
; AVX512VL-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload
; AVX512VL-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: callq __truncdfhf2
-; AVX512VL-NEXT: movw %ax, %bx
+; AVX512VL-NEXT: movl %eax, %ebx
; AVX512VL-NEXT: shll $16, %ebx
; AVX512VL-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload
; AVX512VL-NEXT: callq __truncdfhf2
; AVX1-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: callq __truncdfhf2
-; AVX1-NEXT: movw %ax, %bx
+; AVX1-NEXT: movl %eax, %ebx
; AVX1-NEXT: shll $16, %ebx
; AVX1-NEXT: vmovups {{[0-9]+}}(%rsp), %ymm0 # 32-byte Reload
; AVX1-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
; AVX1-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: callq __truncdfhf2
-; AVX1-NEXT: movw %ax, %bx
+; AVX1-NEXT: movl %eax, %ebx
; AVX1-NEXT: shll $16, %ebx
; AVX1-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload
; AVX1-NEXT: callq __truncdfhf2
; AVX1-NEXT: vpermilpd $1, (%rsp), %xmm0 # 16-byte Folded Reload
; AVX1-NEXT: # xmm0 = mem[1,0]
; AVX1-NEXT: callq __truncdfhf2
-; AVX1-NEXT: movw %ax, %bx
+; AVX1-NEXT: movl %eax, %ebx
; AVX1-NEXT: shll $16, %ebx
; AVX1-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload
; AVX1-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
; AVX1-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: callq __truncdfhf2
-; AVX1-NEXT: movw %ax, %bx
+; AVX1-NEXT: movl %eax, %ebx
; AVX1-NEXT: shll $16, %ebx
; AVX1-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload
; AVX1-NEXT: callq __truncdfhf2
; AVX2-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: callq __truncdfhf2
-; AVX2-NEXT: movw %ax, %bx
+; AVX2-NEXT: movl %eax, %ebx
; AVX2-NEXT: shll $16, %ebx
; AVX2-NEXT: vmovups {{[0-9]+}}(%rsp), %ymm0 # 32-byte Reload
; AVX2-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
; AVX2-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: callq __truncdfhf2
-; AVX2-NEXT: movw %ax, %bx
+; AVX2-NEXT: movl %eax, %ebx
; AVX2-NEXT: shll $16, %ebx
; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload
; AVX2-NEXT: callq __truncdfhf2
; AVX2-NEXT: vpermilpd $1, (%rsp), %xmm0 # 16-byte Folded Reload
; AVX2-NEXT: # xmm0 = mem[1,0]
; AVX2-NEXT: callq __truncdfhf2
-; AVX2-NEXT: movw %ax, %bx
+; AVX2-NEXT: movl %eax, %ebx
; AVX2-NEXT: shll $16, %ebx
; AVX2-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload
; AVX2-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
; AVX2-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: callq __truncdfhf2
-; AVX2-NEXT: movw %ax, %bx
+; AVX2-NEXT: movl %eax, %ebx
; AVX2-NEXT: shll $16, %ebx
; AVX2-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload
; AVX2-NEXT: callq __truncdfhf2
; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: callq __truncdfhf2
-; AVX512-NEXT: movw %ax, %bx
+; AVX512-NEXT: movl %eax, %ebx
; AVX512-NEXT: shll $16, %ebx
; AVX512-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload
; AVX512-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: callq __truncdfhf2
-; AVX512-NEXT: movw %ax, %bx
+; AVX512-NEXT: movl %eax, %ebx
; AVX512-NEXT: shll $16, %ebx
; AVX512-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload
; AVX512-NEXT: callq __truncdfhf2
; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: callq __truncdfhf2
-; AVX512-NEXT: movw %ax, %bx
+; AVX512-NEXT: movl %eax, %ebx
; AVX512-NEXT: shll $16, %ebx
; AVX512-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload
; AVX512-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: callq __truncdfhf2
-; AVX512-NEXT: movw %ax, %bx
+; AVX512-NEXT: movl %eax, %ebx
; AVX512-NEXT: shll $16, %ebx
; AVX512-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload
; AVX512-NEXT: callq __truncdfhf2
; AVX1-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: callq __truncdfhf2
-; AVX1-NEXT: movw %ax, %bp
+; AVX1-NEXT: movl %eax, %ebp
; AVX1-NEXT: shll $16, %ebp
; AVX1-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload
; AVX1-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
; AVX1-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: callq __truncdfhf2
-; AVX1-NEXT: movw %ax, %bp
+; AVX1-NEXT: movl %eax, %ebp
; AVX1-NEXT: shll $16, %ebp
; AVX1-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload
; AVX1-NEXT: callq __truncdfhf2
; AVX2-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: callq __truncdfhf2
-; AVX2-NEXT: movw %ax, %bp
+; AVX2-NEXT: movl %eax, %ebp
; AVX2-NEXT: shll $16, %ebp
; AVX2-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload
; AVX2-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
; AVX2-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: callq __truncdfhf2
-; AVX2-NEXT: movw %ax, %bp
+; AVX2-NEXT: movl %eax, %ebp
; AVX2-NEXT: shll $16, %ebp
; AVX2-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload
; AVX2-NEXT: callq __truncdfhf2
; AVX512F-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: callq __truncdfhf2
-; AVX512F-NEXT: movw %ax, %bp
+; AVX512F-NEXT: movl %eax, %ebp
; AVX512F-NEXT: shll $16, %ebp
; AVX512F-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload
; AVX512F-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
; AVX512F-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: callq __truncdfhf2
-; AVX512F-NEXT: movw %ax, %bp
+; AVX512F-NEXT: movl %eax, %ebp
; AVX512F-NEXT: shll $16, %ebp
; AVX512F-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload
; AVX512F-NEXT: callq __truncdfhf2
; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: callq __truncdfhf2
-; AVX512VL-NEXT: movw %ax, %bp
+; AVX512VL-NEXT: movl %eax, %ebp
; AVX512VL-NEXT: shll $16, %ebp
; AVX512VL-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload
; AVX512VL-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: callq __truncdfhf2
-; AVX512VL-NEXT: movw %ax, %bp
+; AVX512VL-NEXT: movl %eax, %ebp
; AVX512VL-NEXT: shll $16, %ebp
; AVX512VL-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload
; AVX512VL-NEXT: callq __truncdfhf2
; AVX1-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: callq __truncdfhf2
-; AVX1-NEXT: movw %ax, %bp
+; AVX1-NEXT: movl %eax, %ebp
; AVX1-NEXT: shll $16, %ebp
; AVX1-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload
; AVX1-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
; AVX1-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: callq __truncdfhf2
-; AVX1-NEXT: movw %ax, %bp
+; AVX1-NEXT: movl %eax, %ebp
; AVX1-NEXT: shll $16, %ebp
; AVX1-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload
; AVX1-NEXT: callq __truncdfhf2
; AVX2-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: callq __truncdfhf2
-; AVX2-NEXT: movw %ax, %bp
+; AVX2-NEXT: movl %eax, %ebp
; AVX2-NEXT: shll $16, %ebp
; AVX2-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload
; AVX2-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
; AVX2-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: callq __truncdfhf2
-; AVX2-NEXT: movw %ax, %bp
+; AVX2-NEXT: movl %eax, %ebp
; AVX2-NEXT: shll $16, %ebp
; AVX2-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload
; AVX2-NEXT: callq __truncdfhf2
; AVX512F-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: callq __truncdfhf2
-; AVX512F-NEXT: movw %ax, %bp
+; AVX512F-NEXT: movl %eax, %ebp
; AVX512F-NEXT: shll $16, %ebp
; AVX512F-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload
; AVX512F-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
; AVX512F-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: callq __truncdfhf2
-; AVX512F-NEXT: movw %ax, %bp
+; AVX512F-NEXT: movl %eax, %ebp
; AVX512F-NEXT: shll $16, %ebp
; AVX512F-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload
; AVX512F-NEXT: callq __truncdfhf2
; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: callq __truncdfhf2
-; AVX512VL-NEXT: movw %ax, %bp
+; AVX512VL-NEXT: movl %eax, %ebp
; AVX512VL-NEXT: shll $16, %ebp
; AVX512VL-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload
; AVX512VL-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: callq __truncdfhf2
-; AVX512VL-NEXT: movw %ax, %bp
+; AVX512VL-NEXT: movl %eax, %ebp
; AVX512VL-NEXT: shll $16, %ebp
; AVX512VL-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload
; AVX512VL-NEXT: callq __truncdfhf2