}
let SchedRW = [WriteFStore] in {
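+// Note: with the ISel patterns removed below, TableGen can no longer infer
+// the memory behavior from a pattern, so mayStore/hasSideEffects must be
+// stated explicitly on these stores.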
+let mayStore = 1, hasSideEffects = 0 in
def VMOVHPSZ128mr : AVX512PSI<0x17, MRMDestMem, (outs),
(ins f64mem:$dst, VR128X:$src),
"vmovhps\t{$src, $dst|$dst, $src}",
- [(store (f64 (extractelt
- (X86Unpckh (bc_v2f64 (v4f32 VR128X:$src)),
- (bc_v2f64 (v4f32 VR128X:$src))),
- (iPTR 0))), addr:$dst)]>,
- EVEX, EVEX_CD8<32, CD8VT2>;
+ []>, EVEX, EVEX_CD8<32, CD8VT2>;
def VMOVHPDZ128mr : AVX512PDI<0x17, MRMDestMem, (outs),
(ins f64mem:$dst, VR128X:$src),
"vmovhpd\t{$src, $dst|$dst, $src}",
(v2f64 (X86Unpckh VR128X:$src, VR128X:$src)),
(iPTR 0))), addr:$dst)]>,
EVEX, EVEX_CD8<64, CD8VT1>, VEX_W;
+let mayStore = 1, hasSideEffects = 0 in
def VMOVLPSZ128mr : AVX512PSI<0x13, MRMDestMem, (outs),
(ins f64mem:$dst, VR128X:$src),
"vmovlps\t{$src, $dst|$dst, $src}",
- [(store (f64 (extractelt (bc_v2f64 (v4f32 VR128X:$src)),
- (iPTR 0))), addr:$dst)]>,
- EVEX, EVEX_CD8<32, CD8VT2>;
+ []>, EVEX, EVEX_CD8<32, CD8VT2>;
def VMOVLPDZ128mr : AVX512PDI<0x13, MRMDestMem, (outs),
(ins f64mem:$dst, VR128X:$src),
"vmovlpd\t{$src, $dst|$dst, $src}",
{ X86::VUNPCKHPSYrr, X86::VUNPCKHPSYrr, X86::VPUNPCKHDQYrr },
};
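+// FP load/store instructions whose execution domain can switch between
+// PackedSingle and PackedDouble. These have no integer-domain equivalent,
+// so the third column is filled with INSTRUCTION_LIST_END.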
+static const uint16_t ReplaceableInstrsFP[][3] = {
+ //PackedSingle PackedDouble
+ { X86::MOVLPSrm, X86::MOVLPDrm, X86::INSTRUCTION_LIST_END },
+ { X86::MOVHPSrm, X86::MOVHPDrm, X86::INSTRUCTION_LIST_END },
+ { X86::MOVHPSmr, X86::MOVHPDmr, X86::INSTRUCTION_LIST_END },
+ { X86::VMOVLPSrm, X86::VMOVLPDrm, X86::INSTRUCTION_LIST_END },
+ { X86::VMOVHPSrm, X86::VMOVHPDrm, X86::INSTRUCTION_LIST_END },
+ { X86::VMOVHPSmr, X86::VMOVHPDmr, X86::INSTRUCTION_LIST_END },
+ { X86::VMOVLPSZ128rm, X86::VMOVLPDZ128rm, X86::INSTRUCTION_LIST_END },
+ { X86::VMOVHPSZ128rm, X86::VMOVHPDZ128rm, X86::INSTRUCTION_LIST_END },
+ { X86::VMOVHPSZ128mr, X86::VMOVHPDZ128mr, X86::INSTRUCTION_LIST_END },
+};
+
static const uint16_t ReplaceableInstrsAVX2InsertExtract[][3] = {
//PackedSingle PackedDouble PackedInt
{ X86::VEXTRACTF128mr, X86::VEXTRACTF128mr, X86::VEXTRACTI128mr },
};
// NOTE: These should only be used by the custom domain methods.
-static const uint16_t ReplaceableCustomInstrs[][3] = {
+static const uint16_t ReplaceableBlendInstrs[][3] = {
//PackedSingle PackedDouble PackedInt
{ X86::BLENDPSrmi, X86::BLENDPDrmi, X86::PBLENDWrmi },
{ X86::BLENDPSrri, X86::BLENDPDrri, X86::PBLENDWrri },
{ X86::VBLENDPSYrmi, X86::VBLENDPDYrmi, X86::VPBLENDWYrmi },
{ X86::VBLENDPSYrri, X86::VBLENDPDYrri, X86::VPBLENDWYrri },
};
-static const uint16_t ReplaceableCustomAVX2Instrs[][3] = {
+static const uint16_t ReplaceableBlendAVX2Instrs[][3] = {
//PackedSingle PackedDouble PackedInt
{ X86::VBLENDPSrmi, X86::VBLENDPDrmi, X86::VPBLENDDrmi },
{ X86::VBLENDPSrri, X86::VBLENDPDrri, X86::VPBLENDDrri },
Imm = (ImmWidth == 16 ? ((Imm << 8) | Imm) : Imm);
unsigned NewImm = Imm;
- const uint16_t *table = lookup(Opcode, dom, ReplaceableCustomInstrs);
+ const uint16_t *table = lookup(Opcode, dom, ReplaceableBlendInstrs);
if (!table)
- table = lookup(Opcode, dom, ReplaceableCustomAVX2Instrs);
+ table = lookup(Opcode, dom, ReplaceableBlendAVX2Instrs);
if (Domain == 1) { // PackedSingle
AdjustBlendMask(Imm, ImmWidth, Is256 ? 8 : 4, &NewImm);
if (Subtarget.hasAVX2()) {
// If we are already VPBLENDW use that, else use VPBLENDD.
if ((ImmWidth / (Is256 ? 2 : 1)) != 8) {
- table = lookup(Opcode, dom, ReplaceableCustomAVX2Instrs);
+ table = lookup(Opcode, dom, ReplaceableBlendAVX2Instrs);
AdjustBlendMask(Imm, ImmWidth, Is256 ? 8 : 4, &NewImm);
}
} else {
validDomains = 0xe;
} else if (lookup(opcode, domain, ReplaceableInstrsAVX2)) {
validDomains = Subtarget.hasAVX2() ? 0xe : 0x6;
+ } else if (lookup(opcode, domain, ReplaceableInstrsFP)) {
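+ // No PackedInt form exists for these, so only the PackedSingle (0x2) and
+ // PackedDouble (0x4) domain bits are valid.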
+ validDomains = 0x6;
} else if (lookup(opcode, domain, ReplaceableInstrsAVX2InsertExtract)) {
// Insert/extract instructions should only affect domain if AVX2
// is enabled.
"256-bit vector operations only available in AVX2");
table = lookup(MI.getOpcode(), dom, ReplaceableInstrsAVX2);
}
+ if (!table) { // try the FP table
+ table = lookup(MI.getOpcode(), dom, ReplaceableInstrsFP);
+ assert((!table || Domain < 3) &&
+ "Can only select PackedSingle or PackedDouble");
+ }
if (!table) { // try the other table
assert(Subtarget.hasAVX2() &&
"256-bit insert/extract only available in AVX2");
let SchedRW = [WriteFStore] in {
let Predicates = [UseAVX] in {
+let mayStore = 1, hasSideEffects = 0 in
def VMOVLPSmr : VPSI<0x13, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
"movlps\t{$src, $dst|$dst, $src}",
- [(store (f64 (extractelt (bc_v2f64 (v4f32 VR128:$src)),
- (iPTR 0))), addr:$dst)]>,
+ []>,
VEX, VEX_WIG;
def VMOVLPDmr : VPDI<0x13, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
"movlpd\t{$src, $dst|$dst, $src}",
(iPTR 0))), addr:$dst)]>,
VEX, VEX_WIG;
} // UseAVX
+let mayStore = 1, hasSideEffects = 0 in
def MOVLPSmr : PSI<0x13, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
"movlps\t{$src, $dst|$dst, $src}",
- [(store (f64 (extractelt (bc_v2f64 (v4f32 VR128:$src)),
- (iPTR 0))), addr:$dst)]>;
+ []>;
def MOVLPDmr : PDI<0x13, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
"movlpd\t{$src, $dst|$dst, $src}",
[(store (f64 (extractelt (v2f64 VR128:$src),
// v2f64 extract element 1 is always custom lowered to unpack high to low
// and extract element 0 so the non-store version isn't too horrible.
let Predicates = [UseAVX] in {
+let mayStore = 1, hasSideEffects = 0 in
def VMOVHPSmr : VPSI<0x17, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
"movhps\t{$src, $dst|$dst, $src}",
- [(store (f64 (extractelt
- (X86Unpckh (bc_v2f64 (v4f32 VR128:$src)),
- (bc_v2f64 (v4f32 VR128:$src))),
- (iPTR 0))), addr:$dst)]>, VEX, VEX_WIG;
+ []>, VEX, VEX_WIG;
def VMOVHPDmr : VPDI<0x17, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
"movhpd\t{$src, $dst|$dst, $src}",
[(store (f64 (extractelt
(v2f64 (X86Unpckh VR128:$src, VR128:$src)),
(iPTR 0))), addr:$dst)]>, VEX, VEX_WIG;
} // UseAVX
+let mayStore = 1, hasSideEffects = 0 in
def MOVHPSmr : PSI<0x17, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
"movhps\t{$src, $dst|$dst, $src}",
- [(store (f64 (extractelt
- (X86Unpckh (bc_v2f64 (v4f32 VR128:$src)),
- (bc_v2f64 (v4f32 VR128:$src))),
- (iPTR 0))), addr:$dst)]>;
+ []>;
def MOVHPDmr : PDI<0x17, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
"movhpd\t{$src, $dst|$dst, $src}",
[(store (f64 (extractelt
; NOGATHER-NEXT: # %bb.1: # %cond.load
; NOGATHER-NEXT: vmovq %xmm2, %rax
; NOGATHER-NEXT: vmovsd {{.*#+}} xmm3 = mem[0],zero
-; NOGATHER-NEXT: vblendpd {{.*#+}} ymm1 = ymm3[0],ymm1[1,2,3]
+; NOGATHER-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1],ymm1[2,3,4,5,6,7]
; NOGATHER-NEXT: .LBB9_2: # %else
; NOGATHER-NEXT: vpextrb $4, %xmm0, %eax
; NOGATHER-NEXT: testb $1, %al
; NOGATHER-NEXT: je .LBB9_4
; NOGATHER-NEXT: # %bb.3: # %cond.load1
; NOGATHER-NEXT: vpextrq $1, %xmm2, %rax
-; NOGATHER-NEXT: vmovhpd {{.*#+}} xmm3 = xmm1[0],mem[0]
-; NOGATHER-NEXT: vblendpd {{.*#+}} ymm1 = ymm3[0,1],ymm1[2,3]
+; NOGATHER-NEXT: vmovhps {{.*#+}} xmm3 = xmm1[0,1],mem[0,1]
+; NOGATHER-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7]
; NOGATHER-NEXT: .LBB9_4: # %else2
; NOGATHER-NEXT: vpextrb $8, %xmm0, %eax
; NOGATHER-NEXT: vextractf128 $1, %ymm2, %xmm2
; NOGATHER-NEXT: # %bb.5: # %cond.load4
; NOGATHER-NEXT: vmovq %xmm2, %rax
; NOGATHER-NEXT: vextractf128 $1, %ymm1, %xmm3
-; NOGATHER-NEXT: vmovlpd {{.*#+}} xmm3 = mem[0],xmm3[1]
+; NOGATHER-NEXT: vmovlps {{.*#+}} xmm3 = mem[0,1],xmm3[2,3]
; NOGATHER-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1
; NOGATHER-NEXT: .LBB9_6: # %else5
; NOGATHER-NEXT: vpextrb $12, %xmm0, %eax
; NOGATHER-NEXT: # %bb.7: # %cond.load7
; NOGATHER-NEXT: vpextrq $1, %xmm2, %rax
; NOGATHER-NEXT: vextractf128 $1, %ymm1, %xmm0
-; NOGATHER-NEXT: vmovhpd {{.*#+}} xmm0 = xmm0[0],mem[0]
+; NOGATHER-NEXT: vmovhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1]
; NOGATHER-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm1
; NOGATHER-NEXT: .LBB9_8: # %else8
-; NOGATHER-NEXT: vmovapd %ymm1, %ymm0
+; NOGATHER-NEXT: vmovaps %ymm1, %ymm0
; NOGATHER-NEXT: retq
entry:
%ld = load <4 x double*>, <4 x double*>* %ptr
; NOGATHER-NEXT: je .LBB11_2
; NOGATHER-NEXT: # %bb.1: # %cond.load
; NOGATHER-NEXT: vmovq %xmm2, %rax
-; NOGATHER-NEXT: vmovlpd {{.*#+}} xmm1 = mem[0],xmm1[1]
+; NOGATHER-NEXT: vmovlps {{.*#+}} xmm1 = mem[0,1],xmm1[2,3]
; NOGATHER-NEXT: .LBB11_2: # %else
; NOGATHER-NEXT: vpextrb $8, %xmm0, %eax
; NOGATHER-NEXT: testb $1, %al
; NOGATHER-NEXT: je .LBB11_4
; NOGATHER-NEXT: # %bb.3: # %cond.load1
; NOGATHER-NEXT: vpextrq $1, %xmm2, %rax
-; NOGATHER-NEXT: vmovhpd {{.*#+}} xmm1 = xmm1[0],mem[0]
+; NOGATHER-NEXT: vmovhps {{.*#+}} xmm1 = xmm1[0,1],mem[0,1]
; NOGATHER-NEXT: .LBB11_4: # %else2
-; NOGATHER-NEXT: vmovapd %xmm1, %xmm0
+; NOGATHER-NEXT: vmovaps %xmm1, %xmm0
; NOGATHER-NEXT: retq
entry:
%ld = load <2 x double*>, <2 x double*>* %ptr
define <8 x double> @test2(<8 x double> %x, double* %br, double %y) nounwind {
; CHECK-LABEL: test2:
; CHECK: ## %bb.0:
-; CHECK-NEXT: vmovhpd {{.*#+}} xmm2 = xmm0[0],mem[0]
+; CHECK-NEXT: vmovhps {{.*#+}} xmm2 = xmm0[0,1],mem[0,1]
; CHECK-NEXT: vinsertf32x4 $0, %xmm2, %zmm0, %zmm2
; CHECK-NEXT: vextractf32x4 $3, %zmm0, %xmm0
-; CHECK-NEXT: vblendpd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; CHECK-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
; CHECK-NEXT: vinsertf32x4 $3, %xmm0, %zmm2, %zmm0
; CHECK-NEXT: retq
%rrr = load double, double* %br
; AVX512-LABEL: load_one_mask_bit_set5:
; AVX512: ## %bb.0:
; AVX512-NEXT: vextractf32x4 $3, %zmm0, %xmm1
-; AVX512-NEXT: vmovhpd {{.*#+}} xmm1 = xmm1[0],mem[0]
+; AVX512-NEXT: vmovhps {{.*#+}} xmm1 = xmm1[0,1],mem[0,1]
; AVX512-NEXT: vinsertf32x4 $3, %xmm1, %zmm0, %zmm0
; AVX512-NEXT: retq
%res = call <8 x double> @llvm.masked.load.v8f64.p0v8f64(<8 x double>* %addr, i32 4, <8 x i1><i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 true>, <8 x double> %val)
;
; SLOW-LABEL: merge_vec_element_store:
; SLOW: # %bb.0:
-; SLOW-NEXT: movlpd %xmm0, (%rdi)
-; SLOW-NEXT: movhpd %xmm0, 8(%rdi)
+; SLOW-NEXT: movlps %xmm0, (%rdi)
+; SLOW-NEXT: movhps %xmm0, 8(%rdi)
; SLOW-NEXT: retq
%vecext0 = extractelement <4 x double> %v, i32 0
; SSE-X32-LABEL: extract_f64_1:
; SSE-X32: # %bb.0:
; SSE-X32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; SSE-X32-NEXT: movhpd %xmm0, (%eax)
+; SSE-X32-NEXT: movhps %xmm0, (%eax)
; SSE-X32-NEXT: retl
;
; SSE-X64-LABEL: extract_f64_1:
; SSE-X64: # %bb.0:
-; SSE-X64-NEXT: movhpd %xmm0, (%rdi)
+; SSE-X64-NEXT: movhps %xmm0, (%rdi)
; SSE-X64-NEXT: retq
;
; AVX-X32-LABEL: extract_f64_1:
; AVX-X32: # %bb.0:
; AVX-X32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; AVX-X32-NEXT: vmovhpd %xmm0, (%eax)
+; AVX-X32-NEXT: vmovhps %xmm0, (%eax)
; AVX-X32-NEXT: retl
;
; AVX-X64-LABEL: extract_f64_1:
; AVX-X64: # %bb.0:
-; AVX-X64-NEXT: vmovhpd %xmm0, (%rdi)
+; AVX-X64-NEXT: vmovhps %xmm0, (%rdi)
; AVX-X64-NEXT: retq
%vecext = extractelement <2 x double> %foo, i32 1
store double %vecext, double* %dst, align 1
; X32-SSE2-LABEL: t3:
; X32-SSE2: # %bb.0: # %bb
; X32-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-SSE2-NEXT: movupd (%eax), %xmm0
-; X32-SSE2-NEXT: movhpd %xmm0, (%eax)
+; X32-SSE2-NEXT: movups (%eax), %xmm0
+; X32-SSE2-NEXT: movhps %xmm0, (%eax)
; X32-SSE2-NEXT: retl
;
; X64-SSSE3-LABEL: t3:
; FMACALL32_BDVER2-NEXT: vmovups %xmm0, (%esp) ## encoding: [0xc5,0xf8,0x11,0x04,0x24]
; FMACALL32_BDVER2-NEXT: calll _fma ## encoding: [0xe8,A,A,A,A]
; FMACALL32_BDVER2-NEXT: ## fixup A - offset: 1, value: _fma-4, kind: FK_PCRel_4
-; FMACALL32_BDVER2-NEXT: vmovapd {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 ## 16-byte Reload
-; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xf9,0x28,0x44,0x24,0x30]
-; FMACALL32_BDVER2-NEXT: vmovhpd %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc5,0xf9,0x17,0x44,0x24,0x10]
+; FMACALL32_BDVER2-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 ## 16-byte Reload
+; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xf8,0x28,0x44,0x24,0x30]
+; FMACALL32_BDVER2-NEXT: vmovhps %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc5,0xf8,0x17,0x44,0x24,0x10]
; FMACALL32_BDVER2-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 ## 16-byte Reload
; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xf8,0x28,0x44,0x24,0x40]
; FMACALL32_BDVER2-NEXT: vmovlps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0, %xmm0 ## 16-byte Folded Reload
; FMACALL32_BDVER2-NEXT: fstpl {{[0-9]+}}(%esp) ## encoding: [0xdd,0x5c,0x24,0x20]
; FMACALL32_BDVER2-NEXT: vmovsd {{[0-9]+}}(%esp), %xmm0 ## encoding: [0xc5,0xfb,0x10,0x44,0x24,0x28]
; FMACALL32_BDVER2-NEXT: ## xmm0 = mem[0],zero
-; FMACALL32_BDVER2-NEXT: vmovhpd {{[0-9]+}}(%esp), %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0x16,0x44,0x24,0x20]
-; FMACALL32_BDVER2-NEXT: ## xmm0 = xmm0[0],mem[0]
+; FMACALL32_BDVER2-NEXT: vmovhps {{[0-9]+}}(%esp), %xmm0, %xmm0 ## encoding: [0xc5,0xf8,0x16,0x44,0x24,0x20]
+; FMACALL32_BDVER2-NEXT: ## xmm0 = xmm0[0,1],mem[0,1]
; FMACALL32_BDVER2-NEXT: addl $108, %esp ## encoding: [0x83,0xc4,0x6c]
; FMACALL32_BDVER2-NEXT: retl ## encoding: [0xc3]
entry:
; FMACALL32_BDVER2-NEXT: ## fixup A - offset: 1, value: _fma-4, kind: FK_PCRel_4
; FMACALL32_BDVER2-NEXT: fstpt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Spill
; FMACALL32_BDVER2-NEXT: ## encoding: [0xdb,0x7c,0x24,0x44]
-; FMACALL32_BDVER2-NEXT: vmovupd {{[-0-9]+}}(%e{{[sb]}}p), %ymm0 ## 32-byte Reload
-; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xfd,0x10,0x84,0x24,0xc0,0x00,0x00,0x00]
-; FMACALL32_BDVER2-NEXT: vmovhpd %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc5,0xf9,0x17,0x44,0x24,0x10]
+; FMACALL32_BDVER2-NEXT: vmovups {{[-0-9]+}}(%e{{[sb]}}p), %ymm0 ## 32-byte Reload
+; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xfc,0x10,0x84,0x24,0xc0,0x00,0x00,0x00]
+; FMACALL32_BDVER2-NEXT: vmovhps %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc5,0xf8,0x17,0x44,0x24,0x10]
; FMACALL32_BDVER2-NEXT: vmovups {{[-0-9]+}}(%e{{[sb]}}p), %ymm0 ## 32-byte Reload
; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xfc,0x10,0x84,0x24,0x80,0x00,0x00,0x00]
; FMACALL32_BDVER2-NEXT: vunpckhpd {{[-0-9]+}}(%e{{[sb]}}p), %xmm0, %xmm0 ## 16-byte Folded Reload
; FMACALL32_BDVER2-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77]
; FMACALL32_BDVER2-NEXT: calll _fma ## encoding: [0xe8,A,A,A,A]
; FMACALL32_BDVER2-NEXT: ## fixup A - offset: 1, value: _fma-4, kind: FK_PCRel_4
-; FMACALL32_BDVER2-NEXT: vmovapd {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 ## 16-byte Reload
-; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xf9,0x28,0x44,0x24,0x70]
-; FMACALL32_BDVER2-NEXT: vmovhpd %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc5,0xf9,0x17,0x44,0x24,0x10]
+; FMACALL32_BDVER2-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 ## 16-byte Reload
+; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xf8,0x28,0x44,0x24,0x70]
+; FMACALL32_BDVER2-NEXT: vmovhps %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc5,0xf8,0x17,0x44,0x24,0x10]
; FMACALL32_BDVER2-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 ## 16-byte Reload
; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xf8,0x28,0x44,0x24,0x50]
; FMACALL32_BDVER2-NEXT: vmovlps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0, %xmm0 ## 16-byte Folded Reload
; FMACALL32_BDVER2-NEXT: ## xmm0 = mem[0],zero
; FMACALL32_BDVER2-NEXT: vmovsd {{[0-9]+}}(%esp), %xmm1 ## encoding: [0xc5,0xfb,0x10,0x4c,0x24,0x20]
; FMACALL32_BDVER2-NEXT: ## xmm1 = mem[0],zero
-; FMACALL32_BDVER2-NEXT: vmovhpd {{[0-9]+}}(%esp), %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0x16,0x44,0x24,0x28]
-; FMACALL32_BDVER2-NEXT: ## xmm0 = xmm0[0],mem[0]
-; FMACALL32_BDVER2-NEXT: vmovhpd {{[0-9]+}}(%esp), %xmm1, %xmm1 ## encoding: [0xc5,0xf1,0x16,0x4c,0x24,0x18]
-; FMACALL32_BDVER2-NEXT: ## xmm1 = xmm1[0],mem[0]
+; FMACALL32_BDVER2-NEXT: vmovhps {{[0-9]+}}(%esp), %xmm0, %xmm0 ## encoding: [0xc5,0xf8,0x16,0x44,0x24,0x28]
+; FMACALL32_BDVER2-NEXT: ## xmm0 = xmm0[0,1],mem[0,1]
+; FMACALL32_BDVER2-NEXT: vmovhps {{[0-9]+}}(%esp), %xmm1, %xmm1 ## encoding: [0xc5,0xf0,0x16,0x4c,0x24,0x18]
+; FMACALL32_BDVER2-NEXT: ## xmm1 = xmm1[0,1],mem[0,1]
; FMACALL32_BDVER2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe3,0x7d,0x18,0xc1,0x01]
; FMACALL32_BDVER2-NEXT: addl $252, %esp ## encoding: [0x81,0xc4,0xfc,0x00,0x00,0x00]
; FMACALL32_BDVER2-NEXT: retl ## encoding: [0xc3]
; FMACALL32_BDVER2-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77]
; FMACALL32_BDVER2-NEXT: calll _fma ## encoding: [0xe8,A,A,A,A]
; FMACALL32_BDVER2-NEXT: ## fixup A - offset: 1, value: _fma-4, kind: FK_PCRel_4
-; FMACALL32_BDVER2-NEXT: vmovapd 40(%ebp), %xmm0 ## encoding: [0xc5,0xf9,0x28,0x45,0x28]
+; FMACALL32_BDVER2-NEXT: vmovaps 40(%ebp), %xmm0 ## encoding: [0xc5,0xf8,0x28,0x45,0x28]
; FMACALL32_BDVER2-NEXT: fstpt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Spill
; FMACALL32_BDVER2-NEXT: ## encoding: [0xdb,0xbc,0x24,0x94,0x00,0x00,0x00]
-; FMACALL32_BDVER2-NEXT: vmovhpd %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc5,0xf9,0x17,0x44,0x24,0x10]
+; FMACALL32_BDVER2-NEXT: vmovhps %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc5,0xf8,0x17,0x44,0x24,0x10]
; FMACALL32_BDVER2-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %ymm0 ## 32-byte Reload
; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xfc,0x28,0x84,0x24,0xa0,0x00,0x00,0x00]
; FMACALL32_BDVER2-NEXT: vunpckhpd {{[-0-9]+}}(%e{{[sb]}}p), %xmm0, %xmm0 ## 16-byte Folded Reload
; FMACALL32_BDVER2-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77]
; FMACALL32_BDVER2-NEXT: calll _fma ## encoding: [0xe8,A,A,A,A]
; FMACALL32_BDVER2-NEXT: ## fixup A - offset: 1, value: _fma-4, kind: FK_PCRel_4
-; FMACALL32_BDVER2-NEXT: vmovapd 24(%ebp), %xmm0 ## encoding: [0xc5,0xf9,0x28,0x45,0x18]
+; FMACALL32_BDVER2-NEXT: vmovaps 24(%ebp), %xmm0 ## encoding: [0xc5,0xf8,0x28,0x45,0x18]
; FMACALL32_BDVER2-NEXT: fstpt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Spill
; FMACALL32_BDVER2-NEXT: ## encoding: [0xdb,0xbc,0x24,0xc0,0x00,0x00,0x00]
-; FMACALL32_BDVER2-NEXT: vmovhpd %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc5,0xf9,0x17,0x44,0x24,0x10]
+; FMACALL32_BDVER2-NEXT: vmovhps %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc5,0xf8,0x17,0x44,0x24,0x10]
; FMACALL32_BDVER2-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %ymm0 ## 32-byte Reload
; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xfc,0x28,0x84,0x24,0x00,0x01,0x00,0x00]
; FMACALL32_BDVER2-NEXT: vextractf128 $1, %ymm0, %xmm1 ## encoding: [0xc4,0xe3,0x7d,0x19,0xc1,0x01]
; FMACALL32_BDVER2-NEXT: vmovups %xmm0, (%esp) ## encoding: [0xc5,0xf8,0x11,0x04,0x24]
; FMACALL32_BDVER2-NEXT: calll _fma ## encoding: [0xe8,A,A,A,A]
; FMACALL32_BDVER2-NEXT: ## fixup A - offset: 1, value: _fma-4, kind: FK_PCRel_4
-; FMACALL32_BDVER2-NEXT: vmovapd 8(%ebp), %xmm0 ## encoding: [0xc5,0xf9,0x28,0x45,0x08]
+; FMACALL32_BDVER2-NEXT: vmovaps 8(%ebp), %xmm0 ## encoding: [0xc5,0xf8,0x28,0x45,0x08]
; FMACALL32_BDVER2-NEXT: fstpt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Spill
; FMACALL32_BDVER2-NEXT: ## encoding: [0xdb,0x7c,0x24,0x30]
-; FMACALL32_BDVER2-NEXT: vmovhpd %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc5,0xf9,0x17,0x44,0x24,0x10]
+; FMACALL32_BDVER2-NEXT: vmovhps %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc5,0xf8,0x17,0x44,0x24,0x10]
; FMACALL32_BDVER2-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %ymm0 ## 32-byte Reload
; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xfc,0x28,0x84,0x24,0xe0,0x00,0x00,0x00]
; FMACALL32_BDVER2-NEXT: vunpckhpd {{[-0-9]+}}(%e{{[sb]}}p), %xmm0, %xmm0 ## 16-byte Folded Reload
; FMACALL32_BDVER2-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77]
; FMACALL32_BDVER2-NEXT: calll _fma ## encoding: [0xe8,A,A,A,A]
; FMACALL32_BDVER2-NEXT: ## fixup A - offset: 1, value: _fma-4, kind: FK_PCRel_4
-; FMACALL32_BDVER2-NEXT: vmovapd 56(%ebp), %xmm0 ## encoding: [0xc5,0xf9,0x28,0x45,0x38]
-; FMACALL32_BDVER2-NEXT: vmovhpd %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc5,0xf9,0x17,0x44,0x24,0x10]
+; FMACALL32_BDVER2-NEXT: vmovaps 56(%ebp), %xmm0 ## encoding: [0xc5,0xf8,0x28,0x45,0x38]
+; FMACALL32_BDVER2-NEXT: vmovhps %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc5,0xf8,0x17,0x44,0x24,0x10]
; FMACALL32_BDVER2-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 ## 16-byte Reload
; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xf8,0x28,0x84,0x24,0x30,0x01,0x00,0x00]
; FMACALL32_BDVER2-NEXT: vmovlps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0, %xmm0 ## 16-byte Folded Reload
; FMACALL32_BDVER2-NEXT: ## xmm0 = mem[0],zero
; FMACALL32_BDVER2-NEXT: vmovsd {{[0-9]+}}(%esp), %xmm1 ## encoding: [0xc5,0xfb,0x10,0x4c,0x24,0x50]
; FMACALL32_BDVER2-NEXT: ## xmm1 = mem[0],zero
-; FMACALL32_BDVER2-NEXT: vmovhpd {{[0-9]+}}(%esp), %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0x16,0x44,0x24,0x58]
-; FMACALL32_BDVER2-NEXT: ## xmm0 = xmm0[0],mem[0]
-; FMACALL32_BDVER2-NEXT: vmovhpd {{[0-9]+}}(%esp), %xmm1, %xmm1 ## encoding: [0xc5,0xf1,0x16,0x4c,0x24,0x48]
-; FMACALL32_BDVER2-NEXT: ## xmm1 = xmm1[0],mem[0]
+; FMACALL32_BDVER2-NEXT: vmovhps {{[0-9]+}}(%esp), %xmm0, %xmm0 ## encoding: [0xc5,0xf8,0x16,0x44,0x24,0x58]
+; FMACALL32_BDVER2-NEXT: ## xmm0 = xmm0[0,1],mem[0,1]
+; FMACALL32_BDVER2-NEXT: vmovhps {{[0-9]+}}(%esp), %xmm1, %xmm1 ## encoding: [0xc5,0xf0,0x16,0x4c,0x24,0x48]
+; FMACALL32_BDVER2-NEXT: ## xmm1 = xmm1[0,1],mem[0,1]
; FMACALL32_BDVER2-NEXT: vmovsd {{[0-9]+}}(%esp), %xmm2 ## encoding: [0xc5,0xfb,0x10,0x54,0x24,0x70]
; FMACALL32_BDVER2-NEXT: ## xmm2 = mem[0],zero
-; FMACALL32_BDVER2-NEXT: vmovhpd {{[0-9]+}}(%esp), %xmm2, %xmm2 ## encoding: [0xc5,0xe9,0x16,0x54,0x24,0x68]
-; FMACALL32_BDVER2-NEXT: ## xmm2 = xmm2[0],mem[0]
+; FMACALL32_BDVER2-NEXT: vmovhps {{[0-9]+}}(%esp), %xmm2, %xmm2 ## encoding: [0xc5,0xe8,0x16,0x54,0x24,0x68]
+; FMACALL32_BDVER2-NEXT: ## xmm2 = xmm2[0,1],mem[0,1]
; FMACALL32_BDVER2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe3,0x7d,0x18,0xc1,0x01]
; FMACALL32_BDVER2-NEXT: vmovsd {{[0-9]+}}(%esp), %xmm1 ## encoding: [0xc5,0xfb,0x10,0x8c,0x24,0x80,0x00,0x00,0x00]
; FMACALL32_BDVER2-NEXT: ## xmm1 = mem[0],zero
-; FMACALL32_BDVER2-NEXT: vmovhpd {{[0-9]+}}(%esp), %xmm1, %xmm1 ## encoding: [0xc5,0xf1,0x16,0x4c,0x24,0x78]
-; FMACALL32_BDVER2-NEXT: ## xmm1 = xmm1[0],mem[0]
+; FMACALL32_BDVER2-NEXT: vmovhps {{[0-9]+}}(%esp), %xmm1, %xmm1 ## encoding: [0xc5,0xf0,0x16,0x4c,0x24,0x78]
+; FMACALL32_BDVER2-NEXT: ## xmm1 = xmm1[0,1],mem[0,1]
; FMACALL32_BDVER2-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 ## encoding: [0xc4,0xe3,0x75,0x18,0xca,0x01]
; FMACALL32_BDVER2-NEXT: movl %ebp, %esp ## encoding: [0x89,0xec]
; FMACALL32_BDVER2-NEXT: popl %ebp ## encoding: [0x5d]
; LIN-SSE2-NEXT: movslq %edx, %rdx
; LIN-SSE2-NEXT: movslq %esi, %rsi
; LIN-SSE2-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
-; LIN-SSE2-NEXT: movhpd {{.*#+}} xmm0 = xmm0[0],mem[0]
+; LIN-SSE2-NEXT: movhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1]
; LIN-SSE2-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero
-; LIN-SSE2-NEXT: movhpd {{.*#+}} xmm1 = xmm1[0],mem[0]
+; LIN-SSE2-NEXT: movhps {{.*#+}} xmm1 = xmm1[0,1],mem[0,1]
; LIN-SSE2-NEXT: retq
;
; LIN-SSE4-LABEL: foo:
; LIN-SSE4-NEXT: movslq %ecx, %rcx
; LIN-SSE4-NEXT: movslq %edx, %rdx
; LIN-SSE4-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
-; LIN-SSE4-NEXT: movhpd {{.*#+}} xmm0 = xmm0[0],mem[0]
+; LIN-SSE4-NEXT: movhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1]
; LIN-SSE4-NEXT: movslq %esi, %rax
; LIN-SSE4-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero
-; LIN-SSE4-NEXT: movhpd {{.*#+}} xmm1 = xmm1[0],mem[0]
+; LIN-SSE4-NEXT: movhps {{.*#+}} xmm1 = xmm1[0,1],mem[0,1]
; LIN-SSE4-NEXT: retq
;
; WIN-SSE2-LABEL: foo:
; WIN-SSE2-NEXT: movslq %r10d, %r9
; WIN-SSE2-NEXT: movslq %edx, %rdx
; WIN-SSE2-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
-; WIN-SSE2-NEXT: movhpd {{.*#+}} xmm0 = xmm0[0],mem[0]
+; WIN-SSE2-NEXT: movhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1]
; WIN-SSE2-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero
-; WIN-SSE2-NEXT: movhpd {{.*#+}} xmm1 = xmm1[0],mem[0]
+; WIN-SSE2-NEXT: movhps {{.*#+}} xmm1 = xmm1[0,1],mem[0,1]
; WIN-SSE2-NEXT: retq
;
; WIN-SSE4-LABEL: foo:
; WIN-SSE4-NEXT: movslq %edx, %rdx
; WIN-SSE4-NEXT: movslq %r8d, %r8
; WIN-SSE4-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
-; WIN-SSE4-NEXT: movhpd {{.*#+}} xmm0 = xmm0[0],mem[0]
+; WIN-SSE4-NEXT: movhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1]
; WIN-SSE4-NEXT: movslq %r9d, %rax
; WIN-SSE4-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero
-; WIN-SSE4-NEXT: movhpd {{.*#+}} xmm1 = xmm1[0],mem[0]
+; WIN-SSE4-NEXT: movhps {{.*#+}} xmm1 = xmm1[0,1],mem[0,1]
; WIN-SSE4-NEXT: retq
;
; LIN32-LABEL: foo:
; LIN32-NEXT: pextrd $3, %xmm0, %esi
; LIN32-NEXT: movd %xmm0, %edi
; LIN32-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
-; LIN32-NEXT: movhpd {{.*#+}} xmm0 = xmm0[0],mem[0]
+; LIN32-NEXT: movhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1]
; LIN32-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero
-; LIN32-NEXT: movhpd {{.*#+}} xmm1 = xmm1[0],mem[0]
+; LIN32-NEXT: movhps {{.*#+}} xmm1 = xmm1[0,1],mem[0,1]
; LIN32-NEXT: popl %esi
; LIN32-NEXT: popl %edi
; LIN32-NEXT: retl
; CHECK-I686-NEXT: fldt {{[-0-9]+}}(%e{{[sb]}}p) # 10-byte Folded Reload
; CHECK-I686-NEXT: fstpl {{[0-9]+}}(%esp)
; CHECK-I686-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
-; CHECK-I686-NEXT: movhpd {{.*#+}} xmm0 = xmm0[0],mem[0]
+; CHECK-I686-NEXT: movhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1]
; CHECK-I686-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero
-; CHECK-I686-NEXT: movhpd {{.*#+}} xmm1 = xmm1[0],mem[0]
+; CHECK-I686-NEXT: movhps {{.*#+}} xmm1 = xmm1[0,1],mem[0,1]
; CHECK-I686-NEXT: addl $88, %esp
; CHECK-I686-NEXT: popl %esi
; CHECK-I686-NEXT: retl
; CHECK-I686-NEXT: movlps %xmm0, (%esp)
; CHECK-I686-NEXT: calll __truncdfhf2
; CHECK-I686-NEXT: movw %ax, %si
-; CHECK-I686-NEXT: movapd {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
-; CHECK-I686-NEXT: movhpd %xmm0, (%esp)
+; CHECK-I686-NEXT: movaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
+; CHECK-I686-NEXT: movhps %xmm0, (%esp)
; CHECK-I686-NEXT: calll __truncdfhf2
; CHECK-I686-NEXT: movw %ax, %di
; CHECK-I686-NEXT: movaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
; CHECK-I686-NEXT: movlps %xmm0, (%esp)
; CHECK-I686-NEXT: calll __truncdfhf2
; CHECK-I686-NEXT: movw %ax, %bx
-; CHECK-I686-NEXT: movapd {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
-; CHECK-I686-NEXT: movhpd %xmm0, (%esp)
+; CHECK-I686-NEXT: movaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
+; CHECK-I686-NEXT: movhps %xmm0, (%esp)
; CHECK-I686-NEXT: calll __truncdfhf2
; CHECK-I686-NEXT: movw %ax, 6(%ebp)
; CHECK-I686-NEXT: movw %bx, 4(%ebp)
define <2 x double> @elt1_v2f64(double %x) {
; X32SSE-LABEL: elt1_v2f64:
; X32SSE: # %bb.0:
-; X32SSE-NEXT: movapd {{.*#+}} xmm0 = <4.2E+1,u>
-; X32SSE-NEXT: movhpd {{.*#+}} xmm0 = xmm0[0],mem[0]
+; X32SSE-NEXT: movaps {{.*#+}} xmm0 = <4.2E+1,u>
+; X32SSE-NEXT: movhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1]
; X32SSE-NEXT: retl
;
; X64SSE-LABEL: elt1_v2f64:
;
; X32AVX-LABEL: elt1_v2f64:
; X32AVX: # %bb.0:
-; X32AVX-NEXT: vmovapd {{.*#+}} xmm0 = <4.2E+1,u>
-; X32AVX-NEXT: vmovhpd {{.*#+}} xmm0 = xmm0[0],mem[0]
+; X32AVX-NEXT: vmovaps {{.*#+}} xmm0 = <4.2E+1,u>
+; X32AVX-NEXT: vmovhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1]
; X32AVX-NEXT: retl
;
; X64AVX-LABEL: elt1_v2f64:
define <8 x double> @elt1_v8f64(double %x) {
; X32SSE-LABEL: elt1_v8f64:
; X32SSE: # %bb.0:
-; X32SSE-NEXT: movapd {{.*#+}} xmm0 = <4.2E+1,u>
-; X32SSE-NEXT: movhpd {{.*#+}} xmm0 = xmm0[0],mem[0]
+; X32SSE-NEXT: movaps {{.*#+}} xmm0 = <4.2E+1,u>
+; X32SSE-NEXT: movhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1]
; X32SSE-NEXT: movaps {{.*#+}} xmm1 = [2.0E+0,3.0E+0]
; X32SSE-NEXT: movaps {{.*#+}} xmm2 = [4.0E+0,5.0E+0]
; X32SSE-NEXT: movaps {{.*#+}} xmm3 = [6.0E+0,7.0E+0]
;
; X32AVX1-LABEL: elt1_v8f64:
; X32AVX1: # %bb.0:
-; X32AVX1-NEXT: vmovapd {{.*#+}} xmm0 = <4.2E+1,u,2.0E+0,3.0E+0>
-; X32AVX1-NEXT: vmovhpd {{.*#+}} xmm0 = xmm0[0],mem[0]
-; X32AVX1-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],mem[2,3]
+; X32AVX1-NEXT: vmovaps {{.*#+}} xmm0 = <4.2E+1,u,2.0E+0,3.0E+0>
+; X32AVX1-NEXT: vmovhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1]
+; X32AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],mem[4,5,6,7]
; X32AVX1-NEXT: vmovaps {{.*#+}} ymm1 = [4.0E+0,5.0E+0,6.0E+0,7.0E+0]
; X32AVX1-NEXT: retl
;
;
; X32AVX2-LABEL: elt1_v8f64:
; X32AVX2: # %bb.0:
-; X32AVX2-NEXT: vmovapd {{.*#+}} xmm0 = <4.2E+1,u,2.0E+0,3.0E+0>
-; X32AVX2-NEXT: vmovhpd {{.*#+}} xmm0 = xmm0[0],mem[0]
-; X32AVX2-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],mem[2,3]
+; X32AVX2-NEXT: vmovaps {{.*#+}} xmm0 = <4.2E+1,u,2.0E+0,3.0E+0>
+; X32AVX2-NEXT: vmovhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1]
+; X32AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],mem[4,5,6,7]
; X32AVX2-NEXT: vmovaps {{.*#+}} ymm1 = [4.0E+0,5.0E+0,6.0E+0,7.0E+0]
; X32AVX2-NEXT: retl
;
;
; X32AVX512F-LABEL: elt1_v8f64:
; X32AVX512F: # %bb.0:
-; X32AVX512F-NEXT: vmovapd {{.*#+}} xmm0 = <4.2E+1,u,2.0E+0,3.0E+0,4.0E+0,5.0E+0,6.0E+0,7.0E+0>
-; X32AVX512F-NEXT: vmovhpd {{.*#+}} xmm0 = xmm0[0],mem[0]
-; X32AVX512F-NEXT: vmovapd {{.*#+}} zmm1 = <4.2E+1,u,2.0E+0,3.0E+0,4.0E+0,5.0E+0,6.0E+0,7.0E+0>
+; X32AVX512F-NEXT: vmovaps {{.*#+}} xmm0 = <4.2E+1,u,2.0E+0,3.0E+0,4.0E+0,5.0E+0,6.0E+0,7.0E+0>
+; X32AVX512F-NEXT: vmovhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1]
+; X32AVX512F-NEXT: vmovaps {{.*#+}} zmm1 = <4.2E+1,u,2.0E+0,3.0E+0,4.0E+0,5.0E+0,6.0E+0,7.0E+0>
; X32AVX512F-NEXT: vinsertf32x4 $0, %xmm0, %zmm1, %zmm0
; X32AVX512F-NEXT: retl
;
define <2 x i64> @insert_v2i64_x1(<2 x i64> %a) {
; SSE2-LABEL: insert_v2i64_x1:
; SSE2: # %bb.0:
-; SSE2-NEXT: movlpd {{.*#+}} xmm0 = mem[0],xmm0[1]
+; SSE2-NEXT: movlps {{.*#+}} xmm0 = mem[0,1],xmm0[2,3]
; SSE2-NEXT: retq
;
; SSE3-LABEL: insert_v2i64_x1:
; SSE3: # %bb.0:
-; SSE3-NEXT: movlpd {{.*#+}} xmm0 = mem[0],xmm0[1]
+; SSE3-NEXT: movlps {{.*#+}} xmm0 = mem[0,1],xmm0[2,3]
; SSE3-NEXT: retq
;
; SSSE3-LABEL: insert_v2i64_x1:
; SSSE3: # %bb.0:
-; SSSE3-NEXT: movlpd {{.*#+}} xmm0 = mem[0],xmm0[1]
+; SSSE3-NEXT: movlps {{.*#+}} xmm0 = mem[0,1],xmm0[2,3]
; SSSE3-NEXT: retq
;
; SSE41-LABEL: insert_v2i64_x1:
define <4 x i64> @insert_v4i64_01x3(<4 x i64> %a) {
; SSE2-LABEL: insert_v4i64_01x3:
; SSE2: # %bb.0:
-; SSE2-NEXT: movlpd {{.*#+}} xmm1 = mem[0],xmm1[1]
+; SSE2-NEXT: movlps {{.*#+}} xmm1 = mem[0,1],xmm1[2,3]
; SSE2-NEXT: retq
;
; SSE3-LABEL: insert_v4i64_01x3:
; SSE3: # %bb.0:
-; SSE3-NEXT: movlpd {{.*#+}} xmm1 = mem[0],xmm1[1]
+; SSE3-NEXT: movlps {{.*#+}} xmm1 = mem[0,1],xmm1[2,3]
; SSE3-NEXT: retq
;
; SSSE3-LABEL: insert_v4i64_01x3:
; SSSE3: # %bb.0:
-; SSSE3-NEXT: movlpd {{.*#+}} xmm1 = mem[0],xmm1[1]
+; SSSE3-NEXT: movlps {{.*#+}} xmm1 = mem[0,1],xmm1[2,3]
; SSSE3-NEXT: retq
;
; SSE41-LABEL: insert_v4i64_01x3:
; SSE2-NEXT: testb $1, %al
; SSE2-NEXT: je LBB0_2
; SSE2-NEXT: ## %bb.1: ## %cond.store
-; SSE2-NEXT: movlpd %xmm0, (%rdi)
+; SSE2-NEXT: movlps %xmm0, (%rdi)
; SSE2-NEXT: addq $8, %rdi
; SSE2-NEXT: LBB0_2: ## %else
; SSE2-NEXT: shrl $16, %eax
; SSE2-NEXT: testb $1, %al
; SSE2-NEXT: je LBB0_4
; SSE2-NEXT: ## %bb.3: ## %cond.store1
-; SSE2-NEXT: movhpd %xmm0, (%rdi)
+; SSE2-NEXT: movhps %xmm0, (%rdi)
; SSE2-NEXT: addq $8, %rdi
; SSE2-NEXT: LBB0_4: ## %else2
; SSE2-NEXT: pextrw $2, %xmm4, %eax
; SSE2-NEXT: testb $1, %al
; SSE2-NEXT: je LBB0_6
; SSE2-NEXT: ## %bb.5: ## %cond.store4
-; SSE2-NEXT: movlpd %xmm1, (%rdi)
+; SSE2-NEXT: movlps %xmm1, (%rdi)
; SSE2-NEXT: addq $8, %rdi
; SSE2-NEXT: LBB0_6: ## %else5
; SSE2-NEXT: pextrw $3, %xmm4, %eax
; SSE2-NEXT: testb $1, %al
; SSE2-NEXT: je LBB0_8
; SSE2-NEXT: ## %bb.7: ## %cond.store7
-; SSE2-NEXT: movhpd %xmm1, (%rdi)
+; SSE2-NEXT: movhps %xmm1, (%rdi)
; SSE2-NEXT: addq $8, %rdi
; SSE2-NEXT: LBB0_8: ## %else8
; SSE2-NEXT: pextrw $4, %xmm4, %eax
; SSE2-NEXT: testb $1, %al
; SSE2-NEXT: je LBB0_10
; SSE2-NEXT: ## %bb.9: ## %cond.store10
-; SSE2-NEXT: movlpd %xmm2, (%rdi)
+; SSE2-NEXT: movlps %xmm2, (%rdi)
; SSE2-NEXT: addq $8, %rdi
; SSE2-NEXT: LBB0_10: ## %else11
; SSE2-NEXT: pextrw $5, %xmm4, %eax
; SSE2-NEXT: testb $1, %al
; SSE2-NEXT: je LBB0_12
; SSE2-NEXT: ## %bb.11: ## %cond.store13
-; SSE2-NEXT: movhpd %xmm2, (%rdi)
+; SSE2-NEXT: movhps %xmm2, (%rdi)
; SSE2-NEXT: addq $8, %rdi
; SSE2-NEXT: LBB0_12: ## %else14
; SSE2-NEXT: pextrw $6, %xmm4, %eax
; SSE2-NEXT: testb $1, %al
; SSE2-NEXT: je LBB0_14
; SSE2-NEXT: ## %bb.13: ## %cond.store16
-; SSE2-NEXT: movlpd %xmm3, (%rdi)
+; SSE2-NEXT: movlps %xmm3, (%rdi)
; SSE2-NEXT: addq $8, %rdi
; SSE2-NEXT: LBB0_14: ## %else17
; SSE2-NEXT: pextrw $7, %xmm4, %eax
; SSE2-NEXT: testb $1, %al
; SSE2-NEXT: je LBB0_16
; SSE2-NEXT: ## %bb.15: ## %cond.store19
-; SSE2-NEXT: movhpd %xmm3, (%rdi)
+; SSE2-NEXT: movhps %xmm3, (%rdi)
; SSE2-NEXT: LBB0_16: ## %else20
; SSE2-NEXT: retq
;
; SSE42-NEXT: testb $1, %al
; SSE42-NEXT: je LBB0_2
; SSE42-NEXT: ## %bb.1: ## %cond.store
-; SSE42-NEXT: movlpd %xmm0, (%rdi)
+; SSE42-NEXT: movlps %xmm0, (%rdi)
; SSE42-NEXT: addq $8, %rdi
; SSE42-NEXT: LBB0_2: ## %else
; SSE42-NEXT: pextrb $2, %xmm4, %eax
; SSE42-NEXT: testb $1, %al
; SSE42-NEXT: je LBB0_4
; SSE42-NEXT: ## %bb.3: ## %cond.store1
-; SSE42-NEXT: movhpd %xmm0, (%rdi)
+; SSE42-NEXT: movhps %xmm0, (%rdi)
; SSE42-NEXT: addq $8, %rdi
; SSE42-NEXT: LBB0_4: ## %else2
; SSE42-NEXT: pextrb $4, %xmm4, %eax
; SSE42-NEXT: testb $1, %al
; SSE42-NEXT: je LBB0_6
; SSE42-NEXT: ## %bb.5: ## %cond.store4
-; SSE42-NEXT: movlpd %xmm1, (%rdi)
+; SSE42-NEXT: movlps %xmm1, (%rdi)
; SSE42-NEXT: addq $8, %rdi
; SSE42-NEXT: LBB0_6: ## %else5
; SSE42-NEXT: pextrb $6, %xmm4, %eax
; SSE42-NEXT: testb $1, %al
; SSE42-NEXT: je LBB0_8
; SSE42-NEXT: ## %bb.7: ## %cond.store7
-; SSE42-NEXT: movhpd %xmm1, (%rdi)
+; SSE42-NEXT: movhps %xmm1, (%rdi)
; SSE42-NEXT: addq $8, %rdi
; SSE42-NEXT: LBB0_8: ## %else8
; SSE42-NEXT: pextrb $8, %xmm4, %eax
; SSE42-NEXT: testb $1, %al
; SSE42-NEXT: je LBB0_10
; SSE42-NEXT: ## %bb.9: ## %cond.store10
-; SSE42-NEXT: movlpd %xmm2, (%rdi)
+; SSE42-NEXT: movlps %xmm2, (%rdi)
; SSE42-NEXT: addq $8, %rdi
; SSE42-NEXT: LBB0_10: ## %else11
; SSE42-NEXT: pextrb $10, %xmm4, %eax
; SSE42-NEXT: testb $1, %al
; SSE42-NEXT: je LBB0_12
; SSE42-NEXT: ## %bb.11: ## %cond.store13
-; SSE42-NEXT: movhpd %xmm2, (%rdi)
+; SSE42-NEXT: movhps %xmm2, (%rdi)
; SSE42-NEXT: addq $8, %rdi
; SSE42-NEXT: LBB0_12: ## %else14
; SSE42-NEXT: pextrb $12, %xmm4, %eax
; SSE42-NEXT: testb $1, %al
; SSE42-NEXT: je LBB0_14
; SSE42-NEXT: ## %bb.13: ## %cond.store16
-; SSE42-NEXT: movlpd %xmm3, (%rdi)
+; SSE42-NEXT: movlps %xmm3, (%rdi)
; SSE42-NEXT: addq $8, %rdi
; SSE42-NEXT: LBB0_14: ## %else17
; SSE42-NEXT: pextrb $14, %xmm4, %eax
; SSE42-NEXT: testb $1, %al
; SSE42-NEXT: je LBB0_16
; SSE42-NEXT: ## %bb.15: ## %cond.store19
-; SSE42-NEXT: movhpd %xmm3, (%rdi)
+; SSE42-NEXT: movhps %xmm3, (%rdi)
; SSE42-NEXT: LBB0_16: ## %else20
; SSE42-NEXT: retq
;
; AVX1OR2-NEXT: testb $1, %al
; AVX1OR2-NEXT: je LBB0_2
; AVX1OR2-NEXT: ## %bb.1: ## %cond.store
-; AVX1OR2-NEXT: vmovlpd %xmm0, (%rdi)
+; AVX1OR2-NEXT: vmovlps %xmm0, (%rdi)
; AVX1OR2-NEXT: addq $8, %rdi
; AVX1OR2-NEXT: LBB0_2: ## %else
; AVX1OR2-NEXT: vpextrb $2, %xmm2, %eax
; AVX1OR2-NEXT: testb $1, %al
; AVX1OR2-NEXT: je LBB0_4
; AVX1OR2-NEXT: ## %bb.3: ## %cond.store1
-; AVX1OR2-NEXT: vmovhpd %xmm0, (%rdi)
+; AVX1OR2-NEXT: vmovhps %xmm0, (%rdi)
; AVX1OR2-NEXT: addq $8, %rdi
; AVX1OR2-NEXT: LBB0_4: ## %else2
; AVX1OR2-NEXT: vpextrb $4, %xmm2, %eax
; AVX1OR2-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1OR2-NEXT: je LBB0_6
; AVX1OR2-NEXT: ## %bb.5: ## %cond.store4
-; AVX1OR2-NEXT: vmovlpd %xmm0, (%rdi)
+; AVX1OR2-NEXT: vmovlps %xmm0, (%rdi)
; AVX1OR2-NEXT: addq $8, %rdi
; AVX1OR2-NEXT: LBB0_6: ## %else5
; AVX1OR2-NEXT: vpextrb $6, %xmm2, %eax
; AVX1OR2-NEXT: testb $1, %al
; AVX1OR2-NEXT: je LBB0_8
; AVX1OR2-NEXT: ## %bb.7: ## %cond.store7
-; AVX1OR2-NEXT: vmovhpd %xmm0, (%rdi)
+; AVX1OR2-NEXT: vmovhps %xmm0, (%rdi)
; AVX1OR2-NEXT: addq $8, %rdi
; AVX1OR2-NEXT: LBB0_8: ## %else8
; AVX1OR2-NEXT: vpextrb $8, %xmm2, %eax
; AVX1OR2-NEXT: testb $1, %al
; AVX1OR2-NEXT: je LBB0_10
; AVX1OR2-NEXT: ## %bb.9: ## %cond.store10
-; AVX1OR2-NEXT: vmovlpd %xmm1, (%rdi)
+; AVX1OR2-NEXT: vmovlps %xmm1, (%rdi)
; AVX1OR2-NEXT: addq $8, %rdi
; AVX1OR2-NEXT: LBB0_10: ## %else11
; AVX1OR2-NEXT: vpextrb $10, %xmm2, %eax
; AVX1OR2-NEXT: testb $1, %al
; AVX1OR2-NEXT: je LBB0_12
; AVX1OR2-NEXT: ## %bb.11: ## %cond.store13
-; AVX1OR2-NEXT: vmovhpd %xmm1, (%rdi)
+; AVX1OR2-NEXT: vmovhps %xmm1, (%rdi)
; AVX1OR2-NEXT: addq $8, %rdi
; AVX1OR2-NEXT: LBB0_12: ## %else14
; AVX1OR2-NEXT: vpextrb $12, %xmm2, %eax
; AVX1OR2-NEXT: vextractf128 $1, %ymm1, %xmm0
; AVX1OR2-NEXT: je LBB0_14
; AVX1OR2-NEXT: ## %bb.13: ## %cond.store16
-; AVX1OR2-NEXT: vmovlpd %xmm0, (%rdi)
+; AVX1OR2-NEXT: vmovlps %xmm0, (%rdi)
; AVX1OR2-NEXT: addq $8, %rdi
; AVX1OR2-NEXT: LBB0_14: ## %else17
; AVX1OR2-NEXT: vpextrb $14, %xmm2, %eax
; AVX1OR2-NEXT: testb $1, %al
; AVX1OR2-NEXT: je LBB0_16
; AVX1OR2-NEXT: ## %bb.15: ## %cond.store19
-; AVX1OR2-NEXT: vmovhpd %xmm0, (%rdi)
+; AVX1OR2-NEXT: vmovhps %xmm0, (%rdi)
; AVX1OR2-NEXT: LBB0_16: ## %else20
; AVX1OR2-NEXT: vzeroupper
; AVX1OR2-NEXT: retq
; SSE2-NEXT: testb $1, -{{[0-9]+}}(%rsp)
; SSE2-NEXT: je LBB1_2
; SSE2-NEXT: ## %bb.1: ## %cond.store
-; SSE2-NEXT: movlpd %xmm0, (%rdi)
+; SSE2-NEXT: movlps %xmm0, (%rdi)
; SSE2-NEXT: addq $8, %rdi
; SSE2-NEXT: LBB1_2: ## %else
; SSE2-NEXT: movd %xmm8, %eax
; SSE2-NEXT: testb $1, %cl
; SSE2-NEXT: je LBB1_4
; SSE2-NEXT: ## %bb.3: ## %cond.store1
-; SSE2-NEXT: movhpd %xmm0, (%rdi)
+; SSE2-NEXT: movhps %xmm0, (%rdi)
; SSE2-NEXT: addq $8, %rdi
; SSE2-NEXT: LBB1_4: ## %else2
; SSE2-NEXT: movl %eax, %ecx
; SSE2-NEXT: testb $1, %cl
; SSE2-NEXT: je LBB1_6
; SSE2-NEXT: ## %bb.5: ## %cond.store4
-; SSE2-NEXT: movlpd %xmm1, (%rdi)
+; SSE2-NEXT: movlps %xmm1, (%rdi)
; SSE2-NEXT: addq $8, %rdi
; SSE2-NEXT: LBB1_6: ## %else5
; SSE2-NEXT: shrl $24, %eax
; SSE2-NEXT: testb $1, %al
; SSE2-NEXT: je LBB1_8
; SSE2-NEXT: ## %bb.7: ## %cond.store7
-; SSE2-NEXT: movhpd %xmm1, (%rdi)
+; SSE2-NEXT: movhps %xmm1, (%rdi)
; SSE2-NEXT: addq $8, %rdi
; SSE2-NEXT: LBB1_8: ## %else8
; SSE2-NEXT: pextrw $2, %xmm8, %eax
; SSE2-NEXT: testb $1, %al
; SSE2-NEXT: je LBB1_10
; SSE2-NEXT: ## %bb.9: ## %cond.store10
-; SSE2-NEXT: movlpd %xmm2, (%rdi)
+; SSE2-NEXT: movlps %xmm2, (%rdi)
; SSE2-NEXT: addq $8, %rdi
; SSE2-NEXT: LBB1_10: ## %else11
; SSE2-NEXT: shrl $8, %eax
; SSE2-NEXT: testb $1, %al
; SSE2-NEXT: je LBB1_12
; SSE2-NEXT: ## %bb.11: ## %cond.store13
-; SSE2-NEXT: movhpd %xmm2, (%rdi)
+; SSE2-NEXT: movhps %xmm2, (%rdi)
; SSE2-NEXT: addq $8, %rdi
; SSE2-NEXT: LBB1_12: ## %else14
; SSE2-NEXT: pextrw $3, %xmm8, %eax
; SSE2-NEXT: testb $1, %al
; SSE2-NEXT: je LBB1_14
; SSE2-NEXT: ## %bb.13: ## %cond.store16
-; SSE2-NEXT: movlpd %xmm3, (%rdi)
+; SSE2-NEXT: movlps %xmm3, (%rdi)
; SSE2-NEXT: addq $8, %rdi
; SSE2-NEXT: LBB1_14: ## %else17
; SSE2-NEXT: shrl $8, %eax
; SSE2-NEXT: testb $1, %al
; SSE2-NEXT: je LBB1_16
; SSE2-NEXT: ## %bb.15: ## %cond.store19
-; SSE2-NEXT: movhpd %xmm3, (%rdi)
+; SSE2-NEXT: movhps %xmm3, (%rdi)
; SSE2-NEXT: addq $8, %rdi
; SSE2-NEXT: LBB1_16: ## %else20
; SSE2-NEXT: pextrw $4, %xmm8, %eax
; SSE2-NEXT: testb $1, %al
; SSE2-NEXT: je LBB1_18
; SSE2-NEXT: ## %bb.17: ## %cond.store22
-; SSE2-NEXT: movlpd %xmm4, (%rdi)
+; SSE2-NEXT: movlps %xmm4, (%rdi)
; SSE2-NEXT: addq $8, %rdi
; SSE2-NEXT: LBB1_18: ## %else23
; SSE2-NEXT: shrl $8, %eax
; SSE2-NEXT: testb $1, %al
; SSE2-NEXT: je LBB1_20
; SSE2-NEXT: ## %bb.19: ## %cond.store25
-; SSE2-NEXT: movhpd %xmm4, (%rdi)
+; SSE2-NEXT: movhps %xmm4, (%rdi)
; SSE2-NEXT: addq $8, %rdi
; SSE2-NEXT: LBB1_20: ## %else26
; SSE2-NEXT: pextrw $5, %xmm8, %eax
; SSE2-NEXT: testb $1, %al
; SSE2-NEXT: je LBB1_22
; SSE2-NEXT: ## %bb.21: ## %cond.store28
-; SSE2-NEXT: movlpd %xmm5, (%rdi)
+; SSE2-NEXT: movlps %xmm5, (%rdi)
; SSE2-NEXT: addq $8, %rdi
; SSE2-NEXT: LBB1_22: ## %else29
; SSE2-NEXT: shrl $8, %eax
; SSE2-NEXT: testb $1, %al
; SSE2-NEXT: je LBB1_24
; SSE2-NEXT: ## %bb.23: ## %cond.store31
-; SSE2-NEXT: movhpd %xmm5, (%rdi)
+; SSE2-NEXT: movhps %xmm5, (%rdi)
; SSE2-NEXT: addq $8, %rdi
; SSE2-NEXT: LBB1_24: ## %else32
; SSE2-NEXT: pextrw $6, %xmm8, %eax
; SSE2-NEXT: testb $1, %al
; SSE2-NEXT: je LBB1_26
; SSE2-NEXT: ## %bb.25: ## %cond.store34
-; SSE2-NEXT: movlpd %xmm6, (%rdi)
+; SSE2-NEXT: movlps %xmm6, (%rdi)
; SSE2-NEXT: addq $8, %rdi
; SSE2-NEXT: LBB1_26: ## %else35
; SSE2-NEXT: shrl $8, %eax
; SSE2-NEXT: testb $1, %al
; SSE2-NEXT: je LBB1_28
; SSE2-NEXT: ## %bb.27: ## %cond.store37
-; SSE2-NEXT: movhpd %xmm6, (%rdi)
+; SSE2-NEXT: movhps %xmm6, (%rdi)
; SSE2-NEXT: addq $8, %rdi
; SSE2-NEXT: LBB1_28: ## %else38
; SSE2-NEXT: pextrw $7, %xmm8, %eax
; SSE2-NEXT: testb $1, %al
; SSE2-NEXT: je LBB1_30
; SSE2-NEXT: ## %bb.29: ## %cond.store40
-; SSE2-NEXT: movlpd %xmm7, (%rdi)
+; SSE2-NEXT: movlps %xmm7, (%rdi)
; SSE2-NEXT: addq $8, %rdi
; SSE2-NEXT: LBB1_30: ## %else41
; SSE2-NEXT: shrl $8, %eax
; SSE2-NEXT: testb $1, %al
; SSE2-NEXT: je LBB1_32
; SSE2-NEXT: ## %bb.31: ## %cond.store43
-; SSE2-NEXT: movhpd %xmm7, (%rdi)
+; SSE2-NEXT: movhps %xmm7, (%rdi)
; SSE2-NEXT: LBB1_32: ## %else44
; SSE2-NEXT: retq
;
; SSE42-NEXT: testb $1, %al
; SSE42-NEXT: je LBB1_2
; SSE42-NEXT: ## %bb.1: ## %cond.store
-; SSE42-NEXT: movlpd %xmm0, (%rdi)
+; SSE42-NEXT: movlps %xmm0, (%rdi)
; SSE42-NEXT: addq $8, %rdi
; SSE42-NEXT: LBB1_2: ## %else
; SSE42-NEXT: pextrb $1, %xmm8, %eax
; SSE42-NEXT: testb $1, %al
; SSE42-NEXT: je LBB1_4
; SSE42-NEXT: ## %bb.3: ## %cond.store1
-; SSE42-NEXT: movhpd %xmm0, (%rdi)
+; SSE42-NEXT: movhps %xmm0, (%rdi)
; SSE42-NEXT: addq $8, %rdi
; SSE42-NEXT: LBB1_4: ## %else2
; SSE42-NEXT: pextrb $2, %xmm8, %eax
; SSE42-NEXT: testb $1, %al
; SSE42-NEXT: je LBB1_6
; SSE42-NEXT: ## %bb.5: ## %cond.store4
-; SSE42-NEXT: movlpd %xmm1, (%rdi)
+; SSE42-NEXT: movlps %xmm1, (%rdi)
; SSE42-NEXT: addq $8, %rdi
; SSE42-NEXT: LBB1_6: ## %else5
; SSE42-NEXT: pextrb $3, %xmm8, %eax
; SSE42-NEXT: testb $1, %al
; SSE42-NEXT: je LBB1_8
; SSE42-NEXT: ## %bb.7: ## %cond.store7
-; SSE42-NEXT: movhpd %xmm1, (%rdi)
+; SSE42-NEXT: movhps %xmm1, (%rdi)
; SSE42-NEXT: addq $8, %rdi
; SSE42-NEXT: LBB1_8: ## %else8
; SSE42-NEXT: pextrb $4, %xmm8, %eax
; SSE42-NEXT: testb $1, %al
; SSE42-NEXT: je LBB1_10
; SSE42-NEXT: ## %bb.9: ## %cond.store10
-; SSE42-NEXT: movlpd %xmm2, (%rdi)
+; SSE42-NEXT: movlps %xmm2, (%rdi)
; SSE42-NEXT: addq $8, %rdi
; SSE42-NEXT: LBB1_10: ## %else11
; SSE42-NEXT: pextrb $5, %xmm8, %eax
; SSE42-NEXT: testb $1, %al
; SSE42-NEXT: je LBB1_12
; SSE42-NEXT: ## %bb.11: ## %cond.store13
-; SSE42-NEXT: movhpd %xmm2, (%rdi)
+; SSE42-NEXT: movhps %xmm2, (%rdi)
; SSE42-NEXT: addq $8, %rdi
; SSE42-NEXT: LBB1_12: ## %else14
; SSE42-NEXT: pextrb $6, %xmm8, %eax
; SSE42-NEXT: testb $1, %al
; SSE42-NEXT: je LBB1_14
; SSE42-NEXT: ## %bb.13: ## %cond.store16
-; SSE42-NEXT: movlpd %xmm3, (%rdi)
+; SSE42-NEXT: movlps %xmm3, (%rdi)
; SSE42-NEXT: addq $8, %rdi
; SSE42-NEXT: LBB1_14: ## %else17
; SSE42-NEXT: pextrb $7, %xmm8, %eax
; SSE42-NEXT: testb $1, %al
; SSE42-NEXT: je LBB1_16
; SSE42-NEXT: ## %bb.15: ## %cond.store19
-; SSE42-NEXT: movhpd %xmm3, (%rdi)
+; SSE42-NEXT: movhps %xmm3, (%rdi)
; SSE42-NEXT: addq $8, %rdi
; SSE42-NEXT: LBB1_16: ## %else20
; SSE42-NEXT: pextrb $8, %xmm8, %eax
; SSE42-NEXT: testb $1, %al
; SSE42-NEXT: je LBB1_18
; SSE42-NEXT: ## %bb.17: ## %cond.store22
-; SSE42-NEXT: movlpd %xmm4, (%rdi)
+; SSE42-NEXT: movlps %xmm4, (%rdi)
; SSE42-NEXT: addq $8, %rdi
; SSE42-NEXT: LBB1_18: ## %else23
; SSE42-NEXT: pextrb $9, %xmm8, %eax
; SSE42-NEXT: testb $1, %al
; SSE42-NEXT: je LBB1_20
; SSE42-NEXT: ## %bb.19: ## %cond.store25
-; SSE42-NEXT: movhpd %xmm4, (%rdi)
+; SSE42-NEXT: movhps %xmm4, (%rdi)
; SSE42-NEXT: addq $8, %rdi
; SSE42-NEXT: LBB1_20: ## %else26
; SSE42-NEXT: pextrb $10, %xmm8, %eax
; SSE42-NEXT: testb $1, %al
; SSE42-NEXT: je LBB1_22
; SSE42-NEXT: ## %bb.21: ## %cond.store28
-; SSE42-NEXT: movlpd %xmm5, (%rdi)
+; SSE42-NEXT: movlps %xmm5, (%rdi)
; SSE42-NEXT: addq $8, %rdi
; SSE42-NEXT: LBB1_22: ## %else29
; SSE42-NEXT: pextrb $11, %xmm8, %eax
; SSE42-NEXT: testb $1, %al
; SSE42-NEXT: je LBB1_24
; SSE42-NEXT: ## %bb.23: ## %cond.store31
-; SSE42-NEXT: movhpd %xmm5, (%rdi)
+; SSE42-NEXT: movhps %xmm5, (%rdi)
; SSE42-NEXT: addq $8, %rdi
; SSE42-NEXT: LBB1_24: ## %else32
; SSE42-NEXT: pextrb $12, %xmm8, %eax
; SSE42-NEXT: testb $1, %al
; SSE42-NEXT: je LBB1_26
; SSE42-NEXT: ## %bb.25: ## %cond.store34
-; SSE42-NEXT: movlpd %xmm6, (%rdi)
+; SSE42-NEXT: movlps %xmm6, (%rdi)
; SSE42-NEXT: addq $8, %rdi
; SSE42-NEXT: LBB1_26: ## %else35
; SSE42-NEXT: pextrb $13, %xmm8, %eax
; SSE42-NEXT: testb $1, %al
; SSE42-NEXT: je LBB1_28
; SSE42-NEXT: ## %bb.27: ## %cond.store37
-; SSE42-NEXT: movhpd %xmm6, (%rdi)
+; SSE42-NEXT: movhps %xmm6, (%rdi)
; SSE42-NEXT: addq $8, %rdi
; SSE42-NEXT: LBB1_28: ## %else38
; SSE42-NEXT: pextrb $14, %xmm8, %eax
; SSE42-NEXT: testb $1, %al
; SSE42-NEXT: je LBB1_30
; SSE42-NEXT: ## %bb.29: ## %cond.store40
-; SSE42-NEXT: movlpd %xmm7, (%rdi)
+; SSE42-NEXT: movlps %xmm7, (%rdi)
; SSE42-NEXT: addq $8, %rdi
; SSE42-NEXT: LBB1_30: ## %else41
; SSE42-NEXT: pextrb $15, %xmm8, %eax
; SSE42-NEXT: testb $1, %al
; SSE42-NEXT: je LBB1_32
; SSE42-NEXT: ## %bb.31: ## %cond.store43
-; SSE42-NEXT: movhpd %xmm7, (%rdi)
+; SSE42-NEXT: movhps %xmm7, (%rdi)
; SSE42-NEXT: LBB1_32: ## %else44
; SSE42-NEXT: retq
;
; AVX1OR2-NEXT: testb $1, %al
; AVX1OR2-NEXT: je LBB1_2
; AVX1OR2-NEXT: ## %bb.1: ## %cond.store
-; AVX1OR2-NEXT: vmovlpd %xmm0, (%rdi)
+; AVX1OR2-NEXT: vmovlps %xmm0, (%rdi)
; AVX1OR2-NEXT: addq $8, %rdi
; AVX1OR2-NEXT: LBB1_2: ## %else
; AVX1OR2-NEXT: vpextrb $1, %xmm4, %eax
; AVX1OR2-NEXT: testb $1, %al
; AVX1OR2-NEXT: je LBB1_4
; AVX1OR2-NEXT: ## %bb.3: ## %cond.store1
-; AVX1OR2-NEXT: vmovhpd %xmm0, (%rdi)
+; AVX1OR2-NEXT: vmovhps %xmm0, (%rdi)
; AVX1OR2-NEXT: addq $8, %rdi
; AVX1OR2-NEXT: LBB1_4: ## %else2
; AVX1OR2-NEXT: vpextrb $2, %xmm4, %eax
; AVX1OR2-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1OR2-NEXT: je LBB1_6
; AVX1OR2-NEXT: ## %bb.5: ## %cond.store4
-; AVX1OR2-NEXT: vmovlpd %xmm0, (%rdi)
+; AVX1OR2-NEXT: vmovlps %xmm0, (%rdi)
; AVX1OR2-NEXT: addq $8, %rdi
; AVX1OR2-NEXT: LBB1_6: ## %else5
; AVX1OR2-NEXT: vpextrb $3, %xmm4, %eax
; AVX1OR2-NEXT: testb $1, %al
; AVX1OR2-NEXT: je LBB1_8
; AVX1OR2-NEXT: ## %bb.7: ## %cond.store7
-; AVX1OR2-NEXT: vmovhpd %xmm0, (%rdi)
+; AVX1OR2-NEXT: vmovhps %xmm0, (%rdi)
; AVX1OR2-NEXT: addq $8, %rdi
; AVX1OR2-NEXT: LBB1_8: ## %else8
; AVX1OR2-NEXT: vpextrb $4, %xmm4, %eax
; AVX1OR2-NEXT: testb $1, %al
; AVX1OR2-NEXT: je LBB1_10
; AVX1OR2-NEXT: ## %bb.9: ## %cond.store10
-; AVX1OR2-NEXT: vmovlpd %xmm1, (%rdi)
+; AVX1OR2-NEXT: vmovlps %xmm1, (%rdi)
; AVX1OR2-NEXT: addq $8, %rdi
; AVX1OR2-NEXT: LBB1_10: ## %else11
; AVX1OR2-NEXT: vpextrb $5, %xmm4, %eax
; AVX1OR2-NEXT: testb $1, %al
; AVX1OR2-NEXT: je LBB1_12
; AVX1OR2-NEXT: ## %bb.11: ## %cond.store13
-; AVX1OR2-NEXT: vmovhpd %xmm1, (%rdi)
+; AVX1OR2-NEXT: vmovhps %xmm1, (%rdi)
; AVX1OR2-NEXT: addq $8, %rdi
; AVX1OR2-NEXT: LBB1_12: ## %else14
; AVX1OR2-NEXT: vpextrb $6, %xmm4, %eax
; AVX1OR2-NEXT: vextractf128 $1, %ymm1, %xmm0
; AVX1OR2-NEXT: je LBB1_14
; AVX1OR2-NEXT: ## %bb.13: ## %cond.store16
-; AVX1OR2-NEXT: vmovlpd %xmm0, (%rdi)
+; AVX1OR2-NEXT: vmovlps %xmm0, (%rdi)
; AVX1OR2-NEXT: addq $8, %rdi
; AVX1OR2-NEXT: LBB1_14: ## %else17
; AVX1OR2-NEXT: vpextrb $7, %xmm4, %eax
; AVX1OR2-NEXT: testb $1, %al
; AVX1OR2-NEXT: je LBB1_16
; AVX1OR2-NEXT: ## %bb.15: ## %cond.store19
-; AVX1OR2-NEXT: vmovhpd %xmm0, (%rdi)
+; AVX1OR2-NEXT: vmovhps %xmm0, (%rdi)
; AVX1OR2-NEXT: addq $8, %rdi
; AVX1OR2-NEXT: LBB1_16: ## %else20
; AVX1OR2-NEXT: vpextrb $8, %xmm4, %eax
; AVX1OR2-NEXT: testb $1, %al
; AVX1OR2-NEXT: je LBB1_18
; AVX1OR2-NEXT: ## %bb.17: ## %cond.store22
-; AVX1OR2-NEXT: vmovlpd %xmm2, (%rdi)
+; AVX1OR2-NEXT: vmovlps %xmm2, (%rdi)
; AVX1OR2-NEXT: addq $8, %rdi
; AVX1OR2-NEXT: LBB1_18: ## %else23
; AVX1OR2-NEXT: vpextrb $9, %xmm4, %eax
; AVX1OR2-NEXT: testb $1, %al
; AVX1OR2-NEXT: je LBB1_20
; AVX1OR2-NEXT: ## %bb.19: ## %cond.store25
-; AVX1OR2-NEXT: vmovhpd %xmm2, (%rdi)
+; AVX1OR2-NEXT: vmovhps %xmm2, (%rdi)
; AVX1OR2-NEXT: addq $8, %rdi
; AVX1OR2-NEXT: LBB1_20: ## %else26
; AVX1OR2-NEXT: vpextrb $10, %xmm4, %eax
; AVX1OR2-NEXT: vextractf128 $1, %ymm2, %xmm0
; AVX1OR2-NEXT: je LBB1_22
; AVX1OR2-NEXT: ## %bb.21: ## %cond.store28
-; AVX1OR2-NEXT: vmovlpd %xmm0, (%rdi)
+; AVX1OR2-NEXT: vmovlps %xmm0, (%rdi)
; AVX1OR2-NEXT: addq $8, %rdi
; AVX1OR2-NEXT: LBB1_22: ## %else29
; AVX1OR2-NEXT: vpextrb $11, %xmm4, %eax
; AVX1OR2-NEXT: testb $1, %al
; AVX1OR2-NEXT: je LBB1_24
; AVX1OR2-NEXT: ## %bb.23: ## %cond.store31
-; AVX1OR2-NEXT: vmovhpd %xmm0, (%rdi)
+; AVX1OR2-NEXT: vmovhps %xmm0, (%rdi)
; AVX1OR2-NEXT: addq $8, %rdi
; AVX1OR2-NEXT: LBB1_24: ## %else32
; AVX1OR2-NEXT: vpextrb $12, %xmm4, %eax
; AVX1OR2-NEXT: testb $1, %al
; AVX1OR2-NEXT: je LBB1_26
; AVX1OR2-NEXT: ## %bb.25: ## %cond.store34
-; AVX1OR2-NEXT: vmovlpd %xmm3, (%rdi)
+; AVX1OR2-NEXT: vmovlps %xmm3, (%rdi)
; AVX1OR2-NEXT: addq $8, %rdi
; AVX1OR2-NEXT: LBB1_26: ## %else35
; AVX1OR2-NEXT: vpextrb $13, %xmm4, %eax
; AVX1OR2-NEXT: testb $1, %al
; AVX1OR2-NEXT: je LBB1_28
; AVX1OR2-NEXT: ## %bb.27: ## %cond.store37
-; AVX1OR2-NEXT: vmovhpd %xmm3, (%rdi)
+; AVX1OR2-NEXT: vmovhps %xmm3, (%rdi)
; AVX1OR2-NEXT: addq $8, %rdi
; AVX1OR2-NEXT: LBB1_28: ## %else38
; AVX1OR2-NEXT: vpextrb $14, %xmm4, %eax
; AVX1OR2-NEXT: vextractf128 $1, %ymm3, %xmm0
; AVX1OR2-NEXT: je LBB1_30
; AVX1OR2-NEXT: ## %bb.29: ## %cond.store40
-; AVX1OR2-NEXT: vmovlpd %xmm0, (%rdi)
+; AVX1OR2-NEXT: vmovlps %xmm0, (%rdi)
; AVX1OR2-NEXT: addq $8, %rdi
; AVX1OR2-NEXT: LBB1_30: ## %else41
; AVX1OR2-NEXT: vpextrb $15, %xmm4, %eax
; AVX1OR2-NEXT: testb $1, %al
; AVX1OR2-NEXT: je LBB1_32
; AVX1OR2-NEXT: ## %bb.31: ## %cond.store43
-; AVX1OR2-NEXT: vmovhpd %xmm0, (%rdi)
+; AVX1OR2-NEXT: vmovhps %xmm0, (%rdi)
; AVX1OR2-NEXT: LBB1_32: ## %else44
; AVX1OR2-NEXT: vzeroupper
; AVX1OR2-NEXT: retq
; SSE2-NEXT: testb $1, %al
; SSE2-NEXT: je LBB0_2
; SSE2-NEXT: ## %bb.1: ## %cond.load
-; SSE2-NEXT: movlpd {{.*#+}} xmm0 = mem[0],xmm0[1]
+; SSE2-NEXT: movlps {{.*#+}} xmm0 = mem[0,1],xmm0[2,3]
; SSE2-NEXT: addq $8, %rdi
; SSE2-NEXT: LBB0_2: ## %else
; SSE2-NEXT: pextrw $4, %xmm1, %eax
; SSE2-NEXT: testb $1, %al
; SSE2-NEXT: je LBB0_4
; SSE2-NEXT: ## %bb.3: ## %cond.load1
-; SSE2-NEXT: movhpd {{.*#+}} xmm0 = xmm0[0],mem[0]
+; SSE2-NEXT: movhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1]
; SSE2-NEXT: LBB0_4: ## %else2
; SSE2-NEXT: retq
;
; SSE42-NEXT: testb $1, %al
; SSE42-NEXT: je LBB0_2
; SSE42-NEXT: ## %bb.1: ## %cond.load
-; SSE42-NEXT: movlpd {{.*#+}} xmm0 = mem[0],xmm0[1]
+; SSE42-NEXT: movlps {{.*#+}} xmm0 = mem[0,1],xmm0[2,3]
; SSE42-NEXT: addq $8, %rdi
; SSE42-NEXT: LBB0_2: ## %else
; SSE42-NEXT: pextrb $8, %xmm2, %eax
; SSE42-NEXT: testb $1, %al
; SSE42-NEXT: je LBB0_4
; SSE42-NEXT: ## %bb.3: ## %cond.load1
-; SSE42-NEXT: movhpd {{.*#+}} xmm0 = xmm0[0],mem[0]
+; SSE42-NEXT: movhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1]
; SSE42-NEXT: LBB0_4: ## %else2
; SSE42-NEXT: retq
;
; AVX1OR2-NEXT: testb $1, %al
; AVX1OR2-NEXT: je LBB0_2
; AVX1OR2-NEXT: ## %bb.1: ## %cond.load
-; AVX1OR2-NEXT: vmovlpd {{.*#+}} xmm0 = mem[0],xmm0[1]
+; AVX1OR2-NEXT: vmovlps {{.*#+}} xmm0 = mem[0,1],xmm0[2,3]
; AVX1OR2-NEXT: addq $8, %rdi
; AVX1OR2-NEXT: LBB0_2: ## %else
; AVX1OR2-NEXT: vpextrb $8, %xmm1, %eax
; AVX1OR2-NEXT: testb $1, %al
; AVX1OR2-NEXT: je LBB0_4
; AVX1OR2-NEXT: ## %bb.3: ## %cond.load1
-; AVX1OR2-NEXT: vmovhpd {{.*#+}} xmm0 = xmm0[0],mem[0]
+; AVX1OR2-NEXT: vmovhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1]
; AVX1OR2-NEXT: LBB0_4: ## %else2
; AVX1OR2-NEXT: retq
;
; SSE2-NEXT: testb $1, %al
; SSE2-NEXT: je LBB1_2
; SSE2-NEXT: ## %bb.1: ## %cond.load
-; SSE2-NEXT: movlpd {{.*#+}} xmm0 = mem[0],xmm0[1]
+; SSE2-NEXT: movlps {{.*#+}} xmm0 = mem[0,1],xmm0[2,3]
; SSE2-NEXT: addq $8, %rdi
; SSE2-NEXT: LBB1_2: ## %else
; SSE2-NEXT: pextrw $4, %xmm2, %eax
; SSE2-NEXT: testb $1, %al
; SSE2-NEXT: je LBB1_4
; SSE2-NEXT: ## %bb.3: ## %cond.load1
-; SSE2-NEXT: movhpd {{.*#+}} xmm0 = xmm0[0],mem[0]
+; SSE2-NEXT: movhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1]
; SSE2-NEXT: addq $8, %rdi
; SSE2-NEXT: LBB1_4: ## %else2
; SSE2-NEXT: pxor %xmm2, %xmm2
; SSE2-NEXT: testb $1, %al
; SSE2-NEXT: je LBB1_6
; SSE2-NEXT: ## %bb.5: ## %cond.load5
-; SSE2-NEXT: movlpd {{.*#+}} xmm1 = mem[0],xmm1[1]
+; SSE2-NEXT: movlps {{.*#+}} xmm1 = mem[0,1],xmm1[2,3]
; SSE2-NEXT: addq $8, %rdi
; SSE2-NEXT: LBB1_6: ## %else6
; SSE2-NEXT: pextrw $4, %xmm2, %eax
; SSE2-NEXT: testb $1, %al
; SSE2-NEXT: je LBB1_8
; SSE2-NEXT: ## %bb.7: ## %cond.load9
-; SSE2-NEXT: movhpd {{.*#+}} xmm1 = xmm1[0],mem[0]
+; SSE2-NEXT: movhps {{.*#+}} xmm1 = xmm1[0,1],mem[0,1]
; SSE2-NEXT: LBB1_8: ## %else10
; SSE2-NEXT: retq
;
; SSE42-NEXT: testb $1, %al
; SSE42-NEXT: je LBB1_2
; SSE42-NEXT: ## %bb.1: ## %cond.load
-; SSE42-NEXT: movlpd {{.*#+}} xmm0 = mem[0],xmm0[1]
+; SSE42-NEXT: movlps {{.*#+}} xmm0 = mem[0,1],xmm0[2,3]
; SSE42-NEXT: addq $8, %rdi
; SSE42-NEXT: LBB1_2: ## %else
; SSE42-NEXT: pextrb $8, %xmm4, %eax
; SSE42-NEXT: testb $1, %al
; SSE42-NEXT: je LBB1_4
; SSE42-NEXT: ## %bb.3: ## %cond.load1
-; SSE42-NEXT: movhpd {{.*#+}} xmm0 = xmm0[0],mem[0]
+; SSE42-NEXT: movhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1]
; SSE42-NEXT: addq $8, %rdi
; SSE42-NEXT: LBB1_4: ## %else2
; SSE42-NEXT: pxor %xmm2, %xmm2
; SSE42-NEXT: testb $1, %al
; SSE42-NEXT: je LBB1_6
; SSE42-NEXT: ## %bb.5: ## %cond.load5
-; SSE42-NEXT: movlpd {{.*#+}} xmm1 = mem[0],xmm1[1]
+; SSE42-NEXT: movlps {{.*#+}} xmm1 = mem[0,1],xmm1[2,3]
; SSE42-NEXT: addq $8, %rdi
; SSE42-NEXT: LBB1_6: ## %else6
; SSE42-NEXT: pextrb $8, %xmm3, %eax
; SSE42-NEXT: testb $1, %al
; SSE42-NEXT: je LBB1_8
; SSE42-NEXT: ## %bb.7: ## %cond.load9
-; SSE42-NEXT: movhpd {{.*#+}} xmm1 = xmm1[0],mem[0]
+; SSE42-NEXT: movhps {{.*#+}} xmm1 = xmm1[0,1],mem[0,1]
; SSE42-NEXT: LBB1_8: ## %else10
; SSE42-NEXT: retq
;
; AVX1-NEXT: testb $1, %al
; AVX1-NEXT: je LBB1_4
; AVX1-NEXT: ## %bb.3: ## %cond.load1
-; AVX1-NEXT: vmovhpd {{.*#+}} xmm2 = xmm0[0],mem[0]
-; AVX1-NEXT: vblendpd {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3]
+; AVX1-NEXT: vmovhps {{.*#+}} xmm2 = xmm0[0,1],mem[0,1]
+; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
; AVX1-NEXT: addq $8, %rdi
; AVX1-NEXT: LBB1_4: ## %else2
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
-; AVX1-NEXT: vxorpd %xmm2, %xmm2, %xmm2
+; AVX1-NEXT: vxorps %xmm2, %xmm2, %xmm2
; AVX1-NEXT: vpcmpeqq %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpextrb $0, %xmm1, %eax
; AVX1-NEXT: testb $1, %al
; AVX1-NEXT: je LBB1_6
; AVX1-NEXT: ## %bb.5: ## %cond.load5
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
-; AVX1-NEXT: vmovlpd {{.*#+}} xmm2 = mem[0],xmm2[1]
+; AVX1-NEXT: vmovlps {{.*#+}} xmm2 = mem[0,1],xmm2[2,3]
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT: addq $8, %rdi
; AVX1-NEXT: LBB1_6: ## %else6
; AVX1-NEXT: je LBB1_8
; AVX1-NEXT: ## %bb.7: ## %cond.load9
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT: vmovhpd {{.*#+}} xmm1 = xmm1[0],mem[0]
+; AVX1-NEXT: vmovhps {{.*#+}} xmm1 = xmm1[0,1],mem[0,1]
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: LBB1_8: ## %else10
; AVX1-NEXT: retq
; AVX2-NEXT: testb $1, %al
; AVX2-NEXT: je LBB1_4
; AVX2-NEXT: ## %bb.3: ## %cond.load1
-; AVX2-NEXT: vmovhpd {{.*#+}} xmm2 = xmm0[0],mem[0]
-; AVX2-NEXT: vblendpd {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3]
+; AVX2-NEXT: vmovhps {{.*#+}} xmm2 = xmm0[0,1],mem[0,1]
+; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
; AVX2-NEXT: addq $8, %rdi
; AVX2-NEXT: LBB1_4: ## %else2
; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm1
-; AVX2-NEXT: vxorpd %xmm2, %xmm2, %xmm2
+; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX2-NEXT: vpcmpeqq %xmm2, %xmm1, %xmm1
; AVX2-NEXT: vpextrb $0, %xmm1, %eax
; AVX2-NEXT: testb $1, %al
; AVX2-NEXT: je LBB1_6
; AVX2-NEXT: ## %bb.5: ## %cond.load5
-; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm2
-; AVX2-NEXT: vmovlpd {{.*#+}} xmm2 = mem[0],xmm2[1]
-; AVX2-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2
+; AVX2-NEXT: vmovlps {{.*#+}} xmm2 = mem[0,1],xmm2[2,3]
+; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
; AVX2-NEXT: addq $8, %rdi
; AVX2-NEXT: LBB1_6: ## %else6
; AVX2-NEXT: vpextrb $8, %xmm1, %eax
; AVX2-NEXT: testb $1, %al
; AVX2-NEXT: je LBB1_8
; AVX2-NEXT: ## %bb.7: ## %cond.load9
-; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vmovhpd {{.*#+}} xmm1 = xmm1[0],mem[0]
-; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT: vmovhps {{.*#+}} xmm1 = xmm1[0,1],mem[0,1]
+; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX2-NEXT: LBB1_8: ## %else10
; AVX2-NEXT: retq
;
; SSE2-NEXT: testb $1, %al
; SSE2-NEXT: je LBB2_2
; SSE2-NEXT: ## %bb.1: ## %cond.load
-; SSE2-NEXT: movlpd {{.*#+}} xmm0 = mem[0],xmm0[1]
+; SSE2-NEXT: movlps {{.*#+}} xmm0 = mem[0,1],xmm0[2,3]
; SSE2-NEXT: addq $8, %rdi
; SSE2-NEXT: LBB2_2: ## %else
; SSE2-NEXT: shrl $16, %eax
; SSE2-NEXT: testb $1, %al
; SSE2-NEXT: je LBB2_4
; SSE2-NEXT: ## %bb.3: ## %cond.load1
-; SSE2-NEXT: movhpd {{.*#+}} xmm0 = xmm0[0],mem[0]
+; SSE2-NEXT: movhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1]
; SSE2-NEXT: addq $8, %rdi
; SSE2-NEXT: LBB2_4: ## %else2
; SSE2-NEXT: pextrw $2, %xmm4, %eax
; SSE2-NEXT: testb $1, %al
; SSE2-NEXT: je LBB2_6
; SSE2-NEXT: ## %bb.5: ## %cond.load5
-; SSE2-NEXT: movlpd {{.*#+}} xmm1 = mem[0],xmm1[1]
+; SSE2-NEXT: movlps {{.*#+}} xmm1 = mem[0,1],xmm1[2,3]
; SSE2-NEXT: addq $8, %rdi
; SSE2-NEXT: LBB2_6: ## %else6
; SSE2-NEXT: pextrw $3, %xmm4, %eax
; SSE2-NEXT: testb $1, %al
; SSE2-NEXT: je LBB2_8
; SSE2-NEXT: ## %bb.7: ## %cond.load9
-; SSE2-NEXT: movhpd {{.*#+}} xmm1 = xmm1[0],mem[0]
+; SSE2-NEXT: movhps {{.*#+}} xmm1 = xmm1[0,1],mem[0,1]
; SSE2-NEXT: addq $8, %rdi
; SSE2-NEXT: LBB2_8: ## %else10
; SSE2-NEXT: pextrw $4, %xmm4, %eax
; SSE2-NEXT: testb $1, %al
; SSE2-NEXT: je LBB2_10
; SSE2-NEXT: ## %bb.9: ## %cond.load13
-; SSE2-NEXT: movlpd {{.*#+}} xmm2 = mem[0],xmm2[1]
+; SSE2-NEXT: movlps {{.*#+}} xmm2 = mem[0,1],xmm2[2,3]
; SSE2-NEXT: addq $8, %rdi
; SSE2-NEXT: LBB2_10: ## %else14
; SSE2-NEXT: pextrw $5, %xmm4, %eax
; SSE2-NEXT: testb $1, %al
; SSE2-NEXT: je LBB2_12
; SSE2-NEXT: ## %bb.11: ## %cond.load17
-; SSE2-NEXT: movhpd {{.*#+}} xmm2 = xmm2[0],mem[0]
+; SSE2-NEXT: movhps {{.*#+}} xmm2 = xmm2[0,1],mem[0,1]
; SSE2-NEXT: addq $8, %rdi
; SSE2-NEXT: LBB2_12: ## %else18
; SSE2-NEXT: pextrw $6, %xmm4, %eax
; SSE2-NEXT: testb $1, %al
; SSE2-NEXT: je LBB2_14
; SSE2-NEXT: ## %bb.13: ## %cond.load21
-; SSE2-NEXT: movlpd {{.*#+}} xmm3 = mem[0],xmm3[1]
+; SSE2-NEXT: movlps {{.*#+}} xmm3 = mem[0,1],xmm3[2,3]
; SSE2-NEXT: addq $8, %rdi
; SSE2-NEXT: LBB2_14: ## %else22
; SSE2-NEXT: pextrw $7, %xmm4, %eax
; SSE2-NEXT: testb $1, %al
; SSE2-NEXT: je LBB2_16
; SSE2-NEXT: ## %bb.15: ## %cond.load25
-; SSE2-NEXT: movhpd {{.*#+}} xmm3 = xmm3[0],mem[0]
+; SSE2-NEXT: movhps {{.*#+}} xmm3 = xmm3[0,1],mem[0,1]
; SSE2-NEXT: LBB2_16: ## %else26
; SSE2-NEXT: retq
;
; SSE42-NEXT: testb $1, %al
; SSE42-NEXT: je LBB2_2
; SSE42-NEXT: ## %bb.1: ## %cond.load
-; SSE42-NEXT: movlpd {{.*#+}} xmm0 = mem[0],xmm0[1]
+; SSE42-NEXT: movlps {{.*#+}} xmm0 = mem[0,1],xmm0[2,3]
; SSE42-NEXT: addq $8, %rdi
; SSE42-NEXT: LBB2_2: ## %else
; SSE42-NEXT: pextrb $2, %xmm4, %eax
; SSE42-NEXT: testb $1, %al
; SSE42-NEXT: je LBB2_4
; SSE42-NEXT: ## %bb.3: ## %cond.load1
-; SSE42-NEXT: movhpd {{.*#+}} xmm0 = xmm0[0],mem[0]
+; SSE42-NEXT: movhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1]
; SSE42-NEXT: addq $8, %rdi
; SSE42-NEXT: LBB2_4: ## %else2
; SSE42-NEXT: pextrb $4, %xmm4, %eax
; SSE42-NEXT: testb $1, %al
; SSE42-NEXT: je LBB2_6
; SSE42-NEXT: ## %bb.5: ## %cond.load5
-; SSE42-NEXT: movlpd {{.*#+}} xmm1 = mem[0],xmm1[1]
+; SSE42-NEXT: movlps {{.*#+}} xmm1 = mem[0,1],xmm1[2,3]
; SSE42-NEXT: addq $8, %rdi
; SSE42-NEXT: LBB2_6: ## %else6
; SSE42-NEXT: pextrb $6, %xmm4, %eax
; SSE42-NEXT: testb $1, %al
; SSE42-NEXT: je LBB2_8
; SSE42-NEXT: ## %bb.7: ## %cond.load9
-; SSE42-NEXT: movhpd {{.*#+}} xmm1 = xmm1[0],mem[0]
+; SSE42-NEXT: movhps {{.*#+}} xmm1 = xmm1[0,1],mem[0,1]
; SSE42-NEXT: addq $8, %rdi
; SSE42-NEXT: LBB2_8: ## %else10
; SSE42-NEXT: pextrb $8, %xmm4, %eax
; SSE42-NEXT: testb $1, %al
; SSE42-NEXT: je LBB2_10
; SSE42-NEXT: ## %bb.9: ## %cond.load13
-; SSE42-NEXT: movlpd {{.*#+}} xmm2 = mem[0],xmm2[1]
+; SSE42-NEXT: movlps {{.*#+}} xmm2 = mem[0,1],xmm2[2,3]
; SSE42-NEXT: addq $8, %rdi
; SSE42-NEXT: LBB2_10: ## %else14
; SSE42-NEXT: pextrb $10, %xmm4, %eax
; SSE42-NEXT: testb $1, %al
; SSE42-NEXT: je LBB2_12
; SSE42-NEXT: ## %bb.11: ## %cond.load17
-; SSE42-NEXT: movhpd {{.*#+}} xmm2 = xmm2[0],mem[0]
+; SSE42-NEXT: movhps {{.*#+}} xmm2 = xmm2[0,1],mem[0,1]
; SSE42-NEXT: addq $8, %rdi
; SSE42-NEXT: LBB2_12: ## %else18
; SSE42-NEXT: pextrb $12, %xmm4, %eax
; SSE42-NEXT: testb $1, %al
; SSE42-NEXT: je LBB2_14
; SSE42-NEXT: ## %bb.13: ## %cond.load21
-; SSE42-NEXT: movlpd {{.*#+}} xmm3 = mem[0],xmm3[1]
+; SSE42-NEXT: movlps {{.*#+}} xmm3 = mem[0,1],xmm3[2,3]
; SSE42-NEXT: addq $8, %rdi
; SSE42-NEXT: LBB2_14: ## %else22
; SSE42-NEXT: pextrb $14, %xmm4, %eax
; SSE42-NEXT: testb $1, %al
; SSE42-NEXT: je LBB2_16
; SSE42-NEXT: ## %bb.15: ## %cond.load25
-; SSE42-NEXT: movhpd {{.*#+}} xmm3 = xmm3[0],mem[0]
+; SSE42-NEXT: movhps {{.*#+}} xmm3 = xmm3[0,1],mem[0,1]
; SSE42-NEXT: LBB2_16: ## %else26
; SSE42-NEXT: retq
;
; AVX1OR2-NEXT: je LBB2_2
; AVX1OR2-NEXT: ## %bb.1: ## %cond.load
; AVX1OR2-NEXT: vmovsd {{.*#+}} xmm3 = mem[0],zero
-; AVX1OR2-NEXT: vblendpd {{.*#+}} ymm0 = ymm3[0],ymm0[1,2,3]
+; AVX1OR2-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1],ymm0[2,3,4,5,6,7]
; AVX1OR2-NEXT: addq $8, %rdi
; AVX1OR2-NEXT: LBB2_2: ## %else
; AVX1OR2-NEXT: vpextrb $2, %xmm2, %eax
; AVX1OR2-NEXT: testb $1, %al
; AVX1OR2-NEXT: je LBB2_4
; AVX1OR2-NEXT: ## %bb.3: ## %cond.load1
-; AVX1OR2-NEXT: vmovhpd {{.*#+}} xmm3 = xmm0[0],mem[0]
-; AVX1OR2-NEXT: vblendpd {{.*#+}} ymm0 = ymm3[0,1],ymm0[2,3]
+; AVX1OR2-NEXT: vmovhps {{.*#+}} xmm3 = xmm0[0,1],mem[0,1]
+; AVX1OR2-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7]
; AVX1OR2-NEXT: addq $8, %rdi
; AVX1OR2-NEXT: LBB2_4: ## %else2
; AVX1OR2-NEXT: vpextrb $4, %xmm2, %eax
; AVX1OR2-NEXT: je LBB2_6
; AVX1OR2-NEXT: ## %bb.5: ## %cond.load5
; AVX1OR2-NEXT: vextractf128 $1, %ymm0, %xmm3
-; AVX1OR2-NEXT: vmovlpd {{.*#+}} xmm3 = mem[0],xmm3[1]
+; AVX1OR2-NEXT: vmovlps {{.*#+}} xmm3 = mem[0,1],xmm3[2,3]
; AVX1OR2-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0
; AVX1OR2-NEXT: addq $8, %rdi
; AVX1OR2-NEXT: LBB2_6: ## %else6
; AVX1OR2-NEXT: je LBB2_8
; AVX1OR2-NEXT: ## %bb.7: ## %cond.load9
; AVX1OR2-NEXT: vextractf128 $1, %ymm0, %xmm3
-; AVX1OR2-NEXT: vmovhpd {{.*#+}} xmm3 = xmm3[0],mem[0]
+; AVX1OR2-NEXT: vmovhps {{.*#+}} xmm3 = xmm3[0,1],mem[0,1]
; AVX1OR2-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0
; AVX1OR2-NEXT: addq $8, %rdi
; AVX1OR2-NEXT: LBB2_8: ## %else10
; AVX1OR2-NEXT: je LBB2_10
; AVX1OR2-NEXT: ## %bb.9: ## %cond.load13
; AVX1OR2-NEXT: vmovsd {{.*#+}} xmm3 = mem[0],zero
-; AVX1OR2-NEXT: vblendpd {{.*#+}} ymm1 = ymm3[0],ymm1[1,2,3]
+; AVX1OR2-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1],ymm1[2,3,4,5,6,7]
; AVX1OR2-NEXT: addq $8, %rdi
; AVX1OR2-NEXT: LBB2_10: ## %else14
; AVX1OR2-NEXT: vpextrb $10, %xmm2, %eax
; AVX1OR2-NEXT: testb $1, %al
; AVX1OR2-NEXT: je LBB2_12
; AVX1OR2-NEXT: ## %bb.11: ## %cond.load17
-; AVX1OR2-NEXT: vmovhpd {{.*#+}} xmm3 = xmm1[0],mem[0]
-; AVX1OR2-NEXT: vblendpd {{.*#+}} ymm1 = ymm3[0,1],ymm1[2,3]
+; AVX1OR2-NEXT: vmovhps {{.*#+}} xmm3 = xmm1[0,1],mem[0,1]
+; AVX1OR2-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7]
; AVX1OR2-NEXT: addq $8, %rdi
; AVX1OR2-NEXT: LBB2_12: ## %else18
; AVX1OR2-NEXT: vpextrb $12, %xmm2, %eax
; AVX1OR2-NEXT: je LBB2_14
; AVX1OR2-NEXT: ## %bb.13: ## %cond.load21
; AVX1OR2-NEXT: vextractf128 $1, %ymm1, %xmm3
-; AVX1OR2-NEXT: vmovlpd {{.*#+}} xmm3 = mem[0],xmm3[1]
+; AVX1OR2-NEXT: vmovlps {{.*#+}} xmm3 = mem[0,1],xmm3[2,3]
; AVX1OR2-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1
; AVX1OR2-NEXT: addq $8, %rdi
; AVX1OR2-NEXT: LBB2_14: ## %else22
; AVX1OR2-NEXT: je LBB2_16
; AVX1OR2-NEXT: ## %bb.15: ## %cond.load25
; AVX1OR2-NEXT: vextractf128 $1, %ymm1, %xmm2
-; AVX1OR2-NEXT: vmovhpd {{.*#+}} xmm2 = xmm2[0],mem[0]
+; AVX1OR2-NEXT: vmovhps {{.*#+}} xmm2 = xmm2[0,1],mem[0,1]
; AVX1OR2-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
; AVX1OR2-NEXT: LBB2_16: ## %else26
; AVX1OR2-NEXT: retq
; SSE2-NEXT: testb $1, %cl
; SSE2-NEXT: je LBB3_2
; SSE2-NEXT: ## %bb.1: ## %cond.load
-; SSE2-NEXT: movlpd {{.*#+}} xmm0 = mem[0],xmm0[1]
+; SSE2-NEXT: movlps {{.*#+}} xmm0 = mem[0,1],xmm0[2,3]
; SSE2-NEXT: addq $8, %rsi
; SSE2-NEXT: LBB3_2: ## %else
; SSE2-NEXT: pextrw $2, %xmm8, %ecx
; SSE2-NEXT: testb $1, %cl
; SSE2-NEXT: je LBB3_4
; SSE2-NEXT: ## %bb.3: ## %cond.load1
-; SSE2-NEXT: movhpd {{.*#+}} xmm0 = xmm0[0],mem[0]
+; SSE2-NEXT: movhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1]
; SSE2-NEXT: addq $8, %rsi
; SSE2-NEXT: LBB3_4: ## %else2
; SSE2-NEXT: pxor %xmm8, %xmm8
; SSE2-NEXT: testb $1, %cl
; SSE2-NEXT: je LBB3_6
; SSE2-NEXT: ## %bb.5: ## %cond.load5
-; SSE2-NEXT: movlpd {{.*#+}} xmm1 = mem[0],xmm1[1]
+; SSE2-NEXT: movlps {{.*#+}} xmm1 = mem[0,1],xmm1[2,3]
; SSE2-NEXT: addq $8, %rsi
; SSE2-NEXT: LBB3_6: ## %else6
; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm8
; SSE2-NEXT: testb $1, %cl
; SSE2-NEXT: je LBB3_8
; SSE2-NEXT: ## %bb.7: ## %cond.load9
-; SSE2-NEXT: movhpd {{.*#+}} xmm1 = xmm1[0],mem[0]
+; SSE2-NEXT: movhps {{.*#+}} xmm1 = xmm1[0,1],mem[0,1]
; SSE2-NEXT: addq $8, %rsi
; SSE2-NEXT: LBB3_8: ## %else10
; SSE2-NEXT: pxor %xmm9, %xmm9
; SSE2-NEXT: testb $1, %cl
; SSE2-NEXT: je LBB3_10
; SSE2-NEXT: ## %bb.9: ## %cond.load13
-; SSE2-NEXT: movlpd {{.*#+}} xmm2 = mem[0],xmm2[1]
+; SSE2-NEXT: movlps {{.*#+}} xmm2 = mem[0,1],xmm2[2,3]
; SSE2-NEXT: addq $8, %rsi
; SSE2-NEXT: LBB3_10: ## %else14
; SSE2-NEXT: pextrw $2, %xmm9, %ecx
; SSE2-NEXT: testb $1, %cl
; SSE2-NEXT: je LBB3_12
; SSE2-NEXT: ## %bb.11: ## %cond.load17
-; SSE2-NEXT: movhpd {{.*#+}} xmm2 = xmm2[0],mem[0]
+; SSE2-NEXT: movhps {{.*#+}} xmm2 = xmm2[0,1],mem[0,1]
; SSE2-NEXT: addq $8, %rsi
; SSE2-NEXT: LBB3_12: ## %else18
; SSE2-NEXT: pxor %xmm9, %xmm9
; SSE2-NEXT: testb $1, %cl
; SSE2-NEXT: je LBB3_14
; SSE2-NEXT: ## %bb.13: ## %cond.load21
-; SSE2-NEXT: movlpd {{.*#+}} xmm3 = mem[0],xmm3[1]
+; SSE2-NEXT: movlps {{.*#+}} xmm3 = mem[0,1],xmm3[2,3]
; SSE2-NEXT: addq $8, %rsi
; SSE2-NEXT: LBB3_14: ## %else22
; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm9
; SSE2-NEXT: testb $1, %cl
; SSE2-NEXT: je LBB3_16
; SSE2-NEXT: ## %bb.15: ## %cond.load25
-; SSE2-NEXT: movhpd {{.*#+}} xmm3 = xmm3[0],mem[0]
+; SSE2-NEXT: movhps {{.*#+}} xmm3 = xmm3[0,1],mem[0,1]
; SSE2-NEXT: addq $8, %rsi
; SSE2-NEXT: LBB3_16: ## %else26
; SSE2-NEXT: pxor %xmm8, %xmm8
; SSE2-NEXT: testb $1, %cl
; SSE2-NEXT: je LBB3_18
; SSE2-NEXT: ## %bb.17: ## %cond.load29
-; SSE2-NEXT: movlpd {{.*#+}} xmm4 = mem[0],xmm4[1]
+; SSE2-NEXT: movlps {{.*#+}} xmm4 = mem[0,1],xmm4[2,3]
; SSE2-NEXT: addq $8, %rsi
; SSE2-NEXT: LBB3_18: ## %else30
; SSE2-NEXT: pextrw $2, %xmm8, %ecx
; SSE2-NEXT: testb $1, %cl
; SSE2-NEXT: je LBB3_20
; SSE2-NEXT: ## %bb.19: ## %cond.load33
-; SSE2-NEXT: movhpd {{.*#+}} xmm4 = xmm4[0],mem[0]
+; SSE2-NEXT: movhps {{.*#+}} xmm4 = xmm4[0,1],mem[0,1]
; SSE2-NEXT: addq $8, %rsi
; SSE2-NEXT: LBB3_20: ## %else34
; SSE2-NEXT: pxor %xmm8, %xmm8
; SSE2-NEXT: testb $1, %cl
; SSE2-NEXT: je LBB3_22
; SSE2-NEXT: ## %bb.21: ## %cond.load37
-; SSE2-NEXT: movlpd {{.*#+}} xmm5 = mem[0],xmm5[1]
+; SSE2-NEXT: movlps {{.*#+}} xmm5 = mem[0,1],xmm5[2,3]
; SSE2-NEXT: addq $8, %rsi
; SSE2-NEXT: LBB3_22: ## %else38
; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm8
; SSE2-NEXT: testb $1, %cl
; SSE2-NEXT: je LBB3_24
; SSE2-NEXT: ## %bb.23: ## %cond.load41
-; SSE2-NEXT: movhpd {{.*#+}} xmm5 = xmm5[0],mem[0]
+; SSE2-NEXT: movhps {{.*#+}} xmm5 = xmm5[0,1],mem[0,1]
; SSE2-NEXT: addq $8, %rsi
; SSE2-NEXT: LBB3_24: ## %else42
; SSE2-NEXT: pxor %xmm9, %xmm9
; SSE2-NEXT: testb $1, %cl
; SSE2-NEXT: je LBB3_26
; SSE2-NEXT: ## %bb.25: ## %cond.load45
-; SSE2-NEXT: movlpd {{.*#+}} xmm6 = mem[0],xmm6[1]
+; SSE2-NEXT: movlps {{.*#+}} xmm6 = mem[0,1],xmm6[2,3]
; SSE2-NEXT: addq $8, %rsi
; SSE2-NEXT: LBB3_26: ## %else46
; SSE2-NEXT: pextrw $2, %xmm9, %ecx
; SSE2-NEXT: testb $1, %cl
; SSE2-NEXT: je LBB3_28
; SSE2-NEXT: ## %bb.27: ## %cond.load49
-; SSE2-NEXT: movhpd {{.*#+}} xmm6 = xmm6[0],mem[0]
+; SSE2-NEXT: movhps {{.*#+}} xmm6 = xmm6[0,1],mem[0,1]
; SSE2-NEXT: addq $8, %rsi
; SSE2-NEXT: LBB3_28: ## %else50
; SSE2-NEXT: pxor %xmm9, %xmm9
; SSE2-NEXT: testb $1, %cl
; SSE2-NEXT: je LBB3_30
; SSE2-NEXT: ## %bb.29: ## %cond.load53
-; SSE2-NEXT: movlpd {{.*#+}} xmm7 = mem[0],xmm7[1]
+; SSE2-NEXT: movlps {{.*#+}} xmm7 = mem[0,1],xmm7[2,3]
; SSE2-NEXT: addq $8, %rsi
; SSE2-NEXT: LBB3_30: ## %else54
; SSE2-NEXT: pextrw $6, %xmm8, %ecx
; SSE2-NEXT: testb $1, %cl
; SSE2-NEXT: je LBB3_32
; SSE2-NEXT: ## %bb.31: ## %cond.load57
-; SSE2-NEXT: movhpd {{.*#+}} xmm7 = xmm7[0],mem[0]
+; SSE2-NEXT: movhps {{.*#+}} xmm7 = xmm7[0,1],mem[0,1]
; SSE2-NEXT: LBB3_32: ## %else58
-; SSE2-NEXT: movapd %xmm0, (%rax)
-; SSE2-NEXT: movapd %xmm1, 16(%rax)
-; SSE2-NEXT: movapd %xmm2, 32(%rax)
-; SSE2-NEXT: movapd %xmm3, 48(%rax)
-; SSE2-NEXT: movapd %xmm4, 64(%rax)
-; SSE2-NEXT: movapd %xmm5, 80(%rax)
-; SSE2-NEXT: movapd %xmm6, 96(%rax)
-; SSE2-NEXT: movapd %xmm7, 112(%rax)
+; SSE2-NEXT: movaps %xmm0, (%rax)
+; SSE2-NEXT: movaps %xmm1, 16(%rax)
+; SSE2-NEXT: movaps %xmm2, 32(%rax)
+; SSE2-NEXT: movaps %xmm3, 48(%rax)
+; SSE2-NEXT: movaps %xmm4, 64(%rax)
+; SSE2-NEXT: movaps %xmm5, 80(%rax)
+; SSE2-NEXT: movaps %xmm6, 96(%rax)
+; SSE2-NEXT: movaps %xmm7, 112(%rax)
; SSE2-NEXT: retq
;
; SSE42-LABEL: expandload_v16f64_v16i32:
; SSE42-NEXT: testb $1, %cl
; SSE42-NEXT: je LBB3_2
; SSE42-NEXT: ## %bb.1: ## %cond.load
-; SSE42-NEXT: movlpd {{.*#+}} xmm0 = mem[0],xmm0[1]
+; SSE42-NEXT: movlps {{.*#+}} xmm0 = mem[0,1],xmm0[2,3]
; SSE42-NEXT: addq $8, %rsi
; SSE42-NEXT: LBB3_2: ## %else
; SSE42-NEXT: pextrb $4, %xmm8, %ecx
; SSE42-NEXT: testb $1, %cl
; SSE42-NEXT: je LBB3_4
; SSE42-NEXT: ## %bb.3: ## %cond.load1
-; SSE42-NEXT: movhpd {{.*#+}} xmm0 = xmm0[0],mem[0]
+; SSE42-NEXT: movhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1]
; SSE42-NEXT: addq $8, %rsi
; SSE42-NEXT: LBB3_4: ## %else2
; SSE42-NEXT: pxor %xmm8, %xmm8
; SSE42-NEXT: testb $1, %cl
; SSE42-NEXT: je LBB3_6
; SSE42-NEXT: ## %bb.5: ## %cond.load5
-; SSE42-NEXT: movlpd {{.*#+}} xmm1 = mem[0],xmm1[1]
+; SSE42-NEXT: movlps {{.*#+}} xmm1 = mem[0,1],xmm1[2,3]
; SSE42-NEXT: addq $8, %rsi
; SSE42-NEXT: LBB3_6: ## %else6
; SSE42-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm8
; SSE42-NEXT: testb $1, %cl
; SSE42-NEXT: je LBB3_8
; SSE42-NEXT: ## %bb.7: ## %cond.load9
-; SSE42-NEXT: movhpd {{.*#+}} xmm1 = xmm1[0],mem[0]
+; SSE42-NEXT: movhps {{.*#+}} xmm1 = xmm1[0,1],mem[0,1]
; SSE42-NEXT: addq $8, %rsi
; SSE42-NEXT: LBB3_8: ## %else10
; SSE42-NEXT: pxor %xmm9, %xmm9
; SSE42-NEXT: testb $1, %cl
; SSE42-NEXT: je LBB3_10
; SSE42-NEXT: ## %bb.9: ## %cond.load13
-; SSE42-NEXT: movlpd {{.*#+}} xmm2 = mem[0],xmm2[1]
+; SSE42-NEXT: movlps {{.*#+}} xmm2 = mem[0,1],xmm2[2,3]
; SSE42-NEXT: addq $8, %rsi
; SSE42-NEXT: LBB3_10: ## %else14
; SSE42-NEXT: pextrb $4, %xmm9, %ecx
; SSE42-NEXT: testb $1, %cl
; SSE42-NEXT: je LBB3_12
; SSE42-NEXT: ## %bb.11: ## %cond.load17
-; SSE42-NEXT: movhpd {{.*#+}} xmm2 = xmm2[0],mem[0]
+; SSE42-NEXT: movhps {{.*#+}} xmm2 = xmm2[0,1],mem[0,1]
; SSE42-NEXT: addq $8, %rsi
; SSE42-NEXT: LBB3_12: ## %else18
; SSE42-NEXT: pxor %xmm9, %xmm9
; SSE42-NEXT: testb $1, %cl
; SSE42-NEXT: je LBB3_14
; SSE42-NEXT: ## %bb.13: ## %cond.load21
-; SSE42-NEXT: movlpd {{.*#+}} xmm3 = mem[0],xmm3[1]
+; SSE42-NEXT: movlps {{.*#+}} xmm3 = mem[0,1],xmm3[2,3]
; SSE42-NEXT: addq $8, %rsi
; SSE42-NEXT: LBB3_14: ## %else22
; SSE42-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm9
; SSE42-NEXT: testb $1, %cl
; SSE42-NEXT: je LBB3_16
; SSE42-NEXT: ## %bb.15: ## %cond.load25
-; SSE42-NEXT: movhpd {{.*#+}} xmm3 = xmm3[0],mem[0]
+; SSE42-NEXT: movhps {{.*#+}} xmm3 = xmm3[0,1],mem[0,1]
; SSE42-NEXT: addq $8, %rsi
; SSE42-NEXT: LBB3_16: ## %else26
; SSE42-NEXT: pxor %xmm8, %xmm8
; SSE42-NEXT: testb $1, %cl
; SSE42-NEXT: je LBB3_18
; SSE42-NEXT: ## %bb.17: ## %cond.load29
-; SSE42-NEXT: movlpd {{.*#+}} xmm4 = mem[0],xmm4[1]
+; SSE42-NEXT: movlps {{.*#+}} xmm4 = mem[0,1],xmm4[2,3]
; SSE42-NEXT: addq $8, %rsi
; SSE42-NEXT: LBB3_18: ## %else30
; SSE42-NEXT: pextrb $4, %xmm8, %ecx
; SSE42-NEXT: testb $1, %cl
; SSE42-NEXT: je LBB3_20
; SSE42-NEXT: ## %bb.19: ## %cond.load33
-; SSE42-NEXT: movhpd {{.*#+}} xmm4 = xmm4[0],mem[0]
+; SSE42-NEXT: movhps {{.*#+}} xmm4 = xmm4[0,1],mem[0,1]
; SSE42-NEXT: addq $8, %rsi
; SSE42-NEXT: LBB3_20: ## %else34
; SSE42-NEXT: pxor %xmm8, %xmm8
; SSE42-NEXT: testb $1, %cl
; SSE42-NEXT: je LBB3_22
; SSE42-NEXT: ## %bb.21: ## %cond.load37
-; SSE42-NEXT: movlpd {{.*#+}} xmm5 = mem[0],xmm5[1]
+; SSE42-NEXT: movlps {{.*#+}} xmm5 = mem[0,1],xmm5[2,3]
; SSE42-NEXT: addq $8, %rsi
; SSE42-NEXT: LBB3_22: ## %else38
; SSE42-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm8
; SSE42-NEXT: testb $1, %cl
; SSE42-NEXT: je LBB3_24
; SSE42-NEXT: ## %bb.23: ## %cond.load41
-; SSE42-NEXT: movhpd {{.*#+}} xmm5 = xmm5[0],mem[0]
+; SSE42-NEXT: movhps {{.*#+}} xmm5 = xmm5[0,1],mem[0,1]
; SSE42-NEXT: addq $8, %rsi
; SSE42-NEXT: LBB3_24: ## %else42
; SSE42-NEXT: pxor %xmm9, %xmm9
; SSE42-NEXT: testb $1, %cl
; SSE42-NEXT: je LBB3_26
; SSE42-NEXT: ## %bb.25: ## %cond.load45
-; SSE42-NEXT: movlpd {{.*#+}} xmm6 = mem[0],xmm6[1]
+; SSE42-NEXT: movlps {{.*#+}} xmm6 = mem[0,1],xmm6[2,3]
; SSE42-NEXT: addq $8, %rsi
; SSE42-NEXT: LBB3_26: ## %else46
; SSE42-NEXT: pextrb $4, %xmm9, %ecx
; SSE42-NEXT: testb $1, %cl
; SSE42-NEXT: je LBB3_28
; SSE42-NEXT: ## %bb.27: ## %cond.load49
-; SSE42-NEXT: movhpd {{.*#+}} xmm6 = xmm6[0],mem[0]
+; SSE42-NEXT: movhps {{.*#+}} xmm6 = xmm6[0,1],mem[0,1]
; SSE42-NEXT: addq $8, %rsi
; SSE42-NEXT: LBB3_28: ## %else50
; SSE42-NEXT: pxor %xmm9, %xmm9
; SSE42-NEXT: testb $1, %cl
; SSE42-NEXT: je LBB3_30
; SSE42-NEXT: ## %bb.29: ## %cond.load53
-; SSE42-NEXT: movlpd {{.*#+}} xmm7 = mem[0],xmm7[1]
+; SSE42-NEXT: movlps {{.*#+}} xmm7 = mem[0,1],xmm7[2,3]
; SSE42-NEXT: addq $8, %rsi
; SSE42-NEXT: LBB3_30: ## %else54
; SSE42-NEXT: pextrb $12, %xmm8, %ecx
; SSE42-NEXT: testb $1, %cl
; SSE42-NEXT: je LBB3_32
; SSE42-NEXT: ## %bb.31: ## %cond.load57
-; SSE42-NEXT: movhpd {{.*#+}} xmm7 = xmm7[0],mem[0]
+; SSE42-NEXT: movhps {{.*#+}} xmm7 = xmm7[0,1],mem[0,1]
; SSE42-NEXT: LBB3_32: ## %else58
-; SSE42-NEXT: movapd %xmm0, (%rax)
-; SSE42-NEXT: movapd %xmm1, 16(%rax)
-; SSE42-NEXT: movapd %xmm2, 32(%rax)
-; SSE42-NEXT: movapd %xmm3, 48(%rax)
-; SSE42-NEXT: movapd %xmm4, 64(%rax)
-; SSE42-NEXT: movapd %xmm5, 80(%rax)
-; SSE42-NEXT: movapd %xmm6, 96(%rax)
-; SSE42-NEXT: movapd %xmm7, 112(%rax)
+; SSE42-NEXT: movaps %xmm0, (%rax)
+; SSE42-NEXT: movaps %xmm1, 16(%rax)
+; SSE42-NEXT: movaps %xmm2, 32(%rax)
+; SSE42-NEXT: movaps %xmm3, 48(%rax)
+; SSE42-NEXT: movaps %xmm4, 64(%rax)
+; SSE42-NEXT: movaps %xmm5, 80(%rax)
+; SSE42-NEXT: movaps %xmm6, 96(%rax)
+; SSE42-NEXT: movaps %xmm7, 112(%rax)
; SSE42-NEXT: retq
;
; AVX1-LABEL: expandload_v16f64_v16i32:
; AVX1-NEXT: testb $1, %al
; AVX1-NEXT: je LBB3_4
; AVX1-NEXT: ## %bb.3: ## %cond.load1
-; AVX1-NEXT: vmovhpd {{.*#+}} xmm6 = xmm0[0],mem[0]
-; AVX1-NEXT: vblendpd {{.*#+}} ymm0 = ymm6[0,1],ymm0[2,3]
+; AVX1-NEXT: vmovhps {{.*#+}} xmm6 = xmm0[0,1],mem[0,1]
+; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm0[4,5,6,7]
; AVX1-NEXT: addq $8, %rdi
; AVX1-NEXT: LBB3_4: ## %else2
-; AVX1-NEXT: vxorpd %xmm6, %xmm6, %xmm6
+; AVX1-NEXT: vxorps %xmm6, %xmm6, %xmm6
; AVX1-NEXT: vpcmpeqd %xmm6, %xmm4, %xmm7
; AVX1-NEXT: vpackssdw %xmm0, %xmm7, %xmm7
; AVX1-NEXT: vpacksswb %xmm0, %xmm7, %xmm7
; AVX1-NEXT: je LBB3_6
; AVX1-NEXT: ## %bb.5: ## %cond.load5
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm7
-; AVX1-NEXT: vmovlpd {{.*#+}} xmm7 = mem[0],xmm7[1]
+; AVX1-NEXT: vmovlps {{.*#+}} xmm7 = mem[0,1],xmm7[2,3]
; AVX1-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm0
; AVX1-NEXT: addq $8, %rdi
; AVX1-NEXT: LBB3_6: ## %else6
; AVX1-NEXT: je LBB3_8
; AVX1-NEXT: ## %bb.7: ## %cond.load9
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm6
-; AVX1-NEXT: vmovhpd {{.*#+}} xmm6 = xmm6[0],mem[0]
+; AVX1-NEXT: vmovhps {{.*#+}} xmm6 = xmm6[0,1],mem[0,1]
; AVX1-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm0
; AVX1-NEXT: addq $8, %rdi
; AVX1-NEXT: LBB3_8: ## %else10
; AVX1-NEXT: vextractf128 $1, %ymm4, %xmm4
-; AVX1-NEXT: vxorpd %xmm6, %xmm6, %xmm6
+; AVX1-NEXT: vpxor %xmm6, %xmm6, %xmm6
; AVX1-NEXT: vpcmpeqd %xmm6, %xmm4, %xmm6
; AVX1-NEXT: vpackssdw %xmm6, %xmm0, %xmm7
; AVX1-NEXT: vpacksswb %xmm0, %xmm7, %xmm7
; AVX1-NEXT: testb $1, %al
; AVX1-NEXT: je LBB3_12
; AVX1-NEXT: ## %bb.11: ## %cond.load17
-; AVX1-NEXT: vmovhpd {{.*#+}} xmm6 = xmm1[0],mem[0]
-; AVX1-NEXT: vblendpd {{.*#+}} ymm1 = ymm6[0,1],ymm1[2,3]
+; AVX1-NEXT: vmovhps {{.*#+}} xmm6 = xmm1[0,1],mem[0,1]
+; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm6[0,1,2,3],ymm1[4,5,6,7]
; AVX1-NEXT: addq $8, %rdi
; AVX1-NEXT: LBB3_12: ## %else18
-; AVX1-NEXT: vxorpd %xmm6, %xmm6, %xmm6
+; AVX1-NEXT: vxorps %xmm6, %xmm6, %xmm6
; AVX1-NEXT: vpcmpeqd %xmm6, %xmm4, %xmm4
; AVX1-NEXT: vpackssdw %xmm4, %xmm0, %xmm6
; AVX1-NEXT: vpacksswb %xmm0, %xmm6, %xmm6
; AVX1-NEXT: je LBB3_14
; AVX1-NEXT: ## %bb.13: ## %cond.load21
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm6
-; AVX1-NEXT: vmovlpd {{.*#+}} xmm6 = mem[0],xmm6[1]
+; AVX1-NEXT: vmovlps {{.*#+}} xmm6 = mem[0,1],xmm6[2,3]
; AVX1-NEXT: vinsertf128 $1, %xmm6, %ymm1, %ymm1
; AVX1-NEXT: addq $8, %rdi
; AVX1-NEXT: LBB3_14: ## %else22
; AVX1-NEXT: je LBB3_16
; AVX1-NEXT: ## %bb.15: ## %cond.load25
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4
-; AVX1-NEXT: vmovhpd {{.*#+}} xmm4 = xmm4[0],mem[0]
+; AVX1-NEXT: vmovhps {{.*#+}} xmm4 = xmm4[0,1],mem[0,1]
; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm1
; AVX1-NEXT: addq $8, %rdi
; AVX1-NEXT: LBB3_16: ## %else26
-; AVX1-NEXT: vxorpd %xmm4, %xmm4, %xmm4
+; AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4
; AVX1-NEXT: vpcmpeqd %xmm4, %xmm5, %xmm6
; AVX1-NEXT: vpackssdw %xmm0, %xmm6, %xmm6
; AVX1-NEXT: vpacksswb %xmm6, %xmm0, %xmm6
; AVX1-NEXT: testb $1, %al
; AVX1-NEXT: je LBB3_20
; AVX1-NEXT: ## %bb.19: ## %cond.load33
-; AVX1-NEXT: vmovhpd {{.*#+}} xmm4 = xmm2[0],mem[0]
-; AVX1-NEXT: vblendpd {{.*#+}} ymm2 = ymm4[0,1],ymm2[2,3]
+; AVX1-NEXT: vmovhps {{.*#+}} xmm4 = xmm2[0,1],mem[0,1]
+; AVX1-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1,2,3],ymm2[4,5,6,7]
; AVX1-NEXT: addq $8, %rdi
; AVX1-NEXT: LBB3_20: ## %else34
-; AVX1-NEXT: vxorpd %xmm4, %xmm4, %xmm4
+; AVX1-NEXT: vxorps %xmm4, %xmm4, %xmm4
; AVX1-NEXT: vpcmpeqd %xmm4, %xmm5, %xmm6
; AVX1-NEXT: vpackssdw %xmm0, %xmm6, %xmm6
; AVX1-NEXT: vpacksswb %xmm6, %xmm0, %xmm6
; AVX1-NEXT: je LBB3_22
; AVX1-NEXT: ## %bb.21: ## %cond.load37
; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm6
-; AVX1-NEXT: vmovlpd {{.*#+}} xmm6 = mem[0],xmm6[1]
+; AVX1-NEXT: vmovlps {{.*#+}} xmm6 = mem[0,1],xmm6[2,3]
; AVX1-NEXT: vinsertf128 $1, %xmm6, %ymm2, %ymm2
; AVX1-NEXT: addq $8, %rdi
; AVX1-NEXT: LBB3_22: ## %else38
; AVX1-NEXT: je LBB3_24
; AVX1-NEXT: ## %bb.23: ## %cond.load41
; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm4
-; AVX1-NEXT: vmovhpd {{.*#+}} xmm4 = xmm4[0],mem[0]
+; AVX1-NEXT: vmovhps {{.*#+}} xmm4 = xmm4[0,1],mem[0,1]
; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2
; AVX1-NEXT: addq $8, %rdi
; AVX1-NEXT: LBB3_24: ## %else42
; AVX1-NEXT: testb $1, %al
; AVX1-NEXT: je LBB3_28
; AVX1-NEXT: ## %bb.27: ## %cond.load49
-; AVX1-NEXT: vmovhpd {{.*#+}} xmm5 = xmm3[0],mem[0]
-; AVX1-NEXT: vblendpd {{.*#+}} ymm3 = ymm5[0,1],ymm3[2,3]
+; AVX1-NEXT: vmovhps {{.*#+}} xmm5 = xmm3[0,1],mem[0,1]
+; AVX1-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0,1,2,3],ymm3[4,5,6,7]
; AVX1-NEXT: addq $8, %rdi
; AVX1-NEXT: LBB3_28: ## %else50
-; AVX1-NEXT: vxorpd %xmm5, %xmm5, %xmm5
+; AVX1-NEXT: vxorps %xmm5, %xmm5, %xmm5
; AVX1-NEXT: vpcmpeqd %xmm5, %xmm4, %xmm4
; AVX1-NEXT: vpackssdw %xmm4, %xmm0, %xmm5
; AVX1-NEXT: vpacksswb %xmm5, %xmm0, %xmm5
; AVX1-NEXT: je LBB3_30
; AVX1-NEXT: ## %bb.29: ## %cond.load53
; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm5
-; AVX1-NEXT: vmovlpd {{.*#+}} xmm5 = mem[0],xmm5[1]
+; AVX1-NEXT: vmovlps {{.*#+}} xmm5 = mem[0,1],xmm5[2,3]
; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm3, %ymm3
; AVX1-NEXT: addq $8, %rdi
; AVX1-NEXT: LBB3_30: ## %else54
; AVX1-NEXT: je LBB3_32
; AVX1-NEXT: ## %bb.31: ## %cond.load57
; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4
-; AVX1-NEXT: vmovhpd {{.*#+}} xmm4 = xmm4[0],mem[0]
+; AVX1-NEXT: vmovhps {{.*#+}} xmm4 = xmm4[0,1],mem[0,1]
; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3
; AVX1-NEXT: LBB3_32: ## %else58
; AVX1-NEXT: retq
; AVX2-NEXT: testb $1, %al
; AVX2-NEXT: je LBB3_4
; AVX2-NEXT: ## %bb.3: ## %cond.load1
-; AVX2-NEXT: vmovhpd {{.*#+}} xmm6 = xmm0[0],mem[0]
-; AVX2-NEXT: vblendpd {{.*#+}} ymm0 = ymm6[0,1],ymm0[2,3]
+; AVX2-NEXT: vmovhps {{.*#+}} xmm6 = xmm0[0,1],mem[0,1]
+; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm0[4,5,6,7]
; AVX2-NEXT: addq $8, %rdi
; AVX2-NEXT: LBB3_4: ## %else2
-; AVX2-NEXT: vxorpd %xmm6, %xmm6, %xmm6
+; AVX2-NEXT: vpxor %xmm6, %xmm6, %xmm6
; AVX2-NEXT: vpcmpeqd %xmm6, %xmm4, %xmm7
; AVX2-NEXT: vpackssdw %xmm0, %xmm7, %xmm7
; AVX2-NEXT: vpacksswb %xmm0, %xmm7, %xmm7
; AVX2-NEXT: testb $1, %al
; AVX2-NEXT: je LBB3_6
; AVX2-NEXT: ## %bb.5: ## %cond.load5
-; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm7
-; AVX2-NEXT: vmovlpd {{.*#+}} xmm7 = mem[0],xmm7[1]
-; AVX2-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm0
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm7
+; AVX2-NEXT: vmovlps {{.*#+}} xmm7 = mem[0,1],xmm7[2,3]
+; AVX2-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm0
; AVX2-NEXT: addq $8, %rdi
; AVX2-NEXT: LBB3_6: ## %else6
; AVX2-NEXT: vpcmpeqd %xmm6, %xmm4, %xmm6
; AVX2-NEXT: testb $1, %al
; AVX2-NEXT: je LBB3_8
; AVX2-NEXT: ## %bb.7: ## %cond.load9
-; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm6
-; AVX2-NEXT: vmovhpd {{.*#+}} xmm6 = xmm6[0],mem[0]
-; AVX2-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm0
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm6
+; AVX2-NEXT: vmovhps {{.*#+}} xmm6 = xmm6[0,1],mem[0,1]
+; AVX2-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm0
; AVX2-NEXT: addq $8, %rdi
; AVX2-NEXT: LBB3_8: ## %else10
; AVX2-NEXT: vextracti128 $1, %ymm4, %xmm4
-; AVX2-NEXT: vxorpd %xmm6, %xmm6, %xmm6
+; AVX2-NEXT: vpxor %xmm6, %xmm6, %xmm6
; AVX2-NEXT: vpcmpeqd %xmm6, %xmm4, %xmm6
; AVX2-NEXT: vpackssdw %xmm6, %xmm0, %xmm7
; AVX2-NEXT: vpacksswb %xmm0, %xmm7, %xmm7
; AVX2-NEXT: testb $1, %al
; AVX2-NEXT: je LBB3_12
; AVX2-NEXT: ## %bb.11: ## %cond.load17
-; AVX2-NEXT: vmovhpd {{.*#+}} xmm6 = xmm1[0],mem[0]
-; AVX2-NEXT: vblendpd {{.*#+}} ymm1 = ymm6[0,1],ymm1[2,3]
+; AVX2-NEXT: vmovhps {{.*#+}} xmm6 = xmm1[0,1],mem[0,1]
+; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1,2,3],ymm1[4,5,6,7]
; AVX2-NEXT: addq $8, %rdi
; AVX2-NEXT: LBB3_12: ## %else18
-; AVX2-NEXT: vxorpd %xmm6, %xmm6, %xmm6
+; AVX2-NEXT: vpxor %xmm6, %xmm6, %xmm6
; AVX2-NEXT: vpcmpeqd %xmm6, %xmm4, %xmm4
; AVX2-NEXT: vpackssdw %xmm4, %xmm0, %xmm6
; AVX2-NEXT: vpacksswb %xmm0, %xmm6, %xmm6
; AVX2-NEXT: testb $1, %al
; AVX2-NEXT: je LBB3_14
; AVX2-NEXT: ## %bb.13: ## %cond.load21
-; AVX2-NEXT: vextractf128 $1, %ymm1, %xmm6
-; AVX2-NEXT: vmovlpd {{.*#+}} xmm6 = mem[0],xmm6[1]
-; AVX2-NEXT: vinsertf128 $1, %xmm6, %ymm1, %ymm1
+; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm6
+; AVX2-NEXT: vmovlps {{.*#+}} xmm6 = mem[0,1],xmm6[2,3]
+; AVX2-NEXT: vinserti128 $1, %xmm6, %ymm1, %ymm1
; AVX2-NEXT: addq $8, %rdi
; AVX2-NEXT: LBB3_14: ## %else22
; AVX2-NEXT: vpackssdw %xmm4, %xmm0, %xmm4
; AVX2-NEXT: testb $1, %al
; AVX2-NEXT: je LBB3_16
; AVX2-NEXT: ## %bb.15: ## %cond.load25
-; AVX2-NEXT: vextractf128 $1, %ymm1, %xmm4
-; AVX2-NEXT: vmovhpd {{.*#+}} xmm4 = xmm4[0],mem[0]
-; AVX2-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm1
+; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm4
+; AVX2-NEXT: vmovhps {{.*#+}} xmm4 = xmm4[0,1],mem[0,1]
+; AVX2-NEXT: vinserti128 $1, %xmm4, %ymm1, %ymm1
; AVX2-NEXT: addq $8, %rdi
; AVX2-NEXT: LBB3_16: ## %else26
-; AVX2-NEXT: vxorpd %xmm4, %xmm4, %xmm4
+; AVX2-NEXT: vpxor %xmm4, %xmm4, %xmm4
; AVX2-NEXT: vpcmpeqd %xmm4, %xmm5, %xmm6
; AVX2-NEXT: vpackssdw %xmm0, %xmm6, %xmm6
; AVX2-NEXT: vpacksswb %xmm6, %xmm0, %xmm6
; AVX2-NEXT: testb $1, %al
; AVX2-NEXT: je LBB3_20
; AVX2-NEXT: ## %bb.19: ## %cond.load33
-; AVX2-NEXT: vmovhpd {{.*#+}} xmm4 = xmm2[0],mem[0]
-; AVX2-NEXT: vblendpd {{.*#+}} ymm2 = ymm4[0,1],ymm2[2,3]
+; AVX2-NEXT: vmovhps {{.*#+}} xmm4 = xmm2[0,1],mem[0,1]
+; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3],ymm2[4,5,6,7]
; AVX2-NEXT: addq $8, %rdi
; AVX2-NEXT: LBB3_20: ## %else34
-; AVX2-NEXT: vxorpd %xmm4, %xmm4, %xmm4
+; AVX2-NEXT: vpxor %xmm4, %xmm4, %xmm4
; AVX2-NEXT: vpcmpeqd %xmm4, %xmm5, %xmm6
; AVX2-NEXT: vpackssdw %xmm0, %xmm6, %xmm6
; AVX2-NEXT: vpacksswb %xmm6, %xmm0, %xmm6
; AVX2-NEXT: testb $1, %al
; AVX2-NEXT: je LBB3_22
; AVX2-NEXT: ## %bb.21: ## %cond.load37
-; AVX2-NEXT: vextractf128 $1, %ymm2, %xmm6
-; AVX2-NEXT: vmovlpd {{.*#+}} xmm6 = mem[0],xmm6[1]
-; AVX2-NEXT: vinsertf128 $1, %xmm6, %ymm2, %ymm2
+; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm6
+; AVX2-NEXT: vmovlps {{.*#+}} xmm6 = mem[0,1],xmm6[2,3]
+; AVX2-NEXT: vinserti128 $1, %xmm6, %ymm2, %ymm2
; AVX2-NEXT: addq $8, %rdi
; AVX2-NEXT: LBB3_22: ## %else38
; AVX2-NEXT: vpcmpeqd %xmm4, %xmm5, %xmm4
; AVX2-NEXT: testb $1, %al
; AVX2-NEXT: je LBB3_24
; AVX2-NEXT: ## %bb.23: ## %cond.load41
-; AVX2-NEXT: vextractf128 $1, %ymm2, %xmm4
-; AVX2-NEXT: vmovhpd {{.*#+}} xmm4 = xmm4[0],mem[0]
-; AVX2-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2
+; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm4
+; AVX2-NEXT: vmovhps {{.*#+}} xmm4 = xmm4[0,1],mem[0,1]
+; AVX2-NEXT: vinserti128 $1, %xmm4, %ymm2, %ymm2
; AVX2-NEXT: addq $8, %rdi
; AVX2-NEXT: LBB3_24: ## %else42
; AVX2-NEXT: vextracti128 $1, %ymm5, %xmm4
; AVX2-NEXT: testb $1, %al
; AVX2-NEXT: je LBB3_28
; AVX2-NEXT: ## %bb.27: ## %cond.load49
-; AVX2-NEXT: vmovhpd {{.*#+}} xmm5 = xmm3[0],mem[0]
-; AVX2-NEXT: vblendpd {{.*#+}} ymm3 = ymm5[0,1],ymm3[2,3]
+; AVX2-NEXT: vmovhps {{.*#+}} xmm5 = xmm3[0,1],mem[0,1]
+; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0,1,2,3],ymm3[4,5,6,7]
; AVX2-NEXT: addq $8, %rdi
; AVX2-NEXT: LBB3_28: ## %else50
-; AVX2-NEXT: vxorpd %xmm5, %xmm5, %xmm5
+; AVX2-NEXT: vpxor %xmm5, %xmm5, %xmm5
; AVX2-NEXT: vpcmpeqd %xmm5, %xmm4, %xmm4
; AVX2-NEXT: vpackssdw %xmm4, %xmm0, %xmm5
; AVX2-NEXT: vpacksswb %xmm5, %xmm0, %xmm5
; AVX2-NEXT: testb $1, %al
; AVX2-NEXT: je LBB3_30
; AVX2-NEXT: ## %bb.29: ## %cond.load53
-; AVX2-NEXT: vextractf128 $1, %ymm3, %xmm5
-; AVX2-NEXT: vmovlpd {{.*#+}} xmm5 = mem[0],xmm5[1]
-; AVX2-NEXT: vinsertf128 $1, %xmm5, %ymm3, %ymm3
+; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm5
+; AVX2-NEXT: vmovlps {{.*#+}} xmm5 = mem[0,1],xmm5[2,3]
+; AVX2-NEXT: vinserti128 $1, %xmm5, %ymm3, %ymm3
; AVX2-NEXT: addq $8, %rdi
; AVX2-NEXT: LBB3_30: ## %else54
; AVX2-NEXT: vpackssdw %xmm4, %xmm0, %xmm4
; AVX2-NEXT: testb $1, %al
; AVX2-NEXT: je LBB3_32
; AVX2-NEXT: ## %bb.31: ## %cond.load57
-; AVX2-NEXT: vextractf128 $1, %ymm3, %xmm4
-; AVX2-NEXT: vmovhpd {{.*#+}} xmm4 = xmm4[0],mem[0]
-; AVX2-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3
+; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm4
+; AVX2-NEXT: vmovhps {{.*#+}} xmm4 = xmm4[0,1],mem[0,1]
+; AVX2-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm3
; AVX2-NEXT: LBB3_32: ## %else58
; AVX2-NEXT: retq
;
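On the AVX2 paths the rewrite goes a step further: the mask arithmetic is integer (vpcmpeqq/vpcmpeqd, vpackssdw), so the zeroing, blends, and 128-bit lane moves around the element loads are normalized into the integer domain as well (vpxor, vpblendd, vextracti128/vinserti128), keeping the whole dependency chain in one execution domain. A sketch of the mask shape that drives this (assumed form, not the verbatim test):

declare <4 x double> @llvm.masked.expandload.v4f64(double*, <4 x i1>, <4 x double>)

define <4 x double> @expand_v4f64(double* %p, <4 x i64> %trigger, <4 x double> %src) {
  ; the i1 mask is produced by an integer compare, so on AVX2 the surrounding
  ; zero/blend/insert instructions already sit in the integer domain
  %m = icmp eq <4 x i64> %trigger, zeroinitializer
  %r = call <4 x double> @llvm.masked.expandload.v4f64(double* %p, <4 x i1> %m,
                                                       <4 x double> %src)
  ret <4 x double> %r
}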
; WIDEN_AVX2-NEXT: je .LBB1_2
; WIDEN_AVX2-NEXT: # %bb.1: # %cond.store
; WIDEN_AVX2-NEXT: vmovq %xmm1, %rax
-; WIDEN_AVX2-NEXT: vmovlpd %xmm0, (%rax)
+; WIDEN_AVX2-NEXT: vmovlps %xmm0, (%rax)
; WIDEN_AVX2-NEXT: .LBB1_2: # %else
; WIDEN_AVX2-NEXT: vpextrb $8, %xmm2, %eax
; WIDEN_AVX2-NEXT: testb $1, %al
; WIDEN_AVX2-NEXT: je .LBB1_4
; WIDEN_AVX2-NEXT: # %bb.3: # %cond.store1
; WIDEN_AVX2-NEXT: vpextrq $1, %xmm1, %rax
-; WIDEN_AVX2-NEXT: vmovhpd %xmm0, (%rax)
+; WIDEN_AVX2-NEXT: vmovhps %xmm0, (%rax)
; WIDEN_AVX2-NEXT: .LBB1_4: # %else2
; WIDEN_AVX2-NEXT: retq
;
; PROMOTE_AVX2-NEXT: je .LBB1_2
; PROMOTE_AVX2-NEXT: # %bb.1: # %cond.store
; PROMOTE_AVX2-NEXT: vmovq %xmm1, %rax
-; PROMOTE_AVX2-NEXT: vmovlpd %xmm0, (%rax)
+; PROMOTE_AVX2-NEXT: vmovlps %xmm0, (%rax)
; PROMOTE_AVX2-NEXT: .LBB1_2: # %else
; PROMOTE_AVX2-NEXT: vpextrb $8, %xmm2, %eax
; PROMOTE_AVX2-NEXT: testb $1, %al
; PROMOTE_AVX2-NEXT: je .LBB1_4
; PROMOTE_AVX2-NEXT: # %bb.3: # %cond.store1
; PROMOTE_AVX2-NEXT: vpextrq $1, %xmm1, %rax
-; PROMOTE_AVX2-NEXT: vmovhpd %xmm0, (%rax)
+; PROMOTE_AVX2-NEXT: vmovhps %xmm0, (%rax)
; PROMOTE_AVX2-NEXT: .LBB1_4: # %else2
; PROMOTE_AVX2-NEXT: retq
%gep = getelementptr double, double *%base, <2 x i32> %ind
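The store side gets the same substitution: vmovlpd/vmovhpd element stores become vmovlps/vmovhps. The gep above feeds a v2f64 scatter; a hedged sketch of that shape (intrinsic mangling per this tree's typed-pointer convention):

declare void @llvm.masked.scatter.v2f64.v2p0f64(<2 x double>, <2 x double*>, i32, <2 x i1>)

define void @scatter_v2f64(<2 x double> %v, <2 x double*> %ptrs, <2 x i1> %m) {
  ; each enabled lane becomes one 64-bit element store, now emitted as
  ; vmovlps/vmovhps rather than vmovlpd/vmovhpd
  call void @llvm.masked.scatter.v2f64.v2p0f64(<2 x double> %v, <2 x double*> %ptrs,
                                               i32 8, <2 x i1> %m)
  ret void
}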
; SSE2-NEXT: testb $1, %al
; SSE2-NEXT: je LBB1_2
; SSE2-NEXT: ## %bb.1: ## %cond.load
-; SSE2-NEXT: movlpd {{.*#+}} xmm1 = mem[0],xmm1[1]
+; SSE2-NEXT: movlps {{.*#+}} xmm1 = mem[0,1],xmm1[2,3]
; SSE2-NEXT: LBB1_2: ## %else
; SSE2-NEXT: pextrw $4, %xmm0, %eax
; SSE2-NEXT: testb $1, %al
; SSE2-NEXT: je LBB1_4
; SSE2-NEXT: ## %bb.3: ## %cond.load1
-; SSE2-NEXT: movhpd {{.*#+}} xmm1 = xmm1[0],mem[0]
+; SSE2-NEXT: movhps {{.*#+}} xmm1 = xmm1[0,1],mem[0,1]
; SSE2-NEXT: LBB1_4: ## %else2
-; SSE2-NEXT: movapd %xmm1, %xmm0
+; SSE2-NEXT: movaps %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE42-LABEL: load_v2f64_v2i64:
; SSE42-NEXT: testb $1, %al
; SSE42-NEXT: je LBB1_2
; SSE42-NEXT: ## %bb.1: ## %cond.load
-; SSE42-NEXT: movlpd {{.*#+}} xmm1 = mem[0],xmm1[1]
+; SSE42-NEXT: movlps {{.*#+}} xmm1 = mem[0,1],xmm1[2,3]
; SSE42-NEXT: LBB1_2: ## %else
; SSE42-NEXT: pextrb $8, %xmm2, %eax
; SSE42-NEXT: testb $1, %al
; SSE42-NEXT: je LBB1_4
; SSE42-NEXT: ## %bb.3: ## %cond.load1
-; SSE42-NEXT: movhpd {{.*#+}} xmm1 = xmm1[0],mem[0]
+; SSE42-NEXT: movhps {{.*#+}} xmm1 = xmm1[0,1],mem[0,1]
; SSE42-NEXT: LBB1_4: ## %else2
-; SSE42-NEXT: movapd %xmm1, %xmm0
+; SSE42-NEXT: movaps %xmm1, %xmm0
; SSE42-NEXT: retq
;
; AVX1OR2-LABEL: load_v2f64_v2i64:
; SSE2-NEXT: testb $1, %al
; SSE2-NEXT: je LBB2_2
; SSE2-NEXT: ## %bb.1: ## %cond.load
-; SSE2-NEXT: movlpd {{.*#+}} xmm1 = mem[0],xmm1[1]
+; SSE2-NEXT: movlps {{.*#+}} xmm1 = mem[0,1],xmm1[2,3]
; SSE2-NEXT: LBB2_2: ## %else
; SSE2-NEXT: pextrw $2, %xmm3, %eax
; SSE2-NEXT: testb $1, %al
; SSE2-NEXT: je LBB2_4
; SSE2-NEXT: ## %bb.3: ## %cond.load1
-; SSE2-NEXT: movhpd {{.*#+}} xmm1 = xmm1[0],mem[0]
+; SSE2-NEXT: movhps {{.*#+}} xmm1 = xmm1[0,1],mem[0,1]
; SSE2-NEXT: LBB2_4: ## %else2
; SSE2-NEXT: pxor %xmm3, %xmm3
; SSE2-NEXT: pcmpeqd %xmm3, %xmm0
; SSE2-NEXT: testb $1, %al
; SSE2-NEXT: je LBB2_6
; SSE2-NEXT: ## %bb.5: ## %cond.load4
-; SSE2-NEXT: movlpd {{.*#+}} xmm2 = mem[0],xmm2[1]
+; SSE2-NEXT: movlps {{.*#+}} xmm2 = mem[0,1],xmm2[2,3]
; SSE2-NEXT: LBB2_6: ## %else5
; SSE2-NEXT: pextrw $6, %xmm0, %eax
; SSE2-NEXT: testb $1, %al
; SSE2-NEXT: je LBB2_8
; SSE2-NEXT: ## %bb.7: ## %cond.load7
-; SSE2-NEXT: movhpd {{.*#+}} xmm2 = xmm2[0],mem[0]
+; SSE2-NEXT: movhps {{.*#+}} xmm2 = xmm2[0,1],mem[0,1]
; SSE2-NEXT: LBB2_8: ## %else8
-; SSE2-NEXT: movapd %xmm1, %xmm0
-; SSE2-NEXT: movapd %xmm2, %xmm1
+; SSE2-NEXT: movaps %xmm1, %xmm0
+; SSE2-NEXT: movaps %xmm2, %xmm1
; SSE2-NEXT: retq
;
; SSE42-LABEL: load_v4f64_v4i32:
; SSE42-NEXT: testb $1, %al
; SSE42-NEXT: je LBB2_2
; SSE42-NEXT: ## %bb.1: ## %cond.load
-; SSE42-NEXT: movlpd {{.*#+}} xmm1 = mem[0],xmm1[1]
+; SSE42-NEXT: movlps {{.*#+}} xmm1 = mem[0,1],xmm1[2,3]
; SSE42-NEXT: LBB2_2: ## %else
; SSE42-NEXT: pextrb $4, %xmm3, %eax
; SSE42-NEXT: testb $1, %al
; SSE42-NEXT: je LBB2_4
; SSE42-NEXT: ## %bb.3: ## %cond.load1
-; SSE42-NEXT: movhpd {{.*#+}} xmm1 = xmm1[0],mem[0]
+; SSE42-NEXT: movhps {{.*#+}} xmm1 = xmm1[0,1],mem[0,1]
; SSE42-NEXT: LBB2_4: ## %else2
; SSE42-NEXT: pxor %xmm3, %xmm3
; SSE42-NEXT: pcmpeqd %xmm3, %xmm0
; SSE42-NEXT: testb $1, %al
; SSE42-NEXT: je LBB2_6
; SSE42-NEXT: ## %bb.5: ## %cond.load4
-; SSE42-NEXT: movlpd {{.*#+}} xmm2 = mem[0],xmm2[1]
+; SSE42-NEXT: movlps {{.*#+}} xmm2 = mem[0,1],xmm2[2,3]
; SSE42-NEXT: LBB2_6: ## %else5
; SSE42-NEXT: pextrb $12, %xmm0, %eax
; SSE42-NEXT: testb $1, %al
; SSE42-NEXT: je LBB2_8
; SSE42-NEXT: ## %bb.7: ## %cond.load7
-; SSE42-NEXT: movhpd {{.*#+}} xmm2 = xmm2[0],mem[0]
+; SSE42-NEXT: movhps {{.*#+}} xmm2 = xmm2[0,1],mem[0,1]
; SSE42-NEXT: LBB2_8: ## %else8
-; SSE42-NEXT: movapd %xmm1, %xmm0
-; SSE42-NEXT: movapd %xmm2, %xmm1
+; SSE42-NEXT: movaps %xmm1, %xmm0
+; SSE42-NEXT: movaps %xmm2, %xmm1
; SSE42-NEXT: retq
;
; AVX1-LABEL: load_v4f64_v4i32:
; SSE2-NEXT: pcmpeqd %xmm0, %xmm3
; SSE2-NEXT: movd %xmm3, %eax
; SSE2-NEXT: testb $1, %al
-; SSE2-NEXT: xorpd %xmm1, %xmm1
+; SSE2-NEXT: xorps %xmm1, %xmm1
; SSE2-NEXT: je LBB3_2
; SSE2-NEXT: ## %bb.1: ## %cond.load
; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
-; SSE2-NEXT: xorpd %xmm1, %xmm1
+; SSE2-NEXT: xorps %xmm1, %xmm1
; SSE2-NEXT: LBB3_2: ## %else
; SSE2-NEXT: pextrw $2, %xmm3, %eax
; SSE2-NEXT: testb $1, %al
; SSE2-NEXT: je LBB3_4
; SSE2-NEXT: ## %bb.3: ## %cond.load1
-; SSE2-NEXT: movhpd {{.*#+}} xmm0 = xmm0[0],mem[0]
+; SSE2-NEXT: movhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1]
; SSE2-NEXT: LBB3_4: ## %else2
; SSE2-NEXT: pxor %xmm3, %xmm3
; SSE2-NEXT: pcmpeqd %xmm3, %xmm2
; SSE2-NEXT: testb $1, %al
; SSE2-NEXT: je LBB3_6
; SSE2-NEXT: ## %bb.5: ## %cond.load4
-; SSE2-NEXT: movlpd {{.*#+}} xmm1 = mem[0],xmm1[1]
+; SSE2-NEXT: movlps {{.*#+}} xmm1 = mem[0,1],xmm1[2,3]
; SSE2-NEXT: LBB3_6: ## %else5
; SSE2-NEXT: pextrw $6, %xmm2, %eax
; SSE2-NEXT: testb $1, %al
; SSE2-NEXT: je LBB3_8
; SSE2-NEXT: ## %bb.7: ## %cond.load7
-; SSE2-NEXT: movhpd {{.*#+}} xmm1 = xmm1[0],mem[0]
+; SSE2-NEXT: movhps {{.*#+}} xmm1 = xmm1[0,1],mem[0,1]
; SSE2-NEXT: LBB3_8: ## %else8
; SSE2-NEXT: retq
;
; SSE42-NEXT: pcmpeqd %xmm0, %xmm3
; SSE42-NEXT: pextrb $0, %xmm3, %eax
; SSE42-NEXT: testb $1, %al
-; SSE42-NEXT: xorpd %xmm1, %xmm1
+; SSE42-NEXT: xorps %xmm1, %xmm1
; SSE42-NEXT: je LBB3_2
; SSE42-NEXT: ## %bb.1: ## %cond.load
; SSE42-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
-; SSE42-NEXT: xorpd %xmm1, %xmm1
+; SSE42-NEXT: xorps %xmm1, %xmm1
; SSE42-NEXT: LBB3_2: ## %else
; SSE42-NEXT: pextrb $4, %xmm3, %eax
; SSE42-NEXT: testb $1, %al
; SSE42-NEXT: je LBB3_4
; SSE42-NEXT: ## %bb.3: ## %cond.load1
-; SSE42-NEXT: movhpd {{.*#+}} xmm0 = xmm0[0],mem[0]
+; SSE42-NEXT: movhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1]
; SSE42-NEXT: LBB3_4: ## %else2
; SSE42-NEXT: pxor %xmm3, %xmm3
; SSE42-NEXT: pcmpeqd %xmm3, %xmm2
; SSE42-NEXT: testb $1, %al
; SSE42-NEXT: je LBB3_6
; SSE42-NEXT: ## %bb.5: ## %cond.load4
-; SSE42-NEXT: movlpd {{.*#+}} xmm1 = mem[0],xmm1[1]
+; SSE42-NEXT: movlps {{.*#+}} xmm1 = mem[0,1],xmm1[2,3]
; SSE42-NEXT: LBB3_6: ## %else5
; SSE42-NEXT: pextrb $12, %xmm2, %eax
; SSE42-NEXT: testb $1, %al
; SSE42-NEXT: je LBB3_8
; SSE42-NEXT: ## %bb.7: ## %cond.load7
-; SSE42-NEXT: movhpd {{.*#+}} xmm1 = xmm1[0],mem[0]
+; SSE42-NEXT: movhps {{.*#+}} xmm1 = xmm1[0,1],mem[0,1]
; SSE42-NEXT: LBB3_8: ## %else8
; SSE42-NEXT: retq
;
; SSE2-NEXT: testb $1, %al
; SSE2-NEXT: je LBB4_2
; SSE2-NEXT: ## %bb.1: ## %cond.load
-; SSE2-NEXT: movlpd {{.*#+}} xmm2 = mem[0],xmm2[1]
+; SSE2-NEXT: movlps {{.*#+}} xmm2 = mem[0,1],xmm2[2,3]
; SSE2-NEXT: LBB4_2: ## %else
; SSE2-NEXT: pextrw $4, %xmm0, %eax
; SSE2-NEXT: testb $1, %al
; SSE2-NEXT: je LBB4_4
; SSE2-NEXT: ## %bb.3: ## %cond.load1
-; SSE2-NEXT: movhpd {{.*#+}} xmm2 = xmm2[0],mem[0]
+; SSE2-NEXT: movhps {{.*#+}} xmm2 = xmm2[0,1],mem[0,1]
; SSE2-NEXT: LBB4_4: ## %else2
; SSE2-NEXT: pxor %xmm0, %xmm0
; SSE2-NEXT: pcmpeqd %xmm0, %xmm1
; SSE2-NEXT: testb $1, %al
; SSE2-NEXT: je LBB4_6
; SSE2-NEXT: ## %bb.5: ## %cond.load4
-; SSE2-NEXT: movlpd {{.*#+}} xmm3 = mem[0],xmm3[1]
+; SSE2-NEXT: movlps {{.*#+}} xmm3 = mem[0,1],xmm3[2,3]
; SSE2-NEXT: LBB4_6: ## %else5
; SSE2-NEXT: pextrw $4, %xmm0, %eax
; SSE2-NEXT: testb $1, %al
; SSE2-NEXT: je LBB4_8
; SSE2-NEXT: ## %bb.7: ## %cond.load7
-; SSE2-NEXT: movhpd {{.*#+}} xmm3 = xmm3[0],mem[0]
+; SSE2-NEXT: movhps {{.*#+}} xmm3 = xmm3[0,1],mem[0,1]
; SSE2-NEXT: LBB4_8: ## %else8
-; SSE2-NEXT: movapd %xmm2, %xmm0
-; SSE2-NEXT: movapd %xmm3, %xmm1
+; SSE2-NEXT: movaps %xmm2, %xmm0
+; SSE2-NEXT: movaps %xmm3, %xmm1
; SSE2-NEXT: retq
;
; SSE42-LABEL: load_v4f64_v4i64:
; SSE42-NEXT: testb $1, %al
; SSE42-NEXT: je LBB4_2
; SSE42-NEXT: ## %bb.1: ## %cond.load
-; SSE42-NEXT: movlpd {{.*#+}} xmm2 = mem[0],xmm2[1]
+; SSE42-NEXT: movlps {{.*#+}} xmm2 = mem[0,1],xmm2[2,3]
; SSE42-NEXT: LBB4_2: ## %else
; SSE42-NEXT: pextrb $8, %xmm4, %eax
; SSE42-NEXT: testb $1, %al
; SSE42-NEXT: je LBB4_4
; SSE42-NEXT: ## %bb.3: ## %cond.load1
-; SSE42-NEXT: movhpd {{.*#+}} xmm2 = xmm2[0],mem[0]
+; SSE42-NEXT: movhps {{.*#+}} xmm2 = xmm2[0,1],mem[0,1]
; SSE42-NEXT: LBB4_4: ## %else2
; SSE42-NEXT: pxor %xmm0, %xmm0
; SSE42-NEXT: pcmpeqq %xmm0, %xmm1
; SSE42-NEXT: testb $1, %al
; SSE42-NEXT: je LBB4_6
; SSE42-NEXT: ## %bb.5: ## %cond.load4
-; SSE42-NEXT: movlpd {{.*#+}} xmm3 = mem[0],xmm3[1]
+; SSE42-NEXT: movlps {{.*#+}} xmm3 = mem[0,1],xmm3[2,3]
; SSE42-NEXT: LBB4_6: ## %else5
; SSE42-NEXT: pextrb $8, %xmm1, %eax
; SSE42-NEXT: testb $1, %al
; SSE42-NEXT: je LBB4_8
; SSE42-NEXT: ## %bb.7: ## %cond.load7
-; SSE42-NEXT: movhpd {{.*#+}} xmm3 = xmm3[0],mem[0]
+; SSE42-NEXT: movhps {{.*#+}} xmm3 = xmm3[0,1],mem[0,1]
; SSE42-NEXT: LBB4_8: ## %else8
-; SSE42-NEXT: movapd %xmm2, %xmm0
-; SSE42-NEXT: movapd %xmm3, %xmm1
+; SSE42-NEXT: movaps %xmm2, %xmm0
+; SSE42-NEXT: movaps %xmm3, %xmm1
; SSE42-NEXT: retq
;
; AVX1-LABEL: load_v4f64_v4i64:
; SSE2-NEXT: testb $1, %al
; SSE2-NEXT: je LBB5_2
; SSE2-NEXT: ## %bb.1: ## %cond.load
-; SSE2-NEXT: movlpd {{.*#+}} xmm1 = mem[0],xmm1[1]
+; SSE2-NEXT: movlps {{.*#+}} xmm1 = mem[0,1],xmm1[2,3]
; SSE2-NEXT: LBB5_2: ## %else
; SSE2-NEXT: shrl $16, %eax
; SSE2-NEXT: testb $1, %al
; SSE2-NEXT: je LBB5_4
; SSE2-NEXT: ## %bb.3: ## %cond.load1
-; SSE2-NEXT: movhpd {{.*#+}} xmm1 = xmm1[0],mem[0]
+; SSE2-NEXT: movhps {{.*#+}} xmm1 = xmm1[0,1],mem[0,1]
; SSE2-NEXT: LBB5_4: ## %else2
; SSE2-NEXT: pxor %xmm5, %xmm5
; SSE2-NEXT: pcmpeqw %xmm0, %xmm5
; SSE2-NEXT: testb $1, %al
; SSE2-NEXT: je LBB5_6
; SSE2-NEXT: ## %bb.5: ## %cond.load4
-; SSE2-NEXT: movlpd {{.*#+}} xmm2 = mem[0],xmm2[1]
+; SSE2-NEXT: movlps {{.*#+}} xmm2 = mem[0,1],xmm2[2,3]
; SSE2-NEXT: LBB5_6: ## %else5
; SSE2-NEXT: pextrw $3, %xmm5, %eax
; SSE2-NEXT: testb $1, %al
; SSE2-NEXT: je LBB5_8
; SSE2-NEXT: ## %bb.7: ## %cond.load7
-; SSE2-NEXT: movhpd {{.*#+}} xmm2 = xmm2[0],mem[0]
+; SSE2-NEXT: movhps {{.*#+}} xmm2 = xmm2[0,1],mem[0,1]
; SSE2-NEXT: LBB5_8: ## %else8
; SSE2-NEXT: pxor %xmm5, %xmm5
; SSE2-NEXT: pcmpeqw %xmm0, %xmm5
; SSE2-NEXT: testb $1, %al
; SSE2-NEXT: je LBB5_10
; SSE2-NEXT: ## %bb.9: ## %cond.load10
-; SSE2-NEXT: movlpd {{.*#+}} xmm3 = mem[0],xmm3[1]
+; SSE2-NEXT: movlps {{.*#+}} xmm3 = mem[0,1],xmm3[2,3]
; SSE2-NEXT: LBB5_10: ## %else11
; SSE2-NEXT: pextrw $5, %xmm5, %eax
; SSE2-NEXT: testb $1, %al
; SSE2-NEXT: je LBB5_12
; SSE2-NEXT: ## %bb.11: ## %cond.load13
-; SSE2-NEXT: movhpd {{.*#+}} xmm3 = xmm3[0],mem[0]
+; SSE2-NEXT: movhps {{.*#+}} xmm3 = xmm3[0,1],mem[0,1]
; SSE2-NEXT: LBB5_12: ## %else14
; SSE2-NEXT: pxor %xmm5, %xmm5
; SSE2-NEXT: pcmpeqw %xmm5, %xmm0
; SSE2-NEXT: testb $1, %al
; SSE2-NEXT: je LBB5_14
; SSE2-NEXT: ## %bb.13: ## %cond.load16
-; SSE2-NEXT: movlpd {{.*#+}} xmm4 = mem[0],xmm4[1]
+; SSE2-NEXT: movlps {{.*#+}} xmm4 = mem[0,1],xmm4[2,3]
; SSE2-NEXT: LBB5_14: ## %else17
; SSE2-NEXT: pextrw $7, %xmm0, %eax
; SSE2-NEXT: testb $1, %al
; SSE2-NEXT: je LBB5_16
; SSE2-NEXT: ## %bb.15: ## %cond.load19
-; SSE2-NEXT: movhpd {{.*#+}} xmm4 = xmm4[0],mem[0]
+; SSE2-NEXT: movhps {{.*#+}} xmm4 = xmm4[0,1],mem[0,1]
; SSE2-NEXT: LBB5_16: ## %else20
-; SSE2-NEXT: movapd %xmm1, %xmm0
-; SSE2-NEXT: movapd %xmm2, %xmm1
-; SSE2-NEXT: movapd %xmm3, %xmm2
-; SSE2-NEXT: movapd %xmm4, %xmm3
+; SSE2-NEXT: movaps %xmm1, %xmm0
+; SSE2-NEXT: movaps %xmm2, %xmm1
+; SSE2-NEXT: movaps %xmm3, %xmm2
+; SSE2-NEXT: movaps %xmm4, %xmm3
; SSE2-NEXT: retq
;
; SSE42-LABEL: load_v8f64_v8i16:
; SSE42-NEXT: testb $1, %al
; SSE42-NEXT: je LBB5_2
; SSE42-NEXT: ## %bb.1: ## %cond.load
-; SSE42-NEXT: movlpd {{.*#+}} xmm1 = mem[0],xmm1[1]
+; SSE42-NEXT: movlps {{.*#+}} xmm1 = mem[0,1],xmm1[2,3]
; SSE42-NEXT: LBB5_2: ## %else
; SSE42-NEXT: pextrb $2, %xmm5, %eax
; SSE42-NEXT: testb $1, %al
; SSE42-NEXT: je LBB5_4
; SSE42-NEXT: ## %bb.3: ## %cond.load1
-; SSE42-NEXT: movhpd {{.*#+}} xmm1 = xmm1[0],mem[0]
+; SSE42-NEXT: movhps {{.*#+}} xmm1 = xmm1[0,1],mem[0,1]
; SSE42-NEXT: LBB5_4: ## %else2
; SSE42-NEXT: pxor %xmm5, %xmm5
; SSE42-NEXT: pcmpeqw %xmm0, %xmm5
; SSE42-NEXT: testb $1, %al
; SSE42-NEXT: je LBB5_6
; SSE42-NEXT: ## %bb.5: ## %cond.load4
-; SSE42-NEXT: movlpd {{.*#+}} xmm2 = mem[0],xmm2[1]
+; SSE42-NEXT: movlps {{.*#+}} xmm2 = mem[0,1],xmm2[2,3]
; SSE42-NEXT: LBB5_6: ## %else5
; SSE42-NEXT: pextrb $6, %xmm5, %eax
; SSE42-NEXT: testb $1, %al
; SSE42-NEXT: je LBB5_8
; SSE42-NEXT: ## %bb.7: ## %cond.load7
-; SSE42-NEXT: movhpd {{.*#+}} xmm2 = xmm2[0],mem[0]
+; SSE42-NEXT: movhps {{.*#+}} xmm2 = xmm2[0,1],mem[0,1]
; SSE42-NEXT: LBB5_8: ## %else8
; SSE42-NEXT: pxor %xmm5, %xmm5
; SSE42-NEXT: pcmpeqw %xmm0, %xmm5
; SSE42-NEXT: testb $1, %al
; SSE42-NEXT: je LBB5_10
; SSE42-NEXT: ## %bb.9: ## %cond.load10
-; SSE42-NEXT: movlpd {{.*#+}} xmm3 = mem[0],xmm3[1]
+; SSE42-NEXT: movlps {{.*#+}} xmm3 = mem[0,1],xmm3[2,3]
; SSE42-NEXT: LBB5_10: ## %else11
; SSE42-NEXT: pextrb $10, %xmm5, %eax
; SSE42-NEXT: testb $1, %al
; SSE42-NEXT: je LBB5_12
; SSE42-NEXT: ## %bb.11: ## %cond.load13
-; SSE42-NEXT: movhpd {{.*#+}} xmm3 = xmm3[0],mem[0]
+; SSE42-NEXT: movhps {{.*#+}} xmm3 = xmm3[0,1],mem[0,1]
; SSE42-NEXT: LBB5_12: ## %else14
; SSE42-NEXT: pxor %xmm5, %xmm5
; SSE42-NEXT: pcmpeqw %xmm5, %xmm0
; SSE42-NEXT: testb $1, %al
; SSE42-NEXT: je LBB5_14
; SSE42-NEXT: ## %bb.13: ## %cond.load16
-; SSE42-NEXT: movlpd {{.*#+}} xmm4 = mem[0],xmm4[1]
+; SSE42-NEXT: movlps {{.*#+}} xmm4 = mem[0,1],xmm4[2,3]
; SSE42-NEXT: LBB5_14: ## %else17
; SSE42-NEXT: pextrb $14, %xmm0, %eax
; SSE42-NEXT: testb $1, %al
; SSE42-NEXT: je LBB5_16
; SSE42-NEXT: ## %bb.15: ## %cond.load19
-; SSE42-NEXT: movhpd {{.*#+}} xmm4 = xmm4[0],mem[0]
+; SSE42-NEXT: movhps {{.*#+}} xmm4 = xmm4[0,1],mem[0,1]
; SSE42-NEXT: LBB5_16: ## %else20
-; SSE42-NEXT: movapd %xmm1, %xmm0
-; SSE42-NEXT: movapd %xmm2, %xmm1
-; SSE42-NEXT: movapd %xmm3, %xmm2
-; SSE42-NEXT: movapd %xmm4, %xmm3
+; SSE42-NEXT: movaps %xmm1, %xmm0
+; SSE42-NEXT: movaps %xmm2, %xmm1
+; SSE42-NEXT: movaps %xmm3, %xmm2
+; SSE42-NEXT: movaps %xmm4, %xmm3
; SSE42-NEXT: retq
;
; AVX1-LABEL: load_v8f64_v8i16:
; SSE2-NEXT: testb $1, %al
; SSE2-NEXT: je LBB6_2
; SSE2-NEXT: ## %bb.1: ## %cond.load
-; SSE2-NEXT: movlpd {{.*#+}} xmm4 = mem[0],xmm4[1]
+; SSE2-NEXT: movlps {{.*#+}} xmm4 = mem[0,1],xmm4[2,3]
; SSE2-NEXT: LBB6_2: ## %else
; SSE2-NEXT: psrldq {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
; SSE2-NEXT: movd %xmm0, %eax
; SSE2-NEXT: testb $1, %al
; SSE2-NEXT: je LBB6_4
; SSE2-NEXT: ## %bb.3: ## %cond.load1
-; SSE2-NEXT: movhpd {{.*#+}} xmm4 = xmm4[0],mem[0]
+; SSE2-NEXT: movhps {{.*#+}} xmm4 = xmm4[0,1],mem[0,1]
; SSE2-NEXT: LBB6_4: ## %else2
; SSE2-NEXT: pxor %xmm0, %xmm0
; SSE2-NEXT: pcmpeqd %xmm0, %xmm1
; SSE2-NEXT: testb $1, %al
; SSE2-NEXT: je LBB6_6
; SSE2-NEXT: ## %bb.5: ## %cond.load4
-; SSE2-NEXT: movlpd {{.*#+}} xmm5 = mem[0],xmm5[1]
+; SSE2-NEXT: movlps {{.*#+}} xmm5 = mem[0,1],xmm5[2,3]
; SSE2-NEXT: LBB6_6: ## %else5
; SSE2-NEXT: pextrw $4, %xmm0, %eax
; SSE2-NEXT: testb $1, %al
; SSE2-NEXT: je LBB6_8
; SSE2-NEXT: ## %bb.7: ## %cond.load7
-; SSE2-NEXT: movhpd {{.*#+}} xmm5 = xmm5[0],mem[0]
+; SSE2-NEXT: movhps {{.*#+}} xmm5 = xmm5[0,1],mem[0,1]
; SSE2-NEXT: LBB6_8: ## %else8
; SSE2-NEXT: pxor %xmm0, %xmm0
; SSE2-NEXT: pcmpeqd %xmm0, %xmm2
; SSE2-NEXT: testb $1, %al
; SSE2-NEXT: je LBB6_10
; SSE2-NEXT: ## %bb.9: ## %cond.load10
-; SSE2-NEXT: movlpd {{.*#+}} xmm6 = mem[0],xmm6[1]
+; SSE2-NEXT: movlps {{.*#+}} xmm6 = mem[0,1],xmm6[2,3]
; SSE2-NEXT: LBB6_10: ## %else11
; SSE2-NEXT: pextrw $4, %xmm0, %eax
; SSE2-NEXT: testb $1, %al
; SSE2-NEXT: je LBB6_12
; SSE2-NEXT: ## %bb.11: ## %cond.load13
-; SSE2-NEXT: movhpd {{.*#+}} xmm6 = xmm6[0],mem[0]
+; SSE2-NEXT: movhps {{.*#+}} xmm6 = xmm6[0,1],mem[0,1]
; SSE2-NEXT: LBB6_12: ## %else14
; SSE2-NEXT: pxor %xmm0, %xmm0
; SSE2-NEXT: pcmpeqd %xmm0, %xmm3
; SSE2-NEXT: testb $1, %al
; SSE2-NEXT: je LBB6_14
; SSE2-NEXT: ## %bb.13: ## %cond.load16
-; SSE2-NEXT: movlpd {{.*#+}} xmm8 = mem[0],xmm8[1]
+; SSE2-NEXT: movlps {{.*#+}} xmm8 = mem[0,1],xmm8[2,3]
; SSE2-NEXT: LBB6_14: ## %else17
; SSE2-NEXT: pextrw $4, %xmm0, %eax
; SSE2-NEXT: testb $1, %al
; SSE2-NEXT: je LBB6_16
; SSE2-NEXT: ## %bb.15: ## %cond.load19
-; SSE2-NEXT: movhpd {{.*#+}} xmm8 = xmm8[0],mem[0]
+; SSE2-NEXT: movhps {{.*#+}} xmm8 = xmm8[0,1],mem[0,1]
; SSE2-NEXT: LBB6_16: ## %else20
-; SSE2-NEXT: movapd %xmm4, %xmm0
-; SSE2-NEXT: movapd %xmm5, %xmm1
-; SSE2-NEXT: movapd %xmm6, %xmm2
-; SSE2-NEXT: movapd %xmm8, %xmm3
+; SSE2-NEXT: movaps %xmm4, %xmm0
+; SSE2-NEXT: movaps %xmm5, %xmm1
+; SSE2-NEXT: movaps %xmm6, %xmm2
+; SSE2-NEXT: movaps %xmm8, %xmm3
; SSE2-NEXT: retq
;
; SSE42-LABEL: load_v8f64_v8i64:
; SSE42-NEXT: testb $1, %al
; SSE42-NEXT: je LBB6_2
; SSE42-NEXT: ## %bb.1: ## %cond.load
-; SSE42-NEXT: movlpd {{.*#+}} xmm4 = mem[0],xmm4[1]
+; SSE42-NEXT: movlps {{.*#+}} xmm4 = mem[0,1],xmm4[2,3]
; SSE42-NEXT: LBB6_2: ## %else
; SSE42-NEXT: pextrb $8, %xmm7, %eax
; SSE42-NEXT: testb $1, %al
; SSE42-NEXT: je LBB6_4
; SSE42-NEXT: ## %bb.3: ## %cond.load1
-; SSE42-NEXT: movhpd {{.*#+}} xmm4 = xmm4[0],mem[0]
+; SSE42-NEXT: movhps {{.*#+}} xmm4 = xmm4[0,1],mem[0,1]
; SSE42-NEXT: LBB6_4: ## %else2
; SSE42-NEXT: pxor %xmm0, %xmm0
; SSE42-NEXT: pcmpeqq %xmm0, %xmm1
; SSE42-NEXT: testb $1, %al
; SSE42-NEXT: je LBB6_6
; SSE42-NEXT: ## %bb.5: ## %cond.load4
-; SSE42-NEXT: movlpd {{.*#+}} xmm5 = mem[0],xmm5[1]
+; SSE42-NEXT: movlps {{.*#+}} xmm5 = mem[0,1],xmm5[2,3]
; SSE42-NEXT: LBB6_6: ## %else5
; SSE42-NEXT: pextrb $8, %xmm1, %eax
; SSE42-NEXT: testb $1, %al
; SSE42-NEXT: je LBB6_8
; SSE42-NEXT: ## %bb.7: ## %cond.load7
-; SSE42-NEXT: movhpd {{.*#+}} xmm5 = xmm5[0],mem[0]
+; SSE42-NEXT: movhps {{.*#+}} xmm5 = xmm5[0,1],mem[0,1]
; SSE42-NEXT: LBB6_8: ## %else8
; SSE42-NEXT: pxor %xmm0, %xmm0
; SSE42-NEXT: pcmpeqq %xmm0, %xmm2
; SSE42-NEXT: testb $1, %al
; SSE42-NEXT: je LBB6_10
; SSE42-NEXT: ## %bb.9: ## %cond.load10
-; SSE42-NEXT: movlpd {{.*#+}} xmm6 = mem[0],xmm6[1]
+; SSE42-NEXT: movlps {{.*#+}} xmm6 = mem[0,1],xmm6[2,3]
; SSE42-NEXT: LBB6_10: ## %else11
; SSE42-NEXT: pextrb $8, %xmm2, %eax
; SSE42-NEXT: testb $1, %al
; SSE42-NEXT: je LBB6_12
; SSE42-NEXT: ## %bb.11: ## %cond.load13
-; SSE42-NEXT: movhpd {{.*#+}} xmm6 = xmm6[0],mem[0]
+; SSE42-NEXT: movhps {{.*#+}} xmm6 = xmm6[0,1],mem[0,1]
; SSE42-NEXT: LBB6_12: ## %else14
; SSE42-NEXT: pxor %xmm0, %xmm0
; SSE42-NEXT: pcmpeqq %xmm0, %xmm3
; SSE42-NEXT: testb $1, %al
; SSE42-NEXT: je LBB6_14
; SSE42-NEXT: ## %bb.13: ## %cond.load16
-; SSE42-NEXT: movlpd {{.*#+}} xmm8 = mem[0],xmm8[1]
+; SSE42-NEXT: movlps {{.*#+}} xmm8 = mem[0,1],xmm8[2,3]
; SSE42-NEXT: LBB6_14: ## %else17
; SSE42-NEXT: pextrb $8, %xmm3, %eax
; SSE42-NEXT: testb $1, %al
; SSE42-NEXT: je LBB6_16
; SSE42-NEXT: ## %bb.15: ## %cond.load19
-; SSE42-NEXT: movhpd {{.*#+}} xmm8 = xmm8[0],mem[0]
+; SSE42-NEXT: movhps {{.*#+}} xmm8 = xmm8[0,1],mem[0,1]
; SSE42-NEXT: LBB6_16: ## %else20
-; SSE42-NEXT: movapd %xmm4, %xmm0
-; SSE42-NEXT: movapd %xmm5, %xmm1
-; SSE42-NEXT: movapd %xmm6, %xmm2
-; SSE42-NEXT: movapd %xmm8, %xmm3
+; SSE42-NEXT: movaps %xmm4, %xmm0
+; SSE42-NEXT: movaps %xmm5, %xmm1
+; SSE42-NEXT: movaps %xmm6, %xmm2
+; SSE42-NEXT: movaps %xmm8, %xmm3
; SSE42-NEXT: retq
;
; AVX1-LABEL: load_v8f64_v8i64:
; SSE2-NEXT: testb $1, %al
; SSE2-NEXT: je LBB13_2
; SSE2-NEXT: ## %bb.1: ## %cond.load
-; SSE2-NEXT: movlpd {{.*#+}} xmm1 = mem[0],xmm1[1]
+; SSE2-NEXT: movlps {{.*#+}} xmm1 = mem[0,1],xmm1[2,3]
; SSE2-NEXT: LBB13_2: ## %else
; SSE2-NEXT: pextrw $4, %xmm0, %eax
; SSE2-NEXT: testb $1, %al
; SSE2-NEXT: je LBB13_4
; SSE2-NEXT: ## %bb.3: ## %cond.load1
; SSE2-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
-; SSE2-NEXT: unpcklpd {{.*#+}} xmm1 = xmm1[0],xmm0[0]
+; SSE2-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; SSE2-NEXT: LBB13_4: ## %else2
-; SSE2-NEXT: movapd %xmm1, %xmm0
+; SSE2-NEXT: movaps %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE42-LABEL: load_v2i64_v2i64:
; SSE2-NEXT: testb $1, %al
; SSE2-NEXT: je LBB14_2
; SSE2-NEXT: ## %bb.1: ## %cond.load
-; SSE2-NEXT: movlpd {{.*#+}} xmm2 = mem[0],xmm2[1]
+; SSE2-NEXT: movlps {{.*#+}} xmm2 = mem[0,1],xmm2[2,3]
; SSE2-NEXT: LBB14_2: ## %else
; SSE2-NEXT: pextrw $4, %xmm0, %eax
; SSE2-NEXT: testb $1, %al
; SSE2-NEXT: je LBB14_4
; SSE2-NEXT: ## %bb.3: ## %cond.load1
; SSE2-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
-; SSE2-NEXT: unpcklpd {{.*#+}} xmm2 = xmm2[0],xmm0[0]
+; SSE2-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
; SSE2-NEXT: LBB14_4: ## %else2
-; SSE2-NEXT: xorpd %xmm0, %xmm0
+; SSE2-NEXT: pxor %xmm0, %xmm0
; SSE2-NEXT: pcmpeqd %xmm0, %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,0,3,2]
; SSE2-NEXT: pand %xmm1, %xmm0
; SSE2-NEXT: testb $1, %al
; SSE2-NEXT: je LBB14_6
; SSE2-NEXT: ## %bb.5: ## %cond.load4
-; SSE2-NEXT: movlpd {{.*#+}} xmm3 = mem[0],xmm3[1]
+; SSE2-NEXT: movlps {{.*#+}} xmm3 = mem[0,1],xmm3[2,3]
; SSE2-NEXT: LBB14_6: ## %else5
; SSE2-NEXT: pextrw $4, %xmm0, %eax
; SSE2-NEXT: testb $1, %al
; SSE2-NEXT: je LBB14_8
; SSE2-NEXT: ## %bb.7: ## %cond.load7
; SSE2-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
-; SSE2-NEXT: unpcklpd {{.*#+}} xmm3 = xmm3[0],xmm0[0]
+; SSE2-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm0[0]
; SSE2-NEXT: LBB14_8: ## %else8
-; SSE2-NEXT: movapd %xmm2, %xmm0
-; SSE2-NEXT: movapd %xmm3, %xmm1
+; SSE2-NEXT: movaps %xmm2, %xmm0
+; SSE2-NEXT: movaps %xmm3, %xmm1
; SSE2-NEXT: retq
;
; SSE42-LABEL: load_v4i64_v4i64:
; SSE2-NEXT: testb $1, %al
; SSE2-NEXT: je LBB15_2
; SSE2-NEXT: ## %bb.1: ## %cond.load
-; SSE2-NEXT: movlpd {{.*#+}} xmm1 = mem[0],xmm1[1]
+; SSE2-NEXT: movlps {{.*#+}} xmm1 = mem[0,1],xmm1[2,3]
; SSE2-NEXT: LBB15_2: ## %else
; SSE2-NEXT: shrl $16, %eax
; SSE2-NEXT: testb $1, %al
; SSE2-NEXT: je LBB15_4
; SSE2-NEXT: ## %bb.3: ## %cond.load1
; SSE2-NEXT: movsd {{.*#+}} xmm5 = mem[0],zero
-; SSE2-NEXT: unpcklpd {{.*#+}} xmm1 = xmm1[0],xmm5[0]
+; SSE2-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm5[0]
; SSE2-NEXT: LBB15_4: ## %else2
-; SSE2-NEXT: xorpd %xmm5, %xmm5
+; SSE2-NEXT: pxor %xmm5, %xmm5
; SSE2-NEXT: pcmpeqw %xmm0, %xmm5
; SSE2-NEXT: pextrw $2, %xmm5, %eax
; SSE2-NEXT: testb $1, %al
; SSE2-NEXT: je LBB15_6
; SSE2-NEXT: ## %bb.5: ## %cond.load4
-; SSE2-NEXT: movlpd {{.*#+}} xmm2 = mem[0],xmm2[1]
+; SSE2-NEXT: movlps {{.*#+}} xmm2 = mem[0,1],xmm2[2,3]
; SSE2-NEXT: LBB15_6: ## %else5
; SSE2-NEXT: pextrw $3, %xmm5, %eax
; SSE2-NEXT: testb $1, %al
; SSE2-NEXT: je LBB15_8
; SSE2-NEXT: ## %bb.7: ## %cond.load7
; SSE2-NEXT: movsd {{.*#+}} xmm5 = mem[0],zero
-; SSE2-NEXT: unpcklpd {{.*#+}} xmm2 = xmm2[0],xmm5[0]
+; SSE2-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm5[0]
; SSE2-NEXT: LBB15_8: ## %else8
-; SSE2-NEXT: xorpd %xmm5, %xmm5
+; SSE2-NEXT: pxor %xmm5, %xmm5
; SSE2-NEXT: pcmpeqw %xmm0, %xmm5
; SSE2-NEXT: pextrw $4, %xmm5, %eax
; SSE2-NEXT: testb $1, %al
; SSE2-NEXT: je LBB15_10
; SSE2-NEXT: ## %bb.9: ## %cond.load10
-; SSE2-NEXT: movlpd {{.*#+}} xmm3 = mem[0],xmm3[1]
+; SSE2-NEXT: movlps {{.*#+}} xmm3 = mem[0,1],xmm3[2,3]
; SSE2-NEXT: LBB15_10: ## %else11
; SSE2-NEXT: pextrw $5, %xmm5, %eax
; SSE2-NEXT: testb $1, %al
; SSE2-NEXT: je LBB15_12
; SSE2-NEXT: ## %bb.11: ## %cond.load13
; SSE2-NEXT: movsd {{.*#+}} xmm5 = mem[0],zero
-; SSE2-NEXT: unpcklpd {{.*#+}} xmm3 = xmm3[0],xmm5[0]
+; SSE2-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm5[0]
; SSE2-NEXT: LBB15_12: ## %else14
-; SSE2-NEXT: xorpd %xmm5, %xmm5
+; SSE2-NEXT: pxor %xmm5, %xmm5
; SSE2-NEXT: pcmpeqw %xmm5, %xmm0
; SSE2-NEXT: pextrw $6, %xmm0, %eax
; SSE2-NEXT: testb $1, %al
; SSE2-NEXT: je LBB15_14
; SSE2-NEXT: ## %bb.13: ## %cond.load16
-; SSE2-NEXT: movlpd {{.*#+}} xmm4 = mem[0],xmm4[1]
+; SSE2-NEXT: movlps {{.*#+}} xmm4 = mem[0,1],xmm4[2,3]
; SSE2-NEXT: LBB15_14: ## %else17
; SSE2-NEXT: pextrw $7, %xmm0, %eax
; SSE2-NEXT: testb $1, %al
; SSE2-NEXT: je LBB15_16
; SSE2-NEXT: ## %bb.15: ## %cond.load19
; SSE2-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
-; SSE2-NEXT: unpcklpd {{.*#+}} xmm4 = xmm4[0],xmm0[0]
+; SSE2-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm0[0]
; SSE2-NEXT: LBB15_16: ## %else20
-; SSE2-NEXT: movapd %xmm1, %xmm0
-; SSE2-NEXT: movapd %xmm2, %xmm1
-; SSE2-NEXT: movapd %xmm3, %xmm2
-; SSE2-NEXT: movapd %xmm4, %xmm3
+; SSE2-NEXT: movaps %xmm1, %xmm0
+; SSE2-NEXT: movaps %xmm2, %xmm1
+; SSE2-NEXT: movaps %xmm3, %xmm2
+; SSE2-NEXT: movaps %xmm4, %xmm3
; SSE2-NEXT: retq
;
; SSE42-LABEL: load_v8i64_v8i16:
; SSE2-NEXT: testb $1, %al
; SSE2-NEXT: je LBB16_2
; SSE2-NEXT: ## %bb.1: ## %cond.load
-; SSE2-NEXT: movlpd {{.*#+}} xmm4 = mem[0],xmm4[1]
+; SSE2-NEXT: movlps {{.*#+}} xmm4 = mem[0,1],xmm4[2,3]
; SSE2-NEXT: LBB16_2: ## %else
; SSE2-NEXT: psrldq {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
; SSE2-NEXT: movd %xmm0, %eax
; SSE2-NEXT: je LBB16_4
; SSE2-NEXT: ## %bb.3: ## %cond.load1
; SSE2-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
-; SSE2-NEXT: unpcklpd {{.*#+}} xmm4 = xmm4[0],xmm0[0]
+; SSE2-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm0[0]
; SSE2-NEXT: LBB16_4: ## %else2
-; SSE2-NEXT: xorpd %xmm0, %xmm0
+; SSE2-NEXT: pxor %xmm0, %xmm0
; SSE2-NEXT: pcmpeqd %xmm0, %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,0,3,2]
; SSE2-NEXT: pand %xmm1, %xmm0
; SSE2-NEXT: testb $1, %al
; SSE2-NEXT: je LBB16_6
; SSE2-NEXT: ## %bb.5: ## %cond.load4
-; SSE2-NEXT: movlpd {{.*#+}} xmm5 = mem[0],xmm5[1]
+; SSE2-NEXT: movlps {{.*#+}} xmm5 = mem[0,1],xmm5[2,3]
; SSE2-NEXT: LBB16_6: ## %else5
; SSE2-NEXT: pextrw $4, %xmm0, %eax
; SSE2-NEXT: testb $1, %al
; SSE2-NEXT: je LBB16_8
; SSE2-NEXT: ## %bb.7: ## %cond.load7
; SSE2-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
-; SSE2-NEXT: unpcklpd {{.*#+}} xmm5 = xmm5[0],xmm0[0]
+; SSE2-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm0[0]
; SSE2-NEXT: LBB16_8: ## %else8
-; SSE2-NEXT: xorpd %xmm0, %xmm0
+; SSE2-NEXT: pxor %xmm0, %xmm0
; SSE2-NEXT: pcmpeqd %xmm0, %xmm2
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,0,3,2]
; SSE2-NEXT: pand %xmm2, %xmm0
; SSE2-NEXT: testb $1, %al
; SSE2-NEXT: je LBB16_10
; SSE2-NEXT: ## %bb.9: ## %cond.load10
-; SSE2-NEXT: movlpd {{.*#+}} xmm6 = mem[0],xmm6[1]
+; SSE2-NEXT: movlps {{.*#+}} xmm6 = mem[0,1],xmm6[2,3]
; SSE2-NEXT: LBB16_10: ## %else11
; SSE2-NEXT: pextrw $4, %xmm0, %eax
; SSE2-NEXT: testb $1, %al
; SSE2-NEXT: je LBB16_12
; SSE2-NEXT: ## %bb.11: ## %cond.load13
; SSE2-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
-; SSE2-NEXT: unpcklpd {{.*#+}} xmm6 = xmm6[0],xmm0[0]
+; SSE2-NEXT: movlhps {{.*#+}} xmm6 = xmm6[0],xmm0[0]
; SSE2-NEXT: LBB16_12: ## %else14
-; SSE2-NEXT: xorpd %xmm0, %xmm0
+; SSE2-NEXT: pxor %xmm0, %xmm0
; SSE2-NEXT: pcmpeqd %xmm0, %xmm3
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,0,3,2]
; SSE2-NEXT: pand %xmm3, %xmm0
; SSE2-NEXT: testb $1, %al
; SSE2-NEXT: je LBB16_14
; SSE2-NEXT: ## %bb.13: ## %cond.load16
-; SSE2-NEXT: movlpd {{.*#+}} xmm8 = mem[0],xmm8[1]
+; SSE2-NEXT: movlps {{.*#+}} xmm8 = mem[0,1],xmm8[2,3]
; SSE2-NEXT: LBB16_14: ## %else17
; SSE2-NEXT: pextrw $4, %xmm0, %eax
; SSE2-NEXT: testb $1, %al
; SSE2-NEXT: je LBB16_16
; SSE2-NEXT: ## %bb.15: ## %cond.load19
-; SSE2-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
-; SSE2-NEXT: unpcklpd {{.*#+}} xmm8 = xmm8[0],xmm0[0]
+; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
+; SSE2-NEXT: punpcklqdq {{.*#+}} xmm8 = xmm8[0],xmm0[0]
; SSE2-NEXT: LBB16_16: ## %else20
-; SSE2-NEXT: movapd %xmm4, %xmm0
-; SSE2-NEXT: movapd %xmm5, %xmm1
-; SSE2-NEXT: movapd %xmm6, %xmm2
-; SSE2-NEXT: movapd %xmm8, %xmm3
+; SSE2-NEXT: movaps %xmm4, %xmm0
+; SSE2-NEXT: movaps %xmm5, %xmm1
+; SSE2-NEXT: movaps %xmm6, %xmm2
+; SSE2-NEXT: movdqa %xmm8, %xmm3
; SSE2-NEXT: retq
;
; SSE42-LABEL: load_v8i64_v8i64:
define <2 x double> @mload_constmask_v2f64(<2 x double>* %addr, <2 x double> %dst) {
; SSE-LABEL: mload_constmask_v2f64:
; SSE: ## %bb.0:
-; SSE-NEXT: movhpd {{.*#+}} xmm0 = xmm0[0],mem[0]
+; SSE-NEXT: movhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1]
; SSE-NEXT: retq
;
; AVX-LABEL: mload_constmask_v2f64:
; AVX: ## %bb.0:
-; AVX-NEXT: vmovhpd {{.*#+}} xmm0 = xmm0[0],mem[0]
+; AVX-NEXT: vmovhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1]
; AVX-NEXT: retq
%res = call <2 x double> @llvm.masked.load.v2f64.p0v2f64(<2 x double>* %addr, i32 4, <2 x i1> <i1 0, i1 1>, <2 x double> %dst)
ret <2 x double> %res
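The movhpd -> movhps flips in the constant-mask tests are the mirror image: both opcodes replace bytes 8..15 of the destination register. Same kind of sketch, same assumptions as above (illustrative only, not from the patch or its tests):

    /* High-half load: _mm_loadh_pd is documented as MOVHPD,
     * _mm_loadh_pi as MOVHPS; the merged value is identical. */
    #include <emmintrin.h>
    #include <xmmintrin.h>
    #include <stdio.h>
    #include <string.h>

    int main(void) {
        double mem = 42.0;
        __m128d dst = _mm_set_pd(2.0, 1.0);

        __m128d hi_pd = _mm_loadh_pd(dst, &mem);
        __m128  hi_ps = _mm_loadh_pi(_mm_castpd_ps(dst),
                                     (const __m64 *)&mem);

        printf("%s\n", memcmp(&hi_pd, &hi_ps, 16) == 0
                           ? "identical" : "DIFFER");
        return 0;
    }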
; SSE-LABEL: mload_constmask_v4f64:
; SSE: ## %bb.0:
; SSE-NEXT: movups (%rdi), %xmm0
-; SSE-NEXT: movlpd {{.*#+}} xmm1 = mem[0],xmm1[1]
+; SSE-NEXT: movlps {{.*#+}} xmm1 = mem[0,1],xmm1[2,3]
; SSE-NEXT: retq
;
; AVX1OR2-LABEL: mload_constmask_v4f64:
define <4 x i64> @mload_constmask_v4i64(<4 x i64>* %addr, <4 x i64> %dst) {
; SSE2-LABEL: mload_constmask_v4i64:
; SSE2: ## %bb.0:
-; SSE2-NEXT: movlpd {{.*#+}} xmm0 = mem[0],xmm0[1]
+; SSE2-NEXT: movlps {{.*#+}} xmm0 = mem[0,1],xmm0[2,3]
; SSE2-NEXT: movsd {{.*#+}} xmm2 = mem[0],zero
; SSE2-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0]
; SSE2-NEXT: retq
; SSE-LABEL: mload_constmask_v8f64:
; SSE: ## %bb.0:
; SSE-NEXT: movups (%rdi), %xmm0
-; SSE-NEXT: movlpd {{.*#+}} xmm1 = mem[0],xmm1[1]
-; SSE-NEXT: movhpd {{.*#+}} xmm3 = xmm3[0],mem[0]
+; SSE-NEXT: movlps {{.*#+}} xmm1 = mem[0,1],xmm1[2,3]
+; SSE-NEXT: movhps {{.*#+}} xmm3 = xmm3[0,1],mem[0,1]
; SSE-NEXT: retq
;
; AVX1OR2-LABEL: mload_constmask_v8f64:
define <4 x i64> @load_one_mask_bit_set3(<4 x i64>* %addr, <4 x i64> %val) {
; SSE2-LABEL: load_one_mask_bit_set3:
; SSE2: ## %bb.0:
-; SSE2-NEXT: movlpd {{.*#+}} xmm1 = mem[0],xmm1[1]
+; SSE2-NEXT: movlps {{.*#+}} xmm1 = mem[0,1],xmm1[2,3]
; SSE2-NEXT: retq
;
; SSE42-LABEL: load_one_mask_bit_set3:
define <4 x double> @load_one_mask_bit_set4(<4 x double>* %addr, <4 x double> %val) {
; SSE-LABEL: load_one_mask_bit_set4:
; SSE: ## %bb.0:
-; SSE-NEXT: movhpd {{.*#+}} xmm1 = xmm1[0],mem[0]
+; SSE-NEXT: movhps {{.*#+}} xmm1 = xmm1[0,1],mem[0,1]
; SSE-NEXT: retq
;
; AVX-LABEL: load_one_mask_bit_set4:
; AVX: ## %bb.0:
; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX-NEXT: vmovhpd {{.*#+}} xmm1 = xmm1[0],mem[0]
+; AVX-NEXT: vmovhps {{.*#+}} xmm1 = xmm1[0,1],mem[0,1]
; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX-NEXT: retq
%res = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* %addr, i32 4, <4 x i1> <i1 false, i1 false, i1 false, i1 true>, <4 x double> %val)
define <8 x double> @load_one_mask_bit_set5(<8 x double>* %addr, <8 x double> %val) {
; SSE-LABEL: load_one_mask_bit_set5:
; SSE: ## %bb.0:
-; SSE-NEXT: movhpd {{.*#+}} xmm3 = xmm3[0],mem[0]
+; SSE-NEXT: movhps {{.*#+}} xmm3 = xmm3[0,1],mem[0,1]
; SSE-NEXT: retq
;
; AVX1OR2-LABEL: load_one_mask_bit_set5:
; AVX1OR2: ## %bb.0:
; AVX1OR2-NEXT: vextractf128 $1, %ymm1, %xmm2
-; AVX1OR2-NEXT: vmovhpd {{.*#+}} xmm2 = xmm2[0],mem[0]
+; AVX1OR2-NEXT: vmovhps {{.*#+}} xmm2 = xmm2[0,1],mem[0,1]
; AVX1OR2-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
; AVX1OR2-NEXT: retq
;
; AVX512-LABEL: load_one_mask_bit_set5:
; AVX512: ## %bb.0:
; AVX512-NEXT: vextractf32x4 $3, %zmm0, %xmm1
-; AVX512-NEXT: vmovhpd {{.*#+}} xmm1 = xmm1[0],mem[0]
+; AVX512-NEXT: vmovhps {{.*#+}} xmm1 = xmm1[0,1],mem[0,1]
; AVX512-NEXT: vinsertf32x4 $3, %xmm1, %zmm0, %zmm0
; AVX512-NEXT: retq
%res = call <8 x double> @llvm.masked.load.v8f64.p0v8f64(<8 x double>* %addr, i32 4, <8 x i1> <i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 true>, <8 x double> %val)
; SSE2-NEXT: testb $1, %al
; SSE2-NEXT: je LBB1_2
; SSE2-NEXT: ## %bb.1: ## %cond.store
-; SSE2-NEXT: movlpd %xmm1, (%rdi)
+; SSE2-NEXT: movlps %xmm1, (%rdi)
; SSE2-NEXT: LBB1_2: ## %else
; SSE2-NEXT: pextrw $4, %xmm0, %eax
; SSE2-NEXT: testb $1, %al
; SSE2-NEXT: je LBB1_4
; SSE2-NEXT: ## %bb.3: ## %cond.store1
-; SSE2-NEXT: movhpd %xmm1, 8(%rdi)
+; SSE2-NEXT: movhps %xmm1, 8(%rdi)
; SSE2-NEXT: LBB1_4: ## %else2
; SSE2-NEXT: retq
;
; SSE4-NEXT: testb $1, %al
; SSE4-NEXT: je LBB1_2
; SSE4-NEXT: ## %bb.1: ## %cond.store
-; SSE4-NEXT: movlpd %xmm1, (%rdi)
+; SSE4-NEXT: movlps %xmm1, (%rdi)
; SSE4-NEXT: LBB1_2: ## %else
; SSE4-NEXT: pextrb $8, %xmm2, %eax
; SSE4-NEXT: testb $1, %al
; SSE4-NEXT: je LBB1_4
; SSE4-NEXT: ## %bb.3: ## %cond.store1
-; SSE4-NEXT: movhpd %xmm1, 8(%rdi)
+; SSE4-NEXT: movhps %xmm1, 8(%rdi)
; SSE4-NEXT: LBB1_4: ## %else2
; SSE4-NEXT: retq
;
; SSE2-NEXT: testb $1, %al
; SSE2-NEXT: je LBB2_2
; SSE2-NEXT: ## %bb.1: ## %cond.store
-; SSE2-NEXT: movlpd %xmm2, (%rdi)
+; SSE2-NEXT: movlps %xmm2, (%rdi)
; SSE2-NEXT: LBB2_2: ## %else
; SSE2-NEXT: pextrw $4, %xmm0, %eax
; SSE2-NEXT: testb $1, %al
; SSE2-NEXT: je LBB2_4
; SSE2-NEXT: ## %bb.3: ## %cond.store1
-; SSE2-NEXT: movhpd %xmm2, 8(%rdi)
+; SSE2-NEXT: movhps %xmm2, 8(%rdi)
; SSE2-NEXT: LBB2_4: ## %else2
; SSE2-NEXT: pxor %xmm4, %xmm1
; SSE2-NEXT: movdqa %xmm4, %xmm0
; SSE2-NEXT: testb $1, %al
; SSE2-NEXT: je LBB2_6
; SSE2-NEXT: ## %bb.5: ## %cond.store3
-; SSE2-NEXT: movlpd %xmm3, 16(%rdi)
+; SSE2-NEXT: movlps %xmm3, 16(%rdi)
; SSE2-NEXT: LBB2_6: ## %else4
; SSE2-NEXT: pextrw $4, %xmm0, %eax
; SSE2-NEXT: testb $1, %al
; SSE2-NEXT: je LBB2_8
; SSE2-NEXT: ## %bb.7: ## %cond.store5
-; SSE2-NEXT: movhpd %xmm3, 24(%rdi)
+; SSE2-NEXT: movhps %xmm3, 24(%rdi)
; SSE2-NEXT: LBB2_8: ## %else6
; SSE2-NEXT: retq
;
; SSE4-NEXT: testb $1, %al
; SSE4-NEXT: je LBB2_2
; SSE4-NEXT: ## %bb.1: ## %cond.store
-; SSE4-NEXT: movlpd %xmm2, (%rdi)
+; SSE4-NEXT: movlps %xmm2, (%rdi)
; SSE4-NEXT: LBB2_2: ## %else
; SSE4-NEXT: pextrb $8, %xmm4, %eax
; SSE4-NEXT: testb $1, %al
; SSE4-NEXT: je LBB2_4
; SSE4-NEXT: ## %bb.3: ## %cond.store1
-; SSE4-NEXT: movhpd %xmm2, 8(%rdi)
+; SSE4-NEXT: movhps %xmm2, 8(%rdi)
; SSE4-NEXT: LBB2_4: ## %else2
; SSE4-NEXT: pxor %xmm0, %xmm0
; SSE4-NEXT: pcmpgtq %xmm1, %xmm0
; SSE4-NEXT: testb $1, %al
; SSE4-NEXT: je LBB2_6
; SSE4-NEXT: ## %bb.5: ## %cond.store3
-; SSE4-NEXT: movlpd %xmm3, 16(%rdi)
+; SSE4-NEXT: movlps %xmm3, 16(%rdi)
; SSE4-NEXT: LBB2_6: ## %else4
; SSE4-NEXT: pextrb $8, %xmm0, %eax
; SSE4-NEXT: testb $1, %al
; SSE4-NEXT: je LBB2_8
; SSE4-NEXT: ## %bb.7: ## %cond.store5
-; SSE4-NEXT: movhpd %xmm3, 24(%rdi)
+; SSE4-NEXT: movhps %xmm3, 24(%rdi)
; SSE4-NEXT: LBB2_8: ## %else6
; SSE4-NEXT: retq
;
define void @one_mask_bit_set4(<4 x double>* %addr, <4 x double> %val) {
; SSE-LABEL: one_mask_bit_set4:
; SSE: ## %bb.0:
-; SSE-NEXT: movhpd %xmm1, 24(%rdi)
+; SSE-NEXT: movhps %xmm1, 24(%rdi)
; SSE-NEXT: retq
;
; AVX-LABEL: one_mask_bit_set4:
; AVX: ## %bb.0:
; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0
-; AVX-NEXT: vmovhpd %xmm0, 24(%rdi)
+; AVX-NEXT: vmovhps %xmm0, 24(%rdi)
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> %val, <4 x double>* %addr, i32 4, <4 x i1><i1 false, i1 false, i1 false, i1 true>)
; SSE2-NEXT: testb $1, %al
; SSE2-NEXT: je LBB23_2
; SSE2-NEXT: ## %bb.1: ## %cond.store
-; SSE2-NEXT: movlpd %xmm0, (%rdi)
+; SSE2-NEXT: movlps %xmm0, (%rdi)
; SSE2-NEXT: LBB23_2: ## %else
; SSE2-NEXT: pextrw $2, %xmm2, %eax
; SSE2-NEXT: testb $1, %al
; SSE2-NEXT: je LBB23_4
; SSE2-NEXT: ## %bb.3: ## %cond.store1
-; SSE2-NEXT: movhpd %xmm0, 8(%rdi)
+; SSE2-NEXT: movhps %xmm0, 8(%rdi)
; SSE2-NEXT: LBB23_4: ## %else2
; SSE2-NEXT: pextrw $4, %xmm2, %eax
; SSE2-NEXT: testb $1, %al
; SSE2-NEXT: je LBB23_6
; SSE2-NEXT: ## %bb.5: ## %cond.store3
-; SSE2-NEXT: movlpd %xmm1, 16(%rdi)
+; SSE2-NEXT: movlps %xmm1, 16(%rdi)
; SSE2-NEXT: LBB23_6: ## %else4
; SSE2-NEXT: pextrw $6, %xmm2, %eax
; SSE2-NEXT: testb $1, %al
; SSE2-NEXT: je LBB23_8
; SSE2-NEXT: ## %bb.7: ## %cond.store5
-; SSE2-NEXT: movhpd %xmm1, 24(%rdi)
+; SSE2-NEXT: movhps %xmm1, 24(%rdi)
; SSE2-NEXT: LBB23_8: ## %else6
; SSE2-NEXT: retq
;
; SSE4-NEXT: testb $1, %al
; SSE4-NEXT: je LBB23_2
; SSE4-NEXT: ## %bb.1: ## %cond.store
-; SSE4-NEXT: movlpd %xmm0, (%rdi)
+; SSE4-NEXT: movlps %xmm0, (%rdi)
; SSE4-NEXT: LBB23_2: ## %else
; SSE4-NEXT: pextrb $4, %xmm2, %eax
; SSE4-NEXT: testb $1, %al
; SSE4-NEXT: je LBB23_4
; SSE4-NEXT: ## %bb.3: ## %cond.store1
-; SSE4-NEXT: movhpd %xmm0, 8(%rdi)
+; SSE4-NEXT: movhps %xmm0, 8(%rdi)
; SSE4-NEXT: LBB23_4: ## %else2
; SSE4-NEXT: pextrb $8, %xmm2, %eax
; SSE4-NEXT: testb $1, %al
; SSE4-NEXT: je LBB23_6
; SSE4-NEXT: ## %bb.5: ## %cond.store3
-; SSE4-NEXT: movlpd %xmm1, 16(%rdi)
+; SSE4-NEXT: movlps %xmm1, 16(%rdi)
; SSE4-NEXT: LBB23_6: ## %else4
; SSE4-NEXT: pextrb $12, %xmm2, %eax
; SSE4-NEXT: testb $1, %al
; SSE4-NEXT: je LBB23_8
; SSE4-NEXT: ## %bb.7: ## %cond.store5
-; SSE4-NEXT: movhpd %xmm1, 24(%rdi)
+; SSE4-NEXT: movhps %xmm1, 24(%rdi)
; SSE4-NEXT: LBB23_8: ## %else6
; SSE4-NEXT: retq
;
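The store-side expectations (movlpd -> movlps, movhpd -> movhps) follow the same rule: the low-half stores write bytes 0..7 of the register to memory and the high-half stores write bytes 8..15, so the stored data is unaffected by the domain choice. A last sketch under the same assumptions as the earlier ones:

    /* 64-bit half stores across domains; the same eight bytes land in
     * memory either way. Intrinsic-to-opcode mapping per Intel's guide:
     * _mm_storel_pd=MOVLPD, _mm_storel_pi=MOVLPS,
     * _mm_storeh_pd=MOVHPD, _mm_storeh_pi=MOVHPS. */
    #include <emmintrin.h>
    #include <xmmintrin.h>
    #include <stdio.h>
    #include <string.h>

    int main(void) {
        __m128d v = _mm_set_pd(2.0, 1.0);
        double lo_pd, lo_ps, hi_pd, hi_ps;

        _mm_storel_pd(&lo_pd, v);
        _mm_storel_pi((__m64 *)&lo_ps, _mm_castpd_ps(v));
        _mm_storeh_pd(&hi_pd, v);
        _mm_storeh_pi((__m64 *)&hi_ps, _mm_castpd_ps(v));

        printf("%s\n", (memcmp(&lo_pd, &lo_ps, 8) == 0 &&
                        memcmp(&hi_pd, &hi_ps, 8) == 0)
                           ? "identical" : "DIFFER");
        return 0;
    }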