From: Craig Topper Date: Sat, 6 Jul 2019 17:59:51 +0000 (+0000) Subject: [X86] Remove patterns from MOVLPSmr and MOVHPSmr instructions. X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=127061edab0ecf9a27f25af9c3f02a817c665cb2;p=llvm [X86] Remove patterns from MOVLPSmr and MOVHPSmr instructions. These patterns are the same as the MOVLPDmr and MOVHPDmr patterns, but with a bitcast at the end. We can just select the PD instruction and let execution domain fixing switch to PS. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@365267 91177308-0d34-0410-b5e6-96231b3b80d8 --- diff --git a/lib/Target/X86/X86InstrAVX512.td b/lib/Target/X86/X86InstrAVX512.td index 02f5af438b6..78ce2e339ec 100644 --- a/lib/Target/X86/X86InstrAVX512.td +++ b/lib/Target/X86/X86InstrAVX512.td @@ -6362,14 +6362,11 @@ let Predicates = [HasAVX512] in { } let SchedRW = [WriteFStore] in { +let mayStore = 1, hasSideEffects = 0 in def VMOVHPSZ128mr : AVX512PSI<0x17, MRMDestMem, (outs), (ins f64mem:$dst, VR128X:$src), "vmovhps\t{$src, $dst|$dst, $src}", - [(store (f64 (extractelt - (X86Unpckh (bc_v2f64 (v4f32 VR128X:$src)), - (bc_v2f64 (v4f32 VR128X:$src))), - (iPTR 0))), addr:$dst)]>, - EVEX, EVEX_CD8<32, CD8VT2>; + []>, EVEX, EVEX_CD8<32, CD8VT2>; def VMOVHPDZ128mr : AVX512PDI<0x17, MRMDestMem, (outs), (ins f64mem:$dst, VR128X:$src), "vmovhpd\t{$src, $dst|$dst, $src}", @@ -6377,12 +6374,11 @@ def VMOVHPDZ128mr : AVX512PDI<0x17, MRMDestMem, (outs), (v2f64 (X86Unpckh VR128X:$src, VR128X:$src)), (iPTR 0))), addr:$dst)]>, EVEX, EVEX_CD8<64, CD8VT1>, VEX_W; +let mayStore = 1, hasSideEffects = 0 in def VMOVLPSZ128mr : AVX512PSI<0x13, MRMDestMem, (outs), (ins f64mem:$dst, VR128X:$src), "vmovlps\t{$src, $dst|$dst, $src}", - [(store (f64 (extractelt (bc_v2f64 (v4f32 VR128X:$src)), - (iPTR 0))), addr:$dst)]>, - EVEX, EVEX_CD8<32, CD8VT2>; + []>, EVEX, EVEX_CD8<32, CD8VT2>; def VMOVLPDZ128mr : AVX512PDI<0x13, MRMDestMem, (outs), (ins f64mem:$dst, VR128X:$src), "vmovlpd\t{$src, $dst|$dst, $src}", diff --git a/lib/Target/X86/X86InstrInfo.cpp b/lib/Target/X86/X86InstrInfo.cpp index 2e868a60ce4..acb9128db79 100644 --- a/lib/Target/X86/X86InstrInfo.cpp +++ b/lib/Target/X86/X86InstrInfo.cpp @@ -5962,6 +5962,19 @@ static const uint16_t ReplaceableInstrsAVX2[][3] = { { X86::VUNPCKHPSYrr, X86::VUNPCKHPSYrr, X86::VPUNPCKHDQYrr }, }; +static const uint16_t ReplaceableInstrsFP[][3] = { + //PackedSingle PackedDouble + { X86::MOVLPSrm, X86::MOVLPDrm, X86::INSTRUCTION_LIST_END }, + { X86::MOVHPSrm, X86::MOVHPDrm, X86::INSTRUCTION_LIST_END }, + { X86::MOVHPSmr, X86::MOVHPDmr, X86::INSTRUCTION_LIST_END }, + { X86::VMOVLPSrm, X86::VMOVLPDrm, X86::INSTRUCTION_LIST_END }, + { X86::VMOVHPSrm, X86::VMOVHPDrm, X86::INSTRUCTION_LIST_END }, + { X86::VMOVHPSmr, X86::VMOVHPDmr, X86::INSTRUCTION_LIST_END }, + { X86::VMOVLPSZ128rm, X86::VMOVLPDZ128rm, X86::INSTRUCTION_LIST_END }, + { X86::VMOVHPSZ128rm, X86::VMOVHPDZ128rm, X86::INSTRUCTION_LIST_END }, + { X86::VMOVHPSZ128mr, X86::VMOVHPDZ128mr, X86::INSTRUCTION_LIST_END }, +}; + static const uint16_t ReplaceableInstrsAVX2InsertExtract[][3] = { //PackedSingle PackedDouble PackedInt { X86::VEXTRACTF128mr, X86::VEXTRACTF128mr, X86::VEXTRACTI128mr }, @@ -6202,7 +6215,7 @@ static const uint16_t ReplaceableInstrsAVX512DQMasked[][4] = { }; // NOTE: These should only be used by the custom domain methods. 
-static const uint16_t ReplaceableCustomInstrs[][3] = { +static const uint16_t ReplaceableBlendInstrs[][3] = { //PackedSingle PackedDouble PackedInt { X86::BLENDPSrmi, X86::BLENDPDrmi, X86::PBLENDWrmi }, { X86::BLENDPSrri, X86::BLENDPDrri, X86::PBLENDWrri }, @@ -6211,7 +6224,7 @@ static const uint16_t ReplaceableCustomInstrs[][3] = { { X86::VBLENDPSYrmi, X86::VBLENDPDYrmi, X86::VPBLENDWYrmi }, { X86::VBLENDPSYrri, X86::VBLENDPDYrri, X86::VPBLENDWYrri }, }; -static const uint16_t ReplaceableCustomAVX2Instrs[][3] = { +static const uint16_t ReplaceableBlendAVX2Instrs[][3] = { //PackedSingle PackedDouble PackedInt { X86::VBLENDPSrmi, X86::VBLENDPDrmi, X86::VPBLENDDrmi }, { X86::VBLENDPSrri, X86::VBLENDPDrri, X86::VPBLENDDrri }, @@ -6405,9 +6418,9 @@ bool X86InstrInfo::setExecutionDomainCustom(MachineInstr &MI, Imm = (ImmWidth == 16 ? ((Imm << 8) | Imm) : Imm); unsigned NewImm = Imm; - const uint16_t *table = lookup(Opcode, dom, ReplaceableCustomInstrs); + const uint16_t *table = lookup(Opcode, dom, ReplaceableBlendInstrs); if (!table) - table = lookup(Opcode, dom, ReplaceableCustomAVX2Instrs); + table = lookup(Opcode, dom, ReplaceableBlendAVX2Instrs); if (Domain == 1) { // PackedSingle AdjustBlendMask(Imm, ImmWidth, Is256 ? 8 : 4, &NewImm); @@ -6417,7 +6430,7 @@ bool X86InstrInfo::setExecutionDomainCustom(MachineInstr &MI, if (Subtarget.hasAVX2()) { // If we are already VPBLENDW use that, else use VPBLENDD. if ((ImmWidth / (Is256 ? 2 : 1)) != 8) { - table = lookup(Opcode, dom, ReplaceableCustomAVX2Instrs); + table = lookup(Opcode, dom, ReplaceableBlendAVX2Instrs); AdjustBlendMask(Imm, ImmWidth, Is256 ? 8 : 4, &NewImm); } } else { @@ -6525,6 +6538,8 @@ X86InstrInfo::getExecutionDomain(const MachineInstr &MI) const { validDomains = 0xe; } else if (lookup(opcode, domain, ReplaceableInstrsAVX2)) { validDomains = Subtarget.hasAVX2() ? 0xe : 0x6; + } else if (lookup(opcode, domain, ReplaceableInstrsFP)) { + validDomains = 0x6; } else if (lookup(opcode, domain, ReplaceableInstrsAVX2InsertExtract)) { // Insert/extract instructions should only effect domain if AVX2 // is enabled. 
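
The new ReplaceableInstrsFP table above is what lets the MOVLPS/MOVHPS store
patterns go away: instruction selection now produces the PD form for these
stores, and the execution-domain pass flips the opcode to the PS column when
the PackedSingle domain is chosen. As a rough, self-contained sketch of that
table lookup (the two-column layout, the FakeOpcode names, and the
lookupRow/setDomain helpers below are illustrative stand-ins, not the real
X86InstrInfo code, which also carries a PackedInt column that is simply
INSTRUCTION_LIST_END for these rows):

// Hypothetical, simplified model of the table-driven domain switch.
#include <cstdint>
#include <cstdio>

enum FakeOpcode : uint16_t { MOVLPSmr, MOVLPDmr, MOVHPSmr, MOVHPDmr };

// Column 0 = PackedSingle, column 1 = PackedDouble.
static const uint16_t ReplaceableFP[][2] = {
    { MOVLPSmr, MOVLPDmr },
    { MOVHPSmr, MOVHPDmr },
};

// Return the row that mentions Opcode in either column, or nullptr.
static const uint16_t *lookupRow(uint16_t Opcode) {
  for (const auto &Row : ReplaceableFP)
    if (Row[0] == Opcode || Row[1] == Opcode)
      return Row;
  return nullptr;
}

// Domain 1 = PackedSingle, 2 = PackedDouble (cf. validDomains = 0x6 above).
static uint16_t setDomain(uint16_t Opcode, unsigned Domain) {
  const uint16_t *Row = lookupRow(Opcode);
  return Row ? Row[Domain - 1] : Opcode; // non-replaceable opcodes pass through
}

int main() {
  // Isel picks the PD store; the domain pass may rewrite it to its PS twin.
  std::printf("%d -> %d\n", (int)MOVHPDmr,
              (int)setDomain(MOVHPDmr, /*PackedSingle=*/1));
}

Because only the PS and PD columns are populated for these opcodes,
getExecutionDomain reports validDomains = 0x6 (domains 1 and 2) and
setExecutionDomain can assert Domain < 3, as the next hunk shows.
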
@@ -6564,6 +6579,11 @@ void X86InstrInfo::setExecutionDomain(MachineInstr &MI, unsigned Domain) const { "256-bit vector operations only available in AVX2"); table = lookup(MI.getOpcode(), dom, ReplaceableInstrsAVX2); } + if (!table) { // try the FP table + table = lookup(MI.getOpcode(), dom, ReplaceableInstrsFP); + assert((!table || Domain < 3) && + "Can only select PackedSingle or PackedDouble"); + } if (!table) { // try the other table assert(Subtarget.hasAVX2() && "256-bit insert/extract only available in AVX2"); diff --git a/lib/Target/X86/X86InstrSSE.td b/lib/Target/X86/X86InstrSSE.td index 76530adc152..ea14fb0600a 100644 --- a/lib/Target/X86/X86InstrSSE.td +++ b/lib/Target/X86/X86InstrSSE.td @@ -635,10 +635,10 @@ defm MOVL : sse12_mov_hilo_packed<0x12, X86Movsd, "movlp">; let SchedRW = [WriteFStore] in { let Predicates = [UseAVX] in { +let mayStore = 1, hasSideEffects = 0 in def VMOVLPSmr : VPSI<0x13, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src), "movlps\t{$src, $dst|$dst, $src}", - [(store (f64 (extractelt (bc_v2f64 (v4f32 VR128:$src)), - (iPTR 0))), addr:$dst)]>, + []>, VEX, VEX_WIG; def VMOVLPDmr : VPDI<0x13, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src), "movlpd\t{$src, $dst|$dst, $src}", @@ -646,10 +646,10 @@ def VMOVLPDmr : VPDI<0x13, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src), (iPTR 0))), addr:$dst)]>, VEX, VEX_WIG; }// UseAVX +let mayStore = 1, hasSideEffects = 0 in def MOVLPSmr : PSI<0x13, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src), "movlps\t{$src, $dst|$dst, $src}", - [(store (f64 (extractelt (bc_v2f64 (v4f32 VR128:$src)), - (iPTR 0))), addr:$dst)]>; + []>; def MOVLPDmr : PDI<0x13, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src), "movlpd\t{$src, $dst|$dst, $src}", [(store (f64 (extractelt (v2f64 VR128:$src), @@ -675,24 +675,20 @@ let SchedRW = [WriteFStore] in { // v2f64 extract element 1 is always custom lowered to unpack high to low // and extract element 0 so the non-store version isn't too horrible. 
let Predicates = [UseAVX] in { +let mayStore = 1, hasSideEffects = 0 in def VMOVHPSmr : VPSI<0x17, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src), "movhps\t{$src, $dst|$dst, $src}", - [(store (f64 (extractelt - (X86Unpckh (bc_v2f64 (v4f32 VR128:$src)), - (bc_v2f64 (v4f32 VR128:$src))), - (iPTR 0))), addr:$dst)]>, VEX, VEX_WIG; + []>, VEX, VEX_WIG; def VMOVHPDmr : VPDI<0x17, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src), "movhpd\t{$src, $dst|$dst, $src}", [(store (f64 (extractelt (v2f64 (X86Unpckh VR128:$src, VR128:$src)), (iPTR 0))), addr:$dst)]>, VEX, VEX_WIG; } // UseAVX +let mayStore = 1, hasSideEffects = 0 in def MOVHPSmr : PSI<0x17, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src), "movhps\t{$src, $dst|$dst, $src}", - [(store (f64 (extractelt - (X86Unpckh (bc_v2f64 (v4f32 VR128:$src)), - (bc_v2f64 (v4f32 VR128:$src))), - (iPTR 0))), addr:$dst)]>; + []>; def MOVHPDmr : PDI<0x17, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src), "movhpd\t{$src, $dst|$dst, $src}", [(store (f64 (extractelt diff --git a/test/CodeGen/X86/avx2-masked-gather.ll b/test/CodeGen/X86/avx2-masked-gather.ll index 62a9105993f..f220c7959e4 100644 --- a/test/CodeGen/X86/avx2-masked-gather.ll +++ b/test/CodeGen/X86/avx2-masked-gather.ll @@ -634,15 +634,15 @@ define <4 x double> @masked_gather_v4double(<4 x double*>* %ptr, <4 x i1> %masks ; NOGATHER-NEXT: # %bb.1: # %cond.load ; NOGATHER-NEXT: vmovq %xmm2, %rax ; NOGATHER-NEXT: vmovsd {{.*#+}} xmm3 = mem[0],zero -; NOGATHER-NEXT: vblendpd {{.*#+}} ymm1 = ymm3[0],ymm1[1,2,3] +; NOGATHER-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1],ymm1[2,3,4,5,6,7] ; NOGATHER-NEXT: .LBB9_2: # %else ; NOGATHER-NEXT: vpextrb $4, %xmm0, %eax ; NOGATHER-NEXT: testb $1, %al ; NOGATHER-NEXT: je .LBB9_4 ; NOGATHER-NEXT: # %bb.3: # %cond.load1 ; NOGATHER-NEXT: vpextrq $1, %xmm2, %rax -; NOGATHER-NEXT: vmovhpd {{.*#+}} xmm3 = xmm1[0],mem[0] -; NOGATHER-NEXT: vblendpd {{.*#+}} ymm1 = ymm3[0,1],ymm1[2,3] +; NOGATHER-NEXT: vmovhps {{.*#+}} xmm3 = xmm1[0,1],mem[0,1] +; NOGATHER-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] ; NOGATHER-NEXT: .LBB9_4: # %else2 ; NOGATHER-NEXT: vpextrb $8, %xmm0, %eax ; NOGATHER-NEXT: vextractf128 $1, %ymm2, %xmm2 @@ -651,7 +651,7 @@ define <4 x double> @masked_gather_v4double(<4 x double*>* %ptr, <4 x i1> %masks ; NOGATHER-NEXT: # %bb.5: # %cond.load4 ; NOGATHER-NEXT: vmovq %xmm2, %rax ; NOGATHER-NEXT: vextractf128 $1, %ymm1, %xmm3 -; NOGATHER-NEXT: vmovlpd {{.*#+}} xmm3 = mem[0],xmm3[1] +; NOGATHER-NEXT: vmovlps {{.*#+}} xmm3 = mem[0,1],xmm3[2,3] ; NOGATHER-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 ; NOGATHER-NEXT: .LBB9_6: # %else5 ; NOGATHER-NEXT: vpextrb $12, %xmm0, %eax @@ -660,10 +660,10 @@ define <4 x double> @masked_gather_v4double(<4 x double*>* %ptr, <4 x i1> %masks ; NOGATHER-NEXT: # %bb.7: # %cond.load7 ; NOGATHER-NEXT: vpextrq $1, %xmm2, %rax ; NOGATHER-NEXT: vextractf128 $1, %ymm1, %xmm0 -; NOGATHER-NEXT: vmovhpd {{.*#+}} xmm0 = xmm0[0],mem[0] +; NOGATHER-NEXT: vmovhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1] ; NOGATHER-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm1 ; NOGATHER-NEXT: .LBB9_8: # %else8 -; NOGATHER-NEXT: vmovapd %ymm1, %ymm0 +; NOGATHER-NEXT: vmovaps %ymm1, %ymm0 ; NOGATHER-NEXT: retq entry: %ld = load <4 x double*>, <4 x double*>* %ptr @@ -744,16 +744,16 @@ define <2 x double> @masked_gather_v2double(<2 x double*>* %ptr, <2 x i1> %masks ; NOGATHER-NEXT: je .LBB11_2 ; NOGATHER-NEXT: # %bb.1: # %cond.load ; NOGATHER-NEXT: vmovq %xmm2, %rax -; NOGATHER-NEXT: vmovlpd {{.*#+}} xmm1 = mem[0],xmm1[1] +; NOGATHER-NEXT: vmovlps {{.*#+}} xmm1 = 
mem[0,1],xmm1[2,3] ; NOGATHER-NEXT: .LBB11_2: # %else ; NOGATHER-NEXT: vpextrb $8, %xmm0, %eax ; NOGATHER-NEXT: testb $1, %al ; NOGATHER-NEXT: je .LBB11_4 ; NOGATHER-NEXT: # %bb.3: # %cond.load1 ; NOGATHER-NEXT: vpextrq $1, %xmm2, %rax -; NOGATHER-NEXT: vmovhpd {{.*#+}} xmm1 = xmm1[0],mem[0] +; NOGATHER-NEXT: vmovhps {{.*#+}} xmm1 = xmm1[0,1],mem[0,1] ; NOGATHER-NEXT: .LBB11_4: # %else2 -; NOGATHER-NEXT: vmovapd %xmm1, %xmm0 +; NOGATHER-NEXT: vmovaps %xmm1, %xmm0 ; NOGATHER-NEXT: retq entry: %ld = load <2 x double*>, <2 x double*>* %ptr diff --git a/test/CodeGen/X86/avx512-insert-extract.ll b/test/CodeGen/X86/avx512-insert-extract.ll index c73be41ef96..e197d278bd7 100644 --- a/test/CodeGen/X86/avx512-insert-extract.ll +++ b/test/CodeGen/X86/avx512-insert-extract.ll @@ -21,10 +21,10 @@ define <16 x float> @test1(<16 x float> %x, float* %br, float %y) nounwind { define <8 x double> @test2(<8 x double> %x, double* %br, double %y) nounwind { ; CHECK-LABEL: test2: ; CHECK: ## %bb.0: -; CHECK-NEXT: vmovhpd {{.*#+}} xmm2 = xmm0[0],mem[0] +; CHECK-NEXT: vmovhps {{.*#+}} xmm2 = xmm0[0,1],mem[0,1] ; CHECK-NEXT: vinsertf32x4 $0, %xmm2, %zmm0, %zmm2 ; CHECK-NEXT: vextractf32x4 $3, %zmm0, %xmm0 -; CHECK-NEXT: vblendpd {{.*#+}} xmm0 = xmm1[0],xmm0[1] +; CHECK-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] ; CHECK-NEXT: vinsertf32x4 $3, %xmm0, %zmm2, %zmm0 ; CHECK-NEXT: retq %rrr = load double, double* %br diff --git a/test/CodeGen/X86/avx512-masked-memop-64-32.ll b/test/CodeGen/X86/avx512-masked-memop-64-32.ll index f199cb097aa..35374c880b7 100644 --- a/test/CodeGen/X86/avx512-masked-memop-64-32.ll +++ b/test/CodeGen/X86/avx512-masked-memop-64-32.ll @@ -75,7 +75,7 @@ define <8 x double> @load_one_mask_bit_set5(<8 x double>* %addr, <8 x double> %v ; AVX512-LABEL: load_one_mask_bit_set5: ; AVX512: ## %bb.0: ; AVX512-NEXT: vextractf32x4 $3, %zmm0, %xmm1 -; AVX512-NEXT: vmovhpd {{.*#+}} xmm1 = xmm1[0],mem[0] +; AVX512-NEXT: vmovhps {{.*#+}} xmm1 = xmm1[0,1],mem[0,1] ; AVX512-NEXT: vinsertf32x4 $3, %xmm1, %zmm0, %zmm0 ; AVX512-NEXT: retq %res = call <8 x double> @llvm.masked.load.v8f64.p0v8f64(<8 x double>* %addr, i32 4, <8 x i1>, <8 x double> %val) diff --git a/test/CodeGen/X86/dag-merge-fast-accesses.ll b/test/CodeGen/X86/dag-merge-fast-accesses.ll index 662f74f9754..5099aab2a57 100644 --- a/test/CodeGen/X86/dag-merge-fast-accesses.ll +++ b/test/CodeGen/X86/dag-merge-fast-accesses.ll @@ -35,8 +35,8 @@ define void @merge_vec_element_store(<4 x double> %v, double* %ptr) { ; ; SLOW-LABEL: merge_vec_element_store: ; SLOW: # %bb.0: -; SLOW-NEXT: movlpd %xmm0, (%rdi) -; SLOW-NEXT: movhpd %xmm0, 8(%rdi) +; SLOW-NEXT: movlps %xmm0, (%rdi) +; SLOW-NEXT: movhps %xmm0, 8(%rdi) ; SLOW-NEXT: retq %vecext0 = extractelement <4 x double> %v, i32 0 diff --git a/test/CodeGen/X86/extract-store.ll b/test/CodeGen/X86/extract-store.ll index 41c2f5c495b..4d557c94226 100644 --- a/test/CodeGen/X86/extract-store.ll +++ b/test/CodeGen/X86/extract-store.ll @@ -486,23 +486,23 @@ define void @extract_f64_1(double* nocapture %dst, <2 x double> %foo) nounwind { ; SSE-X32-LABEL: extract_f64_1: ; SSE-X32: # %bb.0: ; SSE-X32-NEXT: movl {{[0-9]+}}(%esp), %eax -; SSE-X32-NEXT: movhpd %xmm0, (%eax) +; SSE-X32-NEXT: movhps %xmm0, (%eax) ; SSE-X32-NEXT: retl ; ; SSE-X64-LABEL: extract_f64_1: ; SSE-X64: # %bb.0: -; SSE-X64-NEXT: movhpd %xmm0, (%rdi) +; SSE-X64-NEXT: movhps %xmm0, (%rdi) ; SSE-X64-NEXT: retq ; ; AVX-X32-LABEL: extract_f64_1: ; AVX-X32: # %bb.0: ; AVX-X32-NEXT: movl {{[0-9]+}}(%esp), %eax -; AVX-X32-NEXT: vmovhpd %xmm0, (%eax) 
+; AVX-X32-NEXT: vmovhps %xmm0, (%eax) ; AVX-X32-NEXT: retl ; ; AVX-X64-LABEL: extract_f64_1: ; AVX-X64: # %bb.0: -; AVX-X64-NEXT: vmovhpd %xmm0, (%rdi) +; AVX-X64-NEXT: vmovhps %xmm0, (%rdi) ; AVX-X64-NEXT: retq %vecext = extractelement <2 x double> %foo, i32 1 store double %vecext, double* %dst, align 1 diff --git a/test/CodeGen/X86/extractelement-load.ll b/test/CodeGen/X86/extractelement-load.ll index f1d31b83a77..915b9bfa8d1 100644 --- a/test/CodeGen/X86/extractelement-load.ll +++ b/test/CodeGen/X86/extractelement-load.ll @@ -46,8 +46,8 @@ define void @t3(<2 x double>* %a0) { ; X32-SSE2-LABEL: t3: ; X32-SSE2: # %bb.0: # %bb ; X32-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-SSE2-NEXT: movupd (%eax), %xmm0 -; X32-SSE2-NEXT: movhpd %xmm0, (%eax) +; X32-SSE2-NEXT: movups (%eax), %xmm0 +; X32-SSE2-NEXT: movhps %xmm0, (%eax) ; X32-SSE2-NEXT: retl ; ; X64-SSSE3-LABEL: t3: diff --git a/test/CodeGen/X86/fma.ll b/test/CodeGen/X86/fma.ll index fa4dd3db189..78ee863d19f 100644 --- a/test/CodeGen/X86/fma.ll +++ b/test/CodeGen/X86/fma.ll @@ -1446,9 +1446,9 @@ define <2 x double> @test_v2f64(<2 x double> %a, <2 x double> %b, <2 x double> % ; FMACALL32_BDVER2-NEXT: vmovups %xmm0, (%esp) ## encoding: [0xc5,0xf8,0x11,0x04,0x24] ; FMACALL32_BDVER2-NEXT: calll _fma ## encoding: [0xe8,A,A,A,A] ; FMACALL32_BDVER2-NEXT: ## fixup A - offset: 1, value: _fma-4, kind: FK_PCRel_4 -; FMACALL32_BDVER2-NEXT: vmovapd {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 ## 16-byte Reload -; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xf9,0x28,0x44,0x24,0x30] -; FMACALL32_BDVER2-NEXT: vmovhpd %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc5,0xf9,0x17,0x44,0x24,0x10] +; FMACALL32_BDVER2-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 ## 16-byte Reload +; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xf8,0x28,0x44,0x24,0x30] +; FMACALL32_BDVER2-NEXT: vmovhps %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc5,0xf8,0x17,0x44,0x24,0x10] ; FMACALL32_BDVER2-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 ## 16-byte Reload ; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xf8,0x28,0x44,0x24,0x40] ; FMACALL32_BDVER2-NEXT: vmovlps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0, %xmm0 ## 16-byte Folded Reload @@ -1461,8 +1461,8 @@ define <2 x double> @test_v2f64(<2 x double> %a, <2 x double> %b, <2 x double> % ; FMACALL32_BDVER2-NEXT: fstpl {{[0-9]+}}(%esp) ## encoding: [0xdd,0x5c,0x24,0x20] ; FMACALL32_BDVER2-NEXT: vmovsd {{[0-9]+}}(%esp), %xmm0 ## encoding: [0xc5,0xfb,0x10,0x44,0x24,0x28] ; FMACALL32_BDVER2-NEXT: ## xmm0 = mem[0],zero -; FMACALL32_BDVER2-NEXT: vmovhpd {{[0-9]+}}(%esp), %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0x16,0x44,0x24,0x20] -; FMACALL32_BDVER2-NEXT: ## xmm0 = xmm0[0],mem[0] +; FMACALL32_BDVER2-NEXT: vmovhps {{[0-9]+}}(%esp), %xmm0, %xmm0 ## encoding: [0xc5,0xf8,0x16,0x44,0x24,0x20] +; FMACALL32_BDVER2-NEXT: ## xmm0 = xmm0[0,1],mem[0,1] ; FMACALL32_BDVER2-NEXT: addl $108, %esp ## encoding: [0x83,0xc4,0x6c] ; FMACALL32_BDVER2-NEXT: retl ## encoding: [0xc3] entry: @@ -1596,9 +1596,9 @@ define <4 x double> @test_v4f64(<4 x double> %a, <4 x double> %b, <4 x double> % ; FMACALL32_BDVER2-NEXT: ## fixup A - offset: 1, value: _fma-4, kind: FK_PCRel_4 ; FMACALL32_BDVER2-NEXT: fstpt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Spill ; FMACALL32_BDVER2-NEXT: ## encoding: [0xdb,0x7c,0x24,0x44] -; FMACALL32_BDVER2-NEXT: vmovupd {{[-0-9]+}}(%e{{[sb]}}p), %ymm0 ## 32-byte Reload -; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xfd,0x10,0x84,0x24,0xc0,0x00,0x00,0x00] -; FMACALL32_BDVER2-NEXT: vmovhpd %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc5,0xf9,0x17,0x44,0x24,0x10] +; 
FMACALL32_BDVER2-NEXT: vmovups {{[-0-9]+}}(%e{{[sb]}}p), %ymm0 ## 32-byte Reload +; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xfc,0x10,0x84,0x24,0xc0,0x00,0x00,0x00] +; FMACALL32_BDVER2-NEXT: vmovhps %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc5,0xf8,0x17,0x44,0x24,0x10] ; FMACALL32_BDVER2-NEXT: vmovups {{[-0-9]+}}(%e{{[sb]}}p), %ymm0 ## 32-byte Reload ; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xfc,0x10,0x84,0x24,0x80,0x00,0x00,0x00] ; FMACALL32_BDVER2-NEXT: vunpckhpd {{[-0-9]+}}(%e{{[sb]}}p), %xmm0, %xmm0 ## 16-byte Folded Reload @@ -1622,9 +1622,9 @@ define <4 x double> @test_v4f64(<4 x double> %a, <4 x double> %b, <4 x double> % ; FMACALL32_BDVER2-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77] ; FMACALL32_BDVER2-NEXT: calll _fma ## encoding: [0xe8,A,A,A,A] ; FMACALL32_BDVER2-NEXT: ## fixup A - offset: 1, value: _fma-4, kind: FK_PCRel_4 -; FMACALL32_BDVER2-NEXT: vmovapd {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 ## 16-byte Reload -; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xf9,0x28,0x44,0x24,0x70] -; FMACALL32_BDVER2-NEXT: vmovhpd %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc5,0xf9,0x17,0x44,0x24,0x10] +; FMACALL32_BDVER2-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 ## 16-byte Reload +; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xf8,0x28,0x44,0x24,0x70] +; FMACALL32_BDVER2-NEXT: vmovhps %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc5,0xf8,0x17,0x44,0x24,0x10] ; FMACALL32_BDVER2-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 ## 16-byte Reload ; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xf8,0x28,0x44,0x24,0x50] ; FMACALL32_BDVER2-NEXT: vmovlps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0, %xmm0 ## 16-byte Folded Reload @@ -1645,10 +1645,10 @@ define <4 x double> @test_v4f64(<4 x double> %a, <4 x double> %b, <4 x double> % ; FMACALL32_BDVER2-NEXT: ## xmm0 = mem[0],zero ; FMACALL32_BDVER2-NEXT: vmovsd {{[0-9]+}}(%esp), %xmm1 ## encoding: [0xc5,0xfb,0x10,0x4c,0x24,0x20] ; FMACALL32_BDVER2-NEXT: ## xmm1 = mem[0],zero -; FMACALL32_BDVER2-NEXT: vmovhpd {{[0-9]+}}(%esp), %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0x16,0x44,0x24,0x28] -; FMACALL32_BDVER2-NEXT: ## xmm0 = xmm0[0],mem[0] -; FMACALL32_BDVER2-NEXT: vmovhpd {{[0-9]+}}(%esp), %xmm1, %xmm1 ## encoding: [0xc5,0xf1,0x16,0x4c,0x24,0x18] -; FMACALL32_BDVER2-NEXT: ## xmm1 = xmm1[0],mem[0] +; FMACALL32_BDVER2-NEXT: vmovhps {{[0-9]+}}(%esp), %xmm0, %xmm0 ## encoding: [0xc5,0xf8,0x16,0x44,0x24,0x28] +; FMACALL32_BDVER2-NEXT: ## xmm0 = xmm0[0,1],mem[0,1] +; FMACALL32_BDVER2-NEXT: vmovhps {{[0-9]+}}(%esp), %xmm1, %xmm1 ## encoding: [0xc5,0xf0,0x16,0x4c,0x24,0x18] +; FMACALL32_BDVER2-NEXT: ## xmm1 = xmm1[0,1],mem[0,1] ; FMACALL32_BDVER2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe3,0x7d,0x18,0xc1,0x01] ; FMACALL32_BDVER2-NEXT: addl $252, %esp ## encoding: [0x81,0xc4,0xfc,0x00,0x00,0x00] ; FMACALL32_BDVER2-NEXT: retl ## encoding: [0xc3] @@ -1856,10 +1856,10 @@ define <8 x double> @test_v8f64(<8 x double> %a, <8 x double> %b, <8 x double> % ; FMACALL32_BDVER2-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77] ; FMACALL32_BDVER2-NEXT: calll _fma ## encoding: [0xe8,A,A,A,A] ; FMACALL32_BDVER2-NEXT: ## fixup A - offset: 1, value: _fma-4, kind: FK_PCRel_4 -; FMACALL32_BDVER2-NEXT: vmovapd 40(%ebp), %xmm0 ## encoding: [0xc5,0xf9,0x28,0x45,0x28] +; FMACALL32_BDVER2-NEXT: vmovaps 40(%ebp), %xmm0 ## encoding: [0xc5,0xf8,0x28,0x45,0x28] ; FMACALL32_BDVER2-NEXT: fstpt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Spill ; FMACALL32_BDVER2-NEXT: ## encoding: [0xdb,0xbc,0x24,0x94,0x00,0x00,0x00] -; FMACALL32_BDVER2-NEXT: vmovhpd %xmm0, {{[0-9]+}}(%esp) ## encoding: 
[0xc5,0xf9,0x17,0x44,0x24,0x10] +; FMACALL32_BDVER2-NEXT: vmovhps %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc5,0xf8,0x17,0x44,0x24,0x10] ; FMACALL32_BDVER2-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %ymm0 ## 32-byte Reload ; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xfc,0x28,0x84,0x24,0xa0,0x00,0x00,0x00] ; FMACALL32_BDVER2-NEXT: vunpckhpd {{[-0-9]+}}(%e{{[sb]}}p), %xmm0, %xmm0 ## 16-byte Folded Reload @@ -1882,10 +1882,10 @@ define <8 x double> @test_v8f64(<8 x double> %a, <8 x double> %b, <8 x double> % ; FMACALL32_BDVER2-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77] ; FMACALL32_BDVER2-NEXT: calll _fma ## encoding: [0xe8,A,A,A,A] ; FMACALL32_BDVER2-NEXT: ## fixup A - offset: 1, value: _fma-4, kind: FK_PCRel_4 -; FMACALL32_BDVER2-NEXT: vmovapd 24(%ebp), %xmm0 ## encoding: [0xc5,0xf9,0x28,0x45,0x18] +; FMACALL32_BDVER2-NEXT: vmovaps 24(%ebp), %xmm0 ## encoding: [0xc5,0xf8,0x28,0x45,0x18] ; FMACALL32_BDVER2-NEXT: fstpt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Spill ; FMACALL32_BDVER2-NEXT: ## encoding: [0xdb,0xbc,0x24,0xc0,0x00,0x00,0x00] -; FMACALL32_BDVER2-NEXT: vmovhpd %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc5,0xf9,0x17,0x44,0x24,0x10] +; FMACALL32_BDVER2-NEXT: vmovhps %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc5,0xf8,0x17,0x44,0x24,0x10] ; FMACALL32_BDVER2-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %ymm0 ## 32-byte Reload ; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xfc,0x28,0x84,0x24,0x00,0x01,0x00,0x00] ; FMACALL32_BDVER2-NEXT: vextractf128 $1, %ymm0, %xmm1 ## encoding: [0xc4,0xe3,0x7d,0x19,0xc1,0x01] @@ -1914,10 +1914,10 @@ define <8 x double> @test_v8f64(<8 x double> %a, <8 x double> %b, <8 x double> % ; FMACALL32_BDVER2-NEXT: vmovups %xmm0, (%esp) ## encoding: [0xc5,0xf8,0x11,0x04,0x24] ; FMACALL32_BDVER2-NEXT: calll _fma ## encoding: [0xe8,A,A,A,A] ; FMACALL32_BDVER2-NEXT: ## fixup A - offset: 1, value: _fma-4, kind: FK_PCRel_4 -; FMACALL32_BDVER2-NEXT: vmovapd 8(%ebp), %xmm0 ## encoding: [0xc5,0xf9,0x28,0x45,0x08] +; FMACALL32_BDVER2-NEXT: vmovaps 8(%ebp), %xmm0 ## encoding: [0xc5,0xf8,0x28,0x45,0x08] ; FMACALL32_BDVER2-NEXT: fstpt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Spill ; FMACALL32_BDVER2-NEXT: ## encoding: [0xdb,0x7c,0x24,0x30] -; FMACALL32_BDVER2-NEXT: vmovhpd %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc5,0xf9,0x17,0x44,0x24,0x10] +; FMACALL32_BDVER2-NEXT: vmovhps %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc5,0xf8,0x17,0x44,0x24,0x10] ; FMACALL32_BDVER2-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %ymm0 ## 32-byte Reload ; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xfc,0x28,0x84,0x24,0xe0,0x00,0x00,0x00] ; FMACALL32_BDVER2-NEXT: vunpckhpd {{[-0-9]+}}(%e{{[sb]}}p), %xmm0, %xmm0 ## 16-byte Folded Reload @@ -1940,8 +1940,8 @@ define <8 x double> @test_v8f64(<8 x double> %a, <8 x double> %b, <8 x double> % ; FMACALL32_BDVER2-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77] ; FMACALL32_BDVER2-NEXT: calll _fma ## encoding: [0xe8,A,A,A,A] ; FMACALL32_BDVER2-NEXT: ## fixup A - offset: 1, value: _fma-4, kind: FK_PCRel_4 -; FMACALL32_BDVER2-NEXT: vmovapd 56(%ebp), %xmm0 ## encoding: [0xc5,0xf9,0x28,0x45,0x38] -; FMACALL32_BDVER2-NEXT: vmovhpd %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc5,0xf9,0x17,0x44,0x24,0x10] +; FMACALL32_BDVER2-NEXT: vmovaps 56(%ebp), %xmm0 ## encoding: [0xc5,0xf8,0x28,0x45,0x38] +; FMACALL32_BDVER2-NEXT: vmovhps %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc5,0xf8,0x17,0x44,0x24,0x10] ; FMACALL32_BDVER2-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 ## 16-byte Reload ; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xf8,0x28,0x84,0x24,0x30,0x01,0x00,0x00] ; FMACALL32_BDVER2-NEXT: 
vmovlps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0, %xmm0 ## 16-byte Folded Reload @@ -1974,19 +1974,19 @@ define <8 x double> @test_v8f64(<8 x double> %a, <8 x double> %b, <8 x double> % ; FMACALL32_BDVER2-NEXT: ## xmm0 = mem[0],zero ; FMACALL32_BDVER2-NEXT: vmovsd {{[0-9]+}}(%esp), %xmm1 ## encoding: [0xc5,0xfb,0x10,0x4c,0x24,0x50] ; FMACALL32_BDVER2-NEXT: ## xmm1 = mem[0],zero -; FMACALL32_BDVER2-NEXT: vmovhpd {{[0-9]+}}(%esp), %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0x16,0x44,0x24,0x58] -; FMACALL32_BDVER2-NEXT: ## xmm0 = xmm0[0],mem[0] -; FMACALL32_BDVER2-NEXT: vmovhpd {{[0-9]+}}(%esp), %xmm1, %xmm1 ## encoding: [0xc5,0xf1,0x16,0x4c,0x24,0x48] -; FMACALL32_BDVER2-NEXT: ## xmm1 = xmm1[0],mem[0] +; FMACALL32_BDVER2-NEXT: vmovhps {{[0-9]+}}(%esp), %xmm0, %xmm0 ## encoding: [0xc5,0xf8,0x16,0x44,0x24,0x58] +; FMACALL32_BDVER2-NEXT: ## xmm0 = xmm0[0,1],mem[0,1] +; FMACALL32_BDVER2-NEXT: vmovhps {{[0-9]+}}(%esp), %xmm1, %xmm1 ## encoding: [0xc5,0xf0,0x16,0x4c,0x24,0x48] +; FMACALL32_BDVER2-NEXT: ## xmm1 = xmm1[0,1],mem[0,1] ; FMACALL32_BDVER2-NEXT: vmovsd {{[0-9]+}}(%esp), %xmm2 ## encoding: [0xc5,0xfb,0x10,0x54,0x24,0x70] ; FMACALL32_BDVER2-NEXT: ## xmm2 = mem[0],zero -; FMACALL32_BDVER2-NEXT: vmovhpd {{[0-9]+}}(%esp), %xmm2, %xmm2 ## encoding: [0xc5,0xe9,0x16,0x54,0x24,0x68] -; FMACALL32_BDVER2-NEXT: ## xmm2 = xmm2[0],mem[0] +; FMACALL32_BDVER2-NEXT: vmovhps {{[0-9]+}}(%esp), %xmm2, %xmm2 ## encoding: [0xc5,0xe8,0x16,0x54,0x24,0x68] +; FMACALL32_BDVER2-NEXT: ## xmm2 = xmm2[0,1],mem[0,1] ; FMACALL32_BDVER2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe3,0x7d,0x18,0xc1,0x01] ; FMACALL32_BDVER2-NEXT: vmovsd {{[0-9]+}}(%esp), %xmm1 ## encoding: [0xc5,0xfb,0x10,0x8c,0x24,0x80,0x00,0x00,0x00] ; FMACALL32_BDVER2-NEXT: ## xmm1 = mem[0],zero -; FMACALL32_BDVER2-NEXT: vmovhpd {{[0-9]+}}(%esp), %xmm1, %xmm1 ## encoding: [0xc5,0xf1,0x16,0x4c,0x24,0x78] -; FMACALL32_BDVER2-NEXT: ## xmm1 = xmm1[0],mem[0] +; FMACALL32_BDVER2-NEXT: vmovhps {{[0-9]+}}(%esp), %xmm1, %xmm1 ## encoding: [0xc5,0xf0,0x16,0x4c,0x24,0x78] +; FMACALL32_BDVER2-NEXT: ## xmm1 = xmm1[0,1],mem[0,1] ; FMACALL32_BDVER2-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 ## encoding: [0xc4,0xe3,0x75,0x18,0xca,0x01] ; FMACALL32_BDVER2-NEXT: movl %ebp, %esp ## encoding: [0x89,0xec] ; FMACALL32_BDVER2-NEXT: popl %ebp ## encoding: [0x5d] diff --git a/test/CodeGen/X86/gather-addresses.ll b/test/CodeGen/X86/gather-addresses.ll index d76027d402e..1cd85e6e582 100644 --- a/test/CodeGen/X86/gather-addresses.ll +++ b/test/CodeGen/X86/gather-addresses.ll @@ -26,9 +26,9 @@ define <4 x double> @foo(double* %p, <4 x i32>* %i, <4 x i32>* %h) nounwind { ; LIN-SSE2-NEXT: movslq %edx, %rdx ; LIN-SSE2-NEXT: movslq %esi, %rsi ; LIN-SSE2-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero -; LIN-SSE2-NEXT: movhpd {{.*#+}} xmm0 = xmm0[0],mem[0] +; LIN-SSE2-NEXT: movhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1] ; LIN-SSE2-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero -; LIN-SSE2-NEXT: movhpd {{.*#+}} xmm1 = xmm1[0],mem[0] +; LIN-SSE2-NEXT: movhps {{.*#+}} xmm1 = xmm1[0,1],mem[0,1] ; LIN-SSE2-NEXT: retq ; ; LIN-SSE4-LABEL: foo: @@ -43,10 +43,10 @@ define <4 x double> @foo(double* %p, <4 x i32>* %i, <4 x i32>* %h) nounwind { ; LIN-SSE4-NEXT: movslq %ecx, %rcx ; LIN-SSE4-NEXT: movslq %edx, %rdx ; LIN-SSE4-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero -; LIN-SSE4-NEXT: movhpd {{.*#+}} xmm0 = xmm0[0],mem[0] +; LIN-SSE4-NEXT: movhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1] ; LIN-SSE4-NEXT: movslq %esi, %rax ; LIN-SSE4-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero -; LIN-SSE4-NEXT: movhpd {{.*#+}} xmm1 = 
xmm1[0],mem[0] +; LIN-SSE4-NEXT: movhps {{.*#+}} xmm1 = xmm1[0,1],mem[0,1] ; LIN-SSE4-NEXT: retq ; ; WIN-SSE2-LABEL: foo: @@ -65,9 +65,9 @@ define <4 x double> @foo(double* %p, <4 x i32>* %i, <4 x i32>* %h) nounwind { ; WIN-SSE2-NEXT: movslq %r10d, %r9 ; WIN-SSE2-NEXT: movslq %edx, %rdx ; WIN-SSE2-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero -; WIN-SSE2-NEXT: movhpd {{.*#+}} xmm0 = xmm0[0],mem[0] +; WIN-SSE2-NEXT: movhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1] ; WIN-SSE2-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero -; WIN-SSE2-NEXT: movhpd {{.*#+}} xmm1 = xmm1[0],mem[0] +; WIN-SSE2-NEXT: movhps {{.*#+}} xmm1 = xmm1[0,1],mem[0,1] ; WIN-SSE2-NEXT: retq ; ; WIN-SSE4-LABEL: foo: @@ -82,10 +82,10 @@ define <4 x double> @foo(double* %p, <4 x i32>* %i, <4 x i32>* %h) nounwind { ; WIN-SSE4-NEXT: movslq %edx, %rdx ; WIN-SSE4-NEXT: movslq %r8d, %r8 ; WIN-SSE4-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero -; WIN-SSE4-NEXT: movhpd {{.*#+}} xmm0 = xmm0[0],mem[0] +; WIN-SSE4-NEXT: movhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1] ; WIN-SSE4-NEXT: movslq %r9d, %rax ; WIN-SSE4-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero -; WIN-SSE4-NEXT: movhpd {{.*#+}} xmm1 = xmm1[0],mem[0] +; WIN-SSE4-NEXT: movhps {{.*#+}} xmm1 = xmm1[0,1],mem[0,1] ; WIN-SSE4-NEXT: retq ; ; LIN32-LABEL: foo: @@ -102,9 +102,9 @@ define <4 x double> @foo(double* %p, <4 x i32>* %i, <4 x i32>* %h) nounwind { ; LIN32-NEXT: pextrd $3, %xmm0, %esi ; LIN32-NEXT: movd %xmm0, %edi ; LIN32-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero -; LIN32-NEXT: movhpd {{.*#+}} xmm0 = xmm0[0],mem[0] +; LIN32-NEXT: movhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1] ; LIN32-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero -; LIN32-NEXT: movhpd {{.*#+}} xmm1 = xmm1[0],mem[0] +; LIN32-NEXT: movhps {{.*#+}} xmm1 = xmm1[0,1],mem[0,1] ; LIN32-NEXT: popl %esi ; LIN32-NEXT: popl %edi ; LIN32-NEXT: retl diff --git a/test/CodeGen/X86/half.ll b/test/CodeGen/X86/half.ll index 095dfa2b04a..b862bf7a781 100644 --- a/test/CodeGen/X86/half.ll +++ b/test/CodeGen/X86/half.ll @@ -550,9 +550,9 @@ define <4 x double> @test_extend64_vec4(<4 x half>* %p) #0 { ; CHECK-I686-NEXT: fldt {{[-0-9]+}}(%e{{[sb]}}p) # 10-byte Folded Reload ; CHECK-I686-NEXT: fstpl {{[0-9]+}}(%esp) ; CHECK-I686-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero -; CHECK-I686-NEXT: movhpd {{.*#+}} xmm0 = xmm0[0],mem[0] +; CHECK-I686-NEXT: movhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1] ; CHECK-I686-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero -; CHECK-I686-NEXT: movhpd {{.*#+}} xmm1 = xmm1[0],mem[0] +; CHECK-I686-NEXT: movhps {{.*#+}} xmm1 = xmm1[0,1],mem[0,1] ; CHECK-I686-NEXT: addl $88, %esp ; CHECK-I686-NEXT: popl %esi ; CHECK-I686-NEXT: retl @@ -807,16 +807,16 @@ define void @test_trunc64_vec4(<4 x double> %a, <4 x half>* %p) #0 { ; CHECK-I686-NEXT: movlps %xmm0, (%esp) ; CHECK-I686-NEXT: calll __truncdfhf2 ; CHECK-I686-NEXT: movw %ax, %si -; CHECK-I686-NEXT: movapd {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload -; CHECK-I686-NEXT: movhpd %xmm0, (%esp) +; CHECK-I686-NEXT: movaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload +; CHECK-I686-NEXT: movhps %xmm0, (%esp) ; CHECK-I686-NEXT: calll __truncdfhf2 ; CHECK-I686-NEXT: movw %ax, %di ; CHECK-I686-NEXT: movaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload ; CHECK-I686-NEXT: movlps %xmm0, (%esp) ; CHECK-I686-NEXT: calll __truncdfhf2 ; CHECK-I686-NEXT: movw %ax, %bx -; CHECK-I686-NEXT: movapd {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload -; CHECK-I686-NEXT: movhpd %xmm0, (%esp) +; CHECK-I686-NEXT: movaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload +; CHECK-I686-NEXT: movhps %xmm0, (%esp) ; CHECK-I686-NEXT: calll 
__truncdfhf2 ; CHECK-I686-NEXT: movw %ax, 6(%ebp) ; CHECK-I686-NEXT: movw %bx, 4(%ebp) diff --git a/test/CodeGen/X86/insert-into-constant-vector.ll b/test/CodeGen/X86/insert-into-constant-vector.ll index 579d2aa7d48..18d57e92805 100644 --- a/test/CodeGen/X86/insert-into-constant-vector.ll +++ b/test/CodeGen/X86/insert-into-constant-vector.ll @@ -212,8 +212,8 @@ define <4 x float> @elt1_v4f32(float %x) { define <2 x double> @elt1_v2f64(double %x) { ; X32SSE-LABEL: elt1_v2f64: ; X32SSE: # %bb.0: -; X32SSE-NEXT: movapd {{.*#+}} xmm0 = <4.2E+1,u> -; X32SSE-NEXT: movhpd {{.*#+}} xmm0 = xmm0[0],mem[0] +; X32SSE-NEXT: movaps {{.*#+}} xmm0 = <4.2E+1,u> +; X32SSE-NEXT: movhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1] ; X32SSE-NEXT: retl ; ; X64SSE-LABEL: elt1_v2f64: @@ -225,8 +225,8 @@ define <2 x double> @elt1_v2f64(double %x) { ; ; X32AVX-LABEL: elt1_v2f64: ; X32AVX: # %bb.0: -; X32AVX-NEXT: vmovapd {{.*#+}} xmm0 = <4.2E+1,u> -; X32AVX-NEXT: vmovhpd {{.*#+}} xmm0 = xmm0[0],mem[0] +; X32AVX-NEXT: vmovaps {{.*#+}} xmm0 = <4.2E+1,u> +; X32AVX-NEXT: vmovhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1] ; X32AVX-NEXT: retl ; ; X64AVX-LABEL: elt1_v2f64: @@ -454,8 +454,8 @@ define <8 x i64> @elt5_v8i64(i64 %x) { define <8 x double> @elt1_v8f64(double %x) { ; X32SSE-LABEL: elt1_v8f64: ; X32SSE: # %bb.0: -; X32SSE-NEXT: movapd {{.*#+}} xmm0 = <4.2E+1,u> -; X32SSE-NEXT: movhpd {{.*#+}} xmm0 = xmm0[0],mem[0] +; X32SSE-NEXT: movaps {{.*#+}} xmm0 = <4.2E+1,u> +; X32SSE-NEXT: movhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1] ; X32SSE-NEXT: movaps {{.*#+}} xmm1 = [2.0E+0,3.0E+0] ; X32SSE-NEXT: movaps {{.*#+}} xmm2 = [4.0E+0,5.0E+0] ; X32SSE-NEXT: movaps {{.*#+}} xmm3 = [6.0E+0,7.0E+0] @@ -473,9 +473,9 @@ define <8 x double> @elt1_v8f64(double %x) { ; ; X32AVX1-LABEL: elt1_v8f64: ; X32AVX1: # %bb.0: -; X32AVX1-NEXT: vmovapd {{.*#+}} xmm0 = <4.2E+1,u,2.0E+0,3.0E+0> -; X32AVX1-NEXT: vmovhpd {{.*#+}} xmm0 = xmm0[0],mem[0] -; X32AVX1-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],mem[2,3] +; X32AVX1-NEXT: vmovaps {{.*#+}} xmm0 = <4.2E+1,u,2.0E+0,3.0E+0> +; X32AVX1-NEXT: vmovhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1] +; X32AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] ; X32AVX1-NEXT: vmovaps {{.*#+}} ymm1 = [4.0E+0,5.0E+0,6.0E+0,7.0E+0] ; X32AVX1-NEXT: retl ; @@ -489,9 +489,9 @@ define <8 x double> @elt1_v8f64(double %x) { ; ; X32AVX2-LABEL: elt1_v8f64: ; X32AVX2: # %bb.0: -; X32AVX2-NEXT: vmovapd {{.*#+}} xmm0 = <4.2E+1,u,2.0E+0,3.0E+0> -; X32AVX2-NEXT: vmovhpd {{.*#+}} xmm0 = xmm0[0],mem[0] -; X32AVX2-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],mem[2,3] +; X32AVX2-NEXT: vmovaps {{.*#+}} xmm0 = <4.2E+1,u,2.0E+0,3.0E+0> +; X32AVX2-NEXT: vmovhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1] +; X32AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] ; X32AVX2-NEXT: vmovaps {{.*#+}} ymm1 = [4.0E+0,5.0E+0,6.0E+0,7.0E+0] ; X32AVX2-NEXT: retl ; @@ -505,9 +505,9 @@ define <8 x double> @elt1_v8f64(double %x) { ; ; X32AVX512F-LABEL: elt1_v8f64: ; X32AVX512F: # %bb.0: -; X32AVX512F-NEXT: vmovapd {{.*#+}} xmm0 = <4.2E+1,u,2.0E+0,3.0E+0,4.0E+0,5.0E+0,6.0E+0,7.0E+0> -; X32AVX512F-NEXT: vmovhpd {{.*#+}} xmm0 = xmm0[0],mem[0] -; X32AVX512F-NEXT: vmovapd {{.*#+}} zmm1 = <4.2E+1,u,2.0E+0,3.0E+0,4.0E+0,5.0E+0,6.0E+0,7.0E+0> +; X32AVX512F-NEXT: vmovaps {{.*#+}} xmm0 = <4.2E+1,u,2.0E+0,3.0E+0,4.0E+0,5.0E+0,6.0E+0,7.0E+0> +; X32AVX512F-NEXT: vmovhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1] +; X32AVX512F-NEXT: vmovaps {{.*#+}} zmm1 = <4.2E+1,u,2.0E+0,3.0E+0,4.0E+0,5.0E+0,6.0E+0,7.0E+0> ; X32AVX512F-NEXT: vinsertf32x4 $0, %xmm0, %zmm1, %zmm0 ; X32AVX512F-NEXT: retl ; 
diff --git a/test/CodeGen/X86/insertelement-ones.ll b/test/CodeGen/X86/insertelement-ones.ll index 11b065f6536..d468c56ed04 100644 --- a/test/CodeGen/X86/insertelement-ones.ll +++ b/test/CodeGen/X86/insertelement-ones.ll @@ -11,17 +11,17 @@ define <2 x i64> @insert_v2i64_x1(<2 x i64> %a) { ; SSE2-LABEL: insert_v2i64_x1: ; SSE2: # %bb.0: -; SSE2-NEXT: movlpd {{.*#+}} xmm0 = mem[0],xmm0[1] +; SSE2-NEXT: movlps {{.*#+}} xmm0 = mem[0,1],xmm0[2,3] ; SSE2-NEXT: retq ; ; SSE3-LABEL: insert_v2i64_x1: ; SSE3: # %bb.0: -; SSE3-NEXT: movlpd {{.*#+}} xmm0 = mem[0],xmm0[1] +; SSE3-NEXT: movlps {{.*#+}} xmm0 = mem[0,1],xmm0[2,3] ; SSE3-NEXT: retq ; ; SSSE3-LABEL: insert_v2i64_x1: ; SSSE3: # %bb.0: -; SSSE3-NEXT: movlpd {{.*#+}} xmm0 = mem[0],xmm0[1] +; SSSE3-NEXT: movlps {{.*#+}} xmm0 = mem[0,1],xmm0[2,3] ; SSSE3-NEXT: retq ; ; SSE41-LABEL: insert_v2i64_x1: @@ -54,17 +54,17 @@ define <2 x i64> @insert_v2i64_x1(<2 x i64> %a) { define <4 x i64> @insert_v4i64_01x3(<4 x i64> %a) { ; SSE2-LABEL: insert_v4i64_01x3: ; SSE2: # %bb.0: -; SSE2-NEXT: movlpd {{.*#+}} xmm1 = mem[0],xmm1[1] +; SSE2-NEXT: movlps {{.*#+}} xmm1 = mem[0,1],xmm1[2,3] ; SSE2-NEXT: retq ; ; SSE3-LABEL: insert_v4i64_01x3: ; SSE3: # %bb.0: -; SSE3-NEXT: movlpd {{.*#+}} xmm1 = mem[0],xmm1[1] +; SSE3-NEXT: movlps {{.*#+}} xmm1 = mem[0,1],xmm1[2,3] ; SSE3-NEXT: retq ; ; SSSE3-LABEL: insert_v4i64_01x3: ; SSSE3: # %bb.0: -; SSSE3-NEXT: movlpd {{.*#+}} xmm1 = mem[0],xmm1[1] +; SSSE3-NEXT: movlps {{.*#+}} xmm1 = mem[0,1],xmm1[2,3] ; SSSE3-NEXT: retq ; ; SSE41-LABEL: insert_v4i64_01x3: diff --git a/test/CodeGen/X86/masked_compressstore.ll b/test/CodeGen/X86/masked_compressstore.ll index 0981bf1145a..687ea286b92 100644 --- a/test/CodeGen/X86/masked_compressstore.ll +++ b/test/CodeGen/X86/masked_compressstore.ll @@ -18,56 +18,56 @@ define void @compressstore_v8f64_v8i1(double* %base, <8 x double> %V, <8 x i1> % ; SSE2-NEXT: testb $1, %al ; SSE2-NEXT: je LBB0_2 ; SSE2-NEXT: ## %bb.1: ## %cond.store -; SSE2-NEXT: movlpd %xmm0, (%rdi) +; SSE2-NEXT: movlps %xmm0, (%rdi) ; SSE2-NEXT: addq $8, %rdi ; SSE2-NEXT: LBB0_2: ## %else ; SSE2-NEXT: shrl $16, %eax ; SSE2-NEXT: testb $1, %al ; SSE2-NEXT: je LBB0_4 ; SSE2-NEXT: ## %bb.3: ## %cond.store1 -; SSE2-NEXT: movhpd %xmm0, (%rdi) +; SSE2-NEXT: movhps %xmm0, (%rdi) ; SSE2-NEXT: addq $8, %rdi ; SSE2-NEXT: LBB0_4: ## %else2 ; SSE2-NEXT: pextrw $2, %xmm4, %eax ; SSE2-NEXT: testb $1, %al ; SSE2-NEXT: je LBB0_6 ; SSE2-NEXT: ## %bb.5: ## %cond.store4 -; SSE2-NEXT: movlpd %xmm1, (%rdi) +; SSE2-NEXT: movlps %xmm1, (%rdi) ; SSE2-NEXT: addq $8, %rdi ; SSE2-NEXT: LBB0_6: ## %else5 ; SSE2-NEXT: pextrw $3, %xmm4, %eax ; SSE2-NEXT: testb $1, %al ; SSE2-NEXT: je LBB0_8 ; SSE2-NEXT: ## %bb.7: ## %cond.store7 -; SSE2-NEXT: movhpd %xmm1, (%rdi) +; SSE2-NEXT: movhps %xmm1, (%rdi) ; SSE2-NEXT: addq $8, %rdi ; SSE2-NEXT: LBB0_8: ## %else8 ; SSE2-NEXT: pextrw $4, %xmm4, %eax ; SSE2-NEXT: testb $1, %al ; SSE2-NEXT: je LBB0_10 ; SSE2-NEXT: ## %bb.9: ## %cond.store10 -; SSE2-NEXT: movlpd %xmm2, (%rdi) +; SSE2-NEXT: movlps %xmm2, (%rdi) ; SSE2-NEXT: addq $8, %rdi ; SSE2-NEXT: LBB0_10: ## %else11 ; SSE2-NEXT: pextrw $5, %xmm4, %eax ; SSE2-NEXT: testb $1, %al ; SSE2-NEXT: je LBB0_12 ; SSE2-NEXT: ## %bb.11: ## %cond.store13 -; SSE2-NEXT: movhpd %xmm2, (%rdi) +; SSE2-NEXT: movhps %xmm2, (%rdi) ; SSE2-NEXT: addq $8, %rdi ; SSE2-NEXT: LBB0_12: ## %else14 ; SSE2-NEXT: pextrw $6, %xmm4, %eax ; SSE2-NEXT: testb $1, %al ; SSE2-NEXT: je LBB0_14 ; SSE2-NEXT: ## %bb.13: ## %cond.store16 -; SSE2-NEXT: movlpd %xmm3, (%rdi) +; SSE2-NEXT: movlps %xmm3, 
(%rdi) ; SSE2-NEXT: addq $8, %rdi ; SSE2-NEXT: LBB0_14: ## %else17 ; SSE2-NEXT: pextrw $7, %xmm4, %eax ; SSE2-NEXT: testb $1, %al ; SSE2-NEXT: je LBB0_16 ; SSE2-NEXT: ## %bb.15: ## %cond.store19 -; SSE2-NEXT: movhpd %xmm3, (%rdi) +; SSE2-NEXT: movhps %xmm3, (%rdi) ; SSE2-NEXT: LBB0_16: ## %else20 ; SSE2-NEXT: retq ; @@ -77,56 +77,56 @@ define void @compressstore_v8f64_v8i1(double* %base, <8 x double> %V, <8 x i1> % ; SSE42-NEXT: testb $1, %al ; SSE42-NEXT: je LBB0_2 ; SSE42-NEXT: ## %bb.1: ## %cond.store -; SSE42-NEXT: movlpd %xmm0, (%rdi) +; SSE42-NEXT: movlps %xmm0, (%rdi) ; SSE42-NEXT: addq $8, %rdi ; SSE42-NEXT: LBB0_2: ## %else ; SSE42-NEXT: pextrb $2, %xmm4, %eax ; SSE42-NEXT: testb $1, %al ; SSE42-NEXT: je LBB0_4 ; SSE42-NEXT: ## %bb.3: ## %cond.store1 -; SSE42-NEXT: movhpd %xmm0, (%rdi) +; SSE42-NEXT: movhps %xmm0, (%rdi) ; SSE42-NEXT: addq $8, %rdi ; SSE42-NEXT: LBB0_4: ## %else2 ; SSE42-NEXT: pextrb $4, %xmm4, %eax ; SSE42-NEXT: testb $1, %al ; SSE42-NEXT: je LBB0_6 ; SSE42-NEXT: ## %bb.5: ## %cond.store4 -; SSE42-NEXT: movlpd %xmm1, (%rdi) +; SSE42-NEXT: movlps %xmm1, (%rdi) ; SSE42-NEXT: addq $8, %rdi ; SSE42-NEXT: LBB0_6: ## %else5 ; SSE42-NEXT: pextrb $6, %xmm4, %eax ; SSE42-NEXT: testb $1, %al ; SSE42-NEXT: je LBB0_8 ; SSE42-NEXT: ## %bb.7: ## %cond.store7 -; SSE42-NEXT: movhpd %xmm1, (%rdi) +; SSE42-NEXT: movhps %xmm1, (%rdi) ; SSE42-NEXT: addq $8, %rdi ; SSE42-NEXT: LBB0_8: ## %else8 ; SSE42-NEXT: pextrb $8, %xmm4, %eax ; SSE42-NEXT: testb $1, %al ; SSE42-NEXT: je LBB0_10 ; SSE42-NEXT: ## %bb.9: ## %cond.store10 -; SSE42-NEXT: movlpd %xmm2, (%rdi) +; SSE42-NEXT: movlps %xmm2, (%rdi) ; SSE42-NEXT: addq $8, %rdi ; SSE42-NEXT: LBB0_10: ## %else11 ; SSE42-NEXT: pextrb $10, %xmm4, %eax ; SSE42-NEXT: testb $1, %al ; SSE42-NEXT: je LBB0_12 ; SSE42-NEXT: ## %bb.11: ## %cond.store13 -; SSE42-NEXT: movhpd %xmm2, (%rdi) +; SSE42-NEXT: movhps %xmm2, (%rdi) ; SSE42-NEXT: addq $8, %rdi ; SSE42-NEXT: LBB0_12: ## %else14 ; SSE42-NEXT: pextrb $12, %xmm4, %eax ; SSE42-NEXT: testb $1, %al ; SSE42-NEXT: je LBB0_14 ; SSE42-NEXT: ## %bb.13: ## %cond.store16 -; SSE42-NEXT: movlpd %xmm3, (%rdi) +; SSE42-NEXT: movlps %xmm3, (%rdi) ; SSE42-NEXT: addq $8, %rdi ; SSE42-NEXT: LBB0_14: ## %else17 ; SSE42-NEXT: pextrb $14, %xmm4, %eax ; SSE42-NEXT: testb $1, %al ; SSE42-NEXT: je LBB0_16 ; SSE42-NEXT: ## %bb.15: ## %cond.store19 -; SSE42-NEXT: movhpd %xmm3, (%rdi) +; SSE42-NEXT: movhps %xmm3, (%rdi) ; SSE42-NEXT: LBB0_16: ## %else20 ; SSE42-NEXT: retq ; @@ -136,14 +136,14 @@ define void @compressstore_v8f64_v8i1(double* %base, <8 x double> %V, <8 x i1> % ; AVX1OR2-NEXT: testb $1, %al ; AVX1OR2-NEXT: je LBB0_2 ; AVX1OR2-NEXT: ## %bb.1: ## %cond.store -; AVX1OR2-NEXT: vmovlpd %xmm0, (%rdi) +; AVX1OR2-NEXT: vmovlps %xmm0, (%rdi) ; AVX1OR2-NEXT: addq $8, %rdi ; AVX1OR2-NEXT: LBB0_2: ## %else ; AVX1OR2-NEXT: vpextrb $2, %xmm2, %eax ; AVX1OR2-NEXT: testb $1, %al ; AVX1OR2-NEXT: je LBB0_4 ; AVX1OR2-NEXT: ## %bb.3: ## %cond.store1 -; AVX1OR2-NEXT: vmovhpd %xmm0, (%rdi) +; AVX1OR2-NEXT: vmovhps %xmm0, (%rdi) ; AVX1OR2-NEXT: addq $8, %rdi ; AVX1OR2-NEXT: LBB0_4: ## %else2 ; AVX1OR2-NEXT: vpextrb $4, %xmm2, %eax @@ -151,28 +151,28 @@ define void @compressstore_v8f64_v8i1(double* %base, <8 x double> %V, <8 x i1> % ; AVX1OR2-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX1OR2-NEXT: je LBB0_6 ; AVX1OR2-NEXT: ## %bb.5: ## %cond.store4 -; AVX1OR2-NEXT: vmovlpd %xmm0, (%rdi) +; AVX1OR2-NEXT: vmovlps %xmm0, (%rdi) ; AVX1OR2-NEXT: addq $8, %rdi ; AVX1OR2-NEXT: LBB0_6: ## %else5 ; AVX1OR2-NEXT: vpextrb $6, %xmm2, %eax ; 
AVX1OR2-NEXT: testb $1, %al ; AVX1OR2-NEXT: je LBB0_8 ; AVX1OR2-NEXT: ## %bb.7: ## %cond.store7 -; AVX1OR2-NEXT: vmovhpd %xmm0, (%rdi) +; AVX1OR2-NEXT: vmovhps %xmm0, (%rdi) ; AVX1OR2-NEXT: addq $8, %rdi ; AVX1OR2-NEXT: LBB0_8: ## %else8 ; AVX1OR2-NEXT: vpextrb $8, %xmm2, %eax ; AVX1OR2-NEXT: testb $1, %al ; AVX1OR2-NEXT: je LBB0_10 ; AVX1OR2-NEXT: ## %bb.9: ## %cond.store10 -; AVX1OR2-NEXT: vmovlpd %xmm1, (%rdi) +; AVX1OR2-NEXT: vmovlps %xmm1, (%rdi) ; AVX1OR2-NEXT: addq $8, %rdi ; AVX1OR2-NEXT: LBB0_10: ## %else11 ; AVX1OR2-NEXT: vpextrb $10, %xmm2, %eax ; AVX1OR2-NEXT: testb $1, %al ; AVX1OR2-NEXT: je LBB0_12 ; AVX1OR2-NEXT: ## %bb.11: ## %cond.store13 -; AVX1OR2-NEXT: vmovhpd %xmm1, (%rdi) +; AVX1OR2-NEXT: vmovhps %xmm1, (%rdi) ; AVX1OR2-NEXT: addq $8, %rdi ; AVX1OR2-NEXT: LBB0_12: ## %else14 ; AVX1OR2-NEXT: vpextrb $12, %xmm2, %eax @@ -180,14 +180,14 @@ define void @compressstore_v8f64_v8i1(double* %base, <8 x double> %V, <8 x i1> % ; AVX1OR2-NEXT: vextractf128 $1, %ymm1, %xmm0 ; AVX1OR2-NEXT: je LBB0_14 ; AVX1OR2-NEXT: ## %bb.13: ## %cond.store16 -; AVX1OR2-NEXT: vmovlpd %xmm0, (%rdi) +; AVX1OR2-NEXT: vmovlps %xmm0, (%rdi) ; AVX1OR2-NEXT: addq $8, %rdi ; AVX1OR2-NEXT: LBB0_14: ## %else17 ; AVX1OR2-NEXT: vpextrb $14, %xmm2, %eax ; AVX1OR2-NEXT: testb $1, %al ; AVX1OR2-NEXT: je LBB0_16 ; AVX1OR2-NEXT: ## %bb.15: ## %cond.store19 -; AVX1OR2-NEXT: vmovhpd %xmm0, (%rdi) +; AVX1OR2-NEXT: vmovhps %xmm0, (%rdi) ; AVX1OR2-NEXT: LBB0_16: ## %else20 ; AVX1OR2-NEXT: vzeroupper ; AVX1OR2-NEXT: retq @@ -229,7 +229,7 @@ define void @compressstore_v16f64_v16i1(double* %base, <16 x double> %V, <16 x i ; SSE2-NEXT: testb $1, -{{[0-9]+}}(%rsp) ; SSE2-NEXT: je LBB1_2 ; SSE2-NEXT: ## %bb.1: ## %cond.store -; SSE2-NEXT: movlpd %xmm0, (%rdi) +; SSE2-NEXT: movlps %xmm0, (%rdi) ; SSE2-NEXT: addq $8, %rdi ; SSE2-NEXT: LBB1_2: ## %else ; SSE2-NEXT: movd %xmm8, %eax @@ -238,7 +238,7 @@ define void @compressstore_v16f64_v16i1(double* %base, <16 x double> %V, <16 x i ; SSE2-NEXT: testb $1, %cl ; SSE2-NEXT: je LBB1_4 ; SSE2-NEXT: ## %bb.3: ## %cond.store1 -; SSE2-NEXT: movhpd %xmm0, (%rdi) +; SSE2-NEXT: movhps %xmm0, (%rdi) ; SSE2-NEXT: addq $8, %rdi ; SSE2-NEXT: LBB1_4: ## %else2 ; SSE2-NEXT: movl %eax, %ecx @@ -246,98 +246,98 @@ define void @compressstore_v16f64_v16i1(double* %base, <16 x double> %V, <16 x i ; SSE2-NEXT: testb $1, %cl ; SSE2-NEXT: je LBB1_6 ; SSE2-NEXT: ## %bb.5: ## %cond.store4 -; SSE2-NEXT: movlpd %xmm1, (%rdi) +; SSE2-NEXT: movlps %xmm1, (%rdi) ; SSE2-NEXT: addq $8, %rdi ; SSE2-NEXT: LBB1_6: ## %else5 ; SSE2-NEXT: shrl $24, %eax ; SSE2-NEXT: testb $1, %al ; SSE2-NEXT: je LBB1_8 ; SSE2-NEXT: ## %bb.7: ## %cond.store7 -; SSE2-NEXT: movhpd %xmm1, (%rdi) +; SSE2-NEXT: movhps %xmm1, (%rdi) ; SSE2-NEXT: addq $8, %rdi ; SSE2-NEXT: LBB1_8: ## %else8 ; SSE2-NEXT: pextrw $2, %xmm8, %eax ; SSE2-NEXT: testb $1, %al ; SSE2-NEXT: je LBB1_10 ; SSE2-NEXT: ## %bb.9: ## %cond.store10 -; SSE2-NEXT: movlpd %xmm2, (%rdi) +; SSE2-NEXT: movlps %xmm2, (%rdi) ; SSE2-NEXT: addq $8, %rdi ; SSE2-NEXT: LBB1_10: ## %else11 ; SSE2-NEXT: shrl $8, %eax ; SSE2-NEXT: testb $1, %al ; SSE2-NEXT: je LBB1_12 ; SSE2-NEXT: ## %bb.11: ## %cond.store13 -; SSE2-NEXT: movhpd %xmm2, (%rdi) +; SSE2-NEXT: movhps %xmm2, (%rdi) ; SSE2-NEXT: addq $8, %rdi ; SSE2-NEXT: LBB1_12: ## %else14 ; SSE2-NEXT: pextrw $3, %xmm8, %eax ; SSE2-NEXT: testb $1, %al ; SSE2-NEXT: je LBB1_14 ; SSE2-NEXT: ## %bb.13: ## %cond.store16 -; SSE2-NEXT: movlpd %xmm3, (%rdi) +; SSE2-NEXT: movlps %xmm3, (%rdi) ; SSE2-NEXT: addq $8, %rdi ; SSE2-NEXT: LBB1_14: ## %else17 ; 
SSE2-NEXT: shrl $8, %eax ; SSE2-NEXT: testb $1, %al ; SSE2-NEXT: je LBB1_16 ; SSE2-NEXT: ## %bb.15: ## %cond.store19 -; SSE2-NEXT: movhpd %xmm3, (%rdi) +; SSE2-NEXT: movhps %xmm3, (%rdi) ; SSE2-NEXT: addq $8, %rdi ; SSE2-NEXT: LBB1_16: ## %else20 ; SSE2-NEXT: pextrw $4, %xmm8, %eax ; SSE2-NEXT: testb $1, %al ; SSE2-NEXT: je LBB1_18 ; SSE2-NEXT: ## %bb.17: ## %cond.store22 -; SSE2-NEXT: movlpd %xmm4, (%rdi) +; SSE2-NEXT: movlps %xmm4, (%rdi) ; SSE2-NEXT: addq $8, %rdi ; SSE2-NEXT: LBB1_18: ## %else23 ; SSE2-NEXT: shrl $8, %eax ; SSE2-NEXT: testb $1, %al ; SSE2-NEXT: je LBB1_20 ; SSE2-NEXT: ## %bb.19: ## %cond.store25 -; SSE2-NEXT: movhpd %xmm4, (%rdi) +; SSE2-NEXT: movhps %xmm4, (%rdi) ; SSE2-NEXT: addq $8, %rdi ; SSE2-NEXT: LBB1_20: ## %else26 ; SSE2-NEXT: pextrw $5, %xmm8, %eax ; SSE2-NEXT: testb $1, %al ; SSE2-NEXT: je LBB1_22 ; SSE2-NEXT: ## %bb.21: ## %cond.store28 -; SSE2-NEXT: movlpd %xmm5, (%rdi) +; SSE2-NEXT: movlps %xmm5, (%rdi) ; SSE2-NEXT: addq $8, %rdi ; SSE2-NEXT: LBB1_22: ## %else29 ; SSE2-NEXT: shrl $8, %eax ; SSE2-NEXT: testb $1, %al ; SSE2-NEXT: je LBB1_24 ; SSE2-NEXT: ## %bb.23: ## %cond.store31 -; SSE2-NEXT: movhpd %xmm5, (%rdi) +; SSE2-NEXT: movhps %xmm5, (%rdi) ; SSE2-NEXT: addq $8, %rdi ; SSE2-NEXT: LBB1_24: ## %else32 ; SSE2-NEXT: pextrw $6, %xmm8, %eax ; SSE2-NEXT: testb $1, %al ; SSE2-NEXT: je LBB1_26 ; SSE2-NEXT: ## %bb.25: ## %cond.store34 -; SSE2-NEXT: movlpd %xmm6, (%rdi) +; SSE2-NEXT: movlps %xmm6, (%rdi) ; SSE2-NEXT: addq $8, %rdi ; SSE2-NEXT: LBB1_26: ## %else35 ; SSE2-NEXT: shrl $8, %eax ; SSE2-NEXT: testb $1, %al ; SSE2-NEXT: je LBB1_28 ; SSE2-NEXT: ## %bb.27: ## %cond.store37 -; SSE2-NEXT: movhpd %xmm6, (%rdi) +; SSE2-NEXT: movhps %xmm6, (%rdi) ; SSE2-NEXT: addq $8, %rdi ; SSE2-NEXT: LBB1_28: ## %else38 ; SSE2-NEXT: pextrw $7, %xmm8, %eax ; SSE2-NEXT: testb $1, %al ; SSE2-NEXT: je LBB1_30 ; SSE2-NEXT: ## %bb.29: ## %cond.store40 -; SSE2-NEXT: movlpd %xmm7, (%rdi) +; SSE2-NEXT: movlps %xmm7, (%rdi) ; SSE2-NEXT: addq $8, %rdi ; SSE2-NEXT: LBB1_30: ## %else41 ; SSE2-NEXT: shrl $8, %eax ; SSE2-NEXT: testb $1, %al ; SSE2-NEXT: je LBB1_32 ; SSE2-NEXT: ## %bb.31: ## %cond.store43 -; SSE2-NEXT: movhpd %xmm7, (%rdi) +; SSE2-NEXT: movhps %xmm7, (%rdi) ; SSE2-NEXT: LBB1_32: ## %else44 ; SSE2-NEXT: retq ; @@ -348,112 +348,112 @@ define void @compressstore_v16f64_v16i1(double* %base, <16 x double> %V, <16 x i ; SSE42-NEXT: testb $1, %al ; SSE42-NEXT: je LBB1_2 ; SSE42-NEXT: ## %bb.1: ## %cond.store -; SSE42-NEXT: movlpd %xmm0, (%rdi) +; SSE42-NEXT: movlps %xmm0, (%rdi) ; SSE42-NEXT: addq $8, %rdi ; SSE42-NEXT: LBB1_2: ## %else ; SSE42-NEXT: pextrb $1, %xmm8, %eax ; SSE42-NEXT: testb $1, %al ; SSE42-NEXT: je LBB1_4 ; SSE42-NEXT: ## %bb.3: ## %cond.store1 -; SSE42-NEXT: movhpd %xmm0, (%rdi) +; SSE42-NEXT: movhps %xmm0, (%rdi) ; SSE42-NEXT: addq $8, %rdi ; SSE42-NEXT: LBB1_4: ## %else2 ; SSE42-NEXT: pextrb $2, %xmm8, %eax ; SSE42-NEXT: testb $1, %al ; SSE42-NEXT: je LBB1_6 ; SSE42-NEXT: ## %bb.5: ## %cond.store4 -; SSE42-NEXT: movlpd %xmm1, (%rdi) +; SSE42-NEXT: movlps %xmm1, (%rdi) ; SSE42-NEXT: addq $8, %rdi ; SSE42-NEXT: LBB1_6: ## %else5 ; SSE42-NEXT: pextrb $3, %xmm8, %eax ; SSE42-NEXT: testb $1, %al ; SSE42-NEXT: je LBB1_8 ; SSE42-NEXT: ## %bb.7: ## %cond.store7 -; SSE42-NEXT: movhpd %xmm1, (%rdi) +; SSE42-NEXT: movhps %xmm1, (%rdi) ; SSE42-NEXT: addq $8, %rdi ; SSE42-NEXT: LBB1_8: ## %else8 ; SSE42-NEXT: pextrb $4, %xmm8, %eax ; SSE42-NEXT: testb $1, %al ; SSE42-NEXT: je LBB1_10 ; SSE42-NEXT: ## %bb.9: ## %cond.store10 -; SSE42-NEXT: movlpd %xmm2, (%rdi) +; 
SSE42-NEXT: movlps %xmm2, (%rdi) ; SSE42-NEXT: addq $8, %rdi ; SSE42-NEXT: LBB1_10: ## %else11 ; SSE42-NEXT: pextrb $5, %xmm8, %eax ; SSE42-NEXT: testb $1, %al ; SSE42-NEXT: je LBB1_12 ; SSE42-NEXT: ## %bb.11: ## %cond.store13 -; SSE42-NEXT: movhpd %xmm2, (%rdi) +; SSE42-NEXT: movhps %xmm2, (%rdi) ; SSE42-NEXT: addq $8, %rdi ; SSE42-NEXT: LBB1_12: ## %else14 ; SSE42-NEXT: pextrb $6, %xmm8, %eax ; SSE42-NEXT: testb $1, %al ; SSE42-NEXT: je LBB1_14 ; SSE42-NEXT: ## %bb.13: ## %cond.store16 -; SSE42-NEXT: movlpd %xmm3, (%rdi) +; SSE42-NEXT: movlps %xmm3, (%rdi) ; SSE42-NEXT: addq $8, %rdi ; SSE42-NEXT: LBB1_14: ## %else17 ; SSE42-NEXT: pextrb $7, %xmm8, %eax ; SSE42-NEXT: testb $1, %al ; SSE42-NEXT: je LBB1_16 ; SSE42-NEXT: ## %bb.15: ## %cond.store19 -; SSE42-NEXT: movhpd %xmm3, (%rdi) +; SSE42-NEXT: movhps %xmm3, (%rdi) ; SSE42-NEXT: addq $8, %rdi ; SSE42-NEXT: LBB1_16: ## %else20 ; SSE42-NEXT: pextrb $8, %xmm8, %eax ; SSE42-NEXT: testb $1, %al ; SSE42-NEXT: je LBB1_18 ; SSE42-NEXT: ## %bb.17: ## %cond.store22 -; SSE42-NEXT: movlpd %xmm4, (%rdi) +; SSE42-NEXT: movlps %xmm4, (%rdi) ; SSE42-NEXT: addq $8, %rdi ; SSE42-NEXT: LBB1_18: ## %else23 ; SSE42-NEXT: pextrb $9, %xmm8, %eax ; SSE42-NEXT: testb $1, %al ; SSE42-NEXT: je LBB1_20 ; SSE42-NEXT: ## %bb.19: ## %cond.store25 -; SSE42-NEXT: movhpd %xmm4, (%rdi) +; SSE42-NEXT: movhps %xmm4, (%rdi) ; SSE42-NEXT: addq $8, %rdi ; SSE42-NEXT: LBB1_20: ## %else26 ; SSE42-NEXT: pextrb $10, %xmm8, %eax ; SSE42-NEXT: testb $1, %al ; SSE42-NEXT: je LBB1_22 ; SSE42-NEXT: ## %bb.21: ## %cond.store28 -; SSE42-NEXT: movlpd %xmm5, (%rdi) +; SSE42-NEXT: movlps %xmm5, (%rdi) ; SSE42-NEXT: addq $8, %rdi ; SSE42-NEXT: LBB1_22: ## %else29 ; SSE42-NEXT: pextrb $11, %xmm8, %eax ; SSE42-NEXT: testb $1, %al ; SSE42-NEXT: je LBB1_24 ; SSE42-NEXT: ## %bb.23: ## %cond.store31 -; SSE42-NEXT: movhpd %xmm5, (%rdi) +; SSE42-NEXT: movhps %xmm5, (%rdi) ; SSE42-NEXT: addq $8, %rdi ; SSE42-NEXT: LBB1_24: ## %else32 ; SSE42-NEXT: pextrb $12, %xmm8, %eax ; SSE42-NEXT: testb $1, %al ; SSE42-NEXT: je LBB1_26 ; SSE42-NEXT: ## %bb.25: ## %cond.store34 -; SSE42-NEXT: movlpd %xmm6, (%rdi) +; SSE42-NEXT: movlps %xmm6, (%rdi) ; SSE42-NEXT: addq $8, %rdi ; SSE42-NEXT: LBB1_26: ## %else35 ; SSE42-NEXT: pextrb $13, %xmm8, %eax ; SSE42-NEXT: testb $1, %al ; SSE42-NEXT: je LBB1_28 ; SSE42-NEXT: ## %bb.27: ## %cond.store37 -; SSE42-NEXT: movhpd %xmm6, (%rdi) +; SSE42-NEXT: movhps %xmm6, (%rdi) ; SSE42-NEXT: addq $8, %rdi ; SSE42-NEXT: LBB1_28: ## %else38 ; SSE42-NEXT: pextrb $14, %xmm8, %eax ; SSE42-NEXT: testb $1, %al ; SSE42-NEXT: je LBB1_30 ; SSE42-NEXT: ## %bb.29: ## %cond.store40 -; SSE42-NEXT: movlpd %xmm7, (%rdi) +; SSE42-NEXT: movlps %xmm7, (%rdi) ; SSE42-NEXT: addq $8, %rdi ; SSE42-NEXT: LBB1_30: ## %else41 ; SSE42-NEXT: pextrb $15, %xmm8, %eax ; SSE42-NEXT: testb $1, %al ; SSE42-NEXT: je LBB1_32 ; SSE42-NEXT: ## %bb.31: ## %cond.store43 -; SSE42-NEXT: movhpd %xmm7, (%rdi) +; SSE42-NEXT: movhps %xmm7, (%rdi) ; SSE42-NEXT: LBB1_32: ## %else44 ; SSE42-NEXT: retq ; @@ -463,14 +463,14 @@ define void @compressstore_v16f64_v16i1(double* %base, <16 x double> %V, <16 x i ; AVX1OR2-NEXT: testb $1, %al ; AVX1OR2-NEXT: je LBB1_2 ; AVX1OR2-NEXT: ## %bb.1: ## %cond.store -; AVX1OR2-NEXT: vmovlpd %xmm0, (%rdi) +; AVX1OR2-NEXT: vmovlps %xmm0, (%rdi) ; AVX1OR2-NEXT: addq $8, %rdi ; AVX1OR2-NEXT: LBB1_2: ## %else ; AVX1OR2-NEXT: vpextrb $1, %xmm4, %eax ; AVX1OR2-NEXT: testb $1, %al ; AVX1OR2-NEXT: je LBB1_4 ; AVX1OR2-NEXT: ## %bb.3: ## %cond.store1 -; AVX1OR2-NEXT: vmovhpd %xmm0, (%rdi) +; AVX1OR2-NEXT: 
vmovhps %xmm0, (%rdi) ; AVX1OR2-NEXT: addq $8, %rdi ; AVX1OR2-NEXT: LBB1_4: ## %else2 ; AVX1OR2-NEXT: vpextrb $2, %xmm4, %eax @@ -478,28 +478,28 @@ define void @compressstore_v16f64_v16i1(double* %base, <16 x double> %V, <16 x i ; AVX1OR2-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX1OR2-NEXT: je LBB1_6 ; AVX1OR2-NEXT: ## %bb.5: ## %cond.store4 -; AVX1OR2-NEXT: vmovlpd %xmm0, (%rdi) +; AVX1OR2-NEXT: vmovlps %xmm0, (%rdi) ; AVX1OR2-NEXT: addq $8, %rdi ; AVX1OR2-NEXT: LBB1_6: ## %else5 ; AVX1OR2-NEXT: vpextrb $3, %xmm4, %eax ; AVX1OR2-NEXT: testb $1, %al ; AVX1OR2-NEXT: je LBB1_8 ; AVX1OR2-NEXT: ## %bb.7: ## %cond.store7 -; AVX1OR2-NEXT: vmovhpd %xmm0, (%rdi) +; AVX1OR2-NEXT: vmovhps %xmm0, (%rdi) ; AVX1OR2-NEXT: addq $8, %rdi ; AVX1OR2-NEXT: LBB1_8: ## %else8 ; AVX1OR2-NEXT: vpextrb $4, %xmm4, %eax ; AVX1OR2-NEXT: testb $1, %al ; AVX1OR2-NEXT: je LBB1_10 ; AVX1OR2-NEXT: ## %bb.9: ## %cond.store10 -; AVX1OR2-NEXT: vmovlpd %xmm1, (%rdi) +; AVX1OR2-NEXT: vmovlps %xmm1, (%rdi) ; AVX1OR2-NEXT: addq $8, %rdi ; AVX1OR2-NEXT: LBB1_10: ## %else11 ; AVX1OR2-NEXT: vpextrb $5, %xmm4, %eax ; AVX1OR2-NEXT: testb $1, %al ; AVX1OR2-NEXT: je LBB1_12 ; AVX1OR2-NEXT: ## %bb.11: ## %cond.store13 -; AVX1OR2-NEXT: vmovhpd %xmm1, (%rdi) +; AVX1OR2-NEXT: vmovhps %xmm1, (%rdi) ; AVX1OR2-NEXT: addq $8, %rdi ; AVX1OR2-NEXT: LBB1_12: ## %else14 ; AVX1OR2-NEXT: vpextrb $6, %xmm4, %eax @@ -507,28 +507,28 @@ define void @compressstore_v16f64_v16i1(double* %base, <16 x double> %V, <16 x i ; AVX1OR2-NEXT: vextractf128 $1, %ymm1, %xmm0 ; AVX1OR2-NEXT: je LBB1_14 ; AVX1OR2-NEXT: ## %bb.13: ## %cond.store16 -; AVX1OR2-NEXT: vmovlpd %xmm0, (%rdi) +; AVX1OR2-NEXT: vmovlps %xmm0, (%rdi) ; AVX1OR2-NEXT: addq $8, %rdi ; AVX1OR2-NEXT: LBB1_14: ## %else17 ; AVX1OR2-NEXT: vpextrb $7, %xmm4, %eax ; AVX1OR2-NEXT: testb $1, %al ; AVX1OR2-NEXT: je LBB1_16 ; AVX1OR2-NEXT: ## %bb.15: ## %cond.store19 -; AVX1OR2-NEXT: vmovhpd %xmm0, (%rdi) +; AVX1OR2-NEXT: vmovhps %xmm0, (%rdi) ; AVX1OR2-NEXT: addq $8, %rdi ; AVX1OR2-NEXT: LBB1_16: ## %else20 ; AVX1OR2-NEXT: vpextrb $8, %xmm4, %eax ; AVX1OR2-NEXT: testb $1, %al ; AVX1OR2-NEXT: je LBB1_18 ; AVX1OR2-NEXT: ## %bb.17: ## %cond.store22 -; AVX1OR2-NEXT: vmovlpd %xmm2, (%rdi) +; AVX1OR2-NEXT: vmovlps %xmm2, (%rdi) ; AVX1OR2-NEXT: addq $8, %rdi ; AVX1OR2-NEXT: LBB1_18: ## %else23 ; AVX1OR2-NEXT: vpextrb $9, %xmm4, %eax ; AVX1OR2-NEXT: testb $1, %al ; AVX1OR2-NEXT: je LBB1_20 ; AVX1OR2-NEXT: ## %bb.19: ## %cond.store25 -; AVX1OR2-NEXT: vmovhpd %xmm2, (%rdi) +; AVX1OR2-NEXT: vmovhps %xmm2, (%rdi) ; AVX1OR2-NEXT: addq $8, %rdi ; AVX1OR2-NEXT: LBB1_20: ## %else26 ; AVX1OR2-NEXT: vpextrb $10, %xmm4, %eax @@ -536,28 +536,28 @@ define void @compressstore_v16f64_v16i1(double* %base, <16 x double> %V, <16 x i ; AVX1OR2-NEXT: vextractf128 $1, %ymm2, %xmm0 ; AVX1OR2-NEXT: je LBB1_22 ; AVX1OR2-NEXT: ## %bb.21: ## %cond.store28 -; AVX1OR2-NEXT: vmovlpd %xmm0, (%rdi) +; AVX1OR2-NEXT: vmovlps %xmm0, (%rdi) ; AVX1OR2-NEXT: addq $8, %rdi ; AVX1OR2-NEXT: LBB1_22: ## %else29 ; AVX1OR2-NEXT: vpextrb $11, %xmm4, %eax ; AVX1OR2-NEXT: testb $1, %al ; AVX1OR2-NEXT: je LBB1_24 ; AVX1OR2-NEXT: ## %bb.23: ## %cond.store31 -; AVX1OR2-NEXT: vmovhpd %xmm0, (%rdi) +; AVX1OR2-NEXT: vmovhps %xmm0, (%rdi) ; AVX1OR2-NEXT: addq $8, %rdi ; AVX1OR2-NEXT: LBB1_24: ## %else32 ; AVX1OR2-NEXT: vpextrb $12, %xmm4, %eax ; AVX1OR2-NEXT: testb $1, %al ; AVX1OR2-NEXT: je LBB1_26 ; AVX1OR2-NEXT: ## %bb.25: ## %cond.store34 -; AVX1OR2-NEXT: vmovlpd %xmm3, (%rdi) +; AVX1OR2-NEXT: vmovlps %xmm3, (%rdi) ; AVX1OR2-NEXT: addq $8, %rdi ; AVX1OR2-NEXT: 
LBB1_26: ## %else35 ; AVX1OR2-NEXT: vpextrb $13, %xmm4, %eax ; AVX1OR2-NEXT: testb $1, %al ; AVX1OR2-NEXT: je LBB1_28 ; AVX1OR2-NEXT: ## %bb.27: ## %cond.store37 -; AVX1OR2-NEXT: vmovhpd %xmm3, (%rdi) +; AVX1OR2-NEXT: vmovhps %xmm3, (%rdi) ; AVX1OR2-NEXT: addq $8, %rdi ; AVX1OR2-NEXT: LBB1_28: ## %else38 ; AVX1OR2-NEXT: vpextrb $14, %xmm4, %eax @@ -565,14 +565,14 @@ define void @compressstore_v16f64_v16i1(double* %base, <16 x double> %V, <16 x i ; AVX1OR2-NEXT: vextractf128 $1, %ymm3, %xmm0 ; AVX1OR2-NEXT: je LBB1_30 ; AVX1OR2-NEXT: ## %bb.29: ## %cond.store40 -; AVX1OR2-NEXT: vmovlpd %xmm0, (%rdi) +; AVX1OR2-NEXT: vmovlps %xmm0, (%rdi) ; AVX1OR2-NEXT: addq $8, %rdi ; AVX1OR2-NEXT: LBB1_30: ## %else41 ; AVX1OR2-NEXT: vpextrb $15, %xmm4, %eax ; AVX1OR2-NEXT: testb $1, %al ; AVX1OR2-NEXT: je LBB1_32 ; AVX1OR2-NEXT: ## %bb.31: ## %cond.store43 -; AVX1OR2-NEXT: vmovhpd %xmm0, (%rdi) +; AVX1OR2-NEXT: vmovhps %xmm0, (%rdi) ; AVX1OR2-NEXT: LBB1_32: ## %else44 ; AVX1OR2-NEXT: vzeroupper ; AVX1OR2-NEXT: retq diff --git a/test/CodeGen/X86/masked_expandload.ll b/test/CodeGen/X86/masked_expandload.ll index 4eb2f73e572..3d996b03b8b 100644 --- a/test/CodeGen/X86/masked_expandload.ll +++ b/test/CodeGen/X86/masked_expandload.ll @@ -22,14 +22,14 @@ define <2 x double> @expandload_v2f64_v2i64(double* %base, <2 x double> %src0, < ; SSE2-NEXT: testb $1, %al ; SSE2-NEXT: je LBB0_2 ; SSE2-NEXT: ## %bb.1: ## %cond.load -; SSE2-NEXT: movlpd {{.*#+}} xmm0 = mem[0],xmm0[1] +; SSE2-NEXT: movlps {{.*#+}} xmm0 = mem[0,1],xmm0[2,3] ; SSE2-NEXT: addq $8, %rdi ; SSE2-NEXT: LBB0_2: ## %else ; SSE2-NEXT: pextrw $4, %xmm1, %eax ; SSE2-NEXT: testb $1, %al ; SSE2-NEXT: je LBB0_4 ; SSE2-NEXT: ## %bb.3: ## %cond.load1 -; SSE2-NEXT: movhpd {{.*#+}} xmm0 = xmm0[0],mem[0] +; SSE2-NEXT: movhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1] ; SSE2-NEXT: LBB0_4: ## %else2 ; SSE2-NEXT: retq ; @@ -41,14 +41,14 @@ define <2 x double> @expandload_v2f64_v2i64(double* %base, <2 x double> %src0, < ; SSE42-NEXT: testb $1, %al ; SSE42-NEXT: je LBB0_2 ; SSE42-NEXT: ## %bb.1: ## %cond.load -; SSE42-NEXT: movlpd {{.*#+}} xmm0 = mem[0],xmm0[1] +; SSE42-NEXT: movlps {{.*#+}} xmm0 = mem[0,1],xmm0[2,3] ; SSE42-NEXT: addq $8, %rdi ; SSE42-NEXT: LBB0_2: ## %else ; SSE42-NEXT: pextrb $8, %xmm2, %eax ; SSE42-NEXT: testb $1, %al ; SSE42-NEXT: je LBB0_4 ; SSE42-NEXT: ## %bb.3: ## %cond.load1 -; SSE42-NEXT: movhpd {{.*#+}} xmm0 = xmm0[0],mem[0] +; SSE42-NEXT: movhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1] ; SSE42-NEXT: LBB0_4: ## %else2 ; SSE42-NEXT: retq ; @@ -60,14 +60,14 @@ define <2 x double> @expandload_v2f64_v2i64(double* %base, <2 x double> %src0, < ; AVX1OR2-NEXT: testb $1, %al ; AVX1OR2-NEXT: je LBB0_2 ; AVX1OR2-NEXT: ## %bb.1: ## %cond.load -; AVX1OR2-NEXT: vmovlpd {{.*#+}} xmm0 = mem[0],xmm0[1] +; AVX1OR2-NEXT: vmovlps {{.*#+}} xmm0 = mem[0,1],xmm0[2,3] ; AVX1OR2-NEXT: addq $8, %rdi ; AVX1OR2-NEXT: LBB0_2: ## %else ; AVX1OR2-NEXT: vpextrb $8, %xmm1, %eax ; AVX1OR2-NEXT: testb $1, %al ; AVX1OR2-NEXT: je LBB0_4 ; AVX1OR2-NEXT: ## %bb.3: ## %cond.load1 -; AVX1OR2-NEXT: vmovhpd {{.*#+}} xmm0 = xmm0[0],mem[0] +; AVX1OR2-NEXT: vmovhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1] ; AVX1OR2-NEXT: LBB0_4: ## %else2 ; AVX1OR2-NEXT: retq ; @@ -104,14 +104,14 @@ define <4 x double> @expandload_v4f64_v4i64(double* %base, <4 x double> %src0, < ; SSE2-NEXT: testb $1, %al ; SSE2-NEXT: je LBB1_2 ; SSE2-NEXT: ## %bb.1: ## %cond.load -; SSE2-NEXT: movlpd {{.*#+}} xmm0 = mem[0],xmm0[1] +; SSE2-NEXT: movlps {{.*#+}} xmm0 = mem[0,1],xmm0[2,3] ; SSE2-NEXT: addq $8, %rdi ; SSE2-NEXT: LBB1_2: ## 
%else ; SSE2-NEXT: pextrw $4, %xmm2, %eax ; SSE2-NEXT: testb $1, %al ; SSE2-NEXT: je LBB1_4 ; SSE2-NEXT: ## %bb.3: ## %cond.load1 -; SSE2-NEXT: movhpd {{.*#+}} xmm0 = xmm0[0],mem[0] +; SSE2-NEXT: movhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1] ; SSE2-NEXT: addq $8, %rdi ; SSE2-NEXT: LBB1_4: ## %else2 ; SSE2-NEXT: pxor %xmm2, %xmm2 @@ -122,14 +122,14 @@ define <4 x double> @expandload_v4f64_v4i64(double* %base, <4 x double> %src0, < ; SSE2-NEXT: testb $1, %al ; SSE2-NEXT: je LBB1_6 ; SSE2-NEXT: ## %bb.5: ## %cond.load5 -; SSE2-NEXT: movlpd {{.*#+}} xmm1 = mem[0],xmm1[1] +; SSE2-NEXT: movlps {{.*#+}} xmm1 = mem[0,1],xmm1[2,3] ; SSE2-NEXT: addq $8, %rdi ; SSE2-NEXT: LBB1_6: ## %else6 ; SSE2-NEXT: pextrw $4, %xmm2, %eax ; SSE2-NEXT: testb $1, %al ; SSE2-NEXT: je LBB1_8 ; SSE2-NEXT: ## %bb.7: ## %cond.load9 -; SSE2-NEXT: movhpd {{.*#+}} xmm1 = xmm1[0],mem[0] +; SSE2-NEXT: movhps {{.*#+}} xmm1 = xmm1[0,1],mem[0,1] ; SSE2-NEXT: LBB1_8: ## %else10 ; SSE2-NEXT: retq ; @@ -141,14 +141,14 @@ define <4 x double> @expandload_v4f64_v4i64(double* %base, <4 x double> %src0, < ; SSE42-NEXT: testb $1, %al ; SSE42-NEXT: je LBB1_2 ; SSE42-NEXT: ## %bb.1: ## %cond.load -; SSE42-NEXT: movlpd {{.*#+}} xmm0 = mem[0],xmm0[1] +; SSE42-NEXT: movlps {{.*#+}} xmm0 = mem[0,1],xmm0[2,3] ; SSE42-NEXT: addq $8, %rdi ; SSE42-NEXT: LBB1_2: ## %else ; SSE42-NEXT: pextrb $8, %xmm4, %eax ; SSE42-NEXT: testb $1, %al ; SSE42-NEXT: je LBB1_4 ; SSE42-NEXT: ## %bb.3: ## %cond.load1 -; SSE42-NEXT: movhpd {{.*#+}} xmm0 = xmm0[0],mem[0] +; SSE42-NEXT: movhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1] ; SSE42-NEXT: addq $8, %rdi ; SSE42-NEXT: LBB1_4: ## %else2 ; SSE42-NEXT: pxor %xmm2, %xmm2 @@ -157,14 +157,14 @@ define <4 x double> @expandload_v4f64_v4i64(double* %base, <4 x double> %src0, < ; SSE42-NEXT: testb $1, %al ; SSE42-NEXT: je LBB1_6 ; SSE42-NEXT: ## %bb.5: ## %cond.load5 -; SSE42-NEXT: movlpd {{.*#+}} xmm1 = mem[0],xmm1[1] +; SSE42-NEXT: movlps {{.*#+}} xmm1 = mem[0,1],xmm1[2,3] ; SSE42-NEXT: addq $8, %rdi ; SSE42-NEXT: LBB1_6: ## %else6 ; SSE42-NEXT: pextrb $8, %xmm3, %eax ; SSE42-NEXT: testb $1, %al ; SSE42-NEXT: je LBB1_8 ; SSE42-NEXT: ## %bb.7: ## %cond.load9 -; SSE42-NEXT: movhpd {{.*#+}} xmm1 = xmm1[0],mem[0] +; SSE42-NEXT: movhps {{.*#+}} xmm1 = xmm1[0,1],mem[0,1] ; SSE42-NEXT: LBB1_8: ## %else10 ; SSE42-NEXT: retq ; @@ -185,19 +185,19 @@ define <4 x double> @expandload_v4f64_v4i64(double* %base, <4 x double> %src0, < ; AVX1-NEXT: testb $1, %al ; AVX1-NEXT: je LBB1_4 ; AVX1-NEXT: ## %bb.3: ## %cond.load1 -; AVX1-NEXT: vmovhpd {{.*#+}} xmm2 = xmm0[0],mem[0] -; AVX1-NEXT: vblendpd {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3] +; AVX1-NEXT: vmovhps {{.*#+}} xmm2 = xmm0[0,1],mem[0,1] +; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] ; AVX1-NEXT: addq $8, %rdi ; AVX1-NEXT: LBB1_4: ## %else2 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 -; AVX1-NEXT: vxorpd %xmm2, %xmm2, %xmm2 +; AVX1-NEXT: vxorps %xmm2, %xmm2, %xmm2 ; AVX1-NEXT: vpcmpeqq %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vpextrb $0, %xmm1, %eax ; AVX1-NEXT: testb $1, %al ; AVX1-NEXT: je LBB1_6 ; AVX1-NEXT: ## %bb.5: ## %cond.load5 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 -; AVX1-NEXT: vmovlpd {{.*#+}} xmm2 = mem[0],xmm2[1] +; AVX1-NEXT: vmovlps {{.*#+}} xmm2 = mem[0,1],xmm2[2,3] ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 ; AVX1-NEXT: addq $8, %rdi ; AVX1-NEXT: LBB1_6: ## %else6 @@ -206,7 +206,7 @@ define <4 x double> @expandload_v4f64_v4i64(double* %base, <4 x double> %src0, < ; AVX1-NEXT: je LBB1_8 ; AVX1-NEXT: ## %bb.7: ## %cond.load9 ; AVX1-NEXT: vextractf128 $1, %ymm0, 
%xmm1 -; AVX1-NEXT: vmovhpd {{.*#+}} xmm1 = xmm1[0],mem[0] +; AVX1-NEXT: vmovhps {{.*#+}} xmm1 = xmm1[0,1],mem[0,1] ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX1-NEXT: LBB1_8: ## %else10 ; AVX1-NEXT: retq @@ -228,29 +228,29 @@ define <4 x double> @expandload_v4f64_v4i64(double* %base, <4 x double> %src0, < ; AVX2-NEXT: testb $1, %al ; AVX2-NEXT: je LBB1_4 ; AVX2-NEXT: ## %bb.3: ## %cond.load1 -; AVX2-NEXT: vmovhpd {{.*#+}} xmm2 = xmm0[0],mem[0] -; AVX2-NEXT: vblendpd {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3] +; AVX2-NEXT: vmovhps {{.*#+}} xmm2 = xmm0[0,1],mem[0,1] +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] ; AVX2-NEXT: addq $8, %rdi ; AVX2-NEXT: LBB1_4: ## %else2 ; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm1 -; AVX2-NEXT: vxorpd %xmm2, %xmm2, %xmm2 +; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX2-NEXT: vpcmpeqq %xmm2, %xmm1, %xmm1 ; AVX2-NEXT: vpextrb $0, %xmm1, %eax ; AVX2-NEXT: testb $1, %al ; AVX2-NEXT: je LBB1_6 ; AVX2-NEXT: ## %bb.5: ## %cond.load5 -; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm2 -; AVX2-NEXT: vmovlpd {{.*#+}} xmm2 = mem[0],xmm2[1] -; AVX2-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2 +; AVX2-NEXT: vmovlps {{.*#+}} xmm2 = mem[0,1],xmm2[2,3] +; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 ; AVX2-NEXT: addq $8, %rdi ; AVX2-NEXT: LBB1_6: ## %else6 ; AVX2-NEXT: vpextrb $8, %xmm1, %eax ; AVX2-NEXT: testb $1, %al ; AVX2-NEXT: je LBB1_8 ; AVX2-NEXT: ## %bb.7: ## %cond.load9 -; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vmovhpd {{.*#+}} xmm1 = xmm1[0],mem[0] -; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vmovhps {{.*#+}} xmm1 = xmm1[0,1],mem[0,1] +; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 ; AVX2-NEXT: LBB1_8: ## %else10 ; AVX2-NEXT: retq ; @@ -282,56 +282,56 @@ define <8 x double> @expandload_v8f64_v8i1(double* %base, <8 x double> %src0, <8 ; SSE2-NEXT: testb $1, %al ; SSE2-NEXT: je LBB2_2 ; SSE2-NEXT: ## %bb.1: ## %cond.load -; SSE2-NEXT: movlpd {{.*#+}} xmm0 = mem[0],xmm0[1] +; SSE2-NEXT: movlps {{.*#+}} xmm0 = mem[0,1],xmm0[2,3] ; SSE2-NEXT: addq $8, %rdi ; SSE2-NEXT: LBB2_2: ## %else ; SSE2-NEXT: shrl $16, %eax ; SSE2-NEXT: testb $1, %al ; SSE2-NEXT: je LBB2_4 ; SSE2-NEXT: ## %bb.3: ## %cond.load1 -; SSE2-NEXT: movhpd {{.*#+}} xmm0 = xmm0[0],mem[0] +; SSE2-NEXT: movhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1] ; SSE2-NEXT: addq $8, %rdi ; SSE2-NEXT: LBB2_4: ## %else2 ; SSE2-NEXT: pextrw $2, %xmm4, %eax ; SSE2-NEXT: testb $1, %al ; SSE2-NEXT: je LBB2_6 ; SSE2-NEXT: ## %bb.5: ## %cond.load5 -; SSE2-NEXT: movlpd {{.*#+}} xmm1 = mem[0],xmm1[1] +; SSE2-NEXT: movlps {{.*#+}} xmm1 = mem[0,1],xmm1[2,3] ; SSE2-NEXT: addq $8, %rdi ; SSE2-NEXT: LBB2_6: ## %else6 ; SSE2-NEXT: pextrw $3, %xmm4, %eax ; SSE2-NEXT: testb $1, %al ; SSE2-NEXT: je LBB2_8 ; SSE2-NEXT: ## %bb.7: ## %cond.load9 -; SSE2-NEXT: movhpd {{.*#+}} xmm1 = xmm1[0],mem[0] +; SSE2-NEXT: movhps {{.*#+}} xmm1 = xmm1[0,1],mem[0,1] ; SSE2-NEXT: addq $8, %rdi ; SSE2-NEXT: LBB2_8: ## %else10 ; SSE2-NEXT: pextrw $4, %xmm4, %eax ; SSE2-NEXT: testb $1, %al ; SSE2-NEXT: je LBB2_10 ; SSE2-NEXT: ## %bb.9: ## %cond.load13 -; SSE2-NEXT: movlpd {{.*#+}} xmm2 = mem[0],xmm2[1] +; SSE2-NEXT: movlps {{.*#+}} xmm2 = mem[0,1],xmm2[2,3] ; SSE2-NEXT: addq $8, %rdi ; SSE2-NEXT: LBB2_10: ## %else14 ; SSE2-NEXT: pextrw $5, %xmm4, %eax ; SSE2-NEXT: testb $1, %al ; SSE2-NEXT: je LBB2_12 ; SSE2-NEXT: ## %bb.11: ## %cond.load17 -; SSE2-NEXT: movhpd {{.*#+}} xmm2 = xmm2[0],mem[0] +; SSE2-NEXT: movhps {{.*#+}} xmm2 = 
xmm2[0,1],mem[0,1] ; SSE2-NEXT: addq $8, %rdi ; SSE2-NEXT: LBB2_12: ## %else18 ; SSE2-NEXT: pextrw $6, %xmm4, %eax ; SSE2-NEXT: testb $1, %al ; SSE2-NEXT: je LBB2_14 ; SSE2-NEXT: ## %bb.13: ## %cond.load21 -; SSE2-NEXT: movlpd {{.*#+}} xmm3 = mem[0],xmm3[1] +; SSE2-NEXT: movlps {{.*#+}} xmm3 = mem[0,1],xmm3[2,3] ; SSE2-NEXT: addq $8, %rdi ; SSE2-NEXT: LBB2_14: ## %else22 ; SSE2-NEXT: pextrw $7, %xmm4, %eax ; SSE2-NEXT: testb $1, %al ; SSE2-NEXT: je LBB2_16 ; SSE2-NEXT: ## %bb.15: ## %cond.load25 -; SSE2-NEXT: movhpd {{.*#+}} xmm3 = xmm3[0],mem[0] +; SSE2-NEXT: movhps {{.*#+}} xmm3 = xmm3[0,1],mem[0,1] ; SSE2-NEXT: LBB2_16: ## %else26 ; SSE2-NEXT: retq ; @@ -341,56 +341,56 @@ define <8 x double> @expandload_v8f64_v8i1(double* %base, <8 x double> %src0, <8 ; SSE42-NEXT: testb $1, %al ; SSE42-NEXT: je LBB2_2 ; SSE42-NEXT: ## %bb.1: ## %cond.load -; SSE42-NEXT: movlpd {{.*#+}} xmm0 = mem[0],xmm0[1] +; SSE42-NEXT: movlps {{.*#+}} xmm0 = mem[0,1],xmm0[2,3] ; SSE42-NEXT: addq $8, %rdi ; SSE42-NEXT: LBB2_2: ## %else ; SSE42-NEXT: pextrb $2, %xmm4, %eax ; SSE42-NEXT: testb $1, %al ; SSE42-NEXT: je LBB2_4 ; SSE42-NEXT: ## %bb.3: ## %cond.load1 -; SSE42-NEXT: movhpd {{.*#+}} xmm0 = xmm0[0],mem[0] +; SSE42-NEXT: movhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1] ; SSE42-NEXT: addq $8, %rdi ; SSE42-NEXT: LBB2_4: ## %else2 ; SSE42-NEXT: pextrb $4, %xmm4, %eax ; SSE42-NEXT: testb $1, %al ; SSE42-NEXT: je LBB2_6 ; SSE42-NEXT: ## %bb.5: ## %cond.load5 -; SSE42-NEXT: movlpd {{.*#+}} xmm1 = mem[0],xmm1[1] +; SSE42-NEXT: movlps {{.*#+}} xmm1 = mem[0,1],xmm1[2,3] ; SSE42-NEXT: addq $8, %rdi ; SSE42-NEXT: LBB2_6: ## %else6 ; SSE42-NEXT: pextrb $6, %xmm4, %eax ; SSE42-NEXT: testb $1, %al ; SSE42-NEXT: je LBB2_8 ; SSE42-NEXT: ## %bb.7: ## %cond.load9 -; SSE42-NEXT: movhpd {{.*#+}} xmm1 = xmm1[0],mem[0] +; SSE42-NEXT: movhps {{.*#+}} xmm1 = xmm1[0,1],mem[0,1] ; SSE42-NEXT: addq $8, %rdi ; SSE42-NEXT: LBB2_8: ## %else10 ; SSE42-NEXT: pextrb $8, %xmm4, %eax ; SSE42-NEXT: testb $1, %al ; SSE42-NEXT: je LBB2_10 ; SSE42-NEXT: ## %bb.9: ## %cond.load13 -; SSE42-NEXT: movlpd {{.*#+}} xmm2 = mem[0],xmm2[1] +; SSE42-NEXT: movlps {{.*#+}} xmm2 = mem[0,1],xmm2[2,3] ; SSE42-NEXT: addq $8, %rdi ; SSE42-NEXT: LBB2_10: ## %else14 ; SSE42-NEXT: pextrb $10, %xmm4, %eax ; SSE42-NEXT: testb $1, %al ; SSE42-NEXT: je LBB2_12 ; SSE42-NEXT: ## %bb.11: ## %cond.load17 -; SSE42-NEXT: movhpd {{.*#+}} xmm2 = xmm2[0],mem[0] +; SSE42-NEXT: movhps {{.*#+}} xmm2 = xmm2[0,1],mem[0,1] ; SSE42-NEXT: addq $8, %rdi ; SSE42-NEXT: LBB2_12: ## %else18 ; SSE42-NEXT: pextrb $12, %xmm4, %eax ; SSE42-NEXT: testb $1, %al ; SSE42-NEXT: je LBB2_14 ; SSE42-NEXT: ## %bb.13: ## %cond.load21 -; SSE42-NEXT: movlpd {{.*#+}} xmm3 = mem[0],xmm3[1] +; SSE42-NEXT: movlps {{.*#+}} xmm3 = mem[0,1],xmm3[2,3] ; SSE42-NEXT: addq $8, %rdi ; SSE42-NEXT: LBB2_14: ## %else22 ; SSE42-NEXT: pextrb $14, %xmm4, %eax ; SSE42-NEXT: testb $1, %al ; SSE42-NEXT: je LBB2_16 ; SSE42-NEXT: ## %bb.15: ## %cond.load25 -; SSE42-NEXT: movhpd {{.*#+}} xmm3 = xmm3[0],mem[0] +; SSE42-NEXT: movhps {{.*#+}} xmm3 = xmm3[0,1],mem[0,1] ; SSE42-NEXT: LBB2_16: ## %else26 ; SSE42-NEXT: retq ; @@ -401,15 +401,15 @@ define <8 x double> @expandload_v8f64_v8i1(double* %base, <8 x double> %src0, <8 ; AVX1OR2-NEXT: je LBB2_2 ; AVX1OR2-NEXT: ## %bb.1: ## %cond.load ; AVX1OR2-NEXT: vmovsd {{.*#+}} xmm3 = mem[0],zero -; AVX1OR2-NEXT: vblendpd {{.*#+}} ymm0 = ymm3[0],ymm0[1,2,3] +; AVX1OR2-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1],ymm0[2,3,4,5,6,7] ; AVX1OR2-NEXT: addq $8, %rdi ; AVX1OR2-NEXT: LBB2_2: ## %else ; 
AVX1OR2-NEXT: vpextrb $2, %xmm2, %eax ; AVX1OR2-NEXT: testb $1, %al ; AVX1OR2-NEXT: je LBB2_4 ; AVX1OR2-NEXT: ## %bb.3: ## %cond.load1 -; AVX1OR2-NEXT: vmovhpd {{.*#+}} xmm3 = xmm0[0],mem[0] -; AVX1OR2-NEXT: vblendpd {{.*#+}} ymm0 = ymm3[0,1],ymm0[2,3] +; AVX1OR2-NEXT: vmovhps {{.*#+}} xmm3 = xmm0[0,1],mem[0,1] +; AVX1OR2-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] ; AVX1OR2-NEXT: addq $8, %rdi ; AVX1OR2-NEXT: LBB2_4: ## %else2 ; AVX1OR2-NEXT: vpextrb $4, %xmm2, %eax @@ -417,7 +417,7 @@ define <8 x double> @expandload_v8f64_v8i1(double* %base, <8 x double> %src0, <8 ; AVX1OR2-NEXT: je LBB2_6 ; AVX1OR2-NEXT: ## %bb.5: ## %cond.load5 ; AVX1OR2-NEXT: vextractf128 $1, %ymm0, %xmm3 -; AVX1OR2-NEXT: vmovlpd {{.*#+}} xmm3 = mem[0],xmm3[1] +; AVX1OR2-NEXT: vmovlps {{.*#+}} xmm3 = mem[0,1],xmm3[2,3] ; AVX1OR2-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 ; AVX1OR2-NEXT: addq $8, %rdi ; AVX1OR2-NEXT: LBB2_6: ## %else6 @@ -426,7 +426,7 @@ define <8 x double> @expandload_v8f64_v8i1(double* %base, <8 x double> %src0, <8 ; AVX1OR2-NEXT: je LBB2_8 ; AVX1OR2-NEXT: ## %bb.7: ## %cond.load9 ; AVX1OR2-NEXT: vextractf128 $1, %ymm0, %xmm3 -; AVX1OR2-NEXT: vmovhpd {{.*#+}} xmm3 = xmm3[0],mem[0] +; AVX1OR2-NEXT: vmovhps {{.*#+}} xmm3 = xmm3[0,1],mem[0,1] ; AVX1OR2-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 ; AVX1OR2-NEXT: addq $8, %rdi ; AVX1OR2-NEXT: LBB2_8: ## %else10 @@ -435,15 +435,15 @@ define <8 x double> @expandload_v8f64_v8i1(double* %base, <8 x double> %src0, <8 ; AVX1OR2-NEXT: je LBB2_10 ; AVX1OR2-NEXT: ## %bb.9: ## %cond.load13 ; AVX1OR2-NEXT: vmovsd {{.*#+}} xmm3 = mem[0],zero -; AVX1OR2-NEXT: vblendpd {{.*#+}} ymm1 = ymm3[0],ymm1[1,2,3] +; AVX1OR2-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1],ymm1[2,3,4,5,6,7] ; AVX1OR2-NEXT: addq $8, %rdi ; AVX1OR2-NEXT: LBB2_10: ## %else14 ; AVX1OR2-NEXT: vpextrb $10, %xmm2, %eax ; AVX1OR2-NEXT: testb $1, %al ; AVX1OR2-NEXT: je LBB2_12 ; AVX1OR2-NEXT: ## %bb.11: ## %cond.load17 -; AVX1OR2-NEXT: vmovhpd {{.*#+}} xmm3 = xmm1[0],mem[0] -; AVX1OR2-NEXT: vblendpd {{.*#+}} ymm1 = ymm3[0,1],ymm1[2,3] +; AVX1OR2-NEXT: vmovhps {{.*#+}} xmm3 = xmm1[0,1],mem[0,1] +; AVX1OR2-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] ; AVX1OR2-NEXT: addq $8, %rdi ; AVX1OR2-NEXT: LBB2_12: ## %else18 ; AVX1OR2-NEXT: vpextrb $12, %xmm2, %eax @@ -451,7 +451,7 @@ define <8 x double> @expandload_v8f64_v8i1(double* %base, <8 x double> %src0, <8 ; AVX1OR2-NEXT: je LBB2_14 ; AVX1OR2-NEXT: ## %bb.13: ## %cond.load21 ; AVX1OR2-NEXT: vextractf128 $1, %ymm1, %xmm3 -; AVX1OR2-NEXT: vmovlpd {{.*#+}} xmm3 = mem[0],xmm3[1] +; AVX1OR2-NEXT: vmovlps {{.*#+}} xmm3 = mem[0,1],xmm3[2,3] ; AVX1OR2-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 ; AVX1OR2-NEXT: addq $8, %rdi ; AVX1OR2-NEXT: LBB2_14: ## %else22 @@ -460,7 +460,7 @@ define <8 x double> @expandload_v8f64_v8i1(double* %base, <8 x double> %src0, <8 ; AVX1OR2-NEXT: je LBB2_16 ; AVX1OR2-NEXT: ## %bb.15: ## %cond.load25 ; AVX1OR2-NEXT: vextractf128 $1, %ymm1, %xmm2 -; AVX1OR2-NEXT: vmovhpd {{.*#+}} xmm2 = xmm2[0],mem[0] +; AVX1OR2-NEXT: vmovhps {{.*#+}} xmm2 = xmm2[0,1],mem[0,1] ; AVX1OR2-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 ; AVX1OR2-NEXT: LBB2_16: ## %else26 ; AVX1OR2-NEXT: retq @@ -502,14 +502,14 @@ define <16 x double> @expandload_v16f64_v16i32(double* %base, <16 x double> %src ; SSE2-NEXT: testb $1, %cl ; SSE2-NEXT: je LBB3_2 ; SSE2-NEXT: ## %bb.1: ## %cond.load -; SSE2-NEXT: movlpd {{.*#+}} xmm0 = mem[0],xmm0[1] +; SSE2-NEXT: movlps {{.*#+}} xmm0 = mem[0,1],xmm0[2,3] ; SSE2-NEXT: addq $8, %rsi ; SSE2-NEXT: LBB3_2: ## %else ; 
SSE2-NEXT: pextrw $2, %xmm8, %ecx ; SSE2-NEXT: testb $1, %cl ; SSE2-NEXT: je LBB3_4 ; SSE2-NEXT: ## %bb.3: ## %cond.load1 -; SSE2-NEXT: movhpd {{.*#+}} xmm0 = xmm0[0],mem[0] +; SSE2-NEXT: movhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1] ; SSE2-NEXT: addq $8, %rsi ; SSE2-NEXT: LBB3_4: ## %else2 ; SSE2-NEXT: pxor %xmm8, %xmm8 @@ -518,7 +518,7 @@ define <16 x double> @expandload_v16f64_v16i32(double* %base, <16 x double> %src ; SSE2-NEXT: testb $1, %cl ; SSE2-NEXT: je LBB3_6 ; SSE2-NEXT: ## %bb.5: ## %cond.load5 -; SSE2-NEXT: movlpd {{.*#+}} xmm1 = mem[0],xmm1[1] +; SSE2-NEXT: movlps {{.*#+}} xmm1 = mem[0,1],xmm1[2,3] ; SSE2-NEXT: addq $8, %rsi ; SSE2-NEXT: LBB3_6: ## %else6 ; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm8 @@ -526,7 +526,7 @@ define <16 x double> @expandload_v16f64_v16i32(double* %base, <16 x double> %src ; SSE2-NEXT: testb $1, %cl ; SSE2-NEXT: je LBB3_8 ; SSE2-NEXT: ## %bb.7: ## %cond.load9 -; SSE2-NEXT: movhpd {{.*#+}} xmm1 = xmm1[0],mem[0] +; SSE2-NEXT: movhps {{.*#+}} xmm1 = xmm1[0,1],mem[0,1] ; SSE2-NEXT: addq $8, %rsi ; SSE2-NEXT: LBB3_8: ## %else10 ; SSE2-NEXT: pxor %xmm9, %xmm9 @@ -535,14 +535,14 @@ define <16 x double> @expandload_v16f64_v16i32(double* %base, <16 x double> %src ; SSE2-NEXT: testb $1, %cl ; SSE2-NEXT: je LBB3_10 ; SSE2-NEXT: ## %bb.9: ## %cond.load13 -; SSE2-NEXT: movlpd {{.*#+}} xmm2 = mem[0],xmm2[1] +; SSE2-NEXT: movlps {{.*#+}} xmm2 = mem[0,1],xmm2[2,3] ; SSE2-NEXT: addq $8, %rsi ; SSE2-NEXT: LBB3_10: ## %else14 ; SSE2-NEXT: pextrw $2, %xmm9, %ecx ; SSE2-NEXT: testb $1, %cl ; SSE2-NEXT: je LBB3_12 ; SSE2-NEXT: ## %bb.11: ## %cond.load17 -; SSE2-NEXT: movhpd {{.*#+}} xmm2 = xmm2[0],mem[0] +; SSE2-NEXT: movhps {{.*#+}} xmm2 = xmm2[0,1],mem[0,1] ; SSE2-NEXT: addq $8, %rsi ; SSE2-NEXT: LBB3_12: ## %else18 ; SSE2-NEXT: pxor %xmm9, %xmm9 @@ -551,7 +551,7 @@ define <16 x double> @expandload_v16f64_v16i32(double* %base, <16 x double> %src ; SSE2-NEXT: testb $1, %cl ; SSE2-NEXT: je LBB3_14 ; SSE2-NEXT: ## %bb.13: ## %cond.load21 -; SSE2-NEXT: movlpd {{.*#+}} xmm3 = mem[0],xmm3[1] +; SSE2-NEXT: movlps {{.*#+}} xmm3 = mem[0,1],xmm3[2,3] ; SSE2-NEXT: addq $8, %rsi ; SSE2-NEXT: LBB3_14: ## %else22 ; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm9 @@ -559,7 +559,7 @@ define <16 x double> @expandload_v16f64_v16i32(double* %base, <16 x double> %src ; SSE2-NEXT: testb $1, %cl ; SSE2-NEXT: je LBB3_16 ; SSE2-NEXT: ## %bb.15: ## %cond.load25 -; SSE2-NEXT: movhpd {{.*#+}} xmm3 = xmm3[0],mem[0] +; SSE2-NEXT: movhps {{.*#+}} xmm3 = xmm3[0,1],mem[0,1] ; SSE2-NEXT: addq $8, %rsi ; SSE2-NEXT: LBB3_16: ## %else26 ; SSE2-NEXT: pxor %xmm8, %xmm8 @@ -568,14 +568,14 @@ define <16 x double> @expandload_v16f64_v16i32(double* %base, <16 x double> %src ; SSE2-NEXT: testb $1, %cl ; SSE2-NEXT: je LBB3_18 ; SSE2-NEXT: ## %bb.17: ## %cond.load29 -; SSE2-NEXT: movlpd {{.*#+}} xmm4 = mem[0],xmm4[1] +; SSE2-NEXT: movlps {{.*#+}} xmm4 = mem[0,1],xmm4[2,3] ; SSE2-NEXT: addq $8, %rsi ; SSE2-NEXT: LBB3_18: ## %else30 ; SSE2-NEXT: pextrw $2, %xmm8, %ecx ; SSE2-NEXT: testb $1, %cl ; SSE2-NEXT: je LBB3_20 ; SSE2-NEXT: ## %bb.19: ## %cond.load33 -; SSE2-NEXT: movhpd {{.*#+}} xmm4 = xmm4[0],mem[0] +; SSE2-NEXT: movhps {{.*#+}} xmm4 = xmm4[0,1],mem[0,1] ; SSE2-NEXT: addq $8, %rsi ; SSE2-NEXT: LBB3_20: ## %else34 ; SSE2-NEXT: pxor %xmm8, %xmm8 @@ -584,7 +584,7 @@ define <16 x double> @expandload_v16f64_v16i32(double* %base, <16 x double> %src ; SSE2-NEXT: testb $1, %cl ; SSE2-NEXT: je LBB3_22 ; SSE2-NEXT: ## %bb.21: ## %cond.load37 -; SSE2-NEXT: movlpd {{.*#+}} xmm5 = mem[0],xmm5[1] +; SSE2-NEXT: movlps {{.*#+}} xmm5 
= mem[0,1],xmm5[2,3] ; SSE2-NEXT: addq $8, %rsi ; SSE2-NEXT: LBB3_22: ## %else38 ; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm8 @@ -592,7 +592,7 @@ define <16 x double> @expandload_v16f64_v16i32(double* %base, <16 x double> %src ; SSE2-NEXT: testb $1, %cl ; SSE2-NEXT: je LBB3_24 ; SSE2-NEXT: ## %bb.23: ## %cond.load41 -; SSE2-NEXT: movhpd {{.*#+}} xmm5 = xmm5[0],mem[0] +; SSE2-NEXT: movhps {{.*#+}} xmm5 = xmm5[0,1],mem[0,1] ; SSE2-NEXT: addq $8, %rsi ; SSE2-NEXT: LBB3_24: ## %else42 ; SSE2-NEXT: pxor %xmm9, %xmm9 @@ -601,14 +601,14 @@ define <16 x double> @expandload_v16f64_v16i32(double* %base, <16 x double> %src ; SSE2-NEXT: testb $1, %cl ; SSE2-NEXT: je LBB3_26 ; SSE2-NEXT: ## %bb.25: ## %cond.load45 -; SSE2-NEXT: movlpd {{.*#+}} xmm6 = mem[0],xmm6[1] +; SSE2-NEXT: movlps {{.*#+}} xmm6 = mem[0,1],xmm6[2,3] ; SSE2-NEXT: addq $8, %rsi ; SSE2-NEXT: LBB3_26: ## %else46 ; SSE2-NEXT: pextrw $2, %xmm9, %ecx ; SSE2-NEXT: testb $1, %cl ; SSE2-NEXT: je LBB3_28 ; SSE2-NEXT: ## %bb.27: ## %cond.load49 -; SSE2-NEXT: movhpd {{.*#+}} xmm6 = xmm6[0],mem[0] +; SSE2-NEXT: movhps {{.*#+}} xmm6 = xmm6[0,1],mem[0,1] ; SSE2-NEXT: addq $8, %rsi ; SSE2-NEXT: LBB3_28: ## %else50 ; SSE2-NEXT: pxor %xmm9, %xmm9 @@ -617,23 +617,23 @@ define <16 x double> @expandload_v16f64_v16i32(double* %base, <16 x double> %src ; SSE2-NEXT: testb $1, %cl ; SSE2-NEXT: je LBB3_30 ; SSE2-NEXT: ## %bb.29: ## %cond.load53 -; SSE2-NEXT: movlpd {{.*#+}} xmm7 = mem[0],xmm7[1] +; SSE2-NEXT: movlps {{.*#+}} xmm7 = mem[0,1],xmm7[2,3] ; SSE2-NEXT: addq $8, %rsi ; SSE2-NEXT: LBB3_30: ## %else54 ; SSE2-NEXT: pextrw $6, %xmm8, %ecx ; SSE2-NEXT: testb $1, %cl ; SSE2-NEXT: je LBB3_32 ; SSE2-NEXT: ## %bb.31: ## %cond.load57 -; SSE2-NEXT: movhpd {{.*#+}} xmm7 = xmm7[0],mem[0] +; SSE2-NEXT: movhps {{.*#+}} xmm7 = xmm7[0,1],mem[0,1] ; SSE2-NEXT: LBB3_32: ## %else58 -; SSE2-NEXT: movapd %xmm0, (%rax) -; SSE2-NEXT: movapd %xmm1, 16(%rax) -; SSE2-NEXT: movapd %xmm2, 32(%rax) -; SSE2-NEXT: movapd %xmm3, 48(%rax) -; SSE2-NEXT: movapd %xmm4, 64(%rax) -; SSE2-NEXT: movapd %xmm5, 80(%rax) -; SSE2-NEXT: movapd %xmm6, 96(%rax) -; SSE2-NEXT: movapd %xmm7, 112(%rax) +; SSE2-NEXT: movaps %xmm0, (%rax) +; SSE2-NEXT: movaps %xmm1, 16(%rax) +; SSE2-NEXT: movaps %xmm2, 32(%rax) +; SSE2-NEXT: movaps %xmm3, 48(%rax) +; SSE2-NEXT: movaps %xmm4, 64(%rax) +; SSE2-NEXT: movaps %xmm5, 80(%rax) +; SSE2-NEXT: movaps %xmm6, 96(%rax) +; SSE2-NEXT: movaps %xmm7, 112(%rax) ; SSE2-NEXT: retq ; ; SSE42-LABEL: expandload_v16f64_v16i32: @@ -646,14 +646,14 @@ define <16 x double> @expandload_v16f64_v16i32(double* %base, <16 x double> %src ; SSE42-NEXT: testb $1, %cl ; SSE42-NEXT: je LBB3_2 ; SSE42-NEXT: ## %bb.1: ## %cond.load -; SSE42-NEXT: movlpd {{.*#+}} xmm0 = mem[0],xmm0[1] +; SSE42-NEXT: movlps {{.*#+}} xmm0 = mem[0,1],xmm0[2,3] ; SSE42-NEXT: addq $8, %rsi ; SSE42-NEXT: LBB3_2: ## %else ; SSE42-NEXT: pextrb $4, %xmm8, %ecx ; SSE42-NEXT: testb $1, %cl ; SSE42-NEXT: je LBB3_4 ; SSE42-NEXT: ## %bb.3: ## %cond.load1 -; SSE42-NEXT: movhpd {{.*#+}} xmm0 = xmm0[0],mem[0] +; SSE42-NEXT: movhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1] ; SSE42-NEXT: addq $8, %rsi ; SSE42-NEXT: LBB3_4: ## %else2 ; SSE42-NEXT: pxor %xmm8, %xmm8 @@ -662,7 +662,7 @@ define <16 x double> @expandload_v16f64_v16i32(double* %base, <16 x double> %src ; SSE42-NEXT: testb $1, %cl ; SSE42-NEXT: je LBB3_6 ; SSE42-NEXT: ## %bb.5: ## %cond.load5 -; SSE42-NEXT: movlpd {{.*#+}} xmm1 = mem[0],xmm1[1] +; SSE42-NEXT: movlps {{.*#+}} xmm1 = mem[0,1],xmm1[2,3] ; SSE42-NEXT: addq $8, %rsi ; SSE42-NEXT: LBB3_6: ## %else6 ; SSE42-NEXT: 
movdqa {{[0-9]+}}(%rsp), %xmm8 @@ -670,7 +670,7 @@ define <16 x double> @expandload_v16f64_v16i32(double* %base, <16 x double> %src ; SSE42-NEXT: testb $1, %cl ; SSE42-NEXT: je LBB3_8 ; SSE42-NEXT: ## %bb.7: ## %cond.load9 -; SSE42-NEXT: movhpd {{.*#+}} xmm1 = xmm1[0],mem[0] +; SSE42-NEXT: movhps {{.*#+}} xmm1 = xmm1[0,1],mem[0,1] ; SSE42-NEXT: addq $8, %rsi ; SSE42-NEXT: LBB3_8: ## %else10 ; SSE42-NEXT: pxor %xmm9, %xmm9 @@ -679,14 +679,14 @@ define <16 x double> @expandload_v16f64_v16i32(double* %base, <16 x double> %src ; SSE42-NEXT: testb $1, %cl ; SSE42-NEXT: je LBB3_10 ; SSE42-NEXT: ## %bb.9: ## %cond.load13 -; SSE42-NEXT: movlpd {{.*#+}} xmm2 = mem[0],xmm2[1] +; SSE42-NEXT: movlps {{.*#+}} xmm2 = mem[0,1],xmm2[2,3] ; SSE42-NEXT: addq $8, %rsi ; SSE42-NEXT: LBB3_10: ## %else14 ; SSE42-NEXT: pextrb $4, %xmm9, %ecx ; SSE42-NEXT: testb $1, %cl ; SSE42-NEXT: je LBB3_12 ; SSE42-NEXT: ## %bb.11: ## %cond.load17 -; SSE42-NEXT: movhpd {{.*#+}} xmm2 = xmm2[0],mem[0] +; SSE42-NEXT: movhps {{.*#+}} xmm2 = xmm2[0,1],mem[0,1] ; SSE42-NEXT: addq $8, %rsi ; SSE42-NEXT: LBB3_12: ## %else18 ; SSE42-NEXT: pxor %xmm9, %xmm9 @@ -695,7 +695,7 @@ define <16 x double> @expandload_v16f64_v16i32(double* %base, <16 x double> %src ; SSE42-NEXT: testb $1, %cl ; SSE42-NEXT: je LBB3_14 ; SSE42-NEXT: ## %bb.13: ## %cond.load21 -; SSE42-NEXT: movlpd {{.*#+}} xmm3 = mem[0],xmm3[1] +; SSE42-NEXT: movlps {{.*#+}} xmm3 = mem[0,1],xmm3[2,3] ; SSE42-NEXT: addq $8, %rsi ; SSE42-NEXT: LBB3_14: ## %else22 ; SSE42-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm9 @@ -703,7 +703,7 @@ define <16 x double> @expandload_v16f64_v16i32(double* %base, <16 x double> %src ; SSE42-NEXT: testb $1, %cl ; SSE42-NEXT: je LBB3_16 ; SSE42-NEXT: ## %bb.15: ## %cond.load25 -; SSE42-NEXT: movhpd {{.*#+}} xmm3 = xmm3[0],mem[0] +; SSE42-NEXT: movhps {{.*#+}} xmm3 = xmm3[0,1],mem[0,1] ; SSE42-NEXT: addq $8, %rsi ; SSE42-NEXT: LBB3_16: ## %else26 ; SSE42-NEXT: pxor %xmm8, %xmm8 @@ -712,14 +712,14 @@ define <16 x double> @expandload_v16f64_v16i32(double* %base, <16 x double> %src ; SSE42-NEXT: testb $1, %cl ; SSE42-NEXT: je LBB3_18 ; SSE42-NEXT: ## %bb.17: ## %cond.load29 -; SSE42-NEXT: movlpd {{.*#+}} xmm4 = mem[0],xmm4[1] +; SSE42-NEXT: movlps {{.*#+}} xmm4 = mem[0,1],xmm4[2,3] ; SSE42-NEXT: addq $8, %rsi ; SSE42-NEXT: LBB3_18: ## %else30 ; SSE42-NEXT: pextrb $4, %xmm8, %ecx ; SSE42-NEXT: testb $1, %cl ; SSE42-NEXT: je LBB3_20 ; SSE42-NEXT: ## %bb.19: ## %cond.load33 -; SSE42-NEXT: movhpd {{.*#+}} xmm4 = xmm4[0],mem[0] +; SSE42-NEXT: movhps {{.*#+}} xmm4 = xmm4[0,1],mem[0,1] ; SSE42-NEXT: addq $8, %rsi ; SSE42-NEXT: LBB3_20: ## %else34 ; SSE42-NEXT: pxor %xmm8, %xmm8 @@ -728,7 +728,7 @@ define <16 x double> @expandload_v16f64_v16i32(double* %base, <16 x double> %src ; SSE42-NEXT: testb $1, %cl ; SSE42-NEXT: je LBB3_22 ; SSE42-NEXT: ## %bb.21: ## %cond.load37 -; SSE42-NEXT: movlpd {{.*#+}} xmm5 = mem[0],xmm5[1] +; SSE42-NEXT: movlps {{.*#+}} xmm5 = mem[0,1],xmm5[2,3] ; SSE42-NEXT: addq $8, %rsi ; SSE42-NEXT: LBB3_22: ## %else38 ; SSE42-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm8 @@ -736,7 +736,7 @@ define <16 x double> @expandload_v16f64_v16i32(double* %base, <16 x double> %src ; SSE42-NEXT: testb $1, %cl ; SSE42-NEXT: je LBB3_24 ; SSE42-NEXT: ## %bb.23: ## %cond.load41 -; SSE42-NEXT: movhpd {{.*#+}} xmm5 = xmm5[0],mem[0] +; SSE42-NEXT: movhps {{.*#+}} xmm5 = xmm5[0,1],mem[0,1] ; SSE42-NEXT: addq $8, %rsi ; SSE42-NEXT: LBB3_24: ## %else42 ; SSE42-NEXT: pxor %xmm9, %xmm9 @@ -745,14 +745,14 @@ define <16 x double> @expandload_v16f64_v16i32(double* %base, <16 x double> 
%src ; SSE42-NEXT: testb $1, %cl ; SSE42-NEXT: je LBB3_26 ; SSE42-NEXT: ## %bb.25: ## %cond.load45 -; SSE42-NEXT: movlpd {{.*#+}} xmm6 = mem[0],xmm6[1] +; SSE42-NEXT: movlps {{.*#+}} xmm6 = mem[0,1],xmm6[2,3] ; SSE42-NEXT: addq $8, %rsi ; SSE42-NEXT: LBB3_26: ## %else46 ; SSE42-NEXT: pextrb $4, %xmm9, %ecx ; SSE42-NEXT: testb $1, %cl ; SSE42-NEXT: je LBB3_28 ; SSE42-NEXT: ## %bb.27: ## %cond.load49 -; SSE42-NEXT: movhpd {{.*#+}} xmm6 = xmm6[0],mem[0] +; SSE42-NEXT: movhps {{.*#+}} xmm6 = xmm6[0,1],mem[0,1] ; SSE42-NEXT: addq $8, %rsi ; SSE42-NEXT: LBB3_28: ## %else50 ; SSE42-NEXT: pxor %xmm9, %xmm9 @@ -761,23 +761,23 @@ define <16 x double> @expandload_v16f64_v16i32(double* %base, <16 x double> %src ; SSE42-NEXT: testb $1, %cl ; SSE42-NEXT: je LBB3_30 ; SSE42-NEXT: ## %bb.29: ## %cond.load53 -; SSE42-NEXT: movlpd {{.*#+}} xmm7 = mem[0],xmm7[1] +; SSE42-NEXT: movlps {{.*#+}} xmm7 = mem[0,1],xmm7[2,3] ; SSE42-NEXT: addq $8, %rsi ; SSE42-NEXT: LBB3_30: ## %else54 ; SSE42-NEXT: pextrb $12, %xmm8, %ecx ; SSE42-NEXT: testb $1, %cl ; SSE42-NEXT: je LBB3_32 ; SSE42-NEXT: ## %bb.31: ## %cond.load57 -; SSE42-NEXT: movhpd {{.*#+}} xmm7 = xmm7[0],mem[0] +; SSE42-NEXT: movhps {{.*#+}} xmm7 = xmm7[0,1],mem[0,1] ; SSE42-NEXT: LBB3_32: ## %else58 -; SSE42-NEXT: movapd %xmm0, (%rax) -; SSE42-NEXT: movapd %xmm1, 16(%rax) -; SSE42-NEXT: movapd %xmm2, 32(%rax) -; SSE42-NEXT: movapd %xmm3, 48(%rax) -; SSE42-NEXT: movapd %xmm4, 64(%rax) -; SSE42-NEXT: movapd %xmm5, 80(%rax) -; SSE42-NEXT: movapd %xmm6, 96(%rax) -; SSE42-NEXT: movapd %xmm7, 112(%rax) +; SSE42-NEXT: movaps %xmm0, (%rax) +; SSE42-NEXT: movaps %xmm1, 16(%rax) +; SSE42-NEXT: movaps %xmm2, 32(%rax) +; SSE42-NEXT: movaps %xmm3, 48(%rax) +; SSE42-NEXT: movaps %xmm4, 64(%rax) +; SSE42-NEXT: movaps %xmm5, 80(%rax) +; SSE42-NEXT: movaps %xmm6, 96(%rax) +; SSE42-NEXT: movaps %xmm7, 112(%rax) ; SSE42-NEXT: retq ; ; AVX1-LABEL: expandload_v16f64_v16i32: @@ -801,11 +801,11 @@ define <16 x double> @expandload_v16f64_v16i32(double* %base, <16 x double> %src ; AVX1-NEXT: testb $1, %al ; AVX1-NEXT: je LBB3_4 ; AVX1-NEXT: ## %bb.3: ## %cond.load1 -; AVX1-NEXT: vmovhpd {{.*#+}} xmm6 = xmm0[0],mem[0] -; AVX1-NEXT: vblendpd {{.*#+}} ymm0 = ymm6[0,1],ymm0[2,3] +; AVX1-NEXT: vmovhps {{.*#+}} xmm6 = xmm0[0,1],mem[0,1] +; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm0[4,5,6,7] ; AVX1-NEXT: addq $8, %rdi ; AVX1-NEXT: LBB3_4: ## %else2 -; AVX1-NEXT: vxorpd %xmm6, %xmm6, %xmm6 +; AVX1-NEXT: vxorps %xmm6, %xmm6, %xmm6 ; AVX1-NEXT: vpcmpeqd %xmm6, %xmm4, %xmm7 ; AVX1-NEXT: vpackssdw %xmm0, %xmm7, %xmm7 ; AVX1-NEXT: vpacksswb %xmm0, %xmm7, %xmm7 @@ -814,7 +814,7 @@ define <16 x double> @expandload_v16f64_v16i32(double* %base, <16 x double> %src ; AVX1-NEXT: je LBB3_6 ; AVX1-NEXT: ## %bb.5: ## %cond.load5 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm7 -; AVX1-NEXT: vmovlpd {{.*#+}} xmm7 = mem[0],xmm7[1] +; AVX1-NEXT: vmovlps {{.*#+}} xmm7 = mem[0,1],xmm7[2,3] ; AVX1-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm0 ; AVX1-NEXT: addq $8, %rdi ; AVX1-NEXT: LBB3_6: ## %else6 @@ -826,12 +826,12 @@ define <16 x double> @expandload_v16f64_v16i32(double* %base, <16 x double> %src ; AVX1-NEXT: je LBB3_8 ; AVX1-NEXT: ## %bb.7: ## %cond.load9 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm6 -; AVX1-NEXT: vmovhpd {{.*#+}} xmm6 = xmm6[0],mem[0] +; AVX1-NEXT: vmovhps {{.*#+}} xmm6 = xmm6[0,1],mem[0,1] ; AVX1-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm0 ; AVX1-NEXT: addq $8, %rdi ; AVX1-NEXT: LBB3_8: ## %else10 ; AVX1-NEXT: vextractf128 $1, %ymm4, %xmm4 -; AVX1-NEXT: vxorpd %xmm6, %xmm6, %xmm6 +; 
AVX1-NEXT: vpxor %xmm6, %xmm6, %xmm6 ; AVX1-NEXT: vpcmpeqd %xmm6, %xmm4, %xmm6 ; AVX1-NEXT: vpackssdw %xmm6, %xmm0, %xmm7 ; AVX1-NEXT: vpacksswb %xmm0, %xmm7, %xmm7 @@ -849,11 +849,11 @@ define <16 x double> @expandload_v16f64_v16i32(double* %base, <16 x double> %src ; AVX1-NEXT: testb $1, %al ; AVX1-NEXT: je LBB3_12 ; AVX1-NEXT: ## %bb.11: ## %cond.load17 -; AVX1-NEXT: vmovhpd {{.*#+}} xmm6 = xmm1[0],mem[0] -; AVX1-NEXT: vblendpd {{.*#+}} ymm1 = ymm6[0,1],ymm1[2,3] +; AVX1-NEXT: vmovhps {{.*#+}} xmm6 = xmm1[0,1],mem[0,1] +; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm6[0,1,2,3],ymm1[4,5,6,7] ; AVX1-NEXT: addq $8, %rdi ; AVX1-NEXT: LBB3_12: ## %else18 -; AVX1-NEXT: vxorpd %xmm6, %xmm6, %xmm6 +; AVX1-NEXT: vxorps %xmm6, %xmm6, %xmm6 ; AVX1-NEXT: vpcmpeqd %xmm6, %xmm4, %xmm4 ; AVX1-NEXT: vpackssdw %xmm4, %xmm0, %xmm6 ; AVX1-NEXT: vpacksswb %xmm0, %xmm6, %xmm6 @@ -862,7 +862,7 @@ define <16 x double> @expandload_v16f64_v16i32(double* %base, <16 x double> %src ; AVX1-NEXT: je LBB3_14 ; AVX1-NEXT: ## %bb.13: ## %cond.load21 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm6 -; AVX1-NEXT: vmovlpd {{.*#+}} xmm6 = mem[0],xmm6[1] +; AVX1-NEXT: vmovlps {{.*#+}} xmm6 = mem[0,1],xmm6[2,3] ; AVX1-NEXT: vinsertf128 $1, %xmm6, %ymm1, %ymm1 ; AVX1-NEXT: addq $8, %rdi ; AVX1-NEXT: LBB3_14: ## %else22 @@ -873,11 +873,11 @@ define <16 x double> @expandload_v16f64_v16i32(double* %base, <16 x double> %src ; AVX1-NEXT: je LBB3_16 ; AVX1-NEXT: ## %bb.15: ## %cond.load25 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4 -; AVX1-NEXT: vmovhpd {{.*#+}} xmm4 = xmm4[0],mem[0] +; AVX1-NEXT: vmovhps {{.*#+}} xmm4 = xmm4[0,1],mem[0,1] ; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm1 ; AVX1-NEXT: addq $8, %rdi ; AVX1-NEXT: LBB3_16: ## %else26 -; AVX1-NEXT: vxorpd %xmm4, %xmm4, %xmm4 +; AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4 ; AVX1-NEXT: vpcmpeqd %xmm4, %xmm5, %xmm6 ; AVX1-NEXT: vpackssdw %xmm0, %xmm6, %xmm6 ; AVX1-NEXT: vpacksswb %xmm6, %xmm0, %xmm6 @@ -896,11 +896,11 @@ define <16 x double> @expandload_v16f64_v16i32(double* %base, <16 x double> %src ; AVX1-NEXT: testb $1, %al ; AVX1-NEXT: je LBB3_20 ; AVX1-NEXT: ## %bb.19: ## %cond.load33 -; AVX1-NEXT: vmovhpd {{.*#+}} xmm4 = xmm2[0],mem[0] -; AVX1-NEXT: vblendpd {{.*#+}} ymm2 = ymm4[0,1],ymm2[2,3] +; AVX1-NEXT: vmovhps {{.*#+}} xmm4 = xmm2[0,1],mem[0,1] +; AVX1-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1,2,3],ymm2[4,5,6,7] ; AVX1-NEXT: addq $8, %rdi ; AVX1-NEXT: LBB3_20: ## %else34 -; AVX1-NEXT: vxorpd %xmm4, %xmm4, %xmm4 +; AVX1-NEXT: vxorps %xmm4, %xmm4, %xmm4 ; AVX1-NEXT: vpcmpeqd %xmm4, %xmm5, %xmm6 ; AVX1-NEXT: vpackssdw %xmm0, %xmm6, %xmm6 ; AVX1-NEXT: vpacksswb %xmm6, %xmm0, %xmm6 @@ -909,7 +909,7 @@ define <16 x double> @expandload_v16f64_v16i32(double* %base, <16 x double> %src ; AVX1-NEXT: je LBB3_22 ; AVX1-NEXT: ## %bb.21: ## %cond.load37 ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm6 -; AVX1-NEXT: vmovlpd {{.*#+}} xmm6 = mem[0],xmm6[1] +; AVX1-NEXT: vmovlps {{.*#+}} xmm6 = mem[0,1],xmm6[2,3] ; AVX1-NEXT: vinsertf128 $1, %xmm6, %ymm2, %ymm2 ; AVX1-NEXT: addq $8, %rdi ; AVX1-NEXT: LBB3_22: ## %else38 @@ -921,7 +921,7 @@ define <16 x double> @expandload_v16f64_v16i32(double* %base, <16 x double> %src ; AVX1-NEXT: je LBB3_24 ; AVX1-NEXT: ## %bb.23: ## %cond.load41 ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm4 -; AVX1-NEXT: vmovhpd {{.*#+}} xmm4 = xmm4[0],mem[0] +; AVX1-NEXT: vmovhps {{.*#+}} xmm4 = xmm4[0,1],mem[0,1] ; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2 ; AVX1-NEXT: addq $8, %rdi ; AVX1-NEXT: LBB3_24: ## %else42 @@ -944,11 +944,11 @@ define <16 x double> 
@expandload_v16f64_v16i32(double* %base, <16 x double> %src ; AVX1-NEXT: testb $1, %al ; AVX1-NEXT: je LBB3_28 ; AVX1-NEXT: ## %bb.27: ## %cond.load49 -; AVX1-NEXT: vmovhpd {{.*#+}} xmm5 = xmm3[0],mem[0] -; AVX1-NEXT: vblendpd {{.*#+}} ymm3 = ymm5[0,1],ymm3[2,3] +; AVX1-NEXT: vmovhps {{.*#+}} xmm5 = xmm3[0,1],mem[0,1] +; AVX1-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0,1,2,3],ymm3[4,5,6,7] ; AVX1-NEXT: addq $8, %rdi ; AVX1-NEXT: LBB3_28: ## %else50 -; AVX1-NEXT: vxorpd %xmm5, %xmm5, %xmm5 +; AVX1-NEXT: vxorps %xmm5, %xmm5, %xmm5 ; AVX1-NEXT: vpcmpeqd %xmm5, %xmm4, %xmm4 ; AVX1-NEXT: vpackssdw %xmm4, %xmm0, %xmm5 ; AVX1-NEXT: vpacksswb %xmm5, %xmm0, %xmm5 @@ -957,7 +957,7 @@ define <16 x double> @expandload_v16f64_v16i32(double* %base, <16 x double> %src ; AVX1-NEXT: je LBB3_30 ; AVX1-NEXT: ## %bb.29: ## %cond.load53 ; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm5 -; AVX1-NEXT: vmovlpd {{.*#+}} xmm5 = mem[0],xmm5[1] +; AVX1-NEXT: vmovlps {{.*#+}} xmm5 = mem[0,1],xmm5[2,3] ; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm3, %ymm3 ; AVX1-NEXT: addq $8, %rdi ; AVX1-NEXT: LBB3_30: ## %else54 @@ -968,7 +968,7 @@ define <16 x double> @expandload_v16f64_v16i32(double* %base, <16 x double> %src ; AVX1-NEXT: je LBB3_32 ; AVX1-NEXT: ## %bb.31: ## %cond.load57 ; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4 -; AVX1-NEXT: vmovhpd {{.*#+}} xmm4 = xmm4[0],mem[0] +; AVX1-NEXT: vmovhps {{.*#+}} xmm4 = xmm4[0,1],mem[0,1] ; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3 ; AVX1-NEXT: LBB3_32: ## %else58 ; AVX1-NEXT: retq @@ -994,11 +994,11 @@ define <16 x double> @expandload_v16f64_v16i32(double* %base, <16 x double> %src ; AVX2-NEXT: testb $1, %al ; AVX2-NEXT: je LBB3_4 ; AVX2-NEXT: ## %bb.3: ## %cond.load1 -; AVX2-NEXT: vmovhpd {{.*#+}} xmm6 = xmm0[0],mem[0] -; AVX2-NEXT: vblendpd {{.*#+}} ymm0 = ymm6[0,1],ymm0[2,3] +; AVX2-NEXT: vmovhps {{.*#+}} xmm6 = xmm0[0,1],mem[0,1] +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm0[4,5,6,7] ; AVX2-NEXT: addq $8, %rdi ; AVX2-NEXT: LBB3_4: ## %else2 -; AVX2-NEXT: vxorpd %xmm6, %xmm6, %xmm6 +; AVX2-NEXT: vpxor %xmm6, %xmm6, %xmm6 ; AVX2-NEXT: vpcmpeqd %xmm6, %xmm4, %xmm7 ; AVX2-NEXT: vpackssdw %xmm0, %xmm7, %xmm7 ; AVX2-NEXT: vpacksswb %xmm0, %xmm7, %xmm7 @@ -1006,9 +1006,9 @@ define <16 x double> @expandload_v16f64_v16i32(double* %base, <16 x double> %src ; AVX2-NEXT: testb $1, %al ; AVX2-NEXT: je LBB3_6 ; AVX2-NEXT: ## %bb.5: ## %cond.load5 -; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm7 -; AVX2-NEXT: vmovlpd {{.*#+}} xmm7 = mem[0],xmm7[1] -; AVX2-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm0 +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm7 +; AVX2-NEXT: vmovlps {{.*#+}} xmm7 = mem[0,1],xmm7[2,3] +; AVX2-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm0 ; AVX2-NEXT: addq $8, %rdi ; AVX2-NEXT: LBB3_6: ## %else6 ; AVX2-NEXT: vpcmpeqd %xmm6, %xmm4, %xmm6 @@ -1018,13 +1018,13 @@ define <16 x double> @expandload_v16f64_v16i32(double* %base, <16 x double> %src ; AVX2-NEXT: testb $1, %al ; AVX2-NEXT: je LBB3_8 ; AVX2-NEXT: ## %bb.7: ## %cond.load9 -; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm6 -; AVX2-NEXT: vmovhpd {{.*#+}} xmm6 = xmm6[0],mem[0] -; AVX2-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm0 +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm6 +; AVX2-NEXT: vmovhps {{.*#+}} xmm6 = xmm6[0,1],mem[0,1] +; AVX2-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm0 ; AVX2-NEXT: addq $8, %rdi ; AVX2-NEXT: LBB3_8: ## %else10 ; AVX2-NEXT: vextracti128 $1, %ymm4, %xmm4 -; AVX2-NEXT: vxorpd %xmm6, %xmm6, %xmm6 +; AVX2-NEXT: vpxor %xmm6, %xmm6, %xmm6 ; AVX2-NEXT: vpcmpeqd %xmm6, %xmm4, %xmm6 ; AVX2-NEXT: vpackssdw %xmm6, %xmm0, %xmm7 ; 
AVX2-NEXT: vpacksswb %xmm0, %xmm7, %xmm7 @@ -1042,11 +1042,11 @@ define <16 x double> @expandload_v16f64_v16i32(double* %base, <16 x double> %src ; AVX2-NEXT: testb $1, %al ; AVX2-NEXT: je LBB3_12 ; AVX2-NEXT: ## %bb.11: ## %cond.load17 -; AVX2-NEXT: vmovhpd {{.*#+}} xmm6 = xmm1[0],mem[0] -; AVX2-NEXT: vblendpd {{.*#+}} ymm1 = ymm6[0,1],ymm1[2,3] +; AVX2-NEXT: vmovhps {{.*#+}} xmm6 = xmm1[0,1],mem[0,1] +; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1,2,3],ymm1[4,5,6,7] ; AVX2-NEXT: addq $8, %rdi ; AVX2-NEXT: LBB3_12: ## %else18 -; AVX2-NEXT: vxorpd %xmm6, %xmm6, %xmm6 +; AVX2-NEXT: vpxor %xmm6, %xmm6, %xmm6 ; AVX2-NEXT: vpcmpeqd %xmm6, %xmm4, %xmm4 ; AVX2-NEXT: vpackssdw %xmm4, %xmm0, %xmm6 ; AVX2-NEXT: vpacksswb %xmm0, %xmm6, %xmm6 @@ -1054,9 +1054,9 @@ define <16 x double> @expandload_v16f64_v16i32(double* %base, <16 x double> %src ; AVX2-NEXT: testb $1, %al ; AVX2-NEXT: je LBB3_14 ; AVX2-NEXT: ## %bb.13: ## %cond.load21 -; AVX2-NEXT: vextractf128 $1, %ymm1, %xmm6 -; AVX2-NEXT: vmovlpd {{.*#+}} xmm6 = mem[0],xmm6[1] -; AVX2-NEXT: vinsertf128 $1, %xmm6, %ymm1, %ymm1 +; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm6 +; AVX2-NEXT: vmovlps {{.*#+}} xmm6 = mem[0,1],xmm6[2,3] +; AVX2-NEXT: vinserti128 $1, %xmm6, %ymm1, %ymm1 ; AVX2-NEXT: addq $8, %rdi ; AVX2-NEXT: LBB3_14: ## %else22 ; AVX2-NEXT: vpackssdw %xmm4, %xmm0, %xmm4 @@ -1065,12 +1065,12 @@ define <16 x double> @expandload_v16f64_v16i32(double* %base, <16 x double> %src ; AVX2-NEXT: testb $1, %al ; AVX2-NEXT: je LBB3_16 ; AVX2-NEXT: ## %bb.15: ## %cond.load25 -; AVX2-NEXT: vextractf128 $1, %ymm1, %xmm4 -; AVX2-NEXT: vmovhpd {{.*#+}} xmm4 = xmm4[0],mem[0] -; AVX2-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm1 +; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm4 +; AVX2-NEXT: vmovhps {{.*#+}} xmm4 = xmm4[0,1],mem[0,1] +; AVX2-NEXT: vinserti128 $1, %xmm4, %ymm1, %ymm1 ; AVX2-NEXT: addq $8, %rdi ; AVX2-NEXT: LBB3_16: ## %else26 -; AVX2-NEXT: vxorpd %xmm4, %xmm4, %xmm4 +; AVX2-NEXT: vpxor %xmm4, %xmm4, %xmm4 ; AVX2-NEXT: vpcmpeqd %xmm4, %xmm5, %xmm6 ; AVX2-NEXT: vpackssdw %xmm0, %xmm6, %xmm6 ; AVX2-NEXT: vpacksswb %xmm6, %xmm0, %xmm6 @@ -1089,11 +1089,11 @@ define <16 x double> @expandload_v16f64_v16i32(double* %base, <16 x double> %src ; AVX2-NEXT: testb $1, %al ; AVX2-NEXT: je LBB3_20 ; AVX2-NEXT: ## %bb.19: ## %cond.load33 -; AVX2-NEXT: vmovhpd {{.*#+}} xmm4 = xmm2[0],mem[0] -; AVX2-NEXT: vblendpd {{.*#+}} ymm2 = ymm4[0,1],ymm2[2,3] +; AVX2-NEXT: vmovhps {{.*#+}} xmm4 = xmm2[0,1],mem[0,1] +; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3],ymm2[4,5,6,7] ; AVX2-NEXT: addq $8, %rdi ; AVX2-NEXT: LBB3_20: ## %else34 -; AVX2-NEXT: vxorpd %xmm4, %xmm4, %xmm4 +; AVX2-NEXT: vpxor %xmm4, %xmm4, %xmm4 ; AVX2-NEXT: vpcmpeqd %xmm4, %xmm5, %xmm6 ; AVX2-NEXT: vpackssdw %xmm0, %xmm6, %xmm6 ; AVX2-NEXT: vpacksswb %xmm6, %xmm0, %xmm6 @@ -1101,9 +1101,9 @@ define <16 x double> @expandload_v16f64_v16i32(double* %base, <16 x double> %src ; AVX2-NEXT: testb $1, %al ; AVX2-NEXT: je LBB3_22 ; AVX2-NEXT: ## %bb.21: ## %cond.load37 -; AVX2-NEXT: vextractf128 $1, %ymm2, %xmm6 -; AVX2-NEXT: vmovlpd {{.*#+}} xmm6 = mem[0],xmm6[1] -; AVX2-NEXT: vinsertf128 $1, %xmm6, %ymm2, %ymm2 +; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm6 +; AVX2-NEXT: vmovlps {{.*#+}} xmm6 = mem[0,1],xmm6[2,3] +; AVX2-NEXT: vinserti128 $1, %xmm6, %ymm2, %ymm2 ; AVX2-NEXT: addq $8, %rdi ; AVX2-NEXT: LBB3_22: ## %else38 ; AVX2-NEXT: vpcmpeqd %xmm4, %xmm5, %xmm4 @@ -1113,9 +1113,9 @@ define <16 x double> @expandload_v16f64_v16i32(double* %base, <16 x double> %src ; AVX2-NEXT: testb $1, %al ; AVX2-NEXT: je LBB3_24 ; 
AVX2-NEXT: ## %bb.23: ## %cond.load41 -; AVX2-NEXT: vextractf128 $1, %ymm2, %xmm4 -; AVX2-NEXT: vmovhpd {{.*#+}} xmm4 = xmm4[0],mem[0] -; AVX2-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2 +; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm4 +; AVX2-NEXT: vmovhps {{.*#+}} xmm4 = xmm4[0,1],mem[0,1] +; AVX2-NEXT: vinserti128 $1, %xmm4, %ymm2, %ymm2 ; AVX2-NEXT: addq $8, %rdi ; AVX2-NEXT: LBB3_24: ## %else42 ; AVX2-NEXT: vextracti128 $1, %ymm5, %xmm4 @@ -1137,11 +1137,11 @@ define <16 x double> @expandload_v16f64_v16i32(double* %base, <16 x double> %src ; AVX2-NEXT: testb $1, %al ; AVX2-NEXT: je LBB3_28 ; AVX2-NEXT: ## %bb.27: ## %cond.load49 -; AVX2-NEXT: vmovhpd {{.*#+}} xmm5 = xmm3[0],mem[0] -; AVX2-NEXT: vblendpd {{.*#+}} ymm3 = ymm5[0,1],ymm3[2,3] +; AVX2-NEXT: vmovhps {{.*#+}} xmm5 = xmm3[0,1],mem[0,1] +; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0,1,2,3],ymm3[4,5,6,7] ; AVX2-NEXT: addq $8, %rdi ; AVX2-NEXT: LBB3_28: ## %else50 -; AVX2-NEXT: vxorpd %xmm5, %xmm5, %xmm5 +; AVX2-NEXT: vpxor %xmm5, %xmm5, %xmm5 ; AVX2-NEXT: vpcmpeqd %xmm5, %xmm4, %xmm4 ; AVX2-NEXT: vpackssdw %xmm4, %xmm0, %xmm5 ; AVX2-NEXT: vpacksswb %xmm5, %xmm0, %xmm5 @@ -1149,9 +1149,9 @@ define <16 x double> @expandload_v16f64_v16i32(double* %base, <16 x double> %src ; AVX2-NEXT: testb $1, %al ; AVX2-NEXT: je LBB3_30 ; AVX2-NEXT: ## %bb.29: ## %cond.load53 -; AVX2-NEXT: vextractf128 $1, %ymm3, %xmm5 -; AVX2-NEXT: vmovlpd {{.*#+}} xmm5 = mem[0],xmm5[1] -; AVX2-NEXT: vinsertf128 $1, %xmm5, %ymm3, %ymm3 +; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm5 +; AVX2-NEXT: vmovlps {{.*#+}} xmm5 = mem[0,1],xmm5[2,3] +; AVX2-NEXT: vinserti128 $1, %xmm5, %ymm3, %ymm3 ; AVX2-NEXT: addq $8, %rdi ; AVX2-NEXT: LBB3_30: ## %else54 ; AVX2-NEXT: vpackssdw %xmm4, %xmm0, %xmm4 @@ -1160,9 +1160,9 @@ define <16 x double> @expandload_v16f64_v16i32(double* %base, <16 x double> %src ; AVX2-NEXT: testb $1, %al ; AVX2-NEXT: je LBB3_32 ; AVX2-NEXT: ## %bb.31: ## %cond.load57 -; AVX2-NEXT: vextractf128 $1, %ymm3, %xmm4 -; AVX2-NEXT: vmovhpd {{.*#+}} xmm4 = xmm4[0],mem[0] -; AVX2-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3 +; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm4 +; AVX2-NEXT: vmovhps {{.*#+}} xmm4 = xmm4[0,1],mem[0,1] +; AVX2-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm3 ; AVX2-NEXT: LBB3_32: ## %else58 ; AVX2-NEXT: retq ; diff --git a/test/CodeGen/X86/masked_gather_scatter_widen.ll b/test/CodeGen/X86/masked_gather_scatter_widen.ll index f018615c24a..03491a8ada5 100644 --- a/test/CodeGen/X86/masked_gather_scatter_widen.ll +++ b/test/CodeGen/X86/masked_gather_scatter_widen.ll @@ -127,14 +127,14 @@ define void @test_scatter_v2i32_index(<2 x double> %a1, double* %base, <2 x i32> ; WIDEN_AVX2-NEXT: je .LBB1_2 ; WIDEN_AVX2-NEXT: # %bb.1: # %cond.store ; WIDEN_AVX2-NEXT: vmovq %xmm1, %rax -; WIDEN_AVX2-NEXT: vmovlpd %xmm0, (%rax) +; WIDEN_AVX2-NEXT: vmovlps %xmm0, (%rax) ; WIDEN_AVX2-NEXT: .LBB1_2: # %else ; WIDEN_AVX2-NEXT: vpextrb $8, %xmm2, %eax ; WIDEN_AVX2-NEXT: testb $1, %al ; WIDEN_AVX2-NEXT: je .LBB1_4 ; WIDEN_AVX2-NEXT: # %bb.3: # %cond.store1 ; WIDEN_AVX2-NEXT: vpextrq $1, %xmm1, %rax -; WIDEN_AVX2-NEXT: vmovhpd %xmm0, (%rax) +; WIDEN_AVX2-NEXT: vmovhps %xmm0, (%rax) ; WIDEN_AVX2-NEXT: .LBB1_4: # %else2 ; WIDEN_AVX2-NEXT: retq ; @@ -152,14 +152,14 @@ define void @test_scatter_v2i32_index(<2 x double> %a1, double* %base, <2 x i32> ; PROMOTE_AVX2-NEXT: je .LBB1_2 ; PROMOTE_AVX2-NEXT: # %bb.1: # %cond.store ; PROMOTE_AVX2-NEXT: vmovq %xmm1, %rax -; PROMOTE_AVX2-NEXT: vmovlpd %xmm0, (%rax) +; PROMOTE_AVX2-NEXT: vmovlps %xmm0, (%rax) ; PROMOTE_AVX2-NEXT: .LBB1_2: # %else ; 
PROMOTE_AVX2-NEXT: vpextrb $8, %xmm2, %eax ; PROMOTE_AVX2-NEXT: testb $1, %al ; PROMOTE_AVX2-NEXT: je .LBB1_4 ; PROMOTE_AVX2-NEXT: # %bb.3: # %cond.store1 ; PROMOTE_AVX2-NEXT: vpextrq $1, %xmm1, %rax -; PROMOTE_AVX2-NEXT: vmovhpd %xmm0, (%rax) +; PROMOTE_AVX2-NEXT: vmovhps %xmm0, (%rax) ; PROMOTE_AVX2-NEXT: .LBB1_4: # %else2 ; PROMOTE_AVX2-NEXT: retq %gep = getelementptr double, double *%base, <2 x i32> %ind diff --git a/test/CodeGen/X86/masked_load.ll b/test/CodeGen/X86/masked_load.ll index 4fa837ce372..17f8f0a1bb0 100644 --- a/test/CodeGen/X86/masked_load.ll +++ b/test/CodeGen/X86/masked_load.ll @@ -45,15 +45,15 @@ define <2 x double> @load_v2f64_v2i64(<2 x i64> %trigger, <2 x double>* %addr, < ; SSE2-NEXT: testb $1, %al ; SSE2-NEXT: je LBB1_2 ; SSE2-NEXT: ## %bb.1: ## %cond.load -; SSE2-NEXT: movlpd {{.*#+}} xmm1 = mem[0],xmm1[1] +; SSE2-NEXT: movlps {{.*#+}} xmm1 = mem[0,1],xmm1[2,3] ; SSE2-NEXT: LBB1_2: ## %else ; SSE2-NEXT: pextrw $4, %xmm0, %eax ; SSE2-NEXT: testb $1, %al ; SSE2-NEXT: je LBB1_4 ; SSE2-NEXT: ## %bb.3: ## %cond.load1 -; SSE2-NEXT: movhpd {{.*#+}} xmm1 = xmm1[0],mem[0] +; SSE2-NEXT: movhps {{.*#+}} xmm1 = xmm1[0,1],mem[0,1] ; SSE2-NEXT: LBB1_4: ## %else2 -; SSE2-NEXT: movapd %xmm1, %xmm0 +; SSE2-NEXT: movaps %xmm1, %xmm0 ; SSE2-NEXT: retq ; ; SSE42-LABEL: load_v2f64_v2i64: @@ -64,15 +64,15 @@ define <2 x double> @load_v2f64_v2i64(<2 x i64> %trigger, <2 x double>* %addr, < ; SSE42-NEXT: testb $1, %al ; SSE42-NEXT: je LBB1_2 ; SSE42-NEXT: ## %bb.1: ## %cond.load -; SSE42-NEXT: movlpd {{.*#+}} xmm1 = mem[0],xmm1[1] +; SSE42-NEXT: movlps {{.*#+}} xmm1 = mem[0,1],xmm1[2,3] ; SSE42-NEXT: LBB1_2: ## %else ; SSE42-NEXT: pextrb $8, %xmm2, %eax ; SSE42-NEXT: testb $1, %al ; SSE42-NEXT: je LBB1_4 ; SSE42-NEXT: ## %bb.3: ## %cond.load1 -; SSE42-NEXT: movhpd {{.*#+}} xmm1 = xmm1[0],mem[0] +; SSE42-NEXT: movhps {{.*#+}} xmm1 = xmm1[0,1],mem[0,1] ; SSE42-NEXT: LBB1_4: ## %else2 -; SSE42-NEXT: movapd %xmm1, %xmm0 +; SSE42-NEXT: movaps %xmm1, %xmm0 ; SSE42-NEXT: retq ; ; AVX1OR2-LABEL: load_v2f64_v2i64: @@ -114,13 +114,13 @@ define <4 x double> @load_v4f64_v4i32(<4 x i32> %trigger, <4 x double>* %addr, < ; SSE2-NEXT: testb $1, %al ; SSE2-NEXT: je LBB2_2 ; SSE2-NEXT: ## %bb.1: ## %cond.load -; SSE2-NEXT: movlpd {{.*#+}} xmm1 = mem[0],xmm1[1] +; SSE2-NEXT: movlps {{.*#+}} xmm1 = mem[0,1],xmm1[2,3] ; SSE2-NEXT: LBB2_2: ## %else ; SSE2-NEXT: pextrw $2, %xmm3, %eax ; SSE2-NEXT: testb $1, %al ; SSE2-NEXT: je LBB2_4 ; SSE2-NEXT: ## %bb.3: ## %cond.load1 -; SSE2-NEXT: movhpd {{.*#+}} xmm1 = xmm1[0],mem[0] +; SSE2-NEXT: movhps {{.*#+}} xmm1 = xmm1[0,1],mem[0,1] ; SSE2-NEXT: LBB2_4: ## %else2 ; SSE2-NEXT: pxor %xmm3, %xmm3 ; SSE2-NEXT: pcmpeqd %xmm3, %xmm0 @@ -128,16 +128,16 @@ define <4 x double> @load_v4f64_v4i32(<4 x i32> %trigger, <4 x double>* %addr, < ; SSE2-NEXT: testb $1, %al ; SSE2-NEXT: je LBB2_6 ; SSE2-NEXT: ## %bb.5: ## %cond.load4 -; SSE2-NEXT: movlpd {{.*#+}} xmm2 = mem[0],xmm2[1] +; SSE2-NEXT: movlps {{.*#+}} xmm2 = mem[0,1],xmm2[2,3] ; SSE2-NEXT: LBB2_6: ## %else5 ; SSE2-NEXT: pextrw $6, %xmm0, %eax ; SSE2-NEXT: testb $1, %al ; SSE2-NEXT: je LBB2_8 ; SSE2-NEXT: ## %bb.7: ## %cond.load7 -; SSE2-NEXT: movhpd {{.*#+}} xmm2 = xmm2[0],mem[0] +; SSE2-NEXT: movhps {{.*#+}} xmm2 = xmm2[0,1],mem[0,1] ; SSE2-NEXT: LBB2_8: ## %else8 -; SSE2-NEXT: movapd %xmm1, %xmm0 -; SSE2-NEXT: movapd %xmm2, %xmm1 +; SSE2-NEXT: movaps %xmm1, %xmm0 +; SSE2-NEXT: movaps %xmm2, %xmm1 ; SSE2-NEXT: retq ; ; SSE42-LABEL: load_v4f64_v4i32: @@ -148,13 +148,13 @@ define <4 x double> @load_v4f64_v4i32(<4 x i32> 
%trigger, <4 x double>* %addr, < ; SSE42-NEXT: testb $1, %al ; SSE42-NEXT: je LBB2_2 ; SSE42-NEXT: ## %bb.1: ## %cond.load -; SSE42-NEXT: movlpd {{.*#+}} xmm1 = mem[0],xmm1[1] +; SSE42-NEXT: movlps {{.*#+}} xmm1 = mem[0,1],xmm1[2,3] ; SSE42-NEXT: LBB2_2: ## %else ; SSE42-NEXT: pextrb $4, %xmm3, %eax ; SSE42-NEXT: testb $1, %al ; SSE42-NEXT: je LBB2_4 ; SSE42-NEXT: ## %bb.3: ## %cond.load1 -; SSE42-NEXT: movhpd {{.*#+}} xmm1 = xmm1[0],mem[0] +; SSE42-NEXT: movhps {{.*#+}} xmm1 = xmm1[0,1],mem[0,1] ; SSE42-NEXT: LBB2_4: ## %else2 ; SSE42-NEXT: pxor %xmm3, %xmm3 ; SSE42-NEXT: pcmpeqd %xmm3, %xmm0 @@ -162,16 +162,16 @@ define <4 x double> @load_v4f64_v4i32(<4 x i32> %trigger, <4 x double>* %addr, < ; SSE42-NEXT: testb $1, %al ; SSE42-NEXT: je LBB2_6 ; SSE42-NEXT: ## %bb.5: ## %cond.load4 -; SSE42-NEXT: movlpd {{.*#+}} xmm2 = mem[0],xmm2[1] +; SSE42-NEXT: movlps {{.*#+}} xmm2 = mem[0,1],xmm2[2,3] ; SSE42-NEXT: LBB2_6: ## %else5 ; SSE42-NEXT: pextrb $12, %xmm0, %eax ; SSE42-NEXT: testb $1, %al ; SSE42-NEXT: je LBB2_8 ; SSE42-NEXT: ## %bb.7: ## %cond.load7 -; SSE42-NEXT: movhpd {{.*#+}} xmm2 = xmm2[0],mem[0] +; SSE42-NEXT: movhps {{.*#+}} xmm2 = xmm2[0,1],mem[0,1] ; SSE42-NEXT: LBB2_8: ## %else8 -; SSE42-NEXT: movapd %xmm1, %xmm0 -; SSE42-NEXT: movapd %xmm2, %xmm1 +; SSE42-NEXT: movaps %xmm1, %xmm0 +; SSE42-NEXT: movaps %xmm2, %xmm1 ; SSE42-NEXT: retq ; ; AVX1-LABEL: load_v4f64_v4i32: @@ -225,17 +225,17 @@ define <4 x double> @load_v4f64_v4i32_zero(<4 x i32> %trigger, <4 x double>* %ad ; SSE2-NEXT: pcmpeqd %xmm0, %xmm3 ; SSE2-NEXT: movd %xmm3, %eax ; SSE2-NEXT: testb $1, %al -; SSE2-NEXT: xorpd %xmm1, %xmm1 +; SSE2-NEXT: xorps %xmm1, %xmm1 ; SSE2-NEXT: je LBB3_2 ; SSE2-NEXT: ## %bb.1: ## %cond.load ; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero -; SSE2-NEXT: xorpd %xmm1, %xmm1 +; SSE2-NEXT: xorps %xmm1, %xmm1 ; SSE2-NEXT: LBB3_2: ## %else ; SSE2-NEXT: pextrw $2, %xmm3, %eax ; SSE2-NEXT: testb $1, %al ; SSE2-NEXT: je LBB3_4 ; SSE2-NEXT: ## %bb.3: ## %cond.load1 -; SSE2-NEXT: movhpd {{.*#+}} xmm0 = xmm0[0],mem[0] +; SSE2-NEXT: movhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1] ; SSE2-NEXT: LBB3_4: ## %else2 ; SSE2-NEXT: pxor %xmm3, %xmm3 ; SSE2-NEXT: pcmpeqd %xmm3, %xmm2 @@ -243,13 +243,13 @@ define <4 x double> @load_v4f64_v4i32_zero(<4 x i32> %trigger, <4 x double>* %ad ; SSE2-NEXT: testb $1, %al ; SSE2-NEXT: je LBB3_6 ; SSE2-NEXT: ## %bb.5: ## %cond.load4 -; SSE2-NEXT: movlpd {{.*#+}} xmm1 = mem[0],xmm1[1] +; SSE2-NEXT: movlps {{.*#+}} xmm1 = mem[0,1],xmm1[2,3] ; SSE2-NEXT: LBB3_6: ## %else5 ; SSE2-NEXT: pextrw $6, %xmm2, %eax ; SSE2-NEXT: testb $1, %al ; SSE2-NEXT: je LBB3_8 ; SSE2-NEXT: ## %bb.7: ## %cond.load7 -; SSE2-NEXT: movhpd {{.*#+}} xmm1 = xmm1[0],mem[0] +; SSE2-NEXT: movhps {{.*#+}} xmm1 = xmm1[0,1],mem[0,1] ; SSE2-NEXT: LBB3_8: ## %else8 ; SSE2-NEXT: retq ; @@ -261,17 +261,17 @@ define <4 x double> @load_v4f64_v4i32_zero(<4 x i32> %trigger, <4 x double>* %ad ; SSE42-NEXT: pcmpeqd %xmm0, %xmm3 ; SSE42-NEXT: pextrb $0, %xmm3, %eax ; SSE42-NEXT: testb $1, %al -; SSE42-NEXT: xorpd %xmm1, %xmm1 +; SSE42-NEXT: xorps %xmm1, %xmm1 ; SSE42-NEXT: je LBB3_2 ; SSE42-NEXT: ## %bb.1: ## %cond.load ; SSE42-NEXT: movq {{.*#+}} xmm0 = mem[0],zero -; SSE42-NEXT: xorpd %xmm1, %xmm1 +; SSE42-NEXT: xorps %xmm1, %xmm1 ; SSE42-NEXT: LBB3_2: ## %else ; SSE42-NEXT: pextrb $4, %xmm3, %eax ; SSE42-NEXT: testb $1, %al ; SSE42-NEXT: je LBB3_4 ; SSE42-NEXT: ## %bb.3: ## %cond.load1 -; SSE42-NEXT: movhpd {{.*#+}} xmm0 = xmm0[0],mem[0] +; SSE42-NEXT: movhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1] ; SSE42-NEXT: LBB3_4: ## %else2 ; 
SSE42-NEXT: pxor %xmm3, %xmm3 ; SSE42-NEXT: pcmpeqd %xmm3, %xmm2 @@ -279,13 +279,13 @@ define <4 x double> @load_v4f64_v4i32_zero(<4 x i32> %trigger, <4 x double>* %ad ; SSE42-NEXT: testb $1, %al ; SSE42-NEXT: je LBB3_6 ; SSE42-NEXT: ## %bb.5: ## %cond.load4 -; SSE42-NEXT: movlpd {{.*#+}} xmm1 = mem[0],xmm1[1] +; SSE42-NEXT: movlps {{.*#+}} xmm1 = mem[0,1],xmm1[2,3] ; SSE42-NEXT: LBB3_6: ## %else5 ; SSE42-NEXT: pextrb $12, %xmm2, %eax ; SSE42-NEXT: testb $1, %al ; SSE42-NEXT: je LBB3_8 ; SSE42-NEXT: ## %bb.7: ## %cond.load7 -; SSE42-NEXT: movhpd {{.*#+}} xmm1 = xmm1[0],mem[0] +; SSE42-NEXT: movhps {{.*#+}} xmm1 = xmm1[0,1],mem[0,1] ; SSE42-NEXT: LBB3_8: ## %else8 ; SSE42-NEXT: retq ; @@ -339,13 +339,13 @@ define <4 x double> @load_v4f64_v4i64(<4 x i64> %trigger, <4 x double>* %addr, < ; SSE2-NEXT: testb $1, %al ; SSE2-NEXT: je LBB4_2 ; SSE2-NEXT: ## %bb.1: ## %cond.load -; SSE2-NEXT: movlpd {{.*#+}} xmm2 = mem[0],xmm2[1] +; SSE2-NEXT: movlps {{.*#+}} xmm2 = mem[0,1],xmm2[2,3] ; SSE2-NEXT: LBB4_2: ## %else ; SSE2-NEXT: pextrw $4, %xmm0, %eax ; SSE2-NEXT: testb $1, %al ; SSE2-NEXT: je LBB4_4 ; SSE2-NEXT: ## %bb.3: ## %cond.load1 -; SSE2-NEXT: movhpd {{.*#+}} xmm2 = xmm2[0],mem[0] +; SSE2-NEXT: movhps {{.*#+}} xmm2 = xmm2[0,1],mem[0,1] ; SSE2-NEXT: LBB4_4: ## %else2 ; SSE2-NEXT: pxor %xmm0, %xmm0 ; SSE2-NEXT: pcmpeqd %xmm0, %xmm1 @@ -355,16 +355,16 @@ define <4 x double> @load_v4f64_v4i64(<4 x i64> %trigger, <4 x double>* %addr, < ; SSE2-NEXT: testb $1, %al ; SSE2-NEXT: je LBB4_6 ; SSE2-NEXT: ## %bb.5: ## %cond.load4 -; SSE2-NEXT: movlpd {{.*#+}} xmm3 = mem[0],xmm3[1] +; SSE2-NEXT: movlps {{.*#+}} xmm3 = mem[0,1],xmm3[2,3] ; SSE2-NEXT: LBB4_6: ## %else5 ; SSE2-NEXT: pextrw $4, %xmm0, %eax ; SSE2-NEXT: testb $1, %al ; SSE2-NEXT: je LBB4_8 ; SSE2-NEXT: ## %bb.7: ## %cond.load7 -; SSE2-NEXT: movhpd {{.*#+}} xmm3 = xmm3[0],mem[0] +; SSE2-NEXT: movhps {{.*#+}} xmm3 = xmm3[0,1],mem[0,1] ; SSE2-NEXT: LBB4_8: ## %else8 -; SSE2-NEXT: movapd %xmm2, %xmm0 -; SSE2-NEXT: movapd %xmm3, %xmm1 +; SSE2-NEXT: movaps %xmm2, %xmm0 +; SSE2-NEXT: movaps %xmm3, %xmm1 ; SSE2-NEXT: retq ; ; SSE42-LABEL: load_v4f64_v4i64: @@ -375,13 +375,13 @@ define <4 x double> @load_v4f64_v4i64(<4 x i64> %trigger, <4 x double>* %addr, < ; SSE42-NEXT: testb $1, %al ; SSE42-NEXT: je LBB4_2 ; SSE42-NEXT: ## %bb.1: ## %cond.load -; SSE42-NEXT: movlpd {{.*#+}} xmm2 = mem[0],xmm2[1] +; SSE42-NEXT: movlps {{.*#+}} xmm2 = mem[0,1],xmm2[2,3] ; SSE42-NEXT: LBB4_2: ## %else ; SSE42-NEXT: pextrb $8, %xmm4, %eax ; SSE42-NEXT: testb $1, %al ; SSE42-NEXT: je LBB4_4 ; SSE42-NEXT: ## %bb.3: ## %cond.load1 -; SSE42-NEXT: movhpd {{.*#+}} xmm2 = xmm2[0],mem[0] +; SSE42-NEXT: movhps {{.*#+}} xmm2 = xmm2[0,1],mem[0,1] ; SSE42-NEXT: LBB4_4: ## %else2 ; SSE42-NEXT: pxor %xmm0, %xmm0 ; SSE42-NEXT: pcmpeqq %xmm0, %xmm1 @@ -389,16 +389,16 @@ define <4 x double> @load_v4f64_v4i64(<4 x i64> %trigger, <4 x double>* %addr, < ; SSE42-NEXT: testb $1, %al ; SSE42-NEXT: je LBB4_6 ; SSE42-NEXT: ## %bb.5: ## %cond.load4 -; SSE42-NEXT: movlpd {{.*#+}} xmm3 = mem[0],xmm3[1] +; SSE42-NEXT: movlps {{.*#+}} xmm3 = mem[0,1],xmm3[2,3] ; SSE42-NEXT: LBB4_6: ## %else5 ; SSE42-NEXT: pextrb $8, %xmm1, %eax ; SSE42-NEXT: testb $1, %al ; SSE42-NEXT: je LBB4_8 ; SSE42-NEXT: ## %bb.7: ## %cond.load7 -; SSE42-NEXT: movhpd {{.*#+}} xmm3 = xmm3[0],mem[0] +; SSE42-NEXT: movhps {{.*#+}} xmm3 = xmm3[0,1],mem[0,1] ; SSE42-NEXT: LBB4_8: ## %else8 -; SSE42-NEXT: movapd %xmm2, %xmm0 -; SSE42-NEXT: movapd %xmm3, %xmm1 +; SSE42-NEXT: movaps %xmm2, %xmm0 +; SSE42-NEXT: movaps %xmm3, %xmm1 ; 
SSE42-NEXT: retq ; ; AVX1-LABEL: load_v4f64_v4i64: @@ -450,13 +450,13 @@ define <8 x double> @load_v8f64_v8i16(<8 x i16> %trigger, <8 x double>* %addr, < ; SSE2-NEXT: testb $1, %al ; SSE2-NEXT: je LBB5_2 ; SSE2-NEXT: ## %bb.1: ## %cond.load -; SSE2-NEXT: movlpd {{.*#+}} xmm1 = mem[0],xmm1[1] +; SSE2-NEXT: movlps {{.*#+}} xmm1 = mem[0,1],xmm1[2,3] ; SSE2-NEXT: LBB5_2: ## %else ; SSE2-NEXT: shrl $16, %eax ; SSE2-NEXT: testb $1, %al ; SSE2-NEXT: je LBB5_4 ; SSE2-NEXT: ## %bb.3: ## %cond.load1 -; SSE2-NEXT: movhpd {{.*#+}} xmm1 = xmm1[0],mem[0] +; SSE2-NEXT: movhps {{.*#+}} xmm1 = xmm1[0,1],mem[0,1] ; SSE2-NEXT: LBB5_4: ## %else2 ; SSE2-NEXT: pxor %xmm5, %xmm5 ; SSE2-NEXT: pcmpeqw %xmm0, %xmm5 @@ -464,13 +464,13 @@ define <8 x double> @load_v8f64_v8i16(<8 x i16> %trigger, <8 x double>* %addr, < ; SSE2-NEXT: testb $1, %al ; SSE2-NEXT: je LBB5_6 ; SSE2-NEXT: ## %bb.5: ## %cond.load4 -; SSE2-NEXT: movlpd {{.*#+}} xmm2 = mem[0],xmm2[1] +; SSE2-NEXT: movlps {{.*#+}} xmm2 = mem[0,1],xmm2[2,3] ; SSE2-NEXT: LBB5_6: ## %else5 ; SSE2-NEXT: pextrw $3, %xmm5, %eax ; SSE2-NEXT: testb $1, %al ; SSE2-NEXT: je LBB5_8 ; SSE2-NEXT: ## %bb.7: ## %cond.load7 -; SSE2-NEXT: movhpd {{.*#+}} xmm2 = xmm2[0],mem[0] +; SSE2-NEXT: movhps {{.*#+}} xmm2 = xmm2[0,1],mem[0,1] ; SSE2-NEXT: LBB5_8: ## %else8 ; SSE2-NEXT: pxor %xmm5, %xmm5 ; SSE2-NEXT: pcmpeqw %xmm0, %xmm5 @@ -478,13 +478,13 @@ define <8 x double> @load_v8f64_v8i16(<8 x i16> %trigger, <8 x double>* %addr, < ; SSE2-NEXT: testb $1, %al ; SSE2-NEXT: je LBB5_10 ; SSE2-NEXT: ## %bb.9: ## %cond.load10 -; SSE2-NEXT: movlpd {{.*#+}} xmm3 = mem[0],xmm3[1] +; SSE2-NEXT: movlps {{.*#+}} xmm3 = mem[0,1],xmm3[2,3] ; SSE2-NEXT: LBB5_10: ## %else11 ; SSE2-NEXT: pextrw $5, %xmm5, %eax ; SSE2-NEXT: testb $1, %al ; SSE2-NEXT: je LBB5_12 ; SSE2-NEXT: ## %bb.11: ## %cond.load13 -; SSE2-NEXT: movhpd {{.*#+}} xmm3 = xmm3[0],mem[0] +; SSE2-NEXT: movhps {{.*#+}} xmm3 = xmm3[0,1],mem[0,1] ; SSE2-NEXT: LBB5_12: ## %else14 ; SSE2-NEXT: pxor %xmm5, %xmm5 ; SSE2-NEXT: pcmpeqw %xmm5, %xmm0 @@ -492,18 +492,18 @@ define <8 x double> @load_v8f64_v8i16(<8 x i16> %trigger, <8 x double>* %addr, < ; SSE2-NEXT: testb $1, %al ; SSE2-NEXT: je LBB5_14 ; SSE2-NEXT: ## %bb.13: ## %cond.load16 -; SSE2-NEXT: movlpd {{.*#+}} xmm4 = mem[0],xmm4[1] +; SSE2-NEXT: movlps {{.*#+}} xmm4 = mem[0,1],xmm4[2,3] ; SSE2-NEXT: LBB5_14: ## %else17 ; SSE2-NEXT: pextrw $7, %xmm0, %eax ; SSE2-NEXT: testb $1, %al ; SSE2-NEXT: je LBB5_16 ; SSE2-NEXT: ## %bb.15: ## %cond.load19 -; SSE2-NEXT: movhpd {{.*#+}} xmm4 = xmm4[0],mem[0] +; SSE2-NEXT: movhps {{.*#+}} xmm4 = xmm4[0,1],mem[0,1] ; SSE2-NEXT: LBB5_16: ## %else20 -; SSE2-NEXT: movapd %xmm1, %xmm0 -; SSE2-NEXT: movapd %xmm2, %xmm1 -; SSE2-NEXT: movapd %xmm3, %xmm2 -; SSE2-NEXT: movapd %xmm4, %xmm3 +; SSE2-NEXT: movaps %xmm1, %xmm0 +; SSE2-NEXT: movaps %xmm2, %xmm1 +; SSE2-NEXT: movaps %xmm3, %xmm2 +; SSE2-NEXT: movaps %xmm4, %xmm3 ; SSE2-NEXT: retq ; ; SSE42-LABEL: load_v8f64_v8i16: @@ -514,13 +514,13 @@ define <8 x double> @load_v8f64_v8i16(<8 x i16> %trigger, <8 x double>* %addr, < ; SSE42-NEXT: testb $1, %al ; SSE42-NEXT: je LBB5_2 ; SSE42-NEXT: ## %bb.1: ## %cond.load -; SSE42-NEXT: movlpd {{.*#+}} xmm1 = mem[0],xmm1[1] +; SSE42-NEXT: movlps {{.*#+}} xmm1 = mem[0,1],xmm1[2,3] ; SSE42-NEXT: LBB5_2: ## %else ; SSE42-NEXT: pextrb $2, %xmm5, %eax ; SSE42-NEXT: testb $1, %al ; SSE42-NEXT: je LBB5_4 ; SSE42-NEXT: ## %bb.3: ## %cond.load1 -; SSE42-NEXT: movhpd {{.*#+}} xmm1 = xmm1[0],mem[0] +; SSE42-NEXT: movhps {{.*#+}} xmm1 = xmm1[0,1],mem[0,1] ; SSE42-NEXT: LBB5_4: ## 
%else2 ; SSE42-NEXT: pxor %xmm5, %xmm5 ; SSE42-NEXT: pcmpeqw %xmm0, %xmm5 @@ -528,13 +528,13 @@ define <8 x double> @load_v8f64_v8i16(<8 x i16> %trigger, <8 x double>* %addr, < ; SSE42-NEXT: testb $1, %al ; SSE42-NEXT: je LBB5_6 ; SSE42-NEXT: ## %bb.5: ## %cond.load4 -; SSE42-NEXT: movlpd {{.*#+}} xmm2 = mem[0],xmm2[1] +; SSE42-NEXT: movlps {{.*#+}} xmm2 = mem[0,1],xmm2[2,3] ; SSE42-NEXT: LBB5_6: ## %else5 ; SSE42-NEXT: pextrb $6, %xmm5, %eax ; SSE42-NEXT: testb $1, %al ; SSE42-NEXT: je LBB5_8 ; SSE42-NEXT: ## %bb.7: ## %cond.load7 -; SSE42-NEXT: movhpd {{.*#+}} xmm2 = xmm2[0],mem[0] +; SSE42-NEXT: movhps {{.*#+}} xmm2 = xmm2[0,1],mem[0,1] ; SSE42-NEXT: LBB5_8: ## %else8 ; SSE42-NEXT: pxor %xmm5, %xmm5 ; SSE42-NEXT: pcmpeqw %xmm0, %xmm5 @@ -542,13 +542,13 @@ define <8 x double> @load_v8f64_v8i16(<8 x i16> %trigger, <8 x double>* %addr, < ; SSE42-NEXT: testb $1, %al ; SSE42-NEXT: je LBB5_10 ; SSE42-NEXT: ## %bb.9: ## %cond.load10 -; SSE42-NEXT: movlpd {{.*#+}} xmm3 = mem[0],xmm3[1] +; SSE42-NEXT: movlps {{.*#+}} xmm3 = mem[0,1],xmm3[2,3] ; SSE42-NEXT: LBB5_10: ## %else11 ; SSE42-NEXT: pextrb $10, %xmm5, %eax ; SSE42-NEXT: testb $1, %al ; SSE42-NEXT: je LBB5_12 ; SSE42-NEXT: ## %bb.11: ## %cond.load13 -; SSE42-NEXT: movhpd {{.*#+}} xmm3 = xmm3[0],mem[0] +; SSE42-NEXT: movhps {{.*#+}} xmm3 = xmm3[0,1],mem[0,1] ; SSE42-NEXT: LBB5_12: ## %else14 ; SSE42-NEXT: pxor %xmm5, %xmm5 ; SSE42-NEXT: pcmpeqw %xmm5, %xmm0 @@ -556,18 +556,18 @@ define <8 x double> @load_v8f64_v8i16(<8 x i16> %trigger, <8 x double>* %addr, < ; SSE42-NEXT: testb $1, %al ; SSE42-NEXT: je LBB5_14 ; SSE42-NEXT: ## %bb.13: ## %cond.load16 -; SSE42-NEXT: movlpd {{.*#+}} xmm4 = mem[0],xmm4[1] +; SSE42-NEXT: movlps {{.*#+}} xmm4 = mem[0,1],xmm4[2,3] ; SSE42-NEXT: LBB5_14: ## %else17 ; SSE42-NEXT: pextrb $14, %xmm0, %eax ; SSE42-NEXT: testb $1, %al ; SSE42-NEXT: je LBB5_16 ; SSE42-NEXT: ## %bb.15: ## %cond.load19 -; SSE42-NEXT: movhpd {{.*#+}} xmm4 = xmm4[0],mem[0] +; SSE42-NEXT: movhps {{.*#+}} xmm4 = xmm4[0,1],mem[0,1] ; SSE42-NEXT: LBB5_16: ## %else20 -; SSE42-NEXT: movapd %xmm1, %xmm0 -; SSE42-NEXT: movapd %xmm2, %xmm1 -; SSE42-NEXT: movapd %xmm3, %xmm2 -; SSE42-NEXT: movapd %xmm4, %xmm3 +; SSE42-NEXT: movaps %xmm1, %xmm0 +; SSE42-NEXT: movaps %xmm2, %xmm1 +; SSE42-NEXT: movaps %xmm3, %xmm2 +; SSE42-NEXT: movaps %xmm4, %xmm3 ; SSE42-NEXT: retq ; ; AVX1-LABEL: load_v8f64_v8i16: @@ -646,7 +646,7 @@ define <8 x double> @load_v8f64_v8i64(<8 x i64> %trigger, <8 x double>* %addr, < ; SSE2-NEXT: testb $1, %al ; SSE2-NEXT: je LBB6_2 ; SSE2-NEXT: ## %bb.1: ## %cond.load -; SSE2-NEXT: movlpd {{.*#+}} xmm4 = mem[0],xmm4[1] +; SSE2-NEXT: movlps {{.*#+}} xmm4 = mem[0,1],xmm4[2,3] ; SSE2-NEXT: LBB6_2: ## %else ; SSE2-NEXT: psrldq {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero ; SSE2-NEXT: movd %xmm0, %eax @@ -654,7 +654,7 @@ define <8 x double> @load_v8f64_v8i64(<8 x i64> %trigger, <8 x double>* %addr, < ; SSE2-NEXT: testb $1, %al ; SSE2-NEXT: je LBB6_4 ; SSE2-NEXT: ## %bb.3: ## %cond.load1 -; SSE2-NEXT: movhpd {{.*#+}} xmm4 = xmm4[0],mem[0] +; SSE2-NEXT: movhps {{.*#+}} xmm4 = xmm4[0,1],mem[0,1] ; SSE2-NEXT: LBB6_4: ## %else2 ; SSE2-NEXT: pxor %xmm0, %xmm0 ; SSE2-NEXT: pcmpeqd %xmm0, %xmm1 @@ -664,13 +664,13 @@ define <8 x double> @load_v8f64_v8i64(<8 x i64> %trigger, <8 x double>* %addr, < ; SSE2-NEXT: testb $1, %al ; SSE2-NEXT: je LBB6_6 ; SSE2-NEXT: ## %bb.5: ## %cond.load4 -; SSE2-NEXT: movlpd {{.*#+}} xmm5 = mem[0],xmm5[1] +; SSE2-NEXT: movlps {{.*#+}} xmm5 = mem[0,1],xmm5[2,3] ; SSE2-NEXT: LBB6_6: ## 
%else5 ; SSE2-NEXT: pextrw $4, %xmm0, %eax ; SSE2-NEXT: testb $1, %al ; SSE2-NEXT: je LBB6_8 ; SSE2-NEXT: ## %bb.7: ## %cond.load7 -; SSE2-NEXT: movhpd {{.*#+}} xmm5 = xmm5[0],mem[0] +; SSE2-NEXT: movhps {{.*#+}} xmm5 = xmm5[0,1],mem[0,1] ; SSE2-NEXT: LBB6_8: ## %else8 ; SSE2-NEXT: pxor %xmm0, %xmm0 ; SSE2-NEXT: pcmpeqd %xmm0, %xmm2 @@ -680,13 +680,13 @@ define <8 x double> @load_v8f64_v8i64(<8 x i64> %trigger, <8 x double>* %addr, < ; SSE2-NEXT: testb $1, %al ; SSE2-NEXT: je LBB6_10 ; SSE2-NEXT: ## %bb.9: ## %cond.load10 -; SSE2-NEXT: movlpd {{.*#+}} xmm6 = mem[0],xmm6[1] +; SSE2-NEXT: movlps {{.*#+}} xmm6 = mem[0,1],xmm6[2,3] ; SSE2-NEXT: LBB6_10: ## %else11 ; SSE2-NEXT: pextrw $4, %xmm0, %eax ; SSE2-NEXT: testb $1, %al ; SSE2-NEXT: je LBB6_12 ; SSE2-NEXT: ## %bb.11: ## %cond.load13 -; SSE2-NEXT: movhpd {{.*#+}} xmm6 = xmm6[0],mem[0] +; SSE2-NEXT: movhps {{.*#+}} xmm6 = xmm6[0,1],mem[0,1] ; SSE2-NEXT: LBB6_12: ## %else14 ; SSE2-NEXT: pxor %xmm0, %xmm0 ; SSE2-NEXT: pcmpeqd %xmm0, %xmm3 @@ -696,18 +696,18 @@ define <8 x double> @load_v8f64_v8i64(<8 x i64> %trigger, <8 x double>* %addr, < ; SSE2-NEXT: testb $1, %al ; SSE2-NEXT: je LBB6_14 ; SSE2-NEXT: ## %bb.13: ## %cond.load16 -; SSE2-NEXT: movlpd {{.*#+}} xmm8 = mem[0],xmm8[1] +; SSE2-NEXT: movlps {{.*#+}} xmm8 = mem[0,1],xmm8[2,3] ; SSE2-NEXT: LBB6_14: ## %else17 ; SSE2-NEXT: pextrw $4, %xmm0, %eax ; SSE2-NEXT: testb $1, %al ; SSE2-NEXT: je LBB6_16 ; SSE2-NEXT: ## %bb.15: ## %cond.load19 -; SSE2-NEXT: movhpd {{.*#+}} xmm8 = xmm8[0],mem[0] +; SSE2-NEXT: movhps {{.*#+}} xmm8 = xmm8[0,1],mem[0,1] ; SSE2-NEXT: LBB6_16: ## %else20 -; SSE2-NEXT: movapd %xmm4, %xmm0 -; SSE2-NEXT: movapd %xmm5, %xmm1 -; SSE2-NEXT: movapd %xmm6, %xmm2 -; SSE2-NEXT: movapd %xmm8, %xmm3 +; SSE2-NEXT: movaps %xmm4, %xmm0 +; SSE2-NEXT: movaps %xmm5, %xmm1 +; SSE2-NEXT: movaps %xmm6, %xmm2 +; SSE2-NEXT: movaps %xmm8, %xmm3 ; SSE2-NEXT: retq ; ; SSE42-LABEL: load_v8f64_v8i64: @@ -719,13 +719,13 @@ define <8 x double> @load_v8f64_v8i64(<8 x i64> %trigger, <8 x double>* %addr, < ; SSE42-NEXT: testb $1, %al ; SSE42-NEXT: je LBB6_2 ; SSE42-NEXT: ## %bb.1: ## %cond.load -; SSE42-NEXT: movlpd {{.*#+}} xmm4 = mem[0],xmm4[1] +; SSE42-NEXT: movlps {{.*#+}} xmm4 = mem[0,1],xmm4[2,3] ; SSE42-NEXT: LBB6_2: ## %else ; SSE42-NEXT: pextrb $8, %xmm7, %eax ; SSE42-NEXT: testb $1, %al ; SSE42-NEXT: je LBB6_4 ; SSE42-NEXT: ## %bb.3: ## %cond.load1 -; SSE42-NEXT: movhpd {{.*#+}} xmm4 = xmm4[0],mem[0] +; SSE42-NEXT: movhps {{.*#+}} xmm4 = xmm4[0,1],mem[0,1] ; SSE42-NEXT: LBB6_4: ## %else2 ; SSE42-NEXT: pxor %xmm0, %xmm0 ; SSE42-NEXT: pcmpeqq %xmm0, %xmm1 @@ -733,13 +733,13 @@ define <8 x double> @load_v8f64_v8i64(<8 x i64> %trigger, <8 x double>* %addr, < ; SSE42-NEXT: testb $1, %al ; SSE42-NEXT: je LBB6_6 ; SSE42-NEXT: ## %bb.5: ## %cond.load4 -; SSE42-NEXT: movlpd {{.*#+}} xmm5 = mem[0],xmm5[1] +; SSE42-NEXT: movlps {{.*#+}} xmm5 = mem[0,1],xmm5[2,3] ; SSE42-NEXT: LBB6_6: ## %else5 ; SSE42-NEXT: pextrb $8, %xmm1, %eax ; SSE42-NEXT: testb $1, %al ; SSE42-NEXT: je LBB6_8 ; SSE42-NEXT: ## %bb.7: ## %cond.load7 -; SSE42-NEXT: movhpd {{.*#+}} xmm5 = xmm5[0],mem[0] +; SSE42-NEXT: movhps {{.*#+}} xmm5 = xmm5[0,1],mem[0,1] ; SSE42-NEXT: LBB6_8: ## %else8 ; SSE42-NEXT: pxor %xmm0, %xmm0 ; SSE42-NEXT: pcmpeqq %xmm0, %xmm2 @@ -747,13 +747,13 @@ define <8 x double> @load_v8f64_v8i64(<8 x i64> %trigger, <8 x double>* %addr, < ; SSE42-NEXT: testb $1, %al ; SSE42-NEXT: je LBB6_10 ; SSE42-NEXT: ## %bb.9: ## %cond.load10 -; SSE42-NEXT: movlpd {{.*#+}} xmm6 = mem[0],xmm6[1] +; SSE42-NEXT: movlps {{.*#+}} 
xmm6 = mem[0,1],xmm6[2,3] ; SSE42-NEXT: LBB6_10: ## %else11 ; SSE42-NEXT: pextrb $8, %xmm2, %eax ; SSE42-NEXT: testb $1, %al ; SSE42-NEXT: je LBB6_12 ; SSE42-NEXT: ## %bb.11: ## %cond.load13 -; SSE42-NEXT: movhpd {{.*#+}} xmm6 = xmm6[0],mem[0] +; SSE42-NEXT: movhps {{.*#+}} xmm6 = xmm6[0,1],mem[0,1] ; SSE42-NEXT: LBB6_12: ## %else14 ; SSE42-NEXT: pxor %xmm0, %xmm0 ; SSE42-NEXT: pcmpeqq %xmm0, %xmm3 @@ -761,18 +761,18 @@ define <8 x double> @load_v8f64_v8i64(<8 x i64> %trigger, <8 x double>* %addr, < ; SSE42-NEXT: testb $1, %al ; SSE42-NEXT: je LBB6_14 ; SSE42-NEXT: ## %bb.13: ## %cond.load16 -; SSE42-NEXT: movlpd {{.*#+}} xmm8 = mem[0],xmm8[1] +; SSE42-NEXT: movlps {{.*#+}} xmm8 = mem[0,1],xmm8[2,3] ; SSE42-NEXT: LBB6_14: ## %else17 ; SSE42-NEXT: pextrb $8, %xmm3, %eax ; SSE42-NEXT: testb $1, %al ; SSE42-NEXT: je LBB6_16 ; SSE42-NEXT: ## %bb.15: ## %cond.load19 -; SSE42-NEXT: movhpd {{.*#+}} xmm8 = xmm8[0],mem[0] +; SSE42-NEXT: movhps {{.*#+}} xmm8 = xmm8[0,1],mem[0,1] ; SSE42-NEXT: LBB6_16: ## %else20 -; SSE42-NEXT: movapd %xmm4, %xmm0 -; SSE42-NEXT: movapd %xmm5, %xmm1 -; SSE42-NEXT: movapd %xmm6, %xmm2 -; SSE42-NEXT: movapd %xmm8, %xmm3 +; SSE42-NEXT: movaps %xmm4, %xmm0 +; SSE42-NEXT: movaps %xmm5, %xmm1 +; SSE42-NEXT: movaps %xmm6, %xmm2 +; SSE42-NEXT: movaps %xmm8, %xmm3 ; SSE42-NEXT: retq ; ; AVX1-LABEL: load_v8f64_v8i64: @@ -1511,16 +1511,16 @@ define <2 x i64> @load_v2i64_v2i64(<2 x i64> %trigger, <2 x i64>* %addr, <2 x i6 ; SSE2-NEXT: testb $1, %al ; SSE2-NEXT: je LBB13_2 ; SSE2-NEXT: ## %bb.1: ## %cond.load -; SSE2-NEXT: movlpd {{.*#+}} xmm1 = mem[0],xmm1[1] +; SSE2-NEXT: movlps {{.*#+}} xmm1 = mem[0,1],xmm1[2,3] ; SSE2-NEXT: LBB13_2: ## %else ; SSE2-NEXT: pextrw $4, %xmm0, %eax ; SSE2-NEXT: testb $1, %al ; SSE2-NEXT: je LBB13_4 ; SSE2-NEXT: ## %bb.3: ## %cond.load1 ; SSE2-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero -; SSE2-NEXT: unpcklpd {{.*#+}} xmm1 = xmm1[0],xmm0[0] +; SSE2-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSE2-NEXT: LBB13_4: ## %else2 -; SSE2-NEXT: movapd %xmm1, %xmm0 +; SSE2-NEXT: movaps %xmm1, %xmm0 ; SSE2-NEXT: retq ; ; SSE42-LABEL: load_v2i64_v2i64: @@ -1591,16 +1591,16 @@ define <4 x i64> @load_v4i64_v4i64(<4 x i64> %trigger, <4 x i64>* %addr, <4 x i6 ; SSE2-NEXT: testb $1, %al ; SSE2-NEXT: je LBB14_2 ; SSE2-NEXT: ## %bb.1: ## %cond.load -; SSE2-NEXT: movlpd {{.*#+}} xmm2 = mem[0],xmm2[1] +; SSE2-NEXT: movlps {{.*#+}} xmm2 = mem[0,1],xmm2[2,3] ; SSE2-NEXT: LBB14_2: ## %else ; SSE2-NEXT: pextrw $4, %xmm0, %eax ; SSE2-NEXT: testb $1, %al ; SSE2-NEXT: je LBB14_4 ; SSE2-NEXT: ## %bb.3: ## %cond.load1 ; SSE2-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero -; SSE2-NEXT: unpcklpd {{.*#+}} xmm2 = xmm2[0],xmm0[0] +; SSE2-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] ; SSE2-NEXT: LBB14_4: ## %else2 -; SSE2-NEXT: xorpd %xmm0, %xmm0 +; SSE2-NEXT: pxor %xmm0, %xmm0 ; SSE2-NEXT: pcmpeqd %xmm0, %xmm1 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,0,3,2] ; SSE2-NEXT: pand %xmm1, %xmm0 @@ -1608,17 +1608,17 @@ define <4 x i64> @load_v4i64_v4i64(<4 x i64> %trigger, <4 x i64>* %addr, <4 x i6 ; SSE2-NEXT: testb $1, %al ; SSE2-NEXT: je LBB14_6 ; SSE2-NEXT: ## %bb.5: ## %cond.load4 -; SSE2-NEXT: movlpd {{.*#+}} xmm3 = mem[0],xmm3[1] +; SSE2-NEXT: movlps {{.*#+}} xmm3 = mem[0,1],xmm3[2,3] ; SSE2-NEXT: LBB14_6: ## %else5 ; SSE2-NEXT: pextrw $4, %xmm0, %eax ; SSE2-NEXT: testb $1, %al ; SSE2-NEXT: je LBB14_8 ; SSE2-NEXT: ## %bb.7: ## %cond.load7 ; SSE2-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero -; SSE2-NEXT: unpcklpd {{.*#+}} xmm3 = xmm3[0],xmm0[0] +; SSE2-NEXT: movlhps {{.*#+}} xmm3 = 
xmm3[0],xmm0[0] ; SSE2-NEXT: LBB14_8: ## %else8 -; SSE2-NEXT: movapd %xmm2, %xmm0 -; SSE2-NEXT: movapd %xmm3, %xmm1 +; SSE2-NEXT: movaps %xmm2, %xmm0 +; SSE2-NEXT: movaps %xmm3, %xmm1 ; SSE2-NEXT: retq ; ; SSE42-LABEL: load_v4i64_v4i64: @@ -1704,64 +1704,64 @@ define <8 x i64> @load_v8i64_v8i16(<8 x i16> %trigger, <8 x i64>* %addr, <8 x i6 ; SSE2-NEXT: testb $1, %al ; SSE2-NEXT: je LBB15_2 ; SSE2-NEXT: ## %bb.1: ## %cond.load -; SSE2-NEXT: movlpd {{.*#+}} xmm1 = mem[0],xmm1[1] +; SSE2-NEXT: movlps {{.*#+}} xmm1 = mem[0,1],xmm1[2,3] ; SSE2-NEXT: LBB15_2: ## %else ; SSE2-NEXT: shrl $16, %eax ; SSE2-NEXT: testb $1, %al ; SSE2-NEXT: je LBB15_4 ; SSE2-NEXT: ## %bb.3: ## %cond.load1 ; SSE2-NEXT: movsd {{.*#+}} xmm5 = mem[0],zero -; SSE2-NEXT: unpcklpd {{.*#+}} xmm1 = xmm1[0],xmm5[0] +; SSE2-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm5[0] ; SSE2-NEXT: LBB15_4: ## %else2 -; SSE2-NEXT: xorpd %xmm5, %xmm5 +; SSE2-NEXT: pxor %xmm5, %xmm5 ; SSE2-NEXT: pcmpeqw %xmm0, %xmm5 ; SSE2-NEXT: pextrw $2, %xmm5, %eax ; SSE2-NEXT: testb $1, %al ; SSE2-NEXT: je LBB15_6 ; SSE2-NEXT: ## %bb.5: ## %cond.load4 -; SSE2-NEXT: movlpd {{.*#+}} xmm2 = mem[0],xmm2[1] +; SSE2-NEXT: movlps {{.*#+}} xmm2 = mem[0,1],xmm2[2,3] ; SSE2-NEXT: LBB15_6: ## %else5 ; SSE2-NEXT: pextrw $3, %xmm5, %eax ; SSE2-NEXT: testb $1, %al ; SSE2-NEXT: je LBB15_8 ; SSE2-NEXT: ## %bb.7: ## %cond.load7 ; SSE2-NEXT: movsd {{.*#+}} xmm5 = mem[0],zero -; SSE2-NEXT: unpcklpd {{.*#+}} xmm2 = xmm2[0],xmm5[0] +; SSE2-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm5[0] ; SSE2-NEXT: LBB15_8: ## %else8 -; SSE2-NEXT: xorpd %xmm5, %xmm5 +; SSE2-NEXT: pxor %xmm5, %xmm5 ; SSE2-NEXT: pcmpeqw %xmm0, %xmm5 ; SSE2-NEXT: pextrw $4, %xmm5, %eax ; SSE2-NEXT: testb $1, %al ; SSE2-NEXT: je LBB15_10 ; SSE2-NEXT: ## %bb.9: ## %cond.load10 -; SSE2-NEXT: movlpd {{.*#+}} xmm3 = mem[0],xmm3[1] +; SSE2-NEXT: movlps {{.*#+}} xmm3 = mem[0,1],xmm3[2,3] ; SSE2-NEXT: LBB15_10: ## %else11 ; SSE2-NEXT: pextrw $5, %xmm5, %eax ; SSE2-NEXT: testb $1, %al ; SSE2-NEXT: je LBB15_12 ; SSE2-NEXT: ## %bb.11: ## %cond.load13 ; SSE2-NEXT: movsd {{.*#+}} xmm5 = mem[0],zero -; SSE2-NEXT: unpcklpd {{.*#+}} xmm3 = xmm3[0],xmm5[0] +; SSE2-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm5[0] ; SSE2-NEXT: LBB15_12: ## %else14 -; SSE2-NEXT: xorpd %xmm5, %xmm5 +; SSE2-NEXT: pxor %xmm5, %xmm5 ; SSE2-NEXT: pcmpeqw %xmm5, %xmm0 ; SSE2-NEXT: pextrw $6, %xmm0, %eax ; SSE2-NEXT: testb $1, %al ; SSE2-NEXT: je LBB15_14 ; SSE2-NEXT: ## %bb.13: ## %cond.load16 -; SSE2-NEXT: movlpd {{.*#+}} xmm4 = mem[0],xmm4[1] +; SSE2-NEXT: movlps {{.*#+}} xmm4 = mem[0,1],xmm4[2,3] ; SSE2-NEXT: LBB15_14: ## %else17 ; SSE2-NEXT: pextrw $7, %xmm0, %eax ; SSE2-NEXT: testb $1, %al ; SSE2-NEXT: je LBB15_16 ; SSE2-NEXT: ## %bb.15: ## %cond.load19 ; SSE2-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero -; SSE2-NEXT: unpcklpd {{.*#+}} xmm4 = xmm4[0],xmm0[0] +; SSE2-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm0[0] ; SSE2-NEXT: LBB15_16: ## %else20 -; SSE2-NEXT: movapd %xmm1, %xmm0 -; SSE2-NEXT: movapd %xmm2, %xmm1 -; SSE2-NEXT: movapd %xmm3, %xmm2 -; SSE2-NEXT: movapd %xmm4, %xmm3 +; SSE2-NEXT: movaps %xmm1, %xmm0 +; SSE2-NEXT: movaps %xmm2, %xmm1 +; SSE2-NEXT: movaps %xmm3, %xmm2 +; SSE2-NEXT: movaps %xmm4, %xmm3 ; SSE2-NEXT: retq ; ; SSE42-LABEL: load_v8i64_v8i16: @@ -1904,7 +1904,7 @@ define <8 x i64> @load_v8i64_v8i64(<8 x i64> %trigger, <8 x i64>* %addr, <8 x i6 ; SSE2-NEXT: testb $1, %al ; SSE2-NEXT: je LBB16_2 ; SSE2-NEXT: ## %bb.1: ## %cond.load -; SSE2-NEXT: movlpd {{.*#+}} xmm4 = mem[0],xmm4[1] +; SSE2-NEXT: movlps {{.*#+}} xmm4 = mem[0,1],xmm4[2,3] ; 
SSE2-NEXT: LBB16_2: ## %else ; SSE2-NEXT: psrldq {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero ; SSE2-NEXT: movd %xmm0, %eax @@ -1913,9 +1913,9 @@ define <8 x i64> @load_v8i64_v8i64(<8 x i64> %trigger, <8 x i64>* %addr, <8 x i6 ; SSE2-NEXT: je LBB16_4 ; SSE2-NEXT: ## %bb.3: ## %cond.load1 ; SSE2-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero -; SSE2-NEXT: unpcklpd {{.*#+}} xmm4 = xmm4[0],xmm0[0] +; SSE2-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm0[0] ; SSE2-NEXT: LBB16_4: ## %else2 -; SSE2-NEXT: xorpd %xmm0, %xmm0 +; SSE2-NEXT: pxor %xmm0, %xmm0 ; SSE2-NEXT: pcmpeqd %xmm0, %xmm1 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,0,3,2] ; SSE2-NEXT: pand %xmm1, %xmm0 @@ -1923,16 +1923,16 @@ define <8 x i64> @load_v8i64_v8i64(<8 x i64> %trigger, <8 x i64>* %addr, <8 x i6 ; SSE2-NEXT: testb $1, %al ; SSE2-NEXT: je LBB16_6 ; SSE2-NEXT: ## %bb.5: ## %cond.load4 -; SSE2-NEXT: movlpd {{.*#+}} xmm5 = mem[0],xmm5[1] +; SSE2-NEXT: movlps {{.*#+}} xmm5 = mem[0,1],xmm5[2,3] ; SSE2-NEXT: LBB16_6: ## %else5 ; SSE2-NEXT: pextrw $4, %xmm0, %eax ; SSE2-NEXT: testb $1, %al ; SSE2-NEXT: je LBB16_8 ; SSE2-NEXT: ## %bb.7: ## %cond.load7 ; SSE2-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero -; SSE2-NEXT: unpcklpd {{.*#+}} xmm5 = xmm5[0],xmm0[0] +; SSE2-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm0[0] ; SSE2-NEXT: LBB16_8: ## %else8 -; SSE2-NEXT: xorpd %xmm0, %xmm0 +; SSE2-NEXT: pxor %xmm0, %xmm0 ; SSE2-NEXT: pcmpeqd %xmm0, %xmm2 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,0,3,2] ; SSE2-NEXT: pand %xmm2, %xmm0 @@ -1940,16 +1940,16 @@ define <8 x i64> @load_v8i64_v8i64(<8 x i64> %trigger, <8 x i64>* %addr, <8 x i6 ; SSE2-NEXT: testb $1, %al ; SSE2-NEXT: je LBB16_10 ; SSE2-NEXT: ## %bb.9: ## %cond.load10 -; SSE2-NEXT: movlpd {{.*#+}} xmm6 = mem[0],xmm6[1] +; SSE2-NEXT: movlps {{.*#+}} xmm6 = mem[0,1],xmm6[2,3] ; SSE2-NEXT: LBB16_10: ## %else11 ; SSE2-NEXT: pextrw $4, %xmm0, %eax ; SSE2-NEXT: testb $1, %al ; SSE2-NEXT: je LBB16_12 ; SSE2-NEXT: ## %bb.11: ## %cond.load13 ; SSE2-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero -; SSE2-NEXT: unpcklpd {{.*#+}} xmm6 = xmm6[0],xmm0[0] +; SSE2-NEXT: movlhps {{.*#+}} xmm6 = xmm6[0],xmm0[0] ; SSE2-NEXT: LBB16_12: ## %else14 -; SSE2-NEXT: xorpd %xmm0, %xmm0 +; SSE2-NEXT: pxor %xmm0, %xmm0 ; SSE2-NEXT: pcmpeqd %xmm0, %xmm3 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,0,3,2] ; SSE2-NEXT: pand %xmm3, %xmm0 @@ -1957,19 +1957,19 @@ define <8 x i64> @load_v8i64_v8i64(<8 x i64> %trigger, <8 x i64>* %addr, <8 x i6 ; SSE2-NEXT: testb $1, %al ; SSE2-NEXT: je LBB16_14 ; SSE2-NEXT: ## %bb.13: ## %cond.load16 -; SSE2-NEXT: movlpd {{.*#+}} xmm8 = mem[0],xmm8[1] +; SSE2-NEXT: movlps {{.*#+}} xmm8 = mem[0,1],xmm8[2,3] ; SSE2-NEXT: LBB16_14: ## %else17 ; SSE2-NEXT: pextrw $4, %xmm0, %eax ; SSE2-NEXT: testb $1, %al ; SSE2-NEXT: je LBB16_16 ; SSE2-NEXT: ## %bb.15: ## %cond.load19 -; SSE2-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero -; SSE2-NEXT: unpcklpd {{.*#+}} xmm8 = xmm8[0],xmm0[0] +; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm8 = xmm8[0],xmm0[0] ; SSE2-NEXT: LBB16_16: ## %else20 -; SSE2-NEXT: movapd %xmm4, %xmm0 -; SSE2-NEXT: movapd %xmm5, %xmm1 -; SSE2-NEXT: movapd %xmm6, %xmm2 -; SSE2-NEXT: movapd %xmm8, %xmm3 +; SSE2-NEXT: movaps %xmm4, %xmm0 +; SSE2-NEXT: movaps %xmm5, %xmm1 +; SSE2-NEXT: movaps %xmm6, %xmm2 +; SSE2-NEXT: movdqa %xmm8, %xmm3 ; SSE2-NEXT: retq ; ; SSE42-LABEL: load_v8i64_v8i64: @@ -6686,12 +6686,12 @@ define <4 x float> @mload_constmask_v4f32_all(<4 x float>* %addr) { define <2 x double> @mload_constmask_v2f64(<2 x double>* %addr, <2 x double> 
%dst) { ; SSE-LABEL: mload_constmask_v2f64: ; SSE: ## %bb.0: -; SSE-NEXT: movhpd {{.*#+}} xmm0 = xmm0[0],mem[0] +; SSE-NEXT: movhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1] ; SSE-NEXT: retq ; ; AVX-LABEL: mload_constmask_v2f64: ; AVX: ## %bb.0: -; AVX-NEXT: vmovhpd {{.*#+}} xmm0 = xmm0[0],mem[0] +; AVX-NEXT: vmovhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1] ; AVX-NEXT: retq %res = call <2 x double> @llvm.masked.load.v2f64.p0v2f64(<2 x double>* %addr, i32 4, <2 x i1> , <2 x double> %dst) ret <2 x double> %res @@ -6887,7 +6887,7 @@ define <4 x double> @mload_constmask_v4f64(<4 x double>* %addr, <4 x double> %ds ; SSE-LABEL: mload_constmask_v4f64: ; SSE: ## %bb.0: ; SSE-NEXT: movups (%rdi), %xmm0 -; SSE-NEXT: movlpd {{.*#+}} xmm1 = mem[0],xmm1[1] +; SSE-NEXT: movlps {{.*#+}} xmm1 = mem[0,1],xmm1[2,3] ; SSE-NEXT: retq ; ; AVX1OR2-LABEL: mload_constmask_v4f64: @@ -6982,7 +6982,7 @@ define <8 x i32> @mload_constmask_v8i32(<8 x i32>* %addr, <8 x i32> %dst) { define <4 x i64> @mload_constmask_v4i64(<4 x i64>* %addr, <4 x i64> %dst) { ; SSE2-LABEL: mload_constmask_v4i64: ; SSE2: ## %bb.0: -; SSE2-NEXT: movlpd {{.*#+}} xmm0 = mem[0],xmm0[1] +; SSE2-NEXT: movlps {{.*#+}} xmm0 = mem[0,1],xmm0[2,3] ; SSE2-NEXT: movsd {{.*#+}} xmm2 = mem[0],zero ; SSE2-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] ; SSE2-NEXT: retq @@ -7030,8 +7030,8 @@ define <8 x double> @mload_constmask_v8f64(<8 x double>* %addr, <8 x double> %ds ; SSE-LABEL: mload_constmask_v8f64: ; SSE: ## %bb.0: ; SSE-NEXT: movups (%rdi), %xmm0 -; SSE-NEXT: movlpd {{.*#+}} xmm1 = mem[0],xmm1[1] -; SSE-NEXT: movhpd {{.*#+}} xmm3 = xmm3[0],mem[0] +; SSE-NEXT: movlps {{.*#+}} xmm1 = mem[0,1],xmm1[2,3] +; SSE-NEXT: movhps {{.*#+}} xmm3 = xmm3[0,1],mem[0,1] ; SSE-NEXT: retq ; ; AVX1OR2-LABEL: mload_constmask_v8f64: @@ -7199,7 +7199,7 @@ define <4 x float> @load_one_mask_bit_set2(<4 x float>* %addr, <4 x float> %val) define <4 x i64> @load_one_mask_bit_set3(<4 x i64>* %addr, <4 x i64> %val) { ; SSE2-LABEL: load_one_mask_bit_set3: ; SSE2: ## %bb.0: -; SSE2-NEXT: movlpd {{.*#+}} xmm1 = mem[0],xmm1[1] +; SSE2-NEXT: movlps {{.*#+}} xmm1 = mem[0,1],xmm1[2,3] ; SSE2-NEXT: retq ; ; SSE42-LABEL: load_one_mask_bit_set3: @@ -7236,13 +7236,13 @@ define <4 x i64> @load_one_mask_bit_set3(<4 x i64>* %addr, <4 x i64> %val) { define <4 x double> @load_one_mask_bit_set4(<4 x double>* %addr, <4 x double> %val) { ; SSE-LABEL: load_one_mask_bit_set4: ; SSE: ## %bb.0: -; SSE-NEXT: movhpd {{.*#+}} xmm1 = xmm1[0],mem[0] +; SSE-NEXT: movhps {{.*#+}} xmm1 = xmm1[0,1],mem[0,1] ; SSE-NEXT: retq ; ; AVX-LABEL: load_one_mask_bit_set4: ; AVX: ## %bb.0: ; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX-NEXT: vmovhpd {{.*#+}} xmm1 = xmm1[0],mem[0] +; AVX-NEXT: vmovhps {{.*#+}} xmm1 = xmm1[0,1],mem[0,1] ; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX-NEXT: retq %res = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* %addr, i32 4, <4 x i1> , <4 x double> %val) @@ -7254,20 +7254,20 @@ define <4 x double> @load_one_mask_bit_set4(<4 x double>* %addr, <4 x double> %v define <8 x double> @load_one_mask_bit_set5(<8 x double>* %addr, <8 x double> %val) { ; SSE-LABEL: load_one_mask_bit_set5: ; SSE: ## %bb.0: -; SSE-NEXT: movhpd {{.*#+}} xmm3 = xmm3[0],mem[0] +; SSE-NEXT: movhps {{.*#+}} xmm3 = xmm3[0,1],mem[0,1] ; SSE-NEXT: retq ; ; AVX1OR2-LABEL: load_one_mask_bit_set5: ; AVX1OR2: ## %bb.0: ; AVX1OR2-NEXT: vextractf128 $1, %ymm1, %xmm2 -; AVX1OR2-NEXT: vmovhpd {{.*#+}} xmm2 = xmm2[0],mem[0] +; AVX1OR2-NEXT: vmovhps {{.*#+}} xmm2 = xmm2[0,1],mem[0,1] ; AVX1OR2-NEXT: vinsertf128 $1, 
%xmm2, %ymm1, %ymm1 ; AVX1OR2-NEXT: retq ; ; AVX512-LABEL: load_one_mask_bit_set5: ; AVX512: ## %bb.0: ; AVX512-NEXT: vextractf32x4 $3, %zmm0, %xmm1 -; AVX512-NEXT: vmovhpd {{.*#+}} xmm1 = xmm1[0],mem[0] +; AVX512-NEXT: vmovhps {{.*#+}} xmm1 = xmm1[0,1],mem[0,1] ; AVX512-NEXT: vinsertf32x4 $3, %xmm1, %zmm0, %zmm0 ; AVX512-NEXT: retq %res = call <8 x double> @llvm.masked.load.v8f64.p0v8f64(<8 x double>* %addr, i32 4, <8 x i1> , <8 x double> %val) diff --git a/test/CodeGen/X86/masked_store.ll b/test/CodeGen/X86/masked_store.ll index a6d4016ffb9..0e715e898bb 100644 --- a/test/CodeGen/X86/masked_store.ll +++ b/test/CodeGen/X86/masked_store.ll @@ -50,13 +50,13 @@ define void @store_v2f64_v2i64(<2 x i64> %trigger, <2 x double>* %addr, <2 x dou ; SSE2-NEXT: testb $1, %al ; SSE2-NEXT: je LBB1_2 ; SSE2-NEXT: ## %bb.1: ## %cond.store -; SSE2-NEXT: movlpd %xmm1, (%rdi) +; SSE2-NEXT: movlps %xmm1, (%rdi) ; SSE2-NEXT: LBB1_2: ## %else ; SSE2-NEXT: pextrw $4, %xmm0, %eax ; SSE2-NEXT: testb $1, %al ; SSE2-NEXT: je LBB1_4 ; SSE2-NEXT: ## %bb.3: ## %cond.store1 -; SSE2-NEXT: movhpd %xmm1, 8(%rdi) +; SSE2-NEXT: movhps %xmm1, 8(%rdi) ; SSE2-NEXT: LBB1_4: ## %else2 ; SSE2-NEXT: retq ; @@ -68,13 +68,13 @@ define void @store_v2f64_v2i64(<2 x i64> %trigger, <2 x double>* %addr, <2 x dou ; SSE4-NEXT: testb $1, %al ; SSE4-NEXT: je LBB1_2 ; SSE4-NEXT: ## %bb.1: ## %cond.store -; SSE4-NEXT: movlpd %xmm1, (%rdi) +; SSE4-NEXT: movlps %xmm1, (%rdi) ; SSE4-NEXT: LBB1_2: ## %else ; SSE4-NEXT: pextrb $8, %xmm2, %eax ; SSE4-NEXT: testb $1, %al ; SSE4-NEXT: je LBB1_4 ; SSE4-NEXT: ## %bb.3: ## %cond.store1 -; SSE4-NEXT: movhpd %xmm1, 8(%rdi) +; SSE4-NEXT: movhps %xmm1, 8(%rdi) ; SSE4-NEXT: LBB1_4: ## %else2 ; SSE4-NEXT: retq ; @@ -128,13 +128,13 @@ define void @store_v4f64_v4i64(<4 x i64> %trigger, <4 x double>* %addr, <4 x dou ; SSE2-NEXT: testb $1, %al ; SSE2-NEXT: je LBB2_2 ; SSE2-NEXT: ## %bb.1: ## %cond.store -; SSE2-NEXT: movlpd %xmm2, (%rdi) +; SSE2-NEXT: movlps %xmm2, (%rdi) ; SSE2-NEXT: LBB2_2: ## %else ; SSE2-NEXT: pextrw $4, %xmm0, %eax ; SSE2-NEXT: testb $1, %al ; SSE2-NEXT: je LBB2_4 ; SSE2-NEXT: ## %bb.3: ## %cond.store1 -; SSE2-NEXT: movhpd %xmm2, 8(%rdi) +; SSE2-NEXT: movhps %xmm2, 8(%rdi) ; SSE2-NEXT: LBB2_4: ## %else2 ; SSE2-NEXT: pxor %xmm4, %xmm1 ; SSE2-NEXT: movdqa %xmm4, %xmm0 @@ -148,13 +148,13 @@ define void @store_v4f64_v4i64(<4 x i64> %trigger, <4 x double>* %addr, <4 x dou ; SSE2-NEXT: testb $1, %al ; SSE2-NEXT: je LBB2_6 ; SSE2-NEXT: ## %bb.5: ## %cond.store3 -; SSE2-NEXT: movlpd %xmm3, 16(%rdi) +; SSE2-NEXT: movlps %xmm3, 16(%rdi) ; SSE2-NEXT: LBB2_6: ## %else4 ; SSE2-NEXT: pextrw $4, %xmm0, %eax ; SSE2-NEXT: testb $1, %al ; SSE2-NEXT: je LBB2_8 ; SSE2-NEXT: ## %bb.7: ## %cond.store5 -; SSE2-NEXT: movhpd %xmm3, 24(%rdi) +; SSE2-NEXT: movhps %xmm3, 24(%rdi) ; SSE2-NEXT: LBB2_8: ## %else6 ; SSE2-NEXT: retq ; @@ -166,13 +166,13 @@ define void @store_v4f64_v4i64(<4 x i64> %trigger, <4 x double>* %addr, <4 x dou ; SSE4-NEXT: testb $1, %al ; SSE4-NEXT: je LBB2_2 ; SSE4-NEXT: ## %bb.1: ## %cond.store -; SSE4-NEXT: movlpd %xmm2, (%rdi) +; SSE4-NEXT: movlps %xmm2, (%rdi) ; SSE4-NEXT: LBB2_2: ## %else ; SSE4-NEXT: pextrb $8, %xmm4, %eax ; SSE4-NEXT: testb $1, %al ; SSE4-NEXT: je LBB2_4 ; SSE4-NEXT: ## %bb.3: ## %cond.store1 -; SSE4-NEXT: movhpd %xmm2, 8(%rdi) +; SSE4-NEXT: movhps %xmm2, 8(%rdi) ; SSE4-NEXT: LBB2_4: ## %else2 ; SSE4-NEXT: pxor %xmm0, %xmm0 ; SSE4-NEXT: pcmpgtq %xmm1, %xmm0 @@ -180,13 +180,13 @@ define void @store_v4f64_v4i64(<4 x i64> %trigger, <4 x double>* %addr, <4 x dou ; SSE4-NEXT: 
testb $1, %al ; SSE4-NEXT: je LBB2_6 ; SSE4-NEXT: ## %bb.5: ## %cond.store3 -; SSE4-NEXT: movlpd %xmm3, 16(%rdi) +; SSE4-NEXT: movlps %xmm3, 16(%rdi) ; SSE4-NEXT: LBB2_6: ## %else4 ; SSE4-NEXT: pextrb $8, %xmm0, %eax ; SSE4-NEXT: testb $1, %al ; SSE4-NEXT: je LBB2_8 ; SSE4-NEXT: ## %bb.7: ## %cond.store5 -; SSE4-NEXT: movhpd %xmm3, 24(%rdi) +; SSE4-NEXT: movhps %xmm3, 24(%rdi) ; SSE4-NEXT: LBB2_8: ## %else6 ; SSE4-NEXT: retq ; @@ -5019,13 +5019,13 @@ define void @one_mask_bit_set3(<4 x i64>* %addr, <4 x i64> %val) { define void @one_mask_bit_set4(<4 x double>* %addr, <4 x double> %val) { ; SSE-LABEL: one_mask_bit_set4: ; SSE: ## %bb.0: -; SSE-NEXT: movhpd %xmm1, 24(%rdi) +; SSE-NEXT: movhps %xmm1, 24(%rdi) ; SSE-NEXT: retq ; ; AVX-LABEL: one_mask_bit_set4: ; AVX: ## %bb.0: ; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX-NEXT: vmovhpd %xmm0, 24(%rdi) +; AVX-NEXT: vmovhps %xmm0, 24(%rdi) ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> %val, <4 x double>* %addr, i32 4, <4 x i1>) @@ -5066,25 +5066,25 @@ define void @masked_store_bool_mask_demand_trunc_sext(<4 x double> %x, <4 x doub ; SSE2-NEXT: testb $1, %al ; SSE2-NEXT: je LBB23_2 ; SSE2-NEXT: ## %bb.1: ## %cond.store -; SSE2-NEXT: movlpd %xmm0, (%rdi) +; SSE2-NEXT: movlps %xmm0, (%rdi) ; SSE2-NEXT: LBB23_2: ## %else ; SSE2-NEXT: pextrw $2, %xmm2, %eax ; SSE2-NEXT: testb $1, %al ; SSE2-NEXT: je LBB23_4 ; SSE2-NEXT: ## %bb.3: ## %cond.store1 -; SSE2-NEXT: movhpd %xmm0, 8(%rdi) +; SSE2-NEXT: movhps %xmm0, 8(%rdi) ; SSE2-NEXT: LBB23_4: ## %else2 ; SSE2-NEXT: pextrw $4, %xmm2, %eax ; SSE2-NEXT: testb $1, %al ; SSE2-NEXT: je LBB23_6 ; SSE2-NEXT: ## %bb.5: ## %cond.store3 -; SSE2-NEXT: movlpd %xmm1, 16(%rdi) +; SSE2-NEXT: movlps %xmm1, 16(%rdi) ; SSE2-NEXT: LBB23_6: ## %else4 ; SSE2-NEXT: pextrw $6, %xmm2, %eax ; SSE2-NEXT: testb $1, %al ; SSE2-NEXT: je LBB23_8 ; SSE2-NEXT: ## %bb.7: ## %cond.store5 -; SSE2-NEXT: movhpd %xmm1, 24(%rdi) +; SSE2-NEXT: movhps %xmm1, 24(%rdi) ; SSE2-NEXT: LBB23_8: ## %else6 ; SSE2-NEXT: retq ; @@ -5094,25 +5094,25 @@ define void @masked_store_bool_mask_demand_trunc_sext(<4 x double> %x, <4 x doub ; SSE4-NEXT: testb $1, %al ; SSE4-NEXT: je LBB23_2 ; SSE4-NEXT: ## %bb.1: ## %cond.store -; SSE4-NEXT: movlpd %xmm0, (%rdi) +; SSE4-NEXT: movlps %xmm0, (%rdi) ; SSE4-NEXT: LBB23_2: ## %else ; SSE4-NEXT: pextrb $4, %xmm2, %eax ; SSE4-NEXT: testb $1, %al ; SSE4-NEXT: je LBB23_4 ; SSE4-NEXT: ## %bb.3: ## %cond.store1 -; SSE4-NEXT: movhpd %xmm0, 8(%rdi) +; SSE4-NEXT: movhps %xmm0, 8(%rdi) ; SSE4-NEXT: LBB23_4: ## %else2 ; SSE4-NEXT: pextrb $8, %xmm2, %eax ; SSE4-NEXT: testb $1, %al ; SSE4-NEXT: je LBB23_6 ; SSE4-NEXT: ## %bb.5: ## %cond.store3 -; SSE4-NEXT: movlpd %xmm1, 16(%rdi) +; SSE4-NEXT: movlps %xmm1, 16(%rdi) ; SSE4-NEXT: LBB23_6: ## %else4 ; SSE4-NEXT: pextrb $12, %xmm2, %eax ; SSE4-NEXT: testb $1, %al ; SSE4-NEXT: je LBB23_8 ; SSE4-NEXT: ## %bb.7: ## %cond.store5 -; SSE4-NEXT: movhpd %xmm1, 24(%rdi) +; SSE4-NEXT: movhps %xmm1, 24(%rdi) ; SSE4-NEXT: LBB23_8: ## %else6 ; SSE4-NEXT: retq ;