From: Craig Topper Date: Mon, 1 Jul 2019 07:09:26 +0000 (+0000) Subject: [X86] Add some additional load folding tests to vec_int_to_fp.ll/vec_int_to_fp-widen... X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=e03609b17c84f9585818cbf74e40eecde6828612;p=llvm [X86] Add some additional load folding tests to vec_int_to_fp.ll/vec_int_to_fp-widen.ll and disable the peephole pass. Also copy some missing test cases from vec_int_to_fp.ll to vec_int_to_fp-widen.ll git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@364727 91177308-0d34-0410-b5e6-96231b3b80d8 --- diff --git a/test/CodeGen/X86/vec_int_to_fp-widen.ll b/test/CodeGen/X86/vec_int_to_fp-widen.ll index 26027bcad19..0f2e35f2e29 100644 --- a/test/CodeGen/X86/vec_int_to_fp-widen.ll +++ b/test/CodeGen/X86/vec_int_to_fp-widen.ll @@ -1,18 +1,18 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=ALL,SSE,SSE2 -; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=ALL,SSE,SSE41 -; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=ALL,AVX,VEX,AVX1 -; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=ALL,AVX,VEX,AVX2 -; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=ALL,AVX,AVX512,AVX512F -; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vl | FileCheck %s --check-prefixes=ALL,AVX,AVX512,AVX512VL -; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512dq | FileCheck %s --check-prefixes=ALL,AVX,AVX512,AVX512DQ -; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512dq,+avx512vl | FileCheck %s --check-prefixes=ALL,AVX,AVX512,AVX512VLDQ +; RUN: llc < %s -disable-peephole -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=ALL,SSE,SSE2 +; RUN: llc < %s -disable-peephole -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=ALL,SSE,SSE41 +; RUN: llc < %s -disable-peephole -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=ALL,AVX,VEX,AVX1 +; RUN: llc < %s -disable-peephole -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=ALL,AVX,VEX,AVX2 +; RUN: llc < %s -disable-peephole -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=ALL,AVX,AVX512,AVX512F +; RUN: llc < %s -disable-peephole -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vl | FileCheck %s --check-prefixes=ALL,AVX,AVX512,AVX512VL +; RUN: llc < %s -disable-peephole -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512dq | FileCheck %s --check-prefixes=ALL,AVX,AVX512,AVX512DQ +; RUN: llc < %s -disable-peephole 
-x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512dq,+avx512vl | FileCheck %s --check-prefixes=ALL,AVX,AVX512,AVX512VLDQ ; ; 32-bit tests to make sure we're not doing anything stupid. -; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=i686-unknown-unknown -; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=i686-unknown-unknown -mattr=+sse -; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=i686-unknown-unknown -mattr=+sse2 -; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=i686-unknown-unknown -mattr=+sse4.1 +; RUN: llc < %s -disable-peephole -x86-experimental-vector-widening-legalization -mtriple=i686-unknown-unknown +; RUN: llc < %s -disable-peephole -x86-experimental-vector-widening-legalization -mtriple=i686-unknown-unknown -mattr=+sse +; RUN: llc < %s -disable-peephole -x86-experimental-vector-widening-legalization -mtriple=i686-unknown-unknown -mattr=+sse2 +; RUN: llc < %s -disable-peephole -x86-experimental-vector-widening-legalization -mtriple=i686-unknown-unknown -mattr=+sse4.1 ; ; Signed Integer to Double @@ -3119,6 +3119,42 @@ define <2 x double> @sitofp_volatile_load_4i32_to_2f64(<4 x i32> *%a) { ret <2 x double> %cvt } +define <2 x double> @sitofp_load_4i32_to_2f64_2(<4 x i32>* %x) { +; SSE-LABEL: sitofp_load_4i32_to_2f64_2: +; SSE: # %bb.0: +; SSE-NEXT: movaps (%rdi), %xmm0 +; SSE-NEXT: cvtdq2pd %xmm0, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: sitofp_load_4i32_to_2f64_2: +; AVX: # %bb.0: +; AVX-NEXT: vmovaps (%rdi), %xmm0 +; AVX-NEXT: vcvtdq2pd %xmm0, %xmm0 +; AVX-NEXT: retq + %a = load <4 x i32>, <4 x i32>* %x + %b = sitofp <4 x i32> %a to <4 x double> + %c = shufflevector <4 x double> %b, <4 x double> undef, <2 x i32> + ret <2 x double> %c +} + +define <2 x double> @sitofp_volatile_load_4i32_to_2f64_2(<4 x i32>* %x) { +; SSE-LABEL: sitofp_volatile_load_4i32_to_2f64_2: +; SSE: # %bb.0: +; SSE-NEXT: movaps (%rdi), %xmm0 +; SSE-NEXT: cvtdq2pd %xmm0, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: sitofp_volatile_load_4i32_to_2f64_2: +; AVX: # %bb.0: +; AVX-NEXT: vmovaps (%rdi), %xmm0 +; AVX-NEXT: vcvtdq2pd %xmm0, %xmm0 +; AVX-NEXT: retq + %a = load volatile <4 x i32>, <4 x i32>* %x + %b = sitofp <4 x i32> %a to <4 x double> + %c = shufflevector <4 x double> %b, <4 x double> undef, <2 x i32> + ret <2 x double> %c +} + define <2 x double> @sitofp_load_2i16_to_2f64(<2 x i16> *%a) { ; SSE2-LABEL: sitofp_load_2i16_to_2f64: ; SSE2: # %bb.0: @@ -3522,6 +3558,146 @@ define <2 x double> @uitofp_load_2i32_to_2f64(<2 x i32> *%a) { ret <2 x double> %cvt } +define <2 x double> @uitofp_load_4i32_to_2f64_2(<4 x i32>* %x) { +; SSE2-LABEL: uitofp_load_4i32_to_2f64_2: +; SSE2: # %bb.0: +; SSE2-NEXT: movdqa (%rdi), %xmm0 +; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [65535,0,65535,0,0,0,0,0] +; SSE2-NEXT: pand %xmm0, %xmm1 +; SSE2-NEXT: cvtdq2pd %xmm1, %xmm1 +; SSE2-NEXT: psrld $16, %xmm0 +; SSE2-NEXT: cvtdq2pd %xmm0, %xmm0 +; SSE2-NEXT: mulpd {{.*}}(%rip), %xmm0 +; SSE2-NEXT: addpd %xmm1, %xmm0 +; SSE2-NEXT: retq +; +; SSE41-LABEL: uitofp_load_4i32_to_2f64_2: +; SSE41: # %bb.0: +; SSE41-NEXT: movdqa (%rdi), %xmm0 +; SSE41-NEXT: pxor %xmm1, %xmm1 +; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4,5,6,7] +; SSE41-NEXT: cvtdq2pd %xmm1, %xmm1 +; SSE41-NEXT: psrld $16, %xmm0 +; SSE41-NEXT: cvtdq2pd %xmm0, %xmm0 +; SSE41-NEXT: mulpd {{.*}}(%rip), %xmm0 +; SSE41-NEXT: addpd %xmm1, %xmm0 +; SSE41-NEXT: retq +; +; VEX-LABEL: uitofp_load_4i32_to_2f64_2: +; VEX: # 
%bb.0: +; VEX-NEXT: vmovdqa (%rdi), %xmm0 +; VEX-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; VEX-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7] +; VEX-NEXT: vpsrld $16, %xmm0, %xmm0 +; VEX-NEXT: vcvtdq2pd %xmm1, %xmm1 +; VEX-NEXT: vcvtdq2pd %xmm0, %xmm0 +; VEX-NEXT: vmulpd {{.*}}(%rip), %xmm0, %xmm0 +; VEX-NEXT: vaddpd %xmm1, %xmm0, %xmm0 +; VEX-NEXT: retq +; +; AVX512F-LABEL: uitofp_load_4i32_to_2f64_2: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vmovaps (%rdi), %xmm0 +; AVX512F-NEXT: vcvtudq2pd %ymm0, %zmm0 +; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: uitofp_load_4i32_to_2f64_2: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vmovaps (%rdi), %xmm0 +; AVX512VL-NEXT: vcvtudq2pd %xmm0, %xmm0 +; AVX512VL-NEXT: retq +; +; AVX512DQ-LABEL: uitofp_load_4i32_to_2f64_2: +; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: vmovaps (%rdi), %xmm0 +; AVX512DQ-NEXT: vcvtudq2pd %ymm0, %zmm0 +; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 +; AVX512DQ-NEXT: vzeroupper +; AVX512DQ-NEXT: retq +; +; AVX512VLDQ-LABEL: uitofp_load_4i32_to_2f64_2: +; AVX512VLDQ: # %bb.0: +; AVX512VLDQ-NEXT: vmovaps (%rdi), %xmm0 +; AVX512VLDQ-NEXT: vcvtudq2pd %xmm0, %xmm0 +; AVX512VLDQ-NEXT: retq + %a = load <4 x i32>, <4 x i32>* %x + %b = uitofp <4 x i32> %a to <4 x double> + %c = shufflevector <4 x double> %b, <4 x double> undef, <2 x i32> + ret <2 x double> %c +} + +define <2 x double> @uitofp_volatile_load_4i32_to_2f64_2(<4 x i32>* %x) { +; SSE2-LABEL: uitofp_volatile_load_4i32_to_2f64_2: +; SSE2: # %bb.0: +; SSE2-NEXT: movdqa (%rdi), %xmm0 +; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [65535,0,65535,0,0,0,0,0] +; SSE2-NEXT: pand %xmm0, %xmm1 +; SSE2-NEXT: cvtdq2pd %xmm1, %xmm1 +; SSE2-NEXT: psrld $16, %xmm0 +; SSE2-NEXT: cvtdq2pd %xmm0, %xmm0 +; SSE2-NEXT: mulpd {{.*}}(%rip), %xmm0 +; SSE2-NEXT: addpd %xmm1, %xmm0 +; SSE2-NEXT: retq +; +; SSE41-LABEL: uitofp_volatile_load_4i32_to_2f64_2: +; SSE41: # %bb.0: +; SSE41-NEXT: movdqa (%rdi), %xmm0 +; SSE41-NEXT: pxor %xmm1, %xmm1 +; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4,5,6,7] +; SSE41-NEXT: cvtdq2pd %xmm1, %xmm1 +; SSE41-NEXT: psrld $16, %xmm0 +; SSE41-NEXT: cvtdq2pd %xmm0, %xmm0 +; SSE41-NEXT: mulpd {{.*}}(%rip), %xmm0 +; SSE41-NEXT: addpd %xmm1, %xmm0 +; SSE41-NEXT: retq +; +; VEX-LABEL: uitofp_volatile_load_4i32_to_2f64_2: +; VEX: # %bb.0: +; VEX-NEXT: vmovdqa (%rdi), %xmm0 +; VEX-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; VEX-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7] +; VEX-NEXT: vpsrld $16, %xmm0, %xmm0 +; VEX-NEXT: vcvtdq2pd %xmm1, %xmm1 +; VEX-NEXT: vcvtdq2pd %xmm0, %xmm0 +; VEX-NEXT: vmulpd {{.*}}(%rip), %xmm0, %xmm0 +; VEX-NEXT: vaddpd %xmm1, %xmm0, %xmm0 +; VEX-NEXT: retq +; +; AVX512F-LABEL: uitofp_volatile_load_4i32_to_2f64_2: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vmovaps (%rdi), %xmm0 +; AVX512F-NEXT: vcvtudq2pd %ymm0, %zmm0 +; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: uitofp_volatile_load_4i32_to_2f64_2: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vmovaps (%rdi), %xmm0 +; AVX512VL-NEXT: vcvtudq2pd %xmm0, %xmm0 +; AVX512VL-NEXT: retq +; +; AVX512DQ-LABEL: uitofp_volatile_load_4i32_to_2f64_2: +; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: vmovaps (%rdi), %xmm0 +; AVX512DQ-NEXT: vcvtudq2pd %ymm0, %zmm0 +; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 +; AVX512DQ-NEXT: vzeroupper +; 
AVX512DQ-NEXT: retq +; +; AVX512VLDQ-LABEL: uitofp_volatile_load_4i32_to_2f64_2: +; AVX512VLDQ: # %bb.0: +; AVX512VLDQ-NEXT: vmovaps (%rdi), %xmm0 +; AVX512VLDQ-NEXT: vcvtudq2pd %xmm0, %xmm0 +; AVX512VLDQ-NEXT: retq + %a = load volatile <4 x i32>, <4 x i32>* %x + %b = uitofp <4 x i32> %a to <4 x double> + %c = shufflevector <4 x double> %b, <4 x double> undef, <2 x i32> + ret <2 x double> %c +} + define <2 x double> @uitofp_load_2i16_to_2f64(<2 x i16> *%a) { ; SSE2-LABEL: uitofp_load_2i16_to_2f64: ; SSE2: # %bb.0: @@ -4339,41 +4515,41 @@ define <4 x float> @uitofp_load_4i64_to_4f32(<4 x i64> *%a) { ; SSE2-NEXT: movdqa 16(%rdi), %xmm0 ; SSE2-NEXT: movq %xmm0, %rax ; SSE2-NEXT: testq %rax, %rax -; SSE2-NEXT: js .LBB77_1 +; SSE2-NEXT: js .LBB81_1 ; SSE2-NEXT: # %bb.2: ; SSE2-NEXT: cvtsi2ss %rax, %xmm1 -; SSE2-NEXT: jmp .LBB77_3 -; SSE2-NEXT: .LBB77_1: +; SSE2-NEXT: jmp .LBB81_3 +; SSE2-NEXT: .LBB81_1: ; SSE2-NEXT: movq %rax, %rcx ; SSE2-NEXT: shrq %rcx ; SSE2-NEXT: andl $1, %eax ; SSE2-NEXT: orq %rcx, %rax ; SSE2-NEXT: cvtsi2ss %rax, %xmm1 ; SSE2-NEXT: addss %xmm1, %xmm1 -; SSE2-NEXT: .LBB77_3: +; SSE2-NEXT: .LBB81_3: ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] ; SSE2-NEXT: movq %xmm0, %rax ; SSE2-NEXT: testq %rax, %rax -; SSE2-NEXT: js .LBB77_4 +; SSE2-NEXT: js .LBB81_4 ; SSE2-NEXT: # %bb.5: ; SSE2-NEXT: cvtsi2ss %rax, %xmm3 -; SSE2-NEXT: jmp .LBB77_6 -; SSE2-NEXT: .LBB77_4: +; SSE2-NEXT: jmp .LBB81_6 +; SSE2-NEXT: .LBB81_4: ; SSE2-NEXT: movq %rax, %rcx ; SSE2-NEXT: shrq %rcx ; SSE2-NEXT: andl $1, %eax ; SSE2-NEXT: orq %rcx, %rax ; SSE2-NEXT: cvtsi2ss %rax, %xmm3 ; SSE2-NEXT: addss %xmm3, %xmm3 -; SSE2-NEXT: .LBB77_6: +; SSE2-NEXT: .LBB81_6: ; SSE2-NEXT: movq %xmm2, %rax ; SSE2-NEXT: testq %rax, %rax -; SSE2-NEXT: js .LBB77_7 +; SSE2-NEXT: js .LBB81_7 ; SSE2-NEXT: # %bb.8: ; SSE2-NEXT: xorps %xmm0, %xmm0 ; SSE2-NEXT: cvtsi2ss %rax, %xmm0 -; SSE2-NEXT: jmp .LBB77_9 -; SSE2-NEXT: .LBB77_7: +; SSE2-NEXT: jmp .LBB81_9 +; SSE2-NEXT: .LBB81_7: ; SSE2-NEXT: movq %rax, %rcx ; SSE2-NEXT: shrq %rcx ; SSE2-NEXT: andl $1, %eax @@ -4381,17 +4557,17 @@ define <4 x float> @uitofp_load_4i64_to_4f32(<4 x i64> *%a) { ; SSE2-NEXT: xorps %xmm0, %xmm0 ; SSE2-NEXT: cvtsi2ss %rax, %xmm0 ; SSE2-NEXT: addss %xmm0, %xmm0 -; SSE2-NEXT: .LBB77_9: +; SSE2-NEXT: .LBB81_9: ; SSE2-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1] ; SSE2-NEXT: movq %xmm2, %rax ; SSE2-NEXT: testq %rax, %rax -; SSE2-NEXT: js .LBB77_10 +; SSE2-NEXT: js .LBB81_10 ; SSE2-NEXT: # %bb.11: ; SSE2-NEXT: xorps %xmm2, %xmm2 ; SSE2-NEXT: cvtsi2ss %rax, %xmm2 -; SSE2-NEXT: jmp .LBB77_12 -; SSE2-NEXT: .LBB77_10: +; SSE2-NEXT: jmp .LBB81_12 +; SSE2-NEXT: .LBB81_10: ; SSE2-NEXT: movq %rax, %rcx ; SSE2-NEXT: shrq %rcx ; SSE2-NEXT: andl $1, %eax @@ -4399,7 +4575,7 @@ define <4 x float> @uitofp_load_4i64_to_4f32(<4 x i64> *%a) { ; SSE2-NEXT: xorps %xmm2, %xmm2 ; SSE2-NEXT: cvtsi2ss %rax, %xmm2 ; SSE2-NEXT: addss %xmm2, %xmm2 -; SSE2-NEXT: .LBB77_12: +; SSE2-NEXT: .LBB81_12: ; SSE2-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] ; SSE2-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; SSE2-NEXT: retq @@ -4410,26 +4586,26 @@ define <4 x float> @uitofp_load_4i64_to_4f32(<4 x i64> *%a) { ; SSE41-NEXT: movdqa 16(%rdi), %xmm1 ; SSE41-NEXT: pextrq $1, %xmm0, %rax ; SSE41-NEXT: testq %rax, %rax -; SSE41-NEXT: js .LBB77_1 +; SSE41-NEXT: js .LBB81_1 ; SSE41-NEXT: # %bb.2: ; SSE41-NEXT: cvtsi2ss %rax, %xmm2 -; SSE41-NEXT: jmp .LBB77_3 -; SSE41-NEXT: .LBB77_1: +; SSE41-NEXT: jmp .LBB81_3 +; 
SSE41-NEXT: .LBB81_1: ; SSE41-NEXT: movq %rax, %rcx ; SSE41-NEXT: shrq %rcx ; SSE41-NEXT: andl $1, %eax ; SSE41-NEXT: orq %rcx, %rax ; SSE41-NEXT: cvtsi2ss %rax, %xmm2 ; SSE41-NEXT: addss %xmm2, %xmm2 -; SSE41-NEXT: .LBB77_3: +; SSE41-NEXT: .LBB81_3: ; SSE41-NEXT: movq %xmm0, %rax ; SSE41-NEXT: testq %rax, %rax -; SSE41-NEXT: js .LBB77_4 +; SSE41-NEXT: js .LBB81_4 ; SSE41-NEXT: # %bb.5: ; SSE41-NEXT: xorps %xmm0, %xmm0 ; SSE41-NEXT: cvtsi2ss %rax, %xmm0 -; SSE41-NEXT: jmp .LBB77_6 -; SSE41-NEXT: .LBB77_4: +; SSE41-NEXT: jmp .LBB81_6 +; SSE41-NEXT: .LBB81_4: ; SSE41-NEXT: movq %rax, %rcx ; SSE41-NEXT: shrq %rcx ; SSE41-NEXT: andl $1, %eax @@ -4437,16 +4613,16 @@ define <4 x float> @uitofp_load_4i64_to_4f32(<4 x i64> *%a) { ; SSE41-NEXT: xorps %xmm0, %xmm0 ; SSE41-NEXT: cvtsi2ss %rax, %xmm0 ; SSE41-NEXT: addss %xmm0, %xmm0 -; SSE41-NEXT: .LBB77_6: +; SSE41-NEXT: .LBB81_6: ; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[2,3] ; SSE41-NEXT: movq %xmm1, %rax ; SSE41-NEXT: testq %rax, %rax -; SSE41-NEXT: js .LBB77_7 +; SSE41-NEXT: js .LBB81_7 ; SSE41-NEXT: # %bb.8: ; SSE41-NEXT: xorps %xmm2, %xmm2 ; SSE41-NEXT: cvtsi2ss %rax, %xmm2 -; SSE41-NEXT: jmp .LBB77_9 -; SSE41-NEXT: .LBB77_7: +; SSE41-NEXT: jmp .LBB81_9 +; SSE41-NEXT: .LBB81_7: ; SSE41-NEXT: movq %rax, %rcx ; SSE41-NEXT: shrq %rcx ; SSE41-NEXT: andl $1, %eax @@ -4454,17 +4630,17 @@ define <4 x float> @uitofp_load_4i64_to_4f32(<4 x i64> *%a) { ; SSE41-NEXT: xorps %xmm2, %xmm2 ; SSE41-NEXT: cvtsi2ss %rax, %xmm2 ; SSE41-NEXT: addss %xmm2, %xmm2 -; SSE41-NEXT: .LBB77_9: +; SSE41-NEXT: .LBB81_9: ; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1],xmm2[0],xmm0[3] ; SSE41-NEXT: pextrq $1, %xmm1, %rax ; SSE41-NEXT: testq %rax, %rax -; SSE41-NEXT: js .LBB77_10 +; SSE41-NEXT: js .LBB81_10 ; SSE41-NEXT: # %bb.11: ; SSE41-NEXT: xorps %xmm1, %xmm1 ; SSE41-NEXT: cvtsi2ss %rax, %xmm1 ; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0] ; SSE41-NEXT: retq -; SSE41-NEXT: .LBB77_10: +; SSE41-NEXT: .LBB81_10: ; SSE41-NEXT: movq %rax, %rcx ; SSE41-NEXT: shrq %rcx ; SSE41-NEXT: andl $1, %eax @@ -4481,56 +4657,56 @@ define <4 x float> @uitofp_load_4i64_to_4f32(<4 x i64> *%a) { ; VEX-NEXT: vmovdqa 16(%rdi), %xmm0 ; VEX-NEXT: vpextrq $1, %xmm2, %rax ; VEX-NEXT: testq %rax, %rax -; VEX-NEXT: js .LBB77_1 +; VEX-NEXT: js .LBB81_1 ; VEX-NEXT: # %bb.2: ; VEX-NEXT: vcvtsi2ss %rax, %xmm1, %xmm1 -; VEX-NEXT: jmp .LBB77_3 -; VEX-NEXT: .LBB77_1: +; VEX-NEXT: jmp .LBB81_3 +; VEX-NEXT: .LBB81_1: ; VEX-NEXT: movq %rax, %rcx ; VEX-NEXT: shrq %rcx ; VEX-NEXT: andl $1, %eax ; VEX-NEXT: orq %rcx, %rax ; VEX-NEXT: vcvtsi2ss %rax, %xmm1, %xmm1 ; VEX-NEXT: vaddss %xmm1, %xmm1, %xmm1 -; VEX-NEXT: .LBB77_3: +; VEX-NEXT: .LBB81_3: ; VEX-NEXT: vmovq %xmm2, %rax ; VEX-NEXT: testq %rax, %rax -; VEX-NEXT: js .LBB77_4 +; VEX-NEXT: js .LBB81_4 ; VEX-NEXT: # %bb.5: ; VEX-NEXT: vcvtsi2ss %rax, %xmm3, %xmm2 -; VEX-NEXT: jmp .LBB77_6 -; VEX-NEXT: .LBB77_4: +; VEX-NEXT: jmp .LBB81_6 +; VEX-NEXT: .LBB81_4: ; VEX-NEXT: movq %rax, %rcx ; VEX-NEXT: shrq %rcx ; VEX-NEXT: andl $1, %eax ; VEX-NEXT: orq %rcx, %rax ; VEX-NEXT: vcvtsi2ss %rax, %xmm3, %xmm2 ; VEX-NEXT: vaddss %xmm2, %xmm2, %xmm2 -; VEX-NEXT: .LBB77_6: +; VEX-NEXT: .LBB81_6: ; VEX-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3] ; VEX-NEXT: vmovq %xmm0, %rax ; VEX-NEXT: testq %rax, %rax -; VEX-NEXT: js .LBB77_7 +; VEX-NEXT: js .LBB81_7 ; VEX-NEXT: # %bb.8: ; VEX-NEXT: vcvtsi2ss %rax, %xmm3, %xmm2 -; VEX-NEXT: jmp .LBB77_9 -; VEX-NEXT: .LBB77_7: +; VEX-NEXT: jmp .LBB81_9 +; VEX-NEXT: .LBB81_7: ; VEX-NEXT: movq %rax, 
%rcx ; VEX-NEXT: shrq %rcx ; VEX-NEXT: andl $1, %eax ; VEX-NEXT: orq %rcx, %rax ; VEX-NEXT: vcvtsi2ss %rax, %xmm3, %xmm2 ; VEX-NEXT: vaddss %xmm2, %xmm2, %xmm2 -; VEX-NEXT: .LBB77_9: +; VEX-NEXT: .LBB81_9: ; VEX-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3] ; VEX-NEXT: vpextrq $1, %xmm0, %rax ; VEX-NEXT: testq %rax, %rax -; VEX-NEXT: js .LBB77_10 +; VEX-NEXT: js .LBB81_10 ; VEX-NEXT: # %bb.11: ; VEX-NEXT: vcvtsi2ss %rax, %xmm3, %xmm0 ; VEX-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] ; VEX-NEXT: retq -; VEX-NEXT: .LBB77_10: +; VEX-NEXT: .LBB81_10: ; VEX-NEXT: movq %rax, %rcx ; VEX-NEXT: shrq %rcx ; VEX-NEXT: andl $1, %eax @@ -4728,41 +4904,41 @@ define <8 x float> @uitofp_load_8i64_to_8f32(<8 x i64> *%a) { ; SSE2-NEXT: movdqa 48(%rdi), %xmm1 ; SSE2-NEXT: movq %xmm0, %rax ; SSE2-NEXT: testq %rax, %rax -; SSE2-NEXT: js .LBB81_1 +; SSE2-NEXT: js .LBB85_1 ; SSE2-NEXT: # %bb.2: ; SSE2-NEXT: cvtsi2ss %rax, %xmm3 -; SSE2-NEXT: jmp .LBB81_3 -; SSE2-NEXT: .LBB81_1: +; SSE2-NEXT: jmp .LBB85_3 +; SSE2-NEXT: .LBB85_1: ; SSE2-NEXT: movq %rax, %rcx ; SSE2-NEXT: shrq %rcx ; SSE2-NEXT: andl $1, %eax ; SSE2-NEXT: orq %rcx, %rax ; SSE2-NEXT: cvtsi2ss %rax, %xmm3 ; SSE2-NEXT: addss %xmm3, %xmm3 -; SSE2-NEXT: .LBB81_3: +; SSE2-NEXT: .LBB85_3: ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] ; SSE2-NEXT: movq %xmm0, %rax ; SSE2-NEXT: testq %rax, %rax -; SSE2-NEXT: js .LBB81_4 +; SSE2-NEXT: js .LBB85_4 ; SSE2-NEXT: # %bb.5: ; SSE2-NEXT: cvtsi2ss %rax, %xmm4 -; SSE2-NEXT: jmp .LBB81_6 -; SSE2-NEXT: .LBB81_4: +; SSE2-NEXT: jmp .LBB85_6 +; SSE2-NEXT: .LBB85_4: ; SSE2-NEXT: movq %rax, %rcx ; SSE2-NEXT: shrq %rcx ; SSE2-NEXT: andl $1, %eax ; SSE2-NEXT: orq %rcx, %rax ; SSE2-NEXT: cvtsi2ss %rax, %xmm4 ; SSE2-NEXT: addss %xmm4, %xmm4 -; SSE2-NEXT: .LBB81_6: +; SSE2-NEXT: .LBB85_6: ; SSE2-NEXT: movq %xmm5, %rax ; SSE2-NEXT: testq %rax, %rax -; SSE2-NEXT: js .LBB81_7 +; SSE2-NEXT: js .LBB85_7 ; SSE2-NEXT: # %bb.8: ; SSE2-NEXT: xorps %xmm0, %xmm0 ; SSE2-NEXT: cvtsi2ss %rax, %xmm0 -; SSE2-NEXT: jmp .LBB81_9 -; SSE2-NEXT: .LBB81_7: +; SSE2-NEXT: jmp .LBB85_9 +; SSE2-NEXT: .LBB85_7: ; SSE2-NEXT: movq %rax, %rcx ; SSE2-NEXT: shrq %rcx ; SSE2-NEXT: andl $1, %eax @@ -4770,30 +4946,30 @@ define <8 x float> @uitofp_load_8i64_to_8f32(<8 x i64> *%a) { ; SSE2-NEXT: xorps %xmm0, %xmm0 ; SSE2-NEXT: cvtsi2ss %rax, %xmm0 ; SSE2-NEXT: addss %xmm0, %xmm0 -; SSE2-NEXT: .LBB81_9: +; SSE2-NEXT: .LBB85_9: ; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[2,3,0,1] ; SSE2-NEXT: movq %xmm5, %rax ; SSE2-NEXT: testq %rax, %rax -; SSE2-NEXT: js .LBB81_10 +; SSE2-NEXT: js .LBB85_10 ; SSE2-NEXT: # %bb.11: ; SSE2-NEXT: cvtsi2ss %rax, %xmm6 -; SSE2-NEXT: jmp .LBB81_12 -; SSE2-NEXT: .LBB81_10: +; SSE2-NEXT: jmp .LBB85_12 +; SSE2-NEXT: .LBB85_10: ; SSE2-NEXT: movq %rax, %rcx ; SSE2-NEXT: shrq %rcx ; SSE2-NEXT: andl $1, %eax ; SSE2-NEXT: orq %rcx, %rax ; SSE2-NEXT: cvtsi2ss %rax, %xmm6 ; SSE2-NEXT: addss %xmm6, %xmm6 -; SSE2-NEXT: .LBB81_12: +; SSE2-NEXT: .LBB85_12: ; SSE2-NEXT: movq %xmm1, %rax ; SSE2-NEXT: testq %rax, %rax -; SSE2-NEXT: js .LBB81_13 +; SSE2-NEXT: js .LBB85_13 ; SSE2-NEXT: # %bb.14: ; SSE2-NEXT: xorps %xmm5, %xmm5 ; SSE2-NEXT: cvtsi2ss %rax, %xmm5 -; SSE2-NEXT: jmp .LBB81_15 -; SSE2-NEXT: .LBB81_13: +; SSE2-NEXT: jmp .LBB85_15 +; SSE2-NEXT: .LBB85_13: ; SSE2-NEXT: movq %rax, %rcx ; SSE2-NEXT: shrq %rcx ; SSE2-NEXT: andl $1, %eax @@ -4801,32 +4977,32 @@ define <8 x float> @uitofp_load_8i64_to_8f32(<8 x i64> *%a) { ; SSE2-NEXT: xorps %xmm5, %xmm5 ; SSE2-NEXT: cvtsi2ss %rax, %xmm5 ; SSE2-NEXT: addss %xmm5, %xmm5 -; SSE2-NEXT: 
.LBB81_15: +; SSE2-NEXT: .LBB85_15: ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] ; SSE2-NEXT: movq %xmm1, %rax ; SSE2-NEXT: testq %rax, %rax -; SSE2-NEXT: js .LBB81_16 +; SSE2-NEXT: js .LBB85_16 ; SSE2-NEXT: # %bb.17: ; SSE2-NEXT: cvtsi2ss %rax, %xmm7 -; SSE2-NEXT: jmp .LBB81_18 -; SSE2-NEXT: .LBB81_16: +; SSE2-NEXT: jmp .LBB85_18 +; SSE2-NEXT: .LBB85_16: ; SSE2-NEXT: movq %rax, %rcx ; SSE2-NEXT: shrq %rcx ; SSE2-NEXT: andl $1, %eax ; SSE2-NEXT: orq %rcx, %rax ; SSE2-NEXT: cvtsi2ss %rax, %xmm7 ; SSE2-NEXT: addss %xmm7, %xmm7 -; SSE2-NEXT: .LBB81_18: +; SSE2-NEXT: .LBB85_18: ; SSE2-NEXT: unpcklps {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] ; SSE2-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1] ; SSE2-NEXT: movq %xmm2, %rax ; SSE2-NEXT: testq %rax, %rax -; SSE2-NEXT: js .LBB81_19 +; SSE2-NEXT: js .LBB85_19 ; SSE2-NEXT: # %bb.20: ; SSE2-NEXT: xorps %xmm1, %xmm1 ; SSE2-NEXT: cvtsi2ss %rax, %xmm1 -; SSE2-NEXT: jmp .LBB81_21 -; SSE2-NEXT: .LBB81_19: +; SSE2-NEXT: jmp .LBB85_21 +; SSE2-NEXT: .LBB85_19: ; SSE2-NEXT: movq %rax, %rcx ; SSE2-NEXT: shrq %rcx ; SSE2-NEXT: andl $1, %eax @@ -4834,18 +5010,18 @@ define <8 x float> @uitofp_load_8i64_to_8f32(<8 x i64> *%a) { ; SSE2-NEXT: xorps %xmm1, %xmm1 ; SSE2-NEXT: cvtsi2ss %rax, %xmm1 ; SSE2-NEXT: addss %xmm1, %xmm1 -; SSE2-NEXT: .LBB81_21: +; SSE2-NEXT: .LBB85_21: ; SSE2-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm3[0] ; SSE2-NEXT: unpcklps {{.*#+}} xmm5 = xmm5[0],xmm7[0],xmm5[1],xmm7[1] ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1] ; SSE2-NEXT: movq %xmm2, %rax ; SSE2-NEXT: testq %rax, %rax -; SSE2-NEXT: js .LBB81_22 +; SSE2-NEXT: js .LBB85_22 ; SSE2-NEXT: # %bb.23: ; SSE2-NEXT: xorps %xmm2, %xmm2 ; SSE2-NEXT: cvtsi2ss %rax, %xmm2 -; SSE2-NEXT: jmp .LBB81_24 -; SSE2-NEXT: .LBB81_22: +; SSE2-NEXT: jmp .LBB85_24 +; SSE2-NEXT: .LBB85_22: ; SSE2-NEXT: movq %rax, %rcx ; SSE2-NEXT: shrq %rcx ; SSE2-NEXT: andl $1, %eax @@ -4853,7 +5029,7 @@ define <8 x float> @uitofp_load_8i64_to_8f32(<8 x i64> *%a) { ; SSE2-NEXT: xorps %xmm2, %xmm2 ; SSE2-NEXT: cvtsi2ss %rax, %xmm2 ; SSE2-NEXT: addss %xmm2, %xmm2 -; SSE2-NEXT: .LBB81_24: +; SSE2-NEXT: .LBB85_24: ; SSE2-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] ; SSE2-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm5[0] ; SSE2-NEXT: retq @@ -4866,26 +5042,26 @@ define <8 x float> @uitofp_load_8i64_to_8f32(<8 x i64> *%a) { ; SSE41-NEXT: movdqa 48(%rdi), %xmm2 ; SSE41-NEXT: pextrq $1, %xmm0, %rax ; SSE41-NEXT: testq %rax, %rax -; SSE41-NEXT: js .LBB81_1 +; SSE41-NEXT: js .LBB85_1 ; SSE41-NEXT: # %bb.2: ; SSE41-NEXT: cvtsi2ss %rax, %xmm3 -; SSE41-NEXT: jmp .LBB81_3 -; SSE41-NEXT: .LBB81_1: +; SSE41-NEXT: jmp .LBB85_3 +; SSE41-NEXT: .LBB85_1: ; SSE41-NEXT: movq %rax, %rcx ; SSE41-NEXT: shrq %rcx ; SSE41-NEXT: andl $1, %eax ; SSE41-NEXT: orq %rcx, %rax ; SSE41-NEXT: cvtsi2ss %rax, %xmm3 ; SSE41-NEXT: addss %xmm3, %xmm3 -; SSE41-NEXT: .LBB81_3: +; SSE41-NEXT: .LBB85_3: ; SSE41-NEXT: movq %xmm0, %rax ; SSE41-NEXT: testq %rax, %rax -; SSE41-NEXT: js .LBB81_4 +; SSE41-NEXT: js .LBB85_4 ; SSE41-NEXT: # %bb.5: ; SSE41-NEXT: xorps %xmm0, %xmm0 ; SSE41-NEXT: cvtsi2ss %rax, %xmm0 -; SSE41-NEXT: jmp .LBB81_6 -; SSE41-NEXT: .LBB81_4: +; SSE41-NEXT: jmp .LBB85_6 +; SSE41-NEXT: .LBB85_4: ; SSE41-NEXT: movq %rax, %rcx ; SSE41-NEXT: shrq %rcx ; SSE41-NEXT: andl $1, %eax @@ -4893,29 +5069,29 @@ define <8 x float> @uitofp_load_8i64_to_8f32(<8 x i64> *%a) { ; SSE41-NEXT: xorps %xmm0, %xmm0 ; SSE41-NEXT: cvtsi2ss %rax, %xmm0 ; SSE41-NEXT: addss %xmm0, %xmm0 -; SSE41-NEXT: .LBB81_6: +; SSE41-NEXT: .LBB85_6: ; 
SSE41-NEXT: movq %xmm4, %rax ; SSE41-NEXT: testq %rax, %rax -; SSE41-NEXT: js .LBB81_7 +; SSE41-NEXT: js .LBB85_7 ; SSE41-NEXT: # %bb.8: ; SSE41-NEXT: cvtsi2ss %rax, %xmm5 -; SSE41-NEXT: jmp .LBB81_9 -; SSE41-NEXT: .LBB81_7: +; SSE41-NEXT: jmp .LBB85_9 +; SSE41-NEXT: .LBB85_7: ; SSE41-NEXT: movq %rax, %rcx ; SSE41-NEXT: shrq %rcx ; SSE41-NEXT: andl $1, %eax ; SSE41-NEXT: orq %rcx, %rax ; SSE41-NEXT: cvtsi2ss %rax, %xmm5 ; SSE41-NEXT: addss %xmm5, %xmm5 -; SSE41-NEXT: .LBB81_9: +; SSE41-NEXT: .LBB85_9: ; SSE41-NEXT: pextrq $1, %xmm4, %rax ; SSE41-NEXT: testq %rax, %rax -; SSE41-NEXT: js .LBB81_10 +; SSE41-NEXT: js .LBB85_10 ; SSE41-NEXT: # %bb.11: ; SSE41-NEXT: xorps %xmm4, %xmm4 ; SSE41-NEXT: cvtsi2ss %rax, %xmm4 -; SSE41-NEXT: jmp .LBB81_12 -; SSE41-NEXT: .LBB81_10: +; SSE41-NEXT: jmp .LBB85_12 +; SSE41-NEXT: .LBB85_10: ; SSE41-NEXT: movq %rax, %rcx ; SSE41-NEXT: shrq %rcx ; SSE41-NEXT: andl $1, %eax @@ -4923,30 +5099,30 @@ define <8 x float> @uitofp_load_8i64_to_8f32(<8 x i64> *%a) { ; SSE41-NEXT: xorps %xmm4, %xmm4 ; SSE41-NEXT: cvtsi2ss %rax, %xmm4 ; SSE41-NEXT: addss %xmm4, %xmm4 -; SSE41-NEXT: .LBB81_12: +; SSE41-NEXT: .LBB85_12: ; SSE41-NEXT: pextrq $1, %xmm1, %rax ; SSE41-NEXT: testq %rax, %rax -; SSE41-NEXT: js .LBB81_13 +; SSE41-NEXT: js .LBB85_13 ; SSE41-NEXT: # %bb.14: ; SSE41-NEXT: cvtsi2ss %rax, %xmm6 -; SSE41-NEXT: jmp .LBB81_15 -; SSE41-NEXT: .LBB81_13: +; SSE41-NEXT: jmp .LBB85_15 +; SSE41-NEXT: .LBB85_13: ; SSE41-NEXT: movq %rax, %rcx ; SSE41-NEXT: shrq %rcx ; SSE41-NEXT: andl $1, %eax ; SSE41-NEXT: orq %rcx, %rax ; SSE41-NEXT: cvtsi2ss %rax, %xmm6 ; SSE41-NEXT: addss %xmm6, %xmm6 -; SSE41-NEXT: .LBB81_15: +; SSE41-NEXT: .LBB85_15: ; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[2,3] ; SSE41-NEXT: movq %xmm1, %rax ; SSE41-NEXT: testq %rax, %rax -; SSE41-NEXT: js .LBB81_16 +; SSE41-NEXT: js .LBB85_16 ; SSE41-NEXT: # %bb.17: ; SSE41-NEXT: xorps %xmm1, %xmm1 ; SSE41-NEXT: cvtsi2ss %rax, %xmm1 -; SSE41-NEXT: jmp .LBB81_18 -; SSE41-NEXT: .LBB81_16: +; SSE41-NEXT: jmp .LBB85_18 +; SSE41-NEXT: .LBB85_16: ; SSE41-NEXT: movq %rax, %rcx ; SSE41-NEXT: shrq %rcx ; SSE41-NEXT: andl $1, %eax @@ -4954,17 +5130,17 @@ define <8 x float> @uitofp_load_8i64_to_8f32(<8 x i64> *%a) { ; SSE41-NEXT: xorps %xmm1, %xmm1 ; SSE41-NEXT: cvtsi2ss %rax, %xmm1 ; SSE41-NEXT: addss %xmm1, %xmm1 -; SSE41-NEXT: .LBB81_18: +; SSE41-NEXT: .LBB85_18: ; SSE41-NEXT: insertps {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[2,3] ; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1],xmm5[0],xmm0[3] ; SSE41-NEXT: movq %xmm2, %rax ; SSE41-NEXT: testq %rax, %rax -; SSE41-NEXT: js .LBB81_19 +; SSE41-NEXT: js .LBB85_19 ; SSE41-NEXT: # %bb.20: ; SSE41-NEXT: xorps %xmm3, %xmm3 ; SSE41-NEXT: cvtsi2ss %rax, %xmm3 -; SSE41-NEXT: jmp .LBB81_21 -; SSE41-NEXT: .LBB81_19: +; SSE41-NEXT: jmp .LBB85_21 +; SSE41-NEXT: .LBB85_19: ; SSE41-NEXT: movq %rax, %rcx ; SSE41-NEXT: shrq %rcx ; SSE41-NEXT: andl $1, %eax @@ -4972,18 +5148,18 @@ define <8 x float> @uitofp_load_8i64_to_8f32(<8 x i64> *%a) { ; SSE41-NEXT: xorps %xmm3, %xmm3 ; SSE41-NEXT: cvtsi2ss %rax, %xmm3 ; SSE41-NEXT: addss %xmm3, %xmm3 -; SSE41-NEXT: .LBB81_21: +; SSE41-NEXT: .LBB85_21: ; SSE41-NEXT: insertps {{.*#+}} xmm1 = xmm1[0,1],xmm3[0],xmm1[3] ; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm4[0] ; SSE41-NEXT: pextrq $1, %xmm2, %rax ; SSE41-NEXT: testq %rax, %rax -; SSE41-NEXT: js .LBB81_22 +; SSE41-NEXT: js .LBB85_22 ; SSE41-NEXT: # %bb.23: ; SSE41-NEXT: xorps %xmm2, %xmm2 ; SSE41-NEXT: cvtsi2ss %rax, %xmm2 ; SSE41-NEXT: insertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[0] 
; SSE41-NEXT: retq -; SSE41-NEXT: .LBB81_22: +; SSE41-NEXT: .LBB85_22: ; SSE41-NEXT: movq %rax, %rcx ; SSE41-NEXT: shrq %rcx ; SSE41-NEXT: andl $1, %eax @@ -5002,121 +5178,121 @@ define <8 x float> @uitofp_load_8i64_to_8f32(<8 x i64> *%a) { ; VEX-NEXT: vmovdqa 48(%rdi), %xmm3 ; VEX-NEXT: vpextrq $1, %xmm4, %rax ; VEX-NEXT: testq %rax, %rax -; VEX-NEXT: js .LBB81_1 +; VEX-NEXT: js .LBB85_1 ; VEX-NEXT: # %bb.2: ; VEX-NEXT: vcvtsi2ss %rax, %xmm2, %xmm2 -; VEX-NEXT: jmp .LBB81_3 -; VEX-NEXT: .LBB81_1: +; VEX-NEXT: jmp .LBB85_3 +; VEX-NEXT: .LBB85_1: ; VEX-NEXT: movq %rax, %rcx ; VEX-NEXT: shrq %rcx ; VEX-NEXT: andl $1, %eax ; VEX-NEXT: orq %rcx, %rax ; VEX-NEXT: vcvtsi2ss %rax, %xmm2, %xmm2 ; VEX-NEXT: vaddss %xmm2, %xmm2, %xmm2 -; VEX-NEXT: .LBB81_3: +; VEX-NEXT: .LBB85_3: ; VEX-NEXT: vmovq %xmm4, %rax ; VEX-NEXT: testq %rax, %rax -; VEX-NEXT: js .LBB81_4 +; VEX-NEXT: js .LBB85_4 ; VEX-NEXT: # %bb.5: ; VEX-NEXT: vcvtsi2ss %rax, %xmm5, %xmm5 -; VEX-NEXT: jmp .LBB81_6 -; VEX-NEXT: .LBB81_4: +; VEX-NEXT: jmp .LBB85_6 +; VEX-NEXT: .LBB85_4: ; VEX-NEXT: movq %rax, %rcx ; VEX-NEXT: shrq %rcx ; VEX-NEXT: andl $1, %eax ; VEX-NEXT: orq %rcx, %rax ; VEX-NEXT: vcvtsi2ss %rax, %xmm5, %xmm4 ; VEX-NEXT: vaddss %xmm4, %xmm4, %xmm5 -; VEX-NEXT: .LBB81_6: +; VEX-NEXT: .LBB85_6: ; VEX-NEXT: vmovq %xmm3, %rax ; VEX-NEXT: testq %rax, %rax -; VEX-NEXT: js .LBB81_7 +; VEX-NEXT: js .LBB85_7 ; VEX-NEXT: # %bb.8: ; VEX-NEXT: vcvtsi2ss %rax, %xmm6, %xmm4 -; VEX-NEXT: jmp .LBB81_9 -; VEX-NEXT: .LBB81_7: +; VEX-NEXT: jmp .LBB85_9 +; VEX-NEXT: .LBB85_7: ; VEX-NEXT: movq %rax, %rcx ; VEX-NEXT: shrq %rcx ; VEX-NEXT: andl $1, %eax ; VEX-NEXT: orq %rcx, %rax ; VEX-NEXT: vcvtsi2ss %rax, %xmm6, %xmm4 ; VEX-NEXT: vaddss %xmm4, %xmm4, %xmm4 -; VEX-NEXT: .LBB81_9: +; VEX-NEXT: .LBB85_9: ; VEX-NEXT: vpextrq $1, %xmm3, %rax ; VEX-NEXT: testq %rax, %rax -; VEX-NEXT: js .LBB81_10 +; VEX-NEXT: js .LBB85_10 ; VEX-NEXT: # %bb.11: ; VEX-NEXT: vcvtsi2ss %rax, %xmm6, %xmm3 -; VEX-NEXT: jmp .LBB81_12 -; VEX-NEXT: .LBB81_10: +; VEX-NEXT: jmp .LBB85_12 +; VEX-NEXT: .LBB85_10: ; VEX-NEXT: movq %rax, %rcx ; VEX-NEXT: shrq %rcx ; VEX-NEXT: andl $1, %eax ; VEX-NEXT: orq %rcx, %rax ; VEX-NEXT: vcvtsi2ss %rax, %xmm6, %xmm3 ; VEX-NEXT: vaddss %xmm3, %xmm3, %xmm3 -; VEX-NEXT: .LBB81_12: +; VEX-NEXT: .LBB85_12: ; VEX-NEXT: vpextrq $1, %xmm1, %rax ; VEX-NEXT: testq %rax, %rax -; VEX-NEXT: js .LBB81_13 +; VEX-NEXT: js .LBB85_13 ; VEX-NEXT: # %bb.14: ; VEX-NEXT: vcvtsi2ss %rax, %xmm6, %xmm6 -; VEX-NEXT: jmp .LBB81_15 -; VEX-NEXT: .LBB81_13: +; VEX-NEXT: jmp .LBB85_15 +; VEX-NEXT: .LBB85_13: ; VEX-NEXT: movq %rax, %rcx ; VEX-NEXT: shrq %rcx ; VEX-NEXT: andl $1, %eax ; VEX-NEXT: orq %rcx, %rax ; VEX-NEXT: vcvtsi2ss %rax, %xmm6, %xmm6 ; VEX-NEXT: vaddss %xmm6, %xmm6, %xmm6 -; VEX-NEXT: .LBB81_15: +; VEX-NEXT: .LBB85_15: ; VEX-NEXT: vinsertps {{.*#+}} xmm2 = xmm5[0],xmm2[0],xmm5[2,3] ; VEX-NEXT: vmovq %xmm1, %rax ; VEX-NEXT: testq %rax, %rax -; VEX-NEXT: js .LBB81_16 +; VEX-NEXT: js .LBB85_16 ; VEX-NEXT: # %bb.17: ; VEX-NEXT: vcvtsi2ss %rax, %xmm7, %xmm1 -; VEX-NEXT: jmp .LBB81_18 -; VEX-NEXT: .LBB81_16: +; VEX-NEXT: jmp .LBB85_18 +; VEX-NEXT: .LBB85_16: ; VEX-NEXT: movq %rax, %rcx ; VEX-NEXT: shrq %rcx ; VEX-NEXT: andl $1, %eax ; VEX-NEXT: orq %rcx, %rax ; VEX-NEXT: vcvtsi2ss %rax, %xmm7, %xmm1 ; VEX-NEXT: vaddss %xmm1, %xmm1, %xmm1 -; VEX-NEXT: .LBB81_18: +; VEX-NEXT: .LBB85_18: ; VEX-NEXT: vinsertps {{.*#+}} xmm5 = xmm1[0],xmm6[0],xmm1[2,3] ; VEX-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm4[0],xmm2[3] ; VEX-NEXT: vmovq %xmm0, %rax ; VEX-NEXT: testq 
%rax, %rax -; VEX-NEXT: js .LBB81_19 +; VEX-NEXT: js .LBB85_19 ; VEX-NEXT: # %bb.20: ; VEX-NEXT: vcvtsi2ss %rax, %xmm7, %xmm2 -; VEX-NEXT: jmp .LBB81_21 -; VEX-NEXT: .LBB81_19: +; VEX-NEXT: jmp .LBB85_21 +; VEX-NEXT: .LBB85_19: ; VEX-NEXT: movq %rax, %rcx ; VEX-NEXT: shrq %rcx ; VEX-NEXT: andl $1, %eax ; VEX-NEXT: orq %rcx, %rax ; VEX-NEXT: vcvtsi2ss %rax, %xmm7, %xmm2 ; VEX-NEXT: vaddss %xmm2, %xmm2, %xmm2 -; VEX-NEXT: .LBB81_21: +; VEX-NEXT: .LBB85_21: ; VEX-NEXT: vinsertps {{.*#+}} xmm2 = xmm5[0,1],xmm2[0],xmm5[3] ; VEX-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm3[0] ; VEX-NEXT: vpextrq $1, %xmm0, %rax ; VEX-NEXT: testq %rax, %rax -; VEX-NEXT: js .LBB81_22 +; VEX-NEXT: js .LBB85_22 ; VEX-NEXT: # %bb.23: ; VEX-NEXT: vcvtsi2ss %rax, %xmm7, %xmm0 -; VEX-NEXT: jmp .LBB81_24 -; VEX-NEXT: .LBB81_22: +; VEX-NEXT: jmp .LBB85_24 +; VEX-NEXT: .LBB85_22: ; VEX-NEXT: movq %rax, %rcx ; VEX-NEXT: shrq %rcx ; VEX-NEXT: andl $1, %eax ; VEX-NEXT: orq %rcx, %rax ; VEX-NEXT: vcvtsi2ss %rax, %xmm7, %xmm0 ; VEX-NEXT: vaddss %xmm0, %xmm0, %xmm0 -; VEX-NEXT: .LBB81_24: +; VEX-NEXT: .LBB85_24: ; VEX-NEXT: vinsertps {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[0] ; VEX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; VEX-NEXT: retq @@ -5512,3 +5688,325 @@ define <4 x float> @sitofp_i64_to_4f32(<4 x float> %a0, i64 %a1) nounwind { %res = insertelement <4 x float> %a0, float %cvt, i32 0 ret <4 x float> %res } + +; Extract from int vector and convert to FP. + +define float @extract0_sitofp_v4i32_f32(<4 x i32> %x) nounwind { +; SSE-LABEL: extract0_sitofp_v4i32_f32: +; SSE: # %bb.0: +; SSE-NEXT: cvtdq2ps %xmm0, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: extract0_sitofp_v4i32_f32: +; AVX: # %bb.0: +; AVX-NEXT: vcvtdq2ps %xmm0, %xmm0 +; AVX-NEXT: retq + %e = extractelement <4 x i32> %x, i32 0 + %r = sitofp i32 %e to float + ret float %r +} + +define float @extract0_sitofp_v4i32_f32i_multiuse1(<4 x i32> %x) nounwind { +; SSE-LABEL: extract0_sitofp_v4i32_f32i_multiuse1: +; SSE: # %bb.0: +; SSE-NEXT: movd %xmm0, %eax +; SSE-NEXT: cvtdq2ps %xmm0, %xmm0 +; SSE-NEXT: incl %eax +; SSE-NEXT: cvtsi2ss %eax, %xmm1 +; SSE-NEXT: divss %xmm1, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: extract0_sitofp_v4i32_f32i_multiuse1: +; AVX: # %bb.0: +; AVX-NEXT: vmovd %xmm0, %eax +; AVX-NEXT: vcvtdq2ps %xmm0, %xmm0 +; AVX-NEXT: incl %eax +; AVX-NEXT: vcvtsi2ss %eax, %xmm1, %xmm1 +; AVX-NEXT: vdivss %xmm1, %xmm0, %xmm0 +; AVX-NEXT: retq + %e = extractelement <4 x i32> %x, i32 0 + %f = sitofp i32 %e to float + %e1 = add i32 %e, 1 + %f1 = sitofp i32 %e1 to float + %r = fdiv float %f, %f1 + ret float %r +} + +define float @extract0_sitofp_v4i32_f32_multiuse2(<4 x i32> %x, i32* %p) nounwind { +; SSE-LABEL: extract0_sitofp_v4i32_f32_multiuse2: +; SSE: # %bb.0: +; SSE-NEXT: cvtdq2ps %xmm0, %xmm1 +; SSE-NEXT: movss %xmm0, (%rdi) +; SSE-NEXT: movaps %xmm1, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: extract0_sitofp_v4i32_f32_multiuse2: +; AVX: # %bb.0: +; AVX-NEXT: vcvtdq2ps %xmm0, %xmm1 +; AVX-NEXT: vmovss %xmm0, (%rdi) +; AVX-NEXT: vmovaps %xmm1, %xmm0 +; AVX-NEXT: retq + %e = extractelement <4 x i32> %x, i32 0 + %r = sitofp i32 %e to float + store i32 %e, i32* %p + ret float %r +} + +define double @extract0_sitofp_v4i32_f64(<4 x i32> %x) nounwind { +; SSE-LABEL: extract0_sitofp_v4i32_f64: +; SSE: # %bb.0: +; SSE-NEXT: movd %xmm0, %eax +; SSE-NEXT: xorps %xmm0, %xmm0 +; SSE-NEXT: cvtsi2sd %eax, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: extract0_sitofp_v4i32_f64: +; AVX: # %bb.0: +; AVX-NEXT: vcvtdq2pd %xmm0, %xmm0 +; AVX-NEXT: retq + %e = extractelement <4 x i32> %x, 
i32 0 + %r = sitofp i32 %e to double + ret double %r +} + +define float @extract0_uitofp_v4i32_f32(<4 x i32> %x) nounwind { +; SSE-LABEL: extract0_uitofp_v4i32_f32: +; SSE: # %bb.0: +; SSE-NEXT: movd %xmm0, %eax +; SSE-NEXT: xorps %xmm0, %xmm0 +; SSE-NEXT: cvtsi2ss %rax, %xmm0 +; SSE-NEXT: retq +; +; VEX-LABEL: extract0_uitofp_v4i32_f32: +; VEX: # %bb.0: +; VEX-NEXT: vmovd %xmm0, %eax +; VEX-NEXT: vcvtsi2ss %rax, %xmm1, %xmm0 +; VEX-NEXT: retq +; +; AVX512F-LABEL: extract0_uitofp_v4i32_f32: +; AVX512F: # %bb.0: +; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 +; AVX512F-NEXT: vcvtudq2ps %zmm0, %zmm0 +; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: extract0_uitofp_v4i32_f32: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vcvtudq2ps %xmm0, %xmm0 +; AVX512VL-NEXT: retq +; +; AVX512DQ-LABEL: extract0_uitofp_v4i32_f32: +; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 +; AVX512DQ-NEXT: vcvtudq2ps %zmm0, %zmm0 +; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 +; AVX512DQ-NEXT: vzeroupper +; AVX512DQ-NEXT: retq +; +; AVX512VLDQ-LABEL: extract0_uitofp_v4i32_f32: +; AVX512VLDQ: # %bb.0: +; AVX512VLDQ-NEXT: vcvtudq2ps %xmm0, %xmm0 +; AVX512VLDQ-NEXT: retq + %e = extractelement <4 x i32> %x, i32 0 + %r = uitofp i32 %e to float + ret float %r +} + +define double @extract0_uitofp_v4i32_f64(<4 x i32> %x) nounwind { +; SSE-LABEL: extract0_uitofp_v4i32_f64: +; SSE: # %bb.0: +; SSE-NEXT: movd %xmm0, %eax +; SSE-NEXT: xorps %xmm0, %xmm0 +; SSE-NEXT: cvtsi2sd %rax, %xmm0 +; SSE-NEXT: retq +; +; VEX-LABEL: extract0_uitofp_v4i32_f64: +; VEX: # %bb.0: +; VEX-NEXT: vmovd %xmm0, %eax +; VEX-NEXT: vcvtsi2sd %rax, %xmm1, %xmm0 +; VEX-NEXT: retq +; +; AVX512F-LABEL: extract0_uitofp_v4i32_f64: +; AVX512F: # %bb.0: +; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 +; AVX512F-NEXT: vcvtudq2pd %ymm0, %zmm0 +; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: extract0_uitofp_v4i32_f64: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vcvtudq2pd %xmm0, %xmm0 +; AVX512VL-NEXT: retq +; +; AVX512DQ-LABEL: extract0_uitofp_v4i32_f64: +; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 +; AVX512DQ-NEXT: vcvtudq2pd %ymm0, %zmm0 +; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 +; AVX512DQ-NEXT: vzeroupper +; AVX512DQ-NEXT: retq +; +; AVX512VLDQ-LABEL: extract0_uitofp_v4i32_f64: +; AVX512VLDQ: # %bb.0: +; AVX512VLDQ-NEXT: vcvtudq2pd %xmm0, %xmm0 +; AVX512VLDQ-NEXT: retq + %e = extractelement <4 x i32> %x, i32 0 + %r = uitofp i32 %e to double + ret double %r +} + +; Extract non-zero element from int vector and convert to FP. 
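+; (Inferred from the CHECK lines below rather than stated in the commit message:
+; the lane-0 cases above can feed the packed cvtdq2ps/cvtdq2pd directly, but a
+; non-zero lane presumably has to be moved into lane 0 first, which is why these
+; extract3_* checks expect a pshufd (SSE2), extractps (SSE4.1), or
+; vpermilps/vextractps (AVX) ahead of the conversion.)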
+ +define float @extract3_sitofp_v4i32_f32(<4 x i32> %x) nounwind { +; SSE-LABEL: extract3_sitofp_v4i32_f32: +; SSE: # %bb.0: +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3] +; SSE-NEXT: cvtdq2ps %xmm0, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: extract3_sitofp_v4i32_f32: +; AVX: # %bb.0: +; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3] +; AVX-NEXT: vcvtdq2ps %xmm0, %xmm0 +; AVX-NEXT: retq + %e = extractelement <4 x i32> %x, i32 3 + %r = sitofp i32 %e to float + ret float %r +} + +define double @extract3_sitofp_v4i32_f64(<4 x i32> %x) nounwind { +; SSE2-LABEL: extract3_sitofp_v4i32_f64: +; SSE2: # %bb.0: +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3] +; SSE2-NEXT: movd %xmm0, %eax +; SSE2-NEXT: xorps %xmm0, %xmm0 +; SSE2-NEXT: cvtsi2sd %eax, %xmm0 +; SSE2-NEXT: retq +; +; SSE41-LABEL: extract3_sitofp_v4i32_f64: +; SSE41: # %bb.0: +; SSE41-NEXT: extractps $3, %xmm0, %eax +; SSE41-NEXT: xorps %xmm0, %xmm0 +; SSE41-NEXT: cvtsi2sd %eax, %xmm0 +; SSE41-NEXT: retq +; +; AVX-LABEL: extract3_sitofp_v4i32_f64: +; AVX: # %bb.0: +; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3] +; AVX-NEXT: vcvtdq2pd %xmm0, %xmm0 +; AVX-NEXT: retq + %e = extractelement <4 x i32> %x, i32 3 + %r = sitofp i32 %e to double + ret double %r +} + +define float @extract3_uitofp_v4i32_f32(<4 x i32> %x) nounwind { +; SSE2-LABEL: extract3_uitofp_v4i32_f32: +; SSE2: # %bb.0: +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3] +; SSE2-NEXT: movd %xmm0, %eax +; SSE2-NEXT: xorps %xmm0, %xmm0 +; SSE2-NEXT: cvtsi2ss %rax, %xmm0 +; SSE2-NEXT: retq +; +; SSE41-LABEL: extract3_uitofp_v4i32_f32: +; SSE41: # %bb.0: +; SSE41-NEXT: extractps $3, %xmm0, %eax +; SSE41-NEXT: xorps %xmm0, %xmm0 +; SSE41-NEXT: cvtsi2ss %rax, %xmm0 +; SSE41-NEXT: retq +; +; VEX-LABEL: extract3_uitofp_v4i32_f32: +; VEX: # %bb.0: +; VEX-NEXT: vextractps $3, %xmm0, %eax +; VEX-NEXT: vcvtsi2ss %rax, %xmm1, %xmm0 +; VEX-NEXT: retq +; +; AVX512F-LABEL: extract3_uitofp_v4i32_f32: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3] +; AVX512F-NEXT: vcvtudq2ps %zmm0, %zmm0 +; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: extract3_uitofp_v4i32_f32: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3] +; AVX512VL-NEXT: vcvtudq2ps %xmm0, %xmm0 +; AVX512VL-NEXT: retq +; +; AVX512DQ-LABEL: extract3_uitofp_v4i32_f32: +; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3] +; AVX512DQ-NEXT: vcvtudq2ps %zmm0, %zmm0 +; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 +; AVX512DQ-NEXT: vzeroupper +; AVX512DQ-NEXT: retq +; +; AVX512VLDQ-LABEL: extract3_uitofp_v4i32_f32: +; AVX512VLDQ: # %bb.0: +; AVX512VLDQ-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3] +; AVX512VLDQ-NEXT: vcvtudq2ps %xmm0, %xmm0 +; AVX512VLDQ-NEXT: retq + %e = extractelement <4 x i32> %x, i32 3 + %r = uitofp i32 %e to float + ret float %r +} + +define double @extract3_uitofp_v4i32_f64(<4 x i32> %x) nounwind { +; SSE2-LABEL: extract3_uitofp_v4i32_f64: +; SSE2: # %bb.0: +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3] +; SSE2-NEXT: movd %xmm0, %eax +; SSE2-NEXT: xorps %xmm0, %xmm0 +; SSE2-NEXT: cvtsi2sd %rax, %xmm0 +; SSE2-NEXT: retq +; +; SSE41-LABEL: extract3_uitofp_v4i32_f64: +; SSE41: # %bb.0: +; SSE41-NEXT: extractps $3, %xmm0, %eax +; SSE41-NEXT: xorps %xmm0, %xmm0 +; SSE41-NEXT: cvtsi2sd %rax, %xmm0 +; SSE41-NEXT: retq +; +; VEX-LABEL: extract3_uitofp_v4i32_f64: +; VEX: # %bb.0: +; VEX-NEXT: vextractps $3, %xmm0, %eax +; VEX-NEXT: 
vcvtsi2sd %rax, %xmm1, %xmm0 +; VEX-NEXT: retq +; +; AVX512F-LABEL: extract3_uitofp_v4i32_f64: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3] +; AVX512F-NEXT: vcvtudq2pd %ymm0, %zmm0 +; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: extract3_uitofp_v4i32_f64: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3] +; AVX512VL-NEXT: vcvtudq2pd %xmm0, %xmm0 +; AVX512VL-NEXT: retq +; +; AVX512DQ-LABEL: extract3_uitofp_v4i32_f64: +; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3] +; AVX512DQ-NEXT: vcvtudq2pd %ymm0, %zmm0 +; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 +; AVX512DQ-NEXT: vzeroupper +; AVX512DQ-NEXT: retq +; +; AVX512VLDQ-LABEL: extract3_uitofp_v4i32_f64: +; AVX512VLDQ: # %bb.0: +; AVX512VLDQ-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3] +; AVX512VLDQ-NEXT: vcvtudq2pd %xmm0, %xmm0 +; AVX512VLDQ-NEXT: retq + %e = extractelement <4 x i32> %x, i32 3 + %r = uitofp i32 %e to double + ret double %r +} + diff --git a/test/CodeGen/X86/vec_int_to_fp.ll b/test/CodeGen/X86/vec_int_to_fp.ll index f7d8216ed3d..18d3cab9df0 100644 --- a/test/CodeGen/X86/vec_int_to_fp.ll +++ b/test/CodeGen/X86/vec_int_to_fp.ll @@ -1,18 +1,18 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=ALL,SSE,SSE2 -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=ALL,SSE,SSE41 -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=ALL,AVX,VEX,AVX1 -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=ALL,AVX,VEX,AVX2 -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=ALL,AVX,AVX512,AVX512F -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vl | FileCheck %s --check-prefixes=ALL,AVX,AVX512,AVX512VL -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512dq | FileCheck %s --check-prefixes=ALL,AVX,AVX512,AVX512DQ -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512dq,+avx512vl | FileCheck %s --check-prefixes=ALL,AVX,AVX512,AVX512VLDQ +; RUN: llc < %s -disable-peephole -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=ALL,SSE,SSE2 +; RUN: llc < %s -disable-peephole -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=ALL,SSE,SSE41 +; RUN: llc < %s -disable-peephole -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=ALL,AVX,VEX,AVX1 +; RUN: llc < %s -disable-peephole -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=ALL,AVX,VEX,AVX2 +; RUN: llc < %s -disable-peephole -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=ALL,AVX,AVX512,AVX512F +; RUN: llc < %s -disable-peephole -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vl | FileCheck %s --check-prefixes=ALL,AVX,AVX512,AVX512VL +; RUN: llc < %s -disable-peephole -mtriple=x86_64-unknown-unknown -mattr=+avx512dq | FileCheck %s --check-prefixes=ALL,AVX,AVX512,AVX512DQ +; RUN: llc < %s -disable-peephole -mtriple=x86_64-unknown-unknown -mattr=+avx512dq,+avx512vl | FileCheck %s --check-prefixes=ALL,AVX,AVX512,AVX512VLDQ ; ; 32-bit tests to make sure we're not doing anything stupid. 
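; (Note: these i686 RUN lines are not piped into FileCheck, so they appear to act
; only as compile/no-crash tests; -disable-peephole is added to them here for
; consistency with the checked x86_64 RUN lines above.)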
-; RUN: llc < %s -mtriple=i686-unknown-unknown -; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse -; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse2 -; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse4.1 +; RUN: llc < %s -disable-peephole -mtriple=i686-unknown-unknown +; RUN: llc < %s -disable-peephole -mtriple=i686-unknown-unknown -mattr=+sse +; RUN: llc < %s -disable-peephole -mtriple=i686-unknown-unknown -mattr=+sse2 +; RUN: llc < %s -disable-peephole -mtriple=i686-unknown-unknown -mattr=+sse4.1 ; ; Signed Integer to Double @@ -3119,6 +3119,42 @@ define <2 x double> @sitofp_volatile_load_4i32_to_2f64(<4 x i32> *%a) { ret <2 x double> %cvt } +define <2 x double> @sitofp_load_4i32_to_2f64_2(<4 x i32>* %x) { +; SSE-LABEL: sitofp_load_4i32_to_2f64_2: +; SSE: # %bb.0: +; SSE-NEXT: movaps (%rdi), %xmm0 +; SSE-NEXT: cvtdq2pd %xmm0, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: sitofp_load_4i32_to_2f64_2: +; AVX: # %bb.0: +; AVX-NEXT: vmovaps (%rdi), %xmm0 +; AVX-NEXT: vcvtdq2pd %xmm0, %xmm0 +; AVX-NEXT: retq + %a = load <4 x i32>, <4 x i32>* %x + %b = sitofp <4 x i32> %a to <4 x double> + %c = shufflevector <4 x double> %b, <4 x double> undef, <2 x i32> + ret <2 x double> %c +} + +define <2 x double> @sitofp_volatile_load_4i32_to_2f64_2(<4 x i32>* %x) { +; SSE-LABEL: sitofp_volatile_load_4i32_to_2f64_2: +; SSE: # %bb.0: +; SSE-NEXT: movaps (%rdi), %xmm0 +; SSE-NEXT: cvtdq2pd %xmm0, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: sitofp_volatile_load_4i32_to_2f64_2: +; AVX: # %bb.0: +; AVX-NEXT: vmovaps (%rdi), %xmm0 +; AVX-NEXT: vcvtdq2pd %xmm0, %xmm0 +; AVX-NEXT: retq + %a = load volatile <4 x i32>, <4 x i32>* %x + %b = sitofp <4 x i32> %a to <4 x double> + %c = shufflevector <4 x double> %b, <4 x double> undef, <2 x i32> + ret <2 x double> %c +} + define <2 x double> @sitofp_load_2i16_to_2f64(<2 x i16> *%a) { ; SSE2-LABEL: sitofp_load_2i16_to_2f64: ; SSE2: # %bb.0: @@ -3520,6 +3556,146 @@ define <2 x double> @uitofp_load_2i32_to_2f64(<2 x i32> *%a) { ret <2 x double> %cvt } +define <2 x double> @uitofp_load_4i32_to_2f64_2(<4 x i32>* %x) { +; SSE2-LABEL: uitofp_load_4i32_to_2f64_2: +; SSE2: # %bb.0: +; SSE2-NEXT: movdqa (%rdi), %xmm0 +; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [65535,0,65535,0,0,0,0,0] +; SSE2-NEXT: pand %xmm0, %xmm1 +; SSE2-NEXT: cvtdq2pd %xmm1, %xmm1 +; SSE2-NEXT: psrld $16, %xmm0 +; SSE2-NEXT: cvtdq2pd %xmm0, %xmm0 +; SSE2-NEXT: mulpd {{.*}}(%rip), %xmm0 +; SSE2-NEXT: addpd %xmm1, %xmm0 +; SSE2-NEXT: retq +; +; SSE41-LABEL: uitofp_load_4i32_to_2f64_2: +; SSE41: # %bb.0: +; SSE41-NEXT: movdqa (%rdi), %xmm0 +; SSE41-NEXT: pxor %xmm1, %xmm1 +; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4,5,6,7] +; SSE41-NEXT: cvtdq2pd %xmm1, %xmm1 +; SSE41-NEXT: psrld $16, %xmm0 +; SSE41-NEXT: cvtdq2pd %xmm0, %xmm0 +; SSE41-NEXT: mulpd {{.*}}(%rip), %xmm0 +; SSE41-NEXT: addpd %xmm1, %xmm0 +; SSE41-NEXT: retq +; +; VEX-LABEL: uitofp_load_4i32_to_2f64_2: +; VEX: # %bb.0: +; VEX-NEXT: vmovdqa (%rdi), %xmm0 +; VEX-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; VEX-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7] +; VEX-NEXT: vpsrld $16, %xmm0, %xmm0 +; VEX-NEXT: vcvtdq2pd %xmm1, %xmm1 +; VEX-NEXT: vcvtdq2pd %xmm0, %xmm0 +; VEX-NEXT: vmulpd {{.*}}(%rip), %xmm0, %xmm0 +; VEX-NEXT: vaddpd %xmm1, %xmm0, %xmm0 +; VEX-NEXT: retq +; +; AVX512F-LABEL: uitofp_load_4i32_to_2f64_2: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vmovaps (%rdi), %xmm0 +; AVX512F-NEXT: vcvtudq2pd %ymm0, %zmm0 +; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed 
$zmm0 +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: uitofp_load_4i32_to_2f64_2: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vmovaps (%rdi), %xmm0 +; AVX512VL-NEXT: vcvtudq2pd %xmm0, %xmm0 +; AVX512VL-NEXT: retq +; +; AVX512DQ-LABEL: uitofp_load_4i32_to_2f64_2: +; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: vmovaps (%rdi), %xmm0 +; AVX512DQ-NEXT: vcvtudq2pd %ymm0, %zmm0 +; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 +; AVX512DQ-NEXT: vzeroupper +; AVX512DQ-NEXT: retq +; +; AVX512VLDQ-LABEL: uitofp_load_4i32_to_2f64_2: +; AVX512VLDQ: # %bb.0: +; AVX512VLDQ-NEXT: vmovaps (%rdi), %xmm0 +; AVX512VLDQ-NEXT: vcvtudq2pd %xmm0, %xmm0 +; AVX512VLDQ-NEXT: retq + %a = load <4 x i32>, <4 x i32>* %x + %b = uitofp <4 x i32> %a to <4 x double> + %c = shufflevector <4 x double> %b, <4 x double> undef, <2 x i32> + ret <2 x double> %c +} + +define <2 x double> @uitofp_volatile_load_4i32_to_2f64_2(<4 x i32>* %x) { +; SSE2-LABEL: uitofp_volatile_load_4i32_to_2f64_2: +; SSE2: # %bb.0: +; SSE2-NEXT: movdqa (%rdi), %xmm0 +; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [65535,0,65535,0,0,0,0,0] +; SSE2-NEXT: pand %xmm0, %xmm1 +; SSE2-NEXT: cvtdq2pd %xmm1, %xmm1 +; SSE2-NEXT: psrld $16, %xmm0 +; SSE2-NEXT: cvtdq2pd %xmm0, %xmm0 +; SSE2-NEXT: mulpd {{.*}}(%rip), %xmm0 +; SSE2-NEXT: addpd %xmm1, %xmm0 +; SSE2-NEXT: retq +; +; SSE41-LABEL: uitofp_volatile_load_4i32_to_2f64_2: +; SSE41: # %bb.0: +; SSE41-NEXT: movdqa (%rdi), %xmm0 +; SSE41-NEXT: pxor %xmm1, %xmm1 +; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4,5,6,7] +; SSE41-NEXT: cvtdq2pd %xmm1, %xmm1 +; SSE41-NEXT: psrld $16, %xmm0 +; SSE41-NEXT: cvtdq2pd %xmm0, %xmm0 +; SSE41-NEXT: mulpd {{.*}}(%rip), %xmm0 +; SSE41-NEXT: addpd %xmm1, %xmm0 +; SSE41-NEXT: retq +; +; VEX-LABEL: uitofp_volatile_load_4i32_to_2f64_2: +; VEX: # %bb.0: +; VEX-NEXT: vmovdqa (%rdi), %xmm0 +; VEX-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; VEX-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7] +; VEX-NEXT: vpsrld $16, %xmm0, %xmm0 +; VEX-NEXT: vcvtdq2pd %xmm1, %xmm1 +; VEX-NEXT: vcvtdq2pd %xmm0, %xmm0 +; VEX-NEXT: vmulpd {{.*}}(%rip), %xmm0, %xmm0 +; VEX-NEXT: vaddpd %xmm1, %xmm0, %xmm0 +; VEX-NEXT: retq +; +; AVX512F-LABEL: uitofp_volatile_load_4i32_to_2f64_2: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vmovaps (%rdi), %xmm0 +; AVX512F-NEXT: vcvtudq2pd %ymm0, %zmm0 +; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: uitofp_volatile_load_4i32_to_2f64_2: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vmovaps (%rdi), %xmm0 +; AVX512VL-NEXT: vcvtudq2pd %xmm0, %xmm0 +; AVX512VL-NEXT: retq +; +; AVX512DQ-LABEL: uitofp_volatile_load_4i32_to_2f64_2: +; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: vmovaps (%rdi), %xmm0 +; AVX512DQ-NEXT: vcvtudq2pd %ymm0, %zmm0 +; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 +; AVX512DQ-NEXT: vzeroupper +; AVX512DQ-NEXT: retq +; +; AVX512VLDQ-LABEL: uitofp_volatile_load_4i32_to_2f64_2: +; AVX512VLDQ: # %bb.0: +; AVX512VLDQ-NEXT: vmovaps (%rdi), %xmm0 +; AVX512VLDQ-NEXT: vcvtudq2pd %xmm0, %xmm0 +; AVX512VLDQ-NEXT: retq + %a = load volatile <4 x i32>, <4 x i32>* %x + %b = uitofp <4 x i32> %a to <4 x double> + %c = shufflevector <4 x double> %b, <4 x double> undef, <2 x i32> + ret <2 x double> %c +} + define <2 x double> @uitofp_load_2i16_to_2f64(<2 x i16> *%a) { ; SSE2-LABEL: uitofp_load_2i16_to_2f64: ; SSE2: # %bb.0: @@ -4335,41 +4511,41 @@ define <4 x float> @uitofp_load_4i64_to_4f32(<4 x i64> *%a) { ; 
SSE2-NEXT: movdqa 16(%rdi), %xmm0
; SSE2-NEXT: movq %xmm0, %rax
; SSE2-NEXT: testq %rax, %rax
-; SSE2-NEXT: js .LBB77_1
+; SSE2-NEXT: js .LBB81_1
; SSE2-NEXT: # %bb.2:
; SSE2-NEXT: cvtsi2ss %rax, %xmm1
-; SSE2-NEXT: jmp .LBB77_3
-; SSE2-NEXT: .LBB77_1:
+; SSE2-NEXT: jmp .LBB81_3
+; SSE2-NEXT: .LBB81_1:
; SSE2-NEXT: movq %rax, %rcx
; SSE2-NEXT: shrq %rcx
; SSE2-NEXT: andl $1, %eax
; SSE2-NEXT: orq %rcx, %rax
; SSE2-NEXT: cvtsi2ss %rax, %xmm1
; SSE2-NEXT: addss %xmm1, %xmm1
-; SSE2-NEXT: .LBB77_3:
+; SSE2-NEXT: .LBB81_3:
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
; SSE2-NEXT: movq %xmm0, %rax
; SSE2-NEXT: testq %rax, %rax
-; SSE2-NEXT: js .LBB77_4
+; SSE2-NEXT: js .LBB81_4
; SSE2-NEXT: # %bb.5:
; SSE2-NEXT: cvtsi2ss %rax, %xmm3
-; SSE2-NEXT: jmp .LBB77_6
-; SSE2-NEXT: .LBB77_4:
+; SSE2-NEXT: jmp .LBB81_6
+; SSE2-NEXT: .LBB81_4:
; SSE2-NEXT: movq %rax, %rcx
; SSE2-NEXT: shrq %rcx
; SSE2-NEXT: andl $1, %eax
; SSE2-NEXT: orq %rcx, %rax
; SSE2-NEXT: cvtsi2ss %rax, %xmm3
; SSE2-NEXT: addss %xmm3, %xmm3
-; SSE2-NEXT: .LBB77_6:
+; SSE2-NEXT: .LBB81_6:
; SSE2-NEXT: movq %xmm2, %rax
; SSE2-NEXT: testq %rax, %rax
-; SSE2-NEXT: js .LBB77_7
+; SSE2-NEXT: js .LBB81_7
; SSE2-NEXT: # %bb.8:
; SSE2-NEXT: xorps %xmm0, %xmm0
; SSE2-NEXT: cvtsi2ss %rax, %xmm0
-; SSE2-NEXT: jmp .LBB77_9
-; SSE2-NEXT: .LBB77_7:
+; SSE2-NEXT: jmp .LBB81_9
+; SSE2-NEXT: .LBB81_7:
; SSE2-NEXT: movq %rax, %rcx
; SSE2-NEXT: shrq %rcx
; SSE2-NEXT: andl $1, %eax
@@ -4377,17 +4553,17 @@ define <4 x float> @uitofp_load_4i64_to_4f32(<4 x i64> *%a) {
; SSE2-NEXT: xorps %xmm0, %xmm0
; SSE2-NEXT: cvtsi2ss %rax, %xmm0
; SSE2-NEXT: addss %xmm0, %xmm0
-; SSE2-NEXT: .LBB77_9:
+; SSE2-NEXT: .LBB81_9:
; SSE2-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
; SSE2-NEXT: movq %xmm2, %rax
; SSE2-NEXT: testq %rax, %rax
-; SSE2-NEXT: js .LBB77_10
+; SSE2-NEXT: js .LBB81_10
; SSE2-NEXT: # %bb.11:
; SSE2-NEXT: xorps %xmm2, %xmm2
; SSE2-NEXT: cvtsi2ss %rax, %xmm2
-; SSE2-NEXT: jmp .LBB77_12
-; SSE2-NEXT: .LBB77_10:
+; SSE2-NEXT: jmp .LBB81_12
+; SSE2-NEXT: .LBB81_10:
; SSE2-NEXT: movq %rax, %rcx
; SSE2-NEXT: shrq %rcx
; SSE2-NEXT: andl $1, %eax
@@ -4395,7 +4571,7 @@ define <4 x float> @uitofp_load_4i64_to_4f32(<4 x i64> *%a) {
; SSE2-NEXT: xorps %xmm2, %xmm2
; SSE2-NEXT: cvtsi2ss %rax, %xmm2
; SSE2-NEXT: addss %xmm2, %xmm2
-; SSE2-NEXT: .LBB77_12:
+; SSE2-NEXT: .LBB81_12:
; SSE2-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; SSE2-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE2-NEXT: retq
@@ -4406,26 +4582,26 @@ define <4 x float> @uitofp_load_4i64_to_4f32(<4 x i64> *%a) {
; SSE41-NEXT: movdqa 16(%rdi), %xmm1
; SSE41-NEXT: pextrq $1, %xmm0, %rax
; SSE41-NEXT: testq %rax, %rax
-; SSE41-NEXT: js .LBB77_1
+; SSE41-NEXT: js .LBB81_1
; SSE41-NEXT: # %bb.2:
; SSE41-NEXT: cvtsi2ss %rax, %xmm2
-; SSE41-NEXT: jmp .LBB77_3
-; SSE41-NEXT: .LBB77_1:
+; SSE41-NEXT: jmp .LBB81_3
+; SSE41-NEXT: .LBB81_1:
; SSE41-NEXT: movq %rax, %rcx
; SSE41-NEXT: shrq %rcx
; SSE41-NEXT: andl $1, %eax
; SSE41-NEXT: orq %rcx, %rax
; SSE41-NEXT: cvtsi2ss %rax, %xmm2
; SSE41-NEXT: addss %xmm2, %xmm2
-; SSE41-NEXT: .LBB77_3:
+; SSE41-NEXT: .LBB81_3:
; SSE41-NEXT: movq %xmm0, %rax
; SSE41-NEXT: testq %rax, %rax
-; SSE41-NEXT: js .LBB77_4
+; SSE41-NEXT: js .LBB81_4
; SSE41-NEXT: # %bb.5:
; SSE41-NEXT: xorps %xmm0, %xmm0
; SSE41-NEXT: cvtsi2ss %rax, %xmm0
-; SSE41-NEXT: jmp .LBB77_6
-; SSE41-NEXT: .LBB77_4:
+; SSE41-NEXT: jmp .LBB81_6
+; SSE41-NEXT: .LBB81_4:
; SSE41-NEXT: movq %rax, %rcx
; SSE41-NEXT: shrq %rcx
; SSE41-NEXT: andl $1, %eax
@@ -4433,16 +4609,16 @@ define <4 x float> @uitofp_load_4i64_to_4f32(<4 x i64> *%a) {
; SSE41-NEXT: xorps %xmm0, %xmm0
; SSE41-NEXT: cvtsi2ss %rax, %xmm0
; SSE41-NEXT: addss %xmm0, %xmm0
-; SSE41-NEXT: .LBB77_6:
+; SSE41-NEXT: .LBB81_6:
; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[2,3]
; SSE41-NEXT: movq %xmm1, %rax
; SSE41-NEXT: testq %rax, %rax
-; SSE41-NEXT: js .LBB77_7
+; SSE41-NEXT: js .LBB81_7
; SSE41-NEXT: # %bb.8:
; SSE41-NEXT: xorps %xmm2, %xmm2
; SSE41-NEXT: cvtsi2ss %rax, %xmm2
-; SSE41-NEXT: jmp .LBB77_9
-; SSE41-NEXT: .LBB77_7:
+; SSE41-NEXT: jmp .LBB81_9
+; SSE41-NEXT: .LBB81_7:
; SSE41-NEXT: movq %rax, %rcx
; SSE41-NEXT: shrq %rcx
; SSE41-NEXT: andl $1, %eax
@@ -4450,17 +4626,17 @@ define <4 x float> @uitofp_load_4i64_to_4f32(<4 x i64> *%a) {
; SSE41-NEXT: xorps %xmm2, %xmm2
; SSE41-NEXT: cvtsi2ss %rax, %xmm2
; SSE41-NEXT: addss %xmm2, %xmm2
-; SSE41-NEXT: .LBB77_9:
+; SSE41-NEXT: .LBB81_9:
; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1],xmm2[0],xmm0[3]
; SSE41-NEXT: pextrq $1, %xmm1, %rax
; SSE41-NEXT: testq %rax, %rax
-; SSE41-NEXT: js .LBB77_10
+; SSE41-NEXT: js .LBB81_10
; SSE41-NEXT: # %bb.11:
; SSE41-NEXT: xorps %xmm1, %xmm1
; SSE41-NEXT: cvtsi2ss %rax, %xmm1
; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
; SSE41-NEXT: retq
-; SSE41-NEXT: .LBB77_10:
+; SSE41-NEXT: .LBB81_10:
; SSE41-NEXT: movq %rax, %rcx
; SSE41-NEXT: shrq %rcx
; SSE41-NEXT: andl $1, %eax
@@ -4477,56 +4653,56 @@ define <4 x float> @uitofp_load_4i64_to_4f32(<4 x i64> *%a) {
; VEX-NEXT: vmovdqa 16(%rdi), %xmm0
; VEX-NEXT: vpextrq $1, %xmm2, %rax
; VEX-NEXT: testq %rax, %rax
-; VEX-NEXT: js .LBB77_1
+; VEX-NEXT: js .LBB81_1
; VEX-NEXT: # %bb.2:
; VEX-NEXT: vcvtsi2ss %rax, %xmm1, %xmm1
-; VEX-NEXT: jmp .LBB77_3
-; VEX-NEXT: .LBB77_1:
+; VEX-NEXT: jmp .LBB81_3
+; VEX-NEXT: .LBB81_1:
; VEX-NEXT: movq %rax, %rcx
; VEX-NEXT: shrq %rcx
; VEX-NEXT: andl $1, %eax
; VEX-NEXT: orq %rcx, %rax
; VEX-NEXT: vcvtsi2ss %rax, %xmm1, %xmm1
; VEX-NEXT: vaddss %xmm1, %xmm1, %xmm1
-; VEX-NEXT: .LBB77_3:
+; VEX-NEXT: .LBB81_3:
; VEX-NEXT: vmovq %xmm2, %rax
; VEX-NEXT: testq %rax, %rax
-; VEX-NEXT: js .LBB77_4
+; VEX-NEXT: js .LBB81_4
; VEX-NEXT: # %bb.5:
; VEX-NEXT: vcvtsi2ss %rax, %xmm3, %xmm2
-; VEX-NEXT: jmp .LBB77_6
-; VEX-NEXT: .LBB77_4:
+; VEX-NEXT: jmp .LBB81_6
+; VEX-NEXT: .LBB81_4:
; VEX-NEXT: movq %rax, %rcx
; VEX-NEXT: shrq %rcx
; VEX-NEXT: andl $1, %eax
; VEX-NEXT: orq %rcx, %rax
; VEX-NEXT: vcvtsi2ss %rax, %xmm3, %xmm2
; VEX-NEXT: vaddss %xmm2, %xmm2, %xmm2
-; VEX-NEXT: .LBB77_6:
+; VEX-NEXT: .LBB81_6:
; VEX-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3]
; VEX-NEXT: vmovq %xmm0, %rax
; VEX-NEXT: testq %rax, %rax
-; VEX-NEXT: js .LBB77_7
+; VEX-NEXT: js .LBB81_7
; VEX-NEXT: # %bb.8:
; VEX-NEXT: vcvtsi2ss %rax, %xmm3, %xmm2
-; VEX-NEXT: jmp .LBB77_9
-; VEX-NEXT: .LBB77_7:
+; VEX-NEXT: jmp .LBB81_9
+; VEX-NEXT: .LBB81_7:
; VEX-NEXT: movq %rax, %rcx
; VEX-NEXT: shrq %rcx
; VEX-NEXT: andl $1, %eax
; VEX-NEXT: orq %rcx, %rax
; VEX-NEXT: vcvtsi2ss %rax, %xmm3, %xmm2
; VEX-NEXT: vaddss %xmm2, %xmm2, %xmm2
-; VEX-NEXT: .LBB77_9:
+; VEX-NEXT: .LBB81_9:
; VEX-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3]
; VEX-NEXT: vpextrq $1, %xmm0, %rax
; VEX-NEXT: testq %rax, %rax
-; VEX-NEXT: js .LBB77_10
+; VEX-NEXT: js .LBB81_10
; VEX-NEXT: # %bb.11:
; VEX-NEXT: vcvtsi2ss %rax, %xmm3, %xmm0
; VEX-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
; VEX-NEXT: retq
-; VEX-NEXT: .LBB77_10:
+; VEX-NEXT: .LBB81_10:
; VEX-NEXT: movq %rax, %rcx
; VEX-NEXT: shrq %rcx
; VEX-NEXT: andl $1, %eax
@@ -4724,41 +4900,41 @@ define <8 x float> @uitofp_load_8i64_to_8f32(<8 x i64> *%a) {
; SSE2-NEXT: movdqa 48(%rdi), %xmm1
; SSE2-NEXT: movq %xmm0, %rax
; SSE2-NEXT: testq %rax, %rax
-; SSE2-NEXT: js .LBB81_1
+; SSE2-NEXT: js .LBB85_1
; SSE2-NEXT: # %bb.2:
; SSE2-NEXT: cvtsi2ss %rax, %xmm3
-; SSE2-NEXT: jmp .LBB81_3
-; SSE2-NEXT: .LBB81_1:
+; SSE2-NEXT: jmp .LBB85_3
+; SSE2-NEXT: .LBB85_1:
; SSE2-NEXT: movq %rax, %rcx
; SSE2-NEXT: shrq %rcx
; SSE2-NEXT: andl $1, %eax
; SSE2-NEXT: orq %rcx, %rax
; SSE2-NEXT: cvtsi2ss %rax, %xmm3
; SSE2-NEXT: addss %xmm3, %xmm3
-; SSE2-NEXT: .LBB81_3:
+; SSE2-NEXT: .LBB85_3:
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
; SSE2-NEXT: movq %xmm0, %rax
; SSE2-NEXT: testq %rax, %rax
-; SSE2-NEXT: js .LBB81_4
+; SSE2-NEXT: js .LBB85_4
; SSE2-NEXT: # %bb.5:
; SSE2-NEXT: cvtsi2ss %rax, %xmm4
-; SSE2-NEXT: jmp .LBB81_6
-; SSE2-NEXT: .LBB81_4:
+; SSE2-NEXT: jmp .LBB85_6
+; SSE2-NEXT: .LBB85_4:
; SSE2-NEXT: movq %rax, %rcx
; SSE2-NEXT: shrq %rcx
; SSE2-NEXT: andl $1, %eax
; SSE2-NEXT: orq %rcx, %rax
; SSE2-NEXT: cvtsi2ss %rax, %xmm4
; SSE2-NEXT: addss %xmm4, %xmm4
-; SSE2-NEXT: .LBB81_6:
+; SSE2-NEXT: .LBB85_6:
; SSE2-NEXT: movq %xmm5, %rax
; SSE2-NEXT: testq %rax, %rax
-; SSE2-NEXT: js .LBB81_7
+; SSE2-NEXT: js .LBB85_7
; SSE2-NEXT: # %bb.8:
; SSE2-NEXT: xorps %xmm0, %xmm0
; SSE2-NEXT: cvtsi2ss %rax, %xmm0
-; SSE2-NEXT: jmp .LBB81_9
-; SSE2-NEXT: .LBB81_7:
+; SSE2-NEXT: jmp .LBB85_9
+; SSE2-NEXT: .LBB85_7:
; SSE2-NEXT: movq %rax, %rcx
; SSE2-NEXT: shrq %rcx
; SSE2-NEXT: andl $1, %eax
@@ -4766,30 +4942,30 @@ define <8 x float> @uitofp_load_8i64_to_8f32(<8 x i64> *%a) {
; SSE2-NEXT: xorps %xmm0, %xmm0
; SSE2-NEXT: cvtsi2ss %rax, %xmm0
; SSE2-NEXT: addss %xmm0, %xmm0
-; SSE2-NEXT: .LBB81_9:
+; SSE2-NEXT: .LBB85_9:
; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[2,3,0,1]
; SSE2-NEXT: movq %xmm5, %rax
; SSE2-NEXT: testq %rax, %rax
-; SSE2-NEXT: js .LBB81_10
+; SSE2-NEXT: js .LBB85_10
; SSE2-NEXT: # %bb.11:
; SSE2-NEXT: cvtsi2ss %rax, %xmm6
-; SSE2-NEXT: jmp .LBB81_12
-; SSE2-NEXT: .LBB81_10:
+; SSE2-NEXT: jmp .LBB85_12
+; SSE2-NEXT: .LBB85_10:
; SSE2-NEXT: movq %rax, %rcx
; SSE2-NEXT: shrq %rcx
; SSE2-NEXT: andl $1, %eax
; SSE2-NEXT: orq %rcx, %rax
; SSE2-NEXT: cvtsi2ss %rax, %xmm6
; SSE2-NEXT: addss %xmm6, %xmm6
-; SSE2-NEXT: .LBB81_12:
+; SSE2-NEXT: .LBB85_12:
; SSE2-NEXT: movq %xmm1, %rax
; SSE2-NEXT: testq %rax, %rax
-; SSE2-NEXT: js .LBB81_13
+; SSE2-NEXT: js .LBB85_13
; SSE2-NEXT: # %bb.14:
; SSE2-NEXT: xorps %xmm5, %xmm5
; SSE2-NEXT: cvtsi2ss %rax, %xmm5
-; SSE2-NEXT: jmp .LBB81_15
-; SSE2-NEXT: .LBB81_13:
+; SSE2-NEXT: jmp .LBB85_15
+; SSE2-NEXT: .LBB85_13:
; SSE2-NEXT: movq %rax, %rcx
; SSE2-NEXT: shrq %rcx
; SSE2-NEXT: andl $1, %eax
@@ -4797,32 +4973,32 @@ define <8 x float> @uitofp_load_8i64_to_8f32(<8 x i64> *%a) {
; SSE2-NEXT: xorps %xmm5, %xmm5
; SSE2-NEXT: cvtsi2ss %rax, %xmm5
; SSE2-NEXT: addss %xmm5, %xmm5
-; SSE2-NEXT: .LBB81_15:
+; SSE2-NEXT: .LBB85_15:
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
; SSE2-NEXT: movq %xmm1, %rax
; SSE2-NEXT: testq %rax, %rax
-; SSE2-NEXT: js .LBB81_16
+; SSE2-NEXT: js .LBB85_16
; SSE2-NEXT: # %bb.17:
; SSE2-NEXT: cvtsi2ss %rax, %xmm7
-; SSE2-NEXT: jmp .LBB81_18
-; SSE2-NEXT: .LBB81_16:
+; SSE2-NEXT: jmp .LBB85_18
+; SSE2-NEXT: .LBB85_16:
; SSE2-NEXT: movq %rax, %rcx
; SSE2-NEXT: shrq %rcx
; SSE2-NEXT: andl $1, %eax
; SSE2-NEXT: orq %rcx, %rax
; SSE2-NEXT: cvtsi2ss %rax, %xmm7
; SSE2-NEXT: addss %xmm7, %xmm7
-; SSE2-NEXT: .LBB81_18:
+; SSE2-NEXT: .LBB85_18:
; SSE2-NEXT: unpcklps {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
; SSE2-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1]
; SSE2-NEXT: movq %xmm2, %rax
; SSE2-NEXT: testq %rax, %rax
-; SSE2-NEXT: js .LBB81_19
+; SSE2-NEXT: js .LBB85_19
; SSE2-NEXT: # %bb.20:
; SSE2-NEXT: xorps %xmm1, %xmm1
; SSE2-NEXT: cvtsi2ss %rax, %xmm1
-; SSE2-NEXT: jmp .LBB81_21
-; SSE2-NEXT: .LBB81_19:
+; SSE2-NEXT: jmp .LBB85_21
+; SSE2-NEXT: .LBB85_19:
; SSE2-NEXT: movq %rax, %rcx
; SSE2-NEXT: shrq %rcx
; SSE2-NEXT: andl $1, %eax
@@ -4830,18 +5006,18 @@ define <8 x float> @uitofp_load_8i64_to_8f32(<8 x i64> *%a) {
; SSE2-NEXT: xorps %xmm1, %xmm1
; SSE2-NEXT: cvtsi2ss %rax, %xmm1
; SSE2-NEXT: addss %xmm1, %xmm1
-; SSE2-NEXT: .LBB81_21:
+; SSE2-NEXT: .LBB85_21:
; SSE2-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm3[0]
; SSE2-NEXT: unpcklps {{.*#+}} xmm5 = xmm5[0],xmm7[0],xmm5[1],xmm7[1]
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
; SSE2-NEXT: movq %xmm2, %rax
; SSE2-NEXT: testq %rax, %rax
-; SSE2-NEXT: js .LBB81_22
+; SSE2-NEXT: js .LBB85_22
; SSE2-NEXT: # %bb.23:
; SSE2-NEXT: xorps %xmm2, %xmm2
; SSE2-NEXT: cvtsi2ss %rax, %xmm2
-; SSE2-NEXT: jmp .LBB81_24
-; SSE2-NEXT: .LBB81_22:
+; SSE2-NEXT: jmp .LBB85_24
+; SSE2-NEXT: .LBB85_22:
; SSE2-NEXT: movq %rax, %rcx
; SSE2-NEXT: shrq %rcx
; SSE2-NEXT: andl $1, %eax
@@ -4849,7 +5025,7 @@ define <8 x float> @uitofp_load_8i64_to_8f32(<8 x i64> *%a) {
; SSE2-NEXT: xorps %xmm2, %xmm2
; SSE2-NEXT: cvtsi2ss %rax, %xmm2
; SSE2-NEXT: addss %xmm2, %xmm2
-; SSE2-NEXT: .LBB81_24:
+; SSE2-NEXT: .LBB85_24:
; SSE2-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; SSE2-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm5[0]
; SSE2-NEXT: retq
@@ -4862,26 +5038,26 @@ define <8 x float> @uitofp_load_8i64_to_8f32(<8 x i64> *%a) {
; SSE41-NEXT: movdqa 48(%rdi), %xmm2
; SSE41-NEXT: pextrq $1, %xmm0, %rax
; SSE41-NEXT: testq %rax, %rax
-; SSE41-NEXT: js .LBB81_1
+; SSE41-NEXT: js .LBB85_1
; SSE41-NEXT: # %bb.2:
; SSE41-NEXT: cvtsi2ss %rax, %xmm3
-; SSE41-NEXT: jmp .LBB81_3
-; SSE41-NEXT: .LBB81_1:
+; SSE41-NEXT: jmp .LBB85_3
+; SSE41-NEXT: .LBB85_1:
; SSE41-NEXT: movq %rax, %rcx
; SSE41-NEXT: shrq %rcx
; SSE41-NEXT: andl $1, %eax
; SSE41-NEXT: orq %rcx, %rax
; SSE41-NEXT: cvtsi2ss %rax, %xmm3
; SSE41-NEXT: addss %xmm3, %xmm3
-; SSE41-NEXT: .LBB81_3:
+; SSE41-NEXT: .LBB85_3:
; SSE41-NEXT: movq %xmm0, %rax
; SSE41-NEXT: testq %rax, %rax
-; SSE41-NEXT: js .LBB81_4
+; SSE41-NEXT: js .LBB85_4
; SSE41-NEXT: # %bb.5:
; SSE41-NEXT: xorps %xmm0, %xmm0
; SSE41-NEXT: cvtsi2ss %rax, %xmm0
-; SSE41-NEXT: jmp .LBB81_6
-; SSE41-NEXT: .LBB81_4:
+; SSE41-NEXT: jmp .LBB85_6
+; SSE41-NEXT: .LBB85_4:
; SSE41-NEXT: movq %rax, %rcx
; SSE41-NEXT: shrq %rcx
; SSE41-NEXT: andl $1, %eax
@@ -4889,29 +5065,29 @@ define <8 x float> @uitofp_load_8i64_to_8f32(<8 x i64> *%a) {
; SSE41-NEXT: xorps %xmm0, %xmm0
; SSE41-NEXT: cvtsi2ss %rax, %xmm0
; SSE41-NEXT: addss %xmm0, %xmm0
-; SSE41-NEXT: .LBB81_6:
+; SSE41-NEXT: .LBB85_6:
; SSE41-NEXT: movq %xmm4, %rax
; SSE41-NEXT: testq %rax, %rax
-; SSE41-NEXT: js .LBB81_7
+; SSE41-NEXT: js .LBB85_7
; SSE41-NEXT: # %bb.8:
; SSE41-NEXT: cvtsi2ss %rax, %xmm5
-; SSE41-NEXT: jmp .LBB81_9
-; SSE41-NEXT: .LBB81_7:
+; SSE41-NEXT: jmp .LBB85_9
+; SSE41-NEXT: .LBB85_7:
; SSE41-NEXT: movq %rax, %rcx
; SSE41-NEXT: shrq %rcx
; SSE41-NEXT: andl $1, %eax
; SSE41-NEXT: orq %rcx, %rax
; SSE41-NEXT: cvtsi2ss %rax, %xmm5
; SSE41-NEXT: addss %xmm5, %xmm5
-; SSE41-NEXT: .LBB81_9:
+; SSE41-NEXT: .LBB85_9:
; SSE41-NEXT: pextrq $1, %xmm4, %rax
; SSE41-NEXT: testq %rax, %rax
-; SSE41-NEXT: js .LBB81_10
+; SSE41-NEXT: js .LBB85_10
; SSE41-NEXT: # %bb.11:
; SSE41-NEXT: xorps %xmm4, %xmm4
; SSE41-NEXT: cvtsi2ss %rax, %xmm4
-; SSE41-NEXT: jmp .LBB81_12
-; SSE41-NEXT: .LBB81_10:
+; SSE41-NEXT: jmp .LBB85_12
+; SSE41-NEXT: .LBB85_10:
; SSE41-NEXT: movq %rax, %rcx
; SSE41-NEXT: shrq %rcx
; SSE41-NEXT: andl $1, %eax
@@ -4919,30 +5095,30 @@ define <8 x float> @uitofp_load_8i64_to_8f32(<8 x i64> *%a) {
; SSE41-NEXT: xorps %xmm4, %xmm4
; SSE41-NEXT: cvtsi2ss %rax, %xmm4
; SSE41-NEXT: addss %xmm4, %xmm4
-; SSE41-NEXT: .LBB81_12:
+; SSE41-NEXT: .LBB85_12:
; SSE41-NEXT: pextrq $1, %xmm1, %rax
; SSE41-NEXT: testq %rax, %rax
-; SSE41-NEXT: js .LBB81_13
+; SSE41-NEXT: js .LBB85_13
; SSE41-NEXT: # %bb.14:
; SSE41-NEXT: cvtsi2ss %rax, %xmm6
-; SSE41-NEXT: jmp .LBB81_15
-; SSE41-NEXT: .LBB81_13:
+; SSE41-NEXT: jmp .LBB85_15
+; SSE41-NEXT: .LBB85_13:
; SSE41-NEXT: movq %rax, %rcx
; SSE41-NEXT: shrq %rcx
; SSE41-NEXT: andl $1, %eax
; SSE41-NEXT: orq %rcx, %rax
; SSE41-NEXT: cvtsi2ss %rax, %xmm6
; SSE41-NEXT: addss %xmm6, %xmm6
-; SSE41-NEXT: .LBB81_15:
+; SSE41-NEXT: .LBB85_15:
; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[2,3]
; SSE41-NEXT: movq %xmm1, %rax
; SSE41-NEXT: testq %rax, %rax
-; SSE41-NEXT: js .LBB81_16
+; SSE41-NEXT: js .LBB85_16
; SSE41-NEXT: # %bb.17:
; SSE41-NEXT: xorps %xmm1, %xmm1
; SSE41-NEXT: cvtsi2ss %rax, %xmm1
-; SSE41-NEXT: jmp .LBB81_18
-; SSE41-NEXT: .LBB81_16:
+; SSE41-NEXT: jmp .LBB85_18
+; SSE41-NEXT: .LBB85_16:
; SSE41-NEXT: movq %rax, %rcx
; SSE41-NEXT: shrq %rcx
; SSE41-NEXT: andl $1, %eax
@@ -4950,17 +5126,17 @@ define <8 x float> @uitofp_load_8i64_to_8f32(<8 x i64> *%a) {
; SSE41-NEXT: xorps %xmm1, %xmm1
; SSE41-NEXT: cvtsi2ss %rax, %xmm1
; SSE41-NEXT: addss %xmm1, %xmm1
-; SSE41-NEXT: .LBB81_18:
+; SSE41-NEXT: .LBB85_18:
; SSE41-NEXT: insertps {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[2,3]
; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1],xmm5[0],xmm0[3]
; SSE41-NEXT: movq %xmm2, %rax
; SSE41-NEXT: testq %rax, %rax
-; SSE41-NEXT: js .LBB81_19
+; SSE41-NEXT: js .LBB85_19
; SSE41-NEXT: # %bb.20:
; SSE41-NEXT: xorps %xmm3, %xmm3
; SSE41-NEXT: cvtsi2ss %rax, %xmm3
-; SSE41-NEXT: jmp .LBB81_21
-; SSE41-NEXT: .LBB81_19:
+; SSE41-NEXT: jmp .LBB85_21
+; SSE41-NEXT: .LBB85_19:
; SSE41-NEXT: movq %rax, %rcx
; SSE41-NEXT: shrq %rcx
; SSE41-NEXT: andl $1, %eax
@@ -4968,18 +5144,18 @@ define <8 x float> @uitofp_load_8i64_to_8f32(<8 x i64> *%a) {
; SSE41-NEXT: xorps %xmm3, %xmm3
; SSE41-NEXT: cvtsi2ss %rax, %xmm3
; SSE41-NEXT: addss %xmm3, %xmm3
-; SSE41-NEXT: .LBB81_21:
+; SSE41-NEXT: .LBB85_21:
; SSE41-NEXT: insertps {{.*#+}} xmm1 = xmm1[0,1],xmm3[0],xmm1[3]
; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm4[0]
; SSE41-NEXT: pextrq $1, %xmm2, %rax
; SSE41-NEXT: testq %rax, %rax
-; SSE41-NEXT: js .LBB81_22
+; SSE41-NEXT: js .LBB85_22
; SSE41-NEXT: # %bb.23:
; SSE41-NEXT: xorps %xmm2, %xmm2
; SSE41-NEXT: cvtsi2ss %rax, %xmm2
; SSE41-NEXT: insertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[0]
; SSE41-NEXT: retq
-; SSE41-NEXT: .LBB81_22:
+; SSE41-NEXT: .LBB85_22:
; SSE41-NEXT: movq %rax, %rcx
; SSE41-NEXT: shrq %rcx
; SSE41-NEXT: andl $1, %eax
@@ -4998,121 +5174,121 @@ define <8 x float> @uitofp_load_8i64_to_8f32(<8 x i64> *%a) {
; VEX-NEXT: vmovdqa 48(%rdi), %xmm3
; VEX-NEXT: vpextrq $1, %xmm4, %rax
; VEX-NEXT: testq %rax, %rax
-; VEX-NEXT: js .LBB81_1
+; VEX-NEXT: js .LBB85_1
; VEX-NEXT: # %bb.2:
; VEX-NEXT: vcvtsi2ss %rax, %xmm2, %xmm2
-; VEX-NEXT: jmp .LBB81_3
-; VEX-NEXT: .LBB81_1:
+; VEX-NEXT: jmp .LBB85_3
+; VEX-NEXT: .LBB85_1:
; VEX-NEXT: movq %rax, %rcx
; VEX-NEXT: shrq %rcx
; VEX-NEXT: andl $1, %eax
; VEX-NEXT: orq %rcx, %rax
; VEX-NEXT: vcvtsi2ss %rax, %xmm2, %xmm2
; VEX-NEXT: vaddss %xmm2, %xmm2, %xmm2
-; VEX-NEXT: .LBB81_3:
+; VEX-NEXT: .LBB85_3:
; VEX-NEXT: vmovq %xmm4, %rax
; VEX-NEXT: testq %rax, %rax
-; VEX-NEXT: js .LBB81_4
+; VEX-NEXT: js .LBB85_4
; VEX-NEXT: # %bb.5:
; VEX-NEXT: vcvtsi2ss %rax, %xmm5, %xmm5
-; VEX-NEXT: jmp .LBB81_6
-; VEX-NEXT: .LBB81_4:
+; VEX-NEXT: jmp .LBB85_6
+; VEX-NEXT: .LBB85_4:
; VEX-NEXT: movq %rax, %rcx
; VEX-NEXT: shrq %rcx
; VEX-NEXT: andl $1, %eax
; VEX-NEXT: orq %rcx, %rax
; VEX-NEXT: vcvtsi2ss %rax, %xmm5, %xmm4
; VEX-NEXT: vaddss %xmm4, %xmm4, %xmm5
-; VEX-NEXT: .LBB81_6:
+; VEX-NEXT: .LBB85_6:
; VEX-NEXT: vmovq %xmm3, %rax
; VEX-NEXT: testq %rax, %rax
-; VEX-NEXT: js .LBB81_7
+; VEX-NEXT: js .LBB85_7
; VEX-NEXT: # %bb.8:
; VEX-NEXT: vcvtsi2ss %rax, %xmm6, %xmm4
-; VEX-NEXT: jmp .LBB81_9
-; VEX-NEXT: .LBB81_7:
+; VEX-NEXT: jmp .LBB85_9
+; VEX-NEXT: .LBB85_7:
; VEX-NEXT: movq %rax, %rcx
; VEX-NEXT: shrq %rcx
; VEX-NEXT: andl $1, %eax
; VEX-NEXT: orq %rcx, %rax
; VEX-NEXT: vcvtsi2ss %rax, %xmm6, %xmm4
; VEX-NEXT: vaddss %xmm4, %xmm4, %xmm4
-; VEX-NEXT: .LBB81_9:
+; VEX-NEXT: .LBB85_9:
; VEX-NEXT: vpextrq $1, %xmm3, %rax
; VEX-NEXT: testq %rax, %rax
-; VEX-NEXT: js .LBB81_10
+; VEX-NEXT: js .LBB85_10
; VEX-NEXT: # %bb.11:
; VEX-NEXT: vcvtsi2ss %rax, %xmm6, %xmm3
-; VEX-NEXT: jmp .LBB81_12
-; VEX-NEXT: .LBB81_10:
+; VEX-NEXT: jmp .LBB85_12
+; VEX-NEXT: .LBB85_10:
; VEX-NEXT: movq %rax, %rcx
; VEX-NEXT: shrq %rcx
; VEX-NEXT: andl $1, %eax
; VEX-NEXT: orq %rcx, %rax
; VEX-NEXT: vcvtsi2ss %rax, %xmm6, %xmm3
; VEX-NEXT: vaddss %xmm3, %xmm3, %xmm3
-; VEX-NEXT: .LBB81_12:
+; VEX-NEXT: .LBB85_12:
; VEX-NEXT: vpextrq $1, %xmm1, %rax
; VEX-NEXT: testq %rax, %rax
-; VEX-NEXT: js .LBB81_13
+; VEX-NEXT: js .LBB85_13
; VEX-NEXT: # %bb.14:
; VEX-NEXT: vcvtsi2ss %rax, %xmm6, %xmm6
-; VEX-NEXT: jmp .LBB81_15
-; VEX-NEXT: .LBB81_13:
+; VEX-NEXT: jmp .LBB85_15
+; VEX-NEXT: .LBB85_13:
; VEX-NEXT: movq %rax, %rcx
; VEX-NEXT: shrq %rcx
; VEX-NEXT: andl $1, %eax
; VEX-NEXT: orq %rcx, %rax
; VEX-NEXT: vcvtsi2ss %rax, %xmm6, %xmm6
; VEX-NEXT: vaddss %xmm6, %xmm6, %xmm6
-; VEX-NEXT: .LBB81_15:
+; VEX-NEXT: .LBB85_15:
; VEX-NEXT: vinsertps {{.*#+}} xmm2 = xmm5[0],xmm2[0],xmm5[2,3]
; VEX-NEXT: vmovq %xmm1, %rax
; VEX-NEXT: testq %rax, %rax
-; VEX-NEXT: js .LBB81_16
+; VEX-NEXT: js .LBB85_16
; VEX-NEXT: # %bb.17:
; VEX-NEXT: vcvtsi2ss %rax, %xmm7, %xmm1
-; VEX-NEXT: jmp .LBB81_18
-; VEX-NEXT: .LBB81_16:
+; VEX-NEXT: jmp .LBB85_18
+; VEX-NEXT: .LBB85_16:
; VEX-NEXT: movq %rax, %rcx
; VEX-NEXT: shrq %rcx
; VEX-NEXT: andl $1, %eax
; VEX-NEXT: orq %rcx, %rax
; VEX-NEXT: vcvtsi2ss %rax, %xmm7, %xmm1
; VEX-NEXT: vaddss %xmm1, %xmm1, %xmm1
-; VEX-NEXT: .LBB81_18:
+; VEX-NEXT: .LBB85_18:
; VEX-NEXT: vinsertps {{.*#+}} xmm5 = xmm1[0],xmm6[0],xmm1[2,3]
; VEX-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm4[0],xmm2[3]
; VEX-NEXT: vmovq %xmm0, %rax
; VEX-NEXT: testq %rax, %rax
-; VEX-NEXT: js .LBB81_19
+; VEX-NEXT: js .LBB85_19
; VEX-NEXT: # %bb.20:
; VEX-NEXT: vcvtsi2ss %rax, %xmm7, %xmm2
-; VEX-NEXT: jmp .LBB81_21
-; VEX-NEXT: .LBB81_19:
+; VEX-NEXT: jmp .LBB85_21
+; VEX-NEXT: .LBB85_19:
; VEX-NEXT: movq %rax, %rcx
; VEX-NEXT: shrq %rcx
; VEX-NEXT: andl $1, %eax
; VEX-NEXT: orq %rcx, %rax
; VEX-NEXT: vcvtsi2ss %rax, %xmm7, %xmm2
; VEX-NEXT: vaddss %xmm2, %xmm2, %xmm2
-; VEX-NEXT: .LBB81_21:
+; VEX-NEXT: .LBB85_21:
; VEX-NEXT: vinsertps {{.*#+}} xmm2 = xmm5[0,1],xmm2[0],xmm5[3]
; VEX-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm3[0]
; VEX-NEXT: vpextrq $1, %xmm0, %rax
; VEX-NEXT: testq %rax, %rax
-; VEX-NEXT: js .LBB81_22
+; VEX-NEXT: js .LBB85_22
; VEX-NEXT: # %bb.23:
; VEX-NEXT: vcvtsi2ss %rax, %xmm7, %xmm0
-; VEX-NEXT: jmp .LBB81_24
-; VEX-NEXT: .LBB81_22:
+; VEX-NEXT: jmp .LBB85_24
+; VEX-NEXT: .LBB85_22:
; VEX-NEXT: movq %rax, %rcx
; VEX-NEXT: shrq %rcx
; VEX-NEXT: andl $1, %eax
; VEX-NEXT: orq %rcx, %rax
; VEX-NEXT: vcvtsi2ss %rax, %xmm7, %xmm0
; VEX-NEXT: vaddss %xmm0, %xmm0, %xmm0
-; VEX-NEXT: .LBB81_24:
+; VEX-NEXT: .LBB85_24:
; VEX-NEXT: vinsertps {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[0]
; VEX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; VEX-NEXT: retq