From 405f7c22868c11413b7cae646c20f987d4dc546f Mon Sep 17 00:00:00 2001
From: Craig Topper
Date: Fri, 21 Jun 2019 17:24:21 +0000
Subject: [PATCH] [X86] Use vmovq for v4i64/v4f64/v8i64/v8f64 vzmovl.

We already use vmovq for v2i64/v2f64 vzmovl, but we were using
blendpd+xorpd for v4i64/v4f64/v8i64/v8f64 when optimizing for speed,
and movsd+xorpd when optimizing for size.

I think the blend with 0 or movss/movsd is only needed for vXi32, where
we don't have an instruction that can move 32 bits from one xmm register
to another while zeroing the upper bits. movq is no worse than blendpd
on any known CPU.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@364079 91177308-0d34-0410-b5e6-96231b3b80d8
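For illustration only (this sketch is not one of the tests in the tree; the
function name and RUN line are made up, and the expected asm mirrors the
updated shuffle_v4f64_0zzz checks in vector-shuffle-256-v4.ll below), a
lane-0 shuffle against zeroinitializer now selects a single vmovq instead
of a vxorps+vblendps pair:

  ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s
  define <4 x double> @zero_upper_lanes_v4f64(<4 x double> %a) {
  ; CHECK-LABEL: zero_upper_lanes_v4f64:
  ; CHECK:       # %bb.0:
  ; CHECK-NEXT:    vmovq {{.*#+}} xmm0 = xmm0[0],zero
  ; CHECK-NEXT:    retq
    ; Keep element 0 of %a, zero the remaining three lanes.
    %b = shufflevector <4 x double> %a, <4 x double> zeroinitializer, <4 x i32> <i32 0, i32 4, i32 4, i32 4>
    ret <4 x double> %b
  }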
---
 lib/Target/X86/X86InstrAVX512.td             | 53 ++++++++-----------
 lib/Target/X86/X86InstrSSE.td                | 35 +++++-------
 test/CodeGen/X86/vec_extract-avx.ll          | 28 +++++-----
 test/CodeGen/X86/vector-extend-inreg.ll      | 16 +++---
 test/CodeGen/X86/vector-shuffle-256-v4.ll    |  9 ++--
 test/CodeGen/X86/vector-shuffle-512-v8.ll    |  6 +--
 .../X86/vector-shuffle-combining-avx2.ll     |  3 +-
 7 files changed, 60 insertions(+), 90 deletions(-)

diff --git a/lib/Target/X86/X86InstrAVX512.td b/lib/Target/X86/X86InstrAVX512.td
index 9f4a75c6689..8315b867316 100644
--- a/lib/Target/X86/X86InstrAVX512.td
+++ b/lib/Target/X86/X86InstrAVX512.td
@@ -4286,15 +4286,6 @@ let Predicates = [HasAVX512, OptForSize] in {
              (v4i32 (VMOVSSZrr (v4i32 (AVX512_128_SET0)),
                     (v4i32 (EXTRACT_SUBREG (v8i32 VR256X:$src), sub_xmm)))), sub_xmm)>;
 
-  def : Pat<(v4f64 (X86vzmovl (v4f64 VR256X:$src))),
-            (SUBREG_TO_REG (i32 0),
-             (v2f64 (VMOVSDZrr (v2f64 (AVX512_128_SET0)),
-                    (v2f64 (EXTRACT_SUBREG (v4f64 VR256X:$src), sub_xmm)))), sub_xmm)>;
-  def : Pat<(v4i64 (X86vzmovl (v4i64 VR256X:$src))),
-            (SUBREG_TO_REG (i32 0),
-             (v2i64 (VMOVSDZrr (v2i64 (AVX512_128_SET0)),
-                    (v2i64 (EXTRACT_SUBREG (v4i64 VR256X:$src), sub_xmm)))), sub_xmm)>;
-
   def : Pat<(v16f32 (X86vzmovl (v16f32 VR512:$src))),
             (SUBREG_TO_REG (i32 0),
              (v4f32 (VMOVSSZrr (v4f32 (AVX512_128_SET0)),
@@ -4303,17 +4294,6 @@ let Predicates = [HasAVX512, OptForSize] in {
             (SUBREG_TO_REG (i32 0),
              (v4i32 (VMOVSSZrr (v4i32 (AVX512_128_SET0)),
                     (v4i32 (EXTRACT_SUBREG (v16i32 VR512:$src), sub_xmm)))), sub_xmm)>;
-
-  def : Pat<(v8f64 (X86vzmovl (v8f64 VR512:$src))),
-            (SUBREG_TO_REG (i32 0),
-             (v2f64 (VMOVSDZrr (v2f64 (AVX512_128_SET0)),
-                    (v2f64 (EXTRACT_SUBREG (v8f64 VR512:$src), sub_xmm)))), sub_xmm)>;
-
-  def : Pat<(v8i64 (X86vzmovl (v8i64 VR512:$src))),
-            (SUBREG_TO_REG (i32 0),
-             (v2i64 (VMOVSDZrr (v2i64 (AVX512_128_SET0)),
-                    (v2i64 (EXTRACT_SUBREG (v8i64 VR512:$src), sub_xmm)))), sub_xmm)>;
-
 }
 
 // Use 128-bit blends for OptForSpeed since BLENDs have better throughput than
@@ -4329,17 +4309,6 @@ let Predicates = [HasAVX512, OptForSpeed] in {
              (v4i32 (VPBLENDWrri (v4i32 (V_SET0)),
                     (v4i32 (EXTRACT_SUBREG (v16i32 VR512:$src), sub_xmm)),
                     (i8 3))), sub_xmm)>;
-
-  def : Pat<(v8f64 (X86vzmovl (v8f64 VR512:$src))),
-            (SUBREG_TO_REG (i32 0),
-             (v2f64 (VBLENDPDrri (v2f64 (V_SET0)),
-                    (v2f64 (EXTRACT_SUBREG (v8f64 VR512:$src), sub_xmm)),
-                    (i8 1))), sub_xmm)>;
-  def : Pat<(v8i64 (X86vzmovl (v8i64 VR512:$src))),
-            (SUBREG_TO_REG (i32 0),
-             (v2i64 (VPBLENDWrri (v2i64 (V_SET0)),
-                    (v2i64 (EXTRACT_SUBREG (v8i64 VR512:$src), sub_xmm)),
-                    (i8 0xf))), sub_xmm)>;
 }
 
 let Predicates = [HasAVX512] in {
@@ -4452,6 +4421,28 @@ let Predicates = [HasAVX512] in {
             (SUBREG_TO_REG (i32 0), (v4i32 (VMOVDI2PDIZrm addr:$src)), sub_xmm)>;
   def : Pat<(v8i64 (X86vzload addr:$src)),
             (SUBREG_TO_REG (i64 0), (v2i64 (VMOVQI2PQIZrm addr:$src)), sub_xmm)>;
+
+  def : Pat<(v4f64 (X86vzmovl (v4f64 VR256X:$src))),
+            (SUBREG_TO_REG (i32 0),
+             (v2f64 (VMOVZPQILo2PQIZrr
+                     (v2f64 (EXTRACT_SUBREG (v4f64 VR256X:$src), sub_xmm)))),
+             sub_xmm)>;
+  def : Pat<(v4i64 (X86vzmovl (v4i64 VR256X:$src))),
+            (SUBREG_TO_REG (i32 0),
+             (v2i64 (VMOVZPQILo2PQIZrr
+                     (v2i64 (EXTRACT_SUBREG (v4i64 VR256X:$src), sub_xmm)))),
+             sub_xmm)>;
+
+  def : Pat<(v8f64 (X86vzmovl (v8f64 VR512:$src))),
+            (SUBREG_TO_REG (i32 0),
+             (v2f64 (VMOVZPQILo2PQIZrr
+                     (v2f64 (EXTRACT_SUBREG (v8f64 VR512:$src), sub_xmm)))),
+             sub_xmm)>;
+  def : Pat<(v8i64 (X86vzmovl (v8i64 VR512:$src))),
+            (SUBREG_TO_REG (i32 0),
+             (v2i64 (VMOVZPQILo2PQIZrr
+                     (v2i64 (EXTRACT_SUBREG (v8i64 VR512:$src), sub_xmm)))),
+             sub_xmm)>;
 }
 
 //===----------------------------------------------------------------------===//
diff --git a/lib/Target/X86/X86InstrSSE.td b/lib/Target/X86/X86InstrSSE.td
index c96bac6828f..e25d2dca404 100644
--- a/lib/Target/X86/X86InstrSSE.td
+++ b/lib/Target/X86/X86InstrSSE.td
@@ -312,17 +312,6 @@ let Predicates = [UseAVX, OptForSize] in {
             (SUBREG_TO_REG (i32 0),
              (v4i32 (VMOVSSrr (v4i32 (V_SET0)),
                     (v4i32 (EXTRACT_SUBREG (v8i32 VR256:$src), sub_xmm)))), sub_xmm)>;
-
-  def : Pat<(v4f64 (X86vzmovl (v4f64 VR256:$src))),
-            (SUBREG_TO_REG (i32 0),
-             (v2f64 (VMOVSDrr (v2f64 (V_SET0)),
-                    (v2f64 (EXTRACT_SUBREG (v4f64 VR256:$src), sub_xmm)))),
-             sub_xmm)>;
-  def : Pat<(v4i64 (X86vzmovl (v4i64 VR256:$src))),
-            (SUBREG_TO_REG (i32 0),
-             (v2i64 (VMOVSDrr (v2i64 (V_SET0)),
-                    (v2i64 (EXTRACT_SUBREG (v4i64 VR256:$src), sub_xmm)))),
-             sub_xmm)>;
 }
 
 let Predicates = [UseSSE1] in {
@@ -4307,6 +4296,19 @@ let Predicates = [UseSSE2] in {
             (MOVZPQILo2PQIrr VR128:$src)>;
 }
 
+let Predicates = [UseAVX] in {
+  def : Pat<(v4f64 (X86vzmovl (v4f64 VR256:$src))),
+            (SUBREG_TO_REG (i32 0),
+             (v2f64 (VMOVZPQILo2PQIrr
+                     (v2f64 (EXTRACT_SUBREG (v4f64 VR256:$src), sub_xmm)))),
+             sub_xmm)>;
+  def : Pat<(v4i64 (X86vzmovl (v4i64 VR256:$src))),
+            (SUBREG_TO_REG (i32 0),
+             (v2i64 (VMOVZPQILo2PQIrr
+                     (v2i64 (EXTRACT_SUBREG (v4i64 VR256:$src), sub_xmm)))),
+             sub_xmm)>;
+}
+
 //===---------------------------------------------------------------------===//
 // SSE3 - Replicate Single FP - MOVSHDUP and MOVSLDUP
 //===---------------------------------------------------------------------===//
@@ -6319,17 +6321,6 @@ let Predicates = [HasAVX, OptForSpeed] in {
              (v4i32 (VPBLENDWrri (v4i32 (V_SET0)),
                     (v4i32 (EXTRACT_SUBREG (v8i32 VR256:$src), sub_xmm)),
                     (i8 3))), sub_xmm)>;
-
-  def : Pat<(v4f64 (X86vzmovl (v4f64 VR256:$src))),
-            (SUBREG_TO_REG (i32 0),
-             (v2f64 (VBLENDPDrri (v2f64 (V_SET0)),
-                    (v2f64 (EXTRACT_SUBREG (v4f64 VR256:$src), sub_xmm)),
-                    (i8 1))), sub_xmm)>;
-  def : Pat<(v4i64 (X86vzmovl (v4i64 VR256:$src))),
-            (SUBREG_TO_REG (i32 0),
-             (v2i64 (VPBLENDWrri (v2i64 (V_SET0)),
-                    (v2i64 (EXTRACT_SUBREG (v4i64 VR256:$src), sub_xmm)),
-                    (i8 0xf))), sub_xmm)>;
 }
 
 // Prefer a movss or movsd over a blendps when optimizing for size. these were
diff --git a/test/CodeGen/X86/vec_extract-avx.ll b/test/CodeGen/X86/vec_extract-avx.ll
index a15424a763e..5b3fcb1e023 100644
--- a/test/CodeGen/X86/vec_extract-avx.ll
+++ b/test/CodeGen/X86/vec_extract-avx.ll
@@ -144,19 +144,17 @@ define void @legal_vzmovl_2i64_4i64(<2 x i64>* %in, <4 x i64>* %out) {
 ; X32:       # %bb.0:
 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X32-NEXT:    vmovups (%ecx), %xmm0
-; X32-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; X32-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
-; X32-NEXT:    vmovaps %ymm0, (%eax)
+; X32-NEXT:    vmovdqu (%ecx), %xmm0
+; X32-NEXT:    vmovq {{.*#+}} xmm0 = xmm0[0],zero
+; X32-NEXT:    vmovdqa %ymm0, (%eax)
 ; X32-NEXT:    vzeroupper
 ; X32-NEXT:    retl
 ;
 ; X64-LABEL: legal_vzmovl_2i64_4i64:
 ; X64:       # %bb.0:
-; X64-NEXT:    vmovups (%rdi), %xmm0
-; X64-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; X64-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
-; X64-NEXT:    vmovaps %ymm0, (%rsi)
+; X64-NEXT:    vmovdqu (%rdi), %xmm0
+; X64-NEXT:    vmovq {{.*#+}} xmm0 = xmm0[0],zero
+; X64-NEXT:    vmovdqa %ymm0, (%rsi)
 ; X64-NEXT:    vzeroupper
 ; X64-NEXT:    retq
   %ld = load <2 x i64>, <2 x i64>* %in, align 8
@@ -198,19 +196,17 @@ define void @legal_vzmovl_2f64_4f64(<2 x double>* %in, <4 x double>* %out) {
 ; X32:       # %bb.0:
 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X32-NEXT:    vmovups (%ecx), %xmm0
-; X32-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; X32-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
-; X32-NEXT:    vmovaps %ymm0, (%eax)
+; X32-NEXT:    vmovdqu (%ecx), %xmm0
+; X32-NEXT:    vmovq {{.*#+}} xmm0 = xmm0[0],zero
+; X32-NEXT:    vmovdqa %ymm0, (%eax)
 ; X32-NEXT:    vzeroupper
 ; X32-NEXT:    retl
 ;
 ; X64-LABEL: legal_vzmovl_2f64_4f64:
 ; X64:       # %bb.0:
-; X64-NEXT:    vmovups (%rdi), %xmm0
-; X64-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; X64-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
-; X64-NEXT:    vmovaps %ymm0, (%rsi)
+; X64-NEXT:    vmovdqu (%rdi), %xmm0
+; X64-NEXT:    vmovq {{.*#+}} xmm0 = xmm0[0],zero
+; X64-NEXT:    vmovdqa %ymm0, (%rsi)
 ; X64-NEXT:    vzeroupper
 ; X64-NEXT:    retq
   %ld = load <2 x double>, <2 x double>* %in, align 8
diff --git a/test/CodeGen/X86/vector-extend-inreg.ll b/test/CodeGen/X86/vector-extend-inreg.ll
index d790cb54b61..f60bf4b0109 100644
--- a/test/CodeGen/X86/vector-extend-inreg.ll
+++ b/test/CodeGen/X86/vector-extend-inreg.ll
@@ -71,18 +71,17 @@ define i64 @extract_any_extend_vector_inreg_v16i64(<16 x i64> %a0, i32 %a1) noun
 ; X32-AVX-NEXT:    andl $-128, %esp
 ; X32-AVX-NEXT:    subl $384, %esp # imm = 0x180
 ; X32-AVX-NEXT:    movl 40(%ebp), %ecx
-; X32-AVX-NEXT:    vbroadcastsd 32(%ebp), %ymm0
-; X32-AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; X32-AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
+; X32-AVX-NEXT:    vpbroadcastq 32(%ebp), %ymm0
+; X32-AVX-NEXT:    vmovq {{.*#+}} xmm0 = xmm0[0],zero
 ; X32-AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
 ; X32-AVX-NEXT:    vmovaps %ymm1, {{[0-9]+}}(%esp)
 ; X32-AVX-NEXT:    vmovaps %ymm1, {{[0-9]+}}(%esp)
 ; X32-AVX-NEXT:    vmovaps %ymm1, {{[0-9]+}}(%esp)
-; X32-AVX-NEXT:    vmovaps %ymm0, {{[0-9]+}}(%esp)
+; X32-AVX-NEXT:    vmovdqa %ymm0, {{[0-9]+}}(%esp)
 ; X32-AVX-NEXT:    vmovaps %ymm1, {{[0-9]+}}(%esp)
 ; X32-AVX-NEXT:    vmovaps %ymm1, {{[0-9]+}}(%esp)
 ; X32-AVX-NEXT:    vmovaps %ymm1, (%esp)
-; X32-AVX-NEXT:    vmovaps %ymm0, {{[0-9]+}}(%esp)
+; X32-AVX-NEXT:    vmovdqa %ymm0, {{[0-9]+}}(%esp)
 ; X32-AVX-NEXT:    leal (%ecx,%ecx), %eax
 ; X32-AVX-NEXT:    andl $31, %eax
 ; X32-AVX-NEXT:    movl 128(%esp,%eax,4), %eax
@@ -101,14 +100,13 @@ define i64 @extract_any_extend_vector_inreg_v16i64(<16 x i64> %a0, i32 %a1) noun
 ; X64-AVX-NEXT:    andq $-128, %rsp
 ; X64-AVX-NEXT:    subq $256, %rsp # imm = 0x100
 ; X64-AVX-NEXT:    # kill: def $edi killed $edi def $rdi
-; X64-AVX-NEXT:    vpermpd {{.*#+}} ymm0 = ymm3[3,1,2,3]
-; X64-AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; X64-AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
+; X64-AVX-NEXT:    vpermq {{.*#+}} ymm0 = ymm3[3,1,2,3]
+; X64-AVX-NEXT:    vmovq {{.*#+}} xmm0 = xmm0[0],zero
 ; X64-AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
 ; X64-AVX-NEXT:    vmovaps %ymm1, {{[0-9]+}}(%rsp)
 ; X64-AVX-NEXT:    vmovaps %ymm1, {{[0-9]+}}(%rsp)
 ; X64-AVX-NEXT:    vmovaps %ymm1, (%rsp)
-; X64-AVX-NEXT:    vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; X64-AVX-NEXT:    vmovdqa %ymm0, {{[0-9]+}}(%rsp)
 ; X64-AVX-NEXT:    andl $15, %edi
 ; X64-AVX-NEXT:    movq (%rsp,%rdi,8), %rax
 ; X64-AVX-NEXT:    movq %rbp, %rsp
diff --git a/test/CodeGen/X86/vector-shuffle-256-v4.ll b/test/CodeGen/X86/vector-shuffle-256-v4.ll
index 4ae23a0437e..099aad76ba7 100644
--- a/test/CodeGen/X86/vector-shuffle-256-v4.ll
+++ b/test/CodeGen/X86/vector-shuffle-256-v4.ll
@@ -1505,8 +1505,7 @@ define <4 x double> @insert_reg_and_zero_v4f64(double %a) {
 ; ALL-LABEL: insert_reg_and_zero_v4f64:
 ; ALL:       # %bb.0:
 ; ALL-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
-; ALL-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; ALL-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
+; ALL-NEXT:    vmovq {{.*#+}} xmm0 = xmm0[0],zero
 ; ALL-NEXT:    retq
   %v = insertelement <4 x double> undef, double %a, i32 0
   %shuffle = shufflevector <4 x double> %v, <4 x double> zeroinitializer, <4 x i32>
@@ -1987,8 +1986,7 @@ entry:
 define <4 x double> @shuffle_v4f64_0zzz_optsize(<4 x double> %a) optsize {
 ; ALL-LABEL: shuffle_v4f64_0zzz_optsize:
 ; ALL:       # %bb.0:
-; ALL-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
-; ALL-NEXT:    vmovsd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
+; ALL-NEXT:    vmovq {{.*#+}} xmm0 = xmm0[0],zero
 ; ALL-NEXT:    retq
   %b = shufflevector <4 x double> %a, <4 x double> zeroinitializer, <4 x i32>
   ret <4 x double> %b
@@ -1997,8 +1995,7 @@ define <4 x double> @shuffle_v4f64_0zzz_optsize(<4 x double> %a) optsize {
 define <4 x i64> @shuffle_v4i64_0zzz_optsize(<4 x i64> %a) optsize {
 ; ALL-LABEL: shuffle_v4i64_0zzz_optsize:
 ; ALL:       # %bb.0:
-; ALL-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
-; ALL-NEXT:    vmovsd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
+; ALL-NEXT:    vmovq {{.*#+}} xmm0 = xmm0[0],zero
 ; ALL-NEXT:    retq
   %b = shufflevector <4 x i64> %a, <4 x i64> zeroinitializer, <4 x i32>
   ret <4 x i64> %b
diff --git a/test/CodeGen/X86/vector-shuffle-512-v8.ll b/test/CodeGen/X86/vector-shuffle-512-v8.ll
index ba0707a5bba..dea5457baea 100644
--- a/test/CodeGen/X86/vector-shuffle-512-v8.ll
+++ b/test/CodeGen/X86/vector-shuffle-512-v8.ll
@@ -1973,8 +1973,7 @@ define <8 x double> @shuffle_v8f64_uuu2301(<8 x double> %a0, <8 x double> %a1) {
 define <8 x i64> @shuffle_v8i64_0zzzzzzz(<8 x i64> %a) {
 ; ALL-LABEL: shuffle_v8i64_0zzzzzzz:
 ; ALL:       # %bb.0:
-; ALL-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; ALL-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
+; ALL-NEXT:    vmovq {{.*#+}} xmm0 = xmm0[0],zero
 ; ALL-NEXT:    ret{{[l|q]}}
   %shuffle = shufflevector <8 x i64> %a, <8 x i64> zeroinitializer, <8 x i32>
   ret <8 x i64> %shuffle
@@ -1983,8 +1982,7 @@ define <8 x i64> @shuffle_v8i64_0zzzzzzz(<8 x i64> %a) {
 define <8 x double> @shuffle_v8f64_0zzzzzzz(<8 x double> %a) {
 ; ALL-LABEL: shuffle_v8f64_0zzzzzzz:
 ; ALL:       # %bb.0:
-; ALL-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; ALL-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
+; ALL-NEXT:    vmovq {{.*#+}} xmm0 = xmm0[0],zero
 ; ALL-NEXT:    ret{{[l|q]}}
   %shuffle = shufflevector <8 x double> %a, <8 x double> zeroinitializer, <8 x i32>
   ret <8 x double> %shuffle
diff --git a/test/CodeGen/X86/vector-shuffle-combining-avx2.ll b/test/CodeGen/X86/vector-shuffle-combining-avx2.ll
index 1d416edbfda..7a8a7d32623 100644
--- a/test/CodeGen/X86/vector-shuffle-combining-avx2.ll
+++ b/test/CodeGen/X86/vector-shuffle-combining-avx2.ll
@@ -380,8 +380,7 @@ define <4 x i64> @combine_pshufb_as_zext128(<32 x i8> %a0) {
 define <4 x double> @combine_pshufb_as_vzmovl_64(<4 x double> %a0) {
 ; CHECK-LABEL: combine_pshufb_as_vzmovl_64:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; CHECK-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
+; CHECK-NEXT:    vmovq {{.*#+}} xmm0 = xmm0[0],zero
 ; CHECK-NEXT:    ret{{[l|q]}}
   %1 = bitcast <4 x double> %a0 to <32 x i8>
   %2 = call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %1, <32 x i8> )
-- 
2.50.1