From: Craig Topper
Date: Sun, 15 Jul 2018 18:51:07 +0000 (+0000)
Subject: [X86] Use 128-bit ops for 256-bit vzmovl patterns.
X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=c7482ced308de739b8ffdf3383578bf76f74249a;p=llvm

[X86] Use 128-bit ops for 256-bit vzmovl patterns.

128-bit ops implicitly zero the upper bits of the 256-bit register. This
should address the comment about domain crossing for the integer version
when AVX2 is unavailable, since we can use a 128-bit VPBLENDW without AVX2.

The only downside I see here is that we fail to reuse a vxorps in some of
the tests, but I think that is an already-known issue.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@337134 91177308-0d34-0410-b5e6-96231b3b80d8
---
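Illustration (not part of the patch; the helper names are invented): a minimal C intrinsics sketch of what the new floating-point patterns rely on. A VEX-encoded 128-bit instruction that writes an XMM register implicitly zeroes bits 255:128 of the containing YMM register, so blending only the low 128 bits against zero produces the same 256-bit result as the old full-width VBLENDPSY against an all-zero YMM register.

#include <immintrin.h>

/* Old pattern shape: 256-bit blend against an all-zero YMM register
   (VBLENDPSYrri), keeping only element 0 of v. */
__m256 vzmovl_v8f32_256(__m256 v) {
  return _mm256_blend_ps(_mm256_setzero_ps(), v, 0x01);
}

/* New pattern shape: blend only the low XMM half against zero
   (VBLENDPSrri). The insert into an all-zero vector models the implicit
   zeroing of the upper half that the hardware performs; that is what the
   SUBREG_TO_REG wrapper expresses in the TableGen patterns below. */
__m256 vzmovl_v8f32_128(__m256 v) {
  __m128 lo      = _mm256_castps256_ps128(v);
  __m128 blended = _mm_blend_ps(_mm_setzero_ps(), lo, 0x01);
  return _mm256_insertf128_ps(_mm256_setzero_ps(), blended, 0);
}

Both functions return element 0 of v in lane 0 and zeros in lanes 1-7; the second form only ever needs a 128-bit blend.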
diff --git a/lib/Target/X86/X86InstrSSE.td b/lib/Target/X86/X86InstrSSE.td
index eb6a3323491..c2e1a94f408 100644
--- a/lib/Target/X86/X86InstrSSE.td
+++ b/lib/Target/X86/X86InstrSSE.td
@@ -6431,19 +6431,26 @@ let Predicates = [HasAVX, OptForSpeed] in {

   // Move low f32 and clear high bits.
   def : Pat<(v8f32 (X86vzmovl (v8f32 VR256:$src))),
-            (VBLENDPSYrri (v8f32 (AVX_SET0)), VR256:$src, (i8 1))>;
+            (SUBREG_TO_REG (i32 0),
+             (VBLENDPSrri (v4f32 (V_SET0)),
+                          (EXTRACT_SUBREG (v8f32 VR256:$src), sub_xmm),
+                          (i8 1)), sub_xmm)>;
+  def : Pat<(v8i32 (X86vzmovl (v8i32 VR256:$src))),
+            (SUBREG_TO_REG (i32 0),
+             (VPBLENDWrri (v4i32 (V_SET0)),
+                          (EXTRACT_SUBREG (v8i32 VR256:$src), sub_xmm),
+                          (i8 3)), sub_xmm)>;

-  // Move low f64 and clear high bits.
   def : Pat<(v4f64 (X86vzmovl (v4f64 VR256:$src))),
-            (VBLENDPDYrri (v4f64 (AVX_SET0)), VR256:$src, (i8 1))>;
-
-  // These will incur an FP/int domain crossing penalty, but it may be the only
-  // way without AVX2. Do not add any complexity because we may be able to match
-  // more optimal patterns defined earlier in this file.
-  def : Pat<(v8i32 (X86vzmovl (v8i32 VR256:$src))),
-            (VBLENDPSYrri (v8i32 (AVX_SET0)), VR256:$src, (i8 1))>;
+            (SUBREG_TO_REG (i32 0),
+             (VBLENDPDrri (v2f64 (V_SET0)),
+                          (EXTRACT_SUBREG (v4f64 VR256:$src), sub_xmm),
+                          (i8 1)), sub_xmm)>;
   def : Pat<(v4i64 (X86vzmovl (v4i64 VR256:$src))),
-            (VBLENDPDYrri (v4i64 (AVX_SET0)), VR256:$src, (i8 1))>;
+            (SUBREG_TO_REG (i32 0),
+             (VPBLENDWrri (v2i64 (V_SET0)),
+                          (EXTRACT_SUBREG (v4i64 VR256:$src), sub_xmm),
+                          (i8 0xf)), sub_xmm)>;
 }

 // Prefer a movss or movsd over a blendps when optimizing for size. these were
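The integer patterns above use a 128-bit VPBLENDW because, as the removed comment noted, a 256-bit integer blend needs AVX2; at 128 bits VPBLENDW only needs SSE4.1/AVX, stays in the integer domain, and so avoids the FP/int domain-crossing penalty of the old VBLENDPSY form. A rough intrinsics sketch (again illustrative, not part of the patch); the immediates mirror the (i8 3) and (i8 0xf) operands in the new patterns, selecting the two or four low 16-bit words, i.e. the low i32 or i64 element.

#include <immintrin.h>

/* v8i32 case: keep element 0, zero elements 1..7, using only AVX1-level
   instructions; the insert into a zero vector again stands in for the
   hardware's implicit zeroing of bits 255:128. */
__m256i vzmovl_v8i32_128(__m256i v) {
  __m128i lo      = _mm256_castsi256_si128(v);
  __m128i blended = _mm_blend_epi16(_mm_setzero_si128(), lo, 0x03);
  return _mm256_insertf128_si256(_mm256_setzero_si256(), blended, 0);
}

/* v4i64 case: same shape; immediate 0xf keeps the low 64-bit element. */
__m256i vzmovl_v4i64_128(__m256i v) {
  __m128i lo      = _mm256_castsi256_si128(v);
  __m128i blended = _mm_blend_epi16(_mm_setzero_si128(), lo, 0x0f);
  return _mm256_insertf128_si256(_mm256_setzero_si256(), blended, 0);
}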
diff --git a/test/CodeGen/X86/2012-01-12-extract-sv.ll b/test/CodeGen/X86/2012-01-12-extract-sv.ll
index 156e373a5af..bcebefe6519 100644
--- a/test/CodeGen/X86/2012-01-12-extract-sv.ll
+++ b/test/CodeGen/X86/2012-01-12-extract-sv.ll
@@ -1,16 +1,18 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc < %s -mattr=+avx -mtriple=i686-pc-win32 | FileCheck %s

 define void @endless_loop() {
 ; CHECK-LABEL: endless_loop:
-; CHECK-NEXT:  # %bb.0:
+; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vmovaps (%eax), %ymm0
 ; CHECK-NEXT:    vextractf128 $1, %ymm0, %xmm0
 ; CHECK-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[0,0,1,1]
-; CHECK-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[0,1,0,1]
-; CHECK-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm1
+; CHECK-NEXT:    vxorps %xmm1, %xmm1, %xmm1
+; CHECK-NEXT:    vblendps {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
+; CHECK-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[0,1,0,1]
+; CHECK-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
 ; CHECK-NEXT:    vxorps %xmm2, %xmm2, %xmm2
-; CHECK-NEXT:    vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7]
-; CHECK-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3,4,5,6,7]
+; CHECK-NEXT:    vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5,6],ymm0[7]
 ; CHECK-NEXT:    vmovaps %ymm0, (%eax)
 ; CHECK-NEXT:    vmovaps %ymm1, (%eax)
 ; CHECK-NEXT:    vzeroupper
diff --git a/test/CodeGen/X86/avx-load-store.ll b/test/CodeGen/X86/avx-load-store.ll
index c6667340c7c..d55bbac5dc1 100644
--- a/test/CodeGen/X86/avx-load-store.ll
+++ b/test/CodeGen/X86/avx-load-store.ll
@@ -87,8 +87,10 @@ define <8 x float> @mov00(<8 x float> %v, float * %ptr) nounwind {
 ; CHECK_O0-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; CHECK_O0-NEXT:    # implicit-def: $ymm1
 ; CHECK_O0-NEXT:    vmovaps %xmm0, %xmm1
+; CHECK_O0-NEXT:    vmovaps %xmm1, %xmm0
 ; CHECK_O0-NEXT:    vxorps %xmm2, %xmm2, %xmm2
-; CHECK_O0-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0],ymm2[1,2,3,4,5,6,7]
+; CHECK_O0-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3]
+; CHECK_O0-NEXT:    # kill: def $ymm0 killed $xmm0
 ; CHECK_O0-NEXT:    retq
   %val = load float, float* %ptr
   %i0 = insertelement <8 x float> zeroinitializer, float %val, i32 0
@@ -106,8 +108,10 @@ define <4 x double> @mov01(<4 x double> %v, double * %ptr) nounwind {
 ; CHECK_O0-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
 ; CHECK_O0-NEXT:    # implicit-def: $ymm1
 ; CHECK_O0-NEXT:    vmovaps %xmm0, %xmm1
+; CHECK_O0-NEXT:    vmovaps %xmm1, %xmm0
 ; CHECK_O0-NEXT:    vxorps %xmm2, %xmm2, %xmm2
-; CHECK_O0-NEXT:    vblendpd {{.*#+}} ymm0 = ymm1[0],ymm2[1,2,3]
+; CHECK_O0-NEXT:    vblendpd {{.*#+}} xmm0 = xmm0[0],xmm2[1]
+; CHECK_O0-NEXT:    # kill: def $ymm0 killed $xmm0
 ; CHECK_O0-NEXT:    retq
   %val = load double, double* %ptr
   %i0 = insertelement <4 x double> zeroinitializer, double %val, i32 0
diff --git a/test/CodeGen/X86/vec_extract-avx.ll b/test/CodeGen/X86/vec_extract-avx.ll
index 094a1b0402c..9a12d69b46b 100644
--- a/test/CodeGen/X86/vec_extract-avx.ll
+++ b/test/CodeGen/X86/vec_extract-avx.ll
@@ -119,7 +119,7 @@ define void @legal_vzmovl_2i32_8i32(<2 x i32>* %in, <8 x i32>* %out) {
 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X32-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
 ; X32-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; X32-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4,5,6,7]
+; X32-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
 ; X32-NEXT:    vmovaps %ymm0, (%eax)
 ; X32-NEXT:    vzeroupper
 ; X32-NEXT:    retl
@@ -128,7 +128,7 @@ define void @legal_vzmovl_2i32_8i32(<2 x i32>* %in, <8 x i32>* %out) {
 ; X64:       # %bb.0:
 ; X64-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
 ; X64-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; X64-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4,5,6,7]
+; X64-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
 ; X64-NEXT:    vmovaps %ymm0, (%rsi)
 ; X64-NEXT:    vzeroupper
 ; X64-NEXT:    retq
@@ -146,7 +146,7 @@ define void @legal_vzmovl_2i64_4i64(<2 x i64>* %in, <4 x i64>* %out) {
 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X32-NEXT:    vmovups (%ecx), %xmm0
 ; X32-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; X32-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7]
+; X32-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
 ; X32-NEXT:    vmovaps %ymm0, (%eax)
 ; X32-NEXT:    vzeroupper
 ; X32-NEXT:    retl
@@ -155,7 +155,7 @@ define void @legal_vzmovl_2i64_4i64(<2 x i64>* %in, <4 x i64>* %out) {
 ; X64:       # %bb.0:
 ; X64-NEXT:    vmovups (%rdi), %xmm0
 ; X64-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; X64-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7]
+; X64-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
 ; X64-NEXT:    vmovaps %ymm0, (%rsi)
 ; X64-NEXT:    vzeroupper
 ; X64-NEXT:    retq
@@ -180,7 +180,7 @@ define void @legal_vzmovl_2f32_8f32(<2 x float>* %in, <8 x float>* %out) {
 ; X64:       # %bb.0:
 ; X64-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
 ; X64-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; X64-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4,5,6,7]
+; X64-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
 ; X64-NEXT:    vmovaps %ymm0, (%rsi)
 ; X64-NEXT:    vzeroupper
 ; X64-NEXT:    retq
@@ -198,7 +198,7 @@ define void @legal_vzmovl_2f64_4f64(<2 x double>* %in, <4 x double>* %out) {
 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X32-NEXT:    vmovups (%ecx), %xmm0
 ; X32-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; X32-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7]
+; X32-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
 ; X32-NEXT:    vmovaps %ymm0, (%eax)
 ; X32-NEXT:    vzeroupper
 ; X32-NEXT:    retl
@@ -207,7 +207,7 @@ define void @legal_vzmovl_2f64_4f64(<2 x double>* %in, <4 x double>* %out) {
 ; X64:       # %bb.0:
 ; X64-NEXT:    vmovups (%rdi), %xmm0
 ; X64-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; X64-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7]
+; X64-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
 ; X64-NEXT:    vmovaps %ymm0, (%rsi)
 ; X64-NEXT:    vzeroupper
 ; X64-NEXT:    retq
diff --git a/test/CodeGen/X86/vector-extend-inreg.ll b/test/CodeGen/X86/vector-extend-inreg.ll
index 289afebbdea..86bb13f57eb 100644
--- a/test/CodeGen/X86/vector-extend-inreg.ll
+++ b/test/CodeGen/X86/vector-extend-inreg.ll
@@ -73,7 +73,8 @@ define i64 @extract_any_extend_vector_inreg_v16i64(<16 x i64> %a0, i32 %a1) noun
 ; X32-AVX-NEXT:    movl 40(%ebp), %ecx
 ; X32-AVX-NEXT:    vbroadcastsd 32(%ebp), %ymm0
 ; X32-AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; X32-AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7]
+; X32-AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
+; X32-AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
 ; X32-AVX-NEXT:    vmovaps %ymm1, {{[0-9]+}}(%esp)
 ; X32-AVX-NEXT:    vmovaps %ymm1, {{[0-9]+}}(%esp)
 ; X32-AVX-NEXT:    vmovaps %ymm1, {{[0-9]+}}(%esp)
@@ -102,7 +103,8 @@ define i64 @extract_any_extend_vector_inreg_v16i64(<16 x i64> %a0, i32 %a1) noun
 ; X64-AVX-NEXT:    # kill: def $edi killed $edi def $rdi
 ; X64-AVX-NEXT:    vpermpd {{.*#+}} ymm0 = ymm3[3,1,2,3]
 ; X64-AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; X64-AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7]
+; X64-AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
+; X64-AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
 ; X64-AVX-NEXT:    vmovaps %ymm1, {{[0-9]+}}(%rsp)
 ; X64-AVX-NEXT:    vmovaps %ymm1, {{[0-9]+}}(%rsp)
 ; X64-AVX-NEXT:    vmovaps %ymm1, (%rsp)
diff --git a/test/CodeGen/X86/vector-shuffle-256-v4.ll b/test/CodeGen/X86/vector-shuffle-256-v4.ll
index e125601a588..4e796f07d6f 100644
--- a/test/CodeGen/X86/vector-shuffle-256-v4.ll
+++ b/test/CodeGen/X86/vector-shuffle-256-v4.ll
@@ -1376,7 +1376,7 @@ define <4 x double> @insert_reg_and_zero_v4f64(double %a) {
 ; ALL:       # %bb.0:
 ; ALL-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
 ; ALL-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; ALL-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7]
+; ALL-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
 ; ALL-NEXT:    retq
   %v = insertelement <4 x double> undef, double %a, i32 0
   %shuffle = shufflevector <4 x double> %v, <4 x double> zeroinitializer, <4 x i32>
diff --git a/test/CodeGen/X86/vector-shuffle-combining-avx2.ll b/test/CodeGen/X86/vector-shuffle-combining-avx2.ll
index 50e0f173cb6..502a7a84410 100644
--- a/test/CodeGen/X86/vector-shuffle-combining-avx2.ll
+++ b/test/CodeGen/X86/vector-shuffle-combining-avx2.ll
@@ -523,13 +523,13 @@ define <4 x double> @combine_pshufb_as_vzmovl_64(<4 x double> %a0) {
 ; X32-LABEL: combine_pshufb_as_vzmovl_64:
 ; X32:       # %bb.0:
 ; X32-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; X32-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7]
+; X32-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
 ; X32-NEXT:    retl
 ;
 ; X64-LABEL: combine_pshufb_as_vzmovl_64:
 ; X64:       # %bb.0:
 ; X64-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; X64-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7]
+; X64-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
 ; X64-NEXT:    retq
   %1 = bitcast <4 x double> %a0 to <32 x i8>
   %2 = call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %1, <32 x i8> )
@@ -541,13 +541,13 @@ define <8 x float> @combine_pshufb_as_vzmovl_32(<8 x float> %a0) {
 ; X32-LABEL: combine_pshufb_as_vzmovl_32:
 ; X32:       # %bb.0:
 ; X32-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; X32-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4,5,6,7]
+; X32-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
 ; X32-NEXT:    retl
 ;
 ; X64-LABEL: combine_pshufb_as_vzmovl_32:
 ; X64:       # %bb.0:
 ; X64-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; X64-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4,5,6,7]
+; X64-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
 ; X64-NEXT:    retq
   %1 = bitcast <8 x float> %a0 to <32 x i8>
   %2 = call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %1, <32 x i8> )
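The combine_pshufb_as_vzmovl tests above feed the shuffle combiner a vpshufb whose control keeps only the low element and forces every other byte to zero (top bit set in the control byte); that is what gets recognized as a vzmovl and now lowers to the 128-bit blend. A hedged intrinsics sketch of that input shape; the control vector here is illustrative and is not the constant elided from the test above.

#include <immintrin.h>

/* A VPSHUFB control byte with its top bit set writes zero to that lane.
   Bytes 0..7 keep the low 64-bit element of the low 128-bit lane; every
   other byte, including the whole upper lane, is forced to zero. */
__m256i pshufb_as_vzmovl_64(__m256i v) {
  const __m256i ctrl = _mm256_setr_epi8(
      0, 1, 2, 3, 4, 5, 6, 7,
      -128, -128, -128, -128, -128, -128, -128, -128,
      -128, -128, -128, -128, -128, -128, -128, -128,
      -128, -128, -128, -128, -128, -128, -128, -128);
  return _mm256_shuffle_epi8(v, ctrl);  /* requires AVX2 */
}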