From: Sanjay Patel Date: Thu, 3 Jan 2019 22:26:51 +0000 (+0000) Subject: [x86] split tests for FP and integer horizontal math X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=798ea3748ca6322c8cb7d4ef0e8eb2c674c553d6;p=llvm [x86] split tests for FP and integer horizontal math These are similar patterns, but when you throw AVX512 onto the pile, the number of variations explodes. For FP, we really don't care about AVX1 vs. AVX2 for FP ops. There may be some superficial shuffle diffs, but that's not what we're testing for here, so I removed those RUNs. Separating by type also lets us specify 'sse3' for the FP file vs. 'ssse3' for the integer file...because x86. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@350357 91177308-0d34-0410-b5e6-96231b3b80d8 --- diff --git a/test/CodeGen/X86/haddsub-undef.ll b/test/CodeGen/X86/haddsub-undef.ll index 83782b19126..416ce915465 100644 --- a/test/CodeGen/X86/haddsub-undef.ll +++ b/test/CodeGen/X86/haddsub-undef.ll @@ -1,10 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ssse3 | FileCheck %s --check-prefixes=SSE,SSE-SLOW -; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ssse3,fast-hops | FileCheck %s --check-prefixes=SSE,SSE-FAST -; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX-SLOW,AVX1,AVX1-SLOW -; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx,fast-hops | FileCheck %s --check-prefixes=AVX,AVX-FAST,AVX1,AVX1-FAST -; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX-SLOW,AVX2,AVX2-SLOW -; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx2,fast-hops | FileCheck %s --check-prefixes=AVX,AVX-FAST,AVX2,AVX2-FAST +; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+sse3 | FileCheck %s --check-prefixes=SSE,SSE-SLOW +; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+sse3,fast-hops | FileCheck %s --check-prefixes=SSE,SSE-FAST +; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX-SLOW +; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx,fast-hops | FileCheck %s --check-prefixes=AVX,AVX-FAST ; Verify that we correctly fold horizontal binop even in the presence of UNDEFs. @@ -316,141 +314,6 @@ define <8 x float> @test13_undef(<8 x float> %a, <8 x float> %b) { ret <8 x float> %vecinit4 } -define <8 x i32> @test14_undef(<8 x i32> %a, <8 x i32> %b) { -; SSE-LABEL: test14_undef: -; SSE: # %bb.0: -; SSE-NEXT: phaddd %xmm2, %xmm0 -; SSE-NEXT: retq -; -; AVX1-LABEL: test14_undef: -; AVX1: # %bb.0: -; AVX1-NEXT: vphaddd %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: retq -; -; AVX2-LABEL: test14_undef: -; AVX2: # %bb.0: -; AVX2-NEXT: vphaddd %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: retq - %vecext = extractelement <8 x i32> %a, i32 0 - %vecext1 = extractelement <8 x i32> %a, i32 1 - %add = add i32 %vecext, %vecext1 - %vecinit = insertelement <8 x i32> undef, i32 %add, i32 0 - %vecext2 = extractelement <8 x i32> %b, i32 2 - %vecext3 = extractelement <8 x i32> %b, i32 3 - %add4 = add i32 %vecext2, %vecext3 - %vecinit5 = insertelement <8 x i32> %vecinit, i32 %add4, i32 3 - ret <8 x i32> %vecinit5 -} - -; integer horizontal adds instead of two scalar adds followed by vector inserts. 
-define <8 x i32> @test15_undef(<8 x i32> %a, <8 x i32> %b) { -; SSE-LABEL: test15_undef: -; SSE: # %bb.0: -; SSE-NEXT: movd %xmm0, %eax -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] -; SSE-NEXT: movd %xmm0, %ecx -; SSE-NEXT: addl %eax, %ecx -; SSE-NEXT: movd %xmm3, %eax -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,2,3] -; SSE-NEXT: movd %xmm0, %edx -; SSE-NEXT: addl %eax, %edx -; SSE-NEXT: movd %ecx, %xmm0 -; SSE-NEXT: movd %edx, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1] -; SSE-NEXT: retq -; -; AVX1-LABEL: test15_undef: -; AVX1: # %bb.0: -; AVX1-NEXT: vmovd %xmm0, %eax -; AVX1-NEXT: vpextrd $1, %xmm0, %ecx -; AVX1-NEXT: addl %eax, %ecx -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm0 -; AVX1-NEXT: vmovd %xmm0, %eax -; AVX1-NEXT: vpextrd $1, %xmm0, %edx -; AVX1-NEXT: addl %eax, %edx -; AVX1-NEXT: vmovd %ecx, %xmm0 -; AVX1-NEXT: vmovd %edx, %xmm1 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,1] -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX1-NEXT: retq -; -; AVX2-LABEL: test15_undef: -; AVX2: # %bb.0: -; AVX2-NEXT: vphaddd %ymm0, %ymm0, %ymm0 -; AVX2-NEXT: retq - %vecext = extractelement <8 x i32> %a, i32 0 - %vecext1 = extractelement <8 x i32> %a, i32 1 - %add = add i32 %vecext, %vecext1 - %vecinit = insertelement <8 x i32> undef, i32 %add, i32 0 - %vecext2 = extractelement <8 x i32> %b, i32 4 - %vecext3 = extractelement <8 x i32> %b, i32 5 - %add4 = add i32 %vecext2, %vecext3 - %vecinit5 = insertelement <8 x i32> %vecinit, i32 %add4, i32 6 - ret <8 x i32> %vecinit5 -} - -define <8 x i32> @test16_undef(<8 x i32> %a, <8 x i32> %b) { -; SSE-LABEL: test16_undef: -; SSE: # %bb.0: -; SSE-NEXT: phaddd %xmm0, %xmm0 -; SSE-NEXT: retq -; -; AVX1-LABEL: test16_undef: -; AVX1: # %bb.0: -; AVX1-NEXT: vphaddd %xmm0, %xmm0, %xmm0 -; AVX1-NEXT: retq -; -; AVX2-LABEL: test16_undef: -; AVX2: # %bb.0: -; AVX2-NEXT: vphaddd %ymm0, %ymm0, %ymm0 -; AVX2-NEXT: retq - %vecext = extractelement <8 x i32> %a, i32 0 - %vecext1 = extractelement <8 x i32> %a, i32 1 - %add = add i32 %vecext, %vecext1 - %vecinit = insertelement <8 x i32> undef, i32 %add, i32 0 - %vecext2 = extractelement <8 x i32> %a, i32 2 - %vecext3 = extractelement <8 x i32> %a, i32 3 - %add4 = add i32 %vecext2, %vecext3 - %vecinit5 = insertelement <8 x i32> %vecinit, i32 %add4, i32 1 - ret <8 x i32> %vecinit5 -} - -define <8 x i32> @test17_undef(<8 x i32> %a, <8 x i32> %b) { -; SSE-LABEL: test17_undef: -; SSE: # %bb.0: -; SSE-NEXT: phaddd %xmm1, %xmm0 -; SSE-NEXT: retq -; -; AVX1-LABEL: test17_undef: -; AVX1: # %bb.0: -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vphaddd %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: retq -; -; AVX2-LABEL: test17_undef: -; AVX2: # %bb.0: -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vphaddd %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: retq - %vecext = extractelement <8 x i32> %a, i32 0 - %vecext1 = extractelement <8 x i32> %a, i32 1 - %add1 = add i32 %vecext, %vecext1 - %vecinit1 = insertelement <8 x i32> undef, i32 %add1, i32 0 - %vecext2 = extractelement <8 x i32> %a, i32 2 - %vecext3 = extractelement <8 x i32> %a, i32 3 - %add2 = add i32 %vecext2, %vecext3 - %vecinit2 = insertelement <8 x i32> %vecinit1, i32 %add2, i32 1 - %vecext4 = extractelement <8 x i32> %a, i32 4 - %vecext5 = extractelement <8 x i32> %a, i32 5 - %add3 = add i32 %vecext4, %vecext5 - %vecinit3 = insertelement <8 x i32> %vecinit2, i32 %add3, i32 2 - %vecext6 = extractelement <8 x i32> %a, i32 6 - %vecext7 = extractelement <8 x i32> %a, i32 7 - %add4 = add i32 %vecext6, %vecext7 - %vecinit4 = insertelement <8 x i32> 
%vecinit3, i32 %add4, i32 3 - ret <8 x i32> %vecinit4 -} - define <2 x double> @add_pd_003(<2 x double> %x) { ; SSE-SLOW-LABEL: add_pd_003: ; SSE-SLOW: # %bb.0: @@ -621,18 +484,23 @@ define <4 x float> @add_ps_007_2(<4 x float> %x) { ; SSE-FAST-NEXT: haddps %xmm0, %xmm0 ; SSE-FAST-NEXT: retq ; +; AVX-SLOW-LABEL: add_ps_007_2: +; AVX-SLOW: # %bb.0: +; AVX-SLOW-NEXT: vmovddup {{.*#+}} xmm1 = xmm0[0,0] +; AVX-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,1,1,3] +; AVX-SLOW-NEXT: vaddps %xmm0, %xmm1, %xmm0 +; AVX-SLOW-NEXT: retq +; +; AVX-FAST-LABEL: add_ps_007_2: +; AVX-FAST: # %bb.0: +; AVX-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0 +; AVX-FAST-NEXT: retq ; AVX1-SLOW-LABEL: add_ps_007_2: ; AVX1-SLOW: # %bb.0: ; AVX1-SLOW-NEXT: vmovddup {{.*#+}} xmm1 = xmm0[0,0] ; AVX1-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,1,1,3] ; AVX1-SLOW-NEXT: vaddps %xmm0, %xmm1, %xmm0 ; AVX1-SLOW-NEXT: retq -; -; AVX-FAST-LABEL: add_ps_007_2: -; AVX-FAST: # %bb.0: -; AVX-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0 -; AVX-FAST-NEXT: retq -; ; AVX2-SLOW-LABEL: add_ps_007_2: ; AVX2-SLOW: # %bb.0: ; AVX2-SLOW-NEXT: vbroadcastss %xmm0, %xmm1 @@ -720,6 +588,19 @@ define <4 x float> @add_ps_018(<4 x float> %x) { ; SSE-FAST-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2,2,3] ; SSE-FAST-NEXT: retq ; +; AVX-SLOW-LABEL: add_ps_018: +; AVX-SLOW: # %bb.0: +; AVX-SLOW-NEXT: vmovddup {{.*#+}} xmm1 = xmm0[0,0] +; AVX-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,1,1,3] +; AVX-SLOW-NEXT: vaddps %xmm0, %xmm1, %xmm0 +; AVX-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,2,3] +; AVX-SLOW-NEXT: retq +; +; AVX-FAST-LABEL: add_ps_018: +; AVX-FAST: # %bb.0: +; AVX-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0 +; AVX-FAST-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,2,3] +; AVX-FAST-NEXT: retq ; AVX1-SLOW-LABEL: add_ps_018: ; AVX1-SLOW: # %bb.0: ; AVX1-SLOW-NEXT: vmovddup {{.*#+}} xmm1 = xmm0[0,0] @@ -727,13 +608,6 @@ define <4 x float> @add_ps_018(<4 x float> %x) { ; AVX1-SLOW-NEXT: vaddps %xmm0, %xmm1, %xmm0 ; AVX1-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,2,3] ; AVX1-SLOW-NEXT: retq -; -; AVX-FAST-LABEL: add_ps_018: -; AVX-FAST: # %bb.0: -; AVX-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0 -; AVX-FAST-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,2,3] -; AVX-FAST-NEXT: retq -; ; AVX2-SLOW-LABEL: add_ps_018: ; AVX2-SLOW: # %bb.0: ; AVX2-SLOW-NEXT: vbroadcastss %xmm0, %xmm1 diff --git a/test/CodeGen/X86/phaddsub-undef.ll b/test/CodeGen/X86/phaddsub-undef.ll new file mode 100644 index 00000000000..161d057af1d --- /dev/null +++ b/test/CodeGen/X86/phaddsub-undef.ll @@ -0,0 +1,145 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ssse3 | FileCheck %s --check-prefixes=SSE,SSE-SLOW +; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ssse3,fast-hops | FileCheck %s --check-prefixes=SSE,SSE-FAST +; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX-SLOW,AVX1,AVX1-SLOW +; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx,fast-hops | FileCheck %s --check-prefixes=AVX,AVX-FAST,AVX1,AVX1-FAST +; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX-SLOW,AVX2,AVX2-SLOW +; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx2,fast-hops | FileCheck %s --check-prefixes=AVX,AVX-FAST,AVX2,AVX2-FAST + +; Verify that we correctly fold horizontal binop even in the presence of UNDEFs. 
+ +define <8 x i32> @test14_undef(<8 x i32> %a, <8 x i32> %b) { +; SSE-LABEL: test14_undef: +; SSE: # %bb.0: +; SSE-NEXT: phaddd %xmm2, %xmm0 +; SSE-NEXT: retq +; +; AVX1-LABEL: test14_undef: +; AVX1: # %bb.0: +; AVX1-NEXT: vphaddd %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: test14_undef: +; AVX2: # %bb.0: +; AVX2-NEXT: vphaddd %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: retq + %vecext = extractelement <8 x i32> %a, i32 0 + %vecext1 = extractelement <8 x i32> %a, i32 1 + %add = add i32 %vecext, %vecext1 + %vecinit = insertelement <8 x i32> undef, i32 %add, i32 0 + %vecext2 = extractelement <8 x i32> %b, i32 2 + %vecext3 = extractelement <8 x i32> %b, i32 3 + %add4 = add i32 %vecext2, %vecext3 + %vecinit5 = insertelement <8 x i32> %vecinit, i32 %add4, i32 3 + ret <8 x i32> %vecinit5 +} + +; integer horizontal adds instead of two scalar adds followed by vector inserts. +define <8 x i32> @test15_undef(<8 x i32> %a, <8 x i32> %b) { +; SSE-LABEL: test15_undef: +; SSE: # %bb.0: +; SSE-NEXT: movd %xmm0, %eax +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] +; SSE-NEXT: movd %xmm0, %ecx +; SSE-NEXT: addl %eax, %ecx +; SSE-NEXT: movd %xmm3, %eax +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,2,3] +; SSE-NEXT: movd %xmm0, %edx +; SSE-NEXT: addl %eax, %edx +; SSE-NEXT: movd %ecx, %xmm0 +; SSE-NEXT: movd %edx, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1] +; SSE-NEXT: retq +; +; AVX1-LABEL: test15_undef: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovd %xmm0, %eax +; AVX1-NEXT: vpextrd $1, %xmm0, %ecx +; AVX1-NEXT: addl %eax, %ecx +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm0 +; AVX1-NEXT: vmovd %xmm0, %eax +; AVX1-NEXT: vpextrd $1, %xmm0, %edx +; AVX1-NEXT: addl %eax, %edx +; AVX1-NEXT: vmovd %ecx, %xmm0 +; AVX1-NEXT: vmovd %edx, %xmm1 +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,1] +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: test15_undef: +; AVX2: # %bb.0: +; AVX2-NEXT: vphaddd %ymm0, %ymm0, %ymm0 +; AVX2-NEXT: retq + %vecext = extractelement <8 x i32> %a, i32 0 + %vecext1 = extractelement <8 x i32> %a, i32 1 + %add = add i32 %vecext, %vecext1 + %vecinit = insertelement <8 x i32> undef, i32 %add, i32 0 + %vecext2 = extractelement <8 x i32> %b, i32 4 + %vecext3 = extractelement <8 x i32> %b, i32 5 + %add4 = add i32 %vecext2, %vecext3 + %vecinit5 = insertelement <8 x i32> %vecinit, i32 %add4, i32 6 + ret <8 x i32> %vecinit5 +} + +define <8 x i32> @test16_undef(<8 x i32> %a, <8 x i32> %b) { +; SSE-LABEL: test16_undef: +; SSE: # %bb.0: +; SSE-NEXT: phaddd %xmm0, %xmm0 +; SSE-NEXT: retq +; +; AVX1-LABEL: test16_undef: +; AVX1: # %bb.0: +; AVX1-NEXT: vphaddd %xmm0, %xmm0, %xmm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: test16_undef: +; AVX2: # %bb.0: +; AVX2-NEXT: vphaddd %ymm0, %ymm0, %ymm0 +; AVX2-NEXT: retq + %vecext = extractelement <8 x i32> %a, i32 0 + %vecext1 = extractelement <8 x i32> %a, i32 1 + %add = add i32 %vecext, %vecext1 + %vecinit = insertelement <8 x i32> undef, i32 %add, i32 0 + %vecext2 = extractelement <8 x i32> %a, i32 2 + %vecext3 = extractelement <8 x i32> %a, i32 3 + %add4 = add i32 %vecext2, %vecext3 + %vecinit5 = insertelement <8 x i32> %vecinit, i32 %add4, i32 1 + ret <8 x i32> %vecinit5 +} + +define <8 x i32> @test17_undef(<8 x i32> %a, <8 x i32> %b) { +; SSE-LABEL: test17_undef: +; SSE: # %bb.0: +; SSE-NEXT: phaddd %xmm1, %xmm0 +; SSE-NEXT: retq +; +; AVX1-LABEL: test17_undef: +; AVX1: # %bb.0: +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vphaddd %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: test17_undef: +; 
AVX2: # %bb.0: +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vphaddd %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: retq + %vecext = extractelement <8 x i32> %a, i32 0 + %vecext1 = extractelement <8 x i32> %a, i32 1 + %add1 = add i32 %vecext, %vecext1 + %vecinit1 = insertelement <8 x i32> undef, i32 %add1, i32 0 + %vecext2 = extractelement <8 x i32> %a, i32 2 + %vecext3 = extractelement <8 x i32> %a, i32 3 + %add2 = add i32 %vecext2, %vecext3 + %vecinit2 = insertelement <8 x i32> %vecinit1, i32 %add2, i32 1 + %vecext4 = extractelement <8 x i32> %a, i32 4 + %vecext5 = extractelement <8 x i32> %a, i32 5 + %add3 = add i32 %vecext4, %vecext5 + %vecinit3 = insertelement <8 x i32> %vecinit2, i32 %add3, i32 2 + %vecext6 = extractelement <8 x i32> %a, i32 6 + %vecext7 = extractelement <8 x i32> %a, i32 7 + %add4 = add i32 %vecext6, %vecext7 + %vecinit4 = insertelement <8 x i32> %vecinit3, i32 %add4, i32 3 + ret <8 x i32> %vecinit4 +} +
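
A note for readers on the pattern class being split here: both files exercise the same shape of IR, a pair of extractelements feeding a scalar add whose result is inserted into an otherwise-undef vector, which the backend may or may not fold into haddps/phaddd depending on the fast-hops attribute. The sketch below is illustrative only and is not part of this commit: the function name and the abbreviated CHECK lines are hypothetical, and a real in-tree test would have its full assertions regenerated by utils/update_llc_test_checks.py rather than written by hand. It does show why the FP file can require only 'sse3' while the integer file requires 'ssse3': haddps/haddpd arrive with SSE3, but phaddd/phaddw only arrive with SSSE3.

; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+sse3 | FileCheck %s --check-prefixes=SSE,SSE-SLOW
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+sse3,fast-hops | FileCheck %s --check-prefixes=SSE,SSE-FAST

; Sum of elements 0 and 1 of %x, placed into lane 0 of an otherwise-undef result.
; With fast-hops this is expected to collapse to a single haddps; without it, a
; shuffle plus an add is expected instead (exact slow sequence not asserted here).
define <4 x float> @hadd_example(<4 x float> %x) {
; SSE-LABEL: hadd_example:
; SSE-FAST: haddps %xmm0, %xmm0
; SSE-SLOW-NOT: haddps
  %x0 = extractelement <4 x float> %x, i32 0
  %x1 = extractelement <4 x float> %x, i32 1
  %add = fadd float %x0, %x1
  %r = insertelement <4 x float> undef, float %add, i32 0
  ret <4 x float> %r
}

If codegen for these patterns changes later, the usual workflow is to rerun utils/update_llc_test_checks.py over each file against the freshly built llc so the per-prefix assertions stay in sync; keeping the prefix sets small per file is what makes that regeneration manageable.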