From 61bb9eeae651b68a990557d3e6d25033451f0f0b Mon Sep 17 00:00:00 2001
From: Craig Topper
Date: Thu, 28 Sep 2017 18:45:29 +0000
Subject: [PATCH] [AVX512] Add avx512bw command lines to 128-bit idiv tests.

The multiply lowering on some of the tests can take advantage of the
vpmovwb to simplify the truncate.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@314448 91177308-0d34-0410-b5e6-96231b3b80d8
---
 test/CodeGen/X86/vector-idiv-sdiv-128.ll | 78 ++++++++++++++++--------
 test/CodeGen/X86/vector-idiv-udiv-128.ll | 69 +++++++++++++--------
 2 files changed, 96 insertions(+), 51 deletions(-)

diff --git a/test/CodeGen/X86/vector-idiv-sdiv-128.ll b/test/CodeGen/X86/vector-idiv-sdiv-128.ll
index 87cf2026d1e..51c47ac8e3c 100644
--- a/test/CodeGen/X86/vector-idiv-sdiv-128.ll
+++ b/test/CodeGen/X86/vector-idiv-sdiv-128.ll
@@ -2,7 +2,8 @@
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=SSE --check-prefix=SSE2
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=SSE --check-prefix=SSE41
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=AVX --check-prefix=AVX1
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX --check-prefix=AVX2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX --check-prefix=AVX2 --check-prefix=AVX2NOBW
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw | FileCheck %s --check-prefix=AVX --check-prefix=AVX2 --check-prefix=AVX512BW
 ;
 ; sdiv by 7
 ;
@@ -585,32 +586,55 @@ define <16 x i8> @test_rem7_16i8(<16 x i8> %a) nounwind {
 ; AVX1-NEXT: vpsubb %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT: retq
 ;
-; AVX2-LABEL: test_rem7_16i8:
-; AVX2: # BB#0:
-; AVX2-NEXT: vpmovsxbw %xmm0, %ymm1
-; AVX2-NEXT: vpmullw {{.*}}(%rip), %ymm1, %ymm1
-; AVX2-NEXT: vpsrlw $8, %ymm1, %ymm1
-; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
-; AVX2-NEXT: vpackuswb %xmm2, %xmm1, %xmm1
-; AVX2-NEXT: vpaddb %xmm0, %xmm1, %xmm1
-; AVX2-NEXT: vpsrlw $2, %xmm1, %xmm2
-; AVX2-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
-; AVX2-NEXT: vpxor %xmm3, %xmm2, %xmm2
-; AVX2-NEXT: vpsubb %xmm3, %xmm2, %xmm2
-; AVX2-NEXT: vpsrlw $7, %xmm1, %xmm1
-; AVX2-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
-; AVX2-NEXT: vpaddb %xmm1, %xmm2, %xmm1
-; AVX2-NEXT: vpmovsxbw %xmm1, %ymm1
-; AVX2-NEXT: vpmullw {{.*}}(%rip), %ymm1, %ymm1
-; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
-; AVX2-NEXT: vpshufb %xmm3, %xmm2, %xmm2
-; AVX2-NEXT: vpshufb %xmm3, %xmm1, %xmm1
-; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
-; AVX2-NEXT: vpsubb %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vzeroupper
-; AVX2-NEXT: retq
+; AVX2NOBW-LABEL: test_rem7_16i8:
+; AVX2NOBW: # BB#0:
+; AVX2NOBW-NEXT: vpmovsxbw %xmm0, %ymm1
+; AVX2NOBW-NEXT: vpmullw {{.*}}(%rip), %ymm1, %ymm1
+; AVX2NOBW-NEXT: vpsrlw $8, %ymm1, %ymm1
+; AVX2NOBW-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX2NOBW-NEXT: vpackuswb %xmm2, %xmm1, %xmm1
+; AVX2NOBW-NEXT: vpaddb %xmm0, %xmm1, %xmm1
+; AVX2NOBW-NEXT: vpsrlw $2, %xmm1, %xmm2
+; AVX2NOBW-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
+; AVX2NOBW-NEXT: vmovdqa {{.*#+}} xmm3 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
+; AVX2NOBW-NEXT: vpxor %xmm3, %xmm2, %xmm2
+; AVX2NOBW-NEXT: vpsubb %xmm3, %xmm2, %xmm2
+; AVX2NOBW-NEXT: vpsrlw $7, %xmm1, %xmm1
+; AVX2NOBW-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
+; AVX2NOBW-NEXT: vpaddb %xmm1, %xmm2, %xmm1
+; AVX2NOBW-NEXT: vpmovsxbw %xmm1, %ymm1
+; AVX2NOBW-NEXT: vpmullw {{.*}}(%rip), %ymm1, %ymm1
+; AVX2NOBW-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX2NOBW-NEXT: vmovdqa {{.*#+}} xmm3 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
+; AVX2NOBW-NEXT: vpshufb %xmm3, %xmm2, %xmm2
+; AVX2NOBW-NEXT: vpshufb %xmm3, %xmm1, %xmm1
+; AVX2NOBW-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; AVX2NOBW-NEXT: vpsubb %xmm1, %xmm0, %xmm0
+; AVX2NOBW-NEXT: vzeroupper
+; AVX2NOBW-NEXT: retq
+;
+; AVX512BW-LABEL: test_rem7_16i8:
+; AVX512BW: # BB#0:
+; AVX512BW-NEXT: vpmovsxbw %xmm0, %ymm1
+; AVX512BW-NEXT: vpmullw {{.*}}(%rip), %ymm1, %ymm1
+; AVX512BW-NEXT: vpsrlw $8, %ymm1, %ymm1
+; AVX512BW-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX512BW-NEXT: vpackuswb %xmm2, %xmm1, %xmm1
+; AVX512BW-NEXT: vpaddb %xmm0, %xmm1, %xmm1
+; AVX512BW-NEXT: vpsrlw $2, %xmm1, %xmm2
+; AVX512BW-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
+; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
+; AVX512BW-NEXT: vpxor %xmm3, %xmm2, %xmm2
+; AVX512BW-NEXT: vpsubb %xmm3, %xmm2, %xmm2
+; AVX512BW-NEXT: vpsrlw $7, %xmm1, %xmm1
+; AVX512BW-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
+; AVX512BW-NEXT: vpaddb %xmm1, %xmm2, %xmm1
+; AVX512BW-NEXT: vpmovsxbw %xmm1, %ymm1
+; AVX512BW-NEXT: vpmullw {{.*}}(%rip), %ymm1, %ymm1
+; AVX512BW-NEXT: vpmovwb %zmm1, %ymm1
+; AVX512BW-NEXT: vpsubb %xmm1, %xmm0, %xmm0
+; AVX512BW-NEXT: vzeroupper
+; AVX512BW-NEXT: retq
 %res = srem <16 x i8> %a, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
 ret <16 x i8> %res
 }
diff --git a/test/CodeGen/X86/vector-idiv-udiv-128.ll b/test/CodeGen/X86/vector-idiv-udiv-128.ll
index 8138442b3ea..dbf7144d0af 100644
--- a/test/CodeGen/X86/vector-idiv-udiv-128.ll
+++ b/test/CodeGen/X86/vector-idiv-udiv-128.ll
@@ -2,7 +2,8 @@
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=SSE --check-prefix=SSE2
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=SSE --check-prefix=SSE41
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=AVX --check-prefix=AVX1
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX --check-prefix=AVX2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX --check-prefix=AVX2 --check-prefix=AVX2NOBW
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw | FileCheck %s --check-prefix=AVX --check-prefix=AVX2 --check-prefix=AVX512BW
 ;
 ; udiv by 7
 ;
@@ -556,29 +557,49 @@ define <16 x i8> @test_rem7_16i8(<16 x i8> %a) nounwind {
 ; AVX1-NEXT: vpsubb %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT: retq
 ;
-; AVX2-LABEL: test_rem7_16i8:
-; AVX2: # BB#0:
-; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
-; AVX2-NEXT: vpmullw {{.*}}(%rip), %ymm1, %ymm1
-; AVX2-NEXT: vpsrlw $8, %ymm1, %ymm1
-; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
-; AVX2-NEXT: vpackuswb %xmm2, %xmm1, %xmm1
-; AVX2-NEXT: vpsubb %xmm1, %xmm0, %xmm2
-; AVX2-NEXT: vpsrlw $1, %xmm2, %xmm2
-; AVX2-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
-; AVX2-NEXT: vpaddb %xmm1, %xmm2, %xmm1
-; AVX2-NEXT: vpsrlw $2, %xmm1, %xmm1
-; AVX2-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
-; AVX2-NEXT: vpmovsxbw %xmm1, %ymm1
-; AVX2-NEXT: vpmullw {{.*}}(%rip), %ymm1, %ymm1
-; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
-; AVX2-NEXT: vpshufb %xmm3, %xmm2, %xmm2
-; AVX2-NEXT: vpshufb %xmm3, %xmm1, %xmm1
-; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
-; AVX2-NEXT: vpsubb %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vzeroupper
-; AVX2-NEXT: retq
+; AVX2NOBW-LABEL: test_rem7_16i8:
+; AVX2NOBW: # BB#0:
+; AVX2NOBW-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; AVX2NOBW-NEXT: vpmullw {{.*}}(%rip), %ymm1, %ymm1
+; AVX2NOBW-NEXT: vpsrlw $8, %ymm1, %ymm1
+; AVX2NOBW-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX2NOBW-NEXT: vpackuswb %xmm2, %xmm1, %xmm1
+; AVX2NOBW-NEXT: vpsubb %xmm1, %xmm0, %xmm2
+; AVX2NOBW-NEXT: vpsrlw $1, %xmm2, %xmm2
+; AVX2NOBW-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
+; AVX2NOBW-NEXT: vpaddb %xmm1, %xmm2, %xmm1
+; AVX2NOBW-NEXT: vpsrlw $2, %xmm1, %xmm1
+; AVX2NOBW-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
+; AVX2NOBW-NEXT: vpmovsxbw %xmm1, %ymm1
+; AVX2NOBW-NEXT: vpmullw {{.*}}(%rip), %ymm1, %ymm1
+; AVX2NOBW-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX2NOBW-NEXT: vmovdqa {{.*#+}} xmm3 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
+; AVX2NOBW-NEXT: vpshufb %xmm3, %xmm2, %xmm2
+; AVX2NOBW-NEXT: vpshufb %xmm3, %xmm1, %xmm1
+; AVX2NOBW-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; AVX2NOBW-NEXT: vpsubb %xmm1, %xmm0, %xmm0
+; AVX2NOBW-NEXT: vzeroupper
+; AVX2NOBW-NEXT: retq
+;
+; AVX512BW-LABEL: test_rem7_16i8:
+; AVX512BW: # BB#0:
+; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; AVX512BW-NEXT: vpmullw {{.*}}(%rip), %ymm1, %ymm1
+; AVX512BW-NEXT: vpsrlw $8, %ymm1, %ymm1
+; AVX512BW-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX512BW-NEXT: vpackuswb %xmm2, %xmm1, %xmm1
+; AVX512BW-NEXT: vpsubb %xmm1, %xmm0, %xmm2
+; AVX512BW-NEXT: vpsrlw $1, %xmm2, %xmm2
+; AVX512BW-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
+; AVX512BW-NEXT: vpaddb %xmm1, %xmm2, %xmm1
+; AVX512BW-NEXT: vpsrlw $2, %xmm1, %xmm1
+; AVX512BW-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
+; AVX512BW-NEXT: vpmovsxbw %xmm1, %ymm1
+; AVX512BW-NEXT: vpmullw {{.*}}(%rip), %ymm1, %ymm1
+; AVX512BW-NEXT: vpmovwb %zmm1, %ymm1
+; AVX512BW-NEXT: vpsubb %xmm1, %xmm0, %xmm0
+; AVX512BW-NEXT: vzeroupper
+; AVX512BW-NEXT: retq
 %res = urem <16 x i8> %a, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
 ret <16 x i8> %res
 }
-- 
2.49.0
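
For reference, the vpmovwb simplification the commit message refers to can be reproduced in isolation with a minimal test along the lines of the hypothetical one below (the function name and RUN lines are illustrative, not part of this patch). The rem-by-7 lowering does its second multiply in 16-bit lanes and then narrows the product back to bytes; with -mattr=+avx2 that narrowing uses an extract/shuffle/unpack sequence like the AVX2NOBW checks above, while -mattr=+avx512bw can select a single vpmovwb.

; Hypothetical standalone example, for illustration only.
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw
define <16 x i8> @trunc_16i16_to_16i8(<16 x i16> %a) nounwind {
  ; Narrow each 16-bit lane to 8 bits; this is the step vpmovwb handles directly.
  %t = trunc <16 x i16> %a to <16 x i8>
  ret <16 x i8> %t
}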