From 2fa71f8ee2121d10b84be4bdaaef52f41e645abe Mon Sep 17 00:00:00 2001 From: David Zarzycki Date: Sun, 6 Oct 2019 10:25:52 +0000 Subject: [PATCH] [X86] Enable AVX512BW for memcmp() git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@373845 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Target/X86/X86ISelLowering.cpp | 9 +- test/CodeGen/X86/memcmp.ll | 20 ++++- test/CodeGen/X86/setcc-wide-types.ll | 118 ++++++++++++++++++--------- 3 files changed, 106 insertions(+), 41 deletions(-) diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp index 6f535617f1a..e1e3a4ca486 100644 --- a/lib/Target/X86/X86ISelLowering.cpp +++ b/lib/Target/X86/X86ISelLowering.cpp @@ -42354,10 +42354,12 @@ static SDValue combineVectorSizedSetCCEquality(SDNode *SetCC, SelectionDAG &DAG, if ((OpSize == 128 && Subtarget.hasSSE2()) || (OpSize == 256 && Subtarget.hasAVX2()) || (OpSize == 512 && Subtarget.useAVX512Regs())) { - EVT VecVT = OpSize == 512 ? MVT::v16i32 : + auto BW = Subtarget.hasBWI(); + EVT VecVT = OpSize == 512 ? (BW ? MVT::v64i8 : MVT::v16i32) : OpSize == 256 ? MVT::v32i8 : MVT::v16i8; - EVT CmpVT = OpSize == 512 ? MVT::v16i1 : VecVT; + EVT CmpVT = OpSize == 512 ? (BW ? MVT::v64i1 : MVT::v16i1) : VecVT; + SDValue Cmp; if (IsOrXorXorCCZero) { // This is a bitwise-combined equality comparison of 2 pairs of vectors: @@ -42377,6 +42379,9 @@ static SDValue combineVectorSizedSetCCEquality(SDNode *SetCC, SelectionDAG &DAG, Cmp = DAG.getSetCC(DL, CmpVT, VecX, VecY, ISD::SETEQ); } // For 512-bits we want to emit a setcc that will lower to kortest. + if (OpSize == 512 && BW) + return DAG.getSetCC(DL, VT, DAG.getBitcast(MVT::i64, Cmp), + DAG.getConstant(0xFFFFFFFFFFFFFFFF, DL, MVT::i64), CC); if (OpSize == 512) return DAG.getSetCC(DL, VT, DAG.getBitcast(MVT::i16, Cmp), DAG.getConstant(0xFFFF, DL, MVT::i16), CC); diff --git a/test/CodeGen/X86/memcmp.ll b/test/CodeGen/X86/memcmp.ll index 54bd3fc2e80..0077df867db 100644 --- a/test/CodeGen/X86/memcmp.ll +++ b/test/CodeGen/X86/memcmp.ll @@ -6,7 +6,7 @@ ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx | FileCheck %s --check-prefix=X64 --check-prefix=X64-AVX --check-prefix=X64-AVX1 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx2 | FileCheck %s --check-prefix=X64 --check-prefix=X64-AVX --check-prefix=X64-AVX2 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx512f | FileCheck %s --check-prefix=X64 --check-prefix=X64-AVX512F -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx512bw | FileCheck %s --check-prefix=X64 --check-prefix=X64-AVX512F --check-prefix=X64-AVX512BW +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx512bw | FileCheck %s --check-prefix=X64 --check-prefix=X64-AVX512BW ; This tests codegen time inlining/optimization of memcmp ; rdar://6480398 @@ -1551,6 +1551,15 @@ define i1 @length64_eq(i8* %x, i8* %y) nounwind { ; X64-AVX512F-NEXT: setae %al ; X64-AVX512F-NEXT: vzeroupper ; X64-AVX512F-NEXT: retq +; +; X64-AVX512BW-LABEL: length64_eq: +; X64-AVX512BW: # %bb.0: +; X64-AVX512BW-NEXT: vmovdqu64 (%rdi), %zmm0 +; X64-AVX512BW-NEXT: vpcmpeqb (%rsi), %zmm0, %k0 +; X64-AVX512BW-NEXT: kortestq %k0, %k0 +; X64-AVX512BW-NEXT: setae %al +; X64-AVX512BW-NEXT: vzeroupper +; X64-AVX512BW-NEXT: retq %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 64) nounwind %cmp = icmp ne i32 %call, 0 ret i1 %cmp @@ -1612,6 +1621,15 @@ define i1 @length64_eq_const(i8* %X) nounwind { ; X64-AVX512F-NEXT: setb %al ; X64-AVX512F-NEXT: vzeroupper ; X64-AVX512F-NEXT: retq +; +; X64-AVX512BW-LABEL: length64_eq_const: +; X64-AVX512BW: # %bb.0: +; X64-AVX512BW-NEXT: vmovdqu64 (%rdi), %zmm0 +; X64-AVX512BW-NEXT: vpcmpeqb {{.*}}(%rip), %zmm0, %k0 +; X64-AVX512BW-NEXT: kortestq %k0, %k0 +; X64-AVX512BW-NEXT: setb %al +; X64-AVX512BW-NEXT: vzeroupper +; X64-AVX512BW-NEXT: retq %m = tail call i32 @memcmp(i8* %X, i8* getelementptr inbounds ([65 x i8], [65 x i8]* @.str, i32 0, i32 0), i64 64) nounwind %c = icmp eq i32 %m, 0 ret i1 %c diff --git a/test/CodeGen/X86/setcc-wide-types.ll b/test/CodeGen/X86/setcc-wide-types.ll index d8176e488c1..58baea95fcd 100644 --- a/test/CodeGen/X86/setcc-wide-types.ll +++ b/test/CodeGen/X86/setcc-wide-types.ll @@ -319,14 +319,23 @@ define i32 @ne_i512(<8 x i64> %x, <8 x i64> %y) { ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; -; AVX512-LABEL: ne_i512: -; AVX512: # %bb.0: -; AVX512-NEXT: vpcmpeqd %zmm1, %zmm0, %k0 -; AVX512-NEXT: xorl %eax, %eax -; AVX512-NEXT: kortestw %k0, %k0 -; AVX512-NEXT: setae %al -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq +; AVX512F-LABEL: ne_i512: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vpcmpeqd %zmm1, %zmm0, %k0 +; AVX512F-NEXT: xorl %eax, %eax +; AVX512F-NEXT: kortestw %k0, %k0 +; AVX512F-NEXT: setae %al +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; +; AVX512BW-LABEL: ne_i512: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vpcmpeqb %zmm1, %zmm0, %k0 +; AVX512BW-NEXT: xorl %eax, %eax +; AVX512BW-NEXT: kortestq %k0, %k0 +; AVX512BW-NEXT: setae %al +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq %bcx = bitcast <8 x i64> %x to i512 %bcy = bitcast <8 x i64> %y to i512 %cmp = icmp ne i512 %bcx, %bcy @@ -464,14 +473,23 @@ define i32 @eq_i512(<8 x i64> %x, <8 x i64> %y) { ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; -; AVX512-LABEL: eq_i512: -; AVX512: # %bb.0: -; AVX512-NEXT: vpcmpeqd %zmm1, %zmm0, %k0 -; AVX512-NEXT: xorl %eax, %eax -; AVX512-NEXT: kortestw %k0, %k0 -; AVX512-NEXT: setb %al -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq +; AVX512F-LABEL: eq_i512: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vpcmpeqd %zmm1, %zmm0, %k0 +; AVX512F-NEXT: xorl %eax, %eax +; AVX512F-NEXT: kortestw %k0, %k0 +; AVX512F-NEXT: setb %al +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; +; AVX512BW-LABEL: eq_i512: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vpcmpeqb %zmm1, %zmm0, %k0 +; AVX512BW-NEXT: xorl %eax, %eax +; AVX512BW-NEXT: kortestq %k0, %k0 +; AVX512BW-NEXT: setb %al +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq %bcx = bitcast <8 x i64> %x to i512 %bcy = bitcast <8 x i64> %y to i512 %cmp = icmp eq i512 %bcx, %bcy @@ -804,17 +822,29 @@ define i32 @ne_i512_pair(i512* %a, i512* %b) { ; NO512-NEXT: setne %al ; NO512-NEXT: retq ; -; AVX512-LABEL: ne_i512_pair: -; AVX512: # %bb.0: -; AVX512-NEXT: vmovdqu64 (%rdi), %zmm0 -; AVX512-NEXT: vmovdqu64 64(%rdi), %zmm1 -; AVX512-NEXT: vpcmpeqd (%rsi), %zmm0, %k1 -; AVX512-NEXT: vpcmpeqd 64(%rsi), %zmm1, %k0 {%k1} -; AVX512-NEXT: xorl %eax, %eax -; AVX512-NEXT: kortestw %k0, %k0 -; AVX512-NEXT: setae %al -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq +; AVX512F-LABEL: ne_i512_pair: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vmovdqu64 (%rdi), %zmm0 +; AVX512F-NEXT: vmovdqu64 64(%rdi), %zmm1 +; AVX512F-NEXT: vpcmpeqd (%rsi), %zmm0, %k1 +; AVX512F-NEXT: vpcmpeqd 64(%rsi), %zmm1, %k0 {%k1} +; AVX512F-NEXT: xorl %eax, %eax +; AVX512F-NEXT: kortestw %k0, %k0 +; AVX512F-NEXT: setae %al +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; +; AVX512BW-LABEL: ne_i512_pair: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vmovdqu64 (%rdi), %zmm0 +; AVX512BW-NEXT: vmovdqu64 64(%rdi), %zmm1 +; AVX512BW-NEXT: vpcmpeqb (%rsi), %zmm0, %k1 +; AVX512BW-NEXT: vpcmpeqb 64(%rsi), %zmm1, %k0 {%k1} +; AVX512BW-NEXT: xorl %eax, %eax +; AVX512BW-NEXT: kortestq %k0, %k0 +; AVX512BW-NEXT: setae %al +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq %a0 = load i512, i512* %a %b0 = load i512, i512* %b %xor1 = xor i512 %a0, %b0 @@ -886,17 +916,29 @@ define i32 @eq_i512_pair(i512* %a, i512* %b) { ; NO512-NEXT: sete %al ; NO512-NEXT: retq ; -; AVX512-LABEL: eq_i512_pair: -; AVX512: # %bb.0: -; AVX512-NEXT: vmovdqu64 (%rdi), %zmm0 -; AVX512-NEXT: vmovdqu64 64(%rdi), %zmm1 -; AVX512-NEXT: vpcmpeqd (%rsi), %zmm0, %k1 -; AVX512-NEXT: vpcmpeqd 64(%rsi), %zmm1, %k0 {%k1} -; AVX512-NEXT: xorl %eax, %eax -; AVX512-NEXT: kortestw %k0, %k0 -; AVX512-NEXT: setb %al -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq +; AVX512F-LABEL: eq_i512_pair: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vmovdqu64 (%rdi), %zmm0 +; AVX512F-NEXT: vmovdqu64 64(%rdi), %zmm1 +; AVX512F-NEXT: vpcmpeqd (%rsi), %zmm0, %k1 +; AVX512F-NEXT: vpcmpeqd 64(%rsi), %zmm1, %k0 {%k1} +; AVX512F-NEXT: xorl %eax, %eax +; AVX512F-NEXT: kortestw %k0, %k0 +; AVX512F-NEXT: setb %al +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; +; AVX512BW-LABEL: eq_i512_pair: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vmovdqu64 (%rdi), %zmm0 +; AVX512BW-NEXT: vmovdqu64 64(%rdi), %zmm1 +; AVX512BW-NEXT: vpcmpeqb (%rsi), %zmm0, %k1 +; AVX512BW-NEXT: vpcmpeqb 64(%rsi), %zmm1, %k0 {%k1} +; AVX512BW-NEXT: xorl %eax, %eax +; AVX512BW-NEXT: kortestq %k0, %k0 +; AVX512BW-NEXT: setb %al +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq %a0 = load i512, i512* %a %b0 = load i512, i512* %b %xor1 = xor i512 %a0, %b0 -- 2.40.0