From 0d4ac91263122f6f913516cfcec90f574d7fa4d1 Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Sun, 6 Oct 2019 18:43:03 +0000 Subject: [PATCH] [LegalizeTypes][X86] When splitting a vselect for type legalization, don't split a setcc condition if the setcc input is legal and vXi1 conditions are supported Summary: The VSELECT splitting code tries to split a setcc input as well. But on avx512 where mask registers are well supported it should be better to just split the mask and use a single compare. Reviewers: RKSimon, spatel, efriedma Reviewed By: spatel Subscribers: hiraditya, llvm-commits Tags: #llvm Differential Revision: https://reviews.llvm.org/D68359 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@373863 91177308-0d34-0410-b5e6-96231b3b80d8 --- .../SelectionDAG/LegalizeTypesGeneric.cpp | 15 ++- test/CodeGen/X86/avx512-vselect.ll | 59 ++++----- test/CodeGen/X86/min-legal-vector-width.ll | 20 +-- test/CodeGen/X86/pr34177.ll | 121 ++++++++++++------ 4 files changed, 122 insertions(+), 93 deletions(-) diff --git a/lib/CodeGen/SelectionDAG/LegalizeTypesGeneric.cpp b/lib/CodeGen/SelectionDAG/LegalizeTypesGeneric.cpp index 560b5729e3d..5562f400b6e 100644 --- a/lib/CodeGen/SelectionDAG/LegalizeTypesGeneric.cpp +++ b/lib/CodeGen/SelectionDAG/LegalizeTypesGeneric.cpp @@ -521,9 +521,18 @@ void DAGTypeLegalizer::SplitRes_SELECT(SDNode *N, SDValue &Lo, SDValue &Hi) { GetSplitVector(Cond, CL, CH); // It seems to improve code to generate two narrow SETCCs as opposed to // splitting a wide result vector. - else if (Cond.getOpcode() == ISD::SETCC) - SplitVecRes_SETCC(Cond.getNode(), CL, CH); - else + else if (Cond.getOpcode() == ISD::SETCC) { + // If the condition is a vXi1 vector, and the LHS of the setcc is a legal + // type and the setcc result type is the same vXi1, then leave the setcc + // alone. 
+ EVT CondLHSVT = Cond.getOperand(0).getValueType(); + if (Cond.getValueType().getVectorElementType() == MVT::i1 && + isTypeLegal(CondLHSVT) && + getSetCCResultType(CondLHSVT) == Cond.getValueType()) + std::tie(CL, CH) = DAG.SplitVector(Cond, dl); + else + SplitVecRes_SETCC(Cond.getNode(), CL, CH); + } else std::tie(CL, CH) = DAG.SplitVector(Cond, dl); } diff --git a/test/CodeGen/X86/avx512-vselect.ll b/test/CodeGen/X86/avx512-vselect.ll index 7ee4e6674e0..07e5aeac015 100644 --- a/test/CodeGen/X86/avx512-vselect.ll +++ b/test/CodeGen/X86/avx512-vselect.ll @@ -51,10 +51,9 @@ entry: define <16 x i64> @test3(<16 x i8> %x, <16 x i64> %a, <16 x i64> %b) { ; CHECK-SKX-LABEL: test3: ; CHECK-SKX: # %bb.0: -; CHECK-SKX-NEXT: vpshufd {{.*#+}} xmm5 = xmm0[2,3,0,1] -; CHECK-SKX-NEXT: vptestnmb %xmm5, %xmm5, %k1 -; CHECK-SKX-NEXT: vptestnmb %xmm0, %xmm0, %k2 -; CHECK-SKX-NEXT: vpblendmq %zmm1, %zmm3, %zmm0 {%k2} +; CHECK-SKX-NEXT: vptestnmb %xmm0, %xmm0, %k1 +; CHECK-SKX-NEXT: vpblendmq %zmm1, %zmm3, %zmm0 {%k1} +; CHECK-SKX-NEXT: kshiftrw $8, %k1, %k1 ; CHECK-SKX-NEXT: vpblendmq %zmm2, %zmm4, %zmm1 {%k1} ; CHECK-SKX-NEXT: retq ; @@ -76,10 +75,9 @@ define <16 x i64> @test3(<16 x i8> %x, <16 x i64> %a, <16 x i64> %b) { define <16 x i64> @test4(<16 x i16> %x, <16 x i64> %a, <16 x i64> %b) { ; CHECK-SKX-LABEL: test4: ; CHECK-SKX: # %bb.0: -; CHECK-SKX-NEXT: vextracti128 $1, %ymm0, %xmm5 -; CHECK-SKX-NEXT: vptestnmw %xmm5, %xmm5, %k1 -; CHECK-SKX-NEXT: vptestnmw %xmm0, %xmm0, %k2 -; CHECK-SKX-NEXT: vpblendmq %zmm1, %zmm3, %zmm0 {%k2} +; CHECK-SKX-NEXT: vptestnmw %ymm0, %ymm0, %k1 +; CHECK-SKX-NEXT: vpblendmq %zmm1, %zmm3, %zmm0 {%k1} +; CHECK-SKX-NEXT: kshiftrw $8, %k1, %k1 ; CHECK-SKX-NEXT: vpblendmq %zmm2, %zmm4, %zmm1 {%k1} ; CHECK-SKX-NEXT: retq ; @@ -99,23 +97,13 @@ define <16 x i64> @test4(<16 x i16> %x, <16 x i64> %a, <16 x i64> %b) { } define <16 x i64> @test5(<16 x i32> %x, <16 x i64> %a, <16 x i64> %b) { -; CHECK-SKX-LABEL: test5: -; CHECK-SKX: # %bb.0: -; CHECK-SKX-NEXT: 
vextracti64x4 $1, %zmm0, %ymm5 -; CHECK-SKX-NEXT: vptestnmd %ymm5, %ymm5, %k1 -; CHECK-SKX-NEXT: vptestnmd %ymm0, %ymm0, %k2 -; CHECK-SKX-NEXT: vpblendmq %zmm1, %zmm3, %zmm0 {%k2} -; CHECK-SKX-NEXT: vpblendmq %zmm2, %zmm4, %zmm1 {%k1} -; CHECK-SKX-NEXT: retq -; -; CHECK-KNL-LABEL: test5: -; CHECK-KNL: # %bb.0: -; CHECK-KNL-NEXT: vextracti64x4 $1, %zmm0, %ymm5 -; CHECK-KNL-NEXT: vptestnmd %zmm5, %zmm5, %k1 -; CHECK-KNL-NEXT: vptestnmd %zmm0, %zmm0, %k2 -; CHECK-KNL-NEXT: vpblendmq %zmm1, %zmm3, %zmm0 {%k2} -; CHECK-KNL-NEXT: vpblendmq %zmm2, %zmm4, %zmm1 {%k1} -; CHECK-KNL-NEXT: retq +; CHECK-LABEL: test5: +; CHECK: # %bb.0: +; CHECK-NEXT: vptestnmd %zmm0, %zmm0, %k1 +; CHECK-NEXT: vpblendmq %zmm1, %zmm3, %zmm0 {%k1} +; CHECK-NEXT: kshiftrw $8, %k1, %k1 +; CHECK-NEXT: vpblendmq %zmm2, %zmm4, %zmm1 {%k1} +; CHECK-NEXT: retq %c = icmp eq <16 x i32> %x, zeroinitializer %ret = select <16 x i1> %c, <16 x i64> %a, <16 x i64> %b ret <16 x i64> %ret @@ -124,10 +112,9 @@ define <16 x i64> @test5(<16 x i32> %x, <16 x i64> %a, <16 x i64> %b) { define <32 x i32> @test6(<32 x i8> %x, <32 x i32> %a, <32 x i32> %b) { ; CHECK-SKX-LABEL: test6: ; CHECK-SKX: # %bb.0: -; CHECK-SKX-NEXT: vextracti128 $1, %ymm0, %xmm5 -; CHECK-SKX-NEXT: vptestnmb %xmm5, %xmm5, %k1 -; CHECK-SKX-NEXT: vptestnmb %xmm0, %xmm0, %k2 -; CHECK-SKX-NEXT: vpblendmd %zmm1, %zmm3, %zmm0 {%k2} +; CHECK-SKX-NEXT: vptestnmb %ymm0, %ymm0, %k1 +; CHECK-SKX-NEXT: vpblendmd %zmm1, %zmm3, %zmm0 {%k1} +; CHECK-SKX-NEXT: kshiftrd $16, %k1, %k1 ; CHECK-SKX-NEXT: vpblendmd %zmm2, %zmm4, %zmm1 {%k1} ; CHECK-SKX-NEXT: retq ; @@ -151,10 +138,9 @@ define <32 x i32> @test6(<32 x i8> %x, <32 x i32> %a, <32 x i32> %b) { define <32 x i32> @test7(<32 x i16> %x, <32 x i32> %a, <32 x i32> %b) { ; CHECK-SKX-LABEL: test7: ; CHECK-SKX: # %bb.0: -; CHECK-SKX-NEXT: vextracti64x4 $1, %zmm0, %ymm5 -; CHECK-SKX-NEXT: vptestnmw %ymm5, %ymm5, %k1 -; CHECK-SKX-NEXT: vptestnmw %ymm0, %ymm0, %k2 -; CHECK-SKX-NEXT: vpblendmd %zmm1, %zmm3, %zmm0 {%k2} 
+; CHECK-SKX-NEXT: vptestnmw %zmm0, %zmm0, %k1 +; CHECK-SKX-NEXT: vpblendmd %zmm1, %zmm3, %zmm0 {%k1} +; CHECK-SKX-NEXT: kshiftrd $16, %k1, %k1 ; CHECK-SKX-NEXT: vpblendmd %zmm2, %zmm4, %zmm1 {%k1} ; CHECK-SKX-NEXT: retq ; @@ -179,10 +165,9 @@ define <32 x i32> @test7(<32 x i16> %x, <32 x i32> %a, <32 x i32> %b) { define <64 x i16> @test8(<64 x i8> %x, <64 x i16> %a, <64 x i16> %b) { ; CHECK-SKX-LABEL: test8: ; CHECK-SKX: # %bb.0: -; CHECK-SKX-NEXT: vextracti64x4 $1, %zmm0, %ymm5 -; CHECK-SKX-NEXT: vptestnmb %ymm5, %ymm5, %k1 -; CHECK-SKX-NEXT: vptestnmb %ymm0, %ymm0, %k2 -; CHECK-SKX-NEXT: vpblendmw %zmm1, %zmm3, %zmm0 {%k2} +; CHECK-SKX-NEXT: vptestnmb %zmm0, %zmm0, %k1 +; CHECK-SKX-NEXT: vpblendmw %zmm1, %zmm3, %zmm0 {%k1} +; CHECK-SKX-NEXT: kshiftrq $32, %k1, %k1 ; CHECK-SKX-NEXT: vpblendmw %zmm2, %zmm4, %zmm1 {%k1} ; CHECK-SKX-NEXT: retq ; diff --git a/test/CodeGen/X86/min-legal-vector-width.ll b/test/CodeGen/X86/min-legal-vector-width.ll index 46e73c1f854..88329600b23 100644 --- a/test/CodeGen/X86/min-legal-vector-width.ll +++ b/test/CodeGen/X86/min-legal-vector-width.ll @@ -1013,9 +1013,7 @@ define void @vselect_split_v8i16_setcc(<8 x i16> %s, <8 x i16> %t, <8 x i64>* %p ; CHECK-NEXT: vmovdqa (%rsi), %ymm2 ; CHECK-NEXT: vmovdqa 32(%rsi), %ymm3 ; CHECK-NEXT: vpcmpeqw %xmm1, %xmm0, %k1 -; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] -; CHECK-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] -; CHECK-NEXT: vpcmpeqw %xmm1, %xmm0, %k2 +; CHECK-NEXT: kshiftrb $4, %k1, %k2 ; CHECK-NEXT: vmovdqa64 32(%rdi), %ymm3 {%k2} ; CHECK-NEXT: vmovdqa64 (%rdi), %ymm2 {%k1} ; CHECK-NEXT: vmovdqa %ymm2, (%rdx) @@ -1035,10 +1033,8 @@ define void @vselect_split_v8i32_setcc(<8 x i32> %s, <8 x i32> %t, <8 x i64>* %p ; CHECK: # %bb.0: ; CHECK-NEXT: vmovdqa (%rsi), %ymm2 ; CHECK-NEXT: vmovdqa 32(%rsi), %ymm3 -; CHECK-NEXT: vpcmpeqd %xmm1, %xmm0, %k1 -; CHECK-NEXT: vextracti128 $1, %ymm1, %xmm1 -; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm0 -; CHECK-NEXT: vpcmpeqd %xmm1, %xmm0, %k2 +; 
CHECK-NEXT: vpcmpeqd %ymm1, %ymm0, %k1 +; CHECK-NEXT: kshiftrb $4, %k1, %k2 ; CHECK-NEXT: vmovdqa64 32(%rdi), %ymm3 {%k2} ; CHECK-NEXT: vmovdqa64 (%rdi), %ymm2 {%k1} ; CHECK-NEXT: vmovdqa %ymm2, (%rdx) @@ -1059,9 +1055,7 @@ define void @vselect_split_v16i8_setcc(<16 x i8> %s, <16 x i8> %t, <16 x i32>* % ; CHECK-NEXT: vmovdqa (%rsi), %ymm2 ; CHECK-NEXT: vmovdqa 32(%rsi), %ymm3 ; CHECK-NEXT: vpcmpeqb %xmm1, %xmm0, %k1 -; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] -; CHECK-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] -; CHECK-NEXT: vpcmpeqb %xmm1, %xmm0, %k2 +; CHECK-NEXT: kshiftrw $8, %k1, %k2 ; CHECK-NEXT: vmovdqa32 32(%rdi), %ymm3 {%k2} ; CHECK-NEXT: vmovdqa32 (%rdi), %ymm2 {%k1} ; CHECK-NEXT: vmovdqa %ymm2, (%rdx) @@ -1081,10 +1075,8 @@ define void @vselect_split_v16i16_setcc(<16 x i16> %s, <16 x i16> %t, <16 x i32> ; CHECK: # %bb.0: ; CHECK-NEXT: vmovdqa (%rsi), %ymm2 ; CHECK-NEXT: vmovdqa 32(%rsi), %ymm3 -; CHECK-NEXT: vpcmpeqw %xmm1, %xmm0, %k1 -; CHECK-NEXT: vextracti128 $1, %ymm1, %xmm1 -; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm0 -; CHECK-NEXT: vpcmpeqw %xmm1, %xmm0, %k2 +; CHECK-NEXT: vpcmpeqw %ymm1, %ymm0, %k1 +; CHECK-NEXT: kshiftrw $8, %k1, %k2 ; CHECK-NEXT: vmovdqa32 32(%rdi), %ymm3 {%k2} ; CHECK-NEXT: vmovdqa32 (%rdi), %ymm2 {%k1} ; CHECK-NEXT: vmovdqa %ymm2, (%rdx) diff --git a/test/CodeGen/X86/pr34177.ll b/test/CodeGen/X86/pr34177.ll index 056682bb275..f8ead6352f1 100644 --- a/test/CodeGen/X86/pr34177.ll +++ b/test/CodeGen/X86/pr34177.ll @@ -6,45 +6,88 @@ target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" target triple = "x86_64-unknown-linux-gnu" define void @test(<4 x i64> %a, <4 x x86_fp80> %b, <8 x x86_fp80>* %c) local_unnamed_addr { -; CHECK-LABEL: test: -; CHECK: # %bb.0: -; CHECK-NEXT: vmovq %xmm0, %rax -; CHECK-NEXT: vpextrq $1, %xmm0, %rcx -; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm0 -; CHECK-NEXT: vmovq %xmm0, %rdx -; CHECK-NEXT: vpextrq $1, %xmm0, %rsi -; CHECK-NEXT: cmpq $3, %rsi -; CHECK-NEXT: fld1 -; CHECK-NEXT: fldz -; 
CHECK-NEXT: fld %st(0) -; CHECK-NEXT: fcmove %st(2), %st -; CHECK-NEXT: cmpq $2, %rdx -; CHECK-NEXT: fld %st(1) -; CHECK-NEXT: fcmove %st(3), %st -; CHECK-NEXT: cmpq $1, %rcx -; CHECK-NEXT: fld %st(2) -; CHECK-NEXT: fcmove %st(4), %st -; CHECK-NEXT: testq %rax, %rax -; CHECK-NEXT: fxch %st(3) -; CHECK-NEXT: fcmove %st(4), %st -; CHECK-NEXT: fstp %st(4) -; CHECK-NEXT: fldt {{[0-9]+}}(%rsp) -; CHECK-NEXT: fstpt 70(%rdi) -; CHECK-NEXT: fldt {{[0-9]+}}(%rsp) -; CHECK-NEXT: fstpt 50(%rdi) -; CHECK-NEXT: fldt {{[0-9]+}}(%rsp) -; CHECK-NEXT: fstpt 30(%rdi) -; CHECK-NEXT: fldt {{[0-9]+}}(%rsp) -; CHECK-NEXT: fstpt 10(%rdi) -; CHECK-NEXT: fxch %st(1) -; CHECK-NEXT: fadd %st, %st(0) -; CHECK-NEXT: fstpt 60(%rdi) -; CHECK-NEXT: fadd %st, %st(0) -; CHECK-NEXT: fstpt 40(%rdi) -; CHECK-NEXT: fadd %st, %st(0) -; CHECK-NEXT: fstpt 20(%rdi) -; CHECK-NEXT: fadd %st, %st(0) -; CHECK-NEXT: fstpt (%rdi) +; AVX512F-LABEL: test: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vmovq %xmm0, %rax +; AVX512F-NEXT: vpextrq $1, %xmm0, %rcx +; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX512F-NEXT: vmovq %xmm0, %rdx +; AVX512F-NEXT: vpextrq $1, %xmm0, %rsi +; AVX512F-NEXT: cmpq $3, %rsi +; AVX512F-NEXT: fld1 +; AVX512F-NEXT: fldz +; AVX512F-NEXT: fld %st(0) +; AVX512F-NEXT: fcmove %st(2), %st +; AVX512F-NEXT: cmpq $2, %rdx +; AVX512F-NEXT: fld %st(1) +; AVX512F-NEXT: fcmove %st(3), %st +; AVX512F-NEXT: cmpq $1, %rcx +; AVX512F-NEXT: fld %st(2) +; AVX512F-NEXT: fcmove %st(4), %st +; AVX512F-NEXT: testq %rax, %rax +; AVX512F-NEXT: fxch %st(3) +; AVX512F-NEXT: fcmove %st(4), %st +; AVX512F-NEXT: fstp %st(4) +; AVX512F-NEXT: fldt {{[0-9]+}}(%rsp) +; AVX512F-NEXT: fstpt 70(%rdi) +; AVX512F-NEXT: fldt {{[0-9]+}}(%rsp) +; AVX512F-NEXT: fstpt 50(%rdi) +; AVX512F-NEXT: fldt {{[0-9]+}}(%rsp) +; AVX512F-NEXT: fstpt 30(%rdi) +; AVX512F-NEXT: fldt {{[0-9]+}}(%rsp) +; AVX512F-NEXT: fstpt 10(%rdi) +; AVX512F-NEXT: fxch %st(1) +; AVX512F-NEXT: fadd %st, %st(0) +; AVX512F-NEXT: fstpt 60(%rdi) +; AVX512F-NEXT: fadd 
%st, %st(0) +; AVX512F-NEXT: fstpt 40(%rdi) +; AVX512F-NEXT: fadd %st, %st(0) +; AVX512F-NEXT: fstpt 20(%rdi) +; AVX512F-NEXT: fadd %st, %st(0) +; AVX512F-NEXT: fstpt (%rdi) +; +; AVX512VL-LABEL: test: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vpcmpeqq {{.*}}(%rip), %ymm0, %k0 +; AVX512VL-NEXT: kshiftrb $2, %k0, %k1 +; AVX512VL-NEXT: kshiftrb $1, %k0, %k2 +; AVX512VL-NEXT: kmovd %k0, %eax +; AVX512VL-NEXT: testb $1, %al +; AVX512VL-NEXT: fld1 +; AVX512VL-NEXT: fldz +; AVX512VL-NEXT: fld %st(0) +; AVX512VL-NEXT: fcmovne %st(2), %st +; AVX512VL-NEXT: kshiftrb $1, %k1, %k0 +; AVX512VL-NEXT: kmovd %k0, %eax +; AVX512VL-NEXT: testb $1, %al +; AVX512VL-NEXT: fld %st(1) +; AVX512VL-NEXT: fcmovne %st(3), %st +; AVX512VL-NEXT: kmovd %k1, %eax +; AVX512VL-NEXT: testb $1, %al +; AVX512VL-NEXT: fld %st(2) +; AVX512VL-NEXT: fcmovne %st(4), %st +; AVX512VL-NEXT: kmovd %k2, %eax +; AVX512VL-NEXT: testb $1, %al +; AVX512VL-NEXT: fxch %st(3) +; AVX512VL-NEXT: fcmovne %st(4), %st +; AVX512VL-NEXT: fstp %st(4) +; AVX512VL-NEXT: fldt {{[0-9]+}}(%rsp) +; AVX512VL-NEXT: fstpt 70(%rdi) +; AVX512VL-NEXT: fldt {{[0-9]+}}(%rsp) +; AVX512VL-NEXT: fstpt 50(%rdi) +; AVX512VL-NEXT: fldt {{[0-9]+}}(%rsp) +; AVX512VL-NEXT: fstpt 30(%rdi) +; AVX512VL-NEXT: fldt {{[0-9]+}}(%rsp) +; AVX512VL-NEXT: fstpt 10(%rdi) +; AVX512VL-NEXT: fxch %st(1) +; AVX512VL-NEXT: fadd %st, %st(0) +; AVX512VL-NEXT: fstpt (%rdi) +; AVX512VL-NEXT: fadd %st, %st(0) +; AVX512VL-NEXT: fstpt 60(%rdi) +; AVX512VL-NEXT: fadd %st, %st(0) +; AVX512VL-NEXT: fstpt 40(%rdi) +; AVX512VL-NEXT: fadd %st, %st(0) +; AVX512VL-NEXT: fstpt 20(%rdi) %1 = icmp eq <4 x i64> <i64 0, i64 1, i64 2, i64 3>, %a %2 = select <4 x i1> %1, <4 x x86_fp80> <x86_fp80 0xK3FFF8000000000000000, x86_fp80 0xK3FFF8000000000000000, x86_fp80 0xK3FFF8000000000000000, x86_fp80 0xK3FFF8000000000000000>, <4 x x86_fp80> zeroinitializer %3 = fadd <4 x x86_fp80> %2, %2 -- 2.40.0