From: Ayman Musa
Date: Mon, 26 Sep 2016 06:22:08 +0000 (+0000)
Subject: [X86][avx512] Fix bug in masked compress store.
X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=de3ef8ab93ce6b741df4307324b2e31ae5482fef;p=llvm

[X86][avx512] Fix bug in masked compress store.

The masked compress-store intrinsics were lowered to an X86compress node
followed by a store of the full vector, which also writes the lanes past
the compressed elements. Give MaskedStoreSDNode an IsCompressing flag and
lower the intrinsics to a compressing masked store instead, so that only
the active elements reach memory.

Differential Revision: https://reviews.llvm.org/D23984

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@282381 91177308-0d34-0410-b5e6-96231b3b80d8
---

diff --git a/include/llvm/CodeGen/SelectionDAG.h b/include/llvm/CodeGen/SelectionDAG.h
index 281892d5307..87c0b97691d 100644
--- a/include/llvm/CodeGen/SelectionDAG.h
+++ b/include/llvm/CodeGen/SelectionDAG.h
@@ -968,7 +968,8 @@ public:
                           MachineMemOperand *MMO, ISD::LoadExtType);
   SDValue getMaskedStore(SDValue Chain, const SDLoc &dl, SDValue Val,
                          SDValue Ptr, SDValue Mask, EVT MemVT,
-                         MachineMemOperand *MMO, bool IsTrunc);
+                         MachineMemOperand *MMO, bool IsTrunc,
+                         bool isCompressing = false);
   SDValue getMaskedGather(SDVTList VTs, EVT VT, const SDLoc &dl,
                           ArrayRef<SDValue> Ops, MachineMemOperand *MMO);
   SDValue getMaskedScatter(SDVTList VTs, EVT VT, const SDLoc &dl,
diff --git a/include/llvm/CodeGen/SelectionDAGNodes.h b/include/llvm/CodeGen/SelectionDAGNodes.h
index 2e35a5f807e..fb5a01fade8 100644
--- a/include/llvm/CodeGen/SelectionDAGNodes.h
+++ b/include/llvm/CodeGen/SelectionDAGNodes.h
@@ -453,6 +453,7 @@ protected:
     uint16_t : NumLSBaseSDNodeBits;
 
     uint16_t IsTruncating : 1;
+    uint16_t IsCompressing : 1;
   };
 
   union {
@@ -1959,15 +1960,23 @@ class MaskedStoreSDNode : public MaskedLoadStoreSDNode {
 public:
   friend class SelectionDAG;
   MaskedStoreSDNode(unsigned Order, const DebugLoc &dl, SDVTList VTs,
-                    bool isTrunc, EVT MemVT, MachineMemOperand *MMO)
+                    bool isTrunc, bool isCompressing, EVT MemVT,
+                    MachineMemOperand *MMO)
       : MaskedLoadStoreSDNode(ISD::MSTORE, Order, dl, VTs, MemVT, MMO) {
     StoreSDNodeBits.IsTruncating = isTrunc;
+    StoreSDNodeBits.IsCompressing = isCompressing;
   }
 
   /// Return true if the op does a truncation before store.
   /// For integers this is the same as doing a TRUNCATE and storing the result.
   /// For floats, it is the same as doing an FP_ROUND and storing the result.
   bool isTruncatingStore() const { return StoreSDNodeBits.IsTruncating; }
 
+  /// Returns true if the op compresses the vector before storing.
+  /// The node contiguously stores the active elements (integers or floats)
+  /// in src (those with their respective bit set in writemask k) to unaligned
+  /// memory at base_addr.
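+  ///
+  /// For example, given src = <a, b, c, d> and writemask k = 0b0101, the
+  /// active elements a and c are written to base_addr and
+  /// base_addr + sizeof(element); memory past the stored elements is left
+  /// untouched.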
+  bool isCompressingStore() const { return StoreSDNodeBits.IsCompressing; }
+
   const SDValue &getValue() const { return getOperand(3); }
 
   static bool classof(const SDNode *N) {
diff --git a/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
index c3d5bee091e..e9680ff521d 100644
--- a/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ b/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -5340,7 +5340,7 @@ SDValue SelectionDAG::getMaskedLoad(EVT VT, const SDLoc &dl, SDValue Chain,
 SDValue SelectionDAG::getMaskedStore(SDValue Chain, const SDLoc &dl,
                                      SDValue Val, SDValue Ptr, SDValue Mask,
                                      EVT MemVT, MachineMemOperand *MMO,
-                                     bool isTrunc) {
+                                     bool isTrunc, bool isCompress) {
   assert(Chain.getValueType() == MVT::Other && "Invalid chain type");
   EVT VT = Val.getValueType();
@@ -5350,7 +5350,7 @@ SDValue SelectionDAG::getMaskedStore(SDValue Chain, const SDLoc &dl,
   AddNodeIDNode(ID, ISD::MSTORE, VTs, Ops);
   ID.AddInteger(VT.getRawBits());
   ID.AddInteger(getSyntheticNodeSubclassData<MaskedStoreSDNode>(
-      dl.getIROrder(), VTs, isTrunc, MemVT, MMO));
+      dl.getIROrder(), VTs, isTrunc, isCompress, MemVT, MMO));
   ID.AddInteger(MMO->getPointerInfo().getAddrSpace());
   void *IP = nullptr;
   if (SDNode *E = FindNodeOrInsertPos(ID, dl, IP)) {
@@ -5358,7 +5358,7 @@ SDValue SelectionDAG::getMaskedStore(SDValue Chain, const SDLoc &dl,
     return SDValue(E, 0);
   }
   auto *N = newSDNode<MaskedStoreSDNode>(dl.getIROrder(), dl.getDebugLoc(),
-                                         VTs, isTrunc, MemVT, MMO);
+                                         VTs, isTrunc, isCompress, MemVT, MMO);
   createOperands(N, Ops);
 
   CSEMap.InsertNode(N, IP);
diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index f8384e806e1..71717d04269 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -18818,11 +18818,11 @@ static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget &Subtarget,
       return DAG.getStore(Chain, dl, DataToCompress, Addr,
                           MemIntr->getMemOperand());
 
-    SDValue Compressed =
-      getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, DataToCompress),
-                           Mask, DAG.getUNDEF(VT), Subtarget, DAG);
-    return DAG.getStore(Chain, dl, Compressed, Addr,
-                        MemIntr->getMemOperand());
+    MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
+    SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
+
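+    // A compressing masked store writes only the DataToCompress elements
+    // selected by VMask, packed contiguously starting at Addr; the trailing
+    // arguments are IsTrunc = false and isCompressing = true.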
+    return DAG.getMaskedStore(Chain, dl, DataToCompress, Addr, VMask, VT,
+                              MemIntr->getMemOperand(), false, true);
   }
   case TRUNCATE_TO_MEM_VI8:
   case TRUNCATE_TO_MEM_VI16:
diff --git a/lib/Target/X86/X86InstrAVX512.td b/lib/Target/X86/X86InstrAVX512.td
index 6abb2bab609..3d3d1e26538 100644
--- a/lib/Target/X86/X86InstrAVX512.td
+++ b/lib/Target/X86/X86InstrAVX512.td
@@ -7452,7 +7452,7 @@ defm VPMOVQ2M : avx512_convert_vector_to_mask<0x39, "vpmovq2m",
 // AVX-512 - COMPRESS and EXPAND
 //
 
-multiclass compress_by_vec_width<bits<8> opc, X86VectorVTInfo _,
+multiclass compress_by_vec_width_common<bits<8> opc, X86VectorVTInfo _,
                                  string OpcodeStr> {
   defm rr : AVX512_maskable<opc, MRMDestReg, _, (outs _.RC:$dst),
               (ins _.RC:$src1), OpcodeStr, "$src1", "$src1",
               (_.VT (X86compress _.RC:$src1))>, AVX5128IBase;
 
   def mrk : AVX5128I<opc, MRMDestMem, (outs),
               (ins _.MemOp:$dst, _.KRCWM:$mask, _.RC:$src),
               OpcodeStr # "\t{$src, $dst {${mask}}|$dst {${mask}}, $src}",
-              [(store (_.VT (vselect _.KRCWM:$mask,
-                             (_.VT (X86compress _.RC:$src)), undef)),
-                addr:$dst)]>, EVEX_K, EVEX_CD8<_.EltSize, CD8VT1>;
+              []>, EVEX_K, EVEX_CD8<_.EltSize, CD8VT1>;
 }
 
+multiclass compress_by_vec_width_lowering<X86VectorVTInfo _> {
+
+  def : Pat<(X86mCompressingStore addr:$dst, _.KRCWM:$mask,
+                                  (_.VT _.RC:$src)),
+            (!cast<Instruction>(NAME#_.ZSuffix##mrk)
+                            addr:$dst, _.KRCWM:$mask, _.RC:$src)>;
+}
+
 multiclass compress_by_elt_width<bits<8> opc, string OpcodeStr,
                                  AVX512VLVectorVTInfo VTInfo> {
-  defm Z : compress_by_vec_width<opc, VTInfo.info512, OpcodeStr>, EVEX_V512;
+  defm Z : compress_by_vec_width_common<opc, VTInfo.info512, OpcodeStr>,
+           compress_by_vec_width_lowering<VTInfo.info512>, EVEX_V512;
 
   let Predicates = [HasVLX] in {
-    defm Z256 : compress_by_vec_width<opc, VTInfo.info256, OpcodeStr>, EVEX_V256;
-    defm Z128 : compress_by_vec_width<opc, VTInfo.info128, OpcodeStr>, EVEX_V128;
+    defm Z256 : compress_by_vec_width_common<opc, VTInfo.info256, OpcodeStr>,
+                compress_by_vec_width_lowering<VTInfo.info256>, EVEX_V256;
+    defm Z128 : compress_by_vec_width_common<opc, VTInfo.info128, OpcodeStr>,
+                compress_by_vec_width_lowering<VTInfo.info128>, EVEX_V128;
   }
 }
diff --git a/lib/Target/X86/X86InstrFragmentsSIMD.td b/lib/Target/X86/X86InstrFragmentsSIMD.td
index 13879c5efbb..744d0b3ea10 100644
--- a/lib/Target/X86/X86InstrFragmentsSIMD.td
+++ b/lib/Target/X86/X86InstrFragmentsSIMD.td
@@ -959,7 +959,8 @@ def masked_load_unaligned : PatFrag<(ops node:$src1, node:$src2, node:$src3),
 // do not support vector types (llvm-tblgen will fail).
 def X86mstore : PatFrag<(ops node:$src1, node:$src2, node:$src3),
                         (masked_store node:$src1, node:$src2, node:$src3), [{
-  return !cast<MaskedStoreSDNode>(N)->isTruncatingStore();
+  return (!cast<MaskedStoreSDNode>(N)->isTruncatingStore()) &&
+         (!cast<MaskedStoreSDNode>(N)->isCompressingStore());
 }]>;
 
 def masked_store_aligned128 : PatFrag<(ops node:$src1, node:$src2, node:$src3),
@@ -988,6 +989,11 @@ def masked_store_unaligned : PatFrag<(ops node:$src1, node:$src2, node:$src3),
   return isa<MaskedStoreSDNode>(N);
 }]>;
 
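+// A masked_store whose IsCompressing flag is set; the patterns added in
+// X86InstrAVX512.td select it to the AVX-512 compress-store instructions.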
+def X86mCompressingStore : PatFrag<(ops node:$src1, node:$src2, node:$src3),
+                         (masked_store node:$src1, node:$src2, node:$src3), [{
+  return cast<MaskedStoreSDNode>(N)->isCompressingStore();
+}]>;
+
 // masked truncstore fragments
 // X86mtruncstore can't be implemented in core DAG files because some targets
 // doesn't support vector type ( llvm-tblgen will fail)
diff --git a/test/CodeGen/X86/avx512vl-intrinsics.ll b/test/CodeGen/X86/avx512vl-intrinsics.ll
index ee53d46cacf..0ee213d8ba2 100644
--- a/test/CodeGen/X86/avx512vl-intrinsics.ll
+++ b/test/CodeGen/X86/avx512vl-intrinsics.ll
@@ -893,6 +893,37 @@ define <4 x i32> @compr10(<4 x i32> %data, i8 %mask) {
   ret <4 x i32> %res
 }
 
+
+@xmm = common global <4 x i32> zeroinitializer, align 16
+@k8 = common global i8 0, align 1
+
+define i32 @compr11() {
+; CHECK-LABEL: compr11:
+; CHECK:       ## BB#0: ## %entry
+; CHECK-NEXT:    movq _xmm@{{.*}}(%rip), %rax ## encoding: [0x48,0x8b,0x05,A,A,A,A]
+; CHECK-NEXT:    ## fixup A - offset: 3, value: _xmm@GOTPCREL-4, kind: reloc_riprel_4byte_movq_load
+; CHECK-NEXT:    vmovdqa32 (%rax), %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0x6f,0x00]
+; CHECK-NEXT:    movq _k8@{{.*}}(%rip), %rax ## encoding: [0x48,0x8b,0x05,A,A,A,A]
+; CHECK-NEXT:    ## fixup A - offset: 3, value: _k8@GOTPCREL-4, kind: reloc_riprel_4byte_movq_load
+; CHECK-NEXT:    movzbl (%rax), %eax ## encoding: [0x0f,0xb6,0x00]
+; CHECK-NEXT:    kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8]
+; CHECK-NEXT:    vpcompressd %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0x89,0x8b,0xc0]
+; CHECK-NEXT:    vpxord %xmm1, %xmm1, %xmm1 ## encoding: [0x62,0xf1,0x75,0x08,0xef,0xc9]
+; CHECK-NEXT:    vmovdqa32 %xmm0, -{{[0-9]+}}(%rsp) ## encoding: [0x62,0xf1,0x7d,0x08,0x7f,0x84,0x24,0xd8,0xff,0xff,0xff]
+; CHECK-NEXT:    vmovdqa32 %xmm1, -{{[0-9]+}}(%rsp) ## encoding: [0x62,0xf1,0x7d,0x08,0x7f,0x8c,0x24,0xe8,0xff,0xff,0xff]
+; CHECK-NEXT:    xorl %eax, %eax ## encoding: [0x31,0xc0]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+entry:
+  %.compoundliteral = alloca <2 x i64>, align 16
+  %res = alloca <4 x i32>, align 16
+  %a0 = load <4 x i32>, <4 x i32>* @xmm, align 16
+  %a2 = load i8, i8* @k8, align 1
+  %a21 = call <4 x i32> @llvm.x86.avx512.mask.compress.d.128(<4 x i32> %a0, <4 x i32> zeroinitializer, i8 %a2) #2
+  store volatile <4 x i32> %a21, <4 x i32>* %res, align 16
+  store <2 x i64> zeroinitializer, <2 x i64>* %.compoundliteral, align 16
+  ret i32 0
+}
+
 declare <4 x i32> @llvm.x86.avx512.mask.compress.d.128(<4 x i32> %data, <4 x i32> %src0, i8 %mask)
 
 ; Expand
@@ -5219,9 +5250,9 @@ define <8 x i32>@test_int_x86_avx512_mask_psrav8_si_const() {
 ; CHECK:       ## BB#0:
 ; CHECK-NEXT:    vmovdqa32 {{.*#+}} ymm0 = [2,9,4294967284,23,4294967270,37,4294967256,51]
 ; CHECK-NEXT:    ## encoding: [0x62,0xf1,0x7d,0x28,0x6f,0x05,A,A,A,A]
-; CHECK-NEXT:    ## fixup A - offset: 6, value: LCPI308_0-4, kind: reloc_riprel_4byte
+; CHECK-NEXT:    ## fixup A - offset: 6, value: LCPI309_0-4, kind: reloc_riprel_4byte
 ; CHECK-NEXT:    vpsravd {{.*}}(%rip), %ymm0, %ymm0 ## encoding: [0x62,0xf2,0x7d,0x28,0x46,0x05,A,A,A,A]
-; CHECK-NEXT:    ## fixup A - offset: 6, value: LCPI308_1-4, kind: reloc_riprel_4byte
+; CHECK-NEXT:    ## fixup A - offset: 6, value: LCPI309_1-4, kind: reloc_riprel_4byte
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
   %res = call <8 x i32> @llvm.x86.avx512.mask.psrav8.si(<8 x i32> <i32 2, i32 9, i32 -12, i32 23, i32 -26, i32 37, i32 -40, i32 51>, <8 x i32> <i32 1, i32 18, i32 35, i32 52, i32 69, i32 100, i32 32, i32 2>, <8 x i32> zeroinitializer, i8 -1)
   ret <8 x i32> %res
@@ -5252,9 +5283,9 @@ define <2 x i64>@test_int_x86_avx512_mask_psrav_q_128_const(i8 %x3) {
 ; CHECK:       ## BB#0:
 ; CHECK-NEXT:    vmovdqa64 {{.*#+}} xmm0 = [2,18446744073709551607]
 ; CHECK-NEXT:    ## encoding: [0x62,0xf1,0xfd,0x08,0x6f,0x05,A,A,A,A]
-; CHECK-NEXT:    ## fixup A - offset: 6, value: LCPI310_0-4, kind: reloc_riprel_4byte
+; CHECK-NEXT:    ## fixup A - offset: 6, value: LCPI311_0-4, kind: reloc_riprel_4byte
 ; CHECK-NEXT:    vpsravq {{.*}}(%rip), %xmm0, %xmm0 ## encoding: [0x62,0xf2,0xfd,0x08,0x46,0x05,A,A,A,A]
-; CHECK-NEXT:    ## fixup A - offset: 6, value: LCPI310_1-4, kind: reloc_riprel_4byte
+; CHECK-NEXT:    ## fixup A - offset: 6, value: LCPI311_1-4, kind: reloc_riprel_4byte
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
   %res = call <2 x i64> @llvm.x86.avx512.mask.psrav.q.128(<2 x i64> <i64 2, i64 -9>, <2 x i64> <i64 1, i64 90>, <2 x i64> zeroinitializer, i8 -1)
   ret <2 x i64> %res
 }