From: Tim Renouf
Date: Thu, 16 May 2019 21:49:06 +0000 (+0000)
Subject: [CodeGen] Fixed de-optimization of legalize subvector extract
X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=6036d197a87b66b3e7771e6913e07ee9fe61e392;p=llvm

[CodeGen] Fixed de-optimization of legalize subvector extract

The recent introduction of v3i32 etc. as an MVT, and its use in AMDGPU
3-dword memory instructions, caused a de-optimization problem for code
with such a load that then bitcasts via a vector of i8: v12i8 is not an
MVT, so the bitcast is legalized by widening it.

This commit adds the ability to widen a bitcast using extract_subvector
on the result, so the value does not need to go via memory.

Differential Revision: https://reviews.llvm.org/D60457

Change-Id: Ie4abb7760547e54a2445961992eafc78e80d4b64
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@360942 91177308-0d34-0410-b5e6-96231b3b80d8
---

diff --git a/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp b/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
index 74b48a842e3..add97ec1057 100644
--- a/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
+++ b/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
@@ -4219,6 +4219,24 @@ SDValue DAGTypeLegalizer::WidenVecOp_BITCAST(SDNode *N) {
     }
   }
 
+  // Handle a case like bitcast v12i8 -> v3i32. Normally that would get widened
+  // to v16i8 -> v4i32, but for a target where v3i32 is legal but v12i8 is not,
+  // we end up here. Handling the case here with EXTRACT_SUBVECTOR avoids
+  // having to copy via memory.
+  if (VT.isVector()) {
+    EVT EltVT = VT.getVectorElementType();
+    unsigned EltSize = EltVT.getSizeInBits();
+    if (InWidenSize % EltSize == 0) {
+      unsigned NewNumElts = InWidenSize / EltSize;
+      EVT NewVT = EVT::getVectorVT(*DAG.getContext(), EltVT, NewNumElts);
+      if (TLI.isTypeLegal(NewVT)) {
+        SDValue BitOp = DAG.getNode(ISD::BITCAST, dl, NewVT, InOp);
+        return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, BitOp,
+            DAG.getConstant(0, dl, TLI.getVectorIdxTy(DAG.getDataLayout())));
+      }
+    }
+  }
+
   return CreateStackStoreLoad(InOp, VT);
 }
 
diff --git a/test/CodeGen/AMDGPU/extract_subvector_vec4_vec3.ll b/test/CodeGen/AMDGPU/extract_subvector_vec4_vec3.ll
new file mode 100644
index 00000000000..a39833455a1
--- /dev/null
+++ b/test/CodeGen/AMDGPU/extract_subvector_vec4_vec3.ll
@@ -0,0 +1,37 @@
+; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 < %s -stop-after=amdgpu-isel | FileCheck -check-prefix=GCN %s
+
+; We want to see a BUFFER_LOAD, some register shuffling, and a BUFFER_STORE.
+; Specifically, we do not want to see a BUFFER_STORE that says "store into
+; stack" in the middle.
+
+define amdgpu_hs void @main([0 x i8] addrspace(6)* inreg %arg) {
+  ; GCN-LABEL: name: main
+  ; GCN: bb.0.main_body:
+  ; GCN: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0 = S_MOV_B32 0
+  ; GCN: [[DEF:%[0-9]+]]:sreg_32_xm0 = IMPLICIT_DEF
+  ; GCN: [[COPY:%[0-9]+]]:vgpr_32 = COPY [[DEF]]
+  ; GCN: [[DEF1:%[0-9]+]]:sreg_128 = IMPLICIT_DEF
+  ; GCN: [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY]], [[DEF1]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom TargetCustom7, align 1, addrspace 4)
+  ; GCN: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX4_OFFEN]].sub2
+  ; GCN: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX4_OFFEN]].sub1
+  ; GCN: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX4_OFFEN]].sub0
+  ; GCN: [[REG_SEQUENCE:%[0-9]+]]:sgpr_96 = REG_SEQUENCE killed [[COPY3]], %subreg.sub0, killed [[COPY2]], %subreg.sub1, killed [[COPY1]], %subreg.sub2
+  ; GCN: [[COPY4:%[0-9]+]]:vreg_96 = COPY [[REG_SEQUENCE]]
+  ; GCN: [[DEF2:%[0-9]+]]:sreg_32_xm0 = IMPLICIT_DEF
+  ; GCN: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[DEF2]]
+  ; GCN: [[DEF3:%[0-9]+]]:sreg_128 = IMPLICIT_DEF
+  ; GCN: BUFFER_STORE_DWORDX3_OFFEN_exact killed [[COPY4]], [[COPY5]], [[DEF3]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 12 into custom TargetCustom7, align 1, addrspace 4)
+  ; GCN: S_ENDPGM 0
+main_body:
+  %tmp25 = call <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(<4 x i32> undef, i32 undef, i32 0, i32 0)
+  %tmp27 = bitcast <4 x float> %tmp25 to <16 x i8>
+  %tmp28 = shufflevector <16 x i8> %tmp27, <16 x i8> undef, <12 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
+  %tmp29 = bitcast <12 x i8> %tmp28 to <3 x i32>
+  call void @llvm.amdgcn.raw.buffer.store.v3i32(<3 x i32> %tmp29, <4 x i32> undef, i32 undef, i32 0, i32 0) #3
+  ret void
+}
+
+declare void @llvm.amdgcn.raw.buffer.store.v3i32(<3 x i32>, <4 x i32>, i32, i32, i32 immarg)
+declare <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(<4 x i32>, i32, i32, i32 immarg)
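For reference, a minimal IR sketch of the pattern the commit message describes, using an ordinary global load and store instead of the AMDGPU buffer intrinsics in the test above. This is illustrative only and not part of the patch; the function name, pointer arguments and alignments are invented. On a target where v3i32 is legal but v12i8 is not (such as the gfx900 run line above), the <12 x i8> to <3 x i32> bitcast is the operation that previously had to be legalized through a stack store and reload:

; Illustrative only: load four dwords, keep the first twelve bytes, and store
; them back as three dwords.
define void @bitcast_v12i8_example(<4 x i32> addrspace(1)* %src,
                                   <3 x i32> addrspace(1)* %dst) {
entry:
  %vec = load <4 x i32>, <4 x i32> addrspace(1)* %src, align 16
  %bytes = bitcast <4 x i32> %vec to <16 x i8>
  ; Keep only the low 12 bytes. v12i8 is not an MVT, so the operand of the
  ; following bitcast is widened back to v16i8 during type legalization.
  %sub = shufflevector <16 x i8> %bytes, <16 x i8> undef,
                       <12 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5,
                                   i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
  ; With this patch, the widened bitcast becomes
  ; (extract_subvector (bitcast v16i8 to v4i32), 0) instead of a stack
  ; store/load round trip.
  %dwords = bitcast <12 x i8> %sub to <3 x i32>
  store <3 x i32> %dwords, <3 x i32> addrspace(1)* %dst, align 4
  ret void
}

Compiling this with llc for the same gfx900 target should show no scratch (stack) traffic between the load and the store once the patch is applied, mirroring what the test above checks for the buffer intrinsics.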