From: Matt Arsenault
Date: Mon, 5 Aug 2019 15:59:07 +0000 (+0000)
Subject: AMDGPU: Correct behavior of f16 buffer loads
X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=8f6e6a59983df96724541ae17e9ddcef06b85a27;p=llvm

AMDGPU: Correct behavior of f16 buffer loads

Don't assume format loads for f16. Also fixes support for targets
without legal i16.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@367879 91177308-0d34-0410-b5e6-96231b3b80d8
---

diff --git a/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp b/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp
index b4849b2881e..8029b8968b3 100644
--- a/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp
+++ b/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp
@@ -2002,6 +2002,12 @@ void DAGTypeLegalizer::PromoteFloatResult(SDNode *N, unsigned ResNo) {
              dbgs() << "\n");
   SDValue R = SDValue();
 
+  // See if the target wants to custom expand this node.
+  if (CustomLowerNode(N, N->getValueType(ResNo), true)) {
+    LLVM_DEBUG(dbgs() << "Node has been custom expanded, done\n");
+    return;
+  }
+
   switch (N->getOpcode()) {
     // These opcodes cannot appear if promotion of FP16 is done in the backend
     // instead of Clang
diff --git a/lib/Target/AMDGPU/BUFInstructions.td b/lib/Target/AMDGPU/BUFInstructions.td
index 504d391ad45..e3b55c2223e 100644
--- a/lib/Target/AMDGPU/BUFInstructions.td
+++ b/lib/Target/AMDGPU/BUFInstructions.td
@@ -1182,8 +1182,12 @@ let SubtargetPredicate = HasPackedD16VMem in {
 
 defm : MUBUF_LoadIntrinsicPat<SIbuffer_load, f32, "BUFFER_LOAD_DWORD">;
 defm : MUBUF_LoadIntrinsicPat<SIbuffer_load, i32, "BUFFER_LOAD_DWORD">;
+defm : MUBUF_LoadIntrinsicPat<SIbuffer_load, v2f16, "BUFFER_LOAD_DWORD">;
+defm : MUBUF_LoadIntrinsicPat<SIbuffer_load, v2i16, "BUFFER_LOAD_DWORD">;
 defm : MUBUF_LoadIntrinsicPat<SIbuffer_load, v2f32, "BUFFER_LOAD_DWORDX2">;
 defm : MUBUF_LoadIntrinsicPat<SIbuffer_load, v2i32, "BUFFER_LOAD_DWORDX2">;
+defm : MUBUF_LoadIntrinsicPat<SIbuffer_load, v4f16, "BUFFER_LOAD_DWORDX2">;
+defm : MUBUF_LoadIntrinsicPat<SIbuffer_load, v4i16, "BUFFER_LOAD_DWORDX2">;
 defm : MUBUF_LoadIntrinsicPat<SIbuffer_load, v3f32, "BUFFER_LOAD_DWORDX3">;
 defm : MUBUF_LoadIntrinsicPat<SIbuffer_load, v3i32, "BUFFER_LOAD_DWORDX3">;
 defm : MUBUF_LoadIntrinsicPat<SIbuffer_load, v4f32, "BUFFER_LOAD_DWORDX4">;
diff --git a/lib/Target/AMDGPU/SIISelLowering.cpp b/lib/Target/AMDGPU/SIISelLowering.cpp
index 65568667b4d..f6d80eb4aa1 100644
--- a/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -675,9 +675,12 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
   setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::v2f16, Custom);
 
   setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::v2f16, Custom);
+  setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::v2i16, Custom);
   setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::v4f16, Custom);
+  setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::v4i16, Custom);
   setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::v8f16, Custom);
   setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom);
+  setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::f16, Custom);
   setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::i16, Custom);
   setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::i8, Custom);
 
@@ -4119,6 +4122,41 @@ SDValue SITargetLowering::adjustLoadValueType(unsigned Opcode,
   return DAG.getMergeValues({ Adjusted, Load.getValue(1) }, DL);
 }
 
+SDValue SITargetLowering::lowerIntrinsicLoad(MemSDNode *M, bool IsFormat,
+                                             SelectionDAG &DAG,
+                                             ArrayRef<SDValue> Ops) const {
+  SDLoc DL(M);
+  EVT LoadVT = M->getValueType(0);
+  EVT EltType = LoadVT.getScalarType();
+  EVT IntVT = LoadVT.changeTypeToInteger();
+
+  bool IsD16 = IsFormat && (EltType.getSizeInBits() == 16);
+
+  unsigned Opc =
+      IsFormat ? AMDGPUISD::BUFFER_LOAD_FORMAT : AMDGPUISD::BUFFER_LOAD;
+
+  if (IsD16) {
+    return adjustLoadValueType(AMDGPUISD::BUFFER_LOAD_FORMAT_D16, M, DAG, Ops);
+  }
+
+  // Handle BUFFER_LOAD_BYTE/UBYTE/SHORT/USHORT overloaded intrinsics
+  if (!IsD16 && !LoadVT.isVector() && EltType.getSizeInBits() < 32)
+    return handleByteShortBufferLoads(DAG, LoadVT, DL, Ops, M);
+
+  if (isTypeLegal(LoadVT)) {
+    return getMemIntrinsicNode(Opc, DL, M->getVTList(), Ops, IntVT,
+                               M->getMemOperand(), DAG);
+  }
+
+  EVT CastVT = getEquivalentMemType(*DAG.getContext(), LoadVT);
+  SDVTList VTList = DAG.getVTList(CastVT, MVT::Other);
+  SDValue MemNode = getMemIntrinsicNode(Opc, DL, VTList, Ops, CastVT,
+                                        M->getMemOperand(), DAG);
+  return DAG.getMergeValues(
+      {DAG.getNode(ISD::BITCAST, DL, LoadVT, MemNode), MemNode.getValue(1)},
+      DL);
+}
+
 static SDValue lowerICMPIntrinsic(const SITargetLowering &TLI, SDNode *N,
                                   SelectionDAG &DAG) {
   EVT VT = N->getValueType(0);
@@ -4245,8 +4283,14 @@ void SITargetLowering::ReplaceNodeResults(SDNode *N,
   }
   case ISD::INTRINSIC_W_CHAIN: {
     if (SDValue Res = LowerINTRINSIC_W_CHAIN(SDValue(N, 0), DAG)) {
-      Results.push_back(Res);
-      Results.push_back(Res.getValue(1));
+      if (Res.getOpcode() == ISD::MERGE_VALUES) {
+        // FIXME: Hacky
+        Results.push_back(Res.getOperand(0));
+        Results.push_back(Res.getOperand(1));
+      } else {
+        Results.push_back(Res);
+        Results.push_back(Res.getValue(1));
+      }
       return;
     }
 
@@ -6194,6 +6238,8 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
   }
   case Intrinsic::amdgcn_raw_buffer_load:
   case Intrinsic::amdgcn_raw_buffer_load_format: {
+    const bool IsFormat = IntrID == Intrinsic::amdgcn_raw_buffer_load_format;
+
     auto Offsets = splitBufferOffsets(Op.getOperand(3), DAG);
     SDValue Ops[] = {
       Op.getOperand(0), // Chain
@@ -6206,28 +6252,12 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
       DAG.getConstant(0, DL, MVT::i1), // idxen
     };
 
-    unsigned Opc = (IntrID == Intrinsic::amdgcn_raw_buffer_load) ?
-        AMDGPUISD::BUFFER_LOAD : AMDGPUISD::BUFFER_LOAD_FORMAT;
-
-    EVT VT = Op.getValueType();
-    EVT IntVT = VT.changeTypeToInteger();
-    auto *M = cast<MemSDNode>(Op);
-    EVT LoadVT = Op.getValueType();
-
-    if (LoadVT.getScalarType() == MVT::f16)
-      return adjustLoadValueType(AMDGPUISD::BUFFER_LOAD_FORMAT_D16,
-                                 M, DAG, Ops);
-
-    // Handle BUFFER_LOAD_BYTE/UBYTE/SHORT/USHORT overloaded intrinsics
-    if (LoadVT.getScalarType() == MVT::i8 ||
-        LoadVT.getScalarType() == MVT::i16)
-      return handleByteShortBufferLoads(DAG, LoadVT, DL, Ops, M);
-
-    return getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops, IntVT,
-                               M->getMemOperand(), DAG);
+    return lowerIntrinsicLoad(cast<MemSDNode>(Op), IsFormat, DAG, Ops);
   }
   case Intrinsic::amdgcn_struct_buffer_load:
   case Intrinsic::amdgcn_struct_buffer_load_format: {
+    const bool IsFormat = IntrID == Intrinsic::amdgcn_struct_buffer_load_format;
+
     auto Offsets = splitBufferOffsets(Op.getOperand(4), DAG);
     SDValue Ops[] = {
       Op.getOperand(0), // Chain
@@ -6240,25 +6270,7 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
       DAG.getConstant(1, DL, MVT::i1), // idxen
     };
 
-    unsigned Opc = (IntrID == Intrinsic::amdgcn_struct_buffer_load) ?
-        AMDGPUISD::BUFFER_LOAD : AMDGPUISD::BUFFER_LOAD_FORMAT;
-
-    EVT VT = Op.getValueType();
-    EVT IntVT = VT.changeTypeToInteger();
-    auto *M = cast<MemSDNode>(Op);
-    EVT LoadVT = Op.getValueType();
-
-    if (LoadVT.getScalarType() == MVT::f16)
-      return adjustLoadValueType(AMDGPUISD::BUFFER_LOAD_FORMAT_D16,
-                                 M, DAG, Ops);
-
-    // Handle BUFFER_LOAD_BYTE/UBYTE/SHORT/USHORT overloaded intrinsics
-    if (LoadVT.getScalarType() == MVT::i8 ||
-        LoadVT.getScalarType() == MVT::i16)
-      return handleByteShortBufferLoads(DAG, LoadVT, DL, Ops, M);
-
-    return getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops, IntVT,
-                               M->getMemOperand(), DAG);
+    return lowerIntrinsicLoad(cast<MemSDNode>(Op), IsFormat, DAG, Ops);
   }
   case Intrinsic::amdgcn_tbuffer_load: {
     MemSDNode *M = cast<MemSDNode>(Op);
@@ -7123,9 +7135,10 @@ SDValue SITargetLowering::handleByteShortBufferLoads(SelectionDAG &DAG,
   SDValue BufferLoad = DAG.getMemIntrinsicNode(Opc, DL, ResList,
                                                Ops, IntVT,
                                                M->getMemOperand());
-  SDValue BufferLoadTrunc = DAG.getNode(ISD::TRUNCATE, DL,
-                                        LoadVT.getScalarType(), BufferLoad);
-  return DAG.getMergeValues({BufferLoadTrunc, BufferLoad.getValue(1)}, DL);
+  SDValue LoadVal = DAG.getNode(ISD::TRUNCATE, DL, IntVT, BufferLoad);
+  LoadVal = DAG.getNode(ISD::BITCAST, DL, LoadVT, LoadVal);
+
+  return DAG.getMergeValues({LoadVal, BufferLoad.getValue(1)}, DL);
 }
 
 // Handle 8 bit and 16 bit buffer stores
diff --git a/lib/Target/AMDGPU/SIISelLowering.h b/lib/Target/AMDGPU/SIISelLowering.h
index 410460226a2..4e98b2aab3a 100644
--- a/lib/Target/AMDGPU/SIISelLowering.h
+++ b/lib/Target/AMDGPU/SIISelLowering.h
@@ -94,6 +94,9 @@ private:
                               SelectionDAG &DAG, ArrayRef<SDValue> Ops,
                               bool IsIntrinsic = false) const;
 
+  SDValue lowerIntrinsicLoad(MemSDNode *M, bool IsFormat, SelectionDAG &DAG,
+                             ArrayRef<SDValue> Ops) const;
+
   // Call DAG.getMemIntrinsicNode for a load, but first widen a dwordx3 type to
   // dwordx4 if on SI.
   SDValue getMemIntrinsicNode(unsigned Opcode, const SDLoc &DL, SDVTList VTList,
diff --git a/lib/Target/ARM/ARMISelLowering.cpp b/lib/Target/ARM/ARMISelLowering.cpp
index 7f287b41416..cce131b9bcd 100644
--- a/lib/Target/ARM/ARMISelLowering.cpp
+++ b/lib/Target/ARM/ARMISelLowering.cpp
@@ -919,9 +919,10 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM,
     setOperationAction(ISD::FP_ROUND, MVT::f32, Custom);
   }
 
-  if (!Subtarget->hasFP64() || !Subtarget->hasFPARMv8Base()){
+  if (!Subtarget->hasFP64() || !Subtarget->hasFPARMv8Base()) {
     setOperationAction(ISD::FP_EXTEND, MVT::f64, Custom);
-    setOperationAction(ISD::FP_ROUND, MVT::f16, Custom);
+    if (Subtarget->hasFullFP16())
+      setOperationAction(ISD::FP_ROUND, MVT::f16, Custom);
   }
 
   if (!Subtarget->hasFP16())
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.load.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.load.ll
index b241f8b98e0..958a72566b5 100644
--- a/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.load.ll
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.load.ll
@@ -298,13 +298,13 @@ main_body:
   ret float %val
 }
 
-;CHECK-LABEL: {{^}}raw_buffer_load_ushort:
+;CHECK-LABEL: {{^}}raw_buffer_load_i16:
 ;CHECK-NEXT: %bb.
 ;CHECK-NEXT: buffer_load_ushort v{{[0-9]}}, off, s[0:3], 0
 ;CHECK: s_waitcnt vmcnt(0)
 ;CHECK-NEXT: v_cvt_f32_u32_e32 v0, v0
 ;CHECK-NEXT: ; return to shader part epilog
-define amdgpu_ps float @raw_buffer_load_ushort(<4 x i32> inreg %rsrc) {
+define amdgpu_ps float @raw_buffer_load_i16(<4 x i32> inreg %rsrc) {
 main_body:
   %tmp = call i16 @llvm.amdgcn.raw.buffer.load.i16(<4 x i32> %rsrc, i32 0, i32 0, i32 0)
   %tmp2 = zext i16 %tmp to i32
@@ -340,6 +340,66 @@ main_body:
   ret float %val
 }
 
+;CHECK-LABEL: {{^}}raw_buffer_load_f16:
+;CHECK-NEXT: %bb.
+;CHECK-NEXT: buffer_load_ushort [[VAL:v[0-9]+]], off, s[0:3], 0
+;CHECK: s_waitcnt vmcnt(0)
+;CHECK: ds_write_b16 v0, [[VAL]]
+define amdgpu_ps void @raw_buffer_load_f16(<4 x i32> inreg %rsrc, half addrspace(3)* %ptr) {
+main_body:
+  %val = call half @llvm.amdgcn.raw.buffer.load.f16(<4 x i32> %rsrc, i32 0, i32 0, i32 0)
+  store half %val, half addrspace(3)* %ptr
+  ret void
+}
+
+;CHECK-LABEL: {{^}}raw_buffer_load_v2f16:
+;CHECK-NEXT: %bb.
+;CHECK-NEXT: buffer_load_dword [[VAL:v[0-9]+]], off, s[0:3], 0
+;CHECK: s_waitcnt vmcnt(0)
+;CHECK: ds_write_b32 v0, [[VAL]]
+define amdgpu_ps void @raw_buffer_load_v2f16(<4 x i32> inreg %rsrc, <2 x half> addrspace(3)* %ptr) {
+main_body:
+  %val = call <2 x half> @llvm.amdgcn.raw.buffer.load.v2f16(<4 x i32> %rsrc, i32 0, i32 0, i32 0)
+  store <2 x half> %val, <2 x half> addrspace(3)* %ptr
+  ret void
+}
+
+;CHECK-LABEL: {{^}}raw_buffer_load_v4f16:
+;CHECK-NEXT: %bb.
+;CHECK-NEXT: buffer_load_dwordx2 [[VAL:v\[[0-9]+:[0-9]+\]]], off, s[0:3], 0
+;CHECK: s_waitcnt vmcnt(0)
+;CHECK: ds_write_b64 v0, [[VAL]]
+define amdgpu_ps void @raw_buffer_load_v4f16(<4 x i32> inreg %rsrc, <4 x half> addrspace(3)* %ptr) {
+main_body:
+  %val = call <4 x half> @llvm.amdgcn.raw.buffer.load.v4f16(<4 x i32> %rsrc, i32 0, i32 0, i32 0)
+  store <4 x half> %val, <4 x half> addrspace(3)* %ptr
+  ret void
+}
+
+;CHECK-LABEL: {{^}}raw_buffer_load_v2i16:
+;CHECK-NEXT: %bb.
+;CHECK-NEXT: buffer_load_dword [[VAL:v[0-9]+]], off, s[0:3], 0
+;CHECK: s_waitcnt vmcnt(0)
+;CHECK: ds_write_b32 v0, [[VAL]]
+define amdgpu_ps void @raw_buffer_load_v2i16(<4 x i32> inreg %rsrc, <2 x i16> addrspace(3)* %ptr) {
+main_body:
+  %val = call <2 x i16> @llvm.amdgcn.raw.buffer.load.v2i16(<4 x i32> %rsrc, i32 0, i32 0, i32 0)
+  store <2 x i16> %val, <2 x i16> addrspace(3)* %ptr
+  ret void
+}
+
+;CHECK-LABEL: {{^}}raw_buffer_load_v4i16:
+;CHECK-NEXT: %bb.
+;CHECK-NEXT: buffer_load_dwordx2 [[VAL:v\[[0-9]+:[0-9]+\]]], off, s[0:3], 0
+;CHECK: s_waitcnt vmcnt(0)
+;CHECK: ds_write_b64 v0, [[VAL]]
+define amdgpu_ps void @raw_buffer_load_v4i16(<4 x i32> inreg %rsrc, <4 x i16> addrspace(3)* %ptr) {
+main_body:
+  %val = call <4 x i16> @llvm.amdgcn.raw.buffer.load.v4i16(<4 x i32> %rsrc, i32 0, i32 0, i32 0)
+  store <4 x i16> %val, <4 x i16> addrspace(3)* %ptr
+  ret void
+}
+
 declare float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32>, i32, i32, i32) #0
 declare <2 x float> @llvm.amdgcn.raw.buffer.load.v2f32(<4 x i32>, i32, i32, i32) #0
 declare <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(<4 x i32>, i32, i32, i32) #0
@@ -349,5 +409,10 @@ declare <4 x i32> @llvm.amdgcn.raw.buffer.load.v4i32(<4 x i32>, i32, i32, i32) #0
 declare void @llvm.amdgcn.exp.f32(i32, i32, float, float, float, float, i1, i1) #0
 declare i8 @llvm.amdgcn.raw.buffer.load.i8(<4 x i32>, i32, i32, i32) #0
 declare i16 @llvm.amdgcn.raw.buffer.load.i16(<4 x i32>, i32, i32, i32) #0
+declare <2 x i16> @llvm.amdgcn.raw.buffer.load.v2i16(<4 x i32>, i32, i32, i32) #0
+declare <4 x i16> @llvm.amdgcn.raw.buffer.load.v4i16(<4 x i32>, i32, i32, i32) #0
+declare half @llvm.amdgcn.raw.buffer.load.f16(<4 x i32>, i32, i32, i32) #0
+declare <2 x half> @llvm.amdgcn.raw.buffer.load.v2f16(<4 x i32>, i32, i32, i32) #0
+declare <4 x half> @llvm.amdgcn.raw.buffer.load.v4f16(<4 x i32>, i32, i32, i32) #0
 
 attributes #0 = { nounwind readonly }
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.load.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.load.ll
index 4ac34286b57..496f8079f8e 100644
--- a/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.load.ll
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.load.ll
@@ -200,6 +200,78 @@ main_body:
   ret float %val
 }
 
+;CHECK-LABEL: {{^}}struct_buffer_load_f16:
+;CHECK-NEXT: %bb.
+;CHECK-NEXT: buffer_load_ushort [[VAL:v[0-9]+]], v1, s[0:3], 0 idxen
+;CHECK: s_waitcnt vmcnt(0)
+;CHECK: ds_write_b16 v0, [[VAL]]
+define amdgpu_ps void @struct_buffer_load_f16(<4 x i32> inreg %rsrc, half addrspace(3)* %ptr, i32 %idx) {
+main_body:
+  %val = call half @llvm.amdgcn.struct.buffer.load.f16(<4 x i32> %rsrc, i32 %idx, i32 0, i32 0, i32 0)
+  store half %val, half addrspace(3)* %ptr
+  ret void
+}
+
+;CHECK-LABEL: {{^}}struct_buffer_load_v2f16:
+;CHECK-NEXT: %bb.
+;CHECK-NEXT: buffer_load_dword [[VAL:v[0-9]+]], v1, s[0:3], 0 idxen
+;CHECK: s_waitcnt vmcnt(0)
+;CHECK: ds_write_b32 v0, [[VAL]]
+define amdgpu_ps void @struct_buffer_load_v2f16(<4 x i32> inreg %rsrc, <2 x half> addrspace(3)* %ptr, i32 %idx) {
+main_body:
+  %val = call <2 x half> @llvm.amdgcn.struct.buffer.load.v2f16(<4 x i32> %rsrc, i32 %idx, i32 0, i32 0, i32 0)
+  store <2 x half> %val, <2 x half> addrspace(3)* %ptr
+  ret void
+}
+
+;CHECK-LABEL: {{^}}struct_buffer_load_v4f16:
+;CHECK-NEXT: %bb.
+;CHECK-NEXT: buffer_load_dwordx2 [[VAL:v\[[0-9]+:[0-9]+\]]], v1, s[0:3], 0 idxen
+;CHECK: s_waitcnt vmcnt(0)
+;CHECK: ds_write_b64 v0, [[VAL]]
+define amdgpu_ps void @struct_buffer_load_v4f16(<4 x i32> inreg %rsrc, <4 x half> addrspace(3)* %ptr, i32 %idx) {
+main_body:
+  %val = call <4 x half> @llvm.amdgcn.struct.buffer.load.v4f16(<4 x i32> %rsrc, i32 %idx, i32 0, i32 0, i32 0)
+  store <4 x half> %val, <4 x half> addrspace(3)* %ptr
+  ret void
+}
+
+;CHECK-LABEL: {{^}}struct_buffer_load_i16:
+;CHECK-NEXT: %bb.
+;CHECK-NEXT: buffer_load_ushort [[VAL:v[0-9]+]], v1, s[0:3], 0 idxen
+;CHECK: s_waitcnt vmcnt(0)
+;CHECK: ds_write_b16 v0, [[VAL]]
+define amdgpu_ps void @struct_buffer_load_i16(<4 x i32> inreg %rsrc, i16 addrspace(3)* %ptr, i32 %idx) {
+main_body:
+  %val = call i16 @llvm.amdgcn.struct.buffer.load.i16(<4 x i32> %rsrc, i32 %idx, i32 0, i32 0, i32 0)
+  store i16 %val, i16 addrspace(3)* %ptr
+  ret void
+}
+
+;CHECK-LABEL: {{^}}struct_buffer_load_v2i16:
+;CHECK-NEXT: %bb.
+;CHECK-NEXT: buffer_load_dword [[VAL:v[0-9]+]], v1, s[0:3], 0 idxen
+;CHECK: s_waitcnt vmcnt(0)
+;CHECK: ds_write_b32 v0, [[VAL]]
+define amdgpu_ps void @struct_buffer_load_v2i16(<4 x i32> inreg %rsrc, <2 x i16> addrspace(3)* %ptr, i32 %idx) {
+main_body:
+  %val = call <2 x i16> @llvm.amdgcn.struct.buffer.load.v2i16(<4 x i32> %rsrc, i32 %idx, i32 0, i32 0, i32 0)
+  store <2 x i16> %val, <2 x i16> addrspace(3)* %ptr
+  ret void
+}
+
+;CHECK-LABEL: {{^}}struct_buffer_load_v4i16:
+;CHECK-NEXT: %bb.
+;CHECK-NEXT: buffer_load_dwordx2 [[VAL:v\[[0-9]+:[0-9]+\]]], v1, s[0:3], 0 idxen
+;CHECK: s_waitcnt vmcnt(0)
+;CHECK: ds_write_b64 v0, [[VAL]]
+define amdgpu_ps void @struct_buffer_load_v4i16(<4 x i32> inreg %rsrc, <4 x i16> addrspace(3)* %ptr, i32 %idx) {
+main_body:
+  %val = call <4 x i16> @llvm.amdgcn.struct.buffer.load.v4i16(<4 x i32> %rsrc, i32 %idx, i32 0, i32 0, i32 0)
+  store <4 x i16> %val, <4 x i16> addrspace(3)* %ptr
+  ret void
+}
+
 declare float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32>, i32, i32, i32, i32) #0
 declare <2 x float> @llvm.amdgcn.struct.buffer.load.v2f32(<4 x i32>, i32, i32, i32, i32) #0
 declare <4 x float> @llvm.amdgcn.struct.buffer.load.v4f32(<4 x i32>, i32, i32, i32, i32) #0
@@ -208,6 +280,13 @@ declare <2 x i32> @llvm.amdgcn.struct.buffer.load.v2i32(<4 x i32>, i32, i32, i32, i32) #0
 declare <4 x i32> @llvm.amdgcn.struct.buffer.load.v4i32(<4 x i32>, i32, i32, i32, i32) #0
 declare void @llvm.amdgcn.exp.f32(i32, i32, float, float, float, float, i1, i1) #0
 declare i8 @llvm.amdgcn.struct.buffer.load.i8(<4 x i32>, i32, i32, i32, i32) #0
+
+declare half @llvm.amdgcn.struct.buffer.load.f16(<4 x i32>, i32, i32, i32, i32) #0
+declare <2 x half> @llvm.amdgcn.struct.buffer.load.v2f16(<4 x i32>, i32, i32, i32, i32) #0
+declare <4 x half> @llvm.amdgcn.struct.buffer.load.v4f16(<4 x i32>, i32, i32, i32, i32) #0
+
 declare i16 @llvm.amdgcn.struct.buffer.load.i16(<4 x i32>, i32, i32, i32, i32) #0
+declare <2 x i16> @llvm.amdgcn.struct.buffer.load.v2i16(<4 x i32>, i32, i32, i32, i32) #0
+declare <4 x i16> @llvm.amdgcn.struct.buffer.load.v4i16(<4 x i32>, i32, i32, i32, i32) #0
 
 attributes #0 = { nounwind readonly }
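
For illustration only (not part of the patch): with this change, the plain f16
buffer-load intrinsic selects an untyped buffer_load_ushort, while the separate
.format intrinsic keeps the D16 format-load path (buffer_load_format_d16_x on a
D16-capable target); the packed v2f16/v4f16 cases select plain
buffer_load_dword/buffer_load_dwordx2, as the CHECK lines above show. The
function below is a hypothetical sketch, not taken from the test suite; the
intrinsic declarations match those exercised by the patch and its tests.

; Sketch: contrast the untyped and format f16 buffer loads.
define amdgpu_ps void @f16_load_example(<4 x i32> inreg %rsrc, half addrspace(3)* %out) {
main_body:
  ; Untyped load: returns the raw 16 bits, no format conversion applied.
  %raw = call half @llvm.amdgcn.raw.buffer.load.f16(<4 x i32> %rsrc, i32 0, i32 0, i32 0)
  ; Format load: converts through the buffer resource's numeric format.
  %fmt = call half @llvm.amdgcn.raw.buffer.load.format.f16(<4 x i32> %rsrc, i32 0, i32 0, i32 0)
  %sum = fadd half %raw, %fmt
  store half %sum, half addrspace(3)* %out
  ret void
}

declare half @llvm.amdgcn.raw.buffer.load.f16(<4 x i32>, i32, i32, i32) #0
declare half @llvm.amdgcn.raw.buffer.load.format.f16(<4 x i32>, i32, i32, i32) #0

attributes #0 = { nounwind readonly }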