def int_amdgcn_atomic_inc : AMDGPUAtomicIncIntrin;
def int_amdgcn_atomic_dec : AMDGPUAtomicIncIntrin;
-class AMDGPUImageLoad : Intrinsic <
+class AMDGPUImageLoad<bit NoMem = 0> : Intrinsic <
[llvm_anyfloat_ty], // vdata(VGPR)
[llvm_anyint_ty, // vaddr(VGPR)
llvm_anyint_ty, // rsrc(SGPR)
llvm_i32_ty, // dmask(imm)
llvm_i1_ty, // glc(imm)
llvm_i1_ty, // slc(imm)
llvm_i1_ty, // lwe(imm)
llvm_i1_ty], // da(imm)
- [IntrReadMem]>;
+ !if(NoMem, [IntrNoMem], [IntrReadMem])>;
def int_amdgcn_image_load : AMDGPUImageLoad;
def int_amdgcn_image_load_mip : AMDGPUImageLoad;
-def int_amdgcn_image_getresinfo : AMDGPUImageLoad;
+def int_amdgcn_image_getresinfo : AMDGPUImageLoad<1>;
class AMDGPUImageStore : Intrinsic <
[],
def int_amdgcn_image_store : AMDGPUImageStore;
def int_amdgcn_image_store_mip : AMDGPUImageStore;
-class AMDGPUImageSample : Intrinsic <
+class AMDGPUImageSample<bit NoMem = 0> : Intrinsic <
[llvm_anyfloat_ty], // vdata(VGPR)
[llvm_anyfloat_ty, // vaddr(VGPR)
llvm_anyint_ty, // rsrc(SGPR)
llvm_v4i32_ty, // sampler(SGPR)
llvm_i32_ty, // dmask(imm)
llvm_i1_ty, // unorm(imm)
llvm_i1_ty, // glc(imm)
llvm_i1_ty, // slc(imm)
llvm_i1_ty, // lwe(imm)
llvm_i1_ty], // da(imm)
- [IntrReadMem]>;
+ !if(NoMem, [IntrNoMem], [IntrReadMem])>;
// Basic sample
def int_amdgcn_image_sample : AMDGPUImageSample;
def int_amdgcn_image_gather4_c_b_cl_o : AMDGPUImageSample;
def int_amdgcn_image_gather4_c_lz_o : AMDGPUImageSample;
-def int_amdgcn_image_getlod : AMDGPUImageSample;
+def int_amdgcn_image_getlod : AMDGPUImageSample<1>;
class AMDGPUImageAtomic : Intrinsic <
[llvm_i32_ty],
defm IMAGE_STORE_MIP : MIMG_Store <0x00000009, "image_store_mip">;
//def IMAGE_STORE_PCK : MIMG_NoPattern_ <"image_store_pck", 0x0000000a>;
//def IMAGE_STORE_MIP_PCK : MIMG_NoPattern_ <"image_store_mip_pck", 0x0000000b>;
+
+let mayLoad = 0, mayStore = 0 in {
defm IMAGE_GET_RESINFO : MIMG_NoSampler <0x0000000e, "image_get_resinfo">;
+}
+
defm IMAGE_ATOMIC_SWAP : MIMG_Atomic <mimg<0x0f, 0x10>, "image_atomic_swap">;
defm IMAGE_ATOMIC_CMPSWAP : MIMG_Atomic <mimg<0x10, 0x11>, "image_atomic_cmpswap", VReg_64>;
defm IMAGE_ATOMIC_ADD : MIMG_Atomic <mimg<0x11, 0x12>, "image_atomic_add">;
defm IMAGE_GATHER4_C_B_O : MIMG_Gather_WQM <0x0000005d, "image_gather4_c_b_o">;
defm IMAGE_GATHER4_C_B_CL_O : MIMG_Gather_WQM <0x0000005e, "image_gather4_c_b_cl_o">;
defm IMAGE_GATHER4_C_LZ_O : MIMG_Gather <0x0000005f, "image_gather4_c_lz_o">;
+
+let mayLoad = 0, mayStore = 0 in {
defm IMAGE_GET_LOD : MIMG_Sampler_WQM <0x00000060, "image_get_lod">;
+}
+
defm IMAGE_SAMPLE_CD : MIMG_Sampler <0x00000068, "image_sample_cd">;
defm IMAGE_SAMPLE_CD_CL : MIMG_Sampler <0x00000069, "image_sample_cd_cl">;
defm IMAGE_SAMPLE_C_CD : MIMG_Sampler <0x0000006a, "image_sample_c_cd">;
if (MI.mayLoad())
Flags |= MachineMemOperand::MOLoad;
- auto MMO = MF->getMachineMemOperand(PtrInfo, Flags, 0, 0);
- MI.addMemOperand(*MF, MMO);
+ if (Flags != MachineMemOperand::MODereferenceable) {
+ auto MMO = MF->getMachineMemOperand(PtrInfo, Flags, 0, 0);
+ MI.addMemOperand(*MF, MMO);
+ }
+
return BB;
}
return SDValue(DAG.getMachineNode(AMDGPU::WWM, DL, Src.getValueType(), Src),
0);
}
+ case Intrinsic::amdgcn_image_getlod:
+ case Intrinsic::amdgcn_image_getresinfo: {
+ unsigned Idx = (IntrinsicID == Intrinsic::amdgcn_image_getresinfo) ? 3 : 4;
+
+ // Replace dmask with everything disabled with undef.
+ const ConstantSDNode *DMask = dyn_cast<ConstantSDNode>(Op.getOperand(Idx));
+ if (!DMask || DMask->isNullValue())
+ return DAG.getUNDEF(Op.getValueType());
+ return SDValue();
+ }
default:
return Op;
}
case Intrinsic::amdgcn_image_sample_c_b_cl_o:
case Intrinsic::amdgcn_image_sample_c_lz_o:
case Intrinsic::amdgcn_image_sample_c_cd_o:
- case Intrinsic::amdgcn_image_sample_c_cd_cl_o:
-
- case Intrinsic::amdgcn_image_getlod: {
+ case Intrinsic::amdgcn_image_sample_c_cd_cl_o: {
// Replace dmask with everything disabled with undef.
const ConstantSDNode *DMask = dyn_cast<ConstantSDNode>(Op.getOperand(5));
if (!DMask || DMask->isNullValue()) {
unsigned DmaskIdx = (Node->getNumOperands() - Node->getNumValues() == 9) ? 2 : 3;
unsigned OldDmask = Node->getConstantOperandVal(DmaskIdx);
unsigned NewDmask = 0;
+ bool HasChain = Node->getNumValues() > 1;
+
+ if (OldDmask == 0) {
+ // These are folded out, but on the chance it happens don't assert.
+ return Node;
+ }
// Try to figure out the used register components
for (SDNode::use_iterator I = Node->use_begin(), E = Node->use_end();
// Set which texture component corresponds to the lane.
unsigned Comp;
for (unsigned i = 0, Dmask = OldDmask; i <= Lane; i++) {
- assert(Dmask);
Comp = countTrailingZeros(Dmask);
Dmask &= ~(1 << Comp);
}
MVT SVT = Node->getValueType(0).getVectorElementType().getSimpleVT();
- auto NewVTList =
- DAG.getVTList(BitsSet == 1 ?
- SVT : MVT::getVectorVT(SVT, BitsSet == 3 ? 4 : BitsSet),
- MVT::Other);
+ MVT ResultVT = BitsSet == 1 ?
+ SVT : MVT::getVectorVT(SVT, BitsSet == 3 ? 4 : BitsSet);
+ SDVTList NewVTList = HasChain ?
+ DAG.getVTList(ResultVT, MVT::Other) : DAG.getVTList(ResultVT);
+
MachineSDNode *NewNode = DAG.getMachineNode(NewOpcode, SDLoc(Node),
NewVTList, Ops);
- NewNode->setMemRefs(Node->memoperands_begin(), Node->memoperands_end());
- // Update chain.
- DAG.ReplaceAllUsesOfValueWith(SDValue(Node, 1), SDValue(NewNode, 1));
+ if (HasChain) {
+ // Update chain.
+ NewNode->setMemRefs(Node->memoperands_begin(), Node->memoperands_end());
+ DAG.ReplaceAllUsesOfValueWith(SDValue(Node, 1), SDValue(NewNode, 1));
+ }
if (BitsSet == 1) {
assert(Node->hasNUsesOfValue(1, 0));
; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
-; GCN-LABEL: {{^}}adjust_writemask_crash_0:
+; GCN-LABEL: {{^}}adjust_writemask_crash_0_nochain:
; GCN: image_get_lod v0, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}} dmask:0x2
; GCN-NOT: v1
; GCN-NOT: v0
; GCN: buffer_store_dword v0
-define amdgpu_ps void @adjust_writemask_crash_0() #0 {
+define amdgpu_ps void @adjust_writemask_crash_0_nochain() #0 {
main_body:
%tmp = call <2 x float> @llvm.amdgcn.image.getlod.v2f32.v2f32.v8i32(<2 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 3, i1 false, i1 false, i1 false, i1 false, i1 false)
%tmp1 = bitcast <2 x float> %tmp to <2 x i32>
ret void
}
-; GCN-LABEL: {{^}}adjust_writemask_crash_1:
+; GCN-LABEL: {{^}}adjust_writemask_crash_1_nochain:
; GCN: image_get_lod v0, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}} dmask:0x1
; GCN-NOT: v1
; GCN-NOT: v0
; GCN: buffer_store_dword v0
-define amdgpu_ps void @adjust_writemask_crash_1() #0 {
+define amdgpu_ps void @adjust_writemask_crash_1_nochain() #0 {
main_body:
%tmp = call <2 x float> @llvm.amdgcn.image.getlod.v2f32.v2f32.v8i32(<2 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 3, i1 false, i1 false, i1 false, i1 false, i1 false)
%tmp1 = bitcast <2 x float> %tmp to <2 x i32>
ret void
}
+; GCN-LABEL: {{^}}adjust_writemask_crash_0_chain:
+; GCN: image_sample v0, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}} dmask:0x2
+; GCN-NOT: v1
+; GCN-NOT: v0
+; GCN: buffer_store_dword v0
+define amdgpu_ps void @adjust_writemask_crash_0_chain() #0 {
+main_body:
+ %tmp = call <2 x float> @llvm.amdgcn.image.sample.v2f32.v2f32.v8i32(<2 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 3, i1 false, i1 false, i1 false, i1 false, i1 false)
+ %tmp1 = bitcast <2 x float> %tmp to <2 x i32>
+ %tmp2 = shufflevector <2 x i32> %tmp1, <2 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
+ %tmp3 = bitcast <4 x i32> %tmp2 to <4 x float>
+ %tmp4 = extractelement <4 x float> %tmp3, i32 0
+ store volatile float %tmp4, float addrspace(1)* undef
+ ret void
+}
+
+; GCN-LABEL: {{^}}adjust_writemask_crash_1_chain:
+; GCN: image_sample v0, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}} dmask:0x1
+; GCN-NOT: v1
+; GCN-NOT: v0
+; GCN: buffer_store_dword v0
+define amdgpu_ps void @adjust_writemask_crash_1_chain() #0 {
+main_body:
+ %tmp = call <2 x float> @llvm.amdgcn.image.sample.v2f32.v2f32.v8i32(<2 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 3, i1 false, i1 false, i1 false, i1 false, i1 false)
+ %tmp1 = bitcast <2 x float> %tmp to <2 x i32>
+ %tmp2 = shufflevector <2 x i32> %tmp1, <2 x i32> undef, <4 x i32> <i32 1, i32 0, i32 undef, i32 undef>
+ %tmp3 = bitcast <4 x i32> %tmp2 to <4 x float>
+ %tmp4 = extractelement <4 x float> %tmp3, i32 1
+ store volatile float %tmp4, float addrspace(1)* undef
+ ret void
+}
+
define amdgpu_ps void @adjust_writemask_crash_0_v4() #0 {
main_body:
%tmp = call <4 x float> @llvm.amdgcn.image.getlod.v4f32.v2f32.v8i32(<2 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 5, i1 false, i1 false, i1 false, i1 false, i1 false)
}
+declare <2 x float> @llvm.amdgcn.image.sample.v2f32.v2f32.v8i32(<2 x float>, <8 x i32>, <4 x i32>, i32, i1, i1, i1, i1, i1) #1
declare <2 x float> @llvm.amdgcn.image.getlod.v2f32.v2f32.v8i32(<2 x float>, <8 x i32>, <4 x i32>, i32, i1, i1, i1, i1, i1) #1
declare <4 x float> @llvm.amdgcn.image.getlod.v4f32.v2f32.v8i32(<2 x float>, <8 x i32>, <4 x i32>, i32, i1, i1, i1, i1, i1) #1
; GCN-LABEL: {{^}}getlod:
; GCN: image_get_lod {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf da
+; GCN: s_waitcnt vmcnt(0)
+; GCN: store_dwordx4
define amdgpu_kernel void @getlod(<4 x float> addrspace(1)* %out) {
main_body:
%r = call <4 x float> @llvm.amdgcn.image.getlod.v4f32.f32.v8i32(float undef, <8 x i32> undef, <4 x i32> undef, i32 15, i1 0, i1 0, i1 0, i1 0, i1 1)
; GCN-LABEL: {{^}}getlod_v2:
; GCN: image_get_lod {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf da
+; GCN: s_waitcnt vmcnt(0)
+; GCN: store_dwordx4
define amdgpu_kernel void @getlod_v2(<4 x float> addrspace(1)* %out) {
main_body:
%r = call <4 x float> @llvm.amdgcn.image.getlod.v4f32.v2f32.v8i32(<2 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i1 0, i1 0, i1 0, i1 0, i1 1)
; GCN-LABEL: {{^}}getlod_v4:
; GCN: image_get_lod {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf da
+; GCN: s_waitcnt vmcnt(0)
+; GCN: store_dwordx4
define amdgpu_kernel void @getlod_v4(<4 x float> addrspace(1)* %out) {
main_body:
%r = call <4 x float> @llvm.amdgcn.image.getlod.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i1 0, i1 0, i1 0, i1 0, i1 1)
; GCN-LABEL: {{^}}getresinfo:
; GCN-NOT: s_waitcnt
; GCN: image_get_resinfo {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf
+; GCN: s_waitcnt vmcnt(0)
+; GCN: exp
define amdgpu_ps void @getresinfo() #0 {
main_body:
%r = call <4 x float> @llvm.amdgcn.image.getresinfo.v4f32.i32.v8i32(i32 undef, <8 x i32> undef, i32 15, i1 false, i1 false, i1 false, i1 false)
ret void
}
+; GCN-LABEL: {{^}}getresinfo_dmask0:
+; GCN-NOT: image_get_resinfo
+define amdgpu_ps void @getresinfo_dmask0() #0 {
+main_body:
+ %r = call <4 x float> @llvm.amdgcn.image.getresinfo.v4f32.i32.v8i32(i32 undef, <8 x i32> undef, i32 0, i1 false, i1 false, i1 false, i1 false)
+ %r0 = extractelement <4 x float> %r, i32 0
+ %r1 = extractelement <4 x float> %r, i32 1
+ %r2 = extractelement <4 x float> %r, i32 2
+ %r3 = extractelement <4 x float> %r, i32 3
+ call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %r0, float %r1, float %r2, float %r3, i1 true, i1 true) #0
+ ret void
+}
+
; Ideally, the register allocator would avoid the wait here
;
; GCN-LABEL: {{^}}image_store_wait:
declare <4 x float> @llvm.amdgcn.image.load.v4f32.v2i32.v8i32(<2 x i32>, <8 x i32>, i32, i1, i1, i1, i1) #1
declare <4 x float> @llvm.amdgcn.image.load.v4f32.v4i32.v8i32(<4 x i32>, <8 x i32>, i32, i1, i1, i1, i1) #1
declare <4 x float> @llvm.amdgcn.image.load.mip.v4f32.v4i32.v8i32(<4 x i32>, <8 x i32>, i32, i1, i1, i1, i1) #1
-declare <4 x float> @llvm.amdgcn.image.getresinfo.v4f32.i32.v8i32(i32, <8 x i32>, i32, i1, i1, i1, i1) #1
+declare <4 x float> @llvm.amdgcn.image.getresinfo.v4f32.i32.v8i32(i32, <8 x i32>, i32, i1, i1, i1, i1) #2
declare void @llvm.amdgcn.exp.f32(i32, i32, float, float, float, float, i1, i1) #0
attributes #0 = { nounwind }
attributes #1 = { nounwind readonly }
+attributes #2 = { nounwind readnone }