NODE_NAME_CASE(MAD_I24)
NODE_NAME_CASE(TEXTURE_FETCH)
NODE_NAME_CASE(EXPORT)
+ NODE_NAME_CASE(EXPORT_DONE)
+ NODE_NAME_CASE(R600_EXPORT)
NODE_NAME_CASE(CONST_ADDRESS)
NODE_NAME_CASE(REGISTER_LOAD)
NODE_NAME_CASE(REGISTER_STORE)
MUL_LOHI_I24,
MUL_LOHI_U24,
TEXTURE_FETCH,
- EXPORT,
+ EXPORT, // exp on SI+
+ EXPORT_DONE, // exp on SI+ with done bit set
+ R600_EXPORT,
CONST_ADDRESS,
REGISTER_LOAD,
REGISTER_STORE,
SDTypeProfile<1, 4, [SDTCisFP<0>]>,
[SDNPInGlue]>;
+
def AMDGPUkill : SDNode<"AMDGPUISD::KILL", AMDGPUKillSDT,
[SDNPHasChain, SDNPSideEffect]>;
+// SI+ export type profile: no results, eight operands (en, vm, tgt, compr, src0-src3).
+def AMDGPUExportOp : SDTypeProfile<0, 8, [
+ SDTCisInt<0>, // i8 en
+ SDTCisInt<1>, // i1 vm
+ // skip done: it is not an operand; the SI_export lowering picks EXPORT vs. EXPORT_DONE from it
+ SDTCisInt<2>, // i8 tgt
+ SDTCisSameAs<3, 1>, // i1 compr (same type as vm)
+ SDTCisFP<4>, // f32 src0
+ SDTCisSameAs<5, 4>, // f32 src1 (same type as src0)
+ SDTCisSameAs<6, 4>, // f32 src2 (same type as src0)
+ SDTCisSameAs<7, 4> // f32 src3 (same type as src0)
+]>;
+
+def AMDGPUexport: SDNode<"AMDGPUISD::EXPORT", AMDGPUExportOp, // SI+ exp without the done bit
+ [SDNPHasChain, SDNPMayStore]>; // chained and modeled as a store
+
+def AMDGPUexport_done: SDNode<"AMDGPUISD::EXPORT_DONE", AMDGPUExportOp, // SI+ exp with the done bit set
+ [SDNPHasChain, SDNPMayLoad, SDNPMayStore]>; // additionally mayLoad, unlike AMDGPUexport
+
+
+def R600ExportOp : SDTypeProfile<0, 7, [SDTCisFP<0>, SDTCisInt<1>]>; // no results, 7 operands; operand 0 is FP, operand 1 is integer
+
+def R600_EXPORT: SDNode<"AMDGPUISD::R600_EXPORT", R600ExportOp, // R600 export node, distinct from the SI+ EXPORT/EXPORT_DONE nodes
+ [SDNPHasChain, SDNPSideEffect]>;
+
//===----------------------------------------------------------------------===//
// Flow Control Profile Types
//===----------------------------------------------------------------------===//
DAG.getConstant(2, DL, MVT::i32), // SWZ_Z
DAG.getConstant(3, DL, MVT::i32) // SWZ_W
};
- return DAG.getNode(AMDGPUISD::EXPORT, DL, Op.getValueType(), Args);
+ return DAG.getNode(AMDGPUISD::R600_EXPORT, DL, Op.getValueType(), Args);
}
// default for switch(IntrinsicID)
return SDValue();
}
- case AMDGPUISD::EXPORT: {
+ case AMDGPUISD::R600_EXPORT: {
SDValue Arg = N->getOperand(1);
if (Arg.getOpcode() != ISD::BUILD_VECTOR)
break;
N->getOperand(7) // SWZ_W
};
NewArgs[1] = OptimizeSwizzle(N->getOperand(1), &NewArgs[4], DAG, DL);
- return DAG.getNode(AMDGPUISD::EXPORT, DL, N->getVTList(), NewArgs);
+ return DAG.getNode(AMDGPUISD::R600_EXPORT, DL, N->getVTList(), NewArgs);
}
case AMDGPUISD::TEXTURE_FETCH: {
SDValue Arg = N->getOperand(1);
// Export Instructions
//===----------------------------------------------------------------------===//
-def ExportType : SDTypeProfile<0, 7, [SDTCisFP<0>, SDTCisInt<1>]>;
-
-def EXPORT: SDNode<"AMDGPUISD::EXPORT", ExportType,
- [SDNPHasChain, SDNPSideEffect]>;
-
class ExportWord0 {
field bits<32> Word0;
}
multiclass ExportPattern<Instruction ExportInst, bits<8> cf_inst> {
- def : Pat<(EXPORT (v4f32 R600_Reg128:$src), (i32 imm:$base), (i32 imm:$type),
+ def : Pat<(R600_EXPORT (v4f32 R600_Reg128:$src), (i32 imm:$base), (i32 imm:$type),
(i32 imm:$swz_x), (i32 imm:$swz_y), (i32 imm:$swz_z), (i32 imm:$swz_w)),
(ExportInst R600_Reg128:$src, imm:$type, imm:$base,
imm:$swz_x, imm:$swz_y, imm:$swz_z, imm:$swz_w, cf_inst, 0)
namespace SIInstrFlags {
// This needs to be kept in sync with the field bits in InstSI.
-enum {
+enum : uint32_t {
SALU = 1 << 3,
VALU = 1 << 4,
DS = 1 << 19,
MIMG = 1 << 20,
FLAT = 1 << 21,
- WQM = 1 << 22,
- VGPRSpill = 1 << 23,
- SGPRSpill = 1 << 24,
- VOPAsmPrefer32Bit = 1 << 25,
- Gather4 = 1 << 26,
- DisableWQM = 1 << 27,
- SOPK_ZEXT = 1 << 28,
- SCALAR_STORE = 1 << 29,
- FIXED_SIZE = 1 << 30
+ EXP = 1 << 22,
+ WQM = 1 << 23,
+ VGPRSpill = 1 << 24,
+ SGPRSpill = 1 << 25,
+ VOPAsmPrefer32Bit = 1 << 26,
+ Gather4 = 1 << 27,
+ DisableWQM = 1 << 28,
+ SOPK_ZEXT = 1 << 29,
+ SCALAR_STORE = 1 << 30,
+ FIXED_SIZE = 1u << 31
};
}
SDValue Cast = DAG.getNode(ISD::BITCAST, DL, MVT::i32, Src);
return DAG.getNode(AMDGPUISD::KILL, DL, MVT::Other, Chain, Cast);
}
+ case AMDGPUIntrinsic::SI_export: {
+ const ConstantSDNode *En = cast<ConstantSDNode>(Op.getOperand(2));
+ const ConstantSDNode *VM = cast<ConstantSDNode>(Op.getOperand(3));
+ const ConstantSDNode *Done = cast<ConstantSDNode>(Op.getOperand(4));
+ const ConstantSDNode *Tgt = cast<ConstantSDNode>(Op.getOperand(5));
+ const ConstantSDNode *Compr = cast<ConstantSDNode>(Op.getOperand(6));
+
+ const SDValue Ops[] = {
+ Chain,
+ DAG.getTargetConstant(En->getZExtValue(), DL, MVT::i8),
+ DAG.getTargetConstant(VM->getZExtValue(), DL, MVT::i1),
+ DAG.getTargetConstant(Tgt->getZExtValue(), DL, MVT::i8),
+ DAG.getTargetConstant(Compr->getZExtValue(), DL, MVT::i1),
+ Op.getOperand(7), // src0
+ Op.getOperand(8), // src1
+ Op.getOperand(9), // src2
+ Op.getOperand(10) // src3
+ };
+
+ unsigned Opc = Done->isNullValue() ?
+ AMDGPUISD::EXPORT : AMDGPUISD::EXPORT_DONE;
+ return DAG.getNode(Opc, DL, Op->getVTList(), Ops);
+ }
default:
return SDValue();
}
MachineBasicBlock::iterator Insert = SkipBB->begin();
// Exec mask is zero: Export to NULL target...
- BuildMI(*SkipBB, Insert, DL, TII->get(AMDGPU::EXP))
- .addImm(0)
+ BuildMI(*SkipBB, Insert, DL, TII->get(AMDGPU::EXP_DONE))
.addImm(0x09) // V_008DFC_SQ_EXP_NULL
- .addImm(0)
- .addImm(1)
- .addImm(1)
.addReg(AMDGPU::VGPR0, RegState::Undef)
.addReg(AMDGPU::VGPR0, RegState::Undef)
.addReg(AMDGPU::VGPR0, RegState::Undef)
- .addReg(AMDGPU::VGPR0, RegState::Undef);
+ .addReg(AMDGPU::VGPR0, RegState::Undef)
+ .addImm(1) // vm
+ .addImm(0) // compr
+ .addImm(0); // en
// ... and terminate wavefront.
BuildMI(*SkipBB, Insert, DL, TII->get(AMDGPU::S_ENDPGM));
Result.Named.VM = !!(TSFlags & SIInstrFlags::VM_CNT);
// Only consider stores or EXP for EXP_CNT
- Result.Named.EXP = !!(TSFlags & SIInstrFlags::EXP_CNT &&
- (MI.getOpcode() == AMDGPU::EXP || MI.getDesc().mayStore()));
+ Result.Named.EXP = !!(TSFlags & SIInstrFlags::EXP_CNT) && MI.mayStore();
// LGKM may uses larger values
if (TSFlags & SIInstrFlags::LGKM_CNT) {
if (Op.isDef())
return true;
- // For exports all registers are relevant
+ // For exports all registers are relevant.
+ // TODO: Skip undef/disabled registers.
MachineInstr &MI = *Op.getParent();
- if (MI.getOpcode() == AMDGPU::EXP)
+ if (TII->isEXP(MI))
return true;
// For stores the stored value is also relevant
// Remember which export instructions we have seen
if (Increment.Named.EXP) {
- ExpInstrTypesSeen |= I->getOpcode() == AMDGPU::EXP ? 1 : 2;
+ ExpInstrTypesSeen |= TII->isEXP(*I) ? 1 : 2;
}
for (unsigned i = 0, e = I->getNumOperands(); i != e; ++i) {
field bit DS = 0;
field bit MIMG = 0;
field bit FLAT = 0;
+ field bit EXP = 0;
// Whether WQM _must_ be enabled for this instruction.
field bit WQM = 0;
let TSFlags{19} = DS;
let TSFlags{20} = MIMG;
let TSFlags{21} = FLAT;
- let TSFlags{22} = WQM;
- let TSFlags{23} = VGPRSpill;
- let TSFlags{24} = SGPRSpill;
- let TSFlags{25} = VOPAsmPrefer32Bit;
- let TSFlags{26} = Gather4;
- let TSFlags{27} = DisableWQM;
- let TSFlags{28} = SOPKZext;
- let TSFlags{29} = ScalarStore;
- let TSFlags{30} = FixedSize;
+ let TSFlags{22} = EXP;
+ let TSFlags{23} = WQM;
+ let TSFlags{24} = VGPRSpill;
+ let TSFlags{25} = SGPRSpill;
+ let TSFlags{26} = VOPAsmPrefer32Bit;
+ let TSFlags{27} = Gather4;
+ let TSFlags{28} = DisableWQM;
+ let TSFlags{29} = SOPKZext;
+ let TSFlags{30} = ScalarStore;
+ let TSFlags{31} = FixedSize;
let SchedRW = [Write32Bit];
let hasSideEffects = 0;
}
+class EXPCommon<dag outs, dag ins, string asm, list<dag> pattern> : // shared base of the exp instruction definitions
+ InstSI<outs, ins, asm, pattern> {
+ let EXP = 1; // sets TSFlags bit 22 (SIInstrFlags::EXP), the bit isEXP() tests
+ let EXP_CNT = 1; // together with mayStore below, makes the wait insertion count this toward EXP
+ let mayLoad = 0; // Set to 1 if done bit is set (EXP_m does this with `let mayLoad = done`).
+ let mayStore = 1;
+ let UseNamedOperandTable = 1;
+ let Uses = [EXEC];
+ let SchedRW = [WriteExport];
+}
+
} // End Uses = [EXEC]
class MIMG <dag outs, dag ins, string asm, list<dag> pattern> :
return get(Opcode).TSFlags & SIInstrFlags::FLAT;
}
+ /// \returns true if \p MI is flagged as an export (EXP) instruction.
+ static bool isEXP(const MachineInstr &MI) {
+   const auto Flags = MI.getDesc().TSFlags;
+   return (Flags & SIInstrFlags::EXP) != 0;
+ }
+
+ /// \returns true if the instruction with opcode \p Opcode is flagged as an
+ /// export (EXP) instruction.
+ bool isEXP(uint16_t Opcode) const {
+   const auto Flags = get(Opcode).TSFlags;
+   return (Flags & SIInstrFlags::EXP) != 0;
+ }
+
static bool isWQM(const MachineInstr &MI) {
return MI.getDesc().TSFlags & SIInstrFlags::WQM;
}
// EXP classes
//===----------------------------------------------------------------------===//
-class EXPCommon : InstSI<
+class EXP_Helper<bit done, SDPatternOperator node = null_frag> : EXPCommon< // done: literal printed in the asm done slot; node: DAG node to select from (null_frag by default)
(outs),
- (ins i32imm:$en, i32imm:$tgt, i32imm:$compr, i32imm:$done, i32imm:$vm,
- VGPR_32:$src0, VGPR_32:$src1, VGPR_32:$src2, VGPR_32:$src3),
- "exp $en, $tgt, $compr, $done, $vm, $src0, $src1, $src2, $src3",
- [] > {
-
- let EXP_CNT = 1;
- let Uses = [EXEC];
- let SchedRW = [WriteExport];
-}
-
-multiclass EXP_m {
-
- let isPseudo = 1, isCodeGenOnly = 1 in {
- def "" : EXPCommon, SIMCInstr <"exp", SIEncodingFamily.NONE> ;
- }
-
- def _si : EXPCommon, SIMCInstr <"exp", SIEncodingFamily.SI>, EXPe {
- let DecoderNamespace="SICI";
- let DisableDecoder = DisableSIDecoder;
- }
+ (ins i8imm:$tgt, VGPR_32:$src0, VGPR_32:$src1, VGPR_32:$src2, VGPR_32:$src3,
+ i1imm:$vm, i1imm:$compr, i8imm:$en), // operand order: tgt, src0-src3, vm, compr, en (differs from the asm order)
+ "exp $en, $tgt, $compr, "#!if(done, "1", "0")#", $vm, $src0, $src1, $src2, $src3", // done is printed as a literal, not an operand
+ [(node (i8 timm:$en), (i1 timm:$vm), (i8 timm:$tgt), (i1 timm:$compr),
+ f32:$src0, f32:$src1, f32:$src2, f32:$src3)]
+>;
- def _vi : EXPCommon, SIMCInstr <"exp", SIEncodingFamily.VI>, EXPe_vi {
- let DecoderNamespace="VI";
- let DisableDecoder = DisableVIDecoder;
+// Split EXP instruction into EXP and EXP_DONE so we can set
+// mayLoad for done=1.
+multiclass EXP_m<bit done, SDPatternOperator node> {
+ let mayLoad = done in { // overrides the mayLoad = 0 default from EXPCommon
+ let isPseudo = 1, isCodeGenOnly = 1 in { // only this pseudo is given the selection pattern node
+ def "" : EXP_Helper<done, node>,
+ SIMCInstr <"exp"#!if(done, "_done", ""), SIEncodingFamily.NONE>;
+ }
+
+ let done = done in { // NOTE(review): presumably binds the 'done' field of EXPe/EXPe_vi - confirm
+ def _si : EXP_Helper<done>,
+ SIMCInstr <"exp"#!if(done, "_done", ""), SIEncodingFamily.SI>,
+ EXPe {
+ let DecoderNamespace = "SICI";
+ let DisableDecoder = DisableSIDecoder;
+ }
+
+ def _vi : EXP_Helper<done>,
+ SIMCInstr <"exp"#!if(done, "_done", ""), SIEncodingFamily.VI>,
+ EXPe_vi {
+ let DecoderNamespace = "VI";
+ let DisableDecoder = DisableVIDecoder;
+ }
+ }
}
}
// EXP Instructions
//===----------------------------------------------------------------------===//
-defm EXP : EXP_m;
+defm EXP : EXP_m<0, AMDGPUexport>;
+defm EXP_DONE : EXP_m<1, AMDGPUexport_done>;
//===----------------------------------------------------------------------===//
// VINTRP Instructions
(SI_KILL (i32 0xbf800000))
>;
-def : Pat <
- (int_SI_export imm:$en, imm:$vm, imm:$done, imm:$tgt, imm:$compr,
- f32:$src0, f32:$src1, f32:$src2, f32:$src3),
- (EXP imm:$en, imm:$tgt, imm:$compr, imm:$done, imm:$vm,
- $src0, $src1, $src2, $src3)
->;
-
//===----------------------------------------------------------------------===//
// VOP1 Patterns
//===----------------------------------------------------------------------===//
let TargetPrefix = "SI", isTarget = 1 in {
def int_SI_packf16 : Intrinsic <[llvm_i32_ty], [llvm_float_ty, llvm_float_ty], [IntrNoMem]>;
- def int_SI_export : Intrinsic <[], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_float_ty, llvm_float_ty, llvm_float_ty, llvm_float_ty], []>;
+
+ def int_SI_export : Intrinsic <[], // no results; five i32 control operands followed by four float sources
+ [llvm_i32_ty, // en (lowered to an i8 target constant)
+ llvm_i32_ty, // vm (FIXME: should be i1)
+ llvm_i32_ty, // done (FIXME: should be i1)
+ llvm_i32_ty, // tgt (lowered to an i8 target constant)
+ llvm_i32_ty, // compr (FIXME: should be i1)
+ llvm_float_ty, // src0
+ llvm_float_ty, // src1
+ llvm_float_ty, // src2
+ llvm_float_ty], // src3
+ [] // no properties: unlike the neighbouring intrinsics this is not IntrNoMem
+ >;
+
def int_SI_load_const : Intrinsic <[llvm_float_ty], [llvm_anyint_ty, llvm_i32_ty], [IntrNoMem]>;
def int_SI_vs_load_input : Intrinsic <[llvm_v4f32_ty], [llvm_anyint_ty, llvm_i16_ty, llvm_i32_ty], [IntrNoMem]> ;
; GCN-LABEL: {{^}}vgpr:
; GCN: v_mov_b32_e32 v1, v0
; GCN-DAG: v_add_f32_e32 v0, 1.0, v1
-; GCN-DAG: exp 15, 0, 1, 1, 1, v1, v1, v1, v1
+; GCN-DAG: exp 15, 0, -1, 1, -1, v1, v1, v1, v1
; GCN: s_waitcnt expcnt(0)
; GCN-NOT: s_endpgm
define amdgpu_vs {float, float} @vgpr([9 x <16 x i8>] addrspace(2)* byval, i32 inreg, i32 inreg, float) {
; GCN-LABEL: {{^}}vgpr_literal:
; GCN: v_mov_b32_e32 v4, v0
-; GCN: exp 15, 0, 1, 1, 1, v4, v4, v4, v4
+; GCN: exp 15, 0, -1, 1, -1, v4, v4, v4, v4
; GCN-DAG: v_mov_b32_e32 v0, 1.0
; GCN-DAG: v_mov_b32_e32 v1, 2.0
; GCN-DAG: v_mov_b32_e32 v2, 4.0
; GCN-LABEL: {{^}}both:
; GCN: v_mov_b32_e32 v1, v0
-; GCN-DAG: exp 15, 0, 1, 1, 1, v1, v1, v1, v1
+; GCN-DAG: exp 15, 0, -1, 1, -1, v1, v1, v1, v1
; GCN-DAG: v_add_f32_e32 v0, 1.0, v1
; GCN-DAG: s_add_i32 s0, s3, 2
; GCN-DAG: s_mov_b32 s1, s2
; GCN-LABEL: {{^}}structure_literal:
; GCN: v_mov_b32_e32 v3, v0
-; GCN: exp 15, 0, 1, 1, 1, v3, v3, v3, v3
+; GCN: exp 15, 0, -1, 1, -1, v3, v3, v3, v3
; GCN-DAG: v_mov_b32_e32 v0, 1.0
; GCN-DAG: s_mov_b32 s0, 2
; GCN-DAG: s_mov_b32 s1, 3
--- /dev/null
+# RUN: llc -march=amdgcn -verify-machineinstrs -run-pass si-insert-waits -o - %s | FileCheck %s
+--- |
+ define amdgpu_ps <4 x float> @exp_done_waitcnt(<4 x i32> inreg, <4 x i32> inreg, i32 inreg %w, float %v) {
+ %a = load volatile float, float addrspace(1)* undef
+ %b = load volatile float, float addrspace(1)* undef
+ %c = load volatile float, float addrspace(1)* undef
+ %d = load volatile float, float addrspace(1)* undef
+ call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %a, float %b, float %c, float %d)
+ ret <4 x float> <float 5.000000e-01, float 1.000000e+00, float 2.000000e+00, float 4.000000e+00>
+ }
+
+ declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float)
+
+ attributes #0 = { readnone }
+ attributes #1 = { nounwind }
+
+...
+---
+
+# CHECK-LABEL: name: exp_done_waitcnt{{$}}
+# CHECK: EXP_DONE
+# CHECK-NEXT: S_WAITCNT 3855
+# CHECK: %vgpr0 = V_MOV_B32
+# CHECK: %vgpr1 = V_MOV_B32
+# CHECK: %vgpr2 = V_MOV_B32
+# CHECK: %vgpr3 = V_MOV_B32
+name: exp_done_waitcnt
+alignment: 0
+exposesReturnsTwice: false
+legalized: false
+regBankSelected: false
+selected: false
+tracksRegLiveness: true
+frameInfo:
+ isFrameAddressTaken: false
+ isReturnAddressTaken: false
+ hasStackMap: false
+ hasPatchPoint: false
+ stackSize: 0
+ offsetAdjustment: 0
+ maxAlignment: 0
+ adjustsStack: false
+ hasCalls: false
+ maxCallFrameSize: 0
+ hasOpaqueSPAdjustment: false
+ hasVAStart: false
+ hasMustTailInVarArgFunc: false
+body: |
+ bb.0 (%ir-block.2):
+ %sgpr3 = S_MOV_B32 61440
+ %sgpr2 = S_MOV_B32 -1
+ %vgpr0 = BUFFER_LOAD_DWORD_OFFSET %sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, implicit %exec :: (volatile load 4 from `float addrspace(1)* undef`)
+ %vgpr1 = BUFFER_LOAD_DWORD_OFFSET %sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, implicit %exec :: (volatile load 4 from `float addrspace(1)* undef`)
+ %vgpr2 = BUFFER_LOAD_DWORD_OFFSET %sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, implicit %exec :: (volatile load 4 from `float addrspace(1)* undef`)
+ %vgpr3 = BUFFER_LOAD_DWORD_OFFSET killed %sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, implicit %exec :: (volatile load 4 from `float addrspace(1)* undef`)
+ EXP_DONE 0, killed %vgpr0, killed %vgpr1, killed %vgpr2, killed %vgpr3, -1, -1, 15, implicit %exec
+ %vgpr0 = V_MOV_B32_e32 1056964608, implicit %exec
+ %vgpr1 = V_MOV_B32_e32 1065353216, implicit %exec
+ %vgpr2 = V_MOV_B32_e32 1073741824, implicit %exec
+ %vgpr3 = V_MOV_B32_e32 1082130432, implicit %exec
+ SI_RETURN killed %vgpr0, killed %vgpr1, killed %vgpr2, killed %vgpr3
+
+...
%m0 = S_MOV_B32 undef %sgpr0
%vgpr1 = V_MOVRELS_B32_e32 undef %vgpr1, implicit %m0, implicit %exec, implicit killed %vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8
%vgpr4 = V_MAC_F32_e32 undef %vgpr0, undef %vgpr0, undef %vgpr4, implicit %exec
- EXP 15, 12, 0, 1, 0, undef %vgpr0, killed %vgpr1, killed %vgpr4, undef %vgpr0, implicit %exec
+ EXP_DONE 15, undef %vgpr0, killed %vgpr1, killed %vgpr4, undef %vgpr0, 0, 0, 12, implicit %exec
S_ENDPGM
...