From 7684aab92af11ee8bc1a3239ac6bde5c0c110ef6 Mon Sep 17 00:00:00 2001
From: Tim Renouf
Date: Fri, 22 Mar 2019 10:11:21 +0000
Subject: [PATCH] [AMDGPU] Added v5i32 and v5f32 register classes

They are not used by anything yet, but a subsequent commit will start
using them for image ops that return 5 dwords.

Differential Revision: https://reviews.llvm.org/D58903

Change-Id: I63e1904081e39a6d66e4eb96d51df25ad399d271
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@356735 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/AMDGPU/AMDGPUCallingConv.td        |  3 +-
 lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp      |  2 +
 lib/Target/AMDGPU/AMDGPUISelLowering.cpp      | 17 +++++-
 .../AMDGPU/InstPrinter/AMDGPUInstPrinter.cpp  |  3 +
 lib/Target/AMDGPU/SIISelLowering.cpp          | 17 ++++++
 lib/Target/AMDGPU/SIInstrInfo.cpp             |  8 +++
 lib/Target/AMDGPU/SIInstructions.td           | 22 +++++++
 lib/Target/AMDGPU/SIRegisterInfo.cpp          | 32 ++++++++++
 lib/Target/AMDGPU/SIRegisterInfo.td           | 40 ++++++++++++-
 lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp    |  4 ++
 test/CodeGen/AMDGPU/select-vectors.ll         | 17 ++++++
 test/CodeGen/AMDGPU/spill-wide-sgpr.ll        | 60 +++++++++++++++++++
 12 files changed, 221 insertions(+), 4 deletions(-)

diff --git a/lib/Target/AMDGPU/AMDGPUCallingConv.td b/lib/Target/AMDGPU/AMDGPUCallingConv.td
index 597d23a9c1a..deb2bd8fbdb 100644
--- a/lib/Target/AMDGPU/AMDGPUCallingConv.td
+++ b/lib/Target/AMDGPU/AMDGPUCallingConv.td
@@ -110,11 +110,12 @@ def CC_AMDGPU_Func : CallingConv<[
     VGPR8, VGPR9, VGPR10, VGPR11, VGPR12, VGPR13, VGPR14, VGPR15,
     VGPR16, VGPR17, VGPR18, VGPR19, VGPR20, VGPR21, VGPR22, VGPR23,
     VGPR24, VGPR25, VGPR26, VGPR27, VGPR28, VGPR29, VGPR30, VGPR31]>>,
-  CCIfType<[i64, f64, v2i32, v2f32, v3i32, v3f32, v4i32, v4f32, v8i32, v8f32, v16i32, v16f32, v2i64, v2f64, v4i16, v4f16], CCCustom<"allocateVGPRTuple">>,
+  CCIfType<[i64, f64, v2i32, v2f32, v3i32, v3f32, v4i32, v4f32, v5i32, v5f32, v8i32, v8f32, v16i32, v16f32, v2i64, v2f64, v4i16, v4f16], CCCustom<"allocateVGPRTuple">>,
   CCIfType<[i32, f32, v2i16, v2f16, i16, f16, i1], CCAssignToStack<4, 4>>,
   CCIfType<[i64, f64, v2i32, v2f32], CCAssignToStack<8, 4>>,
   CCIfType<[v3i32, v3f32], CCAssignToStack<12, 4>>,
   CCIfType<[v4i32, v4f32, v2i64, v2f64], CCAssignToStack<16, 4>>,
+  CCIfType<[v5i32, v5f32], CCAssignToStack<20, 4>>,
   CCIfType<[v8i32, v8f32], CCAssignToStack<32, 4>>,
   CCIfType<[v16i32, v16f32], CCAssignToStack<64, 4>>
 ]>;
diff --git a/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
index 113880e1495..6e44ebac6e3 100644
--- a/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
+++ b/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
@@ -544,6 +544,8 @@ static unsigned selectSGPRVectorRegClassID(unsigned NumVectorElts) {
     return AMDGPU::SGPR_96RegClassID;
   case 4:
     return AMDGPU::SReg_128RegClassID;
+  case 5:
+    return AMDGPU::SGPR_160RegClassID;
   case 8:
     return AMDGPU::SReg_256RegClassID;
   case 16:
diff --git a/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index c8bff58126a..3d460199d94 100644
--- a/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -156,6 +156,9 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
   setOperationAction(ISD::LOAD, MVT::v4f32, Promote);
   AddPromotedToType(ISD::LOAD, MVT::v4f32, MVT::v4i32);
 
+  setOperationAction(ISD::LOAD, MVT::v5f32, Promote);
+  AddPromotedToType(ISD::LOAD, MVT::v5f32, MVT::v5i32);
+
   setOperationAction(ISD::LOAD, MVT::v8f32, Promote);
   AddPromotedToType(ISD::LOAD, MVT::v8f32, MVT::v8i32);
 
@@ -244,6 +247,9 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
   setOperationAction(ISD::STORE, MVT::v4f32, Promote);
   AddPromotedToType(ISD::STORE, MVT::v4f32, MVT::v4i32);
 
+  setOperationAction(ISD::STORE, MVT::v5f32, Promote);
+  AddPromotedToType(ISD::STORE, MVT::v5f32, MVT::v5i32);
+
   setOperationAction(ISD::STORE, MVT::v8f32, Promote);
   AddPromotedToType(ISD::STORE, MVT::v8f32, MVT::v8i32);
 
@@ -335,6 +341,8 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
   setOperationAction(ISD::CONCAT_VECTORS, MVT::v3f32, Custom);
   setOperationAction(ISD::CONCAT_VECTORS, MVT::v4i32, Custom);
   setOperationAction(ISD::CONCAT_VECTORS, MVT::v4f32, Custom);
+  setOperationAction(ISD::CONCAT_VECTORS, MVT::v5i32, Custom);
+  setOperationAction(ISD::CONCAT_VECTORS, MVT::v5f32, Custom);
   setOperationAction(ISD::CONCAT_VECTORS, MVT::v8i32, Custom);
   setOperationAction(ISD::CONCAT_VECTORS, MVT::v8f32, Custom);
   setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v2f32, Custom);
@@ -343,6 +351,8 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
   setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v3i32, Custom);
   setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v4f32, Custom);
   setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v4i32, Custom);
+  setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v5f32, Custom);
+  setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v5i32, Custom);
   setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v8f32, Custom);
   setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v8i32, Custom);
 
@@ -402,7 +412,7 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
   setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i64, Custom);
 
   static const MVT::SimpleValueType VectorIntTypes[] = {
-    MVT::v2i32, MVT::v3i32, MVT::v4i32
+    MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32
   };
 
   for (MVT VT : VectorIntTypes) {
@@ -444,7 +454,7 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
   }
 
   static const MVT::SimpleValueType FloatVectorTypes[] = {
-    MVT::v2f32, MVT::v3f32, MVT::v4f32
+    MVT::v2f32, MVT::v3f32, MVT::v4f32, MVT::v5f32
   };
 
   for (MVT VT : FloatVectorTypes) {
@@ -492,6 +502,9 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
   setOperationAction(ISD::SELECT, MVT::v4f32, Promote);
   AddPromotedToType(ISD::SELECT, MVT::v4f32, MVT::v4i32);
 
+  setOperationAction(ISD::SELECT, MVT::v5f32, Promote);
+  AddPromotedToType(ISD::SELECT, MVT::v5f32, MVT::v5i32);
+
   // There are no libcalls of any kind.
   for (int I = 0; I < RTLIB::UNKNOWN_LIBCALL; ++I)
     setLibcallName(static_cast<RTLIB::Libcall>(I), nullptr);
diff --git a/lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.cpp b/lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.cpp
index be1ab484862..a1da357066b 100644
--- a/lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.cpp
+++ b/lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.cpp
@@ -357,6 +357,9 @@ void AMDGPUInstPrinter::printRegOperand(unsigned RegNo, raw_ostream &O,
   } else if (MRI.getRegClass(AMDGPU::VReg_96RegClassID).contains(RegNo)) {
     O << 'v';
     NumRegs = 3;
+  } else if (MRI.getRegClass(AMDGPU::VReg_160RegClassID).contains(RegNo)) {
+    O << 'v';
+    NumRegs = 5;
   } else if (MRI.getRegClass(AMDGPU::VReg_256RegClassID).contains(RegNo)) {
     O << 'v';
     NumRegs = 8;
diff --git a/lib/Target/AMDGPU/SIISelLowering.cpp b/lib/Target/AMDGPU/SIISelLowering.cpp
index f029b3468e6..7280edd3fcc 100644
--- a/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -132,6 +132,9 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
   addRegisterClass(MVT::v4i32, &AMDGPU::SReg_128RegClass);
   addRegisterClass(MVT::v4f32, &AMDGPU::VReg_128RegClass);
 
+  addRegisterClass(MVT::v5i32, &AMDGPU::SGPR_160RegClass);
+  addRegisterClass(MVT::v5f32, &AMDGPU::VReg_160RegClass);
+
   addRegisterClass(MVT::v8i32, &AMDGPU::SReg_256RegClass);
   addRegisterClass(MVT::v8f32, &AMDGPU::VReg_256RegClass);
 
@@ -155,6 +158,7 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
   setOperationAction(ISD::LOAD, MVT::v2i32, Custom);
   setOperationAction(ISD::LOAD, MVT::v3i32, Custom);
   setOperationAction(ISD::LOAD, MVT::v4i32, Custom);
+  setOperationAction(ISD::LOAD, MVT::v5i32, Custom);
   setOperationAction(ISD::LOAD, MVT::v8i32, Custom);
   setOperationAction(ISD::LOAD, MVT::v16i32, Custom);
   setOperationAction(ISD::LOAD, MVT::i1, Custom);
@@ -163,6 +167,7 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
   setOperationAction(ISD::STORE, MVT::v2i32, Custom);
   setOperationAction(ISD::STORE, MVT::v3i32, Custom);
   setOperationAction(ISD::STORE, MVT::v4i32, Custom);
+  setOperationAction(ISD::STORE, MVT::v5i32, Custom);
   setOperationAction(ISD::STORE, MVT::v8i32, Custom);
   setOperationAction(ISD::STORE, MVT::v16i32, Custom);
   setOperationAction(ISD::STORE, MVT::i1, Custom);
@@ -336,6 +341,12 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
   setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v4i32, Expand);
   setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v4f32, Expand);
 
+  // Deal with vec5 vector operations when widened to vec8.
+  setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v5i32, Expand);
+  setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v5f32, Expand);
+  setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v8i32, Expand);
+  setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v8f32, Expand);
+
   // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling,
   // and output demarshalling
   setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i32, Custom);
@@ -9688,6 +9699,9 @@ SITargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
     case 128:
       RC = &AMDGPU::SReg_128RegClass;
       break;
+    case 160:
+      RC = &AMDGPU::SReg_160RegClass;
+      break;
     case 256:
       RC = &AMDGPU::SReg_256RegClass;
      break;
@@ -9713,6 +9727,9 @@ SITargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
     case 128:
      RC = &AMDGPU::VReg_128RegClass;
       break;
+    case 160:
+      RC = &AMDGPU::VReg_160RegClass;
+      break;
     case 256:
       RC = &AMDGPU::VReg_256RegClass;
       break;
diff --git a/lib/Target/AMDGPU/SIInstrInfo.cpp b/lib/Target/AMDGPU/SIInstrInfo.cpp
index 8a35ef01457..74e15dbb881 100644
--- a/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -845,6 +845,8 @@ static unsigned getSGPRSpillSaveOpcode(unsigned Size) {
     return AMDGPU::SI_SPILL_S96_SAVE;
   case 16:
     return AMDGPU::SI_SPILL_S128_SAVE;
+  case 20:
+    return AMDGPU::SI_SPILL_S160_SAVE;
   case 32:
     return AMDGPU::SI_SPILL_S256_SAVE;
   case 64:
@@ -864,6 +866,8 @@ static unsigned getVGPRSpillSaveOpcode(unsigned Size) {
     return AMDGPU::SI_SPILL_V96_SAVE;
   case 16:
     return AMDGPU::SI_SPILL_V128_SAVE;
+  case 20:
+    return AMDGPU::SI_SPILL_V160_SAVE;
   case 32:
     return AMDGPU::SI_SPILL_V256_SAVE;
   case 64:
@@ -949,6 +953,8 @@ static unsigned getSGPRSpillRestoreOpcode(unsigned Size) {
     return AMDGPU::SI_SPILL_S96_RESTORE;
   case 16:
     return AMDGPU::SI_SPILL_S128_RESTORE;
+  case 20:
+    return AMDGPU::SI_SPILL_S160_RESTORE;
   case 32:
     return AMDGPU::SI_SPILL_S256_RESTORE;
   case 64:
@@ -968,6 +974,8 @@ static unsigned getVGPRSpillRestoreOpcode(unsigned Size) {
     return AMDGPU::SI_SPILL_V96_RESTORE;
   case 16:
     return AMDGPU::SI_SPILL_V128_RESTORE;
+  case 20:
+    return AMDGPU::SI_SPILL_V160_RESTORE;
   case 32:
     return AMDGPU::SI_SPILL_V256_RESTORE;
   case 64:
diff --git a/lib/Target/AMDGPU/SIInstructions.td b/lib/Target/AMDGPU/SIInstructions.td
index 9ab755c6785..869deb93679 100644
--- a/lib/Target/AMDGPU/SIInstructions.td
+++ b/lib/Target/AMDGPU/SIInstructions.td
@@ -481,6 +481,7 @@ defm SI_SPILL_S32 : SI_SPILL_SGPR ;
 defm SI_SPILL_S64 : SI_SPILL_SGPR ;
 defm SI_SPILL_S96 : SI_SPILL_SGPR ;
 defm SI_SPILL_S128 : SI_SPILL_SGPR ;
+defm SI_SPILL_S160 : SI_SPILL_SGPR <SReg_160>;
 defm SI_SPILL_S256 : SI_SPILL_SGPR ;
 defm SI_SPILL_S512 : SI_SPILL_SGPR ;
 
@@ -514,6 +515,7 @@ defm SI_SPILL_V32 : SI_SPILL_VGPR ;
 defm SI_SPILL_V64 : SI_SPILL_VGPR ;
 defm SI_SPILL_V96 : SI_SPILL_VGPR ;
 defm SI_SPILL_V128 : SI_SPILL_VGPR ;
+defm SI_SPILL_V160 : SI_SPILL_VGPR <VReg_160>;
 defm SI_SPILL_V256 : SI_SPILL_VGPR ;
 defm SI_SPILL_V512 : SI_SPILL_VGPR ;
 
@@ -771,6 +773,22 @@ foreach Index = 0-3 in {
   >;
 }
+foreach Index = 0-4 in {
+  def Extract_Element_v5i32_#Index : Extract_Element <
+    i32, v5i32, Index, !cast<SubRegIndex>(sub#Index)
+  >;
+  def Insert_Element_v5i32_#Index : Insert_Element <
+    i32, v5i32, Index, !cast<SubRegIndex>(sub#Index)
+  >;
+
+  def Extract_Element_v5f32_#Index : Extract_Element <
+    f32, v5f32, Index, !cast<SubRegIndex>(sub#Index)
+  >;
+  def Insert_Element_v5f32_#Index : Insert_Element <
+    f32, v5f32, Index, !cast<SubRegIndex>(sub#Index)
+  >;
+}
+
 foreach Index = 0-7 in {
   def Extract_Element_v8i32_#Index : Extract_Element <
     i32, v8i32, Index, !cast<SubRegIndex>(sub#Index)
   >;
@@ -900,6 +918,10 @@ def : BitConvert ;
 def : BitConvert ;
 def : BitConvert ;
 
+// 160-bit bitcast
+def : BitConvert <v5i32, v5f32, VReg_160>;
+def : BitConvert <v5f32, v5i32, VReg_160>;
+
 // 256-bit bitcast
 def : BitConvert ;
 def : BitConvert ;
diff --git a/lib/Target/AMDGPU/SIRegisterInfo.cpp b/lib/Target/AMDGPU/SIRegisterInfo.cpp
index 294aa7b8b25..3346d303a1e 100644
--- a/lib/Target/AMDGPU/SIRegisterInfo.cpp
+++ b/lib/Target/AMDGPU/SIRegisterInfo.cpp
@@ -410,6 +410,11 @@ static unsigned getNumSubRegsForSpillOp(unsigned Op) {
   case AMDGPU::SI_SPILL_V256_SAVE:
   case AMDGPU::SI_SPILL_V256_RESTORE:
     return 8;
+  case AMDGPU::SI_SPILL_S160_SAVE:
+  case AMDGPU::SI_SPILL_S160_RESTORE:
+  case AMDGPU::SI_SPILL_V160_SAVE:
+  case AMDGPU::SI_SPILL_V160_RESTORE:
+    return 5;
   case AMDGPU::SI_SPILL_S128_SAVE:
   case AMDGPU::SI_SPILL_S128_RESTORE:
   case AMDGPU::SI_SPILL_V128_SAVE:
@@ -979,6 +984,7 @@ bool SIRegisterInfo::eliminateSGPRToVGPRSpillFrameIndex(
   switch (MI->getOpcode()) {
   case AMDGPU::SI_SPILL_S512_SAVE:
   case AMDGPU::SI_SPILL_S256_SAVE:
+  case AMDGPU::SI_SPILL_S160_SAVE:
   case AMDGPU::SI_SPILL_S128_SAVE:
   case AMDGPU::SI_SPILL_S96_SAVE:
   case AMDGPU::SI_SPILL_S64_SAVE:
@@ -986,6 +992,7 @@ bool SIRegisterInfo::eliminateSGPRToVGPRSpillFrameIndex(
     return spillSGPR(MI, FI, RS, true);
   case AMDGPU::SI_SPILL_S512_RESTORE:
   case AMDGPU::SI_SPILL_S256_RESTORE:
+  case AMDGPU::SI_SPILL_S160_RESTORE:
   case AMDGPU::SI_SPILL_S128_RESTORE:
   case AMDGPU::SI_SPILL_S96_RESTORE:
   case AMDGPU::SI_SPILL_S64_RESTORE:
@@ -1015,6 +1022,7 @@ void SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
     // SGPR register spill
   case AMDGPU::SI_SPILL_S512_SAVE:
   case AMDGPU::SI_SPILL_S256_SAVE:
+  case AMDGPU::SI_SPILL_S160_SAVE:
   case AMDGPU::SI_SPILL_S128_SAVE:
   case AMDGPU::SI_SPILL_S96_SAVE:
   case AMDGPU::SI_SPILL_S64_SAVE:
@@ -1026,6 +1034,7 @@ void SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
     // SGPR register restore
   case AMDGPU::SI_SPILL_S512_RESTORE:
   case AMDGPU::SI_SPILL_S256_RESTORE:
+  case AMDGPU::SI_SPILL_S160_RESTORE:
   case AMDGPU::SI_SPILL_S128_RESTORE:
   case AMDGPU::SI_SPILL_S96_RESTORE:
   case AMDGPU::SI_SPILL_S64_RESTORE:
@@ -1037,6 +1046,7 @@ void SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
     // VGPR register spill
   case AMDGPU::SI_SPILL_V512_SAVE:
   case AMDGPU::SI_SPILL_V256_SAVE:
+  case AMDGPU::SI_SPILL_V160_SAVE:
   case AMDGPU::SI_SPILL_V128_SAVE:
   case AMDGPU::SI_SPILL_V96_SAVE:
   case AMDGPU::SI_SPILL_V64_SAVE:
@@ -1059,6 +1069,7 @@ void SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
   case AMDGPU::SI_SPILL_V64_RESTORE:
   case AMDGPU::SI_SPILL_V96_RESTORE:
   case AMDGPU::SI_SPILL_V128_RESTORE:
+  case AMDGPU::SI_SPILL_V160_RESTORE:
   case AMDGPU::SI_SPILL_V256_RESTORE:
   case AMDGPU::SI_SPILL_V512_RESTORE: {
     const MachineOperand *VData = TII->getNamedOperand(*MI,
@@ -1251,6 +1262,8 @@ const TargetRegisterClass *SIRegisterInfo::getPhysRegClass(unsigned Reg) const {
     &AMDGPU::SReg_96RegClass,
     &AMDGPU::VReg_128RegClass,
     &AMDGPU::SReg_128RegClass,
+    &AMDGPU::VReg_160RegClass,
+    &AMDGPU::SReg_160RegClass,
     &AMDGPU::VReg_256RegClass,
     &AMDGPU::SReg_256RegClass,
     &AMDGPU::VReg_512RegClass,
@@ -1283,6 +1296,8 @@ bool SIRegisterInfo::hasVGPRs(const TargetRegisterClass *RC) const {
     return getCommonSubClass(&AMDGPU::VReg_96RegClass, RC) != nullptr;
   case 128:
     return getCommonSubClass(&AMDGPU::VReg_128RegClass, RC) != nullptr;
+  case 160:
+    return getCommonSubClass(&AMDGPU::VReg_160RegClass, RC) != nullptr;
   case 256:
     return getCommonSubClass(&AMDGPU::VReg_256RegClass, RC) != nullptr;
   case 512:
@@ -1303,6 +1318,8 @@ const TargetRegisterClass *SIRegisterInfo::getEquivalentVGPRClass(
     return &AMDGPU::VReg_96RegClass;
   case 128:
     return &AMDGPU::VReg_128RegClass;
+  case 160:
+    return &AMDGPU::VReg_160RegClass;
   case 256:
     return &AMDGPU::VReg_256RegClass;
   case 512:
@@ -1323,6 +1340,8 @@ const TargetRegisterClass *SIRegisterInfo::getEquivalentSGPRClass(
     return &AMDGPU::SReg_96RegClass;
   case 128:
     return &AMDGPU::SReg_128RegClass;
+  case 160:
+    return &AMDGPU::SReg_160RegClass;
   case 256:
     return &AMDGPU::SReg_256RegClass;
   case 512:
@@ -1349,6 +1368,8 @@ const TargetRegisterClass *SIRegisterInfo::getSubRegClass(
       return &AMDGPU::SReg_96RegClass;
     case 4:
       return &AMDGPU::SReg_128RegClass;
+    case 5:
+      return &AMDGPU::SReg_160RegClass;
     case 8:
       return &AMDGPU::SReg_256RegClass;
     case 16: /* fall-through */
@@ -1365,6 +1386,8 @@ const TargetRegisterClass *SIRegisterInfo::getSubRegClass(
       return &AMDGPU::VReg_96RegClass;
     case 4:
       return &AMDGPU::VReg_128RegClass;
+    case 5:
+      return &AMDGPU::VReg_160RegClass;
     case 8:
       return &AMDGPU::VReg_256RegClass;
     case 16: /* fall-through */
@@ -1427,6 +1450,10 @@ ArrayRef<int16_t> SIRegisterInfo::getRegSplitParts(const TargetRegisterClass *RC
     AMDGPU::sub4, AMDGPU::sub5, AMDGPU::sub6, AMDGPU::sub7,
   };
 
+  static const int16_t Sub0_4[] = {
+    AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, AMDGPU::sub3, AMDGPU::sub4,
+  };
+
   static const int16_t Sub0_3[] = {
     AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, AMDGPU::sub3,
   };
@@ -1448,6 +1475,8 @@ ArrayRef<int16_t> SIRegisterInfo::getRegSplitParts(const TargetRegisterClass *RC
     return makeArrayRef(Sub0_2);
   case 128:
     return makeArrayRef(Sub0_3);
+  case 160:
+    return makeArrayRef(Sub0_4);
   case 256:
     return makeArrayRef(Sub0_7);
   case 512:
@@ -1618,6 +1647,9 @@ SIRegisterInfo::getConstrainedRegClassForOperand(const MachineOperand &MO,
   case 128:
     return RB->getID() == AMDGPU::VGPRRegBankID ? &AMDGPU::VReg_128RegClass :
                                                   &AMDGPU::SReg_128RegClass;
+  case 160:
+    return RB->getID() == AMDGPU::VGPRRegBankID ? &AMDGPU::VReg_160RegClass :
+                                                  &AMDGPU::SReg_160RegClass;
   case 256:
     return RB->getID() == AMDGPU::VGPRRegBankID ? &AMDGPU::VReg_256RegClass :
                                                   &AMDGPU::SReg_256RegClass;
diff --git a/lib/Target/AMDGPU/SIRegisterInfo.td b/lib/Target/AMDGPU/SIRegisterInfo.td
index 7707e5af1b9..2f2dc4b41c9 100644
--- a/lib/Target/AMDGPU/SIRegisterInfo.td
+++ b/lib/Target/AMDGPU/SIRegisterInfo.td
@@ -14,6 +14,7 @@ class getSubRegs<int size> {
   list<SubRegIndex> ret2 = [sub0, sub1];
   list<SubRegIndex> ret3 = [sub0, sub1, sub2];
   list<SubRegIndex> ret4 = [sub0, sub1, sub2, sub3];
+  list<SubRegIndex> ret5 = [sub0, sub1, sub2, sub3, sub4];
   list<SubRegIndex> ret8 = [sub0, sub1, sub2, sub3, sub4, sub5, sub6, sub7];
   list<SubRegIndex> ret16 = [sub0, sub1, sub2, sub3,
                              sub4, sub5, sub6, sub7,
@@ -23,7 +24,8 @@ class getSubRegs<int size> {
   list<SubRegIndex> ret = !if(!eq(size, 2), ret2,
                               !if(!eq(size, 3), ret3,
                                   !if(!eq(size, 4), ret4,
-                                      !if(!eq(size, 8), ret8, ret16))));
+                                      !if(!eq(size, 5), ret5,
+                                          !if(!eq(size, 8), ret8, ret16)))));
 }
 
 //===----------------------------------------------------------------------===//
@@ -190,6 +192,14 @@ def SGPR_128Regs : RegisterTuples<getSubRegs<4>.ret,
   (add (decimate (shl SGPR_32, 2), 4)),
   (add (decimate (shl SGPR_32, 3), 4))]>;
 
+// SGPR 160-bit registers. No operations use these, but for symmetry with 160-bit VGPRs.
+def SGPR_160Regs : RegisterTuples<getSubRegs<5>.ret,
+  [(add (decimate SGPR_32, 4)),
+   (add (decimate (shl SGPR_32, 1), 4)),
+   (add (decimate (shl SGPR_32, 2), 4)),
+   (add (decimate (shl SGPR_32, 3), 4)),
+   (add (decimate (shl SGPR_32, 4), 4))]>;
+
 // SGPR 256-bit registers
 def SGPR_256Regs : RegisterTuples<getSubRegs<8>.ret,
   [(add (decimate SGPR_32, 4)),
@@ -372,6 +382,14 @@ def VGPR_128 : RegisterTuples<getSubRegs<4>.ret,
   (add (shl VGPR_32, 2)),
   (add (shl VGPR_32, 3))]>;
 
+// VGPR 160-bit registers
+def VGPR_160 : RegisterTuples<getSubRegs<5>.ret,
+  [(add (trunc VGPR_32, 252)),
+   (add (shl VGPR_32, 1)),
+   (add (shl VGPR_32, 2)),
+   (add (shl VGPR_32, 3)),
+   (add (shl VGPR_32, 4))]>;
+
 // VGPR 256-bit registers
 def VGPR_256 : RegisterTuples<getSubRegs<8>.ret,
   [(add (trunc VGPR_32, 249)),
@@ -505,6 +523,18 @@ def SReg_128 : RegisterClass<"AMDGPU", [v4i32, v4f32, v2i64, v2f64], 32,
 } // End CopyCost = 2
 
+// There are no 5-component scalar instructions, but this is needed
+// for symmetry with VGPRs.
+def SGPR_160 : RegisterClass<"AMDGPU", [v5i32, v5f32], 32,
+                             (add SGPR_160Regs)> {
+  let AllocationPriority = 12;
+}
+
+def SReg_160 : RegisterClass<"AMDGPU", [v5i32, v5f32], 32,
+                             (add SGPR_160)> {
+  let AllocationPriority = 12;
+}
+
 def SGPR_256 : RegisterClass<"AMDGPU", [v8i32, v8f32], 32,
                              (add SGPR_256Regs)> {
   let AllocationPriority = 13;
 }
@@ -565,6 +595,14 @@ def VReg_128 : RegisterClass<"AMDGPU", [v4i32, v4f32, v2i64, v2f64], 32, (add VG
   let AllocationPriority = 4;
 }
 
+def VReg_160 : RegisterClass<"AMDGPU", [v5i32, v5f32], 32, (add VGPR_160)> {
+  let Size = 160;
+
+  // Requires 5 v_mov_b32 to copy
+  let CopyCost = 5;
+  let AllocationPriority = 5;
+}
+
 def VReg_256 : RegisterClass<"AMDGPU", [v8i32, v8f32], 32, (add VGPR_256)> {
   let Size = 256;
   let CopyCost = 8;
diff --git a/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
index 3e1cf68e265..b397554e76d 100644
--- a/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
+++ b/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
@@ -822,6 +822,10 @@ unsigned getRegBitWidth(unsigned RCID) {
   case AMDGPU::SReg_128RegClassID:
   case AMDGPU::VReg_128RegClassID:
     return 128;
+  case AMDGPU::SGPR_160RegClassID:
+  case AMDGPU::SReg_160RegClassID:
+  case AMDGPU::VReg_160RegClassID:
+    return 160;
   case AMDGPU::SReg_256RegClassID:
   case AMDGPU::VReg_256RegClassID:
     return 256;
diff --git a/test/CodeGen/AMDGPU/select-vectors.ll b/test/CodeGen/AMDGPU/select-vectors.ll
index 26923f8c3eb..4c136d09c4a 100644
--- a/test/CodeGen/AMDGPU/select-vectors.ll
+++ b/test/CodeGen/AMDGPU/select-vectors.ll
@@ -281,6 +281,23 @@ bb:
   ret void
 }
 
+; GCN-LABEL: {{^}}s_select_v5f32:
+; GCN: v_cmp_eq_u32_e64 vcc, s{{[0-9]+}}, 0{{$}}
+
+; GCN: v_cndmask_b32_e32
+; GCN: v_cndmask_b32_e32
+; GCN: v_cndmask_b32_e32
+; GCN: v_cndmask_b32_e32
+; GCN: v_cndmask_b32_e32
+
+; GCN: buffer_store_dwordx
+define amdgpu_kernel void @s_select_v5f32(<5 x float> addrspace(1)* %out, <5 x float> %a, <5 x float> %b, i32 %c) #0 {
+  %cmp = icmp eq i32 %c, 0
+  %select = select i1 %cmp, <5 x float> %a, <5 x float> %b
+  store <5 x float> %select, <5 x float> addrspace(1)* %out, align 16
+  ret void
+}
+
 ; GCN-LABEL: {{^}}select_v8f32:
 ; GCN: v_cndmask_b32_e32
 ; GCN: v_cndmask_b32_e32
diff --git a/test/CodeGen/AMDGPU/spill-wide-sgpr.ll b/test/CodeGen/AMDGPU/spill-wide-sgpr.ll
index 5a55ce51440..920f503fcf3 100644
--- a/test/CodeGen/AMDGPU/spill-wide-sgpr.ll
+++ b/test/CodeGen/AMDGPU/spill-wide-sgpr.ll
@@ -139,6 +139,66 @@ ret:
   ret void
 }
 
+; ALL-LABEL: {{^}}spill_sgpr_x5:
+; SMEM: s_add_u32 m0, s3, 0x100{{$}}
+; SMEM: s_buffer_store_dword s
+; SMEM: s_buffer_store_dword s
+; SMEM: s_buffer_store_dword s
+; SMEM: s_buffer_store_dword s
+; SMEM: s_buffer_store_dword s
+; SMEM: s_cbranch_scc1
+
+; SMEM: s_add_u32 m0, s3, 0x100{{$}}
+; SMEM: s_buffer_load_dword s
+; SMEM: s_buffer_load_dword s
+; SMEM: s_buffer_load_dword s
+; SMEM: s_buffer_load_dword s
+; SMEM: s_buffer_load_dword s
+; SMEM: s_dcache_wb
+; SMEM: s_endpgm
+
+; FIXME: Should only need 4 bytes
+; SMEM: ScratchSize: 24
+
+; VGPR: v_writelane_b32 v{{[0-9]+}}, s{{[0-9]+}}, 0
+; VGPR: v_writelane_b32 v{{[0-9]+}}, s{{[0-9]+}}, 1
+; VGPR: v_writelane_b32 v{{[0-9]+}}, s{{[0-9]+}}, 2
+; VGPR: v_writelane_b32 v{{[0-9]+}}, s{{[0-9]+}}, 3
+; VGPR: v_writelane_b32 v{{[0-9]+}}, s{{[0-9]+}}, 4
+; VGPR: s_cbranch_scc1
+
+; VGPR: v_readlane_b32 s{{[0-9]+}}, v{{[0-9]+}}, 0
+; VGPR: v_readlane_b32 s{{[0-9]+}}, v{{[0-9]+}}, 1
+; VGPR: v_readlane_b32 s{{[0-9]+}}, v{{[0-9]+}}, 2
+; VGPR: v_readlane_b32 s{{[0-9]+}}, v{{[0-9]+}}, 3
+; VGPR: v_readlane_b32 s{{[0-9]+}}, v{{[0-9]+}}, 4
+
+
+; VMEM: buffer_store_dword
+; VMEM: buffer_store_dword
+; VMEM: buffer_store_dword
+; VMEM: buffer_store_dword
+; VMEM: buffer_store_dword
+; VMEM: s_cbranch_scc1
+
+; VMEM: buffer_load_dword
+; VMEM: buffer_load_dword
+; VMEM: buffer_load_dword
+; VMEM: buffer_load_dword
+; VMEM: buffer_load_dword
+define amdgpu_kernel void @spill_sgpr_x5(i32 addrspace(1)* %out, i32 %in) #0 {
+  %wide.sgpr = call <5 x i32> asm sideeffect "; def $0", "=s" () #0
+  %cmp = icmp eq i32 %in, 0
+  br i1 %cmp, label %bb0, label %ret
+
+bb0:
+  call void asm sideeffect "; use $0", "s"(<5 x i32> %wide.sgpr) #0
+  br label %ret
+
+ret:
+  ret void
+}
+
 ; ALL-LABEL: {{^}}spill_sgpr_x8:
 ; SMEM: s_add_u32 m0, s3, 0x100{{$}}
-- 
2.50.1
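
Editor's note, not part of the patch: the stand-alone IR fragment below is a minimal sketch of the kind of function signature the new CC_AMDGPU_Func entries cover. Under the calling convention changed above, a <5 x float> argument to a callable function is passed via allocateVGPRTuple when VGPRs are available, or in a 20-byte, 4-byte-aligned stack slot otherwise. The function name is made up for illustration.

; Illustrative sketch only (hypothetical function name, not from the patch).
define float @take_v5f32(<5 x float> %v) {
  ; The <5 x float> argument is covered by the new v5f32 calling-convention
  ; entries; reading the last element touches the fifth dword of the tuple.
  %e = extractelement <5 x float> %v, i32 4
  ret float %e
}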