From ae152bde495ec05ab6064e5336f3fd5344dbe0e7 Mon Sep 17 00:00:00 2001 From: Tom Stellard Date: Fri, 4 Nov 2016 13:06:34 +0000 Subject: [PATCH] Revert "AMDGPU: Add VI i16 support" This reverts commit r285939 and r285948. These broke some conformance tests. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@285995 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Target/AMDGPU/AMDGPU.td | 2 - lib/Target/AMDGPU/AMDGPUISelLowering.cpp | 27 +- lib/Target/AMDGPU/AMDGPUInstructions.td | 6 +- lib/Target/AMDGPU/BUFInstructions.td | 51 +--- lib/Target/AMDGPU/DSInstructions.td | 10 +- lib/Target/AMDGPU/FLATInstructions.td | 6 - lib/Target/AMDGPU/SIISelLowering.cpp | 76 +---- lib/Target/AMDGPU/SIInstrInfo.td | 1 + lib/Target/AMDGPU/SIInstructions.td | 105 ++++--- lib/Target/AMDGPU/SIRegisterInfo.td | 11 +- lib/Target/AMDGPU/SOPInstructions.td | 37 +-- lib/Target/AMDGPU/VIInstructions.td | 4 - lib/Target/AMDGPU/VOP1Instructions.td | 47 +-- lib/Target/AMDGPU/VOP2Instructions.td | 72 ----- lib/Target/AMDGPU/VOP3Instructions.td | 32 --- test/CodeGen/AMDGPU/add.i16.ll | 149 ---------- test/CodeGen/AMDGPU/anyext.ll | 45 +-- test/CodeGen/AMDGPU/bitreverse.ll | 3 +- test/CodeGen/AMDGPU/cgp-bitfield-extract.ll | 9 +- test/CodeGen/AMDGPU/copy-illegal-type.ll | 141 +++++---- test/CodeGen/AMDGPU/ctlz.ll | 1 - test/CodeGen/AMDGPU/ctlz_zero_undef.ll | 4 +- test/CodeGen/AMDGPU/cube.ll | 8 +- test/CodeGen/AMDGPU/cvt_f32_ubyte.ll | 206 +++++++------ test/CodeGen/AMDGPU/global-extload-i16.ll | 302 -------------------- test/CodeGen/AMDGPU/half.ll | 36 +-- test/CodeGen/AMDGPU/llvm.AMDGPU.bfe.u32.ll | 12 +- test/CodeGen/AMDGPU/load-constant-i16.ll | 15 +- test/CodeGen/AMDGPU/load-global-i16.ll | 15 +- test/CodeGen/AMDGPU/load-global-i8.ll | 20 +- test/CodeGen/AMDGPU/load-local-i16.ll | 12 +- test/CodeGen/AMDGPU/load-local-i8.ll | 20 +- test/CodeGen/AMDGPU/mad_uint24.ll | 20 +- test/CodeGen/AMDGPU/max.i16.ll | 87 ------ test/CodeGen/AMDGPU/mul_uint24-amdgcn.ll | 12 +- test/CodeGen/AMDGPU/shl.ll | 42 --- test/CodeGen/AMDGPU/sign_extend.ll | 46 +-- test/CodeGen/AMDGPU/sra.ll | 30 -- test/CodeGen/AMDGPU/sub.ll | 40 --- test/CodeGen/AMDGPU/trunc-bitcast-vector.ll | 11 +- test/CodeGen/AMDGPU/trunc-store-i1.ll | 11 +- test/CodeGen/AMDGPU/zero_extend.ll | 47 +-- 42 files changed, 341 insertions(+), 1490 deletions(-) delete mode 100644 test/CodeGen/AMDGPU/add.i16.ll delete mode 100644 test/CodeGen/AMDGPU/global-extload-i16.ll delete mode 100644 test/CodeGen/AMDGPU/max.i16.ll diff --git a/lib/Target/AMDGPU/AMDGPU.td b/lib/Target/AMDGPU/AMDGPU.td index 7a208d7c09a..e7d6ef3fd81 100644 --- a/lib/Target/AMDGPU/AMDGPU.td +++ b/lib/Target/AMDGPU/AMDGPU.td @@ -493,8 +493,6 @@ def isCIVI : Predicate < def HasFlatAddressSpace : Predicate<"Subtarget->hasFlatAddressSpace()">; -def Has16BitInsts : Predicate<"Subtarget->has16BitInsts()">; - class PredicateControl { Predicate SubtargetPredicate; Predicate SIAssemblerPredicate = isSICI; diff --git a/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/lib/Target/AMDGPU/AMDGPUISelLowering.cpp index 118f8fabb7d..d7108416ccc 100644 --- a/lib/Target/AMDGPU/AMDGPUISelLowering.cpp +++ b/lib/Target/AMDGPU/AMDGPUISelLowering.cpp @@ -586,32 +586,19 @@ bool AMDGPUTargetLowering::aggressivelyPreferBuildVectorSources(EVT VecVT) const bool AMDGPUTargetLowering::isTruncateFree(EVT Source, EVT Dest) const { // Truncate is just accessing a subregister. 
- - unsigned SrcSize = Source.getSizeInBits(); - unsigned DestSize = Dest.getSizeInBits(); - - return DestSize < SrcSize && DestSize % 32 == 0; + return Dest.bitsLT(Source) && (Dest.getSizeInBits() % 32 == 0); } bool AMDGPUTargetLowering::isTruncateFree(Type *Source, Type *Dest) const { // Truncate is just accessing a subregister. - - unsigned SrcSize = Source->getScalarSizeInBits(); - unsigned DestSize = Dest->getScalarSizeInBits(); - - if (DestSize == 16 && Subtarget->has16BitInsts()) - return SrcSize >= 32; - - return DestSize < SrcSize && DestSize % 32 == 0; + return Dest->getPrimitiveSizeInBits() < Source->getPrimitiveSizeInBits() && + (Dest->getPrimitiveSizeInBits() % 32 == 0); } bool AMDGPUTargetLowering::isZExtFree(Type *Src, Type *Dest) const { unsigned SrcSize = Src->getScalarSizeInBits(); unsigned DestSize = Dest->getScalarSizeInBits(); - if (SrcSize == 16 && Subtarget->has16BitInsts()) - return DestSize >= 32; - return SrcSize == 32 && DestSize == 64; } @@ -620,10 +607,6 @@ bool AMDGPUTargetLowering::isZExtFree(EVT Src, EVT Dest) const { // practical purposes, the extra mov 0 to load a 64-bit is free. As used, // this will enable reducing 64-bit operations to 32-bit, which is always // good. - - if (Src == MVT::i16) - return Dest == MVT::i32 || Dest == MVT::i64; - return Src == MVT::i32 && Dest == MVT::i64; } @@ -2463,10 +2446,6 @@ SDValue AMDGPUTargetLowering::performMulCombine(SDNode *N, if (VT.isVector() || Size > 64) return SDValue(); - // There are i16 integer mul/mad. - if (Subtarget->has16BitInsts() && VT.getScalarType().bitsLE(MVT::i16)) - return SDValue(); - SelectionDAG &DAG = DCI.DAG; SDLoc DL(N); diff --git a/lib/Target/AMDGPU/AMDGPUInstructions.td b/lib/Target/AMDGPU/AMDGPUInstructions.td index c2544c295e3..cc9cce5468a 100644 --- a/lib/Target/AMDGPU/AMDGPUInstructions.td +++ b/lib/Target/AMDGPU/AMDGPUInstructions.td @@ -529,14 +529,14 @@ multiclass BFIPatterns ; def : Pat < (f64 (fcopysign f64:$src0, f64:$src1)), (REG_SEQUENCE RC64, (i32 (EXTRACT_SUBREG $src0, sub0)), sub0, - (BFI_INT (LoadImm32 (i32 0x7fffffff)), + (BFI_INT (LoadImm32 0x7fffffff), (i32 (EXTRACT_SUBREG $src0, sub1)), (i32 (EXTRACT_SUBREG $src1, sub1))), sub1) >; @@ -545,7 +545,7 @@ multiclass BFIPatterns ; diff --git a/lib/Target/AMDGPU/BUFInstructions.td b/lib/Target/AMDGPU/BUFInstructions.td index 928b5d2d5d3..42d16a53284 100644 --- a/lib/Target/AMDGPU/BUFInstructions.td +++ b/lib/Target/AMDGPU/BUFInstructions.td @@ -708,13 +708,13 @@ let Predicates = [isGCN] in { // int_SI_vs_load_input def : Pat< (SIload_input v4i32:$tlst, imm:$attr_offset, i32:$buf_idx_vgpr), - (BUFFER_LOAD_FORMAT_XYZW_IDXEN $buf_idx_vgpr, $tlst, (i32 0), imm:$attr_offset, 0, 0, 0) + (BUFFER_LOAD_FORMAT_XYZW_IDXEN $buf_idx_vgpr, $tlst, 0, imm:$attr_offset, 0, 0, 0) >; // Offset in a 32-bit VGPR def : Pat < (SIload_constant v4i32:$sbase, i32:$voff), - (BUFFER_LOAD_DWORD_OFFEN $voff, $sbase, (i32 0), 0, 0, 0, 0) + (BUFFER_LOAD_DWORD_OFFEN $voff, $sbase, 0, 0, 0, 0, 0) >; @@ -914,7 +914,7 @@ def : Pat< >; -class MUBUFLoad_PatternADDR64 : Pat < (vt (constant_ld (MUBUFAddr64 v4i32:$srsrc, i64:$vaddr, i32:$soffset, i16:$offset, i1:$glc, i1:$slc, i1:$tfe))), @@ -936,34 +936,15 @@ multiclass MUBUFLoad_Atomic_Pattern ; -def : MUBUFLoad_PatternADDR64 ; -def : MUBUFLoad_PatternADDR64 ; -def : MUBUFLoad_PatternADDR64 ; +def : MUBUFLoad_Pattern ; +def : MUBUFLoad_Pattern ; +def : MUBUFLoad_Pattern ; +def : MUBUFLoad_Pattern ; defm : MUBUFLoad_Atomic_Pattern ; defm : MUBUFLoad_Atomic_Pattern ; } // End Predicates = [isSICI] -multiclass
MUBUFLoad_Pattern { - - def : Pat < - (vt (ld (MUBUFOffset v4i32:$srsrc, i32:$soffset, - i16:$offset, i1:$glc, i1:$slc, i1:$tfe))), - (Instr_OFFSET $srsrc, $soffset, $offset, $glc, $slc, $tfe) - >; -} - -let Predicates = [Has16BitInsts] in { - -defm : MUBUFLoad_Pattern ; -defm : MUBUFLoad_Pattern ; -defm : MUBUFLoad_Pattern ; -defm : MUBUFLoad_Pattern ; - -} // End Predicates = [Has16BitInsts] - class MUBUFScratchLoadPat : Pat < (vt (ld (MUBUFScratch v4i32:$srsrc, i32:$vaddr, i32:$soffset, u16imm:$offset))), @@ -972,8 +953,6 @@ class MUBUFScratchLoadPat : Pat < def : MUBUFScratchLoadPat ; def : MUBUFScratchLoadPat ; -def : MUBUFScratchLoadPat ; -def : MUBUFScratchLoadPat ; def : MUBUFScratchLoadPat ; def : MUBUFScratchLoadPat ; def : MUBUFScratchLoadPat ; @@ -1046,20 +1025,6 @@ defm : MUBUFStore_Atomic_Pattern ; } // End Predicates = [isSICI] - -multiclass MUBUFStore_Pattern { - - def : Pat < - (st vt:$vdata, (MUBUFOffset v4i32:$srsrc, i32:$soffset, - i16:$offset, i1:$glc, i1:$slc, i1:$tfe)), - (Instr_OFFSET $vdata, $srsrc, $soffset, $offset, $glc, $slc, $tfe) - >; -} - -defm : MUBUFStore_Pattern ; -defm : MUBUFStore_Pattern ; - class MUBUFScratchStorePat : Pat < (st vt:$value, (MUBUFScratch v4i32:$srsrc, i32:$vaddr, i32:$soffset, u16imm:$offset)), @@ -1068,8 +1033,6 @@ class MUBUFScratchStorePat : Pat def : MUBUFScratchStorePat ; def : MUBUFScratchStorePat ; -def : MUBUFScratchStorePat ; -def : MUBUFScratchStorePat ; def : MUBUFScratchStorePat ; def : MUBUFScratchStorePat ; def : MUBUFScratchStorePat ; diff --git a/lib/Target/AMDGPU/DSInstructions.td b/lib/Target/AMDGPU/DSInstructions.td index a077001df6b..54935bbde7f 100644 --- a/lib/Target/AMDGPU/DSInstructions.td +++ b/lib/Target/AMDGPU/DSInstructions.td @@ -489,12 +489,8 @@ class DSReadPat : Pat < def : DSReadPat ; def : DSReadPat ; -def : DSReadPat ; -def : DSReadPat ; -def : DSReadPat ; def : DSReadPat ; def : DSReadPat ; -def : DSReadPat ; def : DSReadPat ; let AddedComplexity = 100 in { @@ -516,8 +512,6 @@ class DSWritePat : Pat < def : DSWritePat ; def : DSWritePat ; -def : DSWritePat ; -def : DSWritePat ; def : DSWritePat ; let AddedComplexity = 100 in { @@ -528,8 +522,8 @@ def : DSWritePat ; def : Pat < (si_store_local v2i32:$value, (DS64Bit4ByteAligned i32:$ptr, i8:$offset0, i8:$offset1)), - (DS_WRITE2_B32 $ptr, (i32 (EXTRACT_SUBREG $value, sub0)), - (i32 (EXTRACT_SUBREG $value, sub1)), $offset0, $offset1, + (DS_WRITE2_B32 $ptr, (EXTRACT_SUBREG $value, sub0), + (EXTRACT_SUBREG $value, sub1), $offset0, $offset1, (i1 0)) >; diff --git a/lib/Target/AMDGPU/FLATInstructions.td b/lib/Target/AMDGPU/FLATInstructions.td index 69a3a7730ee..7b54c61dc21 100644 --- a/lib/Target/AMDGPU/FLATInstructions.td +++ b/lib/Target/AMDGPU/FLATInstructions.td @@ -341,8 +341,6 @@ let Predicates = [isCIVI] in { def : FlatLoadPat ; def : FlatLoadPat ; -def : FlatLoadPat ; -def : FlatLoadPat ; def : FlatLoadPat ; def : FlatLoadPat ; def : FlatLoadPat ; @@ -391,10 +389,6 @@ def : FlatAtomicPat ; } // End Predicates = [isCIVI] -let Predicates = [isVI] in { - def : FlatStorePat ; - def : FlatStorePat ; -} //===----------------------------------------------------------------------===// diff --git a/lib/Target/AMDGPU/SIISelLowering.cpp b/lib/Target/AMDGPU/SIISelLowering.cpp index ac13bd2b07e..3b84e386341 100644 --- a/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/lib/Target/AMDGPU/SIISelLowering.cpp @@ -78,9 +78,6 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, addRegisterClass(MVT::v16i32, &AMDGPU::SReg_512RegClass); addRegisterClass(MVT::v16f32, 
&AMDGPU::VReg_512RegClass); - if (Subtarget->has16BitInsts()) - addRegisterClass(MVT::i16, &AMDGPU::SReg_32RegClass); - computeRegisterProperties(STI.getRegisterInfo()); // We need to custom lower vector stores from local memory @@ -224,55 +221,6 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, setOperationAction(ISD::FDIV, MVT::f32, Custom); setOperationAction(ISD::FDIV, MVT::f64, Custom); - if (Subtarget->has16BitInsts()) { - setOperationAction(ISD::Constant, MVT::i16, Legal); - - setOperationAction(ISD::SMIN, MVT::i16, Legal); - setOperationAction(ISD::SMAX, MVT::i16, Legal); - - setOperationAction(ISD::UMIN, MVT::i16, Legal); - setOperationAction(ISD::UMAX, MVT::i16, Legal); - - setOperationAction(ISD::SETCC, MVT::i16, Promote); - AddPromotedToType(ISD::SETCC, MVT::i16, MVT::i32); - - setOperationAction(ISD::SIGN_EXTEND, MVT::i16, Promote); - AddPromotedToType(ISD::SIGN_EXTEND, MVT::i16, MVT::i32); - - setOperationAction(ISD::ROTR, MVT::i16, Promote); - setOperationAction(ISD::ROTL, MVT::i16, Promote); - - setOperationAction(ISD::SDIV, MVT::i16, Promote); - setOperationAction(ISD::UDIV, MVT::i16, Promote); - setOperationAction(ISD::SREM, MVT::i16, Promote); - setOperationAction(ISD::UREM, MVT::i16, Promote); - - setOperationAction(ISD::BSWAP, MVT::i16, Promote); - setOperationAction(ISD::BITREVERSE, MVT::i16, Promote); - - setOperationAction(ISD::CTTZ, MVT::i16, Promote); - setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i16, Promote); - setOperationAction(ISD::CTLZ, MVT::i16, Promote); - setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i16, Promote); - - setOperationAction(ISD::SELECT_CC, MVT::i16, Expand); - - setOperationAction(ISD::BR_CC, MVT::i16, Expand); - - setOperationAction(ISD::LOAD, MVT::i16, Custom); - - setTruncStoreAction(MVT::i64, MVT::i16, Expand); - - setOperationAction(ISD::UINT_TO_FP, MVT::i16, Promote); - AddPromotedToType(ISD::UINT_TO_FP, MVT::i16, MVT::i32); - setOperationAction(ISD::SINT_TO_FP, MVT::i16, Promote); - AddPromotedToType(ISD::SINT_TO_FP, MVT::i16, MVT::i32); - setOperationAction(ISD::FP16_TO_FP, MVT::i16, Promote); - AddPromotedToType(ISD::FP16_TO_FP, MVT::i16, MVT::i32); - setOperationAction(ISD::FP_TO_FP16, MVT::i16, Promote); - AddPromotedToType(ISD::FP_TO_FP16, MVT::i16, MVT::i32); - } - setTargetDAGCombine(ISD::FADD); setTargetDAGCombine(ISD::FSUB); setTargetDAGCombine(ISD::FMINNUM); @@ -2610,6 +2558,7 @@ SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const { EVT MemVT = Load->getMemoryVT(); if (ExtType == ISD::NON_EXTLOAD && MemVT.getSizeInBits() < 32) { + assert(MemVT == MVT::i1 && "Only i1 non-extloads expected"); // FIXME: Copied from PPC // First, load into 32 bits, then truncate to 1 bit. @@ -2617,10 +2566,8 @@ SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const { SDValue BasePtr = Load->getBasePtr(); MachineMemOperand *MMO = Load->getMemOperand(); - EVT RealMemVT = (MemVT == MVT::i1) ? MVT::i8 : MVT::i16; - SDValue NewLD = DAG.getExtLoad(ISD::EXTLOAD, DL, MVT::i32, Chain, - BasePtr, RealMemVT, MMO); + BasePtr, MVT::i8, MMO); SDValue Ops[] = { DAG.getNode(ISD::TRUNCATE, DL, MemVT, NewLD), @@ -3434,23 +3381,8 @@ static SDValue performIntMed3ImmCombine(SelectionDAG &DAG, const SDLoc &SL, } EVT VT = K0->getValueType(0); - - MVT NVT = MVT::i32; - unsigned ExtOp = Signed ? 
ISD::SIGN_EXTEND : ISD::ZERO_EXTEND; - - SDValue Tmp1, Tmp2, Tmp3; - Tmp1 = DAG.getNode(ExtOp, SL, NVT, Op0->getOperand(0)); - Tmp2 = DAG.getNode(ExtOp, SL, NVT, Op0->getOperand(1)); - Tmp3 = DAG.getNode(ExtOp, SL, NVT, Op1); - - if (VT == MVT::i16) { - Tmp1 = DAG.getNode(Signed ? AMDGPUISD::SMED3 : AMDGPUISD::UMED3, SL, NVT, - Tmp1, Tmp2, Tmp3); - - return DAG.getNode(ISD::TRUNCATE, SL, VT, Tmp1); - } else - return DAG.getNode(Signed ? AMDGPUISD::SMED3 : AMDGPUISD::UMED3, SL, VT, - Op0.getOperand(0), SDValue(K0, 0), SDValue(K1, 0)); + return DAG.getNode(Signed ? AMDGPUISD::SMED3 : AMDGPUISD::UMED3, SL, VT, + Op0.getOperand(0), SDValue(K0, 0), SDValue(K1, 0)); } static bool isKnownNeverSNan(SelectionDAG &DAG, SDValue Op) { diff --git a/lib/Target/AMDGPU/SIInstrInfo.td b/lib/Target/AMDGPU/SIInstrInfo.td index d770bd425c4..f19e99e7cd1 100644 --- a/lib/Target/AMDGPU/SIInstrInfo.td +++ b/lib/Target/AMDGPU/SIInstrInfo.td @@ -1128,6 +1128,7 @@ def getAtomicNoRetOp : InstrMapping { include "SIInstructions.td" include "CIInstructions.td" +include "VIInstructions.td" include "DSInstructions.td" include "MIMGInstructions.td" diff --git a/lib/Target/AMDGPU/SIInstructions.td b/lib/Target/AMDGPU/SIInstructions.td index b758a576047..4122eb915f3 100644 --- a/lib/Target/AMDGPU/SIInstructions.td +++ b/lib/Target/AMDGPU/SIInstructions.td @@ -374,7 +374,7 @@ def : Pat< def : Pat < (int_AMDGPU_kilp), - (SI_KILL (i32 0xbf800000)) + (SI_KILL 0xbf800000) >; def : Pat < @@ -555,7 +555,7 @@ def : BitConvert ; def : Pat < (AMDGPUclamp (VOP3Mods0Clamp f32:$src0, i32:$src0_modifiers, i32:$omod), (f32 FP_ZERO), (f32 FP_ONE)), - (V_ADD_F32_e64 $src0_modifiers, $src0, 0, (i32 0), 1, $omod) + (V_ADD_F32_e64 $src0_modifiers, $src0, 0, 0, 1, $omod) >; /********** ================================ **********/ @@ -566,7 +566,7 @@ def : Pat < def : Pat < (fneg (fabs f32:$src)), - (S_OR_B32 $src, (S_MOV_B32(i32 0x80000000))) // Set sign bit + (S_OR_B32 $src, (S_MOV_B32 0x80000000)) // Set sign bit >; // FIXME: Should use S_OR_B32 @@ -575,19 +575,19 @@ def : Pat < (REG_SEQUENCE VReg_64, (i32 (EXTRACT_SUBREG f64:$src, sub0)), sub0, - (V_OR_B32_e32 (i32 (EXTRACT_SUBREG f64:$src, sub1)), - (V_MOV_B32_e32 (i32 0x80000000))), // Set sign bit. + (V_OR_B32_e32 (EXTRACT_SUBREG f64:$src, sub1), + (V_MOV_B32_e32 0x80000000)), // Set sign bit. sub1) >; def : Pat < (fabs f32:$src), - (V_AND_B32_e64 $src, (V_MOV_B32_e32 (i32 0x7fffffff))) + (V_AND_B32_e64 $src, (V_MOV_B32_e32 0x7fffffff)) >; def : Pat < (fneg f32:$src), - (V_XOR_B32_e32 $src, (V_MOV_B32_e32 (i32 0x80000000))) + (V_XOR_B32_e32 $src, (V_MOV_B32_e32 0x80000000)) >; def : Pat < @@ -595,8 +595,8 @@ def : Pat < (REG_SEQUENCE VReg_64, (i32 (EXTRACT_SUBREG f64:$src, sub0)), sub0, - (V_AND_B32_e64 (i32 (EXTRACT_SUBREG f64:$src, sub1)), - (V_MOV_B32_e32 (i32 0x7fffffff))), // Set sign bit. + (V_AND_B32_e64 (EXTRACT_SUBREG f64:$src, sub1), + (V_MOV_B32_e32 0x7fffffff)), // Set sign bit. 
sub1) >; @@ -605,8 +605,8 @@ def : Pat < (REG_SEQUENCE VReg_64, (i32 (EXTRACT_SUBREG f64:$src, sub0)), sub0, - (V_XOR_B32_e32 (i32 (EXTRACT_SUBREG f64:$src, sub1)), - (i32 (V_MOV_B32_e32 (i32 0x80000000)))), + (V_XOR_B32_e32 (EXTRACT_SUBREG f64:$src, sub1), + (V_MOV_B32_e32 0x80000000)), sub1) >; @@ -666,21 +666,21 @@ def : POW_Common ; def : Pat < (int_AMDGPU_cube v4f32:$src), (REG_SEQUENCE VReg_128, - (V_CUBETC_F32 0 /* src0_modifiers */, (f32 (EXTRACT_SUBREG $src, sub0)), - 0 /* src1_modifiers */, (f32 (EXTRACT_SUBREG $src, sub1)), - 0 /* src2_modifiers */, (f32 (EXTRACT_SUBREG $src, sub2)), + (V_CUBETC_F32 0 /* src0_modifiers */, (EXTRACT_SUBREG $src, sub0), + 0 /* src1_modifiers */, (EXTRACT_SUBREG $src, sub1), + 0 /* src2_modifiers */, (EXTRACT_SUBREG $src, sub2), 0 /* clamp */, 0 /* omod */), sub0, - (V_CUBESC_F32 0 /* src0_modifiers */, (f32 (EXTRACT_SUBREG $src, sub0)), - 0 /* src1_modifiers */,(f32 (EXTRACT_SUBREG $src, sub1)), - 0 /* src2_modifiers */,(f32 (EXTRACT_SUBREG $src, sub2)), + (V_CUBESC_F32 0 /* src0_modifiers */, (EXTRACT_SUBREG $src, sub0), + 0 /* src1_modifiers */,(EXTRACT_SUBREG $src, sub1), + 0 /* src2_modifiers */,(EXTRACT_SUBREG $src, sub2), 0 /* clamp */, 0 /* omod */), sub1, - (V_CUBEMA_F32 0 /* src1_modifiers */,(f32 (EXTRACT_SUBREG $src, sub0)), - 0 /* src1_modifiers */,(f32 (EXTRACT_SUBREG $src, sub1)), - 0 /* src1_modifiers */,(f32 (EXTRACT_SUBREG $src, sub2)), + (V_CUBEMA_F32 0 /* src1_modifiers */,(EXTRACT_SUBREG $src, sub0), + 0 /* src1_modifiers */,(EXTRACT_SUBREG $src, sub1), + 0 /* src1_modifiers */,(EXTRACT_SUBREG $src, sub2), 0 /* clamp */, 0 /* omod */), sub2, - (V_CUBEID_F32 0 /* src1_modifiers */,(f32 (EXTRACT_SUBREG $src, sub0)), - 0 /* src1_modifiers */,(f32 (EXTRACT_SUBREG $src, sub1)), - 0 /* src1_modifiers */,(f32 (EXTRACT_SUBREG $src, sub2)), + (V_CUBEID_F32 0 /* src1_modifiers */,(EXTRACT_SUBREG $src, sub0), + 0 /* src1_modifiers */,(EXTRACT_SUBREG $src, sub1), + 0 /* src1_modifiers */,(EXTRACT_SUBREG $src, sub2), 0 /* clamp */, 0 /* omod */), sub3) >; @@ -701,7 +701,7 @@ def : Ext32Pat ; def : Pat < (AMDGPUurecip i32:$src0), (V_CVT_U32_F32_e32 - (V_MUL_F32_e32 (i32 CONST.FP_UINT_MAX_PLUS_1), + (V_MUL_F32_e32 CONST.FP_UINT_MAX_PLUS_1, (V_RCP_IFLAG_F32_e32 (V_CVT_F32_U32_e32 $src0)))) >; @@ -767,37 +767,32 @@ def : Pat < //===----------------------------------------------------------------------===// def : Pat<(i32 (sext_inreg i32:$src, i1)), - (S_BFE_I32 i32:$src, (i32 65536))>; // 0 | 1 << 16 + (S_BFE_I32 i32:$src, 65536)>; // 0 | 1 << 16 // Handle sext_inreg in i64 def : Pat < (i64 (sext_inreg i64:$src, i1)), - (S_BFE_I64 i64:$src, (i32 0x10000)) // 0 | 1 << 16 ->; - -def : Pat < - (i16 (sext_inreg i16:$src, i8)), - (S_BFE_I32 $src, (i32 0x80000)) // 0 | 8 << 16 + (S_BFE_I64 i64:$src, 0x10000) // 0 | 1 << 16 >; def : Pat < (i64 (sext_inreg i64:$src, i8)), - (S_BFE_I64 i64:$src, (i32 0x80000)) // 0 | 8 << 16 + (S_BFE_I64 i64:$src, 0x80000) // 0 | 8 << 16 >; def : Pat < (i64 (sext_inreg i64:$src, i16)), - (S_BFE_I64 i64:$src, (i32 0x100000)) // 0 | 16 << 16 + (S_BFE_I64 i64:$src, 0x100000) // 0 | 16 << 16 >; def : Pat < (i64 (sext_inreg i64:$src, i32)), - (S_BFE_I64 i64:$src, (i32 0x200000)) // 0 | 32 << 16 + (S_BFE_I64 i64:$src, 0x200000) // 0 | 32 << 16 >; def : Pat < (i64 (zext i32:$src)), - (REG_SEQUENCE SReg_64, $src, sub0, (S_MOV_B32 (i32 0)), sub1) + (REG_SEQUENCE SReg_64, $src, sub0, (S_MOV_B32 0), sub1) >; def : Pat < @@ -809,7 +804,7 @@ class ZExt_i64_i1_Pat : Pat < (i64 (ext i1:$src)), (REG_SEQUENCE VReg_64, (V_CNDMASK_B32_e64 
(i32 0), (i32 1), $src), sub0, - (S_MOV_B32 (i32 0)), sub1) + (S_MOV_B32 0), sub1) >; @@ -821,25 +816,25 @@ def : ZExt_i64_i1_Pat; def : Pat < (i64 (sext i32:$src)), (REG_SEQUENCE SReg_64, $src, sub0, - (i32 (COPY_TO_REGCLASS (S_ASHR_I32 $src, (i32 31)), SReg_32_XM0)), sub1) + (i32 (COPY_TO_REGCLASS (S_ASHR_I32 $src, 31), SReg_32_XM0)), sub1) >; def : Pat < (i64 (sext i1:$src)), (REG_SEQUENCE VReg_64, - (V_CNDMASK_B32_e64 (i32 0), (i32 -1), $src), sub0, - (V_CNDMASK_B32_e64 (i32 0), (i32 -1), $src), sub1) + (V_CNDMASK_B32_e64 0, -1, $src), sub0, + (V_CNDMASK_B32_e64 0, -1, $src), sub1) >; -class FPToI1Pat : Pat < +class FPToI1Pat : Pat < (i1 (fp_to_int (vt (VOP3Mods vt:$src0, i32:$src0_modifiers)))), - (i1 (Inst 0, (kone_type KOne), $src0_modifiers, $src0, DSTCLAMP.NONE, DSTOMOD.NONE)) + (i1 (Inst 0, KOne, $src0_modifiers, $src0, DSTCLAMP.NONE, DSTOMOD.NONE)) >; -def : FPToI1Pat; -def : FPToI1Pat; -def : FPToI1Pat; -def : FPToI1Pat; +def : FPToI1Pat; +def : FPToI1Pat; +def : FPToI1Pat; +def : FPToI1Pat; // If we need to perform a logical operation on i1 values, we need to // use vector comparisons since there is only one SCC register. Vector @@ -864,12 +859,12 @@ def : Pat < def : Pat < (f32 (sint_to_fp i1:$src)), - (V_CNDMASK_B32_e64 (i32 0), (i32 CONST.FP32_NEG_ONE), $src) + (V_CNDMASK_B32_e64 (i32 0), CONST.FP32_NEG_ONE, $src) >; def : Pat < (f32 (uint_to_fp i1:$src)), - (V_CNDMASK_B32_e64 (i32 0), (i32 CONST.FP32_ONE), $src) + (V_CNDMASK_B32_e64 (i32 0), CONST.FP32_ONE, $src) >; def : Pat < @@ -893,20 +888,20 @@ def : Pat < def : Pat < (i1 (trunc i32:$a)), - (V_CMP_EQ_U32_e64 (S_AND_B32 (i32 1), $a), (i32 1)) + (V_CMP_EQ_U32_e64 (S_AND_B32 (i32 1), $a), 1) >; def : Pat < (i1 (trunc i64:$a)), (V_CMP_EQ_U32_e64 (S_AND_B32 (i32 1), - (i32 (EXTRACT_SUBREG $a, sub0))), (i32 1)) + (EXTRACT_SUBREG $a, sub0)), 1) >; def : Pat < (i32 (bswap i32:$a)), - (V_BFI_B32 (S_MOV_B32 (i32 0x00ff00ff)), - (V_ALIGNBIT_B32 $a, $a, (i32 24)), - (V_ALIGNBIT_B32 $a, $a, (i32 8))) + (V_BFI_B32 (S_MOV_B32 0x00ff00ff), + (V_ALIGNBIT_B32 $a, $a, 24), + (V_ALIGNBIT_B32 $a, $a, 8)) >; def : Pat < @@ -922,7 +917,7 @@ multiclass BFMPatterns { def : Pat < (vt (add (vt (shl 1, vt:$a)), -1)), - (BFM $a, (MOV (i32 0))) + (BFM $a, (MOV 0)) >; } @@ -933,7 +928,7 @@ def : BFEPattern ; def : Pat< (fcanonicalize f32:$src), - (V_MUL_F32_e64 0, (i32 CONST.FP32_ONE), 0, $src, 0, 0) + (V_MUL_F32_e64 0, CONST.FP32_ONE, 0, $src, 0, 0) >; def : Pat< @@ -968,7 +963,7 @@ def : Pat < (V_MOV_B64_PSEUDO 0x3fefffffffffffff), DSTCLAMP.NONE, DSTOMOD.NONE), $x, - (V_CMP_CLASS_F64_e64 SRCMODS.NONE, $x, (i32 3 /*NaN*/))), + (V_CMP_CLASS_F64_e64 SRCMODS.NONE, $x, 3/*NaN*/)), DSTCLAMP.NONE, DSTOMOD.NONE) >; diff --git a/lib/Target/AMDGPU/SIRegisterInfo.td b/lib/Target/AMDGPU/SIRegisterInfo.td index a5ba0ef7e0e..7d3634ef2d1 100644 --- a/lib/Target/AMDGPU/SIRegisterInfo.td +++ b/lib/Target/AMDGPU/SIRegisterInfo.td @@ -123,7 +123,7 @@ def SCC_CLASS : RegisterClass<"AMDGPU", [i1], 1, (add SCC)> { // TODO: Do we need to set DwarfRegAlias on register tuples? 
// SGPR 32-bit registers -def SGPR_32 : RegisterClass<"AMDGPU", [i32, f32, i16], 32, +def SGPR_32 : RegisterClass<"AMDGPU", [i32, f32], 32, (add (sequence "SGPR%u", 0, 103))> { let AllocationPriority = 1; } @@ -190,8 +190,7 @@ def TTMP_128Regs : RegisterTuples<[sub0, sub1, sub2, sub3], (add (decimate (shl TTMP_32, 3), 4))]>; // VGPR 32-bit registers -// i16 only on VI+ -def VGPR_32 : RegisterClass<"AMDGPU", [i32, f32, i16], 32, +def VGPR_32 : RegisterClass<"AMDGPU", [i32, f32], 32, (add (sequence "VGPR%u", 0, 255))> { let AllocationPriority = 1; let Size = 32; @@ -259,8 +258,8 @@ def SReg_32_XM0 : RegisterClass<"AMDGPU", [i32, f32], 32, } // Register class for all scalar registers (SGPRs + Special Registers) -def SReg_32 : RegisterClass<"AMDGPU", [i32, f32, i16], 32, - (add SReg_32_XM0, M0, VCC_LO, VCC_HI, EXEC_LO, EXEC_HI, FLAT_SCR_LO, FLAT_SCR_HI)> { +def SReg_32 : RegisterClass<"AMDGPU", [i32, f32], 32, + (add SReg_32_XM0, M0)> { let AllocationPriority = 1; } @@ -347,7 +346,7 @@ def VReg_1 : RegisterClass<"AMDGPU", [i1], 32, (add VGPR_32)> { let Size = 32; } -def VS_32 : RegisterClass<"AMDGPU", [i32, f32, i16], 32, (add VGPR_32, SReg_32)> { +def VS_32 : RegisterClass<"AMDGPU", [i32, f32], 32, (add VGPR_32, SReg_32)> { let isAllocatable = 0; } diff --git a/lib/Target/AMDGPU/SOPInstructions.td b/lib/Target/AMDGPU/SOPInstructions.td index 2486fbf3edf..e38a11db9ac 100644 --- a/lib/Target/AMDGPU/SOPInstructions.td +++ b/lib/Target/AMDGPU/SOPInstructions.td @@ -879,7 +879,7 @@ def : Pat < (i64 (ctpop i64:$src)), (i64 (REG_SEQUENCE SReg_64, (i32 (COPY_TO_REGCLASS (S_BCNT1_I32_B64 $src), SReg_32)), sub0, - (S_MOV_B32 (i32 0)), sub1)) + (S_MOV_B32 0), sub1)) >; def : Pat < @@ -887,18 +887,6 @@ def : Pat < (S_ABS_I32 $x) >; -def : Pat < - (i16 imm:$imm), - (S_MOV_B32 imm:$imm) ->; - -// Same as a 32-bit inreg -def : Pat< - (i32 (sext i16:$src)), - (S_SEXT_I32_I16 $src) ->; - - //===----------------------------------------------------------------------===// // SOP2 Patterns //===----------------------------------------------------------------------===// @@ -910,29 +898,6 @@ def : Pat < (S_ADD_U32 $src0, $src1) >; -// FIXME: We need to use COPY_TO_REGCLASS to work-around the fact that -// REG_SEQUENCE patterns don't support instructions with multiple -// outputs. -def : Pat< - (i64 (zext i16:$src)), - (REG_SEQUENCE SReg_64, - (i32 (COPY_TO_REGCLASS (S_AND_B32 $src, (S_MOV_B32 (i32 0xffff))), SGPR_32)), sub0, - (S_MOV_B32 (i32 0)), sub1) ->; - -def : Pat < - (i64 (sext i16:$src)), - (REG_SEQUENCE SReg_64, (i32 (S_SEXT_I32_I16 $src)), sub0, - (i32 (COPY_TO_REGCLASS (S_ASHR_I32 (i32 (S_SEXT_I32_I16 $src)), (S_MOV_B32 (i32 31))), SGPR_32)), sub1) ->; - -def : Pat< - (i32 (zext i16:$src)), - (S_AND_B32 (S_MOV_B32 (i32 0xffff)), $src) ->; - - - //===----------------------------------------------------------------------===// // SOPP Patterns //===----------------------------------------------------------------------===// diff --git a/lib/Target/AMDGPU/VIInstructions.td b/lib/Target/AMDGPU/VIInstructions.td index b45c8fc9c7d..ead90ece0ad 100644 --- a/lib/Target/AMDGPU/VIInstructions.td +++ b/lib/Target/AMDGPU/VIInstructions.td @@ -8,7 +8,3 @@ //===----------------------------------------------------------------------===// // Instruction definitions for VI and newer. //===----------------------------------------------------------------------===// - -FIXME: Deleting this file broke buildbots that don't do full rebuilds. 
This -file is no longer used by the backend, so it can be deleted once all -the buildbots update their dependencies. diff --git a/lib/Target/AMDGPU/VOP1Instructions.td b/lib/Target/AMDGPU/VOP1Instructions.td index b2840982462..6124d4e05da 100644 --- a/lib/Target/AMDGPU/VOP1Instructions.td +++ b/lib/Target/AMDGPU/VOP1Instructions.td @@ -301,20 +301,6 @@ defm V_COS_F16 : VOP1Inst <"v_cos_f16", VOP_F16_F16>; } -let Predicates = [isVI] in { - -def : Pat< - (f32 (f16_to_fp i16:$src)), - (V_CVT_F32_F16_e32 $src) ->; - -def : Pat< - (i16 (fp_to_f16 f32:$src)), - (V_CVT_F16_F32_e32 $src) ->; - -} - //===----------------------------------------------------------------------===// // Target //===----------------------------------------------------------------------===// @@ -575,39 +561,10 @@ def V_MOVRELD_B32_V16 : V_MOVRELD_B32_pseudo; let Predicates = [isVI] in { def : Pat < - (i32 (int_amdgcn_mov_dpp i32:$src, imm:$dpp_ctrl, imm:$row_mask, imm:$bank_mask, - imm:$bound_ctrl)), + (int_amdgcn_mov_dpp i32:$src, imm:$dpp_ctrl, imm:$row_mask, imm:$bank_mask, + imm:$bound_ctrl), (V_MOV_B32_dpp $src, (as_i32imm $dpp_ctrl), (as_i32imm $row_mask), (as_i32imm $bank_mask), (as_i1imm $bound_ctrl)) >; - -def : Pat< - (i32 (anyext i16:$src)), - (COPY $src) ->; - -def : Pat< - (i64 (anyext i16:$src)), - (REG_SEQUENCE VReg_64, - (i32 (COPY $src)), sub0, - (V_MOV_B32_e32 (i32 0)), sub1) ->; - -def : Pat< - (i16 (trunc i32:$src)), - (COPY $src) ->; - -def : Pat< - (i1 (trunc i16:$src)), - (COPY $src) ->; - - -def : Pat < - (i16 (trunc i64:$src)), - (EXTRACT_SUBREG $src, sub0) ->; - } // End Predicates = [isVI] diff --git a/lib/Target/AMDGPU/VOP2Instructions.td b/lib/Target/AMDGPU/VOP2Instructions.td index 570ca05587b..fc13382926d 100644 --- a/lib/Target/AMDGPU/VOP2Instructions.td +++ b/lib/Target/AMDGPU/VOP2Instructions.td @@ -345,78 +345,6 @@ defm V_MIN_I16 : VOP2Inst <"v_min_i16", VOP_I16_I16_I16>; } // End SubtargetPredicate = isVI -// Note: 16-bit instructions produce a 0 result in the high 16-bits.
-multiclass Arithmetic_i16_Pats { - -def : Pat< - (op i16:$src0, i16:$src1), - (inst $src0, $src1) ->; - -def : Pat< - (i32 (zext (op i16:$src0, i16:$src1))), - (inst $src0, $src1) ->; - -def : Pat< - (i64 (zext (op i16:$src0, i16:$src1))), - (REG_SEQUENCE VReg_64, - (inst $src0, $src1), sub0, - (V_MOV_B32_e32 (i32 0)), sub1) ->; - -} - -multiclass Bits_OpsRev_i16_Pats { - -def : Pat< - (op i16:$src0, i32:$src1), - (inst $src1, $src0) ->; - -def : Pat< - (i32 (zext (op i16:$src0, i32:$src1))), - (inst $src1, $src0) ->; - - -def : Pat< - (i64 (zext (op i16:$src0, i32:$src1))), - (REG_SEQUENCE VReg_64, - (inst $src1, $src0), sub0, - (V_MOV_B32_e32 (i32 0)), sub1) ->; -} - -class ZExt_i16_i1_Pat : Pat < - (i16 (ext i1:$src)), - (V_CNDMASK_B32_e64 (i32 0), (i32 1), $src) ->; - -let Predicates = [isVI] in { - -defm : Arithmetic_i16_Pats; -defm : Arithmetic_i16_Pats; -defm : Arithmetic_i16_Pats; -defm : Arithmetic_i16_Pats; -defm : Arithmetic_i16_Pats; -defm : Arithmetic_i16_Pats; -defm : Arithmetic_i16_Pats; - -defm : Arithmetic_i16_Pats; -defm : Arithmetic_i16_Pats; -defm : Arithmetic_i16_Pats; - -defm : Bits_OpsRev_i16_Pats; -defm : Bits_OpsRev_i16_Pats; -defm : Bits_OpsRev_i16_Pats; - -def : ZExt_i16_i1_Pat; -def : ZExt_i16_i1_Pat; -def : ZExt_i16_i1_Pat; - -} // End Predicates = [isVI] - //===----------------------------------------------------------------------===// // SI //===----------------------------------------------------------------------===// diff --git a/lib/Target/AMDGPU/VOP3Instructions.td b/lib/Target/AMDGPU/VOP3Instructions.td index 73e331503ad..0f063756de5 100644 --- a/lib/Target/AMDGPU/VOP3Instructions.td +++ b/lib/Target/AMDGPU/VOP3Instructions.td @@ -222,38 +222,6 @@ let isCommutable = 1 in { } // End SubtargetPredicate = isVI -def : Pat < - (i16 (select i1:$src0, i16:$src1, i16:$src2)), - (V_CNDMASK_B32_e64 $src2, $src1, $src0) ->; - -let Predicates = [isVI] in { - -multiclass Tenary_i16_Pats { -def : Pat< - (op2 (op1 i16:$src0, i16:$src1), i16:$src2), - (inst i16:$src0, i16:$src1, i16:$src2) ->; - -def : Pat< - (i32 (op3 (op2 (op1 i16:$src0, i16:$src1), i16:$src2))), - (inst i16:$src0, i16:$src1, i16:$src2) ->; - -def : Pat< - (i64 (op3 (op2 (op1 i16:$src0, i16:$src1), i16:$src2))), - (REG_SEQUENCE VReg_64, - (inst i16:$src0, i16:$src1, i16:$src2), sub0, - (V_MOV_B32_e32 (i32 0)), sub1) ->; -} - -defm: Tenary_i16_Pats; -defm: Tenary_i16_Pats; - -} // End Predicates = [isVI] - //===----------------------------------------------------------------------===// // Target diff --git a/test/CodeGen/AMDGPU/add.i16.ll b/test/CodeGen/AMDGPU/add.i16.ll deleted file mode 100644 index 3c7a2c1f897..00000000000 --- a/test/CodeGen/AMDGPU/add.i16.ll +++ /dev/null @@ -1,149 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=GCN %s - -; FIXME: Need to handle non-uniform case for function below (load without gep). 
-; GCN-LABEL: {{^}}v_test_add_i16: -; VI: flat_load_ushort [[A:v[0-9]+]] -; VI: flat_load_ushort [[B:v[0-9]+]] -; VI: v_add_u16_e32 [[ADD:v[0-9]+]], [[A]], [[B]] -; VI-NEXT: buffer_store_short [[ADD]] -define void @v_test_add_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %in0, i16 addrspace(1)* %in1) #1 { - %tid = call i32 @llvm.amdgcn.workitem.id.x() - %gep.out = getelementptr inbounds i16, i16 addrspace(1)* %out, i32 %tid - %gep.in0 = getelementptr inbounds i16, i16 addrspace(1)* %in0, i32 %tid - %gep.in1 = getelementptr inbounds i16, i16 addrspace(1)* %in1, i32 %tid - %a = load volatile i16, i16 addrspace(1)* %gep.in0 - %b = load volatile i16, i16 addrspace(1)* %gep.in1 - %add = add i16 %a, %b - store i16 %add, i16 addrspace(1)* %out - ret void -} - -; FIXME: Need to handle non-uniform case for function below (load without gep). -; GCN-LABEL: {{^}}v_test_add_i16_constant: -; VI: flat_load_ushort [[A:v[0-9]+]] -; VI: v_add_u16_e32 [[ADD:v[0-9]+]], 0x7b, [[A]] -; VI-NEXT: buffer_store_short [[ADD]] -define void @v_test_add_i16_constant(i16 addrspace(1)* %out, i16 addrspace(1)* %in0) #1 { - %tid = call i32 @llvm.amdgcn.workitem.id.x() - %gep.out = getelementptr inbounds i16, i16 addrspace(1)* %out, i32 %tid - %gep.in0 = getelementptr inbounds i16, i16 addrspace(1)* %in0, i32 %tid - %a = load volatile i16, i16 addrspace(1)* %gep.in0 - %add = add i16 %a, 123 - store i16 %add, i16 addrspace(1)* %out - ret void -} - -; FIXME: Need to handle non-uniform case for function below (load without gep). -; GCN-LABEL: {{^}}v_test_add_i16_neg_constant: -; VI: flat_load_ushort [[A:v[0-9]+]] -; VI: v_add_u16_e32 [[ADD:v[0-9]+]], 0xfffffcb3, [[A]] -; VI-NEXT: buffer_store_short [[ADD]] -define void @v_test_add_i16_neg_constant(i16 addrspace(1)* %out, i16 addrspace(1)* %in0) #1 { - %tid = call i32 @llvm.amdgcn.workitem.id.x() - %gep.out = getelementptr inbounds i16, i16 addrspace(1)* %out, i32 %tid - %gep.in0 = getelementptr inbounds i16, i16 addrspace(1)* %in0, i32 %tid - %a = load volatile i16, i16 addrspace(1)* %gep.in0 - %add = add i16 %a, -845 - store i16 %add, i16 addrspace(1)* %out - ret void -} - -; FIXME: Need to handle non-uniform case for function below (load without gep). -; GCN-LABEL: {{^}}v_test_add_i16_inline_neg1: -; VI: flat_load_ushort [[A:v[0-9]+]] -; VI: v_add_u16_e32 [[ADD:v[0-9]+]], -1, [[A]] -; VI-NEXT: buffer_store_short [[ADD]] -define void @v_test_add_i16_inline_neg1(i16 addrspace(1)* %out, i16 addrspace(1)* %in0) #1 { - %tid = call i32 @llvm.amdgcn.workitem.id.x() - %gep.out = getelementptr inbounds i16, i16 addrspace(1)* %out, i32 %tid - %gep.in0 = getelementptr inbounds i16, i16 addrspace(1)* %in0, i32 %tid - %a = load volatile i16, i16 addrspace(1)* %gep.in0 - %add = add i16 %a, -1 - store i16 %add, i16 addrspace(1)* %out - ret void -} - -; FIXME: Need to handle non-uniform case for function below (load without gep). 
-; GCN-LABEL: {{^}}v_test_add_i16_zext_to_i32: -; VI: flat_load_ushort [[A:v[0-9]+]] -; VI: flat_load_ushort [[B:v[0-9]+]] -; VI: v_add_u16_e32 [[ADD:v[0-9]+]], [[A]], [[B]] -; VI-NEXT: buffer_store_dword [[ADD]] -define void @v_test_add_i16_zext_to_i32(i32 addrspace(1)* %out, i16 addrspace(1)* %in0, i16 addrspace(1)* %in1) #1 { - %tid = call i32 @llvm.amdgcn.workitem.id.x() - %gep.out = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 %tid - %gep.in0 = getelementptr inbounds i16, i16 addrspace(1)* %in0, i32 %tid - %gep.in1 = getelementptr inbounds i16, i16 addrspace(1)* %in1, i32 %tid - %a = load volatile i16, i16 addrspace(1)* %gep.in0 - %b = load volatile i16, i16 addrspace(1)* %gep.in1 - %add = add i16 %a, %b - %ext = zext i16 %add to i32 - store i32 %ext, i32 addrspace(1)* %out - ret void -} - -; FIXME: Need to handle non-uniform case for function below (load without gep). -; GCN-LABEL: {{^}}v_test_add_i16_zext_to_i64: -; VI: flat_load_ushort [[A:v[0-9]+]] -; VI: flat_load_ushort [[B:v[0-9]+]] -; VI-DAG: v_add_u16_e32 v[[ADD:[0-9]+]], [[A]], [[B]] -; VI-DAG: v_mov_b32_e32 v[[VZERO:[0-9]+]], 0 -; VI: buffer_store_dwordx2 v{{\[}}[[ADD]]:[[VZERO]]{{\]}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0{{$}} -define void @v_test_add_i16_zext_to_i64(i64 addrspace(1)* %out, i16 addrspace(1)* %in0, i16 addrspace(1)* %in1) #1 { - %tid = call i32 @llvm.amdgcn.workitem.id.x() - %gep.out = getelementptr inbounds i64, i64 addrspace(1)* %out, i32 %tid - %gep.in0 = getelementptr inbounds i16, i16 addrspace(1)* %in0, i32 %tid - %gep.in1 = getelementptr inbounds i16, i16 addrspace(1)* %in1, i32 %tid - %a = load volatile i16, i16 addrspace(1)* %gep.in0 - %b = load volatile i16, i16 addrspace(1)* %gep.in1 - %add = add i16 %a, %b - %ext = zext i16 %add to i64 - store i64 %ext, i64 addrspace(1)* %out - ret void -} - -; FIXME: Need to handle non-uniform case for function below (load without gep). -; GCN-LABEL: {{^}}v_test_add_i16_sext_to_i32: -; VI: flat_load_ushort [[A:v[0-9]+]] -; VI: flat_load_ushort [[B:v[0-9]+]] -; VI: v_add_u16_e32 [[ADD:v[0-9]+]], [[A]], [[B]] -; VI-NEXT: v_bfe_i32 [[SEXT:v[0-9]+]], [[ADD]], 0, 16 -; VI-NEXT: buffer_store_dword [[SEXT]] -define void @v_test_add_i16_sext_to_i32(i32 addrspace(1)* %out, i16 addrspace(1)* %in0, i16 addrspace(1)* %in1) #1 { - %tid = call i32 @llvm.amdgcn.workitem.id.x() - %gep.out = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 %tid - %gep.in0 = getelementptr inbounds i16, i16 addrspace(1)* %in0, i32 %tid - %gep.in1 = getelementptr inbounds i16, i16 addrspace(1)* %in1, i32 %tid - %a = load i16, i16 addrspace(1)* %gep.in0 - %b = load i16, i16 addrspace(1)* %gep.in1 - %add = add i16 %a, %b - %ext = sext i16 %add to i32 - store i32 %ext, i32 addrspace(1)* %out - ret void -} - -; FIXME: Need to handle non-uniform case for function below (load without gep). 
-; GCN-LABEL: {{^}}v_test_add_i16_sext_to_i64: -; VI: flat_load_ushort [[A:v[0-9]+]] -; VI: flat_load_ushort [[B:v[0-9]+]] -; VI: v_add_u16_e32 [[ADD:v[0-9]+]], [[A]], [[B]] -; VI-NEXT: v_bfe_i32 v[[LO:[0-9]+]], [[ADD]], 0, 16 -; VI-NEXT: v_ashrrev_i32_e32 v[[HI:[0-9]+]], 31, v[[LO]] -; VI-NEXT: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}} -define void @v_test_add_i16_sext_to_i64(i64 addrspace(1)* %out, i16 addrspace(1)* %in0, i16 addrspace(1)* %in1) #1 { - %tid = call i32 @llvm.amdgcn.workitem.id.x() - %gep.out = getelementptr inbounds i64, i64 addrspace(1)* %out, i32 %tid - %gep.in0 = getelementptr inbounds i16, i16 addrspace(1)* %in0, i32 %tid - %gep.in1 = getelementptr inbounds i16, i16 addrspace(1)* %in1, i32 %tid - %a = load i16, i16 addrspace(1)* %gep.in0 - %b = load i16, i16 addrspace(1)* %gep.in1 - %add = add i16 %a, %b - %ext = sext i16 %add to i64 - store i64 %ext, i64 addrspace(1)* %out - ret void -} - -declare i32 @llvm.amdgcn.workitem.id.x() #0 - -attributes #0 = { nounwind readnone } -attributes #1 = { nounwind } diff --git a/test/CodeGen/AMDGPU/anyext.ll b/test/CodeGen/AMDGPU/anyext.ll index a1d3715a095..48d8f312249 100644 --- a/test/CodeGen/AMDGPU/anyext.ll +++ b/test/CodeGen/AMDGPU/anyext.ll @@ -1,40 +1,15 @@ -; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s +; RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck %s +; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s -declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone -declare i32 @llvm.amdgcn.workitem.id.y() nounwind readnone - -; GCN-LABEL: {{^}}anyext_i1_i32: -; GCN: v_cndmask_b32_e64 +; CHECK-LABEL: {{^}}anyext_i1_i32: +; CHECK: v_cndmask_b32_e64 define void @anyext_i1_i32(i32 addrspace(1)* %out, i32 %cond) { entry: - %tmp = icmp eq i32 %cond, 0 - %tmp1 = zext i1 %tmp to i8 - %tmp2 = xor i8 %tmp1, -1 - %tmp3 = and i8 %tmp2, 1 - %tmp4 = zext i8 %tmp3 to i32 - store i32 %tmp4, i32 addrspace(1)* %out - ret void -} - -; GCN-LABEL: {{^}}s_anyext_i16_i32: -; VI: v_add_u16_e32 [[ADD:v[0-9]+]], -; VI: v_xor_b32_e32 [[XOR:v[0-9]+]], -1, [[ADD]] -; VI: v_and_b32_e32 [[AND:v[0-9]+]], 1, [[XOR]] -; VI: buffer_store_dword [[AND]] -define void @s_anyext_i16_i32(i32 addrspace(1)* %out, i16 addrspace(1)* %a, i16 addrspace(1)* %b) { -entry: - %tid.x = call i32 @llvm.amdgcn.workitem.id.x() - %tid.y = call i32 @llvm.amdgcn.workitem.id.y() - %a.ptr = getelementptr i16, i16 addrspace(1)* %a, i32 %tid.x - %b.ptr = getelementptr i16, i16 addrspace(1)* %b, i32 %tid.y - %a.l = load i16, i16 addrspace(1)* %a.ptr - %b.l = load i16, i16 addrspace(1)* %b.ptr - %tmp = add i16 %a.l, %b.l - %tmp1 = trunc i16 %tmp to i8 - %tmp2 = xor i8 %tmp1, -1 - %tmp3 = and i8 %tmp2, 1 - %tmp4 = zext i8 %tmp3 to i32 - store i32 %tmp4, i32 addrspace(1)* %out + %0 = icmp eq i32 %cond, 0 + %1 = zext i1 %0 to i8 + %2 = xor i8 %1, -1 + %3 = and i8 %2, 1 + %4 = zext i8 %3 to i32 + store i32 %4, i32 addrspace(1)* %out ret void } diff --git a/test/CodeGen/AMDGPU/bitreverse.ll b/test/CodeGen/AMDGPU/bitreverse.ll index aca88a9ef2a..0acaceaa6fe 100644 --- a/test/CodeGen/AMDGPU/bitreverse.ll +++ b/test/CodeGen/AMDGPU/bitreverse.ll @@ -1,6 +1,5 @@ ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s ; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI 
-check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=FUNC %s declare i16 @llvm.bitreverse.i16(i16) #1 declare i32 @llvm.bitreverse.i32(i32) #1 @@ -13,7 +12,7 @@ declare <2 x i64> @llvm.bitreverse.v2i64(<2 x i64>) #1 declare <4 x i64> @llvm.bitreverse.v4i64(<4 x i64>) #1 ; FUNC-LABEL: {{^}}s_brev_i16: -; SI: s_brev_b32 +; SI: s_brev_b32 define void @s_brev_i16(i16 addrspace(1)* noalias %out, i16 %val) #0 { %brev = call i16 @llvm.bitreverse.i16(i16 %val) #1 store i16 %brev, i16 addrspace(1)* %out diff --git a/test/CodeGen/AMDGPU/cgp-bitfield-extract.ll b/test/CodeGen/AMDGPU/cgp-bitfield-extract.ll index baf6eb140bc..33daf0292ae 100644 --- a/test/CodeGen/AMDGPU/cgp-bitfield-extract.ll +++ b/test/CodeGen/AMDGPU/cgp-bitfield-extract.ll @@ -116,19 +116,14 @@ ret: ; OPT: store ; OPT: ret -; For GFX8: since i16 is legal type, we cannot sink lshr into BBs. ; GCN-LABEL: {{^}}sink_ubfe_i16: ; GCN-NOT: lshr -; VI: s_bfe_u32 s0, s0, 0xc0004 ; GCN: s_cbranch_vccnz -; SI: s_bfe_u32 s{{[0-9]+}}, s{{[0-9]+}}, 0x80004 -; VI: s_and_b32 s0, s0, 0xff - +; GCN: s_bfe_u32 s{{[0-9]+}}, s{{[0-9]+}}, 0x80004 ; GCN: BB2_2: -; SI: s_bfe_u32 s{{[0-9]+}}, s{{[0-9]+}}, 0x70004 -; VI: s_and_b32 s0, s0, 0x7f +; GCN: s_bfe_u32 s{{[0-9]+}}, s{{[0-9]+}}, 0x70004 ; GCN: BB2_3: ; GCN: buffer_store_short diff --git a/test/CodeGen/AMDGPU/copy-illegal-type.ll b/test/CodeGen/AMDGPU/copy-illegal-type.ll index 6918dff74d5..00d2257f4ad 100644 --- a/test/CodeGen/AMDGPU/copy-illegal-type.ll +++ b/test/CodeGen/AMDGPU/copy-illegal-type.ll @@ -1,13 +1,10 @@ -; RUN: llc -march=amdgcn -mcpu=tahiti < %s | FileCheck -check-prefix=GCN -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=FUNC %s - -declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone -declare i32 @llvm.amdgcn.workitem.id.y() nounwind readnone +; RUN: llc -march=amdgcn -mcpu=tahiti < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s ; FUNC-LABEL: {{^}}test_copy_v4i8: -; GCN: buffer_load_dword [[REG:v[0-9]+]] -; GCN: buffer_store_dword [[REG]] -; GCN: s_endpgm +; SI: buffer_load_dword [[REG:v[0-9]+]] +; SI: buffer_store_dword [[REG]] +; SI: s_endpgm define void @test_copy_v4i8(<4 x i8> addrspace(1)* %out, <4 x i8> addrspace(1)* %in) nounwind { %val = load <4 x i8>, <4 x i8> addrspace(1)* %in, align 4 store <4 x i8> %val, <4 x i8> addrspace(1)* %out, align 4 @@ -15,10 +12,10 @@ define void @test_copy_v4i8(<4 x i8> addrspace(1)* %out, <4 x i8> addrspace(1)* } ; FUNC-LABEL: {{^}}test_copy_v4i8_x2: -; GCN: buffer_load_dword [[REG:v[0-9]+]] -; GCN: buffer_store_dword [[REG]] -; GCN: buffer_store_dword [[REG]] -; GCN: s_endpgm +; SI: buffer_load_dword [[REG:v[0-9]+]] +; SI: buffer_store_dword [[REG]] +; SI: buffer_store_dword [[REG]] +; SI: s_endpgm define void @test_copy_v4i8_x2(<4 x i8> addrspace(1)* %out0, <4 x i8> addrspace(1)* %out1, <4 x i8> addrspace(1)* %in) nounwind { %val = load <4 x i8>, <4 x i8> addrspace(1)* %in, align 4 store <4 x i8> %val, <4 x i8> addrspace(1)* %out0, align 4 @@ -27,11 +24,11 @@ define void @test_copy_v4i8_x2(<4 x i8> addrspace(1)* %out0, <4 x i8> addrspace( } ; FUNC-LABEL: {{^}}test_copy_v4i8_x3: -; GCN: buffer_load_dword [[REG:v[0-9]+]] -; GCN: buffer_store_dword [[REG]] -; GCN: buffer_store_dword [[REG]] -; GCN: buffer_store_dword [[REG]] -; GCN: s_endpgm +; SI: buffer_load_dword 
[[REG:v[0-9]+]] +; SI: buffer_store_dword [[REG]] +; SI: buffer_store_dword [[REG]] +; SI: buffer_store_dword [[REG]] +; SI: s_endpgm define void @test_copy_v4i8_x3(<4 x i8> addrspace(1)* %out0, <4 x i8> addrspace(1)* %out1, <4 x i8> addrspace(1)* %out2, <4 x i8> addrspace(1)* %in) nounwind { %val = load <4 x i8>, <4 x i8> addrspace(1)* %in, align 4 store <4 x i8> %val, <4 x i8> addrspace(1)* %out0, align 4 @@ -41,12 +38,12 @@ define void @test_copy_v4i8_x3(<4 x i8> addrspace(1)* %out0, <4 x i8> addrspace( } ; FUNC-LABEL: {{^}}test_copy_v4i8_x4: -; GCN: buffer_load_dword [[REG:v[0-9]+]] -; GCN: buffer_store_dword [[REG]] -; GCN: buffer_store_dword [[REG]] -; GCN: buffer_store_dword [[REG]] -; GCN: buffer_store_dword [[REG]] -; GCN: s_endpgm +; SI: buffer_load_dword [[REG:v[0-9]+]] +; SI: buffer_store_dword [[REG]] +; SI: buffer_store_dword [[REG]] +; SI: buffer_store_dword [[REG]] +; SI: buffer_store_dword [[REG]] +; SI: s_endpgm define void @test_copy_v4i8_x4(<4 x i8> addrspace(1)* %out0, <4 x i8> addrspace(1)* %out1, <4 x i8> addrspace(1)* %out2, <4 x i8> addrspace(1)* %out3, <4 x i8> addrspace(1)* %in) nounwind { %val = load <4 x i8>, <4 x i8> addrspace(1)* %in, align 4 store <4 x i8> %val, <4 x i8> addrspace(1)* %out0, align 4 @@ -57,14 +54,14 @@ define void @test_copy_v4i8_x4(<4 x i8> addrspace(1)* %out0, <4 x i8> addrspace( } ; FUNC-LABEL: {{^}}test_copy_v4i8_extra_use: -; GCN: buffer_load_dword -; GCN-DAG: v_lshrrev_b32 -; GCN: v_and_b32 -; GCN: v_or_b32 -; GCN-DAG: buffer_store_dword -; GCN-DAG: buffer_store_dword +; SI: buffer_load_dword +; SI-DAG: v_lshrrev_b32 +; SI: v_and_b32 +; SI: v_or_b32 +; SI-DAG: buffer_store_dword +; SI-DAG: buffer_store_dword -; GCN: s_endpgm +; SI: s_endpgm define void @test_copy_v4i8_extra_use(<4 x i8> addrspace(1)* %out0, <4 x i8> addrspace(1)* %out1, <4 x i8> addrspace(1)* %in) nounwind { %val = load <4 x i8>, <4 x i8> addrspace(1)* %in, align 4 %add = add <4 x i8> %val, @@ -73,22 +70,18 @@ define void @test_copy_v4i8_extra_use(<4 x i8> addrspace(1)* %out0, <4 x i8> add ret void } -; FIXME: Need to handle non-uniform case for function below (load without gep). 
; FUNC-LABEL: {{^}}test_copy_v4i8_x2_extra_use: -; GCN: {{buffer|flat}}_load_dword -; GCN-DAG: v_lshrrev_b32 +; SI: buffer_load_dword +; SI-DAG: v_lshrrev_b32 ; SI-DAG: v_add_i32 -; VI-DAG: v_add_u16 -; GCN-DAG: v_and_b32 -; GCN-DAG: v_or_b32 -; GCN-DAG: {{buffer|flat}}_store_dword -; GCN: {{buffer|flat}}_store_dword -; GCN: {{buffer|flat}}_store_dword -; GCN: s_endpgm +; SI-DAG: v_and_b32 +; SI-DAG: v_or_b32 +; SI-DAG: buffer_store_dword +; SI: buffer_store_dword +; SI: buffer_store_dword +; SI: s_endpgm define void @test_copy_v4i8_x2_extra_use(<4 x i8> addrspace(1)* %out0, <4 x i8> addrspace(1)* %out1, <4 x i8> addrspace(1)* %out2, <4 x i8> addrspace(1)* %in) nounwind { - %tid.x = call i32 @llvm.amdgcn.workitem.id.x() - %in.ptr = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %in, i32 %tid.x - %val = load <4 x i8>, <4 x i8> addrspace(1)* %in.ptr, align 4 + %val = load <4 x i8>, <4 x i8> addrspace(1)* %in, align 4 %add = add <4 x i8> %val, store <4 x i8> %val, <4 x i8> addrspace(1)* %out0, align 4 store <4 x i8> %add, <4 x i8> addrspace(1)* %out1, align 4 @@ -97,10 +90,10 @@ define void @test_copy_v4i8_x2_extra_use(<4 x i8> addrspace(1)* %out0, <4 x i8> } ; FUNC-LABEL: {{^}}test_copy_v3i8_align4: -; GCN: buffer_load_dword -; GCN-DAG: buffer_store_short v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}} -; GCN-DAG: buffer_store_byte v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:2{{$}} -; GCN: s_endpgm +; SI: buffer_load_dword +; SI-DAG: buffer_store_short v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}} +; SI-DAG: buffer_store_byte v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:2{{$}} +; SI: s_endpgm define void @test_copy_v3i8_align4(<3 x i8> addrspace(1)* %out, <3 x i8> addrspace(1)* %in) nounwind { %val = load <3 x i8>, <3 x i8> addrspace(1)* %in, align 4 store <3 x i8> %val, <3 x i8> addrspace(1)* %out, align 4 @@ -108,11 +101,11 @@ define void @test_copy_v3i8_align4(<3 x i8> addrspace(1)* %out, <3 x i8> addrspa } ; FUNC-LABEL: {{^}}test_copy_v3i8_align2: -; GCN-DAG: buffer_load_ushort v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}} -; GCN-DAG: buffer_load_ubyte v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:2{{$}} -; GCN-DAG: buffer_store_short v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}} -; GCN-DAG: buffer_store_byte v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:2{{$}} -; GCN: s_endpgm +; SI-DAG: buffer_load_ushort v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}} +; SI-DAG: buffer_load_ubyte v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:2{{$}} +; SI-DAG: buffer_store_short v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}} +; SI-DAG: buffer_store_byte v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:2{{$}} +; SI: s_endpgm define void @test_copy_v3i8_align2(<3 x i8> addrspace(1)* %out, <3 x i8> addrspace(1)* %in) nounwind { %val = load <3 x i8>, <3 x i8> addrspace(1)* %in, align 2 store <3 x i8> %val, <3 x i8> addrspace(1)* %out, align 2 @@ -120,14 +113,14 @@ define void @test_copy_v3i8_align2(<3 x i8> addrspace(1)* %out, <3 x i8> addrspa } ; FUNC-LABEL: {{^}}test_copy_v3i8_align1: -; GCN: buffer_load_ubyte -; GCN: buffer_load_ubyte -; GCN: buffer_load_ubyte +; SI: buffer_load_ubyte +; SI: buffer_load_ubyte +; SI: buffer_load_ubyte -; GCN: buffer_store_byte -; GCN: buffer_store_byte -; GCN: buffer_store_byte -; GCN: s_endpgm +; SI: buffer_store_byte +; SI: buffer_store_byte +; SI: buffer_store_byte +; SI: s_endpgm define void @test_copy_v3i8_align1(<3 x i8> addrspace(1)* %out, <3 x i8> addrspace(1)* %in) nounwind { %val = load <3 x i8>, <3 x i8> addrspace(1)* 
%in, align 1 store <3 x i8> %val, <3 x i8> addrspace(1)* %out, align 1 @@ -135,12 +128,12 @@ define void @test_copy_v3i8_align1(<3 x i8> addrspace(1)* %out, <3 x i8> addrspa } ; FUNC-LABEL: {{^}}test_copy_v4i8_volatile_load: -; GCN: buffer_load_ubyte -; GCN: buffer_load_ubyte -; GCN: buffer_load_ubyte -; GCN: buffer_load_ubyte -; GCN: buffer_store_dword -; GCN: s_endpgm +; SI: buffer_load_ubyte +; SI: buffer_load_ubyte +; SI: buffer_load_ubyte +; SI: buffer_load_ubyte +; SI: buffer_store_dword +; SI: s_endpgm define void @test_copy_v4i8_volatile_load(<4 x i8> addrspace(1)* %out, <4 x i8> addrspace(1)* %in) nounwind { %val = load volatile <4 x i8>, <4 x i8> addrspace(1)* %in, align 4 store <4 x i8> %val, <4 x i8> addrspace(1)* %out, align 4 @@ -148,15 +141,15 @@ define void @test_copy_v4i8_volatile_load(<4 x i8> addrspace(1)* %out, <4 x i8> } ; FUNC-LABEL: {{^}}test_copy_v4i8_volatile_store: -; GCN: buffer_load_ubyte -; GCN: buffer_load_ubyte -; GCN: buffer_load_ubyte -; GCN: buffer_load_ubyte -; GCN: buffer_store_byte -; GCN: buffer_store_byte -; GCN: buffer_store_byte -; GCN: buffer_store_byte -; GCN: s_endpgm +; SI: buffer_load_ubyte +; SI: buffer_load_ubyte +; SI: buffer_load_ubyte +; SI: buffer_load_ubyte +; SI: buffer_store_byte +; SI: buffer_store_byte +; SI: buffer_store_byte +; SI: buffer_store_byte +; SI: s_endpgm define void @test_copy_v4i8_volatile_store(<4 x i8> addrspace(1)* %out, <4 x i8> addrspace(1)* %in) nounwind { %val = load <4 x i8>, <4 x i8> addrspace(1)* %in, align 4 store volatile <4 x i8> %val, <4 x i8> addrspace(1)* %out, align 4 diff --git a/test/CodeGen/AMDGPU/ctlz.ll b/test/CodeGen/AMDGPU/ctlz.ll index 0d0ead69aa0..597f11c490c 100644 --- a/test/CodeGen/AMDGPU/ctlz.ll +++ b/test/CodeGen/AMDGPU/ctlz.ll @@ -100,7 +100,6 @@ define void @v_ctlz_v4i32(<4 x i32> addrspace(1)* noalias %out, <4 x i32> addrsp ; GCN: buffer_load_ubyte [[VAL:v[0-9]+]], ; GCN-DAG: v_ffbh_u32_e32 [[RESULT:v[0-9]+]], [[VAL]] ; GCN: buffer_store_byte [[RESULT]], -; GCN: s_endpgm define void @v_ctlz_i8(i8 addrspace(1)* noalias %out, i8 addrspace(1)* noalias %valptr) nounwind { %val = load i8, i8 addrspace(1)* %valptr %ctlz = call i8 @llvm.ctlz.i8(i8 %val, i1 false) nounwind readnone diff --git a/test/CodeGen/AMDGPU/ctlz_zero_undef.ll b/test/CodeGen/AMDGPU/ctlz_zero_undef.ll index 900938bab70..8c3a93d523b 100644 --- a/test/CodeGen/AMDGPU/ctlz_zero_undef.ll +++ b/test/CodeGen/AMDGPU/ctlz_zero_undef.ll @@ -1,5 +1,5 @@ -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC -check-prefix=GCN %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=FUNC -check-prefix=GCN %s +; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC %s ; RUN: llc -march=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s declare i8 @llvm.ctlz.i8(i8, i1) nounwind readnone diff --git a/test/CodeGen/AMDGPU/cube.ll b/test/CodeGen/AMDGPU/cube.ll index c5d1f86cea7..ab99af5864e 100644 --- a/test/CodeGen/AMDGPU/cube.ll +++ b/test/CodeGen/AMDGPU/cube.ll @@ -30,10 +30,10 @@ define void @cube(<4 x float> addrspace(1)* %out, float %a, float %b, float %c) } ; GCN-LABEL: {{^}}legacy_cube: -; GCN-DAG: v_cubeid_f32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, s{{[0-9]+}} -; GCN-DAG: v_cubesc_f32 v{{[0-9]+}}, v{{[0-9]+}}, 
v{{[0-9]+}}, s{{[0-9]+}} -; GCN-DAG: v_cubetc_f32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, s{{[0-9]+}} -; GCN-DAG: v_cubema_f32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, s{{[0-9]+}} +; GCN-DAG: v_cubeid_f32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} +; GCN-DAG: v_cubesc_f32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} +; GCN-DAG: v_cubetc_f32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} +; GCN-DAG: v_cubema_f32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} ; GCN: buffer_store_dwordx4 define void @legacy_cube(<4 x float> addrspace(1)* %out, <4 x float> %abcx) #1 { %cube = call <4 x float> @llvm.AMDGPU.cube(<4 x float> %abcx) diff --git a/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll b/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll index ed9b8273fa4..36275ff12cf 100644 --- a/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll +++ b/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll @@ -1,15 +1,15 @@ -; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s +; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone declare i32 @llvm.amdgcn.workitem.id.y() nounwind readnone -; GCN-LABEL: {{^}}load_i8_to_f32: -; GCN: buffer_load_ubyte [[LOADREG:v[0-9]+]], -; GCN-NOT: bfe -; GCN-NOT: lshr -; GCN: v_cvt_f32_ubyte0_e32 [[CONV:v[0-9]+]], [[LOADREG]] -; GCN: buffer_store_dword [[CONV]], +; SI-LABEL: {{^}}load_i8_to_f32: +; SI: buffer_load_ubyte [[LOADREG:v[0-9]+]], +; SI-NOT: bfe +; SI-NOT: lshr +; SI: v_cvt_f32_ubyte0_e32 [[CONV:v[0-9]+]], [[LOADREG]] +; SI: buffer_store_dword [[CONV]], define void @load_i8_to_f32(float addrspace(1)* noalias %out, i8 addrspace(1)* noalias %in) nounwind { %load = load i8, i8 addrspace(1)* %in, align 1 %cvt = uitofp i8 %load to float @@ -17,11 +17,11 @@ define void @load_i8_to_f32(float addrspace(1)* noalias %out, i8 addrspace(1)* n ret void } -; GCN-LABEL: {{^}}load_v2i8_to_v2f32: -; GCN: buffer_load_ushort [[LD:v[0-9]+]] -; GCN-DAG: v_cvt_f32_ubyte1_e32 v[[HIRESULT:[0-9]+]], [[LD]] -; GCN-DAG: v_cvt_f32_ubyte0_e32 v[[LORESULT:[0-9]+]], [[LD]] -; GCN: buffer_store_dwordx2 v{{\[}}[[LORESULT]]:[[HIRESULT]]{{\]}}, +; SI-LABEL: {{^}}load_v2i8_to_v2f32: +; SI: buffer_load_ushort [[LD:v[0-9]+]] +; SI-DAG: v_cvt_f32_ubyte1_e32 v[[HIRESULT:[0-9]+]], [[LD]] +; SI-DAG: v_cvt_f32_ubyte0_e32 v[[LORESULT:[0-9]+]], [[LD]] +; SI: buffer_store_dwordx2 v{{\[}}[[LORESULT]]:[[HIRESULT]]{{\]}}, define void @load_v2i8_to_v2f32(<2 x float> addrspace(1)* noalias %out, <2 x i8> addrspace(1)* noalias %in) nounwind { %load = load <2 x i8>, <2 x i8> addrspace(1)* %in, align 2 %cvt = uitofp <2 x i8> %load to <2 x float> @@ -29,13 +29,13 @@ define void @load_v2i8_to_v2f32(<2 x float> addrspace(1)* noalias %out, <2 x i8> ret void } -; GCN-LABEL: {{^}}load_v3i8_to_v3f32: -; GCN: buffer_load_dword [[VAL:v[0-9]+]] -; GCN-NOT: v_cvt_f32_ubyte3_e32 -; GCN-DAG: v_cvt_f32_ubyte2_e32 v{{[0-9]+}}, [[VAL]] -; GCN-DAG: v_cvt_f32_ubyte1_e32 v[[HIRESULT:[0-9]+]], [[VAL]] -; GCN-DAG: v_cvt_f32_ubyte0_e32 v[[LORESULT:[0-9]+]], [[VAL]] -; GCN: buffer_store_dwordx2 v{{\[}}[[LORESULT]]:[[HIRESULT]]{{\]}}, +; SI-LABEL: {{^}}load_v3i8_to_v3f32: +; SI: buffer_load_dword [[VAL:v[0-9]+]] +; SI-NOT: v_cvt_f32_ubyte3_e32 +; SI-DAG: v_cvt_f32_ubyte2_e32 v{{[0-9]+}}, [[VAL]] +; SI-DAG: v_cvt_f32_ubyte1_e32 
v[[HIRESULT:[0-9]+]], [[VAL]] +; SI-DAG: v_cvt_f32_ubyte0_e32 v[[LORESULT:[0-9]+]], [[VAL]] +; SI: buffer_store_dwordx2 v{{\[}}[[LORESULT]]:[[HIRESULT]]{{\]}}, define void @load_v3i8_to_v3f32(<3 x float> addrspace(1)* noalias %out, <3 x i8> addrspace(1)* noalias %in) nounwind { %load = load <3 x i8>, <3 x i8> addrspace(1)* %in, align 4 %cvt = uitofp <3 x i8> %load to <3 x float> @@ -43,15 +43,15 @@ define void @load_v3i8_to_v3f32(<3 x float> addrspace(1)* noalias %out, <3 x i8> ret void } -; GCN-LABEL: {{^}}load_v4i8_to_v4f32: -; GCN: buffer_load_dword [[LOADREG:v[0-9]+]] -; GCN-NOT: bfe -; GCN-NOT: lshr -; GCN-DAG: v_cvt_f32_ubyte3_e32 v[[HIRESULT:[0-9]+]], [[LOADREG]] -; GCN-DAG: v_cvt_f32_ubyte2_e32 v{{[0-9]+}}, [[LOADREG]] -; GCN-DAG: v_cvt_f32_ubyte1_e32 v{{[0-9]+}}, [[LOADREG]] -; GCN-DAG: v_cvt_f32_ubyte0_e32 v[[LORESULT:[0-9]+]], [[LOADREG]] -; GCN: buffer_store_dwordx4 v{{\[}}[[LORESULT]]:[[HIRESULT]]{{\]}}, +; SI-LABEL: {{^}}load_v4i8_to_v4f32: +; SI: buffer_load_dword [[LOADREG:v[0-9]+]] +; SI-NOT: bfe +; SI-NOT: lshr +; SI-DAG: v_cvt_f32_ubyte3_e32 v[[HIRESULT:[0-9]+]], [[LOADREG]] +; SI-DAG: v_cvt_f32_ubyte2_e32 v{{[0-9]+}}, [[LOADREG]] +; SI-DAG: v_cvt_f32_ubyte1_e32 v{{[0-9]+}}, [[LOADREG]] +; SI-DAG: v_cvt_f32_ubyte0_e32 v[[LORESULT:[0-9]+]], [[LOADREG]] +; SI: buffer_store_dwordx4 v{{\[}}[[LORESULT]]:[[HIRESULT]]{{\]}}, define void @load_v4i8_to_v4f32(<4 x float> addrspace(1)* noalias %out, <4 x i8> addrspace(1)* noalias %in) nounwind { %load = load <4 x i8>, <4 x i8> addrspace(1)* %in, align 4 %cvt = uitofp <4 x i8> %load to <4 x float> @@ -63,19 +63,19 @@ define void @load_v4i8_to_v4f32(<4 x float> addrspace(1)* noalias %out, <4 x i8> ; position in the word for the component. ; FIXME: Packing bytes -; GCN-LABEL: {{^}}load_v4i8_to_v4f32_unaligned: -; GCN: buffer_load_ubyte [[LOADREG3:v[0-9]+]] -; GCN: buffer_load_ubyte [[LOADREG2:v[0-9]+]] -; GCN: buffer_load_ubyte [[LOADREG1:v[0-9]+]] -; GCN: buffer_load_ubyte [[LOADREG0:v[0-9]+]] -; GCN-DAG: v_lshlrev_b32 -; GCN-DAG: v_or_b32 -; GCN-DAG: v_cvt_f32_ubyte0_e32 v[[LORESULT:[0-9]+]], -; GCN-DAG: v_cvt_f32_ubyte0_e32 v{{[0-9]+}}, -; GCN-DAG: v_cvt_f32_ubyte0_e32 v{{[0-9]+}}, -; GCN-DAG: v_cvt_f32_ubyte0_e32 v[[HIRESULT:[0-9]+]] - -; GCN: buffer_store_dwordx4 +; SI-LABEL: {{^}}load_v4i8_to_v4f32_unaligned: +; SI: buffer_load_ubyte [[LOADREG3:v[0-9]+]] +; SI: buffer_load_ubyte [[LOADREG2:v[0-9]+]] +; SI: buffer_load_ubyte [[LOADREG1:v[0-9]+]] +; SI: buffer_load_ubyte [[LOADREG0:v[0-9]+]] +; SI-DAG: v_lshlrev_b32 +; SI-DAG: v_or_b32 +; SI-DAG: v_cvt_f32_ubyte0_e32 v[[LORESULT:[0-9]+]], +; SI-DAG: v_cvt_f32_ubyte0_e32 v{{[0-9]+}}, +; SI-DAG: v_cvt_f32_ubyte0_e32 v{{[0-9]+}}, +; SI-DAG: v_cvt_f32_ubyte0_e32 v[[HIRESULT:[0-9]+]] + +; SI: buffer_store_dwordx4 define void @load_v4i8_to_v4f32_unaligned(<4 x float> addrspace(1)* noalias %out, <4 x i8> addrspace(1)* noalias %in) nounwind { %load = load <4 x i8>, <4 x i8> addrspace(1)* %in, align 1 %cvt = uitofp <4 x i8> %load to <4 x float> @@ -85,31 +85,25 @@ define void @load_v4i8_to_v4f32_unaligned(<4 x float> addrspace(1)* noalias %out ; FIXME: Need to handle non-uniform case for function below (load without gep). ; Instructions still emitted to repack bytes for add use. 
- -; GCN-LABEL: {{^}}load_v4i8_to_v4f32_2_uses: -; GCN: {{buffer|flat}}_load_dword -; GCN-DAG: v_cvt_f32_ubyte0_e32 -; GCN-DAG: v_cvt_f32_ubyte1_e32 -; GCN-DAG: v_cvt_f32_ubyte2_e32 -; GCN-DAG: v_cvt_f32_ubyte3_e32 - -; GCN-DAG: v_lshrrev_b32_e32 v{{[0-9]+}}, 24 -; GCN-DAG: v_lshrrev_b32_e32 v{{[0-9]+}}, 16 - +; SI-LABEL: {{^}}load_v4i8_to_v4f32_2_uses: +; SI: {{buffer|flat}}_load_dword +; SI-DAG: v_cvt_f32_ubyte0_e32 +; SI-DAG: v_cvt_f32_ubyte1_e32 +; SI-DAG: v_cvt_f32_ubyte2_e32 +; SI-DAG: v_cvt_f32_ubyte3_e32 + +; SI-DAG: v_lshrrev_b32_e32 v{{[0-9]+}}, 24 +; SI-DAG: v_lshrrev_b32_e32 v{{[0-9]+}}, 16 ; SI-DAG: v_lshlrev_b32_e32 v{{[0-9]+}}, 16 ; SI-DAG: v_lshlrev_b32_e32 v{{[0-9]+}}, 8 ; SI-DAG: v_and_b32_e32 v{{[0-9]+}}, 0xffff, ; SI-DAG: v_and_b32_e32 v{{[0-9]+}}, 0xff00, ; SI-DAG: v_add_i32 -; VI-DAG: v_and_b32_e32 v{{[0-9]+}}, 0xffffff00, -; VI-DAG: v_add_u16_e32 -; VI-DAG: v_add_u16_e32 - -; GCN: {{buffer|flat}}_store_dwordx4 -; GCN: {{buffer|flat}}_store_dword +; SI: {{buffer|flat}}_store_dwordx4 +; SI: {{buffer|flat}}_store_dword -; GCN: s_endpgm +; SI: s_endpgm define void @load_v4i8_to_v4f32_2_uses(<4 x float> addrspace(1)* noalias %out, <4 x i8> addrspace(1)* noalias %out2, <4 x i8> addrspace(1)* noalias %in) nounwind { %tid.x = call i32 @llvm.amdgcn.workitem.id.x() %in.ptr = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %in, i32 %tid.x @@ -122,8 +116,8 @@ define void @load_v4i8_to_v4f32_2_uses(<4 x float> addrspace(1)* noalias %out, < } ; Make sure this doesn't crash. -; GCN-LABEL: {{^}}load_v7i8_to_v7f32: -; GCN: s_endpgm +; SI-LABEL: {{^}}load_v7i8_to_v7f32: +; SI: s_endpgm define void @load_v7i8_to_v7f32(<7 x float> addrspace(1)* noalias %out, <7 x i8> addrspace(1)* noalias %in) nounwind { %load = load <7 x i8>, <7 x i8> addrspace(1)* %in, align 1 %cvt = uitofp <7 x i8> %load to <7 x float> @@ -131,22 +125,22 @@ define void @load_v7i8_to_v7f32(<7 x float> addrspace(1)* noalias %out, <7 x i8> ret void } -; GCN-LABEL: {{^}}load_v8i8_to_v8f32: -; GCN: buffer_load_dwordx2 v{{\[}}[[LOLOAD:[0-9]+]]:[[HILOAD:[0-9]+]]{{\]}}, -; GCN-NOT: bfe -; GCN-NOT: lshr -; GCN-DAG: v_cvt_f32_ubyte3_e32 v{{[0-9]+}}, v[[LOLOAD]] -; GCN-DAG: v_cvt_f32_ubyte2_e32 v{{[0-9]+}}, v[[LOLOAD]] -; GCN-DAG: v_cvt_f32_ubyte1_e32 v{{[0-9]+}}, v[[LOLOAD]] -; GCN-DAG: v_cvt_f32_ubyte0_e32 v{{[0-9]+}}, v[[LOLOAD]] -; GCN-DAG: v_cvt_f32_ubyte3_e32 v{{[0-9]+}}, v[[HILOAD]] -; GCN-DAG: v_cvt_f32_ubyte2_e32 v{{[0-9]+}}, v[[HILOAD]] -; GCN-DAG: v_cvt_f32_ubyte1_e32 v{{[0-9]+}}, v[[HILOAD]] -; GCN-DAG: v_cvt_f32_ubyte0_e32 v{{[0-9]+}}, v[[HILOAD]] -; GCN-NOT: bfe -; GCN-NOT: lshr -; GCN: buffer_store_dwordx4 -; GCN: buffer_store_dwordx4 +; SI-LABEL: {{^}}load_v8i8_to_v8f32: +; SI: buffer_load_dwordx2 v{{\[}}[[LOLOAD:[0-9]+]]:[[HILOAD:[0-9]+]]{{\]}}, +; SI-NOT: bfe +; SI-NOT: lshr +; SI-DAG: v_cvt_f32_ubyte3_e32 v{{[0-9]+}}, v[[LOLOAD]] +; SI-DAG: v_cvt_f32_ubyte2_e32 v{{[0-9]+}}, v[[LOLOAD]] +; SI-DAG: v_cvt_f32_ubyte1_e32 v{{[0-9]+}}, v[[LOLOAD]] +; SI-DAG: v_cvt_f32_ubyte0_e32 v{{[0-9]+}}, v[[LOLOAD]] +; SI-DAG: v_cvt_f32_ubyte3_e32 v{{[0-9]+}}, v[[HILOAD]] +; SI-DAG: v_cvt_f32_ubyte2_e32 v{{[0-9]+}}, v[[HILOAD]] +; SI-DAG: v_cvt_f32_ubyte1_e32 v{{[0-9]+}}, v[[HILOAD]] +; SI-DAG: v_cvt_f32_ubyte0_e32 v{{[0-9]+}}, v[[HILOAD]] +; SI-NOT: bfe +; SI-NOT: lshr +; SI: buffer_store_dwordx4 +; SI: buffer_store_dwordx4 define void @load_v8i8_to_v8f32(<8 x float> addrspace(1)* noalias %out, <8 x i8> addrspace(1)* noalias %in) nounwind { %load = load <8 x i8>, <8 x i8> addrspace(1)* %in, align 8 %cvt = uitofp <8 x i8> %load 
to <8 x float> @@ -154,11 +148,11 @@ define void @load_v8i8_to_v8f32(<8 x float> addrspace(1)* noalias %out, <8 x i8> ret void } -; GCN-LABEL: {{^}}i8_zext_inreg_i32_to_f32: -; GCN: buffer_load_dword [[LOADREG:v[0-9]+]], -; GCN: v_add_i32_e32 [[ADD:v[0-9]+]], vcc, 2, [[LOADREG]] -; GCN-NEXT: v_cvt_f32_ubyte0_e32 [[CONV:v[0-9]+]], [[ADD]] -; GCN: buffer_store_dword [[CONV]], +; SI-LABEL: {{^}}i8_zext_inreg_i32_to_f32: +; SI: buffer_load_dword [[LOADREG:v[0-9]+]], +; SI: v_add_i32_e32 [[ADD:v[0-9]+]], vcc, 2, [[LOADREG]] +; SI-NEXT: v_cvt_f32_ubyte0_e32 [[CONV:v[0-9]+]], [[ADD]] +; SI: buffer_store_dword [[CONV]], define void @i8_zext_inreg_i32_to_f32(float addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind { %load = load i32, i32 addrspace(1)* %in, align 4 %add = add i32 %load, 2 @@ -168,7 +162,7 @@ define void @i8_zext_inreg_i32_to_f32(float addrspace(1)* noalias %out, i32 addr ret void } -; GCN-LABEL: {{^}}i8_zext_inreg_hi1_to_f32: +; SI-LABEL: {{^}}i8_zext_inreg_hi1_to_f32: define void @i8_zext_inreg_hi1_to_f32(float addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind { %load = load i32, i32 addrspace(1)* %in, align 4 %inreg = and i32 %load, 65280 @@ -180,7 +174,7 @@ define void @i8_zext_inreg_hi1_to_f32(float addrspace(1)* noalias %out, i32 addr ; We don't get these ones because of the zext, but instcombine removes ; them so it shouldn't really matter. -; GCN-LABEL: {{^}}i8_zext_i32_to_f32: +; SI-LABEL: {{^}}i8_zext_i32_to_f32: define void @i8_zext_i32_to_f32(float addrspace(1)* noalias %out, i8 addrspace(1)* noalias %in) nounwind { %load = load i8, i8 addrspace(1)* %in, align 1 %ext = zext i8 %load to i32 @@ -189,7 +183,7 @@ define void @i8_zext_i32_to_f32(float addrspace(1)* noalias %out, i8 addrspace(1 ret void } -; GCN-LABEL: {{^}}v4i8_zext_v4i32_to_v4f32: +; SI-LABEL: {{^}}v4i8_zext_v4i32_to_v4f32: define void @v4i8_zext_v4i32_to_v4f32(<4 x float> addrspace(1)* noalias %out, <4 x i8> addrspace(1)* noalias %in) nounwind { %load = load <4 x i8>, <4 x i8> addrspace(1)* %in, align 1 %ext = zext <4 x i8> %load to <4 x i32> @@ -198,11 +192,11 @@ define void @v4i8_zext_v4i32_to_v4f32(<4 x float> addrspace(1)* noalias %out, <4 ret void } -; GCN-LABEL: {{^}}extract_byte0_to_f32: -; GCN: buffer_load_dword [[VAL:v[0-9]+]] -; GCN-NOT: [[VAL]] -; GCN: v_cvt_f32_ubyte0_e32 [[CONV:v[0-9]+]], [[VAL]] -; GCN: buffer_store_dword [[CONV]] +; SI-LABEL: {{^}}extract_byte0_to_f32: +; SI: buffer_load_dword [[VAL:v[0-9]+]] +; SI-NOT: [[VAL]] +; SI: v_cvt_f32_ubyte0_e32 [[CONV:v[0-9]+]], [[VAL]] +; SI: buffer_store_dword [[CONV]] define void @extract_byte0_to_f32(float addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind { %val = load i32, i32 addrspace(1)* %in %and = and i32 %val, 255 @@ -211,11 +205,11 @@ define void @extract_byte0_to_f32(float addrspace(1)* noalias %out, i32 addrspac ret void } -; GCN-LABEL: {{^}}extract_byte1_to_f32: -; GCN: buffer_load_dword [[VAL:v[0-9]+]] -; GCN-NOT: [[VAL]] -; GCN: v_cvt_f32_ubyte1_e32 [[CONV:v[0-9]+]], [[VAL]] -; GCN: buffer_store_dword [[CONV]] +; SI-LABEL: {{^}}extract_byte1_to_f32: +; SI: buffer_load_dword [[VAL:v[0-9]+]] +; SI-NOT: [[VAL]] +; SI: v_cvt_f32_ubyte1_e32 [[CONV:v[0-9]+]], [[VAL]] +; SI: buffer_store_dword [[CONV]] define void @extract_byte1_to_f32(float addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind { %val = load i32, i32 addrspace(1)* %in %srl = lshr i32 %val, 8 @@ -225,11 +219,11 @@ define void @extract_byte1_to_f32(float addrspace(1)* noalias %out, i32 addrspac ret void } -; 
GCN-LABEL: {{^}}extract_byte2_to_f32: -; GCN: buffer_load_dword [[VAL:v[0-9]+]] -; GCN-NOT: [[VAL]] -; GCN: v_cvt_f32_ubyte2_e32 [[CONV:v[0-9]+]], [[VAL]] -; GCN: buffer_store_dword [[CONV]] +; SI-LABEL: {{^}}extract_byte2_to_f32: +; SI: buffer_load_dword [[VAL:v[0-9]+]] +; SI-NOT: [[VAL]] +; SI: v_cvt_f32_ubyte2_e32 [[CONV:v[0-9]+]], [[VAL]] +; SI: buffer_store_dword [[CONV]] define void @extract_byte2_to_f32(float addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind { %val = load i32, i32 addrspace(1)* %in %srl = lshr i32 %val, 16 @@ -239,11 +233,11 @@ define void @extract_byte2_to_f32(float addrspace(1)* noalias %out, i32 addrspac ret void } -; GCN-LABEL: {{^}}extract_byte3_to_f32: -; GCN: buffer_load_dword [[VAL:v[0-9]+]] -; GCN-NOT: [[VAL]] -; GCN: v_cvt_f32_ubyte3_e32 [[CONV:v[0-9]+]], [[VAL]] -; GCN: buffer_store_dword [[CONV]] +; SI-LABEL: {{^}}extract_byte3_to_f32: +; SI: buffer_load_dword [[VAL:v[0-9]+]] +; SI-NOT: [[VAL]] +; SI: v_cvt_f32_ubyte3_e32 [[CONV:v[0-9]+]], [[VAL]] +; SI: buffer_store_dword [[CONV]] define void @extract_byte3_to_f32(float addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind { %val = load i32, i32 addrspace(1)* %in %srl = lshr i32 %val, 24 diff --git a/test/CodeGen/AMDGPU/global-extload-i16.ll b/test/CodeGen/AMDGPU/global-extload-i16.ll deleted file mode 100644 index 4d999299716..00000000000 --- a/test/CodeGen/AMDGPU/global-extload-i16.ll +++ /dev/null @@ -1,302 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs< %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs< %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; XUN: llc -march=r600 -mcpu=cypress < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s -; FIXME: cypress is broken because the bigger testcases spill and it's not implemented - -; FUNC-LABEL: {{^}}zextload_global_i16_to_i32: -; SI: buffer_load_ushort -; SI: buffer_store_dword -; SI: s_endpgm -define void @zextload_global_i16_to_i32(i32 addrspace(1)* %out, i16 addrspace(1)* %in) nounwind { - %a = load i16, i16 addrspace(1)* %in - %ext = zext i16 %a to i32 - store i32 %ext, i32 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}sextload_global_i16_to_i32: -; SI: buffer_load_sshort -; SI: buffer_store_dword -; SI: s_endpgm -define void @sextload_global_i16_to_i32(i32 addrspace(1)* %out, i16 addrspace(1)* %in) nounwind { - %a = load i16, i16 addrspace(1)* %in - %ext = sext i16 %a to i32 - store i32 %ext, i32 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}zextload_global_v1i16_to_v1i32: -; SI: buffer_load_ushort -; SI: s_endpgm -define void @zextload_global_v1i16_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x i16> addrspace(1)* nocapture %in) nounwind { - %load = load <1 x i16>, <1 x i16> addrspace(1)* %in - %ext = zext <1 x i16> %load to <1 x i32> - store <1 x i32> %ext, <1 x i32> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}sextload_global_v1i16_to_v1i32: -; SI: buffer_load_sshort -; SI: s_endpgm -define void @sextload_global_v1i16_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x i16> addrspace(1)* nocapture %in) nounwind { - %load = load <1 x i16>, <1 x i16> addrspace(1)* %in - %ext = sext <1 x i16> %load to <1 x i32> - store <1 x i32> %ext, <1 x i32> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}zextload_global_v2i16_to_v2i32: -; SI: s_endpgm -define void @zextload_global_v2i16_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i16> addrspace(1)* nocapture %in) nounwind { - %load = load <2 x i16>, <2 x i16> 
addrspace(1)* %in - %ext = zext <2 x i16> %load to <2 x i32> - store <2 x i32> %ext, <2 x i32> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}sextload_global_v2i16_to_v2i32: -; SI: s_endpgm -define void @sextload_global_v2i16_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i16> addrspace(1)* nocapture %in) nounwind { - %load = load <2 x i16>, <2 x i16> addrspace(1)* %in - %ext = sext <2 x i16> %load to <2 x i32> - store <2 x i32> %ext, <2 x i32> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}zextload_global_v4i16_to_v4i32: -; SI: s_endpgm -define void @zextload_global_v4i16_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i16> addrspace(1)* nocapture %in) nounwind { - %load = load <4 x i16>, <4 x i16> addrspace(1)* %in - %ext = zext <4 x i16> %load to <4 x i32> - store <4 x i32> %ext, <4 x i32> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}sextload_global_v4i16_to_v4i32: -; SI: s_endpgm -define void @sextload_global_v4i16_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i16> addrspace(1)* nocapture %in) nounwind { - %load = load <4 x i16>, <4 x i16> addrspace(1)* %in - %ext = sext <4 x i16> %load to <4 x i32> - store <4 x i32> %ext, <4 x i32> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}zextload_global_v8i16_to_v8i32: -; SI: s_endpgm -define void @zextload_global_v8i16_to_v8i32(<8 x i32> addrspace(1)* %out, <8 x i16> addrspace(1)* nocapture %in) nounwind { - %load = load <8 x i16>, <8 x i16> addrspace(1)* %in - %ext = zext <8 x i16> %load to <8 x i32> - store <8 x i32> %ext, <8 x i32> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}sextload_global_v8i16_to_v8i32: -; SI: s_endpgm -define void @sextload_global_v8i16_to_v8i32(<8 x i32> addrspace(1)* %out, <8 x i16> addrspace(1)* nocapture %in) nounwind { - %load = load <8 x i16>, <8 x i16> addrspace(1)* %in - %ext = sext <8 x i16> %load to <8 x i32> - store <8 x i32> %ext, <8 x i32> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}zextload_global_v16i16_to_v16i32: -; SI: s_endpgm -define void @zextload_global_v16i16_to_v16i32(<16 x i32> addrspace(1)* %out, <16 x i16> addrspace(1)* nocapture %in) nounwind { - %load = load <16 x i16>, <16 x i16> addrspace(1)* %in - %ext = zext <16 x i16> %load to <16 x i32> - store <16 x i32> %ext, <16 x i32> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}sextload_global_v16i16_to_v16i32: -; SI: s_endpgm -define void @sextload_global_v16i16_to_v16i32(<16 x i32> addrspace(1)* %out, <16 x i16> addrspace(1)* nocapture %in) nounwind { - %load = load <16 x i16>, <16 x i16> addrspace(1)* %in - %ext = sext <16 x i16> %load to <16 x i32> - store <16 x i32> %ext, <16 x i32> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}zextload_global_v32i16_to_v32i32: -; SI: s_endpgm -define void @zextload_global_v32i16_to_v32i32(<32 x i32> addrspace(1)* %out, <32 x i16> addrspace(1)* nocapture %in) nounwind { - %load = load <32 x i16>, <32 x i16> addrspace(1)* %in - %ext = zext <32 x i16> %load to <32 x i32> - store <32 x i32> %ext, <32 x i32> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}sextload_global_v32i16_to_v32i32: -; SI: s_endpgm -define void @sextload_global_v32i16_to_v32i32(<32 x i32> addrspace(1)* %out, <32 x i16> addrspace(1)* nocapture %in) nounwind { - %load = load <32 x i16>, <32 x i16> addrspace(1)* %in - %ext = sext <32 x i16> %load to <32 x i32> - store <32 x i32> %ext, <32 x i32> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}zextload_global_v64i16_to_v64i32: -; SI: s_endpgm -define void @zextload_global_v64i16_to_v64i32(<64 x i32> addrspace(1)* 
%out, <64 x i16> addrspace(1)* nocapture %in) nounwind { - %load = load <64 x i16>, <64 x i16> addrspace(1)* %in - %ext = zext <64 x i16> %load to <64 x i32> - store <64 x i32> %ext, <64 x i32> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}sextload_global_v64i16_to_v64i32: -; SI: s_endpgm -define void @sextload_global_v64i16_to_v64i32(<64 x i32> addrspace(1)* %out, <64 x i16> addrspace(1)* nocapture %in) nounwind { - %load = load <64 x i16>, <64 x i16> addrspace(1)* %in - %ext = sext <64 x i16> %load to <64 x i32> - store <64 x i32> %ext, <64 x i32> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}zextload_global_i16_to_i64: -; SI-DAG: buffer_load_ushort v[[LO:[0-9]+]], -; SI-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0{{$}} -; SI: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]] -define void @zextload_global_i16_to_i64(i64 addrspace(1)* %out, i16 addrspace(1)* %in) nounwind { - %a = load i16, i16 addrspace(1)* %in - %ext = zext i16 %a to i64 - store i64 %ext, i64 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}sextload_global_i16_to_i64: -; VI: buffer_load_ushort [[LOAD:v[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0 -; VI: v_ashrrev_i32_e32 v{{[0-9]+}}, 31, [[LOAD]] -; VI: buffer_store_dwordx2 v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], 0 -define void @sextload_global_i16_to_i64(i64 addrspace(1)* %out, i16 addrspace(1)* %in) nounwind { - %a = load i16, i16 addrspace(1)* %in - %ext = sext i16 %a to i64 - store i64 %ext, i64 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}zextload_global_v1i16_to_v1i64: -; SI: s_endpgm -define void @zextload_global_v1i16_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i16> addrspace(1)* nocapture %in) nounwind { - %load = load <1 x i16>, <1 x i16> addrspace(1)* %in - %ext = zext <1 x i16> %load to <1 x i64> - store <1 x i64> %ext, <1 x i64> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}sextload_global_v1i16_to_v1i64: -; SI: s_endpgm -define void @sextload_global_v1i16_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i16> addrspace(1)* nocapture %in) nounwind { - %load = load <1 x i16>, <1 x i16> addrspace(1)* %in - %ext = sext <1 x i16> %load to <1 x i64> - store <1 x i64> %ext, <1 x i64> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}zextload_global_v2i16_to_v2i64: -; SI: s_endpgm -define void @zextload_global_v2i16_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i16> addrspace(1)* nocapture %in) nounwind { - %load = load <2 x i16>, <2 x i16> addrspace(1)* %in - %ext = zext <2 x i16> %load to <2 x i64> - store <2 x i64> %ext, <2 x i64> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}sextload_global_v2i16_to_v2i64: -; SI: s_endpgm -define void @sextload_global_v2i16_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i16> addrspace(1)* nocapture %in) nounwind { - %load = load <2 x i16>, <2 x i16> addrspace(1)* %in - %ext = sext <2 x i16> %load to <2 x i64> - store <2 x i64> %ext, <2 x i64> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}zextload_global_v4i16_to_v4i64: -; SI: s_endpgm -define void @zextload_global_v4i16_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i16> addrspace(1)* nocapture %in) nounwind { - %load = load <4 x i16>, <4 x i16> addrspace(1)* %in - %ext = zext <4 x i16> %load to <4 x i64> - store <4 x i64> %ext, <4 x i64> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}sextload_global_v4i16_to_v4i64: -; SI: s_endpgm -define void @sextload_global_v4i16_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i16> addrspace(1)* nocapture %in) nounwind { - %load = load <4 x i16>, <4 x i16> addrspace(1)* %in - %ext = sext <4 x i16> %load to 
<4 x i64> - store <4 x i64> %ext, <4 x i64> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}zextload_global_v8i16_to_v8i64: -; SI: s_endpgm -define void @zextload_global_v8i16_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i16> addrspace(1)* nocapture %in) nounwind { - %load = load <8 x i16>, <8 x i16> addrspace(1)* %in - %ext = zext <8 x i16> %load to <8 x i64> - store <8 x i64> %ext, <8 x i64> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}sextload_global_v8i16_to_v8i64: -; SI: s_endpgm -define void @sextload_global_v8i16_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i16> addrspace(1)* nocapture %in) nounwind { - %load = load <8 x i16>, <8 x i16> addrspace(1)* %in - %ext = sext <8 x i16> %load to <8 x i64> - store <8 x i64> %ext, <8 x i64> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}zextload_global_v16i16_to_v16i64: -; SI: s_endpgm -define void @zextload_global_v16i16_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i16> addrspace(1)* nocapture %in) nounwind { - %load = load <16 x i16>, <16 x i16> addrspace(1)* %in - %ext = zext <16 x i16> %load to <16 x i64> - store <16 x i64> %ext, <16 x i64> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}sextload_global_v16i16_to_v16i64: -; SI: s_endpgm -define void @sextload_global_v16i16_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i16> addrspace(1)* nocapture %in) nounwind { - %load = load <16 x i16>, <16 x i16> addrspace(1)* %in - %ext = sext <16 x i16> %load to <16 x i64> - store <16 x i64> %ext, <16 x i64> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}zextload_global_v32i16_to_v32i64: -; SI: s_endpgm -define void @zextload_global_v32i16_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i16> addrspace(1)* nocapture %in) nounwind { - %load = load <32 x i16>, <32 x i16> addrspace(1)* %in - %ext = zext <32 x i16> %load to <32 x i64> - store <32 x i64> %ext, <32 x i64> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}sextload_global_v32i16_to_v32i64: -; SI: s_endpgm -define void @sextload_global_v32i16_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i16> addrspace(1)* nocapture %in) nounwind { - %load = load <32 x i16>, <32 x i16> addrspace(1)* %in - %ext = sext <32 x i16> %load to <32 x i64> - store <32 x i64> %ext, <32 x i64> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}zextload_global_v64i16_to_v64i64: -; SI: s_endpgm -define void @zextload_global_v64i16_to_v64i64(<64 x i64> addrspace(1)* %out, <64 x i16> addrspace(1)* nocapture %in) nounwind { - %load = load <64 x i16>, <64 x i16> addrspace(1)* %in - %ext = zext <64 x i16> %load to <64 x i64> - store <64 x i64> %ext, <64 x i64> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}sextload_global_v64i16_to_v64i64: -; SI: s_endpgm -define void @sextload_global_v64i16_to_v64i64(<64 x i64> addrspace(1)* %out, <64 x i16> addrspace(1)* nocapture %in) nounwind { - %load = load <64 x i16>, <64 x i16> addrspace(1)* %in - %ext = sext <64 x i16> %load to <64 x i64> - store <64 x i64> %ext, <64 x i64> addrspace(1)* %out - ret void -} diff --git a/test/CodeGen/AMDGPU/half.ll b/test/CodeGen/AMDGPU/half.ll index b63ba8e3632..aa1f5b7362d 100644 --- a/test/CodeGen/AMDGPU/half.ll +++ b/test/CodeGen/AMDGPU/half.ll @@ -379,33 +379,19 @@ define void @global_extload_v2f16_to_v2f64(<2 x double> addrspace(1)* %out, <2 x ; GCN-LABEL: {{^}}global_extload_v3f16_to_v3f64: -; XSI: buffer_load_dwordx2 [[LOAD:v\[[0-9]+:[0-9]+\]]] -; XSI: v_cvt_f32_f16_e32 -; XSI: v_cvt_f32_f16_e32 -; XSI-DAG: v_lshrrev_b32_e32 {{v[0-9]+}}, 16, {{v[0-9]+}} -; XSI: v_cvt_f32_f16_e32 -; XSI-NOT: 
v_cvt_f32_f16 - -; XVI: buffer_load_dwordx2 [[LOAD:v\[[0-9]+:[0-9]+\]]] -; XVI: v_cvt_f32_f16_e32 -; XVI: v_cvt_f32_f16_e32 -; XVI-DAG: v_lshrrev_b32_e32 {{v[0-9]+}}, 16, {{v[0-9]+}} -; XVI: v_cvt_f32_f16_e32 -; XVI-NOT: v_cvt_f32_f16 - -; GCN: buffer_load_dwordx2 v{{\[}}[[IN_LO:[0-9]+]]:[[IN_HI:[0-9]+]] -; GCN: v_cvt_f32_f16_e32 [[Z32:v[0-9]+]], v[[IN_HI]] -; GCN: v_cvt_f32_f16_e32 [[X32:v[0-9]+]], v[[IN_LO]] -; GCN: v_lshrrev_b32_e32 [[Y16:v[0-9]+]], 16, v[[IN_LO]] -; GCN: v_cvt_f32_f16_e32 [[Y32:v[0-9]+]], [[Y16]] - -; GCN: v_cvt_f64_f32_e32 [[Z:v\[[0-9]+:[0-9]+\]]], [[Z32]] -; GCN: v_cvt_f64_f32_e32 v{{\[}}[[XLO:[0-9]+]]:{{[0-9]+}}], [[X32]] -; GCN: v_cvt_f64_f32_e32 v[{{[0-9]+}}:[[YHI:[0-9]+]]{{\]}}, [[Y32]] +; GCN: buffer_load_dwordx2 [[LOAD:v\[[0-9]+:[0-9]+\]]] +; GCN-DAG: v_cvt_f32_f16_e32 +; GCN-DAG: v_lshrrev_b32_e32 {{v[0-9]+}}, 16, {{v[0-9]+}} +; GCN-DAG: v_cvt_f32_f16_e32 +; GCN-DAG: v_cvt_f32_f16_e32 + +; GCN: v_cvt_f64_f32_e32 +; GCN: v_cvt_f64_f32_e32 +; GCN: v_cvt_f64_f32_e32 ; GCN-NOT: v_cvt_f64_f32_e32 -; GCN-DAG: buffer_store_dwordx4 v{{\[}}[[XLO]]:[[YHI]]{{\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}} -; GCN-DAG: buffer_store_dwordx2 [[Z]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:16 +; GCN-DAG: buffer_store_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}} +; GCN-DAG: buffer_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:16 ; GCN: s_endpgm define void @global_extload_v3f16_to_v3f64(<3 x double> addrspace(1)* %out, <3 x half> addrspace(1)* %in) #0 { %val = load <3 x half>, <3 x half> addrspace(1)* %in diff --git a/test/CodeGen/AMDGPU/llvm.AMDGPU.bfe.u32.ll b/test/CodeGen/AMDGPU/llvm.AMDGPU.bfe.u32.ll index bf5d492dca4..541119242a9 100644 --- a/test/CodeGen/AMDGPU/llvm.AMDGPU.bfe.u32.ll +++ b/test/CodeGen/AMDGPU/llvm.AMDGPU.bfe.u32.ll @@ -1,6 +1,5 @@ -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC -check-prefix=GCN %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=FUNC -check-prefix=GCN %s -; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=FUNC -check-prefix=GCN %s +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s ; RUN: llc -march=r600 -mcpu=redwood -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s declare i32 @llvm.AMDGPU.bfe.u32(i32, i32, i32) nounwind readnone @@ -74,14 +73,11 @@ define void @bfe_u32_zextload_i8(i32 addrspace(1)* %out, i8 addrspace(1)* %in) n } ; FUNC-LABEL: {{^}}bfe_u32_zext_in_reg_i8: -; GCN: buffer_load_dword +; SI: buffer_load_dword ; SI: v_add_i32 ; SI-NEXT: v_and_b32_e32 -; FIXME: Should be using s_add_i32 -; VI: v_add_i32 -; VI-NEXT: v_and_b32_e32 ; SI-NOT: {{[^@]}}bfe -; GCN: s_endpgm +; SI: s_endpgm define void @bfe_u32_zext_in_reg_i8(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { %load = load i32, i32 addrspace(1)* %in, align 4 %add = add i32 %load, 1 diff --git a/test/CodeGen/AMDGPU/load-constant-i16.ll b/test/CodeGen/AMDGPU/load-constant-i16.ll index 628d285141b..e3ec6647055 100644 --- a/test/CodeGen/AMDGPU/load-constant-i16.ll +++ b/test/CodeGen/AMDGPU/load-constant-i16.ll @@ -1,6 +1,6 @@ -; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-NOHSA,GCN-NOHSA-SI,FUNC %s -; 
RUN: llc -mtriple=amdgcn--amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN -check-prefix=GCN-HSA -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-NOHSA,GCN-NOHSA-VI,FUNC %s +; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-NOHSA -check-prefix=FUNC %s +; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-HSA -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-NOHSA -check-prefix=FUNC %s ; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s ; FUNC-LABEL: {{^}}constant_load_i16: @@ -428,15 +428,8 @@ define void @constant_zextload_i16_to_i64(i64 addrspace(1)* %out, i16 addrspace( } ; FUNC-LABEL: {{^}}constant_sextload_i16_to_i64: -; FIXME: Need to optimize this sequence to avoid extra bfe: -; t28: i32,ch = load t12, t27, undef:i64 -; t31: i64 = any_extend t28 -; t33: i64 = sign_extend_inreg t31, ValueType:ch:i16 - -; GCN-NOHSA-SI-DAG: buffer_load_sshort v[[LO:[0-9]+]], +; GCN-NOHSA-DAG: buffer_load_sshort v[[LO:[0-9]+]], ; GCN-HSA-DAG: flat_load_sshort v[[LO:[0-9]+]], -; GCN-NOHSA-VI-DAG: buffer_load_ushort v[[ULO:[0-9]+]], -; GCN-NOHSA-VI-DAG: v_bfe_i32 v[[LO:[0-9]+]], v[[ULO]], 0, 16 ; GCN-DAG: v_ashrrev_i32_e32 v[[HI:[0-9]+]], 31, v[[LO]] ; GCN-NOHSA: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]] diff --git a/test/CodeGen/AMDGPU/load-global-i16.ll b/test/CodeGen/AMDGPU/load-global-i16.ll index f398dd32e06..a79c9015593 100644 --- a/test/CodeGen/AMDGPU/load-global-i16.ll +++ b/test/CodeGen/AMDGPU/load-global-i16.ll @@ -1,6 +1,6 @@ -; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-NOHSA,GCN-NOHSA-SI,FUNC %s -; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-HSA,FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-NOHSA,GCN-NOHSA-VI,FUNC %s +; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-NOHSA -check-prefix=FUNC %s +; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-HSA -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-NOHSA -check-prefix=FUNC %s ; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s ; RUN: llc -march=r600 -mcpu=cayman < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s @@ -444,15 +444,8 @@ define void @global_zextload_i16_to_i64(i64 addrspace(1)* %out, i16 addrspace(1) } ; FUNC-LABEL: {{^}}global_sextload_i16_to_i64: -; FIXME: Need to optimize this sequence to avoid extra bfe: -; t28: i32,ch = load t12, t27, undef:i64 -; t31: i64 = any_extend t28 -; t33: i64 = sign_extend_inreg t31, ValueType:ch:i16 - -; GCN-NOHSA-SI-DAG: buffer_load_sshort v[[LO:[0-9]+]], +; GCN-NOHSA-DAG: buffer_load_sshort v[[LO:[0-9]+]], ; GCN-HSA-DAG: flat_load_sshort v[[LO:[0-9]+]], -; GCN-NOHSA-VI-DAG: buffer_load_ushort v[[ULO:[0-9]+]], -; GCN-NOHSA-VI-DAG: v_bfe_i32 v[[LO:[0-9]+]], v[[ULO]], 0, 16 ; GCN-DAG: v_ashrrev_i32_e32 v[[HI:[0-9]+]], 31, v[[LO]] ; GCN-NOHSA: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]] diff --git a/test/CodeGen/AMDGPU/load-global-i8.ll 
b/test/CodeGen/AMDGPU/load-global-i8.ll index b183b6ccd62..ae032a92175 100644 --- a/test/CodeGen/AMDGPU/load-global-i8.ll +++ b/test/CodeGen/AMDGPU/load-global-i8.ll @@ -1,6 +1,6 @@ -; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-NOHSA,SI,FUNC %s -; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-HSA,SI,FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-NOHSA,VI,FUNC %s +; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-NOHSA -check-prefix=FUNC %s +; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-HSA -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-NOHSA -check-prefix=FUNC %s ; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s ; RUN: llc -march=r600 -mcpu=cayman < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s @@ -163,8 +163,7 @@ define void @global_sextload_v2i8_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i8 ; GCN-NOHSA: buffer_load_dword v ; GCN-HSA: flat_load_dword v -; SI-DAG: v_bfe_u32 v{{[0-9]+}}, v{{[0-9]+}}, 8, 8 -; VI-DAG: v_lshrrev_b16_e32 v{{[0-9]+}}, 8, v{{[0-9]+}} +; GCN-DAG: v_bfe_u32 v{{[0-9]+}}, v{{[0-9]+}}, 8, 8 ; GCN-DAG: v_bfe_u32 v{{[0-9]+}}, v{{[0-9]+}}, 16, 8 ; GCN-DAG: v_and_b32_e32 v{{[0-9]+}}, 0xff, @@ -186,16 +185,7 @@ entry: ; GCN-NOHSA: buffer_load_dword v ; GCN-HSA: flat_load_dword v -;FIXME: Need to optimize this sequence to avoid extra shift on VI. - -; t23: i16 = truncate t18 -; t49: i16 = srl t23, Constant:i32<8> -; t57: i32 = any_extend t49 -; t58: i32 = sign_extend_inreg t57, ValueType:ch:i8 - -; SI-DAG: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 8, 8 -; VI-DAG: v_lshrrev_b16_e32 [[SHIFT:v[0-9]+]], 8, v{{[0-9]+}} -; VI-DAG: v_bfe_i32 v{{[0-9]+}}, [[SHIFT]], 0, 8 +; GCN-DAG: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 8, 8 ; GCN-DAG: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 0, 8 ; GCN-DAG: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 16, 8 diff --git a/test/CodeGen/AMDGPU/load-local-i16.ll b/test/CodeGen/AMDGPU/load-local-i16.ll index d4e86de66af..9b0cbaa7701 100644 --- a/test/CodeGen/AMDGPU/load-local-i16.ll +++ b/test/CodeGen/AMDGPU/load-local-i16.ll @@ -1,5 +1,5 @@ -; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SI,FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,VI,FUNC %s +; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC %s ; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s ; FUNC-LABEL: {{^}}local_load_i16: @@ -539,13 +539,7 @@ define void @local_zextload_i16_to_i64(i64 addrspace(3)* %out, i16 addrspace(3)* } ; FUNC-LABEL: {{^}}local_sextload_i16_to_i64: -; FIXME: Need to optimize this sequence to avoid an extra shift. 
-; t25: i32,ch = load t12, t10, undef:i32 -; t28: i64 = any_extend t25 -; t30: i64 = sign_extend_inreg t28, ValueType:ch:i16 -; SI: ds_read_i16 v[[LO:[0-9]+]], -; VI: ds_read_u16 v[[ULO:[0-9]+]] -; VI: v_bfe_i32 v[[LO:[0-9]+]], v[[ULO]], 0, 16 +; GCN: ds_read_i16 v[[LO:[0-9]+]], ; GCN-DAG: v_ashrrev_i32_e32 v[[HI:[0-9]+]], 31, v[[LO]] ; GCN: ds_write_b64 v{{[0-9]+}}, v{{\[}}[[LO]]:[[HI]]] diff --git a/test/CodeGen/AMDGPU/load-local-i8.ll b/test/CodeGen/AMDGPU/load-local-i8.ll index 02b59e89c3f..6b5b1cf9906 100644 --- a/test/CodeGen/AMDGPU/load-local-i8.ll +++ b/test/CodeGen/AMDGPU/load-local-i8.ll @@ -1,5 +1,5 @@ -; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SI,FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,VI,FUNC %s +; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC %s ; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s @@ -141,17 +141,8 @@ define void @local_zextload_v2i8_to_v2i32(<2 x i32> addrspace(3)* %out, <2 x i8> ; GCN-NOT: s_wqm_b64 ; GCN: s_mov_b32 m0 ; GCN: ds_read_u16 -; FIXME: Need to optimize this sequence to avoid extra shift on VI. -; t23: i16 = srl t39, Constant:i32<8> -; t31: i32 = any_extend t23 -; t33: i32 = sign_extend_inreg t31, ValueType:ch:i8 - -; SI-DAG: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 8, 8 -; SI-DAG: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 0, 8 - -; VI-DAG: v_lshrrev_b16_e32 [[SHIFT:v[0-9]+]], 8, v{{[0-9]+}} -; VI-DAG: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 0, 8 -; VI-DAG: v_bfe_i32 v{{[0-9]+}}, [[SHIFT]], 0, 8 +; GCN-DAG: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 8, 8 +; GCN-DAG: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 0, 8 ; EG: LDS_USHORT_READ_RET ; EG-DAG: BFE_INT @@ -166,8 +157,7 @@ define void @local_sextload_v2i8_to_v2i32(<2 x i32> addrspace(3)* %out, <2 x i8> ; FUNC-LABEL: {{^}}local_zextload_v3i8_to_v3i32: ; GCN: ds_read_b32 -; SI-DAG: v_bfe_u32 v{{[0-9]+}}, v{{[0-9]+}}, 8, 8 -; VI-DAG: v_lshrrev_b16_e32 v{{[0-9]+}}, 8, {{v[0-9]+}} +; GCN-DAG: v_bfe_u32 v{{[0-9]+}}, v{{[0-9]+}}, 8, 8 ; GCN-DAG: v_bfe_u32 v{{[0-9]+}}, v{{[0-9]+}}, 16, 8 ; GCN-DAG: v_and_b32_e32 v{{[0-9]+}}, 0xff, diff --git a/test/CodeGen/AMDGPU/mad_uint24.ll b/test/CodeGen/AMDGPU/mad_uint24.ll index 21453471065..f10777546d8 100644 --- a/test/CodeGen/AMDGPU/mad_uint24.ll +++ b/test/CodeGen/AMDGPU/mad_uint24.ll @@ -1,15 +1,11 @@ +; RUN: llc < %s -march=amdgcn -verify-machineinstrs | FileCheck %s --check-prefix=SI --check-prefix=FUNC +; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s --check-prefix=SI --check-prefix=FUNC ; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s --check-prefix=EG --check-prefix=FUNC ; RUN: llc < %s -march=r600 -mcpu=cayman | FileCheck %s --check-prefix=EG --check-prefix=FUNC -; RUN: llc < %s -march=amdgcn -mcpu=SI -verify-machineinstrs | FileCheck %s --check-prefix=SI --check-prefix=FUNC -; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s --check-prefix=VI --check-prefix=FUNC -; RUN: llc < %s -march=amdgcn -mcpu=fiji -verify-machineinstrs | FileCheck %s --check-prefix=VI --check-prefix=FUNC - -declare i32 @llvm.r600.read.tidig.x() nounwind readnone ; FUNC-LABEL: {{^}}u32_mad24: ; EG: MULADD_UINT24 ; SI: v_mad_u32_u24 -; VI: v_mad_u32_u24 define void @u32_mad24(i32 addrspace(1)* %out, i32 %a, i32 %b, i32 %c) { entry: 
@@ -29,9 +25,9 @@ entry: ; The result must be sign-extended ; EG: BFE_INT {{[* ]*}}T{{[0-9]\.[XYZW]}}, PV.[[MAD_CHAN]], 0.0, literal.x ; EG: 16 -; FIXME: Should be using scalar instructions here. -; GCN: v_mad_u32_u24 [[MAD:v[0-9]]], {{[sv][0-9], [sv][0-9]}} -; GCN: v_bfe_i32 v{{[0-9]}}, [[MAD]], 0, 16 +; SI: v_mad_u32_u24 [[MAD:v[0-9]]], {{[sv][0-9], [sv][0-9]}} +; SI: v_bfe_i32 v{{[0-9]}}, [[MAD]], 0, 16 + define void @i16_mad24(i32 addrspace(1)* %out, i16 %a, i16 %b, i16 %c) { entry: %0 = mul i16 %a, %b @@ -41,14 +37,14 @@ entry: ret void } -; FIXME: Need to handle non-uniform case for function below (load without gep). ; FUNC-LABEL: {{^}}i8_mad24: ; EG: MULADD_UINT24 {{[* ]*}}T{{[0-9]}}.[[MAD_CHAN:[XYZW]]] ; The result must be sign-extended ; EG: BFE_INT {{[* ]*}}T{{[0-9]\.[XYZW]}}, PV.[[MAD_CHAN]], 0.0, literal.x ; EG: 8 -; GCN: v_mad_u32_u24 [[MUL:v[0-9]]], {{[sv][0-9], [sv][0-9]}} -; GCN: v_bfe_i32 v{{[0-9]}}, [[MUL]], 0, 8 +; SI: v_mad_u32_u24 [[MUL:v[0-9]]], {{[sv][0-9], [sv][0-9]}} +; SI: v_bfe_i32 v{{[0-9]}}, [[MUL]], 0, 8 + define void @i8_mad24(i32 addrspace(1)* %out, i8 %a, i8 %b, i8 %c) { entry: %0 = mul i8 %a, %b diff --git a/test/CodeGen/AMDGPU/max.i16.ll b/test/CodeGen/AMDGPU/max.i16.ll deleted file mode 100644 index 0b0e026c5fa..00000000000 --- a/test/CodeGen/AMDGPU/max.i16.ll +++ /dev/null @@ -1,87 +0,0 @@ -; RUN: llc < %s -march=amdgcn -mcpu=fiji -verify-machineinstrs | FileCheck -check-prefix=GCN -check-prefix=VI %s - - -declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone - -; FIXME: Need to handle non-uniform case for function below (load without gep). -; GCN-LABEL: {{^}}v_test_imax_sge_i16: -; VI: v_max_i16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -define void @v_test_imax_sge_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %aptr, i16 addrspace(1)* %bptr) nounwind { - %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone - %gep0 = getelementptr i16, i16 addrspace(1)* %aptr, i32 %tid - %gep1 = getelementptr i16, i16 addrspace(1)* %bptr, i32 %tid - %outgep = getelementptr i16, i16 addrspace(1)* %out, i32 %tid - %a = load i16, i16 addrspace(1)* %gep0, align 4 - %b = load i16, i16 addrspace(1)* %gep1, align 4 - %cmp = icmp sge i16 %a, %b - %val = select i1 %cmp, i16 %a, i16 %b - store i16 %val, i16 addrspace(1)* %outgep, align 4 - ret void -} - -; FIXME: Need to handle non-uniform case for function below (load without gep). -; GCN-LABEL: {{^}}v_test_imax_sge_v4i16: -; VI: v_max_i16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -; VI: v_max_i16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -; VI: v_max_i16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -; VI: v_max_i16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -define void @v_test_imax_sge_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> addrspace(1)* %aptr, <4 x i16> addrspace(1)* %bptr) nounwind { - %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone - %gep0 = getelementptr <4 x i16>, <4 x i16> addrspace(1)* %aptr, i32 %tid - %gep1 = getelementptr <4 x i16>, <4 x i16> addrspace(1)* %bptr, i32 %tid - %outgep = getelementptr <4 x i16>, <4 x i16> addrspace(1)* %out, i32 %tid - %a = load <4 x i16>, <4 x i16> addrspace(1)* %gep0, align 4 - %b = load <4 x i16>, <4 x i16> addrspace(1)* %gep1, align 4 - %cmp = icmp sge <4 x i16> %a, %b - %val = select <4 x i1> %cmp, <4 x i16> %a, <4 x i16> %b - store <4 x i16> %val, <4 x i16> addrspace(1)* %outgep, align 4 - ret void -} - -; FIXME: Need to handle non-uniform case for function below (load without gep). 
-; GCN-LABEL: {{^}}v_test_imax_sgt_i16: -; VI: v_max_i16_e32 -define void @v_test_imax_sgt_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %aptr, i16 addrspace(1)* %bptr) nounwind { - %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone - %gep0 = getelementptr i16, i16 addrspace(1)* %aptr, i32 %tid - %gep1 = getelementptr i16, i16 addrspace(1)* %bptr, i32 %tid - %outgep = getelementptr i16, i16 addrspace(1)* %out, i32 %tid - %a = load i16, i16 addrspace(1)* %gep0, align 4 - %b = load i16, i16 addrspace(1)* %gep1, align 4 - %cmp = icmp sgt i16 %a, %b - %val = select i1 %cmp, i16 %a, i16 %b - store i16 %val, i16 addrspace(1)* %outgep, align 4 - ret void -} - -; FIXME: Need to handle non-uniform case for function below (load without gep). -; GCN-LABEL: {{^}}v_test_umax_uge_i16: -; VI: v_max_u16_e32 -define void @v_test_umax_uge_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %aptr, i16 addrspace(1)* %bptr) nounwind { - %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone - %gep0 = getelementptr i16, i16 addrspace(1)* %aptr, i32 %tid - %gep1 = getelementptr i16, i16 addrspace(1)* %bptr, i32 %tid - %outgep = getelementptr i16, i16 addrspace(1)* %out, i32 %tid - %a = load i16, i16 addrspace(1)* %gep0, align 4 - %b = load i16, i16 addrspace(1)* %gep1, align 4 - %cmp = icmp uge i16 %a, %b - %val = select i1 %cmp, i16 %a, i16 %b - store i16 %val, i16 addrspace(1)* %outgep, align 4 - ret void -} - -; FIXME: Need to handle non-uniform case for function below (load without gep). -; GCN-LABEL: {{^}}v_test_umax_ugt_i16: -; VI: v_max_u16_e32 -define void @v_test_umax_ugt_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %aptr, i16 addrspace(1)* %bptr) nounwind { - %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone - %gep0 = getelementptr i16, i16 addrspace(1)* %aptr, i32 %tid - %gep1 = getelementptr i16, i16 addrspace(1)* %bptr, i32 %tid - %outgep = getelementptr i16, i16 addrspace(1)* %out, i32 %tid - %a = load i16, i16 addrspace(1)* %gep0, align 4 - %b = load i16, i16 addrspace(1)* %gep1, align 4 - %cmp = icmp ugt i16 %a, %b - %val = select i1 %cmp, i16 %a, i16 %b - store i16 %val, i16 addrspace(1)* %outgep, align 4 - ret void -} diff --git a/test/CodeGen/AMDGPU/mul_uint24-amdgcn.ll b/test/CodeGen/AMDGPU/mul_uint24-amdgcn.ll index 76869337730..9e5f31a5511 100644 --- a/test/CodeGen/AMDGPU/mul_uint24-amdgcn.ll +++ b/test/CodeGen/AMDGPU/mul_uint24-amdgcn.ll @@ -31,8 +31,7 @@ entry: } ; FUNC-LABEL: {{^}}test_umul24_i16_vgpr_sext: -; SI: v_mul_u32_u24_e{{(32|64)}} [[MUL:v[0-9]]], {{[sv][0-9], [sv][0-9]}} -; VI: v_mul_lo_u16_e{{(32|64)}} [[MUL:v[0-9]]], {{[sv][0-9], [sv][0-9]}} +; GCN: v_mul_u32_u24_e{{(32|64)}} [[MUL:v[0-9]]], {{[sv][0-9], [sv][0-9]}} ; GCN: v_bfe_i32 v{{[0-9]}}, [[MUL]], 0, 16 define void @test_umul24_i16_vgpr_sext(i32 addrspace(1)* %out, i16 addrspace(1)* %in) { %tid.x = call i32 @llvm.amdgcn.workitem.id.x() @@ -63,9 +62,8 @@ entry: } ; FUNC-LABEL: {{^}}test_umul24_i16_vgpr: -; SI: v_mul_u32_u24_e32 -; SI: v_and_b32_e32 -; VI: v_mul_lo_u16 +; GCN: v_mul_u32_u24_e32 +; GCN: v_and_b32_e32 define void @test_umul24_i16_vgpr(i32 addrspace(1)* %out, i16 addrspace(1)* %in) { %tid.x = call i32 @llvm.amdgcn.workitem.id.x() %tid.y = call i32 @llvm.amdgcn.workitem.id.y() @@ -79,9 +77,9 @@ define void @test_umul24_i16_vgpr(i32 addrspace(1)* %out, i16 addrspace(1)* %in) ret void } +; FIXME: Need to handle non-uniform case for function below (load without gep). 
; FUNC-LABEL: {{^}}test_umul24_i8_vgpr: -; SI: v_mul_u32_u24_e{{(32|64)}} [[MUL:v[0-9]]], {{[sv][0-9], [sv][0-9]}} -; VI: v_mul_lo_u16_e{{(32|64)}} [[MUL:v[0-9]]], {{[sv][0-9], [sv][0-9]}} +; GCN: v_mul_u32_u24_e{{(32|64)}} [[MUL:v[0-9]]], {{[sv][0-9], [sv][0-9]}} ; GCN: v_bfe_i32 v{{[0-9]}}, [[MUL]], 0, 8 define void @test_umul24_i8_vgpr(i32 addrspace(1)* %out, i8 addrspace(1)* %a, i8 addrspace(1)* %b) { entry: diff --git a/test/CodeGen/AMDGPU/shl.ll b/test/CodeGen/AMDGPU/shl.ll index 45ae1a3050e..5a2b03bff99 100644 --- a/test/CodeGen/AMDGPU/shl.ll +++ b/test/CodeGen/AMDGPU/shl.ll @@ -53,48 +53,6 @@ define void @shl_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in ret void } -;VI: {{^}}shl_i16: -;VI: v_lshlrev_b16_e32 v{{[0-9]+, [0-9]+, [0-9]+}} - -define void @shl_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %in) { - %b_ptr = getelementptr i16, i16 addrspace(1)* %in, i16 1 - %a = load i16, i16 addrspace(1) * %in - %b = load i16, i16 addrspace(1) * %b_ptr - %result = shl i16 %a, %b - store i16 %result, i16 addrspace(1)* %out - ret void -} - - -;VI: {{^}}shl_v2i16: -;VI: v_lshlrev_b16_e32 v{{[0-9]+, [0-9]+, [0-9]+}} -;VI: v_lshlrev_b16_e32 v{{[0-9]+, [0-9]+, [0-9]+}} - -define void @shl_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) { - %b_ptr = getelementptr <2 x i16>, <2 x i16> addrspace(1)* %in, i16 1 - %a = load <2 x i16>, <2 x i16> addrspace(1) * %in - %b = load <2 x i16>, <2 x i16> addrspace(1) * %b_ptr - %result = shl <2 x i16> %a, %b - store <2 x i16> %result, <2 x i16> addrspace(1)* %out - ret void -} - - -;VI: {{^}}shl_v4i16: -;VI: v_lshlrev_b16_e32 v{{[0-9]+, [0-9]+, [0-9]+}} -;VI: v_lshlrev_b16_e32 v{{[0-9]+, [0-9]+, [0-9]+}} -;VI: v_lshlrev_b16_e32 v{{[0-9]+, [0-9]+, [0-9]+}} -;VI: v_lshlrev_b16_e32 v{{[0-9]+, [0-9]+, [0-9]+}} - -define void @shl_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> addrspace(1)* %in) { - %b_ptr = getelementptr <4 x i16>, <4 x i16> addrspace(1)* %in, i16 1 - %a = load <4 x i16>, <4 x i16> addrspace(1) * %in - %b = load <4 x i16>, <4 x i16> addrspace(1) * %b_ptr - %result = shl <4 x i16> %a, %b - store <4 x i16> %result, <4 x i16> addrspace(1)* %out - ret void -} - ;EG-LABEL: {{^}}shl_i64: ;EG: SUB_INT {{\*? 
*}}[[COMPSH:T[0-9]+\.[XYZW]]], {{literal.[xy]}}, [[SHIFT:T[0-9]+\.[XYZW]]] ;EG: LSHR {{\* *}}[[TEMP:T[0-9]+\.[XYZW]]], [[OPLO:T[0-9]+\.[XYZW]]], {{[[COMPSH]]|PV.[XYZW]}} diff --git a/test/CodeGen/AMDGPU/sign_extend.ll b/test/CodeGen/AMDGPU/sign_extend.ll index a7db28e5167..30e6bd1e78f 100644 --- a/test/CodeGen/AMDGPU/sign_extend.ll +++ b/test/CodeGen/AMDGPU/sign_extend.ll @@ -1,5 +1,5 @@ -; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SI %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,VI %s +; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s ; GCN-LABEL: {{^}}s_sext_i1_to_i32: ; GCN: v_cndmask_b32_e64 @@ -55,43 +55,22 @@ define void @v_sext_i32_to_i64(i64 addrspace(1)* %out, i32 addrspace(1)* %in) no } ; GCN-LABEL: {{^}}s_sext_i16_to_i64: -; GCN: s_bfe_i64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x100000 +; GCN: s_endpgm define void @s_sext_i16_to_i64(i64 addrspace(1)* %out, i16 %a) nounwind { %sext = sext i16 %a to i64 store i64 %sext, i64 addrspace(1)* %out, align 8 ret void } -; GCN-LABEL: {{^}}s_sext_i1_to_i16: -; GCN: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1 -; GCN-NEXT: buffer_store_short [[RESULT]] -define void @s_sext_i1_to_i16(i16 addrspace(1)* %out, i32 %a, i32 %b) nounwind { - %cmp = icmp eq i32 %a, %b - %sext = sext i1 %cmp to i16 - store i16 %sext, i16 addrspace(1)* %out - ret void -} - ; GCN-LABEL: {{^}}s_sext_v4i8_to_v4i32: ; GCN: s_load_dword [[VAL:s[0-9]+]] +; GCN-DAG: s_sext_i32_i8 [[EXT0:s[0-9]+]], [[VAL]] +; GCN-DAG: s_bfe_i32 [[EXT1:s[0-9]+]], [[VAL]], 0x80008 ; GCN-DAG: s_bfe_i32 [[EXT2:s[0-9]+]], [[VAL]], 0x80010 ; GCN-DAG: s_ashr_i32 [[EXT3:s[0-9]+]], [[VAL]], 24 -; SI-DAG: s_bfe_i32 [[EXT1:s[0-9]+]], [[VAL]], 0x80008 -; GCN-DAG: s_sext_i32_i8 [[EXT0:s[0-9]+]], [[VAL]] - -; FIXME: We end up with a v_bfe instruction, because the i16 srl -; gets selected to a v_lshrrev_b16 instructions, so the input to -; the bfe is a vector registers. To fix this we need to be able to -; optimize: -; t29: i16 = truncate t10 -; t55: i16 = srl t29, Constant:i32<8> -; t63: i32 = any_extend t55 -; t64: i32 = sign_extend_inreg t63, ValueType:ch:i8 - -; VI-DAG: v_bfe_i32 [[VEXT1:v[0-9]+]], v{{[0-9]+}}, 0, 8 ; GCN-DAG: v_mov_b32_e32 [[VEXT0:v[0-9]+]], [[EXT0]] -; SI-DAG: v_mov_b32_e32 [[VEXT1:v[0-9]+]], [[EXT1]] +; GCN-DAG: v_mov_b32_e32 [[VEXT1:v[0-9]+]], [[EXT1]] ; GCN-DAG: v_mov_b32_e32 [[VEXT2:v[0-9]+]], [[EXT2]] ; GCN-DAG: v_mov_b32_e32 [[VEXT3:v[0-9]+]], [[EXT3]] @@ -117,17 +96,10 @@ define void @s_sext_v4i8_to_v4i32(i32 addrspace(1)* %out, i32 %a) nounwind { ; GCN-LABEL: {{^}}v_sext_v4i8_to_v4i32: ; GCN: buffer_load_dword [[VAL:v[0-9]+]] -; FIXME: need to optimize same sequence as above test to avoid -; this shift. 
-; VI-DAG: v_lshrrev_b16_e32 [[SH16:v[0-9]+]], 8, [[VAL]] +; GCN-DAG: v_bfe_i32 [[EXT0:v[0-9]+]], [[VAL]], 0, 8 +; GCN-DAG: v_bfe_i32 [[EXT1:v[0-9]+]], [[VAL]], 8, 8 +; GCN-DAG: v_bfe_i32 [[EXT2:v[0-9]+]], [[VAL]], 16, 8 ; GCN-DAG: v_ashrrev_i32_e32 [[EXT3:v[0-9]+]], 24, [[VAL]] -; VI-DAG: v_bfe_i32 [[EXT0:v[0-9]+]], [[VAL]], 0, 8 -; VI-DAG: v_bfe_i32 [[EXT2:v[0-9]+]], [[VAL]], 16, 8 -; VI-DAG: v_bfe_i32 [[EXT1:v[0-9]+]], [[SH16]], 0, 8 - -; SI-DAG: v_bfe_i32 [[EXT2:v[0-9]+]], [[VAL]], 16, 8 -; SI-DAG: v_bfe_i32 [[EXT1:v[0-9]+]], [[VAL]], 8, 8 -; SI: v_bfe_i32 [[EXT0:v[0-9]+]], [[VAL]], 0, 8 ; GCN: buffer_store_dword [[EXT0]] ; GCN: buffer_store_dword [[EXT1]] diff --git a/test/CodeGen/AMDGPU/sra.ll b/test/CodeGen/AMDGPU/sra.ll index 710547426e3..dddfbfd3ed1 100644 --- a/test/CodeGen/AMDGPU/sra.ll +++ b/test/CodeGen/AMDGPU/sra.ll @@ -46,36 +46,6 @@ define void @ashr_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %i ret void } -; FUNC-LABEL: {{^}}ashr_v2i16: -; FIXME: The ashr operation is uniform, but because its operands come from a -; global load we end up with the vector instructions rather than scalar. -; VI: v_ashrrev_i32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} -; VI: v_ashrrev_i32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} -define void @ashr_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) { - %b_ptr = getelementptr <2 x i16>, <2 x i16> addrspace(1)* %in, i16 1 - %a = load <2 x i16>, <2 x i16> addrspace(1)* %in - %b = load <2 x i16>, <2 x i16> addrspace(1)* %b_ptr - %result = ashr <2 x i16> %a, %b - store <2 x i16> %result, <2 x i16> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}ashr_v4i16: -; FIXME: The ashr operation is uniform, but because its operands come from a -; global load we end up with the vector instructions rather than scalar. 
-; VI: v_ashrrev_i32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} -; VI: v_ashrrev_i32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} -; VI: v_ashrrev_i32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} -; VI: v_ashrrev_i32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} -define void @ashr_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> addrspace(1)* %in) { - %b_ptr = getelementptr <4 x i16>, <4 x i16> addrspace(1)* %in, i16 1 - %a = load <4 x i16>, <4 x i16> addrspace(1)* %in - %b = load <4 x i16>, <4 x i16> addrspace(1)* %b_ptr - %result = ashr <4 x i16> %a, %b - store <4 x i16> %result, <4 x i16> addrspace(1)* %out - ret void -} - ; FUNC-LABEL: {{^}}s_ashr_i64: ; GCN: s_ashr_i64 s[{{[0-9]}}:{{[0-9]}}], s[{{[0-9]}}:{{[0-9]}}], 8 diff --git a/test/CodeGen/AMDGPU/sub.ll b/test/CodeGen/AMDGPU/sub.ll index 5816345098a..5a026cdf299 100644 --- a/test/CodeGen/AMDGPU/sub.ll +++ b/test/CodeGen/AMDGPU/sub.ll @@ -54,46 +54,6 @@ define void @test_sub_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1) ret void } -; VI: v_sub_i16_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} -define void @test_sub_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %in) { - %b_ptr = getelementptr i16, i16 addrspace(1)* %in, i16 1 - %a = load i16, i16 addrspace(1)* %in - %b = load i16, i16 addrspace(1)* %b_ptr - %result = sub i16 %a, %b - store i16 %result, i16 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}test_sub_v2i16: - -; VI: v_sub_i16_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} -; VI: v_sub_i16_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} - -define void @test_sub_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) { - %b_ptr = getelementptr <2 x i16>, <2 x i16> addrspace(1)* %in, i16 1 - %a = load <2 x i16>, <2 x i16> addrspace(1) * %in - %b = load <2 x i16>, <2 x i16> addrspace(1) * %b_ptr - %result = sub <2 x i16> %a, %b - store <2 x i16> %result, <2 x i16> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}test_sub_v4i16: - -; VI: v_sub_i16_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} -; VI: v_sub_i16_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} -; VI: v_sub_i16_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} -; VI: v_sub_i16_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} - -define void @test_sub_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> addrspace(1)* %in) { - %b_ptr = getelementptr <4 x i16>, <4 x i16> addrspace(1)* %in, i16 1 - %a = load <4 x i16>, <4 x i16> addrspace(1) * %in - %b = load <4 x i16>, <4 x i16> addrspace(1) * %b_ptr - %result = sub <4 x i16> %a, %b - store <4 x i16> %result, <4 x i16> addrspace(1)* %out - ret void -} - ; FUNC-LABEL: {{^}}s_sub_i64: ; SI: s_sub_u32 ; SI: s_subb_u32 diff --git a/test/CodeGen/AMDGPU/trunc-bitcast-vector.ll b/test/CodeGen/AMDGPU/trunc-bitcast-vector.ll index f7aa4bc2c6d..9e2373c55e3 100644 --- a/test/CodeGen/AMDGPU/trunc-bitcast-vector.ll +++ b/test/CodeGen/AMDGPU/trunc-bitcast-vector.ll @@ -1,5 +1,5 @@ -; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck --check-prefix=SI %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck --check-prefix=VI %s +; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck %s ; CHECK-LABEL: {{^}}trunc_i64_bitcast_v2i32: ; CHECK: buffer_load_dword v @@ -47,12 +47,7 @@ define void @trunc_i16_bitcast_v2i16(i16 addrspace(1)* %out, <2 x i16> addrspace } ; CHECK-LABEL: {{^}}trunc_i16_bitcast_v4i16: -; FIXME We need to teach the dagcombiner to reduce load width for: -; t21: v2i32,ch = load t12, t10, undef:i64 -; t23: i64 = bitcast t21 -; t30: i16 = truncate t23 -; SI: buffer_load_dword v[[VAL:[0-9]+]] -; VI: 
buffer_load_dwordx2 v{{\[}}[[VAL:[0-9]+]] +; CHECK: buffer_load_dword [[VAL:v[0-9]+]] ; CHECK: buffer_store_short [[VAL]] define void @trunc_i16_bitcast_v4i16(i16 addrspace(1)* %out, <4 x i16> addrspace(1)* %in) { %ld = load <4 x i16>, <4 x i16> addrspace(1)* %in diff --git a/test/CodeGen/AMDGPU/trunc-store-i1.ll b/test/CodeGen/AMDGPU/trunc-store-i1.ll index b1e879f2630..b71a838b62c 100644 --- a/test/CodeGen/AMDGPU/trunc-store-i1.ll +++ b/test/CodeGen/AMDGPU/trunc-store-i1.ll @@ -21,20 +21,13 @@ define void @global_truncstore_i64_to_i1(i1 addrspace(1)* %out, i64 %val) nounwi ret void } -; SI-LABEL: {{^}}s_arg_global_truncstore_i16_to_i1: +; SI-LABEL: {{^}}global_truncstore_i16_to_i1: ; SI: s_load_dword [[LOAD:s[0-9]+]], ; SI: s_and_b32 [[SREG:s[0-9]+]], [[LOAD]], 1 ; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], [[SREG]] ; SI: buffer_store_byte [[VREG]], -define void @s_arg_global_truncstore_i16_to_i1(i1 addrspace(1)* %out, i16 %val) nounwind { +define void @global_truncstore_i16_to_i1(i1 addrspace(1)* %out, i16 %val) nounwind { %trunc = trunc i16 %val to i1 store i1 %trunc, i1 addrspace(1)* %out, align 1 ret void } -; SI-LABEL: {{^}}global_truncstore_i16_to_i1: -define void @global_truncstore_i16_to_i1(i1 addrspace(1)* %out, i16 %val0, i16 %val1) nounwind { - %add = add i16 %val0, %val1 - %trunc = trunc i16 %add to i1 - store i1 %trunc, i1 addrspace(1)* %out, align 1 - ret void -} diff --git a/test/CodeGen/AMDGPU/zero_extend.ll b/test/CodeGen/AMDGPU/zero_extend.ll index b30cb73f6da..53539921479 100644 --- a/test/CodeGen/AMDGPU/zero_extend.ll +++ b/test/CodeGen/AMDGPU/zero_extend.ll @@ -2,58 +2,39 @@ ; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s --check-prefix=SI ; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s --check-prefix=R600 -; R600: {{^}}s_mad_zext_i32_to_i64: +; R600: {{^}}test: ; R600: MEM_RAT_CACHELESS STORE_RAW ; R600: MEM_RAT_CACHELESS STORE_RAW -; SI: {{^}}s_mad_zext_i32_to_i64: +; SI: {{^}}test: ; SI: v_mov_b32_e32 v[[V_ZERO:[0-9]]], 0{{$}} ; SI: buffer_store_dwordx2 v[0:[[V_ZERO]]{{\]}} -define void @s_mad_zext_i32_to_i64(i64 addrspace(1)* %out, i32 %a, i32 %b, i32 %c) #0 { +define void @test(i64 addrspace(1)* %out, i32 %a, i32 %b, i32 %c) { entry: - %tmp0 = mul i32 %a, %b - %tmp1 = add i32 %tmp0, %c - %tmp2 = zext i32 %tmp1 to i64 - store i64 %tmp2, i64 addrspace(1)* %out + %0 = mul i32 %a, %b + %1 = add i32 %0, %c + %2 = zext i32 %1 to i64 + store i64 %2, i64 addrspace(1)* %out ret void } -; SI-LABEL: {{^}}s_cmp_zext_i1_to_i32 +; SI-LABEL: {{^}}testi1toi32: ; SI: v_cndmask_b32 -define void @s_cmp_zext_i1_to_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 { +define void @testi1toi32(i32 addrspace(1)* %out, i32 %a, i32 %b) { entry: - %tmp0 = icmp eq i32 %a, %b - %tmp1 = zext i1 %tmp0 to i32 - store i32 %tmp1, i32 addrspace(1)* %out + %0 = icmp eq i32 %a, %b + %1 = zext i1 %0 to i32 + store i32 %1, i32 addrspace(1)* %out ret void } -; SI-LABEL: {{^}}s_arg_zext_i1_to_i64: -define void @s_arg_zext_i1_to_i64(i64 addrspace(1)* %out, i1 zeroext %arg) #0 { - %ext = zext i1 %arg to i64 - store i64 %ext, i64 addrspace(1)* %out, align 8 - ret void -} - -; SI-LABEL: {{^}}s_cmp_zext_i1_to_i64: +; SI-LABEL: {{^}}zext_i1_to_i64: ; SI: s_mov_b32 s{{[0-9]+}}, 0 ; SI: v_cmp_eq_u32 ; SI: v_cndmask_b32 -define void @s_cmp_zext_i1_to_i64(i64 addrspace(1)* %out, i32 %a, i32 %b) #0 { +define void @zext_i1_to_i64(i64 addrspace(1)* %out, i32 %a, i32 %b) nounwind { %cmp = icmp eq i32 %a, %b %ext = zext i1 %cmp to i64 store i64 %ext, i64 addrspace(1)* %out, align 8 ret 
void } - -; SI-LABEL: {{^}}s_cmp_zext_i1_to_i16 -; SI: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1, vcc -; SI: buffer_store_short [[RESULT]] -define void @s_cmp_zext_i1_to_i16(i16 addrspace(1)* %out, i16 zeroext %a, i16 zeroext %b) #0 { - %tmp0 = icmp eq i16 %a, %b - %tmp1 = zext i1 %tmp0 to i16 - store i16 %tmp1, i16 addrspace(1)* %out - ret void -} - -attributes #0 = { nounwind } -- 2.50.1
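
For reference, a minimal sketch — not part of the patch above — of the FileCheck prefix convention these tests rely on, and of the style of VI i16 test this revert deletes (compare the removed add.i16.ll and max.i16.ll). The function name and the exact SI instruction shown are illustrative assumptions, not taken from the patch; the VI check reflects the pre-revert v_add_u16 selection that r285939 introduced, which this revert removes.

; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s

; Checks shared by both RUN lines use the GCN prefix; target-specific
; expectations use SI or VI, which is why this revert renames so many
; GCN/VI checks back to SI in the hunks above.
; GCN-LABEL: {{^}}add_i16_example:
; Illustrative: SI legalizes i16 arithmetic to 32-bit operations.
; SI: v_add_i32_e32
; Pre-revert VI form; after this revert VI also takes the SI path.
; VI: v_add_u16_e32
define void @add_i16_example(i16 addrspace(1)* %out, i16 addrspace(1)* %in) {
  %b_ptr = getelementptr i16, i16 addrspace(1)* %in, i16 1
  %a = load i16, i16 addrspace(1)* %in
  %b = load i16, i16 addrspace(1)* %b_ptr
  %result = add i16 %a, %b
  store i16 %result, i16 addrspace(1)* %out
  ret void
}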