def HasD16LoadStore : Predicate<"Subtarget->hasD16LoadStore()">,
AssemblerPredicate<"FeatureGFX9Insts">;
+
+def LDSRequiresM0Init : Predicate<"Subtarget->ldsRequiresM0Init()">;
+def NotLDSRequiresM0Init : Predicate<"!Subtarget->ldsRequiresM0Init()">;
+
def HasDSAddTid : Predicate<"Subtarget->getGeneration() >= AMDGPUSubtarget::GFX9">,
AssemblerPredicate<"FeatureGFX9Insts">;
}
SDNode *AMDGPUDAGToDAGISel::glueCopyToM0(SDNode *N) const {
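+  // Skip the glued m0 copy for non-LDS accesses, and on subtargets (gfx9 and
+  // later) whose LDS instructions do not read m0.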
- if (cast<MemSDNode>(N)->getAddressSpace() != AMDGPUASI.LOCAL_ADDRESS)
+ if (cast<MemSDNode>(N)->getAddressSpace() != AMDGPUASI.LOCAL_ADDRESS ||
+ !Subtarget->ldsRequiresM0Init())
return N;
const SITargetLowering& Lowering =
return getGeneration() >= GFX9;
}
+  /// Return true if most LDS instructions have an m0 use that requires m0 to
+  /// be initialized.
+ bool ldsRequiresM0Init() const {
+ return getGeneration() < GFX9;
+ }
+
bool hasAddNoCarry() const {
return AddNoCarryInsts;
}
(inst $ptr, (as_i16imm $offset), (i1 0))
>;
+// FIXME: Passing the name of the PatFrag as a string is a workaround. Why
+// doesn't !cast<PatFrag>(frag.NAME#"_m0") work!?
+multiclass DSReadPat_mc<DS_Pseudo inst, ValueType vt, string frag> {
+
+ let OtherPredicates = [LDSRequiresM0Init] in {
+ def : DSReadPat<inst, vt, !cast<PatFrag>(frag#"_m0")>;
+ }
+
+ let OtherPredicates = [NotLDSRequiresM0Init] in {
+ def : DSReadPat<!cast<DS_Pseudo>(inst.NAME#"_gfx9"), vt, !cast<PatFrag>(frag)>;
+ }
+}
+
+
multiclass DSReadPat_Hi16 <DS_Pseudo inst, PatFrag frag, ValueType vt = i16> {
def : GCNPat <
(build_vector vt:$lo, (vt (frag (DS1Addr1Offset i32:$ptr, i32:$offset)))),
>;
}
-
-def : DSReadPat <DS_READ_I8, i32, sextloadi8_local_m0>;
-def : DSReadPat <DS_READ_U8, i32, az_extloadi8_local_m0>;
-def : DSReadPat <DS_READ_I8, i16, sextloadi8_local_m0>;
-def : DSReadPat <DS_READ_U8, i16, az_extloadi8_local_m0>;
-def : DSReadPat <DS_READ_I16, i32, sextloadi16_local_m0>;
-def : DSReadPat <DS_READ_I16, i32, sextloadi16_local_m0>;
-def : DSReadPat <DS_READ_U16, i32, az_extloadi16_local_m0>;
-def : DSReadPat <DS_READ_U16, i16, load_local_m0>;
-def : DSReadPat <DS_READ_B32, i32, load_local_m0>;
+defm : DSReadPat_mc <DS_READ_I8, i32, "sextloadi8_local">;
+defm : DSReadPat_mc <DS_READ_U8, i32, "az_extloadi8_local">;
+defm : DSReadPat_mc <DS_READ_I8, i16, "sextloadi8_local">;
+defm : DSReadPat_mc <DS_READ_U8, i16, "az_extloadi8_local">;
+defm : DSReadPat_mc <DS_READ_I16, i32, "sextloadi16_local">;
+defm : DSReadPat_mc <DS_READ_I16, i32, "sextloadi16_local">;
+defm : DSReadPat_mc <DS_READ_U16, i32, "az_extloadi16_local">;
+defm : DSReadPat_mc <DS_READ_U16, i16, "load_local">;
+defm : DSReadPat_mc <DS_READ_B32, i32, "load_local">;
let AddedComplexity = 100 in {
-def : DSReadPat <DS_READ_B64, v2i32, load_align8_local_m0>;
+defm : DSReadPat_mc <DS_READ_B64, v2i32, "load_align8_local">;
} // End AddedComplexity = 100
-def : GCNPat <
- (v2i32 (load_local_m0 (DS64Bit4ByteAligned i32:$ptr, i8:$offset0,
- i8:$offset1))),
- (DS_READ2_B32 $ptr, $offset0, $offset1, (i1 0))
->;
-
-
let OtherPredicates = [HasD16LoadStore] in {
let AddedComplexity = 100 in {
defm : DSReadPat_Hi16<DS_READ_U16_D16_HI, load_local>;
(inst $ptr, $value, (as_i16imm $offset), (i1 0))
>;
-def : DSWritePat <DS_WRITE_B8, i32, truncstorei8_local_m0>;
-def : DSWritePat <DS_WRITE_B16, i32, truncstorei16_local_m0>;
-def : DSWritePat <DS_WRITE_B8, i16, truncstorei8_local_m0>;
-def : DSWritePat <DS_WRITE_B16, i16, store_local_m0>;
-def : DSWritePat <DS_WRITE_B32, i32, store_local_m0>;
+multiclass DSWritePat_mc <DS_Pseudo inst, ValueType vt, string frag> {
+ let OtherPredicates = [LDSRequiresM0Init] in {
+ def : DSWritePat<inst, vt, !cast<PatFrag>(frag#"_m0")>;
+ }
+
+ let OtherPredicates = [NotLDSRequiresM0Init] in {
+ def : DSWritePat<!cast<DS_Pseudo>(inst.NAME#"_gfx9"), vt, !cast<PatFrag>(frag)>;
+ }
+}
+
+defm : DSWritePat_mc <DS_WRITE_B8, i32, "truncstorei8_local">;
+defm : DSWritePat_mc <DS_WRITE_B16, i32, "truncstorei16_local">;
+defm : DSWritePat_mc <DS_WRITE_B8, i16, "truncstorei8_local">;
+defm : DSWritePat_mc <DS_WRITE_B16, i16, "store_local">;
+defm : DSWritePat_mc <DS_WRITE_B32, i32, "store_local">;
let OtherPredicates = [HasD16LoadStore] in {
def : DSWritePat <DS_WRITE_B16_D16_HI, i32, store_local_hi16>;
def : DSWritePat <DS_WRITE_B8_D16_HI, i32, truncstorei8_local_hi16>;
}
-let AddedComplexity = 100 in {
-def : DSWritePat <DS_WRITE_B64, v2i32, store_align8_local_m0>;
-} // End AddedComplexity = 100
+class DS64Bit4ByteAlignedReadPat<DS_Pseudo inst, PatFrag frag> : GCNPat <
+ (v2i32 (frag (DS64Bit4ByteAligned i32:$ptr, i8:$offset0, i8:$offset1))),
+ (inst $ptr, $offset0, $offset1, (i1 0))
+>;
-def : GCNPat <
- (store_local_m0 v2i32:$value, (DS64Bit4ByteAligned i32:$ptr, i8:$offset0,
- i8:$offset1)),
- (DS_WRITE2_B32 $ptr, (i32 (EXTRACT_SUBREG $value, sub0)),
- (i32 (EXTRACT_SUBREG $value, sub1)), $offset0, $offset1,
- (i1 0))
+class DS64Bit4ByteAlignedWritePat<DS_Pseudo inst, PatFrag frag> : GCNPat<
+ (frag v2i32:$value, (DS64Bit4ByteAligned i32:$ptr, i8:$offset0, i8:$offset1)),
+ (inst $ptr, (i32 (EXTRACT_SUBREG $value, sub0)),
+ (i32 (EXTRACT_SUBREG $value, sub1)), $offset0, $offset1,
+ (i1 0))
>;
+let OtherPredicates = [LDSRequiresM0Init] in {
+def : DS64Bit4ByteAlignedReadPat<DS_READ2_B32, load_local_m0>;
+def : DS64Bit4ByteAlignedWritePat<DS_WRITE2_B32, store_local_m0>;
+}
+
+let OtherPredicates = [NotLDSRequiresM0Init] in {
+def : DS64Bit4ByteAlignedReadPat<DS_READ2_B32_gfx9, load_local>;
+def : DS64Bit4ByteAlignedWritePat<DS_WRITE2_B32_gfx9, store_local>;
+}
+
+
+let AddedComplexity = 100 in {
+
+defm : DSWritePat_mc <DS_WRITE_B64, v2i32, "store_align8_local">;
+} // End AddedComplexity = 100
class DSAtomicRetPat<DS_Pseudo inst, ValueType vt, PatFrag frag> : GCNPat <
(frag (DS1Addr1Offset i32:$ptr, i32:$offset), vt:$value),
(inst $ptr, $value, (as_i16imm $offset), (i1 0))
>;
+multiclass DSAtomicRetPat_mc<DS_Pseudo inst, ValueType vt, string frag> {
+ let OtherPredicates = [LDSRequiresM0Init] in {
+ def : DSAtomicRetPat<inst, vt, !cast<PatFrag>(frag#"_m0")>;
+ }
+
+ let OtherPredicates = [NotLDSRequiresM0Init] in {
+ def : DSAtomicRetPat<!cast<DS_Pseudo>(inst.NAME#"_gfx9"), vt, !cast<PatFrag>(frag)>;
+ }
+}
+
class DSAtomicCmpXChg<DS_Pseudo inst, ValueType vt, PatFrag frag> : GCNPat <
(frag (DS1Addr1Offset i32:$ptr, i32:$offset), vt:$cmp, vt:$swap),
(inst $ptr, $cmp, $swap, (as_i16imm $offset), (i1 0))
>;
+multiclass DSAtomicCmpXChg_mc<DS_Pseudo inst, ValueType vt, string frag> {
+ let OtherPredicates = [LDSRequiresM0Init] in {
+ def : DSAtomicCmpXChg<inst, vt, !cast<PatFrag>(frag#"_m0")>;
+ }
+
+ let OtherPredicates = [NotLDSRequiresM0Init] in {
+ def : DSAtomicCmpXChg<!cast<DS_Pseudo>(inst.NAME#"_gfx9"), vt, !cast<PatFrag>(frag)>;
+ }
+}
+
+
// 32-bit atomics.
-def : DSAtomicRetPat<DS_WRXCHG_RTN_B32, i32, atomic_swap_local_m0>;
-def : DSAtomicRetPat<DS_ADD_RTN_U32, i32, atomic_load_add_local_m0>;
-def : DSAtomicRetPat<DS_SUB_RTN_U32, i32, atomic_load_sub_local_m0>;
-def : DSAtomicRetPat<DS_INC_RTN_U32, i32, atomic_inc_local_m0>;
-def : DSAtomicRetPat<DS_DEC_RTN_U32, i32, atomic_dec_local_m0>;
-def : DSAtomicRetPat<DS_AND_RTN_B32, i32, atomic_load_and_local_m0>;
-def : DSAtomicRetPat<DS_OR_RTN_B32, i32, atomic_load_or_local_m0>;
-def : DSAtomicRetPat<DS_XOR_RTN_B32, i32, atomic_load_xor_local_m0>;
-def : DSAtomicRetPat<DS_MIN_RTN_I32, i32, atomic_load_min_local_m0>;
-def : DSAtomicRetPat<DS_MAX_RTN_I32, i32, atomic_load_max_local_m0>;
-def : DSAtomicRetPat<DS_MIN_RTN_U32, i32, atomic_load_umin_local_m0>;
-def : DSAtomicRetPat<DS_MAX_RTN_U32, i32, atomic_load_umax_local_m0>;
-def : DSAtomicCmpXChg<DS_CMPST_RTN_B32, i32, atomic_cmp_swap_local_m0>;
+defm : DSAtomicRetPat_mc<DS_WRXCHG_RTN_B32, i32, "atomic_swap_local">;
+defm : DSAtomicRetPat_mc<DS_ADD_RTN_U32, i32, "atomic_load_add_local">;
+defm : DSAtomicRetPat_mc<DS_SUB_RTN_U32, i32, "atomic_load_sub_local">;
+defm : DSAtomicRetPat_mc<DS_INC_RTN_U32, i32, "atomic_inc_local">;
+defm : DSAtomicRetPat_mc<DS_DEC_RTN_U32, i32, "atomic_dec_local">;
+defm : DSAtomicRetPat_mc<DS_AND_RTN_B32, i32, "atomic_load_and_local">;
+defm : DSAtomicRetPat_mc<DS_OR_RTN_B32, i32, "atomic_load_or_local">;
+defm : DSAtomicRetPat_mc<DS_XOR_RTN_B32, i32, "atomic_load_xor_local">;
+defm : DSAtomicRetPat_mc<DS_MIN_RTN_I32, i32, "atomic_load_min_local">;
+defm : DSAtomicRetPat_mc<DS_MAX_RTN_I32, i32, "atomic_load_max_local">;
+defm : DSAtomicRetPat_mc<DS_MIN_RTN_U32, i32, "atomic_load_umin_local">;
+defm : DSAtomicRetPat_mc<DS_MAX_RTN_U32, i32, "atomic_load_umax_local">;
+defm : DSAtomicCmpXChg_mc<DS_CMPST_RTN_B32, i32, "atomic_cmp_swap_local">;
// 64-bit atomics.
-def : DSAtomicRetPat<DS_WRXCHG_RTN_B64, i64, atomic_swap_local_m0>;
-def : DSAtomicRetPat<DS_ADD_RTN_U64, i64, atomic_load_add_local_m0>;
-def : DSAtomicRetPat<DS_SUB_RTN_U64, i64, atomic_load_sub_local_m0>;
-def : DSAtomicRetPat<DS_INC_RTN_U64, i64, atomic_inc_local_m0>;
-def : DSAtomicRetPat<DS_DEC_RTN_U64, i64, atomic_dec_local_m0>;
-def : DSAtomicRetPat<DS_AND_RTN_B64, i64, atomic_load_and_local_m0>;
-def : DSAtomicRetPat<DS_OR_RTN_B64, i64, atomic_load_or_local_m0>;
-def : DSAtomicRetPat<DS_XOR_RTN_B64, i64, atomic_load_xor_local_m0>;
-def : DSAtomicRetPat<DS_MIN_RTN_I64, i64, atomic_load_min_local_m0>;
-def : DSAtomicRetPat<DS_MAX_RTN_I64, i64, atomic_load_max_local_m0>;
-def : DSAtomicRetPat<DS_MIN_RTN_U64, i64, atomic_load_umin_local_m0>;
-def : DSAtomicRetPat<DS_MAX_RTN_U64, i64, atomic_load_umax_local_m0>;
-
-def : DSAtomicCmpXChg<DS_CMPST_RTN_B64, i64, atomic_cmp_swap_local_m0>;
+defm : DSAtomicRetPat_mc<DS_WRXCHG_RTN_B64, i64, "atomic_swap_local">;
+defm : DSAtomicRetPat_mc<DS_ADD_RTN_U64, i64, "atomic_load_add_local">;
+defm : DSAtomicRetPat_mc<DS_SUB_RTN_U64, i64, "atomic_load_sub_local">;
+defm : DSAtomicRetPat_mc<DS_INC_RTN_U64, i64, "atomic_inc_local">;
+defm : DSAtomicRetPat_mc<DS_DEC_RTN_U64, i64, "atomic_dec_local">;
+defm : DSAtomicRetPat_mc<DS_AND_RTN_B64, i64, "atomic_load_and_local">;
+defm : DSAtomicRetPat_mc<DS_OR_RTN_B64, i64, "atomic_load_or_local">;
+defm : DSAtomicRetPat_mc<DS_XOR_RTN_B64, i64, "atomic_load_xor_local">;
+defm : DSAtomicRetPat_mc<DS_MIN_RTN_I64, i64, "atomic_load_min_local">;
+defm : DSAtomicRetPat_mc<DS_MAX_RTN_I64, i64, "atomic_load_max_local">;
+defm : DSAtomicRetPat_mc<DS_MIN_RTN_U64, i64, "atomic_load_umin_local">;
+defm : DSAtomicRetPat_mc<DS_MAX_RTN_U64, i64, "atomic_load_umax_local">;
+
+defm : DSAtomicCmpXChg_mc<DS_CMPST_RTN_B64, i64, "atomic_cmp_swap_local">;
//===----------------------------------------------------------------------===//
// Real instructions
defm atomic_inc_global : global_binary_atomic_op<SIatomic_inc>;
defm atomic_dec_global : global_binary_atomic_op<SIatomic_dec>;
+def atomic_inc_local : local_binary_atomic_op<SIatomic_inc>;
+def atomic_dec_local : local_binary_atomic_op<SIatomic_dec>;
+
//===----------------------------------------------------------------------===//
// SDNodes PatFrags for loads/stores with a glue input.
// This is for SDNodes and PatFrag for local loads and stores to
// enable s_mov_b32 m0, -1 to be glued to the memory instructions.
static bool offsetsCanBeCombined(CombineInfo &CI);
bool findMatchingInst(CombineInfo &CI);
+
+ unsigned read2Opcode(unsigned EltSize) const;
+ unsigned read2ST64Opcode(unsigned EltSize) const;
MachineBasicBlock::iterator mergeRead2Pair(CombineInfo &CI);
+
+ unsigned write2Opcode(unsigned EltSize) const;
+ unsigned write2ST64Opcode(unsigned EltSize) const;
MachineBasicBlock::iterator mergeWrite2Pair(CombineInfo &CI);
MachineBasicBlock::iterator mergeSBufferLoadImmPair(CombineInfo &CI);
MachineBasicBlock::iterator mergeBufferLoadPair(CombineInfo &CI);
return false;
}
+unsigned SILoadStoreOptimizer::read2Opcode(unsigned EltSize) const {
+ if (STM->ldsRequiresM0Init())
+ return (EltSize == 4) ? AMDGPU::DS_READ2_B32 : AMDGPU::DS_READ2_B64;
+ return (EltSize == 4) ? AMDGPU::DS_READ2_B32_gfx9 : AMDGPU::DS_READ2_B64_gfx9;
+}
+
+unsigned SILoadStoreOptimizer::read2ST64Opcode(unsigned EltSize) const {
+ if (STM->ldsRequiresM0Init())
+ return (EltSize == 4) ? AMDGPU::DS_READ2ST64_B32 : AMDGPU::DS_READ2ST64_B64;
+
+ return (EltSize == 4) ?
+ AMDGPU::DS_READ2ST64_B32_gfx9 : AMDGPU::DS_READ2ST64_B64_gfx9;
+}
+
MachineBasicBlock::iterator SILoadStoreOptimizer::mergeRead2Pair(
CombineInfo &CI) {
MachineBasicBlock *MBB = CI.I->getParent();
unsigned NewOffset0 = CI.Offset0;
unsigned NewOffset1 = CI.Offset1;
- unsigned Opc = (CI.EltSize == 4) ? AMDGPU::DS_READ2_B32
- : AMDGPU::DS_READ2_B64;
-
- if (CI.UseST64)
- Opc = (CI.EltSize == 4) ? AMDGPU::DS_READ2ST64_B32
- : AMDGPU::DS_READ2ST64_B64;
+ unsigned Opc = CI.UseST64 ?
+ read2ST64Opcode(CI.EltSize) : read2Opcode(CI.EltSize);
unsigned SubRegIdx0 = (CI.EltSize == 4) ? AMDGPU::sub0 : AMDGPU::sub0_sub1;
unsigned SubRegIdx1 = (CI.EltSize == 4) ? AMDGPU::sub1 : AMDGPU::sub2_sub3;
return Next;
}
+unsigned SILoadStoreOptimizer::write2Opcode(unsigned EltSize) const {
+ if (STM->ldsRequiresM0Init())
+ return (EltSize == 4) ? AMDGPU::DS_WRITE2_B32 : AMDGPU::DS_WRITE2_B64;
+ return (EltSize == 4) ? AMDGPU::DS_WRITE2_B32_gfx9 : AMDGPU::DS_WRITE2_B64_gfx9;
+}
+
+unsigned SILoadStoreOptimizer::write2ST64Opcode(unsigned EltSize) const {
+ if (STM->ldsRequiresM0Init())
+ return (EltSize == 4) ? AMDGPU::DS_WRITE2ST64_B32 : AMDGPU::DS_WRITE2ST64_B64;
+
+ return (EltSize == 4) ?
+ AMDGPU::DS_WRITE2ST64_B32_gfx9 : AMDGPU::DS_WRITE2ST64_B64_gfx9;
+}
+
MachineBasicBlock::iterator SILoadStoreOptimizer::mergeWrite2Pair(
CombineInfo &CI) {
MachineBasicBlock *MBB = CI.I->getParent();
unsigned NewOffset0 = CI.Offset0;
unsigned NewOffset1 = CI.Offset1;
- unsigned Opc = (CI.EltSize == 4) ? AMDGPU::DS_WRITE2_B32
- : AMDGPU::DS_WRITE2_B64;
-
- if (CI.UseST64)
- Opc = (CI.EltSize == 4) ? AMDGPU::DS_WRITE2ST64_B32
- : AMDGPU::DS_WRITE2ST64_B64;
+ unsigned Opc = CI.UseST64 ?
+ write2ST64Opcode(CI.EltSize) : write2Opcode(CI.EltSize);
if (NewOffset0 > NewOffset1) {
// Canonicalize the merged instruction so the smaller offset comes first.
CombineInfo CI;
CI.I = I;
unsigned Opc = MI.getOpcode();
- if (Opc == AMDGPU::DS_READ_B32 || Opc == AMDGPU::DS_READ_B64) {
+ if (Opc == AMDGPU::DS_READ_B32 || Opc == AMDGPU::DS_READ_B64 ||
+ Opc == AMDGPU::DS_READ_B32_gfx9 || Opc == AMDGPU::DS_READ_B64_gfx9) {
+
CI.InstClass = DS_READ_WRITE;
- CI.EltSize = (Opc == AMDGPU::DS_READ_B64) ? 8 : 4;
+ CI.EltSize =
+ (Opc == AMDGPU::DS_READ_B64 || Opc == AMDGPU::DS_READ_B64_gfx9) ? 8 : 4;
+
if (findMatchingInst(CI)) {
Modified = true;
I = mergeRead2Pair(CI);
}
continue;
- }
- if (Opc == AMDGPU::DS_WRITE_B32 || Opc == AMDGPU::DS_WRITE_B64) {
+ } else if (Opc == AMDGPU::DS_WRITE_B32 || Opc == AMDGPU::DS_WRITE_B64 ||
+ Opc == AMDGPU::DS_WRITE_B32_gfx9 ||
+ Opc == AMDGPU::DS_WRITE_B64_gfx9) {
CI.InstClass = DS_READ_WRITE;
- CI.EltSize = (Opc == AMDGPU::DS_WRITE_B64) ? 8 : 4;
+      CI.EltSize =
+        (Opc == AMDGPU::DS_WRITE_B64 || Opc == AMDGPU::DS_WRITE_B64_gfx9) ? 8 : 4;
+
if (findMatchingInst(CI)) {
Modified = true;
I = mergeWrite2Pair(CI);
-; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=SICI -check-prefix=GCN -check-prefix=FUNC %s
-; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=SICI -check-prefix=CIVI -check-prefix=GCN -check-prefix=FUNC %s
-; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=CIVI -check-prefix=GCN -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=SI,SICI,SICIVI,GCN %s
+; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=SICI,CIVI,SICIVI,GCN %s
+; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=VI,CIVI,SICIVI,GFX89,GCN %s
+; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9,GFX89,GCN %s
-; FUNC-LABEL: {{^}}lds_atomic_cmpxchg_ret_i32_offset:
-; SICI: s_load_dword [[PTR:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb
-; SICI: s_load_dword [[SWAP:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xc
-; VI: s_load_dword [[PTR:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x2c
-; VI: s_load_dword [[SWAP:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x30
-; GCN: v_mov_b32_e32 [[VCMP:v[0-9]+]], 7
+; GCN-LABEL: {{^}}lds_atomic_cmpxchg_ret_i32_offset:
+; GFX9-NOT: m0
+; SICIVI-DAG: s_mov_b32 m0
+
+; SICI-DAG: s_load_dword [[PTR:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb
+; SICI-DAG: s_load_dword [[SWAP:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xc
+; GFX89-DAG: s_load_dword [[PTR:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x2c
+; GFX89-DAG: s_load_dword [[SWAP:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x30
+; GCN-DAG: v_mov_b32_e32 [[VCMP:v[0-9]+]], 7
; GCN-DAG: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[PTR]]
; GCN-DAG: v_mov_b32_e32 [[VSWAP:v[0-9]+]], [[SWAP]]
; GCN: ds_cmpst_rtn_b32 [[RESULT:v[0-9]+]], [[VPTR]], [[VCMP]], [[VSWAP]] offset:16
ret void
}
-; FUNC-LABEL: {{^}}lds_atomic_cmpxchg_ret_i64_offset:
+; GCN-LABEL: {{^}}lds_atomic_cmpxchg_ret_i64_offset:
+; GFX9-NOT: m0
+; SICIVI-DAG: s_mov_b32 m0
+
; SICI-DAG: s_load_dword [[PTR:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb
; SICI-DAG: s_load_dwordx2 s{{\[}}[[LOSWAP:[0-9]+]]:[[HISWAP:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0xd
-; VI-DAG: s_load_dword [[PTR:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x2c
-; VI-DAG: s_load_dwordx2 s{{\[}}[[LOSWAP:[0-9]+]]:[[HISWAP:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x34
+; GFX89-DAG: s_load_dword [[PTR:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x2c
+; GFX89-DAG: s_load_dwordx2 s{{\[}}[[LOSWAP:[0-9]+]]:[[HISWAP:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x34
; GCN-DAG: v_mov_b32_e32 v[[LOVCMP:[0-9]+]], 7
; GCN-DAG: v_mov_b32_e32 v[[HIVCMP:[0-9]+]], 0
; GCN-DAG: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[PTR]]
; GCN-DAG: v_mov_b32_e32 v[[LOSWAPV:[0-9]+]], s[[LOSWAP]]
; GCN-DAG: v_mov_b32_e32 v[[HISWAPV:[0-9]+]], s[[HISWAP]]
; GCN: ds_cmpst_rtn_b64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[VPTR]], v{{\[}}[[LOVCMP]]:[[HIVCMP]]{{\]}}, v{{\[}}[[LOSWAPV]]:[[HISWAPV]]{{\]}} offset:32
-; GCN: buffer_store_dwordx2 [[RESULT]],
+; GCN: {{buffer|global}}_store_dwordx2 [[RESULT]],
; GCN: s_endpgm
define amdgpu_kernel void @lds_atomic_cmpxchg_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr, i64 %swap) nounwind {
%gep = getelementptr i64, i64 addrspace(3)* %ptr, i32 4
ret void
}
-; FUNC-LABEL: {{^}}lds_atomic_cmpxchg_ret_i32_bad_si_offset
+; GCN-LABEL: {{^}}lds_atomic_cmpxchg_ret_i32_bad_si_offset
+; GFX9-NOT: m0
; SI: ds_cmpst_rtn_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
; CIVI: ds_cmpst_rtn_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:16
+; GFX9: ds_cmpst_rtn_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:16
; GCN: s_endpgm
define amdgpu_kernel void @lds_atomic_cmpxchg_ret_i32_bad_si_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr, i32 %swap, i32 %a, i32 %b) nounwind {
%sub = sub i32 %a, %b
ret void
}
-; FUNC-LABEL: {{^}}lds_atomic_cmpxchg_noret_i32_offset:
-; SICI: s_load_dword [[PTR:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x9
-; SICI: s_load_dword [[SWAP:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xa
-; VI: s_load_dword [[PTR:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x24
-; VI: s_load_dword [[SWAP:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x28
+; GCN-LABEL: {{^}}lds_atomic_cmpxchg_noret_i32_offset:
+; GFX9-NOT: m0
+; SICIVI-DAG: s_mov_b32 m0
+
+
+; SICI-DAG: s_load_dword [[PTR:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x9
+; SICI-DAG: s_load_dword [[SWAP:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xa
+; GFX89-DAG: s_load_dword [[PTR:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x24
+; GFX89-DAG: s_load_dword [[SWAP:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x28
; GCN-DAG: v_mov_b32_e32 [[VCMP:v[0-9]+]], 7
; GCN-DAG: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[PTR]]
; GCN-DAG: v_mov_b32_e32 [[VSWAP:v[0-9]+]], [[SWAP]]
ret void
}
-; FUNC-LABEL: {{^}}lds_atomic_cmpxchg_noret_i64_offset:
-; SICI: s_load_dword [[PTR:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x9
-; SICI: s_load_dwordx2 s{{\[}}[[LOSWAP:[0-9]+]]:[[HISWAP:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0xb
-; VI-DAG: s_load_dword [[PTR:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x24
-; VI-DAG: s_load_dwordx2 s{{\[}}[[LOSWAP:[0-9]+]]:[[HISWAP:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x2c
+; GCN-LABEL: {{^}}lds_atomic_cmpxchg_noret_i64_offset:
+; GFX9-NOT: m0
+; SICIVI-DAG: s_mov_b32 m0
+
+; SICI-DAG: s_load_dword [[PTR:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x9
+; SICI-DAG: s_load_dwordx2 s{{\[}}[[LOSWAP:[0-9]+]]:[[HISWAP:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0xb
+; GFX89-DAG: s_load_dword [[PTR:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x24
+; GFX89-DAG: s_load_dwordx2 s{{\[}}[[LOSWAP:[0-9]+]]:[[HISWAP:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x2c
; GCN-DAG: v_mov_b32_e32 v[[LOVCMP:[0-9]+]], 7
; GCN-DAG: v_mov_b32_e32 v[[HIVCMP:[0-9]+]], 0
; GCN-DAG: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[PTR]]
-; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck %s -check-prefix=SI -check-prefix=FUNC
-; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck %s -check-prefix=SI -check-prefix=FUNC
-; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=R600 -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SICIVI,FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SICIVI,FUNC %s
+; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX9,FUNC %s
+; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefixes=R600,FUNC %s
; FUNC-LABEL: {{^}}atomic_add_local:
+; SICIVI: s_mov_b32 m0
+; GFX9-NOT: m0
; R600: LDS_ADD *
-; SI: ds_add_u32
+; GCN: ds_add_u32
define amdgpu_kernel void @atomic_add_local(i32 addrspace(3)* %local) {
%unused = atomicrmw volatile add i32 addrspace(3)* %local, i32 5 seq_cst
ret void
}
; FUNC-LABEL: {{^}}atomic_add_local_const_offset:
+; SICIVI: s_mov_b32 m0
+; GFX9-NOT: m0
+
; R600: LDS_ADD *
-; SI: ds_add_u32 v{{[0-9]+}}, v{{[0-9]+}} offset:16
+; GCN: ds_add_u32 v{{[0-9]+}}, v{{[0-9]+}} offset:16
define amdgpu_kernel void @atomic_add_local_const_offset(i32 addrspace(3)* %local) {
%gep = getelementptr i32, i32 addrspace(3)* %local, i32 4
%val = atomicrmw volatile add i32 addrspace(3)* %gep, i32 5 seq_cst
}
; FUNC-LABEL: {{^}}atomic_add_ret_local:
+; SICIVI: s_mov_b32 m0
+; GFX9-NOT: m0
+
; R600: LDS_ADD_RET *
-; SI: ds_add_rtn_u32
+; GCN: ds_add_rtn_u32
define amdgpu_kernel void @atomic_add_ret_local(i32 addrspace(1)* %out, i32 addrspace(3)* %local) {
%val = atomicrmw volatile add i32 addrspace(3)* %local, i32 5 seq_cst
store i32 %val, i32 addrspace(1)* %out
}
; FUNC-LABEL: {{^}}atomic_add_ret_local_const_offset:
+; SICIVI: s_mov_b32 m0
+; GFX9-NOT: m0
+
; R600: LDS_ADD_RET *
-; SI: ds_add_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:20
+; GCN: ds_add_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:20
define amdgpu_kernel void @atomic_add_ret_local_const_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %local) {
%gep = getelementptr i32, i32 addrspace(3)* %local, i32 5
%val = atomicrmw volatile add i32 addrspace(3)* %gep, i32 5 seq_cst
-; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
-; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
-; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=R600 -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,SICIVI,FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,SICIVI,FUNC %s
+; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9,FUNC %s
+; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -enable-var-scope -check-prefixes=R600,FUNC %s
; FUNC-LABEL: {{^}}atomic_sub_local:
+; SICIVI: s_mov_b32 m0
+; GFX9-NOT: m0
+
; R600: LDS_SUB *
-; SI: ds_sub_u32
+; GCN: ds_sub_u32
define amdgpu_kernel void @atomic_sub_local(i32 addrspace(3)* %local) {
%unused = atomicrmw volatile sub i32 addrspace(3)* %local, i32 5 seq_cst
ret void
}
; FUNC-LABEL: {{^}}atomic_sub_local_const_offset:
+; SICIVI: s_mov_b32 m0
+; GFX9-NOT: m0
+
; R600: LDS_SUB *
-; SI: ds_sub_u32 v{{[0-9]+}}, v{{[0-9]+}} offset:16
+; GCN: ds_sub_u32 v{{[0-9]+}}, v{{[0-9]+}} offset:16
define amdgpu_kernel void @atomic_sub_local_const_offset(i32 addrspace(3)* %local) {
%gep = getelementptr i32, i32 addrspace(3)* %local, i32 4
%val = atomicrmw volatile sub i32 addrspace(3)* %gep, i32 5 seq_cst
}
; FUNC-LABEL: {{^}}atomic_sub_ret_local:
+; SICIVI: s_mov_b32 m0
+; GFX9-NOT: m0
+
; R600: LDS_SUB_RET *
-; SI: ds_sub_rtn_u32
+; GCN: ds_sub_rtn_u32
define amdgpu_kernel void @atomic_sub_ret_local(i32 addrspace(1)* %out, i32 addrspace(3)* %local) {
%val = atomicrmw volatile sub i32 addrspace(3)* %local, i32 5 seq_cst
store i32 %val, i32 addrspace(1)* %out
}
; FUNC-LABEL: {{^}}atomic_sub_ret_local_const_offset:
+; SICIVI: s_mov_b32 m0
+; GFX9-NOT: m0
+
; R600: LDS_SUB_RET *
-; SI: ds_sub_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:20
+; GCN: ds_sub_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:20
define amdgpu_kernel void @atomic_sub_ret_local_const_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %local) {
%gep = getelementptr i32, i32 addrspace(3)* %local, i32 5
%val = atomicrmw volatile sub i32 addrspace(3)* %gep, i32 5 seq_cst
-; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs -mattr=+load-store-opt < %s | FileCheck -strict-whitespace -check-prefix=SI %s
+; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs -mattr=+load-store-opt < %s | FileCheck -strict-whitespace -check-prefixes=GCN,CI %s
+; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs -mattr=+load-store-opt < %s | FileCheck -strict-whitespace -check-prefixes=GCN,GFX9 %s
; FIXME: We don't get cases where the address was an SGPR because we
; get a copy to the address register for each one.
@lds = addrspace(3) global [512 x float] undef, align 4
@lds.f64 = addrspace(3) global [512 x double] undef, align 8
-; SI-LABEL: @simple_read2_f32
-; SI: ds_read2_b32 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}}, v{{[0-9]+}} offset1:8
-; SI: s_waitcnt lgkmcnt(0)
-; SI: v_add_f32_e32 [[RESULT:v[0-9]+]], v[[LO_VREG]], v[[HI_VREG]]
-; SI: buffer_store_dword [[RESULT]]
-; SI: s_endpgm
+; GCN-LABEL: {{^}}simple_read2_f32:
+; CI-DAG: s_mov_b32 m0
+; GFX9-NOT: m0
+
+; GCN: ds_read2_b32 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}}, v{{[0-9]+}} offset1:8
+; GCN: s_waitcnt lgkmcnt(0)
+; GCN: v_add_f32_e32 [[RESULT:v[0-9]+]], v[[LO_VREG]], v[[HI_VREG]]
+; CI: buffer_store_dword [[RESULT]]
+; GFX9: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
+; GCN: s_endpgm
define amdgpu_kernel void @simple_read2_f32(float addrspace(1)* %out) #0 {
%x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
%arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i
ret void
}
-; SI-LABEL: @simple_read2_f32_max_offset
-; SI: ds_read2_b32 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}}, v{{[0-9]+}} offset1:255
-; SI: s_waitcnt lgkmcnt(0)
-; SI: v_add_f32_e32 [[RESULT:v[0-9]+]], v[[LO_VREG]], v[[HI_VREG]]
-; SI: buffer_store_dword [[RESULT]]
-; SI: s_endpgm
+; GCN-LABEL: {{^}}simple_read2_f32_max_offset:
+; CI-DAG: s_mov_b32 m0
+; GFX9-NOT: m0
+
+; GCN: ds_read2_b32 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}}, v{{[0-9]+}} offset1:255
+; GCN: s_waitcnt lgkmcnt(0)
+; GCN: v_add_f32_e32 [[RESULT:v[0-9]+]], v[[LO_VREG]], v[[HI_VREG]]
+
+; CI: buffer_store_dword [[RESULT]]
+; GFX9: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @simple_read2_f32_max_offset(float addrspace(1)* %out) #0 {
%x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
%arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i
ret void
}
-; SI-LABEL: @simple_read2_f32_too_far
-; SI-NOT ds_read2_b32
-; SI: ds_read_b32 v{{[0-9]+}}, v{{[0-9]+}}
-; SI: ds_read_b32 v{{[0-9]+}}, v{{[0-9]+}} offset:1028
-; SI: s_endpgm
+; GCN-LABEL: @simple_read2_f32_too_far
+; CI-DAG: s_mov_b32 m0
+; GFX9-NOT: m0
+
+; GCN-NOT: ds_read2_b32
+; GCN: ds_read_b32 v{{[0-9]+}}, v{{[0-9]+}}
+; GCN: ds_read_b32 v{{[0-9]+}}, v{{[0-9]+}} offset:1028
+; GCN: s_endpgm
define amdgpu_kernel void @simple_read2_f32_too_far(float addrspace(1)* %out) #0 {
%x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
%arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i
ret void
}
-; SI-LABEL: @simple_read2_f32_x2
-; SI: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[BASEADDR:v[0-9]+]] offset1:8
-; SI: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[BASEADDR]] offset0:11 offset1:27
-; SI: s_endpgm
+; GCN-LABEL: @simple_read2_f32_x2
+; CI-DAG: s_mov_b32 m0
+; GFX9-NOT: m0
+
+; GCN: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[BASEADDR:v[0-9]+]] offset1:8
+; GCN: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[BASEADDR]] offset0:11 offset1:27
+; GCN: s_endpgm
define amdgpu_kernel void @simple_read2_f32_x2(float addrspace(1)* %out) #0 {
%tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1
%idx.0 = add nsw i32 %tid.x, 0
}
; Make sure there is an instruction between the two sets of reads.
-; SI-LABEL: @simple_read2_f32_x2_barrier
-; SI: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[BASEADDR:v[0-9]+]] offset1:8
-; SI: s_barrier
-; SI: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[BASEADDR]] offset0:11 offset1:27
-; SI: s_endpgm
+; GCN-LABEL: @simple_read2_f32_x2_barrier
+; CI-DAG: s_mov_b32 m0
+; GFX9-NOT: m0
+
+; GCN: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[BASEADDR:v[0-9]+]] offset1:8
+; GCN: s_barrier
+; GCN: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[BASEADDR]] offset0:11 offset1:27
+; GCN: s_endpgm
define amdgpu_kernel void @simple_read2_f32_x2_barrier(float addrspace(1)* %out) #0 {
%tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1
%idx.0 = add nsw i32 %tid.x, 0
; For some reason adding something to the base address for the first
; element results in only folding the inner pair.
-; SI-LABEL: @simple_read2_f32_x2_nonzero_base
-; SI: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[BASEADDR:v[0-9]+]] offset0:2 offset1:8
-; SI: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[BASEADDR]] offset0:11 offset1:27
-; SI: s_endpgm
+; GCN-LABEL: @simple_read2_f32_x2_nonzero_base
+; CI-DAG: s_mov_b32 m0
+; GFX9-NOT: m0
+
+; GCN: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[BASEADDR:v[0-9]+]] offset0:2 offset1:8
+; GCN: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[BASEADDR]] offset0:11 offset1:27
+; GCN: s_endpgm
define amdgpu_kernel void @simple_read2_f32_x2_nonzero_base(float addrspace(1)* %out) #0 {
%tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1
%idx.0 = add nsw i32 %tid.x, 2
; Base pointers come from different subregisters of the same super
; register. We can't safely merge this.
-; SI-LABEL: @read2_ptr_is_subreg_arg_f32
-; SI-NOT: ds_read2_b32
-; SI: ds_read_b32
-; SI: ds_read_b32
-; SI: s_endpgm
+; GCN-LABEL: @read2_ptr_is_subreg_arg_f32
+; CI-DAG: s_mov_b32 m0
+; GFX9-NOT: m0
+
+; GCN-NOT: ds_read2_b32
+; GCN: ds_read_b32
+; GCN: ds_read_b32
+; GCN: s_endpgm
define amdgpu_kernel void @read2_ptr_is_subreg_arg_f32(float addrspace(1)* %out, <2 x float addrspace(3)*> %lds.ptr) #0 {
%x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
%index.0 = insertelement <2 x i32> undef, i32 %x.i, i32 0
; sure we are really rejecting it because of the different
; subregisters.
-; SI-LABEL: @read2_ptr_is_subreg_arg_offset_f32
-; SI-NOT: ds_read2_b32
-; SI: ds_read_b32
-; SI: ds_read_b32
-; SI: s_endpgm
+; GCN-LABEL: @read2_ptr_is_subreg_arg_offset_f32
+; CI-DAG: s_mov_b32 m0
+; GFX9-NOT: m0
+
+; GCN-NOT: ds_read2_b32
+; GCN: ds_read_b32
+; GCN: ds_read_b32
+; GCN: s_endpgm
define amdgpu_kernel void @read2_ptr_is_subreg_arg_offset_f32(float addrspace(1)* %out, <2 x float addrspace(3)*> %lds.ptr) #0 {
%x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
%index.0 = insertelement <2 x i32> undef, i32 %x.i, i32 0
ret void
}
-; SI-LABEL: {{^}}read2_ptr_is_subreg_f32:
-; SI: ds_read2_b32 {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}} offset1:8{{$}}
-; SI: s_endpgm
+; GCN-LABEL: {{^}}read2_ptr_is_subreg_f32:
+; CI-DAG: s_mov_b32 m0
+; GFX9-NOT: m0
+
+; GCN: ds_read2_b32 {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}} offset1:8{{$}}
+; GCN: s_endpgm
define amdgpu_kernel void @read2_ptr_is_subreg_f32(float addrspace(1)* %out) #0 {
%x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
%ptr.0 = insertelement <2 x [512 x float] addrspace(3)*> undef, [512 x float] addrspace(3)* @lds, i32 0
ret void
}
-; SI-LABEL: @simple_read2_f32_volatile_0
-; SI-NOT ds_read2_b32
-; SI: ds_read_b32 v{{[0-9]+}}, v{{[0-9]+}}
-; SI: ds_read_b32 v{{[0-9]+}}, v{{[0-9]+}} offset:32
-; SI: s_endpgm
+; GCN-LABEL: @simple_read2_f32_volatile_0
+; CI-DAG: s_mov_b32 m0
+; GFX9-NOT: m0
+
+; GCN-NOT: ds_read2_b32
+; GCN: ds_read_b32 v{{[0-9]+}}, v{{[0-9]+}}
+; GCN: ds_read_b32 v{{[0-9]+}}, v{{[0-9]+}} offset:32
+; GCN: s_endpgm
define amdgpu_kernel void @simple_read2_f32_volatile_0(float addrspace(1)* %out) #0 {
%x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
%arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i
ret void
}
-; SI-LABEL: @simple_read2_f32_volatile_1
-; SI-NOT ds_read2_b32
-; SI: ds_read_b32 v{{[0-9]+}}, v{{[0-9]+}}
-; SI: ds_read_b32 v{{[0-9]+}}, v{{[0-9]+}} offset:32
-; SI: s_endpgm
+; GCN-LABEL: @simple_read2_f32_volatile_1
+; CI-DAG: s_mov_b32 m0
+; GFX9-NOT: m0
+
+; GCN-NOT: ds_read2_b32
+; GCN: ds_read_b32 v{{[0-9]+}}, v{{[0-9]+}}
+; GCN: ds_read_b32 v{{[0-9]+}}, v{{[0-9]+}} offset:32
+; GCN: s_endpgm
define amdgpu_kernel void @simple_read2_f32_volatile_1(float addrspace(1)* %out) #0 {
%x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
%arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i
; Can't fold since not correctly aligned.
; XXX: This isn't really testing anything useful now. I think CI
; allows unaligned LDS accesses, which would be a problem here.
-; SI-LABEL: @unaligned_read2_f32
-; SI-NOT: ds_read2_b32
-; SI: s_endpgm
+; GCN-LABEL: @unaligned_read2_f32
+; CI-DAG: s_mov_b32 m0
+; GFX9-NOT: m0
+
+; GCN-NOT: ds_read2_b32
+; GCN: s_endpgm
define amdgpu_kernel void @unaligned_read2_f32(float addrspace(1)* %out, float addrspace(3)* %lds) #0 {
%x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
%arrayidx0 = getelementptr inbounds float, float addrspace(3)* %lds, i32 %x.i
ret void
}
-; SI-LABEL: @misaligned_2_simple_read2_f32
-; SI-NOT: ds_read2_b32
-; SI: s_endpgm
+; GCN-LABEL: @misaligned_2_simple_read2_f32
+; CI-DAG: s_mov_b32 m0
+; GFX9-NOT: m0
+
+; GCN-NOT: ds_read2_b32
+; GCN: s_endpgm
define amdgpu_kernel void @misaligned_2_simple_read2_f32(float addrspace(1)* %out, float addrspace(3)* %lds) #0 {
%x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
%arrayidx0 = getelementptr inbounds float, float addrspace(3)* %lds, i32 %x.i
ret void
}
-; SI-LABEL: @simple_read2_f64
-; SI: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 3, {{v[0-9]+}}
-; SI: ds_read2_b64 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}}, [[VPTR]] offset1:8
-; SI: v_add_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], v{{\[}}[[LO_VREG]]:{{[0-9]+\]}}, v{{\[[0-9]+}}:[[HI_VREG]]{{\]}}
-; SI: buffer_store_dwordx2 [[RESULT]]
-; SI: s_endpgm
+; GCN-LABEL: @simple_read2_f64
+; CI-DAG: s_mov_b32 m0
+; GFX9-NOT: m0
+
+; GCN-DAG: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 3, {{v[0-9]+}}
+; GCN: ds_read2_b64 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}}, [[VPTR]] offset1:8
+; GCN: v_add_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], v{{\[}}[[LO_VREG]]:{{[0-9]+\]}}, v{{\[[0-9]+}}:[[HI_VREG]]{{\]}}
+
+; CI: buffer_store_dwordx2 [[RESULT]]
+; GFX9: global_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @simple_read2_f64(double addrspace(1)* %out) #0 {
%x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
%arrayidx0 = getelementptr inbounds [512 x double], [512 x double] addrspace(3)* @lds.f64, i32 0, i32 %x.i
ret void
}
-; SI-LABEL: @simple_read2_f64_max_offset
-; SI: ds_read2_b64 {{v\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset1:255
-; SI: s_endpgm
+; GCN-LABEL: @simple_read2_f64_max_offset
+; CI-DAG: s_mov_b32 m0
+; GFX9-NOT: m0
+
+; GCN: ds_read2_b64 {{v\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset1:255
+; GCN: s_endpgm
define amdgpu_kernel void @simple_read2_f64_max_offset(double addrspace(1)* %out) #0 {
%x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
%arrayidx0 = getelementptr inbounds [512 x double], [512 x double] addrspace(3)* @lds.f64, i32 0, i32 %x.i
ret void
}
-; SI-LABEL: @simple_read2_f64_too_far
-; SI-NOT ds_read2_b64
-; SI: ds_read_b64 {{v\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}
-; SI: ds_read_b64 {{v\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset:2056
-; SI: s_endpgm
+; GCN-LABEL: @simple_read2_f64_too_far
+; CI-DAG: s_mov_b32 m0
+; GFX9-NOT: m0
+
+; GCN-NOT: ds_read2_b64
+; GCN: ds_read_b64 {{v\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}
+; GCN: ds_read_b64 {{v\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset:2056
+; GCN: s_endpgm
define amdgpu_kernel void @simple_read2_f64_too_far(double addrspace(1)* %out) #0 {
%x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
%arrayidx0 = getelementptr inbounds [512 x double], [512 x double] addrspace(3)* @lds.f64, i32 0, i32 %x.i
}
; Alignment only 4
-; SI-LABEL: @misaligned_read2_f64
-; SI: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}} offset1:1
-; SI: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}} offset0:14 offset1:15
-; SI: s_endpgm
+; GCN-LABEL: @misaligned_read2_f64
+; CI-DAG: s_mov_b32 m0
+; GFX9-NOT: m0
+
+; GCN: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}} offset1:1
+; GCN: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}} offset0:14 offset1:15
+; GCN: s_endpgm
define amdgpu_kernel void @misaligned_read2_f64(double addrspace(1)* %out, double addrspace(3)* %lds) #0 {
%x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
%arrayidx0 = getelementptr inbounds double, double addrspace(3)* %lds, i32 %x.i
@foo = addrspace(3) global [4 x i32] undef, align 4
-; SI-LABEL: @load_constant_adjacent_offsets
-; SI: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0{{$}}
-; SI: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[ZERO]] offset1:1
+; GCN-LABEL: @load_constant_adjacent_offsets
+; CI-DAG: s_mov_b32 m0
+; GFX9-NOT: m0
+
+; GCN-DAG: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0{{$}}
+; GCN: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[ZERO]] offset1:1
define amdgpu_kernel void @load_constant_adjacent_offsets(i32 addrspace(1)* %out) {
%val0 = load i32, i32 addrspace(3)* getelementptr inbounds ([4 x i32], [4 x i32] addrspace(3)* @foo, i32 0, i32 0), align 4
%val1 = load i32, i32 addrspace(3)* getelementptr inbounds ([4 x i32], [4 x i32] addrspace(3)* @foo, i32 0, i32 1), align 4
ret void
}
-; SI-LABEL: @load_constant_disjoint_offsets
-; SI: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0{{$}}
-; SI: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[ZERO]] offset1:2
+; GCN-LABEL: @load_constant_disjoint_offsets
+; CI-DAG: s_mov_b32 m0
+; GFX9-NOT: m0
+
+; GCN-DAG: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0{{$}}
+; GCN: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[ZERO]] offset1:2
define amdgpu_kernel void @load_constant_disjoint_offsets(i32 addrspace(1)* %out) {
%val0 = load i32, i32 addrspace(3)* getelementptr inbounds ([4 x i32], [4 x i32] addrspace(3)* @foo, i32 0, i32 0), align 4
%val1 = load i32, i32 addrspace(3)* getelementptr inbounds ([4 x i32], [4 x i32] addrspace(3)* @foo, i32 0, i32 2), align 4
@bar = addrspace(3) global [4 x i64] undef, align 4
-; SI-LABEL: @load_misaligned64_constant_offsets
-; SI: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0{{$}}
-; SI: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[ZERO]] offset1:1
-; SI: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[ZERO]] offset0:2 offset1:3
+; GCN-LABEL: @load_misaligned64_constant_offsets
+; CI-DAG: s_mov_b32 m0
+; GFX9-NOT: m0
+
+; GCN-DAG: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0{{$}}
+; GCN: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[ZERO]] offset1:1
+; GCN: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[ZERO]] offset0:2 offset1:3
define amdgpu_kernel void @load_misaligned64_constant_offsets(i64 addrspace(1)* %out) {
%val0 = load i64, i64 addrspace(3)* getelementptr inbounds ([4 x i64], [4 x i64] addrspace(3)* @bar, i32 0, i32 0), align 4
%val1 = load i64, i64 addrspace(3)* getelementptr inbounds ([4 x i64], [4 x i64] addrspace(3)* @bar, i32 0, i32 1), align 4
@bar.large = addrspace(3) global [4096 x i64] undef, align 4
-; SI-LABEL: @load_misaligned64_constant_large_offsets
-; SI-DAG: v_mov_b32_e32 [[BASE0:v[0-9]+]], 0x7ff8{{$}}
-; SI-DAG: v_mov_b32_e32 [[BASE1:v[0-9]+]], 0x4000
-; SI-DAG: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[BASE0]] offset1:1
-; SI-DAG: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[BASE1]] offset1:1
-; SI: s_endpgm
+; GCN-LABEL: @load_misaligned64_constant_large_offsets
+; CI-DAG: s_mov_b32 m0
+; GFX9-NOT: m0
+
+; GCN-DAG: v_mov_b32_e32 [[BASE0:v[0-9]+]], 0x7ff8{{$}}
+; GCN-DAG: v_mov_b32_e32 [[BASE1:v[0-9]+]], 0x4000
+; GCN-DAG: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[BASE0]] offset1:1
+; GCN-DAG: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[BASE1]] offset1:1
+; GCN: s_endpgm
define amdgpu_kernel void @load_misaligned64_constant_large_offsets(i64 addrspace(1)* %out) {
%val0 = load i64, i64 addrspace(3)* getelementptr inbounds ([4096 x i64], [4096 x i64] addrspace(3)* @bar.large, i32 0, i32 2048), align 4
%val1 = load i64, i64 addrspace(3)* getelementptr inbounds ([4096 x i64], [4096 x i64] addrspace(3)* @bar.large, i32 0, i32 4095), align 4
@sgemm.lA = internal unnamed_addr addrspace(3) global [264 x float] undef, align 4
@sgemm.lB = internal unnamed_addr addrspace(3) global [776 x float] undef, align 4
+; GCN-LABEL: {{^}}sgemm_inner_loop_read2_sequence:
+; CI-DAG: s_mov_b32 m0
+; GFX9-NOT: m0
+
define amdgpu_kernel void @sgemm_inner_loop_read2_sequence(float addrspace(1)* %C, i32 %lda, i32 %ldb) #0 {
%x.i = tail call i32 @llvm.amdgcn.workgroup.id.x() #1
%y.i = tail call i32 @llvm.amdgcn.workitem.id.y() #1
ret void
}
+; GCN-LABEL: {{^}}misaligned_read2_v2i32:
+; CI-DAG: s_mov_b32 m0
+; GFX9-NOT: m0
define amdgpu_kernel void @misaligned_read2_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(3)* %in) #0 {
%load = load <2 x i32>, <2 x i32> addrspace(3)* %in, align 4
store <2 x i32> %load, <2 x i32> addrspace(1)* %out, align 8
ret void
}
+; GCN-LABEL: {{^}}misaligned_read2_i64:
+; CI-DAG: s_mov_b32 m0
+; GFX9-NOT: m0
define amdgpu_kernel void @misaligned_read2_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %in) #0 {
%load = load i64, i64 addrspace(3)* %in, align 4
store i64 %load, i64 addrspace(1)* %out, align 8
ret void
}
-; SI-LABEL: ds_read_diff_base_interleaving
-; SI-NOT: ds_read_b32
+; GCN-LABEL: ds_read_diff_base_interleaving
+; CI-DAG: s_mov_b32 m0
+; GFX9-NOT: m0
+
+; GCN-NOT: ds_read_b32
define amdgpu_kernel void @ds_read_diff_base_interleaving(
float addrspace(1)* nocapture %arg,
[4 x [4 x float]] addrspace(3)* %arg1,
ret void
}
-; Function Attrs: nounwind readnone
declare i32 @llvm.amdgcn.workgroup.id.x() #1
-
-; Function Attrs: nounwind readnone
declare i32 @llvm.amdgcn.workgroup.id.y() #1
-
-; Function Attrs: nounwind readnone
declare i32 @llvm.amdgcn.workitem.id.x() #1
-
-; Function Attrs: nounwind readnone
declare i32 @llvm.amdgcn.workitem.id.y() #1
-
-; Function Attrs: convergent nounwind
declare void @llvm.amdgcn.s.barrier() #2
attributes #0 = { nounwind }
-; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs -mattr=+load-store-opt < %s | FileCheck -check-prefix=SI %s
+; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs -mattr=+load-store-opt < %s | FileCheck -check-prefixes=GCN,CI %s
+; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs -mattr=+load-store-opt < %s | FileCheck -check-prefixes=GCN,GFX9 %s
@lds = addrspace(3) global [512 x float] undef, align 4
@lds.f64 = addrspace(3) global [512 x double] undef, align 8
-; SI-LABEL: @simple_read2st64_f32_0_1
-; SI: ds_read2st64_b32 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}}, v{{[0-9]+}} offset1:1
-; SI: s_waitcnt lgkmcnt(0)
-; SI: v_add_f32_e32 [[RESULT:v[0-9]+]], v[[LO_VREG]], v[[HI_VREG]]
-; SI: buffer_store_dword [[RESULT]]
-; SI: s_endpgm
+; GCN-LABEL: @simple_read2st64_f32_0_1
+; CI: s_mov_b32 m0
+; GFX9-NOT: m0
+
+; GCN: ds_read2st64_b32 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}}, v{{[0-9]+}} offset1:1
+; GCN: s_waitcnt lgkmcnt(0)
+; GCN: v_add_f32_e32 [[RESULT:v[0-9]+]], v[[LO_VREG]], v[[HI_VREG]]
+; CI: buffer_store_dword [[RESULT]]
+; GFX9: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @simple_read2st64_f32_0_1(float addrspace(1)* %out) #0 {
%x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
%arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i
ret void
}
-; SI-LABEL: @simple_read2st64_f32_1_2
-; SI: ds_read2st64_b32 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:1 offset1:2
-; SI: s_waitcnt lgkmcnt(0)
-; SI: v_add_f32_e32 [[RESULT:v[0-9]+]], v[[LO_VREG]], v[[HI_VREG]]
-; SI: buffer_store_dword [[RESULT]]
-; SI: s_endpgm
+; GCN-LABEL: @simple_read2st64_f32_1_2
+; CI: s_mov_b32 m0
+; GFX9-NOT: m0
+
+; GCN: ds_read2st64_b32 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:1 offset1:2
+; GCN: s_waitcnt lgkmcnt(0)
+; GCN: v_add_f32_e32 [[RESULT:v[0-9]+]], v[[LO_VREG]], v[[HI_VREG]]
+; CI: buffer_store_dword [[RESULT]]
+; GFX9: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @simple_read2st64_f32_1_2(float addrspace(1)* %out, float addrspace(3)* %lds) #0 {
%x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
%add.x.0 = add nsw i32 %x.i, 64
ret void
}
-; SI-LABEL: @simple_read2st64_f32_max_offset
-; SI: ds_read2st64_b32 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:1 offset1:255
-; SI: s_waitcnt lgkmcnt(0)
-; SI: v_add_f32_e32 [[RESULT:v[0-9]+]], v[[LO_VREG]], v[[HI_VREG]]
-; SI: buffer_store_dword [[RESULT]]
-; SI: s_endpgm
+; GCN-LABEL: @simple_read2st64_f32_max_offset
+; CI: s_mov_b32 m0
+; GFX9-NOT: m0
+
+; GCN: ds_read2st64_b32 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:1 offset1:255
+; GCN: s_waitcnt lgkmcnt(0)
+; GCN: v_add_f32_e32 [[RESULT:v[0-9]+]], v[[LO_VREG]], v[[HI_VREG]]
+; CI: buffer_store_dword [[RESULT]]
+; GFX9: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @simple_read2st64_f32_max_offset(float addrspace(1)* %out, float addrspace(3)* %lds) #0 {
%x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
%add.x.0 = add nsw i32 %x.i, 64
ret void
}
-; SI-LABEL: @simple_read2st64_f32_over_max_offset
-; SI-NOT: ds_read2st64_b32
-; SI-DAG: v_add_i32_e32 [[BIGADD:v[0-9]+]], vcc, 0x10000, {{v[0-9]+}}
-; SI-DAG: ds_read_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:256
-; SI-DAG: ds_read_b32 {{v[0-9]+}}, [[BIGADD]]{{$}}
-; SI: s_endpgm
+; GCN-LABEL: @simple_read2st64_f32_over_max_offset
+; CI: s_mov_b32 m0
+; GFX9-NOT: m0
+
+; GCN-NOT: ds_read2st64_b32
+; GCN-DAG: v_add{{(_co)?}}_{{i|u}}32_e32 [[BIGADD:v[0-9]+]], vcc, 0x10000, {{v[0-9]+}}
+; GCN-DAG: ds_read_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:256
+; GCN-DAG: ds_read_b32 {{v[0-9]+}}, [[BIGADD]]{{$}}
+; GCN: s_endpgm
define amdgpu_kernel void @simple_read2st64_f32_over_max_offset(float addrspace(1)* %out, float addrspace(3)* %lds) #0 {
%x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
%add.x.0 = add nsw i32 %x.i, 64
ret void
}
-; SI-LABEL: @odd_invalid_read2st64_f32_0
-; SI-NOT: ds_read2st64_b32
-; SI: s_endpgm
+; GCN-LABEL: @odd_invalid_read2st64_f32_0
+; CI: s_mov_b32 m0
+; GFX9-NOT: m0
+
+; GCN-NOT: ds_read2st64_b32
+; GCN: s_endpgm
define amdgpu_kernel void @odd_invalid_read2st64_f32_0(float addrspace(1)* %out) #0 {
%x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
%arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i
ret void
}
-; SI-LABEL: @odd_invalid_read2st64_f32_1
-; SI-NOT: ds_read2st64_b32
-; SI: s_endpgm
+; GCN-LABEL: @odd_invalid_read2st64_f32_1
+; CI: s_mov_b32 m0
+; GFX9-NOT: m0
+
+; GCN-NOT: ds_read2st64_b32
+; GCN: s_endpgm
define amdgpu_kernel void @odd_invalid_read2st64_f32_1(float addrspace(1)* %out) #0 {
%x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
%add.x.0 = add nsw i32 %x.i, 64
ret void
}
-; SI-LABEL: @simple_read2st64_f64_0_1
-; SI: ds_read2st64_b64 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}}, v{{[0-9]+}} offset1:1
-; SI: s_waitcnt lgkmcnt(0)
-; SI: v_add_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], v{{\[}}[[LO_VREG]]:{{[0-9]+\]}}, v{{\[[0-9]+}}:[[HI_VREG]]{{\]}}
-; SI: buffer_store_dwordx2 [[RESULT]]
-; SI: s_endpgm
+; GCN-LABEL: @simple_read2st64_f64_0_1
+; CI: s_mov_b32 m0
+; GFX9-NOT: m0
+
+; GCN: ds_read2st64_b64 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}}, v{{[0-9]+}} offset1:1
+; GCN: s_waitcnt lgkmcnt(0)
+; GCN: v_add_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], v{{\[}}[[LO_VREG]]:{{[0-9]+\]}}, v{{\[[0-9]+}}:[[HI_VREG]]{{\]}}
+; CI: buffer_store_dwordx2 [[RESULT]]
+; GFX9: global_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @simple_read2st64_f64_0_1(double addrspace(1)* %out) #0 {
%x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
%arrayidx0 = getelementptr inbounds [512 x double], [512 x double] addrspace(3)* @lds.f64, i32 0, i32 %x.i
ret void
}
-; SI-LABEL: @simple_read2st64_f64_1_2
-; SI: ds_read2st64_b64 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:1 offset1:2
-; SI: s_waitcnt lgkmcnt(0)
-; SI: v_add_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], v{{\[}}[[LO_VREG]]:{{[0-9]+\]}}, v{{\[[0-9]+}}:[[HI_VREG]]{{\]}}
-; SI: buffer_store_dwordx2 [[RESULT]]
-; SI: s_endpgm
+; GCN-LABEL: @simple_read2st64_f64_1_2
+; CI: s_mov_b32 m0
+; GFX9-NOT: m0
+
+; GCN: ds_read2st64_b64 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:1 offset1:2
+; GCN: s_waitcnt lgkmcnt(0)
+; GCN: v_add_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], v{{\[}}[[LO_VREG]]:{{[0-9]+\]}}, v{{\[[0-9]+}}:[[HI_VREG]]{{\]}}
+
+; CI: buffer_store_dwordx2 [[RESULT]]
+; GFX9: global_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @simple_read2st64_f64_1_2(double addrspace(1)* %out, double addrspace(3)* %lds) #0 {
%x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
%add.x.0 = add nsw i32 %x.i, 64
; Alignment only 4
-; SI-LABEL: @misaligned_read2st64_f64
-; SI: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}} offset1:1
-; SI: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}} offset0:128 offset1:129
-; SI: s_endpgm
+; GCN-LABEL: @misaligned_read2st64_f64
+; CI: s_mov_b32 m0
+; GFX9-NOT: m0
+
+; GCN: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}} offset1:1
+; GCN: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}} offset0:128 offset1:129
+; GCN: s_endpgm
define amdgpu_kernel void @misaligned_read2st64_f64(double addrspace(1)* %out, double addrspace(3)* %lds) #0 {
%x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
%arrayidx0 = getelementptr inbounds double, double addrspace(3)* %lds, i32 %x.i
}
; The maximum is not the usual 0xff because 0xff * 8 * 64 > 0xffff
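; (0xff * 8 * 64 = 0x1fe00 bytes, which does not fit in the 16-bit byte offset,
; while 127 * 8 * 64 = 0xfe00 does, so offset1:127 is the maximum checked below.)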
-; SI-LABEL: @simple_read2st64_f64_max_offset
-; SI: ds_read2st64_b64 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:4 offset1:127
-; SI: s_waitcnt lgkmcnt(0)
-; SI: v_add_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], v{{\[}}[[LO_VREG]]:{{[0-9]+\]}}, v{{\[[0-9]+}}:[[HI_VREG]]{{\]}}
-; SI: buffer_store_dwordx2 [[RESULT]]
-; SI: s_endpgm
+; GCN-LABEL: @simple_read2st64_f64_max_offset
+; CI: s_mov_b32 m0
+; GFX9-NOT: m0
+
+; GCN: ds_read2st64_b64 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:4 offset1:127
+; GCN: s_waitcnt lgkmcnt(0)
+; GCN: v_add_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], v{{\[}}[[LO_VREG]]:{{[0-9]+\]}}, v{{\[[0-9]+}}:[[HI_VREG]]{{\]}}
+
+; CI: buffer_store_dwordx2 [[RESULT]]
+; GFX9: global_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @simple_read2st64_f64_max_offset(double addrspace(1)* %out, double addrspace(3)* %lds) #0 {
%x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
%add.x.0 = add nsw i32 %x.i, 256
ret void
}
-; SI-LABEL: @simple_read2st64_f64_over_max_offset
-; SI-NOT: ds_read2st64_b64
-; SI-DAG: ds_read_b64 {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}} offset:512
-; SI-DAG: v_add_i32_e32 [[BIGADD:v[0-9]+]], vcc, 0x10000, {{v[0-9]+}}
-; SI: ds_read_b64 {{v\[[0-9]+:[0-9]+\]}}, [[BIGADD]]
-; SI: s_endpgm
+; GCN-LABEL: @simple_read2st64_f64_over_max_offset
+; CI: s_mov_b32 m0
+; GFX9-NOT: m0
+
+; GCN-NOT: ds_read2st64_b64
+; GCN-DAG: ds_read_b64 {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}} offset:512
+; GCN-DAG: v_add_{{(co_)?}}{{i|u}}32_e32 [[BIGADD:v[0-9]+]], vcc, 0x10000, {{v[0-9]+}}
+; GCN: ds_read_b64 {{v\[[0-9]+:[0-9]+\]}}, [[BIGADD]]
+; GCN: s_endpgm
define amdgpu_kernel void @simple_read2st64_f64_over_max_offset(double addrspace(1)* %out, double addrspace(3)* %lds) #0 {
%x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
%add.x.0 = add nsw i32 %x.i, 64
ret void
}
-; SI-LABEL: @invalid_read2st64_f64_odd_offset
-; SI-NOT: ds_read2st64_b64
-; SI: s_endpgm
+; GCN-LABEL: @invalid_read2st64_f64_odd_offset
+; CI: s_mov_b32 m0
+; GFX9-NOT: m0
+
+; GCN-NOT: ds_read2st64_b64
+; GCN: s_endpgm
define amdgpu_kernel void @invalid_read2st64_f64_odd_offset(double addrspace(1)* %out, double addrspace(3)* %lds) #0 {
%x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
%add.x.0 = add nsw i32 %x.i, 64
; The stride of 8 elements is 8 * 8 bytes. We need to make sure the
; stride in elements, not bytes, is a multiple of 64.
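; Here each step is 8 doubles = 64 bytes; 8 elements is not a multiple of 64,
; so only a plain ds_read2_b64 with offset1:8 is expected.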
-; SI-LABEL: @byte_size_only_divisible_64_read2_f64
-; SI-NOT: ds_read2st_b64
-; SI: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset1:8
-; SI: s_endpgm
+; GCN-LABEL: @byte_size_only_divisible_64_read2_f64
+; CI: s_mov_b32 m0
+; GFX9-NOT: m0
+
+; GCN-NOT: ds_read2st64_b64
+; GCN: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset1:8
+; GCN: s_endpgm
define amdgpu_kernel void @byte_size_only_divisible_64_read2_f64(double addrspace(1)* %out, double addrspace(3)* %lds) #0 {
%x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
%arrayidx0 = getelementptr inbounds double, double addrspace(3)* %lds, i32 %x.i
ret void
}
-; Function Attrs: nounwind readnone
declare i32 @llvm.amdgcn.workitem.id.x() #1
-
-; Function Attrs: nounwind readnone
declare i32 @llvm.amdgcn.workitem.id.y() #1
attributes #0 = { nounwind }
-; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs -mattr=+load-store-opt < %s | FileCheck -strict-whitespace -check-prefix=SI %s
+; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs -mattr=+load-store-opt < %s | FileCheck -strict-whitespace -check-prefixes=GCN,CI %s
+; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs -mattr=+load-store-opt < %s | FileCheck -strict-whitespace -check-prefixes=GCN,GFX9 %s
@lds = addrspace(3) global [512 x float] undef, align 4
@lds.f64 = addrspace(3) global [512 x double] undef, align 8
-; SI-LABEL: @simple_write2_one_val_f32
-; SI-DAG: buffer_load_dword [[VAL:v[0-9]+]]
-; SI-DAG: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 2, v{{[0-9]+}}
-; SI: ds_write2_b32 [[VPTR]], [[VAL]], [[VAL]] offset1:8
-; SI: s_endpgm
+; GCN-LABEL: {{^}}simple_write2_one_val_f32:
+; CI-DAG: s_mov_b32 m0
+; GFX9-NOT: m0
+
+; GCN-DAG: {{buffer|global}}_load_dword [[VAL:v[0-9]+]]
+; GCN-DAG: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 2, v{{[0-9]+}}
+; GCN: ds_write2_b32 [[VPTR]], [[VAL]], [[VAL]] offset1:8
+; GCN: s_endpgm
define amdgpu_kernel void @simple_write2_one_val_f32(float addrspace(1)* %C, float addrspace(1)* %in) #0 {
%x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
%in.gep = getelementptr float, float addrspace(1)* %in, i32 %x.i
ret void
}
-; SI-LABEL: @simple_write2_two_val_f32
-; SI-DAG: buffer_load_dword [[VAL0:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
-; SI-DAG: buffer_load_dword [[VAL1:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
-; SI-DAG: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 2, v{{[0-9]+}}
-; SI: ds_write2_b32 [[VPTR]], [[VAL0]], [[VAL1]] offset1:8
-; SI: s_endpgm
+; GCN-LABEL: {{^}}simple_write2_two_val_f32:
+; CI-DAG: s_mov_b32 m0
+; GFX9-NOT: m0
+
+; CI-DAG: buffer_load_dword [[VAL0:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
+; CI-DAG: buffer_load_dword [[VAL1:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
+
+; GFX9-DAG: global_load_dword [[VAL0:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, off{{$}}
+; GFX9-DAG: global_load_dword [[VAL1:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, off offset:4
+
+; GCN-DAG: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 2, v{{[0-9]+}}
+; GCN: ds_write2_b32 [[VPTR]], [[VAL0]], [[VAL1]] offset1:8
+; GCN: s_endpgm
define amdgpu_kernel void @simple_write2_two_val_f32(float addrspace(1)* %C, float addrspace(1)* %in) #0 {
%x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
%in.gep.0 = getelementptr float, float addrspace(1)* %in, i32 %x.i
ret void
}
-; SI-LABEL: @simple_write2_two_val_f32_volatile_0
-; SI-NOT: ds_write2_b32
-; SI: ds_write_b32 {{v[0-9]+}}, {{v[0-9]+}}
-; SI: ds_write_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:32
-; SI: s_endpgm
+; GCN-LABEL: @simple_write2_two_val_f32_volatile_0
+; CI-DAG: s_mov_b32 m0
+; GFX9-NOT: m0
+
+; GCN-NOT: ds_write2_b32
+; GCN: ds_write_b32 {{v[0-9]+}}, {{v[0-9]+}}
+; GCN: ds_write_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:32
+; GCN: s_endpgm
define amdgpu_kernel void @simple_write2_two_val_f32_volatile_0(float addrspace(1)* %C, float addrspace(1)* %in0, float addrspace(1)* %in1) #0 {
%x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
%in0.gep = getelementptr float, float addrspace(1)* %in0, i32 %x.i
ret void
}
-; SI-LABEL: @simple_write2_two_val_f32_volatile_1
-; SI-NOT: ds_write2_b32
-; SI: ds_write_b32 {{v[0-9]+}}, {{v[0-9]+}}
-; SI: ds_write_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:32
-; SI: s_endpgm
+; GCN-LABEL: @simple_write2_two_val_f32_volatile_1
+; CI-DAG: s_mov_b32 m0
+; GFX9-NOT: m0
+
+; GCN-NOT: ds_write2_b32
+; GCN: ds_write_b32 {{v[0-9]+}}, {{v[0-9]+}}
+; GCN: ds_write_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:32
+; GCN: s_endpgm
define amdgpu_kernel void @simple_write2_two_val_f32_volatile_1(float addrspace(1)* %C, float addrspace(1)* %in0, float addrspace(1)* %in1) #0 {
%x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
%in0.gep = getelementptr float, float addrspace(1)* %in0, i32 %x.i
}
; 2 data subregisters from different super registers.
-; SI-LABEL: @simple_write2_two_val_subreg2_mixed_f32
-; SI: buffer_load_dwordx2 v{{\[}}[[VAL0:[0-9]+]]:{{[0-9]+\]}}
-; SI: buffer_load_dwordx2 v{{\[[0-9]+}}:[[VAL1:[0-9]+]]{{\]}}
-; SI: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 2, v{{[0-9]+}}
-; SI: ds_write2_b32 [[VPTR]], v[[VAL0]], v[[VAL1]] offset1:8
-; SI: s_endpgm
+; GCN-LABEL: {{^}}simple_write2_two_val_subreg2_mixed_f32:
+; GFX9-NOT: m0
+
+; CI: buffer_load_dwordx2 v{{\[}}[[VAL0:[0-9]+]]:{{[0-9]+\]}}
+; CI: buffer_load_dwordx2 v{{\[[0-9]+}}:[[VAL1:[0-9]+]]{{\]}}
+; CI-DAG: s_mov_b32 m0
+
+; GCN-DAG: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 2, v{{[0-9]+}}
+
+; GFX9: global_load_dwordx2 v{{\[}}[[VAL0:[0-9]+]]:{{[0-9]+\]}}
+; GFX9: global_load_dwordx2 v{{\[[0-9]+}}:[[VAL1:[0-9]+]]{{\]}}
+; GCN: ds_write2_b32 [[VPTR]], v[[VAL0]], v[[VAL1]] offset1:8
+; GCN: s_endpgm
define amdgpu_kernel void @simple_write2_two_val_subreg2_mixed_f32(float addrspace(1)* %C, <2 x float> addrspace(1)* %in) #0 {
%x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
%in.gep.0 = getelementptr <2 x float>, <2 x float> addrspace(1)* %in, i32 %x.i
ret void
}
-; SI-LABEL: @simple_write2_two_val_subreg2_f32
-; SI-DAG: buffer_load_dwordx2 v{{\[}}[[VAL0:[0-9]+]]:[[VAL1:[0-9]+]]{{\]}}
-; SI-DAG: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 2, v{{[0-9]+}}
-; SI: ds_write2_b32 [[VPTR]], v[[VAL0]], v[[VAL1]] offset1:8
-; SI: s_endpgm
+; GCN-LABEL: @simple_write2_two_val_subreg2_f32
+; CI-DAG: s_mov_b32 m0
+; GFX9-NOT: m0
+
+; GCN-DAG: {{buffer|global}}_load_dwordx2 v{{\[}}[[VAL0:[0-9]+]]:[[VAL1:[0-9]+]]{{\]}}
+; GCN-DAG: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 2, v{{[0-9]+}}
+; GCN: ds_write2_b32 [[VPTR]], v[[VAL0]], v[[VAL1]] offset1:8
+; GCN: s_endpgm
define amdgpu_kernel void @simple_write2_two_val_subreg2_f32(float addrspace(1)* %C, <2 x float> addrspace(1)* %in) #0 {
%x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
%in.gep = getelementptr <2 x float>, <2 x float> addrspace(1)* %in, i32 %x.i
ret void
}
-; SI-LABEL: @simple_write2_two_val_subreg4_f32
-; SI-DAG: buffer_load_dwordx4 v{{\[}}[[VAL0:[0-9]+]]:[[VAL1:[0-9]+]]{{\]}}
-; SI-DAG: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 2, v{{[0-9]+}}
-; SI: ds_write2_b32 [[VPTR]], v[[VAL0]], v[[VAL1]] offset1:8
-; SI: s_endpgm
+; GCN-LABEL: @simple_write2_two_val_subreg4_f32
+; CI-DAG: s_mov_b32 m0
+; GFX9-NOT: m0
+
+; GCN-DAG: {{buffer|global}}_load_dwordx4 v{{\[}}[[VAL0:[0-9]+]]:[[VAL1:[0-9]+]]{{\]}}
+; GCN-DAG: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 2, v{{[0-9]+}}
+; GCN: ds_write2_b32 [[VPTR]], v[[VAL0]], v[[VAL1]] offset1:8
+; GCN: s_endpgm
define amdgpu_kernel void @simple_write2_two_val_subreg4_f32(float addrspace(1)* %C, <4 x float> addrspace(1)* %in) #0 {
%x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
%in.gep = getelementptr <4 x float>, <4 x float> addrspace(1)* %in, i32 %x.i
ret void
}
-; SI-LABEL: @simple_write2_two_val_max_offset_f32
-; SI-DAG: buffer_load_dword [[VAL0:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
-; SI-DAG: buffer_load_dword [[VAL1:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
-; SI-DAG: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 2, v{{[0-9]+}}
-; SI: ds_write2_b32 [[VPTR]], [[VAL0]], [[VAL1]] offset1:255
-; SI: s_endpgm
+; GCN-LABEL: @simple_write2_two_val_max_offset_f32
+; CI-DAG: s_mov_b32 m0
+; GFX9-NOT: m0
+
+; CI-DAG: buffer_load_dword [[VAL0:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
+; CI-DAG: buffer_load_dword [[VAL1:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
+
+; GFX9-DAG: global_load_dword [[VAL0:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, off{{$}}
+; GFX9-DAG: global_load_dword [[VAL1:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, off offset:4
+
+; GCN-DAG: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 2, v{{[0-9]+}}
+; GCN: ds_write2_b32 [[VPTR]], [[VAL0]], [[VAL1]] offset1:255
+; GCN: s_endpgm
define amdgpu_kernel void @simple_write2_two_val_max_offset_f32(float addrspace(1)* %C, float addrspace(1)* %in) #0 {
%x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
%in.gep.0 = getelementptr float, float addrspace(1)* %in, i32 %x.i
ret void
}
-; SI-LABEL: @simple_write2_two_val_too_far_f32
-; SI: ds_write_b32 v{{[0-9]+}}, v{{[0-9]+}}
-; SI: ds_write_b32 v{{[0-9]+}}, v{{[0-9]+}} offset:1028
-; SI: s_endpgm
+; GCN-LABEL: @simple_write2_two_val_too_far_f32
+; CI-DAG: s_mov_b32 m0
+; GFX9-NOT: m0
+
+; GCN: ds_write_b32 v{{[0-9]+}}, v{{[0-9]+}}
+; GCN: ds_write_b32 v{{[0-9]+}}, v{{[0-9]+}} offset:1028
+; GCN: s_endpgm
define amdgpu_kernel void @simple_write2_two_val_too_far_f32(float addrspace(1)* %C, float addrspace(1)* %in0, float addrspace(1)* %in1) #0 {
%x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
%in0.gep = getelementptr float, float addrspace(1)* %in0, i32 %x.i
ret void
}
-; SI-LABEL: @simple_write2_two_val_f32_x2
-; SI: ds_write2_b32 [[BASEADDR:v[0-9]+]], [[VAL0:v[0-9]+]], [[VAL1:v[0-9]+]] offset1:8
-; SI: ds_write2_b32 [[BASEADDR:v[0-9]+]], [[VAL0]], [[VAL1]] offset0:11 offset1:27
-; SI: s_endpgm
+; GCN-LABEL: @simple_write2_two_val_f32_x2
+; CI-DAG: s_mov_b32 m0
+; GFX9-NOT: m0
+
+; GCN: ds_write2_b32 [[BASEADDR:v[0-9]+]], [[VAL0:v[0-9]+]], [[VAL1:v[0-9]+]] offset1:8
+; GCN: ds_write2_b32 [[BASEADDR:v[0-9]+]], [[VAL0]], [[VAL1]] offset0:11 offset1:27
+; GCN: s_endpgm
define amdgpu_kernel void @simple_write2_two_val_f32_x2(float addrspace(1)* %C, float addrspace(1)* %in0, float addrspace(1)* %in1) #0 {
%tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1
%in0.gep = getelementptr float, float addrspace(1)* %in0, i32 %tid.x
ret void
}
-; SI-LABEL: @simple_write2_two_val_f32_x2_nonzero_base
-; SI: ds_write2_b32 [[BASEADDR:v[0-9]+]], [[VAL0:v[0-9]+]], [[VAL1:v[0-9]+]] offset0:3 offset1:8
-; SI: ds_write2_b32 [[BASEADDR:v[0-9]+]], [[VAL0]], [[VAL1]] offset0:11 offset1:27
-; SI: s_endpgm
+; GCN-LABEL: @simple_write2_two_val_f32_x2_nonzero_base
+; CI-DAG: s_mov_b32 m0
+; GFX9-NOT: m0
+
+; GCN: ds_write2_b32 [[BASEADDR:v[0-9]+]], [[VAL0:v[0-9]+]], [[VAL1:v[0-9]+]] offset0:3 offset1:8
+; GCN: ds_write2_b32 [[BASEADDR:v[0-9]+]], [[VAL0]], [[VAL1]] offset0:11 offset1:27
+; GCN: s_endpgm
define amdgpu_kernel void @simple_write2_two_val_f32_x2_nonzero_base(float addrspace(1)* %C, float addrspace(1)* %in0, float addrspace(1)* %in1) #0 {
%tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1
%in0.gep = getelementptr float, float addrspace(1)* %in0, i32 %tid.x
ret void
}
-; SI-LABEL: @write2_ptr_subreg_arg_two_val_f32
-; SI-NOT: ds_write2_b32
-; SI: ds_write_b32
-; SI: ds_write_b32
-; SI: s_endpgm
+; GCN-LABEL: @write2_ptr_subreg_arg_two_val_f32
+; CI-DAG: s_mov_b32 m0
+; GFX9-NOT: m0
+
+; GCN-NOT: ds_write2_b32
+; GCN: ds_write_b32
+; GCN: ds_write_b32
+; GCN: s_endpgm
define amdgpu_kernel void @write2_ptr_subreg_arg_two_val_f32(float addrspace(1)* %C, float addrspace(1)* %in0, float addrspace(1)* %in1, <2 x float addrspace(3)*> %lds.ptr) #0 {
%x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
%in0.gep = getelementptr float, float addrspace(1)* %in0, i32 %x.i
ret void
}
-; SI-LABEL: @simple_write2_one_val_f64
-; SI-DAG: buffer_load_dwordx2 [[VAL:v\[[0-9]+:[0-9]+\]]],
-; SI-DAG: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 3, v{{[0-9]+}}
-; SI: ds_write2_b64 [[VPTR]], [[VAL]], [[VAL]] offset1:8
-; SI: s_endpgm
+; GCN-LABEL: @simple_write2_one_val_f64
+; CI-DAG: s_mov_b32 m0
+; GFX9-NOT: m0
+
+; GCN-DAG: {{buffer|global}}_load_dwordx2 [[VAL:v\[[0-9]+:[0-9]+\]]],
+; GCN-DAG: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 3, v{{[0-9]+}}
+; GCN: ds_write2_b64 [[VPTR]], [[VAL]], [[VAL]] offset1:8
+; GCN: s_endpgm
define amdgpu_kernel void @simple_write2_one_val_f64(double addrspace(1)* %C, double addrspace(1)* %in) #0 {
%x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
%in.gep = getelementptr double, double addrspace(1)* %in, i32 %x.i
ret void
}
-; SI-LABEL: @misaligned_simple_write2_one_val_f64
-; SI-DAG: buffer_load_dwordx2 v{{\[}}[[VAL0:[0-9]+]]:[[VAL1:[0-9]+]]{{\]}}
-; SI-DAG: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 3, v{{[0-9]+}}
-; SI: ds_write2_b32 [[VPTR]], v[[VAL0]], v[[VAL1]] offset1:1
-; SI: ds_write2_b32 [[VPTR]], v[[VAL0]], v[[VAL1]] offset0:14 offset1:15
-; SI: s_endpgm
+; GCN-LABEL: @misaligned_simple_write2_one_val_f64
+; CI-DAG: s_mov_b32 m0
+; GFX9-NOT: m0
+
+; GCN-DAG: {{buffer|global}}_load_dwordx2 v{{\[}}[[VAL0:[0-9]+]]:[[VAL1:[0-9]+]]{{\]}}
+; GCN-DAG: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 3, v{{[0-9]+}}
+; GCN: ds_write2_b32 [[VPTR]], v[[VAL0]], v[[VAL1]] offset1:1
+; GCN: ds_write2_b32 [[VPTR]], v[[VAL0]], v[[VAL1]] offset0:14 offset1:15
+; GCN: s_endpgm
define amdgpu_kernel void @misaligned_simple_write2_one_val_f64(double addrspace(1)* %C, double addrspace(1)* %in, double addrspace(3)* %lds) #0 {
%x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
%in.gep = getelementptr double, double addrspace(1)* %in, i32 %x.i
ret void
}
-; SI-LABEL: @simple_write2_two_val_f64
-; SI-DAG: buffer_load_dwordx2 [[VAL0:v\[[0-9]+:[0-9]+\]]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
-; SI-DAG: buffer_load_dwordx2 [[VAL1:v\[[0-9]+:[0-9]+\]]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8
-; SI-DAG: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 3, v{{[0-9]+}}
-; SI: ds_write2_b64 [[VPTR]], [[VAL0]], [[VAL1]] offset1:8
-; SI: s_endpgm
+; GCN-LABEL: @simple_write2_two_val_f64
+; CI-DAG: s_mov_b32 m0
+; GFX9-NOT: m0
+
+; CI-DAG: buffer_load_dwordx2 [[VAL0:v\[[0-9]+:[0-9]+\]]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
+; CI-DAG: buffer_load_dwordx2 [[VAL1:v\[[0-9]+:[0-9]+\]]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8
+
+; GFX9-DAG: global_load_dwordx2 [[VAL0:v\[[0-9]+:[0-9]+\]]], {{v\[[0-9]+:[0-9]+\]}}, off{{$}}
+; GFX9-DAG: global_load_dwordx2 [[VAL1:v\[[0-9]+:[0-9]+\]]], {{v\[[0-9]+:[0-9]+\]}}, off offset:8
+
+; GCN-DAG: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 3, v{{[0-9]+}}
+; GCN: ds_write2_b64 [[VPTR]], [[VAL0]], [[VAL1]] offset1:8
+; GCN: s_endpgm
define amdgpu_kernel void @simple_write2_two_val_f64(double addrspace(1)* %C, double addrspace(1)* %in) #0 {
%x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
%in.gep.0 = getelementptr double, double addrspace(1)* %in, i32 %x.i
@foo = addrspace(3) global [4 x i32] undef, align 4
-; SI-LABEL: @store_constant_adjacent_offsets
-; SI: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0{{$}}
-; SI: ds_write2_b32 [[ZERO]], v{{[0-9]+}}, v{{[0-9]+}} offset1:1
+; GCN-LABEL: @store_constant_adjacent_offsets
+; CI-DAG: s_mov_b32 m0
+; GFX9-NOT: m0
+
+; GCN-DAG: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0{{$}}
+; GCN: ds_write2_b32 [[ZERO]], v{{[0-9]+}}, v{{[0-9]+}} offset1:1
define amdgpu_kernel void @store_constant_adjacent_offsets() {
store i32 123, i32 addrspace(3)* getelementptr inbounds ([4 x i32], [4 x i32] addrspace(3)* @foo, i32 0, i32 0), align 4
store i32 123, i32 addrspace(3)* getelementptr inbounds ([4 x i32], [4 x i32] addrspace(3)* @foo, i32 0, i32 1), align 4
ret void
}
-; SI-LABEL: @store_constant_disjoint_offsets
-; SI-DAG: v_mov_b32_e32 [[VAL:v[0-9]+]], 0x7b{{$}}
-; SI-DAG: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0{{$}}
-; SI: ds_write2_b32 [[ZERO]], [[VAL]], [[VAL]] offset1:2
+; GCN-LABEL: @store_constant_disjoint_offsets
+; CI-DAG: s_mov_b32 m0
+; GFX9-NOT: m0
+
+; GCN-DAG: v_mov_b32_e32 [[VAL:v[0-9]+]], 0x7b{{$}}
+; GCN-DAG: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0{{$}}
+; GCN: ds_write2_b32 [[ZERO]], [[VAL]], [[VAL]] offset1:2
define amdgpu_kernel void @store_constant_disjoint_offsets() {
store i32 123, i32 addrspace(3)* getelementptr inbounds ([4 x i32], [4 x i32] addrspace(3)* @foo, i32 0, i32 0), align 4
store i32 123, i32 addrspace(3)* getelementptr inbounds ([4 x i32], [4 x i32] addrspace(3)* @foo, i32 0, i32 2), align 4
@bar = addrspace(3) global [4 x i64] undef, align 4
-; SI-LABEL: @store_misaligned64_constant_offsets
-; SI: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0{{$}}
-; SI-DAG: ds_write2_b32 [[ZERO]], v{{[0-9]+}}, v{{[0-9]+}} offset1:1
-; SI-DAG: ds_write2_b32 [[ZERO]], v{{[0-9]+}}, v{{[0-9]+}} offset0:2 offset1:3
-; SI: s_endpgm
+; GCN-LABEL: @store_misaligned64_constant_offsets
+; CI-DAG: s_mov_b32 m0
+; GFX9-NOT: m0
+
+; GCN-DAG: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0{{$}}
+; GCN-DAG: ds_write2_b32 [[ZERO]], v{{[0-9]+}}, v{{[0-9]+}} offset1:1
+; GCN-DAG: ds_write2_b32 [[ZERO]], v{{[0-9]+}}, v{{[0-9]+}} offset0:2 offset1:3
+; GCN: s_endpgm
define amdgpu_kernel void @store_misaligned64_constant_offsets() {
store i64 123, i64 addrspace(3)* getelementptr inbounds ([4 x i64], [4 x i64] addrspace(3)* @bar, i32 0, i32 0), align 4
store i64 123, i64 addrspace(3)* getelementptr inbounds ([4 x i64], [4 x i64] addrspace(3)* @bar, i32 0, i32 1), align 4
@bar.large = addrspace(3) global [4096 x i64] undef, align 4
-; SI-LABEL: @store_misaligned64_constant_large_offsets
-; SI-DAG: v_mov_b32_e32 [[BASE0:v[0-9]+]], 0x7ff8{{$}}
-; SI-DAG: v_mov_b32_e32 [[BASE1:v[0-9]+]], 0x4000{{$}}
-; SI-DAG: ds_write2_b32 [[BASE0]], v{{[0-9]+}}, v{{[0-9]+}} offset1:1
-; SI-DAG: ds_write2_b32 [[BASE1]], v{{[0-9]+}}, v{{[0-9]+}} offset1:1
-; SI: s_endpgm
+; GCN-LABEL: @store_misaligned64_constant_large_offsets
+; CI-DAG: s_mov_b32 m0
+; GFX9-NOT: m0
+
+; GCN-DAG: v_mov_b32_e32 [[BASE0:v[0-9]+]], 0x7ff8{{$}}
+; GCN-DAG: v_mov_b32_e32 [[BASE1:v[0-9]+]], 0x4000{{$}}
+; GCN-DAG: ds_write2_b32 [[BASE0]], v{{[0-9]+}}, v{{[0-9]+}} offset1:1
+; GCN-DAG: ds_write2_b32 [[BASE1]], v{{[0-9]+}}, v{{[0-9]+}} offset1:1
+; GCN: s_endpgm
define amdgpu_kernel void @store_misaligned64_constant_large_offsets() {
store i64 123, i64 addrspace(3)* getelementptr inbounds ([4096 x i64], [4096 x i64] addrspace(3)* @bar.large, i32 0, i32 2048), align 4
store i64 123, i64 addrspace(3)* getelementptr inbounds ([4096 x i64], [4096 x i64] addrspace(3)* @bar.large, i32 0, i32 4095), align 4
ret void
}
-; CI-LABEL: {{^}}simple_write2_v4f32_superreg_align4:
-; CI: ds_write2_b32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} offset0:3 offset1:2{{$}}
-; CI: ds_write2_b32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} offset0:1{{$}}
-; CI: s_endpgm
+; GCN-LABEL: {{^}}simple_write2_v4f32_superreg_align4:
+; CI: s_mov_b32 m0
+; GFX9-NOT: m0
+
+; GCN: ds_write2_b32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} offset0:2 offset1:3{{$}}
+; GCN: ds_write2_b32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} offset1:1{{$}}
define amdgpu_kernel void @simple_write2_v4f32_superreg_align4(<4 x float> addrspace(3)* %out, <4 x float> addrspace(1)* %in) #0 {
%x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
%in.gep = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %in
ret void
}
-; Function Attrs: nounwind readnone
declare i32 @llvm.amdgcn.workgroup.id.x() #1
-
-; Function Attrs: nounwind readnone
declare i32 @llvm.amdgcn.workgroup.id.y() #1
-
-; Function Attrs: nounwind readnone
declare i32 @llvm.amdgcn.workitem.id.x() #1
-
-; Function Attrs: nounwind readnone
declare i32 @llvm.amdgcn.workitem.id.y() #1
attributes #0 = { nounwind }
-; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs -mattr=+load-store-opt < %s | FileCheck -check-prefix=SI %s
+; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs -mattr=+load-store-opt < %s | FileCheck -enable-var-scope -check-prefixes=GCN,CI %s
+; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs -mattr=+load-store-opt < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9 %s
@lds = addrspace(3) global [512 x float] undef, align 4
-; SI-LABEL: @simple_write2st64_one_val_f32_0_1
-; SI-DAG: buffer_load_dword [[VAL:v[0-9]+]]
-; SI-DAG: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 2, v{{[0-9]+}}
-; SI: ds_write2st64_b32 [[VPTR]], [[VAL]], [[VAL]] offset1:1
-; SI: s_endpgm
+; GCN-LABEL: @simple_write2st64_one_val_f32_0_1
+; CI-DAG: s_mov_b32 m0
+; GFX9-NOT: m0
+
+; GCN-DAG: {{buffer|global}}_load_dword [[VAL:v[0-9]+]]
+; GCN-DAG: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 2, v{{[0-9]+}}
+; GCN: ds_write2st64_b32 [[VPTR]], [[VAL]], [[VAL]] offset1:1
+; GCN: s_endpgm
define amdgpu_kernel void @simple_write2st64_one_val_f32_0_1(float addrspace(1)* %C, float addrspace(1)* %in) #0 {
%x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
%in.gep = getelementptr float, float addrspace(1)* %in, i32 %x.i
ret void
}
-; SI-LABEL: @simple_write2st64_two_val_f32_2_5
-; SI-DAG: buffer_load_dword [[VAL0:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
-; SI-DAG: buffer_load_dword [[VAL1:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
-; SI-DAG: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 2, v{{[0-9]+}}
-; SI: ds_write2st64_b32 [[VPTR]], [[VAL0]], [[VAL1]] offset0:2 offset1:5
-; SI: s_endpgm
+; GCN-LABEL: @simple_write2st64_two_val_f32_2_5
+; CI-DAG: s_mov_b32 m0
+; GFX9-NOT: m0
+
+; CI-DAG: buffer_load_dword [[VAL0:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
+; CI-DAG: buffer_load_dword [[VAL1:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
+
+; GFX9-DAG: global_load_dword [[VAL0:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, off{{$}}
+; GFX9-DAG: global_load_dword [[VAL1:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, off offset:4
+
+; GCN-DAG: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 2, v{{[0-9]+}}
+; GCN: ds_write2st64_b32 [[VPTR]], [[VAL0]], [[VAL1]] offset0:2 offset1:5
+; GCN: s_endpgm
define amdgpu_kernel void @simple_write2st64_two_val_f32_2_5(float addrspace(1)* %C, float addrspace(1)* %in) #0 {
%x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
%in.gep.0 = getelementptr float, float addrspace(1)* %in, i32 %x.i
ret void
}
-; SI-LABEL: @simple_write2st64_two_val_max_offset_f32
-; SI-DAG: buffer_load_dword [[VAL0:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
-; SI-DAG: buffer_load_dword [[VAL1:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
-; SI-DAG: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 2, v{{[0-9]+}}
-; SI: ds_write2st64_b32 [[VPTR]], [[VAL0]], [[VAL1]] offset1:255
-; SI: s_endpgm
+; GCN-LABEL: @simple_write2st64_two_val_max_offset_f32
+; CI-DAG: s_mov_b32 m0
+; GFX9-NOT: m0
+
+; CI-DAG: buffer_load_dword [[VAL0:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
+; CI-DAG: buffer_load_dword [[VAL1:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
+
+; GFX9-DAG: global_load_dword [[VAL0:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, off{{$}}
+; GFX9-DAG: global_load_dword [[VAL1:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, off offset:4
+
+; GCN-DAG: v_lshlrev_b32_e32 [[SHL:v[0-9]+]], 2, v{{[0-9]+}}
+; GCN: v_add{{(_co)?}}_{{i|u}}32_e32 [[VPTR:v[0-9]+]], vcc, s{{[0-9]+}}, [[SHL]]
+; GCN: ds_write2st64_b32 [[VPTR]], [[VAL0]], [[VAL1]] offset1:255
+; GCN: s_endpgm
define amdgpu_kernel void @simple_write2st64_two_val_max_offset_f32(float addrspace(1)* %C, float addrspace(1)* %in, float addrspace(3)* %lds) #0 {
%x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
%in.gep.0 = getelementptr float, float addrspace(1)* %in, i32 %x.i
ret void
}
-; SI-LABEL: @simple_write2st64_two_val_max_offset_f64
-; SI-DAG: buffer_load_dwordx2 [[VAL0:v\[[0-9]+:[0-9]+\]]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
-; SI-DAG: buffer_load_dwordx2 [[VAL1:v\[[0-9]+:[0-9]+\]]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8
-; SI-DAG: v_add_i32_e32 [[VPTR:v[0-9]+]],
-; SI: ds_write2st64_b64 [[VPTR]], [[VAL0]], [[VAL1]] offset0:4 offset1:127
-; SI: s_endpgm
+; GCN-LABEL: @simple_write2st64_two_val_max_offset_f64
+; CI-DAG: s_mov_b32 m0
+; GFX9-NOT: m0
+
+; CI-DAG: buffer_load_dwordx2 [[VAL0:v\[[0-9]+:[0-9]+\]]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
+; CI-DAG: buffer_load_dwordx2 [[VAL1:v\[[0-9]+:[0-9]+\]]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8
+
+; GFX9-DAG: global_load_dwordx2 [[VAL0:v\[[0-9]+:[0-9]+\]]], {{v\[[0-9]+:[0-9]+\]}}, off{{$}}
+; GFX9-DAG: global_load_dwordx2 [[VAL1:v\[[0-9]+:[0-9]+\]]], {{v\[[0-9]+:[0-9]+\]}}, off offset:8
+
+; GCN-DAG: v_lshlrev_b32_e32 [[SHL:v[0-9]+]], 3, v{{[0-9]+}}
+; GCN: v_add_{{(co_)?}}{{i|u}}32_e32 [[VPTR:v[0-9]+]], vcc, s{{[0-9]+}}, [[SHL]]
+; GCN: ds_write2st64_b64 [[VPTR]], [[VAL0]], [[VAL1]] offset0:4 offset1:127
+; GCN: s_endpgm
define amdgpu_kernel void @simple_write2st64_two_val_max_offset_f64(double addrspace(1)* %C, double addrspace(1)* %in, double addrspace(3)* %lds) #0 {
%x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
%in.gep.0 = getelementptr double, double addrspace(1)* %in, i32 %x.i
ret void
}
-; SI-LABEL: @byte_size_only_divisible_64_write2st64_f64
-; SI-NOT: ds_write2st64_b64
-; SI: ds_write2_b64 {{v[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}} offset1:8
-; SI: s_endpgm
+; GCN-LABEL: @byte_size_only_divisible_64_write2st64_f64
+; CI-DAG: s_mov_b32 m0
+; GFX9-NOT: m0
+
+; GCN-NOT: ds_write2st64_b64
+; GCN: ds_write2_b64 {{v[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}} offset1:8
+; GCN: s_endpgm
define amdgpu_kernel void @byte_size_only_divisible_64_write2st64_f64(double addrspace(1)* %C, double addrspace(1)* %in, double addrspace(3)* %lds) #0 {
%x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
%in.gep = getelementptr double, double addrspace(1)* %in, i32 %x.i
ret void
}
-; Function Attrs: nounwind readnone
declare i32 @llvm.amdgcn.workitem.id.x() #1
-
-; Function Attrs: nounwind readnone
declare i32 @llvm.amdgcn.workitem.id.y() #1
attributes #0 = { nounwind }
; CHECK: ReservedNumVGPRs: 4
; GFX700: ReservedFirstVGPR: 8
; GFX800: ReservedFirstVGPR: 8
-; GFX900: ReservedFirstVGPR: 11
+; GFX900: ReservedFirstVGPR: 10
; CHECK: PrivateSegmentBufferSGPR: 0
; CHECK: WavefrontPrivateSegmentOffsetSGPR: 11
define amdgpu_kernel void @test(i32 addrspace(1)* %A) #0 !dbg !7 !kernel_arg_addr_space !12 !kernel_arg_access_qual !13 !kernel_arg_type !14 !kernel_arg_base_type !14 !kernel_arg_type_qual !15 {
-; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=MOVREL %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=MOVREL %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -amdgpu-vgpr-index-mode -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=IDXMODE %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=IDXMODE %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,MOVREL,PREGFX9 %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,MOVREL,PREGFX9 %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -amdgpu-vgpr-index-mode -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,IDXMODE,PREGFX9 %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,IDXMODE,GFX9 %s
; Tests for indirect addressing on SI, which is implemented using dynamic
; indexing of vectors.
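; For example, a variable-index vector access such as
;   %elt = extractelement <16 x float> %vec, i32 %idx   ; (illustrative types)
; is selected either to v_movrel* moves (MOVREL) or to moves bracketed by
; s_set_gpr_idx_on/s_set_gpr_idx_off (IDXMODE), which is what the MOVREL and
; IDXMODE check prefixes distinguish.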
; IDXMODE: v_mov_b32_e32 v[[VEC0_ELT2]], -4.0
; IDXMODE: s_set_gpr_idx_off
-; GCN: s_mov_b32 m0, -1
+; PREGFX9: s_mov_b32 m0, -1
+; GFX9-NOT: s_mov_b32 m0
; GCN: ds_write_b32
; GCN: ds_write_b32
; GCN: s_endpgm
; Make sure no crash on invalid non-constant
; GCN-LABEL: {{^}}invalid_variable_order_lds_atomic_dec_ret_i32:
+; CIVI-DAG: s_mov_b32 m0
+; GFX9-NOT: m0
define amdgpu_kernel void @invalid_variable_order_lds_atomic_dec_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr, i32 %order.var) #0 {
%result = call i32 @llvm.amdgcn.atomic.dec.i32.p3i32(i32 addrspace(3)* %ptr, i32 42, i32 %order.var, i32 0, i1 false)
store i32 %result, i32 addrspace(1)* %out
; Make sure no crash on invalid non-constant
; GCN-LABEL: {{^}}invalid_variable_scope_lds_atomic_dec_ret_i32:
+; CIVI-DAG: s_mov_b32 m0
+; GFX9-NOT: m0
define amdgpu_kernel void @invalid_variable_scope_lds_atomic_dec_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr, i32 %scope.var) #0 {
%result = call i32 @llvm.amdgcn.atomic.dec.i32.p3i32(i32 addrspace(3)* %ptr, i32 42, i32 0, i32 %scope.var, i1 false)
store i32 %result, i32 addrspace(1)* %out
}
; GCN-LABEL: {{^}}lds_atomic_dec_ret_i32:
-; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 42
+; CIVI-DAG: s_mov_b32 m0
+; GFX9-NOT: m0
+
+; GCN-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 42
; GCN: ds_dec_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, [[K]]
define amdgpu_kernel void @lds_atomic_dec_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) #0 {
%result = call i32 @llvm.amdgcn.atomic.dec.i32.p3i32(i32 addrspace(3)* %ptr, i32 42, i32 0, i32 0, i1 false)
}
; GCN-LABEL: {{^}}lds_atomic_dec_ret_i32_offset:
-; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 42
+; CIVI-DAG: s_mov_b32 m0
+; GFX9-NOT: m0
+
+; GCN-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 42
; GCN: ds_dec_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, [[K]] offset:16
define amdgpu_kernel void @lds_atomic_dec_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) #0 {
%gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4
}
; GCN-LABEL: {{^}}lds_atomic_dec_noret_i32:
-; GCN: s_load_dword [[SPTR:s[0-9]+]],
-; GCN: v_mov_b32_e32 [[DATA:v[0-9]+]], 4
-; GCN: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[SPTR]]
+; CIVI-DAG: s_mov_b32 m0
+; GFX9-NOT: m0
+
+; GCN-DAG: s_load_dword [[SPTR:s[0-9]+]],
+; GCN-DAG: v_mov_b32_e32 [[DATA:v[0-9]+]], 4
+; GCN-DAG: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[SPTR]]
; GCN: ds_dec_u32 [[VPTR]], [[DATA]]
define amdgpu_kernel void @lds_atomic_dec_noret_i32(i32 addrspace(3)* %ptr) nounwind {
%result = call i32 @llvm.amdgcn.atomic.dec.i32.p3i32(i32 addrspace(3)* %ptr, i32 42, i32 0, i32 0, i1 false)
}
; GCN-LABEL: {{^}}lds_atomic_dec_noret_i32_offset:
-; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 42
+; CIVI-DAG: s_mov_b32 m0
+; GFX9-NOT: m0
+
+; GCN-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 42
; GCN: ds_dec_u32 v{{[0-9]+}}, [[K]] offset:16
define amdgpu_kernel void @lds_atomic_dec_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind {
%gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4
@lds0 = addrspace(3) global [512 x i32] undef
; GCN-LABEL: {{^}}atomic_dec_shl_base_lds_0:
-; GCN: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 2, {{v[0-9]+}}
+; CIVI-DAG: s_mov_b32 m0
+; GFX9-NOT: m0
+
+; GCN-DAG: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 2, {{v[0-9]+}}
; GCN: ds_dec_rtn_u32 {{v[0-9]+}}, [[PTR]], {{v[0-9]+}} offset:8
define amdgpu_kernel void @atomic_dec_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 {
%tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1
}
; GCN-LABEL: {{^}}lds_atomic_dec_ret_i64:
+; CIVI-DAG: s_mov_b32 m0
+; GFX9-NOT: m0
+
; GCN-DAG: v_mov_b32_e32 v[[KLO:[0-9]+]], 42
; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}}
; GCN: ds_dec_rtn_u64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, v{{\[}}[[KLO]]:[[KHI]]{{\]}}{{$}}
}
; GCN-LABEL: {{^}}lds_atomic_dec_ret_i64_offset:
+; CIVI-DAG: s_mov_b32 m0
+; GFX9-NOT: m0
+
; GCN-DAG: v_mov_b32_e32 v[[KLO:[0-9]+]], 42
; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}}
; GCN: ds_dec_rtn_u64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, v{{\[}}[[KLO]]:[[KHI]]{{\]}} offset:32
}
; GCN-LABEL: {{^}}lds_atomic_dec_noret_i64:
+; CIVI-DAG: s_mov_b32 m0
+; GFX9-NOT: m0
+
; GCN-DAG: v_mov_b32_e32 v[[KLO:[0-9]+]], 42
; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}}
; GCN: ds_dec_u64 v{{[0-9]+}}, v{{\[}}[[KLO]]:[[KHI]]{{\]}}{{$}}
}
; GCN-LABEL: {{^}}lds_atomic_dec_noret_i64_offset:
+; CIVI-DAG: s_mov_b32 m0
+; GFX9-NOT: m0
+
; GCN-DAG: v_mov_b32_e32 v[[KLO:[0-9]+]], 42
; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}}
; GCN: ds_dec_u64 v{{[0-9]+}}, v{{\[}}[[KLO]]:[[KHI]]{{\]}} offset:32{{$}}
@lds1 = addrspace(3) global [512 x i64] undef, align 8
; GCN-LABEL: {{^}}atomic_dec_shl_base_lds_0_i64:
-; GCN: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 3, {{v[0-9]+}}
+; CIVI-DAG: s_mov_b32 m0
+; GFX9-NOT: m0
+
+; GCN-DAG: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 3, {{v[0-9]+}}
; GCN: ds_dec_rtn_u64 v{{\[[0-9]+:[0-9]+\]}}, [[PTR]], v{{\[[0-9]+:[0-9]+\]}} offset:16
define amdgpu_kernel void @atomic_dec_shl_base_lds_0_i64(i64 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 {
%tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1
declare i32 @llvm.amdgcn.workitem.id.x() #1
; GCN-LABEL: {{^}}lds_atomic_inc_ret_i32:
-; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 42
+; CIVI-DAG: s_mov_b32 m0
+; GFX9-NOT: m0
+
+; GCN-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 42
; GCN: ds_inc_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, [[K]]
define amdgpu_kernel void @lds_atomic_inc_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) #0 {
%result = call i32 @llvm.amdgcn.atomic.inc.i32.p3i32(i32 addrspace(3)* %ptr, i32 42, i32 0, i32 0, i1 false)
}
; GCN-LABEL: {{^}}lds_atomic_inc_ret_i32_offset:
-; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 42
+; CIVI-DAG: s_mov_b32 m0
+; GFX9-NOT: m0
+
+; GCN-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 42
; GCN: ds_inc_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, [[K]] offset:16
define amdgpu_kernel void @lds_atomic_inc_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) #0 {
%gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4
}
; GCN-LABEL: {{^}}lds_atomic_inc_noret_i32:
-; GCN: s_load_dword [[SPTR:s[0-9]+]],
-; GCN: v_mov_b32_e32 [[DATA:v[0-9]+]], 4
-; GCN: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[SPTR]]
+; CIVI-DAG: s_mov_b32 m0
+; GFX9-NOT: m0
+
+; GCN-DAG: s_load_dword [[SPTR:s[0-9]+]],
+; GCN-DAG: v_mov_b32_e32 [[DATA:v[0-9]+]], 4
+; GCN-DAG: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[SPTR]]
; GCN: ds_inc_u32 [[VPTR]], [[DATA]]
define amdgpu_kernel void @lds_atomic_inc_noret_i32(i32 addrspace(3)* %ptr) nounwind {
%result = call i32 @llvm.amdgcn.atomic.inc.i32.p3i32(i32 addrspace(3)* %ptr, i32 42, i32 0, i32 0, i1 false)
}
; GCN-LABEL: {{^}}lds_atomic_inc_noret_i32_offset:
-; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 42
+; CIVI-DAG: s_mov_b32 m0
+; GFX9-NOT: m0
+
+; GCN-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 42
; GCN: ds_inc_u32 v{{[0-9]+}}, [[K]] offset:16
define amdgpu_kernel void @lds_atomic_inc_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind {
%gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4
; FIXME: Remove m0 initialization
; GCN-LABEL: {{^}}load_local_hi_v2i16_zerolo_shift:
; GCN: s_waitcnt
-; GFX9-NEXT: s_mov_b32 m0, -1
; GFX9-NEXT: ds_read_u16 v0, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; FIXME: Is there a cost to using the extload over not?
; GCN-LABEL: {{^}}load_local_v2i16_split:
; GCN: s_waitcnt
-; GFX9-NEXT: s_mov_b32 m0, -1
; GFX9-NEXT: ds_read_u16 v1, v0
; GFX9-NEXT: s_waitcnt
; GFX9-NEXT: ds_read_u16_d16_hi v1, v0 offset:2
-; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC %s
-; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC %s
-; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SICIVI,FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SICIVI,FUNC %s
+; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefixes=EG,FUNC %s
; FUNC-LABEL: {{^}}load_f32_local:
-; GCN: s_mov_b32 m0
+; SICIVI: s_mov_b32 m0
+; GFX9-NOT: m0
; GCN: ds_read_b32
; EG: LDS_READ_RET
}
; FUNC-LABEL: {{^}}load_v2f32_local:
-; GCN: s_mov_b32 m0
+; SICIVI: s_mov_b32 m0
+; GFX9-NOT: m0
+
; GCN: ds_read_b64
; EG: LDS_READ_RET
; FIXME: should this do a read2_b64?
; FUNC-LABEL: {{^}}local_load_v3f32:
+; SICIVI: s_mov_b32 m0
+; GFX9-NOT: m0
+
; GCN-DAG: ds_read_b32 v{{[0-9]+}}, v{{[0-9]+}} offset:8
; GCN-DAG: ds_read_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+$}}
; GCN: s_waitcnt
}
; FUNC-LABEL: {{^}}local_load_v4f32:
+; SICIVI: s_mov_b32 m0
+; GFX9-NOT: m0
+
; GCN: ds_read2_b64
; EG: LDS_READ_RET
}
; FUNC-LABEL: {{^}}local_load_v8f32:
+; SICIVI: s_mov_b32 m0
+; GFX9-NOT: m0
+
; GCN: ds_read2_b64
; GCN: ds_read2_b64
}
; FUNC-LABEL: {{^}}local_load_v16f32:
+; SICIVI: s_mov_b32 m0
+; GFX9-NOT: m0
+
; GCN: ds_read2_b64
; GCN: ds_read2_b64
; GCN: ds_read2_b64
-; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC %s
-; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC %s
-; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC %s
-; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SICIVI,FUNC %s
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SICIVI,FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SICIVI,FUNC %s
+; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX9,FUNC %s
+; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefixes=EG,FUNC %s
; FUNC-LABEL: {{^}}local_load_f64:
+; SICIVI: s_mov_b32 m0
+; GFX9-NOT: m0
+
; GCN: ds_read_b64 [[VAL:v\[[0-9]+:[0-9]+\]]], v{{[0-9]+}}{{$}}
; GCN: ds_write_b64 v{{[0-9]+}}, [[VAL]]
}
; FUNC-LABEL: {{^}}local_load_v2f64:
+; SICIVI: s_mov_b32 m0
+; GFX9-NOT: m0
+
; GCN: ds_read2_b64
; EG: LDS_READ_RET
}
; FUNC-LABEL: {{^}}local_load_v3f64:
+; SICIVI: s_mov_b32 m0
+; GFX9-NOT: m0
+
; GCN-DAG: ds_read2_b64
; GCN-DAG: ds_read_b64
}
; FUNC-LABEL: {{^}}local_load_v4f64:
+; SICIVI: s_mov_b32 m0
+; GFX9-NOT: m0
+
; GCN: ds_read2_b64
; GCN: ds_read2_b64
}
; FUNC-LABEL: {{^}}local_load_v8f64:
+; SICIVI: s_mov_b32 m0
+; GFX9-NOT: m0
+
; GCN: ds_read2_b64
; GCN: ds_read2_b64
; GCN: ds_read2_b64
}
; FUNC-LABEL: {{^}}local_load_v16f64:
+; SICIVI: s_mov_b32 m0
+; GFX9-NOT: m0
+
; GCN: ds_read2_b64
; GCN: ds_read2_b64
; GCN: ds_read2_b64
-; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC %s
-; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC %s
-; RUN: llc -march=r600 -mtriple=r600---amdgiz -mcpu=cypress < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SICIVI,FUNC %s
+; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SICIVI,FUNC %s
+; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX9,FUNC %s
+; RUN: llc -march=r600 -mtriple=r600---amdgiz -mcpu=cypress < %s | FileCheck -check-prefixes=EG,FUNC %s
; FUNC-LABEL: {{^}}local_load_i1:
+; SICIVI: s_mov_b32 m0
+; GFX9-NOT: m0
+
; GCN: ds_read_u8
; GCN: v_and_b32_e32 v{{[0-9]+}}, 1
; GCN: ds_write_b8
}
; FUNC-LABEL: {{^}}local_load_v2i1:
+; SICIVI: s_mov_b32 m0
+; GFX9-NOT: m0
define amdgpu_kernel void @local_load_v2i1(<2 x i1> addrspace(3)* %out, <2 x i1> addrspace(3)* %in) #0 {
%load = load <2 x i1>, <2 x i1> addrspace(3)* %in
store <2 x i1> %load, <2 x i1> addrspace(3)* %out
}
; FUNC-LABEL: {{^}}local_load_v3i1:
+; SICIVI: s_mov_b32 m0
+; GFX9-NOT: m0
define amdgpu_kernel void @local_load_v3i1(<3 x i1> addrspace(3)* %out, <3 x i1> addrspace(3)* %in) #0 {
%load = load <3 x i1>, <3 x i1> addrspace(3)* %in
store <3 x i1> %load, <3 x i1> addrspace(3)* %out
}
; FUNC-LABEL: {{^}}local_load_v4i1:
+; SICIVI: s_mov_b32 m0
+; GFX9-NOT: m0
define amdgpu_kernel void @local_load_v4i1(<4 x i1> addrspace(3)* %out, <4 x i1> addrspace(3)* %in) #0 {
%load = load <4 x i1>, <4 x i1> addrspace(3)* %in
store <4 x i1> %load, <4 x i1> addrspace(3)* %out
}
; FUNC-LABEL: {{^}}local_load_v8i1:
+; SICIVI: s_mov_b32 m0
+; GFX9-NOT: m0
define amdgpu_kernel void @local_load_v8i1(<8 x i1> addrspace(3)* %out, <8 x i1> addrspace(3)* %in) #0 {
%load = load <8 x i1>, <8 x i1> addrspace(3)* %in
store <8 x i1> %load, <8 x i1> addrspace(3)* %out
}
; FUNC-LABEL: {{^}}local_load_v16i1:
+; SICIVI: s_mov_b32 m0
+; GFX9-NOT: m0
define amdgpu_kernel void @local_load_v16i1(<16 x i1> addrspace(3)* %out, <16 x i1> addrspace(3)* %in) #0 {
%load = load <16 x i1>, <16 x i1> addrspace(3)* %in
store <16 x i1> %load, <16 x i1> addrspace(3)* %out
}
; FUNC-LABEL: {{^}}local_load_v32i1:
+; SICIVI: s_mov_b32 m0
+; GFX9-NOT: m0
define amdgpu_kernel void @local_load_v32i1(<32 x i1> addrspace(3)* %out, <32 x i1> addrspace(3)* %in) #0 {
%load = load <32 x i1>, <32 x i1> addrspace(3)* %in
store <32 x i1> %load, <32 x i1> addrspace(3)* %out
}
; FUNC-LABEL: {{^}}local_load_v64i1:
+; SICIVI: s_mov_b32 m0
+; GFX9-NOT: m0
define amdgpu_kernel void @local_load_v64i1(<64 x i1> addrspace(3)* %out, <64 x i1> addrspace(3)* %in) #0 {
%load = load <64 x i1>, <64 x i1> addrspace(3)* %in
store <64 x i1> %load, <64 x i1> addrspace(3)* %out
}
; FUNC-LABEL: {{^}}local_zextload_i1_to_i32:
+; SICIVI: s_mov_b32 m0
+; GFX9-NOT: m0
+
; GCN: ds_read_u8
; GCN: ds_write_b32
define amdgpu_kernel void @local_zextload_i1_to_i32(i32 addrspace(3)* %out, i1 addrspace(3)* %in) #0 {
}
; FUNC-LABEL: {{^}}local_sextload_i1_to_i32:
+; SICIVI: s_mov_b32 m0
+; GFX9-NOT: m0
+
; GCN: ds_read_u8
; GCN: v_bfe_i32 {{v[0-9]+}}, {{v[0-9]+}}, 0, 1{{$}}
; GCN: ds_write_b32
}
; FUNC-LABEL: {{^}}local_zextload_v1i1_to_v1i32:
+; SICIVI: s_mov_b32 m0
+; GFX9-NOT: m0
define amdgpu_kernel void @local_zextload_v1i1_to_v1i32(<1 x i32> addrspace(3)* %out, <1 x i1> addrspace(3)* %in) #0 {
%load = load <1 x i1>, <1 x i1> addrspace(3)* %in
%ext = zext <1 x i1> %load to <1 x i32>
}
; FUNC-LABEL: {{^}}local_sextload_v1i1_to_v1i32:
+; SICIVI: s_mov_b32 m0
+; GFX9-NOT: m0
define amdgpu_kernel void @local_sextload_v1i1_to_v1i32(<1 x i32> addrspace(3)* %out, <1 x i1> addrspace(3)* %in) #0 {
%load = load <1 x i1>, <1 x i1> addrspace(3)* %in
%ext = sext <1 x i1> %load to <1 x i32>
}
; FUNC-LABEL: {{^}}local_zextload_v2i1_to_v2i32:
+; SICIVI: s_mov_b32 m0
+; GFX9-NOT: m0
define amdgpu_kernel void @local_zextload_v2i1_to_v2i32(<2 x i32> addrspace(3)* %out, <2 x i1> addrspace(3)* %in) #0 {
%load = load <2 x i1>, <2 x i1> addrspace(3)* %in
%ext = zext <2 x i1> %load to <2 x i32>
}
; FUNC-LABEL: {{^}}local_sextload_v2i1_to_v2i32:
+; SICIVI: s_mov_b32 m0
+; GFX9-NOT: m0
define amdgpu_kernel void @local_sextload_v2i1_to_v2i32(<2 x i32> addrspace(3)* %out, <2 x i1> addrspace(3)* %in) #0 {
%load = load <2 x i1>, <2 x i1> addrspace(3)* %in
%ext = sext <2 x i1> %load to <2 x i32>
}
; FUNC-LABEL: {{^}}local_zextload_v3i1_to_v3i32:
+; SICIVI: s_mov_b32 m0
+; GFX9-NOT: m0
define amdgpu_kernel void @local_zextload_v3i1_to_v3i32(<3 x i32> addrspace(3)* %out, <3 x i1> addrspace(3)* %in) #0 {
%load = load <3 x i1>, <3 x i1> addrspace(3)* %in
%ext = zext <3 x i1> %load to <3 x i32>
}
; FUNC-LABEL: {{^}}local_sextload_v3i1_to_v3i32:
+; SICIVI: s_mov_b32 m0
+; GFX9-NOT: m0
define amdgpu_kernel void @local_sextload_v3i1_to_v3i32(<3 x i32> addrspace(3)* %out, <3 x i1> addrspace(3)* %in) #0 {
%load = load <3 x i1>, <3 x i1> addrspace(3)* %in
%ext = sext <3 x i1> %load to <3 x i32>
}
; FUNC-LABEL: {{^}}local_zextload_v4i1_to_v4i32:
+; SICIVI: s_mov_b32 m0
+; GFX9-NOT: m0
define amdgpu_kernel void @local_zextload_v4i1_to_v4i32(<4 x i32> addrspace(3)* %out, <4 x i1> addrspace(3)* %in) #0 {
%load = load <4 x i1>, <4 x i1> addrspace(3)* %in
%ext = zext <4 x i1> %load to <4 x i32>
}
; FUNC-LABEL: {{^}}local_sextload_v4i1_to_v4i32:
+; SICIVI: s_mov_b32 m0
+; GFX9-NOT: m0
define amdgpu_kernel void @local_sextload_v4i1_to_v4i32(<4 x i32> addrspace(3)* %out, <4 x i1> addrspace(3)* %in) #0 {
%load = load <4 x i1>, <4 x i1> addrspace(3)* %in
%ext = sext <4 x i1> %load to <4 x i32>
}
; FUNC-LABEL: {{^}}local_zextload_v8i1_to_v8i32:
+; SICIVI: s_mov_b32 m0
+; GFX9-NOT: m0
define amdgpu_kernel void @local_zextload_v8i1_to_v8i32(<8 x i32> addrspace(3)* %out, <8 x i1> addrspace(3)* %in) #0 {
%load = load <8 x i1>, <8 x i1> addrspace(3)* %in
%ext = zext <8 x i1> %load to <8 x i32>
}
; FUNC-LABEL: {{^}}local_sextload_v8i1_to_v8i32:
+; SICIVI: s_mov_b32 m0
+; GFX9-NOT: m0
define amdgpu_kernel void @local_sextload_v8i1_to_v8i32(<8 x i32> addrspace(3)* %out, <8 x i1> addrspace(3)* %in) #0 {
%load = load <8 x i1>, <8 x i1> addrspace(3)* %in
%ext = sext <8 x i1> %load to <8 x i32>
}
; FUNC-LABEL: {{^}}local_zextload_v16i1_to_v16i32:
+; SICIVI: s_mov_b32 m0
+; GFX9-NOT: m0
define amdgpu_kernel void @local_zextload_v16i1_to_v16i32(<16 x i32> addrspace(3)* %out, <16 x i1> addrspace(3)* %in) #0 {
%load = load <16 x i1>, <16 x i1> addrspace(3)* %in
%ext = zext <16 x i1> %load to <16 x i32>
}
; FUNC-LABEL: {{^}}local_sextload_v16i1_to_v16i32:
+; SICIVI: s_mov_b32 m0
+; GFX9-NOT: m0
define amdgpu_kernel void @local_sextload_v16i1_to_v16i32(<16 x i32> addrspace(3)* %out, <16 x i1> addrspace(3)* %in) #0 {
%load = load <16 x i1>, <16 x i1> addrspace(3)* %in
%ext = sext <16 x i1> %load to <16 x i32>
}
; FUNC-LABEL: {{^}}local_zextload_v32i1_to_v32i32:
+; SICIVI: s_mov_b32 m0
+; GFX9-NOT: m0
define amdgpu_kernel void @local_zextload_v32i1_to_v32i32(<32 x i32> addrspace(3)* %out, <32 x i1> addrspace(3)* %in) #0 {
%load = load <32 x i1>, <32 x i1> addrspace(3)* %in
%ext = zext <32 x i1> %load to <32 x i32>
}
; FUNC-LABEL: {{^}}local_sextload_v32i1_to_v32i32:
+; SICIVI: s_mov_b32 m0
+; GFX9-NOT: m0
define amdgpu_kernel void @local_sextload_v32i1_to_v32i32(<32 x i32> addrspace(3)* %out, <32 x i1> addrspace(3)* %in) #0 {
%load = load <32 x i1>, <32 x i1> addrspace(3)* %in
%ext = sext <32 x i1> %load to <32 x i32>
}
; FUNC-LABEL: {{^}}local_zextload_v64i1_to_v64i32:
+; SICIVI: s_mov_b32 m0
+; GFX9-NOT: m0
define amdgpu_kernel void @local_zextload_v64i1_to_v64i32(<64 x i32> addrspace(3)* %out, <64 x i1> addrspace(3)* %in) #0 {
%load = load <64 x i1>, <64 x i1> addrspace(3)* %in
%ext = zext <64 x i1> %load to <64 x i32>
}
; FUNC-LABEL: {{^}}local_sextload_v64i1_to_v64i32:
+; SICIVI: s_mov_b32 m0
+; GFX9-NOT: m0
define amdgpu_kernel void @local_sextload_v64i1_to_v64i32(<64 x i32> addrspace(3)* %out, <64 x i1> addrspace(3)* %in) #0 {
%load = load <64 x i1>, <64 x i1> addrspace(3)* %in
%ext = sext <64 x i1> %load to <64 x i32>
}
; FUNC-LABEL: {{^}}local_zextload_i1_to_i64:
+; SICIVI: s_mov_b32 m0
+; GFX9-NOT: m0
+
; GCN-DAG: ds_read_u8 [[LOAD:v[0-9]+]],
; GCN-DAG: v_mov_b32_e32 {{v[0-9]+}}, 0{{$}}
; GCN: ds_write_b64
}
; FUNC-LABEL: {{^}}local_sextload_i1_to_i64:
+; SICIVI: s_mov_b32 m0
+; GFX9-NOT: m0
+
; GCN: ds_read_u8 [[LOAD:v[0-9]+]],
; GCN: v_bfe_i32 [[BFE:v[0-9]+]], {{v[0-9]+}}, 0, 1{{$}}
; GCN: v_ashrrev_i32_e32 v{{[0-9]+}}, 31, [[BFE]]
}
; FUNC-LABEL: {{^}}local_zextload_v1i1_to_v1i64:
+; SICIVI: s_mov_b32 m0
+; GFX9-NOT: m0
define amdgpu_kernel void @local_zextload_v1i1_to_v1i64(<1 x i64> addrspace(3)* %out, <1 x i1> addrspace(3)* %in) #0 {
%load = load <1 x i1>, <1 x i1> addrspace(3)* %in
%ext = zext <1 x i1> %load to <1 x i64>
}
; FUNC-LABEL: {{^}}local_sextload_v1i1_to_v1i64:
+; SICIVI: s_mov_b32 m0
+; GFX9-NOT: m0
define amdgpu_kernel void @local_sextload_v1i1_to_v1i64(<1 x i64> addrspace(3)* %out, <1 x i1> addrspace(3)* %in) #0 {
%load = load <1 x i1>, <1 x i1> addrspace(3)* %in
%ext = sext <1 x i1> %load to <1 x i64>
}
; FUNC-LABEL: {{^}}local_zextload_v2i1_to_v2i64:
+; SICIVI: s_mov_b32 m0
+; GFX9-NOT: m0
define amdgpu_kernel void @local_zextload_v2i1_to_v2i64(<2 x i64> addrspace(3)* %out, <2 x i1> addrspace(3)* %in) #0 {
%load = load <2 x i1>, <2 x i1> addrspace(3)* %in
%ext = zext <2 x i1> %load to <2 x i64>
}
; FUNC-LABEL: {{^}}local_sextload_v2i1_to_v2i64:
+; SICIVI: s_mov_b32 m0
+; GFX9-NOT: m0
define amdgpu_kernel void @local_sextload_v2i1_to_v2i64(<2 x i64> addrspace(3)* %out, <2 x i1> addrspace(3)* %in) #0 {
%load = load <2 x i1>, <2 x i1> addrspace(3)* %in
%ext = sext <2 x i1> %load to <2 x i64>
}
; FUNC-LABEL: {{^}}local_zextload_v3i1_to_v3i64:
+; SICIVI: s_mov_b32 m0
+; GFX9-NOT: m0
define amdgpu_kernel void @local_zextload_v3i1_to_v3i64(<3 x i64> addrspace(3)* %out, <3 x i1> addrspace(3)* %in) #0 {
%load = load <3 x i1>, <3 x i1> addrspace(3)* %in
%ext = zext <3 x i1> %load to <3 x i64>
}
; FUNC-LABEL: {{^}}local_sextload_v3i1_to_v3i64:
+; SICIVI: s_mov_b32 m0
+; GFX9-NOT: m0
define amdgpu_kernel void @local_sextload_v3i1_to_v3i64(<3 x i64> addrspace(3)* %out, <3 x i1> addrspace(3)* %in) #0 {
%load = load <3 x i1>, <3 x i1> addrspace(3)* %in
%ext = sext <3 x i1> %load to <3 x i64>
}
; FUNC-LABEL: {{^}}local_zextload_v4i1_to_v4i64:
+; SICIVI: s_mov_b32 m0
+; GFX9-NOT: m0
define amdgpu_kernel void @local_zextload_v4i1_to_v4i64(<4 x i64> addrspace(3)* %out, <4 x i1> addrspace(3)* %in) #0 {
%load = load <4 x i1>, <4 x i1> addrspace(3)* %in
%ext = zext <4 x i1> %load to <4 x i64>
}
; FUNC-LABEL: {{^}}local_sextload_v4i1_to_v4i64:
+; SICIVI: s_mov_b32 m0
+; GFX9-NOT: m0
define amdgpu_kernel void @local_sextload_v4i1_to_v4i64(<4 x i64> addrspace(3)* %out, <4 x i1> addrspace(3)* %in) #0 {
%load = load <4 x i1>, <4 x i1> addrspace(3)* %in
%ext = sext <4 x i1> %load to <4 x i64>
}
; FUNC-LABEL: {{^}}local_zextload_v8i1_to_v8i64:
+; SICIVI: s_mov_b32 m0
+; GFX9-NOT: m0
define amdgpu_kernel void @local_zextload_v8i1_to_v8i64(<8 x i64> addrspace(3)* %out, <8 x i1> addrspace(3)* %in) #0 {
%load = load <8 x i1>, <8 x i1> addrspace(3)* %in
%ext = zext <8 x i1> %load to <8 x i64>
}
; FUNC-LABEL: {{^}}local_sextload_v8i1_to_v8i64:
+; SICIVI: s_mov_b32 m0
+; GFX9-NOT: m0
define amdgpu_kernel void @local_sextload_v8i1_to_v8i64(<8 x i64> addrspace(3)* %out, <8 x i1> addrspace(3)* %in) #0 {
%load = load <8 x i1>, <8 x i1> addrspace(3)* %in
%ext = sext <8 x i1> %load to <8 x i64>
}
; FUNC-LABEL: {{^}}local_zextload_v16i1_to_v16i64:
+; SICIVI: s_mov_b32 m0
+; GFX9-NOT: m0
define amdgpu_kernel void @local_zextload_v16i1_to_v16i64(<16 x i64> addrspace(3)* %out, <16 x i1> addrspace(3)* %in) #0 {
%load = load <16 x i1>, <16 x i1> addrspace(3)* %in
%ext = zext <16 x i1> %load to <16 x i64>
}
; FUNC-LABEL: {{^}}local_sextload_v16i1_to_v16i64:
+; SICIVI: s_mov_b32 m0
+; GFX9-NOT: m0
define amdgpu_kernel void @local_sextload_v16i1_to_v16i64(<16 x i64> addrspace(3)* %out, <16 x i1> addrspace(3)* %in) #0 {
%load = load <16 x i1>, <16 x i1> addrspace(3)* %in
%ext = sext <16 x i1> %load to <16 x i64>
}
; FUNC-LABEL: {{^}}local_zextload_v32i1_to_v32i64:
+; SICIVI: s_mov_b32 m0
+; GFX9-NOT: m0
define amdgpu_kernel void @local_zextload_v32i1_to_v32i64(<32 x i64> addrspace(3)* %out, <32 x i1> addrspace(3)* %in) #0 {
%load = load <32 x i1>, <32 x i1> addrspace(3)* %in
%ext = zext <32 x i1> %load to <32 x i64>
}
; FUNC-LABEL: {{^}}local_sextload_v32i1_to_v32i64:
+; SICIVI: s_mov_b32 m0
+; GFX9-NOT: m0
define amdgpu_kernel void @local_sextload_v32i1_to_v32i64(<32 x i64> addrspace(3)* %out, <32 x i1> addrspace(3)* %in) #0 {
%load = load <32 x i1>, <32 x i1> addrspace(3)* %in
%ext = sext <32 x i1> %load to <32 x i64>
}
; FUNC-LABEL: {{^}}local_zextload_v64i1_to_v64i64:
+; SICIVI: s_mov_b32 m0
+; GFX9-NOT: m0
define amdgpu_kernel void @local_zextload_v64i1_to_v64i64(<64 x i64> addrspace(3)* %out, <64 x i1> addrspace(3)* %in) #0 {
%load = load <64 x i1>, <64 x i1> addrspace(3)* %in
%ext = zext <64 x i1> %load to <64 x i64>
}
; FUNC-LABEL: {{^}}local_sextload_v64i1_to_v64i64:
+; SICIVI: s_mov_b32 m0
+; GFX9-NOT: m0
define amdgpu_kernel void @local_sextload_v64i1_to_v64i64(<64 x i64> addrspace(3)* %out, <64 x i1> addrspace(3)* %in) #0 {
%load = load <64 x i1>, <64 x i1> addrspace(3)* %in
%ext = sext <64 x i1> %load to <64 x i64>
-; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SI,FUNC %s
-; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,VI,FUNC %s
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SI,SICIVI,FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SICIVI,GFX89,FUNC %s
+; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX9,GFX89,FUNC %s
; RUN: llc -march=r600 -mcpu=redwood -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
; FUNC-LABEL: {{^}}local_load_i16:
+; GFX9-NOT: m0
+; SICIVI: s_mov_b32 m0
+
; GCN: ds_read_u16 v{{[0-9]+}}
; EG: MOV {{[* ]*}}[[FROM:T[0-9]+\.[XYZW]]], KC0[2].Z
}
; FUNC-LABEL: {{^}}local_load_v2i16:
+; GFX9-NOT: m0
+; SICIVI: s_mov_b32 m0
+
; GCN: ds_read_b32
; EG: MOV {{[* ]*}}[[FROM:T[0-9]+\.[XYZW]]], KC0[2].Z
}
; FUNC-LABEL: {{^}}local_load_v3i16:
+; GFX9-NOT: m0
+; SICIVI: s_mov_b32 m0
+
; GCN: ds_read_b64
; GCN-DAG: ds_write_b32
; GCN-DAG: ds_write_b16
}
; FUNC-LABEL: {{^}}local_load_v4i16:
+; GFX9-NOT: m0
+; SICIVI: s_mov_b32 m0
+
; GCN: ds_read_b64
; EG: LDS_READ_RET
}
; FUNC-LABEL: {{^}}local_load_v8i16:
+; GFX9-NOT: m0
+; SICIVI: s_mov_b32 m0
+
; GCN: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset1:1{{$}}
; EG: LDS_READ_RET
}
; FUNC-LABEL: {{^}}local_load_v16i16:
+; GFX9-NOT: m0
+; SICIVI: s_mov_b32 m0
+
; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset1:3{{$}}
; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:1 offset1:2{{$}}
}
; FUNC-LABEL: {{^}}local_zextload_i16_to_i32:
+; GFX9-NOT: m0
+; SICIVI: s_mov_b32 m0
+
; GCN: ds_read_u16
; GCN: ds_write_b32
; FUNC-LABEL: {{^}}local_sextload_i16_to_i32:
; GCN-NOT: s_wqm_b64
-; GCN: s_mov_b32 m0
+
+; GFX9-NOT: m0
+; SICIVI: s_mov_b32 m0
+
; GCN: ds_read_i16
; EG: MOV {{[* ]*}}[[FROM:T[0-9]+\.[XYZW]]], KC0[2].Z
}
; FUNC-LABEL: {{^}}local_zextload_v1i16_to_v1i32:
+; GFX9-NOT: m0
+; SICIVI: s_mov_b32 m0
+
; GCN: ds_read_u16
; EG: MOV {{[* ]*}}[[FROM:T[0-9]+\.[XYZW]]], KC0[2].Z
}
; FUNC-LABEL: {{^}}local_sextload_v1i16_to_v1i32:
+; GFX9-NOT: m0
+; SICIVI: s_mov_b32 m0
+
; GCN: ds_read_i16
; EG: MOV {{[* ]*}}[[FROM:T[0-9]+\.[XYZW]]], KC0[2].Z
; FUNC-LABEL: {{^}}local_zextload_v2i16_to_v2i32:
; GCN-NOT: s_wqm_b64
-; GCN: s_mov_b32 m0
+; GFX9-NOT: m0
+; SICIVI: s_mov_b32 m0
+
; GCN: ds_read_b32
; EG: LDS_READ_RET
; FUNC-LABEL: {{^}}local_sextload_v2i16_to_v2i32:
; GCN-NOT: s_wqm_b64
-; GCN: s_mov_b32 m0
+; GFX9-NOT: m0
+; SICIVI: s_mov_b32 m0
+
; GCN: ds_read_b32
; EG: LDS_READ_RET
}
; FUNC-LABEL: {{^}}local_local_zextload_v3i16_to_v3i32:
+; GFX9-NOT: m0
+; SICIVI: s_mov_b32 m0
+
; GCN: ds_read_b64
; GCN-DAG: ds_write_b32
; GCN-DAG: ds_write_b64
}
; FUNC-LABEL: {{^}}local_local_sextload_v3i16_to_v3i32:
+; GFX9-NOT: m0
+; SICIVI: s_mov_b32 m0
+
; GCN: ds_read_b64
; GCN-DAG: ds_write_b32
; GCN-DAG: ds_write_b64
; FUNC-LABEL: {{^}}local_local_zextload_v4i16_to_v4i32:
; GCN-NOT: s_wqm_b64
-; GCN: s_mov_b32 m0
+; GFX9-NOT: m0
+; SICIVI: s_mov_b32 m0
+
; GCN: ds_read_b64
; EG: LDS_READ_RET
; FUNC-LABEL: {{^}}local_sextload_v4i16_to_v4i32:
; GCN-NOT: s_wqm_b64
-; GCN: s_mov_b32 m0
+; GFX9-NOT: m0
+; SICIVI: s_mov_b32 m0
+
; GCN: ds_read_b64
; EG: LDS_READ_RET
}
; FUNC-LABEL: {{^}}local_zextload_v8i16_to_v8i32:
+; GFX9-NOT: m0
+; SICIVI: s_mov_b32 m0
+
; GCN: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset1:1{{$}}
; EG: LDS_READ_RET
}
; FUNC-LABEL: {{^}}local_sextload_v8i16_to_v8i32:
+; GFX9-NOT: m0
+; SICIVI: s_mov_b32 m0
+
; GCN: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset1:1{{$}}
; EG: LDS_READ_RET
}
; FUNC-LABEL: {{^}}local_zextload_v16i16_to_v16i32:
+; GFX9-NOT: m0
+; SICIVI: s_mov_b32 m0
+
; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset1:1{{$}}
; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:2 offset1:3{{$}}
}
; FUNC-LABEL: {{^}}local_sextload_v16i16_to_v16i32:
+; GFX9-NOT: m0
+; SICIVI: s_mov_b32 m0
+
; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset1:1{{$}}
; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:2 offset1:3{{$}}
}
; FUNC-LABEL: {{^}}local_zextload_v32i16_to_v32i32:
+; GFX9-NOT: m0
+; SICIVI: s_mov_b32 m0
+
; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset1:1{{$}}
; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:2 offset1:3
; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:4 offset1:5
}
; FUNC-LABEL: {{^}}local_sextload_v32i16_to_v32i32:
+; GFX9-NOT: m0
+; SICIVI: s_mov_b32 m0
+
; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:2 offset1:3{{$}}
; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:4 offset1:5
; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset1:1{{$}}
}
; FUNC-LABEL: {{^}}local_zextload_v64i16_to_v64i32:
+; GFX9-NOT: m0
+; SICIVI: s_mov_b32 m0
+
; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:14 offset1:15
; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset1:1{{$}}
; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:2 offset1:3
}
; FUNC-LABEL: {{^}}local_sextload_v64i16_to_v64i32:
+; GFX9-NOT: m0
+; SICIVI: s_mov_b32 m0
; EG: LDS_READ_RET
; EG: LDS_READ_RET
}
; FUNC-LABEL: {{^}}local_zextload_i16_to_i64:
+; GFX9-NOT: m0
+; SICIVI: s_mov_b32 m0
+
; GCN-DAG: ds_read_u16 v[[LO:[0-9]+]],
; GCN-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0{{$}}
}
; FUNC-LABEL: {{^}}local_sextload_i16_to_i64:
+; GFX9-NOT: m0
+; SICIVI: s_mov_b32 m0
+
; FIXME: Need to optimize this sequence to avoid an extra shift.
; t25: i32,ch = load<LD2[%in(addrspace=3)], anyext from i16> t12, t10, undef:i32
; t28: i64 = any_extend t25
; t30: i64 = sign_extend_inreg t28, ValueType:ch:i16
; SI: ds_read_i16 v[[LO:[0-9]+]],
-; VI: ds_read_u16 v[[ULO:[0-9]+]]
-; VI: v_bfe_i32 v[[LO:[0-9]+]], v[[ULO]], 0, 16
+; GFX89: ds_read_u16 v[[ULO:[0-9]+]]
+; GFX89: v_bfe_i32 v[[LO:[0-9]+]], v[[ULO]], 0, 16
; GCN-DAG: v_ashrrev_i32_e32 v[[HI:[0-9]+]], 31, v[[LO]]
; GCN: ds_write_b64 v{{[0-9]+}}, v{{\[}}[[LO]]:[[HI]]]
}
; FUNC-LABEL: {{^}}local_zextload_v1i16_to_v1i64:
+; GFX9-NOT: m0
+; SICIVI: s_mov_b32 m0
+
; EG: MOV {{[* ]*}}[[FROM:T[0-9]+\.[XYZW]]], KC0[2].Z
; EG: LDS_USHORT_READ_RET {{.*}} [[FROM]]
}
; FUNC-LABEL: {{^}}local_sextload_v1i16_to_v1i64:
+; GFX9-NOT: m0
+; SICIVI: s_mov_b32 m0
+
; EG: MOV {{[* ]*}}[[FROM:T[0-9]+\.[XYZW]]], KC0[2].Z
; EG: LDS_USHORT_READ_RET {{.*}} [[FROM]]
}
; FUNC-LABEL: {{^}}local_zextload_v2i16_to_v2i64:
+; GFX9-NOT: m0
+; SICIVI: s_mov_b32 m0
+
; EG: LDS_READ_RET
define amdgpu_kernel void @local_zextload_v2i16_to_v2i64(<2 x i64> addrspace(3)* %out, <2 x i16> addrspace(3)* %in) #0 {
}
; FUNC-LABEL: {{^}}local_sextload_v2i16_to_v2i64:
+; GFX9-NOT: m0
+; SICIVI: s_mov_b32 m0
+
; EG: LDS_READ_RET
; EG-DAG: BFE_INT
}
; FUNC-LABEL: {{^}}local_zextload_v4i16_to_v4i64:
+; GFX9-NOT: m0
+; SICIVI: s_mov_b32 m0
+
; EG: LDS_READ_RET
; EG: LDS_READ_RET
}
; FUNC-LABEL: {{^}}local_sextload_v4i16_to_v4i64:
+; GFX9-NOT: m0
+; SICIVI: s_mov_b32 m0
+
; EG: LDS_READ_RET
; EG: LDS_READ_RET
}
; FUNC-LABEL: {{^}}local_zextload_v8i16_to_v8i64:
+; GFX9-NOT: m0
+; SICIVI: s_mov_b32 m0
+
; EG: LDS_READ_RET
; EG: LDS_READ_RET
}
; FUNC-LABEL: {{^}}local_sextload_v8i16_to_v8i64:
+; GFX9-NOT: m0
+; SICIVI: s_mov_b32 m0
+
; EG: LDS_READ_RET
; EG: LDS_READ_RET
}
; FUNC-LABEL: {{^}}local_zextload_v16i16_to_v16i64:
+; GFX9-NOT: m0
+; SICIVI: s_mov_b32 m0
+
; EG: LDS_READ_RET
; EG: LDS_READ_RET
}
; FUNC-LABEL: {{^}}local_sextload_v16i16_to_v16i64:
+; GFX9-NOT: m0
+; SICIVI: s_mov_b32 m0
+
; EG: LDS_READ_RET
; EG: LDS_READ_RET
}
; FUNC-LABEL: {{^}}local_zextload_v32i16_to_v32i64:
+; GFX9-NOT: m0
+; SICIVI: s_mov_b32 m0
+
; EG: LDS_READ_RET
; EG: LDS_READ_RET
}
; FUNC-LABEL: {{^}}local_sextload_v32i16_to_v32i64:
+; GFX9-NOT: m0
+; SICIVI: s_mov_b32 m0
+
; EG: LDS_READ_RET
; EG: LDS_READ_RET
-; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC %s
-; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SICIVI,FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SICIVI,FUNC %s
+; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX9,FUNC %s
; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
-
; FUNC-LABEL: {{^}}local_load_i32:
; GCN-NOT: s_wqm_b64
-; GCN: s_mov_b32 m0, -1
+; SICIVI: s_mov_b32 m0, -1
+; GFX9-NOT: m0
; GCN: ds_read_b32
; EG: LDS_READ_RET
}
; FUNC-LABEL: {{^}}local_load_v2i32:
+; SICIVI: s_mov_b32 m0, -1
+; GFX9-NOT: m0
+
; GCN: ds_read_b64
define amdgpu_kernel void @local_load_v2i32(<2 x i32> addrspace(3)* %out, <2 x i32> addrspace(3)* %in) #0 {
entry:
}
; FUNC-LABEL: {{^}}local_load_v3i32:
+; SICIVI: s_mov_b32 m0, -1
+; GFX9-NOT: m0
+
; GCN-DAG: ds_read_b64
; GCN-DAG: ds_read_b32
define amdgpu_kernel void @local_load_v3i32(<3 x i32> addrspace(3)* %out, <3 x i32> addrspace(3)* %in) #0 {
}
; FUNC-LABEL: {{^}}local_load_v4i32:
+; SICIVI: s_mov_b32 m0, -1
+; GFX9-NOT: m0
+
; GCN: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset1:1{{$}}
define amdgpu_kernel void @local_load_v4i32(<4 x i32> addrspace(3)* %out, <4 x i32> addrspace(3)* %in) #0 {
}
; FUNC-LABEL: {{^}}local_load_v8i32:
+; SICIVI: s_mov_b32 m0, -1
+; GFX9-NOT: m0
+
; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:2 offset1:3{{$}}
; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset1:1{{$}}
define amdgpu_kernel void @local_load_v8i32(<8 x i32> addrspace(3)* %out, <8 x i32> addrspace(3)* %in) #0 {
}
; FUNC-LABEL: {{^}}local_load_v16i32:
+; SICIVI: s_mov_b32 m0, -1
+; GFX9-NOT: m0
+
; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:6 offset1:7{{$}}
; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:4 offset1:5{{$}}
; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:2 offset1:3{{$}}
}
; FUNC-LABEL: {{^}}local_zextload_i32_to_i64:
+; SICIVI: s_mov_b32 m0, -1
+; GFX9-NOT: m0
+
define amdgpu_kernel void @local_zextload_i32_to_i64(i64 addrspace(3)* %out, i32 addrspace(3)* %in) #0 {
%ld = load i32, i32 addrspace(3)* %in
%ext = zext i32 %ld to i64
}
; FUNC-LABEL: {{^}}local_sextload_i32_to_i64:
+; SICIVI: s_mov_b32 m0, -1
+; GFX9-NOT: m0
+
define amdgpu_kernel void @local_sextload_i32_to_i64(i64 addrspace(3)* %out, i32 addrspace(3)* %in) #0 {
%ld = load i32, i32 addrspace(3)* %in
%ext = sext i32 %ld to i64
}
; FUNC-LABEL: {{^}}local_zextload_v1i32_to_v1i64:
+; SICIVI: s_mov_b32 m0, -1
+; GFX9-NOT: m0
+
define amdgpu_kernel void @local_zextload_v1i32_to_v1i64(<1 x i64> addrspace(3)* %out, <1 x i32> addrspace(3)* %in) #0 {
%ld = load <1 x i32>, <1 x i32> addrspace(3)* %in
%ext = zext <1 x i32> %ld to <1 x i64>
}
; FUNC-LABEL: {{^}}local_sextload_v1i32_to_v1i64:
+; SICIVI: s_mov_b32 m0, -1
+; GFX9-NOT: m0
+
define amdgpu_kernel void @local_sextload_v1i32_to_v1i64(<1 x i64> addrspace(3)* %out, <1 x i32> addrspace(3)* %in) #0 {
%ld = load <1 x i32>, <1 x i32> addrspace(3)* %in
%ext = sext <1 x i32> %ld to <1 x i64>
}
; FUNC-LABEL: {{^}}local_zextload_v2i32_to_v2i64:
+; SICIVI: s_mov_b32 m0, -1
+; GFX9-NOT: m0
+
define amdgpu_kernel void @local_zextload_v2i32_to_v2i64(<2 x i64> addrspace(3)* %out, <2 x i32> addrspace(3)* %in) #0 {
%ld = load <2 x i32>, <2 x i32> addrspace(3)* %in
%ext = zext <2 x i32> %ld to <2 x i64>
}
; FUNC-LABEL: {{^}}local_sextload_v2i32_to_v2i64:
+; SICIVI: s_mov_b32 m0, -1
+; GFX9-NOT: m0
+
define amdgpu_kernel void @local_sextload_v2i32_to_v2i64(<2 x i64> addrspace(3)* %out, <2 x i32> addrspace(3)* %in) #0 {
%ld = load <2 x i32>, <2 x i32> addrspace(3)* %in
%ext = sext <2 x i32> %ld to <2 x i64>
}
; FUNC-LABEL: {{^}}local_zextload_v4i32_to_v4i64:
+; SICIVI: s_mov_b32 m0, -1
+; GFX9-NOT: m0
+
define amdgpu_kernel void @local_zextload_v4i32_to_v4i64(<4 x i64> addrspace(3)* %out, <4 x i32> addrspace(3)* %in) #0 {
%ld = load <4 x i32>, <4 x i32> addrspace(3)* %in
%ext = zext <4 x i32> %ld to <4 x i64>
}
; FUNC-LABEL: {{^}}local_sextload_v4i32_to_v4i64:
+; SICIVI: s_mov_b32 m0, -1
+; GFX9-NOT: m0
+
define amdgpu_kernel void @local_sextload_v4i32_to_v4i64(<4 x i64> addrspace(3)* %out, <4 x i32> addrspace(3)* %in) #0 {
%ld = load <4 x i32>, <4 x i32> addrspace(3)* %in
%ext = sext <4 x i32> %ld to <4 x i64>
}
; FUNC-LABEL: {{^}}local_zextload_v8i32_to_v8i64:
+; SICIVI: s_mov_b32 m0, -1
+; GFX9-NOT: m0
+
define amdgpu_kernel void @local_zextload_v8i32_to_v8i64(<8 x i64> addrspace(3)* %out, <8 x i32> addrspace(3)* %in) #0 {
%ld = load <8 x i32>, <8 x i32> addrspace(3)* %in
%ext = zext <8 x i32> %ld to <8 x i64>
}
; FUNC-LABEL: {{^}}local_sextload_v8i32_to_v8i64:
+; SICIVI: s_mov_b32 m0, -1
+; GFX9-NOT: m0
+
define amdgpu_kernel void @local_sextload_v8i32_to_v8i64(<8 x i64> addrspace(3)* %out, <8 x i32> addrspace(3)* %in) #0 {
%ld = load <8 x i32>, <8 x i32> addrspace(3)* %in
%ext = sext <8 x i32> %ld to <8 x i64>
}
; FUNC-LABEL: {{^}}local_sextload_v16i32_to_v16i64:
+; SICIVI: s_mov_b32 m0, -1
+; GFX9-NOT: m0
+
define amdgpu_kernel void @local_sextload_v16i32_to_v16i64(<16 x i64> addrspace(3)* %out, <16 x i32> addrspace(3)* %in) #0 {
%ld = load <16 x i32>, <16 x i32> addrspace(3)* %in
%ext = sext <16 x i32> %ld to <16 x i64>
}
; FUNC-LABEL: {{^}}local_zextload_v16i32_to_v16i64:
+; SICIVI: s_mov_b32 m0, -1
+; GFX9-NOT: m0
+
define amdgpu_kernel void @local_zextload_v16i32_to_v16i64(<16 x i64> addrspace(3)* %out, <16 x i32> addrspace(3)* %in) #0 {
%ld = load <16 x i32>, <16 x i32> addrspace(3)* %in
%ext = zext <16 x i32> %ld to <16 x i64>
}
; FUNC-LABEL: {{^}}local_sextload_v32i32_to_v32i64:
+; SICIVI: s_mov_b32 m0, -1
+; GFX9-NOT: m0
+
define amdgpu_kernel void @local_sextload_v32i32_to_v32i64(<32 x i64> addrspace(3)* %out, <32 x i32> addrspace(3)* %in) #0 {
%ld = load <32 x i32>, <32 x i32> addrspace(3)* %in
%ext = sext <32 x i32> %ld to <32 x i64>
}
; FUNC-LABEL: {{^}}local_zextload_v32i32_to_v32i64:
+; SICIVI: s_mov_b32 m0, -1
+; GFX9-NOT: m0
+
define amdgpu_kernel void @local_zextload_v32i32_to_v32i64(<32 x i64> addrspace(3)* %out, <32 x i32> addrspace(3)* %in) #0 {
%ld = load <32 x i32>, <32 x i32> addrspace(3)* %in
%ext = zext <32 x i32> %ld to <32 x i64>
-; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC %s
-; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC %s
-; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC %s
-; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SICIVI,FUNC %s
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SICIVI,FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SICIVI,FUNC %s
+; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX9,FUNC %s
+; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefixes=EG,FUNC %s
; FUNC-LABEL: {{^}}local_load_i64:
+; SICIVI: s_mov_b32 m0
+; GFX9-NOT: m0
+
; GCN: ds_read_b64 [[VAL:v\[[0-9]+:[0-9]+\]]], v{{[0-9]+}}{{$}}
; GCN: ds_write_b64 v{{[0-9]+}}, [[VAL]]
}
; FUNC-LABEL: {{^}}local_load_v2i64:
+; SICIVI: s_mov_b32 m0
+; GFX9-NOT: m0
+
; GCN: ds_read2_b64
; EG: LDS_READ_RET
}
; FUNC-LABEL: {{^}}local_load_v3i64:
+; SICIVI: s_mov_b32 m0
+; GFX9-NOT: m0
+
; GCN-DAG: ds_read2_b64
; GCN-DAG: ds_read_b64
}
; FUNC-LABEL: {{^}}local_load_v4i64:
+; SICIVI: s_mov_b32 m0
+; GFX9-NOT: m0
+
; GCN: ds_read2_b64
; GCN: ds_read2_b64
}
; FUNC-LABEL: {{^}}local_load_v8i64:
+; SICIVI: s_mov_b32 m0
+; GFX9-NOT: m0
+
; GCN: ds_read2_b64
; GCN: ds_read2_b64
; GCN: ds_read2_b64
}
; FUNC-LABEL: {{^}}local_load_v16i64:
+; SICIVI: s_mov_b32 m0
+; GFX9-NOT: m0
+
; GCN: ds_read2_b64
; GCN: ds_read2_b64
; GCN: ds_read2_b64
-; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SI,FUNC %s
-; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,VI,FUNC %s
+; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SI,SICIVI,FUNC %s
+; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,VI,SICIVI,FUNC %s
+; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX9,FUNC %s
; RUN: llc -march=r600 -mtriple=r600---amdgiz -mcpu=redwood -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
; FUNC-LABEL: {{^}}local_load_i8:
; GCN-NOT: s_wqm_b64
-; GCN: s_mov_b32 m0
+; SICIVI: s_mov_b32 m0
+; GFX9-NOT: m0
; GCN: ds_read_u8
; EG: LDS_UBYTE_READ_RET
; FUNC-LABEL: {{^}}local_load_v2i8:
; GCN-NOT: s_wqm_b64
-; GCN: s_mov_b32 m0
+; SICIVI: s_mov_b32 m0
+; GFX9-NOT: m0
; GCN: ds_read_u16
; EG: LDS_USHORT_READ_RET
}
; FUNC-LABEL: {{^}}local_load_v3i8:
+; GFX9-NOT: m0
; GCN: ds_read_b32
; EG: LDS_READ_RET
}
; FUNC-LABEL: {{^}}local_load_v4i8:
+; GFX9-NOT: m0
; GCN: ds_read_b32
; EG: LDS_READ_RET
}
; FUNC-LABEL: {{^}}local_load_v8i8:
+; GFX9-NOT: m0
; GCN: ds_read_b64
; EG: LDS_READ_RET
}
; FUNC-LABEL: {{^}}local_load_v16i8:
+; GFX9-NOT: m0
; GCN: ds_read2_b64 v{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]{{\]}}, v{{[0-9]+}} offset1:1{{$}}
; GCN: ds_write2_b64 v{{[0-9]+}}, v{{\[}}[[LO]]:{{[0-9]+}}], v[{{[0-9]+}}:[[HI]]{{\]}} offset1:1{{$}}
}
; FUNC-LABEL: {{^}}local_zextload_i8_to_i32:
+; GFX9-NOT: m0
; GCN-NOT: s_wqm_b64
-; GCN: s_mov_b32 m0
+; SICIVI: s_mov_b32 m0
; GCN: ds_read_u8
; EG: LDS_UBYTE_READ_RET
; FUNC-LABEL: {{^}}local_sextload_i8_to_i32:
; GCN-NOT: s_wqm_b64
-; GCN: s_mov_b32 m0
+; GFX9-NOT: m0
+; SICIVI: s_mov_b32 m0
; GCN: ds_read_i8
; EG: LDS_UBYTE_READ_RET
}
; FUNC-LABEL: {{^}}local_sextload_v1i8_to_v1i32:
+; GFX9-NOT: m0
; EG: LDS_UBYTE_READ_RET
; EG: BFE_INT
}
; FUNC-LABEL: {{^}}local_zextload_v2i8_to_v2i32:
+; GFX9-NOT: m0
; GCN: ds_read_u16
; EG: LDS_USHORT_READ_RET
; FUNC-LABEL: {{^}}local_sextload_v2i8_to_v2i32:
; GCN-NOT: s_wqm_b64
-; GCN: s_mov_b32 m0
+; GFX9-NOT: m0
+; SICIVI: s_mov_b32 m0
; GCN: ds_read_u16
; FIXME: Need to optimize this sequence to avoid extra shift on VI.
; t23: i16 = srl t39, Constant:i32<8>
}
; FUNC-LABEL: {{^}}local_zextload_v3i8_to_v3i32:
+; GFX9-NOT: m0
; GCN: ds_read_b32
; SI-DAG: v_bfe_u32 v{{[0-9]+}}, v{{[0-9]+}}, 8, 8
; FUNC-LABEL: {{^}}local_sextload_v3i8_to_v3i32:
; GCN-NOT: s_wqm_b64
-; GCN: s_mov_b32 m0
+; GFX9-NOT: m0
+; SICIVI: s_mov_b32 m0
; GCN: ds_read_b32
; GCN-DAG: v_bfe_i32
; FUNC-LABEL: {{^}}local_zextload_v4i8_to_v4i32:
; GCN-NOT: s_wqm_b64
-; GCN: s_mov_b32 m0
+; GFX9-NOT: m0
+; SICIVI: s_mov_b32 m0
; GCN: ds_read_b32
; EG: LDS_READ_RET
; FUNC-LABEL: {{^}}local_sextload_v4i8_to_v4i32:
; GCN-NOT: s_wqm_b64
-; GCN: s_mov_b32 m0
+; GFX9-NOT: m0
+; SICIVI: s_mov_b32 m0
; GCN: ds_read_b32
; EG-DAG: LDS_READ_RET
}
; FUNC-LABEL: {{^}}local_zextload_v8i8_to_v8i32:
+; SICIVI: s_mov_b32 m0
+; GFX9-NOT: m0
; EG-DAG: LDS_READ_RET
; EG-DAG: LDS_READ_RET
}
; FUNC-LABEL: {{^}}local_sextload_v8i8_to_v8i32:
+; SICIVI: s_mov_b32 m0
+; GFX9-NOT: m0
; EG-DAG: LDS_READ_RET
; EG-DAG: LDS_READ_RET
}
; FUNC-LABEL: {{^}}local_zextload_v16i8_to_v16i32:
+; SICIVI: s_mov_b32 m0
+; GFX9-NOT: m0
; EG-DAG: LDS_READ_RET
; EG-DAG: LDS_READ_RET
}
; FUNC-LABEL: {{^}}local_sextload_v16i8_to_v16i32:
+; SICIVI: s_mov_b32 m0
+; GFX9-NOT: m0
; EG-DAG: LDS_READ_RET
; EG-DAG: LDS_READ_RET
}
; FUNC-LABEL: {{^}}local_zextload_v32i8_to_v32i32:
+; SICIVI: s_mov_b32 m0
+; GFX9-NOT: m0
; EG-DAG: LDS_READ_RET
; EG-DAG: LDS_READ_RET
}
; FUNC-LABEL: {{^}}local_sextload_v32i8_to_v32i32:
+; SICIVI: s_mov_b32 m0
+; GFX9-NOT: m0
; EG-DAG: LDS_READ_RET
; EG-DAG: LDS_READ_RET
}
; FUNC-LABEL: {{^}}local_zextload_v64i8_to_v64i32:
+; SICIVI: s_mov_b32 m0
+; GFX9-NOT: m0
; EG-DAG: LDS_READ_RET
; EG-DAG: LDS_READ_RET
}
; FUNC-LABEL: {{^}}local_sextload_v64i8_to_v64i32:
+; SICIVI: s_mov_b32 m0
+; GFX9-NOT: m0
; EG-DAG: LDS_READ_RET
; EG-DAG: LDS_READ_RET
}
; FUNC-LABEL: {{^}}local_zextload_i8_to_i64:
+; SICIVI: s_mov_b32 m0
+; GFX9-NOT: m0
+
; GCN-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0{{$}}
; GCN-DAG: ds_read_u8 v[[LO:[0-9]+]],
; GCN: ds_write_b64 v{{[0-9]+}}, v{{\[}}[[LO]]:[[HI]]]
}
; FUNC-LABEL: {{^}}local_sextload_i8_to_i64:
+; SICIVI: s_mov_b32 m0
+; GFX9-NOT: m0
+
; GCN: ds_read_i8 v[[LO:[0-9]+]],
; GCN: v_ashrrev_i32_e32 v[[HI:[0-9]+]], 31, v[[LO]]
}
; FUNC-LABEL: {{^}}local_zextload_v1i8_to_v1i64:
+; SICIVI: s_mov_b32 m0
+; GFX9-NOT: m0
; EG: LDS_UBYTE_READ_RET
; EG: MOV {{.*}}, literal
}
; FUNC-LABEL: {{^}}local_sextload_v1i8_to_v1i64:
+; SICIVI: s_mov_b32 m0
+; GFX9-NOT: m0
; EG: LDS_UBYTE_READ_RET
; EG: ASHR
}
; FUNC-LABEL: {{^}}local_zextload_v2i8_to_v2i64:
+; SICIVI: s_mov_b32 m0
+; GFX9-NOT: m0
; EG: LDS_USHORT_READ_RET
define amdgpu_kernel void @local_zextload_v2i8_to_v2i64(<2 x i64> addrspace(3)* %out, <2 x i8> addrspace(3)* %in) #0 {
}
; FUNC-LABEL: {{^}}local_sextload_v2i8_to_v2i64:
+; SICIVI: s_mov_b32 m0
+; GFX9-NOT: m0
; EG: LDS_USHORT_READ_RET
; EG: BFE_INT
}
; FUNC-LABEL: {{^}}local_zextload_v4i8_to_v4i64:
+; SICIVI: s_mov_b32 m0
+; GFX9-NOT: m0
; EG: LDS_READ_RET
define amdgpu_kernel void @local_zextload_v4i8_to_v4i64(<4 x i64> addrspace(3)* %out, <4 x i8> addrspace(3)* %in) #0 {
}
; FUNC-LABEL: {{^}}local_sextload_v4i8_to_v4i64:
+; SICIVI: s_mov_b32 m0
+; GFX9-NOT: m0
; EG: LDS_READ_RET
define amdgpu_kernel void @local_sextload_v4i8_to_v4i64(<4 x i64> addrspace(3)* %out, <4 x i8> addrspace(3)* %in) #0 {
}
; FUNC-LABEL: {{^}}local_zextload_v8i8_to_v8i64:
+; SICIVI: s_mov_b32 m0
+; GFX9-NOT: m0
; EG: LDS_READ_RET
; EG: LDS_READ_RET
}
; FUNC-LABEL: {{^}}local_sextload_v8i8_to_v8i64:
+; SICIVI: s_mov_b32 m0
+; GFX9-NOT: m0
; EG: LDS_READ_RET
; EG: LDS_READ_RET
}
; FUNC-LABEL: {{^}}local_zextload_v16i8_to_v16i64:
+; SICIVI: s_mov_b32 m0
+; GFX9-NOT: m0
; EG: LDS_READ_RET
; EG: LDS_READ_RET
}
; FUNC-LABEL: {{^}}local_sextload_v16i8_to_v16i64:
+; SICIVI: s_mov_b32 m0
+; GFX9-NOT: m0
; EG: LDS_READ_RET
; EG: LDS_READ_RET
}
; FUNC-LABEL: {{^}}local_zextload_v32i8_to_v32i64:
+; SICIVI: s_mov_b32 m0
+; GFX9-NOT: m0
; EG: LDS_READ_RET
; EG: LDS_READ_RET
}
; FUNC-LABEL: {{^}}local_sextload_v32i8_to_v32i64:
+; SICIVI: s_mov_b32 m0
+; GFX9-NOT: m0
; EG: LDS_READ_RET
; EG: LDS_READ_RET
; }
; FUNC-LABEL: {{^}}local_zextload_i8_to_i16:
+; SICIVI: s_mov_b32 m0
+; GFX9-NOT: m0
; GCN: ds_read_u8 v[[VAL:[0-9]+]],
; GCN: ds_write_b16 v[[VAL:[0-9]+]]
}
; FUNC-LABEL: {{^}}local_sextload_i8_to_i16:
+; SICIVI: s_mov_b32 m0
+; GFX9-NOT: m0
; GCN: ds_read_i8 v[[VAL:[0-9]+]],
; GCN: ds_write_b16 v{{[0-9]+}}, v[[VAL]]
}
; FUNC-LABEL: {{^}}local_zextload_v1i8_to_v1i16:
+; SICIVI: s_mov_b32 m0
+; GFX9-NOT: m0
; EG: LDS_UBYTE_READ_RET
; EG: LDS_SHORT_WRITE
}
; FUNC-LABEL: {{^}}local_sextload_v1i8_to_v1i16:
+; SICIVI: s_mov_b32 m0
+; GFX9-NOT: m0
; EG: LDS_UBYTE_READ_RET
; EG: BFE_INT
}
; FUNC-LABEL: {{^}}local_zextload_v2i8_to_v2i16:
+; SICIVI: s_mov_b32 m0
+; GFX9-NOT: m0
; EG: LDS_USHORT_READ_RET
; EG: LDS_WRITE
}
; FUNC-LABEL: {{^}}local_sextload_v2i8_to_v2i16:
+; SICIVI: s_mov_b32 m0
+; GFX9-NOT: m0
; EG: LDS_USHORT_READ_RET
; EG: BFE_INT
}
; FUNC-LABEL: {{^}}local_zextload_v4i8_to_v4i16:
+; SICIVI: s_mov_b32 m0
+; GFX9-NOT: m0
; EG: LDS_READ_RET
; EG: LDS_WRITE
}
; FUNC-LABEL: {{^}}local_sextload_v4i8_to_v4i16:
+; SICIVI: s_mov_b32 m0
+; GFX9-NOT: m0
; EG: LDS_READ_RET
; TODO: these do LSHR + BFE_INT, instead of just BFE_INT/ASHR
}
; FUNC-LABEL: {{^}}local_zextload_v8i8_to_v8i16:
+; SICIVI: s_mov_b32 m0
+; GFX9-NOT: m0
; EG: LDS_READ_RET
; EG: LDS_READ_RET
}
; FUNC-LABEL: {{^}}local_sextload_v8i8_to_v8i16:
+; SICIVI: s_mov_b32 m0
+; GFX9-NOT: m0
; EG: LDS_READ_RET
; EG: LDS_READ_RET
}
; FUNC-LABEL: {{^}}local_zextload_v16i8_to_v16i16:
+; SICIVI: s_mov_b32 m0
+; GFX9-NOT: m0
; EG: LDS_READ_RET
; EG: LDS_READ_RET
}
; FUNC-LABEL: {{^}}local_sextload_v16i8_to_v16i16:
+; SICIVI: s_mov_b32 m0
+; GFX9-NOT: m0
; EG: LDS_READ_RET
; EG: LDS_READ_RET
}
; FUNC-LABEL: {{^}}local_zextload_v32i8_to_v32i16:
+; SICIVI: s_mov_b32 m0
+; GFX9-NOT: m0
; EG: LDS_READ_RET
; EG: LDS_READ_RET
}
; FUNC-LABEL: {{^}}local_sextload_v32i8_to_v32i16:
+; SICIVI: s_mov_b32 m0
+; GFX9-NOT: m0
; EG: LDS_READ_RET
; EG: LDS_READ_RET
-; RUN: llc -march=amdgcn -verify-machineinstrs< %s | FileCheck --check-prefix=SI --check-prefix=BOTH %s
-; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs< %s | FileCheck --check-prefix=CI --check-prefix=BOTH %s
-; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs< %s | FileCheck --check-prefix=CI --check-prefix=BOTH %s
+; RUN: llc -march=amdgcn -verify-machineinstrs< %s | FileCheck -enable-var-scope -check-prefixes=GCN,SI,SICIVI %s
+; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs< %s | FileCheck -enable-var-scope -check-prefixes=GCN,CI,SICIVI %s
+; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs< %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI,SICIVI %s
+; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs< %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9 %s
-; BOTH-LABEL: {{^}}local_i32_load
-; BOTH: ds_read_b32 [[REG:v[0-9]+]], v{{[0-9]+}} offset:28
-; BOTH: buffer_store_dword [[REG]],
+; GCN-LABEL: {{^}}local_i32_load
+; SICIVI: s_mov_b32 m0
+; GFX9-NOT: m0
+
+; GCN: ds_read_b32 [[REG:v[0-9]+]], v{{[0-9]+}} offset:28
+; GCN: buffer_store_dword [[REG]],
define amdgpu_kernel void @local_i32_load(i32 addrspace(1)* %out, i32 addrspace(3)* %in) nounwind {
%gep = getelementptr i32, i32 addrspace(3)* %in, i32 7
%val = load i32, i32 addrspace(3)* %gep, align 4
ret void
}
-; BOTH-LABEL: {{^}}local_i32_load_0_offset
-; BOTH: ds_read_b32 [[REG:v[0-9]+]], v{{[0-9]+}}
-; BOTH: buffer_store_dword [[REG]],
+; GCN-LABEL: {{^}}local_i32_load_0_offset
+; SICIVI: s_mov_b32 m0
+; GFX9-NOT: m0
+
+; GCN: ds_read_b32 [[REG:v[0-9]+]], v{{[0-9]+}}
+; GCN: buffer_store_dword [[REG]],
define amdgpu_kernel void @local_i32_load_0_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %in) nounwind {
%val = load i32, i32 addrspace(3)* %in, align 4
store i32 %val, i32 addrspace(1)* %out, align 4
ret void
}
-; BOTH-LABEL: {{^}}local_i8_load_i16_max_offset:
-; BOTH-NOT: ADD
-; BOTH: ds_read_u8 [[REG:v[0-9]+]], {{v[0-9]+}} offset:65535
-; BOTH: buffer_store_byte [[REG]],
+; GCN-LABEL: {{^}}local_i8_load_i16_max_offset:
+; SICIVI: s_mov_b32 m0
+; GFX9-NOT: m0
+
+; GCN-NOT: add
+; GCN: ds_read_u8 [[REG:v[0-9]+]], {{v[0-9]+}} offset:65535
+; GCN: buffer_store_byte [[REG]],
define amdgpu_kernel void @local_i8_load_i16_max_offset(i8 addrspace(1)* %out, i8 addrspace(3)* %in) nounwind {
%gep = getelementptr i8, i8 addrspace(3)* %in, i32 65535
%val = load i8, i8 addrspace(3)* %gep, align 4
ret void
}
-; BOTH-LABEL: {{^}}local_i8_load_over_i16_max_offset:
+; GCN-LABEL: {{^}}local_i8_load_over_i16_max_offset:
+; SICIVI-DAG: s_mov_b32 m0
+; GFX9-NOT: m0
+
; The LDS offset will be 65536 bytes, which is larger than the size of LDS on
; SI, which is why it is being OR'd with the base pointer.
-; SI: s_or_b32 [[ADDR:s[0-9]+]], s{{[0-9]+}}, 0x10000
-; CI: s_add_i32 [[ADDR:s[0-9]+]], s{{[0-9]+}}, 0x10000
-; BOTH: v_mov_b32_e32 [[VREGADDR:v[0-9]+]], [[ADDR]]
-; BOTH: ds_read_u8 [[REG:v[0-9]+]], [[VREGADDR]]
-; BOTH: buffer_store_byte [[REG]],
+; SI-DAG: s_or_b32 [[ADDR:s[0-9]+]], s{{[0-9]+}}, 0x10000
+; CI-DAG: s_add_i32 [[ADDR:s[0-9]+]], s{{[0-9]+}}, 0x10000
+; VI-DAG: s_add_i32 [[ADDR:s[0-9]+]], s{{[0-9]+}}, 0x10000
+; GFX9-DAG: s_add_i32 [[ADDR:s[0-9]+]], s{{[0-9]+}}, 0x10000
+
+; GCN-DAG: v_mov_b32_e32 [[VREGADDR:v[0-9]+]], [[ADDR]]
+; GCN: ds_read_u8 [[REG:v[0-9]+]], [[VREGADDR]]
+; GCN: buffer_store_byte [[REG]],
define amdgpu_kernel void @local_i8_load_over_i16_max_offset(i8 addrspace(1)* %out, i8 addrspace(3)* %in) nounwind {
%gep = getelementptr i8, i8 addrspace(3)* %in, i32 65536
%val = load i8, i8 addrspace(3)* %gep, align 4
ret void
}
-; BOTH-LABEL: {{^}}local_i64_load:
-; BOTH-NOT: ADD
-; BOTH: ds_read_b64 [[REG:v[[0-9]+:[0-9]+]]], v{{[0-9]+}} offset:56
-; BOTH: buffer_store_dwordx2 [[REG]],
+; GCN-LABEL: {{^}}local_i64_load:
+; SICIVI: s_mov_b32 m0
+; GFX9-NOT: m0
+
+; GCN-NOT: add
+; GCN: ds_read_b64 [[REG:v[[0-9]+:[0-9]+]]], v{{[0-9]+}} offset:56
+; GCN: buffer_store_dwordx2 [[REG]],
define amdgpu_kernel void @local_i64_load(i64 addrspace(1)* %out, i64 addrspace(3)* %in) nounwind {
%gep = getelementptr i64, i64 addrspace(3)* %in, i32 7
%val = load i64, i64 addrspace(3)* %gep, align 8
ret void
}
-; BOTH-LABEL: {{^}}local_i64_load_0_offset
-; BOTH: ds_read_b64 [[REG:v\[[0-9]+:[0-9]+\]]], v{{[0-9]+}}
-; BOTH: buffer_store_dwordx2 [[REG]],
+; GCN-LABEL: {{^}}local_i64_load_0_offset
+; SICIVI: s_mov_b32 m0
+; GFX9-NOT: m0
+
+; GCN: ds_read_b64 [[REG:v\[[0-9]+:[0-9]+\]]], v{{[0-9]+}}
+; GCN: buffer_store_dwordx2 [[REG]],
define amdgpu_kernel void @local_i64_load_0_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %in) nounwind {
%val = load i64, i64 addrspace(3)* %in, align 8
store i64 %val, i64 addrspace(1)* %out, align 8
ret void
}
-; BOTH-LABEL: {{^}}local_f64_load:
-; BOTH-NOT: ADD
-; BOTH: ds_read_b64 [[REG:v[[0-9]+:[0-9]+]]], v{{[0-9]+}} offset:56
-; BOTH: buffer_store_dwordx2 [[REG]],
+; GCN-LABEL: {{^}}local_f64_load:
+; SICIVI: s_mov_b32 m0
+; GFX9-NOT: m0
+
+; GCN-NOT: add
+; GCN: ds_read_b64 [[REG:v[[0-9]+:[0-9]+]]], v{{[0-9]+}} offset:56
+; GCN: buffer_store_dwordx2 [[REG]],
define amdgpu_kernel void @local_f64_load(double addrspace(1)* %out, double addrspace(3)* %in) nounwind {
%gep = getelementptr double, double addrspace(3)* %in, i32 7
%val = load double, double addrspace(3)* %gep, align 8
ret void
}
-; BOTH-LABEL: {{^}}local_f64_load_0_offset
-; BOTH: ds_read_b64 [[REG:v\[[0-9]+:[0-9]+\]]], v{{[0-9]+}}
-; BOTH: buffer_store_dwordx2 [[REG]],
+; GCN-LABEL: {{^}}local_f64_load_0_offset
+; SICIVI: s_mov_b32 m0
+; GFX9-NOT: m0
+
+; GCN: ds_read_b64 [[REG:v\[[0-9]+:[0-9]+\]]], v{{[0-9]+}}
+; GCN: buffer_store_dwordx2 [[REG]],
define amdgpu_kernel void @local_f64_load_0_offset(double addrspace(1)* %out, double addrspace(3)* %in) nounwind {
%val = load double, double addrspace(3)* %in, align 8
store double %val, double addrspace(1)* %out, align 8
ret void
}
-; BOTH-LABEL: {{^}}local_i64_store:
-; BOTH-NOT: ADD
-; BOTH: ds_write_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}} offset:56
+; GCN-LABEL: {{^}}local_i64_store:
+; SICIVI: s_mov_b32 m0
+; GFX9-NOT: m0
+
+; GCN-NOT: add
+; GCN: ds_write_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}} offset:56
define amdgpu_kernel void @local_i64_store(i64 addrspace(3)* %out) nounwind {
%gep = getelementptr i64, i64 addrspace(3)* %out, i32 7
store i64 5678, i64 addrspace(3)* %gep, align 8
ret void
}
-; BOTH-LABEL: {{^}}local_i64_store_0_offset:
-; BOTH-NOT: ADD
-; BOTH: ds_write_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}
+; GCN-LABEL: {{^}}local_i64_store_0_offset:
+; SICIVI: s_mov_b32 m0
+; GFX9-NOT: m0
+
+; GCN-NOT: add
+; GCN: ds_write_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}
define amdgpu_kernel void @local_i64_store_0_offset(i64 addrspace(3)* %out) nounwind {
store i64 1234, i64 addrspace(3)* %out, align 8
ret void
}
-; BOTH-LABEL: {{^}}local_f64_store:
-; BOTH-NOT: ADD
-; BOTH: ds_write_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}} offset:56
+; GCN-LABEL: {{^}}local_f64_store:
+; SICIVI: s_mov_b32 m0
+; GFX9-NOT: m0
+
+; GCN-NOT: add
+; GCN: ds_write_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}} offset:56
define amdgpu_kernel void @local_f64_store(double addrspace(3)* %out) nounwind {
%gep = getelementptr double, double addrspace(3)* %out, i32 7
store double 16.0, double addrspace(3)* %gep, align 8
ret void
}
-; BOTH-LABEL: {{^}}local_f64_store_0_offset
-; BOTH: ds_write_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}
+; GCN-LABEL: {{^}}local_f64_store_0_offset
+; SICIVI: s_mov_b32 m0
+; GFX9-NOT: m0
+
+; GCN: ds_write_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}
define amdgpu_kernel void @local_f64_store_0_offset(double addrspace(3)* %out) nounwind {
store double 20.0, double addrspace(3)* %out, align 8
ret void
}
-; BOTH-LABEL: {{^}}local_v2i64_store:
-; BOTH-NOT: ADD
-; BOTH: ds_write2_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}} offset0:14 offset1:15
-; BOTH: s_endpgm
+; GCN-LABEL: {{^}}local_v2i64_store:
+; SICIVI: s_mov_b32 m0
+; GFX9-NOT: m0
+
+; GCN-NOT: add
+; GCN: ds_write2_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}} offset0:14 offset1:15
+; GCN: s_endpgm
define amdgpu_kernel void @local_v2i64_store(<2 x i64> addrspace(3)* %out) nounwind {
%gep = getelementptr <2 x i64>, <2 x i64> addrspace(3)* %out, i32 7
store <2 x i64> <i64 5678, i64 5678>, <2 x i64> addrspace(3)* %gep, align 16
ret void
}
-; BOTH-LABEL: {{^}}local_v2i64_store_0_offset:
-; BOTH-NOT: ADD
-; BOTH: ds_write2_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}} offset1:1
-; BOTH: s_endpgm
+; GCN-LABEL: {{^}}local_v2i64_store_0_offset:
+; SICIVI: s_mov_b32 m0
+; GFX9-NOT: m0
+
+; GCN-NOT: add
+; GCN: ds_write2_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}} offset1:1
+; GCN: s_endpgm
define amdgpu_kernel void @local_v2i64_store_0_offset(<2 x i64> addrspace(3)* %out) nounwind {
store <2 x i64> <i64 1234, i64 1234>, <2 x i64> addrspace(3)* %out, align 16
ret void
}
-; BOTH-LABEL: {{^}}local_v4i64_store:
-; BOTH-NOT: ADD
-; BOTH-DAG: ds_write2_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}} offset0:30 offset1:31
-; BOTH-DAG: ds_write2_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}} offset0:28 offset1:29
-; BOTH: s_endpgm
+; GCN-LABEL: {{^}}local_v4i64_store:
+; SICIVI: s_mov_b32 m0
+; GFX9-NOT: m0
+
+; GCN-NOT: add
+; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}} offset0:30 offset1:31
+; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}} offset0:28 offset1:29
+; GCN: s_endpgm
define amdgpu_kernel void @local_v4i64_store(<4 x i64> addrspace(3)* %out) nounwind {
%gep = getelementptr <4 x i64>, <4 x i64> addrspace(3)* %out, i32 7
store <4 x i64> <i64 5678, i64 5678, i64 5678, i64 5678>, <4 x i64> addrspace(3)* %gep, align 16
ret void
}
-; BOTH-LABEL: {{^}}local_v4i64_store_0_offset:
-; BOTH-NOT: ADD
-; BOTH-DAG: ds_write2_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}} offset0:2 offset1:3
-; BOTH-DAG: ds_write2_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}} offset1:1
-; BOTH: s_endpgm
+; GCN-LABEL: {{^}}local_v4i64_store_0_offset:
+; SICIVI: s_mov_b32 m0
+; GFX9-NOT: m0
+
+; GCN-NOT: add
+; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}} offset0:2 offset1:3
+; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}} offset1:1
+; GCN: s_endpgm
define amdgpu_kernel void @local_v4i64_store_0_offset(<4 x i64> addrspace(3)* %out) nounwind {
store <4 x i64> <i64 1234, i64 1234, i64 1234, i64 1234>, <4 x i64> addrspace(3)* %out, align 16
ret void
-; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=FUNC %s
-; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=CIVI -check-prefix=GCN -check-prefix=FUNC %s
-; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=CIVI -check-prefix=GCN -check-prefix=FUNC %s
-; RUN: llc -march=r600 -mcpu=redwood -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,SI,SICIVI,FUNC %s
+; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,CIVI,SICIVI,FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,CIVI,SICIVI,FUNC %s
+; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9,FUNC %s
+; RUN: llc -march=r600 -mcpu=redwood -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=EG,FUNC %s
; FUNC-LABEL: {{^}}lds_atomic_xchg_ret_i32:
; EG: LDS_WRXCHG_RET *
-; GCN: s_load_dword [[SPTR:s[0-9]+]],
-; GCN: v_mov_b32_e32 [[DATA:v[0-9]+]], 4
-; GCN: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[SPTR]]
+
+; SICIVI-DAG: s_mov_b32 m0
+; GFX9-NOT: m0
+
+; GCN-DAG: s_load_dword [[SPTR:s[0-9]+]],
+; GCN-DAG: v_mov_b32_e32 [[DATA:v[0-9]+]], 4
+; GCN-DAG: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[SPTR]]
; GCN: ds_wrxchg_rtn_b32 [[RESULT:v[0-9]+]], [[VPTR]], [[DATA]]
; GCN: buffer_store_dword [[RESULT]],
; GCN: s_endpgm
}
; FUNC-LABEL: {{^}}lds_atomic_xchg_ret_i32_offset:
+; SICIVI: s_mov_b32 m0
+; GFX9-NOT: m0
+
; EG: LDS_WRXCHG_RET *
; GCN: ds_wrxchg_rtn_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:16
; GCN: s_endpgm
; XXX - Is it really necessary to load 4 into VGPR?
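; (Probably yes: DS data operands must live in VGPRs, so an immediate like 4 has to be materialized with v_mov_b32 first.)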
; FUNC-LABEL: {{^}}lds_atomic_add_ret_i32:
; EG: LDS_ADD_RET *
-; GCN: s_load_dword [[SPTR:s[0-9]+]],
-; GCN: v_mov_b32_e32 [[DATA:v[0-9]+]], 4
-; GCN: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[SPTR]]
+
+; SICIVI-DAG: s_mov_b32 m0
+; GFX9-NOT: m0
+
+; GCN-DAG: s_load_dword [[SPTR:s[0-9]+]],
+; GCN-DAG: v_mov_b32_e32 [[DATA:v[0-9]+]], 4
+; GCN-DAG: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[SPTR]]
; GCN: ds_add_rtn_u32 [[RESULT:v[0-9]+]], [[VPTR]], [[DATA]]
; GCN: buffer_store_dword [[RESULT]],
; GCN: s_endpgm
}
; FUNC-LABEL: {{^}}lds_atomic_add_ret_i32_offset:
+; SICIVI: s_mov_b32 m0
+; GFX9-NOT: m0
+
; EG: LDS_ADD_RET *
; GCN: ds_add_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:16
; GCN: s_endpgm
}
; FUNC-LABEL: {{^}}lds_atomic_add_ret_i32_bad_si_offset:
+; SICIVI: s_mov_b32 m0
+; GFX9-NOT: m0
+
; EG: LDS_ADD_RET *
; SI: ds_add_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
; CIVI: ds_add_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:16
; FUNC-LABEL: {{^}}lds_atomic_add1_ret_i32:
; EG: LDS_ADD_RET *
-; GCN: v_mov_b32_e32 [[ONE:v[0-9]+]], 1{{$}}
+
+; SICIVI-DAG: s_mov_b32 m0
+; GFX9-NOT: m0
+
+; GCN-DAG: v_mov_b32_e32 [[ONE:v[0-9]+]], 1{{$}}
; GCN: ds_add_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, [[ONE]]
; GCN: s_endpgm
define amdgpu_kernel void @lds_atomic_add1_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind {
; FUNC-LABEL: {{^}}lds_atomic_add1_ret_i32_offset:
; EG: LDS_ADD_RET *
-; GCN: v_mov_b32_e32 [[ONE:v[0-9]+]], 1{{$}}
+
+; SICIVI-DAG: s_mov_b32 m0
+; GFX9-NOT: m0
+
+; GCN-DAG: v_mov_b32_e32 [[ONE:v[0-9]+]], 1{{$}}
; GCN: ds_add_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, [[ONE]] offset:16
; GCN: s_endpgm
define amdgpu_kernel void @lds_atomic_add1_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind {
}
; FUNC-LABEL: {{^}}lds_atomic_add1_ret_i32_bad_si_offset:
+; SICIVI: s_mov_b32 m0
+; GFX9-NOT: m0
+
; EG: LDS_ADD_RET *
; SI: ds_add_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
; CIVI: ds_add_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:16
; FUNC-LABEL: {{^}}lds_atomic_sub_ret_i32:
; EG: LDS_SUB_RET *
+
+; SICIVI: s_mov_b32 m0
+; GFX9-NOT: m0
+
; GCN: ds_sub_rtn_u32
; GCN: s_endpgm
define amdgpu_kernel void @lds_atomic_sub_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind {
; FUNC-LABEL: {{^}}lds_atomic_sub_ret_i32_offset:
; EG: LDS_SUB_RET *
+
+; SICIVI: s_mov_b32 m0
+; GFX9-NOT: m0
+
; GCN: ds_sub_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:16
; GCN: s_endpgm
define amdgpu_kernel void @lds_atomic_sub_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind {
; FUNC-LABEL: {{^}}lds_atomic_sub1_ret_i32:
; EG: LDS_SUB_RET *
-; GCN: v_mov_b32_e32 [[ONE:v[0-9]+]], 1{{$}}
+
+; SICIVI-DAG: s_mov_b32 m0
+; GFX9-NOT: m0
+
+; GCN-DAG: v_mov_b32_e32 [[ONE:v[0-9]+]], 1{{$}}
; GCN: ds_sub_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, [[ONE]]
; GCN: s_endpgm
define amdgpu_kernel void @lds_atomic_sub1_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind {
; FUNC-LABEL: {{^}}lds_atomic_sub1_ret_i32_offset:
; EG: LDS_SUB_RET *
-; GCN: v_mov_b32_e32 [[ONE:v[0-9]+]], 1{{$}}
+
+; SICIVI-DAG: s_mov_b32 m0
+; GFX9-NOT: m0
+
+; GCN-DAG: v_mov_b32_e32 [[ONE:v[0-9]+]], 1{{$}}
; GCN: ds_sub_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, [[ONE]] offset:16
; GCN: s_endpgm
define amdgpu_kernel void @lds_atomic_sub1_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind {
; FUNC-LABEL: {{^}}lds_atomic_and_ret_i32:
; EG: LDS_AND_RET *
+
+; SICIVI-DAG: s_mov_b32 m0
+; GFX9-NOT: m0
+
; GCN: ds_and_rtn_b32
; GCN: s_endpgm
define amdgpu_kernel void @lds_atomic_and_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind {
}
; FUNC-LABEL: {{^}}lds_atomic_and_ret_i32_offset:
+; SICIVI: s_mov_b32 m0
+; GFX9-NOT: m0
+
; EG: LDS_AND_RET *
; GCN: ds_and_rtn_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:16
; GCN: s_endpgm
}
; FUNC-LABEL: {{^}}lds_atomic_or_ret_i32:
+; SICIVI: s_mov_b32 m0
+; GFX9-NOT: m0
+
; EG: LDS_OR_RET *
; GCN: ds_or_rtn_b32
; GCN: s_endpgm
}
; FUNC-LABEL: {{^}}lds_atomic_or_ret_i32_offset:
+; SICIVI: s_mov_b32 m0
+; GFX9-NOT: m0
+
; EG: LDS_OR_RET *
; GCN: ds_or_rtn_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:16
; GCN: s_endpgm
}
; FUNC-LABEL: {{^}}lds_atomic_xor_ret_i32:
+; SICIVI: s_mov_b32 m0
+; GFX9-NOT: m0
+
; EG: LDS_XOR_RET *
; GCN: ds_xor_rtn_b32
; GCN: s_endpgm
}
; FUNC-LABEL: {{^}}lds_atomic_xor_ret_i32_offset:
+; SICIVI: s_mov_b32 m0
+; GFX9-NOT: m0
+
; EG: LDS_XOR_RET *
; GCN: ds_xor_rtn_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:16
; GCN: s_endpgm
; }
; FUNC-LABEL: {{^}}lds_atomic_min_ret_i32:
+; SICIVI: s_mov_b32 m0
+; GFX9-NOT: m0
+
; EG: LDS_MIN_INT_RET *
; GCN: ds_min_rtn_i32
; GCN: s_endpgm
}
; FUNC-LABEL: {{^}}lds_atomic_min_ret_i32_offset:
+; SICIVI: s_mov_b32 m0
+; GFX9-NOT: m0
+
; EG: LDS_MIN_INT_RET *
; GCN: ds_min_rtn_i32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:16
; GCN: s_endpgm
}
; FUNC-LABEL: {{^}}lds_atomic_max_ret_i32:
+; SICIVI: s_mov_b32 m0
+; GFX9-NOT: m0
+
; EG: LDS_MAX_INT_RET *
; GCN: ds_max_rtn_i32
; GCN: s_endpgm
}
; FUNC-LABEL: {{^}}lds_atomic_max_ret_i32_offset:
+; SICIVI: s_mov_b32 m0
+; GFX9-NOT: m0
+
; EG: LDS_MAX_INT_RET *
; GCN: ds_max_rtn_i32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:16
; GCN: s_endpgm
}
; FUNC-LABEL: {{^}}lds_atomic_umin_ret_i32:
+; SICIVI: s_mov_b32 m0
+; GFX9-NOT: m0
+
; EG: LDS_MIN_UINT_RET *
; GCN: ds_min_rtn_u32
; GCN: s_endpgm
}
; FUNC-LABEL: {{^}}lds_atomic_umin_ret_i32_offset:
+; SICIVI: s_mov_b32 m0
+; GFX9-NOT: m0
+
; EG: LDS_MIN_UINT_RET *
; GCN: ds_min_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:16
; GCN: s_endpgm
}
; FUNC-LABEL: {{^}}lds_atomic_umax_ret_i32:
+; SICIVI: s_mov_b32 m0
+; GFX9-NOT: m0
+
; EG: LDS_MAX_UINT_RET *
; GCN: ds_max_rtn_u32
; GCN: s_endpgm
}
; FUNC-LABEL: {{^}}lds_atomic_umax_ret_i32_offset:
+; SICIVI: s_mov_b32 m0
+; GFX9-NOT: m0
+
; EG: LDS_MAX_UINT_RET *
; GCN: ds_max_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:16
; GCN: s_endpgm
}
; FUNC-LABEL: {{^}}lds_atomic_xchg_noret_i32:
-; GCN: s_load_dword [[SPTR:s[0-9]+]],
-; GCN: v_mov_b32_e32 [[DATA:v[0-9]+]], 4
-; GCN: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[SPTR]]
+; SICIVI-DAG: s_mov_b32 m0
+; GFX9-NOT: m0
+
+; GCN-DAG: s_load_dword [[SPTR:s[0-9]+]],
+; GCN-DAG: v_mov_b32_e32 [[DATA:v[0-9]+]], 4
+; GCN-DAG: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[SPTR]]
; GCN: ds_wrxchg_rtn_b32 [[RESULT:v[0-9]+]], [[VPTR]], [[DATA]]
; GCN: s_endpgm
define amdgpu_kernel void @lds_atomic_xchg_noret_i32(i32 addrspace(3)* %ptr) nounwind {
}
; FUNC-LABEL: {{^}}lds_atomic_xchg_noret_i32_offset:
+; SICIVI: s_mov_b32 m0
+; GFX9-NOT: m0
+
; GCN: ds_wrxchg_rtn_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:16
; GCN: s_endpgm
define amdgpu_kernel void @lds_atomic_xchg_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind {
}
; FUNC-LABEL: {{^}}lds_atomic_add_noret_i32:
-; GCN: s_load_dword [[SPTR:s[0-9]+]],
-; GCN: v_mov_b32_e32 [[DATA:v[0-9]+]], 4
-; GCN: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[SPTR]]
+; SICIVI-DAG: s_mov_b32 m0
+; GFX9-NOT: m0
+
+; GCN-DAG: s_load_dword [[SPTR:s[0-9]+]],
+; GCN-DAG: v_mov_b32_e32 [[DATA:v[0-9]+]], 4
+; GCN-DAG: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[SPTR]]
; GCN: ds_add_u32 [[VPTR]], [[DATA]]
; GCN: s_endpgm
define amdgpu_kernel void @lds_atomic_add_noret_i32(i32 addrspace(3)* %ptr) nounwind {
}
; FUNC-LABEL: {{^}}lds_atomic_add_noret_i32_offset:
+; SICIVI: s_mov_b32 m0
+; GFX9-NOT: m0
+
; GCN: ds_add_u32 v{{[0-9]+}}, v{{[0-9]+}} offset:16
; GCN: s_endpgm
define amdgpu_kernel void @lds_atomic_add_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind {
}
; FUNC-LABEL: {{^}}lds_atomic_add_noret_i32_bad_si_offset:
+; SICIVI: s_mov_b32 m0
+; GFX9-NOT: m0
+
; SI: ds_add_u32 v{{[0-9]+}}, v{{[0-9]+}}
; CIVI: ds_add_u32 v{{[0-9]+}}, v{{[0-9]+}} offset:16
; GCN: s_endpgm
}
; FUNC-LABEL: {{^}}lds_atomic_add1_noret_i32:
-; GCN: v_mov_b32_e32 [[ONE:v[0-9]+]], 1{{$}}
+; SICIVI-DAG: s_mov_b32 m0
+; GFX9-NOT: m0
+
+; GCN-DAG: v_mov_b32_e32 [[ONE:v[0-9]+]], 1{{$}}
; GCN: ds_add_u32 v{{[0-9]+}}, [[ONE]]
; GCN: s_endpgm
define amdgpu_kernel void @lds_atomic_add1_noret_i32(i32 addrspace(3)* %ptr) nounwind {
}
; FUNC-LABEL: {{^}}lds_atomic_add1_noret_i32_offset:
-; GCN: v_mov_b32_e32 [[ONE:v[0-9]+]], 1{{$}}
+; SICIVI-DAG: s_mov_b32 m0
+; GFX9-NOT: m0
+
+; GCN-DAG: v_mov_b32_e32 [[ONE:v[0-9]+]], 1{{$}}
; GCN: ds_add_u32 v{{[0-9]+}}, [[ONE]] offset:16
; GCN: s_endpgm
define amdgpu_kernel void @lds_atomic_add1_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind {
}
; FUNC-LABEL: {{^}}lds_atomic_add1_noret_i32_bad_si_offset:
+; SICIVI: s_mov_b32 m0
+; GFX9-NOT: m0
+
; SI: ds_add_u32 v{{[0-9]+}}, v{{[0-9]+}}
; CIVI: ds_add_u32 v{{[0-9]+}}, v{{[0-9]+}} offset:16
; GCN: s_endpgm
}
; FUNC-LABEL: {{^}}lds_atomic_sub_noret_i32:
+; SICIVI: s_mov_b32 m0
+; GFX9-NOT: m0
+
; GCN: ds_sub_u32
; GCN: s_endpgm
define amdgpu_kernel void @lds_atomic_sub_noret_i32(i32 addrspace(3)* %ptr) nounwind {
}
; FUNC-LABEL: {{^}}lds_atomic_sub_noret_i32_offset:
+; SICIVI: s_mov_b32 m0
+; GFX9-NOT: m0
+
; GCN: ds_sub_u32 v{{[0-9]+}}, v{{[0-9]+}} offset:16
; GCN: s_endpgm
define amdgpu_kernel void @lds_atomic_sub_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind {
}
; FUNC-LABEL: {{^}}lds_atomic_sub1_noret_i32:
-; GCN: v_mov_b32_e32 [[ONE:v[0-9]+]], 1{{$}}
+; SICIVI-DAG: s_mov_b32 m0
+; GFX9-NOT: m0
+
+; GCN-DAG: v_mov_b32_e32 [[ONE:v[0-9]+]], 1{{$}}
; GCN: ds_sub_u32 v{{[0-9]+}}, [[ONE]]
; GCN: s_endpgm
define amdgpu_kernel void @lds_atomic_sub1_noret_i32(i32 addrspace(3)* %ptr) nounwind {
}
; FUNC-LABEL: {{^}}lds_atomic_sub1_noret_i32_offset:
-; GCN: v_mov_b32_e32 [[ONE:v[0-9]+]], 1{{$}}
+; SICIVI-DAG: s_mov_b32 m0
+; GFX9-NOT: m0
+
+; GCN-DAG: v_mov_b32_e32 [[ONE:v[0-9]+]], 1{{$}}
; GCN: ds_sub_u32 v{{[0-9]+}}, [[ONE]] offset:16
; GCN: s_endpgm
define amdgpu_kernel void @lds_atomic_sub1_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind {
}
; FUNC-LABEL: {{^}}lds_atomic_and_noret_i32:
+; SICIVI: s_mov_b32 m0
+; GFX9-NOT: m0
+
; GCN: ds_and_b32
; GCN: s_endpgm
define amdgpu_kernel void @lds_atomic_and_noret_i32(i32 addrspace(3)* %ptr) nounwind {
}
; FUNC-LABEL: {{^}}lds_atomic_and_noret_i32_offset:
+; SICIVI: s_mov_b32 m0
+; GFX9-NOT: m0
+
; GCN: ds_and_b32 v{{[0-9]+}}, v{{[0-9]+}} offset:16
; GCN: s_endpgm
define amdgpu_kernel void @lds_atomic_and_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind {
}
; FUNC-LABEL: {{^}}lds_atomic_or_noret_i32:
+; SICIVI: s_mov_b32 m0
+; GFX9-NOT: m0
+
; GCN: ds_or_b32
; GCN: s_endpgm
define amdgpu_kernel void @lds_atomic_or_noret_i32(i32 addrspace(3)* %ptr) nounwind {
}
; FUNC-LABEL: {{^}}lds_atomic_or_noret_i32_offset:
+; SICIVI: s_mov_b32 m0
+; GFX9-NOT: m0
+
; GCN: ds_or_b32 v{{[0-9]+}}, v{{[0-9]+}} offset:16
; GCN: s_endpgm
define amdgpu_kernel void @lds_atomic_or_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind {
}
; FUNC-LABEL: {{^}}lds_atomic_xor_noret_i32:
+; SICIVI: s_mov_b32 m0
+; GFX9-NOT: m0
+
; GCN: ds_xor_b32
; GCN: s_endpgm
define amdgpu_kernel void @lds_atomic_xor_noret_i32(i32 addrspace(3)* %ptr) nounwind {
}
; FUNC-LABEL: {{^}}lds_atomic_xor_noret_i32_offset:
+; SICIVI: s_mov_b32 m0
+; GFX9-NOT: m0
+
; GCN: ds_xor_b32 v{{[0-9]+}}, v{{[0-9]+}} offset:16
; GCN: s_endpgm
define amdgpu_kernel void @lds_atomic_xor_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind {
; }
; FUNC-LABEL: {{^}}lds_atomic_min_noret_i32:
+; SICIVI: s_mov_b32 m0
+; GFX9-NOT: m0
+
; GCN: ds_min_i32
; GCN: s_endpgm
define amdgpu_kernel void @lds_atomic_min_noret_i32(i32 addrspace(3)* %ptr) nounwind {
}
; FUNC-LABEL: {{^}}lds_atomic_min_noret_i32_offset:
+; SICIVI: s_mov_b32 m0
+; GFX9-NOT: m0
+
; GCN: ds_min_i32 v{{[0-9]+}}, v{{[0-9]+}} offset:16
; GCN: s_endpgm
define amdgpu_kernel void @lds_atomic_min_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind {
}
; FUNC-LABEL: {{^}}lds_atomic_max_noret_i32:
+; SICIVI: s_mov_b32 m0
+; GFX9-NOT: m0
+
; GCN: ds_max_i32
; GCN: s_endpgm
define amdgpu_kernel void @lds_atomic_max_noret_i32(i32 addrspace(3)* %ptr) nounwind {
}
; FUNC-LABEL: {{^}}lds_atomic_max_noret_i32_offset:
+; SICIVI: s_mov_b32 m0
+; GFX9-NOT: m0
+
; GCN: ds_max_i32 v{{[0-9]+}}, v{{[0-9]+}} offset:16
; GCN: s_endpgm
define amdgpu_kernel void @lds_atomic_max_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind {
}
; FUNC-LABEL: {{^}}lds_atomic_umin_noret_i32:
+; SICIVI: s_mov_b32 m0
+; GFX9-NOT: m0
+
; GCN: ds_min_u32
; GCN: s_endpgm
define amdgpu_kernel void @lds_atomic_umin_noret_i32(i32 addrspace(3)* %ptr) nounwind {
}
; FUNC-LABEL: {{^}}lds_atomic_umin_noret_i32_offset:
+; SICIVI: s_mov_b32 m0
+; GFX9-NOT: m0
+
; GCN: ds_min_u32 v{{[0-9]+}}, v{{[0-9]+}} offset:16
; GCN: s_endpgm
define amdgpu_kernel void @lds_atomic_umin_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind {
}
; FUNC-LABEL: {{^}}lds_atomic_umax_noret_i32:
+; SICIVI: s_mov_b32 m0
+; GFX9-NOT: m0
+
; GCN: ds_max_u32
; GCN: s_endpgm
define amdgpu_kernel void @lds_atomic_umax_noret_i32(i32 addrspace(3)* %ptr) nounwind {
}
; FUNC-LABEL: {{^}}lds_atomic_umax_noret_i32_offset:
+; SICIVI: s_mov_b32 m0
+; GFX9-NOT: m0
+
; GCN: ds_max_u32 v{{[0-9]+}}, v{{[0-9]+}} offset:16
; GCN: s_endpgm
define amdgpu_kernel void @lds_atomic_umax_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind {
-; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -strict-whitespace -check-prefix=SI -check-prefix=GCN %s
-; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -strict-whitespace -check-prefix=VI -check-prefix=GCN %s
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -enable-var-scope -strict-whitespace -check-prefixes=GCN,SI,SICIVI %s
+; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -strict-whitespace -check-prefixes=GCN,VI,SICIVI,GFX89 %s
+; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -strict-whitespace -check-prefixes=GCN,GFX9,GFX89 %s
; GCN-LABEL: {{^}}lds_atomic_xchg_ret_i64:
+; SICIVI: s_mov_b32 m0
+; GFX9-NOT: m0
+
; GCN: ds_wrxchg_rtn_b64
; GCN: s_endpgm
define amdgpu_kernel void @lds_atomic_xchg_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
}
; GCN-LABEL: {{^}}lds_atomic_xchg_ret_i64_offset:
+; SICIVI: s_mov_b32 m0
+; GFX9-NOT: m0
+
; GCN: ds_wrxchg_rtn_b64 {{.*}} offset:32
; GCN: s_endpgm
define amdgpu_kernel void @lds_atomic_xchg_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
}
; GCN-LABEL: {{^}}lds_atomic_add_ret_i64:
+; SICIVI: s_mov_b32 m0
+; GFX9-NOT: m0
+
; GCN: ds_add_rtn_u64
; GCN: s_endpgm
define amdgpu_kernel void @lds_atomic_add_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
}
; GCN-LABEL: {{^}}lds_atomic_add_ret_i64_offset:
-; SI: s_load_dword [[PTR:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb
-; VI: s_load_dword [[PTR:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x2c
-; GCN: v_mov_b32_e32 v[[LOVDATA:[0-9]+]], 9
-; GCN: v_mov_b32_e32 v[[HIVDATA:[0-9]+]], 0
+; SICIVI-DAG: s_mov_b32 m0
+; GFX9-NOT: m0
+
+; SI-DAG: s_load_dword [[PTR:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb
+; GFX89-DAG: s_load_dword [[PTR:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x2c
+; GCN-DAG: v_mov_b32_e32 v[[LOVDATA:[0-9]+]], 9
+; GCN-DAG: v_mov_b32_e32 v[[HIVDATA:[0-9]+]], 0
; GCN-DAG: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[PTR]]
; GCN: ds_add_rtn_u64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[VPTR]], v{{\[}}[[LOVDATA]]:[[HIVDATA]]{{\]}} offset:32
; GCN: buffer_store_dwordx2 [[RESULT]],
}
; GCN-LABEL: {{^}}lds_atomic_add1_ret_i64:
-; GCN: v_mov_b32_e32 v[[LOVDATA:[0-9]+]], 1{{$}}
-; GCN: v_mov_b32_e32 v[[HIVDATA:[0-9]+]], 0{{$}}
-; GCN: ds_add_rtn_u64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[VPTR]], v{{\[}}[[LOVDATA]]:[[HIVDATA]]{{\]}}
+; SICIVI-DAG: s_mov_b32 m0
+; GFX9-NOT: m0
+
+; GCN-DAG: v_mov_b32_e32 v[[LOVDATA:[0-9]+]], 1{{$}}
+; GCN-DAG: v_mov_b32_e32 v[[HIVDATA:[0-9]+]], 0{{$}}
+; GCN: ds_add_rtn_u64 [[RESULT:v\[[0-9]+:[0-9]+\]]], {{v[0-9]+}}, v{{\[}}[[LOVDATA]]:[[HIVDATA]]{{\]}}
; GCN: buffer_store_dwordx2 [[RESULT]],
; GCN: s_endpgm
define amdgpu_kernel void @lds_atomic_add1_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
}
; GCN-LABEL: {{^}}lds_atomic_add1_ret_i64_offset:
+; SICIVI: s_mov_b32 m0
+; GFX9-NOT: m0
+
; GCN: ds_add_rtn_u64 {{.*}} offset:32
; GCN: s_endpgm
define amdgpu_kernel void @lds_atomic_add1_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
}
; GCN-LABEL: {{^}}lds_atomic_sub_ret_i64:
+; SICIVI: s_mov_b32 m0
+; GFX9-NOT: m0
+
; GCN: ds_sub_rtn_u64
; GCN: s_endpgm
define amdgpu_kernel void @lds_atomic_sub_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
}
; GCN-LABEL: {{^}}lds_atomic_sub_ret_i64_offset:
+; SICIVI: s_mov_b32 m0
+; GFX9-NOT: m0
+
; GCN: ds_sub_rtn_u64 {{.*}} offset:32
; GCN: s_endpgm
define amdgpu_kernel void @lds_atomic_sub_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
}
; GCN-LABEL: {{^}}lds_atomic_sub1_ret_i64:
-; GCN: v_mov_b32_e32 v[[LOVDATA:[0-9]+]], 1{{$}}
-; GCN: v_mov_b32_e32 v[[HIVDATA:[0-9]+]], 0{{$}}
-; GCN: ds_sub_rtn_u64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[VPTR]], v{{\[}}[[LOVDATA]]:[[HIVDATA]]{{\]}}
+; SICIVI-DAG: s_mov_b32 m0
+; GFX9-NOT: m0
+
+; GCN-DAG: v_mov_b32_e32 v[[LOVDATA:[0-9]+]], 1{{$}}
+; GCN-DAG: v_mov_b32_e32 v[[HIVDATA:[0-9]+]], 0{{$}}
+; GCN: ds_sub_rtn_u64 [[RESULT:v\[[0-9]+:[0-9]+\]]], {{v[0-9]+}}, v{{\[}}[[LOVDATA]]:[[HIVDATA]]{{\]}}
; GCN: buffer_store_dwordx2 [[RESULT]],
; GCN: s_endpgm
define amdgpu_kernel void @lds_atomic_sub1_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
}
; GCN-LABEL: {{^}}lds_atomic_sub1_ret_i64_offset:
+; SICIVI: s_mov_b32 m0
+; GFX9-NOT: m0
+
; GCN: ds_sub_rtn_u64 {{.*}} offset:32
; GCN: s_endpgm
define amdgpu_kernel void @lds_atomic_sub1_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
}
; GCN-LABEL: {{^}}lds_atomic_and_ret_i64:
+; SICIVI: s_mov_b32 m0
+; GFX9-NOT: m0
+
; GCN: ds_and_rtn_b64
; GCN: s_endpgm
define amdgpu_kernel void @lds_atomic_and_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
}
; GCN-LABEL: {{^}}lds_atomic_and_ret_i64_offset:
+; SICIVI: s_mov_b32 m0
+; GFX9-NOT: m0
+
; GCN: ds_and_rtn_b64 {{.*}} offset:32
; GCN: s_endpgm
define amdgpu_kernel void @lds_atomic_and_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
}
; GCN-LABEL: {{^}}lds_atomic_or_ret_i64:
+; SICIVI: s_mov_b32 m0
+; GFX9-NOT: m0
+
; GCN: ds_or_rtn_b64
; GCN: s_endpgm
define amdgpu_kernel void @lds_atomic_or_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
}
; GCN-LABEL: {{^}}lds_atomic_or_ret_i64_offset:
+; SICIVI: s_mov_b32 m0
+; GFX9-NOT: m0
+
; GCN: ds_or_rtn_b64 {{.*}} offset:32
; GCN: s_endpgm
define amdgpu_kernel void @lds_atomic_or_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
}
; GCN-LABEL: {{^}}lds_atomic_xor_ret_i64:
+; SICIVI: s_mov_b32 m0
+; GFX9-NOT: m0
+
; GCN: ds_xor_rtn_b64
; GCN: s_endpgm
define amdgpu_kernel void @lds_atomic_xor_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
}
; GCN-LABEL: {{^}}lds_atomic_xor_ret_i64_offset:
+; SICIVI: s_mov_b32 m0
+; GFX9-NOT: m0
+
; GCN: ds_xor_rtn_b64 {{.*}} offset:32
; GCN: s_endpgm
define amdgpu_kernel void @lds_atomic_xor_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
; }
; GCN-LABEL: {{^}}lds_atomic_min_ret_i64:
+; SICIVI: s_mov_b32 m0
+; GFX9-NOT: m0
+
; GCN: ds_min_rtn_i64
; GCN: s_endpgm
define amdgpu_kernel void @lds_atomic_min_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
}
; GCN-LABEL: {{^}}lds_atomic_min_ret_i64_offset:
+; SICIVI: s_mov_b32 m0
+; GFX9-NOT: m0
+
; GCN: ds_min_rtn_i64 {{.*}} offset:32
; GCN: s_endpgm
define amdgpu_kernel void @lds_atomic_min_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
}
; GCN-LABEL: {{^}}lds_atomic_max_ret_i64:
+; SICIVI: s_mov_b32 m0
+; GFX9-NOT: m0
+
; GCN: ds_max_rtn_i64
; GCN: s_endpgm
define amdgpu_kernel void @lds_atomic_max_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
}
; GCN-LABEL: {{^}}lds_atomic_max_ret_i64_offset:
+; SICIVI: s_mov_b32 m0
+; GFX9-NOT: m0
+
; GCN: ds_max_rtn_i64 {{.*}} offset:32
; GCN: s_endpgm
define amdgpu_kernel void @lds_atomic_max_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
}
; GCN-LABEL: {{^}}lds_atomic_umin_ret_i64:
+; SICIVI: s_mov_b32 m0
+; GFX9-NOT: m0
+
; GCN: ds_min_rtn_u64
; GCN: s_endpgm
define amdgpu_kernel void @lds_atomic_umin_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
}
; GCN-LABEL: {{^}}lds_atomic_umin_ret_i64_offset:
+; SICIVI: s_mov_b32 m0
+; GFX9-NOT: m0
+
; GCN: ds_min_rtn_u64 {{.*}} offset:32
; GCN: s_endpgm
define amdgpu_kernel void @lds_atomic_umin_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
}
; GCN-LABEL: {{^}}lds_atomic_umax_ret_i64:
+; SICIVI: s_mov_b32 m0
+; GFX9-NOT: m0
+
; GCN: ds_max_rtn_u64
; GCN: s_endpgm
define amdgpu_kernel void @lds_atomic_umax_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
}
; GCN-LABEL: {{^}}lds_atomic_umax_ret_i64_offset:
+; SICIVI: s_mov_b32 m0
+; GFX9-NOT: m0
+
; GCN: ds_max_rtn_u64 {{.*}} offset:32
; GCN: s_endpgm
define amdgpu_kernel void @lds_atomic_umax_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
}
; GCN-LABEL: {{^}}lds_atomic_xchg_noret_i64:
+; SICIVI: s_mov_b32 m0
+; GFX9-NOT: m0
+
; GCN: ds_wrxchg_rtn_b64
; GCN: s_endpgm
define amdgpu_kernel void @lds_atomic_xchg_noret_i64(i64 addrspace(3)* %ptr) nounwind {
}
; GCN-LABEL: {{^}}lds_atomic_xchg_noret_i64_offset:
+; SICIVI: s_mov_b32 m0
+; GFX9-NOT: m0
+
; GCN: ds_wrxchg_rtn_b64 {{.*}} offset:32
; GCN: s_endpgm
define amdgpu_kernel void @lds_atomic_xchg_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind {
}
; GCN-LABEL: {{^}}lds_atomic_add_noret_i64:
+; SICIVI: s_mov_b32 m0
+; GFX9-NOT: m0
+
; GCN: ds_add_u64
; GCN: s_endpgm
define amdgpu_kernel void @lds_atomic_add_noret_i64(i64 addrspace(3)* %ptr) nounwind {
}
; GCN-LABEL: {{^}}lds_atomic_add_noret_i64_offset:
-; SI: s_load_dword [[PTR:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x9
-; VI: s_load_dword [[PTR:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x24
-; GCN: v_mov_b32_e32 v[[LOVDATA:[0-9]+]], 9
-; GCN: v_mov_b32_e32 v[[HIVDATA:[0-9]+]], 0
+; SICIVI-DAG: s_mov_b32 m0
+; GFX9-NOT: m0
+
+; SI-DAG: s_load_dword [[PTR:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x9
+; GFX89-DAG: s_load_dword [[PTR:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x24
+; GCN-DAG: v_mov_b32_e32 v[[LOVDATA:[0-9]+]], 9
+; GCN-DAG: v_mov_b32_e32 v[[HIVDATA:[0-9]+]], 0
; GCN: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[PTR]]
-; GCN: ds_add_u64 [[VPTR]], v{{\[}}[[LOVDATA]]:[[HIVDATA]]{{\]}} offset:32
+; GCN: ds_add_u64 {{v[0-9]+}}, v{{\[}}[[LOVDATA]]:[[HIVDATA]]{{\]}} offset:32
; GCN: s_endpgm
define amdgpu_kernel void @lds_atomic_add_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind {
%gep = getelementptr i64, i64 addrspace(3)* %ptr, i64 4
}
; GCN-LABEL: {{^}}lds_atomic_add1_noret_i64:
+; SICIVI-DAG: s_mov_b32 m0
+; GFX9-NOT: m0
+
; GCN-DAG: v_mov_b32_e32 v[[LOVDATA:[0-9]+]], 1{{$}}
; GCN-DAG: v_mov_b32_e32 v[[HIVDATA:[0-9]+]], 0{{$}}
-; GCN: ds_add_u64 [[VPTR]], v{{\[}}[[LOVDATA]]:[[HIVDATA]]{{\]}}
+; GCN: ds_add_u64 {{v[0-9]+}}, v{{\[}}[[LOVDATA]]:[[HIVDATA]]{{\]}}
; GCN: s_endpgm
define amdgpu_kernel void @lds_atomic_add1_noret_i64(i64 addrspace(3)* %ptr) nounwind {
%result = atomicrmw add i64 addrspace(3)* %ptr, i64 1 seq_cst
}
; GCN-LABEL: {{^}}lds_atomic_add1_noret_i64_offset:
+; SICIVI: s_mov_b32 m0
+; GFX9-NOT: m0
+
; GCN: ds_add_u64 {{.*}} offset:32
; GCN: s_endpgm
define amdgpu_kernel void @lds_atomic_add1_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind {
}
; GCN-LABEL: {{^}}lds_atomic_sub_noret_i64:
+; SICIVI: s_mov_b32 m0
+; GFX9-NOT: m0
+
; GCN: ds_sub_u64
; GCN: s_endpgm
define amdgpu_kernel void @lds_atomic_sub_noret_i64(i64 addrspace(3)* %ptr) nounwind {
}
; GCN-LABEL: {{^}}lds_atomic_sub_noret_i64_offset:
+; SICIVI: s_mov_b32 m0
+; GFX9-NOT: m0
+
; GCN: ds_sub_u64 {{.*}} offset:32
; GCN: s_endpgm
define amdgpu_kernel void @lds_atomic_sub_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind {
}
; GCN-LABEL: {{^}}lds_atomic_sub1_noret_i64:
-; GCN: v_mov_b32_e32 v[[LOVDATA:[0-9]+]], 1{{$}}
-; GCN: v_mov_b32_e32 v[[HIVDATA:[0-9]+]], 0{{$}}
-; GCN: ds_sub_u64 [[VPTR]], v{{\[}}[[LOVDATA]]:[[HIVDATA]]{{\]}}
+; SICIVI-DAG: s_mov_b32 m0
+; GFX9-NOT: m0
+
+; GCN-DAG: v_mov_b32_e32 v[[LOVDATA:[0-9]+]], 1{{$}}
+; GCN-DAG: v_mov_b32_e32 v[[HIVDATA:[0-9]+]], 0{{$}}
+; GCN: ds_sub_u64 {{v[0-9]+}}, v{{\[}}[[LOVDATA]]:[[HIVDATA]]{{\]}}
; GCN: s_endpgm
define amdgpu_kernel void @lds_atomic_sub1_noret_i64(i64 addrspace(3)* %ptr) nounwind {
%result = atomicrmw sub i64 addrspace(3)* %ptr, i64 1 seq_cst
}
; GCN-LABEL: {{^}}lds_atomic_sub1_noret_i64_offset:
+; SICIVI: s_mov_b32 m0
+; GFX9-NOT: m0
+
; GCN: ds_sub_u64 {{.*}} offset:32
; GCN: s_endpgm
define amdgpu_kernel void @lds_atomic_sub1_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind {
}
; GCN-LABEL: {{^}}lds_atomic_and_noret_i64:
+; SICIVI: s_mov_b32 m0
+; GFX9-NOT: m0
+
; GCN: ds_and_b64
; GCN: s_endpgm
define amdgpu_kernel void @lds_atomic_and_noret_i64(i64 addrspace(3)* %ptr) nounwind {
}
; GCN-LABEL: {{^}}lds_atomic_and_noret_i64_offset:
+; SICIVI: s_mov_b32 m0
+; GFX9-NOT: m0
+
; GCN: ds_and_b64 {{.*}} offset:32
; GCN: s_endpgm
define amdgpu_kernel void @lds_atomic_and_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind {
}
; GCN-LABEL: {{^}}lds_atomic_or_noret_i64:
+; SICIVI: s_mov_b32 m0
+; GFX9-NOT: m0
+
; GCN: ds_or_b64
; GCN: s_endpgm
define amdgpu_kernel void @lds_atomic_or_noret_i64(i64 addrspace(3)* %ptr) nounwind {
}
; GCN-LABEL: {{^}}lds_atomic_or_noret_i64_offset:
+; SICIVI: s_mov_b32 m0
+; GFX9-NOT: m0
+
; GCN: ds_or_b64 {{.*}} offset:32
; GCN: s_endpgm
define amdgpu_kernel void @lds_atomic_or_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind {
}
; GCN-LABEL: {{^}}lds_atomic_xor_noret_i64:
+; SICIVI: s_mov_b32 m0
+; GFX9-NOT: m0
+
; GCN: ds_xor_b64
; GCN: s_endpgm
define amdgpu_kernel void @lds_atomic_xor_noret_i64(i64 addrspace(3)* %ptr) nounwind {
}
; GCN-LABEL: {{^}}lds_atomic_xor_noret_i64_offset:
+; SICIVI: s_mov_b32 m0
+; GFX9-NOT: m0
+
; GCN: ds_xor_b64 {{.*}} offset:32
; GCN: s_endpgm
define amdgpu_kernel void @lds_atomic_xor_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind {
; }
; GCN-LABEL: {{^}}lds_atomic_min_noret_i64:
+; SICIVI: s_mov_b32 m0
+; GFX9-NOT: m0
+
; GCN: ds_min_i64
; GCN: s_endpgm
define amdgpu_kernel void @lds_atomic_min_noret_i64(i64 addrspace(3)* %ptr) nounwind {
}
; GCN-LABEL: {{^}}lds_atomic_min_noret_i64_offset:
+; SICIVI: s_mov_b32 m0
+; GFX9-NOT: m0
+
; GCN: ds_min_i64 {{.*}} offset:32
; GCN: s_endpgm
define amdgpu_kernel void @lds_atomic_min_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind {
}
; GCN-LABEL: {{^}}lds_atomic_max_noret_i64:
+; SICIVI: s_mov_b32 m0
+; GFX9-NOT: m0
+
; GCN: ds_max_i64
; GCN: s_endpgm
define amdgpu_kernel void @lds_atomic_max_noret_i64(i64 addrspace(3)* %ptr) nounwind {
}
; GCN-LABEL: {{^}}lds_atomic_max_noret_i64_offset:
+; SICIVI: s_mov_b32 m0
+; GFX9-NOT: m0
+
; GCN: ds_max_i64 {{.*}} offset:32
; GCN: s_endpgm
define amdgpu_kernel void @lds_atomic_max_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind {
}
; GCN-LABEL: {{^}}lds_atomic_umin_noret_i64:
+; SICIVI: s_mov_b32 m0
+; GFX9-NOT: m0
+
; GCN: ds_min_u64
; GCN: s_endpgm
define amdgpu_kernel void @lds_atomic_umin_noret_i64(i64 addrspace(3)* %ptr) nounwind {
}
; GCN-LABEL: {{^}}lds_atomic_umin_noret_i64_offset:
+; SICIVI: s_mov_b32 m0
+; GFX9-NOT: m0
+
; GCN: ds_min_u64 {{.*}} offset:32
; GCN: s_endpgm
define amdgpu_kernel void @lds_atomic_umin_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind {
}
; GCN-LABEL: {{^}}lds_atomic_umax_noret_i64:
+; SICIVI: s_mov_b32 m0
+; GFX9-NOT: m0
+
; GCN: ds_max_u64
; GCN: s_endpgm
define amdgpu_kernel void @lds_atomic_umax_noret_i64(i64 addrspace(3)* %ptr) nounwind {
}
; GCN-LABEL: {{^}}lds_atomic_umax_noret_i64_offset:
+; SICIVI: s_mov_b32 m0
+; GFX9-NOT: m0
+
; GCN: ds_max_u64 {{.*}} offset:32
; GCN: s_endpgm
define amdgpu_kernel void @lds_atomic_umax_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind {
; GCN: ds_read_b32 [[VEC0:v[0-9]+]]
; GCN: ds_read_b32 [[VEC1:v[0-9]+]]
; GCN: ds_read_u16 [[PACKED:v[0-9]+]]
-; GCN-NEXT: s_waitcnt
+; GCN: s_waitcnt
; GCN: ds_read_u16_d16_hi [[PACKED]]
; GCN: v_pk_fma_f16 v{{[0-9]+}}, [[VEC0]], [[VEC1]], [[PACKED]] neg_lo:[0,0,1] neg_hi:[0,0,1]{{$}}
-; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC %s
-; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC %s
-; RUN: llc -march=r600 -mtriple=r600---amdgiz -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
-; RUN: llc -march=r600 -mtriple=r600---amdgiz -mcpu=cayman < %s | FileCheck -check-prefix=CM -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SICIVI,FUNC %s
+; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SICIVI,FUNC %s
+; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX9,FUNC %s
+; RUN: llc -march=r600 -mtriple=r600---amdgiz -mcpu=redwood < %s | FileCheck -check-prefixes=EG,FUNC %s
+; RUN: llc -march=r600 -mtriple=r600---amdgiz -mcpu=cayman < %s | FileCheck -check-prefixes=CM,FUNC %s
; FUNC-LABEL: {{^}}store_local_i1:
+; SICIVI: s_mov_b32 m0
+; GFX9-NOT: m0
+
; EG: LDS_BYTE_WRITE
; CM: LDS_BYTE_WRITE
}
; FUNC-LABEL: {{^}}store_local_i8:
+; SICIVI: s_mov_b32 m0
+; GFX9-NOT: m0
+
; EG: LDS_BYTE_WRITE
; CM: LDS_BYTE_WRITE
}
; FUNC-LABEL: {{^}}store_local_i16:
+; SICIVI: s_mov_b32 m0
+; GFX9-NOT: m0
+
; EG: LDS_SHORT_WRITE
; CM: LDS_SHORT_WRITE
}
; FUNC-LABEL: {{^}}store_local_v2i16:
+; SICIVI: s_mov_b32 m0
+; GFX9-NOT: m0
+
; EG: LDS_WRITE
; CM: LDS_WRITE
}
; FUNC-LABEL: {{^}}store_local_v4i8:
+; SICIVI: s_mov_b32 m0
+; GFX9-NOT: m0
+
; EG: LDS_WRITE
; CM: LDS_WRITE
}
; FUNC-LABEL: {{^}}store_local_v4i8_unaligned:
+; SICIVI: s_mov_b32 m0
+; GFX9-NOT: m0
+
; EG: LDS_BYTE_WRITE
; EG: LDS_BYTE_WRITE
; EG: LDS_BYTE_WRITE
}
; FUNC-LABEL: {{^}}store_local_v4i8_halfaligned:
+; SICIVI: s_mov_b32 m0
+; GFX9-NOT: m0
+
; EG: LDS_SHORT_WRITE
; EG: LDS_SHORT_WRITE
; EG-NOT: LDS_WRITE
}
; FUNC-LABEL: {{^}}store_local_v2i32:
+; SICIVI: s_mov_b32 m0
+; GFX9-NOT: m0
+
; EG: LDS_WRITE
; EG: LDS_WRITE
; EG-NOT: LDS_WRITE
}
; FUNC-LABEL: {{^}}store_local_v4i32:
+; SICIVI: s_mov_b32 m0
+; GFX9-NOT: m0
+
; EG: LDS_WRITE
; EG: LDS_WRITE
; EG: LDS_WRITE
}
; FUNC-LABEL: {{^}}store_local_v4i32_align4:
+; SICIVI: s_mov_b32 m0
+; GFX9-NOT: m0
+
; EG: LDS_WRITE
; EG: LDS_WRITE
; EG: LDS_WRITE
}
; FUNC-LABEL: {{^}}store_local_i64_i8:
+; SICIVI: s_mov_b32 m0
+; GFX9-NOT: m0
+
; EG: LDS_BYTE_WRITE
; GCN: ds_write_b8
define amdgpu_kernel void @store_local_i64_i8(i8 addrspace(3)* %out, i64 %in) {
}
; FUNC-LABEL: {{^}}store_local_i64_i16:
+; SICIVI: s_mov_b32 m0
+; GFX9-NOT: m0
+
; EG: LDS_SHORT_WRITE
; GCN: ds_write_b16
define amdgpu_kernel void @store_local_i64_i16(i16 addrspace(3)* %out, i64 %in) {
}
; GCN-LABEL: {{^}}local_store_i65:
-; GCN-DAG: ds_write_b8 v{{[0-9]+}}, v0 offset:8
+; GCN-DAG: ds_write_b8 v{{[0-9]+}}, v{{[0-9]+}} offset:8
; GCN-DAG: ds_write_b64
define amdgpu_kernel void @local_store_i65(i65 addrspace(3)* %ptr, i65 %arg) #0 {
store i65 %arg, i65 addrspace(3)* %ptr, align 8