return false;
}
-bool AMDGPUTargetLowering::shouldCombineMemoryType(const MemSDNode *M) const {
- EVT VT = M->getMemoryVT();
-
+bool AMDGPUTargetLowering::shouldCombineMemoryType(EVT VT) const {
// i32 vectors are the canonical memory type.
if (VT.getScalarType() == MVT::i32 || isTypeLegal(VT))
return false;
-
if (!VT.isByteSized())
return false;
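+ // Reject sizes with no i32-based equivalent memory type.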
unsigned Size = VT.getStoreSize();
if (Size == 3 || (Size > 4 && (Size % 4 != 0)))
return false;
- unsigned Align = M->getAlignment();
- if (Align < Size) {
- bool IsFast;
- if (!allowsMisalignedMemoryAccesses(VT, M->getAddressSpace(), Align, &IsFast) ||
- !IsFast) {
- return false;
- }
- }
-
return true;
}
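+// For reference (an assumption about the surrounding code, not part of this
+// patch): getEquivalentMemType is expected to map store sizes of at most 32
+// bits to the integer of that width, and larger dword-multiple sizes to an
+// i32 vector, roughly:
+//   unsigned StoreSize = VT.getStoreSizeInBits();
+//   if (StoreSize <= 32)
+//     return EVT::getIntegerVT(Ctx, StoreSize);
+//   return EVT::getVectorVT(Ctx, MVT::i32, StoreSize / 32);
+// This is why sizes of 3, or over 4 and not dword-divisible, are rejected.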
if (LN->isVolatile() || !ISD::isNormalLoad(LN) || hasVolatileUser(LN))
return SDValue();
- if (!shouldCombineMemoryType(LN))
- return SDValue();
-
SDLoc SL(N);
SelectionDAG &DAG = DCI.DAG;
EVT VT = LN->getMemoryVT();
+
+ unsigned Size = VT.getStoreSize();
+ unsigned Align = LN->getAlignment();
+ if (Align < Size && isTypeLegal(VT)) {
+ bool IsFast;
+ unsigned AS = LN->getAddressSpace();
+
+ // Expand unaligned loads earlier than legalization. Due to visitation
+ // order problems during legalization, the instructions emitted to pack
+ // and unpack the bytes are not eliminated in the case of an unaligned
+ // copy.
+ if (!allowsMisalignedMemoryAccesses(VT, AS, Align, &IsFast)) {
+ SDValue Ops[2];
+ std::tie(Ops[0], Ops[1]) = expandUnalignedLoad(LN, DAG);
+ return DAG.getMergeValues(Ops, SDLoc(N));
+ }
+
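+ // A misaligned access can be legal yet slow; if so, leave the load
+ // alone instead of combining it to a wider type.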
+ if (!IsFast)
+ return SDValue();
+ }
+
+ if (!shouldCombineMemoryType(VT))
+ return SDValue();
+
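+ // Combine to the canonical i32-based equivalent of the memory type; users
+ // of the original load receive a bitcast back to VT.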
EVT NewVT = getEquivalentMemType(*DAG.getContext(), VT);
SDValue NewLoad
if (SN->isVolatile() || !ISD::isNormalStore(SN))
return SDValue();
- if (!shouldCombineMemoryType(SN))
- return SDValue();
-
- SDValue Val = SN->getValue();
EVT VT = SN->getMemoryVT();
+ unsigned Size = VT.getStoreSize();
SDLoc SL(N);
SelectionDAG &DAG = DCI.DAG;
+ unsigned Align = SN->getAlignment();
+ if (Align < Size && isTypeLegal(VT)) {
+ bool IsFast;
+ unsigned AS = SN->getAddressSpace();
+
+ // Expand unaligned stores earlier than legalization. Due to visitation
+ // order problems during legalization, the instructions emitted to pack
+ // and unpack the bytes are not eliminated in the case of an unaligned
+ // copy.
+ if (!allowsMisalignedMemoryAccesses(VT, AS, Align, &IsFast))
+ return expandUnalignedStore(SN, DAG);
+
+ if (!IsFast)
+ return SDValue();
+ }
+
+ if (!shouldCombineMemoryType(VT))
+ return SDValue();
+
EVT NewVT = getEquivalentMemType(*DAG.getContext(), VT);
+ SDValue Val = SN->getValue();
+
+ //DCI.AddToWorklist(Val.getNode());
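+ // Other users of Val are rewritten to use a bitcast of the new value back
+ // to the original type.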
bool OtherUses = !Val.hasOneUse();
SDValue CastVal = DAG.getNode(ISD::BITCAST, SL, NewVT, Val);
ret void
}
-; FUNC-LABEL: {{^}}unaligned_load_store_i16_global:
+; FUNC-LABEL: {{^}}global_unaligned_load_store_i16:
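+; An align-1 i16 copy must be split into two byte loads and two byte stores.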
; GCN-NOHSA: buffer_load_ubyte
; GCN-NOHSA: buffer_load_ubyte
; GCN-NOHSA: buffer_store_byte
; GCN-NOHSA: buffer_store_byte
; GCN-HSA: flat_load_ubyte
; GCN-HSA: flat_load_ubyte
; GCN-HSA: flat_store_byte
; GCN-HSA: flat_store_byte
-define void @unaligned_load_store_i16_global(i16 addrspace(1)* %p, i16 addrspace(1)* %r) #0 {
+define void @global_unaligned_load_store_i16(i16 addrspace(1)* %p, i16 addrspace(1)* %r) #0 {
%v = load i16, i16 addrspace(1)* %p, align 1
store i16 %v, i16 addrspace(1)* %r, align 1
ret void
}
; FUNC-LABEL: {{^}}local_unaligned_load_store_i32:
-; GCN: ds_read_u8
-; GCN: ds_read_u8
-; GCN: ds_read_u8
-; GCN: ds_read_u8
-; GCN: ds_write_b8
-; GCN: ds_write_b8
-; GCN: ds_write_b8
-; GCN: ds_write_b8
-; GCN: s_endpgm
+
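+; The loaded bytes should be stored directly, without being repacked with
+; shifts and ors in between.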
+; SI: ds_read_u8
+; SI: ds_read_u8
+; SI: ds_read_u8
+; SI: ds_read_u8
+; SI-NOT: v_or
+; SI-NOT: v_lshl
+; SI: ds_write_b8
+; SI: ds_write_b8
+; SI: ds_write_b8
+; SI: ds_write_b8
+; SI: s_endpgm
define void @local_unaligned_load_store_i32(i32 addrspace(3)* %p, i32 addrspace(3)* %r) #0 {
%v = load i32, i32 addrspace(3)* %p, align 1
store i32 %v, i32 addrspace(3)* %r, align 1
ret void
}
-; FIXME: Unnecessary packing and unpacking of bytes.
; FUNC-LABEL: {{^}}local_unaligned_load_store_i64:
-; GCN: ds_read_u8
-; GCN: ds_read_u8
-; GCN: ds_read_u8
-; GCN: ds_read_u8
-; GCN: ds_read_u8
-; GCN: ds_read_u8
-; GCN: ds_read_u8
-; GCN: ds_read_u8
-
-; XGCN-NOT: v_or_b32
-; XGCN-NOT: v_lshl
-; GCN: ds_write_b8
-; XGCN-NOT: v_or_b32
-; XGCN-NOT: v_lshl
-
-; GCN: ds_write_b8
-; XGCN-NOT: v_or_b32
-; XGCN-NOT: v_lshl
-
-; GCN: ds_write_b8
-; XGCN-NOT: v_or_b32
-; XGCN-NOT: v_lshl
-
-; GCN: ds_write_b8
-; XGCN-NOT: v_or_b32
-; XGCN-NOT: v_lshl
-
-; GCN: ds_write_b8
-; XGCN-NOT: v_or_b32
-; XGCN-NOT: v_lshl
-
-; GCN: ds_write_b8
-; XGCN-NOT: v_or_b32
-; XGCN-NOT: v_lshl
-
-; GCN: ds_write_b8
-; XGCN-NOT: v_or_b32
-; XGCN-NOT: v_lshl
-; GCN: ds_write_b8
-; GCN: s_endpgm
+; SI: ds_read_u8
+; SI: ds_read_u8
+; SI: ds_read_u8
+; SI: ds_read_u8
+; SI: ds_read_u8
+; SI: ds_read_u8
+; SI: ds_read_u8
+; SI: ds_read_u8
+
+; SI-NOT: v_or_b32
+; SI-NOT: v_lshl
+; SI: ds_write_b8
+; SI-NOT: v_or_b32
+; SI-NOT: v_lshl
+
+; SI: ds_write_b8
+; SI-NOT: v_or_b32
+; SI-NOT: v_lshl
+
+; SI: ds_write_b8
+; SI-NOT: v_or_b32
+; SI-NOT: v_lshl
+
+; SI: ds_write_b8
+; SI-NOT: v_or_b32
+; SI-NOT: v_lshl
+
+; SI: ds_write_b8
+; SI-NOT: v_or_b32
+; SI-NOT: v_lshl
+
+; SI: ds_write_b8
+; SI-NOT: v_or_b32
+; SI-NOT: v_lshl
+
+; SI: ds_write_b8
+; SI-NOT: v_or_b32
+; SI-NOT: v_lshl
+; SI: ds_write_b8
+; SI: s_endpgm
define void @local_unaligned_load_store_i64(i64 addrspace(3)* %p, i64 addrspace(3)* %r) {
%v = load i64, i64 addrspace(3)* %p, align 1
store i64 %v, i64 addrspace(3)* %r, align 1
ret void
}
-; FUNC-LABEL: {{^}}local_unaligned_load_store_v2i32:
-; GCN: ds_read_u8
-; GCN: ds_read_u8
-; GCN: ds_read_u8
-; GCN: ds_read_u8
-; GCN: ds_read_u8
-; GCN: ds_read_u8
-; GCN: ds_read_u8
-; GCN: ds_read_u8
-
-; XGCN-NOT: v_or_b32
-; XGCN-NOT: v_lshl
-; GCN: ds_write_b8
-; XGCN-NOT: v_or_b32
-; XGCN-NOT: v_lshl
-
-; GCN: ds_write_b8
-; XGCN-NOT: v_or_b32
-; XGCN-NOT: v_lshl
-
-; GCN: ds_write_b8
-; XGCN-NOT: v_or_b32
-; XGCN-NOT: v_lshl
-
-; GCN: ds_write_b8
-; XGCN-NOT: v_or_b32
-; XGCN-NOT: v_lshl
-
-; GCN: ds_write_b8
-; XGCN-NOT: v_or_b32
-; XGCN-NOT: v_lshl
-
-; GCN: ds_write_b8
-; XGCN-NOT: v_or_b32
-; XGCN-NOT: v_lshl
-
-; GCN: ds_write_b8
-; XGCN-NOT: v_or_b32
-; XGCN-NOT: v_lshl
-; GCN: ds_write_b8
-; GCN: s_endpgm
+; SI-LABEL: {{^}}local_unaligned_load_store_v2i32:
+; SI: ds_read_u8
+; SI: ds_read_u8
+; SI: ds_read_u8
+; SI: ds_read_u8
+; SI: ds_read_u8
+; SI: ds_read_u8
+; SI: ds_read_u8
+; SI: ds_read_u8
+
+; SI-NOT: v_or_b32
+; SI-NOT: v_lshl
+; SI: ds_write_b8
+; SI-NOT: v_or_b32
+; SI-NOT: v_lshl
+
+; SI: ds_write_b8
+; SI-NOT: v_or_b32
+; SI-NOT: v_lshl
+
+; SI: ds_write_b8
+; SI-NOT: v_or_b32
+; SI-NOT: v_lshl
+
+; SI: ds_write_b8
+; SI-NOT: v_or_b32
+; SI-NOT: v_lshl
+
+; SI: ds_write_b8
+; SI-NOT: v_or_b32
+; SI-NOT: v_lshl
+
+; SI: ds_write_b8
+; SI-NOT: v_or_b32
+; SI-NOT: v_lshl
+
+; SI: ds_write_b8
+; SI-NOT: v_or_b32
+; SI-NOT: v_lshl
+; SI: ds_write_b8
+; SI: s_endpgm
define void @local_unaligned_load_store_v2i32(<2 x i32> addrspace(3)* %p, <2 x i32> addrspace(3)* %r) {
%v = load <2 x i32>, <2 x i32> addrspace(3)* %p, align 1
store <2 x i32> %v, <2 x i32> addrspace(3)* %r, align 1
ret void
}
-; FUNC-LABEL: {{^}}unaligned_load_store_i64_global:
-; GCN-NOHSA: buffer_load_ubyte
-; GCN-NOHSA: buffer_load_ubyte
-; GCN-NOHSA: buffer_load_ubyte
-; GCN-NOHSA: buffer_load_ubyte
-; GCN-NOHSA: buffer_load_ubyte
-; GCN-NOHSA: buffer_load_ubyte
-; GCN-NOHSA: buffer_load_ubyte
-; GCN-NOHSA: buffer_load_ubyte
+; SI-LABEL: {{^}}global_align2_load_store_i64:
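+; With 2-byte alignment, an i64 copy should use four 16-bit accesses in each
+; direction rather than byte-by-byte expansion.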
+; SI: buffer_load_ushort
+; SI: buffer_load_ushort
-; GCN-HSA: flat_load_ubyte
-; GCN-HSA: flat_load_ubyte
-; GCN-HSA: flat_load_ubyte
-; GCN-HSA: flat_load_ubyte
-; GCN-HSA: flat_load_ubyte
-; GCN-HSA: flat_load_ubyte
-; GCN-HSA: flat_load_ubyte
-; GCN-HSA: flat_load_ubyte
+; SI-NOT: v_or_
+; SI-NOT: v_lshl
-; XGCN-NOT: v_or_
-; XGCN-NOT: v_lshl
+; SI: buffer_load_ushort
-; GCN-NOHSA: buffer_store_byte
-; GCN-NOHSA: buffer_store_byte
-; GCN-NOHSA: buffer_store_byte
-; GCN-NOHSA: buffer_store_byte
-; GCN-NOHSA: buffer_store_byte
-; GCN-NOHSA: buffer_store_byte
-; GCN-NOHSA: buffer_store_byte
-; GCN-NOHSA: buffer_store_byte
+; SI-NOT: v_or_
+; SI-NOT: v_lshl
-; GCN-HSA: flat_store_byte
-; GCN-HSA: flat_store_byte
-; GCN-HSA: flat_store_byte
-; GCN-HSA: flat_store_byte
-; GCN-HSA: flat_store_byte
-; GCN-HSA: flat_store_byte
-; GCN-HSA: flat_store_byte
-; GCN-HSA: flat_store_byte
-define void @unaligned_load_store_i64_global(i64 addrspace(1)* %p, i64 addrspace(1)* %r) #0 {
+; SI: buffer_load_ushort
+
+; SI-NOT: v_or_
+; SI-NOT: v_lshl
+
+; SI: buffer_store_short
+; SI: buffer_store_short
+; SI: buffer_store_short
+; SI: buffer_store_short
+define void @global_align2_load_store_i64(i64 addrspace(1)* %p, i64 addrspace(1)* %r) {
+ %v = load i64, i64 addrspace(1)* %p, align 2
+ store i64 %v, i64 addrspace(1)* %r, align 2
+ ret void
+}
+
+; SI-LABEL: {{^}}unaligned_load_store_i64_global:
+; SI: buffer_load_ubyte
+; SI: buffer_load_ubyte
+; SI: buffer_load_ubyte
+; SI: buffer_load_ubyte
+; SI: buffer_load_ubyte
+; SI: buffer_load_ubyte
+; SI: buffer_load_ubyte
+; SI: buffer_load_ubyte
+
+; SI-NOT: v_or_
+; SI-NOT: v_lshl
+
+; SI: buffer_store_byte
+; SI: buffer_store_byte
+; SI: buffer_store_byte
+; SI: buffer_store_byte
+; SI: buffer_store_byte
+; SI: buffer_store_byte
+; SI: buffer_store_byte
+; SI: buffer_store_byte
+define void @unaligned_load_store_i64_global(i64 addrspace(1)* %p, i64 addrspace(1)* %r) {
%v = load i64, i64 addrspace(1)* %p, align 1
store i64 %v, i64 addrspace(1)* %r, align 1
ret void
}
-; FUNC-LABEL: {{^}}global_unaligned_load_store_v4i32:
-; GCN-NOHSA: buffer_load_ubyte
-; GCN-NOHSA: buffer_load_ubyte
-; GCN-NOHSA: buffer_load_ubyte
-; GCN-NOHSA: buffer_load_ubyte
-; GCN-NOHSA: buffer_load_ubyte
-; GCN-NOHSA: buffer_load_ubyte
-; GCN-NOHSA: buffer_load_ubyte
-; GCN-NOHSA: buffer_load_ubyte
-; GCN-NOHSA: buffer_load_ubyte
-; GCN-NOHSA: buffer_load_ubyte
-; GCN-NOHSA: buffer_load_ubyte
-; GCN-NOHSA: buffer_load_ubyte
-; GCN-NOHSA: buffer_load_ubyte
-; GCN-NOHSA: buffer_load_ubyte
-; GCN-NOHSA: buffer_load_ubyte
-; GCN-NOHSA: buffer_load_ubyte
-
-; GCN-NOHSA: buffer_store_byte
-; GCN-NOHSA: buffer_store_byte
-; GCN-NOHSA: buffer_store_byte
-; GCN-NOHSA: buffer_store_byte
-; GCN-NOHSA: buffer_store_byte
-; GCN-NOHSA: buffer_store_byte
-; GCN-NOHSA: buffer_store_byte
-; GCN-NOHSA: buffer_store_byte
-; GCN-NOHSA: buffer_store_byte
-; GCN-NOHSA: buffer_store_byte
-; GCN-NOHSA: buffer_store_byte
-; GCN-NOHSA: buffer_store_byte
-; GCN-NOHSA: buffer_store_byte
-; GCN-NOHSA: buffer_store_byte
-; GCN-NOHSA: buffer_store_byte
-; GCN-NOHSA: buffer_store_byte
-
-
-; GCN-HSA: flat_load_ubyte
-; GCN-HSA: flat_load_ubyte
-; GCN-HSA: flat_load_ubyte
-; GCN-HSA: flat_load_ubyte
-; GCN-HSA: flat_load_ubyte
-; GCN-HSA: flat_load_ubyte
-; GCN-HSA: flat_load_ubyte
-; GCN-HSA: flat_load_ubyte
-; GCN-HSA: flat_load_ubyte
-; GCN-HSA: flat_load_ubyte
-; GCN-HSA: flat_load_ubyte
-; GCN-HSA: flat_load_ubyte
-; GCN-HSA: flat_load_ubyte
-; GCN-HSA: flat_load_ubyte
-; GCN-HSA: flat_load_ubyte
-; GCN-HSA: flat_load_ubyte
-
-; GCN-HSA: flat_store_byte
-; GCN-HSA: flat_store_byte
-; GCN-HSA: flat_store_byte
-; GCN-HSA: flat_store_byte
-; GCN-HSA: flat_store_byte
-; GCN-HSA: flat_store_byte
-; GCN-HSA: flat_store_byte
-; GCN-HSA: flat_store_byte
-; GCN-HSA: flat_store_byte
-; GCN-HSA: flat_store_byte
-; GCN-HSA: flat_store_byte
-; GCN-HSA: flat_store_byte
-; GCN-HSA: flat_store_byte
-; GCN-HSA: flat_store_byte
-; GCN-HSA: flat_store_byte
-; GCN-HSA: flat_store_byte
-define void @global_unaligned_load_store_v4i32(<4 x i32> addrspace(1)* %p, <4 x i32> addrspace(1)* %r) #0 {
+; SI-LABEL: {{^}}global_unaligned_load_store_v4i32:
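+; A fully unaligned v4i32 copy expands to sixteen byte loads and sixteen
+; byte stores.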
+; SI: buffer_load_ubyte
+; SI: buffer_load_ubyte
+; SI: buffer_load_ubyte
+; SI: buffer_load_ubyte
+; SI: buffer_load_ubyte
+; SI: buffer_load_ubyte
+; SI: buffer_load_ubyte
+; SI: buffer_load_ubyte
+; SI: buffer_load_ubyte
+; SI: buffer_load_ubyte
+; SI: buffer_load_ubyte
+; SI: buffer_load_ubyte
+; SI: buffer_load_ubyte
+; SI: buffer_load_ubyte
+; SI: buffer_load_ubyte
+; SI: buffer_load_ubyte
+
+; SI: buffer_store_byte
+; SI: buffer_store_byte
+; SI: buffer_store_byte
+; SI: buffer_store_byte
+; SI: buffer_store_byte
+; SI: buffer_store_byte
+; SI: buffer_store_byte
+; SI: buffer_store_byte
+; SI: buffer_store_byte
+; SI: buffer_store_byte
+; SI: buffer_store_byte
+; SI: buffer_store_byte
+; SI: buffer_store_byte
+; SI: buffer_store_byte
+; SI: buffer_store_byte
+; SI: buffer_store_byte
+define void @global_unaligned_load_store_v4i32(<4 x i32> addrspace(1)* %p, <4 x i32> addrspace(1)* %r) nounwind {
%v = load <4 x i32>, <4 x i32> addrspace(1)* %p, align 1
store <4 x i32> %v, <4 x i32> addrspace(1)* %r, align 1
ret void