return castValueToType(CGF, ShuffledVal, CastTy, ElemType, Loc);
}
+static void shuffleAndStore(CodeGenFunction &CGF, Address SrcAddr,
+ Address DestAddr, QualType ElemType,
+ llvm::Value *Offset, SourceLocation Loc) {
+ CGBuilderTy &Bld = CGF.Builder;
+
+ CharUnits Size = CGF.getContext().getTypeSizeInChars(ElemType);
+ // Create the loop over the big sized data.
+ // ptr = (void*)Elem;
+ // ptrEnd = (void*) Elem + 1;
+ // Step = 8;
+ // while (ptr + Step < ptrEnd)
+ // shuffle((int64_t)*ptr);
+ // Step = 4;
+ // while (ptr + Step < ptrEnd)
+ // shuffle((int32_t)*ptr);
+ // ...
+ Address ElemPtr = DestAddr;
+ Address Ptr = SrcAddr;
+ Address PtrEnd = Bld.CreatePointerBitCastOrAddrSpaceCast(
+ Bld.CreateConstGEP(SrcAddr, 1, Size), CGF.VoidPtrTy);
+ for (int IntSize = 8; IntSize >= 1; IntSize /= 2) {
+ if (Size < CharUnits::fromQuantity(IntSize))
+ continue;
+ QualType IntType = CGF.getContext().getIntTypeForBitwidth(
+ CGF.getContext().toBits(CharUnits::fromQuantity(IntSize)),
+ /*Signed=*/1);
+ llvm::Type *IntTy = CGF.ConvertTypeForMem(IntType);
+ Ptr = Bld.CreatePointerBitCastOrAddrSpaceCast(Ptr, IntTy->getPointerTo());
+ ElemPtr =
+ Bld.CreatePointerBitCastOrAddrSpaceCast(ElemPtr, IntTy->getPointerTo());
+ if (Size.getQuantity() / IntSize > 1) {
+ llvm::BasicBlock *PreCondBB = CGF.createBasicBlock(".shuffle.pre_cond");
+ llvm::BasicBlock *ThenBB = CGF.createBasicBlock(".shuffle.then");
+ llvm::BasicBlock *ExitBB = CGF.createBasicBlock(".shuffle.exit");
+ llvm::BasicBlock *CurrentBB = Bld.GetInsertBlock();
+ CGF.EmitBlock(PreCondBB);
+ llvm::PHINode *PhiSrc =
+ Bld.CreatePHI(Ptr.getType(), /*NumReservedValues=*/2);
+ PhiSrc->addIncoming(Ptr.getPointer(), CurrentBB);
+ llvm::PHINode *PhiDest =
+ Bld.CreatePHI(ElemPtr.getType(), /*NumReservedValues=*/2);
+ PhiDest->addIncoming(ElemPtr.getPointer(), CurrentBB);
+ Ptr = Address(PhiSrc, Ptr.getAlignment());
+ ElemPtr = Address(PhiDest, ElemPtr.getAlignment());
+ llvm::Value *PtrDiff = Bld.CreatePtrDiff(
+ PtrEnd.getPointer(), Bld.CreatePointerBitCastOrAddrSpaceCast(
+ Ptr.getPointer(), CGF.VoidPtrTy));
+ Bld.CreateCondBr(Bld.CreateICmpSGT(PtrDiff, Bld.getInt64(IntSize - 1)),
+ ThenBB, ExitBB);
+ CGF.EmitBlock(ThenBB);
+ llvm::Value *Res = createRuntimeShuffleFunction(
+ CGF, CGF.EmitLoadOfScalar(Ptr, /*Volatile=*/false, IntType, Loc),
+ IntType, Offset, Loc);
+ CGF.EmitStoreOfScalar(Res, ElemPtr, /*Volatile=*/false, IntType);
+ Ptr = Bld.CreateConstGEP(Ptr, 1, CharUnits::fromQuantity(IntSize));
+ ElemPtr =
+ Bld.CreateConstGEP(ElemPtr, 1, CharUnits::fromQuantity(IntSize));
+ PhiSrc->addIncoming(Ptr.getPointer(), ThenBB);
+ PhiDest->addIncoming(ElemPtr.getPointer(), ThenBB);
+ CGF.EmitBranch(PreCondBB);
+ CGF.EmitBlock(ExitBB);
+ } else {
+ llvm::Value *Res = createRuntimeShuffleFunction(
+ CGF, CGF.EmitLoadOfScalar(Ptr, /*Volatile=*/false, IntType, Loc),
+ IntType, Offset, Loc);
+ CGF.EmitStoreOfScalar(Res, ElemPtr, /*Volatile=*/false, IntType);
+ Ptr = Bld.CreateConstGEP(Ptr, 1, CharUnits::fromQuantity(IntSize));
+ ElemPtr =
+ Bld.CreateConstGEP(ElemPtr, 1, CharUnits::fromQuantity(IntSize));
+ }
+ Size = Size % IntSize;
+ }
+}
+
namespace {
enum CopyAction : unsigned {
// RemoteLaneToThread: Copy over a Reduce list from a remote lane in
// element as this is required in all directions
SrcElementAddr = Bld.CreateElementBitCast(
SrcElementAddr, CGF.ConvertTypeForMem(Private->getType()));
- llvm::Value *Elem =
- CGF.EmitLoadOfScalar(SrcElementAddr, /*Volatile=*/false,
- Private->getType(), Private->getExprLoc());
+ DestElementAddr = Bld.CreateElementBitCast(DestElementAddr,
+ SrcElementAddr.getElementType());
// Now that all active lanes have read the element in the
// Reduce list, shuffle over the value from the remote lane.
if (ShuffleInElement) {
- Elem =
- createRuntimeShuffleFunction(CGF, Elem, Private->getType(),
- RemoteLaneOffset, Private->getExprLoc());
+ shuffleAndStore(CGF, SrcElementAddr, DestElementAddr, Private->getType(),
+ RemoteLaneOffset, Private->getExprLoc());
+ } else {
+ if (Private->getType()->isScalarType()) {
+ llvm::Value *Elem =
+ CGF.EmitLoadOfScalar(SrcElementAddr, /*Volatile=*/false,
+ Private->getType(), Private->getExprLoc());
+ // Store the source element value to the dest element address.
+ CGF.EmitStoreOfScalar(Elem, DestElementAddr, /*Volatile=*/false,
+ Private->getType());
+ } else {
+ CGF.EmitAggregateCopy(
+ CGF.MakeAddrLValue(DestElementAddr, Private->getType()),
+ CGF.MakeAddrLValue(SrcElementAddr, Private->getType()),
+ Private->getType(), AggValueSlot::DoesNotOverlap);
+ }
}
- DestElementAddr = Bld.CreateElementBitCast(DestElementAddr,
- SrcElementAddr.getElementType());
-
- // Store the source element value to the dest element address.
- CGF.EmitStoreOfScalar(Elem, DestElementAddr, /*Volatile=*/false,
- Private->getType());
-
// Step 3.1: Modify reference in dest Reduce list as needed.
// Modifying the reference in Reduce list to point to the newly
// created element. The element is live in the current function
Address(ElemPtrPtr, C.getTypeAlignInChars(Private->getType()));
ElemPtr = Bld.CreateElementBitCast(
ElemPtr, CGF.ConvertTypeForMem(Private->getType()));
- // elem = *elemptr
- llvm::Value *Elem = CGF.EmitLoadOfScalar(
- ElemPtr, /*Volatile=*/false, Private->getType(), SourceLocation());
// Get pointer to location in transfer medium.
// MediumPtr = &medium[warp_id]
MediumPtr = Bld.CreateElementBitCast(
MediumPtr, CGF.ConvertTypeForMem(Private->getType()));
+ // elem = *elemptr
//*MediumPtr = elem
- Bld.CreateStore(Elem, MediumPtr);
+ if (Private->getType()->isScalarType()) {
+ llvm::Value *Elem = CGF.EmitLoadOfScalar(ElemPtr, /*Volatile=*/false,
+ Private->getType(), Loc);
+ // Store the source element value to the dest element address.
+ CGF.EmitStoreOfScalar(Elem, MediumPtr, /*Volatile=*/false,
+ Private->getType());
+ } else {
+ CGF.EmitAggregateCopy(CGF.MakeAddrLValue(ElemPtr, Private->getType()),
+ CGF.MakeAddrLValue(MediumPtr, Private->getType()),
+ Private->getType(), AggValueSlot::DoesNotOverlap);
+ }
Bld.CreateBr(MergeBB);
// SrcMediumVal = *SrcMediumPtr;
SrcMediumPtr = Bld.CreateElementBitCast(
SrcMediumPtr, CGF.ConvertTypeForMem(Private->getType()));
- llvm::Value *SrcMediumValue = CGF.EmitLoadOfScalar(
- SrcMediumPtr, /*Volatile=*/false, Private->getType(), SourceLocation());
// TargetElemPtr = (type[i]*)(SrcDataAddr[i])
Address TargetElemPtrPtr =
TargetElemPtr, CGF.ConvertTypeForMem(Private->getType()));
// *TargetElemPtr = SrcMediumVal;
- CGF.EmitStoreOfScalar(SrcMediumValue, TargetElemPtr, /*Volatile=*/false,
- Private->getType());
+ if (Private->getType()->isScalarType()) {
+ llvm::Value *SrcMediumValue = CGF.EmitLoadOfScalar(
+ SrcMediumPtr, /*Volatile=*/false, Private->getType(), Loc);
+ CGF.EmitStoreOfScalar(SrcMediumValue, TargetElemPtr, /*Volatile=*/false,
+ Private->getType());
+ } else {
+ CGF.EmitAggregateCopy(
+ CGF.MakeAddrLValue(SrcMediumPtr, Private->getType()),
+ CGF.MakeAddrLValue(TargetElemPtr, Private->getType()),
+ Private->getType(), AggValueSlot::DoesNotOverlap);
+ }
Bld.CreateBr(W0MergeBB);
CGF.EmitBlock(W0ElseBB);
// CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
// CHECK: [[REMOTE_ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[REMOTE_RED_LIST:%.+]], i{{32|64}} 0, i{{32|64}} 0
// CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to double*
- // CHECK: [[ELT_VAL:%.+]] = load double, double* [[ELT]], align
//
- // CHECK: [[ELT_CAST:%.+]] = bitcast double [[ELT_VAL]] to i64
+ // CHECK: [[ELT_CAST:%.+]] = bitcast double* [[ELT]] to i64*
+ // CHECK: [[REMOTE_ELT_CAST:%.+]] = bitcast double* [[REMOTE_ELT]] to i64*
+ // CHECK: [[ELT_VAL:%.+]] = load i64, i64* [[ELT_CAST]], align
// CHECK: [[WS32:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
// CHECK: [[WS:%.+]] = trunc i32 [[WS32]] to i16
- // CHECK: [[REMOTE_ELT_VAL64:%.+]] = call i64 @__kmpc_shuffle_int64(i64 [[ELT_CAST]], i16 [[LANEOFFSET]], i16 [[WS]])
- // CHECK: [[REMOTE_ELT_VAL:%.+]] = bitcast i64 [[REMOTE_ELT_VAL64]] to double
+ // CHECK: [[REMOTE_ELT_VAL64:%.+]] = call i64 @__kmpc_shuffle_int64(i64 [[ELT_VAL]], i16 [[LANEOFFSET]], i16 [[WS]])
//
- // CHECK: store double [[REMOTE_ELT_VAL]], double* [[REMOTE_ELT]], align
+ // CHECK: store i64 [[REMOTE_ELT_VAL64]], i64* [[REMOTE_ELT_CAST]], align
// CHECK: [[REMOTE_ELT_VOID:%.+]] = bitcast double* [[REMOTE_ELT]] to i8*
// CHECK: store i8* [[REMOTE_ELT_VOID]], i8** [[REMOTE_ELT_REF]], align
//
// CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST]], i{{32|64}} 0, i{{32|64}} 0
// CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
// CHECK: [[REMOTE_ELT:%.+]] = bitcast i8* [[REMOTE_ELT_VOID]] to double*
- // CHECK: [[REMOTE_ELT_VAL:%.+]] = load double, double* [[REMOTE_ELT]], align
// CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to double*
+ // CHECK: [[REMOTE_ELT_VAL:%.+]] = load double, double* [[REMOTE_ELT]], align
// CHECK: store double [[REMOTE_ELT_VAL]], double* [[ELT]], align
// CHECK: br label {{%?}}[[COPY_CONT:.+]]
//
// CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST]], i{{32|64}} 0, i{{32|64}} 0
// CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
// CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to double*
- // CHECK: [[ELT_VAL:%.+]] = load double, double* [[ELT]], align
//
// CHECK: [[MEDIUM_ELT64:%.+]] = getelementptr inbounds [32 x i64], [32 x i64] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[WARPID]]
// CHECK: [[MEDIUM_ELT:%.+]] = bitcast i64 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT64]] to double addrspace([[SHARED_ADDRSPACE]])*
+ // CHECK: [[ELT_VAL:%.+]] = load double, double* [[ELT]], align
// CHECK: store double [[ELT_VAL]], double addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], align
// CHECK: br label {{%?}}[[COPY_CONT:.+]]
//
// CHECK: [[DO_READ]]
// CHECK: [[MEDIUM_ELT64:%.+]] = getelementptr inbounds [32 x i64], [32 x i64] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[TID]]
// CHECK: [[MEDIUM_ELT:%.+]] = bitcast i64 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT64]] to double addrspace([[SHARED_ADDRSPACE]])*
- // CHECK: [[MEDIUM_ELT_VAL:%.+]] = load double, double addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], align
// CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST:%.+]], i{{32|64}} 0, i{{32|64}} 0
// CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
// CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to double*
+ // CHECK: [[MEDIUM_ELT_VAL:%.+]] = load double, double addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], align
// CHECK: store double [[MEDIUM_ELT_VAL]], double* [[ELT]], align
// CHECK: br label {{%?}}[[READ_CONT:.+]]
//
// CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
// CHECK: [[REMOTE_ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[REMOTE_RED_LIST]], i{{32|64}} 0, i{{32|64}} 1
// CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to float*
- // CHECK: [[ELT_VAL:%.+]] = load float, float* [[ELT]], align
//
- // CHECK: [[ELT_CAST:%.+]] = bitcast float [[ELT_VAL]] to i32
+ // CHECK: [[ELT_CAST:%.+]] = bitcast float* [[ELT]] to i32*
+ // CHECK: [[REMOTE_ELT2_CAST:%.+]] = bitcast float* [[REMOTE_ELT2]] to i32*
+ // CHECK: [[ELT_VAL:%.+]] = load i32, i32* [[ELT_CAST]], align
// CHECK: [[WS32:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
// CHECK: [[WS:%.+]] = trunc i32 [[WS32]] to i16
- // CHECK: [[REMOTE_ELT2_VAL32:%.+]] = call i32 @__kmpc_shuffle_int32(i32 [[ELT_CAST]], i16 [[LANEOFFSET]], i16 [[WS]])
- // CHECK: [[REMOTE_ELT2_VAL:%.+]] = bitcast i32 [[REMOTE_ELT2_VAL32]] to float
+ // CHECK: [[REMOTE_ELT2_VAL32:%.+]] = call i32 @__kmpc_shuffle_int32(i32 [[ELT_VAL]], i16 [[LANEOFFSET]], i16 [[WS]])
//
- // CHECK: store float [[REMOTE_ELT2_VAL]], float* [[REMOTE_ELT2]], align
+ // CHECK: store i32 [[REMOTE_ELT2_VAL32]], i32* [[REMOTE_ELT2_CAST]], align
// CHECK: [[REMOTE_ELT2C:%.+]] = bitcast float* [[REMOTE_ELT2]] to i8*
// CHECK: store i8* [[REMOTE_ELT2C]], i8** [[REMOTE_ELT_REF]], align
//
// CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST]], i{{32|64}} 0, i{{32|64}} 1
// CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
// CHECK: [[REMOTE_ELT:%.+]] = bitcast i8* [[REMOTE_ELT_VOID]] to float*
- // CHECK: [[REMOTE_ELT_VAL:%.+]] = load float, float* [[REMOTE_ELT]], align
// CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to float*
+ // CHECK: [[REMOTE_ELT_VAL:%.+]] = load float, float* [[REMOTE_ELT]], align
// CHECK: store float [[REMOTE_ELT_VAL]], float* [[ELT]], align
// CHECK: br label {{%?}}[[COPY_CONT:.+]]
//
// [[DO_COPY]]
// CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST]], i{{32|64}} 0, i{{32|64}} 0
// CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
- // CHECK: [[ELT_VAL:%.+]] = load i8, i8* [[ELT_VOID]], align
//
// CHECK: [[MEDIUM_ELT64:%.+]] = getelementptr inbounds [32 x i64], [32 x i64] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[WARPID]]
// CHECK: [[MEDIUM_ELT:%.+]] = bitcast i64 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT64]] to i8 addrspace([[SHARED_ADDRSPACE]])*
+ // CHECK: [[ELT_VAL:%.+]] = load i8, i8* [[ELT_VOID]], align
// CHECK: store i8 [[ELT_VAL]], i8 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], align
// CHECK: br label {{%?}}[[COPY_CONT:.+]]
//
// CHECK: [[DO_READ]]
// CHECK: [[MEDIUM_ELT64:%.+]] = getelementptr inbounds [32 x i64], [32 x i64] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[TID]]
// CHECK: [[MEDIUM_ELT:%.+]] = bitcast i64 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT64]] to i8 addrspace([[SHARED_ADDRSPACE]])*
- // CHECK: [[MEDIUM_ELT_VAL:%.+]] = load i8, i8 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], align
// CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST:%.+]], i{{32|64}} 0, i{{32|64}} 0
// CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
+ // CHECK: [[MEDIUM_ELT_VAL:%.+]] = load i8, i8 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], align
// CHECK: store i8 [[MEDIUM_ELT_VAL]], i8* [[ELT_VOID]], align
// CHECK: br label {{%?}}[[READ_CONT:.+]]
//
// CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST]], i{{32|64}} 0, i{{32|64}} 1
// CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
// CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to float*
- // CHECK: [[ELT_VAL:%.+]] = load float, float* [[ELT]], align
//
// CHECK: [[MEDIUM_ELT64:%.+]] = getelementptr inbounds [32 x i64], [32 x i64] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[WARPID]]
// CHECK: [[MEDIUM_ELT:%.+]] = bitcast i64 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT64]] to float addrspace([[SHARED_ADDRSPACE]])*
+ // CHECK: [[ELT_VAL:%.+]] = load float, float* [[ELT]], align
// CHECK: store float [[ELT_VAL]], float addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], align
// CHECK: br label {{%?}}[[COPY_CONT:.+]]
//
// CHECK: [[DO_READ]]
// CHECK: [[MEDIUM_ELT64:%.+]] = getelementptr inbounds [32 x i64], [32 x i64] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[TID]]
// CHECK: [[MEDIUM_ELT:%.+]] = bitcast i64 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT64]] to float addrspace([[SHARED_ADDRSPACE]])*
- // CHECK: [[MEDIUM_ELT_VAL:%.+]] = load float, float addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], align
// CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST:%.+]], i{{32|64}} 0, i{{32|64}} 1
// CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
// CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to float*
+ // CHECK: [[MEDIUM_ELT_VAL:%.+]] = load float, float addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], align
// CHECK: store float [[MEDIUM_ELT_VAL]], float* [[ELT]], align
// CHECK: br label {{%?}}[[READ_CONT:.+]]
//
// CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST]], i{{32|64}} 0, i{{32|64}} 0
// CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
// CHECK: [[REMOTE_ELT:%.+]] = bitcast i8* [[REMOTE_ELT_VOID]] to i32*
- // CHECK: [[REMOTE_ELT_VAL:%.+]] = load i32, i32* [[REMOTE_ELT]], align
// CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i32*
+ // CHECK: [[REMOTE_ELT_VAL:%.+]] = load i32, i32* [[REMOTE_ELT]], align
// CHECK: store i32 [[REMOTE_ELT_VAL]], i32* [[ELT]], align
//
// CHECK: [[REMOTE_ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[REMOTE_RED_LIST]], i{{32|64}} 0, i{{32|64}} 1
// CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST]], i{{32|64}} 0, i{{32|64}} 1
// CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
// CHECK: [[REMOTE_ELT:%.+]] = bitcast i8* [[REMOTE_ELT_VOID]] to i16*
- // CHECK: [[REMOTE_ELT_VAL:%.+]] = load i16, i16* [[REMOTE_ELT]], align
// CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i16*
+ // CHECK: [[REMOTE_ELT_VAL:%.+]] = load i16, i16* [[REMOTE_ELT]], align
// CHECK: store i16 [[REMOTE_ELT_VAL]], i16* [[ELT]], align
// CHECK: br label {{%?}}[[COPY_CONT:.+]]
//
// CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST]], i{{32|64}} 0, i{{32|64}} 0
// CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
// CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i32*
- // CHECK: [[ELT_VAL:%.+]] = load i32, i32* [[ELT]], align
//
// CHECK: [[MEDIUM_ELT64:%.+]] = getelementptr inbounds [32 x i64], [32 x i64] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[WARPID]]
// CHECK: [[MEDIUM_ELT:%.+]] = bitcast i64 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT64]] to i32 addrspace([[SHARED_ADDRSPACE]])*
+ // CHECK: [[ELT_VAL:%.+]] = load i32, i32* [[ELT]], align
// CHECK: store i32 [[ELT_VAL]], i32 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], align
// CHECK: br label {{%?}}[[COPY_CONT:.+]]
//
// CHECK: [[DO_READ]]
// CHECK: [[MEDIUM_ELT64:%.+]] = getelementptr inbounds [32 x i64], [32 x i64] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[TID]]
// CHECK: [[MEDIUM_ELT:%.+]] = bitcast i64 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT64]] to i32 addrspace([[SHARED_ADDRSPACE]])*
- // CHECK: [[MEDIUM_ELT_VAL:%.+]] = load i32, i32 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], align
// CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST:%.+]], i{{32|64}} 0, i{{32|64}} 0
// CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
// CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i32*
+ // CHECK: [[MEDIUM_ELT_VAL:%.+]] = load i32, i32 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], align
// CHECK: store i32 [[MEDIUM_ELT_VAL]], i32* [[ELT]], align
// CHECK: br label {{%?}}[[READ_CONT:.+]]
//
// CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST]], i{{32|64}} 0, i{{32|64}} 1
// CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
// CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i16*
- // CHECK: [[ELT_VAL:%.+]] = load i16, i16* [[ELT]], align
//
// CHECK: [[MEDIUM_ELT64:%.+]] = getelementptr inbounds [32 x i64], [32 x i64] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[WARPID]]
// CHECK: [[MEDIUM_ELT:%.+]] = bitcast i64 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT64]] to i16 addrspace([[SHARED_ADDRSPACE]])*
+ // CHECK: [[ELT_VAL:%.+]] = load i16, i16* [[ELT]], align
// CHECK: store i16 [[ELT_VAL]], i16 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], align
// CHECK: br label {{%?}}[[COPY_CONT:.+]]
//
// CHECK: [[DO_READ]]
// CHECK: [[MEDIUM_ELT64:%.+]] = getelementptr inbounds [32 x i64], [32 x i64] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[TID]]
// CHECK: [[MEDIUM_ELT:%.+]] = bitcast i64 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT64]] to i16 addrspace([[SHARED_ADDRSPACE]])*
- // CHECK: [[MEDIUM_ELT_VAL:%.+]] = load i16, i16 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], align
// CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST:%.+]], i{{32|64}} 0, i{{32|64}} 1
// CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
// CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i16*
+ // CHECK: [[MEDIUM_ELT_VAL:%.+]] = load i16, i16 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], align
// CHECK: store i16 [[MEDIUM_ELT_VAL]], i16* [[ELT]], align
// CHECK: br label {{%?}}[[READ_CONT:.+]]
//
// CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
// CHECK: [[REMOTE_ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[REMOTE_RED_LIST:%.+]], i[[SZ]] 0, i[[SZ]] 0
// CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to double*
- // CHECK: [[ELT_VAL:%.+]] = load double, double* [[ELT]], align
//
- // CHECK: [[ELT_CAST:%.+]] = bitcast double [[ELT_VAL]] to i64
+ // CHECK: [[ELT_CAST:%.+]] = bitcast double* [[ELT]] to i64*
+ // CHECK: [[REMOTE_ELT_CAST:%.+]] = bitcast double* [[REMOTE_ELT]] to i64*
+ // CHECK: [[ELT_VAL:%.+]] = load i64, i64* [[ELT_CAST]], align
// CHECK: [[WS32:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
// CHECK: [[WS:%.+]] = trunc i32 [[WS32]] to i16
- // CHECK: [[REMOTE_ELT_VAL64:%.+]] = call i64 @__kmpc_shuffle_int64(i64 [[ELT_CAST]], i16 [[LANEOFFSET]], i16 [[WS]])
- // CHECK: [[REMOTE_ELT_VAL:%.+]] = bitcast i64 [[REMOTE_ELT_VAL64]] to double
+ // CHECK: [[REMOTE_ELT_VAL64:%.+]] = call i64 @__kmpc_shuffle_int64(i64 [[ELT_VAL]], i16 [[LANEOFFSET]], i16 [[WS]])
//
- // CHECK: store double [[REMOTE_ELT_VAL]], double* [[REMOTE_ELT]], align
+ // CHECK: store i64 [[REMOTE_ELT_VAL64]], i64* [[REMOTE_ELT_CAST]], align
// CHECK: [[REMOTE_ELT_VOID:%.+]] = bitcast double* [[REMOTE_ELT]] to i8*
// CHECK: store i8* [[REMOTE_ELT_VOID]], i8** [[REMOTE_ELT_REF]], align
//
// CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST]], i[[SZ]] 0, i[[SZ]] 0
// CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
// CHECK: [[REMOTE_ELT:%.+]] = bitcast i8* [[REMOTE_ELT_VOID]] to double*
- // CHECK: [[REMOTE_ELT_VAL:%.+]] = load double, double* [[REMOTE_ELT]], align
// CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to double*
+ // CHECK: [[REMOTE_ELT_VAL:%.+]] = load double, double* [[REMOTE_ELT]], align
// CHECK: store double [[REMOTE_ELT_VAL]], double* [[ELT]], align
// CHECK: br label {{%?}}[[COPY_CONT:.+]]
//
// CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST]], i[[SZ]] 0, i[[SZ]] 0
// CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
// CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to double*
- // CHECK: [[ELT_VAL:%.+]] = load double, double* [[ELT]], align
//
// CHECK: [[MEDIUM_ELT64:%.+]] = getelementptr inbounds [32 x i64], [32 x i64] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[WARPID]]
// CHECK: [[MEDIUM_ELT:%.+]] = bitcast i64 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT64]] to double addrspace([[SHARED_ADDRSPACE]])*
+ // CHECK: [[ELT_VAL:%.+]] = load double, double* [[ELT]], align
// CHECK: store double [[ELT_VAL]], double addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], align
// CHECK: br label {{%?}}[[COPY_CONT:.+]]
//
// CHECK: [[DO_READ]]
// CHECK: [[MEDIUM_ELT64:%.+]] = getelementptr inbounds [32 x i64], [32 x i64] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[TID]]
// CHECK: [[MEDIUM_ELT:%.+]] = bitcast i64 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT64]] to double addrspace([[SHARED_ADDRSPACE]])*
- // CHECK: [[MEDIUM_ELT_VAL:%.+]] = load double, double addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], align
// CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST:%.+]], i[[SZ]] 0, i[[SZ]] 0
// CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
// CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to double*
+ // CHECK: [[MEDIUM_ELT_VAL:%.+]] = load double, double addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], align
// CHECK: store double [[MEDIUM_ELT_VAL]], double* [[ELT]], align
// CHECK: br label {{%?}}[[READ_CONT:.+]]
//
// CHECK: [[SCRATCHPAD_ELT_PTR64:%.+]] = add nuw i[[SZ]] [[SCRATCHPAD]], [[P]]
// CHECK: [[SCRATCHPAD_ELT_PTR_VOID:%.+]] = inttoptr i[[SZ]] [[SCRATCHPAD_ELT_PTR64]] to i8*
// CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to double*
- // CHECK: [[ELT_VAL:%.+]] = load double, double* [[ELT]], align
// CHECK: [[SCRATCHPAD_ELT_PTR:%.+]] = bitcast i8* [[SCRATCHPAD_ELT_PTR_VOID]] to double*
+ // CHECK: [[ELT_VAL:%.+]] = load double, double* [[ELT]], align
// CHECK: store double [[ELT_VAL]], double* [[SCRATCHPAD_ELT_PTR]], align
//
// CHECK: ret
// CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST:%.+]], i[[SZ]] 0, i[[SZ]] 0
// CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
// CHECK: [[REMOTE_ELT:%.+]] = bitcast i8* [[REMOTE_ELT_VOID]] to double*
- // CHECK: [[REMOTE_ELT_VAL:%.+]] = load double, double* [[REMOTE_ELT]], align
// CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to double*
+ // CHECK: [[REMOTE_ELT_VAL:%.+]] = load double, double* [[REMOTE_ELT]], align
// CHECK: store double [[REMOTE_ELT_VAL]], double* [[ELT]], align
// CHECK: br label {{%?}}[[REDUCE_CONT]]
//
// CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
// CHECK: [[REMOTE_ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[REMOTE_RED_LIST]], i[[SZ]] 0, i[[SZ]] 1
// CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to float*
- // CHECK: [[ELT_VAL:%.+]] = load float, float* [[ELT]], align
//
- // CHECK: [[ELT_CAST:%.+]] = bitcast float [[ELT_VAL]] to i32
+ // CHECK: [[ELT_CAST:%.+]] = bitcast float* [[ELT]] to i32*
+ // CHECK: [[REMOTE_ELT2_CAST:%.+]] = bitcast float* [[REMOTE_ELT2]] to i32*
+ // CHECK: [[ELT_VAL:%.+]] = load i32, i32* [[ELT_CAST]], align
// CHECK: [[WS32:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
// CHECK: [[WS:%.+]] = trunc i32 [[WS32]] to i16
- // CHECK: [[REMOTE_ELT2_VAL32:%.+]] = call i32 @__kmpc_shuffle_int32(i32 [[ELT_CAST]], i16 [[LANEOFFSET]], i16 [[WS]])
- // CHECK: [[REMOTE_ELT2_VAL:%.+]] = bitcast i32 [[REMOTE_ELT2_VAL32]] to float
+ // CHECK: [[REMOTE_ELT2_VAL32:%.+]] = call i32 @__kmpc_shuffle_int32(i32 [[ELT_VAL]], i16 [[LANEOFFSET]], i16 [[WS]])
//
- // CHECK: store float [[REMOTE_ELT2_VAL]], float* [[REMOTE_ELT2]], align
+ // CHECK: store i32 [[REMOTE_ELT2_VAL32]], i32* [[REMOTE_ELT2_CAST]], align
// CHECK: [[REMOTE_ELT2C:%.+]] = bitcast float* [[REMOTE_ELT2]] to i8*
// CHECK: store i8* [[REMOTE_ELT2C]], i8** [[REMOTE_ELT_REF]], align
//
// CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST]], i[[SZ]] 0, i[[SZ]] 1
// CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
// CHECK: [[REMOTE_ELT:%.+]] = bitcast i8* [[REMOTE_ELT_VOID]] to float*
- // CHECK: [[REMOTE_ELT_VAL:%.+]] = load float, float* [[REMOTE_ELT]], align
// CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to float*
+ // CHECK: [[REMOTE_ELT_VAL:%.+]] = load float, float* [[REMOTE_ELT]], align
// CHECK: store float [[REMOTE_ELT_VAL]], float* [[ELT]], align
// CHECK: br label {{%?}}[[COPY_CONT:.+]]
//
// [[DO_COPY]]
// CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST]], i[[SZ]] 0, i[[SZ]] 0
// CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
- // CHECK: [[ELT_VAL:%.+]] = load i8, i8* [[ELT_VOID]], align
//
// CHECK: [[MEDIUM_ELT64:%.+]] = getelementptr inbounds [32 x i64], [32 x i64] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[WARPID]]
// CHECK: [[MEDIUM_ELT:%.+]] = bitcast i64 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT64]] to i8 addrspace([[SHARED_ADDRSPACE]])*
+ // CHECK: [[ELT_VAL:%.+]] = load i8, i8* [[ELT_VOID]], align
// CHECK: store i8 [[ELT_VAL]], i8 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], align
// CHECK: br label {{%?}}[[COPY_CONT:.+]]
//
// CHECK: [[DO_READ]]
// CHECK: [[MEDIUM_ELT64:%.+]] = getelementptr inbounds [32 x i64], [32 x i64] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[TID]]
// CHECK: [[MEDIUM_ELT:%.+]] = bitcast i64 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT64]] to i8 addrspace([[SHARED_ADDRSPACE]])*
- // CHECK: [[MEDIUM_ELT_VAL:%.+]] = load i8, i8 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], align
// CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST:%.+]], i[[SZ]] 0, i[[SZ]] 0
// CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
+ // CHECK: [[MEDIUM_ELT_VAL:%.+]] = load i8, i8 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], align
// CHECK: store i8 [[MEDIUM_ELT_VAL]], i8* [[ELT_VOID]], align
// CHECK: br label {{%?}}[[READ_CONT:.+]]
//
// CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST]], i[[SZ]] 0, i[[SZ]] 1
// CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
// CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to float*
- // CHECK: [[ELT_VAL:%.+]] = load float, float* [[ELT]], align
//
// CHECK: [[MEDIUM_ELT64:%.+]] = getelementptr inbounds [32 x i64], [32 x i64] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[WARPID]]
// CHECK: [[MEDIUM_ELT:%.+]] = bitcast i64 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT64]] to float addrspace([[SHARED_ADDRSPACE]])*
+ // CHECK: [[ELT_VAL:%.+]] = load float, float* [[ELT]], align
// CHECK: store float [[ELT_VAL]], float addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], align
// CHECK: br label {{%?}}[[COPY_CONT:.+]]
//
// CHECK: [[DO_READ]]
// CHECK: [[MEDIUM_ELT64:%.+]] = getelementptr inbounds [32 x i64], [32 x i64] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[TID]]
// CHECK: [[MEDIUM_ELT:%.+]] = bitcast i64 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT64]] to float addrspace([[SHARED_ADDRSPACE]])*
- // CHECK: [[MEDIUM_ELT_VAL:%.+]] = load float, float addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], align
// CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST:%.+]], i[[SZ]] 0, i[[SZ]] 1
// CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
// CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to float*
+ // CHECK: [[MEDIUM_ELT_VAL:%.+]] = load float, float addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], align
// CHECK: store float [[MEDIUM_ELT_VAL]], float* [[ELT]], align
// CHECK: br label {{%?}}[[READ_CONT:.+]]
//
// CHECK: [[SCRATCHPAD_ELT_PTR64:%.+]] = add nuw i[[SZ]] [[SCRATCHPAD_NEXT]], [[P]]
// CHECK: [[SCRATCHPAD_ELT_PTR_VOID:%.+]] = inttoptr i[[SZ]] [[SCRATCHPAD_ELT_PTR64]] to i8*
// CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to float*
- // CHECK: [[ELT_VAL:%.+]] = load float, float* [[ELT]], align
// CHECK: [[SCRATCHPAD_ELT_PTR:%.+]] = bitcast i8* [[SCRATCHPAD_ELT_PTR_VOID]] to float*
+ // CHECK: [[ELT_VAL:%.+]] = load float, float* [[ELT]], align
// CHECK: store float [[ELT_VAL]], float* [[SCRATCHPAD_ELT_PTR]], align
//
// CHECK: ret
// CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST:%.+]], i[[SZ]] 0, i[[SZ]] 1
// CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
// CHECK: [[REMOTE_ELT:%.+]] = bitcast i8* [[REMOTE_ELT_VOID]] to float*
- // CHECK: [[REMOTE_ELT_VAL:%.+]] = load float, float* [[REMOTE_ELT]], align
// CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to float*
+ // CHECK: [[REMOTE_ELT_VAL:%.+]] = load float, float* [[REMOTE_ELT]], align
// CHECK: store float [[REMOTE_ELT_VAL]], float* [[ELT]], align
// CHECK: br label {{%?}}[[REDUCE_CONT]]
//
// CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST]], i[[SZ]] 0, i[[SZ]] 0
// CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
// CHECK: [[REMOTE_ELT:%.+]] = bitcast i8* [[REMOTE_ELT_VOID]] to i32*
- // CHECK: [[REMOTE_ELT_VAL:%.+]] = load i32, i32* [[REMOTE_ELT]], align
// CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i32*
+ // CHECK: [[REMOTE_ELT_VAL:%.+]] = load i32, i32* [[REMOTE_ELT]], align
// CHECK: store i32 [[REMOTE_ELT_VAL]], i32* [[ELT]], align
//
// CHECK: [[REMOTE_ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[REMOTE_RED_LIST]], i[[SZ]] 0, i[[SZ]] 1
// CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST]], i[[SZ]] 0, i[[SZ]] 1
// CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
// CHECK: [[REMOTE_ELT:%.+]] = bitcast i8* [[REMOTE_ELT_VOID]] to i16*
- // CHECK: [[REMOTE_ELT_VAL:%.+]] = load i16, i16* [[REMOTE_ELT]], align
// CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i16*
+ // CHECK: [[REMOTE_ELT_VAL:%.+]] = load i16, i16* [[REMOTE_ELT]], align
// CHECK: store i16 [[REMOTE_ELT_VAL]], i16* [[ELT]], align
// CHECK: br label {{%?}}[[COPY_CONT:.+]]
//
// CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST]], i[[SZ]] 0, i[[SZ]] 0
// CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
// CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i32*
- // CHECK: [[ELT_VAL:%.+]] = load i32, i32* [[ELT]], align
//
// CHECK: [[MEDIUM_ELT64:%.+]] = getelementptr inbounds [32 x i64], [32 x i64] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[WARPID]]
// CHECK: [[MEDIUM_ELT:%.+]] = bitcast i64 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT64]] to i32 addrspace([[SHARED_ADDRSPACE]])*
+ // CHECK: [[ELT_VAL:%.+]] = load i32, i32* [[ELT]], align
// CHECK: store i32 [[ELT_VAL]], i32 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], align
// CHECK: br label {{%?}}[[COPY_CONT:.+]]
//
// CHECK: [[DO_READ]]
// CHECK: [[MEDIUM_ELT64:%.+]] = getelementptr inbounds [32 x i64], [32 x i64] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[TID]]
// CHECK: [[MEDIUM_ELT:%.+]] = bitcast i64 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT64]] to i32 addrspace([[SHARED_ADDRSPACE]])*
- // CHECK: [[MEDIUM_ELT_VAL:%.+]] = load i32, i32 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], align
// CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST:%.+]], i[[SZ]] 0, i[[SZ]] 0
// CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
// CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i32*
+ // CHECK: [[MEDIUM_ELT_VAL:%.+]] = load i32, i32 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], align
// CHECK: store i32 [[MEDIUM_ELT_VAL]], i32* [[ELT]], align
// CHECK: br label {{%?}}[[READ_CONT:.+]]
//
// CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST]], i[[SZ]] 0, i[[SZ]] 1
// CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
// CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i16*
- // CHECK: [[ELT_VAL:%.+]] = load i16, i16* [[ELT]], align
//
// CHECK: [[MEDIUM_ELT64:%.+]] = getelementptr inbounds [32 x i64], [32 x i64] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[WARPID]]
// CHECK: [[MEDIUM_ELT:%.+]] = bitcast i64 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT64]] to i16 addrspace([[SHARED_ADDRSPACE]])*
+ // CHECK: [[ELT_VAL:%.+]] = load i16, i16* [[ELT]], align
// CHECK: store i16 [[ELT_VAL]], i16 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], align
// CHECK: br label {{%?}}[[COPY_CONT:.+]]
//
// CHECK: [[DO_READ]]
// CHECK: [[MEDIUM_ELT64:%.+]] = getelementptr inbounds [32 x i64], [32 x i64] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[TID]]
// CHECK: [[MEDIUM_ELT:%.+]] = bitcast i64 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT64]] to i16 addrspace([[SHARED_ADDRSPACE]])*
- // CHECK: [[MEDIUM_ELT_VAL:%.+]] = load i16, i16 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], align
// CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST:%.+]], i[[SZ]] 0, i[[SZ]] 1
// CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
// CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i16*
+ // CHECK: [[MEDIUM_ELT_VAL:%.+]] = load i16, i16 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], align
// CHECK: store i16 [[MEDIUM_ELT_VAL]], i16* [[ELT]], align
// CHECK: br label {{%?}}[[READ_CONT:.+]]
//
// CHECK: [[SCRATCHPAD_ELT_PTR64:%.+]] = add nuw i[[SZ]] [[SCRATCHPAD]], [[P]]
// CHECK: [[SCRATCHPAD_ELT_PTR_VOID:%.+]] = inttoptr i[[SZ]] [[SCRATCHPAD_ELT_PTR64]] to i8*
// CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i32*
- // CHECK: [[ELT_VAL:%.+]] = load i32, i32* [[ELT]], align
// CHECK: [[SCRATCHPAD_ELT_PTR:%.+]] = bitcast i8* [[SCRATCHPAD_ELT_PTR_VOID]] to i32*
+ // CHECK: [[ELT_VAL:%.+]] = load i32, i32* [[ELT]], align
// CHECK: store i32 [[ELT_VAL]], i32* [[SCRATCHPAD_ELT_PTR]], align
//
// CHECK: [[OF:%.+]] = mul nuw i[[SZ]] [[NUM_TEAMS]], 4
// CHECK: [[SCRATCHPAD_ELT_PTR64:%.+]] = add nuw i[[SZ]] [[SCRATCHPAD_NEXT]], [[P]]
// CHECK: [[SCRATCHPAD_ELT_PTR_VOID:%.+]] = inttoptr i[[SZ]] [[SCRATCHPAD_ELT_PTR64]] to i8*
// CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i16*
- // CHECK: [[ELT_VAL:%.+]] = load i16, i16* [[ELT]], align
// CHECK: [[SCRATCHPAD_ELT_PTR:%.+]] = bitcast i8* [[SCRATCHPAD_ELT_PTR_VOID]] to i16*
+ // CHECK: [[ELT_VAL:%.+]] = load i16, i16* [[ELT]], align
// CHECK: store i16 [[ELT_VAL]], i16* [[SCRATCHPAD_ELT_PTR]], align
//
// CHECK: ret
// CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST:%.+]], i[[SZ]] 0, i[[SZ]] 0
// CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
// CHECK: [[REMOTE_ELT:%.+]] = bitcast i8* [[REMOTE_ELT_VOID]] to i32*
- // CHECK: [[REMOTE_ELT_VAL:%.+]] = load i32, i32* [[REMOTE_ELT]], align
// CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i32*
+ // CHECK: [[REMOTE_ELT_VAL:%.+]] = load i32, i32* [[REMOTE_ELT]], align
// CHECK: store i32 [[REMOTE_ELT_VAL]], i32* [[ELT]], align
//
// CHECK: [[REMOTE_ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[REMOTE_RED_LIST:%.+]], i[[SZ]] 0, i[[SZ]] 1
// CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST:%.+]], i[[SZ]] 0, i[[SZ]] 1
// CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
// CHECK: [[REMOTE_ELT:%.+]] = bitcast i8* [[REMOTE_ELT_VOID]] to i16*
- // CHECK: [[REMOTE_ELT_VAL:%.+]] = load i16, i16* [[REMOTE_ELT]], align
// CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i16*
+ // CHECK: [[REMOTE_ELT_VAL:%.+]] = load i16, i16* [[REMOTE_ELT]], align
// CHECK: store i16 [[REMOTE_ELT_VAL]], i16* [[ELT]], align
// CHECK: br label {{%?}}[[REDUCE_CONT]]
//