From 9eda3abe7e183b05834947391c0cdc291f4ee0d8 Mon Sep 17 00:00:00 2001 From: John McCall Date: Thu, 7 Mar 2013 21:37:17 +0000 Subject: [PATCH] Promote atomic type sizes up to a power of two, capped by MaxAtomicPromoteWidth. Fix a ton of terrible bugs with _Atomic types and (non-intrinsic-mediated) loads and stores thereto. git-svn-id: https://llvm.org/svn/llvm-project/cfe/trunk@176658 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/AST/ASTContext.cpp | 19 +- lib/CodeGen/CGAtomic.cpp | 505 ++++++++++++++++++++++++++++++++-- lib/CodeGen/CGDecl.cpp | 8 +- lib/CodeGen/CGExpr.cpp | 22 +- lib/CodeGen/CGExprAgg.cpp | 149 +++++++++- lib/CodeGen/CGExprComplex.cpp | 7 +- lib/CodeGen/CGValue.h | 30 +- lib/CodeGen/CodeGenFunction.h | 7 + lib/CodeGen/CodeGenModule.h | 4 + lib/CodeGen/CodeGenTypes.cpp | 24 +- test/CodeGen/c11atomics-ios.c | 214 ++++++++++++++ test/CodeGen/c11atomics.c | 207 ++++++++++++++ 12 files changed, 1150 insertions(+), 46 deletions(-) create mode 100644 test/CodeGen/c11atomics-ios.c diff --git a/lib/AST/ASTContext.cpp b/lib/AST/ASTContext.cpp index db1aa1a944..4580424696 100644 --- a/lib/AST/ASTContext.cpp +++ b/lib/AST/ASTContext.cpp @@ -1602,18 +1602,21 @@ ASTContext::getTypeInfoImpl(const Type *T) const { } case Type::Atomic: { + // Start with the base type information. std::pair Info = getTypeInfo(cast(T)->getValueType()); Width = Info.first; Align = Info.second; - if (Width != 0 && Width <= Target->getMaxAtomicPromoteWidth() && - llvm::isPowerOf2_64(Width)) { - // We can potentially perform lock-free atomic operations for this - // type; promote the alignment appropriately. - // FIXME: We could potentially promote the width here as well... - // is that worthwhile? (Non-struct atomic types generally have - // power-of-two size anyway, but structs might not. Requires a bit - // of implementation work to make sure we zero out the extra bits.) + + // If the size of the type doesn't exceed the platform's max + // atomic promotion width, make the size and alignment more + // favorable to atomic operations: + if (Width != 0 && Width <= Target->getMaxAtomicPromoteWidth()) { + // Round the size up to a power of 2. + if (!llvm::isPowerOf2_64(Width)) + Width = llvm::NextPowerOf2(Width); + + // Set the alignment equal to the size. Align = static_cast(Width); } } diff --git a/lib/CodeGen/CGAtomic.cpp b/lib/CodeGen/CGAtomic.cpp index f17e48d2f2..817d5c4cc6 100644 --- a/lib/CodeGen/CGAtomic.cpp +++ b/lib/CodeGen/CGAtomic.cpp @@ -17,10 +17,169 @@ #include "clang/AST/ASTContext.h" #include "llvm/IR/DataLayout.h" #include "llvm/IR/Intrinsics.h" +#include "llvm/IR/Operator.h" using namespace clang; using namespace CodeGen; +// The ABI values for various atomic memory orderings. 
+enum AtomicOrderingKind { + AO_ABI_memory_order_relaxed = 0, + AO_ABI_memory_order_consume = 1, + AO_ABI_memory_order_acquire = 2, + AO_ABI_memory_order_release = 3, + AO_ABI_memory_order_acq_rel = 4, + AO_ABI_memory_order_seq_cst = 5 +}; + +namespace { + class AtomicInfo { + CodeGenFunction &CGF; + QualType AtomicTy; + QualType ValueTy; + uint64_t AtomicSizeInBits; + uint64_t ValueSizeInBits; + CharUnits AtomicAlign; + CharUnits ValueAlign; + CharUnits LValueAlign; + TypeEvaluationKind EvaluationKind; + bool UseLibcall; + public: + AtomicInfo(CodeGenFunction &CGF, LValue &lvalue) : CGF(CGF) { + assert(lvalue.isSimple()); + + AtomicTy = lvalue.getType(); + ValueTy = AtomicTy->castAs()->getValueType(); + EvaluationKind = CGF.getEvaluationKind(ValueTy); + + ASTContext &C = CGF.getContext(); + + uint64_t valueAlignInBits; + llvm::tie(ValueSizeInBits, valueAlignInBits) = C.getTypeInfo(ValueTy); + + uint64_t atomicAlignInBits; + llvm::tie(AtomicSizeInBits, atomicAlignInBits) = C.getTypeInfo(AtomicTy); + + assert(ValueSizeInBits <= AtomicSizeInBits); + assert(valueAlignInBits <= atomicAlignInBits); + + AtomicAlign = C.toCharUnitsFromBits(atomicAlignInBits); + ValueAlign = C.toCharUnitsFromBits(valueAlignInBits); + if (lvalue.getAlignment().isZero()) + lvalue.setAlignment(AtomicAlign); + + UseLibcall = + (AtomicSizeInBits > uint64_t(C.toBits(lvalue.getAlignment())) || + AtomicSizeInBits > C.getTargetInfo().getMaxAtomicInlineWidth()); + } + + QualType getAtomicType() const { return AtomicTy; } + QualType getValueType() const { return ValueTy; } + CharUnits getAtomicAlignment() const { return AtomicAlign; } + CharUnits getValueAlignment() const { return ValueAlign; } + uint64_t getAtomicSizeInBits() const { return AtomicSizeInBits; } + uint64_t getValueSizeInBits() const { return AtomicSizeInBits; } + TypeEvaluationKind getEvaluationKind() const { return EvaluationKind; } + bool shouldUseLibcall() const { return UseLibcall; } + + /// Is the atomic size larger than the underlying value type? + /// + /// Note that the absence of padding does not mean that atomic + /// objects are completely interchangeable with non-atomic + /// objects: we might have promoted the alignment of a type + /// without making it bigger. + bool hasPadding() const { + return (ValueSizeInBits != AtomicSizeInBits); + } + + void emitMemSetZeroIfNecessary(LValue dest) const; + + llvm::Value *getAtomicSizeValue() const { + CharUnits size = CGF.getContext().toCharUnitsFromBits(AtomicSizeInBits); + return CGF.CGM.getSize(size); + } + + /// Cast the given pointer to an integer pointer suitable for + /// atomic operations. + llvm::Value *emitCastToAtomicIntPointer(llvm::Value *addr) const; + + /// Turn an atomic-layout object into an r-value. + RValue convertTempToRValue(llvm::Value *addr, + AggValueSlot resultSlot) const; + + /// Copy an atomic r-value into atomic-layout memory. + void emitCopyIntoMemory(RValue rvalue, LValue lvalue) const; + + /// Project an l-value down to the value field. + LValue projectValue(LValue lvalue) const { + llvm::Value *addr = lvalue.getAddress(); + if (hasPadding()) + addr = CGF.Builder.CreateStructGEP(addr, 0); + + return LValue::MakeAddr(addr, getValueType(), lvalue.getAlignment(), + CGF.getContext(), lvalue.getTBAAInfo()); + } + + /// Materialize an atomic r-value in atomic-layout memory. 
+ llvm::Value *materializeRValue(RValue rvalue) const; + + private: + bool requiresMemSetZero(llvm::Type *type) const; + }; +} + +static RValue emitAtomicLibcall(CodeGenFunction &CGF, + StringRef fnName, + QualType resultType, + CallArgList &args) { + const CGFunctionInfo &fnInfo = + CGF.CGM.getTypes().arrangeFreeFunctionCall(resultType, args, + FunctionType::ExtInfo(), RequiredArgs::All); + llvm::FunctionType *fnTy = CGF.CGM.getTypes().GetFunctionType(fnInfo); + llvm::Constant *fn = CGF.CGM.CreateRuntimeFunction(fnTy, fnName); + return CGF.EmitCall(fnInfo, fn, ReturnValueSlot(), args); +} + +/// Does a store of the given IR type modify the full expected width? +static bool isFullSizeType(CodeGenModule &CGM, llvm::Type *type, + uint64_t expectedSize) { + return (CGM.getDataLayout().getTypeStoreSize(type) * 8 == expectedSize); +} + +/// Does the atomic type require memsetting to zero before initialization? +/// +/// The IR type is provided as a way of making certain queries faster. +bool AtomicInfo::requiresMemSetZero(llvm::Type *type) const { + // If the atomic type has size padding, we definitely need a memset. + if (hasPadding()) return true; + + // Otherwise, do some simple heuristics to try to avoid it: + switch (getEvaluationKind()) { + // For scalars and complexes, check whether the store size of the + // type uses the full size. + case TEK_Scalar: + return !isFullSizeType(CGF.CGM, type, AtomicSizeInBits); + case TEK_Complex: + return !isFullSizeType(CGF.CGM, type->getStructElementType(0), + AtomicSizeInBits / 2); + + // Just be pessimistic about aggregates. + case TEK_Aggregate: + return true; + } + llvm_unreachable("bad evaluation kind"); +} + +void AtomicInfo::emitMemSetZeroIfNecessary(LValue dest) const { + llvm::Value *addr = dest.getAddress(); + if (!requiresMemSetZero(addr->getType()->getPointerElementType())) + return; + + CGF.Builder.CreateMemSet(addr, llvm::ConstantInt::get(CGF.Int8Ty, 0), + AtomicSizeInBits / 8, + dest.getAlignment().getQuantity()); +} + static void EmitAtomicOp(CodeGenFunction &CGF, AtomicExpr *E, llvm::Value *Dest, llvm::Value *Ptr, llvm::Value *Val1, llvm::Value *Val2, @@ -177,24 +336,9 @@ RValue CodeGenFunction::EmitAtomicExpr(AtomicExpr *E, llvm::Value *Dest) { if (E->getOp() == AtomicExpr::AO__c11_atomic_init) { assert(!Dest && "Init does not return a value"); - LValue LV = MakeAddrLValue(Ptr, AtomicTy, alignChars); - switch (getEvaluationKind(E->getVal1()->getType())) { - case TEK_Scalar: - EmitScalarInit(EmitScalarExpr(E->getVal1()), LV); - return RValue::get(0); - case TEK_Complex: - EmitComplexExprIntoLValue(E->getVal1(), LV, /*isInit*/ true); - return RValue::get(0); - case TEK_Aggregate: { - AggValueSlot Slot = AggValueSlot::forLValue(LV, - AggValueSlot::IsNotDestructed, - AggValueSlot::DoesNotNeedGCBarriers, - AggValueSlot::IsNotAliased); - EmitAggExpr(E->getVal1(), Slot); - return RValue::get(0); - } - } - llvm_unreachable("bad evaluation kind"); + LValue lvalue = LValue::MakeAddr(Ptr, AtomicTy, alignChars, getContext()); + EmitAtomicInit(E->getVal1(), lvalue); + return RValue::get(0); } Order = EmitScalarExpr(E->getOrder()); @@ -385,30 +529,30 @@ RValue CodeGenFunction::EmitAtomicExpr(AtomicExpr *E, llvm::Value *Dest) { if (isa(Order)) { int ord = cast(Order)->getZExtValue(); switch (ord) { - case 0: // memory_order_relaxed + case AO_ABI_memory_order_relaxed: EmitAtomicOp(*this, E, Dest, Ptr, Val1, Val2, Size, Align, llvm::Monotonic); break; - case 1: // memory_order_consume - case 2: // memory_order_acquire + case 
AO_ABI_memory_order_consume: + case AO_ABI_memory_order_acquire: if (IsStore) break; // Avoid crashing on code with undefined behavior EmitAtomicOp(*this, E, Dest, Ptr, Val1, Val2, Size, Align, llvm::Acquire); break; - case 3: // memory_order_release + case AO_ABI_memory_order_release: if (IsLoad) break; // Avoid crashing on code with undefined behavior EmitAtomicOp(*this, E, Dest, Ptr, Val1, Val2, Size, Align, llvm::Release); break; - case 4: // memory_order_acq_rel + case AO_ABI_memory_order_acq_rel: if (IsLoad || IsStore) break; // Avoid crashing on code with undefined behavior EmitAtomicOp(*this, E, Dest, Ptr, Val1, Val2, Size, Align, llvm::AcquireRelease); break; - case 5: // memory_order_seq_cst + case AO_ABI_memory_order_seq_cst: EmitAtomicOp(*this, E, Dest, Ptr, Val1, Val2, Size, Align, llvm::SequentiallyConsistent); break; @@ -483,3 +627,316 @@ RValue CodeGenFunction::EmitAtomicExpr(AtomicExpr *E, llvm::Value *Dest) { return RValue::get(0); return convertTempToRValue(OrigDest, E->getType()); } + +llvm::Value *AtomicInfo::emitCastToAtomicIntPointer(llvm::Value *addr) const { + unsigned addrspace = + cast(addr->getType())->getAddressSpace(); + llvm::IntegerType *ty = + llvm::IntegerType::get(CGF.getLLVMContext(), AtomicSizeInBits); + return CGF.Builder.CreateBitCast(addr, ty->getPointerTo(addrspace)); +} + +RValue AtomicInfo::convertTempToRValue(llvm::Value *addr, + AggValueSlot resultSlot) const { + if (EvaluationKind == TEK_Aggregate) { + // Nothing to do if the result is ignored. + if (resultSlot.isIgnored()) return resultSlot.asRValue(); + + assert(resultSlot.getAddr() == addr || hasPadding()); + + // In these cases, we should have emitted directly into the result slot. + if (!hasPadding() || resultSlot.isValueOfAtomic()) + return resultSlot.asRValue(); + + // Otherwise, fall into the common path. + } + + // Drill into the padding structure if we have one. + if (hasPadding()) + addr = CGF.Builder.CreateStructGEP(addr, 0); + + // If we're emitting to an aggregate, copy into the result slot. + if (EvaluationKind == TEK_Aggregate) { + CGF.EmitAggregateCopy(resultSlot.getAddr(), addr, getValueType(), + resultSlot.isVolatile()); + return resultSlot.asRValue(); + } + + // Otherwise, just convert the temporary to an r-value using the + // normal conversion routine. + return CGF.convertTempToRValue(addr, getValueType()); +} + +/// Emit a load from an l-value of atomic type. Note that the r-value +/// we produce is an r-value of the atomic *value* type. +RValue CodeGenFunction::EmitAtomicLoad(LValue src, AggValueSlot resultSlot) { + AtomicInfo atomics(*this, src); + + // Check whether we should use a library call. 
+ if (atomics.shouldUseLibcall()) { + llvm::Value *tempAddr; + if (resultSlot.isValueOfAtomic()) { + assert(atomics.getEvaluationKind() == TEK_Aggregate); + tempAddr = resultSlot.getPaddedAtomicAddr(); + } else if (!resultSlot.isIgnored() && !atomics.hasPadding()) { + assert(atomics.getEvaluationKind() == TEK_Aggregate); + tempAddr = resultSlot.getAddr(); + } else { + tempAddr = CreateMemTemp(atomics.getAtomicType(), "atomic-load-temp"); + } + + // void __atomic_load(size_t size, void *mem, void *return, int order); + CallArgList args; + args.add(RValue::get(atomics.getAtomicSizeValue()), + getContext().getSizeType()); + args.add(RValue::get(EmitCastToVoidPtr(src.getAddress())), + getContext().VoidPtrTy); + args.add(RValue::get(EmitCastToVoidPtr(tempAddr)), + getContext().VoidPtrTy); + args.add(RValue::get(llvm::ConstantInt::get(IntTy, + AO_ABI_memory_order_seq_cst)), + getContext().IntTy); + emitAtomicLibcall(*this, "__atomic_load", getContext().VoidTy, args); + + // Produce the r-value. + return atomics.convertTempToRValue(tempAddr, resultSlot); + } + + // Okay, we're doing this natively. + llvm::Value *addr = atomics.emitCastToAtomicIntPointer(src.getAddress()); + llvm::LoadInst *load = Builder.CreateLoad(addr, "atomic-load"); + load->setAtomic(llvm::SequentiallyConsistent); + + // Other decoration. + load->setAlignment(src.getAlignment().getQuantity()); + if (src.isVolatileQualified()) + load->setVolatile(true); + if (src.getTBAAInfo()) + CGM.DecorateInstruction(load, src.getTBAAInfo()); + + // Okay, turn that back into the original value type. + QualType valueType = atomics.getValueType(); + llvm::Value *result = load; + + // If we're ignoring an aggregate return, don't do anything. + if (atomics.getEvaluationKind() == TEK_Aggregate && resultSlot.isIgnored()) + return RValue::getAggregate(0, false); + + // The easiest way to do this this is to go through memory, but we + // try not to in some easy cases. + if (atomics.getEvaluationKind() == TEK_Scalar && !atomics.hasPadding()) { + llvm::Type *resultTy = CGM.getTypes().ConvertTypeForMem(valueType); + if (isa(resultTy)) { + assert(result->getType() == resultTy); + result = EmitFromMemory(result, valueType); + } else if (isa(resultTy)) { + result = Builder.CreateIntToPtr(result, resultTy); + } else { + result = Builder.CreateBitCast(result, resultTy); + } + return RValue::get(result); + } + + // Create a temporary. This needs to be big enough to hold the + // atomic integer. + llvm::Value *temp; + bool tempIsVolatile = false; + CharUnits tempAlignment; + if (atomics.getEvaluationKind() == TEK_Aggregate && + (!atomics.hasPadding() || resultSlot.isValueOfAtomic())) { + assert(!resultSlot.isIgnored()); + if (resultSlot.isValueOfAtomic()) { + temp = resultSlot.getPaddedAtomicAddr(); + tempAlignment = atomics.getAtomicAlignment(); + } else { + temp = resultSlot.getAddr(); + tempAlignment = atomics.getValueAlignment(); + } + tempIsVolatile = resultSlot.isVolatile(); + } else { + temp = CreateMemTemp(atomics.getAtomicType(), "atomic-load-temp"); + tempAlignment = atomics.getAtomicAlignment(); + } + + // Slam the integer into the temporary. + llvm::Value *castTemp = atomics.emitCastToAtomicIntPointer(temp); + Builder.CreateAlignedStore(result, castTemp, tempAlignment.getQuantity()) + ->setVolatile(tempIsVolatile); + + return atomics.convertTempToRValue(temp, resultSlot); +} + + + +/// Copy an r-value into memory as part of storing to an atomic type. +/// This needs to create a bit-pattern suitable for atomic operations. 
+void AtomicInfo::emitCopyIntoMemory(RValue rvalue, LValue dest) const { + // If we have an r-value, the rvalue should be of the atomic type, + // which means that the caller is responsible for having zeroed + // any padding. Just do an aggregate copy of that type. + if (rvalue.isAggregate()) { + CGF.EmitAggregateCopy(dest.getAddress(), + rvalue.getAggregateAddr(), + getAtomicType(), + (rvalue.isVolatileQualified() + || dest.isVolatileQualified()), + dest.getAlignment()); + return; + } + + // Okay, otherwise we're copying stuff. + + // Zero out the buffer if necessary. + emitMemSetZeroIfNecessary(dest); + + // Drill past the padding if present. + dest = projectValue(dest); + + // Okay, store the rvalue in. + if (rvalue.isScalar()) { + CGF.EmitStoreOfScalar(rvalue.getScalarVal(), dest, /*init*/ true); + } else { + CGF.EmitStoreOfComplex(rvalue.getComplexVal(), dest, /*init*/ true); + } +} + + +/// Materialize an r-value into memory for the purposes of storing it +/// to an atomic type. +llvm::Value *AtomicInfo::materializeRValue(RValue rvalue) const { + // Aggregate r-values are already in memory, and EmitAtomicStore + // requires them to be values of the atomic type. + if (rvalue.isAggregate()) + return rvalue.getAggregateAddr(); + + // Otherwise, make a temporary and materialize into it. + llvm::Value *temp = CGF.CreateMemTemp(getAtomicType(), "atomic-store-temp"); + LValue tempLV = CGF.MakeAddrLValue(temp, getAtomicType(), getAtomicAlignment()); + emitCopyIntoMemory(rvalue, tempLV); + return temp; +} + +/// Emit a store to an l-value of atomic type. +/// +/// Note that the r-value is expected to be an r-value *of the atomic +/// type*; this means that for aggregate r-values, it should include +/// storage for any padding that was necessary. +void CodeGenFunction::EmitAtomicStore(RValue rvalue, LValue dest, + bool isInit) { + // If this is an aggregate r-value, it should agree in type except + // maybe for address-space qualification. + assert(!rvalue.isAggregate() || + rvalue.getAggregateAddr()->getType()->getPointerElementType() + == dest.getAddress()->getType()->getPointerElementType()); + + AtomicInfo atomics(*this, dest); + + // If this is an initialization, just put the value there normally. + if (isInit) { + atomics.emitCopyIntoMemory(rvalue, dest); + return; + } + + // Check whether we should use a library call. + if (atomics.shouldUseLibcall()) { + // Produce a source address. + llvm::Value *srcAddr = atomics.materializeRValue(rvalue); + + // void __atomic_store(size_t size, void *mem, void *val, int order) + CallArgList args; + args.add(RValue::get(atomics.getAtomicSizeValue()), + getContext().getSizeType()); + args.add(RValue::get(EmitCastToVoidPtr(dest.getAddress())), + getContext().VoidPtrTy); + args.add(RValue::get(EmitCastToVoidPtr(srcAddr)), + getContext().VoidPtrTy); + args.add(RValue::get(llvm::ConstantInt::get(IntTy, + AO_ABI_memory_order_seq_cst)), + getContext().IntTy); + emitAtomicLibcall(*this, "__atomic_store", getContext().VoidTy, args); + return; + } + + // Okay, we're doing this natively. + llvm::Value *intValue; + + // If we've got a scalar value of the right size, try to avoid going + // through memory. 
+ if (rvalue.isScalar() && !atomics.hasPadding()) { + llvm::Value *value = rvalue.getScalarVal(); + if (isa(value->getType())) { + intValue = value; + } else { + llvm::IntegerType *inputIntTy = + llvm::IntegerType::get(getLLVMContext(), atomics.getValueSizeInBits()); + if (isa(value->getType())) { + intValue = Builder.CreatePtrToInt(value, inputIntTy); + } else { + intValue = Builder.CreateBitCast(value, inputIntTy); + } + } + + // Otherwise, we need to go through memory. + } else { + // Put the r-value in memory. + llvm::Value *addr = atomics.materializeRValue(rvalue); + + // Cast the temporary to the atomic int type and pull a value out. + addr = atomics.emitCastToAtomicIntPointer(addr); + intValue = Builder.CreateAlignedLoad(addr, + atomics.getAtomicAlignment().getQuantity()); + } + + // Do the atomic store. + llvm::Value *addr = atomics.emitCastToAtomicIntPointer(dest.getAddress()); + llvm::StoreInst *store = Builder.CreateStore(intValue, addr); + + // Initializations don't need to be atomic. + if (!isInit) store->setAtomic(llvm::SequentiallyConsistent); + + // Other decoration. + store->setAlignment(dest.getAlignment().getQuantity()); + if (dest.isVolatileQualified()) + store->setVolatile(true); + if (dest.getTBAAInfo()) + CGM.DecorateInstruction(store, dest.getTBAAInfo()); +} + +void CodeGenFunction::EmitAtomicInit(Expr *init, LValue dest) { + AtomicInfo atomics(*this, dest); + + switch (atomics.getEvaluationKind()) { + case TEK_Scalar: { + llvm::Value *value = EmitScalarExpr(init); + atomics.emitCopyIntoMemory(RValue::get(value), dest); + return; + } + + case TEK_Complex: { + ComplexPairTy value = EmitComplexExpr(init); + atomics.emitCopyIntoMemory(RValue::getComplex(value), dest); + return; + } + + case TEK_Aggregate: { + // Memset the buffer first if there's any possibility of + // uninitialized internal bits. + atomics.emitMemSetZeroIfNecessary(dest); + + // HACK: whether the initializer actually has an atomic type + // doesn't really seem reliable right now. + if (!init->getType()->isAtomicType()) { + dest = atomics.projectValue(dest); + } + + // Evaluate the expression directly into the destination. + AggValueSlot slot = AggValueSlot::forLValue(dest, + AggValueSlot::IsNotDestructed, + AggValueSlot::DoesNotNeedGCBarriers, + AggValueSlot::IsNotAliased); + EmitAggExpr(init, slot); + return; + } + } + llvm_unreachable("bad evaluation kind"); +} diff --git a/lib/CodeGen/CGDecl.cpp b/lib/CodeGen/CGDecl.cpp index bb5d6389d2..0e001301ae 100644 --- a/lib/CodeGen/CGDecl.cpp +++ b/lib/CodeGen/CGDecl.cpp @@ -1129,11 +1129,15 @@ void CodeGenFunction::EmitExprAsInit(const Expr *init, return; } case TEK_Aggregate: - // TODO: how can we delay here if D is captured by its initializer? - EmitAggExpr(init, AggValueSlot::forLValue(lvalue, + if (type->isAtomicType()) { + EmitAtomicInit(const_cast(init), lvalue); + } else { + // TODO: how can we delay here if D is captured by its initializer? + EmitAggExpr(init, AggValueSlot::forLValue(lvalue, AggValueSlot::IsDestructed, AggValueSlot::DoesNotNeedGCBarriers, AggValueSlot::IsNotAliased)); + } MaybeEmitStdInitializerListCleanup(lvalue.getAddress(), init); return; } diff --git a/lib/CodeGen/CGExpr.cpp b/lib/CodeGen/CGExpr.cpp index ba816af352..a170028def 100644 --- a/lib/CodeGen/CGExpr.cpp +++ b/lib/CodeGen/CGExpr.cpp @@ -1144,6 +1144,14 @@ llvm::Value *CodeGenFunction::EmitLoadOfScalar(llvm::Value *Addr, bool Volatile, return EmitFromMemory(V, Ty); } } + + // Atomic operations have to be done on integral types. 
+ if (Ty->isAtomicType()) { + LValue lvalue = LValue::MakeAddr(Addr, Ty, + CharUnits::fromQuantity(Alignment), + getContext(), TBAAInfo); + return EmitAtomicLoad(lvalue).getScalarVal(); + } llvm::LoadInst *Load = Builder.CreateLoad(Addr); if (Volatile) @@ -1152,9 +1160,6 @@ llvm::Value *CodeGenFunction::EmitLoadOfScalar(llvm::Value *Addr, bool Volatile, Load->setAlignment(Alignment); if (TBAAInfo) CGM.DecorateInstruction(Load, TBAAInfo); - // If this is an atomic type, all normal reads must be atomic - if (Ty->isAtomicType()) - Load->setAtomic(llvm::SequentiallyConsistent); if ((SanOpts->Bool && hasBooleanRepresentation(Ty)) || (SanOpts->Enum && Ty->getAs())) { @@ -1251,13 +1256,20 @@ void CodeGenFunction::EmitStoreOfScalar(llvm::Value *Value, llvm::Value *Addr, Value = EmitToMemory(Value, Ty); + if (Ty->isAtomicType()) { + EmitAtomicStore(RValue::get(Value), + LValue::MakeAddr(Addr, Ty, + CharUnits::fromQuantity(Alignment), + getContext(), TBAAInfo), + isInit); + return; + } + llvm::StoreInst *Store = Builder.CreateStore(Value, Addr, Volatile); if (Alignment) Store->setAlignment(Alignment); if (TBAAInfo) CGM.DecorateInstruction(Store, TBAAInfo); - if (!isInit && Ty->isAtomicType()) - Store->setAtomic(llvm::SequentiallyConsistent); } void CodeGenFunction::EmitStoreOfScalar(llvm::Value *value, LValue lvalue, diff --git a/lib/CodeGen/CGExprAgg.cpp b/lib/CodeGen/CGExprAgg.cpp index f8921db655..1ac13c01ed 100644 --- a/lib/CodeGen/CGExprAgg.cpp +++ b/lib/CodeGen/CGExprAgg.cpp @@ -29,6 +29,14 @@ using namespace CodeGen; // Aggregate Expression Emitter //===----------------------------------------------------------------------===// +llvm::Value *AggValueSlot::getPaddedAtomicAddr() const { + assert(isValueOfAtomic()); + llvm::GEPOperator *op = cast(getAddr()); + assert(op->getNumIndices() == 2); + assert(op->hasAllZeroIndices()); + return op->getPointerOperand(); +} + namespace { class AggExprEmitter : public StmtVisitor { CodeGenFunction &CGF; @@ -190,6 +198,38 @@ public: CGF.EmitAtomicExpr(E, EnsureSlot(E->getType()).getAddr()); } }; + +/// A helper class for emitting expressions into the value sub-object +/// of a padded atomic type. +class ValueDestForAtomic { + AggValueSlot Dest; +public: + ValueDestForAtomic(CodeGenFunction &CGF, AggValueSlot dest, QualType type) + : Dest(dest) { + assert(!Dest.isValueOfAtomic()); + if (!Dest.isIgnored() && CGF.CGM.isPaddedAtomicType(type)) { + llvm::Value *valueAddr = CGF.Builder.CreateStructGEP(Dest.getAddr(), 0); + Dest = AggValueSlot::forAddr(valueAddr, + Dest.getAlignment(), + Dest.getQualifiers(), + Dest.isExternallyDestructed(), + Dest.requiresGCollection(), + Dest.isPotentiallyAliased(), + Dest.isZeroed(), + AggValueSlot::IsValueOfAtomic); + } + } + + const AggValueSlot &getDest() const { return Dest; } + + ~ValueDestForAtomic() { + // Kill the GEP if we made one and it didn't end up used. + if (Dest.isValueOfAtomic()) { + llvm::Instruction *addr = cast(Dest.getAddr()); + if (addr->use_empty()) addr->eraseFromParent(); + } + } +}; } // end anonymous namespace. //===----------------------------------------------------------------------===// @@ -201,6 +241,14 @@ public: /// then loads the result into DestPtr. void AggExprEmitter::EmitAggLoadOfLValue(const Expr *E) { LValue LV = CGF.EmitLValue(E); + + // If the type of the l-value is atomic, then do an atomic load. 
+ if (LV.getType()->isAtomicType()) { + ValueDestForAtomic valueDest(CGF, Dest, LV.getType()); + CGF.EmitAtomicLoad(LV, valueDest.getDest()); + return; + } + EmitFinalDestCopy(E->getType(), LV); } @@ -543,6 +591,20 @@ AggExprEmitter::VisitCompoundLiteralExpr(CompoundLiteralExpr *E) { CGF.EmitAggExpr(E->getInitializer(), Slot); } +/// Attempt to look through various unimportant expressions to find a +/// cast of the given kind. +static Expr *findPeephole(Expr *op, CastKind kind) { + while (true) { + op = op->IgnoreParens(); + if (CastExpr *castE = dyn_cast(op)) { + if (castE->getCastKind() == kind) + return castE->getSubExpr(); + if (castE->getCastKind() == CK_NoOp) + continue; + } + return 0; + } +} void AggExprEmitter::VisitCastExpr(CastExpr *E) { switch (E->getCastKind()) { @@ -582,6 +644,75 @@ void AggExprEmitter::VisitCastExpr(CastExpr *E) { "should have been unpacked before we got here"); } + case CK_NonAtomicToAtomic: + case CK_AtomicToNonAtomic: { + bool isToAtomic = (E->getCastKind() == CK_NonAtomicToAtomic); + + // Determine the atomic and value types. + QualType atomicType = E->getSubExpr()->getType(); + QualType valueType = E->getType(); + if (isToAtomic) std::swap(atomicType, valueType); + + assert(atomicType->isAtomicType()); + assert(CGF.getContext().hasSameUnqualifiedType(valueType, + atomicType->castAs()->getValueType())); + + // Just recurse normally if we're ignoring the result or the + // atomic type doesn't change representation. + if (Dest.isIgnored() || !CGF.CGM.isPaddedAtomicType(atomicType)) { + return Visit(E->getSubExpr()); + } + + CastKind peepholeTarget = + (isToAtomic ? CK_AtomicToNonAtomic : CK_NonAtomicToAtomic); + + // These two cases are reverses of each other; try to peephole them. + if (Expr *op = findPeephole(E->getSubExpr(), peepholeTarget)) { + assert(CGF.getContext().hasSameUnqualifiedType(op->getType(), + E->getType()) && + "peephole significantly changed types?"); + return Visit(op); + } + + // If we're converting an r-value of non-atomic type to an r-value + // of atomic type, just make an atomic temporary, emit into that, + // and then copy the value out. (FIXME: do we need to + // zero-initialize it first?) + if (isToAtomic) { + ValueDestForAtomic valueDest(CGF, Dest, atomicType); + CGF.EmitAggExpr(E->getSubExpr(), valueDest.getDest()); + return; + } + + // Otherwise, we're converting an atomic type to a non-atomic type. + + // If the dest is a value-of-atomic subobject, drill back out. + if (Dest.isValueOfAtomic()) { + AggValueSlot atomicSlot = + AggValueSlot::forAddr(Dest.getPaddedAtomicAddr(), + Dest.getAlignment(), + Dest.getQualifiers(), + Dest.isExternallyDestructed(), + Dest.requiresGCollection(), + Dest.isPotentiallyAliased(), + Dest.isZeroed(), + AggValueSlot::IsNotValueOfAtomic); + CGF.EmitAggExpr(E->getSubExpr(), atomicSlot); + return; + } + + // Otherwise, make an atomic temporary, emit into that, and then + // copy the value out. + AggValueSlot atomicSlot = + CGF.CreateAggTemp(atomicType, "atomic-to-nonatomic.temp"); + CGF.EmitAggExpr(E->getSubExpr(), atomicSlot); + + llvm::Value *valueAddr = + Builder.CreateStructGEP(atomicSlot.getAddr(), 0); + RValue rvalue = RValue::getAggregate(valueAddr, atomicSlot.isVolatile()); + return EmitFinalDestCopy(valueType, rvalue); + } + case CK_LValueToRValue: // If we're loading from a volatile type, force the destination // into existence. 
@@ -589,11 +720,10 @@ void AggExprEmitter::VisitCastExpr(CastExpr *E) { EnsureDest(E->getType()); return Visit(E->getSubExpr()); } + // fallthrough case CK_NoOp: - case CK_AtomicToNonAtomic: - case CK_NonAtomicToAtomic: case CK_UserDefinedConversion: case CK_ConstructorConversion: assert(CGF.getContext().hasSameUnqualifiedType(E->getSubExpr()->getType(), @@ -775,6 +905,12 @@ void AggExprEmitter::VisitBinAssign(const BinaryOperator *E) { // Now emit the LHS and copy into it. LValue LHS = CGF.EmitCheckedLValue(E->getLHS(), CodeGenFunction::TCK_Store); + // That copy is an atomic copy if the LHS is atomic. + if (LHS.getType()->isAtomicType()) { + CGF.EmitAtomicStore(Dest.asRValue(), LHS, /*isInit*/ false); + return; + } + EmitCopy(E->getLHS()->getType(), AggValueSlot::forLValue(LHS, AggValueSlot::IsDestructed, needsGC(E->getLHS()->getType()), @@ -785,6 +921,15 @@ void AggExprEmitter::VisitBinAssign(const BinaryOperator *E) { LValue LHS = CGF.EmitLValue(E->getLHS()); + // If we have an atomic type, evaluate into the destination and then + // do an atomic copy. + if (LHS.getType()->isAtomicType()) { + EnsureDest(E->getRHS()->getType()); + Visit(E->getRHS()); + CGF.EmitAtomicStore(Dest.asRValue(), LHS, /*isInit*/ false); + return; + } + // Codegen the RHS so that it stores directly into the LHS. AggValueSlot LHSSlot = AggValueSlot::forLValue(LHS, AggValueSlot::IsDestructed, diff --git a/lib/CodeGen/CGExprComplex.cpp b/lib/CodeGen/CGExprComplex.cpp index 840463b732..5fc73aa790 100644 --- a/lib/CodeGen/CGExprComplex.cpp +++ b/lib/CodeGen/CGExprComplex.cpp @@ -42,7 +42,6 @@ class ComplexExprEmitter : public StmtVisitor { CodeGenFunction &CGF; CGBuilderTy &Builder; - // True is we should ignore the value of a bool IgnoreReal; bool IgnoreImag; public: @@ -286,6 +285,9 @@ public: /// load the real and imaginary pieces, returning them as Real/Imag. ComplexPairTy ComplexExprEmitter::EmitLoadOfLValue(LValue lvalue) { assert(lvalue.isSimple() && "non-simple complex l-value?"); + if (lvalue.getType()->isAtomicType()) + return CGF.EmitAtomicLoad(lvalue).getComplexVal(); + llvm::Value *SrcPtr = lvalue.getAddress(); bool isVolatile = lvalue.isVolatileQualified(); @@ -310,6 +312,9 @@ ComplexPairTy ComplexExprEmitter::EmitLoadOfLValue(LValue lvalue) { void ComplexExprEmitter::EmitStoreOfComplex(ComplexPairTy Val, LValue lvalue, bool isInit) { + if (lvalue.getType()->isAtomicType()) + return CGF.EmitAtomicStore(RValue::getComplex(Val), lvalue, isInit); + llvm::Value *Ptr = lvalue.getAddress(); llvm::Value *RealPtr = Builder.CreateStructGEP(Ptr, 0, "real"); llvm::Value *ImagPtr = Builder.CreateStructGEP(Ptr, 1, "imag"); diff --git a/lib/CodeGen/CGValue.h b/lib/CodeGen/CGValue.h index 0bbd373443..6b0c271a08 100644 --- a/lib/CodeGen/CGValue.h +++ b/lib/CodeGen/CGValue.h @@ -350,11 +350,23 @@ class AggValueSlot { /// evaluating an expression which constructs such an object. bool AliasedFlag : 1; + /// ValueOfAtomicFlag - This is set to true if the slot is the value + /// subobject of an object the size of an _Atomic(T). The specific + /// guarantees this makes are: + /// - the address is guaranteed to be a getelementptr into the + /// padding struct and + /// - it is okay to store something the width of an _Atomic(T) + /// into the address. + /// Tracking this allows us to avoid some obviously unnecessary + /// memcpys. 
+ bool ValueOfAtomicFlag : 1; + public: enum IsAliased_t { IsNotAliased, IsAliased }; enum IsDestructed_t { IsNotDestructed, IsDestructed }; enum IsZeroed_t { IsNotZeroed, IsZeroed }; enum NeedsGCBarriers_t { DoesNotNeedGCBarriers, NeedsGCBarriers }; + enum IsValueOfAtomic_t { IsNotValueOfAtomic, IsValueOfAtomic }; /// ignored - Returns an aggregate value slot indicating that the /// aggregate value is being ignored. @@ -378,7 +390,9 @@ public: IsDestructed_t isDestructed, NeedsGCBarriers_t needsGC, IsAliased_t isAliased, - IsZeroed_t isZeroed = IsNotZeroed) { + IsZeroed_t isZeroed = IsNotZeroed, + IsValueOfAtomic_t isValueOfAtomic + = IsNotValueOfAtomic) { AggValueSlot AV; AV.Addr = addr; AV.Alignment = align.getQuantity(); @@ -387,6 +401,7 @@ public: AV.ObjCGCFlag = needsGC; AV.ZeroedFlag = isZeroed; AV.AliasedFlag = isAliased; + AV.ValueOfAtomicFlag = isValueOfAtomic; return AV; } @@ -394,9 +409,12 @@ public: IsDestructed_t isDestructed, NeedsGCBarriers_t needsGC, IsAliased_t isAliased, - IsZeroed_t isZeroed = IsNotZeroed) { + IsZeroed_t isZeroed = IsNotZeroed, + IsValueOfAtomic_t isValueOfAtomic + = IsNotValueOfAtomic) { return forAddr(LV.getAddress(), LV.getAlignment(), - LV.getQuals(), isDestructed, needsGC, isAliased, isZeroed); + LV.getQuals(), isDestructed, needsGC, isAliased, isZeroed, + isValueOfAtomic); } IsDestructed_t isExternallyDestructed() const { @@ -428,6 +446,12 @@ public: return Addr; } + IsValueOfAtomic_t isValueOfAtomic() const { + return IsValueOfAtomic_t(ValueOfAtomicFlag); + } + + llvm::Value *getPaddedAtomicAddr() const; + bool isIgnored() const { return Addr == 0; } diff --git a/lib/CodeGen/CodeGenFunction.h b/lib/CodeGen/CodeGenFunction.h index 4e6c72e10d..2be8dcc4a6 100644 --- a/lib/CodeGen/CodeGenFunction.h +++ b/lib/CodeGen/CodeGenFunction.h @@ -2149,6 +2149,13 @@ public: RValue convertTempToRValue(llvm::Value *addr, QualType type); + void EmitAtomicInit(Expr *E, LValue lvalue); + + RValue EmitAtomicLoad(LValue lvalue, + AggValueSlot slot = AggValueSlot::ignored()); + + void EmitAtomicStore(RValue rvalue, LValue lvalue, bool isInit); + /// EmitToMemory - Change a scalar value from its value /// representation to its in-memory representation. llvm::Value *EmitToMemory(llvm::Value *Value, QualType Ty); diff --git a/lib/CodeGen/CodeGenModule.h b/lib/CodeGen/CodeGenModule.h index f5ae9cce7b..2bddb6f79f 100644 --- a/lib/CodeGen/CodeGenModule.h +++ b/lib/CodeGen/CodeGenModule.h @@ -47,6 +47,7 @@ namespace llvm { namespace clang { class TargetCodeGenInfo; class ASTContext; + class AtomicType; class FunctionDecl; class IdentifierInfo; class ObjCMethodDecl; @@ -494,6 +495,9 @@ public: bool isTypeConstant(QualType QTy, bool ExcludeCtorDtor); + bool isPaddedAtomicType(QualType type); + bool isPaddedAtomicType(const AtomicType *type); + static void DecorateInstruction(llvm::Instruction *Inst, llvm::MDNode *TBAAInfo); diff --git a/lib/CodeGen/CodeGenTypes.cpp b/lib/CodeGen/CodeGenTypes.cpp index 259d10673d..8fc78e3de6 100644 --- a/lib/CodeGen/CodeGenTypes.cpp +++ b/lib/CodeGen/CodeGenTypes.cpp @@ -582,7 +582,21 @@ llvm::Type *CodeGenTypes::ConvertType(QualType T) { } case Type::Atomic: { - ResultType = ConvertType(cast(Ty)->getValueType()); + QualType valueType = cast(Ty)->getValueType(); + ResultType = ConvertTypeForMem(valueType); + + // Pad out to the inflated size if necessary. 
+ uint64_t valueSize = Context.getTypeSize(valueType); + uint64_t atomicSize = Context.getTypeSize(Ty); + if (valueSize != atomicSize) { + assert(valueSize < atomicSize); + llvm::Type *elts[] = { + ResultType, + llvm::ArrayType::get(CGM.Int8Ty, (atomicSize - valueSize) / 8) + }; + ResultType = llvm::StructType::get(getLLVMContext(), + llvm::makeArrayRef(elts)); + } break; } } @@ -593,6 +607,14 @@ llvm::Type *CodeGenTypes::ConvertType(QualType T) { return ResultType; } +bool CodeGenModule::isPaddedAtomicType(QualType type) { + return isPaddedAtomicType(type->castAs()); +} + +bool CodeGenModule::isPaddedAtomicType(const AtomicType *type) { + return Context.getTypeSize(type) != Context.getTypeSize(type->getValueType()); +} + /// ConvertRecordDeclType - Lay out a tagged decl type like struct or union. llvm::StructType *CodeGenTypes::ConvertRecordDeclType(const RecordDecl *RD) { // TagDecl's are not necessarily unique, instead use the (clang) diff --git a/test/CodeGen/c11atomics-ios.c b/test/CodeGen/c11atomics-ios.c new file mode 100644 index 0000000000..d1c9b14330 --- /dev/null +++ b/test/CodeGen/c11atomics-ios.c @@ -0,0 +1,214 @@ +// RUN: %clang_cc1 %s -emit-llvm -o - -triple=armv7-apple-ios -std=c11 | FileCheck %s + +// There isn't really anything special about iOS; it just happens to +// only deploy on processors with native atomics support, so it's a good +// way to test those code-paths. + +// This work was done in pursuit of . + +// CHECK: define arm_aapcscc void @testFloat(float* +void testFloat(_Atomic(float) *fp) { +// CHECK: [[FP:%.*]] = alloca float* +// CHECK-NEXT: [[X:%.*]] = alloca float +// CHECK-NEXT: [[F:%.*]] = alloca float +// CHECK-NEXT: store float* {{%.*}}, float** [[FP]] + +// CHECK-NEXT: [[T0:%.*]] = load float** [[FP]] +// CHECK-NEXT: store float 1.000000e+00, float* [[T0]], align 4 + __c11_atomic_init(fp, 1.0f); + +// CHECK-NEXT: store float 2.000000e+00, float* [[X]], align 4 + _Atomic(float) x = 2.0f; + +// CHECK-NEXT: [[T0:%.*]] = load float** [[FP]] +// CHECK-NEXT: [[T1:%.*]] = bitcast float* [[T0]] to i32* +// CHECK-NEXT: [[T2:%.*]] = load atomic i32* [[T1]] seq_cst, align 4 +// CHECK-NEXT: [[T3:%.*]] = bitcast i32 [[T2]] to float +// CHECK-NEXT: store float [[T3]], float* [[F]] + float f = *fp; + +// CHECK-NEXT: [[T0:%.*]] = load float* [[F]], align 4 +// CHECK-NEXT: [[T1:%.*]] = load float** [[FP]], align 4 +// CHECK-NEXT: [[T2:%.*]] = bitcast float [[T0]] to i32 +// CHECK-NEXT: [[T3:%.*]] = bitcast float* [[T1]] to i32* +// CHECK-NEXT: store atomic i32 [[T2]], i32* [[T3]] seq_cst, align 4 + *fp = f; + +// CHECK-NEXT: ret void +} + +// CHECK: define arm_aapcscc void @testComplexFloat([[CF:{ float, float }]]* +void testComplexFloat(_Atomic(_Complex float) *fp) { +// CHECK: [[FP:%.*]] = alloca [[CF]]*, align 4 +// CHECK-NEXT: [[X:%.*]] = alloca [[CF]], align 8 +// CHECK-NEXT: [[F:%.*]] = alloca [[CF]], align 4 +// CHECK-NEXT: [[TMP0:%.*]] = alloca [[CF]], align 8 +// CHECK-NEXT: [[TMP1:%.*]] = alloca [[CF]], align 8 +// CHECK-NEXT: store [[CF]]* + +// CHECK-NEXT: [[P:%.*]] = load [[CF]]** [[FP]] +// CHECK-NEXT: [[T0:%.*]] = getelementptr inbounds [[CF]]* [[P]], i32 0, i32 0 +// CHECK-NEXT: [[T1:%.*]] = getelementptr inbounds [[CF]]* [[P]], i32 0, i32 1 +// CHECK-NEXT: store float 1.000000e+00, float* [[T0]] +// CHECK-NEXT: store float 0.000000e+00, float* [[T1]] + __c11_atomic_init(fp, 1.0f); + +// CHECK-NEXT: [[T0:%.*]] = getelementptr inbounds [[CF]]* [[X]], i32 0, i32 0 +// CHECK-NEXT: [[T1:%.*]] = getelementptr inbounds [[CF]]* [[X]], i32 0, i32 1 +// 
CHECK-NEXT: store float 2.000000e+00, float* [[T0]] +// CHECK-NEXT: store float 0.000000e+00, float* [[T1]] + _Atomic(_Complex float) x = 2.0f; + +// CHECK-NEXT: [[T0:%.*]] = load [[CF]]** [[FP]] +// CHECK-NEXT: [[T1:%.*]] = bitcast [[CF]]* [[T0]] to i64* +// CHECK-NEXT: [[T2:%.*]] = load atomic i64* [[T1]] seq_cst, align 8 +// CHECK-NEXT: [[T3:%.*]] = bitcast [[CF]]* [[TMP0]] to i64* +// CHECK-NEXT: store i64 [[T2]], i64* [[T3]], align 8 +// CHECK-NEXT: [[T0:%.*]] = getelementptr inbounds [[CF]]* [[TMP0]], i32 0, i32 0 +// CHECK-NEXT: [[R:%.*]] = load float* [[T0]] +// CHECK-NEXT: [[T0:%.*]] = getelementptr inbounds [[CF]]* [[TMP0]], i32 0, i32 1 +// CHECK-NEXT: [[I:%.*]] = load float* [[T0]] +// CHECK-NEXT: [[T0:%.*]] = getelementptr inbounds [[CF]]* [[F]], i32 0, i32 0 +// CHECK-NEXT: [[T1:%.*]] = getelementptr inbounds [[CF]]* [[F]], i32 0, i32 1 +// CHECK-NEXT: store float [[R]], float* [[T0]] +// CHECK-NEXT: store float [[I]], float* [[T1]] + _Complex float f = *fp; + +// CHECK-NEXT: [[T0:%.*]] = getelementptr inbounds [[CF]]* [[F]], i32 0, i32 0 +// CHECK-NEXT: [[R:%.*]] = load float* [[T0]] +// CHECK-NEXT: [[T0:%.*]] = getelementptr inbounds [[CF]]* [[F]], i32 0, i32 1 +// CHECK-NEXT: [[I:%.*]] = load float* [[T0]] +// CHECK-NEXT: [[DEST:%.*]] = load [[CF]]** [[FP]], align 4 +// CHECK-NEXT: [[T0:%.*]] = getelementptr inbounds [[CF]]* [[TMP1]], i32 0, i32 0 +// CHECK-NEXT: [[T1:%.*]] = getelementptr inbounds [[CF]]* [[TMP1]], i32 0, i32 1 +// CHECK-NEXT: store float [[R]], float* [[T0]] +// CHECK-NEXT: store float [[I]], float* [[T1]] +// CHECK-NEXT: [[T0:%.*]] = bitcast [[CF]]* [[TMP1]] to i64* +// CHECK-NEXT: [[T1:%.*]] = load i64* [[T0]], align 8 +// CHECK-NEXT: [[T2:%.*]] = bitcast [[CF]]* [[DEST]] to i64* +// CHECK-NEXT: store atomic i64 [[T1]], i64* [[T2]] seq_cst, align 8 + *fp = f; + +// CHECK-NEXT: ret void +} + +typedef struct { short x, y, z, w; } S; +// CHECK: define arm_aapcscc void @testStruct([[S:.*]]* +void testStruct(_Atomic(S) *fp) { +// CHECK: [[FP:%.*]] = alloca [[S]]*, align 4 +// CHECK-NEXT: [[X:%.*]] = alloca [[S]], align 8 +// CHECK-NEXT: [[F:%.*]] = alloca [[S:%.*]], align 2 +// CHECK-NEXT: [[TMP0:%.*]] = alloca [[S]], align 8 +// CHECK-NEXT: store [[S]]* + +// CHECK-NEXT: [[P:%.*]] = load [[S]]** [[FP]] +// CHECK-NEXT: [[T0:%.*]] = bitcast [[S]]* [[P]] to i8* +// CHECK-NEXT: call void @llvm.memset.p0i8.i64(i8* [[T0]], i8 0, i64 8, i32 8, i1 false) +// CHECK-NEXT: [[T0:%.*]] = getelementptr inbounds [[S]]* [[P]], i32 0, i32 0 +// CHECK-NEXT: store i16 1, i16* [[T0]], align 2 +// CHECK-NEXT: [[T0:%.*]] = getelementptr inbounds [[S]]* [[P]], i32 0, i32 1 +// CHECK-NEXT: store i16 2, i16* [[T0]], align 2 +// CHECK-NEXT: [[T0:%.*]] = getelementptr inbounds [[S]]* [[P]], i32 0, i32 2 +// CHECK-NEXT: store i16 3, i16* [[T0]], align 2 +// CHECK-NEXT: [[T0:%.*]] = getelementptr inbounds [[S]]* [[P]], i32 0, i32 3 +// CHECK-NEXT: store i16 4, i16* [[T0]], align 2 + __c11_atomic_init(fp, (S){1,2,3,4}); + +// CHECK-NEXT: [[T0:%.*]] = bitcast [[S]]* [[X]] to i8* +// CHECK-NEXT: call void @llvm.memset.p0i8.i64(i8* [[T0]], i8 0, i64 8, i32 8, i1 false) +// CHECK-NEXT: [[T0:%.*]] = getelementptr inbounds [[S]]* [[X]], i32 0, i32 0 +// CHECK-NEXT: store i16 1, i16* [[T0]], align 2 +// CHECK-NEXT: [[T0:%.*]] = getelementptr inbounds [[S]]* [[X]], i32 0, i32 1 +// CHECK-NEXT: store i16 2, i16* [[T0]], align 2 +// CHECK-NEXT: [[T0:%.*]] = getelementptr inbounds [[S]]* [[X]], i32 0, i32 2 +// CHECK-NEXT: store i16 3, i16* [[T0]], align 2 +// CHECK-NEXT: [[T0:%.*]] = 
getelementptr inbounds [[S]]* [[X]], i32 0, i32 3 +// CHECK-NEXT: store i16 4, i16* [[T0]], align 2 + _Atomic(S) x = (S){1,2,3,4}; + +// CHECK-NEXT: [[T0:%.*]] = load [[S]]** [[FP]] +// CHECK-NEXT: [[T1:%.*]] = bitcast [[S]]* [[T0]] to i64* +// CHECK-NEXT: [[T2:%.*]] = load atomic i64* [[T1]] seq_cst, align 8 +// CHECK-NEXT: [[T3:%.*]] = bitcast [[S]]* [[F]] to i64* +// CHECK-NEXT: store i64 [[T2]], i64* [[T3]], align 2 + S f = *fp; + +// CHECK-NEXT: [[T0:%.*]] = load [[S]]** [[FP]] +// CHECK-NEXT: [[T1:%.*]] = bitcast [[S]]* [[TMP0]] to i8* +// CHECK-NEXT: [[T2:%.*]] = bitcast [[S]]* [[F]] to i8* +// CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[T1]], i8* [[T2]], i32 8, i32 2, i1 false) +// CHECK-NEXT: [[T3:%.*]] = bitcast [[S]]* [[TMP0]] to i64* +// CHECK-NEXT: [[T4:%.*]] = load i64* [[T3]], align 8 +// CHECK-NEXT: [[T5:%.*]] = bitcast [[S]]* [[T0]] to i64* +// CHECK-NEXT: store atomic i64 [[T4]], i64* [[T5]] seq_cst, align 8 + *fp = f; + +// CHECK-NEXT: ret void +} + +typedef struct { short x, y, z; } PS; +// CHECK: define arm_aapcscc void @testPromotedStruct([[APS:.*]]* +void testPromotedStruct(_Atomic(PS) *fp) { +// CHECK: [[FP:%.*]] = alloca [[APS]]*, align 4 +// CHECK-NEXT: [[X:%.*]] = alloca [[APS]], align 8 +// CHECK-NEXT: [[F:%.*]] = alloca [[PS:%.*]], align 2 +// CHECK-NEXT: [[TMP0:%.*]] = alloca [[APS]], align 8 +// CHECK-NEXT: [[TMP1:%.*]] = alloca [[APS]], align 8 +// CHECK-NEXT: store [[APS]]* + +// CHECK-NEXT: [[P:%.*]] = load [[APS]]** [[FP]] +// CHECK-NEXT: [[T0:%.*]] = bitcast [[APS]]* [[P]] to i8* +// CHECK-NEXT: call void @llvm.memset.p0i8.i64(i8* [[T0]], i8 0, i64 8, i32 8, i1 false) +// CHECK-NEXT: [[T0:%.*]] = getelementptr inbounds [[APS]]* [[P]], i32 0, i32 0 +// CHECK-NEXT: [[T1:%.*]] = getelementptr inbounds [[PS]]* [[T0]], i32 0, i32 0 +// CHECK-NEXT: store i16 1, i16* [[T1]], align 2 +// CHECK-NEXT: [[T1:%.*]] = getelementptr inbounds [[PS]]* [[T0]], i32 0, i32 1 +// CHECK-NEXT: store i16 2, i16* [[T1]], align 2 +// CHECK-NEXT: [[T1:%.*]] = getelementptr inbounds [[PS]]* [[T0]], i32 0, i32 2 +// CHECK-NEXT: store i16 3, i16* [[T1]], align 2 + __c11_atomic_init(fp, (PS){1,2,3}); + +// CHECK-NEXT: [[T0:%.*]] = bitcast [[APS]]* [[X]] to i8* +// CHECK-NEXT: call void @llvm.memset.p0i8.i64(i8* [[T0]], i8 0, i64 8, i32 8, i1 false) +// CHECK-NEXT: [[T0:%.*]] = getelementptr inbounds [[APS]]* [[X]], i32 0, i32 0 +// CHECK-NEXT: [[T1:%.*]] = getelementptr inbounds [[PS]]* [[T0]], i32 0, i32 0 +// CHECK-NEXT: store i16 1, i16* [[T1]], align 2 +// CHECK-NEXT: [[T1:%.*]] = getelementptr inbounds [[PS]]* [[T0]], i32 0, i32 1 +// CHECK-NEXT: store i16 2, i16* [[T1]], align 2 +// CHECK-NEXT: [[T1:%.*]] = getelementptr inbounds [[PS]]* [[T0]], i32 0, i32 2 +// CHECK-NEXT: store i16 3, i16* [[T1]], align 2 + _Atomic(PS) x = (PS){1,2,3}; + +// CHECK-NEXT: [[T0:%.*]] = load [[APS]]** [[FP]] +// CHECK-NEXT: [[T1:%.*]] = bitcast [[APS]]* [[T0]] to i64* +// CHECK-NEXT: [[T2:%.*]] = load atomic i64* [[T1]] seq_cst, align 8 +// CHECK-NEXT: [[T3:%.*]] = bitcast [[APS]]* [[TMP0]] to i64* +// CHECK-NEXT: store i64 [[T2]], i64* [[T3]], align 8 +// CHECK-NEXT: [[T0:%.*]] = getelementptr inbounds [[APS]]* [[TMP0]], i32 0, i32 0 +// CHECK-NEXT: [[T1:%.*]] = bitcast [[PS]]* [[F]] to i8* +// CHECK-NEXT: [[T2:%.*]] = bitcast [[PS]]* [[T0]] to i8* +// CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[T1]], i8* [[T2]], i32 6, i32 2, i1 false) + PS f = *fp; + +// CHECK-NEXT: [[T0:%.*]] = load [[APS]]** [[FP]] +// CHECK-NEXT: [[T1:%.*]] = getelementptr inbounds [[APS]]* [[TMP1]], 
i32 0, i32 0 +// CHECK-NEXT: [[T2:%.*]] = bitcast [[PS]]* [[T1]] to i8* +// CHECK-NEXT: [[T3:%.*]] = bitcast [[PS]]* [[F]] to i8* +// CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[T2]], i8* [[T3]], i32 6, i32 2, i1 false) +// CHECK-NEXT: [[T4:%.*]] = bitcast [[APS]]* [[TMP1]] to i64* +// CHECK-NEXT: [[T5:%.*]] = load i64* [[T4]], align 8 +// CHECK-NEXT: [[T6:%.*]] = bitcast [[APS]]* [[T0]] to i64* +// CHECK-NEXT: store atomic i64 [[T5]], i64* [[T6]] seq_cst, align 8 + *fp = f; + +// CHECK-NEXT: ret void +} + +void testPromotedStructOps(_Atomic(PS) *p) { + PS a = __c11_atomic_load(p, 5); + __c11_atomic_store(p, a, 5); + PS b = __c11_atomic_exchange(p, a, 5); + + _Bool v = __c11_atomic_compare_exchange_strong(p, &b, a, 5, 5); + v = __c11_atomic_compare_exchange_weak(p, &b, a, 5, 5); +} diff --git a/test/CodeGen/c11atomics.c b/test/CodeGen/c11atomics.c index 726fc5166f..8d298af019 100644 --- a/test/CodeGen/c11atomics.c +++ b/test/CodeGen/c11atomics.c @@ -135,3 +135,210 @@ void testandeq(void) s &= 42; } +// CHECK: define arm_aapcscc void @testFloat(float* +void testFloat(_Atomic(float) *fp) { +// CHECK: [[FP:%.*]] = alloca float* +// CHECK-NEXT: [[X:%.*]] = alloca float +// CHECK-NEXT: [[F:%.*]] = alloca float +// CHECK-NEXT: [[TMP0:%.*]] = alloca float +// CHECK-NEXT: [[TMP1:%.*]] = alloca float +// CHECK-NEXT: store float* {{%.*}}, float** [[FP]] + +// CHECK-NEXT: [[T0:%.*]] = load float** [[FP]] +// CHECK-NEXT: store float 1.000000e+00, float* [[T0]], align 4 + __c11_atomic_init(fp, 1.0f); + +// CHECK-NEXT: store float 2.000000e+00, float* [[X]], align 4 + _Atomic(float) x = 2.0f; + +// CHECK-NEXT: [[T0:%.*]] = load float** [[FP]] +// CHECK-NEXT: [[T1:%.*]] = bitcast float* [[T0]] to i8* +// CHECK-NEXT: [[T2:%.*]] = bitcast float* [[TMP0]] to i8* +// CHECK-NEXT: call arm_aapcscc void @__atomic_load(i32 4, i8* [[T1]], i8* [[T2]], i32 5) +// CHECK-NEXT: [[T3:%.*]] = load float* [[TMP0]], align 4 +// CHECK-NEXT: store float [[T3]], float* [[F]] + float f = *fp; + +// CHECK-NEXT: [[T0:%.*]] = load float* [[F]], align 4 +// CHECK-NEXT: [[T1:%.*]] = load float** [[FP]], align 4 +// CHECK-NEXT: store float [[T0]], float* [[TMP1]], align 4 +// CHECK-NEXT: [[T2:%.*]] = bitcast float* [[T1]] to i8* +// CHECK-NEXT: [[T3:%.*]] = bitcast float* [[TMP1]] to i8* +// CHECK-NEXT: call arm_aapcscc void @__atomic_store(i32 4, i8* [[T2]], i8* [[T3]], i32 5) + *fp = f; + +// CHECK-NEXT: ret void +} + +// CHECK: define arm_aapcscc void @testComplexFloat([[CF:{ float, float }]]* +void testComplexFloat(_Atomic(_Complex float) *fp) { +// CHECK: [[FP:%.*]] = alloca [[CF]]*, align 4 +// CHECK-NEXT: [[X:%.*]] = alloca [[CF]], align 8 +// CHECK-NEXT: [[F:%.*]] = alloca [[CF]], align 4 +// CHECK-NEXT: [[TMP0:%.*]] = alloca [[CF]], align 8 +// CHECK-NEXT: [[TMP1:%.*]] = alloca [[CF]], align 8 +// CHECK-NEXT: store [[CF]]* + +// CHECK-NEXT: [[P:%.*]] = load [[CF]]** [[FP]] +// CHECK-NEXT: [[T0:%.*]] = getelementptr inbounds [[CF]]* [[P]], i32 0, i32 0 +// CHECK-NEXT: [[T1:%.*]] = getelementptr inbounds [[CF]]* [[P]], i32 0, i32 1 +// CHECK-NEXT: store float 1.000000e+00, float* [[T0]] +// CHECK-NEXT: store float 0.000000e+00, float* [[T1]] + __c11_atomic_init(fp, 1.0f); + +// CHECK-NEXT: [[T0:%.*]] = getelementptr inbounds [[CF]]* [[X]], i32 0, i32 0 +// CHECK-NEXT: [[T1:%.*]] = getelementptr inbounds [[CF]]* [[X]], i32 0, i32 1 +// CHECK-NEXT: store float 2.000000e+00, float* [[T0]] +// CHECK-NEXT: store float 0.000000e+00, float* [[T1]] + _Atomic(_Complex float) x = 2.0f; + +// CHECK-NEXT: [[T0:%.*]] = load 
[[CF]]** [[FP]] +// CHECK-NEXT: [[T1:%.*]] = bitcast [[CF]]* [[T0]] to i8* +// CHECK-NEXT: [[T2:%.*]] = bitcast [[CF]]* [[TMP0]] to i8* +// CHECK-NEXT: call arm_aapcscc void @__atomic_load(i32 8, i8* [[T1]], i8* [[T2]], i32 5) +// CHECK-NEXT: [[T0:%.*]] = getelementptr inbounds [[CF]]* [[TMP0]], i32 0, i32 0 +// CHECK-NEXT: [[R:%.*]] = load float* [[T0]] +// CHECK-NEXT: [[T0:%.*]] = getelementptr inbounds [[CF]]* [[TMP0]], i32 0, i32 1 +// CHECK-NEXT: [[I:%.*]] = load float* [[T0]] +// CHECK-NEXT: [[T0:%.*]] = getelementptr inbounds [[CF]]* [[F]], i32 0, i32 0 +// CHECK-NEXT: [[T1:%.*]] = getelementptr inbounds [[CF]]* [[F]], i32 0, i32 1 +// CHECK-NEXT: store float [[R]], float* [[T0]] +// CHECK-NEXT: store float [[I]], float* [[T1]] + _Complex float f = *fp; + +// CHECK-NEXT: [[T0:%.*]] = getelementptr inbounds [[CF]]* [[F]], i32 0, i32 0 +// CHECK-NEXT: [[R:%.*]] = load float* [[T0]] +// CHECK-NEXT: [[T0:%.*]] = getelementptr inbounds [[CF]]* [[F]], i32 0, i32 1 +// CHECK-NEXT: [[I:%.*]] = load float* [[T0]] +// CHECK-NEXT: [[DEST:%.*]] = load [[CF]]** [[FP]], align 4 +// CHECK-NEXT: [[T0:%.*]] = getelementptr inbounds [[CF]]* [[TMP1]], i32 0, i32 0 +// CHECK-NEXT: [[T1:%.*]] = getelementptr inbounds [[CF]]* [[TMP1]], i32 0, i32 1 +// CHECK-NEXT: store float [[R]], float* [[T0]] +// CHECK-NEXT: store float [[I]], float* [[T1]] +// CHECK-NEXT: [[T0:%.*]] = bitcast [[CF]]* [[DEST]] to i8* +// CHECK-NEXT: [[T1:%.*]] = bitcast [[CF]]* [[TMP1]] to i8* +// CHECK-NEXT: call arm_aapcscc void @__atomic_store(i32 8, i8* [[T0]], i8* [[T1]], i32 5) + *fp = f; + +// CHECK-NEXT: ret void +} + +typedef struct { short x, y, z, w; } S; +// CHECK: define arm_aapcscc void @testStruct([[S:.*]]* +void testStruct(_Atomic(S) *fp) { +// CHECK: [[FP:%.*]] = alloca [[S]]*, align 4 +// CHECK-NEXT: [[X:%.*]] = alloca [[S]], align 8 +// CHECK-NEXT: [[F:%.*]] = alloca [[S:%.*]], align 2 +// CHECK-NEXT: [[TMP0:%.*]] = alloca [[S]], align 8 +// CHECK-NEXT: store [[S]]* + +// CHECK-NEXT: [[P:%.*]] = load [[S]]** [[FP]] +// CHECK-NEXT: [[T0:%.*]] = bitcast [[S]]* [[P]] to i8* +// CHECK-NEXT: call void @llvm.memset.p0i8.i64(i8* [[T0]], i8 0, i64 8, i32 8, i1 false) +// CHECK-NEXT: [[T0:%.*]] = getelementptr inbounds [[S]]* [[P]], i32 0, i32 0 +// CHECK-NEXT: store i16 1, i16* [[T0]], align 2 +// CHECK-NEXT: [[T0:%.*]] = getelementptr inbounds [[S]]* [[P]], i32 0, i32 1 +// CHECK-NEXT: store i16 2, i16* [[T0]], align 2 +// CHECK-NEXT: [[T0:%.*]] = getelementptr inbounds [[S]]* [[P]], i32 0, i32 2 +// CHECK-NEXT: store i16 3, i16* [[T0]], align 2 +// CHECK-NEXT: [[T0:%.*]] = getelementptr inbounds [[S]]* [[P]], i32 0, i32 3 +// CHECK-NEXT: store i16 4, i16* [[T0]], align 2 + __c11_atomic_init(fp, (S){1,2,3,4}); + +// CHECK-NEXT: [[T0:%.*]] = bitcast [[S]]* [[X]] to i8* +// CHECK-NEXT: call void @llvm.memset.p0i8.i64(i8* [[T0]], i8 0, i64 8, i32 8, i1 false) +// CHECK-NEXT: [[T0:%.*]] = getelementptr inbounds [[S]]* [[X]], i32 0, i32 0 +// CHECK-NEXT: store i16 1, i16* [[T0]], align 2 +// CHECK-NEXT: [[T0:%.*]] = getelementptr inbounds [[S]]* [[X]], i32 0, i32 1 +// CHECK-NEXT: store i16 2, i16* [[T0]], align 2 +// CHECK-NEXT: [[T0:%.*]] = getelementptr inbounds [[S]]* [[X]], i32 0, i32 2 +// CHECK-NEXT: store i16 3, i16* [[T0]], align 2 +// CHECK-NEXT: [[T0:%.*]] = getelementptr inbounds [[S]]* [[X]], i32 0, i32 3 +// CHECK-NEXT: store i16 4, i16* [[T0]], align 2 + _Atomic(S) x = (S){1,2,3,4}; + +// CHECK-NEXT: [[T0:%.*]] = load [[S]]** [[FP]] +// CHECK-NEXT: [[T1:%.*]] = bitcast [[S]]* [[T0]] to i8* +// CHECK-NEXT: 
[[T2:%.*]] = bitcast [[S]]* [[F]] to i8* +// CHECK-NEXT: call arm_aapcscc void @__atomic_load(i32 8, i8* [[T1]], i8* [[T2]], i32 5) + S f = *fp; + +// CHECK-NEXT: [[T0:%.*]] = load [[S]]** [[FP]] +// CHECK-NEXT: [[T1:%.*]] = bitcast [[S]]* [[TMP0]] to i8* +// CHECK-NEXT: [[T2:%.*]] = bitcast [[S]]* [[F]] to i8* +// CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[T1]], i8* [[T2]], i32 8, i32 2, i1 false) +// CHECK-NEXT: [[T3:%.*]] = bitcast [[S]]* [[T0]] to i8* +// CHECK-NEXT: [[T4:%.*]] = bitcast [[S]]* [[TMP0]] to i8* +// CHECK-NEXT: call arm_aapcscc void @__atomic_store(i32 8, i8* [[T3]], i8* [[T4]], i32 5) + *fp = f; + +// CHECK-NEXT: ret void +} + +typedef struct { short x, y, z; } PS; +// CHECK: define arm_aapcscc void @testPromotedStruct([[APS:.*]]* +void testPromotedStruct(_Atomic(PS) *fp) { +// CHECK: [[FP:%.*]] = alloca [[APS]]*, align 4 +// CHECK-NEXT: [[X:%.*]] = alloca [[APS]], align 8 +// CHECK-NEXT: [[F:%.*]] = alloca [[PS:%.*]], align 2 +// CHECK-NEXT: [[TMP0:%.*]] = alloca [[APS]], align 8 +// CHECK-NEXT: [[TMP1:%.*]] = alloca [[APS]], align 8 +// CHECK-NEXT: store [[APS]]* + +// CHECK-NEXT: [[P:%.*]] = load [[APS]]** [[FP]] +// CHECK-NEXT: [[T0:%.*]] = bitcast [[APS]]* [[P]] to i8* +// CHECK-NEXT: call void @llvm.memset.p0i8.i64(i8* [[T0]], i8 0, i64 8, i32 8, i1 false) +// CHECK-NEXT: [[T0:%.*]] = getelementptr inbounds [[APS]]* [[P]], i32 0, i32 0 +// CHECK-NEXT: [[T1:%.*]] = getelementptr inbounds [[PS]]* [[T0]], i32 0, i32 0 +// CHECK-NEXT: store i16 1, i16* [[T1]], align 2 +// CHECK-NEXT: [[T1:%.*]] = getelementptr inbounds [[PS]]* [[T0]], i32 0, i32 1 +// CHECK-NEXT: store i16 2, i16* [[T1]], align 2 +// CHECK-NEXT: [[T1:%.*]] = getelementptr inbounds [[PS]]* [[T0]], i32 0, i32 2 +// CHECK-NEXT: store i16 3, i16* [[T1]], align 2 + __c11_atomic_init(fp, (PS){1,2,3}); + +// CHECK-NEXT: [[T0:%.*]] = bitcast [[APS]]* [[X]] to i8* +// CHECK-NEXT: call void @llvm.memset.p0i8.i64(i8* [[T0]], i8 0, i64 8, i32 8, i1 false) +// CHECK-NEXT: [[T0:%.*]] = getelementptr inbounds [[APS]]* [[X]], i32 0, i32 0 +// CHECK-NEXT: [[T1:%.*]] = getelementptr inbounds [[PS]]* [[T0]], i32 0, i32 0 +// CHECK-NEXT: store i16 1, i16* [[T1]], align 2 +// CHECK-NEXT: [[T1:%.*]] = getelementptr inbounds [[PS]]* [[T0]], i32 0, i32 1 +// CHECK-NEXT: store i16 2, i16* [[T1]], align 2 +// CHECK-NEXT: [[T1:%.*]] = getelementptr inbounds [[PS]]* [[T0]], i32 0, i32 2 +// CHECK-NEXT: store i16 3, i16* [[T1]], align 2 + _Atomic(PS) x = (PS){1,2,3}; + +// CHECK-NEXT: [[T0:%.*]] = load [[APS]]** [[FP]] +// CHECK-NEXT: [[T1:%.*]] = bitcast [[APS]]* [[T0]] to i8* +// CHECK-NEXT: [[T2:%.*]] = bitcast [[APS]]* [[TMP0]] to i8* +// CHECK-NEXT: call arm_aapcscc void @__atomic_load(i32 8, i8* [[T1]], i8* [[T2]], i32 5) +// CHECK-NEXT: [[T0:%.*]] = getelementptr inbounds [[APS]]* [[TMP0]], i32 0, i32 0 +// CHECK-NEXT: [[T1:%.*]] = bitcast [[PS]]* [[F]] to i8* +// CHECK-NEXT: [[T2:%.*]] = bitcast [[PS]]* [[T0]] to i8* +// CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[T1]], i8* [[T2]], i32 6, i32 2, i1 false) + PS f = *fp; + +// CHECK-NEXT: [[T0:%.*]] = load [[APS]]** [[FP]] +// CHECK-NEXT: [[T1:%.*]] = getelementptr inbounds [[APS]]* [[TMP1]], i32 0, i32 0 +// CHECK-NEXT: [[T2:%.*]] = bitcast [[PS]]* [[T1]] to i8* +// CHECK-NEXT: [[T3:%.*]] = bitcast [[PS]]* [[F]] to i8* +// CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[T2]], i8* [[T3]], i32 6, i32 2, i1 false) +// CHECK-NEXT: [[T4:%.*]] = bitcast [[APS]]* [[T0]] to i8* +// CHECK-NEXT: [[T5:%.*]] = bitcast [[APS]]* [[TMP1]] to i8* +// 
CHECK-NEXT: call arm_aapcscc void @__atomic_store(i32 8, i8* [[T4]], i8* [[T5]], i32 5) + *fp = f; + +// CHECK-NEXT: ret void +} + +// CHECK: define arm_aapcscc void @testPromotedStructOps([[APS:.*]]* + +// FIXME: none of these look right, but we can leave the "test" here +// to make sure they at least don't crash. +void testPromotedStructOps(_Atomic(PS) *p) { + PS a = __c11_atomic_load(p, 5); + __c11_atomic_store(p, a, 5); + PS b = __c11_atomic_exchange(p, a, 5); + _Bool v = __c11_atomic_compare_exchange_strong(p, &b, a, 5, 5); + v = __c11_atomic_compare_exchange_weak(p, &b, a, 5, 5); +} -- 2.40.0
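
For reference, the net effect of the ASTContext change on type layout can be shown with a small C11 snippet. This is an illustrative sketch only, not part of the patch: it assumes a target where short is 2 bytes and MaxAtomicPromoteWidth is at least 64 (such as the armv7-apple-ios triple used by the new tests), and the struct names simply mirror the ones in the test files.

    /* Sketch of the promotion rule: sizes round up to a power of 2
       (capped by MaxAtomicPromoteWidth) and alignment is raised to
       match the promoted size. Assumes 2-byte short, 64-bit cap. */
    typedef struct { short x, y, z; } PS;    /* 6 bytes, not a power of 2 */
    typedef struct { short x, y, z, w; } S;  /* 8 bytes, already a power of 2 */

    _Static_assert(sizeof(_Atomic(PS)) == 8,   "size rounded up to a power of 2");
    _Static_assert(_Alignof(_Atomic(PS)) == 8, "alignment raised to the size");
    _Static_assert(sizeof(_Atomic(S)) == 8,    "power-of-2 size left unchanged");
    _Static_assert(_Alignof(_Atomic(S)) == 8,  "alignment still raised to the size");

The two padding bytes of _Atomic(PS) are what the new { value, [N x i8] } IR struct in CodeGenTypes carries, and emitMemSetZeroIfNecessary zeroes them so the full promoted unit has a well-defined bit pattern when it is loaded or stored as a single integer.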