[Clang][X86] Convert non-temporal store builtins to generic __builtin_nontemporal_store
author    Simon Pilgrim <llvm-dev@redking.me.uk>  Mon, 13 Jun 2016 09:57:52 +0000
committer Simon Pilgrim <llvm-dev@redking.me.uk>  Mon, 13 Jun 2016 09:57:52 +0000
We can now use __builtin_nontemporal_store instead of target-specific builtins for naturally aligned nontemporal stores, which avoids the need for special handling in CGBuiltin.cpp.
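
As a quick illustration (a standalone sketch, not code from this commit; the helper name is made up): the generic builtin takes the value first and the destination pointer second, and lowers to an ordinary naturally aligned store tagged with !nontemporal metadata.

  typedef float __v4sf __attribute__((__vector_size__(16)));

  /* Non-temporal store of a 16-byte float vector; __p must be naturally
     (16-byte) aligned, since the builtin assumes natural alignment. */
  static __inline__ void stream_v4sf(float *__p, __v4sf __a) {
    __builtin_nontemporal_store(__a, (__v4sf *)__p);
  }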

The scalar integer nontemporal (unaligned) store builtins will have to wait, as __builtin_nontemporal_store currently assumes natural alignment and doesn't accept the 'packed struct' trick that we use for normal unaligned loads/stores.
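
For context, the 'packed struct' trick is the usual header idiom for getting an align-1 access out of the compiler; a sketch with hypothetical names, mirroring the shape the headers use for the normal unaligned loads/stores:

  typedef float __v4sf __attribute__((__vector_size__(16)));

  struct __storeu_v4sf {
    __v4sf __v;
  } __attribute__((__packed__, __may_alias__));

  /* The packed struct forces the member store to be emitted with align 1. */
  static __inline__ void storeu_v4sf(float *__p, __v4sf __a) {
    ((struct __storeu_v4sf *)__p)->__v = __a;
  }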

The nontemporal loads require further backend support before we can safely convert them to __builtin_nontemporal_load.
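
If and when that backend work lands, the loads would presumably convert as the mirror image of the stores; a hypothetical sketch (explicitly not part of this commit):

  typedef long long __v8di __attribute__((__vector_size__(64)));

  /* Hypothetical future shape of a stream load once
     __builtin_nontemporal_load is safe to use on x86. */
  static __inline__ __v8di stream_load_v8di(__v8di const *__P) {
    return __builtin_nontemporal_load(__P);
  }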

Differential Revision: http://reviews.llvm.org/D21272

git-svn-id: https://llvm.org/svn/llvm-project/cfe/trunk@272540 91177308-0d34-0410-b5e6-96231b3b80d8

include/clang/Basic/BuiltinsX86.def
lib/CodeGen/CGBuiltin.cpp
lib/Headers/avx512fintrin.h
lib/Headers/avxintrin.h
lib/Headers/emmintrin.h
lib/Headers/xmmintrin.h
test/CodeGen/avx512f-builtins.c
test/CodeGen/builtins-x86.c

index b2ea3c6db7c8576f51a7550355fceca5328bb8d9..ab13c56d1f496e0146a2d8059949b183f1595cdc 100644 (file)
@@ -313,7 +313,6 @@ TARGET_BUILTIN(__builtin_ia32_cvtss2si64, "LLiV4f", "", "sse")
 TARGET_BUILTIN(__builtin_ia32_storehps, "vV2i*V4f", "", "sse")
 TARGET_BUILTIN(__builtin_ia32_storelps, "vV2i*V4f", "", "sse")
 TARGET_BUILTIN(__builtin_ia32_movmskps, "iV4f", "", "sse")
-TARGET_BUILTIN(__builtin_ia32_movntps, "vf*V4f", "", "sse")
 TARGET_BUILTIN(__builtin_ia32_sfence, "v", "", "sse")
 TARGET_BUILTIN(__builtin_ia32_rcpps, "V4fV4f", "", "sse")
 TARGET_BUILTIN(__builtin_ia32_rcpss, "V4fV4f", "", "sse")
@@ -327,8 +326,6 @@ TARGET_BUILTIN(__builtin_ia32_movmskpd, "iV2d", "", "sse2")
 TARGET_BUILTIN(__builtin_ia32_pmovmskb128, "iV16c", "", "sse2")
 TARGET_BUILTIN(__builtin_ia32_movnti, "vi*i", "", "sse2")
 TARGET_BUILTIN(__builtin_ia32_movnti64, "vLLi*LLi", "", "sse2")
-TARGET_BUILTIN(__builtin_ia32_movntpd, "vd*V2d", "", "sse2")
-TARGET_BUILTIN(__builtin_ia32_movntdq, "vV2LLi*V2LLi", "", "sse2")
 TARGET_BUILTIN(__builtin_ia32_psadbw128, "V2LLiV16cV16c", "", "sse2")
 TARGET_BUILTIN(__builtin_ia32_sqrtpd, "V2dV2d", "", "sse2")
 TARGET_BUILTIN(__builtin_ia32_sqrtsd, "V2dV2d", "", "sse2")
@@ -493,9 +490,6 @@ TARGET_BUILTIN(__builtin_ia32_vzeroupper, "v", "", "avx")
 TARGET_BUILTIN(__builtin_ia32_vbroadcastf128_pd256, "V4dV2dC*", "", "avx")
 TARGET_BUILTIN(__builtin_ia32_vbroadcastf128_ps256, "V8fV4fC*", "", "avx")
 TARGET_BUILTIN(__builtin_ia32_lddqu256, "V32ccC*", "", "avx")
-TARGET_BUILTIN(__builtin_ia32_movntdq256, "vV4LLi*V4LLi", "", "avx")
-TARGET_BUILTIN(__builtin_ia32_movntpd256, "vd*V4d", "", "avx")
-TARGET_BUILTIN(__builtin_ia32_movntps256, "vf*V8f", "", "avx")
 TARGET_BUILTIN(__builtin_ia32_maskloadpd, "V2dV2dC*V2LLi", "", "avx")
 TARGET_BUILTIN(__builtin_ia32_maskloadps, "V4fV4fC*V4i", "", "avx")
 TARGET_BUILTIN(__builtin_ia32_maskloadpd256, "V4dV4dC*V4LLi", "", "avx")
@@ -2154,10 +2148,7 @@ TARGET_BUILTIN(__builtin_ia32_kortestzhi, "iUsUs","","avx512f")
 TARGET_BUILTIN(__builtin_ia32_kunpckhi, "UsUsUs","","avx512f")
 TARGET_BUILTIN(__builtin_ia32_kxnorhi, "UsUsUs","","avx512f")
 TARGET_BUILTIN(__builtin_ia32_kxorhi, "UsUsUs","","avx512f")
-TARGET_BUILTIN(__builtin_ia32_movntdq512, "vV8LLi*V8LLi","","avx512f")
 TARGET_BUILTIN(__builtin_ia32_movntdqa512, "V8LLiV8LLi*","","avx512f")
-TARGET_BUILTIN(__builtin_ia32_movntpd512, "vd*V8d","","avx512f")
-TARGET_BUILTIN(__builtin_ia32_movntps512, "vf*V16f","","avx512f")
 TARGET_BUILTIN(__builtin_ia32_palignr512_mask, "V64cV64cV64cIiV64cULLi","","avx512bw")
 TARGET_BUILTIN(__builtin_ia32_palignr128_mask, "V16cV16cV16cIiV16cUs","","avx512bw,avx512vl")
 TARGET_BUILTIN(__builtin_ia32_palignr256_mask, "V32cV32cV32cIiV32cUi","","avx512bw,avx512vl")
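
For readers skimming the .def changes: the second TARGET_BUILTIN field is the Builtins.def prototype string, read left to right as the return type followed by the parameter types ('v' = void, 'f'/'d' = float/double, 'LLi' = long long, 'V4f' = vector of four floats, a 'C' suffix = const, a '*' suffix = pointer). So the removed movntps entry decodes as:

  TARGET_BUILTIN(__builtin_ia32_movntps, "vf*V4f", "", "sse")
  /* i.e. void __builtin_ia32_movntps(float *, __v4sf) -- SSE only */
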
index bf450b9ec2336692e131641e915dbbc6aff6cf56..d331dcd09765158dc6413fc3e6762db5091e1394 100644 (file)
@@ -6701,39 +6701,27 @@ Value *CodeGenFunction::EmitX86BuiltinExpr(unsigned BuiltinID,
     if (Ops.size() == 3)
       return Align;
 
-    return EmitX86Select(*this, Ops[4], Align, Ops[3]);
-  }
-
-  case X86::BI__builtin_ia32_movntps:
-  case X86::BI__builtin_ia32_movntps256:
-  case X86::BI__builtin_ia32_movntpd:
-  case X86::BI__builtin_ia32_movntpd256:
-  case X86::BI__builtin_ia32_movntdq:
-  case X86::BI__builtin_ia32_movntdq256:
-  case X86::BI__builtin_ia32_movnti:
-  case X86::BI__builtin_ia32_movnti64: {
-    llvm::MDNode *Node = llvm::MDNode::get(
+    return EmitX86Select(*this, Ops[4], Align, Ops[3]);
+  }
+
+  case X86::BI__builtin_ia32_movnti:
+  case X86::BI__builtin_ia32_movnti64: {
+    llvm::MDNode *Node = llvm::MDNode::get(
         getLLVMContext(), llvm::ConstantAsMetadata::get(Builder.getInt32(1)));
 
     // Convert the type of the pointer to a pointer to the stored type.
     Value *BC = Builder.CreateBitCast(Ops[0],
                                 llvm::PointerType::getUnqual(Ops[1]->getType()),
                                       "cast");
-    StoreInst *SI = Builder.CreateDefaultAlignedStore(Ops[1], BC);
-    SI->setMetadata(CGM.getModule().getMDKindID("nontemporal"), Node);
-
-    // If the operand is an integer, we can't assume alignment. Otherwise,
-    // assume natural alignment.
-    QualType ArgTy = E->getArg(1)->getType();
-    unsigned Align;
-    if (ArgTy->isIntegerType())
-      Align = 1;
-    else
-      Align = getContext().getTypeSizeInChars(ArgTy).getQuantity();
-    SI->setAlignment(Align);
-    return SI;
-  }
-  case X86::BI__builtin_ia32_selectb_128:
+    StoreInst *SI = Builder.CreateDefaultAlignedStore(Ops[1], BC);
+    SI->setMetadata(CGM.getModule().getMDKindID("nontemporal"), Node);
+
+    // No alignment for scalar intrinsic store.
+    QualType ArgTy = E->getArg(1)->getType();
+    SI->setAlignment(1);
+    return SI;
+  }
+  case X86::BI__builtin_ia32_selectb_128:
   case X86::BI__builtin_ia32_selectb_256:
   case X86::BI__builtin_ia32_selectb_512:
   case X86::BI__builtin_ia32_selectw_128:
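
With the vector cases deleted from this switch, those intrinsics now reach the generic __builtin_nontemporal_store code path instead, which emits a naturally aligned store tagged with !nontemporal. A sketch of the observable effect (the IR in the comment is illustrative, not copied from a test):

  typedef double __v2df __attribute__((__vector_size__(16)));

  void stream_pd(double *__p, __v2df __a) {
    __builtin_nontemporal_store(__a, (__v2df *)__p);
    /* expected IR, roughly:
       store <2 x double> %__a, <2 x double>* %__p, align 16, !nontemporal */
  }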
index 90b87b2586cb22bad21044e5d2becf271094e520..957cdfced9b3b05ff265b8d8bd95fbc7bc1593b7 100644 (file)
@@ -8866,7 +8866,7 @@ _mm512_kxor (__mmask16 __A, __mmask16 __B)
 static __inline__ void __DEFAULT_FN_ATTRS
 _mm512_stream_si512 (__m512i * __P, __m512i __A)
 {
-  __builtin_ia32_movntdq512 ((__v8di *) __P, (__v8di) __A);
+  __builtin_nontemporal_store((__v8di)__A, (__v8di*)__P);
 }
 
 static __inline__ __m512i __DEFAULT_FN_ATTRS
@@ -8878,13 +8878,13 @@ _mm512_stream_load_si512 (void *__P)
 static __inline__ void __DEFAULT_FN_ATTRS
 _mm512_stream_pd (double *__P, __m512d __A)
 {
-  __builtin_ia32_movntpd512 (__P, (__v8df) __A);
+  __builtin_nontemporal_store((__v8df)__A, (__v8df*)__P);
 }
 
 static __inline__ void __DEFAULT_FN_ATTRS
 _mm512_stream_ps (float *__P, __m512 __A)
 {
-  __builtin_ia32_movntps512 (__P, (__v16sf) __A);
+  __builtin_nontemporal_store((__v16sf)__A, (__v16sf*)__P);
 }
 
 static __inline__ __m512d __DEFAULT_FN_ATTRS
index ce4f4243f2acddcbbadf51954ca84c07074e3980..45c052363e348af28e92f568fc5d8da752f3b491 100644 (file)
@@ -2496,19 +2496,19 @@ _mm_maskstore_ps(float *__p, __m128i __m, __m128 __a)
 static __inline void __DEFAULT_FN_ATTRS
 _mm256_stream_si256(__m256i *__a, __m256i __b)
 {
-  __builtin_ia32_movntdq256((__v4di *)__a, (__v4di)__b);
+  __builtin_nontemporal_store((__v4di)__b, (__v4di*)__a);
 }
 
 static __inline void __DEFAULT_FN_ATTRS
 _mm256_stream_pd(double *__a, __m256d __b)
 {
-  __builtin_ia32_movntpd256(__a, (__v4df)__b);
+  __builtin_nontemporal_store((__v4df)__b, (__v4df*)__a);
 }
 
 static __inline void __DEFAULT_FN_ATTRS
 _mm256_stream_ps(float *__p, __m256 __a)
 {
-  __builtin_ia32_movntps256(__p, (__v8sf)__a);
+  __builtin_nontemporal_store((__v8sf)__a, (__v8sf*)__p);
 }
 
 /* Create vectors */
index adcf595fc6558e472c18ef3080b5cd1c317836b9..3eecb03033c7efb3a73695c9ab5ed601c82e8174 100644 (file)
@@ -2210,13 +2210,13 @@ _mm_storel_epi64(__m128i *__p, __m128i __a)
 static __inline__ void __DEFAULT_FN_ATTRS
 _mm_stream_pd(double *__p, __m128d __a)
 {
-  __builtin_ia32_movntpd(__p, (__v2df)__a);
+  __builtin_nontemporal_store((__v2df)__a, (__v2df*)__p);
 }
 
 static __inline__ void __DEFAULT_FN_ATTRS
 _mm_stream_si128(__m128i *__p, __m128i __a)
 {
-  __builtin_ia32_movntdq(__p, (__v2di)__a);
+  __builtin_nontemporal_store((__v2di)__a, (__v2di*)__p);
 }
 
 static __inline__ void __DEFAULT_FN_ATTRS
index 4ac846d8af9894a33a36c388b47d668c6c84c386..ce32d5a9d7a8d2d94bb79c637d9fb5341268435e 100644 (file)
@@ -2080,7 +2080,7 @@ _mm_stream_pi(__m64 *__p, __m64 __a)
 static __inline__ void __DEFAULT_FN_ATTRS
 _mm_stream_ps(float *__p, __m128 __a)
 {
-  __builtin_ia32_movntps(__p, (__v4sf)__a);
+  __builtin_nontemporal_store((__v4sf)__a, (__v4sf*)__p);
 }
 
 /// \brief Forces strong memory ordering (serialization) between store
index 01cbaf5d07516e3a0446feb762d2b871604d5a29..743aa0499be4bfe4121507bf5be48946da4dc131 100644 (file)
@@ -5800,7 +5800,7 @@ __mmask16 test_mm512_kxor(__mmask16 __A, __mmask16 __B) {
 
 void test_mm512_stream_si512(__m512i * __P, __m512i __A) {
   // CHECK-LABEL: @test_mm512_stream_si512
-  // CHECK: @llvm.x86.avx512.storent.q.512
+  // CHECK: store <8 x i64> %{{.*}}, <8 x i64>* %{{.*}}, align 64, !nontemporal
   _mm512_stream_si512(__P, __A); 
 }
 
@@ -5812,13 +5812,13 @@ __m512i test_mm512_stream_load_si512(void *__P) {
 
 void test_mm512_stream_pd(double *__P, __m512d __A) {
   // CHECK-LABEL: @test_mm512_stream_pd
-  // CHECK: @llvm.x86.avx512.storent.pd.512
+  // CHECK: store <8 x double> %{{.*}}, <8 x double>* %{{.*}}, align 64, !nontemporal
   return _mm512_stream_pd(__P, __A); 
 }
 
 void test_mm512_stream_ps(float *__P, __m512 __A) {
   // CHECK-LABEL: @test_mm512_stream_ps
-  // CHECK: @llvm.x86.avx512.storent.ps.512
+  // CHECK: store <16 x float> %{{.*}}, <16 x float>* %{{.*}}, align 64, !nontemporal
   _mm512_stream_ps(__P, __A); 
 }
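
These CHECK lines follow the usual clang codegen-test pattern; a representative RUN line for this file would look something like the following (an assumption, not quoted from the test):

  // RUN: %clang_cc1 -triple x86_64-apple-darwin -target-feature +avx512f -emit-llvm -o - %s | FileCheck %s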
 
index 24e491c026226860ecacf5f27b71b58d76065c97..2dedd7913a70cde146218c44e98b04e55ee50905 100644 (file)
@@ -300,7 +300,6 @@ void f0() {
   (void) __builtin_ia32_storelps(tmp_V2ip, tmp_V4f);
   tmp_i = __builtin_ia32_movmskps(tmp_V4f);
   tmp_i = __builtin_ia32_pmovmskb(tmp_V8c);
-  (void) __builtin_ia32_movntps(tmp_fp, tmp_V4f);
   (void) __builtin_ia32_movntq(tmp_V1LLip, tmp_V1LLi);
   (void) __builtin_ia32_sfence();
 
@@ -318,8 +317,6 @@ void f0() {
 #ifdef USE_64
   (void) __builtin_ia32_movnti64(tmp_LLip, tmp_LLi);
 #endif
-  (void) __builtin_ia32_movntpd(tmp_dp, tmp_V2d);
-  (void) __builtin_ia32_movntdq(tmp_V2LLip, tmp_V2LLi);
   tmp_V2LLi = __builtin_ia32_psadbw128(tmp_V16c, tmp_V16c);
   tmp_V2d = __builtin_ia32_sqrtpd(tmp_V2d);
   tmp_V2d = __builtin_ia32_sqrtsd(tmp_V2d);
@@ -446,9 +443,6 @@ void f0() {
   tmp_V4d = __builtin_ia32_vbroadcastf128_pd256(tmp_V2dCp);
   tmp_V8f = __builtin_ia32_vbroadcastf128_ps256(tmp_V4fCp);
   tmp_V32c = __builtin_ia32_lddqu256(tmp_cCp);
-  __builtin_ia32_movntdq256(tmp_V4LLip, tmp_V4LLi);
-  __builtin_ia32_movntpd256(tmp_dp, tmp_V4d);
-  __builtin_ia32_movntps256(tmp_fp, tmp_V8f);
   tmp_V2d = __builtin_ia32_maskloadpd(tmp_V2dCp, tmp_V2LLi);
   tmp_V4f = __builtin_ia32_maskloadps(tmp_V4fCp, tmp_V4i);
   tmp_V4d = __builtin_ia32_maskloadpd256(tmp_V4dCp, tmp_V4LLi);