]> granicus.if.org Git - clang/commitdiff
[X86] Add back _mask, _maskz, and _mask3 builtins for some 512-bit fmadd/fmsub/fmadds...
authorCraig Topper <craig.topper@intel.com>
Thu, 7 Jun 2018 02:46:02 +0000 (02:46 +0000)
committerCraig Topper <craig.topper@intel.com>
Thu, 7 Jun 2018 02:46:02 +0000 (02:46 +0000)
Summary:
We recently switch to using a selects in the intrinsics header files for FMA instructions. But the 512-bit versions support flavors with rounding mode which must be an Integer Constant Expression. This has forced those intrinsics to be implemented as macros. As it stands now the mask and mask3 intrinsics evaluate one of their macro arguments twice. If that argument itself is another intrinsic macro, we can end up over expanding macros. Or if its something we can CSE later it would show up multiple times when it shouldn't.

I tried adding __extension__ around the macro and making it an expression statement and declaring a local variable. But whatever name you choose for the local variable can never be used as the name of an input to the macro in user code. If that happens you would end up with the same name on the LHS and RHS of an assignment after expansion. We might be safe if we use __ in front of the variable names because those names are reserved and user code shouldn't use that, but I wasn't sure I wanted to make that claim.

The other option which I've chosen here, is to add back _mask, _maskz, and _mask3 flavors of the builtin which we will expand in CGBuiltin.cpp to replicate the argument as needed and insert any fneg needed on the third operand to make a subtract. The _maskz isn't truly necessary if we have an unmasked version or if we use the masked version with a -1 mask and wrap a select around it. But I've chosen to make things more uniform.

I separated out the scalar builtin handling to avoid too many things going on in EmitX86FMAExpr. It was different enough due to the extract and insert that the minor duplication of the CreateCall was probably worth it.

Reviewers: tkrupa, RKSimon, spatel, GBuella

Reviewed By: tkrupa

Subscribers: cfe-commits

Differential Revision: https://reviews.llvm.org/D47724

git-svn-id: https://llvm.org/svn/llvm-project/cfe/trunk@334159 91177308-0d34-0410-b5e6-96231b3b80d8

include/clang/Basic/BuiltinsX86.def
lib/CodeGen/CGBuiltin.cpp
lib/Headers/avx512fintrin.h
lib/Sema/SemaChecking.cpp
test/CodeGen/avx512f-builtins.c

index a164681b2e6cbf0b8798e58b30f760c1bd20efd2..33dda55fe39bec5a282c70df41e7920e74fb12e8 100644 (file)
@@ -731,10 +731,22 @@ TARGET_BUILTIN(__builtin_ia32_vfmaddpd256, "V4dV4dV4dV4d", "nc", "fma|fma4")
 TARGET_BUILTIN(__builtin_ia32_vfmaddsubps256, "V8fV8fV8fV8f", "nc", "fma|fma4")
 TARGET_BUILTIN(__builtin_ia32_vfmaddsubpd256, "V4dV4dV4dV4d", "nc", "fma|fma4")
 
-TARGET_BUILTIN(__builtin_ia32_vfmaddpd512, "V8dV8dV8dV8dIi", "nc", "avx512f")
-TARGET_BUILTIN(__builtin_ia32_vfmaddps512, "V16fV16fV16fV16fIi", "nc", "avx512f")
-TARGET_BUILTIN(__builtin_ia32_vfmaddsubpd512, "V8dV8dV8dV8dIi", "nc", "avx512f")
-TARGET_BUILTIN(__builtin_ia32_vfmaddsubps512, "V16fV16fV16fV16fIi", "nc", "avx512f")
+TARGET_BUILTIN(__builtin_ia32_vfmaddpd512_mask, "V8dV8dV8dV8dUcIi", "nc", "avx512f")
+TARGET_BUILTIN(__builtin_ia32_vfmaddpd512_maskz, "V8dV8dV8dV8dUcIi", "nc", "avx512f")
+TARGET_BUILTIN(__builtin_ia32_vfmaddpd512_mask3, "V8dV8dV8dV8dUcIi", "nc", "avx512f")
+TARGET_BUILTIN(__builtin_ia32_vfmsubpd512_mask3, "V8dV8dV8dV8dUcIi", "nc", "avx512f")
+TARGET_BUILTIN(__builtin_ia32_vfmaddps512_mask, "V16fV16fV16fV16fUsIi", "nc", "avx512f")
+TARGET_BUILTIN(__builtin_ia32_vfmaddps512_maskz, "V16fV16fV16fV16fUsIi", "nc", "avx512f")
+TARGET_BUILTIN(__builtin_ia32_vfmaddps512_mask3, "V16fV16fV16fV16fUsIi", "nc", "avx512f")
+TARGET_BUILTIN(__builtin_ia32_vfmsubps512_mask3, "V16fV16fV16fV16fUsIi", "nc", "avx512f")
+TARGET_BUILTIN(__builtin_ia32_vfmaddsubpd512_mask, "V8dV8dV8dV8dUcIi", "nc", "avx512f")
+TARGET_BUILTIN(__builtin_ia32_vfmaddsubpd512_maskz, "V8dV8dV8dV8dUcIi", "nc", "avx512f")
+TARGET_BUILTIN(__builtin_ia32_vfmaddsubpd512_mask3, "V8dV8dV8dV8dUcIi", "nc", "avx512f")
+TARGET_BUILTIN(__builtin_ia32_vfmsubaddpd512_mask3, "V8dV8dV8dV8dUcIi", "nc", "avx512f")
+TARGET_BUILTIN(__builtin_ia32_vfmaddsubps512_mask, "V16fV16fV16fV16fUsIi", "nc", "avx512f")
+TARGET_BUILTIN(__builtin_ia32_vfmaddsubps512_maskz, "V16fV16fV16fV16fUsIi", "nc", "avx512f")
+TARGET_BUILTIN(__builtin_ia32_vfmaddsubps512_mask3, "V16fV16fV16fV16fUsIi", "nc", "avx512f")
+TARGET_BUILTIN(__builtin_ia32_vfmsubaddps512_mask3, "V16fV16fV16fV16fUsIi", "nc", "avx512f")
 
 // XOP
 TARGET_BUILTIN(__builtin_ia32_vpmacssww, "V8sV8sV8sV8s", "nc", "xop")
index a09fa7aae5e92f5c90216066532bfbebf0be21db..a086b4d926cb2406d89e6c181000fd7e030747f4 100644 (file)
@@ -8555,79 +8555,110 @@ static Value *EmitX86MinMax(CodeGenFunction &CGF, ICmpInst::Predicate Pred,
 
 // Lowers X86 FMA intrinsics to IR.
 static Value *EmitX86FMAExpr(CodeGenFunction &CGF, ArrayRef<Value *> Ops,
-                             unsigned BuiltinID) {
+                             unsigned BuiltinID, bool IsAddSub) {
 
-  bool IsAddSub = false;
-  bool IsScalar = false;
-
-  // 4 operands always means rounding mode without a mask here.
-  bool IsRound = Ops.size() == 4;
-
-  Intrinsic::ID ID;
+  bool Subtract = false;
+  Intrinsic::ID IID = Intrinsic::not_intrinsic;
   switch (BuiltinID) {
   default: break;
-  case clang::X86::BI__builtin_ia32_vfmaddss3: IsScalar = true; break;
-  case clang::X86::BI__builtin_ia32_vfmaddsd3: IsScalar = true; break;
-  case clang::X86::BI__builtin_ia32_vfmaddps512:
-    ID = llvm::Intrinsic::x86_avx512_vfmadd_ps_512; break;
-  case clang::X86::BI__builtin_ia32_vfmaddpd512:
-    ID = llvm::Intrinsic::x86_avx512_vfmadd_pd_512; break;
-  case clang::X86::BI__builtin_ia32_vfmaddsubps: IsAddSub = true; break;
-  case clang::X86::BI__builtin_ia32_vfmaddsubpd: IsAddSub = true; break;
-  case clang::X86::BI__builtin_ia32_vfmaddsubps256: IsAddSub = true; break;
-  case clang::X86::BI__builtin_ia32_vfmaddsubpd256: IsAddSub = true; break;
-  case clang::X86::BI__builtin_ia32_vfmaddsubps512: {
-    ID = llvm::Intrinsic::x86_avx512_vfmaddsub_ps_512;
-    IsAddSub = true;
+  case clang::X86::BI__builtin_ia32_vfmsubps512_mask3:
+    Subtract = true;
+    LLVM_FALLTHROUGH;
+  case clang::X86::BI__builtin_ia32_vfmaddps512_mask:
+  case clang::X86::BI__builtin_ia32_vfmaddps512_maskz:
+  case clang::X86::BI__builtin_ia32_vfmaddps512_mask3:
+    IID = llvm::Intrinsic::x86_avx512_vfmadd_ps_512; break;
+  case clang::X86::BI__builtin_ia32_vfmsubpd512_mask3:
+    Subtract = true;
+    LLVM_FALLTHROUGH;
+  case clang::X86::BI__builtin_ia32_vfmaddpd512_mask:
+  case clang::X86::BI__builtin_ia32_vfmaddpd512_maskz:
+  case clang::X86::BI__builtin_ia32_vfmaddpd512_mask3:
+    IID = llvm::Intrinsic::x86_avx512_vfmadd_pd_512; break;
+  case clang::X86::BI__builtin_ia32_vfmsubaddps512_mask3:
+    Subtract = true;
+    LLVM_FALLTHROUGH;
+  case clang::X86::BI__builtin_ia32_vfmaddsubps512_mask:
+  case clang::X86::BI__builtin_ia32_vfmaddsubps512_maskz:
+  case clang::X86::BI__builtin_ia32_vfmaddsubps512_mask3:
+    IID = llvm::Intrinsic::x86_avx512_vfmaddsub_ps_512;
     break;
-  }
-  case clang::X86::BI__builtin_ia32_vfmaddsubpd512: {
-    ID = llvm::Intrinsic::x86_avx512_vfmaddsub_pd_512;
-    IsAddSub = true;
+  case clang::X86::BI__builtin_ia32_vfmsubaddpd512_mask3:
+    Subtract = true;
+    LLVM_FALLTHROUGH;
+  case clang::X86::BI__builtin_ia32_vfmaddsubpd512_mask:
+  case clang::X86::BI__builtin_ia32_vfmaddsubpd512_maskz:
+  case clang::X86::BI__builtin_ia32_vfmaddsubpd512_mask3:
+    IID = llvm::Intrinsic::x86_avx512_vfmaddsub_pd_512;
     break;
   }
-  }
-
-  // Only handle in case of _MM_FROUND_CUR_DIRECTION/4 (no rounding).
-  if (IsRound) {
-    Function *Intr = CGF.CGM.getIntrinsic(ID);
-    if (cast<llvm::ConstantInt>(Ops[3])->getZExtValue() != (uint64_t)4)
-      return CGF.Builder.CreateCall(Intr, Ops);
-  }
 
   Value *A = Ops[0];
   Value *B = Ops[1];
   Value *C = Ops[2];
 
-  if (IsScalar) {
-    A = CGF.Builder.CreateExtractElement(A, (uint64_t)0);
-    B = CGF.Builder.CreateExtractElement(B, (uint64_t)0);
-    C = CGF.Builder.CreateExtractElement(C, (uint64_t)0);
-  }
+  if (Subtract)
+    C = CGF.Builder.CreateFNeg(C);
 
-  llvm::Type *Ty = A->getType();
-  Function *FMA = CGF.CGM.getIntrinsic(Intrinsic::fma, Ty);
-  Value *Res = CGF.Builder.CreateCall(FMA, {A, B, C} );
+  Value *Res;
 
-  if (IsScalar)
-    return CGF.Builder.CreateInsertElement(Ops[0], Res, (uint64_t)0);
+  // Only handle in case of _MM_FROUND_CUR_DIRECTION/4 (no rounding).
+  if (IID != Intrinsic::not_intrinsic &&
+      cast<llvm::ConstantInt>(Ops.back())->getZExtValue() != (uint64_t)4) {
+    Function *Intr = CGF.CGM.getIntrinsic(IID);
+    Res = CGF.Builder.CreateCall(Intr, {A, B, C, Ops.back() });
+  } else {
+    llvm::Type *Ty = A->getType();
+    Function *FMA = CGF.CGM.getIntrinsic(Intrinsic::fma, Ty);
+    Res = CGF.Builder.CreateCall(FMA, {A, B, C} );
+
+    if (IsAddSub) {
+      // Negate even elts in C using a mask.
+      unsigned NumElts = Ty->getVectorNumElements();
+      SmallVector<Constant *, 16> NMask;
+      Constant *Zero = ConstantInt::get(CGF.Builder.getInt1Ty(), 0);
+      Constant *One = ConstantInt::get(CGF.Builder.getInt1Ty(), 1);
+      for (unsigned i = 0; i < NumElts; ++i) {
+        NMask.push_back(i % 2 == 0 ? One : Zero);
+      }
+      Value *NegMask = ConstantVector::get(NMask);
 
-  if (IsAddSub) {
-    // Negate even elts in C using a mask.
-    unsigned NumElts = Ty->getVectorNumElements();
-    SmallVector<Constant *, 16> NMask;
-    Constant *Zero = ConstantInt::get(CGF.Builder.getInt1Ty(), 0);
-    Constant *One = ConstantInt::get(CGF.Builder.getInt1Ty(), 1);
-    for (unsigned i = 0; i < NumElts; ++i) {
-      NMask.push_back(i % 2 == 0 ? One : Zero);
+      Value *NegC = CGF.Builder.CreateFNeg(C);
+      Value *FMSub = CGF.Builder.CreateCall(FMA, {A, B, NegC} );
+      Res = CGF.Builder.CreateSelect(NegMask, FMSub, Res);
     }
-    Value *NegMask = ConstantVector::get(NMask);
+  }
 
-    Value *NegC = CGF.Builder.CreateFNeg(C);
-    Value *FMSub = CGF.Builder.CreateCall(FMA, {A, B, NegC} );
-    Res = CGF.Builder.CreateSelect(NegMask, FMSub, Res);
+  // Handle any required masking.
+  Value *MaskFalseVal = nullptr;
+  switch (BuiltinID) {
+  case clang::X86::BI__builtin_ia32_vfmaddps512_mask:
+  case clang::X86::BI__builtin_ia32_vfmaddpd512_mask:
+  case clang::X86::BI__builtin_ia32_vfmaddsubps512_mask:
+  case clang::X86::BI__builtin_ia32_vfmaddsubpd512_mask:
+    MaskFalseVal = Ops[0];
+    break;
+  case clang::X86::BI__builtin_ia32_vfmaddps512_maskz:
+  case clang::X86::BI__builtin_ia32_vfmaddpd512_maskz:
+  case clang::X86::BI__builtin_ia32_vfmaddsubps512_maskz:
+  case clang::X86::BI__builtin_ia32_vfmaddsubpd512_maskz:
+    MaskFalseVal = Constant::getNullValue(Ops[0]->getType());
+    break;
+  case clang::X86::BI__builtin_ia32_vfmsubps512_mask3:
+  case clang::X86::BI__builtin_ia32_vfmaddps512_mask3:
+  case clang::X86::BI__builtin_ia32_vfmsubpd512_mask3:
+  case clang::X86::BI__builtin_ia32_vfmaddpd512_mask3:
+  case clang::X86::BI__builtin_ia32_vfmsubaddps512_mask3:
+  case clang::X86::BI__builtin_ia32_vfmaddsubps512_mask3:
+  case clang::X86::BI__builtin_ia32_vfmsubaddpd512_mask3:
+  case clang::X86::BI__builtin_ia32_vfmaddsubpd512_mask3:
+    MaskFalseVal = Ops[2];
+    break;
   }
 
+  if (MaskFalseVal)
+    return EmitX86Select(CGF, Ops[3], Res, MaskFalseVal);
+
   return Res;
 }
 
@@ -9046,20 +9077,40 @@ Value *CodeGenFunction::EmitX86BuiltinExpr(unsigned BuiltinID,
     return EmitX86ConvertToMask(*this, Ops[0]);
 
   case X86::BI__builtin_ia32_vfmaddss3:
-  case X86::BI__builtin_ia32_vfmaddsd3:
+  case X86::BI__builtin_ia32_vfmaddsd3: {
+    Value *A = Builder.CreateExtractElement(Ops[0], (uint64_t)0);
+    Value *B = Builder.CreateExtractElement(Ops[1], (uint64_t)0);
+    Value *C = Builder.CreateExtractElement(Ops[2], (uint64_t)0);
+    Function *FMA = CGM.getIntrinsic(Intrinsic::fma, A->getType());
+    Value *Res = Builder.CreateCall(FMA, {A, B, C} );
+    return Builder.CreateInsertElement(Ops[0], Res, (uint64_t)0);
+  }
   case X86::BI__builtin_ia32_vfmaddps:
   case X86::BI__builtin_ia32_vfmaddpd:
   case X86::BI__builtin_ia32_vfmaddps256:
   case X86::BI__builtin_ia32_vfmaddpd256:
-  case X86::BI__builtin_ia32_vfmaddps512:
-  case X86::BI__builtin_ia32_vfmaddpd512:
+  case X86::BI__builtin_ia32_vfmaddps512_mask:
+  case X86::BI__builtin_ia32_vfmaddps512_maskz:
+  case X86::BI__builtin_ia32_vfmaddps512_mask3:
+  case X86::BI__builtin_ia32_vfmsubps512_mask3:
+  case X86::BI__builtin_ia32_vfmaddpd512_mask:
+  case X86::BI__builtin_ia32_vfmaddpd512_maskz:
+  case X86::BI__builtin_ia32_vfmaddpd512_mask3:
+  case X86::BI__builtin_ia32_vfmsubpd512_mask3:
+    return EmitX86FMAExpr(*this, Ops, BuiltinID, /*IsAddSub*/false);
   case X86::BI__builtin_ia32_vfmaddsubps:
   case X86::BI__builtin_ia32_vfmaddsubpd:
   case X86::BI__builtin_ia32_vfmaddsubps256:
   case X86::BI__builtin_ia32_vfmaddsubpd256:
-  case X86::BI__builtin_ia32_vfmaddsubps512:
-  case X86::BI__builtin_ia32_vfmaddsubpd512:
-    return EmitX86FMAExpr(*this, Ops, BuiltinID);
+  case X86::BI__builtin_ia32_vfmaddsubps512_mask:
+  case X86::BI__builtin_ia32_vfmaddsubps512_maskz:
+  case X86::BI__builtin_ia32_vfmaddsubps512_mask3:
+  case X86::BI__builtin_ia32_vfmsubaddps512_mask3:
+  case X86::BI__builtin_ia32_vfmaddsubpd512_mask:
+  case X86::BI__builtin_ia32_vfmaddsubpd512_maskz:
+  case X86::BI__builtin_ia32_vfmaddsubpd512_mask3:
+  case X86::BI__builtin_ia32_vfmsubaddpd512_mask3:
+    return EmitX86FMAExpr(*this, Ops, BuiltinID, /*IsAddSub*/true);
 
   case X86::BI__builtin_ia32_movdqa32store128_mask:
   case X86::BI__builtin_ia32_movdqa64store128_mask:
index 2b3633e377be28326a44c636584b75b27dc5571f..47a0c9d433f72d6c271b84fa03cb8979850d3167 100644 (file)
@@ -2578,910 +2578,818 @@ _mm512_maskz_div_ps(__mmask16 __U, __m512 __A, __m512 __B) {
                                           (__mmask8)-1, (int)(R))
 
 #define _mm512_fmadd_round_pd(A, B, C, R) \
-  (__m512d)__builtin_ia32_vfmaddpd512((__v8df)(__m512d)(A), \
-                                      (__v8df)(__m512d)(B), \
-                                      (__v8df)(__m512d)(C), (int)(R))
+  (__m512d)__builtin_ia32_vfmaddpd512_mask((__v8df)(__m512d)(A), \
+                                           (__v8df)(__m512d)(B), \
+                                           (__v8df)(__m512d)(C), \
+                                           (__mmask8)-1, (int)(R))
 
 
 #define _mm512_mask_fmadd_round_pd(A, U, B, C, R) \
-  (__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \
-            __builtin_ia32_vfmaddpd512((__v8df)(__m512d)(A), \
-                                       (__v8df)(__m512d)(B), \
-                                       (__v8df)(__m512d)(C), \
-                                       (int)(R)), \
-            (__v8df)(__m512d)(A))
+  (__m512d)__builtin_ia32_vfmaddpd512_mask((__v8df)(__m512d)(A), \
+                                           (__v8df)(__m512d)(B), \
+                                           (__v8df)(__m512d)(C), \
+                                           (__mmask8)(U), (int)(R))
 
 
 #define _mm512_mask3_fmadd_round_pd(A, B, C, U, R) \
-  (__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \
-            __builtin_ia32_vfmaddpd512((__v8df)(__m512d)(A), \
-                                       (__v8df)(__m512d)(B), \
-                                       (__v8df)(__m512d)(C), \
-                                       (int)(R)), \
-            (__v8df)(__m512d)(C))
+  (__m512d)__builtin_ia32_vfmaddpd512_mask3((__v8df)(__m512d)(A), \
+                                            (__v8df)(__m512d)(B), \
+                                            (__v8df)(__m512d)(C), \
+                                            (__mmask8)(U), (int)(R))
 
 
 #define _mm512_maskz_fmadd_round_pd(U, A, B, C, R) \
-  (__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \
-            __builtin_ia32_vfmaddpd512((__v8df)(__m512d)(A), \
-                                       (__v8df)(__m512d)(B), \
-                                       (__v8df)(__m512d)(C), \
-                                       (int)(R)), \
-            (__v8df)_mm512_setzero_pd())
+  (__m512d)__builtin_ia32_vfmaddpd512_maskz((__v8df)(__m512d)(A), \
+                                            (__v8df)(__m512d)(B), \
+                                            (__v8df)(__m512d)(C), \
+                                            (__mmask8)(U), (int)(R))
 
 
 #define _mm512_fmsub_round_pd(A, B, C, R) \
-  (__m512d)__builtin_ia32_vfmaddpd512((__v8df)(__m512d)(A), \
-                                      (__v8df)(__m512d)(B), \
-                                      -(__v8df)(__m512d)(C), \
-                                      (int)(R))
+  (__m512d)__builtin_ia32_vfmaddpd512_mask((__v8df)(__m512d)(A), \
+                                           (__v8df)(__m512d)(B), \
+                                           -(__v8df)(__m512d)(C), \
+                                           (__mmask8)-1, (int)(R))
 
 
 #define _mm512_mask_fmsub_round_pd(A, U, B, C, R) \
-  (__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \
-            __builtin_ia32_vfmaddpd512((__v8df)(__m512d)(A), \
-                                       (__v8df)(__m512d)(B), \
-                                       -(__v8df)(__m512d)(C), \
-                                       (int)(R)), \
-            (__v8df)(__m512d)(A))
+  (__m512d)__builtin_ia32_vfmaddpd512_mask((__v8df)(__m512d)(A), \
+                                           (__v8df)(__m512d)(B), \
+                                           -(__v8df)(__m512d)(C), \
+                                           (__mmask8)(U), (int)(R))
 
 
 #define _mm512_maskz_fmsub_round_pd(U, A, B, C, R) \
-  (__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \
-            __builtin_ia32_vfmaddpd512((__v8df)(__m512d)(A), \
-                                       (__v8df)(__m512d)(B), \
-                                       -(__v8df)(__m512d)(C), \
-                                       (int)(R)), \
-            (__v8df)_mm512_setzero_pd())
+  (__m512d)__builtin_ia32_vfmaddpd512_maskz((__v8df)(__m512d)(A), \
+                                            (__v8df)(__m512d)(B), \
+                                            -(__v8df)(__m512d)(C), \
+                                            (__mmask8)(U), (int)(R))
 
 
 #define _mm512_fnmadd_round_pd(A, B, C, R) \
-  (__m512d)__builtin_ia32_vfmaddpd512(-(__v8df)(__m512d)(A), \
-                                      (__v8df)(__m512d)(B), \
-                                      (__v8df)(__m512d)(C), (int)(R))
+  (__m512d)__builtin_ia32_vfmaddpd512_mask(-(__v8df)(__m512d)(A), \
+                                           (__v8df)(__m512d)(B), \
+                                           (__v8df)(__m512d)(C), \
+                                           (__mmask8)-1, (int)(R))
 
 
 #define _mm512_mask3_fnmadd_round_pd(A, B, C, U, R) \
-  (__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \
-            __builtin_ia32_vfmaddpd512(-(__v8df)(__m512d)(A), \
-                                       (__v8df)(__m512d)(B), \
-                                       (__v8df)(__m512d)(C), \
-                                       (int)(R)), \
-            (__v8df)(__m512d)(C))
+  (__m512d)__builtin_ia32_vfmaddpd512_mask3(-(__v8df)(__m512d)(A), \
+                                            (__v8df)(__m512d)(B), \
+                                            (__v8df)(__m512d)(C), \
+                                            (__mmask8)(U), (int)(R))
 
 
 #define _mm512_maskz_fnmadd_round_pd(U, A, B, C, R) \
-  (__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \
-            __builtin_ia32_vfmaddpd512(-(__v8df)(__m512d)(A), \
-                                       (__v8df)(__m512d)(B), \
-                                       (__v8df)(__m512d)(C), \
-                                       (int)(R)), \
-            (__v8df)_mm512_setzero_pd())
+  (__m512d)__builtin_ia32_vfmaddpd512_maskz(-(__v8df)(__m512d)(A), \
+                                            (__v8df)(__m512d)(B), \
+                                            (__v8df)(__m512d)(C), \
+                                            (__mmask8)(U), (int)(R))
 
 
 #define _mm512_fnmsub_round_pd(A, B, C, R) \
-  (__m512d)__builtin_ia32_vfmaddpd512(-(__v8df)(__m512d)(A), \
-                                      (__v8df)(__m512d)(B), \
-                                      -(__v8df)(__m512d)(C), \
-                                      (int)(R))
+  (__m512d)__builtin_ia32_vfmaddpd512_mask(-(__v8df)(__m512d)(A), \
+                                           (__v8df)(__m512d)(B), \
+                                           -(__v8df)(__m512d)(C), \
+                                           (__mmask8)-1, (int)(R))
 
 
 #define _mm512_maskz_fnmsub_round_pd(U, A, B, C, R) \
-  (__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \
-            __builtin_ia32_vfmaddpd512(-(__v8df)(__m512d)(A), \
-                                       (__v8df)(__m512d)(B), \
-                                       -(__v8df)(__m512d)(C), \
-                                       (int)(R)), \
-            (__v8df)_mm512_setzero_pd())
+  (__m512d)__builtin_ia32_vfmaddpd512_maskz(-(__v8df)(__m512d)(A), \
+                                            (__v8df)(__m512d)(B), \
+                                            -(__v8df)(__m512d)(C), \
+                                            (__mmask8)(U), (int)(R))
 
 
 static __inline__ __m512d __DEFAULT_FN_ATTRS
 _mm512_fmadd_pd(__m512d __A, __m512d __B, __m512d __C)
 {
-  return (__m512d) __builtin_ia32_vfmaddpd512 ((__v8df) __A,
-                                               (__v8df) __B,
-                                               (__v8df) __C,
-                                               _MM_FROUND_CUR_DIRECTION);
+  return (__m512d) __builtin_ia32_vfmaddpd512_mask ((__v8df) __A,
+                                                    (__v8df) __B,
+                                                    (__v8df) __C,
+                                                    (__mmask8) -1,
+                                                    _MM_FROUND_CUR_DIRECTION);
 }
 
 static __inline__ __m512d __DEFAULT_FN_ATTRS
 _mm512_mask_fmadd_pd(__m512d __A, __mmask8 __U, __m512d __B, __m512d __C)
 {
-  return (__m512d) __builtin_ia32_selectpd_512((__mmask8) __U,
-                    __builtin_ia32_vfmaddpd512 ((__v8df) __A,
-                                                (__v8df) __B,
-                                                (__v8df) __C,
-                                                _MM_FROUND_CUR_DIRECTION),
-            (__v8df) __A);
+  return (__m512d) __builtin_ia32_vfmaddpd512_mask ((__v8df) __A,
+                                                    (__v8df) __B,
+                                                    (__v8df) __C,
+                                                    (__mmask8) __U,
+                                                    _MM_FROUND_CUR_DIRECTION);
 }
 
 static __inline__ __m512d __DEFAULT_FN_ATTRS
 _mm512_mask3_fmadd_pd(__m512d __A, __m512d __B, __m512d __C, __mmask8 __U)
 {
-  return (__m512d) __builtin_ia32_selectpd_512((__mmask8) __U,
-                    __builtin_ia32_vfmaddpd512 ((__v8df) __A,
-                                                (__v8df) __B,
-                                                (__v8df) __C,
-                                                _MM_FROUND_CUR_DIRECTION),
-            (__v8df) __C);
+  return (__m512d) __builtin_ia32_vfmaddpd512_mask3 ((__v8df) __A,
+                                                     (__v8df) __B,
+                                                     (__v8df) __C,
+                                                     (__mmask8) __U,
+                                                     _MM_FROUND_CUR_DIRECTION);
 }
 
 static __inline__ __m512d __DEFAULT_FN_ATTRS
 _mm512_maskz_fmadd_pd(__mmask8 __U, __m512d __A, __m512d __B, __m512d __C)
 {
-  return (__m512d) __builtin_ia32_selectpd_512((__mmask8) __U,
-                    __builtin_ia32_vfmaddpd512 ((__v8df) __A,
-                                                (__v8df) __B,
-                                                (__v8df) __C,
-                                                _MM_FROUND_CUR_DIRECTION),
-            (__v8df) _mm512_setzero_pd());
+  return (__m512d) __builtin_ia32_vfmaddpd512_maskz ((__v8df) __A,
+                                                     (__v8df) __B,
+                                                     (__v8df) __C,
+                                                     (__mmask8) __U,
+                                                     _MM_FROUND_CUR_DIRECTION);
 }
 
 static __inline__ __m512d __DEFAULT_FN_ATTRS
 _mm512_fmsub_pd(__m512d __A, __m512d __B, __m512d __C)
 {
-  return (__m512d) __builtin_ia32_vfmaddpd512 ((__v8df) __A,
-                                               (__v8df) __B,
-                                               -(__v8df) __C,
-                                               _MM_FROUND_CUR_DIRECTION);
+  return (__m512d) __builtin_ia32_vfmaddpd512_mask ((__v8df) __A,
+                                                    (__v8df) __B,
+                                                    -(__v8df) __C,
+                                                    (__mmask8) -1,
+                                                    _MM_FROUND_CUR_DIRECTION);
 }
 
 static __inline__ __m512d __DEFAULT_FN_ATTRS
 _mm512_mask_fmsub_pd(__m512d __A, __mmask8 __U, __m512d __B, __m512d __C)
 {
-  return (__m512d) __builtin_ia32_selectpd_512((__mmask8) __U,
-                    __builtin_ia32_vfmaddpd512 ((__v8df) __A,
-                                                (__v8df) __B,
-                                                -(__v8df) __C,
-                                                _MM_FROUND_CUR_DIRECTION),
-            (__v8df) __A);
+  return (__m512d) __builtin_ia32_vfmaddpd512_mask ((__v8df) __A,
+                                                    (__v8df) __B,
+                                                    -(__v8df) __C,
+                                                    (__mmask8) __U,
+                                                    _MM_FROUND_CUR_DIRECTION);
 }
 
 static __inline__ __m512d __DEFAULT_FN_ATTRS
 _mm512_maskz_fmsub_pd(__mmask8 __U, __m512d __A, __m512d __B, __m512d __C)
 {
-  return (__m512d) __builtin_ia32_selectpd_512((__mmask8) __U,
-                    __builtin_ia32_vfmaddpd512 ((__v8df) __A,
-                                                (__v8df) __B,
-                                                -(__v8df) __C,
-                                                _MM_FROUND_CUR_DIRECTION),
-            (__v8df) _mm512_setzero_pd());
+  return (__m512d) __builtin_ia32_vfmaddpd512_maskz ((__v8df) __A,
+                                                     (__v8df) __B,
+                                                     -(__v8df) __C,
+                                                     (__mmask8) __U,
+                                                     _MM_FROUND_CUR_DIRECTION);
 }
 
 static __inline__ __m512d __DEFAULT_FN_ATTRS
 _mm512_fnmadd_pd(__m512d __A, __m512d __B, __m512d __C)
 {
-  return (__m512d) __builtin_ia32_vfmaddpd512 (-(__v8df) __A,
-                                               (__v8df) __B,
-                                               (__v8df) __C,
-                                               _MM_FROUND_CUR_DIRECTION);
+  return (__m512d) __builtin_ia32_vfmaddpd512_mask ((__v8df) __A,
+                                                    -(__v8df) __B,
+                                                    (__v8df) __C,
+                                                    (__mmask8) -1,
+                                                    _MM_FROUND_CUR_DIRECTION);
 }
 
 static __inline__ __m512d __DEFAULT_FN_ATTRS
 _mm512_mask3_fnmadd_pd(__m512d __A, __m512d __B, __m512d __C, __mmask8 __U)
 {
-  return (__m512d) __builtin_ia32_selectpd_512((__mmask8) __U,
-                    __builtin_ia32_vfmaddpd512 (-(__v8df) __A,
-                                                (__v8df) __B,
-                                                (__v8df) __C,
-                                                _MM_FROUND_CUR_DIRECTION),
-            (__v8df) __C);
+  return (__m512d) __builtin_ia32_vfmaddpd512_mask3 (-(__v8df) __A,
+                                                     (__v8df) __B,
+                                                     (__v8df) __C,
+                                                     (__mmask8) __U,
+                                                     _MM_FROUND_CUR_DIRECTION);
 }
 
 static __inline__ __m512d __DEFAULT_FN_ATTRS
 _mm512_maskz_fnmadd_pd(__mmask8 __U, __m512d __A, __m512d __B, __m512d __C)
 {
-  return (__m512d) __builtin_ia32_selectpd_512((__mmask8) __U,
-                    __builtin_ia32_vfmaddpd512 (-(__v8df) __A,
-                                                (__v8df) __B,
-                                                (__v8df) __C,
-                                                _MM_FROUND_CUR_DIRECTION),
-            (__v8df) _mm512_setzero_pd());
+  return (__m512d) __builtin_ia32_vfmaddpd512_maskz (-(__v8df) __A,
+                                                     (__v8df) __B,
+                                                     (__v8df) __C,
+                                                     (__mmask8) __U,
+                                                     _MM_FROUND_CUR_DIRECTION);
 }
 
 static __inline__ __m512d __DEFAULT_FN_ATTRS
 _mm512_fnmsub_pd(__m512d __A, __m512d __B, __m512d __C)
 {
-  return (__m512d) __builtin_ia32_vfmaddpd512 (-(__v8df) __A,
-                                               (__v8df) __B,
-                                               -(__v8df) __C,
-                                               _MM_FROUND_CUR_DIRECTION);
+  return (__m512d) __builtin_ia32_vfmaddpd512_mask ((__v8df) __A,
+                                                    -(__v8df) __B,
+                                                    -(__v8df) __C,
+                                                    (__mmask8) -1,
+                                                    _MM_FROUND_CUR_DIRECTION);
 }
 
 static __inline__ __m512d __DEFAULT_FN_ATTRS
 _mm512_maskz_fnmsub_pd(__mmask8 __U, __m512d __A, __m512d __B, __m512d __C)
 {
-  return (__m512d) __builtin_ia32_selectpd_512((__mmask8) __U,
-                    __builtin_ia32_vfmaddpd512 (-(__v8df) __A,
-                                                (__v8df) __B,
-                                                -(__v8df) __C,
-                                                _MM_FROUND_CUR_DIRECTION),
-            (__v8df) _mm512_setzero_pd());
+  return (__m512d) __builtin_ia32_vfmaddpd512_maskz (-(__v8df) __A,
+                                                     (__v8df) __B,
+                                                     -(__v8df) __C,
+                                                     (__mmask8) __U,
+                                                     _MM_FROUND_CUR_DIRECTION);
 }
 
 #define _mm512_fmadd_round_ps(A, B, C, R) \
-  (__m512)__builtin_ia32_vfmaddps512((__v16sf)(__m512)(A), \
-                                     (__v16sf)(__m512)(B), \
-                                     (__v16sf)(__m512)(C), (int)(R))
+  (__m512)__builtin_ia32_vfmaddps512_mask((__v16sf)(__m512)(A), \
+                                          (__v16sf)(__m512)(B), \
+                                          (__v16sf)(__m512)(C), \
+                                          (__mmask16)-1, (int)(R))
 
 
 #define _mm512_mask_fmadd_round_ps(A, U, B, C, R) \
-  (__m512)__builtin_ia32_selectps_512((__mmask16)(U), \
-           __builtin_ia32_vfmaddps512((__v16sf)(__m512)(A), \
-                                      (__v16sf)(__m512)(B), \
-                                      (__v16sf)(__m512)(C), \
-                                      (int)(R)), \
-            (__v16sf)(__m512)(A))
+  (__m512)__builtin_ia32_vfmaddps512_mask((__v16sf)(__m512)(A), \
+                                          (__v16sf)(__m512)(B), \
+                                          (__v16sf)(__m512)(C), \
+                                          (__mmask16)(U), (int)(R))
 
 
 #define _mm512_mask3_fmadd_round_ps(A, B, C, U, R) \
-  (__m512)__builtin_ia32_selectps_512((__mmask16)(U), \
-           __builtin_ia32_vfmaddps512((__v16sf)(__m512)(A), \
-                                      (__v16sf)(__m512)(B), \
-                                      (__v16sf)(__m512)(C), \
-                                      (int)(R)), \
-            (__v16sf)(__m512)(C))
+  (__m512)__builtin_ia32_vfmaddps512_mask3((__v16sf)(__m512)(A), \
+                                           (__v16sf)(__m512)(B), \
+                                           (__v16sf)(__m512)(C), \
+                                           (__mmask16)(U), (int)(R))
 
 
 #define _mm512_maskz_fmadd_round_ps(U, A, B, C, R) \
-  (__m512)__builtin_ia32_selectps_512((__mmask16)(U), \
-           __builtin_ia32_vfmaddps512((__v16sf)(__m512)(A), \
-                                      (__v16sf)(__m512)(B), \
-                                      (__v16sf)(__m512)(C), \
-                                      (int)(R)), \
-            (__v16sf)_mm512_setzero_ps())
+  (__m512)__builtin_ia32_vfmaddps512_maskz((__v16sf)(__m512)(A), \
+                                           (__v16sf)(__m512)(B), \
+                                           (__v16sf)(__m512)(C), \
+                                           (__mmask16)(U), (int)(R))
 
 
 #define _mm512_fmsub_round_ps(A, B, C, R) \
-  (__m512)__builtin_ia32_vfmaddps512((__v16sf)(__m512)(A), \
-                                     (__v16sf)(__m512)(B), \
-                                     -(__v16sf)(__m512)(C), \
-                                     (int)(R))
+  (__m512)__builtin_ia32_vfmaddps512_mask((__v16sf)(__m512)(A), \
+                                          (__v16sf)(__m512)(B), \
+                                          -(__v16sf)(__m512)(C), \
+                                          (__mmask16)-1, (int)(R))
 
 
 #define _mm512_mask_fmsub_round_ps(A, U, B, C, R) \
-  (__m512)__builtin_ia32_selectps_512((__mmask16)(U), \
-           __builtin_ia32_vfmaddps512((__v16sf)(__m512)(A), \
-                                      (__v16sf)(__m512)(B), \
-                                      -(__v16sf)(__m512)(C), \
-                                      (int)(R)), \
-            (__v16sf)(__m512)(A))
+  (__m512)__builtin_ia32_vfmaddps512_mask((__v16sf)(__m512)(A), \
+                                          (__v16sf)(__m512)(B), \
+                                          -(__v16sf)(__m512)(C), \
+                                          (__mmask16)(U), (int)(R))
 
 
 #define _mm512_maskz_fmsub_round_ps(U, A, B, C, R) \
-  (__m512)__builtin_ia32_selectps_512((__mmask16)(U), \
-           __builtin_ia32_vfmaddps512((__v16sf)(__m512)(A), \
-                                      (__v16sf)(__m512)(B), \
-                                      -(__v16sf)(__m512)(C), \
-                                      (int)(R)), \
-            (__v16sf)_mm512_setzero_ps())
+  (__m512)__builtin_ia32_vfmaddps512_maskz((__v16sf)(__m512)(A), \
+                                           (__v16sf)(__m512)(B), \
+                                           -(__v16sf)(__m512)(C), \
+                                           (__mmask16)(U), (int)(R))
 
 
 #define _mm512_fnmadd_round_ps(A, B, C, R) \
-  (__m512)__builtin_ia32_vfmaddps512(-(__v16sf)(__m512)(A), \
-                                     (__v16sf)(__m512)(B), \
-                                     (__v16sf)(__m512)(C), (int)(R))
+  (__m512)__builtin_ia32_vfmaddps512_mask((__v16sf)(__m512)(A), \
+                                          -(__v16sf)(__m512)(B), \
+                                          (__v16sf)(__m512)(C), \
+                                          (__mmask16)-1, (int)(R))
 
 
 #define _mm512_mask3_fnmadd_round_ps(A, B, C, U, R) \
-  (__m512)__builtin_ia32_selectps_512((__mmask16)(U), \
-           __builtin_ia32_vfmaddps512(-(__v16sf)(__m512)(A), \
-                                      (__v16sf)(__m512)(B), \
-                                      (__v16sf)(__m512)(C), \
-                                      (int)(R)), \
-            (__v16sf)(__m512)(C))
+  (__m512)__builtin_ia32_vfmaddps512_mask3(-(__v16sf)(__m512)(A), \
+                                           (__v16sf)(__m512)(B), \
+                                           (__v16sf)(__m512)(C), \
+                                           (__mmask16)(U), (int)(R))
 
 
 #define _mm512_maskz_fnmadd_round_ps(U, A, B, C, R) \
-  (__m512)__builtin_ia32_selectps_512((__mmask16)(U), \
-           __builtin_ia32_vfmaddps512(-(__v16sf)(__m512)(A), \
-                                      (__v16sf)(__m512)(B), \
-                                      (__v16sf)(__m512)(C), \
-                                      (int)(R)), \
-            (__v16sf)_mm512_setzero_ps())
+  (__m512)__builtin_ia32_vfmaddps512_maskz(-(__v16sf)(__m512)(A), \
+                                           (__v16sf)(__m512)(B), \
+                                           (__v16sf)(__m512)(C), \
+                                           (__mmask16)(U), (int)(R))
 
 
 #define _mm512_fnmsub_round_ps(A, B, C, R) \
-  (__m512)__builtin_ia32_vfmaddps512(-(__v16sf)(__m512)(A), \
-                                     (__v16sf)(__m512)(B), \
-                                     -(__v16sf)(__m512)(C), \
-                                     (int)(R))
+  (__m512)__builtin_ia32_vfmaddps512_mask((__v16sf)(__m512)(A), \
+                                          -(__v16sf)(__m512)(B), \
+                                          -(__v16sf)(__m512)(C), \
+                                          (__mmask16)-1, (int)(R))
 
 
 #define _mm512_maskz_fnmsub_round_ps(U, A, B, C, R) \
-  (__m512)__builtin_ia32_selectps_512((__mmask16)(U), \
-           __builtin_ia32_vfmaddps512(-(__v16sf)(__m512)(A), \
-                                      (__v16sf)(__m512)(B), \
-                                      -(__v16sf)(__m512)(C), \
-                                      (int)(R)), \
-            (__v16sf)_mm512_setzero_ps())
+  (__m512)__builtin_ia32_vfmaddps512_maskz(-(__v16sf)(__m512)(A), \
+                                           (__v16sf)(__m512)(B), \
+                                           -(__v16sf)(__m512)(C), \
+                                           (__mmask16)(U), (int)(R))
 
 
 static __inline__ __m512 __DEFAULT_FN_ATTRS
 _mm512_fmadd_ps(__m512 __A, __m512 __B, __m512 __C)
 {
-  return (__m512) __builtin_ia32_vfmaddps512 ((__v16sf) __A,
-                                              (__v16sf) __B,
-                                              (__v16sf) __C,
-                                              _MM_FROUND_CUR_DIRECTION);
+  return (__m512) __builtin_ia32_vfmaddps512_mask ((__v16sf) __A,
+                                                   (__v16sf) __B,
+                                                   (__v16sf) __C,
+                                                   (__mmask16) -1,
+                                                   _MM_FROUND_CUR_DIRECTION);
 }
 
 static __inline__ __m512 __DEFAULT_FN_ATTRS
 _mm512_mask_fmadd_ps(__m512 __A, __mmask16 __U, __m512 __B, __m512 __C)
 {
-  return (__m512) __builtin_ia32_selectps_512((__mmask16) __U,
-                    __builtin_ia32_vfmaddps512 ((__v16sf) __A,
-                                                (__v16sf) __B,
-                                                (__v16sf) __C,
-                                                _MM_FROUND_CUR_DIRECTION),
-            (__v16sf) __A);
+  return (__m512) __builtin_ia32_vfmaddps512_mask ((__v16sf) __A,
+                                                   (__v16sf) __B,
+                                                   (__v16sf) __C,
+                                                   (__mmask16) __U,
+                                                   _MM_FROUND_CUR_DIRECTION);
 }
 
 static __inline__ __m512 __DEFAULT_FN_ATTRS
 _mm512_mask3_fmadd_ps(__m512 __A, __m512 __B, __m512 __C, __mmask16 __U)
 {
-  return (__m512) __builtin_ia32_selectps_512((__mmask16) __U,
-                    __builtin_ia32_vfmaddps512 ((__v16sf) __A,
-                                                (__v16sf) __B,
-                                                (__v16sf) __C,
-                                                _MM_FROUND_CUR_DIRECTION),
-            (__v16sf) __C);
+  return (__m512) __builtin_ia32_vfmaddps512_mask3 ((__v16sf) __A,
+                                                    (__v16sf) __B,
+                                                    (__v16sf) __C,
+                                                    (__mmask16) __U,
+                                                    _MM_FROUND_CUR_DIRECTION);
 }
 
 static __inline__ __m512 __DEFAULT_FN_ATTRS
 _mm512_maskz_fmadd_ps(__mmask16 __U, __m512 __A, __m512 __B, __m512 __C)
 {
-  return (__m512) __builtin_ia32_selectps_512((__mmask16) __U,
-                    __builtin_ia32_vfmaddps512 ((__v16sf) __A,
-                                                (__v16sf) __B,
-                                                (__v16sf) __C,
-                                                _MM_FROUND_CUR_DIRECTION),
-            (__v16sf) _mm512_setzero_ps());
+  return (__m512) __builtin_ia32_vfmaddps512_maskz ((__v16sf) __A,
+                                                    (__v16sf) __B,
+                                                    (__v16sf) __C,
+                                                    (__mmask16) __U,
+                                                    _MM_FROUND_CUR_DIRECTION);
 }
 
 static __inline__ __m512 __DEFAULT_FN_ATTRS
 _mm512_fmsub_ps(__m512 __A, __m512 __B, __m512 __C)
 {
-  return (__m512) __builtin_ia32_vfmaddps512 ((__v16sf) __A,
-                                              (__v16sf) __B,
-                                              -(__v16sf) __C,
-                                              _MM_FROUND_CUR_DIRECTION);
+  return (__m512) __builtin_ia32_vfmaddps512_mask ((__v16sf) __A,
+                                                   (__v16sf) __B,
+                                                   -(__v16sf) __C,
+                                                   (__mmask16) -1,
+                                                   _MM_FROUND_CUR_DIRECTION);
 }
 
 static __inline__ __m512 __DEFAULT_FN_ATTRS
 _mm512_mask_fmsub_ps(__m512 __A, __mmask16 __U, __m512 __B, __m512 __C)
 {
-  return (__m512) __builtin_ia32_selectps_512((__mmask16) __U,
-                    __builtin_ia32_vfmaddps512 ((__v16sf) __A,
-                                                (__v16sf) __B,
-                                                -(__v16sf) __C,
-                                                _MM_FROUND_CUR_DIRECTION),
-            (__v16sf) __A);
+  return (__m512) __builtin_ia32_vfmaddps512_mask ((__v16sf) __A,
+                                                   (__v16sf) __B,
+                                                   -(__v16sf) __C,
+                                                   (__mmask16) __U,
+                                                   _MM_FROUND_CUR_DIRECTION);
 }
 
 static __inline__ __m512 __DEFAULT_FN_ATTRS
 _mm512_maskz_fmsub_ps(__mmask16 __U, __m512 __A, __m512 __B, __m512 __C)
 {
-  return (__m512) __builtin_ia32_selectps_512((__mmask16) __U,
-                    __builtin_ia32_vfmaddps512 ((__v16sf) __A,
-                                                (__v16sf) __B,
-                                                -(__v16sf) __C,
-                                                _MM_FROUND_CUR_DIRECTION),
-            (__v16sf) _mm512_setzero_ps());
+  return (__m512) __builtin_ia32_vfmaddps512_maskz ((__v16sf) __A,
+                                                    (__v16sf) __B,
+                                                    -(__v16sf) __C,
+                                                    (__mmask16) __U,
+                                                    _MM_FROUND_CUR_DIRECTION);
 }
 
 static __inline__ __m512 __DEFAULT_FN_ATTRS
 _mm512_fnmadd_ps(__m512 __A, __m512 __B, __m512 __C)
 {
-  return (__m512) __builtin_ia32_vfmaddps512 (-(__v16sf) __A,
-                                              (__v16sf) __B,
-                                              (__v16sf) __C,
-                                              _MM_FROUND_CUR_DIRECTION);
+  return (__m512) __builtin_ia32_vfmaddps512_mask ((__v16sf) __A,
+                                                   -(__v16sf) __B,
+                                                   (__v16sf) __C,
+                                                   (__mmask16) -1,
+                                                   _MM_FROUND_CUR_DIRECTION);
 }
 
 static __inline__ __m512 __DEFAULT_FN_ATTRS
 _mm512_mask3_fnmadd_ps(__m512 __A, __m512 __B, __m512 __C, __mmask16 __U)
 {
-  return (__m512) __builtin_ia32_selectps_512((__mmask16) __U,
-                    __builtin_ia32_vfmaddps512 (-(__v16sf) __A,
-                                                (__v16sf) __B,
-                                                (__v16sf) __C,
-                                                _MM_FROUND_CUR_DIRECTION),
-            (__v16sf) __C);
+  return (__m512) __builtin_ia32_vfmaddps512_mask3 (-(__v16sf) __A,
+                                                    (__v16sf) __B,
+                                                    (__v16sf) __C,
+                                                    (__mmask16) __U,
+                                                    _MM_FROUND_CUR_DIRECTION);
 }
 
 static __inline__ __m512 __DEFAULT_FN_ATTRS
 _mm512_maskz_fnmadd_ps(__mmask16 __U, __m512 __A, __m512 __B, __m512 __C)
 {
-  return (__m512) __builtin_ia32_selectps_512((__mmask16) __U,
-                    __builtin_ia32_vfmaddps512 (-(__v16sf) __A,
-                                                (__v16sf) __B,
-                                                (__v16sf) __C,
-                                                _MM_FROUND_CUR_DIRECTION),
-            (__v16sf) _mm512_setzero_ps());
+  return (__m512) __builtin_ia32_vfmaddps512_maskz (-(__v16sf) __A,
+                                                    (__v16sf) __B,
+                                                    (__v16sf) __C,
+                                                    (__mmask16) __U,
+                                                    _MM_FROUND_CUR_DIRECTION);
 }
 
 static __inline__ __m512 __DEFAULT_FN_ATTRS
 _mm512_fnmsub_ps(__m512 __A, __m512 __B, __m512 __C)
 {
-  return (__m512) __builtin_ia32_vfmaddps512 (-(__v16sf) __A,
-                                              (__v16sf) __B,
-                                              -(__v16sf) __C,
-                                              _MM_FROUND_CUR_DIRECTION);
+  return (__m512) __builtin_ia32_vfmaddps512_mask ((__v16sf) __A,
+                                                   -(__v16sf) __B,
+                                                   -(__v16sf) __C,
+                                                   (__mmask16) -1,
+                                                   _MM_FROUND_CUR_DIRECTION);
 }
 
 static __inline__ __m512 __DEFAULT_FN_ATTRS
 _mm512_maskz_fnmsub_ps(__mmask16 __U, __m512 __A, __m512 __B, __m512 __C)
 {
-  return (__m512) __builtin_ia32_selectps_512((__mmask16) __U,
-                    __builtin_ia32_vfmaddps512 (-(__v16sf) __A,
-                                                (__v16sf) __B,
-                                                -(__v16sf) __C,
-                                                _MM_FROUND_CUR_DIRECTION),
-            (__v16sf) _mm512_setzero_ps());
+  return (__m512) __builtin_ia32_vfmaddps512_maskz (-(__v16sf) __A,
+                                                    (__v16sf) __B,
+                                                    -(__v16sf) __C,
+                                                    (__mmask16) __U,
+                                                    _MM_FROUND_CUR_DIRECTION);
 }
 
 #define _mm512_fmaddsub_round_pd(A, B, C, R) \
-  (__m512d)__builtin_ia32_vfmaddsubpd512((__v8df)(__m512d)(A), \
-                                         (__v8df)(__m512d)(B), \
-                                         (__v8df)(__m512d)(C), \
-                                         (int)(R))
+  (__m512d)__builtin_ia32_vfmaddsubpd512_mask((__v8df)(__m512d)(A), \
+                                              (__v8df)(__m512d)(B), \
+                                              (__v8df)(__m512d)(C), \
+                                              (__mmask8)-1, (int)(R))
 
 
 #define _mm512_mask_fmaddsub_round_pd(A, U, B, C, R) \
-  (__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \
-            __builtin_ia32_vfmaddsubpd512((__v8df)(__m512d)(A), \
-                                          (__v8df)(__m512d)(B), \
-                                          (__v8df)(__m512d)(C), \
-                                          (int)(R)), \
-            (__v8df)(__m512d)(A))
+  (__m512d)__builtin_ia32_vfmaddsubpd512_mask((__v8df)(__m512d)(A), \
+                                              (__v8df)(__m512d)(B), \
+                                              (__v8df)(__m512d)(C), \
+                                              (__mmask8)(U), (int)(R))
 
 
 #define _mm512_mask3_fmaddsub_round_pd(A, B, C, U, R) \
-  (__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \
-            __builtin_ia32_vfmaddsubpd512((__v8df)(__m512d)(A), \
-                                          (__v8df)(__m512d)(B), \
-                                          (__v8df)(__m512d)(C), \
-                                          (int)(R)), \
-            (__v8df)(__m512d)(C))
+  (__m512d)__builtin_ia32_vfmaddsubpd512_mask3((__v8df)(__m512d)(A), \
+                                               (__v8df)(__m512d)(B), \
+                                               (__v8df)(__m512d)(C), \
+                                               (__mmask8)(U), (int)(R))
 
 
 #define _mm512_maskz_fmaddsub_round_pd(U, A, B, C, R) \
-  (__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \
-            __builtin_ia32_vfmaddsubpd512((__v8df)(__m512d)(A), \
-                                          (__v8df)(__m512d)(B), \
-                                          (__v8df)(__m512d)(C), \
-                                          (int)(R)), \
-            (__v8df)_mm512_setzero_pd())
+  (__m512d)__builtin_ia32_vfmaddsubpd512_maskz((__v8df)(__m512d)(A), \
+                                               (__v8df)(__m512d)(B), \
+                                               (__v8df)(__m512d)(C), \
+                                               (__mmask8)(U), (int)(R))
 
 
 #define _mm512_fmsubadd_round_pd(A, B, C, R) \
-  (__m512d)__builtin_ia32_vfmaddsubpd512((__v8df)(__m512d)(A), \
-                                         (__v8df)(__m512d)(B), \
-                                         -(__v8df)(__m512d)(C), \
-                                         (int)(R))
+  (__m512d)__builtin_ia32_vfmaddsubpd512_mask((__v8df)(__m512d)(A), \
+                                              (__v8df)(__m512d)(B), \
+                                              -(__v8df)(__m512d)(C), \
+                                              (__mmask8)-1, (int)(R))
 
 
 #define _mm512_mask_fmsubadd_round_pd(A, U, B, C, R) \
-  (__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \
-            __builtin_ia32_vfmaddsubpd512((__v8df)(__m512d)(A), \
-                                          (__v8df)(__m512d)(B), \
-                                          -(__v8df)(__m512d)(C), \
-                                          (int)(R)), \
-            (__v8df)(__m512d)(A))
+  (__m512d)__builtin_ia32_vfmaddsubpd512_mask((__v8df)(__m512d)(A), \
+                                              (__v8df)(__m512d)(B), \
+                                              -(__v8df)(__m512d)(C), \
+                                              (__mmask8)(U), (int)(R))
 
 
 #define _mm512_maskz_fmsubadd_round_pd(U, A, B, C, R) \
-  (__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \
-            __builtin_ia32_vfmaddsubpd512((__v8df)(__m512d)(A), \
-                                          (__v8df)(__m512d)(B), \
-                                          -(__v8df)(__m512d)(C), \
-                                          (int)(R)), \
-            (__v8df)_mm512_setzero_pd())
+  (__m512d)__builtin_ia32_vfmaddsubpd512_maskz((__v8df)(__m512d)(A), \
+                                               (__v8df)(__m512d)(B), \
+                                               -(__v8df)(__m512d)(C), \
+                                               (__mmask8)(U), (int)(R))
 
 
 static __inline__ __m512d __DEFAULT_FN_ATTRS
 _mm512_fmaddsub_pd(__m512d __A, __m512d __B, __m512d __C)
 {
-  return (__m512d) __builtin_ia32_vfmaddsubpd512 ((__v8df) __A,
-                                                  (__v8df) __B,
-                                                  (__v8df) __C,
-                                                  _MM_FROUND_CUR_DIRECTION);
+  return (__m512d) __builtin_ia32_vfmaddsubpd512_mask ((__v8df) __A,
+                                                      (__v8df) __B,
+                                                      (__v8df) __C,
+                                                      (__mmask8) -1,
+                                                      _MM_FROUND_CUR_DIRECTION);
 }
 
 static __inline__ __m512d __DEFAULT_FN_ATTRS
 _mm512_mask_fmaddsub_pd(__m512d __A, __mmask8 __U, __m512d __B, __m512d __C)
 {
-  return (__m512d) __builtin_ia32_selectpd_512((__mmask8) __U,
-                    __builtin_ia32_vfmaddsubpd512 ((__v8df) __A,
-                                                   (__v8df) __B,
-                                                   (__v8df) __C,
-                                                   _MM_FROUND_CUR_DIRECTION),
-            (__v8df) __A);
+  return (__m512d) __builtin_ia32_vfmaddsubpd512_mask ((__v8df) __A,
+                                                      (__v8df) __B,
+                                                      (__v8df) __C,
+                                                      (__mmask8) __U,
+                                                      _MM_FROUND_CUR_DIRECTION);
 }
 
 static __inline__ __m512d __DEFAULT_FN_ATTRS
 _mm512_mask3_fmaddsub_pd(__m512d __A, __m512d __B, __m512d __C, __mmask8 __U)
 {
-  return (__m512d) __builtin_ia32_selectpd_512((__mmask8) __U,
-                    __builtin_ia32_vfmaddsubpd512 ((__v8df) __A,
-                                                   (__v8df) __B,
-                                                   (__v8df) __C,
-                                                   _MM_FROUND_CUR_DIRECTION),
-            (__v8df) __C);
+  return (__m512d) __builtin_ia32_vfmaddsubpd512_mask3 ((__v8df) __A,
+                                                       (__v8df) __B,
+                                                       (__v8df) __C,
+                                                       (__mmask8) __U,
+                                                       _MM_FROUND_CUR_DIRECTION);
 }
 
 static __inline__ __m512d __DEFAULT_FN_ATTRS
 _mm512_maskz_fmaddsub_pd(__mmask8 __U, __m512d __A, __m512d __B, __m512d __C)
 {
-  return (__m512d) __builtin_ia32_selectpd_512((__mmask8) __U,
-                    __builtin_ia32_vfmaddsubpd512 ((__v8df) __A,
-                                                   (__v8df) __B,
-                                                   (__v8df) __C,
-                                                   _MM_FROUND_CUR_DIRECTION),
-            (__v8df) _mm512_setzero_pd());
+  return (__m512d) __builtin_ia32_vfmaddsubpd512_maskz ((__v8df) __A,
+                                                       (__v8df) __B,
+                                                       (__v8df) __C,
+                                                       (__mmask8) __U,
+                                                       _MM_FROUND_CUR_DIRECTION);
 }
 
 static __inline__ __m512d __DEFAULT_FN_ATTRS
 _mm512_fmsubadd_pd(__m512d __A, __m512d __B, __m512d __C)
 {
-  return (__m512d) __builtin_ia32_vfmaddsubpd512 ((__v8df) __A,
-                                                  (__v8df) __B,
-                                                  -(__v8df) __C,
-                                                  _MM_FROUND_CUR_DIRECTION);
+  return (__m512d) __builtin_ia32_vfmaddsubpd512_mask ((__v8df) __A,
+                                                       (__v8df) __B,
+                                                       -(__v8df) __C,
+                                                       (__mmask8) -1,
+                                                       _MM_FROUND_CUR_DIRECTION);
 }
 
 static __inline__ __m512d __DEFAULT_FN_ATTRS
 _mm512_mask_fmsubadd_pd(__m512d __A, __mmask8 __U, __m512d __B, __m512d __C)
 {
-  return (__m512d) __builtin_ia32_selectpd_512((__mmask8) __U,
-                    __builtin_ia32_vfmaddsubpd512 ((__v8df) __A,
-                                                   (__v8df) __B,
-                                                   -(__v8df) __C,
-                                                   _MM_FROUND_CUR_DIRECTION),
-            (__v8df) __A);
+  return (__m512d) __builtin_ia32_vfmaddsubpd512_mask ((__v8df) __A,
+                                                       (__v8df) __B,
+                                                       -(__v8df) __C,
+                                                       (__mmask8) __U,
+                                                       _MM_FROUND_CUR_DIRECTION);
 }
 
 static __inline__ __m512d __DEFAULT_FN_ATTRS
 _mm512_maskz_fmsubadd_pd(__mmask8 __U, __m512d __A, __m512d __B, __m512d __C)
 {
-  return (__m512d) __builtin_ia32_selectpd_512((__mmask8) __U,
-                    __builtin_ia32_vfmaddsubpd512 ((__v8df) __A,
-                                                   (__v8df) __B,
-                                                   -(__v8df) __C,
-                                                   _MM_FROUND_CUR_DIRECTION),
-            (__v8df) _mm512_setzero_pd());
+  return (__m512d) __builtin_ia32_vfmaddsubpd512_maskz ((__v8df) __A,
+                                                        (__v8df) __B,
+                                                        -(__v8df) __C,
+                                                        (__mmask8) __U,
+                                                        _MM_FROUND_CUR_DIRECTION);
 }
 
 #define _mm512_fmaddsub_round_ps(A, B, C, R) \
-  (__m512)__builtin_ia32_vfmaddsubps512((__v16sf)(__m512)(A), \
-                                        (__v16sf)(__m512)(B), \
-                                        (__v16sf)(__m512)(C), \
-                                        (int)(R))
+  (__m512)__builtin_ia32_vfmaddsubps512_mask((__v16sf)(__m512)(A), \
+                                             (__v16sf)(__m512)(B), \
+                                             (__v16sf)(__m512)(C), \
+                                             (__mmask16)-1, (int)(R))
 
 
 #define _mm512_mask_fmaddsub_round_ps(A, U, B, C, R) \
-  (__m512)__builtin_ia32_selectps_512((__mmask16)(U), \
-           __builtin_ia32_vfmaddsubps512((__v16sf)(__m512)(A), \
-                                         (__v16sf)(__m512)(B), \
-                                         (__v16sf)(__m512)(C), \
-                                         (int)(R)), \
-            (__v16sf)(__m512)(A))
+  (__m512)__builtin_ia32_vfmaddsubps512_mask((__v16sf)(__m512)(A), \
+                                             (__v16sf)(__m512)(B), \
+                                             (__v16sf)(__m512)(C), \
+                                             (__mmask16)(U), (int)(R))
 
 
 #define _mm512_mask3_fmaddsub_round_ps(A, B, C, U, R) \
-  (__m512)__builtin_ia32_selectps_512((__mmask16)(U), \
-           __builtin_ia32_vfmaddsubps512((__v16sf)(__m512)(A), \
-                                         (__v16sf)(__m512)(B), \
-                                         (__v16sf)(__m512)(C), \
-                                         (int)(R)), \
-            (__v16sf)(__m512)(C))
+  (__m512)__builtin_ia32_vfmaddsubps512_mask3((__v16sf)(__m512)(A), \
+                                              (__v16sf)(__m512)(B), \
+                                              (__v16sf)(__m512)(C), \
+                                              (__mmask16)(U), (int)(R))
 
 
 #define _mm512_maskz_fmaddsub_round_ps(U, A, B, C, R) \
-  (__m512)__builtin_ia32_selectps_512((__mmask16)(U), \
-           __builtin_ia32_vfmaddsubps512((__v16sf)(__m512)(A), \
-                                         (__v16sf)(__m512)(B), \
-                                         (__v16sf)(__m512)(C), \
-                                         (int)(R)), \
-            (__v16sf)_mm512_setzero_ps())
+  (__m512)__builtin_ia32_vfmaddsubps512_maskz((__v16sf)(__m512)(A), \
+                                              (__v16sf)(__m512)(B), \
+                                              (__v16sf)(__m512)(C), \
+                                              (__mmask16)(U), (int)(R))
 
 
 #define _mm512_fmsubadd_round_ps(A, B, C, R) \
-  (__m512)__builtin_ia32_vfmaddsubps512((__v16sf)(__m512)(A), \
-                                        (__v16sf)(__m512)(B), \
-                                        -(__v16sf)(__m512)(C), \
-                                        (int)(R))
+  (__m512)__builtin_ia32_vfmaddsubps512_mask((__v16sf)(__m512)(A), \
+                                             (__v16sf)(__m512)(B), \
+                                             -(__v16sf)(__m512)(C), \
+                                             (__mmask16)-1, (int)(R))
 
 
 #define _mm512_mask_fmsubadd_round_ps(A, U, B, C, R) \
-  (__m512)__builtin_ia32_selectps_512((__mmask16)(U), \
-           __builtin_ia32_vfmaddsubps512((__v16sf)(__m512)(A), \
-                                         (__v16sf)(__m512)(B), \
-                                         -(__v16sf)(__m512)(C), \
-                                         (int)(R)), \
-            (__v16sf)(__m512)(A))
+  (__m512)__builtin_ia32_vfmaddsubps512_mask((__v16sf)(__m512)(A), \
+                                             (__v16sf)(__m512)(B), \
+                                             -(__v16sf)(__m512)(C), \
+                                             (__mmask16)(U), (int)(R))
 
 
 #define _mm512_maskz_fmsubadd_round_ps(U, A, B, C, R) \
-  (__m512)__builtin_ia32_selectps_512((__mmask16)(U), \
-           __builtin_ia32_vfmaddsubps512((__v16sf)(__m512)(A), \
-                                         (__v16sf)(__m512)(B), \
-                                         -(__v16sf)(__m512)(C), \
-                                         (int)(R)), \
-            (__v16sf)_mm512_setzero_ps())
+  (__m512)__builtin_ia32_vfmaddsubps512_maskz((__v16sf)(__m512)(A), \
+                                              (__v16sf)(__m512)(B), \
+                                              -(__v16sf)(__m512)(C), \
+                                              (__mmask16)(U), (int)(R))
 
 
 static __inline__ __m512 __DEFAULT_FN_ATTRS
 _mm512_fmaddsub_ps(__m512 __A, __m512 __B, __m512 __C)
 {
-  return (__m512) __builtin_ia32_vfmaddsubps512 ((__v16sf) __A,
-                                                 (__v16sf) __B,
-                                                 (__v16sf) __C,
-                                                 _MM_FROUND_CUR_DIRECTION);
+  return (__m512) __builtin_ia32_vfmaddsubps512_mask ((__v16sf) __A,
+                                                      (__v16sf) __B,
+                                                      (__v16sf) __C,
+                                                      (__mmask16) -1,
+                                                      _MM_FROUND_CUR_DIRECTION);
 }
 
 static __inline__ __m512 __DEFAULT_FN_ATTRS
 _mm512_mask_fmaddsub_ps(__m512 __A, __mmask16 __U, __m512 __B, __m512 __C)
 {
-  return (__m512) __builtin_ia32_selectps_512((__mmask16) __U,
-                    __builtin_ia32_vfmaddsubps512 ((__v16sf) __A,
-                                                   (__v16sf) __B,
-                                                   (__v16sf) __C,
-                                                   _MM_FROUND_CUR_DIRECTION),
-            (__v16sf) __A);
+  return (__m512) __builtin_ia32_vfmaddsubps512_mask ((__v16sf) __A,
+                                                      (__v16sf) __B,
+                                                      (__v16sf) __C,
+                                                      (__mmask16) __U,
+                                                      _MM_FROUND_CUR_DIRECTION);
 }
 
 static __inline__ __m512 __DEFAULT_FN_ATTRS
 _mm512_mask3_fmaddsub_ps(__m512 __A, __m512 __B, __m512 __C, __mmask16 __U)
 {
-  return (__m512) __builtin_ia32_selectps_512((__mmask16) __U,
-                    __builtin_ia32_vfmaddsubps512 ((__v16sf) __A,
-                                                   (__v16sf) __B,
-                                                   (__v16sf) __C,
-                                                   _MM_FROUND_CUR_DIRECTION),
-            (__v16sf) __C);
+  return (__m512) __builtin_ia32_vfmaddsubps512_mask3 ((__v16sf) __A,
+                                                       (__v16sf) __B,
+                                                       (__v16sf) __C,
+                                                       (__mmask16) __U,
+                                                       _MM_FROUND_CUR_DIRECTION);
 }
 
 static __inline__ __m512 __DEFAULT_FN_ATTRS
 _mm512_maskz_fmaddsub_ps(__mmask16 __U, __m512 __A, __m512 __B, __m512 __C)
 {
-  return (__m512) __builtin_ia32_selectps_512((__mmask16) __U,
-                    __builtin_ia32_vfmaddsubps512 ((__v16sf) __A,
-                                                   (__v16sf) __B,
-                                                   (__v16sf) __C,
-                                                   _MM_FROUND_CUR_DIRECTION),
-            (__v16sf) _mm512_setzero_ps());
+  return (__m512) __builtin_ia32_vfmaddsubps512_maskz ((__v16sf) __A,
+                                                       (__v16sf) __B,
+                                                       (__v16sf) __C,
+                                                       (__mmask16) __U,
+                                                       _MM_FROUND_CUR_DIRECTION);
 }
 
 static __inline__ __m512 __DEFAULT_FN_ATTRS
 _mm512_fmsubadd_ps(__m512 __A, __m512 __B, __m512 __C)
 {
-  return (__m512) __builtin_ia32_vfmaddsubps512 ((__v16sf) __A,
-                                                 (__v16sf) __B,
-                                                 -(__v16sf) __C,
-                                                 _MM_FROUND_CUR_DIRECTION);
+  return (__m512) __builtin_ia32_vfmaddsubps512_mask ((__v16sf) __A,
+                                                      (__v16sf) __B,
+                                                      -(__v16sf) __C,
+                                                      (__mmask16) -1,
+                                                      _MM_FROUND_CUR_DIRECTION);
 }
 
 static __inline__ __m512 __DEFAULT_FN_ATTRS
 _mm512_mask_fmsubadd_ps(__m512 __A, __mmask16 __U, __m512 __B, __m512 __C)
 {
-  return (__m512) __builtin_ia32_selectps_512((__mmask16) __U,
-                    __builtin_ia32_vfmaddsubps512 ((__v16sf) __A,
-                                                   (__v16sf) __B,
-                                                   -(__v16sf) __C,
-                                                   _MM_FROUND_CUR_DIRECTION),
-            (__v16sf) __A);
+  return (__m512) __builtin_ia32_vfmaddsubps512_mask ((__v16sf) __A,
+                                                      (__v16sf) __B,
+                                                      -(__v16sf) __C,
+                                                      (__mmask16) __U,
+                                                      _MM_FROUND_CUR_DIRECTION);
 }
 
 static __inline__ __m512 __DEFAULT_FN_ATTRS
 _mm512_maskz_fmsubadd_ps(__mmask16 __U, __m512 __A, __m512 __B, __m512 __C)
 {
-  return (__m512) __builtin_ia32_selectps_512((__mmask16) __U,
-                    __builtin_ia32_vfmaddsubps512 ((__v16sf) __A,
-                                                   (__v16sf) __B,
-                                                   -(__v16sf) __C,
-                                                   _MM_FROUND_CUR_DIRECTION),
-            (__v16sf) _mm512_setzero_ps());
+  return (__m512) __builtin_ia32_vfmaddsubps512_maskz ((__v16sf) __A,
+                                                       (__v16sf) __B,
+                                                       -(__v16sf) __C,
+                                                       (__mmask16) __U,
+                                                       _MM_FROUND_CUR_DIRECTION);
 }
 
 #define _mm512_mask3_fmsub_round_pd(A, B, C, U, R) \
-  (__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \
-                    (__m512d)__builtin_ia32_vfmaddpd512((__v8df)(__m512d)(A), \
-                                                        (__v8df)(__m512d)(B), \
-                                                        -(__v8df)(__m512d)(C), \
-                                                        (int)(R)), \
-                    (__v8df)(__m512d)(C))
+  (__m512d)__builtin_ia32_vfmsubpd512_mask3((__v8df)(__m512d)(A), \
+                                            (__v8df)(__m512d)(B), \
+                                            (__v8df)(__m512d)(C), \
+                                            (__mmask8)(U), (int)(R))
 
 
 static __inline__ __m512d __DEFAULT_FN_ATTRS
 _mm512_mask3_fmsub_pd(__m512d __A, __m512d __B, __m512d __C, __mmask8 __U)
 {
-  return (__m512d) __builtin_ia32_selectpd_512((__mmask8) __U,
-                    (__m512d)__builtin_ia32_vfmaddpd512 ((__v8df) __A,
-                                                         (__v8df) __B,
-                                                         -(__v8df) __C,
-                                                         _MM_FROUND_CUR_DIRECTION),
-                    (__v8df) __C);
+  return (__m512d)__builtin_ia32_vfmsubpd512_mask3 ((__v8df) __A,
+                                                    (__v8df) __B,
+                                                    (__v8df) __C,
+                                                    (__mmask8) __U,
+                                                    _MM_FROUND_CUR_DIRECTION);
 }
 
 #define _mm512_mask3_fmsub_round_ps(A, B, C, U, R) \
-  (__m512)__builtin_ia32_selectps_512((__mmask16)(U), \
-                   (__m512)__builtin_ia32_vfmaddps512((__v16sf)(__m512)(A), \
-                                                      (__v16sf)(__m512)(B), \
-                                                      -(__v16sf)(__m512)(C), \
-                                                      (int)(R)), \
-                   (__v16sf)(__m512)(C))
+  (__m512)__builtin_ia32_vfmsubps512_mask3((__v16sf)(__m512)(A), \
+                                           (__v16sf)(__m512)(B), \
+                                           (__v16sf)(__m512)(C), \
+                                           (__mmask16)(U), (int)(R))
 
 static __inline__ __m512 __DEFAULT_FN_ATTRS
 _mm512_mask3_fmsub_ps(__m512 __A, __m512 __B, __m512 __C, __mmask16 __U)
 {
-  return (__m512) __builtin_ia32_selectps_512((__mmask16) __U,
-                    (__m512)__builtin_ia32_vfmaddps512 ((__v16sf) __A,
-                                                        (__v16sf) __B,
-                                                        -(__v16sf) __C,
-                                                        _MM_FROUND_CUR_DIRECTION),
-                    (__v16sf) __C);
+  return (__m512)__builtin_ia32_vfmsubps512_mask3 ((__v16sf) __A,
+                                                   (__v16sf) __B,
+                                                   (__v16sf) __C,
+                                                   (__mmask16) __U,
+                                                   _MM_FROUND_CUR_DIRECTION);
 }
 
 #define _mm512_mask3_fmsubadd_round_pd(A, B, C, U, R) \
-  (__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \
-                    (__m512d)__builtin_ia32_vfmaddsubpd512((__v8df)(__m512d)(A), \
-                                                           (__v8df)(__m512d)(B), \
-                                                           -(__v8df)(__m512d)(C), \
-                                                           (int)(R)), \
-                    (__v8df)(__m512d)(C))
+  (__m512d)__builtin_ia32_vfmsubaddpd512_mask3((__v8df)(__m512d)(A), \
+                                               (__v8df)(__m512d)(B), \
+                                               (__v8df)(__m512d)(C), \
+                                               (__mmask8)(U), (int)(R))
 
 
 static __inline__ __m512d __DEFAULT_FN_ATTRS
 _mm512_mask3_fmsubadd_pd(__m512d __A, __m512d __B, __m512d __C, __mmask8 __U)
 {
-  return (__m512d) __builtin_ia32_selectpd_512((__mmask8) __U,
-                    (__m512d)__builtin_ia32_vfmaddsubpd512 ((__v8df) __A,
-                                                            (__v8df) __B,
-                                                            -(__v8df) __C,
-                                                            _MM_FROUND_CUR_DIRECTION),
-                    (__v8df) __C);
+  return (__m512d)__builtin_ia32_vfmsubaddpd512_mask3 ((__v8df) __A,
+                                                       (__v8df) __B,
+                                                       (__v8df) __C,
+                                                       (__mmask8) __U,
+                                                       _MM_FROUND_CUR_DIRECTION);
 }
 
 #define _mm512_mask3_fmsubadd_round_ps(A, B, C, U, R) \
-  (__m512)__builtin_ia32_selectps_512((__mmask16)(U), \
-                    (__m512)__builtin_ia32_vfmaddsubps512((__v16sf)(__m512)(A), \
-                                                          (__v16sf)(__m512)(B), \
-                                                          -(__v16sf)(__m512)(C), \
-                                                          (int)(R)), \
-                    (__v16sf)(__m512)(C))
+  (__m512)__builtin_ia32_vfmsubaddps512_mask3((__v16sf)(__m512)(A), \
+                                              (__v16sf)(__m512)(B), \
+                                              (__v16sf)(__m512)(C), \
+                                              (__mmask16)(U), (int)(R))
 
 
 static __inline__ __m512 __DEFAULT_FN_ATTRS
 _mm512_mask3_fmsubadd_ps(__m512 __A, __m512 __B, __m512 __C, __mmask16 __U)
 {
-  return (__m512) __builtin_ia32_selectps_512((__mmask16) __U,
-                    (__m512)__builtin_ia32_vfmaddsubps512 ((__v16sf) __A,
-                                                           (__v16sf) __B,
-                                                           -(__v16sf) __C,
-                                                           _MM_FROUND_CUR_DIRECTION),
-                    (__v16sf) __C);
+  return (__m512)__builtin_ia32_vfmsubaddps512_mask3 ((__v16sf) __A,
+                                                      (__v16sf) __B,
+                                                      (__v16sf) __C,
+                                                      (__mmask16) __U,
+                                                      _MM_FROUND_CUR_DIRECTION);
 }
 
 #define _mm512_mask_fnmadd_round_pd(A, U, B, C, R) \
-  (__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \
-            __builtin_ia32_vfmaddpd512(-(__v8df)(__m512d)(A), \
-                                       (__v8df)(__m512d)(B), \
-                                       (__v8df)(__m512d)(C), \
-                                       (int)(R)), \
-            (__v8df)(__m512d)(A))
+  (__m512d)__builtin_ia32_vfmaddpd512_mask((__v8df)(__m512d)(A), \
+                                           -(__v8df)(__m512d)(B), \
+                                           (__v8df)(__m512d)(C), \
+                                           (__mmask8)(U), (int)(R))
 
 
 static __inline__ __m512d __DEFAULT_FN_ATTRS
 _mm512_mask_fnmadd_pd(__m512d __A, __mmask8 __U, __m512d __B, __m512d __C)
 {
-  return (__m512d) __builtin_ia32_selectpd_512((__mmask8) __U,
-                    __builtin_ia32_vfmaddpd512 (-(__v8df) __A,
-                                                (__v8df) __B,
-                                                (__v8df) __C,
-                                                _MM_FROUND_CUR_DIRECTION),
-            (__v8df) __A);
+  return (__m512d) __builtin_ia32_vfmaddpd512_mask ((__v8df) __A,
+                                                    -(__v8df) __B,
+                                                    (__v8df) __C,
+                                                    (__mmask8) __U,
+                                                    _MM_FROUND_CUR_DIRECTION);
 }
 
 #define _mm512_mask_fnmadd_round_ps(A, U, B, C, R) \
-  (__m512)__builtin_ia32_selectps_512((__mmask16)(U), \
-           __builtin_ia32_vfmaddps512(-(__v16sf)(__m512)(A), \
-                                      (__v16sf)(__m512)(B), \
-                                      (__v16sf)(__m512)(C), \
-                                      (int)(R)), \
-            (__v16sf)(__m512)(A))
+  (__m512)__builtin_ia32_vfmaddps512_mask((__v16sf)(__m512)(A), \
+                                          -(__v16sf)(__m512)(B), \
+                                          (__v16sf)(__m512)(C), \
+                                          (__mmask16)(U), (int)(R))
 
 
 static __inline__ __m512 __DEFAULT_FN_ATTRS
 _mm512_mask_fnmadd_ps(__m512 __A, __mmask16 __U, __m512 __B, __m512 __C)
 {
-  return (__m512) __builtin_ia32_selectps_512((__mmask16) __U,
-                    __builtin_ia32_vfmaddps512 (-(__v16sf) __A,
-                                                (__v16sf) __B,
-                                                (__v16sf) __C,
-                                                _MM_FROUND_CUR_DIRECTION),
-            (__v16sf) __A);
+  return (__m512) __builtin_ia32_vfmaddps512_mask ((__v16sf) __A,
+                                                   -(__v16sf) __B,
+                                                   (__v16sf) __C,
+                                                   (__mmask16) __U,
+                                                   _MM_FROUND_CUR_DIRECTION);
 }
 
 #define _mm512_mask_fnmsub_round_pd(A, U, B, C, R) \
-  (__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \
-            (__m512d)__builtin_ia32_vfmaddpd512((__v8df)(__m512d)(A), \
-                                                -(__v8df)(__m512d)(B), \
-                                                -(__v8df)(__m512d)(C), \
-                                                (int)(R)), \
-            (__v8df)(__m512d)(A))
+  (__m512d)__builtin_ia32_vfmaddpd512_mask((__v8df)(__m512d)(A), \
+                                           -(__v8df)(__m512d)(B), \
+                                           -(__v8df)(__m512d)(C), \
+                                           (__mmask8)(U), (int)(R))
 
 
 #define _mm512_mask3_fnmsub_round_pd(A, B, C, U, R) \
-  (__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \
-            (__m512d)__builtin_ia32_vfmaddpd512((__v8df)(__m512d)(A), \
-                                                -(__v8df)(__m512d)(B), \
-                                                -(__v8df)(__m512d)(C), \
-                                                (int)(R)), \
-            (__v8df)(__m512d)(C))
+  (__m512d)__builtin_ia32_vfmsubpd512_mask3(-(__v8df)(__m512d)(A), \
+                                            (__v8df)(__m512d)(B), \
+                                            (__v8df)(__m512d)(C), \
+                                            (__mmask8)(U), (int)(R))
 
 
 static __inline__ __m512d __DEFAULT_FN_ATTRS
 _mm512_mask_fnmsub_pd(__m512d __A, __mmask8 __U, __m512d __B, __m512d __C)
 {
-  return (__m512d) __builtin_ia32_selectpd_512((__mmask16) __U,
-                    (__m512d) __builtin_ia32_vfmaddpd512 ((__v8df) __A,
-                                                           -(__v8df) __B,
-                                                           -(__v8df) __C,
-                                                           _MM_FROUND_CUR_DIRECTION),
-                    (__v8df) __A);
+  return (__m512d) __builtin_ia32_vfmaddpd512_mask ((__v8df) __A,
+                                                    -(__v8df) __B,
+                                                    -(__v8df) __C,
+                                                    (__mmask8) __U,
+                                                    _MM_FROUND_CUR_DIRECTION);
 }
 
 static __inline__ __m512d __DEFAULT_FN_ATTRS
 _mm512_mask3_fnmsub_pd(__m512d __A, __m512d __B, __m512d __C, __mmask8 __U)
 {
-  return (__m512d) __builtin_ia32_selectpd_512((__mmask16) __U,
-                    (__m512d) __builtin_ia32_vfmaddpd512 ((__v8df) __A,
-                                                           -(__v8df) __B,
-                                                           -(__v8df) __C,
-                                                           _MM_FROUND_CUR_DIRECTION),
-                    (__v8df) __C);
+  return (__m512d) __builtin_ia32_vfmsubpd512_mask3 (-(__v8df) __A,
+                                                     (__v8df) __B,
+                                                     (__v8df) __C,
+                                                     (__mmask8) __U,
+                                                     _MM_FROUND_CUR_DIRECTION);
 }
 
 #define _mm512_mask_fnmsub_round_ps(A, U, B, C, R) \
-  (__m512)__builtin_ia32_selectps_512((__mmask16)(U), \
-            (__m512)__builtin_ia32_vfmaddps512 ((__v16sf)(__m512)(A), \
-                                                -(__v16sf)(__m512)(B), \
-                                                -(__v16sf)(__m512)(C), \
-                                               (int)(R)), \
-            (__v16sf)(__m512)(A))
+  (__m512)__builtin_ia32_vfmaddps512_mask((__v16sf)(__m512)(A), \
+                                          -(__v16sf)(__m512)(B), \
+                                          -(__v16sf)(__m512)(C), \
+                                          (__mmask16)(U), (int)(R))
 
 
 #define _mm512_mask3_fnmsub_round_ps(A, B, C, U, R) \
-  (__m512)__builtin_ia32_selectps_512((__mmask16)(U), \
-            (__m512)__builtin_ia32_vfmaddps512 ((__v16sf)(__m512)(A), \
-                                                -(__v16sf)(__m512)(B), \
-                                                -(__v16sf)(__m512)(C), \
-                                               (int)(R)), \
-            (__v16sf)(__m512)(C))
+  (__m512)__builtin_ia32_vfmsubps512_mask3(-(__v16sf)(__m512)(A), \
+                                           (__v16sf)(__m512)(B), \
+                                           (__v16sf)(__m512)(C), \
+                                           (__mmask16)(U), (int)(R))
 
 
 static __inline__ __m512 __DEFAULT_FN_ATTRS
 _mm512_mask_fnmsub_ps(__m512 __A, __mmask16 __U, __m512 __B, __m512 __C)
 {
-  return (__m512) __builtin_ia32_selectps_512((__mmask16) __U,
-                   (__m512) __builtin_ia32_vfmaddps512 ((__v16sf) __A,
-                                                        -(__v16sf) __B,
-                                                        -(__v16sf) __C,
-                                                        _MM_FROUND_CUR_DIRECTION),
-                   (__v16sf) __A);
+  return (__m512) __builtin_ia32_vfmaddps512_mask ((__v16sf) __A,
+                                                   -(__v16sf) __B,
+                                                   -(__v16sf) __C,
+                                                   (__mmask16) __U,
+                                                   _MM_FROUND_CUR_DIRECTION);
 }
 
 static __inline__ __m512 __DEFAULT_FN_ATTRS
 _mm512_mask3_fnmsub_ps(__m512 __A, __m512 __B, __m512 __C, __mmask16 __U)
 {
-  return (__m512) __builtin_ia32_selectps_512((__mmask16) __U,
-                   (__m512) __builtin_ia32_vfmaddps512 ((__v16sf) __A,
-                                                        -(__v16sf) __B,
-                                                        -(__v16sf) __C,
-                                                        _MM_FROUND_CUR_DIRECTION),
-                   (__v16sf) __C);
+  return (__m512) __builtin_ia32_vfmsubps512_mask3 (-(__v16sf) __A,
+                                                    (__v16sf) __B,
+                                                    (__v16sf) __C,
+                                                    (__mmask16) __U,
+                                                    _MM_FROUND_CUR_DIRECTION);
 }
 
 
index b4fdcc0f7e9ddea007640e1b9f4733a8b2d90b35..44c87316419263557f2aa4af68f3937622df7e41 100644 (file)
@@ -2340,10 +2340,6 @@ bool Sema::CheckX86BuiltinRoundingOrSAE(unsigned BuiltinID, CallExpr *TheCall) {
   case X86::BI__builtin_ia32_cvtuqq2ps512_mask:
   case X86::BI__builtin_ia32_sqrtpd512_mask:
   case X86::BI__builtin_ia32_sqrtps512_mask:
-  case X86::BI__builtin_ia32_vfmaddpd512:
-  case X86::BI__builtin_ia32_vfmaddps512:
-  case X86::BI__builtin_ia32_vfmaddsubpd512:
-  case X86::BI__builtin_ia32_vfmaddsubps512:
     ArgNum = 3;
     HasRC = true;
     break;
@@ -2378,6 +2374,22 @@ bool Sema::CheckX86BuiltinRoundingOrSAE(unsigned BuiltinID, CallExpr *TheCall) {
   case X86::BI__builtin_ia32_vfmaddss3_mask:
   case X86::BI__builtin_ia32_vfmaddss3_maskz:
   case X86::BI__builtin_ia32_vfmaddss3_mask3:
+  case X86::BI__builtin_ia32_vfmaddpd512_mask:
+  case X86::BI__builtin_ia32_vfmaddpd512_maskz:
+  case X86::BI__builtin_ia32_vfmaddpd512_mask3:
+  case X86::BI__builtin_ia32_vfmsubpd512_mask3:
+  case X86::BI__builtin_ia32_vfmaddps512_mask:
+  case X86::BI__builtin_ia32_vfmaddps512_maskz:
+  case X86::BI__builtin_ia32_vfmaddps512_mask3:
+  case X86::BI__builtin_ia32_vfmsubps512_mask3:
+  case X86::BI__builtin_ia32_vfmaddsubpd512_mask:
+  case X86::BI__builtin_ia32_vfmaddsubpd512_maskz:
+  case X86::BI__builtin_ia32_vfmaddsubpd512_mask3:
+  case X86::BI__builtin_ia32_vfmsubaddpd512_mask3:
+  case X86::BI__builtin_ia32_vfmaddsubps512_mask:
+  case X86::BI__builtin_ia32_vfmaddsubps512_maskz:
+  case X86::BI__builtin_ia32_vfmaddsubps512_mask3:
+  case X86::BI__builtin_ia32_vfmsubaddps512_mask3:
     ArgNum = 4;
     HasRC = true;
     break;
index cd773791a2c81be9956184ff68f9abf3dd727bee..a6f7e17aa6e131f08894f1114c6c81371ca19cad 100644 (file)
@@ -461,7 +461,7 @@ __m512d test_mm512_maskz_fmadd_round_pd(__mmask8 __U, __m512d __A, __m512d __B,
   // CHECK-LABEL: @test_mm512_maskz_fmadd_round_pd
   // CHECK: @llvm.x86.avx512.vfmadd.pd.512
   // CHECK: bitcast i8 %{{.*}} to <8 x i1>
-  // CHECK: select <8 x i1> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}}
+  // CHECK: select <8 x i1> %{{.*}}, <8 x double> %{{.*}}, <8 x double> zeroinitializer
   return _mm512_maskz_fmadd_round_pd(__U, __A, __B, __C, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
 }
 __m512d test_mm512_fmsub_round_pd(__m512d __A, __m512d __B, __m512d __C) {
@@ -483,7 +483,7 @@ __m512d test_mm512_maskz_fmsub_round_pd(__mmask8 __U, __m512d __A, __m512d __B,
   // CHECK: fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>
   // CHECK: @llvm.x86.avx512.vfmadd.pd.512
   // CHECK: bitcast i8 %{{.*}} to <8 x i1>
-  // CHECK: select <8 x i1> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}}
+  // CHECK: select <8 x i1> %{{.*}}, <8 x double> %{{.*}}, <8 x double> zeroinitializer
   return _mm512_maskz_fmsub_round_pd(__U, __A, __B, __C, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
 }
 __m512d test_mm512_fnmadd_round_pd(__m512d __A, __m512d __B, __m512d __C) {
@@ -505,7 +505,7 @@ __m512d test_mm512_maskz_fnmadd_round_pd(__mmask8 __U, __m512d __A, __m512d __B,
   // CHECK: fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>
   // CHECK: @llvm.x86.avx512.vfmadd.pd.512
   // CHECK: bitcast i8 %{{.*}} to <8 x i1>
-  // CHECK: select <8 x i1> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}}
+  // CHECK: select <8 x i1> %{{.*}}, <8 x double> %{{.*}}, <8 x double> zeroinitializer
   return _mm512_maskz_fnmadd_round_pd(__U, __A, __B, __C, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
 }
 __m512d test_mm512_fnmsub_round_pd(__m512d __A, __m512d __B, __m512d __C) {
@@ -521,7 +521,7 @@ __m512d test_mm512_maskz_fnmsub_round_pd(__mmask8 __U, __m512d __A, __m512d __B,
   // CHECK: fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>
   // CHECK: @llvm.x86.avx512.vfmadd.pd.512
   // CHECK: bitcast i8 %{{.*}} to <8 x i1>
-  // CHECK: select <8 x i1> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}}
+  // CHECK: select <8 x i1> %{{.*}}, <8 x double> %{{.*}}, <8 x double> zeroinitializer
   return _mm512_maskz_fnmsub_round_pd(__U, __A, __B, __C, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
 }
 __m512d test_mm512_fmadd_pd(__m512d __A, __m512d __B, __m512d __C) {
@@ -547,7 +547,7 @@ __m512d test_mm512_maskz_fmadd_pd(__mmask8 __U, __m512d __A, __m512d __B, __m512
   // CHECK-LABEL: @test_mm512_maskz_fmadd_pd
   // CHECK: call <8 x double> @llvm.fma.v8f64(<8 x double> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}})
   // CHECK: bitcast i8 %{{.*}} to <8 x i1>
-  // CHECK: select <8 x i1> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}}
+  // CHECK: select <8 x i1> %{{.*}}, <8 x double> %{{.*}}, <8 x double> zeroinitializer
   return _mm512_maskz_fmadd_pd(__U, __A, __B, __C);
 }
 __m512d test_mm512_fmsub_pd(__m512d __A, __m512d __B, __m512d __C) {
@@ -569,7 +569,7 @@ __m512d test_mm512_maskz_fmsub_pd(__mmask8 __U, __m512d __A, __m512d __B, __m512
   // CHECK: fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %{{.*}}
   // CHECK: call <8 x double> @llvm.fma.v8f64(<8 x double> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}})
   // CHECK: bitcast i8 %{{.*}} to <8 x i1>
-  // CHECK: select <8 x i1> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}}
+  // CHECK: select <8 x i1> %{{.*}}, <8 x double> %{{.*}}, <8 x double> zeroinitializer
   return _mm512_maskz_fmsub_pd(__U, __A, __B, __C);
 }
 __m512d test_mm512_fnmadd_pd(__m512d __A, __m512d __B, __m512d __C) {
@@ -591,7 +591,7 @@ __m512d test_mm512_maskz_fnmadd_pd(__mmask8 __U, __m512d __A, __m512d __B, __m51
   // CHECK: fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %{{.*}}
   // CHECK: call <8 x double> @llvm.fma.v8f64(<8 x double> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}})
   // CHECK: bitcast i8 %{{.*}} to <8 x i1>
-  // CHECK: select <8 x i1> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}}
+  // CHECK: select <8 x i1> %{{.*}}, <8 x double> %{{.*}}, <8 x double> zeroinitializer
   return _mm512_maskz_fnmadd_pd(__U, __A, __B, __C);
 }
 __m512d test_mm512_fnmsub_pd(__m512d __A, __m512d __B, __m512d __C) {
@@ -607,7 +607,7 @@ __m512d test_mm512_maskz_fnmsub_pd(__mmask8 __U, __m512d __A, __m512d __B, __m51
   // CHECK: fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %{{.*}}
   // CHECK: call <8 x double> @llvm.fma.v8f64(<8 x double> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}})
   // CHECK: bitcast i8 %{{.*}} to <8 x i1>
-  // CHECK: select <8 x i1> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}}
+  // CHECK: select <8 x i1> %{{.*}}, <8 x double> %{{.*}}, <8 x double> zeroinitializer
   return _mm512_maskz_fnmsub_pd(__U, __A, __B, __C);
 }
 __m512 test_mm512_fmadd_round_ps(__m512 __A, __m512 __B, __m512 __C) {
@@ -633,7 +633,7 @@ __m512 test_mm512_maskz_fmadd_round_ps(__mmask16 __U, __m512 __A, __m512 __B, __
   // CHECK-LABEL: @test_mm512_maskz_fmadd_round_ps
   // CHECK: @llvm.x86.avx512.vfmadd.ps.512
   // CHECK: bitcast i16 %{{.*}} to <16 x i1>
-  // CHECK: select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}}
+  // CHECK: select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> zeroinitializer
   return _mm512_maskz_fmadd_round_ps(__U, __A, __B, __C, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
 }
 __m512 test_mm512_fmsub_round_ps(__m512 __A, __m512 __B, __m512 __C) {
@@ -655,7 +655,7 @@ __m512 test_mm512_maskz_fmsub_round_ps(__mmask16 __U, __m512 __A, __m512 __B, __
   // CHECK: fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %{{.*}}
   // CHECK: @llvm.x86.avx512.vfmadd.ps.512
   // CHECK: bitcast i16 %{{.*}} to <16 x i1>
-  // CHECK: select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}}
+  // CHECK: select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> zeroinitializer
   return _mm512_maskz_fmsub_round_ps(__U, __A, __B, __C, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
 }
 __m512 test_mm512_fnmadd_round_ps(__m512 __A, __m512 __B, __m512 __C) {
@@ -677,7 +677,7 @@ __m512 test_mm512_maskz_fnmadd_round_ps(__mmask16 __U, __m512 __A, __m512 __B, _
   // CHECK: fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %{{.*}}
   // CHECK: @llvm.x86.avx512.vfmadd.ps.512
   // CHECK: bitcast i16 %{{.*}} to <16 x i1>
-  // CHECK: select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}}
+  // CHECK: select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> zeroinitializer
   return _mm512_maskz_fnmadd_round_ps(__U, __A, __B, __C, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
 }
 __m512 test_mm512_fnmsub_round_ps(__m512 __A, __m512 __B, __m512 __C) {
@@ -693,7 +693,7 @@ __m512 test_mm512_maskz_fnmsub_round_ps(__mmask16 __U, __m512 __A, __m512 __B, _
   // CHECK: fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %{{.*}}
   // CHECK: @llvm.x86.avx512.vfmadd.ps.512
   // CHECK: bitcast i16 %{{.*}} to <16 x i1>
-  // CHECK: select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}}
+  // CHECK: select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> zeroinitializer
   return _mm512_maskz_fnmsub_round_ps(__U, __A, __B, __C, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
 }
 __m512 test_mm512_fmadd_ps(__m512 __A, __m512 __B, __m512 __C) {
@@ -717,7 +717,7 @@ __m512 test_mm512_maskz_fmadd_ps(__mmask16 __U, __m512 __A, __m512 __B, __m512 _
   // CHECK-LABEL: @test_mm512_maskz_fmadd_ps
   // CHECK: call <16 x float> @llvm.fma.v16f32(<16 x float> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}})
   // CHECK: bitcast i16 %{{.*}} to <16 x i1>
-  // CHECK: select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}}
+  // CHECK: select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> zeroinitializer
   return _mm512_maskz_fmadd_ps(__U, __A, __B, __C);
 }
 __m512 test_mm512_fmsub_ps(__m512 __A, __m512 __B, __m512 __C) {
@@ -739,7 +739,7 @@ __m512 test_mm512_maskz_fmsub_ps(__mmask16 __U, __m512 __A, __m512 __B, __m512 _
   // CHECK: fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %{{.*}}
   // CHECK: call <16 x float> @llvm.fma.v16f32(<16 x float> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}})
   // CHECK: bitcast i16 %{{.*}} to <16 x i1>
-  // CHECK: select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}}
+  // CHECK: select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> zeroinitializer
   return _mm512_maskz_fmsub_ps(__U, __A, __B, __C);
 }
 __m512 test_mm512_fnmadd_ps(__m512 __A, __m512 __B, __m512 __C) {
@@ -761,7 +761,7 @@ __m512 test_mm512_maskz_fnmadd_ps(__mmask16 __U, __m512 __A, __m512 __B, __m512
   // CHECK: fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %{{.*}}
   // CHECK: call <16 x float> @llvm.fma.v16f32(<16 x float> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}})
   // CHECK: bitcast i16 %{{.*}} to <16 x i1>
-  // CHECK: select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}}
+  // CHECK: select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> zeroinitializer
   return _mm512_maskz_fnmadd_ps(__U, __A, __B, __C);
 }
 __m512 test_mm512_fnmsub_ps(__m512 __A, __m512 __B, __m512 __C) {
@@ -777,7 +777,7 @@ __m512 test_mm512_maskz_fnmsub_ps(__mmask16 __U, __m512 __A, __m512 __B, __m512
   // CHECK: fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %{{.*}}
   // CHECK: call <16 x float> @llvm.fma.v16f32(<16 x float> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}})
   // CHECK: bitcast i16 %{{.*}} to <16 x i1>
-  // CHECK: select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}}
+  // CHECK: select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> zeroinitializer
   return _mm512_maskz_fnmsub_ps(__U, __A, __B, __C);
 }
 __m512d test_mm512_fmaddsub_round_pd(__m512d __A, __m512d __B, __m512d __C) {
@@ -803,7 +803,7 @@ __m512d test_mm512_maskz_fmaddsub_round_pd(__mmask8 __U, __m512d __A, __m512d __
   // CHECK-LABEL: @test_mm512_maskz_fmaddsub_round_pd
   // CHECK: @llvm.x86.avx512.vfmaddsub.pd.512
   // CHECK: bitcast i8 %{{.*}} to <8 x i1>
-  // CHECK: select <8 x i1> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}}
+  // CHECK: select <8 x i1> %{{.*}}, <8 x double> %{{.*}}, <8 x double> zeroinitializer
   return _mm512_maskz_fmaddsub_round_pd(__U, __A, __B, __C, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
 }
 __m512d test_mm512_fmsubadd_round_pd(__m512d __A, __m512d __B, __m512d __C) {
@@ -825,7 +825,7 @@ __m512d test_mm512_maskz_fmsubadd_round_pd(__mmask8 __U, __m512d __A, __m512d __
   // CHECK: fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %{{.*}}
   // CHECK: @llvm.x86.avx512.vfmaddsub.pd.512
   // CHECK: bitcast i8 %{{.*}} to <8 x i1>
-  // CHECK: select <8 x i1> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}}
+  // CHECK: select <8 x i1> %{{.*}}, <8 x double> %{{.*}}, <8 x double> zeroinitializer
   return _mm512_maskz_fmsubadd_round_pd(__U, __A, __B, __C, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
 }
 __m512d test_mm512_fmaddsub_pd(__m512d __A, __m512d __B, __m512d __C) {
@@ -863,7 +863,7 @@ __m512d test_mm512_maskz_fmaddsub_pd(__mmask8 __U, __m512d __A, __m512d __B, __m
   // check: call <8 x double> @llvm.fma.v8f64(<8 x double> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}})
   // check: select <8 x i1> <i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false>, <8 x double> %{{.*}}, <8 x double> %{{.*}}
   // CHECK: bitcast i8 %{{.*}} to <8 x i1>
-  // CHECK: select <8 x i1> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}}
+  // CHECK: select <8 x i1> %{{.*}}, <8 x double> %{{.*}}, <8 x double> zeroinitializer
   return _mm512_maskz_fmaddsub_pd(__U, __A, __B, __C);
 }
 __m512d test_mm512_fmsubadd_pd(__m512d __A, __m512d __B, __m512d __C) {
@@ -891,7 +891,7 @@ __m512d test_mm512_maskz_fmsubadd_pd(__mmask8 __U, __m512d __A, __m512d __B, __m
   // CHECK: call <8 x double> @llvm.fma.v8f64(<8 x double> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}})
   // CHECK: select <8 x i1> <i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false>, <8 x double> %{{.*}}, <8 x double> %{{.*}}
   // CHECK: bitcast i8 %{{.*}} to <8 x i1>
-  // CHECK: select <8 x i1> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}}
+  // CHECK: select <8 x i1> %{{.*}}, <8 x double> %{{.*}}, <8 x double> zeroinitializer
   return _mm512_maskz_fmsubadd_pd(__U, __A, __B, __C);
 }
 __m512 test_mm512_fmaddsub_round_ps(__m512 __A, __m512 __B, __m512 __C) {
@@ -917,7 +917,7 @@ __m512 test_mm512_maskz_fmaddsub_round_ps(__mmask16 __U, __m512 __A, __m512 __B,
   // CHECK-LABEL: @test_mm512_maskz_fmaddsub_round_ps
   // CHECK: @llvm.x86.avx512.vfmaddsub.ps.512
   // CHECK: bitcast i16 %{{.*}} to <16 x i1>
-  // CHECK: select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}}
+  // CHECK: select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> zeroinitializer
   return _mm512_maskz_fmaddsub_round_ps(__U, __A, __B, __C, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
 }
 __m512 test_mm512_fmsubadd_round_ps(__m512 __A, __m512 __B, __m512 __C) {
@@ -939,7 +939,7 @@ __m512 test_mm512_maskz_fmsubadd_round_ps(__mmask16 __U, __m512 __A, __m512 __B,
   // CHECK: fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %{{.*}}
   // CHECK: @llvm.x86.avx512.vfmaddsub.ps.512
   // CHECK: bitcast i16 %{{.*}} to <16 x i1>
-  // CHECK: select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}}
+  // CHECK: select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> zeroinitializer
   return _mm512_maskz_fmsubadd_round_ps(__U, __A, __B, __C, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
 }
 __m512 test_mm512_fmaddsub_ps(__m512 __A, __m512 __B, __m512 __C) {
@@ -977,7 +977,7 @@ __m512 test_mm512_maskz_fmaddsub_ps(__mmask16 __U, __m512 __A, __m512 __B, __m51
   // CHECK: call <16 x float> @llvm.fma.v16f32(<16 x float> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}})
   // CHECK: select <16 x i1> <i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false>, <16 x float> %{{.*}}, <16 x float> %{{.*}}
   // CHECK: bitcast i16 %{{.*}} to <16 x i1>
-  // CHECK: select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}}
+  // CHECK: select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> zeroinitializer
   return _mm512_maskz_fmaddsub_ps(__U, __A, __B, __C);
 }
 __m512 test_mm512_fmsubadd_ps(__m512 __A, __m512 __B, __m512 __C) {
@@ -1005,7 +1005,7 @@ __m512 test_mm512_maskz_fmsubadd_ps(__mmask16 __U, __m512 __A, __m512 __B, __m51
   // CHECK: call <16 x float> @llvm.fma.v16f32(<16 x float> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}})
   // CHECK: select <16 x i1> <i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false>, <16 x float> %{{.*}}, <16 x float> %{{.*}}
   // CHECK: bitcast i16 %{{.*}} to <16 x i1>
-  // CHECK: select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}}
+  // CHECK: select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> zeroinitializer
   return _mm512_maskz_fmsubadd_ps(__U, __A, __B, __C);
 }
 __m512d test_mm512_mask3_fmsub_round_pd(__m512d __A, __m512d __B, __m512d __C, __mmask8 __U) {