TARGET_BUILTIN(__builtin_ia32_storeapd512_mask, "vV8d*V8dUc", "n", "avx512f")
TARGET_BUILTIN(__builtin_ia32_storeups512_mask, "vf*V16fUs", "n", "avx512f")
TARGET_BUILTIN(__builtin_ia32_storeaps512_mask, "vV16f*V16fUs", "n", "avx512f")
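+// VALIGND/VALIGNQ: concatenate the two source vectors and shift right by the
+// element count given in the last argument; the "Ii" encoding requires that
+// count to be an integer constant expression.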
+TARGET_BUILTIN(__builtin_ia32_alignq512, "V8LLiV8LLiV8LLiIi", "nc", "avx512f")
+TARGET_BUILTIN(__builtin_ia32_alignd512, "V16iV16iV16iIi", "nc", "avx512f")
+TARGET_BUILTIN(__builtin_ia32_alignd128, "V4iV4iV4iIi", "nc", "avx512vl")
+TARGET_BUILTIN(__builtin_ia32_alignd256, "V8iV8iV8iIi", "nc", "avx512vl")
+TARGET_BUILTIN(__builtin_ia32_alignq128, "V2LLiV2LLiV2LLiIi", "nc", "avx512vl")
+TARGET_BUILTIN(__builtin_ia32_alignq256, "V4LLiV4LLiV4LLiIi", "nc", "avx512vl")
TARGET_BUILTIN(__builtin_ia32_vpdpbusd128, "V4iV4iV4iV4i", "nc", "avx512vl,avx512vnni")
TARGET_BUILTIN(__builtin_ia32_vpdpbusd256, "V8iV8iV8iV8i", "nc", "avx512vl,avx512vnni")
makeArrayRef(Indices, NumElts),
"palignr");
}
+ case X86::BI__builtin_ia32_alignd128:
+ case X86::BI__builtin_ia32_alignd256:
+ case X86::BI__builtin_ia32_alignd512:
+ case X86::BI__builtin_ia32_alignq128:
+ case X86::BI__builtin_ia32_alignq256:
+ case X86::BI__builtin_ia32_alignq512: {
+ unsigned NumElts = Ops[0]->getType()->getVectorNumElements();
+ unsigned ShiftVal = cast<llvm::ConstantInt>(Ops[2])->getZExtValue();
+
+ // The instruction only honors the low bits of the immediate, so mask the
+ // shift amount to the width of a single vector; this also keeps every
+ // shuffle index below 2 * NumElts.
+ ShiftVal &= NumElts - 1;
+
+ uint32_t Indices[16];
+ for (unsigned i = 0; i != NumElts; ++i)
+ Indices[i] = i + ShiftVal;
+
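+ // Indices [0, NumElts) of the shuffle select from Ops[1] (the B operand) and
+ // [NumElts, 2*NumElts) select from Ops[0] (A), so the result is the
+ // concatenation A:B shifted right by ShiftVal elements.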
+ return Builder.CreateShuffleVector(Ops[1], Ops[0],
+ makeArrayRef(Indices, NumElts),
+ "valign");
+ }
case X86::BI__builtin_ia32_vperm2f128_pd256:
case X86::BI__builtin_ia32_vperm2f128_ps256:
}
#define _mm512_alignr_epi64(A, B, I) \
- (__m512i)__builtin_shufflevector((__v8di)(__m512i)(B), \
- (__v8di)(__m512i)(A), \
- ((int)(I) & 0x7) + 0, \
- ((int)(I) & 0x7) + 1, \
- ((int)(I) & 0x7) + 2, \
- ((int)(I) & 0x7) + 3, \
- ((int)(I) & 0x7) + 4, \
- ((int)(I) & 0x7) + 5, \
- ((int)(I) & 0x7) + 6, \
- ((int)(I) & 0x7) + 7)
+ (__m512i)__builtin_ia32_alignq512((__v8di)(__m512i)(A), \
+ (__v8di)(__m512i)(B), (int)(I))
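+/* The _mask_ and _maskz_ forms below blend the alignr result under writemask
+   U: elements whose mask bit is set come from the alignr result, the rest
+   from W (or zero for the _maskz_ form). */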
#define _mm512_mask_alignr_epi64(W, U, A, B, imm) \
(__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \
(__v8di)_mm512_setzero_si512())
#define _mm512_alignr_epi32(A, B, I) \
- (__m512i)__builtin_shufflevector((__v16si)(__m512i)(B), \
- (__v16si)(__m512i)(A), \
- ((int)(I) & 0xf) + 0, \
- ((int)(I) & 0xf) + 1, \
- ((int)(I) & 0xf) + 2, \
- ((int)(I) & 0xf) + 3, \
- ((int)(I) & 0xf) + 4, \
- ((int)(I) & 0xf) + 5, \
- ((int)(I) & 0xf) + 6, \
- ((int)(I) & 0xf) + 7, \
- ((int)(I) & 0xf) + 8, \
- ((int)(I) & 0xf) + 9, \
- ((int)(I) & 0xf) + 10, \
- ((int)(I) & 0xf) + 11, \
- ((int)(I) & 0xf) + 12, \
- ((int)(I) & 0xf) + 13, \
- ((int)(I) & 0xf) + 14, \
- ((int)(I) & 0xf) + 15)
+ (__m512i)__builtin_ia32_alignd512((__v16si)(__m512i)(A), \
+ (__v16si)(__m512i)(B), (int)(I))
#define _mm512_mask_alignr_epi32(W, U, A, B, imm) \
(__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \
}
#define _mm_alignr_epi32(A, B, imm) \
- (__m128i)__builtin_shufflevector((__v4si)(__m128i)(B), \
- (__v4si)(__m128i)(A), \
- ((int)(imm) & 0x3) + 0, \
- ((int)(imm) & 0x3) + 1, \
- ((int)(imm) & 0x3) + 2, \
- ((int)(imm) & 0x3) + 3)
+ (__m128i)__builtin_ia32_alignd128((__v4si)(__m128i)(A), \
+ (__v4si)(__m128i)(B), (int)(imm))
#define _mm_mask_alignr_epi32(W, U, A, B, imm) \
(__m128i)__builtin_ia32_selectd_128((__mmask8)(U), \
(__v4si)_mm_setzero_si128())
#define _mm256_alignr_epi32(A, B, imm) \
- (__m256i)__builtin_shufflevector((__v8si)(__m256i)(B), \
- (__v8si)(__m256i)(A), \
- ((int)(imm) & 0x7) + 0, \
- ((int)(imm) & 0x7) + 1, \
- ((int)(imm) & 0x7) + 2, \
- ((int)(imm) & 0x7) + 3, \
- ((int)(imm) & 0x7) + 4, \
- ((int)(imm) & 0x7) + 5, \
- ((int)(imm) & 0x7) + 6, \
- ((int)(imm) & 0x7) + 7)
+ (__m256i)__builtin_ia32_alignd256((__v8si)(__m256i)(A), \
+ (__v8si)(__m256i)(B), (int)(imm))
#define _mm256_mask_alignr_epi32(W, U, A, B, imm) \
(__m256i)__builtin_ia32_selectd_256((__mmask8)(U), \
(__v8si)_mm256_setzero_si256())
#define _mm_alignr_epi64(A, B, imm) \
- (__m128i)__builtin_shufflevector((__v2di)(__m128i)(B), \
- (__v2di)(__m128i)(A), \
- ((int)(imm) & 0x1) + 0, \
- ((int)(imm) & 0x1) + 1)
+ (__m128i)__builtin_ia32_alignq128((__v2di)(__m128i)(A), \
+ (__v2di)(__m128i)(B), (int)(imm))
#define _mm_mask_alignr_epi64(W, U, A, B, imm) \
(__m128i)__builtin_ia32_selectq_128((__mmask8)(U), \
(__v2di)_mm_setzero_si128())
#define _mm256_alignr_epi64(A, B, imm) \
- (__m256i)__builtin_shufflevector((__v4di)(__m256i)(B), \
- (__v4di)(__m256i)(A), \
- ((int)(imm) & 0x3) + 0, \
- ((int)(imm) & 0x3) + 1, \
- ((int)(imm) & 0x3) + 2, \
- ((int)(imm) & 0x3) + 3)
+ (__m256i)__builtin_ia32_alignq256((__v4di)(__m256i)(A), \
+ (__v4di)(__m256i)(B), (int)(imm))
#define _mm256_mask_alignr_epi64(W, U, A, B, imm) \
(__m256i)__builtin_ia32_selectq_256((__mmask8)(U), \
case X86::BI__builtin_ia32_palignr128:
case X86::BI__builtin_ia32_palignr256:
case X86::BI__builtin_ia32_palignr512:
+ case X86::BI__builtin_ia32_alignq512:
+ case X86::BI__builtin_ia32_alignd512:
+ case X86::BI__builtin_ia32_alignd128:
+ case X86::BI__builtin_ia32_alignd256:
+ case X86::BI__builtin_ia32_alignq128:
+ case X86::BI__builtin_ia32_alignq256:
case X86::BI__builtin_ia32_vcomisd:
case X86::BI__builtin_ia32_vcomiss:
case X86::BI__builtin_ia32_dbpsadbw128_mask: