TARGET_BUILTIN(__builtin_ia32_vperm2f128_pd256, "V4dV4dV4dIc", "nc", "avx")
TARGET_BUILTIN(__builtin_ia32_vperm2f128_ps256, "V8fV8fV8fIc", "nc", "avx")
TARGET_BUILTIN(__builtin_ia32_vperm2f128_si256, "V8iV8iV8iIc", "nc", "avx")
+TARGET_BUILTIN(__builtin_ia32_vpermilpd, "V2dV2dIi", "nc", "avx")
+TARGET_BUILTIN(__builtin_ia32_vpermilps, "V4fV4fIi", "nc", "avx")
+TARGET_BUILTIN(__builtin_ia32_vpermilpd256, "V4dV4dIi", "nc", "avx")
+TARGET_BUILTIN(__builtin_ia32_vpermilps256, "V8fV8fIi", "nc", "avx")
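+// In the prototype strings above, "V2dV2dIi" reads: return a vector of two
+// doubles, take a vector of two doubles and an int whose 'I' modifier requires
+// an integer constant expression; the "nc" attributes mark the builtin as
+// nothrow and const.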
TARGET_BUILTIN(__builtin_ia32_sqrtpd256, "V4dV4d", "nc", "avx")
TARGET_BUILTIN(__builtin_ia32_sqrtps256, "V8fV8f", "nc", "avx")
TARGET_BUILTIN(__builtin_ia32_rsqrtps256, "V8fV8f", "nc", "avx")
TARGET_BUILTIN(__builtin_ia32_vcvttsd2usi32, "UiV2dIi", "nc", "avx512f")
TARGET_BUILTIN(__builtin_ia32_vcvttss2si32, "iV4fIi", "nc", "avx512f")
TARGET_BUILTIN(__builtin_ia32_vcvttss2usi32, "UiV4fIi", "nc", "avx512f")
+TARGET_BUILTIN(__builtin_ia32_vpermilpd512, "V8dV8dIi", "nc", "avx512f")
+TARGET_BUILTIN(__builtin_ia32_vpermilps512, "V16fV16fIi", "nc", "avx512f")
TARGET_BUILTIN(__builtin_ia32_vpermilvarpd512, "V8dV8dV8LLi", "nc", "avx512f")
TARGET_BUILTIN(__builtin_ia32_vpermilvarps512, "V16fV16fV16i", "nc", "avx512f")
TARGET_BUILTIN(__builtin_ia32_rndscalesd_round_mask, "V2dV2dV2dV2dUcIiIi", "nc", "avx512f")
makeArrayRef(Indices, NumElts),
"blend");
}
+ case X86::BI__builtin_ia32_vpermilpd:
+ case X86::BI__builtin_ia32_vpermilps:
+ case X86::BI__builtin_ia32_vpermilpd256:
+ case X86::BI__builtin_ia32_vpermilps256:
+ case X86::BI__builtin_ia32_vpermilpd512:
+ case X86::BI__builtin_ia32_vpermilps512: {
+ uint32_t Imm = cast<llvm::ConstantInt>(Ops[1])->getZExtValue();
+ llvm::Type *Ty = Ops[0]->getType();
+ unsigned NumElts = Ty->getVectorNumElements();
+ unsigned NumLanes = Ty->getPrimitiveSizeInBits() / 128;
+ unsigned NumLaneElts = NumElts / NumLanes;
+
+ // Splat the 8 bits of the immediate 4 times so the loop below never runs
+ // out of selector bits; the ps forms reuse the same 8 bits in every
+ // 128-bit lane.
+ Imm = (Imm & 0xff) * 0x01010101;
+
+ uint32_t Indices[16];
+ for (unsigned l = 0; l != NumElts; l += NumLaneElts) {
+ for (unsigned i = 0; i != NumLaneElts; ++i) {
+ Indices[i + l] = (Imm % NumLaneElts) + l;
+ Imm /= NumLaneElts;
+ }
+ }
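+ // Worked example: __builtin_ia32_vpermilps256 with an immediate of 0x1B has
+ // NumElts = 8, NumLanes = 2 and NumLaneElts = 4. Each lane consumes the
+ // 2-bit fields 3,2,1,0 from the splatted immediate, so the shuffle mask is
+ // <3,2,1,0, 7,6,5,4>.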
+
+ return Builder.CreateShuffleVector(Ops[0], UndefValue::get(Ty),
+ makeArrayRef(Indices, NumElts),
+ "permil");
+ }
case X86::BI__builtin_ia32_palignr128:
case X86::BI__builtin_ia32_palignr256:
case X86::BI__builtin_ia32_palignr512: {
#endif
#define _mm512_permute_pd(X, C) \
- (__m512d)__builtin_shufflevector((__v8df)(__m512d)(X), \
- (__v8df)_mm512_undefined_pd(), \
- 0 + (((C) >> 0) & 0x1), \
- 0 + (((C) >> 1) & 0x1), \
- 2 + (((C) >> 2) & 0x1), \
- 2 + (((C) >> 3) & 0x1), \
- 4 + (((C) >> 4) & 0x1), \
- 4 + (((C) >> 5) & 0x1), \
- 6 + (((C) >> 6) & 0x1), \
- 6 + (((C) >> 7) & 0x1))
+ (__m512d)__builtin_ia32_vpermilpd512((__v8df)(__m512d)(X), (int)(C))
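+// Element i of the result is X[2*(i/2) + ((C >> i) & 1)]; this is the same
+// in-lane selection the __builtin_shufflevector form above spelled out
+// explicitly.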
#define _mm512_mask_permute_pd(W, U, X, C) \
(__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \
(__v8df)_mm512_setzero_pd())
#define _mm512_permute_ps(X, C) \
- (__m512)__builtin_shufflevector((__v16sf)(__m512)(X), \
- (__v16sf)_mm512_undefined_ps(), \
- 0 + (((C) >> 0) & 0x3), \
- 0 + (((C) >> 2) & 0x3), \
- 0 + (((C) >> 4) & 0x3), \
- 0 + (((C) >> 6) & 0x3), \
- 4 + (((C) >> 0) & 0x3), \
- 4 + (((C) >> 2) & 0x3), \
- 4 + (((C) >> 4) & 0x3), \
- 4 + (((C) >> 6) & 0x3), \
- 8 + (((C) >> 0) & 0x3), \
- 8 + (((C) >> 2) & 0x3), \
- 8 + (((C) >> 4) & 0x3), \
- 8 + (((C) >> 6) & 0x3), \
- 12 + (((C) >> 0) & 0x3), \
- 12 + (((C) >> 2) & 0x3), \
- 12 + (((C) >> 4) & 0x3), \
- 12 + (((C) >> 6) & 0x3))
+ (__m512)__builtin_ia32_vpermilps512((__v16sf)(__m512)(X), (int)(C))
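+// Element i of the result is X[4*(i/4) + ((C >> (2*(i%4))) & 0x3)]; the same
+// 8-bit immediate is applied to each 128-bit lane.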
#define _mm512_mask_permute_ps(W, U, X, C) \
(__m512)__builtin_ia32_selectps_512((__mmask16)(U), \
/// returned vector.
/// \returns A 128-bit vector of [2 x double] containing the copied values.
#define _mm_permute_pd(A, C) \
- (__m128d)__builtin_shufflevector((__v2df)(__m128d)(A), \
- (__v2df)_mm_undefined_pd(), \
- ((C) >> 0) & 0x1, ((C) >> 1) & 0x1)
+ (__m128d)__builtin_ia32_vpermilpd((__v2df)(__m128d)(A), (int)(C))
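+// Element i of the result is A[(C >> i) & 0x1]; only bits [1:0] of the
+// immediate are used.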
/// Copies the values in a 256-bit vector of [4 x double] as specified by
/// the immediate integer operand.
/// returned vector.
/// \returns A 256-bit vector of [4 x double] containing the copied values.
#define _mm256_permute_pd(A, C) \
- (__m256d)__builtin_shufflevector((__v4df)(__m256d)(A), \
- (__v4df)_mm256_undefined_pd(), \
- 0 + (((C) >> 0) & 0x1), \
- 0 + (((C) >> 1) & 0x1), \
- 2 + (((C) >> 2) & 0x1), \
- 2 + (((C) >> 3) & 0x1))
+ (__m256d)__builtin_ia32_vpermilpd256((__v4df)(__m256d)(A), (int)(C))
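+// Element i of the result is A[2*(i/2) + ((C >> i) & 0x1)]; only bits [3:0]
+// of the immediate are used.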
/// Copies the values in a 128-bit vector of [4 x float] as specified by
/// the immediate integer operand.
/// returned vector.
/// \returns A 128-bit vector of [4 x float] containing the copied values.
#define _mm_permute_ps(A, C) \
- (__m128)__builtin_shufflevector((__v4sf)(__m128)(A), \
- (__v4sf)_mm_undefined_ps(), \
- ((C) >> 0) & 0x3, ((C) >> 2) & 0x3, \
- ((C) >> 4) & 0x3, ((C) >> 6) & 0x3)
+ (__m128)__builtin_ia32_vpermilps((__v4sf)(__m128)(A), (int)(C))
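+// Element i of the result is A[(C >> (2*i)) & 0x3].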
/// Copies the values in a 256-bit vector of [8 x float] as specified by
/// the immediate integer operand.
/// returned vector.
/// \returns A 256-bit vector of [8 x float] containing the copied values.
#define _mm256_permute_ps(A, C) \
- (__m256)__builtin_shufflevector((__v8sf)(__m256)(A), \
- (__v8sf)_mm256_undefined_ps(), \
- 0 + (((C) >> 0) & 0x3), \
- 0 + (((C) >> 2) & 0x3), \
- 0 + (((C) >> 4) & 0x3), \
- 0 + (((C) >> 6) & 0x3), \
- 4 + (((C) >> 0) & 0x3), \
- 4 + (((C) >> 2) & 0x3), \
- 4 + (((C) >> 4) & 0x3), \
- 4 + (((C) >> 6) & 0x3))
+ (__m256)__builtin_ia32_vpermilps256((__v8sf)(__m256)(A), (int)(C))
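+// Element i of the result is A[4*(i/4) + ((C >> (2*(i%4))) & 0x3)]; the same
+// 8-bit immediate is applied to both 128-bit lanes.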
/// Permutes 128-bit data values stored in two 256-bit vectors of
/// [4 x double], as specified by the immediate integer operand.
case X86::BI__builtin_ia32_vec_set_v2di:
i = 2; l = 0; u = 1;
break;
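+ // The 128-bit VPERMILPD form only uses the low 2 bits of its immediate
+ // (one selector bit per double).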
+ case X86::BI__builtin_ia32_vpermilpd:
case X86::BI__builtin_ia32_vec_ext_v4hi:
case X86::BI__builtin_ia32_vec_ext_v4si:
case X86::BI__builtin_ia32_vec_ext_v4sf:
case X86::BI__builtin_ia32_vec_set_v8si:
i = 2; l = 0; u = 7;
break;
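+ // The 256-bit VPERMILPD form uses 4 immediate bits (one selector bit per
+ // double).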
+ case X86::BI__builtin_ia32_vpermilpd256:
case X86::BI__builtin_ia32_roundps:
case X86::BI__builtin_ia32_roundpd:
case X86::BI__builtin_ia32_roundps256:
case X86::BI__builtin_ia32_vec_set_v32qi:
i = 2; l = 0; u = 31;
break;
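+ // VPERMILPS at every width and the 512-bit VPERMILPD form consume a full
+ // 8-bit immediate.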
+ case X86::BI__builtin_ia32_vpermilps:
+ case X86::BI__builtin_ia32_vpermilps256:
+ case X86::BI__builtin_ia32_vpermilpd512:
+ case X86::BI__builtin_ia32_vpermilps512:
case X86::BI__builtin_ia32_vcvtps2ph:
case X86::BI__builtin_ia32_vcvtps2ph_mask:
case X86::BI__builtin_ia32_vcvtps2ph256:
__m128d test_mm_permute_pd(__m128d A) {
// CHECK-LABEL: test_mm_permute_pd
- // CHECK: shufflevector <2 x double> %{{.*}}, <2 x double> zeroinitializer, <2 x i32> <i32 1, i32 0>
+ // CHECK: shufflevector <2 x double> %{{.*}}, <2 x double> undef, <2 x i32> <i32 1, i32 0>
return _mm_permute_pd(A, 1);
}
__m256d test_mm256_permute_pd(__m256d A) {
// CHECK-LABEL: test_mm256_permute_pd
- // CHECK: shufflevector <4 x double> %{{.*}}, <4 x double> zeroinitializer, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
+ // CHECK: shufflevector <4 x double> %{{.*}}, <4 x double> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
return _mm256_permute_pd(A, 5);
}
__m128 test_mm_permute_ps(__m128 A) {
// CHECK-LABEL: test_mm_permute_ps
- // CHECK: shufflevector <4 x float> %{{.*}}, <4 x float> zeroinitializer, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+ // CHECK: shufflevector <4 x float> %{{.*}}, <4 x float> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
return _mm_permute_ps(A, 0x1b);
}
// Test case for PR12401
__m128 test2_mm_permute_ps(__m128 a) {
// CHECK-LABEL: test2_mm_permute_ps
- // CHECK: shufflevector <4 x float> %{{.*}}, <4 x float> zeroinitializer, <4 x i32> <i32 2, i32 1, i32 2, i32 3>
+ // CHECK: shufflevector <4 x float> %{{.*}}, <4 x float> undef, <4 x i32> <i32 2, i32 1, i32 2, i32 3>
return _mm_permute_ps(a, 0xe6);
}
__m256 test_mm256_permute_ps(__m256 A) {
// CHECK-LABEL: test_mm256_permute_ps
- // CHECK: shufflevector <8 x float> %{{.*}}, <8 x float> zeroinitializer, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
+ // CHECK: shufflevector <8 x float> %{{.*}}, <8 x float> undef, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
return _mm256_permute_ps(A, 0x1b);
}
__m512d test_mm512_permute_pd(__m512d __X) {
// CHECK-LABEL: @test_mm512_permute_pd
- // CHECK: shufflevector <8 x double> %{{.*}}, <8 x double> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
+ // CHECK: shufflevector <8 x double> %{{.*}}, <8 x double> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
return _mm512_permute_pd(__X, 2);
}
__m512d test_mm512_mask_permute_pd(__m512d __W, __mmask8 __U, __m512d __X) {
// CHECK-LABEL: @test_mm512_mask_permute_pd
- // CHECK: shufflevector <8 x double> %{{.*}}, <8 x double> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
+ // CHECK: shufflevector <8 x double> %{{.*}}, <8 x double> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
// CHECK: select <8 x i1> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}}
return _mm512_mask_permute_pd(__W, __U, __X, 2);
}
__m512d test_mm512_maskz_permute_pd(__mmask8 __U, __m512d __X) {
// CHECK-LABEL: @test_mm512_maskz_permute_pd
- // CHECK: shufflevector <8 x double> %{{.*}}, <8 x double> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
+ // CHECK: shufflevector <8 x double> %{{.*}}, <8 x double> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
// CHECK: select <8 x i1> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}}
return _mm512_maskz_permute_pd(__U, __X, 2);
}
__m512 test_mm512_permute_ps(__m512 __X) {
// CHECK-LABEL: @test_mm512_permute_ps
- // CHECK: shufflevector <16 x float> %{{.*}}, <16 x float> zeroinitializer, <16 x i32> <i32 2, i32 0, i32 0, i32 0, i32 6, i32 4, i32 4, i32 4, i32 10, i32 8, i32 8, i32 8, i32 14, i32 12, i32 12, i32 12>
+ // CHECK: shufflevector <16 x float> %{{.*}}, <16 x float> undef, <16 x i32> <i32 2, i32 0, i32 0, i32 0, i32 6, i32 4, i32 4, i32 4, i32 10, i32 8, i32 8, i32 8, i32 14, i32 12, i32 12, i32 12>
return _mm512_permute_ps(__X, 2);
}
__m512 test_mm512_mask_permute_ps(__m512 __W, __mmask16 __U, __m512 __X) {
// CHECK-LABEL: @test_mm512_mask_permute_ps
- // CHECK: shufflevector <16 x float> %{{.*}}, <16 x float> zeroinitializer, <16 x i32> <i32 2, i32 0, i32 0, i32 0, i32 6, i32 4, i32 4, i32 4, i32 10, i32 8, i32 8, i32 8, i32 14, i32 12, i32 12, i32 12>
+ // CHECK: shufflevector <16 x float> %{{.*}}, <16 x float> undef, <16 x i32> <i32 2, i32 0, i32 0, i32 0, i32 6, i32 4, i32 4, i32 4, i32 10, i32 8, i32 8, i32 8, i32 14, i32 12, i32 12, i32 12>
// CHECK: select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}}
return _mm512_mask_permute_ps(__W, __U, __X, 2);
}
__m512 test_mm512_maskz_permute_ps(__mmask16 __U, __m512 __X) {
// CHECK-LABEL: @test_mm512_maskz_permute_ps
- // CHECK: shufflevector <16 x float> %{{.*}}, <16 x float> zeroinitializer, <16 x i32> <i32 2, i32 0, i32 0, i32 0, i32 6, i32 4, i32 4, i32 4, i32 10, i32 8, i32 8, i32 8, i32 14, i32 12, i32 12, i32 12>
+ // CHECK: shufflevector <16 x float> %{{.*}}, <16 x float> undef, <16 x i32> <i32 2, i32 0, i32 0, i32 0, i32 6, i32 4, i32 4, i32 4, i32 10, i32 8, i32 8, i32 8, i32 14, i32 12, i32 12, i32 12>
// CHECK: select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}}
return _mm512_maskz_permute_ps(__U, __X, 2);
}
__m128d test_mm_mask_permute_pd(__m128d __W, __mmask8 __U, __m128d __X) {
// CHECK-LABEL: @test_mm_mask_permute_pd
- // CHECK: shufflevector <2 x double> %{{.*}}, <2 x double> zeroinitializer, <2 x i32> <i32 1, i32 0>
+ // CHECK: shufflevector <2 x double> %{{.*}}, <2 x double> undef, <2 x i32> <i32 1, i32 0>
// CHECK: select <2 x i1> %{{.*}}, <2 x double> %{{.*}}, <2 x double> %{{.*}}
return _mm_mask_permute_pd(__W, __U, __X, 1);
}
__m128d test_mm_maskz_permute_pd(__mmask8 __U, __m128d __X) {
// CHECK-LABEL: @test_mm_maskz_permute_pd
- // CHECK: shufflevector <2 x double> %{{.*}}, <2 x double> zeroinitializer, <2 x i32> <i32 1, i32 0>
+ // CHECK: shufflevector <2 x double> %{{.*}}, <2 x double> undef, <2 x i32> <i32 1, i32 0>
// CHECK: select <2 x i1> %{{.*}}, <2 x double> %{{.*}}, <2 x double> %{{.*}}
return _mm_maskz_permute_pd(__U, __X, 1);
}
__m256d test_mm256_mask_permute_pd(__m256d __W, __mmask8 __U, __m256d __X) {
// CHECK-LABEL: @test_mm256_mask_permute_pd
- // CHECK: shufflevector <4 x double> %{{.*}}, <4 x double> zeroinitializer, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
+ // CHECK: shufflevector <4 x double> %{{.*}}, <4 x double> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
// CHECK: select <4 x i1> %{{.*}}, <4 x double> %{{.*}}, <4 x double> %{{.*}}
return _mm256_mask_permute_pd(__W, __U, __X, 5);
}
__m256d test_mm256_maskz_permute_pd(__mmask8 __U, __m256d __X) {
// CHECK-LABEL: @test_mm256_maskz_permute_pd
- // CHECK: shufflevector <4 x double> %{{.*}}, <4 x double> zeroinitializer, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
+ // CHECK: shufflevector <4 x double> %{{.*}}, <4 x double> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
// CHECK: select <4 x i1> %{{.*}}, <4 x double> %{{.*}}, <4 x double> %{{.*}}
return _mm256_maskz_permute_pd(__U, __X, 5);
}
__m128 test_mm_mask_permute_ps(__m128 __W, __mmask8 __U, __m128 __X) {
// CHECK-LABEL: @test_mm_mask_permute_ps
- // CHECK: shufflevector <4 x float> %{{.*}}, <4 x float> zeroinitializer, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+ // CHECK: shufflevector <4 x float> %{{.*}}, <4 x float> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
// CHECK: select <4 x i1> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}}
return _mm_mask_permute_ps(__W, __U, __X, 0x1b);
}
__m128 test_mm_maskz_permute_ps(__mmask8 __U, __m128 __X) {
// CHECK-LABEL: @test_mm_maskz_permute_ps
- // CHECK: shufflevector <4 x float> %{{.*}}, <4 x float> zeroinitializer, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+ // CHECK: shufflevector <4 x float> %{{.*}}, <4 x float> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
// CHECK: select <4 x i1> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}}
return _mm_maskz_permute_ps(__U, __X, 0x1b);
}
__m256 test_mm256_mask_permute_ps(__m256 __W, __mmask8 __U, __m256 __X) {
// CHECK-LABEL: @test_mm256_mask_permute_ps
- // CHECK: shufflevector <8 x float> %{{.*}}, <8 x float> zeroinitializer, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
+ // CHECK: shufflevector <8 x float> %{{.*}}, <8 x float> undef, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
// CHECK: select <8 x i1> %{{.*}}, <8 x float> %{{.*}}, <8 x float> %{{.*}}
return _mm256_mask_permute_ps(__W, __U, __X, 0x1b);
}
__m256 test_mm256_maskz_permute_ps(__mmask8 __U, __m256 __X) {
// CHECK-LABEL: @test_mm256_maskz_permute_ps
- // CHECK: shufflevector <8 x float> %{{.*}}, <8 x float> zeroinitializer, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
+ // CHECK: shufflevector <8 x float> %{{.*}}, <8 x float> undef, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
// CHECK: select <8 x i1> %{{.*}}, <8 x float> %{{.*}}, <8 x float> %{{.*}}
return _mm256_maskz_permute_ps(__U, __X, 0x1b);
}
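
// Illustrative usage sketch (not part of the patch or of the test files
// above): a standalone C program showing the runtime behaviour the rewritten
// macros keep. It assumes only <immintrin.h> and a compiler invoked with AVX
// enabled (e.g. -mavx); all intrinsics used here are standard.
#include <immintrin.h>
#include <stdio.h>

int main(void) {
  // Elements 0..3 from lowest to highest.
  __m128 a = _mm_setr_ps(0.0f, 1.0f, 2.0f, 3.0f);
  // 0x1B = 0b00011011 selects elements 3,2,1,0, reversing the vector
  // (the same immediate used in test_mm_permute_ps above).
  __m128 ra = _mm_permute_ps(a, 0x1B);
  float fa[4];
  _mm_storeu_ps(fa, ra);
  printf("%g %g %g %g\n", fa[0], fa[1], fa[2], fa[3]); // prints: 3 2 1 0

  // Elements 0..3 from lowest to highest, split across two 128-bit lanes.
  __m256d b = _mm256_setr_pd(0.0, 1.0, 2.0, 3.0);
  // 0x5 = 0b0101 swaps the two doubles within each 128-bit lane
  // (the same immediate used in test_mm256_permute_pd above).
  __m256d rb = _mm256_permute_pd(b, 0x5);
  double fb[4];
  _mm256_storeu_pd(fb, rb);
  printf("%g %g %g %g\n", fb[0], fb[1], fb[2], fb[3]); // prints: 1 0 3 2
  return 0;
}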