From 2e5ffa9f6bde60beebf1c597912adb48cf9a58c6 Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Thu, 9 Jun 2016 05:15:12 +0000 Subject: [PATCH] [X86] Handle AVX2 pslldqi and psrldqi intrinsics shufflevector creation directly in the header file instead of in CGBuiltin.cpp. Simplify the sse2 equivalents as well. git-svn-id: https://llvm.org/svn/llvm-project/cfe/trunk@272246 91177308-0d34-0410-b5e6-96231b3b80d8 --- include/clang/Basic/BuiltinsX86.def | 2 - lib/CodeGen/CGBuiltin.cpp | 52 ------------------- lib/Headers/avx2intrin.h | 78 ++++++++++++++++++++++++++-- lib/Headers/emmintrin.h | 80 +++++++++++++++-------------- test/CodeGen/avx2-builtins.c | 8 +-- 5 files changed, 120 insertions(+), 100 deletions(-) diff --git a/include/clang/Basic/BuiltinsX86.def b/include/clang/Basic/BuiltinsX86.def index 8191a3d1ff..d07c274171 100644 --- a/include/clang/Basic/BuiltinsX86.def +++ b/include/clang/Basic/BuiltinsX86.def @@ -557,7 +557,6 @@ TARGET_BUILTIN(__builtin_ia32_pshufb256, "V32cV32cV32c", "", "avx2") TARGET_BUILTIN(__builtin_ia32_psignb256, "V32cV32cV32c", "", "avx2") TARGET_BUILTIN(__builtin_ia32_psignw256, "V16sV16sV16s", "", "avx2") TARGET_BUILTIN(__builtin_ia32_psignd256, "V8iV8iV8i", "", "avx2") -TARGET_BUILTIN(__builtin_ia32_pslldqi256, "V4LLiV4LLiIi", "", "avx2") TARGET_BUILTIN(__builtin_ia32_psllwi256, "V16sV16si", "", "avx2") TARGET_BUILTIN(__builtin_ia32_psllw256, "V16sV16sV8s", "", "avx2") TARGET_BUILTIN(__builtin_ia32_pslldi256, "V8iV8ii", "", "avx2") @@ -568,7 +567,6 @@ TARGET_BUILTIN(__builtin_ia32_psrawi256, "V16sV16si", "", "avx2") TARGET_BUILTIN(__builtin_ia32_psraw256, "V16sV16sV8s", "", "avx2") TARGET_BUILTIN(__builtin_ia32_psradi256, "V8iV8ii", "", "avx2") TARGET_BUILTIN(__builtin_ia32_psrad256, "V8iV8iV4i", "", "avx2") -TARGET_BUILTIN(__builtin_ia32_psrldqi256, "V4LLiV4LLiIi", "", "avx2") TARGET_BUILTIN(__builtin_ia32_psrlwi256, "V16sV16si", "", "avx2") TARGET_BUILTIN(__builtin_ia32_psrlw256, "V16sV16sV8s", "", "avx2") TARGET_BUILTIN(__builtin_ia32_psrldi256, "V8iV8ii", "", "avx2") diff --git a/lib/CodeGen/CGBuiltin.cpp b/lib/CodeGen/CGBuiltin.cpp index 9013ca4f27..5db1ef51ef 100644 --- a/lib/CodeGen/CGBuiltin.cpp +++ b/lib/CodeGen/CGBuiltin.cpp @@ -6704,58 +6704,6 @@ Value *CodeGenFunction::EmitX86BuiltinExpr(unsigned BuiltinID, return EmitX86Select(*this, Ops[4], Align, Ops[3]); } - case X86::BI__builtin_ia32_pslldqi256: { - // Shift value is in bits so divide by 8. - unsigned shiftVal = cast(Ops[1])->getZExtValue() >> 3; - - // If pslldq is shifting the vector more than 15 bytes, emit zero. - if (shiftVal >= 16) - return llvm::Constant::getNullValue(ConvertType(E->getType())); - - int Indices[32]; - // 256-bit pslldq operates on 128-bit lanes so we need to handle that - for (unsigned l = 0; l != 32; l += 16) { - for (unsigned i = 0; i != 16; ++i) { - unsigned Idx = 32 + i - shiftVal; - if (Idx < 32) Idx -= 16; // end of lane, switch operand. - Indices[l + i] = Idx + l; - } - } - - llvm::Type *VecTy = llvm::VectorType::get(Int8Ty, 32); - Ops[0] = Builder.CreateBitCast(Ops[0], VecTy, "cast"); - Value *Zero = llvm::Constant::getNullValue(VecTy); - - Value *SV = Builder.CreateShuffleVector(Zero, Ops[0], Indices, "pslldq"); - llvm::Type *ResultType = ConvertType(E->getType()); - return Builder.CreateBitCast(SV, ResultType, "cast"); - } - case X86::BI__builtin_ia32_psrldqi256: { - // Shift value is in bits so divide by 8. - unsigned shiftVal = cast(Ops[1])->getZExtValue() >> 3; - - // If psrldq is shifting the vector more than 15 bytes, emit zero. 
- if (shiftVal >= 16) - return llvm::Constant::getNullValue(ConvertType(E->getType())); - - int Indices[32]; - // 256-bit psrldq operates on 128-bit lanes so we need to handle that - for (unsigned l = 0; l != 32; l += 16) { - for (unsigned i = 0; i != 16; ++i) { - unsigned Idx = i + shiftVal; - if (Idx >= 16) Idx += 16; // end of lane, switch operand. - Indices[l + i] = Idx + l; - } - } - - llvm::Type *VecTy = llvm::VectorType::get(Int8Ty, 32); - Ops[0] = Builder.CreateBitCast(Ops[0], VecTy, "cast"); - Value *Zero = llvm::Constant::getNullValue(VecTy); - - Value *SV = Builder.CreateShuffleVector(Ops[0], Zero, Indices, "psrldq"); - llvm::Type *ResultType = ConvertType(E->getType()); - return Builder.CreateBitCast(SV, ResultType, "cast"); - } case X86::BI__builtin_ia32_movntps: case X86::BI__builtin_ia32_movntps256: case X86::BI__builtin_ia32_movntpd: diff --git a/lib/Headers/avx2intrin.h b/lib/Headers/avx2intrin.h index 9df915a10b..fc8b939fe3 100644 --- a/lib/Headers/avx2intrin.h +++ b/lib/Headers/avx2intrin.h @@ -549,8 +549,43 @@ _mm256_sign_epi32(__m256i __a, __m256i __b) return (__m256i)__builtin_ia32_psignd256((__v8si)__a, (__v8si)__b); } -#define _mm256_slli_si256(a, count) __extension__ ({ \ - (__m256i)__builtin_ia32_pslldqi256((__m256i)(a), (count)*8); }) +#define _mm256_slli_si256(a, imm) __extension__ ({ \ + ((char)(imm)&0xF0) ? _mm256_setzero_si256() : \ + (__m256i)__builtin_shufflevector( \ + (__v32qi)_mm256_setzero_si256(), \ + (__v32qi)(__m256i)(a), \ + ((char)(imm)>0x0 ? 16 : 32) - (char)(imm), \ + ((char)(imm)>0x1 ? 17 : 33) - (char)(imm), \ + ((char)(imm)>0x2 ? 18 : 34) - (char)(imm), \ + ((char)(imm)>0x3 ? 19 : 35) - (char)(imm), \ + ((char)(imm)>0x4 ? 20 : 36) - (char)(imm), \ + ((char)(imm)>0x5 ? 21 : 37) - (char)(imm), \ + ((char)(imm)>0x6 ? 22 : 38) - (char)(imm), \ + ((char)(imm)>0x7 ? 23 : 39) - (char)(imm), \ + ((char)(imm)>0x8 ? 24 : 40) - (char)(imm), \ + ((char)(imm)>0x9 ? 25 : 41) - (char)(imm), \ + ((char)(imm)>0xA ? 26 : 42) - (char)(imm), \ + ((char)(imm)>0xB ? 27 : 43) - (char)(imm), \ + ((char)(imm)>0xC ? 28 : 44) - (char)(imm), \ + ((char)(imm)>0xD ? 29 : 45) - (char)(imm), \ + ((char)(imm)>0xE ? 30 : 46) - (char)(imm), \ + ((char)(imm)>0xF ? 31 : 47) - (char)(imm), \ + ((char)(imm)>0x0 ? 32 : 48) - (char)(imm), \ + ((char)(imm)>0x1 ? 33 : 49) - (char)(imm), \ + ((char)(imm)>0x2 ? 34 : 50) - (char)(imm), \ + ((char)(imm)>0x3 ? 35 : 51) - (char)(imm), \ + ((char)(imm)>0x4 ? 36 : 52) - (char)(imm), \ + ((char)(imm)>0x5 ? 37 : 53) - (char)(imm), \ + ((char)(imm)>0x6 ? 38 : 54) - (char)(imm), \ + ((char)(imm)>0x7 ? 39 : 55) - (char)(imm), \ + ((char)(imm)>0x8 ? 40 : 56) - (char)(imm), \ + ((char)(imm)>0x9 ? 41 : 57) - (char)(imm), \ + ((char)(imm)>0xA ? 42 : 58) - (char)(imm), \ + ((char)(imm)>0xB ? 43 : 59) - (char)(imm), \ + ((char)(imm)>0xC ? 44 : 60) - (char)(imm), \ + ((char)(imm)>0xD ? 45 : 61) - (char)(imm), \ + ((char)(imm)>0xE ? 46 : 62) - (char)(imm), \ + ((char)(imm)>0xF ? 47 : 63) - (char)(imm)); }) #define _mm256_bslli_epi128(a, count) _mm256_slli_si256((a), (count)) @@ -614,8 +649,43 @@ _mm256_sra_epi32(__m256i __a, __m128i __count) return (__m256i)__builtin_ia32_psrad256((__v8si)__a, (__v4si)__count); } -#define _mm256_srli_si256(a, count) __extension__ ({ \ - (__m256i)__builtin_ia32_psrldqi256((__m256i)(a), (count)*8); }) +#define _mm256_srli_si256(a, imm) __extension__ ({ \ + ((char)(imm)&0xF0) ? 
_mm256_setzero_si256() : \ + (__m256i)__builtin_shufflevector( \ + (__v32qi)(__m256i)(a), \ + (__v32qi)_mm256_setzero_si256(), \ + (char)(imm) + ((char)(imm)>0xF ? 16 : 0), \ + (char)(imm) + ((char)(imm)>0xE ? 17 : 1), \ + (char)(imm) + ((char)(imm)>0xD ? 18 : 2), \ + (char)(imm) + ((char)(imm)>0xC ? 19 : 3), \ + (char)(imm) + ((char)(imm)>0xB ? 20 : 4), \ + (char)(imm) + ((char)(imm)>0xA ? 21 : 5), \ + (char)(imm) + ((char)(imm)>0x9 ? 22 : 6), \ + (char)(imm) + ((char)(imm)>0x8 ? 23 : 7), \ + (char)(imm) + ((char)(imm)>0x7 ? 24 : 8), \ + (char)(imm) + ((char)(imm)>0x6 ? 25 : 9), \ + (char)(imm) + ((char)(imm)>0x5 ? 26 : 10), \ + (char)(imm) + ((char)(imm)>0x4 ? 27 : 11), \ + (char)(imm) + ((char)(imm)>0x3 ? 28 : 12), \ + (char)(imm) + ((char)(imm)>0x2 ? 29 : 13), \ + (char)(imm) + ((char)(imm)>0x1 ? 30 : 14), \ + (char)(imm) + ((char)(imm)>0x0 ? 31 : 15), \ + (char)(imm) + ((char)(imm)>0xF ? 32 : 16), \ + (char)(imm) + ((char)(imm)>0xE ? 33 : 17), \ + (char)(imm) + ((char)(imm)>0xD ? 34 : 18), \ + (char)(imm) + ((char)(imm)>0xC ? 35 : 19), \ + (char)(imm) + ((char)(imm)>0xB ? 36 : 20), \ + (char)(imm) + ((char)(imm)>0xA ? 37 : 21), \ + (char)(imm) + ((char)(imm)>0x9 ? 38 : 22), \ + (char)(imm) + ((char)(imm)>0x8 ? 39 : 23), \ + (char)(imm) + ((char)(imm)>0x7 ? 40 : 24), \ + (char)(imm) + ((char)(imm)>0x6 ? 41 : 25), \ + (char)(imm) + ((char)(imm)>0x5 ? 42 : 26), \ + (char)(imm) + ((char)(imm)>0x4 ? 43 : 27), \ + (char)(imm) + ((char)(imm)>0x3 ? 44 : 28), \ + (char)(imm) + ((char)(imm)>0x2 ? 45 : 29), \ + (char)(imm) + ((char)(imm)>0x1 ? 46 : 30), \ + (char)(imm) + ((char)(imm)>0x0 ? 47 : 31)); }) #define _mm256_bsrli_epi128(a, count) _mm256_srli_si256((a), (count)) diff --git a/lib/Headers/emmintrin.h b/lib/Headers/emmintrin.h index 80e391d268..adcf595fc6 100644 --- a/lib/Headers/emmintrin.h +++ b/lib/Headers/emmintrin.h @@ -1089,25 +1089,27 @@ _mm_xor_si128(__m128i __a, __m128i __b) /// An immediate value specifying the number of bytes to left-shift /// operand a. /// \returns A 128-bit integer vector containing the left-shifted value. -#define _mm_slli_si128(a, imm) __extension__ ({ \ - (__m128i)__builtin_shufflevector((__v16qi)_mm_setzero_si128(), \ - (__v16qi)(__m128i)(a), \ - ((imm)&0xF0) ? 0 : 16 - ((imm)&0xF), \ - ((imm)&0xF0) ? 0 : 17 - ((imm)&0xF), \ - ((imm)&0xF0) ? 0 : 18 - ((imm)&0xF), \ - ((imm)&0xF0) ? 0 : 19 - ((imm)&0xF), \ - ((imm)&0xF0) ? 0 : 20 - ((imm)&0xF), \ - ((imm)&0xF0) ? 0 : 21 - ((imm)&0xF), \ - ((imm)&0xF0) ? 0 : 22 - ((imm)&0xF), \ - ((imm)&0xF0) ? 0 : 23 - ((imm)&0xF), \ - ((imm)&0xF0) ? 0 : 24 - ((imm)&0xF), \ - ((imm)&0xF0) ? 0 : 25 - ((imm)&0xF), \ - ((imm)&0xF0) ? 0 : 26 - ((imm)&0xF), \ - ((imm)&0xF0) ? 0 : 27 - ((imm)&0xF), \ - ((imm)&0xF0) ? 0 : 28 - ((imm)&0xF), \ - ((imm)&0xF0) ? 0 : 29 - ((imm)&0xF), \ - ((imm)&0xF0) ? 0 : 30 - ((imm)&0xF), \ - ((imm)&0xF0) ? 0 : 31 - ((imm)&0xF)); }) +#define _mm_slli_si128(a, imm) __extension__ ({ \ + ((char)(imm)&0xF0) ? 
_mm_setzero_si128() : \ + (__m128i)__builtin_shufflevector( \ + (__v16qi)_mm_setzero_si128(), \ + (__v16qi)(__m128i)(a), \ + 16 - (char)(imm), \ + 17 - (char)(imm), \ + 18 - (char)(imm), \ + 19 - (char)(imm), \ + 20 - (char)(imm), \ + 21 - (char)(imm), \ + 22 - (char)(imm), \ + 23 - (char)(imm), \ + 24 - (char)(imm), \ + 25 - (char)(imm), \ + 26 - (char)(imm), \ + 27 - (char)(imm), \ + 28 - (char)(imm), \ + 29 - (char)(imm), \ + 30 - (char)(imm), \ + 31 - (char)(imm)); }) #define _mm_bslli_si128(a, imm) \ _mm_slli_si128((a), (imm)) @@ -1323,25 +1325,27 @@ _mm_sra_epi32(__m128i __a, __m128i __count) /// An immediate value specifying the number of bytes to right-shift operand /// a. /// \returns A 128-bit integer vector containing the right-shifted value. -#define _mm_srli_si128(a, imm) __extension__ ({ \ - (__m128i)__builtin_shufflevector((__v16qi)(__m128i)(a), \ - (__v16qi)_mm_setzero_si128(), \ - ((imm)&0xF0) ? 16 : ((imm)&0xF) + 0, \ - ((imm)&0xF0) ? 16 : ((imm)&0xF) + 1, \ - ((imm)&0xF0) ? 16 : ((imm)&0xF) + 2, \ - ((imm)&0xF0) ? 16 : ((imm)&0xF) + 3, \ - ((imm)&0xF0) ? 16 : ((imm)&0xF) + 4, \ - ((imm)&0xF0) ? 16 : ((imm)&0xF) + 5, \ - ((imm)&0xF0) ? 16 : ((imm)&0xF) + 6, \ - ((imm)&0xF0) ? 16 : ((imm)&0xF) + 7, \ - ((imm)&0xF0) ? 16 : ((imm)&0xF) + 8, \ - ((imm)&0xF0) ? 16 : ((imm)&0xF) + 9, \ - ((imm)&0xF0) ? 16 : ((imm)&0xF) + 10, \ - ((imm)&0xF0) ? 16 : ((imm)&0xF) + 11, \ - ((imm)&0xF0) ? 16 : ((imm)&0xF) + 12, \ - ((imm)&0xF0) ? 16 : ((imm)&0xF) + 13, \ - ((imm)&0xF0) ? 16 : ((imm)&0xF) + 14, \ - ((imm)&0xF0) ? 16 : ((imm)&0xF) + 15); }) +#define _mm_srli_si128(a, imm) __extension__ ({ \ + ((char)(imm)&0xF0) ? _mm_setzero_si128() : \ + (__m128i)__builtin_shufflevector( \ + (__v16qi)(__m128i)(a), \ + (__v16qi)_mm_setzero_si128(), \ + (char)(imm) + 0, \ + (char)(imm) + 1, \ + (char)(imm) + 2, \ + (char)(imm) + 3, \ + (char)(imm) + 4, \ + (char)(imm) + 5, \ + (char)(imm) + 6, \ + (char)(imm) + 7, \ + (char)(imm) + 8, \ + (char)(imm) + 9, \ + (char)(imm) + 10, \ + (char)(imm) + 11, \ + (char)(imm) + 12, \ + (char)(imm) + 13, \ + (char)(imm) + 14, \ + (char)(imm) + 15); }) #define _mm_bsrli_si128(a, imm) \ _mm_srli_si128((a), (imm)) diff --git a/test/CodeGen/avx2-builtins.c b/test/CodeGen/avx2-builtins.c index f7d850fe14..31e01fc9a3 100644 --- a/test/CodeGen/avx2-builtins.c +++ b/test/CodeGen/avx2-builtins.c @@ -232,13 +232,13 @@ __m256i test_mm256_broadcastw_epi16(__m128i a) { __m256i test_mm256_bslli_epi128(__m256i a) { // CHECK-LABEL: test_mm256_bslli_epi128 - // CHECK: shufflevector <32 x i8> zeroinitializer, <32 x i8> %{{.*}}, <32 x i32> + // CHECK: shufflevector <32 x i8> %{{.*}}, <32 x i8> %{{.*}}, <32 x i32> return _mm256_bslli_epi128(a, 3); } __m256i test_mm256_bsrli_epi128(__m256i a) { // CHECK-LABEL: test_mm256_bsrli_epi128 - // CHECK: shufflevector <32 x i8> %{{.*}}, <32 x i8> zeroinitializer, <32 x i32> + // CHECK: shufflevector <32 x i8> %{{.*}}, <32 x i8> %{{.*}}, <32 x i32> return _mm256_bsrli_epi128(a, 3); } @@ -959,7 +959,7 @@ __m256i test_mm256_slli_epi64(__m256i a) { __m256i test_mm256_slli_si256(__m256i a) { // CHECK-LABEL: test_mm256_slli_si256 - // CHECK: shufflevector <32 x i8> zeroinitializer, <32 x i8> %{{.*}}, <32 x i32> + // CHECK: shufflevector <32 x i8> %{{.*}}, <32 x i8> %{{.*}}, <32 x i32> return _mm256_slli_si256(a, 3); } @@ -1061,7 +1061,7 @@ __m256i test_mm256_srli_epi64(__m256i a) { __m256i test_mm256_srli_si256(__m256i a) { // CHECK-LABEL: test_mm256_srli_si256 - // CHECK: shufflevector <32 x i8> %{{.*}}, <32 x i8> zeroinitializer, <32 x i32> + // 
CHECK: shufflevector <32 x i8> %{{.*}}, <32 x i8> %{{.*}}, <32 x i32>
   return _mm256_srli_si256(a, 3);
 }
-- 
2.40.0
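For reference (not part of the patch): a minimal usage sketch of the intrinsics this change reimplements, assuming an AVX2-capable compiler and target (for example, building with -mavx2). It illustrates the per-128-bit-lane byte-shift semantics the new __builtin_shufflevector-based macros produce; the driver code (main, the byte arrays, the printf loop) is illustrative only.

#include <immintrin.h>
#include <stdio.h>

int main(void) {
  unsigned char in[32], left[32], right[32];
  for (int i = 0; i < 32; ++i)
    in[i] = (unsigned char)i;                     /* bytes 0..31 */

  __m256i v = _mm256_loadu_si256((const __m256i *)in);

  /* Byte-shift left by 3: within each 128-bit lane, bytes move toward
     higher indices and the low 3 bytes of each lane become zero. */
  _mm256_storeu_si256((__m256i *)left, _mm256_slli_si256(v, 3));

  /* Byte-shift right by 3: within each 128-bit lane, bytes move toward
     lower indices and the high 3 bytes of each lane become zero. */
  _mm256_storeu_si256((__m256i *)right, _mm256_srli_si256(v, 3));

  /* A shift count of 16 or more bytes yields an all-zero vector, which is
     the case the new macros handle with the leading (imm)&0xF0 test. */
  for (int i = 0; i < 32; ++i)
    printf("%2d: in=%2d slli=%2d srli=%2d\n", i, in[i], left[i], right[i]);
  return 0;
}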