From: Craig Topper
Date: Mon, 19 Dec 2011 07:03:25 +0000 (+0000)
Subject: More AVX2 intrinsic support including saturating add/sub and palignr.
X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=9c2ffd803af03f1728423d0d73ff87d988642633;p=clang

More AVX2 intrinsic support including saturating add/sub and palignr.

git-svn-id: https://llvm.org/svn/llvm-project/cfe/trunk@146857 91177308-0d34-0410-b5e6-96231b3b80d8
---

diff --git a/include/clang/Basic/BuiltinsX86.def b/include/clang/Basic/BuiltinsX86.def
index a67de2d866..3dc7271469 100644
--- a/include/clang/Basic/BuiltinsX86.def
+++ b/include/clang/Basic/BuiltinsX86.def
@@ -490,5 +490,14 @@ BUILTIN(__builtin_ia32_packsswb256, "V32cV16sV16s", "")
 BUILTIN(__builtin_ia32_packssdw256, "V16sV8iV8i", "")
 BUILTIN(__builtin_ia32_packuswb256, "V32cV16sV16s", "")
 BUILTIN(__builtin_ia32_packusdw256, "V16sV8iV8i", "")
+BUILTIN(__builtin_ia32_paddsb256, "V32cV32cV32c", "")
+BUILTIN(__builtin_ia32_paddsw256, "V16sV16sV16s", "")
+BUILTIN(__builtin_ia32_psubsb256, "V32cV32cV32c", "")
+BUILTIN(__builtin_ia32_psubsw256, "V16sV16sV16s", "")
+BUILTIN(__builtin_ia32_paddusb256, "V32cV32cV32c", "")
+BUILTIN(__builtin_ia32_paddusw256, "V16sV16sV16s", "")
+BUILTIN(__builtin_ia32_psubusb256, "V32cV32cV32c", "")
+BUILTIN(__builtin_ia32_psubusw256, "V16sV16sV16s", "")
+BUILTIN(__builtin_ia32_palignr256, "V32cV32cV32cIc", "")
 
 #undef BUILTIN
diff --git a/lib/CodeGen/CGBuiltin.cpp b/lib/CodeGen/CGBuiltin.cpp
index ffe5fffa12..71d515646e 100644
--- a/lib/CodeGen/CGBuiltin.cpp
+++ b/lib/CodeGen/CGBuiltin.cpp
@@ -2288,6 +2288,44 @@ Value *CodeGenFunction::EmitX86BuiltinExpr(unsigned BuiltinID,
     // If palignr is shifting the pair of vectors more than 32 bytes, emit zero.
     return llvm::Constant::getNullValue(ConvertType(E->getType()));
   }
+  case X86::BI__builtin_ia32_palignr256: {
+    unsigned shiftVal = cast<llvm::ConstantInt>(Ops[2])->getZExtValue();
+
+    // If palignr is shifting the pair of input vectors less than 17 bytes,
+    // emit a shuffle instruction.
+    if (shiftVal <= 16) {
+      SmallVector<llvm::Constant*, 32> Indices;
+      // 256-bit palignr operates on 128-bit lanes so we need to handle that
+      for (unsigned l = 0; l != 2; ++l) {
+        unsigned LaneStart = l * 16;
+        unsigned LaneEnd = (l+1) * 16;
+        for (unsigned i = 0; i != 16; ++i) {
+          unsigned Idx = shiftVal + i + LaneStart;
+          if (Idx >= LaneEnd) Idx += 16; // end of lane, switch operand
+          Indices.push_back(llvm::ConstantInt::get(Int32Ty, Idx));
+        }
+      }
+
+      Value* SV = llvm::ConstantVector::get(Indices);
+      return Builder.CreateShuffleVector(Ops[1], Ops[0], SV, "palignr");
+    }
+
+    // If palignr is shifting the pair of input vectors more than 16 but less
+    // than 32 bytes, emit a logical right shift of the destination.
+    if (shiftVal < 32) {
+      llvm::Type *VecTy = llvm::VectorType::get(Int64Ty, 4);
+
+      Ops[0] = Builder.CreateBitCast(Ops[0], VecTy, "cast");
+      Ops[1] = llvm::ConstantInt::get(Int32Ty, (shiftVal-16) * 8);
+
+      // create i32 constant
+      llvm::Function *F = CGM.getIntrinsic(Intrinsic::x86_avx2_psrl_dq);
+      return Builder.CreateCall(F, makeArrayRef(&Ops[0], 2), "palignr");
+    }
+
+    // If palignr is shifting the pair of vectors more than 32 bytes, emit zero.
+    return llvm::Constant::getNullValue(ConvertType(E->getType()));
+  }
   case X86::BI__builtin_ia32_movntps:
   case X86::BI__builtin_ia32_movntpd:
   case X86::BI__builtin_ia32_movntdq:
diff --git a/lib/Headers/avx2intrin.h b/lib/Headers/avx2intrin.h
index 1cfcac5c29..e4f1e14c90 100644
--- a/lib/Headers/avx2intrin.h
+++ b/lib/Headers/avx2intrin.h
@@ -94,6 +94,35 @@ _mm256_add_epi64(__m256i a, __m256i b)
   return a + b;
 }
 
+static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_adds_epi8(__m256i a, __m256i b)
+{
+  return (__m256i)__builtin_ia32_paddsb256((__v32qi)a, (__v32qi)b);
+}
+
+static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_adds_epi16(__m256i a, __m256i b)
+{
+  return (__m256i)__builtin_ia32_paddsw256((__v16hi)a, (__v16hi)b);
+}
+
+static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_adds_epu8(__m256i a, __m256i b)
+{
+  return (__m256i)__builtin_ia32_paddusb256((__v32qi)a, (__v32qi)b);
+}
+
+static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_adds_epu16(__m256i a, __m256i b)
+{
+  return (__m256i)__builtin_ia32_paddusw256((__v16hi)a, (__v16hi)b);
+}
+
+#define _mm256_alignr_epi8(a, b, n) __extension__ ({ \
+  __m256i __a = (a); \
+  __m256i __b = (b); \
+  (__m256i)__builtin_ia32_palignr256((__v32qi)__a, (__v32qi)__b, (n)); })
+
 static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
 _mm256_sub_epi8(__m256i a, __m256i b)
 {
@@ -117,3 +146,28 @@ _mm256_sub_epi64(__m256i a, __m256i b)
 {
   return a - b;
 }
+
+static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_subs_epi8(__m256i a, __m256i b)
+{
+  return (__m256i)__builtin_ia32_psubsb256((__v32qi)a, (__v32qi)b);
+}
+
+static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_subs_epi16(__m256i a, __m256i b)
+{
+  return (__m256i)__builtin_ia32_psubsw256((__v16hi)a, (__v16hi)b);
+}
+
+static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_subs_epu8(__m256i a, __m256i b)
+{
+  return (__m256i)__builtin_ia32_psubusb256((__v32qi)a, (__v32qi)b);
+}
+
+static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_subs_epu16(__m256i a, __m256i b)
+{
+  return (__m256i)__builtin_ia32_psubusw256((__v16hi)a, (__v16hi)b);
+}
+
diff --git a/test/CodeGen/avx2-builtins.c b/test/CodeGen/avx2-builtins.c
index 0d9a1a7644..b40342f5f6 100644
--- a/test/CodeGen/avx2-builtins.c
+++ b/test/CodeGen/avx2-builtins.c
@@ -65,6 +65,36 @@ __m256 test_mm256_add_epi64(__m256 a, __m256 b) {
   return _mm256_add_epi64(a, b);
 }
 
+__m256 test_mm256_adds_epi8(__m256 a, __m256 b) {
+  // CHECK: @llvm.x86.avx2.padds.b
+  return _mm256_adds_epi8(a, b);
+}
+
+__m256 test_mm256_adds_epi16(__m256 a, __m256 b) {
+  // CHECK: @llvm.x86.avx2.padds.w
+  return _mm256_adds_epi16(a, b);
+}
+
+__m256 test_mm256_adds_epu8(__m256 a, __m256 b) {
+  // CHECK: @llvm.x86.avx2.paddus.b
+  return _mm256_adds_epu8(a, b);
+}
+
+__m256 test_mm256_adds_epu16(__m256 a, __m256 b) {
+  // CHECK: @llvm.x86.avx2.paddus.w
+  return _mm256_adds_epu16(a, b);
+}
+
+__m256 test_mm256_alignr_epi8(__m256 a, __m256 b) {
+  // CHECK: shufflevector <32 x i8> %1, <32 x i8> %0, <32 x i32> <i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 32, i32 33, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 48, i32 49>
+  return _mm256_alignr_epi8(a, b, 2);
+}
+
+__m256 test2_mm256_alignr_epi8(__m256 a, __m256 b) {
+  // CHECK: @llvm.x86.avx2.psrl.dq({{.*}}, i32 8)
+  return _mm256_alignr_epi8(a, b, 17);
+}
+
 __m256 test_mm256_sub_epi8(__m256 a, __m256 b) {
   // CHECK: sub <32 x i8>
   return _mm256_sub_epi8(a, b);
 }
@@ -84,3 +114,23 @@ __m256 test_mm256_sub_epi64(__m256 a, __m256 b) {
   // CHECK: sub <4 x i64>
   return _mm256_sub_epi64(a, b);
 }
+
+__m256 test_mm256_subs_epi8(__m256 a, __m256 b) {
+  // CHECK: @llvm.x86.avx2.psubs.b
+  return _mm256_subs_epi8(a, b);
+}
+
+__m256 test_mm256_subs_epi16(__m256 a, __m256 b) {
+  // CHECK: @llvm.x86.avx2.psubs.w
+  return _mm256_subs_epi16(a, b);
+}
+
+__m256 test_mm256_subs_epu8(__m256 a, __m256 b) {
+  // CHECK: @llvm.x86.avx2.psubus.b
+  return _mm256_subs_epu8(a, b);
+}
+
+__m256 test_mm256_subs_epu16(__m256 a, __m256 b) {
+  // CHECK: @llvm.x86.avx2.psubus.w
+  return _mm256_subs_epu16(a, b);
+}
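For anyone who wants to try the new intrinsics, the short standalone C program below (not part of the commit) exercises the saturating add and palignr wrappers added in avx2intrin.h. It is only a sketch: it assumes an AVX2-capable CPU and a compiler invocation along the lines of "clang -O2 -mavx2 demo.c", and the array names and printed lanes are purely illustrative.

  /* Illustrative sketch only, not part of this commit.  Assumes AVX2
     hardware and a compiler flag such as -mavx2. */
  #include <immintrin.h>
  #include <stdint.h>
  #include <stdio.h>

  int main(void) {
    /* Saturating unsigned byte add: 200 + 100 clamps to 255 in every lane
       instead of wrapping around to 44. */
    __m256i x = _mm256_set1_epi8((char)200);
    __m256i y = _mm256_set1_epi8((char)100);
    uint8_t sum[32];
    _mm256_storeu_si256((__m256i *)sum, _mm256_adds_epu8(x, y));
    printf("saturated byte 0: %u\n", (unsigned)sum[0]);   /* prints 255 */

    /* _mm256_alignr_epi8 works on each 128-bit lane separately: per lane the
       pair a:b (a high, b low) is concatenated and shifted right by n bytes,
       so with n == 2 the low lane of the result is b[2..15] then a[0..1]. */
    uint8_t av[32], bv[32], rv[32];
    for (int i = 0; i < 32; ++i) {
      av[i] = (uint8_t)i;           /* 0, 1, ..., 31      */
      bv[i] = (uint8_t)(100 + i);   /* 100, 101, ..., 131 */
    }
    __m256i a = _mm256_loadu_si256((const __m256i *)av);
    __m256i b = _mm256_loadu_si256((const __m256i *)bv);
    __m256i r = _mm256_alignr_epi8(a, b, 2);
    _mm256_storeu_si256((__m256i *)rv, r);
    printf("r[0]=%u r[13]=%u r[14]=%u r[16]=%u\n",
           (unsigned)rv[0], (unsigned)rv[13], (unsigned)rv[14],
           (unsigned)rv[16]);                             /* 102 115 0 118 */
    return 0;
  }

Shift counts between 17 and 31 degenerate into a logical right shift of the first argument (the @llvm.x86.avx2.psrl.dq path checked by test2_mm256_alignr_epi8), and counts of 32 or more produce an all-zero vector, mirroring the three cases handled in CGBuiltin.cpp above.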