From: Henrik Gramner
Date: Tue, 14 May 2013 16:53:12 +0000 (+0200)
Subject: x86: AVX2 high bit-depth dequant
X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=edf31ed3577f35e7ed3934dd74be474f9d22384a;p=libx264

x86: AVX2 high bit-depth dequant
---

diff --git a/common/quant.c b/common/quant.c
index d1f7b2fd..2fa1521d 100644
--- a/common/quant.c
+++ b/common/quant.c
@@ -543,6 +543,8 @@ void x264_quant_init( x264_t *h, int cpu, x264_quant_function_t *pf )
         pf->quant_4x4_dc = x264_quant_4x4_dc_avx2;
         pf->quant_8x8 = x264_quant_8x8_avx2;
         pf->quant_4x4x4 = x264_quant_4x4x4_avx2;
+        pf->dequant_4x4 = x264_dequant_4x4_avx2;
+        pf->dequant_8x8 = x264_dequant_8x8_avx2;
         pf->denoise_dct = x264_denoise_dct_avx2;
     }
 #endif // HAVE_MMX
diff --git a/common/x86/quant-a.asm b/common/x86/quant-a.asm
index 7c2557b1..02328a9a 100644
--- a/common/x86/quant-a.asm
+++ b/common/x86/quant-a.asm
@@ -524,19 +524,25 @@ cglobal quant_4x4x4, 3,3,6
 ;;; %1 dct[y][x]
 ;;; %2,%3 dequant_mf[i_mf][y][x]
 ;;; m2 i_qbits
-    mova m0, %2
 %if HIGH_BIT_DEPTH
-    pmaddwd m0, %1
-    pslld m0, m2
+    mova m0, %1
+    mova m1, %4
+    pmaddwd m0, %2
+    pmaddwd m1, %3
+    pslld m0, xm2
+    pslld m1, xm2
+    mova %1, m0
+    mova %4, m1
 %else
+    mova m0, %2
     packssdw m0, %3
 %if mmsize==32
     vpermq m0, m0, q3120
 %endif
     pmullw m0, %1
     psllw m0, xm2
-%endif
     mova %1, m0
+%endif
 %endmacro
 
 %macro DEQUANT32_R 4
@@ -545,33 +551,34 @@ cglobal quant_4x4x4, 3,3,6
 ;;; m2 -i_qbits
 ;;; m3 f
 ;;; m4 0
-%if mmsize==32
-    pmovzxwd m0, %1
-    pmovzxwd m1, %4
-    pmaddwd m0, %2
-    pmaddwd m1, %3
-    paddd m0, m3
-    paddd m1, m3
+%if HIGH_BIT_DEPTH
+    mova m0, %1
+    mova m1, %4
+    pmadcswd m0, m0, %2, m3
+    pmadcswd m1, m1, %3, m3
     psrad m0, xm2
     psrad m1, xm2
-    packssdw m0, m1
-    vpermq m0, m0, q3120
+    mova %1, m0
+    mova %4, m1
 %else
-    mova m0, %1
-%if HIGH_BIT_DEPTH
-    pmadcswd m0, m0, %2, m3
-    psrad m0, m2
+%if mmsize == 32
+    pmovzxwd m0, %1
+    pmovzxwd m1, %4
 %else
+    mova m0, %1
     punpckhwd m1, m0, m4
     punpcklwd m0, m4
+%endif
     pmadcswd m0, m0, %2, m3
     pmadcswd m1, m1, %3, m3
-    psrad m0, m2
-    psrad m1, m2
+    psrad m0, xm2
+    psrad m1, xm2
     packssdw m0, m1
-%endif
+%if mmsize == 32
+    vpermq m0, m0, q3120
 %endif
     mova %1, m0
+%endif
 %endmacro
 
 %macro DEQUANT_LOOP 3
@@ -609,10 +616,8 @@ cglobal quant_4x4x4, 3,3,6
 %endrep
 %endmacro
 
-%if WIN64
+%if ARCH_X86_64
     DECLARE_REG_TMP 6,3,2
-%elif ARCH_X86_64
-    DECLARE_REG_TMP 4,3,2
 %else
     DECLARE_REG_TMP 2,0,1
 %endif
@@ -621,8 +626,8 @@ cglobal quant_4x4x4, 3,3,6
     movifnidn t2d, r2m
     imul t0d, t2d, 0x2b
     shr t0d, 8 ; i_qbits = i_qp / 6
-    lea t1, [t0*3]
-    sub t2d, t1d
+    lea t1d, [t0*5]
+    sub t2d, t0d
     sub t2d, t1d ; i_mf = i_qp % 6
     shl t2d, %1
 %if ARCH_X86_64
@@ -666,8 +671,8 @@ cglobal dequant_%1x%1_flat16, 0,3
 %endif
     imul t0d, t2d, 0x2b
     shr t0d, 8 ; i_qbits = i_qp / 6
-    lea t1, [t0*3]
-    sub t2d, t1d
+    lea t1d, [t0*5]
+    sub t2d, t0d
     sub t2d, t1d ; i_mf = i_qp % 6
     shl t2d, %2
 %ifdef PIC
@@ -719,11 +724,14 @@ cglobal dequant_%1x%1_flat16, 0,3
 
 %if HIGH_BIT_DEPTH
 INIT_XMM sse2
-DEQUANT 4, 4, 1
-DEQUANT 8, 6, 1
+DEQUANT 4, 4, 2
+DEQUANT 8, 6, 2
 INIT_XMM xop
-DEQUANT 4, 4, 1
-DEQUANT 8, 6, 1
+DEQUANT 4, 4, 2
+DEQUANT 8, 6, 2
+INIT_YMM avx2
+DEQUANT 4, 4, 4
+DEQUANT 8, 6, 4
 %else
 %if ARCH_X86_64 == 0
 INIT_MMX mmx