From e7cb328580c3e1bd7604a64f40abf3e03c474771 Mon Sep 17 00:00:00 2001 From: Henrik Gramner Date: Tue, 14 May 2013 18:57:40 +0200 Subject: [PATCH] x86: AVX2 dequant_4x4_dc --- common/quant.c | 5 +-- common/x86/quant-a.asm | 75 ++++++++++++++++++++++++------------------ common/x86/quant.h | 1 + 3 files changed, 47 insertions(+), 34 deletions(-) diff --git a/common/quant.c b/common/quant.c index 2fa1521d..bb87d70a 100644 --- a/common/quant.c +++ b/common/quant.c @@ -545,6 +545,7 @@ void x264_quant_init( x264_t *h, int cpu, x264_quant_function_t *pf ) pf->quant_4x4x4 = x264_quant_4x4x4_avx2; pf->dequant_4x4 = x264_dequant_4x4_avx2; pf->dequant_8x8 = x264_dequant_8x8_avx2; + pf->dequant_4x4_dc = x264_dequant_4x4dc_avx2; pf->denoise_dct = x264_denoise_dct_avx2; } #endif // HAVE_MMX @@ -691,10 +692,9 @@ void x264_quant_init( x264_t *h, int cpu, x264_quant_function_t *pf ) pf->quant_4x4_dc = x264_quant_4x4_dc_avx2; pf->quant_8x8 = x264_quant_8x8_avx2; pf->quant_4x4x4 = x264_quant_4x4x4_avx2; - if( cpu&X264_CPU_LZCNT ) - pf->coeff_last[DCT_LUMA_8x8] = x264_coeff_last64_avx2_lzcnt; pf->dequant_4x4 = x264_dequant_4x4_avx2; pf->dequant_8x8 = x264_dequant_8x8_avx2; + pf->dequant_4x4_dc = x264_dequant_4x4dc_avx2; if( h->param.i_cqm_preset == X264_CQM_FLAT ) { pf->dequant_4x4 = x264_dequant_4x4_flat16_avx2; @@ -704,6 +704,7 @@ void x264_quant_init( x264_t *h, int cpu, x264_quant_function_t *pf ) pf->denoise_dct = x264_denoise_dct_avx2; if( cpu&X264_CPU_LZCNT ) { + pf->coeff_last[DCT_LUMA_8x8] = x264_coeff_last64_avx2_lzcnt; pf->coeff_level_run[ DCT_LUMA_AC] = x264_coeff_level_run15_avx2_lzcnt; pf->coeff_level_run[DCT_LUMA_4x4] = x264_coeff_level_run16_avx2_lzcnt; } diff --git a/common/x86/quant-a.asm b/common/x86/quant-a.asm index 02328a9a..def485ee 100644 --- a/common/x86/quant-a.asm +++ b/common/x86/quant-a.asm @@ -757,55 +757,62 @@ cglobal dequant_4x4dc, 0,3,6 DEQUANT_START 6, 6 .lshift: - movd m3, [r1] - movd m2, t0d - pslld m3, m2 - SPLAT%1 m3, m3, 0 -%assign x 0 -%rep SIZEOF_PIXEL*16/mmsize - mova m0, [r0+mmsize*0+x] - mova m1, [r0+mmsize*1+x] - %2 m0, m3 - %2 m1, m3 - mova [r0+mmsize*0+x], m0 - mova [r0+mmsize*1+x], m1 -%assign x x+mmsize*2 +%if cpuflag(avx2) + vpbroadcastdct m3, [r1] +%else + movd xm3, [r1] + SPLAT%1 m3, xm3 +%endif + movd xm2, t0d + pslld m3, xm2 +%assign %%x 0 +%rep SIZEOF_PIXEL*32/mmsize + %2 m0, m3, [r0+%%x] + mova [r0+%%x], m0 +%assign %%x %%x+mmsize %endrep RET .rshift32: - neg t0d - movd m3, t0d - mova m4, [p%1_1] - mova m5, m4 - pslld m4, m3 - psrld m4, 1 - movd m2, [r1] -%assign x 0 + neg t0d +%if cpuflag(avx2) + vpbroadcastdct m2, [r1] +%else + movd xm2, [r1] +%endif + mova m5, [p%1_1] + movd xm3, t0d + pslld m4, m5, xm3 + psrld m4, 1 %if HIGH_BIT_DEPTH - pshufd m2, m2, 0 +%if notcpuflag(avx2) + pshufd m2, m2, 0 +%endif +%assign %%x 0 %rep SIZEOF_PIXEL*32/mmsize - mova m0, [r0+x] - pmadcswd m0, m0, m2, m4 - psrad m0, m3 - mova [r0+x], m0 -%assign x x+mmsize + pmadcswd m0, m2, [r0+%%x], m4 + psrad m0, xm3 + mova [r0+%%x], m0 +%assign %%x %%x+mmsize %endrep %else ; !HIGH_BIT_DEPTH +%if notcpuflag(avx2) PSHUFLW m2, m2, 0 +%endif punpcklwd m2, m4 +%assign %%x 0 %rep SIZEOF_PIXEL*32/mmsize - mova m0, [r0+x] + mova m0, [r0+%%x] punpckhwd m1, m0, m5 punpcklwd m0, m5 pmaddwd m0, m2 pmaddwd m1, m2 - psrad m0, m3 - psrad m1, m3 + psrad m0, xm3 + psrad m1, xm3 packssdw m0, m1 - mova [r0+x], m0 -%assign x x+mmsize + mova [r0+%%x], m0 +%assign %%x %%x+mmsize %endrep %endif ; !HIGH_BIT_DEPTH RET @@ -816,6 +823,8 @@ INIT_XMM sse2 DEQUANT_DC d, pmaddwd INIT_XMM xop DEQUANT_DC d, pmaddwd +INIT_YMM avx2 +DEQUANT_DC d, pmaddwd %else %if ARCH_X86_64 == 0 INIT_MMX mmx2 @@ -825,6 +834,8 @@ INIT_XMM sse2 DEQUANT_DC w, pmullw INIT_XMM avx DEQUANT_DC w, pmullw +INIT_YMM avx2 +DEQUANT_DC w, pmullw %endif ; t4 is eax for return value. diff --git a/common/x86/quant.h b/common/x86/quant.h index 5541db03..089942ad 100644 --- a/common/x86/quant.h +++ b/common/x86/quant.h @@ -65,6 +65,7 @@ void x264_dequant_4x4_xop( dctcoef dct[16], int dequant_mf[6][16], int i_qp ); void x264_dequant_4x4dc_xop( dctcoef dct[16], int dequant_mf[6][16], int i_qp ); void x264_dequant_8x8_xop( dctcoef dct[64], int dequant_mf[6][64], int i_qp ); void x264_dequant_4x4_avx2( dctcoef dct[16], int dequant_mf[6][16], int i_qp ); +void x264_dequant_4x4dc_avx2( dctcoef dct[16], int dequant_mf[6][16], int i_qp ); void x264_dequant_8x8_avx2( dctcoef dct[64], int dequant_mf[6][64], int i_qp ); void x264_dequant_4x4_flat16_mmx( int16_t dct[16], int dequant_mf[6][16], int i_qp ); void x264_dequant_8x8_flat16_mmx( int16_t dct[64], int dequant_mf[6][64], int i_qp ); -- 2.40.0