From 89f067b7cacecf413569e84c6c973c23f67b1ad3 Mon Sep 17 00:00:00 2001 From: Henrik Gramner Date: Mon, 6 May 2013 18:41:24 +0200 Subject: [PATCH] x86: AVX2 high bit-depth denoise_dct 28->15 cycles Also reorder instructions to use fewer registers, 3 cycles faster on Ivy Bridge with 64-bit Windows. --- common/quant.c | 1 + common/x86/quant-a.asm | 26 +++++++++++++------------- 2 files changed, 14 insertions(+), 13 deletions(-) diff --git a/common/quant.c b/common/quant.c index 24da5f99..d1f7b2fd 100644 --- a/common/quant.c +++ b/common/quant.c @@ -543,6 +543,7 @@ void x264_quant_init( x264_t *h, int cpu, x264_quant_function_t *pf ) pf->quant_4x4_dc = x264_quant_4x4_dc_avx2; pf->quant_8x8 = x264_quant_8x8_avx2; pf->quant_4x4x4 = x264_quant_4x4x4_avx2; + pf->denoise_dct = x264_denoise_dct_avx2; } #endif // HAVE_MMX #else // !HIGH_BIT_DEPTH diff --git a/common/x86/quant-a.asm b/common/x86/quant-a.asm index b6931815..7c2557b1 100644 --- a/common/x86/quant-a.asm +++ b/common/x86/quant-a.asm @@ -946,31 +946,29 @@ OPTIMIZE_CHROMA_2x2_DC ; void denoise_dct( int32_t *dct, uint32_t *sum, uint32_t *offset, int size ) ;----------------------------------------------------------------------------- %macro DENOISE_DCT 0 -cglobal denoise_dct, 4,4,8 - pxor m6, m6 +cglobal denoise_dct, 4,4,6 + pxor m5, m5 movsxdifnidn r3, r3d .loop: mova m2, [r0+r3*4-2*mmsize] mova m3, [r0+r3*4-1*mmsize] ABSD m0, m2 ABSD m1, m3 - mova m4, m0 - mova m5, m1 + paddd m4, m0, [r1+r3*4-2*mmsize] psubd m0, [r2+r3*4-2*mmsize] + mova [r1+r3*4-2*mmsize], m4 + paddd m4, m1, [r1+r3*4-1*mmsize] psubd m1, [r2+r3*4-1*mmsize] - pcmpgtd m7, m0, m6 - pand m0, m7 - pcmpgtd m7, m1, m6 - pand m1, m7 + mova [r1+r3*4-1*mmsize], m4 + pcmpgtd m4, m0, m5 + pand m0, m4 + pcmpgtd m4, m1, m5 + pand m1, m4 PSIGND m0, m2 PSIGND m1, m3 mova [r0+r3*4-2*mmsize], m0 mova [r0+r3*4-1*mmsize], m1 - paddd m4, [r1+r3*4-2*mmsize] - paddd m5, [r1+r3*4-1*mmsize] - mova [r1+r3*4-2*mmsize], m4 - mova [r1+r3*4-1*mmsize], m5 - sub r3, mmsize/2 + sub r3d, mmsize/2 jg .loop RET %endmacro @@ -985,6 +983,8 @@ INIT_XMM ssse3 DENOISE_DCT INIT_XMM avx DENOISE_DCT +INIT_YMM avx2 +DENOISE_DCT %else ; !HIGH_BIT_DEPTH -- 2.40.0