From 481e4cdb52989e4b514a2f4345870a19c5c0ae92 Mon Sep 17 00:00:00 2001 From: Henrik Gramner Date: Sat, 4 May 2013 18:48:58 +0200 Subject: [PATCH] x86: AVX2 high bit-depth quant quant_4x4: 13->6 cycles quant_4x4_dc: 14->8 cycles quant_8x8: 47->24 cycles quant_4x4x4: 48->25 cycles --- common/quant.c | 7 +++++++ common/x86/quant-a.asm | 34 +++++++++++++++++++++++++++++----- 2 files changed, 36 insertions(+), 5 deletions(-) diff --git a/common/quant.c b/common/quant.c index 3f70310f..24da5f99 100644 --- a/common/quant.c +++ b/common/quant.c @@ -537,6 +537,13 @@ void x264_quant_init( x264_t *h, int cpu, x264_quant_function_t *pf ) pf->dequant_8x8 = x264_dequant_8x8_xop; } } + if( cpu&X264_CPU_AVX2 ) + { + pf->quant_4x4 = x264_quant_4x4_avx2; + pf->quant_4x4_dc = x264_quant_4x4_dc_avx2; + pf->quant_8x8 = x264_quant_8x8_avx2; + pf->quant_4x4x4 = x264_quant_4x4x4_avx2; + } #endif // HAVE_MMX #else // !HIGH_BIT_DEPTH #if HAVE_MMX diff --git a/common/x86/quant-a.asm b/common/x86/quant-a.asm index 3cd453ba..b6931815 100644 --- a/common/x86/quant-a.asm +++ b/common/x86/quant-a.asm @@ -7,7 +7,7 @@ ;* Fiona Glaser ;* Christian Heine ;* Oskar Arvidsson -;* Henrik Gramner +;* Henrik Gramner ;* ;* This program is free software; you can redistribute it and/or modify ;* it under the terms of the GNU General Public License as published by @@ -238,10 +238,10 @@ cextern popcnt_table mova [%1 ], m2 mova [%1+mmsize], m3 ACCUM por, %5, 2, %4 - ACCUM por, %5, 3, %4+mmsize + por m%5, m3 %else ; !sse4 QUANT_ONE_AC_MMX %1, %2, %3, %4, %5 - QUANT_ONE_AC_MMX %1+mmsize, %2+mmsize, %3+mmsize, %4+mmsize, %5 + QUANT_ONE_AC_MMX %1+mmsize, %2+mmsize, %3+mmsize, 1, %5 %endif ; cpuflag %endmacro @@ -279,8 +279,8 @@ cglobal quant_%1x%2, 3,3,8 %endmacro %macro QUANT_4x4 2 - QUANT_TWO_AC r0+%1+mmsize*0, r1+mmsize*0, r2+mmsize*0, mmsize*0, %2 - QUANT_TWO_AC r0+%1+mmsize*2, r1+mmsize*2, r2+mmsize*2, mmsize*2, %2 + QUANT_TWO_AC r0+%1+mmsize*0, r1+mmsize*0, r2+mmsize*0, 0, %2 + QUANT_TWO_AC r0+%1+mmsize*2, r1+mmsize*2, r2+mmsize*2, 1, %2 %endmacro %macro QUANT_4x4x4 0 @@ -324,6 +324,30 @@ QUANT_AC 4, 4 QUANT_AC 8, 8 QUANT_4x4x4 +INIT_YMM avx2 +QUANT_DC 4, 4 +QUANT_AC 4, 4 +QUANT_AC 8, 8 + +INIT_YMM avx2 +cglobal quant_4x4x4, 3,3,6 + QUANT_TWO_AC r0, r1, r2, 0, 4 + QUANT_TWO_AC r0+64, r1, r2, 0, 5 + add r0, 128 + packssdw m4, m5 + QUANT_TWO_AC r0, r1, r2, 0, 5 + QUANT_TWO_AC r0+64, r1, r2, 0, 1 + packssdw m5, m1 + packssdw m4, m5 + pxor m3, m3 + pcmpeqd m4, m3 + movmskps eax, m4 + mov edx, eax + shr eax, 4 + and eax, edx + xor eax, 0xf + RET + %endif ; HIGH_BIT_DEPTH %if HIGH_BIT_DEPTH == 0 -- 2.40.0