From: Henrik Gramner Date: Mon, 4 Aug 2014 23:42:47 +0000 (+0200) Subject: x86: Faster quant_4x4x4 X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=98100b88b475227f375d9bcbaea0bac57008accc;p=libx264 x86: Faster quant_4x4x4 Also drop the MMX version instead of doing a bunch of ifdeffery to support it after this change. --- diff --git a/common/quant.c b/common/quant.c index 3515b2e6..d7b69115 100644 --- a/common/quant.c +++ b/common/quant.c @@ -559,7 +559,6 @@ void x264_quant_init( x264_t *h, int cpu, x264_quant_function_t *pf ) { #if ARCH_X86 pf->quant_4x4 = x264_quant_4x4_mmx; - pf->quant_4x4x4 = x264_quant_4x4x4_mmx; pf->quant_8x8 = x264_quant_8x8_mmx; pf->dequant_4x4 = x264_dequant_4x4_mmx; pf->dequant_4x4_dc = x264_dequant_4x4dc_mmx2; diff --git a/common/x86/quant-a.asm b/common/x86/quant-a.asm index ed01c372..fb588d36 100644 --- a/common/x86/quant-a.asm +++ b/common/x86/quant-a.asm @@ -292,14 +292,11 @@ cglobal quant_4x4x4, 3,3,8 QUANT_4x4 0, 6 QUANT_4x4 64, 7 packssdw m6, m7 - packssdw m5, m6 - packssdw m5, m5 ; AA BB CC DD - packsswb m5, m5 ; A B C D + packssdw m5, m6 ; AAAA BBBB CCCC DDDD pxor m4, m4 - pcmpeqb m5, m4 - pmovmskb eax, m5 - not eax - and eax, 0xf + pcmpeqd m5, m4 + movmskps eax, m5 + xor eax, 0xf RET %endmacro @@ -444,16 +441,11 @@ cglobal quant_4x4x4, 3,3,7 QUANT_4x4 64, 5 QUANT_4x4 96, 6 packssdw m5, m6 - packssdw m4, m5 -%if mmsize == 16 - packssdw m4, m4 ; AA BB CC DD -%endif - packsswb m4, m4 ; A B C D + packssdw m4, m5 ; AAAA BBBB CCCC DDDD pxor m3, m3 - pcmpeqb m4, m3 - pmovmskb eax, m4 - not eax - and eax, 0xf + pcmpeqd m4, m3 + movmskps eax, m4 + xor eax, 0xf RET %endmacro @@ -464,7 +456,6 @@ QUANT_DC quant_4x4_dc, 4 INIT_MMX mmx QUANT_AC quant_4x4, 4 QUANT_AC quant_8x8, 16 -QUANT_4x4x4 %endif INIT_XMM sse2 diff --git a/common/x86/quant.h b/common/x86/quant.h index 5adc6877..1fcb8001 100644 --- a/common/x86/quant.h +++ b/common/x86/quant.h @@ -31,7 +31,6 @@ int x264_quant_2x2_dc_mmx2( dctcoef dct[4], int mf, int bias ); int x264_quant_4x4_dc_mmx2( dctcoef dct[16], int mf, int bias ); int x264_quant_4x4_mmx( dctcoef dct[16], udctcoef mf[16], udctcoef bias[16] ); -int x264_quant_4x4x4_mmx( dctcoef dct[4][16], udctcoef mf[16], udctcoef bias[16] ); int x264_quant_8x8_mmx( dctcoef dct[64], udctcoef mf[64], udctcoef bias[64] ); int x264_quant_2x2_dc_sse2( dctcoef dct[16], int mf, int bias ); int x264_quant_4x4_dc_sse2( dctcoef dct[16], int mf, int bias );