From: Henrik Gramner <henrik@gramner.com>
Date: Mon, 4 Aug 2014 23:42:47 +0000 (+0200)
Subject: x86: Faster quant_4x4x4
X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=98100b88b475227f375d9bcbaea0bac57008accc;p=libx264

x86: Faster quant_4x4x4

Also drop the MMX version instead of doing a bunch of ifdeffery to support it after this change.
---

diff --git a/common/quant.c b/common/quant.c
index 3515b2e6..d7b69115 100644
--- a/common/quant.c
+++ b/common/quant.c
@@ -559,7 +559,6 @@ void x264_quant_init( x264_t *h, int cpu, x264_quant_function_t *pf )
     {
 #if ARCH_X86
         pf->quant_4x4 = x264_quant_4x4_mmx;
-        pf->quant_4x4x4 = x264_quant_4x4x4_mmx;
         pf->quant_8x8 = x264_quant_8x8_mmx;
         pf->dequant_4x4 = x264_dequant_4x4_mmx;
         pf->dequant_4x4_dc = x264_dequant_4x4dc_mmx2;
diff --git a/common/x86/quant-a.asm b/common/x86/quant-a.asm
index ed01c372..fb588d36 100644
--- a/common/x86/quant-a.asm
+++ b/common/x86/quant-a.asm
@@ -292,14 +292,11 @@ cglobal quant_4x4x4, 3,3,8
     QUANT_4x4  0, 6
     QUANT_4x4 64, 7
     packssdw  m6, m7
-    packssdw  m5, m6
-    packssdw  m5, m5  ; AA BB CC DD
-    packsswb  m5, m5  ; A B C D
+    packssdw  m5, m6  ; AAAA BBBB CCCC DDDD
     pxor      m4, m4
-    pcmpeqb   m5, m4
-    pmovmskb eax, m5
-    not      eax
-    and      eax, 0xf
+    pcmpeqd   m5, m4
+    movmskps eax, m5
+    xor      eax, 0xf
     RET
 %endmacro
 
@@ -444,16 +441,11 @@ cglobal quant_4x4x4, 3,3,7
     QUANT_4x4 64, 5
     QUANT_4x4 96, 6
     packssdw  m5, m6
-    packssdw  m4, m5
-%if mmsize == 16
-    packssdw  m4, m4  ; AA BB CC DD
-%endif
-    packsswb  m4, m4  ; A B C D
+    packssdw  m4, m5  ; AAAA BBBB CCCC DDDD
     pxor      m3, m3
-    pcmpeqb   m4, m3
-    pmovmskb eax, m4
-    not      eax
-    and      eax, 0xf
+    pcmpeqd   m4, m3
+    movmskps eax, m4
+    xor      eax, 0xf
     RET
 %endmacro
 
@@ -464,7 +456,6 @@ QUANT_DC quant_4x4_dc, 4
 INIT_MMX mmx
 QUANT_AC quant_4x4, 4
 QUANT_AC quant_8x8, 16
-QUANT_4x4x4
 %endif
 
 INIT_XMM sse2
diff --git a/common/x86/quant.h b/common/x86/quant.h
index 5adc6877..1fcb8001 100644
--- a/common/x86/quant.h
+++ b/common/x86/quant.h
@@ -31,7 +31,6 @@
 int x264_quant_2x2_dc_mmx2( dctcoef dct[4], int mf, int bias );
 int x264_quant_4x4_dc_mmx2( dctcoef dct[16], int mf, int bias );
 int x264_quant_4x4_mmx( dctcoef dct[16], udctcoef mf[16], udctcoef bias[16] );
-int x264_quant_4x4x4_mmx( dctcoef dct[4][16], udctcoef mf[16], udctcoef bias[16] );
 int x264_quant_8x8_mmx( dctcoef dct[64], udctcoef mf[64], udctcoef bias[64] );
 int x264_quant_2x2_dc_sse2( dctcoef dct[16], int mf, int bias );
 int x264_quant_4x4_dc_sse2( dctcoef dct[16], int mf, int bias );