From 9df377f87702c82a2202d34919c07e32c60b40ae Mon Sep 17 00:00:00 2001
From: Anton Mitrofanov <BugMaster@narod.ru>
Date: Thu, 28 Aug 2014 20:13:13 +0400
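Subject: [PATCH] Fix inappropriate instruction use

The x86 assembly for dct4x4dc, quant_4x4 and quant_8x8 is now built
under "INIT_MMX mmx2" instead of "INIT_MMX mmx", so the functions are
renamed from *_mmx to *_mmx2 and their prototypes updated to match.

Runtime dispatch follows the rename: dct4x4dc is now assigned inside the
X264_CPU_MMX2 block of x264_dct_init(), and quant_4x4/quant_8x8 are no
longer assigned inside the X264_CPU_MMX block of x264_quant_init() but
next to the other *_mmx2 quant functions.

In pixel-a.asm, the pixel_satd_4x4 variant at the top of the SATDS_SSE2
macro is now assembled only when cpuflag(ssse3) is set.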

---
 common/dct.c           | 2 +-
 common/quant.c         | 4 ++--
 common/x86/dct-a.asm   | 2 +-
 common/x86/dct.h       | 2 +-
 common/x86/pixel-a.asm | 2 +-
 common/x86/quant-a.asm | 2 +-
 common/x86/quant.h     | 4 ++--
 7 files changed, 9 insertions(+), 9 deletions(-)

diff --git a/common/dct.c b/common/dct.c
index f5900efd..08f4e893 100644
--- a/common/dct.c
+++ b/common/dct.c
@@ -611,7 +611,6 @@ void x264_dct_init( int cpu, x264_dct_function_t *dctf )
     {
         dctf->sub4x4_dct    = x264_sub4x4_dct_mmx;
         dctf->add4x4_idct   = x264_add4x4_idct_mmx;
-        dctf->dct4x4dc      = x264_dct4x4dc_mmx;
         dctf->idct4x4dc     = x264_idct4x4dc_mmx;
         dctf->sub8x8_dct_dc = x264_sub8x8_dct_dc_mmx2;
 
@@ -630,6 +629,7 @@ void x264_dct_init( int cpu, x264_dct_function_t *dctf )
 
     if( cpu&X264_CPU_MMX2 )
     {
+        dctf->dct4x4dc         = x264_dct4x4dc_mmx2;
         dctf->add8x8_idct_dc   = x264_add8x8_idct_dc_mmx2;
         dctf->add16x16_idct_dc = x264_add16x16_idct_dc_mmx2;
     }
diff --git a/common/quant.c b/common/quant.c
index d7b69115..31d8901d 100644
--- a/common/quant.c
+++ b/common/quant.c
@@ -558,8 +558,6 @@ void x264_quant_init( x264_t *h, int cpu, x264_quant_function_t *pf )
     if( cpu&X264_CPU_MMX )
     {
 #if ARCH_X86
-        pf->quant_4x4 = x264_quant_4x4_mmx;
-        pf->quant_8x8 = x264_quant_8x8_mmx;
         pf->dequant_4x4 = x264_dequant_4x4_mmx;
         pf->dequant_4x4_dc = x264_dequant_4x4dc_mmx2;
         pf->dequant_8x8 = x264_dequant_8x8_mmx;
@@ -576,6 +574,8 @@ void x264_quant_init( x264_t *h, int cpu, x264_quant_function_t *pf )
     {
         pf->quant_2x2_dc = x264_quant_2x2_dc_mmx2;
 #if ARCH_X86
+        pf->quant_4x4 = x264_quant_4x4_mmx2;
+        pf->quant_8x8 = x264_quant_8x8_mmx2;
         pf->quant_4x4_dc = x264_quant_4x4_dc_mmx2;
         pf->decimate_score15 = x264_decimate_score15_mmx2;
         pf->decimate_score16 = x264_decimate_score16_mmx2;
diff --git a/common/x86/dct-a.asm b/common/x86/dct-a.asm
index 4376e369..bc82ff63 100644
--- a/common/x86/dct-a.asm
+++ b/common/x86/dct-a.asm
@@ -143,7 +143,7 @@ INIT_XMM avx
 DCT4x4_DC
 %else
 
-INIT_MMX mmx
+INIT_MMX mmx2
 cglobal dct4x4dc, 1,1
     movq   m3, [r0+24]
     movq   m2, [r0+16]
diff --git a/common/x86/dct.h b/common/x86/dct.h
index 337a6327..f22a979a 100644
--- a/common/x86/dct.h
+++ b/common/x86/dct.h
@@ -70,7 +70,7 @@ void x264_add8x8_idct_dc_avx    ( pixel   *p_dst, dctcoef dct    [ 4] );
 void x264_add16x16_idct_dc_avx  ( pixel   *p_dst, dctcoef dct    [16] );
 void x264_add16x16_idct_dc_avx2 ( uint8_t *p_dst, int16_t dct    [16] );
 
-void x264_dct4x4dc_mmx       ( int16_t d[16] );
+void x264_dct4x4dc_mmx2      ( int16_t d[16] );
 void x264_dct4x4dc_sse2      ( int32_t d[16] );
 void x264_dct4x4dc_avx       ( int32_t d[16] );
 void x264_idct4x4dc_mmx      ( int16_t d[16] );
diff --git a/common/x86/pixel-a.asm b/common/x86/pixel-a.asm
index 262c5377..f5f6a82e 100644
--- a/common/x86/pixel-a.asm
+++ b/common/x86/pixel-a.asm
@@ -1600,7 +1600,7 @@ cglobal pixel_satd_4x4, 4,6
 %macro SATDS_SSE2 0
 %define vertical ((notcpuflag(ssse3) || cpuflag(atom)) || HIGH_BIT_DEPTH)
 
-%if vertical==0 || HIGH_BIT_DEPTH
+%if cpuflag(ssse3) && (vertical==0 || HIGH_BIT_DEPTH)
 cglobal pixel_satd_4x4, 4, 6, 6
     SATD_START_MMX
     mova m4, [hmul_4p]
diff --git a/common/x86/quant-a.asm b/common/x86/quant-a.asm
index fb588d36..731f7d15 100644
--- a/common/x86/quant-a.asm
+++ b/common/x86/quant-a.asm
@@ -453,7 +453,7 @@ INIT_MMX mmx2
 QUANT_DC quant_2x2_dc, 1
 %if ARCH_X86_64 == 0 ; not needed because sse2 is faster
 QUANT_DC quant_4x4_dc, 4
-INIT_MMX mmx
+INIT_MMX mmx2
 QUANT_AC quant_4x4, 4
 QUANT_AC quant_8x8, 16
 %endif
diff --git a/common/x86/quant.h b/common/x86/quant.h
index 1fcb8001..c6a8a9b1 100644
--- a/common/x86/quant.h
+++ b/common/x86/quant.h
@@ -30,8 +30,8 @@
 
 int x264_quant_2x2_dc_mmx2( dctcoef dct[4], int mf, int bias );
 int x264_quant_4x4_dc_mmx2( dctcoef dct[16], int mf, int bias );
-int x264_quant_4x4_mmx( dctcoef dct[16], udctcoef mf[16], udctcoef bias[16] );
-int x264_quant_8x8_mmx( dctcoef dct[64], udctcoef mf[64], udctcoef bias[64] );
+int x264_quant_4x4_mmx2( dctcoef dct[16], udctcoef mf[16], udctcoef bias[16] );
+int x264_quant_8x8_mmx2( dctcoef dct[64], udctcoef mf[64], udctcoef bias[64] );
 int x264_quant_2x2_dc_sse2( dctcoef dct[16], int mf, int bias );
 int x264_quant_4x4_dc_sse2( dctcoef dct[16], int mf, int bias );
 int x264_quant_4x4_sse2( dctcoef dct[16], udctcoef mf[16], udctcoef bias[16] );
-- 
2.40.0