From 86ca49033ad85660dc88f82ab721263f4d29290e Mon Sep 17 00:00:00 2001
From: Laurent Aimar <fenrir@videolan.org>
Date: Fri, 23 Jul 2004 18:14:59 +0000
Subject: [PATCH]  * encoder.c, analyse.c, macroblock: fixed when using a qp
 per MB.  (Buggy for pskip and mb with null cbp luma and chroma).  * dct*:
 fixed order of idct.

git-svn-id: svn://svn.videolan.org/x264/trunk@13 df754926-b1dd-0310-bc7b-ec298dee348c
---
 core/dct.c           | 37 +++++++++++++++++++------------------
 core/i386/dct.asm    | 20 ++++++++++----------
 core/macroblock.c    |  3 +++
 encoder/analyse.c    | 17 +++++++++++------
 encoder/encoder.c    |  2 +-
 encoder/macroblock.c | 12 ++++++++++--
 6 files changed, 54 insertions(+), 37 deletions(-)

diff --git a/core/dct.c b/core/dct.c
index 65aab0cf..6e3a16fc 100644
--- a/core/dct.c
+++ b/core/dct.c
@@ -197,30 +197,31 @@ static void add4x4_idct( uint8_t *p_dst, int i_dst, int16_t dct[4][4] )
 
     for( i = 0; i < 4; i++ )
     {
-        const int s02 = dct[0][i]      + dct[2][i];
-        const int d02 = dct[0][i]      - dct[2][i];
-        const int s13 = dct[1][i]      + (dct[3][i]>>1);
-        const int d13 = (dct[1][i]>>1) -  dct[3][i];
-
-        tmp[0][i] = s02 + s13;
-        tmp[1][i] = d02 + d13;
-        tmp[2][i] = d02 - d13;
-        tmp[3][i] = s02 - s13;
+        const int s02 =  dct[i][0]     +  dct[i][2];
+        const int d02 =  dct[i][0]     -  dct[i][2];
+        const int s13 =  dct[i][1]     + (dct[i][3]>>1);
+        const int d13 = (dct[i][1]>>1) -  dct[i][3];
+
+        tmp[i][0] = s02 + s13;
+        tmp[i][1] = d02 + d13;
+        tmp[i][2] = d02 - d13;
+        tmp[i][3] = s02 - s13;
     }
 
     for( i = 0; i < 4; i++ )
     {
-        const int s02 =  tmp[i][0]     +  tmp[i][2];
-        const int d02 =  tmp[i][0]     -  tmp[i][2];
-        const int s13 =  tmp[i][1]     + (tmp[i][3]>>1);
-        const int d13 = (tmp[i][1]>>1) -  tmp[i][3];
-
-        d[i][0] = ( s02 + s13 + 32 ) >> 6;
-        d[i][1] = ( d02 + d13 + 32 ) >> 6;
-        d[i][2] = ( d02 - d13 + 32 ) >> 6;
-        d[i][3] = ( s02 - s13 + 32 ) >> 6;
+        const int s02 =  tmp[0][i]     +  tmp[2][i];
+        const int d02 =  tmp[0][i]     -  tmp[2][i];
+        const int s13 =  tmp[1][i]     + (tmp[3][i]>>1);
+        const int d13 = (tmp[1][i]>>1) -   tmp[3][i];
+
+        d[0][i] = ( s02 + s13 + 32 ) >> 6;
+        d[1][i] = ( d02 + d13 + 32 ) >> 6;
+        d[2][i] = ( d02 - d13 + 32 ) >> 6;
+        d[3][i] = ( s02 - s13 + 32 ) >> 6;
     }
 
+
     for( y = 0; y < 4; y++ )
     {
         for( x = 0; x < 4; x++ )
diff --git a/core/i386/dct.asm b/core/i386/dct.asm
index 054daba7..92dbc5ae 100644
--- a/core/i386/dct.asm
+++ b/core/i386/dct.asm
@@ -277,14 +277,17 @@ x264_add4x4_idct_mmxext:
     ; Load dct coeffs
     mov     eax, [esp+12]   ; dct
     movq    mm0, [eax+ 0]
-    movq    mm1, [eax+ 8]
-    movq    mm2, [eax+16]
-    movq    mm3, [eax+24]
+    movq    mm4, [eax+ 8]
+    movq    mm3, [eax+16]
+    movq    mm1, [eax+24]
     
     mov     eax, [esp+ 4]   ; p_dst
     mov     ecx, [esp+ 8]   ; i_dst
     lea     edx, [ecx+ecx*2]
 
+    ; out:mm0, mm1, mm2, mm3
+    MMX_TRANSPOSE       mm0, mm4, mm3, mm1, mm2
+
     MMX_SUMSUB_BA       mm2, mm0                        ; mm2=s02  mm0=d02
     MMX_SUMSUBD2_AB     mm1, mm3, mm5, mm4              ; mm1=s13  mm4=d13 ( well 1 + 3>>1 and 1>>1 + 3)
 
@@ -298,16 +301,13 @@ x264_add4x4_idct_mmxext:
 
     MMX_SUMSUB_BADC     mm2, mm3, mm4, mm1              ; mm2=s02+s13  mm3=s02-s13  mm4=d02+d13  mm1=d02-d13
 
-    ; in: mm2, mm4, mm1, mm3  out: mm2, mm3, mm0, mm1
-    MMX_TRANSPOSE       mm2, mm4, mm1, mm3, mm0
-
     MMX_ZERO            mm7
     movq                mm6, [x264_mmx_32]
     
-    MMX_STORE_DIFF_4P   mm2, mm4, mm6, mm7, [eax]
-    MMX_STORE_DIFF_4P   mm3, mm4, mm6, mm7, [eax+ecx]
-    MMX_STORE_DIFF_4P   mm0, mm4, mm6, mm7, [eax+ecx*2]
-    MMX_STORE_DIFF_4P   mm1, mm4, mm6, mm7, [eax+edx]
+    MMX_STORE_DIFF_4P   mm2, mm0, mm6, mm7, [eax]
+    MMX_STORE_DIFF_4P   mm4, mm0, mm6, mm7, [eax+ecx]
+    MMX_STORE_DIFF_4P   mm1, mm0, mm6, mm7, [eax+ecx*2]
+    MMX_STORE_DIFF_4P   mm3, mm0, mm6, mm7, [eax+edx]
 
     ret
 
diff --git a/core/macroblock.c b/core/macroblock.c
index 59603f03..d03413c6 100644
--- a/core/macroblock.c
+++ b/core/macroblock.c
@@ -892,6 +892,9 @@ void x264_macroblock_cache_save( x264_t *h )
 
     int i;
 
+    if( IS_SKIP( h->mb.i_type ) )
+        h->mb.qp[i_mb_xy] = h->mb.i_last_qp;
+
     h->mb.i_last_dqp = h->mb.qp[i_mb_xy] - h->mb.i_last_qp;
     h->mb.i_last_qp = h->mb.qp[i_mb_xy];
 
diff --git a/encoder/analyse.c b/encoder/analyse.c
index 77e2c316..e4a7a025 100644
--- a/encoder/analyse.c
+++ b/encoder/analyse.c
@@ -847,9 +847,15 @@ void x264_macroblock_analyse( x264_t *h )
     x264_mb_analysis_t analysis;
     int i;
 
-    /* qp TODO */
+    /* qp TODO implement a nice RC */
     h->mb.qp[h->mb.i_mb_xy] = x264_clip3( h->pps->i_pic_init_qp + h->sh.i_qp_delta + 0, 0, 51 );
 
+    /* FIXME check if it's 12 */
+    if( h->mb.qp[h->mb.i_mb_xy] - h->mb.i_last_qp < -12 )
+        h->mb.qp[h->mb.i_mb_xy] = h->mb.i_last_qp - 12;
+    else if( h->mb.qp[h->mb.i_mb_xy] - h->mb.i_last_qp > 12 )
+        h->mb.qp[h->mb.i_mb_xy] = h->mb.i_last_qp + 12;
+
     /* init analysis */
     x264_mb_analyse_init( h, &analysis, h->mb.qp[h->mb.i_mb_xy] );
 
@@ -871,11 +877,10 @@ void x264_macroblock_analyse( x264_t *h )
         int i_cost;
 
         /* Fast P_SKIP detection */
-        if( analysis.i_qp == h->mb.i_last_qp &&
-            ( ( (i_neighbour&MB_LEFT) && h->mb.type[h->mb.i_mb_xy - 1] == P_SKIP ) ||
-              ( (i_neighbour&MB_TOP) && h->mb.type[h->mb.i_mb_xy - h->mb.i_mb_stride] == P_SKIP ) ||
-              ( ((i_neighbour&(MB_TOP|MB_LEFT)) == (MB_TOP|MB_LEFT) ) && h->mb.type[h->mb.i_mb_xy - h->mb.i_mb_stride-1 ] == P_SKIP ) ||
-              ( (i_neighbour&MB_TOPRIGHT) && h->mb.type[h->mb.i_mb_xy - h->mb.i_mb_stride+1 ] == P_SKIP ) ) )
+        if( ( (i_neighbour&MB_LEFT) && h->mb.type[h->mb.i_mb_xy - 1] == P_SKIP ) ||
+            ( (i_neighbour&MB_TOP) && h->mb.type[h->mb.i_mb_xy - h->mb.i_mb_stride] == P_SKIP ) ||
+            ( ((i_neighbour&(MB_TOP|MB_LEFT)) == (MB_TOP|MB_LEFT) ) && h->mb.type[h->mb.i_mb_xy - h->mb.i_mb_stride-1 ] == P_SKIP ) ||
+            ( (i_neighbour&MB_TOPRIGHT) && h->mb.type[h->mb.i_mb_xy - h->mb.i_mb_stride+1 ] == P_SKIP ) )
         {
             b_skip = x264_macroblock_probe_pskip( h );
         }
diff --git a/encoder/encoder.c b/encoder/encoder.c
index 73ca380b..3188bb73 100644
--- a/encoder/encoder.c
+++ b/encoder/encoder.c
@@ -37,7 +37,7 @@
 #include "macroblock.h"
 
 //#define DEBUG_MB_TYPE
-//#define DEBUG_DUMP_FRAME 1
+#define DEBUG_DUMP_FRAME 1
 
 static int64_t i_mtime_encode_frame = 0;
 
diff --git a/encoder/macroblock.c b/encoder/macroblock.c
index 353f3d4b..17bea963 100644
--- a/encoder/macroblock.c
+++ b/encoder/macroblock.c
@@ -724,12 +724,19 @@ void x264_macroblock_encode( x264_t *h )
     /* store cbp */
     h->mb.cbp[h->mb.i_mb_xy] = (i_cbp_dc << 8) | (h->mb.i_cbp_chroma << 4) | h->mb.i_cbp_luma;
 
+    if( h->mb.i_type != I_16x16 && h->mb.i_cbp_luma == 0 && h->mb.i_cbp_chroma == 0 )
+    {
+        /* It won't change anything at the decoder side, but it is needed;
+         * otherwise the decoder will fail to read the next QP */
+        h->mb.qp[h->mb.i_mb_xy] = h->mb.i_last_qp;
+    }
+
+
     /* Check for P_SKIP
      * XXX: in the me perhaps we should take x264_mb_predict_mv_pskip into account
      *      (if multiple mv give same result)*/
     if( h->mb.i_type == P_L0 && h->mb.i_partition == D_16x16 &&
-        h->mb.i_cbp_luma == 0x00 && h->mb.i_cbp_chroma== 0x00 &&
-        h->mb.qp[h->mb.i_mb_xy] == h->mb.i_last_qp )
+        h->mb.i_cbp_luma == 0x00 && h->mb.i_cbp_chroma== 0x00 )
     {
         if( h->mb.cache.ref[0][x264_scan8[0]] == 0 )
         {
@@ -740,6 +747,7 @@ void x264_macroblock_encode( x264_t *h )
                 h->mb.cache.mv[0][x264_scan8[0]][1] == mvp[1] )
             {
                 h->mb.type[h->mb.i_mb_xy] = h->mb.i_type = P_SKIP;
+                h->mb.qp[h->mb.i_mb_xy] = h->mb.i_last_qp;  /* Needed */
             }
         }
     }
-- 
2.40.0