granicus.if.org Git - libx264/commitdiff
configure: use -falign-loops=16 on OS X
author    Eric Petit <titer@videolan.org>
          Tue, 12 Apr 2005 18:45:24 +0000 (18:45 +0000)
committer Eric Petit <titer@videolan.org>
          Tue, 12 Apr 2005 18:45:24 +0000 (18:45 +0000)
 common/ppc/: added AltiVecized mc_chroma + cleaning
 checkasm.c:  really fixed MC tests

git-svn-id: svn://svn.videolan.org/x264/trunk@199 df754926-b1dd-0310-bc7b-ec298dee348c

common/ppc/mc.c
common/ppc/pixel.c
common/ppc/ppccommon.h
configure
testing/checkasm.c

index 14ea74be079eb016e7238f375220f68392e9b8f6..bf41d210b5d623858e28b7ca79e02277e1892313 100644 (file)
--- a/common/ppc/mc.c
+++ b/common/ppc/mc.c
@@ -87,7 +87,7 @@ static inline void pixel_avg_w16( uint8_t *dst,  int i_dst,
                                   int i_height )
 {
     int y;
-    vector_u8_t src1v, src2v;
+    vec_u8_t src1v, src2v;
     for( y = 0; y < i_height; y++ )
     {
         LOAD_16( src1, src1v );
@@ -119,8 +119,8 @@ MC_COPY( mc_copy_w8,  8  )
 MC_COPY( mc_copy_w16, 16 )
 
 /* TAP_FILTER:
-   a is source (vector_s16_t [6])
-   b is a temporary vector_s16_t
+   a is source (vec_s16_t [6])
+   b is a temporary vec_s16_t
    c is the result
 
    c   = a[0] + a[5] - 5 * ( a[1] + a[4] ) + 20 * ( a[2] + a[3] );
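
For reference, the 6-tap filter that TAP_FILTER vectorizes, sketched in
scalar C (tap_filter and tap_to_pixel are hypothetical helpers, not part
of this commit):

    #include <stdint.h>

    /* c = a[0] + a[5] - 5*(a[1] + a[4]) + 20*(a[2] + a[3]), on six
     * neighbouring samples widened to 16 bits. */
    static inline int tap_filter( const int16_t a[6] )
    {
        return a[0] + a[5] - 5 * ( a[1] + a[4] ) + 20 * ( a[2] + a[3] );
    }

    /* Half-pel output: round, shift by 5 and clamp to [0,255]. */
    static inline uint8_t tap_to_pixel( int c )
    {
        c = ( c + 16 ) >> 5;
        return c < 0 ? 0 : c > 255 ? 255 : (uint8_t)c;
    }
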
@@ -167,11 +167,11 @@ static inline void mc_hh_w8( uint8_t *src, int i_src,
     DECLARE_ALIGNED( int16_t, tmp[8], 16 );
 
     LOAD_ZERO;
-    vector_u8_t    loadv;
-    vector_s16_t   srcv[6];
-    vector_u8_t  * _srcv = (vector_u8_t*) srcv;
-    vector_s16_t   dstv;
-    vector_s16_t   tmpv;
+    vec_u8_t    loadv;
+    vec_s16_t   srcv[6];
+    vec_u8_t  * _srcv = (vec_u8_t*) srcv;
+    vec_s16_t   dstv;
+    vec_s16_t   tmpv;
 
     for( y = 0; y < i_height; y++ )
     {
@@ -179,9 +179,9 @@ static inline void mc_hh_w8( uint8_t *src, int i_src,
 
         for( x = 0; x < 6; x++ )
         {
-            _srcv[x] = vec_perm( loadv, zero_u8,
+            _srcv[x] = vec_perm( loadv, zero_u8v,
                                  vec_lvsl( 0, (int*) x ) );
-            CONVERT_U8_TO_S16( srcv[x] );
+            CONVERT_U8_TO_S16( srcv[x], srcv[x] );
         }
 
         TAP_FILTER( srcv, tmpv, dstv );
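
With the new two-operand CONVERT_U8_TO_S16 (see the ppccommon.h changes
below), the call above expands roughly to:

    /* Widen the first eight unsigned bytes of srcv[x] to signed shorts
     * by merging them with a zero byte vector. */
    srcv[x] = (vec_s16_t) vec_mergeh( zero_u8v, (vec_u8_t) srcv[x] );
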
@@ -226,10 +226,10 @@ static inline void mc_hv_w8( uint8_t *src, int i_src,
     DECLARE_ALIGNED( int16_t, tmp[8], 16 );
 
     LOAD_ZERO;
-    vector_s16_t   srcv[6];
-    vector_u8_t  * _srcv = (vector_u8_t*) srcv;
-    vector_s16_t   dstv;
-    vector_s16_t   tmpv;
+    vec_s16_t   srcv[6];
+    vec_u8_t  * _srcv = (vec_u8_t*) srcv;
+    vec_s16_t   dstv;
+    vec_s16_t   tmpv;
 
     for( y = 0; y < i_height; y++ )
     {
@@ -240,14 +240,14 @@ static inline void mc_hv_w8( uint8_t *src, int i_src,
                 srcv[x] = srcv[x+1];
             }
             LOAD_8( &src[3*i_src], _srcv[5] );
-            CONVERT_U8_TO_S16( srcv[5] );
+            CONVERT_U8_TO_S16( srcv[5], srcv[5] );
         }
         else
         {
             for( x = 0; x < 6; x++ )
             {
                 LOAD_8( &src[(x-2)*i_src], _srcv[x] );
-                CONVERT_U8_TO_S16( srcv[x] );
+                CONVERT_U8_TO_S16( srcv[x], srcv[x] );
             }
         }
 
@@ -787,8 +787,94 @@ uint8_t *get_ref_altivec( uint8_t *src[4], int i_src_stride,
     }
 }
 
+static void mc_chroma_altivec( uint8_t *src, int i_src_stride,
+                               uint8_t *dst, int i_dst_stride,
+                               int mvx, int mvy,
+                               int i_width, int i_height )
+{
+    uint8_t *srcp;
+    int x, y;
+    int d8x = mvx & 0x07;
+    int d8y = mvy & 0x07;
+
+    DECLARE_ALIGNED( uint16_t, coeff[4], 16 );
+    coeff[0] = (8-d8x)*(8-d8y);
+    coeff[1] = d8x    *(8-d8y);
+    coeff[2] = (8-d8x)*d8y;
+    coeff[3] = d8x    *d8y;
+
+    src  += (mvy >> 3) * i_src_stride + (mvx >> 3);
+    srcp  = &src[i_src_stride];
+
+    if( i_width < 8 )
+    {
+        /* TODO: optimize */
+        for( y = 0; y < i_height; y++ )
+        {
+            for( x = 0; x < i_width; x++ )
+            {
+                dst[x] = ( coeff[0]*src[x]  + coeff[1]*src[x+1] +
+                           coeff[2]*srcp[x] + coeff[3]*srcp[x+1] + 32 ) >> 6;
+            }
+            dst  += i_dst_stride;
+
+            src   = srcp;
+            srcp += i_src_stride;
+        }
+        return;
+    }
+    
+    /* We now assume that i_width == 8 */
+    LOAD_ZERO;
+    vec_u16_t   coeffv[4];
+    vec_u16_t   k32v;
+    vec_u8_t    srcv_8[4];
+    vec_u16_t   srcv_16[4];
+    vec_u8_t    dstv_8;
+    vec_u16_t   dstv_16;
+    vec_u8_t    permv;
+    vec_u16_t   shiftv;
+    
+    coeffv[0] = vec_ld( 0, coeff );
+    coeffv[3] = vec_splat( coeffv[0], 3 );
+    coeffv[2] = vec_splat( coeffv[0], 2 );
+    coeffv[1] = vec_splat( coeffv[0], 1 );
+    coeffv[0] = vec_splat( coeffv[0], 0 );
+    k32v      = vec_sl( vec_splat_u16( 1 ), vec_splat_u16( 5 ) );
+    permv     = vec_lvsl( 0, (uint8_t *) 1 );
+    shiftv    = vec_splat_u16( 6 );
+
+    LOAD_16( src, srcv_8[2] );
+    srcv_8[3] = vec_perm( srcv_8[2], srcv_8[2], permv );
+
+    for( y = 0; y < i_height; y++ )
+    {
+        int i;
+
+        srcv_8[0] = srcv_8[2];
+        srcv_8[1] = srcv_8[3];
+        LOAD_16( srcp, srcv_8[2] );
+        srcv_8[3] = vec_perm( srcv_8[2], srcv_8[2], permv );
+
+        dstv_16 = k32v;
+        for( i = 0; i < 4; i++ )
+        {
+            CONVERT_U8_TO_U16( srcv_8[i], srcv_16[i] );
+            srcv_16[i] = vec_mladd( coeffv[i], srcv_16[i], zero_u16v );
+            dstv_16 = vec_add( dstv_16, srcv_16[i] );
+        }
+        dstv_16 = vec_sr( dstv_16, shiftv );
+        CONVERT_U16_TO_U8( dstv_16, dstv_8 );
+        STORE_8( dstv_8, dst );
+
+        dst  += i_dst_stride;
+        srcp += i_src_stride;
+    }
+}
+
 void x264_mc_altivec_init( x264_mc_functions_t *pf )
 {
     pf->mc_luma   = mc_luma_altivec;
     pf->get_ref   = get_ref_altivec;
+    pf->mc_chroma = mc_chroma_altivec;
 }
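
The four bilinear weights computed at the top of mc_chroma_altivec always
sum to 64, which is why the accumulated result is rounded with +32 and
shifted right by 6. A scalar sanity check of the weight math (hypothetical
snippet, not part of the commit):

    #include <assert.h>

    /* (8-dx)(8-dy) + dx(8-dy) + (8-dx)dy + dx*dy = 64 for any eighth-pel
     * offset; e.g. d8x = 2, d8y = 3 gives 30 + 10 + 18 + 6 = 64. */
    static void check_chroma_weights( int mvx, int mvy )
    {
        int d8x = mvx & 0x07, d8y = mvy & 0x07;
        int coeff[4] = { (8-d8x)*(8-d8y), d8x*(8-d8y),
                         (8-d8x)*d8y,     d8x*d8y };
        assert( coeff[0] + coeff[1] + coeff[2] + coeff[3] == 64 );
    }
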
index de822e7fc2cf7309e59ba07994458c0827ec56ce..a2341fd3afb714722940f6a1aae1349871f7f583 100644 (file)
--- a/common/ppc/pixel.c
+++ b/common/ppc/pixel.c
@@ -44,21 +44,21 @@ static int name( uint8_t *pix1, int i_pix1,            \
     int y;                                             \
     DECLARE_ALIGNED( int, sum, 16 );                   \
                                                        \
-    LOAD_ZERO;                                         \
-    vector_u8_t  pix1v, pix2v;                         \
-    vector_s32_t sumv = zero_s32;                      \
+    LOAD_ZERO;                                         \
+    vec_u8_t  pix1v, pix2v;                            \
+    vec_s32_t sumv = zero_s32v;                        \
     for( y = 0; y < ly; y++ )                          \
     {                                                  \
         LOAD_##lx( pix1, pix1v );                      \
         LOAD_##lx( pix2, pix2v );                      \
-        sumv = (vector_s32_t) vec_sum4s(               \
+        sumv = (vec_s32_t) vec_sum4s(                  \
                    vec_sub( vec_max( pix1v, pix2v ),   \
                             vec_min( pix1v, pix2v ) ), \
-                   (vector_u32_t) sumv );              \
+                   (vec_u32_t) sumv );                 \
         pix1 += i_pix1;                                \
         pix2 += i_pix2;                                \
     }                                                  \
-    sumv = vec_sum##a( sumv, zero_s32 );               \
+    sumv = vec_sum##a( sumv, zero_s32v );              \
     vec_ste( vec_splat( sumv, b ), 0, &sum );          \
     return sum;                                        \
 }
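
AltiVec has no absolute-difference instruction for unsigned bytes, so the
macro computes |pix1 - pix2| as vec_max minus vec_min and lets vec_sum4s
accumulate the byte differences into 32-bit lanes. The scalar equivalent
(hypothetical helpers, not part of the commit):

    #include <stdint.h>

    /* |a - b| for unsigned bytes without widening: max(a,b) - min(a,b). */
    static inline uint8_t abs_diff_u8( uint8_t a, uint8_t b )
    {
        return ( a > b ? a : b ) - ( a < b ? a : b );
    }

    static int sad_scalar( const uint8_t *p1, int i1,
                           const uint8_t *p2, int i2, int w, int h )
    {
        int x, y, sum = 0;
        for( y = 0; y < h; y++, p1 += i1, p2 += i2 )
            for( x = 0; x < w; x++ )
                sum += abs_diff_u8( p1[x], p2[x] );
        return sum;
    }
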
@@ -76,12 +76,12 @@ static inline int pixel_satd_8x8_altivec( uint8_t *pix1, int i_pix1,
     DECLARE_ALIGNED( int, i_satd, 16 );
 
     LOAD_ZERO;
-    vector_s32_t satdv = zero_s32;
-    vector_u8_t  pix1u8v, pix2u8v;
-    vector_s16_t pix1s16v, pix2s16v;
-    vector_s16_t diffv[8];
-    vector_s16_t tmpv[8];
-    vector_s16_t s01v, s23v, d01v, d23v;
+    vec_s32_t satdv = zero_s32v;
+    vec_u8_t  pix1u8v, pix2u8v;
+    vec_s16_t pix1s16v, pix2s16v;
+    vec_s16_t diffv[8];
+    vec_s16_t tmpv[8];
+    vec_s16_t s01v, s23v, d01v, d23v;
 
     /* Diff 8x8 */
     for( i = 0; i < 8; i++ )
@@ -90,8 +90,8 @@ static inline int pixel_satd_8x8_altivec( uint8_t *pix1, int i_pix1,
         LOAD_8( pix2, pix2u8v );
 
         /* u8 -> s16 conversion */
-        pix1s16v = (vector_s16_t) vec_mergeh( zero_u8, pix1u8v );
-        pix2s16v = (vector_s16_t) vec_mergeh( zero_u8, pix2u8v );
+        CONVERT_U8_TO_S16( pix1u8v, pix1s16v );
+        CONVERT_U8_TO_S16( pix2u8v, pix2s16v );
 
         diffv[i] = vec_sub( pix1s16v, pix2s16v );
 
@@ -115,7 +115,7 @@ static inline int pixel_satd_8x8_altivec( uint8_t *pix1, int i_pix1,
     {
         satdv = vec_sum4s( vec_abs( tmpv[i] ), satdv );
     }
-    satdv = vec_sums( satdv, zero_s32 );
+    satdv = vec_sums( satdv, zero_s32v );
 
     /* Done */
     vec_ste( vec_splat( satdv, 3 ), 0, &i_satd );
@@ -158,12 +158,12 @@ static inline int pixel_satd_4x4_altivec( uint8_t *pix1, int i_pix1,
     DECLARE_ALIGNED( int, i_satd, 16 );
 
     LOAD_ZERO;
-    vector_s32_t satdv = zero_s32;
-    vector_u8_t  pix1u8v, pix2u8v;
-    vector_s16_t pix1s16v, pix2s16v;
-    vector_s16_t diffv[4];
-    vector_s16_t tmpv[4];
-    vector_s16_t s01v, s23v, d01v, d23v;
+    vec_s32_t satdv = zero_s32v;
+    vec_u8_t  pix1u8v, pix2u8v;
+    vec_s16_t pix1s16v, pix2s16v;
+    vec_s16_t diffv[4];
+    vec_s16_t tmpv[4];
+    vec_s16_t s01v, s23v, d01v, d23v;
 
     /* Diff 4x8 */
     for( i = 0; i < 4; i++ )
@@ -171,9 +171,8 @@ static inline int pixel_satd_4x4_altivec( uint8_t *pix1, int i_pix1,
         LOAD_4( pix1, pix1u8v );
         LOAD_4( pix2, pix2u8v );
 
-        /* u8 -> s16 conversion */
-        pix1s16v = (vector_s16_t) vec_mergeh( zero_u8, pix1u8v );
-        pix2s16v = (vector_s16_t) vec_mergeh( zero_u8, pix2u8v );
+        CONVERT_U8_TO_S16( pix1u8v, pix1s16v );
+        CONVERT_U8_TO_S16( pix2u8v, pix2s16v );
 
         diffv[i] = vec_sub( pix1s16v, pix2s16v );
 
@@ -195,7 +194,7 @@ static inline int pixel_satd_4x4_altivec( uint8_t *pix1, int i_pix1,
     {
         satdv = vec_sum4s( vec_abs( tmpv[i] ), satdv );
     }
-    satdv = vec_sum2s( satdv, zero_s32 );
+    satdv = vec_sum2s( satdv, zero_s32v );
 
     /* Done */
     vec_ste( vec_splat( satdv, 1 ), 0, &i_satd );
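
Both SATD routines follow the same pipeline: subtract the blocks, apply a
Hadamard transform along each axis (HADAMAR plus a transpose in between),
then sum the absolute values of the transformed coefficients. Note the two
different reductions: vec_sums leaves the full sum in element 3, while
vec_sum2s leaves pairwise sums in elements 1 and 3, hence the different
vec_splat indices. A scalar 4-point butterfly, the analogue of HADAMAR
(hypothetical helper, not part of the commit):

    #include <stdint.h>

    /* Sums and differences of pairs, then of the pair results. */
    static void hadamard4( int16_t d[4], const int16_t a[4] )
    {
        int16_t s01 = a[0] + a[1], s23 = a[2] + a[3];
        int16_t d01 = a[0] - a[1], d23 = a[2] - a[3];
        d[0] = s01 + s23;
        d[1] = s01 - s23;
        d[2] = d01 + d23;
        d[3] = d01 - d23;
    }
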
index dd8f23876bad5db6df2319a54b385458beaa3021..9919f7b2a1f36e10ff7d25312ec076aa641158d4 100644 (file)
--- a/common/ppc/ppccommon.h
+++ b/common/ppc/ppccommon.h
  * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111, USA.
  *****************************************************************************/
 
-/* Handy */
-#define vector_u8_t  vector unsigned char
-#define vector_s16_t vector signed short
-#define vector_u32_t vector unsigned int
-#define vector_s32_t vector signed int
+/***********************************************************************
+ * Vector types
+ **********************************************************************/
+#define vec_u8_t  vector unsigned char
+#define vec_s8_t  vector signed char
+#define vec_u16_t vector unsigned short
+#define vec_s16_t vector signed short
+#define vec_u32_t vector unsigned int
+#define vec_s32_t vector signed int
 
-#define LOAD_ZERO    vector_s32_t zero = vec_splat_s32( 0 )
-#define zero_u8      (vector_u8_t)  zero
-#define zero_s16     (vector_s16_t) zero
-#define zero_s32     (vector_s32_t) zero
+/***********************************************************************
+ * Null vector
+ **********************************************************************/
+#define LOAD_ZERO vec_u8_t zerov = vec_splat_u8( 0 )
 
-#define CONVERT_U8_TO_S16( a ) \
-    a = (vector_s16_t) vec_mergeh( zero_u8, (vector_u8_t) a )
+#define zero_u8v  (vec_u8_t)  zerov
+#define zero_s8v  (vec_s8_t)  zerov
+#define zero_u16v (vec_u16_t) zerov
+#define zero_s16v (vec_s16_t) zerov
+#define zero_u32v (vec_u32_t) zerov
+#define zero_s32v (vec_s32_t) zerov
 
-/* Macros to load aligned or unaligned data without risking buffer
-   overflows. */
+/***********************************************************************
+ * CONVERT_*
+ **********************************************************************/
+#define CONVERT_U8_TO_U16( s, d ) \
+    d = (vec_u16_t) vec_mergeh( zero_u8v, (vec_u8_t) s )
+#define CONVERT_U8_TO_S16( s, d ) \
+    d = (vec_s16_t) vec_mergeh( zero_u8v, (vec_u8_t) s )
+#define CONVERT_U16_TO_U8( s, d ) \
+    d = (vec_u8_t) vec_pack( (vec_u16_t) s, zero_u16v )
+#define CONVERT_S16_TO_U8( s, d ) \
+    d = (vec_u8_t) vec_pack( (vec_s16_t) s, zero_s16v )
+
+/***********************************************************************
+ * LOAD_16
+ ***********************************************************************
+ * p: uint8_t *
+ * v: vec_u8_t
+ * Loads 16 bytes from p into v
+ **********************************************************************/
 #define LOAD_16( p, v )                                \
     if( (long) p & 0xF )                               \
     {                                                  \
         v = vec_perm( vec_ld( 0, p ), vec_ld( 16, p ), \
                       vec_lvsl( 0, p ) );              \
     }                                                  \
     else                                               \
     {                                                  \
         v = vec_ld( 0, p );                            \
     }
 
-#define LOAD_8( p, v )                                             \
-    if( !( (long) p & 0xF ) )                                      \
-    {                                                              \
-        v = vec_ld( 0, p );                                        \
-    }                                                              \
-    else if( ( (long) p & 0xF ) < 9 )                              \
-    {                                                              \
-        v = vec_perm( vec_ld( 0, p ), (vector unsigned char) zero, \
-                      vec_lvsl( 0, p ) );                          \
-    }                                                              \
-    else                                                           \
-    {                                                              \
-        v = vec_perm( vec_ld( 0, p ), vec_ld( 16, p ),             \
-                      vec_lvsl( 0, p ) );                          \
+/***********************************************************************
+ * LOAD_8
+ ***********************************************************************
+ * p: uint8_t *
+ * v: vec_u8_t
+ * Loads 8 bytes from p into the first half of v
+ **********************************************************************/
+#define LOAD_8( p, v )                                 \
+    if( !( (long) p & 0xF ) )                          \
+    {                                                  \
+        v = vec_ld( 0, p );                            \
+    }                                                  \
+    else if( ( (long) p & 0xF ) < 9 )                  \
+    {                                                  \
+        v = vec_perm( vec_ld( 0, p ), zero_u8v,        \
+                      vec_lvsl( 0, p ) );              \
+    }                                                  \
+    else                                               \
+    {                                                  \
+        v = vec_perm( vec_ld( 0, p ), vec_ld( 16, p ), \
+                      vec_lvsl( 0, p ) );              \
+    }
+
+/***********************************************************************
+ * LOAD_4
+ ***********************************************************************
+ * p: uint8_t *
+ * v: vec_u8_t
+ * Loads 4 bytes from p into the first quarter of v
+ **********************************************************************/
+#define LOAD_4( p, v )                                 \
+    if( !( (long) p & 0xF ) )                          \
+    {                                                  \
+        v = vec_ld( 0, p );                            \
+    }                                                  \
+    else if( ( (long) p & 0xF ) < 13 )                 \
+    {                                                  \
+        v = vec_perm( vec_ld( 0, p ), zero_u8v,        \
+                      vec_lvsl( 0, p ) );              \
+    }                                                  \
+    else                                               \
+    {                                                  \
+        v = vec_perm( vec_ld( 0, p ), vec_ld( 16, p ), \
+                      vec_lvsl( 0, p ) );              \
     }
 
-#define LOAD_4( p, v )                                             \
-    if( !( (long) p & 0xF ) )                                      \
-    {                                                              \
-        v = vec_ld( 0, p );                                        \
-    }                                                              \
-    else if( ( (long) p & 0xF ) < 13 )                             \
-    {                                                              \
-        v = vec_perm( vec_ld( 0, p ), (vector unsigned char) zero, \
-                      vec_lvsl( 0, p ) );                          \
-    }                                                              \
-    else                                                           \
-    {                                                              \
-        v = vec_perm( vec_ld( 0, p ), vec_ld( 16, p ),             \
-                      vec_lvsl( 0, p ) );                          \
+/***********************************************************************
+ * STORE_16
+ ***********************************************************************
+ * v: vec_u8_t
+ * p: uint8_t *
+ * Stores the 16 bytes from v at address p
+ **********************************************************************/
+#define STORE_16( v, p )                  \
+    if( (long) p & 0xF )                  \
+    {                                     \
+        vec_u8_t hv, lv, tmp1, tmp2;      \
+        hv   = vec_ld( 0, p );            \
+        lv   = vec_ld( 16, p );           \
+        tmp2 = vec_lvsl( 0, p );          \
+        tmp1 = vec_perm( lv, hv, tmp2 );  \
+        tmp2 = vec_lvsr( 0, p );          \
+        hv   = vec_perm( tmp1, v, tmp2 ); \
+        lv   = vec_perm( v, tmp1, tmp2 ); \
+        vec_st( lv, 16, p );              \
+        vec_st( hv, 0, p );               \
+    }                                     \
+    else                                  \
+    {                                     \
+        vec_st( v, 0, p );                \
     }
 
-/* Store aligned or unaligned data */
-#define STORE_16( v, p )                              \
-    if( (long) p & 0xF )                              \
-    {                                                 \
-        vector unsigned char tmp1, tmp2;              \
-        vector unsigned char align, mask;             \
-        tmp1 = vec_ld( 0, p );                        \
-        tmp2 = vec_ld( 16, p );                       \
-        align = vec_lvsr( 0, p );                     \
-        mask = vec_perm( (vector unsigned char) {0},  \
-                         (vector unsigned char) {1},  \
-                         align);                      \
-        v = vec_perm( v, v, align);                   \
-        tmp1 = vec_sel( tmp1, v, mask );              \
-        tmp2 = vec_sel( v, tmp2, mask );              \
-        vec_st( tmp1, 0, p );                         \
-        vec_st( tmp2, 16, p );                        \
-    }                                                 \
-    else                                              \
-    {                                                 \
-        vec_st( v, 0, p );                            \
+/* FIXME We can do better than that */
+#define STORE_8( v, p ) \
+    { \
+        DECLARE_ALIGNED( uint8_t, _p[16], 16 ); \
+        vec_st( v, 0, _p ); \
+        memcpy( p, _p, 8 ); \
     }
 
-/* Transpose 8x8 (vector_s16_t [8]) */
+/* Transpose 8x8 (vec_s16_t [8]) */
 #define TRANSPOSE8x8( a, b )           \
     b[0] = vec_mergeh( a[0], a[4] ); \
     b[1] = vec_mergel( a[0], a[4] ); \
     b[6] = vec_mergeh( a[3], a[7] ); \
     b[7] = vec_mergel( a[3], a[7] );
 
-/* Transpose 4x4 (vector_s16_t [4]) */
+/* Transpose 4x4 (vec_s16_t [4]) */
 #define TRANSPOSE4x4( a, b ) \
-    (b)[0] = vec_mergeh( (a)[0], zero_s16 ); \
-    (b)[1] = vec_mergeh( (a)[1], zero_s16 ); \
-    (b)[2] = vec_mergeh( (a)[2], zero_s16 ); \
-    (b)[3] = vec_mergeh( (a)[3], zero_s16 ); \
+    (b)[0] = vec_mergeh( (a)[0], zero_s16v ); \
+    (b)[1] = vec_mergeh( (a)[1], zero_s16v ); \
+    (b)[2] = vec_mergeh( (a)[2], zero_s16v ); \
+    (b)[3] = vec_mergeh( (a)[3], zero_s16v ); \
     (a)[0] = vec_mergeh( (b)[0], (b)[2] );   \
     (a)[1] = vec_mergel( (b)[0], (b)[2] );   \
     (a)[2] = vec_mergeh( (b)[1], (b)[3] );   \
     (b)[2] = vec_mergeh( (a)[1], (a)[3] );   \
     (b)[3] = vec_mergel( (a)[1], (a)[3] );
 
-/* Hadamar (vector_s16_t [4]) */
+/* Hadamar (vec_s16_t [4]) */
 #define HADAMAR( a, b ) \
     s01v   = vec_add( (a)[0], (a)[1] ); \
     s23v   = vec_add( (a)[2], (a)[3] ); \
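
The LOAD_* macros realign misaligned data with the usual AltiVec idiom:
vec_lvsl derives a permute mask from the pointer's low four bits and
vec_perm shifts one or two 16-byte loads into place. LOAD_8 and LOAD_4
read the second 16-byte block only when the data actually straddles it
(offset > 8, resp. > 12), so they never touch bytes past the buffer. A
minimal usage sketch, assuming this header and its DECLARE_ALIGNED
dependency are available:

    #include <string.h>
    #include <stdint.h>

    /* Copy 8 bytes between possibly misaligned buffers. */
    static void copy8( uint8_t *dst, uint8_t *src )
    {
        LOAD_ZERO;   /* declares zerov, required by LOAD_8 */
        vec_u8_t v;
        LOAD_8( src, v );
        STORE_8( v, dst );
    }
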
index f7bb9ce611fdbeb4fc4b004c852234fd3ad6d610..5f375e13e999419eb4db32f8e7c735c8a1652923 100755 (executable)
--- a/configure
+++ b/configure
@@ -35,6 +35,7 @@ case "$UNAMES" in
     ;;
   Darwin)
     SYS="MACOSX"
+    CFLAGS="$CFLAGS -falign-loops=16"
     LDFLAGS="$LDFLAGS -lm -lmx"
     ;;
   FreeBSD)
index b7f0552803a75d7c50ab22524dbf2b8fd82bc2e2..58a403cd565e5cc67f23ea5ddfe2ea75b841e360 100644 (file)
--- a/testing/checkasm.c
+++ b/testing/checkasm.c
@@ -11,6 +11,7 @@
 #endif
 #ifdef ARCH_PPC
 #include "common/ppc/pixel.h"
+#include "common/ppc/mc.h"
 #endif
 
 /* buf1, buf2: initialised to random data; tests shouldn't write into them */
@@ -262,12 +263,11 @@ static int check_mc()
 #define MC_TEST_LUMA( w, h ) \
         if( mc_a.mc_luma ) \
         { \
-            memset(buf1, 0xCD, 1024); \
             memset(buf3, 0xCD, 1024); \
             memset(buf4, 0xCD, 1024); \
             mc_c.mc_luma( src2, 32, dst1, 16, dx, dy, w, h );     \
             mc_a.mc_luma( src2, 32, dst2, 16, dx, dy, w, h );   \
-            if( memcmp( dst1, dst2, 16*16 ) )               \
+            if( memcmp( buf3, buf4, 1024 ) )               \
             { \
                 fprintf( stderr, "mc_luma[mv(%d,%d) %2dx%-2d]     [FAILED]\n", dx, dy, w, h );   \
                 ok[0] = 0; \
@@ -277,11 +277,11 @@ static int check_mc()
 #define MC_TEST_CHROMA( w, h ) \
         if( mc_a.mc_chroma ) \
         { \
-            memset(dst1, 0xCD, (h) * 16); \
+            memset(buf3, 0xCD, 1024); \
+            memset(buf4, 0xCD, 1024); \
             mc_c.mc_chroma( src, 32, dst1, 16, dx, dy, w, h );     \
-            memset(dst2, 0xCD, (h) * 16); \
             mc_a.mc_chroma( src, 32, dst2, 16, dx, dy, w, h );   \
-            if( memcmp( dst1, dst2, 16*16 ) )               \
+            if( memcmp( buf3, buf4, 1024 ) )               \
             { \
                 fprintf( stderr, "mc_chroma[mv(%d,%d) %2dx%-2d]     [FAILED]\n", dx, dy, w, h );   \
                 ok[1] = 0; \
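
Pre-filling both 1024-byte buffers with 0xCD and comparing them in full
(instead of just the 16x16 destination region) turns the memcmp into an
out-of-bounds write detector: any stray store by the optimized routine now
shows up as a mismatch in the canary padding. The pattern in miniature
(hypothetical sketch, not part of the commit):

    #include <string.h>
    #include <stdint.h>

    /* Run the reference and optimized routines into canary-filled
     * buffers, then compare the WHOLE buffers so out-of-bounds writes
     * are caught as well as wrong pixels. */
    static int outputs_match( uint8_t *buf_c, uint8_t *buf_asm,
                              void (*f_c)(uint8_t *dst),
                              void (*f_asm)(uint8_t *dst) )
    {
        memset( buf_c,   0xCD, 1024 );
        memset( buf_asm, 0xCD, 1024 );
        f_c( buf_c );
        f_asm( buf_asm );
        return !memcmp( buf_c, buf_asm, 1024 );
    }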