Factor mallocs out of hpel, ssim, and esa: use one preallocated h->scratch_buffer instead of per-call allocations.

author Steven Walters <kemuri9@gmail.com>
Mon, 29 Dec 2008 05:14:26 +0000 (05:14 +0000)

committer Loren Merritt <pengvado@akuvian.org>
Wed, 31 Dec 2008 03:47:18 +0000 (03:47 +0000)

diff --git a/common/common.h b/common/common.h

index 1668a6300626e7c5cefe53093e4dd315ad9e5428..4e1782a28ed078c82841994ef84967de7ad0e6ce 100644 (file)
--- a/common/common.h
+++ b/common/common.h
@@ -605,6 +605,8 @@ struct x264_t
  
      } stat;
  
+    void *scratch_buffer; /* for any temporary storage that doesn't want repeated malloc */
+
      /* CPU functions dependents */
      x264_predict_t      predict_16x16[4+3];
      x264_predict_t      predict_8x8c[4+3];
diff --git a/common/macroblock.c b/common/macroblock.c

index f597c57ab50e1d6e93f29a5ec0c026824a9b03fd..94a5d4fca3ed457ea9b59bb948db6d02165fe96c 100644 (file)
--- a/common/macroblock.c
+++ b/common/macroblock.c
@@ -23,6 +23,7 @@
   *****************************************************************************/
  
  #include "common.h"
+#include "encoder/me.h"
  
  void x264_mb_predict_mv( x264_t *h, int i_list, int idx, int i_width, int16_t mvp[2] )
  {
@@ -838,6 +839,13 @@ int x264_macroblock_cache_init( x264_t *h )
      h->mb.i_neighbour4[15] =
      h->mb.i_neighbour8[3] = MB_LEFT|MB_TOP|MB_TOPLEFT;
  
+    int buf_hpel = (h->param.i_width+40) * sizeof(int16_t);
+    int buf_ssim = h->param.analyse.b_ssim * 8 * (h->param.i_width/4+3) * sizeof(int);
+    int me_range = X264_MIN(h->param.analyse.i_me_range, h->param.analyse.i_mv_range);
+    int buf_tesa = (h->param.analyse.i_me_method >= X264_ME_ESA) *
+        ((me_range*2+18) * sizeof(int16_t) + (me_range+1) * (me_range+1) * 4 * sizeof(mvsad_t));
+    CHECKED_MALLOC( h->scratch_buffer, X264_MAX3( buf_hpel, buf_ssim, buf_tesa ) );
+
      return 0;
  fail: return -1;
  }
@@ -863,6 +871,7 @@ void x264_macroblock_cache_end( x264_t *h )
      x264_free( h->mb.skipbp );
      x264_free( h->mb.cbp );
      x264_free( h->mb.qp );
+    x264_free( h->scratch_buffer );
  }
  void x264_macroblock_slice_init( x264_t *h )
  {
diff --git a/common/mc.c b/common/mc.c

index 331e6265d1caab27bb94ba90d182f50a8cb54f14..f9818b2a48d8cb89f57989507a37c6ce5e181fc5 100644 (file)
--- a/common/mc.c
+++ b/common/mc.c
@@ -132,9 +132,8 @@ static void mc_copy( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_str
  
  #define TAPFILTER(pix, d) ((pix)[x-2*d] + (pix)[x+3*d] - 5*((pix)[x-d] + (pix)[x+2*d]) + 20*((pix)[x] + (pix)[x+d]))
  static void hpel_filter( uint8_t *dsth, uint8_t *dstv, uint8_t *dstc, uint8_t *src,
-                         int stride, int width, int height )
+                         int stride, int width, int height, int16_t *buf )
  {
-    int16_t *buf = x264_malloc((width+5)*sizeof(int16_t));
      int x, y;
      for( y=0; y<height; y++ )
      {
@@ -153,7 +152,6 @@ static void hpel_filter( uint8_t *dsth, uint8_t *dstv, uint8_t *dstc, uint8_t *s
          dstc += stride;
          src += stride;
      }
-    x264_free(buf);
  }
  
  static const int hpel_ref0[16] = {0,1,1,1,0,1,1,1,2,3,3,3,0,1,1,1};
@@ -423,7 +421,8 @@ void x264_frame_filter( x264_t *h, x264_frame_t *frame, int mb_y, int b_end )
              frame->filtered[2] + offs,
              frame->filtered[3] + offs,
              frame->plane[0] + offs,
-            stride, width + 16, height - start );
+            stride, width + 16, height - start,
+            h->scratch_buffer );
      }
  
      /* generate integral image:
diff --git a/common/mc.h b/common/mc.h

index 884d01659772d6494e5d57f94fedfeca484ab748..594940f82a157f776c8eabeee2761f1e0b69b26e 100644 (file)
--- a/common/mc.h
+++ b/common/mc.h
@@ -55,7 +55,7 @@ typedef struct
                          uint8_t *src, int i_src, int w, int h);
  
      void (*hpel_filter)( uint8_t *dsth, uint8_t *dstv, uint8_t *dstc, uint8_t *src,
-                         int i_stride, int i_width, int i_height );
+                         int i_stride, int i_width, int i_height, int16_t *buf );
  
      /* prefetch the next few macroblocks of fenc or fdec */
      void (*prefetch_fenc)( uint8_t *pix_y, int stride_y,
diff --git a/common/pixel.c b/common/pixel.c

index 1c37b3178b8c1e8b71ecd4a64d1c4075aa49a3e4..f97f1b26744c3055eec6f813821c0e2c139c225f 100644 (file)
--- a/common/pixel.c
+++ b/common/pixel.c
@@ -488,12 +488,12 @@ static float ssim_end4( int sum0[5][4], int sum1[5][4], int width )
  float x264_pixel_ssim_wxh( x264_pixel_function_t *pf,
                             uint8_t *pix1, int stride1,
                             uint8_t *pix2, int stride2,
-                           int width, int height )
+                           int width, int height, void *buf )
  {
      int x, y, z;
      float ssim = 0.0;
-    int (*sum0)[4] = x264_malloc(4 * (width/4+3) * sizeof(int));
-    int (*sum1)[4] = x264_malloc(4 * (width/4+3) * sizeof(int));
+    int (*sum0)[4] = buf;
+    int (*sum1)[4] = sum0 + width/4+3;
      width >>= 2;
      height >>= 2;
      z = 0;
@@ -508,8 +508,6 @@ float x264_pixel_ssim_wxh( x264_pixel_function_t *pf,
          for( x = 0; x < width-1; x += 4 )
              ssim += pf->ssim_end4( sum0+x, sum1+x, X264_MIN(4,width-x-1) );
      }
-    x264_free(sum0);
-    x264_free(sum1);
      return ssim;
  }
  
diff --git a/common/pixel.h b/common/pixel.h

index 1a2cefd3f414af72c0a1806ced1e6fec72afdf27..a08879c95232f41800715ce11eede6a2eb09b251 100644 (file)
--- a/common/pixel.h
+++ b/common/pixel.h
@@ -104,6 +104,6 @@ typedef struct
  
  void x264_pixel_init( int cpu, x264_pixel_function_t *pixf );
  int64_t x264_pixel_ssd_wxh( x264_pixel_function_t *pf, uint8_t *pix1, int i_pix1, uint8_t *pix2, int i_pix2, int i_width, int i_height );
-float x264_pixel_ssim_wxh( x264_pixel_function_t *pf, uint8_t *pix1, int i_pix1, uint8_t *pix2, int i_pix2, int i_width, int i_height );
+float x264_pixel_ssim_wxh( x264_pixel_function_t *pf, uint8_t *pix1, int i_pix1, uint8_t *pix2, int i_pix2, int i_width, int i_height, void *buf );
  
  #endif
diff --git a/common/x86/mc-c.c b/common/x86/mc-c.c

index 56ca4c4e3bd47a8043482612ec71118e8157eac4..8fe507c857508db3300de247df581d3830bf00f8 100644 (file)
--- a/common/x86/mc-c.c
+++ b/common/x86/mc-c.c
@@ -202,16 +202,14 @@ void x264_hpel_filter_c_##cpuc( uint8_t *dst, int16_t *buf, int width );\
  void x264_hpel_filter_h_##cpuh( uint8_t *dst, uint8_t *src, int width );\
  void x264_sfence( void );\
  static void x264_hpel_filter_##cpu( uint8_t *dsth, uint8_t *dstv, uint8_t *dstc, uint8_t *src,\
-                             int stride, int width, int height )\
+                             int stride, int width, int height, int16_t *buf )\
  {\
-    int16_t *buf;\
      int realign = (long)src & (align-1);\
      src -= realign;\
      dstv -= realign;\
      dstc -= realign;\
      dsth -= realign;\
      width += realign;\
-    buf = x264_malloc((width+16)*sizeof(int16_t));\
      while( height-- )\
      {\
          x264_hpel_filter_v_##cpuv( dstv, src, buf+8, stride, width );\
@@ -223,14 +221,13 @@ static void x264_hpel_filter_##cpu( uint8_t *dsth, uint8_t *dstv, uint8_t *dstc,
          src  += stride;\
      }\
      x264_sfence();\
-    x264_free(buf);\
  }
  
  HPEL(8, mmxext, mmxext, mmxext, mmxext)
  HPEL(16, sse2_amd, mmxext, mmxext, sse2)
  #ifdef ARCH_X86_64
-void x264_hpel_filter_sse2( uint8_t *dsth, uint8_t *dstv, uint8_t *dstc, uint8_t *src, int stride, int width, int height );
-void x264_hpel_filter_ssse3( uint8_t *dsth, uint8_t *dstv, uint8_t *dstc, uint8_t *src, int stride, int width, int height );
+void x264_hpel_filter_sse2( uint8_t *dsth, uint8_t *dstv, uint8_t *dstc, uint8_t *src, int stride, int width, int height, int16_t *buf );
+void x264_hpel_filter_ssse3( uint8_t *dsth, uint8_t *dstv, uint8_t *dstc, uint8_t *src, int stride, int width, int height, int16_t *buf );
  #else
  HPEL(16, sse2, sse2, sse2, sse2)
  HPEL(16, ssse3, sse2, ssse3, ssse3)
diff --git a/encoder/encoder.c b/encoder/encoder.c

index bd7d4f4e46d53858f286c0d895c18c01eabd656f..ce8a1f8d66995c5aa7444707e3e721660ce8fda6 100644 (file)
--- a/encoder/encoder.c
+++ b/encoder/encoder.c
@@ -1049,7 +1049,7 @@ static void x264_fdec_filter_row( x264_t *h, int mb_y )
              x264_pixel_ssim_wxh( &h->pixf,
                  h->fdec->plane[0] + 2+min_y*h->fdec->i_stride[0], h->fdec->i_stride[0],
                  h->fenc->plane[0] + 2+min_y*h->fenc->i_stride[0], h->fenc->i_stride[0],
-                h->param.i_width-2, max_y-min_y );
+                h->param.i_width-2, max_y-min_y, h->scratch_buffer );
      }
  }
  
diff --git a/encoder/me.c b/encoder/me.c

index 1f40d48db2f4c8a8d93286a947fbe9a653709c90..815cf501d973e9459e7b4e934ec95732b01dcf2a 100644 (file)
--- a/encoder/me.c
+++ b/encoder/me.c
@@ -474,8 +474,7 @@ me_hex2:
              DECLARE_ALIGNED_16( int enc_dc[4] );
              int sad_size = i_pixel <= PIXEL_8x8 ? PIXEL_8x8 : PIXEL_4x4;
              int delta = x264_pixel_size[sad_size].w;
-            int16_t xs_buf[64];
-            int16_t *xs = width<=64 ? xs_buf : x264_malloc( (width+15)*sizeof(int16_t) );
+            int16_t *xs = h->scratch_buffer;
              int xn;
              uint16_t *cost_fpel_mvx = x264_cost_mv_fpel[h->mb.i_qp][-m->mvp[0]&3] + (-m->mvp[0]>>2);
  
@@ -492,11 +491,7 @@ me_hex2:
              if( h->mb.i_me_method == X264_ME_TESA )
              {
                  // ADS threshold, then SAD threshold, then keep the best few SADs, then SATD
-                typedef struct {
-                    int sad;
-                    int16_t mx, my;
-                } mvsad_t;
-                mvsad_t *mvsads = x264_malloc( width*(max_y-min_y+1)*sizeof(mvsad_t) );
+                mvsad_t *mvsads = (mvsad_t *)(xs + ((width+15)&~15));
                  int nmvsad = 0, limit;
                  int sad_thresh = i_me_range <= 16 ? 10 : i_me_range <= 24 ? 11 : 12;
                  int bsad = h->pixf.sad[i_pixel]( m->p_fenc[0], FENC_STRIDE, p_fref+bmy*stride+bmx, stride )
@@ -581,7 +576,6 @@ me_hex2:
                  }
                  for( i=0; i<nmvsad; i++ )
                      COST_MV( mvsads[i].mx, mvsads[i].my );
-                x264_free( mvsads );
              }
              else
              {
@@ -601,9 +595,6 @@ me_hex2:
                          COST_MV( min_x+xs[i], my );
                  }
              }
-
-            if( xs != xs_buf )
-                x264_free( xs );
  #endif
          }
          break;
diff --git a/encoder/me.h b/encoder/me.h

index eba4957a9a9865b24a47eeac2ce029ef455d4698..3d7a446ca45690c060a78a2552e86b77f423bee9 100644 (file)
--- a/encoder/me.h
+++ b/encoder/me.h
@@ -48,6 +48,11 @@ typedef struct
      DECLARE_ALIGNED_4( int16_t mv[2] );
  } DECLARE_ALIGNED_16( x264_me_t );
  
+typedef struct {
+    int sad;
+    int16_t mx, my;
+} mvsad_t;
+
  void x264_me_search_ref( x264_t *h, x264_me_t *m, int16_t (*mvc)[2], int i_mvc, int *p_fullpel_thresh );
  static inline void x264_me_search( x264_t *h, x264_me_t *m, int16_t (*mvc)[2], int i_mvc )
      { x264_me_search_ref( h, m, mvc, i_mvc, NULL ); }
diff --git a/tools/checkasm.c b/tools/checkasm.c

index 85658b77bae66643775f1e12daeecd04ccf2256d..d154941e82225c5fe5215f21ebfd5195fa6ae46a 100644 (file)
--- a/tools/checkasm.c
+++ b/tools/checkasm.c
@@ -407,8 +407,8 @@ static int check_pixel( int cpu_ref, int cpu_new )
          int sums[5][4] = {{0}};
          used_asm = ok = 1;
          x264_emms();
-        res_c = x264_pixel_ssim_wxh( &pixel_c,   buf1+2, 32, buf2+2, 32, 32, 28 );
-        res_a = x264_pixel_ssim_wxh( &pixel_asm, buf1+2, 32, buf2+2, 32, 32, 28 );
+        res_c = x264_pixel_ssim_wxh( &pixel_c,   buf1+2, 32, buf2+2, 32, 32, 28, buf3 );
+        res_a = x264_pixel_ssim_wxh( &pixel_asm, buf1+2, 32, buf2+2, 32, 32, 28, buf3 );
          if( fabs(res_c - res_a) > 1e-6 )
          {
              ok = 0;
@@ -792,12 +792,13 @@ static int check_mc( int cpu_ref, int cpu_new )
          uint8_t *src = buf1+8+2*64;
          uint8_t *dstc[3] = { buf3+8, buf3+8+16*64, buf3+8+32*64 };
          uint8_t *dsta[3] = { buf4+8, buf4+8+16*64, buf4+8+32*64 };
+        void *tmp = buf3+49*64;
          set_func_name( "hpel_filter" );
          ok = 1; used_asm = 1;
          memset( buf3, 0, 4096 );
          memset( buf4, 0, 4096 );
-        call_c( mc_c.hpel_filter, dstc[0], dstc[1], dstc[2], src, 64, 48, 10 );
-        call_a( mc_a.hpel_filter, dsta[0], dsta[1], dsta[2], src, 64, 48, 10 );
+        call_c( mc_c.hpel_filter, dstc[0], dstc[1], dstc[2], src, 64, 48, 10, tmp );
+        call_a( mc_a.hpel_filter, dsta[0], dsta[1], dsta[2], src, 64, 48, 10, tmp );
          for( i=0; i<3; i++ )
              for( j=0; j<10; j++ )
                  //FIXME ideally the first pixels would match too, but they aren't actually used
author	Steven Walters <kemuri9@gmail.com>
	Mon, 29 Dec 2008 05:14:26 +0000 (05:14 +0000)
committer	Loren Merritt <pengvado@akuvian.org>
	Wed, 31 Dec 2008 03:47:18 +0000 (03:47 +0000)
common/common.h: patch | blob | history
common/macroblock.c: patch | blob | history
common/mc.c: patch | blob | history
common/mc.h: patch | blob | history
common/pixel.c: patch | blob | history
common/pixel.h: patch | blob | history
common/x86/mc-c.c: patch | blob | history
encoder/encoder.c: patch | blob | history
encoder/me.c: patch | blob | history
encoder/me.h: patch | blob | history
tools/checkasm.c: patch | blob | history