Fix PADH alignment

author Anton Mitrofanov <BugMaster@narod.ru>

Thu, 21 Jan 2021 20:26:27 +0000 (23:26 +0300)

committer Anton Mitrofanov <BugMaster@narod.ru>

Tue, 26 Jan 2021 18:49:17 +0000 (21:49 +0300)
author Anton Mitrofanov <BugMaster@narod.ru>
Thu, 21 Jan 2021 20:26:27 +0000 (23:26 +0300)
committer Anton Mitrofanov <BugMaster@narod.ru>
Tue, 26 Jan 2021 18:49:17 +0000 (21:49 +0300)
diff --git a/common/frame.c b/common/frame.c

index b27efa05e1e9b1e5298958326886a0a3cc49819f..9d76b8cbd0edae3f48597c7ad5062474f9c8fd72 100644 (file)
--- a/common/frame.c
+++ b/common/frame.c
@@ -38,7 +38,7 @@ static int align_stride( int x, int align, int disalign )
  static int align_plane_size( int x, int disalign )
  {
      if( !(x&(disalign-1)) )
-        x += 128;
+        x += X264_MAX( 128, NATIVE_ALIGN ) / SIZEOF_PIXEL;
      return x;
  }
  
@@ -63,29 +63,28 @@ static x264_frame_t *frame_new( x264_t *h, int b_fdec )
      int i_mb_count = h->mb.i_mb_count;
      int i_stride, i_width, i_lines, luma_plane_count;
      int i_padv = PADV << PARAM_INTERLACED;
-    int align = 16;
+    int align = NATIVE_ALIGN / SIZEOF_PIXEL;
  #if ARCH_X86 || ARCH_X86_64
      if( h->param.cpu&X264_CPU_CACHELINE_64 || h->param.cpu&X264_CPU_AVX512 )
-        align = 64;
+        align = 64 / SIZEOF_PIXEL;
      else if( h->param.cpu&X264_CPU_CACHELINE_32 || h->param.cpu&X264_CPU_AVX )
-        align = 32;
+        align = 32 / SIZEOF_PIXEL;
+    else
+        align = 16 / SIZEOF_PIXEL;
  #endif
  #if ARCH_PPC
-    int disalign = 1<<9;
+    int disalign = (1<<9) / SIZEOF_PIXEL;
  #else
-    int disalign = 1<<10;
+    int disalign = (1<<10) / SIZEOF_PIXEL;
  #endif
  
-    /* ensure frame alignment after PADH is added */
-    int padh_align = X264_MAX( align - PADH * SIZEOF_PIXEL, 0 ) / SIZEOF_PIXEL;
-
      CHECKED_MALLOCZERO( frame, sizeof(x264_frame_t) );
      PREALLOC_INIT
  
      /* allocate frame data (+64 for extra data for me) */
      i_width  = h->mb.i_mb_width*16;
      i_lines  = h->mb.i_mb_height*16;
-    i_stride = align_stride( i_width + 2*PADH, align, disalign );
+    i_stride = align_stride( i_width + PADH2, align, disalign );
  
      if( i_csp == X264_CSP_NV12 || i_csp == X264_CSP_NV16 )
      {
@@ -123,7 +122,7 @@ static x264_frame_t *frame_new( x264_t *h, int b_fdec )
      frame->i_csp = i_csp;
      frame->i_width_lowres = frame->i_width[0]/2;
      frame->i_lines_lowres = frame->i_lines[0]/2;
-    frame->i_stride_lowres = align_stride( frame->i_width_lowres + 2*PADH, align, disalign<<1 );
+    frame->i_stride_lowres = align_stride( frame->i_width_lowres + PADH2, align, disalign<<1 );
  
      for( int i = 0; i < h->param.i_bframe + 2; i++ )
          for( int j = 0; j < h->param.i_bframe + 2; j++ )
@@ -152,9 +151,9 @@ static x264_frame_t *frame_new( x264_t *h, int b_fdec )
      {
          int chroma_padv = i_padv >> (i_csp == X264_CSP_NV12);
          int chroma_plane_size = (frame->i_stride[1] * (frame->i_lines[1] + 2*chroma_padv));
-        PREALLOC( frame->buffer[1], (chroma_plane_size + padh_align) * SIZEOF_PIXEL );
+        PREALLOC( frame->buffer[1], chroma_plane_size * SIZEOF_PIXEL );
          if( PARAM_INTERLACED )
-            PREALLOC( frame->buffer_fld[1], (chroma_plane_size + padh_align) * SIZEOF_PIXEL );
+            PREALLOC( frame->buffer_fld[1], chroma_plane_size * SIZEOF_PIXEL );
      }
  
      /* all 4 luma planes allocated together, since the cacheline split code
@@ -167,9 +166,9 @@ static x264_frame_t *frame_new( x264_t *h, int b_fdec )
              luma_plane_size *= 4;
  
          /* FIXME: Don't allocate both buffers in non-adaptive MBAFF. */
-        PREALLOC( frame->buffer[p], (luma_plane_size + padh_align) * SIZEOF_PIXEL );
+        PREALLOC( frame->buffer[p], luma_plane_size * SIZEOF_PIXEL );
          if( PARAM_INTERLACED )
-            PREALLOC( frame->buffer_fld[p], (luma_plane_size + padh_align) * SIZEOF_PIXEL );
+            PREALLOC( frame->buffer_fld[p], luma_plane_size * SIZEOF_PIXEL );
      }
  
      frame->b_duplicate = 0;
@@ -207,7 +206,7 @@ static x264_frame_t *frame_new( x264_t *h, int b_fdec )
          {
              int64_t luma_plane_size = align_plane_size( frame->i_stride_lowres * (frame->i_lines[0]/2 + 2*PADV), disalign );
  
-            PREALLOC( frame->buffer_lowres, (4 * luma_plane_size + padh_align) * SIZEOF_PIXEL );
+            PREALLOC( frame->buffer_lowres, 4 * luma_plane_size * SIZEOF_PIXEL );
  
              for( int j = 0; j <= !!h->param.i_bframe; j++ )
                  for( int i = 0; i <= h->param.i_bframe; i++ )
@@ -237,9 +236,9 @@ static x264_frame_t *frame_new( x264_t *h, int b_fdec )
      if( i_csp == X264_CSP_NV12 || i_csp == X264_CSP_NV16 )
      {
          int chroma_padv = i_padv >> (i_csp == X264_CSP_NV12);
-        frame->plane[1] = frame->buffer[1] + frame->i_stride[1] * chroma_padv + PADH + padh_align;
+        frame->plane[1] = frame->buffer[1] + frame->i_stride[1] * chroma_padv + PADH_ALIGN;
          if( PARAM_INTERLACED )
-            frame->plane_fld[1] = frame->buffer_fld[1] + frame->i_stride[1] * chroma_padv + PADH + padh_align;
+            frame->plane_fld[1] = frame->buffer_fld[1] + frame->i_stride[1] * chroma_padv + PADH_ALIGN;
      }
  
      for( int p = 0; p < luma_plane_count; p++ )
@@ -249,18 +248,18 @@ static x264_frame_t *frame_new( x264_t *h, int b_fdec )
          {
              for( int i = 0; i < 4; i++ )
              {
-                frame->filtered[p][i] = frame->buffer[p] + i*luma_plane_size + frame->i_stride[p] * i_padv + PADH + padh_align;
+                frame->filtered[p][i] = frame->buffer[p] + i*luma_plane_size + frame->i_stride[p] * i_padv + PADH_ALIGN;
                  if( PARAM_INTERLACED )
-                    frame->filtered_fld[p][i] = frame->buffer_fld[p] + i*luma_plane_size + frame->i_stride[p] * i_padv + PADH + padh_align;
+                    frame->filtered_fld[p][i] = frame->buffer_fld[p] + i*luma_plane_size + frame->i_stride[p] * i_padv + PADH_ALIGN;
              }
              frame->plane[p] = frame->filtered[p][0];
              frame->plane_fld[p] = frame->filtered_fld[p][0];
          }
          else
          {
-            frame->filtered[p][0] = frame->plane[p] = frame->buffer[p] + frame->i_stride[p] * i_padv + PADH + padh_align;
+            frame->filtered[p][0] = frame->plane[p] = frame->buffer[p] + frame->i_stride[p] * i_padv + PADH_ALIGN;
              if( PARAM_INTERLACED )
-                frame->filtered_fld[p][0] = frame->plane_fld[p] = frame->buffer_fld[p] + frame->i_stride[p] * i_padv + PADH + padh_align;
+                frame->filtered_fld[p][0] = frame->plane_fld[p] = frame->buffer_fld[p] + frame->i_stride[p] * i_padv + PADH_ALIGN;
          }
      }
  
@@ -270,7 +269,7 @@ static x264_frame_t *frame_new( x264_t *h, int b_fdec )
          frame->mv16x16++;
  
          if( h->param.analyse.i_me_method >= X264_ME_ESA )
-            frame->integral = (uint16_t*)frame->buffer[3] + frame->i_stride[0] * i_padv + PADH;
+            frame->integral = (uint16_t*)frame->buffer[3] + frame->i_stride[0] * i_padv + PADH_ALIGN;
      }
      else
      {
@@ -278,7 +277,7 @@ static x264_frame_t *frame_new( x264_t *h, int b_fdec )
          {
              int64_t luma_plane_size = align_plane_size( frame->i_stride_lowres * (frame->i_lines[0]/2 + 2*PADV), disalign );
              for( int i = 0; i < 4; i++ )
-                frame->lowres[i] = frame->buffer_lowres + frame->i_stride_lowres * PADV + PADH + padh_align + i * luma_plane_size;
+                frame->lowres[i] = frame->buffer_lowres + frame->i_stride_lowres * PADV + PADH_ALIGN + i * luma_plane_size;
  
              for( int j = 0; j <= !!h->param.i_bframe; j++ )
                  for( int i = 0; i <= h->param.i_bframe; i++ )
diff --git a/common/frame.h b/common/frame.h

index 10970eb16e5d08cd5f6326f03cbbefa9ac5e384c..ef20200810316471bf56bac1c1ee60d15ea9286f 100644 (file)
--- a/common/frame.h
+++ b/common/frame.h
@@ -31,6 +31,8 @@
  /* number of pixels past the edge of the frame, for motion estimation/compensation */
  #define PADH 32
  #define PADV 32
+#define PADH_ALIGN X264_MAX( PADH, NATIVE_ALIGN / SIZEOF_PIXEL )
+#define PADH2 (PADH_ALIGN + PADH)
  
  typedef struct x264_frame
  {
diff --git a/common/mc.c b/common/mc.c

index 2d2aafa9114dc18bd04cb100e0b5f29acbcf803d..32f2793f6737be6ec0e133c3f5458f921337e656 100644 (file)
--- a/common/mc.c
+++ b/common/mc.c
@@ -749,15 +749,15 @@ void x264_frame_filter( x264_t *h, x264_frame_t *frame, int mb_y, int b_end )
          int stride = frame->i_stride[0];
          if( start < 0 )
          {
-            memset( frame->integral - PADV * stride - PADH, 0, stride * sizeof(uint16_t) );
+            memset( frame->integral - PADV * stride - PADH_ALIGN, 0, stride * sizeof(uint16_t) );
              start = -PADV;
          }
          if( b_end )
              height += PADV-9;
          for( int y = start; y < height; y++ )
          {
-            pixel    *pix  = frame->plane[0] + y * stride - PADH;
-            uint16_t *sum8 = frame->integral + (y+1) * stride - PADH;
+            pixel    *pix  = frame->plane[0] + y * stride - PADH_ALIGN;
+            uint16_t *sum8 = frame->integral + (y+1) * stride - PADH_ALIGN;
              uint16_t *sum4;
              if( h->frames.b_have_sub8x8_esa )
              {
diff --git a/encoder/analyse.c b/encoder/analyse.c

index 970bd305a218e173e940e87bca873c5dc15e03e8..efd12377d76cc5714cdf99a9bb35e522681ca5d2 100644 (file)
--- a/encoder/analyse.c
+++ b/encoder/analyse.c
@@ -223,10 +223,10 @@ void x264_analyse_weight_frame( x264_t *h, int end )
          if( h->sh.weight[j][0].weightfn )
          {
              x264_frame_t *frame = h->fref[0][j];
-            int width = frame->i_width[0] + 2*PADH;
+            int width = frame->i_width[0] + PADH2;
              int i_padv = PADV << PARAM_INTERLACED;
              int offset, height;
-            pixel *src = frame->filtered[0][0] - frame->i_stride[0]*i_padv - PADH;
+            pixel *src = frame->filtered[0][0] - frame->i_stride[0]*i_padv - PADH_ALIGN;
              height = X264_MIN( 16 + end + i_padv, h->fref[0][j]->i_lines[0] + i_padv*2 ) - h->fenc->i_lines_weighted;
              offset = h->fenc->i_lines_weighted*frame->i_stride[0];
              h->fenc->i_lines_weighted += height;
@@ -234,7 +234,7 @@ void x264_analyse_weight_frame( x264_t *h, int end )
                  for( int k = j; k < h->i_ref[0]; k++ )
                      if( h->sh.weight[k][0].weightfn )
                      {
-                        pixel *dst = h->fenc->weighted[k] - h->fenc->i_stride[0]*i_padv - PADH;
+                        pixel *dst = h->fenc->weighted[k] - h->fenc->i_stride[0]*i_padv - PADH_ALIGN;
                          x264_weight_scale_plane( h, dst + offset, frame->i_stride[0],
                                                   src + offset, frame->i_stride[0],
                                                   width, height, &h->sh.weight[k][0] );
diff --git a/encoder/encoder.c b/encoder/encoder.c

index 04dcecec70d41b2dde2f9c3abde5a629255b36cc..18b04113eaabea70073b6af2168a2e9a8ebc9aaf 100644 (file)
--- a/encoder/encoder.c
+++ b/encoder/encoder.c
@@ -2185,14 +2185,14 @@ static void weighted_pred_init( x264_t *h )
                      assert( h->sh.weight[j][i].i_denom == denom );
                      if( !i )
                      {
-                        h->fenc->weighted[j] = h->mb.p_weight_buf[buffer_next++] + h->fenc->i_stride[0] * i_padv + PADH;
+                        h->fenc->weighted[j] = h->mb.p_weight_buf[buffer_next++] + h->fenc->i_stride[0] * i_padv + PADH_ALIGN;
                          //scale full resolution frame
                          if( h->param.i_threads == 1 )
                          {
-                            pixel *src = h->fref[0][j]->filtered[0][0] - h->fref[0][j]->i_stride[0]*i_padv - PADH;
-                            pixel *dst = h->fenc->weighted[j] - h->fenc->i_stride[0]*i_padv - PADH;
+                            pixel *src = h->fref[0][j]->filtered[0][0] - h->fref[0][j]->i_stride[0]*i_padv - PADH_ALIGN;
+                            pixel *dst = h->fenc->weighted[j] - h->fenc->i_stride[0]*i_padv - PADH_ALIGN;
                              int stride = h->fenc->i_stride[0];
-                            int width = h->fenc->i_width[0] + PADH*2;
+                            int width = h->fenc->i_width[0] + PADH2;
                              int height = h->fenc->i_lines[0] + i_padv*2;
                              x264_weight_scale_plane( h, dst, stride, src, stride, width, height, &h->sh.weight[j][0] );
                              h->fenc->i_lines_weighted = height;
diff --git a/encoder/slicetype.c b/encoder/slicetype.c

index c7984dd15e988c72b5032cff3f67441a2d2d55c8..0315ba6bf291679e3c7ddc3b7216b00a087896f6 100644 (file)
--- a/encoder/slicetype.c
+++ b/encoder/slicetype.c
@@ -492,11 +492,11 @@ void x264_weights_analyse( x264_t *h, x264_frame_t *fenc, x264_frame_t *ref, int
          //scale lowres in lookahead for slicetype_frame_cost
          pixel *src = ref->buffer_lowres;
          pixel *dst = h->mb.p_weight_buf[0];
-        int width = ref->i_width_lowres + PADH*2;
+        int width = ref->i_width_lowres + PADH2;
          int height = ref->i_lines_lowres + PADV*2;
          x264_weight_scale_plane( h, dst, ref->i_stride_lowres, src, ref->i_stride_lowres,
                                   width, height, &weights[0] );
-        fenc->weighted[0] = h->mb.p_weight_buf[0] + PADH + ref->i_stride_lowres * PADV;
+        fenc->weighted[0] = h->mb.p_weight_buf[0] + PADH_ALIGN + ref->i_stride_lowres * PADV;
      }
  }
author	Anton Mitrofanov <BugMaster@narod.ru>
	Thu, 21 Jan 2021 20:26:27 +0000 (23:26 +0300)
committer	Anton Mitrofanov <BugMaster@narod.ru>
	Tue, 26 Jan 2021 18:49:17 +0000 (21:49 +0300)
common/frame.c		patch | blob | history
common/frame.h		patch | blob | history
common/mc.c		patch | blob | history
encoder/analyse.c		patch | blob | history
encoder/encoder.c		patch | blob | history
encoder/slicetype.c		patch | blob | history