rearrange cabac struct to reduce code size

author Fiona Glaser <fiona@x264.com>

Tue, 25 Mar 2008 01:12:07 +0000 (19:12 -0600)

committer Loren Merritt <pengvado@akuvian.org>

Tue, 25 Mar 2008 01:12:07 +0000 (19:12 -0600)
author Fiona Glaser <fiona@x264.com>
Tue, 25 Mar 2008 01:12:07 +0000 (19:12 -0600)
committer Loren Merritt <pengvado@akuvian.org>
Tue, 25 Mar 2008 01:12:07 +0000 (19:12 -0600)
diff --git a/common/cabac.h b/common/cabac.h

index 709c516e742b09d1ad62473fc94fecabe996f974..1c762b89ec1261bc79f8252fe7929ee10a050521 100644 (file)
--- a/common/cabac.h
+++ b/common/cabac.h
@@ -26,11 +26,6 @@
  
  typedef struct
  {
-    /* context */
-    DECLARE_ALIGNED_16( uint8_t state[460] );
-
-    int f8_bits_encoded; // only if using x264_cabac_size_decision()
-
      /* state */
      int i_low;
      int i_range;
@@ -43,6 +38,11 @@ typedef struct
      uint8_t *p;
      uint8_t *p_end;
  
+    /* 16-byte aligned so memcpy_aligned can copy from here to the end of the struct */
+    DECLARE_ALIGNED_16( int f8_bits_encoded ); // only if using x264_cabac_size_decision()
+    
+    /* context */
+    uint8_t state[460];
  } x264_cabac_t;
  
  extern const uint8_t x264_cabac_transition[128][2];
diff --git a/common/x86/cabac-a.asm b/common/x86/cabac-a.asm

index 9a6fbcd383ca60d65250f315df894524bf7378e4..9c21096a225cbd3aa63b346eb73cca8609053a46 100644 (file)
--- a/common/x86/cabac-a.asm
+++ b/common/x86/cabac-a.asm
@@ -40,20 +40,25 @@ cextern x264_cabac_renorm_shift
  ; t3 must be ecx, since it's used for shift.
  %ifdef ARCH_X86_64
      DEF_TMP 0,1,2,3,4,5,6,7, 0,1,2,3,4,5,6,10
-    %define pointer 8
+    %define pointer resq
  %else
      DEF_TMP 0,1,2,3,4,5,6,7, 0,3,2,1,4,5,6,3
-    %define pointer 4
+    %define pointer resd
  %endif
  
-%define cb.state r0+0
-%define cb.low   r0+464
-%define cb.range r0+468
-%define cb.queue r0+472
-%define cb.bytes_outstanding r0+476
-%define cb.p     r0+480+pointer
-%define cb.end   r0+480+pointer*2
-
+struc cb ; field offsets of the cabac context, used as [r0+cb.field]; presumably mirrors x264_cabac_t in common/cabac.h
+    .low: resd 1
+    .range: resd 1
+    .queue: resd 1
+    .bytes_outstanding: resd 1
+    .start: pointer 1 ; pointer expands to resq on ARCH_X86_64, resd otherwise
+    .p: pointer 1
+    .end: pointer 1
+    alignb 16 ; pad with reserved bytes (plain align pads with NOP data, which does not belong in a struc); matches DECLARE_ALIGNED_16 on f8_bits_encoded
+    .bits_encoded: resd 1
+    .state: resb 460 ; same size as uint8_t state[460] in the C struct
+endstruc
+    
  %macro LOAD_GLOBAL 4
  %ifdef PIC64
      ; this would be faster if the arrays were declared in asm, so that I didn't have to duplicate the lea
@@ -78,8 +83,8 @@ cglobal x264_cabac_encode_decision, 0,7
      movifnidn t0d, r0m
      movifnidn t1d, r1m
      picgetgot t2
-    mov   t5d, [cb.range]
-    movzx t3d, byte [cb.state+t1]
+    mov   t5d, [r0+cb.range]
+    movzx t3d, byte [r0+cb.state+t1]
      mov   t4d, t5d
      shr   t5d, 6
      and   t5d, 3
@@ -93,7 +98,7 @@ cglobal x264_cabac_encode_decision, 0,7
      movifnidn t2d, r2m
      cmp   t6d, t2d
  %endif
-    mov   t6d, [cb.low]
+    mov   t6d, [r0+cb.low]
      lea   t7,  [t6+t4]
      cmovne t4d, t5d
      cmovne t6d, t7d
@@ -103,18 +108,18 @@ cglobal x264_cabac_encode_decision, 0,7
  %else
      LOAD_GLOBAL t3d, x264_cabac_transition, t2, t3*2
  %endif
-    if32 mov t1d, r1m
-    mov   [cb.state+t1], t3b
+    movifnidn t1d, r1m
+    mov   [r0+cb.state+t1], t3b
  .renorm:
      mov   t3d, t4d
      shr   t3d, 3
      LOAD_GLOBAL t3d, x264_cabac_renorm_shift, 0, t3
      shl   t4d, t3b
      shl   t6d, t3b
-    add   t3d, [cb.queue]
-    mov   [cb.range], t4d
-    mov   [cb.low], t6d
-    mov   [cb.queue], t3d
+    add   t3d, [r0+cb.queue]
+    mov   [r0+cb.range], t4d
+    mov   [r0+cb.low], t6d
+    mov   [r0+cb.queue], t3d
      cmp   t3d, 8
      jge .putbyte
  .ret:
@@ -130,15 +135,15 @@ cglobal x264_cabac_encode_decision, 0,7
      sub   t3d, 10
      and   t6d, t1d
      cmp   t2b, 0xff ; FIXME is a 32bit op faster?
-    mov   [cb.queue], t3d
-    mov   [cb.low], t6d
+    mov   [r0+cb.queue], t3d
+    mov   [r0+cb.low], t6d
      mov   t1d, t2d
-    mov   t4,  [cb.p]
+    mov   t4,  [r0+cb.p]
      je .postpone
-    mov   t5d, [cb.bytes_outstanding]
+    mov   t5d, [r0+cb.bytes_outstanding]
      shr   t1d, 8 ; carry
      lea   t6, [t4+t5+1]
-    cmp   t6, [cb.end]
+    cmp   t6, [r0+cb.end]
      jge .ret
      add   [t4-1], t1b
      test  t5d, t5d
@@ -152,10 +157,10 @@ cglobal x264_cabac_encode_decision, 0,7
  .no_outstanding:
      mov   [t4], t2b
      inc   t4
-    mov   [cb.bytes_outstanding], t5d ; is zero, but a reg has smaller opcode than an immediate
-    mov   [cb.p], t4
+    mov   [r0+cb.bytes_outstanding], t5d ; is zero, but a reg has smaller opcode than an immediate
+    mov   [r0+cb.p], t4
      RET
  .postpone:
-    inc   dword [cb.bytes_outstanding]
+    inc   dword [r0+cb.bytes_outstanding]
      RET
  
diff --git a/encoder/rdo.c b/encoder/rdo.c

index e1227314919ba42cedc91c468f5220307665e881..7967a92450c4ae5788c786767ac2b1de3f2e5dba 100644 (file)
--- a/encoder/rdo.c
+++ b/encoder/rdo.c
@@ -49,7 +49,9 @@ static int cabac_prefix_size[15][128];
  #define x264_macroblock_write_cabac  x264_macroblock_size_cabac
  #include "cabac.c"
  
-
+#define COPY_CABAC h->mc.memcpy_aligned( &cabac_tmp.f8_bits_encoded, &h->cabac.f8_bits_encoded, \
+        sizeof(x264_cabac_t) - offsetof(x264_cabac_t,f8_bits_encoded) ) /* copies f8_bits_encoded through the end of x264_cabac_t from h->cabac into cabac_tmp; expects `h` and a local `cabac_tmp` in the caller's scope */
+    
  static int ssd_mb( x264_t *h )
  {
      return h->pixf.ssd[PIXEL_16x16]( h->mb.pic.p_fenc[0], FENC_STRIDE,
@@ -83,7 +85,7 @@ static int x264_rd_cost_mb( x264_t *h, int i_lambda2 )
      else if( h->param.b_cabac )
      {
          x264_cabac_t cabac_tmp;
-        h->mc.memcpy_aligned( &cabac_tmp, &h->cabac, offsetof(x264_cabac_t,i_low) );
+        COPY_CABAC;
          x264_macroblock_size_cabac( h, &cabac_tmp );
          i_bits = ( (uint64_t)cabac_tmp.f8_bits_encoded * i_lambda2 + 32768 ) >> 16;
      }
@@ -125,7 +127,7 @@ int x264_rd_cost_part( x264_t *h, int i_lambda2, int i8, int i_pixel )
      if( h->param.b_cabac )
      {
          x264_cabac_t cabac_tmp;
-        h->mc.memcpy_aligned( &cabac_tmp, &h->cabac, offsetof(x264_cabac_t,i_low) );
+        COPY_CABAC;
          x264_partition_size_cabac( h, &cabac_tmp, i8, i_pixel );
          i_bits = ( (uint64_t)cabac_tmp.f8_bits_encoded * i_lambda2 + 32768 ) >> 16;
      }
@@ -147,7 +149,7 @@ int x264_rd_cost_i8x8( x264_t *h, int i_lambda2, int i8, int i_mode )
      if( h->param.b_cabac )
      {
          x264_cabac_t cabac_tmp;
-        h->mc.memcpy_aligned( &cabac_tmp, &h->cabac, offsetof(x264_cabac_t,i_low) );
+        COPY_CABAC;
          x264_partition_i8x8_size_cabac( h, &cabac_tmp, i8, i_mode );
          i_bits = ( (uint64_t)cabac_tmp.f8_bits_encoded * i_lambda2 + 32768 ) >> 16;
      }
@@ -169,7 +171,7 @@ int x264_rd_cost_i4x4( x264_t *h, int i_lambda2, int i4, int i_mode )
      if( h->param.b_cabac )
      {
          x264_cabac_t cabac_tmp;
-        h->mc.memcpy_aligned( &cabac_tmp, &h->cabac, offsetof(x264_cabac_t,i_low) );
+        COPY_CABAC;
          x264_partition_i4x4_size_cabac( h, &cabac_tmp, i4, i_mode );
          i_bits = ( (uint64_t)cabac_tmp.f8_bits_encoded * i_lambda2 + 32768 ) >> 16;
      }
@@ -195,7 +197,7 @@ int x264_rd_cost_i8x8_chroma( x264_t *h, int i_lambda2, int i_mode, int b_dct )
      if( h->param.b_cabac )
      {
          x264_cabac_t cabac_tmp;
-        h->mc.memcpy_aligned( &cabac_tmp, &h->cabac, offsetof(x264_cabac_t,i_low) );
+        COPY_CABAC;
          x264_i8x8_chroma_size_cabac( h, &cabac_tmp );
          i_bits = ( (uint64_t)cabac_tmp.f8_bits_encoded * i_lambda2 + 32768 ) >> 16;
      }
author	Fiona Glaser <fiona@x264.com>
	Tue, 25 Mar 2008 01:12:07 +0000 (19:12 -0600)
committer	Loren Merritt <pengvado@akuvian.org>
	Tue, 25 Mar 2008 01:12:07 +0000 (19:12 -0600)
common/cabac.h		patch \| blob \| history
common/x86/cabac-a.asm		patch \| blob \| history
encoder/rdo.c		patch \| blob \| history