From: Fiona Glaser Date: Tue, 25 Mar 2008 01:12:07 +0000 (-0600) Subject: rearrange cabac struct to reduce code size X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=a3e11cbf36d78cb6b4147e8f1a73ee7c9387397e;p=libx264 rearrange cabac struct to reduce code size --- diff --git a/common/cabac.h b/common/cabac.h index 709c516e..1c762b89 100644 --- a/common/cabac.h +++ b/common/cabac.h @@ -26,11 +26,6 @@ typedef struct { - /* context */ - DECLARE_ALIGNED_16( uint8_t state[460] ); - - int f8_bits_encoded; // only if using x264_cabac_size_decision() - /* state */ int i_low; int i_range; @@ -43,6 +38,11 @@ typedef struct uint8_t *p; uint8_t *p_end; + /* aligned for aligned_memcpy starting here */ + DECLARE_ALIGNED_16( int f8_bits_encoded ); // only if using x264_cabac_size_decision() + + /* context */ + uint8_t state[460]; } x264_cabac_t; extern const uint8_t x264_cabac_transition[128][2]; diff --git a/common/x86/cabac-a.asm b/common/x86/cabac-a.asm index 9a6fbcd3..9c21096a 100644 --- a/common/x86/cabac-a.asm +++ b/common/x86/cabac-a.asm @@ -40,20 +40,25 @@ cextern x264_cabac_renorm_shift ; t3 must be ecx, since it's used for shift. %ifdef ARCH_X86_64 DEF_TMP 0,1,2,3,4,5,6,7, 0,1,2,3,4,5,6,10 - %define pointer 8 + %define pointer resq %else DEF_TMP 0,1,2,3,4,5,6,7, 0,3,2,1,4,5,6,3 - %define pointer 4 + %define pointer resd %endif -%define cb.state r0+0 -%define cb.low r0+464 -%define cb.range r0+468 -%define cb.queue r0+472 -%define cb.bytes_outstanding r0+476 -%define cb.p r0+480+pointer -%define cb.end r0+480+pointer*2 - +struc cb + .low: resd 1 + .range: resd 1 + .queue: resd 1 + .bytes_outstanding: resd 1 + .start: pointer 1 + .p: pointer 1 + .end: pointer 1 + align 16 + .bits_encoded: resd 1 + .state: resb 460 +endstruc + %macro LOAD_GLOBAL 4 %ifdef PIC64 ; this would be faster if the arrays were declared in asm, so that I didn't have to duplicate the lea @@ -78,8 +83,8 @@ cglobal x264_cabac_encode_decision, 0,7 movifnidn t0d, r0m movifnidn t1d, r1m picgetgot t2 - mov t5d, [cb.range] - movzx t3d, byte [cb.state+t1] + mov t5d, [r0+cb.range] + movzx t3d, byte [r0+cb.state+t1] mov t4d, t5d shr t5d, 6 and t5d, 3 @@ -93,7 +98,7 @@ cglobal x264_cabac_encode_decision, 0,7 movifnidn t2d, r2m cmp t6d, t2d %endif - mov t6d, [cb.low] + mov t6d, [r0+cb.low] lea t7, [t6+t4] cmovne t4d, t5d cmovne t6d, t7d @@ -103,18 +108,18 @@ cglobal x264_cabac_encode_decision, 0,7 %else LOAD_GLOBAL t3d, x264_cabac_transition, t2, t3*2 %endif - if32 mov t1d, r1m - mov [cb.state+t1], t3b + movifnidn t1d, r1m + mov [r0+cb.state+t1], t3b .renorm: mov t3d, t4d shr t3d, 3 LOAD_GLOBAL t3d, x264_cabac_renorm_shift, 0, t3 shl t4d, t3b shl t6d, t3b - add t3d, [cb.queue] - mov [cb.range], t4d - mov [cb.low], t6d - mov [cb.queue], t3d + add t3d, [r0+cb.queue] + mov [r0+cb.range], t4d + mov [r0+cb.low], t6d + mov [r0+cb.queue], t3d cmp t3d, 8 jge .putbyte .ret: @@ -130,15 +135,15 @@ cglobal x264_cabac_encode_decision, 0,7 sub t3d, 10 and t6d, t1d cmp t2b, 0xff ; FIXME is a 32bit op faster? - mov [cb.queue], t3d - mov [cb.low], t6d + mov [r0+cb.queue], t3d + mov [r0+cb.low], t6d mov t1d, t2d - mov t4, [cb.p] + mov t4, [r0+cb.p] je .postpone - mov t5d, [cb.bytes_outstanding] + mov t5d, [r0+cb.bytes_outstanding] shr t1d, 8 ; carry lea t6, [t4+t5+1] - cmp t6, [cb.end] + cmp t6, [r0+cb.end] jge .ret add [t4-1], t1b test t5d, t5d @@ -152,10 +157,10 @@ cglobal x264_cabac_encode_decision, 0,7 .no_outstanding: mov [t4], t2b inc t4 - mov [cb.bytes_outstanding], t5d ; is zero, but a reg has smaller opcode than an immediate - mov [cb.p], t4 + mov [r0+cb.bytes_outstanding], t5d ; is zero, but a reg has smaller opcode than an immediate + mov [r0+cb.p], t4 RET .postpone: - inc dword [cb.bytes_outstanding] + inc dword [r0+cb.bytes_outstanding] RET diff --git a/encoder/rdo.c b/encoder/rdo.c index e1227314..7967a924 100644 --- a/encoder/rdo.c +++ b/encoder/rdo.c @@ -49,7 +49,9 @@ static int cabac_prefix_size[15][128]; #define x264_macroblock_write_cabac x264_macroblock_size_cabac #include "cabac.c" - +#define COPY_CABAC h->mc.memcpy_aligned( &cabac_tmp.f8_bits_encoded, &h->cabac.f8_bits_encoded, \ + sizeof(x264_cabac_t) - offsetof(x264_cabac_t,f8_bits_encoded) ) + static int ssd_mb( x264_t *h ) { return h->pixf.ssd[PIXEL_16x16]( h->mb.pic.p_fenc[0], FENC_STRIDE, @@ -83,7 +85,7 @@ static int x264_rd_cost_mb( x264_t *h, int i_lambda2 ) else if( h->param.b_cabac ) { x264_cabac_t cabac_tmp; - h->mc.memcpy_aligned( &cabac_tmp, &h->cabac, offsetof(x264_cabac_t,i_low) ); + COPY_CABAC; x264_macroblock_size_cabac( h, &cabac_tmp ); i_bits = ( (uint64_t)cabac_tmp.f8_bits_encoded * i_lambda2 + 32768 ) >> 16; } @@ -125,7 +127,7 @@ int x264_rd_cost_part( x264_t *h, int i_lambda2, int i8, int i_pixel ) if( h->param.b_cabac ) { x264_cabac_t cabac_tmp; - h->mc.memcpy_aligned( &cabac_tmp, &h->cabac, offsetof(x264_cabac_t,i_low) ); + COPY_CABAC; x264_partition_size_cabac( h, &cabac_tmp, i8, i_pixel ); i_bits = ( (uint64_t)cabac_tmp.f8_bits_encoded * i_lambda2 + 32768 ) >> 16; } @@ -147,7 +149,7 @@ int x264_rd_cost_i8x8( x264_t *h, int i_lambda2, int i8, int i_mode ) if( h->param.b_cabac ) { x264_cabac_t cabac_tmp; - h->mc.memcpy_aligned( &cabac_tmp, &h->cabac, offsetof(x264_cabac_t,i_low) ); + COPY_CABAC; x264_partition_i8x8_size_cabac( h, &cabac_tmp, i8, i_mode ); i_bits = ( (uint64_t)cabac_tmp.f8_bits_encoded * i_lambda2 + 32768 ) >> 16; } @@ -169,7 +171,7 @@ int x264_rd_cost_i4x4( x264_t *h, int i_lambda2, int i4, int i_mode ) if( h->param.b_cabac ) { x264_cabac_t cabac_tmp; - h->mc.memcpy_aligned( &cabac_tmp, &h->cabac, offsetof(x264_cabac_t,i_low) ); + COPY_CABAC; x264_partition_i4x4_size_cabac( h, &cabac_tmp, i4, i_mode ); i_bits = ( (uint64_t)cabac_tmp.f8_bits_encoded * i_lambda2 + 32768 ) >> 16; } @@ -195,7 +197,7 @@ int x264_rd_cost_i8x8_chroma( x264_t *h, int i_lambda2, int i_mode, int b_dct ) if( h->param.b_cabac ) { x264_cabac_t cabac_tmp; - h->mc.memcpy_aligned( &cabac_tmp, &h->cabac, offsetof(x264_cabac_t,i_low) ); + COPY_CABAC; x264_i8x8_chroma_size_cabac( h, &cabac_tmp ); i_bits = ( (uint64_t)cabac_tmp.f8_bits_encoded * i_lambda2 + 32768 ) >> 16; }