From: Fiona Glaser Date: Thu, 27 May 2010 21:27:32 +0000 (-0700) Subject: x86 assembly code for NAL escaping X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=6589ad6dc6a2ac7599c5a19566306c274bd86853;p=libx264 x86 assembly code for NAL escaping Up to ~10x faster than C depending on CPU. Helps the most at very high bitrates (e.g. lossless). Also make the C code faster and simpler. --- diff --git a/Makefile b/Makefile index 0b43a3e0..519e1812 100644 --- a/Makefile +++ b/Makefile @@ -8,7 +8,7 @@ SRCS = common/mc.c common/predict.c common/pixel.c common/macroblock.c \ common/frame.c common/dct.c common/cpu.c common/cabac.c \ common/common.c common/mdate.c common/rectangle.c \ common/set.c common/quant.c common/deblock.c common/vlc.c \ - common/mvpred.c \ + common/mvpred.c common/bitstream.c \ encoder/analyse.c encoder/me.c encoder/ratecontrol.c \ encoder/set.c encoder/macroblock.c encoder/cabac.c \ encoder/cavlc.c encoder/encoder.c encoder/lookahead.c @@ -52,7 +52,7 @@ endif ifneq ($(AS),) X86SRC0 = const-a.asm cabac-a.asm dct-a.asm deblock-a.asm mc-a.asm \ mc-a2.asm pixel-a.asm predict-a.asm quant-a.asm sad-a.asm \ - cpu-a.asm dct-32.asm + cpu-a.asm dct-32.asm bitstream-a.asm X86SRC = $(X86SRC0:%=common/x86/%) ifeq ($(ARCH),X86) diff --git a/common/bitstream.c b/common/bitstream.c new file mode 100644 index 00000000..e094c261 --- /dev/null +++ b/common/bitstream.c @@ -0,0 +1,92 @@ +/***************************************************************************** + * bitstream.c: h264 encoder library + ***************************************************************************** + * Copyright (C) 2010 x264 project + * + * Authors: Laurent Aimar + * Fiona Glaser + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. 
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
+ *****************************************************************************/
+
+#include "common.h"
+/* Copy [src,end) to dst, inserting 0x03 before any byte <= 0x03 that follows two zero output bytes; returns the new end of dst. */
+static uint8_t *x264_nal_escape_c( uint8_t *dst, uint8_t *src, uint8_t *end )
+{
+    if( src < end ) *dst++ = *src++; /* the first two bytes are copied without an escape check */
+    if( src < end ) *dst++ = *src++;
+    while( src < end )
+    {
+        if( src[0] <= 0x03 && !dst[-2] && !dst[-1] ) /* the last two bytes already written are both zero */
+            *dst++ = 0x03; /* escape byte goes in front of src[0] */
+        *dst++ = *src++;
+    }
+    return dst; /* one past the last byte written */
+}
+
+#ifdef HAVE_MMX /* only declared here; x264_bitstream_init below selects between them */
+uint8_t *x264_nal_escape_mmxext( uint8_t *dst, uint8_t *src, uint8_t *end );
+uint8_t *x264_nal_escape_sse2( uint8_t *dst, uint8_t *src, uint8_t *end );
+#endif
+
+/****************************************************************************
+ * x264_nal_encode: write start code or size field, NAL header byte and the
+ ****************************************************************************/
+int x264_nal_encode( x264_t *h, uint8_t *dst, x264_nal_t *nal, int b_long_startcode )
+{   /* ...escaped payload of nal to dst; returns the number of bytes written. */
+    uint8_t *src = nal->p_payload;
+    uint8_t *end = nal->p_payload + nal->i_payload;
+    uint8_t *orig_dst = dst; /* kept to compute the size and to back-fill the size field */
+
+    if( h->param.b_annexb )
+    {
+        if( b_long_startcode )
+            *dst++ = 0x00; /* extra leading zero: 00 00 00 01 instead of 00 00 01 */
+        *dst++ = 0x00;
+        *dst++ = 0x00;
+        *dst++ = 0x01;
+    }
+    else /* save room for size later */
+        dst += 4;
+
+    /* nal header */
+    *dst++ = ( 0x00 << 7 ) | ( nal->i_ref_idc << 5 ) | nal->i_type;
+
+    dst = h->bsf.nal_escape( dst, src, end ); /* function pointer filled in by x264_bitstream_init */
+    int size = (dst - orig_dst) - 4; /* everything written except the first four bytes */
+
+    /* Write the size header for mp4/etc */
+    if( !h->param.b_annexb )
+    {
+        /* Size doesn't include the size of the header we're writing now. */
+        orig_dst[0] = size>>24; /* most significant byte first */
+        orig_dst[1] = size>>16;
+        orig_dst[2] = size>> 8;
+        orig_dst[3] = size>> 0;
+    }
+
+    return size+4; /* equals dst - orig_dst: total bytes written, prefix included */
+}
+/* Fill pf->nal_escape for the given cpu flags; each later match overrides the earlier one. */
+void x264_bitstream_init( int cpu, x264_bitstream_function_t *pf )
+{
+    pf->nal_escape = x264_nal_escape_c; /* default when no flag below matches */
+#ifdef HAVE_MMX
+    if( cpu&X264_CPU_MMXEXT )
+        pf->nal_escape = x264_nal_escape_mmxext;
+    if( (cpu&X264_CPU_SSE2) && (cpu&X264_CPU_SSE2_IS_FAST) )
+        pf->nal_escape = x264_nal_escape_sse2;
+#endif
+}
diff --git a/common/bs.h b/common/bitstream.h
similarity index 96%
rename from common/bs.h
rename to common/bitstream.h
index a090988b..73dc6910 100644
--- a/common/bs.h
+++ b/common/bitstream.h
@@ -1,5 +1,5 @@
 /*****************************************************************************
- * bs.h :
+ * bitstream.h: h264 encoder library
  *****************************************************************************
  * Copyright (C) 2003-2008 x264 project
  *
@@ -63,6 +63,14 @@ extern const vlc_t x264_total_zeros[15][16];
 extern const vlc_t x264_total_zeros_dc[3][4];
 extern const vlc_t x264_run_before[7][16];
 
+typedef struct
+{
+    uint8_t *(*nal_escape) ( uint8_t *dst, uint8_t *src, uint8_t *end ); /* copies [src,end) to dst with escaping; returns the end of the written data */
+} x264_bitstream_function_t;
+
+int x264_nal_encode( x264_t *h, uint8_t *dst, x264_nal_t *nal, int b_long_startcode ); /* returns bytes written to dst */
+void x264_bitstream_init( int cpu, x264_bitstream_function_t *pf ); /* fills *pf from the cpu flags */
+
 /* A larger level table size theoretically could help a bit at extremely
  * high bitrates, but the cost in cache is usually too high for it to be
  * useful.
diff --git a/common/common.c b/common/common.c index fccf2b05..2458f65b 100644 --- a/common/common.c +++ b/common/common.c @@ -1026,60 +1026,6 @@ void x264_picture_clean( x264_picture_t *pic ) memset( pic, 0, sizeof( x264_picture_t ) ); } -/**************************************************************************** - * x264_nal_encode: - ****************************************************************************/ -int x264_nal_encode( uint8_t *dst, x264_nal_t *nal, int b_annexb, int b_long_startcode ) -{ - uint8_t *src = nal->p_payload; - uint8_t *end = nal->p_payload + nal->i_payload; - uint8_t *orig_dst = dst; - int i_count = 0, size; - - if( b_annexb ) - { - if( b_long_startcode ) - *dst++ = 0x00; - *dst++ = 0x00; - *dst++ = 0x00; - *dst++ = 0x01; - } - else /* save room for size later */ - dst += 4; - - /* nal header */ - *dst++ = ( 0x00 << 7 ) | ( nal->i_ref_idc << 5 ) | nal->i_type; - - while( src < end ) - { - if( i_count == 2 && *src <= 0x03 ) - { - *dst++ = 0x03; - i_count = 0; - } - if( *src == 0 ) - i_count++; - else - i_count = 0; - *dst++ = *src++; - } - size = (dst - orig_dst) - 4; - - /* Write the size header for mp4/etc */ - if( !b_annexb ) - { - /* Size doesn't include the size of the header we're writing now. 
*/ - orig_dst[0] = size>>24; - orig_dst[1] = size>>16; - orig_dst[2] = size>> 8; - orig_dst[3] = size>> 0; - } - - return size+4; -} - - - /**************************************************************************** * x264_malloc: ****************************************************************************/ diff --git a/common/common.h b/common/common.h index 539ea656..93712fe5 100644 --- a/common/common.h +++ b/common/common.h @@ -137,7 +137,7 @@ static const int x264_scan8[16+2*4+3] = */ #include "x264.h" -#include "bs.h" +#include "bitstream.h" #include "set.h" #include "predict.h" #include "pixel.h" @@ -166,8 +166,6 @@ int64_t x264_mdate( void ); * the encoding options */ char *x264_param2string( x264_param_t *p, int b_res ); -int x264_nal_encode( uint8_t *dst, x264_nal_t *nal, int b_annexb, int b_long_startcode ); - /* log */ void x264_log( x264_t *h, int i_level, const char *psz_fmt, ... ); @@ -796,6 +794,7 @@ struct x264_t x264_zigzag_function_t zigzagf; x264_quant_function_t quantf; x264_deblock_function_t loopf; + x264_bitstream_function_t bsf; #ifdef HAVE_VISUALIZE struct visualize_t *visualize; diff --git a/common/x86/bitstream-a.asm b/common/x86/bitstream-a.asm new file mode 100644 index 00000000..2fc21856 --- /dev/null +++ b/common/x86/bitstream-a.asm @@ -0,0 +1,112 @@ +;***************************************************************************** +;* bitstream-a.asm: h264 encoder library +;***************************************************************************** +;* Copyright (C) 2010 x264 project +;* +;* Authors: Fiona Glaser +;* +;* This program is free software; you can redistribute it and/or modify +;* it under the terms of the GNU General Public License as published by +;* the Free Software Foundation; either version 2 of the License, or +;* (at your option) any later version. 
+;*
+;* This program is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+;* GNU General Public License for more details.
+;*
+;* You should have received a copy of the GNU General Public License
+;* along with this program; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
+;*****************************************************************************
+
+%include "x86inc.asm"
+%include "x86util.asm"
+
+SECTION .text
+
+;-----------------------------------------------------------------------------
+; uint8_t *x264_nal_escape( uint8_t *dst, uint8_t *src, uint8_t *end )
+;-----------------------------------------------------------------------------
+
+%macro NAL_LOOP 2 ; %1 = loop label, %2 = instruction used for the store to dst
+ALIGN 16
+%1:
+    mova m0, [r1+r2] ; load mmsize source bytes
+    mova m1, m0
+%if mmsize == 8
+    psllq m0, 8 ; m0 = the same bytes moved up one byte lane, zero shifted in
+%else
+    pslldq m0, 1
+%endif
+    %2 [r0+r1], m1 ; store the unmodified bytes to dst
+    por m1, m0 ; lane i = src[i] | src[i-1]
+    pcmpeqb m1, m2 ; m2 is zero: flags lanes where both bytes are zero (lane 0: src[0] alone)
+    pmovmskb r3d, m1
+    test r3d, r3d
+    jnz .escape ; r1 not advanced yet: this block is redone by the scalar loop
+    add r1, mmsize
+    jl %1 ; r1 is a negative offset from end; loop until it reaches zero
+%endmacro
+
+%macro NAL_ESCAPE 1 ; %1 = suffix of the generated function name
+
+cglobal nal_escape_%1, 3,5
+    pxor m2, m2
+    sub r1, r2 ; r1 = offset of current src pointer from end of src
+    sub r0, r1 ; r0 = projected end of dst, assuming no more escapes
+
+    mov r3b, [r1+r2] ; the first byte is copied before any length check
+    mov [r0+r1], r3b
+    inc r1
+    jge .ret
+
+    ; Start off by jumping into the escape loop in
+    ; case there's an escape at the start.
+    ; And do a few more in scalar until src is aligned again.
+    lea r4d, [r1+r2]
+    or r4d, -mmsize
+    neg r4d ; r4d = mmsize - (src & (mmsize-1)) = scalar bytes until src is aligned
+    jmp .first_escape
+
+    NAL_LOOP .loop_aligned, mova
+%if mmsize==16
+    jmp .ret ; input consumed: must not fall through into the unaligned loop below
+    NAL_LOOP .loop_unaligned, movu
+%endif
+.ret:
+    movifnidn rax, r0
+    RET
+ALIGN 16
+.escape:
+    mov r4d, mmsize ; handle one vector's worth of bytes in scalar
+.first_escape:
+    mov r3b, [r1+r2]
+.escape_loop:
+    mov [r0+r1], r3b
+    inc r1
+    jge .ret
+    mov r3b, [r1+r2]
+    cmp r3b, 3
+    jna .escape_check ; next byte <= 3 (unsigned): may need an escape
+.no_escape:
+    dec r4d
+    jg .escape_loop
+%if mmsize==16
+    lea r4d, [r0+r1]
+    test r4d, mmsize-1 ; choose the store flavour by the alignment of dst
+    jnz .loop_unaligned
+%endif
+    jmp .loop_aligned
+.escape_check:
+    cmp word [r0+r1-2], 0 ; previous two output bytes both zero?
+    jnz .no_escape
+    mov byte [r0+r1], 3 ; insert the escape byte
+    inc r0 ; output is now one byte longer than projected
+    jmp .no_escape
+%endmacro
+
+INIT_MMX
+NAL_ESCAPE mmxext
+INIT_XMM
+NAL_ESCAPE sse2
diff --git a/common/x86/deblock-a.asm b/common/x86/deblock-a.asm
index aedd688e..a039ed85 100644
--- a/common/x86/deblock-a.asm
+++ b/common/x86/deblock-a.asm
@@ -4,6 +4,7 @@
 ;* Copyright (C) 2005-2008 x264 project
 ;*
 ;* Authors: Loren Merritt
+;*          Fiona Glaser
 ;*
 ;* This program is free software; you can redistribute it and/or modify
 ;* it under the terms of the GNU General Public License as published by
diff --git a/encoder/encoder.c b/encoder/encoder.c
index 4360a100..2d2afed1 100644
--- a/encoder/encoder.c
+++ b/encoder/encoder.c
@@ -987,6 +987,7 @@ x264_t *x264_encoder_open( x264_param_t *param )
     x264_mc_init( h->param.cpu, &h->mc );
     x264_quant_init( h, h->param.cpu, &h->quantf );
     x264_deblock_init( h->param.cpu, &h->loopf );
+    x264_bitstream_init( h->param.cpu, &h->bsf );
     x264_dct_init_weights();
 
     mbcmp_init( h );
@@ -1273,7 +1274,7 @@ static int x264_encoder_encapsulate_nals( x264_t *h, int start )
     for( int i = start; i < h->out.i_nal; i++ )
     {
         int long_startcode = !i || h->out.nal[i].i_type == NAL_SPS || h->out.nal[i].i_type == NAL_PPS;
-        int size = x264_nal_encode( nal_buffer, &h->out.nal[i], h->param.b_annexb, long_startcode );
+        int size = x264_nal_encode( h, nal_buffer, &h->out.nal[i], long_startcode );
         h->out.nal[i].i_payload = size;
         h->out.nal[i].p_payload = nal_buffer;
nal_buffer += size;
diff --git a/tools/checkasm.c b/tools/checkasm.c
index 2cd77ed9..c663bef3 100644
--- a/tools/checkasm.c
+++ b/tools/checkasm.c
@@ -1661,6 +1661,68 @@ static int check_cabac( int cpu_ref, int cpu_new )
     return ret;
 }
 
+/* Compare the nal_escape selected for cpu_new against the plain C version on
+ * generated inputs. Nothing is run when cpu_new selects the same function as
+ * cpu_ref. NOTE(review): ret is presumably updated by report(); confirm. */
+static int check_bitstream( int cpu_ref, int cpu_new )
+{
+    x264_bitstream_function_t bs_c;
+    x264_bitstream_function_t bs_ref;
+    x264_bitstream_function_t bs_a;
+
+    int ret = 0, ok = 1, used_asm = 0;
+
+    x264_bitstream_init( 0, &bs_c );
+    x264_bitstream_init( cpu_ref, &bs_ref );
+    x264_bitstream_init( cpu_new, &bs_a );
+    if( bs_a.nal_escape != bs_ref.nal_escape )
+    {
+        int size = 0x4000;
+        uint8_t *input = malloc(size+100);
+        uint8_t *output1 = malloc(size*2);
+        uint8_t *output2 = malloc(size*2);
+        used_asm = 1;
+        set_func_name( "nal_escape" );
+        if( !input || !output1 || !output2 )
+        {
+            fprintf( stderr, "nal_escape : [FAILED] malloc\n" );
+            ok = 0;
+        }
+        else
+        {
+            for( int i = 0; i < 100; i++ )
+            {
+                /* Test corner-case sizes */
+                int test_size = i < 10 ? i+1 : rand() & 0x3fff;
+                /* Test 8 different probability distributions of zeros */
+                /* Multiply as unsigned: the int product can overflow, which is undefined. */
+                for( int j = 0; j < test_size; j++ )
+                    input[j] = (unsigned)(rand()&((1 << ((i&7)+1)) - 1)) * (unsigned)rand();
+                uint8_t *end_c = (uint8_t*)call_c1( bs_c.nal_escape, output1, input, input+test_size );
+                uint8_t *end_a = (uint8_t*)call_a1( bs_a.nal_escape, output2, input, input+test_size );
+                int size_c = end_c-output1;
+                int size_a = end_a-output2;
+                if( size_c != size_a || memcmp( output1, output2, size_c ) )
+                {
+                    fprintf( stderr, "nal_escape : [FAILED] %d %d\n", size_c, size_a );
+                    ok = 0;
+                    break;
+                }
+            }
+            for( int j = 0; j < size; j++ )
+                input[j] = rand();
+            call_c2( bs_c.nal_escape, output1, input, input+size );
+            call_a2( bs_a.nal_escape, output2, input, input+size );
+        }
+        free(input);
+        free(output1);
+        free(output2);
+    }
+    report( "nal escape:" );
+
+    return ret;
+}
+
 static int check_all_funcs( int cpu_ref, int cpu_new )
 {
     return check_pixel( cpu_ref, cpu_new )
@@ -1669,7 +1731,8 @@ static int check_all_funcs( int cpu_ref, int cpu_new )
          + check_intra( cpu_ref, cpu_new )
          + check_deblock( cpu_ref, cpu_new )
          + check_quant( cpu_ref, cpu_new )
-         + check_cabac( cpu_ref, cpu_new );
+         + check_cabac( cpu_ref, cpu_new )
+         + check_bitstream( cpu_ref, cpu_new );
 }
 
 static int add_flags( int *cpu_ref, int *cpu_new, int flags, const char *name )