Up to ~10x faster than C depending on CPU.
Helps the most at very high bitrates (e.g. lossless).
Also make the C code faster and simpler.
common/frame.c common/dct.c common/cpu.c common/cabac.c \
common/common.c common/mdate.c common/rectangle.c \
common/set.c common/quant.c common/deblock.c common/vlc.c \
- common/mvpred.c \
+ common/mvpred.c common/bitstream.c \
encoder/analyse.c encoder/me.c encoder/ratecontrol.c \
encoder/set.c encoder/macroblock.c encoder/cabac.c \
encoder/cavlc.c encoder/encoder.c encoder/lookahead.c
ifneq ($(AS),)
X86SRC0 = const-a.asm cabac-a.asm dct-a.asm deblock-a.asm mc-a.asm \
mc-a2.asm pixel-a.asm predict-a.asm quant-a.asm sad-a.asm \
- cpu-a.asm dct-32.asm
+ cpu-a.asm dct-32.asm bitstream-a.asm
X86SRC = $(X86SRC0:%=common/x86/%)
ifeq ($(ARCH),X86)
--- /dev/null
+/*****************************************************************************
+ * bitstream.c: h264 encoder library
+ *****************************************************************************
+ * Copyright (C) 2010 x264 project
+ *
+ * Authors: Laurent Aimar <fenrir@via.ecp.fr>
+ * Fiona Glaser <fiona@x264.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
+ *****************************************************************************/
+
+#include "common.h"
+
+static uint8_t *x264_nal_escape_c( uint8_t *dst, uint8_t *src, uint8_t *end )
+{ /* Portable C version of nal_escape: copies [src,end) to dst and inserts a
+   * 0x03 byte in front of any source byte <= 0x03 that would directly follow
+   * two zero bytes already present in the output.
+   * Returns the position one past the last byte written to dst. */
+ if( src < end ) *dst++ = *src++; /* the first two bytes are copied without an escape check */
+ if( src < end ) *dst++ = *src++;
+ while( src < end )
+ {
+ if( src[0] <= 0x03 && !dst[-2] && !dst[-1] ) /* tests the output, so an inserted 0x03 ends the zero run */
+ *dst++ = 0x03;
+ *dst++ = *src++;
+ }
+ return dst;
+}
+
+#ifdef HAVE_MMX /* prototypes for the variants that x264_bitstream_init may select */
+uint8_t *x264_nal_escape_mmxext( uint8_t *dst, uint8_t *src, uint8_t *end ); /* not defined in this file; presumably provided by the assembly source */
+uint8_t *x264_nal_escape_sse2( uint8_t *dst, uint8_t *src, uint8_t *end );
+#endif
+
+/****************************************************************************
+ * x264_nal_encode:
+ *   Writes one NAL unit to dst: either a start code (when h->param.b_annexb
+ *   is set) or a 4-byte big-endian size field, then the one-byte NAL header,
+ *   then nal's payload passed through h->bsf.nal_escape.
+ *   b_long_startcode selects a 4-byte start code (00 00 00 01) instead of
+ *   the 3-byte one (00 00 01); it is ignored when b_annexb is not set.
+ *   Returns the total number of bytes written to dst.
+ ****************************************************************************/
+int x264_nal_encode( x264_t *h, uint8_t *dst, x264_nal_t *nal, int b_long_startcode )
+{
+ uint8_t *src = nal->p_payload;
+ uint8_t *end = nal->p_payload + nal->i_payload;
+ uint8_t *orig_dst = dst; /* remembered so the size field can be filled in afterwards */
+
+ if( h->param.b_annexb )
+ {
+ if( b_long_startcode )
+ *dst++ = 0x00; /* extra leading zero of the long start code */
+ *dst++ = 0x00;
+ *dst++ = 0x00;
+ *dst++ = 0x01;
+ }
+ else /* save room for size later */
+ dst += 4;
+
+ /* nal header */
+ *dst++ = ( 0x00 << 7 ) | ( nal->i_ref_idc << 5 ) | nal->i_type; /* top bit 0, then i_ref_idc from bit 5, i_type in the low bits */
+
+ dst = h->bsf.nal_escape( dst, src, end ); /* returns the end of the escaped payload */
+ int size = (dst - orig_dst) - 4; /* everything written so far minus the first 4 bytes */
+
+ /* Write the size header for mp4/etc */
+ if( !h->param.b_annexb )
+ {
+ /* Size doesn't include the size of the header we're writing now. */
+ orig_dst[0] = size>>24; /* most significant byte first */
+ orig_dst[1] = size>>16;
+ orig_dst[2] = size>> 8;
+ orig_dst[3] = size>> 0;
+ }
+
+ return size+4; /* equals dst - orig_dst in both modes */
+}
+
+void x264_bitstream_init( int cpu, x264_bitstream_function_t *pf )
+{ /* Fills pf->nal_escape with the implementation matching the cpu flag bits. */
+ pf->nal_escape = x264_nal_escape_c; /* default; each later match overrides the previous choice */
+#ifdef HAVE_MMX
+ if( cpu&X264_CPU_MMXEXT )
+ pf->nal_escape = x264_nal_escape_mmxext;
+ if( (cpu&X264_CPU_SSE2) && (cpu&X264_CPU_SSE2_IS_FAST) ) /* SSE2 version only when both flags are set */
+ pf->nal_escape = x264_nal_escape_sse2;
+#endif
+}
/*****************************************************************************
- * bs.h :
+ * bitstream.h: h264 encoder library
*****************************************************************************
* Copyright (C) 2003-2008 x264 project
*
extern const vlc_t x264_total_zeros_dc[3][4];
extern const vlc_t x264_run_before[7][16];
+typedef struct /* bitstream function pointers; filled in by x264_bitstream_init */
+{
+ uint8_t *(*nal_escape) ( uint8_t *dst, uint8_t *src, uint8_t *end ); /* copies [src,end) to dst with escaping; returns the end of the output */
+} x264_bitstream_function_t;
+
+int x264_nal_encode( x264_t *h, uint8_t *dst, x264_nal_t *nal, int b_long_startcode ); /* writes one NAL unit to dst, returns the byte count */
+void x264_bitstream_init( int cpu, x264_bitstream_function_t *pf ); /* selects implementations from the cpu flags */
+
/* A larger level table size theoretically could help a bit at extremely
* high bitrates, but the cost in cache is usually too high for it to be
* useful.
memset( pic, 0, sizeof( x264_picture_t ) );
}
-/****************************************************************************
- * x264_nal_encode:
- ****************************************************************************/
-int x264_nal_encode( uint8_t *dst, x264_nal_t *nal, int b_annexb, int b_long_startcode )
-{
- uint8_t *src = nal->p_payload;
- uint8_t *end = nal->p_payload + nal->i_payload;
- uint8_t *orig_dst = dst;
- int i_count = 0, size;
-
- if( b_annexb )
- {
- if( b_long_startcode )
- *dst++ = 0x00;
- *dst++ = 0x00;
- *dst++ = 0x00;
- *dst++ = 0x01;
- }
- else /* save room for size later */
- dst += 4;
-
- /* nal header */
- *dst++ = ( 0x00 << 7 ) | ( nal->i_ref_idc << 5 ) | nal->i_type;
-
- while( src < end )
- {
- if( i_count == 2 && *src <= 0x03 )
- {
- *dst++ = 0x03;
- i_count = 0;
- }
- if( *src == 0 )
- i_count++;
- else
- i_count = 0;
- *dst++ = *src++;
- }
- size = (dst - orig_dst) - 4;
-
- /* Write the size header for mp4/etc */
- if( !b_annexb )
- {
- /* Size doesn't include the size of the header we're writing now. */
- orig_dst[0] = size>>24;
- orig_dst[1] = size>>16;
- orig_dst[2] = size>> 8;
- orig_dst[3] = size>> 0;
- }
-
- return size+4;
-}
-
-
-
/****************************************************************************
* x264_malloc:
****************************************************************************/
*/
#include "x264.h"
-#include "bs.h"
+#include "bitstream.h"
#include "set.h"
#include "predict.h"
#include "pixel.h"
* the encoding options */
char *x264_param2string( x264_param_t *p, int b_res );
-int x264_nal_encode( uint8_t *dst, x264_nal_t *nal, int b_annexb, int b_long_startcode );
-
/* log */
void x264_log( x264_t *h, int i_level, const char *psz_fmt, ... );
x264_zigzag_function_t zigzagf;
x264_quant_function_t quantf;
x264_deblock_function_t loopf;
+ x264_bitstream_function_t bsf;
#ifdef HAVE_VISUALIZE
struct visualize_t *visualize;
--- /dev/null
+;*****************************************************************************
+;* bitstream-a.asm: h264 encoder library
+;*****************************************************************************
+;* Copyright (C) 2010 x264 project
+;*
+;* Authors: Fiona Glaser <fiona@x264.com>
+;*
+;* This program is free software; you can redistribute it and/or modify
+;* it under the terms of the GNU General Public License as published by
+;* the Free Software Foundation; either version 2 of the License, or
+;* (at your option) any later version.
+;*
+;* This program is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+;* GNU General Public License for more details.
+;*
+;* You should have received a copy of the GNU General Public License
+;* along with this program; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
+;*****************************************************************************
+
+%include "x86inc.asm"
+%include "x86util.asm"
+
+SECTION .text
+
+;-----------------------------------------------------------------------------
+; uint8_t *x264_nal_escape( uint8_t *dst, uint8_t *src, uint8_t *end )
+;-----------------------------------------------------------------------------
+
+%macro NAL_LOOP 2 ; %1 = loop label, %2 = instruction used for the store to dst
+ALIGN 16
+%1:
+ mova m0, [r1+r2] ; load mmsize source bytes ([r1+r2] is the current src position)
+ mova m1, m0
+%if mmsize == 8
+ psllq m0, 8
+%else
+ pslldq m0, 1
+%endif
+ %2 [r0+r1], m1 ; store the block unmodified before the check; .escape redoes it if needed
+ por m1, m0 ; m0 was shifted up one byte, so lane i = byte[i] | byte[i-1] (lane 0 = byte[0] alone)
+ pcmpeqb m1, m2 ; m2 is zeroed by the caller: flags lanes that are zero
+ pmovmskb r3d, m1
+ test r3d, r3d
+ jnz .escape ; two adjacent zero bytes, or a zero first byte: handle this block in scalar code
+ add r1, mmsize
+ jl %1 ; r1 is a negative offset from the end of src; stop once it reaches 0
+%endmacro
+
+%macro NAL_ESCAPE 1
+
+; Register use in this function:
+;   r0 = projected end of dst ([r0+r1] is the current dst position)
+;   r1 = negative offset of the current src position from the end of src
+;   r2 = end of src ([r1+r2] is the current src position)
+;   r3 = byte being copied / zero mask inside NAL_LOOP
+;   r4 = number of bytes still to be handled by the scalar loop
+;   m2 = all zero
+cglobal nal_escape_%1, 3,5
+    pxor m2, m2
+    sub r1, r2 ; r1 = offset of current src pointer from end of src
+    sub r0, r1 ; r0 = projected end of dst, assuming no more escapes
+
+    ; The first byte is always copied as is.
+    mov r3b, [r1+r2]
+    mov [r0+r1], r3b
+    inc r1
+    jge .ret
+
+    ; Start off by jumping into the escape loop in
+    ; case there's an escape at the start.
+    ; And do a few more in scalar until src is aligned again.
+    lea r4d, [r1+r2]
+    or r4d, -mmsize
+    neg r4d ; r4d = mmsize - (src & (mmsize-1)), i.e. 1..mmsize scalar bytes
+    jmp .first_escape
+
+    NAL_LOOP .loop_aligned, mova
+%if mmsize==16
+    ; A finished aligned loop must not fall through into .loop_unaligned:
+    ; that would load and store one more block beyond the end of src/dst.
+    jmp .ret
+    NAL_LOOP .loop_unaligned, movu
+%endif
+
+.ret:
+    movifnidn rax, r0 ; return the end of the written output
+    RET
+ALIGN 16
+.escape:
+    mov r4d, mmsize ; redo the whole block that triggered the check
+.first_escape:
+    mov r3b, [r1+r2]
+.escape_loop:
+    mov [r0+r1], r3b
+    inc r1
+    jge .ret
+    mov r3b, [r1+r2]
+    cmp r3b, 3
+    jna .escape_check ; next byte <= 3: it may need an escape byte in front of it
+.no_escape:
+    dec r4d
+    jg .escape_loop
+%if mmsize==16
+    ; Inserted escape bytes shift dst relative to src, so pick the
+    ; store variant from the current dst alignment.
+    lea r4d, [r0+r1]
+    test r4d, mmsize-1
+    jnz .loop_unaligned
+%endif
+    jmp .loop_aligned
+.escape_check:
+    cmp word [r0+r1-2], 0 ; were the last two output bytes both zero?
+    jnz .no_escape
+    mov byte [r0+r1], 3 ; insert the escape byte ...
+    inc r0 ; ... and move the projected end of dst one byte further
+    jmp .no_escape
+%endmacro
+
+INIT_MMX ; defined outside this file; presumably selects the 8-byte registers (mmsize == 8)
+NAL_ESCAPE mmxext ; emits nal_escape_mmxext
+INIT_XMM ; defined outside this file; presumably selects the 16-byte registers (mmsize == 16)
+NAL_ESCAPE sse2 ; emits nal_escape_sse2
;* Copyright (C) 2005-2008 x264 project
;*
;* Authors: Loren Merritt <lorenm@u.washington.edu>
+;* Fiona Glaser <fiona@x264.com>
;*
;* This program is free software; you can redistribute it and/or modify
;* it under the terms of the GNU General Public License as published by
x264_mc_init( h->param.cpu, &h->mc );
x264_quant_init( h, h->param.cpu, &h->quantf );
x264_deblock_init( h->param.cpu, &h->loopf );
+ x264_bitstream_init( h->param.cpu, &h->bsf );
x264_dct_init_weights();
mbcmp_init( h );
for( int i = start; i < h->out.i_nal; i++ )
{
int long_startcode = !i || h->out.nal[i].i_type == NAL_SPS || h->out.nal[i].i_type == NAL_PPS;
- int size = x264_nal_encode( nal_buffer, &h->out.nal[i], h->param.b_annexb, long_startcode );
+ int size = x264_nal_encode( h, nal_buffer, &h->out.nal[i], long_startcode );
h->out.nal[i].i_payload = size;
h->out.nal[i].p_payload = nal_buffer;
nal_buffer += size;
return ret;
}
+/* Compares the nal_escape implementation selected for cpu_new against the C
+ * version on 100 generated inputs and then runs both once on a full-size
+ * buffer through call_c2/call_a2. Nothing is tested when cpu_new selects the
+ * same function as cpu_ref.
+ * Returns ret (presumably updated by the report() macro from ok), or -1 if
+ * the test buffers cannot be allocated. */
+static int check_bitstream( int cpu_ref, int cpu_new )
+{
+    x264_bitstream_function_t bs_c;
+    x264_bitstream_function_t bs_ref;
+    x264_bitstream_function_t bs_a;
+
+    int ret = 0, ok = 1, used_asm = 0;
+
+    x264_bitstream_init( 0, &bs_c );
+    x264_bitstream_init( cpu_ref, &bs_ref );
+    x264_bitstream_init( cpu_new, &bs_a );
+    if( bs_a.nal_escape != bs_ref.nal_escape )
+    {
+        int size = 0x4000;
+        /* input has 100 spare bytes, presumably slack for vector loads that
+         * run past the end; the outputs leave room for inserted escapes. */
+        uint8_t *input = malloc(size+100);
+        uint8_t *output1 = malloc(size*2);
+        uint8_t *output2 = malloc(size*2);
+        if( !input || !output1 || !output2 )
+        {
+            fprintf( stderr, "nal_escape : malloc failed\n" );
+            free(input);
+            free(output1);
+            free(output2);
+            return -1;
+        }
+        used_asm = 1;
+        set_func_name( "nal_escape" );
+        for( int i = 0; i < 100; i++ )
+        {
+            /* Test corner-case sizes */
+            int test_size = i < 10 ? i+1 : rand() & 0x3fff;
+            /* Test 8 different probability distributions of zeros */
+            for( int j = 0; j < test_size; j++ )
+            {
+                /* Multiply as unsigned: the product can exceed INT_MAX, and
+                 * only its low 8 bits are kept anyway. */
+                unsigned masked = rand()&((1 << ((i&7)+1)) - 1);
+                input[j] = masked * (unsigned)rand();
+            }
+            uint8_t *end_c = (uint8_t*)call_c1( bs_c.nal_escape, output1, input, input+test_size );
+            uint8_t *end_a = (uint8_t*)call_a1( bs_a.nal_escape, output2, input, input+test_size );
+            int size_c = end_c-output1;
+            int size_a = end_a-output2;
+            if( size_c != size_a || memcmp( output1, output2, size_c ) )
+            {
+                fprintf( stderr, "nal_escape : [FAILED] %d %d\n", size_c, size_a );
+                ok = 0;
+                break;
+            }
+        }
+        /* Full-size run on random data, with the return values ignored. */
+        for( int j = 0; j < size; j++ )
+            input[j] = rand();
+        call_c2( bs_c.nal_escape, output1, input, input+size );
+        call_a2( bs_a.nal_escape, output2, input, input+size );
+        free(input);
+        free(output1);
+        free(output2);
+    }
+    report( "nal escape:" );
+
+    return ret;
+}
+
static int check_all_funcs( int cpu_ref, int cpu_new )
{
return check_pixel( cpu_ref, cpu_new )
+ check_intra( cpu_ref, cpu_new )
+ check_deblock( cpu_ref, cpu_new )
+ check_quant( cpu_ref, cpu_new )
- + check_cabac( cpu_ref, cpu_new );
+ + check_cabac( cpu_ref, cpu_new )
+ + check_bitstream( cpu_ref, cpu_new );
}
static int add_flags( int *cpu_ref, int *cpu_new, int flags, const char *name )