From: Fiona Glaser <fiona@x264.com>
Date: Thu, 27 May 2010 21:27:32 +0000 (-0700)
Subject: x86 assembly code for NAL escaping
X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=6589ad6dc6a2ac7599c5a19566306c274bd86853;p=libx264

x86 assembly code for NAL escaping
Up to ~10x faster than C depending on CPU.
Helps the most at very high bitrates (e.g. lossless).
Also make the C code faster and simpler.
---

diff --git a/Makefile b/Makefile
index 0b43a3e0..519e1812 100644
--- a/Makefile
+++ b/Makefile
@@ -8,7 +8,7 @@ SRCS = common/mc.c common/predict.c common/pixel.c common/macroblock.c \
        common/frame.c common/dct.c common/cpu.c common/cabac.c \
        common/common.c common/mdate.c common/rectangle.c \
        common/set.c common/quant.c common/deblock.c common/vlc.c \
-       common/mvpred.c \
+       common/mvpred.c common/bitstream.c \
        encoder/analyse.c encoder/me.c encoder/ratecontrol.c \
        encoder/set.c encoder/macroblock.c encoder/cabac.c \
        encoder/cavlc.c encoder/encoder.c encoder/lookahead.c
@@ -52,7 +52,7 @@ endif
 ifneq ($(AS),)
 X86SRC0 = const-a.asm cabac-a.asm dct-a.asm deblock-a.asm mc-a.asm \
           mc-a2.asm pixel-a.asm predict-a.asm quant-a.asm sad-a.asm \
-          cpu-a.asm dct-32.asm
+          cpu-a.asm dct-32.asm bitstream-a.asm
 X86SRC = $(X86SRC0:%=common/x86/%)
 
 ifeq ($(ARCH),X86)
diff --git a/common/bitstream.c b/common/bitstream.c
new file mode 100644
index 00000000..e094c261
--- /dev/null
+++ b/common/bitstream.c
@@ -0,0 +1,92 @@
+/*****************************************************************************
+ * bitstream.c: h264 encoder library
+ *****************************************************************************
+ * Copyright (C) 2010 x264 project
+ *
+ * Authors: Laurent Aimar <fenrir@via.ecp.fr>
+ *          Fiona Glaser <fiona@x264.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
+ *****************************************************************************/
+
+#include "common.h"
+
+static uint8_t *x264_nal_escape_c( uint8_t *dst, uint8_t *src, uint8_t *end ) /* copy [src,end) to dst, inserting 0x03 escape bytes */
+{
+    if( src < end ) *dst++ = *src++; /* first two bytes are copied unchecked: the test below */
+    if( src < end ) *dst++ = *src++; /* looks back at the two output bytes already written   */
+    while( src < end )
+    {
+        if( src[0] <= 0x03 && !dst[-2] && !dst[-1] ) /* two zero output bytes followed by a source byte of 0..3 */
+            *dst++ = 0x03; /* escape byte goes in front of the source byte */
+        *dst++ = *src++;
+    }
+    return dst; /* one past the last byte written */
+}
+
+#ifdef HAVE_MMX
+uint8_t *x264_nal_escape_mmxext( uint8_t *dst, uint8_t *src, uint8_t *end ); /* x86 asm versions, see common/x86/bitstream-a.asm */
+uint8_t *x264_nal_escape_sse2( uint8_t *dst, uint8_t *src, uint8_t *end ); /* same signature as x264_nal_escape_c */
+#endif
+
+/****************************************************************************
+ * x264_nal_encode: write start code or size prefix, NAL header byte and escaped payload to dst; returns the number of bytes written
+ ****************************************************************************/
+int x264_nal_encode( x264_t *h, uint8_t *dst, x264_nal_t *nal, int b_long_startcode )
+{
+    uint8_t *src = nal->p_payload;
+    uint8_t *end = nal->p_payload + nal->i_payload;
+    uint8_t *orig_dst = dst; /* kept so the size prefix can be filled in after escaping */
+
+    if( h->param.b_annexb )
+    {
+        if( b_long_startcode )
+            *dst++ = 0x00; /* extra leading zero: 00 00 00 01 instead of 00 00 01 */
+        *dst++ = 0x00;
+        *dst++ = 0x00;
+        *dst++ = 0x01;
+    }
+    else /* save room for size later */
+        dst += 4;
+
+    /* nal header: bit 7 left as 0, i_ref_idc shifted to bit 5, i_type ORed into the low bits */
+    *dst++ = ( 0x00 << 7 ) | ( nal->i_ref_idc << 5 ) | nal->i_type;
+
+    dst = h->bsf.nal_escape( dst, src, end ); /* implementation chosen by x264_bitstream_init() */
+    int size = (dst - orig_dst) - 4; /* everything written so far minus a 4-byte prefix */
+
+    /* Write the size header for mp4/etc */
+    if( !h->param.b_annexb )
+    {
+        /* Size doesn't include the size of the header we're writing now. */
+        orig_dst[0] = size>>24; /* stored most significant byte first */
+        orig_dst[1] = size>>16;
+        orig_dst[2] = size>> 8;
+        orig_dst[3] = size>> 0;
+    }
+
+    return size+4; /* equals dst - orig_dst, i.e. total bytes written */
+}
+
+void x264_bitstream_init( int cpu, x264_bitstream_function_t *pf ) /* select the nal_escape implementation for the given cpu flags */
+{
+    pf->nal_escape = x264_nal_escape_c; /* C fallback, replaced below when an asm version applies */
+#ifdef HAVE_MMX
+    if( cpu&X264_CPU_MMXEXT )
+        pf->nal_escape = x264_nal_escape_mmxext;
+    if( (cpu&X264_CPU_SSE2) && (cpu&X264_CPU_SSE2_IS_FAST) ) /* checked last, so it overrides the mmxext choice */
+        pf->nal_escape = x264_nal_escape_sse2;
+#endif
+}
diff --git a/common/bs.h b/common/bitstream.h
similarity index 96%
rename from common/bs.h
rename to common/bitstream.h
index a090988b..73dc6910 100644
--- a/common/bs.h
+++ b/common/bitstream.h
@@ -1,5 +1,5 @@
 /*****************************************************************************
- * bs.h :
+ * bitstream.h: h264 encoder library
  *****************************************************************************
  * Copyright (C) 2003-2008 x264 project
  *
@@ -63,6 +63,14 @@ extern const vlc_t x264_total_zeros[15][16];
 extern const vlc_t x264_total_zeros_dc[3][4];
 extern const vlc_t x264_run_before[7][16];
 
+typedef struct
+{
+    uint8_t *(*nal_escape) ( uint8_t *dst, uint8_t *src, uint8_t *end ); /* escapes [src,end) into dst, returns the end of the output */
+} x264_bitstream_function_t;
+
+int x264_nal_encode( x264_t *h, uint8_t *dst, x264_nal_t *nal, int b_long_startcode ); /* returns the number of bytes written to dst */
+void x264_bitstream_init( int cpu, x264_bitstream_function_t *pf ); /* fills *pf according to the cpu flags */
+
 /* A larger level table size theoretically could help a bit at extremely
  * high bitrates, but the cost in cache is usually too high for it to be
  * useful.
diff --git a/common/common.c b/common/common.c
index fccf2b05..2458f65b 100644
--- a/common/common.c
+++ b/common/common.c
@@ -1026,60 +1026,6 @@ void x264_picture_clean( x264_picture_t *pic )
     memset( pic, 0, sizeof( x264_picture_t ) );
 }
 
-/****************************************************************************
- * x264_nal_encode:
- ****************************************************************************/
-int x264_nal_encode( uint8_t *dst, x264_nal_t *nal, int b_annexb, int b_long_startcode )
-{
-    uint8_t *src = nal->p_payload;
-    uint8_t *end = nal->p_payload + nal->i_payload;
-    uint8_t *orig_dst = dst;
-    int i_count = 0, size;
-
-    if( b_annexb )
-    {
-        if( b_long_startcode )
-            *dst++ = 0x00;
-        *dst++ = 0x00;
-        *dst++ = 0x00;
-        *dst++ = 0x01;
-    }
-    else /* save room for size later */
-        dst += 4;
-
-    /* nal header */
-    *dst++ = ( 0x00 << 7 ) | ( nal->i_ref_idc << 5 ) | nal->i_type;
-
-    while( src < end )
-    {
-        if( i_count == 2 && *src <= 0x03 )
-        {
-            *dst++ = 0x03;
-            i_count = 0;
-        }
-        if( *src == 0 )
-            i_count++;
-        else
-            i_count = 0;
-        *dst++ = *src++;
-    }
-    size = (dst - orig_dst) - 4;
-
-    /* Write the size header for mp4/etc */
-    if( !b_annexb )
-    {
-        /* Size doesn't include the size of the header we're writing now. */
-        orig_dst[0] = size>>24;
-        orig_dst[1] = size>>16;
-        orig_dst[2] = size>> 8;
-        orig_dst[3] = size>> 0;
-    }
-
-    return size+4;
-}
-
-
-
 /****************************************************************************
  * x264_malloc:
  ****************************************************************************/
diff --git a/common/common.h b/common/common.h
index 539ea656..93712fe5 100644
--- a/common/common.h
+++ b/common/common.h
@@ -137,7 +137,7 @@ static const int x264_scan8[16+2*4+3] =
 */
 
 #include "x264.h"
-#include "bs.h"
+#include "bitstream.h"
 #include "set.h"
 #include "predict.h"
 #include "pixel.h"
@@ -166,8 +166,6 @@ int64_t x264_mdate( void );
  * the encoding options */
 char *x264_param2string( x264_param_t *p, int b_res );
 
-int x264_nal_encode( uint8_t *dst, x264_nal_t *nal, int b_annexb, int b_long_startcode );
-
 /* log */
 void x264_log( x264_t *h, int i_level, const char *psz_fmt, ... );
 
@@ -796,6 +794,7 @@ struct x264_t
     x264_zigzag_function_t zigzagf;
     x264_quant_function_t quantf;
     x264_deblock_function_t loopf;
+    x264_bitstream_function_t bsf;
 
 #ifdef HAVE_VISUALIZE
     struct visualize_t *visualize;
diff --git a/common/x86/bitstream-a.asm b/common/x86/bitstream-a.asm
new file mode 100644
index 00000000..2fc21856
--- /dev/null
+++ b/common/x86/bitstream-a.asm
@@ -0,0 +1,112 @@
+;*****************************************************************************
+;* bitstream-a.asm: h264 encoder library
+;*****************************************************************************
+;* Copyright (C) 2010 x264 project
+;*
+;* Authors: Fiona Glaser <fiona@x264.com>
+;*
+;* This program is free software; you can redistribute it and/or modify
+;* it under the terms of the GNU General Public License as published by
+;* the Free Software Foundation; either version 2 of the License, or
+;* (at your option) any later version.
+;*
+;* This program is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+;* GNU General Public License for more details.
+;*
+;* You should have received a copy of the GNU General Public License
+;* along with this program; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
+;*****************************************************************************
+
+%include "x86inc.asm"
+%include "x86util.asm"
+
+SECTION .text
+
+;-----------------------------------------------------------------------------
+; uint8_t *x264_nal_escape( uint8_t *dst, uint8_t *src, uint8_t *end )
+;-----------------------------------------------------------------------------
+
+%macro NAL_LOOP 2 ; args: loop label, store instruction used for dst
+ALIGN 16
+%1:
+    mova      m0, [r1+r2] ; load mmsize source bytes (r1 is the offset from the end of src)
+    mova      m1, m0
+%if mmsize == 8
+    psllq     m0, 8 ; m0 = chunk shifted up by one byte
+%else
+    pslldq    m0, 1 ; m0 = chunk shifted up by one byte
+%endif
+    %2   [r0+r1], m1 ; copy the chunk to dst unchanged
+    por       m1, m0 ; each byte ORed with the byte before it (lowest byte is ORed with zero)
+    pcmpeqb   m1, m2 ; m2 is all zero, so this marks bytes where that OR is zero
+    pmovmskb r3d, m1
+    test     r3d, r3d
+    jnz .escape ; some mark set: handle this chunk in the scalar escape loop
+    add       r1, mmsize
+    jl %1 ; keep going while the offset is still negative
+%endmacro
+
+%macro NAL_ESCAPE 1 ; arg: cpu suffix used in the function name
+
+cglobal nal_escape_%1, 3,5
+    pxor      m2, m2 ; zero register used by the compares in NAL_LOOP
+    sub       r1, r2 ; r1 = offset of current src pointer from end of src
+    sub       r0, r1 ; r0 = projected end of dst, assuming no more escapes
+
+    mov      r3b, [r1+r2] ; the first byte is copied without any escape check
+    mov  [r0+r1], r3b
+    inc       r1
+    jge .ret ; offset no longer negative: nothing left to process
+
+    ; Start off by jumping into the escape loop in
+    ; case there's an escape at the start.
+    ; And do a few more in scalar until src is aligned again.
+    lea      r4d, [r1+r2]
+    or       r4d, -mmsize
+    neg      r4d ; r4d = mmsize - (src address mod mmsize) = scalar byte count
+    jmp .first_escape
+
+    NAL_LOOP .loop_aligned, mova
+%if mmsize==16
+    NAL_LOOP .loop_unaligned, movu
+%endif
+
+.ret:
+    movifnidn rax, r0 ; return value is r0, the end of dst including inserted escapes
+    RET
+ALIGN 16
+.escape:
+    mov      r4d, mmsize ; go through the whole marked chunk in scalar code
+.first_escape:
+    mov      r3b, [r1+r2]
+.escape_loop:
+    mov  [r0+r1], r3b ; store the current byte
+    inc      r1
+    jge .ret
+    mov      r3b, [r1+r2] ; fetch the next byte
+    cmp      r3b, 3
+    jna .escape_check ; next byte is 0..3: look at the two output bytes before it
+.no_escape:
+    dec      r4d
+    jg .escape_loop
+%if mmsize==16
+    lea      r4d, [r0+r1]
+    test     r4d, mmsize-1
+    jnz .loop_unaligned ; dst is not mmsize-aligned: use the loop with movu stores
+%endif
+    jmp .loop_aligned
+.escape_check:
+    cmp word [r0+r1-2], 0 ; were the last two output bytes both zero?
+    jnz .no_escape
+    mov byte [r0+r1], 3 ; insert the escape byte
+    inc      r0 ; output is now one byte longer than projected
+    jmp .no_escape
+%endmacro
+
+INIT_MMX
+NAL_ESCAPE mmxext
+INIT_XMM
+NAL_ESCAPE sse2
diff --git a/common/x86/deblock-a.asm b/common/x86/deblock-a.asm
index aedd688e..a039ed85 100644
--- a/common/x86/deblock-a.asm
+++ b/common/x86/deblock-a.asm
@@ -4,6 +4,7 @@
 ;* Copyright (C) 2005-2008 x264 project
 ;*
 ;* Authors: Loren Merritt <lorenm@u.washington.edu>
+;*          Fiona Glaser <fiona@x264.com>
 ;*
 ;* This program is free software; you can redistribute it and/or modify
 ;* it under the terms of the GNU General Public License as published by
diff --git a/encoder/encoder.c b/encoder/encoder.c
index 4360a100..2d2afed1 100644
--- a/encoder/encoder.c
+++ b/encoder/encoder.c
@@ -987,6 +987,7 @@ x264_t *x264_encoder_open( x264_param_t *param )
     x264_mc_init( h->param.cpu, &h->mc );
     x264_quant_init( h, h->param.cpu, &h->quantf );
     x264_deblock_init( h->param.cpu, &h->loopf );
+    x264_bitstream_init( h->param.cpu, &h->bsf );
     x264_dct_init_weights();
 
     mbcmp_init( h );
@@ -1273,7 +1274,7 @@ static int x264_encoder_encapsulate_nals( x264_t *h, int start )
     for( int i = start; i < h->out.i_nal; i++ )
     {
         int long_startcode = !i || h->out.nal[i].i_type == NAL_SPS || h->out.nal[i].i_type == NAL_PPS;
-        int size = x264_nal_encode( nal_buffer, &h->out.nal[i], h->param.b_annexb, long_startcode );
+        int size = x264_nal_encode( h, nal_buffer, &h->out.nal[i], long_startcode );
         h->out.nal[i].i_payload = size;
         h->out.nal[i].p_payload = nal_buffer;
         nal_buffer += size;
diff --git a/tools/checkasm.c b/tools/checkasm.c
index 2cd77ed9..c663bef3 100644
--- a/tools/checkasm.c
+++ b/tools/checkasm.c
@@ -1661,6 +1661,56 @@ static int check_cabac( int cpu_ref, int cpu_new )
     return ret;
 }
 
+static int check_bitstream( int cpu_ref, int cpu_new ) /* compares the nal_escape chosen for cpu_new against the C version */
+{
+    x264_bitstream_function_t bs_c;
+    x264_bitstream_function_t bs_ref;
+    x264_bitstream_function_t bs_a;
+
+    int ret = 0, ok = 1, used_asm = 0;
+
+    x264_bitstream_init( 0, &bs_c ); /* cpu flags 0: always the C implementation */
+    x264_bitstream_init( cpu_ref, &bs_ref );
+    x264_bitstream_init( cpu_new, &bs_a );
+    if( bs_a.nal_escape != bs_ref.nal_escape ) /* only test when cpu_new selects a different function than cpu_ref */
+    {
+        int size = 0x4000;
+        uint8_t *input = malloc(size+100); /* NOTE(review): none of these malloc results are checked */
+        uint8_t *output1 = malloc(size*2); /* escaping can make the output longer than the input */
+        uint8_t *output2 = malloc(size*2);
+        used_asm = 1;
+        set_func_name( "nal_escape" );
+        for( int i = 0; i < 100; i++ )
+        {
+            /* Test corner-case sizes */
+            int test_size = i < 10 ? i+1 : rand() & 0x3fff; /* sizes 1..10 first, then random sizes below 0x4000 */
+            /* Test 8 different probability distributions of zeros */
+            for( int j = 0; j < test_size; j++ )
+                input[j] = (rand()&((1 << ((i&7)+1)) - 1)) * rand(); /* mask width cycles through 1..8 bits with i */
+            uint8_t *end_c = (uint8_t*)call_c1( bs_c.nal_escape, output1, input, input+test_size );
+            uint8_t *end_a = (uint8_t*)call_a1( bs_a.nal_escape, output2, input, input+test_size );
+            int size_c = end_c-output1;
+            int size_a = end_a-output2;
+            if( size_c != size_a || memcmp( output1, output2, size_c ) ) /* both length and contents must match */
+            {
+                fprintf( stderr, "nal_escape :  [FAILED] %d %d\n", size_c, size_a );
+                ok = 0;
+                break;
+            }
+        }
+        for( int j = 0; j < size; j++ )
+            input[j] = rand();
+        call_c2( bs_c.nal_escape, output1, input, input+size ); /* results unused; presumably benchmark-only calls -- confirm against call_c2/call_a2 */
+        call_a2( bs_a.nal_escape, output2, input, input+size );
+        free(input);
+        free(output1);
+        free(output2);
+    }
+    report( "nal escape:" );
+
+    return ret; /* ret is never assigned directly here; presumably report() updates it from ok -- confirm */
+}
+
 static int check_all_funcs( int cpu_ref, int cpu_new )
 {
     return check_pixel( cpu_ref, cpu_new )
@@ -1669,7 +1719,8 @@ static int check_all_funcs( int cpu_ref, int cpu_new )
          + check_intra( cpu_ref, cpu_new )
          + check_deblock( cpu_ref, cpu_new )
          + check_quant( cpu_ref, cpu_new )
-         + check_cabac( cpu_ref, cpu_new );
+         + check_cabac( cpu_ref, cpu_new )
+         + check_bitstream( cpu_ref, cpu_new );
 }
 
 static int add_flags( int *cpu_ref, int *cpu_new, int flags, const char *name )