x86: AVX2 nal_escape

author Henrik Gramner <henrik@gramner.com>

Tue, 16 Apr 2013 21:27:32 +0000 (23:27 +0200)

committer Fiona Glaser <fiona@x264.com>

Tue, 23 Apr 2013 21:36:33 +0000 (14:36 -0700)
author Henrik Gramner <henrik@gramner.com>
Tue, 16 Apr 2013 21:27:32 +0000 (23:27 +0200)
committer Fiona Glaser <fiona@x264.com>
Tue, 23 Apr 2013 21:36:33 +0000 (14:36 -0700)
diff --git a/common/bitstream.c b/common/bitstream.c

index b577f6c6199bebdf15f5319453795a8a19138ea3..cecd5f7f5a3fa85278d5bcfd2e726f14b72e21f2 100644 (file)
--- a/common/bitstream.c
+++ b/common/bitstream.c
@@ -41,7 +41,7 @@ static uint8_t *x264_nal_escape_c( uint8_t *dst, uint8_t *src, uint8_t *end )
  
  uint8_t *x264_nal_escape_mmx2( uint8_t *dst, uint8_t *src, uint8_t *end );
  uint8_t *x264_nal_escape_sse2( uint8_t *dst, uint8_t *src, uint8_t *end );
-uint8_t *x264_nal_escape_avx( uint8_t *dst, uint8_t *src, uint8_t *end );
+uint8_t *x264_nal_escape_avx2( uint8_t *dst, uint8_t *src, uint8_t *end );
  void x264_cabac_block_residual_rd_internal_sse2       ( dctcoef *l, int b_interlaced, intptr_t ctx_block_cat, x264_cabac_t *cb );
  void x264_cabac_block_residual_rd_internal_sse2_lzcnt ( dctcoef *l, int b_interlaced, intptr_t ctx_block_cat, x264_cabac_t *cb );
  void x264_cabac_block_residual_rd_internal_ssse3      ( dctcoef *l, int b_interlaced, intptr_t ctx_block_cat, x264_cabac_t *cb );
@@ -132,8 +132,11 @@ void x264_bitstream_init( int cpu, x264_bitstream_function_t *pf )
              pf->cabac_block_residual_8x8_rd_internal = x264_cabac_block_residual_8x8_rd_internal_ssse3_lzcnt;
          }
      }
+
+    if( cpu&X264_CPU_AVX2 )
+    {
+        pf->nal_escape = x264_nal_escape_avx2;
+    }
  #endif
-    if( cpu&X264_CPU_AVX )
-        pf->nal_escape = x264_nal_escape_avx;
  #endif
  }
diff --git a/common/x86/bitstream-a.asm b/common/x86/bitstream-a.asm

index ef3fcab0c9bdcb9223854e0291ab3b839d1152e2..8aff5069916791653ede9c61052914b708736e51 100644 (file)
--- a/common/x86/bitstream-a.asm
+++ b/common/x86/bitstream-a.asm
@@ -4,7 +4,7 @@
  ;* Copyright (C) 2010-2013 x264 project
  ;*
  ;* Authors: Fiona Glaser <fiona@x264.com>
-;*          Henrik Gramner <hengar-6@student.ltu.se>
+;*          Henrik Gramner <henrik@gramner.com>
  ;*
  ;* This program is free software; you can redistribute it and/or modify
  ;* it under the terms of the GNU General Public License as published by
@@ -32,100 +32,105 @@ SECTION .text
  ;-----------------------------------------------------------------------------
  ; uint8_t *x264_nal_escape( uint8_t *dst, uint8_t *src, uint8_t *end )
  ;-----------------------------------------------------------------------------
-
  %macro NAL_LOOP 2
-%1_escape:
+%%escape:
      ; Detect false positive to avoid unnecessary escape loop
      xor      r3d, r3d
      cmp byte [r0+r1-1], 0
      setnz    r3b
-    xor      r3d, r4d
+    xor       k3, k4
      jnz .escape
-    jmp %1_continue
+    jmp %%continue
  ALIGN 16
  %1:
-    pcmpeqb   m3, m1, m4
-    pcmpeqb   m2, m0, m4
-    pmovmskb r3d, m3
-    %2   [r0+r1], m0
+    mova [r0+r1+mmsize], m1
+    pcmpeqb   m1, m0
+    mova [r0+r1], m2
+    pcmpeqb   m2, m0
+    pmovmskb r3d, m1
+    %2        m1, [r1+r2+3*mmsize]
      pmovmskb r4d, m2
-    shl      r3d, mmsize
-    mova      m0, [r1+r2+2*mmsize]
-    or       r4d, r3d
-    %2 [r0+r1+mmsize], m1
-    lea      r3d, [r4+r4+1]
-    mova      m1, [r1+r2+3*mmsize]
-    and      r4d, r3d
-    jnz %1_escape
-%1_continue:
+    %2        m2, [r1+r2+2*mmsize]
+    shl       k3, mmsize
+    or        k3, k4
+    lea       k4, [2*r3+1]
+    and       k4, k3
+    jnz %%escape
+%%continue:
      add       r1, 2*mmsize
      jl %1
  %endmacro
  
  %macro NAL_ESCAPE 0
+%if mmsize == 32
+    %xdefine k3 r3
+    %xdefine k4 r4
+%else
+    %xdefine k3 r3d
+    %xdefine k4 r4d
+%endif
  
  cglobal nal_escape, 3,5
-    mov      r3w, [r1]
+    movzx    r3d, byte [r1]
      sub       r1, r2 ; r1 = offset of current src pointer from end of src
-    pxor      m4, m4
+    pxor      m0, m0
+    mov     [r0], r3b
      sub       r0, r1 ; r0 = projected end of dst, assuming no more escapes
-    mov  [r0+r1], r3w
-    add       r1, 2
-    jge .ret
+    or       r3d, 0xffffff00 ; ignore data before src
  
-    ; Start off by jumping into the escape loop in
-    ; case there's an escape at the start.
-    ; And do a few more in scalar until src is aligned again.
-    jmp .first_escape
+    ; Start off by jumping into the escape loop in case there's an escape at the start.
+    ; And do a few more in scalar until dst is aligned.
+    jmp .escape_loop
  
+%if mmsize == 16
      NAL_LOOP .loop_aligned, mova
-%if mmsize==16
      jmp .ret
-    NAL_LOOP .loop_unaligned, movu
  %endif
+    NAL_LOOP .loop_unaligned, movu
  .ret:
      movifnidn rax, r0
      RET
  
-ALIGN 16
  .escape:
      ; Skip bytes that are known to be valid
-    and      r4d, r3d
-    tzcnt    r3d, r4d
-    add       r1, r3
+    and       k4, k3
+    tzcnt     k4, k4
+    xor      r3d, r3d ; the last two bytes are known to be zero
+    add       r1, r4
  .escape_loop:
      inc       r1
      jge .ret
-.first_escape:
-    movzx    r3d, byte [r1+r2]
-    lea       r4, [r1+r2]
-    cmp      r3d, 3
-    jna .escape_check
-.no_escape:
+    movzx    r4d, byte [r1+r2]
+    shl      r3d, 8
+    or       r3d, r4d
+    test     r3d, 0xfffffc ; if the last two bytes are 0 and the current byte is <=3
+    jz .add_escape_byte
+.escaped:
+    lea      r4d, [r0+r1]
      mov  [r0+r1], r3b
-    test     r4d, mmsize-1 ; Do SIMD when src is aligned
+    test     r4d, mmsize-1 ; Do SIMD when dst is aligned
      jnz .escape_loop
-    mova      m0, [r4]
-    mova      m1, [r4+mmsize]
-%if mmsize==16
-    lea      r4d, [r0+r1]
+    movu      m1, [r1+r2+mmsize]
+    movu      m2, [r1+r2]
+%if mmsize == 16
+    lea      r4d, [r1+r2]
      test     r4d, mmsize-1
-    jnz .loop_unaligned
+    jz .loop_aligned
  %endif
-    jmp .loop_aligned
+    jmp .loop_unaligned
  
-ALIGN 16
-.escape_check:
-    cmp word [r0+r1-2], 0
-    jnz .no_escape
+.add_escape_byte:
      mov byte [r0+r1], 3
-    inc      r0
-    jmp .no_escape
+    inc       r0
+    or       r3d, 0x0300
+    jmp .escaped
  %endmacro
  
  INIT_MMX mmx2
  NAL_ESCAPE
  INIT_XMM sse2
  NAL_ESCAPE
-INIT_XMM avx
+%if ARCH_X86_64
+INIT_YMM avx2
  NAL_ESCAPE
+%endif
diff --git a/encoder/encoder.c b/encoder/encoder.c

index f9f411bb212ec0c2d7903884e604e5ee4c8eace4..e239de0e20af4bfe91b5bf2c8029dd3082c48084 100644 (file)
--- a/encoder/encoder.c
+++ b/encoder/encoder.c
@@ -1377,7 +1377,7 @@ x264_t *x264_encoder_open( x264_param_t *param )
          * ( h->param.rc.i_rc_method == X264_RC_ABR ? pow( 0.95, h->param.rc.i_qp_min )
            : pow( 0.95, h->param.rc.i_qp_constant ) * X264_MAX( 1, h->param.rc.f_ip_factor )));
  
-    h->nal_buffer_size = h->out.i_bitstream * 3/2 + 4;
+    h->nal_buffer_size = h->out.i_bitstream * 3/2 + 4 + 64; /* +4 for startcode, +64 for nal_escape assembly padding */
      CHECKED_MALLOC( h->nal_buffer, h->nal_buffer_size );
  
      if( h->param.i_threads > 1 &&
@@ -1625,9 +1625,9 @@ static int x264_nal_end( x264_t *h )
      x264_nal_t *nal = &h->out.nal[h->out.i_nal];
      uint8_t *end = &h->out.p_bitstream[bs_pos( &h->out.bs ) / 8];
      nal->i_payload = end - nal->p_payload;
-    /* nal_escape_mmx reads past the end of the input.
+    /* Assembly implementation of nal_escape reads past the end of the input.
       * While undefined padding wouldn't actually affect the output, it makes valgrind unhappy. */
-    memset( end, 0xff, 32 );
+    memset( end, 0xff, 64 );
      if( h->param.nalu_process )
          h->param.nalu_process( h, nal, h->fenc->opaque );
      h->out.i_nal++;
@@ -1653,7 +1653,7 @@ static int x264_encoder_encapsulate_nals( x264_t *h, int start )
          nal_size += h->out.nal[i].i_payload;
  
      /* Worst-case NAL unit escaping: reallocate the buffer if it's too small. */
-    int necessary_size = nal_size * 3/2 + h->out.i_nal * 4;
+    int necessary_size = nal_size * 3/2 + h->out.i_nal * 4 + 4 + 64;
      if( h->nal_buffer_size < necessary_size )
      {
          h->nal_buffer_size = necessary_size * 2;
diff --git a/x264.h b/x264.h

index b2b1f9a36cff5a503ba60a9a02feac3b764163e6..d2a62deace0bfb3efe4bb903ff3bc4b1c3db9c0f 100644 (file)
--- a/x264.h
+++ b/x264.h
@@ -499,7 +499,7 @@ typedef struct x264_param_t
       * is done encoding.
       *
       * This callback MUST do the following in order to work correctly:
-     * 1) Have available an output buffer of at least size nal->i_payload*3/2 + 5 + 16.
+     * 1) Have available an output buffer of at least size nal->i_payload*3/2 + 5 + 64.
       * 2) Call x264_nal_encode( h, dst, nal ), where dst is the output buffer.
       * After these steps, the content of nal is valid and can be used in the same way as if
       * the NAL unit were output by x264_encoder_encode.
author	Henrik Gramner <henrik@gramner.com>
	Tue, 16 Apr 2013 21:27:32 +0000 (23:27 +0200)
committer	Fiona Glaser <fiona@x264.com>
	Tue, 23 Apr 2013 21:36:33 +0000 (14:36 -0700)
common/bitstream.c		patch \| blob \| history
common/x86/bitstream-a.asm		patch \| blob \| history
encoder/encoder.c		patch \| blob \| history
x264.h		patch \| blob \| history