From e7a46b6536ab3ea4806f585b771b6cbb261031d1 Mon Sep 17 00:00:00 2001
From: Henrik Gramner
Date: Tue, 16 Apr 2013 23:27:32 +0200
Subject: [PATCH] x86: AVX2 nal_escape

Also rewrite the entire function to be faster and drop the AVX
version, which is no longer useful.
---
 common/bitstream.c         |   9 ++-
 common/x86/bitstream-a.asm | 111 +++++++++++++++++++------------------
 encoder/encoder.c          |   8 +--
 x264.h                     |   2 +-
 4 files changed, 69 insertions(+), 61 deletions(-)

diff --git a/common/bitstream.c b/common/bitstream.c
index b577f6c6..cecd5f7f 100644
--- a/common/bitstream.c
+++ b/common/bitstream.c
@@ -41,7 +41,7 @@ static uint8_t *x264_nal_escape_c( uint8_t *dst, uint8_t *src, uint8_t *end )
 
 uint8_t *x264_nal_escape_mmx2( uint8_t *dst, uint8_t *src, uint8_t *end );
 uint8_t *x264_nal_escape_sse2( uint8_t *dst, uint8_t *src, uint8_t *end );
-uint8_t *x264_nal_escape_avx( uint8_t *dst, uint8_t *src, uint8_t *end );
+uint8_t *x264_nal_escape_avx2( uint8_t *dst, uint8_t *src, uint8_t *end );
 void x264_cabac_block_residual_rd_internal_sse2       ( dctcoef *l, int b_interlaced, intptr_t ctx_block_cat, x264_cabac_t *cb );
 void x264_cabac_block_residual_rd_internal_sse2_lzcnt ( dctcoef *l, int b_interlaced, intptr_t ctx_block_cat, x264_cabac_t *cb );
 void x264_cabac_block_residual_rd_internal_ssse3      ( dctcoef *l, int b_interlaced, intptr_t ctx_block_cat, x264_cabac_t *cb );
@@ -132,8 +132,11 @@ void x264_bitstream_init( int cpu, x264_bitstream_function_t *pf )
             pf->cabac_block_residual_8x8_rd_internal = x264_cabac_block_residual_8x8_rd_internal_ssse3_lzcnt;
         }
     }
+
+    if( cpu&X264_CPU_AVX2 )
+    {
+        pf->nal_escape = x264_nal_escape_avx2;
+    }
 #endif
-    if( cpu&X264_CPU_AVX )
-        pf->nal_escape = x264_nal_escape_avx;
 #endif
 }
diff --git a/common/x86/bitstream-a.asm b/common/x86/bitstream-a.asm
index ef3fcab0..8aff5069 100644
--- a/common/x86/bitstream-a.asm
+++ b/common/x86/bitstream-a.asm
@@ -4,7 +4,7 @@
 ;* Copyright (C) 2010-2013 x264 project
 ;*
 ;* Authors: Fiona Glaser
-;*          Henrik Gramner
+;*          Henrik Gramner
 ;*
 ;* This program is free software; you can redistribute it and/or modify
 ;* it under the terms of the GNU General Public License as published by
@@ -32,100 +32,105 @@ SECTION .text
 ;-----------------------------------------------------------------------------
 ; uint8_t *x264_nal_escape( uint8_t *dst, uint8_t *src, uint8_t *end )
 ;-----------------------------------------------------------------------------
-
 %macro NAL_LOOP 2
-%1_escape:
+%%escape:
     ; Detect false positive to avoid unnecessary escape loop
     xor      r3d, r3d
     cmp byte [r0+r1-1], 0
     setnz    r3b
-    xor      r3d, r4d
+    xor       k3, k4
     jnz .escape
-    jmp %1_continue
+    jmp %%continue
 ALIGN 16
 %1:
-    pcmpeqb   m3, m1, m4
-    pcmpeqb   m2, m0, m4
-    pmovmskb r3d, m3
-    %2 [r0+r1], m0
+    mova [r0+r1+mmsize], m1
+    pcmpeqb   m1, m0
+    mova [r0+r1], m2
+    pcmpeqb   m2, m0
+    pmovmskb r3d, m1
+    %2        m1, [r1+r2+3*mmsize]
     pmovmskb r4d, m2
-    shl      r3d, mmsize
-    mova      m0, [r1+r2+2*mmsize]
-    or       r4d, r3d
-    %2 [r0+r1+mmsize], m1
-    lea      r3d, [r4+r4+1]
-    mova      m1, [r1+r2+3*mmsize]
-    and      r4d, r3d
-    jnz %1_escape
-%1_continue:
+    %2        m2, [r1+r2+2*mmsize]
+    shl       k3, mmsize
+    or        k3, k4
+    lea       k4, [2*k3+1]
+    and       k4, k3
+    jnz %%escape
+%%continue:
     add r1, 2*mmsize
     jl %1
 %endmacro
 
 %macro NAL_ESCAPE 0
+%if mmsize == 32
+    %xdefine k3 r3
+    %xdefine k4 r4
+%else
+    %xdefine k3 r3d
+    %xdefine k4 r4d
+%endif
 cglobal nal_escape, 3,5
-    mov      r3w, [r1]
+    movzx    r3d, byte [r1]
     sub       r1, r2 ; r1 = offset of current src pointer from end of src
-    pxor      m4, m4
+    pxor      m0, m0
+    mov     [r0], r3b
     sub       r0, r1 ; r0 = projected end of dst, assuming no more escapes
-    mov [r0+r1], r3w
-    add       r1, 2
-    jge .ret
+    or       r3d, 0xffffff00 ; ignore data before src
 
-    ; Start off by jumping into the escape loop in
-    ; case there's an escape at the start.
-    ; And do a few more in scalar until src is aligned again.
-    jmp .first_escape
+    ; Start off by jumping into the escape loop in case there's an escape at the start.
+    ; And do a few more in scalar until dst is aligned.
+    jmp .escape_loop
 
+%if mmsize == 16
     NAL_LOOP .loop_aligned, mova
-%if mmsize==16
     jmp .ret
-    NAL_LOOP .loop_unaligned, movu
 %endif
+    NAL_LOOP .loop_unaligned, movu
 .ret:
     movifnidn rax, r0
    RET
-ALIGN 16
 .escape:
     ; Skip bytes that are known to be valid
-    and      r4d, r3d
-    tzcnt    r3d, r4d
-    add       r1, r3
+    and       k4, k3
+    tzcnt     k4, k4
+    xor      r3d, r3d ; the last two bytes are known to be zero
+    add       r1, r4
 .escape_loop:
     inc    r1
     jge .ret
-.first_escape:
-    movzx r3d, byte [r1+r2]
-    lea    r4, [r1+r2]
-    cmp   r3d, 3
-    jna .escape_check
-.no_escape:
+    movzx r4d, byte [r1+r2]
+    shl   r3d, 8
+    or    r3d, r4d
+    test  r3d, 0xfffffc ; if the last two bytes are 0 and the current byte is <=3
+    jz .add_escape_byte
+.escaped:
+    lea   r4d, [r0+r1]
     mov [r0+r1], r3b
-    test  r4d, mmsize-1 ; Do SIMD when src is aligned
+    test  r4d, mmsize-1 ; Do SIMD when dst is aligned
     jnz .escape_loop
-    mova   m0, [r4]
-    mova   m1, [r4+mmsize]
-%if mmsize==16
-    lea   r4d, [r0+r1]
+    movu   m1, [r1+r2+mmsize]
+    movu   m2, [r1+r2]
+%if mmsize == 16
+    lea   r4d, [r1+r2]
     test  r4d, mmsize-1
-    jnz .loop_unaligned
+    jz .loop_aligned
 %endif
-    jmp .loop_aligned
+    jmp .loop_unaligned
-ALIGN 16
-.escape_check:
-    cmp word [r0+r1-2], 0
-    jnz .no_escape
+.add_escape_byte:
     mov byte [r0+r1], 3
-    inc    r0
-    jmp .no_escape
+    inc    r0
+    or    r3d, 0x0300
+    jmp .escaped
 %endmacro
 
 INIT_MMX mmx2
 NAL_ESCAPE
 INIT_XMM sse2
 NAL_ESCAPE
-INIT_XMM avx
+%if ARCH_X86_64
+INIT_YMM avx2
 NAL_ESCAPE
+%endif
diff --git a/encoder/encoder.c b/encoder/encoder.c
index f9f411bb..e239de0e 100644
--- a/encoder/encoder.c
+++ b/encoder/encoder.c
@@ -1377,7 +1377,7 @@ x264_t *x264_encoder_open( x264_param_t *param )
            * ( h->param.rc.i_rc_method == X264_RC_ABR ? pow( 0.95, h->param.rc.i_qp_min )
              : pow( 0.95, h->param.rc.i_qp_constant ) * X264_MAX( 1, h->param.rc.f_ip_factor )));
 
-    h->nal_buffer_size = h->out.i_bitstream * 3/2 + 4;
+    h->nal_buffer_size = h->out.i_bitstream * 3/2 + 4 + 64; /* +4 for startcode, +64 for nal_escape assembly padding */
     CHECKED_MALLOC( h->nal_buffer, h->nal_buffer_size );
 
     if( h->param.i_threads > 1 &&
@@ -1625,9 +1625,9 @@ static int x264_nal_end( x264_t *h )
     x264_nal_t *nal = &h->out.nal[h->out.i_nal];
     uint8_t *end = &h->out.p_bitstream[bs_pos( &h->out.bs ) / 8];
     nal->i_payload = end - nal->p_payload;
-    /* nal_escape_mmx reads past the end of the input.
+    /* Assembly implementation of nal_escape reads past the end of the input.
      * While undefined padding wouldn't actually affect the output, it makes valgrind unhappy. */
-    memset( end, 0xff, 32 );
+    memset( end, 0xff, 64 );
     if( h->param.nalu_process )
         h->param.nalu_process( h, nal, h->fenc->opaque );
     h->out.i_nal++;
@@ -1653,7 +1653,7 @@ static int x264_encoder_encapsulate_nals( x264_t *h, int start )
         nal_size += h->out.nal[i].i_payload;
 
     /* Worst-case NAL unit escaping: reallocate the buffer if it's too small. */
-    int necessary_size = nal_size * 3/2 + h->out.i_nal * 4;
+    int necessary_size = nal_size * 3/2 + h->out.i_nal * 4 + 4 + 64;
     if( h->nal_buffer_size < necessary_size )
     {
         h->nal_buffer_size = necessary_size * 2;
diff --git a/x264.h b/x264.h
index b2b1f9a3..d2a62dea 100644
--- a/x264.h
+++ b/x264.h
@@ -499,7 +499,7 @@ typedef struct x264_param_t
      * is done encoding.
      *
      * This callback MUST do the following in order to work correctly:
-     * 1) Have available an output buffer of at least size nal->i_payload*3/2 + 5 + 16.
+     * 1) Have available an output buffer of at least size nal->i_payload*3/2 + 5 + 64.
      * 2) Call x264_nal_encode( h, dst, nal ), where dst is the output buffer.
      * After these steps, the content of nal is valid and can be used in the same way as if
      * the NAL unit were output by x264_encoder_encode.
-- 
2.40.0
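
Note on the transform being vectorized: a NAL payload must not contain
the byte sequences 00 00 00, 00 00 01, 00 00 02 or 00 00 03, so the
escape pass inserts an emulation-prevention byte (0x03) after any two
consecutive zero bytes that would otherwise be followed by a byte
<= 0x03. The scalar sketch below mirrors the semantics of
x264_nal_escape_c, the C fallback declared in the first hunk; the name
nal_escape_ref and its exact signature are illustrative only, not part
of the patch.

    #include <stdint.h>

    /* Illustrative scalar reference, not the patch's SIMD path:
     * whenever the last two output bytes are both zero and the next
     * input byte is <= 0x03, emit 0x03 first. Returns a pointer one
     * past the last byte written. */
    static uint8_t *nal_escape_ref( uint8_t *dst, const uint8_t *src,
                                    const uint8_t *end )
    {
        /* The first two bytes can never complete a 00 00 0x pattern. */
        if( src < end ) *dst++ = *src++;
        if( src < end ) *dst++ = *src++;
        while( src < end )
        {
            if( src[0] <= 0x03 && !dst[-2] && !dst[-1] )
                *dst++ = 0x03; /* emulation-prevention byte */
            *dst++ = *src++;
        }
        return dst;
    }

The worst case (a run of zeros) turns every two input bytes into three
output bytes, which is where the nal_size * 3/2 term in the buffer
sizing above comes from; the extra 64 bytes of padding cover the
out-of-bounds loads and stores of the widest (YMM) loop, which works on
2*mmsize = 64-byte blocks.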