From: Loren Merritt Date: Mon, 24 Mar 2008 04:14:18 +0000 (-0600) Subject: cabac asm. X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=23e52ef3bbc0690fe55e49bf32c595adc0404878;p=libx264 cabac asm. mostly because gcc refuses to use cmov. 28% faster than c on core2, 11% on k8, 6% on p4. --- diff --git a/Makefile b/Makefile index aedb1bee..081a4500 100644 --- a/Makefile +++ b/Makefile @@ -21,7 +21,7 @@ endif # MMX/SSE optims ifneq ($(AS),) -X86SRC0 = dct-a.asm deblock-a.asm mc-a.asm mc-a2.asm \ +X86SRC0 = cabac-a.asm dct-a.asm deblock-a.asm mc-a.asm mc-a2.asm \ pixel-a.asm predict-a.asm quant-a.asm sad-a.asm \ cpu-32.asm dct-32.asm X86SRC = $(X86SRC0:%=common/x86/%) diff --git a/common/cabac.c b/common/cabac.c index 1fd5864d..6ecb9a5c 100644 --- a/common/cabac.c +++ b/common/cabac.c @@ -666,7 +666,7 @@ static const int8_t x264_cabac_context_init_PB[3][460][2] = /* FIXME could avoid this duplication by reversing the order of states * with MPS=0, but that would uglify the other tables */ -static const uint8_t x264_cabac_range_lps[128][4] = +const uint8_t x264_cabac_range_lps[128][4] = { { 2, 2, 2, 2 }, { 6, 7, 8, 9 }, { 6, 7, 9, 10 }, { 6, 8, 9, 11 }, @@ -735,7 +735,7 @@ const uint8_t x264_cabac_transition[128][2] = {100,121}, {100,122}, {101,123}, {101,124}, {101,125}, {102,126}, {102,126}, {127,127}, }; -static const uint8_t x264_cabac_renorm_shift[64]= { +const uint8_t x264_cabac_renorm_shift[64]= { 6,5,4,4,3,3,3,3,2,2,2,2,2,2,2,2, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, @@ -895,6 +895,7 @@ static inline void x264_cabac_encode_renorm( x264_cabac_t *cb ) x264_cabac_putbyte( cb ); } +#ifndef HAVE_MMX void x264_cabac_encode_decision( x264_cabac_t *cb, int i_ctx, int b ) { int i_state = cb->state[i_ctx]; @@ -908,6 +909,7 @@ void x264_cabac_encode_decision( x264_cabac_t *cb, int i_ctx, int b ) cb->state[i_ctx] = x264_cabac_transition[i_state][b]; x264_cabac_encode_renorm( cb ); } +#endif void x264_cabac_encode_bypass( x264_cabac_t *cb, int b ) { diff --git a/common/x86/cabac-a.asm b/common/x86/cabac-a.asm new file mode 100644 index 00000000..9a6fbcd3 --- /dev/null +++ b/common/x86/cabac-a.asm @@ -0,0 +1,161 @@ +;***************************************************************************** +;* cabac-a.asm: h264 encoder library +;***************************************************************************** +;* Copyright (C) 2008 x264 project +;* +;* Author: Loren Merritt +;* +;* This program is free software; you can redistribute it and/or modify +;* it under the terms of the GNU General Public License as published by +;* the Free Software Foundation; either version 2 of the License, or +;* (at your option) any later version. +;* +;* This program is distributed in the hope that it will be useful, +;* but WITHOUT ANY WARRANTY; without even the implied warranty of +;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +;* GNU General Public License for more details. +;* +;* You should have received a copy of the GNU General Public License +;* along with this program; if not, write to the Free Software +;* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111, USA. +;***************************************************************************** + +%include "x86inc.asm" + +SECTION .text + +cextern x264_cabac_range_lps +cextern x264_cabac_transition +cextern x264_cabac_renorm_shift + +%macro DEF_TMP 16 + %rep 8 + %define t%1d r%9d + %define t%1b r%9b + %define t%1 r%9 + %rotate 1 + %endrep +%endmacro + +; t3 must be ecx, since it's used for shift. +%ifdef ARCH_X86_64 + DEF_TMP 0,1,2,3,4,5,6,7, 0,1,2,3,4,5,6,10 + %define pointer 8 +%else + DEF_TMP 0,1,2,3,4,5,6,7, 0,3,2,1,4,5,6,3 + %define pointer 4 +%endif + +%define cb.state r0+0 +%define cb.low r0+464 +%define cb.range r0+468 +%define cb.queue r0+472 +%define cb.bytes_outstanding r0+476 +%define cb.p r0+480+pointer +%define cb.end r0+480+pointer*2 + +%macro LOAD_GLOBAL 4 +%ifdef PIC64 + ; this would be faster if the arrays were declared in asm, so that I didn't have to duplicate the lea + lea r11, [%2 GLOBAL] + %ifnidn %3, 0 + add r11, %3 + %endif + movzx %1, byte [r11+%4] +%elifdef PIC32 + %ifnidn %3, 0 + lea %1, [%3+%4] + movzx %1, byte [%2+%1 GLOBAL] + %else + movzx %1, byte [%2+%3+%4 GLOBAL] + %endif +%else + movzx %1, byte [%2+%3+%4] +%endif +%endmacro + +cglobal x264_cabac_encode_decision, 0,7 + movifnidn t0d, r0m + movifnidn t1d, r1m + picgetgot t2 + mov t5d, [cb.range] + movzx t3d, byte [cb.state+t1] + mov t4d, t5d + shr t5d, 6 + and t5d, 3 + LOAD_GLOBAL t5d, x264_cabac_range_lps, t5, t3*4 + sub t4d, t5d + mov t6d, t3d + shr t6d, 6 +%ifdef PIC32 + cmp t6d, r2m +%else + movifnidn t2d, r2m + cmp t6d, t2d +%endif + mov t6d, [cb.low] + lea t7, [t6+t4] + cmovne t4d, t5d + cmovne t6d, t7d +%ifdef PIC32 + mov t1, r2m + LOAD_GLOBAL t3d, x264_cabac_transition, t1, t3*2 +%else + LOAD_GLOBAL t3d, x264_cabac_transition, t2, t3*2 +%endif + if32 mov t1d, r1m + mov [cb.state+t1], t3b +.renorm: + mov t3d, t4d + shr t3d, 3 + LOAD_GLOBAL t3d, x264_cabac_renorm_shift, 0, t3 + shl t4d, t3b + shl t6d, t3b + add t3d, [cb.queue] + mov [cb.range], t4d + mov [cb.low], t6d + mov [cb.queue], t3d + cmp t3d, 8 + jge .putbyte +.ret: + REP_RET +.putbyte: + ; alive: t0=cb t3=queue t6=low + add t3d, 2 + mov t1d, 1 + mov t2d, t6d + shl t1d, t3b + shr t2d, t3b ; out + dec t1d + sub t3d, 10 + and t6d, t1d + cmp t2b, 0xff ; FIXME is a 32bit op faster? + mov [cb.queue], t3d + mov [cb.low], t6d + mov t1d, t2d + mov t4, [cb.p] + je .postpone + mov t5d, [cb.bytes_outstanding] + shr t1d, 8 ; carry + lea t6, [t4+t5+1] + cmp t6, [cb.end] + jge .ret + add [t4-1], t1b + test t5d, t5d + jz .no_outstanding + dec t1d +.loop_outstanding: + mov [t4], t1b + inc t4 + dec t5d + jg .loop_outstanding +.no_outstanding: + mov [t4], t2b + inc t4 + mov [cb.bytes_outstanding], t5d ; is zero, but a reg has smaller opcode than an immediate + mov [cb.p], t4 + RET +.postpone: + inc dword [cb.bytes_outstanding] + RET + diff --git a/common/x86/x86inc.asm b/common/x86/x86inc.asm index 256aa2e4..d0432f45 100644 --- a/common/x86/x86inc.asm +++ b/common/x86/x86inc.asm @@ -50,34 +50,36 @@ ; Same, but if it doesn't pop anything it becomes a 2-byte ret, for athlons ; which are slow when a normal ret follows a branch. -%macro DECLARE_REG 5 +%macro DECLARE_REG 6 %define r%1q %2 %define r%1d %3 %define r%1w %4 - ; no r%1b, because some regs don't have a byte form, and anyway x264 doesn't need it - %define r%1m %5 - %define r%1 r%1q + %define r%1b %5 + %define r%1m %6 + %define r%1 %2 %endmacro -%macro DECLARE_REG_SIZE 1 +%macro DECLARE_REG_SIZE 2 %define r%1q r%1 %define e%1q r%1 %define r%1d e%1 %define e%1d e%1 %define r%1w %1 %define e%1w %1 + %define r%1b %2 + %define e%1b %2 %ifndef ARCH_X86_64 %define r%1 e%1 %endif %endmacro -DECLARE_REG_SIZE ax -DECLARE_REG_SIZE bx -DECLARE_REG_SIZE cx -DECLARE_REG_SIZE dx -DECLARE_REG_SIZE si -DECLARE_REG_SIZE di -DECLARE_REG_SIZE bp +DECLARE_REG_SIZE ax, al +DECLARE_REG_SIZE bx, bl +DECLARE_REG_SIZE cx, cl +DECLARE_REG_SIZE dx, dl +DECLARE_REG_SIZE si, sil +DECLARE_REG_SIZE di, dil +DECLARE_REG_SIZE bp, bpl %ifdef ARCH_X86_64 %define push_size 8 @@ -129,13 +131,13 @@ DECLARE_REG_SIZE bp %ifdef WIN64 ;================================================================ -DECLARE_REG 0, rcx, ecx, cx, ecx -DECLARE_REG 1, rdx, edx, dx, edx -DECLARE_REG 2, r8, r8d, r8w, r8d -DECLARE_REG 3, r9, r9d, r9w, r9d -DECLARE_REG 4, rdi, edi, di, [rsp + stack_offset + 40] -DECLARE_REG 5, rsi, esi, si, [rsp + stack_offset + 48] -DECLARE_REG 6, rax, eax, ax, [rsp + stack_offset + 56] +DECLARE_REG 0, rcx, ecx, cx, cl, ecx +DECLARE_REG 1, rdx, edx, dx, dl, edx +DECLARE_REG 2, r8, r8d, r8w, r8b, r8d +DECLARE_REG 3, r9, r9d, r9w, r9b, r9d +DECLARE_REG 4, rdi, edi, di, dil, [rsp + stack_offset + 40] +DECLARE_REG 5, rsi, esi, si, sil, [rsp + stack_offset + 48] +DECLARE_REG 6, rax, eax, ax, al, [rsp + stack_offset + 56] %define r7m [rsp + stack_offset + 64] %macro LOAD_IF_USED 2 ; reg_id, number_of_args @@ -163,13 +165,13 @@ DECLARE_REG 6, rax, eax, ax, [rsp + stack_offset + 56] %elifdef ARCH_X86_64 ;======================================================== -DECLARE_REG 0, rdi, edi, di, edi -DECLARE_REG 1, rsi, esi, si, esi -DECLARE_REG 2, rdx, edx, dx, edx -DECLARE_REG 3, rcx, ecx, cx, ecx -DECLARE_REG 4, r8, r8d, r8w, r8d -DECLARE_REG 5, r9, r9d, r9w, r9d -DECLARE_REG 6, rax, eax, ax, [rsp + stack_offset + 8] +DECLARE_REG 0, rdi, edi, di, dil, edi +DECLARE_REG 1, rsi, esi, si, sil, esi +DECLARE_REG 2, rdx, edx, dx, dl, edx +DECLARE_REG 3, rcx, ecx, cx, cl, ecx +DECLARE_REG 4, r8, r8d, r8w, r8b, r8d +DECLARE_REG 5, r9, r9d, r9w, r9b, r9d +DECLARE_REG 6, rax, eax, ax, al, [rsp + stack_offset + 8] %define r7m [rsp + stack_offset + 16] %macro LOAD_IF_USED 2 ; reg_id, number_of_args @@ -195,13 +197,13 @@ DECLARE_REG 6, rax, eax, ax, [rsp + stack_offset + 8] %else ; X86_32 ;============================================================== -DECLARE_REG 0, eax, eax, ax, [esp + stack_offset + 4] -DECLARE_REG 1, ecx, ecx, cx, [esp + stack_offset + 8] -DECLARE_REG 2, edx, edx, dx, [esp + stack_offset + 12] -DECLARE_REG 3, ebx, ebx, bx, [esp + stack_offset + 16] -DECLARE_REG 4, esi, esi, si, [esp + stack_offset + 20] -DECLARE_REG 5, edi, edi, di, [esp + stack_offset + 24] -DECLARE_REG 6, ebp, ebp, bp, [esp + stack_offset + 28] +DECLARE_REG 0, eax, eax, ax, al, [esp + stack_offset + 4] +DECLARE_REG 1, ecx, ecx, cx, cl, [esp + stack_offset + 8] +DECLARE_REG 2, edx, edx, dx, dl, [esp + stack_offset + 12] +DECLARE_REG 3, ebx, ebx, bx, bl, [esp + stack_offset + 16] +DECLARE_REG 4, esi, esi, si, null, [esp + stack_offset + 20] +DECLARE_REG 5, edi, edi, di, null, [esp + stack_offset + 24] +DECLARE_REG 6, ebp, ebp, bp, null, [esp + stack_offset + 28] %define r7m [esp + stack_offset + 32] %define rsp esp