/* FIXME could avoid this duplication by reversing the order of states
* with MPS=0, but that would uglify the other tables */
-static const uint8_t x264_cabac_range_lps[128][4] =
+const uint8_t x264_cabac_range_lps[128][4] =
{
{ 2, 2, 2, 2 },
{ 6, 7, 8, 9 }, { 6, 7, 9, 10 }, { 6, 8, 9, 11 },
};
-static const uint8_t x264_cabac_transition[128][2] =
+const uint8_t x264_cabac_transition[128][2] =
{
{100,121}, {100,122}, {101,123}, {101,124}, {101,125}, {102,126}, {102,126}, {127,127},
};
-static const uint8_t x264_cabac_renorm_shift[64]= {
+const uint8_t x264_cabac_renorm_shift[64]= {
6,5,4,4,3,3,3,3,2,2,2,2,2,2,2,2,
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
x264_cabac_putbyte( cb );
}
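+/* when built with asm (HAVE_MMX), x264_cabac_encode_decision comes from
+   cabac-a.asm instead of the C version below */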
+#ifndef HAVE_MMX
void x264_cabac_encode_decision( x264_cabac_t *cb, int i_ctx, int b )
{
int i_state = cb->state[i_ctx];
cb->state[i_ctx] = x264_cabac_transition[i_state][b];
x264_cabac_encode_renorm( cb );
}
+#endif
void x264_cabac_encode_bypass( x264_cabac_t *cb, int b )
{
--- /dev/null
+++ b/common/x86/cabac-a.asm
+;*****************************************************************************
+;* cabac-a.asm: h264 encoder library
+;*****************************************************************************
+;* Copyright (C) 2008 x264 project
+;*
+;* Author: Loren Merritt <lorenm@u.washington.edu>
+;*
+;* This program is free software; you can redistribute it and/or modify
+;* it under the terms of the GNU General Public License as published by
+;* the Free Software Foundation; either version 2 of the License, or
+;* (at your option) any later version.
+;*
+;* This program is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+;* GNU General Public License for more details.
+;*
+;* You should have received a copy of the GNU General Public License
+;* along with this program; if not, write to the Free Software
+;* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111, USA.
+;*****************************************************************************
+
+%include "x86inc.asm"
+
+SECTION .text
+
+cextern x264_cabac_range_lps
+cextern x264_cabac_transition
+cextern x264_cabac_renorm_shift
+
+%macro DEF_TMP 16
+ %rep 8
+ %define t%1d r%9d
+ %define t%1b r%9b
+ %define t%1 r%9
+ %rotate 1
+ %endrep
+%endmacro
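+; (DEF_TMP pairs each of the first 8 arguments, the t0..t7 indices, with the
+; corresponding entry of the last 8, the r-register number that temp aliases)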
+
+; t3 must be ecx, since it's used for shift.
+%ifdef ARCH_X86_64
+ DEF_TMP 0,1,2,3,4,5,6,7, 0,1,2,3,4,5,6,10
+ %define pointer 8
+%else
+ DEF_TMP 0,1,2,3,4,5,6,7, 0,3,2,1,4,5,6,3
+ %define pointer 4
+%endif
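+; (with these lists, t3 resolves to ecx both on x86_32 (r1) and in the unix64
+; register order (r3 = rcx), satisfying the constraint above)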
+
+%define cb.state r0+0
+%define cb.low r0+464
+%define cb.range r0+468
+%define cb.queue r0+472
+%define cb.bytes_outstanding r0+476
+%define cb.p r0+480+pointer
+%define cb.end r0+480+pointer*2
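+; assumed layout of x264_cabac_t behind these offsets: the context state bytes
+; start at offset 0, with the low/range/queue/bytes_outstanding ints at
+; 464/468/472/476; the pointer-sized field at 480 is presumably the buffer
+; start, with the write pointer and end-of-buffer pointer after it
+; (field names here are guesses from the defines, not copied from cabac.h)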
+
+%macro LOAD_GLOBAL 4
+%ifdef PIC64
+ ; this would be faster if the arrays were declared in asm, so that I didn't have to duplicate the lea
+ lea r11, [%2 GLOBAL]
+ %ifnidn %3, 0
+ add r11, %3
+ %endif
+ movzx %1, byte [r11+%4]
+%elifdef PIC32
+ %ifnidn %3, 0
+ lea %1, [%3+%4]
+ movzx %1, byte [%2+%1 GLOBAL]
+ %else
+ movzx %1, byte [%2+%3+%4 GLOBAL]
+ %endif
+%else
+ movzx %1, byte [%2+%3+%4]
+%endif
+%endmacro
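+; e.g. in the non-PIC case, "LOAD_GLOBAL t5d, x264_cabac_range_lps, t5, t3*4"
+; expands to "movzx t5d, byte [x264_cabac_range_lps+t5+t3*4]"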
+
+cglobal x264_cabac_encode_decision, 0,7
+ movifnidn t0d, r0m
+ movifnidn t1d, r1m
+ picgetgot t2
+ mov t5d, [cb.range]
+ movzx t3d, byte [cb.state+t1]
+ mov t4d, t5d
+ shr t5d, 6
+ and t5d, 3
+ LOAD_GLOBAL t5d, x264_cabac_range_lps, t5, t3*4
+ sub t4d, t5d
+ mov t6d, t3d
+ shr t6d, 6
+%ifdef PIC32
+ cmp t6d, r2m
+%else
+ movifnidn t2d, r2m
+ cmp t6d, t2d
+%endif
+ mov t6d, [cb.low]
+ lea t7, [t6+t4]
+ cmovne t4d, t5d
+ cmovne t6d, t7d
+%ifdef PIC32
+ mov t1, r2m
+ LOAD_GLOBAL t3d, x264_cabac_transition, t1, t3*2
+%else
+ LOAD_GLOBAL t3d, x264_cabac_transition, t2, t3*2
+%endif
+ if32 mov t1d, r1m
+ mov [cb.state+t1], t3b
+.renorm:
+ mov t3d, t4d
+ shr t3d, 3
+ LOAD_GLOBAL t3d, x264_cabac_renorm_shift, 0, t3
+ shl t4d, t3b
+ shl t6d, t3b
+ add t3d, [cb.queue]
+ mov [cb.range], t4d
+ mov [cb.low], t6d
+ mov [cb.queue], t3d
+ cmp t3d, 8
+ jge .putbyte
+.ret:
+ REP_RET
+.putbyte:
+ ; alive: t0=cb t3=queue t6=low
+ add t3d, 2
+ mov t1d, 1
+ mov t2d, t6d
+ shl t1d, t3b
+ shr t2d, t3b ; out
+ dec t1d
+ sub t3d, 10
+ and t6d, t1d
+ cmp t2b, 0xff ; FIXME is a 32bit op faster?
+ mov [cb.queue], t3d
+ mov [cb.low], t6d
+ mov t1d, t2d
+ mov t4, [cb.p]
+ je .postpone
+ mov t5d, [cb.bytes_outstanding]
+ shr t1d, 8 ; carry
+ lea t6, [t4+t5+1]
+ cmp t6, [cb.end]
+ jge .ret
+ add [t4-1], t1b
+ test t5d, t5d
+ jz .no_outstanding
+ dec t1d
+.loop_outstanding:
+ mov [t4], t1b
+ inc t4
+ dec t5d
+ jg .loop_outstanding
+.no_outstanding:
+ mov [t4], t2b
+ inc t4
+ mov [cb.bytes_outstanding], t5d ; is zero, but a reg has smaller opcode than an immediate
+ mov [cb.p], t4
+ RET
+.postpone:
+ inc dword [cb.bytes_outstanding]
+ RET
+
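For reference, the function above is roughly equivalent to the following C, which may make the cmov-based MPS/LPS selection and the .putbyte carry handling easier to follow. This is only a sketch: the field names (i_low, i_range, i_queue, i_bytes_outstanding, p, p_end) are inferred from the cb.* defines at the top of the file rather than copied from cabac.h, and cabac_putbyte_sketch is just a name for the .putbyte tail, not the real x264_cabac_putbyte.

    static void cabac_putbyte_sketch( x264_cabac_t *cb )       /* .putbyte */
    {
        int out = cb->i_low >> (cb->i_queue + 2);
        cb->i_low &= (1 << (cb->i_queue + 2)) - 1;
        cb->i_queue -= 8;
        if( (out & 0xff) == 0xff )
            cb->i_bytes_outstanding++;                          /* .postpone: carry still unresolved */
        else
        {
            int carry = out >> 8;
            if( cb->p + cb->i_bytes_outstanding + 1 >= cb->p_end )
                return;                                         /* don't overflow the output buffer */
            cb->p[-1] += carry;                                 /* propagate the carry into the last byte */
            while( cb->i_bytes_outstanding > 0 )
            {
                *(cb->p++) = carry - 1;                         /* 0xff + carry, truncated to a byte */
                cb->i_bytes_outstanding--;
            }
            *(cb->p++) = out;
        }
    }

    void cabac_encode_decision_sketch( x264_cabac_t *cb, int i_ctx, int b )
    {
        int i_state = cb->state[i_ctx];
        int i_mps   = i_state >> 6;                             /* MPS is kept in bit 6 of the state */
        int i_lps   = x264_cabac_range_lps[i_state][(cb->i_range >> 6) & 3];
        cb->i_range -= i_lps;
        if( b != i_mps )                                        /* the asm does this with two cmovne */
        {
            cb->i_low  += cb->i_range;                          /* range already has the LPS part removed */
            cb->i_range = i_lps;
        }
        cb->state[i_ctx] = x264_cabac_transition[i_state][b];
        /* .renorm */
        int shift = x264_cabac_renorm_shift[cb->i_range >> 3];
        cb->i_range <<= shift;
        cb->i_low   <<= shift;
        cb->i_queue += shift;
        if( cb->i_queue >= 8 )
            cabac_putbyte_sketch( cb );
    }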

--- a/common/x86/x86inc.asm
+++ b/common/x86/x86inc.asm
; Same, but if it doesn't pop anything it becomes a 2-byte ret, for athlons
; which are slow when a normal ret follows a branch.
-%macro DECLARE_REG 5
+%macro DECLARE_REG 6
%define r%1q %2
%define r%1d %3
%define r%1w %4
- ; no r%1b, because some regs don't have a byte form, and anyway x264 doesn't need it
- %define r%1m %5
- %define r%1 r%1q
+ %define r%1b %5
+ %define r%1m %6
+ %define r%1 %2
%endmacro
-%macro DECLARE_REG_SIZE 1
+%macro DECLARE_REG_SIZE 2
%define r%1q r%1
%define e%1q r%1
%define r%1d e%1
%define e%1d e%1
%define r%1w %1
%define e%1w %1
+ %define r%1b %2
+ %define e%1b %2
%ifndef ARCH_X86_64
%define r%1 e%1
%endif
%endmacro
-DECLARE_REG_SIZE ax
-DECLARE_REG_SIZE bx
-DECLARE_REG_SIZE cx
-DECLARE_REG_SIZE dx
-DECLARE_REG_SIZE si
-DECLARE_REG_SIZE di
-DECLARE_REG_SIZE bp
+DECLARE_REG_SIZE ax, al
+DECLARE_REG_SIZE bx, bl
+DECLARE_REG_SIZE cx, cl
+DECLARE_REG_SIZE dx, dl
+DECLARE_REG_SIZE si, sil
+DECLARE_REG_SIZE di, dil
+DECLARE_REG_SIZE bp, bpl
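+; note: sil/dil/bpl only exist in 64-bit mode, so the byte forms defined here
+; must not be used on x86_32 (the 32-bit DECLARE_REG table below passes "null"
+; for the registers that have no byte form)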
%ifdef ARCH_X86_64
%define push_size 8
%ifdef WIN64 ;================================================================
-DECLARE_REG 0, rcx, ecx, cx, ecx
-DECLARE_REG 1, rdx, edx, dx, edx
-DECLARE_REG 2, r8, r8d, r8w, r8d
-DECLARE_REG 3, r9, r9d, r9w, r9d
-DECLARE_REG 4, rdi, edi, di, [rsp + stack_offset + 40]
-DECLARE_REG 5, rsi, esi, si, [rsp + stack_offset + 48]
-DECLARE_REG 6, rax, eax, ax, [rsp + stack_offset + 56]
+DECLARE_REG 0, rcx, ecx, cx, cl, ecx
+DECLARE_REG 1, rdx, edx, dx, dl, edx
+DECLARE_REG 2, r8, r8d, r8w, r8b, r8d
+DECLARE_REG 3, r9, r9d, r9w, r9b, r9d
+DECLARE_REG 4, rdi, edi, di, dil, [rsp + stack_offset + 40]
+DECLARE_REG 5, rsi, esi, si, sil, [rsp + stack_offset + 48]
+DECLARE_REG 6, rax, eax, ax, al, [rsp + stack_offset + 56]
%define r7m [rsp + stack_offset + 64]
%macro LOAD_IF_USED 2 ; reg_id, number_of_args
%elifdef ARCH_X86_64 ;========================================================
-DECLARE_REG 0, rdi, edi, di, edi
-DECLARE_REG 1, rsi, esi, si, esi
-DECLARE_REG 2, rdx, edx, dx, edx
-DECLARE_REG 3, rcx, ecx, cx, ecx
-DECLARE_REG 4, r8, r8d, r8w, r8d
-DECLARE_REG 5, r9, r9d, r9w, r9d
-DECLARE_REG 6, rax, eax, ax, [rsp + stack_offset + 8]
+DECLARE_REG 0, rdi, edi, di, dil, edi
+DECLARE_REG 1, rsi, esi, si, sil, esi
+DECLARE_REG 2, rdx, edx, dx, dl, edx
+DECLARE_REG 3, rcx, ecx, cx, cl, ecx
+DECLARE_REG 4, r8, r8d, r8w, r8b, r8d
+DECLARE_REG 5, r9, r9d, r9w, r9b, r9d
+DECLARE_REG 6, rax, eax, ax, al, [rsp + stack_offset + 8]
%define r7m [rsp + stack_offset + 16]
%macro LOAD_IF_USED 2 ; reg_id, number_of_args
%else ; X86_32 ;==============================================================
-DECLARE_REG 0, eax, eax, ax, [esp + stack_offset + 4]
-DECLARE_REG 1, ecx, ecx, cx, [esp + stack_offset + 8]
-DECLARE_REG 2, edx, edx, dx, [esp + stack_offset + 12]
-DECLARE_REG 3, ebx, ebx, bx, [esp + stack_offset + 16]
-DECLARE_REG 4, esi, esi, si, [esp + stack_offset + 20]
-DECLARE_REG 5, edi, edi, di, [esp + stack_offset + 24]
-DECLARE_REG 6, ebp, ebp, bp, [esp + stack_offset + 28]
+DECLARE_REG 0, eax, eax, ax, al, [esp + stack_offset + 4]
+DECLARE_REG 1, ecx, ecx, cx, cl, [esp + stack_offset + 8]
+DECLARE_REG 2, edx, edx, dx, dl, [esp + stack_offset + 12]
+DECLARE_REG 3, ebx, ebx, bx, bl, [esp + stack_offset + 16]
+DECLARE_REG 4, esi, esi, si, null, [esp + stack_offset + 20]
+DECLARE_REG 5, edi, edi, di, null, [esp + stack_offset + 24]
+DECLARE_REG 6, ebp, ebp, bp, null, [esp + stack_offset + 28]
%define r7m [esp + stack_offset + 32]
%define rsp esp