;*
;* Author: Loren Merritt <lorenm@u.washington.edu>
;* Fiona Glaser <fiona@x264.com>
+;* Holger Lubitz <holger@lubitz.org>
;*
;* This program is free software; you can redistribute it and/or modify
;* it under the terms of the GNU General Public License as published by
DECLARE_REG_TMP 0,1,2,3,4,5,6,10
%define pointer resq
%else
- DECLARE_REG_TMP 0,3,2,1,4,5,6,3
+ DECLARE_REG_TMP 0,4,2,1,3,5,6,2
%define pointer resd
%endif
; NOTE(review): this span is shown in unified-diff form: '-' lines are the
; outgoing code, '+' lines are their replacements, unmarked lines are shared.
; The routine's entry label lies outside this view. The code below updates the
; range, low, queue and per-context state fields of the structure addressed by
; t0 and, once at least 8 bits are queued, the output pointer (cb.p) and
; bytes_outstanding as well.
;
; Fetch the first two arguments into t0/t1 (movifnidn presumably skips the move
; when the argument already lives in that register -- confirm against x86inc).
movifnidn t0, r0mp
movifnidn t1d, r1m
; t5 = cb.range
mov t5d, [t0+cb.range]
; State byte of context t1, zero-extended. The old code kept it in t3 with a
; copy of range in t4; the new code keeps it in t6 with the copy of range in t3.
- movzx t3d, byte [t0+cb.state+t1]
- mov t4d, t5d
+ movzx t6d, byte [t0+cb.state+t1]
+ mov t3d, t5d
; t5 = range >> 6, passed as the third operand of the range_lps LOAD_GLOBAL.
shr t5d, 6
- LOAD_GLOBAL t5d, x264_cabac_range_lps-4, t5, t3*4
- sub t4d, t5d
- mov t6d, t3d
- shr t6d, 6
movifnidn t2d, r2m
; New ordering: both table loads are issued while t6 still holds the unshifted
; state byte; only afterwards is t6 reduced to state >> 6 for the compare, and
; the range copy reduced to range - t5.
+ LOAD_GLOBAL t5d, x264_cabac_range_lps-4, t5, t6*4
+ LOAD_GLOBAL t4d, x264_cabac_transition, t2, t6*2
+ shr t6d, 6
+ sub t3d, t5d
; Compare (state >> 6) with the third argument. The mov and lea that follow do
; not modify the flags, so both cmovne instructions act on this compare: when
; the values differ, the range register (old t4 / new t3) takes the looked-up
; value in t5 and low becomes low + (range - t5), which is what t7 holds.
cmp t6d, t2d
mov t6d, [t0+cb.low]
- lea t7, [t6+t4]
- cmovne t4d, t5d
+ lea t7, [t6+t3]
+ cmovne t3d, t5d
cmovne t6d, t7d
; The old code performed the transition-table load here and re-fetched t1
; before writing the state byte back; the new code only has the store left.
- LOAD_GLOBAL t3d, x264_cabac_transition, t2, t3*2
- movifnidn t1d, r1m
- mov [t0+cb.state+t1], t3b
-.renorm:
- mov t3d, t4d
+ mov [t0+cb.state+t1], t4b
+;x264_cabac_encode_renorm
+ mov t4d, t3d
; Renormalisation: t3 is replaced by the x264_cabac_renorm_shift value loaded
; using (range >> 3); range (t4) and low (t6) are shifted left by that amount
; and cb.queue is added to it, so t3 ends up as the new queue value.
shr t3d, 3
LOAD_GLOBAL t3d, x264_cabac_renorm_shift, 0, t3
shl t4d, t3b
shl t6d, t3b
add t3d, [t0+cb.queue]
mov [t0+cb.range], t4d
- mov [t0+cb.low], t6d
- mov [t0+cb.queue], t3d
; Signed test queue < 8: no byte is emitted. The old code had already stored
; low and queue above and simply returned; the new code defers both stores to
; .update_queue_low and branches there.
cmp t3d, 8
- jge .putbyte
- REP_RET
-.putbyte:
+ jl .update_queue_low
+;x264_cabac_putbyte
; alive: t0=cb t3=queue t6=low
; NOTE(review): under WIN64 the t-register names are re-mapped here, in the
; middle of the routine; confirm that the values listed as alive above still
; sit in the registers these names refer to after the re-mapping.
+%ifdef WIN64
+ DECLARE_REG_TMP 3,4,1,0,2,5,6,10
+%endif
; With n = queue + 2: t1 becomes a mask, built as (1 << n) - 1 by the old code
; and as ~(-1 << n) by the new code; t2 = low >> n is the value to emit; low is
; ANDed with the mask; queue ends up 8 lower than on entry (+2, then -10).
+ mov t1d, -1
add t3d, 2
- mov t1d, 1
mov t2d, t6d
shl t1d, t3b
shr t2d, t3b ; out
- dec t1d
+ not t1d
sub t3d, 10
and t6d, t1d
- cmp t2b, 0xff ; FIXME is a 32bit op faster?
- mov [t0+cb.queue], t3d
- mov [t0+cb.low], t6d
- mov t1d, t2d
- mov t4, [t0+cb.p]
- je .postpone
mov t5d, [t0+cb.bytes_outstanding]
- shr t1d, 8 ; carry
- add [t4-1], t1b
- test t5d, t5d
- jz .no_outstanding
- dec t1d
; Low byte of out == 0xff: nothing is written now; control goes to .postpone,
; which increments bytes_outstanding instead.
+ cmp t2b, 0xff ; FIXME is a 32bit op faster?
+ jz .postpone
; Otherwise t1 = cb.p. dh (the high byte of t2 per the "t2h" note, i.e. bits
; 8-15 of out) is added into the byte just before cb.p, then decremented to
; form the filler byte stored by the loop below.
+ mov t1, [t0+cb.p]
+ add [t1-1], dh ; t2h
+ dec dh
; The new loop is bottom-tested with jge, so for a non-negative count it stores
; bytes_outstanding + 1 filler bytes and leaves t5 at -1. The last filler is
; then overwritten with out's low byte through [t1-1], and the fall-through
; into .postpone brings t5 back to 0 before it is written to memory. The old
; loop skipped itself when the count was zero and stored out's low byte after
; the fillers.
.loop_outstanding:
- mov [t4], t1b
- inc t4
+ mov [t1], dh
+ inc t1
dec t5d
- jg .loop_outstanding
-.no_outstanding:
- mov [t4], t2b
- inc t4
- mov [t0+cb.bytes_outstanding], t5d ; is zero, but a reg has smaller opcode than an immediate
- mov [t0+cb.p], t4
- RET
+ jge .loop_outstanding
+ mov [t1-1], t2b
+ mov [t0+cb.p], t1
.postpone:
; Old: increment the counter directly in memory. New: increment the copy in t5
; (loaded before the 0xff test) and store it.
- inc dword [t0+cb.bytes_outstanding]
+ inc t5d
+ mov [t0+cb.bytes_outstanding], t5d
; Common tail of the new code: every path writes low and queue back here.
+.update_queue_low:
+ mov [t0+cb.low], t6d
+ mov [t0+cb.queue], t3d
RET