2 ; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
4 ; Use of this source code is governed by a BSD-style license and patent
5 ; grant that can be found in the LICENSE file in the root of the source
6 ; tree. All contributing project authors may be found in the AUTHORS
7 ; file in the root of the source tree.
11 %include "vpx_ports/x86_abi_support.asm"
14 ;int vp8_regular_quantize_b_impl_sse2(short *coeff_ptr, short *zbin_ptr,
15 ; short *qcoeff_ptr,short *dequant_ptr,
16 ; const int *default_zig_zag, short *round_ptr,
17 ; short *quant_ptr, short *dqcoeff_ptr,
18 ; unsigned short zbin_oq_value,
19 ; short *zbin_boost_ptr);
;
; Quantizes one block of 16 coefficients (two 16-byte loads of shorts from
; coeff_ptr) and writes qcoeff_ptr / dqcoeff_ptr.
;
; Arguments, by arg() index as used below:
;   arg(0) coeff_ptr       input coefficients (z)
;   arg(1) zbin_ptr        per-coefficient zero-bin values
;   arg(2) qcoeff_ptr      quantized output; zeroed, then filled per zig-zag step
;   arg(3) dequant_ptr     dequantization factors
;   arg(4) default_zig_zag table of 32-bit indices (read with movsxd)
;   arg(5) round_ptr       rounding values
;   arg(6) quant_ptr       quantizer multipliers
;   arg(7) dqcoeff_ptr     dequantized output
;   arg(8) zbin_oq_value   extra zero-bin amount applied to every coefficient
;   arg(9) zbin_boost_ptr  table of shorts, re-read from its start after each
;                          coefficient that is kept
;
; Every movdqa below faults on an address that is not 16-byte aligned, so the
; pointers it dereferences (and rsp at the time of the stack accesses) must be
; 16-byte aligned.
;
; NOTE(review): this listing looks like an excerpt. Each line starts with a
; stray numeral, and no labels, branches, register copies, return-value setup
; or ret are present. The comments describe only the instructions that are
; visible; confirm everything else against the complete file.
21 global sym(vp8_regular_quantize_b_impl_sse2)
22 sym(vp8_regular_quantize_b_impl_sse2):
; SHADOW_ARGS_TO_STACK comes from the included ABI-support file; the count 10
; matches the ten parameters of the prototype above.
25 SHADOW_ARGS_TO_STACK 10
; Stack scratch layout (byte offsets from rsp). The lo/hi pairs are contiguous,
; so [rsp + <name>_lo + rc*2] addresses all 16 words of each array.
33 %define abs_minus_zbin_lo 0
34 %define abs_minus_zbin_hi 16
35 %define temp_qcoeff_lo 32
36 %define temp_qcoeff_hi 48
; eob, save_xmm6 and save_xmm7 are used below but defined outside the visible
; lines.
41 %define vp8_regularquantizeb_stack_size eob + 16
43 sub rsp, vp8_regularquantizeb_stack_size
; xmm6/xmm7 are overwritten below, so they are spilled here and reloaded
; before the stack is released (they are callee-saved under the Windows x64
; ABI).
45 movdqa OWORD PTR[rsp + save_xmm6], xmm6
46 movdqa OWORD PTR[rsp + save_xmm7], xmm7
48 mov rdx, arg(0) ;coeff_ptr
49 mov eax, arg(8) ;zbin_oq_value
51 mov rcx, arg(1) ;zbin_ptr
54 movdqa xmm0, OWORD PTR[rdx] ;z, coefficients 0..7
55 movdqa xmm4, OWORD PTR[rdx + 16] ;z, coefficients 8..15
; An arithmetic right shift by 15 turns every word into 0 (non-negative) or
; -1 (negative).
60 psraw xmm0, 15 ;sign of z (aka sz)
61 psraw xmm4, 15 ;sign of z (aka sz)
66 movdqa xmm2, OWORD PTR[rcx] ;load zbin_ptr
67 movdqa xmm3, OWORD PTR[rcx + 16] ;load zbin_ptr
; NOTE(review): the instructions that set up xmm1/xmm5 and xmm7, and that add
; zbin_oq_value into xmm2/xmm3, are not visible here.
70 psubw xmm1, xmm0 ;x = abs(z)
72 punpcklwd xmm7, xmm7 ;duplicated zbin_oq_value
73 psubw xmm5, xmm4 ;x = abs(z)
78 psubw xmm1, xmm2 ;sub (zbin_ptr + zbin_oq_value)
79 psubw xmm5, xmm3 ;sub (zbin_ptr + zbin_oq_value)
81 mov rdi, arg(5) ;round_ptr
82 mov rsi, arg(6) ;quant_ptr
; Keep x - (zbin + zbin_oq_value) on the stack for the scalar zig-zag steps.
84 movdqa OWORD PTR[rsp + abs_minus_zbin_lo], xmm1
85 movdqa OWORD PTR[rsp + abs_minus_zbin_hi], xmm5
87 paddw xmm1, xmm2 ;add (zbin_ptr + zbin_oq_value) back
88 paddw xmm5, xmm3 ;add (zbin_ptr + zbin_oq_value) back
90 movdqa xmm2, OWORD PTR[rdi] ;round, coefficients 0..7
91 movdqa xmm3, OWORD PTR[rsi] ;quant, coefficients 0..7
93 movdqa xmm6, OWORD PTR[rdi + 16] ;round, coefficients 8..15
94 movdqa xmm7, OWORD PTR[rsi + 16] ;quant, coefficients 8..15
; NOTE(review): the arithmetic that applies round/quant to xmm1/xmm5 is not
; visible in this excerpt.
102 mov rsi, arg(2) ;qcoeff_ptr
; Candidate quantized values are parked on the stack; the zig-zag steps copy
; them into qcoeff_ptr one at a time.
111 movdqa OWORD PTR[rsp + temp_qcoeff_lo], xmm1
112 movdqa OWORD PTR[rsp + temp_qcoeff_hi], xmm5
; NOTE(review): xmm6 was last loaded from round_ptr + 16 above; the
; instruction that clears it before these stores is not visible - confirm.
114 movdqa OWORD PTR[rsi], xmm6 ;zero qcoeff
115 movdqa OWORD PTR[rsi + 16], xmm6 ;zero qcoeff
121 mov rsi, arg(9) ;zbin_boost_ptr
; NOTE(review): rbx is callee-saved; its save/restore is not visible here.
123 mov rbx, arg(4) ;default_zig_zag
; ---- zig-zag step: rc = default_zig_zag[rax] ----
; The branches that skip the store when x - zbin is negative (or for any other
; condition) are not visible in this excerpt.
126 movsxd rcx, DWORD PTR[rbx + rax*4] ;now we have rc
127 movsx edi, WORD PTR [rsi] ;*zbin_boost_ptr aka zbin
128 lea rsi, [rsi + 2] ;zbin_boost_ptr++
130 movsx edx, WORD PTR[rsp + abs_minus_zbin_lo + rcx *2] ;abs_minus_zbin[rc]
132 sub edx, edi ;x - zbin
135 mov rdi, arg(2) ;qcoeff_ptr
137 movsx edx, WORD PTR[rsp + temp_qcoeff_lo + rcx *2] ;temp_qcoeff[rc]
142 mov WORD PTR[rdi + rcx * 2], dx ;qcoeff_ptr[rc] = temp_qcoeff[rc]
; Rewind the boost pointer to the start of its table and record this position.
144 mov rsi, arg(9) ;zbin_boost_ptr
145 mov [rsp + eob], rax ;eob = i
; ---- zig-zag step: rc = default_zig_zag[rax + 1] ----
; NOTE(review): this and the two following steps all read [rbx + rax*4 + 4];
; the instruction that advances rax between steps is not visible - confirm.
148 movsxd rcx, DWORD PTR[rbx + rax*4 + 4]
149 movsx edi, WORD PTR [rsi] ;*zbin_boost_ptr aka zbin
150 lea rsi, [rsi + 2] ;zbin_boost_ptr++
152 movsx edx, WORD PTR[rsp + abs_minus_zbin_lo + rcx *2]
155 sub edx, edi ;x - zbin
158 mov rdi, arg(2) ;qcoeff_ptr
160 movsx edx, WORD PTR[rsp + temp_qcoeff_lo + rcx *2]
165 mov WORD PTR[rdi + rcx * 2], dx ;qcoeff_ptr[rc] = temp_qcoeff[rc]
167 mov rsi, arg(9) ;zbin_boost_ptr
168 mov [rsp + eob], rax ;eob = i
; ---- zig-zag step (same shape as the previous one) ----
171 movsxd rcx, DWORD PTR[rbx + rax*4 + 4]
172 movsx edi, WORD PTR [rsi] ;*zbin_boost_ptr aka zbin
173 lea rsi, [rsi + 2] ;zbin_boost_ptr++
175 movsx edx, WORD PTR[rsp + abs_minus_zbin_lo + rcx *2]
178 sub edx, edi ;x - zbin
181 mov rdi, arg(2) ;qcoeff_ptr
183 movsx edx, WORD PTR[rsp + temp_qcoeff_lo + rcx *2]
188 mov WORD PTR[rdi + rcx * 2], dx ;qcoeff_ptr[rc] = temp_qcoeff[rc]
190 mov rsi, arg(9) ;zbin_boost_ptr
191 mov [rsp + eob], rax ;eob = i
; ---- zig-zag step (same shape as the previous one) ----
194 movsxd rcx, DWORD PTR[rbx + rax*4 + 4]
195 movsx edi, WORD PTR [rsi] ;*zbin_boost_ptr aka zbin
196 lea rsi, [rsi + 2] ;zbin_boost_ptr++
198 movsx edx, WORD PTR[rsp + abs_minus_zbin_lo + rcx *2]
201 sub edx, edi ;x - zbin
204 mov rdi, arg(2) ;qcoeff_ptr
206 movsx edx, WORD PTR[rsp + temp_qcoeff_lo + rcx *2]
211 mov WORD PTR[rdi + rcx * 2], dx ;qcoeff_ptr[rc] = temp_qcoeff[rc]
213 mov rsi, arg(9) ;zbin_boost_ptr
214 mov [rsp + eob], rax ;eob = i
; ---- dequantize: reload the finished qcoeff block and the dequant factors ----
222 mov rdi, arg(2) ;qcoeff_ptr
223 mov rcx, arg(3) ;dequant_ptr
224 mov rsi, arg(7) ;dqcoeff_ptr
226 movdqa xmm2, OWORD PTR[rdi] ;qcoeff, coefficients 0..7
227 movdqa xmm3, OWORD PTR[rdi + 16] ;qcoeff, coefficients 8..15
229 movdqa xmm0, OWORD PTR[rcx] ;dequant, coefficients 0..7
230 movdqa xmm1, OWORD PTR[rcx + 16] ;dequant, coefficients 8..15
; NOTE(review): the instruction that combines xmm2/xmm3 with xmm0/xmm1 is not
; visible in this excerpt.
235 movdqa OWORD PTR[rsi], xmm0 ;store dqcoeff
236 movdqa OWORD PTR[rsi + 16], xmm1 ;store dqcoeff
; Reload the spilled xmm6/xmm7, then release the scratch area.
240 movdqa xmm6, OWORD PTR[rsp + save_xmm6]
241 movdqa xmm7, OWORD PTR[rsp + save_xmm7]
245 add rsp, vp8_regularquantizeb_stack_size
257 ;int vp8_fast_quantize_b_impl_sse2(short *coeff_ptr,
258 ; short *qcoeff_ptr,short *dequant_ptr,
259 ; short *scan_mask, short *round_ptr,
260 ; short *quant_ptr, short *dqcoeff_ptr);
;
; Quantizes one block of 16 coefficients (two 16-byte loads of shorts from
; coeff_ptr) without any zero-bin test, and writes qcoeff_ptr / dqcoeff_ptr.
;
; Arguments, by arg() index as used below:
;   arg(0) coeff_ptr    input coefficients (z)
;   arg(1) qcoeff_ptr   quantized output
;   arg(2) dequant_ptr  dequantization factors
;   arg(3) scan_mask    per-coefficient mask words, loaded into xmm2/xmm3
;   arg(4) round_ptr    rounding values
;   arg(5) quant_ptr    quantizer multipliers
;   arg(6) dqcoeff_ptr  dequantized output
;
; Every movdqa, and each pmulhw with a memory operand, faults on an address
; that is not 16-byte aligned, so the pointers they dereference (and rsp at
; the time of the stack accesses) must be 16-byte aligned.
;
; NOTE(review): the prototype above names the routine ..._impl_sse2, but the
; exported symbol below is ..._impl_ssse2. "ssse2" is not an instruction-set
; name, so one of the two is probably a typo - confirm against the callers
; before renaming either.
; NOTE(review): this listing looks like an excerpt. Each line starts with a
; stray numeral, and several instructions, the return-value setup and the
; epilogue are not present. The comments describe only the instructions that
; are visible.
261 global sym(vp8_fast_quantize_b_impl_ssse2)
262 sym(vp8_fast_quantize_b_impl_ssse2):
; SHADOW_ARGS_TO_STACK comes from the included ABI-support file; the count 7
; matches the seven parameters of the prototype above.
265 SHADOW_ARGS_TO_STACK 7
; save_xmm6 / save_xmm7 are stack offsets defined outside the visible lines;
; the scratch area ends 16 bytes past save_xmm7.
276 %define vp8_fastquantizeb_stack_size save_xmm7 + 16
278 sub rsp, vp8_fastquantizeb_stack_size
; xmm6/xmm7 are overwritten below, so they are spilled here and reloaded
; before the stack is released (they are callee-saved under the Windows x64
; ABI).
280 movdqa XMMWORD PTR[rsp + save_xmm6], xmm6
281 movdqa XMMWORD PTR[rsp + save_xmm7], xmm7
283 mov rdx, arg(0) ;coeff_ptr
284 mov rcx, arg(2) ;dequant_ptr
285 mov rax, arg(3) ;scan_mask
286 mov rdi, arg(4) ;round_ptr
287 mov rsi, arg(5) ;quant_ptr
289 movdqa xmm0, XMMWORD PTR[rdx] ;z, coefficients 0..7
290 movdqa xmm4, XMMWORD PTR[rdx + 16] ;z, coefficients 8..15
292 movdqa xmm6, XMMWORD PTR[rdi] ;round lo
293 movdqa xmm7, XMMWORD PTR[rdi + 16] ;round hi
; An arithmetic right shift by 15 turns every word into 0 (non-negative) or
; -1 (negative).
298 psraw xmm0, 15 ;sign of z (aka sz)
299 psraw xmm4, 15 ;sign of z (aka sz)
; NOTE(review): the instructions that set up xmm1/xmm5 before these
; subtractions, and any that apply the round values, are not visible here.
303 psubw xmm1, xmm0 ;x = abs(z)
304 psubw xmm5, xmm4 ;x = abs(z)
; pmulhw keeps the high 16 bits of each signed 16x16 product.
309 pmulhw xmm1, XMMWORD PTR[rsi] ;multiply by quant, coefficients 0..7
310 pmulhw xmm5, XMMWORD PTR[rsi + 16] ;multiply by quant, coefficients 8..15
312 mov rdi, arg(1) ;qcoeff_ptr
313 mov rsi, arg(6) ;dqcoeff_ptr
; xmm6/xmm7 are reused: the round values are replaced by the dequant factors.
315 movdqa xmm6, XMMWORD PTR[rcx] ;dequant, coefficients 0..7
316 movdqa xmm7, XMMWORD PTR[rcx + 16] ;dequant, coefficients 8..15
323 movdqa XMMWORD PTR[rdi], xmm1 ;store qcoeff, coefficients 0..7
324 movdqa XMMWORD PTR[rdi + 16], xmm5 ;store qcoeff, coefficients 8..15
329 movdqa xmm2, XMMWORD PTR[rax] ;scan_mask, words 0..7
330 movdqa xmm3, XMMWORD PTR[rax+16]; scan_mask, words 8..15
; NOTE(review): the instructions that consume the cleared and the all-ones
; xmm4, and that use the scan mask, are not visible in this excerpt.
332 pxor xmm4, xmm4 ;clear all bits
336 pcmpeqw xmm4, xmm4 ;set all bits
373 movdqa XMMWORD PTR[rsi], xmm6 ;store dqcoeff
374 movdqa XMMWORD PTR[rsi + 16], xmm7 ;store dqcoeff
; Reload the spilled xmm6/xmm7, then release the scratch area.
376 movdqa xmm6, XMMWORD PTR[rsp + save_xmm6]
377 movdqa xmm7, XMMWORD PTR[rsp + save_xmm7]
379 add rsp, vp8_fastquantizeb_stack_size