2 ; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
4 ; Use of this source code is governed by a BSD-style license
5 ; that can be found in the LICENSE file in the root of the source
6 ; tree. An additional intellectual property rights grant can be found
7 ; in the file PATENTS. All contributing project authors may
8 ; be found in the AUTHORS file in the root of the source tree.
12 %include "vpx_ports/x86_abi_support.asm"
; PROCESS_16X2X3 <flag>
; Takes one macro parameter; how it is used is not visible in this excerpt.
; The visible instructions fetch one 16-byte source row and the same reference
; row at three consecutive byte positions (+0, +1, +2):
;   xmm0 <- [rsi+rax]      movdqa: faults unless the address is 16-byte aligned
;   xmm1 <- [rdi+rdx]      lddqu:  unaligned 128-bit load
;   xmm2 <- [rdi+rdx+1]
;   xmm3 <- [rdi+rdx+2]
; NOTE(review): rsi/rdi look like the src/ref row pointers and rax/rdx the
; src/ref strides, matching the register setup in the entry points - confirm.
; NOTE(review): the operands are tagged QWORD PTR although movdqa/lddqu move
; 128 bits; confirm the assembler in use ignores the size tag here.
16 %macro PROCESS_16X2X3 1
40 movdqa xmm0, QWORD PTR [rsi+rax]
41 lddqu xmm1, QWORD PTR [rdi+rdx]
42 lddqu xmm2, QWORD PTR [rdi+rdx+1]
43 lddqu xmm3, QWORD PTR [rdi+rdx+2]
; PROCESS_16X2X3_OFFSET <flag>, <offset>
;   %1 - passed as 1 for the first invocation and 0 afterwards by the callers
;        in this file; its use inside the macro is not visible in this excerpt
;        (presumably selects "initialise" vs "accumulate" - confirm).
;   %2 - byte offset used as the palignr shift count.
;
; palignr dst, src, imm concatenates dst:src (dst in the high half), shifts the
; 32-byte value right by imm bytes and keeps the low 16 bytes in dst. With xmm4
; holding an aligned 16-byte load and dst holding the following 16 bytes, the
; result is the 16 bytes that start imm bytes past the aligned address. Using
; %2, %2+1 and %2+2 therefore yields the reference row at three consecutive
; byte positions without issuing unaligned loads.
; NOTE(review): the SAD arithmetic between these instructions is not visible
; in this excerpt.
57 %macro PROCESS_16X2X3_OFFSET 2
; First group: results land in xmm5/xmm6/xmm7.
; NOTE(review): presumably the three running SAD accumulators - confirm.
64 palignr xmm5, xmm4, %2
67 palignr xmm6, xmm4, (%2+1)
69 palignr xmm7, xmm4, (%2+2)
; Second group: same extraction into the temporaries xmm1/xmm2/xmm3.
80 palignr xmm1, xmm4, %2
83 palignr xmm2, xmm4, (%2+1)
85 palignr xmm3, xmm4, (%2+2)
; Next row: src row at [rsi+rax], plus the two aligned 16-byte halves of the
; reference window at [rdi+rdx] and [rdi+rdx+16]. All three are movdqa, so each
; address must be 16-byte aligned.
; NOTE(review): implies rdi was rounded down to a 16-byte boundary and both
; strides are multiples of 16 - confirm against the elided setup code.
; NOTE(review): QWORD PTR on a 128-bit movdqa - confirm the assembler ignores it.
95 movdqa xmm0, QWORD PTR [rsi+rax]
96 movdqa xmm4, QWORD PTR [rdi+rdx]
97 movdqa xmm3, QWORD PTR [rdi+rdx+16]
; Extract the three shifted reference rows for this row.
; NOTE(review): xmm1/xmm2 must receive copies of the upper half before these
; palignr instructions; those moves are not visible here.
100 palignr xmm1, xmm4, %2
103 palignr xmm2, xmm4, (%2+1)
105 palignr xmm3, xmm4, (%2+2)
; PROCESS_16X16X3_OFFSET <offset>, <function-name>
;   %1 - byte offset, forwarded as the shift argument of PROCESS_16X2X3_OFFSET.
;   %2 - not used in the visible lines; the entry point passes its own name.
;        NOTE(review): presumably used to emit the <name>_aligned_by_<offset>
;        label and the jump to <name>_store_off - confirm in the elided lines.
;
; Expands PROCESS_16X2X3_OFFSET eight times: once with flag 1, then seven times
; with flag 0. NOTE(review): eight two-row steps = 16 rows, going by the names.
119 %macro PROCESS_16X16X3_OFFSET 2
124 PROCESS_16X2X3_OFFSET 1, %1
125 PROCESS_16X2X3_OFFSET 0, %1
126 PROCESS_16X2X3_OFFSET 0, %1
127 PROCESS_16X2X3_OFFSET 0, %1
128 PROCESS_16X2X3_OFFSET 0, %1
129 PROCESS_16X2X3_OFFSET 0, %1
130 PROCESS_16X2X3_OFFSET 0, %1
131 PROCESS_16X2X3_OFFSET 0, %1
; PROCESS_16X8X3_OFFSET <offset>, <function-name>
;   %1 - byte offset, forwarded as the shift argument of PROCESS_16X2X3_OFFSET.
;   %2 - not used in the visible lines; the entry point passes its own name.
;        NOTE(review): presumably used to emit the <name>_aligned_by_<offset>
;        label and the jump to <name>_store_off - confirm in the elided lines.
;
; Expands PROCESS_16X2X3_OFFSET four times: once with flag 1, then three times
; with flag 0. NOTE(review): four two-row steps = 8 rows, going by the names.
137 %macro PROCESS_16X8X3_OFFSET 2
142 PROCESS_16X2X3_OFFSET 1, %1
143 PROCESS_16X2X3_OFFSET 0, %1
144 PROCESS_16X2X3_OFFSET 0, %1
145 PROCESS_16X2X3_OFFSET 0, %1
; vp8_sad16x16x3_ssse3
; Arguments read by the visible code:
;   arg(0) src_ptr, arg(1) src_stride, arg(2) ref_ptr, arg(3) ref_stride,
;   arg(4) Results (loaded into rdi at the store_off label).
; Flow: dispatch through a 16-entry jump table to one of the
; ..._aligned_by_0 .. ..._aligned_by_15 code paths, then continue at
; ..._store_off.
; NOTE(review): "void int" in the prototype comment below is not valid C and
; the parameter list is incomplete; confirm the real signature against the C
; declaration.
151 ;void int vp8_sad16x16x3_ssse3(
152 ; unsigned char *src_ptr,
154 ; unsigned char *ref_ptr,
157 global sym(vp8_sad16x16x3_ssse3)
158 sym(vp8_sad16x16x3_ssse3):
; Five arguments are made addressable through arg(n); arg(0)..arg(4) are read below.
161 SHADOW_ARGS_TO_STACK 5
167 mov rsi, arg(0) ;src_ptr
168 mov rdi, arg(2) ;ref_ptr
; The jump table is embedded in the instruction stream, so jump over it.
173 jmp vp8_sad16x16x3_ssse3_skiptable
; 16 entries, one per ..._aligned_by_N label. Each entry is a 32-bit distance
; from the do_jump label, not an absolute address, so the table needs no
; load-time relocation.
174 vp8_sad16x16x3_ssse3_jumptable:
175 dd vp8_sad16x16x3_ssse3_aligned_by_0 - vp8_sad16x16x3_ssse3_do_jump
176 dd vp8_sad16x16x3_ssse3_aligned_by_1 - vp8_sad16x16x3_ssse3_do_jump
177 dd vp8_sad16x16x3_ssse3_aligned_by_2 - vp8_sad16x16x3_ssse3_do_jump
178 dd vp8_sad16x16x3_ssse3_aligned_by_3 - vp8_sad16x16x3_ssse3_do_jump
179 dd vp8_sad16x16x3_ssse3_aligned_by_4 - vp8_sad16x16x3_ssse3_do_jump
180 dd vp8_sad16x16x3_ssse3_aligned_by_5 - vp8_sad16x16x3_ssse3_do_jump
181 dd vp8_sad16x16x3_ssse3_aligned_by_6 - vp8_sad16x16x3_ssse3_do_jump
182 dd vp8_sad16x16x3_ssse3_aligned_by_7 - vp8_sad16x16x3_ssse3_do_jump
183 dd vp8_sad16x16x3_ssse3_aligned_by_8 - vp8_sad16x16x3_ssse3_do_jump
184 dd vp8_sad16x16x3_ssse3_aligned_by_9 - vp8_sad16x16x3_ssse3_do_jump
185 dd vp8_sad16x16x3_ssse3_aligned_by_10 - vp8_sad16x16x3_ssse3_do_jump
186 dd vp8_sad16x16x3_ssse3_aligned_by_11 - vp8_sad16x16x3_ssse3_do_jump
187 dd vp8_sad16x16x3_ssse3_aligned_by_12 - vp8_sad16x16x3_ssse3_do_jump
188 dd vp8_sad16x16x3_ssse3_aligned_by_13 - vp8_sad16x16x3_ssse3_do_jump
189 dd vp8_sad16x16x3_ssse3_aligned_by_14 - vp8_sad16x16x3_ssse3_do_jump
190 dd vp8_sad16x16x3_ssse3_aligned_by_15 - vp8_sad16x16x3_ssse3_do_jump
191 vp8_sad16x16x3_ssse3_skiptable:
; call/pop pair: the call pushes the address of do_jump, which the pop then
; retrieves, giving the run-time address of do_jump without an absolute reference.
193 call vp8_sad16x16x3_ssse3_do_jump
194 vp8_sad16x16x3_ssse3_do_jump:
195 pop rcx ; get the address of do_jump
196 mov rax, vp8_sad16x16x3_ssse3_jumptable - vp8_sad16x16x3_ssse3_do_jump
197 add rax, rcx ; get the absolute address of vp8_sad16x16x3_ssse3_jumptable
; rdx selects the table entry (4 bytes each).
; NOTE(review): the code that sets rdx is not visible here; presumably the low
; four bits of ref_ptr, given the aligned_by_N label names - confirm.
199 movsxd rax, dword [rax + 4*rdx] ; get the 32 bit offset from the jumptable
; NOTE(review): rax is overwritten on the next line, so the table entry must be
; consumed by an instruction that is elided from this excerpt.
202 movsxd rax, dword ptr arg(1) ;src_stride
203 movsxd rdx, dword ptr arg(3) ;ref_stride
; One expansion per byte offset 0..14.
; NOTE(review): each expansion presumably defines the matching
; ..._aligned_by_N label referenced by the table - confirm in the macro body.
207 PROCESS_16X16X3_OFFSET 0, vp8_sad16x16x3_ssse3
208 PROCESS_16X16X3_OFFSET 1, vp8_sad16x16x3_ssse3
209 PROCESS_16X16X3_OFFSET 2, vp8_sad16x16x3_ssse3
210 PROCESS_16X16X3_OFFSET 3, vp8_sad16x16x3_ssse3
211 PROCESS_16X16X3_OFFSET 4, vp8_sad16x16x3_ssse3
212 PROCESS_16X16X3_OFFSET 5, vp8_sad16x16x3_ssse3
213 PROCESS_16X16X3_OFFSET 6, vp8_sad16x16x3_ssse3
214 PROCESS_16X16X3_OFFSET 7, vp8_sad16x16x3_ssse3
215 PROCESS_16X16X3_OFFSET 8, vp8_sad16x16x3_ssse3
216 PROCESS_16X16X3_OFFSET 9, vp8_sad16x16x3_ssse3
217 PROCESS_16X16X3_OFFSET 10, vp8_sad16x16x3_ssse3
218 PROCESS_16X16X3_OFFSET 11, vp8_sad16x16x3_ssse3
219 PROCESS_16X16X3_OFFSET 12, vp8_sad16x16x3_ssse3
220 PROCESS_16X16X3_OFFSET 13, vp8_sad16x16x3_ssse3
221 PROCESS_16X16X3_OFFSET 14, vp8_sad16x16x3_ssse3
; Offset 15 has its own explicit label instead of a macro expansion.
; NOTE(review): presumably because %2+2 would be 17 for the palignr shift,
; beyond what two 16-byte loads cover; its body is not visible here.
223 vp8_sad16x16x3_ssse3_aligned_by_15:
; Common tail: rdi is repointed from the reference data to the Results argument.
233 vp8_sad16x16x3_ssse3_store_off:
234 mov rdi, arg(4) ;Results
; vp8_sad16x8x3_ssse3
; Arguments read by the visible code:
;   arg(0) src_ptr, arg(1) src_stride, arg(2) ref_ptr, arg(3) ref_stride,
;   arg(4) Results (loaded into rdi at the store_off label).
; Flow: dispatch through a 16-entry jump table to one of the
; ..._aligned_by_0 .. ..._aligned_by_15 code paths, then continue at
; ..._store_off. The per-offset work comes from PROCESS_16X8X3_OFFSET.
; NOTE(review): "void int" in the prototype comment below is not valid C and
; the parameter list is incomplete; confirm the real signature against the C
; declaration.
262 ;void int vp8_sad16x8x3_ssse3(
263 ; unsigned char *src_ptr,
265 ; unsigned char *ref_ptr,
268 global sym(vp8_sad16x8x3_ssse3)
269 sym(vp8_sad16x8x3_ssse3):
; Five arguments are made addressable through arg(n); arg(0)..arg(4) are read below.
272 SHADOW_ARGS_TO_STACK 5
278 mov rsi, arg(0) ;src_ptr
279 mov rdi, arg(2) ;ref_ptr
; The jump table is embedded in the instruction stream, so jump over it.
284 jmp vp8_sad16x8x3_ssse3_skiptable
; 16 entries, one per ..._aligned_by_N label. Each entry is a 32-bit distance
; from the do_jump label, not an absolute address, so the table needs no
; load-time relocation.
285 vp8_sad16x8x3_ssse3_jumptable:
286 dd vp8_sad16x8x3_ssse3_aligned_by_0 - vp8_sad16x8x3_ssse3_do_jump
287 dd vp8_sad16x8x3_ssse3_aligned_by_1 - vp8_sad16x8x3_ssse3_do_jump
288 dd vp8_sad16x8x3_ssse3_aligned_by_2 - vp8_sad16x8x3_ssse3_do_jump
289 dd vp8_sad16x8x3_ssse3_aligned_by_3 - vp8_sad16x8x3_ssse3_do_jump
290 dd vp8_sad16x8x3_ssse3_aligned_by_4 - vp8_sad16x8x3_ssse3_do_jump
291 dd vp8_sad16x8x3_ssse3_aligned_by_5 - vp8_sad16x8x3_ssse3_do_jump
292 dd vp8_sad16x8x3_ssse3_aligned_by_6 - vp8_sad16x8x3_ssse3_do_jump
293 dd vp8_sad16x8x3_ssse3_aligned_by_7 - vp8_sad16x8x3_ssse3_do_jump
294 dd vp8_sad16x8x3_ssse3_aligned_by_8 - vp8_sad16x8x3_ssse3_do_jump
295 dd vp8_sad16x8x3_ssse3_aligned_by_9 - vp8_sad16x8x3_ssse3_do_jump
296 dd vp8_sad16x8x3_ssse3_aligned_by_10 - vp8_sad16x8x3_ssse3_do_jump
297 dd vp8_sad16x8x3_ssse3_aligned_by_11 - vp8_sad16x8x3_ssse3_do_jump
298 dd vp8_sad16x8x3_ssse3_aligned_by_12 - vp8_sad16x8x3_ssse3_do_jump
299 dd vp8_sad16x8x3_ssse3_aligned_by_13 - vp8_sad16x8x3_ssse3_do_jump
300 dd vp8_sad16x8x3_ssse3_aligned_by_14 - vp8_sad16x8x3_ssse3_do_jump
301 dd vp8_sad16x8x3_ssse3_aligned_by_15 - vp8_sad16x8x3_ssse3_do_jump
302 vp8_sad16x8x3_ssse3_skiptable:
; call/pop pair: the call pushes the address of do_jump, which the pop then
; retrieves, giving the run-time address of do_jump without an absolute reference.
304 call vp8_sad16x8x3_ssse3_do_jump
305 vp8_sad16x8x3_ssse3_do_jump:
306 pop rcx ; get the address of do_jump
307 mov rax, vp8_sad16x8x3_ssse3_jumptable - vp8_sad16x8x3_ssse3_do_jump
308 add rax, rcx ; get the absolute address of vp8_sad16x8x3_ssse3_jumptable
; rdx selects the table entry (4 bytes each).
; NOTE(review): the code that sets rdx is not visible here; presumably the low
; four bits of ref_ptr, given the aligned_by_N label names - confirm.
310 movsxd rax, dword [rax + 4*rdx] ; get the 32 bit offset from the jumptable
; NOTE(review): rax is overwritten on the next line, so the table entry must be
; consumed by an instruction that is elided from this excerpt.
313 movsxd rax, dword ptr arg(1) ;src_stride
314 movsxd rdx, dword ptr arg(3) ;ref_stride
; One expansion per byte offset 0..14.
; NOTE(review): each expansion presumably defines the matching
; ..._aligned_by_N label referenced by the table - confirm in the macro body.
318 PROCESS_16X8X3_OFFSET 0, vp8_sad16x8x3_ssse3
319 PROCESS_16X8X3_OFFSET 1, vp8_sad16x8x3_ssse3
320 PROCESS_16X8X3_OFFSET 2, vp8_sad16x8x3_ssse3
321 PROCESS_16X8X3_OFFSET 3, vp8_sad16x8x3_ssse3
322 PROCESS_16X8X3_OFFSET 4, vp8_sad16x8x3_ssse3
323 PROCESS_16X8X3_OFFSET 5, vp8_sad16x8x3_ssse3
324 PROCESS_16X8X3_OFFSET 6, vp8_sad16x8x3_ssse3
325 PROCESS_16X8X3_OFFSET 7, vp8_sad16x8x3_ssse3
326 PROCESS_16X8X3_OFFSET 8, vp8_sad16x8x3_ssse3
327 PROCESS_16X8X3_OFFSET 9, vp8_sad16x8x3_ssse3
328 PROCESS_16X8X3_OFFSET 10, vp8_sad16x8x3_ssse3
329 PROCESS_16X8X3_OFFSET 11, vp8_sad16x8x3_ssse3
330 PROCESS_16X8X3_OFFSET 12, vp8_sad16x8x3_ssse3
331 PROCESS_16X8X3_OFFSET 13, vp8_sad16x8x3_ssse3
332 PROCESS_16X8X3_OFFSET 14, vp8_sad16x8x3_ssse3
; Offset 15 has its own explicit label instead of a macro expansion.
; NOTE(review): presumably because %2+2 would be 17 for the palignr shift,
; beyond what two 16-byte loads cover; its body is not visible here.
334 vp8_sad16x8x3_ssse3_aligned_by_15:
; Common tail: rdi is repointed from the reference data to the Results argument.
341 vp8_sad16x8x3_ssse3_store_off:
342 mov rdi, arg(4) ;Results