2 ; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
4 ; Use of this source code is governed by a BSD-style license
5 ; that can be found in the LICENSE file in the root of the source
6 ; tree. An additional intellectual property rights grant can be found
7 ; in the file PATENTS. All contributing project authors may
8 ; be found in the AUTHORS file in the root of the source tree.
12 %include "vpx_ports/x86_abi_support.asm"
; PROCESS_16X2X3 <arg>
; Loads two rows of a 16-byte-wide source block and, for each row, the
; reference data at byte offsets +0, +1 and +2 from the reference pointer.
; Registers read by the macro:
;   rsi = source row            rax = offset from row 0 to row 1 of the source
;   rdi = reference row         rdx = offset from row 0 to row 1 of the reference
; The source is fetched with movdqa, so [rsi] and [rsi+rax] must be 16-byte
; aligned; the reference is fetched with lddqu and may be unaligned.
; NOTE(review): row 0 is loaded twice below, once into xmm5-xmm7 and once into
; xmm1-xmm3 - presumably two alternative paths selected by the macro
; argument; confirm against the complete macro body.
14 %macro PROCESS_16X2X3 1
; row 0, variant loading the three reference offsets into xmm5/xmm6/xmm7
16 movdqa xmm0, XMMWORD PTR [rsi]
17 lddqu xmm5, XMMWORD PTR [rdi]
18 lddqu xmm6, XMMWORD PTR [rdi+1]
19 lddqu xmm7, XMMWORD PTR [rdi+2]
; row 0, variant loading the three reference offsets into xmm1/xmm2/xmm3
25 movdqa xmm0, XMMWORD PTR [rsi]
26 lddqu xmm1, XMMWORD PTR [rdi]
27 lddqu xmm2, XMMWORD PTR [rdi+1]
28 lddqu xmm3, XMMWORD PTR [rdi+2]
; row 1: same three reference offsets, one row further down in both buffers
38 movdqa xmm0, XMMWORD PTR [rsi+rax]
39 lddqu xmm1, XMMWORD PTR [rdi+rdx]
40 lddqu xmm2, XMMWORD PTR [rdi+rdx+1]
41 lddqu xmm3, XMMWORD PTR [rdi+rdx+2]
; PROCESS_8X2X3 <arg>
; 8-byte-wide counterpart of the 16-byte x3 macro: loads two rows of the
; source (8 bytes each, MMX registers) and, for each row, the reference data
; at byte offsets +0, +1 and +2.
; Registers read by the macro:
;   rsi = source row            rax = offset from row 0 to row 1 of the source
;   rdi = reference row         rdx = offset from row 0 to row 1 of the reference
; NOTE(review): row 0 is loaded twice below, once into mm5-mm7 and once into
; mm1-mm3 - presumably two alternative paths selected by the macro argument;
; confirm against the complete macro body.
55 %macro PROCESS_8X2X3 1
; row 0, variant loading the three reference offsets into mm5/mm6/mm7
57 movq mm0, QWORD PTR [rsi]
58 movq mm5, QWORD PTR [rdi]
59 movq mm6, QWORD PTR [rdi+1]
60 movq mm7, QWORD PTR [rdi+2]
; row 0, variant loading the three reference offsets into mm1/mm2/mm3
66 movq mm0, QWORD PTR [rsi]
67 movq mm1, QWORD PTR [rdi]
68 movq mm2, QWORD PTR [rdi+1]
69 movq mm3, QWORD PTR [rdi+2]
; row 1: same three reference offsets, one row further down in both buffers
79 movq mm0, QWORD PTR [rsi+rax]
80 movq mm1, QWORD PTR [rdi+rdx]
81 movq mm2, QWORD PTR [rdi+rdx+1]
82 movq mm3, QWORD PTR [rdi+rdx+2]
; LOAD_X4_ADDRESSES base, dst0, dst1, dst2, dst3
; Reads four consecutive entries of the table at [base] into dst0..dst3;
; entry i is read from base + REG_SZ_BYTES*i.
; REG_SZ_BYTES is defined outside this macro (presumably the native
; register/pointer size in bytes - confirm in the included ABI support file).
; dst3 is written last, after every other read through base, so dst3 may be
; the same register as base; the callers in this file pass rdi for both.
96 %macro LOAD_X4_ADDRESSES 5
97 mov %2, [%1+REG_SZ_BYTES*0] ; dst0 = entry 0
98 mov %3, [%1+REG_SZ_BYTES*1] ; dst1 = entry 1
100 mov %4, [%1+REG_SZ_BYTES*2] ; dst2 = entry 2
101 mov %5, [%1+REG_SZ_BYTES*3] ; dst3 = entry 3 (final use of base)
; PROCESS_16X2X4 <arg>
; Loads two rows of a 16-byte-wide source block and the matching two rows of
; four separate reference blocks.
; Registers read by the macro:
;   rsi = source row                 rax = offset from row 0 to row 1 of the source
;   rcx, rdx, rbx, rdi = the four reference rows
;   rbp = offset from row 0 to row 1 of every reference
; The source is fetched with movdqa, so [rsi] and [rsi+rax] must be 16-byte
; aligned; the references are fetched with lddqu and may be unaligned.
; NOTE(review): the x4d entry points load the third table pointer into rax and
; src_stride into rbx, while this macro uses rbx as the third reference and
; rax as the source row offset - confirm the two registers are exchanged
; before the macro is expanded.
; NOTE(review): row 0 is loaded twice below (xmm4-xmm7 vs xmm1-xmm3 + xmm1) -
; presumably two alternative paths selected by the macro argument; confirm.
104 %macro PROCESS_16X2X4 1
; row 0, variant with one register per reference (xmm4..xmm7)
106 movdqa xmm0, XMMWORD PTR [rsi]
107 lddqu xmm4, XMMWORD PTR [rcx]
108 lddqu xmm5, XMMWORD PTR [rdx]
109 lddqu xmm6, XMMWORD PTR [rbx]
110 lddqu xmm7, XMMWORD PTR [rdi]
; row 0, variant using xmm1..xmm3 for the first three references
117 movdqa xmm0, XMMWORD PTR [rsi]
118 lddqu xmm1, XMMWORD PTR [rcx]
119 lddqu xmm2, XMMWORD PTR [rdx]
120 lddqu xmm3, XMMWORD PTR [rbx]
; fourth reference reuses xmm1, overwriting the first reference's data
127 lddqu xmm1, XMMWORD PTR [rdi]
; row 1 of the source and of the first three references
134 movdqa xmm0, XMMWORD PTR [rsi+rax]
135 lddqu xmm1, XMMWORD PTR [rcx+rbp]
136 lddqu xmm2, XMMWORD PTR [rdx+rbp]
137 lddqu xmm3, XMMWORD PTR [rbx+rbp]
; row 1 of the fourth reference, again through xmm1
144 lddqu xmm1, XMMWORD PTR [rdi+rbp]
; PROCESS_8X2X4 <arg>
; 8-byte-wide counterpart of the 16-byte x4 macro: loads two rows of the
; source (8 bytes each, MMX registers) and the matching two rows of four
; separate reference blocks.
; Registers read by the macro:
;   rsi = source row                 rax = offset from row 0 to row 1 of the source
;   rcx, rdx, rbx, rdi = the four reference rows
;   rbp = offset from row 0 to row 1 of every reference
; NOTE(review): the x4d entry points load the third table pointer into rax and
; src_stride into rbx, while this macro uses rbx as the third reference and
; rax as the source row offset - confirm the two registers are exchanged
; before the macro is expanded.
; NOTE(review): row 0 is loaded twice below (mm4-mm7 vs mm1-mm3 + mm1) -
; presumably two alternative paths selected by the macro argument; confirm.
161 %macro PROCESS_8X2X4 1
; row 0, variant with one register per reference (mm4..mm7)
163 movq mm0, QWORD PTR [rsi]
164 movq mm4, QWORD PTR [rcx]
165 movq mm5, QWORD PTR [rdx]
166 movq mm6, QWORD PTR [rbx]
167 movq mm7, QWORD PTR [rdi]
; row 0, variant using mm1..mm3 for the first three references
174 movq mm0, QWORD PTR [rsi]
175 movq mm1, QWORD PTR [rcx]
176 movq mm2, QWORD PTR [rdx]
177 movq mm3, QWORD PTR [rbx]
; fourth reference reuses mm1, overwriting the first reference's data
184 movq mm1, QWORD PTR [rdi]
; row 1 of the source and of the first three references
191 movq mm0, QWORD PTR [rsi+rax]
192 movq mm1, QWORD PTR [rcx+rbp]
193 movq mm2, QWORD PTR [rdx+rbp]
194 movq mm3, QWORD PTR [rbx+rbp]
; row 1 of the fourth reference, again through mm1
201 movq mm1, QWORD PTR [rdi+rbp]
; vp8_sad16x16x3_sse3
; Argument slots as read by the code below:
;   arg(0) src_ptr, arg(1) src_stride (32-bit), arg(2) ref_ptr,
;   arg(3) ref_stride (32-bit), arg(4) Results
218 ;void vp8_sad16x16x3_sse3(
219 ; unsigned char *src_ptr,
221 ; unsigned char *ref_ptr,
224 global sym(vp8_sad16x16x3_sse3)
225 sym(vp8_sad16x16x3_sse3):
; NOTE(review): macro defined outside this file; presumably makes the five
; arguments addressable through arg(n) - confirm in the ABI support include.
228 SHADOW_ARGS_TO_STACK 5
; rsi/rdi/rax/rdx are the registers the PROCESS_16X2X3 macro addresses
233 mov rsi, arg(0) ;src_ptr
234 mov rdi, arg(2) ;ref_ptr
; strides are 32-bit ints; sign-extend them for use in 64-bit addressing
236 movsxd rax, dword ptr arg(1) ;src_stride
237 movsxd rdx, dword ptr arg(3) ;ref_stride
; rdi is repurposed: from here on it holds the results pointer, not ref_ptr
248 mov rdi, arg(4) ;Results
; vp8_sad16x8x3_sse3
; Argument slots as read by the code below:
;   arg(0) src_ptr, arg(1) src_stride (32-bit), arg(2) ref_ptr,
;   arg(3) ref_stride (32-bit), arg(4) Results
275 ;void vp8_sad16x8x3_sse3(
276 ; unsigned char *src_ptr,
278 ; unsigned char *ref_ptr,
281 global sym(vp8_sad16x8x3_sse3)
282 sym(vp8_sad16x8x3_sse3):
; NOTE(review): macro defined outside this file; presumably makes the five
; arguments addressable through arg(n) - confirm in the ABI support include.
285 SHADOW_ARGS_TO_STACK 5
; rsi/rdi/rax/rdx are the registers the PROCESS_16X2X3 macro addresses
290 mov rsi, arg(0) ;src_ptr
291 mov rdi, arg(2) ;ref_ptr
; strides are 32-bit ints; sign-extend them for use in 64-bit addressing
293 movsxd rax, dword ptr arg(1) ;src_stride
294 movsxd rdx, dword ptr arg(3) ;ref_stride
; rdi is repurposed: from here on it holds the results pointer, not ref_ptr
301 mov rdi, arg(4) ;Results
; vp8_sad8x16x3_sse3
; Argument slots as read by the code below:
;   arg(0) src_ptr, arg(1) src_stride (32-bit), arg(2) ref_ptr,
;   arg(3) ref_stride (32-bit), arg(4) Results
328 ;void vp8_sad8x16x3_sse3(
329 ; unsigned char *src_ptr,
331 ; unsigned char *ref_ptr,
334 global sym(vp8_sad8x16x3_sse3)
335 sym(vp8_sad8x16x3_sse3):
; NOTE(review): macro defined outside this file; presumably makes the five
; arguments addressable through arg(n) - confirm in the ABI support include.
338 SHADOW_ARGS_TO_STACK 5
; rsi/rdi/rax/rdx are the registers the PROCESS_8X2X3 macro addresses
343 mov rsi, arg(0) ;src_ptr
344 mov rdi, arg(2) ;ref_ptr
; strides are 32-bit ints; sign-extend them for use in 64-bit addressing
346 movsxd rax, dword ptr arg(1) ;src_stride
347 movsxd rdx, dword ptr arg(3) ;ref_stride
; rdi is repurposed: from here on it holds the results pointer, not ref_ptr
358 mov rdi, arg(4) ;Results
; vp8_sad8x8x3_sse3
; Argument slots as read by the code below:
;   arg(0) src_ptr, arg(1) src_stride (32-bit), arg(2) ref_ptr,
;   arg(3) ref_stride (32-bit), arg(4) Results
371 ;void vp8_sad8x8x3_sse3(
372 ; unsigned char *src_ptr,
374 ; unsigned char *ref_ptr,
377 global sym(vp8_sad8x8x3_sse3)
378 sym(vp8_sad8x8x3_sse3):
; NOTE(review): macro defined outside this file; presumably makes the five
; arguments addressable through arg(n) - confirm in the ABI support include.
381 SHADOW_ARGS_TO_STACK 5
; rsi/rdi/rax/rdx are the registers the PROCESS_8X2X3 macro addresses
386 mov rsi, arg(0) ;src_ptr
387 mov rdi, arg(2) ;ref_ptr
; strides are 32-bit ints; sign-extend them for use in 64-bit addressing
389 movsxd rax, dword ptr arg(1) ;src_stride
390 movsxd rdx, dword ptr arg(3) ;ref_stride
; rdi is repurposed: from here on it holds the results pointer, not ref_ptr
397 mov rdi, arg(4) ;Results
; vp8_sad4x4x3_sse3
; Argument slots as read by the code below:
;   arg(0) src_ptr, arg(1) src_stride (32-bit), arg(2) ref_ptr,
;   arg(3) ref_stride (32-bit), arg(4) Results
; Rows are 4 bytes wide and are fetched with movd into MMX registers; the
; reference is read at byte offsets +0, +1 and +2.
410 ;void vp8_sad4x4x3_sse3(
411 ; unsigned char *src_ptr,
413 ; unsigned char *ref_ptr,
416 global sym(vp8_sad4x4x3_sse3)
417 sym(vp8_sad4x4x3_sse3):
; NOTE(review): macro defined outside this file; presumably makes the five
; arguments addressable through arg(n) - confirm in the ABI support include.
420 SHADOW_ARGS_TO_STACK 5
425 mov rsi, arg(0) ;src_ptr
426 mov rdi, arg(2) ;ref_ptr
; strides are 32-bit ints; sign-extend them for use in 64-bit addressing
428 movsxd rax, dword ptr arg(1) ;src_stride
429 movsxd rdx, dword ptr arg(3) ;ref_stride
; first row pair: source rows 0/1 and reference rows 0/1 at offset +0
431 movd mm0, DWORD PTR [rsi]
432 movd mm1, DWORD PTR [rdi]
434 movd mm2, DWORD PTR [rsi+rax]
435 movd mm3, DWORD PTR [rdi+rdx]
; reference row 0 at offsets +1 and +2
440 movd mm4, DWORD PTR [rdi+1]
441 movd mm5, DWORD PTR [rdi+2]
; reference row 1 at offsets +1 and +2 (mm2/mm3 are reused)
443 movd mm2, DWORD PTR [rdi+rdx+1]
444 movd mm3, DWORD PTR [rdi+rdx+2]
; second row pair, same access pattern.
; NOTE(review): assumes rsi and rdi were advanced by two rows before this
; point - confirm.
459 movd mm0, DWORD PTR [rsi]
460 movd mm2, DWORD PTR [rdi]
462 movd mm3, DWORD PTR [rsi+rax]
463 movd mm6, DWORD PTR [rdi+rdx]
; reference at offsets +1 and +2 (mm3 is reused)
468 movd mm3, DWORD PTR [rdi+1]
469 movd mm7, DWORD PTR [rdi+2]
; next reference row at offsets +1 and +2 (mm2/mm6 are reused)
475 movd mm2, DWORD PTR [rdi+rdx+1]
476 movd mm6, DWORD PTR [rdi+rdx+2]
; rdi is repurposed: from here on it holds the results pointer, not ref_ptr
487 mov rdi, arg(4) ;Results
; vp8_sad16x16_sse3
; Argument slots as read by the code below:
;   arg(0) src_ptr, arg(1) src_stride (32-bit), arg(2) ref_ptr,
;   arg(3) ref_stride (32-bit)
; Unlike the x3 entry points, the source stride lives in rbx here.
; Each 16-byte row is read as two 8-byte halves through MMX registers.
501 ;unsigned int vp8_sad16x16_sse3(
502 ; unsigned char *src_ptr,
504 ; unsigned char *ref_ptr,
; disabled: enabling the define below would assemble every lddqu as movdqu
507 ;%define lddqu movdqu
508 global sym(vp8_sad16x16_sse3)
509 sym(vp8_sad16x16_sse3):
; NOTE(review): macro defined outside this file; presumably makes the five
; arguments addressable through arg(n) - confirm in the ABI support include.
512 SHADOW_ARGS_TO_STACK 5
518 mov rsi, arg(0) ;src_ptr
519 mov rdi, arg(2) ;ref_ptr
; strides are 32-bit ints; sign-extend them for use in 64-bit addressing
521 movsxd rbx, dword ptr arg(1) ;src_stride
522 movsxd rdx, dword ptr arg(3) ;ref_stride
; loop body handles two rows per iteration ([..] and [..+stride])
529 vp8_sad16x16_sse3_loop:
; leave the loop early when the preceding compare is signed-greater
533 jg vp8_sad16x16_early_exit
; source row 0: low and high 8 bytes
535 movq mm0, QWORD PTR [rsi]
536 movq mm2, QWORD PTR [rsi+8]
; reference row 0: low and high 8 bytes
538 movq mm1, QWORD PTR [rdi]
539 movq mm3, QWORD PTR [rdi+8]
; row 1, low 8 bytes of source and reference
541 movq mm4, QWORD PTR [rsi+rbx]
542 movq mm5, QWORD PTR [rdi+rdx]
; row 1, high 8 bytes of source and reference (mm1/mm3 are reused)
547 movq mm1, QWORD PTR [rsi+rbx+8]
548 movq mm3, QWORD PTR [rdi+rdx+8]
; repeat while the preceding flag-setting instruction reports "not equal"
563 jne vp8_sad16x16_sse3_loop
; target of the early-out branch above; also reached by loop fall-through
567 vp8_sad16x16_early_exit:
; vp8_sad16x16x4d_sse3
; Argument slots as read by the code below:
;   arg(0) src_ptr, arg(1) src_stride (32-bit),
;   arg(2) ref_ptr_base (table of four reference pointers),
;   arg(3) ref_stride (32-bit), arg(4) Results
577 ;void vp8_sad16x16x4d_sse3(
578 ; unsigned char *src_ptr,
580 ; unsigned char *ref_ptr_base,
583 global sym(vp8_sad16x16x4d_sse3)
584 sym(vp8_sad16x16x4d_sse3):
; NOTE(review): macro defined outside this file; presumably makes the five
; arguments addressable through arg(n) - confirm in the ABI support include.
587 SHADOW_ARGS_TO_STACK 5
594 mov rdi, arg(2) ; ref_ptr_base
; rcx, rdx, rax, rdi = table entries 0..3; rdi is both base and last target
596 LOAD_X4_ADDRESSES rdi, rcx, rdx, rax, rdi
598 mov rsi, arg(0) ;src_ptr
; strides are 32-bit ints; sign-extend them for use in 64-bit addressing.
; NOTE(review): PROCESS_16X2X4 uses rbx as the third reference pointer and rax
; as the source row offset, the opposite of what rax/rbx hold after these
; lines - confirm the two registers are exchanged before the macro is used.
600 movsxd rbx, dword ptr arg(1) ;src_stride
601 movsxd rbp, dword ptr arg(3) ;ref_stride
; rdi is repurposed: from here on it holds the results pointer
615 mov rdi, arg(4) ;Results
; vp8_sad16x8x4d_sse3
; Argument slots as read by the code below:
;   arg(0) src_ptr, arg(1) src_stride (32-bit),
;   arg(2) ref_ptr_base (table of four reference pointers),
;   arg(3) ref_stride (32-bit), arg(4) Results
649 ;void vp8_sad16x8x4d_sse3(
650 ; unsigned char *src_ptr,
652 ; unsigned char *ref_ptr_base,
655 global sym(vp8_sad16x8x4d_sse3)
656 sym(vp8_sad16x8x4d_sse3):
; NOTE(review): macro defined outside this file; presumably makes the five
; arguments addressable through arg(n) - confirm in the ABI support include.
659 SHADOW_ARGS_TO_STACK 5
666 mov rdi, arg(2) ; ref_ptr_base
; rcx, rdx, rax, rdi = table entries 0..3; rdi is both base and last target
668 LOAD_X4_ADDRESSES rdi, rcx, rdx, rax, rdi
670 mov rsi, arg(0) ;src_ptr
; strides are 32-bit ints; sign-extend them for use in 64-bit addressing.
; NOTE(review): PROCESS_16X2X4 uses rbx as the third reference pointer and rax
; as the source row offset, the opposite of what rax/rbx hold after these
; lines - confirm the two registers are exchanged before the macro is used.
672 movsxd rbx, dword ptr arg(1) ;src_stride
673 movsxd rbp, dword ptr arg(3) ;ref_stride
; rdi is repurposed: from here on it holds the results pointer
683 mov rdi, arg(4) ;Results
; vp8_sad8x16x4d_sse3
; Argument slots as read by the code below:
;   arg(0) src_ptr, arg(1) src_stride (32-bit),
;   arg(2) ref_ptr_base (table of four reference pointers),
;   arg(3) ref_stride (32-bit), arg(4) Results
717 ;void vp8_sad8x16x4d_sse3(
718 ; unsigned char *src_ptr,
720 ; unsigned char *ref_ptr_base,
723 global sym(vp8_sad8x16x4d_sse3)
724 sym(vp8_sad8x16x4d_sse3):
; NOTE(review): macro defined outside this file; presumably makes the five
; arguments addressable through arg(n) - confirm in the ABI support include.
727 SHADOW_ARGS_TO_STACK 5
734 mov rdi, arg(2) ; ref_ptr_base
; rcx, rdx, rax, rdi = table entries 0..3; rdi is both base and last target
736 LOAD_X4_ADDRESSES rdi, rcx, rdx, rax, rdi
738 mov rsi, arg(0) ;src_ptr
; strides are 32-bit ints; sign-extend them for use in 64-bit addressing.
; NOTE(review): PROCESS_8X2X4 uses rbx as the third reference pointer and rax
; as the source row offset, the opposite of what rax/rbx hold after these
; lines - confirm the two registers are exchanged before the macro is used.
740 movsxd rbx, dword ptr arg(1) ;src_stride
741 movsxd rbp, dword ptr arg(3) ;ref_stride
; rdi is repurposed: from here on it holds the results pointer
755 mov rdi, arg(4) ;Results
; vp8_sad8x8x4d_sse3
; Argument slots as read by the code below:
;   arg(0) src_ptr, arg(1) src_stride (32-bit),
;   arg(2) ref_ptr_base (table of four reference pointers),
;   arg(3) ref_stride (32-bit), arg(4) Results
770 ;void vp8_sad8x8x4d_sse3(
771 ; unsigned char *src_ptr,
773 ; unsigned char *ref_ptr_base,
776 global sym(vp8_sad8x8x4d_sse3)
777 sym(vp8_sad8x8x4d_sse3):
; NOTE(review): macro defined outside this file; presumably makes the five
; arguments addressable through arg(n) - confirm in the ABI support include.
780 SHADOW_ARGS_TO_STACK 5
787 mov rdi, arg(2) ; ref_ptr_base
; rcx, rdx, rax, rdi = table entries 0..3; rdi is both base and last target
789 LOAD_X4_ADDRESSES rdi, rcx, rdx, rax, rdi
791 mov rsi, arg(0) ;src_ptr
; strides are 32-bit ints; sign-extend them for use in 64-bit addressing.
; NOTE(review): PROCESS_8X2X4 uses rbx as the third reference pointer and rax
; as the source row offset, the opposite of what rax/rbx hold after these
; lines - confirm the two registers are exchanged before the macro is used.
793 movsxd rbx, dword ptr arg(1) ;src_stride
794 movsxd rbp, dword ptr arg(3) ;ref_stride
; rdi is repurposed: from here on it holds the results pointer
804 mov rdi, arg(4) ;Results
; vp8_sad4x4x4d_sse3
; Argument slots as read by the code below:
;   arg(0) src_ptr, arg(1) src_stride (32-bit),
;   arg(2) ref_ptr_base (table of four reference pointers),
;   arg(3) ref_stride (32-bit), arg(4) Results
; Rows are 4 bytes wide and are fetched with movd into MMX registers.
819 ;void vp8_sad4x4x4d_sse3(
820 ; unsigned char *src_ptr,
822 ; unsigned char *ref_ptr_base,
825 global sym(vp8_sad4x4x4d_sse3)
826 sym(vp8_sad4x4x4d_sse3):
; NOTE(review): macro defined outside this file; presumably makes the five
; arguments addressable through arg(n) - confirm in the ABI support include.
829 SHADOW_ARGS_TO_STACK 5
836 mov rdi, arg(2) ; ref_ptr_base
; rcx, rdx, rax, rdi = table entries 0..3; rdi is both base and last target
838 LOAD_X4_ADDRESSES rdi, rcx, rdx, rax, rdi
840 mov rsi, arg(0) ;src_ptr
; strides are 32-bit ints; sign-extend them for use in 64-bit addressing.
; NOTE(review): the loads below use rbx as the third reference pointer and rax
; as the source row offset, the opposite of what rax/rbx hold after these
; lines - confirm the two registers are exchanged before the first load.
842 movsxd rbx, dword ptr arg(1) ;src_stride
843 movsxd rbp, dword ptr arg(3) ;ref_stride
; first row pair: source rows 0/1 and rows 0/1 of the first reference (rcx)
847 movd mm0, DWORD PTR [rsi]
848 movd mm1, DWORD PTR [rcx]
850 movd mm2, DWORD PTR [rsi+rax]
851 movd mm3, DWORD PTR [rcx+rbp]
; row 0 of the second (rdx), third (rbx) and fourth (rdi) references
856 movd mm4, DWORD PTR [rdx]
857 movd mm5, DWORD PTR [rbx]
859 movd mm6, DWORD PTR [rdi]
; row 1 of the second, third and fourth references (mm2/mm3 are reused)
860 movd mm2, DWORD PTR [rdx+rbp]
862 movd mm3, DWORD PTR [rbx+rbp]
863 movd mm7, DWORD PTR [rdi+rbp]
; second row pair, starting again with the source and the first reference.
; NOTE(review): assumes the source and reference pointers were advanced by
; two rows before this point - confirm.
886 movd mm0, DWORD PTR [rsi]
887 movd mm2, DWORD PTR [rcx]
889 movd mm3, DWORD PTR [rsi+rax]
890 movd mm7, DWORD PTR [rcx+rbp]
; second and third references (mm3/mm7 are reused)
895 movd mm3, DWORD PTR [rdx]
896 movd mm7, DWORD PTR [rbx]
; rsi is repurposed: from here on it holds the results pointer, not src_ptr
902 mov rsi, arg(4) ;Results
; NOTE(review): the remaining reference rows are offset by rax rather than
; rbp - confirm rax holds the reference stride at this point.
907 movd mm2, DWORD PTR [rdx+rax]
908 movd mm1, DWORD PTR [rbx+rax]
; fourth reference, both rows of the pair
916 movd mm2, DWORD PTR [rdi]
917 movd mm1, DWORD PTR [rdi+rax]