2 ; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
4 ; Use of this source code is governed by a BSD-style license
5 ; that can be found in the LICENSE file in the root of the source
6 ; tree. An additional intellectual property rights grant can be found
7 ; in the file PATENTS. All contributing project authors may
8 ; be found in the AUTHORS file in the root of the source tree.
12 %include "vpx_ports/x86_abi_support.asm"
; PROCESS_16X2X3 (1 macro parameter)
; Loads visible in this macro:
;   xmm0 <- 16 bytes at [rsi+rax]       (movdqa: address must be 16-byte aligned)
;   xmm1 <- 16 bytes at [rdi+rdx]       (lddqu: no alignment requirement)
;   xmm2 <- 16 bytes at [rdi+rdx+1]
;   xmm3 <- 16 bytes at [rdi+rdx+2]
; i.e. one row addressed through rsi and the same rdi row at byte offsets 0/1/2.
; NOTE(review): the leading integers look like line numbers of the full file;
; the jump from 16 to 40 suggests part of the macro body is not shown here.
; NOTE(review): QWORD PTR is attached to 128-bit xmm loads; presumably the
; included ABI header neutralizes the size keyword -- confirm.
16 %macro PROCESS_16X2X3 1
40 movdqa xmm0, QWORD PTR [rsi+rax]
41 lddqu xmm1, QWORD PTR [rdi+rdx]
42 lddqu xmm2, QWORD PTR [rdi+rdx+1]
43 lddqu xmm3, QWORD PTR [rdi+rdx+2]
; PROCESS_8X2X3 (1 macro parameter)
; 8-byte (MMX) counterpart of the 16-byte variant. Loads visible here:
;   mm0 <- 8 bytes at [rsi+rax]
;   mm1 <- 8 bytes at [rdi+rdx]
;   mm2 <- 8 bytes at [rdi+rdx+1]
;   mm3 <- 8 bytes at [rdi+rdx+2]
; NOTE(review): the leading integers look like line numbers of the full file;
; the jump from 57 to 81 suggests part of the macro body is not shown here.
57 %macro PROCESS_8X2X3 1
81 movq mm0, QWORD PTR [rsi+rax]
82 movq mm1, QWORD PTR [rdi+rdx]
83 movq mm2, QWORD PTR [rdi+rdx+1]
84 movq mm3, QWORD PTR [rdi+rdx+2]
; LOAD_X4_ADDRESSES base, dst0, dst1, dst2, dst3
; Reads four consecutive entries, each REG_SZ_BYTES wide, starting at [base]
; and stores entry 0..3 into dst0..dst3 respectively.
; REG_SZ_BYTES is not defined in the visible part of this file; presumably it
; comes from the included ABI header -- confirm.
; The base register may be reused as dst3 (callers in this file pass rdi for
; both): base is read for the last time by the instruction that overwrites it.
; It must NOT be reused as dst0..dst2, since later loads still need it.
98 %macro LOAD_X4_ADDRESSES 5
99 mov %2, [%1+REG_SZ_BYTES*0]
100 mov %3, [%1+REG_SZ_BYTES*1]
102 mov %4, [%1+REG_SZ_BYTES*2]
103 mov %5, [%1+REG_SZ_BYTES*3]
; PROCESS_16X2X4 (1 macro parameter)
; Loads visible in this macro:
;   xmm0 <- 16 bytes at [rsi+rax]   (movdqa: address must be 16-byte aligned)
;   xmm1 <- 16 bytes at [rcx+rbp]   (lddqu: no alignment requirement)
;   xmm2 <- 16 bytes at [rdx+rbp]
;   xmm3 <- 16 bytes at [rbx+rbp]
;   xmm1 <- 16 bytes at [rdi+rbp]   (xmm1 is reused for the fourth base pointer)
; Four separate base registers (rcx, rdx, rbx, rdi) share the same offset rbp,
; while the rsi row is offset by rax.
; NOTE(review): the leading integers look like line numbers of the full file;
; gaps (106 -> 136, 139 -> 146) suggest elided instructions between the loads.
106 %macro PROCESS_16X2X4 1
136 movdqa xmm0, QWORD PTR [rsi+rax]
137 lddqu xmm1, QWORD PTR [rcx+rbp]
138 lddqu xmm2, QWORD PTR [rdx+rbp]
139 lddqu xmm3, QWORD PTR [rbx+rbp]
146 lddqu xmm1, QWORD PTR [rdi+rbp]
; PROCESS_8X2X4 (1 macro parameter)
; 8-byte (MMX) counterpart of the 16-byte variant. Loads visible here:
;   mm0 <- 8 bytes at [rsi+rax]
;   mm1 <- 8 bytes at [rcx+rbp]
;   mm2 <- 8 bytes at [rdx+rbp]
;   mm3 <- 8 bytes at [rbx+rbp]
;   mm1 <- 8 bytes at [rdi+rbp]   (mm1 is reused for the fourth base pointer)
; NOTE(review): the leading integers look like line numbers of the full file;
; gaps (163 -> 193, 196 -> 203) suggest elided instructions between the loads.
163 %macro PROCESS_8X2X4 1
193 movq mm0, QWORD PTR [rsi+rax]
194 movq mm1, QWORD PTR [rcx+rbp]
195 movq mm2, QWORD PTR [rdx+rbp]
196 movq mm3, QWORD PTR [rbx+rbp]
203 movq mm1, QWORD PTR [rdi+rbp]
220 ;void vp8_sad16x16x3_sse3(
221 ; unsigned char *src_ptr,
223 ; unsigned char *ref_ptr,
; Register setup visible below (indices are those used with arg(n)):
;   rsi = arg(0) src_ptr       rax = arg(1) src_stride (dword, sign-extended)
;   rdi = arg(2) ref_ptr       rdx = arg(3) ref_stride (dword, sign-extended)
; rdi is later overwritten with arg(4), the Results pointer, so ref_ptr is no
; longer available in rdi after that point.
; SHADOW_ARGS_TO_STACK and arg() are not defined in the visible part of this
; file; presumably they come from the included ABI header -- confirm.
; NOTE(review): the leading integers look like line numbers of the full file;
; the gaps indicate that most of the function body is not shown here.
226 global sym(vp8_sad16x16x3_sse3)
227 sym(vp8_sad16x16x3_sse3):
230 SHADOW_ARGS_TO_STACK 5
235 mov rsi, arg(0) ;src_ptr
236 mov rdi, arg(2) ;ref_ptr
238 movsxd rax, dword ptr arg(1) ;src_stride
239 movsxd rdx, dword ptr arg(3) ;ref_stride
250 mov rdi, arg(4) ;Results
277 ;void vp8_sad16x8x3_sse3(
278 ; unsigned char *src_ptr,
280 ; unsigned char *ref_ptr,
; Register setup visible below (indices are those used with arg(n)):
;   rsi = arg(0) src_ptr       rax = arg(1) src_stride (dword, sign-extended)
;   rdi = arg(2) ref_ptr       rdx = arg(3) ref_stride (dword, sign-extended)
; rdi is later overwritten with arg(4), the Results pointer, so ref_ptr is no
; longer available in rdi after that point.
; SHADOW_ARGS_TO_STACK and arg() are not defined in the visible part of this
; file; presumably they come from the included ABI header -- confirm.
; NOTE(review): the leading integers look like line numbers of the full file;
; the gaps indicate that most of the function body is not shown here.
283 global sym(vp8_sad16x8x3_sse3)
284 sym(vp8_sad16x8x3_sse3):
287 SHADOW_ARGS_TO_STACK 5
292 mov rsi, arg(0) ;src_ptr
293 mov rdi, arg(2) ;ref_ptr
295 movsxd rax, dword ptr arg(1) ;src_stride
296 movsxd rdx, dword ptr arg(3) ;ref_stride
303 mov rdi, arg(4) ;Results
330 ;void vp8_sad8x16x3_sse3(
331 ; unsigned char *src_ptr,
333 ; unsigned char *ref_ptr,
; Register setup visible below (indices are those used with arg(n)):
;   rsi = arg(0) src_ptr       rax = arg(1) src_stride (dword, sign-extended)
;   rdi = arg(2) ref_ptr       rdx = arg(3) ref_stride (dword, sign-extended)
; rdi is later overwritten with arg(4), the Results pointer, so ref_ptr is no
; longer available in rdi after that point.
; SHADOW_ARGS_TO_STACK and arg() are not defined in the visible part of this
; file; presumably they come from the included ABI header -- confirm.
; NOTE(review): the leading integers look like line numbers of the full file;
; the gaps indicate that most of the function body is not shown here.
336 global sym(vp8_sad8x16x3_sse3)
337 sym(vp8_sad8x16x3_sse3):
340 SHADOW_ARGS_TO_STACK 5
345 mov rsi, arg(0) ;src_ptr
346 mov rdi, arg(2) ;ref_ptr
348 movsxd rax, dword ptr arg(1) ;src_stride
349 movsxd rdx, dword ptr arg(3) ;ref_stride
360 mov rdi, arg(4) ;Results
373 ;void vp8_sad8x8x3_sse3(
374 ; unsigned char *src_ptr,
376 ; unsigned char *ref_ptr,
; Register setup visible below (indices are those used with arg(n)):
;   rsi = arg(0) src_ptr       rax = arg(1) src_stride (dword, sign-extended)
;   rdi = arg(2) ref_ptr       rdx = arg(3) ref_stride (dword, sign-extended)
; rdi is later overwritten with arg(4), the Results pointer, so ref_ptr is no
; longer available in rdi after that point.
; SHADOW_ARGS_TO_STACK and arg() are not defined in the visible part of this
; file; presumably they come from the included ABI header -- confirm.
; NOTE(review): the leading integers look like line numbers of the full file;
; the gaps indicate that most of the function body is not shown here.
379 global sym(vp8_sad8x8x3_sse3)
380 sym(vp8_sad8x8x3_sse3):
383 SHADOW_ARGS_TO_STACK 5
388 mov rsi, arg(0) ;src_ptr
389 mov rdi, arg(2) ;ref_ptr
391 movsxd rax, dword ptr arg(1) ;src_stride
392 movsxd rdx, dword ptr arg(3) ;ref_stride
399 mov rdi, arg(4) ;Results
412 ;void vp8_sad4x4x3_sse3(
413 ; unsigned char *src_ptr,
415 ; unsigned char *ref_ptr,
; Register setup visible below (indices are those used with arg(n)):
;   rsi = arg(0) src_ptr       rax = arg(1) src_stride (dword, sign-extended)
;   rdi = arg(2) ref_ptr       rdx = arg(3) ref_stride (dword, sign-extended)
; All pixel loads use movd, which moves 32 bits (4 bytes) per access.
; rdi is overwritten with arg(4), the Results pointer, at the end of the
; visible code.
; SHADOW_ARGS_TO_STACK and arg() are not defined in the visible part of this
; file; presumably they come from the included ABI header -- confirm.
; NOTE(review): the leading integers look like line numbers of the full file;
; the gaps indicate elided instructions between the loads shown here.
; NOTE(review): QWORD PTR is attached to 32-bit movd loads; presumably the
; included ABI header neutralizes the size keyword -- confirm.
418 global sym(vp8_sad4x4x3_sse3)
419 sym(vp8_sad4x4x3_sse3):
422 SHADOW_ARGS_TO_STACK 5
427 mov rsi, arg(0) ;src_ptr
428 mov rdi, arg(2) ;ref_ptr
430 movsxd rax, dword ptr arg(1) ;src_stride
431 movsxd rdx, dword ptr arg(3) ;ref_stride
; First group: rows at [rsi], [rsi+rax] against [rdi], [rdi+rdx] at byte
; offsets 0, +1 and +2.
433 movd mm0, QWORD PTR [rsi]
434 movd mm1, QWORD PTR [rdi]
436 movd mm2, QWORD PTR [rsi+rax]
437 movd mm3, QWORD PTR [rdi+rdx]
442 movd mm4, QWORD PTR [rdi+1]
443 movd mm5, QWORD PTR [rdi+2]
445 movd mm2, QWORD PTR [rdi+rdx+1]
446 movd mm3, QWORD PTR [rdi+rdx+2]
; Second group: same address pattern with different destination registers.
; Any update of rsi/rdi between the two groups is not visible here.
461 movd mm0, QWORD PTR [rsi]
462 movd mm2, QWORD PTR [rdi]
464 movd mm3, QWORD PTR [rsi+rax]
465 movd mm6, QWORD PTR [rdi+rdx]
470 movd mm3, QWORD PTR [rdi+1]
471 movd mm7, QWORD PTR [rdi+2]
477 movd mm2, QWORD PTR [rdi+rdx+1]
478 movd mm6, QWORD PTR [rdi+rdx+2]
489 mov rdi, arg(4) ;Results
503 ;unsigned int vp8_sad16x16_sse3(
504 ; unsigned char *src_ptr,
506 ; unsigned char *ref_ptr,
; Register setup visible below (indices are those used with arg(n)):
;   rsi = arg(0) src_ptr       rbx = arg(1) src_stride (dword, sign-extended)
;   rdi = arg(2) ref_ptr       rdx = arg(3) ref_stride (dword, sign-extended)
; Each loop pass reads two 16-byte rows from each pointer as 8-byte MMX halves:
; [p], [p+8] for the current row and [p+stride], [p+stride+8] for the next.
; SHADOW_ARGS_TO_STACK and arg() are not defined in the visible part of this
; file; presumably they come from the included ABI header -- confirm.
; NOTE(review): rbx is callee-saved in both the SysV and Win64 ABIs; its
; save/restore is not visible here -- confirm.
; NOTE(review): the leading integers look like line numbers of the full file;
; the gaps indicate elided instructions (including the compares feeding the
; two branches below).
; The following commented-out define would substitute movdqu for lddqu.
509 ;%define lddqu movdqu
510 global sym(vp8_sad16x16_sse3)
511 sym(vp8_sad16x16_sse3):
514 SHADOW_ARGS_TO_STACK 5
520 mov rsi, arg(0) ;src_ptr
521 mov rdi, arg(2) ;ref_ptr
523 movsxd rbx, dword ptr arg(1) ;src_stride
524 movsxd rdx, dword ptr arg(3) ;ref_stride
531 vp8_sad16x16_sse3_loop:
; Leaves the loop early when the (not shown) comparison is "greater".
; NOTE(review): jg is a signed condition; confirm the compared values are
; meant to be signed.
535 jg vp8_sad16x16_early_exit
537 movq mm0, QWORD PTR [rsi]
538 movq mm2, QWORD PTR [rsi+8]
540 movq mm1, QWORD PTR [rdi]
541 movq mm3, QWORD PTR [rdi+8]
543 movq mm4, QWORD PTR [rsi+rbx]
544 movq mm5, QWORD PTR [rdi+rdx]
549 movq mm1, QWORD PTR [rsi+rbx+8]
550 movq mm3, QWORD PTR [rdi+rdx+8]
565 jne vp8_sad16x16_sse3_loop
569 vp8_sad16x16_early_exit:
579 ;void vp8_sad16x16x4d_sse3(
580 ; unsigned char *src_ptr,
582 ; unsigned char *ref_ptr_base,
; Register setup visible below (indices are those used with arg(n)):
;   rdi = arg(2) ref_ptr_base, then LOAD_X4_ADDRESSES reads its four entries:
;         rcx = entry 0, rdx = entry 1, rax = entry 2, rdi = entry 3
;   rsi = arg(0) src_ptr
;   rbx = arg(1) src_stride (dword, sign-extended)
;   rbp = arg(3) ref_stride (dword, sign-extended)
; rdi is later overwritten with arg(4), the Results pointer.
; NOTE(review): the PROCESS_*X2X4 macros in this file address rows as
; [rsi+rax] and [rbx+rbp], whereas here rax holds a reference pointer and rbx
; the source stride; presumably the two are exchanged in lines not shown --
; confirm.
; NOTE(review): rbx and rbp are callee-saved in both the SysV and Win64 ABIs;
; their save/restore is not visible here -- confirm.
585 global sym(vp8_sad16x16x4d_sse3)
586 sym(vp8_sad16x16x4d_sse3):
589 SHADOW_ARGS_TO_STACK 5
596 mov rdi, arg(2) ; ref_ptr_base
598 LOAD_X4_ADDRESSES rdi, rcx, rdx, rax, rdi
600 mov rsi, arg(0) ;src_ptr
602 movsxd rbx, dword ptr arg(1) ;src_stride
603 movsxd rbp, dword ptr arg(3) ;ref_stride
617 mov rdi, arg(4) ;Results
651 ;void vp8_sad16x8x4d_sse3(
652 ; unsigned char *src_ptr,
654 ; unsigned char *ref_ptr_base,
; Register setup visible below (indices are those used with arg(n)):
;   rdi = arg(2) ref_ptr_base, then LOAD_X4_ADDRESSES reads its four entries:
;         rcx = entry 0, rdx = entry 1, rax = entry 2, rdi = entry 3
;   rsi = arg(0) src_ptr
;   rbx = arg(1) src_stride (dword, sign-extended)
;   rbp = arg(3) ref_stride (dword, sign-extended)
; rdi is later overwritten with arg(4), the Results pointer.
; NOTE(review): the PROCESS_*X2X4 macros in this file address rows as
; [rsi+rax] and [rbx+rbp], whereas here rax holds a reference pointer and rbx
; the source stride; presumably the two are exchanged in lines not shown --
; confirm.
; NOTE(review): rbx and rbp are callee-saved in both the SysV and Win64 ABIs;
; their save/restore is not visible here -- confirm.
657 global sym(vp8_sad16x8x4d_sse3)
658 sym(vp8_sad16x8x4d_sse3):
661 SHADOW_ARGS_TO_STACK 5
668 mov rdi, arg(2) ; ref_ptr_base
670 LOAD_X4_ADDRESSES rdi, rcx, rdx, rax, rdi
672 mov rsi, arg(0) ;src_ptr
674 movsxd rbx, dword ptr arg(1) ;src_stride
675 movsxd rbp, dword ptr arg(3) ;ref_stride
685 mov rdi, arg(4) ;Results
719 ;void vp8_sad8x16x4d_sse3(
720 ; unsigned char *src_ptr,
722 ; unsigned char *ref_ptr_base,
; Register setup visible below (indices are those used with arg(n)):
;   rdi = arg(2) ref_ptr_base, then LOAD_X4_ADDRESSES reads its four entries:
;         rcx = entry 0, rdx = entry 1, rax = entry 2, rdi = entry 3
;   rsi = arg(0) src_ptr
;   rbx = arg(1) src_stride (dword, sign-extended)
;   rbp = arg(3) ref_stride (dword, sign-extended)
; rdi is later overwritten with arg(4), the Results pointer.
; NOTE(review): the PROCESS_*X2X4 macros in this file address rows as
; [rsi+rax] and [rbx+rbp], whereas here rax holds a reference pointer and rbx
; the source stride; presumably the two are exchanged in lines not shown --
; confirm.
; NOTE(review): rbx and rbp are callee-saved in both the SysV and Win64 ABIs;
; their save/restore is not visible here -- confirm.
725 global sym(vp8_sad8x16x4d_sse3)
726 sym(vp8_sad8x16x4d_sse3):
729 SHADOW_ARGS_TO_STACK 5
736 mov rdi, arg(2) ; ref_ptr_base
738 LOAD_X4_ADDRESSES rdi, rcx, rdx, rax, rdi
740 mov rsi, arg(0) ;src_ptr
742 movsxd rbx, dword ptr arg(1) ;src_stride
743 movsxd rbp, dword ptr arg(3) ;ref_stride
757 mov rdi, arg(4) ;Results
772 ;void vp8_sad8x8x4d_sse3(
773 ; unsigned char *src_ptr,
775 ; unsigned char *ref_ptr_base,
; Register setup visible below (indices are those used with arg(n)):
;   rdi = arg(2) ref_ptr_base, then LOAD_X4_ADDRESSES reads its four entries:
;         rcx = entry 0, rdx = entry 1, rax = entry 2, rdi = entry 3
;   rsi = arg(0) src_ptr
;   rbx = arg(1) src_stride (dword, sign-extended)
;   rbp = arg(3) ref_stride (dword, sign-extended)
; rdi is later overwritten with arg(4), the Results pointer.
; NOTE(review): the PROCESS_*X2X4 macros in this file address rows as
; [rsi+rax] and [rbx+rbp], whereas here rax holds a reference pointer and rbx
; the source stride; presumably the two are exchanged in lines not shown --
; confirm.
; NOTE(review): rbx and rbp are callee-saved in both the SysV and Win64 ABIs;
; their save/restore is not visible here -- confirm.
778 global sym(vp8_sad8x8x4d_sse3)
779 sym(vp8_sad8x8x4d_sse3):
782 SHADOW_ARGS_TO_STACK 5
789 mov rdi, arg(2) ; ref_ptr_base
791 LOAD_X4_ADDRESSES rdi, rcx, rdx, rax, rdi
793 mov rsi, arg(0) ;src_ptr
795 movsxd rbx, dword ptr arg(1) ;src_stride
796 movsxd rbp, dword ptr arg(3) ;ref_stride
806 mov rdi, arg(4) ;Results
821 ;void vp8_sad4x4x4d_sse3(
822 ; unsigned char *src_ptr,
824 ; unsigned char *ref_ptr_base,
; Register setup visible below (indices are those used with arg(n)):
;   rdi = arg(2) ref_ptr_base, then LOAD_X4_ADDRESSES reads its four entries:
;         rcx = entry 0, rdx = entry 1, rax = entry 2, rdi = entry 3
;   rsi = arg(0) src_ptr
;   rbx = arg(1) src_stride (dword, sign-extended)
;   rbp = arg(3) ref_stride (dword, sign-extended)
; All pixel loads use movd, which moves 32 bits (4 bytes) per access.
; NOTE(review): the loads below use rax as the row offset for rsi and rbx as a
; base pointer, the opposite of the roles assigned in the setup; presumably
; rax and rbx are exchanged in lines not shown -- confirm.
; NOTE(review): rbx and rbp are callee-saved in both the SysV and Win64 ABIs;
; their save/restore is not visible here -- confirm.
; NOTE(review): the leading integers look like line numbers of the full file;
; the gaps indicate elided instructions between the loads shown here.
827 global sym(vp8_sad4x4x4d_sse3)
828 sym(vp8_sad4x4x4d_sse3):
831 SHADOW_ARGS_TO_STACK 5
838 mov rdi, arg(2) ; ref_ptr_base
840 LOAD_X4_ADDRESSES rdi, rcx, rdx, rax, rdi
842 mov rsi, arg(0) ;src_ptr
844 movsxd rbx, dword ptr arg(1) ;src_stride
845 movsxd rbp, dword ptr arg(3) ;ref_stride
; First group: rows at [rsi], [rsi+rax] and, for each of the four base
; pointers rcx/rdx/rbx/rdi, rows at [base] and [base+rbp].
849 movd mm0, QWORD PTR [rsi]
850 movd mm1, QWORD PTR [rcx]
852 movd mm2, QWORD PTR [rsi+rax]
853 movd mm3, QWORD PTR [rcx+rbp]
858 movd mm4, QWORD PTR [rdx]
859 movd mm5, QWORD PTR [rbx]
861 movd mm6, QWORD PTR [rdi]
862 movd mm2, QWORD PTR [rdx+rbp]
864 movd mm3, QWORD PTR [rbx+rbp]
865 movd mm7, QWORD PTR [rdi+rbp]
; Second group: same rsi/rcx pattern; pointer updates between the groups are
; not visible here.
888 movd mm0, QWORD PTR [rsi]
889 movd mm2, QWORD PTR [rcx]
891 movd mm3, QWORD PTR [rsi+rax]
892 movd mm7, QWORD PTR [rcx+rbp]
897 movd mm3, QWORD PTR [rdx]
898 movd mm7, QWORD PTR [rbx]
; rsi is repurposed as the Results pointer from here on.
904 mov rsi, arg(4) ;Results
; The remaining loads offset the rdx/rbx/rdi rows by rax instead of rbp.
; NOTE(review): presumably rax is reassigned in lines not shown -- confirm.
909 movd mm2, QWORD PTR [rdx+rax]
910 movd mm1, QWORD PTR [rbx+rax]
918 movd mm2, QWORD PTR [rdi]
919 movd mm1, QWORD PTR [rdi+rax]