From 12cd6f421db0d00e1972973024c9c9bd60fbb3b5 Mon Sep 17 00:00:00 2001 From: Dmitry Kovalev Date: Thu, 28 Aug 2014 17:31:00 -0700 Subject: [PATCH] Removing variance MMX code. Removed functions: * vp9_mse16x16_mmx * vp9_get_mb_ss_mmx * vp9_get4x4var_mmx * vp9_get8x8var_mmx * vp9_variance4x4_mmx * vp9_variance8x8_mmx * vp9_variance16x16_mmx * vp9_variance16x8_mmx * vp9_variance8x16_mmx They all have SSE2 equivalent. Change-Id: I3796f2477c4f59b35b4828f46a300c16e62a2615 --- test/variance_test.cc | 15 - vp9/common/vp9_rtcd_defs.pl | 16 +- vp9/encoder/x86/vp9_variance_impl_mmx.asm | 510 ---------------------- vp9/encoder/x86/vp9_variance_mmx.c | 103 ----- vp9/vp9cx.mk | 2 - 5 files changed, 8 insertions(+), 638 deletions(-) delete mode 100644 vp9/encoder/x86/vp9_variance_impl_mmx.asm delete mode 100644 vp9/encoder/x86/vp9_variance_mmx.c diff --git a/test/variance_test.cc b/test/variance_test.cc index 7d8118235..925ff03af 100644 --- a/test/variance_test.cc +++ b/test/variance_test.cc @@ -485,21 +485,6 @@ INSTANTIATE_TEST_CASE_P( make_tuple(6, 5, subpel_avg_variance64x32_c), make_tuple(6, 6, subpel_avg_variance64x64_c))); -#if HAVE_MMX -const vp9_variance_fn_t variance4x4_mmx = vp9_variance4x4_mmx; -const vp9_variance_fn_t variance8x8_mmx = vp9_variance8x8_mmx; -const vp9_variance_fn_t variance8x16_mmx = vp9_variance8x16_mmx; -const vp9_variance_fn_t variance16x8_mmx = vp9_variance16x8_mmx; -const vp9_variance_fn_t variance16x16_mmx = vp9_variance16x16_mmx; -INSTANTIATE_TEST_CASE_P( - MMX, VP9VarianceTest, - ::testing::Values(make_tuple(2, 2, variance4x4_mmx), - make_tuple(3, 3, variance8x8_mmx), - make_tuple(3, 4, variance8x16_mmx), - make_tuple(4, 3, variance16x8_mmx), - make_tuple(4, 4, variance16x16_mmx))); -#endif - #if HAVE_SSE2 #if CONFIG_USE_X86INC const vp9_variance_fn_t variance4x4_sse2 = vp9_variance4x4_sse2; diff --git a/vp9/common/vp9_rtcd_defs.pl b/vp9/common/vp9_rtcd_defs.pl index 92f93184a..5dadee906 100644 --- a/vp9/common/vp9_rtcd_defs.pl +++ b/vp9/common/vp9_rtcd_defs.pl @@ -420,19 +420,19 @@ add_proto qw/unsigned int vp9_variance64x64/, "const uint8_t *src_ptr, int sourc specialize qw/vp9_variance64x64 avx2/, "$sse2_x86inc"; add_proto qw/unsigned int vp9_variance16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; -specialize qw/vp9_variance16x16 mmx avx2 neon/, "$sse2_x86inc"; +specialize qw/vp9_variance16x16 avx2 neon/, "$sse2_x86inc"; add_proto qw/unsigned int vp9_variance16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; -specialize qw/vp9_variance16x8 mmx/, "$sse2_x86inc"; +specialize qw/vp9_variance16x8/, "$sse2_x86inc"; add_proto qw/unsigned int vp9_variance8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; -specialize qw/vp9_variance8x16 mmx/, "$sse2_x86inc"; +specialize qw/vp9_variance8x16/, "$sse2_x86inc"; add_proto qw/unsigned int vp9_variance8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; -specialize qw/vp9_variance8x8 mmx neon/, "$sse2_x86inc"; +specialize qw/vp9_variance8x8 neon/, "$sse2_x86inc"; add_proto qw/void vp9_get8x8var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum"; -specialize qw/vp9_get8x8var mmx neon/, "$sse2_x86inc"; +specialize qw/vp9_get8x8var neon/, "$sse2_x86inc"; add_proto qw/void vp9_get16x16var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum"; specialize qw/vp9_get16x16var avx2 neon/, "$sse2_x86inc"; @@ -444,7 +444,7 @@ add_proto qw/unsigned int vp9_variance4x8/, "const uint8_t *src_ptr, int source_ specialize qw/vp9_variance4x8/, "$sse2_x86inc"; add_proto qw/unsigned int vp9_variance4x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; -specialize qw/vp9_variance4x4 mmx/, "$sse2_x86inc"; +specialize qw/vp9_variance4x4/, "$sse2_x86inc"; add_proto qw/unsigned int vp9_sub_pixel_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; specialize qw/vp9_sub_pixel_variance64x64 avx2/, "$sse2_x86inc", "$ssse3_x86inc"; @@ -693,7 +693,7 @@ add_proto qw/void vp9_sad4x4x4d/, "const uint8_t *src_ptr, int src_stride, cons specialize qw/vp9_sad4x4x4d sse/; add_proto qw/unsigned int vp9_mse16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse"; -specialize qw/vp9_mse16x16 mmx avx2/, "$sse2_x86inc"; +specialize qw/vp9_mse16x16 avx2/, "$sse2_x86inc"; add_proto qw/unsigned int vp9_mse8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse"; specialize qw/vp9_mse8x16/; @@ -705,7 +705,7 @@ add_proto qw/unsigned int vp9_mse8x8/, "const uint8_t *src_ptr, int source_stri specialize qw/vp9_mse8x8/; add_proto qw/unsigned int vp9_get_mb_ss/, "const int16_t *"; -specialize qw/vp9_get_mb_ss mmx sse2/; +specialize qw/vp9_get_mb_ss sse2/; # ENCODEMB INVOKE add_proto qw/int64_t vp9_block_error/, "const int16_t *coeff, const int16_t *dqcoeff, intptr_t block_size, int64_t *ssz"; diff --git a/vp9/encoder/x86/vp9_variance_impl_mmx.asm b/vp9/encoder/x86/vp9_variance_impl_mmx.asm deleted file mode 100644 index 3501cf1fd..000000000 --- a/vp9/encoder/x86/vp9_variance_impl_mmx.asm +++ /dev/null @@ -1,510 +0,0 @@ -; -; Copyright (c) 2010 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. -; - - -%include "vpx_ports/x86_abi_support.asm" - -;unsigned int vp9_get_mb_ss_mmx( short *src_ptr ) -global sym(vp9_get_mb_ss_mmx) PRIVATE -sym(vp9_get_mb_ss_mmx): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 7 - GET_GOT rbx - push rsi - push rdi - sub rsp, 8 - ; end prolog - - mov rax, arg(0) ;src_ptr - mov rcx, 16 - pxor mm4, mm4 - -.NEXTROW: - movq mm0, [rax] - movq mm1, [rax+8] - movq mm2, [rax+16] - movq mm3, [rax+24] - pmaddwd mm0, mm0 - pmaddwd mm1, mm1 - pmaddwd mm2, mm2 - pmaddwd mm3, mm3 - - paddd mm4, mm0 - paddd mm4, mm1 - paddd mm4, mm2 - paddd mm4, mm3 - - add rax, 32 - dec rcx - ja .NEXTROW - movq QWORD PTR [rsp], mm4 - - ;return sum[0]+sum[1]; - movsxd rax, dword ptr [rsp] - movsxd rcx, dword ptr [rsp+4] - add rax, rcx - - - ; begin epilog - add rsp, 8 - pop rdi - pop rsi - RESTORE_GOT - UNSHADOW_ARGS - pop rbp - ret - - -;unsigned int vp9_get8x8var_mmx -;( -; unsigned char *src_ptr, -; int source_stride, -; unsigned char *ref_ptr, -; int recon_stride, -; unsigned int *SSE, -; int *Sum -;) -global sym(vp9_get8x8var_mmx) PRIVATE -sym(vp9_get8x8var_mmx): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 6 - push rsi - push rdi - push rbx - sub rsp, 16 - ; end prolog - - - pxor mm5, mm5 ; Blank mmx6 - pxor mm6, mm6 ; Blank mmx7 - pxor mm7, mm7 ; Blank mmx7 - - mov rax, arg(0) ;[src_ptr] ; Load base addresses - mov rbx, arg(2) ;[ref_ptr] - movsxd rcx, dword ptr arg(1) ;[source_stride] - movsxd rdx, dword ptr arg(3) ;[recon_stride] - - ; Row 1 - movq mm0, [rax] ; Copy eight bytes to mm0 - movq mm1, [rbx] ; Copy eight bytes to mm1 - movq mm2, mm0 ; Take copies - movq mm3, mm1 ; Take copies - - punpcklbw mm0, mm6 ; unpack to higher prrcision - punpcklbw mm1, mm6 - punpckhbw mm2, mm6 ; unpack to higher prrcision - punpckhbw mm3, mm6 - psubsw mm0, mm1 ; A-B (low order) to MM0 - psubsw mm2, mm3 ; A-B (high order) to MM2 - - paddw mm5, mm0 ; accumulate differences in mm5 - paddw mm5, mm2 ; accumulate differences in mm5 - - pmaddwd mm0, mm0 ; square and accumulate - pmaddwd mm2, mm2 ; square and accumulate - add rbx,rdx ; Inc pointer into ref data - add rax,rcx ; Inc pointer into the new data - movq mm1, [rbx] ; Copy eight bytes to mm1 - paddd mm7, mm0 ; accumulate in mm7 - paddd mm7, mm2 ; accumulate in mm7 - - - ; Row 2 - movq mm0, [rax] ; Copy eight bytes to mm0 - movq mm2, mm0 ; Take copies - movq mm3, mm1 ; Take copies - - punpcklbw mm0, mm6 ; unpack to higher prrcision - punpcklbw mm1, mm6 - punpckhbw mm2, mm6 ; unpack to higher prrcision - punpckhbw mm3, mm6 - psubsw mm0, mm1 ; A-B (low order) to MM0 - psubsw mm2, mm3 ; A-B (high order) to MM2 - - paddw mm5, mm0 ; accumulate differences in mm5 - paddw mm5, mm2 ; accumulate differences in mm5 - - pmaddwd mm0, mm0 ; square and accumulate - pmaddwd mm2, mm2 ; square and accumulate - add rbx,rdx ; Inc pointer into ref data - add rax,rcx ; Inc pointer into the new data - movq mm1, [rbx] ; Copy eight bytes to mm1 - paddd mm7, mm0 ; accumulate in mm7 - paddd mm7, mm2 ; accumulate in mm7 - - ; Row 3 - movq mm0, [rax] ; Copy eight bytes to mm0 - movq mm2, mm0 ; Take copies - movq mm3, mm1 ; Take copies - - punpcklbw mm0, mm6 ; unpack to higher prrcision - punpcklbw mm1, mm6 - punpckhbw mm2, mm6 ; unpack to higher prrcision - punpckhbw mm3, mm6 - psubsw mm0, mm1 ; A-B (low order) to MM0 - psubsw mm2, mm3 ; A-B (high order) to MM2 - - paddw mm5, mm0 ; accumulate differences in mm5 - paddw mm5, mm2 ; accumulate differences in mm5 - - pmaddwd mm0, mm0 ; square and accumulate - pmaddwd mm2, mm2 ; square and accumulate - add rbx,rdx ; Inc pointer into ref data - add rax,rcx ; Inc pointer into the new data - movq mm1, [rbx] ; Copy eight bytes to mm1 - paddd mm7, mm0 ; accumulate in mm7 - paddd mm7, mm2 ; accumulate in mm7 - - ; Row 4 - movq mm0, [rax] ; Copy eight bytes to mm0 - movq mm2, mm0 ; Take copies - movq mm3, mm1 ; Take copies - - punpcklbw mm0, mm6 ; unpack to higher prrcision - punpcklbw mm1, mm6 - punpckhbw mm2, mm6 ; unpack to higher prrcision - punpckhbw mm3, mm6 - psubsw mm0, mm1 ; A-B (low order) to MM0 - psubsw mm2, mm3 ; A-B (high order) to MM2 - - paddw mm5, mm0 ; accumulate differences in mm5 - paddw mm5, mm2 ; accumulate differences in mm5 - - pmaddwd mm0, mm0 ; square and accumulate - pmaddwd mm2, mm2 ; square and accumulate - add rbx,rdx ; Inc pointer into ref data - add rax,rcx ; Inc pointer into the new data - movq mm1, [rbx] ; Copy eight bytes to mm1 - paddd mm7, mm0 ; accumulate in mm7 - paddd mm7, mm2 ; accumulate in mm7 - - ; Row 5 - movq mm0, [rax] ; Copy eight bytes to mm0 - movq mm2, mm0 ; Take copies - movq mm3, mm1 ; Take copies - - punpcklbw mm0, mm6 ; unpack to higher prrcision - punpcklbw mm1, mm6 - punpckhbw mm2, mm6 ; unpack to higher prrcision - punpckhbw mm3, mm6 - psubsw mm0, mm1 ; A-B (low order) to MM0 - psubsw mm2, mm3 ; A-B (high order) to MM2 - - paddw mm5, mm0 ; accumulate differences in mm5 - paddw mm5, mm2 ; accumulate differences in mm5 - - pmaddwd mm0, mm0 ; square and accumulate - pmaddwd mm2, mm2 ; square and accumulate - add rbx,rdx ; Inc pointer into ref data - add rax,rcx ; Inc pointer into the new data - movq mm1, [rbx] ; Copy eight bytes to mm1 - ; movq mm4, [rbx + rdx] - paddd mm7, mm0 ; accumulate in mm7 - paddd mm7, mm2 ; accumulate in mm7 - - ; Row 6 - movq mm0, [rax] ; Copy eight bytes to mm0 - movq mm2, mm0 ; Take copies - movq mm3, mm1 ; Take copies - - punpcklbw mm0, mm6 ; unpack to higher prrcision - punpcklbw mm1, mm6 - punpckhbw mm2, mm6 ; unpack to higher prrcision - punpckhbw mm3, mm6 - psubsw mm0, mm1 ; A-B (low order) to MM0 - psubsw mm2, mm3 ; A-B (high order) to MM2 - - paddw mm5, mm0 ; accumulate differences in mm5 - paddw mm5, mm2 ; accumulate differences in mm5 - - pmaddwd mm0, mm0 ; square and accumulate - pmaddwd mm2, mm2 ; square and accumulate - add rbx,rdx ; Inc pointer into ref data - add rax,rcx ; Inc pointer into the new data - movq mm1, [rbx] ; Copy eight bytes to mm1 - paddd mm7, mm0 ; accumulate in mm7 - paddd mm7, mm2 ; accumulate in mm7 - - ; Row 7 - movq mm0, [rax] ; Copy eight bytes to mm0 - movq mm2, mm0 ; Take copies - movq mm3, mm1 ; Take copies - - punpcklbw mm0, mm6 ; unpack to higher prrcision - punpcklbw mm1, mm6 - punpckhbw mm2, mm6 ; unpack to higher prrcision - punpckhbw mm3, mm6 - psubsw mm0, mm1 ; A-B (low order) to MM0 - psubsw mm2, mm3 ; A-B (high order) to MM2 - - paddw mm5, mm0 ; accumulate differences in mm5 - paddw mm5, mm2 ; accumulate differences in mm5 - - pmaddwd mm0, mm0 ; square and accumulate - pmaddwd mm2, mm2 ; square and accumulate - add rbx,rdx ; Inc pointer into ref data - add rax,rcx ; Inc pointer into the new data - movq mm1, [rbx] ; Copy eight bytes to mm1 - paddd mm7, mm0 ; accumulate in mm7 - paddd mm7, mm2 ; accumulate in mm7 - - ; Row 8 - movq mm0, [rax] ; Copy eight bytes to mm0 - movq mm2, mm0 ; Take copies - movq mm3, mm1 ; Take copies - - punpcklbw mm0, mm6 ; unpack to higher prrcision - punpcklbw mm1, mm6 - punpckhbw mm2, mm6 ; unpack to higher prrcision - punpckhbw mm3, mm6 - psubsw mm0, mm1 ; A-B (low order) to MM0 - psubsw mm2, mm3 ; A-B (high order) to MM2 - - paddw mm5, mm0 ; accumulate differences in mm5 - paddw mm5, mm2 ; accumulate differences in mm5 - - pmaddwd mm0, mm0 ; square and accumulate - pmaddwd mm2, mm2 ; square and accumulate - add rbx,rdx ; Inc pointer into ref data - add rax,rcx ; Inc pointer into the new data - paddd mm7, mm0 ; accumulate in mm7 - paddd mm7, mm2 ; accumulate in mm7 - - ; Now accumulate the final results. - movq QWORD PTR [rsp+8], mm5 ; copy back accumulated results into normal memory - movq QWORD PTR [rsp], mm7 ; copy back accumulated results into normal memory - movsx rdx, WORD PTR [rsp+8] - movsx rcx, WORD PTR [rsp+10] - movsx rbx, WORD PTR [rsp+12] - movsx rax, WORD PTR [rsp+14] - add rdx, rcx - add rbx, rax - add rdx, rbx ;XSum - movsxd rax, DWORD PTR [rsp] - movsxd rcx, DWORD PTR [rsp+4] - add rax, rcx ;XXSum - mov rsi, arg(4) ;SSE - mov rdi, arg(5) ;Sum - mov dword ptr [rsi], eax - mov dword ptr [rdi], edx - xor rax, rax ; return 0 - - - ; begin epilog - add rsp, 16 - pop rbx - pop rdi - pop rsi - UNSHADOW_ARGS - pop rbp - ret - - - -;unsigned int -;vp9_get4x4var_mmx -;( -; unsigned char *src_ptr, -; int source_stride, -; unsigned char *ref_ptr, -; int recon_stride, -; unsigned int *SSE, -; int *Sum -;) -global sym(vp9_get4x4var_mmx) PRIVATE -sym(vp9_get4x4var_mmx): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 6 - push rsi - push rdi - push rbx - sub rsp, 16 - ; end prolog - - - pxor mm5, mm5 ; Blank mmx6 - pxor mm6, mm6 ; Blank mmx7 - pxor mm7, mm7 ; Blank mmx7 - - mov rax, arg(0) ;[src_ptr] ; Load base addresses - mov rbx, arg(2) ;[ref_ptr] - movsxd rcx, dword ptr arg(1) ;[source_stride] - movsxd rdx, dword ptr arg(3) ;[recon_stride] - - ; Row 1 - movd mm0, [rax] ; Copy 4 bytes to mm0 - movd mm1, [rbx] ; Copy 4 bytes to mm1 - punpcklbw mm0, mm6 ; unpack to higher prrcision - punpcklbw mm1, mm6 - psubsw mm0, mm1 ; A-B (low order) to MM0 - paddw mm5, mm0 ; accumulate differences in mm5 - pmaddwd mm0, mm0 ; square and accumulate - add rbx,rdx ; Inc pointer into ref data - add rax,rcx ; Inc pointer into the new data - movd mm1, [rbx] ; Copy 4 bytes to mm1 - paddd mm7, mm0 ; accumulate in mm7 - - - ; Row 2 - movd mm0, [rax] ; Copy 4 bytes to mm0 - punpcklbw mm0, mm6 ; unpack to higher prrcision - punpcklbw mm1, mm6 - psubsw mm0, mm1 ; A-B (low order) to MM0 - paddw mm5, mm0 ; accumulate differences in mm5 - - pmaddwd mm0, mm0 ; square and accumulate - add rbx,rdx ; Inc pointer into ref data - add rax,rcx ; Inc pointer into the new data - movd mm1, [rbx] ; Copy 4 bytes to mm1 - paddd mm7, mm0 ; accumulate in mm7 - - ; Row 3 - movd mm0, [rax] ; Copy 4 bytes to mm0 - punpcklbw mm0, mm6 ; unpack to higher prrcision - punpcklbw mm1, mm6 - psubsw mm0, mm1 ; A-B (low order) to MM0 - paddw mm5, mm0 ; accumulate differences in mm5 - - pmaddwd mm0, mm0 ; square and accumulate - add rbx,rdx ; Inc pointer into ref data - add rax,rcx ; Inc pointer into the new data - movd mm1, [rbx] ; Copy 4 bytes to mm1 - paddd mm7, mm0 ; accumulate in mm7 - - ; Row 4 - movd mm0, [rax] ; Copy 4 bytes to mm0 - - punpcklbw mm0, mm6 ; unpack to higher prrcision - punpcklbw mm1, mm6 - psubsw mm0, mm1 ; A-B (low order) to MM0 - - paddw mm5, mm0 ; accumulate differences in mm5 - - pmaddwd mm0, mm0 ; square and accumulate - paddd mm7, mm0 ; accumulate in mm7 - - - ; Now accumulate the final results. - movq QWORD PTR [rsp+8], mm5 ; copy back accumulated results into normal memory - movq QWORD PTR [rsp], mm7 ; copy back accumulated results into normal memory - movsx rdx, WORD PTR [rsp+8] - movsx rcx, WORD PTR [rsp+10] - movsx rbx, WORD PTR [rsp+12] - movsx rax, WORD PTR [rsp+14] - add rdx, rcx - add rbx, rax - add rdx, rbx ;XSum - movsxd rax, DWORD PTR [rsp] - movsxd rcx, DWORD PTR [rsp+4] - add rax, rcx ;XXSum - mov rsi, arg(4) ;SSE - mov rdi, arg(5) ;Sum - mov dword ptr [rsi], eax - mov dword ptr [rdi], edx - xor rax, rax ; return 0 - - - ; begin epilog - add rsp, 16 - pop rbx - pop rdi - pop rsi - UNSHADOW_ARGS - pop rbp - ret - - - -;unsigned int -;vp9_get4x4sse_cs_mmx -;( -; unsigned char *src_ptr, -; int source_stride, -; unsigned char *ref_ptr, -; int recon_stride -;) -global sym(vp9_get4x4sse_cs_mmx) PRIVATE -sym(vp9_get4x4sse_cs_mmx): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 4 - push rsi - push rdi - push rbx - ; end prolog - - - pxor mm6, mm6 ; Blank mmx7 - pxor mm7, mm7 ; Blank mmx7 - - mov rax, arg(0) ;[src_ptr] ; Load base addresses - mov rbx, arg(2) ;[ref_ptr] - movsxd rcx, dword ptr arg(1) ;[source_stride] - movsxd rdx, dword ptr arg(3) ;[recon_stride] - ; Row 1 - movd mm0, [rax] ; Copy eight bytes to mm0 - movd mm1, [rbx] ; Copy eight bytes to mm1 - punpcklbw mm0, mm6 ; unpack to higher prrcision - punpcklbw mm1, mm6 - psubsw mm0, mm1 ; A-B (low order) to MM0 - pmaddwd mm0, mm0 ; square and accumulate - add rbx,rdx ; Inc pointer into ref data - add rax,rcx ; Inc pointer into the new data - movd mm1, [rbx] ; Copy eight bytes to mm1 - paddd mm7, mm0 ; accumulate in mm7 - - ; Row 2 - movd mm0, [rax] ; Copy eight bytes to mm0 - punpcklbw mm0, mm6 ; unpack to higher prrcision - punpcklbw mm1, mm6 - psubsw mm0, mm1 ; A-B (low order) to MM0 - pmaddwd mm0, mm0 ; square and accumulate - add rbx,rdx ; Inc pointer into ref data - add rax,rcx ; Inc pointer into the new data - movd mm1, [rbx] ; Copy eight bytes to mm1 - paddd mm7, mm0 ; accumulate in mm7 - - ; Row 3 - movd mm0, [rax] ; Copy eight bytes to mm0 - punpcklbw mm1, mm6 - punpcklbw mm0, mm6 ; unpack to higher prrcision - psubsw mm0, mm1 ; A-B (low order) to MM0 - - pmaddwd mm0, mm0 ; square and accumulate - add rbx,rdx ; Inc pointer into ref data - add rax,rcx ; Inc pointer into the new data - movd mm1, [rbx] ; Copy eight bytes to mm1 - paddd mm7, mm0 ; accumulate in mm7 - - ; Row 4 - movd mm0, [rax] ; Copy eight bytes to mm0 - punpcklbw mm0, mm6 ; unpack to higher prrcision - punpcklbw mm1, mm6 - psubsw mm0, mm1 ; A-B (low order) to MM0 - pmaddwd mm0, mm0 ; square and accumulate - paddd mm7, mm0 ; accumulate in mm7 - - movq mm0, mm7 ; - psrlq mm7, 32 - - paddd mm0, mm7 - movq rax, mm0 - - - ; begin epilog - pop rbx - pop rdi - pop rsi - UNSHADOW_ARGS - pop rbp - ret diff --git a/vp9/encoder/x86/vp9_variance_mmx.c b/vp9/encoder/x86/vp9_variance_mmx.c deleted file mode 100644 index ce1c83297..000000000 --- a/vp9/encoder/x86/vp9_variance_mmx.c +++ /dev/null @@ -1,103 +0,0 @@ -/* - * Copyright (c) 2010 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#include "./vpx_config.h" -#include "vp9/encoder/vp9_variance.h" -#include "vpx_ports/mem.h" - -unsigned int vp9_get8x8var_mmx(const uint8_t *src, int src_stride, - const uint8_t *ref, int ref_stride, - unsigned int *sse, int *sum); - -unsigned int vp9_get4x4var_mmx(const uint8_t *src, int src_stride, - const uint8_t *ref, int ref_stride, - unsigned int *SSE, int *sum); - -unsigned int vp9_variance4x4_mmx(const uint8_t *src, int src_stride, - const uint8_t *ref, int ref_stride, - unsigned int *sse) { - int sum; - vp9_get4x4var_mmx(src, src_stride, ref, ref_stride, sse, &sum); - return *sse - (((unsigned int)sum * sum) >> 4); -} - -unsigned int vp9_variance8x8_mmx(const uint8_t *src, int src_stride, - const uint8_t *ref, int ref_stride, - unsigned int *sse) { - int sum; - vp9_get8x8var_mmx(src, src_stride, ref, ref_stride, sse, &sum); - return *sse - (((unsigned int)sum * sum) >> 6); -} - -unsigned int vp9_mse16x16_mmx(const uint8_t *src, int src_stride, - const uint8_t *ref, int ref_stride, - unsigned int *sse) { - unsigned int sse0, sse1, sse2, sse3; - int sum0, sum1, sum2, sum3; - - vp9_get8x8var_mmx(src, src_stride, ref, ref_stride, &sse0, &sum0); - vp9_get8x8var_mmx(src + 8, src_stride, ref + 8, ref_stride, &sse1, &sum1); - vp9_get8x8var_mmx(src + 8 * src_stride, src_stride, - ref + 8 * ref_stride, ref_stride, &sse2, &sum2); - vp9_get8x8var_mmx(src + 8 * src_stride + 8, src_stride, - ref + 8 * ref_stride + 8, ref_stride, &sse3, &sum3); - - *sse = sse0 + sse1 + sse2 + sse3; - return *sse; -} - - -unsigned int vp9_variance16x16_mmx(const uint8_t *src, int src_stride, - const uint8_t *ref, int ref_stride, - unsigned int *sse) { - unsigned int sse0, sse1, sse2, sse3; - int sum0, sum1, sum2, sum3, sum; - - vp9_get8x8var_mmx(src, src_stride, ref, ref_stride, &sse0, &sum0); - vp9_get8x8var_mmx(src + 8, src_stride, ref + 8, ref_stride, &sse1, &sum1); - vp9_get8x8var_mmx(src + 8 * src_stride, src_stride, - ref + 8 * ref_stride, ref_stride, &sse2, &sum2); - vp9_get8x8var_mmx(src + 8 * src_stride + 8, src_stride, - ref + 8 * ref_stride + 8, ref_stride, &sse3, &sum3); - - *sse = sse0 + sse1 + sse2 + sse3; - sum = sum0 + sum1 + sum2 + sum3; - return *sse - (((unsigned int)sum * sum) >> 8); -} - -unsigned int vp9_variance16x8_mmx(const uint8_t *src, int src_stride, - const uint8_t *ref, int ref_stride, - unsigned int *sse) { - unsigned int sse0, sse1; - int sum0, sum1, sum; - - vp9_get8x8var_mmx(src, src_stride, ref, ref_stride, &sse0, &sum0); - vp9_get8x8var_mmx(src + 8, src_stride, ref + 8, ref_stride, &sse1, &sum1); - - *sse = sse0 + sse1; - sum = sum0 + sum1; - return *sse - (((unsigned int)sum * sum) >> 7); -} - - -unsigned int vp9_variance8x16_mmx(const uint8_t *src, int src_stride, - const uint8_t *ref, int ref_stride, - unsigned int *sse) { - unsigned int sse0, sse1; - int sum0, sum1, sum; - - vp9_get8x8var_mmx(src, src_stride, ref, ref_stride, &sse0, &sum0); - vp9_get8x8var_mmx(src + 8 * src_stride, src_stride, - ref + 8 * ref_stride, ref_stride, &sse1, &sum1); - - *sse = sse0 + sse1; - sum = sum0 + sum1; - return *sse - (((unsigned int)sum * sum) >> 7); -} diff --git a/vp9/vp9cx.mk b/vp9/vp9cx.mk index dc46c4e35..3bcecc2b4 100644 --- a/vp9/vp9cx.mk +++ b/vp9/vp9cx.mk @@ -93,8 +93,6 @@ VP9_CX_SRCS-yes += encoder/vp9_temporal_filter.h VP9_CX_SRCS-yes += encoder/vp9_mbgraph.c VP9_CX_SRCS-yes += encoder/vp9_mbgraph.h -VP9_CX_SRCS-$(HAVE_MMX) += encoder/x86/vp9_variance_mmx.c -VP9_CX_SRCS-$(HAVE_MMX) += encoder/x86/vp9_variance_impl_mmx.asm VP9_CX_SRCS-$(HAVE_MMX) += encoder/x86/vp9_sad_mmx.asm VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_variance_impl_sse2.asm VP9_CX_SRCS-$(HAVE_AVX2) += encoder/x86/vp9_variance_impl_intrin_avx2.c -- 2.40.0