From: Jian Zhou
Date: Thu, 17 Dec 2015 19:08:17 +0000 (-0800)
Subject: Code clean of sad4xNx4D_sse
X-Git-Tag: v1.6.0~418^2
X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=789dbb3131380f95fc1507de61fe180a12db44f3;p=libvpx

Code clean of sad4xNx4D_sse

Replace MMX with SSE2.

Change-Id: I948ca1be6ed9b8e67f16555e226f1203726b7da6
---

diff --git a/test/sad_test.cc b/test/sad_test.cc
index a144cfce7..3f0f74cae 100644
--- a/test/sad_test.cc
+++ b/test/sad_test.cc
@@ -700,16 +700,6 @@ const SadMxNParam mmx_tests[] = {
 INSTANTIATE_TEST_CASE_P(MMX, SADTest, ::testing::ValuesIn(mmx_tests));
 #endif  // HAVE_MMX
 
-#if HAVE_SSE
-#if CONFIG_USE_X86INC
-const SadMxNx4Param x4d_sse_tests[] = {
-  make_tuple(4, 8, &vpx_sad4x8x4d_sse, -1),
-  make_tuple(4, 4, &vpx_sad4x4x4d_sse, -1),
-};
-INSTANTIATE_TEST_CASE_P(SSE, SADx4Test, ::testing::ValuesIn(x4d_sse_tests));
-#endif  // CONFIG_USE_X86INC
-#endif  // HAVE_SSE
-
 #if HAVE_SSE2
 #if CONFIG_USE_X86INC
 const SadMxNParam sse2_tests[] = {
@@ -828,6 +818,8 @@ const SadMxNx4Param x4d_sse2_tests[] = {
   make_tuple(8, 16, &vpx_sad8x16x4d_sse2, -1),
   make_tuple(8, 8, &vpx_sad8x8x4d_sse2, -1),
   make_tuple(8, 4, &vpx_sad8x4x4d_sse2, -1),
+  make_tuple(4, 8, &vpx_sad4x8x4d_sse2, -1),
+  make_tuple(4, 4, &vpx_sad4x4x4d_sse2, -1),
 #if CONFIG_VP9_HIGHBITDEPTH
   make_tuple(64, 64, &vpx_highbd_sad64x64x4d_sse2, 8),
   make_tuple(64, 32, &vpx_highbd_sad64x32x4d_sse2, 8),
diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl
index 897913571..5facc8215 100644
--- a/vpx_dsp/vpx_dsp_rtcd_defs.pl
+++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl
@@ -1156,10 +1156,10 @@ add_proto qw/void vpx_sad8x4x4d/, "const uint8_t *src_ptr, int src_stride, const
 specialize qw/vpx_sad8x4x4d msa/, "$sse2_x86inc";
 
 add_proto qw/void vpx_sad4x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array";
-specialize qw/vpx_sad4x8x4d msa/, "$sse_x86inc";
+specialize qw/vpx_sad4x8x4d msa/, "$sse2_x86inc";
 
 add_proto qw/void vpx_sad4x4x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array";
-specialize qw/vpx_sad4x4x4d msa/, "$sse_x86inc";
+specialize qw/vpx_sad4x4x4d msa/, "$sse2_x86inc";
 
 #
 # Structured Similarity (SSIM)
diff --git a/vpx_dsp/x86/sad4d_sse2.asm b/vpx_dsp/x86/sad4d_sse2.asm
index a2f0ae79e..3f6e55ce9 100644
--- a/vpx_dsp/x86/sad4d_sse2.asm
+++ b/vpx_dsp/x86/sad4d_sse2.asm
@@ -20,33 +20,41 @@ SECTION .text
   movd                  m4, [ref2q+%3]
   movd                  m7, [ref3q+%3]
   movd                  m5, [ref4q+%3]
-  punpckldq             m0, [srcq +%4]
-  punpckldq             m6, [ref1q+%5]
-  punpckldq             m4, [ref2q+%5]
-  punpckldq             m7, [ref3q+%5]
-  punpckldq             m5, [ref4q+%5]
+  movd                  m1, [srcq +%4]
+  movd                  m2, [ref1q+%5]
+  punpckldq             m0, m1
+  punpckldq             m6, m2
+  movd                  m1, [ref2q+%5]
+  movd                  m2, [ref3q+%5]
+  movd                  m3, [ref4q+%5]
+  punpckldq             m4, m1
+  punpckldq             m7, m2
+  punpckldq             m5, m3
+  movlhps               m0, m0
+  movlhps               m6, m4
+  movlhps               m7, m5
   psadbw                m6, m0
-  psadbw                m4, m0
   psadbw                m7, m0
-  psadbw                m5, m0
-  punpckldq             m6, m4
-  punpckldq             m7, m5
 %else
   movd                  m1, [ref1q+%3]
+  movd                  m5, [ref1q+%5]
   movd                  m2, [ref2q+%3]
+  movd                  m4, [ref2q+%5]
+  punpckldq             m1, m5
+  punpckldq             m2, m4
   movd                  m3, [ref3q+%3]
+  movd                  m5, [ref3q+%5]
+  punpckldq             m3, m5
   movd                  m4, [ref4q+%3]
-  punpckldq             m0, [srcq +%4]
-  punpckldq             m1, [ref1q+%5]
-  punpckldq             m2, [ref2q+%5]
-  punpckldq             m3, [ref3q+%5]
-  punpckldq             m4, [ref4q+%5]
+  movd                  m5, [ref4q+%5]
+  punpckldq             m4, m5
+  movd                  m5, [srcq +%4]
+  punpckldq             m0, m5
+  movlhps               m0, m0
+  movlhps               m1, m2
+  movlhps               m3, m4
   psadbw                m1, m0
-  psadbw                m2, m0
   psadbw                m3, m0
-  psadbw                m4, m0
-  punpckldq             m1, m2
-  punpckldq             m3, m4
   paddd                 m6, m1
   paddd                 m7, m3
 %endif
@@ -170,7 +178,7 @@ SECTION .text
 ; void vpx_sadNxNx4d_sse2(uint8_t *src, int src_stride,
 ;                         uint8_t *ref[4], int ref_stride,
 ;                         uint32_t res[4]);
-; where NxN = 64x64, 32x32, 16x16, 16x8, 8x16 or 8x8
+; where NxN = 64x64, 32x32, 16x16, 16x8, 8x16, 8x8, 8x4, 4x8 and 4x4
 %macro SADNXN4D 2
 %if UNIX64
 cglobal sad%1x%2x4d, 5, 8, 8, src, src_stride, ref1, ref_stride, \
@@ -192,7 +200,7 @@ cglobal sad%1x%2x4d, 4, 7, 8, src, src_stride, ref1, ref_stride, \
 %endrep
   PROCESS_%1x2x4 0, 0, 0, src_strideq, ref_strideq, 0
 
-%if mmsize == 16
+%if %1 > 4
   pslldq                m5, 4
   pslldq                m7, 4
   por                   m4, m5
@@ -207,8 +215,10 @@ cglobal sad%1x%2x4d, 4, 7, 8, src, src_stride, ref1, ref_stride, \
   RET
 %else
   movifnidn             r4, r4mp
-  movq             [r4+0], m6
-  movq             [r4+8], m7
+  pshufd                m6, m6, 0x08
+  pshufd                m7, m7, 0x08
+  movq             [r4+0], m6
+  movq             [r4+8], m7
   RET
 %endif
 %endmacro
@@ -225,7 +235,5 @@ SADNXN4D 16, 8
 SADNXN4D 8, 16
 SADNXN4D 8, 8
 SADNXN4D 8, 4
-
-INIT_MMX sse
 SADNXN4D 4, 8
 SADNXN4D 4, 4
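
Note: for readers unfamiliar with the x4d variants, each call computes the SAD of one 4xN source block against the co-located block in four candidate reference buffers at once, per the prototype declared in vpx_dsp_rtcd_defs.pl above. A minimal plain-C sketch of that contract (the function name and explicit height parameter are illustrative; libvpx dispatches fixed-size variants):

  #include <stdint.h>
  #include <stdlib.h>

  /* Illustrative reference: SAD of a 4x`height` source block against the
     same-position block in each of four reference buffers. */
  static void sad4xhx4d_sketch(const uint8_t *src_ptr, int src_stride,
                               const uint8_t *const ref_ptr[4], int ref_stride,
                               int height, uint32_t sad_array[4]) {
    for (int i = 0; i < 4; ++i) {
      uint32_t sad = 0;
      for (int y = 0; y < height; ++y) {
        for (int x = 0; x < 4; ++x) {
          sad += (uint32_t)abs(src_ptr[y * src_stride + x] -
                               ref_ptr[i][y * ref_stride + x]);
        }
      }
      sad_array[i] = sad;
    }
  }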
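The reason SSE2 can take over the 4-wide cases: the new PROCESS_4x2x4 code glues two 4-byte rows from two reference blocks into one 16-byte XMM register (movd + punpckldq + movlhps), so a single psadbw compares two candidates at once, and a final pshufd/movq pair writes out all four 32-bit results. A rough intrinsics rendering of that packing for the 4x4 case (a sketch of the strategy, not the shipped code; memcpy stands in for the unaligned movd loads):

  #include <emmintrin.h>  /* SSE2 */
  #include <stdint.h>
  #include <string.h>

  /* movd + movd + punpckldq: two 4-byte rows in the low 8 bytes. */
  static __m128i load_2x4(const uint8_t *p, int stride) {
    uint32_t row0, row1;
    memcpy(&row0, p, 4);
    memcpy(&row1, p + stride, 4);
    return _mm_unpacklo_epi32(_mm_cvtsi32_si128((int)row0),
                              _mm_cvtsi32_si128((int)row1));
  }

  static void sad4x4x4d_sse2_sketch(const uint8_t *src, int src_stride,
                                    const uint8_t *const ref[4],
                                    int ref_stride, uint32_t res[4]) {
    __m128i sum01 = _mm_setzero_si128();  /* running SADs for ref[0], ref[1] */
    __m128i sum23 = _mm_setzero_si128();  /* running SADs for ref[2], ref[3] */
    for (int y = 0; y < 4; y += 2) {
      /* movlhps m0, m0: duplicate the two source rows into both halves. */
      __m128i s = load_2x4(src + y * src_stride, src_stride);
      s = _mm_unpacklo_epi64(s, s);
      /* movlhps m6, m4 / m7, m5: two references side by side per register. */
      __m128i r01 =
          _mm_unpacklo_epi64(load_2x4(ref[0] + y * ref_stride, ref_stride),
                             load_2x4(ref[1] + y * ref_stride, ref_stride));
      __m128i r23 =
          _mm_unpacklo_epi64(load_2x4(ref[2] + y * ref_stride, ref_stride),
                             load_2x4(ref[3] + y * ref_stride, ref_stride));
      /* One psadbw yields the two-row SAD for two references,
         one result per 64-bit lane; paddd accumulates across row pairs. */
      sum01 = _mm_add_epi32(sum01, _mm_sad_epu8(r01, s));
      sum23 = _mm_add_epi32(sum23, _mm_sad_epu8(r23, s));
    }
    /* pshufd imm 0x08 packs dword lanes 0 and 2 into the low quadword,
       then movq stores two 32-bit results at a time. */
    _mm_storel_epi64((__m128i *)(res + 0),
                     _mm_shuffle_epi32(sum01, _MM_SHUFFLE(0, 0, 2, 0)));
    _mm_storel_epi64((__m128i *)(res + 2),
                     _mm_shuffle_epi32(sum23, _MM_SHUFFLE(0, 0, 2, 0)));
  }

This is also why the guard changed from "%if mmsize == 16" to "%if %1 > 4": with everything now built as SSE2, register width no longer distinguishes the 4-wide blocks, so the macro branches on block width instead.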