From fa96eeb835c88522740a96dcda77f8685fae96a5 Mon Sep 17 00:00:00 2001 From: "Ronald S. Bultje" Date: Tue, 11 Jun 2013 15:19:14 -0700 Subject: [PATCH] Implement SSE version for sad4x8x4d and SSE2 version for sad8x4x4d. Encoding time of crew (CIF, first 50 frames) @ 1500kbps goes from 4min56 to 4min42. Change-Id: I92c0c8b32980d2ae7c6dafc8b883a2c7fcd14a9f --- test/sad_test.cc | 30 +++++++++++++++++++++++++++++- vp9/common/vp9_rtcd_defs.sh | 5 +++-- vp9/encoder/x86/vp9_sad4d_sse2.asm | 2 ++ 3 files changed, 34 insertions(+), 3 deletions(-) diff --git a/test/sad_test.cc b/test/sad_test.cc index 9555a9ab5..15667be2b 100644 --- a/test/sad_test.cc +++ b/test/sad_test.cc @@ -332,15 +332,31 @@ INSTANTIATE_TEST_CASE_P(C, SADTest, ::testing::ValuesIn(c_tests)); #if CONFIG_VP9_ENCODER const sad_n_by_n_by_4_fn_t sad_64x64x4d_c = vp9_sad64x64x4d_c; +const sad_n_by_n_by_4_fn_t sad_64x32x4d_c = vp9_sad64x32x4d_c; +const sad_n_by_n_by_4_fn_t sad_32x64x4d_c = vp9_sad32x64x4d_c; const sad_n_by_n_by_4_fn_t sad_32x32x4d_c = vp9_sad32x32x4d_c; +const sad_n_by_n_by_4_fn_t sad_32x16x4d_c = vp9_sad32x16x4d_c; +const sad_n_by_n_by_4_fn_t sad_16x32x4d_c = vp9_sad16x32x4d_c; const sad_n_by_n_by_4_fn_t sad_16x16x4d_c = vp9_sad16x16x4d_c; +const sad_n_by_n_by_4_fn_t sad_16x8x4d_c = vp9_sad16x8x4d_c; +const sad_n_by_n_by_4_fn_t sad_8x16x4d_c = vp9_sad8x16x4d_c; const sad_n_by_n_by_4_fn_t sad_8x8x4d_c = vp9_sad8x8x4d_c; +const sad_n_by_n_by_4_fn_t sad_8x4x4d_c = vp9_sad8x4x4d_c; +const sad_n_by_n_by_4_fn_t sad_4x8x4d_c = vp9_sad4x8x4d_c; const sad_n_by_n_by_4_fn_t sad_4x4x4d_c = vp9_sad4x4x4d_c; INSTANTIATE_TEST_CASE_P(C, SADx4Test, ::testing::Values( make_tuple(64, 64, sad_64x64x4d_c), + make_tuple(64, 32, sad_64x32x4d_c), + make_tuple(32, 64, sad_32x64x4d_c), make_tuple(32, 32, sad_32x32x4d_c), + make_tuple(32, 16, sad_32x16x4d_c), + make_tuple(16, 32, sad_16x32x4d_c), make_tuple(16, 16, sad_16x16x4d_c), + make_tuple(16, 8, sad_16x8x4d_c), + make_tuple(8, 16, sad_8x16x4d_c), make_tuple(8, 8, sad_8x8x4d_c), + make_tuple(8, 4, sad_8x4x4d_c), + make_tuple(4, 8, sad_4x8x4d_c), make_tuple(4, 4, sad_4x4x4d_c))); #endif @@ -407,8 +423,10 @@ const sad_m_by_n_fn_t sad_4x4_sse_vp9 = vp9_sad4x4_sse; INSTANTIATE_TEST_CASE_P(SSE, SADTest, ::testing::Values( make_tuple(4, 4, sad_4x4_sse_vp9))); +const sad_n_by_n_by_4_fn_t sad_4x8x4d_sse = vp9_sad4x8x4d_sse; const sad_n_by_n_by_4_fn_t sad_4x4x4d_sse = vp9_sad4x4x4d_sse; INSTANTIATE_TEST_CASE_P(SSE, SADx4Test, ::testing::Values( + make_tuple(4, 8, sad_4x8x4d_sse), make_tuple(4, 4, sad_4x4x4d_sse))); #endif #endif @@ -450,18 +468,28 @@ INSTANTIATE_TEST_CASE_P(SSE2, SADTest, ::testing::ValuesIn(sse2_tests)); #if CONFIG_VP9_ENCODER const sad_n_by_n_by_4_fn_t sad_64x64x4d_sse2 = vp9_sad64x64x4d_sse2; +const sad_n_by_n_by_4_fn_t sad_64x32x4d_sse2 = vp9_sad64x32x4d_sse2; +const sad_n_by_n_by_4_fn_t sad_32x64x4d_sse2 = vp9_sad32x64x4d_sse2; const sad_n_by_n_by_4_fn_t sad_32x32x4d_sse2 = vp9_sad32x32x4d_sse2; +const sad_n_by_n_by_4_fn_t sad_32x16x4d_sse2 = vp9_sad32x16x4d_sse2; +const sad_n_by_n_by_4_fn_t sad_16x32x4d_sse2 = vp9_sad16x32x4d_sse2; const sad_n_by_n_by_4_fn_t sad_16x16x4d_sse2 = vp9_sad16x16x4d_sse2; const sad_n_by_n_by_4_fn_t sad_16x8x4d_sse2 = vp9_sad16x8x4d_sse2; const sad_n_by_n_by_4_fn_t sad_8x16x4d_sse2 = vp9_sad8x16x4d_sse2; const sad_n_by_n_by_4_fn_t sad_8x8x4d_sse2 = vp9_sad8x8x4d_sse2; +const sad_n_by_n_by_4_fn_t sad_8x4x4d_sse2 = vp9_sad8x4x4d_sse2; INSTANTIATE_TEST_CASE_P(SSE2, SADx4Test, ::testing::Values( make_tuple(64, 64, sad_64x64x4d_sse2), + make_tuple(64, 32, sad_64x32x4d_sse2), + make_tuple(32, 64, sad_32x64x4d_sse2), make_tuple(32, 32, sad_32x32x4d_sse2), + make_tuple(32, 16, sad_32x16x4d_sse2), + make_tuple(16, 32, sad_16x32x4d_sse2), make_tuple(16, 16, sad_16x16x4d_sse2), make_tuple(16, 8, sad_16x8x4d_sse2), make_tuple(8, 16, sad_8x16x4d_sse2), - make_tuple(8, 8, sad_8x8x4d_sse2))); + make_tuple(8, 8, sad_8x8x4d_sse2), + make_tuple(8, 4, sad_8x4x4d_sse2))); #endif #endif diff --git a/vp9/common/vp9_rtcd_defs.sh b/vp9/common/vp9_rtcd_defs.sh index 90eb7e999..a937d829c 100644 --- a/vp9/common/vp9_rtcd_defs.sh +++ b/vp9/common/vp9_rtcd_defs.sh @@ -499,13 +499,14 @@ specialize vp9_sad8x8x4d sse2 # TODO(jingning): need to convert these 4x8/8x4 functions into sse2 form prototype void vp9_sad8x4x4d "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array" -specialize vp9_sad8x4x4d +specialize vp9_sad8x4x4d sse2 prototype void vp9_sad4x8x4d "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array" -specialize vp9_sad4x8x4d +specialize vp9_sad4x8x4d sse prototype void vp9_sad4x4x4d "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array" specialize vp9_sad4x4x4d sse + prototype unsigned int vp9_sub_pixel_mse16x16 "const uint8_t *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, const uint8_t *dst_ptr, int dst_pixels_per_line, unsigned int *sse" specialize vp9_sub_pixel_mse16x16 sse2 mmx diff --git a/vp9/encoder/x86/vp9_sad4d_sse2.asm b/vp9/encoder/x86/vp9_sad4d_sse2.asm index 25dd064e1..b4936281f 100644 --- a/vp9/encoder/x86/vp9_sad4d_sse2.asm +++ b/vp9/encoder/x86/vp9_sad4d_sse2.asm @@ -224,6 +224,8 @@ SADNXN4D 16, 16 SADNXN4D 16, 8 SADNXN4D 8, 16 SADNXN4D 8, 8 +SADNXN4D 8, 4 INIT_MMX sse +SADNXN4D 4, 8 SADNXN4D 4, 4 -- 2.50.1