granicus.if.org Git - libvpx/commitdiff
Implement SSE version for sad4x8x4d and SSE2 version for sad8x4x4d.
author Ronald S. Bultje <rbultje@google.com>
Tue, 11 Jun 2013 22:19:14 +0000 (15:19 -0700)
committer Ronald S. Bultje <rbultje@google.com>
Wed, 12 Jun 2013 21:40:01 +0000 (17:40 -0400)
Encoding time of crew (CIF, first 50 frames) @ 1500kbps goes from 4min56
to 4min42.

Change-Id: I92c0c8b32980d2ae7c6dafc8b883a2c7fcd14a9f

test/sad_test.cc
vp9/common/vp9_rtcd_defs.sh
vp9/encoder/x86/vp9_sad4d_sse2.asm

index 9555a9ab5c44e3f4dcc1c6ce9f276f6ee1a8e056..15667be2b9311be9224ec96d7b5acc0f0cce65f0 100644 (file)
@@ -332,15 +332,31 @@ INSTANTIATE_TEST_CASE_P(C, SADTest, ::testing::ValuesIn(c_tests));
 
 #if CONFIG_VP9_ENCODER
 const sad_n_by_n_by_4_fn_t sad_64x64x4d_c = vp9_sad64x64x4d_c;
+const sad_n_by_n_by_4_fn_t sad_64x32x4d_c = vp9_sad64x32x4d_c;
+const sad_n_by_n_by_4_fn_t sad_32x64x4d_c = vp9_sad32x64x4d_c;
 const sad_n_by_n_by_4_fn_t sad_32x32x4d_c = vp9_sad32x32x4d_c;
+const sad_n_by_n_by_4_fn_t sad_32x16x4d_c = vp9_sad32x16x4d_c;
+const sad_n_by_n_by_4_fn_t sad_16x32x4d_c = vp9_sad16x32x4d_c;
 const sad_n_by_n_by_4_fn_t sad_16x16x4d_c = vp9_sad16x16x4d_c;
+const sad_n_by_n_by_4_fn_t sad_16x8x4d_c = vp9_sad16x8x4d_c;
+const sad_n_by_n_by_4_fn_t sad_8x16x4d_c = vp9_sad8x16x4d_c;
 const sad_n_by_n_by_4_fn_t sad_8x8x4d_c = vp9_sad8x8x4d_c;
+const sad_n_by_n_by_4_fn_t sad_8x4x4d_c = vp9_sad8x4x4d_c;
+const sad_n_by_n_by_4_fn_t sad_4x8x4d_c = vp9_sad4x8x4d_c;
 const sad_n_by_n_by_4_fn_t sad_4x4x4d_c = vp9_sad4x4x4d_c;
 INSTANTIATE_TEST_CASE_P(C, SADx4Test, ::testing::Values(
                         make_tuple(64, 64, sad_64x64x4d_c),
+                        make_tuple(64, 32, sad_64x32x4d_c),
+                        make_tuple(32, 64, sad_32x64x4d_c),
                         make_tuple(32, 32, sad_32x32x4d_c),
+                        make_tuple(32, 16, sad_32x16x4d_c),
+                        make_tuple(16, 32, sad_16x32x4d_c),
                         make_tuple(16, 16, sad_16x16x4d_c),
+                        make_tuple(16, 8, sad_16x8x4d_c),
+                        make_tuple(8, 16, sad_8x16x4d_c),
                         make_tuple(8, 8, sad_8x8x4d_c),
+                        make_tuple(8, 4, sad_8x4x4d_c),
+                        make_tuple(4, 8, sad_4x8x4d_c),
                         make_tuple(4, 4, sad_4x4x4d_c)));
 #endif
 
@@ -407,8 +423,10 @@ const sad_m_by_n_fn_t sad_4x4_sse_vp9 = vp9_sad4x4_sse;
 INSTANTIATE_TEST_CASE_P(SSE, SADTest, ::testing::Values(
                         make_tuple(4, 4, sad_4x4_sse_vp9)));
 
+const sad_n_by_n_by_4_fn_t sad_4x8x4d_sse = vp9_sad4x8x4d_sse;
 const sad_n_by_n_by_4_fn_t sad_4x4x4d_sse = vp9_sad4x4x4d_sse;
 INSTANTIATE_TEST_CASE_P(SSE, SADx4Test, ::testing::Values(
+                        make_tuple(4, 8, sad_4x8x4d_sse),
                         make_tuple(4, 4, sad_4x4x4d_sse)));
 #endif
 #endif
@@ -450,18 +468,28 @@ INSTANTIATE_TEST_CASE_P(SSE2, SADTest, ::testing::ValuesIn(sse2_tests));
 
 #if CONFIG_VP9_ENCODER
 const sad_n_by_n_by_4_fn_t sad_64x64x4d_sse2 = vp9_sad64x64x4d_sse2;
+const sad_n_by_n_by_4_fn_t sad_64x32x4d_sse2 = vp9_sad64x32x4d_sse2;
+const sad_n_by_n_by_4_fn_t sad_32x64x4d_sse2 = vp9_sad32x64x4d_sse2;
 const sad_n_by_n_by_4_fn_t sad_32x32x4d_sse2 = vp9_sad32x32x4d_sse2;
+const sad_n_by_n_by_4_fn_t sad_32x16x4d_sse2 = vp9_sad32x16x4d_sse2;
+const sad_n_by_n_by_4_fn_t sad_16x32x4d_sse2 = vp9_sad16x32x4d_sse2;
 const sad_n_by_n_by_4_fn_t sad_16x16x4d_sse2 = vp9_sad16x16x4d_sse2;
 const sad_n_by_n_by_4_fn_t sad_16x8x4d_sse2 = vp9_sad16x8x4d_sse2;
 const sad_n_by_n_by_4_fn_t sad_8x16x4d_sse2 = vp9_sad8x16x4d_sse2;
 const sad_n_by_n_by_4_fn_t sad_8x8x4d_sse2 = vp9_sad8x8x4d_sse2;
+const sad_n_by_n_by_4_fn_t sad_8x4x4d_sse2 = vp9_sad8x4x4d_sse2;
 INSTANTIATE_TEST_CASE_P(SSE2, SADx4Test, ::testing::Values(
                         make_tuple(64, 64, sad_64x64x4d_sse2),
+                        make_tuple(64, 32, sad_64x32x4d_sse2),
+                        make_tuple(32, 64, sad_32x64x4d_sse2),
                         make_tuple(32, 32, sad_32x32x4d_sse2),
+                        make_tuple(32, 16, sad_32x16x4d_sse2),
+                        make_tuple(16, 32, sad_16x32x4d_sse2),
                         make_tuple(16, 16, sad_16x16x4d_sse2),
                         make_tuple(16, 8, sad_16x8x4d_sse2),
                         make_tuple(8, 16, sad_8x16x4d_sse2),
-                        make_tuple(8, 8, sad_8x8x4d_sse2)));
+                        make_tuple(8, 8, sad_8x8x4d_sse2),
+                        make_tuple(8, 4, sad_8x4x4d_sse2)));
 #endif
 #endif
 
index 90eb7e9994740988a37e5f82518ae3480054fa62..a937d829ce4144d53979b00fb0debc2c1e63c30d 100644 (file)
@@ -499,13 +499,14 @@ specialize vp9_sad8x8x4d sse2
 
 # TODO(jingning): need to convert these 4x8/8x4 functions into sse2 form
 prototype void vp9_sad8x4x4d "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array"
-specialize vp9_sad8x4x4d
+specialize vp9_sad8x4x4d sse2
 
 prototype void vp9_sad4x8x4d "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array"
-specialize vp9_sad4x8x4d
+specialize vp9_sad4x8x4d sse
 
 prototype void vp9_sad4x4x4d "const uint8_t *src_ptr, int  src_stride, const uint8_t* const ref_ptr[], int  ref_stride, unsigned int *sad_array"
 specialize vp9_sad4x4x4d sse
+
 prototype unsigned int vp9_sub_pixel_mse16x16 "const uint8_t *src_ptr, int  src_pixels_per_line, int  xoffset, int  yoffset, const uint8_t *dst_ptr, int dst_pixels_per_line, unsigned int *sse"
 specialize vp9_sub_pixel_mse16x16 sse2 mmx
 
index 25dd064e1983233b24faded840933a8151d54594..b4936281f62ec42004d5a6d3db38a82c3ab8385b 100644 (file)
@@ -224,6 +224,8 @@ SADNXN4D 16, 16
 SADNXN4D 16,  8
 SADNXN4D  8, 16
 SADNXN4D  8,  8
+SADNXN4D  8,  4
 
 INIT_MMX sse
+SADNXN4D  4,  8
 SADNXN4D  4,  4