From 41a350a83dd080b40d8d0ee209ef400e6c72bde1 Mon Sep 17 00:00:00 2001 From: Jingning Han Date: Thu, 8 May 2014 09:42:26 -0700 Subject: [PATCH] Change eob threshold for partial inverse 8x8 2D-DCT to 12 The scanning order has the first 12 coefficients of the 8x8 2D-DCT sitting in the top left 4x4 block. Hence the partial inverse 8x8 2D-DCT can handle all cases with eob up to and including 12. The overall runtime of the inverse 8x8 2D-DCT unit is reduced from 166 cycles (using SSE2) to 150 cycles (using SSSE3). Change-Id: I4514f9748042809ac84df4c14382c00f313f1cd2 --- test/partial_idct_test.cc | 12 ++++++------ vp9/common/arm/neon/vp9_idct8x8_add_neon.asm | 8 ++++---- vp9/common/mips/dspr2/vp9_itrans8_dspr2.c | 2 +- vp9/common/vp9_idct.c | 6 +++--- vp9/common/vp9_rtcd_defs.pl | 4 ++-- vp9/common/x86/vp9_idct_intrin_sse2.c | 2 +- vp9/common/x86/vp9_idct_ssse3.asm | 2 +- 7 files changed, 18 insertions(+), 18 deletions(-) diff --git a/test/partial_idct_test.cc b/test/partial_idct_test.cc index 8849ce626..e95fc6790 100644 --- a/test/partial_idct_test.cc +++ b/test/partial_idct_test.cc @@ -132,8 +132,8 @@ INSTANTIATE_TEST_CASE_P( &vp9_idct16x16_1_add_c, TX_16X16, 1), make_tuple(&vp9_idct8x8_64_add_c, - &vp9_idct8x8_10_add_c, - TX_8X8, 10), + &vp9_idct8x8_12_add_c, + TX_8X8, 12), make_tuple(&vp9_idct8x8_64_add_c, &vp9_idct8x8_1_add_c, TX_8X8, 1), @@ -154,8 +154,8 @@ INSTANTIATE_TEST_CASE_P( &vp9_idct16x16_1_add_neon, TX_16X16, 1), make_tuple(&vp9_idct8x8_64_add_c, - &vp9_idct8x8_10_add_neon, - TX_8X8, 10), + &vp9_idct8x8_12_add_neon, + TX_8X8, 12), make_tuple(&vp9_idct8x8_64_add_c, &vp9_idct8x8_1_add_neon, TX_8X8, 1), @@ -181,8 +181,8 @@ INSTANTIATE_TEST_CASE_P( &vp9_idct16x16_1_add_sse2, TX_16X16, 1), make_tuple(&vp9_idct8x8_64_add_c, - &vp9_idct8x8_10_add_sse2, - TX_8X8, 10), + &vp9_idct8x8_12_add_sse2, + TX_8X8, 12), make_tuple(&vp9_idct8x8_64_add_c, &vp9_idct8x8_1_add_sse2, TX_8X8, 1), diff --git a/vp9/common/arm/neon/vp9_idct8x8_add_neon.asm 
b/vp9/common/arm/neon/vp9_idct8x8_add_neon.asm index 54764008b..ab5bb6920 100644 --- a/vp9/common/arm/neon/vp9_idct8x8_add_neon.asm +++ b/vp9/common/arm/neon/vp9_idct8x8_add_neon.asm @@ -9,7 +9,7 @@ ; EXPORT |vp9_idct8x8_64_add_neon| - EXPORT |vp9_idct8x8_10_add_neon| + EXPORT |vp9_idct8x8_12_add_neon| ARM REQUIRE8 PRESERVE8 @@ -310,13 +310,13 @@ bx lr ENDP ; |vp9_idct8x8_64_add_neon| -;void vp9_idct8x8_10_add_neon(int16_t *input, uint8_t *dest, int dest_stride) +;void vp9_idct8x8_12_add_neon(int16_t *input, uint8_t *dest, int dest_stride) ; ; r0 int16_t input ; r1 uint8_t *dest ; r2 int dest_stride) -|vp9_idct8x8_10_add_neon| PROC +|vp9_idct8x8_12_add_neon| PROC push {r4-r9} vpush {d8-d15} vld1.s16 {q8,q9}, [r0]! @@ -514,6 +514,6 @@ vpop {d8-d15} pop {r4-r9} bx lr - ENDP ; |vp9_idct8x8_10_add_neon| + ENDP ; |vp9_idct8x8_12_add_neon| END diff --git a/vp9/common/mips/dspr2/vp9_itrans8_dspr2.c b/vp9/common/mips/dspr2/vp9_itrans8_dspr2.c index acccaea6d..fc44ffa31 100644 --- a/vp9/common/mips/dspr2/vp9_itrans8_dspr2.c +++ b/vp9/common/mips/dspr2/vp9_itrans8_dspr2.c @@ -617,7 +617,7 @@ void vp9_iht8x8_64_add_dspr2(const int16_t *input, uint8_t *dest, } } -void vp9_idct8x8_10_add_dspr2(const int16_t *input, uint8_t *dest, +void vp9_idct8x8_12_add_dspr2(const int16_t *input, uint8_t *dest, int dest_stride) { DECLARE_ALIGNED(32, int16_t, out[8 * 8]); int16_t *outptr = out; diff --git a/vp9/common/vp9_idct.c b/vp9/common/vp9_idct.c index 20b78bfed..856d41e70 100644 --- a/vp9/common/vp9_idct.c +++ b/vp9/common/vp9_idct.c @@ -421,7 +421,7 @@ void vp9_iht8x8_64_add_c(const int16_t *input, uint8_t *dest, int stride, } } -void vp9_idct8x8_10_add_c(const int16_t *input, uint8_t *dest, int stride) { +void vp9_idct8x8_12_add_c(const int16_t *input, uint8_t *dest, int stride) { int16_t out[8 * 8] = { 0 }; int16_t *outptr = out; int i, j; @@ -1348,8 +1348,8 @@ void vp9_idct8x8_add(const int16_t *input, uint8_t *dest, int stride, int eob) { if (eob == 1) // DC only DCT coefficient 
vp9_idct8x8_1_add(input, dest, stride); - else if (eob <= 10) - vp9_idct8x8_10_add(input, dest, stride); + else if (eob <= 12) + vp9_idct8x8_12_add(input, dest, stride); else vp9_idct8x8_64_add(input, dest, stride); } diff --git a/vp9/common/vp9_rtcd_defs.pl b/vp9/common/vp9_rtcd_defs.pl index aacabb117..11990265d 100644 --- a/vp9/common/vp9_rtcd_defs.pl +++ b/vp9/common/vp9_rtcd_defs.pl @@ -312,8 +312,8 @@ specialize qw/vp9_idct8x8_1_add sse2 neon dspr2/; add_proto qw/void vp9_idct8x8_64_add/, "const int16_t *input, uint8_t *dest, int dest_stride"; specialize qw/vp9_idct8x8_64_add sse2 neon dspr2/, "$ssse3_x86_64"; -add_proto qw/void vp9_idct8x8_10_add/, "const int16_t *input, uint8_t *dest, int dest_stride"; -specialize qw/vp9_idct8x8_10_add sse2 neon dspr2/, "$ssse3_x86_64"; +add_proto qw/void vp9_idct8x8_12_add/, "const int16_t *input, uint8_t *dest, int dest_stride"; +specialize qw/vp9_idct8x8_12_add sse2 neon dspr2/, "$ssse3_x86_64"; add_proto qw/void vp9_idct16x16_1_add/, "const int16_t *input, uint8_t *dest, int dest_stride"; specialize qw/vp9_idct16x16_1_add sse2 neon dspr2/; diff --git a/vp9/common/x86/vp9_idct_intrin_sse2.c b/vp9/common/x86/vp9_idct_intrin_sse2.c index 13a5b5a82..0231726dc 100644 --- a/vp9/common/x86/vp9_idct_intrin_sse2.c +++ b/vp9/common/x86/vp9_idct_intrin_sse2.c @@ -995,7 +995,7 @@ void vp9_iht8x8_64_add_sse2(const int16_t *input, uint8_t *dest, int stride, RECON_AND_STORE(dest, in[7]); } -void vp9_idct8x8_10_add_sse2(const int16_t *input, uint8_t *dest, int stride) { +void vp9_idct8x8_12_add_sse2(const int16_t *input, uint8_t *dest, int stride) { const __m128i zero = _mm_setzero_si128(); const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING); const __m128i final_rounding = _mm_set1_epi16(1<<4); diff --git a/vp9/common/x86/vp9_idct_ssse3.asm b/vp9/common/x86/vp9_idct_ssse3.asm index 744801179..2c1060710 100644 --- a/vp9/common/x86/vp9_idct_ssse3.asm +++ b/vp9/common/x86/vp9_idct_ssse3.asm @@ -185,7 +185,7 @@ cglobal 
idct8x8_64_add, 3, 5, 13, input, output, stride RET ; inverse 8x8 2D-DCT transform with only first 10 coeffs non-zero -cglobal idct8x8_10_add, 3, 5, 13, input, output, stride +cglobal idct8x8_12_add, 3, 5, 13, input, output, stride mova m8, [pd_8192] mova m11, [pw_16] mova m12, [pw_11585x2] -- 2.40.0