From: Jingning Han
Date: Fri, 2 May 2014 23:29:08 +0000 (-0700)
Subject: SSSE3 8x8 inverse 2D-DCT with first 10 coeffs non-zero
X-Git-Tag: v1.4.0~1625^2~2
X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=9e7b09bc5d45e309c0deeb1c95981ccb7b857876;p=libvpx

SSSE3 8x8 inverse 2D-DCT with first 10 coeffs non-zero

This commit enables an SSSE3 assembly implementation of the 8x8 inverse
2D-DCT for blocks in which only the first 10 coefficients are non-zero.
The average runtime for this unit goes down from 198 cycles to 129
cycles (a 34.8% reduction).

Change-Id: Ie7fa4386f6d3a2fe0d47a2eb26fc2a6bbc592ac7
---

diff --git a/vp9/common/vp9_rtcd_defs.pl b/vp9/common/vp9_rtcd_defs.pl
index de69772d3..aacabb117 100644
--- a/vp9/common/vp9_rtcd_defs.pl
+++ b/vp9/common/vp9_rtcd_defs.pl
@@ -313,7 +313,7 @@ add_proto qw/void vp9_idct8x8_64_add/, "const int16_t *input, uint8_t *dest, int
 specialize qw/vp9_idct8x8_64_add sse2 neon dspr2/, "$ssse3_x86_64";
 
 add_proto qw/void vp9_idct8x8_10_add/, "const int16_t *input, uint8_t *dest, int dest_stride";
-specialize qw/vp9_idct8x8_10_add sse2 neon dspr2/;
+specialize qw/vp9_idct8x8_10_add sse2 neon dspr2/, "$ssse3_x86_64";
 
 add_proto qw/void vp9_idct16x16_1_add/, "const int16_t *input, uint8_t *dest, int dest_stride";
 specialize qw/vp9_idct16x16_1_add sse2 neon dspr2/;
diff --git a/vp9/common/x86/vp9_idct_ssse3.asm b/vp9/common/x86/vp9_idct_ssse3.asm
index f2a120f04..744801179 100644
--- a/vp9/common/x86/vp9_idct_ssse3.asm
+++ b/vp9/common/x86/vp9_idct_ssse3.asm
@@ -28,6 +28,29 @@ TRANSFORM_COEFFS 6270, 15137
 TRANSFORM_COEFFS 3196, 16069
 TRANSFORM_COEFFS 13623, 9102
 
+%macro PAIR_PP_COEFFS 2
+dpw_%1_%2: dw %1, %1, %1, %1, %2, %2, %2, %2
+%endmacro
+
+%macro PAIR_MP_COEFFS 2
+dpw_m%1_%2: dw -%1, -%1, -%1, -%1, %2, %2, %2, %2
+%endmacro
+
+%macro PAIR_MM_COEFFS 2
+dpw_m%1_m%2: dw -%1, -%1, -%1, -%1, -%2, -%2, -%2, -%2
+%endmacro
+
+PAIR_PP_COEFFS 30274, 12540
+PAIR_PP_COEFFS 6392, 32138
+PAIR_MP_COEFFS 18204, 27246
+
+PAIR_PP_COEFFS 12540, 12540
+PAIR_PP_COEFFS 30274, 30274
+PAIR_PP_COEFFS 6392, 6392
+PAIR_PP_COEFFS 32138, 32138
+PAIR_MM_COEFFS 18204, 18204
+PAIR_PP_COEFFS 27246, 27246
+
 SECTION .text
 
 %if ARCH_X86_64
@@ -128,6 +151,7 @@ SECTION .text
 %endmacro
 
 INIT_XMM ssse3
+; full inverse 8x8 2D-DCT transform
 cglobal idct8x8_64_add, 3, 5, 13, input, output, stride
   mova m8, [pd_8192]
   mova m11, [pw_16]
@@ -159,4 +183,118 @@ cglobal idct8x8_64_add, 3, 5, 13, input, output, stride
   ADD_STORE_8P_2X 6, 7, 9, 10, 12
 
   RET
+
+; inverse 8x8 2D-DCT transform with only first 10 coeffs non-zero
+cglobal idct8x8_10_add, 3, 5, 13, input, output, stride
+  mova m8, [pd_8192]
+  mova m11, [pw_16]
+  mova m12, [pw_11585x2]
+
+  lea r3, [2 * strideq]
+
+  mova m0, [inputq + 0]
+  mova m1, [inputq + 16]
+  mova m2, [inputq + 32]
+  mova m3, [inputq + 48]
+
+  punpcklwd m0, m1
+  punpcklwd m2, m3
+  punpckhdq m9, m0, m2
+  punpckldq m0, m2
+  SWAP 2, 9
+
+  ; m0 -> [0], [0]
+  ; m1 -> [1], [1]
+  ; m2 -> [2], [2]
+  ; m3 -> [3], [3]
+  punpckhqdq m10, m0, m0
+  punpcklqdq m0, m0
+  punpckhqdq m9, m2, m2
+  punpcklqdq m2, m2
+  SWAP 1, 10
+  SWAP 3, 9
+
+  pmulhrsw m0, m12
+  pmulhrsw m2, [dpw_30274_12540]
+  pmulhrsw m1, [dpw_6392_32138]
+  pmulhrsw m3, [dpw_m18204_27246]
+
+  SUM_SUB 0, 2, 9
+  SUM_SUB 1, 3, 9
+
+  punpcklqdq m9, m3, m3
+  punpckhqdq m5, m3, m9
+
+  SUM_SUB 3, 5, 9
+  punpckhqdq m5, m3
+  pmulhrsw m5, m12
+
+  punpckhqdq m9, m1, m5
+  punpcklqdq m1, m5
+  SWAP 5, 9
+
+  SUM_SUB 0, 5, 9
+  SUM_SUB 2, 1, 9
+
+  punpckhqdq m3, m0, m0
+  punpckhqdq m4, m1, m1
+  punpckhqdq m6, m5, m5
+  punpckhqdq m7, m2, m2
+
+  punpcklwd m0, m3
+  punpcklwd m7, m2
+  punpcklwd m1, m4
+  punpcklwd m6, m5
+
+  punpckhdq m4, m0, m7
+  punpckldq m0, m7
+  punpckhdq m10, m1, m6
+  punpckldq m5, m1, m6
+
+  punpckhqdq m1, m0, m5
+  punpcklqdq m0, m5
+  punpckhqdq m3, m4, m10
+  punpcklqdq m2, m4, m10
+
+
+  pmulhrsw m0, m12
+  pmulhrsw m6, m2, [dpw_30274_30274]
+  pmulhrsw m4, m2, [dpw_12540_12540]
+
+  pmulhrsw m7, m1, [dpw_32138_32138]
+  pmulhrsw m1, [dpw_6392_6392]
+  pmulhrsw m5, m3, [dpw_m18204_m18204]
+  pmulhrsw m3, [dpw_27246_27246]
+
+  mova m2, m0
+  SUM_SUB 0, 6, 9
+  SUM_SUB 2, 4, 9
+  SUM_SUB 1, 5, 9
+  SUM_SUB 7, 3, 9
+
+  SUM_SUB 3, 5, 9
+  pmulhrsw m3, m12
+  pmulhrsw m5, m12
+
+  SUM_SUB 0, 7, 9
+  SUM_SUB 2, 3, 9
+  SUM_SUB 4, 5, 9
+  SUM_SUB 6, 1, 9
+
+  SWAP 3, 6
+  SWAP 1, 2
+  SWAP 2, 4
+
+
+  pxor m12, m12
+  ADD_STORE_8P_2X 0, 1, 9, 10, 12
+  lea outputq, [outputq + r3]
+  ADD_STORE_8P_2X 2, 3, 9, 10, 12
+  lea outputq, [outputq + r3]
+  ADD_STORE_8P_2X 4, 5, 9, 10, 12
+  lea outputq, [outputq + r3]
+  ADD_STORE_8P_2X 6, 7, 9, 10, 12
+
+  RET
+
 %endif
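
For context, the new vp9_idct8x8_10_add routine is not called directly by the
decoder; it is reached through the eob-based selection in
vp9/common/vp9_idct.c. The C sketch below is an illustrative reconstruction
of that dispatch, not part of this patch: the dispatch function name and the
comments are the editor's, modelled on vp9_idct8x8_add(). It shows when the
129-cycle path is taken: whenever the end-of-block value is at most 10, i.e.
the last non-zero coefficient lies within the first 10 scan positions, which
for the scan used with the 2D-DCT all fall inside the top-left 4x4 corner of
the 8x8 block. That is why the assembly above only reads the low four words
of the first four input rows.

    #include <stdint.h>

    /* Prototypes as generated from vp9_rtcd_defs.pl; the specialize line in
     * this patch makes vp9_idct8x8_10_add resolve to the new SSSE3 routine
     * on x86_64 builds. */
    void vp9_idct8x8_1_add(const int16_t *input, uint8_t *dest, int stride);
    void vp9_idct8x8_10_add(const int16_t *input, uint8_t *dest, int stride);
    void vp9_idct8x8_64_add(const int16_t *input, uint8_t *dest, int stride);

    /* Sketch of the decoder-side selection (hypothetical wrapper name). */
    void idct8x8_add_dispatch(const int16_t *input, uint8_t *dest, int stride,
                              int eob) {
      if (eob == 1) {
        /* DC-only block. */
        vp9_idct8x8_1_add(input, dest, stride);
      } else if (eob <= 10) {
        /* All non-zero coefficients sit in the first 10 scan positions,
         * i.e. the top-left 4x4 corner, so the reduced transform is enough. */
        vp9_idct8x8_10_add(input, dest, stride);
      } else {
        /* General case: full 64-coefficient inverse transform. */
        vp9_idct8x8_64_add(input, dest, stride);
      }
    }

The "$ssse3_x86_64" qualifier in vp9_rtcd_defs.pl, like the %if ARCH_X86_64
guard in the assembly file, restricts this code path to 64-bit builds, where
the routine's use of xmm8-xmm12 is available.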