From: Yunqing Wang Date: Wed, 27 Feb 2013 00:27:41 +0000 (-0800) Subject: Optimize vp9_dc_only_idct_add_c function X-Git-Tag: v1.3.0~1151^2~115^2 X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=35bc02c6eb22602997d9c8aebeb46ef588266cc4;p=libvpx Optimize vp9_dc_only_idct_add_c function Wrote SSE2 version of vp9_dc_only_idct_add_c function. In order to improve performance, clipped the absolute diff values to [0, 255]. This allowed us to keep the additions/subtractions in 8 bits. Test showed an over 2% decoder performance increase. Change-Id: Ie1a236d23d207e4ffcd1fc9f3d77462a9c7fe09d --- diff --git a/vp9/common/vp9_idct.h b/vp9/common/vp9_idct.h index 430cec083..d25d0ac2a 100644 --- a/vp9/common/vp9_idct.h +++ b/vp9/common/vp9_idct.h @@ -13,6 +13,13 @@ #include "./vpx_config.h" +#define ROUND_POWER_OF_TWO(value, n) (((value) + (1 << ((n) - 1))) >> (n)) + +/* If we don't want to use ROUND_POWER_OF_TWO macro +static INLINE int16_t round_power_of_two(int16_t value, int n) { + return (value + (1 << (n - 1))) >> n; +}*/ + // Constants and Macros used by all idct/dct functions #define DCT_CONST_BITS 14 #define DCT_CONST_ROUNDING (1 << (DCT_CONST_BITS - 1)) diff --git a/vp9/common/vp9_idctllm.c b/vp9/common/vp9_idctllm.c index 67cfc9d71..19397028b 100644 --- a/vp9/common/vp9_idctllm.c +++ b/vp9/common/vp9_idctllm.c @@ -31,13 +31,6 @@ #include "vp9/common/vp9_common.h" #include "vp9/common/vp9_idct.h" -#define ROUND_POWER_OF_TWO(value, n) (((value) + (1 << ((n) - 1))) >> (n)) - -/* If we don't want to use ROUND_POWER_OF_TWO macro -static INLINE int16_t round_power_of_two(int16_t value, int n) { - return (value + (1 << (n - 1))) >> n; -}*/ - typedef void (*transform_1d)(int16_t*, int16_t*); typedef struct { diff --git a/vp9/common/vp9_rtcd_defs.sh b/vp9/common/vp9_rtcd_defs.sh index 700af7fa7..02a6711e5 100644 --- a/vp9/common/vp9_rtcd_defs.sh +++ b/vp9/common/vp9_rtcd_defs.sh @@ -296,7 +296,7 @@ specialize vp9_short_iht16x16 # dct and add prototype void 
vp9_dc_only_idct_add "int input_dc, uint8_t *pred_ptr, uint8_t *dst_ptr, int pitch, int stride" -specialize vp9_dc_only_idct_add +specialize vp9_dc_only_idct_add sse2 prototype void vp9_short_inv_walsh4x4_1_x8 "int16_t *input, int16_t *output, int pitch" specialize vp9_short_inv_walsh4x4_1_x8 diff --git a/vp9/common/x86/vp9_idctllm_x86.c b/vp9/common/x86/vp9_idctllm_x86.c new file mode 100644 index 000000000..667f5c1d3 --- /dev/null +++ b/vp9/common/x86/vp9_idctllm_x86.c @@ -0,0 +1,76 @@ +/* + * Copyright (c) 2012 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include <assert.h> +#include <emmintrin.h> // SSE2 +#include "./vpx_config.h" +#include "vpx/vpx_integer.h" +#include "vp9/common/vp9_common.h" +#include "vp9/common/vp9_idct.h" + +#if HAVE_SSE2 +// In order to improve performance, clip absolute diff values to [0, 255], +// which allows to keep the additions/subtractions in 8 bits. +void vp9_dc_only_idct_add_sse2(int input_dc, uint8_t *pred_ptr, + uint8_t *dst_ptr, int pitch, int stride) { + int a1; + int16_t out; + uint8_t abs_diff; + __m128i p0, p1, p2, p3; + unsigned int extended_diff; + __m128i diff; + + out = dct_const_round_shift(input_dc * cospi_16_64); + out = dct_const_round_shift(out * cospi_16_64); + a1 = ROUND_POWER_OF_TWO(out, 4); + + // Read prediction data. + p0 = _mm_cvtsi32_si128 (*(const int *)(pred_ptr + 0 * pitch)); + p1 = _mm_cvtsi32_si128 (*(const int *)(pred_ptr + 1 * pitch)); + p2 = _mm_cvtsi32_si128 (*(const int *)(pred_ptr + 2 * pitch)); + p3 = _mm_cvtsi32_si128 (*(const int *)(pred_ptr + 3 * pitch)); + + // Unpack prediction data, and store 4x4 array in 1 XMM register. 
+ p0 = _mm_unpacklo_epi32(p0, p1); + p2 = _mm_unpacklo_epi32(p2, p3); + p0 = _mm_unpacklo_epi64(p0, p2); + + // Clip dc value to [0, 255] range. Then, do addition or subtraction + // according to its sign. + if (a1 >= 0) { + abs_diff = (a1 > 255) ? 255 : a1; + extended_diff = abs_diff * 0x01010101u; + diff = _mm_shuffle_epi32(_mm_cvtsi32_si128((int)extended_diff), 0); + + p1 = _mm_adds_epu8(p0, diff); + } else { + abs_diff = (a1 < -255) ? 255 : -a1; + extended_diff = abs_diff * 0x01010101u; + diff = _mm_shuffle_epi32(_mm_cvtsi32_si128((int)extended_diff), 0); + + p1 = _mm_subs_epu8(p0, diff); + } + + // Store results to dst. + *(int *)dst_ptr = _mm_cvtsi128_si32(p1); + dst_ptr += stride; + + p1 = _mm_srli_si128(p1, 4); + *(int *)dst_ptr = _mm_cvtsi128_si32(p1); + dst_ptr += stride; + + p1 = _mm_srli_si128(p1, 4); + *(int *)dst_ptr = _mm_cvtsi128_si32(p1); + dst_ptr += stride; + + p1 = _mm_srli_si128(p1, 4); + *(int *)dst_ptr = _mm_cvtsi128_si32(p1); +} +#endif diff --git a/vp9/decoder/vp9_decodframe.c b/vp9/decoder/vp9_decodframe.c index d42bccd88..05a1bf9e0 100644 --- a/vp9/decoder/vp9_decodframe.c +++ b/vp9/decoder/vp9_decodframe.c @@ -126,7 +126,7 @@ static void mb_init_dequantizer(VP9D_COMP *pbi, MACROBLOCKD *xd) { xd->inv_txm4x4_1 = vp9_short_idct4x4llm_1; xd->inv_txm4x4 = vp9_short_idct4x4llm; xd->itxm_add = vp9_dequant_idct_add; - xd->dc_only_itxm_add = vp9_dc_only_idct_add_c; + xd->dc_only_itxm_add = vp9_dc_only_idct_add; xd->itxm_add_y_block = vp9_dequant_idct_add_y_block; xd->itxm_add_uv_block = vp9_dequant_idct_add_uv_block; if (xd->lossless) { diff --git a/vp9/decoder/vp9_idct_blk.c b/vp9/decoder/vp9_idct_blk.c index 6e55e45ae..50d874470 100644 --- a/vp9/decoder/vp9_idct_blk.c +++ b/vp9/decoder/vp9_idct_blk.c @@ -47,7 +47,7 @@ void vp9_dequant_idct_add_y_block_c(int16_t *q, const int16_t *dq, if (xd->block[i * 4 + j].eob > 1) vp9_dequant_idct_add_c(q, dq, pre, dst, 16, stride); else { - vp9_dc_only_idct_add_c(q[0]*dq[0], pre, dst, 16, stride); + 
vp9_dc_only_idct_add(q[0]*dq[0], pre, dst, 16, stride); ((int *)q)[0] = 0; } @@ -72,7 +72,7 @@ void vp9_dequant_idct_add_uv_block_c(int16_t *q, const int16_t *dq, if (xd->block[16 + i * 2 + j].eob > 1) vp9_dequant_idct_add_c(q, dq, pre, dstu, 8, stride); else { - vp9_dc_only_idct_add_c(q[0]*dq[0], pre, dstu, 8, stride); + vp9_dc_only_idct_add(q[0]*dq[0], pre, dstu, 8, stride); ((int *)q)[0] = 0; } @@ -90,7 +90,7 @@ void vp9_dequant_idct_add_uv_block_c(int16_t *q, const int16_t *dq, if (xd->block[20 + i * 2 + j].eob > 1) vp9_dequant_idct_add_c(q, dq, pre, dstv, 8, stride); else { - vp9_dc_only_idct_add_c(q[0]*dq[0], pre, dstv, 8, stride); + vp9_dc_only_idct_add(q[0]*dq[0], pre, dstv, 8, stride); ((int *)q)[0] = 0; } diff --git a/vp9/vp9_common.mk b/vp9/vp9_common.mk index eb152f521..f330b464a 100644 --- a/vp9/vp9_common.mk +++ b/vp9/vp9_common.mk @@ -110,10 +110,13 @@ VP9_COMMON_SRCS-yes += common/vp9_maskingmv.c VP9_COMMON_SRCS-$(HAVE_SSE3) += common/x86/vp9_mask_sse3.asm endif +VP9_COMMON_SRCS-$(ARCH_X86)$(ARCH_X86_64) += common/x86/vp9_idctllm_x86.c VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_sadmxn_x86.c ifeq ($(HAVE_SSE2),yes) +vp9/common/x86/vp9_idctllm_x86.c.o: CFLAGS += -msse2 vp9/common/x86/vp9_loopfilter_x86.c.o: CFLAGS += -msse2 vp9/common/x86/vp9_sadmxn_x86.c.o: CFLAGS += -msse2 +vp9/common/x86/vp9_idctllm_x86.c.d: CFLAGS += -msse2 vp9/common/x86/vp9_loopfilter_x86.c.d: CFLAGS += -msse2 vp9/common/x86/vp9_sadmxn_x86.c.d: CFLAGS += -msse2 endif