From: Timothy B. Terriberry
Date: Fri, 17 May 2013 17:11:30 +0000 (-0700)
Subject: Reduce WHT complexity.
X-Git-Tag: v1.3.0~1104^2~132
X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=95339d68258b8f752e114e6c81e6e19f99cde5a4;p=libvpx

Reduce WHT complexity.

Saves 1 add, 3 shifts (and a shift bias) per 1-D transform.

Change-Id: I1104bb1679fe342b2f9677df8a9cdc0cb9699e7d
---

diff --git a/vp9/common/vp9_idct.c b/vp9/common/vp9_idct.c
index f61230c80..026ba913d 100644
--- a/vp9/common/vp9_idct.c
+++ b/vp9/common/vp9_idct.c
@@ -19,22 +19,32 @@
 #include "vp9/common/vp9_idct.h"
 
 void vp9_short_iwalsh4x4_add_c(int16_t *input, uint8_t *dest, int dest_stride) {
+/* 4-point reversible, orthonormal inverse Walsh-Hadamard in 3.5 adds,
+   0.5 shifts per pixel. */
   int i;
   int16_t output[16];
-  int a1, b1, c1, d1;
+  int a1, b1, c1, d1, e1;
   int16_t *ip = input;
   int16_t *op = output;
 
   for (i = 0; i < 4; i++) {
-    a1 = (ip[0] + ip[3]) >> WHT_UPSCALE_FACTOR;
-    b1 = (ip[1] + ip[2]) >> WHT_UPSCALE_FACTOR;
-    c1 = (ip[1] - ip[2]) >> WHT_UPSCALE_FACTOR;
-    d1 = (ip[0] - ip[3]) >> WHT_UPSCALE_FACTOR;
-
-    op[0] = (a1 + b1 + 1) >> 1;
-    op[1] = (c1 + d1) >> 1;
-    op[2] = (a1 - b1) >> 1;
-    op[3] = (d1 - c1) >> 1;
+    a1 = ip[0] >> WHT_UPSCALE_FACTOR;
+    c1 = ip[1] >> WHT_UPSCALE_FACTOR;
+    d1 = ip[2] >> WHT_UPSCALE_FACTOR;
+    b1 = ip[3] >> WHT_UPSCALE_FACTOR;
+
+    c1 = a1 - c1;
+    b1 += d1;
+    e1 = (c1 - b1) >> 1;
+    a1 -= e1;
+    d1 += e1;
+    b1 = a1 - b1;
+    c1 -= d1;
+
+    op[0] = a1;
+    op[1] = b1;
+    op[2] = c1;
+    op[3] = d1;
 
     ip += 4;
     op += 4;
@@ -42,20 +52,23 @@ void vp9_short_iwalsh4x4_add_c(int16_t *input, uint8_t *dest, int dest_stride) {
 
   ip = output;
   for (i = 0; i < 4; i++) {
-    a1 = ip[4 * 0] + ip[4 * 3];
-    b1 = ip[4 * 1] + ip[4 * 2];
-    c1 = ip[4 * 1] - ip[4 * 2];
-    d1 = ip[4 * 0] - ip[4 * 3];
-
-
-    dest[dest_stride * 0] = clip_pixel(dest[dest_stride * 0] +
-                                       ((a1 + b1 + 1) >> 1));
-    dest[dest_stride * 1] = clip_pixel(dest[dest_stride * 1] +
-                                       ((c1 + d1) >> 1));
-    dest[dest_stride * 2] = clip_pixel(dest[dest_stride * 2] +
-                                       ((a1 - b1) >> 1));
-    dest[dest_stride * 3] = clip_pixel(dest[dest_stride * 3] +
-                                       ((d1 - c1) >> 1));
+    a1 = ip[4 * 0];
+    c1 = ip[4 * 1];
+    d1 = ip[4 * 2];
+    b1 = ip[4 * 3];
+
+    c1 = a1 - c1;
+    b1 += d1;
+    e1 = (c1 - b1) >> 1;
+    a1 -= e1;
+    d1 += e1;
+    b1 = a1 - b1;
+    c1 -= d1;
+
+    dest[dest_stride * 0] = clip_pixel(dest[dest_stride * 0] + a1);
+    dest[dest_stride * 1] = clip_pixel(dest[dest_stride * 1] + b1);
+    dest[dest_stride * 2] = clip_pixel(dest[dest_stride * 2] + c1);
+    dest[dest_stride * 3] = clip_pixel(dest[dest_stride * 3] + d1);
 
     ip++;
     dest++;
@@ -64,23 +77,24 @@ void vp9_short_iwalsh4x4_add_c(int16_t *input, uint8_t *dest, int dest_stride) {
 
 void vp9_short_iwalsh4x4_1_add_c(int16_t *in, uint8_t *dest, int dest_stride) {
   int i;
+  int a1, e1;
   int16_t tmp[4];
   int16_t *ip = in;
   int16_t *op = tmp;
 
-  op[0] = ((ip[0] >> WHT_UPSCALE_FACTOR) + 1) >> 1;
-  op[1] = op[2] = op[3] = (ip[0] >> WHT_UPSCALE_FACTOR) >> 1;
+  a1 = ip[0] >> WHT_UPSCALE_FACTOR;
+  e1 = a1 >> 1;
+  op[0] = op[1] = op[2] = a1 - e1;
+  op[3] = e1;
 
   ip = tmp;
   for (i = 0; i < 4; i++) {
-    dest[dest_stride * 0] = clip_pixel(dest[dest_stride * 0] +
-                                       ((ip[0] + 1) >> 1));
-    dest[dest_stride * 1] = clip_pixel(dest[dest_stride * 1] +
-                                       (ip[0] >> 1));
-    dest[dest_stride * 2] = clip_pixel(dest[dest_stride * 2] +
-                                       (ip[0] >> 1));
-    dest[dest_stride * 3] = clip_pixel(dest[dest_stride * 3] +
-                                       (ip[0] >> 1));
+    e1 = ip[0] >> 1;
+    a1 = ip[0] - e1;
+    dest[dest_stride * 0] = clip_pixel(dest[dest_stride * 0] + a1);
+    dest[dest_stride * 1] = clip_pixel(dest[dest_stride * 1] + a1);
+    dest[dest_stride * 2] = clip_pixel(dest[dest_stride * 2] + a1);
+    dest[dest_stride * 3] = clip_pixel(dest[dest_stride * 3] + e1);
     ip++;
     dest++;
   }
diff --git a/vp9/encoder/vp9_dct.c b/vp9/encoder/vp9_dct.c
index ebf40e4e6..d22644424 100644
--- a/vp9/encoder/vp9_dct.c
+++ b/vp9/encoder/vp9_dct.c
@@ -591,23 +591,33 @@ void vp9_short_fht8x8_c(int16_t *input, int16_t *output,
   }
 }
 
+/* 4-point reversible, orthonormal Walsh-Hadamard in 3.5 adds, 0.5 shifts per
+   pixel. */
 void vp9_short_walsh4x4_c(short *input, short *output, int pitch) {
   int i;
-  int a1, b1, c1, d1;
+  int a1, b1, c1, d1, e1;
   short *ip = input;
   short *op = output;
   int pitch_short = pitch >> 1;
 
   for (i = 0; i < 4; i++) {
-    a1 = ip[0 * pitch_short] + ip[3 * pitch_short];
-    b1 = ip[1 * pitch_short] + ip[2 * pitch_short];
-    c1 = ip[1 * pitch_short] - ip[2 * pitch_short];
-    d1 = ip[0 * pitch_short] - ip[3 * pitch_short];
-
-    op[0] = (a1 + b1 + 1) >> 1;
-    op[4] = (c1 + d1) >> 1;
-    op[8] = (a1 - b1) >> 1;
-    op[12] = (d1 - c1) >> 1;
+    a1 = ip[0 * pitch_short];
+    b1 = ip[1 * pitch_short];
+    c1 = ip[2 * pitch_short];
+    d1 = ip[3 * pitch_short];
+
+    b1 = a1 - b1;
+    c1 += d1;
+    e1 = (c1 - b1) >> 1;
+    a1 += e1;
+    d1 -= e1;
+    c1 = a1 - c1;
+    b1 -= d1;
+
+    op[0] = a1;
+    op[4] = c1;
+    op[8] = d1;
+    op[12] = b1;
 
     ip++;
     op++;
@@ -616,15 +626,23 @@ void vp9_short_walsh4x4_c(short *input, short *output, int pitch) {
   op = output;
 
   for (i = 0; i < 4; i++) {
-    a1 = ip[0] + ip[3];
-    b1 = ip[1] + ip[2];
-    c1 = ip[1] - ip[2];
-    d1 = ip[0] - ip[3];
-
-    op[0] = ((a1 + b1 + 1) >> 1) << WHT_UPSCALE_FACTOR;
-    op[1] = ((c1 + d1) >> 1) << WHT_UPSCALE_FACTOR;
-    op[2] = ((a1 - b1) >> 1) << WHT_UPSCALE_FACTOR;
-    op[3] = ((d1 - c1) >> 1) << WHT_UPSCALE_FACTOR;
+    a1 = ip[0];
+    b1 = ip[1];
+    c1 = ip[2];
+    d1 = ip[3];
+
+    b1 = a1 - b1;
+    c1 += d1;
+    e1 = (c1 - b1) >> 1;
+    a1 += e1;
+    d1 -= e1;
+    c1 = a1 - c1;
+    b1 -= d1;
+
+    op[0] = a1 << WHT_UPSCALE_FACTOR;
+    op[1] = c1 << WHT_UPSCALE_FACTOR;
+    op[2] = d1 << WHT_UPSCALE_FACTOR;
+    op[3] = b1 << WHT_UPSCALE_FACTOR;
 
     ip += 4;
     op += 4;