From 46a487299946e8a2130c3629bfaac1252ff068c4 Mon Sep 17 00:00:00 2001
From: Loren Merritt
Date: Sun, 5 Jun 2005 01:09:38 +0000
Subject: [PATCH] slightly faster 8x8 dct

git-svn-id: svn://svn.videolan.org/x264/trunk@249 df754926-b1dd-0310-bc7b-ec298dee348c
---
Review notes (commentary only; patch tools ignore free text between the
"---" marker and the first "diff --git" line):

 * DCT8_1D and IDCT8_1D replace the dct8_1d()/idct8_1d() helper functions.
   Each macro expands to a braced block and relies on the caller to #define
   SRC(x) (and, for IDCT8_1D, DST(x,rhs)) before use and #undef them after.
 * Both macros load every SRC() value into const locals before the first
   store, so SRC() may name the same storage that is being written; that is
   what lets the transforms run in place.
 * The removed helpers read src[i][..] and wrote dst[..][i] (a transpose per
   pass).  The new code keeps the same row-pass-then-column-pass order by
   running the macro first with SRC(x) = dct[i][x] and then with
   SRC(x) = dct[x][i], so the d[8][8] and tmp[8][8] temporaries go away.
 * add8x8_idct8() now overwrites its dct argument: it adds 32 to dct[0][0]
   and the first pass stores its results back into dct.  Callers outside
   this patch that reuse the coefficients afterwards need checking.
 * The "+ 32" that used to be applied per sample before ">> 6" is now added
   once to dct[0][0].  SRC(0) enters every IDCT8_1D output with weight +1
   (through a0 or a2), so the bias reaches each of the 64 sums exactly once.
   NOTE(review): the biased intermediates are stored as int16_t; confirm the
   coefficient range leaves headroom for the extra 32.
 * The second-pass DST() definition already ends in ';' and is invoked as
   "DST(..);", so each store is followed by an empty statement.  Its x
   argument is used unparenthesized in "x*i_dst"; it is only ever passed
   integer literals here.
 * p_dst is renamed to dst in add8x8_idct8() and add16x16_idct8().

 common/dct.c | 175 ++++++++++++++++++++++++---------------------------
 1 file changed, 82 insertions(+), 93 deletions(-)

diff --git a/common/dct.c b/common/dct.c
index 7c6b2b2a..b5bc95b7 100644
--- a/common/dct.c
+++ b/common/dct.c
@@ -260,60 +260,56 @@ static void add16x16_idct( uint8_t *p_dst, int i_dst, int16_t dct[16][4][4] )
  * 8x8 transform:
  ****************************************************************************/
 
-static inline void dct8_1d( int16_t src[8][8], int16_t dst[8][8] )
-{
-    int i;
-    for( i = 0; i < 8; i++ )
-    {
-        const int s07 = src[i][0] + src[i][7];
-        const int s16 = src[i][1] + src[i][6];
-        const int s25 = src[i][2] + src[i][5];
-        const int s34 = src[i][3] + src[i][4];
-
-        const int a0 = s07 + s34;
-        const int a1 = s16 + s25;
-        const int a2 = s07 - s34;
-        const int a3 = s16 - s25;
-
-        const int d07 = src[i][0] - src[i][7];
-        const int d16 = src[i][1] - src[i][6];
-        const int d25 = src[i][2] - src[i][5];
-        const int d34 = src[i][3] - src[i][4];
-
-        const int a4 = d16 + d25 + (d07 + (d07>>1));
-        const int a5 = d07 - d34 - (d25 + (d25>>1));
-        const int a6 = d07 + d34 - (d16 + (d16>>1));
-        const int a7 = d16 - d25 + (d34 + (d34>>1));
-
-        dst[0][i] = a0 + a1;
-        dst[1][i] = a4 + (a7>>2);
-        dst[2][i] = a2 + (a3>>1);
-        dst[3][i] = a5 + (a6>>2);
-        dst[4][i] = a0 - a1;
-        dst[5][i] = a6 - (a5>>2);
-        dst[6][i] = (a2>>1) - a3;
-        dst[7][i] = (a4>>2) - a7;
-    }
+#define DCT8_1D {\
+    const int s07 = SRC(0) + SRC(7);\
+    const int s16 = SRC(1) + SRC(6);\
+    const int s25 = SRC(2) + SRC(5);\
+    const int s34 = SRC(3) + SRC(4);\
+    const int a0 = s07 + s34;\
+    const int a1 = s16 + s25;\
+    const int a2 = s07 - s34;\
+    const int a3 = s16 - s25;\
+    const int d07 = SRC(0) - SRC(7);\
+    const int d16 = SRC(1) - SRC(6);\
+    const int d25 = SRC(2) - SRC(5);\
+    const int d34 = SRC(3) - SRC(4);\
+    const int a4 = d16 + d25 + (d07 + (d07>>1));\
+    const int a5 = d07 - d34 - (d25 + (d25>>1));\
+    const int a6 = d07 + d34 - (d16 + (d16>>1));\
+    const int a7 = d16 - d25 + (d34 + (d34>>1));\
+    SRC(0) =  a0 + a1     ;\
+    SRC(1) =  a4 + (a7>>2);\
+    SRC(2) =  a2 + (a3>>1);\
+    SRC(3) =  a5 + (a6>>2);\
+    SRC(4) =  a0 - a1     ;\
+    SRC(5) =  a6 - (a5>>2);\
+    SRC(6) = (a2>>1) - a3 ;\
+    SRC(7) = (a4>>2) - a7 ;\
 }
 
 static void sub8x8_dct8( int16_t dct[8][8], uint8_t *pix1, int i_pix1, uint8_t *pix2, int i_pix2 )
 {
-    int16_t d[8][8];
-    int16_t tmp[8][8];
-    int y, x;
+    int y, x, i;
 
     for( y = 0; y < 8; y++ )
     {
         for( x = 0; x < 8; x++ )
         {
-            d[y][x] = pix1[x] - pix2[x];
+            dct[y][x] = pix1[x] - pix2[x];
         }
         pix1 += i_pix1;
         pix2 += i_pix2;
     }
 
-    dct8_1d( d, tmp );
-    dct8_1d( tmp, dct );
+#define SRC(x) dct[i][x]
+    for( i = 0; i < 8; i++ )
+        DCT8_1D
+#undef SRC
+
+#define SRC(x) dct[x][i]
+    for( i = 0; i < 8; i++ )
+        DCT8_1D
+#undef SRC
 }
 
 static void sub16x16_dct8( int16_t dct[4][8][8], uint8_t *pix1, int i_pix1, uint8_t *pix2, int i_pix2 )
@@ -324,67 +320,60 @@ static void sub16x16_dct8( int16_t dct[4][8][8], uint8_t *pix1, int i_pix1, uint
     sub8x8_dct8( dct[3], &pix1[8*i_pix1+8], i_pix1, &pix2[8*i_pix2+8], i_pix2 );
 }
 
-static inline void idct8_1d( int16_t src[8][8], int16_t dst[8][8] )
-{
-    int i;
-    for( i = 0; i < 8; i++ )
-    {
-        const int a0 = src[i][0] + src[i][4];
-        const int a2 = src[i][0] - src[i][4];
-        const int a4 = (src[i][2]>>1) - src[i][6];
-        const int a6 = (src[i][6]>>1) + src[i][2];
-
-        const int b0 = a0 + a6;
-        const int b2 = a2 + a4;
-        const int b4 = a2 - a4;
-        const int b6 = a0 - a6;
-
-        const int a1 = -src[i][3] + src[i][5] - src[i][7] - (src[i][7]>>1);
-        const int a3 = src[i][1] + src[i][7] - src[i][3] - (src[i][3]>>1);
-        const int a5 = -src[i][1] + src[i][7] + src[i][5] + (src[i][5]>>1);
-        const int a7 = src[i][3] + src[i][5] + src[i][1] + (src[i][1]>>1);
-
-        const int b1 = (a7>>2) + a1;
-        const int b3 = a3 + (a5>>2);
-        const int b5 = (a3>>2) - a5;
-        const int b7 = a7 - (a1>>2);
-
-        dst[0][i] = b0 + b7;
-        dst[7][i] = b0 - b7;
-        dst[1][i] = b2 + b5;
-        dst[6][i] = b2 - b5;
-        dst[2][i] = b4 + b3;
-        dst[5][i] = b4 - b3;
-        dst[3][i] = b6 + b1;
-        dst[4][i] = b6 - b1;
-    }
+#define IDCT8_1D {\
+    const int a0 = SRC(0) + SRC(4);\
+    const int a2 = SRC(0) - SRC(4);\
+    const int a4 = (SRC(2)>>1) - SRC(6);\
+    const int a6 = (SRC(6)>>1) + SRC(2);\
+    const int b0 = a0 + a6;\
+    const int b2 = a2 + a4;\
+    const int b4 = a2 - a4;\
+    const int b6 = a0 - a6;\
+    const int a1 = -SRC(3) + SRC(5) - SRC(7) - (SRC(7)>>1);\
+    const int a3 = SRC(1) + SRC(7) - SRC(3) - (SRC(3)>>1);\
+    const int a5 = -SRC(1) + SRC(7) + SRC(5) + (SRC(5)>>1);\
+    const int a7 = SRC(3) + SRC(5) + SRC(1) + (SRC(1)>>1);\
+    const int b1 = (a7>>2) + a1;\
+    const int b3 = a3 + (a5>>2);\
+    const int b5 = (a3>>2) - a5;\
+    const int b7 = a7 - (a1>>2);\
+    DST(0, b0 + b7);\
+    DST(1, b2 + b5);\
+    DST(2, b4 + b3);\
+    DST(3, b6 + b1);\
+    DST(4, b6 - b1);\
+    DST(5, b4 - b3);\
+    DST(6, b2 - b5);\
+    DST(7, b0 - b7);\
 }
 
-static void add8x8_idct8( uint8_t *p_dst, int i_dst, int16_t dct[8][8] )
+static void add8x8_idct8( uint8_t *dst, int i_dst, int16_t dct[8][8] )
 {
-    int16_t d[8][8];
-    int16_t tmp[8][8];
-    int y, x;
+    int i;
 
-    idct8_1d( dct, tmp );
-    idct8_1d( tmp, d );
+    dct[0][0] += 32; // rounding for the >>6 at the end
 
-    for( y = 0; y < 8; y++ )
-    {
-        for( x = 0; x < 8; x++ )
-        {
-            p_dst[x] = clip_uint8( p_dst[x] + ((d[y][x] + 32) >> 6) );
-        }
-        p_dst += i_dst;
-    }
+#define SRC(x) dct[i][x]
+#define DST(x,rhs) dct[i][x] = (rhs)
+    for( i = 0; i < 8; i++ )
+        IDCT8_1D
+#undef SRC
+#undef DST
+
+#define SRC(x) dct[x][i]
+#define DST(x,rhs) dst[i + x*i_dst] = clip_uint8( dst[i + x*i_dst] + ((rhs) >> 6) );
+    for( i = 0; i < 8; i++ )
+        IDCT8_1D
+#undef SRC
+#undef DST
 }
 
-static void add16x16_idct8( uint8_t *p_dst, int i_dst, int16_t dct[4][8][8] )
+static void add16x16_idct8( uint8_t *dst, int i_dst, int16_t dct[4][8][8] )
 {
-    add8x8_idct8( &p_dst[0], i_dst, dct[0] );
-    add8x8_idct8( &p_dst[8], i_dst, dct[1] );
-    add8x8_idct8( &p_dst[8*i_dst], i_dst, dct[2] );
-    add8x8_idct8( &p_dst[8*i_dst+8], i_dst, dct[3] );
+    add8x8_idct8( &dst[0], i_dst, dct[0] );
+    add8x8_idct8( &dst[8], i_dst, dct[1] );
+    add8x8_idct8( &dst[8*i_dst], i_dst, dct[2] );
+    add8x8_idct8( &dst[8*i_dst+8], i_dst, dct[3] );
 }
 
 
-- 
2.40.0