From d0dd01b8ce8bc5f477d70f1c127d795418c5efb5 Mon Sep 17 00:00:00 2001 From: Yaowu Xu Date: Wed, 16 Jun 2010 12:52:18 -0700 Subject: [PATCH] Redo the forward 4x4 dct The new fdct lowers the round trip sum squared error for a 4x4 block ~0.12, or ~0.008/pixel. For reference, the old matrix multiply version has average round trip error 1.46 for a 4x4 block. Thanks to "derf" for his suggestions and references. Change-Id: I5559d1e81d333b319404ab16b336b739f87afc79 --- vp8/encoder/block.h | 5 - vp8/encoder/dct.c | 141 +-------- vp8/encoder/dct.h | 10 - vp8/encoder/encodeintra.c | 2 +- vp8/encoder/encodemb.c | 39 ++- vp8/encoder/ethreading.c | 3 - vp8/encoder/generic/csystemdependent.c | 4 +- vp8/encoder/onyx_if.c | 6 - vp8/encoder/rdopt.c | 4 +- vp8/encoder/x86/csystemdependent.c | 26 +- vp8/encoder/x86/dct_mmx.asm | 392 +++---------------------- vp8/encoder/x86/dct_x86.h | 13 +- vp8/encoder/x86/x86_csystemdependent.c | 34 +-- vp8/vp8cx.mk | 1 - 14 files changed, 118 insertions(+), 562 deletions(-) diff --git a/vp8/encoder/block.h b/vp8/encoder/block.h index c1fcfe29a..b55bc51cb 100644 --- a/vp8/encoder/block.h +++ b/vp8/encoder/block.h @@ -100,14 +100,9 @@ typedef struct void (*vp8_short_fdct4x4)(short *input, short *output, int pitch); void (*vp8_short_fdct8x4)(short *input, short *output, int pitch); - void (*short_fdct4x4rd)(short *input, short *output, int pitch); - void (*short_fdct8x4rd)(short *input, short *output, int pitch); void (*short_walsh4x4)(short *input, short *output, int pitch); - void (*quantize_b)(BLOCK *b, BLOCKD *d); - - } MACROBLOCK; diff --git a/vp8/encoder/dct.c b/vp8/encoder/dct.c index 3075e5853..58e36109c 100644 --- a/vp8/encoder/dct.c +++ b/vp8/encoder/dct.c @@ -11,163 +11,54 @@ #include - -static const short dct_matrix2[4][4] = -{ - { 23170, 30274, 23170, 12540 }, - { 23170, 12540, -23170, -30274 }, - { 23170, -12540, -23170, 30274 }, - { 23170, -30274, 23170, -12540 } -}; - -static const short dct_matrix1[4][4] = -{ - { 23170, 23170, 23170, 
23170 }, - { 30274, 12540, -12540, -30274 }, - { 23170, -23170, -23170, 23170 }, - { 12540, -30274, 30274, -12540 } -}; - - -#define _1STSTAGESHIFT 14 -#define _1STSTAGEROUNDING (1<<( _1STSTAGESHIFT-1)) -#define _2NDSTAGESHIFT 16 -#define _2NDSTAGEROUNDING (1<<( _2NDSTAGESHIFT-1)) - -// using matrix multiply void vp8_short_fdct4x4_c(short *input, short *output, int pitch) -{ - int i, j, k; - short temp[4][4]; - int sumtemp; - pitch >>= 1; - - for (i = 0; i < 4; i++) - { - for (j = 0; j < 4; j++) - { - sumtemp = 0; - - for (k = 0; k < 4; k++) - { - sumtemp += input[i*pitch+k] * dct_matrix2[k][j]; - - } - - temp[i][j] = (short)((sumtemp + _1STSTAGEROUNDING) >> _1STSTAGESHIFT); - } - } - - - for (i = 0; i < 4; i++) - { - for (j = 0; j < 4; j++) - { - sumtemp = 0; - - for (k = 0; k < 4; k++) - { - sumtemp += dct_matrix1[i][ k] * temp[k][ j]; - } - - output[i*4+j] = (short)((sumtemp + _2NDSTAGEROUNDING) >> _2NDSTAGESHIFT); - } - } - -} - - -void vp8_short_fdct8x4_c(short *input, short *output, int pitch) -{ - vp8_short_fdct4x4_c(input, output, pitch); - vp8_short_fdct4x4_c(input + 4, output + 16, pitch); -} - - -static const signed short x_c1 = 60547; -static const signed short x_c2 = 46341; -static const signed short x_c3 = 25080; - -void vp8_fast_fdct4x4_c(short *input, short *output, int pitch) { int i; int a1, b1, c1, d1; - int a2, b2, c2, d2; short *ip = input; - short *op = output; - int temp1, temp2; for (i = 0; i < 4; i++) { - a1 = (ip[0] + ip[3]) * 2; - b1 = (ip[1] + ip[2]) * 2; - c1 = (ip[1] - ip[2]) * 2; - d1 = (ip[0] - ip[3]) * 2; - - temp1 = a1 + b1; - temp2 = a1 - b1; - - op[0] = ((temp1 * x_c2) >> 16) + temp1; - op[2] = ((temp2 * x_c2) >> 16) + temp2; - - temp1 = (c1 * x_c3) >> 16; - temp2 = ((d1 * x_c1) >> 16) + d1; + a1 = ((ip[0] + ip[3])<<3); + b1 = ((ip[1] + ip[2])<<3); + c1 = ((ip[1] - ip[2])<<3); + d1 = ((ip[0] - ip[3])<<3); - op[1] = temp1 + temp2; - - temp1 = (d1 * x_c3) >> 16; - temp2 = ((c1 * x_c1) >> 16) + c1; + op[0] = a1 + b1; + op[2] = a1 - 
b1; - op[3] = temp1 - temp2; + op[1] = (c1 * 2217 + d1 * 5352 + 14500)>>12; + op[3] = (d1 * 2217 - c1 * 5352 + 7500)>>12; ip += pitch / 2; op += 4; - } + } ip = output; op = output; - for (i = 0; i < 4; i++) { - a1 = ip[0] + ip[12]; b1 = ip[4] + ip[8]; c1 = ip[4] - ip[8]; d1 = ip[0] - ip[12]; + op[0] = ( a1 + b1 + 7)>>4; + op[8] = ( a1 - b1 + 7)>>4; - temp1 = a1 + b1; - temp2 = a1 - b1; - - a2 = ((temp1 * x_c2) >> 16) + temp1; - c2 = ((temp2 * x_c2) >> 16) + temp2; - - temp1 = (c1 * x_c3) >> 16; - temp2 = ((d1 * x_c1) >> 16) + d1; - - b2 = temp1 + temp2; - - temp1 = (d1 * x_c3) >> 16; - temp2 = ((c1 * x_c1) >> 16) + c1; - - d2 = temp1 - temp2; - - - op[0] = (a2 + 1) >> 1; - op[4] = (b2 + 1) >> 1; - op[8] = (c2 + 1) >> 1; - op[12] = (d2 + 1) >> 1; + op[4] =((c1 * 2217 + d1 * 5352 + 12000)>>16) + (d1!=0); + op[12] = (d1 * 2217 - c1 * 5352 + 51000)>>16; ip++; op++; } } -void vp8_fast_fdct8x4_c(short *input, short *output, int pitch) +void vp8_short_fdct8x4_c(short *input, short *output, int pitch) { - vp8_fast_fdct4x4_c(input, output, pitch); - vp8_fast_fdct4x4_c(input + 4, output + 16, pitch); + vp8_short_fdct4x4_c(input, output, pitch); + vp8_short_fdct4x4_c(input + 4, output + 16, pitch); } void vp8_short_walsh4x4_c(short *input, short *output, int pitch) diff --git a/vp8/encoder/dct.h b/vp8/encoder/dct.h index f79dba4f2..0ab40b310 100644 --- a/vp8/encoder/dct.h +++ b/vp8/encoder/dct.h @@ -32,16 +32,6 @@ extern prototype_fdct(vp8_fdct_short4x4); #endif extern prototype_fdct(vp8_fdct_short8x4); -#ifndef vp8_fdct_fast4x4 -#define vp8_fdct_fast4x4 vp8_fast_fdct4x4_c -#endif -extern prototype_fdct(vp8_fdct_fast4x4); - -#ifndef vp8_fdct_fast8x4 -#define vp8_fdct_fast8x4 vp8_fast_fdct8x4_c -#endif -extern prototype_fdct(vp8_fdct_fast8x4); - #ifndef vp8_fdct_walsh_short4x4 #define vp8_fdct_walsh_short4x4 vp8_short_walsh4x4_c #endif diff --git a/vp8/encoder/encodeintra.c b/vp8/encoder/encodeintra.c index 0e160930d..870cb5815 100644 --- a/vp8/encoder/encodeintra.c +++ 
b/vp8/encoder/encodeintra.c @@ -66,7 +66,7 @@ void vp8_encode_intra4x4block_rd(const VP8_ENCODER_RTCD *rtcd, MACROBLOCK *x, BL ENCODEMB_INVOKE(&rtcd->encodemb, subb)(be, b, 16); - x->short_fdct4x4rd(be->src_diff, be->coeff, 32); + x->vp8_short_fdct4x4(be->src_diff, be->coeff, 32); x->quantize_b(be, b); diff --git a/vp8/encoder/encodemb.c b/vp8/encoder/encodemb.c index 824850c41..8bc01df5b 100644 --- a/vp8/encoder/encodemb.c +++ b/vp8/encoder/encodemb.c @@ -130,7 +130,8 @@ void vp8_transform_mbuvrd(MACROBLOCK *x) for (i = 16; i < 24; i += 2) { - x->short_fdct8x4rd(&x->block[i].src_diff[0], &x->block[i].coeff[0], 16); + x->vp8_short_fdct8x4(&x->block[i].src_diff[0], + &x->block[i].coeff[0], 16); } } @@ -140,14 +141,16 @@ void vp8_transform_intra_mby(MACROBLOCK *x) for (i = 0; i < 16; i += 2) { - x->vp8_short_fdct8x4(&x->block[i].src_diff[0], &x->block[i].coeff[0], 32); + x->vp8_short_fdct8x4(&x->block[i].src_diff[0], + &x->block[i].coeff[0], 32); } // build dc block from 16 y dc values vp8_build_dcblock(x); // do 2nd order transform on the dc block - x->short_walsh4x4(&x->block[24].src_diff[0], &x->block[24].coeff[0], 8); + x->short_walsh4x4(&x->block[24].src_diff[0], + &x->block[24].coeff[0], 8); } @@ -157,14 +160,16 @@ void vp8_transform_intra_mbyrd(MACROBLOCK *x) for (i = 0; i < 16; i += 2) { - x->short_fdct8x4rd(&x->block[i].src_diff[0], &x->block[i].coeff[0], 32); + x->vp8_short_fdct8x4(&x->block[i].src_diff[0], + &x->block[i].coeff[0], 32); } // build dc block from 16 y dc values vp8_build_dcblock(x); // do 2nd order transform on the dc block - x->short_walsh4x4(&x->block[24].src_diff[0], &x->block[24].coeff[0], 8); + x->short_walsh4x4(&x->block[24].src_diff[0], + &x->block[24].coeff[0], 8); } void vp8_transform_mb(MACROBLOCK *x) @@ -173,7 +178,8 @@ void vp8_transform_mb(MACROBLOCK *x) for (i = 0; i < 16; i += 2) { - x->vp8_short_fdct8x4(&x->block[i].src_diff[0], &x->block[i].coeff[0], 32); + x->vp8_short_fdct8x4(&x->block[i].src_diff[0], + 
&x->block[i].coeff[0], 32); } // build dc block from 16 y dc values @@ -182,12 +188,14 @@ void vp8_transform_mb(MACROBLOCK *x) for (i = 16; i < 24; i += 2) { - x->vp8_short_fdct8x4(&x->block[i].src_diff[0], &x->block[i].coeff[0], 16); + x->vp8_short_fdct8x4(&x->block[i].src_diff[0], + &x->block[i].coeff[0], 16); } // do 2nd order transform on the dc block if (x->e_mbd.mbmi.mode != SPLITMV) - x->short_walsh4x4(&x->block[24].src_diff[0], &x->block[24].coeff[0], 8); + x->short_walsh4x4(&x->block[24].src_diff[0], + &x->block[24].coeff[0], 8); } @@ -197,14 +205,16 @@ void vp8_transform_mby(MACROBLOCK *x) for (i = 0; i < 16; i += 2) { - x->vp8_short_fdct8x4(&x->block[i].src_diff[0], &x->block[i].coeff[0], 32); + x->vp8_short_fdct8x4(&x->block[i].src_diff[0], + &x->block[i].coeff[0], 32); } // build dc block from 16 y dc values if (x->e_mbd.mbmi.mode != SPLITMV) { vp8_build_dcblock(x); - x->short_walsh4x4(&x->block[24].src_diff[0], &x->block[24].coeff[0], 8); + x->short_walsh4x4(&x->block[24].src_diff[0], + &x->block[24].coeff[0], 8); } } @@ -214,7 +224,8 @@ void vp8_transform_mbrd(MACROBLOCK *x) for (i = 0; i < 16; i += 2) { - x->short_fdct8x4rd(&x->block[i].src_diff[0], &x->block[i].coeff[0], 32); + x->vp8_short_fdct8x4(&x->block[i].src_diff[0], + &x->block[i].coeff[0], 32); } // build dc block from 16 y dc values @@ -223,12 +234,14 @@ void vp8_transform_mbrd(MACROBLOCK *x) for (i = 16; i < 24; i += 2) { - x->short_fdct8x4rd(&x->block[i].src_diff[0], &x->block[i].coeff[0], 16); + x->vp8_short_fdct8x4(&x->block[i].src_diff[0], + &x->block[i].coeff[0], 16); } // do 2nd order transform on the dc block if (x->e_mbd.mbmi.mode != SPLITMV) - x->short_walsh4x4(&x->block[24].src_diff[0], &x->block[24].coeff[0], 8); + x->short_walsh4x4(&x->block[24].src_diff[0], + &x->block[24].coeff[0], 8); } void vp8_stuff_inter16x16(MACROBLOCK *x) diff --git a/vp8/encoder/ethreading.c b/vp8/encoder/ethreading.c index a205667dc..dd98a09d1 100644 --- a/vp8/encoder/ethreading.c +++ 
b/vp8/encoder/ethreading.c @@ -257,9 +257,6 @@ static void setup_mbby_copy(MACROBLOCK *mbdst, MACROBLOCK *mbsrc) z->vp8_short_fdct4x4 = x->vp8_short_fdct4x4; z->vp8_short_fdct8x4 = x->vp8_short_fdct8x4; - z->short_fdct4x4rd = x->short_fdct4x4rd; - z->short_fdct8x4rd = x->short_fdct8x4rd; - z->short_fdct8x4rd = x->short_fdct8x4rd; z->short_walsh4x4 = x->short_walsh4x4; z->quantize_b = x->quantize_b; diff --git a/vp8/encoder/generic/csystemdependent.c b/vp8/encoder/generic/csystemdependent.c index e68d65025..dd89f1a82 100644 --- a/vp8/encoder/generic/csystemdependent.c +++ b/vp8/encoder/generic/csystemdependent.c @@ -68,8 +68,8 @@ void vp8_cmachine_specific_config(VP8_COMP *cpi) cpi->rtcd.fdct.short4x4 = vp8_short_fdct4x4_c; cpi->rtcd.fdct.short8x4 = vp8_short_fdct8x4_c; - cpi->rtcd.fdct.fast4x4 = vp8_fast_fdct4x4_c; - cpi->rtcd.fdct.fast8x4 = vp8_fast_fdct8x4_c; + cpi->rtcd.fdct.fast4x4 = vp8_short_fdct4x4_c; + cpi->rtcd.fdct.fast8x4 = vp8_short_fdct8x4_c; cpi->rtcd.fdct.walsh_short4x4 = vp8_short_walsh4x4_c; cpi->rtcd.encodemb.berr = vp8_block_error_c; diff --git a/vp8/encoder/onyx_if.c b/vp8/encoder/onyx_if.c index f3456a733..60d807c03 100644 --- a/vp8/encoder/onyx_if.c +++ b/vp8/encoder/onyx_if.c @@ -137,8 +137,6 @@ extern unsigned int inter_b_modes[15]; extern void (*vp8_short_fdct4x4)(short *input, short *output, int pitch); extern void (*vp8_short_fdct8x4)(short *input, short *output, int pitch); -extern void (*vp8_fast_fdct4x4)(short *input, short *output, int pitch); -extern void (*vp8_fast_fdct8x4)(short *input, short *output, int pitch); extern const int vp8_bits_per_mb[2][QINDEX_RANGE]; @@ -1136,15 +1134,11 @@ void vp8_set_speed_features(VP8_COMP *cpi) { cpi->mb.vp8_short_fdct8x4 = FDCT_INVOKE(&cpi->rtcd.fdct, short8x4); cpi->mb.vp8_short_fdct4x4 = FDCT_INVOKE(&cpi->rtcd.fdct, short4x4); - cpi->mb.short_fdct8x4rd = FDCT_INVOKE(&cpi->rtcd.fdct, short8x4); - cpi->mb.short_fdct4x4rd = FDCT_INVOKE(&cpi->rtcd.fdct, short4x4); } else { cpi->mb.vp8_short_fdct8x4 
= FDCT_INVOKE(&cpi->rtcd.fdct, fast8x4); cpi->mb.vp8_short_fdct4x4 = FDCT_INVOKE(&cpi->rtcd.fdct, fast4x4); - cpi->mb.short_fdct8x4rd = FDCT_INVOKE(&cpi->rtcd.fdct, fast8x4); - cpi->mb.short_fdct4x4rd = FDCT_INVOKE(&cpi->rtcd.fdct, fast4x4); } cpi->mb.short_walsh4x4 = FDCT_INVOKE(&cpi->rtcd.fdct, walsh_short4x4); diff --git a/vp8/encoder/rdopt.c b/vp8/encoder/rdopt.c index 2d6dee139..70cf122fa 100644 --- a/vp8/encoder/rdopt.c +++ b/vp8/encoder/rdopt.c @@ -1028,7 +1028,7 @@ static unsigned int vp8_encode_inter_mb_segment(MACROBLOCK *x, int const *labels vp8_build_inter_predictors_b(bd, 16, x->e_mbd.subpixel_predict); ENCODEMB_INVOKE(rtcd, subb)(be, bd, 16); - x->short_fdct4x4rd(be->src_diff, be->coeff, 32); + x->vp8_short_fdct4x4(be->src_diff, be->coeff, 32); // set to 0 no way to account for 2nd order DC so discount //be->coeff[0] = 0; @@ -1056,7 +1056,7 @@ static void macro_block_yrd(MACROBLOCK *mb, int *Rate, int *Distortion, const vp // Fdct and building the 2nd order block for (beptr = mb->block; beptr < mb->block + 16; beptr += 2) { - mb->short_fdct8x4rd(beptr->src_diff, beptr->coeff, 32); + mb->vp8_short_fdct8x4(beptr->src_diff, beptr->coeff, 32); *Y2DCPtr++ = beptr->coeff[0]; *Y2DCPtr++ = beptr->coeff[16]; } diff --git a/vp8/encoder/x86/csystemdependent.c b/vp8/encoder/x86/csystemdependent.c index 6aeac508f..bf12fee54 100644 --- a/vp8/encoder/x86/csystemdependent.c +++ b/vp8/encoder/x86/csystemdependent.c @@ -181,10 +181,17 @@ void vp8_cmachine_specific_config(void) // Willamette instruction set available: vp8_mbuverror = vp8_mbuverror_xmm; vp8_fast_quantize_b = vp8_fast_quantize_b_sse; +#if 0 //new fdct vp8_short_fdct4x4 = vp8_short_fdct4x4_mmx; vp8_short_fdct8x4 = vp8_short_fdct8x4_mmx; - vp8_fast_fdct4x4 = vp8_fast_fdct4x4_mmx; - vp8_fast_fdct8x4 = vp8_fast_fdct8x4_wmt; + vp8_fast_fdct4x4 = vp8_short_fdct4x4_mmx; + vp8_fast_fdct8x4 = vp8_short_fdct8x4_wmt; +#else + vp8_short_fdct4x4 = vp8_short_fdct4x4_c; + vp8_short_fdct8x4 = vp8_short_fdct8x4_c; + 
vp8_fast_fdct4x4 = vp8_short_fdct4x4_c; + vp8_fast_fdct8x4 = vp8_fast_fdct8x4_c; +#endif vp8_subtract_b = vp8_subtract_b_mmx; vp8_subtract_mbuv = vp8_subtract_mbuv_mmx; vp8_variance4x4 = vp8_variance4x4_mmx; @@ -218,10 +225,17 @@ void vp8_cmachine_specific_config(void) // MMX instruction set available: vp8_mbuverror = vp8_mbuverror_mmx; vp8_fast_quantize_b = vp8_fast_quantize_b_mmx; +#if 0 // new fdct vp8_short_fdct4x4 = vp8_short_fdct4x4_mmx; vp8_short_fdct8x4 = vp8_short_fdct8x4_mmx; - vp8_fast_fdct4x4 = vp8_fast_fdct4x4_mmx; - vp8_fast_fdct8x4 = vp8_fast_fdct8x4_mmx; + vp8_fast_fdct4x4 = vp8_short_fdct4x4_mmx; + vp8_fast_fdct8x4 = vp8_short_fdct8x4_mmx; +#else + vp8_short_fdct4x4 = vp8_short_fdct4x4_c; + vp8_short_fdct8x4 = vp8_short_fdct8x4_c; + vp8_fast_fdct4x4 = vp8_short_fdct4x4_c; + vp8_fast_fdct8x4 = vp8_fast_fdct8x4_c; +#endif vp8_subtract_b = vp8_subtract_b_mmx; vp8_subtract_mbuv = vp8_subtract_mbuv_mmx; vp8_variance4x4 = vp8_variance4x4_mmx; @@ -254,10 +268,10 @@ void vp8_cmachine_specific_config(void) { // Pure C: vp8_mbuverror = vp8_mbuverror_c; - vp8_fast_quantize_b = vp8_fast_quantize_b_c; + vp8_fast_quantize_b = vp8_fast_quantize_b_c; vp8_short_fdct4x4 = vp8_short_fdct4x4_c; vp8_short_fdct8x4 = vp8_short_fdct8x4_c; - vp8_fast_fdct4x4 = vp8_fast_fdct4x4_c; + vp8_fast_fdct4x4 = vp8_short_fdct4x4_c; vp8_fast_fdct8x4 = vp8_fast_fdct8x4_c; vp8_subtract_b = vp8_subtract_b_c; vp8_subtract_mbuv = vp8_subtract_mbuv_c; diff --git a/vp8/encoder/x86/dct_mmx.asm b/vp8/encoder/x86/dct_mmx.asm index 32d6610aa..ff96c49f3 100644 --- a/vp8/encoder/x86/dct_mmx.asm +++ b/vp8/encoder/x86/dct_mmx.asm @@ -13,8 +13,7 @@ section .text global sym(vp8_short_fdct4x4_mmx) - global sym(vp8_fast_fdct4x4_mmx) - global sym(vp8_fast_fdct8x4_wmt) + global sym(vp8_short_fdct8x4_wmt) %define DCTCONSTANTSBITS (16) @@ -24,339 +23,8 @@ section .text %define x_c3 (25080) ; cos(pi*3/8) * (1<<15) -%define _1STSTAGESHIFT 14 -%define _2NDSTAGESHIFT 16 - -; using matrix multiply with source 
and destbuffer has a pitch ;void vp8_short_fdct4x4_mmx(short *input, short *output, int pitch) sym(vp8_short_fdct4x4_mmx): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 3 - GET_GOT rbx - push rsi - push rdi - ; end prolog - - mov rsi, arg(0) ;input - mov rdi, arg(1) ;output - - movsxd rax, dword ptr arg(2) ;pitch - lea rdx, [dct_matrix GLOBAL] - - movq mm0, [rsi ] - movq mm1, [rsi + rax] - - movq mm2, [rsi + rax*2] - lea rsi, [rsi + rax*2] - - movq mm3, [rsi + rax] - - ; first column - movq mm4, mm0 - movq mm7, [rdx] - - pmaddwd mm4, mm7 - movq mm5, mm1 - - pmaddwd mm5, mm7 - movq mm6, mm4 - - punpckldq mm4, mm5 - punpckhdq mm6, mm5 - - paddd mm4, mm6 - movq mm5, mm2 - - - pmaddwd mm5, mm7 - movq mm6, mm3 - - pmaddwd mm6, mm7 - movq mm7, mm5 - - punpckldq mm5, mm6 - punpckhdq mm7, mm6 - - paddd mm5, mm7 - movq mm6, [dct1st_stage_rounding_mmx GLOBAL] - - paddd mm4, mm6 - paddd mm5, mm6 - - psrad mm4, _1STSTAGESHIFT - psrad mm5, _1STSTAGESHIFT - - packssdw mm4, mm5 - movq [rdi], mm4 - - ;second column - movq mm4, mm0 - - pmaddwd mm4, [rdx+8] - movq mm5, mm1 - - pmaddwd mm5, [rdx+8] - movq mm6, mm4 - - punpckldq mm4, mm5 - punpckhdq mm6, mm5 - - paddd mm4, mm6 - movq mm5, mm2 - - pmaddwd mm5, [rdx+8] - movq mm6, mm3 - - pmaddwd mm6, [rdx+8] - movq mm7, mm5 - - punpckldq mm5, mm6 - punpckhdq mm7, mm6 - - paddd mm5, mm7 - movq mm6, [dct1st_stage_rounding_mmx GLOBAL] - - paddd mm4, mm6 - paddd mm5, mm6 - - psrad mm4, _1STSTAGESHIFT - psrad mm5, _1STSTAGESHIFT - - packssdw mm4, mm5 - movq [rdi+8], mm4 - - - ;third column - movq mm4, mm0 - - pmaddwd mm4, [rdx+16] - movq mm5, mm1 - - pmaddwd mm5, [rdx+16] - movq mm6, mm4 - - punpckldq mm4, mm5 - punpckhdq mm6, mm5 - - paddd mm4, mm6 - movq mm5, mm2 - - pmaddwd mm5, [rdx+16] - movq mm6, mm3 - - pmaddwd mm6, [rdx+16] - movq mm7, mm5 - - punpckldq mm5, mm6 - punpckhdq mm7, mm6 - - paddd mm5, mm7 - movq mm6, [dct1st_stage_rounding_mmx GLOBAL] - - paddd mm4, mm6 - paddd mm5, mm6 - - psrad mm4, _1STSTAGESHIFT - psrad mm5, 
_1STSTAGESHIFT - - packssdw mm4, mm5 - movq [rdi+16], mm4 - - ;fourth column (this is the last column, so we do not have save the source any more) - - pmaddwd mm0, [rdx+24] - - pmaddwd mm1, [rdx+24] - movq mm6, mm0 - - punpckldq mm0, mm1 - punpckhdq mm6, mm1 - - paddd mm0, mm6 - - pmaddwd mm2, [rdx+24] - - pmaddwd mm3, [rdx+24] - movq mm7, mm2 - - punpckldq mm2, mm3 - punpckhdq mm7, mm3 - - paddd mm2, mm7 - movq mm6, [dct1st_stage_rounding_mmx GLOBAL] - - paddd mm0, mm6 - paddd mm2, mm6 - - psrad mm0, _1STSTAGESHIFT - psrad mm2, _1STSTAGESHIFT - - packssdw mm0, mm2 - - movq mm3, mm0 - - ; done with one pass - ; now start second pass - movq mm0, [rdi ] - movq mm1, [rdi+ 8] - movq mm2, [rdi+ 16] - - movq mm4, mm0 - - pmaddwd mm4, [rdx] - movq mm5, mm1 - - pmaddwd mm5, [rdx] - movq mm6, mm4 - - punpckldq mm4, mm5 - punpckhdq mm6, mm5 - - paddd mm4, mm6 - movq mm5, mm2 - - pmaddwd mm5, [rdx] - movq mm6, mm3 - - pmaddwd mm6, [rdx] - movq mm7, mm5 - - punpckldq mm5, mm6 - punpckhdq mm7, mm6 - - paddd mm5, mm7 - movq mm6, [dct2nd_stage_rounding_mmx GLOBAL] - - paddd mm4, mm6 - paddd mm5, mm6 - - psrad mm4, _2NDSTAGESHIFT - psrad mm5, _2NDSTAGESHIFT - - packssdw mm4, mm5 - movq [rdi], mm4 - - ;second column - movq mm4, mm0 - - pmaddwd mm4, [rdx+8] - movq mm5, mm1 - - pmaddwd mm5, [rdx+8] - movq mm6, mm4 - - punpckldq mm4, mm5 - punpckhdq mm6, mm5 - - paddd mm4, mm6 - movq mm5, mm2 - - pmaddwd mm5, [rdx+8] - movq mm6, mm3 - - pmaddwd mm6, [rdx+8] - movq mm7, mm5 - - punpckldq mm5, mm6 - punpckhdq mm7, mm6 - - paddd mm5, mm7 - movq mm6, [dct2nd_stage_rounding_mmx GLOBAL] - - paddd mm4, mm6 - paddd mm5, mm6 - - psrad mm4, _2NDSTAGESHIFT - psrad mm5, _2NDSTAGESHIFT - - packssdw mm4, mm5 - movq [rdi+8], mm4 - - - ;third column - movq mm4, mm0 - - pmaddwd mm4, [rdx+16] - movq mm5, mm1 - - pmaddwd mm5, [rdx+16] - movq mm6, mm4 - - punpckldq mm4, mm5 - punpckhdq mm6, mm5 - - paddd mm4, mm6 - movq mm5, mm2 - - pmaddwd mm5, [rdx+16] - movq mm6, mm3 - - pmaddwd mm6, [rdx+16] - movq 
mm7, mm5 - - punpckldq mm5, mm6 - punpckhdq mm7, mm6 - - paddd mm5, mm7 - movq mm6, [dct2nd_stage_rounding_mmx GLOBAL] - - paddd mm4, mm6 - paddd mm5, mm6 - - psrad mm4, _2NDSTAGESHIFT - psrad mm5, _2NDSTAGESHIFT - - packssdw mm4, mm5 - movq [rdi+16], mm4 - - ;fourth column - movq mm4, mm0 - - pmaddwd mm4, [rdx+24] - movq mm5, mm1 - - pmaddwd mm5, [rdx+24] - movq mm6, mm4 - - punpckldq mm4, mm5 - punpckhdq mm6, mm5 - - paddd mm4, mm6 - movq mm5, mm2 - - pmaddwd mm5, [rdx+24] - movq mm6, mm3 - - pmaddwd mm6, [rdx+24] - movq mm7, mm5 - - punpckldq mm5, mm6 - punpckhdq mm7, mm6 - - paddd mm5, mm7 - movq mm6, [dct2nd_stage_rounding_mmx GLOBAL] - - paddd mm4, mm6 - paddd mm5, mm6 - - psrad mm4, _2NDSTAGESHIFT - psrad mm5, _2NDSTAGESHIFT - - packssdw mm4, mm5 - movq [rdi+24], mm4 - - ; begin epilog - pop rdi - pop rsi - RESTORE_GOT - UNSHADOW_ARGS - pop rbp - ret - - -;void vp8_fast_fdct4x4_mmx(short *input, short *output, int pitch) -sym(vp8_fast_fdct4x4_mmx): push rbp mov rbp, rsp SHADOW_ARGS_TO_STACK 3 @@ -379,11 +47,11 @@ sym(vp8_fast_fdct4x4_mmx): movq mm3, [rcx + rax] ; get the constants ;shift to left by 1 for prescision - paddw mm0, mm0 - paddw mm1, mm1 + psllw mm0, 3 + psllw mm1, 3 - psllw mm2, 1 - psllw mm3, 1 + psllw mm2, 3 + psllw mm3, 3 ; transpose for the second stage movq mm4, mm0 ; 00 01 02 03 @@ -531,20 +199,23 @@ sym(vp8_fast_fdct4x4_mmx): movq mm3, mm5 ; done with vertical - pcmpeqw mm4, mm4 - pcmpeqw mm5, mm5 - psrlw mm4, 15 - psrlw mm5, 15 + pcmpeqw mm4, mm4 + pcmpeqw mm5, mm5 + psrlw mm4, 15 + psrlw mm5, 15 + + psllw mm4, 2 + psllw mm5, 2 paddw mm0, mm4 paddw mm1, mm5 paddw mm2, mm4 paddw mm3, mm5 - psraw mm0, 1 - psraw mm1, 1 - psraw mm2, 1 - psraw mm3, 1 + psraw mm0, 3 + psraw mm1, 3 + psraw mm2, 3 + psraw mm3, 3 movq [rdi ], mm0 movq [rdi+ 8], mm1 @@ -560,8 +231,8 @@ sym(vp8_fast_fdct4x4_mmx): ret -;void vp8_fast_fdct8x4_wmt(short *input, short *output, int pitch) -sym(vp8_fast_fdct8x4_wmt): +;void vp8_short_fdct8x4_wmt(short *input, short 
*output, int pitch) +sym(vp8_short_fdct8x4_wmt): push rbp mov rbp, rsp SHADOW_ARGS_TO_STACK 3 @@ -584,11 +255,11 @@ sym(vp8_fast_fdct8x4_wmt): movdqa xmm3, [rcx + rax] ; get the constants ;shift to left by 1 for prescision - psllw xmm0, 1 - psllw xmm2, 1 + psllw xmm0, 3 + psllw xmm2, 3 - psllw xmm4, 1 - psllw xmm3, 1 + psllw xmm4, 3 + psllw xmm3, 3 ; transpose for the second stage movdqa xmm1, xmm0 ; 00 01 02 03 04 05 06 07 @@ -758,20 +429,23 @@ sym(vp8_fast_fdct8x4_wmt): ; done with vertical - pcmpeqw xmm4, xmm4 - pcmpeqw xmm5, xmm5; - psrlw xmm4, 15 - psrlw xmm5, 15 + pcmpeqw xmm4, xmm4 + pcmpeqw xmm5, xmm5; + psrlw xmm4, 15 + psrlw xmm5, 15 + + psllw xmm4, 2 + psllw xmm5, 2 paddw xmm0, xmm4 paddw xmm1, xmm5 paddw xmm2, xmm4 paddw xmm3, xmm5 - psraw xmm0, 1 - psraw xmm1, 1 - psraw xmm2, 1 - psraw xmm3, 1 + psraw xmm0, 3 + psraw xmm1, 3 + psraw xmm2, 3 + psraw xmm3, 3 movq QWORD PTR[rdi ], xmm0 movq QWORD PTR[rdi+ 8], xmm1 diff --git a/vp8/encoder/x86/dct_x86.h b/vp8/encoder/x86/dct_x86.h index 05d018043..ada16d34f 100644 --- a/vp8/encoder/x86/dct_x86.h +++ b/vp8/encoder/x86/dct_x86.h @@ -22,31 +22,22 @@ #if HAVE_MMX extern prototype_fdct(vp8_short_fdct4x4_mmx); extern prototype_fdct(vp8_short_fdct8x4_mmx); -extern prototype_fdct(vp8_fast_fdct4x4_mmx); -extern prototype_fdct(vp8_fast_fdct8x4_mmx); #if !CONFIG_RUNTIME_CPU_DETECT +#if 0 new c version, #undef vp8_fdct_short4x4 #define vp8_fdct_short4x4 vp8_short_fdct4x4_mmx #undef vp8_fdct_short8x4 #define vp8_fdct_short8x4 vp8_short_fdct8x4_mmx - -#undef vp8_fdct_fast4x4 -#define vp8_fdct_fast4x4 vp8_fast_fdct4x4_mmx - -#undef vp8_fdct_fast8x4 -#define vp8_fdct_fast8x4 vp8_fast_fdct8x4_mmx +#endif #endif #endif #if HAVE_SSE2 -extern prototype_fdct(vp8_short_fdct4x4_wmt); extern prototype_fdct(vp8_short_fdct8x4_wmt); -extern prototype_fdct(vp8_fast_fdct8x4_wmt); - extern prototype_fdct(vp8_short_walsh4x4_sse2); #if !CONFIG_RUNTIME_CPU_DETECT diff --git a/vp8/encoder/x86/x86_csystemdependent.c 
b/vp8/encoder/x86/x86_csystemdependent.c index f3750455b..0fb82e60e 100644 --- a/vp8/encoder/x86/x86_csystemdependent.c +++ b/vp8/encoder/x86/x86_csystemdependent.c @@ -18,15 +18,10 @@ #if HAVE_MMX void vp8_short_fdct8x4_mmx(short *input, short *output, int pitch) { - vp8_short_fdct4x4_mmx(input, output, pitch); - vp8_short_fdct4x4_mmx(input + 4, output + 16, pitch); + vp8_short_fdct4x4_c(input, output, pitch); + vp8_short_fdct4x4_c(input + 4, output + 16, pitch); } -void vp8_fast_fdct8x4_mmx(short *input, short *output, int pitch) -{ - vp8_fast_fdct4x4_mmx(input, output , pitch); - vp8_fast_fdct4x4_mmx(input + 4, output + 16, pitch); -} int vp8_fast_quantize_b_impl_mmx(short *coeff_ptr, short *zbin_ptr, short *qcoeff_ptr, short *dequant_ptr, @@ -87,11 +82,6 @@ void vp8_subtract_b_mmx(BLOCK *be, BLOCKD *bd, int pitch) #endif #if HAVE_SSE2 -void vp8_short_fdct8x4_wmt(short *input, short *output, int pitch) -{ - vp8_short_fdct4x4_wmt(input, output, pitch); - vp8_short_fdct4x4_wmt(input + 4, output + 16, pitch); -} int vp8_fast_quantize_b_impl_sse(short *coeff_ptr, short *zbin_ptr, short *qcoeff_ptr, short *dequant_ptr, @@ -221,11 +211,19 @@ void vp8_arch_x86_encoder_init(VP8_COMP *cpi) cpi->rtcd.variance.get8x8var = vp8_get8x8var_mmx; cpi->rtcd.variance.get16x16var = vp8_get16x16var_mmx; cpi->rtcd.variance.get4x4sse_cs = vp8_get4x4sse_cs_mmx; - +#if 0 // new fdct cpi->rtcd.fdct.short4x4 = vp8_short_fdct4x4_mmx; cpi->rtcd.fdct.short8x4 = vp8_short_fdct8x4_mmx; - cpi->rtcd.fdct.fast4x4 = vp8_fast_fdct4x4_mmx; - cpi->rtcd.fdct.fast8x4 = vp8_fast_fdct8x4_mmx; + cpi->rtcd.fdct.fast4x4 = vp8_short_fdct4x4_mmx; + cpi->rtcd.fdct.fast8x4 = vp8_short_fdct8x4_mmx; +#else + cpi->rtcd.fdct.short4x4 = vp8_short_fdct4x4_c; + cpi->rtcd.fdct.short8x4 = vp8_short_fdct8x4_c; + cpi->rtcd.fdct.fast4x4 = vp8_short_fdct4x4_c; + cpi->rtcd.fdct.fast8x4 = vp8_short_fdct8x4_c; + +#endif + cpi->rtcd.fdct.walsh_short4x4 = vp8_short_walsh4x4_c; cpi->rtcd.encodemb.berr = vp8_block_error_mmx; @@ 
-270,13 +268,13 @@ void vp8_arch_x86_encoder_init(VP8_COMP *cpi) cpi->rtcd.variance.get16x16var = vp8_get16x16var_sse2; /* cpi->rtcd.variance.get4x4sse_cs not implemented for wmt */; -#if 0 +#if 0 //new fdct /* short SSE2 DCT currently disabled, does not match the MMX version */ cpi->rtcd.fdct.short4x4 = vp8_short_fdct4x4_wmt; cpi->rtcd.fdct.short8x4 = vp8_short_fdct8x4_wmt; -#endif /* cpi->rtcd.fdct.fast4x4 not implemented for wmt */; - cpi->rtcd.fdct.fast8x4 = vp8_fast_fdct8x4_wmt; + cpi->rtcd.fdct.fast8x4 = vp8_short_fdct8x4_wmt; +#endif cpi->rtcd.fdct.walsh_short4x4 = vp8_short_walsh4x4_sse2; cpi->rtcd.encodemb.berr = vp8_block_error_xmm; diff --git a/vp8/vp8cx.mk b/vp8/vp8cx.mk index f09f25852..f86a0b2aa 100644 --- a/vp8/vp8cx.mk +++ b/vp8/vp8cx.mk @@ -96,7 +96,6 @@ VP8_CX_SRCS-$(HAVE_MMX) += encoder/x86/subtract_mmx.asm VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/variance_sse2.c VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/variance_impl_sse2.asm VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/sad_sse2.asm -VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/dct_sse2.asm VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/fwalsh_sse2.asm VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/quantize_sse2.asm VP8_CX_SRCS-$(HAVE_SSE3) += encoder/x86/sad_sse3.asm -- 2.40.0