From: John Koleszar Date: Fri, 12 Apr 2013 22:33:04 +0000 (-0700) Subject: Merge branch 'experimental' into master X-Git-Tag: v1.3.0~1151 X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=7f7d1357a2732e0a1c36f3baded7dd14f449e535;p=libvpx Merge branch 'experimental' into master VP9 preview bitstream 2, commit '868ecb55a1528ca3f19286e7d1551572bf89b642' Conflicts: vp9/vp9_common.mk Change-Id: I3f0f6e692c987ff24f98ceafbb86cb9cf64ad8d3 --- 7f7d1357a2732e0a1c36f3baded7dd14f449e535 diff --cc vp8/vp8_cx_iface.c index b985cb1b7,b985cb1b7..4531d5ad0 --- a/vp8/vp8_cx_iface.c +++ b/vp8/vp8_cx_iface.c @@@ -684,6 -684,6 +684,8 @@@ static vpx_codec_err_t image2yuvconfig( yv12->u_buffer = img->planes[VPX_PLANE_U]; yv12->v_buffer = img->planes[VPX_PLANE_V]; ++ yv12->y_crop_width = img->d_w; ++ yv12->y_crop_height = img->d_h; yv12->y_width = img->d_w; yv12->y_height = img->d_h; yv12->uv_width = (1 + yv12->y_width) / 2; diff --cc vp8/vp8_dx_iface.c index f3834b063,f3834b063..90a175436 --- a/vp8/vp8_dx_iface.c +++ b/vp8/vp8_dx_iface.c @@@ -790,6 -790,6 +790,8 @@@ static vpx_codec_err_t image2yuvconfig( yv12->u_buffer = img->planes[VPX_PLANE_U]; yv12->v_buffer = img->planes[VPX_PLANE_V]; ++ yv12->y_crop_width = img->d_w; ++ yv12->y_crop_height = img->d_h; yv12->y_width = img->d_w; yv12->y_height = img->d_h; yv12->uv_width = yv12->y_width / 2; diff --cc vp9/encoder/vp9_dct.c index bfde02ccb,6365ed9a2..aeef9c6df --- a/vp9/encoder/vp9_dct.c +++ b/vp9/encoder/vp9_dct.c @@@ -1652,737 -1224,28 +1224,27 @@@ static void dct32_1d(int *input, int *o } void vp9_short_fdct32x32_c(int16_t *input, int16_t *out, int pitch) { - vp9_clear_system_state(); // Make it simd safe : __asm emms; - { - int shortpitch = pitch >> 1; - int i, j; - double output[1024]; - // First transform columns - for (i = 0; i < 32; i++) { - double temp_in[32], temp_out[32]; - for (j = 0; j < 32; j++) - temp_in[j] = input[j*shortpitch + i]; - dct32_1d(temp_in, temp_out, 1); - for (j = 0; j < 32; j++) - output[j*32 + i] = temp_out[j]; - } - // Then transform rows - for (i = 0; i < 32; ++i) { - double temp_in[32], temp_out[32]; - for (j = 0; j < 32; ++j) - temp_in[j] = output[j + i*32]; - dct32_1d(temp_in, temp_out, 1); - for (j = 0; j < 32; ++j) - output[j + i*32] = temp_out[j]; - } - // Scale by some magic number - for (i = 0; i < 1024; i++) { - out[i] = (short)round(output[i]/4); - } - } - - vp9_clear_system_state(); // Make it simd safe : __asm emms; - } - - #else // CONFIG_DWTDCTHYBRID - - #if DWT_TYPE == 53 - - // Note: block length must be even for this implementation - static void analysis_53_row(int length, short *x, - short *lowpass, short *highpass) { - int n; - short r, *a, *b; - - n = length >> 1; - b = highpass; - a = lowpass; - while (--n) { - *a++ = (r = *x++) << 1; - *b++ = *x - ((r + x[1] + 1) >> 1); - x++; - } - *a = (r = *x++) << 1; - *b = *x - r; - - n = length >> 1; - b = highpass; - a = lowpass; - r = *highpass; - while (n--) { - *a++ += (r + (*b) + 1) >> 1; - r = *b++; - } - } - - static void analysis_53_col(int length, short *x, - short *lowpass, short *highpass) { - int n; - short r, *a, *b; - - n = length >> 1; - b = highpass; - a = lowpass; - while (--n) { - *a++ = (r = *x++); - *b++ = (((*x) << 1) - (r + x[1]) + 2) >> 2; - x++; - } - *a = (r = *x++); - *b = (*x - r + 1) >> 1; - - n = length >> 1; - b = highpass; - a = lowpass; - r = *highpass; - while (n--) { - *a++ += (r + (*b) + 1) >> 1; - r = *b++; - } - } - - static void dyadic_analyze_53(int levels, int width, int height, - short *x, int pitch_x, short *c, int pitch_c) { - int lv, i, j, nh, nw, hh = height, hw = width; - short buffer[2 * DWT_MAX_LENGTH]; - for (i = 0; i < height; i++) { - for (j = 0; j < width; j++) { - c[i * pitch_c + j] = x[i * pitch_x + j] << DWT_PRECISION_BITS; - } - } - for (lv = 0; lv < levels; lv++) { - nh = hh; - hh = (hh + 1) >> 1; - nw = hw; - hw = (hw + 1) >> 1; - if ((nh < 2) || (nw < 2)) return; - for (i = 0; i < nh; i++) { - memcpy(buffer, &c[i * pitch_c], nw * sizeof(short)); - analysis_53_row(nw, buffer, &c[i * pitch_c], &c[i * pitch_c] + hw); - } - for (j = 0; j < nw; j++) { - for (i = 0; i < nh; i++) - buffer[i + nh] = c[i * pitch_c + j]; - analysis_53_col(nh, buffer + nh, buffer, buffer + hh); - for (i = 0; i < nh; i++) - c[i * pitch_c + j] = buffer[i]; - } - } - } - - #elif DWT_TYPE == 26 - - static void analysis_26_row(int length, short *x, - short *lowpass, short *highpass) { - int i, n; - short r, s, *a, *b; - a = lowpass; - b = highpass; - for (i = length >> 1; i; i--) { - r = *x++; - s = *x++; - *a++ = r + s; - *b++ = r - s; - } - n = length >> 1; - if (n >= 4) { - a = lowpass; - b = highpass; - r = *lowpass; - while (--n) { - *b++ -= (r - a[1] + 4) >> 3; - r = *a++; - } - *b -= (r - *a + 4) >> 3; - } - } - - static void analysis_26_col(int length, short *x, - short *lowpass, short *highpass) { - int i, n; - short r, s, *a, *b; - a = lowpass; - b = highpass; - for (i = length >> 1; i; i--) { - r = *x++; - s = *x++; - *a++ = (r + s + 1) >> 1; - *b++ = (r - s + 1) >> 1; - } - n = length >> 1; - if (n >= 4) { - a = lowpass; - b = highpass; - r = *lowpass; - while (--n) { - *b++ -= (r - a[1] + 4) >> 3; - r = *a++; - } - *b -= (r - *a + 4) >> 3; - } - } - - static void dyadic_analyze_26(int levels, int width, int height, - short *x, int pitch_x, short *c, int pitch_c) { - int lv, i, j, nh, nw, hh = height, hw = width; - short buffer[2 * DWT_MAX_LENGTH]; - for (i = 0; i < height; i++) { - for (j = 0; j < width; j++) { - c[i * pitch_c + j] = x[i * pitch_x + j] << DWT_PRECISION_BITS; - } - } - for (lv = 0; lv < levels; lv++) { - nh = hh; - hh = (hh + 1) >> 1; - nw = hw; - hw = (hw + 1) >> 1; - if ((nh < 2) || (nw < 2)) return; - for (i = 0; i < nh; i++) { - memcpy(buffer, &c[i * pitch_c], nw * sizeof(short)); - analysis_26_row(nw, buffer, &c[i * pitch_c], &c[i * pitch_c] + hw); - } - for (j = 0; j < nw; j++) { - for (i = 0; i < nh; i++) - buffer[i + nh] = c[i * pitch_c + j]; - analysis_26_col(nh, buffer + nh, buffer, buffer + hh); - for (i = 0; i < nh; i++) - c[i * pitch_c + j] = buffer[i]; - } - } - } - - #elif DWT_TYPE == 97 - - static void analysis_97(int length, double *x, - double *lowpass, double *highpass) { - static const double a_predict1 = -1.586134342; - static const double a_update1 = -0.05298011854; - static const double a_predict2 = 0.8829110762; - static const double a_update2 = 0.4435068522; - static const double s_low = 1.149604398; - static const double s_high = 1/1.149604398; - int i; - double y[DWT_MAX_LENGTH]; - // Predict 1 - for (i = 1; i < length - 2; i += 2) { - x[i] += a_predict1 * (x[i - 1] + x[i + 1]); - } - x[length - 1] += 2 * a_predict1 * x[length - 2]; - // Update 1 - for (i = 2; i < length; i += 2) { - x[i] += a_update1 * (x[i - 1] + x[i + 1]); - } - x[0] += 2 * a_update1 * x[1]; - // Predict 2 - for (i = 1; i < length - 2; i += 2) { - x[i] += a_predict2 * (x[i - 1] + x[i + 1]); - } - x[length - 1] += 2 * a_predict2 * x[length - 2]; - // Update 2 - for (i = 2; i < length; i += 2) { - x[i] += a_update2 * (x[i - 1] + x[i + 1]); - } - x[0] += 2 * a_update2 * x[1]; - memcpy(y, x, sizeof(*y) * length); - // Scale and pack - for (i = 0; i < length / 2; i++) { - lowpass[i] = y[2 * i] * s_low; - highpass[i] = y[2 * i + 1] * s_high; - } - } - - static void dyadic_analyze_97(int levels, int width, int height, - short *x, int pitch_x, short *c, int pitch_c) { - int lv, i, j, nh, nw, hh = height, hw = width; - double buffer[2 * DWT_MAX_LENGTH]; - double y[DWT_MAX_LENGTH * DWT_MAX_LENGTH]; - for (i = 0; i < height; i++) { - for (j = 0; j < width; j++) { - y[i * DWT_MAX_LENGTH + j] = x[i * pitch_x + j] << DWT_PRECISION_BITS; - } - } - for (lv = 0; lv < levels; lv++) { - nh = hh; - hh = (hh + 1) >> 1; - nw = hw; - hw = (hw + 1) >> 1; - if ((nh < 2) || (nw < 2)) return; - for (i = 0; i < nh; i++) { - memcpy(buffer, &y[i * DWT_MAX_LENGTH], nw * sizeof(*buffer)); - analysis_97(nw, buffer, &y[i * DWT_MAX_LENGTH], - &y[i * DWT_MAX_LENGTH] + hw); - } - for (j = 0; j < nw; j++) { - for (i = 0; i < nh; i++) - buffer[i + nh] = y[i * DWT_MAX_LENGTH + j]; - analysis_97(nh, buffer + nh, buffer, buffer + hh); - for (i = 0; i < nh; i++) - c[i * pitch_c + j] = round(buffer[i]); - } - } - } - - #endif // DWT_TYPE - - // TODO(debargha): Implement the scaling differently so as not to have to - // use the floating point dct - static void dct16x16_1d_f(double input[16], double output[16]) { - static const double C1 = 0.995184726672197; - static const double C2 = 0.98078528040323; - static const double C3 = 0.956940335732209; - static const double C4 = 0.923879532511287; - static const double C5 = 0.881921264348355; - static const double C6 = 0.831469612302545; - static const double C7 = 0.773010453362737; - static const double C8 = 0.707106781186548; - static const double C9 = 0.634393284163646; - static const double C10 = 0.555570233019602; - static const double C11 = 0.471396736825998; - static const double C12 = 0.38268343236509; - static const double C13 = 0.290284677254462; - static const double C14 = 0.195090322016128; - static const double C15 = 0.098017140329561; - - vp9_clear_system_state(); // Make it simd safe : __asm emms; - { - double step[16]; - double intermediate[16]; - double temp1, temp2; - - // step 1 - step[ 0] = input[0] + input[15]; - step[ 1] = input[1] + input[14]; - step[ 2] = input[2] + input[13]; - step[ 3] = input[3] + input[12]; - step[ 4] = input[4] + input[11]; - step[ 5] = input[5] + input[10]; - step[ 6] = input[6] + input[ 9]; - step[ 7] = input[7] + input[ 8]; - step[ 8] = input[7] - input[ 8]; - step[ 9] = input[6] - input[ 9]; - step[10] = input[5] - input[10]; - step[11] = input[4] - input[11]; - step[12] = input[3] - input[12]; - step[13] = input[2] - input[13]; - step[14] = input[1] - input[14]; - step[15] = input[0] - input[15]; - - // step 2 - output[0] = step[0] + step[7]; - output[1] = step[1] + step[6]; - output[2] = step[2] + step[5]; - output[3] = step[3] + step[4]; - output[4] = step[3] - step[4]; - output[5] = step[2] - step[5]; - output[6] = step[1] - step[6]; - output[7] = step[0] - step[7]; - - temp1 = step[ 8]*C7; - temp2 = step[15]*C9; - output[ 8] = temp1 + temp2; - - temp1 = step[ 9]*C11; - temp2 = step[14]*C5; - output[ 9] = temp1 - temp2; - - temp1 = step[10]*C3; - temp2 = step[13]*C13; - output[10] = temp1 + temp2; - - temp1 = step[11]*C15; - temp2 = step[12]*C1; - output[11] = temp1 - temp2; - - temp1 = step[11]*C1; - temp2 = step[12]*C15; - output[12] = temp2 + temp1; - - temp1 = step[10]*C13; - temp2 = step[13]*C3; - output[13] = temp2 - temp1; - - temp1 = step[ 9]*C5; - temp2 = step[14]*C11; - output[14] = temp2 + temp1; - - temp1 = step[ 8]*C9; - temp2 = step[15]*C7; - output[15] = temp2 - temp1; - - // step 3 - step[ 0] = output[0] + output[3]; - step[ 1] = output[1] + output[2]; - step[ 2] = output[1] - output[2]; - step[ 3] = output[0] - output[3]; - - temp1 = output[4]*C14; - temp2 = output[7]*C2; - step[ 4] = temp1 + temp2; - - temp1 = output[5]*C10; - temp2 = output[6]*C6; - step[ 5] = temp1 + temp2; - - temp1 = output[5]*C6; - temp2 = output[6]*C10; - step[ 6] = temp2 - temp1; - - temp1 = output[4]*C2; - temp2 = output[7]*C14; - step[ 7] = temp2 - temp1; - - step[ 8] = output[ 8] + output[11]; - step[ 9] = output[ 9] + output[10]; - step[10] = output[ 9] - output[10]; - step[11] = output[ 8] - output[11]; - - step[12] = output[12] + output[15]; - step[13] = output[13] + output[14]; - step[14] = output[13] - output[14]; - step[15] = output[12] - output[15]; - - // step 4 - output[ 0] = (step[ 0] + step[ 1]); - output[ 8] = (step[ 0] - step[ 1]); - - temp1 = step[2]*C12; - temp2 = step[3]*C4; - temp1 = temp1 + temp2; - output[ 4] = 2*(temp1*C8); - - temp1 = step[2]*C4; - temp2 = step[3]*C12; - temp1 = temp2 - temp1; - output[12] = 2*(temp1*C8); - - output[ 2] = 2*((step[4] + step[ 5])*C8); - output[14] = 2*((step[7] - step[ 6])*C8); - - temp1 = step[4] - step[5]; - temp2 = step[6] + step[7]; - output[ 6] = (temp1 + temp2); - output[10] = (temp1 - temp2); - - intermediate[8] = step[8] + step[14]; - intermediate[9] = step[9] + step[15]; - - temp1 = intermediate[8]*C12; - temp2 = intermediate[9]*C4; - temp1 = temp1 - temp2; - output[3] = 2*(temp1*C8); - - temp1 = intermediate[8]*C4; - temp2 = intermediate[9]*C12; - temp1 = temp2 + temp1; - output[13] = 2*(temp1*C8); - - output[ 9] = 2*((step[10] + step[11])*C8); - - intermediate[11] = step[10] - step[11]; - intermediate[12] = step[12] + step[13]; - intermediate[13] = step[12] - step[13]; - intermediate[14] = step[ 8] - step[14]; - intermediate[15] = step[ 9] - step[15]; - - output[15] = (intermediate[11] + intermediate[12]); - output[ 1] = -(intermediate[11] - intermediate[12]); - - output[ 7] = 2*(intermediate[13]*C8); - - temp1 = intermediate[14]*C12; - temp2 = intermediate[15]*C4; - temp1 = temp1 - temp2; - output[11] = -2*(temp1*C8); - - temp1 = intermediate[14]*C4; - temp2 = intermediate[15]*C12; - temp1 = temp2 + temp1; - output[ 5] = 2*(temp1*C8); - } - vp9_clear_system_state(); // Make it simd safe : __asm emms; - } - - static void vp9_short_fdct16x16_c_f(short *input, short *out, int pitch, - int scale) { - vp9_clear_system_state(); // Make it simd safe : __asm emms; - { - int shortpitch = pitch >> 1; - int i, j; - double output[256]; - // First transform columns - for (i = 0; i < 16; i++) { - double temp_in[16], temp_out[16]; - for (j = 0; j < 16; j++) - temp_in[j] = input[j*shortpitch + i]; - dct16x16_1d_f(temp_in, temp_out); - for (j = 0; j < 16; j++) - output[j*16 + i] = temp_out[j]; - } - // Then transform rows - for (i = 0; i < 16; ++i) { - double temp_in[16], temp_out[16]; - for (j = 0; j < 16; ++j) - temp_in[j] = output[j + i*16]; - dct16x16_1d_f(temp_in, temp_out); - for (j = 0; j < 16; ++j) - output[j + i*16] = temp_out[j]; - } - // Scale by some magic number - for (i = 0; i < 256; i++) - out[i] = (short)round(output[i] / (2 << scale)); - } - vp9_clear_system_state(); // Make it simd safe : __asm emms; - } - - void vp9_short_fdct8x8_c_f(short *block, short *coefs, int pitch, int scale) { - int j1, i, j, k; - float b[8]; - float b1[8]; - float d[8][8]; - float f0 = (float) .7071068; - float f1 = (float) .4903926; - float f2 = (float) .4619398; - float f3 = (float) .4157348; - float f4 = (float) .3535534; - float f5 = (float) .2777851; - float f6 = (float) .1913417; - float f7 = (float) .0975452; - pitch = pitch / 2; - for (i = 0, k = 0; i < 8; i++, k += pitch) { - for (j = 0; j < 8; j++) { - b[j] = (float)(block[k + j] << (3 - scale)); - } - /* Horizontal transform */ - for (j = 0; j < 4; j++) { - j1 = 7 - j; - b1[j] = b[j] + b[j1]; - b1[j1] = b[j] - b[j1]; - } - b[0] = b1[0] + b1[3]; - b[1] = b1[1] + b1[2]; - b[2] = b1[1] - b1[2]; - b[3] = b1[0] - b1[3]; - b[4] = b1[4]; - b[5] = (b1[6] - b1[5]) * f0; - b[6] = (b1[6] + b1[5]) * f0; - b[7] = b1[7]; - d[i][0] = (b[0] + b[1]) * f4; - d[i][4] = (b[0] - b[1]) * f4; - d[i][2] = b[2] * f6 + b[3] * f2; - d[i][6] = b[3] * f6 - b[2] * f2; - b1[4] = b[4] + b[5]; - b1[7] = b[7] + b[6]; - b1[5] = b[4] - b[5]; - b1[6] = b[7] - b[6]; - d[i][1] = b1[4] * f7 + b1[7] * f1; - d[i][5] = b1[5] * f3 + b1[6] * f5; - d[i][7] = b1[7] * f7 - b1[4] * f1; - d[i][3] = b1[6] * f3 - b1[5] * f5; - } - /* Vertical transform */ - for (i = 0; i < 8; i++) { - for (j = 0; j < 4; j++) { - j1 = 7 - j; - b1[j] = d[j][i] + d[j1][i]; - b1[j1] = d[j][i] - d[j1][i]; - } - b[0] = b1[0] + b1[3]; - b[1] = b1[1] + b1[2]; - b[2] = b1[1] - b1[2]; - b[3] = b1[0] - b1[3]; - b[4] = b1[4]; - b[5] = (b1[6] - b1[5]) * f0; - b[6] = (b1[6] + b1[5]) * f0; - b[7] = b1[7]; - d[0][i] = (b[0] + b[1]) * f4; - d[4][i] = (b[0] - b[1]) * f4; - d[2][i] = b[2] * f6 + b[3] * f2; - d[6][i] = b[3] * f6 - b[2] * f2; - b1[4] = b[4] + b[5]; - b1[7] = b[7] + b[6]; - b1[5] = b[4] - b[5]; - b1[6] = b[7] - b[6]; - d[1][i] = b1[4] * f7 + b1[7] * f1; - d[5][i] = b1[5] * f3 + b1[6] * f5; - d[7][i] = b1[7] * f7 - b1[4] * f1; - d[3][i] = b1[6] * f3 - b1[5] * f5; - } - for (i = 0; i < 8; i++) { - for (j = 0; j < 8; j++) { - *(coefs + j + i * 8) = (short) floor(d[i][j] + 0.5); - } - } - return; - } - - #define divide_bits(d, n) ((n) < 0 ? (d) << (n) : (d) >> (n)) - - #if DWTDCT_TYPE == DWTDCT16X16_LEAN - - void vp9_short_fdct32x32_c(short *input, short *out, int pitch) { - // assume out is a 32x32 buffer - short buffer[16 * 16]; + int shortpitch = pitch >> 1; int i, j; - const int short_pitch = pitch >> 1; - #if DWT_TYPE == 26 - dyadic_analyze_26(1, 32, 32, input, short_pitch, out, 32); - #elif DWT_TYPE == 97 - dyadic_analyze_97(1, 32, 32, input, short_pitch, out, 32); - #elif DWT_TYPE == 53 - dyadic_analyze_53(1, 32, 32, input, short_pitch, out, 32); - #endif - // TODO(debargha): Implement more efficiently by adding output pitch - // argument to the dct16x16 function - vp9_short_fdct16x16_c_f(out, buffer, 64, 1 + DWT_PRECISION_BITS); - for (i = 0; i < 16; ++i) - vpx_memcpy(out + i * 32, buffer + i * 16, sizeof(short) * 16); - for (i = 0; i < 16; ++i) { - for (j = 16; j < 32; ++j) { - out[i * 32 + j] = divide_bits(out[i * 32 + j], DWT_PRECISION_BITS - 2); - } + int output[32 * 32]; + + // Columns + for (i = 0; i < 32; i++) { + int temp_in[32], temp_out[32]; + for (j = 0; j < 32; j++) + temp_in[j] = input[j * shortpitch + i] << 2; + dct32_1d(temp_in, temp_out); + for (j = 0; j < 32; j++) + output[j * 32 + i] = (temp_out[j] + 1 + (temp_out[j] > 0)) >> 2; } - for (i = 16; i < 32; ++i) { - for (j = 0; j < 32; ++j) { - out[i * 32 + j] = divide_bits(out[i * 32 + j], DWT_PRECISION_BITS - 2); - } - } - } - - #elif DWTDCT_TYPE == DWTDCT16X16 - void vp9_short_fdct32x32_c(short *input, short *out, int pitch) { - // assume out is a 32x32 buffer - short buffer[16 * 16]; - int i, j; - const int short_pitch = pitch >> 1; - #if DWT_TYPE == 26 - dyadic_analyze_26(1, 32, 32, input, short_pitch, out, 32); - #elif DWT_TYPE == 97 - dyadic_analyze_97(1, 32, 32, input, short_pitch, out, 32); - #elif DWT_TYPE == 53 - dyadic_analyze_53(1, 32, 32, input, short_pitch, out, 32); - #endif - // TODO(debargha): Implement more efficiently by adding output pitch - // argument to the dct16x16 function - vp9_short_fdct16x16_c_f(out, buffer, 64, 1 + DWT_PRECISION_BITS); - for (i = 0; i < 16; ++i) - vpx_memcpy(out + i * 32, buffer + i * 16, sizeof(short) * 16); - vp9_short_fdct16x16_c_f(out + 16, buffer, 64, 1 + DWT_PRECISION_BITS); - for (i = 0; i < 16; ++i) - vpx_memcpy(out + i * 32 + 16, buffer + i * 16, sizeof(short) * 16); - - vp9_short_fdct16x16_c_f(out + 32 * 16, buffer, 64, 1 + DWT_PRECISION_BITS); - for (i = 0; i < 16; ++i) - vpx_memcpy(out + i * 32 + 32 * 16, buffer + i * 16, sizeof(short) * 16); - - vp9_short_fdct16x16_c_f(out + 33 * 16, buffer, 64, 1 + DWT_PRECISION_BITS); - for (i = 0; i < 16; ++i) - vpx_memcpy(out + i * 32 + 33 * 16, buffer + i * 16, sizeof(short) * 16); - } - - #elif DWTDCT_TYPE == DWTDCT8X8 - - void vp9_short_fdct32x32_c(short *input, short *out, int pitch) { - // assume out is a 32x32 buffer - short buffer[8 * 8]; - int i, j; - const int short_pitch = pitch >> 1; - #if DWT_TYPE == 26 - dyadic_analyze_26(2, 32, 32, input, short_pitch, out, 32); - #elif DWT_TYPE == 97 - dyadic_analyze_97(2, 32, 32, input, short_pitch, out, 32); - #elif DWT_TYPE == 53 - dyadic_analyze_53(2, 32, 32, input, short_pitch, out, 32); - #endif - // TODO(debargha): Implement more efficiently by adding output pitch - // argument to the dct16x16 function - vp9_short_fdct8x8_c_f(out, buffer, 64, 1 + DWT_PRECISION_BITS); - for (i = 0; i < 8; ++i) - vpx_memcpy(out + i * 32, buffer + i * 8, sizeof(short) * 8); - - vp9_short_fdct8x8_c_f(out + 8, buffer, 64, 1 + DWT_PRECISION_BITS); - for (i = 0; i < 8; ++i) - vpx_memcpy(out + i * 32 + 8, buffer + i * 8, sizeof(short) * 8); - - vp9_short_fdct8x8_c_f(out + 32 * 8, buffer, 64, 1 + DWT_PRECISION_BITS); - for (i = 0; i < 8; ++i) - vpx_memcpy(out + i * 32 + 32 * 8, buffer + i * 8, sizeof(short) * 8); - - vp9_short_fdct8x8_c_f(out + 33 * 8, buffer, 64, 1 + DWT_PRECISION_BITS); - for (i = 0; i < 8; ++i) - vpx_memcpy(out + i * 32 + 33 * 8, buffer + i * 8, sizeof(short) * 8); - - for (i = 0; i < 16; ++i) { - for (j = 16; j < 32; ++j) { - out[i * 32 + j] = divide_bits(out[i * 32 + j], DWT_PRECISION_BITS - 2); - } - } - for (i = 16; i < 32; ++i) { - for (j = 0; j < 32; ++j) { - out[i * 32 + j] = divide_bits(out[i * 32 + j], DWT_PRECISION_BITS - 2); - } - } - } - - #endif - - #if CONFIG_TX64X64 - void vp9_short_fdct64x64_c(short *input, short *out, int pitch) { - // assume out is a 64x64 buffer - short buffer[16 * 16]; - int i, j; - const int short_pitch = pitch >> 1; - #if DWT_TYPE == 26 - dyadic_analyze_26(2, 64, 64, input, short_pitch, out, 64); - #elif DWT_TYPE == 97 - dyadic_analyze_97(2, 64, 64, input, short_pitch, out, 64); - #elif DWT_TYPE == 53 - dyadic_analyze_53(2, 64, 64, input, short_pitch, out, 64); - #endif - // TODO(debargha): Implement more efficiently by adding output pitch - // argument to the dct16x16 function - vp9_short_fdct16x16_c_f(out, buffer, 128, 2 + DWT_PRECISION_BITS); - for (i = 0; i < 16; ++i) - vpx_memcpy(out + i * 64, buffer + i * 16, sizeof(short) * 16); - - #if DWTDCT_TYPE == DWTDCT16X16_LEAN - for (i = 0; i < 16; ++i) { - for (j = 16; j < 48; ++j) { - out[i * 64 + j] = divide_bits(out[i * 64 + j], DWT_PRECISION_BITS - 1); - } - } - for (i = 16; i < 64; ++i) { - for (j = 0; j < 64; ++j) { - out[i * 64 + j] = divide_bits(out[i * 64 + j], DWT_PRECISION_BITS - 1); - } - } - #elif DWTDCT_TYPE == DWTDCT16X16 - vp9_short_fdct16x16_c_f(out + 16, buffer, 128, 2 + DWT_PRECISION_BITS); - for (i = 0; i < 16; ++i) - vpx_memcpy(out + i * 64 + 16, buffer + i * 16, sizeof(short) * 16); - - vp9_short_fdct16x16_c_f(out + 64 * 16, buffer, 128, 2 + DWT_PRECISION_BITS); - for (i = 0; i < 16; ++i) - vpx_memcpy(out + i * 64 + 64 * 16, buffer + i * 16, sizeof(short) * 16); - - vp9_short_fdct16x16_c_f(out + 65 * 16, buffer, 128, 2 + DWT_PRECISION_BITS); - for (i = 0; i < 16; ++i) - vpx_memcpy(out + i * 64 + 65 * 16, buffer + i * 16, sizeof(short) * 16); - - // There is no dct used on the highest bands for now. - // Need to scale these coeffs by a factor of 2/2^DWT_PRECISION_BITS - // TODO(debargha): experiment with turning these coeffs to 0 + // Rows for (i = 0; i < 32; ++i) { - for (j = 32; j < 64; ++j) { - out[i * 64 + j] = divide_bits(out[i * 64 + j], DWT_PRECISION_BITS - 1); - } - } - for (i = 32; i < 64; ++i) { - for (j = 0; j < 64; ++j) { - out[i * 64 + j] = divide_bits(out[i * 64 + j], DWT_PRECISION_BITS - 1); - } + int temp_in[32], temp_out[32]; + for (j = 0; j < 32; ++j) + temp_in[j] = output[j + i * 32]; + dct32_1d(temp_in, temp_out); + for (j = 0; j < 32; ++j) + out[j + i * 32] = (temp_out[j] + 1 + (temp_out[j] < 0)) >> 2; } - #endif // DWTDCT_TYPE } - #endif // CONFIG_TX64X64 - #endif // CONFIG_DWTDCTHYBRID - diff --cc vp9/encoder/vp9_sad_c.c index 84121f79c,af5526dce..96d993863 --- a/vp9/encoder/vp9_sad_c.c +++ b/vp9/encoder/vp9_sad_c.c @@@ -472,73 -473,16 +473,15 @@@ void vp9_sad8x16x4d_c(const uint8_t *sr void vp9_sad4x4x4d_c(const uint8_t *src_ptr, int src_stride, - uint8_t *ref_ptr[], + const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array) { - sad_array[0] = vp9_sad4x4_c(src_ptr, src_stride, - ref_ptr[0], ref_stride, 0x7fffffff); - sad_array[1] = vp9_sad4x4_c(src_ptr, src_stride, - ref_ptr[1], ref_stride, 0x7fffffff); - sad_array[2] = vp9_sad4x4_c(src_ptr, src_stride, - ref_ptr[2], ref_stride, 0x7fffffff); - sad_array[3] = vp9_sad4x4_c(src_ptr, src_stride, - ref_ptr[3], ref_stride, 0x7fffffff); - } - - /* Copy 2 macroblocks to a buffer */ - void vp9_copy32xn_c(uint8_t *src_ptr, - int src_stride, - uint8_t *dst_ptr, - int dst_stride, - int height) { - int r; - - for (r = 0; r < height; r++) { - #if !(CONFIG_FAST_UNALIGNED) - dst_ptr[0] = src_ptr[0]; - dst_ptr[1] = src_ptr[1]; - dst_ptr[2] = src_ptr[2]; - dst_ptr[3] = src_ptr[3]; - dst_ptr[4] = src_ptr[4]; - dst_ptr[5] = src_ptr[5]; - dst_ptr[6] = src_ptr[6]; - dst_ptr[7] = src_ptr[7]; - dst_ptr[8] = src_ptr[8]; - dst_ptr[9] = src_ptr[9]; - dst_ptr[10] = src_ptr[10]; - dst_ptr[11] = src_ptr[11]; - dst_ptr[12] = src_ptr[12]; - dst_ptr[13] = src_ptr[13]; - dst_ptr[14] = src_ptr[14]; - dst_ptr[15] = src_ptr[15]; - dst_ptr[16] = src_ptr[16]; - dst_ptr[17] = src_ptr[17]; - dst_ptr[18] = src_ptr[18]; - dst_ptr[19] = src_ptr[19]; - dst_ptr[20] = src_ptr[20]; - dst_ptr[21] = src_ptr[21]; - dst_ptr[22] = src_ptr[22]; - dst_ptr[23] = src_ptr[23]; - dst_ptr[24] = src_ptr[24]; - dst_ptr[25] = src_ptr[25]; - dst_ptr[26] = src_ptr[26]; - dst_ptr[27] = src_ptr[27]; - dst_ptr[28] = src_ptr[28]; - dst_ptr[29] = src_ptr[29]; - dst_ptr[30] = src_ptr[30]; - dst_ptr[31] = src_ptr[31]; - #else - ((uint32_t *)dst_ptr)[0] = ((uint32_t *)src_ptr)[0]; - ((uint32_t *)dst_ptr)[1] = ((uint32_t *)src_ptr)[1]; - ((uint32_t *)dst_ptr)[2] = ((uint32_t *)src_ptr)[2]; - ((uint32_t *)dst_ptr)[3] = ((uint32_t *)src_ptr)[3]; - ((uint32_t *)dst_ptr)[4] = ((uint32_t *)src_ptr)[4]; - ((uint32_t *)dst_ptr)[5] = ((uint32_t *)src_ptr)[5]; - ((uint32_t *)dst_ptr)[6] = ((uint32_t *)src_ptr)[6]; - ((uint32_t *)dst_ptr)[7] = ((uint32_t *)src_ptr)[7]; - #endif - src_ptr += src_stride; - dst_ptr += dst_stride; - - } + sad_array[0] = vp9_sad4x4(src_ptr, src_stride, + ref_ptr[0], ref_stride, 0x7fffffff); + sad_array[1] = vp9_sad4x4(src_ptr, src_stride, + ref_ptr[1], ref_stride, 0x7fffffff); + sad_array[2] = vp9_sad4x4(src_ptr, src_stride, + ref_ptr[2], ref_stride, 0x7fffffff); + sad_array[3] = vp9_sad4x4(src_ptr, src_stride, + ref_ptr[3], ref_stride, 0x7fffffff); } - diff --cc vp9/vp9_common.mk index 714cefdcc,ea8631711..5e1ff62f7 --- a/vp9/vp9_common.mk +++ b/vp9/vp9_common.mk @@@ -112,21 -111,15 +112,15 @@@ VP9_COMMON_SRCS-yes += common/vp9_maski VP9_COMMON_SRCS-$(HAVE_SSE3) += common/x86/vp9_mask_sse3.asm endif - VP9_COMMON_SRCS-$(HAVE_SSE4_1) += common/x86/vp9_filter_sse4.c - ifeq ($(HAVE_SSE4_1),yes) - vp9/common/x86/vp9_filter_sse4.c.o: CFLAGS += -msse4 - vp9/common/x86/vp9_filter_sse4.c.d: CFLAGS += -msse4 - endif - - VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_filter_sse2.c + VP9_COMMON_SRCS-$(ARCH_X86)$(ARCH_X86_64) += common/x86/vp9_idct_x86.c -VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_sadmxn_x86.c +VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_sadmxn_sse2.c ifeq ($(HAVE_SSE2),yes) - vp9/common/x86/vp9_filter_sse2.c.o: CFLAGS += -msse2 + vp9/common/x86/vp9_idct_x86.c.o: CFLAGS += -msse2 -vp9/common/x86/vp9_loopfilter_x86.c.o: CFLAGS += -msse2 -vp9/common/x86/vp9_sadmxn_x86.c.o: CFLAGS += -msse2 +vp9/common/x86/vp9_loopfilter_intrin_sse2.c.o: CFLAGS += -msse2 +vp9/common/x86/vp9_sadmxn_sse2.c.o: CFLAGS += -msse2 - vp9/common/x86/vp9_filter_sse2.c.d: CFLAGS += -msse2 + vp9/common/x86/vp9_idct_x86.c.d: CFLAGS += -msse2 -vp9/common/x86/vp9_loopfilter_x86.c.d: CFLAGS += -msse2 -vp9/common/x86/vp9_sadmxn_x86.c.d: CFLAGS += -msse2 +vp9/common/x86/vp9_loopfilter_intrin_sse2.c.d: CFLAGS += -msse2 +vp9/common/x86/vp9_sadmxn_sse2.c.d: CFLAGS += -msse2 endif $(eval $(call asm_offsets_template,\ diff --cc vpx_scale/generic/yv12extend.c index 5a427356b,49d7e8e56..a322e0a2c --- a/vpx_scale/generic/yv12extend.c +++ b/vpx_scale/generic/yv12extend.c @@@ -216,6 -117,9 +117,14 @@@ vp8_yv12_copy_frame_c(YV12_BUFFER_CONFI int row; unsigned char *source, *dest; ++#if 0 ++ /* These assertions are valid in the codec, but the libvpx-tester uses ++ * this code slightly differently. ++ */ + assert(src_ybc->y_width == dst_ybc->y_width); + assert(src_ybc->y_height == dst_ybc->y_height); ++#endif + source = src_ybc->y_buffer; dest = dst_ybc->y_buffer;